diff --git a/.gitignore b/.gitignore
index be75938ec401b1d72fa54773c85191aaac7d7f35..828bbe9bd3363853ae3f58f54a8d5f60cefad837 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ Podfile.lock
 /tensorflow/contrib/lite/examples/ios/simple/data/*.txt
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
 xcuserdata/**
+/api_init_files_list.txt
 
 # Android
 .gradle
diff --git a/CODEOWNERS b/CODEOWNERS
index 007a304c3e706ce968576ec8979c08f1a3bcc552..b9f0313cc6d59d3fbdcd014e1a528126d863075a 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,7 +45,7 @@
 # /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 # /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 # /tensorflow/contrib/stateless/ @girving
-# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
 # /tensorflow/contrib/testing/ @dandelionmane
 # /tensorflow/contrib/timeseries/ @allenlavoie
 # /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
diff --git a/README.md b/README.md
index ef5bdc66ef03131318e1dde627e0224cca9137fd..e1a50c87e26d493ba3ac760f357905d89aa40dab 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,14 @@
 
 | **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
-between them.  This flexible architecture lets you deploy computation to one
+between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
+code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
@@ -22,6 +22,10 @@ organization for the purposes of conducting machine learning and deep neural
 networks research.  The system is general enough to be applicable in a wide
 variety of other domains, as well.
 
+Keep up to date with release announcements and security updates by
+subscribing to
+[announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
+
 ## Installation
 *See [Installing TensorFlow](https://www.tensorflow.org/get_started/os_setup.html) for instructions on how to install our release binaries or how to build from source.*
 
@@ -82,6 +86,7 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 * [TensorFlow Website](https://www.tensorflow.org)
 * [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
+* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
 * [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
diff --git a/RELEASE.md b/RELEASE.md
index 6f54dee58f75c29a16545ba25de12fe059baf1eb..2717c75740aeea7821fb6c57dfc85908e86e9d51 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,132 @@
+# Release 1.8.0
+
+## Major Features And Improvements
+* Can now pass `tf.contrib.distribute.MirroredStrategy()` to `tf.estimator.RunConfig()` to run an Estimator model on multiple GPUs on one machine.
+* Add `tf.contrib.data.prefetch_to_device()`, which supports prefetching to GPU memory.
+* Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
+* Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
+* `tf.contrib.bayesflow` is moving out to it's own repo.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
+  * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
+  * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
+* Eager Execution:
+  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators when eager execution is enabled.
+  * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device(“/gpu:0”)`) (Fixes #14133)
+  * `tf.GradientTape` has moved out of contrib.
+* `tf.keras`:
+  * Added the fashion mnist dataset.
+  * New data preprocessing functions: `image/random_brightness`, `sequence/TimeseriesGenerator`, and `text/hashing_trick`.
+* Accelerated Linear Algebra (XLA):
+  * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
+* TensorFlow Debugger (tfdbg) CLI:
+  * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+  * Fix spurious background colors in some text terminals.
+* `tf.contrib`:
+  * Add meta-distribution BatchReshape which reshapes batch dimensions.
+  * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
+  * Add `tf.contrib.framework.argsort`.
+  * Allow `DNNBoostedTreeCombinedEstimator` to work with core versions of feature columns and losses.
+  * Add non-linear image warping ops: `tf.contrib.image.sparse_image_warp`, `tf.contrib.image.dense_image_warp`, and `tf.contrib.image.interpolate_spline`.
+  * Fix bug in `tf.contrib.opt.MultitaskOptimizerWrapper` where types of tensors were mismatched.
+* Other:
+  * Low-level graph construction now calls the TensorFlow C API. This change should be invisible to most users, but can be disabled by setting the environment variable `TF_C_API_GRAPH_CONSTRUCTION=0` in this release. Future releases will remove the ability to disable this change. Please [file a bug](https://github.com/tensorflow/tensorflow/issues/new) if you find yourself using this escape hatch.
+  * Add description of shapes and a pointer to tutorial notebook in `tf.distributions.Distribution`.
+  * Update scatter operations:
+    * Add `tf.scatter_min` and `tf.scatter_max`
+    * Extend scatter operations to work with a scalar update parameter.
+  * Move cuDNN RNN ops to core for use in TensorFlow codebase only.
+  * Add `float64` support for `Conv2d`, `Conv2dBackpropInput`, and `Conv2dBackpropFilter`.
+  * Add `float64` support for `AvgPool`/`AvgPoolGrad`.
+  * Make graph name scope thread local so that they work correctly in multi-threaded environments.
+  * Update nsync synchronization library to avoid slow primitives on Linux.
+  * Removed need to put nsync/public on C include path when building custom ops.
+  * Add `tf.image.psnr`, `tf.image.ssim`, `tf.image.ssim_multiscale`, `tf.image.image_gradients`, `tf.image.sobel_edges`.
+  * Add links to https://js.tensorflow.org.
+  * Fix non-uniformity of orthogonal matrices.
+  * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu
+
+
+# Release 1.7.0
+
+## Major Features And Improvements
+* Eager mode is moving out of contrib, try `tf.enable_eager_execution()`.
+* Graph rewrites emulating fixed-point quantization compatible with TensorFlow Lite, supported by new `tf.contrib.quantize` package.
+* Easily customize gradient computation with `tf.custom_gradient`.
+* [TensorBoard Debugger Plugin](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md), the graphical user interface (GUI) of TensorFlow Debugger (tfdbg), is now in alpha.
+* Experimental support for reading a sqlite database as a `Dataset` with new `tf.contrib.data.SqlDataset`.
+* Distributed Mutex / CriticalSection added to `tf.contrib.framework.CriticalSection`.
+* Better text processing with `tf.regex_replace`.
+* Easy, efficient sequence input with `tf.contrib.data.bucket_by_sequence_length`
+* Initial support for `tf.contrib.tensorrt` that enables native TensorRT in
+  TensorFlow.
+
+## Bug Fixes and Other Changes
+* Accelerated Linear Algebra (XLA):
+  * Add `MaxPoolGradGrad` support for XLA
+  * CSE pass from Tensorflow is now disabled in XLA.
+* `tf.data`:
+  * `tf.data.Dataset`
+    * Add support for building C++ Dataset op kernels as external libraries, using the `tf.load_op_library()` mechanism.
+    * `Dataset.list_files()` now shuffles its output by default.
+    * `Dataset.shuffle(..., seed=tf.constant(0, dtype=tf.int64))` now yields the same sequence of elements as `Dataset.shuffle(..., seed=0)`.
+  * Add `num_parallel_reads` argument to `tf.data.TFRecordDataset`.
+* `tf.contrib`:
+  * `tf.contrib.bayesflow.halton_sequence` now supports randomization.
+  * Add support for scalars in `tf.contrib.all_reduce`.
+  * Add `effective_sample_size` to `tf.contrib.bayesflow.mcmc_diagnostics`.
+  * Add `potential_scale_reduction` to `tf.contrib.bayesflow.mcmc_diagnostics`.
+  * Add `BatchNormalization`, `Kumaraswamy` bijectors.
+  * Deprecate `tf.contrib.learn`. Please check contrib/learn/README.md for instructions on how to convert existing code.
+  * `tf.contrib.data`
+    * Remove deprecated `tf.contrib.data.Dataset`, `tf.contrib.data.Iterator`, `tf.contrib.data.FixedLengthRecordDataset`, `tf.contrib.data.TextLineDataset`, and `tf.contrib.data.TFRecordDataset` classes.
+    * Added `bucket_by_sequence_length`, `sliding_window_batch`, and `make_batched_features_dataset`
+  * Remove unmaintained `tf.contrib.ndlstm`. You can find it externally at https://github.com/tmbarchive/tfndlstm.
+  * Moved most of `tf.contrib.bayesflow` to its own repo: `tfp`
+* Other:
+  * tf.py_func now reports the full stack trace if an exception occurs.
+  * Integrate `TPUClusterResolver` with GKE's integration for Cloud TPUs.
+  * Add a library for statistical testing of samplers.
+  * Add Helpers to stream data from the GCE VM to a Cloud TPU.
+  * Integrate ClusterResolvers with TPUEstimator.
+  * Unify metropolis_hastings interface with HMC kernel.
+  * Move LIBXSMM convolutions to a separate --define flag so that they are disabled by default.
+  * Fix `MomentumOptimizer` lambda.
+  * Reduce `tfp.layers` boilerplate via programmable docstrings.
+  * Add `auc_with_confidence_intervals`, a method for computing the AUC and confidence interval with linearithmic time complexity.
+  * `regression_head` now accepts customized link function, to satisfy the usage that user can define their own link function if the `array_ops.identity` does not meet the requirement.
+  * Fix `initialized_value` and `initial_value` behaviors for `ResourceVariables` created from `VariableDef` protos.
+  * Add TensorSpec to represent the specification of Tensors.
+  * Constant folding pass is now deterministic.
+  * Support `float16` `dtype` in `tf.linalg.*`.
+  * Add `tf.estimator.export.TensorServingInputReceiver` that allows `tf.estimator.Estimator.export_savedmodel` to pass raw tensors to model functions.
+
+## Deprecations
+
+* TensorFlow 1.7 may be the last time we support Cuda versions below 8.0.
+  Starting with TensorFlow 1.8 release, 8.0 will be the minimum supported
+  version.
+* TensorFlow 1.7 may be the last time we support cuDNN versions below 6.0.
+  Starting with TensorFlow 1.8 release, 6.0 will be the minimum supported
+  version.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Abe, Alistair Low, Andy Kernahan, Appledore, Ben, Ben Barsdell, Boris Pfahringer, Brad Wannow, Brett Koonce, Carl Thomé, cclauss, Chengzhi Chen, Chris Drake, Christopher Yeh, Clayne Robison, Codrut Grosu, Daniel Trebbien, Danny Goodman, David Goodwin, David Norman, Deron Eriksson, Donggeon Lim, Donny Viszneki, DosLin, DylanDmitri, Francisco Guerrero, Fred Reiss, gdh1995, Giuseppe, Glenn Weidner, gracehoney, Guozhong Zhuang, Haichen "Hc" Li, Harald Husum, harumitsu.nobuta, Henry Spivey, hsm207, Jekyll Song, Jerome, Jiongyan Zhang, jjsjann123, John Sungjin Park, Johnson145, JoshVarty, Julian Wolff, Jun Wang, June-One, Kamil Sindi, Kb Sriram, Kdavis-Mozilla, Kenji, lazypanda1, Liang-Chi Hsieh, Loo Rong Jie, Mahesh Bhosale, MandarJKulkarni, ManHyuk, Marcus Ong, Marshal Hayes, Martin Pool, matthieudelaro, mdfaijul, mholzel, Michael Zhou, Ming Li, Minmin Sun, Myungjoo Ham, MyungsungKwak, Naman Kamra, Peng Yu, Penghao Cen, Phil, Raghuraman-K, resec, Rohin Mohanadas, Sandeep N Gupta, Scott Tseng, seaotterman, Seo Sanghyeon, Sergei Lebedev, Ted Chang, terrytangyuan, Tim H, tkunic, Tod, vihanjain, Yan Facai (颜发才), Yin Li, Yong Tang, Yukun Chen, Yusuke Yamada
+
+
+
 # Release 1.6.0
 
 ## Breaking Changes
diff --git a/tensorflow/SECURITY.md b/SECURITY.md
similarity index 90%
rename from tensorflow/SECURITY.md
rename to SECURITY.md
index fea24b273920885ba8a1ae96aafbf7710df46e1f..a5ce3a62ee202f6e7d83f0fedc2777d9c88ba9b5 100644
--- a/tensorflow/SECURITY.md
+++ b/SECURITY.md
@@ -6,7 +6,7 @@ report vulnerabilities in TensorFlow.
 
 ## TensorFlow models are programs
 
-TensorFlow's runtime system interprets and executes programs. What machine 
+TensorFlow's runtime system interprets and executes programs. What machine
 learning practitioners term
 [**models**](https://developers.google.com/machine-learning/glossary/#model) are
 expressed as programs that TensorFlow executes.  TensorFlow programs are encoded
@@ -28,12 +28,12 @@ data you supply to TensorFlow to train a model, or to use a model to run
 inference on the data.
 
 **TensorFlow models are programs, and need to be treated as such from a security
-perspective.** 
+perspective.**
 
 ## Running untrusted models
 
 As a general rule: **Always** execute untrusted models inside a sandbox (e.g.,
-[nsjail](https://github.com/google/nsjail)). 
+[nsjail](https://github.com/google/nsjail)).
 
 There are several ways in which a model could become untrusted. Obviously, if an
 untrusted party supplies TensorFlow kernels, arbitrary code may be executed.
@@ -109,11 +109,11 @@ graphs known to the `ModelServer`. This means that an attacker may run
 graphs using untrusted inputs as described above, but they would not be able to
 execute arbitrary graphs. It is possible to safely expose a `ModelServer`
 directly to an untrusted network, **but only if the graphs it is configured to
-use have been carefully audited to be safe**. 
+use have been carefully audited to be safe**.
 
 Similar to best practices for other servers, we recommend running any
 `ModelServer` with appropriate privileges (i.e., using a separate user with
-reduced permisisons). In the spirit of defense in depth, we recommend
+reduced permissions). In the spirit of defense in depth, we recommend
 authenticating requests to any TensorFlow server connected to an untrusted
 network, as well as sandboxing the server to minimize the adverse effects of
 any breach.
@@ -129,11 +129,11 @@ with specially crafted inputs.
 ### What is a vulnerability?
 
 Given TensorFlow's flexibility, it is possible to specify computation graphs
-which exhibit unexpected or unwanted behaviors. The fact that TensorFlow models
+which exhibit unexpected or unwanted behavior. The fact that TensorFlow models
 can perform arbitrary computations means that they may read and write files,
 communicate via the network, produce deadlocks and infinite loops, or run out
 of memory. It is only when these behaviors are outside the specifications of the
-operations involved that such behavior is a vulnerability. 
+operations involved that such behavior is a vulnerability.
 
 A `FileWriter` writing a file is not unexpected behavior and therefore is not a
 vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
@@ -170,6 +170,17 @@ Please use a descriptive subject line for your report email. After the initial
 reply to your report, the security team will endeavor to keep you informed of
 the progress being made towards a fix and announcement. 
 
+In addition, please include the following information along with your report:
+
+* Your name and affiliation (if any).
+* A description the technical details of the vulnerabilities. It is very
+  important to let us know how we can reproduce your findings.
+* An explanation who can exploit this vulnerability, and what they gain when
+  doing so -- write an attack scenario. This will help us evaluate your report
+  quickly, especially if the issue is complex.
+* Whether this vulnerability public or known to third parties. If it is, please
+  provide details.
+
 If you believe that an existing (public) issue is security-related, please send
 an email to `security@tensorflow.org`. The email should include the issue ID and
 a short description of why it should be handled according to this security
@@ -233,7 +244,7 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
 
 ### Known vulnerabilities
 
-| Type              | Versions affected |        Reported by | Additional Information      |
-|-------------------|:-----------------:|--------------------|-----------------------------|
-| out of bounds read|             <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+| Type               | Versions affected | Reported by           | Additional Information      |
+|--------------------|:-----------------:|-----------------------|-----------------------------|
+| Out Of Bounds Read |             <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
 
diff --git a/WORKSPACE b/WORKSPACE
index 1e38a9a8cd754886fc5232531816b875de0879a3..4ddfb9a3832ea1ea639ace887e1d601bdd857086 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
-    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
+    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
+    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
     ],
 )
 
@@ -14,6 +14,12 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
+# We must check the bazel version before trying to parse any other BUILD
+# files, in case the parsing of those build files depends on the bazel
+# version we require here.
+load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
+check_bazel_version_at_least("0.10.0")
+
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
 # Uncomment and update the paths in these entries to build the Android demo.
diff --git a/configure.py b/configure.py
index b5436dba20ad1aadeffe8057c0a85709914f603e..fe15bfc1a43bac5d9c249bf5b61854ff0e07aec7 100644
--- a/configure.py
+++ b/configure.py
@@ -35,12 +35,13 @@ except ImportError:
 
 _DEFAULT_CUDA_VERSION = '9.0'
 _DEFAULT_CUDNN_VERSION = '7'
+_DEFAULT_NCCL_VERSION = '1.3'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
 _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                           'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
-_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/x86_64-linux-gnu'
+_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/%s-linux-gnu' % platform.machine()
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
@@ -225,8 +226,6 @@ def setup_python(environ_cp):
   # Set-up env variables used by python_configure.bzl
   write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path)
   write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path)
-  write_to_bazelrc('build --force_python=py%s' % python_major_version)
-  write_to_bazelrc('build --host_force_python=py%s' % python_major_version)
   write_to_bazelrc('build --python_path=\"%s"' % python_bin_path)
   environ_cp['PYTHON_BIN_PATH'] = python_bin_path
 
@@ -250,7 +249,11 @@ def reset_tf_configure_bazelrc(workspace_path):
       if _TF_BAZELRC_FILENAME in l:
         continue
       f.write('%s\n' % l)
-    f.write('import %s\n' % _TF_BAZELRC)
+    if is_windows():
+      tf_bazelrc_path = _TF_BAZELRC.replace("\\", "/")
+    else:
+      tf_bazelrc_path = _TF_BAZELRC
+    f.write('import %s\n' % tf_bazelrc_path)
 
 
 def cleanup_makefile():
@@ -444,7 +447,7 @@ def check_bazel_version(min_version):
   if which('bazel') is None:
     print('Cannot find bazel. Please install bazel.')
     sys.exit(0)
-  curr_version = run_shell(['bazel', '--batch', 'version'])
+  curr_version = run_shell(['bazel', '--batch', '--bazelrc=/dev/null', 'version'])
 
   for line in curr_version.split('\n'):
     if 'Build label: ' in line:
@@ -480,6 +483,8 @@ def set_cc_opt_flags(environ_cp):
   if is_ppc64le():
     # gcc on ppc64le does not support -march, use mcpu instead
     default_cc_opt_flags = '-mcpu=native'
+  elif is_windows():
+    default_cc_opt_flags = '/arch:AVX'
   else:
     default_cc_opt_flags = '-march=native'
   question = ('Please specify optimization flags to use during compilation when'
@@ -490,7 +495,7 @@ def set_cc_opt_flags(environ_cp):
   for opt in cc_opt_flags.split():
     write_to_bazelrc('build:opt --copt=%s' % opt)
   # It should be safe on the same build host.
-  if not is_ppc64le():
+  if not is_ppc64le() and not is_windows():
     write_to_bazelrc('build:opt --host_copt=-march=native')
   write_to_bazelrc('build:opt --define with_default_optimizations=true')
   # TODO(mikecase): Remove these default defines once we are able to get
@@ -498,7 +503,6 @@ def set_cc_opt_flags(environ_cp):
   write_to_bazelrc('build --copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
   write_to_bazelrc('build --host_copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
 
-
 def set_tf_cuda_clang(environ_cp):
   """set TF_CUDA_CLANG action_env.
 
@@ -520,7 +524,7 @@ def set_tf_cuda_clang(environ_cp):
 
 def set_tf_download_clang(environ_cp):
   """Set TF_DOWNLOAD_CLANG action_env."""
-  question = 'Do you want to download a fresh release of clang? (Experimental)'
+  question = 'Do you wish to download a fresh release of clang? (Experimental)'
   yes_reply = 'Clang will be downloaded and used to compile tensorflow.'
   no_reply = 'Clang will not be downloaded.'
   set_action_env_var(
@@ -1044,7 +1048,10 @@ def set_tf_tensorrt_install_path(environ_cp):
 
     for lib_file in possible_files:
       if is_compatible(lib_file, cuda_ver, cudnn_ver):
-        ver_str = nvinfer_pattern.search(lib_file).group(1)
+        matches = nvinfer_pattern.search(lib_file)
+        if len(matches.groups()) == 0:
+          continue
+        ver_str = matches.group(1)
         ver = convert_version_to_int(ver_str) if len(ver_str) else 0
         if ver > highest_ver[0]:
           highest_ver = [ver, ver_str, lib_file]
@@ -1067,7 +1074,7 @@ def set_tf_tensorrt_install_path(environ_cp):
           break
 
     # Reset and Retry
-    if len(possible_files):
+    if possible_files:
       print('TensorRT libraries found in one the following directories',
             'are not compatible with selected cuda and cudnn installations')
       print(trt_install_path)
@@ -1076,7 +1083,8 @@ def set_tf_tensorrt_install_path(environ_cp):
       if search_result:
         print(libnvinfer_path_from_ldconfig)
     else:
-      print('Invalid path to TensorRT. None of the following files can be found:')
+      print(
+          'Invalid path to TensorRT. None of the following files can be found:')
       print(trt_install_path)
       print(os.path.join(trt_install_path, 'lib'))
       print(os.path.join(trt_install_path, 'lib64'))
@@ -1095,6 +1103,81 @@ def set_tf_tensorrt_install_path(environ_cp):
   write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
 
 
+def set_tf_nccl_install_path(environ_cp):
+  """Set NCCL_INSTALL_PATH and TF_NCCL_VERSION.
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently NCCL is only supported on Linux platforms.')
+
+  ask_nccl_version = (
+      'Please specify the NCCL version you want to use. '
+      '[Leave empty to default to NCCL %s]: ') % _DEFAULT_NCCL_VERSION
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    tf_nccl_version = get_from_env_or_user_or_default(
+        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, _DEFAULT_NCCL_VERSION)
+    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
+
+    if tf_nccl_version == '1':
+      break  # No need to get install path, NCCL 1 is a GitHub repo.
+
+    # TODO(csigg): Look with ldconfig first if we can find the library in paths
+    # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
+    # include directory. This is where the NCCL .deb packages install them.
+    # Then ask the user if we should use that. Instead of a single
+    # NCCL_INSTALL_PATH, pass separate NCCL_LIB_PATH and NCCL_HDR_PATH to
+    # nccl_configure.bzl
+    default_nccl_path = environ_cp.get('CUDA_TOOLKIT_PATH')
+    ask_nccl_path = (r'Please specify the location where NCCL %s library is '
+                     'installed. Refer to README.md for more details. [Default '
+                     'is %s]:') % (tf_nccl_version, default_nccl_path)
+    nccl_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'NCCL_INSTALL_PATH', ask_nccl_path, default_nccl_path)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    nccl_install_path = os.path.realpath(os.path.expanduser(nccl_install_path))
+    if is_windows() or is_cygwin():
+      nccl_install_path = cygpath(nccl_install_path)
+
+    if is_windows():
+      nccl_lib_path = 'lib/x64/nccl.lib'
+    elif is_linux():
+      nccl_lib_path = 'lib/libnccl.so.%s' % tf_nccl_version
+    elif is_macos():
+      nccl_lib_path = 'lib/libnccl.%s.dylib' % tf_nccl_version
+
+    nccl_lib_path = os.path.join(nccl_install_path, nccl_lib_path)
+    nccl_hdr_path = os.path.join(nccl_install_path, 'include/nccl.h')
+    if os.path.exists(nccl_lib_path) and os.path.exists(nccl_hdr_path):
+      # Set NCCL_INSTALL_PATH
+      environ_cp['NCCL_INSTALL_PATH'] = nccl_install_path
+      write_action_env_to_bazelrc('NCCL_INSTALL_PATH', nccl_install_path)
+      break
+
+    # Reset and Retry
+    print('Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
+          'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
+                                              nccl_hdr_path))
+
+    environ_cp['TF_NCCL_VERSION'] = ''
+  else:
+    raise UserInputError('Invalid TF_NCCL setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TF_NCCL_VERSION
+  environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
+  write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1143,6 +1226,9 @@ def set_tf_cuda_compute_capabilities(environ_cp):
         ask_cuda_compute_capabilities, default_cuda_compute_capabilities)
     # Check whether all capabilities from the input is valid
     all_valid = True
+    # Remove all whitespace characters before splitting the string
+    # that users may insert by accident, as this will result in error 
+    tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split())
     for compute_capability in tf_cuda_compute_capabilities.split(','):
       m = re.match('[0-9]+.[0-9]+', compute_capability)
       if not m:
@@ -1372,7 +1458,7 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.5.4')
+  check_bazel_version('0.10.0')
 
   reset_tf_configure_bazelrc(args.workspace)
   cleanup_makefile()
@@ -1389,6 +1475,9 @@ def main():
     environ_cp['TF_NEED_OPENCL'] = '0'
     environ_cp['TF_CUDA_CLANG'] = '0'
     environ_cp['TF_NEED_TENSORRT'] = '0'
+    # TODO(ibiryukov): Investigate using clang as a cpu or cuda compiler on
+    # Windows.
+    environ_cp['TF_DOWNLOAD_CLANG'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_JEMALLOC'] = '0'
@@ -1403,7 +1492,7 @@ def main():
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
   set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
-                'with_kafka_support', False, 'kafka')
+                'with_kafka_support', True, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
@@ -1428,6 +1517,8 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
+      set_tf_nccl_install_path(environ_cp)
+
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
@@ -1436,16 +1527,8 @@ def main():
 
     set_tf_cuda_clang(environ_cp)
     if environ_cp.get('TF_CUDA_CLANG') == '1':
-      if not is_windows():
-        # Ask if we want to download clang release while building.
-        set_tf_download_clang(environ_cp)
-      else:
-        # We use bazel's generated crosstool on Windows and there is no
-        # way to provide downloaded toolchain for that yet.
-        # TODO(ibiryukov): Investigate using clang as a cuda compiler on
-        # Windows.
-        environ_cp['TF_DOWNLOAD_CLANG'] = '0'
-
+      # Ask whether we should download the clang toolchain.
+      set_tf_download_clang(environ_cp)
       if environ_cp.get('TF_DOWNLOAD_CLANG') != '1':
         # Set up which clang we should use as the cuda / host compiler.
         set_clang_cuda_compiler_path(environ_cp)
@@ -1455,6 +1538,13 @@ def main():
       if not is_windows():
         set_gcc_host_compiler_path(environ_cp)
     set_other_cuda_vars(environ_cp)
+  else:
+    # CUDA not required. Ask whether we should download the clang toolchain and
+    # use it for the CPU build.
+    set_tf_download_clang(environ_cp)
+    if environ_cp.get('TF_DOWNLOAD_CLANG') == '1':
+      write_to_bazelrc('build --config=download_clang')
+      write_to_bazelrc('test --config=download_clang')
 
   set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
   if environ_cp.get('TF_NEED_MPI') == '1':
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index dc995d231d3e591771f801e28024a76610cdba26..f2ad16fa04f5beb6616c58c28d0f0c460c3e3a17 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -240,6 +240,13 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_kafka_support_windows_override",
+    define_values = {"with_kafka_support": "true"},
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "with_gcp_support_android_override",
     define_values = {"with_gcp_support": "true"},
@@ -394,309 +401,6 @@ package_group(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-py_library(
-    name = "tensorflow_py",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
-
-filegroup(
-    name = "all_opensource_files",
-    data = [
-        ":all_files",
-        "//tensorflow/c:all_files",
-        "//tensorflow/cc:all_files",
-        "//tensorflow/cc/saved_model:all_files",
-        "//tensorflow/cc/saved_model/python:all_files",
-        "//tensorflow/cc/tools:all_files",
-        "//tensorflow/compiler/aot:all_files",
-        "//tensorflow/compiler/aot/tests:all_files",
-        "//tensorflow/compiler/jit:all_files",
-        "//tensorflow/compiler/jit/graphcycles:all_files",
-        "//tensorflow/compiler/jit/kernels:all_files",
-        "//tensorflow/compiler/jit/legacy_flags:all_files",
-        "//tensorflow/compiler/jit/ops:all_files",
-        "//tensorflow/compiler/plugin:all_files",
-        "//tensorflow/compiler/tests:all_files",
-        "//tensorflow/compiler/tf2xla:all_files",
-        "//tensorflow/compiler/tf2xla/cc:all_files",
-        "//tensorflow/compiler/tf2xla/kernels:all_files",
-        "//tensorflow/compiler/tf2xla/lib:all_files",
-        "//tensorflow/compiler/tf2xla/ops:all_files",
-        "//tensorflow/compiler/xla:all_files",
-        "//tensorflow/compiler/xla/client:all_files",
-        "//tensorflow/compiler/xla/client/lib:all_files",
-        "//tensorflow/compiler/xla/legacy_flags:all_files",
-        "//tensorflow/compiler/xla/python:all_files",
-        "//tensorflow/compiler/xla/service:all_files",
-        "//tensorflow/compiler/xla/service/cpu:all_files",
-        "//tensorflow/compiler/xla/service/gpu:all_files",
-        "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend:all_files",
-        "//tensorflow/compiler/xla/service/interpreter:all_files",
-        "//tensorflow/compiler/xla/service/llvm_ir:all_files",
-        "//tensorflow/compiler/xla/tests:all_files",
-        "//tensorflow/compiler/xla/tools:all_files",
-        "//tensorflow/compiler/xla/tools/parser:all_files",
-        "//tensorflow/contrib:all_files",
-        "//tensorflow/contrib/all_reduce:all_files",
-        "//tensorflow/contrib/android:all_files",
-        "//tensorflow/contrib/batching:all_files",
-        "//tensorflow/contrib/bayesflow:all_files",
-        "//tensorflow/contrib/boosted_trees:all_files",
-        "//tensorflow/contrib/boosted_trees/estimator_batch:all_files",
-        "//tensorflow/contrib/boosted_trees/lib:all_files",
-        "//tensorflow/contrib/boosted_trees/proto:all_files",
-        "//tensorflow/contrib/boosted_trees/resources:all_files",
-        "//tensorflow/contrib/cloud:all_files",
-        "//tensorflow/contrib/cloud/kernels:all_files",
-        "//tensorflow/contrib/cluster_resolver:all_files",
-        "//tensorflow/contrib/coder:all_files",
-        "//tensorflow/contrib/compiler:all_files",
-        "//tensorflow/contrib/copy_graph:all_files",
-        "//tensorflow/contrib/crf:all_files",
-        "//tensorflow/contrib/cudnn_rnn:all_files",
-        "//tensorflow/contrib/data:all_files",
-        "//tensorflow/contrib/data/kernels:all_files",
-        "//tensorflow/contrib/data/python/kernel_tests:all_files",
-        "//tensorflow/contrib/data/python/ops:all_files",
-        "//tensorflow/contrib/decision_trees/proto:all_files",
-        "//tensorflow/contrib/deprecated:all_files",
-        "//tensorflow/contrib/distributions:all_files",
-        "//tensorflow/contrib/eager/proto:all_files",
-        "//tensorflow/contrib/eager/python:all_files",
-        "//tensorflow/contrib/estimator:all_files",
-        "//tensorflow/contrib/factorization:all_files",
-        "//tensorflow/contrib/factorization/examples:all_files",
-        "//tensorflow/contrib/factorization/kernels:all_files",
-        "//tensorflow/contrib/feature_column:all_files",
-        "//tensorflow/contrib/ffmpeg:all_files",
-        "//tensorflow/contrib/ffmpeg/default:all_files",
-        "//tensorflow/contrib/framework:all_files",
-        "//tensorflow/contrib/fused_conv:all_files",
-        "//tensorflow/contrib/gan:all_files",
-        "//tensorflow/contrib/gdr:all_files",
-        "//tensorflow/contrib/graph_editor:all_files",
-        "//tensorflow/contrib/grid_rnn:all_files",
-        "//tensorflow/contrib/hooks:all_files",
-        "//tensorflow/contrib/hvx/clock_cycle_profiling:all_files",
-        "//tensorflow/contrib/hvx/hvx_ops_support_checker:all_files",
-        "//tensorflow/contrib/image:all_files",
-        "//tensorflow/contrib/input_pipeline:all_files",
-        "//tensorflow/contrib/input_pipeline/kernels:all_files",
-        "//tensorflow/contrib/integrate:all_files",
-        "//tensorflow/contrib/keras:all_files",
-        "//tensorflow/contrib/kernel_methods:all_files",
-        "//tensorflow/contrib/kfac:all_files",
-        "//tensorflow/contrib/kfac/examples:all_files",
-        "//tensorflow/contrib/kfac/examples/tests:all_files",
-        "//tensorflow/contrib/kfac/python/kernel_tests:all_files",
-        "//tensorflow/contrib/kfac/python/ops:all_files",
-        "//tensorflow/contrib/labeled_tensor:all_files",
-        "//tensorflow/contrib/layers:all_files",
-        "//tensorflow/contrib/layers/kernels:all_files",
-        "//tensorflow/contrib/learn:all_files",
-        "//tensorflow/contrib/learn/python/learn/datasets:all_files",
-        "//tensorflow/contrib/legacy_seq2seq:all_files",
-        "//tensorflow/contrib/libsvm:all_files",
-        "//tensorflow/contrib/linalg:all_files",
-        "//tensorflow/contrib/linear_optimizer:all_files",
-        "//tensorflow/contrib/lite:all_files",
-        "//tensorflow/contrib/lite/java:all_files",
-        "//tensorflow/contrib/lite/java/demo/app/src/main:all_files",
-        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:all_files",
-        "//tensorflow/contrib/lite/java/src/main/native:all_files",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:all_files",
-        "//tensorflow/contrib/lite/kernels:all_files",
-        "//tensorflow/contrib/lite/kernels/internal:all_files",
-        "//tensorflow/contrib/lite/models/smartreply:all_files",
-        "//tensorflow/contrib/lite/nnapi:all_files",
-        "//tensorflow/contrib/lite/python:all_files",
-        "//tensorflow/contrib/lite/schema:all_files",
-        "//tensorflow/contrib/lite/testing:all_files",
-        "//tensorflow/contrib/lite/toco:all_files",
-        "//tensorflow/contrib/lite/toco/graph_transformations/tests:all_files",
-        "//tensorflow/contrib/lite/toco/python:all_files",
-        "//tensorflow/contrib/lite/toco/tensorflow_graph_matching:all_files",
-        "//tensorflow/contrib/lite/toco/tflite:all_files",
-        "//tensorflow/contrib/lite/tools:all_files",
-        "//tensorflow/contrib/lookup:all_files",
-        "//tensorflow/contrib/losses:all_files",
-        "//tensorflow/contrib/makefile:all_files",
-        "//tensorflow/contrib/memory_stats:all_files",
-        "//tensorflow/contrib/meta_graph_transform:all_files",
-        "//tensorflow/contrib/metrics:all_files",
-        "//tensorflow/contrib/model_pruning:all_files",
-        "//tensorflow/contrib/model_pruning/examples/cifar10:all_files",
-        "//tensorflow/contrib/nccl:all_files",
-        "//tensorflow/contrib/nearest_neighbor:all_files",
-        "//tensorflow/contrib/nn:all_files",
-        "//tensorflow/contrib/opt:all_files",
-        "//tensorflow/contrib/periodic_resample:all_files",
-        "//tensorflow/contrib/predictor:all_files",
-        "//tensorflow/contrib/py2tf:all_files",
-        "//tensorflow/contrib/py2tf/converters:all_files",
-        "//tensorflow/contrib/py2tf/impl:all_files",
-        "//tensorflow/contrib/py2tf/pyct:all_files",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
-        "//tensorflow/contrib/py2tf/utils:all_files",
-        "//tensorflow/contrib/quantize:all_files",
-        "//tensorflow/contrib/receptive_field:all_files",
-        "//tensorflow/contrib/reduce_slice_ops:all_files",
-        "//tensorflow/contrib/remote_fused_graph/pylib:all_files",
-        "//tensorflow/contrib/resampler:all_files",
-        "//tensorflow/contrib/rnn:all_files",
-        "//tensorflow/contrib/saved_model:all_files",
-        "//tensorflow/contrib/saved_model/cc/saved_model:all_files",
-        "//tensorflow/contrib/seq2seq:all_files",
-        "//tensorflow/contrib/session_bundle:all_files",
-        "//tensorflow/contrib/session_bundle/example:all_files",
-        "//tensorflow/contrib/signal:all_files",
-        "//tensorflow/contrib/slim:all_files",
-        "//tensorflow/contrib/slim/python/slim/data:all_files",
-        "//tensorflow/contrib/slim/python/slim/nets:all_files",
-        "//tensorflow/contrib/solvers:all_files",
-        "//tensorflow/contrib/sparsemax:all_files",
-        "//tensorflow/contrib/specs:all_files",
-        "//tensorflow/contrib/staging:all_files",
-        "//tensorflow/contrib/stat_summarizer:all_files",
-        "//tensorflow/contrib/stateless:all_files",
-        "//tensorflow/contrib/summary:all_files",
-        "//tensorflow/contrib/tensor_forest:all_files",
-        "//tensorflow/contrib/tensor_forest/hybrid:all_files",
-        "//tensorflow/contrib/tensor_forest/kernels/v4:all_files",
-        "//tensorflow/contrib/tensor_forest/proto:all_files",
-        "//tensorflow/contrib/tensorboard:all_files",
-        "//tensorflow/contrib/tensorboard/db:all_files",
-        "//tensorflow/contrib/tensorrt:all_files",
-        "//tensorflow/contrib/testing:all_files",
-        "//tensorflow/contrib/text:all_files",
-        "//tensorflow/contrib/tfprof:all_files",
-        "//tensorflow/contrib/timeseries:all_files",
-        "//tensorflow/contrib/timeseries/examples:all_files",
-        "//tensorflow/contrib/timeseries/python/timeseries:all_files",
-        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:all_files",
-        "//tensorflow/contrib/tpu:all_files",
-        "//tensorflow/contrib/tpu/profiler:all_files",
-        "//tensorflow/contrib/tpu/proto:all_files",
-        "//tensorflow/contrib/training:all_files",
-        "//tensorflow/contrib/util:all_files",
-        "//tensorflow/contrib/verbs:all_files",
-        "//tensorflow/core:all_files",
-        "//tensorflow/core/api_def:all_files",
-        "//tensorflow/core/debug:all_files",
-        "//tensorflow/core/distributed_runtime:all_files",
-        "//tensorflow/core/distributed_runtime/rpc:all_files",
-        "//tensorflow/core/grappler:all_files",
-        "//tensorflow/core/grappler/clusters:all_files",
-        "//tensorflow/core/grappler/costs:all_files",
-        "//tensorflow/core/grappler/inputs:all_files",
-        "//tensorflow/core/grappler/optimizers:all_files",
-        "//tensorflow/core/grappler/utils:all_files",
-        "//tensorflow/core/kernels:all_files",
-        "//tensorflow/core/kernels/batching_util:all_files",
-        "//tensorflow/core/kernels/data:all_files",
-        "//tensorflow/core/kernels/data/sql:all_files",
-        "//tensorflow/core/kernels/fuzzing:all_files",
-        "//tensorflow/core/kernels/hexagon:all_files",
-        "//tensorflow/core/kernels/neon:all_files",
-        "//tensorflow/core/lib/db:all_files",
-        "//tensorflow/core/ops/compat:all_files",
-        "//tensorflow/core/platform/cloud:all_files",
-        "//tensorflow/core/platform/default/build_config:all_files",
-        "//tensorflow/core/platform/hadoop:all_files",
-        "//tensorflow/core/platform/s3:all_files",
-        "//tensorflow/core/profiler:all_files",
-        "//tensorflow/core/profiler/internal:all_files",
-        "//tensorflow/core/profiler/internal/advisor:all_files",
-        "//tensorflow/core/util/ctc:all_files",
-        "//tensorflow/core/util/tensor_bundle:all_files",
-        "//tensorflow/examples/adding_an_op:all_files",
-        "//tensorflow/examples/android:all_files",
-        "//tensorflow/examples/benchmark:all_files",
-        "//tensorflow/examples/get_started/regression:all_files",
-        "//tensorflow/examples/how_tos/reading_data:all_files",
-        "//tensorflow/examples/image_retraining:all_files",
-        "//tensorflow/examples/label_image:all_files",
-        "//tensorflow/examples/learn:all_files",
-        "//tensorflow/examples/multibox_detector:all_files",
-        "//tensorflow/examples/saved_model:all_files",
-        "//tensorflow/examples/speech_commands:all_files",
-        "//tensorflow/examples/tutorials/estimators:all_files",
-        "//tensorflow/examples/tutorials/layers:all_files",
-        "//tensorflow/examples/tutorials/mnist:all_files",
-        "//tensorflow/examples/tutorials/monitors:all_files",
-        "//tensorflow/examples/tutorials/word2vec:all_files",
-        "//tensorflow/examples/wav_to_spectrogram:all_files",
-        "//tensorflow/go:all_files",
-        "//tensorflow/java:all_files",
-        "//tensorflow/java/src/main/java/org/tensorflow/examples:all_files",
-        "//tensorflow/java/src/main/native:all_files",
-        "//tensorflow/python:all_files",
-        "//tensorflow/python/data:all_files",
-        "//tensorflow/python/data/kernel_tests:all_files",
-        "//tensorflow/python/data/ops:all_files",
-        "//tensorflow/python/data/util:all_files",
-        "//tensorflow/python/debug:all_files",
-        "//tensorflow/python/eager:all_files",
-        "//tensorflow/python/estimator:all_files",
-        "//tensorflow/python/feature_column:all_files",
-        "//tensorflow/python/keras:all_files",
-        "//tensorflow/python/kernel_tests:all_files",
-        "//tensorflow/python/kernel_tests/distributions:all_files",
-        "//tensorflow/python/kernel_tests/linalg:all_files",
-        "//tensorflow/python/kernel_tests/random:all_files",
-        "//tensorflow/python/ops/distributions:all_files",
-        "//tensorflow/python/ops/linalg:all_files",
-        "//tensorflow/python/ops/losses:all_files",
-        "//tensorflow/python/profiler:all_files",
-        "//tensorflow/python/profiler/internal:all_files",
-        "//tensorflow/python/saved_model:all_files",
-        "//tensorflow/python/tools:all_files",
-        "//tensorflow/tools/api/generator:all_files",
-        "//tensorflow/tools/api/golden:all_files",
-        "//tensorflow/tools/api/lib:all_files",
-        "//tensorflow/tools/api/tests:all_files",
-        "//tensorflow/tools/benchmark:all_files",
-        "//tensorflow/tools/build_info:all_files",
-        "//tensorflow/tools/ci_build/gpu_build:all_files",
-        "//tensorflow/tools/common:all_files",
-        "//tensorflow/tools/compatibility:all_files",
-        "//tensorflow/tools/dist_test/server:all_files",
-        "//tensorflow/tools/docker:all_files",
-        "//tensorflow/tools/docker/notebooks:all_files",
-        "//tensorflow/tools/docs:all_files",
-        "//tensorflow/tools/git:all_files",
-        "//tensorflow/tools/graph_transforms:all_files",
-        "//tensorflow/tools/mlpbtxt:all_files",
-        "//tensorflow/tools/proto_text:all_files",
-        "//tensorflow/tools/quantization:all_files",
-        "//tensorflow/tools/test:all_files",
-        "//tensorflow/user_ops:all_files",
-        "//third_party/eigen3:all_files",
-        "//third_party/fft2d:all_files",
-        "//third_party/flatbuffers:all_files",
-        "//third_party/hadoop:all_files",
-        "//third_party/sycl:all_files",
-        "//third_party/sycl/sycl:all_files",
-    ],
-    visibility = ["//visibility:public"],
-)
-
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
@@ -746,11 +450,12 @@ tf_cc_shared_object(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:framework_internal_impl",
+        "//tensorflow/core:gpu_runtime_impl",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
         "//tensorflow/core:lib_internal_impl",
-        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/stream_executor:stream_executor_impl",
-        "//tensorflow/core:gpu_runtime_impl",
     ] + tf_additional_binary_deps(),
 )
 
@@ -773,7 +478,7 @@ tf_cc_shared_object(
     linkopts = select({
         "//tensorflow:darwin": [
             "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "//tensorflow/c:exported_symbols.lds",
+            "$(location //tensorflow/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflow.so",
         ],
         "//tensorflow:windows": [],
@@ -782,11 +487,12 @@ tf_cc_shared_object(
             "-z defs",
             "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "//tensorflow/c:version_script.lds",
+            "$(location //tensorflow/c:version_script.lds)",
         ],
     }),
     deps = [
         "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_experimental",
         "//tensorflow/c:exported_symbols.lds",
         "//tensorflow/c:version_script.lds",
         "//tensorflow/c/eager:c_api",
@@ -799,7 +505,7 @@ tf_cc_shared_object(
     linkopts = select({
         "//tensorflow:darwin": [
             "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "//tensorflow:tf_exported_symbols.lds",
+            "$(location //tensorflow:tf_exported_symbols.lds)",
         ],
         "//tensorflow:windows": [],
         "//tensorflow:windows_msvc": [],
@@ -807,7 +513,7 @@ tf_cc_shared_object(
             "-z defs",
             "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "//tensorflow:tf_version_script.lds",
+            "$(location //tensorflow:tf_version_script.lds)",
         ],
     }),
     deps = [
@@ -829,3 +535,14 @@ exports_files(
         "tf_exported_symbols.lds",
     ],
 )
+
+py_library(
+    name = "tensorflow_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python",
+        "//tensorflow/tools/api/generator:python_api",
+    ],
+)
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 78ad6aec19f3bbbfcb389012ac1577573b3e4901..c8683e3976c90add3f1f54d8e575c798327e9273 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -20,14 +20,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # pylint: disable=wildcard-import
-from tensorflow.python import *  # pylint: disable=redefined-builtin
+from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 5dfb743681255d8c03e91ea43fd441d94fdee59d..8a9301d584775cff3ae315e6fd856b00d1734248 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -17,7 +17,10 @@ load(
 
 filegroup(
     name = "headers",
-    srcs = ["c_api.h"],
+    srcs = [
+        "c_api.h",
+        "c_api_experimental.h",
+    ],
     visibility = ["//tensorflow:__subpackages__"],
 )
 
@@ -31,6 +34,8 @@ filegroup(
         exclude = [
             "c_api_experimental.cc",
             "c_api_experimental.h",
+            "python_api.cc",
+            "python_api.h",
             "*test*",
         ],
     ),
@@ -113,6 +118,11 @@ tf_cuda_library(
         ":c_api",
         ":c_api_internal",
         "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
+        "//tensorflow/contrib/tpu:all_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -209,6 +219,27 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = ["c_api_experimental_test.cc"],
+    data = ["testdata/tf_record"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api_experimental",
+        ":c_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "c_api_function_test",
     size = "small",
@@ -253,20 +284,7 @@ tf_cuda_library(
     deps = [
         ":c_api",
         ":c_api_internal",
+        # TODO(b/74620627): remove when _USE_C_SHAPES is removed
+        "//tensorflow/python:cpp_shape_inference_proto_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 85f1d1639b4d09f2de77d326481a86ec246270d0..18eeb2816807ec9986999cfc2c9a4c0f032683c0 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #endif
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/eval_const_tensor.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -62,6 +63,7 @@ limitations under the License.
 // brain namespace because we are defining 'extern "C"' functions.
 using tensorflow::AllocationDescription;
 using tensorflow::DataType;
+using tensorflow::ExtendSessionGraphHelper;
 using tensorflow::Graph;
 using tensorflow::GraphDef;
 using tensorflow::mutex_lock;
@@ -73,6 +75,7 @@ using tensorflow::NodeBuilder;
 using tensorflow::NodeDef;
 using tensorflow::OpDef;
 using tensorflow::OpRegistry;
+using tensorflow::OutputTensor;
 using tensorflow::PartialTensorShape;
 using tensorflow::RunMetadata;
 using tensorflow::RunOptions;
@@ -638,17 +641,17 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
 }
 
 void RecordMutation(TF_Graph* graph, const TF_Operation& op,
-                    const char* mutation_type)
-    EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+                    const char* mutation_type) {
   // If any session has already run this node_id, mark this session as
   // unrunnable.
   for (auto it : graph->sessions) {
+    mutex_lock session_lock(it.first->mu);
     if (it.first->last_num_graph_nodes > op.node.id()) {
-      it.second = FailedPrecondition(
+      it.second = strings::StrCat(
           "Operation '", op.node.DebugString(), "' was changed by ",
           mutation_type,
-          " after it was run by a session. Nodes can be mutated "
-          "only before they are executed by a session. Either don't modify "
+          " after it was run by a session. This mutation will have no effect, "
+          "and will trigger an error in the future. Either don't modify "
           "nodes after running them or create a new session.");
     }
   }
@@ -708,6 +711,61 @@ void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output,
 Status LoadLibrary(const char* library_filename, void** result,
                    const void** buf, size_t* len);
 
+// TODO(josh11b,mrry): Change Session to be able to use a Graph*
+// directly, instead of requiring us to serialize to a GraphDef and
+// call Session::Extend().
+bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
+  if (session->graph != nullptr) {
+    // Take the graph lock before the session lock to avoid deadlock. This is
+    // safe since session->graph does not change.
+    session->graph->mu.lock();
+    mutex_lock session_lock(session->mu);
+    const Graph& graph = session->graph->graph;
+
+    const string& mutation_warning = session->graph->sessions[session];
+    if (!mutation_warning.empty()) {
+      // TODO(b/74949947): turn this back into an error status
+      LOG(WARNING) << mutation_warning;
+      session->graph->sessions[session].clear();
+    }
+
+    const auto num_nodes = graph.num_node_ids();
+    if (session->last_num_graph_nodes < num_nodes) {
+      status->status = tensorflow::ValidateNoCycles(session->graph->graph);
+      if (!status->status.ok()) {
+        session->graph->mu.unlock();
+        return false;
+      }
+
+      GraphDef graph_def;
+      *graph_def.mutable_versions() = graph.versions();
+      // Fill graph_def with nodes with ids in the range
+      // [session->last_num_graph_nodes, num_nodes), that is the nodes
+      // added since the last TF_SessionRun() call.
+      for (auto id = session->last_num_graph_nodes; id < num_nodes; ++id) {
+        Node* const node = graph.FindNodeId(id);
+        if (node != nullptr && node->IsOp()) {
+          NodeDef* const node_def = graph_def.add_node();
+          *node_def = node->def();
+        }
+      }
+      *graph_def.mutable_library() = graph.flib_def().ToProto();
+      session->graph->mu.unlock();
+      status->status = session->session->Extend(graph_def);
+      if (!status->status.ok()) {
+        // Contract is we always delete input_values[i].
+        return false;
+      }
+      // Note: session->session is not modified if Extend() fails, so
+      // we only set last_num_graph_nodes if it succeeds.
+      session->last_num_graph_nodes = num_nodes;
+    } else {
+      session->graph->mu.unlock();
+    }
+  }
+  return true;
+}
+
 }  // namespace tensorflow
 
 static void TF_Run_Setup(int noutputs, TF_Tensor** c_outputs,
@@ -2408,11 +2466,7 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,
 // TF_Session functions ----------------------------------------------
 
 TF_Session::TF_Session(tensorflow::Session* s, TF_Graph* g)
-    : session(s), graph(g), last_num_graph_nodes(0), device_mgr(nullptr) {
-  if (s->LocalDeviceManager(&device_mgr).ok()) {
-    devices = device_mgr->ListDevices();
-  }
-}
+    : session(s), graph(g), last_num_graph_nodes(0), extend_before_run(true) {}
 
 TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
                           TF_Status* status) {
@@ -2422,7 +2476,7 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
     TF_Session* new_session = new TF_Session(session, graph);
     if (graph != nullptr) {
       mutex_lock l(graph->mu);
-      graph->sessions[new_session] = Status::OK();
+      graph->sessions[new_session] = "";
     }
     return new_session;
   } else {
@@ -2488,7 +2542,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 
   TF_Session* session = new TF_Session(bundle.session.release(), graph);
 
-  graph->sessions[session] = Status::OK();
+  graph->sessions[session] = "";
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
 #endif  // __ANDROID__
@@ -2512,58 +2566,6 @@ void TF_DeleteSession(TF_Session* s, TF_Status* status) {
   delete s;
 }
 
-// TODO(josh11b,mrry): Change Session to be able to use a Graph*
-// directly, instead of requiring us to serialize to a GraphDef and
-// call Session::Extend().
-static bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
-  if (session->graph != nullptr) {
-    mutex_lock session_lock(session->mu);
-    session->graph->mu.lock();
-    const Graph& graph = session->graph->graph;
-
-    status->status = session->graph->sessions[session];
-    if (!status->status.ok()) {
-      session->graph->mu.unlock();
-      return false;
-    }
-
-    const auto num_nodes = graph.num_node_ids();
-    if (session->last_num_graph_nodes < num_nodes) {
-      status->status = tensorflow::ValidateNoCycles(session->graph->graph);
-      if (!status->status.ok()) {
-        session->graph->mu.unlock();
-        return false;
-      }
-
-      GraphDef graph_def;
-      *graph_def.mutable_versions() = graph.versions();
-      // Fill graph_def with nodes with ids in the range
-      // [session->last_num_graph_nodes, num_nodes), that is the nodes
-      // added since the last TF_SessionRun() call.
-      for (auto id = session->last_num_graph_nodes; id < num_nodes; ++id) {
-        Node* const node = graph.FindNodeId(id);
-        if (node != nullptr && node->IsOp()) {
-          NodeDef* const node_def = graph_def.add_node();
-          *node_def = node->def();
-        }
-      }
-      *graph_def.mutable_library() = graph.flib_def().ToProto();
-      session->graph->mu.unlock();
-      status->status = session->session->Extend(graph_def);
-      if (!status->status.ok()) {
-        // Contract is we always delete input_values[i].
-        return false;
-      }
-      // Note: session->session is not modified if Extend() fails, so
-      // we only set last_num_graph_nodes if it succeeds.
-      session->last_num_graph_nodes = num_nodes;
-    } else {
-      session->graph->mu.unlock();
-    }
-  }
-  return true;
-}
-
 void TF_SessionRun(TF_Session* session, const TF_Buffer* run_options,
                    const TF_Output* inputs, TF_Tensor* const* input_values,
                    int ninputs, const TF_Output* outputs,
@@ -2573,7 +2575,8 @@ void TF_SessionRun(TF_Session* session, const TF_Buffer* run_options,
   // TODO(josh11b,mrry): Change Session to be able to use a Graph*
   // directly, instead of requiring us to serialize to a GraphDef and
   // call Session::Extend().
-  if (!ExtendSessionGraphHelper(session, status)) {
+  if (session->extend_before_run &&
+      !ExtendSessionGraphHelper(session, status)) {
     return;
   }
 
@@ -2610,7 +2613,8 @@ void TF_SessionPRunSetup(TF_Session* session, const TF_Output* inputs,
                          const char** handle, TF_Status* status) {
   *handle = nullptr;
 
-  if (!ExtendSessionGraphHelper(session, status)) {
+  if (session->extend_before_run &&
+      !ExtendSessionGraphHelper(session, status)) {
     return;
   }
 
@@ -2653,7 +2657,8 @@ void TF_SessionPRun(TF_Session* session, const char* handle,
   // TODO(josh11b,mrry): Change Session to be able to use a Graph*
   // directly, instead of requiring us to serialize to a GraphDef and
   // call Session::Extend().
-  if (!ExtendSessionGraphHelper(session, status)) {
+  if (session->extend_before_run &&
+      !ExtendSessionGraphHelper(session, status)) {
     return;
   }
 
@@ -2682,6 +2687,24 @@ void TF_SessionPRun(TF_Session* session, const char* handle,
                 output_values, target_names, nullptr, status);
 }
 
+unsigned char TF_TryEvaluateConstant(TF_Graph* graph, TF_Output output,
+                                     TF_Tensor** result, TF_Status* status) {
+  *result = nullptr;
+  mutex_lock l(graph->mu);
+  OutputTensor tensor(&output.oper->node, output.index);
+  bool evaluated;
+  Tensor result_tensor;
+  status->status = EvaluateConstantTensor(
+      tensor, graph->refiner, *graph->graph.op_registry(),
+      graph->graph.versions().producer(), &evaluated, &result_tensor);
+  if (evaluated) {
+    DCHECK(status->status.ok());
+    *result = TF_TensorFromTensor(result_tensor, status);
+    if (!status->status.ok()) evaluated = false;
+  }
+  return evaluated;
+}
+
 TF_ApiDefMap* TF_NewApiDefMap(TF_Buffer* op_list_buffer, TF_Status* status) {
   tensorflow::OpList op_list;
   if (!op_list.ParseFromArray(op_list_buffer->data, op_list_buffer->length)) {
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index ad592ef70961ef427bfe9fd322a82bd64df7f9f1..c8594347451dffd465d7fa926cc53818dc9e38d4 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -72,7 +72,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -80,7 +80,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
@@ -1275,13 +1275,22 @@ TF_CAPI_EXPORT extern void TF_FunctionGetAttrValueProto(
 // Deleting a function does not remove it from any graphs it was copied to.
 TF_CAPI_EXPORT extern void TF_DeleteFunction(TF_Function* func);
 
+// Attempts to evaluate `output`. This will only be possible if `output` doesn't
+// depend on any graph inputs (this function is safe to call if this isn't the
+// case though).
+//
+// If the evaluation is successful, this function returns true and `output`s
+// value is returned in `result`. Otherwise returns false. An error status is
+// returned if something is wrong with the graph or input. Note that this may
+// return false even if no error status is set.
+TF_CAPI_EXPORT extern unsigned char TF_TryEvaluateConstant(TF_Graph* graph,
+                                                           TF_Output output,
+                                                           TF_Tensor** result,
+                                                           TF_Status* status);
+
 // TODO(josh11b): Register OpDef, available to all operations added
 // to this graph.
 
-// The following two may both benefit from a subgraph-definition API
-// that re-uses most of the graph-definition API.
-// TODO(andydavis): Add functions to a graph.
-
 // --------------------------------------------------------------------------
 // API for driving Graph execution.
 
@@ -1487,7 +1496,8 @@ TF_CAPI_EXPORT extern int TF_DeviceListCount(const TF_DeviceList* list);
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieves the type of the device at the given index.
 //
@@ -1497,14 +1507,15 @@ TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieve the amount of memory associated with a given device.
 //
 // If index is out of bounds, an error code will be set in the status object,
 // and -1 will be returned.
 TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes(
-    const TF_DeviceList* list, int index, TF_Status*);
+    const TF_DeviceList* list, int index, TF_Status* status);
 
 // --------------------------------------------------------------------------
 // Load plugins containing custom ops and kernels
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index be7f85a5bb06dce84579b109d506ded049042b50..82dbd3cdbc6e8fb0c6fbcddb33b6a95c87a83225 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -17,8 +17,27 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
+using tensorflow::FunctionDef;
+using tensorflow::Node;
+using tensorflow::NodeBuilder;
+using tensorflow::Status;
+
+namespace {
+typedef std::unique_ptr<TF_Function, decltype(&TF_DeleteFunction)>
+    UniqueFuncPtr;
+}
+
+// struct TF_Operation { tensorflow::Node node; };
+static TF_Operation* ToTF_Operation(Node* node) {
+  return static_cast<TF_Operation*>(static_cast<void*>(node));
+}
+
 void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
   tensorflow::ConfigProto& config = options->options.config;
   auto* optimizer_options =
@@ -37,3 +56,8354 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
     optimizer_options->set_global_jit_level(tensorflow::OptimizerOptions::OFF);
   }
 }
+
+const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
+  tensorflow::mutex_lock c(graph->mu);
+  const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString();
+  *len = debug_str.size();
+  char* ret = static_cast<char*>(malloc(*len + 1));
+  memcpy(ret, debug_str.c_str(), *len + 1);
+  return ret;
+}
+
+// On success, returns a set of TF_Function instances from `text_proto` of
+// GraphDef type. These functions must be deleted by calling TF_DeleteFunction.
+//
+// If `mutate_proto_func` is non-NULL, run it over each FunctionDef proto,
+// before creating a TF_Function out of the possibly mutated proto.
+static std::vector<UniqueFuncPtr> CreateFunctionsFromTextProto(
+    const char* text_proto,
+    std::function<void(FunctionDef*)>* mutate_proto_func, TF_Status* status) {
+  tensorflow::GraphDef gdef;
+  if (!tensorflow::protobuf::TextFormat::ParseFromString(text_proto, &gdef)) {
+    status->status = tensorflow::errors::Internal(
+        "Invalid text proto for GraphDef: ", text_proto);
+    return {};
+  }
+  const auto& fdef_lib = gdef.library();
+  if (fdef_lib.gradient_size() > 0) {
+    status->status = tensorflow::errors::Internal(
+        "GradientDef is not supported in reading Dataset related functions: ",
+        text_proto);
+    return {};
+  }
+  std::vector<UniqueFuncPtr> ret;
+  for (const FunctionDef& fdef : fdef_lib.function()) {
+    // Make a copy so that we can mutate it.
+    FunctionDef fdef_to_load = fdef;
+    if (mutate_proto_func) {
+      (*mutate_proto_func)(&fdef_to_load);
+    }
+    VLOG(1) << "Adding func to graph: " << fdef_to_load.DebugString();
+    std::vector<char> binary_proto_buf(fdef_to_load.ByteSizeLong());
+    fdef_to_load.SerializeToArray(binary_proto_buf.data(),
+                                  binary_proto_buf.size());
+    TF_Function* func = TF_FunctionImportFunctionDef(
+        binary_proto_buf.data(), binary_proto_buf.size(), status);
+    if (!status->status.ok()) return {};
+    ret.push_back(UniqueFuncPtr(func, TF_DeleteFunction));
+  }
+  return ret;
+}
+
+//  On success, returns a newly created TF_Function instance encoding a dataset
+//  node stack that returns a sequence of 3 floats, and sets `dataset_name` to
+//  the created dataset name. The returned function must be deleted by calling
+//  TF_DeleteFunction.
+static UniqueFuncPtr CreateFakeDatasetFunction(std::string* dataset_name,
+                                               TF_Status* status) {
+  const char* func_def = R"PREFIX(
+library {
+  function {
+    signature {
+      name: "_make_dataset_d8de2712"
+      output_arg {
+        name: "TensorSliceDataset"
+        type: DT_VARIANT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "TensorSliceDataset/tensors/component_0"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 3
+              }
+            }
+       tensor_content: "\000\000(B\000\000,B\000\0000B"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "TensorSliceDataset"
+      op: "TensorSliceDataset"
+      input: "TensorSliceDataset/tensors/component_0:output:0"
+      attr {
+        key: "Toutput_types"
+        value {
+          list {
+            type: DT_FLOAT
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+    }
+    ret {
+      key: "TensorSliceDataset"
+      value: "TensorSliceDataset:handle:0"
+    }
+  }
+}
+)PREFIX";
+
+  *dataset_name = "_make_dataset_d8de2712";
+  auto functions = CreateFunctionsFromTextProto(
+      func_def, /*mutate_proto_func*/ nullptr, status);
+  DCHECK_EQ(functions.size(), 1);
+  return std::move(functions[0]);
+}
+
+#if not defined(PLATFORM_WINDOWS)
+//  On success, returns a set of TF_Function instances encoding a dataset
+//  node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and
+//  sets `dataset_name` to the created dataset name. The returned functions must
+//  be deleted by calling TF_DeleteFunction.
+static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
+    const char* file_path, std::string* dataset_name, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return std::vector<UniqueFuncPtr>();
+#else
+  const char* func_def = R"PREFIX(
+library {
+  function {
+    signature {
+      name: "tf_map_func_91295dea"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "FlatMapDataset"
+        type: DT_VARIANT
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+      is_stateful: true
+    }
+    node_def {
+      name: "flat_filenames/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "flat_filenames"
+      op: "Reshape"
+      input: "arg0"
+      input: "flat_filenames/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "TensorSliceDataset"
+      op: "TensorSliceDataset"
+      input: "flat_filenames:output:0"
+      attr {
+        key: "Toutput_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FlatMapDataset"
+      op: "FlatMapDataset"
+      input: "TensorSliceDataset:handle:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_0cc8c35b"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+    }
+    ret {
+      key: "FlatMapDataset"
+      value: "FlatMapDataset:handle:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_map_func_0cc8c35b"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "TFRecordDataset"
+        type: DT_VARIANT
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+      is_stateful: true
+    }
+    node_def {
+      name: "compression_type"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "buffer_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 8388608
+          }
+        }
+      }
+    }
+    node_def {
+      name: "TFRecordDataset"
+      op: "TFRecordDataset"
+      input: "arg0"
+      input: "compression_type:output:0"
+      input: "buffer_size:output:0"
+    }
+    ret {
+      key: "TFRecordDataset"
+      value: "TFRecordDataset:handle:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_map_func_74b6b15c"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "Reshape_1"
+        type: DT_FLOAT
+      }
+      output_arg {
+        name: "sub_1"
+        type: DT_INT32
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+      is_stateful: true
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/class/label"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/class/label:output:0"
+      input: "ParseSingleExample/Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/class/text"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_1/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_1"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/class/text:output:0"
+      input: "ParseSingleExample/Reshape_1/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/encoded"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_2/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_2"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/encoded:output:0"
+      input: "ParseSingleExample/Reshape_2/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/format"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "jpeg"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_3/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_3"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/format:output:0"
+      input: "ParseSingleExample/Reshape_3/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/ParseSingleExample"
+      op: "ParseSingleExample"
+      input: "arg0"
+      input: "ParseSingleExample/Reshape:output:0"
+      input: "ParseSingleExample/Reshape_1:output:0"
+      input: "ParseSingleExample/Reshape_2:output:0"
+      input: "ParseSingleExample/Reshape_3:output:0"
+      attr {
+        key: "Tdense"
+        value {
+          list {
+            type: DT_INT64
+            type: DT_STRING
+            type: DT_STRING
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "dense_keys"
+        value {
+          list {
+            s: "image/class/label"
+            s: "image/class/text"
+            s: "image/encoded"
+            s: "image/format"
+          }
+        }
+      }
+      attr {
+        key: "dense_shapes"
+        value {
+          list {
+            shape {
+            }
+            shape {
+            }
+            shape {
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "num_sparse"
+        value {
+          i: 5
+        }
+      }
+      attr {
+        key: "sparse_keys"
+        value {
+          list {
+            s: "image/object/bbox/xmax"
+            s: "image/object/bbox/xmin"
+            s: "image/object/bbox/ymax"
+            s: "image/object/bbox/ymin"
+            s: "image/object/class/label"
+          }
+        }
+      }
+      attr {
+        key: "sparse_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_FLOAT
+            type: DT_FLOAT
+            type: DT_FLOAT
+            type: DT_INT64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape"
+      op: "Reshape"
+      input: "ParseSingleExample/ParseSingleExample:dense_values:2"
+      input: "Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/Substr/pos"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/Substr/len"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/Substr"
+      op: "Substr"
+      input: "Reshape:output:0"
+      input: "decode_image/Substr/pos:output:0"
+      input: "decode_image/Substr/len:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Substr/pos"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Substr/len"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Substr"
+      op: "Substr"
+      input: "Reshape:output:0"
+      input: "decode_image/is_jpeg/Substr/pos:output:0"
+      input: "decode_image/is_jpeg/Substr/len:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Equal/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "\377\330\377"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Equal"
+      op: "Equal"
+      input: "decode_image/is_jpeg/Substr:output:0"
+      input: "decode_image/is_jpeg/Equal/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Switch"
+      op: "Switch"
+      input: "decode_image/is_jpeg/Equal:z:0"
+      input: "decode_image/is_jpeg/Equal:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/switch_t"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/switch_f"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/pred_id"
+      op: "Identity"
+      input: "decode_image/is_jpeg/Equal:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/check_jpeg_channels/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/check_jpeg_channels/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 4
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/check_jpeg_channels"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/check_jpeg_channels/x:output:0"
+      input: "decode_image/cond_jpeg/check_jpeg_channels/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Assert/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 1, 3) when decoding JPEG images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Assert/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 1, 3) when decoding JPEG images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Assert/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/check_jpeg_channels:z:0"
+      input: "decode_image/cond_jpeg/Assert/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/DecodeJpeg"
+      op: "DecodeJpeg"
+      input: "decode_image/cond_jpeg/DecodeJpeg/Switch:output_true:0"
+      input: "^decode_image/cond_jpeg/Assert/Assert"
+      attr {
+        key: "acceptable_fraction"
+        value {
+          f: 1.0
+        }
+      }
+      attr {
+        key: "channels"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "dct_method"
+        value {
+          s: ""
+        }
+      }
+      attr {
+        key: "fancy_upscaling"
+        value {
+          b: true
+        }
+      }
+      attr {
+        key: "ratio"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "try_recover_truncated"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/DecodeJpeg/Switch"
+      op: "Switch"
+      input: "Reshape:output:0"
+      input: "decode_image/cond_jpeg/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/is_png/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "\211PN"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/is_png"
+      op: "Equal"
+      input: "decode_image/cond_jpeg/is_png/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/is_png/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/is_png/Switch"
+      op: "Switch"
+      input: "decode_image/Substr:output:0"
+      input: "decode_image/cond_jpeg/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@decode_image/Substr"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/is_png:z:0"
+      input: "decode_image/cond_jpeg/is_png:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/switch_t"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/switch_f"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/pred_id"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/is_png:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/DecodePng"
+      op: "DecodePng"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng/Switch_1:output_true:0"
+      attr {
+        key: "channels"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "dtype"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/DecodePng/Switch"
+      op: "Switch"
+      input: "Reshape:output:0"
+      input: "decode_image/cond_jpeg/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/DecodePng/Switch_1"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/is_gif/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "GIF"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/is_gif"
+      op: "Equal"
+      input: "decode_image/cond_jpeg/cond_png/is_gif/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/is_gif/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/is_gif/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/is_png/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@decode_image/Substr"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/is_gif:z:0"
+      input: "decode_image/cond_jpeg/cond_png/is_gif:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/pred_id"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/is_gif:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/x:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 4
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/x:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/LogicalAnd"
+      op: "LogicalAnd"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1:z:0"
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding GIF images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding GIF images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/LogicalAnd:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif"
+      op: "DecodeGif"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch_1:output_true:0"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert"
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch_1"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/pos"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/len"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr"
+      op: "Substr"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/pos:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/len:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "BM"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp"
+      op: "Equal"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Unable to decode bytes as JPEG, PNG, GIF, or BMP"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Unable to decode bytes as JPEG, PNG, GIF, or BMP"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/x:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding BMP images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding BMP images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeBmp"
+      op: "DecodeBmp"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/Switch:output_false:0"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert"
+      attr {
+        key: "channels"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Merge"
+      op: "Merge"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeBmp:image:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif:image:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/Merge"
+      op: "Merge"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Merge:output:0"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng:image:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Merge"
+      op: "Merge"
+      input: "decode_image/cond_jpeg/cond_png/Merge:output:0"
+      input: "decode_image/cond_jpeg/DecodeJpeg:image:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "convert_image/Cast"
+      op: "Cast"
+      input: "decode_image/cond_jpeg/Merge:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "convert_image/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.00392156885937
+          }
+        }
+      }
+    }
+    node_def {
+      name: "convert_image"
+      op: "Mul"
+      input: "convert_image/Cast:y:0"
+      input: "convert_image/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Const"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 1
+              }
+              dim {
+                size: 1
+              }
+              dim {
+                size: 4
+              }
+            }
+            tensor_content: "\000\000\000\000\000\000\000\000\000\000\200?\000\000\200?"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/Shape"
+      op: "Shape"
+      input: "convert_image:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2/min_object_covered"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.10000000149
+          }
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2"
+      op: "SampleDistortedBoundingBoxV2"
+      input: "distorted_bounding_box_crop/Shape:output:0"
+      input: "Const:output:0"
+      input: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2/min_object_covered:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "area_range"
+        value {
+          list {
+            f: 0.0799999982119
+            f: 1.0
+          }
+        }
+      }
+      attr {
+        key: "aspect_ratio_range"
+        value {
+          list {
+            f: 0.75
+            f: 1.33333337307
+          }
+        }
+      }
+      attr {
+        key: "max_attempts"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "seed"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "seed2"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "use_image_if_no_bounding_boxes"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/Slice"
+      op: "Slice"
+      input: "convert_image:z:0"
+      input: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2:begin:0"
+      input: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2:size:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Shape"
+      op: "Shape"
+      input: "convert_image:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Shape_1"
+      op: "Shape"
+      input: "distorted_bounding_box_crop/Slice:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Equal"
+      op: "Equal"
+      input: "Shape:output:0"
+      input: "Shape_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Cast"
+      op: "Cast"
+      input: "Equal:z:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "Const_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Sum"
+      op: "Sum"
+      input: "Cast:y:0"
+      input: "Const_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "Tidx"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "keep_dims"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "GreaterEqual/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "GreaterEqual"
+      op: "GreaterEqual"
+      input: "Sum:output:0"
+      input: "GreaterEqual/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Switch"
+      op: "Switch"
+      input: "GreaterEqual:z:0"
+      input: "GreaterEqual:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/switch_t"
+      op: "Identity"
+      input: "cond/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/switch_f"
+      op: "Identity"
+      input: "cond/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/pred_id"
+      op: "Identity"
+      input: "GreaterEqual:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape"
+      op: "Shape"
+      input: "cond/Shape/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape/Switch"
+      op: "Switch"
+      input: "convert_image:z:0"
+      input: "cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@convert_image"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Cast"
+      op: "Cast"
+      input: "cond/Shape:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice"
+      op: "StridedSlice"
+      input: "cond/Cast:y:0"
+      input: "cond/strided_slice/stack:output:0"
+      input: "cond/strided_slice/stack_1:output:0"
+      input: "cond/strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1"
+      op: "StridedSlice"
+      input: "cond/Cast:y:0"
+      input: "cond/strided_slice_1/stack:output:0"
+      input: "cond/strided_slice_1/stack_1:output:0"
+      input: "cond/strided_slice_1/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Greater"
+      op: "Greater"
+      input: "cond/strided_slice:output:0"
+      input: "cond/strided_slice_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Switch"
+      op: "Switch"
+      input: "cond/Greater:z:0"
+      input: "cond/Greater:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/switch_t"
+      op: "Identity"
+      input: "cond/cond/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/switch_f"
+      op: "Identity"
+      input: "cond/cond/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/pred_id"
+      op: "Identity"
+      input: "cond/Greater:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/stack"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice/Switch:output_true:0"
+      input: "cond/cond/strided_slice/stack:output:0"
+      input: "cond/cond/strided_slice/stack_1:output:0"
+      input: "cond/cond/strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/Switch"
+      op: "Switch"
+      input: "cond/Cast:y:0"
+      input: "cond/cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@cond/Cast"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1/stack"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice/Switch:output_true:0"
+      input: "cond/cond/strided_slice_1/stack:output:0"
+      input: "cond/cond/strided_slice_1/stack_1:output:0"
+      input: "cond/cond/strided_slice_1/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/truediv"
+      op: "RealDiv"
+      input: "cond/cond/strided_slice:output:0"
+      input: "cond/cond/strided_slice_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul/y"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul"
+      op: "Mul"
+      input: "cond/cond/truediv:z:0"
+      input: "cond/cond/mul/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast/x/1"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast/x"
+      op: "Pack"
+      input: "cond/cond/mul:z:0"
+      input: "cond/cond/Cast/x/1:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast"
+      op: "Cast"
+      input: "cond/cond/Cast/x:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/stack"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice_2/Switch:output_false:0"
+      input: "cond/cond/strided_slice_2/stack:output:0"
+      input: "cond/cond/strided_slice_2/stack_1:output:0"
+      input: "cond/cond/strided_slice_2/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/Switch"
+      op: "Switch"
+      input: "cond/Cast:y:0"
+      input: "cond/cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@cond/Cast"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3/stack"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice_2/Switch:output_false:0"
+      input: "cond/cond/strided_slice_3/stack:output:0"
+      input: "cond/cond/strided_slice_3/stack_1:output:0"
+      input: "cond/cond/strided_slice_3/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/truediv_1"
+      op: "RealDiv"
+      input: "cond/cond/strided_slice_2:output:0"
+      input: "cond/cond/strided_slice_3:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul_1/y"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul_1"
+      op: "Mul"
+      input: "cond/cond/truediv_1:z:0"
+      input: "cond/cond/mul_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast_1/x/0"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast_1/x"
+      op: "Pack"
+      input: "cond/cond/Cast_1/x/0:output:0"
+      input: "cond/cond/mul_1:z:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast_1"
+      op: "Cast"
+      input: "cond/cond/Cast_1/x:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Merge"
+      op: "Merge"
+      input: "cond/cond/Cast_1:y:0"
+      input: "cond/cond/Cast:y:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic/images"
+      op: "Pack"
+      input: "cond/Shape/Switch:output_true:0"
+      attr {
+        key: "N"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic"
+      op: "ResizeBicubic"
+      input: "cond/ResizeBicubic/images:output:0"
+      input: "cond/cond/Merge:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "align_corners"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2"
+      op: "StridedSlice"
+      input: "cond/ResizeBicubic:resized_images:0"
+      input: "cond/strided_slice_2/stack:output:0"
+      input: "cond/strided_slice_2/stack_1:output:0"
+      input: "cond/strided_slice_2/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape_1"
+      op: "Shape"
+      input: "cond/strided_slice_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3"
+      op: "StridedSlice"
+      input: "cond/Shape_1:output:0"
+      input: "cond/strided_slice_3/stack:output:0"
+      input: "cond/strided_slice_3/stack_1:output:0"
+      input: "cond/strided_slice_3/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape_2"
+      op: "Shape"
+      input: "cond/strided_slice_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4"
+      op: "StridedSlice"
+      input: "cond/Shape_2:output:0"
+      input: "cond/strided_slice_4/stack:output:0"
+      input: "cond/strided_slice_4/stack_1:output:0"
+      input: "cond/strided_slice_4/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub"
+      op: "Sub"
+      input: "cond/strided_slice_3:output:0"
+      input: "cond/sub/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/add/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/add"
+      op: "Add"
+      input: "cond/sub:z:0"
+      input: "cond/add/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv/Cast"
+      op: "Cast"
+      input: "cond/add:z:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv/Cast_1"
+      op: "Cast"
+      input: "cond/truediv/y:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv"
+      op: "RealDiv"
+      input: "cond/truediv/Cast:y:0"
+      input: "cond/truediv/Cast_1:y:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub_1"
+      op: "Sub"
+      input: "cond/strided_slice_4:output:0"
+      input: "cond/sub_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/add_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/add_1"
+      op: "Add"
+      input: "cond/sub_1:z:0"
+      input: "cond/add_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1/Cast"
+      op: "Cast"
+      input: "cond/add_1:z:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1/Cast_1"
+      op: "Cast"
+      input: "cond/truediv_1/y:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1"
+      op: "RealDiv"
+      input: "cond/truediv_1/Cast:y:0"
+      input: "cond/truediv_1/Cast_1:y:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape_3"
+      op: "Shape"
+      input: "cond/strided_slice_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Rank"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Equal/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Equal"
+      op: "Equal"
+      input: "cond/Rank:output:0"
+      input: "cond/Equal/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert/Const"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Rank of image must be equal to 3."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert/Assert/data_0"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Rank of image must be equal to 3."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert/Assert"
+      op: "Assert"
+      input: "cond/Equal:z:0"
+      input: "cond/Assert/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5/stack"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5/stack_1"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5/stack_2"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5"
+      op: "StridedSlice"
+      input: "cond/Shape_3:output:0"
+      input: "cond/strided_slice_5/stack:output:0"
+      input: "cond/strided_slice_5/stack_1:output:0"
+      input: "cond/strided_slice_5/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack/0"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack/1"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack"
+      op: "Pack"
+      input: "cond/stack/0:output:0"
+      input: "cond/stack/1:output:0"
+      input: "cond/strided_slice_5:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6"
+      op: "StridedSlice"
+      input: "cond/Shape_3:output:0"
+      input: "cond/strided_slice_6/stack:output:0"
+      input: "cond/strided_slice_6/stack_1:output:0"
+      input: "cond/strided_slice_6/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual"
+      op: "GreaterEqual"
+      input: "cond/strided_slice_6:output:0"
+      input: "cond/GreaterEqual/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7"
+      op: "StridedSlice"
+      input: "cond/Shape_3:output:0"
+      input: "cond/strided_slice_7/stack:output:0"
+      input: "cond/strided_slice_7/stack_1:output:0"
+      input: "cond/strided_slice_7/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual_1"
+      op: "GreaterEqual"
+      input: "cond/strided_slice_7:output:0"
+      input: "cond/GreaterEqual_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/LogicalAnd"
+      op: "LogicalAnd"
+      input: "cond/GreaterEqual:z:0"
+      input: "cond/GreaterEqual_1:z:0"
+    }
+    node_def {
+      name: "cond/Assert_1/Const"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Crop size greater than the image size."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert_1/Assert/data_0"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Crop size greater than the image size."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert_1/Assert"
+      op: "Assert"
+      input: "cond/LogicalAnd:z:0"
+      input: "cond/Assert_1/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack_1/2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_DOUBLE
+            tensor_shape {
+            }
+            double_val: 0.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack_1"
+      op: "Pack"
+      input: "cond/truediv:z:0"
+      input: "cond/truediv_1:z:0"
+      input: "cond/stack_1/2:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/ToInt32"
+      op: "Cast"
+      input: "cond/stack_1:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+    }
+    node_def {
+      name: "cond/Slice"
+      op: "Slice"
+      input: "cond/strided_slice_2:output:0"
+      input: "cond/ToInt32:y:0"
+      input: "cond/stack:output:0"
+      input: "^cond/Assert_1/Assert"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/Reshape"
+      op: "Reshape"
+      input: "cond/Slice:output:0"
+      input: "cond/stack:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1/images"
+      op: "Pack"
+      input: "cond/ResizeBicubic_1/images/Switch:output_false:0"
+      attr {
+        key: "N"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1/images/Switch"
+      op: "Switch"
+      input: "distorted_bounding_box_crop/Slice:output:0"
+      input: "cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@distorted_bounding_box_crop/Slice"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1/size"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 2
+              }
+            }
+            tensor_content: "\340\000\000\000\340\000\000\000"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1"
+      op: "ResizeBicubic"
+      input: "cond/ResizeBicubic_1/images:output:0"
+      input: "cond/ResizeBicubic_1/size:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "align_corners"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8/stack"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8/stack_1"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8/stack_2"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8"
+      op: "StridedSlice"
+      input: "cond/ResizeBicubic_1:resized_images:0"
+      input: "cond/strided_slice_8/stack:output:0"
+      input: "cond/strided_slice_8/stack_1:output:0"
+      input: "cond/strided_slice_8/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Merge"
+      op: "Merge"
+      input: "cond/strided_slice_8:output:0"
+      input: "cond/Reshape:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Const_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 1
+              }
+              dim {
+                size: 1
+              }
+              dim {
+                size: 3
+              }
+            }
+            tensor_content: "\354Q\370>\325x\351>;\337\317>"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "sub"
+      op: "Sub"
+      input: "cond/Merge:output:0"
+      input: "Const_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Const_3"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 1
+              }
+              dim {
+                size: 1
+              }
+              dim {
+                size: 3
+              }
+            }
+            tensor_content: "\372~j>B`e>fff>"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "truediv"
+      op: "RealDiv"
+      input: "sub:z:0"
+      input: "Const_3:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/control_dependency"
+      op: "Identity"
+      input: "truediv:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@truediv"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/min"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/max"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 1.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/RandomUniform"
+      op: "RandomUniform"
+      input: "random_flip_left_right/random_uniform/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "seed"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "seed2"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/sub"
+      op: "Sub"
+      input: "random_flip_left_right/random_uniform/max:output:0"
+      input: "random_flip_left_right/random_uniform/min:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/mul"
+      op: "Mul"
+      input: "random_flip_left_right/random_uniform/RandomUniform:output:0"
+      input: "random_flip_left_right/random_uniform/sub:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform"
+      op: "Add"
+      input: "random_flip_left_right/random_uniform/mul:z:0"
+      input: "random_flip_left_right/random_uniform/min:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Less/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.5
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Less"
+      op: "Less"
+      input: "random_flip_left_right/random_uniform:z:0"
+      input: "random_flip_left_right/Less/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Switch"
+      op: "Switch"
+      input: "random_flip_left_right/Less:z:0"
+      input: "random_flip_left_right/Less:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/switch_t"
+      op: "Identity"
+      input: "random_flip_left_right/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/switch_f"
+      op: "Identity"
+      input: "random_flip_left_right/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/pred_id"
+      op: "Identity"
+      input: "random_flip_left_right/Less:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/ReverseV2/axis"
+      op: "Const"
+      input: "^random_flip_left_right/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/ReverseV2"
+      op: "ReverseV2"
+      input: "random_flip_left_right/ReverseV2/Switch:output_true:0"
+      input: "random_flip_left_right/ReverseV2/axis:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tidx"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/ReverseV2/Switch"
+      op: "Switch"
+      input: "random_flip_left_right/control_dependency:output:0"
+      input: "random_flip_left_right/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@truediv"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Switch_1"
+      op: "Switch"
+      input: "random_flip_left_right/control_dependency:output:0"
+      input: "random_flip_left_right/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@truediv"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Merge"
+      op: "Merge"
+      input: "random_flip_left_right/Switch_1:output_false:0"
+      input: "random_flip_left_right/ReverseV2:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_1/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 3
+              }
+            }
+            tensor_content: "\340\000\000\000\340\000\000\000\003\000\000\000"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_1"
+      op: "Reshape"
+      input: "random_flip_left_right/Merge:output:0"
+      input: "Reshape_1/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_2/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_2"
+      op: "Reshape"
+      input: "ParseSingleExample/ParseSingleExample:dense_values:0"
+      input: "Reshape_2/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Cast_1"
+      op: "Cast"
+      input: "Reshape_2:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "sub_1/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "sub_1"
+      op: "Sub"
+      input: "Cast_1:y:0"
+      input: "sub_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    ret {
+      key: "Reshape_1"
+      value: "Reshape_1:output:0"
+    }
+    ret {
+      key: "sub_1"
+      value: "sub_1:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_predicate_7089b845"
+      input_arg {
+        name: "arg0"
+        type: DT_FLOAT
+      }
+      input_arg {
+        name: "arg1"
+        type: DT_INT32
+      }
+      input_arg {
+        name: "Equal/Placeholder"
+        type: DT_INT64
+      }
+      output_arg {
+        name: "Equal"
+        type: DT_BOOL
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "Shape"
+      op: "Shape"
+      input: "arg0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice"
+      op: "StridedSlice"
+      input: "Shape:output:0"
+      input: "strided_slice/stack:output:0"
+      input: "strided_slice/stack_1:output:0"
+      input: "strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "Equal"
+      op: "Equal"
+      input: "strided_slice:output:0"
+      input: "Equal/Placeholder"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    ret {
+      key: "Equal"
+      value: "Equal:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "_make_dataset_5fa5e1f4"
+      output_arg {
+        name: "PrefetchDataset_1"
+        type: DT_VARIANT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "TensorSliceDataset/MatchingFiles/pattern"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "TensorSliceDataset/MatchingFiles"
+      op: "MatchingFiles"
+      input: "TensorSliceDataset/MatchingFiles/pattern:output:0"
+    }
+    node_def {
+      name: "TensorSliceDataset"
+      op: "TensorSliceDataset"
+      input: "TensorSliceDataset/MatchingFiles:filenames:0"
+      attr {
+        key: "Toutput_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/MatchingFiles/pattern"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/MatchingFiles"
+      op: "MatchingFiles"
+      input: "ShuffleDataset/MatchingFiles/pattern:output:0"
+    }
+    node_def {
+      name: "ShuffleDataset/Shape"
+      op: "Shape"
+      input: "ShuffleDataset/MatchingFiles:filenames:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice/stack"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice/stack_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice/stack_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice"
+      op: "StridedSlice"
+      input: "ShuffleDataset/Shape:output:0"
+      input: "ShuffleDataset/strided_slice/stack:output:0"
+      input: "ShuffleDataset/strided_slice/stack_1:output:0"
+      input: "ShuffleDataset/strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/Maximum/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/Maximum"
+      op: "Maximum"
+      input: "ShuffleDataset/strided_slice:output:0"
+      input: "ShuffleDataset/Maximum/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset"
+      op: "ShuffleDataset"
+      input: "TensorSliceDataset:handle:0"
+      input: "ShuffleDataset/Maximum:z:0"
+      input: "ShuffleDataset/seed:output:0"
+      input: "ShuffleDataset/seed2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1/buffer_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1024
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1/seed_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1/seed2_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1"
+      op: "ShuffleDataset"
+      input: "ShuffleDataset:handle:0"
+      input: "ShuffleDataset_1/buffer_size:output:0"
+      input: "ShuffleDataset_1/seed_1:output:0"
+      input: "ShuffleDataset_1/seed2_1:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset/count"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset"
+      op: "RepeatDataset"
+      input: "ShuffleDataset_1:handle:0"
+      input: "RepeatDataset/count:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/cycle_length"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 8
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/block_length"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/sloppy"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_BOOL
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_BOOL
+            tensor_shape {
+            }
+            bool_val: true
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/buffer_output_elements"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/prefetch_input_elements"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 16
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset"
+      op: "ParallelInterleaveDataset"
+      input: "RepeatDataset:handle:0"
+      input: "ParallelInterleaveDataset/cycle_length:output:0"
+      input: "ParallelInterleaveDataset/block_length:output:0"
+      input: "ParallelInterleaveDataset/sloppy:output:0"
+      input: "ParallelInterleaveDataset/buffer_output_elements:output:0"
+      input: "ParallelInterleaveDataset/prefetch_input_elements:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_91295dea"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2/buffer_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1024
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2/seed_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2/seed2_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2"
+      op: "ShuffleDataset"
+      input: "ParallelInterleaveDataset:handle:0"
+      input: "ShuffleDataset_2/buffer_size_1:output:0"
+      input: "ShuffleDataset_2/seed_2:output:0"
+      input: "ShuffleDataset_2/seed2_2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "ParallelMapDataset/num_parallel_calls"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelMapDataset"
+      op: "ParallelMapDataset"
+      input: "ShuffleDataset_2:handle:0"
+      input: "ParallelMapDataset/num_parallel_calls:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_74b6b15c"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset/buffer_size_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset"
+      op: "PrefetchDataset"
+      input: "ParallelMapDataset:handle:0"
+      input: "PrefetchDataset/buffer_size_2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset/batch_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset"
+      op: "BatchDataset"
+      input: "PrefetchDataset:handle:0"
+      input: "BatchDataset/batch_size:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset/batch_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset"
+      op: "FilterDataset"
+      input: "BatchDataset:handle:0"
+      input: "FilterDataset/batch_size_1:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+            type: DT_INT64
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+      attr {
+        key: "predicate"
+        value {
+          func {
+            name: "tf_predicate_7089b845"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset_1/buffer_size_3"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset_1"
+      op: "PrefetchDataset"
+      input: "FilterDataset:handle:0"
+      input: "PrefetchDataset_1/buffer_size_3:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 64
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+              dim {
+                size: 64
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    ret {
+      key: "PrefetchDataset_1"
+      value: "PrefetchDataset_1:handle:0"
+    }
+  }
+}
+)PREFIX";
+
+  *dataset_name = "_make_dataset_5fa5e1f4";
+  std::function<void(FunctionDef*)> mutate_proto_func =
+      [dataset_name, file_path](FunctionDef* fdef) {
+        VLOG(1) << "Processsing function " << fdef->DebugString();
+        if (std::string(fdef->signature().name()) != *dataset_name) return;
+        // Change the input file pattern to `file_path`.
+        bool found = false;
+        for (auto& node_def : *fdef->mutable_node_def()) {
+          if (node_def.name() != "TensorSliceDataset/MatchingFiles/pattern" &&
+              node_def.name() != "ShuffleDataset/MatchingFiles/pattern")
+            continue;
+          DCHECK_EQ(node_def.op(), "Const");
+          DCHECK_GT(node_def.attr().count("value"), 0);
+          found = true;
+          DCHECK_EQ(node_def.attr().at("value").tensor().string_val(0),
+                    "$(DATA_DIR)");
+          VLOG(1) << "Setting the value of node_def "
+                     "TensorSliceDataset/MatchingFiles/pattern to "
+                  << file_path;
+          auto* tensor = (*node_def.mutable_attr())["value"].mutable_tensor();
+          tensor->clear_string_val();
+          tensor->add_string_val(file_path);
+        }
+        VLOG(1) << "Rewrote function to " << fdef->DebugString();
+        DCHECK(found);
+      };
+  return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
+}
+#endif
+
+#if not defined(PLATFORM_WINDOWS)
+//  On success, returns a set of TF_Function instances encoding a dataset
+//  node stack that reads an MNIST file dataset from `file_path`, and
+//  sets `dataset_name` to the created dataset name. The returned functions must
+//  be deleted by calling TF_DeleteFunction.
+static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
+    const char* file_path, int batch_size, std::string* dataset_name,
+    TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
+  const char* func_def = R"PREFIX(
+library {
+  function {
+    signature {
+      name: "tf_map_func_521bfd08"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "truediv"
+        type: DT_FLOAT
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "DecodeRaw"
+      op: "DecodeRaw"
+      input: "arg0"
+      attr {
+        key: "little_endian"
+        value {
+          b: true
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "Cast"
+      op: "Cast"
+      input: "DecodeRaw:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 784
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape"
+      op: "Reshape"
+      input: "Cast:y:0"
+      input: "Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "truediv/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 255.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "truediv"
+      op: "RealDiv"
+      input: "Reshape:output:0"
+      input: "truediv/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    ret {
+      key: "truediv"
+      value: "truediv:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_map_func_9a08860d"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "ToInt32"
+        type: DT_INT32
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "DecodeRaw"
+      op: "DecodeRaw"
+      input: "arg0"
+      attr {
+        key: "little_endian"
+        value {
+          b: true
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape"
+      op: "Reshape"
+      input: "DecodeRaw:output:0"
+      input: "Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ToInt32"
+      op: "Cast"
+      input: "Reshape:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    ret {
+      key: "ToInt32"
+      value: "ToInt32:y:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_predicate_7089b845"
+      input_arg {
+        name: "arg0"
+        type: DT_FLOAT
+      }
+      input_arg {
+        name: "arg1"
+        type: DT_INT32
+      }
+      input_arg {
+        name: "Equal/Placeholder"
+        type: DT_INT64
+      }
+      output_arg {
+        name: "Equal"
+        type: DT_BOOL
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "Shape"
+      op: "Shape"
+      input: "arg0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice"
+      op: "StridedSlice"
+      input: "Shape:output:0"
+      input: "strided_slice/stack:output:0"
+      input: "strided_slice/stack_1:output:0"
+      input: "strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "Equal"
+      op: "Equal"
+      input: "strided_slice:output:0"
+      input: "Equal/Placeholder"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    ret {
+      key: "Equal"
+      value: "Equal:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "_make_dataset_2451e43a"
+      output_arg {
+        name: "FilterDataset"
+        type: DT_VARIANT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/filenames"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)/train-images-idx3-ubyte"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/header_bytes"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 16
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/record_bytes"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 784
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/footer_bytes"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/buffer_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 262144
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset"
+      op: "FixedLengthRecordDataset"
+      input: "FixedLengthRecordDataset/filenames:output:0"
+      input: "FixedLengthRecordDataset/header_bytes:output:0"
+      input: "FixedLengthRecordDataset/record_bytes:output:0"
+      input: "FixedLengthRecordDataset/footer_bytes:output:0"
+      input: "FixedLengthRecordDataset/buffer_size:output:0"
+    }
+    node_def {
+      name: "MapDataset"
+      op: "MapDataset"
+      input: "FixedLengthRecordDataset:handle:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_521bfd08"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/filenames_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)/train-labels-idx1-ubyte"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/header_bytes_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 8
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/record_bytes_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/footer_bytes_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/buffer_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 262144
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1"
+      op: "FixedLengthRecordDataset"
+      input: "FixedLengthRecordDataset_1/filenames_1:output:0"
+      input: "FixedLengthRecordDataset_1/header_bytes_1:output:0"
+      input: "FixedLengthRecordDataset_1/record_bytes_1:output:0"
+      input: "FixedLengthRecordDataset_1/footer_bytes_1:output:0"
+      input: "FixedLengthRecordDataset_1/buffer_size_1:output:0"
+    }
+    node_def {
+      name: "MapDataset_1"
+      op: "MapDataset"
+      input: "FixedLengthRecordDataset_1:handle:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_9a08860d"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ZipDataset"
+      op: "ZipDataset"
+      input: "MapDataset:handle:0"
+      input: "MapDataset_1:handle:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "CacheDataset/filename"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "CacheDataset"
+      op: "CacheDataset"
+      input: "ZipDataset:handle:0"
+      input: "CacheDataset/filename:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset/count"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset"
+      op: "RepeatDataset"
+      input: "CacheDataset:handle:0"
+      input: "RepeatDataset/count:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/buffer_size_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 50000
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset"
+      op: "ShuffleDataset"
+      input: "RepeatDataset:handle:0"
+      input: "ShuffleDataset/buffer_size_2:output:0"
+      input: "ShuffleDataset/seed:output:0"
+      input: "ShuffleDataset/seed2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset/batch_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -123
+          }
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset"
+      op: "BatchDataset"
+      input: "ShuffleDataset:handle:0"
+      input: "BatchDataset/batch_size:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 784
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset/batch_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -123
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset"
+      op: "FilterDataset"
+      input: "BatchDataset:handle:0"
+      input: "FilterDataset/batch_size_1:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+            type: DT_INT64
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 784
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+      attr {
+        key: "predicate"
+        value {
+          func {
+            name: "tf_predicate_7089b845"
+          }
+        }
+      }
+    }
+    ret {
+      key: "FilterDataset"
+      value: "FilterDataset:handle:0"
+    }
+  }
+}
+)PREFIX";
+
+  *dataset_name = "_make_dataset_2451e43a";
+  std::function<void(FunctionDef*)> mutate_proto_func =
+      [dataset_name, file_path, batch_size](FunctionDef* fdef) {
+        VLOG(1) << "Processsing function " << fdef->DebugString();
+        if (std::string(fdef->signature().name()) != *dataset_name) return;
+        // Change the input file pattern to `file_path`.
+        bool found_file_path = false, found_batch_size = false;
+        // `node_def` may be mutated.
+        for (auto& node_def : *fdef->mutable_node_def()) {
+          if (node_def.name() == "FixedLengthRecordDataset/filenames" ||
+              node_def.name() == "FixedLengthRecordDataset_1/filenames_1") {
+            DCHECK_EQ(node_def.op(), "Const");
+            DCHECK_GT(node_def.attr().count("value"), 0);
+            found_file_path = true;
+            // Replace $(DATA_DIR)/foo with <file_path>/foo
+            // TODO(hongm): Use StringPiece manipulation for better efficiency.
+            const std::string cur_value =
+                node_def.attr().at("value").tensor().string_val(0);
+            const std::string pattern = "$(DATA_DIR)";
+            DCHECK_EQ(cur_value.compare(0, pattern.length(), pattern), 0);
+            const std::string new_value =
+                file_path + cur_value.substr(pattern.length());
+            VLOG(1) << "Setting the value of node_def " << node_def.name()
+                    << " to " << new_value;
+            auto* tensor = (*node_def.mutable_attr())["value"].mutable_tensor();
+            tensor->clear_string_val();
+            tensor->add_string_val(new_value);
+          } else if (node_def.name() == "BatchDataset/batch_size" ||
+                     node_def.name() == "FilterDataset/batch_size_1") {
+            DCHECK_EQ(node_def.op(), "Const");
+            DCHECK_GT(node_def.attr().count("value"), 0);
+            found_batch_size = true;
+            // Replace $(BATCH_SIZE) with `batch_size`
+            DCHECK_EQ(node_def.attr().at("value").tensor().int64_val(0), -123);
+            VLOG(1) << "Setting the batch size attr value of node_def "
+                    << node_def.name() << " to " << batch_size;
+            auto* tensor = (*node_def.mutable_attr())["value"].mutable_tensor();
+            tensor->clear_int64_val();
+            tensor->add_int64_val(batch_size);
+          }
+        }
+        VLOG(1) << "Rewrote function to " << fdef->DebugString();
+        DCHECK(found_file_path);
+        DCHECK(found_batch_size);
+      };
+  return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
+}
+#endif
+
+// Adds the input functions to `graph`.  On success, returns the created
+// IteratorGetNext node.
+static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph(
+    const std::vector<UniqueFuncPtr>& funcs, const std::string& dataset_name,
+    const std::vector<tensorflow::DataType>& output_types,
+    const std::vector<tensorflow::TensorShapeProto>& output_shapes,
+    TF_Graph* graph, TF_Status* status) {
+  DCHECK(!dataset_name.empty());
+  for (auto& func : funcs) {
+    TF_GraphCopyFunction(graph, func.get(), /*gradient*/ nullptr, status);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+  }
+
+  tensorflow::mutex_lock c(graph->mu);
+
+  tensorflow::NameAttrList func;
+  func.set_name(dataset_name);
+  // Run the iterator node on CPU.
+  Node* oneshot_iterator_node;
+  tensorflow::Status s = NodeBuilder("OneShotIterator", "OneShotIterator")
+                             .Device("/device:CPU:0")
+                             .Attr("container", "")
+                             .Attr("dataset_factory", func)
+                             .Attr("output_types", output_types)
+                             .Attr("output_shapes", output_shapes)
+                             .Attr("shared_name", "")
+                             .Finalize(&graph->graph, &oneshot_iterator_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+  // Run shape inference function for each newly added node, so that more
+  // subsequent nodes can be added to the graph via C API (TF_NewOperation()).
+  s = graph->refiner.AddNode(oneshot_iterator_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+
+  // Run the iterator node on CPU.
+  Node* getnext_node;
+  s = NodeBuilder("IteratorGetNext", "IteratorGetNext")
+          .Input(oneshot_iterator_node)
+          .Device("/device:CPU:0")
+          .Attr("output_types", output_types)
+          .Attr("output_shapes", output_shapes)
+          .Finalize(&graph->graph, &getnext_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+  // Run shape inference function for each newly added node, so that more
+  // subsequent nodes can be added to the graph via C API (TF_NewOperation()).
+  s = graph->refiner.AddNode(getnext_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+
+  VLOG(1) << "Output graph: " << graph->graph.ToGraphDefDebug().DebugString();
+  return ToTF_Operation(getnext_node);
+}
+
+TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
+                                                     TF_Status* status) {
+  tensorflow::Status s;
+
+  std::string dataset_name;
+  UniqueFuncPtr result_func = CreateFakeDatasetFunction(&dataset_name, status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  std::vector<UniqueFuncPtr> funcs;
+  funcs.push_back(std::move(result_func));
+  std::vector<tensorflow::TensorShapeProto> output_shape_list;
+  output_shape_list.push_back(tensorflow::TensorShapeProto());
+  auto* getnext_node = AddDatasetFunctionAndIteratorNodesToGraph(
+      funcs, dataset_name, {tensorflow::DT_FLOAT}, output_shape_list, graph,
+      status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  return getnext_node;
+}
+
+TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
+    TF_Graph* graph, const char* file_path, int batch_size,
+    unsigned char is_mnist, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  // TODO(ashankar): get these functions working on Windows.
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
+  tensorflow::Status s;
+
+  std::string dataset_name;
+  const auto& funcs =
+      is_mnist
+          ? CreateMNISTDatasetFunctions(file_path, batch_size, &dataset_name,
+                                        status)
+          : CreateImagenetDatasetFunctions(file_path, &dataset_name, status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  std::vector<tensorflow::TensorShapeProto> output_shape_list;
+  // batch_size X 224 X 224 X 3
+  auto image_shape = tensorflow::TensorShapeProto();
+  image_shape.add_dim()->set_size(batch_size);
+  if (is_mnist) {
+    image_shape.add_dim()->set_size(784);
+  } else {
+    image_shape.add_dim()->set_size(224);
+    image_shape.add_dim()->set_size(224);
+    image_shape.add_dim()->set_size(3);
+  }
+  output_shape_list.push_back(image_shape);
+
+  // batch_size
+  auto label_shape = tensorflow::TensorShapeProto();
+  label_shape.add_dim()->set_size(batch_size);
+  output_shape_list.push_back(label_shape);
+  auto* getnext_node = AddDatasetFunctionAndIteratorNodesToGraph(
+      funcs, dataset_name, {tensorflow::DT_FLOAT, tensorflow::DT_INT32},
+      output_shape_list, graph, status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  tensorflow::mutex_lock c(graph->mu);
+  VLOG(1) << "The extended graph: "
+          << graph->graph.ToGraphDefDebug().DebugString();
+
+  return getnext_node;
+#endif
+}
+
+TF_Tensor* TF_DequeueNamedTensor(TF_Session* session, int tensor_id,
+                                 TF_Status* status) {
+  assert(session);
+  {
+    tensorflow::mutex_lock c(session->graph->mu);
+    VLOG(1) << "Dequeuing named tensor with id " << tensor_id
+            << ", with input graph: "
+            << session->graph->graph.ToGraphDefDebug().DebugString();
+  }
+
+  TF_Operation* dequeue_op = TF_GraphOperationByName(
+      session->graph,
+      tensorflow::strings::StrCat("fifo_queue_dequeue_", tensor_id).c_str());
+  if (dequeue_op == nullptr) {
+    status->status = tensorflow::errors::Internal(
+        "Unable to find the dequeue node in the TF graph.");
+    return nullptr;
+  }
+
+  VLOG(1) << "Running the dequeue op";
+  TF_Output output{dequeue_op, 0};
+  TF_Tensor* ret;
+  TF_SessionRun(session, /*run_options*/ nullptr,
+                // input related parameters
+                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
+                // output related parameters
+                /*outputs*/ &output, /*output_values*/ &ret,
+                /*noutputs*/ 1,
+                /*targets*/ nullptr, /*ntargets*/ 0,
+                /*run_metadata*/ nullptr, status);
+  if (VLOG_IS_ON(1) && status->status.ok()) {
+    tensorflow::Tensor tensor;
+    if (tensorflow::TF_TensorToTensor(ret, &tensor).ok()) {
+      VLOG(1) << "Dequeued tensor content: " << tensor.DebugString();
+    }
+  }
+  return ret;
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 5a7b007e40aa199889b2d00b2bde5976c19e2966..e6757c065fc540fa789cdbb694e66ca0b00c4832 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -25,6 +25,7 @@ limitations under the License.
 // Experimental C API for TensorFlow.
 //
 // The API here is subject to changes in the future.
+// --------------------------------------------------------------------------
 
 // Macro to control visibility of exported symbols in the shared library (.so,
 // .dylib, .dll).
@@ -34,7 +35,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -42,7 +43,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
@@ -59,6 +60,42 @@ extern "C" {
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
+// Returns the graph content in a human-readable format, with length set in
+// `len`. The format is subject to change in the future.
+// The returned string is heap-allocated, and caller should call free() on it.
+TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
+                                                      size_t* len);
+
+// Creates a stack of data set + iterator nodes, currently hard-coded to return
+// a sequence of 3 float values <42.0, 43.0, 44.0> over 3 calls. On success,
+// returns the IteratorGetNext node, which caller can run or feed into an node.
+//
+// TODO(hongm): Extend the API to allow customization of the nodes created.
+TF_CAPI_EXPORT extern TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(
+    TF_Graph* graph, TF_Status* status);
+
+// Similar to the above API, except that the returned iterator reads the
+// file based dataset from `file_path`.
+// If `is_mnist` is 0, the dataset corresponds to ImageNet.
+// The iterators outputs 2 tensors:
+// - A float tensor of shape `batch_size` X 784 when `is_mnist` is non-zero, or
+// `batch_size` X 224 X 224 X 3 otherwise.
+// - An int32 tensor of shape `batch_size`
+// TODO(hongm): Extend the API to allow customization of the nodes created.
+TF_CAPI_EXPORT extern TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
+    TF_Graph* graph, const char* file_path, int batch_size,
+    unsigned char is_mnist, TF_Status* status);
+
+// On success, dequeues a tensor from a TF-managed FifoQueue given by
+// `tensor_id`, associated with `session`. Caller must call TF_DeleteTensor()
+// over the returned tensor. If the queue is empty, this call is blocked.
+//
+// Tensors are enqueued via the corresponding TF enqueue op.
+// TODO(hongm): Add support for `timeout_ms`.
+TF_CAPI_EXPORT extern TF_Tensor* TF_DequeueNamedTensor(TF_Session* session,
+                                                       int tensor_id,
+                                                       TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30fcfd401d9d634962d64aaa3bf348de91f2ecae
--- /dev/null
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/c_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+void TestFakeIteratorStack() {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  TF_Operation* get_next = TF_MakeFakeIteratorGetNextWithDatasets(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  CSession csession(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  const float base_value = 42.0;
+  for (int i = 0; i < 3; ++i) {
+    csession.SetOutputs({get_next});
+    csession.Run(s);
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    TF_Tensor* out = csession.output_tensor(0);
+    ASSERT_TRUE(out != nullptr);
+    ASSERT_EQ(TF_FLOAT, TF_TensorType(out));
+    ASSERT_EQ(0, TF_NumDims(out));  // scalar
+    ASSERT_EQ(sizeof(float), TF_TensorByteSize(out));
+    float* output_contents = static_cast<float*>(TF_TensorData(out));
+    ASSERT_EQ(base_value + i, *output_contents);
+  }
+
+  // This should error out since we've exhausted the iterator.
+  csession.Run(s);
+  ASSERT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s)) << TF_Message(s);
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST(CAPI_EXPERIMENTAL, FakeIteratorGetNext) { TestFakeIteratorStack(); }
+
+TEST(CAPI_EXPERIMENTAL, ImagenetIteratorGetNext) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  const string file_path = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(), "c/testdata/tf_record");
+  VLOG(1) << "data file path is " << file_path;
+  const int batch_size = 64;
+  TF_Operation* get_next = TF_MakeFileBasedIteratorGetNextWithDatasets(
+      graph, file_path.c_str(), batch_size, /*is_mnist*/ false, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  CSession csession(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  // The two output tensors should look like:
+  // Tensor("IteratorGetNext:0", shape=(batch_size, 224, 224, 3), dtype=float32)
+  // Tensor("IteratorGetNext:1", shape=(batch_size, ), dtype=int32)
+  for (int i = 0; i < 3; ++i) {
+    LOG(INFO) << "Running iter " << i;
+    csession.SetOutputs({{get_next, 0}, {get_next, 1}});
+    csession.Run(s);
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+    {
+      TF_Tensor* image = csession.output_tensor(0);
+      ASSERT_TRUE(image != nullptr);
+      ASSERT_EQ(TF_FLOAT, TF_TensorType(image));
+      // Confirm shape is 224 X 224 X 3
+      ASSERT_EQ(4, TF_NumDims(image));
+      ASSERT_EQ(batch_size, TF_Dim(image, 0));
+      ASSERT_EQ(224, TF_Dim(image, 1));
+      ASSERT_EQ(224, TF_Dim(image, 2));
+      ASSERT_EQ(3, TF_Dim(image, 3));
+      ASSERT_EQ(sizeof(float) * batch_size * 224 * 224 * 3,
+                TF_TensorByteSize(image));
+    }
+
+    {
+      TF_Tensor* label = csession.output_tensor(1);
+      ASSERT_TRUE(label != nullptr);
+      ASSERT_EQ(TF_INT32, TF_TensorType(label));
+      ASSERT_EQ(1, TF_NumDims(label));
+      ASSERT_EQ(batch_size, TF_Dim(label, 0));
+      ASSERT_EQ(sizeof(int32) * batch_size, TF_TensorByteSize(label));
+    }
+  }
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index 7ca50119eafe299b307f06c555aec1388e7e82e2..610274696f5940c063e68f2310cfd9cc1e0bd964 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 91667056e0eeb224b4b8a034766f11a123cd1a03..95652a11378d6276b5ba6540a07baa15aa77cc1c 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -84,19 +84,20 @@ struct TF_Graph {
   std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
       GUARDED_BY(mu);
 
-  // The keys of this map are all the active sessions using this graph.
-  // Each value is the current "runnability" status of the corresponding
-  // session. Under normal conditions all statuses are Status::OK(), but
-  // if some operation is mutated after it was run by a session (this
-  // is detected in RecordMutation function), that session is no longer
-  // safe to run. Its status will contain the error that will be returned
-  // to the user, should she try running this session.
+  // The keys of this map are all the active sessions using this graph. Each
+  // value records whether the graph has been mutated since the corresponding
+  // session has been run (this is detected in RecordMutation function). If the
+  // string is empty, no mutation has occurred. Otherwise the string is a
+  // description of the mutation suitable for returning to the user.
   //
   // Sessions are added to this map in TF_NewSession, and removed in
   // TF_DeleteSession.
   // TF_Graph may only / must be deleted when
   //   sessions.size() == 0 && delete_requested == true
-  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::Status> sessions
+  //
+  // TODO(b/74949947): mutations currently trigger a warning instead of a bad
+  // status, this should be reverted when possible.
+  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::string> sessions
       GUARDED_BY(mu);
   bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
 
@@ -124,15 +125,16 @@ struct TF_Session {
   TF_Session(tensorflow::Session* s, TF_Graph* g);
 
   tensorflow::Session* session;
-  TF_Graph* graph;
+  TF_Graph* const graph;
 
-  tensorflow::mutex mu;
+  tensorflow::mutex mu ACQUIRED_AFTER(TF_Graph::mu);
   int last_num_graph_nodes;
 
-  // NOTE(ashankar): Experimental fields to help keep the
-  // buffers of a TF_Tensor pinned in device memory.
-  const tensorflow::DeviceMgr* device_mgr;   // Owned by session.
-  std::vector<tensorflow::Device*> devices;  // Owned by device_mgr.
+  // If true, TF_SessionRun and similar methods will call
+  // ExtendSessionGraphHelper before running the graph (this is the default
+  // public behavior). Can be set to false if the caller needs to call
+  // ExtendSessionGraphHelper manually.
+  std::atomic<bool> extend_before_run;
 };
 
 struct TF_ImportGraphDefOptions {
@@ -210,7 +212,11 @@ void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output,
                                            TF_Status* status);
 
 void RecordMutation(TF_Graph* graph, const TF_Operation& op,
-                    const char* mutation_type);
+                    const char* mutation_type)
+    EXCLUSIVE_LOCKS_REQUIRED(graph->mu);
+
+bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status)
+    LOCKS_EXCLUDED(session->graph->mu, session->mu);
 
 }  // end namespace tensorflow
 
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 028f146be31790b211e546978302e81afe26b231..9b86425aa5fbc2be2872b3f5d2809eaa844f9d68 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -53,7 +53,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
@@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) {
   TestGradientsError(false);
 }
 
-// REGISTER_OP for CApiTestAttributesTest test cases.
+// REGISTER_OP for CApiAttributesTest test cases.
 // Registers two ops, each with a single attribute called 'v'.
 // The attribute in one op will have a type 'type', the other
 // will have list(type).
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index 3db2852ce6560ba493d60ef54a110161c112d110..f3b28c1708129d39e451d927a89c0d10e2193b63 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -34,6 +34,10 @@ static void DoubleDeallocator(void* data, size_t, void* arg) {
   delete[] static_cast<double*>(data);
 }
 
+static void FloatDeallocator(void* data, size_t, void* arg) {
+  delete[] static_cast<float*>(data);
+}
+
 TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) {
   int64_t num_values = 1;
   for (int i = 0; i < num_dims; ++i) {
@@ -78,21 +82,34 @@ TF_Tensor* DoubleTensor(double v) {
                       &DoubleDeallocator, nullptr);
 }
 
+TF_Tensor* FloatTensor(float v) {
+  const int num_bytes = sizeof(float);
+  float* values = new float[1];
+  values[0] = v;
+  return TF_NewTensor(TF_FLOAT, nullptr, 0, values, num_bytes,
+                      &FloatDeallocator, nullptr);
+}
+
 // All the *Helper methods are used as a workaround for the restrictions that
 // one cannot call ASSERT_* methods in non-void-returning functions (when
 // exceptions are disabled during compilation)
 void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
+                       TF_DataType dtype, const std::vector<int64_t>& dims,
                        TF_Operation** op) {
   TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
-  TF_SetAttrType(desc, "dtype", TF_INT32);
+  TF_SetAttrType(desc, "dtype", dtype);
+  if (!dims.empty()) {
+    TF_SetAttrShape(desc, "shape", dims.data(), dims.size());
+  }
   *op = TF_FinishOperation(desc, s);
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   ASSERT_NE(*op, nullptr);
 }
 
-TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name) {
+TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name,
+                          TF_DataType dtype, const std::vector<int64_t>& dims) {
   TF_Operation* op;
-  PlaceholderHelper(graph, s, name, &op);
+  PlaceholderHelper(graph, s, name, dtype, dims, &op);
   return op;
 }
 
@@ -126,6 +143,12 @@ TF_Operation* ScalarConst(double v, TF_Graph* graph, TF_Status* s,
   return Const(tensor.get(), graph, s, name);
 }
 
+TF_Operation* ScalarConst(float v, TF_Graph* graph, TF_Status* s,
+                          const char* name) {
+  unique_tensor_ptr tensor(FloatTensor(v), TF_DeleteTensor);
+  return Const(tensor.get(), graph, s, name);
+}
+
 void AddOpHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                  TF_Status* s, const char* name, TF_Operation** op,
                  bool check) {
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 2a70177c724c569844a5d8ad42b99bed20209946..cd19cf8d624d9b914b61132f93d918b046cdbd30 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -44,8 +44,12 @@ TF_Tensor* Int32Tensor(int32_t v);
 
 TF_Tensor* DoubleTensor(double v);
 
+TF_Tensor* FloatTensor(float v);
+
 TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
-                          const char* name = "feed");
+                          const char* name = "feed",
+                          TF_DataType dtype = TF_INT32,
+                          const std::vector<int64_t>& dims = {});
 
 TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
                     const char* name = "const");
@@ -56,6 +60,9 @@ TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
 TF_Operation* ScalarConst(double v, TF_Graph* graph, TF_Status* s,
                           const char* name = "scalar");
 
+TF_Operation* ScalarConst(float v, TF_Graph* graph, TF_Status* s,
+                          const char* name = "scalar");
+
 TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                   TF_Status* s, const char* name = "add");
 
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index e55cb672e97e1403a3dd864c91c176426eb3f067..14321191625e448637aa44a7f6a17820159b97c2 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -27,6 +27,13 @@ tf_cuda_library(
             ":runtime",
             "//tensorflow/c:c_api",
             "//tensorflow/c:c_api_internal",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:context",
+            "//tensorflow/core/common_runtime/eager:eager_executor",
+            "//tensorflow/core/common_runtime/eager:execute",
+            "//tensorflow/core/common_runtime/eager:kernel_and_device",
+            "//tensorflow/core/common_runtime/eager:tensor_handle",
+            "//tensorflow/core/common_runtime/eager:copy_to_device_node",
             "//tensorflow/core:core_cpu_internal",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
@@ -41,6 +48,7 @@ tf_cuda_library(
         ],
         "//conditions:default": [],
     }) + [
+        "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -54,11 +62,18 @@ tf_cuda_library(
         ":runtime",
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/common_runtime/eager:eager_executor",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
+        "//tensorflow/core/common_runtime/eager:kernel_and_device",
+        "//tensorflow/core/common_runtime/eager:tensor_handle",
     ],
 )
 
@@ -93,6 +108,7 @@ tf_cuda_library(
         "//conditions:default": [
             "//tensorflow/c:c_api",
             "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:kernel_and_device",
             "//tensorflow/core:core_cpu_internal",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 98ef6f0d0ab094eae3e2e21624c3a4ba30d1c3d3..3bf071f3abaac7dfd4113964fd49cd9322913bd5 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -31,8 +31,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
@@ -40,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
@@ -65,6 +70,7 @@ string DeviceName(const tensorflow::Device* d) {
 #ifdef TENSORFLOW_EAGER_USE_XLA
 std::atomic_int_fast64_t func_id_generator(0);
 #endif  // TENSORFLOW_EAGER_USE_XLA
+
 }  // namespace
 
 extern "C" {
@@ -76,181 +82,140 @@ void TFE_ContextOptionsSetConfig(TFE_ContextOptions* options, const void* proto,
   TF_SetConfig(&options->session_options, proto, proto_len, status);
 }
 
+void TFE_ContextOptionsSetAsync(TFE_ContextOptions* options,
+                                unsigned char async) {
+  options->async = async;
+}
 void TFE_ContextOptionsSetDevicePlacementPolicy(
     TFE_ContextOptions* options, TFE_ContextDevicePlacementPolicy policy) {
   options->policy = policy;
 }
 
+TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
+                                                        unsigned char async,
+                                                        TF_Status* status) {
+  status->status = ctx->context.SetAsyncForThread(async);
+}
+
 void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; }
 
 TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
-  TF_Graph* graph = TF_NewGraph();
-  TF_Session* session = TF_NewSession(graph, &opts->session_options, status);
-  if (status->status.ok()) {
-    if (session->device_mgr == nullptr || session->devices.empty()) {
-      status->status = tensorflow::errors::InvalidArgument(
-          "Provided TF_SessionOptions are not compatible with eager execution "
-          "(perhaps the TF_SessionOptions alluded to session execution in a "
-          "remote address space?)");
-    }
-  }
+  std::vector<tensorflow::Device*> devices;
+  status->status = tensorflow::DeviceFactory::AddDevices(
+      opts->session_options.options, "/job:localhost/replica:0/task:0",
+      &devices);
   if (!status->status.ok()) {
-    TF_DeleteGraph(graph);
     return nullptr;
   }
-
-  return new TFE_Context(*opts, session);
+  std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
+      new tensorflow::DeviceMgr(devices));
+  tensorflow::Rendezvous* r =
+      new tensorflow::IntraProcessRendezvous(device_mgr.get());
+  return new TFE_Context(opts->session_options.options, opts->policy,
+                         opts->async, std::move(device_mgr), r);
 }
 
-void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) {
-  status->status = tensorflow::Status::OK();
-  {
-    tensorflow::mutex_lock ml(ctx->cache_mu);
-    tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
-  }
-  TF_Graph* graph = ctx->session->graph;
-  TF_DeleteSession(ctx->session, status);
-  TF_DeleteGraph(graph);
-  ctx->rendezvous->Unref();
-  delete ctx;
-}
+void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; }
 
 TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
-  return TF_SessionListDevices(ctx->session, status);
+  TF_DeviceList* list = new TF_DeviceList;
+  ctx->context.device_mgr()->ListDeviceAttributes(&list->response);
+  return list;
 }
 
-void TFE_ContextClearCaches(TFE_Context* ctx) {
-  tensorflow::mutex_lock ml(ctx->cache_mu);
-  tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
-}
+void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context.ClearCaches(); }
 
 void TFE_ContextSetThreadLocalDevicePlacementPolicy(
     TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) {
-  tensorflow::mutex_lock ml(ctx->policy_map_mu);
-  ctx->thread_local_policies[std::this_thread::get_id()] = policy;
+  ctx->context.SetThreadLocalDevicePlacementPolicy(
+      static_cast<tensorflow::ContextDevicePlacementPolicy>(policy));
 }
 
+// Note: this function looks up a thread local policy. So it should be called in
+// the appropriate client thread. In particular, in async mode, it may not be
+// safe to call this function from the async EagerExecutor threads.
 extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy(
     TFE_Context* ctx) {
-  tensorflow::mutex_lock ml(ctx->policy_map_mu);
-  auto policy_map_it =
-      ctx->thread_local_policies.find(std::this_thread::get_id());
-  if (policy_map_it != ctx->thread_local_policies.end()) {
-    return policy_map_it->second;
-  }
-  return ctx->policy;
+  return static_cast<TFE_ContextDevicePlacementPolicy>(
+      ctx->context.GetDevicePlacementPolicy());
+}
+
+void TFE_ContextAsyncWait(TFE_Context* ctx, TF_Status* status) {
+  status->status = ctx->context.AsyncWait();
+}
+
+void TFE_ContextGetStatus(TFE_Context* ctx, TF_Status* status) {
+  status->status = ctx->context.GetStatus();
+}
+
+void TFE_ContextAsyncClearError(TFE_Context* ctx) {
+  ctx->context.ClearAsyncError();
 }
 
 TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
   tensorflow::Tensor tensor;
   status->status = tensorflow::TF_TensorToTensor(t, &tensor);
   if (!status->status.ok()) return nullptr;
-  return new TFE_TensorHandle(tensor, nullptr);
+  return new TFE_TensorHandle(tensor, nullptr, nullptr);
 }
 
-void TFE_DeleteTensorHandle(TFE_TensorHandle* h) { delete h; }
+void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
+  DCHECK(h);
+  if (h->handle) {
+    h->handle->Unref();
+  }
+  delete h;
+}
 
 TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h) {
-  return static_cast<TF_DataType>(h->t.dtype());
+  return static_cast<TF_DataType>(h->handle->dtype);
 }
 
-int TFE_TensorHandleNumDims(TFE_TensorHandle* h) { return h->t.dims(); }
-
-int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index) {
-  return h->t.dim_size(dim_index);
+int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) {
+  const tensorflow::Tensor* t = nullptr;
+  status->status = h->handle->Tensor(&t);
+  return t == nullptr ? 0 : t->dims();
 }
 
-const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h) {
-  // TODO(apassos) this will be potentially incorrect in the distributed case as
-  // our local device will have a name which depends on the ClusterSpec and
-  // hence will require the context to resolve.
-  return (h->d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
-                           : h->d->name().c_str();
+int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index,
+                            TF_Status* status) {
+  const tensorflow::Tensor* t = nullptr;
+  status->status = h->handle->Tensor(&t);
+  return t == nullptr ? 0 : t->dim_size(dim_index);
 }
 
-TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
-  if (!IsCPU(h->d)) {
-    TF_SetStatus(status, TF_UNIMPLEMENTED,
-                 tensorflow::strings::StrCat(
-                     "TFE_TensorHandle can be resolved iff it is on CPU (this "
-                     "handle is on ",
-                     h->d->name(),
-                     "). Consider using TFE_TensorHandleCopyToDevice to get a "
-                     "copy of the tensor on CPU")
-                     .c_str());
-    return nullptr;
-  }
-  return tensorflow::TF_TensorFromTensor(h->t, status);
+const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) {
+  tensorflow::Device* d = nullptr;
+  status->status = h->handle->OpDevice(&d);
+  return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
+                        : d->name().c_str();
 }
 
-TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
-                                               TFE_Context* ctx,
-                                               const char* device_name,
-                                               TF_Status* status) {
-  tensorflow::Device* dstd = ctx->devices()[0];
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status = ctx->session->device_mgr->LookupDevice(device_name, &dstd);
-    if (!status->status.ok()) return nullptr;
-  }
-
-  tensorflow::Device* srcd = h->d == nullptr ? ctx->devices()[0] : h->d;
-  bool is_same_device =
-      (srcd == dstd) || (DeviceName(srcd) == DeviceName(dstd));
-  const bool dst_cpu = IsCPU(dstd);
-  const bool src_cpu = IsCPU(srcd);
-  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
-  // has device type XLA_CPU, and the other CPU.
-  const bool both_on_cpu = src_cpu && dst_cpu;
-  if (is_same_device || both_on_cpu) {
-    return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
-  }
-  tensorflow::Tensor* src = &(h->t);
-  if (!dst_cpu && (src->dtype() != tensorflow::DT_VARIANT &&
-                   !tensorflow::DataTypeCanUseMemcpy(src->dtype()))) {
-    TF_SetStatus(
-        status, TF_INVALID_ARGUMENT,
-        tensorflow::strings::StrCat("Can't copy Tensor with type ",
-                                    tensorflow::DataTypeString(src->dtype()),
-                                    " to device ", DeviceName(dstd), ".")
-            .c_str());
-    return nullptr;
-  }
-  tensorflow::AllocatorAttributes attr;
-  if (src->dtype() == tensorflow::DT_VARIANT) {
-    attr.set_on_host(true);
-  }
-  tensorflow::Tensor dst(dstd->GetAllocator(attr), src->dtype(), src->shape());
-  if (src->shape().num_elements() == 0) {
-    return new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd);
-  }
-  tensorflow::DeviceContext* src_device_context = nullptr;
-  if (!src_cpu) {
-    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
+TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
+  // TODO(agarwal): move this implementation inside TFE_TensorHandle.
+  tensorflow::Device* d = nullptr;
+  tensorflow::Device* op_device = nullptr;
+  const tensorflow::Tensor* t = nullptr;
+  status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
+  if (!status->status.ok()) return nullptr;
+  tensorflow::TensorHandle* h_cpu = nullptr;
+  if (!IsCPU(d)) {
+    status->status = h->handle->CopyToDevice(
+        h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+    status->status = h_cpu->TensorAndDevice(&t, &d, &op_device);
+    if (!status->status.ok()) {
+      h_cpu->Unref();
+      return nullptr;
+    }
   }
-  tensorflow::DeviceContext* dst_device_context = nullptr;
-  if (!dst_cpu) {
-    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
+  TF_Tensor* retval = tensorflow::TF_TensorFromTensor(*t, status);
+  if (h_cpu != nullptr) {
+    h_cpu->Unref();
   }
-  // TODO(ashankar): The Sync() call below may be more aggressive than
-  // necessary. It is based on knowledge of implementation details - that
-  // GPU devices are implemented using 3 streams - one for host->device copies,
-  // one for device->host copies and one for sending operations to the GPU.
-  // With that setup, Sync()ing across all 3 streams should be sufficient
-  // but more than necessary (since it waits for operations that might have
-  // nothing to do with this tensor to complete).
-  status->status = srcd->Sync();
-  tensorflow::Notification n;
-  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
-                                 srcd, dstd, tensorflow::AllocatorAttributes(),
-                                 tensorflow::AllocatorAttributes(), src, &dst,
-                                 [status, &n](const tensorflow::Status& s) {
-                                   status->status = s;
-                                   n.Notify();
-                                 });
-  n.WaitForNotification();
-  return (TF_GetCode(status) == TF_OK)
-             ? new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd)
-             : nullptr;
+  return retval;
 }
 
 TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
@@ -260,8 +225,7 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
   status->status = tensorflow::AttrTypeMapForOp(name, &types);
   if (status->status.ok()) return new TFE_Op(ctx, name, types);
   if (TF_GetCode(status) == TF_NOT_FOUND) {
-    tensorflow::mutex_lock l(ctx->functions_mu);
-    if (ctx->func_lib_def.Find(name) != nullptr) {
+    if (ctx->context.FindFunctionByName(name)) {
       status->status = tensorflow::Status::OK();
       return new TFE_Op(ctx, name, nullptr);
     }
@@ -272,23 +236,18 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
 void TFE_DeleteOp(TFE_Op* op) { delete op; }
 
 void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
-  tensorflow::Device* d = nullptr;
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status =
-        op->ctx->session->device_mgr->LookupDevice(device_name, &d);
-    if (!status->status.ok()) return;
-  }
-  op->device = d;
+  status->status = op->operation.SetDevice(device_name);
 }
 
 const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
-  tensorflow::Device* device =
-      (op->device == nullptr) ? op->ctx->devices()[0] : op->device;
+  tensorflow::Device* device = (op->operation.Device() == nullptr)
+                                   ? op->operation.EagerContext()->HostCPU()
+                                   : op->operation.Device();
   return device->name().c_str();
 }
 
 void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
-  op->use_xla = enable;
+  op->operation.SetUseXla(enable);
 #ifndef TENSORFLOW_EAGER_USE_XLA
   LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not "
                   "built with XLA support.";
@@ -296,31 +255,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  // Questionable heuristic ...
-  //
-  // Motivation: After an 'op' is placed on GPU because some of its earlier
-  // inputs are on GPU, we want to keep the 'op' there, even if some later
-  // inputs of it are not on GPU.
-  if (IsCPU(op->device) && !IsCPU(h->d)) {
-    op->device = h->d;
-  }
-  if (!status->status.ok()) return;
-  op->inputs.push_back(h->t);
-  op->input_devices.push_back(h->d);
-  op->attrs.NumInputs(op->inputs.size());
+  op->operation.AddInput(h->handle);
 }
 
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
                               unsigned char* is_list, TF_Status* status) {
   TF_AttrType ret;
-  if (op->is_function()) {
+  if (op->operation.is_function()) {
     status->status = tensorflow::errors::Unimplemented(
         "TODO(apassos): Support for attributes for TensorFlow functions is not "
         "ready yet.");
     return TF_ATTR_INT;  // The compiler requires that we return something.
   }
-  status->status =
-      tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list);
+  status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
+                                              attr_name, &ret, is_list);
   return ret;
 }
 
@@ -339,23 +287,24 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx,
 }
 
 void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) {
-  op->attrs.Set(attr_name, value);
+  op->operation.MutableAttrs()->Set(attr_name, value);
 }
 
 void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) {
-  op->attrs.Set(attr_name, static_cast<int64>(value));
+  op->operation.MutableAttrs()->Set(attr_name, static_cast<int64>(value));
 }
 
 void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) {
-  op->attrs.Set(attr_name, value);
+  op->operation.MutableAttrs()->Set(attr_name, value);
 }
 
 void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) {
-  op->attrs.Set(attr_name, (value == 0) ? false : true);
+  op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true);
 }
 
 void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) {
-  op->attrs.Set(attr_name, static_cast<tensorflow::DataType>(value));
+  op->operation.MutableAttrs()->Set(attr_name,
+                                    static_cast<tensorflow::DataType>(value));
 }
 
 void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims,
@@ -377,23 +326,24 @@ void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims,
       proto.add_dim()->set_size(dims[d]);
     }
   }
-  op->attrs.Set(attr_name, proto);
+  op->operation.MutableAttrs()->Set(attr_name, proto);
 }
 
 void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name,
                            const TFE_Op* value) {
   tensorflow::AttrValue attr_value;
   tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(value->name);
-  value->attrs.FillAttrValueMap(func->mutable_attr());
-  op->attrs.Set(attr_name, attr_value);
+  func->set_name(value->operation.Name());
+  value->operation.Attrs().FillAttrValueMap(func->mutable_attr());
+  op->operation.MutableAttrs()->Set(attr_name, attr_value);
 }
 
 #define TFE_OP_SET_ATTR_LIST(fn, type)                                \
   void fn(TFE_Op* op, const char* attr_name, const type* values,      \
           int num_values) {                                           \
-    op->attrs.Set(attr_name, tensorflow::gtl::ArraySlice<const type>( \
-                                 values, num_values));                \
+    op->operation.MutableAttrs()->Set(                                \
+        attr_name,                                                    \
+        tensorflow::gtl::ArraySlice<const type>(values, num_values)); \
   }
 TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*)
 TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float)
@@ -401,14 +351,14 @@ TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float)
 
 void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name,
                           const int64_t* values, int num_values) {
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const int64>(
-                    reinterpret_cast<const int64*>(values), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const int64>(
+                     reinterpret_cast<const int64*>(values), num_values));
 }
 
 void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name,
                            const TF_DataType* values, int num_values) {
-  op->attrs.Set(
+  op->operation.MutableAttrs()->Set(
       attr_name,
       tensorflow::gtl::ArraySlice<const tensorflow::DataType>(
           reinterpret_cast<const tensorflow::DataType*>(values), num_values));
@@ -420,8 +370,8 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name,
   for (int i = 0; i < num_values; ++i) {
     b[i] = values[i];
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const bool>(b.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const bool>(b.get(), num_values));
 }
 
 void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
@@ -451,9 +401,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
       }
     }
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<tensorflow::TensorShapeProto>(
-                    proto.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<tensorflow::TensorShapeProto>(
+                     proto.get(), num_values));
 }
 
 void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
@@ -461,432 +411,41 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
   std::unique_ptr<tensorflow::NameAttrList[]> funcs(
       new tensorflow::NameAttrList[num_values]);
   for (int i = 0; i < num_values; i++) {
-    funcs[i].set_name(value[i]->name);
-    value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr());
+    funcs[i].set_name(value[i]->operation.Name());
+    value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr());
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
-                    funcs.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
+                     funcs.get(), num_values));
 }
 
-namespace {
-
-tensorflow::Status ValidateInputTypeAndPlacement(
-    TFE_Context* ctx, tensorflow::Device* host_device,
-    tensorflow::Device* op_device, TFE_Op* op,
-    const tensorflow::OpKernel* kernel,
-    std::vector<TFE_TensorHandle*>* copied_tensors) {
-  const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->inputs.size()) {
-    return tensorflow::errors::InvalidArgument(
-        "expected ", memtypes.size(), " inputs, got ", op->inputs.size());
-  }
-  for (int i = 0; i < op->inputs.size(); ++i) {
-    const tensorflow::Device* expected_device =
-        memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device;
-    const tensorflow::Device* actual_device =
-        op->input_devices[i] == nullptr ? host_device : op->input_devices[i];
-    if (expected_device != actual_device) {
-      switch (TFE_ContextGetDevicePlacementPolicy(ctx)) {
-        case TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32:
-          // TODO(xpan): See if we could bubble python related error up
-          // to python level.
-          if (op->inputs[i].dtype() == tensorflow::DT_INT32) {
-            // Note: enabling silent copies of int32 tensors to match behavior
-            // of graph mode.
-            break;
-          }
-          TF_FALLTHROUGH_INTENDED;
-        case TFE_DEVICE_PLACEMENT_EXPLICIT:
-          return tensorflow::errors::InvalidArgument(
-              "Tensors on conflicting devices:"
-              " cannot compute ",
-              op->name, " as input #", i, " was expected to be on ",
-              expected_device->name(), " but is actually on ",
-              actual_device->name(), " (operation running on ",
-              op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu(),"
-              " or transparently copied by using tfe.enable_eager_execution("
-              "tfe.DEVICE_PLACEMENT_SILENT). Copying tensors between devices"
-              " may slow down your model");
-        case TFE_DEVICE_PLACEMENT_WARN:
-          LOG(WARNING) << "before computing " << op->name << " input #" << i
-                       << " was expected to be on " << expected_device->name()
-                       << " but is actually on " << actual_device->name()
-                       << " (operation running on " << op_device->name()
-                       << "). This triggers a copy which can be a performance "
-                          "bottleneck.";
-          break;
-        case TFE_DEVICE_PLACEMENT_SILENT:  // Do nothing.
-          break;
-      }
-      // We are only here if the policy is warn or silent copies, so we should
-      // trigger a copy.
-      TFE_TensorHandle original{op->inputs[i], op->input_devices[i]};
-      TF_Status* s = TF_NewStatus();
-      TFE_TensorHandle* copied_tensor = TFE_TensorHandleCopyToDevice(
-          &original, ctx, expected_device->name().c_str(), s);
-      if (!s->status.ok()) {
-        tensorflow::Status status = s->status;
-        delete s;
-        return tensorflow::errors::Internal(
-            "Failed copying input tensor from ", actual_device->name(), " to ",
-            expected_device->name(), " in order to run ", op->name, ": ",
-            status.error_message());
-      }
-      op->inputs[i] = copied_tensor->t;
-      copied_tensors->push_back(copied_tensor);
-      op->input_devices[i] = copied_tensor->d;
-      delete s;
-    }
-    if (op->inputs[i].dtype() != kernel->input_type(i)) {
-      return tensorflow::errors::InvalidArgument(
-          "cannot compute ", op->name, " as input #", i,
-          " was expected to be a ",
-          tensorflow::DataTypeString(kernel->input_type(i)),
-          " tensor but is a ",
-          tensorflow::DataTypeString(op->inputs[i].dtype()), " tensor");
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-#ifdef TENSORFLOW_EAGER_USE_XLA
-// Synthesizes and returns a wrapper function over `op`, which must be a
-// primitive op (e.g. matmul).
-//
-// The wrapper function conforms to the function signature expected by
-// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
-// resources>. For example, if the op has input params <Const1, Arg2, Const3,
-// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
-// Resource4> as the input params to the synthesized function.
-//
-// It populates `const_input_types`, `arg_input_types` and
-// `op_input_to_func_input` based on the reordering results, that the caller can
-// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
-// `status` accordingly.
-const tensorflow::FunctionDef* OpToFunction(
-    TFE_Op* op, std::vector<TF_DataType>* const_input_types,
-    std::vector<TF_DataType>* arg_input_types,
-    tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
-    TF_Status* status) {
-  DCHECK(!op->is_function());
-
-  tensorflow::FunctionDef fdef;
-
-  // Get the OpDef of the op we are trying to encapsulate.
-  TFE_Context* ctx = op->ctx;
-  const tensorflow::OpRegistrationData* op_data;
-  {
-    tensorflow::tf_shared_lock l(ctx->functions_mu);
-    status->status = ctx->func_lib_def.LookUp(op->name, &op_data);
-    if (!status->status.ok()) {
-      return nullptr;
-    }
-  }
-  const tensorflow::OpDef& op_def = op_data->op_def;
-
-  tensorflow::OpDef* signature = fdef.mutable_signature();
-
-  // Handle constant inputs.
-  const std::unordered_set<string> const_inputs(
-      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name));
-
-  // First add place holders for the input args, so that we can refer to them by
-  // position in the next loop. Also tally up the resource inputs.
-  int num_resource_inputs = 0;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) {
-      ++num_resource_inputs;
-    }
-    signature->add_input_arg();
-  }
-
-  // Now we map the input params from `op_def` to `signature`, where the param
-  // ordering for `signature` is: <constants, args, resources>.
-  int const_index = 0;
-  int arg_index = const_inputs.size();
-  int resource_index = op_def.input_arg_size() - num_resource_inputs;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
-    tensorflow::OpDef::ArgDef* func_input_arg = nullptr;
-    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
-      VLOG(1) << "For const input, mapping op input " << i << " to func input "
-              << const_index;
-      (*op_input_to_func_input)[i] = const_index;
-      func_input_arg = signature->mutable_input_arg(const_index++);
-      const_input_types->push_back(
-          static_cast<TF_DataType>(op->inputs[i].dtype()));
-    } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
-      VLOG(1) << "For resource input, mapping op input " << i
-              << " to func input " << resource_index;
-      (*op_input_to_func_input)[i] = resource_index;
-      func_input_arg = signature->mutable_input_arg(resource_index++);
-    } else {
-      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
-              << arg_index;
-      (*op_input_to_func_input)[i] = arg_index;
-      func_input_arg = signature->mutable_input_arg(arg_index++);
-      arg_input_types->push_back(
-          static_cast<TF_DataType>(op->inputs[i].dtype()));
-    }
-
-    func_input_arg->set_name(op_input_arg.name());
-    func_input_arg->set_type(op->inputs[i].dtype());
-  }
-  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
-
-  // Resources args are at the end of the function input params, and we should
-  // have iterated over all of them.
-  DCHECK_EQ(signature->input_arg_size(), resource_index);
-
-  // Make the synthesized function's name unique.
-  signature->set_name(tensorflow::strings::StrCat(
-      op_def.name(), func_id_generator.fetch_add(1)));
-
-  // Add the node def and set its input names to match op_def's names.
-  const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
-  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
-  *fdef.add_node_def() = ndef;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
-  }
-  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
-
-  // Fix the output names and set output types.
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
-    tensorflow::OpDef::ArgDef* arg = signature->add_output_arg();
-    const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
-    const string& out_tensor_name = tensorflow::strings::StrCat(
-        ndef.name(), ":", op_def_arg.name(), ":", 0);
-    arg->set_name(op_def_arg.name());
-    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
-    const string& type_attr = op_def_arg.type_attr();
-    if (!type_attr.empty()) {
-      auto i = ndef.attr().find(type_attr);
-      if (i == ndef.attr().end()) {
-        status->status = tensorflow::errors::InvalidArgument(
-            tensorflow::strings::StrCat("Could not find attr ", type_attr,
-                                        " in NodeDef ", ndef.DebugString()));
-        return nullptr;
-      }
-      arg->set_type(i->second.type());
-    }
-  }
-  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
-
-  tensorflow::mutex_lock l(ctx->functions_mu);
-  status->status = ctx->func_lib_def.AddFunctionDef(fdef);
-  if (!status->status.ok()) return nullptr;
-  const auto ret = ctx->func_lib_def.Find(signature->name());
-  DCHECK(ret != nullptr);
-  return ret;
-}
-
-// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
-// via XLA.
-std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name;
-  auto launch_op =
-      std::unique_ptr<TFE_Op>(TFE_NewOp(op->ctx, "_XlaLaunch", status));
-  if (TF_GetCode(status) != TF_OK) return nullptr;
-  if (op->device) {
-    TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status);
-    if (TF_GetCode(status) != TF_OK) return nullptr;
-  }
-
-  const tensorflow::FunctionDef* fdef;
-  {
-    tensorflow::tf_shared_lock l(op->ctx->functions_mu);
-    fdef = op->ctx->func_lib_def.Find(op->name);
-  }
-  std::vector<TF_DataType> const_input_types;
-  std::vector<TF_DataType> arg_input_types;
-  tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
-  if (fdef == nullptr) {
-    // See if this is a primitive op, and if so create a function for it, so
-    // that _XlaLaunchOp can access it.
-    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
-                        &op_input_to_func_input, status);
-    if (!status->status.ok()) return nullptr;
-  } else {
-    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
-    // functions, so we need to find another way to handle constant inputs.
-    for (int i = const_input_types.size();
-         i < fdef->signature().input_arg_size(); ++i) {
-      VLOG(1) << "Adding Targs from input arg " << i;
-      const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i);
-      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
-    }
-  }
-  DCHECK(fdef != nullptr);
-
-  // Copy inputs and their devices.
-  // Since input param reordering may have occurred between `op` and `launch_op`
-  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
-  launch_op->inputs = op->inputs;
-  launch_op->input_devices = op->input_devices;
-  if (!op_input_to_func_input.empty()) {
-    DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size());
-    if (!op->input_devices.empty()) {
-      DCHECK_EQ(op->input_devices.size(), op_input_to_func_input.size());
-    }
-    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
-      VLOG(1) << "mapping op input " << i << " to func input "
-              << op_input_to_func_input[i];
-
-      launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i];
-      if (!op->input_devices.empty()) {
-        launch_op->input_devices[op_input_to_func_input[i]] =
-            op->input_devices[i];
-      }
-    }
-  }
-  launch_op->attrs.NumInputs(op->inputs.size());
-
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
-                        const_input_types.size());
-
-  // Set Targs and Nresources attrs.
-  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
-                        arg_input_types.size());
-  const int num_resource_inputs = fdef->signature().input_arg_size() -
-                                  const_input_types.size() -
-                                  arg_input_types.size();
-  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
-
-  // Set Tresults attr.
-  std::vector<TF_DataType> tresults;
-  for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) {
-    tresults.push_back(static_cast<TF_DataType>(arg.type()));
-  }
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
-                        tresults.size());
-
-  // Set function attr.
-  tensorflow::AttrValue attr_value;
-  tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(fdef->signature().name());
-  launch_op->attrs.Set("function", attr_value);
-
-  return launch_op;
-}
-#endif  // TENSORFLOW_EAGER_USE_XLA
-}  // namespace
-
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
-  TFE_Context* ctx = op->ctx;
-  // TODO(ashankar): ASSUMPTION: ctx->devices()[0] is always CPU
-  tensorflow::Device* device =
-      (op->device == nullptr) ? ctx->devices()[0] : op->device;
-
-#ifdef TENSORFLOW_EAGER_USE_XLA
-  std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->use_xla && op->name != "_XlaLaunch") {
-    xla_launch_op = BuildXlaLaunch(op, status);
-    if (!status->status.ok()) {
-      return;
-    }
-    op = xla_launch_op.get();
-  }
-#endif  // TENSORFLOW_EAGER_USE_XLA
-
-  std::vector<tensorflow::Tensor> outputs(1);
-  const tensorflow::MemoryTypeVector* output_memory_types = nullptr;
-  tensorflow::Fprint128 cache_key = op->attrs.CacheKey(device->name());
-  tensorflow::KernelAndDevice* kernel;
-  {
-    tensorflow::tf_shared_lock l(ctx->cache_mu);
-    kernel = tensorflow::gtl::FindPtrOrNull(ctx->kernel_cache, cache_key);
-  }
-  if (kernel == nullptr) {
-    const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
-    kernel = new tensorflow::KernelAndDevice(ctx->rendezvous);
-    // Knowledge of the implementation of Init (and in-turn
-    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
-    // will be accessed, so grab on to the lock.
-    // See WARNING comment below - would be nice to rework to avoid this
-    // subtlety.
-    tensorflow::tf_shared_lock l(ctx->functions_mu);
-    status->status =
-        tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
-    if (!status->status.ok()) {
-      delete kernel;
-      return;
-    }
-    tensorflow::mutex_lock ml(ctx->cache_mu);
-    tensorflow::gtl::InsertOrUpdate(&(ctx->kernel_cache), cache_key, kernel);
-  }
-  std::vector<TFE_TensorHandle*> copied_tensors;
-  status->status = ValidateInputTypeAndPlacement(
-      ctx, ctx->devices()[0], device, op, kernel->kernel(), &copied_tensors);
-  output_memory_types = &kernel->kernel()->output_memory_types();
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+      *num_retvals);
+  status->status =
+      tensorflow::EagerExecute(&op->operation, &handle_retvals, num_retvals);
   if (!status->status.ok()) {
-    for (auto* t : copied_tensors) {
-      TFE_DeleteTensorHandle(t);
-    }
     return;
   }
-  std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->should_store_metadata.load()) {
-    maybe_stats.reset(new tensorflow::NodeExecStats);
-    maybe_stats->set_node_name(op->name);
-    maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
-    maybe_stats->set_op_start_rel_micros(0);
-    maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
-    // TODO(apassos) track referenced tensors
-  }
-  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
-  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def,
-  // which is GUARDED_BY(ctx->functions_mu). But knowledge of the implementation
-  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
-  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
-  // This is quite subtle. Re-work things to make this better?  (Would it make
-  // sense for FunctionLibraryRuntime to ensure thread-safe access to
-  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
-  // for ops which are a part of functions.
-  status->status = kernel->Run(&op->inputs, &outputs, maybe_stats.get());
-  for (auto* t : copied_tensors) {
-    TFE_DeleteTensorHandle(t);
-  }
-  if (!status->status.ok()) return;
-  if (maybe_stats != nullptr) {
-    maybe_stats->set_op_end_rel_micros(tensorflow::Env::Default()->NowMicros() -
-                                       maybe_stats->all_start_micros());
-    tensorflow::mutex_lock ml(ctx->metadata_mu);
-    if (ctx->should_store_metadata.load()) {
-      auto* step_stats = ctx->run_metadata.mutable_step_stats();
-      // Lazily initialize the RunMetadata with information about all devices if
-      // this is the first call.
-      while (step_stats->dev_stats_size() < ctx->devices().size()) {
-        step_stats->add_dev_stats();
-      }
-      // Find the current device's index.
-      int device_idx = 0;
-      for (int i = 0; i < ctx->devices().size(); ++i) {
-        if (ctx->devices()[i] == device) {
-          device_idx = i;
-          break;
-        }
-      }
-      // Populate the device stats for this device.
-      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-      dev_stats->set_device(device->name());
-      *dev_stats->add_node_stats() = *maybe_stats;
-    }
-  }
-  *num_retvals = std::min<int>(*num_retvals, outputs.size());
   for (int i = 0; i < *num_retvals; ++i) {
-    tensorflow::Device* d = IsCPU(device) ? nullptr : device;
-    if (d != nullptr && output_memory_types != nullptr &&
-        (*output_memory_types)[i] == tensorflow::HOST_MEMORY) {
-      d = nullptr;
-    }
-    retvals[i] = new TFE_TensorHandle(outputs[i], d);
+    retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
   }
 }
 
+TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
+                                               TFE_Context* ctx,
+                                               const char* device_name,
+                                               TF_Status* status) {
+  tensorflow::TensorHandle* handle;
+  status->status = tensorflow::EagerCopyToDevice(h->handle, &ctx->context,
+                                                 device_name, &handle);
+  if (status->status.ok()) {
+    return new TFE_TensorHandle(handle);
+  }
+  return nullptr;
+}
+
 void TFE_ContextAddFunctionDef(TFE_Context* ctx,
                                const char* serialized_function_def, size_t size,
                                TF_Status* status) {
@@ -896,46 +455,120 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx,
         tensorflow::errors::InvalidArgument("Invalid FunctionDef proto");
     return;
   }
-  tensorflow::mutex_lock l(ctx->functions_mu);
-  status->status = ctx->func_lib_def.AddFunctionDef(function_def);
+  status->status = ctx->context.AddFunctionDef(function_def);
 }
 
 void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
                             TF_Status* status) {
-  tensorflow::mutex_lock l(ctx->functions_mu);
-  status->status = ctx->func_lib_def.AddFunctionDef(function->fdef);
+  status->status = ctx->context.AddFunctionDef(function->fdef);
+}
+
+void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreMetadata(true);
+}
+
+void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreMetadata(false);
 }
 
 }  // extern "C"
 
 TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t) {
-  return new TFE_TensorHandle(t, nullptr);
+  return new TFE_TensorHandle(t, nullptr, nullptr);
 }
 
 const tensorflow::Tensor* TFE_TensorHandleUnderlyingTensorInHostMemory(
     TFE_TensorHandle* h, TF_Status* status) {
-  if (h->d != nullptr) {
+  tensorflow::Device* d = nullptr;
+  tensorflow::Device* op_device = nullptr;
+  const tensorflow::Tensor* t = nullptr;
+  status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
+  if (!status->status.ok()) return nullptr;
+  if (d != nullptr) {
     status->status = tensorflow::errors::FailedPrecondition(
         "TFE_TensorHandle is placed in device (not host) memory. Cannot return "
         "a tensorflow::Tensor");
     return nullptr;
   }
-  return &h->t;
+  return t;
 }
 
-void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
-  ctx->should_store_metadata.store(true);
+void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
+                                  TF_Status* status) {
+  TFE_ContextAsyncWait(ctx, status);
+  if (!status->status.ok()) return;
+  tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
+  status->status = MessageToBuffer(*ctx->context.RunMetadataProto(), buf);
+  ctx->context.RunMetadataProto()->Clear();
 }
 
-void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
-  tensorflow::mutex_lock ml(ctx->metadata_mu);
-  ctx->should_store_metadata.store(false);
-  ctx->run_metadata.Clear();
+namespace {
+TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
+                TF_Status* status) {
+  TFE_Op* func_op = TFE_NewOp(ctx, func.name().data(), status);
+  for (const auto& attr : func.attr()) {
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    SetOpAttrValueScalar(ctx, func_op, attr.second, attr.first.data(), status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+  return func_op;
 }
+}  // namespace
 
-void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
-                                  TF_Status* status) {
-  tensorflow::mutex_lock ml(ctx->metadata_mu);
-  status->status = MessageToBuffer(ctx->run_metadata, buf);
-  ctx->run_metadata.Clear();
-}
+namespace tensorflow {
+void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
+                          const tensorflow::AttrValue& default_value,
+                          const char* attr_name, TF_Status* status) {
+  switch (default_value.value_case()) {
+    case tensorflow::AttrValue::kS:
+      TFE_OpSetAttrString(op, attr_name, default_value.s().data());
+      break;
+    case tensorflow::AttrValue::kI:
+      TFE_OpSetAttrInt(op, attr_name, static_cast<int64_t>(default_value.i()));
+      break;
+    case tensorflow::AttrValue::kF:
+      TFE_OpSetAttrFloat(op, attr_name, default_value.f());
+      break;
+    case tensorflow::AttrValue::kB:
+      TFE_OpSetAttrBool(op, attr_name, default_value.b());
+      break;
+    case tensorflow::AttrValue::kType:
+      TFE_OpSetAttrType(op, attr_name,
+                        static_cast<TF_DataType>(default_value.type()));
+      break;
+    case tensorflow::AttrValue::kShape: {
+      const auto& tensor_shape = default_value.shape();
+      if (tensor_shape.unknown_rank()) {
+        TFE_OpSetAttrShape(op, attr_name, nullptr, -1, status);
+      } else {
+        const auto num_dims = tensor_shape.dim_size();
+        std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
+        for (int i = 0; i < num_dims; ++i) {
+          dims[i] = tensor_shape.dim(i).size();
+        }
+        TFE_OpSetAttrShape(op, attr_name, dims.get(), num_dims, status);
+      }
+    } break;
+    case tensorflow::AttrValue::kFunc: {
+      const auto func_op = GetFunc(ctx, default_value.func(), status);
+      if (TF_GetCode(status) != TF_OK) return;
+      // TODO(nareshmodi): TFE_OpSetAttrFunction and TFE_OpSetAttrFunctionList
+      // require TFE_Op* and just convert it internally a NameAttrValue, so
+      // consider adding an overload to the C API to make this case easier.
+      TFE_OpSetAttrFunction(op, attr_name, func_op);
+    } break;
+    case tensorflow::AttrValue::kList:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::kTensor:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::kPlaceholder:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::VALUE_NOT_SET:
+      TF_SetStatus(
+          status, TF_UNIMPLEMENTED,
+          tensorflow::strings::StrCat("Unable to get setfor default value: ",
+                                      default_value.DebugString())
+              .data());
+  }
+}
+}  // namespace tensorflow
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 7a321b54da343fd2b8912187bc620c1e7456db0c..c06ce84a8c578aa60dd626c24bd58098b78ae750 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -30,7 +30,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -38,7 +38,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
@@ -65,14 +65,19 @@ typedef enum TFE_ContextDevicePlacementPolicy {
   TFE_DEVICE_PLACEMENT_EXPLICIT = 0,
   // Copy the tensor to the right device but log a warning.
   TFE_DEVICE_PLACEMENT_WARN = 1,
-  // Silently copy the tensor, which has a performance cost since the
-  // operation will be blocked till the copy completes.
+  // Silently copy the tensor, which has a performance cost since the operation
+  // will be blocked till the copy completes. This is the default placement
+  // policy.
   TFE_DEVICE_PLACEMENT_SILENT = 2,
-  // Default placement policy which silently copies int32 tensors but not other
-  // dtypes.
+  // Placement policy which silently copies int32 tensors but not other dtypes.
   TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 } TFE_ContextDevicePlacementPolicy;
 
+// Sets the default execution mode (sync/async). Note that this can be
+// overridden per thread using TFE_ContextSetAsyncForThread.
+TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*,
+                                                      unsigned char async);
+
 TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy(
     TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy);
 
@@ -108,6 +113,30 @@ TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalDevicePlacementPolicy(
 TF_CAPI_EXPORT extern TFE_ContextDevicePlacementPolicy
 TFE_ContextGetDevicePlacementPolicy(TFE_Context*);
 
+// Overrides the execution mode (sync/async) for the current thread.
+TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context*,
+                                                        unsigned char async,
+                                                        TF_Status* status);
+
+// Causes the calling thread to block till all ops dispatched in async mode
+// have been executed. Note that "execution" here refers to kernel execution /
+// scheduling of copies, etc. Similar to sync execution, it doesn't guarantee
+// that lower level device queues (like GPU streams) have been flushed.
+//
+// This call may not block for execution of ops enqueued concurrently with this
+// call.
+TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context*,
+                                                TF_Status* status);
+
+// When an error happens, any pending operations are discarded and newly issued
+// ops return an error. This call clears the error state and re-enables
+// execution of newly issued ops.
+//
+// Note that outputs of discarded ops remain in a corrupt state and should not
+// be used for future calls.
+// TODO(agarwal): mark the affected handles and raise errors if they are used.
+TF_CAPI_EXPORT extern void TFE_ContextAsyncClearError(TFE_Context*);
+
 // A handle to a tensor on a device.
 //
 // Like a TF_Tensor, a TFE_TensorHandle refers to a tensor with a value, shape,
@@ -117,13 +146,25 @@ typedef struct TFE_TensorHandle TFE_TensorHandle;
 
 TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t,
                                                             TF_Status* status);
+// Indicates that the caller will not be using `h` any more.
 TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h);
-TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h);
+// This function will block till the operation that produces `h` has completed.
+TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h,
+                                                  TF_Status* status);
+// This function will block till the operation that produces `h` has completed.
 TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
-                                                  int dim_index);
+                                                  int dim_index,
+                                                  TF_Status* status);
+// This function will block till the operation that produces `h` has completed.
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
-    TFE_TensorHandle* h);
+    TFE_TensorHandle* h, TF_Status* status);
+
+// This function will block till the operation that produces `h` has
+// completed. The memory returned might alias the internal memory used by
+// TensorFlow. Hence, callers should not mutate this memory (for example by
+// modifying the memory region pointed to by TF_TensorData() on the returned
+// TF_Tensor).
 TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
                                                          TF_Status* status);
 
@@ -133,6 +174,9 @@ TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
 // that shares the underlying buffer. Otherwise, it currently requires at least
 // one of the source or destination devices to be CPU (i.e., for the source or
 // destination tensor to be placed in host memory).
+// If async execution is enabled, the copy may be enqueued and the call will
+// return "non-ready" handle. Else, this function returns after the copy has
+// been done.
 TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(
     TFE_TensorHandle* h, TFE_Context* ctx, const char* device_name,
     TF_Status* status);
@@ -153,6 +197,7 @@ typedef struct TFE_Op TFE_Op;
 TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx,
                                         const char* op_or_function_name,
                                         TF_Status* status);
+
 TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op);
 
 TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name,
@@ -238,13 +283,21 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrFunctionList(TFE_Op* op,
                                                      int num_values);
 
 // Execute the operation defined by 'op' and return handles to computed
-// tensors in 'retvals'.
+// tensors in `retvals`.
+//
+// 'retvals' must point to a pre-allocated array of TFE_TensorHandle* and
+// '*num_retvals' should be set to the size of this array. It is an error if
+// the size of 'retvals' is less than the number of outputs. This call sets
+// *num_retvals to the number of outputs.
 //
-// 'retvals' must point to a pre-allocated array of TFE_TensorHandle*
-// and '*num_retvals' should be set to the size of this array.
+// If async execution is enabled, the call may simply enqueue the execution
+// and return "non-ready" handles in `retvals`. Note that any handles contained
+// in 'op' should not be mutated till the kernel execution actually finishes.
 //
-// On return, 'num_retvals' will be set to the actual number of outputs
-// returned by the operation.
+// For sync execution, if any of the inputs to `op` are not ready, this call
+// will block till they become ready and then return when the kernel execution
+// is done.
+// TODO(agarwal): change num_retvals to int from int*.
 TF_CAPI_EXPORT extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals,
                                        int* num_retvals, TF_Status* status);
 
@@ -270,6 +323,8 @@ TF_CAPI_EXPORT extern void TFE_ContextDisableRunMetadata(TFE_Context* ctx);
 // Populates the passed-in buffer with a serialized RunMetadata protocol buffer
 // containing any run metadata information accumulated so far and clears this
 // information.
+// If async mode is enabled, this call blocks till all currently pending ops are
+// done.
 TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx,
                                                         TF_Buffer* buf,
                                                         TF_Status* status);
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 7b9f1db02ed9c53a280c7bd1284165cac4fb6353..49e1aab1cef9577256d9b081858cf094c788caf8 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -19,7 +19,9 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstddef>
+#include <map>
 #include <memory>
+#include <queue>
 #include <string>
 #include <thread>
 #include <vector>
@@ -28,9 +30,16 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/runtime.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -39,89 +48,53 @@ limitations under the License.
 
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
-  TFE_ContextDevicePlacementPolicy policy{
-      TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32};
+  // true if async execution is enabled.
+  bool async = false;
+  TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT};
 };
 
 struct TFE_Context {
-  explicit TFE_Context(const TFE_ContextOptions& opts, TF_Session* s)
-      : policy(opts.policy),
-        session(s),
-        rendezvous(new tensorflow::IntraProcessRendezvous(s->device_mgr)),
-        pflr(new tensorflow::ProcessFunctionLibraryRuntime(
-            session->device_mgr, opts.session_options.options.env,
-            TF_GRAPH_DEF_VERSION, &func_lib_def, {})) {}
-
-  const TFE_ContextDevicePlacementPolicy policy;
-
-  // Note: we cannot use C++11 thread_local here as there is no concept of a
-  // thread-local-object-local variable in C++11.
-  tensorflow::mutex policy_map_mu;
-  std::unordered_map<std::thread::id, TFE_ContextDevicePlacementPolicy>
-      thread_local_policies GUARDED_BY(policy_map_mu);
-
-  // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph.
-  TF_Session* const session;
-  tensorflow::Rendezvous* const rendezvous;
-
-  tensorflow::mutex functions_mu;
-  tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){
-      tensorflow::OpRegistry::Global(), {}};
-
-  // One FunctionLibraryRuntime per device.
-  // func_libs[i] is the FunctionLibraryRuntime corresponding to
-  // session->devices[i].
-  const std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
-
-  tensorflow::mutex cache_mu;
-  std::unordered_map<tensorflow::Fprint128, tensorflow::KernelAndDevice*,
-                     tensorflow::Fprint128Hasher>
-      kernel_cache GUARDED_BY(cache_mu);
-
-  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const {
-    return pflr->GetFLR(d->name());
-  }
-
-  const std::vector<tensorflow::Device*>& devices() { return session->devices; }
-
-  // Whether we should compute RunMetadata.
-  std::atomic<bool> should_store_metadata{false};
-  tensorflow::mutex metadata_mu;
-  tensorflow::RunMetadata run_metadata GUARDED_BY(metadata_mu);
+  explicit TFE_Context(const tensorflow::SessionOptions& opts,
+                       TFE_ContextDevicePlacementPolicy default_policy,
+                       bool async,
+                       std::unique_ptr<tensorflow::DeviceMgr> device_mgr,
+                       tensorflow::Rendezvous* rendezvous)
+      : context(opts,
+                static_cast<tensorflow::ContextDevicePlacementPolicy>(
+                    default_policy),
+                async, std::move(device_mgr), rendezvous) {}
+
+  tensorflow::EagerContext context;
 };
 
 struct TFE_TensorHandle {
-  TFE_TensorHandle(const tensorflow::Tensor& t, tensorflow::Device* d)
-      : t(t), d(d) {}
-
-  tensorflow::Tensor t;
-  // TODO(ashankar): d == nullptr iff local CPU
-  // This was expedient, but perhaps worth revisiting ('d' should always be a
-  // valid pointer?)
-  // This can be done if TFE_NewOp() and the TFE_TensorHandle constructors are
-  // provided with the appropriate TFE_Context.
-  //
-  // TODO(ashankar): Reference count TFE_Context to ensure that 'd' of a
-  // TFE_TensorHandle does not outlive the TFE_Context from which it came?
-  tensorflow::Device* d;
+  TFE_TensorHandle(const tensorflow::Tensor& t, tensorflow::Device* d,
+                   tensorflow::Device* op_device)
+      : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {}
+
+  TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype,
+                   tensorflow::EagerContext* ctx)
+      : handle(new tensorflow::TensorHandle(node_id, dtype, ctx)) {}
+
+  TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
+
+  tensorflow::TensorHandle* handle;
 };
 
 struct TFE_Op {
   // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
   // primitive operation.
   TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {}
-
-  bool const is_function() const { return attr_types == nullptr; }
-
-  TFE_Context* ctx;  // Must outlive the TFE_Op.
-  const tensorflow::string name;
-  tensorflow::AttrBuilder attrs;
-  const tensorflow::AttrTypeMap* attr_types;
-  std::vector<tensorflow::Tensor> inputs;
-  std::vector<tensorflow::Device*> input_devices;
-  tensorflow::Device* device;
-  bool use_xla = false;
+      : operation(&ctx->context, op, t) {}
+
+  tensorflow::EagerOperation operation;
 };
 
+namespace tensorflow {
+// Set an AttrValue on the op. Doesn't handle the list types.
+void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
+                          const tensorflow::AttrValue& default_value,
+                          const char* attr_name, TF_Status* status);
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 4a3ecbc0abb16296a84c0d2184dc3fc9f7f3ebb4..701175e4943d1d23532fe595319f67711316ed4d 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -29,6 +29,20 @@ using tensorflow::string;
 
 namespace {
 
+TFE_TensorHandle* DoubleTestMatrixTensorHandle() {
+  int64_t dims[] = {2, 2};
+  double data[] = {1.0, 2.0, 3.0, 4.0};
+  TF_Tensor* t = TF_AllocateTensor(
+      TF_DOUBLE, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  TF_Status* status = TF_NewStatus();
+  TFE_TensorHandle* th = TFE_NewTensorHandle(t, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(t);
+  TF_DeleteStatus(status);
+  return th;
+}
+
 TFE_TensorHandle* TestMatrixTensorHandle() {
   int64_t dims[] = {2, 2};
   float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
@@ -43,6 +57,20 @@ TFE_TensorHandle* TestMatrixTensorHandle() {
   return th;
 }
 
+TFE_TensorHandle* TestMatrixTensorHandle3X2() {
+  int64_t dims[] = {3, 2};
+  double data[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TF_Tensor* t = TF_AllocateTensor(
+      TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  TF_Status* status = TF_NewStatus();
+  TFE_TensorHandle* th = TFE_NewTensorHandle(t, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(t);
+  TF_DeleteStatus(status);
+  return th;
+}
+
 TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   TF_Status* status = TF_NewStatus();
 
@@ -139,10 +167,12 @@ void BM_InitOp(int iters) {
 }
 BENCHMARK(BM_InitOp);
 
-void BM_Execute(int iters) {
+void BM_Execute(int iters, int async) {
   tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(async ? "ExecuteAsync" : "Execute");
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
@@ -156,6 +186,9 @@ void BM_Execute(int iters) {
     TFE_Execute(matmul, &retvals[0], &num_retvals, status);
     CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   }
+  if (async) {
+    TFE_ContextAsyncWait(ctx, status);
+  }
   tensorflow::testing::StopTiming();
   TFE_DeleteOp(matmul);
   TFE_DeleteTensorHandle(m);
@@ -163,7 +196,7 @@ void BM_Execute(int iters) {
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
-BENCHMARK(BM_Execute);
+BENCHMARK(BM_Execute)->Arg(0)->Arg(1);
 
 TEST(CAPI, Context) {
   TF_Status* status = TF_NewStatus();
@@ -205,10 +238,11 @@ TEST(CAPI, TensorHandle) {
   TFE_DeleteTensorHandle(h);
 }
 
-TEST(CAPI, TensorHandleCopyBetweenDevices) {
+void TensorHandleCopyBetweenDevices(bool async) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status.get());
   TFE_DeleteContextOptions(opts);
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
@@ -274,10 +308,56 @@ TEST(CAPI, TensorHandleCopyBetweenDevices) {
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
-TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
+TEST(CAPI, TensorHandleCopyBetweenDevices) {
+  TensorHandleCopyBetweenDevices(false);
+}
+
+TEST(CAPI, TensorHandleCopyBetweenDevicesAsync) {
+  TensorHandleCopyBetweenDevices(true);
+}
+
+void TensorHandleCopyBetweenDevicesError(bool async) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  const char* kErrorDevice = "NoSuchDevice:0";
+  TFE_TensorHandle* hdevice =
+      TFE_TensorHandleCopyToDevice(hcpu, ctx, kErrorDevice, status.get());
+  EXPECT_NE(TF_OK, TF_GetCode(status.get()));
+  const char* msg = "NoSuchDevice:0 unknown device";
+  EXPECT_TRUE(strstr(TF_Message(status.get()), msg) != nullptr)
+      << TF_Message(status.get());
+  TF_SetStatus(status.get(), TF_OK, "");
+  const char* kCPUDevice = "CPU:0";
+  TFE_TensorHandle* hcopy =
+      TFE_TensorHandleCopyToDevice(hcpu, ctx, kCPUDevice, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_ContextAsyncWait(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get()));
+  TFE_DeleteTensorHandle(hcopy);
+  TFE_DeleteTensorHandle(hcpu);
+  if (hdevice != nullptr) TFE_DeleteTensorHandle(hdevice);
+  TFE_DeleteContext(ctx, status.get());
+}
+
+TEST(CAPI, TensorHandleCopyBetweenDevicesError) {
+  TensorHandleCopyBetweenDevicesError(false);
+}
+
+TEST(CAPI, TensorHandleCopyBetweenDevicesErrorAsync) {
+  TensorHandleCopyBetweenDevicesError(true);
+}
+
+void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status.get());
   TFE_DeleteContextOptions(opts);
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
@@ -332,11 +412,20 @@ TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
-TEST(CAPI, TensorHandleSilentCopy) {
+TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
+  TensorHandleCopyBetweenTwoGPUDevices(false);
+}
+
+TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevicesAsync) {
+  TensorHandleCopyBetweenTwoGPUDevices(true);
+}
+
+void TensorHandleSilentCopy(bool async) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status.get());
   TFE_DeleteContextOptions(opts);
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
@@ -366,14 +455,20 @@ TEST(CAPI, TensorHandleSilentCopy) {
 
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
+  TFE_ContextAsyncWait(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
   TFE_DeleteContext(ctx, status.get());
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
-TEST(CAPI, TensorHandleSilentCopyLocal) {
+TEST(CAPI, TensorHandleSilentCopy) { TensorHandleSilentCopy(false); }
+TEST(CAPI, TensorHandleSilentCopyAsync) { TensorHandleSilentCopy(true); }
+
+void TensorHandleSilentCopyLocal(bool async) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_ContextOptionsSetDevicePlacementPolicy(opts,
                                              TFE_DEVICE_PLACEMENT_EXPLICIT);
   TFE_Context* ctx = TFE_NewContext(opts, status.get());
@@ -407,11 +502,17 @@ TEST(CAPI, TensorHandleSilentCopyLocal) {
 
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
+  TFE_ContextAsyncWait(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
   TFE_DeleteContext(ctx, status.get());
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
+TEST(CAPI, TensorHandleSilentCopyLocal) { TensorHandleSilentCopyLocal(false); }
+TEST(CAPI, TensorHandleSilentCopyLocalAsync) {
+  TensorHandleSilentCopyLocal(true);
+}
 
-TEST(CAPI, SetAndGetOpDevices) {
+void SetAndGetOpDevices(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_Context* ctx = TFE_NewContext(opts, status);
@@ -442,27 +543,28 @@ TEST(CAPI, SetAndGetOpDevices) {
   TF_DeleteStatus(status);
 }
 
-TEST(CAPI, Execute_MatMul_CPU) {
+void Execute_MatMul_CPU(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
 
   TFE_TensorHandle* m = TestMatrixTensorHandle();
   TFE_Op* matmul = MatMulOp(ctx, m, m);
-  TFE_TensorHandle* retvals[2] = {nullptr};
-  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_TensorHandle* retvals[2] = {nullptr, nullptr};
+  int num_retvals = 2;
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(1, num_retvals);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteOp(matmul);
   TFE_DeleteTensorHandle(m);
-  TFE_DeleteContext(ctx, status);
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  ASSERT_EQ(1, num_retvals);
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   float product[4] = {0};
   EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
@@ -474,7 +576,107 @@ TEST(CAPI, Execute_MatMul_CPU) {
   EXPECT_EQ(22, product[3]);
   TF_DeleteStatus(status);
 }
+TEST(CAPI, Execute_MatMul_CPU) { Execute_MatMul_CPU(false); }
+TEST(CAPI, Execute_MatMul_CPUAsync) { Execute_MatMul_CPU(true); }
+
+void Execute_MatMul_CPU_Runtime_Error(bool async) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m1 = TestMatrixTensorHandle();
+  TFE_TensorHandle* m2 = TestMatrixTensorHandle3X2();
+  TFE_Op* matmul = MatMulOp(ctx, m1, m2);
+  TFE_OpSetDevice(matmul, "/job:localhost/replica:0/task:0/device:CPU:0",
+                  status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Op* matmul2 = MatMulOp(ctx, m1, m1);
+  TFE_OpSetDevice(matmul2, "/job:localhost/replica:0/task:0/device:CPU:0",
+                  status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  TFE_DeleteOp(matmul);
+  if (!async) {
+    EXPECT_NE(TF_OK, TF_GetCode(status));
+  } else {
+    TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+    EXPECT_NE(TF_OK, TF_GetCode(status));
+    EXPECT_EQ(nullptr, t);
+    const char* msg = "Matrix size-incompatible: In[0]: [2,2], In[1]: [3,2]";
+    EXPECT_TRUE(strstr(TF_Message(status), msg) != nullptr)
+        << TF_Message(status);
+    // Since error is not cleared, the following copy with correct device will
+    // still fail.
+    TF_SetStatus(status, TF_OK, "");
+    TFE_DeleteTensorHandle(retvals[0]);
+    retvals[0] = nullptr;
+    TFE_Execute(matmul2, &retvals[0], &num_retvals, status);
+    EXPECT_NE(TF_OK, TF_GetCode(status));
+    TFE_ContextAsyncClearError(ctx);
+    TFE_ContextAsyncWait(ctx, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+  }
+  // Following works in async mode since TFE_ContextAsyncClearError was called.
+  TF_SetStatus(status, TF_OK, "");
+  if (retvals[0] != nullptr) {
+    TFE_DeleteTensorHandle(retvals[0]);
+  }
+  retvals[0] = nullptr;
+  TFE_Execute(matmul2, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status));
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status));
+  TF_DeleteTensor(t);
+  TFE_DeleteOp(matmul2);
+  TFE_DeleteTensorHandle(m1);
+  TFE_DeleteTensorHandle(m2);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx, status);
+  TF_DeleteStatus(status);
+}
+TEST(CAPI, Execute_MatMul_CPU_Runtime_Error) {
+  Execute_MatMul_CPU_Runtime_Error(false);
+}
+TEST(CAPI, Execute_MatMul_CPU_Runtime_ErrorAsync) {
+  Execute_MatMul_CPU_Runtime_Error(true);
+}
+
+void Execute_MatMul_CPU_Type_Error(bool async) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m1 = TestMatrixTensorHandle();
+  TFE_TensorHandle* m2 = DoubleTestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m1, m2);
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_NE(TF_OK, TF_GetCode(status));
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m1);
+  TFE_DeleteTensorHandle(m2);
+  if (retvals[0] != nullptr) {
+    TFE_DeleteTensorHandle(retvals[0]);
+  }
+  TFE_DeleteContext(ctx, status);
+  TF_DeleteStatus(status);
+}
 
+TEST(CAPI, Execute_MatMul_CPU_Type_Error) {
+  Execute_MatMul_CPU_Type_Error(false);
+}
+TEST(CAPI, Execute_MatMul_CPU_Type_ErrorAsync) {
+  Execute_MatMul_CPU_Type_Error(true);
+}
 TEST(CAPI, Execute_Min_CPU) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
@@ -485,33 +687,34 @@ TEST(CAPI, Execute_Min_CPU) {
   TFE_TensorHandle* input = TestMatrixTensorHandle();
   TFE_TensorHandle* axis = TestAxisTensorHandle();
   TFE_Op* minOp = MinOp(ctx, input, axis);
-  TFE_TensorHandle* retvals[2] = {nullptr};
-  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
   TFE_Execute(minOp, &retvals[0], &num_retvals, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteOp(minOp);
   TFE_DeleteTensorHandle(input);
   TFE_DeleteTensorHandle(axis);
-  TFE_DeleteContext(ctx, status);
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
-  TFE_DeleteTensorHandle(retvals[0]);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(retvals[0]);
   float output[2] = {0};
   EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
   memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
   TF_DeleteTensor(t);
   EXPECT_EQ(1, output[0]);
   EXPECT_EQ(3, output[1]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
 
 #ifdef TENSORFLOW_EAGER_USE_XLA
-TEST(CAPI, Execute_MatMul_XLA_CPU) {
+void Execute_MatMul_XLA_CPU(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
@@ -521,15 +724,14 @@ TEST(CAPI, Execute_MatMul_XLA_CPU) {
 
   TFE_OpSetXLACompilation(matmul, true);
 
-  TFE_TensorHandle* retvals[2] = {nullptr};
-  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
   // Running a primitive TF operator via XLA is not yet supported.
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   TFE_DeleteOp(matmul);
   TFE_DeleteTensorHandle(m);
-  TFE_DeleteContext(ctx, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   EXPECT_EQ(1, num_retvals);
@@ -545,13 +747,16 @@ TEST(CAPI, Execute_MatMul_XLA_CPU) {
   EXPECT_EQ(10, product[1]);
   EXPECT_EQ(15, product[2]);
   EXPECT_EQ(22, product[3]);
-
+  TFE_DeleteContext(ctx, status);
   TF_DeleteStatus(status);
 }
+TEST(CAPI, Execute_MatMul_XLA_CPU) { Execute_MatMul_XLA_CPU(false); }
+TEST(CAPI, Execute_MatMul_XLA_CPUAsync) { Execute_MatMul_XLA_CPU(true); }
 
-TEST(CAPI, Execute_Min_XLA_CPU) {
+void Execute_Min_XLA_CPU(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
@@ -562,14 +767,13 @@ TEST(CAPI, Execute_Min_XLA_CPU) {
 
   TFE_OpSetXLACompilation(minOp, true);
 
-  TFE_TensorHandle* retvals[2] = {nullptr};
-  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
   TFE_Execute(minOp, &retvals[0], &num_retvals, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteOp(minOp);
   TFE_DeleteTensorHandle(input);
   TFE_DeleteTensorHandle(axis);
-  TFE_DeleteContext(ctx, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
 
@@ -582,13 +786,17 @@ TEST(CAPI, Execute_Min_XLA_CPU) {
   TF_DeleteTensor(t);
   EXPECT_EQ(1, output[0]);
   EXPECT_EQ(3, output[1]);
+  TFE_DeleteContext(ctx, status);
   TF_DeleteStatus(status);
 }
+TEST(CAPI, Execute_Min_XLA_CPU) { Execute_Min_XLA_CPU(false); }
+TEST(CAPI, Execute_Min_XLA_CPUAsync) { Execute_Min_XLA_CPU(true); }
 #endif  // TENSORFLOW_EAGER_USE_XLA
 
-TEST(CAPI, ExecuteWithTracing) {
+void ExecuteWithTracing(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   TFE_ContextEnableRunMetadata(ctx);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
@@ -596,8 +804,8 @@ TEST(CAPI, ExecuteWithTracing) {
 
   TFE_TensorHandle* m = TestMatrixTensorHandle();
   TFE_Op* matmul = MatMulOp(ctx, m, m);
-  TFE_TensorHandle* retvals[2] = {nullptr};
-  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteOp(matmul);
@@ -609,12 +817,12 @@ TEST(CAPI, ExecuteWithTracing) {
   EXPECT_TRUE(
       rm.ParseFromString({reinterpret_cast<const char*>(b->data), b->length}));
   TF_DeleteBuffer(b);
-  TFE_DeleteContext(ctx, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
   TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   float product[4] = {0};
   EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
@@ -626,6 +834,8 @@ TEST(CAPI, ExecuteWithTracing) {
   EXPECT_EQ(22, product[3]);
   TF_DeleteStatus(status);
 }
+TEST(CAPI, ExecuteWithTracing) { ExecuteWithTracing(false); }
+TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithTracing(true); }
 
 TEST(CAPI, Function_ident_CPU) {
   // First create a simple identity function.
@@ -657,32 +867,37 @@ TEST(CAPI, Function_ident_CPU) {
   ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
   TF_DeleteFunction(fn);
 
-  TF_Tensor* t =
-      TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
-  *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
-  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  TF_DeleteTensor(t);
+  for (bool async : {false, true, false}) {
+    TFE_ContextSetAsyncForThread(ctx, static_cast<unsigned char>(async),
+                                 status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK);
+    TF_Tensor* t =
+        TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
+    *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
+    TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    TF_DeleteTensor(t);
 
-  TFE_Op* op = TFE_NewOp(ctx, "ident", status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  TFE_OpAddInput(op, h, status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    TFE_Op* op = TFE_NewOp(ctx, "ident", status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    TFE_OpAddInput(op, h, status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
 
-  std::vector<TFE_TensorHandle*> result;
-  result.push_back(nullptr);
-  int num_retvals = 1;
-  TFE_Execute(op, result.data(), &num_retvals, status);
-  TFE_DeleteOp(op);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  ASSERT_EQ(num_retvals, 1);
+    std::vector<TFE_TensorHandle*> result;
+    result.push_back(nullptr);
+    int num_retvals = 1;
+    TFE_Execute(op, result.data(), &num_retvals, status);
+    TFE_DeleteOp(op);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    ASSERT_EQ(num_retvals, 1);
 
-  TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
-  TFE_DeleteTensorHandle(h);
-  TF_DeleteTensor(r);
-  TFE_DeleteTensorHandle(result[0]);
+    TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
+    TFE_DeleteTensorHandle(h);
+    TF_DeleteTensor(r);
+    TFE_DeleteTensorHandle(result[0]);
+  }
   TFE_DeleteContext(ctx, status);
   ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
   TF_DeleteStatus(status);
@@ -719,35 +934,40 @@ TEST(CAPI, Function_ident_XLA_CPU) {
   ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
   TF_DeleteFunction(fn);
 
-  TF_Tensor* t =
-      TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
-  *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
-  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  TF_DeleteTensor(t);
+  for (bool async : {false, true, false}) {
+    TFE_ContextSetAsyncForThread(ctx, static_cast<unsigned char>(async),
+                                 status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK);
+    TF_Tensor* t =
+        TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
+    *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
+    TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    TF_DeleteTensor(t);
 
-  TFE_Op* op = TFE_NewOp(ctx, "ident", status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  TFE_OpAddInput(op, h, status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    TFE_Op* op = TFE_NewOp(ctx, "ident", status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    TFE_OpAddInput(op, h, status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
 
-  // Now run it via XLA.
-  TFE_OpSetXLACompilation(op, true);
+    // Now run it via XLA.
+    TFE_OpSetXLACompilation(op, true);
 
-  std::vector<TFE_TensorHandle*> result;
-  result.push_back(nullptr);
-  int num_retvals = 1;
-  TFE_Execute(op, result.data(), &num_retvals, status);
-  TFE_DeleteOp(op);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  ASSERT_EQ(num_retvals, 1);
+    std::vector<TFE_TensorHandle*> result;
+    result.push_back(nullptr);
+    int num_retvals = 1;
+    TFE_Execute(op, result.data(), &num_retvals, status);
+    TFE_DeleteOp(op);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    ASSERT_EQ(num_retvals, 1);
 
-  TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
-  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
-  EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
-  TFE_DeleteTensorHandle(h);
-  TF_DeleteTensor(r);
-  TFE_DeleteTensorHandle(result[0]);
+    TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
+    TFE_DeleteTensorHandle(h);
+    TF_DeleteTensor(r);
+    TFE_DeleteTensorHandle(result[0]);
+  }
   TFE_DeleteContext(ctx, status);
   ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
   TF_DeleteStatus(status);
@@ -788,9 +1008,10 @@ string MatMulFunction() {
   return def.SerializeAsString();
 }
 
-TEST(CAPI, FunctionDefAndExecute) {
+void FunctionDefAndExecute(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
@@ -827,11 +1048,16 @@ TEST(CAPI, FunctionDefAndExecute) {
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
+TEST(CAPI, FunctionDefAndExecute) { FunctionDefAndExecute(false); }
+TEST(CAPI, FunctionDefAndExecuteAsync) { FunctionDefAndExecute(true); }
 
-void BM_ExecuteFunction(int iters) {
+void BM_ExecuteFunction(int iters, int async) {
   tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(async ? "ExecuteFunctionAsync"
+                                      : "ExecuteFunction");
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
@@ -853,6 +1079,9 @@ void BM_ExecuteFunction(int iters) {
     TFE_Execute(matmul, &retval[0], &num_retvals, status);
     CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   }
+  if (async) {
+    TFE_ContextAsyncWait(ctx, status);
+  }
   tensorflow::testing::StopTiming();
   TFE_DeleteTensorHandle(m);
   TFE_DeleteTensorHandle(retval[0]);
@@ -860,7 +1089,7 @@ void BM_ExecuteFunction(int iters) {
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
-BENCHMARK(BM_ExecuteFunction);
+BENCHMARK(BM_ExecuteFunction)->Arg(0)->Arg(1);
 
 TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value,
                                  TF_Status* status) {
@@ -932,7 +1161,8 @@ TEST(CAPI, Variables) {
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
   EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(value_handle));
-  EXPECT_EQ(0, TFE_TensorHandleNumDims(value_handle));
+  EXPECT_EQ(0, TFE_TensorHandleNumDims(value_handle, status));
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   float value = 0.0f;
   TF_Tensor* t = TFE_TensorHandleResolve(value_handle, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
@@ -974,7 +1204,8 @@ void BM_ReadVariable(int iters) {
     CHECK_EQ(1, num_retvals);
     CHECK(h);
     CHECK_EQ(TF_FLOAT, TFE_TensorHandleDataType(h));
-    CHECK_EQ(0, TFE_TensorHandleNumDims(h));
+    CHECK_EQ(0, TFE_TensorHandleNumDims(h, status));
+    CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
     h = nullptr;
   }
   tensorflow::testing::StopTiming();
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index f77a937f1ffc2d146224cb3191a5ca127daefc22..e6c51ab17a867a0697f15d7683d8ca52c062035d 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/eager/runtime.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -41,17 +42,26 @@ const uint32 kIsList = 1U << 31;
 
 }  // namespace
 
+Status OpDefForOp(const char* op_name, const OpDef** op_def) {
+  const OpRegistrationData* op_reg_data = nullptr;
+  Status s = OpRegistry::Global()->LookUp(op_name, &op_reg_data);
+  if (s.ok()) {
+    *op_def = &op_reg_data->op_def;
+  }
+  return s;
+}
+
 Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
   mutex_lock l(g_op_name_to_attr_type_map_lock);
   *out = gtl::FindPtrOrNull(*OpNameToAttrTypeMap(), op_name);
   if (*out != nullptr) return Status::OK();
-  const OpRegistrationData* op_reg_data = nullptr;
-  Status s = OpRegistry::Global()->LookUp(op_name, &op_reg_data);
+  const OpDef* op_def = nullptr;
+  Status s = OpDefForOp(op_name, &op_def);
   if (!s.ok()) return s;
   std::unique_ptr<AttrTypeMap> m(new AttrTypeMap);
   // TODO(agarwal): Avoid having to create this "registry" at runtime,
   // perhaps can be done at op registration time?
-  for (const auto& attr : op_reg_data->op_def.attr()) {
+  for (const auto& attr : op_def->attr()) {
     string type = attr.type();
     const bool is_list = (type.length() > 6 && type.compare(0, 4, "list") == 0);
     if (is_list) {
@@ -86,22 +96,6 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
   return Status::OK();
 }
 
-Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
-                      TF_AttrType* out, unsigned char* is_list) {
-  auto* t = gtl::FindOrNull(m, attr_name);
-  if (t == nullptr) {
-    return errors::InvalidArgument("Attribute '", attr_name,
-                                   "' does not exist for this operation");
-  }
-  *out = static_cast<TF_AttrType>(*t & ~kIsList);
-  if (*t & kIsList) {
-    *is_list = 1;
-  } else {
-    *is_list = 0;
-  }
-  return Status::OK();
-}
-
 #define DEFINE_SET_ATTR(value_type, value_field)                             \
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
@@ -159,6 +153,22 @@ const NodeDef& AttrBuilder::BuildNodeDef() {
   return *node_def_;
 }
 
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
+                      TF_AttrType* out, unsigned char* is_list) {
+  auto* t = gtl::FindOrNull(m, attr_name);
+  if (t == nullptr) {
+    return errors::InvalidArgument("Attribute '", attr_name,
+                                   "' does not exist for this operation");
+  }
+  *out = static_cast<TF_AttrType>(*t & ~kIsList);
+  if (*t & kIsList) {
+    *is_list = 1;
+  } else {
+    *is_list = 0;
+  }
+  return Status::OK();
+}
+
 namespace {
 inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
                                                const tensorflow::Fprint128& b) {
@@ -174,8 +184,7 @@ void CombineUnordered(const tensorflow::Fprint128& a,
 
 inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s,
                                             const tensorflow::Fprint128& b) {
-  // TODO(agarwal): avoid ToString().
-  tensorflow::Fprint128 a = tensorflow::Fingerprint128(s.ToString());
+  tensorflow::Fprint128 a = tensorflow::Fingerprint128(s);
   return FingerprintCat128(a, b);
 }
 
@@ -203,10 +212,8 @@ tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) const {
     if (node_def_finalized_) return f;
   }
   for (const auto& p : string_attrs_) {
-    // TODO(agarwal): avoid ToString().
-    CombineUnordered(CacheKeyHelper(p.first, tensorflow::Fingerprint128(
-                                                 p.second.ToString())),
-                     &f);
+    CombineUnordered(
+        CacheKeyHelper(p.first, tensorflow::Fingerprint128(p.second)), &f);
   }
   for (const auto& p : int_attrs_) {
     CombineUnordered(CacheKeyHelper(p.first, static_cast<uint64>(p.second)),
@@ -236,93 +243,4 @@ void AttrBuilder::MayBeInitializeNodeDef() {
   }
 }
 
-// static
-Status KernelAndDevice::InitOp(Device* device, const NodeDef& ndef,
-                               KernelAndDevice* out) {
-  OpKernel* k = nullptr;
-  Status s = CreateOpKernel(device->device_type().c_str(), device,
-                            device->GetAllocator(AllocatorAttributes()),
-                            nullptr, ndef, TF_GRAPH_DEF_VERSION, &k);
-  out->device_ = device;
-  out->kernel_.reset(k);
-  out->flib_ = nullptr;
-  return s;
-}
-
-// static
-Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
-                             KernelAndDevice* out) {
-  OpKernel* k = nullptr;
-  Status s = flib->CreateKernel(ndef, &k);
-  out->device_ = flib->device();
-  out->kernel_.reset(k);
-  out->flib_ = flib;
-  return s;
-}
-
-Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
-                            std::vector<Tensor>* output_tensors,
-                            NodeExecStats* stats) {
-  gtl::InlinedVector<TensorValue, 4> inputs;
-  for (Tensor& t : *input_tensors) {
-    inputs.push_back(TensorValue(&t));
-  }
-
-  std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
-  for (size_t i = 0; i < out_attrs.size(); ++i) {
-    out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
-                             tensorflow::HOST_MEMORY);
-  }
-
-  OpKernelContext::Params params;
-  params.device = device_;
-  params.frame_iter = FrameAndIter(0, 0);
-  params.inputs = &inputs;
-  params.op_kernel = kernel_.get();
-  params.resource_manager = device_->resource_manager();
-  params.output_attr_array = gtl::vector_as_array(&out_attrs);
-  params.function_library = flib_;
-  params.slice_reader_cache = &slice_reader_cache_;
-  params.rendezvous = rendez_;
-  if (stats != nullptr) {
-    params.track_allocations = true;
-  }
-  // TODO(apassos): use a thread pool.
-  std::function<void(std::function<void()>)> runner =
-      [](std::function<void()> f) { f(); };
-  params.runner = &runner;
-
-  OpKernelContext context(&params);
-  device_->Compute(kernel_.get(), &context);
-  if (!context.status().ok()) return context.status();
-
-  output_tensors->clear();
-  for (int i = 0; i < context.num_outputs(); ++i) {
-    output_tensors->push_back(Tensor(*context.mutable_output(i)));
-  }
-  if (stats != nullptr) {
-    for (const auto& allocator_pair : context.wrapped_allocators()) {
-      AllocatorMemoryUsed* memory = stats->add_memory();
-      memory->set_allocator_name(allocator_pair.first->Name());
-      auto sizes = allocator_pair.second->GetSizes();
-      memory->set_total_bytes(std::get<0>(sizes));
-      memory->set_peak_bytes(std::get<1>(sizes));
-      memory->set_live_bytes(std::get<2>(sizes));
-
-      AllocatorStats allocator_stats;
-      allocator_pair.first->GetStats(&allocator_stats);
-      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
-      allocator_pair.second->GetRecordsAndUnRef();
-    }
-    auto* ms = stats->mutable_memory_stats();
-    ms->set_temp_memory_size(context.temp_memory_allocated());
-    for (const auto& alloc_id : context.persistent_alloc_ids()) {
-      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
-    }
-
-    ms->set_persistent_memory_size(context.persistent_memory_allocated());
-  }
-  return Status::OK();
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/c/eager/runtime.h
index 4d20b5244a46fcde2eed0a429dced2a77b86aedd..929b1b8296faf61c11c68af06ffc4ca3770ae929 100644
--- a/tensorflow/c/eager/runtime.h
+++ b/tensorflow/c/eager/runtime.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -39,9 +40,16 @@ namespace tensorflow {
 // represent the TF_AttrType type of the values in the list.
 typedef std::unordered_map<string, uint32> AttrTypeMap;
 
+// Look up OpDef for `op_name`.
+Status OpDefForOp(const char* op_name, const OpDef** op_def);
+
 // Returns the AttrTypeMap for the TensorFlow operation named op_name.
 Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
 
+// Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
+                      TF_AttrType* out, unsigned char* is_list);
+
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
@@ -146,47 +154,6 @@ template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name,
                               tensorflow::DataType&& value);
 
-// KernelAndDevice encapsulates an instantiated kernel and the device it is on.
-//
-// Also see:
-// https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
-// and
-// https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h
-class KernelAndDevice {
- public:
-  // Populates 'out' with a kernel appropriate for 'ndef'.
-  //
-  // The provided FunctionLibraryRuntime MUST outlive all calls to
-  // Run() on the returned KernelAndDevice.
-  //
-  // TODO(ashankar): Figure out thread-safety concerns around
-  // FunctionLibraryRuntime (in particular, how the underlying
-  // FunctionLibraryDefinition might be mutated by another thread as new
-  // functions are registered with it).  Conservatively, thread-safe usage of
-  // the FunctionLibraryRuntime is pushed on to the caller (see locking in
-  // c_api.cc).
-  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
-                     KernelAndDevice* out);
-  // TODO(ashankar): Remove this
-  static Status InitOp(Device* device, const NodeDef& ndef,
-                       KernelAndDevice* out);
-
-  KernelAndDevice(tensorflow::Rendezvous* rendez)
-      : device_(nullptr), flib_(nullptr), rendez_(rendez) {}
-
-  // TODO(ashankar): Handle list-valued inputs.
-  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
-             NodeExecStats* stats);
-
-  const OpKernel* kernel() const { return kernel_.get(); }
-
- private:
-  std::unique_ptr<OpKernel> kernel_;
-  Device* device_;
-  FunctionLibraryRuntime* flib_;
-  checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
-  Rendezvous* rendez_;
-};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/c/eager/runtime_test.cc
index 643153058ce3d6f0c88dd23a0dec4c6eff060319..27ebeb0508844ee1ee89e0733b66f6ed129b7757 100644
--- a/tensorflow/c/eager/runtime_test.cc
+++ b/tensorflow/c/eager/runtime_test.cc
@@ -33,27 +33,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-class TestEnv {
- public:
-  TestEnv() : flib_def_(OpRegistry::Global(), {}) {
-    Device* device =
-        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
-    device_mgr_.reset(new DeviceMgr({device}));
-    flib_runtime_ = NewFunctionLibraryRuntime(device_mgr_.get(), Env::Default(),
-                                              device, TF_GRAPH_DEF_VERSION,
-                                              &flib_def_, {}, nullptr);
-  }
-
-  FunctionLibraryRuntime* function_library_runtime() const {
-    return flib_runtime_.get();
-  }
-
- private:
-  FunctionLibraryDefinition flib_def_;
-  std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
-};
-
 TEST(AttrTypeMap, Lookup) {
   const AttrTypeMap* m = nullptr;
   Status s = AttrTypeMapForOp("ThisOpCannotPossiblyExist", &m);
@@ -79,113 +58,5 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_NE(is_list, 0);
 }
 
-TEST(KernelAndDevice, Run) {
-  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
-  std::vector<Tensor> inputs;
-  inputs.push_back(t);
-  inputs.push_back(t);
-  NodeDef ndef(AttrBuilder("MatMul")
-                   .Set("T", DT_FLOAT)
-                   .Set("transpose_a", false)
-                   .Set("transpose_b", false)
-                   .NumInputs(inputs.size())
-                   .BuildNodeDef());
-  TestEnv env;
-  KernelAndDevice kernel(nullptr);
-  Status s =
-      KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel);
-  ASSERT_TRUE(s.ok()) << s;
-  std::vector<Tensor> outputs;
-  s = kernel.Run(&inputs, &outputs, nullptr);
-  ASSERT_TRUE(s.ok()) << s;
-  ASSERT_EQ(1, outputs.size());
-  const Tensor& out = outputs[0];
-  EXPECT_EQ(7, out.matrix<float>()(0, 0));
-  EXPECT_EQ(10, out.matrix<float>()(0, 1));
-  EXPECT_EQ(15, out.matrix<float>()(1, 0));
-  EXPECT_EQ(22, out.matrix<float>()(1, 1));
-}
-
-void BM_CreateGraph(int iters) {
-  for (int i = 0; i < iters; ++i) {
-    Scope root = Scope::NewRootScope();
-    auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
-    auto M = ops::MatMul(root, C, C);
-    TF_CHECK_OK(root.status());
-  }
-}
-BENCHMARK(BM_CreateGraph);
-
-void BM_RunGraph(int iters) {
-  tensorflow::testing::StopTiming();
-  Scope root = Scope::NewRootScope();
-  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
-  auto M = ops::MatMul(root, C, C);
-  SessionOptions opts;
-  opts.config.set_inter_op_parallelism_threads(1);
-  opts.config.set_intra_op_parallelism_threads(1);
-  ClientSession sess(root, opts);
-  std::vector<Tensor> outputs;
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    outputs.clear();
-    TF_CHECK_OK(sess.Run({M}, &outputs));
-  }
-}
-BENCHMARK(BM_RunGraph);
-
-void BM_CreateAndDestroySession(int iters) {
-  tensorflow::testing::StopTiming();
-  Scope root = Scope::NewRootScope();
-  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
-  auto M = ops::MatMul(root, C, C);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    ClientSession sess(root);
-  }
-}
-BENCHMARK(BM_CreateAndDestroySession);
-
-void BM_KernelAndDeviceInit(int iters) {
-  tensorflow::testing::StopTiming();
-  NodeDef ndef(AttrBuilder("MatMul")
-                   .Set("T", DT_FLOAT)
-                   .Set("transpose_a", false)
-                   .Set("transpose_b", false)
-                   .NumInputs(2)
-                   .BuildNodeDef());
-  TestEnv env;
-  KernelAndDevice k(nullptr);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(
-        KernelAndDevice::Init(ndef, env.function_library_runtime(), &k));
-  }
-}
-BENCHMARK(BM_KernelAndDeviceInit);
-
-void BM_KernelAndDeviceRun(int iters) {
-  tensorflow::testing::StopTiming();
-  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
-  std::vector<Tensor> inputs;
-  inputs.push_back(t);
-  inputs.push_back(t);
-  std::vector<Tensor> outputs;
-  NodeDef ndef(AttrBuilder("MatMul")
-                   .Set("T", DT_FLOAT)
-                   .Set("transpose_a", false)
-                   .Set("transpose_b", false)
-                   .NumInputs(inputs.size())
-                   .BuildNodeDef());
-  TestEnv env;
-  KernelAndDevice kernel(nullptr);
-  TF_CHECK_OK(
-      KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel));
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr));
-  }
-}
-BENCHMARK(BM_KernelAndDeviceRun);
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index bdb0815d6b68444ec1c89b835d563db20ce4d8a1..97c323b87228039ba10f4ed5e434aa83621b1220 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -152,6 +152,8 @@ class GradientTape {
                          gtl::ArraySlice<Gradient*> output_gradients,
                          std::vector<Gradient*>* result);
 
+  bool IsPersistent() const { return persistent_; }
+
  private:
   TensorTape tensor_tape_;
   OpTape<BackwardFunction> op_tape_;
@@ -599,23 +601,28 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
   }
   CHECK(state.op_tape.empty());
   result->reserve(source_tensor_ids.size());
+  gtl::FlatSet<int64> used_gradient_ids(source_tensor_ids.size());
   for (auto is : source_tensor_ids) {
     auto grad_it = gradients.find(is);
     if (grad_it == gradients.end()) {
       result->push_back(nullptr);
     } else {
-      if (grad_it->second.size() == 1) {
-        result->push_back(grad_it->second[0]);
-      } else {
-        result->push_back(vspace.AggregateGradients(grad_it->second));
+      if (grad_it->second.size() > 1) {
+        Gradient* grad = vspace.AggregateGradients(grad_it->second);
+        grad_it->second.clear();
+        grad_it->second.push_back(grad);
       }
-      gradients.erase(grad_it);
+      result->push_back(grad_it->second[0]);
+      used_gradient_ids.insert(is);
     }
   }
-  VLOG(1) << "Final gradients size: " << gradients.size();
+  VLOG(1) << "Final gradients size: "
+          << gradients.size() - used_gradient_ids.size();
   for (auto grad_pair : gradients) {
-    for (const auto& g : grad_pair.second) {
-      vspace.DeleteGradient(g);
+    if (used_gradient_ids.find(grad_pair.first) == used_gradient_ids.end()) {
+      for (const auto& g : grad_pair.second) {
+        vspace.DeleteGradient(g);
+      }
     }
   }
   return Status::OK();
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 6e37cdb5f4beea53d4a2ded0705ae482d0bc2d68..e18fdf6c57bd3f432d8cb73536fb816df90b3963 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/python_api.h"
 
 #include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/python/framework/cpp_shape_inference.pb.h"
 
 namespace tensorflow {
 
@@ -99,4 +100,65 @@ void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op) {
   }
 }
 
+void SetRequireShapeInferenceFns(TF_Graph* graph, bool require) {
+  mutex_lock l(graph->mu);
+  graph->refiner.set_require_shape_inference_fns(require);
+}
+
+void ExtendSession(TF_Session* session, TF_Status* status) {
+  ExtendSessionGraphHelper(session, status);
+  session->extend_before_run = false;
+}
+
+std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
+  Node* node = &output.oper->node;
+  CppShapeInferenceResult::HandleData handle_data;
+  handle_data.set_is_set(true);
+  {
+    mutex_lock l(graph->mu);
+    tensorflow::shape_inference::InferenceContext* ic =
+        graph->refiner.GetContext(node);
+    CHECK(ic != nullptr);
+    CHECK_LT(output.index, ic->num_outputs());
+    const auto* shapes_and_types =
+        ic->output_handle_shapes_and_types(output.index);
+    if (shapes_and_types == nullptr) return "";
+
+    for (const auto& p : *shapes_and_types) {
+      auto* out_shape_and_type = handle_data.add_shape_and_type();
+      ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape());
+      out_shape_and_type->set_dtype(p.dtype);
+    }
+  }
+  string result;
+  handle_data.SerializeToString(&result);
+  return result;
+}
+
+void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                   const void* proto, size_t proto_len,
+                                   TF_Status* status) {
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  if (!handle_data.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Couldn't deserialize HandleData proto");
+    return;
+  }
+  DCHECK(handle_data.is_set());
+
+  tensorflow::mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(&output.oper->node);
+
+  std::vector<tensorflow::shape_inference::ShapeAndType> shapes_and_types;
+  for (const auto& shape_and_type_proto : handle_data.shape_and_type()) {
+    tensorflow::shape_inference::ShapeHandle shape;
+    status->status =
+        ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
+    if (status->status.ok()) return;
+    shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
+  }
+  ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index aa9d9e06b28c54cb8869eb547d36ee3cb0d4e6b8..4bcb5bde62c8a4df4e68c1ce0daaf459434ceb5d 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_C_PYTHON_API_H_
 #define TENSORFLOW_C_PYTHON_API_H_
 
+#include <string>
+
 #include "tensorflow/c/c_api.h"
 
 // These functions can be removed without notice. They exist to facilitate some
@@ -37,6 +39,31 @@ void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
 
 void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op);
 
+// Sets whether ops missing a shape inference function should trigger an
+// error. The default is true.
+void SetRequireShapeInferenceFns(TF_Graph* graph, bool require);
+
+// Extends `session` with any new operations added to its associated graph.
+// Usually this happens automatically in TF_SessionRun. After this is called,
+// TF_SessionRun will no longer extend the session on every call.
+//
+// We expose this here to allow fine-grained synchronization in multi-threaded
+// workloads, which is required since the Python implementation depends on the
+// above mutation methods. This allows us to prevent modifications to nodes in
+// the graph after the session has been made aware of them.
+void ExtendSession(TF_Session* session, TF_Status* status);
+
+// Returns the serialized CppShapeInferenceResult::HandleData proto for
+// `output` if its a resource tensor, or otherwise returns the empty string.
+std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);
+
+// Sets `output` based on `proto`, which should be a serialized
+// CppShapeInferenceResult::HandleData proto.
+// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string
+// because I couldn't get SWIG to work otherwise.
+void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                   const void* proto, size_t proto_len,
+                                   TF_Status* status);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/c/testdata/tf_record b/tensorflow/c/testdata/tf_record
new file mode 100644
index 0000000000000000000000000000000000000000..6e16076bfb79ad8151952e96567565e8820b0f5b
Binary files /dev/null and b/tensorflow/c/testdata/tf_record differ
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 9060c19e9d2cf965c2b9be07be07c42017da45a8..079e063d3e3fbdaf833e9031f5f9438853c14099 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -620,18 +620,6 @@ tf_cc_binary(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "queue_runner",
     srcs = ["training/queue_runner.cc"],
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index a40ad1ffc3b262840e6ca0043139b1b61e04510d..d73121c7b701ec06c03836d1a765f4b35d88fe92 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb_text.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
@@ -697,7 +698,8 @@ string OpInfo::GetOpAttrStruct() const {
     attr_comment = MakeComment(attr_comment, "    ");
 
     strings::StrAppend(&setters, attr_comment);
-    strings::StrAppend(&setters, "    Attrs ", attr_func_def, " x) {\n");
+    strings::StrAppend(&setters, "    TF_MUST_USE_RESULT Attrs ", attr_func_def,
+                       " x) {\n");
     strings::StrAppend(&setters, "      Attrs ret = *this;\n");
     strings::StrAppend(&setters, "      ret.", api_def_attr.rename_to(),
                        "_ = x;\n");
diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc
index 1e0f2d241bb350897a840dda90d6d0c009b1daad..5d9dfd95a5538ae0f3d2d111a1f989552c3363b8 100644
--- a/tensorflow/cc/framework/cc_op_gen_test.cc
+++ b/tensorflow/cc/framework/cc_op_gen_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -61,12 +62,12 @@ op {
 )";
 
 void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(s.contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
 void ExpectDoesNotHaveSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_FALSE(s.contains(expected))
+  EXPECT_FALSE(str_util::StrContains(s, expected))
       << "'" << s << "' contains '" << expected << "'";
 }
 
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 71642492627422e09c19b7bcb4dc522846cf08b1..c143b978338815ebc7134eb0a07867c5d8b13dca 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -218,7 +219,7 @@ std::unordered_set<string> Scope::Impl::GetColocationConstraints(
   if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) {
     for (const string& entry : node_constraints) {
       StringPiece s(entry);
-      if (s.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&s, kColocationGroupPrefix)) {
         current_constraints.insert(s.ToString());
       }
     }
diff --git a/tensorflow/cc/framework/while_gradients.cc b/tensorflow/cc/framework/while_gradients.cc
index 0734075fc6144d7c9f4fdb48c5e097faa58b8355..81870a0efa309ae6dbd5cc05a5dbe8c3e2d437c8 100644
--- a/tensorflow/cc/framework/while_gradients.cc
+++ b/tensorflow/cc/framework/while_gradients.cc
@@ -72,9 +72,9 @@ Status AddForwardLoopCounter(WhileContext* while_ctx, const Scope& scope,
   };
 
   // Body function that adds one to input.
-  BodyGraphBuilderFn body_fn = [while_ctx](const Scope& scope,
-                                           const std::vector<Output>& inputs,
-                                           std::vector<Output>* outputs) {
+  BodyGraphBuilderFn body_fn = [](const Scope& scope,
+                                  const std::vector<Output>& inputs,
+                                  std::vector<Output>* outputs) {
     DCHECK_EQ(inputs.size(), 1);
     outputs->emplace_back(ops::Add(scope, inputs[0], 1));
     return scope.status();
diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 6545e4ee3eb406436937a43ddac66d017af8e108..ff348fadb24e29a83bd6c8853aa67931f6df4182 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad);
 
+Status StridedSliceGradHelper(const Scope& scope, const Operation& op,
+                              const std::vector<Output>& grad_inputs,
+                              std::vector<Output>* grad_outputs) {
+  Input x = Shape(scope, op.input(0));
+  Input begin = op.input(1);
+  Input end = op.input(2);
+  Input strides = op.input(3);
+  int64 begin_mask;
+  int64 end_mask;
+  int64 ellipsis_mask;
+  int64 new_axis_mask;
+  int64 shrink_axis_mask;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask));
+  grad_outputs->push_back(
+      StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0],
+                       StridedSliceGrad::BeginMask(begin_mask)
+                           .EndMask(end_mask)
+                           .EllipsisMask(ellipsis_mask)
+                           .NewAxisMask(new_axis_mask)
+                           .ShrinkAxisMask(shrink_axis_mask)));
+  // No gradients returned for begin, end and strides
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 4a215fcc9299cf8b8da04cbf151640631ed0d449..de3bd0fc9e2493f8ff76163f5be6bd4327c58c5a 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) {
   RunTest(x, x_shape, y, y_shape);
 }
 
+TEST_F(ArrayGradTest, StridedSliceGrad) {
+  TensorShape x_shape({6, 4, 4});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+
+  // y = x[2:6:2, 1:3, 1:3]
+  auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1});
+  // y.shape = [2, 2, 2];
+  RunTest(x, x_shape, y, {2, 2, 2});
+
+  // y = x[2:6:2, 1:3, 1:3]
+  // begin_mask = 1<<1 (ignore begin_index = 1)
+  // end_mask = 1<<2 (ignore end_index = 2)
+  y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
+                   StridedSlice::BeginMask(1 << 1).EndMask(1 << 2));
+  // y.shape = [2, 3, 3];
+  RunTest(x, x_shape, y, {2, 3, 3});
+
+  // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
+  y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
+                   StridedSlice::NewAxisMask(1 << 0));
+  // y.shape = [1, 2, 2, 2];
+  RunTest(x, x_shape, y, {1, 2, 2, 2});
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 13a3bba5e6d5ca19ff3f0eca76665ba7d3ab628d..0cb3132e94e381f672d69aefe4a199d2b590830c 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -48,8 +48,8 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
 Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
-                   const std::vector<Output>& grad_inputs,
-                   std::vector<Output>* grad_outputs) {
+                      const std::vector<Output>& grad_inputs,
+                      std::vector<Output>* grad_outputs) {
   auto softmax = Exp(scope, op.output(0));
   auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true));
   auto mul = Mul(scope, sum, softmax);
@@ -107,11 +107,10 @@ Status BiasAddGradHelper(const Scope& scope, const Operation& op,
                          const std::vector<Output>& grad_inputs,
                          std::vector<Output>* grad_outputs) {
   string data_format;
-  BiasAddGrad::Attrs input_attrs;
   TF_RETURN_IF_ERROR(
       GetNodeAttr(op.output(0).node()->attrs(), "data_format", &data_format));
-  input_attrs.DataFormat(data_format);
-  auto dx_1 = BiasAddGrad(scope, grad_inputs[0], input_attrs);
+  auto dx_1 =
+      BiasAddGrad(scope, grad_inputs[0], BiasAddGrad::DataFormat(data_format));
   grad_outputs->push_back(Identity(scope, grad_inputs[0]));
   grad_outputs->push_back(dx_1);
   return scope.status();
@@ -130,19 +129,16 @@ Status Conv2DGrad(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "use_cudnn_on_gpu", &use_cudnn_on_gpu));
-  Conv2DBackpropInput::Attrs input_attrs;
-  input_attrs.DataFormat(data_format);
-  input_attrs.UseCudnnOnGpu(use_cudnn_on_gpu);
-  auto dx_1 = Conv2DBackpropInput(scope, Shape(scope, op.input(0)),
-                                  op.input(1), grad_inputs[0],
-                                  strides, padding, input_attrs);
+  auto dx_1 = Conv2DBackpropInput(scope, Shape(scope, op.input(0)), op.input(1),
+                                  grad_inputs[0], strides, padding,
+                                  Conv2DBackpropInput::DataFormat(data_format)
+                                      .UseCudnnOnGpu(use_cudnn_on_gpu));
   grad_outputs->push_back(dx_1);
-  Conv2DBackpropFilter::Attrs filter_attrs;
-  filter_attrs.DataFormat(data_format);
-  filter_attrs.UseCudnnOnGpu(use_cudnn_on_gpu);
-  auto dx_2 = Conv2DBackpropFilter(scope, op.input(0),
-                                   Shape(scope, op.input(1)), grad_inputs[0],
-                                   strides, padding, filter_attrs);
+  auto dx_2 =
+      Conv2DBackpropFilter(scope, op.input(0), Shape(scope, op.input(1)),
+                           grad_inputs[0], strides, padding,
+                           Conv2DBackpropFilter::DataFormat(data_format)
+                               .UseCudnnOnGpu(use_cudnn_on_gpu));
   grad_outputs->push_back(dx_2);
   return scope.status();
 }
@@ -160,13 +156,9 @@ Status MaxPoolGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
-  internal::MaxPoolGrad::Attrs grad_attrs;
-  grad_attrs.DataFormat(data_format);
-  auto dx = internal::MaxPoolGrad(scope, op.input(0),
-                                  op.output(0),
-                                  grad_inputs[0],
-                                  ksize, strides,
-                                  padding, grad_attrs);
+  auto dx = internal::MaxPoolGrad(
+      scope, op.input(0), op.output(0), grad_inputs[0], ksize, strides, padding,
+      internal::MaxPoolGrad::DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -180,15 +172,9 @@ Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
   auto attrs = op.output(0).node()->attrs();
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
-  MaxPoolGradV2::Attrs grad_attrs;
-  grad_attrs.DataFormat(data_format);
-  auto dx = MaxPoolGradV2(scope, op.input(0),
-                          op.output(0),
-                          grad_inputs[0],
-                          op.input(1),
-                          op.input(2),
-                          padding,
-                          grad_attrs);
+  auto dx = MaxPoolGradV2(scope, op.input(0), op.output(0), grad_inputs[0],
+                          op.input(1), op.input(2), padding,
+                          MaxPoolGradV2::DataFormat(data_format));
   grad_outputs->push_back(dx);
   grad_outputs->push_back(NoGradient());
   grad_outputs->push_back(NoGradient());
@@ -196,13 +182,74 @@ Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper);
 
+Status MaxPool3DGradHelper(const Scope& scope, const Operation& op,
+                           const std::vector<Output>& grad_inputs,
+                           std::vector<Output>* grad_outputs) {
+  std::vector<int32> ksize;
+  std::vector<int32> strides;
+  string padding;
+  string data_format;
+  auto attrs = op.output(0).node()->attrs();
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+  MaxPool3DGrad::Attrs grad_attrs;
+  auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0],
+                          ksize, strides, padding,
+                          grad_attrs.DataFormat(data_format));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("MaxPool3D", MaxPool3DGradHelper);
+
+Status AvgPoolGradHelper(const Scope& scope, const Operation& op,
+                         const std::vector<Output>& grad_inputs,
+                         std::vector<Output>* grad_outputs) {
+  std::vector<int32> ksize;
+  std::vector<int32> strides;
+  string padding;
+  string data_format;
+  auto attrs = op.output(0).node()->attrs();
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+  internal::AvgPoolGrad::Attrs grad_attrs;
+  auto dx =
+      internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
+                            ksize, strides, padding,
+                            grad_attrs.DataFormat(data_format));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("AvgPool", AvgPoolGradHelper);
+
+Status AvgPool3DGradHelper(const Scope& scope, const Operation& op,
+                           const std::vector<Output>& grad_inputs,
+                           std::vector<Output>* grad_outputs) {
+  std::vector<int32> ksize;
+  std::vector<int32> strides;
+  string padding;
+  string data_format;
+  auto attrs = op.output(0).node()->attrs();
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+  AvgPool3DGrad::Attrs grad_attrs;
+  auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
+                          ksize, strides, padding,
+                          grad_attrs.DataFormat(data_format));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("AvgPool3D", AvgPool3DGradHelper);
+
 Status LRNGradHelper(const Scope& scope, const Operation& op,
                      const std::vector<Output>& grad_inputs,
-                     std::vector<Output>* grad_outputs){
-  internal::LRNGrad::Attrs grad_attrs;
-
-  auto dx = internal::LRNGrad(scope, grad_inputs[0], op.input(0), op.output(0),
-                              grad_attrs);
+                     std::vector<Output>* grad_outputs) {
+  auto dx = internal::LRNGrad(scope, grad_inputs[0], op.input(0), op.output(0));
   grad_outputs->push_back(dx);
   return scope.status();
 }
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 0cfe5f6e3c49f7c4a3cafbf48ff4e54a0ffd0d47..c4eba7ecb017fe4628140d75a63bc7f0f09deb7f 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -31,8 +31,11 @@ using ops::Elu;
 using ops::L2Loss;
 using ops::LogSoftmax;
 using ops::LRN;
+using ops::AvgPool;
+using ops::AvgPool3D;
 using ops::MaxPool;
 using ops::MaxPoolV2;
+using ops::MaxPool3D;
 using ops::Placeholder;
 using ops::Relu;
 using ops::Relu6;
@@ -70,9 +73,9 @@ class NNGradTest : public ::testing::Test {
 
   // Sets tensor with random values, ensuring that the max value is largest by
   // a reasonable amount.
-  // This is an issue for MaxPool and MaxPoolV2, in which perturbations by the
-  // numeric gradient computation in the gradient checker can change the max
-  // value if values are too close together.
+  // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which
+  // perturbations by the numeric gradient computation in the gradient checker
+  // can change the max value if values are too close together.
   template <typename T>
   void SetRandomValuesWithBumpedMax(Tensor* tensor) {
     auto tensor_flat = tensor->flat<T>();
@@ -203,6 +206,41 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
   RunTest(x, x_init_value, y, y_shape);
 }
 
+TEST_F(NNGradTest, MaxPool3DGradHelper) {
+  TensorShape x_shape({1, 3, 3, 3, 1});
+  TensorShape y_shape({1, 1, 1, 1, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  // Setup window and strides so that we only do one MaxPool3D.
+  const std::vector<int> ksize{1, 3, 3, 3, 1};
+  const std::vector<int> strides{1, 3, 3, 3, 1};
+  auto y = MaxPool3D(scope_, x, ksize, strides, "VALID");
+  Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
+  SetRandomValuesWithBumpedMax<float>(&x_init_value);
+  RunTest(x, x_init_value, y, y_shape);
+}
+
+TEST_F(NNGradTest, AvgPoolGradHelper) {
+  TensorShape x_shape({1, 2, 2, 1});
+  TensorShape y_shape({1, 1, 1, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  // Setup window and strides so that we only do one AvgPool.
+  const std::vector<int> ksize{1, 2, 2, 1};
+  const std::vector<int> strides{1, 2, 2, 1};
+  auto y = AvgPool(scope_, x, ksize, strides, "SAME");
+  RunTest(x, x_shape, y, y_shape);
+}
+
+TEST_F(NNGradTest, AvgPool3DGradHelper) {
+  TensorShape x_shape({1, 3, 3, 3, 1});
+  TensorShape y_shape({1, 1, 1, 1, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  // Setup window and strides so that we only do one AvgPool3D.
+  const std::vector<int> ksize{1, 3, 3, 3, 1};
+  const std::vector<int> strides{1, 3, 3, 3, 1};
+  auto y = AvgPool3D(scope_, x, ksize, strides, "SAME");
+  RunTest(x, x_shape, y, y_shape);
+}
+
 TEST_F(NNGradTest, LRN){
   TensorShape x_shape({1, 1, 2, 1});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index 00799526fce572e7bb80199ccb8ce1cc89874031..cf65fe1ab99b49207a64e86310178141b30d07d7 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -9,6 +9,9 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
+    tags = [
+        "noguitar",  # b/77649654
+    ],
     deps = [
         ":profiler",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/cc/profiler/profiler.h b/tensorflow/cc/profiler/profiler.h
index 6077c45c5854fd5812ccb7c91522f93ed4e54883..64edbb5766c3604fbe0f15c2299843718381aa3f 100644
--- a/tensorflow/cc/profiler/profiler.h
+++ b/tensorflow/cc/profiler/profiler.h
@@ -61,18 +61,18 @@ class Profiler {
   /// Adds tracing information `run_meta` to profiler. A `run_meta` is
   /// generated by a TensorFlow session run call. `step` is the key
   /// to the `run_meta`. When calling ProfileXXX methods, caller can specify
-  /// `step` in `options` to seletively profile the corresponding `run_meta`.
+  /// `step` in `options` to selectively profile the corresponding `run_meta`.
   /// Multiple different `run_meta` can be keyed by the same `step` in order
   /// to group them together.
   void AddStep(int64 step, const RunMetadata& run_meta);
 
   /// Profiles the model by organizing nodes in graph structure.
-  /// Each node is an op and the nodes are contected by the op inputs/outputs.
+  /// Each node is an op and the nodes are connected by the op inputs/outputs.
   GraphNodeProto ProfileGraph(const Options& options);
 
   /// Profiles the model by organizing nodes in name scope structure.
   /// Each node is an op, and nodes are organized by the ops' name
-  /// scope, similar to a filesystem tree.
+  /// scope, similar to a file system tree.
   /// E.g. /foo is the root of operation /foo/matmul_1 and foo/conv_2.
   GraphNodeProto ProfileNameScope(const Options& options);
 
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index d29ad3ebcbe29087d5572b51c7713e0c98d0d840..06a3be18e08f611d3ecf9804908d791d15fdab13 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -94,18 +94,3 @@ filegroup(
         "testdata/half_plus_two/**",
     ]),
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 4c64d2cfe3c10e6c7ed82a2d72460a0b34283bb2..72b8bc18710b0ee77cb01ed3ad0c2abb5183efb2 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -133,9 +134,9 @@ TEST_F(LoaderTest, NoTagMatch) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {"missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message())
-                  .contains("Could not find meta graph def matching supplied "
-                            "tags: { missing-tag }"))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: { missing-tag }"))
       << st.error_message();
 }
 
@@ -149,9 +150,9 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe, "missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(
-      StringPiece(st.error_message())
-          .contains("Could not find meta graph def matching supplied tags: "))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: "))
       << st.error_message();
 }
 
@@ -169,7 +170,7 @@ TEST_F(LoaderTest, SessionCreationFailure) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+  EXPECT_TRUE(str_util::StrContains(st.error_message(), kInvalidTarget))
       << st.error_message();
 }
 
diff --git a/tensorflow/cc/saved_model/python/BUILD b/tensorflow/cc/saved_model/python/BUILD
index f5fbc75edcba9d5ae9ef7432de224df766bcab9e..6f04ebdc55cda329527c95f62efc37c8dfbb4ae5 100644
--- a/tensorflow/cc/saved_model/python/BUILD
+++ b/tensorflow/cc/saved_model/python/BUILD
@@ -7,18 +7,6 @@ package(
     default_visibility = ["//visibility:public"],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_py_clif_cc")
 
 tf_py_clif_cc(
diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
index 97f66e79b8ad9f383b22f56e9385fc6d2080e1f8..6f1c87354076565af22f7ba0610a5c6bb999d25c 100644
--- a/tensorflow/cc/tools/BUILD
+++ b/tensorflow/cc/tools/BUILD
@@ -32,6 +32,7 @@ tf_cc_test(
     deps = [
         ":freeze_saved_model",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:protos_all_cc",
@@ -40,18 +41,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc
index ddf372cdef21e1b3892c9a03714478d5a5785517..4ddddcb5863c9ffb1e5367db750b0d2ffd29cd5e 100644
--- a/tensorflow/cc/tools/freeze_saved_model.cc
+++ b/tensorflow/cc/tools/freeze_saved_model.cc
@@ -75,16 +75,13 @@ void GetNodeNameToNodeDefMap(
 // variable nodes to convert.
 void GetReachableNodesAndVariables(
     GraphDef* graph_def, const std::unordered_set<string>& outputs,
+    const std::unordered_map<string, NodeDef*>& name_to_node_map,
     std::unordered_set<string>* reachable_node_names,
     std::unordered_set<string>* variable_node_names) {
   // TODO(suharshs): Add support for ResourceVariables.
   static const std::unordered_set<string>* kVariableTypes =
-      new std::unordered_set<string>({"Variable", "VariableV2"});
-  // name_to_node_map is needed to get the inputs from the NodeDef corresponding
-  // the a string node name. These inputs are used when doing our backwards
-  // traversal.
-  std::unordered_map<string, NodeDef*> name_to_node_map;
-  GetNodeNameToNodeDefMap(graph_def, &name_to_node_map);
+      new std::unordered_set<string>({"Variable", "VariableV2", "VarHandleOp"});
+
   std::queue<string> nodes_to_visit;
   for (const string& tensor_name : outputs) {
     // We need to strip off the tensor part to get the node name.
@@ -99,7 +96,7 @@ void GetReachableNodesAndVariables(
       continue;
     }
     reachable_node_names->insert(node_name);
-    NodeDef* node = name_to_node_map[node_name];
+    NodeDef* node = name_to_node_map.at(node_name);
     if (kVariableTypes->find(node->op()) != kVariableTypes->end()) {
       variable_node_names->insert(node->name());
     }
@@ -111,7 +108,9 @@ void GetReachableNodesAndVariables(
 
 // Gets a map from variable name to variable value.
 Status GetVariableNameToTensorMap(
-    Session* session, std::unordered_set<string> variable_names_set,
+    Session* session,
+    const std::unordered_map<string, NodeDef*>& name_to_node_map,
+    std::unordered_set<string> variable_names_set,
     std::unordered_map<string, Tensor>* variable_name_to_value_map) {
   if (variable_names_set.empty()) {
     return Status::OK();
@@ -120,8 +119,14 @@ Status GetVariableNameToTensorMap(
   std::vector<string> tensor_names;
   for (const string& node_name : variable_names_set) {
     variable_names.push_back(node_name);
-    // We need to run tensors, so append ":0".
-    tensor_names.push_back(node_name + ":0");
+    NodeDef* node_def = name_to_node_map.at(node_name);
+    if (node_def->op() == "VarHandleOp") {
+      // If this is a resource variable, we have to run the corresponding
+      // ReadVariableOp.
+      tensor_names.push_back(node_name + "/Read/ReadVariableOp:0");
+    } else {
+      tensor_names.push_back(node_name + ":0");
+    }
   }
   std::vector<Tensor> outputs;
   TF_RETURN_IF_ERROR(
@@ -143,6 +148,15 @@ void ConvertVariableToConstant(const NodeDef& variable_node,
       (*const_node->mutable_attr())["value"].mutable_tensor());
 }
 
+// Converts a ReadVariableOp NodeDef to an Identity NodeDef.
+void ConvertReadVariableOpToIdentity(const NodeDef& node,
+                                     NodeDef* identity_node) {
+  identity_node->set_name(node.name());
+  identity_node->set_op("Identity");
+  (*identity_node->mutable_attr())["T"] = node.attr().at("dtype");
+  identity_node->add_input(node.input(0));
+}
+
 // Freezes the subgraph of all nodes needed by `outputs`.
 Status FreezeGraphDef(const SavedModelBundle& saved_model_bundle,
                       const std::unordered_set<string>& outputs,
@@ -155,14 +169,19 @@ Status FreezeGraphDef(const SavedModelBundle& saved_model_bundle,
   if (graph_def.node_size() == 0) {
     return Status::OK();
   }
+  // name_to_node_map is needed to get the inputs from the NodeDef corresponding
+  // the a string node name. These inputs are used when doing our backwards
+  // traversal.
+  std::unordered_map<string, NodeDef*> name_to_node_map;
+  GetNodeNameToNodeDefMap(&graph_def, &name_to_node_map);
   std::unordered_set<string> reachable_node_names;
   std::unordered_set<string> variable_node_names;
-  GetReachableNodesAndVariables(&graph_def, outputs, &reachable_node_names,
-                                &variable_node_names);
+  GetReachableNodesAndVariables(&graph_def, outputs, name_to_node_map,
+                                &reachable_node_names, &variable_node_names);
   std::unordered_map<string, Tensor> variable_to_value_map;
-  TF_RETURN_IF_ERROR(
-      GetVariableNameToTensorMap(saved_model_bundle.session.get(),
-                                 variable_node_names, &variable_to_value_map));
+  TF_RETURN_IF_ERROR(GetVariableNameToTensorMap(
+      saved_model_bundle.session.get(), name_to_node_map, variable_node_names,
+      &variable_to_value_map));
   // We copy the nodes in the same order they were in the original graph_def.
   for (const NodeDef& node : graph_def.node()) {
     if (reachable_node_names.find(node.name()) == reachable_node_names.end()) {
@@ -171,6 +190,12 @@ Status FreezeGraphDef(const SavedModelBundle& saved_model_bundle,
     if (variable_node_names.find(node.name()) != variable_node_names.end()) {
       ConvertVariableToConstant(node, variable_to_value_map[node.name()],
                                 frozen_graph_def->add_node());
+    } else if (node.op() == "ReadVariableOp" &&
+               variable_node_names.find(node.input(0)) !=
+                   variable_node_names.end()) {
+      // If the node is a ReadVariableOp, its input VarHandleOp will be
+      // converted to a Constant, so we will need to convert it to an Identity.
+      ConvertReadVariableOpToIdentity(node, frozen_graph_def->add_node());
     } else {
       // If the node isn't a variable, just copy the node as-is.
       *frozen_graph_def->add_node() = node;
diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc
index 52a81a50284aec36bba4e56a0232c886cb0cb6cf..cd35fd3b95deec669218cfa4f25fea2c3ac9e56e 100644
--- a/tensorflow/cc/tools/freeze_saved_model_test.cc
+++ b/tensorflow/cc/tools/freeze_saved_model_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/cc/tools/freeze_saved_model.h"
 
+#include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -113,6 +114,160 @@ class FreezeTest : public ::testing::Test {
 
     test::ExpectTensorEqual<float>(unfrozen_outputs[0], frozen_outputs[0]);
   }
+
+  void TestFreezeGraphWithoutDependentVariables(bool use_resource) {
+    // Test freezing a graph with variables that are not needed by the outputs
+    // in the SignatureDef. The resulting graph shouldn't be frozen, but
+    // non-dependent nodes should be pruned.
+    SavedModelBundle saved_model_bundle;
+    GraphDef graph_def;
+    Scope scope = Scope::NewRootScope();
+    Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+    Output b = ops::Const(scope.WithOpName("b"), 10.0f, {});
+    Output c = ops::Mul(scope.WithOpName("c"), a, b);
+    if (use_resource) {
+      Output var =
+          ops::VarHandleOp(scope.WithOpName("var"), DataType::DT_FLOAT, {});
+      Output read_var = ops::ReadVariableOp(
+          scope.WithOpName("var/Read/ReadVariableOp"), var, DataType::DT_FLOAT);
+      auto assign = ops::AssignVariableOp(scope.WithOpName("assign"), var, a);
+    } else {
+      Output var =
+          ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
+      Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
+    }
+
+    TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+    // "c" isnt dependent on the variable, so nothing should be frozen.
+    TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
+        graph_def, {"c:0"}, "assign", &saved_model_bundle));
+
+    GraphDef frozen_graph_def;
+    std::unordered_set<string> inputs;
+    std::unordered_set<string> outputs;
+    TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def,
+                                  &inputs, &outputs));
+
+    GraphDef expected_graph_def;
+    Scope expected_scope = Scope::NewRootScope();
+    Output expected_a = ops::Const(expected_scope.WithOpName("a"), 10.0f, {});
+    Output expected_b = ops::Const(expected_scope.WithOpName("b"), 10.0f, {});
+    Output expected_c =
+        ops::Mul(expected_scope.WithOpName("c"), expected_a, expected_b);
+    TF_ASSERT_OK(expected_scope.ToGraphDef(&expected_graph_def));
+
+    GraphDefEqual(frozen_graph_def, expected_graph_def);
+
+    RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
+                                         frozen_graph_def, "c:0");
+  }
+
+  void TestFreezeGraphWithDependentVariables(bool use_resource) {
+    // Test freezing a graph with variables that are needed by outputs in the
+    // SignatureDef. The variables should be frozen.
+    SavedModelBundle saved_model_bundle;
+    GraphDef graph_def;
+    Scope scope = Scope::NewRootScope();
+    Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+    Output read_var;
+    if (use_resource) {
+      Output var =
+          ops::VarHandleOp(scope.WithOpName("var"), DataType::DT_FLOAT, {});
+      read_var = ops::ReadVariableOp(
+          scope.WithOpName("var/Read/ReadVariableOp"), var, DataType::DT_FLOAT);
+      auto assign = ops::AssignVariableOp(scope.WithOpName("assign"), var, a);
+    } else {
+      Output read_var =
+          ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
+      Output assign = ops::Assign(scope.WithOpName("assign"), read_var, a);
+    }
+    Output c = ops::Mul(scope.WithOpName("c"), a, read_var);
+    TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+    // "c" isnt dependent on the variable, so nothing should be frozen.
+    TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
+        graph_def, {"c:0"}, "assign", &saved_model_bundle));
+
+    GraphDef frozen_graph_def;
+    std::unordered_set<string> inputs;
+    std::unordered_set<string> outputs;
+    TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def,
+                                  &inputs, &outputs));
+
+    // If using normal variables there should be 3 nodes in the resulting
+    // graph_def. If using resource variables there should be 4 nodes in the
+    // resulting graph_def.
+    // In both cases, none should be variables.
+    size_t expected_nodes = use_resource ? 4 : 3;
+    EXPECT_EQ(frozen_graph_def.node_size(), expected_nodes);
+    for (const NodeDef& node : frozen_graph_def.node()) {
+      EXPECT_NE(node.op(), "Variable") << node.name();
+      EXPECT_NE(node.op(), "VariableV2") << node.name();
+      EXPECT_NE(node.op(), "VarHandleOp") << node.name();
+      EXPECT_NE(node.op(), "ReadVariableOp") << node.name();
+    }
+
+    RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
+                                         frozen_graph_def, "c:0");
+  }
+
+  void TestFreezeGraphWithAndWithoutDependentVariables(bool use_resource) {
+    // Test freezing a graph with some variables that are needed and not needed
+    // by
+    // the outputs in the SignatureDef. The resulting graph should only freeze
+    // dependent variables.
+    SavedModelBundle saved_model_bundle;
+    GraphDef graph_def;
+    Scope scope = Scope::NewRootScope();
+    Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+    Output read_var;
+
+    if (use_resource) {
+      Output var =
+          ops::VarHandleOp(scope.WithOpName("var"), DataType::DT_FLOAT, {});
+      read_var = ops::ReadVariableOp(
+          scope.WithOpName("var/Read/ReadVariableOp"), var, DataType::DT_FLOAT);
+      auto assign = ops::AssignVariableOp(scope.WithOpName("assign"), var, a);
+      Output var_1 =
+          ops::VarHandleOp(scope.WithOpName("var_1"), DataType::DT_FLOAT, {});
+      Output read_var_1 =
+          ops::ReadVariableOp(scope.WithOpName("var_1/Read/ReadVariableOp"),
+                              var, DataType::DT_FLOAT);
+      auto assign_1 =
+          ops::AssignVariableOp(scope.WithOpName("assign_1"), var_1, a);
+    } else {
+      read_var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
+      Output assign = ops::Assign(scope.WithOpName("assign"), read_var, a);
+      Output var_1 =
+          ops::Variable(scope.WithOpName("var_1"), {}, DataType::DT_FLOAT);
+      Output assign_1 = ops::Assign(scope.WithOpName("assign_1"), var_1, a);
+    }
+
+    Output c = ops::Mul(scope.WithOpName("c"), a, read_var);
+    TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+    // "c" isnt dependent on the variable, so nothing should be frozen.
+    TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
+        graph_def, {"c:0"}, "assign", &saved_model_bundle));
+
+    GraphDef frozen_graph_def;
+    std::unordered_set<string> inputs;
+    std::unordered_set<string> outputs;
+    TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def,
+                                  &inputs, &outputs));
+
+    // There should be 3 nodes in the resulting graph_def, and none should be
+    // variables.
+    size_t expected_nodes = use_resource ? 4 : 3;
+    EXPECT_EQ(frozen_graph_def.node_size(), expected_nodes);
+    for (const NodeDef& node : frozen_graph_def.node()) {
+      EXPECT_NE(node.op(), "Variable") << node.name();
+      EXPECT_NE(node.op(), "VariableV2") << node.name();
+      EXPECT_NE(node.op(), "VarHandleOp") << node.name();
+      EXPECT_NE(node.op(), "ReadVariableOp") << node.name();
+    }
+
+    RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
+                                         frozen_graph_def, "c:0");
+  }
 };
 
 TEST_F(FreezeTest, InputsAndOutputsSingleSignatureDef) {
@@ -196,111 +351,28 @@ TEST_F(FreezeTest, GraphDefWithNoVariables) {
   GraphDefEqual(frozen_graph_def, graph_def);
 }
 
-TEST_F(FreezeTest, GraphDefWithVariablesNotNeededByOutputs) {
-  // Test freezing a graph with variables that are not needed by the outputs in
-  // the SignatureDef. The resulting graph shouldn't be frozen, but
-  // non-dependent nodes should be pruned.
-  SavedModelBundle saved_model_bundle;
-  GraphDef graph_def;
-  Scope scope = Scope::NewRootScope();
-  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
-  Output b = ops::Const(scope.WithOpName("b"), 10.0f, {});
-  Output c = ops::Mul(scope.WithOpName("c"), a, b);
-  Output var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
-  Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
-  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
-  // "c" isnt dependent on the variable, so nothing should be frozen.
-  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
-      graph_def, {"c:0"}, assign.name(), &saved_model_bundle));
-
-  GraphDef frozen_graph_def;
-  std::unordered_set<string> inputs;
-  std::unordered_set<string> outputs;
-  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
-                                &outputs));
-
-  GraphDef expected_graph_def;
-  Scope expected_scope = Scope::NewRootScope();
-  Output expected_a = ops::Const(expected_scope.WithOpName("a"), 10.0f, {});
-  Output expected_b = ops::Const(expected_scope.WithOpName("b"), 10.0f, {});
-  Output expected_c =
-      ops::Mul(expected_scope.WithOpName("c"), expected_a, expected_b);
-  TF_ASSERT_OK(expected_scope.ToGraphDef(&expected_graph_def));
-
-  GraphDefEqual(frozen_graph_def, expected_graph_def);
-
-  RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
-                                       frozen_graph_def, "c:0");
+TEST_F(FreezeTest, GraphDefWithoutDependentVariables) {
+  TestFreezeGraphWithoutDependentVariables(false);
 }
 
-TEST_F(FreezeTest, GraphDefWithVariablesNeededByOutputs) {
-  // Test freezing a graph with variables that are needed by outputs in the
-  // SignatureDef. The variables should be frozen.
-  SavedModelBundle saved_model_bundle;
-  GraphDef graph_def;
-  Scope scope = Scope::NewRootScope();
-  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
-  Output var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
-  Output c = ops::Mul(scope.WithOpName("c"), a, var);
-  Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
-  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
-  // "c" isnt dependent on the variable, so nothing should be frozen.
-  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
-      graph_def, {"c:0"}, assign.name(), &saved_model_bundle));
-
-  GraphDef frozen_graph_def;
-  std::unordered_set<string> inputs;
-  std::unordered_set<string> outputs;
-  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
-                                &outputs));
-
-  // There should be 3 nodes in the resulting graph_def, and none should be
-  // variables.
-  EXPECT_EQ(frozen_graph_def.node_size(), 3);
-  for (const NodeDef& node : frozen_graph_def.node()) {
-    EXPECT_NE(node.op(), "Variable") << node.name();
-    EXPECT_NE(node.op(), "VariableV2") << node.name();
-  }
-
-  RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
-                                       frozen_graph_def, "c:0");
+TEST_F(FreezeTest, GraphDefWithoutDependentResourceVariables) {
+  TestFreezeGraphWithoutDependentVariables(true);
 }
 
-TEST_F(FreezeTest, GraphDefWithVariablesNeededAndNotNeededByOutputs) {
-  // Test freezing a graph with some variables that are needed and not needed by
-  // the outputs in the SignatureDef. The resulting graph should only freeze
-  // dependent variables.
-  SavedModelBundle saved_model_bundle;
-  GraphDef graph_def;
-  Scope scope = Scope::NewRootScope();
-  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
-  Output var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
-  Output c = ops::Mul(scope.WithOpName("c"), a, var);
-  Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
-  Output var_1 =
-      ops::Variable(scope.WithOpName("var_1"), {}, DataType::DT_FLOAT);
-  Output assign_1 = ops::Assign(scope.WithOpName("assign_1"), var, a);
-  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
-  // "c" isnt dependent on the variable, so nothing should be frozen.
-  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
-      graph_def, {"c:0"}, assign.name(), &saved_model_bundle));
+TEST_F(FreezeTest, GraphDefWithDependentVariables) {
+  TestFreezeGraphWithDependentVariables(false);
+}
 
-  GraphDef frozen_graph_def;
-  std::unordered_set<string> inputs;
-  std::unordered_set<string> outputs;
-  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
-                                &outputs));
+TEST_F(FreezeTest, GraphDefWithDependentResourceVariables) {
+  TestFreezeGraphWithDependentVariables(true);
+}
 
-  // There should be 3 nodes in the resulting graph_def, and none should be
-  // variables.
-  EXPECT_EQ(frozen_graph_def.node_size(), 3);
-  for (const NodeDef& node : frozen_graph_def.node()) {
-    EXPECT_NE(node.op(), "Variable") << node.name();
-    EXPECT_NE(node.op(), "VariableV2") << node.name();
-  }
+TEST_F(FreezeTest, GraphDefWithAndWithoutDependentVariables) {
+  TestFreezeGraphWithAndWithoutDependentVariables(false);
+}
 
-  RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
-                                       frozen_graph_def, "c:0");
+TEST_F(FreezeTest, GraphDefWithAndWithoutDependentResourceVariables) {
+  TestFreezeGraphWithAndWithoutDependentVariables(true);
 }
 
 }  // namespace
diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc
index 3675d72ee354533a7d84b5e8783cde452d8d60c9..5dbc4f5f6aa389978e55ca2656c17ff97202203d 100644
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -166,7 +167,8 @@ namespace {
 
 bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     int32* dst) {
-  if (arg.Consume(flag) && arg.Consume("=")) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag) &&
+      tensorflow::str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     return (sscanf(arg.data(), "%d%c", dst, &extra) == 1);
   }
@@ -176,7 +178,7 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 
 bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    bool* dst) {
-  if (arg.Consume(flag)) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *dst = true;
       return true;
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 0900e87ebabd378e6237b77ca0ef01677c07c244..19e6bf68e77725bb3cae4e1d338c52dff472cb18 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -60,6 +60,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -72,6 +73,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -249,17 +251,3 @@ exports_files([
     "benchmark_main.template",  # used by tf_library(...,gen_benchmark=True)
     "test.cc",  # used by tf_library(...,gen_test=True)
 ])
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 2cae85e8965216eaaee4d3032015d0016258a5c1..0025842aead53973befc794378a26fa8db2ae1cb 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -333,6 +333,20 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
           R"(#include "tensorflow/compiler/xla/xla_data.pb.h")"
           : "";
 
+  const string include_hlo_profile_printer_data_proto =
+      opts.gen_hlo_profile_printer_data
+          ? R"(#include "tensorflow/compiler/xla/service/hlo_profile_printer_data.pb.h")"
+          : "";
+
+  // When HLO profiling is disabled we only forward declare the
+  // HloProfilePrinter protobuf.  So we can only conditionally emit this code
+  // calling HloProfilePrinter::profile_counters_size.
+  const string assign_profile_counters_size =
+      opts.gen_hlo_profile_printer_data
+          ? "data->profile_counters_size = "
+            "data->hlo_profile_printer_data->profile_counters_size();"
+          : "";
+
   // Use a poor-man's text templating mechanism; first populate the full header
   // with placeholder tokens, and then rewrite the tokens with real values.
   *header =
@@ -348,6 +362,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
 #define TFCOMPILE_GENERATED_{{ENTRY}}_H_  // NOLINT(build/header_guard)
 
 {{INCLUDE_XLA_DATA_PROTO}}
+{{INCLUDE_HLO_PROFILE_PRINTER_DATA_PROTO}}
 #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -418,6 +433,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
+      data->hlo_profile_printer_data = StaticHloProfilePrinterData();
+      {{ASSIGN_PROFILE_COUNTERS_SIZE}}
       return data;
     }();
     return *kStaticData;
@@ -487,6 +504,13 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
     static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}};
     return kShape;
   }
+
+  // Metadata that can be used to pretty-print profile counters.
+  static const xla::HloProfilePrinterData* StaticHloProfilePrinterData() {
+    static const xla::HloProfilePrinterData* kHloProfilePrinterData =
+      {{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}};
+    return kHloProfilePrinterData;
+  }
 };
 {{NS_END}}
 
@@ -501,35 +525,41 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{ARG_NAMES_CODE}}", arg_names_code},
       {"{{ARG_NUM}}", strings::StrCat(arg_sizes.size())},
       {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")},
+      {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size},
       {"{{CLASS}}", opts.class_name},
+      {"{{DECLS_FROM_OBJ_FILE}}",
+       str_util::Join(metadata_result.header_variable_decls, "\n")},
       {"{{ENTRY}}", compile_result.entry_point},
+      {"{{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}",
+       metadata_result.hlo_profile_printer_data_access_shim},
       {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto},
+      {"{{INCLUDE_HLO_PROFILE_PRINTER_DATA_PROTO}}",
+       include_hlo_profile_printer_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
       {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(ps)},
+      {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}",
+       metadata_result.program_shape_access_shim},
       {"{{RESULT_INDEX}}", strings::StrCat(result_index)},
       {"{{RESULT_NAMES_CODE}}", result_names_code},
       {"{{TEMP_BYTES_ALIGNED}}", strings::StrCat(temp_bytes_aligned)},
       {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)},
       {"{{TEMP_NUM}}", strings::StrCat(temp_sizes.size())},
-      {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")},
-      {"{{DECLS_FROM_OBJ_FILE}}",
-       str_util::Join(metadata_result.header_variable_decls, "\n")},
-      {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}",
-       metadata_result.program_shape_access_shim}};
+      {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")}};
   str_util::ReplaceAllPairs(header, rewrites);
   return Status::OK();
 }
 
-static string CreateUniqueIdentifierForProgramShape(const CodegenOpts& opts) {
+static string CreateUniqueIdentifier(const CodegenOpts& opts,
+                                     StringPiece suffix) {
   string result = "__tfcompile";
   for (const string& n : opts.namespaces) {
     strings::StrAppend(&result, "_", n);
   }
 
-  strings::StrAppend(&result, "_", opts.class_name, "_ProgramShape");
+  strings::StrAppend(&result, "_", opts.class_name, "_", suffix);
   return result;
 }
 
@@ -550,18 +580,31 @@ Status GenerateMetadata(const CodegenOpts& opts,
   // When asked to serialize a null protobuf, CreateEmbeddedProtocolBuffer gives
   // a shim that evaluates to nullptr, which is what we want.
 
+  ProtobufToEmbed program_shape_protobuf{
+      CreateUniqueIdentifier(opts, "ProgramShape"), "xla::ProgramShape",
+      program_shape.get()};
+
+  ProtobufToEmbed hlo_profile_printer_data_protobuf{
+      CreateUniqueIdentifier(opts, "HloProfilePrinterData"),
+      "xla::HloProfilePrinterData",
+      compile_result.aot->hlo_profile_printer_data()};
+
   TF_ASSIGN_OR_RETURN(
-      EmbeddedProtocolBuffer embedded_program_shape,
-      CreateEmbeddedProtocolBuffer(opts.target_triple,
-                                   CreateUniqueIdentifierForProgramShape(opts),
-                                   "xla::ProgramShape", program_shape.get()));
+      EmbeddedProtocolBuffers embedded_protobufs,
+      CreateEmbeddedProtocolBuffers(
+          opts.target_triple,
+          {program_shape_protobuf, hlo_profile_printer_data_protobuf}));
 
   metadata_result->program_shape_access_shim =
-      std::move(embedded_program_shape.cpp_shim_expression);
+      std::move(embedded_protobufs.cpp_shims[0].expression);
+  metadata_result->hlo_profile_printer_data_access_shim =
+      std::move(embedded_protobufs.cpp_shims[1].expression);
+  metadata_result->header_variable_decls.emplace_back(
+      std::move(embedded_protobufs.cpp_shims[0].variable_decl));
   metadata_result->header_variable_decls.emplace_back(
-      std::move(embedded_program_shape.cpp_variable_decl));
+      std::move(embedded_protobufs.cpp_shims[1].variable_decl));
   metadata_result->object_file_data =
-      std::move(embedded_program_shape.object_file_data);
+      std::move(embedded_protobufs.object_file_data);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h
index 3430b1f96cf4d3c035b76c77ccf124c5d164751e..83f2d3ee11d09d66f16d7ecdc11945ebe994a82a 100644
--- a/tensorflow/compiler/aot/codegen.h
+++ b/tensorflow/compiler/aot/codegen.h
@@ -44,6 +44,10 @@ struct CodegenOpts {
 
   // If true, generate program shape data for the ProgramShape method.
   bool gen_program_shape = false;
+
+  // If true, emit a serialized HloProfilePrinterData protobuf that can be used
+  // to pretty print HLO profile counters.
+  bool gen_hlo_profile_printer_data = false;
 };
 
 // Describes a generated metadata object file.
@@ -57,6 +61,12 @@ struct MetadataResult {
   // GenerateMetadata.
   string program_shape_access_shim;
 
+  // hlo_profile_printer_data_access_shim is a C++ expression that constructs
+  // the xla::HloProfilePrinterData instance for the CompileResult passed to
+  // GenerateMetadata.  If the xla::HloProfilePrinterData is null then this is a
+  // C++ expression that evaluates to nullptr at runtime.
+  string hlo_profile_printer_data_access_shim;
+
   // The contents of the object (".o") file.
   string object_file_data;
 };
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 972b7d51ecb3798e61757ac55e973075a23b433a..29bc9c13b889c86c2ba8776c7b067c54cb05bc43 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -33,7 +34,7 @@ namespace {
 
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
@@ -171,7 +172,7 @@ TEST(CodegenTest, Golden) {
   fetch->set_name("myfetch");
   CompileResult compile_result;
   compile_result.aot.reset(
-      new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5));
+      new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5, {}));
   compile_result.program_shape = xla::ShapeUtil::MakeProgramShape(
       {
           xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index ac3b5873318873b5fdf41bd556a0b2abddc2b30b..6e050cf56494e6d26e3647e3261a657eeaad64fa 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -10,6 +10,7 @@
 #define TFCOMPILE_GENERATED_entry_point_H_  // NOLINT(build/header_guard)
 
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+
 #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -23,6 +24,7 @@ extern "C" void entry_point(
 
 extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[];
 
+
 namespace foo {
 namespace bar {
 
@@ -82,6 +84,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
+      data->hlo_profile_printer_data = StaticHloProfilePrinterData();
+      
       return data;
     }();
     return *kStaticData;
@@ -243,6 +247,13 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   }();
     return kShape;
   }
+
+  // Metadata that can be used to pretty-print profile counters.
+  static const xla::HloProfilePrinterData* StaticHloProfilePrinterData() {
+    static const xla::HloProfilePrinterData* kHloProfilePrinterData =
+      nullptr;
+    return kHloProfilePrinterData;
+  }
 };
 
 }  // end namespace bar
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index c87f2b75dfa18ad5c3eda4bd6fcbcb3083ef73fd..31044ff85d6f0d72b34d03669fe508866d7d3358 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -87,9 +88,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
   // Converts the graph into an XLA computation, and compiles the
   // computation.
   // TODO(toddw): Should we let the user pick the XLA cpu vs. gpu client?
-  namespace gpu = perftools::gputools;
-  gpu::Platform* cpu_platform =
-      gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
+  se::Platform* cpu_platform =
+      se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
   xla::CompileOnlyClient* client =
       xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
           .ValueOrDie();
@@ -110,6 +110,7 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
       flags.target_triple, flags.target_cpu, flags.target_features,
       flags.entry_point,
       xla::cpu::CpuAotCompilationOptions::RelocationModel::BigPic);
+
   return CompileXla(client, computation, aot_opts, compile_result);
 }
 
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
index 6489929a576d6469c4ff1358ca5ee9d27fb578bb..63d22de1ca4aa0872b6fad3e0ac0182306d7cb8c 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <string>
 
 #include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
@@ -37,9 +36,8 @@ namespace tfcompile {
 
 using xla::llvm_ir::AsStringRef;
 
-static std::unique_ptr<llvm::Module> CreateModuleWithEmbeddedProtocolBuffer(
-    llvm::LLVMContext* llvm_context, llvm::TargetMachine* target_machine,
-    const ::tensorflow::protobuf::MessageLite& proto,
+static void AddEmbeddedProtocolBufferToLlvmModule(
+    llvm::Module* module, const ::tensorflow::protobuf::MessageLite& proto,
     StringPiece unique_identifier, string* protobuf_array_symbol_name,
     int64* protobuf_array_size) {
   string protobuf_array_contents = proto.SerializeAsString();
@@ -47,19 +45,14 @@ static std::unique_ptr<llvm::Module> CreateModuleWithEmbeddedProtocolBuffer(
       strings::StrCat(unique_identifier, "_protobuf_array_contents");
   *protobuf_array_size = protobuf_array_contents.size();
 
-  std::unique_ptr<llvm::Module> module =
-      MakeUnique<llvm::Module>("embedded_data_module", *llvm_context);
-
   llvm::Constant* protobuf_array_initializer =
-      llvm::ConstantDataArray::getString(*llvm_context,
+      llvm::ConstantDataArray::getString(module->getContext(),
                                          AsStringRef(protobuf_array_contents),
                                          /*AddNull=*/false);
   new llvm::GlobalVariable(
       *module, protobuf_array_initializer->getType(),
       /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
       protobuf_array_initializer, AsStringRef(*protobuf_array_symbol_name));
-
-  return module;
 }
 
 static string CreateCPPShimExpression(StringPiece qualified_cpp_protobuf_name,
@@ -116,42 +109,44 @@ GetTargetMachineFromTriple(StringPiece target_triple) {
       /*Features=*/"", llvm::TargetOptions(), llvm::None));
 }
 
-StatusOr<EmbeddedProtocolBuffer> CreateEmbeddedProtocolBuffer(
-    StringPiece target_triple, StringPiece symbol_prefix,
-    StringPiece qualified_cpp_protobuf_name,
-    const ::tensorflow::protobuf::MessageLite* proto) {
+StatusOr<EmbeddedProtocolBuffers> CreateEmbeddedProtocolBuffers(
+    StringPiece target_triple,
+    gtl::ArraySlice<ProtobufToEmbed> protobufs_to_embed) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<llvm::TargetMachine> target_machine,
                       GetTargetMachineFromTriple(target_triple));
 
   llvm::LLVMContext llvm_context;
-  string object_file, cpp_shim, cpp_variable_decl;
-
-  if (proto) {
-    string protobuf_array_symbol_name;
-    int64 protobuf_array_size;
-
-    std::unique_ptr<llvm::Module> module_with_serialized_proto =
-        CreateModuleWithEmbeddedProtocolBuffer(
-            &llvm_context, target_machine.get(), *proto, symbol_prefix,
-            &protobuf_array_symbol_name, &protobuf_array_size);
-    TF_ASSIGN_OR_RETURN(object_file,
-                        CodegenModule(target_machine.get(),
-                                      std::move(module_with_serialized_proto)));
-    cpp_shim = CreateCPPShimExpression(qualified_cpp_protobuf_name,
-                                       protobuf_array_symbol_name,
-                                       protobuf_array_size);
-
-    cpp_variable_decl = strings::StrCat("extern \"C\" char ",
-                                        protobuf_array_symbol_name, "[];");
-  } else {
-    TF_ASSIGN_OR_RETURN(
-        object_file,
-        CodegenModule(target_machine.get(),
-                      MakeUnique<llvm::Module>("empty_module", llvm_context)));
-    cpp_shim = "nullptr";
+  std::unique_ptr<llvm::Module> module_with_serialized_proto =
+      MakeUnique<llvm::Module>("embedded_data_module", llvm_context);
+
+  EmbeddedProtocolBuffers result;
+
+  for (const ProtobufToEmbed& protobuf_to_embed : protobufs_to_embed) {
+    string cpp_shim, cpp_variable_decl;
+    if (protobuf_to_embed.message) {
+      string protobuf_array_symbol_name;
+      int64 protobuf_array_size;
+
+      AddEmbeddedProtocolBufferToLlvmModule(
+          module_with_serialized_proto.get(), *protobuf_to_embed.message,
+          protobuf_to_embed.symbol_prefix, &protobuf_array_symbol_name,
+          &protobuf_array_size);
+      cpp_shim = CreateCPPShimExpression(
+          protobuf_to_embed.qualified_cpp_protobuf_name,
+          protobuf_array_symbol_name, protobuf_array_size);
+
+      cpp_variable_decl = strings::StrCat("extern \"C\" char ",
+                                          protobuf_array_symbol_name, "[];");
+    } else {
+      cpp_shim = "nullptr";
+    }
+    result.cpp_shims.push_back({cpp_shim, cpp_variable_decl});
   }
 
-  return {{cpp_shim, cpp_variable_decl, object_file}};
+  TF_ASSIGN_OR_RETURN(result.object_file_data,
+                      CodegenModule(target_machine.get(),
+                                    std::move(module_with_serialized_proto)));
+  return result;
 }
 
 }  // namespace tfcompile
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h
index 8436e0ff67f352a24e3d16b46f16c1ad2f3a5957..ebfe4806c203e901358d5c5096c10c03d4c738c3 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.h
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h
@@ -21,51 +21,70 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_
 
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 namespace tfcompile {
 using xla::StatusOr;
 
-// Represents a protocol buffer embedded into an object file and describes a way
-// to access it at runtime.
-struct EmbeddedProtocolBuffer {
-  // cpp_shim_expression is a C++ expression that creates an instance of said
-  // protocol buffer when executed.
-  string cpp_shim_expression;
-
-  // cpp_variable_decl is an "extern C" array declaration that is used in
-  // cpp_shim_expression.  It must be visible wherever cpp_shim_expression is
-  // emitted.
-  string cpp_variable_decl;
-
-  // The contents of the object (".o") file the protocol buffer is embbed in.
-  // This needs to be linked in to any program that wants to execute
-  // cpp_variable_decl .
+// Represents a set of protocol buffers embedded into an object file and
+// describes how to access them at runtime.
+struct EmbeddedProtocolBuffers {
+  // Each instance CPPShim describes how to generate C++ code to instantiate a
+  // protobuf instance from the corresponding static data emitted into the
+  // object file.
+  struct CPPShim {
+    // `expression` is a C++ expression that creates an instance of said
+    // protocol buffer when executed.
+    string expression;
+
+    // `variable_decl` is an "extern C" array declaration that is used in
+    // `expression`.  It must be visible wherever `expression` is emitted.
+    string variable_decl;
+  };
+
+  // Each cpp_shim corresponds to one embedded protocol buffer.
+  std::vector<CPPShim> cpp_shims;
+
+  // The contents of the object (".o") file the protocol buffers are embbed in.
+  // This needs to be linked in to any program that wants to execute any of the
+  // expressions in `cpp_shims`.
   string object_file_data;
 };
 
-// Creates an object file that contains `proto`.
-//
-// `proto` is allowed to be nullptr, in which case the generated C++ shim
-// expression is just `nullptr`, and the generated object file does not define
-// any symbols.
+// Describes a protocol buffer to embed into an object file.
+struct ProtobufToEmbed {
+  // `symbol_prefix` is prefix that is guaranteed to be unique across the binary
+  // or DSO the generated object file will be linked into.
+  string symbol_prefix;
+
+  // `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++
+  // namespace qualified) protocol buffer name.  This is only used in
+  // CPPShim::expression so relatively qualified names are fine as long as
+  // they're valid wherever CPPShim::expression is emitted.
+  string qualified_cpp_protobuf_name;
+
+  // `message` is the protocol buffer to be embedded.  It is allowed to be
+  // nullptr, in which case the generated C++ shim expression is just `nullptr`,
+  // and the generated object file does not define any symbols.
+  const ::tensorflow::protobuf::MessageLite* message;
+};
+
+// Embeds a a sequence of protocol buffers into an object file.
 //
 // `target_triple` is the target triple for the target architecture for the
 // generated object file.
 //
-// `symbol_prefix` is prefix that is guaranteed to be unique across the binary
-// or DSO the generated object file will be linked into.
-//
-// `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++
-// namespace qualified) protocol buffer name.  This needs is only used in
-// EmbeddedProtocolBuffer::cpp_shim_expression so relatively qualified
-// names are fine as long as they're valid wherever cpp_shim_expression
-// is emitted.
-StatusOr<EmbeddedProtocolBuffer> CreateEmbeddedProtocolBuffer(
-    StringPiece target_triple, StringPiece symbol_prefix,
-    StringPiece qualified_cpp_protobuf_name,
-    const ::tensorflow::protobuf::MessageLite* proto);
+// `protobufs_to_embed` describes the protocol buffers to embed into the
+// resulting object file.  The C++ shim for protobufs_to_embed[i] is
+// cpp_shims[i] in the returned EmbeddedProtocolBuffers instance.  The contents
+// of all the protocol buffers are embedded into a single .o file whose content
+// is stored in the object_file_data field in the returned
+// EmbeddedProtocolBuffers instance.
+StatusOr<EmbeddedProtocolBuffers> CreateEmbeddedProtocolBuffers(
+    StringPiece target_triple,
+    gtl::ArraySlice<ProtobufToEmbed> protobufs_to_embed);
 
 }  // namespace tfcompile
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 5772776666129ed55a479c8917e69df3f3ce2fc0..5e74079fc158379b8977ada6412141e39142c3d3 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,7 +31,7 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
-#elif defined(COMPILER_MSVC)
+#elif defined(_WIN32)
   return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
@@ -48,7 +48,7 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 }
 
 inline void aligned_free(void* aligned_memory) {
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
   _aligned_free(aligned_memory);
 #else
   free(aligned_memory);
diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc
index 47ef5f82cbc718ea300afa0c4eb4b73e1ca22fd0..6b098049cbd7539a2b2e2696b13139a8a6b28e0f 100644
--- a/tensorflow/compiler/aot/test.cc
+++ b/tensorflow/compiler/aot/test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 // clang-format on
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 28aab6eb614ca7123d9e00f7f5cc3661b62e23f7..222e26810ac1157152ea81a56749b6652aa1f137 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -14,6 +14,7 @@ test_suite(
         ":test_graph_tfadd_test",
         ":test_graph_tfadd_with_ckpt_saver_test",
         ":test_graph_tfadd_with_ckpt_test",
+        ":test_graph_tfassert_eq_test",
         ":test_graph_tffunction_test",
         ":test_graph_tfgather_test",
         ":test_graph_tfmatmul_test",
@@ -33,6 +34,7 @@ py_binary(
         "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
@@ -52,6 +54,7 @@ genrule(
         "test_graph_tfadd_with_ckpt_saver.ckpt",
         "test_graph_tfadd_with_ckpt_saver.pb",
         "test_graph_tfadd_with_ckpt_saver.saver",
+        "test_graph_tfassert_eq.pb",
         "test_graph_tffunction.pb",
         "test_graph_tfgather.pb",
         "test_graph_tfmatmul.pb",
@@ -104,6 +107,17 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfassert_eq",
+    testonly = 1,
+    config = "test_graph_tfassert_eq.config.pbtxt",
+    cpp_class = "AssertComp",
+    graph = "test_graph_tfassert_eq.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_library(
     name = "test_graph_tffunction",
     testonly = 1,
@@ -149,6 +163,15 @@ tf_library(
     tfcompile_flags = "--gen_name_to_index --gen_program_shape",
 )
 
+tf_library(
+    name = "test_graph_tfmatmulandadd_with_profiling",
+    testonly = 1,
+    config = "test_graph_tfmatmulandadd.config.pbtxt",
+    cpp_class = "MatMulAndAddCompWithProfiling",
+    enable_xla_hlo_profiling = True,
+    graph = "test_graph_tfmatmulandadd.pb",
+)
+
 tf_library(
     name = "test_graph_tfsplits",
     testonly = 1,
@@ -170,29 +193,20 @@ tf_cc_test(
         ":test_graph_tfadd",
         ":test_graph_tfadd_with_ckpt",
         ":test_graph_tfadd_with_ckpt_saver",
+        ":test_graph_tfassert_eq",
         ":test_graph_tffunction",
         ":test_graph_tfgather",
         ":test_graph_tfmatmul",
         ":test_graph_tfmatmulandadd",
+        ":test_graph_tfmatmulandadd_with_profiling",
         ":test_graph_tfsplits",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_profile_printer",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//third_party/eigen3",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 89c7cd4507cbd476104a039d6083d8f89de11278..67767f55dae9b15aafbd8b129328bde2c59a9ef3 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import app
@@ -125,6 +126,14 @@ def tfsplits(_):
   array_ops.identity(y, name='result')
 
 
+def tfassert_eq(_):
+  x = array_ops.placeholder(dtypes.int32, name='x_hold')
+  y = array_ops.placeholder(dtypes.int32, name='y_hold')
+  control_flow_ops.Assert(
+      math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq')
+  math_ops.add(x, math_ops.negative(y), name='x_y_diff')
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -144,6 +153,7 @@ def main(_):
   write_graph(tfmatmulandadd, FLAGS.out_dir)
   write_graph(tffunction, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
+  write_graph(tfassert_eq, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8732d1709e809bb47d3769c483483c2c4f350e1c
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
@@ -0,0 +1,16 @@
+# Text form of tensorflow.tf2xla.Config proto.
+feed {
+  id { node_name: "x_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "x_y_diff" }
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 413efd9cea3b6f71574615ad9ca92471ff925781..aa9d968265b4619ff2e3c910e3d7455ae07bc49d 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -20,19 +20,27 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfassert_eq.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tffunction.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h"
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace tfcompile {
 namespace {
 
+using ::testing::HasSubstr;
+using ::testing::UnorderedElementsAre;
+
 TEST(TFCompileTest, Add) {
   AddComp add;
   EXPECT_EQ(add.arg0_data(), add.args()[0]);
@@ -413,6 +421,23 @@ TEST(TFCompileTest, Splits) {
   EXPECT_NEAR(expected[3], fn.result0(1, 1), 1e4);
 }
 
+TEST(TFCompileTest, AssertEqAndReturnDiff) {
+  // Assert is converted into a no-op in XLA, so there is no failure even if the
+  // two args are different.
+  AssertComp assert;
+  EXPECT_EQ(assert.arg0_data(), assert.args()[0]);
+  EXPECT_EQ(assert.arg1_data(), assert.args()[1]);
+
+  assert.arg0() = 2;
+  assert.arg1() = 1;
+  const int32 expected_result = assert.arg0() - assert.arg1();
+  EXPECT_TRUE(assert.Run());
+  EXPECT_EQ(assert.error_msg(), "");
+  EXPECT_EQ(assert.result0(), expected_result);
+  EXPECT_EQ(assert.result0_data()[0], expected_result);
+  EXPECT_EQ(assert.result0_data(), assert.results()[0]);
+}
+
 TEST(TFCompileTest, LookupNameIndex) {
   // add doesn't have any names defined in its config.
   AddComp add;
@@ -466,6 +491,59 @@ TEST(TFCompileTest, ProgramShape) {
   EXPECT_TRUE(ShapeUtil::Compatible(muladd_result1, f32_2x2));
 }
 
+TEST(TFCompileTest, HloProfiling) {
+  Eigen::ThreadPool tp(1);
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+  MatMulAndAddCompWithProfiling fn;
+  ASSERT_TRUE(fn.hlo_profiling_enabled());
+
+  fn.set_thread_pool(&device);
+
+  // x = [[1, 2], [3, 4]]
+  fn.arg0(0, 0) = 1;
+  fn.arg0(0, 1) = 2;
+  fn.arg0(1, 0) = 3;
+  fn.arg0(1, 1) = 4;
+
+  // y = [[10, 20], [30, 40]]
+  fn.arg1(0, 0) = 10;
+  fn.arg1(0, 1) = 20;
+  fn.arg1(1, 0) = 30;
+  fn.arg1(1, 1) = 40;
+
+  EXPECT_TRUE(fn.Run());
+
+  string hlo_profile_as_string =
+      xla::PrintHloProfile(fn.hlo_profile_printer_data(), fn.profile_counters(),
+                           /*clock_rate_ghz=*/1.0);
+  VLOG(1) << "HLO profile string:\n" << hlo_profile_as_string;
+
+  std::vector<string> hlo_profile_lines =
+      tensorflow::str_util::Split(hlo_profile_as_string, '\n');
+
+  auto header = HasSubstr("Execution profile for");
+  auto total_cycles_profile_line = HasSubstr("[total]");
+  auto dot_profile_line = HasSubstr(
+      "%dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1)");
+  auto add_profile_line = HasSubstr(
+      "%add = f32[2,2]{1,0} add(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1)");
+  auto tuple_profile_line = HasSubstr(
+      "%tuple.2 = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(f32[2,2]{1,0} %dot, "
+      "f32[2,2]{1,0} %add)");
+  auto arg0_profile_line = HasSubstr("%arg0 = f32[2,2]{1,0} parameter(0)");
+  auto arg1_profile_line = HasSubstr("%arg1 = f32[2,2]{1,0} parameter(1)");
+
+  hlo_profile_lines.erase(hlo_profile_lines.begin() + 7,
+                          hlo_profile_lines.end());
+
+  EXPECT_THAT(
+      hlo_profile_lines,
+      UnorderedElementsAre(header, total_cycles_profile_line, dot_profile_line,
+                           add_profile_line, tuple_profile_line,
+                           arg0_profile_line, arg1_profile_line));
+}
+
 }  // namespace
 }  // namespace tfcompile
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 9dff1be09fede6f65f82c2f36d94be07e781949f..5c57fee326ca743dcb8aaae354d261ed4d7f44be 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -25,7 +25,8 @@ def tf_library(name, graph, config,
                visibility=None, testonly=None,
                tfcompile_flags=None,
                tfcompile_tool="//tensorflow/compiler/aot:tfcompile",
-               include_standard_runtime_deps=True, deps=None, tags=None):
+               include_standard_runtime_deps=True,
+               enable_xla_hlo_profiling=False, deps=None, tags=None):
   """Runs tfcompile to compile a TensorFlow graph into executable code.
 
   Given an invocation of tf_library(name="foo", ...), generates the following
@@ -68,6 +69,8 @@ def tf_library(name, graph, config,
     include_standard_runtime_deps: If True, the standard list of kernel/runtime
       deps is added to deps.  If False, deps must contain the full set of deps
       needed by the generated library.
+    enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated program,
+      and emit metadata that lets us pretty-print the gathered profile counters.
     deps: a list of deps to include on the build rules for the generated
       library, added to the standard deps if standard_runtime_deps is True.
     tags: tags to apply to subsidiary build rules.
@@ -132,11 +135,15 @@ def tf_library(name, graph, config,
   header_file = name + ".h"
   metadata_object_file = name + "_tfcompile_metadata.o"
   function_object_file = name + "_tfcompile_function.o"
-  ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
+  ep = ("__" + native.package_name() + "__" + name).replace("/", "_")
   if type(tfcompile_flags) == type(""):
     flags = tfcompile_flags
   else:
     flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])])
+  if enable_xla_hlo_profiling:
+    profiling_flag = "--xla_hlo_profile"
+  else:
+    profiling_flag = ""
   native.genrule(
       name=("gen_" + name),
       srcs=[
@@ -157,7 +164,7 @@ def tf_library(name, graph, config,
            " --out_header=$(@D)/" + header_file +
            " --out_metadata_object=$(@D)/" + metadata_object_file +
            " --out_function_object=$(@D)/" + function_object_file +
-           " " + flags),
+           " " + flags + " " + profiling_flag),
       tools=[tfcompile_tool],
       visibility=visibility,
       testonly=testonly,
@@ -220,6 +227,8 @@ def tf_library(name, graph, config,
       ] + (need_xla_data_proto and [
           # If we're generating the program shape, we must depend on the proto.
           "//tensorflow/compiler/xla:xla_data_proto",
+      ] or []) + (enable_xla_hlo_profiling and [
+          "//tensorflow/compiler/xla/service:hlo_profile_printer_data"
       ] or []) + (include_standard_runtime_deps and [
           # TODO(cwhipkey): only depend on kernel code that the model actually needed.
           "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index e2f01179d4e2e4f6ef72b2761d06e130ffa3a94f..839e1588b7be6c91cf30c87bbaf75402446bd169 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -55,7 +55,7 @@ const char kUsageHeader[] =
     "\n";
 
 Status ReadProtoFile(const string& fname, protobuf::Message* proto) {
-  if (StringPiece(fname).ends_with(".pbtxt")) {
+  if (str_util::EndsWith(fname, ".pbtxt")) {
     return ReadTextProto(Env::Default(), fname, proto);
   } else {
     return ReadBinaryProto(Env::Default(), fname, proto);
@@ -100,6 +100,8 @@ Status Main(const MainFlags& flags) {
   if (flags.cpp_class.empty()) {
     return errors::InvalidArgument("Must specify --cpp_class");
   }
+  codegen_opts.gen_hlo_profile_printer_data =
+      xla::legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile();
   TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &codegen_opts.class_name,
                                    &codegen_opts.namespaces));
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index a711319607f4ff2b83aa0ebe50e215b3d0e2258e..af2965bba5b91a66e206f05bb8945b0dcde1d2b4 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -29,7 +29,10 @@ load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 # Target that bundles up the XLA CPU and GPU JIT devices.
 cc_library(
     name = "jit",
-    visibility = [":friends"],
+    visibility = [
+        ":friends",
+        "//learning/tfx:__subpackages__",
+    ],
     deps = [
         ":xla_cpu_device",
         ":xla_cpu_jit",
@@ -73,6 +76,7 @@ cc_library(
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_launch_op",
+        "//tensorflow/compiler/jit/legacy_flags:xla_device_flags",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",  # buildcleaner: keep
@@ -102,22 +106,46 @@ cc_library(
 cc_library(
     name = "xla_interpreter_device",
     srcs = ["xla_interpreter_device.cc"],
+    visibility = [":friends"],
     deps = [
+        ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_launch_op",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # buildcleaner: keep
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "xla_tensor",
+    srcs = ["xla_tensor.cc"],
+    hdrs = ["xla_tensor.h"],
+    deps = [
+        ":common",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
-    alwayslink = True,
 )
 
 cc_library(
     name = "xla_device",
     srcs = [
+        "xla_compile_on_demand_op.cc",
         "xla_device.cc",
         "xla_device_context.cc",
         "xla_device_ops.cc",
     ],
     hdrs = [
+        "xla_compile_on_demand_op.h",
         "xla_device.h",
         "xla_device_context.h",
         "xla_device_ops.h",
@@ -127,6 +155,8 @@ cc_library(
     deps = [
         ":common",
         ":jit_compilation_passes",
+        ":xla_launch_util",
+        ":xla_tensor",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
@@ -150,9 +180,17 @@ cc_library(
         "//tensorflow/core/kernels:no_op",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/memory",
     ],
 )
 
+cc_library(
+    name = "shape_inference_helpers",
+    srcs = ["shape_inference_helpers.cc"],
+    hdrs = ["shape_inference_helpers.h"],
+    deps = ["//tensorflow/core:graph"],
+)
+
 # Internal targets below this point.
 
 cc_library(
@@ -166,6 +204,29 @@ cc_library(
     visibility = [":friends"],
 )
 
+cc_library(
+    name = "xla_launch_util",
+    srcs = ["xla_launch_util.cc"],
+    hdrs = ["xla_launch_util.h"],
+    deps = [
+        ":common",
+        ":xla_compilation_cache",
+        ":xla_tensor",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:variable_ops",
+    ],
+)
+
 cc_library(
     name = "xla_compilation_cache",
     srcs = ["xla_compilation_cache.cc"],
@@ -196,18 +257,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "graph_to_functiondef",
-    srcs = ["graph_to_functiondef.cc"],
-    hdrs = ["graph_to_functiondef.h"],
-    deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "create_xla_launch_op",
     srcs = [
@@ -238,7 +287,7 @@ cc_library(
     ],
     deps = [
         ":common",
-        ":graph_to_functiondef",
+        ":shape_inference_helpers",
         ":union_find",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/kernels:parallel_check_op",
@@ -256,6 +305,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:bounds_check",
     ],
 )
 
@@ -264,22 +314,19 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
+cc_library(
+    name = "producer_consumer_queue",
+    hdrs = ["producer_consumer_queue.h"],
+    deps = ["//tensorflow/core:lib"],
+)
+
 tf_cc_test(
-    name = "graph_to_functiondef_test",
+    name = "producer_consumer_queue_test",
     size = "small",
-    srcs = [
-        "graph_to_functiondef_test.cc",
-    ],
+    srcs = ["producer_consumer_queue_test.cc"],
     deps = [
-        ":graph_to_functiondef",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:cc_ops_internal",
-        "//tensorflow/cc:function_ops",
-        "//tensorflow/cc:ops",
-        "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework_internal",
+        ":producer_consumer_queue",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -306,26 +353,13 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
 cc_header_only_library(
     name = "xla_jit_headers_lib",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 9c372a012789fc25ca0a711349c09ca62edc6754..f06debaf316c0172a5683e56aa5de6ebb83fbece 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -22,9 +22,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -33,9 +34,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
@@ -53,6 +56,8 @@ namespace tensorflow {
 const char* const kXlaCompiledKernelAttr = "_XlaCompiledKernel";
 const char* const kXlaNumConstantArgsAttr = "_XlaNumConstantArgs";
 const char* const kXlaNumResourceArgsAttr = "_XlaNumResourceArgs";
+const char* const kXlaHostTransferSequencerAttr =
+    "_xla_host_transfer_sequencer";
 
 namespace {
 
@@ -143,7 +148,7 @@ struct NodeSlot {
 // everything to use it.
 static const char* const kArgOp = "_Arg";
 static const char* const kRetValOp = "_Retval";
-static const char* const kHostComputeOp = "_XlaHostCompute";
+static const char* const kHostComputeOp = "XlaHostCompute";
 static const char* const kSendFromHostOp = "_XlaSendFromHost";
 static const char* const kRecvAtHostOp = "_XlaRecvAtHost";
 
@@ -156,6 +161,11 @@ class Encapsulator {
             std::move(outside_compilation_attribute)),
         graph_in_(graph_in) {}
 
+  // Find dependencies between subgraphs and outside_compilation clusters that
+  // only manifest via edges between outside_compilation clusters in the outer
+  // (non-compiled) graph.
+  Status FindClusterDependencies();
+
   // Find subgraphs marked with 'group_attribute', and build a new
   // subgraph, one for each value of 'group_attribute'.
   Status SplitIntoSubgraphs();
@@ -226,6 +236,19 @@ class Encapsulator {
   // the shapes of any ancestor RAH outputs. If it can be determined that the
   // shape of the SFH inputs will not be inferrable even once the shapes of the
   // RAH outputs are known, an error is returned by the rewriter.
+  //
+  // Once edges between compiled and outside_compilation clusters have been
+  // replaced by send/recv ops, some dependencies may no longer be apparent.
+  // A clustering pass finds all the dependencies between HC nodes that are only
+  // present as a result of edges between nodes in outside_compilaton clusters.
+  // Suppose there is a path from outside_compilation cluster C in subgraph S
+  // to outside_compilation cluster D in subgraph T. If S != T then a control
+  // edge is added from the call node for S to the call node for T, which
+  // ensures that C will execute before D because S executes before T. If S==T
+  // then a control dependency is added between the HC nodes for C and D in S,
+  // and the HC node for C is added to an 'ancestors' attr in the HC node for D
+  // so that during compilation of the HC node for D, an XLA control dependency
+  // can be added to ensure C's SendToHost executes before D's RecvFromHost.
   class Subgraph {
    public:
     // Creates a graph to build the subgraph in, if it doesn't already exist,
@@ -252,7 +275,8 @@ class Encapsulator {
 
     // Adds _RecvAtHost and _SendFromHost nodes, where needed, to graph_out.
     Status AddOutsideCompilationHostIONodes(
-        const string& subgraph_name,
+        const string& group_attribute, const string& subgraph_name,
+        const string& outside_compilation_attribute,
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
@@ -319,6 +343,18 @@ class Encapsulator {
     void RecordOutsideCompilationOutputOrControl(
         const string& outside_compilation_id, const Edge* edge);
 
+    // Records the fact that there is a path from a node in outside_compilation
+    // cluster ancestor to node in cluster successor that does not go through
+    // the subgraph.
+    void RecordOutsideCompilationDependency(const string& successor,
+                                            const string& ancestor);
+
+    // Returns the mapping from outside_compilation cluster C to the set of
+    // outside_compilation clusters that have a path to C entirely outside
+    // compiled subgraphs.
+    const std::unordered_map<string, std::unordered_set<string>>
+    OutsideCompilationAncestorMap() const;
+
     // Adds the HostCompute nodes for each outside_compilation subgraph.
     Status AddHostComputes(
         const string& subgraph_name,
@@ -328,12 +364,14 @@ class Encapsulator {
     Status MakeSequencingNode(const string& subgraph_name, Graph* graph_out);
 
     // If there is a sequencer node, adds a control edge from the sequencer to
-    // all the downstream nodes of call_node_outputs.
-    void ConnectSequencerToOutputs(Graph* graph_out);
+    // the call node.
+    void ConnectSequencerToCallNode(Graph* graph_out);
 
     Status AddShapeInferenceInfo(
+        const string& subgraph_name,
         const string& outside_compilation_subgraph_name,
-        const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph);
+        const std::vector<TensorShapeProto>& shapes, Graph* inference_graph,
+        FunctionLibraryDefinition* library);
 
     Status ReplaceFunctionDef(FunctionLibraryDefinition* library);
 
@@ -381,15 +419,36 @@ class Encapsulator {
       Node* send_from_host = nullptr;
     };
 
+    // Creates an outside_compilation subgraph for outside_compilation_id if
+    // none exists yet. Returns the (possible newly created) subgraph for
+    // outside_compilation_id.
+    OutsideCompilationSubgraph* LookupOrCreateOutsideCompilationSubgraph(
+        const string& outside_compilation_id);
+
     // Builds a ParallelCheck op that compares the output of the original
     // subgraph with the encapsulated subgraph.
     Status BuildParallelCheckOp(
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
+    // Builds a placeholder node used to provide the key input to a RecvAtHost
+    // or SendFromHost node. This placeholder node will be removed by a later
+    // pass.
+    Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph,
+                                        Graph* graph_out);
+
+    // Get the set of outside_compilation clusters and the dependency edges
+    // between them.
+    void GetActiveClusterDependencyGraph(
+        std::unordered_set<string>* clusters,
+        std::unordered_set<string>* has_successor,
+        std::unordered_map<string, std::unordered_set<string>>* ancestors_map);
+
     // Builds a _RecvAtHost node producing all the inputs of an
     // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host.
-    Status AddRecvAtHostNode(const string& subgraph_name,
+    Status AddRecvAtHostNode(const string& group_attribute,
+                             const string& subgraph_name,
+                             const string& outside_compilation_attribute,
                              const string& oc_subgraph_name,
                              OutsideCompilationSubgraph* oc_subgraph,
                              Graph* graph_out);
@@ -398,8 +457,10 @@ class Encapsulator {
     // outside_compilation subgraph and stores it in oc_subgraph.send_from_host.
     Status AddSendFromHostNode(
         const std::unordered_map<const Node*, Node*>& node_images,
-        const string& subgraph_name, const string& oc_subgraph_name,
-        OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out);
+        const string& group_attribute, const string& subgraph_name,
+        const string& outside_compilation_attribute,
+        const string& oc_subgraph_name, OutsideCompilationSubgraph* oc_subgraph,
+        Graph* graph_out);
 
     // The subgraph extracted from the input graph, suitable for being turned
     // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are
@@ -413,6 +474,14 @@ class Encapsulator {
     // NodeDef for the function call node.
     NodeDef call_node_def_;
 
+    // Name that is used for the call node. This may not be
+    // call_node_def_.name() if the client supplies a rewrite lambda.
+    string function_def_name_;
+
+    // Placeholder node simulating the host compute key in the output graph.
+    // Not owned.
+    Node* host_compute_key_placeholder_ = nullptr;
+
     // Function call node(s) in the output graph. Not owned.
     // If parallel_checking is enabled, 'call_node_inputs' is the function call
     // node to which inputs should be fed, and 'call_node_outputs' is the
@@ -437,6 +506,14 @@ class Encapsulator {
     // The outside_compilation clusters in this subgraph.
     std::unordered_map<string, OutsideCompilationSubgraph>
         outside_compilation_subgraphs_;
+    // For each outside_compilation cluster C, the outside_compilation clusters
+    // that have a path to C outside the compiled graph.
+    std::unordered_map<string, std::unordered_set<string>>
+        outside_compilation_ancestors_;
+    // For each outside_compilation cluster C, the outside_compilation clusters
+    // that have a path from C outside the compiled graph.
+    std::unordered_map<string, std::unordered_set<string>>
+        outside_compilation_successors_;
 
     // NoOp node in the output graph that is sequenced after the call node and
     // used to prevent host-side outside_compilation sends and recvs from being
@@ -525,6 +602,10 @@ class Encapsulator {
       std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
           edges_added);
 
+  // Adds control dependencies between subgraph call nodes that have
+  // dependencies via outside_compilation edges.
+  Status AddCallNodeDependencies(Graph* graph_out);
+
   // Adds all edges to the output graph.
   Status AddEdgesToOutputGraph(
       const std::unordered_map<const Node*, Node*>& node_images,
@@ -547,11 +628,12 @@ class Encapsulator {
   // satisfied, e.g., because send_node depends on a node that doesn't have a
   // registered shape inference function.
   Status DoStaticShapeInferenceForOutsideCompilationSend(
-      const Graph& graph_in, const ShapeRefiner& shape_refiner,
+      const Graph& graph_in, const BackEdgeHelper& back_edge_helper,
+      const ShapeRefiner& shape_refiner,
       const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
       FunctionLibraryDefinition* library,
       std::vector<TensorShapeProto>* static_shape_out,
-      std::unique_ptr<GraphDef>* graphdef_out);
+      std::unique_ptr<Graph>* graph_out);
 
   // Makes a copy of graph containing only nodes that are ancestors of at least
   // one node in send_from_host_nodes and store it in pruned_graph. On exit
@@ -570,7 +652,7 @@ class Encapsulator {
   // to nodes in pruned_graph.
   Status MakeGraphForOutsideCompilationSends(
       const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-      ShapeRefiner* shape_refiner,
+      BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner,
       std::unordered_map<const Node*, Node*>* node_images,
       FunctionLibraryDefinition* library);
 
@@ -588,10 +670,65 @@ class Encapsulator {
   const Graph* graph_in_;
 
   std::unordered_map<string, Subgraph> subgraphs_;
+  // For each subgraph S the subgraphs S' such that there is a path in some
+  // outside_compilation cluster C in S to some outside_compilation cluster C'
+  // in S', that goes only through the uncompiled graph.
+  std::unordered_map<string, std::unordered_set<string>> subgraph_ancestors_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator);
 };
 
+namespace {
+
+// Return in 'sorted' a topological sort of clusters according to the
+// dependencies encoded in ancestors. clusters is the list of all clusters
+// including clusters that are not present in the ancestors map. has_successors
+// is the set of clusters that are ancestors of some other cluster.
+void TopologicalClusterSort(
+    const std::unordered_set<string>& clusters,
+    const std::unordered_set<string>& has_successors,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    std::vector<string>* sorted) {
+  // The nodes are placed in 'sorted' in topological order.
+  sorted->clear();
+  // We don't use the standard DFS because we are not operating on Node*
+  // objects.
+  struct Work {
+    string cluster;
+    bool leave;
+  };
+  std::set<string> visited;
+  std::vector<Work> stack;
+  // Seed the processing list with clusters that have no successors.
+  for (const auto& cluster : clusters) {
+    if (has_successors.find(cluster) == has_successors.end()) {
+      stack.push_back({cluster, false});
+    }
+  }
+  while (!stack.empty()) {
+    const Work item = stack.back();
+    stack.pop_back();
+    if (item.leave) {
+      sorted->push_back(item.cluster);
+      continue;
+    }
+
+    if (visited.find(item.cluster) != visited.end()) continue;
+    visited.insert(item.cluster);
+
+    stack.push_back({item.cluster, true});
+    const auto& iter = ancestors.find(item.cluster);
+    if (iter != ancestors.end()) {
+      for (const auto& ancestor : iter->second) {
+        stack.push_back({ancestor, false});
+      }
+    }
+  }
+  CHECK(sorted->size() == clusters.size());
+}
+
+}  // namespace
+
 Node* Encapsulator::Subgraph::GetCallNodeForInputs() const {
   return call_node_inputs_;
 }
@@ -712,49 +849,113 @@ Status Encapsulator::Subgraph::RecordResult(
   return Status::OK();
 }
 
-void Encapsulator::Subgraph::RecordOutsideCompilationInputOrControl(
-    const string& outside_compilation_id, const Edge* edge) {
+Encapsulator::Subgraph::OutsideCompilationSubgraph*
+Encapsulator::Subgraph::LookupOrCreateOutsideCompilationSubgraph(
+    const string& outside_compilation_id) {
   auto iter = outside_compilation_subgraphs_
                   .emplace(outside_compilation_id, OutsideCompilationSubgraph())
                   .first;
-  OutsideCompilationSubgraph& outside_subgraph = iter->second;
+  OutsideCompilationSubgraph* outside_subgraph = &iter->second;
+  return outside_subgraph;
+}
+
+void Encapsulator::Subgraph::RecordOutsideCompilationInputOrControl(
+    const string& outside_compilation_id, const Edge* edge) {
+  OutsideCompilationSubgraph* outside_subgraph =
+      LookupOrCreateOutsideCompilationSubgraph(outside_compilation_id);
   if (edge->IsControlEdge()) {
-    outside_subgraph.control_inputs.insert(edge->src());
+    outside_subgraph->control_inputs.insert(edge->src());
   } else {
-    int input_index = outside_subgraph.inputs.size();
-    outside_subgraph.inputs.emplace(NodeSlot(edge->src(), edge->src_output()),
-                                    input_index);
+    int input_index = outside_subgraph->inputs.size();
+    outside_subgraph->inputs.emplace(NodeSlot(edge->src(), edge->src_output()),
+                                     input_index);
   }
 }
 
 void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
     const string& outside_compilation_id, const Edge* edge) {
-  auto subgraph_iter =
-      outside_compilation_subgraphs_
-          .emplace(outside_compilation_id, OutsideCompilationSubgraph())
-          .first;
-  OutsideCompilationSubgraph& outside_subgraph = subgraph_iter->second;
+  OutsideCompilationSubgraph* outside_subgraph =
+      LookupOrCreateOutsideCompilationSubgraph(outside_compilation_id);
   if (edge->IsControlEdge()) {
-    outside_subgraph.control_outputs.insert(edge->dst());
+    outside_subgraph->control_outputs.insert(edge->dst());
   } else {
     DataType dtype = edge->dst()->input_type(edge->dst_input());
     auto output_iter =
-        outside_subgraph.outputs_by_src
+        outside_subgraph->outputs_by_src
             .emplace(NodeSlot(edge->src(), edge->src_output(), dtype),
-                     outside_subgraph.outputs_by_src.size())
+                     outside_subgraph->outputs_by_src.size())
             .first;
     int output_index = output_iter->second;
-    outside_subgraph.outputs_by_dst[NodeSlot(edge->dst(), edge->dst_input())] =
+    outside_subgraph->outputs_by_dst[NodeSlot(edge->dst(), edge->dst_input())] =
         output_index;
   }
 }
 
+void Encapsulator::Subgraph::RecordOutsideCompilationDependency(
+    const string& successor, const string& ancestor) {
+  outside_compilation_ancestors_[successor].insert(ancestor);
+  outside_compilation_successors_[ancestor].insert(successor);
+}
+
+const std::unordered_map<string, std::unordered_set<string>>
+Encapsulator::Subgraph::OutsideCompilationAncestorMap() const {
+  return outside_compilation_ancestors_;
+}
+
+void Encapsulator::Subgraph::GetActiveClusterDependencyGraph(
+    std::unordered_set<string>* clusters,
+    std::unordered_set<string>* has_successor,
+    std::unordered_map<string, std::unordered_set<string>>* ancestors_map) {
+  // During initial clustering the ancestor and successor datastructures may
+  // have been built including oc_cluster names that never turned into subgraphs
+  // because they had no edges into or out of the compiled cluster. Remove them
+  // before proceeding to simplify the logic. Get the set of clusters that was
+  // actually added, then remove references to the others.
+  for (const auto& oc_subgraph : outside_compilation_subgraphs_) {
+    clusters->insert(oc_subgraph.first);
+  }
+  for (const auto& cluster : outside_compilation_successors_) {
+    if (clusters->find(cluster.first) != clusters->end()) {
+      for (const auto& successor : cluster.second) {
+        if (clusters->find(successor) != clusters->end()) {
+          has_successor->insert(cluster.first);
+          break;
+        }
+      }
+    }
+  }
+  for (const auto& cluster : outside_compilation_ancestors_) {
+    if (clusters->find(cluster.first) != clusters->end()) {
+      std::unordered_set<string>& ancestors = (*ancestors_map)[cluster.first];
+      for (const auto& ancestor : cluster.second) {
+        if (clusters->find(ancestor) != clusters->end()) {
+          ancestors.insert(ancestor);
+        }
+      }
+    }
+  }
+}
+
 Status Encapsulator::Subgraph::AddHostComputes(
     const string& subgraph_name,
     const std::unordered_map<const Node*, Node*>& node_images) {
-  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
-    const string& oc_subgraph_name = oc_subgraph_iter.first;
-    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
+  // Get the set of outside_compilation clusters and the dependency edges
+  // between them.
+  std::unordered_set<string> clusters;
+  std::unordered_set<string> has_successor;
+  std::unordered_map<string, std::unordered_set<string>> ancestors_map;
+  GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map);
+  // Topologically sort the outside_compilation clusters according to their
+  // dependency relation.
+  std::vector<string> sorted_clusters;
+  TopologicalClusterSort(clusters, has_successor, ancestors_map,
+                         &sorted_clusters);
+
+  // The host compute nodes added for each outside_compilation_cluster;
+  std::unordered_map<string, Node*> host_compute_node;
+  for (const string& oc_subgraph_name : sorted_clusters) {
+    OutsideCompilationSubgraph& oc_subgraph =
+        outside_compilation_subgraphs_[oc_subgraph_name];
     if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
         !oc_subgraph.outputs_by_src.empty() ||
         !oc_subgraph.control_outputs.empty()) {
@@ -774,13 +975,22 @@ Status Encapsulator::Subgraph::AddHostComputes(
         inputs[input_index].Reset(src_image->name(), src_slot, dtype);
         input_dtypes[input_index] = dtype;
       }
-
       for (const auto& output : oc_subgraph.outputs_by_src) {
         DataType dtype = output.first.dtype;
         int output_index = output.second;
         output_dtypes[output_index] = dtype;
       }
 
+      std::vector<string> host_compute_ancestors;
+      const auto iter = ancestors_map.find(oc_subgraph_name);
+      if (iter != ancestors_map.end()) {
+        for (const string& ancestor_cluster : iter->second) {
+          host_compute_ancestors.push_back(
+              outside_compilation_subgraphs_[ancestor_cluster]
+                  .host_compute_name);
+        }
+      }
+
       NodeDef host_compute_def;
       NodeDefBuilder builder(strings::StrCat("outside_compilation_",
                                              oc_subgraph_name, "_host_compute"),
@@ -788,14 +998,17 @@ Status Encapsulator::Subgraph::AddHostComputes(
       builder.Input(inputs);
       builder.Attr("Tinputs", input_dtypes);
       builder.Attr("Toutputs", output_dtypes);
+      builder.Attr("ancestors", host_compute_ancestors);
       builder.Attr("key",
                    strings::StrCat("host_compute_channel_", subgraph_name, "_",
                                    oc_subgraph_name));
+      builder.Attr("_outside_compilation_subgraph", oc_subgraph_name);
       Status s = builder.Finalize(&host_compute_def);
       if (!s.ok()) return s;
 
       Node* host_compute = graph_->AddNode(host_compute_def, &s);
       if (!s.ok()) return s;
+      host_compute_node[host_compute->name()] = host_compute;
       oc_subgraph.host_compute_name = host_compute->name();
 
       // Connect the _HostCompute node to its producers in the subgraph.
@@ -814,6 +1027,12 @@ Status Encapsulator::Subgraph::AddHostComputes(
         graph_->AddControlEdge(src_image, host_compute);
       }
 
+      // Connect the _HostCompute node to its ancestor host compute nodes.
+      for (const auto& ancestor_name : host_compute_ancestors) {
+        Node* ancestor = host_compute_node[ancestor_name];
+        graph_->AddControlEdge(ancestor, host_compute);
+      }
+
       // Connect the consumers in the subgraph to the _HostCompute node.
       for (const auto& output : oc_subgraph.outputs_by_dst) {
         const Node* dst_node = output.first.node;
@@ -842,25 +1061,21 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
     NodeDef seq_def;
     NodeDefBuilder builder(strings::StrCat(subgraph_name, "_sequencer"),
                            "NoOp");
+    builder.Attr(kXlaHostTransferSequencerAttr, subgraph_name);
+    builder.Device(device_);
     Status s = builder.Finalize(&seq_def);
     if (!s.ok()) return s;
 
     sequencer_ = graph_out->AddNode(seq_def, &s);
     if (!s.ok()) return s;
-    sequencer_->set_assigned_device_name(device_);
   }
   return Status::OK();
 }
 
-void Encapsulator::Subgraph::ConnectSequencerToOutputs(Graph* graph_out) {
+void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) {
   if (sequencer_ != nullptr) {
-    std::unordered_set<Node*> output_dependencies;
-    for (Node* node : call_node_outputs_->out_nodes()) {
-      output_dependencies.insert(node);
-    }
-    for (Node* node : output_dependencies) {
-      graph_out->AddControlEdge(sequencer_, node);
-    }
+    VLOG(2) << "ConnectSequencerToCallNode";
+    graph_out->AddControlEdge(sequencer_, call_node_inputs_);
   }
 }
 
@@ -906,6 +1121,8 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
     name = call_node_def_.op();
   }
 
+  function_def_name_ = name;
+
   FunctionDef fdef;
   TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
 
@@ -924,8 +1141,10 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
 }
 
 Status Encapsulator::Subgraph::AddShapeInferenceInfo(
+    const string& subgraph_name,
     const string& outside_compilation_subgraph_name,
-    const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph) {
+    const std::vector<TensorShapeProto>& shapes, Graph* inference_graph,
+    FunctionLibraryDefinition* library) {
   OutsideCompilationSubgraph& oc_subgraph =
       outside_compilation_subgraphs_.at(outside_compilation_subgraph_name);
 
@@ -947,21 +1166,22 @@ Status Encapsulator::Subgraph::AddShapeInferenceInfo(
     host_compute->AddAttr("shape_inference_graph", "");
     host_compute->AddAttr("shapes", shapes);
   } else {
-    string serialized_graph;
-    if (!inference_graph->SerializeToString(&serialized_graph)) {
-      return errors::Internal(
-          "Failed to serialize graph for outside compilation subgraph ",
-          oc_subgraph.host_compute_name);
-    }
-    host_compute->AddAttr("shape_inference_graph", serialized_graph);
+    string inference_graph_name =
+        strings::StrCat("_outside_compilation_shape_inference_", subgraph_name,
+                        "_", outside_compilation_subgraph_name);
+    FunctionDef fdef;
+    TF_RETURN_IF_ERROR(
+        GraphToFunctionDef(*inference_graph, inference_graph_name, &fdef));
+    host_compute->AddAttr("shape_inference_graph", inference_graph_name);
     host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
+    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
   }
   return Status::OK();
 }
 
 Status Encapsulator::Subgraph::ReplaceFunctionDef(
     FunctionLibraryDefinition* library) {
-  const string& name = call_node_def_.name();
+  const string& name = function_def_name_;
 
   FunctionDef fdef;
   TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
@@ -1060,9 +1280,37 @@ Status Encapsulator::Subgraph::AddFunctionCallNode(
   return Status::OK();
 }
 
+Status Encapsulator::Subgraph::AddHostComputeKeyPlaceholder(
+    OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
+  TensorShapeProto shape_proto;
+  TensorShape shape({2});
+  shape.AsProto(&shape_proto);
+  GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
+  NodeDef key_def;
+  NodeDefBuilder builder(
+      strings::StrCat(call_node_def_.name(), "_key_placeholder"),
+      "Placeholder");
+  builder.Attr("dtype", DT_STRING);
+  builder.Attr("shape", shape_proto);
+  builder.Attr("_host_compute_call_node", call_node_def_.name());
+  Status s = builder.Finalize(&key_def);
+  if (!s.ok()) return s;
+
+  host_compute_key_placeholder_ = graph_out->AddNode(key_def, &s);
+  if (!s.ok()) return s;
+  host_compute_key_placeholder_->set_assigned_device_name(device_);
+
+  return Status::OK();
+}
+
 Status Encapsulator::Subgraph::AddRecvAtHostNode(
-    const string& subgraph_name, const string& oc_subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute, const string& oc_subgraph_name,
     OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
+  if (host_compute_key_placeholder_ == nullptr) {
+    TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out));
+  }
+
   std::vector<DataType> dtypes(oc_subgraph->inputs.size(), DT_INVALID);
 
   for (const auto& input : oc_subgraph->inputs) {
@@ -1078,15 +1326,23 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
+  builder.Device(device_);
   builder.Attr("Toutputs", dtypes);
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
+  builder.Attr("device_ordinal", 0);
   builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
                                       "_", oc_subgraph_name));
+  builder.Attr(group_attribute, subgraph_name);
+  builder.Attr(outside_compilation_attribute, oc_subgraph_name);
+  builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING);
   Status s = builder.Finalize(&recv_def);
   if (!s.ok()) return s;
 
   oc_subgraph->recv_at_host = graph_out->AddNode(recv_def, &s);
   if (!s.ok()) return s;
-  oc_subgraph->recv_at_host->set_assigned_device_name(device_);
+  graph_out->AddEdge(host_compute_key_placeholder_, 0,
+                     oc_subgraph->recv_at_host, 0);
 
   // Add a control dependency forcing the RecvAtHost to run before the subgraph
   // completes. This has no effect on execution order but prevents the
@@ -1099,8 +1355,13 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
 
 Status Encapsulator::Subgraph::AddSendFromHostNode(
     const std::unordered_map<const Node*, Node*>& node_images,
-    const string& subgraph_name, const string& oc_subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute, const string& oc_subgraph_name,
     OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
+  if (host_compute_key_placeholder_ == nullptr) {
+    TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out));
+  }
+
   std::vector<DataType> dtypes(oc_subgraph->outputs_by_src.size(), DT_INVALID);
   std::vector<NodeDefBuilder::NodeOut> inputs(
       oc_subgraph->outputs_by_src.size());
@@ -1120,16 +1381,24 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
+  builder.Device(device_);
   builder.Attr("Tinputs", dtypes);
   builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
                                       "_", oc_subgraph_name));
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
+  builder.Attr("device_ordinal", 0);
+  builder.Attr(group_attribute, subgraph_name);
+  builder.Attr(outside_compilation_attribute, oc_subgraph_name);
   builder.Input(inputs);
+  builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING);
   Status s = builder.Finalize(&send_def);
   if (!s.ok()) return s;
 
   oc_subgraph->send_from_host = graph_out->AddNode(send_def, &s);
   if (!s.ok()) return s;
-  oc_subgraph->send_from_host->set_assigned_device_name(device_);
+  graph_out->AddEdge(host_compute_key_placeholder_, 0,
+                     oc_subgraph->send_from_host, inputs.size());
 
   // Add a control dependency forcing the SendFromHost to run before the
   // subgraph completes. This has no effect on execution order but prevents the
@@ -1141,7 +1410,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
 }
 
 Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
-    const string& subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute,
     const std::unordered_map<const Node*, Node*>& node_images,
     Graph* graph_out) {
   for (auto& outside_compilation_subgraph_entry :
@@ -1151,14 +1421,16 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
         outside_compilation_subgraph_entry.second;
 
     if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
-      TF_RETURN_IF_ERROR(
-          AddRecvAtHostNode(subgraph_name, oc_name, &oc_subgraph, graph_out));
+      TF_RETURN_IF_ERROR(AddRecvAtHostNode(group_attribute, subgraph_name,
+                                           outside_compilation_attribute,
+                                           oc_name, &oc_subgraph, graph_out));
     }
 
     if (!oc_subgraph.outputs_by_src.empty() ||
         !oc_subgraph.control_outputs.empty()) {
-      TF_RETURN_IF_ERROR(AddSendFromHostNode(node_images, subgraph_name,
-                                             oc_name, &oc_subgraph, graph_out));
+      TF_RETURN_IF_ERROR(AddSendFromHostNode(
+          node_images, group_attribute, subgraph_name,
+          outside_compilation_attribute, oc_name, &oc_subgraph, graph_out));
     }
   }
   return Status::OK();
@@ -1375,8 +1647,6 @@ Status Encapsulator::CopyNodesToOutputGraph(
             "Parallel checking is not supported when outside_compilation "
             "clusters are present.");
       }
-      image->ClearAttr(group_attribute_);
-      image->ClearAttr(outside_compilation_attribute_);
     }
     (*node_images)[node] = image;
   }
@@ -1402,7 +1672,8 @@ Status Encapsulator::AddOutsideCompilationHostIONodes(
     const string& subgraph_name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
     TF_RETURN_IF_ERROR(subgraph.AddOutsideCompilationHostIONodes(
-        subgraph_name, node_images, graph_out));
+        group_attribute_, subgraph_name, outside_compilation_attribute_,
+        node_images, graph_out));
   }
   return Status::OK();
 }
@@ -1564,6 +1835,17 @@ Status Encapsulator::CopyEdgeToOutputGraph(
   return Status::OK();
 }
 
+Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
+  for (const auto& ancestors : subgraph_ancestors_) {
+    const string& subgraph = ancestors.first;
+    for (const string& ancestor : ancestors.second) {
+      graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(),
+                                subgraphs_[subgraph].GetCallNodeForInputs());
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::AddEdgesToOutputGraph(
     const std::unordered_map<const Node*, Node*>& node_images,
     bool parallel_checking, Graph* graph_out) {
@@ -1611,8 +1893,9 @@ Status Encapsulator::AddEdgesToOutputGraph(
 
   for (auto& subgraph_entry : subgraphs_) {
     Subgraph& subgraph = subgraph_entry.second;
-    subgraph.ConnectSequencerToOutputs(graph_out);
+    subgraph.ConnectSequencerToCallNode(graph_out);
   }
+  TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out));
 
   return Status::OK();
 }
@@ -1625,9 +1908,13 @@ namespace {
 // matter because it will only be used subsequently for shape inference. (It
 // would be possible to add a switch statement over data_type to create a value
 // for the constant, but that would entail maintaining the logic as new types
-// are added, and is not necessary.)
-Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
-                         Graph* graph_out) {
+// are added, and is not necessary.) If the node being replaced was within a
+// control flow frame, adds appropriate Enter nodes so that the use of the Const
+// is well-formed.
+Node* AddDummyShapedNode(const Node* src_node, int src_port,
+                         const std::vector<ControlFlowInfo>& control_flow_info,
+                         const TensorShapeProto& shape, Graph* graph_out) {
+  DataType data_type = src_node->output_type(src_port);
   TensorProto dummy_proto;
   dummy_proto.set_dtype(data_type);
   *dummy_proto.mutable_tensor_shape() = shape;
@@ -1638,7 +1925,23 @@ Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
   NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
                            options.op_registry());
   node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
-  return options.FinalizeBuilder(&node_builder);
+  Node* node = options.FinalizeBuilder(&node_builder);
+  // Add any Enter nodes required to bring the constant to the correct control
+  // flow frame.
+  while (!control_flow_info[src_node->id()].frame_name.empty()) {
+    NodeBuilder enter_builder(options.GetNameForOp("Enter"), "Enter",
+                              options.op_registry());
+    enter_builder.Attr("frame_name",
+                       control_flow_info[src_node->id()].frame_name);
+    enter_builder.Attr("is_constant", true);
+    enter_builder.Input(node, 0);
+    Node* enter_node = options.FinalizeBuilder(&enter_builder);
+    // Adopt the new Enter node as the value in the current frame.
+    node = enter_node;
+    // Recurse to the parent frame to see if more Enter nodes need to be added.
+    src_node = control_flow_info[src_node->id()].parent_frame;
+  }
+  return node;
 }
 
 // Adds a copy of node_in to graph_out and adds the mapping to
@@ -1680,17 +1983,30 @@ Status CopyShapeInferenceNodeToGraph(
       }
     }
   }
+  // Work around the fact that Enter nodes refuse to propagate shape information
+  // unless they are marked loop invariant. Since we are never going to execute
+  // this graph, marking them all loop invariant is fine.
+  if (node_out->type_string() == "Enter") {
+    node_out->ClearAttr("is_constant");
+    node_out->AddAttr("is_constant", true);
+  }
   return Status::OK();
 }
 
 }  // namespace
 
 Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
-    const Graph& graph_in, const ShapeRefiner& shape_refiner,
+    const Graph& graph_in, const BackEdgeHelper& back_edge_helper,
+    const ShapeRefiner& shape_refiner,
     const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
     FunctionLibraryDefinition* library,
     std::vector<TensorShapeProto>* static_shape_out,
-    std::unique_ptr<GraphDef>* graphdef_out) {
+    std::unique_ptr<Graph>* graph_out) {
+  // Get the control flow structure of the input graph so we can build
+  // well-formed output graphs.
+  std::vector<ControlFlowInfo> control_flow_info;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(&graph_in, &control_flow_info));
+
   // Maps from nodes in graph_in to nodes in graph_out.
   //
   // When an edge has fully defined shape the source node in graph_in is
@@ -1707,13 +2023,14 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
   std::unordered_map<Node*, Node*> dummy_node_images;
   std::unordered_map<Node*, Node*> copied_node_images;
 
-  std::unique_ptr<Graph> graph_out(new Graph(graph_in.op_registry()));
-  graph_out->set_versions(graph_in.versions());
-  static_shape_out->resize(send_node->num_inputs());
+  graph_out->reset(new Graph(graph_in.op_registry()));
+  (*graph_out)->set_versions(graph_in.versions());
+  // The final input to the send node is the dynamic key, which we don't include
+  // in the static shapes.
+  static_shape_out->resize(send_node->num_inputs() - 1);
 
   // We don't use the standard ReverseDFS because we want to cut off traversal
   // whenever we find an output with fully defined shape.
-  // TODO(misard) make this work properly in the presence of control flow.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving node?
@@ -1728,7 +2045,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     if (w.leave) {
       TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph(
           n, send_node, dummy_node_images, library, &copied_node_images,
-          graph_out.get()));
+          graph_out->get()));
     } else {
       if (visited[n->id()]) continue;
       visited[n->id()] = true;
@@ -1750,14 +2067,24 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
             // continue.
             TensorShapeProto proto;
             context->ShapeHandleToProto(shape, &proto);
-            dummy_node_images[src_node] = AddDummyShapedNode(
-                src_node->output_type(src_port), proto, graph_out.get());
-            if (n == send_node) {
+            if (dummy_node_images.find(src_node) == dummy_node_images.end()) {
+              dummy_node_images[src_node] =
+                  AddDummyShapedNode(src_node, src_port, control_flow_info,
+                                     proto, graph_out->get());
+            }
+            // The final input to the send node is the dynamic key, which we
+            // don't include in the static shapes.
+            if (n == send_node &&
+                in_edge->dst_input() < static_shape_out->size()) {
               (*static_shape_out)[in_edge->dst_input()] = proto;
             }
           } else {
+            has_parent_with_unknown_shape = true;
             if (!visited[src_node->id()]) {
-              has_parent_with_unknown_shape = true;
+              if (VLOG_IS_ON(2)) {
+                TensorShapeProto proto;
+                context->ShapeHandleToProto(shape, &proto);
+              }
               stack.push_back({src_node, false});
             }
           }
@@ -1768,7 +2095,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
           // The shapes of all the inputs to send_node are statically known. We
           // won't have to do any inference at compile time so return now: the
           // shapes were stored in static_shape_out above.
-          graphdef_out->reset();
+          graph_out->reset();
           return Status::OK();
         } else {
           // Any shape that is being processed is either the original send node
@@ -1791,12 +2118,217 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     }
   }
 
-  graphdef_out->reset(new GraphDef());
-  graph_out->ToGraphDef(graphdef_out->get());
+  for (const auto edge : back_edge_helper.RemovedEdges()) {
+    if (copied_node_images.find(edge.dst) != copied_node_images.end()) {
+      // The destination of this back edge was added to the inference graph, so
+      // fix it up.
+      Node* dst = copied_node_images[edge.dst];
+      if (dst->type_string() != "Merge") {
+        return errors::InvalidArgument(
+            "outside_compilation cluster contains a back-edge to node ",
+            dst->name(), " of type ", dst->type_string(),
+            ". The analysis pass only supports back-edges to Merge nodes.");
+      }
+      const Edge* existing_input_edge;
+      if (edge.dst_input != 1 || dst->num_inputs() != 2 ||
+          !dst->input_edge(0, &existing_input_edge).ok()) {
+        // TODO(misard) if we see graphs built with a different structure, relax
+        // this constraint. Leaving it here for now to avoid writing unnecessary
+        // complex code since we believe graphs generated by front ends all have
+        // the back edge as the second input to the merge node.
+        return errors::Internal(
+            "Internal assumption failed while rewriting an outside_compilation "
+            "cluster that contains a while loop. Logic assumes back-edge is to "
+            "port 1 of a 2-input "
+            "Merge node.");
+      }
+      // Connect the existing edge to both inputs of the Merge node so that the
+      // graph will be well-formed.
+      (*graph_out)
+          ->AddEdge(existing_input_edge->src(),
+                    existing_input_edge->src_output(), dst, edge.dst_input);
+    }
+  }
 
   return Status::OK();
 }
 
+namespace {
+
+// Helper struct for building cluster dependencies and also debugging cycles in
+// the dependencies. While computing dependencies we construct a mapping from
+// Node* to PathDetails.
+struct PathDetails {
+  struct SubgraphAndCluster {
+    string subgraph;
+    string outside_compilation_cluster;
+    bool operator==(const SubgraphAndCluster& other) const {
+      return subgraph == other.subgraph &&
+             outside_compilation_cluster == other.outside_compilation_cluster;
+    }
+  };
+
+  struct SubgraphAndClusterHash {
+    inline std::size_t operator()(const SubgraphAndCluster& v) const {
+      return hash<string>()(
+          strings::StrCat(v.subgraph, v.outside_compilation_cluster));
+    }
+  };
+
+  typedef std::unordered_set<SubgraphAndCluster, SubgraphAndClusterHash>
+      SubgraphAndClusterSet;
+
+  // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as
+  // ancestors for any successor of this node. If the node is in the outer
+  // graph, it returns the transitive union of the ancestors of the node's
+  // inputs. If the node is in an outside_compilation cluster, it returns just
+  // that cluster. If the node is compiled, it returns the empty set.
+  SubgraphAndClusterSet AncestorsForSuccessor() {
+    if (subgraph.empty()) {
+      return ancestor_clusters;
+    } else if (outside_compilation_cluster.empty()) {
+      return SubgraphAndClusterSet();
+    } else {
+      SubgraphAndCluster entry;
+      entry.subgraph = subgraph;
+      entry.outside_compilation_cluster = outside_compilation_cluster;
+      return SubgraphAndClusterSet({entry});
+    }
+  }
+
+  // The transitive union of the ancestor's of this node's inputs. This is only
+  // saved for debugging in order to print out enough information to debug a
+  // discovered cycle.
+  SubgraphAndClusterSet ancestor_clusters;
+  // The subgraph attr on this node.
+  string subgraph;
+  // The outside_compilation attr on this node.
+  string outside_compilation_cluster;
+};
+
+// Adds an edge from ancestor to successor to the cycle detector, and returns an
+// error if that edge causes the formation of a cycle. In the error case, logs
+// the contents of the node_ancestors_map to facilitate debugging.
+Status CheckClusterDependencyForCycles(
+    const string& ancestor, const string& successor,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
+    GraphCycles* cycle_detector, std::map<string, int>* cycle_detector_map) {
+  if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
+  }
+  if (cycle_detector_map->find(successor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[successor] = cycle_detector->NewNode();
+  }
+
+  if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor],
+                                  (*cycle_detector_map)[successor])) {
+    LOG(ERROR) << "Cycle in outside_compilation clusters";
+    for (const auto& cluster : ancestors) {
+      LOG(ERROR) << "Cluster " << cluster.first << " depends on:";
+      for (const auto& ancestor : cluster.second) {
+        LOG(ERROR) << "  " << ancestor;
+      }
+    }
+    for (const auto& node_ancestors : node_ancestors_map) {
+      LOG(ERROR) << "Node " << node_ancestors.first->name() << " ("
+                 << node_ancestors.second.subgraph << ";"
+                 << node_ancestors.second.outside_compilation_cluster
+                 << ") has ancestor clusters:";
+      for (const auto& ancestor : node_ancestors.second.ancestor_clusters) {
+        LOG(ERROR) << "  " << ancestor.subgraph << ";"
+                   << ancestor.outside_compilation_cluster;
+      }
+    }
+    return errors::InvalidArgument(
+        "Can't compile outside_compilation clusters because there is a "
+        "dependency cycle: see error log for details.");
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status Encapsulator::FindClusterDependencies() {
+  // Map from nodes to ancestor details. A node is entered into the map if it is
+  // in a compilation subgraph, and outside_compilation cluster, or appears on a
+  // path in the outer graph leading from an outside_compilation subgraph.
+  std::unordered_map<Node*, PathDetails> node_ancestors_map;
+  // We check that clusters are acyclic using this cycle detector.
+  GraphCycles cycle_detector;
+  // Map from cluster name to cycle detector node id.
+  std::map<string, int> cycle_detector_map;
+  // Process the nodes in topologically-sorted order.
+  std::vector<Node*> nodes;
+  GetReversePostOrder(*graph_in_, &nodes);
+  for (Node* node : nodes) {
+    string subgraph_name;
+    string oc_cluster;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster));
+    // First create an entry in the ancestors map if the node is in a compiled
+    // subgraph or outside_compilation cluster, or if any incoming edge is from
+    // a node with an ancestor map entry; and find the union of all the
+    // ancestors.
+    if (!subgraph_name.empty()) {
+      node_ancestors_map[node].subgraph = subgraph_name;
+      node_ancestors_map[node].outside_compilation_cluster = oc_cluster;
+    }
+    for (Node* src : node->in_nodes()) {
+      const auto iter = node_ancestors_map.find(src);
+      if (iter != node_ancestors_map.end()) {
+        const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor();
+        for (const auto& ancestor : ancestors_to_follow) {
+          if (ancestor.subgraph != subgraph_name ||
+              ancestor.outside_compilation_cluster != oc_cluster) {
+            node_ancestors_map[node].ancestor_clusters.insert(ancestor);
+          }
+        }
+      }
+    }
+    if (!subgraph_name.empty()) {
+      // The node is in a compiled subgraph or an outside_compilation cluster.
+      if (oc_cluster.empty()) {
+        // The node is not in an outside_compilation cluster. Record the
+        // subgraph's ancestor dependencies.
+        for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) {
+          if (cluster.subgraph != subgraph_name) {
+            subgraph_ancestors_[subgraph_name].insert(cluster.subgraph);
+            TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                cluster.subgraph, subgraph_name, subgraph_ancestors_,
+                node_ancestors_map, &cycle_detector, &cycle_detector_map));
+          }
+        }
+      } else {
+        Subgraph& subgraph = subgraphs_[subgraph_name];
+        // The node is in an outside_compilation cluster. Record the cluster
+        // and/or subgraph ancestor dependencies.
+        for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) {
+          if (cluster.subgraph == subgraph_name) {
+            // The ancestor is in the same subgraph.
+            if (cluster.outside_compilation_cluster != oc_cluster) {
+              // But not in the same oc_cluster, so record the dependency.
+              subgraph.RecordOutsideCompilationDependency(
+                  oc_cluster, cluster.outside_compilation_cluster);
+              TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                  cluster.outside_compilation_cluster, oc_cluster,
+                  subgraph.OutsideCompilationAncestorMap(), node_ancestors_map,
+                  &cycle_detector, &cycle_detector_map));
+            }
+          } else {
+            // The ancestor is in a different subgraph, so record the
+            // dependency.
+            subgraph_ancestors_[subgraph_name].insert(cluster.subgraph);
+            TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                cluster.subgraph, subgraph_name, subgraph_ancestors_,
+                node_ancestors_map, &cycle_detector, &cycle_detector_map));
+          }
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::MakePrunedGraphCopyAndInline(
     const Graph& graph, const std::vector<Node*>& sink_nodes,
     std::unique_ptr<Graph>* pruned_graph,
@@ -1861,7 +2393,7 @@ Status Encapsulator::MakePrunedGraphCopyAndInline(
 
 Status Encapsulator::MakeGraphForOutsideCompilationSends(
     const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-    ShapeRefiner* shape_refiner,
+    BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner,
     std::unordered_map<const Node*, Node*>* node_images,
     FunctionLibraryDefinition* library) {
   // Find all the send_from_host nodes in all subgraphs, to use as roots for the
@@ -1883,10 +2415,15 @@ Status Encapsulator::MakeGraphForOutsideCompilationSends(
   // nodes, inlining any functions as needed.
   TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
       graph, send_from_host_nodes, pruned_graph, node_images, library));
+  FixupSourceAndSinkEdges(pruned_graph->get());
+
+  // Remove back edges from any cycles in the pruned graph to simplify shape
+  // inference traversal. They will be fixed up in the per-subgraph shape
+  // inference graphs stored in the function library.
+  TF_RETURN_IF_ERROR(back_edge_helper->Remove(pruned_graph->get()));
 
   // Perform shape inference on the pruned graph.
   shape_refiner->set_require_shape_inference_fns(false);
-  FixupSourceAndSinkEdges(pruned_graph->get());
   std::vector<Node*> post_order;
   GetReversePostOrder(*(*pruned_graph), &post_order);
   for (auto node : post_order) {
@@ -1904,20 +2441,28 @@ Status Encapsulator::MakeGraphForOutsideCompilationSends(
 
 Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
     Graph* graph_out, FunctionLibraryDefinition* library) {
+  BackEdgeHelper back_edge_helper;
   std::unique_ptr<Graph> pruned_graph;
   ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
   std::unordered_map<const Node*, Node*> node_images;
   TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
-      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
+      *graph_out, &pruned_graph, &back_edge_helper, &shape_refiner,
+      &node_images, library));
+
+  if (VLOG_IS_ON(1)) {
+    dump_graph::DumpGraphToFile("pruned_graph_for_shape_inference",
+                                *pruned_graph, library);
+  }
 
   for (auto& subgraph_entry : subgraphs_) {
+    const string& subgraph_name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
     // Find all the recv_at_host nodes in this subgraph.
     std::vector<string> outside_compilation_names;
     subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
     std::unordered_set<string> recv_at_host_names;
-    for (const auto& name : outside_compilation_names) {
-      Node* recv_node = subgraph.GetRecvAtHostNode(name);
+    for (const auto& oc_name : outside_compilation_names) {
+      Node* recv_node = subgraph.GetRecvAtHostNode(oc_name);
       if (recv_node != nullptr) {
         recv_at_host_names.insert(recv_node->name());
       }
@@ -1926,26 +2471,30 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
     // without knowing the shape of the recv_at_host nodes, and store the
     // result, along with enough information to complete the job at compile time
     // once the recv_at_host shapes are known.
-    for (const auto& name : outside_compilation_names) {
-      Node* send_node = subgraph.GetSendFromHostNode(name);
+    for (const auto& oc_name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(oc_name);
       std::vector<TensorShapeProto> static_shape;
-      std::unique_ptr<GraphDef> graphdef;
+      std::unique_ptr<Graph> graph;
       if (send_node != nullptr) {
         TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
-            *pruned_graph, shape_refiner, recv_at_host_names,
-            node_images[send_node], library, &static_shape, &graphdef));
-        if (graphdef == nullptr) {
+            *pruned_graph, back_edge_helper, shape_refiner, recv_at_host_names,
+            node_images[send_node], library, &static_shape, &graph));
+        if (graph == nullptr) {
           VLOG(2) << "Send node  " << send_node->name() << " shapes";
           for (int i = 0; i < static_shape.size(); ++i) {
             VLOG(2) << static_shape[i].DebugString();
           }
         } else {
-          VLOG(2) << "Send node " << send_node->name() << " graph\n"
-                  << graphdef->DebugString();
+          if (VLOG_IS_ON(2)) {
+            GraphDef graphdef;
+            graph->ToGraphDef(&graphdef);
+            VLOG(2) << "Send node " << send_node->name() << " graph\n"
+                    << graphdef.DebugString();
+          }
         }
       }
-      TF_RETURN_IF_ERROR(
-          subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get()));
+      TF_RETURN_IF_ERROR(subgraph.AddShapeInferenceInfo(
+          subgraph_name, oc_name, static_shape, graph.get(), library));
     }
     if (!outside_compilation_names.empty()) {
       TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library));
@@ -1986,6 +2535,7 @@ Status EncapsulateSubgraphsInFunctions(
   Encapsulator encapsulator(std::move(group_attribute),
                             std::move(outside_compilation_attribute),
                             &graph_in);
+  TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies());
   TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs());
 
   TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs(
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index aed9cae0f1799c4524da8ee309344849798755d5..5ec24d39a2c40a766dbb0ec51ebe798de620e24b 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
 #include <utility>
 
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
@@ -20,15 +21,38 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
 namespace {
 
+const char* const kXlaHostTransferSequencerAttr =
+    "_xla_host_transfer_sequencer";
+
+Status AddGraphDefToFunctionLibrary(const GraphDefBuilder& graphdef_builder,
+                                    const string& name_suffix,
+                                    FunctionDefLibrary* library) {
+  GraphDef graphdef;
+  TF_RETURN_IF_ERROR(graphdef_builder.ToGraphDef(&graphdef));
+  std::unique_ptr<Graph> graph =
+      std::unique_ptr<Graph>(new Graph(OpRegistry::Global()));
+  GraphConstructorOptions opts;
+  opts.allow_internal_ops = true;
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graphdef, graph.get()));
+  FunctionDef* fdef = library->add_function();
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(
+      *graph,
+      strings::StrCat("_outside_compilation_shape_inference_", name_suffix),
+      fdef));
+  return Status::OK();
+}
+
 template <class Tkey, class Tvalue>
 bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
                    const ::tensorflow::protobuf::Map<Tkey, Tvalue>& b,
@@ -50,7 +74,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
     if (!compare(elt_a.first, elt_a.second, iter->second)) {
       if (diff) {
         *diff = strings::StrCat(map_name, " expected: element with key '",
-                                key_to_string(elt_a.first), " has value '",
+                                key_to_string(elt_a.first), "' has value '",
                                 value_to_string(elt_a.second), "' got: '",
                                 value_to_string(iter->second), "'");
       }
@@ -97,8 +121,22 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
     }
     return false;
   }
+  std::unordered_set<string> control_input_a;
+  std::unordered_set<string> control_input_b;
   for (int i = 0; i < a.input_size(); ++i) {
-    if (a.input(i) != b.input(i)) {
+    if (str_util::StartsWith(a.input(i), "^")) {
+      if (!str_util::StartsWith(b.input(i), "^")) {
+        if (diff) {
+          *diff = strings::StrCat(
+              diff_preamble, " mismatch for node ", a.name(), " input ", i,
+              ", expected control input ", a.input(i), " got ", b.input(i),
+              " expected:\n", a.DebugString(), "\ngot:\n", b.DebugString());
+        }
+        return false;
+      }
+      control_input_a.insert(a.input(i));
+      control_input_b.insert(b.input(i));
+    } else if (a.input(i) != b.input(i)) {
       if (diff) {
         *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
                                 " input ", i, ", expected ", a.input(i),
@@ -108,24 +146,26 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
       return false;
     }
   }
+  if (control_input_a != control_input_b) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              " control inputs differ expected:\n",
+                              a.DebugString(), "\ngot:\n", b.DebugString());
+    }
+    return false;
+  }
   return EqualProtoMap<string, AttrValue>(
       a.attr(), b.attr(), [](const string& s) { return s; },
       [](const AttrValue& v) { return v.DebugString(); },
       [](const string& key, const AttrValue& av, const AttrValue& bv) {
-        if (key == "shape_inference_graph") {
-          // Default serialization of GraphDef is unstable because maps don't
-          // serialize deterministically. Rather than go through the hoops to
-          // turn on deterministic serialization of this attr just for this
-          // test, add logic here to compare determinstically.
-          GraphDef ga;
-          if (!ga.ParseFromString(av.s())) {
-            return false;
-          }
-          GraphDef gb;
-          if (!gb.ParseFromString(bv.s())) {
-            return false;
-          }
-          return EqualGraphDef(ga, gb, nullptr);
+        if (key == "ancestors") {
+          // The ancestors are added from a set so the order is unpredictable;
+          // just compare set equality not list equality.
+          std::unordered_set<string> a_set(av.list().s().begin(),
+                                           av.list().s().end());
+          std::unordered_set<string> b_set(bv.list().s().begin(),
+                                           bv.list().s().end());
+          return a_set == b_set;
         } else {
           return av.DebugString() == bv.DebugString();
         }
@@ -246,26 +286,33 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
         << diff << "\nActual: " << actual.DebugString();          \
   } while (false)
 
-// TODO(misard): remove these fake registrations once there are real Ops to be
-// compiled.
-REGISTER_OP("_XlaHostCompute")
+// These dummy Op registrations are here because the real Op registrations live
+// in contrib and there can't be a dependence from this test to contrib.
+REGISTER_OP("XlaHostCompute")
     .Input("inputs: Tinputs")
     .Output("outputs: Toutputs")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Toutputs: list(type) >= 0")
+    .Attr("ancestors: list(string) >= 0")
     .Attr("key: string")
+    .Attr("shape_inference_graph: string = ''")
+    .Attr("shapes: list(shape) >= 0")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("_XlaSendFromHost")
-    .Input("input: Tinputs")
+    .Input("inputs: Tinputs")
+    .Input("dynamic_key: string")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("key: string")
+    .Attr("device_ordinal: int")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("_XlaRecvAtHost")
-    .Output("output: Toutputs")
+    .Input("dynamic_key: string")
+    .Output("outputs: Toutputs")
     .Attr("Toutputs: list(type) >= 0")
     .Attr("key: string")
+    .Attr("device_ordinal: int")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("InputTest")
@@ -315,8 +362,13 @@ REGISTER_OP("AddNLikeTest")
     .SetIsCommutative()
     .SetIsAggregate();
 
-Node* NoOp(const GraphDefBuilder::Options& opts) {
-  return ops::SourceOp("NoOp", opts);
+Node* Sequencer(const GraphDefBuilder::Options& opts,
+                const string& call_node_name) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("NoOp"), "NoOp",
+                           opts.op_registry());
+  return opts.WithAttr(kXlaHostTransferSequencerAttr, call_node_name)
+      .FinalizeBuilder(&node_builder);
 }
 
 Node* Input(const GraphDefBuilder::Options& opts) {
@@ -327,43 +379,85 @@ Node* InputShaped(const GraphDefBuilder::Options& opts) {
   return ops::SourceOp("InputTestShaped", opts);
 }
 
-Node* KnownShape(const gtl::ArraySlice<int>& shape,
-                 const GraphDefBuilder::Options& opts) {
+Node* KnownShapeBase(DataType dtype, const gtl::ArraySlice<int>& shape,
+                     const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const",
                            opts.op_registry());
   TensorProto value;
-  value.set_dtype(DT_FLOAT);
+  value.set_dtype(dtype);
   for (int dim : shape) {
     value.mutable_tensor_shape()->add_dim()->set_size(dim);
   }
   return opts.WithAttr("value", value)
-      .WithAttr("dtype", DT_FLOAT)
+      .WithAttr("dtype", dtype)
       .FinalizeBuilder(&node_builder);
 }
 
-Node* RecvAtHost(const string& key, const gtl::ArraySlice<DataType>& dtypes,
+Node* KnownShape(const gtl::ArraySlice<int>& shape,
+                 const GraphDefBuilder::Options& opts) {
+  return KnownShapeBase(DT_FLOAT, shape, opts);
+}
+
+Node* KeyPlaceholderShape(const GraphDefBuilder::Options& opts) {
+  return KnownShapeBase(DT_STRING, {2}, opts);
+}
+
+Node* KeyPlaceholder(const string& call_node,
+                     const GraphDefBuilder::Options& opts) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("Placeholder"), "Placeholder",
+                           opts.op_registry());
+  TensorShapeProto shape;
+  shape.add_dim()->set_size(2);
+  return opts.WithAttr("shape", shape)
+      .WithAttr("dtype", DT_STRING)
+      .WithAttr("_host_compute_call_node", call_node)
+      .FinalizeBuilder(&node_builder);
+}
+
+Node* RecvAtHost(ops::NodeOut key_input, const string& cluster,
+                 const string& oc_cluster,
+                 const gtl::ArraySlice<DataType>& dtypes,
                  const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
+  string key =
+      strings::StrCat("host_compute_channel_", cluster, "_", oc_cluster);
+  string name = strings::StrCat("outside_compilation_", cluster, "_",
+                                oc_cluster, "_recv");
+  NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaRecvAtHost"),
                            "_XlaRecvAtHost", opts.op_registry());
+  node_builder.Input(std::move(key_input));
   return opts.WithAttr("Toutputs", dtypes)
       .WithAttr("key", key)
+      .WithAttr("device_ordinal", 0)
+      .WithAttr("_encapsulate", cluster)
+      .WithAttr("_outside", oc_cluster)
       .FinalizeBuilder(&node_builder);
 }
 
-Node* SendFromHost(const string& key, const std::vector<ops::NodeOut>& inputs,
+Node* SendFromHost(ops::NodeOut key_input, const string& cluster,
+                   const string& oc_cluster,
+                   const std::vector<ops::NodeOut>& inputs,
                    const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
+  string key =
+      strings::StrCat("host_compute_channel_", cluster, "_", oc_cluster);
+  string name = strings::StrCat("outside_compilation_", cluster, "_",
+                                oc_cluster, "_send");
+  NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaSendFromHost"),
                            "_XlaSendFromHost", opts.op_registry());
   node_builder.Input(inputs);
+  node_builder.Input(std::move(key_input));
   std::vector<DataType> dtypes;
   for (const auto& node : inputs) {
     dtypes.push_back(node.dt);
   }
-  return opts.WithAttr("key", key)
-      .WithAttr("Tinputs", dtypes)
+  return opts.WithAttr("Tinputs", dtypes)
+      .WithAttr("key", key)
+      .WithAttr("device_ordinal", 0)
+      .WithAttr("_encapsulate", cluster)
+      .WithAttr("_outside", oc_cluster)
       .FinalizeBuilder(&node_builder);
 }
 
@@ -711,7 +805,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              StringPiece(n->name()).starts_with("const")) {
+              str_util::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -756,7 +850,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              StringPiece(n->name()).starts_with("const")) {
+              str_util::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -806,19 +900,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
   *library_expected.add_function() = test::function::XTimesTwo();
@@ -833,13 +928,16 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"C:o:0", "c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -851,24 +949,29 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    NodeBuilder node_builder("F1", "F1", lib_def.get());
-    node_builder.Input(a).Input(b);
-    Node* call = b2.opts().FinalizeBuilder(&node_builder);
-
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                              b2.opts().WithControlInput(e));
 
-    Node* s = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
+    Node* s = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
 
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(a).Input(b);
+    Node* call =
+        b2.opts().WithControlInputs({s}).FinalizeBuilder(&node_builder);
+
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -918,38 +1021,43 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected_1;
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape1.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape1.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape1.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape1_graph;
-    TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph));
-    EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1));
+                     shape1.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
 
-  string shape_string_expected_2;
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant =
+        KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     shape2.opts().WithName("E"));
-    Node* recv2 =
-        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
-    SendFromHost("host_compute_channel_F1_O2", {h},
-                 shape2.opts().WithName("outside_compilation_F1_O2_send"));
-    GraphDef shape2_graph;
-    TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph));
-    EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2));
+                     shape2.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+                     shape2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, shape2.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
 
   *library_expected.add_function() = FunctionDefHelper::Create(
@@ -966,22 +1074,29 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O2_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"D:o:0", "F:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors",
+             gtl::ArraySlice<string>({"outside_compilation_O1_host_compute"})},
             {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph", shape_string_expected_2},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
-           {"F"}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O2"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O2"}},
+           {"F", "outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected_1},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"i_0_retval", "I:o:0"}});
@@ -993,35 +1108,45 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    NodeBuilder node_builder("F1", "F1", lib_def.get());
-    node_builder.Input(a).Input(b);
-    Node* call = b2.opts().FinalizeBuilder(&node_builder);
-
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
-
-    Node* recv2 =
-        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O2_recv"));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* g = Binary(e, ops::NodeOut(recv2, 1),
-                     b2.opts().WithName("G").WithControlInputs({recv2, e}));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
+                     b2.opts()
+                         .WithName("G")
+                         .WithControlInputs({recv2, e})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+                     b2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
     Node* send2 =
-        SendFromHost("host_compute_channel_F1_O2", {h},
-                     b2.opts().WithName("outside_compilation_F1_O2_send"));
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, b2.opts());
+
+    Node* s = Sequencer(b2.opts()
+                            .WithName("F1_sequencer")
+                            .WithControlInputs({recv1, send1, recv2, send2}),
+                        "F1");
 
-    Node* s = NoOp(b2.opts()
-                       .WithName("F1_sequencer")
-                       .WithControlInputs({recv1, send1, recv2, send2}));
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(a).Input(b);
+    Node* call = b2.opts().WithControlInput(s).FinalizeBuilder(&node_builder);
 
-    Binary(g, call, b2.opts().WithName("J").WithControlInput(s));
+    Binary(g, call, b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1070,19 +1195,20 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
   TensorShapeProto shape_proto_expected;
@@ -1100,13 +1226,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
@@ -1120,14 +1249,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            "BinaryTest",
            {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"G:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F2_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -1138,39 +1269,221 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = InputShaped(b2.opts().WithName("B"));
 
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* key_constant1 =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
+
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Node* recv2 =
-        RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F2_O1_recv"));
+    Node* key_constant2 =
+        KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
+                             {DT_FLOAT}, b2.opts());
     Node* h = Binary(ops::NodeOut(call1, 1), recv2,
-                     b2.opts().WithName("H").WithControlInput(s1));
-    Node* send2 =
-        SendFromHost("host_compute_channel_F2_O1", {h},
-                     b2.opts().WithName("outside_compilation_F2_O1_send"));
+                     b2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F2")
+                         .WithAttr("_outside", "O1"));
+    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                               b2.opts());
 
+    Node* s2 = Sequencer(
+        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
+        "F2");
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
     node_builder2.Input(e).Input(call1);
     Node* call2 = b2.opts()
-                      .WithControlInputs({s1, e, call1})
+                      .WithControlInputs({s2, e, call1})
                       .FinalizeBuilder(&node_builder2);
-    Node* s2 = NoOp(
-        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}));
-    Binary(call2, ops::NodeOut(call2, 1),
-           b2.opts().WithName("J").WithControlInput(s2));
+    Binary(call2, ops::NodeOut(call2, 1), b2.opts().WithName("J"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two functions to transform, each with one outside_compilation
+// cluster, with the dependency between them purely from an outside_compilation
+// edge.
+TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = InputShaped(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Binary(c, d,
+                     b1.opts()
+                         .WithName("E")
+                         .WithControlInputs({b, d})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Node* g =
+        Binary(a, b, b1.opts().WithName("G").WithAttr("_encapsulate", "F2"));
+    Node* h = Unary(g, b1.opts()
+                           .WithName("H")
+                           .WithAttr("_encapsulate", "F2")
+                           .WithAttr("_outside", "O1")
+                           .WithControlInput(e));
+    Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2"));
+    Binary(f, i, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
+  }
+
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1",
+                            {DT_FLOAT}, shape.opts());
+    Node* h = Unary(recv, shape.opts()
+                              .WithName("H")
+                              .WithAttr("_encapsulate", "F2")
+                              .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"C:o:0", "D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}},
+           {"D"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
+      {
+          {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}},
+          {{"I"},
+           "UnaryTest",
+           {"outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"G:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F2_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F2_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
+      },
+      {{"i_0_retval", "I:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = InputShaped(b2.opts().WithName("B"));
+
+    Node* key_constant1 =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
+
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+
+    Node* key_constant2 =
+        KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* h = Unary(recv2, b2.opts()
+                               .WithName("H")
+                               .WithAttr("_encapsulate", "F2")
+                               .WithAttr("_outside", "O1")
+                               .WithControlInput(e));
+    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                               b2.opts());
+
+    Node* s2 = Sequencer(
+        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
+        "F2");
+    NodeBuilder node_builder2("F2", "F2", lib_def.get());
+    node_builder2.Input(a).Input(b);
+    Node* call2 = b2.opts()
+                      .WithControlInputs({s2, call1})
+                      .FinalizeBuilder(&node_builder2);
+    Binary(call1, call2, b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1218,14 +1531,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
            "BinaryTest",
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1236,16 +1551,22 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
     Node* send1 =
-        SendFromHost("host_compute_channel_F1_O1", {e},
-                     b2.opts().WithName("outside_compilation_F1_O1_send"));
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInput(send1), "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(b2.opts().WithName("F1_sequencer").WithControlInput(send1));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Unary(call1, b2.opts().WithName("G").WithControlInput(s1));
+    Unary(call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1294,14 +1615,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
            "BinaryTest",
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1313,20 +1636,26 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
     Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {}, b2.opts());
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithControlInput(recv1)
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
     Node* send1 =
-        SendFromHost("host_compute_channel_F1_O1", {e},
-                     b2.opts().WithName("outside_compilation_F1_O1_send"));
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Unary(call1, b2.opts().WithName("G").WithControlInput(s1));
+    Unary(call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1368,13 +1697,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1385,16 +1716,22 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(recv1, b2.opts().WithName("E"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInput(recv1), "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(b2.opts().WithName("F1_sequencer").WithControlInput(recv1));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G").WithControlInput(s1));
+    Binary(e, call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1441,13 +1778,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1458,21 +1797,390 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(recv1, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("G"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph, where the ancestor has no HostCompute Op.
+TEST(EncapsulateSubgraphsTest,
+     OutsideCompilationClusterDependencyNoSrcCluster) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(a, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Node* g = Unary(f, b1.opts()
+                           .WithName("G")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O2")
+                           .WithControlInput(e));
+    Node* h = Unary(g, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    Binary(e, h, b1.opts().WithName("I"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT}, shape2.opts());
+    Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts()
+                                                .WithName("G")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O2"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"}, "UnaryTest", {"D:o:0"}},
+          {{"H"},
+           "UnaryTest",
+           {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"outside_compilation_O2_host_compute"},
+           "XlaHostCompute",
+           {"F:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O2"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O2"}}},
+      },
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                            {DT_FLOAT}, b2.opts());
+    Node* g = Unary(recv, b2.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("I"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph, where the successor has no HostCompute Op.
+TEST(EncapsulateSubgraphsTest,
+     OutsideCompilationClusterDependencyNoDstCluster) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    /*Node* g =*/Unary(a, b1.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    Binary(e, h, b1.opts().WithName("I"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, shape1.opts());
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "UnaryTest",
+           {"outside_compilation_O1_host_compute:outputs:0"}},
+          {{"H"}, "UnaryTest", {"F:o:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
+      },
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv, b2.opts()
+                              .WithName("E")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    /*Node* g =*/Unary(a, b2.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("I"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Node* g = Unary(d, b1.opts()
+                           .WithName("G")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O2")
+                           .WithControlInput(e));
+    Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    /*Node* i =*/Binary(d, e,
+                        b1.opts()
+                            .WithName("I")
+                            .WithAttr("_encapsulate", "F1")
+                            .WithAttr("_outside", "O3")
+                            .WithControlInput(g));
+    Binary(e, h, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, shape1.opts());
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {{{"C"}, "UnaryTest", {"a_0_arg"}},
+       {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+       {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}},
+       {{"H"}, "UnaryTest", {"F:o:0"}},
+       {{"outside_compilation_O1_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"ancestors", gtl::ArraySlice<string>({})},
+         {"key", "host_compute_channel_F1_O1"},
+         {"shape_inference_graph",
+          "_outside_compilation_shape_inference_F1_O1"},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O1"}}},
+       {{"outside_compilation_O2_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({})},
+         {"ancestors",
+          gtl::ArraySlice<string>({"outside_compilation_O1_host_compute"})},
+         {"key", "host_compute_channel_F1_O2"},
+         {"shape_inference_graph", ""},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O2"}},
+        {"outside_compilation_O1_host_compute"}},
+       {{"outside_compilation_O3_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({})},
+         {"ancestors",
+          gtl::ArraySlice<string>({"outside_compilation_O1_host_compute",
+                                   "outside_compilation_O2_host_compute"})},
+         {"key", "host_compute_channel_F1_O3"},
+         {"shape_inference_graph", ""},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O3"}},
+        {"outside_compilation_O1_host_compute",
+         "outside_compilation_O2_host_compute"}}},
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT}, b2.opts());
+    Node* g = Unary(recv2, b2.opts()
+                               .WithName("G")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O2")
+                               .WithControlInput(e));
+    Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3",
+                             {DT_FLOAT}, b2.opts());
+    /*Node* i =*/Binary(recv3, e,
+                        b2.opts()
+                            .WithName("I")
+                            .WithAttr("_encapsulate", "F1")
+                            .WithAttr("_outside", "O3")
+                            .WithControlInput(g));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, send, recv2, recv3}),
+                         "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
 
-    Binary(e, call1, b2.opts().WithName("G").WithControlInput(s1));
+    Binary(e, call1, b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1523,7 +2231,10 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1569,19 +2280,21 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_1"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, shape.opts());
+    Node* e = BinaryUnknownShape(known, recv,
+                                 shape.opts()
+                                     .WithName("E")
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
   *library_expected.add_function() = test::function::XTimesTwo();
@@ -1595,13 +2308,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1614,26 +2330,29 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
     Node* b = Input(b2.opts().WithName("B"));
     Node* c = Unary(a, b2.opts().WithName("C"));
 
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, b2.opts());
+    Node* e = BinaryUnknownShape(c, ops::NodeOut(recv, 0),
+                                 b2.opts()
+                                     .WithName("E")
+                                     .WithControlInputs({recv, b})
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                              b2.opts().WithControlInput(e));
+
+    Node* s = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(b).Input(c);
     Node* call =
-        b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder);
-
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(
-        c, ops::NodeOut(recv, 0),
-        b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
-
-    Node* s = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
-
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+        b2.opts().WithControlInputs({s, c}).FinalizeBuilder(&node_builder);
+
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD
index 15507b3851751c681044a744c07c247410fb3e2d..676f71a75aede2a7720ae0c8a579d64cc184509a 100644
--- a/tensorflow/compiler/jit/graphcycles/BUILD
+++ b/tensorflow/compiler/jit/graphcycles/BUILD
@@ -27,17 +27,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 9bea5663319c8a25249fdc265cee0191556a7c04..00a6f4075f9a18efc3895b033eb6d08e36088a53 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -14,6 +14,7 @@ cc_library(
         "//tensorflow/compiler/jit:common",
         "//tensorflow/compiler/jit:xla_compilation_cache",
         "//tensorflow/compiler/jit:xla_device",
+        "//tensorflow/compiler/jit:xla_launch_util",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:statusor",
@@ -40,17 +41,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 6353149e4afdf739fe44dd5c76502ef5d98b8477..049d170fa48928474b894f2d0e1f2243c5f87275 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/compiler/jit/xla_launch_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -36,115 +37,8 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace gpu = perftools::gputools;
-
 namespace tensorflow {
 
-// Adapter class that wraps a Tensorflow allocator as an XLA allocator.
-// Assumes that the Tensorflow allocator permits asynchronous deallocation:
-// see comment on `AllowsAsynchronousDeallocation()`.
-class XlaAllocator : public xla::DeviceMemoryAllocator {
- public:
-  XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context);
-  ~XlaAllocator() override;
-  xla::StatusOr<gpu::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
-                                                bool retry_on_failure) override;
-  Status Deallocate(int device_ordinal, gpu::DeviceMemoryBase* mem) override;
-
-  // Register an Tensor (input or resource variable) with the allocator. If
-  // the operation returns an alias to one of its inputs, then the allocator
-  // needs to be able to handle it.
-  Status RegisterArgument(const Tensor* t);
-
-  // Makes 'tensor' a wrapper around the data buffer at 'ptr'. The buffer is
-  // interpreted as having data type 'dtype' and shape 'shape'.
-  Status MakeTensorFromBuffer(gpu::DeviceMemoryBase buffer, DataType dtype,
-                              const TensorShape& shape, Tensor* tensor) const;
-
-  // The Tensorflow BFC allocator used on GPU allows host-side deallocation
-  // before GPU execution takes place. Tensorflow uses the ordering of the main
-  // compute stream to enforce a happens-before relationship between a memory
-  // allocation and code that reuses the same memory. If Tensorflow adds
-  // support for multiple GPU streams or allocators with different ordering
-  // requirements, this code may need to change.
-  // (This attribute has no effect on CPU.)
-  bool AllowsAsynchronousDeallocation() const override { return true; }
-
- private:
-  OpKernelContext* const op_context_;
-
-  // Map from pointer address to the owning Tensor; used by
-  // MakeTensorFromBuffer. Also used to automatically release Tensors when the
-  // allocator is freed.
-  std::unordered_map<void*, Tensor> tensors_;
-};
-
-XlaAllocator::XlaAllocator(const gpu::Platform* platform,
-                           OpKernelContext* op_context)
-    : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
-
-XlaAllocator::~XlaAllocator() = default;
-
-xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
-    int device_ordinal, uint64 size, bool retry_on_failure) {
-  AllocatorAttributes allocator_attrs;
-  allocator_attrs.set_on_host(false);
-
-  AllocationAttributes allocation_attrs;
-  allocation_attrs.no_retry_on_failure = !retry_on_failure;
-
-  Tensor t;
-  Status status = op_context_->allocate_temp(
-      DT_UINT8, TensorShape({static_cast<int64>(size)}), &t, allocator_attrs,
-      allocation_attrs);
-  if (!status.ok()) {
-    VLOG(2) << "Allocation failed " << size;
-    return status;
-  }
-  void* data =
-      reinterpret_cast<void*>(const_cast<char*>(t.tensor_data().data()));
-  tensors_[data] = t;
-  return gpu::DeviceMemoryBase(data, size);
-}
-
-Status XlaAllocator::RegisterArgument(const Tensor* t) {
-  void* data =
-      reinterpret_cast<void*>(const_cast<char*>(t->tensor_data().data()));
-  tensors_[data] = *t;
-  return Status::OK();
-}
-
-Status XlaAllocator::Deallocate(int device_ordinal,
-                                gpu::DeviceMemoryBase* mem) {
-  if (mem->opaque() != nullptr) {
-    if (tensors_.erase(mem->opaque()) == 0) {
-      return tensorflow::errors::InvalidArgument("Unknown tensor address");
-    }
-  }
-  return Status::OK();
-}
-
-Status XlaAllocator::MakeTensorFromBuffer(gpu::DeviceMemoryBase buffer,
-                                          DataType dtype,
-                                          const TensorShape& shape,
-                                          Tensor* out_tensor) const {
-  void* ptr = const_cast<void*>(buffer.opaque());
-  auto it = tensors_.find(ptr);
-  if (it == tensors_.end()) {
-    return errors::InvalidArgument("Unknown tensor address");
-  }
-  const Tensor& tensor = it->second;
-
-  int64 output_size = DataTypeSize(dtype) * shape.num_elements();
-  if (tensor.TotalBytes() == output_size) {
-    out_tensor->UnsafeCopyFromInternal(tensor, dtype, shape);
-  } else {
-    Tensor slice = tensor.Slice(0, output_size);
-    out_tensor->UnsafeCopyFromInternal(slice, dtype, shape);
-  }
-  return Status::OK();
-}
-
 XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
     : OpKernel(ctx), device_type_(ctx->device_type()) {
   const NameAttrList* func;
@@ -155,9 +49,9 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
   num_constant_args_ = constant_types.size();
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_));
   if (device_type_ == DeviceType(DEVICE_CPU)) {
-    platform_id_ = gpu::host::kHostPlatformId;
+    platform_id_ = se::host::kHostPlatformId;
   } else if (device_type_ == DeviceType(DEVICE_GPU)) {
-    platform_id_ = gpu::cuda::kCudaPlatformId;
+    platform_id_ = se::cuda::kCudaPlatformId;
   } else {
     platform_id_ = nullptr;
   }
@@ -173,9 +67,9 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
     return Status::OK();
   }
 
-  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_);
+  auto platform = se::MultiPlatformManager::PlatformWithId(platform_id_);
   if (!platform.ok()) {
-    return StreamExecutorUtil::ConvertStatus(platform.status());
+    return platform.status();
   }
   xla::LocalClientOptions client_options;
   client_options.set_platform(platform.ValueOrDie());
@@ -196,23 +90,6 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
   return Status::OK();
 }
 
-std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
-                                                      int num_variables) {
-  std::vector<OptionalTensor> snapshot(num_variables);
-  int first_variable = ctx->num_inputs() - num_variables;
-  for (int i = 0; i < num_variables; ++i) {
-    Var* variable = nullptr;
-    ResourceHandle handle = HandleFromInput(ctx, first_variable + i);
-    if (LookupResource(ctx, handle, &variable).ok()) {
-      tf_shared_lock lock(*variable->mu());
-      snapshot[i].name = handle.name();
-      snapshot[i].present = true;
-      snapshot[i].value = *variable->tensor();
-    }
-  }
-  return snapshot;
-}
-
 void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XlaLocalLaunchOp::Compute "
           << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
@@ -221,7 +98,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   ResourceMgr* rm = ctx->resource_manager();
   OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
 
-  gpu::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   XlaCompilationCache* cache;
@@ -235,173 +112,85 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   // this is more obviously correct.)
   core::ScopedUnref cache_ref(cache);
 
+  const XlaDevice::Metadata* metadata;
+  Status s = XlaDevice::GetMetadata(ctx, &metadata);
+  bool allocate_xla_tensors = s.ok();
+
   // Get the platform_id_ for XLA_* devices.
   if (platform_id_ == nullptr) {
-    const XlaDevice::Metadata* metadata;
-    Status s = XlaDevice::GetMetadata(ctx, &metadata);
     if (s.ok()) {
       platform_id_ = metadata->platform()->id();
     }
   }
 
-  std::vector<OptionalTensor> variables =
+  std::map<int, OptionalTensor> variables =
       SnapshotResourceVariables(ctx, num_resource_args_);
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
+  XlaAllocator local_xla_allocator(client->backend().platform(),
+                                   ctx->device()->GetAllocator({}));
+  xla::DeviceMemoryAllocator* xla_allocator;
+  // If we are on an XlaDevice, use the underlying XLA platform's allocator
+  // directly. We could use the StreamExecutor's allocator which may
+  // theoretically be more correct, but XLA returns a nice OOM message in a
+  // Status and StreamExecutor does not.
+  //
+  // Importantly we can't use ctx->device()->GetAllocator() as the allocator
+  // (which local_xla_allocator above uses) as on an XlaDevice, this is a
+  // dummy allocator that returns XlaTensor objects. The XlaCompiler needs a
+  // real allocator to allocate real buffers.
+  if (allocate_xla_tensors) {
+    xla_allocator = client->backend().memory_allocator();
+  } else {
+    xla_allocator = &local_xla_allocator;
+  }
 
   XlaCompiler::Options options;
   options.client = client;
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
-  options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
-  options.device_allocator = &xla_allocator;
+  options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId);
+  options.device_allocator = xla_allocator;
+  // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
+  // is restricted to Variables, but we need something like this to apply to
+  // normal Tensors too.
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
 
-  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_,
+  std::map<int, Tensor> constant_args;
+  for (int i = 0; i < num_constant_args_; ++i) {
+    constant_args.insert({i, ctx->input(i)});
+  }
+  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, constant_args,
                                      variables, ctx, &kernel, &executable,
                                      /*compile_options=*/nullptr));
 
   VLOG(1) << "Executing XLA Computation...";
 
-  std::unique_ptr<xla::ShapedBuffer> output;
-  // Build xla::ShapedBuffers that point directly to the Tensor buffers.
-  std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
-  arg_buffers.reserve(kernel->xla_input_shapes.size() + 1);
-  arg_buffers.resize(kernel->xla_input_shapes.size());
-  std::vector<xla::ShapedBuffer*> arg_ptrs(arg_buffers.size());
-
-  const int first_variable_arg = ctx->num_inputs() - num_resource_args_;
-  // Pass remaining parameters.
-  const Tensor* t;
-  for (int i = 0; i < kernel->xla_input_shapes.size(); ++i) {
-    int arg_num = kernel->input_mapping[i];
-    const xla::Shape& shape = kernel->xla_input_shapes[i];
-    if (arg_num >= first_variable_arg) {
-      t = &(variables[arg_num - first_variable_arg].value);
-    } else {
-      t = &(ctx->input(arg_num));
-    }
-
-    gpu::DeviceMemoryBase dmem = gpu::DeviceMemoryBase(
-        const_cast<char*>(t->tensor_data().data()), t->tensor_data().size());
-
-    const xla::Shape on_device_shape =
-        client->backend().transfer_manager()->HostShapeToDeviceShape(shape);
-    CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
-        << "On-device shape "
-        << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
-        << " not the same as on-host shape "
-        << xla::ShapeUtil::HumanStringWithLayout(shape);
-    arg_buffers[i] = xla::MakeUnique<xla::ShapedBuffer>(
-        /*on_host_shape=*/shape, /*on_device_shape=*/shape, client->platform(),
-        client->default_device_ordinal());
-    arg_buffers[i]->set_buffer(dmem, /*index=*/{});
-    arg_ptrs[i] = arg_buffers[i].get();
-
-    OP_REQUIRES_OK(ctx, xla_allocator.RegisterArgument(t));
-  }
+  XlaComputationLaunchContext launch_context(
+      num_resource_args_, client, xla_allocator, allocate_xla_tensors);
+  launch_context.PopulateInputs(ctx, kernel, variables);
 
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
-  run_options.set_allocator(&xla_allocator);
+  run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
-  auto run_result = executable->Run(arg_ptrs, run_options);
+
+  auto run_result = executable->Run(launch_context.arguments(), run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
-  output = run_result.ConsumeValueOrDie()->release();
   auto elapsed = env->NowMicros() - start_time;
   VLOG(2) << "Elapsed time: " << elapsed << "us";
 
-  // Computation output should always be a tuple.
-  if (VLOG_IS_ON(2)) {
-    VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
-  }
-  CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
-
-  // Copy XLA results to the OpOutputList.
-  int output_num = 0;
-  for (int i = 0; i < ctx->num_outputs(); ++i) {
-    if (kernel->outputs[i].is_constant) {
-      // Output is a constant.
-      const Tensor& const_tensor = kernel->outputs[i].constant_value;
-      const size_t total_bytes = const_tensor.TotalBytes();
-      if (stream && total_bytes > 0) {
-        // Copy host -> device. (Empty tensors don't have backing buffers.)
-        VLOG(1) << "Constant output tensor on device";
-        Tensor* output_tensor;
-        TF_CHECK_OK(
-            ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
-
-        const void* src_ptr = DMAHelper::base(&const_tensor);
-        void* dst_ptr = DMAHelper::base(output_tensor);
-        gpu::DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-        stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
-      } else {
-        // No copy required.
-        ctx->set_output(i, const_tensor);
-      }
-    } else {
-      const TensorShape& shape = kernel->outputs[i].shape;
-      VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
-
-      gpu::DeviceMemoryBase buffer = output->buffer({output_num});
-      Tensor output_tensor;
-      // Looks up the owning Tensor by buffer address.
-      OP_REQUIRES_OK(ctx, xla_allocator.MakeTensorFromBuffer(
-                              buffer, ctx->expected_output_dtype(i), shape,
-                              &output_tensor));
-      ctx->set_output(i, output_tensor);
-      ++output_num;
-    }
-
-    if (VLOG_IS_ON(3)) {
-      VLOG(3) << ctx->mutable_output(i)->DebugString();
-    }
-  }
-
-  // Apply variable updates, if any.
-  VLOG(2) << "Applying variable updates";
-  for (int i = 0; i < kernel->resource_updates.size(); ++i) {
-    const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
-    OP_REQUIRES(ctx,
-                write.input_index >= 0 && write.input_index < ctx->num_inputs(),
-                errors::Internal("Invalid input index for variable write."));
-
-    gpu::DeviceMemoryBase buffer = output->buffer({output_num});
-
-    Var* variable = nullptr;
-    // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, not
-    // a Tensor.
-    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
-                            ctx, HandleFromInput(ctx, write.input_index),
-                            &variable, [this, ctx, &write](Var** ptr) {
-                              *ptr = new Var(write.type);
-                              return Status::OK();
-                            }));
-
-    core::ScopedUnref s(variable);
-
-    mutex_lock ml(*variable->mu());
-    OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type,
-                errors::Internal("Mismatched type in variable write"));
-
-    // Looks up the owning Tensor by buffer address.
-    OP_REQUIRES_OK(
-        ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write.shape,
-                                                variable->tensor()));
-    ++output_num;
-  }
-
+  launch_context.PopulateOutputs(ctx, kernel, run_result.ConsumeValueOrDie());
   VLOG(1) << "Done";
 }
 
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h
index 47fd912b12abbbe876e933ab57f6f586fd299909..8f8e646f0ff6d94dfdf56721cacfce7fa658beb6 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h
@@ -26,14 +26,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Takes a snapshot of the values of resource variable arguments, which are
-// the last `num_variables` arguments. We snapshot tensors that back
-// resource variables since concurrent updates may modify the shape, and it is
-// important that the shapes used for compilation match the true shapes of the
-// buffers.
-std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
-                                                      int num_variables);
-
 // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph
 // which will be compiled and executed using XLA.  The XlaLocalLaunchOp is
 // responsible for handling interactions with the TensorFlow executor.
@@ -61,7 +53,7 @@ class XlaLocalLaunchOp : public OpKernel {
   // Number of resource variable arguments.
   int num_resource_args_;
 
-  perftools::gputools::Platform::Id platform_id_;
+  se::Platform::Id platform_id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp);
 };
diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD
index 4491dd6ac8f2b84f341162eb469cc8194f817c9a..5d211f4d733d8d807426e62dd116092799184f35 100644
--- a/tensorflow/compiler/jit/legacy_flags/BUILD
+++ b/tensorflow/compiler/jit/legacy_flags/BUILD
@@ -52,16 +52,14 @@ cc_library(
         ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
+cc_library(
+    name = "xla_device_flags",
+    srcs = ["xla_device_flags.cc"],
+    hdrs = ["xla_device_flags.h"],
+    deps =
+        [
+            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
         ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
 )
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
index 4bc209b7ecf499d82e7567f7eff12b17cefa9863..7277a1d1f8ad5fa045645ead839ab9efa01e89c7 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
+++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
@@ -40,6 +40,8 @@ static void AllocateFlags() {
   flags->tf_xla_max_cluster_size = std::numeric_limits<int32>::max();
   flags->tf_xla_clustering_debug = false;
   flags->tf_xla_cpu_global_jit = false;
+  flags->tf_xla_clustering_fuel = std::numeric_limits<int64>::max();
+  flags->tf_xla_fusion_only = false;
   flag_list = new std::vector<Flag>(
       {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
             "Control compilation of operators into XLA computations on CPU and "
@@ -55,7 +57,13 @@ static void AllocateFlags() {
        Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug,
             "Dump graphs during XLA compilation."),
        Flag("tf_xla_cpu_global_jit", &flags->tf_xla_cpu_global_jit,
-            "Enables global JIT compilation for CPU via SessionOptions.")});
+            "Enables global JIT compilation for CPU via SessionOptions."),
+       Flag("tf_xla_clustering_fuel", &flags->tf_xla_clustering_fuel,
+            "Places an artificial limit on the number of ops marked as "
+            "eligible for clustering."),
+       Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only,
+            "enable fusion of element-wise operations only using XLA when "
+            "global_jit_level is ON*.")});
   xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
 }
 
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
index e1ccd7ddb8706ca445b6811ca1fec369af7cd5d5..2affda6ab4e0fbad32a246744fa5b38aeb629c1b 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
+++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
@@ -48,6 +48,13 @@ typedef struct {
   bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
   bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
                                   // via SessionOptions.
+  int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
+                                  // many ops will be marked as eligible for
+                                  // clustering.
+  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
+                            // is set to ON* and overrides its behavior. If
+                            // true, enable fusion of element-wise operations
+                            // only using XLA.
 } MarkForCompilationPassFlags;
 
 // Return a pointer to the MarkForCompilationPassFlags struct;
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1bb2fce2dbad5bffce2e33b665b7222090d0855a
--- /dev/null
+++ b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Legacy flags for the XLA bridge's xla_device module.
+
+#include <mutex>
+#include <vector>
+
+#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace legacy_flags {
+
+// Pointers to the parsed value of the flags and flag descriptors, initialized
+// via flags_init.
+static XlaDeviceFlags* flags;
+static std::vector<Flag>* flag_list;
+static std::once_flag flags_init;
+
+// Allocate *flags.  Called via call_once(&flags_init,...).
+static void AllocateFlags() {
+  flags = new XlaDeviceFlags;
+  flags->tf_xla_compile_on_demand = false;
+  flag_list = new std::vector<Flag>({
+      Flag("tf_xla_compile_on_demand", &flags->tf_xla_compile_on_demand,
+           "Switch a device into 'on-demand' mode, where instead of "
+           "autoclustering ops are compiled one by one just-in-time."),
+  });
+  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
+}
+
+// Return a pointer to the XlaDeviceFlags struct;
+// repeated calls return the same pointer.
+// This should be called only after Flags::Parse() has returned.
+XlaDeviceFlags* GetXlaDeviceFlags() {
+  std::call_once(flags_init, &AllocateFlags);
+  return flags;
+}
+
+}  // namespace legacy_flags
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..27b22121ac1e089bd5d5a494e1e3fb60b05bc76d
--- /dev/null
+++ b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
+#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
+
+// Legacy flags for the XLA bridge's xla_device module.
+
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace legacy_flags {
+
+// The values of flags associated with the XLA bridge's
+// xla_device module.
+typedef struct {
+  // Switch the CPU device into "on-demand" mode, where instead of
+  // autoclustering ops are compiled one by one just-in-time.
+  // Enabling this mode by a legacy flag is a temporary mechanism. When this
+  // feature is battle-tested, we will switch this to be a session option.
+  bool tf_xla_compile_on_demand;
+} XlaDeviceFlags;
+
+// Return a pointer to the XlaDeviceFlags struct;
+// repeated calls return the same pointer.
+// This should be called only after Flags::Parse() has returned.
+XlaDeviceFlags* GetXlaDeviceFlags();
+
+}  // namespace legacy_flags
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index a0211acbbe9eec77d30c7d14293650de8826f41c..8e2ee0f1d71bc17b4c12c792c38002af4f9eb5eb 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/version.h"
 
@@ -50,6 +51,15 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // is really a kind of function call and will be handled by
   // IsCompilableCall().
   if (node.type_string() == "SymbolicGradient") return false;
+  if (node.type_string() == "Const") {
+    // Skip Const op with type DT_STRING, since XLA doesn't support it, but the
+    // registered Const KernelDef says that it does, to support no-op Assert for
+    // tfcompile.
+    const AttrValue* attr = node.attrs().Find("dtype");
+    if (attr != nullptr && attr->type() == DT_STRING) {
+      return false;
+    }
+  }
   return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok();
 }
 
@@ -174,10 +184,164 @@ bool HasResourceInputOrOutput(const Node& node) {
 }
 
 struct NodeCompare {
-  bool operator()(const Node* a, const Node* b) { return a->id() < b->id(); }
+  bool operator()(const Node* a, const Node* b) const {
+    return a->id() < b->id();
+  }
 };
 using OrderedNodeSet = std::set<Node*, NodeCompare>;
 
+// Returns true if the op can be decomposed into XLA ops for which
+// there are fusable elemental implementations.
+//
+// TODO(hpucha): Consider a black list instead of a white list as
+// implemented below.
+bool IsXlaFusable(const NodeDef& node) {
+  static const std::unordered_set<std::string>* elementwise_ops =
+      new std::unordered_set<std::string>(
+          {// tf2xla/kernels/aggregate_ops.cc
+           "AddN",
+           // tf2xla/kernels/batchtospace_op.cc
+           "BatchToSpace", "BatchToSpaceND",
+           // tf2xla/kernels/bcast_ops.cc
+           "BroadcastArgs", "BroadcastGradientArgs",
+           // tf2xla/kernels/bias_ops.cc
+           "BiasAdd", "BiasAddV1", "BiasAddGrad" /*(Reduce)*/,
+           // tf2xla/kernels/binary_ops.cc
+           "Add", "Sub", "Mul", "Div", "Atan2", "Complex", "FloorDiv",
+           "FloorMod", "BitwiseAnd", "BitwiseOr", "LeftShift", "RightShift",
+           "LogicalAnd", "LogicalOr", "Mod", "Maximum", "Minimum", "RealDiv",
+           "ReciprocalGrad", "RsqrtGrad", "SqrtGrad", "SquaredDifference",
+           "TruncateDiv", "TruncateMod", "Equal", "NotEqual", "Greater",
+           "GreaterEqual", "Less", "LessEqual", "SigmoidGrad", "SoftplusGrad",
+           "SoftsignGrad", "TanhGrad", "Pow", "ApproximateEqual",
+           // tf2xla/kernels/cast_op.cc
+           "Cast",
+           // tf2xla/kernels/categorical_op.cc
+           "Multinomial" /* (Rng ops are disabled on GPU backend currently)*/,
+           // tf2xla/kernels/concat_op.cc
+           "Concat", "ConcatV2", "ConcatOffset",
+           // tf2xla/kernels/const_op.cc
+           "Const",
+           // tf2xla/kernels/cross_op.cc
+           "Cross",
+           // tf2xla/kernels/depthtospace_op.cc
+           "DepthToSpace",
+           // tf2xla/kernels/diag_op.cc
+           "Diag", "DiagPart", "MatrixDiag", "MatrixDiagPart",
+           // tf2xla/kernels/dynamic_stitch_op.cc
+           "DynamicStitch", "ParallelDynamicStitch",
+           // tf2xla/kernels/elu_op.cc
+           "Elu", "EluGrad", "Selu", "SeluGrad",
+           // tf2xla/kernels/fake_quantize_ops.cc
+           "FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxArgsGradient",
+           "FakeQuantWithMinMaxVars",
+           "FakeQuantWithMinMaxVarsGradient" /*(Reduce)*/,
+           // tf2xla/kernels/fill_op.cc
+           "Fill",
+           // tf2xla/kernels/gather_op.cc
+           "Gather", "GatherV2", "GatherNd",
+           // tf2xla/kernels/identity_op.cc
+           "Identity", "IdentityN", "PreventGradient", "StopGradient",
+           "Snapshot",
+           // tf2xla/kernels/image_ops.cc
+           "RGBToHSV", "HSVToRGB", "AdjustContrastv2" /*(Reduce)*/,
+           "AdjustSaturation", "AdjustHue",
+           // tf2xla/kernels/index_ops.cc
+           "ArgMax", "ArgMin",
+           // tf2xla/kernels/l2loss_op.cc
+           "L2Loss" /*(Reduce)*/,
+           // tf2xla/kernels/lrn_ops.cc (ReduceWindow)
+           "LRN", "LRNGrad",
+           // tf2xla/kernels/matrix_band_part_op.cc
+           "MatrixBandPart",
+           // tf2xla/kernels/matrix_set_diag_op.cc
+           "MatrixSetDiag",
+           // tf2xla/kernels/mirror_pad_op.cc
+           "MirrorPad",
+           // tf2xla/kernels/no_op.cc
+           "NoOp", "ControlTrigger",
+           // tf2xla/kernels/one_hot_op.cc
+           "OneHot",
+           // tf2xla/kernels/pack_op.cc
+           "Pack",
+           // tf2xla/kernels/pad_op.cc
+           "Pad", "PadV2",
+           // tf2xla/kernels/pooling_ops.cc
+           "MaxPool", "MaxPoolV2", "MaxPool3D", "AvgPool",
+           "AvgPool3D", /*(all the pooling ops use ReduceWindow)*/
+           "MaxPoolGrad", "MaxPoolGradV2", "MaxPool3DGrad", "AvgPoolGrad",
+           "AvgPool3DGrad",
+           // tf2xla/kernels/quantize_and_dequantize_op.cc (Reduce)
+           "QuantizeAndDequantizeV2",
+           // tf2xla/kernels/random_ops.cc (Rng ops are disabled on GPU backend
+           // currently)
+           "RandomUniform", "RandomUniformInt", "RandomStandardNormal",
+           "TruncatedNormal",
+           // tf2xla/kernels/reduction_ops.cc (Reduce)
+           "Sum", "Prod", "Min", "Max", "Mean", "All", "Any",
+           // tf2xla/kernels/relu_op.cc
+           "Relu", "Relu6", "ReluGrad", "Relu6Grad",
+           // tf2xla/kernels/reshape_op.cc
+           "Reshape",
+           // tf2xla/kernels/reverse_op.cc
+           "Reverse", "ReverseV2",
+           // tf2xla/kernels/reverse_sequence_op.cc
+           "ReverseSequence",
+           // tf2xla/kernels/scan_ops.cc (ReduceWindow)
+           "Cumsum", "Cumprod",
+           // tf2xla/kernels/scatter_nd_op.cc (Reduce)
+           "ScatterNd",
+           // tf2xla/kernels/segment_reduction_ops.cc (Reduce)
+           "UnsortedSegmentSum",
+           // tf2xla/kernels/select_op.cc
+           "Select",
+           // tf2xla/kernels/sequence_ops.cc
+           "Range", "LinSpace",
+           // tf2xla/kernels/shape_op.cc
+           "Shape", "ShapeN", "Rank", "Size", "ExpandDims", "Squeeze",
+           "ZerosLike", "OnesLike",
+           // tf2xla/kernels/slice_op.cc
+           "Slice",
+           // tf2xla/kernels/softmax_op.cc (Reduce)
+           "Softmax", "LogSoftmax", "SoftmaxCrossEntropyWithLogits",
+           "SparseSoftmaxCrossEntropyWithLogits",
+           // tf2xla/kernels/spacetobatch_op.cc
+           "SpaceToBatchND", "SpaceToBatch",
+           // tf2xla/kernels/spacetodepth_op.cc
+           "SpaceToDepth",
+           // tf2xla/kernels/split_op.cc
+           "Split", "SplitV",
+           // tf2xla/kernels/stack_ops.cc
+           "StackV2", "StackPushV2", "StackPopV2", "StackCloseV2",
+           // tf2xla/kernels/stateless_random_ops.cc (Rng ops are disabled on
+           // GPU
+           // backend currently)
+           "StatelessRandomUniform",
+           "StatelessRandomNormal"
+           // tf2xla/kernels/strided_slice_op.cc
+           "StridedSlice",
+           "StridedSliceGrad", "ResourceStridedSliceAssign",
+           // tf2xla/kernels/tile_ops.cc
+           "Tile",
+           // tf2xla/kernels/training_ops.cc
+           "ResourceApplyGradientDescent", "ResourceApplyMomentum",
+           "ResourceApplyAdagrad", "ResourceApplyAdam", "ResourceApplyRMSProp",
+           "ResourceApplyFtrl", "ResourceApplyFtrlV2",
+           // tf2xla/kernels/transpose_op.cc
+           "Transpose", "InvertPermutation",
+           // tf2xla/kernels/unary_ops.cc
+           "ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin",
+           "Asinh", "Atan", "Atanh", "Ceil", "Cos", "Cosh", "Sin", "Exp",
+           "Expm1", "Floor", "IsFinite", "IsInf", "IsNan", "Inv", "Reciprocal",
+           "Log", "Log1p", "Invert", "LogicalNot", "Neg", "Rint", "Round",
+           "Rsqrt", "Sigmoid", "Sign", "Sinh", "Softplus", "Softsign", "Sqrt",
+           "Square", "Tan", "Tanh", "Real", "Imag",
+           // tf2xla/kernels/unpack_op.cc
+           "Unpack"});
+
+  return elementwise_ops->count(node.op()) > 0;
+}
+
 Status FindCompilationCandidates(
     const Graph& graph, FunctionLibraryDefinition* flib_def, Env* env,
     const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn,
@@ -189,7 +353,27 @@ Status FindCompilationCandidates(
   FunctionLibraryRuntime* lib_runtime =
       pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
+  int64& fuel =
+      legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
+
+  // Iterate over nodes in sorted order so that compiler fuel is deterministic.
+  // We can't simply pass op_nodes().begin() and op_nodes().end to the
+  // std::vector constructor because they're not proper iterators, with
+  // iterator_traits defined and so on.
+  std::vector<Node*> sorted_nodes;
   for (Node* node : graph.op_nodes()) {
+    sorted_nodes.push_back(node);
+  }
+  std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeCompare());
+
+  for (Node* node : sorted_nodes) {
+    VLOG(2) << "Fuel: " << fuel;
+    if (fuel <= 0) {
+      VLOG(2)
+          << "Hit fuel limit; not marking any remaining ops as clusterable.";
+      break;
+    }
+
     VLOG(2) << "FindCompilationCandidates(): Processing "
             << node->DebugString();
 
@@ -234,7 +418,9 @@ Status FindCompilationCandidates(
       continue;
     }
     candidates->insert(node);
+    --fuel;
   }
+  VLOG(2) << "candidates->size() = " << candidates->size();
   return Status::OK();
 }
 
@@ -256,6 +442,9 @@ string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src,
   }
 
   auto node_name = [&cycles, &graph](int node_id) {
+    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
+      return string("(null)");
+    }
     auto* node = graph.FindNodeId(node_id);
     if (node == nullptr) {
       return string("(null)");
@@ -314,10 +503,13 @@ Status MarkForCompilationPass::Run(
         static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
   }
   bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+  bool fusion_only = flags->tf_xla_fusion_only;
+
   VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
+  VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
-  auto is_compilable = [global_jit_level, cpu_global_jit, fld](
+  auto is_compilable = [global_jit_level, cpu_global_jit, fusion_only, fld](
                            const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -340,6 +532,11 @@ Status MarkForCompilationPass::Run(
     status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
+    // Check for fusable ops only if requested.
+    if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) {
+      return false;
+    }
+
     // Otherwise use the value of global_jit_level.
     // Ignore enable_jit_by_default if global jit compilation for CPU
     // is explicitly requested via tf_xla_cpu_global_jit flag
@@ -544,11 +741,15 @@ Status MarkForCompilationPass::RunImpl(
     }
   }
 
-  // Count the number of elements in each cluster.
-  std::vector<int> cluster_sizes(graph->num_node_ids());
+  // Count the number of non-trivial elements in each cluster.
+  std::vector<int> effective_cluster_sizes(graph->num_node_ids());
   for (const Node* n : compilation_candidates) {
     int cluster = clusters[n->id()].Get().representative;
-    cluster_sizes[cluster]++;
+    // Identity nodes will be removed if the node gets marked for compilation.
+    // Therefore we don't want to count them towards the effective cluster size.
+    if (n->def().op() != "Identity") {
+      effective_cluster_sizes[cluster]++;
+    }
   }
 
   // Names for each cluster.
@@ -581,9 +782,12 @@ Status MarkForCompilationPass::RunImpl(
     const XlaOpRegistry::DeviceRegistration* registration;
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
-    // Or compile if this is a cluster of >= min_cluster_size compilable
-    // operators.
-    if (cluster_sizes[cluster] >= min_cluster_size || marked_for_compilation ||
+    // Compile if this is a cluster of >= min_cluster_size compilable operators.
+    // Also, always compile if the operator is placed on a device that requires
+    // compilation, or if it contains at least one op that is marked for
+    // compilation that is not an Identity op.
+    if (effective_cluster_sizes[cluster] >= min_cluster_size ||
+        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) ||
         registration->requires_compilation) {
       string& name = cluster_names[cluster];
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 1a8858cccef623185709ab5dc2187a313dd130f7..703d8825d74ced8d4d69c31ccd730adc89a8bffe 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 
 #include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -27,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -137,7 +140,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
-TEST(XlaCompilationTest, UnsupportedTypes) {
+TEST(XlaCompilationTest, Complex128Unsupported) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
   {
@@ -157,6 +160,27 @@ TEST(XlaCompilationTest, UnsupportedTypes) {
   EXPECT_TRUE(clusters.empty());
 }
 
+TEST(XlaCompilationTest, HalfSupported) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Tensor t(DT_HALF, TensorShape());
+    t.scalar<Eigen::half>()() = static_cast<Eigen::half>(0.0f);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_HALF)
+                                         .WithAttr("value", t));
+    Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
+    ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+  EXPECT_FALSE(clusters.empty());
+}
+
 TEST(XlaCompilationTest, ConcatWithConstArg) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
@@ -519,11 +543,11 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
 
   Status status = MarkForCompilation(&graph);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.ToString())
-                  .contains("Edge from c to a would create a cycle.\n"
-                            "+-> a\n"
-                            "|   b\n"
-                            "+-- c\n"));
+  EXPECT_TRUE(str_util::StrContains(status.ToString(),
+                                    "Edge from c to a would create a cycle.\n"
+                                    "+-> a\n"
+                                    "|   b\n"
+                                    "+-- c\n"));
 }
 
 TEST(XlaCompilationTest, Retval) {
@@ -553,5 +577,61 @@ TEST(XlaCompilationTest, Retval) {
   EXPECT_EQ(clusters["A"], clusters["B"]);
 }
 
+TEST(XlaCompilationTest, DontCountIdentityOps) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    auto c = ops::Identity(root.WithOpName("C"), b);
+    auto r = ops::_Retval(root.WithOpName("R"), c, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    b.node()->AddAttr(kXlaCompileAttr, true);
+    auto r = ops::_Retval(root.WithOpName("R"), b, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST(XlaCompilationTest, ConstOp) {
+  // valid data type
+  {
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    Scope root = Scope::NewRootScope().ExitOnError();
+    auto c = ops::Const(root.WithOpName("const"), 0.5f);
+    c.node()->AddAttr(kXlaCompileAttr, true);
+    TF_ASSERT_OK(root.ToGraph(graph.get()));
+    TF_ASSERT_OK(MarkForCompilation(&graph));
+    EXPECT_EQ(1, GetClusters(*graph).size());
+  }
+
+  // invalid data type
+  {
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    Scope root = Scope::NewRootScope().ExitOnError();
+    auto c = ops::Const(root.WithOpName("const"), string("string"));
+    c.node()->AddAttr(kXlaCompileAttr, true);
+    TF_ASSERT_OK(root.ToGraph(graph.get()));
+    TF_ASSERT_OK(MarkForCompilation(&graph));
+    EXPECT_TRUE(GetClusters(*graph).empty());
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index e5787ca4c8cff436e4404b8488970248b24a5eda..c9e46bc1475aed0e35a48765ad70eef4362e8281 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -17,17 +17,3 @@ cc_library(
     deps = ["//tensorflow/core:framework"],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c8c04152d2f3a0fd46711df24756b7e68b967ea
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue.h
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+
+#include <deque>
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// A thread-safe, first-in-first-out queue.
+template <typename T>
+class ProducerConsumerQueue {
+ public:
+  ProducerConsumerQueue()
+      : capacity_(std::numeric_limits<std::size_t>::max()) {}
+  ~ProducerConsumerQueue() = default;
+
+  // Wait until the queue is non-full, then append a copy of v.
+  void Put(const T &v);
+
+  // Wait until the queue is non-empty, then remove and return the head value.
+  T Get();
+
+  // If the queue is non-empty, remove the head value, placing it in *pv, and
+  // return true; otherwise return false.
+  bool TryGet(T *pv);
+
+  // Set the capacity of the queue; the queue is full whenever count() >=
+  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
+  void set_capacity(std::size_t size);
+
+  // Return the capacity of the queue.
+  std::size_t capacity() const;
+
+  // Return the number of elements in the queue.
+  std::size_t count() const;
+
+  // Implementation details follow.  Clients should ignore.
+ private:
+  mutable tensorflow::mutex mu_;  // protects all fields below
+  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
+  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
+  std::size_t capacity_ GUARDED_BY(mu_);
+  std::deque<T> queue_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
+};
+
+// ------------------------------------------------------
+// Implementation details follow.  Clients should ignore.
+
+// Wait until the queue is non-full, then append a copy of v.
+template <typename T>
+void ProducerConsumerQueue<T>::Put(const T &v) {
+  mutex_lock lock(mu_);
+  while (queue_.size() >= capacity_) {
+    non_full_.wait(lock);
+  }
+  queue_.push_back(v);
+  non_empty_.notify_one();
+}
+
+// Wait until the queue is non-empty, then remove and return the head value.
+template <typename T>
+T ProducerConsumerQueue<T>::Get() {
+  mutex_lock lock(mu_);
+  while (queue_.empty()) {
+    non_empty_.wait(lock);
+  }
+  non_full_.notify_one();
+  T result_value = queue_.front();
+  queue_.pop_front();
+  return result_value;
+}
+
+// If the queue is non-empty, remove the head value, placing it in *pv, and
+// return true; otherwise return false.
+template <typename T>
+bool ProducerConsumerQueue<T>::TryGet(T *pv) {
+  mutex_lock lock(mu_);
+  bool got_element = !queue_.empty();
+  if (got_element) {
+    non_full_.notify_one();
+    *pv = queue_.front();
+    queue_.pop_front();
+  }
+  return got_element;
+}
+
+// Set the capacity of the queue; the queue is full whenever count() >=
+// capacity().  The initial value is the maximum size_t.  Requires size > 0.
+template <typename T>
+void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
+  mutex_lock lock(mu_);
+  CHECK_NE(size, 0);
+  capacity_ = size;
+  non_full_.notify_all();
+}
+
+// Return the capacity of the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::capacity() const {
+  mutex_lock lock(mu_);
+  std::size_t max_elements = capacity_;
+  return max_elements;
+}
+
+// Return the number of elements in the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::count() const {
+  mutex_lock lock(mu_);
+  std::size_t num_elements = queue_.size();
+  return num_elements;
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f61260c6e52756ee039829afdc7452f5f760c221
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/producer_consumer_queue.h"
+
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+typedef ProducerConsumerQueue<int> IntQueue;
+
+// Insert integers between low inclusive and high exclusive into q.
+void PushRange(IntQueue *q, int low, int high) {
+  while (low != high) {
+    q->Put(low);
+    VLOG(2) << "Pushing " << low;
+    ++low;
+  }
+}
+
+// Push the numbers between 0 and 999 inclusive from several threads in the
+// pool.
+void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
+  VLOG(1) << "Adding 20-36";
+  pool->Schedule([queue] { PushRange(queue, 20, 36); });
+  VLOG(1) << "Adding 7-20";
+  pool->Schedule([queue] { PushRange(queue, 7, 20); });
+  VLOG(1) << "Adding 36-501";
+  pool->Schedule([queue] { PushRange(queue, 36, 501); });
+  VLOG(1) << "Adding 501-1000";
+  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
+  VLOG(1) << "Adding 0-5";
+  pool->Schedule([queue] { PushRange(queue, 0, 5); });
+  VLOG(1) << "Adding 5-7";
+  pool->Schedule([queue] { PushRange(queue, 5, 7); });
+}
+
+// Pop elements from queue using Get().  Make sure that exactly <high> elements
+// were present and their values are all integers between 0 and high-1
+// inclusive.
+void GetRange(IntQueue *queue, int high) {
+  VLOG(1) << "Testing Wait";
+  std::vector<int> results;
+  for (int i = 0; i != high; ++i) {
+    int r = queue->Get();
+    VLOG(2) << "Waited and got " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK(results[i] == i);
+  }
+}
+
+// Pop elements from queue using TryGet().  Make sure that exactly <high>
+// elements were present and their values are all integers between 0 and high-1
+// inclusive.
+void TryGetRange(IntQueue *queue, int high) {
+  std::vector<int> results;
+  // Give up if we don't get all the elements back from the queue
+  // in 10 seconds.
+  int timeout = 10;
+  int r;
+  for (int i = 0; i != high; ++i) {
+    while (!queue->TryGet(&r)) {
+      if (!timeout--) {
+        LOG(FATAL) << "Can't find all elements in the queue";
+      }
+      VLOG(1) << "Sleeping for a second...";
+      sleep(1);
+    }
+    VLOG(2) << "Popped " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  CHECK(!queue->TryGet(&r));
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK_EQ(i, results[i]);
+  }
+}
+
+const int kNumThreads = 15;
+
+TEST(ProducerConsumerQueue, GetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  GetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, TryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  TryGetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, ParallelGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { GetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+TEST(ProducerConsumerQueue, ParallelTryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/shape_inference_helpers.cc b/tensorflow/compiler/jit/shape_inference_helpers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9cfa16526bc5d809942a35e86075b4ec6e88a59
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_helpers.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains helpers for use in shape inference.
+
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
+
+#include <vector>
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+Status BackEdgeHelper::Remove(Graph* graph) {
+  if (graph_ != nullptr) {
+    return errors::Internal("BackEdgeHelper duplicate call to Remove.");
+  }
+  graph_ = graph;
+  for (Node* n : graph_->nodes()) {
+    if (n->IsMerge()) {
+      for (const Edge* e : n->in_edges()) {
+        if (e->src()->IsNextIteration()) {
+          back_edges_.push_back(
+              BackEdge{e, e->src(), e->src_output(), e->dst(), e->dst_input()});
+        }
+      }
+    }
+  }
+  for (const BackEdge& be : back_edges_) {
+    graph_->RemoveEdge(be.edge);
+  }
+  return Status::OK();
+}
+
+const std::vector<BackEdgeHelper::BackEdge>& BackEdgeHelper::RemovedEdges()
+    const {
+  return back_edges_;
+}
+
+Status BackEdgeHelper::Replace() {
+  if (graph_ == nullptr) {
+    return errors::Internal("BackEdgeHelper Replace called before Remove.");
+  }
+  if (replaced_) {
+    return errors::Internal("BackEdgeHelper Replace called more than once.");
+  }
+  replaced_ = true;
+  for (const BackEdge& be : back_edges_) {
+    graph_->AddEdge(be.src, be.src_output, be.dst, be.dst_input);
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/shape_inference_helpers.h b/tensorflow/compiler/jit/shape_inference_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f053c9a45dd47ca1b056634d2248d6181e77d68
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_helpers.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
+#define TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
+
+#include <vector>
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Helper class to temporarily remove, then replace, the back edges in a
+// graph. Simple algorithms for shape inference don't work with cycles, and this
+// class can be used to remove cycles before running inference and replace them
+// after. Correct usage requires exactly one call to Remove(), followed by any
+// number of calls to RemovedEdges() and at most one call to Replace(). The call
+// to Replace() is optional if the graph will be discarded without being
+// executed, e.g., if it is being used purely for a shape inference pass.
+class BackEdgeHelper {
+ public:
+  struct BackEdge {
+    const Edge* edge;
+    Node* src;
+    int src_output;
+    Node* dst;
+    int dst_input;
+  };
+
+  BackEdgeHelper() = default;
+  // Disallows copy and assign.
+  BackEdgeHelper(const BackEdgeHelper& other) = delete;
+  BackEdgeHelper& operator=(const BackEdgeHelper& other) = delete;
+
+  // Temporarily removes all the back edges in graph.
+  Status Remove(Graph* graph);
+
+  // Gets the list of removed edges.
+  const std::vector<BackEdge>& RemovedEdges() const;
+
+  // Replaces the back edges removed by a prior call to Remove.
+  Status Replace();
+
+ private:
+  Graph* graph_ = nullptr;  // not owned
+  std::vector<BackEdge> back_edges_;
+  // Set once Replace has been called.
+  bool replaced_ = false;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 6d854a920eb0b4c01b09024ceaef5035e847d392..6430975335f5eef5b53c80213e6090ffd6166a91 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -92,38 +92,30 @@ uint64 XlaCompilationCache::Signature::Hash::operator()(
 }
 
 Status XlaCompilationCache::BuildSignature(
-    const NameAttrList& function, int num_constant_args,
-    const std::vector<OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const NameAttrList& function, const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     Signature* signature) {
   signature->name = Canonicalize(function.name(), AttrSlice(&function.attr()));
-  signature->arg_values.resize(num_constant_args);
-
-  signature->arg_types.reserve(ctx->num_inputs() - num_constant_args);
-
-  // Inputs are in the order: constants, non-constants, resource variables.
-  int input_num = 0;
-  // Use the values of compile time constants in the signature->
-  while (input_num < num_constant_args) {
-    signature->arg_values[input_num] = ctx->input(input_num);
-    ++input_num;
-  }
-  // Add the types and shapes of the remaining arguments.
-  while (input_num < ctx->num_inputs() - variable_args.size()) {
-    signature->arg_types.emplace_back(ctx->input_dtype(input_num),
-                                      ctx->input(input_num).shape());
-    ++input_num;
-  }
-  // For variable signatures, use the type and shape of the variable's
-  // current value.
-  for (const OptionalTensor& variable : variable_args) {
-    TF_RET_CHECK(input_num < ctx->num_inputs());
-    if (variable.present) {
-      signature->arg_types.emplace_back(variable.value.dtype(),
-                                        variable.value.shape());
+  signature->arg_values.reserve(constant_args.size());
+
+  signature->arg_types.reserve(ctx->num_inputs() - constant_args.size());
+
+  for (int i = 0; i < ctx->num_inputs(); ++i) {
+    if (constant_args.count(i) > 0) {
+      // Use the values of compile time constants in the signature.
+      signature->arg_values.push_back(constant_args.at(i));
+    } else if (variable_args.count(i) > 0) {
+      const OptionalTensor& variable = variable_args.at(i);
+      if (variable.present) {
+        signature->arg_types.emplace_back(variable.value.dtype(),
+                                          variable.value.shape());
+      } else {
+        signature->arg_types.emplace_back(DT_INVALID, TensorShape());
+      }
     } else {
-      signature->arg_types.emplace_back(DT_INVALID, TensorShape());
+      signature->arg_types.emplace_back(ctx->input_dtype(i),
+                                        ctx->input(i).shape());
     }
-    ++input_num;
   }
   return Status::OK();
 }
@@ -131,74 +123,58 @@ Status XlaCompilationCache::BuildSignature(
 namespace {
 
 // Builds a XlaCompiler::Argument vector from the arguments to the _XlaLaunch
-// op. The first `num_constant_args` arguments must be host-memory Tensors.
-Status BuildArguments(int num_constant_args,
-                      const std::vector<OptionalTensor>& variable_args,
+// op.
+Status BuildArguments(const std::map<int, Tensor>& constant_args,
+                      const std::map<int, OptionalTensor>& variable_args,
                       OpKernelContext* ctx,
                       std::vector<XlaCompiler::Argument>* args) {
   args->resize(ctx->num_inputs());
 
-  int input_num = 0;
-
-  // Handles compile-time constants.
-  TF_RET_CHECK(num_constant_args <= ctx->num_inputs());
-  while (input_num < num_constant_args) {
-    const Tensor& input = ctx->input(input_num);
-    TF_RET_CHECK(input.dtype() != DT_RESOURCE);
-    XlaCompiler::Argument& arg = (*args)[input_num];
-    arg.kind = XlaCompiler::Argument::kConstant;
-    arg.type = input.dtype();
-    arg.shape = input.shape();
-    arg.constant_value = input;
-    ++input_num;
-  }
-
-  // Handles the non-constant arguments.
-  int num_variable_args = variable_args.size();
-  int num_nonconst_args =
-      ctx->num_inputs() - num_variable_args - num_constant_args;
-  TF_RET_CHECK(num_nonconst_args >= 0);
-  while (input_num < num_constant_args + num_nonconst_args) {
-    const Tensor& input = ctx->input(input_num);
-    TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+  for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) {
     XlaCompiler::Argument& arg = (*args)[input_num];
-    if (input.NumElements() > 0) {
-      arg.kind = XlaCompiler::Argument::kParameter;
-    } else {
+    if (constant_args.count(input_num) > 0) {
+      // Handles compile-time constants.
+      const Tensor& input = constant_args.at(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
       arg.kind = XlaCompiler::Argument::kConstant;
+      arg.type = input.dtype();
+      arg.shape = input.shape();
       arg.constant_value = input;
-    }
-    arg.type = input.dtype();
-    arg.shape = input.shape();
-    ++input_num;
-  }
-
-  // Handles resource variables.
-  TF_RET_CHECK(input_num + num_variable_args == ctx->num_inputs());
-  for (int variable_id = 0; variable_id < num_variable_args; ++variable_id) {
-    const Tensor& input = ctx->input(input_num);
-    TF_RET_CHECK(input.dtype() == DT_RESOURCE);
-
-    XlaCompiler::Argument& arg = (*args)[input_num];
-
-    arg.name = variable_args[variable_id].name;
-    arg.kind = XlaCompiler::Argument::kResource;
-    arg.resource_kind = XlaResource::kVariable;
-    if (variable_args[variable_id].present) {
-      const Tensor& value = variable_args[variable_id].value;
-      arg.type = value.dtype();
-      arg.shape = value.shape();
-      arg.initialized = true;
+    } else if (variable_args.count(input_num) == 0) {
+      // Handles the non-constant arguments.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+      if (input.NumElements() > 0) {
+        arg.kind = XlaCompiler::Argument::kParameter;
+      } else {
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.constant_value = input;
+      }
+      arg.type = input.dtype();
+      arg.shape = input.shape();
     } else {
-      // The values of uninitialized variables are not passed as inputs, since
-      // they are meaningless. However, it is legal to assign to a resource
-      // variable for the first time inside the XLA computation, so we do permit
-      // uninitialized variables.
-      arg.initialized = false;
-      arg.type = DT_INVALID;
-      arg.shape = TensorShape();
+      // Handles resource variables.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() == DT_RESOURCE);
+      const OptionalTensor& variable = variable_args.at(input_num);
+      arg.name = variable.name;
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = XlaResource::kVariable;
+      if (variable.present) {
+        const Tensor& value = variable.value;
+        arg.type = value.dtype();
+        arg.shape = value.shape();
+        arg.initialized = true;
+      } else {
+        // The values of uninitialized variables are not passed as inputs, since
+        // they are meaningless. However, it is legal to assign to a resource
+        // variable for the first time inside the XLA computation, so we do
+        // permit uninitialized variables.
+        arg.initialized = false;
+        arg.type = DT_INVALID;
+        arg.shape = TensorShape();
+      }
     }
-    ++input_num;
   }
 
   return Status::OK();
@@ -233,16 +209,43 @@ Status XlaCompilationCache::BuildExecutable(
 
 Status XlaCompilationCache::Compile(
     const XlaCompiler::Options& options, const NameAttrList& function,
-    int num_constant_args, const std::vector<OptionalTensor>& variable_args,
-    OpKernelContext* ctx,
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable,
     const XlaCompiler::CompileOptions* compile_options) {
+  return CompileImpl(options, function, constant_args, variable_args, ctx,
+                     compilation_result, executable, compile_options, false);
+}
+
+Status XlaCompilationCache::CompileSingleOp(
+    const XlaCompiler::Options& options,
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const XlaCompiler::CompilationResult** compilation_result,
+    xla::LocalExecutable** executable,
+    const XlaCompiler::CompileOptions* compile_options) {
+  const NodeDef& def = ctx->op_kernel().def();
+  NameAttrList name;
+  name.set_name(def.op());
+  *name.mutable_attr() = def.attr();
+  return CompileImpl(options, name, constant_args, variable_args, ctx,
+                     compilation_result, executable, compile_options, true);
+}
+
+Status XlaCompilationCache::CompileImpl(
+    const XlaCompiler::Options& options, const NameAttrList& function,
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const XlaCompiler::CompilationResult** compilation_result,
+    xla::LocalExecutable** executable,
+    const XlaCompiler::CompileOptions* compile_options,
+    bool compile_single_op) {
   VLOG(1) << "XlaCompilationCache::Compile " << DebugString();
 
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "num_inputs=" << ctx->num_inputs()
-            << " num_constant_args=" << num_constant_args
+            << " num_constant_args=" << constant_args.size()
             << " num_variable_args=" << variable_args.size();
     for (int i = 0; i < ctx->num_inputs(); i++) {
       TensorShape shape = ctx->input(i).shape();
@@ -250,10 +253,12 @@ Status XlaCompilationCache::Compile(
               << " present=" << ctx->has_input(i)
               << " shape=" << shape.DebugString();
     }
-    for (const OptionalTensor& variable : variable_args) {
+    for (auto& iterator : variable_args) {
+      const OptionalTensor& variable = iterator.second;
       VLOG(2) << "variable present=" << variable.present
               << " type=" << DataTypeString(variable.value.dtype())
-              << " shape=" << variable.value.shape().DebugString();
+              << " shape=" << variable.value.shape().DebugString()
+              << " TF arg= " << iterator.first;
     }
     VLOG(2) << "num_outputs = " << ctx->num_outputs();
     for (int i = 0; i < ctx->num_outputs(); i++) {
@@ -261,11 +266,12 @@ Status XlaCompilationCache::Compile(
     }
   }
 
-  TF_RET_CHECK(num_constant_args + variable_args.size() <= ctx->num_inputs());
+  TF_RET_CHECK(constant_args.size() + variable_args.size() <=
+               ctx->num_inputs());
 
   Signature signature;
-  TF_RETURN_IF_ERROR(BuildSignature(function, num_constant_args, variable_args,
-                                    ctx, &signature));
+  TF_RETURN_IF_ERROR(
+      BuildSignature(function, constant_args, variable_args, ctx, &signature));
 
   VLOG(2) << "Signature: " << SignatureDebugString(signature);
   // The outer lock protects the existence of the cache entry. It does not
@@ -292,13 +298,20 @@ Status XlaCompilationCache::Compile(
     // a long time.)
     std::vector<XlaCompiler::Argument> args;
     TF_RETURN_IF_ERROR(
-        BuildArguments(num_constant_args, variable_args, ctx, &args));
+        BuildArguments(constant_args, variable_args, ctx, &args));
 
     XlaCompiler compiler(options);
     entry->compiled = true;
-    entry->compilation_status = compiler.CompileFunction(
-        compile_options ? *compile_options : XlaCompiler::CompileOptions(),
-        function, args, &entry->compilation_result);
+
+    if (compile_single_op) {
+      entry->compilation_status = compiler.CompileSingleOp(
+          compile_options ? *compile_options : XlaCompiler::CompileOptions(),
+          signature.name, ctx, args, &entry->compilation_result);
+    } else {
+      entry->compilation_status = compiler.CompileFunction(
+          compile_options ? *compile_options : XlaCompiler::CompileOptions(),
+          function, args, &entry->compilation_result);
+    }
   }
   *compilation_result = &entry->compilation_result;
   if (entry->compilation_status.ok() && executable) {
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 0858020716fcf4763e42dc0699ad22cfda756942..be1043d8c3fc0573922837e541615114a6d7a1a5 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -52,29 +52,52 @@ class XlaCompilationCache : public ResourceBase {
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `num_constant_args` is the number of compile-time constant arguments to
-  // `function`. `variable_args` is a snapshot of the current values of the
+  // `constant_args` is a map of tensorflow argument number to its constant
+  //  value.
+  // `variable_args` is a snapshot of the current values of the
   // resource variable arguments to `function`; uninitialized variables are
   // represented by an absent OptionalTensor.
   // The result of compilation is written to `*compilation_result`, which must
   // be non-null. If `executable` is non-null, also builds an
-  // xla::LocalExecutable and sets `executable to point to it. The resulting
+  // xla::LocalExecutable and sets `executable` to point to it. The resulting
   // executable pointer may be null if the computation has no non-constant
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
-                 const NameAttrList& function, int num_constant_args,
-                 const std::vector<OptionalTensor>& variable_args,
+                 const NameAttrList& function,
+                 const std::map<int, Tensor>& constant_args,
+                 const std::map<int, OptionalTensor>& variable_args,
                  OpKernelContext* ctx,
                  const XlaCompiler::CompilationResult** compilation_result,
                  xla::LocalExecutable** executable,
                  const XlaCompiler::CompileOptions* compile_options);
 
+  // As above, but calls XlaCompiler::CompileSingleOp instead of
+  // XlaCompiler::CompileFunction.
+  Status CompileSingleOp(
+      const XlaCompiler::Options& options,
+      const std::map<int, Tensor>& constant_args,
+      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      const XlaCompiler::CompilationResult** compilation_result,
+      xla::LocalExecutable** executable,
+      const XlaCompiler::CompileOptions* compile_options);
+
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
 
   string DebugString() override;
 
  private:
+  // Common implementation of Compile and CompileSingleOp.
+  Status CompileImpl(const XlaCompiler::Options& options,
+                     const NameAttrList& function,
+                     const std::map<int, Tensor>& constant_args,
+                     const std::map<int, OptionalTensor>& variable_args,
+                     OpKernelContext* ctx,
+                     const XlaCompiler::CompilationResult** compilation_result,
+                     xla::LocalExecutable** executable,
+                     const XlaCompiler::CompileOptions* compile_options,
+                     bool compile_single_op);
+
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
   // XLA computation already, and generates an XLA LocalExecutable `executable`.
   Status BuildExecutable(const XlaCompiler::Options& options,
@@ -104,8 +127,9 @@ class XlaCompilationCache : public ResourceBase {
   static string SignatureDebugString(const Signature& sig);
 
   // Builds the signature for a compilation.
-  Status BuildSignature(const NameAttrList& function, int num_constant_args,
-                        const std::vector<OptionalTensor>& variable_args,
+  Status BuildSignature(const NameAttrList& function,
+                        const std::map<int, Tensor>& constant_args,
+                        const std::map<int, OptionalTensor>& variable_args,
                         OpKernelContext* ctx, Signature* signature);
 
   // The value associated with a cache entry.
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60458f6f3314b2c3b65be1c90e051b2a670383bc
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the XlaCompileOnDemandOp.
+
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
+#include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/compiler/jit/xla_launch_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+namespace {
+std::map<int, OptionalTensor> GetVariables(OpKernelContext* ctx) {
+  std::map<int, OptionalTensor> variables;
+  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+    if (ctx->input(i).dtype() == DT_RESOURCE) {
+      Var* variable = nullptr;
+      ResourceHandle handle = HandleFromInput(ctx, i);
+      OptionalTensor& optional = variables[i];
+      optional.name = handle.name();
+      if (LookupResource(ctx, handle, &variable).ok()) {
+        tf_shared_lock lock(*variable->mu());
+        optional.present = true;
+        optional.value = *variable->tensor();
+      }
+    }
+  }
+  return variables;
+}
+}  // namespace
+
+Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
+                                 const XlaDevice::Metadata& metadata,
+                                 const XlaCompiler::CompilationResult* result,
+                                 xla::LocalExecutable* executable) {
+  std::map<int, OptionalTensor> variables = GetVariables(ctx);
+  int64 num_resource_args = variables.size();
+
+  xla::LocalClient* client = metadata.client();
+
+  // Builds an XLA allocator for the device.
+  XlaComputationLaunchContext launch_context(
+      num_resource_args, client, client->backend().memory_allocator(), true);
+
+  launch_context.PopulateInputs(ctx, result, variables);
+
+  se::Stream* stream =
+      ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
+  TF_RET_CHECK(stream);
+
+  VLOG(2) << "Executing computation.";
+  xla::ExecutableRunOptions run_options;
+  run_options.set_stream(stream);
+  run_options.set_allocator(client->backend().memory_allocator());
+  run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
+
+  auto run_result = executable->Run(launch_context.arguments(), run_options);
+  TF_RETURN_IF_ERROR(run_result.status());
+
+  launch_context.PopulateOutputs(ctx, result, run_result.ConsumeValueOrDie());
+  return Status::OK();
+}
+
+bool XlaCompileOnDemandOp::MustArgumentBeConstant(const OpKernel* op_kernel,
+                                                  int64 argument_idx) {
+  // TODO(jmolloy): This could be expensive, so memoize.
+  auto* constant_inputs = tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
+      op_kernel->def().op());
+  CHECK(constant_inputs);
+  std::set<int64> constant_input_indices;
+  for (const auto& name : *constant_inputs) {
+    int start, stop;
+    TF_CHECK_OK(op_kernel->InputRange(name, &start, &stop));
+    for (int i = start; i < stop; ++i) {
+      constant_input_indices.insert(i);
+    }
+  }
+  return constant_input_indices.count(argument_idx) > 0;
+}
+
+bool XlaCompileOnDemandOp::ShouldArgumentBeConstant(const OpKernel* op_kernel,
+                                                    int64 argument_idx) {
+  // Right now we only create kConstant arguments when absolutely required, but
+  // there may be benefit in eagerly constant-folding a larger subset of
+  // arguments in the future.
+  return MustArgumentBeConstant(op_kernel, argument_idx);
+}
+
+Status XlaCompileOnDemandOp::Compile(
+    OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
+    const XlaCompiler::CompilationResult** result,
+    xla::LocalExecutable** executable) {
+  std::map<int, Tensor> constant_arguments;
+  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+    const Tensor& device_tensor = ctx->input(i);
+    if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) {
+      if (xla_tensor->has_host_tensor() &&
+          ShouldArgumentBeConstant(&ctx->op_kernel(), i)) {
+        constant_arguments[i] = xla_tensor->host_tensor();
+      }
+    }
+    if (constant_arguments.count(i) == 0 &&
+        MustArgumentBeConstant(&ctx->op_kernel(), i)) {
+      // Slow path; the argument is not available as a host constant so we must
+      // fetch it synchronously.
+      Tensor host_tensor;
+      AllocatorAttributes attrs;
+      attrs.set_on_host(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_temp(
+          device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
+      Notification n;
+      ctx->op_device_context()->CopyDeviceTensorToCPU(
+          &device_tensor, "ConstantArgument",
+          reinterpret_cast<Device*>(ctx->device()), &host_tensor,
+          [&](Status status) { n.Notify(); });
+      n.WaitForNotification();
+      constant_arguments[i] = host_tensor;
+    }
+  }
+
+  // We store information about the JIT-compiled XLA computation
+  // in the ResourceMgr.
+  ResourceMgr* rm = ctx->resource_manager();
+  CHECK(rm);
+
+  XlaCompilationCache* cache;
+  TF_RETURN_IF_ERROR(rm->LookupOrCreate<XlaCompilationCache>(
+      rm->default_container(), "xla_cache", &cache,
+      [&](XlaCompilationCache** cache) {
+        *cache = new XlaCompilationCache(metadata.client(),
+                                         metadata.jit_device_type());
+        return Status::OK();
+      }));
+  // Hold the reference to the JIT during evaluation. (We could probably
+  // free it sooner because the ResourceMgr will retain a reference, but
+  // this is more obviously correct.)
+  core::ScopedUnref cache_ref(cache);
+
+  XlaCompiler::Options options;
+  DeviceType device_type = metadata.jit_device_type();
+  options.device_type = &device_type;
+  options.client = metadata.client();
+  options.flib_def =
+      new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{});
+
+  std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
+  return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
+                                result, executable,
+                                /*compile_options=*/nullptr);
+}
+
+void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
+  const XlaCompiler::CompilationResult* result;
+  xla::LocalExecutable* executable;
+  const XlaDevice::Metadata* metadata;
+  OP_REQUIRES_OK(ctx, XlaDevice::GetMetadata(ctx, &metadata));
+  OP_REQUIRES_OK(ctx, Compile(ctx, *metadata, &result, &executable));
+  OP_REQUIRES_OK(ctx, Run(ctx, *metadata, result, executable));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..23c6f3903f841a6c39104983c6f7f409757a7319
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The XlaCompileOnDemandOp is an OpKernel that, when its Compute method is
+// called, will generate an xla::Computation and run it asynchronously.
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_
+
+#include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// An OpKernel that compiles an op to an XLA computation and runs it. Unlike
+// _XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a
+// vanilla TensorFlow op as long as the bridge supports it.
+//
+// Importantly _XlaLaunch assumes all input and output tensors are on the host,
+// whereas XlacompileOnDemandOp works with tensors in device memory.
+class XlaCompileOnDemandOp : public OpKernel {
+ public:
+  explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  XlaCompiler::Argument CreateCompilerArgument(OpKernelContext* ctx, int64 i);
+  bool ShouldArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx);
+  bool MustArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx);
+  Status Compile(OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
+                 const XlaCompiler::CompilationResult** result,
+                 xla::LocalExecutable** executable);
+  Status Run(OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
+             const XlaCompiler::CompilationResult* result,
+             xla::LocalExecutable* executable);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index e238252751e677eb947f6df03e3b2f2e948ffe19..bc07dbd7bdf005fde781f7a1e6775080e363abfb 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -17,6 +17,8 @@ limitations under the License.
 // operators using XLA via the XLA "Host" (CPU) backend.
 
 #include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
+#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -34,14 +36,24 @@ class XlaCpuDeviceFactory : public DeviceFactory {
 Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
+  legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags();
+  bool compile_on_demand = flags->tf_xla_compile_on_demand;
+
+  XlaOpRegistry::DeviceRegistration registration;
+  registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
+  registration.requires_compilation = !compile_on_demand;
+  registration.enable_jit_by_default = false;
+  registration.compile_resource_ops = true;
+
   static XlaDeviceOpRegistrations* registrations =
       RegisterXlaDeviceKernels(DEVICE_XLA_CPU, DEVICE_CPU_XLA_JIT);
   (void)registrations;
 
   std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create(
-      "Host", DEVICE_XLA_CPU, 0, DEVICE_CPU_XLA_JIT, options, name_prefix,
-      /*register_device_for_compilation=*/true, &device));
+  TF_RETURN_IF_ERROR(XlaDevice::Create("Host", DEVICE_XLA_CPU, 0,
+                                       DEVICE_CPU_XLA_JIT, options, name_prefix,
+                                       registration,
+                                       /*transfer_as_literal=*/false, &device));
   devices->push_back(device.release());
   return Status::OK();
 }
@@ -50,8 +62,8 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaCpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 7> kAllXlaCpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_CPU, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index d4d8fe1c1d575b4e35d624621cc709e3a16569d5..70263b1ff936757101a3c47d192b2ba58271dc79 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -18,7 +18,9 @@ limitations under the License.
 #include <stdlib.h>
 #include <unordered_set>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
@@ -49,8 +51,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace se = ::perftools::gputools;
-
 namespace tensorflow {
 
 // Caches a XlaDeviceAllocator per <backend, device ordinal> pair. A
@@ -99,7 +99,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      xla::MakeUnique<XlaDeviceAllocator>(backend, device_ordinal);
+      xla::MakeUnique<XlaDeviceAllocator>();
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -108,25 +108,19 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
 /* static */ Status XlaDevice::Create(
     const string& platform_name, const string& device_name, int device_ordinal,
     const string& jit_device_name, const SessionOptions& options,
-    const string& name_prefix, bool register_device_for_compilation,
-    std::unique_ptr<XlaDevice>* device) {
+    const string& name_prefix,
+    const XlaOpRegistry::DeviceRegistration& registration,
+    bool transfer_as_literal, std::unique_ptr<XlaDevice>* device) {
   VLOG(1) << "XlaDevice::Create " << platform_name << " " << device_name << ":"
           << device_ordinal;
 
-  if (register_device_for_compilation) {
-    // These are no-ops if they have already been done previously for
-    // this device_name/compilation_device_name pair.
-    XlaOpRegistry::DeviceRegistration registration;
-    registration.compilation_device_name = jit_device_name;
-    registration.requires_compilation = true;
-    registration.enable_jit_by_default = false;
-    registration.compile_resource_ops = true;
-    XlaOpRegistry::RegisterCompilationDevice(device_name, registration);
-  }
+  // These are no-ops if they have already been done previously for
+  // this device_name/compilation_device_name pair.
+  XlaOpRegistry::RegisterCompilationDevice(device_name, registration);
 
   auto platform = se::MultiPlatformManager::PlatformWithName(platform_name);
   if (!platform.ok()) {
-    return StreamExecutorUtil::ConvertStatus(platform.status());
+    return platform.status();
   }
 
   const DeviceAttributes attrs = Device::BuildDeviceAttributes(
@@ -137,7 +131,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
 
   device->reset(new XlaDevice(options, attrs, device_ordinal,
                               DeviceType(jit_device_name),
-                              platform.ValueOrDie()));
+                              platform.ValueOrDie(), transfer_as_literal));
   return Status::OK();
 }
 
@@ -162,6 +156,7 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
 
 /* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx,
                                            const Metadata** metadata) {
+  *metadata = nullptr;
   XlaDevice* xla_device =
       dynamic_cast<XlaDevice*>(ctx->device()->UnderlyingDevice());
   if (xla_device == nullptr) {
@@ -177,15 +172,23 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
 
 XlaDevice::XlaDevice(const SessionOptions& options,
                      const DeviceAttributes& attrs, int device_ordinal,
-                     const DeviceType& jit_device_name, se::Platform* platform)
+                     const DeviceType& jit_device_name, se::Platform* platform,
+                     bool transfer_as_literal)
     : LocalDevice(options, attrs),
       xla_metadata_(device_ordinal, platform, jit_device_name),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
       xla_allocator_(nullptr),
-      platform_(platform) {}
+      platform_(platform),
+      transfer_as_literal_(transfer_as_literal) {
+  VLOG(1) << "Created XLA device " << jit_device_name;
+}
 
-XlaDevice::~XlaDevice() {}
+XlaDevice::~XlaDevice() {
+  if (gpu_device_info_ != nullptr) {
+    gpu_device_info_->default_context->Unref();
+  }
+}
 
 xla::LocalClient* XlaDevice::client() const {
   // We lazily create the client because the platform commits to the
@@ -193,9 +196,8 @@ xla::LocalClient* XlaDevice::client() const {
   // don't want to do it until we get a chance to hook the platform up
   // to a simulator.
 
-  // For now GetOrCreateLocalClient always returns success when passed
-  // a non-null platform. If that changes we may have to plumb in some
-  // way to pass Status back.
+  // TODO(b/78468222): This can fail, at least when the backend is GPU and
+  // there is no GPU on the host.
   return xla::ClientLibrary::GetOrCreateLocalClient(platform_).ValueOrDie();
 }
 
@@ -220,12 +222,32 @@ xla::StatusOr<se::Stream*> XlaDevice::GetStream() {
   return stream_.get();
 }
 
+Status XlaDevice::CreateAndSetGpuDeviceInfo() {
+  if (gpu_device_info_ == nullptr) {
+    TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
+    // Call GetAllocator for the side-effect of ensuring the allocator
+    // is created.
+    GetAllocator({});
+    // XlaDevice owns both gpu_device_info_ and
+    // gpu_device_info_->default_context.
+    gpu_device_info_ = absl::make_unique<GpuDeviceInfo>();
+    gpu_device_info_->stream = stream;
+    gpu_device_info_->default_context =
+        new XlaDeviceContext(stream, client(), transfer_as_literal_);
+    set_tensorflow_gpu_device_info(gpu_device_info_.get());
+  }
+
+  return Status::OK();
+}
+
 Status XlaDevice::FillContextMap(const Graph* graph,
                                  DeviceContextMap* device_context_map) {
   VLOG(1) << "XlaDevice::FillContextMap";
   device_context_map->resize(graph->num_node_ids());
   TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-  auto ctx = new XlaDeviceContext(stream);
+  // Call GetAllocator for the side-effect of ensuring the allocator is created.
+  GetAllocator({});
+  auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
     ctx->Ref();
@@ -238,11 +260,10 @@ Status XlaDevice::FillContextMap(const Graph* graph,
 void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   VLOG(1) << "XlaDevice::Compute " << op_kernel->name() << ":"
           << op_kernel->type_string();
-  // When TraceMe profiling is off (which is the default), the
-  // following TraceMe constructor is simply a conditional test of
-  // false value. Measurements show that its overhead is negligible.
-  port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string(),
-                                  op_kernel->IsExpensive());
+  // When Xprof profiling is off (which is the default), constructing the
+  // activity is simple enough that its overhead is negligible.
+  tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
+                                   op_kernel->IsExpensive());
   op_kernel->Compute(context);
 }
 
@@ -250,8 +271,8 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                              AsyncOpKernel::DoneCallback done) {
   VLOG(1) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":"
           << op_kernel->type_string();
-  port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string(),
-                                  op_kernel->IsExpensive());
+  tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
+                                   op_kernel->IsExpensive());
   op_kernel->ComputeAsync(context, done);
 }
 
@@ -273,7 +294,7 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
     Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
     Notification n;
     TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-    XlaTransferManager manager(stream);
+    XlaTransferManager manager(stream, client(), transfer_as_literal_);
     manager.CopyCPUTensorToDevice(&parsed, this, &copy,
                                   [&n, &status](const Status& s) {
                                     status = s;
@@ -288,19 +309,23 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
 
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
                                                    const char* jit_device) {
+  // Any op assigned to the device that isn't rewritten by the graph rewriter
+  // gets executed by a n XlaCompileOnDemandOp, which compiles it and executes
+  // it just-in-time.
+  kernel_factory::OpKernelRegistrar::Factory factory =
+      [](OpKernelConstruction* context) -> OpKernel* {
+    return new XlaCompileOnDemandOp(context);
+  };
   XlaOpRegistry::RegisterCompilationKernels();
   XlaDeviceOpRegistrations* registrations = new XlaDeviceOpRegistrations;
-  auto dummy_factory = [](OpKernelConstruction* context) -> OpKernel* {
-    return new XlaDeviceDummyOp(context);
-  };
   for (const KernelDef* jit_def : XlaOpRegistry::DeviceKernels(
            jit_device,
            /*include_compilation_only_kernels=*/false)) {
     KernelDef* def = new KernelDef(*jit_def);
     def->set_device_type(device);
     registrations->op_kernel_registrars.emplace_back(
-        new kernel_factory::OpKernelRegistrar(def, "XlaDeviceDummyOp",
-                                              dummy_factory));
+        new kernel_factory::OpKernelRegistrar(def, "XlaCompileOnDemandOp",
+                                              factory));
   }
   return registrations;
 }
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index d2ec38293c429f04f088bf3726ba97eb4e4b0dba..3ae87308cc7cffa916e178893df70a3f314b11b0 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -26,6 +26,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -47,20 +49,20 @@ class XlaDevice : public LocalDevice {
   // retrieved e.g., when lazily creating the XlaCompilationCache device.
   class Metadata {
    public:
-    Metadata(int device_ordinal, perftools::gputools::Platform* platform,
+    Metadata(int device_ordinal, se::Platform* platform,
              const DeviceType& device_type);
 
     // The index of the device on this host.
     int device_ordinal() const;
 
-    perftools::gputools::Platform* platform() const;
+    se::Platform* platform() const;
     xla::LocalClient* client() const;
     const DeviceType& jit_device_type() const;
 
    private:
     const int device_ordinal_;
     const DeviceType device_type_;
-    perftools::gputools::Platform* platform_;  // Not owned.
+    se::Platform* platform_;  // Not owned.
 
     TF_DISALLOW_COPY_AND_ASSIGN(Metadata);
   };
@@ -71,15 +73,19 @@ class XlaDevice : public LocalDevice {
   // Factory function. 'platform_name' is the name of the XLA platform.
   // 'device_name' is the name of the Tensorflow device to create.
   // 'jit_device_name' is the name of the corresponding JIT device.
+  // 'transfer_as_literal' is true if device<->host transfers must be done using
+  // XLA's TransferLiteral{To,From}Device interface. If false, we can use
+  // ThenMemcpy instead.
   static Status Create(const string& platform_name, const string& device_name,
                        int device_ordinal, const string& jit_device_name,
                        const SessionOptions& options, const string& name_prefix,
-                       bool register_device_for_compilation,
+                       const XlaOpRegistry::DeviceRegistration& registration,
+                       bool transfer_as_literal,
                        std::unique_ptr<XlaDevice>* device);
 
   XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs,
             int device_ordinal, const DeviceType& jit_device_name,
-            ::perftools::gputools::Platform* platform);
+            se::Platform* platform, bool transfer_as_literal);
   ~XlaDevice() override;
 
   Allocator* GetAllocator(AllocatorAttributes attr) override;
@@ -96,7 +102,11 @@ class XlaDevice : public LocalDevice {
                              Tensor* tensor) override;
 
   xla::LocalClient* client() const;
-  xla::StatusOr<::perftools::gputools::Stream*> GetStream();
+  xla::StatusOr<se::Stream*> GetStream();
+
+  // If not already set, create and set GpuDeviceInfo.
+  // Not thread-safe
+  Status CreateAndSetGpuDeviceInfo();
 
  private:
   // The metadata of this XlaDevice.
@@ -104,18 +114,25 @@ class XlaDevice : public LocalDevice {
   // Which hardware device in the client's platform this XlaDevice controls.
   const int device_ordinal_;
   // The name of the device that is used to compile Ops for this XlaDevice.
-  const DeviceType& jit_device_name_;
+  DeviceType jit_device_name_;
   // Memory allocator associated with this device.
   Allocator* xla_allocator_;                   // Not owned.
-  ::perftools::gputools::Platform* platform_;  // Not owned.
+  se::Platform* platform_;                     // Not owned.
   // Stream associated with this device. Operations enqueued on this
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
   // computations enqueued by XLA.
   xla::Backend::StreamPtr stream_;
+  // Must we use XLA's transfer manager for correct host<->device transfers? if
+  // false, we can use ThenMemcpy() instead.
+  bool transfer_as_literal_;
+
+  // If set, holds default device context (that we must Unref)
+  // and its stream.
+  std::unique_ptr<GpuDeviceInfo> gpu_device_info_;
 };
 
-// Builds dummy OpKernel registrations on 'device' for the JIT operators
+// Builds OpKernel registrations on 'device' for the JIT operators
 // registered on 'jit_device'. Returns ownership of a XlaDeviceOpRegistrations
 // object that encapsulates the kernel registrations.
 struct XlaDeviceOpRegistrations {
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index c936222f32056e92efced82d5adb3a96c8041a17..bf8c1886a022310eeaacdf69463f575a393dd8d0 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -15,44 +15,69 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/xla_device_context.h"
 
+#include "tensorflow/compiler/jit/xla_launch_util.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
 
-namespace se = ::perftools::gputools;
-
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator(const xla::Backend* backend,
-                                       int device_ordinal)
-    : backend_(backend), device_ordinal_(device_ordinal) {}
-
+XlaDeviceAllocator::XlaDeviceAllocator() {}
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
 
 void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  se::DeviceMemoryBase dmem =
-      backend_->memory_allocator()
-          ->Allocate(device_ordinal_, num_bytes, /*retry_on_failure=*/false)
-          .ValueOrDie();
-  VLOG(2) << "Allocated XLA device tensor " << dmem.opaque() << "(" << num_bytes
-          << ")";
-  return dmem.opaque();
+  // We always return an empty XlaTensor object, encoded as an opaque tagged
+  // pointer. We can return an empty object and ignore num_bytes here because we
+  // have control over all of the uses of this device tensor, and can lazily
+  // allocate memory when used. This allows us to also know the shape of the
+  // allocated Tensor, which is useful if the device's tensor representation
+  // differs from the host.
+  return XlaTensor::ToOpaquePointer(new XlaTensor());
 }
 
 void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
-  se::DeviceMemoryBase dmem(ptr);
-  TF_CHECK_OK(backend_->memory_allocator()->Deallocate(device_ordinal_, &dmem));
-  VLOG(2) << "Deallocated XLA device tensor " << ptr;
+  delete XlaTensor::FromOpaquePointer(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
-XlaTransferManager::XlaTransferManager(se::Stream* stream) : stream_(stream) {}
+XlaTransferManager::XlaTransferManager(se::Stream* stream,
+                                       xla::LocalClient* client,
+                                       bool transfer_as_literal)
+    : stream_(stream),
+      client_(client),
+      transfer_manager_(client->backend().transfer_manager()),
+      transfer_as_literal_(transfer_as_literal) {}
+
+Status XlaTransferManager::TransferLiteralToDevice(
+    const Tensor& host_tensor, Tensor* device_tensor) const {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(HostTensorToLiteral(host_tensor, &literal));
+  VLOG(1) << "Transfer to device as literal: " << literal.ToString();
+
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+  return transfer_manager_->TransferLiteralToDevice(stream_->parent(), literal,
+                                                    shaped_buffer);
+}
+
+Status XlaTransferManager::TransferLiteralFromDevice(
+    Tensor* host_tensor, const Tensor& device_tensor) const {
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
+                      transfer_manager_->TransferLiteralFromDevice(
+                          stream_->parent(), shaped_buffer));
+  VLOG(1) << "Transfer from device as literal: " << literal->ToString();
+  return LiteralToHostTensor(*literal, host_tensor->dtype(), host_tensor);
+}
 
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
@@ -68,18 +93,35 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
 
     void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
     const int64 total_bytes = cpu_tensor->TotalBytes();
-    void* dst_ptr = DMAHelper::base(device_tensor);
-    se::DeviceMemoryBase dev_dst_ptr(dst_ptr, total_bytes);
 
+    XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+    CHECK(xla_tensor);
+    if (!xla_tensor->has_shaped_buffer()) {
+      Status s = xla_tensor->AllocateShapedBuffer(
+          device_tensor->dtype(), device_tensor->shape(), client_,
+          stream_->parent()->device_ordinal());
+      if (!s.ok()) {
+        done(s);
+        return;
+      }
+    }
+
+    se::DeviceMemoryBase dev_dst_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     Status status;
-    stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
-    // TODO(hpucha): Make this asynchronous.
-    Status block_status = stream_->BlockHostUntilDone();
-    if (!block_status.ok()) {
-      status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s", stream_,
-          block_status.error_message().c_str());
+    if (transfer_as_literal_) {
+      status = TransferLiteralToDevice(*cpu_tensor, device_tensor);
+    } else {
+      stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
+      // TODO(hpucha): Make this asynchronous.
+      Status block_status = stream_->BlockHostUntilDone();
+      if (!block_status.ok()) {
+        status = xla::InternalError(
+            "Failed to complete data transfer on stream %p: %s", stream_,
+            block_status.error_message().c_str());
+      }
     }
+    xla_tensor->set_host_tensor(*cpu_tensor);
 
     done(status);
     return;
@@ -103,18 +145,22 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
             << device_tensor->NumElements();
 
     const int64 total_bytes = cpu_tensor->TotalBytes();
-    void* src_ptr = const_cast<void*>(DMAHelper::base(device_tensor));
-    se::DeviceMemoryBase dev_src_ptr(src_ptr, total_bytes);
+    se::DeviceMemoryBase dev_src_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     void* dst_ptr = DMAHelper::base(cpu_tensor);
 
     Status status;
-    stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
-    // TODO(hpucha): Make this asynchronous.
-    Status block_status = stream_->BlockHostUntilDone();
-    if (!block_status.ok()) {
-      status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s", stream_,
-          block_status.error_message().c_str());
+    if (transfer_as_literal_) {
+      status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
+    } else {
+      stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
+      // TODO(hpucha): Make this asynchronous.
+      Status block_status = stream_->BlockHostUntilDone();
+      if (!block_status.ok()) {
+        status = xla::InternalError(
+            "Failed to complete data transfer on stream %p: %s", stream_,
+            block_status.error_message().c_str());
+      }
     }
 
     done(status);
@@ -125,7 +171,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   done(Status::OK());
 }
 
-XlaDeviceContext::XlaDeviceContext(se::Stream* stream) : manager_(stream) {}
+XlaDeviceContext::XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
+                                   bool transfer_as_literal)
+    : manager_(stream, client, transfer_as_literal) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index c4edcd474e48f791af9340c3cd6e4d031407bb68..d7f5f1d208989256f8043d2e6d93cf9bd89333b2 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -26,11 +27,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The allocator used for Tensors assigned to the XLA device. It uses
-// XLA backend's allocator.
+// The allocator used for Tensors assigned to the XLA device. The allocator
+// ignores the alignment and size of the request and always returns a new,
+// empty, XlaTensor.
 class XlaDeviceAllocator : public Allocator {
  public:
-  XlaDeviceAllocator(const xla::Backend* backend, int device_ordinal);
+  XlaDeviceAllocator();
   ~XlaDeviceAllocator() override;
 
   string Name() override;
@@ -38,30 +40,36 @@ class XlaDeviceAllocator : public Allocator {
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
-
- private:
-  // Which backend in the client this allocator belongs to.
-  const xla::Backend* backend_;
-  // Which hardware device in the client's backend this allocator belongs to.
-  const int device_ordinal_;
 };
 
 // Helper class for managing data transfers between host and XLA devices.
 class XlaTransferManager {
  public:
-  explicit XlaTransferManager(perftools::gputools::Stream* stream);
+  explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client,
+                              bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor, StatusCallback done) const;
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              StringPiece tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done);
-  perftools::gputools::Stream* stream() const { return stream_; }
+  se::Stream* stream() const { return stream_; }
 
  private:
+  Status TransferLiteralToDevice(const Tensor& host_tensor,
+                                 Tensor* device_tensor) const;
+  Status TransferLiteralFromDevice(Tensor* host_tensor,
+                                   const Tensor& device_tensor) const;
+
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
-  perftools::gputools::Stream* stream_;
+  se::Stream* stream_;
+  // For the underlying memory allocator and XLA's TransferManager.
+  xla::LocalClient* client_;
+  // Transfer manager, for marshalling data to and from the device.
+  xla::TransferManager* transfer_manager_;
+  // True if we must use XLA's TransferManager for correct device transfers.
+  bool transfer_as_literal_;
 };
 
 // DeviceContext for operators assigned to XlaDevice devices. The
@@ -69,7 +77,8 @@ class XlaTransferManager {
 // wraps the methods in XlaTransferManager.
 class XlaDeviceContext : public DeviceContext {
  public:
-  explicit XlaDeviceContext(perftools::gputools::Stream* stream);
+  explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
+                            bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
@@ -77,9 +86,7 @@ class XlaDeviceContext : public DeviceContext {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              StringPiece tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done) override;
-  perftools::gputools::Stream* stream() const override {
-    return manager_.stream();
-  }
+  se::Stream* stream() const override { return manager_.stream(); }
 
  private:
   XlaTransferManager manager_;
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 2326070358d67c0cf30ef17fab5c93862cd8932c..a8afbf9dcd736bb292b7c5f52c7cce2b47fb85b6 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -34,19 +34,35 @@ class XlaGpuDeviceFactory : public DeviceFactory {
 Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
+  XlaOpRegistry::DeviceRegistration registration;
+  registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
+  registration.requires_compilation = true;
+  registration.enable_jit_by_default = false;
+  registration.compile_resource_ops = true;
+
   static XlaDeviceOpRegistrations* registrations =
       RegisterXlaDeviceKernels(DEVICE_XLA_GPU, DEVICE_GPU_XLA_JIT);
   (void)registrations;
 
   std::unique_ptr<XlaDevice> device;
-  Status status = XlaDevice::Create(
-      "CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options, name_prefix,
-      /*register_device_for_compilation=*/true, &device);
+  Status status =
+      XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options,
+                        name_prefix, registration,
+                        /*transfer_as_literal=*/false, &device);
   if (!status.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
     VLOG(1) << "Failed to create XLA_GPU device: " << status;
     return Status::OK();
   }
+
+  // TODO(b/78468222): Uncomment after fixing this bug
+  // status = device->CreateAndSetGpuDeviceInfo();
+  // if (!status.ok()) {
+  //  errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
+  //                          " device");
+  //  return status;
+  // }
+
   devices->push_back(device.release());
   return Status::OK();
 }
@@ -55,8 +71,9 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_GPU, XlaGpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaGpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 8> kAllXlaGpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL,
+     DT_BFLOAT16}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_GPU, XlaLocalLaunchOp, kAllXlaGpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_GPU, kAllXlaGpuTypes);
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index 2614deefd8823dcb8f38e9e22ae4e78145d0d96a..9e098c46f422b436c722bb909dc58930ab7c0ef6 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -25,8 +25,8 @@ namespace tensorflow {
 const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER";
 const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT";
 
-constexpr std::array<DataType, 5> kExecAllTypes = {
-    {DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}};
+constexpr std::array<DataType, 6> kExecAllTypes = {
+    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
 
 class XlaInterpreterDeviceFactory : public DeviceFactory {
  public:
@@ -41,10 +41,17 @@ Status XlaInterpreterDeviceFactory::CreateDevices(
       DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT);
   (void)registrations;
 
+  XlaOpRegistry::DeviceRegistration registration;
+  registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
+  registration.requires_compilation = true;
+  registration.enable_jit_by_default = false;
+  registration.compile_resource_ops = true;
+
   std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create(
-      "Interpreter", DEVICE_XLA_INTERPRETER, 0, DEVICE_INTERPRETER_XLA_JIT,
-      options, name_prefix, /*register_device_for_compilation=*/true, &device));
+  TF_RETURN_IF_ERROR(XlaDevice::Create("Interpreter", DEVICE_XLA_INTERPRETER, 0,
+                                       DEVICE_INTERPRETER_XLA_JIT, options,
+                                       name_prefix, registration,
+                                       /*transfer_as_literal=*/false, &device));
   devices->push_back(device.release());
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a7f04271d4b7ea330f32b88ea1e3f4037988a91
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -0,0 +1,296 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_launch_util.h"
+
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/util/stream_executor_util.h"
+
+namespace tensorflow {
+namespace {
+using xla::ScopedShapedBuffer;
+using xla::ShapedBuffer;
+}  // anonymous namespace
+
+std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
+                                                        int num_variables) {
+  std::map<int, OptionalTensor> snapshot;
+  int first_variable = ctx->num_inputs() - num_variables;
+  for (int i = 0; i < num_variables; ++i) {
+    Var* variable = nullptr;
+    ResourceHandle handle = HandleFromInput(ctx, first_variable + i);
+    OptionalTensor& tensor = snapshot[first_variable + i];
+    if (LookupResource(ctx, handle, &variable).ok()) {
+      tf_shared_lock lock(*variable->mu());
+      tensor.name = handle.name();
+      tensor.present = true;
+      tensor.value = *variable->tensor();
+    }
+  }
+  return snapshot;
+}
+
+XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
+    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
+
+XlaAllocator::~XlaAllocator() {}
+
+xla::StatusOr<se::DeviceMemoryBase> XlaAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {
+  void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size);
+  if (data == nullptr) {
+    return errors::ResourceExhausted("Out of memory while trying to allocate ",
+                                     size, " bytes.");
+  } else {
+    return se::DeviceMemoryBase(data, size);
+  }
+}
+
+Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) {
+  wrapped_->DeallocateRaw(mem->opaque());
+  return Status::OK();
+}
+
+namespace {
+// Return the 'index''th subtree of the given ShapedBuffer as a
+// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
+// subtree, and sets the input's buffer pointers to nullptr for the subtree.
+ScopedShapedBuffer ExtractSubShapedBuffer(
+    ShapedBuffer* shaped_buffer, int index,
+    xla::DeviceMemoryAllocator* allocator) {
+  xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape(
+      shaped_buffer->on_host_shape(), index);
+  xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape(
+      shaped_buffer->on_device_shape(), index);
+
+  ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
+                                 shaped_buffer->platform(),
+                                 shaped_buffer->device_ordinal());
+
+  auto& shape_tree = shaped_buffer->buffers();
+  auto& sub_shape_tree = sub_shaped_buffer.buffers();
+  sub_shape_tree.CopySubtreeFrom(shape_tree,
+                                 /*source_base_index=*/{index},
+                                 /*target_base_index=*/{});
+  for (auto& index_to_buffer : shape_tree) {
+    if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) {
+      index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0);
+    }
+  }
+  return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
+}
+}  // namespace
+
+XlaComputationLaunchContext::XlaComputationLaunchContext(
+    int64 num_resource_args, xla::LocalClient* client,
+    xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors)
+    : num_resource_args_(num_resource_args),
+      client_(client),
+      xla_allocator_(xla_allocator),
+      allocate_xla_tensors_(allocate_xla_tensors) {}
+
+void XlaComputationLaunchContext::PopulateInputs(
+    OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
+    const std::map<int, OptionalTensor>& variables) {
+  // Build ShapedBuffers that point directly to the Tensor buffers.
+  arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1);
+  arg_buffers_.resize(kernel->xla_input_shapes.size());
+  arg_ptrs_ = std::vector<ShapedBuffer*>(arg_buffers_.size());
+
+  // Pass remaining parameters.
+  const Tensor* t;
+  for (int i = 0; i < kernel->xla_input_shapes.size(); ++i) {
+    int arg_num = kernel->input_mapping[i];
+    const xla::Shape& shape = kernel->xla_input_shapes[i];
+    if (variables.count(arg_num)) {
+      t = &(variables.at(arg_num).value);
+      CHECK(t);
+    } else {
+      t = &(ctx->input(arg_num));
+    }
+
+    const xla::Shape on_device_shape =
+        client_->backend().transfer_manager()->HostShapeToDeviceShape(shape);
+    if (xla::ShapeUtil::IsTuple(on_device_shape)) {
+      const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
+      CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
+      arg_ptrs_[i] = const_cast<ShapedBuffer*>(&xla_tensor->shaped_buffer());
+    } else {
+      CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
+          << "On-device shape "
+          << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
+          << " not the same as on-host shape "
+          << xla::ShapeUtil::HumanStringWithLayout(shape);
+      se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
+      arg_buffers_[i] = xla::MakeUnique<ShapedBuffer>(
+          /*on_host_shape=*/shape, /*on_device_shape=*/shape,
+          client_->platform(), client_->default_device_ordinal());
+      arg_buffers_[i]->set_buffer(dmem, /*index=*/{});
+      arg_ptrs_[i] = arg_buffers_[i].get();
+    }
+  }
+}
+
+void XlaComputationLaunchContext::PopulateOutputs(
+    OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
+    ScopedShapedBuffer output) {
+  se::Stream* stream =
+      ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
+
+  // Computation output should always be a tuple.
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString();
+    VLOG(2) << "Result tuple shape (on device): "
+            << output.on_device_shape().DebugString();
+  }
+  CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
+
+  // Copy XLA results to the OpOutputList.
+  int output_num = 0;
+  for (int i = 0; i < ctx->num_outputs(); ++i) {
+    Allocator* allocator = ctx->device()->GetAllocator({});
+    if (kernel->outputs[i].is_constant) {
+      // Output is a constant.
+      const Tensor& const_tensor = kernel->outputs[i].constant_value;
+      Tensor* output_tensor;
+      const size_t total_bytes = const_tensor.TotalBytes();
+      if (stream && total_bytes > 0) {
+        // Copy host -> device. (Empty tensors don't have backing buffers.)
+        // Manually allocate memory using an XlaTensorBuffer so we can allocate
+        // as much memory as the device requires (as given by
+        // GetByteSizeRequirement). This avoids XlaTransferManager having to
+        // reallocate the device buffer later.
+        VLOG(1) << "Constant output tensor on device";
+
+        OP_REQUIRES_OK(
+            ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
+        if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
+          OP_REQUIRES_OK(ctx, xla_tensor->AllocateShapedBuffer(
+                                  const_tensor.dtype(), const_tensor.shape(),
+                                  client_, stream->parent()->device_ordinal()));
+        }
+
+        Device* device = dynamic_cast<Device*>(ctx->device());
+        OP_REQUIRES(ctx, device != nullptr,
+                    errors::Internal("DeviceBase was not a Device."));
+        ctx->op_device_context()->CopyCPUTensorToDevice(
+            &const_tensor, device, output_tensor,
+            [&](Status status) { TF_CHECK_OK(status); });
+
+        if (device->device_type() == DEVICE_GPU) {
+          // The GPUDeviceContext enqueues the host->device transfer in a
+          // separate stream from the main compute stream. We must ensure the
+          // compute stream is synchronized with the host->device transfer
+          // stream now otherwise we will create a race condition.
+          auto* gpu_device_context =
+              static_cast<GPUDeviceContext*>(ctx->op_device_context());
+          gpu_device_context->stream()->ThenWaitFor(
+              gpu_device_context->host_to_device_stream());
+        }
+      } else {
+        // No copy required.
+        ctx->set_output(i, const_tensor);
+        output_tensor = ctx->mutable_output(i);
+      }
+      if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
+        xla_tensor->set_host_tensor(const_tensor);
+      }
+    } else {
+      const TensorShape& shape = kernel->outputs[i].shape;
+      VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
+
+      se::DeviceMemoryBase buffer = output.buffer({output_num});
+      if (allocate_xla_tensors_) {
+        Tensor* output_tensor;
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
+        XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
+        CHECK(xla_tensor);
+        xla_tensor->set_shaped_buffer(ScopedShapedBuffer(
+            ExtractSubShapedBuffer(&output, output_num, xla_allocator_)));
+      } else {
+        Tensor output_tensor = XlaTensorBuffer::MakeTensor(
+            ctx->expected_output_dtype(i), shape, buffer, allocator);
+        output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
+        ctx->set_output(i, output_tensor);
+      }
+      ++output_num;
+    }
+
+    if (VLOG_IS_ON(3)) {
+      VLOG(3) << ctx->mutable_output(i)->DebugString();
+    }
+  }
+
+  // Apply variable updates, if any.
+  VLOG(2) << "Applying variable updates";
+  for (int i = 0; i < kernel->resource_updates.size(); ++i) {
+    Allocator* allocator = ctx->device()->GetAllocator({});
+    const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
+    OP_REQUIRES(ctx,
+                write.input_index >= 0 && write.input_index < ctx->num_inputs(),
+                errors::Internal("Invalid input index for variable write."));
+
+    se::DeviceMemoryBase buffer = output.buffer({output_num});
+
+    Var* variable = nullptr;
+    // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
+    // not a Tensor.
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
+                            ctx, HandleFromInput(ctx, write.input_index),
+                            &variable, [this, ctx, &write](Var** ptr) {
+                              *ptr = new Var(write.type);
+                              return Status::OK();
+                            }));
+
+    core::ScopedUnref s(variable);
+
+    mutex_lock ml(*variable->mu());
+    OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type,
+                errors::Internal("Mismatched type in variable write"));
+
+    if (allocate_xla_tensors_) {
+      Tensor output_tensor;
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_temp(write.type, write.shape, &output_tensor));
+      XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
+      CHECK(xla_tensor);
+      xla_tensor->set_shaped_buffer(
+          ExtractSubShapedBuffer(&output, output_num, xla_allocator_));
+      *variable->tensor() = output_tensor;
+    } else {
+      Tensor output_tensor = XlaTensorBuffer::MakeTensor(
+          write.type, write.shape, buffer, allocator);
+      output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
+      *variable->tensor() = output_tensor;
+    }
+    ++output_num;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a6ff3b0c751206d184da63ef1a36e750a1252a5
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains utilities for launching compiled XLA kernels for a KernelContext.
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
+
+#include "tensorflow/compiler/jit/xla_compilation_cache.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+class XlaAllocator;
+
+// Takes a snapshot of the values of resource variable arguments, which are
+// the last `num_variables` arguments. We snapshot tensors that back
+// resource variables since concurrent updates may modify the shape, and it is
+// important that the shapes used for compilation match the true shapes of the
+// buffers.
+//
+// Returns a map of TensorFlow argument index to resource variable.
+std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
+                                                        int num_variables);
+
+// Adapter class that wraps a Tensorflow allocator as an XLA allocator.
+// Assumes that the Tensorflow allocator permits asynchronous deallocation:
+// see comment on `AllowsAsynchronousDeallocation()`.
+class XlaAllocator : public xla::DeviceMemoryAllocator {
+ public:
+  XlaAllocator(const se::Platform* platform, Allocator* wrapped);
+  ~XlaAllocator() override;
+  xla::StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                               bool retry_on_failure) override;
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override;
+
+  // The Tensorflow BFC allocator used on GPU allows host-side deallocation
+  // before GPU execution takes place. Tensorflow uses the ordering of the main
+  // compute stream to enforce a happens-before relationship between a memory
+  // allocation and code that reuses the same memory. If Tensorflow adds
+  // support for multiple GPU streams or allocators with different ordering
+  // requirements, this code may need to change.
+  // (This attribute has no effect on CPU.)
+  bool AllowsAsynchronousDeallocation() const override { return true; }
+
+ private:
+  Allocator* wrapped_;
+};
+
+// Helper class to perform the marshalling of TensorFlow inputs and outputs to
+// ShapedBuffers suitable for passing to an XLA computation.
+class XlaComputationLaunchContext {
+ public:
+  // Create a new launch context. 'allocate_xla_tensors' is true if allocated
+  // output tensors and variables are always XlaTensors. If false they are
+  // assumed to be "normal" device pointers.
+  XlaComputationLaunchContext(int64 num_resource_args, xla::LocalClient* client,
+                              xla::DeviceMemoryAllocator* xla_allocator,
+                              bool allocate_xla_tensors);
+
+  // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
+  // `variables` is a map from TensorFlow argument number to resource variable.
+  void PopulateInputs(OpKernelContext* ctx,
+                      const XlaCompiler::CompilationResult* kernel,
+                      const std::map<int, OptionalTensor>& variables);
+
+  // Given the XLA output in `output`, populate all outputs of `ctx`.
+  void PopulateOutputs(OpKernelContext* ctx,
+                       const XlaCompiler::CompilationResult* kernel,
+                       xla::ScopedShapedBuffer output);
+
+  // Return the argument list. Only valid after PopulateInputs() has been
+  // called.
+  const std::vector<xla::ShapedBuffer*>& arguments() const { return arg_ptrs_; }
+
+ private:
+  int64 num_resource_args_;
+  xla::LocalClient* client_;
+  xla::DeviceMemoryAllocator* xla_allocator_;
+  bool allocate_xla_tensors_;
+  std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
+  std::vector<xla::ShapedBuffer*> arg_ptrs_;
+};
+
+// A simple TensorBuffer implementation that allows us to create Tensors that
+// take ownership of pre-allocated memory.
+class XlaTensorBuffer : public TensorBuffer {
+ public:
+  XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
+                  Allocator* allocator)
+      : expected_size_(expected_size),
+        actual_size_(actual_size),
+        allocator_(allocator) {
+    data_ = const_cast<void*>(ptr);
+  }
+
+  ~XlaTensorBuffer() override { allocator_->DeallocateRaw(data_); }
+
+  void* data() const override { return data_; }
+  size_t size() const override { return expected_size_; }
+
+  TensorBuffer* root_buffer() override { return this; }
+
+  void FillAllocationDescription(AllocationDescription* proto) const override {
+    proto->set_allocated_bytes(actual_size_);
+  }
+
+  static Tensor MakeTensor(DataType dtype, const TensorShape& shape,
+                           se::DeviceMemoryBase buffer, Allocator* allocator) {
+    size_t expected_size = shape.num_elements() * DataTypeSize(dtype);
+    auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size,
+                                              buffer.size(), allocator);
+    Tensor t(dtype, shape, tensor_buffer);
+    tensor_buffer->Unref();
+    return t;
+  }
+
+ private:
+  void* data_;
+  size_t expected_size_;
+  size_t actual_size_;
+  Allocator* allocator_;
+};
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce6456880bc1b3bc15ac0ef4bae35a83771098ef
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+
+namespace tensorflow {
+
+/*static*/ XlaTensor* XlaTensor::FromTensor(Tensor* tensor) {
+  if (tensor->NumElements() == 0) {
+    return nullptr;
+  }
+  XlaTensor* xla_tensor =
+      FromOpaquePointer(const_cast<char*>(tensor->tensor_data().data()));
+  return xla_tensor;
+}
+
+/*static*/ const XlaTensor* XlaTensor::FromTensor(const Tensor* tensor) {
+  return FromTensor(const_cast<Tensor*>(tensor));
+}
+
+/*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor(
+    const Tensor& tensor) {
+  const XlaTensor* xla_tensor = FromTensor(&tensor);
+  if (xla_tensor) {
+    CHECK(xla_tensor->has_shaped_buffer());
+    return xla_tensor->shaped_buffer().root_buffer();
+  } else {
+    return se::DeviceMemoryBase(const_cast<char*>(tensor.tensor_data().data()),
+                                tensor.tensor_data().size());
+  }
+}
+
+Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+                                       xla::LocalClient* client,
+                                       int device_ordinal) {
+  xla::Shape on_host_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape));
+  xla::Shape on_device_shape =
+      client->backend().transfer_manager()->HostShapeToDeviceShape(
+          on_host_shape);
+
+  xla::ShapedBuffer buffer(on_host_shape, on_device_shape, client->platform(),
+                           device_ordinal);
+  for (auto& index_to_buffer : buffer.buffers()) {
+    xla::Shape subshape =
+        xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
+    uint64 size =
+        client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
+    TF_ASSIGN_OR_RETURN(index_to_buffer.second,
+                        client->backend().memory_allocator()->Allocate(
+                            device_ordinal, size, /*retry_on_failure=*/false));
+  }
+
+  set_shaped_buffer(xla::ScopedShapedBuffer(
+      std::move(buffer), client->backend().memory_allocator()));
+  return Status::OK();
+}
+
+// The pointer tag, OR-ed into the XlaTensor's address to distinguish it from
+// device-side tensors, which are either CPU or GPU memory pointers. This works
+// because we're guaranteed that CPU and GPU pointers are aligned to > 1 bits.
+namespace {
+constexpr uintptr_t kTag = 0x1ULL;
+}
+
+/*static*/ XlaTensor* XlaTensor::FromOpaquePointer(void* ptr) {
+  uintptr_t value = reinterpret_cast<uintptr_t>(ptr);
+  if (value & kTag) {
+    return reinterpret_cast<XlaTensor*>(value & ~kTag);
+  } else {
+    return nullptr;
+  }
+}
+
+/*static*/ void* XlaTensor::ToOpaquePointer(XlaTensor* tensor) {
+  uintptr_t value = reinterpret_cast<uintptr_t>(tensor);
+  CHECK_EQ(value & kTag, 0);
+  value |= kTag;
+  return reinterpret_cast<XlaTensor*>(value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..922a91897312096e4bb6ee2a1cc153e0039e2c7a
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
+
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// The implementation of a Tensor for an XlaDevice. All device tensors are
+// actually one of these.
+//
+// To distinguish between "normal" device tensors and XlaTensors, the raw
+// pointer data stored in the TensorBuffer is a tagged pointer.
+class XlaTensor {
+ public:
+  // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast
+  // fails.
+  static XlaTensor* FromTensor(Tensor* tensor);
+  // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast
+  // fails.
+  static const XlaTensor* FromTensor(const Tensor* tensor);
+
+  // Create a DeviceMemoryBase from a Tensor. The Tensor can be an XlaTensor, in
+  // which case the returned value is shaped_buffer()->root_buffer(), or a
+  // normal Tensor in which case the returned value is
+  // {tensor.tensor_data().data(), tensor.tensor_data().size}.
+  static se::DeviceMemoryBase DeviceMemoryFromTensor(const Tensor& tensor);
+
+  // Assign the internal ShapedBuffer to new memory for the given dtype and
+  // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
+  // is replaced and the managed memory deallocated.
+  Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+                              xla::LocalClient* client, int device_ordinal);
+
+  // Some Tensors can have complex on-device shapes, including tuple shapes. To
+  // manage the memory for these tensors a ShapedBuffer may be required.
+
+  // Return true if this TensorInfo contains a ShapedBuffer.
+  bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; }
+  // Return the contained ShapedBuffer.
+  // REQUIRES: has_shaped_buffer()
+  const xla::ShapedBuffer& shaped_buffer() const {
+    CHECK(has_shaped_buffer());
+    return *shaped_buffer_;
+  }
+  // Mutates the TensorInfo to set the ShapedBuffer.
+  void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) {
+    shaped_buffer_ =
+        xla::MakeUnique<xla::ScopedShapedBuffer>(std::move(shaped_buffer));
+  }
+
+  // Some tensors on the device may have known values on the host. We use these
+  // in on-demand mode to avoid re-copying values from the device if we know the
+  // host value already.
+
+  // Return true if this TensorInfo contains a host tensor.
+  bool has_host_tensor() const { return host_tensor_ != nullptr; }
+  // Return the contained host tensor.
+  // REQUIRES: has_host_tensor()
+  const Tensor& host_tensor() const { return *host_tensor_; }
+  // Sets the contained host tensor.
+  void set_host_tensor(const Tensor& tensor) {
+    host_tensor_.reset(new Tensor(tensor));
+  }
+
+  // Convert from a raw pointer to an XlaTensor, removing the pointer tag.
+  static XlaTensor* FromOpaquePointer(void* ptr);
+  // Convert to a raw pointer from an XlaTensor, adding the pointer tag.
+  static void* ToOpaquePointer(XlaTensor* tensor);
+
+ private:
+  // The optional contained ShapedBuffer.
+  std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer_;
+  // An optional host tensor value.
+  std::unique_ptr<Tensor> host_tensor_;
+};
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
index da4bc44c7a75c9f8faf16c537a17a1f2d16d5d61..238fd15166c0b08ee109d6a3888e16c39f87a603 100644
--- a/tensorflow/compiler/plugin/BUILD
+++ b/tensorflow/compiler/plugin/BUILD
@@ -49,17 +49,3 @@ cc_library(
         "//tensorflow/compiler/jit:xla_device",
     ],
 )
-
-#-----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 782bf82d4149968d5e5fbfb93bbd4ff1dcd75494..a94b298f87832057c6ec86a1ea250a54ed1b4ee0 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -86,7 +86,10 @@ tf_xla_py_test(
     # ArgMax needs CustomCall on CPU, which is not available in normal
     # (not precompiled) TensorFlow. The flag below excludes the CPU
     # backend.
-    disabled_backends = "cpu",
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -98,7 +101,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "binary_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["binary_ops_test.py"],
     shard_count = 5,
     tags = [
@@ -121,6 +124,7 @@ tf_xla_py_test(
     name = "categorical_op_test",
     size = "small",
     srcs = ["categorical_op_test.py"],
+    tags = ["optonly"],
     deps = [
         ":xla_test",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -188,6 +192,31 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "oom_test",
+    size = "medium",
+    srcs = ["oom_test.py"],
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
+    tags = [
+        # Allocates very large amounts of memory and does not work under TSAN.
+        "notsan",
+        "optonly",  # Times out frequently in fastbuild.
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:array_ops_gen",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradient_checker",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "conv2d_test",
     size = "medium",
@@ -242,6 +271,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "dynamic_slice_ops_test",
+    size = "small",
+    srcs = ["dynamic_slice_ops_test.py"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
 tf_xla_py_test(
     name = "dynamic_stitch_test",
     size = "small",
@@ -267,6 +308,25 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "eager_test",
+    size = "small",
+    srcs = ["eager_test.py"],
+    disabled_backends = [
+        # TODO(b/78199195) Support XLA CPU devices in eager runtime
+        "cpu",
+        "cpu_ondemand",
+        # TODO(b/78468222) Enable GPU backend
+        "gpu",
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "fft_test",
     size = "medium",
@@ -299,7 +359,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "ftrl_test",
-    size = "small",
+    size = "medium",
     srcs = ["ftrl_test.py"],
     deps = [
         ":xla_test",
@@ -315,6 +375,8 @@ tf_xla_py_test(
     name = "function_test",
     size = "small",
     srcs = ["function_test.py"],
+    # Functions are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -466,6 +528,22 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "reduce_window_test",
+    size = "small",
+    srcs = ["reduce_window_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "reverse_ops_test",
     size = "medium",
@@ -550,6 +628,8 @@ tf_xla_py_test(
     name = "stack_ops_test",
     size = "small",
     srcs = ["stack_ops_test.py"],
+    # Stack ops are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -576,6 +656,8 @@ tf_xla_py_test(
     name = "tensor_array_ops_test",
     size = "small",
     srcs = ["tensor_array_ops_test.py"],
+    # TensorArray ops are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -654,10 +736,26 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "while_test",
+    size = "small",
+    srcs = ["while_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "gather_test",
     size = "medium",
     srcs = ["gather_test.py"],
+    tags = ["noasan"],  # times out, http://b/78599043
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -720,13 +818,10 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
-    ],
-    # TODO(b/62961789): Test fails with SIGABRT
-    tags = [
-        "manual",
-        "notap",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -827,16 +922,14 @@ tf_xla_py_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+tf_xla_py_test(
+    name = "placeholder_test",
+    size = "small",
+    srcs = ["placeholder_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
 )
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 30a6d3a74d64f90ad33062df6d1e16e3a575bd63..1e4dd32916c3a40282735fb8f75670b0e9ef0dc9 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -71,7 +71,7 @@ class BinaryOpsTest(XLATestCase):
           expected=np.array([[[[False, True], [True, False]]]], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._real_div,
+          gen_math_ops.real_div,
           np.array([3, 3, -1.5, -8, 44], dtype=dtype),
           np.array([2, -2, 7, -4, 0], dtype=dtype),
           expected=np.array(
@@ -108,57 +108,57 @@ class BinaryOpsTest(XLATestCase):
               [0, np.pi / 4, np.pi / 2, np.pi * 3 / 4, np.pi], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._reciprocal_grad,
+          gen_math_ops.reciprocal_grad,
           np.array([4, -3, -2, 1], dtype=dtype),
           np.array([5, -6, 7, -8], dtype=dtype),
           expected=np.array([-80, 54, -28, 8], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._sigmoid_grad,
+          gen_math_ops.sigmoid_grad,
           np.array([4, 3, 2, 1], dtype=dtype),
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array([-60, -36, -14, 0], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._rsqrt_grad,
+          gen_math_ops.rsqrt_grad,
           np.array([4, 3, 2, 1], dtype=dtype),
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array([-160, -81, -28, -4], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._sqrt_grad,
+          gen_math_ops.sqrt_grad,
           np.array([4, 3, 2, 1], dtype=dtype),
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array([0.625, 1, 1.75, 4], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._softplus_grad,
+          gen_nn_ops.softplus_grad,
           np.array([4, 3, 2, 1], dtype=dtype),
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array(
               [3.97322869, 2.99258232, 1.99817801, 0.99966466], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._softsign_grad,
+          gen_nn_ops.softsign_grad,
           np.array([4, 3, 2, 1], dtype=dtype),
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array(
               [0.11111111, 0.06122449, 0.03125, 0.01234568], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._tanh_grad,
+          gen_math_ops.tanh_grad,
           np.array([4, 3, 2, 1], dtype=dtype),
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array([-75, -48, -21, 0], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._elu_grad,
+          gen_nn_ops.elu_grad,
           np.array([1, 2, 3, 4, 5, 6], dtype=dtype),
           np.array([-.6, -.4, -.2, 0, .2, .4], dtype=dtype),
           expected=np.array([0.4, 1.2, 2.4, 4, 5, 6], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._selu_grad,
+          gen_nn_ops.selu_grad,
           np.array([1, 2, 3, 4, 5, 6], dtype=dtype),
           np.array([-.6, -.4, -.2, .2, .4, .6], dtype=dtype),
           expected=np.array(
@@ -166,20 +166,20 @@ class BinaryOpsTest(XLATestCase):
                4.202803949422, 5.2535049367774, 6.30420592413], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._relu_grad,
+          gen_nn_ops.relu_grad,
           np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
           np.array([0, 0, 0, 0, 0, 0.1, 0.3, 0.5, 0.7, 0.9], dtype=dtype),
           expected=np.array([0, 0, 0, 0, 0, 6, 7, 8, 9, 10], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._relu6_grad,
+          gen_nn_ops.relu6_grad,
           np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=dtype),
           np.array(
               [0, 0, 0, 0, 0, 0.1, 0.3, 0.5, 0.7, 0.9, 6.1, 10.0], dtype=dtype),
           expected=np.array([0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 0, 0], dtype=dtype))
 
       self._testBinary(
-          gen_nn_ops._softmax_cross_entropy_with_logits,
+          gen_nn_ops.softmax_cross_entropy_with_logits,
           np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=dtype),
           np.array([[0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1]], dtype=dtype),
           expected=[
@@ -190,24 +190,29 @@ class BinaryOpsTest(XLATestCase):
           ],
           equality_test=self.ListsAreClose)
 
-      self._testBinary(
-          gen_nn_ops._sparse_softmax_cross_entropy_with_logits,
-          np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
-                    [0.9, 1.0, 1.1, 1.2]], dtype=dtype),
-          np.array([2, 1, 7], dtype=np.int32),
-          expected=[
-              np.array([1.342536, 1.442536, np.nan], dtype=dtype),
-              np.array([[0.213838, 0.236328, -0.738817, 0.288651],
-                        [0.213838, -0.763672, 0.261183, 0.288651],
-                        [np.nan, np.nan, np.nan, np.nan]],
-                       dtype=dtype),
-          ],
-          equality_test=self.ListsAreClose)
+      # TODO(b/68813416): Fails with bfloat16.
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        self._testBinary(
+            gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
+            np.array(
+                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
+                 [0.9, 1.0, 1.1, 1.2]],
+                dtype=dtype),
+            np.array([2, 1, 7], dtype=np.int32),
+            expected=[
+                np.array([1.342536, 1.442536, np.nan], dtype=dtype),
+                np.array(
+                    [[0.213838, 0.236328, -0.738817, 0.288651], [
+                        0.213838, -0.763672, 0.261183, 0.288651
+                    ], [np.nan, np.nan, np.nan, np.nan]],
+                    dtype=dtype),
+            ],
+            equality_test=self.ListsAreClose)
 
   def testIntOps(self):
     for dtype in self.int_types:
       self._testBinary(
-          gen_math_ops._truncate_div,
+          gen_math_ops.truncate_div,
           np.array([3, 3, -1, -9, -8], dtype=dtype),
           np.array([2, -2, 7, 2, -4], dtype=dtype),
           expected=np.array([1, -1, 0, -4, 2], dtype=dtype))
@@ -232,11 +237,16 @@ class BinaryOpsTest(XLATestCase):
           expected=np.right_shift(lhs, rhs))
 
       if dtype in [np.int8, np.int16, np.int32, np.int64]:
-        lhs = np.array([-1, -5, -3, -14], dtype=dtype)
-        rhs = np.array([5, 0, 1, 11], dtype=dtype)
-        self._testBinary(
-            bitwise_ops.right_shift, lhs, rhs,
-            expected=np.right_shift(lhs, rhs))
+        lhs = np.array([-1, -5, -3, -14, -2], dtype=dtype)
+        rhs = np.array([5, 0, 1, 11, 36], dtype=dtype)
+        # HLO has saturating shift behavior.
+        bits = np.ceil(
+            np.log(np.iinfo(dtype).max - np.iinfo(dtype).min) / np.log(2))
+        expected = [
+            np.right_shift(l, r) if r < bits else np.sign(l)
+            for l, r in zip(lhs, rhs)
+        ]
+        self._testBinary(bitwise_ops.right_shift, lhs, rhs, expected=expected)
 
   def testNumericOps(self):
     for dtype in self.numeric_types:
@@ -258,9 +268,9 @@ class BinaryOpsTest(XLATestCase):
 
       self._testBinary(
           math_ops.subtract,
-          np.array([1, 2], dtype=dtype),
-          np.array([10, 20], dtype=dtype),
-          expected=np.array([-9, -18], dtype=dtype))
+          np.array([1, 2, 100], dtype=dtype),
+          np.array([10, 20, -1], dtype=dtype),
+          expected=np.array([-9, -18, 101], dtype=dtype))
       self._testBinary(
           math_ops.subtract,
           dtype(5),
@@ -350,6 +360,14 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
+    if np.int64 in self.numeric_types:
+      self._testBinary(
+          math_ops.add,
+          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+          expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36],
+                            dtype=np.int64))
+
   def testComplexOps(self):
     for dtype in self.complex_types:
       ctypes = {np.complex64: np.float32}
@@ -369,7 +387,7 @@ class BinaryOpsTest(XLATestCase):
           expected=np.array([[[[False, True], [True, False]]]], dtype=dtype))
 
       self._testBinary(
-          gen_math_ops._real_div,
+          gen_math_ops.real_div,
           np.array([3, 3j, -1.5j, -8, 2 + 3j, 2 + 4j], dtype=dtype),
           np.array([2, -2, 7j, -4j, 4 - 6j, 1 + 2j], dtype=dtype),
           expected=np.array(
@@ -378,7 +396,7 @@ class BinaryOpsTest(XLATestCase):
 
       # Test inf/nan scenarios.
       self._testBinary(
-          gen_math_ops._real_div,
+          gen_math_ops.real_div,
           np.array([4 + 3j, 4, 3j, -4, -4j, 2 - 3j], dtype=dtype),
           np.array([0, 0, 0, 0, 0, 0], dtype=dtype),
           expected=np.array(
@@ -418,19 +436,19 @@ class BinaryOpsTest(XLATestCase):
       lhs = np.array([4 + 2j, -3 - 1j, 2j, 1], dtype=dtype)
       rhs = np.array([5, -6j, 7 - 3j, -8j], dtype=dtype)
       self._testBinary(
-          gen_math_ops._reciprocal_grad, lhs, rhs, expected=-rhs * lhs * lhs)
+          gen_math_ops.reciprocal_grad, lhs, rhs, expected=-rhs * lhs * lhs)
 
       self._testBinary(
-          gen_math_ops._sigmoid_grad, lhs, rhs, expected=rhs * lhs * (1 - lhs))
+          gen_math_ops.sigmoid_grad, lhs, rhs, expected=rhs * lhs * (1 - lhs))
 
       self._testBinary(
-          gen_math_ops._rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
+          gen_math_ops.rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
 
       self._testBinary(
-          gen_math_ops._sqrt_grad, lhs, rhs, expected=rhs / (2 * lhs))
+          gen_math_ops.sqrt_grad, lhs, rhs, expected=rhs / (2 * lhs))
 
       self._testBinary(
-          gen_math_ops._tanh_grad, lhs, rhs, expected=rhs * (1 - lhs * lhs))
+          gen_math_ops.tanh_grad, lhs, rhs, expected=rhs * (1 - lhs * lhs))
 
   def testComplexMath(self):
     for dtype in self.complex_types:
@@ -538,7 +556,7 @@ class BinaryOpsTest(XLATestCase):
 
     if dtype not in self.complex_types:  # floordiv unsupported for complex.
       self._testBinary(
-          gen_math_ops._floor_div,
+          gen_math_ops.floor_div,
           np.array([3, 3, -1, -9, -8], dtype=dtype),
           np.array([2, -2, 7, 2, -4], dtype=dtype),
           expected=np.array([1, -2, -1, -5, 2], dtype=dtype))
@@ -554,12 +572,12 @@ class BinaryOpsTest(XLATestCase):
   def _testRemainder(self, dtype):
     """Test cases for remainder operators."""
     self._testBinary(
-        gen_math_ops._floor_mod,
+        gen_math_ops.floor_mod,
         np.array([3, 3, -1, -8], dtype=dtype),
         np.array([2, -2, 7, -4], dtype=dtype),
         expected=np.array([1, -1, 6, 0], dtype=dtype))
     self._testBinary(
-        gen_math_ops._truncate_mod,
+        gen_math_ops.truncate_mod,
         np.array([3, 3, -1, -8], dtype=dtype),
         np.array([2, -2, 7, -4], dtype=dtype),
         expected=np.array([1, 1, -1, 0], dtype=dtype))
@@ -668,6 +686,11 @@ class BinaryOpsTest(XLATestCase):
           np.array([[10], [7], [2]], dtype=np.float32),
           np.float32(7),
           expected=np.array([[False], [False], [True]], dtype=np.bool))
+      self._testBinary(
+          less_op,
+          np.array([[10], [7], [2], [-1]], dtype=np.int64),
+          np.int64(7),
+          expected=np.array([[False], [False], [True], [True]], dtype=np.bool))
 
     for less_equal_op in [math_ops.less_equal, (lambda x, y: x <= y)]:
       self._testBinary(
@@ -686,6 +709,80 @@ class BinaryOpsTest(XLATestCase):
           np.float32(7),
           expected=np.array([[False], [True], [True]], dtype=np.bool))
 
+  def testS64Comparisons(self):
+    for op in [(lambda x, y: x < y), (lambda x, y: x <= y),
+               (lambda x, y: x >= y), (lambda x, y: x > y)]:
+      lhs = np.array(
+          [
+              np.int64(0x000000007FFFFFFF),
+              np.int64(0x000000007FFFFFFF),
+              np.int64(0x0000000080000000),
+              np.int64(0x0000000080000000),
+              np.int64(0x0000000080000001),
+              np.int64(0x00000000FFFF0000),
+              np.int64(0x00000000FFFF0000),
+              np.int64(0x00000000FFFFFFFE),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(0x0000000100000000),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000002),
+              np.int64(-0x7FFFFFFF00000002),
+              np.int64(-0x7FFFFFFF00000002),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(0x7ffffffefff00010),
+              np.int64(0x7ffffffefff00010),
+              np.int64(-1),
+              np.int64(-1)
+          ],
+          dtype=np.int64)
+      rhs = np.array(
+          [
+              np.int64(0x000000007FFFFFFE),
+              np.int64(0x000000007FFFFFFF),
+              np.int64(0x000000007FFFFFFF),
+              np.int64(0x0000000080000000),
+              np.int64(0x0000000080000001),
+              np.int64(0x00000000FFFF0000),
+              np.int64(0x00000000FFFF0001),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(0x00000000FFFFFFFE),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(0x0000000100000001),
+              np.int64(0x0000000100000002),
+              np.int64(0x0000000100000003),
+              np.int64(0x0000000200000001),
+              np.int64(0x0000000200000002),
+              np.int64(0x0000000200000003),
+              np.int64(0x0000000300000001),
+              np.int64(0x0000000300000002),
+              np.int64(0x0000000300000003),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(0x00000000FFFFFFFE),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(-0x7FFFFFFF00000002),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(0x00000000FFFFFFFF),
+              np.int64(-0x7FFFFFFF00000001),
+              np.int64(-2),
+              np.int64(-1)
+          ],
+          dtype=np.int64)
+      expected = np.array([op(l, r) for l, r in zip(lhs, rhs)], dtype=np.bool)
+      self._testBinary(op, lhs, rhs, expected=expected)
+
   def testBroadcasting(self):
     """Tests broadcasting behavior of an operator."""
 
@@ -1045,6 +1142,20 @@ class BinaryOpsTest(XLATestCase):
             ],
             equality_test=self.ListsAreClose)
 
+      def splitvOp(x, y):  # pylint: disable=invalid-name
+        return array_ops.split(value=y, num_or_size_splits=[2, 3], axis=x)
+      for axis in [1, -1]:
+        self._testBinary(
+            splitvOp,
+            np.int32(axis),
+            np.array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]],
+                     dtype=dtype),
+            expected=[
+                np.array([[0, 1], [5, 6]], dtype=dtype),
+                np.array([[2, 3, 4], [7, 8, 9]], dtype=dtype),
+            ],
+            equality_test=self.ListsAreClose)
+
   def testTile(self):
     for dtype in self.numeric_types:
       self._testBinary(
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 0528a5415d579a844e68403ace1bb8982a10a841..7b114d4f85d3a5cadc6af25b55c5a21f90d2a768 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -51,12 +51,12 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     if backend == "cpu":
       backend_args += [
           "--test_device=XLA_CPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
       ]
     elif backend == "gpu":
       backend_args += [
           "--test_device=XLA_GPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
       ]
       backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
@@ -89,4 +89,3 @@ def generate_backend_suites(backends=[]):
     backends = all_backends()
   for backend in backends:
     native.test_suite(name="%s_tests" % backend, tags=["tf_xla_%s" % backend])
-
diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py
index 5010fe5e21d0782e68d4e6d5bf6b4df1b44793a3..1a8989d7c2f617525c301f30fd899a01362310bf 100644
--- a/tensorflow/compiler/tests/cholesky_op_test.py
+++ b/tensorflow/compiler/tests/cholesky_op_test.py
@@ -34,6 +34,13 @@ from tensorflow.python.platform import test
 
 class CholeskyOpTest(XLATestCase):
 
+  # Cholesky defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/cholesky)
+  @property
+  def float_types(self):
+    return set(super(CholeskyOpTest, self).float_types).intersection(
+        (np.float64, np.float32, np.complex64, np.complex128))
+
   def _verifyCholeskyBase(self, sess, placeholder, x, chol, verification, atol):
     chol_np, verification_np = sess.run([chol, verification], {placeholder: x})
     self.assertAllClose(x, verification_np, atol=atol)
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index 81734082d9aab86f8bc763681265ef64ef32bd31..f10973e19f1945515b776cf86349445ed7334629 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -301,7 +301,7 @@ class ConcatOffsetTest(XLATestCase):
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
-        off = gen_array_ops._concat_offset(cdim, [s0, s1, s2])
+        off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
         ans = sess.run(off)
         self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
diff --git a/tensorflow/compiler/tests/dynamic_slice_ops_test.py b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a46d2ec3e7aee3a4ecfbf1ab9f622d8eb659e3c
--- /dev/null
+++ b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
@@ -0,0 +1,93 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for XLA dynamic slicing ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class DynamicUpdateSliceOpsTest(XLATestCase):
+
+  def _assertOpOutputMatchesExpected(self, op, args, expected):
+    with self.test_session() as session:
+      with self.test_scope():
+        placeholders = [
+            array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
+            for arg in args
+        ]
+        feeds = {placeholders[i]: args[i] for i in range(0, len(args))}
+        output = op(*placeholders)
+      result = session.run(output, feeds)
+      self.assertAllClose(result, expected, rtol=1e-3)
+
+  def testUpdateSlice(self):
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array([], dtype=dtype),
+              np.array([], dtype=dtype),
+              np.array([0], dtype=np.int32)
+          ],
+          expected=np.array([], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
+              np.array([-1, -2, -3], dtype=dtype),
+              np.array([6], dtype=np.int32)
+          ],
+          expected=np.array([1, 2, 3, 4, 5, 6, -1, -2, -3, 10], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.array([[42, 43], [44, 45]], dtype=dtype),
+              np.array([1, 2], dtype=np.int32)
+          ],
+          expected=np.array(
+              [[1, 2, 3, 4], [5, 6, 42, 43], [9, 10, 44, 45]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.array([[], []], dtype=dtype),
+              np.array([1, 2], dtype=np.int32)
+          ],
+          expected=np.array(
+              [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.ones([3, 4], dtype=dtype),
+              np.array([0, 0], dtype=np.int32)
+          ],
+          expected=np.ones([3, 4], dtype=dtype))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdd0185dfe4abe9d9acecc5381ff82c54b8c0705
--- /dev/null
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -0,0 +1,137 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test cases for eager execution using XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import googletest
+
+
+class EagerTest(XLATestCase):
+
+  def testBasic(self):
+    with self.test_scope():
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = three * five
+      self.assertAllEqual(15, product)
+
+  def testExecuteListOutputLen0(self):
+    with self.test_scope():
+      empty = constant_op.constant([], dtype=dtypes.int32)
+      result = array_ops.unstack(empty, 0)
+      self.assertTrue(isinstance(result, list))
+      self.assertEqual(0, len(result))
+
+  def testExecuteListOutputLen1(self):
+    with self.test_scope():
+      split_dim = constant_op.constant(1)
+      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      result = array_ops.split(value, 1, axis=split_dim)
+      self.assertTrue(isinstance(result, list))
+      self.assertEqual(1, len(result))
+      self.assertAllEqual([[0, 1, 2], [3, 4, 5]], result[0])
+
+  def testExecuteListOutputLen3(self):
+    with self.test_scope():
+      split_dim = constant_op.constant(1)
+      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      result = array_ops.split(value, 3, axis=split_dim)
+      self.assertTrue(isinstance(result, list))
+      self.assertEqual(3, len(result))
+      self.assertAllEqual([[0], [3]], result[0])
+      self.assertAllEqual([[1], [4]], result[1])
+      self.assertAllEqual([[2], [5]], result[2])
+
+  def testBasicGraph(self):
+    # Run some ops eagerly
+    with self.test_scope():
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = three * five
+      self.assertAllEqual(15, product)
+
+    # Run some ops graphly
+    with context.graph_mode(), self.test_session() as sess:
+      with self.test_scope():
+        three = constant_op.constant(3)
+        five = constant_op.constant(5)
+        product = three * five
+        self.assertAllEqual(15, sess.run(product))
+
+  def testDegenerateSlices(self):
+    with self.test_scope():
+      npt = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3)
+      t = constant_op.constant(npt)
+      # degenerate by offering a forward interval with a negative stride
+      self.assertAllEqual(npt[0:-1:-1, :, :], t[0:-1:-1, :, :])
+      # degenerate with a reverse interval with a positive stride
+      self.assertAllEqual(npt[-1:0, :, :], t[-1:0, :, :])
+      # empty interval in every dimension
+      self.assertAllEqual(npt[-1:0, 2:2, 2:3:-1], t[-1:0, 2:2, 2:3:-1])
+
+  def testIdentity(self):
+    with self.test_scope():
+      self.assertAllEqual(2, array_ops.identity(2))
+
+  def testIdentityOnVariable(self):
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(True)
+      i = array_ops.identity(v)
+    self.assertAllEqual(True, i.numpy())
+
+  def testAssignAddVariable(self):
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      v.assign_add(2.0)
+    self.assertEqual(3.0, v.numpy())
+
+  def testGradient(self):
+    def f(x):
+      return x
+
+    with self.test_scope():
+      grad_fn = backprop.gradients_function(f)
+      self.assertAllEqual(2., grad_fn(1., dy=2.)[0])
+
+  def testVariableGradient(self):
+    with self.test_scope():
+      v0 = resource_variable_ops.ResourceVariable(1.0)
+
+      def f():
+        x = v0 * v0
+        return x
+
+      grads = backprop.implicit_grad(f)()
+    self.assertEqual(2., grads[0][0].numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(log_device_placement=True))
+  googletest.main()
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index f9db4cf2017c0b4b6dc0cfeeda6dca7bb9d14f19..8e6407dffdac3adbcda8cbca2109ef9196defa8c 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -134,9 +134,15 @@ class FtrlOptimizerTest(XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.60260963, -4.29698515]), var0.eval(), float_rtol=1e-5)
+            np.array([-2.60260963, -4.29698515]),
+            var0.eval(),
+            float_rtol=1e-5,
+            half_rtol=1e-2)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28432083, -0.56694895]), var1.eval(), float_rtol=1e-5)
+            np.array([-0.28432083, -0.56694895]),
+            var1.eval(),
+            float_rtol=1e-5,
+            half_rtol=1e-2)
 
   def testFtrlwithoutRegularization2(self):
     for dtype in self.float_types:
@@ -272,8 +278,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4)
-    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4, half_rtol=1e-2)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4, half_rtol=1e-2)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index 11d8a99ffe1a136a54b16e20f1792062203f7969..fbc3c994d163a504351fcccd1ba71a0997e6516f 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -105,6 +105,28 @@ class FunctionTest(XLATestCase):
       result = sess.run(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
+  def testCompileTimeConstantsInDefun(self):
+    """Tests that XLA handles compile-time constants in defuns."""
+    with self.test_session() as sess:
+
+      @function.Defun(dtypes.float32, dtypes.int32, dtypes.int32)
+      def Foo(a, c, d):
+        # c and d must be known at compile time
+        x = array_ops.slice(a, c, d)
+        return x
+
+      a = array_ops.placeholder(dtypes.float32)
+      c = array_ops.placeholder(dtypes.int32, shape=[4])
+      d = array_ops.placeholder(dtypes.int32, shape=[4])
+      with self.test_scope():
+        call_f = Foo(a, c, d)
+      result = sess.run(call_f, feed_dict={
+          a: np.ones([1, 4, 4, 1]),
+          c: [0, 0, 0, 0],
+          d: [1, 2, 2, 1]})
+
+    self.assertAllEqual(np.ones([1, 2, 2, 1]), result)
+
   # TODO(b/36139787): Re-enable this test when noinline works again.
   def DISABLED_testFunctionsNoInline(self):
 
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 538fa8e8e570b83ed681ecc0501285520cabdecb..42e637734c578fcc70473060cb156e172a0a1995 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -34,6 +34,13 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
 
 
+def GenerateNumpyRandomRGB(shape):
+  # Only generate floating points that are fractions like n / 256, since they
+  # are RGB pixels. Some low-precision floating point types in this test can't
+  # handle arbitrary precision floating points well.
+  return np.random.randint(0, 256, shape) / 256.
+
+
 class RGBToHSVTest(XLATestCase):
 
   def testBatch(self):
@@ -43,7 +50,7 @@ class RGBToHSVTest(XLATestCase):
     shape = (batch_size, 2, 7, 3)
 
     for nptype in self.float_types:
-      inp = np.random.rand(*shape).astype(nptype)
+      inp = GenerateNumpyRandomRGB(shape).astype(nptype)
 
       # Convert to HSV and back, as a batch and individually
       with self.test_session() as sess:
@@ -65,7 +72,8 @@ class RGBToHSVTest(XLATestCase):
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
       self.assertAllClose(batch2, join2)
-      self.assertAllCloseAccordingToType(batch2, inp, bfloat16_atol=0.03)
+      self.assertAllCloseAccordingToType(
+          batch2, inp, bfloat16_atol=0.03, half_rtol=0.02)
 
   def testRGBToHSVRoundTrip(self):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
@@ -82,7 +90,7 @@ class RGBToHSVTest(XLATestCase):
   def testRGBToHSVNumpy(self):
     """Tests the RGB to HSV conversion matches a reference implementation."""
     for nptype in self.float_types:
-      rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype)
+      rgb_flat = GenerateNumpyRandomRGB((64, 3)).astype(nptype)
       rgb_np = rgb_flat.reshape(4, 4, 4, 3)
       hsv_np = np.array([
           colorsys.rgb_to_hsv(
@@ -426,7 +434,7 @@ class ResizeBilinearTest(XLATestCase):
     with self.test_session() as sess, self.test_scope():
       dtype = dtype or np.float32
       grads = array_ops.placeholder(np.float32)
-      resized = gen_image_ops._resize_bilinear_grad(
+      resized = gen_image_ops.resize_bilinear_grad(
           grads,
           np.zeros([1, input_shape[0], input_shape[1], 1], dtype=dtype),
           align_corners=True)
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index 2d8236e2cbdfafb35626cd582ee39b1f917aec7f..1ad83d80409734efd1f5a0a9fc39f5b7d064d54b 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -18,25 +18,41 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
 from tensorflow.contrib.compiler import jit
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 jit_scope = jit.experimental_jit_scope
 
 
+# Disable rewrites to make sure we don't end up having to update this test
+# whenever we implement new ones.
+def NoRewriteSessionConfig():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      function_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
 def CompiledKernel(fn, *inputs, **kwargs):
   """Execute 'fn' as a compiled XLA kernel, with 'inputs'."""
   name = kwargs.pop("name", None)
@@ -80,7 +96,7 @@ class JitLaunchTest(test.TestCase):
   # actually ran. However, it is sometimes possible for _XlaLaunch ops to be
   # constant-folded away, so the check is optional.
   def _compare(self, fn, args, require_kernel_launch=True, noinline=None):
-    with session_lib.Session() as sess:
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
       placeholders = []
       feeds = {}
       for arg in args:
@@ -257,7 +273,7 @@ class XlaCompilationTest(test.TestCase):
   def testReshape(self):
     """Tests an operator with compile-time constant and non-constant inputs."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -281,7 +297,7 @@ class XlaCompilationTest(test.TestCase):
   def testIgnoredArguments(self):
     """Tests that JIT computations can ignore formal parameters."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.int32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -305,7 +321,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoops(self):
     """Tests that compilation accepts computations containing loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         c = lambda i, _: math_ops.less(i, 5)
@@ -323,7 +339,7 @@ class XlaCompilationTest(test.TestCase):
   def testCond(self):
     """Tests that compilation handles switch operators."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.float32)
       c = array_ops.placeholder(dtypes.bool)
@@ -364,7 +380,8 @@ class XlaCompilationTest(test.TestCase):
       inp = array_ops.placeholder(dtypes.float32)
       out = Entry(inp)
 
-    with self.test_session(graph=g, use_gpu=True) as sess:
+    with self.test_session(
+        config=NoRewriteSessionConfig(), graph=g, use_gpu=True) as sess:
       run_metadata = config_pb2.RunMetadata()
       val = sess.run(out,
                      feed_dict={inp: [2., 10.]},
@@ -376,7 +393,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoopDeadlock(self):
     """Regression test for bug that caused deadlocks in graphs with loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         y = x + 1.0
@@ -403,10 +420,10 @@ class XlaCompilationTest(test.TestCase):
         y = Forward(x)
         dx, = gradients_impl.gradients(y, [x], 1.0)
 
-      cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-          optimizer_options=config_pb2.OptimizerOptions(
-              opt_level=config_pb2.OptimizerOptions.L1,
-              do_function_inlining=True)))
+      cfg = NoRewriteSessionConfig()
+      cfg.graph_options.optimizer_options.opt_level = (
+          config_pb2.OptimizerOptions.L1)
+      cfg.graph_options.optimizer_options.do_function_inlining = True
       with session_lib.Session(graph=g, config=cfg) as sess:
         run_metadata = config_pb2.RunMetadata()
         dx_val = sess.run(dx,
@@ -435,6 +452,74 @@ class XlaCompilationTest(test.TestCase):
     self.assertFalse(InLabels(labels, "Mul"))
     self.assertTrue(InLabels(labels, "_XlaLaunch"))
 
+  def testDenseLayer(self):
+    """Tests that the dense layer node is properly compiled."""
+
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
+      x = array_ops.placeholder(shape=[2, 3], dtype=np.float32)
+      with jit_scope():
+        y = layers.dense(x, 3)
+
+      sess.run(variables.initialize_all_variables())
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(y, {x: np.array([[1, 2, 3], [4, 5, 6]])},
+               run_metadata=run_metadata,
+               options=config_pb2.RunOptions(
+                   trace_level=config_pb2.RunOptions.FULL_TRACE))
+
+    self.assert_(MetadataHasXlaLaunch(run_metadata))
+
+
+class ElementWiseFusionTest(test.TestCase):
+
+  # Runs a simple test with the input jit_level and fusion_only flag.
+  def simpleTest(self, arg0, arg1, global_jit_level):
+    config = config_pb2.ConfigProto()
+    config.graph_options.optimizer_options.global_jit_level = global_jit_level
+
+    with session_lib.Session(config=config) as sess:
+      a1 = array_ops.placeholder(dtypes.float32, [2, 2], name="a1")
+      a2 = array_ops.placeholder(dtypes.float32, [2, 2], name="a2")
+      # Two element-wise ops. We need at least two ops since single
+      # element clusters are not passed to XLA in fusion_only mode.
+      a3 = a1 * a2
+      a4 = a3 + a1
+      # A matmul to break XLA clustering.
+      a5 = math_ops.matmul(a4, a1)
+      # Two more element-wise ops.
+      a6 = a5 - a4
+      a7 = a6 + a2
+
+      run_metadata = config_pb2.RunMetadata()
+      output = sess.run(
+          a7, {
+              a1: arg0,
+              a2: arg1
+          },
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+
+      labels = RunMetadataLabels(run_metadata)
+      count = sum("_XlaLaunch(" in x for x in labels)
+
+      return output, count
+
+  def testElementWiseClustering(self):
+    arg0 = np.random.rand(2, 2).astype(np.float32)
+    arg1 = np.random.rand(2, 2).astype(np.float32)
+    os.environ["TF_XLA_FLAGS"] = ("--tf_xla_fusion_only=true "
+                                  "--tf_xla_cpu_global_jit")
+    tf_op, tf_count = self.simpleTest(arg0, arg1,
+                                      config_pb2.OptimizerOptions.OFF)
+    self.assertEqual(0, tf_count)
+
+    tfef_op, tfef_count = self.simpleTest(arg0, arg1,
+                                          config_pb2.OptimizerOptions.ON_1)
+    self.assertEqual(2, tfef_count)
+
+    self.assertAllClose(tf_op, tfef_op, rtol=1e-1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py
index 5d8d89224d4a778d84803811710bb095872e86b2..69bd8f7230d4394c45764d02a88fb0ec097c5756 100644
--- a/tensorflow/compiler/tests/lrn_ops_test.py
+++ b/tensorflow/compiler/tests/lrn_ops_test.py
@@ -115,11 +115,11 @@ class LRNTest(XLATestCase):
       out_image = constant_op.constant(out_image_vals, shape=shape)
       out_grads = constant_op.constant(out_grads_vals, shape=shape)
       with ops.device(CPU_DEVICE):
-        expected = gen_nn_ops._lrn_grad(out_grads, in_image, out_image,
-                                        depth_radius, bias, alpha, beta)
+        expected = gen_nn_ops.lrn_grad(out_grads, in_image, out_image,
+                                       depth_radius, bias, alpha, beta)
       with self.test_scope():
-        actual = gen_nn_ops._lrn_grad(out_grads, in_image, out_image,
-                                      depth_radius, bias, alpha, beta)
+        actual = gen_nn_ops.lrn_grad(out_grads, in_image, out_image,
+                                     depth_radius, bias, alpha, beta)
       expected_val = expected.eval()
       actual_val = actual.eval()
     self.assertAllClose(actual_val, expected_val, rtol=1e-3)
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
index cccb7f5789dce39ef8c3d4b3a7573aaa983b3fbd..5819b2bf2b55b9213a039c0ba82dd0bf1c738b00 100644
--- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -37,6 +37,14 @@ def MakePlaceholder(x):
 
 class MatrixTriangularSolveOpTest(XLATestCase):
 
+  #  MatrixTriangularSolve defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/matrix_triangular_solve)
+  @property
+  def float_types(self):
+    return set(super(MatrixTriangularSolveOpTest,
+                     self).float_types).intersection(
+                         (np.float64, np.float32, np.complex64, np.complex128))
+
   def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
                                  placeholder_b, a, clean_a, b, verification,
                                  atol):
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1434e965e3d7eaeca94ad0fa97498f884e30e115
--- /dev/null
+++ b/tensorflow/compiler/tests/oom_test.py
@@ -0,0 +1,61 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for out-of-memory conditions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class OutOfMemoryTest(xla_test.XLATestCase):
+
+  def testOutputOutOfMemory(self):
+    """Allocates tensors until out of memory.
+
+    Generates a large rank-1 tensor. The tensor is an output of an XLA
+    computation, not constant.
+
+    Check that a ResourceExhaustedError is raised and can be caught.
+
+    We spin in a loop generating larger and larger tensors until an OOM event
+    happens. We may be running sandboxed, so have a small host memory limit, so
+    any hardcoded value is unlikely to land in the sweet spot between device
+    memory size and host memory size with stability.
+    """
+
+    def test_loop():
+      size = 2e8
+      while True:
+        with self.test_session():
+          # Force the compiled code to not be constant by feeding in an addend.
+          p = array_ops.placeholder(dtypes.float32, shape=[])
+          with self.test_scope():
+            # Create a large R1 tensor.
+            c = array_ops.zeros([size, 1]) + p
+
+            c.eval(feed_dict={p: 1.0})
+            size *= 2
+
+    self.assertRaises(errors.ResourceExhaustedError, test_loop)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e6d1313bd0336eba71fcf3658d949bd3342ae11
--- /dev/null
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -0,0 +1,48 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for xla handling of placeholder_with_default."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class PlaceholderTest(XLATestCase):
+
+  def test_placeholder_with_default_default(self):
+    with self.test_session() as sess, self.test_scope():
+      v = resource_variable_ops.ResourceVariable(4.0)
+      ph = array_ops.placeholder_with_default(v, shape=[])
+      out = ph * 2
+      sess.run(variables.variables_initializer([v]))
+      self.assertEqual(8.0, sess.run(out))
+
+  def test_placeholder_with_default_fed(self):
+    with self.test_session() as sess, self.test_scope():
+      v = resource_variable_ops.ResourceVariable(4.0)
+      ph = array_ops.placeholder_with_default(v, shape=[])
+      out = ph * 2
+      sess.run(variables.variables_initializer([v]))
+      self.assertEqual(2.0, sess.run(out, {ph: 1.0}))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py
index eb48fe555a0b182ea7983cbd8c3b217d56350408..4eed903963a34a253ea5c409782d9a89a97a4fdf 100644
--- a/tensorflow/compiler/tests/pooling_ops_3d_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.platform import test
 # MaxPoolGrad.
 def _AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding):
   del outputs  # Unused by average-pooling gradients.
-  return gen_nn_ops._avg_pool3d_grad(
+  return gen_nn_ops.avg_pool3d_grad(
       inputs.get_shape().as_list(),
       output_gradients,
       ksize=ksize,
@@ -263,7 +263,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradValidPadding1_1_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[1, 3, 3, 3, 1],
         ksize=[1, 1, 1],
         strides=[1, 1, 1],
@@ -272,7 +272,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradValidPadding2_1_6_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[2, 3, 3, 6, 3],
         ksize=[2, 2, 2],
         strides=[1, 1, 1],
@@ -281,7 +281,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradValidPadding2_1_7_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[2, 3, 5, 7, 3],
         ksize=[2, 2, 2],
         strides=[1, 1, 1],
@@ -290,7 +290,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradValidPadding2_2_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[2, 2, 2, 2, 3],
         ksize=[2, 2, 2],
         strides=[2, 2, 2],
@@ -299,7 +299,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradSamePadding1_1_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[2, 3, 2, 4, 1],
         ksize=[1, 1, 1],
         strides=[1, 1, 1],
@@ -308,7 +308,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradSamePadding2_1_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[2, 3, 2, 4, 1],
         ksize=[2, 2, 2],
         strides=[1, 1, 1],
@@ -317,7 +317,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradSamePadding2_2_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[2, 5, 2, 4, 3],
         ksize=[2, 2, 2],
         strides=[2, 2, 2],
@@ -326,7 +326,7 @@ class Pooling3DTest(XLATestCase):
   def testMaxPoolGradSamePadding3_1_3d(self):
     self._VerifyGradient(
         nn_ops.max_pool3d,
-        gen_nn_ops._max_pool3d_grad,
+        gen_nn_ops.max_pool3d_grad,
         input_sizes=[1, 3, 3, 7, 1],
         ksize=[3, 3, 3],
         strides=[1, 1, 1],
diff --git a/tensorflow/compiler/tests/pooling_ops_test.py b/tensorflow/compiler/tests/pooling_ops_test.py
index 7c19a99c4eb4be3ca34b3ce949216e557b0a681d..fe270af3d636c0824621f36360ce9e7d14d8fc91 100644
--- a/tensorflow/compiler/tests/pooling_ops_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_test.py
@@ -292,8 +292,15 @@ class PoolGradTest(XLATestCase):
 
   CPU_DEVICE = "/job:localhost/replica:0/task:0/cpu:0"
 
-  def _VerifyOneTest(self, pool_func, pool_grad_func, input_sizes, ksize,
-                     strides, padding, data_format):
+  def _VerifyOneTest(self,
+                     pool_func,
+                     pool_grad_func,
+                     input_sizes,
+                     ksize,
+                     strides,
+                     padding,
+                     data_format,
+                     pool_grad_grad_func=None):
     """Verifies the output values of the pooling gradient function.
 
     Args:
@@ -304,9 +311,19 @@ class PoolGradTest(XLATestCase):
       strides: The stride dimensions
       padding: Padding type.
       data_format: The data format we use to run the pooling operation.
+      pool_grad_grad_func: Second-order gradient function, if available.
     """
     total_size = np.prod(input_sizes)
-    x = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_sizes)
+    # TODO(b/73062247): MaxPoolGradGrad can confuse gradients when x is equally
+    # maximal at 16 bits. Switch to np.random.randn when resolved.
+    x = np.arange(1, total_size + 1, dtype=np.float32)
+    x *= (np.random.randint(2, size=total_size) * 2 - 1)  # Flip signs randomly
+    # Verify some specifically interesting values...
+    x[np.random.choice(total_size)] = np.inf
+    x[np.random.choice(total_size)] = -np.inf
+    # TODO(b/74222344): Fix nan handling for max pool grad.
+    # x[np.random.choice(total_size)] = np.nan
+    x = x.reshape(input_sizes)
     with self.test_session() as sess:
       # Use the forward pool function to compute some corresponding outputs
       # (needed for the CPU device, and we need the shape in both cases).
@@ -323,6 +340,8 @@ class PoolGradTest(XLATestCase):
       output_gradient_vals = np.arange(
           1, output_vals.size + 1, dtype=np.float32)
       output_gradient_vals = output_gradient_vals.reshape(output_vals.shape)
+      output_grad_grad_vals = np.arange(1, x.size + 1, dtype=np.float32)
+      output_grad_grad_vals = output_grad_grad_vals.reshape(x.shape)
 
       # Use the Tensorflow CPU pooling gradient to compute the expected input
       # gradients.
@@ -342,18 +361,36 @@ class PoolGradTest(XLATestCase):
             {inputs: x,
              output_gradients: output_gradient_vals})
 
+        output_grad_gradients = array_ops.placeholder(
+            dtypes.float32, shape=expected_input_gradient_vals.shape)
+        if pool_grad_grad_func is not None:
+          expected_grad_gradients = pool_grad_grad_func(
+              inputs,
+              outputs,
+              output_grad_gradients,
+              ksize=ksize,
+              strides=strides,
+              padding=padding,
+              data_format="NHWC")
+          expected_grad_gradients_vals = sess.run(expected_grad_gradients, {
+              inputs: x,
+              output_grad_gradients: output_grad_grad_vals
+          })
+
       # Run the gradient op on the XLA device
       with self.test_scope():
         outputs = array_ops.placeholder(dtypes.float32, shape=output_vals.shape)
         xla_inputs = inputs
         xla_outputs = outputs
         xla_output_gradients = output_gradients
+        xla_output_grad_gradients = output_grad_gradients
         xla_ksize = ksize
         xla_strides = strides
         if data_format == "NCHW":
           xla_inputs = NHWCToNCHW(inputs)
           xla_outputs = NHWCToNCHW(outputs)
           xla_output_gradients = NHWCToNCHW(output_gradients)
+          xla_output_grad_gradients = NHWCToNCHW(output_grad_gradients)
           xla_ksize = NHWCToNCHW(ksize)
           xla_strides = NHWCToNCHW(strides)
         actual_input_gradients = pool_grad_func(
@@ -366,22 +403,54 @@ class PoolGradTest(XLATestCase):
             data_format=data_format)
         if data_format == "NCHW":
           actual_input_gradients = NCHWToNHWC(actual_input_gradients)
-      actual = sess.run(actual_input_gradients, {
+        if pool_grad_grad_func is not None:
+          actual_grad_gradients = pool_grad_grad_func(
+              xla_inputs,
+              xla_outputs,
+              xla_output_grad_gradients,
+              ksize=xla_ksize,
+              strides=xla_strides,
+              padding=padding,
+              data_format=data_format)
+          if data_format == "NCHW":
+            actual_grad_gradients = NCHWToNHWC(actual_grad_gradients)
+      actual_input_gradients_vals = sess.run(actual_input_gradients, {
           inputs: x,
           outputs: output_vals,
           output_gradients: output_gradient_vals
       })
-
       # Compare the Tensorflow and XLA results.
       self.assertAllClose(
-          expected_input_gradient_vals.flatten(),
-          actual.flatten(),
+          expected_input_gradient_vals,
+          actual_input_gradients_vals,
           rtol=1e-4,
           atol=1e-6)
-      self.assertShapeEqual(actual, inputs)
-
-  def _VerifyValues(self, pool_func, pool_grad_func, input_sizes, ksize,
-                    strides, padding):
+      self.assertShapeEqual(actual_input_gradients_vals, inputs)
+
+      if pool_grad_grad_func is not None:
+        actual_grad_gradients_vals = sess.run(
+            actual_grad_gradients, {
+                inputs: x,
+                outputs: output_vals,
+                output_grad_gradients: output_grad_grad_vals
+            })
+
+        # Compare the Tensorflow and XLA results.
+        self.assertAllClose(
+            expected_grad_gradients_vals,
+            actual_grad_gradients_vals,
+            rtol=1e-4,
+            atol=1e-6)
+        self.assertShapeEqual(actual_grad_gradients_vals, outputs)
+
+  def _VerifyValues(self,
+                    pool_func,
+                    pool_grad_func,
+                    input_sizes,
+                    ksize,
+                    strides,
+                    padding,
+                    pool_grad_grad_func=None):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -391,12 +460,20 @@ class PoolGradTest(XLATestCase):
       ksize: The kernel size dimensions
       strides: The stride dimensions
       padding: Padding type.
+      pool_grad_grad_func: Second-order gradient function, if available.
     """
     for data_format in GetTestConfigs():
-      self._VerifyOneTest(pool_func, pool_grad_func, input_sizes, ksize,
-                          strides, padding, data_format)
-
-  def _TestPooling(self, forward_op, backward_op):
+      self._VerifyOneTest(
+          pool_func,
+          pool_grad_func,
+          input_sizes,
+          ksize,
+          strides,
+          padding,
+          data_format,
+          pool_grad_grad_func=pool_grad_grad_func)
+
+  def _TestPooling(self, forward_op, backward_op, pool_grad_grad_func=None):
     # VALID padding
     self._VerifyValues(
         forward_op,
@@ -404,7 +481,8 @@ class PoolGradTest(XLATestCase):
         input_sizes=[1, 3, 3, 3],
         ksize=[1, 2, 2, 1],
         strides=[1, 2, 2, 1],
-        padding="VALID")
+        padding="VALID",
+        pool_grad_grad_func=pool_grad_grad_func)
 
     # SAME padding
     self._VerifyValues(
@@ -413,7 +491,8 @@ class PoolGradTest(XLATestCase):
         input_sizes=[1, 2, 3, 3],
         ksize=[1, 2, 2, 1],
         strides=[1, 2, 2, 1],
-        padding="SAME")
+        padding="SAME",
+        pool_grad_grad_func=pool_grad_grad_func)
 
     # SAME padding, non square window
     self._VerifyValues(
@@ -422,7 +501,8 @@ class PoolGradTest(XLATestCase):
         input_sizes=[1, 2, 2, 1],
         ksize=[1, 1, 2, 1],
         strides=[1, 1, 1, 1],
-        padding="SAME")
+        padding="SAME",
+        pool_grad_grad_func=pool_grad_grad_func)
 
     # VALID padding, uneven stride
     self._VerifyValues(
@@ -431,14 +511,16 @@ class PoolGradTest(XLATestCase):
         input_sizes=[1, 4, 4, 1],
         ksize=[1, 2, 2, 1],
         strides=[1, 1, 2, 1],
-        padding="VALID")
+        padding="VALID",
+        pool_grad_grad_func=pool_grad_grad_func)
     self._VerifyValues(
         forward_op,
         backward_op,
         input_sizes=[1, 4, 4, 1],
         ksize=[1, 2, 2, 1],
         strides=[1, 2, 1, 1],
-        padding="VALID")
+        padding="VALID",
+        pool_grad_grad_func=pool_grad_grad_func)
 
     # SAME padding, size 4 input
     self._VerifyValues(
@@ -447,7 +529,8 @@ class PoolGradTest(XLATestCase):
         input_sizes=[1, 4, 4, 4],
         ksize=[1, 2, 2, 1],
         strides=[1, 2, 2, 1],
-        padding="SAME")
+        padding="SAME",
+        pool_grad_grad_func=pool_grad_grad_func)
 
     # SAME padding, size 8 input
     self._VerifyValues(
@@ -456,10 +539,14 @@ class PoolGradTest(XLATestCase):
         input_sizes=[1, 8, 8, 8],
         ksize=[1, 3, 3, 1],
         strides=[1, 2, 2, 1],
-        padding="SAME")
+        padding="SAME",
+        pool_grad_grad_func=pool_grad_grad_func)
 
   def testMaxPool(self):
-    self._TestPooling(nn_ops.max_pool, gen_nn_ops._max_pool_grad)
+    self._TestPooling(
+        nn_ops.max_pool,
+        gen_nn_ops.max_pool_grad,
+        pool_grad_grad_func=gen_nn_ops.max_pool_grad_grad)
 
   def testAvgPool(self):
     # Wrapper around AvgPoolGrad that ignores extra arguments needed by
@@ -467,7 +554,7 @@ class PoolGradTest(XLATestCase):
     def AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding,
                     data_format):
       del outputs  # Unused by average-pooling gradients.
-      return gen_nn_ops._avg_pool_grad(
+      return gen_nn_ops.avg_pool_grad(
           inputs.get_shape().as_list(),
           output_gradients,
           ksize=ksize,
@@ -483,7 +570,7 @@ class PoolGradTest(XLATestCase):
   def testMaxPoolKernelSmallerThanStrideValid(self):
     self._VerifyValues(
         nn_ops.max_pool,
-        gen_nn_ops._max_pool_grad,
+        gen_nn_ops.max_pool_grad,
         input_sizes=[1, 7, 7, 1],
         ksize=[1, 2, 2, 1],
         strides=[1, 3, 3, 1],
@@ -492,7 +579,7 @@ class PoolGradTest(XLATestCase):
   def testMaxPoolKernelSmallerThanStrideSame(self):
     self._VerifyValues(
         nn_ops.max_pool,
-        gen_nn_ops._max_pool_grad,
+        gen_nn_ops.max_pool_grad,
         input_sizes=[1, 3, 3, 1],
         ksize=[1, 1, 1, 1],
         strides=[1, 2, 2, 1],
@@ -500,7 +587,7 @@ class PoolGradTest(XLATestCase):
 
     self._VerifyValues(
         nn_ops.max_pool,
-        gen_nn_ops._max_pool_grad,
+        gen_nn_ops.max_pool_grad,
         input_sizes=[1, 4, 4, 1],
         ksize=[1, 1, 1, 1],
         strides=[1, 2, 2, 1],
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index e72dd4eea9f127e1df96ab166103c4c16372adb6..e53efc3091d8935e745122af29abd7b8063b1d01 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -83,8 +83,8 @@ string LocalDeviceToFullDeviceName(const string& device) {
   return strings::StrCat("/job:localhost/replica:0/task:0/device:", device);
 }
 
-constexpr std::array<DataType, 4> kAllXlaTypes = {
-    {DT_INT32, DT_FLOAT, DT_BOOL, DT_COMPLEX64}};
+constexpr std::array<DataType, 5> kAllXlaTypes = {
+    {DT_INT32, DT_FLOAT, DT_BOOL, DT_COMPLEX64, DT_INT64}};
 
 // An OpTestBuilder is a graph builder class that takes as input an operator to
 // test, its inputs and attributes, and builds a graph that executes the
diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py
index 965fdf684b973498d0b3c3cde17711cca7279705..2c084b04fa2f67ad0d86508109522d7bead206eb 100644
--- a/tensorflow/compiler/tests/reduce_ops_test.py
+++ b/tensorflow/compiler/tests/reduce_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
@@ -30,8 +31,13 @@ from tensorflow.python.platform import googletest
 
 class ReduceOpsTest(XLATestCase):
 
-  def _testReduction(self, tf_reduce_fn, np_reduce_fn, dtype, test_inputs,
-                     rtol=1e-4, atol=1e-4):
+  def _testReduction(self,
+                     tf_reduce_fn,
+                     np_reduce_fn,
+                     dtype,
+                     test_inputs,
+                     rtol=1e-4,
+                     atol=1e-4):
     """Tests that the output of 'tf_reduce_fn' matches numpy's output."""
 
     for test_input in test_inputs:
@@ -41,16 +47,16 @@ class ReduceOpsTest(XLATestCase):
           index = array_ops.placeholder(dtypes.int32)
           out = tf_reduce_fn(a, index)
         result = sess.run(out, {a: test_input, index: [0]})
-        self.assertAllClose(result, np_reduce_fn(test_input, axis=0),
-                            rtol=rtol, atol=atol)
+        self.assertAllClose(
+            result, np_reduce_fn(test_input, axis=0), rtol=rtol, atol=atol)
 
         result = sess.run(out, {a: test_input, index: [1]})
-        self.assertAllClose(result, np_reduce_fn(test_input, axis=1),
-                            rtol=rtol, atol=atol)
+        self.assertAllClose(
+            result, np_reduce_fn(test_input, axis=1), rtol=rtol, atol=atol)
 
         result = sess.run(out, {a: test_input, index: [-1]})
-        self.assertAllClose(result, np_reduce_fn(test_input, axis=1),
-                            rtol=rtol, atol=atol)
+        self.assertAllClose(
+            result, np_reduce_fn(test_input, axis=1), rtol=rtol, atol=atol)
 
         with self.assertRaisesWithPredicateMatch(
             errors_impl.InvalidArgumentError, 'Invalid reduction dim'):
@@ -60,7 +66,7 @@ class ReduceOpsTest(XLATestCase):
             errors_impl.InvalidArgumentError, 'Invalid reduction dim'):
           sess.run(out, {a: test_input, index: [2]})
 
-  FLOAT_DATA = [
+  REAL_DATA = [
       np.zeros(shape=(2, 0)),
       np.zeros(shape=(0, 30)),
       np.arange(1, 7).reshape(2, 3),
@@ -74,7 +80,7 @@ class ReduceOpsTest(XLATestCase):
       np.arange(-14, -2, dtype=np.float32).view(np.complex64).reshape(2, 3),
       np.arange(-4, 8, dtype=np.float32).view(np.complex64).reshape(2, 3),
   ]
-  NONEMPTY_FLOAT_DATA = [x for x in FLOAT_DATA if np.size(x) > 0]
+  NONEMPTY_REAL_DATA = [x for x in REAL_DATA if np.size(x) > 0]
   NONEMPTY_COMPLEX_DATA = [x for x in COMPLEX_DATA if np.size(x) > 0]
   BOOL_DATA = [
       np.array([], dtype=np.bool).reshape(2, 0),
@@ -83,8 +89,7 @@ class ReduceOpsTest(XLATestCase):
   ]
 
   def testReduceSumF32(self):
-    self._testReduction(math_ops.reduce_sum, np.sum, np.float32,
-                        self.FLOAT_DATA)
+    self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA)
 
   def testReduceSumC64(self):
     self._testReduction(math_ops.reduce_sum, np.sum, np.complex64,
@@ -92,7 +97,7 @@ class ReduceOpsTest(XLATestCase):
 
   def testReduceProdF32(self):
     self._testReduction(math_ops.reduce_prod, np.prod, np.float32,
-                        self.FLOAT_DATA)
+                        self.REAL_DATA)
 
   def testReduceProdC64(self):
     self._testReduction(math_ops.reduce_prod, np.prod, np.complex64,
@@ -100,31 +105,44 @@ class ReduceOpsTest(XLATestCase):
 
   def testReduceMin(self):
 
-    def reference_min(inp, axis):
+    def reference_min(dtype, inp, axis):
       """Wrapper around np.amin that returns +infinity for an empty input."""
       if inp.shape[axis] == 0:
-        return np.full(inp.shape[0:axis] + inp.shape[axis + 1:], float('inf'))
+        if np.issubdtype(dtype, np.floating):
+          return np.full(inp.shape[0:axis] + inp.shape[axis + 1:], float('inf'))
+        return np.full(inp.shape[0:axis] + inp.shape[axis + 1:],
+                       np.iinfo(dtype).max)
       return np.amin(inp, axis)
 
-    self._testReduction(math_ops.reduce_min, reference_min, np.float32,
-                        self.FLOAT_DATA)
+    for dtype in set(self.all_types).intersection(
+        [np.float32, np.int32, np.int64]):
+      self._testReduction(math_ops.reduce_min,
+                          functools.partial(reference_min, dtype), dtype,
+                          self.REAL_DATA)
 
   def testReduceMax(self):
 
-    def reference_max(inp, axis):
+    def reference_max(dtype, inp, axis):
       """Wrapper around np.amax that returns -infinity for an empty input."""
       if inp.shape[axis] == 0:
-        return np.full(inp.shape[0:axis] + inp.shape[axis + 1:], float('-inf'))
+        if np.issubdtype(dtype, np.floating):
+          return np.full(inp.shape[0:axis] + inp.shape[axis + 1:],
+                         float('-inf'))
+        return np.full(inp.shape[0:axis] + inp.shape[axis + 1:],
+                       np.iinfo(dtype).min)
       return np.amax(inp, axis)
 
-    self._testReduction(math_ops.reduce_max, reference_max, np.float32,
-                        self.FLOAT_DATA)
+    for dtype in set(self.all_types).intersection(
+        [np.float32, np.int32, np.int64]):
+      self._testReduction(math_ops.reduce_max,
+                          functools.partial(reference_max, dtype), dtype,
+                          self.REAL_DATA)
 
   def testReduceMeanF32(self):
     # TODO(phawkins): mean on XLA currently returns 0 instead of NaN when
     # reducing across zero inputs.
     self._testReduction(math_ops.reduce_mean, np.mean, np.float32,
-                        self.NONEMPTY_FLOAT_DATA)
+                        self.NONEMPTY_REAL_DATA)
 
   def testReduceMeanC64(self):
     self._testReduction(math_ops.reduce_mean, np.mean, np.complex64,
diff --git a/tensorflow/compiler/tests/reduce_window_test.py b/tensorflow/compiler/tests/reduce_window_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78a63465b80644d8810d9fa7433653bc4639fed
--- /dev/null
+++ b/tensorflow/compiler/tests/reduce_window_test.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for xla.reduce_window."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class ReduceWindowTest(XLATestCase):
+  """Test cases for xla.reduce_window."""
+
+  def _reduce_window(self, operand, init, reducer, **kwargs):
+    with self.test_session():
+      placeholder = array_ops.placeholder(operand.dtype)
+      with self.test_scope():
+        output = xla.reduce_window(placeholder, init, reducer, **kwargs)
+      return output.eval(feed_dict={placeholder: operand})
+
+  def testReduceWindow(self):
+
+    # TODO(b/77644762): float16 and float64 ReduceWindow are unimplemented.
+    for dtype in set(self.numeric_types).intersection(
+        set([dtypes.bfloat16.as_numpy_dtype, np.float32])):
+
+      @function.Defun(dtype, dtype)
+      def sum_reducer(x, y):
+        return x + y
+
+      @function.Defun(dtype, dtype)
+      def mul_reducer(x, y):
+        return x * y
+
+      self.assertAllClose(
+          np.array([3, 5, 7, 9, 11, 13], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2]))
+
+      self.assertAllClose(
+          np.array([3, 7, 11], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2],
+              window_strides=[2]))
+
+      self.assertAllClose(
+          np.array([1, 4, 7], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[1],
+              window_strides=[3]))
+
+      self.assertAllClose(
+          np.array([[24, 36, 24], [96, 0, 0]], dtype=dtype),
+          self._reduce_window(
+              np.array([[1, 2, 3, 4], [4, 3, 2, 1], [2, 4, 0, 1]], dtype=dtype),
+              1.0,
+              mul_reducer,
+              window_dimensions=[2, 2],
+              window_strides=[1, 1]))
+
+      self.assertAllClose(
+          np.array([[0, 0, 0], [5, 10, 5], [2, 4, 1], [0, 0, 0]], dtype=dtype),
+          self._reduce_window(
+              np.array([[1, 2, 3, 4], [4, 3, 2, 1], [2, 4, 0, 1]], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2, 2],
+              window_strides=[2, 2],
+              padding=[[2, 3], [1, 2]]))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py
index a7cbfb04003c397212a35e16c6b23d7c2a18f7df..305ca0c6b78d3ef985deb38816f9388e7983906b 100644
--- a/tensorflow/compiler/tests/slice_ops_test.py
+++ b/tensorflow/compiler/tests/slice_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import googletest
 
@@ -137,6 +138,34 @@ class StridedSliceTest(XLATestCase):
 
         self.assertAllEqual([6, 4], result)
 
+  def test2DDegenerate(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[2, 3])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [-1, 0], [0, 3])
+        params = {
+            i: [[0, 1, 2],
+                [3, 4, 5]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertEqual(tensor_shape.TensorShape((0, 3)), result.shape)
+
+  def test2DDegenerateNegativeStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[2, 3])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [0, 0], [-1, 3], [-1, 1])
+        params = {
+            i: [[0, 1, 2],
+                [3, 4, 5]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertEqual(tensor_shape.TensorShape((0, 3)), result.shape)
+
   def test3D(self):
     for dtype in self.numeric_types:
       with self.test_session():
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index c013f4b50a4cf95be8028248c52b10b1c3be2bd3..f37c34156f96761632247be4bc1b62fca54f666e 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.platform import test
@@ -75,11 +76,11 @@ class SpaceToBatchTest(XLATestCase):
       for dtype in self.float_types:
         # outputs = space_to_batch(inputs)
         placeholder = array_ops.placeholder(dtype)
-        x_tf = gen_array_ops._space_to_batch(
+        x_tf = gen_array_ops.space_to_batch(
             placeholder, paddings, block_size=block_size)
         self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs)
         # inputs = batch_to_space(outputs)
-        x_tf = gen_array_ops._batch_to_space(
+        x_tf = gen_array_ops.batch_to_space(
             placeholder, paddings, block_size=block_size)
         self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs)
 
@@ -156,14 +157,32 @@ class SpaceToBatchNDTest(XLATestCase):
     paddings = np.array(paddings).reshape((len(block_shape), 2))
     with self.test_session() as sess, self.test_scope():
       for dtype in self.float_types:
+        # TODO(b/68813416): Skip bfloat16's as the input type for direct is
+        # float32 and results in a mismatch, while making testDirect provide the
+        # correctly typed input results in 'no fill-function for data-type'
+        # error.
+        if dtype == dtypes.bfloat16.as_numpy_dtype:
+          continue
+        if dtype == np.float16:
+          actual_inputs = np.array(inputs).astype(dtype)
+          actual_paddings = np.array(paddings).astype(dtype)
+          expected_outputs = np.array(outputs).astype(dtype)
+        else:
+          actual_inputs = inputs
+          actual_paddings = paddings
+          expected_outputs = outputs
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
-        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
-        self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs)
+        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape,
+                                           actual_paddings)
+        self.assertAllEqual(
+            sess.run(x_tf, {placeholder: actual_inputs}), expected_outputs)
         # inputs = batch_to_space(outputs)
         placeholder = array_ops.placeholder(dtype)
-        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape, paddings)
-        self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs)
+        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape,
+                                           actual_paddings)
+        self.assertAllEqual(
+            sess.run(x_tf, {placeholder: expected_outputs}), actual_inputs)
 
   def _testDirect(self, input_shape, block_shape, paddings):
     inputs = np.arange(np.prod(input_shape), dtype=np.float32)
diff --git a/tensorflow/compiler/tests/stack_ops_test.py b/tensorflow/compiler/tests/stack_ops_test.py
index 2b9c2279737ccee531d488d27ccdb0cafa1dc8fc..94342f9567ca71274609e63b0482d55637c98d51 100644
--- a/tensorflow/compiler/tests/stack_ops_test.py
+++ b/tensorflow/compiler/tests/stack_ops_test.py
@@ -34,33 +34,33 @@ class StackOpTest(XLATestCase):
     with self.test_session(), self.test_scope():
       size = array_ops.placeholder(dtypes.int32)
       v = array_ops.placeholder(dtypes.float32)
-      h = gen_data_flow_ops._stack_v2(size, dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push_v2(h, v)
+      h = gen_data_flow_ops.stack_v2(size, dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops.stack_push_v2(h, v)
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
       self.assertAllClose([[4.0, 5.0]], c1.eval({size: 5, v: [[4.0, 5.0]]}))
 
   def testStackPushPopSwap(self):
     with self.test_session(), self.test_scope():
       a = np.arange(2000)
       x = array_ops.placeholder(dtypes.float32)
-      h = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push_v2(h, x, swap_memory=True)
+      h = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops.stack_push_v2(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
       self.assertAllClose(a, c1.eval({x: a}))
 
   def testMultiStack(self):
     with self.test_session(), self.test_scope():
       v = array_ops.placeholder(dtypes.float32)
-      h1 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_push_v2(h1, v)
+      h1 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
+      c1 = gen_data_flow_ops.stack_push_v2(h1, v)
       with ops.control_dependencies([c1]):
-        c1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
-      h2 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="bar")
-      c2 = gen_data_flow_ops._stack_push_v2(h2, 5.0)
+        c1 = gen_data_flow_ops.stack_pop_v2(h1, dtypes.float32)
+      h2 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="bar")
+      c2 = gen_data_flow_ops.stack_push_v2(h2, 5.0)
       with ops.control_dependencies([c2]):
-        c2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+        c2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
       r = c1 + c2
       self.assertAllClose(9.0, r.eval({v: 4.0}))
 
@@ -69,15 +69,15 @@ class StackOpTest(XLATestCase):
     with self.test_session() as sess, self.test_scope():
       v1 = array_ops.placeholder(dtypes.float32)
       v2 = array_ops.placeholder(dtypes.float32)
-      h1 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
-      h2 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
+      h1 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
+      h2 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
 
-      c1 = gen_data_flow_ops._stack_push_v2(h1, v1)
+      c1 = gen_data_flow_ops.stack_push_v2(h1, v1)
       with ops.control_dependencies([c1]):
-        c2 = gen_data_flow_ops._stack_push_v2(h2, v2)
+        c2 = gen_data_flow_ops.stack_push_v2(h2, v2)
       with ops.control_dependencies([c2]):
-        pop1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
-        pop2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+        pop1 = gen_data_flow_ops.stack_pop_v2(h1, dtypes.float32)
+        pop2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
 
       out1, out2 = sess.run([pop1, pop2], {v1: 4.0, v2: 5.0})
       self.assertAllClose(out1, 4.0)
@@ -86,17 +86,17 @@ class StackOpTest(XLATestCase):
   def testCloseStack(self):
     with self.test_session() as sess, self.test_scope():
       size = array_ops.placeholder(dtypes.int32)
-      h = gen_data_flow_ops._stack_v2(size, dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_close_v2(h)
+      h = gen_data_flow_ops.stack_v2(size, dtypes.float32, stack_name="foo")
+      c1 = gen_data_flow_ops.stack_close_v2(h)
       sess.run(c1, {size: 5})
 
   def testPushCloseStack(self):
     with self.test_session() as sess, self.test_scope():
       v = array_ops.placeholder(dtypes.float32)
-      h = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push_v2(h, v)
+      h = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops.stack_push_v2(h, v)
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_close_v2(h)
+        c1 = gen_data_flow_ops.stack_close_v2(h)
       sess.run(c1, {v: [[4.0, 5.0]]})
 
 
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index a62925a1818da00cb0a9e82e1281db20fb38b208..f332aa2e9b97e13654cf9b10588c18fed32f7ad4 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -338,7 +338,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         w0 = ta.write(0, [[4.0, 5.0]])
 
         # Test reading wrong datatype.
-        r0_bad = gen_data_flow_ops._tensor_array_read_v3(
+        r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow)
         with self.assertRaisesOpError("TensorArray dtype is "):
           r0_bad.eval()
@@ -472,7 +472,9 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(c([[-2.0, -10.0]]), grad_vals[1])
 
   def testTensorArrayGradientWriteRead(self):
-    for dtype in self.numeric_types:
+    for dtype in self.float_types:
+      self._testTensorArrayGradientWriteReadType(dtype)
+    for dtype in self.complex_types:
       self._testTensorArrayGradientWriteReadType(dtype)
 
   def _testTensorArrayGradientWritePackConcatAndRead(self):
diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py
index ba5f829936fd82ca0cc53eda34aefbca6d80482b..ef047005b60bd156a677050368ef67ae030d6c3a 100644
--- a/tensorflow/compiler/tests/ternary_ops_test.py
+++ b/tensorflow/compiler/tests/ternary_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -68,40 +69,41 @@ class TernaryOpsTest(XLATestCase):
         expected=np.array([1, 3, 5], dtype=np.int32))
 
   def testSelect(self):
-    self._testTernary(
-        array_ops.where,
-        np.array(0, dtype=np.bool),
-        np.array(2, dtype=np.float32),
-        np.array(7, dtype=np.float32),
-        expected=np.array(7, dtype=np.float32))
+    for dtype in self.numeric_types:
+      self._testTernary(
+          array_ops.where,
+          np.array(0, dtype=np.bool),
+          np.array(2, dtype=dtype),
+          np.array(7, dtype=dtype),
+          expected=np.array(7, dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array(1, dtype=np.bool),
-        np.array([1, 2, 3, 4], dtype=np.float32),
-        np.array([5, 6, 7, 8], dtype=np.float32),
-        expected=np.array([1, 2, 3, 4], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array(1, dtype=np.bool),
+          np.array([1, 2, 3, 4], dtype=dtype),
+          np.array([5, 6, 7, 8], dtype=dtype),
+          expected=np.array([1, 2, 3, 4], dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array(0, dtype=np.bool),
-        np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
-        np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32),
-        expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array(0, dtype=np.bool),
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype),
+          np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype),
+          expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array([0, 1, 1, 0], dtype=np.bool),
-        np.array([1, 2, 3, 4], dtype=np.float32),
-        np.array([5, 6, 7, 8], dtype=np.float32),
-        expected=np.array([5, 2, 3, 8], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array([0, 1, 1, 0], dtype=np.bool),
+          np.array([1, 2, 3, 4], dtype=dtype),
+          np.array([5, 6, 7, 8], dtype=dtype),
+          expected=np.array([5, 2, 3, 8], dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array([0, 1, 0], dtype=np.bool),
-        np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
-        np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32),
-        expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array([0, 1, 0], dtype=np.bool),
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype),
+          np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype),
+          expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=dtype))
 
   def testSlice(self):
     for dtype in self.numeric_types:
@@ -119,6 +121,23 @@ class TernaryOpsTest(XLATestCase):
           np.array([2, 1], dtype=np.int32),
           expected=np.array([[2], [5]], dtype=dtype))
 
+  def testClipByValue(self):
+    # TODO(b/78258593): enable integer types here too.
+    for dtype in self.float_types:
+      test_cases = [
+          (np.array([2, 4, 5], dtype=dtype), dtype(7)),  #
+          (dtype(1), np.array([2, 4, 5], dtype=dtype)),  #
+          (np.array([-2, 7, 7], dtype=dtype), np.array([-2, 9, 8], dtype=dtype))
+      ]
+      x = np.array([-2, 10, 6], dtype=dtype)
+      for lower, upper in test_cases:
+        self._testTernary(
+            gen_math_ops._clip_by_value,
+            x,
+            lower,
+            upper,
+            expected=np.minimum(np.maximum(x, lower), upper))
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 3d3e112f4821ea8e57ea9589a5b4433647ad294b..ba79f393a8f9b24ac506d2130957c38ecd442509 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -154,6 +154,9 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
+      if dtype == np.float16 and self.device == "XLA_CPU":
+        continue
       x = np.arange(-0.90, 0.90, 0.25)
       self._assertOpOutputMatchesExpected(
           math_ops.acos,
@@ -600,6 +603,20 @@ class UnaryOpsTest(XLATestCase):
               src,
               expected=dst)
 
+  def testBitcast(self):
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.int32),
+        np.array([1, 0x3f800000], np.int32),
+        expected=np.array([1, 0x3f800000], np.int32))
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.float32),
+        np.array([1, 0x3f800000], np.int32),
+        expected=np.array([1e-45, 1.0], np.float32))
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.int32),
+        np.array([1e-45, 1.0], np.float32),
+        expected=np.array([1, 0x3f800000], np.int32))
+
   def testInvertPermutation(self):
     self._assertOpOutputMatchesExpected(
         array_ops.invert_permutation,
@@ -779,7 +796,10 @@ class UnaryOpsTest(XLATestCase):
       self._assertSoftplusMatchesExpected([[-2, 0, 8]], dtype)
       self._assertSoftplusMatchesExpected(
           [[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]], dtype)
-      log_eps = np.log(np.finfo(dtype).eps)
+      if dtype == dtypes.bfloat16.as_numpy_dtype:
+        log_eps = np.log(np.finfo(np.float32).eps)
+      else:
+        log_eps = np.log(np.finfo(dtype).eps)
       one = dtype(1)
       ten = dtype(10)
       self._assertSoftplusMatchesExpected([
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index b08d6ab21e0746558cb3d4818d4c822c45d2e9ee..8ecad00f6e23b3a7746bbb473102ac847bf4cbfd 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -230,7 +230,10 @@ class SliceAssignTest(XLATestCase):
       # shrink shape changes
       checker[1:2, 1] = [66]
       checker[1, 1:2] = [66]
-      checker[1, 1] = 66
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker[1, 1] = 66
       # newaxis shape changes
       checker[:, None, :] = [[[10, 20, 30]], [[40, 50, 50]]]
       # shrink and newaxis
@@ -243,8 +246,11 @@ class SliceAssignTest(XLATestCase):
 
       # Assign vector to scalar (rank-0) using newaxis
       checker2 = StridedSliceAssignChecker(self, 222, dtype=dtype)
-      checker2[()] = 6  # no indices
-      checker2[...] = 6  # ellipsis
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker2[()] = 6  # no indices
+        checker2[...] = 6  # ellipsis
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f79eb27435cc954cebde4357c1d946a320f4ed75
--- /dev/null
+++ b/tensorflow/compiler/tests/while_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for while loops in XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class WhileTest(XLATestCase):
+
+  def testSingletonLoopHandrolled(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32)
+    def loop_body(step):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      return step_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32)
+    def loop_cond(step):
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index], loop_cond, loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0})
+      self.assertAllClose(result, [10], rtol=1e-3)
+
+  def testCountingLoopHandrolled(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def loop_body(step, rsum):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      sum_out = rsum + constant_op.constant(1.5, dtype=dtypes.float32)
+      return step_out, sum_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def loop_cond(step, rsum):
+      del rsum
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      init_sum = array_ops.placeholder(dtypes.float32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, init_sum], loop_cond,
+                                      loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0, init_sum: 0.0})
+      self.assertAllClose(result, [10, 15.0], rtol=1e-3)
+      no_iters_result = sess.run(loop_outputs, {init_index: 10, init_sum: 0.0})
+      self.assertAllClose(no_iters_result, [10, 0.0], rtol=1e-3)
+
+  def testCountingLoopHandrolledC64(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.complex64)
+    def loop_body(step, rsum):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      sum_out = rsum + constant_op.constant(1.5 + 2j, dtype=dtypes.complex64)
+      return step_out, sum_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.complex64)
+    def loop_cond(step, rsum):
+      del rsum
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      init_sum = array_ops.placeholder(dtypes.complex64, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, init_sum], loop_cond,
+                                      loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0, init_sum: 0.0})
+      self.assertAllClose(result[1], np.complex64(15 + 20j), rtol=1e-3)
+      no_iters_result = sess.run(loop_outputs, {init_index: 10, init_sum: 0.0})
+      self.assertAllClose(no_iters_result[1], np.complex64(0), rtol=1e-3)
+
+  def testLoopWithConstantOutput(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def loop_body(step, x):
+      del x
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      return (step_out, 7)
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def loop_cond(step, x):
+      del x
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, 42], loop_cond, loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0})
+      self.assertAllClose(result, [10, 7], rtol=1e-3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py
index 7e1f5c76ed65946363cc3c113ab1a9862f87b289..e924fe1e61454aefda622a5a46a0e483d26db5c1 100644
--- a/tensorflow/compiler/tests/xla_test.py
+++ b/tensorflow/compiler/tests/xla_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import os
 import random
 import re
 
@@ -44,6 +45,8 @@ flags.DEFINE_string('test_device', None,
 flags.DEFINE_string('types', None, 'Types to test. Comma-separated list.')
 flags.DEFINE_string('disabled_manifest', None,
                     'Path to a file with a list of tests that should not run.')
+flags.DEFINE_string('tf_xla_flags', None,
+                    'Value to set the TF_XLA_FLAGS environment variable to')
 
 
 class XLATestCase(test.TestCase):
@@ -71,14 +74,14 @@ class XLATestCase(test.TestCase):
 
     self._all_types = set(
         [dtype.as_numpy_dtype for dtype in self._all_tf_types])
-    self.int_types = set([dtype.as_numpy_dtype for dtype in self.int_tf_types])
+    self._int_types = set([dtype.as_numpy_dtype for dtype in self.int_tf_types])
     self._float_types = set(
         [dtype.as_numpy_dtype for dtype in self._float_tf_types])
     self.complex_types = set([
         dtype.as_numpy_dtype for dtype in self.complex_tf_types
     ])
-    self._numeric_types = set(
-        self.int_types | self._float_types | self.complex_types)
+    self._numeric_types = set(self._int_types | self._float_types
+                              | self.complex_types)
 
     # Parse the manifest file, if any, into a regex identifying tests to
     # disable
@@ -97,6 +100,8 @@ class XLATestCase(test.TestCase):
       disabled_tests = []
       disabled_method_types = []
       for l in manifest_file.read().splitlines():
+        if not l:
+          continue
         entry = comments_re.sub('', l).strip().split(' ')
         if len(entry) == 1:
           disabled_tests.append(entry[0])
@@ -113,6 +118,9 @@ class XLATestCase(test.TestCase):
             for name in types])
       manifest_file.close()
 
+    if FLAGS.tf_xla_flags is not None:
+      os.environ['TF_XLA_FLAGS'] = FLAGS.tf_xla_flags
+
   @property
   def all_tf_types(self):
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
@@ -130,6 +138,11 @@ class XLATestCase(test.TestCase):
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
     return self._float_tf_types - self._method_types_filter.get(name, set())
 
+  @property
+  def int_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._int_types - self._method_types_filter.get(name, set())
+
   @property
   def numeric_tf_types(self):
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index fb82c2601c432cee425a46a3b6dc2c55febeda87..942504e6bd4c9ce93c9482251823efcbb46ab1c8 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -58,6 +58,15 @@ xla_proto_library(
     ],
 )
 
+xla_proto_library(
+    name = "host_compute_metadata_proto",
+    srcs = ["host_compute_metadata.proto"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tf2xla",
     srcs = ["tf2xla.cc"],
@@ -149,6 +158,7 @@ cc_library(
         ":common",
         ":dump_graph",
         ":functionalize_control_flow",
+        ":host_compute_metadata_proto",
         ":sharding_util",
         ":tf2xla_util",
         "//tensorflow/compiler/tf2xla/lib:util",
@@ -322,6 +332,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -401,10 +412,9 @@ cc_library(
     hdrs = ["functionalize_control_flow.h"],
     deps = [
         ":tf2xla_util",
-        "//tensorflow/compiler/jit:graph_to_functiondef",
         "//tensorflow/compiler/jit:union_find",
         "//tensorflow/compiler/tf2xla:dump_graph",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:core_cpu",
@@ -426,7 +436,7 @@ tf_cc_test(
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
-        "//tensorflow/compiler/tf2xla/cc:functional_ops",
+        "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
@@ -452,17 +462,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index 311dddca94c458a60fd00afe5532840e0dbf0437..4f8bb8ad743afe69a6544c2ae0dc7309891b2df3 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -7,61 +7,23 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
 
 tf_gen_op_wrapper_cc(
-    name = "functional_ops_gen",
-    include_internal_ops = 1,
-    out_ops_file = "ops/functional_ops",
-    deps = ["//tensorflow/compiler/tf2xla/ops:functional_ops"],
+    name = "xla_ops_gen",
+    out_ops_file = "ops/xla_ops",
+    deps = ["//tensorflow/compiler/tf2xla/ops:xla_ops"],
 )
 
 cc_library(
-    name = "functional_ops",
-    srcs = ["ops/functional_ops.cc"],
-    hdrs = ["ops/functional_ops.h"],
+    name = "xla_ops",
+    srcs = ["ops/xla_ops.cc"],
+    hdrs = ["ops/xla_ops.h"],
     deps = [
         "//tensorflow/cc:const_op",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-tf_gen_op_wrapper_cc(
-    name = "sendrecv_ops_gen",
-    include_internal_ops = 1,
-    out_ops_file = "ops/sendrecv_ops",
-    deps = ["//tensorflow/compiler/tf2xla/ops:sendrecv_ops"],
-)
-
-cc_library(
-    name = "sendrecv_ops",
-    srcs = ["ops/sendrecv_ops.cc"],
-    hdrs = ["ops/sendrecv_ops.h"],
-    deps = [
-        "//tensorflow/cc:const_op",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 82923722c54d235716b9138d95a75a441df924ca..de1008803d69fefa415c7bdbe6c27a62e625b417 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -37,7 +37,7 @@ Status BackwardsConstAnalysis(const Graph& g,
   };
 
   Status status;
-  std::unordered_set<Node*> must_be_const;
+  std::unordered_set<const Node*> must_be_const;
   auto visit = [&status, &metadata_ops, &must_be_const,
                 compile_time_const_args](Node* node) {
     if (!status.ok()) return;
@@ -55,8 +55,10 @@ Status BackwardsConstAnalysis(const Graph& g,
         compile_time_const_args->at(index) = true;
         return;
       }
-      for (Node* pred : node->in_nodes()) {
-        must_be_const.insert(pred);
+      for (const Edge* pred : node->in_edges()) {
+        if (!pred->IsControlEdge()) {
+          must_be_const.insert(pred->src());
+        }
       }
       return;
     }
diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc
index 9d125f8d499863cfaa0e26b5b633ca02914d1b7d..992b12c06db5efc0ae54284d0ea77017c1c79aca 100644
--- a/tensorflow/compiler/tf2xla/const_analysis_test.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc
@@ -79,5 +79,24 @@ TEST(ConstAnalysisTest, TopologicalOrder) {
   }
 }
 
+TEST(ConstAnalysisTest, DontFollowControlDependencies) {
+  Scope root = Scope::NewRootScope();
+
+  Output arg0 = ops::_Arg(root.WithOpName("Arg0"), DT_INT32, 0);
+  Output arg1 = ops::_Arg(root.WithOpName("Arg1"), DT_INT32, 1);
+  Output c1 =
+      ops::Const(root.WithOpName("c1").WithControlDependencies(arg0), 1, {1});
+  Output add = ops::Add(root, arg1, c1);
+  Output reshape = ops::Reshape(root, arg1, add);
+
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph));
+
+  std::vector<bool> const_args(2, false);
+  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args));
+
+  EXPECT_EQ(const_args, std::vector<bool>({false, true}));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index f8169795ddfb7fd4e93d3f136c51623385868951..8d1f2684909e876fe5521ba6a63d745c7d3956e0 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -21,13 +21,13 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
@@ -583,13 +583,15 @@ class FunctionalizeCond {
   // CondArgNode represents a input to the conditional and its corresponding
   // switch nodes.
   struct CondArgNode {
-    explicit CondArgNode(Node* input) : input(input) {}
+    explicit CondArgNode(Node* src, int src_output)
+        : src(src), src_output(src_output) {}
     string ToString() const {
-      return strings::StrCat("input=", input->name(),
+      return strings::StrCat("src=", src->name(), ":", src_output,
                              " switches=", NodesToString(switches));
     }
 
-    Node* input;
+    Node* src;
+    int src_output;
     std::vector<Node*> switches;
   };
   using CondArgNodes = std::vector<CondArgNode>;
@@ -606,14 +608,15 @@ class FunctionalizeCond {
 
   // Group of switch nodes that will be part of the same XlaIf.
   struct SwitchCluster {
-    explicit SwitchCluster(Node* predicate) : predicate(predicate) {}
+    explicit SwitchCluster(const Edge* predicate_edge)
+        : predicate_edge(predicate_edge) {}
     string ToString() const {
-      return strings::StrCat(name, " predicate=", predicate->name(),
+      return strings::StrCat(name, " predicate=", predicate_edge->src()->name(),
                              " switches=", NodesToString(switches));
     }
 
     string name;
-    Node* predicate;
+    const Edge* predicate_edge;
     std::vector<Node*> switches;
   };
 
@@ -653,8 +656,8 @@ class FunctionalizeCond {
                      Graph* body);
 
   // Adds all the input edges to `if_node` corresponding to the arguments.
-  Status AddInputEdges(const CondArgNodes& cond_arg_nodes, Node* predicate,
-                       Node* if_node);
+  Status AddInputEdges(const CondArgNodes& cond_arg_nodes,
+                       const Edge* predicate_edge, Node* if_node);
 
   // Adds all output edges from the `if_node`.
   Status AddOutputEdges(const std::vector<Node*>& outputs, Node* if_node);
@@ -756,8 +759,8 @@ Status FunctionalizeCond::Join(const ForwardFlowNode& src_state,
     if (IsMerge(dst)) {
       dst_state->branch = Branch::kBoth;
     } else {
-      return errors::Internal("Illegal merge: ", src_state.ToString(), " with ",
-                              dst_state->ToString(), " for ",
+      return errors::Internal("Illegal merge:\n", src_state.ToString(),
+                              " with ", dst_state->ToString(), " for\n",
                               dst->DebugString());
     }
   }
@@ -861,12 +864,15 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
     if (IsSwitch(n)) {
       Node* input;
       TF_CHECK_OK(n->input_node(0, &input));
-      entry_cluster[n->id()] = &clusters[input->id()];
-      UnionFind<Cluster>* cluster = find_output_cluster(input);
+      entry_cluster[n->id()] = find_output_cluster(input);
+      UnionFind<Cluster>* cluster = entry_cluster[n->id()];
       int cluster_depth = switch_depth[cluster->Get().representative];
       // Merge the inputs of the switch node with one another. This results in
       // predicates and control input residing in the same cluster.
       for (const Edge* e : n->in_edges()) {
+        // Only consider the data inputs to the Switch node.
+        if (e->IsControlEdge()) continue;
+
         Node* src = e->src();
         UnionFind<Cluster>* src_cluster = find_output_cluster(src);
         int src_cluster_depth = switch_depth[src_cluster->Get().representative];
@@ -898,6 +904,14 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       int src_depth = switch_depth[src_id];
       if (!e->IsControlEdge() || new_switch_depth == src_depth) {
         if (src_depth != new_switch_depth) {
+          // TODO(b/77601805) remove this when outside_compilation supports
+          // control flow.
+          if (str_util::StrContains(src->name(), "outside_compilation") ||
+              str_util::StrContains(n->name(), "outside_compilation")) {
+            return errors::InvalidArgument(
+                "outside_compilation is not yet supported within TensorFlow "
+                "control flow constructs b/77601805");
+          }
           return errors::InvalidArgument(
               "Unable to functionalize control flow in graph: Operand ('",
               src->name(), "') and operator ('", n->name(),
@@ -956,16 +970,21 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
   // node whose cluster is later in the topological order of clustered
   // switches).
   for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) {
-    Node* pred;
-    TF_CHECK_OK((*it)->input_node(1, &pred));
-    auto repr = std::make_pair(pred, clusters[(*it)->id()].Get());
+    const Edge* pred_edge;
+    TF_CHECK_OK((*it)->input_edge(1, &pred_edge));
+    // The predicate can be preceded by a identity node. Look through identity
+    // nodes to predicate.
+    while (pred_edge->src()->IsIdentity()) {
+      TF_CHECK_OK(pred_edge->src()->input_edge(0, &pred_edge));
+    }
+    auto repr = std::make_pair(pred_edge->src(), clusters[(*it)->id()].Get());
     if (predicate_index.find(repr) == predicate_index.end()) {
       predicate_index[repr] = switch_clusters.size();
-      switch_clusters.emplace_back(pred);
+      switch_clusters.emplace_back(pred_edge);
       // Generate a name by concatenating with the cluster representative as
       // there could be multiple switch clusters with the same predicate.
-      switch_clusters[predicate_index[repr]].name =
-          strings::StrCat(pred->name(), "_", repr.second.representative, "_If");
+      switch_clusters[predicate_index[repr]].name = strings::StrCat(
+          pred_edge->src()->name(), "_", repr.second.representative, "_If");
     }
     switch_clusters[predicate_index[repr]].switches.push_back(*it);
   }
@@ -1044,9 +1063,12 @@ FunctionalizeCond::DetermineBranchMapAndFrontier(
       ForwardFlowNode& ffn = branch_map[out];
       if (IsSwitch(n)) {
         int index = e->IsControlEdge() ? Branch::kNeither : e->src_output();
-        TF_RETURN_IF_ERROR(Join(ForwardFlowNode(Branch(index)), out, &ffn));
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(
+            Join(ForwardFlowNode(Branch(index)), out, &ffn), " when joining ",
+            e->DebugString());
       } else {
-        TF_RETURN_IF_ERROR(Join(branch_map[n], out, &ffn));
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(Join(branch_map[n], out, &ffn),
+                                        " when joining ", e->DebugString());
       }
       if (IsMerge(out)) {
         if (out->in_edges().size() == ffn.count) {
@@ -1083,8 +1105,7 @@ Status FunctionalizeCond::FunctionalizeInternal() {
   for (auto it = predicate_switch_order.rbegin();
        it != predicate_switch_order.rend(); ++it) {
     auto& ps = *it;
-    VLOG(3) << "Flow down from: " << NodesToString(ps.switches) << " ("
-            << ps.predicate->name() << ")";
+    VLOG(3) << "Flow down from: " << ps.ToString();
 
     std::unordered_map<Node*, ForwardFlowNode> branch_map;
     std::unordered_set<Node*> frontier;
@@ -1097,21 +1118,29 @@ Status FunctionalizeCond::FunctionalizeInternal() {
                                                library_);
     TF_RETURN_IF_ERROR(ValidateFrontier(branch_map, frontier));
 
+    struct Hash {
+      size_t operator()(const std::pair<Node*, int>& item) const {
+        return Hash64Combine(hash<Node*>()(item.first),
+                             std::hash<int>()(item.second));
+      }
+    };
+
     // Sort the merge and switch nodes using NodeCmp. The switch-nodes are
     // further grouped (post sorting) by input to the switch node as in the
     // functionalized form each input will be passed in only once. This grouping
     // should retain the sorted order.
     CondArgNodes cond_arg_nodes;
-    std::unordered_map<Node*, int> input_index;
     std::sort(ps.switches.begin(), ps.switches.end(), NodeCmp());
+    std::unordered_map<std::pair<Node*, int>, int, Hash> input_index;
     for (Node* switch_node : ps.switches) {
-      Node* in;
-      TF_RETURN_IF_ERROR(switch_node->input_node(0, &in));
-      if (input_index.find(in) == input_index.end()) {
-        input_index[in] = cond_arg_nodes.size();
-        cond_arg_nodes.emplace_back(in);
+      const Edge* e;
+      TF_RETURN_IF_ERROR(switch_node->input_edge(0, &e));
+      std::pair<Node*, int> key = std::make_pair(e->src(), e->src_output());
+      if (input_index.find(key) == input_index.end()) {
+        input_index[key] = cond_arg_nodes.size();
+        cond_arg_nodes.emplace_back(key.first, key.second);
       }
-      cond_arg_nodes.at(input_index.at(in)).switches.push_back(switch_node);
+      cond_arg_nodes.at(input_index.at(key)).switches.push_back(switch_node);
     }
     std::vector<Node*> merge_nodes(frontier.begin(), frontier.end());
     std::sort(merge_nodes.begin(), merge_nodes.end(), NodeCmp());
@@ -1200,11 +1229,12 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   builder.Attr("Tout", out_type);
 
   builder.Attr("Tcond", DT_BOOL);
-  builder.Device(switch_cluster.predicate->assigned_device_name());
+  builder.Device(switch_cluster.predicate_edge->src()->assigned_device_name());
   // Conditional should be the first input ...
-  builder.Input(
-      NodeDefBuilder::NodeOut(switch_cluster.predicate->name(), 0,
-                              switch_cluster.predicate->output_type(0)));
+  builder.Input(NodeDefBuilder::NodeOut(
+      switch_cluster.predicate_edge->src()->name(),
+      switch_cluster.predicate_edge->src_output(),
+      switch_cluster.predicate_edge->src()->output_type(0)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -1264,24 +1294,17 @@ Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
 }
 
 Status FunctionalizeCond::AddInputEdges(const CondArgNodes& cond_arg_nodes,
-                                        Node* predicate, Node* if_node) {
+                                        const Edge* predicate_edge,
+                                        Node* if_node) {
   VLOG(3) << "AddInputEdges for " << if_node->name();
   int index = 0;
-  graph_->AddEdge(predicate, 0, if_node, index++);
-  for (auto& kv : cond_arg_nodes) {
-    bool inserted = false;
-    for (const Node* arg : kv.switches) {
-      const Edge* in_edge;
-      TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
-      if (in_edge->IsControlEdge()) {
-        graph_->AddControlEdge(in_edge->src(), if_node);
-      } else {
-        if (!inserted) {
-          graph_->AddEdge(in_edge->src(), in_edge->src_output(), if_node,
-                          index++);
-          inserted = true;
-        }
-      }
+  graph_->AddEdge(predicate_edge->src(), predicate_edge->src_output(), if_node,
+                  index++);
+  for (auto& arg : cond_arg_nodes) {
+    if (arg.src_output == Graph::kControlSlot) {
+      graph_->AddControlEdge(arg.src, if_node);
+    } else {
+      graph_->AddEdge(arg.src, arg.src_output, if_node, index++);
     }
   }
   return Status::OK();
@@ -1302,10 +1325,10 @@ Status FunctionalizeCond::AddOutputEdges(const std::vector<Node*>& outputs,
         return errors::Unimplemented("Output of index (", edge->src_output(),
                                      ") of merge node ", node->name());
       }
-      graph_->RemoveEdge(edge);
 
       int src_output =
           dst_input == Graph::kControlSlot ? Graph::kControlSlot : i;
+      graph_->RemoveEdge(edge);
       graph_->AddEdge(if_node, src_output, dst, dst_input);
     }
   }
@@ -1323,7 +1346,7 @@ StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
       Node * if_node,
       BuildAndAddXlaIfOp(cond_arg_nodes, switch_cluster, merge_nodes));
   TF_RETURN_IF_ERROR(
-      AddInputEdges(cond_arg_nodes, switch_cluster.predicate, if_node));
+      AddInputEdges(cond_arg_nodes, switch_cluster.predicate_edge, if_node));
   TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node));
 
   return if_node;
@@ -1345,6 +1368,7 @@ Status FunctionalizeControlFlow(Graph* graph,
   VLOG(2) << "FunctionalizeControlFlow (initial): "
           << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
                                          library);
+
   // Note: BuildControlFlowInfo() requires that the graph's source node is
   // connected to all source nodes in the graph. Many graphs violate this
   // invariant.
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index bc7276c3afd5060d6faeceb4d479416299ecc5da..e494f42e8ed254ac0c7c7a23a13728d3f015e9d3 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/tf2xla/cc/ops/functional_ops.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/test_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
index 91351421bcacd26c41b5c9f98ea833730e4aef30..20179b67991d3d23d678cf1df2642e029ea037fd 100644
--- a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -3,6 +3,7 @@
 Operator                              | Type Constraint
 ------------------------------------- | ---------------
 `Abs`                                 | `T={double,float,int32,int64}`
+`Acos`                                | `T={complex64,double,float,int32,int64}`
 `Acosh`                               | `T={complex64,double,float}`
 `Add`                                 | `T={complex64,double,float,int32,int64}`
 `AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
@@ -15,10 +16,12 @@ Operator                              | Type Constraint
 `ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
 `ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={float}`
 `ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asin`                                | `T={complex64,double,float,int32,int64}`
 `Asinh`                               | `T={complex64,double,float}`
 `AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan`                                | `T={complex64,double,float,int32,int64}`
 `Atan2`                               | `T={double,float}`
 `Atanh`                               | `T={complex64,double,float}`
 `AvgPool`                             | `T={double,float}`
@@ -75,6 +78,10 @@ Operator                              | Type Constraint
 `FFT`                                 |
 `FFT2D`                               |
 `FFT3D`                               |
+`FakeQuantWithMinMaxArgs`             |
+`FakeQuantWithMinMaxArgsGradient`     |
+`FakeQuantWithMinMaxVars`             |
+`FakeQuantWithMinMaxVarsGradient`     |
 `Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Floor`                               | `T={double,float}`
 `FloorDiv`                            | `T={complex64,double,float,int32,int64}`
@@ -84,6 +91,7 @@ Operator                              | Type Constraint
 `FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
 `FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
 `Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherNd`                            | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
 `GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
@@ -117,14 +125,18 @@ Operator                              | Type Constraint
 `LogicalNot`                          |
 `LogicalOr`                           |
 `MatMul`                              | `T={complex64,double,float}`
+`MatrixBandPart`                      | `Tindex={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixSetDiag`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixTriangularSolve`               | `T={complex64,double,float}`
 `Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `MaxPool`                             | `T={double,float,int32,int64}`
 `MaxPool3D`                           | `T={float}`
 `MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
 `MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradGrad`                     | `T={float}`
+`MaxPoolGradGradV2`                   | `T={float}`
 `MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
 `MaxPoolV2`                           | `T={double,float,int32,int64}`
 `Maximum`                             | `T={double,float,int32,int64}`
@@ -186,6 +198,7 @@ Operator                              | Type Constraint
 `Round`                               | `T={complex64,double,float,int32,int64}`
 `Rsqrt`                               | `T={complex64,double,float}`
 `RsqrtGrad`                           | `T={complex64,double,float}`
+`ScatterNd`                           | `Tindices={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Selu`                                | `T={double,float}`
 `SeluGrad`                            | `T={double,float}`
@@ -198,6 +211,7 @@ Operator                              | Type Constraint
 `Sinh`                                | `T={complex64,double,float}`
 `Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Snapshot`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Softmax`                             | `T={double,float}`
 `SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
 `Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
index b9bdb829d773825005a8921f48d28b6892d8f0cd..55f0538dba7c1941dfea88e0631cd299e51f76d0 100644
--- a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -3,6 +3,7 @@
 Operator                              | Type Constraint
 ------------------------------------- | ---------------
 `Abs`                                 | `T={double,float,int32,int64}`
+`Acos`                                | `T={complex64,double,float,int32,int64}`
 `Acosh`                               | `T={complex64,double,float}`
 `Add`                                 | `T={complex64,double,float,int32,int64}`
 `AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
@@ -15,10 +16,12 @@ Operator                              | Type Constraint
 `ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
 `ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asin`                                | `T={complex64,double,float,int32,int64}`
 `Asinh`                               | `T={complex64,double,float}`
 `AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan`                                | `T={complex64,double,float,int32,int64}`
 `Atan2`                               | `T={double,float}`
 `Atanh`                               | `T={complex64,double,float}`
 `AvgPool`                             | `T={double,float}`
@@ -75,6 +78,10 @@ Operator                              | Type Constraint
 `FFT`                                 |
 `FFT2D`                               |
 `FFT3D`                               |
+`FakeQuantWithMinMaxArgs`             |
+`FakeQuantWithMinMaxArgsGradient`     |
+`FakeQuantWithMinMaxVars`             |
+`FakeQuantWithMinMaxVarsGradient`     |
 `Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Floor`                               | `T={double,float}`
 `FloorDiv`                            | `T={complex64,double,float,int32,int64}`
@@ -84,6 +91,7 @@ Operator                              | Type Constraint
 `FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
 `FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
 `Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherNd`                            | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
 `GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
@@ -117,14 +125,18 @@ Operator                              | Type Constraint
 `LogicalNot`                          |
 `LogicalOr`                           |
 `MatMul`                              | `T={complex64,double,float}`
+`MatrixBandPart`                      | `Tindex={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixSetDiag`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixTriangularSolve`               | `T={complex64,double,float}`
 `Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `MaxPool`                             | `T={double,float,int32,int64}`
 `MaxPool3D`                           | `T={float}`
 `MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
 `MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradGrad`                     | `T={float}`
+`MaxPoolGradGradV2`                   | `T={float}`
 `MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
 `MaxPoolV2`                           | `T={double,float,int32,int64}`
 `Maximum`                             | `T={double,float,int32,int64}`
@@ -183,6 +195,7 @@ Operator                              | Type Constraint
 `Round`                               | `T={complex64,double,float,int32,int64}`
 `Rsqrt`                               | `T={complex64,double,float}`
 `RsqrtGrad`                           | `T={complex64,double,float}`
+`ScatterNd`                           | `Tindices={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Selu`                                | `T={double,float}`
 `SeluGrad`                            | `T={double,float}`
@@ -195,6 +208,7 @@ Operator                              | Type Constraint
 `Sinh`                                | `T={complex64,double,float}`
 `Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Snapshot`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Softmax`                             | `T={double,float}`
 `SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
 `Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 058a1f2621c64a735bd9d9c9d0ae007f93aa4dea..b20c1ffc7d8956f3f5530ee63e9b711a26439be5 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -130,7 +130,7 @@ Status GraphCompiler::Compile() {
     // Set up inputs from outputs of previous nodes.
     for (auto* e : n->in_edges()) {
       if (e->IsControlEdge()) continue;
-      Node* src = e->src();
+      const Node* src = e->src();
       TF_RET_CHECK(src->id() < output_registry.size());
       const NodeOutputs& src_outputs = output_registry[src->id()];
 
diff --git a/tensorflow/compiler/tf2xla/host_compute_metadata.proto b/tensorflow/compiler/tf2xla/host_compute_metadata.proto
new file mode 100644
index 0000000000000000000000000000000000000000..43ab371a217e6c4521a160715104c96e3c8782c6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/host_compute_metadata.proto
@@ -0,0 +1,38 @@
+syntax = "proto3";
+
+package tensorflow.tf2xla;
+option cc_enable_arenas = true;
+option java_outer_classname = "Tf2XlaProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.tf2xla";
+
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
+
+// TensorMetadata indicates the type and shape of a Tensor that is
+// part of a host compute transfer.
+message TensorMetadata {
+  DataType type = 1;
+  TensorShapeProto shape = 2;
+}
+
+// HostTransferMetadata describes a transfer either from host to device
+// or device to host. It has a key that is unique to the computation,
+// and metadata about the list of tensors being transferred.
+message HostTransferMetadata {
+  // The key used to identify this transfer.
+  string key = 1;
+
+  // For each Tensor being transferred, its type and shape.
+  repeated TensorMetadata metadata = 2;
+}
+
+// HostComputeMetadata describes all the sends and recvs
+// from all host compute transfer ops in a computation.
+message HostComputeMetadata {
+  // Metadata about each device_to_host transfer
+  repeated HostTransferMetadata device_to_host = 1;
+
+  // Metadata about each host_to_device transfer
+  repeated HostTransferMetadata host_to_device = 2;
+}
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index d2fa933cf9c085f92b2f442827a94d72938e4bb2..00fd08b1a0750739445a124adc7ccf436a4a9b71 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -21,6 +21,7 @@ tf_kernel_library(
         "cast_op.cc",
         "categorical_op.cc",
         "cholesky_op.cc",
+        "clip_by_value_op.cc",
         "concat_op.cc",
         "const_op.cc",
         "conv_ops.cc",
@@ -29,6 +30,7 @@ tf_kernel_library(
         "cwise_ops.h",
         "depthtospace_op.cc",
         "diag_op.cc",
+        "dynamic_slice_ops.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
         "extract_image_patches_op.cc",
@@ -56,6 +58,7 @@ tf_kernel_library(
         "pooling_ops.cc",
         "quantize_and_dequantize_op.cc",
         "random_ops.cc",
+        "reduce_window_op.cc",
         "reduction_ops.cc",
         "reduction_ops.h",
         "reduction_ops_common.cc",
@@ -93,6 +96,7 @@ tf_kernel_library(
         "shape_util.h",
     ],
     deps = [
+        ":if_op",
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
@@ -102,7 +106,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/lib:while_loop",
-        "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -145,7 +149,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:framework",
@@ -154,6 +158,39 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "if_op",
+    srcs = ["if_op.cc"],
+    hdrs = ["if_op.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+# Kernels that have a dummy (no-op) implementation.
+tf_kernel_library(
+    name = "xla_dummy_ops",
+    srcs = [
+        "assert_op.cc",
+        "check_numerics_op.cc",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:logging_ops_op_lib",
+    ],
+    alwayslink = 1,
+)
+
 # Kernels that only work on CPU, because they use XLA custom calls.
 # Only link this when using the CPU backend for XLA.
 tf_kernel_library(
@@ -200,17 +237,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/kernels/assert_op.cc b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af4ab5e8ef6e268226edc90515706405ac36858c
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+// This TensorFlow op supports the Assert primitve.
+class AssertOp : public XlaOpKernel {
+ public:
+  explicit AssertOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  ~AssertOp() override {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    static mutex mu(tensorflow::LINKER_INITIALIZED);
+    static int log_counter = 0;
+
+    mutex_lock l(mu);
+    if (log_counter < 20) {
+      ++log_counter;
+      LOG(WARNING) << "Ignoring Assert operator " << name();
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(AssertOp);
+};
+
+REGISTER_XLA_OP(Name("Assert"), AssertOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index a249b1869f547f8e5aa725f9f5cf391b10429928..931175be1111ed5f70afbdf351ee53c59c1367de 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -118,30 +118,24 @@ class FusedBatchNormGradOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationBuilder* b = ctx->builder();
-
-    auto grad_backprop = ctx->Input(0);
-    auto activations = ctx->Input(1);
-    auto scale = ctx->Input(2);
-    auto mean = ctx->Input(3);
-    auto var = ctx->Input(4);
-
-    TensorShape input_shape = ctx->InputShape(0);
-    int feature_index =
-        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
-
+    xla::ComputationBuilder* const b = ctx->builder();
     DataType input_dtype = ctx->input_type(0);
     DataType scale_dtype = ctx->input_type(2);
-    xla::PrimitiveType input_type;
-    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_dtype, &input_type));
-    xla::PrimitiveType scale_type;
-    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(scale_dtype, &scale_type));
 
     // TODO(b/69928690): support mixed precision in the XLA batch normalization
     // operators. For now, cast everything to the statistics type (which
     // may be more precise than the input type).
-    grad_backprop = b->ConvertElementType(grad_backprop, scale_type);
-    activations = b->ConvertElementType(activations, scale_type);
+    auto grad_backprop =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), scale_dtype);
+    auto activations =
+        XlaHelpers::ConvertElementType(b, ctx->Input(1), scale_dtype);
+    auto scale = ctx->Input(2);
+    auto mean = ctx->Input(3);
+    auto var = ctx->Input(4);
+
+    const int input_dims = ctx->InputShape(0).dims();
+    const int feature_index =
+        GetTensorFeatureDimIndex(input_dims, data_format_);
 
     xla::ComputationDataHandle x_backprop;
     xla::ComputationDataHandle scale_backprop;
@@ -156,7 +150,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       offset_backprop = b->GetTupleElement(output, 2);
     } else {
       // Reduce over all dimensions except the feature dim.
-      std::vector<int64> reduction_dims(input_shape.dims() - 1);
+      std::vector<int64> reduction_dims(input_dims - 1);
       std::iota(reduction_dims.begin(), reduction_dims.begin() + feature_index,
                 0);
       std::iota(reduction_dims.begin() + feature_index, reduction_dims.end(),
@@ -165,9 +159,14 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var +
       // epsilon))
       // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
-      offset_backprop =
-          b->Reduce(grad_backprop, XlaHelpers::Zero(b, scale_dtype),
-                    *ctx->GetOrCreateAdd(scale_dtype), reduction_dims);
+      const DataType accumulation_type =
+          XlaHelpers::SumAccumulationType(scale_dtype);
+      auto converted =
+          XlaHelpers::ConvertElementType(b, grad_backprop, accumulation_type);
+      auto reduce =
+          b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                    *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
+      offset_backprop = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
 
       // scratch1 = rsqrt(pop_var + epsilon)
       auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
@@ -175,17 +174,21 @@ class FusedBatchNormGradOp : public XlaOpKernel {
           b->Pow(b->Add(var, b->ConstantR0<float>(epsilon_)), neg_half);
 
       // scratch2 = sum(y_backprop * (x - mean))
-      auto scratch2 = b->Reduce(
-          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index})),
-          XlaHelpers::Zero(b, scale_dtype), *ctx->GetOrCreateAdd(scale_dtype),
-          reduction_dims);
+      auto mul =
+          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index}));
+      converted = XlaHelpers::ConvertElementType(b, mul, accumulation_type);
+      reduce =
+          b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                    *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
+      auto scratch2 = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
 
       x_backprop =
           b->Mul(grad_backprop, b->Mul(scratch1, scale), {feature_index});
       scale_backprop = b->Mul(scratch1, scratch2);
     }
 
-    ctx->SetOutput(0, b->ConvertElementType(x_backprop, input_type));
+    ctx->SetOutput(0,
+                   XlaHelpers::ConvertElementType(b, x_backprop, input_dtype));
     ctx->SetOutput(1, scale_backprop);
     ctx->SetOutput(2, offset_backprop);
     ctx->SetConstantOutput(3, Tensor(scale_dtype, {}));
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 344a2ab2b6835c518c41de6f7a30fb2a34d130d2..569950c2dfaeb61028049a263a962dfa54a62e09 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -159,7 +159,9 @@ class BatchToSpaceNDOp : public XlaOpKernel {
                  block_shape, crops);
   }
 };
-REGISTER_XLA_OP(Name("BatchToSpaceND").CompileTimeConstInput("crops"),
+REGISTER_XLA_OP(Name("BatchToSpaceND")
+                    .CompileTimeConstInput("block_shape")
+                    .CompileTimeConstInput("crops"),
                 BatchToSpaceNDOp);
 
 class BatchToSpaceOp : public XlaOpKernel {
@@ -182,9 +184,7 @@ class BatchToSpaceOp : public XlaOpKernel {
  private:
   int block_size_;
 };
-REGISTER_XLA_OP(Name("BatchToSpace")
-                    .CompileTimeConstInput("crops")
-                    .CompileTimeConstInput("block_shape"),
+REGISTER_XLA_OP(Name("BatchToSpace").CompileTimeConstInput("crops"),
                 BatchToSpaceOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index c667b4e3e326b776faba49387760abbd582fcc68..ed33b8ed2e823f313a9a7fe220390bc617288405 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -103,10 +103,15 @@ class BiasAddGradOp : public XlaOpKernel {
     std::iota(reduce_dims.begin(), reduce_dims.begin() + feature_dim, 0);
     std::iota(reduce_dims.begin() + feature_dim, reduce_dims.end(),
               feature_dim + 1);
-    xla::ComputationDataHandle result = ctx->builder()->Reduce(
-        ctx->Input(0), XlaHelpers::Zero(ctx->builder(), input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)), reduce_dims);
-    ctx->SetOutput(0, result);
+    xla::ComputationBuilder* const b = ctx->builder();
+    const DataType accumulation_type =
+        XlaHelpers::SumAccumulationType(input_type(0));
+    auto converted =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+    auto reduce =
+        b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                  *ctx->GetOrCreateAdd(accumulation_type), reduce_dims);
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, reduce, input_type(0)));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 43a6a747c6bcc441f33f276fde4a66f367d99731..c52b2dcb7e9ef81fd52565dfbda05e33a52ed43a 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -62,5 +62,50 @@ class CastOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("Cast"), CastOp);
 
+class BitcastOp : public XlaOpKernel {
+ public:
+  explicit BitcastOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &src_dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("type", &dst_dtype_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(src_dtype_, &src_type_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dst_dtype_, &dst_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* builder = ctx->builder();
+    xla::ComputationDataHandle input = ctx->Input(0);
+    xla::ComputationDataHandle output;
+
+    if (src_dtype_ == dst_dtype_) {
+      output = input;
+    } else {
+      // The only complex type in XLA is C64, so error out if the bitcast has a
+      // complex source or destination type and the bitcast is not trivial.
+      OP_REQUIRES(ctx,
+                  !xla::primitive_util::IsComplexType(src_type_) &&
+                      !xla::primitive_util::IsComplexType(dst_type_),
+                  errors::Unimplemented("Complex types not supported."));
+      // XLA bitcast requires that the bit-width of the source and destination
+      // matches, and currently only the simple lowering is performed.
+      OP_REQUIRES(ctx,
+                  xla::primitive_util::BitWidth(src_type_) ==
+                      xla::primitive_util::BitWidth(dst_type_),
+                  errors::Unimplemented(
+                      "Only bitcasts between equally sized types supported."));
+      output = builder->BitcastConvertType(input, dst_type_);
+    }
+
+    ctx->SetOutput(0, output);
+  }
+
+ protected:
+  DataType src_dtype_, dst_dtype_;
+  xla::PrimitiveType src_type_, dst_type_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BitcastOp);
+};
+
+REGISTER_XLA_OP(Name("Bitcast"), BitcastOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc b/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6061e822d8d9c6c807a63aad4e9e9526a49e456c
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace {
+
+class CheckNumericsOp : public XlaOpKernel {
+ public:
+  explicit CheckNumericsOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // TODO(b/32223192): add a real implementation of CheckNumerics
+    {
+      static mutex mu(tensorflow::LINKER_INITIALIZED);
+      static int log_counter = 0;
+      mutex_lock l(mu);
+      if (log_counter < 20) {
+        ++log_counter;
+        LOG(WARNING) << "Ignoring CheckNumerics operator " << name();
+      }
+    }
+    ctx->SetOutput(0, ctx->Input(0));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CheckNumericsOp);
+};
+
+REGISTER_XLA_OP(Name("CheckNumerics"), CheckNumericsOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fdf75be7b1156540d762e3bc04a51f2478f00f46
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class ClipByValueOp : public XlaOpKernel {
+ public:
+  explicit ClipByValueOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape shape = ctx->InputShape(0);
+    const TensorShape min_shape = ctx->InputShape(1);
+    const TensorShape max_shape = ctx->InputShape(2);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+    auto input = ctx->Input(0);
+    auto min = ctx->Input(1);
+    auto max = ctx->Input(2);
+
+    auto shape_error = [&]() -> tensorflow::Status {
+      return errors::InvalidArgument(
+          "clip_value_min and clip_value_max must be either of "
+          "the same shape as input, or a scalar. ",
+          "Input shape: ", shape.DebugString(),
+          " clip_value_min shape: ", min_shape.DebugString(),
+          " clip_value_max shape: ", max_shape.DebugString());
+    };
+
+    if (shape != min_shape) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error());
+      min = builder->Broadcast(min, shape.dim_sizes());
+    }
+    if (shape != max_shape) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error());
+      max = builder->Broadcast(max, shape.dim_sizes());
+    }
+    ctx->SetOutput(0, builder->Clamp(min, input, max));
+  }
+};
+
+REGISTER_XLA_OP(Name("ClipByValue"), ClipByValueOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 81cea6d376d02c956a5257c5475fe5c10b83deb9..c0ee0c9c2ea849a692bee70bba36d32335eed9b5 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -58,7 +58,7 @@ xla::ComputationDataHandle CreateExpandedZero(
 
 // Create a mask for depthwise convolution that will make a normal convolution
 // produce the same results as a depthwise convolution. For a [2, 2, 3, 2]
-// depthwise filter this returns a [2, 2, 3, 6] tesnsor
+// depthwise filter this returns a [2, 2, 3, 6] tensor
 //   1 1 0 0 0 0   1 1 0 0 0 0
 //   0 0 1 1 0 0   0 0 1 1 0 0
 //   0 0 0 0 1 1   0 0 0 0 1 1
@@ -166,6 +166,10 @@ xla::ComputationDataHandle ContractFilterForDepthwiseBackprop(
       CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
       CreateExpandedZero(filter_shape, dtype, builder));
   return builder->Reshape(
+      // This reduce does not need inputs to be converted with
+      // XlaHelpers::SumAccumulationType() since the ExpandedFilterMask with
+      // ExpandedZero guarantees that only one element is non zero, so there
+      // cannot be accumulated precision error.
       builder->Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
                       *ctx->GetOrCreateAdd(dtype),
                       {expanded_filter_shape.dims() - 2}),
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..800ef5ab98d70ad822c6efffb33db28b46ae50fe
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicUpdateSliceOp : public XlaOpKernel {
+ public:
+  explicit DynamicUpdateSliceOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(3) << "DynamicUpdateSliceOp::Compile";
+
+    DataType index_type = input_type(2);
+    OP_REQUIRES(ctx, index_type == DT_INT32 || index_type == DT_INT64,
+                errors::InvalidArgument("index must be int32 or int64"));
+
+    const TensorShape input_shape = ctx->InputShape(0);
+    const TensorShape update_shape = ctx->InputShape(1);
+    const TensorShape index_shape = ctx->InputShape(2);
+
+    OP_REQUIRES(
+        ctx,
+        TensorShapeUtils::IsVector(index_shape) &&
+            index_shape.num_elements() == input_shape.dims(),
+        errors::InvalidArgument("index must be a vector with length equal to "
+                                "the number of input dimensions"));
+    OP_REQUIRES(
+        ctx, input_shape.dims() == update_shape.dims(),
+        errors::InvalidArgument("input and update must have the same rank,"
+                                " input shape is ",
+                                input_shape.DebugString(), "; update shape is ",
+                                update_shape.DebugString()));
+
+    xla::ComputationDataHandle result = ctx->builder()->DynamicUpdateSlice(
+        ctx->Input(0), ctx->Input(1), ctx->Input(2));
+    ctx->SetOutput(0, result);
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaDynamicUpdateSlice"), DynamicUpdateSliceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
index 453a32c494b42e9922bc35fc526f3306530054fd..99470d70e709ddb5593c5eaae061bb897befc168 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -247,6 +247,8 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
     const TensorShape gradient_shape = ctx->InputShape(0);
     xla::ComputationDataHandle input = ctx->Input(1);
     const DataType data_type = ctx->input_type(1);
+    const DataType accumulation_type =
+        XlaHelpers::SumAccumulationType(data_type);
     xla::ComputationDataHandle input_min = ctx->Input(2);
     xla::ComputationDataHandle input_max = ctx->Input(3);
 
@@ -265,15 +267,23 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
     ctx->SetOutput(0, output0);
 
     xla::ComputationDataHandle below_min = b->Lt(input, nudged_input_min);
+    xla::ComputationDataHandle select1 = b->Select(below_min, gradient, zeroes);
+    xla::ComputationDataHandle reduce1 = b->ReduceAll(
+        XlaHelpers::ConvertElementType(b, select1, accumulation_type),
+        XlaHelpers::Zero(b, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type));
     xla::ComputationDataHandle output1 =
-        b->ReduceAll(b->Select(below_min, gradient, zeroes), zero,
-                     *ctx->GetOrCreateAdd(data_type));
+        XlaHelpers::ConvertElementType(b, reduce1, data_type);
     ctx->SetOutput(1, output1);
 
     xla::ComputationDataHandle above_max = b->Gt(input, nudged_input_max);
+    xla::ComputationDataHandle select2 = b->Select(above_max, gradient, zeroes);
+    xla::ComputationDataHandle reduce2 = b->ReduceAll(
+        XlaHelpers::ConvertElementType(b, select2, accumulation_type),
+        XlaHelpers::Zero(b, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type));
     xla::ComputationDataHandle output2 =
-        b->ReduceAll(b->Select(above_max, gradient, zeroes), zero,
-                     *ctx->GetOrCreateAdd(data_type));
+        XlaHelpers::ConvertElementType(b, reduce2, data_type);
     ctx->SetOutput(2, output2);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 7945c05af40df21a798a2cff51fe7f8e935793f6..0b79cb0916ee8a7d0e26c5dc12557639336f8ab1 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -29,52 +29,54 @@ namespace tensorflow {
 Status XlaGather(const xla::ComputationDataHandle& input,
                  const TensorShape& input_shape,
                  const xla::ComputationDataHandle& indices,
-                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
-                 DataType dtype, DataType index_type,
+                 const TensorShape& indices_shape, int64 axis,
+                 bool indices_are_nd, DataType dtype, DataType index_type,
                  xla::ComputationBuilder* builder,
                  xla::ComputationDataHandle* gather_output) {
+  // There is no deep reason why we need this precondition, but this is the only
+  // combination that is used and tested today.
+  CHECK(!indices_are_nd || axis == 0);
+
+  // num_index_dims is the number of components in each index in the indices
+  // tensor.
+  //
+  // num_indices is the total number of (n dimensional or scalar) indices in the
+  // indices tensor.
+  //
   // If the indices are N-dimensional, then the minor dimension of indices
   // should be of size N and correspond to the N indices.
-  int64 num_index_dims = 1;
+  int64 num_index_dims;
+  int64 num_indices = 1;
   if (indices_are_nd) {
     CHECK_GE(indices_shape.dims(), 1);
     num_index_dims = indices_shape.dim_size(indices_shape.dims() - 1);
-    indices_shape.RemoveLastDims(1);
+    for (int64 i = 0, e = indices_shape.dims() - 1; i < e; i++) {
+      num_indices *= indices_shape.dim_size(i);
+    }
+  } else {
+    num_index_dims = 1;
+    for (int64 i = 0, e = indices_shape.dims(); i < e; i++) {
+      num_indices *= indices_shape.dim_size(i);
+    }
   }
 
-  // Although the indices Tensor is flattened into rank 1 during the lookup,
-  // and each scalar entry is used as an index into the first dimension of the
-  // input, the output is returned with shape:
-  // input.shape[:axis] + indices.shape + input.shape[axis+1:]
-
-  const int64 num_indices = indices_shape.num_elements();
-  TensorShape input_shape_pre_axis(input_shape);
-  input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
-  TensorShape input_shape_post_axis(input_shape);
-  input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
-  // Each slice of the input tensor has shape:
-  // [<input_shape_pre_axis>, 1, ..., 1, <input shape_post_axis>]
-  TensorShape slice_shape(input_shape);
-  for (int64 i = 0; i < num_index_dims; ++i) {
-    slice_shape.set_dim(axis + i, 1);
-  }
+  // Degenerate case: empty indices.
+  if (num_indices == 0) {
+    TensorShape input_shape_pre_axis{input_shape};
+    input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
+    TensorShape input_shape_post_axis{input_shape};
+    input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
 
-  TensorShape loop_out_shape;
-  loop_out_shape.AppendShape(input_shape_pre_axis);
-  loop_out_shape.AddDim(num_indices);
-  loop_out_shape.AppendShape(input_shape_post_axis);
-  TensorShape loop_out_slice_shape;
-  loop_out_slice_shape.AppendShape(input_shape_pre_axis);
-  loop_out_slice_shape.AddDim(1);
-  loop_out_slice_shape.AppendShape(input_shape_post_axis);
+    TensorShape indices_shape_no_index_vectors{indices_shape};
+    if (indices_are_nd) {
+      indices_shape_no_index_vectors.RemoveLastDims(1);
+    }
 
-  TensorShape out_shape;
-  out_shape.AppendShape(input_shape_pre_axis);
-  out_shape.AppendShape(indices_shape);
-  out_shape.AppendShape(input_shape_post_axis);
+    TensorShape out_shape;
+    out_shape.AppendShape(input_shape_pre_axis);
+    out_shape.AppendShape(indices_shape_no_index_vectors);
+    out_shape.AppendShape(input_shape_post_axis);
 
-  // Degenerate case: empty indices.
-  if (num_indices == 0) {
     *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
                                         out_shape.dim_sizes());
     return Status::OK();
@@ -88,76 +90,61 @@ Status XlaGather(const xla::ComputationDataHandle& input,
     }
   }
 
-  // Flatten the major dimensions of indices into a single dimension for ease of
-  // iteration. If there is an axis dimension, we must leave it alone.
-  std::vector<int64> flat_indices_shape = {num_indices};
-  if (indices_are_nd) {
-    flat_indices_shape.push_back(num_index_dims);
-  }
-
-  // Specify the shape of the loop-carried Tensor tuple.
-
-  // Construct the initial values of the loop-carried Tensors.
-  auto flat_indices = builder->Reshape(indices, flat_indices_shape);
-  auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
-                                     loop_out_shape.dim_sizes());
-  auto init = {input, flat_indices, init_out};
-
-  // Construct the while loop body's function. The implementation of gather is:
-  // for i in range(num_indices):
-  //   index = dynamic-slice(indices, i)
-  //   xi = dynamic-slice(input, index)
-  //   output = dynamic-update-slice(output, xi, i)
-  auto body_fn = [&](xla::ComputationDataHandle i,
-                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
-                     xla::ComputationBuilder* bodyb) {
-    auto input = loop_vars[0];
-    auto indices = loop_vars[1];
-    auto output = loop_vars[2];
-
-    auto zero_index = XlaHelpers::Zero(bodyb, index_type);
-
-    // Slice the i-th index from the indices array.
-    xla::ComputationDataHandle index;
-    auto indices_offset = bodyb->Reshape(i, {1});
-    if (indices_are_nd) {
-      // Slice out the entire nd index, if applicable.
-      indices_offset = bodyb->Pad(indices_offset, zero_index,
-                                  xla::MakeEdgePaddingConfig({{0, 1}}));
-      index = bodyb->DynamicSlice(indices, indices_offset, {1, num_index_dims});
-      index = bodyb->Collapse(index, {0, 1});
+  // Example of a 1-D gather with axis=1, pulling two [3,1] tensors out of a
+  // tensor of shape [3,3].
+  //
+  //  operand = s32[3,3] parameter(0)
+  //  indices = s32[2] parameter(1)
+  //  gather = s32[3,2] gather(operand, indices),
+  //       output_window_dims={0},
+  //       elided_window_dims={1},
+  //       gather_dims_to_operand_dims={1},
+  //       index_vector_dim=1,
+  //       window_bounds={3, 1}
+  //
+  //
+  // Example of an N-D gather pulling out slices of shape [1,1,2] out of a
+  // tensor of shape [3,3,2].
+  //
+  //  operand = s32[3,3,2] parameter(0)
+  //  indices = s32[2,2] parameter(1)
+  //  gather = s32[2,2] gather(operand, indices),
+  //       output_window_dims={1},
+  //       elided_window_dims={0,1},
+  //       gather_dims_to_operand_dims={0,1},
+  //       index_vector_dim=0,
+  //       window_bounds={1,1,2}
+
+  xla::GatherDimensionNumbers dim_numbers;
+  std::vector<int64> window_bounds;
+  window_bounds.reserve(input_shape.dims());
+  for (int64 i = 0; i < input_shape.dims(); i++) {
+    int64 window_bound;
+    if (axis <= i && i < (axis + num_index_dims)) {
+      dim_numbers.add_elided_window_dims(i);
+      window_bound = 1;
     } else {
-      index = bodyb->DynamicSlice(indices, indices_offset, {1});
+      window_bound = input_shape.dim_size(i);
+    }
+
+    window_bounds.push_back(window_bound);
+
+    if (i < axis) {
+      dim_numbers.add_output_window_dims(i);
+    } else if (i >= (axis + num_index_dims)) {
+      int64 indices_rank =
+          indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims();
+      dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims);
     }
+  }
+
+  dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1)
+                                                  : indices_shape.dims());
+  for (int64 i = axis; i < axis + num_index_dims; i++) {
+    dim_numbers.add_gather_dims_to_operand_dims(i);
+  }
 
-    // Slice the corresponding data from the input array.
-    auto start_indices = bodyb->Pad(
-        index, zero_index,
-        xla::MakeEdgePaddingConfig(
-            {{input_shape_pre_axis.dims(), input_shape_post_axis.dims()}}));
-    auto slice_i = bodyb->Reshape(
-        bodyb->DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
-        loop_out_slice_shape.dim_sizes());
-
-    // Construct the index into the output Tensor 0, ..., <index>, 0, ...
-    std::vector<xla::ComputationDataHandle> out_index_vals(
-        loop_out_shape.dims(), bodyb->Reshape(zero_index, {1}));
-    out_index_vals[input_shape_pre_axis.dims()] = bodyb->Reshape(i, {1});
-    auto out_index = bodyb->ConcatInDim(out_index_vals, 0);
-
-    // Update the output Tensor
-    auto updated_output = bodyb->DynamicUpdateSlice(output, slice_i, out_index);
-
-    return std::vector<xla::ComputationDataHandle>{input, indices,
-                                                   updated_output};
-  };
-
-  // Construct the While loop, extract and reshape the output.
-  xla::PrimitiveType ptype;
-  TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(index_type, &ptype));
-  TF_ASSIGN_OR_RETURN(auto outputs, XlaForEachIndex(num_indices, ptype, body_fn,
-                                                    init, "gather", builder));
-  *gather_output = builder->Reshape(outputs[2], out_shape.dim_sizes());
+  *gather_output = builder->Gather(input, indices, dim_numbers, window_bounds);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index bd8b92c22d71fe89ab8951ec79f411feef6505e3..f9376f0eabdc0f0c565eb4b9f86425de96b5aa22 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -36,8 +36,8 @@ namespace tensorflow {
 Status XlaGather(const xla::ComputationDataHandle& input,
                  const TensorShape& input_shape,
                  const xla::ComputationDataHandle& indices,
-                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
-                 DataType dtype, DataType index_type,
+                 const TensorShape& indices_shape, int64 axis,
+                 bool indices_are_nd, DataType dtype, DataType index_type,
                  xla::ComputationBuilder* builder,
                  xla::ComputationDataHandle* gather_output);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index 39af662b638cb9d723118e58fcfc983633fed497..e72200bfbcff20c55ac03030f1afc4bacaabf7ce 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -38,6 +38,7 @@ class IdentityOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp);
 
 REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp);
+REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp);
 REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("StopGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("Snapshot"), IdentityOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eefbe55c815d80a608bdf62d454a69d722adb158
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -0,0 +1,226 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/if_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+XlaIfOp::XlaIfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+  const NameAttrList* name_attr;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("then_branch", &name_attr));
+  then_branch_ = *name_attr;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("else_branch", &name_attr));
+  else_branch_ = *name_attr;
+
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tcond", &cond_type_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_types_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_types_));
+}
+
+// TODO(b/35949885): There is duplication here with the handling of the
+// while_op. Refactor the common code out/rework.
+void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
+  xla::ComputationBuilder* b = ctx->builder();
+
+  OP_REQUIRES(ctx, cond_type_ == DT_BOOL,
+              errors::InvalidArgument(
+                  "Condition argument must be a boolean for XLA compilation"));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(0)),
+              errors::InvalidArgument(
+                  "Condition argument must be a scalar for XLA compilation"));
+
+  VLOG(1) << "Building If: " << input_types_.size() << " inputs";
+
+  std::vector<xla::ComputationDataHandle> inputs(input_types_.size());
+  std::vector<XlaCompiler::Argument> arguments(input_types_.size());
+  for (int i = 0; i < input_types_.size(); ++i) {
+    XlaCompiler::Argument& arg = arguments[i];
+    DataType type = ctx->input_type(i + 1);
+    if (type == DT_RESOURCE) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource));
+
+      arg.initialized = resource->initialized();
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = resource->kind();
+      OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b));
+
+      arg.type = resource->type();
+      arg.shape = resource->shape();
+      OP_REQUIRES(ctx, arg.initialized,
+                  errors::Unimplemented("Uninitialized arguments: ", arg.name));
+      arg.tensor_array_size = resource->tensor_array_size();
+      for (const auto& gradient : resource->tensor_array_gradients()) {
+        arg.tensor_array_gradients.insert(gradient.first);
+      }
+      arg.name = resource->name();
+      VLOG(2) << "Resource " << resource->name()
+              << " type: " << DataTypeString(arg.type)
+              << " shape: " << arg.shape.DebugString()
+              << " initialized: " << arg.initialized;
+    } else {
+      arg.kind = XlaCompiler::Argument::kParameter;
+      arg.type = input_types_[i];
+      arg.shape = ctx->InputShape(i + 1);
+      inputs[i] = ctx->Input(i + 1);
+      VLOG(2) << "Arg type: " << DataTypeString(arg.type)
+              << " shape: " << arg.shape.DebugString();
+    }
+  }
+
+  // Compile both branches of the conditional.
+  XlaCompiler::CompileOptions options;
+  options.use_tuple_arg = true;
+  options.resolve_compile_time_constants = false;
+  options.return_updated_values_for_all_resources = true;
+  options.is_entry_computation = false;
+  XlaCompiler* compiler = ctx->compiler();
+
+  XlaCompiler::CompilationResult then_result;
+  OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, then_branch_,
+                                                arguments, &then_result));
+  XlaCompiler::CompilationResult else_result;
+  OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_,
+                                                arguments, &else_result));
+
+  for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) {
+    for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     ctx->GetResourceInput(update.input_index + 1, &resource));
+      XlaCompiler::Argument& arg = arguments[update.input_index];
+
+      // Add any TensorArray gradients touched by the then/else computation to
+      // the enclosing graph.
+      for (const string& grad_source : update.tensor_array_gradients_accessed) {
+        VLOG(5) << "TensorArray " << resource->name() << " accessed gradient "
+                << grad_source;
+        XlaResource* gradient;
+        OP_REQUIRES_OK(ctx, resource->GetOrCreateTensorArrayGradient(
+                                grad_source, b, &gradient));
+      }
+      // Add all of the TensorArray gradients to the argument. For simplicity,
+      // we always pass all known gradients.
+      for (const auto& gradient : resource->tensor_array_gradients()) {
+        arg.tensor_array_gradients.insert(gradient.first);
+      }
+    }
+  }
+
+  // Check that both branches have identical input shapes.
+  OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1,
+              errors::FailedPrecondition("Expected one input shape"));
+  xla::Shape then_input_shape = then_result.xla_input_shapes[0];
+  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(then_input_shape),
+              errors::FailedPrecondition("Expected tuple shape"));
+  OP_REQUIRES(ctx, else_result.xla_input_shapes.size() == 1,
+              errors::FailedPrecondition("Expected one input shape"));
+  xla::Shape else_input_shape = else_result.xla_input_shapes[0];
+  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(else_input_shape),
+              errors::FailedPrecondition("Expected tuple shape"));
+  OP_REQUIRES(ctx,
+              xla::ShapeUtil::Compatible(then_input_shape, else_input_shape),
+              errors::InvalidArgument(
+                  "Input shapes of then and else branches do not match: ",
+                  xla::ShapeUtil::HumanString(then_input_shape), " vs. ",
+                  xla::ShapeUtil::HumanString(else_input_shape)));
+
+  // Check that both branches have identical output shapes.
+  OP_REQUIRES(
+      ctx,
+      xla::ShapeUtil::Compatible(then_result.xla_output_shape,
+                                 else_result.xla_output_shape),
+      errors::InvalidArgument(
+          "Output shapes of then and else branches do not match: ",
+          xla::ShapeUtil::HumanString(then_result.xla_output_shape), " vs. ",
+          xla::ShapeUtil::HumanString(else_result.xla_output_shape)));
+
+  VLOG(2) << "Input shape: " << xla::ShapeUtil::HumanString(then_input_shape);
+  VLOG(2) << "Output shape: "
+          << xla::ShapeUtil::HumanString(then_result.xla_output_shape);
+
+  // We set return_updated_values_for_all_resources=true and we pass the same
+  // arguments to both computations, so the resource update count must match.
+  OP_REQUIRES(ctx,
+              then_result.resource_updates.size() ==
+                  else_result.resource_updates.size(),
+              errors::FailedPrecondition(
+                  "Different number of resources in then and else branch"));
+  for (int i = 0; i < then_result.resource_updates.size(); ++i) {
+    const auto& lhs = then_result.resource_updates[i];
+    const auto& rhs = else_result.resource_updates[i];
+    bool equal = lhs.input_index == rhs.input_index && lhs.shape == rhs.shape &&
+                 lhs.tensor_array_gradients_accessed ==
+                     rhs.tensor_array_gradients_accessed;
+    OP_REQUIRES(
+        ctx, equal,
+        errors::FailedPrecondition(
+            "Mismatch in resource of then and else branch for resource ", i));
+  }
+
+  xla::ComputationDataHandle outputs =
+      b->Conditional(ctx->Input(0), b->Tuple(inputs), *then_result.computation,
+                     b->Tuple(inputs), *else_result.computation);
+  // Sets non-variable outputs.
+  for (int i = 0; i < output_types_.size(); ++i) {
+    if (ctx->input_type(i) != DT_RESOURCE) {
+      xla::ComputationDataHandle output_handle = b->GetTupleElement(outputs, i);
+      if (VLOG_IS_ON(2)) {
+        LOG(INFO) << "Setting output " << i;
+        auto shape_or = b->GetShape(output_handle);
+        if (shape_or.ok()) {
+          LOG(INFO) << "Shape for output " << i << ": "
+                    << xla::ShapeUtil::HumanString(*shape_or.ValueOrDie());
+        } else {
+          LOG(INFO) << "Shape unknown for output " << i;
+        }
+      }
+      ctx->SetOutput(i, output_handle);
+    }
+  }
+
+  // Updates the values of any resource variables modified by the conditional
+  // bodies.
+  for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) {
+    for (int i = 0; i < result->resource_updates.size(); ++i) {
+      const XlaCompiler::ResourceUpdate& update = result->resource_updates[i];
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     ctx->GetResourceInput(update.input_index + 1, &resource));
+      if (update.modified) {
+        int pos = result->outputs.size() + i;
+        OP_REQUIRES_OK(ctx,
+                       resource->SetFromPack(
+                           arguments[update.input_index].tensor_array_gradients,
+                           b->GetTupleElement(outputs, pos), b));
+      }
+      VLOG(2) << "If variable: pos: " << update.input_index
+              << " name: " << resource->name()
+              << " modified: " << update.modified
+              << " type: " << DataTypeString(update.type)
+              << " shape: " << update.shape.DebugString();
+    }
+  }
+  VLOG(1) << "Done building If";
+}
+
+REGISTER_XLA_OP(Name("XlaIf").AllowResourceTypes(), XlaIfOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.h b/tensorflow/compiler/tf2xla/kernels/if_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9bc98a198a72dcc0594e61971713bf890ce30b6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+
+namespace tensorflow {
+
+// This TensorFlow op provides a functional conditional primitive.
+//
+// The outputs of the then/else branches must agree on the number, types, and
+// shapes of the Tensors carried around the two bodies.
+//
+// Computations in then/else bodies may read from and write to resource
+// variables.
+// Resource variables may be passed as arguments to the then/else function's
+// bodies. The XlaCompiler converts resource variable arguments
+// into parameters to the XLA computation and moves them to the end of the
+// parameter list, and by using the `return_updated_values_for_all_variables`
+// we ensure that all variables that appear in the input also appear at the
+// end of the then/else bodies output. This ensures the then/else bodies output
+// signatures match.
+//
+// It is the user's responsibility to ensure that each non-variable _Arg matches
+// the corresponding _Retval.
+class XlaIfOp : public XlaOpKernel {
+ public:
+  explicit XlaIfOp(OpKernelConstruction* ctx);
+
+  void Compile(XlaOpKernelContext* ctx) override;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaIfOp);
+
+  NameAttrList then_branch_;
+  NameAttrList else_branch_;
+  DataType cond_type_;
+  DataTypeVector input_types_;
+  DataTypeVector output_types_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index f22f384256a8ddd8c05de4a1322aba741dc4d7fd..5eeda79a935e8194a596d322b52add27846d378c 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -180,9 +180,13 @@ class AdjustContrastOpV2 : public XlaOpKernel {
 
     DataType type = context->input_type(0);
 
-    auto output = b->Reduce(input, /*init_value=*/XlaHelpers::Zero(b, type),
-                            /*computation=*/*context->GetOrCreateAdd(type),
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
+    auto converted =
+        XlaHelpers::ConvertElementType(b, input, accumulation_type);
+    auto reduce = b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                            *context->GetOrCreateAdd(accumulation_type),
                             {height_dim, width_dim});
+    auto output = XlaHelpers::ConvertElementType(b, reduce, type);
     output = b->Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
 
     std::vector<int64> broadcast_dims(input_shape.dims() - 2);
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index d096415087e47a73503a06526ab133ac34803c5d..c177f08d9c4687bb13b98a4328bb3960519799c4 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -29,21 +29,22 @@ class L2LossOp : public XlaOpKernel {
   explicit L2LossOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
+    std::vector<int64> dims(ctx->InputShape(0).dims());
+    std::iota(dims.begin(), dims.end(), 0);
 
     DataType dtype = ctx->input_type(0);
-    xla::ComputationBuilder* b = ctx->builder();
-
-    auto zero = XlaHelpers::Zero(b, dtype);
-    auto two = XlaHelpers::IntegerLiteral(b, dtype, 2);
-    const xla::Computation& add = *ctx->GetOrCreateAdd(dtype);
-
-    std::vector<int64> dims(input_shape.dims());
-    std::iota(dims.begin(), dims.end(), 0);
+    xla::ComputationBuilder* const b = ctx->builder();
 
     //  output = sum(t ** 2) / 2
-    auto x = ctx->Input(0);
-    ctx->SetOutput(0, b->Div(b->Reduce(b->Mul(x, x), zero, add, dims), two));
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
+    auto t =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+    auto square = b->Mul(t, t);
+    auto reduce = b->Reduce(square, XlaHelpers::Zero(b, accumulation_type),
+                            *ctx->GetOrCreateAdd(accumulation_type), dims);
+    auto deconverted = XlaHelpers::ConvertElementType(b, reduce, dtype);
+    auto two = XlaHelpers::IntegerLiteral(b, dtype, 2);
+    ctx->SetOutput(0, b->Div(deconverted, two));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
index 759d1a1a2d996d4f5deb1774be7014bb6de30f40..1cfee3070f384af0a7441a9c860c530dd1b42187 100644
--- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
@@ -47,12 +47,17 @@ class LRNOp : public XlaOpKernel {
 
     // We use a window of depth_radius_ * 2 + 1, to account for the current
     // element and a depth_radius_ on either side.
-    auto squared = builder->Mul(input, input);
-    auto sqr_sum = builder->ReduceWindow(
-        squared, XlaHelpers::Zero(builder, input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)),
+    auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
+    auto converted =
+        XlaHelpers::ConvertElementType(builder, input, accumulation_type);
+    auto squared = builder->Mul(converted, converted);
+    auto reduce = builder->ReduceWindow(
+        squared, XlaHelpers::Zero(builder, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
+    auto sqr_sum =
+        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
 
     auto scale = builder->Pow(
         builder->Add(builder->ConstantR0<float>(bias_),
@@ -130,12 +135,17 @@ class LRNGradOp : public XlaOpKernel {
     //     dyi *= out_grads[j]
     //     grads[k] += dyi
 
-    auto squared = builder->Mul(in_image, in_image);
-    auto sqr_sum = builder->ReduceWindow(
-        squared, XlaHelpers::Zero(builder, input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)),
+    auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
+    auto converted =
+        XlaHelpers::ConvertElementType(builder, in_image, accumulation_type);
+    auto squared = builder->Mul(converted, converted);
+    auto reduce = builder->ReduceWindow(
+        squared, XlaHelpers::Zero(builder, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
+    auto sqr_sum =
+        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
 
     auto norm =
         builder->Add(builder->ConstantR0<float>(bias_),
@@ -146,11 +156,15 @@ class LRNGradOp : public XlaOpKernel {
                      builder->Div(out_image, norm)),
         in_grads);
 
-    auto dy_reduced = builder->ReduceWindow(
-        dy, XlaHelpers::Zero(builder, input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)),
+    auto converted_dy =
+        XlaHelpers::ConvertElementType(builder, dy, accumulation_type);
+    auto dy_reduce = builder->ReduceWindow(
+        converted_dy, XlaHelpers::Zero(builder, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
+    auto dy_reduced =
+        XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0));
 
     xla::ComputationDataHandle gradients = builder->Add(
         builder->Mul(in_image, dy_reduced),
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index d4fb5dd4e06c7c70591262c0d63a91c383a2a6e0..5f635dd1bc6122cfcac8163baafd95b13f157715 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -35,8 +35,11 @@ namespace {
 // Superclass of pooling ops.
 class PoolingOp : public XlaOpKernel {
  public:
-  PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
-      : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
+  PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims,
+            const DataType reduction_type)
+      : XlaOpKernel(ctx),
+        num_spatial_dims_(num_spatial_dims),
+        reduction_type_(reduction_type) {
     if (ctx->num_inputs() == 1) {
       std::vector<int32> ksize_int;
       std::vector<int32> stride_int;
@@ -63,12 +66,10 @@ class PoolingOp : public XlaOpKernel {
   int num_dims() const { return num_spatial_dims_ + 2; }
 
   // Method that builds an initial value to use in reductions.
-  virtual xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b,
-                                               DataType data_type) = 0;
+  virtual xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) = 0;
 
   // The reduction operation to apply to each window.
-  virtual const xla::Computation* Reduction(XlaOpKernelContext* ctx,
-                                            DataType dtype) = 0;
+  virtual const xla::Computation* Reduction(XlaOpKernelContext* ctx) = 0;
 
   // A post-processing operation to apply on the outputs of the ReduceWindow.
   virtual xla::ComputationDataHandle PostProcessOutput(
@@ -76,9 +77,6 @@ class PoolingOp : public XlaOpKernel {
       DataType dtype, const TensorShape& input_shape) = 0;
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationDataHandle input = ctx->Input(0);
-    const TensorShape input_shape = ctx->InputShape(0);
-
     std::vector<int64> ksize = ksize_;
     std::vector<int64> stride = stride_;
     if (ctx->num_inputs() != 1) {
@@ -106,16 +104,20 @@ class PoolingOp : public XlaOpKernel {
       stride.clear();
       OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride));
     }
+    const TensorShape input_shape = ctx->InputShape(0);
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
                                         " dimensions"));
 
-    const DataType type = input_type(0);
-    xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow(
-        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize,
-        stride, padding_);
-    ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape));
+    xla::ComputationBuilder* const b = ctx->builder();
+    auto input =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), reduction_type_);
+    auto reduce = ctx->builder()->ReduceWindow(
+        input, InitValue(b), *Reduction(ctx), ksize, stride, padding_);
+    auto pooled = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
+    ctx->SetOutput(0,
+                   PostProcessOutput(ctx, pooled, input_type(0), input_shape));
   }
 
  protected:
@@ -124,21 +126,21 @@ class PoolingOp : public XlaOpKernel {
   std::vector<int64> stride_;
   xla::Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
+  DataType reduction_type_;
 };
 
 class MaxPoolOp : public PoolingOp {
  public:
   MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
-      : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims) {}
+      : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
+                  /*reduction_type=*/ctx->input_type(0)) {}
 
-  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b,
-                                       DataType data_type) override {
-    return XlaHelpers::MinValue(b, data_type);
+  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) override {
+    return XlaHelpers::MinValue(b, reduction_type_);
   }
 
-  const xla::Computation* Reduction(XlaOpKernelContext* ctx,
-                                    DataType dtype) override {
-    return ctx->GetOrCreateMax(dtype);
+  const xla::Computation* Reduction(XlaOpKernelContext* ctx) override {
+    return ctx->GetOrCreateMax(reduction_type_);
   }
 
   xla::ComputationDataHandle PostProcessOutput(
@@ -209,15 +211,17 @@ static xla::ComputationDataHandle AvgPoolDivideByCount(
     }
 
     // Build a matrix of all 1s, with the same width/height as the input.
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
     auto ones = ctx->builder()->Broadcast(
-        XlaHelpers::One(ctx->builder(), dtype), input_dim_sizes);
+        XlaHelpers::One(ctx->builder(), accumulation_type), input_dim_sizes);
 
     // Perform a ReduceWindow with the same window size, strides, and padding
     // to count the number of contributions to each result element.
-    auto counts = ctx->builder()->ReduceWindow(
-        ones, XlaHelpers::Zero(ctx->builder(), dtype),
-        *ctx->GetOrCreateAdd(dtype), window_ksize, window_stride,
+    auto reduce = ctx->builder()->ReduceWindow(
+        ones, XlaHelpers::Zero(ctx->builder(), accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type), window_ksize, window_stride,
         xla::Padding::kSame);
+    auto counts = XlaHelpers::ConvertElementType(ctx->builder(), reduce, dtype);
 
     return ctx->builder()->Div(output, counts, window_dims);
   }
@@ -226,16 +230,16 @@ static xla::ComputationDataHandle AvgPoolDivideByCount(
 class AvgPoolOp : public PoolingOp {
  public:
   AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
-      : PoolingOp(ctx, num_spatial_dims) {}
+      : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
+                  /*reduction_type=*/
+                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
 
-  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b,
-                                       DataType data_type) override {
-    return XlaHelpers::Zero(b, data_type);
+  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) override {
+    return XlaHelpers::Zero(b, reduction_type_);
   }
 
-  const xla::Computation* Reduction(XlaOpKernelContext* ctx,
-                                    DataType dtype) override {
-    return ctx->GetOrCreateAdd(dtype);
+  const xla::Computation* Reduction(XlaOpKernelContext* ctx) override {
+    return ctx->GetOrCreateAdd(reduction_type_);
   }
 
   xla::ComputationDataHandle PostProcessOutput(
@@ -455,14 +459,12 @@ class AvgPoolGradOp : public XlaOpKernel {
                  gradients_shape, filter_shape, out_backprop_shape, stride_,
                  padding_, data_format_, &dims));
 
+    // The input gradients are computed by a convolution of the output gradients
+    // and the filter, with some appropriate padding. See the comment at the top
+    // of conv_grad_ops.h for details.
+    xla::ComputationBuilder* const b = ctx->builder();
     auto out_backprop = ctx->Input(1);
-
-    // The input gradients are computed by a convolution of the output
-    // gradients
-    // and the filter, with some appropriate padding. See the comment at
-    // the top of conv_grad_ops.h for details.
-    DataType dtype = input_type(1);
-
+    auto dtype = input_type(1);
     xla::Padding xla_padding =
         (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
 
@@ -483,17 +485,18 @@ class AvgPoolGradOp : public XlaOpKernel {
       padding->set_interior_padding(dims.spatial_dims[i].stride - 1);
     }
 
-    auto zero = XlaHelpers::Zero(ctx->builder(), dtype);
-    auto padded_gradients =
-        ctx->builder()->Pad(out_backprop_div, zero, padding_config);
+    auto zero = XlaHelpers::Zero(b, dtype);
+    auto padded_gradients = b->Pad(out_backprop_div, zero, padding_config);
 
     // in_backprop = padded_gradients <conv> ones
     std::vector<int64> ones(num_dims(), 1LL);
-    xla::ComputationDataHandle in_backprop = ctx->builder()->ReduceWindow(
-        padded_gradients, zero, *ctx->GetOrCreateAdd(dtype), ksize_,
+    auto accumulation_type = XlaHelpers::SumAccumulationType(dtype);
+    auto in_backprop = b->ReduceWindow(
+        XlaHelpers::ConvertElementType(b, padded_gradients, accumulation_type),
+        XlaHelpers::Zero(b, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type), ksize_,
         /* window_strides=*/ones, xla::Padding::kValid);
-
-    ctx->SetOutput(0, in_backprop);
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, in_backprop, dtype));
   }
 
  protected:
@@ -525,5 +528,172 @@ class AvgPool3DGradOp : public AvgPoolGradOp {
 REGISTER_XLA_OP(Name("AvgPool3DGrad").CompileTimeConstInput("orig_input_shape"),
                 AvgPool3DGradOp);
 
+class MaxPoolGradGradOp : public XlaOpKernel {
+ public:
+  MaxPoolGradGradOp(OpKernelConstruction* ctx, int num_spatial_dims)
+      : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
+    if (ctx->num_inputs() == 3) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
+    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  int num_dims() const { return num_spatial_dims_ + 2; }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    if (ctx->num_inputs() != 3) {
+      OP_REQUIRES(
+          ctx, ctx->num_inputs() == 5,
+          errors::InvalidArgument("Must supply ksize and stride arguments."));
+      const TensorShape ksize_shape = ctx->InputShape(3);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(3, &ksize_));
+
+      const TensorShape stride_shape = ctx->InputShape(4);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(4, &stride_));
+    }
+
+    OP_REQUIRES(ctx, ksize_.size() == num_dims(),
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(ctx, stride_.size() == num_dims(),
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+
+    const TensorShape tensor_in_shape = ctx->InputShape(0);
+    const TensorShape tensor_out_shape = ctx->InputShape(1);
+    const TensorShape out_backprop_shape = ctx->InputShape(2);
+
+    // For maxpooling, tensor_in should have num_dims() dimensions.
+    OP_REQUIRES(ctx, tensor_in_shape.dims() == num_dims(),
+                errors::InvalidArgument("tensor_in must be ", num_dims(),
+                                        "-dimensional"));
+    OP_REQUIRES(ctx, tensor_out_shape.dims() == num_dims(),
+                errors::InvalidArgument("tensor_out must be ", num_dims(),
+                                        "-dimensional"));
+    // For maxpooling, out_backprop should have num_dims() dimensions.
+    OP_REQUIRES(ctx, out_backprop_shape.dims() == num_dims(),
+                errors::InvalidArgument("out_backprop must be ", num_dims(),
+                                        "-dimensional"));
+
+    // What we want to compute:
+    // Given y = MaxPool(x), and xs_grad = MaxPoolGrad(x, y, ys_grad)
+    // MaxPoolGradGrad computes {ys_grad}_grad given x, y, and {xs_grad}_grad.
+    //
+    // In the regular TF op, this amounts to selecting for each window the
+    // incoming backprop value from xs_grad_grad that corresponds to the maximal
+    // value in the corresponding window of x.
+    //
+    // TODO(b/73062247): What we really want is a ReduceWindow with different
+    // arrays for index selection vs return value selection--a select-to-gather.
+    //
+    // Here, we implement a bitwise hack: we use the hi 16 bits of input for
+    // separate max pooling alongside each of the hi and lo 16 bits of
+    // out_backprop packed into 16 lo bits, which we then glue back together at
+    // the end to get a full 32 bits of gradient.
+    //
+    // This could select the wrong backprop value for two x values that are
+    // equally maximal up to the first 16 bits, in which case we are taking the
+    // latter.
+    //
+    // Note that in principle we could use 32 separate maxpools to recover each
+    // of 32 bits of the gradient while preserving 31 bits of input for the max
+    // pooling criteria; here, we just truncate to the first 16 bits of input.
+
+    auto input = ctx->Input(0);
+    auto out_backprop = ctx->Input(2);
+
+    auto b = ctx->builder();
+
+    auto sixteen = b->ConstantR0<uint32>(16);
+    // in (f32) -> round to bf16 -> f32 for correct bitwidth -> 16-high-bit u32
+    auto in_hi = b->BitcastConvertType(
+        b->ConvertElementType(b->ConvertElementType(input, xla::BF16),
+                              xla::F32),
+        xla::U32);
+    auto bp_int = b->BitcastConvertType(out_backprop, xla::U32);
+    auto bp_hi = b->ShiftRightLogical(bp_int, sixteen);
+    auto bp_lo = b->ShiftRightLogical(b->ShiftLeft(bp_int, sixteen), sixteen);
+    auto in_hi_bp_hi = b->Add(in_hi, bp_hi);  // Want an unsigned add.
+    auto in_hi_bp_lo = b->Add(in_hi, bp_lo);  // Want an unsigned add.
+
+    auto init_value = XlaHelpers::MinValue(b, DT_FLOAT);
+    // We will reduce by taking the maximal value up to 16 bits (ignoring the lo
+    // 16 bits of packed-in hi/lo backprop value).
+    auto rb = b->CreateSubBuilder("GreaterOrEqOf_ByFirst16Bits");
+    {
+      // F32 parameters to satisfy lowering type restriction for reduce opcode.
+      const xla::Shape scalar = xla::ShapeUtil::MakeShape(xla::F32, {});
+      auto lhs = rb->Parameter(0, scalar, "lhs");
+      auto rhs = rb->Parameter(1, scalar, "rhs");
+      auto sixteen = rb->ConstantR0<int32>(16);
+      auto lhs_criteria = rb->ShiftLeft(
+          rb->ShiftRightLogical(rb->BitcastConvertType(lhs, xla::S32), sixteen),
+          sixteen);
+      auto rhs_criteria = rb->ShiftLeft(
+          rb->ShiftRightLogical(rb->BitcastConvertType(rhs, xla::S32), sixteen),
+          sixteen);
+      // Must use a F32 comparison, because S32 would not work for negatives.
+      rb->Select(rb->Ge(rb->BitcastConvertType(lhs_criteria, xla::F32),
+                        rb->BitcastConvertType(rhs_criteria, xla::F32)),
+                 lhs, rhs);
+    }
+    auto reduce = rb->BuildAndNoteError();
+    xla::Padding xla_padding =
+        (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
+    auto pooled_hi =
+        b->ReduceWindow(b->BitcastConvertType(in_hi_bp_hi, xla::F32),
+                        init_value, reduce, ksize_, stride_, xla_padding);
+    auto pooled_lo =
+        b->ReduceWindow(b->BitcastConvertType(in_hi_bp_lo, xla::F32),
+                        init_value, reduce, ksize_, stride_, xla_padding);
+    auto grads_hi =
+        b->ShiftLeft(b->BitcastConvertType(pooled_hi, xla::U32), sixteen);
+    auto grads_lo = b->ShiftRightLogical(
+        b->ShiftLeft(b->BitcastConvertType(pooled_lo, xla::U32), sixteen),
+        sixteen);
+    auto grads = b->Add(grads_hi, grads_lo);  // Want an unsigned add.
+
+    xla::PrimitiveType element_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(2), &element_type));
+    ctx->SetOutput(0, b->BitcastConvertType(grads, element_type));
+  }
+
+ protected:
+  const int num_spatial_dims_;
+  std::vector<int64> ksize_;
+  std::vector<int64> stride_;
+  Padding padding_;
+  TensorFormat data_format_ = FORMAT_NHWC;
+};
+
+class MaxPool2DGradGradOp : public MaxPoolGradGradOp {
+ public:
+  explicit MaxPool2DGradGradOp(OpKernelConstruction* ctx)
+      : MaxPoolGradGradOp(ctx, /*num_spatial_dims=*/2) {
+    string data_format;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
+    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
+};
+REGISTER_XLA_OP(Name("MaxPoolGradGrad").TypeConstraint("T", DT_FLOAT),
+                MaxPool2DGradGradOp);
+REGISTER_XLA_OP(Name("MaxPoolGradGradV2")
+                    .TypeConstraint("T", DT_FLOAT)
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DGradGradOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb144bea9e429b7c8bcc3d07f688ed6a254c3be0
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/while_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class ReduceWindowOp : public XlaOpKernel {
+ public:
+  explicit ReduceWindowOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("computation", &computation_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("window_dimensions", &window_dimensions_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("window_strides", &window_strides_));
+    OP_REQUIRES_OK(context, context->GetAttr("padding_low", &padding_low_));
+    OP_REQUIRES_OK(context, context->GetAttr("padding_high", &padding_high_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const DataType dtype = context->input_type(0);
+
+    const int rank = input_shape.dims();
+    OP_REQUIRES(context, rank == window_dimensions_.size(),
+                errors::InvalidArgument(
+                    "The size of window_dimensions must be equal to the input "
+                    "rank (",
+                    window_dimensions_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_strides_.size(),
+                errors::InvalidArgument(
+                    "The size of window_strides must be equal to the input "
+                    "rank (",
+                    window_strides_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_low_.size(),
+                errors::InvalidArgument(
+                    "The size of padding_low must be equal to the input "
+                    "rank (",
+                    padding_low_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_high_.size(),
+                errors::InvalidArgument(
+                    "The size of padding_high must be equal to the input "
+                    "rank (",
+                    padding_high_.size(), " vs. ", rank, ")"));
+
+    xla::ComputationBuilder* builder = context->builder();
+
+    // Build the reducer function.
+    XlaCompiler::Argument reducer_arg;
+    reducer_arg.kind = XlaCompiler::Argument::kParameter;
+    reducer_arg.type = dtype;
+    reducer_arg.shape = TensorShape();
+
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.use_tuple_arg = false;
+    compile_options.resolve_compile_time_constants = false;
+    compile_options.is_entry_computation = false;
+    XlaCompiler::CompilationResult reducer;
+    OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
+                                compile_options, *computation_,
+                                {reducer_arg, reducer_arg}, &reducer));
+
+    xla::Shape scalar_shape;
+    OP_REQUIRES_OK(context,
+                   TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape));
+    OP_REQUIRES(context,
+                xla::ShapeUtil::Compatible(
+                    reducer.xla_output_shape,
+                    xla::ShapeUtil::MakeTupleShape({scalar_shape})),
+                errors::InvalidArgument(
+                    "Invalid output shape of ReduceWindow reducer. Expected ",
+                    xla::ShapeUtil::HumanString(scalar_shape), " got ",
+                    xla::ShapeUtil::HumanString(reducer.xla_output_shape)));
+
+    // Wraps the reducer in a computation that unpacks the output tuple.
+    xla::Computation wrapper;
+    {
+      std::unique_ptr<xla::ComputationBuilder> cb =
+          builder->CreateSubBuilder("wrapper");
+      auto x = cb->Parameter(0, scalar_shape, "x");
+      auto y = cb->Parameter(1, scalar_shape, "y");
+      auto outputs = cb->Call(*reducer.computation, {x, y});
+      cb->GetTupleElement(outputs, 0);
+      xla::StatusOr<xla::Computation> result = cb->Build();
+      OP_REQUIRES_OK(context, result.status());
+      wrapper = std::move(result.ValueOrDie());
+    }
+
+    std::vector<std::pair<int64, int64>> padding(rank);
+    for (int i = 0; i < rank; ++i) {
+      padding[i] = {padding_low_[i], padding_high_[i]};
+    }
+
+    xla::ComputationDataHandle output = builder->ReduceWindowWithGeneralPadding(
+        context->Input(0), context->Input(1), wrapper, window_dimensions_,
+        window_strides_, padding);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  const NameAttrList* computation_;
+  std::vector<int64> window_dimensions_;
+  std::vector<int64> window_strides_;
+  std::vector<int64> padding_low_;
+  std::vector<int64> padding_high_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ReduceWindowOp);
+};
+
+REGISTER_XLA_OP(Name("XlaReduceWindow"), ReduceWindowOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 03b13b2924f4b81c1017804c91d5ffb81c44ea0b..812d258cd1677e18ef49952044126c76a2f55b19 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -27,7 +27,13 @@ namespace {
 
 class SumOp : public XlaReductionOp {
  public:
-  explicit SumOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit SumOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
+  xla::ComputationDataHandle InitialValue(
+      xla::ComputationBuilder* builder) override {
+    return XlaHelpers::Zero(builder, reduction_type_);
+  }
   void BuildReducer(xla::ComputationBuilder* builder,
                     const xla::ComputationDataHandle& scalar_lhs,
                     const xla::ComputationDataHandle& scalar_rhs) override {
@@ -39,11 +45,13 @@ REGISTER_XLA_OP(Name("Sum").CompileTimeConstInput("reduction_indices"), SumOp);
 
 class ProdOp : public XlaReductionOp {
  public:
-  explicit ProdOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit ProdOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
-    return XlaHelpers::One(builder, input_type(0));
+    return XlaHelpers::One(builder, reduction_type_);
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -58,13 +66,12 @@ REGISTER_XLA_OP(Name("Prod").CompileTimeConstInput("reduction_indices"),
 
 class MinOp : public XlaReductionOp {
  public:
-  explicit MinOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit MinOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
-    xla::PrimitiveType type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-    return builder->ConstantLiteral(xla::Literal::MaxValue(type));
+    return XlaHelpers::MaxValue(builder, reduction_type_);
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -78,13 +85,12 @@ REGISTER_XLA_OP(Name("Min").CompileTimeConstInput("reduction_indices"), MinOp);
 
 class MaxOp : public XlaReductionOp {
  public:
-  explicit MaxOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit MaxOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
-    xla::PrimitiveType type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-    return builder->ConstantLiteral(xla::Literal::MinValue(type));
+    return XlaHelpers::MinValue(builder, reduction_type_);
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -98,8 +104,14 @@ REGISTER_XLA_OP(Name("Max").CompileTimeConstInput("reduction_indices"), MaxOp);
 
 class MeanOp : public XlaReductionOp {
  public:
-  explicit MeanOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit MeanOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
 
+  xla::ComputationDataHandle InitialValue(
+      xla::ComputationBuilder* builder) override {
+    return XlaHelpers::Zero(builder, reduction_type_);
+  }
   void BuildReducer(xla::ComputationBuilder* builder,
                     const xla::ComputationDataHandle& scalar_lhs,
                     const xla::ComputationDataHandle& scalar_rhs) override {
@@ -121,7 +133,8 @@ REGISTER_XLA_OP(Name("Mean").CompileTimeConstInput("reduction_indices"),
 
 class AllOp : public XlaReductionOp {
  public:
-  explicit AllOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit AllOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
@@ -139,7 +152,8 @@ REGISTER_XLA_OP(Name("All").CompileTimeConstInput("reduction_indices"), AllOp);
 
 class AnyOp : public XlaReductionOp {
  public:
-  explicit AnyOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit AnyOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 9aca6d8fedf92f176b3b7b40c5961d4a2e557a8a..f3181f0dadc2d3f45abb145e009e2663c10490f0 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -33,12 +33,12 @@ namespace tensorflow {
 // xla::ComputationBuilder.
 class XlaReductionOp : public XlaOpKernel {
  public:
-  explicit XlaReductionOp(OpKernelConstruction* ctx);
+  XlaReductionOp(OpKernelConstruction* ctx, DataType reduction_type);
   ~XlaReductionOp() override {}
 
-  // Return the base case for the reduction. Defaults to zero.
+  // Return the base case for the reduction.
   virtual xla::ComputationDataHandle InitialValue(
-      xla::ComputationBuilder* builder);
+      xla::ComputationBuilder* builder) = 0;
 
   // Implement the (scalar,scalar)->scalar lambda that should be
   // applied to each pair of elements to be reduced. The desired
@@ -63,6 +63,9 @@ class XlaReductionOp : public XlaOpKernel {
  private:
   // True if the number of dimensions should be maintained.
   bool keep_dims_;
+
+ protected:
+  DataType reduction_type_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 4b5d09eb9fd4110cdc4221099ff55767e9132540..64fe765ae9a945c58ea60bc157b1520c83b0d8e7 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -24,19 +24,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
+                               DataType reduction_type)
+    : XlaOpKernel(ctx), reduction_type_(reduction_type) {
   const DataType dt = BaseType(input_type(0));
   OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt}));
 
   OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
 }
 
-// Return the base case for the reduction. Defaults to zero.
-xla::ComputationDataHandle XlaReductionOp::InitialValue(
-    xla::ComputationBuilder* builder) {
-  return XlaHelpers::Zero(builder, input_type(0));
-}
-
 // Unless BuildFinalizer is overridden the reduction has no
 // finalizer.
 xla::ComputationDataHandle XlaReductionOp::BuildFinalizer(
@@ -100,36 +96,26 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
 
   string desc = ctx->op_kernel().name();
 
-  // Call virtual method to get the initial value.
-  const xla::ComputationDataHandle initial = InitialValue(ctx->builder());
+  xla::ComputationBuilder* const b = ctx->builder();
   // Construct the builder for the reduction lambda.
-  xla::ComputationBuilder r(ctx->builder()->client(),
-                            strings::StrCat(desc, "-reduction"));
+  xla::ComputationBuilder r(b->client(), strings::StrCat(desc, "-reduction"));
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-  // Make two scalar parameters of the desired type for the lambda.
-  xla::ComputationDataHandle rx =
-      r.Parameter(0, xla::ShapeUtil::MakeShape(type, {}), "x");
-  xla::ComputationDataHandle ry =
-      r.Parameter(1, xla::ShapeUtil::MakeShape(type, {}), "y");
-
-  auto data = ctx->Input(0);
+  TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type));
 
+  auto data = b->ConvertElementType(ctx->Input(0), type);
+  // Call virtual method to get the initial value.
+  auto initial = b->ConvertElementType(InitialValue(b), type);
+  // Make two scalar parameters of the desired type for the lambda.
+  auto rx = r.Parameter(0, xla::ShapeUtil::MakeShape(type, {}), "x");
+  auto ry = r.Parameter(1, xla::ShapeUtil::MakeShape(type, {}), "y");
   // Call virtual method to build the reduction lambda.
   BuildReducer(&r, rx, ry);
   xla::Computation reduction_computation = r.Build().ConsumeValueOrDie();
-  xla::ComputationDataHandle reduce =
-      ctx->builder()->Reduce(data, initial, reduction_computation, xla_axes);
 
-  xla::ComputationDataHandle finalized =
-      BuildFinalizer(ctx->builder(), reduce, num_elements_reduced);
-
-  xla::ComputationDataHandle result;
-  if (keep_dims_) {
-    result = ctx->builder()->Reshape(finalized, final_shape);
-  } else {
-    result = finalized;
-  }
+  auto reduce = b->Reduce(data, initial, reduction_computation, xla_axes);
+  auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
+  auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced);
+  auto result = keep_dims_ ? b->Reshape(finalized, final_shape) : finalized;
   ctx->SetOutput(0, result);
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index ee4a94164c4a43828eb4feedbfa9d1a9e231ef8f..4cfa28a0ce3d7d1f24196ef6ef2775f840b2bcf1 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -66,7 +66,7 @@ class ScanOp : public XlaOpKernel {
                                 -input_shape.dims(), ", ", input_shape.dims(),
                                 "), but got ", axis));
 
-    DataType dtype = ctx->input_type(0);
+    DataType dtype = XlaHelpers::SumAccumulationType(ctx->input_type(0));
 
     if (input_shape.num_elements() == 0) {
       // Exit early if there is nothing to compute.
@@ -91,7 +91,6 @@ class ScanOp : public XlaOpKernel {
       std::swap(padding[axis].first, padding[axis].second);
     }
 
-    xla::ComputationDataHandle input = ctx->Input(0);
     xla::ComputationDataHandle init;
     const xla::Computation* reducer;
     if (sum_) {
@@ -102,7 +101,10 @@ class ScanOp : public XlaOpKernel {
       reducer = ctx->GetOrCreateMul(dtype);
     }
     auto output = builder->ReduceWindowWithGeneralPadding(
-        ctx->Input(0), init, *reducer, window_dims, window_strides, padding);
+        XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init,
+        *reducer, window_dims, window_strides, padding);
+    output =
+        XlaHelpers::ConvertElementType(builder, output, ctx->input_type(0));
 
     // In exclusive mode, we have computed an extra element containing the sum
     // of all the input elements. Slice off this extra "last" element.
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
index 80d6df6c48b0141734dcee1c2a3c413926931feb..498342a98881df0c6ff50007eacc1d5ef6196b57 100644
--- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -83,7 +83,9 @@ class UnsortedSegmentSum : public XlaOpKernel {
   DataType dtype_;
 };
 
-REGISTER_XLA_OP(Name("UnsortedSegmentSum"), UnsortedSegmentSum);
+REGISTER_XLA_OP(
+    Name("UnsortedSegmentSum").CompileTimeConstInput("num_segments"),
+    UnsortedSegmentSum);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index 5172781c0d05b6682fe92086654e3b86961949ee..d079b89861817a5639ac72b5ee49d76cb4506ae8 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -48,7 +48,7 @@ void SendOp::Compile(XlaOpKernelContext* ctx) {
   ctx->builder()->Send(ctx->Input(0), channel);
 }
 
-REGISTER_XLA_OP(Name("_XLASend"), SendOp);
+REGISTER_XLA_OP(Name("XlaSend"), SendOp);
 
 class RecvOp : public XlaOpKernel {
  public:
@@ -68,7 +68,7 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
   TensorShape tensor_shape;
   DataType dtype;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &tensor_shape));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype));
   OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, tensor_shape, &shape_));
 }
 
@@ -79,7 +79,7 @@ void RecvOp::Compile(XlaOpKernelContext* ctx) {
   ctx->SetOutput(0, ctx->builder()->Recv(shape_, channel));
 }
 
-REGISTER_XLA_OP(Name("_XLARecv"), RecvOp);
+REGISTER_XLA_OP(Name("XlaRecv"), RecvOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index 750a4c2dec8154f97f307978b3d8884271292279..463788b8b461c370a8e7ab4d79a94fc0143b8b45 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace {
@@ -28,7 +29,7 @@ namespace {
 class SoftmaxOp : public XlaOpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -42,9 +43,8 @@ class SoftmaxOp : public XlaOpKernel {
     const DataType type = input_type(0);
     auto logits = ctx->Input(0);
 
-    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationBuilder* const b = ctx->builder();
     const xla::Computation& max_func = *ctx->GetOrCreateMax(type);
-    const xla::Computation& add_func = *ctx->GetOrCreateAdd(type);
 
     // Find the max in each batch, resulting in a tensor of shape [batch]
     auto logits_max =
@@ -52,21 +52,20 @@ class SoftmaxOp : public XlaOpKernel {
     // Subtract the max in batch b from every element in batch b. Broadcasts
     // along the batch dimension.
     auto shifted_logits = b->Sub(logits, logits_max, {kBatchDim});
-    xla::ComputationDataHandle softmax;
-    if (log_) {
-      // softmax = shifted_logits - log(sum(exp(shifted_logits)))
-      auto log_sum_exp =
-          b->Log(b->Reduce(b->Exp(shifted_logits), XlaHelpers::Zero(b, type),
-                           add_func, {kClassDim}));
-      softmax = b->Sub(shifted_logits, log_sum_exp, {kBatchDim});
-    } else {
-      // softmax = exp(shifted_logits) / sum(exp(shifted_logits))
-      auto exp_shifted = b->Exp(shifted_logits);
-      auto sum_exp = b->Reduce(exp_shifted, XlaHelpers::Zero(b, type), add_func,
-                               {kClassDim});
-      softmax = b->Div(exp_shifted, sum_exp, {kBatchDim});
-    }
-
+    auto exp_shifted = b->Exp(shifted_logits);
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
+    auto converted =
+        XlaHelpers::ConvertElementType(b, exp_shifted, accumulation_type);
+    auto reduce =
+        b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                  *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+    auto sum = XlaHelpers::ConvertElementType(b, reduce, type);
+    auto softmax =
+        log_
+            // softmax = shifted_logits - log(sum(exp(shifted_logits)))
+            ? b->Sub(shifted_logits, b->Log(sum), {kBatchDim})
+            // softmax = exp(shifted_logits) / sum(exp(shifted_logits))
+            : b->Div(exp_shifted, sum, {kBatchDim});
     ctx->SetOutput(0, softmax);
   }
 
@@ -82,7 +81,6 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type,
                        const xla::ComputationDataHandle& logits,
                        const xla::ComputationDataHandle& labels) {
   const xla::Computation& max_func = *ctx->GetOrCreateMax(type);
-  const xla::Computation& add_func = *ctx->GetOrCreateAdd(type);
 
   const int kBatchDim = 0;
   const int kClassDim = 1;
@@ -100,8 +98,12 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type,
   auto exp_shifted_logits = b->Exp(shifted_logits);
 
   // sum_{class} (exp(logits - max_logits))
-  auto sum_exp = b->Reduce(exp_shifted_logits, XlaHelpers::Zero(b, type),
-                           add_func, {kClassDim});
+  const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
+  auto converted =
+      XlaHelpers::ConvertElementType(b, exp_shifted_logits, accumulation_type);
+  auto reduce = b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                          *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+  auto sum_exp = XlaHelpers::ConvertElementType(b, reduce, type);
 
   // log(sum(exp(logits - max_logits)))
   auto log_sum_exp = b->Log(sum_exp);
@@ -110,9 +112,13 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type,
   //    ((logits - max_logits) - log(sum(exp(logits - max_logits)))))
   // along classes
   // (The subtraction broadcasts along the batch dimension.)
-  xla::ComputationDataHandle loss = b->Reduce(
-      b->Mul(b->Neg(labels), b->Sub(shifted_logits, log_sum_exp, {kBatchDim})),
-      XlaHelpers::Zero(b, type), add_func, {kClassDim});
+  auto sub = b->Sub(shifted_logits, log_sum_exp, {kBatchDim});
+  auto mul = b->Mul(b->Neg(labels), sub);
+  auto sum =
+      b->Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type),
+                XlaHelpers::Zero(b, accumulation_type),
+                *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+  auto loss = XlaHelpers::ConvertElementType(b, sum, type);
 
   // backprop: prob - labels, where
   //   prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 79c435c90a1f57250be90c2c2523bf3d7d231461..43c15e753805352875034dfd2c70a2a1ed9a4114 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -111,27 +111,24 @@ class SplitVOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const int32 num_split = num_outputs();
+    const TensorShape input_shape = ctx->InputShape(0);
     const TensorShape index_shape = ctx->InputShape(2);
-    xla::Literal literal_index;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &literal_index));
 
-    int32 split_dim;
-    OP_REQUIRES(ctx, index_shape.dims() == 0,
-                errors::InvalidArgument("split_dim input to Split Op must be a "
-                                        "scalar"));
-    split_dim = literal_index.Get<int>({});
+    int64 split_dim_orig;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &split_dim_orig));
+    int64 split_dim = split_dim_orig < 0 ? split_dim_orig + input_shape.dims()
+                                         : split_dim_orig;
+    OP_REQUIRES(ctx, 0 <= split_dim && split_dim < input_shape.dims(),
+                errors::InvalidArgument("-input rank(-", input_shape.dims(),
+                                        ") <= split_dim < input rank (",
+                                        input_shape.dims(), "), but got ",
+                                        split_dim_orig));
 
     xla::ComputationDataHandle input = ctx->Input(0);
-    const TensorShape input_shape = ctx->InputShape(0);
 
     OP_REQUIRES(ctx, input_shape.dims() > 0,
                 errors::InvalidArgument("Can't split a 0 dimensional input"));
 
-    OP_REQUIRES(
-        ctx, 0 <= split_dim && split_dim < input_shape.dims(),
-        errors::InvalidArgument("0 <= split_dim < number of input dimensions (",
-                                input_shape.dims(), "), but got ", split_dim));
-
     OP_REQUIRES(
         ctx, num_split > 0,
         errors::InvalidArgument(
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index b10880de77e6b9811008076cd4a959c284e558d1..5bb773d97fc5ce90dabceeefd5c29d916597f5ff 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -239,6 +239,7 @@ class StatelessRandomUniformOp : public XlaOpKernel {
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
 REGISTER_XLA_OP(Name("StatelessRandomUniform")
+                    .CompileTimeConstInput("shape")
                     .TypeConstraint("dtype", DT_FLOAT)
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessRandomUniformOp);
@@ -272,6 +273,7 @@ class StatelessRandomNormalOp : public XlaOpKernel {
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
 REGISTER_XLA_OP(Name("StatelessRandomNormal")
+                    .CompileTimeConstInput("shape")
                     .TypeConstraint("dtype", DT_FLOAT)
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessRandomNormalOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 91c169428c7a88a8d107a97445aeea999946e3e9..6204aa4e27000fddec7f5b82b2198d37956f6aba 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -77,13 +77,14 @@ class StridedSliceOp : public XlaOpKernel {
     for (int i = 0; i < begin.size(); ++i) {
       if (strides[i] > 0) {
         slice_begin.push_back(begin[i]);
-        slice_end.push_back(end[i]);
+        slice_end.push_back(std::max(end[i], begin[i]));
         slice_strides.push_back(strides[i]);
       } else {
         // Negative stride: swap begin and end, add 1 because the interval
         // is semi-open, and mark the dimension to be reversed.
         slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1);
-        slice_end.push_back(input_shape.dim_size(i) - end[i] - 1);
+        slice_end.push_back(std::max(input_shape.dim_size(i) - end[i] - 1,
+                                     input_shape.dim_size(i) - begin[i] - 1));
         slice_strides.push_back(-strides[i]);
         dimensions_to_reverse.push_back(i);
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index 0c5ad9e5255ffc3dfcfb83335060ae833937b3ce..7cb47f908d4ff43f455f1e77c53cd3cc956579ee 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -60,11 +60,13 @@ XLAJIT_MAKE_UNARY(
                     b->Add(XlaHelpers::One(b, input_type(0)), x))));
 
 // acosh(x) = log(x + sqrt(x^2 - 1))
+//          = log(x + sqrt((x+1)*(x-1)))
 XLAJIT_MAKE_UNARY(
     Acosh,
-    b->Log(b->Add(x, b->Pow(b->Sub(b->Mul(x, x),
-                                   XlaHelpers::One(b, input_type(0))),
-                            XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+    b->Log(b->Add(x,
+                  b->Pow(b->Mul(b->Add(x, XlaHelpers::One(b, input_type(0))),
+                                b->Sub(x, XlaHelpers::One(b, input_type(0)))),
+                         XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
 
 // asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
 XLAJIT_MAKE_UNARY(
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 488fda74bf7b5c1d66f8d706a1be3cc1fc29a492..12fdfb605d667bf2cc96e79e84954b89229a7340 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -39,6 +39,7 @@ cc_library(
         ":batch_dot",
         ":triangular_solve",
         ":util",
+        ":while_loop",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -90,6 +91,7 @@ cc_library(
 xla_test(
     name = "triangular_solve_test",
     srcs = ["triangular_solve_test.cc"],
+    tags = ["noasan"],  # sometimes times out, http://b/78650012
     deps = [
         ":triangular_solve",
         "//tensorflow/compiler/xla:array2d",
@@ -126,6 +128,30 @@ cc_library(
     ],
 )
 
+xla_test(
+    name = "util_test",
+    srcs = ["util_test.cc"],
+    deps = [
+        ":batch_dot",
+        ":util",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "while_loop",
     srcs = ["while_loop.cc"],
@@ -140,17 +166,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index e795701181dd80a2ff544743d513bffd52fd2399..203365e2ab07e0da1abfac5452a8ec41a4ddf406 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
 #include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -31,68 +32,122 @@ namespace tensorflow {
 
 namespace {
 
+// The Cholesky–Banachiewicz algorithm. See
+// https://en.wikipedia.org/wiki/Cholesky_decomposition#The_Cholesky–Banachiewicz_and_Cholesky–Crout_algorithms
+// for a description.
+//
 // def cholesky_unblocked(a):
 //   assert len(a.shape) == 2 and a.shape[-2] == a.shape[-1]
 //   n = a.shape[-2]
 //   l = np.zeros_like(a)
 //   for j in xrange(n):
-//     r = l[..., j, :j]
-//     l[..., j, j] = np.sqrt(a[..., j, j] - np.dot(r, r))
-//     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j],
-//         np.transpose(r))) / l[..., j, j]
+//     row = l[..., j, :j]
+//     row_t = np.swapaxes(row, -1, -2)
+//     l[..., j, j] = np.sqrt(a[..., j, j] - np.dot(row, row_t))
+//     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) /
+//                       l[..., j, j]
 //   return l
 xla::StatusOr<xla::ComputationDataHandle> CholeskyUnblocked(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(a));
-  xla::ComputationDataHandle l = Zeros(builder, *shape);
-  const int64 n = xla::ShapeUtil::GetDimension(*shape, -2);
-  for (int j = 0; j < n; ++j) {
-    // Picture of block structure:
-    // ...   \
-    //        \
-    // -- r -- d
-    //         |\
-    //    B    c \
-    //         |  \
-    //         |  ...
-    //
-    //         ^
-    //      column j
-    TF_ASSIGN_OR_RETURN(auto d,
-                        SliceInMinorDims(builder, a, {j, j}, {j + 1, j + 1}));
-    TF_ASSIGN_OR_RETURN(auto c,
-                        SliceInMinorDims(builder, a, {j + 1, j}, {n, j + 1}));
-    xla::ComputationDataHandle new_d_squared = d;
-    xla::ComputationDataHandle br;
-    if (j > 0) {
-      TF_ASSIGN_OR_RETURN(auto r,
-                          SliceInMinorDims(builder, l, {j, 0}, {j + 1, j}));
-      TF_ASSIGN_OR_RETURN(auto b,
-                          SliceInMinorDims(builder, l, {j + 1, 0}, {n, j}));
-      TF_ASSIGN_OR_RETURN(auto r_squared,
-                          BatchDot(builder, r, r, /*transpose_x=*/false,
-                                   /*transpose_y=*/true, /*conjugate_x=*/false,
-                                   /*conjugate_y=*/false));
-      new_d_squared = builder->Sub(new_d_squared, r_squared);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  const int n_dims = xla::ShapeUtil::Rank(*a_shape);
+  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
+  gtl::ArraySlice<int64> major_dims(xla::AsInt64Slice(a_shape->dimensions()),
+                                    /*pos=*/0,
+                                    /*len=*/n_dims - 2);
 
-      TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false,
-                                       /*transpose_y=*/true,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false));
-    }
-    auto new_d_inv = builder->Pow(
-        new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5));
-    auto new_d = builder->Mul(new_d_inv, new_d_squared);
-    TF_ASSIGN_OR_RETURN(l, UpdateSliceInMinorDims(builder, l, new_d, {j, j}));
+  xla::ComputationDataHandle l = Zeros(builder, *a_shape);
 
-    if (j > 0) {
-      c = builder->Sub(c, br);
+  // Construct the for loop body to iterate over rows.
+  auto body_fn = [&](xla::ComputationDataHandle i,
+                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
+                     xla::ComputationBuilder* body_builder)
+      -> xla::StatusOr<std::vector<xla::ComputationDataHandle>> {
+    xla::Shape col_shape;
+    xla::Shape row_shape;
+    for (int64 d : major_dims) {
+      row_shape.add_dimensions(d);
+      col_shape.add_dimensions(d);
     }
-    auto new_c = builder->Mul(c, new_d_inv);
-    TF_ASSIGN_OR_RETURN(l,
-                        UpdateSliceInMinorDims(builder, l, new_c, {j + 1, j}));
-  }
-  return l;
+    row_shape.add_dimensions(1);
+    row_shape.add_dimensions(n);
+    row_shape.set_element_type(a_shape->element_type());
+    auto mask_zeros_row = Zeros(body_builder, row_shape);
+
+    col_shape.add_dimensions(n);
+    col_shape.add_dimensions(1);
+    col_shape.set_element_type(a_shape->element_type());
+    auto mask_zeros_col = Zeros(body_builder, col_shape);
+
+    std::vector<int32> mask_vector(n);
+    std::iota(mask_vector.begin(), mask_vector.end(), 0);
+    auto mask_range = body_builder->ConstantR1<int32>(mask_vector);
+    auto mask_range_row = body_builder->Broadcast(
+        body_builder->Reshape(mask_range, {0}, {1, n}), major_dims);
+    auto mask_range_col = body_builder->Broadcast(
+        body_builder->Reshape(mask_range, {0}, {n, 1}), major_dims);
+    auto body_a = loop_vars[0];
+    auto body_l = loop_vars[1];
+
+    // row = l[..., i, :i]
+    // select the whole i-th row, then mask out all columns past i-1
+    auto zero = body_builder->ConstantR0<int32>(0);
+    TF_ASSIGN_OR_RETURN(auto l_i, DynamicSliceInMinorDims(body_builder, body_l,
+                                                          {i, zero}, {1, n}));
+    auto row = body_builder->Select(body_builder->Ge(mask_range_row, i),
+                                    mask_zeros_row, l_i);
+    // a[..., i, i]
+    TF_ASSIGN_OR_RETURN(auto a_ii, DynamicSliceInMinorDims(body_builder, body_a,
+                                                           {i, i}, {1, 1}));
+    // np.dot(row, np.swapaxes(row, -1, -2))
+    xla::ComputationDataHandle diag_dot;
+    TF_ASSIGN_OR_RETURN(diag_dot, BatchDot(body_builder, row, row,
+                                           /*transpose_x=*/false,
+                                           /*transpose_y=*/true));
+    // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
+    //                                              np.swapaxes(row, -1, -2)))
+    auto l_ii = body_builder->Pow(
+        body_builder->Sub(a_ii, diag_dot),
+        FloatLiteral(body_builder, a_shape->element_type(), 0.5));
+
+    // a[..., i+1:, i]
+    auto ip1 = body_builder->Add(i, body_builder->ConstantR0<int32>(1));
+    // select the whole i-th column, then mask out all rows above i+1
+    TF_ASSIGN_OR_RETURN(
+        auto a_0i, DynamicSliceInMinorDims(body_builder, body_a, {i}, {1}));
+    auto a_ip1i = body_builder->Select(body_builder->Le(mask_range_col, i),
+                                       mask_zeros_col, a_0i);
+
+    // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) /
+    //                   l[..., i, i]
+    // The columns in [i, n] are zeroed out in `row`, so we just have to
+    // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i],
+    // r.T)
+    TF_ASSIGN_OR_RETURN(auto dot, BatchDot(body_builder, body_l, row,
+                                           /*transpose_x=*/false,
+                                           /*transpose_y=*/true));
+    // np.dot(l[..., i+1:, :i], r.T)
+    auto dot_ip1 = body_builder->Select(body_builder->Le(mask_range_col, i),
+                                        mask_zeros_col, dot);
+
+    auto col_update =
+        body_builder->Div(body_builder->Sub(a_ip1i, dot_ip1), l_ii);
+    TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims(
+                                    body_builder, body_l, col_update, {i}));
+    // Assign the diagonal after the rest of the column because otherwise the
+    // column assign will wrap around and overwrite the diagonal assign.
+    TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims(
+                                    body_builder, body_l, l_ii, {i, i}));
+
+    return std::vector<xla::ComputationDataHandle>{body_a, body_l};
+  };
+
+  TF_ASSIGN_OR_RETURN(
+      auto cholesky_while,
+      XlaForEachIndex(n, xla::S32, body_fn, {a, l}, "unblocked", builder));
+
+  return cholesky_while[1];
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index e083a383be4be0d1b556b63214fe5f70323b4149..17da8d8b22d107701ce768ac945c1404df6d47e8 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -29,7 +29,7 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): check for negative values on the diagonal and return an
 // error, instead of silently yielding NaNs.
-// TODO(mattjj): handle the complex Hermitian case
+// TODO(znado): handle the complex Hermitian case
 xla::StatusOr<xla::ComputationDataHandle> Cholesky(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
     int64 block_size = 256);
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 7f72a6073df218b9e2bd4cc0c0b5bb10b5cd4b84..9bf5821b54abe3994085ad72043ff143077824c5 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -83,15 +83,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
         block_size);
   }
 
-  // Returns [b1, b2, ... , bn, indices[0], indices[1]].
-  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
-    std::vector<int64> output(ndims);
-    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
-    std::copy(indices.begin(), indices.end(),
-              output.begin() + batch_dimensions.size());
-    return output;
-  };
-
   // Applies a complex conjugation operation if `a` is complex and `conjugate_a`
   // is true, otherwise returns its argument.
   auto maybe_conj = [&](xla::ComputationBuilder* builder,
@@ -108,11 +99,12 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
       std::unique_ptr<xla::ComputationBuilder> sub = builder->CreateSubBuilder(
           tensorflow::strings::StrCat("trsm_base_", k));
 
-      auto a_param =
-          sub->Parameter(0,
-                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims({k, k})),
-                         "a");
+      auto a_param = sub->Parameter(
+          0,
+          xla::ShapeUtil::MakeShape(
+              b_shape->element_type(),
+              PrependMajorDims(sub.get(), batch_dimensions, {k, k})),
+          "a");
 
       std::array<int64, 2> b_lastd;
       if (left_side) {
@@ -120,11 +112,12 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
       } else {
         b_lastd = {m, k};
       }
-      auto b_param =
-          sub->Parameter(1,
-                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims(b_lastd)),
-                         "b");
+      auto b_param = sub->Parameter(
+          1,
+          xla::ShapeUtil::MakeShape(
+              b_shape->element_type(),
+              PrependMajorDims(sub.get(), batch_dimensions, b_lastd)),
+          "b");
 
       // We use a left-looking subroutine on the block diagonal in some common
       // cases, while falling back to a recursive call in unsupported cases. The
@@ -380,14 +373,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     batch_dimensions.push_back(a_size);
   }
 
-  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
-    std::vector<int64> output(ndims);
-    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
-    std::copy(indices.begin(), indices.end(),
-              output.begin() + batch_dimensions.size());
-    return output;
-  };
-
   auto maybe_conj = [&](xla::ComputationBuilder* builder,
                         xla::ComputationDataHandle x) {
     auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
@@ -479,30 +464,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     auto body_b = bodyb->GetTupleElement(input_tuple, 3);
     auto zero = bodyb->ConstantR0<int32>(0);
 
-    // Set up some helper functions.
-    auto prepend_zeros = [&](std::array<xla::ComputationDataHandle, 2> starts) {
-      auto zero = bodyb->Reshape(bodyb->ConstantR0<int32>(0), {1});
-      std::vector<xla::ComputationDataHandle> padded_starts(ndims, zero);
-      padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1});
-      padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1});
-      return bodyb->ConcatInDim(padded_starts, 0);
-    };
-
-    auto dynamic_slice = [&](xla::ComputationDataHandle x,
-                             std::array<xla::ComputationDataHandle, 2> starts,
-                             std::array<int64, 2> sizes) {
-      auto padded_starts = prepend_zeros(starts);
-      auto padded_sizes = prepend_batch_dims(sizes);
-      return bodyb->DynamicSlice(x, padded_starts, padded_sizes);
-    };
-
-    auto update = [&](xla::ComputationDataHandle x,
-                      xla::ComputationDataHandle update,
-                      std::array<xla::ComputationDataHandle, 2> starts) {
-      auto padded_starts = prepend_zeros(starts);
-      return bodyb->DynamicUpdateSlice(x, update, padded_starts);
-    };
-
     // We'd like to implement this:
     //   if transpose_a:
     //     a_row = T(a[..., i+1:, i:i+1])
@@ -516,22 +477,29 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     // all zeros and use that as zero-padding (doing unnecessary FLOPs).
     xla::ComputationDataHandle a_row;
     if (transpose_a) {
-      a_row = dynamic_slice(body_a, {zero, i}, {m, 1});
+      TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                         {zero, i}, {m, 1}));
     } else {
-      a_row = dynamic_slice(body_a, {i, zero}, {1, m});
+      TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                         {i, zero}, {1, m}));
     }
     TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out,
                                                 /*transpose_x=*/transpose_a,
                                                 /*transpose_y=*/false,
                                                 /*conjugate_x=*/conjugate_a,
                                                 /*conjugate_y=*/false));
-    auto result_row =
-        bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update);
+    TF_ASSIGN_OR_RETURN(
+        auto result_row_slice,
+        DynamicSliceInMinorDims(bodyb.get(), body_b, {i, zero}, {1, n}));
+    auto result_row = bodyb->Sub(result_row_slice, b_update);
 
     // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
-    auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1});
+    TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                            {i, i}, {1, 1}));
     auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt));
-    body_out = update(body_out, div_result, {i, zero});
+    TF_ASSIGN_OR_RETURN(body_out,
+                        DynamicUpdateSliceInMinorDims(bodyb.get(), body_out,
+                                                      div_result, {i, zero}));
 
     // if transpose_a:
     //   return (i - 1, body_out, a, b)
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index f579669bbd852b514e021ce71d635f8ce5e4fe4d..31d823ca336039f691f2c16e37028c0de98b1ee5 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -140,13 +140,47 @@ xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
   return builder->Slice(x, padded_start, padded_end, strides);
 }
 
+std::vector<int64> PrependMajorDims(xla::ComputationBuilder* builder,
+                                    const gtl::ArraySlice<int64>& major_dims,
+                                    const gtl::ArraySlice<int64>& indices) {
+  std::vector<int64> output(indices.size() + major_dims.size());
+  std::copy(major_dims.begin(), major_dims.end(), output.begin());
+  std::copy(indices.begin(), indices.end(), output.begin() + major_dims.size());
+  return output;
+}
+
+xla::StatusOr<xla::ComputationDataHandle> DynamicSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts,
+    const gtl::ArraySlice<int64>& sizes) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  int64 n_minor_dims = starts.size();
+  TF_RET_CHECK(n_minor_dims == sizes.size());
+  TF_RET_CHECK(n_minor_dims <= n_dims);
+  gtl::ArraySlice<int64> major_dims(xla::AsInt64Slice(shape->dimensions()),
+                                    /*pos=*/0,
+                                    /*len=*/n_dims - sizes.size());
+  TF_ASSIGN_OR_RETURN(auto padded_starts,
+                      PrependZerosInMajorDims(builder, x, starts));
+  auto padded_sizes = PrependMajorDims(builder, major_dims, sizes);
+  return builder->DynamicSlice(x, padded_starts, padded_sizes);
+}
+
 xla::StatusOr<xla::ComputationDataHandle> UpdateSlice(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start) {
   // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
   std::vector<int32> start_as_int32(start.begin(), start.end());
-  return builder->DynamicUpdateSlice(
-      x, update, builder->ConstantR1<int32>(start_as_int32));
+  auto start_constant = builder->ConstantR1<int32>(start_as_int32);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> start_constant_shape,
+                      builder->GetShape(start_constant));
+  const int64 start_length =
+      xla::ShapeUtil::GetDimension(*start_constant_shape, -1);
+  TF_RET_CHECK(start_length == n_dims);
+  return builder->DynamicUpdateSlice(x, update, start_constant);
 }
 
 xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
@@ -162,6 +196,29 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
   return UpdateSlice(builder, x, update, padded_start);
 }
 
+xla::StatusOr<xla::ComputationDataHandle> DynamicUpdateSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update,
+    const std::vector<xla::ComputationDataHandle>& starts) {
+  TF_ASSIGN_OR_RETURN(auto padded_starts,
+                      PrependZerosInMajorDims(builder, x, starts));
+  return builder->DynamicUpdateSlice(x, update, padded_starts);
+}
+
+xla::StatusOr<xla::ComputationDataHandle> PrependZerosInMajorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  auto zero = builder->Reshape(builder->ConstantR0<int32>(0), {1});
+  std::vector<xla::ComputationDataHandle> padded_starts(n_dims, zero);
+  for (int i = 0; i < starts.size(); ++i) {
+    padded_starts[n_dims - starts.size() + i] =
+        builder->Reshape(starts[i], {1});
+  }
+  return builder->ConcatInDim(padded_starts, 0);
+}
+
 xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 51f8baaf00bd8fd25baa1a87be8cb0089dfb22b5..b684123f1363cff9e6ac4314cc3a8ae7630cbdf3 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -32,16 +32,39 @@ xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
 xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
                                         xla::PrimitiveType type, double value);
 
+// Makes a 1D tensor [0, ..., x, y] from two tensors x and y with zeros
+// prepended until the array is length n_dims.
+xla::ComputationDataHandle PrependZerosInMajorDims(
+    xla::ComputationBuilder* builder,
+    gtl::ArraySlice<xla::ComputationDataHandle> starts);
+
 // Returns a integer scalar constant of 'type' with 'value'.
 // If 'type' is complex, returns a real value with zero imaginary component.
 xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder,
                                           xla::PrimitiveType type, int64 value);
 
+// Builds a vector of zeros of length rank(x) with the last two values being
+// those in `starts`.
+xla::StatusOr<xla::ComputationDataHandle> PrependZerosInMajorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts);
+
 // Performs a slice in the minor dimensions of a Tensor.
 xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     gtl::ArraySlice<int64> start, gtl::ArraySlice<int64> end);
 
+// Builds a 1-d vector out of a concatenation of `major_dims` and `starts`.
+std::vector<int64> PrependMajorDims(xla::ComputationBuilder* builder,
+                                    const gtl::ArraySlice<int64>& major_dims,
+                                    const gtl::ArraySlice<int64>& indices);
+
+// Performs a dynamic slice in the minor dimensions of a Tensor.
+xla::StatusOr<xla::ComputationDataHandle> DynamicSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts,
+    const gtl::ArraySlice<int64>& sizes);
+
 // Updates a slice of 'x', i.e.,
 // x[start[0], ..., start[n]] = update
 xla::StatusOr<xla::ComputationDataHandle> UpdateSlice(
@@ -54,6 +77,11 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
 
+xla::StatusOr<xla::ComputationDataHandle> DynamicUpdateSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update,
+    const std::vector<xla::ComputationDataHandle>& starts);
+
 // Transposes a stack of matrices `x` by swapping the last two dimensions.
 xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x);
diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6bd33af2e42a4ab93a22528fd49ef53c46bb479
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/util_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+using UtilTest = xla::ClientLibraryTestBase;
+using UtilLeftLookingTest = xla::ClientLibraryTestBase;
+
+xla::Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<float> AValsFull() {
+  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+xla::Array3D<float> BatchedAValsFull() {
+  return {{
+              {2, 0, 1, 2},
+              {3, 6, 0, 1},
+              {4, 7, 9, 0},
+              {5, 8, 10, 11},
+          },
+          {
+              {16, 24, 8, 12},
+              {24, 61, 82, 48},
+              {8, 82, 456, 106},
+              {12, 48, 106, 62},
+          }};
+}
+
+XLA_TEST_F(UtilTest, Simple2dLookup) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, x, y;
+  auto a_data = CreateR2Parameter<float>(BValsRight(), 0, "a", &builder, &a);
+  auto x_data = CreateR0Parameter<int>(2, 1, "x", &builder, &x);
+  auto y_data = CreateR0Parameter<int>(1, 2, "y", &builder, &y);
+  auto result = DynamicSliceInMinorDims(&builder, a, {x, y}, {1, 1});
+  TF_ASSERT_OK(result.status());
+
+  ComputeAndCompareR2<float>(&builder, {{10}},
+                             {a_data.get(), x_data.get(), y_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(UtilTest, Simple3dLookup) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto index_data = CreateR0Parameter<int>(1, 1, "index", &builder, &index);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto l_index,
+      DynamicSliceInMinorDims(&builder, a,
+                              {index, builder.ConstantR0<int32>(0)}, {1, 4}));
+
+  ComputeAndCompareR3<float>(&builder, {{{3, 6, 0, 1}}, {{24, 61, 82, 48}}},
+                             {a_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b, x, y;
+  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>({{9, 1, -10}}, 1, "b", &builder, &b);
+  auto x_data = CreateR0Parameter<int>(2, 2, "x", &builder, &x);
+  auto y_data = CreateR0Parameter<int>(1, 3, "y", &builder, &y);
+
+  auto result = DynamicUpdateSliceInMinorDims(&builder, a, b, {x, y});
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected(
+      {{{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 9, 1, -10}, {5, 8, 10, 11}}});
+
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
+}
+
+XLA_TEST_F(UtilTest, RowBatchDot) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  int n = 4;
+
+  xla::ComputationDataHandle a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto l_index,
+      DynamicSliceInMinorDims(&builder, a,
+                              {index, builder.ConstantR0<int32>(0)}, {1, n}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto dot, BatchDot(&builder, l_index, row,
+                         /*transpose_x=*/false, /*transpose_y=*/true));
+
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
index 86c02ac2e65c12d3527c4022df0cc603e522ef7a..495d9c60780b0a728e8dbfb4537d33d92b4bb5b7 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -54,7 +54,6 @@ xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaWhileLoop(
         auto result,
         condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
                            cond_builder.get()));
-    TF_RETURN_IF_ERROR(cond_builder->SetReturnValue(result));
   }
   TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
 
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 98f72b3792eb147f5a1847c5e1ecef18bccbca5f..bb9168fa358154f3db9dab87bacc9bf28dd16406 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -7,17 +7,13 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 cc_library(
-    name = "functional_ops",
-    srcs = ["functional_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework",
+    name = "xla_ops",
+    srcs = [
+        "dynamic_slice_ops.cc",
+        "functional_ops.cc",
+        "reduce_window_op.cc",
+        "sendrecv_ops.cc",
     ],
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "sendrecv_ops",
-    srcs = ["sendrecv_ops.cc"],
     deps = [
         "//tensorflow/core:framework",
     ],
@@ -25,31 +21,9 @@ cc_library(
 )
 
 tf_gen_op_wrapper_py(
-    name = "gen_functional_ops",
-    out = "gen_functional_ops.py",
-    deps = [
-        ":functional_ops",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "gen_sendrecv_ops",
-    out = "gen_sendrecv_ops.py",
+    name = "gen_xla_ops",
+    out = "gen_xla_ops.py",
     deps = [
-        ":sendrecv_ops",
+        ":xla_ops",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6c0edbb889b1751ac9d9d47d0c9534b543196ff
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("XlaDynamicUpdateSlice")
+    .Input("input: T")
+    .Input("update: T")
+    .Input("indices: Tindices")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Wraps the XLA DynamicUpdateSlice operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#dynamicupdateslice
+.
+
+XlaDynamicUpdateSlice generates a result which is the value of the `input`
+operand, with a slice update overwritten at `indices`. The shape of `update`
+determines the shape of the sub-array of the result which is updated. The shape
+of indices must be rank == 1, with dimension size equal to the rank of `input`.
+
+Handling of out-of-bounds slice indices is implementation-defined.
+
+input: A `Tensor` of type T.
+indices: A vector of indices into `input`. Must have length equal to the rank of
+  `input`.
+update: A `Tensor` of type T. Same rank as `input`.
+output: A `Tensor` of type T.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9af982adc090ea78c711fd4656ba429c53b18c9
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("XlaReduceWindow")
+    .Input("input: T")
+    .Input("init_value: T")
+    .Attr("T: numbertype")
+    .Attr("computation: func")
+    .Attr("window_dimensions: list(int)")
+    .Attr("window_strides: list(int)")
+    .Attr("padding_low: list(int)")
+    .Attr("padding_high: list(int)")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Wraps the XLA ReduceWindow operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
+
+input: the input tensor
+init_value: a scalar representing the initial value for the reduction
+computation: a reducer function to apply
+window_dimensions: the shape of the window
+window_strides: the inter-window strides
+padding_low: the padding to apply at the start of each input dimensions
+padding_high: the padding to apply at the end of each input dimension.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
index 4b41c16a8b3fdc0c3412c76d29d3ec2b7bdfd0aa..7ec7b50e905a6cbdecea4543dcb87322b5a7e844 100644
--- a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
@@ -18,22 +18,24 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("_XLASend")
+REGISTER_OP("XlaSend")
     .Input("tensor: T")
     .Attr("T: type")
     .Attr("tensor_name: string")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Sends the named tensor to another XLA computation.
+Sends the named tensor to another XLA computation. Wraps the XLA Send operator
+documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#send .
 
 tensor: The tensor to send.
-tensor_name: The name of the tensor to send.
+tensor_name: A string key that identifies the channel.
 )doc");
 
-REGISTER_OP("_XLARecv")
-    .Output("tensor: T")
-    .Attr("T: type")
+REGISTER_OP("XlaRecv")
+    .Output("tensor: dtype")
+    .Attr("dtype: type")
     .Attr("tensor_name: string")
     .Attr("shape: shape")
     .SetIsStateful()
@@ -46,11 +48,14 @@ REGISTER_OP("_XLARecv")
       return Status::OK();
     })
     .Doc(R"doc(
-Receives the named tensor from another XLA computation.
+Receives the named tensor from another XLA computation. Wraps the XLA Recv
+operator documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#recv .
 
 tensor: The tensor to receive.
-tensor_name: The name of the tensor to receive.
-shape: The shape of the input tensor.
+dtype: The type of the tensor.
+tensor_name: A string key that identifies the channel.
+shape: The shape of the tensor.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index f0a2ef0651ff6115bd201a3b1c34b3c061a22a3d..42b6292f79ffddd155c05758a1420a2a583eb0c6 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -22,3 +22,11 @@ tf_py_clif_cc(
         "//tensorflow/compiler/tf2xla:xla_compiler",
     ],
 )
+
+py_library(
+    name = "xla",
+    srcs = ["xla.py"],
+    deps = [
+        "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ce65bec950fdfd38c3ca5bc62ac745ef8ca4a7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental library that exposes XLA operations directly in TensorFlow.
+
+It is sometimes useful to be able to build HLO programs directly from
+TensorFlow. This file provides Tensorflow operators that map as closely as
+possible to HLO operators.
+
+There is no promise of backward or forward compatibility for operators defined
+in this module.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tf2xla.ops import gen_xla_ops
+
+# TODO(phawkins): provide wrappers for all XLA operators.
+
+dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
+
+
+def reduce_window(operand,
+                  init,
+                  reducer,
+                  window_dimensions,
+                  window_strides=None,
+                  padding=None,
+                  name=None):
+  """Wraps the XLA ReduceWindow operator.
+
+  ReduceWindow is documented at
+  https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
+
+  Args:
+    operand: the input tensor
+    init: a scalar tensor representing the initial value for the reduction
+    reducer: a reduction function that combines a pair of scalars.
+    window_dimensions: shape of the window, as a list of integers
+    window_strides: inter-window strides, as a list of integers. Optional;
+      if omitted, defaults to strides of 1.
+    padding: padding to apply to 'operand'. List of (low, high) pairs of
+      integers that specify the padding to apply before and after each
+      dimension. Optional; if omitted, defaults to no padding.
+    name: the operator name, or None.
+  Returns:
+    A tensor that represents the output of the reduce_window operator.
+  """
+  window_strides = window_strides or [1] * len(window_dimensions)
+  padding = padding or [(0, 0)] * len(window_dimensions)
+  padding_low = [x for (x, _) in padding]
+  padding_high = [y for (_, y) in padding]
+  return gen_xla_ops.xla_reduce_window(
+      operand,
+      init,
+      reducer,
+      window_dimensions,
+      window_strides,
+      padding_low,
+      padding_high,
+      name=name)
+
+
+recv = gen_xla_ops.xla_recv
+send = gen_xla_ops.xla_send
+
+while_loop = gen_xla_ops.xla_while
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
index 1a0e09758f7cc6714793300c6ece14093a8ad246..5759c72af301785f3ca1110b58eeb2fe7dead713 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -65,8 +66,8 @@ ParseShardingFromDevice(
   if (explicit_sharding.has_value()) {
     return explicit_sharding;
   } else if (!parsed_device.has_type || !parsed_device.has_id ||
-             !StringPiece(parsed_device.type)
-                  .contains(kDeviceSuffixReplicatedCore)) {
+             !str_util::StrContains(parsed_device.type,
+                                    kDeviceSuffixReplicatedCore)) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   } else {
     const int core = parsed_device.id;
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index a9978e697b091715ce120f0d18fdddd259e08b32..b813668a9edd3a704a9dca1eaa588c1eced6ac31 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -90,6 +90,11 @@ TEST(ConvertGraphDefToXla, Sum) {
   TF_EXPECT_OK(result_or.status());
   std::unique_ptr<xla::Literal> result = std::move(result_or.ValueOrDie());
   EXPECT_EQ("(s32[]) (\n42\n)", result->ToString());
+
+  config.mutable_feed(0)->mutable_id()->set_output_index(
+      123); /* invalid output_index */
+  EXPECT_TRUE(errors::IsInvalidArgument(
+      ConvertGraphDefToXla(graph_def, config, client, &computation)));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index f428a194328935fec1210ea96245344de859e611..7ec85aa3cdec622cae509f45c5ba7740222025f9 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -151,8 +151,15 @@ Status AddPlaceholdersForFeeds(
       Status status;
       Node* feed_node = g.AddNode(gd.node(0), &status);
       TF_RETURN_IF_ERROR(status);
-      info.data_type =
-          BaseType(feed_node->output_type(info.feed->id().output_index()));
+
+      if (info.feed->id().output_index() < feed_node->num_outputs()) {
+        info.data_type =
+            BaseType(feed_node->output_type(info.feed->id().output_index()));
+      } else {
+        return errors::InvalidArgument(
+            "Invalid output_index ", info.feed->id().output_index(),
+            " for feed node ", info.feed->id().node_name());
+      }
     }
   }
 
@@ -281,4 +288,13 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
   return Status::OK();
 }
 
+void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
+                                   KernelDef* kdef) {
+  for (KernelDef::AttrConstraint& constraint : *kdef->mutable_constraint()) {
+    if (constraint.name() == name) {
+      constraint.mutable_allowed_values()->mutable_list()->add_type(dtype);
+    }
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index e5fba8ede7745febbb42c572a7b52247213afc95..745beb39c1d917cd0d1cd219536ee26a96253ec9 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -51,6 +52,10 @@ string TensorIdToString(const tf2xla::TensorId& id);
 // edges are considered.
 Status SetNodeShardingFromNeighbors(Node* n, bool out_edges);
 
+// Add an allowed data type to the AttrConstraint with the given name.
+void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
+                                   KernelDef* kdef);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index ed10d80609641b090cf78bf2e17364fe2fa89c31..ae51446204baf14dc03fc6305641048dbf3872b0 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -33,7 +34,7 @@ namespace {
 
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 15bba46ac62a97592656942afc767a303c9b97f3..c0e996768491a6315c21021ce874b8a11557de6e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -365,6 +365,13 @@ Status BuildComputation(
               return a->arg_num() < b->arg_num();
             });
 
+  // Attach a common operator name as metadata. This has no semantic effect — it
+  // merely makes the HLO graph more readable when visualized via TensorBoard,
+  // since TensorBoard forms groups out of operators with similar names.
+  xla::OpMetadata retval_metadata;
+  retval_metadata.set_op_name("XLA_Retvals");
+  builder->SetOpMetadata(retval_metadata);
+
   for (const XlaResource* resource : arg_resources) {
     const XlaCompiler::Argument& arg = args[resource->arg_num()];
     const int core = arg_cores[resource->arg_num()];
@@ -412,6 +419,8 @@ Status BuildComputation(
 
   // Builds the XLA computation.
   builder->Tuple(elems);
+  builder->ClearOpMetadata();
+
   xla::StatusOr<xla::Computation> computation_status = builder->Build();
   if (!computation_status.ok()) {
     return computation_status.status();
@@ -514,6 +523,13 @@ Status XlaCompiler::BuildArguments(
     }
   }
 
+  // Attach a common operator name as metadata. This has no semantic effect — it
+  // merely makes the HLO graph more readable when visualized via TensorBoard,
+  // since TensorBoard forms groups out of operators with similar names.
+  xla::OpMetadata arg_metadata;
+  arg_metadata.set_op_name("XLA_Args");
+  builder->SetOpMetadata(arg_metadata);
+
   // Build parameter handles for non-constant arguments.
   std::vector<xla::ComputationDataHandle> arg_handles(input_mapping->size());
   if (use_tuple_arg) {
@@ -552,6 +568,8 @@ Status XlaCompiler::BuildArguments(
     }
   }
 
+  builder->ClearOpMetadata();
+
   // Fill in the handles in non-constant arguments.
   VLOG(2) << "XLA computation inputs:";
   for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
@@ -582,6 +600,48 @@ Status XlaCompiler::BuildArguments(
   return Status::OK();
 }
 
+Status XlaCompiler::CompileSingleOp(
+    const XlaCompiler::CompileOptions& options, string const& name,
+    OpKernelContext* ctx, const std::vector<XlaCompiler::Argument>& args,
+    CompilationResult* result) {
+  // TODO(b/74182462): We implement this by creating a new dummy Graph including
+  // _Arg nodes, and let CompileGraph walk it. This could be optimized.
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  Status status;
+  // First create the actual node we care about computing.
+  Node* main_node = graph->AddNode(ctx->op_kernel().def(), &status);
+  TF_RETURN_IF_ERROR(status);
+
+  // Create dummy _Arg nodes. Link these to `node` and also via a control
+  // dependency edge to the _SOURCE node.
+  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+    Node* node;
+    string name = strings::StrCat(ctx->op_kernel().name(), "_", i, "_arg");
+    Status status = NodeBuilder(name, "_Arg")
+                        .ControlInput(graph->source_node())
+                        .Attr("T", ctx->input_dtype(i))
+                        .Attr("index", i)
+                        .Finalize(graph.get(), &node);
+    TF_RETURN_IF_ERROR(status);
+    graph->AddEdge(node, 0, main_node, i);
+  }
+
+  // Similarly with return values, create dummy _Retval nodes fed by `node`.
+  for (int64 i = 0; i < ctx->num_outputs(); ++i) {
+    Node* node;
+    string name = strings::StrCat(ctx->op_kernel().name(), "_", i, "_retval");
+    Status status = NodeBuilder(name, "_Retval")
+                        .Input(main_node, i)
+                        .Attr("T", ctx->expected_output_dtype(i))
+                        .Attr("index", i)
+                        .Finalize(graph.get(), &node);
+    TF_RETURN_IF_ERROR(status);
+  }
+
+  return CompileGraph(options, name, std::move(graph), args, result);
+}
+
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  string const& name,
                                  std::unique_ptr<Graph> graph,
@@ -656,6 +716,14 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   VLOG(2) << "XLA output shape: "
           << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
+  // Copy the host transfer metadata to the result.
+  for (const auto& send : host_compute_sends_) {
+    *result->host_compute_metadata.add_device_to_host() = send.second;
+  }
+  for (const auto& recv : host_compute_recvs_) {
+    *result->host_compute_metadata.add_host_to_device() = recv.second;
+  }
+
   // Tensorflow expects a major-to-minor order of results.
   xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
 
@@ -690,4 +758,84 @@ Status XlaCompiler::GetChannelHandle(const string& key,
   return Status::OK();
 }
 
+namespace {
+
+void SetTransfer(const string& key, gtl::ArraySlice<DataType> types,
+                 gtl::ArraySlice<TensorShape> shapes,
+                 tf2xla::HostTransferMetadata* transfer) {
+  transfer->set_key(key);
+  CHECK(types.size() == shapes.size());
+  for (int i = 0; i < types.size(); ++i) {
+    tf2xla::TensorMetadata* metadata = transfer->add_metadata();
+    metadata->set_type(types[i]);
+    shapes[i].AsProto(metadata->mutable_shape());
+  }
+}
+
+}  // namespace
+
+Status XlaCompiler::SetDeviceToHostMetadata(
+    const string& key, gtl::ArraySlice<DataType> types,
+    gtl::ArraySlice<TensorShape> shapes) {
+  if (host_compute_sends_.find(key) != host_compute_sends_.end()) {
+    return errors::InvalidArgument(
+        "Duplicate calls to SetDeviceToHostMetadata with key ", key);
+  }
+  tf2xla::HostTransferMetadata& transfer = host_compute_sends_[key];
+  SetTransfer(key, types, shapes, &transfer);
+  return Status::OK();
+}
+
+Status XlaCompiler::GetDeviceToHostShapes(
+    const string& key, std::vector<TensorShape>* shapes) const {
+  const auto iter = host_compute_sends_.find(key);
+  if (iter == host_compute_sends_.end()) {
+    return errors::InvalidArgument(
+        "No host compute send shapes registered for key ", key);
+  }
+  shapes->clear();
+  for (int i = 0; i < iter->second.metadata_size(); ++i) {
+    TensorShape shape(iter->second.metadata(i).shape());
+    shapes->push_back(shape);
+  }
+  return Status::OK();
+}
+
+Status XlaCompiler::SetHostToDeviceMetadata(
+    const string& key, gtl::ArraySlice<DataType> types,
+    gtl::ArraySlice<TensorShape> shapes) {
+  if (host_compute_recvs_.find(key) != host_compute_sends_.end()) {
+    return errors::InvalidArgument(
+        "Duplicate calls to SetHostToDeviceMetadata with key ", key);
+  }
+  tf2xla::HostTransferMetadata& transfer = host_compute_recvs_[key];
+  SetTransfer(key, types, shapes, &transfer);
+  return Status::OK();
+}
+
+Status XlaCompiler::GetHostComputeControlDependency(
+    const string& host_compute_name, xla::ComputationDataHandle* handle) {
+  const auto iter = host_compute_control_output_.find(host_compute_name);
+  if (iter == host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "No registered control handle for host compute Op '", host_compute_name,
+        "'");
+  } else {
+    *handle = iter->second;
+  }
+  return Status::OK();
+}
+
+Status XlaCompiler::SetHostComputeControlDependency(
+    const string& host_compute_name, const xla::ComputationDataHandle& handle) {
+  if (host_compute_control_output_.find(host_compute_name) !=
+      host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "Duplicate control handles registered for for host compute Op ",
+        host_compute_name);
+  }
+  host_compute_control_output_[host_compute_name] = handle;
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index c4449bc4be06daff856eff70c6d89be6ddbcf0ee..8f564f35ec81765e8998513dfd4805d221200c6c 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILER_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILER_H_
 
+#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device.h"
@@ -216,6 +217,10 @@ class XlaCompiler {
     // containing both constant and non-constant results.
     std::vector<OutputDescription> outputs;
 
+    // TensorFlow shapes and types of sends/recvs from HostCompute Ops to their
+    // matching RecvAtHost/SendFromHost Ops in the outer graph.
+    tf2xla::HostComputeMetadata host_compute_metadata;
+
     // Resources whose values were updated by the computation, ordered
     // by return value position. Resource updates follow the non-constant
     // results in the outputs of XLA computation.
@@ -284,6 +289,14 @@ class XlaCompiler {
                       const std::vector<Argument>& args,
                       CompilationResult* result);
 
+  // Compiles a single Op, given by an OpKernelContext, into an
+  // xla::Computation. Similar to CompileFunction but takes a single Op as
+  // input.
+  Status CompileSingleOp(const CompileOptions& options, string const& name,
+                         OpKernelContext* ctx,
+                         const std::vector<Argument>& args,
+                         CompilationResult* result);
+
   // Returns the shape of the XLA parameter for an argument 'arg'.
   // See the class comment for more details about the argument passing
   // convention.
@@ -296,6 +309,39 @@ class XlaCompiler {
   // same XlaCompiler.
   Status GetChannelHandle(const string& key, xla::ChannelHandle* channel);
 
+  // Sets the shapes and types for the device to host transfer associated with
+  // 'key'.
+  Status SetDeviceToHostMetadata(const string& key,
+                                 gtl::ArraySlice<DataType> types,
+                                 gtl::ArraySlice<TensorShape> shapes);
+
+  // Gets the shapes the device to host transfer associated with 'key'.
+  Status GetDeviceToHostShapes(const string& key,
+                               std::vector<TensorShape>* shapes) const;
+
+  // Sets the shapes and types for the host to device transfer associated with
+  // 'key'.
+  Status SetHostToDeviceMetadata(const string& key,
+                                 gtl::ArraySlice<DataType> types,
+                                 gtl::ArraySlice<TensorShape> shapes);
+
+  // In order to avoid deadlocks from dependencies in host computations, it can
+  // be necessary to enforce a partial order on the execution of HostCompute
+  // Ops. In particular it may be necessary to constrain the SendToHost for one
+  // HostCompute to run before blocking on the RecvAtHost for another
+  // HostCompute. The compiler maintains a mapping from 'host_compute_name' to
+  // handle, where the handle is an 'output' of the HostCompute Op corresponding
+  // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced
+  // later can add the handle as an 'input' to enforce the constraints.
+  // 'host_compute_name' can be any string the client wishes to use to identify
+  // a given HostCompute Op as long as the names are unique within the
+  // compilation.
+  Status GetHostComputeControlDependency(const string& host_compute_name,
+                                         xla::ComputationDataHandle* handle);
+  Status SetHostComputeControlDependency(
+      const string& host_compute_name,
+      const xla::ComputationDataHandle& handle);
+
   const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
   FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; }
@@ -359,6 +405,12 @@ class XlaCompiler {
 
   std::unordered_map<string, xla::ChannelHandle> channels_;
 
+  std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_sends_;
+  std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_recvs_;
+
+  std::unordered_map<string, xla::ComputationDataHandle>
+      host_compute_control_output_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
 };
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index a18eeacd41808884fac9ec5d617cb0d274ea27d8..096dc7160bfc0a3a751f33e7d646471ebea56070 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -257,10 +258,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
                             std::move(graph), args, &result);
   EXPECT_FALSE(status.ok());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("depends on a parameter"))
+      str_util::StrContains(status.error_message(), "depends on a parameter"))
       << status.error_message();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("[[Node: C = Reshape"))
+      str_util::StrContains(status.error_message(), "[[Node: C = Reshape"))
       << status.error_message();
 }
 
@@ -597,7 +598,8 @@ TEST_F(XlaCompilerTest, UndefinedFunctionFails) {
       compiler.CompileFunction(XlaCompiler::CompileOptions(), name_attr,
                                /*args=*/{}, &result);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("is not defined."))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "is not defined."))
       << status.error_message();
 }
 
@@ -676,11 +678,12 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) {
 
   ASSERT_FALSE(status.ok());
   // Flib lookup failure.
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("is not defined."))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "is not defined."))
       << status.error_message();
   // Local flib lookup failure.
-  EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Attr T is not found"))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "Attr T is not found"))
       << status.error_message();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
index 8286480e0ea07429adbe31ec4f16d043e321df0a..ead229aaccc292d4944db0c1eaf98c82583533cd 100644
--- a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 
@@ -30,6 +31,12 @@ bool CpuOpFilter(KernelDef* kdef) {
         DT_FLOAT);
     return true;
   }
+  if (kdef->op() == "Const") {
+    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+  }
+  if (kdef->op() == "Assert") {
+    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+  }
   return true;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index 8ca757e72355d890c13b8b448d35c327d3986696..62168b648331844bfe2db1a4d5dcad895c8726f3 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 
@@ -25,6 +26,12 @@ bool GpuOpFilter(KernelDef* kdef) {
       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
     return false;
   }
+  if (kdef->op() == "Const") {
+    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+  }
+  if (kdef->op() == "Assert") {
+    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+  }
   return true;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index f048662953e20b2a612271e2daeef6e370c4822a..62a5114837e07f35134ad99e28880d6a9233a213 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
@@ -121,6 +122,9 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_HALF:
+      return b->ConstantR0<Eigen::half>(
+          static_cast<Eigen::half>(Eigen::NumTraits<Eigen::half>::epsilon()));
     case DT_BFLOAT16:
       return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
@@ -273,4 +277,20 @@ Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth,
   return Status::OK();
 }
 
+DataType XlaHelpers::SumAccumulationType(const DataType& dtype) {
+  if (dtype == DT_BFLOAT16) {
+    return DT_FLOAT;
+  }
+  return dtype;
+}
+
+xla::ComputationDataHandle XlaHelpers::ConvertElementType(
+    xla::ComputationBuilder* const builder,
+    const xla::ComputationDataHandle& operand,
+    const DataType new_element_type) {
+  xla::PrimitiveType convert_to;
+  TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to));
+  return builder->ConvertElementType(operand, convert_to);
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 2a027db4c839c917f3a7acd27184792d157356bf..68ab93b64a5fa87ad99e0f44d84f6473fc8bbebd 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -107,6 +107,18 @@ class XlaHelpers {
                        const xla::ComputationDataHandle& on_value,
                        const xla::ComputationDataHandle& off_value,
                        xla::ComputationDataHandle* one_hot);
+
+  // Certain DataTypes should use increased precision DataTypes when performing
+  // reductions.  This function remaps a given DataType to a higher precision
+  // DataType if needed.
+  static DataType SumAccumulationType(const DataType& dtype);
+
+  // A helper for creating a ConvertElementType xla op given a DataType rather
+  // than the xla::PrimitiveType.
+  static xla::ComputationDataHandle ConvertElementType(
+      xla::ComputationBuilder* const builder,
+      const xla::ComputationDataHandle& operand,
+      const DataType new_element_type);
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index ff7453194af3a85bded86a5ce298f8779422dccb..e255b01dd7fdcb095c7992d4352d2d9bb7d36ac3 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -51,13 +51,13 @@ constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 8> kCpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
+constexpr std::array<DataType, 9> kCpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BOOL}};
 
-constexpr std::array<DataType, 8> kGpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 10> kGpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
+     DT_COMPLEX64, DT_BOOL, DT_BFLOAT16}};
 
 // Class that manages registrations of operators and devices for the XLA JIT.
 // Not thread-safe.
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 34e733bc8d80b364cec1783006eba0a5468b55ea..1af9cb6d2ab15a33b56f1df0410f47d7e139a1ba 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -52,6 +52,7 @@ xla_proto_library(
     visibility = ["//visibility:public"],
     deps = [
         ":xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:session_proto",
     ],
 )
@@ -372,7 +373,6 @@ tf_cc_test(
 
 cc_library(
     name = "array2d",
-    srcs = ["array2d.cc"],
     hdrs = ["array2d.h"],
     visibility = ["//visibility:public"],
     deps = [
@@ -443,6 +443,9 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":types",
+    ],
 )
 
 cc_library(
@@ -602,8 +605,8 @@ cc_library(
         ":util",
         ":window_util",
         ":xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
@@ -654,18 +657,6 @@ tf_cc_test(
 
 # -----------------------------------------------------------------------------
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # This is a headers target that extra XLA devices can use to prevent circular dependencies.  Devices that are compiled as separate shared objects can also use it to prevent linking of library code.
 cc_header_only_library(
     name = "xla_headers_lib",
diff --git a/tensorflow/compiler/xla/README.md b/tensorflow/compiler/xla/README.md
index c93c39e180655e7930e943e6aa6514c47f2859d7..39f8caaa961dc7b57d2b45f974fc6ecf89cf6748 100644
--- a/tensorflow/compiler/xla/README.md
+++ b/tensorflow/compiler/xla/README.md
@@ -1 +1,7 @@
-This is the home of XLA.
+<p align="center">
+  <img width="200" src="xlalogo.png"/>
+</p>
+
+XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
+algebra that optimizes TensorFlow computations. See the
+[documentation](https://www.tensorflow.org/performance/xla/) for more details.
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 46ee4e64c9ae7ca111d9d04bedcb74ff02a42386..ea75ad32d5df7bbadd37e89de6144b264ab6d5d1 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -121,10 +122,31 @@ class Array {
     CHECK(idx == num_elements());
   }
 
-  // Creates a 2D array of Eigen::half from the given nested initializer list of
-  // float values.
+  // Creates a 1D array of a floating-point type (half, bfloat16, float,
+  // or double) from an initializer list of float values.
   template <typename T2, typename = typename std::enable_if<
-                             std::is_same<T, Eigen::half>::value &&
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
+                             std::is_same<T2, float>::value>::type>
+  Array(std::initializer_list<T2> values)
+      : Array(ToInt64Vector({values.size()})) {
+    int64 idx = 0;
+    for (const auto& it1 : values) {
+      values_[idx] = static_cast<T>(it1);
+      ++idx;
+    }
+    CHECK(idx == num_elements());
+  }
+
+  // Creates a 2D array of a floating-point type (half, bfloat16, float,
+  // or double) from an initializer list of float values.
+  template <typename T2, typename = typename std::enable_if<
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
                              std::is_same<T2, float>::value>::type>
   Array(std::initializer_list<std::initializer_list<T2>> values)
       : Array(ToInt64Vector({values.size(), values.begin()->size()})) {
@@ -155,10 +177,13 @@ class Array {
     CHECK(idx == num_elements());
   }
 
-  // Creates a 3D array of Eigen::half from the given nested initializer list of
-  // float values.
+  // Creates a 3D array of a floating-point type (half, bfloat16, float,
+  // or double) from an initializer list of float values.
   template <typename T2, typename = typename std::enable_if<
-                             std::is_same<T, Eigen::half>::value &&
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
                              std::is_same<T2, float>::value>::type>
   Array(std::initializer_list<std::initializer_list<std::initializer_list<T2>>>
             values)
@@ -196,10 +221,13 @@ class Array {
     CHECK(idx == num_elements());
   }
 
-  // Creates a 4D array of Eigen::half from the given nested initializer list of
-  // float values.
+  // Creates a 4D array of a floating-point type (half, bfloat16, float,
+  // or double) from an initializer list of float values.
   template <typename T2, typename = typename std::enable_if<
-                             std::is_same<T, Eigen::half>::value &&
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
                              std::is_same<T2, float>::value>::type>
   Array(std::initializer_list<
         std::initializer_list<std::initializer_list<std::initializer_list<T2>>>>
diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h
index 41f563486d21e42e88dcf6c751ce4a64da5e3213..a17e81f44832f272fd93dce9f854042b4a84fde4 100644
--- a/tensorflow/compiler/xla/array2d.h
+++ b/tensorflow/compiler/xla/array2d.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -52,10 +53,13 @@ class Array2D : public Array<T> {
   Array2D(std::initializer_list<std::initializer_list<T>> values)
       : Array<T>(values) {}
 
-  // Creates an array of Eigen::half from the given nested initializer list of
-  // float values.
+  // Creates an array of a floating-point type (half, bfloat16, float,
+  // or double) from the given nested initializer list of float values.
   template <typename T2, typename = typename std::enable_if<
-                             std::is_same<T, Eigen::half>::value &&
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
                              std::is_same<T2, float>::value>::type>
   Array2D(std::initializer_list<std::initializer_list<T2>> values)
       : Array<T>(values) {}
@@ -94,9 +98,23 @@ class Array2D : public Array<T> {
 
 // Returns a linspace-populated Array2D in the range [from, to] (inclusive)
 // with dimensions n1 x n2.
-std::unique_ptr<Array2D<float>> MakeLinspaceArray2D(float from, float to,
-                                                    int64 n1, int64 n2);
-
+template <typename NativeT = float>
+std::unique_ptr<Array2D<NativeT>> MakeLinspaceArray2D(double from, double to,
+                                                      int64 n1, int64 n2) {
+  auto array = MakeUnique<Array2D<NativeT>>(n1, n2);
+  int64 count = n1 * n2;
+  NativeT step =
+      static_cast<NativeT>((count > 1) ? (to - from) / (count - 1) : 0);
+  auto set = [&array, n1, n2](int64 index, NativeT value) {
+    (*array)(index / n2, index % n2) = value;
+  };
+  for (int64 i = 0; i < count - 1; ++i) {
+    set(i, (static_cast<NativeT>(from) +
+            static_cast<NativeT>(i) * static_cast<NativeT>(step)));
+  }
+  set(count - 1, static_cast<NativeT>(to));
+  return array;
+}
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_ARRAY2D_H_
diff --git a/tensorflow/compiler/xla/array3d.h b/tensorflow/compiler/xla/array3d.h
index e5eb235d45d160d486d1499db665ed14a8509043..0e9a0722ae43e1dc6ecddde9cbc3daf1db058840 100644
--- a/tensorflow/compiler/xla/array3d.h
+++ b/tensorflow/compiler/xla/array3d.h
@@ -57,10 +57,13 @@ class Array3D : public Array<T> {
               values)
       : Array<T>(values) {}
 
-  // Creates an array of Eigen::half from the given nested initializer list of
-  // float values.
+  // Creates an array of a floating-point type (half, bfloat16, float,
+  // or double) from the given nested initializer list of float values.
   template <typename T2, typename = typename std::enable_if<
-                             std::is_same<T, Eigen::half>::value &&
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
                              std::is_same<T2, float>::value>::type>
   Array3D(
       std::initializer_list<std::initializer_list<std::initializer_list<T2>>>
diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h
index cff70e54bad0116bdd08674b626b3bf99dc89e1f..a75fffc605aa0df3e1e2eeb6d3129718cbbba0e4 100644
--- a/tensorflow/compiler/xla/array4d.h
+++ b/tensorflow/compiler/xla/array4d.h
@@ -82,10 +82,13 @@ class Array4D : public Array<T> {
               values)
       : Array<T>(values) {}
 
-  // Creates an array of Eigen::half from the given nested initializer list of
-  // float values.
+  // Creates an array of a floating-point type (half, bfloat16, float,
+  // or double) from the given nested initializer list of float values.
   template <typename T2, typename = typename std::enable_if<
-                             std::is_same<T, Eigen::half>::value &&
+                             (std::is_same<T, Eigen::half>::value ||
+                              std::is_same<T, bfloat16>::value ||
+                              std::is_same<T, float>::value ||
+                              std::is_same<T, double>::value) &&
                              std::is_same<T2, float>::value>::type>
   Array4D(std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<T2>>>>
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 02356699a25e47be50eb15872df4c9c302fc289b..286d06d12ffca7410067f2d33398497576986807 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -74,6 +74,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
@@ -129,6 +130,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -213,17 +215,3 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index d15ccb0c28522c647617153aaa8e738d029dfaba..328e1b8fa84e7baaca41c6c9a65e9a1598ac32ae 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -177,6 +177,50 @@ StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
   return Transfer(*data, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const ExecutionOptions* execution_options,
+    ExecutionProfile* execution_profile) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<GlobalData> data,
+      Execute(computation, arguments, execution_options, execution_profile));
+
+  const Shape* shape_with_output_layout = nullptr;
+  if (execution_options && execution_options->has_shape_with_output_layout()) {
+    shape_with_output_layout = &execution_options->shape_with_output_layout();
+  }
+  return Transfer(*data, shape_with_output_layout);
+}
+
+StatusOr<std::unique_ptr<Literal>> Client::ComputeConstant(
+    const XlaComputation& computation, const Layout* output_layout) const {
+  ComputeConstantGraphRequest request;
+  *request.mutable_computation() = computation.proto();
+  if (output_layout != nullptr) {
+    *request.mutable_output_layout() = *output_layout;
+  }
+
+  ComputeConstantResponse response;
+
+  VLOG(2) << "making compute-constant-graph request";
+  Status s = stub_->ComputeConstantGraph(&request, &response);
+  VLOG(2) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  VLOG(3) << "ComputeConstant: {" << response.DebugString() << "}";
+
+  if (!response.has_literal()) {
+    return InternalError(
+        "no computed literal in the provided response in ComputeConstantGraph "
+        "request");
+  }
+  return Literal::CreateFromProto(response.literal());
+}
+
 StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   LoadComputationSnapshotRequest request;
   *request.mutable_module() = module;
@@ -191,6 +235,11 @@ StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   return Computation(stub_, response.computation());
 }
 
+StatusOr<XlaComputation> Client::LoadSnapshot(const HloSnapshot& module) {
+  TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module());
+  return XlaComputation(module.hlo().hlo_module());
+}
+
 StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -231,6 +280,46 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
   return MakeUnique<GlobalData>(stub_, response.output());
 }
 
+StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const ExecutionOptions* execution_options,
+    ExecutionProfile* execution_profile) {
+  ExecuteGraphRequest request;
+  *request.mutable_computation() = computation.proto();
+
+  if (execution_options == nullptr) {
+    *request.mutable_execution_options() = CreateDefaultExecutionOptions();
+  } else {
+    *request.mutable_execution_options() = *execution_options;
+  }
+  for (GlobalData* argument : arguments) {
+    CHECK(argument != nullptr) << "Argument pointers must not be null.";
+    *request.add_arguments() = argument->handle();
+  }
+
+  ExecuteResponse response;
+  VLOG(1) << "making execute request: " << request.ShortDebugString();
+  Status s = stub_->ExecuteGraph(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (execution_profile != nullptr) {
+    *execution_profile = response.profile();
+    if (VLOG_IS_ON(1)) {
+      TF_ASSIGN_OR_RETURN(
+          auto execution_stats,
+          ExecutionStatsAsString(computation, response.profile()));
+      VLOG(1) << execution_stats;
+    }
+  }
+
+  return MakeUnique<GlobalData>(stub_, response.output());
+}
+
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
     tensorflow::gtl::ArraySlice<ComputationInstance> computations) {
   ExecuteParallelRequest request;
@@ -266,6 +355,42 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   return std::move(outputs);
 }
 
+StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
+    tensorflow::gtl::ArraySlice<XlaComputationInstance> computations) {
+  ExecuteGraphParallelRequest request;
+
+  for (const XlaComputationInstance& computation : computations) {
+    ExecuteGraphRequest single_request;
+    *single_request.mutable_computation() = computation.computation.proto();
+    for (GlobalData* argument : computation.arguments) {
+      *single_request.add_arguments() = argument->handle();
+    }
+    *single_request.mutable_execution_options() = computation.execution_options;
+    *request.add_requests() = single_request;
+  }
+
+  ExecuteParallelResponse response;
+  VLOG(1) << "making execute-graph-parallel request: "
+          << request.ShortDebugString();
+  tensorflow::Status s = stub_->ExecuteGraphParallel(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::unique_ptr<GlobalData>> outputs;
+  for (size_t i = 0; i < computations.size(); ++i) {
+    outputs.push_back(
+        MakeUnique<GlobalData>(stub_, response.responses(i).output()));
+    if (computations[i].execution_profile != nullptr) {
+      *computations[i].execution_profile = response.responses(i).profile();
+    }
+  }
+
+  return std::move(outputs);
+}
+
 StatusOr<std::vector<DeviceHandle>> Client::GetDeviceHandles(
     int64 device_count) {
   if (device_count < 1) {
@@ -342,6 +467,27 @@ StatusOr<ComputationStats> Client::GetComputationStats(
   return response.stats();
 }
 
+StatusOr<ComputationStats> Client::GetComputationStats(
+    const XlaComputation& computation,
+    const DebugOptions& debug_options) const {
+  ComputationGraphStatsRequest request;
+
+  // TODO(b/74197823): Find a way to avoid the copy of the hlo proto.
+  *request.mutable_computation() = computation.proto();
+  *request.mutable_debug_options() = debug_options;
+  ComputationStatsResponse response;
+
+  VLOG(1) << "making computation graph stats request";
+  Status s = stub_->GetComputationGraphStats(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+  CHECK(response.has_stats());
+  return response.stats();
+}
+
 StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
     const Computation& computation) {
   GetComputationShapeRequest request;
@@ -359,6 +505,12 @@ StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
   return WrapUnique(response.release_program_shape());
 }
 
+StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
+    const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(const auto& result, computation.GetProgramShape());
+  return MakeUnique<ProgramShape>(result);
+}
+
 StatusOr<Shape> Client::GetShape(const GlobalData& data) {
   GetShapeRequest request;
   *request.mutable_data() = data.handle();
@@ -397,6 +549,28 @@ StatusOr<string> Client::ExecutionStatsAsString(
   return string("[Execution Statistics] not available.");
 }
 
+StatusOr<string> Client::ExecutionStatsAsString(
+    const XlaComputation& computation, const ExecutionProfile& profile) {
+  TF_ASSIGN_OR_RETURN(
+      auto computation_stats,
+      GetComputationStats(computation,
+                          legacy_flags::GetDebugOptionsFromFlags()));
+  int64 total_flops =
+      computation_stats.flop_count() + computation_stats.transcendental_count();
+  if (profile.compute_time_ns() > 0) {
+    int64 nanoseconds = profile.compute_time_ns();
+    int64 cycle_count = profile.compute_cycle_count();
+    double gflops = total_flops / nanoseconds;
+    return tensorflow::strings::StrCat(
+        "[Execution Statistics] flop count: ", computation_stats.flop_count(),
+        ", transcendental count: ", computation_stats.transcendental_count(),
+        ", compute execution time: ", nanoseconds, " nsec",
+        ", compute cycles: ", cycle_count, ", performance: ", gflops,
+        "gflop/s");
+  }
+  return string("[Execution Statistics] not available.");
+}
+
 StatusOr<ChannelHandle> Client::CreateChannelHandle() {
   CreateChannelHandleRequest request;
   CreateChannelHandleResponse response;
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index c28380b689c7a0e16bf0bcbf15003f4aa15e42a7..a63ff4c56d1dd78c7abfa2bf163b5fbd54d82b2b 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service_interface.h"
@@ -57,6 +58,21 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Executes the computation with the given arguments and returns the global
+  // data that was produced from the execution.
+  // * If execution_options is not nullptr, these options are passed to the
+  //   service to affect how it compiles our computation.  (The pointer does not
+  //   need to live beyond this call.)
+  // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
+  //   will be filled with profile data from the execution.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<GlobalData>> Execute(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const ExecutionOptions* execution_options = nullptr,
+      ExecutionProfile* execution_profile = nullptr);
+
   // A struct to represent a computation instance to be executed.
   // * If execution_options.device_handles is not empty, the computation is
   //   executed on the devices associated with the handles by partitioning the
@@ -83,6 +99,36 @@ class Client {
   StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
       tensorflow::gtl::ArraySlice<ComputationInstance> computations);
 
+  // A struct to represent a computation instance to be executed.
+  // * If execution_options.device_handles is not empty, the computation is
+  //   executed on the devices associated with the handles by partitioning the
+  //   computation based on the attached sharding attributes. Otherwise, a
+  //   device is chosen by the service.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct XlaComputationInstance {
+    const XlaComputation& computation;
+    std::vector<GlobalData*> arguments;
+    ExecutionOptions execution_options;
+    ExecutionProfile* execution_profile;
+
+    XlaComputationInstance(const XlaComputation& computation,
+                           std::vector<GlobalData*> arguments,
+                           ExecutionOptions execution_options,
+                           ExecutionProfile* execution_profile)
+        : computation(computation),
+          arguments(std::move(arguments)),
+          execution_options(execution_options),
+          execution_profile(execution_profile) {}
+  };
+
+  // Executes a list XlaComputationInstances and returns global data produced
+  // from each computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
+      tensorflow::gtl::ArraySlice<XlaComputationInstance> computations);
+
   // Requests device_count device handles available on the target. The returned
   // device handles are used to specify the devices to execute the computations
   // (see ExecuteParallel) or to transfer data (see TransferToServer or
@@ -137,6 +183,38 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Executes the computation with the given arguments and transfers the result
+  // to the client as a literal. Parameters are defined the same as for
+  // Execute() and Transfer().
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const ExecutionOptions* execution_options = nullptr,
+      ExecutionProfile* execution_profile = nullptr);
+
+  // Computes the value of the given computation using a non-optimized
+  // interpreter on the host.
+  //
+  // The computation must not depend on any parameters, or on stateful operators
+  // such as `RngNormal` or `Infeed`.
+  //
+  // This functionality can be useful when translating a computation into XLA
+  // where something that looked dynamic is required by XLA to be specified as a
+  // constant. E.g. the source computation (outside of XLA) may include a
+  // dynamic computation of the shape of something and ComputeConstant lets you
+  // determine what the value of that computation is in the case where the value
+  // can be determined at compile time.
+  //
+  // If output_layout is non-null, then the output of the computation will be
+  // stored using that layout.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
+      const XlaComputation& computation,
+      const Layout* output_layout = nullptr) const;
+
   // Unregister the memory for the given GlobalData on the device.
   Status Unregister(const GlobalData& data);
 
@@ -148,6 +226,13 @@ class Client {
   StatusOr<ComputationStats> GetComputationStats(
       const Computation& computation, const DebugOptions& debug_options) const;
 
+  // Retrieves the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<ComputationStats> GetComputationStats(
+      const XlaComputation& computation,
+      const DebugOptions& debug_options) const;
+
   // Returns the Shape of the given array specified by 'data'. The shape
   // includes the Layout of the array as it is stored on the service.
   StatusOr<Shape> GetShape(const GlobalData& data);
@@ -157,12 +242,22 @@ class Client {
   StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
       const Computation& computation);
 
+  // As above, but returns the shape of the provided computation (parameter
+  // types/names and return type).
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
+      const XlaComputation& computation);
+
   // Creates a channel handle that can be used to transfer data between
   // two computations via a pair of Send and Recv instructions.
   StatusOr<ChannelHandle> CreateChannelHandle();
 
   StatusOr<Computation> LoadSnapshot(const SessionModule& module);
 
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<XlaComputation> LoadSnapshot(const HloSnapshot& module);
+
   ServiceInterface* stub() { return stub_; }
 
  private:
@@ -170,6 +265,8 @@ class Client {
   // ExecutionProfile returned from an execution of the computation.
   StatusOr<string> ExecutionStatsAsString(const Computation& computation,
                                           const ExecutionProfile& profile);
+  StatusOr<string> ExecutionStatsAsString(const XlaComputation& computation,
+                                          const ExecutionProfile& profile);
 
   ServiceInterface* stub_;  // Stub that this client is connected on.
 
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index b1663bc815719c3da75b37593ac665b1f3493db8..803a9e40094391ba47ed27713f4538caf875c4f6 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -23,22 +23,19 @@ limitations under the License.
 
 namespace xla {
 
-LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
+LocalClientOptions::LocalClientOptions(se::Platform* platform,
                                        int number_of_replicas,
                                        int intra_op_parallelism_threads)
     : platform_(platform),
       number_of_replicas_(number_of_replicas),
       intra_op_parallelism_threads_(intra_op_parallelism_threads) {}
 
-LocalClientOptions& LocalClientOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* LocalClientOptions::platform() const {
-  return platform_;
-}
+se::Platform* LocalClientOptions::platform() const { return platform_; }
 
 LocalClientOptions& LocalClientOptions::set_number_of_replicas(
     int number_of_replicas) {
@@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default;
 ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   return GetOrCreateLocalClient(default_options);
@@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
     const LocalClientOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   int replica_count = options.number_of_replicas();
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default;
 }
 
 /* static */ LocalService* ClientLibrary::GetXlaService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
   auto it = client_library.local_instances_.find(platform->id());
@@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default;
 }
 
 /* static */ StatusOr<CompileOnlyClient*>
-ClientLibrary::GetOrCreateCompileOnlyClient(
-    perftools::gputools::Platform* platform) {
+ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
 
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index a6f30d82e43587135697e76e8bc7d122edc0f602..3ad558fa532931937fab898f7b855f0a3370eaec 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -43,13 +43,13 @@ namespace xla {
 // Options to configure the local client when it is created.
 class LocalClientOptions {
  public:
-  LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
+  LocalClientOptions(se::Platform* platform = nullptr,
                      int number_of_replicas = 1,
                      int intra_op_parallelism_threads = -1);
 
   // Set the platform backing the service, or nullptr for the default platform.
-  LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  LocalClientOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
   // programs.
@@ -61,7 +61,7 @@ class LocalClientOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
   int number_of_replicas_;
   int intra_op_parallelism_threads_;
 };
@@ -74,7 +74,7 @@ class ClientLibrary {
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
       const LocalClientOptions& options);
 
@@ -84,14 +84,14 @@ class ClientLibrary {
 
   // Returns the service from the service thread. Only used in unit tests to
   // access user computations from client.
-  static LocalService* GetXlaService(perftools::gputools::Platform* platform);
+  static LocalService* GetXlaService(se::Platform* platform);
 
   // Singleton constructor-or-accessor for compile-only clients. Arguments:
   //
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
   static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
 
   // Clears the local instance and compile only instance caches. The client
   // pointers returned by the previous GetOrCreateLocalClient() or
@@ -120,12 +120,10 @@ class ClientLibrary {
   };
 
   tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
-  std::unordered_map<perftools::gputools::Platform::Id,
-                     std::unique_ptr<LocalInstance>>
+  std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
       local_instances_ GUARDED_BY(service_mutex_);
 
-  std::unordered_map<perftools::gputools::Platform::Id,
-                     std::unique_ptr<CompileOnlyInstance>>
+  std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
       compile_only_instances_ GUARDED_BY(service_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
index c7e2c4367b89ca2112022fa40449ae3ebe28463e..96e38bca01087991943aff40ed1cb3e21f9e6cba 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.cc
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -39,16 +39,33 @@ CompileOnlyClient::CompileAheadOfTime(
   return compiler_service_->CompileAheadOfTime(service_instances, options);
 }
 
-int64 CompileOnlyClient::PointerSizeForTriple(
-    tensorflow::StringPiece target_triple) {
-  llvm::Triple triple(llvm::Triple::normalize(
-      llvm::StringRef(target_triple.data(), target_triple.size())));
-  if (triple.isArch64Bit()) {
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<CompileOnlyService::AotXlaComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AotXlaComputationInstance& instance : computations) {
+    service_instances.emplace_back();
+    CompileOnlyService::AotXlaComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->proto();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return compiler_service_->CompileAheadOfTime(service_instances, options);
+}
+
+int64 CompileOnlyClient::PointerSizeForTriple(tensorflow::StringPiece triple) {
+  llvm::Triple llvm_triple(
+      llvm::Triple::normalize(llvm::StringRef(triple.data(), triple.size())));
+  if (llvm_triple.isArch64Bit()) {
     return 8;
-  } else if (triple.isArch32Bit()) {
+  } else if (llvm_triple.isArch32Bit()) {
     return 4;
   } else {
-    CHECK(triple.isArch16Bit());
+    CHECK(llvm_triple.isArch16Bit());
     return 2;
   }
 }
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index 5900048711384e0240a3cd502260eb388eb40f51..c8725b8517484acdaf093bc3b34adb00f69155b1 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,6 +55,27 @@ class CompileOnlyClient : public Client {
       const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
       const AotCompilationOptions& options);
 
+  // A description of an xla computation to compile using CompileAheadOfTime.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct AotXlaComputationInstance {
+    const XlaComputation* computation;
+    // Inform the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of xla computations for ahead-of-time execution.  This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+      const AotCompilationOptions& options);
+
   // Returns the size of a pointer in bytes for a given triple.
   static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 2a6e02649d15bc9fd47a893c41f9c8a62ac076c6..83c7cb174402133706fbde6a734a29afd8edfe80 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -253,26 +253,6 @@ StatusOr<ProgramShape> ComputationBuilder::GetProgramShape() {
   return std::move(*response.mutable_program_shape());
 }
 
-ComputationDataHandle ComputationBuilder::CheckShape(
-    const ComputationDataHandle& operand, const Shape& expected_shape) {
-  std::unique_ptr<Shape> actual_shape = GetShape(operand).ConsumeValueOrDie();
-  CHECK(ShapeUtil::Equal(expected_shape, *actual_shape))
-      << "want " << ShapeUtil::HumanString(expected_shape) << " got "
-      << ShapeUtil::HumanString(*actual_shape);
-  return operand;
-}
-
-void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs,
-                                        const ComputationDataHandle& rhs) {
-  std::unique_ptr<Shape> lhs_shape = GetShape(lhs).ConsumeValueOrDie();
-  std::unique_ptr<Shape> rhs_shape = GetShape(rhs).ConsumeValueOrDie();
-  VLOG(2) << "checking " << ShapeUtil::HumanString(*lhs_shape) << " equals "
-          << ShapeUtil::HumanString(*rhs_shape);
-  CHECK(ShapeUtil::Equal(*lhs_shape, *rhs_shape))
-      << "lhs " << ShapeUtil::HumanString(*lhs_shape) << " rhs "
-      << ShapeUtil::HumanString(*rhs_shape);
-}
-
 ComputationDataHandle ComputationBuilder::Slice(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
@@ -408,7 +388,7 @@ ComputationDataHandle ComputationBuilder::Reshape(
 
 ComputationDataHandle ComputationBuilder::Collapse(
     const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dims_to_collapse) {
+    tensorflow::gtl::ArraySlice<int64> dimensions) {
   if (!first_error_.ok()) {
     return ComputationDataHandle();
   }
@@ -416,8 +396,8 @@ ComputationDataHandle ComputationBuilder::Collapse(
   // Don't support out-of-order collapse here.
   // Checks that the collapsed dimensions are in order and consecutive.
   for (tensorflow::gtl::ArraySlice<int64>::size_type i = 1;
-       i < dims_to_collapse.size(); ++i) {
-    if (dims_to_collapse[i] - 1 != dims_to_collapse[i - 1]) {
+       i < dimensions.size(); ++i) {
+    if (dimensions[i] - 1 != dimensions[i - 1]) {
       NoteError(InvalidArgument(
           "Collapsed dimensions are not in order and consecutive."));
       return ComputationDataHandle();
@@ -434,9 +414,9 @@ ComputationDataHandle ComputationBuilder::Collapse(
 
   VLOG(3) << "original shape: " << ShapeUtil::HumanString(*original_shape);
   VLOG(3) << "dims to collapse: "
-          << tensorflow::str_util::Join(dims_to_collapse, ",");
+          << tensorflow::str_util::Join(dimensions, ",");
 
-  if (dims_to_collapse.size() <= 1) {
+  if (dimensions.size() <= 1) {
     // Not collapsing anything, trivially we can return the operand versus
     // enqueueing a trivial reshape.
     return operand;
@@ -444,7 +424,7 @@ ComputationDataHandle ComputationBuilder::Collapse(
 
   std::vector<int64> new_sizes;
   for (int i = 0; i < ShapeUtil::Rank(*original_shape); ++i) {
-    if (i <= dims_to_collapse.front() || i > dims_to_collapse.back()) {
+    if (i <= dimensions.front() || i > dimensions.back()) {
       new_sizes.push_back(original_shape->dimensions(i));
     } else {
       new_sizes.back() *= original_shape->dimensions(i);
@@ -753,13 +733,13 @@ ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape,
 }
 
 void ComputationBuilder::Outfeed(const ComputationDataHandle& operand,
-                                 const Shape& shape,
+                                 const Shape& shape_with_layout,
                                  const string& outfeed_config) {
   OpRequest op_request;
   OutfeedRequest* request = op_request.mutable_outfeed_request();
   request->set_outfeed_config(outfeed_config);
   *request->mutable_operand() = operand;
-  *request->mutable_shape() = shape;
+  *request->mutable_shape() = shape_with_layout;
   RunOpAndNoteError(&op_request);
 }
 
@@ -868,6 +848,14 @@ ComputationDataHandle ComputationBuilder::Or(
   return BinaryOp(BINOP_OR, lhs, rhs, broadcast_dimensions);
 }
 
+// TODO(b/65209188): Create a dedicated lowering for Xor
+ComputationDataHandle ComputationBuilder::Xor(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return Or(And(Not(lhs), rhs, broadcast_dimensions),
+            And(lhs, Not(rhs), broadcast_dimensions));
+}
+
 ComputationDataHandle ComputationBuilder::Not(
     const ComputationDataHandle& operand) {
   return UnaryOp(UNOP_NOT, operand);
@@ -1058,6 +1046,11 @@ ComputationDataHandle ComputationBuilder::Neg(
   return UnaryOp(UNOP_NEGATE, operand);
 }
 
+ComputationDataHandle ComputationBuilder::Clz(
+    const ComputationDataHandle& operand) {
+  return UnaryOp(UNOP_CLZ, operand);
+}
+
 ComputationDataHandle ComputationBuilder::Clamp(
     const ComputationDataHandle& min, const ComputationDataHandle& operand,
     const ComputationDataHandle& max) {
@@ -1382,15 +1375,16 @@ ComputationDataHandle ComputationBuilder::BatchNormInference(
 
 ComputationDataHandle ComputationBuilder::BatchNormGrad(
     const ComputationDataHandle& operand, const ComputationDataHandle& scale,
-    const ComputationDataHandle& mean, const ComputationDataHandle& var,
+    const ComputationDataHandle& batch_mean,
+    const ComputationDataHandle& batch_var,
     const ComputationDataHandle& grad_output, float epsilon,
     int64 feature_index) {
   OpRequest op_request;
   BatchNormGradRequest* request = op_request.mutable_batch_norm_grad_request();
   *request->mutable_operand() = operand;
   *request->mutable_scale() = scale;
-  *request->mutable_mean() = mean;
-  *request->mutable_variance() = var;
+  *request->mutable_mean() = batch_mean;
+  *request->mutable_variance() = batch_var;
   *request->mutable_grad_output() = grad_output;
   request->set_epsilon(epsilon);
   request->set_feature_index(feature_index);
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index e3facb3f258bb0bdf4b2dd3648e55421dfd56e79..9431c2c459a564e3cf509d9dae16e71fc27ee2c0 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -104,15 +104,6 @@ class ComputationBuilder {
   // Retrieves the (inferred) result for the current computation's shape.
   StatusOr<ProgramShape> GetProgramShape();
 
-  // Checks that the operand has the given expected shape. Returns the operand
-  // if yes, fails with a CHECK error if no.
-  ComputationDataHandle CheckShape(const ComputationDataHandle& operand,
-                                   const Shape& expected_shape);
-
-  // Checks that the lhs and rhs results have the same shape.
-  void CheckSameShape(const ComputationDataHandle& lhs,
-                      const ComputationDataHandle& rhs);
-
   // Enqueues a constant with the value of the given literal onto the
   // computation.
   ComputationDataHandle ConstantLiteral(const Literal& literal);
@@ -198,9 +189,8 @@ class ComputationBuilder {
                                 tensorflow::gtl::ArraySlice<int64> new_sizes);
 
   // Enqueues an operation onto the computation that collapses the operand, from
-  // minor to major order, then reshapes it into the shape with the given
-  // dimension sizes, also from major to minor. Conceptually, this is a limited
-  // form of "shape casting".
+  // first to last dimension (C order), then reshapes it to the given dimension
+  // sizes. Conceptually, this is a limited form of "shape casting".
   ComputationDataHandle Reshape(const ComputationDataHandle& operand,
                                 tensorflow::gtl::ArraySlice<int64> new_sizes);
 
@@ -513,6 +503,10 @@ class ComputationBuilder {
       const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
 
+  ComputationDataHandle Xor(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
   ComputationDataHandle Not(const ComputationDataHandle& operand);
 
   ComputationDataHandle ShiftLeft(
@@ -663,6 +657,9 @@ class ComputationBuilder {
   // Enqueues a negate instruction onto the computation.
   ComputationDataHandle Neg(const ComputationDataHandle& operand);
 
+  // Enqueues a count-leading-zeros instruction onto the computation.
+  ComputationDataHandle Clz(const ComputationDataHandle& operand);
+
   // Enqueues a transpose instruction onto the computation.
   ComputationDataHandle Transpose(
       const ComputationDataHandle& operand,
@@ -873,7 +870,7 @@ class ComputationBuilder {
                   Window* window);
 
   // Internal helper method that does the building for an arbitrary unary op.
-  ComputationDataHandle UnaryOp(UnaryOperation binop,
+  ComputationDataHandle UnaryOp(UnaryOperation unop,
                                 const ComputationDataHandle& operand);
 
   // Internal helper method that does the building for an arbitrary binary op.
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 804e34f5e75ce2d153ac7627b94a543fda88e810..6e3c5cb484b8f1ef053fa287a4d462aeb886e530 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -76,4 +76,35 @@ ExecutableBuildOptions::generate_hlo_graph() const {
   return generate_hlo_graph_;
 }
 
+ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to(
+    tensorflow::StringPiece dirpath) {
+  dump_optimized_hlo_proto_to_ = dirpath.ToString();
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::dump_optimized_hlo_proto_to() const {
+  return dump_optimized_hlo_proto_to_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to(
+    tensorflow::StringPiece dirpath) {
+  dump_per_pass_hlo_proto_to_ = dirpath.ToString();
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const {
+  return dump_per_pass_hlo_proto_to_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) {
+  hlo_profile_ = enabled;
+  return *this;
+}
+
+tensorflow::gtl::optional<bool> ExecutableBuildOptions::hlo_profile() const {
+  return hlo_profile_;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 3a52dbac9adb155ad9a7d91a8102707f70fe2fbf..11f10983606fe02b1edb11a260edde8e5f9a726f 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
@@ -57,15 +58,36 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_generate_hlo_graph(string regex);
   const tensorflow::gtl::optional<string>& generate_hlo_graph() const;
 
+  // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO
+  // protobuf to (as in DebugOptions).
+  ExecutableBuildOptions& set_dump_optimized_hlo_proto_to(
+      tensorflow::StringPiece dirpath);
+  const tensorflow::gtl::optional<string>& dump_optimized_hlo_proto_to() const;
+
+  // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs
+  // to (as in DebugOptions).
+  ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to(
+      tensorflow::StringPiece dirpath);
+  const tensorflow::gtl::optional<string>& dump_per_pass_hlo_proto_to() const;
+
+  // If true, specifies that we should record an HLO profile during execution
+  // and log it after execution (as in DebugOptions). If nullopt the default is
+  // used.
+  ExecutableBuildOptions& set_hlo_profile(bool enabled);
+  tensorflow::gtl::optional<bool> hlo_profile() const;
+
   // Returns a string representation of the build options, suitable for
   // debugging.
   string ToString() const;
 
  private:
+  tensorflow::gtl::optional<bool> hlo_profile_;
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
   tensorflow::gtl::optional<string> generate_hlo_graph_;
+  tensorflow::gtl::optional<string> dump_optimized_hlo_proto_to_;
+  tensorflow::gtl::optional<string> dump_per_pass_hlo_proto_to_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
 };
 
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index fca2bf2688cd21b44f099da3bae3b890cbb069ab..59c4a53c05a45490a7c8e732840a4e70767c46c2 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -24,6 +24,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -44,21 +46,8 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 24048a1e5a782661ba577ba50e3b5b2914f17c0a..63df449e0b3bdd642d548319dd7d621ca2f59b1d 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 namespace xla {
 namespace {
+
 using InstructionGenerator =
     ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&,
                               const ComputationDataHandle&);
@@ -47,6 +48,27 @@ Computation CreateScalarComputation(const string& name, PrimitiveType type,
   generator(b.get(), lhs, rhs);
   return b->BuildAndNoteError();
 }
+
+using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&);
+
+XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,
+                                       XlaBuilder* builder,
+                                       XlaOpGenerator generator) {
+  std::unique_ptr<XlaBuilder> b;
+  if (type == PRED) {
+    b = builder->CreateSubBuilder(name);
+  } else {
+    b = builder->CreateSubBuilder(
+        tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
+  }
+
+  const Shape scalar = ShapeUtil::MakeShape(type, {});
+  auto lhs = b->Parameter(0, scalar, "lhs");
+  auto rhs = b->Parameter(1, scalar, "rhs");
+  generator(b.get(), lhs, rhs);
+  return b->BuildAndNoteError();
+}
+
 }  // namespace
 
 Computation CreateScalarAddComputation(PrimitiveType type,
@@ -60,7 +82,7 @@ Computation CreateScalarAddComputation(PrimitiveType type,
 Computation CreateScalarMultiplyComputation(PrimitiveType type,
                                             ComputationBuilder* builder) {
   return CreateScalarComputation(
-      "add", type, builder,
+      "mul", type, builder,
       [](ComputationBuilder* b, const ComputationDataHandle& lhs,
          const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); });
 }
@@ -114,4 +136,75 @@ StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
   return builder->Reduce(predicates, f, logical_or, all_dimensions);
 }
 
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "add", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Add(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "mul", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Mul(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "ge", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Ge(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "max", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Max(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "min", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Min(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "and", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->And(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "or", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Or(lhs, rhs);
+      });
+}
+
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder) {
+  auto f = builder->ConstantR0<bool>(false);
+  XlaComputation logical_or = CreateScalarOrComputation(builder);
+  TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
+                      builder->GetShape(predicates));
+  std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
+  std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
+  return builder->Reduce(predicates, f, logical_or, all_dimensions);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index ae89784bc227d837cf15f0a89687dd00dccc2745..f4d3fc801590fedbb84ed3d6283e62f47c56d5c7 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -56,6 +58,48 @@ Computation CreateScalarOrComputation(ComputationBuilder* builder);
 StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
                                     ComputationBuilder* builder);
 
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar add computation and returns it.
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar multiply computation and returns it.
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar ge computation and returns it.
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar max computation and returns it.
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar min computation and returns it.
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical AND computation and returns it.
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical OR computation and returns it.
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Returns whether any predicate in "predicates" is set.
+//
+// Note: if predicates is zero-sized, Any() vacuously returns false.
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index b63a1465ea755b906853860d47768ecbeaa0dcdd..311dc4bdd72cfd7999e83a26e11614d6ca005bce 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -111,4 +111,20 @@ std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
   return fake_arguments;
 }
 
+std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
+    const XlaComputation& computation, Client* client) {
+  CHECK(computation.proto().has_program_shape())
+      << "Computation should have progran shape.";
+  auto program_shape = computation.proto().program_shape();
+
+  // For every (unbound) parameter that the computation wants, we manufacture
+  // some arbitrary data so that we can invoke the computation.
+  std::vector<std::unique_ptr<GlobalData>> fake_arguments;
+  for (const Shape& parameter : program_shape.parameters()) {
+    fake_arguments.push_back(MakeFakeDataOrDie(parameter, client));
+  }
+
+  return fake_arguments;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 7e640d1307edcc3e2c021f4391c456f578a015ee..1dc2622972d5fd3da6991d70b800cc3fd5a638f4 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -38,6 +39,12 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
     const Computation& computation, Client* client);
 
+// Returns vector of GlobalData handles of fake data (created using
+// MakeFakeDataOrDie) that are correctly shaped arguments for the given
+// xla computation.
+std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
+    const XlaComputation& computation, Client* client);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TESTING_H_
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 91396f055fe4a3ecbd436139be9470e2a35e1c63..1c1270590375ab54e5d7b56344db1b2d40e5b89c 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace se = ::perftools::gputools;
-
 using xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
@@ -136,7 +134,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
+StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     ExecutableRunOptions run_options) {
   TF_RETURN_IF_ERROR(
@@ -168,28 +166,23 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   if (executable_->dumping()) {
     return ExecuteAndDump(&service_options, arguments);
   }
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
-      executable_->ExecuteOnStreamWrapper(
-          &service_options, run_options.execution_profile(), arguments));
-
-  return MakeUnique<ScopedShapedBuffer>(std::move(*result),
-                                        run_options.allocator());
+  return executable_->ExecuteOnStreamWrapper(
+      &service_options, run_options.execution_profile(), arguments);
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::ExecuteAndDump(
+StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
     const ServiceExecutableRunOptions* run_options,
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   executable_->session_module()->set_execution_platform(
       backend_->platform()->Name());
   TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module()));
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ScopedShapedBuffer result,
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
-  TF_RETURN_IF_ERROR(RecordResult(result.get(), executable_->session_module()));
+  TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module()));
   TF_RETURN_IF_ERROR(executable_->DumpSessionModule());
-  return ScopedShapedBuffer::MakeScoped(result.get(), run_options->allocator());
+  return std::move(result);
 }
 
 tensorflow::Status LocalExecutable::RecordArguments(
@@ -265,9 +258,27 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
                                         updated_options));
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
-                                   DeviceMemoryAllocator* allocator) {
+StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
+    const XlaComputation& computation,
+    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+    const ExecutableBuildOptions& options) {
+  ExecutableBuildOptions updated_options = options;
+  if (options.device_ordinal() == -1) {
+    updated_options.set_device_ordinal(default_device_ordinal());
+    VLOG(3) << "Set device ordinal to default value of: "
+            << updated_options.device_ordinal();
+  }
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      local_service_->CompileExecutable(
+                          computation, argument_layouts, updated_options));
+  return WrapUnique(new LocalExecutable(std::move(executable),
+                                        local_service_->mutable_backend(),
+                                        updated_options));
+}
+
+StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
+    const Literal& literal, int device_ordinal,
+    DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
@@ -277,7 +288,7 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
   TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-      executor, literal, *scoped_buffer));
+      executor, literal, scoped_buffer));
   return std::move(scoped_buffer);
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index b52a30f5a0b92e0094e6b0de3241c10a5a909cad..f306c520ede0014be52d1b952849c8894b092baf 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -38,7 +38,7 @@ class LocalExecutable {
  public:
   // Run the compiled computation with the given arguments and options and
   // return the result.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> Run(
+  StatusOr<ScopedShapedBuffer> Run(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       ExecutableRunOptions run_options);
 
@@ -69,11 +69,11 @@ class LocalExecutable {
   // of the computation.
   tensorflow::Status ValidateExecutionOptions(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutableRunOptions& options, const Backend& backend);
+      const ExecutableRunOptions& run_options, const Backend& backend);
 
   // Records the computation in a SessionModule proto with the arguments used to
   // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteAndDump(
+  StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
@@ -123,11 +123,20 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
+  // Build and return a LocalExecutable object. The executable is compiled using
+  // the given XlaComputation, argument layouts and options.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<LocalExecutable>> Compile(
+      const XlaComputation& computation,
+      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+      const ExecutableBuildOptions& options);
+
   // Copy the literal data to the device with the given ordinal and return as a
   // ScopedShapedBuffer. If non-null the given memory allocator is used for
   // device memory allocation. If null, the default memory allocator for the
   // device is used.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> LiteralToShapedBuffer(
+  StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const Literal& literal, int device_ordinal,
       DeviceMemoryAllocator* allocator = nullptr);
 
@@ -158,7 +167,7 @@ class LocalClient : public Client {
   StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
 
   // Returns the platform that the underlying service targets.
-  perftools::gputools::Platform* platform() const;
+  se::Platform* platform() const;
 
   // Returns the number of devices on the system of the service platform
   // type. Not all devices may be supported by the service (see
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0d6e207971ec64515ec5e6da292910920edd101a
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -0,0 +1,79 @@
+# Description:
+#   The new XLA client libraries.
+#
+# This is NOT YET ready to use.
+
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = [":friends"])
+
+package_group(
+    name = "friends",
+    includes = [
+        "//tensorflow/compiler/xla:friends",
+    ],
+)
+
+# Filegroup used to collect source files for dependency checking.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "xla_computation",
+    srcs = ["xla_computation.cc"],
+    hdrs = ["xla_computation.h"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+    ],
+)
+
+# TODO(b/74197823): Replace computation_builder with xla_builder.
+cc_library(
+    name = "xla_builder",
+    srcs = ["xla_builder.cc"],
+    hdrs = ["xla_builder.h"],
+    deps = [
+        ":xla_computation",
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/compiler/xla/service:shape_inference",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "xla_builder_test",
+    srcs = ["xla_builder_test.cc"],
+    deps = [
+        ":xla_builder",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1899983e442116d3ebf8a3e79b0515653cd624cb
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -0,0 +1,1967 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+
+#include <functional>
+#include <numeric>
+#include <queue>
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace xla {
+
+using tensorflow::strings::StrCat;
+
+namespace {
+
+int64 GetUniqueId() {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static int64 built_counter = 0;
+  tensorflow::mutex_lock loc(mu);
+  const int64 id = built_counter++;
+  return id;
+}
+
+// Returns true if an instruction with the given opcode can be the root of the
+// computation.
+bool CanBeRoot(HloOpcode opcode) {
+  switch (opcode) {
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kTrace:
+      return false;
+    default:
+      return true;
+  }
+}
+
+StatusOr<std::vector<Shape>> GetOperandShapes(
+    tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  std::vector<Shape> operand_shapes;
+  for (const XlaOp& operand : operands) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
+    operand_shapes.push_back(shape);
+  }
+  return operand_shapes;
+}
+
+}  // namespace
+
+StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op));
+  return instr->shape();
+}
+
+StatusOr<Shape> XlaOp::GetShape() const {
+  if (builder_ == nullptr) {
+    return InvalidArgument(
+        "cannot GetShape for an invalid XlaOp with handle %lld", handle());
+  }
+  return builder_->GetShape(*this);
+}
+
+XlaBuilder::XlaBuilder(const string& computation_name)
+    : name_(computation_name) {}
+
+XlaBuilder::~XlaBuilder() {}
+
+void XlaBuilder::NoteError(const Status& error) {
+  CHECK(!error.ok());
+  if (die_immediately_on_error_) {
+    LOG(FATAL) << "error building computation: " << error;
+  }
+
+  if (first_error_.ok()) {
+    first_error_ = error;
+    first_error_backtrace_.CreateCurrent(/*skip_count=*/1);
+  }
+}
+
+XlaOp XlaBuilder::NoteErrorOrReturn(
+    const std::function<StatusOr<XlaOp>()>& op_creator) {
+  if (!first_error_.ok()) {
+    return {};
+  }
+  auto op = op_creator();
+  if (!op.ok()) {
+    NoteError(op.status());
+    return {};
+  }
+  return op.ConsumeValueOrDie();
+}
+
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  TF_RET_CHECK(root_id != nullptr);
+
+  ProgramShape program_shape;
+
+  // Not all instructions can be roots. Walk backwards from the last added
+  // instruction until a valid root is found.
+  int64 index = instructions_.size() - 1;
+  for (; index >= 0; index--) {
+    TF_ASSIGN_OR_RETURN(HloOpcode opcode,
+                        StringToHloOpcode(instructions_[index].opcode()));
+    if (CanBeRoot(opcode)) {
+      break;
+    }
+  }
+  if (index < 0) {
+    return FailedPrecondition("no root instruction was found");
+  }
+  *root_id = instructions_[index].id();
+  *program_shape.mutable_result() = instructions_[index].shape();
+
+  // Check that the parameter numbers are continuous from 0, and add parameter
+  // shapes and names to the program shape.
+  const int64 param_count = parameter_numbers_.size();
+  for (int64 i = 0; i < param_count; i++) {
+    program_shape.add_parameters();
+    program_shape.add_parameter_names();
+  }
+  for (const HloInstructionProto& instr : instructions_) {
+    // Parameter number uniqueness is guaranteed in XlaBuilder::Parameter(). So
+    // to verify continuity, we just need to verify that every parameter is in
+    // the right range.
+    if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter)) {
+      const int64 index = instr.parameter_number();
+      TF_RET_CHECK(index >= 0 && index < param_count)
+          << "invalid parameter number: " << index;
+      *program_shape.mutable_parameters(index) = instr.shape();
+      *program_shape.mutable_parameter_names(index) = instr.name();
+    }
+  }
+  return program_shape;
+}
+
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
+  int64 root;
+  return GetProgramShape(&root);
+}
+
+void XlaBuilder::IsConstantVisitor(const int64 op_handle,
+                                   std::set<int64>* visited,
+                                   bool* is_constant) const {
+  if (visited->count(op_handle) != 0 || !*is_constant) {
+    return;
+  }
+
+  CHECK(op_handle < instructions_.size() && op_handle >= 0);
+
+  const HloInstructionProto& instr = instructions_[op_handle];
+  const HloOpcode opcode = StringToHloOpcode(instr.opcode()).ValueOrDie();
+  switch (opcode) {
+    default:
+      for (const int64 operand_id : instr.operand_ids()) {
+        IsConstantVisitor(operand_id, visited, is_constant);
+      }
+      // TODO(b/32495713): We aren't checking the called computations.
+      break;
+
+    // Non functional ops.
+    case HloOpcode::kRng:
+    case HloOpcode::kCrossReplicaSum:
+      // TODO(b/33009255): Implmement constant folding for cross replica sum.
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kHostCompute:
+    case HloOpcode::kCall:
+      // TODO(b/32495713): We aren't checking the to_apply computation itself,
+      // so we conservatively say that computations containing the Call op
+      // cannot be constant.  We cannot set is_functional=false in other similar
+      // cases since we're already relying on IsConstant to return true.
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kWhile:
+      // TODO(b/32495713): We aren't checking the condition and body
+      // computations themselves.
+    case HloOpcode::kSend:
+    case HloOpcode::kRecv:
+    case HloOpcode::kParameter:
+      *is_constant = false;
+      break;
+  }
+  if (!*is_constant) {
+    VLOG(1) << "Non-constant: " << instr.name();
+  }
+  visited->insert(op_handle);
+}
+
+XlaComputation XlaBuilder::BuildAndNoteError() {
+  DCHECK(parent_builder_ != nullptr);
+  auto build_status = Build();
+  if (!build_status.ok()) {
+    parent_builder_->NoteError(
+        AddStatus(build_status.status(),
+                  tensorflow::strings::StrCat("error from: ", name_)));
+    return {};
+  }
+  return build_status.ConsumeValueOrDie();
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build() {
+  if (!first_error_.ok()) {
+    string backtrace;
+    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
+    return AppendStatus(first_error_, backtrace);
+  }
+
+  HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
+
+  {
+    int64 root_id;
+    TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(),
+                        GetProgramShape(&root_id));
+    entry.set_root_id(root_id);
+  }
+
+  for (auto& instruction : instructions_) {
+    // Ensures that the instruction names are unique among the whole graph.
+    const string& new_name =
+        StrCat(instruction.name(), ".", entry.id(), ".", instruction.id());
+    instruction.set_name(new_name);
+    entry.add_instructions()->Swap(&instruction);
+  }
+
+  XlaComputation computation(entry.id());
+  HloModuleProto* module = computation.mutable_proto();
+  module->set_name(entry.name());
+  module->set_id(entry.id());
+  module->set_entry_computation_name(entry.name());
+  module->set_entry_computation_id(entry.id());
+  *module->mutable_program_shape() = entry.program_shape();
+  for (auto& e : embedded_) {
+    module->add_computations()->Swap(&e.second);
+  }
+  module->add_computations()->Swap(&entry);
+
+  // Clear data held by this builder.
+  this->instructions_.clear();
+  this->embedded_.clear();
+  this->parameter_numbers_.clear();
+
+  return std::move(computation);
+}
+
+StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
+    const Shape& shape, const XlaOp& operand,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  HloInstructionProto instr;
+  *instr.mutable_shape() = shape;
+  for (int64 dim : broadcast_dimensions) {
+    instr.add_dimensions(dim);
+  }
+  return AddInstruction(std::move(instr), HloOpcode::kBroadcast, {operand});
+}
+
+StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
+                                                 const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+
+  CHECK(ShapeUtil::IsScalar(operand_shape) ||
+        ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape));
+  Shape broadcast_shape =
+      ShapeUtil::ChangeElementType(output_shape, operand_shape.element_type());
+
+  // Do explicit broadcast for scalar.
+  if (ShapeUtil::IsScalar(operand_shape)) {
+    return InDimBroadcast(broadcast_shape, operand, {});
+  }
+
+  // Do explicit broadcast for degenerate broadcast.
+  std::vector<int64> broadcast_dimensions;
+  std::vector<int64> reshaped_dimensions;
+  for (int i = 0; i < ShapeUtil::Rank(operand_shape); i++) {
+    if (operand_shape.dimensions(i) == output_shape.dimensions(i)) {
+      broadcast_dimensions.push_back(i);
+      reshaped_dimensions.push_back(operand_shape.dimensions(i));
+    } else {
+      TF_RET_CHECK(operand_shape.dimensions(i) == 1)
+          << "An explicit broadcast sequence requires the broadcasted "
+             "dimensions to be trivial; operand shape: "
+          << operand_shape << "; output_shape: " << output_shape;
+    }
+  }
+  // Eliminate the size one dimensions.
+  TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand,
+                      Reshape(ShapeUtil::MakeShape(operand_shape.element_type(),
+                                                   reshaped_dimensions),
+                              operand));
+  // Broadcast 'reshape' up to the larger size.
+  return InDimBroadcast(broadcast_shape, reshaped_operand,
+                        broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferUnaryOpShape(unop, operand_shape));
+    return AddInstruction(std::move(instr), unop, {operand});
+  });
+}
+
+XlaOp XlaBuilder::BinaryOp(
+    HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBinaryOpShape(
+                            binop, lhs_shape, rhs_shape, broadcast_dimensions));
+
+    const int64 lhs_rank = ShapeUtil::Rank(lhs_shape);
+    const int64 rhs_rank = ShapeUtil::Rank(rhs_shape);
+
+    XlaOp updated_lhs = lhs;
+    XlaOp updated_rhs = rhs;
+
+    if (!broadcast_dimensions.empty() && lhs_rank != rhs_rank) {
+      const bool should_broadcast_lhs = lhs_rank < rhs_rank;
+      XlaOp from = should_broadcast_lhs ? lhs : rhs;
+      const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
+
+      std::vector<int64> to_size;
+      for (int64 size : instr.shape().dimensions()) {
+        to_size.push_back(size);
+      }
+      for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape);
+           from_dim++) {
+        int64 to_dim = broadcast_dimensions[from_dim];
+        to_size[to_dim] = from_shape.dimensions(from_dim);
+      }
+
+      const Shape& broadcasted_shape =
+          ShapeUtil::MakeShape(from_shape.element_type(), to_size);
+      TF_ASSIGN_OR_RETURN(
+          XlaOp broadcasted_operand,
+          InDimBroadcast(broadcasted_shape, from, broadcast_dimensions));
+
+      updated_lhs = should_broadcast_lhs ? broadcasted_operand : lhs;
+      updated_rhs = !should_broadcast_lhs ? broadcasted_operand : rhs;
+    }
+
+    TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, updated_lhs.GetShape());
+    if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) {
+      TF_ASSIGN_OR_RETURN(updated_lhs,
+                          AddBroadcastSequence(instr.shape(), updated_lhs));
+    }
+    TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, updated_rhs.GetShape());
+    if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) {
+      TF_ASSIGN_OR_RETURN(updated_rhs,
+                          AddBroadcastSequence(instr.shape(), updated_rhs));
+    }
+
+    return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs});
+  });
+}
+
+XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
+                            const XlaOp& ehs) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, ehs.GetShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferTernaryOpShape(
+                            triop, lhs_shape, rhs_shape, ehs_shape));
+    XlaOp updated_lhs = lhs;
+    XlaOp updated_rhs = rhs;
+    XlaOp updated_ehs = ehs;
+    if (!ShapeUtil::IsTuple(instr.shape())) {
+      if (!ShapeUtil::IsTuple(lhs_shape) &&
+          !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) {
+        // lhs is being implicitly broadcasted. Change to explicit.
+        TF_ASSIGN_OR_RETURN(updated_lhs,
+                            AddBroadcastSequence(instr.shape(), lhs));
+      }
+      if (!ShapeUtil::IsTuple(rhs_shape) &&
+          !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) {
+        // rhs is being implicitly broadcasted. Change to explicit.
+        TF_ASSIGN_OR_RETURN(updated_rhs,
+                            AddBroadcastSequence(instr.shape(), rhs));
+      }
+      if (!ShapeUtil::IsTuple(ehs_shape) &&
+          !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) {
+        // ehs is being implicitly broadcasted. Change to explicit.
+        TF_ASSIGN_OR_RETURN(updated_ehs,
+                            AddBroadcastSequence(instr.shape(), ehs));
+      }
+    }
+    return AddInstruction(std::move(instr), triop,
+                          {updated_lhs, updated_rhs, updated_ehs});
+  });
+}
+
+XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ConstantLiteral(const Literal& literal) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = literal.shape();
+    *instr.mutable_literal() = literal.ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kConstant);
+  });
+}
+
+XlaOp XlaBuilder::Call(const XlaComputation& computation,
+                       tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCallShape(operand_shape_ptrs,
+                                       /*to_apply=*/called_program_shape));
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kCall, operands);
+  });
+}
+
+XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
+                            const string& name) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (!parameter_numbers_.insert(parameter_number).second) {
+      return InvalidArgument("parameter %lld already registered",
+                             parameter_number);
+    }
+    instr.set_parameter_number(parameter_number);
+    instr.set_name(name);
+    *instr.mutable_shape() = shape;
+    return AddInstruction(std::move(instr), HloOpcode::kParameter);
+  });
+}
+
+XlaOp XlaBuilder::Broadcast(
+    const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        const Shape& shape,
+        ShapeInference::InferBroadcastShape(operand_shape, broadcast_sizes));
+
+    // The client-level broadcast op just appends dimensions on the left (adds
+    // lowest numbered dimensions). The HLO broadcast instruction is more
+    // flexible and can add new dimensions anywhere. The instruction's
+    // dimensions field maps operand dimensions to dimensions in the broadcast
+    // output, so to append dimensions on the left the instruction's dimensions
+    // should just be the n highest dimension numbers of the output shape where
+    // n is the number of input dimensions.
+    const int64 operand_rank = ShapeUtil::Rank(operand_shape);
+    std::vector<int64> dimensions(operand_rank);
+    for (int i = 0; i < operand_rank; ++i) {
+      dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank;
+    }
+    return InDimBroadcast(shape, operand, dimensions);
+  });
+}
+
+StatusOr<XlaOp> XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  HloInstructionProto instr;
+  *instr.mutable_shape() = shape;
+  return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand});
+}
+
+XlaOp XlaBuilder::Slice(const XlaOp& operand,
+                        tensorflow::gtl::ArraySlice<int64> start_indices,
+                        tensorflow::gtl::ArraySlice<int64> limit_indices,
+                        tensorflow::gtl::ArraySlice<int64> strides) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferSliceShape(operand_shape, start_indices,
+                                        limit_indices, strides));
+    for (int i = 0; i < start_indices.size(); i++) {
+      auto* slice_config = instr.add_slice_dimensions();
+      slice_config->set_start(start_indices[i]);
+      slice_config->set_limit(limit_indices[i]);
+      slice_config->set_stride(strides[i]);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand});
+  });
+}
+
+XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
+                             int64 limit_index, int64 stride, int64 dimno) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
+    std::vector<int64> limits(shape.dimensions().begin(),
+                              shape.dimensions().end());
+    std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
+    starts[dimno] = start_index;
+    limits[dimno] = limit_index;
+    strides[dimno] = stride;
+    return Slice(operand, starts, limits, strides);
+  });
+}
+
+XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                               tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shape, slice_sizes));
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice,
+                          {operand, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                     const XlaOp& start_indices) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicUpdateSliceShape(
+                            operand_shape, update_shape, start_indices_shape));
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                              int64 dimension) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
+
+    instr.add_dimensions(dimension);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands);
+  });
+}
+
+XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
+                      const PaddingConfig& padding_config) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape,
+                        GetShape(padding_value));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferPadShape(operand_shape, padding_value_shape,
+                                      padding_config));
+
+    *instr.mutable_padding_config() = padding_config;
+
+    return AddInstruction(std::move(instr), HloOpcode::kPad,
+                          {operand, padding_value});
+  });
+}
+
+XlaOp XlaBuilder::Reshape(const XlaOp& operand,
+                          tensorflow::gtl::ArraySlice<int64> dimensions,
+                          tensorflow::gtl::ArraySlice<int64> new_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& shape,
+                        ShapeInference::InferReshapeShape(
+                            operand_shape, dimensions, new_sizes));
+    XlaOp transposed = IsIdentityPermutation(dimensions)
+                           ? operand
+                           : Transpose(operand, dimensions);
+    return Reshape(shape, transposed);
+  });
+}
+
+XlaOp XlaBuilder::Reshape(const XlaOp& operand,
+                          tensorflow::gtl::ArraySlice<int64> new_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, operand.GetShape());
+    std::vector<int64> dimensions(shape.dimensions_size());
+    std::iota(dimensions.begin(), dimensions.end(), 0);
+    return Reshape(operand, dimensions, new_sizes);
+  });
+}
+
+XlaOp XlaBuilder::Collapse(const XlaOp& operand,
+                           tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (dimensions.size() <= 1) {
+      // Not collapsing anything, trivially we can return the operand versus
+      // enqueueing a trivial reshape.
+      return operand;
+    }
+
+    // Out-of-order collapse is not supported.
+    // Checks that the collapsed dimensions are in order and consecutive.
+    for (tensorflow::gtl::ArraySlice<int64>::size_type i = 1;
+         i < dimensions.size(); ++i) {
+      if (dimensions[i] - 1 != dimensions[i - 1]) {
+        return InvalidArgument(
+            "Collapsed dimensions are not in consecutive order.");
+      }
+    }
+
+    // Create a new sizes vector from the old shape, replacing the collapsed
+    // dimensions by the product of their sizes.
+    TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand));
+
+    VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape);
+    VLOG(3) << "dims to collapse: "
+            << tensorflow::str_util::Join(dimensions, ",");
+
+    std::vector<int64> new_sizes;
+    for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
+      if (i <= dimensions.front() || i > dimensions.back()) {
+        new_sizes.push_back(original_shape.dimensions(i));
+      } else {
+        new_sizes.back() *= original_shape.dimensions(i);
+      }
+    }
+
+    VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",")
+            << "]";
+
+    return Reshape(operand, new_sizes);
+  });
+}
+
+void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+    *instr.mutable_literal() = Literal::CreateR1U8(tag)->ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand});
+  });
+}
+
+XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
+                         const XlaOp& on_false) {
+  return TernaryOp(HloOpcode::kSelect, pred, on_true, on_false);
+}
+
+XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferVariadicOpShape(
+                            HloOpcode::kTuple, operand_shape_ptrs));
+    return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
+  });
+}
+
+XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data));
+    if (!ShapeUtil::IsTuple(tuple_shape)) {
+      return InvalidArgument(
+          "Operand to GetTupleElement() is not a tuple; got %s",
+          ShapeUtil::HumanString(tuple_shape).c_str());
+    }
+    *instr.mutable_shape() =
+        ShapeUtil::GetTupleElementShape(tuple_shape, index);
+
+    instr.set_tuple_index(index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement,
+                          {tuple_data});
+  });
+}
+
+XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Le(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+
+    DotDimensionNumbers dimension_numbers;
+    dimension_numbers.add_lhs_contracting_dimensions(
+        lhs_shape.dimensions_size() == 1 ? 0 : 1);
+    dimension_numbers.add_rhs_contracting_dimensions(0);
+    return DotGeneral(lhs, rhs, dimension_numbers);
+  });
+}
+
+XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                             const DotDimensionNumbers& dimension_numbers) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
+                                                        dimension_numbers));
+    *instr.mutable_dot_dimension_numbers() = dimension_numbers;
+    return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs});
+  });
+}
+
+Status XlaBuilder::VerifyConvolution(
+    const Shape& lhs_shape, const Shape& rhs_shape,
+    const ConvolutionDimensionNumbers& dimension_numbers) const {
+  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
+    return InvalidArgument(
+        "Convolution arguments must have same number of "
+        "dimensions. Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_dims = ShapeUtil::Rank(lhs_shape);
+  if (num_dims < 2) {
+    return InvalidArgument(
+        "Convolution expects argument arrays with >= 3 dimensions. "
+        "Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_spatial_dims = num_dims - 2;
+
+  const auto check_spatial_dimensions =
+      [&](const char* const field_name,
+          const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>&
+              numbers) {
+        if (numbers.size() != num_spatial_dims) {
+          return InvalidArgument("Expected %d elements for %s, but got %d.",
+                                 num_spatial_dims, field_name, numbers.size());
+        }
+        for (int i = 0; i < numbers.size(); ++i) {
+          if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) {
+            return InvalidArgument("Convolution %s[%d] is out of bounds: %lld",
+                                   field_name, i, numbers.Get(i));
+          }
+        }
+        return Status::OK();
+      };
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("input_spatial_dimensions",
+                               dimension_numbers.input_spatial_dimensions()));
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("kernel_spatial_dimensions",
+                               dimension_numbers.kernel_spatial_dimensions()));
+  return check_spatial_dimensions(
+      "output_spatial_dimensions",
+      dimension_numbers.output_spatial_dimensions());
+}
+
+XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
+                       tensorflow::gtl::ArraySlice<int64> window_strides,
+                       Padding padding) {
+  return ConvWithGeneralDimensions(
+      lhs, rhs, window_strides, padding,
+      CreateDefaultConvDimensionNumbers(window_strides.size()));
+}
+
+XlaOp XlaBuilder::ConvWithGeneralPadding(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+  return ConvGeneral(lhs, rhs, window_strides, padding,
+                     CreateDefaultConvDimensionNumbers(window_strides.size()));
+}
+
+XlaOp XlaBuilder::ConvWithGeneralDimensions(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> base_area_dimensions(
+        dimension_numbers.input_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
+         ++i) {
+      base_area_dimensions[i] =
+          lhs_shape.dimensions(dimension_numbers.input_spatial_dimensions(i));
+    }
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+
+    return ConvGeneral(lhs, rhs, window_strides,
+                       MakePadding(base_area_dimensions, window_dimensions,
+                                   window_strides, padding),
+                       dimension_numbers);
+  });
+}
+
+XlaOp XlaBuilder::ConvGeneral(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
+                            dimension_numbers);
+}
+
+XlaOp XlaBuilder::ConvGeneralDilated(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   lhs_dilation, rhs_dilation));
+
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(),
+                                           dimension_numbers));
+
+    *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
+
+    return AddInstruction(std::move(instr), HloOpcode::kConvolution,
+                          {lhs, rhs});
+  });
+}
+
+StatusOr<Window> XlaBuilder::MakeWindow(
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation) const {
+  const auto verify_size = [&](const size_t x, const char* x_name) {
+    if (x == 0 || x == window_dimensions.size()) {
+      return Status::OK();
+    } else {
+      return InvalidArgument(
+          "%s", tensorflow::strings::StrCat(
+                    "Window has different number of window dimensions than of ",
+                    x_name,
+                    "\nNumber of window dimensions: ", window_dimensions.size(),
+                    "\nNumber of ", x_name, ": ", x, "\n")
+                    .c_str());
+    }
+  };
+  TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides"));
+  TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries"));
+  TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors"));
+  TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors"));
+
+  Window window;
+  for (size_t i = 0; i < window_dimensions.size(); i++) {
+    auto dim = window.add_dimensions();
+    dim->set_size(window_dimensions[i]);
+    if (!window_strides.empty()) {
+      dim->set_stride(window_strides[i]);
+    } else {
+      dim->set_stride(1);
+    }
+    if (!padding.empty()) {
+      dim->set_padding_low(padding[i].first);
+      dim->set_padding_high(padding[i].second);
+    } else {
+      dim->set_padding_low(0);
+      dim->set_padding_high(0);
+    }
+    if (!lhs_dilation.empty()) {
+      dim->set_base_dilation(lhs_dilation[i]);
+    } else {
+      dim->set_base_dilation(1);
+    }
+    if (!rhs_dilation.empty()) {
+      dim->set_window_dilation(rhs_dilation[i]);
+    } else {
+      dim->set_window_dilation(1);
+    }
+    dim->set_window_reversal(false);
+  }
+  return window;
+}
+
+XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
+                      const tensorflow::gtl::ArraySlice<int64> fft_length) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
+
+    instr.set_fft_type(fft_type);
+    for (int64 i : fft_length) {
+      instr.add_fft_length(i);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kFft, {operand});
+  });
+}
+
+XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (!LayoutUtil::HasLayout(shape)) {
+      return InvalidArgument("Given shape to Infeed must have a layout");
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_infeed_config(config);
+    return AddInstruction(std::move(instr), HloOpcode::kInfeed);
+  });
+}
+
+void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+                         const string& outfeed_config) {
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+
+    // Check and set outfeed shape.
+    if (!LayoutUtil::HasLayout(shape_with_layout)) {
+      return InvalidArgument("Given shape to Outfeed must have a layout");
+    }
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) {
+      return InvalidArgument(
+          "Outfeed shape %s must be compatible with operand shape %s",
+          ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(),
+          ShapeUtil::HumanStringWithLayout(operand_shape).c_str());
+    }
+    *instr.mutable_outfeed_shape() = shape_with_layout;
+
+    instr.set_outfeed_config(outfeed_config);
+
+    return AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand});
+  });
+}
+
+XlaOp XlaBuilder::CustomCall(const string& call_target_name,
+                             tensorflow::gtl::ArraySlice<XlaOp> operands,
+                             const Shape& shape) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (tensorflow::str_util::StartsWith(call_target_name, "$")) {
+      return InvalidArgument(
+          "Invalid custom_call_target \"%s\": Call targets that start with '$' "
+          "are reserved for internal use.",
+          call_target_name.c_str());
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_custom_call_target(call_target_name);
+    return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+  });
+}
+
+XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                              const string& channel_name,
+                              int64 cost_estimate_ns, const Shape& shape) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = shape;
+    instr.set_channel_name(channel_name);
+    instr.set_cost_estimate_ns(cost_estimate_ns);
+    return AddInstruction(std::move(instr), HloOpcode::kHostCompute, operands);
+  });
+}
+
+XlaOp XlaBuilder::Complex(
+    const XlaOp& real, const XlaOp& imag,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
+
+XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kOr, lhs, rhs, broadcast_dimensions);
+}
+
+// TODO(b/65209188): Create a dedicated lowering for Xor.
+XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return Or(And(Not(lhs), rhs, broadcast_dimensions),
+            And(lhs, Not(rhs), broadcast_dimensions));
+}
+
+XlaOp XlaBuilder::Not(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kNot, operand);
+}
+
+XlaOp XlaBuilder::ShiftLeft(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ShiftRightArithmetic(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
+                  broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ShiftRightLogical(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
+                  broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Abs(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kAbs, operand);
+}
+
+XlaOp XlaBuilder::Atan2(
+    const XlaOp& y, const XlaOp& x,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Exp(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kExp, operand);
+}
+
+XlaOp XlaBuilder::Floor(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kFloor, operand);
+}
+
+XlaOp XlaBuilder::Ceil(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kCeil, operand);
+}
+
+XlaOp XlaBuilder::Round(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kRoundNearestAfz, operand);
+}
+
+XlaOp XlaBuilder::Log(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kLog, operand);
+}
+
+XlaOp XlaBuilder::Sign(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kSign, operand);
+}
+
+XlaOp XlaBuilder::Clz(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kClz, operand);
+}
+
+XlaOp XlaBuilder::Cos(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kCos, operand);
+}
+
+XlaOp XlaBuilder::Sin(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kSin, operand);
+}
+
+XlaOp XlaBuilder::Tanh(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kTanh, operand);
+}
+
+XlaOp XlaBuilder::Real(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kReal, operand);
+}
+
+XlaOp XlaBuilder::Imag(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kImag, operand);
+}
+
+XlaOp XlaBuilder::IsFinite(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kIsFinite, operand);
+}
+
+XlaOp XlaBuilder::Transpose(const XlaOp& operand,
+                            tensorflow::gtl::ArraySlice<int64> permutation) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferTransposeShape(operand_shape, permutation));
+    for (int64 dim : permutation) {
+      instr.add_dimensions(dim);
+    }
+    return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand});
+  });
+}
+
+XlaOp XlaBuilder::Rev(const XlaOp& operand,
+                      tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReverseShape(operand_shape, dimensions));
+    for (int64 dim : dimensions) {
+      instr.add_dimensions(dim);
+    }
+    return AddInstruction(std::move(instr), HloOpcode::kReverse, {operand});
+  });
+}
+
+XlaOp XlaBuilder::Sort(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kSort, operand);
+}
+
+XlaOp XlaBuilder::SqrtF32(const XlaOp& operand) {
+  return BinaryOp(HloOpcode::kPower, operand, ConstantR0<float>(0.5),
+                  /*broadcast_dimensions=*/{});
+}
+
+XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
+                                     PrimitiveType new_element_type) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand});
+  });
+}
+
+XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
+                                     PrimitiveType new_element_type) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
+                          {operand});
+  });
+}
+
+XlaOp XlaBuilder::SquareF32(const XlaOp& operand) {
+  return BinaryOp(HloOpcode::kPower, operand, ConstantR0<float>(2.0),
+                  /*broadcast_dimensions=*/{});
+}
+
+XlaOp XlaBuilder::ReciprocalF32(const XlaOp& operand) {
+  return BinaryOp(HloOpcode::kPower, operand, ConstantR0<float>(-1.0),
+                  /*broadcast_dimensions=*/{});
+}
+
+XlaOp XlaBuilder::Neg(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kNegate, operand);
+}
+
+XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand,
+                        const XlaOp& max) {
+  return TernaryOp(HloOpcode::kClamp, min, operand, max);
+}
+
+XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                      const XlaComputation& computation,
+                      tensorflow::gtl::ArraySlice<int64> dimensions,
+                      tensorflow::gtl::ArraySlice<XlaOp> static_operands) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (!static_operands.empty()) {
+      return Unimplemented("static_operands is not supported in Map");
+    }
+
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape,
+                                      dimensions));
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kMap, operands);
+  });
+}
+
+XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
+                        tensorflow::gtl::ArraySlice<XlaOp> parameters,
+                        const Shape& shape) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Check the number of parameters per RNG distribution.
+    switch (distribution) {
+      case RandomDistribution::RNG_NORMAL:
+      case RandomDistribution::RNG_UNIFORM:
+        if (parameters.size() != 2) {
+          return InvalidArgument(
+              "RNG distribution (%s) expects 2 parameters, but got %ld",
+              RandomDistribution_Name(distribution).c_str(), parameters.size());
+        }
+        break;
+      default:
+        LOG(FATAL) << "unhandled distribution " << distribution;
+    }
+
+    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
+    *instr.mutable_shape() = shape;
+
+    instr.set_distribution(distribution);
+
+    return AddInstruction(std::move(instr), HloOpcode::kRng, parameters);
+  });
+}
+
+XlaOp XlaBuilder::RngNormal(const XlaOp& mu, const XlaOp& sigma,
+                            const Shape& shape) {
+  return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape);
+}
+
+XlaOp XlaBuilder::RngUniform(const XlaOp& a, const XlaOp& b,
+                             const Shape& shape) {
+  return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape);
+}
+
+XlaOp XlaBuilder::While(const XlaComputation& condition,
+                        const XlaComputation& body, const XlaOp& init) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Infer shape.
+    TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
+                        condition.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferWhileShape(condition_program_shape,
+                                        body_program_shape, init_shape));
+    // Body comes before condition computation in the vector.
+    AddCalledComputation(body, &instr);
+    AddCalledComputation(condition, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kWhile, {init});
+  });
+}
+
+XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
+                         const GatherDimensionNumbers& dimension_numbers,
+                         tensorflow::gtl::ArraySlice<int64> window_bounds) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
+    TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape,
+                        GetShape(gather_indices));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferGatherShape(input_shape, gather_indices_shape,
+                                         dimension_numbers, window_bounds));
+
+    *instr.mutable_gather_dimension_numbers() = dimension_numbers;
+    for (int64 bound : window_bounds) {
+      instr.add_gather_window_bounds(bound);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kGather,
+                          {input, gather_indices});
+  });
+}
+
+XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                              const XlaComputation& true_computation,
+                              const XlaOp& false_operand,
+                              const XlaComputation& false_computation) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate));
+    TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape,
+                        GetShape(true_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape,
+                        true_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape,
+                        GetShape(false_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
+                        false_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConditionalShape(
+            predicate_shape, true_operand_shape, false_operand_shape,
+            true_computation_shape, false_computation_shape));
+
+    // The index of true_computation must be 0 and that of false computation
+    // must be 1.
+    AddCalledComputation(true_computation, &instr);
+    AddCalledComputation(false_computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConditional,
+                          {predicate, true_operand, false_operand});
+  });
+}
+
+XlaOp XlaBuilder::Reduce(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReduceShape(
+                            operand_shape, init_shape, dimensions_to_reduce,
+                            called_program_shape));
+
+    for (int64 dim : dimensions_to_reduce) {
+      instr.add_dimensions(dim);
+    }
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kReduce,
+                          {operand, init_value});
+  });
+}
+
+XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                            const XlaComputation& computation) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<int64> all_dimnos(ShapeUtil::Rank(operand_shape));
+    std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
+    return Reduce(operand, init_value, computation, all_dimnos);
+  });
+}
+
+XlaOp XlaBuilder::ReduceWindow(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_RETURN_IF_ERROR(
+        ValidatePaddingValues(AsInt64Slice(operand_shape.dimensions()),
+                              window_dimensions, window_strides));
+
+    std::vector<std::pair<int64, int64>> padding_values =
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding);
+    return ReduceWindowWithGeneralPadding(operand, init_value, computation,
+                                          window_dimensions, window_strides,
+                                          padding_values);
+  });
+}
+
+XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
+                                               instr.window(), to_apply_shape));
+
+    AddCalledComputation(computation, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kReduceWindow,
+                          {operand, init_value});
+  });
+}
+
+XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                                    const XlaOp& offset, float epsilon,
+                                    int64 feature_index) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferBatchNormTrainingShape(
+            operand_shape, scale_shape, offset_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormTraining,
+                          {operand, scale, offset});
+  });
+}
+
+XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                                     const XlaOp& offset, const XlaOp& mean,
+                                     const XlaOp& variance, float epsilon,
+                                     int64 feature_index) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean));
+    TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormInferenceShape(
+                            operand_shape, scale_shape, offset_shape,
+                            mean_shape, variance_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormInference,
+                          {operand, scale, offset, mean, variance});
+  });
+}
+
+XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                                const XlaOp& batch_mean, const XlaOp& batch_var,
+                                const XlaOp& grad_output, float epsilon,
+                                int64 feature_index) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var));
+    TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormGradShape(
+                            operand_shape, scale_shape, batch_mean_shape,
+                            batch_var_shape, grad_output_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormGrad,
+                          {operand, scale, batch_mean, batch_var, grad_output});
+  });
+}
+
+XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
+
+    return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum,
+                          {operand});
+  });
+}
+
+XlaOp XlaBuilder::SelectAndScatter(
+    const XlaOp& operand, const XlaComputation& select,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+    const XlaOp& source, const XlaOp& init_value,
+    const XlaComputation& scatter) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    return SelectAndScatterWithGeneralPadding(
+        operand, select, window_dimensions, window_strides,
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding),
+        source, init_value, scatter);
+  });
+}
+
+XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
+    const XlaOp& operand, const XlaComputation& select,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    const XlaOp& source, const XlaOp& init_value,
+    const XlaComputation& scatter) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& source_shape, GetShape(source));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& select_shape,
+                        select.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape,
+                        scatter.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferSelectAndScatterShape(
+                            operand_shape, select_shape, instr.window(),
+                            source_shape, init_shape, scatter_shape));
+
+    AddCalledComputation(select, &instr);
+    AddCalledComputation(scatter, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kSelectAndScatter,
+                          {operand, source, init_value});
+  });
+}
+
+XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                                  const int mantissa_bits) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReducePrecisionShape(
+                            operand_shape, exponent_bits, mantissa_bits));
+    instr.set_exponent_bits(exponent_bits);
+    instr.set_mantissa_bits(mantissa_bits);
+    return AddInstruction(std::move(instr), HloOpcode::kReducePrecision,
+                          {operand});
+  });
+}
+
+void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Send instruction produces a tuple of {aliased operand, U32 context}.
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(
+        XlaOp send,
+        AddInstruction(std::move(instr), HloOpcode::kSend, {operand}));
+
+    HloInstructionProto send_done_instr;
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeNil();
+    send_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
+                          {send});
+  });
+}
+
+XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Recv instruction produces a tuple of {receive buffer, U32 context}.
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(XlaOp recv,
+                        AddInstruction(std::move(instr), HloOpcode::kRecv, {}));
+
+    HloInstructionProto recv_done_instr;
+    *recv_done_instr.mutable_shape() = shape;
+    recv_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
+                          {recv});
+  });
+}
+
+StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  // Verify that the handle is valid.
+  TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
+
+  bool is_constant = true;
+  std::set<int64> visited;
+  IsConstantVisitor(operand.handle(), &visited, &is_constant);
+  return is_constant;
+}
+
+StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
+    const XlaOp& root_op) const {
+  TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op));
+  if (!is_constant) {
+    auto op_status = LookUpInstruction(root_op);
+    string op_string =
+        op_status.ok() ? op_status.ValueOrDie()->name() : "<unknown operation>";
+    return InvalidArgument(
+        "Operand to BuildConstantSubGraph depends on a parameter.\n\n"
+        "  op requested for constant subgraph: %s\n\n"
+        "This is an internal error that typically happens when the XLA user "
+        "(e.g. TensorFlow) is attempting to determine a value that must be a "
+        "compile-time constant (e.g. an array dimension) but it is not capable "
+        "of being evaluated at XLA compile time.\n\n"
+        "Please file a usability bug with the framework being used (e.g. "
+        "TensorFlow).",
+        op_string.c_str());
+  }
+
+  TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
+                      LookUpInstruction(root_op));
+  TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode()));
+  if (!CanBeRoot(opcode)) {
+    return InvalidArgument("the operand with opcode %s cannot be root",
+                           root->opcode().c_str());
+  }
+
+  HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id(), "_compute_constant"));
+  entry.set_root_id(root->id());
+  ProgramShape* program_shape = entry.mutable_program_shape();
+  *program_shape->mutable_result() = root->shape();
+
+  // We use std::set to keep the instruction ids in ascending order (which is
+  // also a valid denpendency order). The related ops will be added to the
+  // subgraph in the same order.
+  std::set<int64> related_ops;
+  tensorflow::gtl::FlatSet<int64> related_calls;  // Related computations.
+  std::queue<int64> worklist;
+  worklist.push(root->id());
+  related_ops.insert(root->id());
+  while (!worklist.empty()) {
+    int64 node = worklist.front();
+    worklist.pop();
+    for (int64 id : instructions_[node].operand_ids()) {
+      if (related_ops.insert(id).second) {
+        worklist.push(id);
+      }
+    }
+    for (int64 called_id : instructions_[node].called_computation_ids()) {
+      related_calls.insert(called_id);
+    }
+  }
+
+  // Add related ops to the computation.
+  for (int64 id : related_ops) {
+    auto* instr = entry.add_instructions();
+    *instr = instructions_[id];
+    // Ensures that the instruction names are unique among the graph.
+    const string& new_name =
+        StrCat(instr->name(), ".", entry.id(), ".", instr->id());
+    instr->set_name(new_name);
+  }
+
+  XlaComputation computation(entry.id());
+  HloModuleProto* module = computation.mutable_proto();
+  module->set_name(entry.name());
+  module->set_id(entry.id());
+  module->set_entry_computation_name(entry.name());
+  module->set_entry_computation_id(entry.id());
+  *module->mutable_program_shape() = *program_shape;
+  for (auto& e : embedded_) {
+    if (related_calls.find(e.second.id()) != related_calls.end()) {
+      *module->add_computations() = e.second;
+    }
+  }
+  *module->add_computations() = std::move(entry);
+
+  return std::move(computation);
+}
+
+std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
+    const string& computation_name) {
+  auto sub_builder = MakeUnique<XlaBuilder>(computation_name);
+  sub_builder->parent_builder_ = this;
+  sub_builder->die_immediately_on_error_ = this->die_immediately_on_error_;
+  return sub_builder;
+}
+
+/* static */ ConvolutionDimensionNumbers
+XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
+  ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(kConvBatchDimension);
+  dimension_numbers.set_input_feature_dimension(kConvFeatureDimension);
+  dimension_numbers.set_output_batch_dimension(kConvBatchDimension);
+  dimension_numbers.set_output_feature_dimension(kConvFeatureDimension);
+  dimension_numbers.set_kernel_output_feature_dimension(
+      kConvKernelOutputDimension);
+  dimension_numbers.set_kernel_input_feature_dimension(
+      kConvKernelInputDimension);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    dimension_numbers.add_input_spatial_dimensions(i + 2);
+    dimension_numbers.add_kernel_spatial_dimensions(i + 2);
+    dimension_numbers.add_output_spatial_dimensions(i + 2);
+  }
+  return dimension_numbers;
+}
+
+/* static */ Status XlaBuilder::Validate(
+    const ConvolutionDimensionNumbers& dnum) {
+  if (dnum.input_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("input spacial dimension < 2: %d",
+                              dnum.input_spatial_dimensions_size());
+  }
+  if (dnum.kernel_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("kernel spacial dimension < 2: %d",
+                              dnum.kernel_spatial_dimensions_size());
+  }
+  if (dnum.output_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("output spacial dimension < 2: %d",
+                              dnum.output_spatial_dimensions_size());
+  }
+
+  if (std::set<int64>(
+          {dnum.input_batch_dimension(), dnum.input_feature_dimension(),
+           dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the input are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.input_batch_dimension(), dnum.input_feature_dimension(),
+        dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1));
+  }
+  if (std::set<int64>({dnum.kernel_output_feature_dimension(),
+                       dnum.kernel_input_feature_dimension(),
+                       dnum.kernel_spatial_dimensions(0),
+                       dnum.kernel_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the weight are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.kernel_output_feature_dimension(),
+        dnum.kernel_input_feature_dimension(),
+        dnum.kernel_spatial_dimensions(0), dnum.kernel_spatial_dimensions(1));
+  }
+  if (std::set<int64>({dnum.output_batch_dimension(),
+                       dnum.output_feature_dimension(),
+                       dnum.output_spatial_dimensions(0),
+                       dnum.output_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the output are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.output_batch_dimension(), dnum.output_feature_dimension(),
+        dnum.output_spatial_dimensions(0), dnum.output_spatial_dimensions(1));
+  }
+  return Status::OK();
+}
+
+StatusOr<XlaOp> XlaBuilder::AddInstruction(
+    HloInstructionProto&& instr, HloOpcode opcode,
+    tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  const int64 handle = instructions_.size();
+  instr.set_id(handle);
+  instr.set_opcode(HloOpcodeString(opcode));
+  if (instr.name().empty()) {
+    instr.set_name(StrCat(instr.opcode()));
+  }
+  for (const auto& operand : operands) {
+    if (operand.builder_ == nullptr) {
+      return InvalidArgument("invalid XlaOp with handle %lld",
+                             operand.handle());
+    }
+    if (operand.builder_ != this) {
+      return InvalidArgument("Do not add XlaOp from builder %s to builder %s",
+                             operand.builder_->name().c_str(),
+                             this->name().c_str());
+    }
+    instr.add_operand_ids(operand.handle());
+  }
+
+  *instr.mutable_metadata() = metadata_;
+  if (sharding_) {
+    *instr.mutable_sharding() = *sharding_;
+  }
+
+  instructions_.push_back(instr);
+
+  XlaOp op(handle, this);
+  return op;
+}
+
+void XlaBuilder::AddCalledComputation(const XlaComputation& computation,
+                                      HloInstructionProto* instr) {
+  instr->add_called_computation_ids(computation.proto().entry_computation_id());
+  for (const HloComputationProto& e : computation.proto().computations()) {
+    embedded_.insert({e.id(), e});
+  }
+}
+
+StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
+    const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  if (op.builder_ != this) {
+    return InvalidArgument("invalid XlaOp with handle %lld", op.handle());
+  }
+
+  TF_RET_CHECK(op.builder_ == this);
+  if (op.handle() >= instructions_.size() || op.handle() < 0) {
+    return InvalidArgument("no XlaOp value %lld", op.handle());
+  }
+  return &instructions_[op.handle()];
+}
+
+XlaOp XlaBuilder::UnimplementedOp() {
+  NoteError(Unimplemented("Op not implemented"));
+  return {};
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..4955f1515d66af00ddf72e4c7621292a590e662c
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -0,0 +1,1015 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// TODO(b/74197823): Replace computation_builder.h with this file.
+//
+// This is NOT YET ready to use.
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
+
+#include <map>
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stacktrace.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+class XlaBuilder;
+
+// This represents an instruction that has been enqueued using the XlaBuilder.
+// This is used to pass to subsequent computations that depends upon the
+// instruction as an operand.
+//
+// TODO(b/74197823): Replace xla::ComputationDataHandle with this one.
+class XlaOp {
+ public:
+  XlaOp() : handle_(0), builder_(nullptr) {}
+  ~XlaOp() {}
+
+  StatusOr<Shape> GetShape() const;
+
+  const XlaBuilder* builder() const { return builder_; }
+
+  bool operator==(const XlaOp& rhs) const {
+    return handle_ == rhs.handle_ && builder_ == rhs.builder_;
+  }
+
+  bool operator!=(const XlaOp& rhs) const {
+    return handle_ != rhs.handle_ || builder_ != rhs.builder_;
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
+    out << op.handle();
+    return out;
+  }
+
+ private:
+  XlaOp(int64 handle, XlaBuilder* builder)
+      : handle_(handle), builder_(builder) {}
+
+  int64 handle() const { return handle_; }
+
+  friend class XlaBuilder;
+
+  int64 handle_;
+  XlaBuilder* builder_;  // Not owned.
+};
+
+// A convenient interface for building up computations.
+//
+// Thread-compatible.
+//
+// TODO(b/74197823): Replace xla::ComputationBuilder with this one.
+class XlaBuilder {
+ public:
+  // computation_name: name to use for the built computation.
+  XlaBuilder(const string& computation_name);
+
+  XlaBuilder(const XlaBuilder&) = delete;
+  XlaBuilder& operator=(const XlaBuilder&) = delete;
+
+  ~XlaBuilder();
+
+  // Returns the computation name.
+  const string& name() const { return name_; }
+
+  // Sets OpMetadata that will be added to all instructions until cleared.
+  //
+  // OpMetadata is often applied to a series of XLA HLO instructions. As a
+  // result, OpMetadata is set on the Computation Builder. All subsequent
+  // instructions generated via this Computation Builder will have the same
+  // OpMetadata attached until a call to ClearOpMetadata.
+  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
+
+  // Clears the HloMetadata state.
+  void ClearOpMetadata() { metadata_.Clear(); }
+
+  // Sets an OpSharding that will be attached to all instructions until cleared.
+  void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
+
+  // Clears the sharding. Ops will be sharded according to the default placement
+  // policy.
+  void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
+
+  // Returns the OpSharding that will be attached to all instructions.
+  const tensorflow::gtl::optional<OpSharding>& sharding() const {
+    return sharding_;
+  }
+
+  // Sets the builder to a mode where it will die immediately when an error is
+  // encountered, rather than producing it in a deferred fashion when Build() is
+  // called (which is the default).
+  void set_die_immediately_on_error(bool enabled) {
+    die_immediately_on_error_ = enabled;
+  }
+
+  // Enqueues a "retrieve parameter value" instruction for a parameter that was
+  // passed to the computation.
+  XlaOp Parameter(int64 parameter_number, const Shape& shape,
+                  const string& name);
+
+  // Enqueues a constant with the value of the given literal onto the
+  // computation.
+  XlaOp ConstantLiteral(const Literal& literal);
+
+  // Enqueues a constant onto the computation. Methods are templated on the
+  // native host type (NativeT) which corresponds to a specific XLA
+  // PrimitiveType as given in the following table:
+  //
+  //  Native Type   PrimitiveType
+  // -----------------------------
+  //   bool           PRED
+  //   int32          S32
+  //   int64          S64
+  //   uint32         U32
+  //   uint64         U64
+  //   float          F32
+  //   double         F64
+  //
+  // Note: not all primitive types defined in xla_data.proto have a
+  // corresponding native type yet.
+  template <typename NativeT>
+  XlaOp ConstantR0(NativeT value);
+  template <typename NativeT>
+  XlaOp ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
+  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
+  template <typename NativeT>
+  XlaOp ConstantR2(
+      std::initializer_list<std::initializer_list<NativeT>> values);
+  template <typename NativeT>
+  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
+                                    const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantFromArray(const Array<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
+
+  // Enqueues a rank one constant (vector) onto the computation. The vector has
+  // size 'length' and every element has the value 'value'.
+  template <typename NativeT>
+  XlaOp ConstantR1(int64 length, NativeT value);
+
+  // Adds dimensions to an array by duplicating the data in the array.
+  //
+  // The new dimensions are inserted on the left, i.e. if
+  // broadcast_sizes has values {a0, ..., aN} and the operand shape
+  // has dimensions {b0, ..., bM} then the shape of the output has
+  // dimensions {a0, ..., aN, b0, ..., bM}.
+  //
+  // The new dimensions index into copies of the operand, i.e.
+  //
+  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
+  XlaOp Broadcast(const XlaOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+  // Enqueues a pad operation onto the computation that pads the given value on
+  // the edges as well as between the elements of the input. padding_config
+  // specifies the padding amount for each dimension.
+  XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+            const PaddingConfig& padding_config);
+
+  // Enqueues an operation onto the computation that flattens the operand based
+  // on the dimension order (major/slowest-varying to minor/fastest-varying)
+  // given, followed by reshaping it into the shape with the given dimension
+  // sizes (also major to minor). Conceptually, this is a limited form of
+  // "shape casting".
+  XlaOp Reshape(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> dimensions,
+                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  // Enqueues an operation onto the computation that collapses the operand, from
+  // first to last dimension (C order), then reshapes it to the given dimension
+  // sizes. Conceptually, this is a limited form of "shape casting".
+  XlaOp Reshape(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  // Wrapper for Reshape.
+  // Enqueues an operation to collapse the provided dimensions; e.g. an
+  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
+  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
+  // be a consecutive, in-order subsequence of the operand dimensions.
+  //
+  // Note that collapsing a single dimension does nothing:
+  //
+  //    {256} collapsing {0} => {256}
+  //    {1} collapsing {0} => {1}
+  //
+  // Collapsing multiple dimensions produces a single result dimension:
+  //
+  //    {256, 2} collapsing {0,1} => {512}
+  //    {256, 2, 3} collapsing {0,1} => {512, 3}
+  //
+  // This could potentially cause data to be moved -- it provides a more
+  // structured form of reshaping than an arbitrary Reshape operation.
+  XlaOp Collapse(const XlaOp& operand,
+                 tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Enqueues a slice operation onto the computation that slices the operand
+  // from the start indices to the limit indices; e.g.
+  //
+  //        x
+  //   [ 0 1 2 3 ]
+  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
+  //   [ 8 9 a b ]
+  //
+  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
+  // range notation.
+  // The strides parameter determines the stride over the slice
+  XlaOp Slice(const XlaOp& operand,
+              tensorflow::gtl::ArraySlice<int64> start_indices,
+              tensorflow::gtl::ArraySlice<int64> limit_indices,
+              tensorflow::gtl::ArraySlice<int64> strides);
+
+  // Enqueues a slice operation in a given dimension, taking all other
+  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
+  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
+  // for:
+  //
+  //  array[:, 2:4:1, :]
+  XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+                   int64 stride, int64 dimno);
+
+  // Enqueues a slice operation onto the computation that slices the 'operand'
+  // from dynamic start indices which are passed in 'start_indices'.
+  // The size of the slice in each dimension is passed in 'slice_sizes',
+  // which specify the end point of exclusive slice intervals in each
+  // dimension [start, start + size).
+  // The shape of 'start_indices' must be rank == 1, with dimension size
+  // equal to the rank of the 'operand'.
+  // Slice index calculations are computed modulo input dimension sizes to
+  // prevent dynamic start indices from generating out-of-bound array accesses.
+  XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                     tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+  // Enqueues a dynamic update slice operation onto the computation, which
+  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
+  // The shape of 'update' determines the shape of the slice of 'operand'
+  // which is updated.
+  // The indices specified in 'start_indices' specify the offset of the slice
+  // of 'operand' which is updated.
+  //
+  //               update = {10, 11} // calculated at runtime.
+  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
+  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
+  //   [7 8 9]                                                  [7 8  9 ]
+  //
+  // The shape of 'start_indices' must be rank == 1, with dimension size
+  // equal to the rank of the 'operand'.
+  // Slice index calculations are computed modulo update dimension sizes to
+  // prevent dynamic start indices from generating out-of-bound array accesses.
+  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                           const XlaOp& start_indices);
+
+  // Enqueues a concatenate instruction onto the computation. 'operands' must
+  // have >= 1 entry.
+  XlaOp ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                    int64 dimension);
+
+  // Enqueue a tracing operation onto the computation; the computation will emit
+  // a logging message with the operand.
+  void Trace(const string& tag, const XlaOp& operand);
+
+  // Enqueues a conditional-move-like select operation onto the computation;
+  // predicated on pred, selects between on_true and on_false.
+  XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+
+  // Enqueues a tuple-creation instruction onto the computation.
+  XlaOp Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements);
+
+  // Enqueues a tuple-element-get instruction onto the computation.
+  XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+
+  // Enqueues an equal-to comparison instruction onto the computation.
+  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a not-equal comparison instruction onto the computation.
+  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a greater-or-equal comparison instruction onto the computation.
+  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a greater-than comparison instruction onto the computation.
+  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a less-than comparison instruction onto the computation.
+  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a less-or-equal comparison instruction onto the computation.
+  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a dot instruction onto the computation.
+  XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+
+  // Enqueues a general dot instruction onto the computation.
+  XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                   const DotDimensionNumbers& dimension_numbers);
+
+  // Default dimension numbers used for a 2D convolution.
+  static constexpr int64 kConvBatchDimension = 0;
+  static constexpr int64 kConvFeatureDimension = 1;
+  static constexpr int64 kConvFirstSpatialDimension = 2;
+  static constexpr int64 kConvSecondSpatialDimension = 3;
+  static constexpr int64 kConvKernelOutputDimension = 0;
+  static constexpr int64 kConvKernelInputDimension = 1;
+  static constexpr int64 kConvKernelFirstSpatialDimension = 2;
+  static constexpr int64 kConvKernelSecondSpatialDimension = 3;
+
+  // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
+  // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
+  // the kernel operand
+  // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
+  static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
+      int num_spatial_dims = 2);
+
+  // Returns an error if the convolution dimension numbers have conflicts.
+  static Status Validate(const ConvolutionDimensionNumbers& dnum);
+
+  // Enqueues a convolution instruction onto the computation, which uses the
+  // default convolution dimension numbers.
+  XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+             tensorflow::gtl::ArraySlice<int64> window_strides,
+             Padding padding);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration in the format returned by MakePadding().
+  XlaOp ConvWithGeneralPadding(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided dimension numbers configuration.
+  XlaOp ConvWithGeneralDimensions(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration as well as the dimension numbers.
+  XlaOp ConvGeneral(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration, dilation factors and dimension numbers.
+  XlaOp ConvGeneralDilated(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues an FFT instruction onto the computation, of the given type and
+  // with the given FFT length.
+  XlaOp Fft(const XlaOp& operand, FftType fft_type,
+            tensorflow::gtl::ArraySlice<int64> fft_length);
+
+  // Enqueues an infeed instruction onto the computation, which writes data of
+  // the given shape to the infeed buffer of the device.
+  XlaOp Infeed(const Shape& shape, const string& config = "");
+
+  // Enqueues an outfeed instruction onto the computation. This instruction
+  // generates outgoing data transfers for the given data.
+  //
+  // shape_with_layout communicates the laid out shape that we want to outfeed
+  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+  // will occur.
+  void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+               const string& outfeed_config);
+
+  // Enqueues a call instruction onto the computation.
+  XlaOp Call(const XlaComputation& computation,
+             tensorflow::gtl::ArraySlice<XlaOp> operands);
+
+  // Enqueues a custom call instruction onto the computation.
+  // During code generation, a call instruction is emitted which targets a
+  // symbol with the name |call_target_name|.  The |operands| are passed to the
+  // call instruction.  |shape| is the resultant shape.
+  XlaOp CustomCall(const string& call_target_name,
+                   tensorflow::gtl::ArraySlice<XlaOp> operands,
+                   const Shape& shape);
+
+  // Enqueues a pseudo-op to represent host-side computation data-dependencies.
+  // During code generation, host send and receive operations will be generated
+  // to transfer |operands| to the host and a single result of |shape| back to
+  // the device.  Host send/recv operations are emitted using |channel_name|.
+  // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
+  // instruction scheduling.
+  XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                    const string& channel_name, int64 cost_estimate_ns,
+                    const Shape& shape);
+
+  // The following methods enqueue element-wise binary arithmetic operations
+  // onto the computation. The shapes of the operands have to match unless one
+  // of the operands is a scalar, or an explicit broadcast dimension is given
+  // (see g3doc for more details).
+
+  // Enqueues a complex compose instruction onto the computation.
+  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+                tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a complex conjugate instruction onto the computation.
+  XlaOp Conj(const XlaOp& operand);
+
+  // Enqueues an add instruction onto the computation.
+  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a subtract instruction onto the computation.
+  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a multiply instruction onto the computation.
+  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a divide instruction onto the computation.
+  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a remainder instruction onto the computation.
+  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a max instruction onto the computation.
+  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a min instruction onto the computation.
+  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Element-wise logical operators
+  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Not(const XlaOp& operand);
+
+  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+  XlaOp ShiftRightArithmetic(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+  XlaOp ShiftRightLogical(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Reduces an array among the provided dimensions, given "computation" as a
+  // reduction operator.
+  XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+               const XlaComputation& computation,
+               tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+  // Convenience wrapper around the above that reduces all the dimensions in the
+  // operand shape.
+  XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                  const XlaComputation& computation);
+
+  // Enqueues a windowed reduce instruction onto the computation.
+  XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+                     const XlaComputation& computation,
+                     tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                     tensorflow::gtl::ArraySlice<int64> window_strides,
+                     Padding padding);
+
+  // As ReduceWindow(), but the padding is given in the format
+  // returned by MakePadding().
+  XlaOp ReduceWindowWithGeneralPadding(
+      const XlaOp& operand, const XlaOp& init_value,
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+  // Returns the sum of the operand value across all replicas. All replicas
+  // supply one input to the sum and all replicas receive the resulting sum.
+  XlaOp CrossReplicaSum(const XlaOp& operand);
+
+  // Enqueues an operation that scatters the `source` array to the selected
+  // indices of each window.
+  XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+                         tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                         tensorflow::gtl::ArraySlice<int64> window_strides,
+                         Padding padding, const XlaOp& source,
+                         const XlaOp& init_value,
+                         const XlaComputation& scatter);
+
+  // As SelectAndScatter(), but the padding is given in the format
+  // returned by MakePadding().
+  XlaOp SelectAndScatterWithGeneralPadding(
+      const XlaOp& operand, const XlaComputation& select,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const XlaOp& source, const XlaOp& init_value,
+      const XlaComputation& scatter);
+
+  // Enqueues an abs instruction onto the computation.
+  XlaOp Abs(const XlaOp& operand);
+
+  // Enqueues a atan2 instruction onto the computation.
+  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+              tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues an exp instruction onto the computation.
+  XlaOp Exp(const XlaOp& operand);
+
+  // Enqueues a floor instruction onto the computation.
+  XlaOp Floor(const XlaOp& operand);
+
+  // Enqueues a ceil instruction onto the computation.
+  XlaOp Ceil(const XlaOp& operand);
+
+  // Enqueues a round instruction onto the computation, rounding to nearest even
+  // with half-way cases rounding away from zero.
+  XlaOp Round(const XlaOp& operand);
+
+  // Enqueues an log instruction (natural logarithm) onto the computation.
+  XlaOp Log(const XlaOp& operand);
+
+  // Enqueues a sign instruction onto the computation.
+  XlaOp Sign(const XlaOp& operand);
+
+  // Enqueues a count leading zeros instruction onto the computation.
+  XlaOp Clz(const XlaOp& operand);
+
+  // Enqueues a cosine instruction onto the computation.
+  XlaOp Cos(const XlaOp& operand);
+
+  // Enqueues a sine instruction onto the computation.
+  XlaOp Sin(const XlaOp& operand);
+
+  // Enqueues a tanh instruction onto the computation.
+  XlaOp Tanh(const XlaOp& operand);
+
+  // Enqueues a real-part instruction onto the computation.
+  XlaOp Real(const XlaOp& operand);
+
+  // Enqueues an imaginary-part instruction onto the computation.
+  XlaOp Imag(const XlaOp& operand);
+
+  // Enqueues a float32 sqrt instruction onto the computation.
+  // (float32 is specified as there is an implicit float32 0.5f constant
+  // exponent).
+  XlaOp SqrtF32(const XlaOp& operand);
+
+  // Enqueues a float32 square instruction onto the computation.
+  // (float32 is specified as there is an implicit float32 2.0f constant
+  // exponent).
+  XlaOp SquareF32(const XlaOp& operand);
+
+  // Enqueues a lhs^rhs computation onto the computation.
+  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues an operator that tests if the operand's values are finite, i.e.,
+  // not Inf or NaN. Defined only for floating-point types. Returns an array of
+  // booleans with the same shape where entries are true iff the corresponding
+  // entry was NaN.
+  XlaOp IsFinite(const XlaOp& operand);
+
+  // Enqueues a convert instruction onto the computation that changes the
+  // element type of the operand array to primitive_type.
+  XlaOp ConvertElementType(const XlaOp& operand,
+                           PrimitiveType new_element_type);
+
+  // Enqueues a no-op instruction onto the computation that changes
+  // the element type of the operand array to primitive_type. The
+  // bit-widths of the source and destination element types must be
+  // identical.
+  XlaOp BitcastConvertType(const XlaOp& operand,
+                           PrimitiveType new_element_type);
+
+  // Enqueues a float32 reciprocal instruction onto the computation.
+  // (float32 is specified as there is an implicit float32 -1.0f constant
+  // exponent).
+  //
+  // TODO(b/34468990) axe F32 suffix, can be determined by reflecting on the
+  // shape of the operand.
+  XlaOp ReciprocalF32(const XlaOp& operand);
+
+  // Enqueues a negate instruction onto the computation.
+  XlaOp Neg(const XlaOp& operand);
+
+  // Enqueues a transpose instruction onto the computation.
+  XlaOp Transpose(const XlaOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> permutation);
+
+  // Enqueues a reverse instruction onto the computation. The order of the
+  // elements in the given dimensions is reversed (i.e., the element at index i
+  // is moved to index dimension_size - 1 - i).
+  XlaOp Rev(const XlaOp& operand,
+            tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Enqueues a sort (as increasing order) instruction onto the computation.
+  XlaOp Sort(const XlaOp& operand);
+
+  // Enqueues a clamp instruction onto the computation.
+  XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+
+  // Enqueues a map instruction onto the computation.
+  XlaOp Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
+            const XlaComputation& computation,
+            tensorflow::gtl::ArraySlice<int64> dimensions,
+            tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
+
+  // Enqueues a N(mu, sigma) random number generation instruction onto the
+  // computation.
+  XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+
+  // Enqueues a U(a, b) random number generation instruction onto the
+  // computation. Returns values in the semi-open interval [a, b).
+  XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+
+  // Enqueues a while node onto the computation.
+  XlaOp While(const XlaComputation& condition, const XlaComputation& body,
+              const XlaOp& init);
+
+  // Enqueues a conditional node onto the computation.
+  XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                    const XlaComputation& true_computation,
+                    const XlaOp& false_operand,
+                    const XlaComputation& false_computation);
+
+  // Enqueues a ReducePrecision node onto the computation.
+  XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                        const int mantissa_bits);
+
+  // Enqueues a Gather node onto the computation.
+  XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+               const GatherDimensionNumbers& dimension_numbers,
+               tensorflow::gtl::ArraySlice<int64> window_bounds);
+
+  // Enqueues a Send node onto the computation, to send the given operand to
+  // a Recv instruction that shares the same channel handle.
+  void Send(const XlaOp& operand, const ChannelHandle& handle);
+
+  // Enqueues a Recv node onto the computation. The data comes from a Send
+  // instruction that shares the same channel handle and its shape must
+  // be the same as the given shape.
+  XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
+
+  // Returns true if 'operand' is a compile-time constant. A compile-time
+  // constant does not depend on any parameters, or on stateful operators such
+  // as `RngNormal` or `Infeed`.
+  //
+  // This tests whether a computation is a compile-time constant without
+  // evaluating the computation.
+  StatusOr<bool> IsConstant(const XlaOp& operand) const;
+
+  // Normalizes operand across spatial and batch dimensions for each feature.
+  //
+  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
+  // is the normalized result and batch_mean and batch_var are the mean and
+  // variance, respectively, across batch for the operand.
+  XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                          const XlaOp& offset, float epsilon,
+                          int64 feature_index);
+
+  // Normalizes operand across spatial and batch dimensions for each feature.
+  //
+  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
+  // computing `mean` and `variance` for each batch inside the operation. It
+  // uses the input `mean` and `variance` instead as estimated values. The
+  // purpose of this op is to reduce latency in inference, hence the name
+  // `BatchNormInference`.
+  //
+  // The output has the same shape as `operand`, and contains the normalized
+  // values for each batch.
+  XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                           const XlaOp& offset, const XlaOp& mean,
+                           const XlaOp& variance, float epsilon,
+                           int64 feature_index);
+
+  // Calculates the gradients of a batch norm op.
+  //
+  // The inputs `batch_mean` and `batch_var` represent the mean and variance
+  // across the batch.
+  //
+  // Returns a tuple of three elements:
+  //   - grad_operand: Gradient with respect to input `operand`
+  //   - grad_offset: Gradient with respect to input `offset`
+  //   - grad_scale: Gradient with respect to input `scale`
+  XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                      const XlaOp& batch_mean, const XlaOp& batch_var,
+                      const XlaOp& grad_output, float epsilon,
+                      int64 feature_index);
+
+  // Returns a new XlaBuilder whose resultant Computation is used only by this
+  // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
+  // behavior as the parent.
+  std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
+
+  // Builds the computation with the requested operations, or returns a non-ok
+  // status. Note that all ops that have been enqueued will be moved to the
+  // computation being returned.
+  StatusOr<XlaComputation> Build();
+
+  // Builds the computation with the requested operations, or notes an error in
+  // the parent XlaBuilder and returns an empty computation if building failed.
+  // This function is intended to be used where the returned XlaComputation is
+  // only used by the parent XlaBuilder and hence further operation on the
+  // returned XlaComputation will simply be error'ed out if an error occurred
+  // while building this computation. If the built computation is to be used by
+  // a XlaBuilder other than the parent XlaBuilder then Build() should be used
+  // instead.
+  XlaComputation BuildAndNoteError();
+
+  // Returns a subgraph that roots on the given root. If the root is not a
+  // compile-time constant (see `IsConstant`), returns an error.
+  //
+  // This will copy the needed ops/computations to the subgraph.
+  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+
+  // Returns the first error that was encountered while building the
+  // computation. When an error is encountered, by default we return a vacuous
+  // XlaOp and inform the user of the error that occurred while
+  // building the computation when they make a final call to Build().
+  //
+  // See also set_die_immediately_on_error().
+  Status first_error() const { return first_error_; }
+
+  // Returns the shape of the given op.
+  StatusOr<Shape> GetShape(const XlaOp& op) const;
+
+  // Returns the (inferred) result for the current computation's shape.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
+ private:
+  StatusOr<XlaOp> AddInstruction(
+      HloInstructionProto&& instr, HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<XlaOp> operands = {});
+
+  void AddCalledComputation(const XlaComputation& computation,
+                            HloInstructionProto* instr);
+
+  // Notes that the error occurred by:
+  // * storing it internally and capturing a backtrace if it's the first error
+  //   (this deferred value will be produced on the call to Build())
+  // * dying if die_immediately_on_error_ is true
+  void NoteError(const Status& error);
+
+  XlaOp NoteErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
+
+  // Helper method that creates an empty op and notes error.
+  XlaOp UnimplementedOp();
+
+  StatusOr<const HloInstructionProto*> LookUpInstruction(const XlaOp& op) const;
+
+  // Internal helper method that does the building for an arbitrary unary op.
+  XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand);
+
+  // Internal helper method that does the building for an arbitrary binary op.
+  // broadcast_dimensions specifies which dimensions to use for broadcasting
+  // when the operation is between tensors of different ranks.
+  XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
+                 tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Internal helper method that does the building for an arbitrary ternary op.
+  XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
+                  const XlaOp& ehs);
+
+  XlaOp RngOp(RandomDistribution distribution,
+              tensorflow::gtl::ArraySlice<XlaOp> parameters,
+              const Shape& shape);
+
+  StatusOr<XlaOp> InDimBroadcast(
+      const Shape& shape, const XlaOp& operand,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Internal helper method that creates a sequence of instructions that
+  // performs an explicit broadcast of the operand to the target shape.
+  StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
+                                       const XlaOp& operand);
+
+  // Internal helper method for creating a Reshape op with the already inferred
+  // shape.
+  StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
+
+  // Returns the (inferred) result for the program shape for the current
+  // computation and fills the root_id in the pointer.
+  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+
+  // A visitor which checks whether an operation is a compile-time constant,
+  // meaning that it doesn't depend on any parameters, or on any stateful
+  // operation such as `RngNormal` or `Infeed`. The visitor walks the
+  // computation starting at a given operation and sets is_constant to false iff
+  // a parameter or stateful operation is encountered.
+  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+                         bool* is_constant) const;
+
+  // Checks bounds for convolution parameters.
+  Status VerifyConvolution(
+      const Shape& lhs_shape, const Shape& rhs_shape,
+      const ConvolutionDimensionNumbers& dimension_numbers) const;
+
+  // Helper function for creating a Window proto from user-supplied data.
+  // Returns error if the user-supplied data was invalid.
+  StatusOr<Window> MakeWindow(
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
+
+  string name_;  // Name to use for the built computation.
+
+  // The first error encountered while building the computation.
+  // This is OK until the first error is encountered.
+  Status first_error_;
+
+  // The saved stack trace from the point at which the first error occurred.
+  tensorflow::SavedStackTrace first_error_backtrace_;
+
+  // The instructions of this computation.
+  std::vector<HloInstructionProto> instructions_;
+
+  // The embedded computations used by this computation. Each computation was
+  // the entry computation of some XlaComputation, the key is the unique id of
+  // that XlaComputation.
+  std::map<int64, HloComputationProto> embedded_;
+
+  // The unique parameter numbers.
+  tensorflow::gtl::FlatSet<int64> parameter_numbers_;
+
+  // The metadata to attach to each op. This is structured as a "modal"-like
+  // operation, in order to simplify client code (and not sprinkle this metadata
+  // throughout the TensorFlow op kernel implementations).
+  OpMetadata metadata_;
+
+  // Sharding for this operator. This is structured as a "model"-like operation,
+  // in order to simplify client code, similar to metadata_.
+  tensorflow::gtl::optional<OpSharding> sharding_;
+
+  // Mode bit that indicates whether to die when a first error is encountered.
+  bool die_immediately_on_error_ = false;
+
+  XlaBuilder* parent_builder_{nullptr};
+};
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR0(NativeT value) {
+  return ConstantLiteral(*Literal::CreateR0<NativeT>(value));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values) {
+  return ConstantLiteral(*Literal::CreateR1<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
+  Literal literal(ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+  literal.PopulateWithValue(value);
+  return ConstantLiteral(literal);
+}
+
+inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
+  return ConstantLiteral(*Literal::CreateR1(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2(
+    std::initializer_list<std::initializer_list<NativeT>> values) {
+  return ConstantLiteral(*Literal::CreateR2<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
+                                              const Layout& layout) {
+  return ConstantLiteral(
+      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
+  return ConstantLiteral(*Literal::CreateFromArray<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
+    const Array2D<NativeT>& values, const Layout& layout) {
+  return ConstantLiteral(
+      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
+  return ConstantLiteral(*Literal::CreateR2FromArray2D<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
+    const Array3D<NativeT>& values, const Layout& layout) {
+  return ConstantLiteral(
+      *Literal::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
+  return ConstantFromArray(values);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
+    const Array4D<NativeT>& values, const Layout& layout) {
+  return ConstantFromArrayWithLayout(values, layout);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
+  return ConstantFromArray(values);
+}
+
+// RAII-style object: sets the current sharding assignment in builder on
+// construction, and sets back to the previous assignment on destruction.
+//
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+class XlaScopedShardingAssignment {
+ public:
+  XlaScopedShardingAssignment(xla::XlaBuilder* builder,
+                              tensorflow::gtl::optional<OpSharding> sharding)
+      : builder_(builder), prev_sharding_(builder->sharding()) {
+    SetSharding(sharding);
+  }
+
+  XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
+  XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
+      delete;
+
+  ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
+
+ private:
+  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+    if (sharding.has_value()) {
+      builder_->SetSharding(sharding.value());
+    } else {
+      builder_->ClearSharding();
+    }
+  }
+
+  xla::XlaBuilder* const builder_;
+  tensorflow::gtl::optional<OpSharding> prev_sharding_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce984564d016ce65fa6c932f3cda290cc0d75a4a
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
@@ -0,0 +1,237 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+
+#include <string>
+
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+using ::testing::HasSubstr;
+
+// TODO(b/74197823): Move the tests to service/.
+class XlaBuilderTest : public ::testing::Test {
+ protected:
+  StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b) {
+    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build());
+    const HloModuleProto& proto = computation.proto();
+    TF_ASSIGN_OR_RETURN(const auto& config,
+                        HloModule::CreateModuleConfigFromProto(
+                            proto, legacy_flags::GetDebugOptionsFromFlags()));
+    return HloModule::CreateFromProto(proto, config);
+  }
+
+  // Returns the name of the test currently being run.
+  string TestName() const {
+    return ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  }
+};
+
+TEST_F(XlaBuilderTest, OnePlusTwo) {
+  XlaBuilder b(TestName());
+  b.Add(b.ConstantR0<float>(1.0), b.ConstantR0<float>(2.0));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Constant(), op::Constant()));
+}
+
+TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {3, 5}), "x");
+  b.Add(x, b.ConstantR0<float>(1.0));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Parameter(), op::Broadcast(op::Constant())));
+}
+
+TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) {
+  XlaBuilder b(TestName());
+  const auto& x_shape = ShapeUtil::MakeShape(S32, {2, 4, 6});
+  const auto& y_shape = ShapeUtil::MakeShape(S32, {2, 4});
+  auto x = b.Parameter(0, x_shape, "x");
+  auto y = b.Parameter(1, y_shape, "y");
+  auto add = b.Add(x, y, /*broadcast_dimensions=*/{0, 1});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto add_shape, add.GetShape());
+  EXPECT_TRUE(ShapeUtil::Equal(add_shape, x_shape));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Parameter(0), op::Broadcast(op::Parameter(1))));
+}
+
+TEST_F(XlaBuilderTest, XPlusX) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(S32, {1, 3, 5, 7}), "x");
+  b.Add(x, x);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Parameter(0), op::Parameter(0)));
+}
+
+TEST_F(XlaBuilderTest, ShapeInferenceError) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(U32, {2, 4, 6}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(U32, {2, 4}), "y");
+  b.Add(x, y);
+  auto statusor = BuildHloModule(&b);
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("shape inference"));
+}
+
+TEST_F(XlaBuilderTest, ParameterAlreadyRegistered) {
+  XlaBuilder b_call("add");
+  b_call.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x");
+
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x");
+  auto y = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "y");
+  b.Add(x, y);
+  auto statusor = BuildHloModule(&b);
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("parameter 0 already registered"));
+}
+
+TEST_F(XlaBuilderTest, Call) {
+  XlaBuilder b_call("the_only_to_apply");
+  auto p0 = b_call.Parameter(0, ShapeUtil::MakeShape(F32, {}), "p0");
+  auto p1 = b_call.Parameter(1, ShapeUtil::MakeShape(F32, {}), "p1");
+  b_call.Add(p0, p1);
+  TF_ASSERT_OK_AND_ASSIGN(auto call, b_call.Build());
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
+  auto one = b.ConstantR0<float>(1);
+  auto two = b.ConstantR0<float>(2);
+  b.Add(b.Call(call, {x, y}), b.Call(call, {one, two}));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Call(op::Parameter(), op::Parameter()),
+                            op::Call(op::Constant(), op::Constant())));
+}
+
+TEST_F(XlaBuilderTest, BinopHasDegenerateBroadcast) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {1, 2, 3}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {1, 2, 1}), "y");
+  b.Add(x, y);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+
+  // Expected:
+  //
+  //  x: f32[1,2,3]  y: f32[1,2,1]
+  //      |               |
+  //      |          reshape: f32[1,2]
+  //      |               |
+  //      |          broadcast: f32[1,2,3]
+  //       \             /
+  //            add
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Parameter(0),
+                            op::Broadcast(op::Reshape(op::Parameter(1)))));
+}
+
+TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {2, 1, 4}), "y");
+  b.Add(x, y, /*broadcast_dimensions=*/{0, 1});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+
+  // The binary operation has in-dim broadcast and degenerate broadcast, should
+  // first do the in-dim broadcast then convert the degnerate broadcast into a
+  // reshape and a broadcast.
+  //
+  // Expected:
+  //
+  //  x: f32[2,3]            y: f32[2,1,4]
+  //      |                        |
+  //  broadcast: f32[2,3,4]  reshape: f32[2,4]
+  //      |                        |
+  //      |                  broadcast: f32[2,3,4]
+  //       \                      /
+  //                 add
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Broadcast(op::Parameter(0)),
+                            op::Broadcast(op::Reshape(op::Parameter(1)))));
+}
+
+TEST_F(XlaBuilderTest, OperandFromWrongBuilder) {
+  XlaBuilder b1("b1");
+  auto p0 = b1.Parameter(0, ShapeUtil::MakeShape(F32, {}), "p0");
+  XlaBuilder builder("main");
+  builder.Add(p0, p0);
+  auto statusor = builder.Build();
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Do not add XlaOp from builder b1 to builder main"));
+}
+
+TEST_F(XlaBuilderTest, ReshapeDefaultOrder) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x");
+  b.Reshape(x, /*new_sizes=*/{6, 35});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Reshape(op::Parameter()));
+}
+
+TEST_F(XlaBuilderTest, ReshapeHasTranspose) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x");
+  b.Reshape(x, /*dimensions=*/{3, 2, 1, 0}, /*new_sizes=*/{6, 35});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Reshape(op::Transpose(op::Parameter())));
+}
+
+TEST_F(XlaBuilderTest, Transpose) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
+  b.Transpose(x, /*permutation=*/{1, 0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Transpose(op::Parameter()));
+}
+
+// TODO(b/65209188): Create a dedicated lowering for Xor.
+TEST_F(XlaBuilderTest, Xor) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(PRED, {}), "y");
+  b.Xor(x, y);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  LOG(ERROR) << module->ToString();
+  EXPECT_THAT(root,
+              op::Or(op::And(op::Not(op::Parameter(0)), op::Parameter(1)),
+                     op::And(op::Parameter(0), op::Not(op::Parameter(1)))));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72e3935696e0c44ae3893fc8f1ceb261fa5e2646
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
+  TF_RET_CHECK(proto_.has_program_shape());
+  return proto_.program_shape();
+}
+
+StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
+  if (IsNull()) {
+    return InvalidArgument("Computation is invalid.");
+  }
+  auto session = MakeUnique<HloSnapshot>();
+  *session->mutable_hlo()->mutable_hlo_module() = proto_;
+  return std::move(session);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
new file mode 100644
index 0000000000000000000000000000000000000000..b70b57e9ffec40188f246f5e884146012c02f4a2
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// The computation graph that the user builds up with the XlaBuilder.
+//
+// TODO(b/74197823): Replace xla::Computation with this one.
+class XlaComputation {
+ public:
+  XlaComputation() : unique_id_(-1) {}
+  XlaComputation(const HloModuleProto& proto)
+      : unique_id_(proto.id()), proto_(proto) {}
+
+  ~XlaComputation() {}
+
+  XlaComputation(const XlaComputation&) = delete;
+  XlaComputation& operator=(const XlaComputation&) = delete;
+
+  XlaComputation(XlaComputation&& from) = default;
+
+  XlaComputation& operator=(XlaComputation&& from) = default;
+
+  // Returns the "program shape" (parameter and return shapes) for this
+  // computation.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
+  const HloModuleProto& proto() const { return proto_; }
+
+  // Requests that we snapshot the computation into a serializable protocol
+  // buffer form.
+  StatusOr<std::unique_ptr<HloSnapshot>> Snapshot() const;
+
+  // Returns true if this object is a null Computation.
+  bool IsNull() const { return unique_id_ == -1; }
+
+ private:
+  XlaComputation(const int64 unique_id) : unique_id_(unique_id) {}
+  HloModuleProto* mutable_proto() { return &proto_; }
+  friend class XlaBuilder;
+
+  int64 unique_id_;
+  HloModuleProto proto_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h
index 23a622b1ad0e2f3b220645f62767271f28df24e9..1a51fdee680721a4a03fa5de79a81746d92af76b 100644
--- a/tensorflow/compiler/xla/device_util.h
+++ b/tensorflow/compiler/xla/device_util.h
@@ -29,7 +29,7 @@ namespace xla {
 
 // Returns a string that represents the device in terms of platform and ordinal;
 // e.g. the first CUDA device will be "cuda:0"
-string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) {
+string DeviceIdentifier(se::StreamExecutor* stream_exec) {
   return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
                                      stream_exec->device_ordinal());
 }
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 392ad9010ab81923a089c7b00a79ddc281af92bb..a472747bd174e3bbd352f07f2ab092e678b81073 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -36,26 +36,15 @@ DeviceMemoryAllocator* ExecutableRunOptions::allocator() const {
 }
 
 ExecutableRunOptions& ExecutableRunOptions::set_stream(
-    perftools::gputools::Stream* stream) {
+    stream_executor::Stream* stream) {
   stream_ = stream;
   return *this;
 }
 
-perftools::gputools::Stream* ExecutableRunOptions::stream() const {
+stream_executor::Stream* ExecutableRunOptions::stream() const {
   return stream_;
 }
 
-ExecutableRunOptions& ExecutableRunOptions::set_inter_op_thread_pool(
-    tensorflow::thread::ThreadPool* inter_op_thread_pool) {
-  inter_op_thread_pool_ = inter_op_thread_pool;
-  return *this;
-}
-
-tensorflow::thread::ThreadPool* ExecutableRunOptions::inter_op_thread_pool()
-    const {
-  return inter_op_thread_pool_;
-}
-
 ExecutableRunOptions& ExecutableRunOptions::set_intra_op_thread_pool(
     const Eigen::ThreadPoolDevice* intra_op_thread_pool) {
   intra_op_thread_pool_ = intra_op_thread_pool;
@@ -87,4 +76,11 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
+ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) {
+  rng_seed_ = rng_seed;
+  return *this;
+}
+
+int ExecutableRunOptions::rng_seed() const { return rng_seed_; }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index d4fcbf0493c936ebcd0639a432e56b62ee15672c..416131be006e6ecddb47651f8b684c1d91df4892 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,26 +16,27 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-// Intentionally forward declared so that ExecutableRunOptions can be linked
+// Pulls in the ::stream_executor -> ::xla::se namespace alias.
+#include "tensorflow/compiler/xla/types.h"
+
+// These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
 // need to be linked).
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
 class Platform;
-}
-}
+}  // namespace stream_executor
 
 namespace tensorflow {
 namespace thread {
 class ThreadPool;
-}
-}
+}  // namespace thread
+}  // namespace tensorflow
 
 namespace Eigen {
 struct ThreadPoolDevice;
-}
+}  // namespace Eigen
 
 namespace xla {
 
@@ -61,14 +62,8 @@ class ExecutableRunOptions {
   // If set, this is the stream to run the computation on. The platform of the
   // stream must match the platform the executable was built for.  A value of
   // nullptr indicates the option has not been set.
-  ExecutableRunOptions& set_stream(perftools::gputools::Stream* stream);
-  perftools::gputools::Stream* stream() const;
-
-  // Sets the thread pool on which to run parallel CPU backend
-  // computations. Does not take ownership.
-  ExecutableRunOptions& set_inter_op_thread_pool(
-      tensorflow::thread::ThreadPool* inter_op_thread_pool);
-  tensorflow::thread::ThreadPool* inter_op_thread_pool() const;
+  ExecutableRunOptions& set_stream(stream_executor::Stream* stream);
+  stream_executor::Stream* stream() const;
 
   // Sets the thread pool device on which to run Eigen subcomputations.
   // Does not take ownership.
@@ -84,14 +79,17 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
+  ExecutableRunOptions& set_rng_seed(int rng_seed);
+  int rng_seed() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   DeviceAssignment* device_assignment_ = nullptr;
-  perftools::gputools::Stream* stream_ = nullptr;
-  tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
+  stream_executor::Stream* stream_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
+  int rng_seed_ = 0;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
index 0a9725db0a4fcf963cadcacf2cbc1d95d2c7239d..89353448e29ec3d97275dac288e23aa8e96e31b2 100644
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ b/tensorflow/compiler/xla/legacy_flags/BUILD
@@ -75,17 +75,3 @@ tf_cc_test(
             "//tensorflow/core:test",
         ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index c8ed3e3a2b009ddffdfb79a9a6ced8d5e736bee6..bc8405703b02dc1b0c4c87005ea3c15372552157 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,7 +40,10 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
-
+#ifdef INTEL_MKL
+  flags->set_xla_cpu_use_mkl_dnn(true);
+#endif  // INTEL_MKL
+  flags->set_xla_gpu_max_kernel_unroll_factor(4);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
   flags->set_xla_gpu_use_cudnn_batchnorm(false);
@@ -220,6 +223,11 @@ void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
           flag_values->xla_gpu_disable_multi_streaming(),
           "If true, multi-streaming in the GPU backend is disabled."),
+      tensorflow::Flag(
+          "xla_gpu_max_kernel_unroll_factor",
+          int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor),
+          flag_values->xla_gpu_max_kernel_unroll_factor(),
+          "Specify the maximum kernel unroll factor for the GPU backend."),
       tensorflow::Flag(
           "xla_dump_optimized_hlo_proto_to",
           flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
@@ -288,6 +296,10 @@ void AllocateFlags() {
           flag_values->xla_gpu_use_cudnn_batchnorm(),
           "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
           "rather than expanding them to a soup of HLOs."),
+      tensorflow::Flag("xla_cpu_use_mkl_dnn",
+                       bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
+                       flag_values->xla_cpu_use_mkl_dnn(),
+                       "Generate calls to MKL-DNN in the CPU backend."),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
index a3b4286f4c12bf39a44c63dd6e7d303a46a418c3..7b6ae311c1099dccb8dceb2f49743c1b185cd5ab 100644
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
+++ b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/command_line_flags.h"
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index e0a9b148b443e90a0c4f3e19660b6234d49eef84..bb6dd4f9098aefc1c2bbb1b1c41b3cee856b67de 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -44,8 +44,16 @@ namespace {
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
-// Converts between little and big endian, assuming elements in the array are 16
-// bits long.
+// Converts between little and big endian.
+//
+// Precondition: size % 2 == 0 (elements in the array are 16 bits long)
+void ConvertEndianShort(string* bytes) {
+  CHECK_EQ(bytes->size() / 2, 0);
+  for (int64 i = 0; i < bytes->size(); i += 2) {
+    std::swap((*bytes)[i], (*bytes)[i + 1]);
+  }
+}
+
 void ConvertEndianShort(char* bytes, int64 size) {
   CHECK_EQ(size / 2, 0);
   for (int64 i = 0; i < size; i += 2) {
@@ -97,11 +105,18 @@ Literal::Literal(const Shape& shape, bool allocate_arrays)
     const Shape& subshape = piece.subshape();
     if (ShapeUtil::IsArray(subshape)) {
       if (allocate_arrays) {
-        piece.set_buffer(new char[piece.size_bytes()]);
         if (LayoutUtil::IsSparseArray(subshape)) {
+          // For sparse arrays, the buffer must be of the size of the maximum
+          // number of sparse elements possible.
+          const int64 max_sparse_elements =
+              LayoutUtil::MaxSparseElements(subshape.layout());
+          piece.set_buffer(
+              new char[max_sparse_elements * ShapeUtil::ByteSizeOfPrimitiveType(
+                                                 subshape.element_type())]);
           piece.set_sparse_indices(new SparseIndexArray(
-              LayoutUtil::MaxSparseElements(subshape.layout()),
-              ShapeUtil::Rank(subshape)));
+              max_sparse_elements, ShapeUtil::Rank(subshape)));
+        } else {
+          piece.set_buffer(new char[piece.size_bytes()]);
         }
       } else {
         piece.set_buffer(nullptr);
@@ -223,7 +238,7 @@ Status Literal::CopySliceFromInternal(
     Literal::StrideConfig stride_config(src_literal.shape(), shape(),
                                         copy_size);
 
-    auto copy_proc = [&](const std::vector<int64>& indexes) {
+    auto copy_proc = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
       // Map from multi-dimensional index, to source index.
       std::transform(indexes.begin(), indexes.end(), src_base.begin(),
                      src_indexes.begin(), std::plus<int64>());
@@ -248,6 +263,28 @@ Status Literal::CopySliceFromInternal(
   return Status::OK();
 }
 
+Status Literal::CopyElementFrom(const Literal& src_literal,
+                                tensorflow::gtl::ArraySlice<int64> src_index,
+                                tensorflow::gtl::ArraySlice<int64> dest_index) {
+  DCHECK_EQ(shape().element_type(), src_literal.shape().element_type());
+  const int64 src_linear_index = IndexUtil::MultidimensionalIndexToLinearIndex(
+      src_literal.shape(), src_index);
+  const int64 dest_linear_index =
+      IndexUtil::MultidimensionalIndexToLinearIndex(shape(), dest_index);
+  const int64 primitive_size =
+      ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type());
+
+  char* dest_address =
+      static_cast<char*>(untyped_data()) + dest_linear_index * primitive_size;
+  const char* source_address =
+      static_cast<const char*>(src_literal.untyped_data()) +
+      src_linear_index * primitive_size;
+  if (dest_address != source_address) {
+    memcpy(dest_address, source_address, primitive_size);
+  }
+  return Status::OK();
+}
+
 std::vector<Literal> Literal::DecomposeTuple() {
   CHECK(ShapeUtil::IsTuple(shape()));
   std::vector<Literal> elements;
@@ -343,7 +380,7 @@ Status Literal::Piece::CopyFrom(const Literal::Piece& src) {
 #undef COPY_ELEMENTS
       default:
         return Unimplemented(
-            "Unhandled primitive type %s",
+            "Copying a Literal object with element type %s is not implemented.",
             PrimitiveType_Name(subshape().element_type()).c_str());
     }
   }
@@ -491,7 +528,10 @@ Status Literal::CopySliceFrom(const Literal& src_literal,
     default:
       break;
   }
-  return Unimplemented("Unhandled primitive type %d", shape().element_type());
+  return Unimplemented(
+      "Copying a slice from a Literal object with element type %d is not "
+      "implemented.",
+      shape().element_type());
 }
 
 /* static */ Literal Literal::Zero(PrimitiveType primitive_type) {
@@ -808,9 +848,10 @@ std::unique_ptr<Literal> Literal::Slice(
   DimensionVector result_dimensions;
   for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) {
     CHECK_GE(start_indices[dnum], 0);
-    CHECK_LE(limit_indices[dnum], shape().dimensions(dnum));
+    CHECK_LE(limit_indices[dnum], shape().dimensions(dnum))
+        << "dnum = " << dnum;
     int64 dimension = limit_indices[dnum] - start_indices[dnum];
-    CHECK_GT(dimension, 0);
+    CHECK_GE(dimension, 0) << "dnum = " << dnum;
     result_dimensions.push_back(dimension);
   }
   const auto result_shape =
@@ -903,7 +944,7 @@ string Literal::GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
     case U64:
       return StrCat(Get<uint64>(multi_index, shape_index));
     case F16:
-      return StrCat(Get<half>(multi_index, shape_index));
+      return StrCat(static_cast<float>(Get<half>(multi_index, shape_index)));
     case F32:
       return StrCat(Get<float>(multi_index, shape_index));
     case BF16:
@@ -953,7 +994,8 @@ string Literal::GetSparseElementAsString(int64 sparse_element_number,
       return StrCat(
           GetSparseElement<uint64>(sparse_element_number, shape_index));
     case F16:
-      return StrCat(GetSparseElement<half>(sparse_element_number, shape_index));
+      return StrCat(static_cast<float>(
+          GetSparseElement<half>(sparse_element_number, shape_index)));
     case F32:
       return StrCat(
           GetSparseElement<float>(sparse_element_number, shape_index));
@@ -997,6 +1039,36 @@ StatusOr<int64> Literal::GetIntegralAsS64(
   }
 }
 
+Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
+                                 int64 value) {
+  CHECK(LayoutUtil::IsDenseArray(shape()));
+  switch (shape().element_type()) {
+    case PRED:
+      Set<bool>(multi_index, value);
+      break;
+    case U8:
+      Set<uint8>(multi_index, value);
+      break;
+    case S32:
+      Set<int32>(multi_index, value);
+      break;
+    case S64:
+      Set<int64>(multi_index, value);
+      break;
+    case U32:
+      Set<uint32>(multi_index, value);
+      break;
+    case U64:
+      Set<uint64>(multi_index, value);
+      break;
+    default:
+      return FailedPrecondition(
+          "Array element type is not integral: %s",
+          PrimitiveType_Name(shape().element_type()).c_str());
+  }
+  return Status::OK();
+}
+
 tensorflow::gtl::ArraySlice<int64> Literal::GetSparseIndex(
     int64 sparse_element_number, const ShapeIndex& shape_index) const {
   const Piece& p = piece(shape_index);
@@ -1009,6 +1081,49 @@ void Literal::SortSparseElements(const ShapeIndex& shape_index) {
   piece(shape_index).SortSparseElements();
 }
 
+Literal Literal::GetFirstScalarLiteral() const {
+  CHECK(ShapeUtil::IsArray(shape_));
+  CHECK_GT(ShapeUtil::ElementsIn(shape_), 0);
+  switch (shape_.element_type()) {
+    case PRED:
+      return std::move(*Literal::CreateR0<bool>(GetFirstElement<bool>()));
+    // 8 bit types.
+    case S8:
+      return std::move(*Literal::CreateR0<int8>(GetFirstElement<int8>()));
+    case U8:
+      return std::move(*Literal::CreateR0<uint8>(GetFirstElement<uint8>()));
+    // 16 bit types.
+    case BF16:
+      return std::move(
+          *Literal::CreateR0<bfloat16>(GetFirstElement<bfloat16>()));
+    case F16:
+      return std::move(*Literal::CreateR0<half>(GetFirstElement<half>()));
+    case S16:
+      return std::move(*Literal::CreateR0<int16>(GetFirstElement<int16>()));
+    case U16:
+      return std::move(*Literal::CreateR0<uint16>(GetFirstElement<uint16>()));
+    // 32 bit types.
+    case F32:
+      return std::move(*Literal::CreateR0<float>(GetFirstElement<float>()));
+    case S32:
+      return std::move(*Literal::CreateR0<int32>(GetFirstElement<int32>()));
+    case U32:
+      return std::move(*Literal::CreateR0<uint32>(GetFirstElement<uint32>()));
+    // 64 bit types.
+    case C64:
+      return std::move(
+          *Literal::CreateR0<complex64>(GetFirstElement<complex64>()));
+    case F64:
+      return std::move(*Literal::CreateR0<double>(GetFirstElement<double>()));
+    case S64:
+      return std::move(*Literal::CreateR0<int64>(GetFirstElement<int64>()));
+    case U64:
+      return std::move(*Literal::CreateR0<uint64>(GetFirstElement<uint64>()));
+    default:
+      LOG(FATAL) << "Unhandled primitive type " << shape_.element_type();
+  }
+}
+
 void Literal::Piece::SortSparseElements() {
   switch (subshape().element_type()) {
     case PRED:
@@ -1285,8 +1400,9 @@ void Literal::EachCellAsString(
 }
 
 namespace {
-template <typename NativeSrcT, typename NativeDestT>
-std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+template <typename NativeSrcT, typename NativeDestT, typename ConverterType>
+std::unique_ptr<Literal> ConvertBetweenNativeTypesWithConverter(
+    const Literal& src_literal, const ConverterType& converter) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
   auto result_literal = MakeUnique<Literal>(ShapeUtil::ChangeElementType(
       src_literal.shape(),
@@ -1296,11 +1412,40 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
   int64 num_elements = src_literal.element_count();
 
   for (int64 i = 0; i < num_elements; ++i) {
-    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
+    dest_data[i] = converter(src_data[i]);
   }
   return result_literal;
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+  auto converter = [](NativeSrcT src) { return static_cast<NativeDestT>(src); };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  auto converter = [](NativeSrcT src) {
+    return tensorflow::bit_cast<NativeDestT>(src);
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+// This template specialization is here to make the compiler happy. bit_cast has
+// a static check that the types are the same size. This specialization should
+// never be used because the source and destination types are checked for
+// identical sizes higher up.
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  LOG(FATAL) << "Invalid bitcast between types of different sizes.";
+}
+
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
@@ -1320,21 +1465,33 @@ std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
 }
 
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
+std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal,
+                                             bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-  return ConvertBetweenNativeTypes<
-      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
-      typename primitive_util::PrimitiveTypeToNative<
-          primitive_dest_type>::type>(src_literal);
+  if (bitcast) {
+    return BitcastBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  } else {
+    return ConvertBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  }
 }
 
 template <PrimitiveType primitive_src_type>
 StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+    const Literal& src_literal, PrimitiveType primitive_dest_type,
+    bool bitcast) {
   switch (primitive_dest_type) {
-#define CONVERT_IF_TYPES_MATCH(type) \
-  case (type):                       \
-    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
+#define CONVERT_IF_TYPES_MATCH(type)                                    \
+  case (type):                                                          \
+    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal, \
+                                                           bitcast);
     CONVERT_IF_TYPES_MATCH(PRED)
     CONVERT_IF_TYPES_MATCH(S8)
     CONVERT_IF_TYPES_MATCH(S32)
@@ -1348,25 +1505,31 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
     CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
-      return ConvertToC64<primitive_src_type>(src_literal);
+      if (!bitcast) {
+        return ConvertToC64<primitive_src_type>(src_literal);
+      }
+      break;
     // Other types are not yet supported.
     default:
-      return InvalidArgument(
-          "Unimplemented: Convert from type %s to type %s",
-          PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
-          PrimitiveType_Name(primitive_dest_type).c_str());
+      break;
   }
+  return Unimplemented(
+      "Converting from type %s to type %s is not implemented.",
+      PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
+      PrimitiveType_Name(primitive_dest_type).c_str());
 }
 
-}  // namespace
-
-StatusOr<std::unique_ptr<Literal>> Literal::Convert(
-    PrimitiveType primitive_dest_type) const {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape()));
-  switch (shape().element_type()) {
-#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
-  case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(*this, primitive_dest_type);
+StatusOr<std::unique_ptr<Literal>> ConvertSwitch(
+    const Literal& literal, PrimitiveType primitive_dest_type, bool bitcast) {
+  TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
+  if (literal.shape().element_type() == primitive_dest_type) {
+    return literal.CloneToUnique();
+  }
+  switch (literal.shape().element_type()) {
+#define CONVERT_IF_DEST_TYPE_MATCHES(type)                                \
+  case (type):                                                            \
+    return ConvertIfDestTypeMatches<(type)>(literal, primitive_dest_type, \
+                                            bitcast);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
@@ -1381,10 +1544,60 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
 #undef CONVERT_IF_DEST_TYPE_MATCHES
       // Other types are not yet supported.
     default:
-      return InvalidArgument("Unimplemented: Convert from type %s to type %s",
-                             PrimitiveType_Name(shape().element_type()).c_str(),
-                             PrimitiveType_Name(primitive_dest_type).c_str());
+      return Unimplemented(
+          "%s from type %s to type %s is not implemented.",
+          (bitcast ? "Bitcast converting" : "Converting"),
+          PrimitiveType_Name(literal.shape().element_type()).c_str(),
+          PrimitiveType_Name(primitive_dest_type).c_str());
+  }
+}
+
+}  // namespace
+
+StatusOr<std::unique_ptr<Literal>> Literal::Convert(
+    PrimitiveType primitive_dest_type) const {
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false);
+}
+
+StatusOr<std::unique_ptr<Literal>> Literal::BitcastConvert(
+    PrimitiveType primitive_dest_type) const {
+  if (primitive_util::BitWidth(shape().element_type()) !=
+      primitive_util::BitWidth(primitive_dest_type)) {
+    return InvalidArgument(
+        "Cannot bitcast convert from %s to %s, bit widths are different: %d != "
+        "%d",
+        PrimitiveType_Name(shape().element_type()).c_str(),
+        PrimitiveType_Name(primitive_dest_type).c_str(),
+        primitive_util::BitWidth(shape().element_type()),
+        primitive_util::BitWidth(primitive_dest_type));
+  }
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true);
+}
+
+StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
+    const Shape& dest_shape, bool round_f32_to_bf16) const {
+  if (!ShapeUtil::IsTuple(dest_shape)) {
+    if (round_f32_to_bf16 && shape().element_type() == F32 &&
+        dest_shape.element_type() == BF16) {
+      auto converter = [](float src) {
+        return tensorflow::bfloat16::round_to_bfloat16(src);
+      };
+      return ConvertBetweenNativeTypesWithConverter<float, bfloat16>(*this,
+                                                                     converter);
+    }
+    return Convert(dest_shape.element_type());
+  }
+  std::vector<Literal> elements;
+  for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) {
+    auto element = LiteralView::Create(*this, {i});
+    TF_ASSIGN_OR_RETURN(
+        auto new_element,
+        element.ConvertToShape(ShapeUtil::GetSubshape(dest_shape, {i})));
+    elements.push_back(std::move(*new_element));
   }
+  auto converted = MakeUnique<Literal>();
+  *converted = Literal::MoveIntoTuple(&elements);
+  return std::move(converted);
 }
 
 template <typename NativeT>
@@ -1571,6 +1784,92 @@ bool Literal::IsAllComplex(complex64 value) const {
   }
 }
 
+bool Literal::IsAllFirst() const {
+  for (const auto& pair : pieces_) {
+    const Piece& piece = pair.second;
+    if (!ShapeUtil::IsArray(piece.subshape())) {
+      continue;
+    }
+
+    // Empty shapes are not all the first element since there is no first
+    // element.
+    if (ShapeUtil::HasZeroElements(piece.subshape())) {
+      return false;
+    }
+    auto piece_is_all = [&]() {
+      switch (piece.subshape().element_type()) {
+        case PRED: {
+          auto data = piece.data<bool>();
+          return AllElementsEqualValue<bool>(data, data[0]);
+        }
+        // 8 bit types
+        case S8: {
+          auto data = piece.data<int8>();
+          return AllElementsEqualValue<int8>(data, data[0]);
+        }
+        case U8: {
+          auto data = piece.data<uint8>();
+          return AllElementsEqualValue<uint8>(data, data[0]);
+        }
+        // 16 bit types
+        case BF16: {
+          auto data = piece.data<bfloat16>();
+          return AllElementsEqualValue<bfloat16>(data, data[0]);
+        }
+        case F16: {
+          auto data = piece.data<half>();
+          return AllElementsEqualValue<half>(data, data[0]);
+        }
+        case S16: {
+          auto data = piece.data<int16>();
+          return AllElementsEqualValue<int16>(data, data[0]);
+        }
+        case U16: {
+          auto data = piece.data<uint16>();
+          return AllElementsEqualValue<uint16>(data, data[0]);
+        }
+        // 32 bit types
+        case F32: {
+          auto data = piece.data<float>();
+          return AllElementsEqualValue<float>(data, data[0]);
+        }
+        case U32: {
+          auto data = piece.data<uint32>();
+          return AllElementsEqualValue<uint32>(data, data[0]);
+        }
+        case S32: {
+          auto data = piece.data<int32>();
+          return AllElementsEqualValue<int32>(data, data[0]);
+        }
+        // 64 bit types
+        case C64: {
+          auto data = piece.data<complex64>();
+          return AllElementsEqualValue<complex64>(data, data[0]);
+        }
+        case F64: {
+          auto data = piece.data<double>();
+          return AllElementsEqualValue<double>(data, data[0]);
+        }
+        case S64: {
+          auto data = piece.data<int64>();
+          return AllElementsEqualValue<int64>(data, data[0]);
+        }
+        case U64: {
+          auto data = piece.data<uint64>();
+          return AllElementsEqualValue<uint64>(data, data[0]);
+        }
+        default:
+          return false;
+      }
+    };
+
+    if (!piece_is_all()) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool Literal::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
   CHECK(ShapeUtil::IsArray(shape()));
   switch (shape().element_type()) {
@@ -1639,16 +1938,14 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const {
       *proto->mutable_f16s() = string(
           reinterpret_cast<const char*>(data<half>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto->mutable_f16s()->data()),
-                           proto->f16s().size());
+        ConvertEndianShort(proto->mutable_f16s());
       }
       break;
     case BF16:
       *proto->mutable_bf16s() = string(
           reinterpret_cast<const char*>(data<bfloat16>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto->mutable_bf16s()->data()),
-                           proto->bf16s().size());
+        ConvertEndianShort(proto->mutable_bf16s());
       }
       break;
     case F32:
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index d996004888ab521790b4c5a10da2a93f0d98d12f..8aa19222dc4b9175ec72128dfdad448f65c23e91 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -262,6 +262,11 @@ class Literal {
                        tensorflow::gtl::ArraySlice<int64> dest_base,
                        tensorflow::gtl::ArraySlice<int64> copy_size);
 
+  // Copies one element from src_literal[src_index] to (*this)[dest_index].
+  Status CopyElementFrom(const Literal& src_literal,
+                         tensorflow::gtl::ArraySlice<int64> src_index,
+                         tensorflow::gtl::ArraySlice<int64> dest_index);
+
   // Returns a vector containing the tuple elements of this Literal as separate
   // Literals. This Literal must be tuple-shaped and can be a nested tuple. The
   // elements are moved into the new Literals; no data is copied. Upon return
@@ -328,11 +333,30 @@ class Literal {
   template <typename NativeT>
   std::unique_ptr<Literal> Replicate(int64 times) const;
 
-  // Converts this literal to another primitive type. Returns an error if the
-  // conversion is not possible. This literal must be array-shaped.
+  // Converts this literal to another primitive type using
+  // static_cast<>. Returns an error if the conversion is not possible. This
+  // literal must be array-shaped.
   StatusOr<std::unique_ptr<Literal>> Convert(
       PrimitiveType primitive_dest_type) const;
 
+  // Converts this literal to another primitive type using a bitcast
+  // conversion. The to and from primitive types must have the same bit
+  // width. Returns an error if the conversion is not possible. This literal
+  // must be array-shaped.
+  StatusOr<std::unique_ptr<Literal>> BitcastConvert(
+      PrimitiveType primitive_dest_type) const;
+
+  // Converts this literal to the given shape. Returns an error is the
+  // conversion is not possible.
+  //
+  // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding
+  // instead of truncation; otherwise, truncation is used.
+  //
+  // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes
+  // the default behavior.
+  StatusOr<std::unique_ptr<Literal>> ConvertToShape(
+      const Shape& dest_shape, bool round_f32_to_bf16 = false) const;
+
   // Creates a scalar literal value zero of the given primitive type.
   static Literal Zero(PrimitiveType primitive_type);
 
@@ -451,6 +475,9 @@ class Literal {
   template <typename NativeT>
   NativeT GetFirstElement() const;
 
+  // Returns a literal scalar representing the first element.
+  Literal GetFirstScalarLiteral() const;
+
   // As Get(), but determines the correct type and converts the value
   // into text.
   string GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
@@ -466,6 +493,11 @@ class Literal {
   StatusOr<int64> GetIntegralAsS64(
       tensorflow::gtl::ArraySlice<int64> multi_index) const;
 
+  // As Set(), but truncates `value` to the literal element type before storing.
+  // This literal must be an array.
+  Status SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
+                          int64 value);
+
   // Returns an identity matrix (rank 2) with the given row and column count.
   template <typename NativeT>
   static std::unique_ptr<Literal> MakeIdentityR2(int64 size);
@@ -563,6 +595,12 @@ class Literal {
   template <typename NativeT, typename FnType>
   Status Populate(const FnType& generator);
 
+  // A parallel version of Populate(). This can be used if the generator is
+  // thread-safe and the values for the shape's different elements are
+  // independent.
+  template <typename NativeT, typename FnType>
+  Status PopulateParallel(const FnType& generator);
+
   // Fills this literal with the given value.
   template <typename NativeT>
   void PopulateWithValue(NativeT value);
@@ -602,6 +640,9 @@ class Literal {
   // This literal must have a dense layout.
   bool IsAllComplex(complex64 value) const;
 
+  // Literal consists entirely of the first element of the literal.
+  bool IsAllFirst() const;
+
   // Returns whether this literal is zero at the specified index. This literal
   // must be an array with a dense layout.
   bool IsZero(tensorflow::gtl::ArraySlice<int64> indices) const;
@@ -700,7 +741,13 @@ class Literal {
     int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
 
     // Returns the number of elements in this piece's array.
-    int64 element_count() const { return ShapeUtil::ElementsIn(subshape()); }
+    int64 element_count() const {
+      // If this is a sparse array, use the number of elements represented by
+      // the indices in the associated SparseIndexArray.
+      return LayoutUtil::IsSparseArray(subshape())
+                 ? sparse_indices()->index_count()
+                 : ShapeUtil::ElementsIn(subshape());
+    }
 
     // Copy the data from 'src' into this piece's buffer. Shapes of this piece
     // and src must be compatible.
@@ -758,6 +805,10 @@ class Literal {
   // buffer).
   void DeallocateBuffers();
 
+  // Implementation details shared between Populate() and PopulateParallel()
+  template <typename NativeT, typename FnType>
+  Status PopulateInternal(const FnType& generator, bool parallel);
+
   Shape shape_;
   ShapeTree<Piece> pieces_;
 
@@ -808,8 +859,7 @@ tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::ArraySlice<NativeT>(
-      reinterpret_cast<const NativeT*>(buffer()),
-      ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<const NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -822,7 +872,7 @@ tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::MutableArraySlice<NativeT>(
-      reinterpret_cast<NativeT*>(buffer()), ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -1237,19 +1287,20 @@ void Literal::PopulateSparse(SparseIndexArray indices,
   CHECK_LE(num_elements, max_elements);
   CHECK_EQ(num_elements, indices.index_count());
   auto root_data = root_piece().data<NativeT>();
-  root_data.remove_suffix(max_elements - values.size());
+  // Piece::data() returns an ArraySlice of size equal to the number of indices
+  // in the SparseIndexArray. So there is no need to adjust the size of the data
+  // here. It is enough to just copy the incoming values into the data buffer.
   std::copy(values.begin(), values.end(), root_data.begin());
   *this->root_piece().sparse_indices() = std::move(indices);
   if (sort) {
     auto root_data = this->root_piece().data<NativeT>();
-    root_data.remove_suffix(root_data.size() - num_elements);
     this->root_piece().sparse_indices()->SortWithValues(root_data);
   }
   DCHECK(this->root_piece().sparse_indices()->Validate(shape()));
 }
 
 template <typename NativeT, typename FnType>
-Status Literal::Populate(const FnType& generator) {
+Status Literal::PopulateInternal(const FnType& generator, bool parallel) {
   const Shape& this_shape = shape();
   const int64 rank = ShapeUtil::Rank(this_shape);
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
@@ -1259,11 +1310,11 @@ Status Literal::Populate(const FnType& generator) {
   if (rank > 0) {
     StrideConfig stride_config(this_shape, this_shape,
                                AsInt64Slice(this_shape.dimensions()));
-    DimensionVector minor_scan_indexes(rank, 0);
     int64 minor_dimension_size =
         ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension);
 
-    auto init_function = [&](const std::vector<int64>& indexes) {
+    auto init_function = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+      DimensionVector minor_scan_indexes(rank, 0);
       const int64 index =
           IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes);
       std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
@@ -1271,17 +1322,35 @@ Status Literal::Populate(const FnType& generator) {
         minor_scan_indexes[stride_config.minor_dimension] = i;
         literal_data.at(index + i) = generator(minor_scan_indexes);
       }
-      return true;
     };
-    ShapeUtil::ForEachIndex(this_shape, stride_config.base,
-                            stride_config.dimensions, stride_config.step,
-                            init_function);
+    if (parallel) {
+      ShapeUtil::ForEachIndexParallel(this_shape, stride_config.base,
+                                      stride_config.dimensions,
+                                      stride_config.step, init_function);
+    } else {
+      ShapeUtil::ForEachIndex(
+          this_shape, stride_config.base, stride_config.dimensions,
+          stride_config.step,
+          [&init_function](tensorflow::gtl::ArraySlice<int64> indexes) {
+            init_function(indexes);
+            return true;
+          });
+    }
   } else {
     // For scalars.
     literal_data.at(0) = generator({});
   }
   return Status::OK();
 }
+template <typename NativeT, typename FnType>
+Status Literal::Populate(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/false);
+}
+
+template <typename NativeT, typename FnType>
+Status Literal::PopulateParallel(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/true);
+}
 
 template <typename NativeT>
 void Literal::PopulateWithValue(NativeT value) {
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index b3583c2eb75de8297d5e7507430491f119bd4462..61046784e05623cd3117c24ecc6d6c474739bbd5 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -30,6 +31,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using tensorflow::gtl::ArraySlice;
 using ::testing::ElementsAre;
 using ::testing::HasSubstr;
 
@@ -214,11 +216,9 @@ TEST_F(LiteralUtilTest, CreateSparse) {
   std::vector<int64> expected_values = {8, 9, 7, 10};
 
   EXPECT_EQ(literal->sparse_indices()->data(),
-            tensorflow::gtl::ArraySlice<int64>(
-                expected_indices.data(), expected_indices.num_elements()));
-  EXPECT_EQ(tensorflow::gtl::ArraySlice<int64>(literal->data<int64>().data(),
-                                               expected_values.size()),
-            tensorflow::gtl::ArraySlice<int64>(expected_values));
+            ArraySlice<int64>(expected_indices.data(),
+                              expected_indices.num_elements()));
+  EXPECT_EQ(literal->data<int64>(), ArraySlice<int64>(expected_values));
 }
 
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
@@ -290,7 +290,7 @@ TEST_F(LiteralUtilTest, EachCellR2F32) {
   // clang-format on
   std::vector<std::tuple<int64, int64, string>> seen;
   literal->EachCellAsString(
-      [&seen](tensorflow::gtl::ArraySlice<int64> indices, const string& value) {
+      [&seen](ArraySlice<int64> indices, const string& value) {
         seen.emplace_back(indices[0], indices[1], value);
       });
 
@@ -501,6 +501,24 @@ TEST_F(LiteralUtilTest, IsAllComplex) {
                    ->IsAllComplex({8.0f, 9.0f}));
 }
 
+TEST_F(LiteralUtilTest, IsAllFirst) {
+  // IsAllComplex always returns false when the literal is not complex.
+  EXPECT_FALSE(Literal::CreateR1<bool>({false, true})->IsAllFirst());
+  EXPECT_TRUE(Literal::CreateR1<bool>({false, false})->IsAllFirst());
+  EXPECT_FALSE(Literal::CreateR1<int8>({1, 1, 2})->IsAllFirst());
+  EXPECT_TRUE(Literal::CreateR1<int8>({5, 5, 5, 5})->IsAllFirst());
+  EXPECT_FALSE(Literal::CreateR1<uint8>({1, 1, 2})->IsAllFirst());
+  EXPECT_TRUE(Literal::CreateR1<int32>({5, 5, 5, 5})->IsAllFirst());
+  EXPECT_FALSE(Literal::CreateR1<int32>({1, 1, 2})->IsAllFirst());
+  EXPECT_TRUE(Literal::CreateR1<uint32>({5, 5, 5, 5})->IsAllFirst());
+  EXPECT_FALSE(Literal::CreateR1<uint32>({1, 1, 2})->IsAllFirst());
+
+  complex64 c8_9 = {8, 9};
+  complex64 c7_9 = {7, 9};
+  EXPECT_TRUE(Literal::CreateR2<complex64>({{c8_9}, {c8_9}})->IsAllFirst());
+  EXPECT_FALSE(Literal::CreateR2<complex64>({{c7_9}, {c8_9}})->IsAllFirst());
+}
+
 TEST_F(LiteralUtilTest, IsZero) {
   auto scalar_zero = Literal::CreateR0<float>(0.0f);
   auto scalar_one = Literal::CreateR0<float>(1.0f);
@@ -604,11 +622,10 @@ TEST_F(LiteralUtilTest, TransposeR4) {
   // clang-format on
   auto reshape = original->Transpose(/*permutation=*/{2, 3, 0, 1});
 
-  reshape->EachCell<float>(
-      [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
-        EXPECT_EQ(value, original->Get<float>(
-                             {indices[2], indices[3], indices[0], indices[1]}));
-      });
+  reshape->EachCell<float>([&](ArraySlice<int64> indices, float value) {
+    EXPECT_EQ(value, original->Get<float>(
+                         {indices[2], indices[3], indices[0], indices[1]}));
+  });
 }
 
 TEST_F(LiteralUtilTest, TestR4RelayoutEquivalence) {
@@ -845,7 +862,7 @@ TEST_F(LiteralUtilTest, CopySliceFrom) {
     const int64 zero_base[] = {0, 0, 0, 0};
     const int64 step[] = {1, 1, 1, 1};
     uint32 seqnr = 0;
-    auto init_proc = [&](const std::vector<int64>& indexes) {
+    auto init_proc = [&](ArraySlice<int64> indexes) {
       source->Set(indexes, ++seqnr);
       return true;
     };
@@ -861,7 +878,7 @@ TEST_F(LiteralUtilTest, CopySliceFrom) {
     std::vector<int64> source_indexes(TF_ARRAYSIZE(dimensions), 0);
     std::vector<int64> blank_indexes(TF_ARRAYSIZE(dimensions), 0);
     bool matched = true;
-    auto check_proc = [&](const std::vector<int64>& indexes) {
+    auto check_proc = [&](ArraySlice<int64> indexes) {
       std::copy(indexes.begin(), indexes.end(), source_indexes.begin());
       std::transform(source_indexes.begin(), source_indexes.end(), src_base,
                      source_indexes.begin(), std::plus<int64>());
@@ -1049,7 +1066,7 @@ TEST_F(LiteralUtilTest, Populate) {
         primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
         data.layout);
     auto literal = Literal::CreateFromShape(shape);
-    auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
+    auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
       return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(),
@@ -1061,7 +1078,49 @@ TEST_F(LiteralUtilTest, Populate) {
     std::vector<int64> zero_base(data.dimensions.size(), 0);
     std::vector<int64> step(data.dimensions.size(), 1);
     bool matched = true;
-    auto check_function = [&](const std::vector<int64>& indexes) {
+    auto check_function = [&](ArraySlice<int64> indexes) {
+      auto value = literal->Get<uint32>(indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
+TEST_F(LiteralUtilTest, PopulateParallel) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{0}, {0}},
+      {{16}, {0}},
+      {{2, 0}, {1, 0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = Literal::CreateFromShape(shape);
+    auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(),
+                                                           indexes) +
+             17;
+    };
+    TF_EXPECT_OK(literal->PopulateParallel<uint32>(generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](ArraySlice<int64> indexes) {
       auto value = literal->Get<uint32>(indexes);
       matched = matched && (value == generator(indexes));
       return matched;
@@ -1214,15 +1273,34 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   EXPECT_EQ(*conv, *c64);
 
   EXPECT_EQ(s32->Convert(TUPLE).status().code(),
-            tensorflow::error::INVALID_ARGUMENT);
+            tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(s32->Convert(S16).status().code(),
-            tensorflow::error::INVALID_ARGUMENT);
+            tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(s32->Convert(U16).status().code(),
-            tensorflow::error::INVALID_ARGUMENT);
+            tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(c64->Convert(F32).status().code(),
-            tensorflow::error::INVALID_ARGUMENT);
+            tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(c64->Convert(S32).status().code(),
-            tensorflow::error::INVALID_ARGUMENT);
+            tensorflow::error::UNIMPLEMENTED);
+}
+
+TEST_F(LiteralUtilTest, BitcastConvert) {
+  auto original =
+      Literal::CreateR1<uint32>({tensorflow::bit_cast<uint32>(2.5f),
+                                 tensorflow::bit_cast<uint32>(-42.25f),
+                                 tensorflow::bit_cast<uint32>(100.f), 0xbeef});
+  auto expected = Literal::CreateR1<float>(
+      {2.5f, -42.25f, 100.0f, tensorflow::bit_cast<float>(0xbeef)});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> converted,
+                          original->BitcastConvert(F32));
+}
+
+TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
+  auto literal = Literal::CreateR0<uint32>(1234);
+  Status status = literal->BitcastConvert(F64).status();
+  EXPECT_NE(Status::OK(), status);
+  EXPECT_TRUE(tensorflow::str_util::StrContains(status.error_message(),
+                                                "bit widths are different"));
 }
 
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
@@ -1684,7 +1762,7 @@ TEST_F(LiteralUtilTest, GetSparseElementAsString) {
   ASSERT_EQ(Literal::CreateSparse<half>(dimensions, indices,
                                         {half{1.0}, half{2.0}, half{3.0}})
                 ->GetSparseElementAsString(1),
-            tensorflow::strings::StrCat(half{2.0}));
+            tensorflow::strings::StrCat(static_cast<float>(half{2.0})));
   ASSERT_EQ(
       Literal::CreateSparse<complex64>(
           dimensions, indices,
diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h
index c58c19db2cacbe9b038160f27b9bd76aa58146eb..bfcdfc62f9541ab09b94a48d5121e16bad4d43cd 100644
--- a/tensorflow/compiler/xla/ptr_util.h
+++ b/tensorflow/compiler/xla/ptr_util.h
@@ -28,26 +28,8 @@ limitations under the License.
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace xla {
-
-template <typename T>
-std::unique_ptr<T> WrapUnique(T* ptr) {
-  return tensorflow::WrapUnique<T>(ptr);
-}
-
-template <typename T, typename... Args>
-typename tensorflow::helper::MakeUniqueResult<T>::scalar MakeUnique(
-    Args&&... args) {
-  return tensorflow::MakeUnique<T, Args...>(std::forward<Args>(args)...);
-}
-
-// Overload for array of unknown bound.
-// The allocation of arrays needs to use the array form of new,
-// and cannot take element constructor arguments.
-template <typename T>
-typename tensorflow::helper::MakeUniqueResult<T>::array MakeUnique(size_t n) {
-  return tensorflow::MakeUnique<T>(n);
-}
-
+using tensorflow::MakeUnique;
+using tensorflow::WrapUnique;
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index e2972f06016ab3555c4fc0cc4616993fe6764b1e..ecb87bd8893276fbb9ecffaa0f8a3233d2e0043f 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -20,6 +20,7 @@ py_test(
     srcs = ["xla_client_test.py"],
     main = "xla_client_test.py",
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     deps = [
         ":xla_client",
         "//tensorflow/python:platform_test",
@@ -72,15 +73,3 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla/service:cpu_plugin",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index cb7bb21e092c80d7360c23f3d6b00409a75dce23..7102f467373edc0e12eeb66bce865ecca82bf484 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -89,17 +89,16 @@ StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocalReplica(
   return client->TransferFromOutfeedLocal(shape, device_ordinal);
 }
 
-LocalShapedBuffer::LocalShapedBuffer(
-    std::unique_ptr<ScopedShapedBuffer> shaped_buffer)
+LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
     : shaped_buffer_(std::move(shaped_buffer)) {}
 
-const std::unique_ptr<ScopedShapedBuffer>& LocalShapedBuffer::shaped_buffer()
-    const {
-  return shaped_buffer_;
+const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
+  return &shaped_buffer_;
 }
 
-static StatusOr<std::unique_ptr<ScopedShapedBuffer>> ToBuffer(
-    LocalClient* client, int device_ordinal, const Literal& arg) {
+static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
+                                             int device_ordinal,
+                                             const Literal& arg) {
   return client->LiteralToShapedBuffer(arg, device_ordinal,
                                        client->backend().memory_allocator());
 }
@@ -109,14 +108,15 @@ LocalShapedBuffer* LocalShapedBuffer::FromLiteral(
     const Literal& argument,
     const tensorflow::gtl::optional<Shape>& shape_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
-  std::unique_ptr<ScopedShapedBuffer> buf;
-  if (shape_with_layout) {
-    std::unique_ptr<Literal> relaid =
-        argument.Relayout(shape_with_layout.value());
-    buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie();
-  } else {
-    buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
-  }
+  ScopedShapedBuffer buf = [&] {
+    if (shape_with_layout) {
+      std::unique_ptr<Literal> relaid =
+          argument.Relayout(shape_with_layout.value());
+      return ToBuffer(client, /*device_ordinal=*/0, *relaid)
+          .ConsumeValueOrDie();
+    }
+    return ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
+  }();
   return new LocalShapedBuffer(std::move(buf));
 }
 
@@ -158,14 +158,14 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
                 << device_ordinal;
 
         // Transfer arguments in
-        std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
+        std::vector<ScopedShapedBuffer> scoped_buffers;
         scoped_buffers.reserve(arguments.size());
         for (int i = 0; i < arguments.size(); ++i) {
           const Literal& argument = arguments[i];
           const tensorflow::gtl::optional<Shape>& shape_with_layout =
               shapes_with_layout[i];
 
-          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed;
+          StatusOr<ScopedShapedBuffer> pushed;
           if (shape_with_layout) {
             std::unique_ptr<Literal> relaid =
                 argument.Relayout(shape_with_layout.value());
@@ -185,7 +185,7 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         std::vector<const ShapedBuffer*> argument_buffers;
         argument_buffers.reserve(scoped_buffers.size());
         for (auto& buffer : scoped_buffers) {
-          argument_buffers.push_back(buffer.get());
+          argument_buffers.push_back(&buffer);
         }
 
         DeviceAssignment device_assignment =
@@ -197,12 +197,10 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         ExecutableRunOptions options;
         options.set_device_ordinal(device_ordinal);
         options.set_allocator(client->backend().memory_allocator());
-        options.set_inter_op_thread_pool(
-            client->backend().inter_op_thread_pool());
         options.set_intra_op_thread_pool(
             client->backend().eigen_intra_op_thread_pool_device());
         options.set_device_assignment(&device_assignment);
-        StatusOr<std::unique_ptr<ScopedShapedBuffer>> result_buffer_status =
+        StatusOr<ScopedShapedBuffer> result_buffer_status =
             executable_->Run(argument_buffers, options);
         if (!result_buffer_status.ok()) {
           results[replica] = result_buffer_status.status();
@@ -210,8 +208,8 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         }
 
         // Transfer result out
-        results[replica] =
-            client->ShapedBufferToLiteral(*result_buffer_status.ValueOrDie());
+        results[replica] = client->ShapedBufferToLiteral(
+            std::move(result_buffer_status).ValueOrDie());
       });
     }
   }
@@ -236,16 +234,15 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
   std::vector<const ShapedBuffer*> argument_buffers;
   argument_buffers.reserve(argument_handles.size());
   for (auto& handle : argument_handles) {
-    argument_buffers.push_back(handle->shaped_buffer().get());
+    argument_buffers.push_back(handle->shaped_buffer());
   }
 
   // Execute
   ExecutableRunOptions options;
   options.set_allocator(client->backend().memory_allocator());
-  options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool());
   options.set_intra_op_thread_pool(
       client->backend().eigen_intra_op_thread_pool_device());
-  std::unique_ptr<ScopedShapedBuffer> result_buffer =
+  ScopedShapedBuffer result_buffer =
       executable_->Run(argument_buffers, options).ConsumeValueOrDie();
 
   return new LocalShapedBuffer(std::move(result_buffer));
@@ -368,6 +365,12 @@ ComputationDataHandle LocalComputationBuilder::Slice(
   return builder_.Slice(operand, start_indices, limit_indices, strides);
 }
 
+ComputationDataHandle LocalComputationBuilder::SliceInDim(
+    const ComputationDataHandle& operand, int64 start_index, int64 limit_index,
+    int64 stride, int64 dimno) {
+  return builder_.SliceInDim(operand, start_index, limit_index, stride, dimno);
+}
+
 ComputationDataHandle LocalComputationBuilder::DynamicSlice(
     const ComputationDataHandle& operand,
     const ComputationDataHandle& start_indices,
@@ -515,6 +518,17 @@ ComputationDataHandle LocalComputationBuilder::Conditional(
                               false_computation.computation());
 }
 
+StatusOr<bool> LocalComputationBuilder::IsConstant(
+    const ComputationDataHandle& operand, int64 num_parameters) {
+  return builder_.IsConstant(operand, num_parameters);
+}
+
+StatusOr<std::unique_ptr<Literal>> LocalComputationBuilder::ComputeConstant(
+    const ComputationDataHandle& operand, const Layout* output_layout,
+    tensorflow::gtl::ArraySlice<Literal> parameters) {
+  return builder_.ComputeConstant(operand, output_layout, parameters);
+}
+
 #define _FORWARD(method_name, return_sig, args_sig, args)    \
   return_sig LocalComputationBuilder::method_name args_sig { \
     return builder_.method_name args;                        \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index d3e9503ea10b011520ec5148a756ef4d421f244c..e1048909ab29c2147a37ed72844391400d99e90d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -62,12 +62,12 @@ class LocalShapedBuffer {
   static LocalShapedBuffer* FromLiteral(
       const Literal& argument,
       const tensorflow::gtl::optional<Shape>& shape_with_layout);
-  LocalShapedBuffer(std::unique_ptr<ScopedShapedBuffer> shaped_buffer);
-  const std::unique_ptr<ScopedShapedBuffer>& shaped_buffer() const;
+  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
+  const ScopedShapedBuffer* shaped_buffer() const;
   std::unique_ptr<Literal> ToLiteral() const;
 
  private:
-  std::unique_ptr<ScopedShapedBuffer> shaped_buffer_;
+  ScopedShapedBuffer shaped_buffer_;
 };
 
 // Wraps a LocalExecutable produced by compiling a
@@ -170,6 +170,10 @@ class LocalComputationBuilder {
                               tensorflow::gtl::ArraySlice<int64> limit_indices,
                               tensorflow::gtl::ArraySlice<int64> strides);
 
+  ComputationDataHandle SliceInDim(const ComputationDataHandle& operand,
+                                   int64 start_index, int64 limit_index,
+                                   int64 stride, int64 dimno);
+
   ComputationDataHandle DynamicSlice(
       const ComputationDataHandle& operand,
       const ComputationDataHandle& start_indices,
@@ -264,6 +268,13 @@ class LocalComputationBuilder {
                                     const ComputationDataHandle& false_operand,
                                     const LocalComputation& false_computation);
 
+  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
+                            int64 num_parameters);
+
+  StatusOr<std::unique_ptr<Literal> > ComputeConstant(
+      const ComputationDataHandle& operand, const Layout* output_layout,
+      tensorflow::gtl::ArraySlice<Literal> parameters);
+
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 456e341f877e529f7fc5ebc81d85862bfa291943..ac792e8189bda9eda472e7d282db86ac988c57b9 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -141,6 +141,33 @@ bool GetIntAttr(PyObject* o, const char* field, int64* result) {
   return true;
 }
 
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o,
+                           const char* attr_name,
+                           std::function<void(string s)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+  if (!PyString_Check(attr)) {
+    string message = tensorflow::strings::Printf("%s must be a string or none; got %s",
+        attr_name, numpy::PyObjectCppRepr(attr).c_str());
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyString_AsString(attr));
+  Py_DECREF(attr);
+  return true;  // Handled string attribute, ok!
+}
+
 }
 }
 %}
@@ -155,7 +182,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
   const int64 handle = numpy::PyIntOrPyLongToLong($input);
   if (handle == -1 && PyErr_Occurred()) {
-    return NULL;
+    SWIG_fail;
   }
   temp.set_handle(handle);
   $1 = &temp;
@@ -174,7 +201,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -184,7 +211,7 @@ tensorflow::ImportNumpy();
     $result = numpy::PyObjectFromXlaLiteral(*value);
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -197,7 +224,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -206,7 +233,16 @@ tensorflow::ImportNumpy();
     $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
   }
 }
 
@@ -214,8 +250,9 @@ tensorflow::ImportNumpy();
   if (!$1.ok()) {
     PyErr_SetString(
         PyExc_RuntimeError, $1.ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
+  Py_INCREF(Py_None);
   $result = Py_None;
 }
 
@@ -225,7 +262,7 @@ tensorflow::ImportNumpy();
     (std::vector<int64> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -237,13 +274,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i] = numpy::PyIntOrPyLongToLong(py_int);
     if (temps[i] == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(py_int);
     Py_DECREF(o);
@@ -257,7 +294,7 @@ tensorflow::ImportNumpy();
     (std::vector<ComputationDataHandle> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -268,13 +305,13 @@ tensorflow::ImportNumpy();
       PyErr_SetString(
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
-      return NULL;
+      SWIG_fail;
     }
     const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
     if (handle == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i].set_handle(handle);
     Py_DECREF(py_int);
@@ -289,7 +326,7 @@ tensorflow::ImportNumpy();
     (std::vector<LocalShapedBuffer*> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
@@ -298,7 +335,7 @@ tensorflow::ImportNumpy();
     LocalShapedBuffer* lsbp;
     if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
                          SWIG_POINTER_EXCEPTION)) == -1) {
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(lsbp);
     Py_DECREF(o);
@@ -312,7 +349,7 @@ tensorflow::ImportNumpy();
   literal_status = numpy::XlaLiteralFromPyObject($input);
   if (!literal_status.ok()) {
     PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $1 = literal_status.ValueOrDie().get();
 }
@@ -324,7 +361,7 @@ tensorflow::ImportNumpy();
 %typemap(out) StatusOr< std::unique_ptr<Literal> > {
   if (!$1.ok()) {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $result = numpy::PyObjectFromXlaLiteral(*$1.ValueOrDie());
 }
@@ -332,7 +369,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -341,7 +378,7 @@ tensorflow::ImportNumpy();
     if (!literal_status.ok()) {
       PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::move(*literal_status.ConsumeValueOrDie()));
     Py_DECREF(o);
@@ -355,7 +392,7 @@ tensorflow::ImportNumpy();
   StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -367,7 +404,7 @@ tensorflow::ImportNumpy();
   StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -382,7 +419,7 @@ tensorflow::ImportNumpy();
     StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temp = std::move(statusor).ValueOrDie();
     $1 = &temp;
@@ -396,7 +433,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -405,7 +442,7 @@ tensorflow::ImportNumpy();
     Py_DECREF(o);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(statusor.ConsumeValueOrDie());
   }
@@ -416,7 +453,7 @@ tensorflow::ImportNumpy();
     std::vector<tensorflow::gtl::optional<Shape> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -428,7 +465,7 @@ tensorflow::ImportNumpy();
       Py_DECREF(o);
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        return NULL;
+        SWIG_fail;
       }
       temps.push_back(statusor.ConsumeValueOrDie());
     }
@@ -442,18 +479,18 @@ tensorflow::ImportNumpy();
   PyObject* py_int = numpy::PyNumberToPyInt($input);
   if (!py_int) {
     PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    return NULL;
+    SWIG_fail;
   }
   const long value = numpy::PyIntOrPyLongToLong(py_int);
   if (value == -1 && PyErr_Occurred()) {
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   if (!PrimitiveType_IsValid(value)) {
     PyErr_SetString(
         PyExc_TypeError, "Argument not valid for PrimitiveType enum");
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   $1 = static_cast<PrimitiveType>(value);
 }
@@ -464,19 +501,19 @@ tensorflow::ImportNumpy();
     (std::vector<std::pair<int64, int64> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
     if (!o) {
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first = PyTuple_GetItem(o, 0);
     if (!first) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first_pyint = numpy::PyNumberToPyInt(first);
     if (!first_pyint) {
@@ -484,13 +521,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "First pair item cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second = PyTuple_GetItem(o, 1);
     if (!second) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second_pyint = numpy::PyNumberToPyInt(second);
     if (!second_pyint) {
@@ -499,21 +536,21 @@ tensorflow::ImportNumpy();
           "Second pair item cannot be converted to int");
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
     if (first_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
     if (second_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::make_pair(first_value, second_value));
     Py_DECREF(o);
@@ -531,26 +568,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "lhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -561,26 +598,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "rhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -591,26 +628,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
       $input, "lhs_batch_dimensions");
   if (!lhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -621,26 +658,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
       $input, "rhs_batch_dimensions");
   if (!rhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -656,20 +693,20 @@ tensorflow::ImportNumpy();
     (PaddingConfig padding_config) {
   PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
   if (!dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   int length = PySequence_Size(dimensions);
   if (length == -1) {
     Py_DECREF(dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(dimensions, i);
     if (!item) {
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     int64 edge_padding_low, edge_padding_high, interior_padding;
     if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
@@ -677,7 +714,7 @@ tensorflow::ImportNumpy();
         || !GetIntAttr(item, "interior_padding", &interior_padding)) {
       Py_DECREF(item);
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(item);
 
@@ -699,32 +736,32 @@ tensorflow::ImportNumpy();
   int64 value;
 
   if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_batch_dimension(value);
 
   if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_feature_dimension(value);
 
   if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_batch_dimension(value);
 
   if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_input_feature_dimension(value);
 
@@ -733,24 +770,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "input_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_input_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -759,24 +796,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_kernel_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -785,24 +822,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "output_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_output_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -819,16 +856,32 @@ tensorflow::ImportNumpy();
   if ($input == Py_None) {
     $1 = NULL;
   } else {
-    PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph");
-    if (!o) {
-      return NULL;
+    if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) {
+      build_options.set_generate_hlo_graph(std::move(s));
+    })) {
+      return nullptr;
+    }
+    if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) {
+      build_options.set_dump_optimized_hlo_proto_to(std::move(s));
+    })) {
+      return nullptr;
+    }
+    if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
+      build_options.set_dump_per_pass_hlo_proto_to(std::move(s));
+    })) {
+      return nullptr;
+    }
+
+    PyObject* o = PyObject_GetAttrString($input, "hlo_profile");
+    if (o == NULL) {
+      SWIG_fail;
     }
     if (o != Py_None) {
-      if (!PyString_Check(o)) {
-        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None.");
-        return NULL;
+      if (!PyBool_Check(o)) {
+        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
+        SWIG_fail;
       }
-      build_options.set_generate_hlo_graph(PyString_AsString(o));
+      build_options.set_hlo_profile(o == Py_True);
     }
     Py_DECREF(o);
 
@@ -841,7 +894,7 @@ tensorflow::ImportNumpy();
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str());
         Py_DECREF(o);
-        return NULL;
+        SWIG_fail;
       }
       build_options.set_result_layout(statusor.ValueOrDie());
     }
@@ -886,6 +939,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Collapse;
 %unignore xla::swig::LocalComputationBuilder::CrossReplicaSum;
 %unignore xla::swig::LocalComputationBuilder::Slice;
+%unignore xla::swig::LocalComputationBuilder::SliceInDim;
 %unignore xla::swig::LocalComputationBuilder::DynamicSlice;
 %unignore xla::swig::LocalComputationBuilder::DynamicUpdateSlice;
 %unignore xla::swig::LocalComputationBuilder::ConcatInDim;
@@ -906,6 +960,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::RngBernoulli;
 %unignore xla::swig::LocalComputationBuilder::While;
 %unignore xla::swig::LocalComputationBuilder::Conditional;
+%unignore xla::swig::LocalComputationBuilder::IsConstant;
 %unignore xla::swig::LocalComputationBuilder::Eq;
 %unignore xla::swig::LocalComputationBuilder::Ne;
 %unignore xla::swig::LocalComputationBuilder::Ge;
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 3d87480728aab1d4ebbc71c6c7504d37cae5edaf..dc6f5fe5fcc067c99ced01812f9f2388a00766d0 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -170,8 +170,7 @@ static string PyObjectCppStr(PyObject* o) {
   return ExtractStringAndDecref(s);
 }
 
-// Safely returns a repr of the given Python object o as a C++ string.
-static string PyObjectCppRepr(PyObject* o) {
+string PyObjectCppRepr(PyObject* o) {
   PyObject* r = PyObject_Repr(o);
   return ExtractStringAndDecref(r);
 }
@@ -182,16 +181,6 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
                            PyObjectCppRepr(o).c_str());
   };
 
-  auto get_attr = [o, &error](const string& field) -> StatusOr<PyObject*> {
-    PyObject* result =
-        PyObject_GetAttrString(o, const_cast<char*>(field.c_str()));
-    if (result == nullptr) {
-      return error(tensorflow::strings::StrCat(
-          "Failed to get attribute of Shape object:", field));
-    }
-    return result;
-  };
-
   auto call_method = [o, &error](const string& method) -> StatusOr<PyObject*> {
     PyObject* result =
         PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
@@ -203,12 +192,16 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
   };
 
   PyObject* np_type;
-  TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype"));
+  TF_ASSIGN_OR_RETURN(np_type, call_method("numpy_dtype"));
   if (np_type->ob_type != &PyArrayDescr_Type) {
-    return error("Shape attribute np_dtype is not an integer numpy dtype");
+    return error(
+        "Return value of shape method numpy_dtype "
+        "is not an integer numpy dtype");
   }
   if (!NumpyTypeIsValid(NumpyTypenum(np_type))) {
-    return error("Shape attribute np_dtype is not a valid integer numpy dtype");
+    return error(
+        "Return value of shape method numpy_dtype "
+        "is not a valid integer numpy dtype");
   }
   const PrimitiveType element_type =
       NumpyTypeToPrimitiveType(NumpyTypenum(np_type));
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index adfcc3b8588dce01718bb19dea936bace483be4d..9656cb1c31c39dbe54293700c2765d0723255657 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -107,6 +107,9 @@ void CopyLiteralToNumpyArray(const Literal& literal, PyArrayObject* py_array) {
   std::copy(source.begin(), source.end(), dest);
 }
 
+// Safely returns a repr of the given Python object o as a C++ string.
+string PyObjectCppRepr(PyObject* o);
+
 // Workarounds for Python 2 and 3 interop
 
 PyObject* LongToPyIntOrPyLong(long x);  // NOLINT
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 9bda9d09294bc75acaa35d8e4a512820046e8920..f6809b6b871d7e246dd43811c7e8c08378d53989 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -30,9 +30,9 @@ from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
 
 
-# Most functions are snake_case for consistency with other modules,
-# whereas method names of ComputationBuilder and LocalComputation are
-# CamelCase for consistency with XLA.
+# Most functions are snake_case for consistency with other modules, whereas
+# method names of ComputationBuilder and LocalComputation are CamelCase for
+# consistency with XLA.
 # pylint: disable=invalid-name
 
 
@@ -123,24 +123,34 @@ _BINARY_OPS = [
     'Pow',
 ]
 
+
 XLA_ELEMENT_TYPE_TO_DTYPE = {
-    xla_data_pb2.F32: np.dtype(np.float32),
-    xla_data_pb2.F64: np.dtype(np.float64),
-    xla_data_pb2.S32: np.dtype(np.int32),
-    xla_data_pb2.S64: np.dtype(np.int64),
-    xla_data_pb2.U32: np.dtype(np.uint32),
-    xla_data_pb2.U64: np.dtype(np.uint64),
-    xla_data_pb2.PRED: np.dtype(np.bool),
+    xla_data_pb2.PRED: np.dtype('bool'),
+    xla_data_pb2.S8: np.dtype('int8'),
+    xla_data_pb2.S16: np.dtype('int16'),
+    xla_data_pb2.S32: np.dtype('int32'),
+    xla_data_pb2.S64: np.dtype('int64'),
+    xla_data_pb2.U8: np.dtype('uint8'),
+    xla_data_pb2.U16: np.dtype('uint16'),
+    xla_data_pb2.U32: np.dtype('uint32'),
+    xla_data_pb2.U64: np.dtype('uint64'),
+    xla_data_pb2.F16: np.dtype('float16'),
+    xla_data_pb2.F32: np.dtype('float32'),
+    xla_data_pb2.F64: np.dtype('float64'),
+    xla_data_pb2.C64: np.dtype('complex64'),
     xla_data_pb2.TUPLE: np.dtype(np.object),
 }
 
 # Note the conversion on the key. Numpy has a known issue wherein dtype hashing
 # doesn't work as expected (https://github.com/numpy/numpy/issues/7242). Thus,
 # when keying by dtype in this dict, we use the string form of dtypes.
-DTYPE_TO_XLA_ELEMENT_TYPE = {
-    str(v): k
-    for k, v in XLA_ELEMENT_TYPE_TO_DTYPE.items()
-}
+DTYPE_TO_XLA_ELEMENT_TYPE = {str(dt): et
+                             for et, dt in XLA_ELEMENT_TYPE_TO_DTYPE.items()}
+
+
+def dtype_to_etype(dtype):
+  """Convenience function for reading DTYPE_TO_XLA_ELEMENT_TYPE."""
+  return DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))]
 
 
 class LocalBuffer(object):
@@ -156,14 +166,14 @@ class LocalBuffer(object):
     self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_py(npval, layout_fn=None):
-    npval = require_numpy_array_layout(npval)
+  def from_pyval(pyval, layout_fn=None):
+    pyval = require_numpy_array_layout(pyval)
     if layout_fn:
-      shape = Shape.from_numpy(npval)
+      shape = Shape.from_pyval(pyval)
       shape = shape.map_leaves(layout_fn)
     else:
       shape = None
-    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape))
+    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape))
 
   def to_py(self):
     return self.c_local_shaped_buffer.ToLiteral()
@@ -181,53 +191,104 @@ class LocalBuffer(object):
 
 
 class Shape(object):
-  """XLA shape.
+  """Represents an XLA shape.
 
-  Represents an XLA shape by a corresponding Python/Numpy type and a
-  list of dimensions, which are themselves Shapes in case this one
-  represents an XLA tuple.
+  A shape is either an array shape, having rank-many integer
+  dimensions and an element type (represented by a Numpy dtype), or it
+  is a tuple shape, having a shape for every tuple component:
+
+    type shape =
+        TupleShape of shape list
+      | ArrayShape of { dimensions: int list; element_type: dtype }
+
+  Callers are expected to instantiate this class only via the static
+  constructors: tuple_shape, array_shape, and from_pyval.
   """
 
-  def __init__(self, np_dtype, dimensions, minor_to_major=None):
+  @staticmethod
+  def tuple_shape(tuple_shapes):
+    """Construct a tuple shape."""
+    if (not isinstance(tuple_shapes, (tuple, list)) or
+        not all(isinstance(t, Shape) for t in tuple_shapes)):
+      raise TypeError('tuple_shapes must be a tuple of Shapes')
+    return Shape(tuple_shapes, tuple)
+
+  @staticmethod
+  def array_shape(element_type, dimensions, minor_to_major=None):
+    """Construct an array shape."""
+    if (not isinstance(dimensions, tuple) or
+        not all(isinstance(i, int) for i in dimensions)):
+      dimensions = tuple(int(i) for i in dimensions)
+    return Shape(dimensions, np.dtype(element_type),
+                 minor_to_major=minor_to_major)
+
+  @staticmethod
+  def from_pyval(pyval):
+    def convert(pyval):
+      if isinstance(pyval, tuple):
+        return Shape.tuple_shape(tuple(convert(elt) for elt in pyval))
+      else:
+        pyval = require_numpy_array_layout(pyval)
+        return Shape.array_shape(pyval.dtype, np.shape(pyval))
+    return convert(pyval)
+
+  def __init__(self, dimensions, dtype, minor_to_major=None):
     assert isinstance(dimensions, tuple)
-    self.np_dtype = np_dtype
     self._dimensions = dimensions
+    self._dtype = dtype
+    self._is_tuple = dtype == tuple
     self._minor_to_major = minor_to_major
     self._check_minor_to_major()
 
   def __eq__(self, other):
     # pylint: disable=protected-access
-    return (self.np_dtype == other.np_dtype and
+    return (self._dtype == other._dtype and
             self._dimensions == other._dimensions and
             self._minor_to_major == other._minor_to_major)
 
   def __repr__(self):
-    return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, '
-            'minor_to_major={!r})').format(self.np_dtype, self._dimensions,
-                                           self._minor_to_major)
-
-  def element_type(self):
-    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
+    return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, '
+            '_is_tuple={!r}), _minor_to_major={!r}').format(
+                self._dtype, self._dimensions, self._is_tuple,
+                self._minor_to_major)
 
   def is_tuple(self):
-    return self.element_type() == xla_data_pb2.TUPLE
+    return self._is_tuple
 
-  def dimensions(self):
-    if self.is_tuple():
-      raise ValueError('Tuple shape has no dimensions')
-    return self._dimensions
-
-  def minor_to_major(self):
-    return self._minor_to_major
+  def is_array(self):
+    return not self._is_tuple
 
   def tuple_shapes(self):
     if not self.is_tuple():
-      raise ValueError('Shape is not a tuple shape')
+      raise ValueError('not a tuple shape')
+    return self._dimensions
+
+  def numpy_dtype(self):
+    """Like element_type(), but returns dtype('O') in case of a tuple shape."""
+    if self.is_tuple():
+      return np.dtype(np.object)
+    else:
+      return self.element_type()
+
+  def xla_element_type(self):
+    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())]
+
+  def element_type(self):
+    if not self.is_array():
+      raise ValueError('not an array shape')
+    return self._dtype
+
+  def dimensions(self):
+    if not self.is_array():
+      raise ValueError('not an array shape')
     return self._dimensions
 
   def rank(self):
     return len(self.dimensions())
 
+  def minor_to_major(self):
+    return self._minor_to_major
+
   def map_leaves(self, f):
     """Map f over each leaf-level array subshape.
 
@@ -240,7 +301,7 @@ class Shape(object):
     """
     if self.is_tuple():
       children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
-      return Shape(np.dtype('O'), children)
+      return Shape.tuple_shape(children)
     else:
       mapped = f(self)
       return self if mapped is None else mapped
@@ -254,30 +315,24 @@ class Shape(object):
       assert sorted(mtm) == range(len(mtm)), self
 
   def update_minor_to_major(self, minor_to_major):
+    if not self.is_array():
+      raise ValueError('not an array shape')
     if not isinstance(minor_to_major, tuple):
       raise TypeError('minor_to_major must be a tuple')
-    updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major)
+    updated = Shape.array_shape(
+        self.element_type(), self.dimensions(), minor_to_major)
     updated._check_minor_to_major()  # pylint: disable=protected-access
     return updated
 
-  @staticmethod
-  def from_numpy(npval):
-
-    def convert(npval):
-      if isinstance(npval, tuple):
-        return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval))
-      else:
-        return Shape(npval.dtype, np.shape(npval))
-
-    return convert(require_numpy_array_layout(npval))
-
 
 def _wrap_shape(shape_info):
   dtype, dims = shape_info
   element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
   if element_type == xla_data_pb2.TUPLE:
-    dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
-  return Shape(dtype, dims)
+    shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
+    return Shape.tuple_shape(shapes)
+  else:
+    return Shape.array_shape(dtype, dims)
 
 
 def _wrap_data_handle(handle):
@@ -310,6 +365,9 @@ class CompileOptions(object):
 
   def __init__(self):
     self.generate_hlo_graph = None
+    self.dump_optimized_hlo_proto_to = None
+    self.dump_per_pass_hlo_proto_to = None
+    self.hlo_profile = False
 
 
 def transfer_to_infeed(value, replica_number=None):
@@ -407,7 +465,7 @@ class LocalComputation(object):
                                   compile_options=None,
                                   layout_fn=None):
     return self.Compile(
-        argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
+        argument_shapes=[Shape.from_pyval(arg) for arg in arguments],
         compile_options=compile_options,
         layout_fn=layout_fn)
 
@@ -415,7 +473,7 @@ class LocalComputation(object):
     """Execute with Python values as arguments and return value."""
     if not self.is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
+    argument_shapes = [Shape.from_pyval(arg) for arg in arguments]
     if layout_fn:
       argument_shapes = [
           shape.map_leaves(layout_fn) for shape in argument_shapes
@@ -594,7 +652,7 @@ class ComputationBuilder(object):
       A ComputationDataHandle message.
     """
     return self.ParameterWithShape(
-        Shape.from_numpy(value), name=name, parameter_num=parameter_num)
+        Shape.from_pyval(value), name=name, parameter_num=parameter_num)
 
   def Broadcast(self, operand, sizes):
     """Enqueues a broadcast operation onto the computation.
@@ -656,7 +714,7 @@ class ComputationBuilder(object):
         representing the configuration of the padding operation.
 
     Returns:
-      A ComputationDataHandle representing the added pad op.
+      A ComputationDataHandle representing the added Pad op.
     """
     if not isinstance(padding_config, xla_data_pb2.PaddingConfig):
       padding_config = GetPaddingConfigFromTriples(padding_config)
@@ -666,7 +724,20 @@ class ComputationBuilder(object):
                          padding_config))
 
   def Reshape(self, operand, dimensions, new_sizes):
-    """Reshape op."""
+    """Enqueues a reshape op onto the computation.
+
+    Args:
+      operand: ComputationDataHandle representing the array to be reshaped.
+      dimensions: sequence of integers encoding the order in which dimensions
+        are collapsed or None, in which case dimensions are flattened in order.
+      new_sizes: sequence of integers encoding the new dimension sizes (shape).
+
+    Returns:
+      A ComputationDataHandle representing the added Reshape op.
+    """
+    if dimensions is None:
+      ndim = len(self.GetShape(operand).dimensions())
+      dimensions = tuple(range(ndim))
     return _wrap_data_handle(
         self._client.Reshape(
             _unwrap_data_handle(operand), dimensions, new_sizes))
@@ -772,11 +843,27 @@ class ComputationBuilder(object):
       strides = [1] * len(start_indices)
     return _wrap_data_handle(
         self._client.Slice(
-            _unwrap_data_handle(operand),
-            start_indices,
-            limit_indices,
+            _unwrap_data_handle(operand), start_indices, limit_indices,
             strides))
 
+  def SliceInDim(self, operand, start_index, limit_index, stride, dimno):
+    """Enqueues a slice-in-dimension operation onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      start_index: an integer containing the start index of the slice.
+      limit_index: an integer containing the end index of the slice.
+      stride: an integer containing the stride size for the slice.
+      dimno: an integer indicating the dimension along which to slice.
+
+    Returns:
+      A ComputationDataHandle representing the added Slice op.
+    """
+    return _wrap_data_handle(
+        self._client.SliceInDim(
+            _unwrap_data_handle(operand), start_index, limit_index, stride,
+            dimno))
+
   def DynamicSlice(self, operand, start_indices, slice_sizes):
     """Enqueues a slice op with dynamic start indices onto the computation.
 
@@ -926,7 +1013,7 @@ class ComputationBuilder(object):
 
     Returns: a ComputationDataHandle to the generated array of F32 values.
     """
-    shape = Shape(self.GetShape(mu).np_dtype, dims)
+    shape = Shape.array_shape(self.GetShape(mu).element_type(), dims)
     return _wrap_data_handle(
         self._client.RngNormal(
             _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
@@ -946,7 +1033,7 @@ class ComputationBuilder(object):
     Returns: a ComputationDataHandle to the generated array of values with the
       same numeric type (F32, S32, or U32) as the arguments a and b.
     """
-    shape = Shape(self.GetShape(a).np_dtype, dims)
+    shape = Shape.array_shape(self.GetShape(a).element_type(), dims)
     return _wrap_data_handle(
         self._client.RngUniform(
             _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
@@ -986,6 +1073,20 @@ class ComputationBuilder(object):
             _unwrap_data_handle(false_operand),
             false_computation.c_local_computation))
 
+  def IsConstant(self, operand, num_parameters=0):
+    """Enqueues an IsConstant operation onto the computation.
+
+    Args:
+      operand: a ComputationDataHandle to test.
+      num_parameters: optional int, number of computation parameters to treat as
+        constant (default 0).
+
+    Returns: bool indicating whether `operand` is a compile-time constant,
+      meaning its value does not depend on parameters with index greater than or
+      equal to `num_parameters`.
+    """
+    return self._client.IsConstant(_unwrap_data_handle(operand), num_parameters)
+
   def Dot(self, lhs, rhs):
     """Enqueues a dot operation onto the computation.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index c9d09cd5d57e001fd48d2dba9f2b0ee18374231b..c073c02040e4d260cf760ea2b25f70d60ddd41a1 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -319,7 +319,7 @@ class LocalBufferTest(LocalComputationTest):
 
   def _Execute(self, c, arguments):
     compiled_c = c.Build().CompileWithExampleArguments(arguments)
-    arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments]
+    arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments]
     result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers)
     return result_buffer.to_py()
 
@@ -350,7 +350,7 @@ class LocalBufferTest(LocalComputationTest):
     c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14))
     arg = NumpyArrayF32(1.11)
     compiled_c = c.Build().CompileWithExampleArguments([arg])
-    arg_buffer = xla_client.LocalBuffer.from_py(arg)
+    arg_buffer = xla_client.LocalBuffer.from_pyval(arg)
     arg_buffer.delete()
     with self.assertRaises(ValueError):
       compiled_c.ExecuteWithLocalBuffers([arg_buffer])
@@ -762,6 +762,23 @@ class SingleOpTest(LocalComputationTest):
         [3, 2])
     self._ExecuteAndCompareExact(c, expected=[[4, 5], [7, 8]])
 
+  def testSliceInDim(self):
+    c = self._NewComputation()
+    c.SliceInDim(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        start_index=1,
+        limit_index=2,
+        stride=1,
+        dimno=1)
+    self._ExecuteAndCompareExact(c, expected=[[2], [5], [8]])
+    c.SliceInDim(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        start_index=0,
+        limit_index=3,
+        stride=2,
+        dimno=0)
+    self._ExecuteAndCompareExact(c, expected=[[1, 2, 3], [7, 8, 9]])
+
   def testDynamicSlice(self):
     c = self._NewComputation()
     c.DynamicSlice(
@@ -838,6 +855,17 @@ class SingleOpTest(LocalComputationTest):
     self.assertTrue(np.all(lo <= result))
     self.assertTrue(np.all(result < hi))
 
+  def testIsConstant(self):
+    c = self._NewComputation()
+    a = c.ConstantS32Scalar(3)
+    b = c.ConstantS32Scalar(1)
+    x = c.ParameterFromNumpy(NumpyArrayS32(0))
+    const_expr = c.Sub(b, a)
+    non_const_expr = c.Mul(const_expr, x)
+    self.assertTrue(c.IsConstant(const_expr))
+    self.assertFalse(c.IsConstant(non_const_expr))
+    # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
+
 
 class EmbeddedComputationsTest(LocalComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
@@ -1132,7 +1160,6 @@ class EmbeddedComputationsTest(LocalComputationTest):
       self._ExecuteAndCompareClose(
           c, expected=np.sum(input_array, axis=tuple(dims)))
 
-    _ReduceAndTest(0)
     _ReduceAndTest(0)
     _ReduceAndTest(0, 1)
     _ReduceAndTest(0, 2)
@@ -1260,7 +1287,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
   def testInfeedS32Values(self):
     to_infeed = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    c.Infeed(xla_client.Shape.from_numpy(to_infeed[0]))
+    c.Infeed(xla_client.Shape.from_pyval(to_infeed[0]))
     compiled_c = c.Build().CompileWithExampleArguments()
     for item in to_infeed:
       xla_client.transfer_to_infeed(item)
@@ -1272,7 +1299,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
   def testInfeedThenOutfeedS32(self):
     to_round_trip = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0]))
+    x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0]))
     c.Outfeed(x)
 
     compiled_c = c.Build().CompileWithExampleArguments()
@@ -1282,7 +1309,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
       execution.start()
       xla_client.transfer_to_infeed(want)
       got = xla_client.transfer_from_outfeed(
-          xla_client.Shape.from_numpy(to_round_trip[0]))
+          xla_client.Shape.from_pyval(to_round_trip[0]))
       execution.join()
       self.assertEqual(want, got)
 
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index a9acdae380af5b7f9efb3d08302fc717108f5e40..c289c84cff743871a7126cb932d6cda823ceb696 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <array>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -30,29 +30,23 @@ limitations under the License.
 
 namespace xla {
 
-/* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::TransposeArray2D(
-    const Array2D<float>& operand) {
-  auto result = MakeUnique<Array2D<float>>(operand.width(), operand.height());
-  for (int64 w = 0; w < operand.width(); ++w) {
-    for (int64 h = 0; h < operand.height(); ++h) {
-      (*result)(w, h) = operand(h, w);
-    }
-  }
-
-  return result;
-}
+namespace {
 
-/* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::MatmulArray2D(
-    const Array2D<float>& lhs, const Array2D<float>& rhs) {
+template <typename T>
+std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
+    const Array2D<T>& lhs, const Array2D<T>& rhs,
+    const std::function<void(
+        const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int64 n,
+        int64 k, int32 transpose_lhs, int32 transpose_rhs)>& impl_fn) {
   CHECK_EQ(lhs.width(), rhs.height());
   int m = lhs.height();
   int n = rhs.width();
   int k = lhs.width();
-  auto result = MakeUnique<Array2D<float>>(m, n);
+  auto result = MakeUnique<Array2D<T>>(m, n);
   // Because Eigen is a header-oriented library, make sure that the Eigen code
   // is the same as the code used by the CPU backend (otherwise the linker will
   // randomly pick *some* definition).
-  __xla_cpu_runtime_EigenSingleThreadedMatMulF32(
+  impl_fn(
       /*run_options_ptr=*/nullptr, result->data(), rhs.data(), lhs.data(), n, m,
       k,
       /*transpose_lhs=*/0,
@@ -60,22 +54,24 @@ namespace xla {
   return result;
 }
 
+}  // namespace
+
+/* static */ std::unique_ptr<Array2D<Eigen::half>> ReferenceUtil::MatmulArray2D(
+    const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs) {
+  return MatmulArray2DImpl<Eigen::half>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF16);
+}
+
+/* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::MatmulArray2D(
+    const Array2D<float>& lhs, const Array2D<float>& rhs) {
+  return MatmulArray2DImpl<float>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF32);
+}
+
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::MatmulArray2D(
     const Array2D<double>& lhs, const Array2D<double>& rhs) {
-  CHECK_EQ(lhs.width(), rhs.height());
-  int m = lhs.height();
-  int n = rhs.width();
-  int k = lhs.width();
-  auto result = MakeUnique<Array2D<double>>(m, n);
-  // Because Eigen is a header-oriented library, make sure that the Eigen code
-  // is the same as the code used by the CPU backend (otherwise the linker will
-  // randomly pick *some* definition).
-  __xla_cpu_runtime_EigenSingleThreadedMatMulF64(
-      /*run_options_ptr=*/nullptr, result->data(), rhs.data(), lhs.data(), n, m,
-      k,
-      /*transpose_lhs=*/0,
-      /*transpose_rhs=*/0);
-  return result;
+  return MatmulArray2DImpl<double>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64);
 }
 
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::Array2DF32ToF64(
@@ -94,7 +90,7 @@ namespace xla {
     Padding padding) {
   return ConvArray3DGeneralDimensionsDilated(
       lhs, rhs, kernel_stride, padding, 1, 1,
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(1));
+      XlaBuilder::CreateDefaultConvDimensionNumbers(1));
 }
 
 /*static*/ std::unique_ptr<Array3D<float>>
@@ -144,7 +140,7 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
     std::pair<int64, int64> kernel_stride, Padding padding) {
   return ConvArray4DGeneralDimensions(
       lhs, rhs, kernel_stride, padding,
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 }
 
 /* static */ std::unique_ptr<Array4D<float>>
@@ -188,18 +184,6 @@ ReferenceUtil::SeparableConvArray4D(const Array4D<float>& input,
   return tensorflow::MathUtil::CeilOfRatio(unpadded_width, stride);
 }
 
-/* static  */ std::unique_ptr<std::vector<float>>
-ReferenceUtil::ReduceWindow1DGeneric(
-    const tensorflow::gtl::ArraySlice<float>& operand, float init,
-    const std::function<float(float, float)>& reduce_func,
-    const tensorflow::gtl::ArraySlice<int64>& window,
-    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
-  std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
-  return ReduceWindow1DGeneric(
-      operand, init, reduce_func, window, stride,
-      xla::MakePadding(dim_lengths, window, stride, padding));
-}
-
 /* static  */ std::unique_ptr<std::vector<float>>
 ReferenceUtil::ReduceWindow1DGeneric(
     const tensorflow::gtl::ArraySlice<float>& operand, float init,
@@ -239,23 +223,28 @@ ReferenceUtil::ReduceWindow1DAdd(
     const tensorflow::gtl::ArraySlice<int64>& window,
     const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
   const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; };
-  return ReduceWindow1DGeneric(operand, init, add_reduce, window, stride,
-                               padding);
+  std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
+  return ReduceWindow1DGeneric(
+      operand, init, add_reduce, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
 }
 
-/* static  */ std::unique_ptr<Array2D<float>> ReferenceUtil::ReduceWindow2DAdd(
+/* static */ std::unique_ptr<Array2D<float>>
+ReferenceUtil::ReduceWindow2DGeneric(
     const Array2D<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
     const tensorflow::gtl::ArraySlice<int64>& window,
-    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
+    const tensorflow::gtl::ArraySlice<int64>& stride,
+    const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding) {
   std::vector<int64> dim_lengths{operand.height(), operand.width()};
-  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
 
   std::vector<int64> window_counts(window.size(), 0);
   std::vector<int64> pad_low(window.size(), 0);
   for (int64 i = 0; i < window.size(); ++i) {
+    int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second;
     window_counts[i] =
-        WindowCount(dim_lengths[i], window[i], stride[i], padding);
-    pad_low[i] = padding_both[i].first;
+        window_util::StridedBound(padded_width, window[i], stride[i]);
+    pad_low[i] = padding[i].first;
   }
   auto result = MakeUnique<Array2D<float>>(window_counts[0], window_counts[1]);
 
@@ -271,7 +260,7 @@ ReferenceUtil::ReduceWindow1DAdd(
           if (i0_base + i0_win >= 0 && i1_base + i1_win >= 0 &&
               i0_base + i0_win < operand.n1() &&
               i1_base + i1_win < operand.n2()) {
-            val += operand(i0_base + i0_win, i1_base + i1_win);
+            val = reduce_func(val, operand(i0_base + i0_win, i1_base + i1_win));
           }
         }
       }
@@ -281,6 +270,17 @@ ReferenceUtil::ReduceWindow1DAdd(
   return result;
 }
 
+/* static  */ std::unique_ptr<Array2D<float>> ReferenceUtil::ReduceWindow2DAdd(
+    const Array2D<float>& operand, float init,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
+  const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; };
+  std::vector<int64> dim_lengths{operand.height(), operand.width()};
+  return ReduceWindow2DGeneric(
+      operand, init, add_reduce, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
+}
+
 /* static  */ std::unique_ptr<Array3D<float>> ReferenceUtil::ReduceWindow3DAdd(
     const Array3D<float>& operand, float init,
     const tensorflow::gtl::ArraySlice<int64>& window,
@@ -472,7 +472,7 @@ ReferenceUtil::SelectAndScatter4DGePlus(
                       i3_base + i3_win < operand.n4()) {
                     float tmp = operand(i0_base + i0_win, i1_base + i1_win,
                                         i2_base + i2_win, i3_base + i3_win);
-                    if (tmp >= val) {
+                    if (tmp > val) {
                       val = tmp;
                       scatter_0 = i0_base + i0_win;
                       scatter_1 = i1_base + i1_win;
@@ -572,7 +572,8 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
 
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  HloModule module("ReferenceUtil");
+  HloModuleConfig config;
+  HloModule module("ReferenceUtil", config);
   auto computation = module.AddEntryComputation(b.Build());
 
   HloEvaluator evaluator;
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 3ec96f2f38b8f91e1549419b60481327fa9bbd5f..28d6a8c3fe85fa4179bf2f41c82ad4eb93a045fe 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -39,10 +39,22 @@ namespace xla {
 class ReferenceUtil {
  public:
   // Returns the result of a transpose operation on the input matrix.
-  static std::unique_ptr<Array2D<float>> TransposeArray2D(
-      const Array2D<float>& operand);
+  template <typename T>
+  static std::unique_ptr<Array2D<T>> TransposeArray2D(
+      const Array2D<T>& operand) {
+    auto result = MakeUnique<Array2D<T>>(operand.width(), operand.height());
+    for (int64 w = 0; w < operand.width(); ++w) {
+      for (int64 h = 0; h < operand.height(); ++h) {
+        (*result)(w, h) = operand(h, w);
+      }
+    }
+
+    return result;
+  }
 
   // Returns the result of a matrix multiply `lhs x rhs`.
+  static std::unique_ptr<Array2D<Eigen::half>> MatmulArray2D(
+      const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs);
   static std::unique_ptr<Array2D<float>> MatmulArray2D(
       const Array2D<float>& lhs, const Array2D<float>& rhs);
   static std::unique_ptr<Array2D<double>> MatmulArray2D(
@@ -187,9 +199,10 @@ class ReferenceUtil {
       const tensorflow::gtl::ArraySlice<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
-      const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
-  static std::unique_ptr<std::vector<float>> ReduceWindow1DGeneric(
-      const tensorflow::gtl::ArraySlice<float>& operand, float init,
+      const tensorflow::gtl::ArraySlice<int64>& stride,
+      const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
+  static std::unique_ptr<Array2D<float>> ReduceWindow2DGeneric(
+      const Array2D<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride,
@@ -215,6 +228,7 @@ class ReferenceUtil {
 
   // Performs select and scatter with Greater Than or equal as the select, plus
   // as the scatter, and Same Padding.
+  // TODO(b/74533103) Switch tests to evaluator and remove this implementation.
   static std::unique_ptr<Array4D<float>> SelectAndScatter4DGePlus(
       const Array4D<float>& operand, const Array4D<float>& source, float init,
       const tensorflow::gtl::ArraySlice<int64>& window,
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..977f8637873a4b6555798f533010a28ff36e8679
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -0,0 +1,79 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/compiler/xla:xla.bzl",
+    "xla_proto_library",
+    "xla_py_grpc_library",
+)
+
+xla_proto_library(
+    name = "xla_service_proto",
+    srcs = ["xla_service.proto"],
+    use_grpc_plugin = True,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
+    ],
+)
+
+cc_library(
+    name = "grpc_stub",
+    srcs = ["grpc_stub.cc"],
+    hdrs = ["grpc_stub.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla:service_interface",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+    ],
+)
+
+tf_cc_binary(
+    name = "grpc_service_main_cpu",
+    srcs = ["grpc_service_main.cc"],
+    deps = [
+        ":grpc_service",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+tf_cc_test(
+    name = "grpc_client_test",
+    srcs = ["grpc_client_test.cc"],
+    data = [
+        "//tensorflow/compiler/xla/rpc:grpc_service_main_cpu",
+    ],
+    deps = [
+        ":grpc_stub",
+        "//tensorflow/compiler/xla/client",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_service",
+    srcs = ["grpc_service.cc"],
+    hdrs = ["grpc_service.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b559ee4b5a345dbb2cc481b571562a0a630b3294
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -0,0 +1,109 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Simple C++ test to exercise the GRPC capabilities of XLA.
+//
+// Launches an RPC service in a subprocess and connects to it over a socket
+// using an RPCStub.
+#include <memory>
+#include <vector>
+
+#include "grpc++/create_channel.h"
+#include "grpc++/security/credentials.h"
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class GRPCClientTestBase : public ::testing::Test {
+ protected:
+  GRPCClientTestBase() {
+    string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+    string service_main_path = tensorflow::io::JoinPath(
+        test_srcdir, "compiler/xla/rpc/grpc_service_main_cpu");
+    int port = tensorflow::internal::PickUnusedPortOrDie();
+    subprocess_.SetProgram(
+        service_main_path,
+        {service_main_path, tensorflow::strings::Printf("--port=%d", port)});
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDOUT,
+                                 tensorflow::ACTION_DUPPARENT);
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDERR,
+                                 tensorflow::ACTION_DUPPARENT);
+    CHECK(subprocess_.Start());
+    LOG(INFO) << "Launched subprocess";
+
+    auto channel =
+        ::grpc::CreateChannel(tensorflow::strings::Printf("localhost:%d", port),
+                              ::grpc::InsecureChannelCredentials());
+    channel->WaitForConnected(gpr_time_add(
+        gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN)));
+    LOG(INFO) << "Channel to server is connected on port " << port;
+
+    xla_service_ = grpc::XlaService::NewStub(channel);
+    stub_.reset(new GRPCStub(xla_service_.get()));
+    client_.reset(new Client(stub_.get()));
+  }
+
+  ~GRPCClientTestBase() override {
+    LOG(INFO) << "Killing subprocess";
+    subprocess_.Kill(SIGKILL);
+  }
+
+  tensorflow::SubProcess subprocess_;
+  std::unique_ptr<grpc::XlaService::Stub> xla_service_;
+  std::unique_ptr<GRPCStub> stub_;
+  std::unique_ptr<Client> client_;
+};
+
+TEST_F(GRPCClientTestBase, ItsAlive) {
+  ASSERT_NE(xla_service_, nullptr);
+  ASSERT_NE(stub_, nullptr);
+  ASSERT_NE(client_, nullptr);
+}
+
+TEST_F(GRPCClientTestBase, AxpyTenValues) {
+  ComputationBuilder builder(client_.get(), "axpy_10");
+  auto alpha = builder.ConstantR0<float>(3.1415926535);
+  auto x = builder.ConstantR1<float>(
+      {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
+  auto y = builder.ConstantR1<float>(
+      {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0});
+  auto ax = builder.Mul(alpha, x);
+  auto axpy = builder.Add(ax, y);
+
+  std::vector<float> expected = {
+      1.85840735, -1.85840735, 2.28318531,   -2.28318531,  -6.42477796,
+      6.42477796, 10.56637061, -10.56637061, -14.70796327, 14.70796327};
+  std::unique_ptr<Literal> expected_literal =
+      Literal::CreateR1<float>(expected);
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer(
+                                                   computation, {}, nullptr));
+  LiteralTestUtil::ExpectNear(*expected_literal, *result_literal,
+                              ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b100bd108e239964483ed5ba279dff61bce0023
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
+    se::Platform* platform) {
+  std::unique_ptr<GRPCService> grpc_service(new GRPCService());
+  TF_ASSIGN_OR_RETURN(grpc_service->service_,
+                      ::xla::Service::NewService(platform));
+  return std::move(grpc_service);
+}
+
+::grpc::Status DelegateRPC(std::function<tensorflow::Status()> op) {
+  tensorflow::Status s = op();
+  return tensorflow::ToGrpcStatus(s);
+}
+
+::grpc::Status GRPCService::Computation(::grpc::ServerContext* context,
+                                        const ComputationRequest* arg,
+                                        ComputationResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Computation(arg, result); });
+}
+
+::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context,
+                                     const OpRequest* arg, OpResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Op(arg, result); });
+}
+
+::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context,
+                                       const UnregisterRequest* arg,
+                                       UnregisterResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Unregister(arg, result); });
+}
+
+::grpc::Status GRPCService::DeconstructTuple(::grpc::ServerContext* context,
+                                             const DeconstructTupleRequest* arg,
+                                             DeconstructTupleResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->DeconstructTuple(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context,
+                                           const SetReturnValueRequest* arg,
+                                           SetReturnValueResponse* results) {
+  return DelegateRPC([this, arg, results]() {
+    return service_->SetReturnValue(arg, results);
+  });
+}
+
+::grpc::Status GRPCService::Execute(::grpc::ServerContext* context,
+                                    const ExecuteRequest* arg,
+                                    ExecuteResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Execute(arg, result); });
+}
+
+::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context,
+                                         const ExecuteAsyncRequest* arg,
+                                         ExecuteAsyncResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ExecuteAsync(arg, result); });
+}
+
+::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
+                                             const WaitForExecutionRequest* arg,
+                                             WaitForExecutionResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->WaitForExecution(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToClient(::grpc::ServerContext* context,
+                                             const TransferToClientRequest* arg,
+                                             TransferToClientResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToClient(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToServer(::grpc::ServerContext* context,
+                                             const TransferToServerRequest* arg,
+                                             TransferToServerResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToServer(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToInfeed(::grpc::ServerContext* context,
+                                             const TransferToInfeedRequest* arg,
+                                             TransferToInfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToInfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferFromOutfeed(
+    ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+    TransferFromOutfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferFromOutfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::ResetDevice(::grpc::ServerContext* context,
+                                        const ResetDeviceRequest* arg,
+                                        ResetDeviceResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ResetDevice(arg, result); });
+}
+
+::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* context,
+                                       const IsConstantRequest* arg,
+                                       IsConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->IsConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context,
+                                            const ComputeConstantRequest* arg,
+                                            ComputeConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ComputeConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context,
+                                     const GetShapeRequest* arg,
+                                     GetShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationShape(
+    ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+    GetComputationShapeResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationShape(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context,
+                                          const GetLocalShapeRequest* arg,
+                                          GetLocalShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetLocalShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationStats(
+    ::grpc::ServerContext* context, const ComputationStatsRequest* arg,
+    ComputationStatsResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationStats(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SnapshotComputation(
+    ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+    SnapshotComputationResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->SnapshotComputation(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::LoadComputationSnapshot(
+    ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+    LoadComputationSnapshotResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->LoadComputationSnapshot(arg, result);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..fad74375bd59f7254d97c4adbc6b3d2f5fbf6b29
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+
+#include "grpc++/server_context.h"
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service/service.h"
+
+namespace xla {
+
+// Service implementation which wraps a XLA Service with a GRPC interface.
+class GRPCService : public grpc::XlaService::Service {
+ public:
+  // Factory for creating a RPCService. The parameter platform is the platform
+  // that the service should target. If platform is null then the default
+  // platform is used.
+  static StatusOr<std::unique_ptr<GRPCService>> NewService(
+      se::Platform* platform = nullptr);
+
+  ::grpc::Status Computation(::grpc::ServerContext* context,
+                             const ComputationRequest* arg,
+                             ComputationResponse* result) override;
+
+  ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg,
+                          OpResponse* result) override;
+
+  ::grpc::Status Unregister(::grpc::ServerContext* context,
+                            const UnregisterRequest* arg,
+                            UnregisterResponse* result) override;
+
+  ::grpc::Status DeconstructTuple(::grpc::ServerContext* context,
+                                  const DeconstructTupleRequest* arg,
+                                  DeconstructTupleResponse* result) override;
+
+  ::grpc::Status SetReturnValue(::grpc::ServerContext* context,
+                                const SetReturnValueRequest* arg,
+                                SetReturnValueResponse* results) override;
+
+  ::grpc::Status Execute(::grpc::ServerContext* context,
+                         const ExecuteRequest* arg,
+                         ExecuteResponse* result) override;
+
+  ::grpc::Status ExecuteAsync(::grpc::ServerContext* context,
+                              const ExecuteAsyncRequest* arg,
+                              ExecuteAsyncResponse* result) override;
+
+  ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
+                                  const WaitForExecutionRequest* arg,
+                                  WaitForExecutionResponse* result) override;
+
+  ::grpc::Status TransferToClient(::grpc::ServerContext* context,
+                                  const TransferToClientRequest* arg,
+                                  TransferToClientResponse* result) override;
+
+  ::grpc::Status TransferToServer(::grpc::ServerContext* context,
+                                  const TransferToServerRequest* arg,
+                                  TransferToServerResponse* result) override;
+
+  ::grpc::Status TransferToInfeed(::grpc::ServerContext* context,
+                                  const TransferToInfeedRequest* arg,
+                                  TransferToInfeedResponse* result) override;
+
+  ::grpc::Status TransferFromOutfeed(
+      ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  ::grpc::Status ResetDevice(::grpc::ServerContext* context,
+                             const ResetDeviceRequest* arg,
+                             ResetDeviceResponse* result) override;
+
+  ::grpc::Status IsConstant(::grpc::ServerContext* context,
+                            const IsConstantRequest* arg,
+                            IsConstantResponse* result) override;
+
+  ::grpc::Status ComputeConstant(::grpc::ServerContext* context,
+                                 const ComputeConstantRequest* arg,
+                                 ComputeConstantResponse* result) override;
+
+  ::grpc::Status GetShape(::grpc::ServerContext* context,
+                          const GetShapeRequest* arg,
+                          GetShapeResponse* result) override;
+
+  ::grpc::Status GetComputationShape(
+      ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  ::grpc::Status GetLocalShape(::grpc::ServerContext* context,
+                               const GetLocalShapeRequest* arg,
+                               GetLocalShapeResponse* result) override;
+
+  ::grpc::Status GetComputationStats(::grpc::ServerContext* context,
+                                     const ComputationStatsRequest* arg,
+                                     ComputationStatsResponse* result) override;
+
+  ::grpc::Status SnapshotComputation(
+      ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+      SnapshotComputationResponse* result) override;
+
+  ::grpc::Status LoadComputationSnapshot(
+      ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+      LoadComputationSnapshotResponse* result) override;
+
+ private:
+  std::unique_ptr<::xla::Service> service_;
+
+  GRPCService() {}
+  GRPCService(const GRPCService&) = delete;
+  void operator=(const GRPCService&) = delete;
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e29908ccec80db76e3b5b856e57382c56430c379
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic server binary that exposes a xla::Service through a GRPC interface
+// on a configurable port.
+#include "grpc++/security/server_credentials.h"
+#include "grpc++/server.h"
+#include "grpc++/server_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+namespace {
+
+int RealMain(int argc, char** argv) {
+  int32 port = 1685;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("port", &port, "port to listen on"),
+  };
+  string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_values_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parsed_values_ok) {
+    LOG(ERROR) << usage;
+    return 2;
+  }
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  std::unique_ptr<xla::GRPCService> service =
+      xla::GRPCService::NewService().ConsumeValueOrDie();
+
+  ::grpc::ServerBuilder builder;
+  string server_address(tensorflow::strings::Printf("localhost:%d", port));
+
+  builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+
+  LOG(INFO) << "Server listening on " << server_address;
+  server->Wait();
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) { return xla::RealMain(argc, argv); }
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1f2b0abe39b10dd82b700941748bc4f4e8cb2f8
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -0,0 +1,244 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+GRPCStub::~GRPCStub() = default;
+
+tensorflow::Status MakeRPC(
+    const std::function<::grpc::Status(::grpc::ClientContext*)>& rpc_method) {
+  ::grpc::ClientContext context;
+  ::grpc::Status s = rpc_method(&context);
+  return tensorflow::FromGrpcStatus(s);
+}
+
+tensorflow::Status GRPCStub::TransferToClient(
+    const TransferToClientRequest* request,
+    TransferToClientResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToClient(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToServer(
+    const TransferToServerRequest* request,
+    TransferToServerResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToServer(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToInfeed(
+    const TransferToInfeedRequest* request,
+    TransferToInfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToInfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferFromOutfeed(
+    const TransferFromOutfeedRequest* request,
+    TransferFromOutfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferFromOutfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
+                                         ResetDeviceResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ResetDevice(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::LoadComputationSnapshot(
+    const LoadComputationSnapshotRequest* request,
+    LoadComputationSnapshotResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->LoadComputationSnapshot(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Execute(const ExecuteRequest* request,
+                                     ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Execute(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
+                                          ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraph(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteParallel(
+    const ExecuteParallelRequest* request, ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* request,
+    ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraphParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
+                                          ExecuteAsyncResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteAsync(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::WaitForExecution(
+    const WaitForExecutionRequest* request,
+    WaitForExecutionResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->WaitForExecution(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::DeconstructTuple(
+    const DeconstructTupleRequest* request,
+    DeconstructTupleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->DeconstructTuple(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationStats(
+    const ComputationStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationGraphStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationShape(
+    const GetComputationShapeRequest* request,
+    GetComputationShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetShape(const GetShapeRequest* request,
+                                      GetShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetDeviceHandles(
+    const GetDeviceHandlesRequest* request,
+    GetDeviceHandlesResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetDeviceHandles(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::CreateChannelHandle(
+    const CreateChannelHandleRequest* request,
+    CreateChannelHandleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateChannelHandle(context, *request, response);
+  });
+}
+
+// Methods used by ComputationBuilder.
+tensorflow::Status GRPCStub::Computation(const ComputationRequest* request,
+                                         ComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Computation(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Op(const OpRequest* request,
+                                OpResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateOp(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
+                                           GetLocalShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetLocalShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::SetReturnValue(
+    const SetReturnValueRequest* request, SetReturnValueResponse* responses) {
+  return MakeRPC([this, request, responses](::grpc::ClientContext* context) {
+    return grpc_stub_->SetReturnValue(context, *request, responses);
+  });
+}
+
+tensorflow::Status GRPCStub::IsConstant(const IsConstantRequest* request,
+                                        IsConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->IsConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstant(
+    const ComputeConstantRequest* request, ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* request,
+    ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstantGraph(context, *request, response);
+  });
+}
+
+// Methods used by Computation.
+tensorflow::Status GRPCStub::SnapshotComputation(
+    const SnapshotComputationRequest* request,
+    SnapshotComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->SnapshotComputation(context, *request, response);
+  });
+}
+
+// Methods used by GlobalData.
+tensorflow::Status GRPCStub::Unregister(const UnregisterRequest* request,
+                                        UnregisterResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Unregister(context, *request, response);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd9810d4f1a5e084b73e83007ea7f9f8b0462c72
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service_interface.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+class GRPCStub : public ServiceInterface {
+ public:
+  explicit GRPCStub(grpc::XlaService::Stub* stub) : grpc_stub_(stub) {}
+  ~GRPCStub() override;
+
+  tensorflow::Status TransferToClient(
+      const TransferToClientRequest* arg,
+      TransferToClientResponse* result) override;
+
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override;
+
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override;
+
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override;
+
+  tensorflow::Status LoadComputationSnapshot(
+      const LoadComputationSnapshotRequest* request,
+      LoadComputationSnapshotResponse* result) override;
+
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override;
+
+  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* request,
+                                  ExecuteResponse* response) override;
+
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override;
+
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* request,
+      ExecuteParallelResponse* response) override;
+
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override;
+
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override;
+
+  tensorflow::Status DeconstructTuple(
+      const DeconstructTupleRequest* arg,
+      DeconstructTupleResponse* result) override;
+
+  tensorflow::Status GetComputationStats(
+      const ComputationStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* request,
+      ComputationStatsResponse* response) override;
+
+  tensorflow::Status GetComputationShape(
+      const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  tensorflow::Status GetShape(const GetShapeRequest* arg,
+                              GetShapeResponse* result) override;
+
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override;
+
+  tensorflow::Status CreateChannelHandle(
+      const CreateChannelHandleRequest* arg,
+      CreateChannelHandleResponse* result) override;
+
+  // Methods used by ComputationBuilder.
+  tensorflow::Status Computation(const ComputationRequest* arg,
+                                 ComputationResponse* result) override;
+
+  tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override;
+  tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
+                                   GetLocalShapeResponse* result) override;
+
+  tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg,
+                                    SetReturnValueResponse* results) override;
+
+  tensorflow::Status IsConstant(const IsConstantRequest* arg,
+                                IsConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
+                                     ComputeConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
+
+  // Methods used by Computation.
+  tensorflow::Status SnapshotComputation(
+      const SnapshotComputationRequest* ag,
+      SnapshotComputationResponse* result) override;
+
+  // Methods used by GlobalData.
+  tensorflow::Status Unregister(const UnregisterRequest* arg,
+                                UnregisterResponse* result) override;
+
+  grpc::XlaService::Stub* service() { return grpc_stub_; }
+
+ private:
+  grpc::XlaService::Stub* grpc_stub_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GRPCStub);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c47164ee1b7657ae378a053f553442bee751753e
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -0,0 +1,225 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA service API.
+//
+// Users 1) build up computations and 2) create allocations via this API.
+// Computations are composed of data flowing between arbitrarily-sized
+// vector-oriented operations.
+//
+// Users build up computations using a ComputationHandle, and talk about
+// allocations using GlobalDataHandles.
+//
+// There are currently no checkpointing capabilities or distribution/replication
+// guarantees. The service runs on a single machine (e.g. one task) and that is
+// its failure domain.
+//
+// Canonical example of "alpha * X + Y":
+// * Make a computation.
+// * Add alpha and X and Y as parameters.
+// * Request the multiplication of alpha and X.
+// * Request the addition of that result and Y.
+//
+// Then, pass the computation and appropriately shaped inputs to the XLA
+// service's Execute method, which provides a result as a GlobalDataHandle.
+//
+// All data in XLA computations are conceptually immutable.
+//
+// Note: this API is subject to change / refinement over time -- use the
+// provided client libraries to insulate code from changes to this service API.
+
+syntax = "proto3";
+
+import "tensorflow/compiler/xla/xla.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+package xla;
+
+service XlaService {
+  /////////////////////////
+  // Global data requests
+
+  // Unregisters a global allocation.
+  //
+  // If the handle given is not currently allocated, a NOT_FOUND status is
+  // returned.
+  rpc Unregister(UnregisterRequest) returns (UnregisterResponse) {
+  }
+
+  // Deconstructs a tuple. Returns a newly created GlobalDataHandle for each
+  // element in the tuple.
+  rpc DeconstructTuple(DeconstructTupleRequest)
+      returns (DeconstructTupleResponse) {
+  }
+
+  // Unpack requests that a global data handle, with a tuple shape, has global
+  // data handles created for each of its constituent members. This is the
+  // equivalent of the "destructuring assignment" present in various programming
+  // languages.
+  rpc Unpack(UnpackRequest) returns (UnpackResponse) {
+  }
+
+  // Requests the shape of the referenced global data.
+  rpc GetShape(GetShapeRequest) returns (GetShapeResponse) {
+  }
+
+  // Requests the program shape of the referenced computation.
+  rpc GetComputationShape(GetComputationShapeRequest)
+      returns (GetComputationShapeResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  rpc GetComputationStats(ComputationStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc GetComputationGraphStats(ComputationGraphStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Loads a variable number of values with a given element type from ColumnIO.
+  rpc LoadData(LoadDataRequest) returns (LoadDataResponse) {
+  }
+
+  // Transfers the given global data to the client in the form of a Literal.
+  rpc TransferToClient(TransferToClientRequest)
+      returns (TransferToClientResponse) {
+  }
+
+  // Transfers the given literal to the server to be stored in a global
+  // allocation, which is returned.
+  rpc TransferToServer(TransferToServerRequest)
+      returns (TransferToServerResponse) {
+  }
+
+  // Transfers the given literal to the Infeed buffer of the device.
+  rpc TransferToInfeed(TransferToInfeedRequest)
+      returns (TransferToInfeedResponse) {
+  }
+
+  // Transferred literal from the Outfeed buffer of the device.
+  rpc TransferFromOutfeed(TransferFromOutfeedRequest)
+      returns (TransferFromOutfeedResponse) {
+  }
+
+  // Resets the device, clearing all existing state on the device.
+  rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) {
+  }
+
+  // Tests if an expression is a compile-time constant.
+  rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) {
+  }
+
+  // Computes the value of a constant expression.
+  rpc ComputeConstant(ComputeConstantRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Computes the value of a constant expression. The request contains the
+  // computation graph for the constant expression.
+  rpc ComputeConstantGraph(ComputeConstantGraphRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Retrieves the inferred shape for a value within a computation.
+  rpc GetLocalShape(GetLocalShapeRequest) returns (GetLocalShapeResponse) {
+  }
+
+  // Requests one or more device handles from the target. The returned device
+  // handles can be used to specify the device on which to execute computations
+  // or transfer data.
+  rpc GetDeviceHandles(GetDeviceHandlesRequest)
+      returns (GetDeviceHandlesResponse) {
+  }
+
+  // Creates a channel handle that can be used to transfer data between
+  // two computations via a pair of Send and Recv instructions.
+  rpc CreateChannelHandle(CreateChannelHandleRequest)
+      returns (CreateChannelHandleResponse) {
+  }
+
+  // Requests that the referenced computation be specialized for the provided
+  // arguments for subsequent execution. This permits things such as value
+  // specialization.
+  rpc Specialize(SpecializeRequest) returns (SpecializeResponse) {
+  }
+
+  // Modifies the provided computation so that subsequent executions
+  // will compute the provided ComputationDataHandle, rather than the
+  // last expression enqueued on that Computation.
+  rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) {
+  }
+
+  // Computation creates a new computation with the given name.
+  // A unique ComputationHandle is returned.
+  rpc Computation(ComputationRequest) returns (ComputationResponse) {
+  }
+
+  // Adds a new op to a computation.
+  rpc CreateOp(OpRequest) returns (OpResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns global data output and execution timing.
+  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. The request contains the whole computation graph.
+  // Returns global data output and execution timing.
+  rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  rpc ExecuteParallel(ExecuteParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc ExecuteGraphParallel(ExecuteGraphParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns a handle to the execution.
+  rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) {
+  }
+
+  // Waits until the given execution (aysnchronously launched) is complete, and
+  // returns the global data output.
+  rpc WaitForExecution(WaitForExecutionRequest)
+      returns (WaitForExecutionResponse) {
+  }
+
+  // Serializes a computation to proto form, so it can be loaded via
+  // LoadComputationSnapshot.
+  rpc SnapshotComputation(SnapshotComputationRequest)
+      returns (SnapshotComputationResponse) {
+  }
+
+  // Loads a computation from a captured snapshot.
+  rpc LoadComputationSnapshot(LoadComputationSnapshotRequest)
+      returns (LoadComputationSnapshotResponse) {
+  }
+}
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4a076ac0909d24f6c7355a323d4b78151d3fe2ac..ed0da47681c7eff8120ed46b4088213b0bb24734 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -106,6 +106,7 @@ tf_cc_test(
         ":bfloat16_normalization",
         ":bfloat16_support",
         ":hlo",
+        ":hlo_verifier",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
@@ -118,6 +119,42 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "bfloat16_propagation",
+    srcs = ["bfloat16_propagation.cc"],
+    hdrs = ["bfloat16_propagation.h"],
+    deps = [
+        ":bfloat16_support",
+        ":hlo",
+        ":hlo_dataflow_analysis",
+        ":hlo_dce",
+        ":hlo_pass",
+        ":tuple_simplifier",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "bfloat16_propagation_test",
+    srcs = ["bfloat16_propagation_test.cc"],
+    deps = [
+        ":bfloat16_propagation",
+        ":bfloat16_support",
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+    ],
+)
+
 cc_library(
     name = "shape_inference",
     srcs = ["shape_inference.cc"],
@@ -248,6 +285,46 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dfs_hlo_visitor_with_default_test",
+    srcs = ["dfs_hlo_visitor_with_default_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_runner",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "pattern_matcher",
+    hdrs = ["pattern_matcher.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "pattern_matcher_test",
+    srcs = ["pattern_matcher_test.cc"],
+    deps = [
+        ":hlo",
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_reachability",
     srcs = ["hlo_reachability.cc"],
@@ -282,6 +359,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -586,6 +664,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:executable_build_options",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
@@ -675,9 +754,9 @@ cc_library(
         ":computation_layout",
         ":device_memory_allocator",
         ":hlo",
-        ":hlo_cost_analysis",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
+        ":hlo_proto",
         ":pool",
         ":session_proto",
         ":shaped_buffer",
@@ -952,6 +1031,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -1028,6 +1108,38 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_module_group_metadata",
+    srcs = ["hlo_module_group_metadata.cc"],
+    hdrs = ["hlo_module_group_metadata.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "hlo_module_group_util",
+    srcs = ["hlo_module_group_util.cc"],
+    hdrs = ["hlo_module_group_util.h"],
+    deps = [
+        ":hlo",
+        ":hlo_module_group_metadata",
+        ":hlo_reachability",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "hlo_scheduling",
     srcs = ["hlo_scheduling.cc"],
@@ -1059,6 +1171,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1093,6 +1206,37 @@ tf_cc_test(
         ":instruction_fusion",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
+cc_library(
+    name = "hlo_creation_utils",
+    srcs = ["hlo_creation_utils.cc"],
+    hdrs = ["hlo_creation_utils.h"],
+    deps = [
+        ":hlo",
+        ":shape_inference",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_creation_utils_test",
+    srcs = ["hlo_creation_utils_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_evaluator",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -1104,7 +1248,6 @@ cc_library(
         ":hlo",
         ":hlo_pass",
         ":hlo_query",
-        ":shape_inference",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1116,6 +1259,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gather_expander",
+    srcs = ["gather_expander.cc"],
+    hdrs = ["gather_expander.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        ":while_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+    ],
+)
+
 tf_cc_test(
     name = "batchnorm_expander_test",
     size = "small",
@@ -1143,9 +1300,10 @@ cc_library(
     hdrs = ["algebraic_simplifier.h"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_pass",
         ":hlo_query",
-        ":shape_inference",
+        ":pattern_matcher",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1172,6 +1330,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
@@ -1179,6 +1338,53 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gather_expander_test",
+    srcs = ["gather_expander_test.cc"],
+    deps = [
+        ":gather_expander",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:test_macros_header",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
+cc_library(
+    name = "conditional_simplifier",
+    srcs = ["conditional_simplifier.cc"],
+    hdrs = ["conditional_simplifier.h"],
+    deps = [
+        ":call_inliner",
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "conditional_simplifier_test",
+    srcs = ["conditional_simplifier_test.cc"],
+    deps = [
+        ":conditional_simplifier",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "while_loop_simplifier",
     srcs = ["while_loop_simplifier.cc"],
@@ -1201,6 +1407,7 @@ tf_cc_test(
         ":while_loop_simplifier",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
@@ -1434,6 +1641,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -1828,6 +2036,7 @@ cc_library(
     srcs = ["hlo_verifier.cc"],
     hdrs = ["hlo_verifier.h"],
     deps = [
+        ":hlo",
         ":hlo_pass",
         ":shape_inference",
         "//tensorflow/compiler/xla:status_macros",
@@ -2212,6 +2421,7 @@ tf_cc_test(
         ":hlo_graph_dumper",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
@@ -2315,6 +2525,24 @@ cc_library(
         ":hlo",
         ":hlo_proto",
         "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:util",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_proto_util_test",
+    srcs = ["hlo_proto_util_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_proto",
+        ":hlo_proto_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -2354,6 +2582,7 @@ cc_library(
     srcs = ["hlo_runner.cc"],
     hdrs = ["hlo_runner.h"],
     deps = [
+        ":computation_placer",
         ":executable",
         ":hlo",
         ":transfer_manager",
@@ -2370,6 +2599,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2413,7 +2643,9 @@ cc_library(
     deps = [
         ":call_inliner",
         ":hlo",
+        ":hlo_creation_utils",
         ":tuple_util",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -2456,6 +2688,21 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "despecializer",
+    srcs = ["despecializer.cc"],
+    hdrs = ["despecializer.h"],
+    deps = [
+        ":bfloat16_normalization",
+        ":defuser",
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:statusor",
+    ],
+)
+
 cc_library(
     name = "source_map_util",
     srcs = ["source_map_util.cc"],
@@ -2467,17 +2714,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index fb857559f972a220a19b108baa4c441e09b90e1f..8e785de68cb1fbe4ce9fd58a661bdc208725483b 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -26,10 +26,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
-#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -44,8 +45,11 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
+
 namespace {
 
+namespace m = match;
+
 // Returns whether operand is a literal with the given value.
 bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
   return operand->opcode() == HloOpcode::kConstant &&
@@ -105,6 +109,7 @@ HloComputation* CreateScalarBinaryComputation(HloModule* module,
       module->AddEmbeddedComputation(b.Build(scalar_op));
   return scalar_computation;
 }
+
 }  // namespace
 
 // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain
@@ -122,6 +127,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleBitcast(HloInstruction* bitcast) override;
 
+  Status HandleBitcastConvert(HloInstruction* bitcast) override;
+
   Status HandleBroadcast(HloInstruction* broadcast) override;
 
   Status HandleConcatenate(HloInstruction* concatenate) override;
@@ -300,7 +307,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Disable dot strength reduction on platforms where it causes a slowdown.
   bool enable_dot_strength_reduction_;
 
-  // Disable convolution simplication on platforms where it causes a slowdown.
+  // Disable convolution simplification on platforms where it causes a slowdown.
   bool enable_conv_simplification_;
 };
 
@@ -348,8 +355,9 @@ bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape(
 }
 
 Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
-  auto lhs = add->mutable_operand(0);
-  auto rhs = add->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(add, m::Add(m::Op(&lhs), m::Op(&rhs))));
+
   // A + 0 => A
   VLOG(10) << "trying transform [A + 0 => A]: " << add->ToString();
   if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) {
@@ -364,7 +372,7 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
   // Canonicalization: Put constants on the right.  This makes the reassociation
   // rules below simpler.
   VLOG(10) << "trying transform [Const + A => A + Const]";
-  if (lhs->IsConstant() && !rhs->IsConstant()) {
+  if (Match(add, m::Add(m::Constant(), m::NonConstant()))) {
     return ReplaceWithNewInstruction(
         add,
         HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd, rhs, lhs));
@@ -377,20 +385,13 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
   //   (A + C1) + (B + C2) =>  A + B + (C1 + C2).
   //
   VLOG(10) << "trying transform [(A + C1) + C2 => A + (C1 + C2)]";
-  if (rhs->IsConstant() && lhs->opcode() == HloOpcode::kAdd &&
-      !lhs->operand(0)->IsConstant() && lhs->operand(1)->IsConstant()) {
-    auto* c1 = lhs->mutable_operand(1);
-    auto* c2 = rhs;
-    TF_ASSIGN_OR_RETURN(
-        Shape sum_of_constants_shape,
-        ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, c1, c2));
-
-    auto* sum_of_constants =
-        computation_->AddInstruction(HloInstruction::CreateBinary(
-            sum_of_constants_shape, HloOpcode::kAdd, c1, c2));
+  HloInstruction *a, *c1, *c2;
+  if (Match(add, m::Add(m::Add(m::NonConstant(&a), m::Constant(&c1)),
+                        m::Constant(&c2)))) {
+    TF_ASSIGN_OR_RETURN(auto* sum_of_constants,
+                        MakeBinaryHlo(HloOpcode::kAdd, c1, c2));
     return ReplaceWithNewInstruction(
-        add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd,
-                                          lhs->mutable_operand(0),
+        add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd, a,
                                           sum_of_constants));
   }
 
@@ -399,11 +400,11 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
 
 Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) {
   // If a bitcast feeds a bitcast, make it a single bitcast.
-  if (bitcast->operand(0)->opcode() == HloOpcode::kBitcast) {
+  HloInstruction* op;
+  if (Match(bitcast, m::Bitcast(m::Bitcast(m::Op(&op))))) {
     return ReplaceWithNewInstruction(
-        bitcast, HloInstruction::CreateUnary(
-                     bitcast->shape(), HloOpcode::kBitcast,
-                     bitcast->mutable_operand(0)->mutable_operand(0)));
+        bitcast,
+        HloInstruction::CreateUnary(bitcast->shape(), HloOpcode::kBitcast, op));
   }
   // All bitcasts can be eliminated (assuming layout constraints are
   // satisified).
@@ -411,13 +412,19 @@ Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleBitcastConvert(
+    HloInstruction* bitcast) {
+  // Eliminate bitcast converts between same shape.
+  ReplaceInstructionIfSameShape(bitcast, bitcast->mutable_operand(0));
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
   // If a copy feeds a copy, make it a single copy.
-  if (copy->operand(0)->opcode() == HloOpcode::kCopy) {
+  HloInstruction* op;
+  if (Match(copy, m::Copy(m::Copy(m::Op(&op))))) {
     return ReplaceWithNewInstruction(
-        copy, HloInstruction::CreateUnary(
-                  copy->shape(), HloOpcode::kCopy,
-                  copy->mutable_operand(0)->mutable_operand(0)));
+        copy, HloInstruction::CreateUnary(copy->shape(), HloOpcode::kCopy, op));
   }
   // All copies can be eliminated (assuming layout constraints are satisified).
   ReplaceInstructionIfSameShape(copy, copy->mutable_operand(0));
@@ -457,12 +464,10 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
   } else if (operands.size() == 2) {
     // A binary concat with a broadcasted scalar as an operand can be converted
     // into a pad which is simpler to fold into other operations.
-    bool is_effective_low_pad =
-        operands[0]->opcode() == HloOpcode::kBroadcast &&
-        ShapeUtil::IsScalar(operands[0]->operand(0)->shape());
-    bool is_effective_high_pad =
-        operands[1]->opcode() == HloOpcode::kBroadcast &&
-        ShapeUtil::IsScalar(operands[1]->operand(0)->shape());
+    bool is_effective_low_pad = Match(
+        operands[0], m::Broadcast(m::Op().WithShape(m::Shape().IsScalar())));
+    bool is_effective_high_pad = Match(
+        operands[1], m::Broadcast(m::Op().WithShape(m::Shape().IsScalar())));
     if (!is_effective_low_pad && !is_effective_high_pad) {
       return Status::OK();
     }
@@ -516,12 +521,24 @@ Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
     return ReplaceInstruction(
         constant, BuildTupleConstant(computation_, constant->literal()));
   }
+
+  // If a literal is all the same element replace it with a scalar broadcast.
+  if (ShapeUtil::ElementsIn(constant->shape()) > 1 &&
+      constant->literal().IsAllFirst()) {
+    std::unique_ptr<Literal> unique_scalar =
+        MakeUnique<Literal>(constant->literal().GetFirstScalarLiteral());
+    HloInstruction* scalar = computation_->AddInstruction(
+        HloInstruction::CreateConstant(std::move(unique_scalar)));
+    return ReplaceWithNewInstruction(
+        constant,
+        HloInstruction::CreateBroadcast(constant->shape(), scalar, {}));
+  }
   return Status::OK();
 }
 
 Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
-  auto lhs = sub->mutable_operand(0);
-  auto rhs = sub->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(sub, m::Subtract(m::Op(&lhs), m::Op(&rhs))));
   // A - 0 => A
   VLOG(10) << "trying transform [A - 0 => A]: " << sub->ToString();
   if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
@@ -530,7 +547,7 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
 
   // Canonicalize subtraction of a constant to addition.
   VLOG(10) << "trying transform [A - Const => A + (-Const)]";
-  if (rhs->IsConstant() && !lhs->IsConstant()) {
+  if (Match(sub, m::Subtract(m::NonConstant(&lhs), m::Constant(&rhs)))) {
     HloInstruction* negative_const = computation_->AddInstruction(
         HloInstruction::CreateUnary(rhs->shape(), HloOpcode::kNegate, rhs));
     return ReplaceWithNewInstruction(
@@ -542,56 +559,53 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
 }
 
 Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
-  auto lhs = divide->mutable_operand(0);
-  auto rhs = divide->mutable_operand(1);
+  Shape* shape;
+  HloInstruction *a, *b, *c, *d;
+  CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
   // A/1 => A
   VLOG(10) << "trying transform [A/1 => A]: " << divide->ToString();
-  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(divide, lhs)) {
+  if (IsAll(b, 1) && ReplaceInstructionIfSameShape(divide, a)) {
     return Status::OK();
   }
 
   // exp(A)/exp(B) => exp(A-B)
-  if (lhs->opcode() == HloOpcode::kExp && rhs->opcode() == HloOpcode::kExp) {
+  if (Match(divide, m::Divide(m::Exp(m::Op(&a)), m::Exp(m::Op(&b)))
+                        .WithShape(m::Shape(&shape)))) {
     VLOG(10) << "transform [exp(A)/exp(B) => exp(A-B)]: " << divide->ToString();
-    HloInstruction* subtract =
-        computation_->AddInstruction(HloInstruction::CreateBinary(
-            divide->shape(), HloOpcode::kSubtract, lhs->mutable_operand(0),
-            rhs->mutable_operand(0)));
+    HloInstruction* subtract = computation_->AddInstruction(
+        HloInstruction::CreateBinary(*shape, HloOpcode::kSubtract, a, b));
     return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp,
-                                            subtract));
+        divide, HloInstruction::CreateUnary(*shape, HloOpcode::kExp, subtract));
   }
 
   // A/exp(B) => A*exp(-B)
-  if (rhs->opcode() == HloOpcode::kExp) {
+  if (Match(divide, m::Divide(m::Op(&a), m::Exp(m::Op(&b))))) {
     VLOG(10) << "transform [A/exp(B) => A*exp(-B)]: " << divide->ToString();
-    HloInstruction* negate =
-        computation_->AddInstruction(HloInstruction::CreateUnary(
-            divide->shape(), HloOpcode::kNegate, rhs->mutable_operand(0)));
+    HloInstruction* negate = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kNegate, b));
     HloInstruction* new_exp = computation_->AddInstruction(
         HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp, negate));
     return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kMultiply, lhs, new_exp));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kMultiply, a, new_exp));
   }
 
   // A/pow(B,C) => A*pow(B,-C)
-  if (rhs->opcode() == HloOpcode::kPower) {
+  if (Match(divide, m::Divide(m::Op(&a), m::Power(m::Op(&b), m::Op(&c))))) {
     VLOG(10) << "transform [A/pow(B,C) => A*pow(B,-C)]: " << divide->ToString();
     // The output shape of the created negate operator should be the same as the
     // input.
-    const Shape& negate_shape = rhs->operand(1)->shape();
-    HloInstruction* negate =
-        computation_->AddInstruction(HloInstruction::CreateUnary(
-            negate_shape, HloOpcode::kNegate, rhs->mutable_operand(1)));
+    const Shape& negate_shape = c->shape();
+    HloInstruction* negate = computation_->AddInstruction(
+        HloInstruction::CreateUnary(negate_shape, HloOpcode::kNegate, c));
     // And the power operator should retain the output shape of the old one.
-    const Shape& new_power_shape = rhs->shape();
-    HloInstruction* new_power = computation_->AddInstruction(
-        HloInstruction::CreateBinary(new_power_shape, HloOpcode::kPower,
-                                     rhs->mutable_operand(0), negate));
+    const Shape& new_power_shape = b->shape();
+    HloInstruction* new_power =
+        computation_->AddInstruction(HloInstruction::CreateBinary(
+            new_power_shape, HloOpcode::kPower, b, negate));
     return ReplaceWithNewInstruction(
         divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kMultiply, lhs, new_power));
+                    divide->shape(), HloOpcode::kMultiply, a, new_power));
   }
 
   // Simplifying integral division would produce unexpected results.
@@ -603,65 +617,46 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   //
   // (Backends can do this transformation, but generally only if the constant is
   // a scalar.)
-  if (lhs->opcode() != HloOpcode::kConstant &&
-      rhs->opcode() == HloOpcode::kConstant) {
+  if (Match(divide, m::Divide(m::NonConstant(&a), m::Constant(&b)))) {
     HloInstruction* one =
         computation_->AddInstruction(HloInstruction::CreateConstant(
-            Literal::One(lhs->shape().element_type()).CloneToUnique()));
-    HloInstruction* inverse =
-        computation_->AddInstruction(HloInstruction::CreateBinary(
-            rhs->shape(), HloOpcode::kDivide, one, rhs));
+            Literal::One(a->shape().element_type()).CloneToUnique()));
+    HloInstruction* inverse = computation_->AddInstruction(
+        HloInstruction::CreateBinary(b->shape(), HloOpcode::kDivide, one, b));
     return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kMultiply, lhs, inverse));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kMultiply, a, inverse));
   }
 
   // (A / B) / (C / D)  =>  (A / B)*(D / C) => (A * D) / (B * C)
-  if (lhs->opcode() == HloOpcode::kDivide &&
-      rhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(
-        const Shape a_times_d_shape,
-        ShapeInference::InferBinaryOpShape(HloOpcode::kMultiply,
-                                           lhs->operand(0), rhs->operand(1)));
-    auto a_times_d = computation_->AddInstruction(HloInstruction::CreateBinary(
-        a_times_d_shape, HloOpcode::kMultiply, lhs->mutable_operand(0),
-        rhs->mutable_operand(1)));
-    TF_ASSIGN_OR_RETURN(
-        const Shape b_times_c_shape,
-        ShapeInference::InferBinaryOpShape(HloOpcode::kMultiply,
-                                           lhs->operand(1), rhs->operand(0)));
-    auto b_times_c = computation_->AddInstruction(HloInstruction::CreateBinary(
-        b_times_c_shape, HloOpcode::kMultiply, lhs->mutable_operand(1),
-        rhs->mutable_operand(0)));
-    return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kDivide, a_times_d, b_times_c));
+  if (Match(divide, m::Divide(m::Divide(m::Op(&a), m::Op(&b)),
+                              m::Divide(m::Op(&c), m::Op(&d))))) {
+    TF_ASSIGN_OR_RETURN(auto a_times_d,
+                        MakeBinaryHlo(HloOpcode::kMultiply, a, d));
+    TF_ASSIGN_OR_RETURN(auto b_times_c,
+                        MakeBinaryHlo(HloOpcode::kMultiply, b, c));
+    TF_ASSIGN_OR_RETURN(auto new_divide, MakeBinaryHlo(HloOpcode::kDivide,
+                                                       a_times_d, b_times_c));
+
+    return ReplaceInstruction(divide, new_divide);
   }
 
   // (A / B) / C => A / (B * C)
-  if (lhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(const Shape b_times_c_shape,
-                        ShapeInference::InferBinaryOpShape(
-                            HloOpcode::kMultiply, lhs->operand(1), rhs));
-    auto b_times_c = computation_->AddInstruction(HloInstruction::CreateBinary(
-        b_times_c_shape, HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
+  if (Match(divide, m::Divide(m::Divide(m::Op(&a), m::Op(&b)), m::Op(&c)))) {
+    TF_ASSIGN_OR_RETURN(auto b_times_c,
+                        MakeBinaryHlo(HloOpcode::kMultiply, b, c));
     return ReplaceWithNewInstruction(
-        divide,
-        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
-                                     lhs->mutable_operand(0), b_times_c));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kDivide, a, b_times_c));
   }
 
   // A / (B / C) => (A*C) / B
-  if (rhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(const Shape a_times_c_shape,
-                        ShapeInference::InferBinaryOpShape(
-                            HloOpcode::kMultiply, lhs, rhs->operand(1)));
-    auto a_times_c = computation_->AddInstruction(HloInstruction::CreateBinary(
-        a_times_c_shape, HloOpcode::kMultiply, lhs, rhs->mutable_operand(1)));
+  if (Match(divide, m::Divide(m::Op(&a), m::Divide(m::Op(&b), m::Op(&c))))) {
+    TF_ASSIGN_OR_RETURN(auto a_times_c,
+                        MakeBinaryHlo(HloOpcode::kMultiply, a, c));
     return ReplaceWithNewInstruction(
-        divide,
-        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
-                                     a_times_c, rhs->mutable_operand(0)));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kDivide, a_times_c, b));
   }
 
   return Status::OK();
@@ -669,8 +664,8 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
 
 StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     HloInstruction* dot) {
-  HloInstruction* lhs = dot->mutable_operand(0);
-  HloInstruction* rhs = dot->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
   int64 lhs_collapsing_dim =
       dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
   if (lhs->IsRank2Transpose()) {
@@ -787,8 +782,8 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcat(
 
   const int64 lhs_contracting_dim = dnums.lhs_contracting_dimensions(0);
   const int64 rhs_contracting_dim = dnums.rhs_contracting_dimensions(0);
-  HloInstruction* lhs = dot->mutable_operand(0);
-  HloInstruction* rhs = dot->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * optimized_lhs_concat,
@@ -918,8 +913,8 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
 }
 
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
-  auto lhs = dot->mutable_operand(0);
-  auto rhs = dot->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
 
   // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
   // below.
@@ -971,8 +966,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
 }
 
 Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
-  auto lhs = multiply->mutable_operand(0);
-  auto rhs = multiply->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(multiply, m::Multiply(m::Op(&lhs), m::Op(&rhs))));
   // A*1 => A
   VLOG(10) << "trying transform [A*1 => A]: " << multiply->ToString();
   if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(multiply, lhs)) {
@@ -985,10 +980,9 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
   }
 
   // exp(A) * exp(B) => exp(A+B)
-  if (lhs->opcode() == HloOpcode::kExp && rhs->opcode() == HloOpcode::kExp) {
+  if (Match(multiply, m::Multiply(m::Exp(m::Op(&lhs)), m::Exp(m::Op(&rhs))))) {
     auto add = computation_->AddInstruction(HloInstruction::CreateBinary(
-        multiply->shape(), HloOpcode::kAdd, lhs->mutable_operand(0),
-        rhs->mutable_operand(0)));
+        multiply->shape(), HloOpcode::kAdd, lhs, rhs));
     return ReplaceWithNewInstruction(
         multiply,
         HloInstruction::CreateUnary(multiply->shape(), HloOpcode::kExp, add));
@@ -999,20 +993,19 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
 Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) {
   // ln(exp(A)) => A
   VLOG(10) << "trying transform [ln(exp(A)) => A]: " << log->ToString();
-  auto operand = log->mutable_operand(0);
-  if (operand->opcode() == HloOpcode::kExp &&
-      ReplaceInstructionIfSameShape(log, operand->mutable_operand(0))) {
+  HloInstruction *a, *b;
+  if (Match(log, m::Log(m::Exp(m::Op(&a)))) &&
+      ReplaceInstructionIfSameShape(log, a)) {
     return Status::OK();
   }
 
   // ln(pow(A,B)) => B*ln(A)
-  if (operand->opcode() == HloOpcode::kPower) {
-    auto new_log = computation_->AddInstruction(HloInstruction::CreateUnary(
-        log->shape(), HloOpcode::kLog, operand->mutable_operand(0)));
+  if (Match(log, m::Log(m::Power(m::Op(&a), m::Op(&b))))) {
+    auto new_log = computation_->AddInstruction(
+        HloInstruction::CreateUnary(log->shape(), HloOpcode::kLog, a));
     return ReplaceWithNewInstruction(
-        log,
-        HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply,
-                                     new_log, operand->mutable_operand(1)));
+        log, HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply,
+                                          new_log, b));
   }
 
   return Status::OK();
@@ -1115,11 +1108,12 @@ bool OutputIsSubsetOfOperandElements(HloInstruction* instruction,
 }  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
-  auto operand = broadcast->mutable_operand(0);
+  HloInstruction* operand;
+  CHECK(Match(broadcast, m::Broadcast(m::Op(&operand))));
+  auto dims = broadcast->dimensions();
   // A degenerate broadcast of a reshape that does not change the number of
   // elements can be replaced by a reshape.
-  if (std::is_sorted(broadcast->dimensions().begin(),
-                     broadcast->dimensions().end()) &&
+  if (std::is_sorted(dims.begin(), dims.end()) &&
       ShapeUtil::ElementsIn(broadcast->shape()) ==
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> reshape(X) where "
@@ -1137,8 +1131,8 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
     VLOG(10) << "transform broadcast(X) -> transpose(X) where "
                 "n(broadcast(X)) == n(X)";
     return ReplaceWithNewInstruction(
-        broadcast, HloInstruction::CreateTranspose(broadcast->shape(), operand,
-                                                   broadcast->dimensions()));
+        broadcast,
+        HloInstruction::CreateTranspose(broadcast->shape(), operand, dims));
   }
 
   // A broadcast of a reshape which merely inserts 1-sized dimensions can
@@ -1152,7 +1146,6 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
     if (merely_inserts_or_deletes_1_sized_dimensions &&
         deleted_indices.empty()) {
       std::reverse(inserted_indices.begin(), inserted_indices.end());
-      auto dims = broadcast->dimensions();
       for (auto inserted_index : inserted_indices) {
         dims.erase(dims.begin() + inserted_index);
       }
@@ -1196,6 +1189,19 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
         return user->ReplaceAllUsesWith(new_broadcast);
       }
     }
+    return Status::OK();
+  }
+
+  // Merge two consecutive broadcasts into a single one.
+  if (operand->opcode() == HloOpcode::kBroadcast) {
+    std::vector<int64> new_dimensions;
+    for (auto dim : operand->dimensions()) {
+      new_dimensions.push_back(dims[dim]);
+    }
+    return ReplaceWithNewInstruction(
+        broadcast,
+        HloInstruction::CreateBroadcast(
+            broadcast->shape(), operand->mutable_operand(0), new_dimensions));
   }
   return Status::OK();
 }
@@ -1214,30 +1220,28 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert) {
 
 // Complex(Real(c), Imag(c)) -> c
 Status AlgebraicSimplifierVisitor::HandleComplex(HloInstruction* complex) {
-  auto real = complex->mutable_operand(0);
-  auto imag = complex->mutable_operand(1);
-  if (real->opcode() == HloOpcode::kReal &&
-      imag->opcode() == HloOpcode::kImag &&
-      real->operand(0) == imag->operand(0)) {
-    return ReplaceInstruction(complex, real->mutable_operand(0));
+  HloInstruction *c0, *c1;
+  if (Match(complex, m::Complex(m::Real(m::Op(&c0)), m::Imag(m::Op(&c1)))) &&
+      c0 == c1) {
+    return ReplaceInstruction(complex, c0);
   }
   return Status::OK();
 }
 
 // Real(Complex(r, i)) -> r
 Status AlgebraicSimplifierVisitor::HandleReal(HloInstruction* real) {
-  auto operand = real->mutable_operand(0);
-  if (operand->opcode() == HloOpcode::kComplex) {
-    return ReplaceInstruction(real, operand->mutable_operand(0));
+  HloInstruction* op;
+  if (Match(real, m::Real(m::Complex(m::Op(&op), m::Op())))) {
+    return ReplaceInstruction(real, op);
   }
   return Status::OK();
 }
 
 // Imag(Complex(r, i)) -> i
 Status AlgebraicSimplifierVisitor::HandleImag(HloInstruction* imag) {
-  auto operand = imag->mutable_operand(0);
-  if (operand->opcode() == HloOpcode::kComplex) {
-    return ReplaceInstruction(imag, operand->mutable_operand(1));
+  HloInstruction* op;
+  if (Match(imag, m::Imag(m::Complex(m::Op(), m::Op(&op))))) {
+    return ReplaceInstruction(imag, op);
   }
   return Status::OK();
 }
@@ -1290,17 +1294,14 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
         padding_dimension->set_edge_padding_high(0);
       }
     }
-    TF_ASSIGN_OR_RETURN(Shape nonzero_pad_shape,
-                        ShapeInference::InferPadShape(pad->operand(0)->shape(),
-                                                      pad->operand(1)->shape(),
-                                                      nonzero_padding));
+
+    TF_ASSIGN_OR_RETURN(HloInstruction * nonzero_pad,
+                        MakePadHlo(pad->mutable_operand(0),
+                                   pad->mutable_operand(1), nonzero_padding));
     // Copy the layout from the original pad instructions. The new pad and the
     // slice instruction should all have the same layout.
-    TF_RETURN_IF_ERROR(
-        LayoutUtil::CopyLayoutBetweenShapes(pad->shape(), &nonzero_pad_shape));
-    HloInstruction* nonzero_pad = computation_->AddInstruction(
-        HloInstruction::CreatePad(nonzero_pad_shape, pad->mutable_operand(0),
-                                  pad->mutable_operand(1), nonzero_padding));
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        pad->shape(), nonzero_pad->mutable_shape()));
 
     // Second, construct the slice instruction to perform the negative padding.
     std::vector<int64> start_indices;
@@ -1313,7 +1314,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
       if (padding_dimension.edge_padding_low() < 0) {
         start = -1 * padding_dimension.edge_padding_low();
       }
-      int64 end = nonzero_pad_shape.dimensions(i);
+      int64 end = nonzero_pad->shape().dimensions(i);
       if (padding_dimension.edge_padding_high() < 0) {
         end += padding_dimension.edge_padding_high();
       }
@@ -1322,16 +1323,14 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
       strides.push_back(1);
     }
 
-    // Verify that the slice shape matches the pad shape.
     TF_ASSIGN_OR_RETURN(
-        Shape inferred_slice_shape,
-        ShapeInference::InferSliceShape(nonzero_pad_shape, start_indices,
-                                        end_indices, strides));
-    TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape()));
+        HloInstruction * slice,
+        MakeSliceHlo(nonzero_pad, start_indices, end_indices, strides));
+
+    // Verify that the slice shape matches the pad shape.
+    TF_RET_CHECK(ShapeUtil::Compatible(slice->shape(), pad->shape()));
 
-    std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
-        pad->shape(), nonzero_pad, start_indices, end_indices, strides);
-    return ReplaceWithNewInstruction(pad, std::move(slice));
+    return ReplaceInstruction(pad, slice);
   }
 
   return Status::OK();
@@ -1339,8 +1338,8 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
 
 Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   VLOG(10) << "trying transform [pow(A, 0) => 1]: " << power->ToString();
-  auto lhs = power->mutable_operand(0);
-  auto rhs = power->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(power, m::Power(m::Op(&lhs), m::Op(&rhs))));
   if (IsAll(rhs, 0)) {
     auto one = HloInstruction::CreateConstant(
         Literal::One(power->shape().element_type()).CloneToUnique());
@@ -1360,9 +1359,10 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   }
 
   // pow(exp(A),B) => exp(A*B)
-  if (lhs->opcode() == HloOpcode::kExp) {
+  HloInstruction *a, *b;
+  if (Match(power, m::Power(m::Exp(m::Op(&a)), m::Op(&b)))) {
     auto a_times_b = computation_->AddInstruction(HloInstruction::CreateBinary(
-        power->shape(), HloOpcode::kMultiply, lhs->operands()[0], rhs));
+        power->shape(), HloOpcode::kMultiply, a, b));
     return ReplaceWithNewInstruction(
         power, HloInstruction::CreateUnary(power->shape(), HloOpcode::kExp,
                                            a_times_b));
@@ -1604,6 +1604,14 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
   if (IsAll(start_indices, 0) && SameShape(dynamic_update_slice, update)) {
     return ReplaceInstruction(dynamic_update_slice, update);
   }
+
+  // If any dimension of update is 0, elide the DynamicUpdateSlice.  This
+  // optimization becomes invalid should we later prefer to warn about out of
+  // bound indices.
+  if (ShapeUtil::HasZeroElements(update->shape())) {
+    return ReplaceInstruction(dynamic_update_slice,
+                              dynamic_update_slice->mutable_operand(0));
+  }
   return Status::OK();
 }
 
@@ -1686,7 +1694,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
         HloInstruction::CreateReshape(reduce->shape(), arg));
     return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateMap(reduce->shape(),
-                                          {reshape, init_value}, function));
+                                          {init_value, reshape}, function));
   }
   return Status::OK();
 }
@@ -1711,18 +1719,29 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                                   function));
   }
 
-  VLOG(10) << "Considering folding Pad: " << operand->ToString()
-           << "\ninto reduce-window: " << reduce_window->ToString();
-
   // This optimization folds a pad op into reduce_window.
-  if (operand->opcode() != HloOpcode::kPad) {
+  HloInstruction* pad;
+  const HloInstruction* convert = nullptr;
+  if (operand->opcode() == HloOpcode::kPad) {
+    pad = operand;
+  } else if (operand->opcode() == HloOpcode::kConvert &&
+             operand->operand(0)->opcode() == HloOpcode::kPad) {
+    convert = operand;
+    pad = operand->mutable_operand(0);
+  } else {
     VLOG(10) << "Not folding pad into reduce-window as there is no pad.";
     return Status::OK();
   }
 
+  VLOG(10) << "Considering folding Pad: " << pad->ToString()
+           << "\ninto reduce-window: " << reduce_window->ToString()
+           << (convert != nullptr ? tensorflow::strings::StrCat(
+                                        "\nvia convert: ", convert->ToString())
+                                  : "");
+
   // Do not fold interior padding into ReduceWindow since the backends do not
   // support it.
-  const PaddingConfig& pad_config = operand->padding_config();
+  const PaddingConfig& pad_config = pad->padding_config();
   if (HasInteriorPadding(pad_config)) {
     VLOG(10) << "Not folding pad into reduce-window due to interior padding.";
     return Status::OK();
@@ -1730,14 +1749,27 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   // If reduce_window already has padding, the pad value of the pad op and the
   // init value of reduce_window must match to allow folding the pad.
-  const HloInstruction* pad_value = operand->operand(1);
+  const HloInstruction* pad_value = pad->operand(1);
   const HloInstruction* reduce_init_value = reduce_window->operand(1);
   if (pad_value != reduce_init_value) {
+    auto literals_are_equivalent = [&] {
+      auto& pad_literal = pad_value->literal();
+      auto& reduce_init_literal = reduce_init_value->literal();
+      if (pad_literal == reduce_init_literal) {
+        return true;
+      }
+      auto converted_pad_literal = pad_literal.ConvertToShape(
+          reduce_init_value->shape(), /*round_f32_to_bf16=*/true);
+      if (!converted_pad_literal.ok()) {
+        return false;
+      }
+      return *converted_pad_literal.ValueOrDie() == reduce_init_literal;
+    };
     // The pad value is usually a constant, so we handle that case and do not
     // try to get more fancy about proving equivalence in cases beyond that.
     if (pad_value->opcode() != HloOpcode::kConstant ||
         reduce_init_value->opcode() != HloOpcode::kConstant ||
-        pad_value->literal() != reduce_init_value->literal()) {
+        !literals_are_equivalent()) {
       VLOG(10) << "Not folding pad into reduce-window due to different pad "
                   "values.";
       return Status::OK();
@@ -1746,7 +1778,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   // If the pad puts a single non-identity value in each window that we're
   // reducing, then this is a broadcast.
-  HloInstruction* pad_operand = operand->mutable_operand(0);
+  HloInstruction* pad_operand = pad->mutable_operand(0);
   auto is_effective_broadcast = [&] {
     if (window_util::HasStride(window)) {
       VLOG(10) << "Window has stride.";
@@ -1790,6 +1822,18 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     VLOG(10) << "Found window covers a single unpadded element.";
     return true;
   };
+
+  HloInstruction* new_reduce_window_operand;
+  if (convert != nullptr) {
+    new_reduce_window_operand =
+        computation_->AddInstruction(HloInstruction::CreateConvert(
+            ShapeUtil::ChangeElementType(pad_operand->shape(),
+                                         convert->shape().element_type()),
+            pad_operand));
+  } else {
+    new_reduce_window_operand = pad_operand;
+  }
+
   if (is_effective_broadcast()) {
     VLOG(10) << "Replacing pad/reduce-window with (implicit) broadcast.";
     auto fadd = [this](std::unique_ptr<HloInstruction> x) {
@@ -1798,7 +1842,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     return ReplaceWithNewInstruction(
         reduce_window, HloInstruction::CreateBroadcastSequence(
                            /*output_shape=*/reduce_window->shape(),
-                           /*operand=*/pad_operand, fadd));
+                           /*operand=*/new_reduce_window_operand, fadd));
   }
 
   // Carry out the folding of the pad into reduce_window.
@@ -1815,10 +1859,11 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     window_dim.set_padding_high(window_dim.padding_high() +
                                 pad_dim.edge_padding_high());
   }
+
   return ReplaceWithNewInstruction(
       reduce_window, HloInstruction::CreateReduceWindow(
                          /*shape=*/reduce_window->shape(),
-                         /*operand=*/pad_operand,
+                         /*operand=*/new_reduce_window_operand,
                          /*init_value=*/reduce_window->mutable_operand(1),
                          /*window=*/new_window,
                          /*reduce_computation=*/function));
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index 43315f5cdc7afbe79039420320f4a0d0535e11f1..c48196e861a559a5abfa360841ec70b39356fa2b 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -23,7 +23,7 @@ limitations under the License.
 
 namespace xla {
 
-// A pass which performs AlgebraicSimplications.
+// A pass which performs algebraic simplifications.
 class AlgebraicSimplifier : public HloPassInterface {
  public:
   // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
@@ -57,10 +57,10 @@ class AlgebraicSimplifier : public HloPassInterface {
   bool is_layout_sensitive_;
   ValidBitcastCallback valid_bitcast_callback_;
 
-  // Enable dot simplication on platforms where it is profitable.
+  // Enable dot simplification on platforms where it is profitable.
   bool enable_dot_strength_reduction_;
 
-  // Enable convolution simplication on platforms where it is profitable.
+  // Enable convolution simplification on platforms where it is profitable.
   bool enable_conv_simplification_;
 };
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 0f08eb3a3267c4b7b04958270a5788fc48d3fa04..d0c99bf818cd54b897ae9da6f9c46862254d64e5 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -35,6 +36,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
+using ::testing::ElementsAre;
+
 namespace xla {
 namespace {
 
@@ -162,6 +165,37 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   EXPECT_EQ(root, param0);
 }
 
+TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) {
+  HloComputation::Builder builder(TestName());
+  builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({3.14f, 3.14f, 3.14f})));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement<float>());
+}
+
+TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) {
+  HloComputation::Builder builder(TestName());
+  builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({3.14, 3.14, 4})));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+}
+
 // Test that A - 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, SubZero) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -1666,14 +1700,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
   builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {2, 2}), param, zero, no_padding));
 
-  HloModule module(TestName());
-  HloComputation* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1699,8 +1733,8 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {11, 5}), param, zero, padding));
 
-  HloModule module(TestName());
-  HloComputation* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
@@ -1718,7 +1752,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
   EXPECT_FALSE(
@@ -1733,14 +1767,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {2, 3}), param));
 
-  HloModule module(TestName());
-  HloComputation* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1756,14 +1790,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
       /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1}));
 
-  HloModule module(TestName());
-  HloComputation* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(param));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1891,12 +1925,12 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     b.AddInstruction(HloInstruction::CreateConvolve(out_shape, input, filter,
                                                     window, dnums));
 
-    HloModule module(TestName());
-    auto* computation = module.AddEntryComputation(b.Build());
+    auto module = CreateNewModule();
+    auto* computation = module->AddEntryComputation(b.Build());
 
     AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                    bitcasting_callback());
-    if (!simplifier.Run(&module).ValueOrDie()) {
+    if (!simplifier.Run(module.get()).ValueOrDie()) {
       return "NO_CHANGE";
     }
     auto* root = computation->root_instruction();
@@ -2011,15 +2045,15 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMaximum, min, max_value));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Minimum(param0, min_value), max_value));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2041,15 +2075,15 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2072,15 +2106,15 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, max, min_value));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2102,15 +2136,15 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
@@ -2134,8 +2168,8 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMinimum, fmax, min_value));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
@@ -2143,7 +2177,7 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
@@ -2168,8 +2202,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
       slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, slice);
@@ -2178,10 +2212,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
 
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(scalar_param));
@@ -2209,8 +2243,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   HloInstruction* reshape = builder.AddInstruction(
       HloInstruction::CreateReshape(reshape_shape, transpose));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reshape);
@@ -2218,7 +2252,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(forty_two));
@@ -2227,7 +2261,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
 TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
-  HloModule module(TestName());
+  auto module = CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2256,7 +2290,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module.AddEmbeddedComputation(builder.Build());
+    add_computation = module->AddEmbeddedComputation(builder.Build());
   }
 
   // Create the reduce-window.
@@ -2279,15 +2313,15 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
           add_computation));
 
   // Build the computation and run the simplifier.
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   // Verify the result
   root = computation->root_instruction();
@@ -2305,6 +2339,91 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
 }
 
+// Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to
+// ReduceWindow(Convert(op), x).
+TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
+  auto module = CreateNewModule();
+  HloComputation::Builder builder(TestName());
+
+  // Create operand to the pad.
+  HloInstruction* parameter =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(BF16, {1, 2, 3, 4}), "p0"));
+
+  // Create the pad.
+  PaddingConfig padding = MakeNoPaddingConfig(4);
+  padding.mutable_dimensions(1)->set_edge_padding_low(1);
+  padding.mutable_dimensions(3)->set_edge_padding_high(2);
+
+  HloInstruction* pad_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(BF16, {1, 3, 3, 5}), parameter, pad_value, padding));
+
+  HloInstruction* convert =
+      builder.AddInstruction(HloInstruction::CreateConvert(
+          ShapeUtil::ChangeElementType(pad->shape(), F32), pad));
+
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module->AddEmbeddedComputation(builder.Build());
+  }
+
+  // Create the reduce-window.
+  Window window;
+  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    auto* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(10);
+    dim->set_padding_high(100);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  const Shape reduce_window_shape =
+      ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
+  HloInstruction* reduce_init_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
+  HloInstruction* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          reduce_window_shape, convert, reduce_init_value, window,
+          add_computation));
+
+  // Build the computation and run the simplifier.
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, reduce_window);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  // Running simplification again should not result in any further changes.
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+
+  // Verify the result
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant()));
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
+      << ShapeUtil::HumanString(root->shape()) << " vs "
+      << ShapeUtil::HumanString(reduce_window_shape);
+  EXPECT_EQ(root->window().dimensions(0).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(1).padding_low(), 11);
+  EXPECT_EQ(root->window().dimensions(2).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(3).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(0).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(1).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(2).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
+}
+
 TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   HloComputation::Builder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {448, 2048, 1, 1});
@@ -2313,12 +2432,12 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   builder.AddInstruction(
       HloInstruction::CreateReverse(shape, a, /*dimensions=*/{2, 3}));
 
-  HloModule module(TestName());
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(a, root);
@@ -2431,6 +2550,55 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
 
+// Test that two consecutive broadcasts can be merged to one.
+TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
+  HloComputation::Builder builder(TestName());
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
+  HloInstruction* input_array = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({3, 4})));
+  HloInstruction* inner_bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r2f32, input_array, {1}));
+  Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r3f32, inner_bcast, {0, 2}));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root->dimensions(), ElementsAre(2));
+}
+
+// Test that two consecutive broadcasts can be merged to one.
+TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
+  HloComputation::Builder builder(TestName());
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 3});
+  Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3});
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  // The initial dimensions go to places 0 and 2 in the 3-dim array,
+  // and to places 1 and 3 in the 4-dim array,
+  HloInstruction* inner_bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r3f32, param0, {0, 2}));
+  Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 2, 5, 3});
+  builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r4f32, inner_bcast, {1, 2, 3}));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Broadcast(op::Parameter(0)));
+  EXPECT_THAT(root->dimensions(), ElementsAre(1, 3));
+}
+
 struct PadReduceWindowEffectiveBroadcastCase {
   std::vector<int64> input_spatials;
   std::vector<int64> symmetric_pad_spatials;
@@ -2769,6 +2937,29 @@ DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
     {/*m=*/1, /*k=*/16, /*n=*/1},   //
 };
 
+// Test that DynamicUpdateSlice update param with any dimension equal to zero
+// gets removed.
+TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
+  HloComputation::Builder builder(TestName());
+  const Shape dslice_shape = ShapeUtil::MakeShape(F32, {10});
+  HloInstruction* const operand = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, dslice_shape, "operand"));
+  const Shape update_shape = ShapeUtil::MakeShape(F32, {0});
+  HloInstruction* const update = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, update_shape, "update"));
+  HloInstruction* const start_indices = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int>({0})));
+  builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      dslice_shape, operand, update, start_indices));
+  const HloComputation* const computation =
+      module().AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(), operand);
+}
+
 INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation,
                         DotOfConcatSimplificationTest,
                         ::testing::ValuesIn(kDotOfConcatTestSpecs));
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 4e80679c11dfdf7fdf8077a9f354139a4cab6803..cf1231bcce4d004284b71a49063e3e470a9eb93f 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -31,43 +31,73 @@ limitations under the License.
 namespace xla {
 
 StatusOr<GlobalDataHandle> AllocationTracker::Register(
-    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
+    ScopedShapedBuffer shaped_buffer, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  return RegisterInternal(std::move(shaped_buffer), tag);
+  std::vector<ScopedShapedBuffer> replicated_buffers;
+  replicated_buffers.emplace_back(std::move(shaped_buffer));
+  return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
+StatusOr<GlobalDataHandle> AllocationTracker::RegisterReplicatedBuffers(
+    std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag) {
+  tensorflow::mutex_lock lock(mutex_);
+  VLOG(2) << "RegisterReplicatedBuffers";
+  return RegisterInternal(std::move(replicated_buffers), tag);
+}
+
+// ReleaseIfScopedShapedBuffer lets RegisterInternal<ShapedBufferTy>(b) call
+// b.release() if b is a ScopedShapedBuffer, or otherwise pass b through
+// unmodified.
+static ShapedBuffer ReleaseIfScopedShapedBuffer(ShapedBuffer b) { return b; }
+static ShapedBuffer ReleaseIfScopedShapedBuffer(ScopedShapedBuffer b) {
+  return b.release();
+}
+
+template <typename ShapedBufferTy>
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
-    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
+    std::vector<ShapedBufferTy> replicated_buffers, const string& tag) {
+  static_assert(std::is_same<ShapedBufferTy, ShapedBuffer>::value ||
+                    std::is_same<ShapedBufferTy, ScopedShapedBuffer>::value,
+                "ShapedBufferTy must be ShapedBuffer or ScopedShapedBuffer.");
   VLOG(2) << "RegisterInternal("
-          << "tag: \"" << tag << "\" "
-          << "shaped_buffer: " << *shaped_buffer;
-  if (shaped_buffer->platform() != backend_->platform()) {
-    return InvalidArgument(
-        "AllocationTracker for platform %s cannot register buffer from "
-        "platform %s",
-        backend_->platform()->Name().c_str(),
-        shaped_buffer->platform()->Name().c_str());
+          << "tag: \"" << tag << "\" with " << replicated_buffers.size()
+          << " shaped_buffers.";
+  for (const auto& shaped_buffer : replicated_buffers) {
+    VLOG(2) << "shaped_buffer:" << shaped_buffer;
+    if (shaped_buffer.platform() != backend_->platform()) {
+      return InvalidArgument(
+          "AllocationTracker for platform %s cannot register buffer from "
+          "platform %s",
+          backend_->platform()->Name().c_str(),
+          shaped_buffer.platform()->Name().c_str());
+    }
   }
 
   int64 handle = next_handle_++;
-  std::vector<ShapeIndex> shape_indices;
-  ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
-                             [this, &shape_indices](const Shape& /*subshape*/,
-                                                    const ShapeIndex& index) {
-                               shape_indices.push_back(index);
-                             });
-  for (const ShapeIndex& index : shape_indices) {
-    AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index),
-                                     shaped_buffer->device_ordinal());
+  for (auto& shaped_buffer : replicated_buffers) {
+    std::vector<ShapeIndex> shape_indices;
+    ShapeUtil::ForEachSubshape(
+        shaped_buffer.on_device_shape(),
+        [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          shape_indices.push_back(index);
+        });
+    // Add shaped_buffer's buffers to opaque_to_allocation_map_, which owns
+    // them.
+    for (const ShapeIndex& index : shape_indices) {
+      AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index),
+                                       shaped_buffer.device_ordinal());
+    }
+    // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer
+    // into a regular ShapedBuffer, which is stored in
+    // handle_to_shaped_buffers_.
+    handle_to_shaped_buffers_[handle].emplace_back(MakeUnique<ShapedBuffer>(
+        ReleaseIfScopedShapedBuffer(std::move(shaped_buffer))));
   }
+
   GlobalDataHandle result;
   result.set_handle(handle);
-
-  handle_to_shaped_buffer_[handle] = std::move(shaped_buffer);
-
   VLOG(2) << "handle: " << handle;
-
   return result;
 }
 
@@ -75,23 +105,31 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Unregister("
           << "handle: " << data.handle() << ")";
-  TF_ASSIGN_OR_RETURN(ShapedBuffer * shaped_buffer, ResolveInternal(data));
-  std::vector<ShapeIndex> shape_indices;
-  ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
-                             [this, &shape_indices](const Shape& /*subshape*/,
-                                                    const ShapeIndex& index) {
-                               shape_indices.push_back(index);
-                             });
-  for (const ShapeIndex& index : shape_indices) {
-    TF_RETURN_IF_ERROR(DecrementRefCount(shaped_buffer->buffer(index),
-                                         shaped_buffer->device_ordinal()));
+  TF_ASSIGN_OR_RETURN(std::vector<const ShapedBuffer*> replicated_buffers,
+                      ResolveInternal(data));
+  for (const auto& shaped_buffer : replicated_buffers) {
+    std::vector<ShapeIndex> shape_indices;
+    ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+                               [this, &shape_indices](const Shape& /*subshape*/,
+                                                      const ShapeIndex& index) {
+                                 shape_indices.push_back(index);
+                               });
+    for (const ShapeIndex& index : shape_indices) {
+      TF_RETURN_IF_ERROR(DecrementRefCount(shaped_buffer->buffer(index),
+                                           shaped_buffer->device_ordinal()));
+    }
+  }
+  // Keep a nullptr as a tombstone for unregistered handles. This enables
+  // better error messages. That is, "handle has been deallocated" versus
+  // "handle does not exist".
+  auto it = handle_to_shaped_buffers_.find(data.handle());
+  if (it == handle_to_shaped_buffers_.end()) {
+    return NotFound("no allocation record for global data handle: %lld",
+                    data.handle());
+  }
+  for (auto& shaped_buffer : it->second) {
+    shaped_buffer.reset();
   }
-
-  // Keep a nullptr as a tombstone for unregistered handles. This enables better
-  // error messages. That is, "handle has been deallocated" versus "handle does
-  // not exist".
-  handle_to_shaped_buffer_.at(data.handle()).reset();
-
   return tensorflow::Status::OK();
 }
 
@@ -99,7 +137,11 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
     const GlobalDataHandle& data) {
   tensorflow::mutex_lock lock(mutex_);
 
-  TF_ASSIGN_OR_RETURN(ShapedBuffer * shaped_buffer, ResolveInternal(data));
+  TF_ASSIGN_OR_RETURN(std::vector<const ShapedBuffer*> replicated_buffers,
+                      ResolveInternal(data));
+  // We only need to care about replica id 0 here, since the GlobalDataHandle is
+  // the same for all buffers across replicas.
+  const ShapedBuffer* shaped_buffer = replicated_buffers[0];
   if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) {
     return InvalidArgument("global data handle %lld is not a tuple",
                            data.handle());
@@ -109,54 +151,72 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
   TF_RET_CHECK(ShapeUtil::IsTuple(shaped_buffer->on_device_shape()));
 
   if (ShapeUtil::IsNestedTuple(shaped_buffer->on_device_shape())) {
-    return Unimplemented("deconstructing nested tuples not yet supported");
+    return Unimplemented("Deconstructing nested tuples is not implemented.");
   }
 
   std::vector<GlobalDataHandle> element_handles;
   for (int i = 0;
        i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape());
        ++i) {
-    auto element_buffer = MakeUnique<ShapedBuffer>(
+    auto element_buffer = ShapedBuffer(
         ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i),
         ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i),
         shaped_buffer->platform(), shaped_buffer->device_ordinal());
-    element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}),
-                               /*index=*/{});
+    element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}),
+                              /*index=*/{});
+    std::vector<ShapedBuffer> replicated_buffers;
+    replicated_buffers.push_back(std::move(element_buffer));
     TF_ASSIGN_OR_RETURN(
         GlobalDataHandle element_handle,
-        RegisterInternal(std::move(element_buffer), "deconstructed tuple"));
+        RegisterInternal(std::move(replicated_buffers), "deconstructed tuple"));
 
     element_handles.push_back(element_handle);
   }
   return std::move(element_handles);
 }
 
-StatusOr<const ShapedBuffer*> AllocationTracker::Resolve(
+StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::Resolve(
     const GlobalDataHandle& data) {
   tensorflow::mutex_lock lock(mutex_);
   return AllocationTracker::ResolveInternal(data);
 }
 
-StatusOr<ShapedBuffer*> AllocationTracker::ResolveInternal(
+StatusOr<const ShapedBuffer*> AllocationTracker::ResolveForReplica(
+    const GlobalDataHandle& data, int replica_id) {
+  tensorflow::mutex_lock lock(mutex_);
+  TF_ASSIGN_OR_RETURN(std::vector<const ShapedBuffer*> replicated_buffers,
+                      ResolveInternal(data));
+  if (replica_id >= replicated_buffers.size()) {
+    return InvalidArgument(
+        "Requesting buffer for replica %d, but found buffers only for %lu "
+        "replicas.",
+        replica_id, replicated_buffers.size());
+  }
+  return replicated_buffers[replica_id];
+}
+
+StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
     const GlobalDataHandle& data) {
   VLOG(2) << "resolve:" << data.handle();
-  auto it = handle_to_shaped_buffer_.find(data.handle());
-  if (it == handle_to_shaped_buffer_.end()) {
+  auto it = handle_to_shaped_buffers_.find(data.handle());
+  if (it == handle_to_shaped_buffers_.end()) {
     return NotFound("no allocation record for global data handle: %lld",
                     data.handle());
   }
-  ShapedBuffer* shaped_buffer = it->second.get();
-
-  if (shaped_buffer == nullptr) {
-    return InvalidArgument("global data handle %lld was previously deallocated",
-                           data.handle());
+  std::vector<const ShapedBuffer*> replicated_buffers;
+  for (const auto& shaped_buffer : it->second) {
+    if (shaped_buffer == nullptr) {
+      return InvalidArgument(
+          "global data handle %lld was previously deallocated", data.handle());
+    }
+    replicated_buffers.push_back(shaped_buffer.get());
   }
 
-  return shaped_buffer;
+  return replicated_buffers;
 }
 
 void AllocationTracker::AddAllocationOrIncrementRefCount(
-    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+    se::DeviceMemoryBase device_memory, int device_ordinal) {
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   if (it == allocation_map.end()) {
@@ -167,8 +227,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
   }
 }
 
-Status AllocationTracker::DecrementRefCount(
-    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
+                                            int device_ordinal) {
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   TF_RET_CHECK(it != allocation_map.end());
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 807af8694972083d097604a67ee46d2f73d9545a..1174fa641c06ae053bcc652416bfbc30cabc777c 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -43,9 +43,15 @@ class AllocationTracker {
   AllocationTracker(Backend* backend) : backend_(backend), next_handle_(1) {}
 
   // Registers a shaped buffer of device memory, and returns a corresponding
-  // handle that can be used for talking to XLA clients.
-  StatusOr<GlobalDataHandle> Register(
-      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag);
+  // handle that can be used for talking to XLA clients. The given shaped buffer
+  // will be treated as the buffer corresponding to the only replica.
+  StatusOr<GlobalDataHandle> Register(ScopedShapedBuffer shaped_buffer,
+                                      const string& tag);
+
+  // Registers a vector of shaped buffers of device memory, one per replica, and
+  // returns a corresponding handle that can be used for talking to XLA clients.
+  StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
+      std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag);
 
   // Unregister the allocation for the given data handle.
   Status Unregister(const GlobalDataHandle& data);
@@ -54,15 +60,23 @@ class AllocationTracker {
   StatusOr<std::vector<GlobalDataHandle>> DeconstructTuple(
       const GlobalDataHandle& Data);
 
-  // Resolve a handle from an XLA client to a shaped buffer, or provide an error
-  // status to say whether it was not found (or found, but found deallocated).
-  StatusOr<const ShapedBuffer*> Resolve(const GlobalDataHandle& data);
+  // Resolve a handle from an XLA client to a vector of shaped buffers, one per
+  // replica, or provide an error status to say whether any of those buffers
+  // were not found (or found, but found deallocated).
+  StatusOr<std::vector<const ShapedBuffer*>> Resolve(
+      const GlobalDataHandle& data);
+
+  // Resolves a handle from an XLA client and replica id to a shaped buffer, or
+  // provide an error status to say whether it was not found (or found, but
+  // found deallocated).
+  StatusOr<const ShapedBuffer*> ResolveForReplica(const GlobalDataHandle& data,
+                                                  int replica_id);
 
  private:
   // Data structure encapsulating single memory allocation on the device.
   struct Allocation {
     // The pointer to this allocation.
-    perftools::gputools::DeviceMemoryBase device_memory;
+    se::DeviceMemoryBase device_memory;
 
     // The device that the memory is allocated on.
     int device_ordinal;
@@ -73,24 +87,28 @@ class AllocationTracker {
   };
 
   // Internal helper which resolves the given GlobalDataHandle to a
-  // ShapedBuffer.
-  StatusOr<ShapedBuffer*> ResolveInternal(const GlobalDataHandle& data)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Internal helper which registers a shaped buffer.
+  // list of ScopedShapedBuffers.
+  StatusOr<std::vector<const ShapedBuffer*>> ResolveInternal(
+      const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Internal helper which registers a vector of shaped buffers, one per
+  // replica.  ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer.  If
+  // it's ShapedBuffer, all of the given buffers must already be tracked by this
+  // object -- presumably this is a call from DeconstructTuple.
+  template <typename ShapedBufferTy>
   StatusOr<GlobalDataHandle> RegisterInternal(
-      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag)
+      std::vector<ShapedBufferTy> replicated_buffers, const string& tag)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Adds the given device address to the allocation tracker, or if it already
-  // exists, then increment it's reference count.
-  void AddAllocationOrIncrementRefCount(
-      perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
+  // exists, then increment its reference count.
+  void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
+                                        int device_ordinal)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Decrements the reference count of the given device memory. Then, if it is
   // zero, deallocate the memory.
-  Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
+  Status DecrementRefCount(se::DeviceMemoryBase device_memory,
                            int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // A map from device memory opaque value to allocation. One such map is
@@ -111,9 +129,25 @@ class AllocationTracker {
   tensorflow::gtl::FlatMap<int, AllocationMap> opaque_to_allocation_map_
       GUARDED_BY(mutex_);
 
-  // A map from data handle to ShapedBuffer.
-  tensorflow::gtl::FlatMap<int64, std::unique_ptr<ShapedBuffer>>
-      handle_to_shaped_buffer_ GUARDED_BY(mutex_);
+  // A map from data handle to a vector of shaped buffers that represent the
+  // buffers for different replicas.
+  //
+  // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our
+  // public API returns pointers to them.  We expect the concrete class to be
+  // ShapedBuffer and never ScopedShapedBuffer; deallocation of buffers is
+  // handled by opaque_to_allocation_map_.
+  //
+  // The elements of the vectors need to be unique_ptrs because we return
+  // pointers to them.  (In theory we could use std::list or something instead,
+  // but we also want to be able to null out these elements.)
+  //
+  // The reason that the elements can't be unique_ptr<ScopedShapedBuffer>s is
+  // the existence of DeconstructTuple().  This function allows us to create a
+  // non-owning "view" into a tuple's sub-buffers.  The sub-buffers are then
+  // free'd when both the view *and* the original tuple are Unregistered.  This
+  // refcounting is managed in opaque_to_allocation_map_.
+  tensorflow::gtl::FlatMap<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
+      handle_to_shaped_buffers_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(AllocationTracker);
 };
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 05f2d062784147108a94ffb7bb0ca42ddfe4f010..349b32451a697dbd6804b44cd1a36419c753bb14 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -31,24 +31,20 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
-BackendOptions& BackendOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+BackendOptions& BackendOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* BackendOptions::platform() const {
-  return platform_;
-}
+se::Platform* BackendOptions::platform() const { return platform_; }
 
 BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
     int num_threads) {
@@ -77,7 +73,7 @@ struct Backend::EigenThreadPoolWrapper {
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
     const BackendOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
@@ -121,7 +117,7 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
 }
 
 Backend::Backend(
-    perftools::gputools::Platform* platform, Compiler* compiler,
+    se::Platform* platform, Compiler* compiler,
     tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
     TransferManager* transfer_manager, ComputationPlacer* computation_placer,
     int intra_op_parallelism_threads)
@@ -142,9 +138,6 @@ Backend::Backend(
       << "Service found no devices for backend " << platform_->Name() << '.';
 
   if (platform->id() == se::host::kHostPlatformId) {
-    inter_op_thread_pool_.reset(new tensorflow::thread::ThreadPool(
-        tensorflow::Env::Default(), "xla_inter_op",
-        tensorflow::port::NumSchedulableCPUs()));
     const int num_threads = intra_op_parallelism_threads > 0
                                 ? intra_op_parallelism_threads
                                 : tensorflow::port::NumSchedulableCPUs();
@@ -159,10 +152,6 @@ int Backend::default_device_ordinal() const {
   return default_stream_executor()->device_ordinal();
 }
 
-tensorflow::thread::ThreadPool* Backend::inter_op_thread_pool() const {
-  return inter_op_thread_pool_.get();
-}
-
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
   if (intra_op_thread_pool_wrapper_ == nullptr) {
@@ -178,7 +167,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
   return intra_op_thread_pool_wrapper_->pool.get();
 }
 
-StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
+StatusOr<se::StreamExecutor*> Backend::stream_executor(
     int device_ordinal) const {
   if (device_ordinal < 0 ||
       device_ordinal > stream_executors_.back()->device_ordinal()) {
@@ -201,9 +190,9 @@ StatusOr<bool> Backend::devices_equivalent(int device_ordinal_a,
   // bit crude but works for GPUs which is the important case where we compile
   // an executable for one GPU and want to know if it will run (well) on
   // another.
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a,
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a,
                       stream_executor(device_ordinal_a));
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b,
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b,
                       stream_executor(device_ordinal_b));
   return (executor_a->GetDeviceDescription().name() ==
           executor_b->GetDeviceDescription().name());
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index b5ca483b7274d20c31e932d748b6a4c9dea926f9..6546602473e3381cf13879ddebd05d34d1f7a055 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -44,8 +44,8 @@ namespace xla {
 class BackendOptions {
  public:
   // Set the platform backing the backend, or nullptr for the default platform.
-  BackendOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  BackendOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Sets the thread pool size for parallel execution of an individual operator.
   // The default value of -1 will result in initializing the thread pool with
@@ -54,7 +54,7 @@ class BackendOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
+  se::Platform* platform_ = nullptr;
   int intra_op_parallelism_threads_ = -1;
 };
 
@@ -66,7 +66,7 @@ class BackendOptions {
 //    StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
 class Backend {
  public:
-  using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
+  using StreamPtr = Pool<se::Stream>::SmartPtr;
 
   // Creates a new backend.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
@@ -79,7 +79,7 @@ class Backend {
   ~Backend();
 
   // Accessors for the various objects.
-  perftools::gputools::Platform* platform() const { return platform_; }
+  se::Platform* platform() const { return platform_; }
   Compiler* compiler() const { return compiler_; }
   DeviceMemoryAllocator* memory_allocator() const {
     return memory_allocator_.get();
@@ -96,19 +96,17 @@ class Backend {
 
   // Returns stream executors of all supported devices for this backend. The
   // executors are ordered by the device ordinal.
-  const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
-      const {
+  const std::vector<se::StreamExecutor*>& stream_executors() const {
     return stream_executors_;
   }
 
   // Returns the stream executor for the given device ordinal.
-  StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
-      int device_ordinal) const;
+  StatusOr<se::StreamExecutor*> stream_executor(int device_ordinal) const;
 
   // Returns the stream executor for the default device ordinal. This stream
   // executor can only be used when the number of computations is 1 (replication
   // can be > 1).
-  perftools::gputools::StreamExecutor* default_stream_executor() const {
+  se::StreamExecutor* default_stream_executor() const {
     CHECK(!stream_executors_.empty());
     return stream_executors_[0];
   }
@@ -117,8 +115,7 @@ class Backend {
   // internal pool, or by constructing/initializating it, and returns the result
   // to the caller.
   StatusOr<StreamPtr> BorrowStream(int device_ordinal);
-  StatusOr<StreamPtr> BorrowStream(
-      perftools::gputools::StreamExecutor* executor);
+  StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);
 
   // Returns a function to borrow a stream, as `BorrowStream` above does.
   // Purely for convenience, the caller could rather make this anonymous
@@ -143,10 +140,6 @@ class Backend {
   // be equivalent to an executable compiled for the other.
   StatusOr<bool> devices_equivalent(int device_ordinal_a, int device_ordinal_b);
 
-  // For the host platform, returns the threadpool to use when scheduling
-  // parallel operators. For other platforms, returns NULL.
-  tensorflow::thread::ThreadPool* inter_op_thread_pool() const;
-
   // For the host platform, returns the configured eigen threadpool device to be
   // used for scheduling work. For other platforms, returns NULL.
   const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const;
@@ -157,36 +150,30 @@ class Backend {
 
  private:
   struct EigenThreadPoolWrapper;
-  Backend(perftools::gputools::Platform* platform, Compiler* compiler,
-          tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-              stream_executors,
+  Backend(se::Platform* platform, Compiler* compiler,
+          tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
           TransferManager* transfer_manager,
           ComputationPlacer* computation_placer,
           int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
 
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
   Compiler* compiler_;
   TransferManager* transfer_manager_;
   ComputationPlacer* computation_placer_;
 
   // Vector of stream executors. stream_executors_[0] is the default executor.
-  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
+  std::vector<se::StreamExecutor*> stream_executors_;
 
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<perftools::gputools::StreamExecutor*,
-           Pool<perftools::gputools::Stream>>
-      stream_pools_ GUARDED_BY(mu_);
+  std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
 
-  // For the CPU backend, a threadpool for scheduling parallel operators.
-  std::unique_ptr<tensorflow::thread::ThreadPool> inter_op_thread_pool_;
-
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
   std::unique_ptr<EigenThreadPoolWrapper> intra_op_thread_pool_wrapper_;
 };
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 27ddfd47aa3096afd3e245af1ac3cedd9b48ce4a..38086bd7e121847be6b6b69415cfe87814e7fc24 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
-#include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -153,6 +152,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
   std::vector<HloInstruction*> added_instructions;
   auto add = [&](std::unique_ptr<HloInstruction> inst) {
     HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_inst->set_metadata(batch_norm->metadata());
     added_instructions.push_back(added_inst);
     return added_inst;
   };
@@ -334,6 +334,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
   std::vector<HloInstruction*> added_instructions;
   auto add = [&](std::unique_ptr<HloInstruction> inst) {
     HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_inst->set_metadata(batch_norm->metadata());
     added_instructions.push_back(added_inst);
     return added_inst;
   };
@@ -419,6 +420,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
   std::vector<HloInstruction*> added_instructions;
   auto add = [&](std::unique_ptr<HloInstruction> inst) {
     HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_inst->set_metadata(batch_norm->metadata());
     added_instructions.push_back(added_inst);
     return added_inst;
   };
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
index cde990e176ddb57a8e93ecc3c60260b2dbae32a8..08d0152e3cfcfcb7ae1e85f72c2f7dc856f5e8b3 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
@@ -34,6 +34,9 @@ class BFloat16ConversionFoldingVisitor : public DfsHloVisitorWithDefault {
 
   Status DefaultAction(HloInstruction* hlo) override;
 
+  // Special handling for cross-replica-sum which can have a tuple output.
+  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+
   static bool Run(HloComputation* computation,
                   const BFloat16Support* bfloat16_support) {
     BFloat16ConversionFoldingVisitor visitor(computation, bfloat16_support);
@@ -84,6 +87,25 @@ Status BFloat16ConversionFoldingVisitor::FoldOperandConversion(
   return Status::OK();
 }
 
+namespace {
+
+// Returns whether hlo has users and all users are conversions from F32 to BF16.
+bool AllUsersAreF32ToBF16Converts(const HloInstruction* hlo) {
+  if (hlo->user_count() == 0 || hlo->shape().element_type() != F32) {
+    return false;
+  }
+  for (const auto user : hlo->users()) {
+    if (user->opcode() == HloOpcode::kConvert &&
+        user->shape().element_type() == BF16) {
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
 Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions(
     HloInstruction* hlo) {
   std::vector<int64> bf16_to_f32_operands;
@@ -104,22 +126,9 @@ Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions(
     }
   }
 
-  bool fold_output_conversion = hlo->user_count() > 0 &&
-                                hlo->shape().element_type() == F32 &&
-                                bfloat16_support_->SupportsBF16Output(*hlo) &&
-                                hlo != computation_->root_instruction();
-  if (fold_output_conversion) {
-    for (auto user : hlo->users()) {
-      if (user->opcode() == HloOpcode::kConvert &&
-          user->shape().element_type() == BF16) {
-        continue;
-      }
-      // We should not change the output type if any user is not a conversion
-      // from F32 to BF16.
-      fold_output_conversion = false;
-      break;
-    }
-  }
+  const bool fold_output_conversion =
+      AllUsersAreF32ToBF16Converts(hlo) &&
+      bfloat16_support_->SupportsBF16Output(*hlo);
 
   if (!bfloat16_support_->SupportsMixedPrecisions(*hlo)) {
     if (has_other_f32_operands ||
@@ -147,6 +156,10 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
       hlo->opcode() == HloOpcode::kGetTupleElement ||  //
       hlo->opcode() == HloOpcode::kInfeed ||           //
       hlo->opcode() == HloOpcode::kOutfeed ||          //
+      hlo->opcode() == HloOpcode::kSend ||             //
+      hlo->opcode() == HloOpcode::kSendDone ||         //
+      hlo->opcode() == HloOpcode::kRecv ||             //
+      hlo->opcode() == HloOpcode::kRecvDone ||         //
       hlo->opcode() == HloOpcode::kConstant ||         //
       hlo->opcode() == HloOpcode::kParameter ||        //
       hlo->opcode() == HloOpcode::kFusion ||           //
@@ -167,6 +180,52 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
   return TryFoldBF16Conversions(hlo);
 }
 
+Status BFloat16ConversionFoldingVisitor::HandleCrossReplicaSum(
+    HloInstruction* crs) {
+  if (!ShapeUtil::IsTuple(crs->shape()) ||
+      !bfloat16_support_->SupportsMixedPrecisions(*crs)) {
+    return DefaultAction(crs);
+  }
+
+  // First use DefaultAction() to handle the operands. It can't handle
+  // tuple-shaped output.
+  TF_RETURN_IF_ERROR(DefaultAction(crs));
+
+  // Then do per-tuple-element handling on the output.
+  std::vector<std::vector<HloInstruction*>> per_tuple_element_gtes(
+      crs->operand_count());
+  for (auto user : crs->users()) {
+    if (user->opcode() != HloOpcode::kGetTupleElement) {
+      return Status::OK();
+    }
+    per_tuple_element_gtes[user->tuple_index()].push_back(user);
+  }
+
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    // Fold conversions only when all the get-tuple-elements' users are
+    // conversions from F32 to BF16.
+    auto all_gte_users_are_bf16_convert = [&per_tuple_element_gtes, i]() {
+      for (auto gte : per_tuple_element_gtes[i]) {
+        if (!AllUsersAreF32ToBF16Converts(gte)) {
+          return false;
+        }
+      }
+      return true;
+    };
+    if (!all_gte_users_are_bf16_convert()) {
+      continue;
+    }
+
+    ShapeUtil::GetMutableSubshape(crs->mutable_shape(), {i})
+        ->set_element_type(BF16);
+    for (auto gte : per_tuple_element_gtes[i]) {
+      TF_RETURN_IF_ERROR(FoldOutputConversions(gte));
+    }
+  }
+
+  return Status::OK();
+}
+
 StatusOr<bool> BFloat16ConversionFolding::Run(HloModule* module) {
   XLA_VLOG_LINES(
       2, "BFloat16ConversionFolding::Run(), before:\n" + module->ToString());
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index cb37759439debf41a305ec7dccaa548e1bf234cd..28e71c2054f59ba4d5d096bf7d898161877bb42f 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -37,7 +37,8 @@ class TestBFloat16Support : public BFloat16Support {
     if (hlo.opcode() == HloOpcode::kAdd ||
         hlo.opcode() == HloOpcode::kSubtract ||
         hlo.opcode() == HloOpcode::kTuple ||
-        hlo.opcode() == HloOpcode::kGetTupleElement) {
+        hlo.opcode() == HloOpcode::kGetTupleElement ||
+        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
       return true;
     }
     return false;
@@ -47,7 +48,8 @@ class TestBFloat16Support : public BFloat16Support {
     if (hlo.opcode() == HloOpcode::kAdd ||
         hlo.opcode() == HloOpcode::kSubtract ||
         hlo.opcode() == HloOpcode::kTuple ||
-        hlo.opcode() == HloOpcode::kGetTupleElement) {
+        hlo.opcode() == HloOpcode::kGetTupleElement ||
+        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
       return true;
     }
     return false;
@@ -55,7 +57,8 @@ class TestBFloat16Support : public BFloat16Support {
 
   bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
     if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kTuple ||
-        hlo.opcode() == HloOpcode::kGetTupleElement) {
+        hlo.opcode() == HloOpcode::kGetTupleElement ||
+        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
       return true;
     }
     return false;
@@ -206,4 +209,46 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
   EXPECT_EQ(tuple->operand(1), convert0);
 }
 
+TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, bf16_shape, "a"));
+  HloInstruction* convert_a =
+      builder.AddInstruction(HloInstruction::CreateConvert(f32_shape, a));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32_shape, "b"));
+
+  HloInstruction* crs =
+      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
+          ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}));
+  HloInstruction* gte_a = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, crs, 0));
+  HloInstruction* gte_b = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, crs, 1));
+  HloInstruction* convert_gte_b =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte_b));
+  HloInstruction* tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({gte_a, convert_gte_b}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(FoldConversions(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), tuple);
+  EXPECT_EQ(tuple->operand(0), gte_a);
+  EXPECT_EQ(tuple->operand(1), gte_b);
+  EXPECT_EQ(gte_a->shape().element_type(), F32);
+  EXPECT_EQ(gte_b->shape().element_type(), BF16);
+  EXPECT_EQ(crs->operand(0), a);
+  EXPECT_EQ(crs->operand(1), b);
+  EXPECT_EQ(a->shape().element_type(), BF16);
+  EXPECT_EQ(b->shape().element_type(), F32);
+  EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {0}).element_type(), F32);
+  EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {1}).element_type(), BF16);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
index b032c040e8aff49f9e0fc1ff9a1c1e79ea4bb77f..14c54ddd135af024327f63418b410da1ed3c4fd4 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -152,44 +152,64 @@ Status BFloat16NormalizationVisitor::HandleCrossReplicaSum(
 
   std::vector<PrimitiveType> operand_types(crs->operand_count());
   std::vector<PrimitiveType> output_types(crs->operand_count());
-  bool has_f32 = false;
-  bool has_bf16 = false;
-  bool has_bf16_output = false;
+  int64 f32_count = 0;
+  int64 bf16_count = 0;
+  bool has_unsupported_bf16_operand = false;
+  bool has_unsupported_bf16_output = false;
   for (int64 i = 0; i < crs->operand_count(); ++i) {
     operand_types[i] = crs->operand(i)->shape().element_type();
     output_types[i] = ShapeUtil::GetSubshape(crs->shape(), {i}).element_type();
-    if (operand_types[i] == F32 || output_types[i] == F32) {
-      has_f32 = true;
+    if (operand_types[i] == F32) {
+      f32_count += 1;
     } else if (operand_types[i] == BF16) {
-      has_bf16 = true;
+      bf16_count += 1;
+      if (!bfloat16_support_->SupportsBF16Operand(*crs, i)) {
+        has_unsupported_bf16_operand = true;
+      }
     }
-    if (output_types[i] == BF16) {
-      has_bf16 = true;
-      has_bf16_output = true;
+    if (output_types[i] == F32) {
+      f32_count += 1;
+    } else if (output_types[i] == BF16) {
+      bf16_count += 1;
+      if (!bfloat16_support_->SupportsBF16Output(*crs)) {
+        has_unsupported_bf16_output = true;
+      }
     }
   }
 
-  for (int64 i = 0; i < crs->operand_count(); ++i) {
+  if (bf16_count == 0) {
+    return Status::OK();
+  }
+
+  auto should_convert_operand = [&](int64 i) {
     if (operand_types[i] != BF16) {
-      continue;
+      return false;
     }
-    if (bfloat16_support_->SupportsBF16Operand(*crs, i) &&
-        (bfloat16_support_->SupportsMixedPrecisions(*crs) || !has_f32)) {
-      continue;
+    if (!bfloat16_support_->SupportsBF16Operand(*crs, i)) {
+      return true;
     }
-    TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(crs, i, F32, computation_));
-    has_f32 = true;
-  }
+    if (bfloat16_support_->SupportsMixedPrecisions(*crs)) {
+      return false;
+    }
+    return has_unsupported_bf16_operand || has_unsupported_bf16_output ||
+           f32_count > 0;
+  };
 
-  if (!has_bf16_output) {
-    return Status::OK();
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    if (should_convert_operand(i)) {
+      TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(crs, i, F32, computation_));
+      f32_count += 1;
+      bf16_count -= 1;
+    }
   }
 
-  if (bfloat16_support_->SupportsBF16Output(*crs) &&
-      (bfloat16_support_->SupportsMixedPrecisions(*crs) || !has_f32)) {
+  if (!has_unsupported_bf16_output &&
+      (bfloat16_support_->SupportsMixedPrecisions(*crs) || f32_count == 0 ||
+       bf16_count == 0)) {
     return Status::OK();
   }
 
+  std::vector<HloInstruction*> materialized_users = crs->users();
   std::vector<HloInstruction*> output_elements(crs->operand_count());
   auto original_shape = crs->shape();
   for (int64 i = 0; i < crs->operand_count(); ++i) {
@@ -209,7 +229,6 @@ Status BFloat16NormalizationVisitor::HandleCrossReplicaSum(
   auto tuple = computation_->AddInstruction(
       HloInstruction::CreateTuple(output_elements));
 
-  std::vector<HloInstruction*> materialized_users = crs->users();
   // Use the crs' shape temporarily, in order to pass checks in
   // ReplaceUseWith.
   *tuple->mutable_shape() = crs->shape();
@@ -221,41 +240,37 @@ Status BFloat16NormalizationVisitor::HandleCrossReplicaSum(
 }
 
 Status BFloat16NormalizationVisitor::HandleInstruction(HloInstruction* hlo) {
-  std::vector<int64> bf16_operands;
-  std::vector<int64> f32_operands;
-  bool has_f32 = false;
-  bool has_bf16 = false;
+  int f32_count = 0;
+  int bf16_count = 1;
 
   for (int64 i = 0; i < hlo->operand_count(); ++i) {
     if (hlo->operand(i)->shape().element_type() == F32) {
-      f32_operands.push_back(i);
-      has_f32 = true;
+      f32_count += 1;
     } else if (hlo->operand(i)->shape().element_type() == BF16) {
-      bf16_operands.push_back(i);
-      has_bf16 = true;
+      bf16_count += 1;
     }
   }
 
   if (hlo->shape().element_type() == F32) {
-    has_f32 = true;
+    f32_count += 1;
   } else if (hlo->shape().element_type() == BF16) {
-    has_bf16 = true;
+    bf16_count += 1;
   }
 
   std::vector<HloComputation*> bf16_called_comps;
   for (auto* comp : hlo->called_computations()) {
     bool comp_has_bf16 = false;
     if (comp->root_instruction()->shape().element_type() == F32) {
-      has_f32 = true;
+      f32_count += 1;
     } else if (comp->root_instruction()->shape().element_type() == BF16) {
-      has_bf16 = true;
+      bf16_count += 1;
       comp_has_bf16 = true;
     }
     for (auto* param : comp->parameter_instructions()) {
       if (param->shape().element_type() == F32) {
-        has_f32 = true;
+        f32_count += 1;
       } else if (param->shape().element_type() == BF16) {
-        has_bf16 = true;
+        bf16_count += 1;
         comp_has_bf16 = true;
       }
     }
@@ -264,54 +279,69 @@ Status BFloat16NormalizationVisitor::HandleInstruction(HloInstruction* hlo) {
     }
   }
 
-  if (!bfloat16_support_->SupportsMixedPrecisions(*hlo) && has_bf16 &&
-      has_f32) {
-    // Resolve unsupported mixed precision.
-    //
-    // See if we can change everything to BF16.
-    if (hlo->called_computations().empty() &&
-        hlo->shape().element_type() == BF16) {
-      bool can_use_bf16 = true;
-      for (int i : f32_operands) {
-        if (bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(*hlo,
-                                                                          i) &&
-            bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
-          continue;
-        }
-        can_use_bf16 = false;
-        break;
-      }
-      if (can_use_bf16) {
-        for (int i : f32_operands) {
-          TF_RETURN_IF_ERROR(
-              InsertConvertBeforeOperand(hlo, i, BF16, computation_));
-        }
-        return Status::OK();
-      }
-    }
-    if (hlo->shape().element_type() == BF16) {
-      TF_RETURN_IF_ERROR(
-          ChangeOutputTypeThenInsertConvertBack(hlo, F32, computation_));
-    }
-    for (int i : bf16_operands) {
-      TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(hlo, i, F32, computation_));
-    }
-    return ConvertCalledComputations(hlo, bf16_called_comps);
-  }
-
-  for (int i : bf16_operands) {
-    if (!bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
+  // Resolve unsupported BF16 operands.
+  for (int i = 0; i < hlo->operand_count(); ++i) {
+    if (hlo->operand(i)->shape().element_type() == BF16 &&
+        !bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
       TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(hlo, i, F32, computation_));
+      bf16_count -= 1;
+      f32_count += 1;
     }
   }
 
+  // Resolve unsupported BF16 output.
   if (hlo->shape().element_type() == BF16 &&
       !bfloat16_support_->SupportsBF16Output(*hlo)) {
     TF_RETURN_IF_ERROR(
         ChangeOutputTypeThenInsertConvertBack(hlo, F32, computation_));
+    bf16_count -= 1;
+    f32_count += 1;
   }
 
-  return Status::OK();
+  // Resolve unsupported mixed precision after resolving unsupported BF16
+  // operands and output, because the numbers of BF16 operands/output and F32
+  // operands/output may have changed.
+  if (bfloat16_support_->SupportsMixedPrecisions(*hlo) || bf16_count == 0 ||
+      f32_count == 0) {
+    return Status::OK();
+  }
+  // See if we can change everything to BF16.
+  if (hlo->called_computations().empty() &&
+      hlo->shape().element_type() == BF16) {
+    bool can_use_bf16 = true;
+    for (int i = 0; i < hlo->operand_count(); ++i) {
+      if (hlo->operand(i)->shape().element_type() == BF16) {
+        continue;
+      }
+      if ((bfloat16_support_->EffectiveOperandPrecisionIsBF16(*hlo, i) ||
+           bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(*hlo,
+                                                                         i)) &&
+          bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
+        continue;
+      }
+      can_use_bf16 = false;
+      break;
+    }
+    if (can_use_bf16) {
+      for (int i = 0; i < hlo->operand_count(); ++i) {
+        if (hlo->operand(i)->shape().element_type() == F32) {
+          TF_RETURN_IF_ERROR(
+              InsertConvertBeforeOperand(hlo, i, BF16, computation_));
+        }
+      }
+      return Status::OK();
+    }
+  }
+  if (hlo->shape().element_type() == BF16) {
+    TF_RETURN_IF_ERROR(
+        ChangeOutputTypeThenInsertConvertBack(hlo, F32, computation_));
+  }
+  for (int i = 0; i < hlo->operand_count(); ++i) {
+    if (hlo->operand(i)->shape().element_type() == BF16) {
+      TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(hlo, i, F32, computation_));
+    }
+  }
+  return ConvertCalledComputations(hlo, bf16_called_comps);
 }
 
 Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index 66c3085842c4afe7ffc4d5891883e4cce9389d45..1afaefd9df9c5771fb9e134ae9050f3abb00ea4a 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -41,13 +42,17 @@ class TestBFloat16Support : public BFloat16Support {
         hlo.opcode() == HloOpcode::kGetTupleElement) {
       return true;
     }
+    if (hlo.opcode() == HloOpcode::kDot) {
+      // Test that only the first operand of kDot supports BF16.
+      return operand_index == 0;
+    }
     return false;
   }
 
   bool SupportsBF16Output(const HloInstruction& hlo) const override {
     if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kReduce ||
         hlo.opcode() == HloOpcode::kSubtract ||
-        hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kDot || hlo.opcode() == HloOpcode::kTuple ||
         hlo.opcode() == HloOpcode::kGetTupleElement) {
       return true;
     }
@@ -70,6 +75,10 @@ class BFloat16NormalizationTest : public HloTestBase {
     BFloat16Normalization normalization(&bfloat16_support_);
     StatusOr<bool> result = normalization.Run(module);
     EXPECT_IS_OK(result.status());
+
+    HloVerifier verifier(/*allow_mixed_precision=*/true);
+    EXPECT_IS_OK(verifier.Run(module).status());
+
     return result.ValueOrDie();
   }
 };
@@ -166,7 +175,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
   Shape f32_input_shape = ShapeUtil::MakeShape(F32, {2, 4});
   Shape f32_output_shape = ShapeUtil::MakeShape(F32, {4});
 
-  Shape bf16_scalar_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+  Shape bf16_scalar_shape = ShapeUtil::MakeShape(BF16, {});
 
   auto reduce_comp_builder = HloComputation::Builder("reduce_comp");
   auto reduce_comp_param0 = reduce_comp_builder.AddInstruction(
@@ -245,4 +254,34 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
   EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {1}).element_type(), F32);
 }
 
+// Tests that the normalization should not cause unsupported mixed precision due
+// to resolving unsupported BF16 operand.
+TEST_F(BFloat16NormalizationTest, DoNotAddUnsupportedMixedPrecision) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {4, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, bf16_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "b"));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(bf16_shape, a, b, dot_dnums));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(Normalize(module.get()));
+
+  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
+  EXPECT_EQ(dot->shape().element_type(), F32);
+  EXPECT_EQ(dot->operand(0)->shape().element_type(), F32);
+  EXPECT_EQ(dot->operand(0)->opcode(), HloOpcode::kConvert);
+  EXPECT_EQ(dot->operand(1)->shape().element_type(), F32);
+  EXPECT_EQ(dot->operand(1)->opcode(), HloOpcode::kConvert);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43ebe92c5ec1c945780f76ca4178a94f948a81b9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -0,0 +1,709 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_propagation.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+BFloat16Propagation::BFloat16Propagation(
+    const BFloat16Support* bfloat16_support)
+    : bfloat16_support_(bfloat16_support) {}
+
+void BFloat16Propagation::DetermineAndMutateFusionComputationPrecision(
+    HloInstruction* fusion) {
+  CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
+  if (!bfloat16_support_->SupportsMixedPrecisions(*fusion)) {
+    return;
+  }
+
+  // We are depending on the fusion node itself having already been analyzed
+  // for whether it can output BF16 and this has been adjusted in the output
+  // shape, and now we're looking to update the interior of the fusion node to
+  // match the new output shape, as well as recursively process the whole fusion
+  // node even if the output shape was not modified.
+  auto root = fusion->fused_instructions_computation()->root_instruction();
+
+  // Adjust root's element types according to the fusion's output shape.
+  ShapeUtil::ForEachMutableSubshape(
+      root->mutable_shape(), [&](Shape* subshape, const ShapeIndex& index) {
+        if (subshape->element_type() != F32) {
+          return;
+        }
+        if (ShapeUtil::GetSubshape(fusion->shape(), index).element_type() ==
+            BF16) {
+          subshape->set_element_type(BF16);
+          changed_ = true;
+          VLOG(2) << "Fused root " << root->ToString() << " at shape index "
+                  << index << " changed to BF16 precision for fusion "
+                  << fusion->ToString();
+        }
+      });
+
+  // Propagate BF16 in the fusion computation.
+  auto insts =
+      fusion->fused_instructions_computation()->MakeInstructionPostOrder();
+  for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
+    DetermineAndMutateInstructionPrecision(*inst_it, /*skip_parameters=*/false);
+  }
+  computations_visited_in_mutation_pass_.insert(
+      fusion->fused_instructions_computation());
+}
+
+void BFloat16Propagation::DetermineAndMutateWhileComputationsPrecision(
+    HloInstruction* while_hlo) {
+  CHECK_EQ(while_hlo->opcode(), HloOpcode::kWhile);
+
+  // We are depending on the while node itself having already been analyzed for
+  // whether it can output BF16 and this has been adjusted in the output shape,
+  // and now we're looking to update the body and condition computations to
+  // match the new output shape, as well as recursively process the whole while
+  // node even if the output shape was not modified.
+  HloComputation* body = while_hlo->while_body();
+  auto body_root = body->root_instruction();
+  HloComputation* condition = while_hlo->while_condition();
+
+  ShapeUtil::ForEachMutableSubshape(
+      body_root->mutable_shape(),
+      [this, while_hlo, body_root](Shape* subshape, const ShapeIndex& index) {
+        if (subshape->element_type() != F32) {
+          return;
+        }
+        if (ShapeUtil::GetSubshape(while_hlo->shape(), index).element_type() ==
+            BF16) {
+          subshape->set_element_type(BF16);
+          changed_ = true;
+          VLOG(2) << "While body root " << body_root->ToString()
+                  << " at shape index " << index
+                  << " changed to BF16 precision for while "
+                  << while_hlo->ToString();
+        }
+      });
+
+  auto body_insts = body->MakeInstructionPostOrder();
+  for (auto inst_it = body_insts.rbegin(); inst_it != body_insts.rend();
+       ++inst_it) {
+    DetermineAndMutateInstructionPrecision(*inst_it, /*skip_parameters=*/false);
+  }
+  computations_visited_in_mutation_pass_.insert(body);
+
+  auto condition_insts = condition->MakeInstructionPostOrder();
+  for (auto inst_it = condition_insts.rbegin();
+       inst_it != condition_insts.rend(); ++inst_it) {
+    DetermineAndMutateInstructionPrecision(*inst_it, /*skip_parameters=*/false);
+  }
+  computations_visited_in_mutation_pass_.insert(condition);
+}
+
+bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo,
+                                              const ShapeIndex& index) const {
+  auto value_set = dataflow_->GetValueSet(&hlo, index);
+  for (const HloValue* value : value_set.values()) {
+    if (ContainsKey(values_that_must_be_kept_as_f32_, value)) {
+      return false;
+    }
+    if (value->shape().element_type() == BF16) {
+      continue;
+    }
+    for (const HloUse& use : value->uses()) {
+      if (!ContainsKey(instructions_visited_in_mutation_pass_,
+                       use.instruction)) {
+        // We don't know yet whether use.instruction will consume BF16 since it
+        // hasn't been visited. Although we visit instructions in reverse
+        // topological order, this is still possible because there may be
+        // unvisited instruction that alias the same buffer. In this case, we
+        // aggressively skip this use, and if this causes inconsistency (e.g.,
+        // one use is in BF16 but another use is in F32), it will be resolved at
+        // the end of the BFloat16Propagation pass.
+        continue;
+      }
+      // Any visited user that can accept BF16 has already been updated if
+      // necessary, e.g., the output has been changed to BF16 if it propagates
+      // precision, or a called computation's parameters have been changed to
+      // BF16 for fusions or whiles.
+      if (use.instruction->opcode() == HloOpcode::kFusion) {
+        const auto* fused_parameter =
+            use.instruction->fused_parameter(use.operand_number);
+        if (ShapeUtil::GetSubshape(fused_parameter->shape(), use.operand_index)
+                .element_type() != BF16) {
+          return false;
+        }
+        continue;
+      } else if (use.instruction->opcode() == HloOpcode::kWhile) {
+        const auto* cond_parameter =
+            use.instruction->while_condition()->parameter_instruction(
+                use.operand_number);
+        if (ShapeUtil::GetSubshape(cond_parameter->shape(), use.operand_index)
+                .element_type() != BF16) {
+          return false;
+        }
+        const auto* body_parameter =
+            use.instruction->while_body()->parameter_instruction(
+                use.operand_number);
+        if (ShapeUtil::GetSubshape(body_parameter->shape(), use.operand_index)
+                .element_type() != BF16) {
+          return false;
+        }
+        continue;
+      }
+      if (bfloat16_support_->EffectiveOperandPrecisionIsBF16(
+              *use.instruction, use.operand_number)) {
+        continue;
+      }
+      // If the op propagates precision and it outputs a BF16, then it's OK to
+      // supply BF16 also as the input. In the backward mutation pass, the users
+      // shapes should have already been processed.
+      PrimitiveType user_output_type = PRIMITIVE_TYPE_INVALID;
+      if (use.instruction->opcode() == HloOpcode::kTuple ||
+          (use.instruction->opcode() == HloOpcode::kCrossReplicaSum &&
+           ShapeUtil::IsTuple(use.instruction->shape()))) {
+        user_output_type = ShapeUtil::GetSubshape(
+                               ShapeUtil::GetSubshape(use.instruction->shape(),
+                                                      {use.operand_number}),
+                               use.operand_index)
+                               .element_type();
+      } else {
+        user_output_type = use.instruction->shape().element_type();
+      }
+      if (bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(
+              *use.instruction, use.operand_number) &&
+          user_output_type == BF16) {
+        continue;
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+void BFloat16Propagation::DetermineAndMutateInstructionPrecision(
+    HloInstruction* hlo, bool skip_parameters) {
+  // We handle any fusion computation or while body/condition after the
+  // instruction is handled, because we need to know the output shape of a
+  // fusion or while before propagating inside its  computations.
+  bool postpone_processing_called_computations = false;
+  auto cleaner = tensorflow::gtl::MakeCleanup(
+      [this, hlo, &postpone_processing_called_computations] {
+        if (!postpone_processing_called_computations) {
+          if (hlo->opcode() == HloOpcode::kFusion) {
+            DetermineAndMutateFusionComputationPrecision(hlo);
+          } else if (hlo->opcode() == HloOpcode::kWhile) {
+            DetermineAndMutateWhileComputationsPrecision(hlo);
+          }
+        }
+        instructions_visited_in_mutation_pass_.insert(hlo);
+      });
+
+  if (hlo->opcode() == HloOpcode::kWhile &&
+      (caller_counts_[hlo->while_condition()] > 1 ||
+       caller_counts_[hlo->while_body()] > 1)) {
+    postpone_processing_called_computations = true;
+    return;
+  }
+
+  // Do not change precision for instructions related to entry and exit of a
+  // computation, and control flow, because this pass might break the interfaces
+  // or assumptions for them.
+  if (hlo->opcode() == HloOpcode::kInfeed ||       //
+      hlo->opcode() == HloOpcode::kOutfeed ||      //
+      hlo->opcode() == HloOpcode::kSend ||         //
+      hlo->opcode() == HloOpcode::kSendDone ||     //
+      hlo->opcode() == HloOpcode::kRecv ||         //
+      hlo->opcode() == HloOpcode::kRecvDone ||     //
+      hlo->opcode() == HloOpcode::kCustomCall ||   //
+      hlo->opcode() == HloOpcode::kCall ||         //
+      hlo->opcode() == HloOpcode::kConditional ||  //
+      (hlo->opcode() == HloOpcode::kParameter && skip_parameters)) {
+    return;
+  }
+
+  // Prevent root instructions from having their output modified by recording
+  // all F32 output values as needing to stay as F32.
+  CHECK(hlo->parent() != nullptr);
+  if (hlo == hlo->parent()->root_instruction()) {
+    if (!hlo->parent()->IsFusionComputation()) {
+      ShapeUtil::ForEachSubshape(hlo->shape(), [&](const Shape& subshape,
+                                                   const ShapeIndex& index) {
+        if (subshape.element_type() != F32) {
+          return;
+        }
+        for (const auto* value : dataflow_->GetValueSet(hlo, index).values()) {
+          // Since we use HloValues from the dataflow analysis, this can also
+          // affect HLO instructions beyond the root, e.g., if the root is a
+          // Tuple HLO, then its operands are also affected.
+          values_that_must_be_kept_as_f32_.insert(value);
+        }
+      });
+    }
+    return;
+  }
+
+  if (!ContainsKey(consider_using_bfloat16_, hlo)) {
+    return;
+  }
+
+  if (!bfloat16_support_->SupportsBF16Output(*hlo)) {
+    return;
+  }
+
+  ShapeUtil::ForEachMutableSubshape(
+      hlo->mutable_shape(),
+      [hlo, this](Shape* subshape, const ShapeIndex& index) {
+        if (subshape->element_type() == F32 &&
+            AllUsersConsumeBF16(*hlo, index)) {
+          subshape->set_element_type(BF16);
+          changed_ = true;
+          VLOG(2) << "HloInstruction output at shape index " << index
+                  << " changed to BF16 precision: " << hlo->ToString();
+        }
+      });
+}
+
+bool BFloat16Propagation::InstructionIsCandidateForBF16Output(
+    HloInstruction* hlo) {
+  if (!bfloat16_support_->SupportsMixedPrecisions(*hlo) &&
+      hlo->opcode() != HloOpcode::kTuple &&
+      hlo->opcode() != HloOpcode::kGetTupleElement &&
+      hlo->shape().element_type() != BF16) {
+    for (int64 i = 0; i < hlo->operand_count(); ++i) {
+      if (!bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(*hlo,
+                                                                         i) ||
+          !ContainsKey(consider_using_bfloat16_, hlo->operand(i))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void BFloat16Propagation::AdjustCalledComputationParameters(
+    HloInstruction* hlo) {
+  auto adjust_computation =
+      [this, hlo](HloComputation* computation,
+                  tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+        // Adjust parameters.
+        CHECK_EQ(operands.size(), computation->num_parameters());
+        for (int64 i = 0; i < operands.size(); ++i) {
+          auto parameter = computation->parameter_instruction(i);
+          ShapeUtil::ForEachMutableSubshape(
+              parameter->mutable_shape(),
+              [this, i, hlo, &operands, parameter](Shape* subshape,
+                                                   const ShapeIndex& index) {
+                if (!ShapeUtil::IsLeafIndex(parameter->shape(), index)) {
+                  return;
+                }
+                PrimitiveType operand_type =
+                    ShapeUtil::GetSubshape(operands[i]->shape(), index)
+                        .element_type();
+                if (subshape->element_type() == operand_type) {
+                  return;
+                }
+                CHECK(operand_type == F32 || operand_type == BF16);
+                subshape->set_element_type(operand_type);
+                changed_ = true;
+                VLOG(2) << "Called computation parameter "
+                        << parameter->ToString() << " at shape index " << index
+                        << " adjusted to match operand in HLO "
+                        << hlo->ToString();
+              });
+        }
+      };
+
+  switch (hlo->opcode()) {
+    case HloOpcode::kFusion:
+      adjust_computation(hlo->fused_instructions_computation(),
+                         hlo->operands());
+      break;
+    case HloOpcode::kWhile:
+      adjust_computation(hlo->while_condition(), hlo->operands());
+      adjust_computation(hlo->while_body(), hlo->operands());
+      break;
+    default:
+      break;
+  }
+}
+
+void BFloat16Propagation::AdjustCalledComputationRoot(HloInstruction* hlo) {
+  auto adjust_computation = [this, hlo](HloComputation* computation,
+                                        const Shape& output_shape) {
+    // Adjust root.
+    HloInstruction* root = computation->root_instruction();
+    ShapeUtil::ForEachMutableSubshape(
+        root->mutable_shape(), [this, hlo, root, &output_shape](
+                                   Shape* subshape, const ShapeIndex& index) {
+          if (!ShapeUtil::IsLeafIndex(hlo->shape(), index)) {
+            return;
+          }
+          const PrimitiveType output_type =
+              ShapeUtil::GetSubshape(output_shape, index).element_type();
+          if (subshape->element_type() == output_type) {
+            return;
+          }
+          CHECK(output_type == F32 || output_type == BF16);
+          subshape->set_element_type(output_type);
+          // It's possible that output_type is F32, but the root instruction's
+          // type is BF16; e.g., a fusion node's output was changed to BF16
+          // initially but then adjusted back to F32, and the fusion computation
+          // is now being adjusted after the fusion node.
+          if (output_type == F32) {
+            for (const auto* value :
+                 dataflow_->GetValueSet(root, index).values()) {
+              // We rely on the fact that this adjustment works in reverse
+              // topological order so that called computation will be
+              // processed later. Adding the value to
+              // values_that_must_be_kept_as_f32_ will ensure the
+              // correctness of the adjustment for HLOs that will be
+              // processed later.
+              values_that_must_be_kept_as_f32_.insert(value);
+            }
+          }
+          changed_ = true;
+          VLOG(2) << "Called computation root " << root->ToString()
+                  << " at shape index " << index
+                  << " adjusted to match output shape of " << hlo->ToString();
+        });
+  };
+
+  switch (hlo->opcode()) {
+    case HloOpcode::kFusion:
+      adjust_computation(hlo->fused_instructions_computation(), hlo->shape());
+      break;
+    case HloOpcode::kWhile:
+      adjust_computation(hlo->while_body(), hlo->shape());
+      break;
+    default:
+      break;
+  }
+}
+
+bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper(
+    HloComputation* computation,
+    tensorflow::gtl::FlatSet<const HloComputation*>* visited_computations) {
+  bool parameter_changed = false;
+  auto insts = computation->MakeInstructionPostOrder();
+  // Do the adjustment on each instruction in the computation in reverse
+  // topological order.
+  for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
+    auto hlo = *inst_it;
+    auto adjust_hlo_output = [this, hlo, &parameter_changed](
+                                 Shape* subshape, const ShapeIndex& index) {
+      if (subshape->element_type() != F32 && subshape->element_type() != BF16) {
+        return;
+      }
+      PrimitiveType type = BF16;
+      for (const auto* value : dataflow_->GetValueSet(hlo, index).values()) {
+        if (value->shape().element_type() == BF16) {
+          continue;
+        }
+        CHECK_EQ(value->shape().element_type(), F32);
+        type = F32;
+        break;
+      }
+      // It's possible that a user has been changed from BF16 to F32
+      // during this final adjustment pass, so we need to check
+      // AllUsersConsumeBF16() again.
+      if (type == BF16 && !AllUsersConsumeBF16(*hlo, index)) {
+        type = F32;
+      }
+      if (type == F32) {
+        for (const auto* value : dataflow_->GetValueSet(hlo, index).values()) {
+          // We rely on the fact that this adjustment works in reverse
+          // topological order. Adding the value to
+          // values_that_must_be_kept_as_f32_ will ensure the correctness
+          // of the adjustment for HLOs that will be processed later.
+          values_that_must_be_kept_as_f32_.insert(value);
+        }
+      }
+      if (type != subshape->element_type()) {
+        subshape->set_element_type(type);
+        VLOG(2) << "HloInstruction output at shape index " << index
+                << " adjusted to " << *subshape << ": " << hlo->ToString();
+        if (hlo->opcode() == HloOpcode::kParameter) {
+          parameter_changed = true;
+        }
+      }
+    };
+    ShapeUtil::ForEachMutableSubshape(hlo->mutable_shape(), adjust_hlo_output);
+    AdjustCalledComputationRoot(hlo);
+    if (hlo->opcode() == HloOpcode::kWhile) {
+      // We need to run on the while body and condition repeatedly until a fixed
+      // point is reached, i.e., the parameters do not change any more. We may
+      // need more than one iteration because the while input and output alias
+      // each other, so changing one input parameter requires changing the
+      // corresponding output element and thus may transitively require changing
+      // another input parameter. A fixed point will be reached because the
+      // parameters can only be changed from BF16 to F32, not the other way
+      // around.
+      tensorflow::gtl::FlatSet<const HloComputation*> visited_in_while;
+      while (ResolveInconsistencyOfAliasingBuffersHelper(hlo->while_condition(),
+                                                         &visited_in_while) ||
+             ResolveInconsistencyOfAliasingBuffersHelper(hlo->while_body(),
+                                                         &visited_in_while)) {
+        visited_in_while.clear();
+        ShapeUtil::ForEachMutableSubshape(hlo->mutable_shape(),
+                                          adjust_hlo_output);
+        AdjustCalledComputationRoot(hlo);
+      }
+      visited_computations->insert(visited_in_while.begin(),
+                                   visited_in_while.end());
+    }
+  }
+  // Now adjust parameters of called computations.
+  for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
+    AdjustCalledComputationParameters(*inst_it);
+  }
+  return parameter_changed;
+}
+
+Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers(
+    HloModule* module) {
+  std::list<HloComputation*> computations_topological_order =
+      module->MakeComputationPostOrder();
+  tensorflow::gtl::FlatSet<const HloComputation*> resolved;
+  for (auto comp_it = computations_topological_order.rbegin();
+       comp_it != computations_topological_order.rend(); ++comp_it) {
+    if (ContainsKey(resolved, *comp_it)) {
+      continue;
+    }
+    ResolveInconsistencyOfAliasingBuffersHelper(*comp_it, &resolved);
+  }
+
+  // We could have changed a fusion computation's root shape to have a different
+  // precision than the fusion node's output, if the fusion root does not
+  // define a buffer (e.g., a tuple). Now we add conversions after such fusion
+  // roots to make them match the fusion output. If the fusion output is a
+  // (possibly nested) tuple, we first create get-tuple-elements, then convert
+  // the unmatching leaf nodes, and finally create a new tuple as the fusion
+  // computation's root. If tuples and get-tuple-elements are created, we will
+  // run tuple simplifier and dead code elimination at the end (dead code is not
+  // allowed in fusion computation). E.g.,
+  //
+  // (1)             (2)             (3)
+  // a  b            a  b            a  b
+  // |\ |            |\ |            |\ |
+  // \ add   ->      |add    ->      | add
+  //  \ |            \ |        convert |
+  //  tuple         tuple             \ |
+  //                 / \              tuple
+  //               gte gte
+  //                |   |
+  //           convert  |
+  //                 \  /
+  //                 tuple
+  // (1) a is F32 but tuple is BF16
+  // (2) after adding conversion
+  // (3) after tuple simplifier and DCE.
+  bool needs_tuple_simplifier = false;
+  for (auto computation : computations_topological_order) {
+    auto insts = computation->MakeInstructionPostOrder();
+    for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
+      auto hlo = *inst_it;
+      if (hlo->opcode() != HloOpcode::kFusion) {
+        continue;
+      }
+      auto fusion_computation = hlo->fused_instructions_computation();
+      auto fusion_root = fusion_computation->root_instruction();
+      if (ShapeUtil::Compatible(fusion_root->shape(), hlo->shape())) {
+        continue;
+      }
+      ShapeTree<HloInstruction*> converted_outputs(hlo->shape());
+      // Iterate through nodes in the shape tree in pre-order and initialize
+      // each non-root node with a corresponding get-tuple-element. For a leaf
+      // node, if its shape does not match the fusion output, create a
+      // conversion node to overwrite the node value.
+      for (auto it = converted_outputs.begin(); it != converted_outputs.end();
+           ++it) {
+        ShapeIndex output_index = it->first;
+        HloInstruction*& output = it->second;
+        const Shape subshape =
+            ShapeUtil::GetSubshape(hlo->shape(), output_index);
+        if (output_index.empty()) {
+          output = fusion_root;
+        } else {
+          ShapeIndex parent_index = output_index;
+          parent_index.pop_back();
+          output = fusion_computation->AddInstruction(
+              HloInstruction::CreateGetTupleElement(
+                  subshape, converted_outputs.element(parent_index),
+                  output_index.back()));
+        }
+        if (ShapeUtil::IsTuple(subshape)) {
+          continue;
+        }
+        if (!ShapeUtil::Compatible(
+                subshape,
+                ShapeUtil::GetSubshape(fusion_root->shape(), output_index))) {
+          output = fusion_computation->AddInstruction(
+              HloInstruction::CreateConvert(subshape, output));
+        }
+      }
+      // Iterate through nodes in the shape tree in reverse pre-order and create
+      // a tuple instruction for each non-leaf node where the elements are the
+      // values of its child nodes.
+      for (auto it = converted_outputs.rbegin(); it != converted_outputs.rend();
+           ++it) {
+        ShapeIndex output_index = it->first;
+        HloInstruction*& output = it->second;
+        const Shape& subshape =
+            ShapeUtil::GetSubshape(hlo->shape(), output_index);
+        if (!ShapeUtil::IsTuple(subshape)) {
+          continue;
+        }
+        std::vector<HloInstruction*> elements(
+            ShapeUtil::TupleElementCount(subshape));
+        ShapeIndex child_index = output_index;
+        for (int64 i = 0; i < elements.size(); ++i) {
+          child_index.push_back(i);
+          elements[i] = converted_outputs.element(child_index);
+          child_index.pop_back();
+        }
+        output = fusion_computation->AddInstruction(
+            HloInstruction::CreateTuple(elements));
+      }
+      fusion_computation->set_root_instruction(converted_outputs.element({}));
+      needs_tuple_simplifier |= ShapeUtil::IsTuple(hlo->shape());
+    }
+  }
+
+  // We may have converted some constants from F32 to BF16, so adjust the
+  // constant literals in such cases. We do this here instead of when the
+  // constant node's is changed because 1) the HloInstruction interface does not
+  // allow resetting the literal so we have to create a new kConstant
+  // instruction to replace the old one, which invalidates dataflow analysis,
+  // and 2) it's possible that a kConstant's output gets changed to BF16 at the
+  // beginning but later on adjusted back to F32, so converting literals here
+  // can avoid repeated conversions.
+  //
+  // TODO(b/73833576): Consider resetting literal in HloInstruction.
+  bool needs_dce = needs_tuple_simplifier;
+  for (auto computation : computations_topological_order) {
+    for (auto hlo : computation->MakeInstructionPostOrder()) {
+      if (hlo->opcode() != HloOpcode::kConstant) {
+        continue;
+      }
+      if (!ShapeUtil::Equal(hlo->literal().shape(), hlo->shape())) {
+        TF_ASSIGN_OR_RETURN(
+            auto converted_literal,
+            hlo->literal().ConvertToShape(hlo->shape(),
+                                          /*round_f32_to_bf16=*/true));
+        auto new_constant = computation->AddInstruction(
+            HloInstruction::CreateConstant(std::move(converted_literal)));
+        TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_constant));
+        needs_dce = true;
+      }
+    }
+  }
+
+  if (needs_tuple_simplifier) {
+    TupleSimplifier tuple_simplifier;
+    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+  }
+  if (needs_dce) {
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+  return Status::OK();
+}
+
+Status BFloat16Propagation::RemoveNoopConversions(HloModule* module) {
+  for (auto computation : module->computations()) {
+    for (auto hlo : computation->MakeInstructionPostOrder()) {
+      if (hlo->opcode() != HloOpcode::kConvert) {
+        continue;
+      }
+      auto source = hlo->mutable_operand(0);
+      if (!ShapeUtil::Equal(source->shape(), hlo->shape())) {
+        continue;
+      }
+      const bool is_root = hlo == computation->root_instruction();
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(source));
+      if (is_root) {
+        computation->set_root_instruction(source);
+      }
+      TF_RETURN_IF_ERROR(computation->RemoveInstructionAndUnusedOperands(hlo));
+    }
+  }
+  return Status::OK();
+}
+
+// The algorithm first does a forward pass (parameters to root) to determine a
+// set of instructions to consider using bfloat16, then does a backward pass to
+// determine the precisions of those instructions according to the need of
+// their users.
+StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(dataflow_, HloDataflowAnalysis::Run(*module));
+
+  std::list<HloComputation*> computations_topological_order =
+      module->MakeComputationPostOrder();
+  // The first step is a forward pass (parameters to root), where we determine
+  // the potential candidate instructions to use bfloat16 in the outputs that
+  // are not likely to cause overhead from extra explicit conversions. This is
+  // done forwardly because we determine whether an HLO is a candidate partially
+  // based on whether its operands are candidates.
+  for (auto computation : computations_topological_order) {
+    for (auto inst : computation->MakeInstructionPostOrder()) {
+      if (InstructionIsCandidateForBF16Output(inst)) {
+        consider_using_bfloat16_.insert(inst);
+      }
+    }
+  }
+
+  // The second step is a backward pass (root to parameters), where we modify
+  // the precisions of the instructions identified in the first step when
+  // feasible. This is done backwardly because we determine the precision of an
+  // HLO's output based on how it is later used.
+  //
+  // The precision of an instruction is determined by its users, so we do the
+  // propagation in reverse topological order.
+  for (auto comp_it = computations_topological_order.rbegin();
+       comp_it != computations_topological_order.rend(); ++comp_it) {
+    if ((*comp_it)->IsFusionComputation()) {
+      // Fusion computations are handled when visiting the fusion instruction.
+      continue;
+    }
+    auto insts = (*comp_it)->MakeInstructionPostOrder();
+    for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) {
+      DetermineAndMutateInstructionPrecision(*inst_it,
+                                             /*skip_parameters=*/true);
+    }
+  }
+
+  if (!changed_) {
+    return false;
+  }
+
+  // It's possible that an instruction does not define a buffer, but the
+  // defining instruction's shape has changed. So we need to adjust the output
+  // shapes of instructions according to the HLO values they refer to.
+  TF_RETURN_IF_ERROR(ResolveInconsistencyOfAliasingBuffers(module));
+
+  // This pass could have turned an F32 -> BF16 conversion to a no-op (BF16 ->
+  // BF16), so we remove them now.
+  TF_RETURN_IF_ERROR(RemoveNoopConversions(module));
+  return true;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h
new file mode 100644
index 0000000000000000000000000000000000000000..1744e9db90aeff269daa91eb68a1d61bb0fc3035
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h
@@ -0,0 +1,164 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_PROPAGATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_PROPAGATION_H_
+
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// HLO pass which reduces the precision of some HLO instructions to BF16
+// according to the backend-specific BFloat16Support rule provided by the
+// caller.
+//
+// This pass can be used to reduce instruction precision without affecting the
+// numerical accuracy of the module, i.e., the final output of the module would
+// be bitwise identical to that without this pass; this is possible if the
+// backend already reduces precision to BF16 on some HLO instructions.
+//
+// This pass will not modify the signature of a computation, unless it is a
+// fusion computation or its only caller is a while.
+//
+// !!! WARNING !!! This pass can introduce mixed precision in individual HLOs,
+// which has two issues:
+//
+// 1) It does not guarantee to respect the passed-in BFloat16Support
+// specification in terms of mixed precision, so the backend may not support an
+// HLO that has mixed precision produced by this pass. To address this issue,
+// run BFloat16Normalization with the same BFloat16Support after this pass.
+//
+// 2) In general, mixed precision may break the assumptions of some other HLO
+// passes even if the specific backend supports the individual HLOs. Such
+// assumptions include that there are no HLOs using mixed precision, or that the
+// precision of an HLO's output is determined by its inputs. It should be used
+// at the end of the HLO optimization pipeline but before
+// BFloat16ConversionFolding. If other passes are needed after this pass, run
+// BFloat16MixedPrecisionRemoval first to undo some of the changes made by this
+// pass.
+class BFloat16Propagation : public HloPassInterface {
+ public:
+  explicit BFloat16Propagation(const BFloat16Support* bfloat16_support);
+
+  ~BFloat16Propagation() override = default;
+
+  tensorflow::StringPiece name() const override {
+    return "bfloat16-propagation";
+  }
+
+  // Runs the pass on the given module. Returns whether the module was changed
+  // (precision reductions were added).
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  // ***************************
+  // Function called and state produced by the forward analysis pass (from
+  // parameters to root) that determines the candidate HLOs to use BF16 outputs.
+
+  // Determines whether we should consider changing the precision of the given
+  // instruction in the forward pass.
+  bool InstructionIsCandidateForBF16Output(HloInstruction* hlo);
+
+  // The set of instructions to consider using bfloat16, computed in the forward
+  // pass.
+  tensorflow::gtl::FlatSet<const HloInstruction*> consider_using_bfloat16_;
+
+  // ***************************
+  // Functions called and state produced by the backward mutation pass (from
+  // root to parameters).
+
+  // Determines the precision for the given instruction in the mutation pass.
+  void DetermineAndMutateInstructionPrecision(HloInstruction* hlo,
+                                              bool skip_parameters);
+
+  // Special handling in the mutation pass for fusion computations.
+  //
+  // Precondition: hlo->opcode() == kFusion
+  void DetermineAndMutateFusionComputationPrecision(HloInstruction* fusion);
+
+  // Special handling in the mutation pass for while computations.
+  //
+  // Precondition: hlo->opcode() == kWhile
+  void DetermineAndMutateWhileComputationsPrecision(HloInstruction* while_hlo);
+
+  // The set of HloInstructions that have been visited in the mutation pass.
+  tensorflow::gtl::FlatSet<const HloInstruction*>
+      instructions_visited_in_mutation_pass_;
+
+  // The set of HloComputations that have been visited in the mutation pass.
+  tensorflow::gtl::FlatSet<const HloComputation*>
+      computations_visited_in_mutation_pass_;
+
+  // ***************************
+  // Functions called by the final inconsistency resolving pass.
+
+  // Adjusts the output shapes of HloInstructions such that if two
+  // HloInstructions have aliasing buffers in their outputs, they must have the
+  // same precision.
+  Status ResolveInconsistencyOfAliasingBuffers(HloModule* module);
+
+  // Resolves inconsistency of aliasing buffers for the given computation, and
+  // recursively runs on a while instruction's condition and body until a fixed
+  // point is reached.
+  bool ResolveInconsistencyOfAliasingBuffersHelper(
+      HloComputation* computation,
+      tensorflow::gtl::FlatSet<const HloComputation*>* visited_computations);
+
+  // Makes the parameters of called computations match how they are called by
+  // the given HLO.
+  void AdjustCalledComputationParameters(HloInstruction* hlo);
+
+  // Makes the root instructions of called computations match how they are used
+  // by the given HLO.
+  void AdjustCalledComputationRoot(HloInstruction* hlo);
+
+  // ***************************
+  // Removes no-op conversions (same source and target shapes) that can be
+  // produced this pass.
+  Status RemoveNoopConversions(HloModule* module);
+
+  // ***************************
+  // Functions called and state used by two or more passes.
+
+  // Returns whether all uses of the given HloInstruction can consume BF16
+  // input.
+  bool AllUsersConsumeBF16(const HloInstruction& hlo,
+                           const ShapeIndex& index) const;
+
+  // The set of F32 HLO values that must be kept in F32.
+  tensorflow::gtl::FlatSet<const HloValue*> values_that_must_be_kept_as_f32_;
+
+  // Mapping from each HloComputation to the number of callers to it in the
+  // module. Populated at the beginning of this pass.
+  tensorflow::gtl::FlatMap<const HloComputation*, int64> caller_counts_;
+
+  const BFloat16Support* bfloat16_support_;
+  std::unique_ptr<HloDataflowAnalysis> dataflow_;
+
+  bool changed_ = false;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_PROPAGATION_H_
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..183db1652e498edb0b94e9c9a272e2b8a7fc53ba
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -0,0 +1,714 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_propagation.h"
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// A class specifying the BF16 support used to test the propagation pass. It
+// specifies that BF16 and mixed precision are supported in all HloInstructions,
+// and that kDot reduces its operands precision to BF16.
+class TestBFloat16Support : public BFloat16Support {
+ public:
+  TestBFloat16Support() {}
+  ~TestBFloat16Support() override {}
+
+  bool SupportsBF16Operand(const HloInstruction& hlo,
+                           int64 operand_index) const override {
+    return true;
+  }
+
+  bool SupportsBF16Output(const HloInstruction& hlo) const override {
+    return true;
+  }
+
+  bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
+    return true;
+  }
+
+  bool EffectiveOperandPrecisionIsBF16(const HloInstruction& hlo,
+                                       int64 operand_index) const override {
+    return hlo.opcode() == HloOpcode::kDot;
+  }
+};
+
+class BFloat16PropagationTest : public HloTestBase {
+ protected:
+  // Runs the propagation pass on the given module, and returns whether the
+  // module is changed after this pass.
+  bool PropagatePrecision(HloModule* module) {
+    TestBFloat16Support bfloat16_support;
+    BFloat16Propagation propagation(&bfloat16_support);
+    StatusOr<bool> result = propagation.Run(module);
+    EXPECT_IS_OK(result.status());
+    return result.ValueOrDie();
+  }
+
+  // Returns whether the given HloInstruction's output element type is BF16 or
+  // the only use of it is converting to BF16.
+  bool OutputsBF16(const HloInstruction* inst) {
+    if (inst->shape().element_type() == BF16) {
+      return true;
+    }
+    return inst->user_count() == 1 &&
+           inst->users()[0]->opcode() == HloOpcode::kConvert &&
+           inst->users()[0]->shape().element_type() == BF16;
+  }
+};
+
+// Tests that BF16 can propagate through select over non-tuple buffers, but not
+// through add where reducing operand precision can affect the result.
+TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* c =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "c"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, add0, b));
+  HloInstruction* pred = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kEq, a, b));
+  HloInstruction* sel = builder.AddInstruction(
+      HloInstruction::CreateTernary(shape, HloOpcode::kSelect, pred, c, add1));
+  HloInstruction* xpose =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {4, 2}), sel, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, xpose, a));
+  HloInstruction* root = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), root);
+  EXPECT_TRUE(OutputsBF16(xpose));
+  EXPECT_TRUE(OutputsBF16(sel));
+  EXPECT_TRUE(OutputsBF16(add1));
+  EXPECT_FALSE(OutputsBF16(add0));
+  EXPECT_FALSE(OutputsBF16(a));
+  EXPECT_FALSE(OutputsBF16(b));
+  EXPECT_FALSE(OutputsBF16(c));
+}
+
+// Tests that if a constant is converted to BF16 then its literal must also be
+// converted.
+TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+  Array2D<float> array_a(4, 4);
+  array_a.FillUnique(1.0f);
+  Array2D<float> array_b(4, 4);
+  array_b.FillUnique(10.0f);
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateFromArray(array_a)));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateFromArray(array_b)));
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, a, b));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(OutputsBF16(dot->operand(0)));
+  EXPECT_TRUE(OutputsBF16(dot->operand(1)));
+  EXPECT_EQ(dot->operand(0)->opcode(), HloOpcode::kConstant);
+  EXPECT_EQ(dot->operand(1)->opcode(), HloOpcode::kConstant);
+  LiteralTestUtil::ExpectEqual(
+      dot->operand(0)->literal(),
+      *LiteralTestUtil::ConvertF32ToBF16(*Literal::CreateFromArray(array_a)));
+  LiteralTestUtil::ExpectEqual(
+      dot->operand(1)->literal(),
+      *LiteralTestUtil::ConvertF32ToBF16(*Literal::CreateFromArray(array_b)));
+}
+
+// Tests that BF16 can be propagated through nested tuples.
+TEST_F(BFloat16PropagationTest, PropagateThroughTuples) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, a));
+  HloInstruction* add2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, b, b));
+  HloInstruction* xpose =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {4, 2}), add1, {1, 0}));
+
+  HloInstruction* tuple0 =
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1, add2}));
+  HloInstruction* tuple1 =
+      builder.AddInstruction(HloInstruction::CreateTuple({tuple0, xpose}));
+
+  HloInstruction* lhs = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(xpose->shape(), tuple1, 1));
+  HloInstruction* rhs =
+      builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+          add0->shape(),
+          builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+              tuple0->shape(), tuple1, 0)),
+          0));
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, lhs, rhs));
+
+  HloInstruction* output_tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({dot, add2}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), output_tuple);
+  EXPECT_TRUE(OutputsBF16(xpose));
+  EXPECT_TRUE(OutputsBF16(add0));
+  EXPECT_TRUE(OutputsBF16(add1));
+  EXPECT_FALSE(OutputsBF16(add2));
+}
+
+// Tests that even if an instruction does not define a buffer in its output, its
+// shape must match the defining instruction.
+TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, a));
+
+  HloInstruction* lhs = builder.AddInstruction(HloInstruction::CreateTranspose(
+      ShapeUtil::MakeShape(F32, {4, 2}), add1, {1, 0}));
+
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+  HloInstruction* rhs = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(add1->shape(), tuple, 1));
+
+  // lhs is the transpose of add1, and rhs is a get-tuple-element aliasing add1.
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, lhs, rhs));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(OutputsBF16(add0));
+  EXPECT_TRUE(OutputsBF16(add1));
+  EXPECT_TRUE(OutputsBF16(lhs));
+  // rhs is a get-tuple-element, which does not define a buffer, but its shape
+  // should also be adjusted accordingly.
+  EXPECT_TRUE(OutputsBF16(rhs));
+}
+
+// Tests that a non-fusion computation's root should not be changed.
+TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, add, add));
+
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({add, dot}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), tuple);
+  EXPECT_FALSE(OutputsBF16(add));
+}
+
+// Tests that BF16 is propagated properly through fused computations.
+TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+
+  auto builder_f0 = HloComputation::Builder("fusion0");
+  HloInstruction* a_f0 =
+      builder_f0.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b_f0 =
+      builder_f0.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* tuple_f0 =
+      builder_f0.AddInstruction(HloInstruction::CreateTuple({a_f0, b_f0}));
+  auto comp_f0 = module->AddEmbeddedComputation(builder_f0.Build());
+  auto fusion0 = builder.AddInstruction(HloInstruction::CreateFusion(
+      tuple_f0->shape(), HloInstruction::FusionKind::kCustom, {add, add},
+      comp_f0));
+
+  auto builder_f1 = HloComputation::Builder("fusion1");
+  HloInstruction* p_f1 = builder_f1.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_f0->shape(), "param"));
+  HloInstruction* a_f1 = builder_f1.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, p_f1, 0));
+  HloInstruction* b_f1 = builder_f1.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, p_f1, 1));
+  HloInstruction* dot = builder_f1.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, a_f1, b_f1));
+  auto comp_f1 = module->AddEmbeddedComputation(builder_f1.Build());
+  auto fusion1 = builder.AddInstruction(HloInstruction::CreateFusion(
+      dot->shape(), HloInstruction::FusionKind::kCustom, {fusion0}, comp_f1));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), fusion1);
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(a_f0));
+  EXPECT_TRUE(OutputsBF16(b_f0));
+  EXPECT_TRUE(OutputsBF16(a_f1));
+  EXPECT_TRUE(OutputsBF16(b_f1));
+}
+
+// Tests that if 1) the root instruction of a fusion is a tuple, 2) the fusion
+// outputs are only used by a dot, and 3) one element of the tuple is used by
+// an add in the fusion computation, then the propagation pass should create a
+// convert in the fusion computation to keep the add's operand in F32 but change
+// the fusion output to BF16. E.g., the following fusion computation
+//   (F32, F32) fusion_computation(F32 a, F32 b)
+//     = tuple(F32 a, F32 add(F32 a, F32 b))
+// will be changed to
+//   (BF16, BF16) fusion_computation(F32 a, F32 b)
+//     = tuple(BF16 convert(a), BF16 add(F32 a, F32 b))
+TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+
+  auto builder_f = HloComputation::Builder("fusion0");
+  HloInstruction* a_f =
+      builder_f.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b_f =
+      builder_f.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* add_f = builder_f.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_f, b_f));
+  HloInstruction* tuple_f =
+      builder_f.AddInstruction(HloInstruction::CreateTuple({a_f, add_f}));
+  auto comp_f = module->AddEmbeddedComputation(builder_f.Build());
+  auto fusion = builder.AddInstruction(HloInstruction::CreateFusion(
+      tuple_f->shape(), HloInstruction::FusionKind::kCustom, {add, add},
+      comp_f));
+
+  HloInstruction* gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, fusion, 0));
+  HloInstruction* gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, fusion, 1));
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, gte0, gte1));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(OutputsBF16(gte0));
+  EXPECT_TRUE(OutputsBF16(gte1));
+  EXPECT_FALSE(OutputsBF16(a_f));
+  EXPECT_FALSE(OutputsBF16(b_f));
+  EXPECT_TRUE(OutputsBF16(add_f));
+  auto new_fusion_root = comp_f->root_instruction();
+  EXPECT_EQ(new_fusion_root->opcode(), HloOpcode::kTuple);
+  EXPECT_EQ(new_fusion_root->operand(1), add_f);
+  EXPECT_EQ(new_fusion_root->operand(0)->opcode(), HloOpcode::kConvert);
+  EXPECT_TRUE(OutputsBF16(new_fusion_root->operand(0)));
+}
+
+// A select over tuples does not define the leaf buffers, so the types in
+// on_true and on_false must match, so that as long as one of them is F32, the
+// other must be F32 as well.
+TEST_F(BFloat16PropagationTest, SelectOverTuples) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  HloInstruction* pred = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(PRED, {}), "pred"));
+
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, add0, param));
+  HloInstruction* tuple0 =
+      builder.AddInstruction(HloInstruction::CreateTuple({param, add0}));
+  HloInstruction* tuple1 =
+      builder.AddInstruction(HloInstruction::CreateTuple({param, add1}));
+  HloInstruction* sel = builder.AddInstruction(HloInstruction::CreateTernary(
+      tuple0->shape(), HloOpcode::kSelect, pred, tuple0, tuple1));
+  HloInstruction* gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, sel, 0));
+  HloInstruction* gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, sel, 1));
+  HloInstruction* xpose =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {4, 2}), gte0, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, xpose, gte1));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_FALSE(OutputsBF16(add0));
+  EXPECT_FALSE(OutputsBF16(add1));
+  EXPECT_FALSE(OutputsBF16(gte0));
+  EXPECT_FALSE(OutputsBF16(gte1));
+  EXPECT_TRUE(OutputsBF16(xpose));
+}
+
+// Tests that BF16 is propagated properly through a while computation with
+// non-tuple input/output.
+TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  auto builder_cond = HloComputation::Builder("cond");
+  auto cond_param = builder_cond.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "cond_param"));
+  auto cond_dot = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, cond_param, cond_param));
+  auto cond_root = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond = module->AddEmbeddedComputation(builder_cond.Build());
+
+  auto builder_body = HloComputation::Builder("body");
+  auto body_param = builder_body.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "body_param"));
+  auto body_dot = builder_body.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, body_param, body_param));
+  auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+  auto while_hlo = builder.AddInstruction(
+      HloInstruction::CreateWhile(shape, cond, body, add));
+
+  auto dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, while_hlo, while_hlo));
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(cond_root->shape(), ShapeUtil::MakeShape(PRED, {})));
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(body_dot));
+  EXPECT_TRUE(OutputsBF16(body_param));
+  EXPECT_TRUE(OutputsBF16(cond_param));
+  EXPECT_FALSE(OutputsBF16(dot));
+}
+
+// Tests that BF16 is propagated properly through while computations with
+// tuple-shaped input/output.
+TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+
+  auto builder_cond = HloComputation::Builder("cond");
+  auto cond_param = builder_cond.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple->shape(), "cond_param"));
+  auto cond_lhs = builder_cond.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, cond_param, 0));
+  auto cond_rhs = builder_cond.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, cond_param, 1));
+  // This add should prevent RHS from using BF16
+  auto cond_add_rhs = builder_cond.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, cond_rhs, cond_rhs));
+  auto cond_dot = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, cond_lhs, cond_add_rhs));
+  builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond = module->AddEmbeddedComputation(builder_cond.Build());
+
+  auto builder_body = HloComputation::Builder("body");
+  auto body_param = builder_body.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple->shape(), "body_param"));
+  auto body_lhs = builder_body.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, body_param, 0));
+  auto body_rhs = builder_body.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, body_param, 1));
+  auto body_dot = builder_body.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_lhs, body_rhs));
+  builder_body.AddInstruction(
+      HloInstruction::CreateTuple({body_dot, body_rhs}));
+  auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+  auto while_hlo = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple->shape(), cond, body, tuple));
+
+  auto lhs = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, while_hlo, 0));
+  auto rhs = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, while_hlo, 1));
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, lhs, rhs));
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(OutputsBF16(lhs));
+  EXPECT_FALSE(OutputsBF16(rhs));
+  EXPECT_TRUE(OutputsBF16(body_dot));
+  EXPECT_TRUE(OutputsBF16(body_lhs));
+  EXPECT_FALSE(OutputsBF16(body_rhs));
+  EXPECT_TRUE(OutputsBF16(cond_lhs));
+  EXPECT_FALSE(OutputsBF16(cond_rhs));
+  EXPECT_TRUE(OutputsBF16(add0));
+  EXPECT_FALSE(OutputsBF16(add1));
+}
+
+// Tests that BF16 is not propagated through multiple whiles that invoke the
+// same computation as long as one while prevents the propagation.
+TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+  HloInstruction* add2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+  HloInstruction* add3 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+  HloInstruction* tuple0 =
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+  HloInstruction* tuple1 =
+      builder.AddInstruction(HloInstruction::CreateTuple({add2, add3}));
+
+  // Condition computation for the first while.
+  auto builder_cond0 = HloComputation::Builder("cond0");
+  auto cond0_param = builder_cond0.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple0->shape(), "cond0_param"));
+  auto cond0_lhs = builder_cond0.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, cond0_param, 0));
+  auto cond0_rhs = builder_cond0.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, cond0_param, 1));
+  // This add should prevent RHS from using BF16
+  auto cond0_add_rhs =
+      builder_cond0.AddInstruction(HloInstruction::CreateBinary(
+          shape, HloOpcode::kAdd, cond0_rhs, cond0_rhs));
+  auto cond0_dot = builder_cond0.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, cond0_lhs, cond0_add_rhs));
+  builder_cond0.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond0.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond0_dot, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond0.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond0_dot, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond0 = module->AddEmbeddedComputation(builder_cond0.Build());
+
+  // Condition computation for the second while.
+  auto builder_cond1 = HloComputation::Builder("cond1");
+  auto cond1_param = builder_cond1.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple1->shape(), "cond1_param"));
+  auto cond1_lhs = builder_cond1.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, cond1_param, 0));
+  auto cond1_rhs = builder_cond1.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, cond1_param, 1));
+  // This add should prevent LHS from using BF16
+  auto cond1_add_lhs =
+      builder_cond1.AddInstruction(HloInstruction::CreateBinary(
+          shape, HloOpcode::kAdd, cond1_lhs, cond1_lhs));
+  auto cond1_dot = builder_cond1.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, cond1_add_lhs, cond1_rhs));
+  builder_cond1.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond1.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond1_dot, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond1.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond1_dot, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond1 = module->AddEmbeddedComputation(builder_cond1.Build());
+
+  // Body computation shared by both whiles.
+  auto builder_body = HloComputation::Builder("body");
+  auto body_param = builder_body.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple0->shape(), "body_param"));
+  auto body_lhs = builder_body.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, body_param, 0));
+  auto body_rhs = builder_body.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, body_param, 1));
+  auto body_dot = builder_body.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_lhs, body_rhs));
+  builder_body.AddInstruction(
+      HloInstruction::CreateTuple({body_dot, body_rhs}));
+  auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple0->shape(), cond0, body, tuple0));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple1->shape(), cond1, body, tuple1));
+
+  auto lhs = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot,
+      builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(shape, while0, 0)),
+      builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(shape, while0, 1))));
+  auto rhs = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot,
+      builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(shape, while1, 0)),
+      builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(shape, while1, 1))));
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, lhs, rhs));
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+  EXPECT_FALSE(OutputsBF16(body_dot));
+  EXPECT_FALSE(OutputsBF16(body_rhs));
+  EXPECT_FALSE(OutputsBF16(body_lhs));
+  EXPECT_FALSE(OutputsBF16(cond0_lhs));
+  EXPECT_FALSE(OutputsBF16(cond0_rhs));
+  EXPECT_FALSE(OutputsBF16(cond1_lhs));
+  EXPECT_FALSE(OutputsBF16(cond1_rhs));
+  EXPECT_TRUE(OutputsBF16(cond0_add_rhs));
+  EXPECT_TRUE(OutputsBF16(cond1_add_lhs));
+  EXPECT_EQ(computation->root_instruction(), dot);
+}
+
+// Tests that if this pass turns an F32 -> BF16 conversion into a no-op (BF16 ->
+// BF16 conversion), then it will remove that conversion.
+TEST_F(BFloat16PropagationTest, NoopConversionRemoved) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {4, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {4, 4});
+
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "param"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, param, param));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, param, param));
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+  HloInstruction* gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, tuple, 0));
+  HloInstruction* gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, tuple, 1));
+  HloInstruction* convert0 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte0));
+  HloInstruction* convert1 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte1));
+  HloInstruction* add2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      bf16_shape, HloOpcode::kAdd, convert0, convert1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), add2);
+  EXPECT_EQ(add2->operand(0), gte0);
+  EXPECT_EQ(add2->operand(1), gte1);
+  EXPECT_EQ(gte0->shape().element_type(), BF16);
+  EXPECT_EQ(gte1->shape().element_type(), BF16);
+  EXPECT_EQ(add0->shape().element_type(), BF16);
+  EXPECT_EQ(add1->shape().element_type(), BF16);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc
index 3fd9e24601f27633c8063e4574c7c4f91f30dcff..07b4b14b5ec1bdbc01345091105df69368b0b2fb 100644
--- a/tensorflow/compiler/xla/service/bfloat16_support.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_support.cc
@@ -79,6 +79,7 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision(
     case HloOpcode::kBroadcast:
     case HloOpcode::kClamp:
     case HloOpcode::kConcatenate:
+    case HloOpcode::kConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kMaximum:
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.h b/tensorflow/compiler/xla/service/bfloat16_support.h
index 29f662d22b4e5486662a1387407d41e0fd2ed1b3..82c2745f444e4f9c544c78cb36dafc11f678518a 100644
--- a/tensorflow/compiler/xla/service/bfloat16_support.h
+++ b/tensorflow/compiler/xla/service/bfloat16_support.h
@@ -39,7 +39,7 @@ class BFloat16Support {
   // precisions (BF16 and F32).
   virtual bool SupportsMixedPrecisions(const HloInstruction& hlo) const;
 
-  // Returns whether the given HLO inherits its BF16 operand precision at the
+  // Returns whether the given HLO preserves its BF16 operand precision at the
   // given index, so even if the output is F32, elements in the output that
   // depend on the BF16 operand will still have BF16 effective precision even if
   // they have F32 format. Similarly, this also means if the output is BF16 then
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index b1e693da9d5af4babe619b8796007f2da318f6a8..dbe45e932cdeed00e959355d5b3199d2e858148f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -48,6 +48,183 @@ using ::tensorflow::strings::HumanReadableNumBytes;
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrAppend;
 
+namespace {
+
+template <typename T>
+string ColocatedBufferSetsToString(const T& container, const char* title) {
+  string result;
+  StrAppend(&result, title, "\n");
+  for (const auto& it : container) {
+    StrAppend(&result, "\t", it->ToString(), "\n");
+  }
+  return result;
+}
+
+// Walk the call graph of the HLO module and place each computation into either
+// thread_local_computations or global_computations depending upon whether the
+// computation requires thread-local allocations or global allocations. The
+// elements in thread_local_computations and global_computations are in post
+// order (if computation A has an instruction which calls computation B, then A
+// will appear after B in the vector).
+Status GatherComputationsByAllocationType(
+    const HloModule* module,
+    std::vector<const HloComputation*>* thread_local_computations,
+    std::vector<const HloComputation*>* global_computations) {
+  // Create a worklist of computations paired with whether the allocation must
+  // be thread-local.
+  std::deque<std::pair<const HloComputation*, bool>> worklist;
+  worklist.push_back(std::make_pair(module->entry_computation(),
+                                    /*is_thread_local*/ false));
+
+  // Sets for quickly checking membership. Computations are returned in vectors
+  // for stable iteration.
+  FlatSet<const HloComputation*> thread_local_set;
+  FlatSet<const HloComputation*> global_set;
+
+  while (!worklist.empty()) {
+    auto worklist_front = worklist.front();
+    worklist.pop_front();
+    const HloComputation* computation = worklist_front.first;
+    bool is_thread_local = worklist_front.second;
+    bool in_thread_local_set = thread_local_set.count(computation) > 0;
+    bool in_global_set = global_set.count(computation) > 0;
+
+    // If the computation has already been added to the respective set, then
+    // nothing to do.
+    if ((is_thread_local && in_thread_local_set) ||
+        (!is_thread_local && in_global_set)) {
+      continue;
+    }
+
+    // If the computation has already been added to the other set this is an
+    // error condition because the global call to the computation (eg,
+    // while/call) may return a reference to one of the thread-local buffers to
+    // the calling computation which will become a dangling reference when the
+    // thread-local is deallocated with the call return.
+    if ((is_thread_local && in_global_set) ||
+        (!is_thread_local && in_thread_local_set)) {
+      return InvalidArgument(
+          "computation %s has conflicting allocation requirements (global "
+          "and thread-local)",
+          computation->name().c_str());
+    }
+
+    if (is_thread_local) {
+      thread_local_set.insert(computation);
+    } else {
+      global_set.insert(computation);
+    }
+
+    for (auto* instruction : computation->instructions()) {
+      for (HloComputation* subcomputation :
+           instruction->called_computations()) {
+        switch (instruction->opcode()) {
+          case HloOpcode::kCall:
+          case HloOpcode::kConditional:
+          case HloOpcode::kWhile:
+            // Call and while must be called from a computation with global
+            // allocations as they may return references to buffers inside the
+            // called computation which cannot be thread-local.
+            if (is_thread_local) {
+              return InvalidArgument(
+                  "computation %s cannot contain call/while op because it "
+                  "requires thread-local buffer allocations",
+                  computation->name().c_str());
+            }
+            worklist.push_back(std::make_pair(subcomputation,
+                                              false));  // Not thread local.
+            break;
+          case HloOpcode::kMap:
+          case HloOpcode::kReduce:
+          case HloOpcode::kReduceWindow:
+          case HloOpcode::kSelectAndScatter:
+          case HloOpcode::kFusion:
+            // Map/reduce etc computations are always thread-local.
+            worklist.push_back(std::make_pair(subcomputation,
+                                              true));  // Thread local.
+            break;
+          default:
+            return InternalError(
+                "Unexpected calling opcode: %s",
+                HloOpcodeString(instruction->opcode()).c_str());
+        }
+      }
+    }
+  }
+
+  // Add the computations to the vectors in post order.
+  for (auto* computation : module->MakeComputationPostOrder()) {
+    if (thread_local_set.count(computation) > 0) {
+      thread_local_computations->push_back(computation);
+    } else if (global_set.count(computation) > 0) {
+      global_computations->push_back(computation);
+    }
+    // If the computation is not reachable from the entry computation, then it
+    // will not appear in either thread_local_set or global_set. We don't bother
+    // assigning buffers for these.
+  }
+  return Status::OK();
+}
+
+// Checks that points-to set of 'instruction' is unambiguous and distinct
+// (ensured by CopyInsertion), then adds the buffer from the points-to set at
+// 'index' to 'colocated_set'.
+const LogicalBuffer* AddBufferToColocatedSet(
+    const HloInstruction* instruction, const ShapeIndex& index,
+    const TuplePointsToAnalysis& points_to_analysis,
+    std::vector<const LogicalBuffer*>* colocated_set) {
+  // CopyInsertion ensures root points-to set is unambiguous and distinct.
+  const auto& points_to = points_to_analysis.GetPointsToSet(instruction);
+  DCHECK(!points_to.IsAmbiguous());
+  colocated_set->push_back(points_to.element(index)[0]);
+  return colocated_set->back();
+}
+
+// Given the interference map of a graph (the list of interfering node indices
+// for each node), perform graph coloring such that interfering nodes are
+// assigned to different colors. Returns the assigned color of the nodes, where
+// the colors are represented as integer values [0, color_count).
+std::vector<int64> ColorInterferenceGraph(
+    const std::vector<std::vector<int64>>& interference_map) {
+  const int64 node_count = interference_map.size();
+
+  // Sort the nodes such that we assign nodes with more interference first. This
+  // relies on the common heuristic of assigning the most constrained node
+  // first, but it would be good to investigate other ordering heuristics too.
+  std::vector<int64> nodes(node_count);
+  std::iota(nodes.begin(), nodes.end(), 0);
+  std::sort(nodes.begin(), nodes.end(),
+            [&interference_map](const int64 i, const int64 j) {
+              return interference_map[i].size() > interference_map[j].size();
+            });
+
+  const int64 kColorUnassigned = -1;
+  std::vector<int64> assigned_colors(node_count, kColorUnassigned);
+  for (int64 node : nodes) {
+    // Mark the colors that are already assigned to the neighbors.
+    std::vector<bool> available_colors(node_count, true);
+    for (int64 neighbor : interference_map[node]) {
+      int64 color = assigned_colors[neighbor];
+      if (color != kColorUnassigned) {
+        available_colors[color] = false;
+      }
+    }
+
+    // Find the color that is not yet assigned to the neighbors.
+    int64 color = kColorUnassigned;
+    for (color = 0; color < available_colors.size(); ++color) {
+      if (available_colors[color]) {
+        break;
+      }
+    }
+    CHECK_NE(color, kColorUnassigned);
+    assigned_colors[node] = color;
+  }
+  return assigned_colors;
+}
+
+}  // namespace
+
 size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const {
   uint64 h = std::hash<int64>()(s.index());
   h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.offset()));
@@ -115,6 +292,112 @@ BufferAllocationProto BufferAllocation::ToProto() const {
   return proto;
 }
 
+std::pair<int64, std::vector<const LogicalBuffer*>>
+BufferAllocation::ComputePeakMemoryLogicalBuffers() const {
+  if (HeapTraces().empty()) {
+    // Just return the largest LogicalBuffer in the allocation.
+    const LogicalBuffer* largest_buffer = nullptr;
+    int64 largest_size = 0;
+    for (const auto& pair : assigned_buffers()) {
+      const LogicalBuffer* buffer = pair.first;
+      int64 size = pair.second.size;
+      if (largest_buffer == nullptr) {
+        largest_buffer = buffer;
+        largest_size = size;
+        continue;
+      }
+      // Tie-break with LogicalBuffer::Id so the return value is stable relative
+      // to changing addresses.
+      if (size > largest_size ||
+          ((size == largest_size) && (largest_buffer->id() > buffer->id()))) {
+        largest_buffer = buffer;
+        largest_size = size;
+      }
+    }
+    CHECK(largest_buffer != nullptr)
+        << "No logical buffers in allocation: " << ToString();
+    return {largest_size, {largest_buffer}};
+  }
+
+  // Create a map from LogicalBuffer::Id to LogicalBuffer* for the logical
+  // buffers in this allocation.
+  tensorflow::gtl::FlatMap<LogicalBuffer::Id, const LogicalBuffer*>
+      id_to_buffer;
+  tensorflow::gtl::FlatMap<const LogicalBuffer*, int64> buffer_sizes;
+  for (const auto& pair : assigned_buffers()) {
+    const LogicalBuffer* buffer = pair.first;
+    const OffsetSize& offset_size = pair.second;
+    id_to_buffer[buffer->id()] = buffer;
+    buffer_sizes[buffer] = offset_size.size;
+  }
+
+  // Returns how much the given event increases the total size of live
+  // buffers. Can be negative.
+  auto memory_delta = [this, &id_to_buffer, &buffer_sizes](
+                          const HeapSimulatorTrace::Event& event) -> int64 {
+    const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id());
+    const int64 buffer_size = buffer_sizes.at(buffer);
+    if (event.kind() == HeapSimulatorTrace::Event::ALLOC) {
+      return buffer_size;
+    } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) {
+      // Sharing a buffer does not change the live set size for the purposes of
+      // the heap simulator. Even though the shared-with buffer may be smaller,
+      // the entire allocation remains live.
+      return 0;
+    } else if (event.kind() == HeapSimulatorTrace::Event::FREE) {
+      return -1 * buffer_size;
+    }
+    LOG(FATAL) << "Unknown event kind: " << event.kind();
+  };
+
+  int64 total_max_live_size = 0;
+  std::vector<const LogicalBuffer*> live_buffers_vector;
+  for (const HeapSimulatorTrace& heap_trace : HeapTraces()) {
+    // First compute the size of the maximal live set.
+    int64 max_live_size = 0;
+    int64 live_size = 0;
+    for (const auto& event : heap_trace.events()) {
+      live_size += memory_delta(event);
+      if (max_live_size < live_size) {
+        max_live_size = live_size;
+      }
+    }
+
+    // Next gather the set of logical buffers live at the earliest point of
+    // maximal live set size.
+    tensorflow::gtl::FlatSet<const LogicalBuffer*> live_buffers;
+    live_size = 0;
+    for (const auto& event : heap_trace.events()) {
+      const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id());
+      if (event.kind() == HeapSimulatorTrace::Event::ALLOC) {
+        InsertOrDie(&live_buffers, buffer);
+      } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) {
+        // Nothing to do.
+      } else if (event.kind() == HeapSimulatorTrace::Event::FREE) {
+        CHECK(ContainsKey(live_buffers, buffer));
+        live_buffers.erase(buffer);
+      }
+
+      live_size += memory_delta(event);
+      if (live_size == max_live_size) {
+        break;
+      }
+    }
+    CHECK_EQ(live_size, max_live_size);
+    total_max_live_size += max_live_size;
+
+    live_buffers_vector.insert(live_buffers_vector.end(), live_buffers.begin(),
+                               live_buffers.end());
+  }
+
+  // Stabily sort the live buffers.
+  std::sort(live_buffers_vector.begin(), live_buffers_vector.end(),
+            [](const LogicalBuffer* a, const LogicalBuffer* b) {
+              return a->id() < b->id();
+            });
+  return {total_max_live_size, live_buffers_vector};
+}
+
 string BufferAllocation::ToString() const {
   string output;
   Appendf(&output, "allocation %lld: %p, size %lld", index_, this, size());
@@ -348,6 +631,7 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
 // Combines allocations of temporary buffers of the same color into one big
 // BufferAllocation.
 void BufferAssignment::CombineTempAllocations() {
+  VLOG(1) << "CombineTempAllocations()";
   FlatMap<LogicalBuffer::Color, BufferAllocation, LogicalBuffer::Color::Hasher>
       combined_allocation_map;
 
@@ -369,11 +653,16 @@ void BufferAssignment::CombineTempAllocations() {
       if (combined_it == combined_allocation_map.end()) {
         // We have found the first temp allocation of this color. Collect
         // the other temp allocations of the same color into it.
+        VLOG(1) << "Combined temp allocation for color " << color
+                << " is: " << temp_allocation;
         combined_allocation_map.emplace(color, temp_allocation);
         continue;
       }
 
       auto* combined_allocation = &combined_it->second;
+      VLOG(1) << "Combined allocation absorbing temp allocation: "
+              << temp_allocation;
+
       // Each temp allocation is placed end-to-end, accounting for alignment.
       // The offset of each buffer in the combined allocation is computed from
       // the base offset of the allocation.
@@ -387,6 +676,10 @@ void BufferAssignment::CombineTempAllocations() {
         const int64 size = buffer_offset_size.second.size;
         combined_allocation->AddAssignment(*buffer, base + offset, size);
       }
+      if (!temp_allocation.HeapTraces().empty()) {
+        CHECK_EQ(temp_allocation.HeapTraces().size(), 1);
+        combined_allocation->AddHeapTrace(temp_allocation.HeapTraces().front());
+      }
     }
     // Replace all existing temporary allocations with the new combined
     // allocations.
@@ -516,123 +809,13 @@ BufferAssignmentProto BufferAssignment::ToProto() const {
   for (const BufferAllocation& allocation : Allocations()) {
     BufferAllocationProto proto_allocation = allocation.ToProto();
     proto.add_buffer_allocations()->Swap(&proto_allocation);
-  }
-  for (const HeapSimulatorTrace& trace : heap_simulator_traces_) {
-    *proto.add_heap_simulator_traces() = trace;
-  }
-  return proto;
-}
-
-namespace {
-
-// Walk the call graph of the HLO module and place each computation into either
-// thread_local_computations or global_computations depending upon whether the
-// computation requires thread-local allocations or global allocations. The
-// elements in thread_local_computations and global_computations are in post
-// order (if computation A has an instruction which calls computation B, then A
-// will appear after B in the vector).
-Status GatherComputationsByAllocationType(
-    const HloModule* module,
-    std::vector<const HloComputation*>* thread_local_computations,
-    std::vector<const HloComputation*>* global_computations) {
-  // Create a worklist of computations paired with whether the allocation must
-  // be thread-local.
-  std::deque<std::pair<const HloComputation*, bool>> worklist;
-  worklist.push_back(std::make_pair(module->entry_computation(),
-                                    /*is_thread_local*/ false));
-
-  // Sets for quickly checking membership. Computations are returned in vectors
-  // for stable iteration.
-  FlatSet<const HloComputation*> thread_local_set;
-  FlatSet<const HloComputation*> global_set;
-
-  while (!worklist.empty()) {
-    auto worklist_front = worklist.front();
-    worklist.pop_front();
-    const HloComputation* computation = worklist_front.first;
-    bool is_thread_local = worklist_front.second;
-    bool in_thread_local_set = thread_local_set.count(computation) > 0;
-    bool in_global_set = global_set.count(computation) > 0;
-
-    // If the computation has already been added to the respective set, then
-    // nothing to do.
-    if ((is_thread_local && in_thread_local_set) ||
-        (!is_thread_local && in_global_set)) {
-      continue;
+    for (const HeapSimulatorTrace& heap_trace : allocation.HeapTraces()) {
+      *proto.add_heap_simulator_traces() = heap_trace;
     }
-
-    // If the computation has already been added to the other set this is an
-    // error condition because the global call to the computation (eg,
-    // while/call) may return a reference to one of the thread-local buffers to
-    // the calling computation which will become a dangling reference when the
-    // thread-local is deallocated with the call return.
-    if ((is_thread_local && in_global_set) ||
-        (!is_thread_local && in_thread_local_set)) {
-      return InvalidArgument(
-          "computation %s has conflicting allocation requirements (global "
-          "and thread-local)",
-          computation->name().c_str());
-    }
-
-    if (is_thread_local) {
-      thread_local_set.insert(computation);
-    } else {
-      global_set.insert(computation);
-    }
-
-    for (auto* instruction : computation->instructions()) {
-      for (HloComputation* subcomputation :
-           instruction->called_computations()) {
-        switch (instruction->opcode()) {
-          case HloOpcode::kCall:
-          case HloOpcode::kConditional:
-          case HloOpcode::kWhile:
-            // Call and while must be called from a computation with global
-            // allocations as they may return references to buffers inside the
-            // called computation which cannot be thread-local.
-            if (is_thread_local) {
-              return InvalidArgument(
-                  "computation %s cannot contain call/while op because it "
-                  "requires thread-local buffer allocations",
-                  computation->name().c_str());
-            }
-            worklist.push_back(std::make_pair(subcomputation,
-                                              false));  // Not thread local.
-            break;
-          case HloOpcode::kMap:
-          case HloOpcode::kReduce:
-          case HloOpcode::kReduceWindow:
-          case HloOpcode::kSelectAndScatter:
-          case HloOpcode::kFusion:
-            // Map/reduce etc computations are always thread-local.
-            worklist.push_back(std::make_pair(subcomputation,
-                                              true));  // Thread local.
-            break;
-          default:
-            return InternalError(
-                "Unexpected calling opcode: %s",
-                HloOpcodeString(instruction->opcode()).c_str());
-        }
-      }
-    }
-  }
-
-  // Add the computations to the vectors in post order.
-  for (auto* computation : module->MakeComputationPostOrder()) {
-    if (thread_local_set.count(computation) > 0) {
-      thread_local_computations->push_back(computation);
-    } else if (global_set.count(computation) > 0) {
-      global_computations->push_back(computation);
-    }
-    // If the computation is not reachable from the entry computation, then it
-    // will not appear in either thread_local_set or global_set. We don't bother
-    // assigning buffers for these.
   }
-  return Status::OK();
+  return proto;
 }
 
-}  // namespace
-
 /* static */
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
@@ -1064,7 +1247,8 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
     assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size);
   }
 
-  assignment->heap_simulator_traces_.push_back(result.debug_trace);
+  VLOG(1) << "Ran heap simulation for allocation: " << allocation->ToString();
+  allocation->AddHeapTrace(result.debug_trace);
 }
 
 // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining
@@ -1085,7 +1269,8 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   if (colocated_set.empty()) {
     return;
   }
-
+  VLOG(5) << ColocatedBufferSetsToString(colocated_set,
+                                         "Adding colocated buffer set");
   // Find existing sets that overlap with at least one buffer from the
   // colocated_set. The resulting 'overlap_set_indices' will have at most
   // colocated_buffer_sets->size() entries, and will be in increasing order.
@@ -1093,6 +1278,10 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) {
     for (const LogicalBuffer* buffer : colocated_set) {
       if ((*colocated_buffer_sets)[index].count(buffer) > 0) {
+        VLOG(5) << "Found overlap with existing set on buffer "
+                << buffer->ToString() << "\n"
+                << ColocatedBufferSetsToString((*colocated_buffer_sets)[index],
+                                               "Overlapping set");
         overlap_set_indices.push_back(index);
         break;
       }
@@ -1104,6 +1293,7 @@ void BufferAssigner::AddSetToColocatedBufferSets(
     colocated_buffer_sets->emplace_back();
     colocated_buffer_sets->back().insert(colocated_set.begin(),
                                          colocated_set.end());
+    VLOG(5) << "No overlap found, new group created";
     return;
   }
 
@@ -1115,6 +1305,8 @@ void BufferAssigner::AddSetToColocatedBufferSets(
     first->insert(overlap_set.begin(), overlap_set.end());
   }
   first->insert(colocated_set.begin(), colocated_set.end());
+  VLOG(5) << ColocatedBufferSetsToString(
+      *first, "Result of the colocated buffer set merging");
 
   // Remove overlap sets that we just merged. The offset accounts for the fact
   // that as elements are erased, the indices need to be adjusted. Keep in mind
@@ -1125,67 +1317,6 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   }
 }
 
-namespace {
-
-// Checks that points-to set of 'instruction' is unambiguous and distinct
-// (ensured by CopyInsertion), then adds the buffer from the points-to set at
-// 'index' to 'colocated_set'.
-const LogicalBuffer* AddBufferToColocatedSet(
-    const HloInstruction* instruction, const ShapeIndex& index,
-    const TuplePointsToAnalysis& points_to_analysis,
-    std::vector<const LogicalBuffer*>* colocated_set) {
-  // CopyInsertion ensures root points-to set is unambiguous and distinct.
-  const auto& points_to = points_to_analysis.GetPointsToSet(instruction);
-  DCHECK(!points_to.IsAmbiguous());
-  colocated_set->push_back(points_to.element(index)[0]);
-  return colocated_set->back();
-}
-
-// Given the interference map of a graph (the list of interfering node indices
-// for each node), perform graph coloring such that interfering nodes are
-// assigned to different colors. Returns the assigned color of the nodes, where
-// the colors are represented as integer values [0, color_count).
-std::vector<int64> ColorInterferenceGraph(
-    const std::vector<std::vector<int64>>& interference_map) {
-  const int64 node_count = interference_map.size();
-
-  // Sort the nodes such that we assign nodes with more interference first. This
-  // relies on the common heuristic of assigning the most constrained node
-  // first, but it would be good to investigate other ordering heuristics too.
-  std::vector<int64> nodes(node_count);
-  std::iota(nodes.begin(), nodes.end(), 0);
-  std::sort(nodes.begin(), nodes.end(),
-            [&interference_map](const int64 i, const int64 j) {
-              return interference_map[i].size() > interference_map[j].size();
-            });
-
-  const int64 kColorUnassigned = -1;
-  std::vector<int64> assigned_colors(node_count, kColorUnassigned);
-  for (int64 node : nodes) {
-    // Mark the colors that are already assigned to the neighbors.
-    std::vector<bool> available_colors(node_count, true);
-    for (int64 neighbor : interference_map[node]) {
-      int64 color = assigned_colors[neighbor];
-      if (color != kColorUnassigned) {
-        available_colors[color] = false;
-      }
-    }
-
-    // Find the color that is not yet assigned to the neighbors.
-    int64 color = kColorUnassigned;
-    for (color = 0; color < available_colors.size(); ++color) {
-      if (available_colors[color]) {
-        break;
-      }
-    }
-    CHECK_NE(color, kColorUnassigned);
-    assigned_colors[node] = color;
-  }
-  return assigned_colors;
-}
-
-}  // namespace
-
 std::vector<BufferAssigner::ColocatedBufferSet>
 BufferAssigner::MergeColocatedBufferSets(
     const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
@@ -1208,26 +1339,35 @@ BufferAssigner::MergeColocatedBufferSets(
   auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
                                    &buffer_size,
                                    &is_entry_parameter](int64 i, int64 j) {
-    for (auto& buffer_a : colocated_buffer_sets[i]) {
-      for (auto& buffer_b : colocated_buffer_sets[j]) {
-        // Do not merge if the set includes live outs or entry parameters.
-        if ((buffer_liveness.MaybeLiveOut(*buffer_a) &&
-             is_entry_parameter(*buffer_b)) ||
-            (buffer_liveness.MaybeLiveOut(*buffer_b) &&
-             is_entry_parameter(*buffer_a))) {
+    // Do not merge if one of the sets includes live outs or entry parameters.
+    for (int64 key : {i, j}) {
+      for (auto& buffer : colocated_buffer_sets[key]) {
+        if (buffer_liveness.MaybeLiveOut(*buffer) ||
+            is_entry_parameter(*buffer)) {
           return true;
         }
-        // Do not merge if the buffers interfere with each other.
+      }
+    }
+
+    // Colocated sets satisfy the invariant that all buffers within a set have
+    // the same size. That means we need to check whether the size is the same
+    // between the two sets, but also that it's enough to look at just one
+    // buffer within each set.
+    if (buffer_size(**colocated_buffer_sets[i].begin()) !=
+        buffer_size(**colocated_buffer_sets[j].begin())) {
+      return true;
+    }
+
+    // Do not merge if some pair of buffers interferes with each other.
+    for (auto& buffer_a : colocated_buffer_sets[i]) {
+      for (auto& buffer_b : colocated_buffer_sets[j]) {
         if (buffer_a->id() != buffer_b->id() &&
             buffer_liveness.MayInterfere(*buffer_a, *buffer_b)) {
           return true;
         }
-        // Do not merge if the buffer sizes are different.
-        if (buffer_size(*buffer_a) != buffer_size(*buffer_b)) {
-          return true;
-        }
       }
     }
+
     return false;
   };
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 6b7fd0014d103ef0617afcc5cb3f663554a01aa4..3086d0e2ca0026547134285b8ceb357390fc7ece 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -192,6 +192,37 @@ class BufferAllocation {
            !is_thread_local();
   }
 
+  // Add a heap trace which was used to assign slices to logical buffers in this
+  // allocation. A single BufferAllocation may include multiple heap traces
+  // in the case of the temporary block where there is a heap trace per
+  // computation.
+  void AddHeapTrace(const HeapSimulatorTrace& heap_trace) {
+    heap_traces_.push_back(heap_trace);
+  }
+
+  // Return the set of heap traces used to assign slices to logical buffers in
+  // this allocation.
+  const std::vector<HeapSimulatorTrace> HeapTraces() const {
+    return heap_traces_;
+  }
+
+  // Compute and return the LogicalBuffers which are live at the point of peak
+  // memory usage for the given allocation. The point of peak memory usage is
+  // the point at which the total size of all live logical buffers is
+  // maximal. If peak memory is reached at multiple points, the set of logical
+  // buffers live at the earliest maximal point is returned. The vector is
+  // stabily asserted by LogicalBuffer::Index.
+  //
+  // The return value is a pair of total size of the logical buffers at peak,
+  // and the buffers themselves.
+  std::pair<int64, std::vector<const LogicalBuffer*>>
+  ComputePeakMemoryLogicalBuffers() const;
+
+  // Get the number of bytes lost to fragmentation. This is equal to the
+  // difference between the size of the allocation and the size of the maximal
+  // live set.
+  int64 fragmentation_bytes() const { return fragmentation_bytes_; }
+
   bool operator==(const BufferAllocation& other) const {
     return index_ == other.index_;
   }
@@ -257,6 +288,9 @@ class BufferAllocation {
   // Mapping from the set of buffers assigned to this allocation to their
   // logical offsets and sizes.
   tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize> assigned_buffers_;
+
+  int64 fragmentation_bytes_ = 0;
+  std::vector<HeapSimulatorTrace> heap_traces_;
 };
 
 // Add stream operators for nicer output of CHECK/RET_CHECK failures.
@@ -441,7 +475,6 @@ class BufferAssignment {
   LogicalBuffer::AlignmentFunction color_alignment_;
 
   Stats stats_;
-  std::vector<HeapSimulatorTrace> heap_simulator_traces_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(BufferAssignment);
 };
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index cd73654b8f666c4b96c000235cc3ad2cd0a46c17..3ec9795a655041548f96487f9f6401dab9e5c58d 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -37,14 +37,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
-
 namespace {
 
+using ::testing::UnorderedElementsAre;
+
 // DFS visitor that collects the instructions referenced by a computation
 // without descending into nested computations, i.e., only from the operands.
 class InstructionListVisitor : public DfsHloVisitorWithDefault {
@@ -101,6 +103,22 @@ class BufferAssignmentTest : public HloTestBase {
         .ConsumeValueOrDie();
   }
 
+  std::unique_ptr<BufferAssignment> RunBufferAssignmentWithInstructionSequence(
+      HloModule* module,
+      tensorflow::gtl::ArraySlice<const HloInstruction*> instruction_sequence,
+      int64 alignment = 1) {
+    SequentialHloOrdering::HloModuleSequence module_sequence;
+    module_sequence[module->entry_computation()] =
+        std::vector<const HloInstruction*>(instruction_sequence.begin(),
+                                           instruction_sequence.end());
+    return BufferAssigner::Run(
+               module,
+               xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
+               backend().compiler()->BufferSizeBytesFunction(),
+               [alignment](LogicalBuffer::Color) { return alignment; })
+        .ConsumeValueOrDie();
+  }
+
   // Builds an x+1.0 computation to use in a Map.
   std::unique_ptr<HloComputation> BuildMapComputationPlus1(const string& name) {
     auto builder = HloComputation::Builder(name);
@@ -1370,7 +1388,7 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) {
   auto element_slices = assignment->GetAllSlices(select, /*index=*/{0});
   EXPECT_EQ(2, element_slices.size());
   EXPECT_THAT(element_slices,
-              ::testing::UnorderedElementsAre(
+              UnorderedElementsAre(
                   assignment->GetUniqueSlice(tuple_param0, /*index=*/{0})
                       .ConsumeValueOrDie(),
                   assignment->GetUniqueSlice(tuple_param1, /*index=*/{0})
@@ -1473,6 +1491,98 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
   }
 }
 
+TEST_F(BufferAssignmentTest, TrivialPeakBuffers) {
+  // paramscalar ------- (mul) -- (add) -- (sub)
+  //                     /        /        /
+  // param0[100] -------/        /        /
+  //                            /        /
+  // param1[100] --------------/--------/
+  auto builder = HloComputation::Builder(TestName());
+  auto paramscalar =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec100_, ""));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32vec100_, ""));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec100_, HloOpcode::kSubtract, add, param1));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  auto buffers = RunBufferAssignment(module.get());
+
+  // Trivially, the set of peak memory logical buffer(s) of an allocation with a
+  // single logical buffer should be exactly the logical buffer in that
+  // allocation.
+  const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul);
+  int64 peak_size;
+  std::vector<const LogicalBuffer*> peak_buffers;
+
+  std::tie(peak_size, peak_buffers) =
+      mul_buffer.ComputePeakMemoryLogicalBuffers();
+  EXPECT_EQ(peak_size, ShapeUtil::ByteSizeOf(f32vec100_));
+  ASSERT_EQ(peak_buffers.size(), 1);
+  EXPECT_EQ(peak_buffers[0]->instruction(), mul);
+}
+
+TEST_F(BufferAssignmentTest, PeakBuffers) {
+  // Compute the peak liveness buffers of the following sequence:
+  //
+  //   %param = ...
+  //   %log = log(%param)
+  //   %rev = reverse(%log)
+  //   %neg = neg(%param)
+  //   %concat = concat(%rev, %neg)
+  //   ROOT %root = slice(concat)
+  //
+  // In the temporary block, the set of live buffers at peak memory use should
+  // be {%rev, %neg, %concat}. This occurs right at the concat itself.
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32vec100_, ""));
+  auto log = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kLog, param));
+  auto rev = builder.AddInstruction(
+      HloInstruction::CreateReverse(f32vec100_, log, {0}));
+  auto neg = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param));
+  const Shape concat_shape = ShapeUtil::MakeShape(F32, {200});
+  auto concat = builder.AddInstruction(
+      HloInstruction::CreateConcatenate(concat_shape, {rev, neg}, 0));
+  // Make the root tiny so no interior nodes can share its buffer.
+  auto root = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {1}), concat, {0}, {1}, {1}));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  auto buffers = RunBufferAssignmentWithInstructionSequence(
+      module.get(), {param, log, rev, neg, concat, root});
+
+  // The temporary buffer should hold the 4 interior instructions.
+  const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, concat);
+  EXPECT_FALSE(buffer.IsInputOrOutput());
+  EXPECT_TRUE(buffer.IsPreallocatedTempBuffer());
+  ASSERT_EQ(buffer.assigned_buffers().size(), 4);
+
+  int64 peak_size;
+  std::vector<const LogicalBuffer*> peak_buffers;
+  std::tie(peak_size, peak_buffers) = buffer.ComputePeakMemoryLogicalBuffers();
+
+  // The peak live set should be concat and its inputs.
+  EXPECT_EQ(peak_size, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(F32, {400})));
+  ASSERT_EQ(peak_buffers.size(), 3);
+  std::vector<const HloInstruction*> peak_instructions;
+  for (const LogicalBuffer* logical_buffer : peak_buffers) {
+    peak_instructions.push_back(logical_buffer->instruction());
+  }
+  EXPECT_THAT(peak_instructions, UnorderedElementsAre(rev, neg, concat));
+}
+
 class WhileBufferAssignmentTest : public HloTestBase {
  protected:
   std::unique_ptr<HloComputation> BuildWhileConditionComputation(
@@ -1531,7 +1641,7 @@ static void RunCopyInsertion(HloModule* module) {
 }
 
 TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
-  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1587,6 +1697,81 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
             assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
 }
 
+// Tests that two colocated buffer sets are not merged if an entry parameter
+// buffer belongs to either of the colocation sets (b/73267882).
+//
+// %param --> %while.0 --> %mul --> %while.1 --> %broadcast
+//
+// %while.0 body just forwards the init value, so the loop carried variable
+// remains the constant, whereas %while.1 changes the loop carried variable.
+TEST_F(WhileBufferAssignmentTest, ColocatedBufferWithEntryParameter) {
+  const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+
+  const char* module_str = R"(
+HloModule test_module
+
+%cond.v0 {
+  %param = s32[] parameter(0)
+  ROOT %constant = pred[] constant(true)
+}
+
+%cond.v1 {
+  %param.0 = s32[] parameter(0)
+  ROOT %constant.0 = pred[] constant(true)
+}
+
+%body.v0 {
+  ROOT %param.1 = s32[] parameter(0)
+}
+
+%body.v1 {
+  %param.2 = s32[] parameter(0)
+  ROOT add = s32[] add(%param.2, %param.2)
+}
+
+ENTRY %test_module {
+  %param.3 = s32[] parameter(0)
+  %while.0 = s32[] while(%param.3), condition=%cond.v0, body=%body.v0
+  %mul = s32[] multiply(%while.0, %while.0)
+  %while.1 = s32[] while(%mul), condition=%cond.v1, body=%body.v1
+  ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+
+  // Run CopyInsertion and check if the graph constructed above doesn't need
+  // any copies inserted for BufferAssignment to run.
+  int64 instruction_count = module->instruction_count();
+  CopyInsertion copy_insertion;
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
+  ASSERT_EQ(instruction_count, module->instruction_count());
+
+  // Get the instructions in the module.
+  const HloInstruction* bcast = module->entry_computation()->root_instruction();
+  const HloInstruction* param =
+      module->entry_computation()->parameter_instruction(0);
+  ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
+  const HloInstruction* while1 = bcast->operand(0);
+  ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
+  const HloInstruction* while0 = while1->operand(0)->operand(0);
+  ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
+
+  // Run buffer assignment.
+  auto assignment = RunBufferAssignment(module.get());
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_param,
+                          assignment->GetUniqueSlice(param, {}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
+                          assignment->GetUniqueSlice(while0, {}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while1,
+                          assignment->GetUniqueSlice(while1, {}));
+
+  // The parameter slice is part of the while0's colocation set (init value),
+  // but not merged into the while1's colocation set.
+  EXPECT_EQ(slice_param, slice_while0);
+  EXPECT_NE(slice_param, slice_while1);
+}
+
 // Tests that the colocated buffers for while instructions are properly assigned
 // during buffer assignment such that the result tuple elements are not assigned
 // to the same buffer.
@@ -1631,7 +1816,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   };
 
   // Build the entry computation as described in the comment above.
-  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder("entry");
 
   auto infeed = builder.AddInstruction(HloInstruction::CreateInfeed(r0s32, ""));
@@ -1699,7 +1884,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
 }
 
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
-  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1744,7 +1929,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
 }
 
 TEST_F(BufferAssignmentTest, TwoCalls) {
-  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
   HloComputation* sub_computation;
   {
@@ -1809,7 +1994,7 @@ static bool IsPostOrderTraversal(
 }
 
 TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
-  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto zero = builder.AddInstruction(
@@ -1888,7 +2073,7 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
 }
 
 TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
-  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 13eb02ca012f44b2b5ed7c6f5becb7d54b07c33c..a8053d15e124319c5c898f0034b9aaa95a007a89 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -51,8 +51,8 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
   return out;
 }
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction) {
-  switch (instruction->opcode()) {
+CallContext GetInstructionCallContext(HloOpcode opcode) {
+  switch (opcode) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
@@ -101,7 +101,7 @@ void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
 
 void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
   CHECK_EQ(instruction->parent(), computation());
-  const CallContext context = GetInstructionCallContext(instruction);
+  const CallContext context = GetInstructionCallContext(instruction->opcode());
   if (!instruction->called_computations().empty()) {
     CHECK(context == CallContext::kSequential ||
           context == CallContext::kParallel);
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 688c4085dfb4f47d3e08a4abee5e7b645f595b11..97d3811508adee1bf2d0942bcc69e3e34a41c8c3 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -53,7 +53,7 @@ enum class CallContext {
 string CallContextToString(CallContext context);
 std::ostream& operator<<(std::ostream& out, const CallContext& context);
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction);
+CallContext GetInstructionCallContext(HloOpcode opcode);
 
 // Represents an HLO instruction which calls one or more computations.
 class CallSite {
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index dab73596e1639eed62151197048ee8d29570b20a..c9f78a0f9f1c0e889cd2c761e3129ec329a7b647 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -37,7 +37,7 @@ limitations under the License.
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
-CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+CompileOnlyService::NewService(se::Platform* platform) {
   ServiceOptions default_options;
   default_options.set_platform(platform);
   return NewService(default_options);
@@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
 
 /* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
 CompileOnlyService::NewService(const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
@@ -61,6 +61,33 @@ CompileOnlyService::CompileOnlyService(const ServiceOptions& options,
                                        Compiler* compiler)
     : Service(options, /*execute_backend=*/nullptr), compiler_(compiler) {}
 
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  for (const AotXlaComputationInstance& instance : computations) {
+    TF_RET_CHECK(instance.computation.has_program_shape());
+
+    const DebugOptions& debug_options = options.debug_options();
+    const auto& program_shape = instance.computation.program_shape();
+    ExecutionOptions execution_options;
+    *execution_options.mutable_debug_options() = debug_options;
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(program_shape, instance.argument_layouts,
+                           &execution_options));
+
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModule> hlo_module,
+        HloModule::CreateFromProto(instance.computation, *module_config));
+    TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module));
+    hlo_modules.push_back(std::move(hlo_module));
+  }
+
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules), options);
+}
+
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CompileOnlyService::CompileAheadOfTime(
     const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
@@ -72,8 +99,7 @@ CompileOnlyService::CompileAheadOfTime(
     VersionedComputationHandle versioned_handle =
         user_computation->GetVersionedHandle();
 
-    // TODO(b/63773457): Track DebugOptions in AotCompilationOptions.
-    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    const DebugOptions& debug_options = options.debug_options();
 
     // Dump computation proto state if flag is set.
     const string& directory_path = debug_options.xla_dump_computations_to();
@@ -101,7 +127,7 @@ CompileOnlyService::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(*program_shape, instance.argument_layouts,
-                           &execution_options, *user_computation));
+                           &execution_options, user_computation));
 
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
                         computation_tracker_.BuildHloModule(
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index 9859941c6c17460939e5b6817f1c7c415e63443c..c10609e67fcdec459baf25a95173bbf700994be9 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -34,7 +34,7 @@ class CompileOnlyService : public Service {
   // platform that the service should target. If platform is null then the
   // default platform is used.
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
-      perftools::gputools::Platform* platform);
+      se::Platform* platform);
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
       const ServiceOptions& options);
 
@@ -53,6 +53,25 @@ class CompileOnlyService : public Service {
       const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
       const AotCompilationOptions& Options);
 
+  // A description of a xla computation to compile using CompileAheadOfTime.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct AotXlaComputationInstance {
+    HloModuleProto computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of xla computations for ahead-of-time execution.  This is
+  // intended for use in static compilation.  See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+      const AotCompilationOptions& options);
+
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
   // computing constants produces global data that we may wish to transfer.
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index e2e9d2a0c048fec6c6ffbeef1223ae0e6aef50d1..8b01a6c4b5004d03e6e7d23b99b923fdcdeaff99 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -23,26 +23,21 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
     tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      Compiler::CompilerFactory>*
+/* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
 Compiler::GetPlatformCompilerFactories() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, CompilerFactory>;
+  static auto* r = new std::map<se::Platform::Id, CompilerFactory>;
   return r;
 }
 
 /* static */
-std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
+std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
 Compiler::GetPlatformCompilers() {
-  static auto* r = new std::map<perftools::gputools::Platform::Id,
-                                std::unique_ptr<Compiler>>;
+  static auto* r = new std::map<se::Platform::Id, std::unique_ptr<Compiler>>;
   return r;
 }
 
@@ -86,4 +81,7 @@ Compiler::GetPlatformCompilers() {
   return compilers->at(platform->id()).get();
 }
 
+AotCompilationOptions::AotCompilationOptions()
+    : debug_options_(legacy_flags::GetDebugOptionsFromFlags()) {}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 74fd24edf88d44b2dfdc87556b0af43987e69e08..5c14591d93cc995a0b75efb14da8ec98d5859ff5 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -70,7 +70,7 @@ class AotCompilationOptions {
   virtual ~AotCompilationOptions() = default;
 
   // Returns the ID of the platform to which these options apply.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Optional allocator that may be used for allocating temp space on the device
   // during compilation.
@@ -79,11 +79,15 @@ class AotCompilationOptions {
     device_allocator_ = device_allocator;
   }
 
+  const DebugOptions& debug_options() const { return debug_options_; }
+  DebugOptions* mutable_debug_options() { return &debug_options_; }
+
  protected:
-  AotCompilationOptions() = default;
+  AotCompilationOptions();
 
  private:
   DeviceMemoryAllocator* device_allocator_ = nullptr;
+  DebugOptions debug_options_;
 };
 
 // Abstract compiler interface that is subclassed for compilation on a
@@ -105,7 +109,7 @@ class Compiler {
   virtual ~Compiler() {}
 
   // Returns the ID of the platform that this compiler targets.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Runs Hlo passes to optimize the given Hlo module, returns the optimized
   // module.
@@ -116,14 +120,13 @@ class Compiler {
   // algorithm over those buffers, to see which variant is fastest.  Any space
   // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
   // applied to module. Generally a module should be passed through RunHloPasses
-  // prior to calling this method because the some HLO passes are required for
+  // prior to calling this method because some HLO passes are required for
   // correctness. Takes ownership of the HLO module and is free to transform it.
   //
   // The compiler may optionally specialize to the individual device
@@ -133,8 +136,7 @@ class Compiler {
   //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
@@ -147,8 +149,7 @@ class Compiler {
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
@@ -167,14 +168,12 @@ class Compiler {
   // be a singleton, so no ownership is transferred.
   //
   // Precondition: a platform kind must not be registered more than once.
-  static void RegisterCompilerFactory(
-      perftools::gputools::Platform::Id platform_id,
-      CompilerFactory compiler_factory);
+  static void RegisterCompilerFactory(se::Platform::Id platform_id,
+                                      CompilerFactory compiler_factory);
 
   // Returns the compiler singleton pointer if it is available for the given
   // platform, or an error status if it is not.
-  static StatusOr<Compiler*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+  static StatusOr<Compiler*> GetForPlatform(const se::Platform* platform);
 
   // Returns a function that computes the size in bytes of the logical
   // buffer that contains a shape.
@@ -194,12 +193,12 @@ class Compiler {
   static tensorflow::mutex platform_compiler_mutex_;
 
   // Map from platform kind to compiler factory.
-  static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
+  static std::map<se::Platform::Id, CompilerFactory>*
   GetPlatformCompilerFactories();
 
   // Map from platform kind to compiler instance, if we made one already (based
   // on the factories above).
-  static std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
+  static std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
   GetPlatformCompilers();
 };
 
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index 657fba6b6231104bf47f9dec80f7cd36a0ba3efd..7c1bacff92b231661477b9931a3066fd91110445 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@@ -132,11 +130,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
     ComputationPlacer::platform_computation_placer_mutex_(
         tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      ComputationPlacer::State>*
+/* static */ std::map<se::Platform::Id, ComputationPlacer::State>*
 ComputationPlacer::GetPlatformComputationPlacers() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
+  static auto* r = new std::map<se::Platform::Id, ComputationPlacer::State>;
   return r;
 }
 
@@ -147,10 +143,10 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
-                                                    &CreateComputationPlacer);
-  xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
-                                                    &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      stream_executor::host::kHostPlatformId, &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 737ccabaa7a61931b6e2787f75b02857562d4820..737d00e93ecb51a9bd544bbcbe99d93374d108fb 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -80,13 +80,13 @@ class ComputationPlacer {
 
   // Registers a computation placer creation function for a particular platform.
   static void RegisterComputationPlacer(
-      perftools::gputools::Platform::Id platform_id,
+      se::Platform::Id platform_id,
       ComputationPlacerCreationFunction creation_function);
 
   // Returns the computation placer singleton pointer if it is available for the
   // given platform, or an error status if it is not.
   static StatusOr<ComputationPlacer*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+      const se::Platform* platform);
 
  private:
   // The mutex that guards the platform-to-computation placer map.
@@ -101,10 +101,9 @@ class ComputationPlacer {
   };
 
   // Map from platform kind to computation placer singleton.
-  static std::map<perftools::gputools::Platform::Id, State>*
-  GetPlatformComputationPlacers();
+  static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
 
-  perftools::gputools::Platform::Id platform_id_;
+  se::Platform::Id platform_id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
 };
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e560abc87f84566905333181c159edd3ca297563
--- /dev/null
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -0,0 +1,106 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+
+// Tries to replace a conditional with a call operation of the corresponding
+// computation. If the given conditional has a constant predicate, tries to
+// replace it with a call to its true/false computation as appropirate and then
+// inline that computation.
+//
+// Returns true if it made a change to the graph.
+static StatusOr<bool> TryRemoveConditional(HloInstruction* conditional) {
+  CHECK_EQ(conditional->opcode(), HloOpcode::kConditional);
+  // Do not remove conditionals that contain side-effecting instructions or
+  // have control predecessors/successors in either true/false computation.
+  if (!conditional->parent()->IsRemovable(conditional) ||
+      conditional->HasSideEffect()) {
+    VLOG(2) << "Not attempting to remove conditional as it is not removable or "
+               "has side effect: "
+            << conditional->ToShortString();
+    return false;
+  }
+
+  if (conditional->operand(0)->opcode() != HloOpcode::kConstant) {
+    VLOG(2) << "Not attempting to remove conditional as its predicate is not a "
+               "compile-time constant: "
+            << conditional->ToShortString();
+    return false;
+  }
+
+  auto computation = conditional->parent();
+  HloInstruction* call_op;
+  if (conditional->operand(0)->literal().Get<bool>({})) {
+    call_op = computation->AddInstruction(HloInstruction::CreateCall(
+        conditional->shape(), {conditional->mutable_operand(1)},
+        conditional->true_computation()));
+  } else {
+    call_op = computation->AddInstruction(HloInstruction::CreateCall(
+        conditional->shape(), {conditional->mutable_operand(2)},
+        conditional->false_computation()));
+  }
+  conditional->SetupDerivedInstruction(call_op);
+  TF_RETURN_IF_ERROR(computation->ReplaceInstruction(conditional, call_op));
+  TF_RETURN_IF_ERROR(CallInliner::Inline(call_op).status());
+
+  return true;
+}
+
+StatusOr<bool> ConditionalSimplifier::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      3, "ConditionalSimplifier::Run(), before:\n" + module->ToString());
+  bool changed = false;
+
+  // Gather all the conditional ops in our module. We do this ahead of time so
+  // we don't have to worry about mutating the lists of computations or
+  // instructions as we iterate.
+  std::vector<HloInstruction*> conditional_ops;
+  for (auto* comp : module->computations()) {
+    for (auto* instr : comp->instructions()) {
+      if (instr->opcode() == HloOpcode::kConditional) {
+        conditional_ops.push_back(instr);
+      }
+    }
+  }
+
+  for (HloInstruction* conditional_op : conditional_ops) {
+    TF_ASSIGN_OR_RETURN(bool result, TryRemoveConditional(conditional_op));
+    changed |= result;
+  }
+
+  XLA_VLOG_LINES(3,
+                 "ConditionalSimplifier::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.h b/tensorflow/compiler/xla/service/conditional_simplifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..063261e26d06e21a297e8e3c405898a17221b7ca
--- /dev/null
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+
+// HLO pass that removes kConditional with a constant predicate, replacing them
+// with their true or false computation as appropriate.
+class ConditionalSimplifier : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override {
+    return "simplify-conditional";
+  }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..868348547d9f5cbdc7576c7fc0697d72c3a3e557
--- /dev/null
+++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ConditionalSimplifierTest : public HloVerifiedTestBase {
+ public:
+  // Makes a computation that contains a conditional with constant predicate.
+  HloComputation* MakeConditional(HloModule* module);
+};
+
+HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) {
+  HloComputation::Builder builder(TestName());
+
+  // true_computation returns param+1.
+  HloComputation* true_computation;
+  {
+    HloComputation::Builder true_computation_builder(TestName() +
+                                                     ".true_computation");
+    auto param =
+        true_computation_builder.AddInstruction(HloInstruction::CreateParameter(
+            0, ShapeUtil::MakeShape(S32, {}), "param"));
+    auto one = true_computation_builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
+
+    true_computation_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, param, one));
+
+    true_computation =
+        module->AddEmbeddedComputation(true_computation_builder.Build());
+  }
+
+  // false_computation returns param+42.
+  HloComputation* false_computation;
+  {
+    HloComputation::Builder false_computation_builder(TestName() +
+                                                      ".false_computation");
+    auto param = false_computation_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(S32, {}),
+                                        "param"));
+    auto forty_two = false_computation_builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
+
+    false_computation_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, param, forty_two));
+    false_computation =
+        module->AddEmbeddedComputation(false_computation_builder.Build());
+  }
+
+  auto false_instrn = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto false_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(S32, {}), "false_param"));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
+
+  builder.AddInstruction(HloInstruction::CreateConditional(
+      ShapeUtil::MakeShape(S32, {}), false_instrn, one, true_computation,
+      false_param, false_computation));
+
+  return module->AddEntryComputation(builder.Build());
+}
+
+TEST_F(ConditionalSimplifierTest, ConditionalGetsInlined) {
+  HloComputation* computation = MakeConditional(&module());
+  ASSERT_TRUE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Parameter(), op::Constant()));
+}
+
+TEST_F(ConditionalSimplifierTest, ConditionalWithControlDependency) {
+  HloComputation* computation = MakeConditional(&module());
+
+  auto* true_op = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  TF_ASSERT_OK(
+      true_op->AddControlDependencyTo(computation->root_instruction()));
+
+  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) {
+  HloComputation* computation = MakeConditional(&module());
+  auto* conditional = computation->root_instruction();
+  ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
+
+  auto* true_computation = conditional->true_computation();
+  auto* send = true_computation->AddInstruction(HloInstruction::CreateSend(
+      true_computation->AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<bool>(true))),
+      /*channel_id=*/0));
+  true_computation->AddInstruction(HloInstruction::CreateSendDone(send));
+  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) {
+  HloComputation* computation = MakeConditional(&module());
+  auto* conditional = computation->root_instruction();
+  ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
+
+  auto* true_computation = conditional->true_computation();
+  auto* recv = true_computation->AddInstruction(HloInstruction::CreateRecv(
+      ShapeUtil::MakeShape(F32, {1}), /*channel_id=*/0));
+  true_computation->AddInstruction(HloInstruction::CreateRecvDone(recv));
+  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) {
+  HloComputation* computation = MakeConditional(&module());
+  auto* conditional = computation->root_instruction();
+  ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
+  auto* false_computation = conditional->false_computation();
+  false_computation->AddInstruction(
+      HloInstruction::CreateInfeed(ShapeUtil::MakeShape(F32, {1}), "config"));
+  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index cc195879a6bb490a9b49ad962aa9326cb51d9b0a..40519ecc799c8f0343294ad88009820dbd8535e9 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -58,6 +58,46 @@ bool ValueIsReadOnly(const HloValue& value) {
   return IsConstantValue(value) || IsEntryParameterValue(value);
 }
 
+// Data structure describing the action which should be taken on parts of a
+// computation buffers, with respect to the adding of special case copies.
+struct SpecialCaseCopyPolicy {
+  // Insert a copy if the same buffer is found at multiple indices within the
+  // output tuple.
+  bool copy_root_replicated_buffers = false;
+  // If true, insert a copy if a buffer coming from a constant or a parameter
+  // is found wihtin the output tuple.
+  bool copy_parameters_and_constants = false;
+};
+
+SpecialCaseCopyPolicy GetSpecialCaseCopyPolicy(const CallGraphNode& node,
+                                               HloModule* module,
+                                               HloComputation* computation) {
+  SpecialCaseCopyPolicy policy;
+  if (computation == module->entry_computation()) {
+    policy.copy_parameters_and_constants = true;
+    policy.copy_root_replicated_buffers = true;
+  }
+  for (const CallSite& site : node.caller_callsites()) {
+    // The AddCopiesForConditional() already adds copies, but the copy remover
+    // removes them, so we re-add them by returning the policy here. But really
+    // the copy remover should not be removing them.
+    if (site.instruction()->opcode() == HloOpcode::kConditional) {
+      policy.copy_parameters_and_constants = true;
+      policy.copy_root_replicated_buffers = true;
+    }
+  }
+  return policy;
+}
+
+bool ShouldCopyRootValue(const HloValue& value,
+                         const SpecialCaseCopyPolicy& policy) {
+  if (policy.copy_parameters_and_constants) {
+    return IsConstantValue(value) ||
+           value.defining_instruction()->opcode() == HloOpcode::kParameter;
+  }
+  return false;
+}
+
 // Deep copy the given instructions 'from' and 'to' at the ShapeIndexes given in
 // 'indices_to_copy'. Add control edges from the respective kCopy instructions
 // in deep copy of 'from' to the respective kCopy instruction in the deep copy
@@ -282,6 +322,29 @@ Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis,
   return Status::OK();
 }
 
+// We add copies for all the indices of the true and false computaiton roots,
+// in order to resolve interference. We later rely on the CopyRemover to drop
+// the unnecessary ones.
+Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
+                               HloInstruction* conditional) {
+  VLOG(2) << "Adding copies for kConditional instruction "
+          << conditional->name();
+  TF_RET_CHECK(conditional->opcode() == HloOpcode::kConditional);
+
+  for (HloComputation* computation :
+       {conditional->true_computation(), conditional->false_computation()}) {
+    HloInstruction* root = computation->root_instruction();
+    std::vector<HloInstruction*> users = root->users();
+    TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy,
+                        computation->DeepCopyInstruction(root));
+    for (HloInstruction* user : users) {
+      TF_RETURN_IF_ERROR(root->ReplaceUseWith(user, deep_copy));
+    }
+    computation->set_root_instruction(deep_copy);
+  }
+  return Status::OK();
+}
+
 // Removes any control dependencies to or from the given instruction.
 Status StripControlDependenciesFrom(HloInstruction* instruction) {
   while (!instruction->control_successors().empty()) {
@@ -309,6 +372,9 @@ Status AddCopiesToResolveInterference(HloModule* module) {
     for (HloInstruction* instruction : computation->instructions()) {
       if (instruction->opcode() == HloOpcode::kWhile) {
         TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
+      } else if (instruction->opcode() == HloOpcode::kConditional) {
+        TF_RETURN_IF_ERROR(
+            AddCopiesForConditional(*alias_analysis, instruction));
       }
     }
   }
@@ -557,6 +623,7 @@ class CopyRemover {
 
       auto is_live_range_before = [this](const ValueNode& a,
                                          const ValueNode& b) {
+        VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value;
         if (LiveRangeBefore(a, b)) {
           VLOG(2) << "  Live range of " << a.value->ToShortString()
                   << " is before " << b.value->ToShortString();
@@ -571,7 +638,7 @@ class CopyRemover {
       VLOG(3) << copy->name() << " copies value "
               << src->value->ToShortString();
       VLOG(3) << "Source buffer values: " << ValueListToString(src);
-      VLOG(3) << "Dest buffer values: " << ValueListToString(src);
+      VLOG(3) << "Dest buffer values: " << ValueListToString(dest);
 
       // A kCopy instruction copies an HLO value from a source buffer and
       // defines an HLO value in a destination buffer. Most generally, the
@@ -747,16 +814,16 @@ class CopyRemover {
     // updated as copies are removed.
     bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) {
       if (a.uses.empty()) {
-        VLOG(2) << "Empty uses";
+        VLOG(2) << "Empty uses for " << *a.value;
         return ordering_.IsDefinedBefore(*a.value, *b.value);
       }
       for (const HloUse* use : a.uses) {
-        VLOG(2) << "use: " << *use;
-        VLOG(2) << "is before:" << *b.value;
+        VLOG(2) << "Checking use " << *use << " against " << *b.value;
         if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) {
-          VLOG(2) << "Not before";
+          VLOG(2) << "Use " << *use << " is NOT before " << *b.value;
           return false;
         }
+        VLOG(2) << "Use " << *use << " is before " << *b.value;
       }
       return true;
     }
@@ -892,7 +959,6 @@ Status RemoveUnnecessaryCopies(
   CopyRemover copy_remover(*alias_analysis, ordering, module);
   XLA_VLOG_LINES(3, copy_remover.ToString());
 
-  tensorflow::gtl::FlatSet<int> existing_copies;
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
       if (instruction->opcode() == HloOpcode::kCopy &&
@@ -901,7 +967,6 @@ Status RemoveUnnecessaryCopies(
       }
     }
   }
-
   return Status::OK();
 }
 
@@ -921,7 +986,7 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
 
   // Identify which shape indices of which instructions need to be copied. Store
   // these results in 'instructions_to_copy'.
-  std::unordered_map<HloInstruction*, ShapeTree<bool>> instructions_to_copy;
+  HloInstructionMap<ShapeTree<bool>> instructions_to_copy;
   auto add_index_to_copy = [&instructions_to_copy](HloInstruction* instruction,
                                                    const ShapeIndex& index) {
     auto it = instructions_to_copy.find(instruction);
@@ -957,7 +1022,8 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
     }
     TF_RET_CHECK(node.context() == CallContext::kSequential);
 
-    const bool is_entry = computation == module->entry_computation();
+    SpecialCaseCopyPolicy policy =
+        GetSpecialCaseCopyPolicy(node, module, computation);
     HloInstruction* root = computation->root_instruction();
 
     // Mark nondistinct/ambiguous indices.
@@ -970,27 +1036,26 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
           for (const HloBuffer* buffer : buffers_at_index) {
             buffer_seen_before |= !seen.insert(buffer).second;
           }
-          if (buffers_at_index.size() > 1 || (buffer_seen_before && is_entry)) {
-            VLOG(2) << "Index " << index << " of root of computation "
+          if (buffers_at_index.size() > 1 ||
+              (buffer_seen_before && policy.copy_root_replicated_buffers)) {
+            VLOG(2) << "Index " << index << " of computation "
                     << computation->name() << " (" << root->name()
                     << ") has ambiguous or non-distinct buffer. Copying.";
             add_index_to_copy(root, index);
           }
         });
 
-    // For entry instructions, mark any parameter or constant values.
-    if (is_entry) {
-      for (const auto& pair :
-           alias_analysis->dataflow_analysis().GetInstructionValueSet(root)) {
-        const ShapeIndex& index = pair.first;
-        const HloValueSet& value_set = pair.second;
-        for (const HloValue* value : value_set.values()) {
-          if (ValueIsReadOnly(*value)) {
-            VLOG(2) << "Root of entry computation (" << root->name()
-                    << ") has constant or entry parameter value at index "
-                    << index << ". Copying.";
-            add_index_to_copy(root, index);
-          }
+    for (const auto& pair :
+         alias_analysis->dataflow_analysis().GetInstructionValueSet(root)) {
+      const ShapeIndex& index = pair.first;
+      const HloValueSet& value_set = pair.second;
+      for (const HloValue* value : value_set.values()) {
+        if (ShouldCopyRootValue(*value, policy)) {
+          VLOG(2) << "Root of (" << root->name() << ") of computation("
+                  << computation->name()
+                  << ") has constant or parameter value at index " << index
+                  << ". Copying.";
+          add_index_to_copy(root, index);
         }
       }
     }
@@ -1012,7 +1077,6 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
       instruction->parent()->set_root_instruction(deep_copy);
     }
   }
-
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index c13a0b1cdf0b5be0b69db98b2b9587f30ca4c304..2fc6c6bd55157521328a1f5a056a399282e79db5 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -18,6 +18,10 @@ load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -85,12 +89,10 @@ cc_library(
         ":cpu_instruction_fusion",
         ":cpu_layout_assignment",
         ":cpu_options",
-        ":cpu_parallelization_preparation",
         ":disassembler",
         ":dot_op_emitter",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":parallel_cpu_executable",
         ":parallel_task_assignment",
         ":simple_orc_jit",
         "//tensorflow/compiler/xla:literal_util",
@@ -105,9 +107,11 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
+        "//tensorflow/compiler/xla/service:gather_expander",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
@@ -163,10 +167,13 @@ cc_library(
         ":disassembler",
         ":external_constant_pool",
         ":orc_jit_memory_mapper",
+        ":runtime_fp16",
         ":runtime_conv2d",
+        ":runtime_conv2d_mkl",
         ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
@@ -182,6 +189,20 @@ cc_library(
     ] + ORC_JIT_MEMORY_MAPPER_TARGETS,
 )
 
+cc_library(
+    name = "runtime_fp16",
+    srcs = [
+        "runtime_fp16.cc",
+    ],
+    hdrs = [
+        "runtime_fp16.h",
+    ],
+    copts = runtime_copts(),
+    deps = [
+        "//tensorflow/core:framework_lite",
+    ],
+)
+
 cc_library(
     name = "cpu_executable",
     srcs = ["cpu_executable.cc"],
@@ -210,35 +231,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "parallel_cpu_executable",
-    srcs = ["parallel_cpu_executable.cc"],
-    hdrs = [
-        "parallel_cpu_executable.h",
-    ],
-    deps = [
-        ":cpu_runtime",
-        ":shape_partition",
-        ":simple_orc_jit",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
-        "//tensorflow/compiler/xla/service:executable",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_execution_profile",
-        "//tensorflow/compiler/xla/service:logical_buffer",
-        "//tensorflow/compiler/xla/service:shaped_buffer",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
-        "@llvm//:orc_jit",
-    ],
-)
-
 cc_library(
     name = "ir_emitter",
     srcs = [
@@ -479,6 +471,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_conv2d_mkl",
+    srcs = [
+        "runtime_conv2d_mkl.cc",
+    ],
+    hdrs = ["runtime_conv2d_mkl.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":runtime_conv2d",
+        ":runtime_single_threaded_conv2d",
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/kernels:eigen_helpers",
+        "//third_party/eigen3",
+    ] + if_mkl([
+        "@mkl_dnn",
+        "//third_party/mkl:intel_binary_blob",
+    ]),
+)
+
 cc_library(
     name = "runtime_fft",
     srcs = [
@@ -499,7 +512,6 @@ cc_library(
 
 cc_library(
     name = "runtime_matvec",
-    srcs = ["runtime_matvec.cc"],
     hdrs = ["runtime_matvec.h"],
     copts = runtime_copts(),
     deps = [
@@ -522,6 +534,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_matmul_mkl",
+    srcs = ["runtime_matmul_mkl.cc"],
+    hdrs = ["runtime_matmul_mkl.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:framework_lite",
+        "//third_party/eigen3",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
+    ]),
+)
+
 cc_library(
     name = "runtime_single_threaded_conv2d",
     srcs = [
@@ -568,10 +596,12 @@ cc_library(
 tf_cc_test(
     name = "cpu_runtime_test",
     srcs = ["cpu_runtime_test.cc"],
+    shard_count = 10,
     tags = ["optonly"],
     deps = [
         ":cpu_runtime",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_matmul",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:types",
@@ -594,6 +624,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -622,25 +653,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cpu_parallelization_preparation",
-    srcs = ["cpu_parallelization_preparation.cc"],
-    hdrs = [
-        "cpu_parallelization_preparation.h",
-    ],
-    deps = [
-        ":ir_emission_utils",
-        ":parallel_task_assignment",
-        ":shape_partition",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_cost_analysis",
-        "//tensorflow/compiler/xla/service:hlo_pass",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "ir_emission_utils",
     srcs = ["ir_emission_utils.cc"],
@@ -654,6 +666,22 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "ir_emission_utils_test",
+    srcs = ["ir_emission_utils_test.cc"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
 cc_library(
     name = "cpu_layout_assignment",
     srcs = ["cpu_layout_assignment.cc"],
@@ -756,6 +784,31 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "parallel_task_assignment_test",
+    srcs = ["parallel_task_assignment_test.cc"],
+    deps = [
+        ":cpu_executable",
+        ":parallel_task_assignment",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:algebraic_simplifier",
+        "//tensorflow/compiler/xla/service:computation_layout",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "cpu_options",
     srcs = ["cpu_options.cc"],
@@ -859,17 +912,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index ed290fcdf8bb69f1bbad57fa5a0926376bc9405a..6a7eb85e3baec3517b8f3ddef6a8dcfae9c9e614 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -25,11 +25,11 @@ limitations under the License.
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO.h"
@@ -93,8 +93,8 @@ class FilteredPassManager : public llvm::legacy::PassManager {
 };
 }  // anonymous namespace
 
-llvm::object::OwningBinary<llvm::object::ObjectFile> CompilerFunctor::
-operator()(llvm::Module& module) const {
+std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
+    llvm::Module& module) const {
   FilteredPassManager module_passes(disable_expensive_passes_);
   FilteredFunctionPassManager function_passes(&module,
                                               disable_expensive_passes_);
@@ -157,27 +157,8 @@ operator()(llvm::Module& module) const {
   codegen_passes.run(module);
 
   // Construct ObjectFile from machine code buffer.
-  std::unique_ptr<llvm::MemoryBuffer> memory_buffer(
-      new llvm::ObjectMemoryBuffer(std::move(stream_buffer)));
-  llvm::Expected<std::unique_ptr<llvm::object::ObjectFile>>
-      object_file_or_error = llvm::object::ObjectFile::createObjectFile(
-          memory_buffer->getMemBufferRef());
-  CHECK(object_file_or_error);
-
-  std::unique_ptr<llvm::object::ObjectFile> object_file =
-      std::move(object_file_or_error.get());
-  if (VLOG_IS_ON(2)) {
-    StatusOr<DisassemblerResult> disassembly_status =
-        disassembler_->DisassembleObjectFile(*object_file);
-    if (disassembly_status.ok()) {
-      auto result = disassembly_status.ValueOrDie();
-      XLA_VLOG_LINES(2, result.text);
-      VLOG(2) << "compiled code size: " << result.code_size_bytes << " bytes";
-    }
-  }
-
-  return llvm::object::OwningBinary<llvm::object::ObjectFile>(
-      std::move(object_file), std::move(memory_buffer));
+  return std::unique_ptr<llvm::MemoryBuffer>(
+      new llvm::SmallVectorMemoryBuffer(std::move(stream_buffer)));
 }
 
 static std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl() {
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.h b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
index 1a8283a702223a7414c1ffcd99c1ac42c04ac068..c38b896c5019b48fd2a16a51abd59e12ebdb29eb 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.h
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
@@ -47,7 +47,7 @@ class CompilerFunctor {
         post_optimization_hook_(post_optimization_hook) {}
 
   // Compile a Module to an ObjectFile.
-  llvm::object::OwningBinary<llvm::object::ObjectFile> operator()(
+  std::unique_ptr<llvm::MemoryBuffer> operator()(
       llvm::Module& module) const;  // NOLINT
 
  private:
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index f9cc9651846cca7bd6ab7e9e61590cec4e2400da..ec2bb6c762d0bbb1d28f4db00def04afa895d13d 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -47,6 +47,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
@@ -55,17 +56,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
 #include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h"
-#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
+#include "tensorflow/compiler/xla/service/gather_expander.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
@@ -98,8 +98,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -120,10 +118,12 @@ se::Platform::Id CpuAotCompilationOptions::PlatformId() const {
 
 CpuAotCompilationResult::CpuAotCompilationResult(
     ObjectFileData object_file_data, BufferSizes buffer_sizes,
-    int64 result_buffer_index)
+    int64 result_buffer_index,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data)
     : object_file_data_(std::move(object_file_data)),
       buffer_sizes_(std::move(buffer_sizes)),
-      result_buffer_index_(result_buffer_index) {}
+      result_buffer_index_(result_buffer_index),
+      hlo_profile_printer_data_(std::move(hlo_profile_printer_data)) {}
 
 CpuAotCompilationResult::~CpuAotCompilationResult() = default;
 
@@ -173,14 +173,13 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
  public:
   static StatusOr<std::unordered_map<const HloInstruction*, int64>>
   GetCandidatesForComputation(
-      HloComputation* computation,
+      const HloComputation& computation,
       const std::unordered_map<const HloInstruction*, int64>&
           assigned_indices) {
     std::unordered_map<const HloInstruction*, int64> hlo_to_profile_idx;
     CollectProfileCandidates profile_candidates_for_computation(
         &hlo_to_profile_idx, assigned_indices);
-    TF_RETURN_IF_ERROR(
-        computation->Accept(&profile_candidates_for_computation));
+    TF_RETURN_IF_ERROR(computation.Accept(&profile_candidates_for_computation));
     return hlo_to_profile_idx;
   }
 
@@ -275,6 +274,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     pass.AddPass<HloDCE>();
     pass.AddPass<ReshapeMover>();
     pass.AddPass<HloConstantFolding>();
+    pass.AddPass<ConditionalSimplifier>();
   }
   pipeline.AddPass<TransposeFolding>(
       [](const HloInstruction& dot,
@@ -287,6 +287,8 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
   pipeline.AddPass<CpuInstructionFusion>();
 
+  pipeline.AddPass<GatherExpander>();
+
   ReducePrecisionInsertion::AddPasses(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
@@ -306,15 +308,12 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
       module->config().intra_op_parallelism_threads() > 0
           ? module->config().intra_op_parallelism_threads()
           : tensorflow::port::NumSchedulableCPUs();
-  if (options::CpuParallelBackendRequested(module->config())) {
-    pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
-                                                 ShapeSizeBytesFunction());
-  } else if (!is_aot_compile) {
+  if (!is_aot_compile) {
     // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module.
     // Note this is not run for AOT because it would bring in thread pool
     // and thread synchronization dependencies which would likely increase
     // binary size (and most AOT applications are single-threaded).
-    // TODO(29630486) Support multi-threaded AOT.
+    // TODO(b/29630486) Support multi-threaded AOT.
     pipeline.AddPass<ParallelTaskAssigner>(max_parallelism,
                                            ShapeSizeBytesFunction());
   }
@@ -327,13 +326,6 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
   pipeline.AddPass<CpuCopyInsertion>();
-  if (options::CpuParallelBackendRequested(module->config())) {
-    // Re-run the outlining, in case any copies were inserted into the entry
-    // computation.
-    pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
-                                                 ShapeSizeBytesFunction());
-    pipeline.AddPass<CpuCopyInsertion>();
-  }
   pipeline.AddPass<HloDCE>();
   return pipeline.Run(module).status();
 }
@@ -433,11 +425,45 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
   return Status::OK();
 }
 
+Status CreateHloProfilingArtifacts(
+    const HloModule& module,
+    std::unordered_map<const HloInstruction*, int64>*
+        instruction_to_profile_idx,
+    std::unordered_map<const HloComputation*, int64>*
+        computation_to_profile_idx,
+    std::unique_ptr<HloProfileIndexMap>* hlo_profile_index_map,
+    std::unique_ptr<HloProfilePrinterData>* hlo_profile_printer_data) {
+  *hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(module);
+  const HloComputation& entry_computation = *module.entry_computation();
+
+  TF_ASSIGN_OR_RETURN(
+      *instruction_to_profile_idx,
+      CollectProfileCandidates::GetCandidatesForComputation(
+          entry_computation,
+          (*hlo_profile_index_map)->instruction_to_profile_idx()));
+
+  auto shape_size_bytes = [](const Shape& shape) {
+    // On the cpu, opaques are pointers.
+    if (ShapeUtil::IsOpaque(shape)) {
+      return static_cast<int64>(sizeof(void*));
+    }
+    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+  };
+
+  HloCostAnalysis cost_analysis(shape_size_bytes);
+  TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis));
+  *hlo_profile_printer_data =
+      CreateHloProfilePrinterData(**hlo_profile_index_map, cost_analysis);
+  *computation_to_profile_idx =
+      (*hlo_profile_index_map)->computation_to_profile_idx();
+
+  return Status::OK();
+}
+
 }  // namespace
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* /*stream_exec*/,
+    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
     DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
@@ -450,8 +476,7 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 }
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
-    std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* stream_exec,
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* /*device_allocator*/) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
@@ -489,28 +514,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
   std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data;
   if (module->config().hlo_profiling_enabled()) {
-    hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
-
-    TF_ASSIGN_OR_RETURN(
-        instruction_to_profile_idx,
-        CollectProfileCandidates::GetCandidatesForComputation(
-            entry_computation,
-            hlo_profile_index_map->instruction_to_profile_idx()));
-
-    auto shape_size_bytes = [](const Shape& shape) {
-      // On the cpu, opaques are pointers.
-      if (ShapeUtil::IsOpaque(shape)) {
-        return static_cast<int64>(sizeof(void*));
-      }
-      return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
-    };
-
-    HloCostAnalysis cost_analysis(shape_size_bytes);
-    TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
-    hlo_profile_printer_data =
-        CreateHloProfilePrinterData(*hlo_profile_index_map, cost_analysis);
-    computation_to_profile_idx =
-        hlo_profile_index_map->computation_to_profile_idx();
+    TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts(
+        *module, &instruction_to_profile_idx, &computation_to_profile_idx,
+        &hlo_profile_index_map, &hlo_profile_printer_data));
   }
 
   std::unique_ptr<Executable> cpu_executable;
@@ -522,190 +528,80 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   const string xla_dump_optimized_hlo_proto_to =
       module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
-  if (options::CpuParallelBackendRequested(module->config())) {
-    VLOG(1) << "Using parallel cpu backend";
-
-    // Run buffer analysis on the HLO graph. This analysis figures out which
-    // temporary buffers are required to run the computation.
-    // DependencyHloOrdering is used for the parallel emitter because the order
-    // of HLO instruction execution is not known ahead of time.
-    // DependencyHloOrdering is the most conservative partial order and only
-    // uses data dependencies for determining order.
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(
-            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()),
-            BufferSizeBytesFunction(), memory_alignment));
-    // BufferAssignment::ToString() includes a header, so no need for us to
-    // print one ourselves.
-    XLA_VLOG_LINES(2, assignment->ToString());
-
-    if (!xla_dump_optimized_hlo_proto_to.empty()) {
-      HloProto proto = MakeHloProto(*module, *assignment);
-      TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_optimized_hlo_proto_to, module->name()));
-    }
-
-    // If we are using the parallel CPU backend, we need to create map from
-    // HloInstruction to the corresponding generated function name.
-    std::map<HloComputation*, HloInstruction*> parallel_computations;
-    std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
-        aligned_constants;
-    for (auto instruction : entry_computation->MakeInstructionPostOrder()) {
-      // Parameters and constants don't get their own computation.
-      if (instruction->opcode() == HloOpcode::kParameter) {
-        continue;
-      }
-      if (instruction->opcode() == HloOpcode::kConstant) {
-        // Copy the constant out of the ProtocolBuffer so that we can give it a
-        // higher alignment.
-        const void* data = instruction->literal().untyped_data();
-        int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape());
-        auto iter = aligned_constants.emplace(
-            instruction, xla::MakeUnique<unsigned char[]>(size));
-        CHECK_EQ(iter.second, true);
-        unsigned char* aligned_data = iter.first->second.get();
-        memcpy(aligned_data, data, size);
-        continue;
-      }
-      // The parallel preparation should have ensured that the top-level
-      // computation consists solely of Call instructions.
-      TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall)
-          << module->ToString();
-      HloComputation* to_apply = instruction->to_apply();
-      parallel_computations.emplace(to_apply, instruction);
-    }
-
-    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         std::move(instruction_to_profile_idx),
-                         std::move(computation_to_profile_idx),
-                         jit->target_machine(), jit->external_constant_pool());
-
-    std::unique_ptr<HloInstructionMap<string>> function_names(
-        new HloInstructionMap<string>());
-    for (auto embedded_computation :
-         entry_computation->MakeEmbeddedComputationsList()) {
-      if (embedded_computation->IsFusionComputation()) {
-        continue;
-      }
-      auto parallel_computation_iter =
-          parallel_computations.find(embedded_computation);
-      // All parallel computations are considered to be an entry computation for
-      // IR generation purposes.
-      bool computation_is_parallel =
-          parallel_computation_iter != parallel_computations.end();
-      TF_ASSIGN_OR_RETURN(
-          llvm::Function * ir_function,
-          ir_emitter.EmitComputation(
-              embedded_computation, embedded_computation->name(),
-              /*is_top_level_computation=*/computation_is_parallel,
-              /*instruction_order=*/nullptr));
-      // If this computation is parallel, remember it in the function name map.
-      // This way we know what function to execute when we try to run code for
-      // the Call instruction.
-      if (computation_is_parallel) {
-        HloInstruction* call_instruction = parallel_computation_iter->second;
-        InsertOrDie(function_names.get(), call_instruction,
-                    llvm_ir::AsString(ir_function->getName()));
-      }
-    }
-
-    string ir_module_string;
-    if (embed_ir_in_executable) {
-      ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
-    }
-    TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
-
-    // JIT compile the LLVM IR module to in-memory machine code.
-    jit->AddModule(std::move(llvm_module));
-    cpu_executable.reset(new ParallelCpuExecutable(
-        std::move(jit), std::move(assignment), std::move(module),
-        std::move(function_names), std::move(aligned_constants),
-        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
-
-    if (embed_ir_in_executable) {
-      static_cast<CpuExecutable&>(*cpu_executable)
-          .set_ir_module_string(ir_module_string);
-    }
-  } else {
-    VLOG(1) << "Using sequential cpu backend";
-
-    // Select an order for emitting the HLO instructions for each
-    // computation. Using this sequence enables tighter buffer liveness analysis
-    // and reduced memory usage (as compared to using DependencyHloOrdering).
-    TF_ASSIGN_OR_RETURN(
-        SequentialHloOrdering::HloModuleSequence module_sequence,
-        CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction()));
-
-    // Run buffer analysis on the HLO graph. This analysis figures out which
-    // temporary buffers are required to run the computation.
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(module.get(),
-                            xla::MakeUnique<SequentialHloOrdering>(
-                                module.get(), module_sequence),
-                            BufferSizeBytesFunction(), memory_alignment));
-    // BufferAssignment::ToString() includes a header, so no need for us to
-    // print one ourselves.
-    XLA_VLOG_LINES(2, assignment->ToString());
-
-    if (!xla_dump_optimized_hlo_proto_to.empty()) {
-      HloProto proto = MakeHloProto(*module, *assignment);
-      TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_optimized_hlo_proto_to, module->name()));
-    }
-
-    // Each computation is a single function.  Emit all embedded computations
-    // before the entry computation. The order of computations returned from
-    // GetEmbeddedComputations guarantees that a called computation occurs
-    // before a caller computation.
+  // Select an order for emitting the HLO instructions for each
+  // computation. Using this sequence enables tighter buffer liveness analysis
+  // and reduced memory usage (as compared to using DependencyHloOrdering).
+  TF_ASSIGN_OR_RETURN(
+      SequentialHloOrdering::HloModuleSequence module_sequence,
+      CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction()));
+
+  // Run buffer analysis on the HLO graph. This analysis figures out which
+  // temporary buffers are required to run the computation.
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<BufferAssignment> assignment,
+      BufferAssigner::Run(
+          module.get(),
+          xla::MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
+          BufferSizeBytesFunction(), memory_alignment));
+  // BufferAssignment::ToString() includes a header, so no need for us to
+  // print one ourselves.
+  XLA_VLOG_LINES(2, assignment->ToString());
+
+  if (!xla_dump_optimized_hlo_proto_to.empty()) {
+    HloProto proto = MakeHloProto(*module, *assignment);
+    TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
+        proto, xla_dump_optimized_hlo_proto_to, module->name()));
+  }
 
-    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         std::move(instruction_to_profile_idx),
-                         std::move(computation_to_profile_idx),
-                         jit->target_machine(), jit->external_constant_pool());
+  // Each computation is a single function.  Emit all embedded computations
+  // before the entry computation. The order of computations returned from
+  // GetEmbeddedComputations guarantees that a called computation occurs
+  // before a caller computation.
 
-    for (auto embedded_computation :
-         entry_computation->MakeEmbeddedComputationsList()) {
-      if (embedded_computation->IsFusionComputation()) {
-        continue;
-      }
-      TF_RETURN_IF_ERROR(
-          ir_emitter
-              .EmitComputation(embedded_computation,
-                               embedded_computation->name(),
-                               /*is_top_level_computation=*/false,
-                               &module_sequence.at(embedded_computation))
-              .status());
-    }
-    string function_name_prefix = entry_computation->name().empty()
-                                      ? "__compute"
-                                      : entry_computation->name();
-    TF_ASSIGN_OR_RETURN(
-        llvm::Function * entry_function,
-        ir_emitter.EmitComputation(entry_computation, function_name_prefix,
-                                   /*is_top_level_computation=*/true,
-                                   &module_sequence.at(entry_computation)));
+  IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
+                       std::move(instruction_to_profile_idx),
+                       std::move(computation_to_profile_idx),
+                       jit->target_machine(), jit->external_constant_pool());
 
-    string function_name = llvm_ir::AsString(entry_function->getName());
-    string ir_module_string;
-    if (embed_ir_in_executable) {
-      ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
+  for (auto embedded_computation :
+       entry_computation->MakeEmbeddedComputationsList()) {
+    if (embedded_computation->IsFusionComputation()) {
+      continue;
     }
-    TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
+    TF_RETURN_IF_ERROR(
+        ir_emitter
+            .EmitComputation(embedded_computation, embedded_computation->name(),
+                             /*is_top_level_computation=*/false,
+                             &module_sequence.at(embedded_computation))
+            .status());
+  }
+  string function_name_prefix = entry_computation->name().empty()
+                                    ? "__compute"
+                                    : entry_computation->name();
+  TF_ASSIGN_OR_RETURN(
+      llvm::Function * entry_function,
+      ir_emitter.EmitComputation(entry_computation, function_name_prefix,
+                                 /*is_top_level_computation=*/true,
+                                 &module_sequence.at(entry_computation)));
+
+  string function_name = llvm_ir::AsString(entry_function->getName());
+  string ir_module_string;
+  if (embed_ir_in_executable) {
+    ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
+  }
+  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
-    XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
+  XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
 
-    // JIT compile the LLVM IR module to in-memory machine code.
-    jit->AddModule(std::move(llvm_module));
-    cpu_executable.reset(new CpuExecutable(
-        std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
+  // JIT compile the LLVM IR module to in-memory machine code.
+  jit->AddModule(std::move(llvm_module));
+  cpu_executable.reset(new CpuExecutable(
+      std::move(jit), std::move(assignment), std::move(module), function_name,
+      std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
-    if (embed_ir_in_executable) {
-      static_cast<CpuExecutable&>(*cpu_executable)
-          .set_ir_module_string(ir_module_string);
-    }
+  if (embed_ir_in_executable) {
+    static_cast<CpuExecutable&>(*cpu_executable)
+        .set_ir_module_string(ir_module_string);
   }
 
   VLOG(1) << "Compilation finished";
@@ -836,11 +732,20 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
           proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
+    std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
+    std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data;
+
+    if (module->config().hlo_profiling_enabled()) {
+      TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts(
+          *module, &instruction_to_profile_idx, &computation_to_profile_idx,
+          &hlo_profile_index_map, &hlo_profile_printer_data));
+    }
+
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
-                         /*instruction_to_profile_idx=*/
-                         std::unordered_map<const HloInstruction*, int64>{},
-                         /*computation_to_profile_idx=*/
-                         std::unordered_map<const HloComputation*, int64>{},
+                         std::move(instruction_to_profile_idx),
+                         std::move(computation_to_profile_idx),
                          target_machine.get(),
                          /*external_constant_pool=*/nullptr);
     HloComputation* computation = module->entry_computation();
@@ -889,11 +794,10 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         module->config().debug_options().xla_enable_fast_math(),
         module->config().debug_options().xla_llvm_disable_expensive_passes(),
         pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook);
-    llvm::object::OwningBinary<llvm::object::ObjectFile> object_file =
+    std::unique_ptr<llvm::MemoryBuffer> object_file =
         compiler_functor(llvm_module);
-    llvm::StringRef object_file_data_ref = object_file.getBinary()->getData();
-    ObjectFileData object_file_data(object_file_data_ref.begin(),
-                                    object_file_data_ref.end());
+    ObjectFileData object_file_data(object_file->getBufferStart(),
+                                    object_file->getBufferEnd());
 
     BufferSizes buffer_sizes;
     for (const BufferAllocation& allocation : assignment->Allocations()) {
@@ -916,7 +820,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
 
     results.emplace_back(MakeUnique<CpuAotCompilationResult>(
         std::move(object_file_data), std::move(buffer_sizes),
-        result_slice.index()));
+        result_slice.index(), std::move(hlo_profile_printer_data)));
   }
 
   VLOG(1) << "Compilation finished";
@@ -935,9 +839,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
 }  // namespace xla
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
-    return xla::MakeUnique<xla::cpu::CpuCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      stream_executor::host::kHostPlatformId,
+      []() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 3498139ab95d21383c6dc008ae5614b7bfe91148..65b05f04fa8d9c72e7bfb6978f6a6384dfbcf976 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
                            RelocationModel relocation_model);
   ~CpuAotCompilationOptions() override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   // The triple used for compilation, similar to clang's -target flag.
   const string& triple() const { return triple_; }
@@ -76,10 +76,16 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
 
 class CpuAotCompilationResult : public AotCompilationResult {
  public:
-  CpuAotCompilationResult(ObjectFileData object_file_data,
-                          BufferSizes buffer_sizes, int64 result_buffer_index);
+  CpuAotCompilationResult(
+      ObjectFileData object_file_data, BufferSizes buffer_sizes,
+      int64 result_buffer_index,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data);
   ~CpuAotCompilationResult();
 
+  HloProfilePrinterData* hlo_profile_printer_data() const {
+    return hlo_profile_printer_data_.get();
+  }
+
   const ObjectFileData& object_file_data() const { return object_file_data_; }
   const BufferSizes& buffer_sizes() const { return buffer_sizes_; }
   int64 result_buffer_index() const { return result_buffer_index_; }
@@ -97,6 +103,10 @@ class CpuAotCompilationResult : public AotCompilationResult {
   // result of the computation.  This buffer should be passed into the output
   // parameter when calling the compiled computation.
   const int64 result_buffer_index_;
+
+  // Contains an instance of HloProfilePrinterData if HLO profiling is enabled,
+  // otherwise is nullptr.
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
 };
 
 // CPU-targeting implementation of the XLA Compiler interface.
@@ -112,25 +122,23 @@ class CpuCompiler : public LLVMCompiler {
   // Bring in
   // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
   //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //     std::vector<std::vector<se::StreamExecutor*>>
   //        stream_execs)
   using LLVMCompiler::Compile;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
                      const AotCompilationOptions& options) override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index c053703c3524a47ee1de9681c1b986edbf109430..aabf4d5161e3af9d49876c6133f8ec5ddfbbf6d6 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -45,8 +45,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/host/host_stream.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable(
 
 Status CpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
+    std::vector<se::DeviceMemoryBase>* buffers) {
   CHECK_EQ(buffers->size(), assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -245,19 +243,18 @@ static Status DeallocateTempBuffers(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
+StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        allocated_buffers,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
     std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
-  auto result_buffer = MakeUnique<ShapedBuffer>(
+  ScopedShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
-      stream->parent()->platform(), stream->parent()->device_ordinal());
+      run_options->allocator(), stream->parent()->device_ordinal());
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer which is returned to the caller.
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
         // The points to set is unambiguous so the set should be a
@@ -284,7 +281,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -303,7 +300,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ScopedShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   // Free all buffers not in the result.
@@ -313,7 +310,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   if (hlo_profiling_enabled()) {
@@ -322,7 +319,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
         "supported on CPU.");
   }
 
-  auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
+  auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
   se::Stream* stream = run_options->stream();
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
@@ -333,7 +330,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ScopedShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   LogLiveAddresses(buffers, buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 267b89a10b3c038dc2048f0ad5b5b343c88ef0f9..68ad38cba88720a04519fc2473fe6f9decbaaf93 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -55,12 +55,12 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
@@ -71,11 +71,6 @@ class CpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on CPU executable.
-    return Unimplemented("Equality test on CPU executable is not implemented.");
-  }
-
   static int64 ShapeSizeBytes(const Shape& shape);
 
   // Type of the computation function we expect in the JIT.
@@ -95,29 +90,27 @@ class CpuExecutable : public Executable {
   // assignment. Each vector element corresponds to a particular Index. If
   // a vector element already contains a non-null DeviceMemoryBase, then no
   // buffer is assigned for this element.
-  Status AllocateBuffers(
-      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
+  Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
+                         int device_ordinal,
+                         std::vector<se::DeviceMemoryBase>* buffers);
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
   Status ExecuteComputeFunction(
       const ExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
-  // Create a ShapedBuffer for holding the result of the computation. The
+  // Creates a ScopedShapedBuffer for holding the result of the computation. The
   // addresses (DeviceMemoryBases) are set according to buffer assignment.
   // 'buffers_in_result' should point to a vector of the same size as
   // 'allocated_buffers'. An element in buffers_in_result is set to true if the
   // corresponding buffer is live out of the computation (and thus contained in
   // the returned ShapedBuffer).
-  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
+  StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          allocated_buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
 
   // Returns the points-to set of the root instruction of the entry
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index 482e04052d5a914eab0e5bff2c7a83f3b698052f..b40d264c03aba6e9308e8a621ae86e180e33c335 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -30,11 +30,11 @@ bool CanBeLoopFused(const HloInstruction& hlo) {
   // These are the only ones we fuse since we rely on effective elemental IR
   // generation.
   return hlo.IsElementwise() ||  //
-         hlo.opcode() == HloOpcode::kBitcast ||
          hlo.opcode() == HloOpcode::kBroadcast ||
          hlo.opcode() == HloOpcode::kConcatenate ||
          hlo.opcode() == HloOpcode::kDynamicSlice ||
          hlo.opcode() == HloOpcode::kDynamicUpdateSlice ||
+         hlo.opcode() == HloOpcode::kGather ||
          hlo.opcode() == HloOpcode::kPad ||
          hlo.opcode() == HloOpcode::kReshape ||
          hlo.opcode() == HloOpcode::kReverse ||
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 595c3f55b321f47e2312b93e0c238c7637495d77..a98e85a151ffb77e6682b82164603481265283c4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -77,7 +78,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) {
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 }
 
-TEST_F(InstructionFusionTest, DotOperationFusion_Bitcast) {
+TEST_F(InstructionFusionTest, DotOperationNoFusion_Bitcast) {
   HloComputation::Builder builder(TestName());
   HloInstruction* arg0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {2, 512, 2, 128}), "arg0"));
@@ -94,8 +95,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Bitcast) {
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
-  EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Fusion());
+  EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
 }
 
 TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) {
@@ -244,35 +244,33 @@ class OpcodeFusionTest : public InstructionFusionTest {
   }
 };
 
-TEST_F(OpcodeFusionTest, Exponential_Bitcast_Negate) {
+TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {1, 4});
   Shape result_shape = ShapeUtil::MakeShape(F32, {4});
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "param"));
-  // InstructionFusion::ShouldFuse() precludes fusing a bitcast whose operand
-  // is a parameter, so create an operand between the parameter and bitcast.
   HloInstruction* exp1 = builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kExp, param0));
-  HloInstruction* bitcast2 = builder.AddInstruction(
-      HloInstruction::CreateUnary(result_shape, HloOpcode::kBitcast, exp1));
+  HloInstruction* reshape2 =
+      builder.AddInstruction(HloInstruction::CreateReshape(result_shape, exp1));
   builder.AddInstruction(
-      HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, bitcast2));
+      HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape2));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
-      module.get(), {HloOpcode::kNegate, HloOpcode::kBitcast, HloOpcode::kExp,
+      module.get(), {HloOpcode::kNegate, HloOpcode::kReshape, HloOpcode::kExp,
                      HloOpcode::kParameter});
 }
 
-TEST_F(OpcodeFusionTest, Broadcast_Bitcast_DynamicSlice_Tanh) {
+TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {8});
   Shape starts_shape = ShapeUtil::MakeShape(F32, {2});
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {1, 8, 8});
-  Shape bitcast_shape = ShapeUtil::MakeShape(F32, {8, 8});
+  Shape reshape_shape = ShapeUtil::MakeShape(F32, {8, 8});
   Shape dynamic_slice_shape = ShapeUtil::MakeShape(F32, {4, 4});
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "param"));
@@ -280,11 +278,11 @@ TEST_F(OpcodeFusionTest, Broadcast_Bitcast_DynamicSlice_Tanh) {
       HloInstruction::CreateParameter(1, starts_shape, "starts"));
   HloInstruction* broadcast2 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(broadcast_shape, param0, {1}));
-  HloInstruction* bitcast3 = builder.AddInstruction(HloInstruction::CreateUnary(
-      bitcast_shape, HloOpcode::kBitcast, broadcast2));
+  HloInstruction* reshape3 = builder.AddInstruction(
+      HloInstruction::CreateReshape(reshape_shape, broadcast2));
   HloInstruction* dynamic_slice4 =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-          dynamic_slice_shape, bitcast3, param1, {4, 4}));
+          dynamic_slice_shape, reshape3, param1, {4, 4}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4));
 
@@ -293,7 +291,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Bitcast_DynamicSlice_Tanh) {
 
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
-      {HloOpcode::kTanh, HloOpcode::kDynamicSlice, HloOpcode::kBitcast,
+      {HloOpcode::kTanh, HloOpcode::kDynamicSlice, HloOpcode::kReshape,
        HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter});
 }
 
@@ -700,6 +698,154 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
               Not(op::Fusion()));
 }
 
+struct GatherLoopFusionTestSpec {
+  string test_name;
+  string hlo_computation_text;
+
+  static string Name(
+      const ::testing::TestParamInfo<GatherLoopFusionTestSpec>& info) {
+    return info.param.test_name;
+  }
+};
+
+class GatherLoopFusionTest
+    : public OpcodeFusionTest,
+      public ::testing::WithParamInterface<GatherLoopFusionTestSpec> {};
+
+TEST_P(GatherLoopFusionTest, GatherLoopFusion) {
+  const GatherLoopFusionTestSpec& spec = GetParam();
+  string hlo_string = tensorflow::strings::StrCat(
+      "HloModule ", spec.test_name, "\n\n", spec.hlo_computation_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  RunFusionAndCheckOpcodesWereFused(
+      module.get(),
+      {HloOpcode::kGather, HloOpcode::kAdd, HloOpcode::kBroadcast,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter});
+}
+
+std::vector<GatherLoopFusionTestSpec> GetGatherLoopFusionTestSpecs() {
+  std::vector<GatherLoopFusionTestSpec> result;
+
+  result.push_back({"FusedTensorFlowGatherV2", R"(
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[3,2] broadcast(one), dimensions={}
+  ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  result.push_back({"FusedTensorFlowGatherMultipleBatchDims", R"(
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,3,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3, 1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,3,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  result.push_back({"FusedTensorFlowGatherNdMultipleBatchDims", R"(
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2,2] parameter(1)
+  gather = s32[2,2] gather(operand, indices),
+      output_window_dims={},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=2,
+      window_bounds={1, 1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  result.push_back({"FusedTensorFlowGatherNd_0", R"(
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1,2}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  result.push_back({"FusedTensorFlowGatherNd_1", R"(
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1,2}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  result.push_back({"FusedDynamicSlice", R"(
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  gather = s32[1,1] gather(operand, indices),
+      output_window_dims={0,1},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[1,1] broadcast(one), dimensions={}
+  ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  result.push_back({"FusedBatchDynamicSlice", R"(
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,1,1] gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,1,1] broadcast(one), dimensions={}
+  ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted)
+}
+)"});
+
+  return result;
+}
+
+INSTANTIATE_TEST_CASE_P(GatherLoopFusionTestInstantiation, GatherLoopFusionTest,
+                        ::testing::ValuesIn(GetGatherLoopFusionTestSpecs()),
+                        GatherLoopFusionTestSpec::Name);
 }  // namespace
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 09f028463af68bbc2841fecdb2ca6c6a42498798..f9c51f243c47b8069500eca3c9c2929b17f04e62 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 namespace {
 
-const char* const kXlaParallelCpuOption = "xla_cpu_parallel";
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
@@ -30,12 +29,6 @@ namespace xla {
 namespace cpu {
 namespace options {
 
-bool CpuParallelBackendRequested(const HloModuleConfig& config) {
-  const auto& extra_options_map =
-      config.debug_options().xla_backend_extra_options();
-  return extra_options_map.count(kXlaParallelCpuOption) > 0;
-}
-
 bool OptimizeForSizeRequested(const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 6ba0fd24538b63a3da81083482e6bee3b552dfea..be62ff3cc1af23408ca8a00f1372e7a998f160c6 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -24,7 +24,6 @@ namespace xla {
 namespace cpu {
 namespace options {
 
-bool CpuParallelBackendRequested(const HloModuleConfig& config);
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
 tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
deleted file mode 100644
index 662ee609232f5582ce74f4f515637b2623175e94..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
-
-#include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
-#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-
-namespace xla {
-namespace cpu {
-
-StatusOr<bool> ParallelizationPreparation::Run(HloModule* module) {
-  XLA_VLOG_LINES(2, "ParallelizationPreparation ENTRY");
-  XLA_VLOG_LINES(2, module->ToString());
-
-  bool changed = false;
-  TF_ASSIGN_OR_RETURN(changed, RunParallelTaskAssignment(module));
-
-  HloComputation* entry_computation = module->entry_computation();
-  std::unordered_set<HloInstruction*> outlined;
-  std::vector<HloInstruction*> instructions_to_outline;
-  for (HloInstruction* instruction :
-       entry_computation->MakeInstructionPostOrder()) {
-    // If the instruction has been outlined, it no longer exists and we must not
-    // dereference it.
-    if (outlined.count(instruction) > 0) {
-      continue;
-    }
-
-    // Skip parameters and constants, there is nothing to parallelize.
-    if (instruction->opcode() == HloOpcode::kParameter ||
-        instruction->opcode() == HloOpcode::kConstant) {
-      continue;
-    }
-
-    // Outline 'instruction' in isolation if it was assigned parallel tasks.
-    if (OutlineParallelizableInstruction(instruction)) {
-      outlined.insert(instruction);
-      changed = true;
-      continue;
-    }
-
-    instructions_to_outline.clear();
-    HloInstruction* outline_candidate = instruction;
-    instructions_to_outline.push_back(outline_candidate);
-
-    // Outline sole users with the current instruction.
-    while (CanOutlineWithUser(outline_candidate)) {
-      HloInstruction* prior_candidate = outline_candidate;
-      outline_candidate = *outline_candidate->users().begin();
-      if (std::any_of(outline_candidate->operands().begin(),
-                      outline_candidate->operands().end(),
-                      [&](const HloInstruction* operand) {
-                        // Do not consider any candidates which have operands
-                        // other than the prior candidate, constants or
-                        // parameters. Otherwise, we'd increase the fan-in which
-                        // would reduce parallelism.
-                        return operand->opcode() != HloOpcode::kParameter &&
-                               operand->opcode() != HloOpcode::kConstant &&
-                               operand != prior_candidate;
-                      })) {
-        break;
-      }
-      instructions_to_outline.push_back(outline_candidate);
-    }
-
-    outlined.insert(instructions_to_outline.begin(),
-                    instructions_to_outline.end());
-
-    // Optimization to avoid replacing a single existing kCall with another
-    // kCall that just calls the first one.
-    if (instructions_to_outline.size() == 1 &&
-        instructions_to_outline[0]->opcode() == HloOpcode::kCall) {
-      continue;
-    }
-
-    module->OutlineExpressionFromComputation(
-        instructions_to_outline,
-        tensorflow::strings::StrCat("pp_", instruction->name()),
-        entry_computation);
-    changed = true;
-  }
-
-  XLA_VLOG_LINES(2, "ParallelizationPreparation EXIT");
-  XLA_VLOG_LINES(2, module->ToString());
-  return changed;
-}
-
-StatusOr<bool> ParallelizationPreparation::RunParallelTaskAssignment(
-    HloModule* module) {
-  VLOG(1) << "RunParallelTaskAssignment max_parallelism_: " << max_parallelism_;
-  bool changed = false;
-  // Initialize ParallelTaskAssignment.
-  ParallelTaskAssignment parallel_task_assignment(max_parallelism_, shape_size_,
-                                                  module);
-  // Assign parallel tasks to HLOs in entry computation.
-  HloComputation* computation = module->entry_computation();
-  for (auto* instruction : computation->instructions()) {
-    // Calculate target parallel task count in [1, max_parallelism_].
-    const int64 target_parallel_task_count =
-        parallel_task_assignment.GetTargetParallelTaskCount(instruction);
-    if (target_parallel_task_count == 1) {
-      continue;
-    }
-
-    // Assign feasible dimension partitions (based on actual dimension sizes).
-    auto dim_partition_counts = ShapePartitionAssigner(instruction->shape())
-                                    .Run(target_parallel_task_count);
-    const int64 total_partition_count =
-        ShapePartitionAssigner::GetTotalPartitionCount(dim_partition_counts);
-    if (total_partition_count <= 1) {
-      // Feasible partition calculation resulting in no partitioning, so skip.
-      continue;
-    }
-    VLOG(2) << "Assigning parallel task count: " << total_partition_count
-            << " to instruction: " << instruction->name();
-    // Map 'instruction' to assigned dimension partitioning.
-    instruction->set_outer_dimension_partitions(dim_partition_counts);
-  }
-
-  return changed;
-}
-
-bool ParallelizationPreparation::OutlineParallelizableInstruction(
-    HloInstruction* instruction) {
-  if (instruction->outer_dimension_partitions().empty()) {
-    return false;
-  }
-  // Store dimension partition counts before outlining (which clones
-  // 'instruction').
-  std::vector<int64> dim_partition_counts =
-      instruction->outer_dimension_partitions();
-  // Outline 'instruction' in its own sub-computation.
-  HloModule* module = instruction->parent()->parent();
-  auto* call = module->OutlineExpressionFromComputation(
-      {instruction}, tensorflow::strings::StrCat("pp_", instruction->name()),
-      module->entry_computation());
-  // Map previously assigned 'dim_partition_counts' to cloned root instruction.
-  VLOG(1) << "Outlining parallelizable"
-          << " caller: " << call->name()
-          << " callee: " << call->to_apply()->root_instruction()->name();
-  call->to_apply()->root_instruction()->set_outer_dimension_partitions(
-      dim_partition_counts);
-  return true;
-}
-
-bool ParallelizationPreparation::CanOutlineWithUser(
-    HloInstruction* instruction) {
-  if (instruction->users().size() != 1) {
-    // Do not outline 'instruction' with multiple users.
-    return false;
-  }
-  if (AssignedParallelTasks(instruction) ||
-      AssignedParallelTasks(*instruction->users().begin())) {
-    // Do not outline if 'instruction' (or user) were assigned parallel tasks.
-    return false;
-  }
-  return true;
-}
-
-bool ParallelizationPreparation::AssignedParallelTasks(
-    HloInstruction* instruction) {
-  return !instruction->outer_dimension_partitions().empty() ||
-         (instruction->opcode() == HloOpcode::kCall &&
-          !instruction->to_apply()
-               ->root_instruction()
-               ->outer_dimension_partitions()
-               .empty());
-}
-
-}  // namespace cpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
deleted file mode 100644
index 87be758ef5d0535fdce3a65e54ce225042019cdb..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
-
-#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-
-namespace xla {
-namespace cpu {
-
-// This pass prepares an HLO module for parallel execution by transforming
-// subgraphs of the top-level computation into embedded computations which can
-// be executed in parallel.
-// TODO(b/29630486): Currently, it is limited to turning all instructions (which
-// are not constants or parameters) in the entry computation into embedded
-// computations.  However, it could make sense to coarsen the parallelization to
-// improve cache locality.  Also, we will need to do something to intelligently
-// handle While constructs.
-class ParallelizationPreparation : public HloPassInterface {
- public:
-  // 'max_parallelism': the maximum parallel task count per instruction.
-  // 'shape_size': shape size function used by HloCostAnalysis during parallel
-  //               task assignment.
-  ParallelizationPreparation(
-      const int64 max_parallelism,
-      const HloCostAnalysis::ShapeSizeFunction& shape_size)
-      : max_parallelism_(max_parallelism), shape_size_(shape_size) {}
-  ~ParallelizationPreparation() override {}
-
-  tensorflow::StringPiece name() const override {
-    return "cpu-parallel-prepare";
-  }
-
-  // Run parallel preparation on the given computation. Returns whether the
-  // computation was changed.
-  StatusOr<bool> Run(HloModule* module) override;
-
- private:
-  // Assigns parallel task partitions to conformant instructions in 'module'.
-  // Returns true on success or error status otherwise.
-  StatusOr<bool> RunParallelTaskAssignment(HloModule* module);
-
-  // Outlines 'instruction' from entry computation, if it had
-  // been assigned parallel tasks in an earlier pass through the computation.
-  // Returns true if 'instruction' was successfully outlined, false otherwise.
-  bool OutlineParallelizableInstruction(HloInstruction* instruction);
-
-  // Returns true if 'instruction' can be outlined into the same sub-computation
-  // with its single user (parallelizable instructions are not outlined with
-  // each other). Returns false otherwise.
-  bool CanOutlineWithUser(HloInstruction* instruction);
-
-  // Returns true if 'instruction' (or the root of the sub-computation that
-  // 'instruction' calls) has had parallel tasks assigned in earlier pass.
-  // Returns false otherwise.
-  bool AssignedParallelTasks(HloInstruction* instruction);
-
-  const int64 max_parallelism_;
-  const HloCostAnalysis::ShapeSizeFunction shape_size_;
-};
-
-}  // namespace cpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 40ace963270e8cead47cc731cc326351178dff7d..215405f6802cf1956ebec011da2fcd11b95c0c64 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -31,15 +31,28 @@ XfeedManager* GetXfeedManager() {
   return manager;
 }
 
+extern const char* const kEigenMatMulF16SymbolName =
+    "__xla_cpu_runtime_EigenMatMulF16";
 extern const char* const kEigenMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenMatMulF32";
 extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
+extern const char* const kMKLConvF32SymbolName = "__xla_cpu_runtime_MKLConvF32";
+extern const char* const kMKLMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF32";
+extern const char* const kMKLMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF64";
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF32";
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF64";
 extern const char* const kEigenConvF16SymbolName =
     "__xla_cpu_runtime_EigenConvF16";
 extern const char* const kEigenConvF32SymbolName =
     "__xla_cpu_runtime_EigenConvF32";
 extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft";
+extern const char* const kEigenSingleThreadedMatMulF16SymbolName =
+    "__xla_cpu_runtime_EigenSingleThreadedMatMulF16";
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenSingleThreadedMatMulF32";
 extern const char* const kEigenSingleThreadedMatMulF64SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index 2141dfe1cedd6f9674acc348152574b4fd30895b..1dce6efa5cd65e67ae73a2e2affe2d2d3c537508 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -41,11 +41,18 @@ namespace runtime {
 //    the actual symbol.
 // 2. When using ahead-of-time compilation, the linker can resolve the name
 //    because it is a symbol in the cpu_runtime library.
+extern const char* const kEigenMatMulF16SymbolName;
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
+extern const char* const kMKLConvF32SymbolName;
+extern const char* const kMKLMatMulF32SymbolName;
+extern const char* const kMKLMatMulF64SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
+extern const char* const kEigenSingleThreadedMatMulF16SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenSingleThreadedConvF16SymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f385829cdf5cafbd35e083f47106734cdd5dde88..2ac950e6d93ade315808f2ca1d0bdd7bc85f53b9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
@@ -130,25 +131,23 @@ MatMulShape MatMulShapes[] = {
 // * transpose_lhs
 // * transpose_rhs
 // * single_threaded
-using EigenMatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
+using MatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
 
-class EigenMatMulTest
-    : public CpuRuntimeTest,
-      public ::testing::WithParamInterface<EigenMatMulTestParam> {
+class EigenMatMulTest : public CpuRuntimeTest,
+                        public ::testing::WithParamInterface<MatMulTestParam> {
  public:
-  static string Name(
-      const ::testing::TestParamInfo<EigenMatMulTestParam>& info) {
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
     MatMulShape shape = std::get<0>(info.param);
     bool transpose_lhs = std::get<1>(info.param);
     bool transpose_rhs = std::get<2>(info.param);
     bool single_threaded = std::get<3>(info.param);
 
     return tensorflow::strings::Printf(
-        "MatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
         transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
         single_threaded ? "single" : "multi");
   }
-};  // namespace xla
+};
 
 TEST_P(EigenMatMulTest, DoIt) {
   MatMulShape shape = std::get<0>(GetParam());
@@ -169,5 +168,74 @@ INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
                                            ::testing::Bool()),
                         EigenMatMulTest::Name);
 
+#ifdef INTEL_MKL
+class MKLMatMulTest : public CpuRuntimeTest,
+                      public ::testing::WithParamInterface<MatMulTestParam> {
+ public:
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
+    MatMulShape shape = std::get<0>(info.param);
+    bool transpose_lhs = std::get<1>(info.param);
+    bool transpose_rhs = std::get<2>(info.param);
+    bool single_threaded = std::get<3>(info.param);
+
+    return tensorflow::strings::Printf(
+        "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
+        single_threaded ? "single" : "multi");
+  }
+};
+
+std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
+                                                  const Array2D<float>& b,
+                                                  bool transpose_lhs,
+                                                  bool transpose_rhs,
+                                                  bool single_threaded) {
+  CHECK_EQ(a.width(), b.height());
+  int64 m = a.height();
+  int64 n = b.width();
+  int64 k = a.width();
+
+  // The MKL matmul runtime function expects the matrix to be in column major
+  // order and array2d is in row-major order. Create transposes of a and b. The
+  // 'data' buffer in the transposed array is the original array in column major
+  // order.
+  auto a_transpose = MaybeTransposeArray2D(a, !transpose_lhs);
+  auto b_transpose = MaybeTransposeArray2D(b, !transpose_rhs);
+
+  // Since we're going to transpose c before returning it, swap the order of the
+  // dimension sizes to ensure the returned array is properly dimensioned.
+  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  if (single_threaded) {
+    __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+        nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
+        m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    __xla_cpu_runtime_MKLMatMulF32(nullptr, c_transpose->data(),
+                                   a_transpose->data(), b_transpose->data(), m,
+                                   n, k, transpose_lhs, transpose_rhs);
+  }
+  return MaybeTransposeArray2D(*c_transpose, true);
+}
+
+TEST_P(MKLMatMulTest, DoIt) {
+  MatMulShape shape = std::get<0>(GetParam());
+  bool transpose_lhs = std::get<1>(GetParam());
+  bool transpose_rhs = std::get<2>(GetParam());
+  bool single_threaded = std::get<3>(GetParam());
+
+  auto a = MakeLinspaceArray2D(0.0, 1.0, shape.m, shape.k);
+  auto b = MakeLinspaceArray2D(-2.0, 2.0, shape.k, shape.n);
+  auto c =
+      MKLMatrixMultiply(*a, *b, transpose_lhs, transpose_rhs, single_threaded);
+  CheckMatrixMultiply(*a, *b, *c);
+}
+
+INSTANTIATE_TEST_CASE_P(MKLMatMulTestInstantiaion, MKLMatMulTest,
+                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        MKLMatMulTest::Name);
+#endif  // INTEL_MKL
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index f5e61aef534da57ce13d3ee9bbeaeaec31f53d2e..9b39e7f5765ae5eb6a25c06eef4d74b1c00e5c91 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -34,8 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 namespace {
@@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
 }
 
 StatusOr<Shape> CpuTransferManager::TransferTupleBuffersFromOutfeed(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data) {
   return TransferBuffersFromOutfeedInternal(executor, buffer_data,
                                             /*is_tuple=*/true);
 }
 
 StatusOr<Shape> CpuTransferManager::TransferArrayBufferFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, void* destination,
-    int64 size_bytes) {
+    se::StreamExecutor* executor, void* destination, int64 size_bytes) {
   return TransferBuffersFromOutfeedInternal(
       executor, {{destination, size_bytes}}, /*is_tuple=*/false);
 }
 
 StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
     bool is_tuple) {
   std::vector<std::unique_ptr<CpuOutfeedBuffer>> buffers;
@@ -306,8 +303,8 @@ static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
 }
 
 static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId,
-                                                &CreateCpuTransferManager);
+  xla::TransferManager::RegisterTransferManager(
+      stream_executor::host::kHostPlatformId, &CreateCpuTransferManager);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
index 6c7524d94716464218ba18ad9950f702d2759f89..3ecb0d236498371f48caf63249f9cd4e8777752b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
@@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager {
   CpuTransferManager();
   ~CpuTransferManager() override {}
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
-  Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
 
  private:
   // Transfers infeed data to device. InfeedBuffer->Done() must be
   // called to clean up the memory allocated for InfeedBuffer.
   StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source);
+      se::StreamExecutor* executor, int64 size, const void* source);
 
   // Helper that transfers a tuple of element buffers from the device's outfeed.
   StatusOr<Shape> TransferTupleBuffersFromOutfeed(
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data);
 
   // Helper that transfers an array buffer from the device's outfeed.
-  StatusOr<Shape> TransferArrayBufferFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, void* destination,
-      int64 size_bytes);
+  StatusOr<Shape> TransferArrayBufferFromOutfeed(se::StreamExecutor* executor,
+                                                 void* destination,
+                                                 int64 size_bytes);
 
   // On success, returns the shape that was transferred from the outfeed -- if
   // is_tuple is true, the returned shape will be a tuple of the returned shapes
   // for the given buffers.
   StatusOr<Shape> TransferBuffersFromOutfeedInternal(
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
       bool is_tuple);
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index cfe7c9c3af0be109ac8a86753e880e2bcbceba41..495fecc4aa8b3cf8fcb3ab63d82d8146546854da 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -715,6 +715,11 @@ tensorflow::Status DotOpEmitter::Emit() {
   // which performs the sum-of-products (the reduction loop) before storing
   // the result in the output buffer.
 
+  // This routine assumes that the dot operation is not in a parallelized
+  // enclosing computation.
+  CHECK(
+      dot_.parent()->root_instruction()->outer_dimension_partitions().empty());
+
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
 
@@ -913,22 +918,35 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded_eigen =
+  bool multi_threaded =
       hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
+    case F16:
+      fn_name = multi_threaded
+                    ? runtime::kEigenMatMulF16SymbolName
+                    : runtime::kEigenSingleThreadedMatMulF16SymbolName;
+      float_type = ir_builder_->getHalfTy();
+      break;
     case F32:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF32SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF32SymbolName
+                                   : runtime::kEigenMatMulF32SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF32SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF32SymbolName);
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF64SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF64SymbolName
+                                   : runtime::kEigenMatMulF64SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF64SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF64SymbolName);
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
@@ -1051,7 +1069,9 @@ static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   // The inputs and the output must
   // 1) be matrices with no padding, and
   // 2) have an allowed element type.
-  return output_shape.element_type() == F32 &&
+  PrimitiveType output_primitive_type = output_shape.element_type();
+  return (output_primitive_type == F64 || output_primitive_type == F32 ||
+          output_primitive_type == F16) &&
          IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
          IsRank2WithNoPadding(output_shape);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index 99c5e16db70c6a203b4751c1ed8a106c0dc260e6..e97113dfa0f59e791d614c0093d0781e49c48ee4 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -115,7 +115,7 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
       for (int i = 0; i < hlo->operand_count(); i++) {
         TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
                             operand_to_generator.at(hlo->operand(i))(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
+                                ElementwiseSourceIndex(index, *hlo, i)));
         operands.push_back(operand_value);
       }
       return ir_emitter_->EmitScalarCall(hlo->shape().element_type(),
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 788217aab6172b4e548452b3f6ffd4197c163ce4..f209a69e3cd0f8d336d61bafd1e22be8bc88ca3f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -34,14 +34,16 @@ bool PotentiallyImplementedAsEigenConvolution(
   //
   // To be sufficient, certain layout constraints need to be satisfied as well.
   const Shape& input_shape = convolution.operand(0)->shape();
-  const Shape& kernel_shape = convolution.operand(0)->shape();
+  const Shape& kernel_shape = convolution.operand(1)->shape();
   if (ShapeUtil::HasZeroElements(input_shape) ||
       ShapeUtil::HasZeroElements(kernel_shape)) {
     return false;
   }
+  // Make sure input and kernel has the same data type.
+  CHECK(
+      ShapeUtil::SameElementTypeIgnoringFpPrecision(input_shape, kernel_shape));
   // TODO(b/65408531): Explore using Eigen dot for complex64 type.
-  if (ShapeUtil::ElementIsComplex(input_shape) ||
-      ShapeUtil::ElementIsComplex(kernel_shape)) {
+  if (ShapeUtil::ElementIsComplex(input_shape)) {
     return false;
   }
   if (window_util::HasWindowReversal(convolution.window())) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..215f48c4cc1a1a6b13d98dff76e0d1f0f773f5c1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+
+TEST(IrEmitterTest, ConvWithZeroSizedKernelNotImplementedAsEigen) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithConv
+
+ENTRY Conv {
+  input = f32[32,50,28,28]{3,2,1,0} parameter(0)
+  kernel = f32[0,32,5,5]{3,2,1,0} parameter(1)
+  ROOT convolution = f32[64,50,24,24]{3,2,1,0} convolution(input, kernel),
+    window={size=5x5},
+    dim_labels=b01f_01io->b01f
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  HloComputation* entry_computation = module->entry_computation();
+
+  HloInstruction* conv_instr = entry_computation->root_instruction();
+  EXPECT_FALSE(cpu::PotentiallyImplementedAsEigenConvolution(*conv_instr));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 4dffaee87f6b33933b58c8c58478eec918569197..d582b5aaae93799b0fc0e57873c85ec5af9e8d08 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -93,8 +93,6 @@ IrEmitter::IrEmitter(
       computation_to_profile_idx_(std::move(computation_to_profile_idx)),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
-      parallel_cpu_backend_(
-          options::CpuParallelBackendRequested(hlo_module_config_)),
       is_top_level_computation_(false),
       target_machine_features_(target_machine),
       external_constant_pool_(external_constant_pool) {
@@ -438,12 +436,14 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
 
   if (kind == XfeedKind::kInfeed) {
     // Copy to the program buffer address from the acquired buffer.
-    ir_builder_.CreateMemCpy(program_buffer_address, acquired_pointer,
-                             length_32, 1);
+    ir_builder_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1,
+                             acquired_pointer,
+                             /*SrcAlign=*/1, length_32);
   } else {
     // Outfeed -- copy from the in-program address to the acquired buffer.
-    ir_builder_.CreateMemCpy(acquired_pointer, program_buffer_address,
-                             length_32, 1);
+    ir_builder_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1,
+                             program_buffer_address,
+                             /*SrcAlign=*/1, length_32);
   }
 
   ir_builder_.CreateCall(release_func,
@@ -854,6 +854,8 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   const ConvolutionDimensionNumbers& dnums =
       convolution->convolution_dimension_numbers();
 
+  // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support
+  // different data layouts.
   if (PotentiallyImplementedAsEigenConvolution(*convolution)) {
     const Shape& lhs_shape = lhs->shape();
     const Shape& rhs_shape = rhs->shape();
@@ -942,16 +944,26 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
            int64_type,    int64_type,  int64_type,  int64_type,  int64_type,
            int64_type,    int64_type,  int64_type,  int64_type},
           /*isVarArg=*/false);
-      bool multi_threaded_eigen =
+      bool multi_threaded =
           hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+      bool use_mkl_dnn =
+          hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
+
+      // TODO(b/78639006) Singlethread MKL conv2d is not implemented due to the
+      // potential race condition by setting the omp_num_threads.
       const char* fn_name =
           primitive_type == F16
-              ? (multi_threaded_eigen
+              ? (multi_threaded
                      ? runtime::kEigenConvF16SymbolName
                      : runtime::kEigenSingleThreadedConvF16SymbolName)
-              : (multi_threaded_eigen
-                     ? runtime::kEigenConvF32SymbolName
+              : (multi_threaded
+                     ? (use_mkl_dnn ? runtime::kMKLConvF32SymbolName
+                                    : runtime::kEigenConvF32SymbolName)
                      : runtime::kEigenSingleThreadedConvF32SymbolName);
+      if (!multi_threaded && use_mkl_dnn) {
+        LOG(WARNING) << "Using Eigen instead of MKL-DNN for single-threaded "
+                        "conv2d function.";
+      }
       llvm::Function* conv_func = llvm::cast<llvm::Function>(
           module_->getOrInsertFunction(fn_name, conv_type));
       conv_func->setCallingConv(llvm::CallingConv::C);
@@ -2074,7 +2086,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
 
     TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
         /*instruction=*/*root, /*operands=*/{lhs, rhs},
-        /*supported_types=*/{F32}));
+        /*supported_types=*/{F16, F32, F64}));
 
     llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
     llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
@@ -2161,8 +2173,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(call));
 
-  if (!computation->root_instruction()->outer_dimension_partitions().empty() &&
-      !parallel_cpu_backend_) {
+  if (!computation->root_instruction()->outer_dimension_partitions().empty()) {
     // ParallelTaskAssignment assigned partitions, emit call to
     // ParallelForkJoin.
     std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
@@ -2441,7 +2452,8 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
     target_array.AnnotateLoadStoreInstructionWithMetadata(store_instruction);
   } else {
     auto* memcpy_instruction = ir_builder_.CreateMemCpy(
-        target, source, element_count * primitive_type_size, element_alignment);
+        target, /*DstAlign=*/element_alignment, source,
+        /*SrcAlign=*/element_alignment, element_count * primitive_type_size);
 
     // The memcpy does the load and the store internally.  The aliasing related
     // metadata has to reflect that.
@@ -2547,22 +2559,6 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
     }
   };
 
-  // For the parallel cpu backend, we record the total for each embedded
-  // computation callee with its caller kCall HLO.
-  if (parallel_cpu_backend_ && is_top_level_computation_) {
-    auto* computation = root->parent();
-    auto* entry_computation = computation->parent()->entry_computation();
-    if (computation != entry_computation) {
-      for (HloInstruction* instruction : entry_computation->instructions()) {
-        if (instruction->opcode() == HloOpcode::kCall &&
-            instruction->to_apply()->root_instruction() == root) {
-          record_complete_computation(GetProfileCounterFor(*instruction));
-          return Status::OK();
-        }
-      }
-    }
-  }
-
   // For the entry computation this increment is cumulative of embedded
   // computations since it includes cycles spent in computations invoked by
   // While, Call etc.
@@ -2905,7 +2901,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source,
   llvm::Value* destination_value = GetEmittedValueFor(&destination);
   int64 source_size = ByteSizeOf(source.shape());
   // TODO(b/63762267): Be more aggressive about specifying alignment.
-  ir_builder_.CreateMemCpy(destination_value, source_value, source_size, 1);
+  ir_builder_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value,
+                           /*SrcAlign=*/1, source_size);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 509440251497cd7337284c39dae05c5f6c28e7c2..0f2f3d1817d6e891211bed843cd05c414771f151 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -532,8 +532,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
 
   const HloModuleConfig& hlo_module_config_;
 
-  const bool parallel_cpu_backend_;
-
   bool is_top_level_computation_;
 
   TargetMachineFeatures target_machine_features_;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
deleted file mode 100644
index 07a9f0efcb64db4b2ff0c6518d4b48eee9a505e0..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ /dev/null
@@ -1,531 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
-
-#include <stdint.h>
-#include <algorithm>
-#include <deque>
-#include <iterator>
-#include <list>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/shaped_buffer.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace se = ::perftools::gputools;
-
-namespace xla {
-namespace cpu {
-
-ParallelCpuExecutable::ParallelCpuExecutable(
-    std::unique_ptr<SimpleOrcJIT> jit,
-    std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<const HloModule> hlo_module,
-    std::unique_ptr<const HloInstructionMap<string>> function_names,
-    std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
-        aligned_constants,
-    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
-                 std::move(hlo_profile_index_map)),
-      jit_(std::move(jit)),
-      assignment_(std::move(assignment)),
-      function_names_(std::move(function_names)),
-      aligned_constants_(std::move(aligned_constants)) {}
-
-// Type of the computation function we expect in the JIT.
-using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
-                                     int64*, int64*);
-
-// Given a pointer to an output buffer (following the CPU JIT calling
-// conventions), mark addresses that are "live". The initial pointer itself is
-// trivially live. If the shape of the buffer is a tuple, this analysis looks
-// into the tuple's elements and marks them live as well (since tuples keep
-// pointers to buffers) and also works recursively.
-// address is an in-memory buffer address that contains some runtime XLA object.
-// shape is its shape. marked_addresses is the set of live addresses to
-// populate.
-static void MarkLiveAddressesInOutput(
-    const void* address, const Shape& shape,
-    std::unordered_set<const void*>* marked_addresses) {
-  marked_addresses->insert(address);
-  const uintptr_t* address_buffer = static_cast<const uintptr_t*>(address);
-  if (ShapeUtil::IsTuple(shape)) {
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      const uintptr_t* element_address = address_buffer + i;
-      const void* element = reinterpret_cast<const void*>(*element_address);
-      MarkLiveAddressesInOutput(
-          element, ShapeUtil::GetTupleElementShape(shape, i), marked_addresses);
-    }
-  }
-}
-
-namespace {
-
-// Executor manages the concurrent execution of 'functions' for instructions
-// in 'pending' on 'thread_pool' (storing resulting data in 'results').
-class Executor {
- public:
-  Executor(const HloInstructionMap<ComputeFunctionType>& functions,
-           const ServiceExecutableRunOptions* run_options,
-           std::list<HloInstruction*>* pending,
-           HloInstructionMap<const void*>* results, void** temps_array,
-           int64* profile_counters_array, const BufferAssignment* assignment)
-      : functions_(functions),
-        run_options_(run_options),
-        pending_(pending),
-        results_(results),
-        temps_array_(temps_array),
-        profile_counters_array_(profile_counters_array),
-        thread_pool_(CHECK_NOTNULL(run_options_->xla_intra_op_thread_pool())),
-        assignment_(assignment) {}
-
-  // Executes pending list of instructions on thread pool.
-  // Returns OK status on success, error status otherwise.
-  Status Run();
-
- private:
-  // Schedules a parallel invocation of compute function for 'instruction' on
-  // 'thread_pool_', storing result in 'result_buffer'.
-  // If 'partition_buffers' is non-null, parallel task will be invoked on
-  // per-dimension partition [start, limit) values stored in
-  // 'partition_buffers'.
-  void Schedule(HloInstruction* instruction, int64* partition_buffers,
-                void* result_buffer);
-
-  // Returns true if 'instruction' has been assigned parallel tasks (returns
-  // false otherwise).
-  bool HasParallelTasks(HloInstruction* instruction);
-
-  // Returns in 'partition_buffers' the partition [size, limit) for each
-  // dimension.
-  int64* GetPartitionBuffers(
-      const std::vector<std::pair<int64, int64>>& partition);
-
-  // Returns array of result buffers for all operands in 'instruction'.
-  const void** GetOperandBuffers(HloInstruction* instruction);
-
-  // Arguments passed into Executor.
-  const HloInstructionMap<ComputeFunctionType>& functions_;
-  const ServiceExecutableRunOptions* run_options_;
-  std::list<HloInstruction*>* pending_;
-  HloInstructionMap<const void*>* results_;
-  void** temps_array_;
-  int64* profile_counters_array_;
-  tensorflow::thread::ThreadPool* thread_pool_;
-  const BufferAssignment* assignment_;
-
-  // Members used to manage instruction execution.
-  tensorflow::mutex completion_queue_lock_;
-  tensorflow::condition_variable completion_queue_cv_;
-  std::deque<HloInstruction*> completion_queue_;
-  int64 instructions_in_flight_ = 0;
-  std::unordered_map<const HloInstruction*, int64> tasks_in_flight_;
-};
-
-Status Executor::Run() {
-  while (!pending_->empty() || instructions_in_flight_ > 0) {
-    auto pending_it = pending_->begin();
-    while (pending_it != pending_->end()) {
-      HloInstruction* instruction = *pending_it;
-      // Skip pending instructions whose operands aren't ready.
-      if (std::any_of(instruction->operands().begin(),
-                      instruction->operands().end(),
-                      [&](HloInstruction* operand) {
-                        return !ContainsKey(*results_, operand);
-                      })) {
-        ++pending_it;
-        continue;
-      }
-
-      // Get 'result_buffer' reference to result buffer for 'instruction'.
-      TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                          assignment_->GetUniqueTopLevelSlice(instruction));
-      void* result_buffer =
-          static_cast<char*>(temps_array_[result_slice.index()]) +
-          result_slice.offset();
-
-      if (HasParallelTasks(instruction)) {
-        // 'instruction' has been assigned parallel task partitions.
-        CHECK_EQ(HloOpcode::kCall, instruction->opcode());
-        HloInstruction* root = instruction->to_apply()->root_instruction();
-
-        // Create ShapePartitionIterator to iterate through all outer dimension
-        // partitions of 'instruction'.
-        ShapePartitionIterator partition_iterator(
-            root->shape(), root->outer_dimension_partitions());
-
-        const int64 partition_count =
-            partition_iterator.GetTotalPartitionCount();
-
-        // Record total parallel task count for 'instruction' before dispatch.
-        {
-          tensorflow::mutex_lock l(completion_queue_lock_);
-          tasks_in_flight_.insert(std::make_pair(instruction, partition_count));
-          VLOG(2) << "Schedule PARALLEL"
-                  << " instruction: " << instruction->name()
-                  << " instruction.callee: "
-                  << instruction->to_apply()->root_instruction()->name()
-                  << " partition_count: " << partition_count;
-        }
-
-        for (int64 i = 0; i < partition_count; ++i) {
-          // Get partition [start, limit) for each dimension.
-          auto partition_buffers =
-              GetPartitionBuffers(partition_iterator.GetPartition(i));
-          Schedule(instruction, partition_buffers, result_buffer);
-        }
-
-      } else {
-        // Set tasks in-flight to '1' for sequential instruction execution.
-        {
-          tensorflow::mutex_lock l(completion_queue_lock_);
-          tasks_in_flight_.insert(std::make_pair(instruction, 1));
-          VLOG(2) << "Schedule SEQUENTIAL"
-                  << " instruction: " << instruction->name()
-                  << " instruction.callee: "
-                  << instruction->to_apply()->root_instruction()->name();
-        }
-        Schedule(instruction, nullptr, result_buffer);
-      }
-
-      ++instructions_in_flight_;
-      pending_it = pending_->erase(pending_it);
-    }
-    // Wait for a completed HLO instruction to be present in the queue.  We will
-    // pop it out of the queue and make the result available to its users.
-    HloInstruction* instruction;
-    do {
-      tensorflow::mutex_lock l(completion_queue_lock_);
-      if (completion_queue_.empty()) {
-        completion_queue_cv_.wait(l);
-      }
-      if (!completion_queue_.empty()) {
-        instruction = completion_queue_.front();
-        completion_queue_.pop_front();
-        break;
-      }
-    } while (true);
-    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                        assignment_->GetUniqueTopLevelSlice(instruction));
-    void* result_buffer =
-        static_cast<char*>(temps_array_[result_slice.index()]) +
-        result_slice.offset();
-    InsertOrDie(results_, instruction, result_buffer);
-    --instructions_in_flight_;
-  }
-  return Status::OK();
-}
-
-void Executor::Schedule(HloInstruction* instruction, int64* partition_buffers,
-                        void* result_buffer) {
-  // The thread pool entry takes ownership of |operand_buffers|.
-  auto operand_buffers = GetOperandBuffers(instruction);
-
-  auto function = FindOrDie(functions_, instruction);
-  const auto* exec_run_options = &run_options_->run_options();
-  thread_pool_->Schedule([this, instruction, result_buffer, operand_buffers,
-                          partition_buffers, exec_run_options, function]() {
-    function(result_buffer, exec_run_options, operand_buffers, temps_array_,
-             partition_buffers, profile_counters_array_);
-
-    delete[] operand_buffers;
-    delete[] partition_buffers;
-    // Push the completed HLO instruction on the queue, the main
-    // thread will pop it off and potentially launch more work which
-    // uses the result.
-    // TODO(b/27458679) Consider alternative task scheduling and synchronization
-    // schemes. For example, we could avoid the overhead associate with the
-    // condvar here if the thread just dequed the next instruction to execute
-    // on completion.
-    {
-      tensorflow::mutex_lock l(completion_queue_lock_);
-      // Decrement in-flight task count for this completion.
-      if (--FindOrDie(tasks_in_flight_, instruction) == 0) {
-        completion_queue_.push_back(instruction);
-        completion_queue_cv_.notify_all();
-        tasks_in_flight_.erase(instruction);
-      }
-    }
-  });
-}
-
-int64* Executor::GetPartitionBuffers(
-    const std::vector<std::pair<int64, int64>>& partition) {
-  // Return in 'partition_buffers' partition [size, limit) for each dimension.
-  auto partition_buffers = new int64[partition.size() * 2];
-  for (int i = 0; i < partition.size(); ++i) {
-    partition_buffers[2 * i + 0] = partition[i].first;
-    partition_buffers[2 * i + 1] = partition[i].first + partition[i].second;
-  }
-  return partition_buffers;
-}
-
-bool Executor::HasParallelTasks(HloInstruction* instruction) {
-  return instruction->opcode() == HloOpcode::kCall &&
-         !instruction->to_apply()
-              ->root_instruction()
-              ->outer_dimension_partitions()
-              .empty();
-}
-
-const void** Executor::GetOperandBuffers(HloInstruction* instruction) {
-  // We cannot use a move-only RAII type like std::unique_ptr because the
-  // list of operands is allocated on the main thread and transferred to the
-  // worker via the lambda passed to enqueue_function.  In order for the
-  // lambda to take ownership, we would need to use generalized lambda
-  // capture which is a feature new to C++14.
-  // TODO(b/27458679) Avoid dynamic allocations in Executor.
-  auto operand_buffers = new const void*[instruction->operand_count()];
-  std::transform(instruction->operands().begin(), instruction->operands().end(),
-                 operand_buffers, [this](HloInstruction* operand) {
-                   return FindOrDie(*results_, operand);
-                 });
-  return operand_buffers;
-}
-
-}  // namespace
-
-Status ParallelCpuExecutable::AllocateBuffers(
-    DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
-  CHECK_EQ(buffers->size(), assignment_->Allocations().size());
-  VLOG(3) << "Allocating " << assignment_->Allocations().size()
-          << " allocations for module " << module().name();
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    auto& allocation = assignment_->GetAllocation(i);
-
-    VLOG(3) << allocation.ToString();
-
-    if (allocation.is_entry_computation_parameter()) {
-      VLOG(3) << "allocation #" << i << " is a parameter";
-      continue;
-    }
-
-    if (allocation.is_thread_local()) {
-      VLOG(3) << "buffer #" << i << " is thread-local";
-      continue;
-    }
-
-    int64 buffer_size = allocation.size();
-    if (!(*buffers)[i].is_null()) {
-      VLOG(3) << "buffer #" << i
-              << " is in the preallocated result ShapedBuffer";
-    } else {
-      TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate(
-                                             device_ordinal, buffer_size));
-
-      VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes ["
-              << (*buffers)[i].opaque() << "]";
-    }
-
-    // Since the output buffer and all the temporary buffers were written into
-    // by the JITed code, msan has no way of knowing their memory was
-    // initialized. Mark them initialized so that msan doesn't flag loads from
-    // these buffers.
-    TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size);
-  }
-
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  VLOG(3) << "result index: " << result_slice.index();
-
-  return Status::OK();
-}
-
-Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
-  // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.
-  std::vector<int64>* profile_counters = nullptr;
-  if (hlo_execution_profile) {
-    profile_counters = hlo_execution_profile->mutable_profile_counters();
-  }
-
-  std::vector<void*> buffer_pointers;
-  buffer_pointers.reserve(buffers.size());
-  for (auto device_allocation : buffers) {
-    buffer_pointers.push_back(device_allocation.opaque());
-  }
-
-  // Resolve functions for all the HLO instructions ahead of time.
-  HloInstructionMap<ComputeFunctionType> functions;
-  for (auto& entry : *function_names_) {
-    tensorflow::mutex_lock lock(jit_mutex_);
-    HloInstruction* instruction = entry.first;
-    llvm::JITSymbol sym = jit_->FindCompiledSymbol(entry.second);
-    TF_RET_CHECK(sym);
-    InsertOrDie(
-        &functions, instruction,
-        reinterpret_cast<ComputeFunctionType>(cantFail(sym.getAddress())));
-  }
-
-  // Map containing pointers to result buffers for each instruction.
-  HloInstructionMap<const void*> results;
-
-  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
-
-  std::list<HloInstruction*> pending;
-
-  // Call the function for each HLO instruction in topological order.
-  const HloComputation& entry_computation = *module().entry_computation();
-  for (auto* instruction : entry_computation.MakeInstructionPostOrder()) {
-    // Parameters and constants have no functions associated with them. Instead
-    // just copy the existing buffer into the map containing instruction
-    // results..
-    if (instruction->opcode() == HloOpcode::kParameter) {
-      InsertOrDie(
-          &results, instruction,
-          arguments[instruction->parameter_number()]->root_buffer().opaque());
-    } else if (instruction->opcode() == HloOpcode::kConstant) {
-      unsigned char* aligned_data =
-          FindOrDie(aligned_constants_, instruction).get();
-      InsertOrDie(&results, instruction, aligned_data);
-    } else {
-      TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall);
-      pending.push_back(instruction);
-    }
-  }
-
-  // TODO(b/27458679) Manage scheduling based on in-flight concurrency limits.
-  // For example, if we expect a library conv/matmul call to run at max
-  // concurrency, we should not dispatch runnable instructions until the
-  // library call is finished (to avoid expensive cache invalidation).
-  Executor executor(
-      functions, run_options, &pending, &results, buffer_pointers.data(),
-      profile_counters ? profile_counters->data() : nullptr, assignment_.get());
-
-  TF_RETURN_IF_ERROR(executor.Run());
-
-  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
-
-  {
-    tensorflow::mutex_lock lock(mutex_);
-    double nanoseconds = (end_micros - start_micros) * 1000.0;
-    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-  }
-
-  return Status::OK();
-}
-
-StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  if (GetRootPointsToSet().IsAmbiguous()) {
-    return Unimplemented("Points-to set of root instruction is ambiguous");
-  }
-
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
-
-  auto result_buffer = MakeUnique<ShapedBuffer>(
-      /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
-      stream->parent()->platform(), stream->parent()->device_ordinal());
-
-  TF_RETURN_IF_ERROR(AllocateBuffers(
-      memory_allocator, stream->parent()->device_ordinal(), &buffers));
-
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers,
-                                             hlo_execution_profile));
-
-  // Copy DeviceMemoryBase values which into the respective location in
-  // ShapedBuffer which is returned to the caller.
-  std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
-      [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
-        const auto& sources = this->GetRootPointsToSet().element(index);
-
-        // The points to set is unambiguous so the set should be a singleton.
-        CHECK_EQ(1, sources.size());
-        const LogicalBuffer* buffer_source = sources[0];
-        HloInstruction* src = buffer_source->instruction();
-
-        // The source for this result buffer can be a nested buffer such as a
-        // tuple element. The source instruction should have a non-parameter
-        // buffer assigned.
-        TF_ASSIGN_OR_RETURN(
-            const BufferAllocation::Slice slice,
-            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
-        CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-        const BufferAllocation::Index buffer_index = slice.index();
-        const se::DeviceMemoryBase& buffer = buffers[buffer_index];
-        CHECK(!buffer.is_null() || buffer.size() == 0);
-        *device_memory = buffer;
-        buffers_in_result[buffer_index] = true;
-        return Status::OK();
-      }));
-
-  // Free all buffers not in the result.
-  for (size_t i = 0; i < buffers.size(); ++i) {
-    se::DeviceMemoryBase alloc = buffers[i];
-    if (!buffers_in_result[i] && !alloc.is_null()) {
-      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(
-          stream->parent()->device_ordinal(), &alloc));
-    }
-  }
-
-  return std::move(result_buffer);
-}
-
-StatusOr<std::unique_ptr<ShapedBuffer>>
-ParallelCpuExecutable::ExecuteAsyncOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
-  // TODO(b/30671675): Implement asynchronous execution mode.
-  return Unimplemented(
-      "Asynchronous execution on stream is not yet supported on CPU.");
-}
-
-const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
-  return assignment_->points_to_analysis().GetPointsToSet(
-      module().entry_computation()->root_instruction());
-}
-
-}  // namespace cpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
deleted file mode 100644
index c393e9b8ea39bfb4c605ebba8e2cd29726bc4af9..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_
-
-#include <stddef.h>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/shaped_buffer.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-
-namespace xla {
-namespace cpu {
-
-// CPU-targeting parallel implementation of the XLA Executable interface.
-//
-// Wraps a JIT-ed object that can be executed "on device". We JIT for the host
-// architecture, so JIT-ed code and host code share the same ABI.
-class ParallelCpuExecutable : public Executable {
- public:
-  ParallelCpuExecutable(
-      std::unique_ptr<SimpleOrcJIT> jit,
-      std::unique_ptr<const BufferAssignment> assignment,
-      std::unique_ptr<const HloModule> hlo_module,
-      std::unique_ptr<const HloInstructionMap<string>> function_names,
-      std::unordered_map<const HloInstruction*,
-                         std::unique_ptr<unsigned char[]>>
-          aligned_constants,
-      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
-  ~ParallelCpuExecutable() override {}
-
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
-
-  // This should be called after set_ir_module_string.
-  const string& ir_module_string() const { return ir_module_string_; }
-
-  void set_ir_module_string(const string& ir_module_string) {
-    ir_module_string_ = ir_module_string;
-  }
-
-  static int64 ShapeSizeBytes(const Shape& shape) {
-    // On the cpu, opaques are pointers.
-    if (ShapeUtil::IsOpaque(shape)) {
-      return sizeof(void*);
-    }
-    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
-  }
-
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on CPU parallel executable.
-    return Unimplemented(
-        "Equality test on CPU parallel executable is not implemented.");
-  }
-
- private:
-  // Allocate buffers required for execution and assign them to the elements of
-  // "buffers". "buffers" should be sized to the number of buffers in buffer
-  // assignment. Each vector element corresponds to a particular Index. If
-  // a vector element already contains a non-null DeviceMemoryBase, then no
-  // buffer is assigned for this element.
-  Status AllocateBuffers(
-      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
-
-  // Calls the generated functions in 'function_names_', performing the
-  // computation with the given arguments using the supplied buffers.
-  Status ExecuteComputeFunctions(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
-      HloExecutionProfile* hlo_execution_profile);
-
-  // Returns the points-to set of the root instruction of the entry
-  // computation. Uses points-to analysis from buffer assignment.
-  const PointsToSet& GetRootPointsToSet() const;
-
-  // The JIT containing compiled modules.
-  tensorflow::mutex jit_mutex_;
-  const std::unique_ptr<SimpleOrcJIT> jit_ GUARDED_BY(jit_mutex_);
-
-  // Buffer assignment for the buffers we need to allocate.
-  const std::unique_ptr<const BufferAssignment> assignment_;
-
-  // The LLVM IR, in string format, of the unoptimized module generated for this
-  // ParallelCpuExecutable. We save a string instead of an llvm::Module* because
-  // leaving llvm::Module* in a singleton can cause the heap checker to emit
-  // false positives.
-  string ir_module_string_;
-
-  // Map containing the JITted function names for each HLO instruction.
-  const std::unique_ptr<const HloInstructionMap<string>> function_names_;
-
-  // Map from HLO Constant instructions to a pointer to their literal data.
-  // The data stored in the protocol buffer might be insufficiently aligned,
-  // we create a sufficiently aligned copy and store it in this map.
-  const std::unordered_map<const HloInstruction*,
-                           std::unique_ptr<unsigned char[]>>
-      aligned_constants_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ParallelCpuExecutable);
-};
-
-}  // namespace cpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 1e439cde11cf74272101b80c867a308e51ab26a6..54af40506dab48b3c2a3a44eb0b5f5fb213a32ec 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -29,7 +29,8 @@ ParallelLoopEmitter::ParallelLoopEmitter(
     : LoopEmitter(target_element_generator, target_array, ir_builder),
       dynamic_loop_bounds_(dynamic_loop_bounds) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   CHECK(!ShapeUtil::IsTuple(shape_));
   CHECK(!ShapeUtil::IsScalar(shape_));
@@ -69,7 +70,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK(exit_bb_ != nullptr);
 
-  return array_index;
+  return {array_index};
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index ce92e36a944de33b991d97460f0b2e859ad56081..755715634aa70a822b21d25dcae20a8fe053477a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -60,7 +60,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index deb21bf4ef5895cfdbec5c2449b6ce7b306a7008..fb28280fade307ac1f193e7dca481bd2afa855fc 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -71,7 +71,7 @@ class DefaultCostModel : public ParallelCostModel {
     if (flops_to_bytes_ratio <= 1.0) {
       // Limit max parallelism for I/O bound instructions by assuming a
       // sub-linear scaling function (fit based on empirical benchmark results).
-      // TODO(29630486) Develop system bandwidth model.
+      // TODO(b/29630486) Develop system bandwidth model.
       max_parallelism =
           std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs()));
       // Use shape size instruction cost and L2 cache size min per-thread cost.
@@ -81,7 +81,7 @@ class DefaultCostModel : public ParallelCostModel {
       // Use max parallelism for compute bound instructions.
       max_parallelism = max_parallelism_;
       // Calculate the instruction cost in cycles.
-      // TODO(29630486) Improve on this linear cost model.
+      // TODO(b/29630486) Improve on this linear cost model.
       // Consider making 'min_cost_per_thread' be a function of the target
       // bandwidth limit for instructions with low arithmetic complexity.
       instruction_cost =
@@ -128,24 +128,25 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
   // one of the following properties:
   // *) Internal threading (library calls to kConv, kDot, kFft, kCustomCall).
   // *) Emit custom loops (kSelectAndScatter, FusionKind::kTransposeDot).
+  // *) Operations that are not thread safe (like infeed and rng).
   // *) Tuple-shaped.
   // TODO(b/27458679) Parallelize instructions which are skipped here.
-  if (instruction->opcode() == HloOpcode::kParameter ||
-      instruction->opcode() == HloOpcode::kConstant ||
-      instruction->opcode() == HloOpcode::kCall ||
-      instruction->opcode() == HloOpcode::kCustomCall ||
-      instruction->opcode() == HloOpcode::kSelectAndScatter ||
-      instruction->opcode() == HloOpcode::kGetTupleElement ||
-      instruction->opcode() == HloOpcode::kBitcast ||
-      instruction->opcode() == HloOpcode::kFft ||
-      (instruction->opcode() == HloOpcode::kConvolution &&
+  auto opcode = instruction->opcode();
+  if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
+      opcode == HloOpcode::kCall || opcode == HloOpcode::kCustomCall ||
+      opcode == HloOpcode::kDot || opcode == HloOpcode::kSelectAndScatter ||
+      opcode == HloOpcode::kGetTupleElement || opcode == HloOpcode::kBitcast ||
+      opcode == HloOpcode::kFft || opcode == HloOpcode::kInfeed ||
+      opcode == HloOpcode::kOutfeed || opcode == HloOpcode::kRng ||
+      (opcode == HloOpcode::kConvolution &&
        PotentiallyImplementedAsEigenConvolution(*instruction)) ||
       PotentiallyImplementedAsEigenDot(*instruction) ||
-      (instruction->opcode() == HloOpcode::kFusion &&
+      (opcode == HloOpcode::kFusion &&
        instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
       ShapeUtil::IsTuple(instruction->shape())) {
     return 1;
   }
+
   // Consult 'cost_model_' to compute target parallel task count.
   return cost_model_->GetParallelTaskCount(instruction);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..13eb75a57213b1a68a5732a4f6061efdf97fa4f4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace {
+
+class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
+ protected:
+  const HloCostAnalysis::ShapeSizeFunction shape_size_func_ =
+      cpu::CpuExecutable::ShapeSizeBytes;
+
+  // Use any value larger than 2 since we only test whether a module is
+  // parallelized or not
+  const int max_parallelism_ = 10;
+};
+
+TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_Dot
+    ENTRY Dot {
+      dot_lhs = f32[196614,2]{1,0} parameter(0)
+      dot_rhs = f32[2,1]{1,0} parameter(1)
+      ROOT dot = f32[196614,1]{1,0} dot(dot_lhs, dot_rhs),
+        lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ParallelTaskAssignmentTest,
+       FusedComputationWithDotOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_DotNestedInFusedComp
+    fused_computation.0 {
+      parameter.0 = f32[196614,2]{1,0} parameter(0)
+      parameter.0.1 = f32[2,1]{1,0} parameter(1)
+      parameter.0.2 = f32[196614,1]{1,0} parameter(2)
+      dot.0 = f32[196614,1]{1,0} dot(parameter.0, parameter.0.1),
+        lhs_contracting_dims={1}, rhs_contracting_dims={0}
+      ROOT add.0 = f32[196614,1]{1,0} add(dot.0, parameter.0.2)
+
+    }
+    ENTRY DotNestedInFusedComp {
+      parameter = f32[196614,2]{1,0} parameter(0)
+      parameter.1 = f32[2,1]{1,0} parameter(1)
+      parameter.2 = f32[196614,1]{1,0} parameter(2)
+      ROOT fusion = f32[196614,1]{1,0} fusion(parameter, parameter.1,
+        parameter.2), kind=kOutput, calls=fused_computation.0
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_rng
+    ENTRY Rng {
+      src0 = f32[] parameter(0)
+      src1 = f32[] parameter(1)
+      ROOT rng0 = f32[1234567,2]{1,0} rng(f32[] src0, f32[] src1),
+      distribution=rng_uniform
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_infeed_outfeed
+    ENTRY InfeedOutfeed {
+      infeed0 = u32[12345678,2]{1,0} infeed()
+      ROOT outfeed0 = u32[12345678,2]{1,0} outfeed(infeed0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h
index 39e20ed45639040110b99ddb52eb6f6dab26dfaa..7337c907f5c83d608641b7382e75902e6f6c05d4 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_H_
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/types.h"
 
 extern "C" {
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c60580d6e763c659102b570ed044706f87899437
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h"
+#include <iostream>
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+using tensorflow::int64;
+
+#ifdef INTEL_MKL
+#include <omp.h>
+#include "mkldnn.hpp"
+#include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h"
+
+namespace {
+
+// Downcast an int64 to int and check if value is in range.
+int ToInt(int64 input) {
+  int output = static_cast<int>(input);
+  if (static_cast<int64>(output) != input) {
+    std::cerr << "Error occurred in downcasting int64 to int32: Value " << input
+              << " is out-of-range for type int32. \n";
+    exit(1);
+  }
+  return output;
+}
+
+using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
+using mkldnn::engine;
+using mkldnn::memory;
+using mkldnn::padding_kind;
+using mkldnn::primitive;
+using mkldnn::prop_kind;
+using mkldnn::reorder;
+using mkldnn::stream;
+
+template <typename EigenDevice, typename ScalarType>
+void MKLConvImpl(const EigenDevice& device, ScalarType* out, ScalarType* lhs,
+                 ScalarType* rhs, int64 input_batch, int64 input_rows,
+                 int64 input_cols, int64 input_channels, int64 kernel_rows,
+                 int64 kernel_cols, int64 kernel_channels, int64 kernel_filters,
+                 int64 output_rows, int64 output_cols, int64 row_stride,
+                 int64 col_stride, int64 padding_top, int64 padding_bottom,
+                 int64 padding_left, int64 padding_right,
+                 int64 lhs_row_dilation, int64 lhs_col_dilation,
+                 int64 rhs_row_dilation, int64 rhs_col_dilation) {
+  auto cpu_engine = engine(engine::cpu, 0);
+
+  // Create a vector primitive to hold the network.
+  std::vector<primitive> net;
+
+  // Since memory::dims takes int for each dimension, we downcast the int64
+  // values to int using the ToInt function defined above.
+  memory::dims conv1_src_dim = {ToInt(input_batch), ToInt(input_channels),
+                                ToInt(input_rows), ToInt(input_cols)};
+  memory::dims conv1_weights_dim = {ToInt(kernel_filters),
+                                    ToInt(kernel_channels), ToInt(kernel_rows),
+                                    ToInt(kernel_cols)};
+  memory::dims conv1_dst_dim = {ToInt(input_batch), ToInt(kernel_filters),
+                                ToInt(output_rows), ToInt(output_cols)};
+  memory::dims conv1_strides = {ToInt(row_stride), ToInt(col_stride)};
+  // Note: In MKL_DNN dilation starts from 0.
+  memory::dims conv1_dilates = {ToInt(rhs_row_dilation - 1),
+                                ToInt(rhs_col_dilation - 1)};
+  memory::dims conv1_padding_l = {ToInt(padding_top), ToInt(padding_left)};
+  memory::dims conv1_padding_r = {ToInt(padding_bottom), ToInt(padding_right)};
+
+  // Create memory for user data. Input and output data have format of NHWC and
+  // kernel data has format of HWIO.
+  // Note that as a convention in MKL-DNN, the dimensions of the data is always
+  // described in NCHW/IOHW, regardless of the actual layout of the data.
+  auto user_src_memory =
+      memory({{{conv1_src_dim}, memory::data_type::f32, memory::format::nhwc},
+              cpu_engine},
+             lhs);
+  auto user_weights_memory = memory(
+      {{{conv1_weights_dim}, memory::data_type::f32, memory::format::hwio},
+       cpu_engine},
+      rhs);
+  auto user_dst_memory =
+      memory({{{conv1_dst_dim}, memory::data_type::f32, memory::format::nhwc},
+              cpu_engine},
+             out);
+
+  // Create memory descriptors for convolution data with no specified format for
+  // best performance.
+  auto conv1_src_mem_desc = memory::desc(
+      {conv1_src_dim}, memory::data_type::f32, memory::format::any);
+  auto conv1_weights_mem_desc = memory::desc(
+      {conv1_weights_dim}, memory::data_type::f32, memory::format::any);
+  auto conv1_dst_mem_desc = memory::desc(
+      {conv1_dst_dim}, memory::data_type::f32, memory::format::any);
+
+  // Create a convolution.
+  auto conv1_desc = convolution_forward::desc(
+      prop_kind::forward_inference, convolution_direct, conv1_src_mem_desc,
+      conv1_weights_mem_desc, conv1_dst_mem_desc, conv1_strides, conv1_dilates,
+      conv1_padding_l, conv1_padding_r, padding_kind::zero);
+  auto conv1_prim_desc =
+      convolution_forward::primitive_desc(conv1_desc, cpu_engine);
+
+  // Create reorders for data and weights if layout requested by convolution is
+  // different from NCHW/OIHW.
+  auto conv1_src_memory = user_src_memory;
+  if (memory::primitive_desc(conv1_prim_desc.src_primitive_desc()) !=
+      user_src_memory.get_primitive_desc()) {
+    conv1_src_memory = memory(conv1_prim_desc.src_primitive_desc());
+    net.push_back(reorder(user_src_memory, conv1_src_memory));
+  }
+
+  auto conv1_weights_memory = user_weights_memory;
+  if (memory::primitive_desc(conv1_prim_desc.weights_primitive_desc()) !=
+      user_weights_memory.get_primitive_desc()) {
+    conv1_weights_memory = memory(conv1_prim_desc.weights_primitive_desc());
+    net.push_back(reorder(user_weights_memory, conv1_weights_memory));
+  }
+
+  // Check if output need layout conversion. If yes, create memory for
+  // intermediate layer of conv1_dst_memory.
+  bool need_output_conversion =
+      (memory::primitive_desc(conv1_prim_desc.dst_primitive_desc()) !=
+       user_dst_memory.get_primitive_desc());
+  auto conv1_dst_memory = need_output_conversion
+                              ? memory(conv1_prim_desc.dst_primitive_desc())
+                              : user_dst_memory;
+
+  // Create convolution primitive and add it to net.
+  net.push_back(convolution_forward(conv1_prim_desc, conv1_src_memory,
+                                    conv1_weights_memory, conv1_dst_memory));
+  if (need_output_conversion) {
+    net.push_back(reorder(conv1_dst_memory, user_dst_memory));
+  }
+  stream(stream::kind::eager).submit(net).wait();
+}
+}  // namespace
+#endif  // INTEL_MKL
+
+TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_MKLConvF32(
+    const void* run_options_ptr, float* out, float* lhs, float* rhs,
+    int64 input_batch, int64 input_rows, int64 input_cols, int64 input_channels,
+    int64 kernel_rows, int64 kernel_cols, int64 kernel_channels,
+    int64 kernel_filters, int64 output_rows, int64 output_cols,
+    int64 row_stride, int64 col_stride, int64 padding_top, int64 padding_bottom,
+    int64 padding_left, int64 padding_right, int64 lhs_row_dilation,
+    int64 lhs_col_dilation, int64 rhs_row_dilation, int64 rhs_col_dilation) {
+#ifdef INTEL_MKL
+  // Since MKL_DNN cannot handle transposed convolution, this is handled by
+  // Eigen.
+  if (lhs_row_dilation > 1 || lhs_col_dilation > 1) {
+    __xla_cpu_runtime_EigenConvF32(
+        run_options_ptr, out, lhs, rhs, input_batch, input_rows, input_cols,
+        input_channels, kernel_rows, kernel_cols, kernel_channels,
+        kernel_filters, output_rows, output_cols, row_stride, col_stride,
+        padding_top, padding_bottom, padding_left, padding_right,
+        lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation);
+  } else {
+    MKLConvImpl(nullptr, out, lhs, rhs, input_batch, input_rows, input_cols,
+                input_channels, kernel_rows, kernel_cols, kernel_channels,
+                kernel_filters, output_rows, output_cols, row_stride,
+                col_stride, padding_top, padding_bottom, padding_left,
+                padding_right, lhs_row_dilation, lhs_col_dilation,
+                rhs_row_dilation, rhs_col_dilation);
+  }
+#else
+  std::cerr << "Attempt to call MKL Conv2D runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+#endif  // INTEL_MKL
+}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b239e71d231c5237a51a7048025bc2dcbd54fbe5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_MKL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_MKL_H_
+
+#include <iostream>
+#include "tensorflow/core/platform/types.h"
+
+extern "C" {
+
+extern void __xla_cpu_runtime_MKLConvF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 input_batch,
+    tensorflow::int64 input_rows, tensorflow::int64 input_cols,
+    tensorflow::int64 input_channels, tensorflow::int64 kernel_rows,
+    tensorflow::int64 kernel_cols, tensorflow::int64 kernel_channels,
+    tensorflow::int64 kernel_filters, tensorflow::int64 output_rows,
+    tensorflow::int64 output_cols, tensorflow::int64 row_stride,
+    tensorflow::int64 col_stride, tensorflow::int64 padding_top,
+    tensorflow::int64 padding_bottom, tensorflow::int64 padding_left,
+    tensorflow::int64 padding_right, tensorflow::int64 lhs_row_dilation,
+    tensorflow::int64 lhs_col_dilation, tensorflow::int64 rhs_row_dilation,
+    tensorflow::int64 rhs_col_dilation);
+}
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fp16.cc b/tensorflow/compiler/xla/service/cpu/runtime_fp16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af0275c8bd00c82220fbe116eb90d2692393713b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fp16.cc
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstring>
+
+#include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace {
+using tensorflow::uint16;
+using tensorflow::uint32;
+
+// Helper class that lets us access the underlying bit representation
+// of a float without breaking C++ strict aliasing.
+class AliasedFloatInt {
+ public:
+  static_assert(sizeof(float) == sizeof(uint32), "");
+
+  static AliasedFloatInt FromFloat(float f) {
+    AliasedFloatInt value;
+    value.set_float(f);
+    return value;
+  }
+
+  static AliasedFloatInt FromUInt(uint32 u) {
+    AliasedFloatInt value;
+    value.set_uint(u);
+    return value;
+  }
+
+  void set_float(float f) { memcpy(&value_, &f, sizeof(f)); }
+  float as_float() const {
+    float f;
+    memcpy(&f, &value_, sizeof(f));
+    return f;
+  }
+
+  void set_uint(uint32 u) { value_ = u; }
+  uint32 as_uint() const { return value_; }
+
+ private:
+  uint32 value_;
+};
+}  // namespace
+
+// __gnu_f2h_ieee and __gnu_h2f_ieee are marked as weak symbols so if XLA is
+// built with compiler-rt (that also defines these symbols) we don't get a
+// duplicate definition linker error.  Making these symbols weak also ensures
+// that the compiler-rt definitions "win", but that isn't essential.
+
+// Algorithm copied from Eigen.
+uint16 TF_ATTRIBUTE_WEAK __gnu_f2h_ieee(float float_value) {
+  AliasedFloatInt f = AliasedFloatInt::FromFloat(float_value);
+
+  const AliasedFloatInt f32infty = AliasedFloatInt::FromUInt(255 << 23);
+  const AliasedFloatInt f16max = AliasedFloatInt::FromUInt((127 + 16) << 23);
+  const AliasedFloatInt denorm_magic =
+      AliasedFloatInt::FromUInt(((127 - 15) + (23 - 10) + 1) << 23);
+  unsigned int sign_mask = 0x80000000u;
+  uint32 o = static_cast<uint16>(0x0u);
+
+  unsigned int sign = f.as_uint() & sign_mask;
+  f.set_uint(f.as_uint() ^ sign);
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.as_uint() >=
+      f16max.as_uint()) {  // result is Inf or NaN (all exponent bits set)
+    o = (f.as_uint() > f32infty.as_uint()) ? 0x7e00
+                                           : 0x7c00;  // NaN->qNaN and Inf->Inf
+  } else {                            // (De)normalized number or zero
+    if (f.as_uint() < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.set_float(f.as_float() + denorm_magic.as_float());
+
+      // and one integer subtract of the bias later, we have our final float!
+      o = static_cast<uint16>(f.as_uint() - denorm_magic.as_uint());
+    } else {
+      unsigned int mant_odd =
+          (f.as_uint() >> 13) & 1;  // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      f.set_uint(f.as_uint() + (static_cast<unsigned int>(15 - 127) << 23) +
+                 0xfff);
+      // rounding bias part 2
+      f.set_uint(f.as_uint() + mant_odd);
+      // take the bits!
+      o = static_cast<uint16>(f.as_uint() >> 13);
+    }
+  }
+
+  o |= static_cast<uint16>(sign >> 16);
+  return o;
+}
+
+// Algorithm copied from Eigen.
+float TF_ATTRIBUTE_WEAK __gnu_h2f_ieee(uint16 h) {
+  const AliasedFloatInt magic = AliasedFloatInt::FromUInt(113 << 23);
+  const unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
+  AliasedFloatInt o;
+
+  o.set_uint((h & 0x7fff) << 13);                // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.as_uint();  // just the exponent
+  o.set_uint(o.as_uint() + ((127 - 15) << 23));  // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {                        // Inf/NaN?
+    o.set_uint(o.as_uint() + ((128 - 16) << 23));  // extra exp adjust
+  } else if (exp == 0) {                           // Zero/Denormal?
+    o.set_uint(o.as_uint() + (1 << 23));           // extra exp adjust
+    o.set_float(o.as_float() - magic.as_float());  // renormalize
+  }
+
+  o.set_uint(o.as_uint() | (h & 0x8000) << 16);  // sign bit
+  return o.as_float();
+}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fp16.h b/tensorflow/compiler/xla/service/cpu/runtime_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..01d92d031904af99884c2583a8c7b5086b289d44
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fp16.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FP16_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FP16_H_
+
+#include "tensorflow/core/platform/types.h"
+
+// Converts an F32 value to a F16.
+extern "C" tensorflow::uint16 __gnu_f2h_ieee(float);
+
+// Converts an F16 value to a F32.
+extern "C" float __gnu_h2f_ieee(tensorflow::uint16);
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FP16_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
index bff57d33ae23fbba8c664cbd18df77e4c35eb592..39b13183ff093611a42b3931d45f64eadb420622 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
@@ -63,30 +63,41 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
   C.device(*run_options->intra_op_thread_pool()) = A.contract(B, dims);
 }
 
+template <typename T>
+void MatMulImpl(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
+                int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
+  if (m == 1 || n == 1) {
+    // Despite being single threaded, this version of matrix * vector is faster.
+    xla::EigenMatVec<T>(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    MatMul<T>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
+              transpose_rhs);
+  }
+}
+
 }  // namespace
 
+void __xla_cpu_runtime_EigenMatMulF16(const void* run_options_ptr,
+                                      Eigen::half* out, Eigen::half* lhs,
+                                      Eigen::half* rhs, int64 m, int64 n,
+                                      int64 k, int32 transpose_lhs,
+                                      int32 transpose_rhs) {
+  MatMulImpl<Eigen::half>(run_options_ptr, out, lhs, rhs, m, n, k,
+                          transpose_lhs, transpose_rhs);
+}
+
 void __xla_cpu_runtime_EigenMatMulF32(const void* run_options_ptr, float* out,
                                       float* lhs, float* rhs, int64 m, int64 n,
                                       int64 k, int32 transpose_lhs,
                                       int32 transpose_rhs) {
-  if (m == 1 || n == 1) {
-    // Despite being single threaded, this version of matrix * vector is faster.
-    xla::EigenMatVecF32(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  } else {
-    MatMul<float>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-                  transpose_rhs);
-  }
+  MatMulImpl<float>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
+                    transpose_rhs);
 }
 
 void __xla_cpu_runtime_EigenMatMulF64(const void* run_options_ptr, double* out,
                                       double* lhs, double* rhs, int64 m,
                                       int64 n, int64 k, int32 transpose_lhs,
                                       int32 transpose_rhs) {
-  if (m == 1 || n == 1) {
-    // Despite being single threaded, this version of matrix * vector is faster.
-    xla::EigenMatVecF64(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  } else {
-    MatMul<double>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-                   transpose_rhs);
-  }
+  MatMulImpl<double>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
+                     transpose_rhs);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul.h
index fdb644651dd5d0fa0345580f52ed0fb051672285..d96fe3d58bd5ffbad347e3ede3534d1d47be697a 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_H_
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/types.h"
 
 extern "C" {
@@ -25,6 +26,12 @@ extern "C" {
 // order. 'out' is a pointer to a buffer sufficiently large to hold the result
 // of the operation. Following standard nomenclature: lhs is m x k,
 // rhs is k x n, and out is m x n.
+extern void __xla_cpu_runtime_EigenMatMulF16(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr,
+    Eigen::half* out, Eigen::half* lhs, Eigen::half* rhs, tensorflow::int64 m,
+    tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
 extern void __xla_cpu_runtime_EigenMatMulF32(
     const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
     float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..92da5f71c23d5e1450b39ea8b7bb8345f6fabb3b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+#include "third_party/intel_mkl_ml/include/mkl_service.h"
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/core/platform/types.h"
+
+#define EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+namespace {
+// BLAS GEMM API for 32-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF32(const void* run_options_ptr, float* out, float* lhs, float* rhs,
+               int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For column-major matrices, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_sgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+// BLAS GEMM API for 64-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF64(const void* run_options_ptr, double* out, double* lhs,
+               double* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For a column-major matrix, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_dgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+}  // namespace
+
+void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out,
+                                    float* lhs, float* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+// BLAS GEMM API for 64-bit Matrix Multiplication
+void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out,
+                                    double* lhs, double* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr,
+                                                  float* out, float* lhs,
+                                                  float* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr,
+                                                  double* out, double* lhs,
+                                                  double* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+#endif  // INTEL_MKL
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
new file mode 100644
index 0000000000000000000000000000000000000000..831b796efb971f6fb0170e2321c00ac415f2830f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+
+#include <iostream>
+#include "tensorflow/core/platform/types.h"
+#ifdef INTEL_MKL
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
+#else
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matvec.cc b/tensorflow/compiler/xla/service/cpu/runtime_matvec.cc
deleted file mode 100644
index 435820cdd36e2a906d9dfbe2555f4c0df623c729..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/runtime_matvec.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cassert>
-
-#include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/compiler/xla/service/cpu/runtime_matvec.h"
-
-using tensorflow::int32;
-using tensorflow::int64;
-
-namespace {
-
-// Does mat * x or mat^T * x.
-template <typename T>
-void MatVec(T* out_buf, T* mat_buf, T* x_buf, int64 rows, int64 cols,
-            int32 transpose) {
-  // Use an Eigen Matrix instead of a Tensor, as the GEMV from Matrix seems to
-  // be faster (b/30223679).  See also: the matmul op kernel in TensorFlow,
-  // which implements the same optimization.
-  using Matrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  using Vector = Eigen::Matrix<T, Eigen::Dynamic, 1>;
-  using VectorMap = Eigen::Map<Vector>;
-
-  auto x = VectorMap(x_buf, cols);
-  auto out = VectorMap(out_buf, rows);
-
-  int64 mat_rows = rows;
-  int64 mat_cols = cols;
-
-  if (transpose) {
-    std::swap(mat_rows, mat_cols);
-  }
-
-  auto mat = MatrixMap(mat_buf, mat_rows, mat_cols);
-
-  if (transpose) {
-    out = mat.transpose() * x;
-  } else {
-    out = mat * x;
-  }
-}
-
-// Converts matmul-style args to matvec.
-template <typename T>
-void DispatchMatVec(T* out, T* lhs, T* rhs, int64 m, int64 n, int64 k,
-                    int32 transpose_lhs, int32 transpose_rhs) {
-  // If the input is in the form x * A, where x is the vector, then bring A back
-  // over to the left hand side.  We make use of the identity
-  //
-  //   (x * A)^T = A^T * x^T
-  //
-  // We do not need to take the transpose of x or of the result since taking
-  // the transpose of a vector does not change the memory layout.
-  const int64 cols = k;
-
-  T* mat;
-  T* vec;
-  int64 rows;
-  bool transpose_mat;
-
-  bool is_mat_vec = (n == 1);
-
-  if (is_mat_vec) {
-    mat = lhs;
-    vec = rhs;
-    rows = m;
-    transpose_mat = transpose_lhs;
-  } else {
-    mat = rhs;
-    vec = lhs;
-    rows = n;
-    transpose_mat = !transpose_rhs;
-  }
-
-  MatVec<T>(out, mat, vec, rows, cols, transpose_mat);
-}
-
-}  // namespace
-
-namespace xla {
-
-void EigenMatVecF32(float* out, float* lhs, float* rhs, int64 m, int64 n,
-                    int64 k, int32 transpose_lhs, int32 transpose_rhs) {
-  assert((m == 1 || n == 1) && "not a matrix-vector multiply");
-  DispatchMatVec<float>(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-}
-
-void EigenMatVecF64(double* out, double* lhs, double* rhs, int64 m, int64 n,
-                    int64 k, int32 transpose_lhs, int32 transpose_rhs) {
-  assert((m == 1 || n == 1) && "not a matrix-vector multiply");
-  DispatchMatVec<double>(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matvec.h b/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
index 1bd8dfb377acc1f7cfbe9a92773f87f0ef25de3a..70eb98c54169824e220d9287753c0849362eade6 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
@@ -16,10 +16,86 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
 
+#include "third_party/eigen3/Eigen/Core"
+
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+namespace detail {
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+// Does mat * x or mat^T * x.
+template <typename T>
+void MatVec(T* out_buf, T* mat_buf, T* x_buf, int64 rows, int64 cols,
+            int32 transpose) {
+  // Use an Eigen Matrix instead of a Tensor, as the GEMV from Matrix seems to
+  // be faster (b/30223679).  See also: the matmul op kernel in TensorFlow,
+  // which implements the same optimization.
+  using Matrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  using Vector = Eigen::Matrix<T, Eigen::Dynamic, 1>;
+  using VectorMap = Eigen::Map<Vector>;
+
+  auto x = VectorMap(x_buf, cols);
+  auto out = VectorMap(out_buf, rows);
+
+  int64 mat_rows = rows;
+  int64 mat_cols = cols;
+
+  if (transpose) {
+    std::swap(mat_rows, mat_cols);
+  }
+
+  auto mat = MatrixMap(mat_buf, mat_rows, mat_cols);
+
+  if (transpose) {
+    out = mat.transpose() * x;
+  } else {
+    out = mat * x;
+  }
+}
+
+// Converts matmul-style args to matvec.
+template <typename T>
+void DispatchMatVec(T* out, T* lhs, T* rhs, int64 m, int64 n, int64 k,
+                    int32 transpose_lhs, int32 transpose_rhs) {
+  // If the input is in the form x * A, where x is the vector, then bring A back
+  // over to the left hand side.  We make use of the identity
+  //
+  //   (x * A)^T = A^T * x^T
+  //
+  // We do not need to take the transpose of x or of the result since taking
+  // the transpose of a vector does not change the memory layout.
+  const int64 cols = k;
+
+  T* mat;
+  T* vec;
+  int64 rows;
+  bool transpose_mat;
+
+  bool is_mat_vec = (n == 1);
+
+  if (is_mat_vec) {
+    mat = lhs;
+    vec = rhs;
+    rows = m;
+    transpose_mat = transpose_lhs;
+  } else {
+    mat = rhs;
+    vec = lhs;
+    rows = n;
+    transpose_mat = !transpose_rhs;
+  }
+
+  MatVec<T>(out, mat, vec, rows, cols, transpose_mat);
+}
+
+}  // namespace detail
+
 // Performs a matrix-vector multiplication using Eigen. 'lhs' and 'rhs' are
 // pointers to buffers containing input matrices in column-major order. 'out' is
 // a pointer to a buffer sufficiently large to hold the result of the
@@ -30,15 +106,15 @@ namespace xla {
 //
 // TODO(b/64684907): Compare runtime performance of these functions with dot
 // simplification.
-void EigenMatVecF32(float* out, float* lhs, float* rhs, tensorflow::int64 m,
-                    tensorflow::int64 n, tensorflow::int64 k,
-                    tensorflow::int32 transpose_lhs,
-                    tensorflow::int32 transpose_rhs);
-
-void EigenMatVecF64(double* out, double* lhs, double* rhs, tensorflow::int64 m,
-                    tensorflow::int64 n, tensorflow::int64 k,
-                    tensorflow::int32 transpose_lhs,
-                    tensorflow::int32 transpose_rhs);
+template <typename T>
+void EigenMatVec(T* out, T* lhs, T* rhs, tensorflow::int64 m,
+                 tensorflow::int64 n, tensorflow::int64 k,
+                 tensorflow::int32 transpose_lhs,
+                 tensorflow::int32 transpose_rhs) {
+  assert((m == 1 || n == 1) && "not a matrix-vector multiply");
+  detail::DispatchMatVec<T>(out, lhs, rhs, m, n, k, transpose_lhs,
+                            transpose_rhs);
+}
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h
index f216bd0152aa93b8753d881938c63a9cabea899b..44b201725b2c724f48c1a3f0373c41e76211e0c2 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_CONV2D_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_CONV2D_H_
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/types.h"
 
 extern "C" {
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
index ee8eb081556d60fcf6537b1036a4a5825c4c7bf6..17303e2f0d34e531a3a56aa147608b949e0f43ae 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
@@ -57,26 +57,38 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
   C = A.contract(B, dims);
 }
 
+template <typename T>
+void SingleThreadedMatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs,
+                          int64 m, int64 n, int64 k, int32 transpose_lhs,
+                          int32 transpose_rhs) {
+  if (m == 1 || n == 1) {
+    xla::EigenMatVec<T>(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    MatMul<T>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
+              transpose_rhs);
+  }
+}
+
 }  // namespace
 
+void __xla_cpu_runtime_EigenSingleThreadedMatMulF16(
+    const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs,
+    Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
+    int32 transpose_rhs) {
+  SingleThreadedMatMul<Eigen::half>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                    transpose_lhs, transpose_rhs);
+}
+
 void __xla_cpu_runtime_EigenSingleThreadedMatMulF32(
     const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m,
     int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
-  if (m == 1 || n == 1) {
-    xla::EigenMatVecF32(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  } else {
-    MatMul<float>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-                  transpose_rhs);
-  }
+  SingleThreadedMatMul<float>(run_options_ptr, out, lhs, rhs, m, n, k,
+                              transpose_lhs, transpose_rhs);
 }
 
 void __xla_cpu_runtime_EigenSingleThreadedMatMulF64(
     const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m,
     int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
-  if (m == 1 || n == 1) {
-    xla::EigenMatVecF64(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  } else {
-    MatMul<double>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-                   transpose_rhs);
-  }
+  SingleThreadedMatMul<double>(run_options_ptr, out, lhs, rhs, m, n, k,
+                               transpose_lhs, transpose_rhs);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h
index 029eb9514287d8c69cde2cfb06e0d56e78d6f165..82a1fcce594fa5b04f4fe459870991863c32a91a 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_MATMUL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_MATMUL_H_
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/types.h"
 
 extern "C" {
@@ -25,6 +26,12 @@ extern "C" {
 // 'out' is a pointer to a buffer sufficiently large to hold the result of the
 // operation. Following standard nomenclature: lhs is m x k, rhs is k x n, and
 // out is m x n.
+extern void __xla_cpu_runtime_EigenSingleThreadedMatMulF16(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr,
+    Eigen::half* out, Eigen::half* lhs, Eigen::half* rhs, tensorflow::int64 m,
+    tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
 extern void __xla_cpu_runtime_EigenSingleThreadedMatMulF32(
     const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
     float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
index 61b408b8c24dded134218110d4e219c31f1685a8..42fe955f1917e0268dc739e44fbd0a7afb39185c 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
@@ -20,12 +20,13 @@ namespace cpu {
 
 std::vector<int64> ShapePartitionAssigner::Run(int64 target_partition_count) {
   // Gather outer-most dims where dim_size >= 'target_partition_count'.
-  // Note: always leave inner-dim static for vectorization/optimizations.
+  // This may include the inner-dim as LLVM can vectorize loops with dynamic
+  // bounds.
   std::vector<int64> outer_dims;
   int64 outer_dim_size = 1;
   // TODO(b/27458679) Consider reserving enough minor dimensions (based on
   // target vector register width) to enable vector instructions.
-  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 1; --i) {
+  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
     const int64 dimension = shape_.layout().minor_to_major(i);
     outer_dims.push_back(dimension);
     outer_dim_size *= shape_.dimensions(dimension);
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
index ee0c53fa6d7c41481a53350e57e5844dea2644c1..ae80a6f4977f85cfd9f872734fd0a69432a1f382 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -30,105 +30,65 @@ class ShapePartitionAssignerTest : public HloTestBase {
  protected:
   typedef std::vector<int64> Vec;
 
-  void RunR2Test(const Shape& shape, const int64 expected_max_partition_count) {
+  void RunR2Test(const Shape& shape, int64 max_target_partition_count,
+                 const std::vector<int64>* expected_partitions) {
     ShapePartitionAssigner assigner(shape);
-    // Check all partitions of outer dimension.
-    for (int64 i = 1; i <= expected_max_partition_count; ++i) {
-      EXPECT_TRUE(ContainersEqual(Vec({i}),
-                                  assigner.Run(/*target_partition_count=*/i)));
+    // Iterate through 1..max_target_partition_count.
+    for (int64 i = 1; i <= max_target_partition_count; ++i) {
+      std::vector<int64> actual_partitions =
+          assigner.Run(/*target_partition_count=*/i);
+      EXPECT_THAT(actual_partitions, expected_partitions[i - 1]);
     }
-    // Check target_partition_count > outer dimension size.
-    EXPECT_TRUE(ContainersEqual(
-        Vec({expected_max_partition_count}),
-        assigner.Run(
-            /*target_partition_count=*/expected_max_partition_count + 1)));
   }
 };
 
 TEST_F(ShapePartitionAssignerTest, Shape13WithLayout10) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 1);
+  std::vector<int64> expected_partitions[] = {{1} /* 1 */, {1, 2} /* 2 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 2,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape31WithLayout01) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 1);
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */, {1, 2} /* 2 */
+  };
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 2,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape53WithLayout10) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 5);
+  std::vector<int64> expected_partitions[] = {{1} /* 1 */, {2} /* 2 */,
+                                              {3} /* 3 */, {4} /* 4 */,
+                                              {5} /* 5 */, {3, 2} /* 6 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 6,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape53WithLayout01) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 3);
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */, {2} /* 2 */, {3} /* 3 */, {2, 2} /* 4 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 4,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape532WithLayout210) {
-  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0});
-  ShapePartitionAssigner assigner(shape);
-
-  for (int64 i = 1; i <= 5; ++i) {
-    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
-                                              /*target_partition_count=*/i)));
-  }
-
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({4, 2}), assigner.Run(/*target_partition_count=*/8)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/10)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/11)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/12)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/13)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/14)));
-  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
-                              assigner.Run(/*target_partition_count=*/15)));
-  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
-                              assigner.Run(/*target_partition_count=*/16)));
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */,     {2} /* 2 */,     {3} /* 3 */,     {4} /* 4 */,
+      {5} /* 5 */,     {3, 2} /* 6 */,  {3, 2} /* 7 */,  {4, 2} /* 8 */,
+      {3, 3} /* 9 */,  {3, 3} /* 10 */, {3, 3} /* 11 */, {4, 3} /* 12 */,
+      {4, 3} /* 13 */, {4, 3} /* 14 */, {5, 3} /* 15 */, {4, 2, 2} /* 16 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0}), 16,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
-  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1});
-  ShapePartitionAssigner assigner(shape);
-
-  for (int64 i = 1; i <= 3; ++i) {
-    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
-                                              /*target_partition_count=*/i)));
-  }
-
-  EXPECT_TRUE(
-      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/4)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/5)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/8)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/10)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/11)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/12)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/13)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/14)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
-                              assigner.Run(/*target_partition_count=*/15)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
-                              assigner.Run(/*target_partition_count=*/16)));
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */,     {2} /* 2 */,     {3} /* 3 */,     {2, 2} /* 4 */,
+      {2, 2} /* 5 */,  {3, 2} /* 6 */,  {3, 2} /* 7 */,  {3, 2} /* 8 */,
+      {3, 3} /* 9 */,  {3, 3} /* 10 */, {3, 3} /* 11 */, {3, 4} /* 12 */,
+      {3, 4} /* 13 */, {3, 4} /* 14 */, {3, 5} /* 15 */, {3, 2, 2} /* 16 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1}), 16,
+            expected_partitions);
 }
 
 class ShapePartitionIteratorTest : public HloTestBase {
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index aa8d4ad9dc51b2c1f500898f8bbd2c548f710643..ff6f0a9d4e443c2ed7d2dd6c58f4aaf28205b0cb 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -31,9 +31,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fft.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
@@ -86,7 +89,6 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      execution_session_(string_pool_),
       symbol_resolver_(llvm::orc::createLegacyLookupResolver(
           [this](const std::string& name) -> llvm::JITSymbol {
             return this->ResolveRuntimeSymbol(name);
@@ -177,19 +179,29 @@ bool RegisterKnownJITSymbols() {
 
   REGISTER_CPU_RUNTIME_SYMBOL(AcquireInfeedBufferForDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(AcquireOutfeedBufferForPopulation);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenFft);
+  REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
 
+  registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee));
+  registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee));
+
 #undef REGISTER_CPU_RUNTIME_SYMBOL
 
 // Register both the f32 (float) and f64 (double) versions of a libm symbol.
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index d0011e0a185cd0284d2f9334594f6e06d9284be7..f4260a95bc45557b6cd969f7d3fff01c8b392575 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -46,9 +46,7 @@ namespace cpu {
 class SimpleOrcJIT {
  public:
   using ObjLayerT = llvm::orc::RTDyldObjectLinkingLayer;
-  using CompileFtor =
-      std::function<llvm::object::OwningBinary<llvm::object::ObjectFile>(
-          llvm::Module&)>;
+  using CompileFtor = std::function<ObjLayerT::ObjectPtr(llvm::Module&)>;
   using CompileLayerT = llvm::orc::IRCompileLayer<ObjLayerT, CompileFtor>;
   using VModuleKeyT = llvm::orc::VModuleKey;
 
@@ -104,7 +102,6 @@ class SimpleOrcJIT {
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
-  llvm::orc::SymbolStringPool string_pool_;
   llvm::orc::ExecutionSession execution_session_;
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index 150db1cb6edec1af6724a8bca6a5f6272f1a7416..cd1165e23812861ba9951546b7dd744529232196 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -370,6 +370,9 @@ std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
 std::vector<llvm::Value*>
 VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
     std::vector<llvm::Value*> vectors, llvm::Value* init_values) {
+  // vectors are N llvm vector values, each with N elements.
+  int64 lane_width = vectors.size();
+
   while (vectors.size() != 2) {
     std::vector<llvm::Value*> new_vectors;
     for (int i = 0; i < vectors.size(); i += 2) {
@@ -390,10 +393,14 @@ VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
     high = AddInternal(ExtractHighHalf(init_values), high);
   }
 
+  // `low` has the first `lane_width / 2` horizontal reductions, and `high` has
+  // the next `lane_width / 2` horizontal reductions.
+
   std::vector<llvm::Value*> results;
-  for (int i = 0; i < 8; i++) {
+  for (int i = 0; i < lane_width; i++) {
     llvm::Value* scalar_result = ir_builder()->CreateExtractElement(
-        i < 4 ? low : high, ir_builder()->getInt32(i % 4), name());
+        i < (lane_width / 2) ? low : high,
+        ir_builder()->getInt32(i % (lane_width / 2)), name());
     results.push_back(scalar_result);
   }
 
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d938f3a2c4b5bfdd70d5a614b9890b4d7bf050f7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/despecializer.h"
+
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/defuser.h"
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+namespace xla {
+
+Despecializer::Despecializer() : pipeline_("despecializer") {
+  // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<Defuser>();
+  pipeline_.AddPass<ImplicitBroadcastRemover>();
+  pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
+}
+
+StatusOr<bool> Despecializer::Run(HloModule* module) {
+  return pipeline_.Run(module);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
new file mode 100644
index 0000000000000000000000000000000000000000..af48f4ab6e506d295251239fe92db68cfec6dcfa
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Creates an HloPassPipeline containing multiple HloPasses that can
+// despecialize an optimized HloModule. This is useful to run an HloModule
+// optimized for one specfic platform on a different platform (undoing platform
+// specific passes) with matching numerics for comparison.
+//
+// Current despecialization passes are Defuser, ImplicitBroadcastRemover,
+// and BFloat16MixedPrecisionRemoval.
+class Despecializer : public HloPassInterface {
+ public:
+  Despecializer();
+  tensorflow::StringPiece name() const override { return "despecializer"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  HloPassPipeline pipeline_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 78e7aa48accdbb51a8477455f5f9c004828c068f..35db4fd2a22cc1615ade77a801cb28c504db09a6 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -24,19 +24,16 @@ limitations under the License.
 namespace xla {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    const perftools::gputools::Platform* platform,
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        stream_executors)
+    const se::Platform* platform,
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors)
     : DeviceMemoryAllocator(platform),
       stream_executors_(stream_executors.begin(), stream_executors.end()) {}
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
-                                        bool retry_on_failure) {
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
+StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                       GetStreamExecutor(device_ordinal));
-  perftools::gputools::DeviceMemoryBase result =
-      stream_executor->AllocateArray<uint8>(size);
+  se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
   if (size > 0 && result == nullptr) {
     return ResourceExhausted(
         "Failed to allocate request for %s (%lluB) on device ordinal %d",
@@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
 }
 
 tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
-    int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
+    int device_ordinal, se::DeviceMemoryBase* mem) {
   if (!mem->is_null()) {
-    TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
+    TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                         GetStreamExecutor(device_ordinal));
     // We make a local copy of 'mem' so the original is not zeroed out by the
     // Deallocate() call below. This gives us a better chance of
     // catching double-free bugs, since Deallocate silently succeeds for null
     // values.
-    perftools::gputools::DeviceMemoryBase mem_copy(*mem);
+    se::DeviceMemoryBase mem_copy(*mem);
     stream_executor->Deallocate(&mem_copy);
   }
   return tensorflow::Status::OK();
 }
 
-StatusOr<perftools::gputools::StreamExecutor*>
-StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
+StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
+    int device_ordinal) {
   if (device_ordinal < 0) {
     return InvalidArgument("device ordinal value (%d) must be non-negative",
                            device_ordinal);
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 39dfad84c1c1c1c461c24de555ecd919cea47d83..da45c4d45a1c56fd39b1e3e17ff131de59ceeced 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -33,30 +33,42 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
+  explicit DeviceMemoryAllocator(const se::Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
   // 'retry_on_failure': If false, and the first attempt to allocate the memory
-  // fails, the allocation should return immediately without retrying.
-  // An example use case is optional scratch spaces where a failure
-  // has only performance impact.
+  // fails, the allocation should return immediately without retrying.  An
+  // example use case is optional scratch spaces where a failure has only
+  // performance impact.
+  //
   // Allocate() should return a null pointer for a size-0 allocation.
   // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
-  virtual tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
+  virtual StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal,
+                                                  uint64 size,
+                                                  bool retry_on_failure) = 0;
+
+  // Two-arg version of Allocate(), which sets retry-on-failure to true.
+  //
+  // (We don't simply use a default argument on the virtual Allocate function
+  // because default args on virtual functions are disallowed by the Google
+  // style guide.)
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size) {
+    return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
+  }
+
+  virtual tensorflow::Status Deallocate(int device_ordinal,
+                                        se::DeviceMemoryBase* mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  const perftools::gputools::Platform* platform() const { return platform_; }
+  const se::Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
-  const perftools::gputools::Platform* platform_;
+  const se::Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -64,25 +76,27 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      const perftools::gputools::Platform* platform,
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          stream_executors);
+      const se::Platform* platform,
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) override;
-  tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+
+  // Pull in two-arg overload that sets retry_on_failure to true.
+  using DeviceMemoryAllocator::Allocate;
+
+  tensorflow::Status Deallocate(int device_ordinal,
+                                se::DeviceMemoryBase* mem) override;
 
   bool AllowsAsynchronousDeallocation() const override;
 
  private:
-  StatusOr<perftools::gputools::StreamExecutor*> GetStreamExecutor(
-      int device_ordinal);
+  StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
 
   // A vector indexed by device ordinal of StreamExecutors for each device of
   // the allocator's platform type. If an element is nullptr, then the device
   // with the respective device ordinal is not supported by XLA.
-  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
+  std::vector<se::StreamExecutor*> stream_executors_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 56723e765048698baedc50ae7b189d0287ee56b8..0528b076027603796a445d8b0e9cbcebd1b513a7 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -147,6 +147,9 @@ class DfsHloVisitorBase {
   virtual Status HandleLog(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
+  virtual Status HandleClz(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleCos(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index ecda5288ee17a3856ce95f0caa327c3524fd180b..240faebe62f5cee4f61b3c36b5e8f653cfd6db8e 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -35,6 +35,12 @@ class HloInstruction;
 // DfsHloVisitor with default action based on the HloInstruction being visited.
 // Users should not use this class directly, but use the type aliases
 // DfsHloVisitorWithDefault/ConstDfsHloVisitorWithDefault instead.
+//
+// Do *not* add an override to this class if the opcode is covered by
+// HandleElementwiseUnary/Binary. These opcode handlers dispatch to
+// HandleElementwiseUnary/Binary in DfsHloVisitorBase. Adding such a handler
+// here will break passes which rely on the HandleElementwiseUnary/Binary
+// handling these opcodes.
 template <typename HloInstructionPtr>
 class DfsHloVisitorWithDefaultBase
     : public DfsHloVisitorBase<HloInstructionPtr> {
@@ -70,12 +76,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleConcatenate(HloInstructionPtr concatenate) override {
     return DefaultAction(concatenate);
   }
-  Status HandleConvert(HloInstructionPtr convert) override {
-    return DefaultAction(convert);
-  }
-  Status HandleCopy(HloInstructionPtr copy) override {
-    return DefaultAction(copy);
-  }
   Status HandleSelect(HloInstructionPtr select) override {
     return DefaultAction(select);
   }
@@ -91,9 +91,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
-  Status HandleCompare(HloInstructionPtr compare) override {
-    return DefaultAction(compare);
-  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..825e1436f0ec6d49b555e5e3e9c2c7a19fb7b062
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class DfsHloVisitorWithDefaultTest : public HloTestBase {};
+
+TEST_F(DfsHloVisitorWithDefaultTest, DefaultElementwiseTest) {
+  // Verify that HandleElementwiseBinary and HandleElementwiseUnary are called
+  // on the appropriate HLO ops (elementwise binary/unary ops).
+
+  class ElementwiseTestVisitor : public DfsHloVisitorWithDefault {
+   public:
+    Status DefaultAction(HloInstruction* hlo) override {
+      // The HLO should be neither an elementwise unary nor binary op. These
+      // cases are handled in HandleElementwiseBinary/Unary.
+      TF_RET_CHECK(!(hlo->IsElementwise() && hlo->operand_count() == 2))
+          << hlo->ToString();
+      TF_RET_CHECK(!(hlo->IsElementwise() && hlo->operand_count() == 1))
+          << hlo->ToString();
+      return Status::OK();
+    }
+
+    Status HandleElementwiseBinary(HloInstruction* hlo) override {
+      // HLO should be elementwise binary.
+      TF_RET_CHECK(hlo->IsElementwise() && hlo->operand_count() == 2)
+          << hlo->ToString();
+      return Status::OK();
+    }
+    Status HandleElementwiseUnary(HloInstruction* hlo) override {
+      // HLO should be elementwise unary.
+      TF_RET_CHECK(hlo->IsElementwise() && hlo->operand_count() == 1)
+          << hlo->ToString();
+      return Status::OK();
+    }
+  };
+
+  // HLO module contains are arbitrary mix of elementwise and non-elementwise
+  // operations.
+  const string& hlo_string = R"(
+HloModule TestModule
+
+ENTRY TestComputation {
+  arg = f32[] parameter(0)
+  tuple = (f32[]) tuple(arg)
+  gte = f32[] get-tuple-element(tuple), index=0
+  abs = f32[] abs(arg)
+  add = f32[] add(arg, gte)
+  broadcast = f32[42] broadcast(add), dimensions={}
+  slice = f32[0] slice(broadcast), slice={[1:2]}
+  copy = f32[] copy(arg)
+  eq = pred[] equal-to(arg, gte)
+  neg = f32[] negate(arg)
+  ROOT convert = f64[] convert(f32[] arg)
+})";
+  std::unique_ptr<HloModule> module =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())
+          .ConsumeValueOrDie();
+  ElementwiseTestVisitor visitor;
+  TF_EXPECT_OK(module->entry_computation()->Accept(&visitor));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 4468adbadbf823f1420a8b665a26f66cb7d36b43..ae32d33766093cf4e610a0dc05f7d8c88cb37d31 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -52,6 +52,13 @@ using tensorflow::strings::StrCat;
 
 namespace {
 
+int64 GlobalRandomValue() {
+  static auto* mu = new tensorflow::mutex();
+  static std::mt19937_64 rng{42};
+  tensorflow::mutex_lock l(*mu);
+  return rng();
+}
+
 llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
                                       int64 mantissa_bits,
                                       llvm::IRBuilder<>* ir_builder) {
@@ -226,7 +233,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
       if (primitive_util::IsIntegralType(to_type)) {
         return ir_builder_->CreateIntCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_),
-            primitive_util::IsSignedIntegralType(to_type));
+            primitive_util::IsSignedIntegralType(from_type));
       }
       if (primitive_util::IsFloatingPointType(to_type)) {
         if (to_type == BF16) {
@@ -293,6 +300,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
         return operand_value;
       }
     }
+    case HloOpcode::kClz: {
+      auto is_zero_undef = ir_builder_->getFalse();
+      return llvm_ir::EmitCallToIntrinsic(
+          llvm::Intrinsic::ctlz, {operand_value, is_zero_undef},
+          {operand_value->getType()}, ir_builder_);
+    }
     case HloOpcode::kSign: {
       bool is_signed =
           primitive_util::IsSignedIntegralType(op->shape().element_type());
@@ -1003,6 +1016,30 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
                                   ir_builder_);
 }
 
+static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder,
+                                             llvm::Value* lhs, llvm::Value* rhs,
+                                             llvm::Value* shift_result,
+                                             bool saturate_to_sign_bit) {
+  llvm::IntegerType* integer_type =
+      llvm::cast<llvm::IntegerType>(lhs->getType());
+  unsigned integer_bitsize = integer_type->getBitWidth();
+  llvm::ConstantInt* integer_bitsize_constant =
+      llvm::ConstantInt::get(integer_type, integer_bitsize);
+  llvm::ConstantInt* zero = llvm::ConstantInt::get(integer_type, 0);
+  llvm::ConstantInt* minus_one = llvm::ConstantInt::get(integer_type, -1);
+  llvm::Value* saturated_value;
+  if (saturate_to_sign_bit) {
+    saturated_value = ir_builder->CreateSelect(
+        ir_builder->CreateICmpSLT(lhs, zero), minus_one, zero);
+  } else {
+    saturated_value = zero;
+  }
+  llvm::Value* shift_amt_in_range =
+      ir_builder->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk");
+  return ir_builder->CreateSelect(shift_amt_in_range, shift_result,
+                                  saturated_value);
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value,
     bool is_signed) const {
@@ -1050,12 +1087,27 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
       return ir_builder_->CreateAnd(lhs_value, rhs_value);
     case HloOpcode::kOr:
       return ir_builder_->CreateOr(lhs_value, rhs_value);
-    case HloOpcode::kShiftLeft:
-      return ir_builder_->CreateShl(lhs_value, rhs_value);
+
+    // Shifting out bits >= the number of bits in the type being shifted
+    // produces a poison value in LLVM which is basically "deferred undefined
+    // behavior" -- doing something observable with such a value precipitates
+    // UB.  We replace the poison value with a constant to avoid this deferred
+    // UB.
     case HloOpcode::kShiftRightArithmetic:
-      return ir_builder_->CreateAShr(lhs_value, rhs_value);
+      return SaturateShiftIfNecessary(
+          ir_builder_, lhs_value, rhs_value,
+          ir_builder_->CreateAShr(lhs_value, rhs_value),
+          /*saturate_to_sign_bit=*/true);
+    case HloOpcode::kShiftLeft:
+      return SaturateShiftIfNecessary(
+          ir_builder_, lhs_value, rhs_value,
+          ir_builder_->CreateShl(lhs_value, rhs_value),
+          /*saturate_to_sign_bit=*/false);
     case HloOpcode::kShiftRightLogical:
-      return ir_builder_->CreateLShr(lhs_value, rhs_value);
+      return SaturateShiftIfNecessary(
+          ir_builder_, lhs_value, rhs_value,
+          ir_builder_->CreateLShr(lhs_value, rhs_value),
+          /*saturate_to_sign_bit=*/false);
     default:
       return Unimplemented("binary integer op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -1130,7 +1182,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
   llvm::Value* increment = ir_builder_->getInt(
       llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D}));
 
-  auto random_value = [hlo]() {
+  auto random_value_from_hlo = [hlo]() {
     const HloModule* module =
         hlo->IsFused() ? hlo->parent()->FusionInstruction()->parent()->parent()
                        : hlo->parent()->parent();
@@ -1152,10 +1204,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
       /*Ty=*/ir_builder_->getInt64Ty(),
       /*isConstant=*/false,
       /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/ir_builder_->getInt64(random_value()),
+      /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()),
       /*Name=*/"state_ptr0");
+
+  // When the module config seed is 0, the expected result of a prng is a random
+  // value. Instead of using the random_value_from_hlo, we need a global random
+  // value as the graph seed. This is because if we use random_value_from_hlo
+  // here, then for a newly built hlo graph, it always gives the same number.
   uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed()
-                                                     : random_value();
+                                                     : GlobalRandomValue();
   llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable(
       /*M=*/*module_,
       /*Ty=*/ir_builder_->getInt64Ty(),
@@ -1287,6 +1344,525 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
   };
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalSelect(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& index) const {
+  TF_ASSIGN_OR_RETURN(llvm::Value * pred_value,
+                      operand_to_generator.at(hlo->operand(0))(
+                          ElementwiseSourceIndex(index, *hlo, 0)));
+  TF_ASSIGN_OR_RETURN(llvm::Value * on_true_value,
+                      operand_to_generator.at(hlo->operand(1))(
+                          ElementwiseSourceIndex(index, *hlo, 1)));
+  TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value,
+                      operand_to_generator.at(hlo->operand(2))(
+                          ElementwiseSourceIndex(index, *hlo, 2)));
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()),
+      on_true_value, on_false_value);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalClamp(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& index) const {
+  TF_ASSIGN_OR_RETURN(llvm::Value * min_value,
+                      operand_to_generator.at(hlo->operand(0))(
+                          ElementwiseSourceIndex(index, *hlo, 0)));
+  TF_ASSIGN_OR_RETURN(llvm::Value * arg_value,
+                      operand_to_generator.at(hlo->operand(1))(
+                          ElementwiseSourceIndex(index, *hlo, 1)));
+  TF_ASSIGN_OR_RETURN(llvm::Value * max_value,
+                      operand_to_generator.at(hlo->operand(2))(
+                          ElementwiseSourceIndex(index, *hlo, 2)));
+  PrimitiveType prim_type = hlo->shape().element_type();
+  if (primitive_util::IsFloatingPointType(prim_type)) {
+    return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+  } else if (primitive_util::IsIntegralType(prim_type)) {
+    bool is_signed = primitive_util::IsSignedIntegralType(prim_type);
+    return EmitIntegralMin(
+        max_value, EmitIntegralMax(min_value, arg_value, is_signed), is_signed);
+  } else {
+    return Unimplemented("Clamp unimplemented for %s",
+                         PrimitiveType_Name(prim_type).c_str());
+  }
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& target_index) const {
+  const int64 concat_dim = hlo->dimensions(0);
+  auto source_index = target_index;
+
+  llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock();
+
+  // A terminator should be present iff we're emitting code
+  // into the middle (as opposed to the end) of a basic block.
+  CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(),
+           init_block->getTerminator() == nullptr);
+
+  llvm::BasicBlock* exit_block;
+  if (ir_builder_->GetInsertPoint() == init_block->end()) {
+    exit_block = llvm_ir::CreateBasicBlock(
+        /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_);
+  } else {
+    exit_block = init_block->splitBasicBlock(ir_builder_->GetInsertPoint(),
+                                             AsStringRef(IrName(hlo, "merge")));
+    init_block->getTerminator()->eraseFromParent();
+  }
+
+  llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_);
+  llvm::PHINode* output = ir_builder_->CreatePHI(
+      llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
+      hlo->operands().size());
+  auto prior_insert_point = ir_builder_->GetInsertPoint();
+
+  ir_builder_->SetInsertPoint(init_block);
+
+  for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
+       ++operand_idx) {
+    const HloInstruction* operand = hlo->operand(operand_idx);
+    auto true_block = llvm_ir::CreateBasicBlock(
+        exit_block, StrCat("concat_index_from_operand", operand_idx),
+        ir_builder_);
+    auto false_block = llvm_ir::CreateBasicBlock(
+        exit_block, StrCat("concat_index_not_from_operand", operand_idx),
+        ir_builder_);
+    auto concat_dim_size =
+        llvm::ConstantInt::get(source_index[concat_dim]->getType(),
+                               operand->shape().dimensions(concat_dim));
+    ir_builder_->CreateCondBr(
+        ir_builder_->CreateICmpULT(source_index[concat_dim], concat_dim_size),
+        true_block, false_block);
+
+    // Create the terminator of the true block before calling operand
+    // generators, because they require non-degenerate basic blocks.
+    ir_builder_->SetInsertPoint(
+        llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
+    TF_ASSIGN_OR_RETURN(llvm::Value * value,
+                        operand_to_generator.at(operand)(source_index));
+    output->addIncoming(value, ir_builder_->GetInsertBlock());
+
+    // Subtract the size of the concat dimension of the current operand
+    // from the source index.
+    ir_builder_->SetInsertPoint(false_block);
+    source_index[concat_dim] =
+        ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size);
+  }
+
+  ir_builder_->CreateUnreachable();
+  ir_builder_->SetInsertPoint(exit_block, prior_insert_point);
+  return output;
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& index) const {
+  // Emit IR to read dynamic start indices from hlo->operand(1).
+  const HloInstruction* input_hlo = hlo->operand(0);
+  const int64 rank = ShapeUtil::Rank(input_hlo->shape());
+  llvm_ir::IrArray::Index slice_start_index(rank);
+  for (int64 i = 0; i < rank; ++i) {
+    llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
+    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
+                        operand_to_generator.at(hlo->operand(1))(dim_index));
+    start_index_value->setName(
+        AsStringRef(IrName(hlo, StrCat("start_idx", i))));
+    slice_start_index[i] = start_index_value;
+  }
+
+  llvm_ir::IrArray::Index input_index(rank);
+  for (int64 i = 0; i < rank; ++i) {
+    // Emit IR which computes:
+    //   input_index = (start_index + offset_index) % dim_size
+    // Security note: this is the code that keeps the indices in-bounds.
+    llvm::Value* dim_size = llvm::ConstantInt::get(
+        index[i]->getType(), input_hlo->shape().dimensions(i));
+    llvm::Value* start_index = ir_builder_->CreateZExtOrBitCast(
+        slice_start_index[i], index[i]->getType());
+    input_index[i] = ir_builder_->CreateURem(
+        ir_builder_->CreateAdd(start_index, index[i]), dim_size);
+  }
+  return operand_to_generator.at(input_hlo)(input_index);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& index) const {
+  const Shape& operand_shape = hlo->operand(0)->shape();
+  const Shape& indices_shape = hlo->operand(1)->shape();
+  const Shape& output_shape = hlo->shape();
+
+  const GatherDimensionNumbers& dim_numbers = hlo->gather_dimension_numbers();
+
+  const llvm_ir::ElementGenerator& operand_generator =
+      operand_to_generator.at(hlo->operand(0));
+  const llvm_ir::ElementGenerator& indices_generator =
+      operand_to_generator.at(hlo->operand(1));
+
+  // This is the index into `operand` that holds the element we want to
+  // generate.  This index "unsafe" as in the components in here may be
+  // out of bounds.
+  IrArray::Index unsafe_operand_index;
+
+  // First copy in the window indices to unsafe_operand_index.
+  for (int64 i = 0, e = operand_shape.dimensions_size(),
+             unsafe_operand_index_dim = 0;
+       i < e; i++) {
+    if (c_binary_search(dim_numbers.elided_window_dims(), i)) {
+      unsafe_operand_index.push_back(ir_builder_->getInt64(0));
+    } else {
+      unsafe_operand_index.push_back(
+          index[dim_numbers.output_window_dims(unsafe_operand_index_dim++)]);
+    }
+  }
+
+  // This is the index of the index vector in the gather_indices tensor.
+  IrArray::Index gather_index_index;
+  {
+    std::vector<llvm::Value*> gather_index_index_components;
+    for (int64 i = 0, e = output_shape.dimensions_size(); i < e; i++) {
+      if (!c_binary_search(dim_numbers.output_window_dims(), i)) {
+        gather_index_index.push_back(index[i]);
+      }
+    }
+
+    if (gather_index_index.size() != indices_shape.dimensions_size()) {
+      gather_index_index.InsertAt(dim_numbers.index_vector_dim(), nullptr);
+    }
+  }
+
+  auto add_to_unsafe_operand_index = [&](llvm::Value* index_component,
+                                         int64 dim) {
+    llvm::Value* gather_dim_component_extended = ir_builder_->CreateSExtOrTrunc(
+        index_component, ir_builder_->getInt64Ty());
+    unsafe_operand_index[dim_numbers.gather_dims_to_operand_dims(dim)] =
+        ir_builder_->CreateAdd(
+            unsafe_operand_index[dim_numbers.gather_dims_to_operand_dims(dim)],
+            gather_dim_component_extended);
+  };
+
+  if (indices_shape.dimensions_size() == dim_numbers.index_vector_dim()) {
+    TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component,
+                        indices_generator(gather_index_index));
+    add_to_unsafe_operand_index(gather_dim_component, 0);
+  } else {
+    int64 index_vector_size =
+        indices_shape.dimensions(dim_numbers.index_vector_dim());
+    for (int64 i = 0; i < index_vector_size; i++) {
+      gather_index_index[dim_numbers.index_vector_dim()] =
+          ir_builder_->getInt64(i);
+      TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component,
+                          indices_generator(gather_index_index));
+      add_to_unsafe_operand_index(gather_dim_component, i);
+    }
+  }
+
+  IrArray::Index safe_operand_index;
+  for (int64 i = 0, e = unsafe_operand_index.size(); i < e; i++) {
+    safe_operand_index.push_back(ir_builder_->CreateURem(
+        unsafe_operand_index[i],
+        ir_builder_->getInt64(operand_shape.dimensions(i))));
+  }
+
+  return operand_generator(safe_operand_index);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& index) const {
+  const HloInstruction* input_hlo = hlo->operand(0);
+  const HloInstruction* update_hlo = hlo->operand(1);
+  const HloInstruction* start_hlo = hlo->operand(2);
+  // Calculate slice start/end indices.
+  const int64 rank = ShapeUtil::Rank(input_hlo->shape());
+  llvm_ir::IrArray::Index slice_start_index(rank);
+  llvm_ir::IrArray::Index slice_limit_index(rank);
+  // Slice starts at update[index - slice_start_index_adjusted],
+  // where adjusted value = slice_start_index when in bounds, and
+  // adjusted value = slice_start_index - input_dim, when wrapping.
+  llvm_ir::IrArray::Index slice_start_index_adjusted(rank);
+
+  // Slice intersection gathers (ANDs) conditions on all ranks for which
+  // 'input' is set to 'update'
+  llvm::Value* slice_intersection = ir_builder_->getTrue();
+
+  for (int64 i = 0; i < rank; ++i) {
+    // Emit IR to read dynamic start indices from 'start_hlo'.
+    llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
+    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
+                        operand_to_generator.at(start_hlo)(dim_index));
+    start_index_value->setName(
+        AsStringRef(IrName(hlo, StrCat("start_idx", i))));
+    slice_start_index[i] = ir_builder_->CreateZExtOrBitCast(
+        start_index_value, index[i]->getType());
+
+    llvm::Value* input_dim_size = llvm::ConstantInt::get(
+        index[i]->getType(), input_hlo->shape().dimensions(i));
+    llvm::Value* update_dim_size = llvm::ConstantInt::get(
+        index[i]->getType(), update_hlo->shape().dimensions(i));
+
+    // Generate code to handle wrapping semantics:
+    // slice_start_index[i] = slice_start_index[i] % input_dim_size;
+    // slice_limit_index[i] = slice_start_index[i] + update_dim_size.
+    // slice_start_index[i] is updated in place and it will now be in
+    // range. slice_limit_index[i] may be out of range, and it's being
+    // URem-ed below if so.
+    slice_start_index[i] =
+        ir_builder_->CreateURem(slice_start_index[i], input_dim_size);
+    slice_limit_index[i] =
+        ir_builder_->CreateAdd(slice_start_index[i], update_dim_size);
+
+    // Test if slice_limit_index[i] is in bounds
+    llvm::Value* in_bounds =
+        ir_builder_->CreateICmpULE(slice_limit_index[i], input_dim_size);
+    llvm_ir::LlvmIfData if_in_bounds =
+        llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+
+    // Handle true BB (slice_limit_index[i] <= input_dim_size).
+    SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
+    // Check that index[i] >= slice_start_index[i] &&
+    //            index[i] < slice_limit_index[i]
+    llvm::Value* slice_intersection_in_bounds = ir_builder_->CreateAnd(
+        slice_intersection,
+        ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
+        "slice_intersection_in");
+    slice_intersection_in_bounds = ir_builder_->CreateAnd(
+        slice_intersection_in_bounds,
+        ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]),
+        "slice_intersection_in");
+
+    // Handle false BB (slice_limit_index[i] > input_dim_size).
+    SetToFirstInsertPoint(if_in_bounds.false_block, ir_builder_);
+    // Check that index[i] >= slice_start_index[i] ||
+    //            index[i] < slice_limit_index[i]%input_dim_size.
+    llvm::Value* index_wraps = ir_builder_->CreateICmpSLT(
+        index[i],
+        ir_builder_->CreateURem(slice_limit_index[i], input_dim_size));
+    llvm::Value* slice_intersection_or = ir_builder_->CreateOr(
+        ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]), index_wraps,
+        "slice_intersection_out");
+    llvm::Value* slice_intersection_out_of_bounds = ir_builder_->CreateAnd(
+        slice_intersection, slice_intersection_or, "slice_intersection_out");
+    // Create value for slice_start_index_adjusted[i] when out of bounds.
+    // If within out-of-bounds if.
+    llvm_ir::LlvmIfData if_start_needs_adjustment =
+        llvm_ir::EmitIfThenElse(index_wraps, "adjust_start", ir_builder_);
+    SetToFirstInsertPoint(if_start_needs_adjustment.true_block, ir_builder_);
+    llvm::Value* slice_start_index_adjusted_oob =
+        ir_builder_->CreateSub(slice_start_index[i], input_dim_size);
+    SetToFirstInsertPoint(if_start_needs_adjustment.after_block, ir_builder_);
+    llvm::PHINode* slice_start_index_adjusted_phi =
+        ir_builder_->CreatePHI(slice_start_index_adjusted_oob->getType(), 2);
+    slice_start_index_adjusted_phi->addIncoming(
+        slice_start_index_adjusted_oob, if_start_needs_adjustment.true_block);
+    slice_start_index_adjusted_phi->addIncoming(
+        slice_start_index[i], if_start_needs_adjustment.false_block);
+    // End of if within if.
+
+    // After checking in/out of bounds.
+    SetToFirstInsertPoint(if_in_bounds.after_block, ir_builder_);
+    llvm::PHINode* phi_slice_intersection =
+        ir_builder_->CreatePHI(slice_intersection->getType(), 2);
+    phi_slice_intersection->addIncoming(slice_intersection_in_bounds,
+                                        if_in_bounds.true_block);
+    phi_slice_intersection->addIncoming(slice_intersection_out_of_bounds,
+                                        if_start_needs_adjustment.after_block);
+    slice_intersection = phi_slice_intersection;
+
+    llvm::PHINode* phi_index =
+        ir_builder_->CreatePHI(slice_start_index[i]->getType(), 2);
+    phi_index->addIncoming(slice_start_index[i], if_in_bounds.true_block);
+    phi_index->addIncoming(slice_start_index_adjusted_phi,
+                           if_start_needs_adjustment.after_block);
+    slice_start_index_adjusted[i] = phi_index;
+  }
+
+  // Emit:
+  // if (slice_intersection) -> return data from 'update'.
+  // else                    -> return data from 'input'.
+  llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
+      llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
+      "ret_value_addr", ir_builder_);
+  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
+      slice_intersection, "slice_intersection", ir_builder_);
+
+  // Handle true BB (return data from 'update')
+  SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+  // Compute update index for intersection case.
+  llvm_ir::IrArray::Index update_index(rank);
+  for (int64 i = 0; i < rank; ++i) {
+    llvm::Value* update_dim_size = llvm::ConstantInt::get(
+        index[i]->getType(), update_hlo->shape().dimensions(i));
+    // NOTE: Subtraction will be positive due to bounds checking above.
+    update_index[i] = ir_builder_->CreateURem(
+        ir_builder_->CreateSub(index[i], slice_start_index_adjusted[i]),
+        update_dim_size);
+  }
+  TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
+                      operand_to_generator.at(update_hlo)(update_index));
+  ir_builder_->CreateStore(true_value, ret_value_addr);
+
+  // Handle false BB (return data from 'input')
+  SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+  TF_ASSIGN_OR_RETURN(llvm::Value * false_value,
+                      operand_to_generator.at(input_hlo)(index));
+  ir_builder_->CreateStore(false_value, ret_value_addr);
+
+  SetToFirstInsertPoint(if_data.after_block, ir_builder_);
+  return ir_builder_->CreateLoad(ret_value_addr);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& padded_index) const {
+  auto index = padded_index;
+  llvm::Value* in_bounds = ir_builder_->getTrue();
+  for (size_t i = 0; i < index.size(); ++i) {
+    auto index_typed_const = [=](int64 n) {
+      return llvm::ConstantInt::get(index[i]->getType(), n);
+    };
+    const auto& pad_dim = hlo->padding_config().dimensions(i);
+    index[i] = ir_builder_->CreateSub(
+        index[i], index_typed_const(pad_dim.edge_padding_low()));
+    in_bounds = ir_builder_->CreateAnd(
+        in_bounds, ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
+        "in_bounds");
+    in_bounds = ir_builder_->CreateAnd(
+        in_bounds,
+        ir_builder_->CreateICmpEQ(
+            index_typed_const(0),
+            ir_builder_->CreateURem(
+                index[i], index_typed_const(pad_dim.interior_padding() + 1))),
+        "in_bounds");
+    index[i] = ir_builder_->CreateSDiv(
+        index[i], index_typed_const(pad_dim.interior_padding() + 1));
+    in_bounds = ir_builder_->CreateAnd(
+        in_bounds,
+        ir_builder_->CreateICmpSLT(
+            index[i],
+            index_typed_const(hlo->operand(0)->shape().dimensions(i))),
+        "in_bounds");
+  }
+
+  // if (in_bounds) {
+  //   ret_value = operand0[index];  // source
+  // } else {
+  //   ret_value = *operand1;        // padding
+  // }
+  llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
+      llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
+      "pad_result_addr", ir_builder_);
+  llvm_ir::LlvmIfData if_data =
+      llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+  SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+  TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
+                      operand_to_generator.at(hlo->operand(0))(index));
+  ir_builder_->CreateStore(operand_value, ret_value_addr);
+
+  SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+  TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
+                      operand_to_generator.at(hlo->operand(1))({}));
+  ir_builder_->CreateStore(padding_value, ret_value_addr);
+
+  SetToFirstInsertPoint(if_data.after_block, ir_builder_);
+  // Don't create phi(operand_value, padding_value) here, because invoking
+  // operand_to_generator may create new basic blocks, making the parent
+  // of operand_value or padding_value no longer a predecessor of
+  // if_data.after_block.
+  return ir_builder_->CreateLoad(ret_value_addr);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& dot_result_index) const {
+  auto lhs_generator = operand_to_generator.at(hlo->operand(0));
+  auto rhs_generator = operand_to_generator.at(hlo->operand(1));
+  int64 contracted_dim_size = hlo->operand(0)->shape().dimensions(
+      hlo->operand(0)->shape().dimensions_size() - 1);
+  int64 lhs_dims = hlo->operand(0)->shape().dimensions_size();
+  int64 rhs_dims = hlo->operand(1)->shape().dimensions_size();
+
+  std::unique_ptr<llvm_ir::ForLoop> inner_loop = llvm_ir::ForLoop::EmitForLoop(
+      IrName(hlo, "inner"), ir_builder_->getInt64(0),
+      ir_builder_->getInt64(contracted_dim_size), ir_builder_->getInt64(1),
+      ir_builder_);
+
+  SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), ir_builder_);
+  PrimitiveType primitive_type = hlo->shape().element_type();
+  llvm::Type* primitive_type_llvm =
+      llvm_ir::PrimitiveTypeToIrType(primitive_type, module_);
+  llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry(
+      primitive_type_llvm, "dot_acc", ir_builder_);
+  ir_builder_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm),
+                           accumulator_alloca);
+
+  SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_);
+
+  // This is the inner reduction loop for a dot operation that produces
+  // one element in the output.  If the operands to the dot operation have
+  // shapes [A,B,C,T] and [D,T,E], the result has a shape [A,B,C,D,E].
+  // Given an output index [a,b,c,d,e] in the result, we compute:
+  //   sum(lhs[a,b,c,t]*rhs[d,t,e] for t in [0, T))
+
+  IrArray::Index lhs_index, rhs_index;
+
+  for (int64 i = 0; i < lhs_dims - 1; i++) {
+    lhs_index.push_back(dot_result_index[i]);
+  }
+  lhs_index.push_back(inner_loop->GetIndVarValue());
+
+  for (int64 i = 0; i < rhs_dims - 2; i++) {
+    rhs_index.push_back(dot_result_index[lhs_dims - 1 + i]);
+  }
+  rhs_index.push_back(inner_loop->GetIndVarValue());
+  rhs_index.push_back(dot_result_index.back());
+
+  llvm::Value* current_accumulator =
+      ir_builder_->CreateLoad(accumulator_alloca);
+  TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index));
+  TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index));
+  llvm::Value* next_accumulator;
+  if (primitive_util::IsComplexType(primitive_type)) {
+    llvm::Value* product_real = ir_builder_->CreateFSub(
+        ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                EmitExtractReal(rhs_value)),
+        ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                EmitExtractImag(rhs_value)));
+    llvm::Value* product_imag = ir_builder_->CreateFAdd(
+        ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                EmitExtractImag(rhs_value)),
+        ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                EmitExtractReal(rhs_value)));
+    next_accumulator = ir_builder_->CreateInsertValue(
+        current_accumulator,
+        ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator),
+                                product_real),
+        {0});
+    next_accumulator = ir_builder_->CreateInsertValue(
+        next_accumulator,
+        ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator),
+                                product_imag),
+        {1});
+  } else if (primitive_util::IsFloatingPointType(primitive_type)) {
+    next_accumulator = ir_builder_->CreateFAdd(
+        current_accumulator, ir_builder_->CreateFMul(lhs_value, rhs_value));
+  } else {
+    next_accumulator = ir_builder_->CreateAdd(
+        current_accumulator, ir_builder_->CreateMul(lhs_value, rhs_value));
+  }
+  ir_builder_->CreateStore(next_accumulator, accumulator_alloca);
+
+  SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_);
+  return ir_builder_->CreateLoad(accumulator_alloca);
+}
+
 llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     const HloInstruction* hlo,
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator)
@@ -1295,6 +1871,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
@@ -1353,43 +1930,12 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kSelect:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-        TF_ASSIGN_OR_RETURN(llvm::Value * pred_value,
-                            operand_to_generator.at(hlo->operand(0))(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
-        TF_ASSIGN_OR_RETURN(llvm::Value * on_true_value,
-                            operand_to_generator.at(hlo->operand(1))(
-                                ElementwiseSourceIndex(index, *hlo, 1)));
-        TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value,
-                            operand_to_generator.at(hlo->operand(2))(
-                                ElementwiseSourceIndex(index, *hlo, 2)));
-        return ir_builder_->CreateSelect(
-            ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()),
-            on_true_value, on_false_value);
+        return EmitElementalSelect(hlo, operand_to_generator, index);
       };
     case HloOpcode::kClamp:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-        TF_ASSIGN_OR_RETURN(llvm::Value * min_value,
-                            operand_to_generator.at(hlo->operand(0))(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
-        TF_ASSIGN_OR_RETURN(llvm::Value * arg_value,
-                            operand_to_generator.at(hlo->operand(1))(
-                                ElementwiseSourceIndex(index, *hlo, 1)));
-        TF_ASSIGN_OR_RETURN(llvm::Value * max_value,
-                            operand_to_generator.at(hlo->operand(2))(
-                                ElementwiseSourceIndex(index, *hlo, 2)));
-        PrimitiveType prim_type = hlo->shape().element_type();
-        if (primitive_util::IsFloatingPointType(prim_type)) {
-          return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
-        } else if (primitive_util::IsIntegralType(prim_type)) {
-          bool is_signed = primitive_util::IsSignedIntegralType(prim_type);
-          return EmitIntegralMin(
-              max_value, EmitIntegralMax(min_value, arg_value, is_signed),
-              is_signed);
-        } else {
-          return Unimplemented("Clamp unimplemented for %s",
-                               PrimitiveType_Name(prim_type).c_str());
-        }
+        return EmitElementalClamp(hlo, operand_to_generator, index);
       };
     case HloOpcode::kReducePrecision:
       return [this, hlo, &operand_to_generator](
@@ -1402,70 +1948,8 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kConcatenate:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index target_index) -> StatusOr<llvm::Value*> {
-        const int64 concat_dim = hlo->dimensions(0);
-        auto source_index = target_index;
-
-        llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock();
-
-        // A terminator should be present iff we're emitting code
-        // into the middle (as opposed to the end) of a basic block.
-        CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(),
-                 init_block->getTerminator() == nullptr);
-
-        llvm::BasicBlock* exit_block;
-        if (ir_builder_->GetInsertPoint() == init_block->end()) {
-          exit_block = llvm_ir::CreateBasicBlock(
-              /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_);
-        } else {
-          exit_block = init_block->splitBasicBlock(
-              ir_builder_->GetInsertPoint(), AsStringRef(IrName(hlo, "merge")));
-          init_block->getTerminator()->eraseFromParent();
-        }
-
-        llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_);
-        llvm::PHINode* output =
-            ir_builder_->CreatePHI(llvm_ir::PrimitiveTypeToIrType(
-                                       hlo->shape().element_type(), module_),
-                                   hlo->operands().size());
-        auto prior_insert_point = ir_builder_->GetInsertPoint();
-
-        ir_builder_->SetInsertPoint(init_block);
-
-        for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
-             ++operand_idx) {
-          const HloInstruction* operand = hlo->operand(operand_idx);
-          auto true_block = llvm_ir::CreateBasicBlock(
-              exit_block, StrCat("concat_index_from_operand", operand_idx),
-              ir_builder_);
-          auto false_block = llvm_ir::CreateBasicBlock(
-              exit_block, StrCat("concat_index_not_from_operand", operand_idx),
-              ir_builder_);
-          auto concat_dim_size =
-              llvm::ConstantInt::get(source_index[concat_dim]->getType(),
-                                     operand->shape().dimensions(concat_dim));
-          ir_builder_->CreateCondBr(
-              ir_builder_->CreateICmpULT(source_index[concat_dim],
-                                         concat_dim_size),
-              true_block, false_block);
-
-          // Create the terminator of the true block before calling operand
-          // generators, because they require non-degenerate basic blocks.
-          ir_builder_->SetInsertPoint(
-              llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
-          TF_ASSIGN_OR_RETURN(llvm::Value * value,
-                              operand_to_generator.at(operand)(source_index));
-          output->addIncoming(value, ir_builder_->GetInsertBlock());
-
-          // Subtract the size of the concat dimension of the current operand
-          // from the source index.
-          ir_builder_->SetInsertPoint(false_block);
-          source_index[concat_dim] =
-              ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size);
-        }
-
-        ir_builder_->CreateUnreachable();
-        ir_builder_->SetInsertPoint(exit_block, prior_insert_point);
-        return output;
+        return EmitElementalConcatenate(hlo, operand_to_generator,
+                                        target_index);
       };
     case HloOpcode::kReverse:
       return [this, hlo, &operand_to_generator](
@@ -1483,15 +1967,12 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kBroadcast:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& target_index) -> StatusOr<llvm::Value*> {
+        const HloInstruction* operand = hlo->operand(0);
         // The `dimensions` member of the broadcast instruction maps from
         // input dimensions to output dimensions.
-        const HloInstruction* operand = hlo->operand(0);
-        int64 rank = ShapeUtil::Rank(operand->shape());
-        IrArray::Index source_index(rank);
-        for (int64 i = 0; i < rank; ++i) {
-          source_index[i] = target_index[hlo->dimensions(i)];
-        }
-        return operand_to_generator.at(operand)(source_index);
+        return operand_to_generator.at(
+            operand)(target_index.SourceIndexOfBroadcast(
+            hlo->shape(), operand->shape(), hlo->dimensions(), ir_builder_));
       };
     case HloOpcode::kSlice:
       return [this, hlo, &operand_to_generator](
@@ -1504,184 +1985,27 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kDynamicSlice:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-        // Emit IR to read dynamic start indices from hlo->operand(1).
-        const HloInstruction* input_hlo = hlo->operand(0);
-        const int64 rank = ShapeUtil::Rank(input_hlo->shape());
-        llvm_ir::IrArray::Index slice_start_index(rank);
-        for (int64 i = 0; i < rank; ++i) {
-          llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
-          TF_ASSIGN_OR_RETURN(
-              llvm::Value * start_index_value,
-              operand_to_generator.at(hlo->operand(1))(dim_index));
-          start_index_value->setName(
-              AsStringRef(IrName(hlo, StrCat("start_idx", i))));
-          slice_start_index[i] = start_index_value;
-        }
+        return EmitElementalDynamicSlice(hlo, operand_to_generator, index);
+      };
 
-        llvm_ir::IrArray::Index input_index(rank);
-        for (int64 i = 0; i < rank; ++i) {
-          // Emit IR which computes:
-          //   input_index = (start_index + offset_index) % dim_size
-          // Security note: this is the code that keeps the indices in-bounds.
-          llvm::Value* dim_size = llvm::ConstantInt::get(
-              index[i]->getType(), input_hlo->shape().dimensions(i));
-          llvm::Value* start_index = ir_builder_->CreateZExtOrBitCast(
-              slice_start_index[i], index[i]->getType());
-          input_index[i] = ir_builder_->CreateURem(
-              ir_builder_->CreateAdd(start_index, index[i]), dim_size);
-        }
-        return operand_to_generator.at(input_hlo)(input_index);
+    case HloOpcode::kGather:
+      return [this, hlo, &operand_to_generator](
+                 const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+        return EmitElementalGather(hlo, operand_to_generator, index);
       };
     case HloOpcode::kDynamicUpdateSlice:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-        const HloInstruction* input_hlo = hlo->operand(0);
-        const HloInstruction* update_hlo = hlo->operand(1);
-        const HloInstruction* start_hlo = hlo->operand(2);
-        // Calculate slice start/end indices.
-        const int64 rank = ShapeUtil::Rank(input_hlo->shape());
-        llvm_ir::IrArray::Index slice_start_index(rank);
-        llvm_ir::IrArray::Index slice_limit_index(rank);
-        // Slice starts at update[index - slice_start_index_adjusted],
-        // where adjusted value = slice_start_index when in bounds, and
-        // adjusted value = slice_start_index - input_dim, when wrapping.
-        llvm_ir::IrArray::Index slice_start_index_adjusted(rank);
-
-        // Slice intersection gathers (ANDs) conditions on all ranks for which
-        // 'input' is set to 'update'
-        llvm::Value* slice_intersection = ir_builder_->getTrue();
-
-        for (int64 i = 0; i < rank; ++i) {
-          // Emit IR to read dynamic start indices from 'start_hlo'.
-          llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
-          TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                              operand_to_generator.at(start_hlo)(dim_index));
-          start_index_value->setName(
-              AsStringRef(IrName(hlo, StrCat("start_idx", i))));
-          slice_start_index[i] = ir_builder_->CreateZExtOrBitCast(
-              start_index_value, index[i]->getType());
-
-          llvm::Value* input_dim_size = llvm::ConstantInt::get(
-              index[i]->getType(), input_hlo->shape().dimensions(i));
-          llvm::Value* update_dim_size = llvm::ConstantInt::get(
-              index[i]->getType(), update_hlo->shape().dimensions(i));
-
-          // Generate code to handle wrapping semantics:
-          // slice_start_index[i] = slice_start_index[i] % input_dim_size;
-          // slice_limit_index[i] = slice_start_index[i] + update_dim_size.
-          // slice_start_index[i] is updated in place and it will now be in
-          // range. slice_limit_index[i] may be out of range, and it's being
-          // URem-ed below if so.
-          slice_start_index[i] =
-              ir_builder_->CreateURem(slice_start_index[i], input_dim_size);
-          slice_limit_index[i] =
-              ir_builder_->CreateAdd(slice_start_index[i], update_dim_size);
-
-          // Test if slice_limit_index[i] is in bounds
-          llvm::Value* in_bounds =
-              ir_builder_->CreateICmpULE(slice_limit_index[i], input_dim_size);
-          llvm_ir::LlvmIfData if_in_bounds =
-              llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
-
-          // Handle true BB (slice_limit_index[i] <= input_dim_size).
-          SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
-          // Check that index[i] >= slice_start_index[i] &&
-          //            index[i] < slice_limit_index[i]
-          llvm::Value* slice_intersection_in_bounds = ir_builder_->CreateAnd(
-              slice_intersection,
-              ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
-              "slice_intersection_in");
-          slice_intersection_in_bounds = ir_builder_->CreateAnd(
-              slice_intersection_in_bounds,
-              ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]),
-              "slice_intersection_in");
-
-          // Handle false BB (slice_limit_index[i] > input_dim_size).
-          SetToFirstInsertPoint(if_in_bounds.false_block, ir_builder_);
-          // Check that index[i] >= slice_start_index[i] ||
-          //            index[i] < slice_limit_index[i]%input_dim_size.
-          llvm::Value* index_wraps = ir_builder_->CreateICmpSLT(
-              index[i],
-              ir_builder_->CreateURem(slice_limit_index[i], input_dim_size));
-          llvm::Value* slice_intersection_or = ir_builder_->CreateOr(
-              ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
-              index_wraps, "slice_intersection_out");
-          llvm::Value* slice_intersection_out_of_bounds =
-              ir_builder_->CreateAnd(slice_intersection, slice_intersection_or,
-                                     "slice_intersection_out");
-          // Create value for slice_start_index_adjusted[i] when out of bounds.
-          // If within out-of-bounds if.
-          llvm_ir::LlvmIfData if_start_needs_adjustment =
-              llvm_ir::EmitIfThenElse(index_wraps, "adjust_start", ir_builder_);
-          SetToFirstInsertPoint(if_start_needs_adjustment.true_block,
-                                ir_builder_);
-          llvm::Value* slice_start_index_adjusted_oob =
-              ir_builder_->CreateSub(slice_start_index[i], input_dim_size);
-          SetToFirstInsertPoint(if_start_needs_adjustment.after_block,
-                                ir_builder_);
-          llvm::PHINode* slice_start_index_adjusted_phi =
-              ir_builder_->CreatePHI(slice_start_index_adjusted_oob->getType(),
-                                     2);
-          slice_start_index_adjusted_phi->addIncoming(
-              slice_start_index_adjusted_oob,
-              if_start_needs_adjustment.true_block);
-          slice_start_index_adjusted_phi->addIncoming(
-              slice_start_index[i], if_start_needs_adjustment.false_block);
-          // End of if within if.
-
-          // After checking in/out of bounds.
-          SetToFirstInsertPoint(if_in_bounds.after_block, ir_builder_);
-          llvm::PHINode* phi_slice_intersection =
-              ir_builder_->CreatePHI(slice_intersection->getType(), 2);
-          phi_slice_intersection->addIncoming(slice_intersection_in_bounds,
-                                              if_in_bounds.true_block);
-          phi_slice_intersection->addIncoming(
-              slice_intersection_out_of_bounds,
-              if_start_needs_adjustment.after_block);
-          slice_intersection = phi_slice_intersection;
-
-          llvm::PHINode* phi_index =
-              ir_builder_->CreatePHI(slice_start_index[i]->getType(), 2);
-          phi_index->addIncoming(slice_start_index[i], if_in_bounds.true_block);
-          phi_index->addIncoming(slice_start_index_adjusted_phi,
-                                 if_start_needs_adjustment.after_block);
-          slice_start_index_adjusted[i] = phi_index;
-        }
-
-        // Emit:
-        // if (slice_intersection) -> return data from 'update'.
-        // else                    -> return data from 'input'.
-        llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
-            llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
-                                           module_),
-            "ret_value_addr", ir_builder_);
-        llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-            slice_intersection, "slice_intersection", ir_builder_);
-
-        // Handle true BB (return data from 'update')
-        SetToFirstInsertPoint(if_data.true_block, ir_builder_);
-        // Compute update index for intersection case.
-        llvm_ir::IrArray::Index update_index(rank);
-        for (int64 i = 0; i < rank; ++i) {
-          llvm::Value* update_dim_size = llvm::ConstantInt::get(
-              index[i]->getType(), update_hlo->shape().dimensions(i));
-          // NOTE: Subtraction will be positive due to bounds checking above.
-          update_index[i] = ir_builder_->CreateURem(
-              ir_builder_->CreateSub(index[i], slice_start_index_adjusted[i]),
-              update_dim_size);
-        }
-        TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
-                            operand_to_generator.at(update_hlo)(update_index));
-        ir_builder_->CreateStore(true_value, ret_value_addr);
-
-        // Handle false BB (return data from 'input')
-        SetToFirstInsertPoint(if_data.false_block, ir_builder_);
-        TF_ASSIGN_OR_RETURN(llvm::Value * false_value,
-                            operand_to_generator.at(input_hlo)(index));
-        ir_builder_->CreateStore(false_value, ret_value_addr);
-
-        SetToFirstInsertPoint(if_data.after_block, ir_builder_);
-        return ir_builder_->CreateLoad(ret_value_addr);
+        return EmitElementalDynamicUpdateSlice(hlo, operand_to_generator,
+                                               index);
+      };
+    case HloOpcode::kBitcast:
+      CHECK_EQ(ShapeUtil::ElementsIn(hlo->shape()),
+               ShapeUtil::ElementsIn(hlo->operand(0)->shape()));
+      return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
+        const HloInstruction* operand = hlo->operand(0);
+        return operand_to_generator.at(operand)(index.SourceIndexOfBitcast(
+            hlo->shape(), operand->shape(), ir_builder_));
       };
     case HloOpcode::kReshape:
       CHECK_EQ(ShapeUtil::ElementsIn(hlo->shape()),
@@ -1702,155 +2026,16 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kRng:
       return MakeRngElementGenerator(hlo, operand_to_generator);
     case HloOpcode::kPad:
-      return [=, &operand_to_generator](
+      return [this, hlo, &operand_to_generator](
                  const IrArray::Index& padded_index) -> StatusOr<llvm::Value*> {
-        auto index = padded_index;
-        llvm::Value* in_bounds = ir_builder_->getTrue();
-        for (size_t i = 0; i < index.size(); ++i) {
-          auto index_typed_const = [=](int64 n) {
-            return llvm::ConstantInt::get(index[i]->getType(), n);
-          };
-          const auto& pad_dim = hlo->padding_config().dimensions(i);
-          index[i] = ir_builder_->CreateSub(
-              index[i], index_typed_const(pad_dim.edge_padding_low()));
-          in_bounds = ir_builder_->CreateAnd(
-              in_bounds,
-              ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
-              "in_bounds");
-          in_bounds = ir_builder_->CreateAnd(
-              in_bounds,
-              ir_builder_->CreateICmpEQ(
-                  index_typed_const(0),
-                  ir_builder_->CreateURem(
-                      index[i],
-                      index_typed_const(pad_dim.interior_padding() + 1))),
-              "in_bounds");
-          index[i] = ir_builder_->CreateSDiv(
-              index[i], index_typed_const(pad_dim.interior_padding() + 1));
-          in_bounds = ir_builder_->CreateAnd(
-              in_bounds,
-              ir_builder_->CreateICmpSLT(
-                  index[i],
-                  index_typed_const(hlo->operand(0)->shape().dimensions(i))),
-              "in_bounds");
-        }
-
-        // if (in_bounds) {
-        //   ret_value = operand0[index];  // source
-        // } else {
-        //   ret_value = *operand1;        // padding
-        // }
-        llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
-            llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
-                                           module_),
-            "pad_result_addr", ir_builder_);
-        llvm_ir::LlvmIfData if_data =
-            llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
-        SetToFirstInsertPoint(if_data.true_block, ir_builder_);
-        TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
-                            operand_to_generator.at(hlo->operand(0))(index));
-        ir_builder_->CreateStore(operand_value, ret_value_addr);
-
-        SetToFirstInsertPoint(if_data.false_block, ir_builder_);
-        TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
-                            operand_to_generator.at(hlo->operand(1))({}));
-        ir_builder_->CreateStore(padding_value, ret_value_addr);
-
-        SetToFirstInsertPoint(if_data.after_block, ir_builder_);
-        // Don't create phi(operand_value, padding_value) here, because invoking
-        // operand_to_generator may create new basic blocks, making the parent
-        // of operand_value or padding_value no longer a predecessor of
-        // if_data.after_block.
-        return ir_builder_->CreateLoad(ret_value_addr);
+        return EmitElementalPad(hlo, operand_to_generator, padded_index);
       };
 
     case HloOpcode::kDot:
-      return [=, &operand_to_generator](const IrArray::Index& dot_result_index)
+      return [this, hlo,
+              &operand_to_generator](const IrArray::Index& dot_result_index)
                  -> StatusOr<llvm::Value*> {
-        auto lhs_generator = operand_to_generator.at(hlo->operand(0));
-        auto rhs_generator = operand_to_generator.at(hlo->operand(1));
-        int64 contracted_dim_size = hlo->operand(0)->shape().dimensions(
-            hlo->operand(0)->shape().dimensions_size() - 1);
-        int64 lhs_dims = hlo->operand(0)->shape().dimensions_size();
-        int64 rhs_dims = hlo->operand(1)->shape().dimensions_size();
-
-        std::unique_ptr<llvm_ir::ForLoop> inner_loop =
-            llvm_ir::ForLoop::EmitForLoop(
-                IrName(hlo, "inner"), ir_builder_->getInt64(0),
-                ir_builder_->getInt64(contracted_dim_size),
-                ir_builder_->getInt64(1), ir_builder_);
-
-        SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(),
-                              ir_builder_);
-        PrimitiveType primitive_type = hlo->shape().element_type();
-        llvm::Type* primitive_type_llvm =
-            llvm_ir::PrimitiveTypeToIrType(primitive_type, module_);
-        llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry(
-            primitive_type_llvm, "dot_acc", ir_builder_);
-        ir_builder_->CreateStore(
-            llvm::Constant::getNullValue(primitive_type_llvm),
-            accumulator_alloca);
-
-        SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_);
-
-        // This is the inner reduction loop for a dot operation that produces
-        // one element in the output.  If the operands to the dot operation have
-        // shapes [A,B,C,T] and [D,T,E], the result has a shape [A,B,C,D,E].
-        // Given an output index [a,b,c,d,e] in the result, we compute:
-        //   sum(lhs[a,b,c,t]*rhs[d,t,e] for t in [0, T))
-
-        IrArray::Index lhs_index, rhs_index;
-
-        for (int64 i = 0; i < lhs_dims - 1; i++) {
-          lhs_index.push_back(dot_result_index[i]);
-        }
-        lhs_index.push_back(inner_loop->GetIndVarValue());
-
-        for (int64 i = 0; i < rhs_dims - 2; i++) {
-          rhs_index.push_back(dot_result_index[lhs_dims - 1 + i]);
-        }
-        rhs_index.push_back(inner_loop->GetIndVarValue());
-        rhs_index.push_back(dot_result_index.back());
-
-        llvm::Value* current_accumulator =
-            ir_builder_->CreateLoad(accumulator_alloca);
-        TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index));
-        TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index));
-        llvm::Value* next_accumulator;
-        if (primitive_util::IsComplexType(primitive_type)) {
-          llvm::Value* product_real = ir_builder_->CreateFSub(
-              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                      EmitExtractReal(rhs_value)),
-              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                      EmitExtractImag(rhs_value)));
-          llvm::Value* product_imag = ir_builder_->CreateFAdd(
-              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                      EmitExtractImag(rhs_value)),
-              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                      EmitExtractReal(rhs_value)));
-          next_accumulator = ir_builder_->CreateInsertValue(
-              current_accumulator,
-              ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator),
-                                      product_real),
-              {0});
-          next_accumulator = ir_builder_->CreateInsertValue(
-              next_accumulator,
-              ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator),
-                                      product_imag),
-              {1});
-        } else if (primitive_util::IsFloatingPointType(primitive_type)) {
-          next_accumulator = ir_builder_->CreateFAdd(
-              current_accumulator,
-              ir_builder_->CreateFMul(lhs_value, rhs_value));
-        } else {
-          next_accumulator = ir_builder_->CreateAdd(
-              current_accumulator,
-              ir_builder_->CreateMul(lhs_value, rhs_value));
-        }
-        ir_builder_->CreateStore(next_accumulator, accumulator_alloca);
-
-        SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_);
-        return ir_builder_->CreateLoad(accumulator_alloca);
+        return EmitElementalDot(hlo, operand_to_generator, dot_result_index);
       };
     default:
       return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index c516a826d9e382bc738e54635426db639d17108c..26dff0d96f1d0f00fcdf12410a3769d18a186773 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -142,6 +142,46 @@ class ElementalIrEmitter {
     return ir_builder_->getIntN(128, 0);
   }
 
+  StatusOr<llvm::Value*> EmitElementalSelect(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& index) const;
+
+  StatusOr<llvm::Value*> EmitElementalClamp(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& index) const;
+
+  StatusOr<llvm::Value*> EmitElementalConcatenate(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& target_index) const;
+
+  StatusOr<llvm::Value*> EmitElementalDynamicSlice(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& index) const;
+
+  StatusOr<llvm::Value*> EmitElementalGather(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& index) const;
+
+  StatusOr<llvm::Value*> EmitElementalDynamicUpdateSlice(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& index) const;
+
+  StatusOr<llvm::Value*> EmitElementalPad(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& padded_index) const;
+
+  StatusOr<llvm::Value*> EmitElementalDot(
+      const HloInstruction* hlo,
+      const HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& dot_result_index) const;
+
   llvm::IRBuilder<>* const ir_builder_;
 
   llvm::Module* module_;
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 90481c7a88f90edea5399ee44aee2d2c77fc115f..021f09d310b718b51932d0492d1b8f5eb562605c 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 
@@ -28,18 +29,19 @@ using tensorflow::gtl::ArraySlice;
 
 namespace xla {
 
-StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>>
-Executable::ExecuteOnStreams(
+StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
     ArraySlice<const ServiceExecutableRunOptions> run_options,
     ArraySlice<ArraySlice<const ShapedBuffer*>> arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());
 
-  std::vector<std::unique_ptr<ShapedBuffer>> return_values(run_options.size());
+  std::vector<ScopedShapedBuffer> return_values;
+  return_values.reserve(run_options.size());
 
   if (run_options.size() == 1) {
-    TF_ASSIGN_OR_RETURN(return_values[0],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteOnStream(&run_options[0], arguments[0],
                                         /*hlo_execution_profile=*/nullptr));
+    return_values.push_back(std::move(rv));
     return std::move(return_values);
   }
 
@@ -47,8 +49,9 @@ Executable::ExecuteOnStreams(
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
     // executions may never complete if not all executions are running.
-    TF_ASSIGN_OR_RETURN(return_values[i],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteAsyncOnStream(&run_options[i], arguments[i]));
+    return_values.push_back(std::move(rv));
   }
   for (const auto& options : run_options) {
     TF_RET_CHECK(options.stream() != nullptr);
@@ -57,13 +60,13 @@ Executable::ExecuteOnStreams(
   return std::move(return_values);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
+StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
-  perftools::gputools::Stream* stream = run_options->stream();
-  std::unique_ptr<perftools::gputools::Timer> timer;
+  se::Stream* stream = run_options->stream();
+  std::unique_ptr<se::Timer> timer;
   if (profile != nullptr) {
-    timer.reset(new perftools::gputools::Timer(stream->parent()));
+    timer.reset(new se::Timer(stream->parent()));
     stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
   }
 
@@ -77,8 +80,9 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
                                             &hlo_profile_index_map())
           : nullptr;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
+  StatusOr<ScopedShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
+  TF_RETURN_IF_ERROR(return_value.status());
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
@@ -159,4 +163,24 @@ Status Executable::DumpSessionModule() {
                                        result);
 }
 
+/* static */ Status Executable::DumpToDirectory(
+    const string& directory_path, string filename,
+    const HloSnapshot& hlo_session) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+  if (!env->IsDirectory(directory_path).ok()) {
+    // NB! CreateDir does not work reliably with multiple XLA threads -- two
+    // threads can race to observe the absence of the dump directory and
+    // simultaneously try to create it, causing the "losing" thread to get a
+    // "directory already exists" error.
+    TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path));
+  }
+  filename = SanitizeFileName(std::move(filename));
+  string file_path = tensorflow::io::JoinPath(directory_path, filename);
+  string result;
+  TF_RET_CHECK(
+      tensorflow::SerializeToStringDeterministic(hlo_session, &result));
+  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
+                                       result);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 0aee535ee780ef000bc5e9963ff48786b3a61eb2..f7af1ca57492972c58d3ce5ddc083088a32a968f 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -63,14 +63,14 @@ class Executable {
   // enabled.
   //
   // Returns a shaped buffer containing the result of the computation.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  virtual StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) = 0;
 
   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;
 
@@ -78,7 +78,7 @@ class Executable {
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>> ExecuteOnStreams(
+  virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams(
       tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
           run_options,
       tensorflow::gtl::ArraySlice<
@@ -91,14 +91,14 @@ class Executable {
   // has completed.
   virtual Status PopulateExecutionProfile(
       HloExecutionProfile* hlo_execution_profile,
-      perftools::gputools::StreamExecutor* executor) {
+      se::StreamExecutor* executor) {
     return Status::OK();
   }
 
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStreamWrapper(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper(
       const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
@@ -109,14 +109,6 @@ class Executable {
     return execution_profile_;
   }
 
-  // Returns Status::ok() if the two executables are equal to each other.
-  //
-  // An error status is returned otherwise.
-  virtual const Status EqualOrFail(const Executable& executable) {
-    return Unimplemented(
-        "Equality test on this executable is not implemented.");
-  }
-
   const HloProfilePrinterData& hlo_profile_printer_data() const {
     CHECK(hlo_profiling_enabled());
     return *hlo_profile_printer_data_;
@@ -164,6 +156,10 @@ class Executable {
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const SessionModule& session_module);
 
+  // Dump hlo snapshot to directory_path/filename.
+  static Status DumpToDirectory(const string& directory_path, string filename,
+                                const HloSnapshot& hlo_session);
+
  protected:
   mutable tensorflow::mutex mutex_;
 
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index 2b6caa149439a86d6d047605099bc3ff7b295a8e..85409b330b11537158059dcce8c2a96c98d38f30 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -93,7 +93,7 @@ Status FlattenNode(const CallGraphNode& node) {
       auto current = worklist.back();
       worklist.pop_back();
       for (auto* instruction : current->instructions()) {
-        if (GetInstructionCallContext(instruction) !=
+        if (GetInstructionCallContext(instruction->opcode()) !=
             CallContext::kSequential) {
           continue;
         }
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d3e4b1fcdf6675955714cab262a8b2ca8ff4297
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -0,0 +1,388 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gather_expander.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+using tensorflow::gtl::ArraySlice;
+
+static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
+    HloInstruction* gather_indices, int64 index_vector_dim) {
+  const Shape& gather_indices_shape = gather_indices->shape();
+
+  if (gather_indices_shape.dimensions_size() == index_vector_dim) {
+    return gather_indices;
+  }
+
+  if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) {
+    return gather_indices;
+  }
+
+  std::vector<int64> permutation;
+  permutation.reserve(gather_indices_shape.dimensions_size());
+  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != index_vector_dim) {
+      permutation.push_back(i);
+    }
+  }
+  permutation.push_back(index_vector_dim);
+  return MakeTransposeHlo(gather_indices, permutation);
+}
+
+// Canonicalizes the gather_indices tensors so that we only have deal with some
+// specific cases in the while loop that does the heavy lifting.
+//
+// See the "High Level Algorithm" section for a broader picture.
+static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
+    HloInstruction* gather_indices, int64 index_vector_dim) {
+  // Transpose the non-index-vector dimensions to the front.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * transposed_gather_indices,
+      TransposeIndexVectorDimToLast(gather_indices, index_vector_dim));
+  bool indices_are_scalar =
+      index_vector_dim == gather_indices->shape().dimensions_size();
+
+  // The number of dimensions in gather_indices that are index dimensions.
+  const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1;
+
+  // If there is only one index (i.e. gather_indices has rank 1 and this gather
+  // is really just a dynamic slice) add a leading degenerate dimension for
+  // uniformity.  Otherwise create a "collapsed" leading dimension that subsumes
+  // all of the non-index-vector dimensions.
+  const Shape& shape = transposed_gather_indices->shape();
+  if (shape.dimensions_size() == index_dims_in_gather_indices) {
+    return PrependDegenerateDims(transposed_gather_indices, 1);
+  } else {
+    // Collapse all but the dimensions (0 or 1) in gather_indices containing the
+    // index vectors.
+    return CollapseFirstNDims(
+        transposed_gather_indices,
+        shape.dimensions_size() - index_dims_in_gather_indices);
+  }
+}
+
+// Expands out or contracts away the gather dimensions in the accumulator
+// produced by the while loop.
+static StatusOr<HloInstruction*> AdjustGatherDimsInAccumulator(
+    const Shape& gather_indices_shape, HloInstruction* accumulator,
+    int64 index_vector_dim) {
+  std::vector<int64> output_gather_dim_bounds;
+  output_gather_dim_bounds.reserve(gather_indices_shape.dimensions_size());
+  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != index_vector_dim) {
+      output_gather_dim_bounds.push_back(gather_indices_shape.dimensions(i));
+    }
+  }
+
+  if (output_gather_dim_bounds.empty()) {
+    // If output_gather_dim_bounds is empty we must be lowering a (effectively)
+    // dynamic-slice.  In that case, there is a leading degenerate gather
+    // dimension that we added to make this special case play well with the
+    // general while loop which we need to remove now.
+    return ElideDegenerateDims(accumulator, {0});
+  }
+
+  return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds);
+}
+
+// Expand an index vector from the gather_indices tensor into a vector that can
+// be used to dynamic-slice out of the gather operand.
+static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
+    HloInstruction* index_vector, const GatherDimensionNumbers& dim_numbers,
+    int64 operand_rank) {
+  HloComputation* computation = index_vector->parent();
+  const Shape& index_shape = index_vector->shape();
+  HloInstruction* zero =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateFromDimensions(index_shape.element_type(), {1})));
+
+  // We extract out individual components from the smaller index and concatenate
+  // them (interspersing zeros as needed) into the larger index.
+  std::vector<HloInstruction*> expanded_index_components;
+
+  for (int i = 0; i < operand_rank; i++) {
+    int64 index_vector_dim_index =
+        FindIndex(dim_numbers.gather_dims_to_operand_dims(), i);
+    if (index_vector_dim_index !=
+        dim_numbers.gather_dims_to_operand_dims_size()) {
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * component_to_concat,
+          MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index},
+                       /*limit_indices=*/{index_vector_dim_index + 1},
+                       /*strides=*/{1}));
+      expanded_index_components.push_back(component_to_concat);
+    } else {
+      expanded_index_components.push_back(zero);
+    }
+  }
+
+  return MakeConcatHlo(expanded_index_components, /*dimension=*/0);
+}
+
+// This generates the body of the while that implements the main data movement
+// behavior of gather using dynamic-slice and dynamic-update-slice.
+static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
+    const HloInstruction& gather, HloInstruction* induction_var,
+    const std::vector<HloInstruction*>& incoming_loop_state) {
+  const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers();
+  CHECK_EQ(incoming_loop_state.size(), 3);
+  HloInstruction* const operand = incoming_loop_state[0];
+  HloInstruction* const gather_indices = incoming_loop_state[1];
+  HloInstruction* const output_accumulator = incoming_loop_state[2];
+
+  bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1;
+  CHECK_EQ(has_scalar_indices,
+           dim_numbers.index_vector_dim() ==
+               gather.operand(1)->shape().dimensions_size());
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * induction_var_as_vector,
+      MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
+                       /*result_shape_bounds=*/{1}));
+
+  HloInstruction* index_vector;
+
+  if (has_scalar_indices) {
+    // In this case gather_indices has rank 1 and induction_var_as_vector (of
+    // shape {1}) is an index into this rank 1 tensor.
+    TF_ASSIGN_OR_RETURN(
+        index_vector,
+        MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1}));
+  } else {
+    // In this case gather_indices has rank 2 and induction_var_as_vector (of
+    // shape {1}) is an index into just the first dimension of this rank 2
+    // tensor.
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_into_gather_indices,
+        PadVectorWithZeros(induction_var_as_vector,
+                           /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
+
+    int64 index_vector_size = gather_indices->shape().dimensions(1);
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_vector_2d,
+        MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
+                            {1, index_vector_size}));
+
+    TF_ASSIGN_OR_RETURN(index_vector,
+                        ElideDegenerateDims(index_vector_2d, {0}));
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_start,
+      ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers,
+                                        operand->shape().dimensions_size()));
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice,
+                      MakeDynamicSliceHlo(operand, gathered_slice_start,
+                                          gather.gather_window_bounds()));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_with_dims_elided,
+      ElideDegenerateDims(gathered_slice,
+                          AsInt64Slice(dim_numbers.elided_window_dims())));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_for_update,
+      PrependDegenerateDims(gathered_slice_with_dims_elided, 1));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * index_vector_into_accumulator,
+      PadVectorWithZeros(
+          induction_var_as_vector, /*zeros_to_prepend=*/0,
+          /*zeros_to_append=*/
+          gathered_slice_with_dims_elided->shape().dimensions_size()));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * updated_accumulator,
+      MakeDynamicUpdateSliceHlo(output_accumulator, gathered_slice_for_update,
+                                index_vector_into_accumulator));
+
+  // New loop state -- only the accumulator has changed.  The
+  // WhileUtil::MakeCountedLoop functions takes care of the induction variable
+  // and the while loop exit condition.
+  return StatusOr<std::vector<HloInstruction*>>{
+      {operand, gather_indices, updated_accumulator}};
+}
+
+static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
+    HloComputation* computation, PrimitiveType element_type,
+    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count,
+    const GatherDimensionNumbers& dim_numbers) {
+  std::vector<int64> accumulator_state_shape_dims;
+  accumulator_state_shape_dims.reserve(1 + window_bounds.size());
+  accumulator_state_shape_dims.push_back(gather_loop_trip_count);
+  for (int64 i = 0; i < window_bounds.size(); i++) {
+    if (!c_binary_search(dim_numbers.elided_window_dims(), i)) {
+      accumulator_state_shape_dims.push_back(window_bounds[i]);
+    }
+  }
+  return BroadcastZeros(computation, element_type,
+                        accumulator_state_shape_dims);
+}
+
+// `accumulator` is almost the tensor the gather operation would have produced,
+// except that it has the dimensions in the wrong order -- the gather dimensions
+// are the major dimensions and the window dimensions are the minor dimensions.
+// Fix this up with a transpose.
+static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
+    HloInstruction* accumulator, ArraySlice<int64> output_window_dims,
+    int64 output_rank) {
+  std::vector<int64> permutation;
+  permutation.reserve(output_rank);
+
+  int64 gather_idx_counter = 0;
+  int64 window_idx_counter = output_rank - output_window_dims.size();
+  for (int64 i = 0; i < output_rank; i++) {
+    bool is_window_dim = c_binary_search(output_window_dims, i);
+    if (is_window_dim) {
+      permutation.push_back(window_idx_counter++);
+    } else {
+      permutation.push_back(gather_idx_counter++);
+    }
+  }
+
+  return MakeTransposeHlo(accumulator, permutation);
+}
+
+// High Level Algorithm
+//
+// We follow the following steps in sequence:
+//
+//  1. We canonicalize the gather_indices tensor such that it has rank
+//     2 (i.e. is a matrix) where each row is an index vector into the
+//     operand.
+//  2. We iterate over the set of indices in the canonicalized
+//     gather_indices tensor using a while loop, accumulating slices
+//     of the operand tensor into an accumulator using
+//     DynamicUpdateSlice.
+//  3. The accumulator result from the while loop from (2) is then
+//     reshaped to split out all the individual gather dimensions and
+//     then transposed to give the final result.
+//
+// As an example, if we started with the following operation:
+//
+//   HloModule TensorFlowGatherMultipleBatchDims
+//
+//   ENTRY main {
+//     operand = s32[3,3] parameter(0)
+//     indices = s32[2,2] parameter(1)
+//     ROOT gather = s32[2,3,2] gather(operand, indices),
+//         output_window_dims={1},
+//         elided_window_dims={1},
+//         gather_dims_to_operand_dims={1},
+//         index_vector_dim=2,
+//         window_bounds={3, 1}
+//   }
+//
+// We'd first reshape indices to s32[4,1], where each row is an index
+// into operand.  We'd then run a loop to slice out 4 tensors of shape
+// [3,1] out of operand into an accumulator of shape [4,3,1].  We then
+// reshape this result to [2,2,3] and finally transpose it to [2,3,2].
+
+StatusOr<HloInstruction*> GatherExpander::ExpandGather(
+    HloInstruction* gather_instr) {
+  CHECK(!ShapeUtil::HasZeroElements(gather_instr->shape()));
+
+  HloComputation* computation = gather_instr->parent();
+  HloInstruction* operand = gather_instr->mutable_operand(0);
+  HloInstruction* gather_indices = gather_instr->mutable_operand(1);
+  const Shape& gather_indices_shape = gather_indices->shape();
+  const Shape& output_shape = gather_instr->shape();
+  int64 output_rank = output_shape.dimensions_size();
+
+  const GatherDimensionNumbers& dim_numbers =
+      gather_instr->gather_dimension_numbers();
+
+  int64 gather_loop_trip_count = 1;
+  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != dim_numbers.index_vector_dim()) {
+      gather_loop_trip_count *= gather_indices_shape.dimensions(i);
+    }
+  }
+
+  if (!IsInt32(gather_loop_trip_count)) {
+    return Unimplemented(
+        "Gather operations with more than 2147483647 gather indices are not "
+        "supported. This error occurred for %s.",
+        gather_instr->ToString().c_str());
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * canonical_gather_indices,
+                      CanonicalizeGatherIndices(
+                          gather_indices, dim_numbers.index_vector_dim()));
+
+  CHECK_EQ(gather_loop_trip_count,
+           canonical_gather_indices->shape().dimensions(0));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * accumulator_init,
+      CreateGatherLoopAccumulatorInitValue(
+          computation, output_shape.element_type(),
+          gather_instr->gather_window_bounds(), gather_loop_trip_count,
+          gather_instr->gather_dimension_numbers()));
+
+  StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
+      WhileUtil::MakeCountedLoop(
+          computation, gather_loop_trip_count,
+          {operand, canonical_gather_indices, accumulator_init},
+          [&](HloInstruction* indvar,
+              const std::vector<HloInstruction*>& loop_state) {
+            return GatherLoopBody(*gather_instr, indvar, loop_state);
+          });
+
+  TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> gather_loop_result,
+                      gather_loop_result_or_error);
+
+  HloInstruction* accumulator_result = gather_loop_result.back();
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * accumulator_with_output_gather_dims_decanonicalized,
+      AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result,
+                                    dim_numbers.index_vector_dim()));
+
+  return PermuteGatherAndWindowDims(
+      accumulator_with_output_gather_dims_decanonicalized,
+      AsInt64Slice(dim_numbers.output_window_dims()), output_rank);
+}
+
+StatusOr<bool> GatherExpander::Run(HloModule* module) {
+  auto is_nontrivial_gather = [](HloInstruction* inst) {
+    return inst->opcode() == HloOpcode::kGather &&
+           // Avoid expanding gather ops that produce zero sized tensors,
+           // instead punt these to ZeroSizedHloElimination.
+           !ShapeUtil::HasZeroElements(inst->shape());
+  };
+
+  std::vector<HloInstruction*> gather_instrs;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    c_copy_if(computation->instructions(), std::back_inserter(gather_instrs),
+              is_nontrivial_gather);
+  }
+
+  for (HloInstruction* inst : gather_instrs) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandGather(inst));
+    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
+  }
+
+  return !gather_instrs.empty();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1fc8574da99fff223c7dbb570b4533f76905b9a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gather_expander.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// This pass rewrites gather operations into (roughly) while loops of dynamic
+// slices.  This lets backends that don't support gather directly to
+// nevertheless have a minimum level of support.
+class GatherExpander : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override { return "gather_expander"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<HloInstruction*> ExpandGather(HloInstruction* gather_instr);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c72ca066502eb549bf8638cdf0b7827b06f92d7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gather_expander.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+TEST(GatherExpanderTest, ErrorStatusOnTooManyIndices) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2147483647,5] parameter(1)
+  ROOT gather = s32[2147483647,3,5] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_text));
+
+  Status status = GatherExpander{}.Run(module.get()).status();
+  EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+
+  ASSERT_THAT(
+      status.error_message(),
+      ::testing::HasSubstr("Gather operations with more than 2147483647 gather "
+                           "indices are not supported."));
+}
+
+TEST(GatherExpanderTest, AvoidDegenerateDims) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  HloInstruction* while_instr = nullptr;
+  for (auto* instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      ASSERT_EQ(while_instr, nullptr)
+          << "Expected exactly one while instruction in the entry computation "
+             "after gather expansion";
+      while_instr = instr;
+    }
+  }
+
+  ASSERT_NE(while_instr, nullptr)
+      << "Expected exactly one while instruction in the entry computation "
+         "after gather expansion";
+
+  // We want to avoid create while loop with shapes that have degenerate
+  // dimensions for TF gather.  In this case we expect the loop state to be of
+  // the shape (sNN[], s32[3,3]{1,0}, s32[2]{0}, s32[2,3]{1,0}).  The leading
+  // sNN is an implementation detail from WhileUtil::MakeCountedLoop so we don't
+  // check it here (though in theory the form of the while loop state is itself
+  // an implementation detail from WhileUtil::MakeCountedLoop).
+
+  const Shape& while_shape = while_instr->shape();
+  ASSERT_TRUE(ShapeUtil::IsTuple(while_shape));
+  ASSERT_EQ(ShapeUtil::TupleElementCount(while_shape), 4);
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {3, 3}),
+      ShapeUtil::GetTupleElementShape(while_shape, 1)));
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {2}),
+      ShapeUtil::GetTupleElementShape(while_shape, 2)));
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {2, 3}),
+      ShapeUtil::GetTupleElementShape(while_shape, 3)));
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index 78dc0ad4fcd167c93f19d0c2b18ea72d666897ef..ddb687314ee8221ba9282f230db498b3a5d23499 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -32,29 +32,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id,
                                                size_t pointer_size)
-    : platform_id_(platform_id), pointer_size_(pointer_size) {
-  // We currently only support kHostPlatformId for CPU, kCudaPlatformId for
-  // GPU and kInterpreterPlatformId for Interpreter. Before supporting other
-  // platforms, we need to test this transfer manager on them.
-  CHECK(platform_id_ == se::host::kHostPlatformId ||
-        platform_id_ == se::interpreter::kInterpreterPlatformId ||
-        platform_id_ == se::cuda::kCudaPlatformId);
-}
+    : platform_id_(platform_id), pointer_size_(pointer_size) {}
 
 se::Platform::Id GenericTransferManager::PlatformId() const {
   return platform_id_;
 }
 
 Status GenericTransferManager::WriteSingleTupleIndexTable(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
-    const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
+    const Shape& shape, se::DeviceMemoryBase* region) {
   TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape));
 
   std::vector<const void*> element_pointers;
@@ -151,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed(
 }
 
 Status GenericTransferManager::TransferBufferToInfeed(
-    perftools::gputools::StreamExecutor* executor, int64 size,
-    const void* source) {
+    se::StreamExecutor* executor, int64 size, const void* source) {
   return Unimplemented("Generic transfer to Infeed");
 }
 
 Status GenericTransferManager::TransferLiteralFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+    se::StreamExecutor* executor, const Shape& literal_shape,
     Literal* literal) {
   return Unimplemented(
       "Outfeed is not supported on this platform (b/30467474)");
 }
 
 Status GenericTransferManager::ResetDevices(
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*>
     /*executors*/) {
   return Unimplemented(
       "Device reset is not yet supported on this platform (b/30481585)");
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 63a7c820cf4e5fbbdf870086a4fb5316ac50d10b..0579099de40ba3e43f69e4e6474b56691064c692 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -36,46 +36,41 @@ namespace xla {
 // infeed.
 class GenericTransferManager : public TransferManager {
  public:
-  GenericTransferManager(perftools::gputools::Platform::Id platform_id,
-                         size_t pointer_size);
+  GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size);
   ~GenericTransferManager() override {}
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const ShapedBuffer& device_buffer) override;
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;
 
-  Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToDevice(se::StreamExecutor* executor,
                                  const Literal& literal,
                                  const ShapedBuffer& device_buffer) override;
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) override;
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
 
   Status ResetDevices(
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executors) override;
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
 
   int64 GetByteSizeRequirement(const Shape& shape) const override;
 
  protected:
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
 
   Status WriteSingleTupleIndexTable(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape,
-      perftools::gputools::DeviceMemoryBase* region) override;
+      se::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
+      const Shape& shape, se::DeviceMemoryBase* region) override;
 
  private:
   // The platform this transfer manager targets.
-  const perftools::gputools::Platform::Id platform_id_;
+  const se::Platform::Id platform_id_;
 
   // The size in bytes of pointers on this platform.
   const size_t pointer_size_;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 9da4fb97fa27a238fead74985cb481a9be1f4a65..f1707442fe3354d5183d905468810f3871146ff5 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -241,6 +241,7 @@ cc_library(
         "gpu_executable.cc",
         "infeed_thunk.cc",
         "kernel_thunk.cc",
+        "memset_thunk.cc",
         "sequential_thunk.cc",
         "thunk_schedule.cc",
         "tuple_thunk.cc",
@@ -257,6 +258,7 @@ cc_library(
         "gpu_executable.h",
         "infeed_thunk.h",
         "kernel_thunk.h",
+        "memset_thunk.h",
         "sequential_thunk.h",
         "thunk.h",
         "thunk_schedule.h",
@@ -273,6 +275,7 @@ cc_library(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -293,6 +296,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
+        "//tensorflow/stream_executor",
     ],
 )
 
@@ -397,6 +401,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -437,8 +442,10 @@ tf_cc_test(
         ":fusion_merger",
         ":instruction_fusion",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -452,6 +459,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_creation_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:shape_inference",
     ],
@@ -510,9 +518,11 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
+        "//tensorflow/compiler/xla/service:gather_expander",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
@@ -690,17 +700,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 2029c303d47e9a62135b003c3bd9be6f8b3438d4..837f05244f7a8c71483cc30eeac9e1c219e6bbd2 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index ea7f0eb3745f2e0e0bfd88c3dca79d6ad25884ed..c2fc35be4ca4bc6df85ee21fb6b564bfd6de3ec0 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -41,7 +41,7 @@ class BufferAllocations {
     // user-specified result buffers) to the given buffer index. The builder
     // will skip allocating buffers for registered buffer indices.
     void RegisterBuffer(BufferAllocation::Index index,
-                        perftools::gputools::DeviceMemoryBase address);
+                        se::DeviceMemoryBase address);
 
     // Builds a BufferAllocations object from the given buffer assignment.
     // `memory_allocator` is what this function uses to allocate device memory.
@@ -52,8 +52,7 @@ class BufferAllocations {
         DeviceMemoryAllocator* memory_allocator);
 
    private:
-    std::map<BufferAllocation::Index, perftools::gputools::DeviceMemoryBase>
-        registered_buffers_;
+    std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
   };
 
   BufferAllocations(const BufferAllocations&) = delete;
@@ -65,22 +64,20 @@ class BufferAllocations {
   // Returns the device address of buffer `buffer_index`. `buffer_index` must be
   // a valid index, i.e., in [0, buffer_count). This function returns null if
   // `buffer_index` is not assigned to a buffer address.
-  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+  se::DeviceMemoryBase GetDeviceAddress(
       BufferAllocation::Index buffer_index) const;
 
   // Same as above, but also adjusts the returned address for the offset and
   // size contained in the given slice.
-  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+  se::DeviceMemoryBase GetDeviceAddress(
       const BufferAllocation::Slice& buffer_slice) const;
 
-  perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
-    return temp_buffer_base_;
-  }
+  se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; }
 
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   tensorflow::Status TearDown(
-      const std::set<perftools::gputools::DeviceMemoryBase>& live_addresses,
+      const std::set<se::DeviceMemoryBase>& live_addresses,
       const BufferAssignment& buffer_assignment);
 
  private:
@@ -92,15 +89,15 @@ class BufferAllocations {
 
   // Sets the device address of buffer `buffer_index`.
   void SetBuffer(BufferAllocation::Index buffer_index,
-                 perftools::gputools::DeviceMemoryBase buffer);
+                 se::DeviceMemoryBase buffer);
 
   // An array of device pointers that stores the address of each buffer
   // indexed by Index. Each element can point to a temporary buffer, an
   // input buffer, or nullptr if no buffer is needed for that Index.
-  std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
+  std::vector<se::DeviceMemoryBase> buffers_;
 
   // The base address of the memory block that contains all temporary buffers.
-  perftools::gputools::DeviceMemoryBase temp_buffer_base_;
+  se::DeviceMemoryBase temp_buffer_base_;
 
   int device_ordinal_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
index 790ca535b11ee47724ef6227de40726d940d6153..dce8de2e301ecfaa4674b8be48b8c02bfabf3f4b 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
 }
 
 Status ConditionalThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   // Copy the predicate value from device.
   bool predicate;
-  perftools::gputools::DeviceMemoryBase predicate_address =
+  se::DeviceMemoryBase predicate_address =
       buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
   stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
index 7725c46a3b4b51af34a4dd977885353ff32c21f6..e40872688fdad24d24db5f7cebb3206c77652dce 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk {
 
   Status Initialize(const GpuExecutable& executable) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice predicate_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 461747b699b542ae0c8735aea34cc9e57c1fb387..64d3b84b8c73d82800270aebcebf7f7a8fec5fe4 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 900d9cb6243088b56a1825fb3ab8c06cf8d74726..6d845025b1aef2b0a5f147401b6db0598ba94d6d 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk {
 
   // Does the convolution for the thunk on "stream".
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   class ScratchAllocator;
 
-  Status Convolve(
-      const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
-      perftools::gputools::DeviceMemory<float> input_data,
-      const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
-      perftools::gputools::DeviceMemory<float> filter_data,
-      const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
-      perftools::gputools::DeviceMemory<float> output_data,
-      const perftools::gputools::dnn::ConvolutionDescriptor&
-          convolution_descriptor,
-      const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
-      perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
-      perftools::gputools::dnn::ProfileResult* profile_result);
+  Status Convolve(const se::dnn::BatchDescriptor& input_descriptor,
+                  se::DeviceMemory<float> input_data,
+                  const se::dnn::FilterDescriptor& filter_descriptor,
+                  se::DeviceMemory<float> filter_data,
+                  const se::dnn::BatchDescriptor& output_descriptor,
+                  se::DeviceMemory<float> output_data,
+                  const se::dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const se::dnn::AlgorithmConfig& algorithm_config,
+                  se::Stream* stream, ScratchAllocator* scratch_allocator,
+                  se::dnn::ProfileResult* profile_result);
 
   const CudnnConvKind convolution_kind_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
index f4498663b1c039b3175376baf8f27c4ecec678ec..bf912fbd14de5874062a79db28186ab233575233 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
@@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
       mem_size_(mem_size) {}
 
 tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase destination_data =
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
   stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
   return tensorflow::Status::OK();
@@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
       mem_size_(mem_size) {}
 
 tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase destination_data =
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
-  perftools::gputools::DeviceMemoryBase source_data =
+  se::DeviceMemoryBase source_data =
       buffer_allocations.GetDeviceAddress(source_buffer_);
   stream->ThenMemcpy(&destination_data, source_data, mem_size_);
   return tensorflow::Status::OK();
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
index e2783fd255239d31edc89701ea208f33ebb8d3fb..2e7eb5f3445bc9294fa455ef31fb816cdba4726c 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
@@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk {
   HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const void* source_address_;
@@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk {
   DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice source_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
index 58d9c8caff31e878487fbef01afce566e6187fd9..68099fd63847ef9993f9bc7ac0e28b2939631b35 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
@@ -28,7 +28,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-namespace se = ::perftools::gputools;
 namespace dnn = se::dnn;
 
 static std::pair<dnn::BatchDescriptor /*input_desc*/,
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
index c5fbb6d8a3912d380172d496d8d35e80dc9f5c71..874f85a863092ee05ae5df1f92d732318c5a0554 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
@@ -60,7 +60,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk {
       const CudnnBatchNormForwardInferenceThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
@@ -90,7 +90,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk {
       const CudnnBatchNormForwardTrainingThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
@@ -123,7 +123,7 @@ class CudnnBatchNormBackwardThunk : public Thunk {
       delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1792893ae401bf16d2dd9e861607e8f3821a505e..c4c56c56928810d043085f284cda351391195c3b 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -24,8 +24,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace se = perftools::gputools;
-
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
@@ -94,11 +92,17 @@ se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
-//
-// TODO(jlebar): We shouldn't need this check for cuDNNv7.
-bool ShouldIncludeWinogradNonfusedAlgo(
-    const Shape& input_shape, const Shape& output_shape,
-    const ConvolutionDimensionNumbers& dnums) {
+bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
+                                       const Shape& output_shape,
+                                       const ConvolutionDimensionNumbers& dnums,
+                                       se::StreamExecutor* stream_exec) {
+  // Skip this check for cudnn7 and newer.
+  auto version =
+      stream_exec->AsDnn()->GetVersion();
+  if (version.ok() && version.ValueOrDie().major_version() >= 7) {
+    return true;
+  }
+
   int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
   int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
   int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
@@ -118,20 +122,20 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          bool with_winograd_nonfused,
-                                         se::StreamExecutor* stream_exec_) {
+                                         se::StreamExecutor* stream_exec) {
   std::vector<AlgorithmDesc> algorithms;
   switch (kind) {
     case CudnnConvKind::kBackwardFilter:
-      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kBackwardInput:
-      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kForward:
-      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
-                                                &algorithms));
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
+                                               &algorithms));
       break;
   }
 
@@ -209,8 +213,8 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     return nullopt;
   }
 
-  const bool use_winograd_nonfused =
-      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
+      input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index 516210ec2e500cf03774d27408300ac3346e7b4f..bc5d1ce94afd2075a006899f0f6bcf64352e5e99 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -33,9 +33,8 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
   // If the `allocator` parameter is not null, we will use it to allocate temp
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
-  CudnnConvolutionAlgorithmPicker(
-      perftools::gputools::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* allocator)
+  CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
+                                  DeviceMemoryAllocator* allocator)
       : stream_exec_(stream_exec), allocator_(allocator) {}
 
   tensorflow::StringPiece name() const override {
@@ -52,7 +51,7 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
       const Shape& output_shape, const Window& window,
       const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // never null
+  se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
index e4ae839e1dd4cb3a744a3f6a3329cabdaeb3f38d..10b4c3de89989c52cfea5273c3d5b0beef76abd2 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -22,8 +22,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace se = ::perftools::gputools;
-
 using se::DeviceMemory;
 using se::DeviceMemoryBase;
 using se::Stream;
@@ -215,14 +213,12 @@ string CudnnConvKindToString(CudnnConvKind kind) {
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::DeviceMemoryBase scratch_buf, const Window& window,
     const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result) {
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result) {
   ScratchBufAllocator scratch_allocator(scratch_buf);
   return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
                              input_buf, filter_buf, output_buf,
@@ -232,14 +228,12 @@ Status RunCudnnConvolution(
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::ScratchAllocator* scratch_allocator,
-    const Window& window, const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result) {
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result) {
   PrimitiveType output_primitive_type = output_shape.element_type();
   CHECK(output_primitive_type == F32 || output_primitive_type == F16)
       << ShapeUtil::HumanString(output_shape);
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
index 3dbfa2730da359d3c7937140508017c4a7b02d6c..944e4ac686d45408b08ff1faa321510c1c8920ba 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
@@ -72,25 +72,21 @@ string CudnnConvKindToString(CudnnConvKind kind);
 // that size, if you like.
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::DeviceMemoryBase scratch_buf, const Window& window,
     const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result = nullptr);
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::ScratchAllocator* scratch_allocator,
-    const Window& window, const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result = nullptr);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
index 66931bdc8b1030b2b2e7731ce6327c1e908d4ee6..cc747addbd152eb82b0b2ef92b8653fc861f97be 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
index 52fb8c376d7acea0f15aaa865c23fa2382717338..24b1dca99865fe21d0ca3af91e0d169f7b74a78a 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -34,24 +34,24 @@ namespace gpu {
 // released on destruction.
 //
 // Not thread-safe in that AllocateBytes, destructor are not locked.
-class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
+class FftScratchAllocator : public se::ScratchAllocator {
  public:
   FftScratchAllocator(int device_ordinal,
                       DeviceMemoryAllocator* memory_allocator);
 
   ~FftScratchAllocator() override;
 
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override;
 
   int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
 
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override;
 
  private:
   const int device_ordinal_;
   DeviceMemoryAllocator* memory_allocator_;
-  std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
+  std::vector<se::DeviceMemoryBase> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
@@ -74,16 +74,15 @@ class FftThunk : public Thunk {
 
   // Does the FFT for the thunk on "stream".
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
-  const perftools::gputools::fft::Type fft_type_;
+  const se::fft::Type fft_type_;
   const std::vector<int64> fft_length_;
 
   float scale_factor_;
 
-  std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
+  std::unique_ptr<se::fft::Plan> fft_plan_;
 
   const BufferAllocation::Slice input_buffer_;
   const BufferAllocation::Slice output_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
index 283d21ca222a236a69e4bab1b6504665d4d1cdd3..6e6966df3987eef29b2122c3ef8f11b7cd0bfe14 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
@@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
 }
 
 tensorflow::Status ForThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (int64 i = 0; i < loop_limit_; ++i) {
     // Invoke loop body thunk sequence.
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h
index 832494d17e9c4e1d9e92e18ef331df1cf3689024..c78d1c50686297aea8235af928aba562697f49bc 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h
@@ -38,8 +38,7 @@ class ForThunk : public Thunk {
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const int64 loop_limit_;
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index c137fbc97e29e24edb3603c611a5c8f093bc62a6..3cd30b754c3242f00c704de1afab2282ed827b41 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -45,6 +45,7 @@ void MaybeResolveTupleElements(HloInstruction* instruction,
 // Returns the bytes read by fusion parameter 'param', by returning the byte
 // size of 'param' shape (or the cumulative byte sizes of all leaf tuple
 // elements if 'param' is tuple-shaped).
+//
 // In the special case where all users of 'param' (or all users of a leaf
 // tuple element if 'param' is tuple-shaped) are Slice instructions, the size
 // of each slice instruction is accumulated instead, to give a more accurate
@@ -63,11 +64,10 @@ double CalculateBytesReadByFusionParameter(HloInstruction* param) {
   // Slice for a more accurate estimate of bytes read.
   double bytes = 0.0;
   for (auto& instruction : instructions) {
-    if (std::all_of(instruction->users().begin(), instruction->users().end(),
-                    [](const HloInstruction* instruction) {
-                      return instruction->opcode() == HloOpcode::kSlice ||
-                             instruction->opcode() == HloOpcode::kDynamicSlice;
-                    })) {
+    if (c_all_of(instruction->users(), [](const HloInstruction* instruction) {
+          return instruction->opcode() == HloOpcode::kSlice ||
+                 instruction->opcode() == HloOpcode::kDynamicSlice;
+        })) {
       // All users are slice: accumulate bytes of all user slice instructions.
       for (auto& user : instruction->users()) {
         bytes += ShapeUtil::ByteSizeOf(user->shape());
@@ -199,6 +199,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   ++total_visited_;
   // Skip 'fusion' instruction if there are no users into which we can merge.
   if (fusion->users().empty()) {
+    VLOG(3) << "Not merging " << fusion->name() << ": Has no users.";
     ++num_fail_no_users_;
     return Status::OK();
   }
@@ -208,24 +209,27 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   // Input fusion instructions need to be rooted at a particular HLO (e.g.
   // kReduce), so they shouldn't be further fused either.
   if (fusion->fusion_kind() != HloInstruction::FusionKind::kLoop) {
+    VLOG(3) << "Not merging " << fusion->name() << ": Is not loop fusion.";
     ++num_fail_not_loop_fusion_;
     return Status::OK();
   }
 
   // Skip multiple output fusion. It's not yet supported.
   if (fusion->IsMultiOutputFusion()) {
+    VLOG(3) << "Not merging " << fusion->name() << ": Is multi-output fusion.";
     ++num_fail_not_loop_fusion_;
     return Status::OK();
   }
   // Skip 'fusion' instruction if we cannot merge into all of its users.
   // Merging into all users enables the removal of 'fusion' from the
   // computation.
-  if (!std::all_of(fusion->users().begin(), fusion->users().end(),
-                   [](const HloInstruction* instruction) {
-                     return instruction->opcode() == HloOpcode::kFusion &&
-                            instruction->fusion_kind() ==
-                                HloInstruction::FusionKind::kLoop;
-                   })) {
+  if (!c_all_of(fusion->users(), [](const HloInstruction* user) {
+        return user->opcode() == HloOpcode::kFusion &&
+               (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
+                user->fusion_kind() == HloInstruction::FusionKind::kInput);
+      })) {
+    VLOG(3) << "Not merging " << fusion->name()
+            << ": Some of its users are not loop/input fusion kernels.";
     ++num_fail_merge_all_users_;
     return Status::OK();
   }
@@ -233,18 +237,17 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   // Skip 'fusion' instruction if any of its fused instructions are expensive.
   // This is done to avoid the duplication of expensive instructions, which
   // would occur if 'fusion' were merged into multiple users.
+  //
   // If 'fusion' has just one user, then an earlier fusion pass chose not to
   // fuse this producer/comsumer pair (likely because of expensive instruction
   // re-use by the consumer), and so we honor that choice here as well.
-  if (!std::all_of(fusion->fused_instructions().begin(),
-                   fusion->fused_instructions().end(),
-                   [](const HloInstruction* instruction) {
-                     if (instruction->opcode() != HloOpcode::kParameter &&
-                         GpuInstructionFusion::IsExpensive(*instruction)) {
-                       return false;
-                     }
-                     return true;
-                   })) {
+  if (c_any_of(fusion->fused_instructions(),
+               [](const HloInstruction* instruction) {
+                 return instruction->opcode() != HloOpcode::kParameter &&
+                        GpuInstructionFusion::IsExpensive(*instruction);
+               })) {
+    VLOG(3) << "Not merging " << fusion->name()
+            << ": Contains one or more expensive instructions.";
     ++num_fail_expensive_fused_instruction_;
     return Status::OK();
   }
@@ -253,6 +256,8 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   // exceeds the threshold value.
   if (CalculateFlopsToBytesRatio(fusion) >
       FusionMerger::GetThresholdFlopsToBytesRatio()) {
+    VLOG(3) << "Not merging " << fusion->name()
+            << ": flops-to-bytes ratio is not favorable.";
     ++num_fail_flops_to_byte_ratio_;
     return Status::OK();
   }
@@ -265,6 +270,9 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   const double merged_to_current_bytes_ratio =
       merged_bytes_transferred / std::max(1.0, current_bytes_transferred);
   if (merged_to_current_bytes_ratio > 1.10) {
+    VLOG(3) << "Not merging " << fusion->name()
+            << ": merged-to-current-bytes-ratio of "
+            << merged_to_current_bytes_ratio << " is not favorable.";
     ++num_fail_net_bytes_transferred_ratio_;
     return Status::OK();
   }
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
index deef5966b80d1b7f16e9982eed9ac5c7131e9d73..2217776c7d5a5f92c520d56222988f80401be9e4 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
@@ -16,257 +16,21 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
 
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
-class FusionMergerTest : public HloTestBase {
- protected:
-  FusionMergerTest() : module_(CreateNewModule()) {}
-
-  // Builds the following computation:
-  //
-  //                 Param
-  //               /   |   \
-  //              /    |    \
-  //  OnesVec  GTE(0) GTE(1) GTE(2)
-  //       \   /         \   /
-  //        Add           Add  OnesVec
-  //         \           /  \  /
-  //           \      Add   Mul  OnesVec
-  //            \      |     |  /
-  //             \    Mul    Add
-  //              \    |    /
-  //               \   |   /
-  //                 Tuple
-  //
-  HloComputation* BuildComputation0() {
-    auto builder = HloComputation::Builder(TestName() + ".Computation0");
-    // Create param instruction to access computation state.
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, tuple_shape3_, "param"));
-
-    // Create GetTupleElement instructions for each tuple element.
-    auto gte0 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, param, 0));
-    auto gte1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, param, 1));
-    auto gte2 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, param, 2));
-
-    // Create const vector of ones to be used in element-wise computations.
-    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
-
-    // Create simple fusable computation for tuple element 0 (wont get merged).
-    auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, one_vec, gte0));
-
-    // Create fusable computation which is dependent on second and third tuple
-    // elements (will initially be fused on its own).
-    auto add1 = builder.AddInstruction(
-        HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, gte1, gte2));
-
-    // Create two sub-computations, both of which are users of 'add1'.
-
-    // First sub-computation: out1 = Mul(Add(add1, one_vec), one_vec)
-    auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, add1, one_vec));
-    auto out1 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, add2, one_vec));
-
-    // Second sub-computation: out2 = Add(Mul(add1, one_vec), one_vec)
-    auto mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, add1, one_vec));
-    auto out2 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, mul0, one_vec));
-
-    // Create output Tuple.
-    builder.AddInstruction(HloInstruction::CreateTuple({out0, out1, out2}));
-    return module_->AddEntryComputation(builder.Build());
-  }
-
-  // Builds the following computation:
-  //
-  //                 Param
-  //               /      \
-  //            GTE(0)   GTE(1)
-  //            | | \   /
-  //            | |  Mul
-  //             \  \ |
-  //              \  Mul
-  //               \ |
-  //      OnesVec   Mul  OnesVec
-  //             \  /  \ /
-  //     OnesVec  Add  Mul  OnesVec
-  //            \  |    |  /
-  //             Mul    Add
-  //               \    /
-  //                \  /
-  //                Tuple
-  //
-  HloComputation* BuildComputation1() {
-    auto builder = HloComputation::Builder(TestName() + ".Computation1");
-    Shape tuple_shape2_ = ShapeUtil::MakeTupleShape({data_shape_, data_shape_});
-    // Create param instruction to access computation state.
-    auto state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, tuple_shape2_, "state"));
-
-    // Create shared sub-computation (will initially be fused on its own).
-    auto gte0 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, state, 0));
-    auto gte1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, state, 2));
-    // Calculate the flops we need to generate for this shared computation
-    // to exceed the threshold flops_to_bytes_ratio.
-    // Note that bytes transferred is multiplied by 3 because there are two
-    // operands and one output of size 'data_shape_'.
-    const int64 flops_needed = FusionMerger::GetThresholdFlopsToBytesRatio() *
-                               ShapeUtil::ByteSizeOf(data_shape_) * 3;
-    const int64 vec_elements = ShapeUtil::ElementsIn(data_shape_);
-    const int64 iters = (flops_needed + vec_elements - 1) / vec_elements;
-
-    auto mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, gte0, gte1));
-    for (int i = 0; i < iters; ++i) {
-      mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
-          data_shape_, HloOpcode::kMultiply, gte0, mul0));
-    }
-
-    // Create two sub-computations, both of which are users of 'mul0'.
-    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
-
-    // First sub-computation: out0 = Mul(Add(mul0, one_vec), one_vec)
-    auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, mul0, one_vec));
-    auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, add0, one_vec));
-
-    // Second sub-computation: out1 = Add(Mul(mul0, one_vec), one_vec)
-    auto mul1 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, mul0, one_vec));
-    auto out1 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, mul1, one_vec));
-
-    // Create output Tuple.
-    builder.AddInstruction(HloInstruction::CreateTuple({out0, out1}));
-    return module_->AddEntryComputation(builder.Build());
-  }
-
-  // Builds the following computation:
-  //
-  //                Param
-  //             /   |   |  \
-  //            /    |   |   \
-  //           /     |   |    \
-  //      GTE(0) GTE(1) GTE(2) GTE(3)
-  //           \   /    /     /
-  //            Add    /     /
-  //              \   /     /
-  //               Add     /
-  //                 \    /
-  //                  \  /
-  //         OnesVec   Add  OnesVec
-  //                \  /  \ /
-  //        OnesVec  Add  Mul OnesVec
-  //              \  |    |  /
-  //               Mul    Add
-  //                 \    /
-  //                  \  /
-  //                  Tuple
-  //
-  HloComputation* BuildComputation2(bool add_extra_input) {
-    auto builder = HloComputation::Builder(TestName() + ".Computation2");
-    Shape state_shape = add_extra_input ? tuple_shape4_ : tuple_shape3_;
-    // Create param instruction to access computation state.
-    auto state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, state_shape, "state"));
-
-    // Create GetTupleElement instructions for each tuple element.
-    auto gte0 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, state, 0));
-    auto gte1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, state, 1));
-    auto gte2 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, state, 2));
-
-    // Create shared fusable computation that reduces its operands.
-    auto reduce0 = builder.AddInstruction(
-        HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, gte0, gte1));
-    auto reduce_out = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, reduce0, gte2));
-    if (add_extra_input) {
-      auto gte3 = builder.AddInstruction(
-          HloInstruction::CreateGetTupleElement(data_shape_, state, 3));
-      reduce_out = builder.AddInstruction(HloInstruction::CreateBinary(
-          data_shape_, HloOpcode::kAdd, reduce_out, gte3));
-    }
+namespace op = xla::testing::opcode_matchers;
 
-    // Create two fusable sub-computations which are dependent on shared
-    // computation 'reduce_out'.
-    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
-
-    // First sub-computation: out0 = Mul(Add(reduce_out, one_vec), one_vec)
-    auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, reduce_out, one_vec));
-    auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, add2, one_vec));
-
-    // Second sub-computation: out1 = Add(Mul(reduce_out, one_vec), one_vec)
-    auto mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kMultiply, reduce_out, one_vec));
-    auto out1 = builder.AddInstruction(HloInstruction::CreateBinary(
-        data_shape_, HloOpcode::kAdd, mul0, one_vec));
-
-    // Create output Tuple.
-    builder.AddInstruction(HloInstruction::CreateTuple({out0, out1}));
-    return module_->AddEntryComputation(builder.Build());
-  }
-
-  Shape data_shape_ = ShapeUtil::MakeShape(F32, {4});
-  Shape tuple_shape2_ = ShapeUtil::MakeTupleShape({data_shape_, data_shape_});
-  Shape tuple_shape3_ =
-      ShapeUtil::MakeTupleShape({data_shape_, data_shape_, data_shape_});
-  Shape tuple_shape4_ = ShapeUtil::MakeTupleShape(
-      {data_shape_, data_shape_, data_shape_, data_shape_});
-
-  std::unique_ptr<HloModule> module_;
-};
+class FusionMergerTest : public HloTestBase {};
 
 // Tests that we can merge a fusion instruction that is below threshold.
 //
-// Original computation:
-//
-//                 Param
-//                /  |  \
-//               /   |   \
-//  OnesVec  GTE(0) GTE(1) GTE(2)
-//       \   /         \   /
-//        Add           Add  OnesVec
-//         \           /  \  /
-//           \      Add   Mul  OnesVec
-//            \      |     |  /
-//             \    Mul    Add
-//              \    |    /
-//               \   |   /
-//                 Tuple
-//
-// Computation after fusion passes:
-//
-//                  Param
-//                 /     \
-//            Fusion3    Fusion2
-//               |       /     \
-//                \ Fusion0  Fusion1
-//                 \    |   /
-//                  \   |  /
-//                   Tuple
-//
 // Computation after fusion merger pass (Fusion2 is merged into Fusion0 and
 // Fusion1):
 //                   Param
@@ -276,19 +40,50 @@ class FusionMergerTest : public HloTestBase {
 //                   Tuple
 //
 TEST_F(FusionMergerTest, MergeSharedFusionInstruction) {
-  auto computation = BuildComputation0();
-  // Run standard fusion passes.
-  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false)
-                  .Run(module_.get())
-                  .ValueOrDie());
-  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module_.get())
-                   .ValueOrDie());
-  // Run fusion merger pass, which should merge the shared fusion instruction
-  // into its two users.
-  EXPECT_TRUE(FusionMerger().Run(module_.get()).ValueOrDie());
-
-  auto* root = computation->root_instruction();
+  auto module = tools::Parse(R"(
+HloModule MergeSharedFusionInstruction
+
+comp.3 {
+  constant.param_0 = f32[4]{0} parameter(0)
+  param.param_1.2 = (f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(1)
+  get-tuple-element.6 = f32[4]{0} get-tuple-element(param.param_1.2), index=0
+  ROOT add.7 = f32[4]{0} add(constant.param_0, get-tuple-element.6)
+}
+
+comp.2 {
+  param.param_1.1 = (f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(0)
+  get-tuple-element.4 = f32[4]{0} get-tuple-element(param.param_1.1), index=1
+  get-tuple-element.5 = f32[4]{0} get-tuple-element(param.param_1.1), index=2
+  ROOT add.6 = f32[4]{0} add(get-tuple-element.4, get-tuple-element.5)
+}
+
+comp.1 {
+  add.1.param_1.1 = f32[4]{0} parameter(1)
+  constant.param_1.3 = f32[4]{0} parameter(0)
+  add.5 = f32[4]{0} add(add.1.param_1.1, constant.param_1.3)
+  ROOT multiply.3 = f32[4]{0} multiply(add.5, constant.param_1.3)
+}
+
+comp {
+  add.1.param_1 = f32[4]{0} parameter(1)
+  constant.param_1.1 = f32[4]{0} parameter(0)
+  multiply.2 = f32[4]{0} multiply(add.1.param_1, constant.param_1.1)
+  ROOT add.4 = f32[4]{0} add(multiply.2, constant.param_1.1)
+}
+
+ENTRY MergeSharedFusionInstruction.Computation0 {
+  constant = f32[4]{0} constant({1, 1, 1, 1})
+  param = (f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(0)
+  fusion.3 = f32[4]{0} fusion(constant, param), kind=kLoop, calls=comp.3
+  fusion.4 = f32[4]{0} fusion(param), kind=kLoop, calls=comp.2
+  fusion.5 = f32[4]{0} fusion(constant, fusion.4), kind=kLoop, calls=comp.1
+  fusion.6 = f32[4]{0} fusion(constant, fusion.4), kind=kLoop, calls=comp
+  ROOT tuple = (f32[4]{0}, f32[4]{0}, f32[4]{0}) tuple(fusion.3, fusion.5, fusion.6)
+})")
+                    .ValueOrDie();
+  EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie());
+
+  auto* root = module->entry_computation()->root_instruction();
   EXPECT_EQ(HloOpcode::kTuple, root->opcode());
   // Check operand 0 (not merged). Should have 4 instructions.
   auto* operand0 = root->operand(0);
@@ -307,156 +102,188 @@ TEST_F(FusionMergerTest, MergeSharedFusionInstruction) {
 // Tests that we do not merge a fusion instruction that above flops to bytes
 // threshold.
 //
-// Original computation:
-//
-//                 Param
-//                /     \
-//            GTE(0)   GTE(1)
-//            | | \   /
-//            | |  Mul
-//             \  \ |
-//              \  Mul
-//               \ |
-//      OnesVec   Mul  OnesVec
-//             \  /  \ /
-//     OnesVec  Add  Mul  OnesVec
-//            \  |    |  /
-//             Mul    Add
-//               \    /
-//                \  /
-//                Tuple
-//
-// Computation after fusion passes and fusion merger pass (Fusion2 is not
-// merged because it exceeds the threshold flops to bytes ratio).
-//
-//                 Param
-//                   |
-//                Fusion2
-//                /     \
-//           Fusion0  Fusion1
-//                \    /
-//                 Tuple
-//
+// Fusion2 is not merged because it exceeds the threshold flops-to-bytes ratio.
 TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) {
-  BuildComputation1();
-  // Run standard fusion passes.
-  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false)
-                  .Run(module_.get())
-                  .ValueOrDie());
-  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module_.get())
-                   .ValueOrDie());
+  auto module = tools::Parse(R"(
+HloModule FlopsToBytesRatioThresholdExceeded
+
+comp.2 {
+  state.param_1.1 = (f32[4]{0}, f32[4]{0}) parameter(0)
+  get-tuple-element.3 = f32[4]{0} get-tuple-element(state.param_1.1), index=0
+  get-tuple-element.4 = f32[4]{0} get-tuple-element(state.param_1.1), index=2
+  multiply.29 = f32[4]{0} multiply(get-tuple-element.3, get-tuple-element.4)
+  multiply.30 = f32[4]{0} multiply(get-tuple-element.3, multiply.29)
+  multiply.31 = f32[4]{0} multiply(get-tuple-element.3, multiply.30)
+  multiply.32 = f32[4]{0} multiply(get-tuple-element.3, multiply.31)
+  multiply.33 = f32[4]{0} multiply(get-tuple-element.3, multiply.32)
+  multiply.34 = f32[4]{0} multiply(get-tuple-element.3, multiply.33)
+  multiply.35 = f32[4]{0} multiply(get-tuple-element.3, multiply.34)
+  multiply.36 = f32[4]{0} multiply(get-tuple-element.3, multiply.35)
+  multiply.37 = f32[4]{0} multiply(get-tuple-element.3, multiply.36)
+  multiply.38 = f32[4]{0} multiply(get-tuple-element.3, multiply.37)
+  multiply.39 = f32[4]{0} multiply(get-tuple-element.3, multiply.38)
+  multiply.40 = f32[4]{0} multiply(get-tuple-element.3, multiply.39)
+  ROOT multiply.41 = f32[4]{0} multiply(get-tuple-element.3, multiply.40)
+}
+
+comp.1 {
+  multiply.12.param_1.1 = f32[4]{0} parameter(1)
+  constant.param_1.3 = f32[4]{0} parameter(0)
+  add.3 = f32[4]{0} add(multiply.12.param_1.1, constant.param_1.3)
+  ROOT multiply.16 = f32[4]{0} multiply(add.3, constant.param_1.3)
+}
+
+comp {
+  multiply.12.param_1 = f32[4]{0} parameter(1)
+  constant.param_1.1 = f32[4]{0} parameter(0)
+  multiply.15 = f32[4]{0} multiply(multiply.12.param_1, constant.param_1.1)
+  ROOT add.2 = f32[4]{0} add(multiply.15, constant.param_1.1)
+}
+
+ENTRY FlopsToBytesRatioThresholdExceeded.Computation1 {
+  constant = f32[4]{0} constant({1, 1, 1, 1})
+  state = (f32[4]{0}, f32[4]{0}) parameter(0)
+  fusion.2 = f32[4]{0} fusion(state), kind=kLoop, calls=comp.2
+  fusion.3 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp.1
+  fusion.4 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp
+  ROOT tuple = (f32[4]{0}, f32[4]{0}) tuple(fusion.3, fusion.4)
+})")
+                    .ValueOrDie();
   // Run fusion merger pass, which should detect that the flops/bytes of the
   // shared fusion instruction exceeds the threshold ratio, and therefore
   // cannot be merged with other fusion instructions.
-  EXPECT_FALSE(FusionMerger().Run(module_.get()).ValueOrDie());
+  EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie());
 }
 
 // Tests that threshold for bytes transferred if merged is exceeded.
 //
-// Original computation:
-//
-//                Param
-//             /   |   |  \
-//            /    |   |   \
-//           /     |   |    \
-//      GTE(0) GTE(1) GTE(2) GTE(3)
-//           \   /    /     /
-//            Add    /     /
-//              \   /     /
-//               Add     /
-//                 \    /
-//                  \  /
-//         OnesVec   Add  OnesVec
-//                \  /  \ /
-//        OnesVec  Add  Mul OnesVec
-//              \  |    |  /
-//               Mul    Add
-//                 \    /
-//                  \  /
-//                  Tuple
-//
-// Computation after fusion passes and fusion merger pass. Fusion2 is not
-// merged because it exceeds the threshold bytes transferred. This is because
-// the bytes read by Fusion2 (when replicated if the instruction is merged
-// into Fusion0 and Fusion1) would exceed the bytes transferred threshold.
-//
-//                 Param
-//                   |
-//                Fusion2
-//                /     \
-//           Fusion0  Fusion1
-//                \    /
-//                 Tuple
-//
+// Fusion2 is not merged because it exceeds the threshold bytes transferred.
+// This is because the bytes read by Fusion2 (when replicated if the instruction
+// is merged into Fusion0 and Fusion1) would exceed the bytes transferred
+// threshold.
 TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) {
-  BuildComputation2(/*add_extra_input=*/true);
-  // Run standard fusion passes.
-  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false)
-                  .Run(module_.get())
-                  .ValueOrDie());
-  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module_.get())
-                   .ValueOrDie());
+  auto module = tools::Parse(R"(
+HloModule BytesTransferredThresholdExeceeded
+
+comp.2 {
+  state.param_1.1 = (f32[4]{0}, f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(0)
+  get-tuple-element.7 = f32[4]{0} get-tuple-element(state.param_1.1), index=0
+  get-tuple-element.8 = f32[4]{0} get-tuple-element(state.param_1.1), index=1
+  add.9 = f32[4]{0} add(get-tuple-element.7, get-tuple-element.8)
+  get-tuple-element.9 = f32[4]{0} get-tuple-element(state.param_1.1), index=2
+  add.10 = f32[4]{0} add(add.9, get-tuple-element.9)
+  get-tuple-element.10 = f32[4]{0} get-tuple-element(state.param_1.1), index=3
+  ROOT add.11 = f32[4]{0} add(add.10, get-tuple-element.10)
+}
+
+comp.1 {
+  add.2.param_1.1 = f32[4]{0} parameter(1)
+  constant.param_1.3 = f32[4]{0} parameter(0)
+  add.6 = f32[4]{0} add(add.2.param_1.1, constant.param_1.3)
+  ROOT multiply.3 = f32[4]{0} multiply(add.6, constant.param_1.3)
+}
+
+comp {
+  add.2.param_1 = f32[4]{0} parameter(1)
+  constant.param_1.1 = f32[4]{0} parameter(0)
+  multiply.2 = f32[4]{0} multiply(add.2.param_1, constant.param_1.1)
+  ROOT add.5 = f32[4]{0} add(multiply.2, constant.param_1.1)
+}
+
+ENTRY BytesTransferredThresholdExeceeded.Computation2 {
+  constant = f32[4]{0} constant({1, 1, 1, 1})
+  state = (f32[4]{0}, f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(0)
+  fusion.2 = f32[4]{0} fusion(state), kind=kLoop, calls=comp.2
+  fusion.3 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp.1
+  fusion.4 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp
+  ROOT tuple = (f32[4]{0}, f32[4]{0}) tuple(fusion.3, fusion.4)
+})")
+                    .ValueOrDie();
   // Run fusion merger pass, which should detect that the net bytes transferred
   // (if merged) would increase.
-  EXPECT_FALSE(FusionMerger().Run(module_.get()).ValueOrDie());
+  EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie());
 }
 
 // Tests that threshold for bytes transferred if merged is not exceeded.
 //
-// Original computation:
-//
-//               Param
-//             /   |  \
-//            /    |   \
-//           /     |    \
-//      GTE(0) GTE(1) GTE(2)
-//           \   /    /
-//            Add    /
-//              \   /
-//     OnesVec   Add  OnesVec
-//            \  /  \ /
-//   OnesVec  Add   Mul OnesVec
-//              \  /   \  /
-//               Mul    Add
-//                 \    /
-//                  \  /
-//                  Tuple
-//
-// Computation after fusion passes:
-//
-//                 Param
-//                   |
-//                Fusion2
-//                /     \
-//           Fusion0  Fusion1
-//                \    /
-//                 Tuple
-//
-// Computation after fusion merger pass (Fusion2 is merged into Fusion0 and
-// Fusion1, because bytes read from Param by Fusion2 is reduced for this test
-// which makes the merge operation into its operand below the bytes
-// transferred threshold.
-//
-//                   Param
-//                   /  \
-//             Fusion0  Fusion1
-//                   \    /
-//                   Tuple
-//
+// Fusion2 is merged into Fusion0 and Fusion1, because bytes read from Param by
+// Fusion2 is reduced for this test which makes the merge operation into its
+// operand below the bytes transferred threshold.
 TEST_F(FusionMergerTest, BytesTransferredThresholdNotExeceeded) {
-  BuildComputation2(/*add_extra_input=*/false);
-  // Run standard fusion passes.
-  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false)
-                  .Run(module_.get())
-                  .ValueOrDie());
-  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module_.get())
-                   .ValueOrDie());
+  auto module = tools::Parse(R"(
+HloModule BytesTransferredThresholdNotExeceeded
+
+comp.2 {
+  state.param_1.1 = (f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(0)
+  get-tuple-element.5 = f32[4]{0} get-tuple-element(state.param_1.1), index=0
+  get-tuple-element.6 = f32[4]{0} get-tuple-element(state.param_1.1), index=1
+  add.7 = f32[4]{0} add(get-tuple-element.5, get-tuple-element.6)
+  get-tuple-element.7 = f32[4]{0} get-tuple-element(state.param_1.1), index=2
+  ROOT add.8 = f32[4]{0} add(add.7, get-tuple-element.7)
+}
+
+comp.1 {
+  add.1.param_1.1 = f32[4]{0} parameter(1)
+  constant.param_1.3 = f32[4]{0} parameter(0)
+  add.5 = f32[4]{0} add(add.1.param_1.1, constant.param_1.3)
+  ROOT multiply.3 = f32[4]{0} multiply(add.5, constant.param_1.3)
+}
+
+comp {
+  add.1.param_1 = f32[4]{0} parameter(1)
+  constant.param_1.1 = f32[4]{0} parameter(0)
+  multiply.2 = f32[4]{0} multiply(add.1.param_1, constant.param_1.1)
+  ROOT add.4 = f32[4]{0} add(multiply.2, constant.param_1.1)
+}
+
+ENTRY BytesTransferredThresholdNotExeceeded.Computation2 {
+  constant = f32[4]{0} constant({1, 1, 1, 1})
+  state = (f32[4]{0}, f32[4]{0}, f32[4]{0}) parameter(0)
+  fusion.2 = f32[4]{0} fusion(state), kind=kLoop, calls=comp.2
+  fusion.3 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp.1
+  fusion.4 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp
+  ROOT tuple = (f32[4]{0}, f32[4]{0}) tuple(fusion.3, fusion.4)
+})")
+                    .ValueOrDie();
   // Run fusion merger pass, which should detect that the net bytes transferred
   // (if merged) would not increase.
-  EXPECT_TRUE(FusionMerger().Run(module_.get()).ValueOrDie());
+  EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie());
+}
+
+// Check that we're willing to merge f1_computation into f2_computation, even
+// though f2 is an input fusion node.
+TEST_F(FusionMergerTest, WillMergeIntoInputFusion) {
+  auto module = tools::Parse(R"(
+    HloModule m
+
+    f1_computation {
+      f1_p0 = f32[10]{0} parameter(0)
+      ROOT f1_root = f32[10]{0} add(f1_p0, f1_p0)
+    }
+
+    add_computation {
+      add_lhs = f32[] parameter(0)
+      add_rhs = f32[] parameter(1)
+      ROOT add_root = f32[] add(add_lhs, add_rhs)
+    }
+
+    f2_computation {
+      f2_p0 = f32[10]{0} parameter(0)
+      f2_mul = f32[10]{0} multiply(f2_p0, f2_p0)
+      f2_zero = f32[] constant(0)
+      ROOT f2_root = f32[] reduce(f2_mul, f2_zero), dimensions={0},
+             to_apply=add_computation
+    }
+
+    ENTRY entry {
+      p0 = f32[10]{0} parameter(0)
+      f1 = f32[10]{0} fusion(p0), kind=kLoop, calls=f1_computation
+      ROOT f2 = f32[] fusion(f1), kind=kInput, calls=f2_computation
+    })")
+                    .ValueOrDie();
+  EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Fusion(op::Parameter()));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index ba482793e7632f0f423cc9da0dd9620bdf29c642..0ec12f52d8b398218ec370fc74bfdf6f97f43893 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -22,8 +22,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
@@ -49,7 +47,7 @@ struct MatrixDescriptor {
 // rhs_matrix, and stores the result to output_matrix.
 template <typename Element>
 bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
-            MatrixDescriptor output_matrix, se::Stream* stream) {
+            MatrixDescriptor output_matrix, double alpha, se::Stream* stream) {
   DCHECK(!output_matrix.transpose);
 
   se::DeviceMemory<Element> lhs_data(lhs_matrix.data);
@@ -65,7 +63,7 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
   return stream
       ->ThenBlasGemm(
           lhs_transpose, rhs_transpose, output_matrix.num_rows,
-          output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/1.0,
+          output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/alpha,
           lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data,
           /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0,
           &output_data, /*leading dim of output=*/output_matrix.num_rows)
@@ -89,7 +87,7 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
 template <typename Element>
 bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix,
                          MatrixDescriptor rhs_matrix,
-                         MatrixDescriptor output_matrix,
+                         MatrixDescriptor output_matrix, double alpha,
                          se::blas::ComputationType computation_type,
                          se::blas::AlgorithmType algorithm, se::Stream* stream,
                          se::blas::ProfileResult* output_profile_result) {
@@ -108,11 +106,13 @@ bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix,
   return stream
       ->ThenBlasGemmWithAlgorithm(
           lhs_transpose, rhs_transpose, output_matrix.num_rows,
-          output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/1.0,
-          lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data,
-          /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0,
-          &output_data, /*leading dim of output=*/output_matrix.num_rows,
-          computation_type, algorithm, output_profile_result)
+          output_matrix.num_cols, /*size of reduce dim=*/k,
+          /*alpha=*/static_cast<Element>(alpha), lhs_data,
+          /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data,
+          /*leading dim of RHS=*/rhs_matrix.num_rows,
+          /*beta=*/static_cast<Element>(0.0f), &output_data,
+          /*leading dim of output=*/output_matrix.num_rows, computation_type,
+          algorithm, output_profile_result)
       .ok();
 }
 
@@ -125,8 +125,8 @@ bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix,
 template <typename Element>
 StatusOr<se::blas::AlgorithmType> DoGemmAutotune(
     MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
-    MatrixDescriptor output_matrix, se::blas::ComputationType computation_type,
-    se::Stream* stream) {
+    MatrixDescriptor output_matrix, double alpha,
+    se::blas::ComputationType computation_type, se::Stream* stream) {
   std::vector<se::blas::AlgorithmType> algorithms;
   CHECK(stream->parent()->GetBlasGemmAlgorithms(&algorithms));
 
@@ -138,8 +138,8 @@ StatusOr<se::blas::AlgorithmType> DoGemmAutotune(
     // non-null ProfileResult, DoGemmWithAlgorithm should always return true,
     // and the actual success-ness is returned in ProfileResult::is_valid.
     CHECK(DoGemmWithAlgorithm<Element>(lhs_matrix, rhs_matrix, output_matrix,
-                                       computation_type, algorithm, stream,
-                                       &profile_result));
+                                       alpha, computation_type, algorithm,
+                                       stream, &profile_result));
 
     if (profile_result.is_valid() && profile_result.elapsed_time_in_ms() <
                                          best_result.elapsed_time_in_ms()) {
@@ -161,6 +161,8 @@ StatusOr<se::blas::AlgorithmType> DoGemmAutotune(
 // DoGemm/DoGemmWithAlgorithm/DoGemmAutotune.
 auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm<float>) {
   switch (type) {
+    case F16:
+      return &DoGemm<Eigen::half>;
     case F32:
       return &DoGemm<float>;
     case F64:
@@ -172,6 +174,8 @@ auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm<float>) {
 auto GetGemmWithAlgorithmFn(PrimitiveType type)
     -> decltype(&DoGemmWithAlgorithm<float>) {
   switch (type) {
+    case F16:
+      return &DoGemmWithAlgorithm<Eigen::half>;
     case F32:
       return &DoGemmWithAlgorithm<float>;
     case F64:
@@ -182,6 +186,8 @@ auto GetGemmWithAlgorithmFn(PrimitiveType type)
 }
 auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune<float>) {
   switch (type) {
+    case F16:
+      return &DoGemmAutotune<Eigen::half>;
     case F32:
       return &DoGemmAutotune<float>;
     case F64:
@@ -196,6 +202,10 @@ auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune<float>) {
 // separately from the precision of the inputs and result.
 se::blas::ComputationType GetBlasComputationType(PrimitiveType type) {
   switch (type) {
+    case F16:
+      // Use F32 as computation type for F16 as we currently only implement the
+      // cuDNN pseudo half configuration for half precision.
+      return se::blas::ComputationType::kF32;
     case F32:
       return se::blas::ComputationType::kF32;
     case F64:
@@ -212,7 +222,8 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer,
                      const BufferAllocation::Slice& output_buffer,
                      const Shape& lhs_shape, const Shape& rhs_shape,
                      const Shape& output_shape, bool transpose_lhs,
-                     bool transpose_rhs, const HloInstruction* hlo_instruction)
+                     bool transpose_rhs, double alpha,
+                     const HloInstruction* hlo_instruction)
     : Thunk(Kind::kGemm, hlo_instruction),
       lhs_buffer_(lhs_buffer),
       rhs_buffer_(rhs_buffer),
@@ -221,7 +232,8 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer,
       rhs_shape_(rhs_shape),
       output_shape_(output_shape),
       transpose_lhs_(transpose_lhs),
-      transpose_rhs_(transpose_rhs) {}
+      transpose_rhs_(transpose_rhs),
+      alpha_(alpha) {}
 
 tensorflow::Status GemmThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
@@ -290,7 +302,7 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
     if (autotune_it == autotune_results_.end()) {
       StatusOr<se::blas::AlgorithmType> best_algorithm =
           GetGemmAutotuneFn(element_type)(lhs_matrix, rhs_matrix, output_matrix,
-                                          computation_type, stream);
+                                          alpha_, computation_type, stream);
       autotune_it =
           autotune_results_.insert({device_name, best_algorithm}).first;
 
@@ -311,12 +323,15 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
       VLOG(2) << "Using algorithm " << algorithm
               << " chosen by autotuning on GemmThunk " << this;
       return GetGemmWithAlgorithmFn(element_type)(
-          lhs_matrix, rhs_matrix, output_matrix, computation_type, algorithm,
-          stream,
+          lhs_matrix, rhs_matrix, output_matrix, alpha_, computation_type,
+          algorithm, stream,
           /*output_profile_result=*/nullptr);
     }
+
+    // Autotune will fail when CUDA 8 and GPU sm_50 or older are used.
+    // Use the older Gemm API in this case.
     return GetGemmFn(element_type)(lhs_matrix, rhs_matrix, output_matrix,
-                                   stream);
+                                   alpha_, stream);
   };
 
   bool launch_ok;
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 8c6a1f51a8a09ef78950dfe7e89994a3fe247f49..a18f425bc38fd3fbbb345901514c4ac16dbe97ec 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -34,29 +34,28 @@ namespace gpu {
 // This is thread-compatible.
 class GemmThunk : public Thunk {
  public:
-  // Constructs a thunk that computes "output = lhs <dot> rhs" using BLAS gemm.
-  // transpose_lhs and transpose_rhs indicate whether gemm should transpose the
-  // lhs and rhs operand. hlo_instruction is as in Thunk.
+  // Constructs a thunk that computes "output = (lhs <dot> rhs) * alpha" using
+  // BLAS gemm. transpose_lhs and transpose_rhs indicate whether gemm should
+  // transpose the lhs and rhs operand. hlo_instruction is as in Thunk. alpha is
+  // a constant.
   GemmThunk(const BufferAllocation::Slice& lhs_buffer,
             const BufferAllocation::Slice& rhs_buffer,
             const BufferAllocation::Slice& output_buffer,
             const Shape& lhs_shape, const Shape& rhs_shape,
             const Shape& output_shape, bool transpose_lhs, bool transpose_rhs,
-            const HloInstruction* hlo_instruction);
+            double alpha, const HloInstruction* hlo_instruction);
 
   GemmThunk(const GemmThunk&) = delete;
   GemmThunk& operator=(const GemmThunk&) = delete;
 
   // Does the gemm operation for the thunk on "stream", which must be non-null.
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
   // Returns true if we'll perform autotuning if run on the given stream.  If
   // so, we want the GPU to be quiescent during autotuning, so as not to
   // introduce noise in our results.
-  bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream* stream) override {
+  bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
     return autotune_results_.count(
                stream->parent()->GetDeviceDescription().name()) != 0;
   }
@@ -72,13 +71,13 @@ class GemmThunk : public Thunk {
 
   const bool transpose_lhs_;
   const bool transpose_rhs_;
+  const double alpha_;
 
   // Maps device names (StreamExecutor::DeviceDescription::name()) to autotune
   // results.  The map's value is the best algorithm we've found for this thunk
   // on this device, or an error if none of the algorithms worked and we should
   // use the regular gemm without an algorithm.
-  std::unordered_map<string,
-                     StatusOr<::perftools::gputools::blas::AlgorithmType>>
+  std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
       autotune_results_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 28ebd034ee0c89137f4e6eb417d8a37f4a00af7a..796c3070f22edd0cd088ccaf08a7f31fcde70f2d 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -33,8 +33,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
+#include "tensorflow/compiler/xla/service/gather_expander.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
@@ -89,8 +91,6 @@ limitations under the License.
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
@@ -100,7 +100,7 @@ namespace gpu {
 
 namespace {
 
-using tensorflow::port::Tracing;
+namespace tracing = tensorflow::tracing;
 
 // Returns the directory containing nvvm libdevice files.  config_cuda_data_dir
 // should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
@@ -164,6 +164,9 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
           /*rewrite_grad_op=*/true,
           /*use_fusion=*/false);
 
+      // Rewrite gather ops into smaller ones.
+      pass.AddPass<GatherExpander>();
+
       // BatchNormExpander can create zero-sized ops, so zero-sized HLO
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
@@ -176,6 +179,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
       pass.AddPass<HloDCE>();
       pass.AddPass<ReshapeMover>();
       pass.AddPass<HloConstantFolding>();
+      pass.AddPass<ConditionalSimplifier>();
     }
 
     pipeline.AddPass<TransposeFolding>(
@@ -241,6 +245,22 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
+  {
+    HloPassPipeline pipeline("layout_assignment");
+    pipeline.AddPass<GpuLayoutAssignment>(
+        hlo_module->mutable_entry_computation_layout());
+
+    // The LayoutAssignment pass may leave behind kCopy instructions which are
+    // duplicate or NOPs, so remove them with algebraic simplification and CSE.
+    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
+        /*is_layout_sensitive=*/true,
+        /*valid_bitcast_callback=*/[](const Shape&, const Shape&) {
+          return true;
+        });
+    pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+  }
+
   {
     HloPassFix<HloPassPipeline> fusion("fusion");
     fusion.AddInvariantChecker<HloVerifier>();
@@ -277,15 +297,6 @@ tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
   pipeline.AddInvariantChecker<HloVerifier>();
 
-  pipeline.AddPass<GpuLayoutAssignment>(
-      hlo_module->mutable_entry_computation_layout());
-
-  // The LayoutAssignment pass may leave behind kCopy instructions which are
-  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-      /*is_layout_sensitive=*/true,
-      [](const Shape&, const Shape&) { return true; });
-  pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
   // materializes the value) or missing a necessary copy (later pass removes an
@@ -399,7 +410,7 @@ void WarnIfBadDriverJITVersion() {
 // code (i.e. a cubin) as a byte array.
 StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
                                         int cc_minor) {
-  Tracing::TraceMe annotation("Compile PTX", /*is_expensive=*/true);
+  tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
   VLOG(2) << "Using ptxas at " << ptxas_path;
@@ -470,8 +481,8 @@ StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
-  Tracing::TraceMe annotation("HLO Transforms", module->name(),
-                              /*is_expensive=*/true);
+  tracing::ScopedActivity activity("HLO Transforms", module->name(),
+                                   /*is_expensive=*/true);
   TF_RETURN_IF_ERROR(
       OptimizeHloModule(module.get(), stream_exec, device_allocator));
   return std::move(module);
@@ -658,6 +669,8 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    cost_analysis.set_bytes_per_second(
+        stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
@@ -679,7 +692,7 @@ std::vector<uint8> GpuCompiler::CompilePtxOrGetCachedResult(const string& ptx,
                                                             int cc_major,
                                                             int cc_minor) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::CompilePtxOrGetCachedResult");
-  Tracing::TraceMe annotation("PTX->CUBIN", /*is_expensive=*/true);
+  tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
   decltype(compilation_cache_.begin()) iter;
   // Pointers into compilation_cache_ where the ptx and (optional) cubin are
@@ -764,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const {
 }  // namespace xla
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() {
-    return xla::MakeUnique<xla::gpu::GpuCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      stream_executor::cuda::kCudaPlatformId,
+      []() { return xla::MakeUnique<xla::gpu::GpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index c352d4d8462fadb266c55ad437de998e86a6528e..f3b02ae5d8867bdf1d970e809bff95a15d9f54d2 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler {
   // Bring in
   // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
   //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //     std::vector<std::vector<se::StreamExecutor*>>
   //        stream_execs)
   using LLVMCompiler::Compile;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
                      AotCompilationOptions const& options) override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
     // Capture just the pointer size, not the entire GpuCompiler object.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 623d6714de501000e38b7698620925f66425f157..980cc89fa03abd874a8e0a694f2abb775c1de050 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -34,8 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 namespace {
@@ -46,12 +44,14 @@ namespace {
 class HloExecutionProfiler {
  public:
   // If profiling is enabled, start an execution timer running.
-  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
-                                se::Stream* stream,
-                                const HloComputation* computation)
+  explicit HloExecutionProfiler(
+      bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
+      const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+      const HloComputation* computation)
       : do_profile_(do_profile),
         profile_(profile),
         stream_(stream),
+        sub_streams_(sub_streams),
         computation_(computation) {
     if (do_profile_) {
       clock_rate_ghz_ =
@@ -70,6 +70,7 @@ class HloExecutionProfiler {
     CHECK(!finished_execution_) << "Call FinishExecution only once!";
     finished_execution_ = true;
     if (do_profile_) {
+      stream_->ThenWaitFor(&sub_streams_);
       stream_->ThenStopTimer(execution_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
       profile_->set_total_cycles_executed(
@@ -88,6 +89,7 @@ class HloExecutionProfiler {
   // that the hlo_instruction took to execute in the profile.
   void FinishOperation(const HloInstruction* hlo_instruction) {
     if (do_profile_) {
+      stream_->ThenWaitFor(&sub_streams_);
       stream_->ThenStopTimer(per_op_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
       profile_->SetCyclesTakenBy(
@@ -100,6 +102,7 @@ class HloExecutionProfiler {
   double clock_rate_ghz_;
   HloExecutionProfile* profile_;
   se::Stream* stream_;
+  const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
   const HloComputation* computation_;
   std::unique_ptr<se::Timer> execution_timer_;
   std::unique_ptr<se::Timer> per_op_timer_;
@@ -147,13 +150,9 @@ Status GpuExecutable::ExecuteThunks(
     LOG(WARNING) << "PROFILING: profiling is enabled";
   }
 
-  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
-                                hlo_module_->entry_computation());
-
-  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
-
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
   std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
+  sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
     sub_streams.emplace_back();
     TF_ASSIGN_OR_RETURN(
@@ -161,6 +160,10 @@ Status GpuExecutable::ExecuteThunks(
         run_options->BorrowStream(main_stream->parent()->device_ordinal()));
   }
 
+  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
+                                sub_streams, hlo_module_->entry_computation());
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
   // The next event enqueued on stream N must not run until the thunk at
   // last_blocking_thunk_for_stream[N] completes.
   std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
@@ -247,7 +250,7 @@ Status GpuExecutable::ExecuteThunks(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -262,16 +265,22 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
        ++i) {
     const BufferAllocation& allocation = assignment_->GetAllocation(i);
     if (allocation.is_entry_computation_parameter()) {
-      // The caller must give us a buffer for ShapeIndex {} of every parameter.
-      // It can optionally give us a buffer for other ShapeIndices, but we
-      // ignore them: Because we can't rely on these sub-buffers' addresses
-      // being available, our generated code can't use them.  Instead, it must
-      // chase pointers starting at the tuple root.
-      if (allocation.param_shape_index().empty()) {
-        auto param_no = allocation.parameter_number();
-        buffer_allocations_builder.RegisterBuffer(
-            i, arguments[param_no]->root_buffer());
+      auto param_no = allocation.parameter_number();
+      se::DeviceMemoryBase buffer =
+          arguments[param_no]->buffer(allocation.param_shape_index());
+
+      // All top-level buffers and sub-buffers must have an explicit, non-null
+      // pointer, except for zero-sized buffers, which may be null.
+      if (buffer.is_null() && buffer.size() > 0) {
+        return FailedPrecondition(
+            "Cannot run XLA computation because pointer to (sub-)buffer at "
+            "index %s of parameter %lld was null.  All pointers to "
+            "(sub-)buffers must not be null, unless the (sub-)buffer has zero "
+            "elements.",
+            allocation.param_shape_index().ToString().c_str(), param_no);
       }
+
+      buffer_allocations_builder.RegisterBuffer(i, buffer);
     }
   }
   se::StreamExecutor* executor = run_options->stream()->parent();
@@ -288,13 +297,13 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(
-      root->shape(), root->shape(), executor->platform(), device_ordinal);
+  ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(),
+                                   memory_allocator, device_ordinal);
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
   std::set<se::DeviceMemoryBase> buffers_in_result;
-  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus(
       [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
           const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
@@ -313,7 +322,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
             this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
         CHECK(!slice.allocation()->is_entry_computation_parameter());
 
-        perftools::gputools::DeviceMemoryBase src_base =
+        se::DeviceMemoryBase src_base =
             buffer_allocations->GetDeviceAddress(slice.index());
         CHECK(!src_base.is_null() || src_base.size() == 0);
         *device_memory = src_base;
@@ -326,7 +335,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   return std::move(shaped_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index b19cfd43debd0a5490495d176fa2f1fcd625da07..80ec38c3ac114fe4ad9d56784330c1144d913db1 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -74,20 +74,15 @@ class GpuExecutable : public Executable {
 
   // ExecuteOnStream will fail if the compute capability of the stream doesn't
   // match the compute capability passed to this object's constructor.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on GPU executable.
-    return Unimplemented("Equality test on GPU executable is not implemented.");
-  }
-
  private:
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index af9897769fda371e47af06c19abce9a06015e094..f13727ca9b6954f6be9b9277018fcc64ee326954 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -33,8 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 // TODO(b/30467474) Once GPU infeed implementation settles, consider
@@ -153,8 +151,8 @@ static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
 }
 
 static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
-                                                &CreateGpuTransferManager);
+  xla::TransferManager::RegisterTransferManager(
+      stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
index 9aa369c668364079504ead3491903e2590a142cc..d040a99975230578c270deabdfe60c61649e778c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
@@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager {
   GpuTransferManager();
   ~GpuTransferManager() override {}
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
 
  private:
   // Initiates the infeed data transfers. InfeedBuffer->Done() must be
   // called to clean up the memory allocated for InfeedBuffer.
   StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source);
+      se::StreamExecutor* executor, int64 size, const void* source);
 
   // Enqueues infeed data buffers with the infeed manager after their
   // transfer completes.
-  Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status EnqueueBuffersToInfeed(se::StreamExecutor* executor,
                                 std::vector<gpu::InfeedBuffer*> buffers);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
index ee5b447c9cd0b1fde4d3a0943d5d4cb8cc5b3376..3ddc1c0789d746bf021256638342364aac63e0e3 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
@@ -19,8 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
index 73d5a5ce35497f156a181371bfb97fc37a8eb09e..d5f2216d460a45085536b15f9bf6e3bd3579f9c8 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
@@ -46,7 +46,7 @@ namespace gpu {
 // the client. The client manages the memory of the buffer.
 class InfeedBuffer {
  public:
-  InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length)
+  InfeedBuffer(se::StreamExecutor* executor, int64 length)
       : executor_(executor), length_(length) {
     device_memory_ = executor_->AllocateArray<uint8>(length);
     CHECK(!device_memory_.is_null());
@@ -60,14 +60,12 @@ class InfeedBuffer {
   // client to manage memory for the infeed buffers.
   void Done() { delete this; }
 
-  perftools::gputools::DeviceMemoryBase* device_memory() {
-    return &device_memory_;
-  }
+  se::DeviceMemoryBase* device_memory() { return &device_memory_; }
 
  private:
-  perftools::gputools::StreamExecutor* executor_;  // Not owned.
+  se::StreamExecutor* executor_;  // Not owned.
   const int64 length_;
-  perftools::gputools::DeviceMemoryBase device_memory_;
+  se::DeviceMemoryBase device_memory_;
 };
 
 // Client-side class used to enqueue infeed buffers.
@@ -100,8 +98,7 @@ class InfeedManager {
   // new stream on the first invocation. On subsequent invocations, if
   // the cached executor is not the same as the requested executor,
   // returns null.
-  perftools::gputools::Stream* GetStream(
-      perftools::gputools::StreamExecutor* executor);
+  se::Stream* GetStream(se::StreamExecutor* executor);
 
  private:
   // TODO(b/30467474): Revisit if this mutex becomes a point of
@@ -121,10 +118,10 @@ class InfeedManager {
   tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;
 
   // Cached host to device stream for queuing infeed data.
-  std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
+  std::unique_ptr<se::Stream> host_to_device_stream_;
 
   // Executor that the host_to_device_stream belongs to. Not owned.
-  perftools::gputools::StreamExecutor* host_to_device_executor_;
+  se::StreamExecutor* host_to_device_executor_;
 };
 
 // Singleton creator-or-accessor: Returns the GPU infeed manager.
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index 2ac95ceb692447c7ac6dbbcd8b9a38876f7a77b6..ea34d5b30c91e8b809e3e17a904e27e589fd6b5f 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk(
       destination_buffer_(destination_buffer) {}
 
 Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                                    perftools::gputools::Stream* stream) {
+                                    se::Stream* stream) {
   VLOG(2) << "Infeeding to GPU ";
 
-  perftools::gputools::DeviceMemoryBase destination_address =
+  se::DeviceMemoryBase destination_address =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
 
   InfeedManager* infeed_manager = GetOrCreateInfeedManager();
@@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     std::vector<void*> tuple_element_addresses;
     for (BufferAllocation::Slice tuple_element_buffer :
          tuple_element_buffers_) {
-      perftools::gputools::DeviceMemoryBase tuple_element_address =
+      se::DeviceMemoryBase tuple_element_address =
           buffer_allocations.GetDeviceAddress(tuple_element_buffer);
 
       InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
index 86918705fa0305217f11753e383200c7bd71474b..93713cb12defd95bdd69cb0aa7ad7b4e37fc8fae 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
@@ -44,7 +44,7 @@ class InfeedThunk : public Thunk {
   InfeedThunk& operator=(const InfeedThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index b5962f069bf499c913bd5479f263a7cb77c00555..85ecbe8fdb34700ca738b99ddd9ea615afc35da3 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -25,13 +25,19 @@ namespace gpu {
 namespace {
 
 bool IsFusile(const HloInstruction& hlo) {
+  // Don't fuse get-tuple-element on GPU: We can, but it's slower than not
+  // fusing.  We never generate kernels for unfused GTEs.  Instead, if an
+  // unfused GTE is an input to a kernel (including a fusion kernel), we
+  // compute the address of the GTE at the top of the kernel.  Often we know the
+  // address of the GTE result statically, so we can do this without chasing any
+  // pointers.
   return (hlo.IsElementwise() && hlo.operand_count() > 0) ||
+         hlo.opcode() == HloOpcode::kBitcast ||
          hlo.opcode() == HloOpcode::kBroadcast ||
          hlo.opcode() == HloOpcode::kConcatenate ||
          hlo.opcode() == HloOpcode::kDynamicSlice ||
          hlo.opcode() == HloOpcode::kDynamicUpdateSlice ||
          hlo.opcode() == HloOpcode::kFusion ||
-         hlo.opcode() == HloOpcode::kGetTupleElement ||
          hlo.opcode() == HloOpcode::kPad ||
          hlo.opcode() == HloOpcode::kReduce ||
          hlo.opcode() == HloOpcode::kReduceWindow ||
@@ -46,6 +52,34 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Check if we can use output fusion for (A @ B) * alpha
+  if (producer->opcode() == HloOpcode::kDot) {
+    if (consumer->opcode() == HloOpcode::kMultiply) {
+      CHECK_EQ(consumer->operand_count(), 2);
+      int64 other_operand_index = 1 - operand_index;
+      const HloInstruction* alpha = consumer->operand(other_operand_index);
+      if (alpha->opcode() == HloOpcode::kConstant &&
+          ShapeUtil::IsScalar(alpha->shape())) {
+        return true;
+      }
+    }
+  }
+
+  // Only allow to fuse transpose into an output fusion.
+  if (consumer->opcode() == HloOpcode::kFusion &&
+      consumer->fusion_kind() == HloInstruction::FusionKind::kOutput) {
+    if (producer->opcode() != HloOpcode::kTranspose) {
+      return false;
+    }
+    // Check that the transpose is the operand of a dot.
+    auto producer_operand_index = consumer->operand_index(producer);
+    auto fused_parameter = consumer->fused_parameter(producer_operand_index);
+    const std::vector<HloInstruction*>& fused_parameter_users =
+        fused_parameter->users();
+    return (fused_parameter_users.size() == 1 &&
+            fused_parameter_users[0]->opcode() == HloOpcode::kDot);
+  }
+
   // Output fusion is not currently supported on GPUs.
   if (producer->opcode() == HloOpcode::kFusion) {
     return false;
@@ -70,17 +104,6 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return false;
   }
 
-  // We may need to know original operand layout to emit input fusion, and so
-  // far, we merely use the layout of an operand of the fusion node, which means
-  // we must fuse only elementwise operations. This restriction should be lifted
-  // later if we need to fuse other operations, e.g. transpose, for performance.
-  if ((IsReductionToVector(*consumer) ||
-       (HloOpcode::kFusion == consumer->opcode() &&
-        HloInstruction::FusionKind::kInput == consumer->fusion_kind())) &&
-      !producer->IsElementwise()) {
-    return false;
-  }
-
   // Cost condition: not fuse (simple, expensive producers) and (consumers who
   // reuse operand elements).
   if (producer->opcode() != HloOpcode::kFusion &&
@@ -98,6 +121,9 @@ HloInstruction::FusionKind GpuInstructionFusion::ChooseKind(
   if (IsReductionToVector(*consumer)) {
     return HloInstruction::FusionKind::kInput;
   }
+  if (producer->opcode() == HloOpcode::kDot) {
+    return HloInstruction::FusionKind::kOutput;
+  }
   if (HloOpcode::kFusion == consumer->opcode()) {
     return consumer->fusion_kind();
   }
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 2d6dad27a59978da6e4719afc50ebee5e641dde0..4b231c449f8f101127b4d30bfff20c69d8cef5c1 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace op = xla::testing::opcode_matchers;
 
@@ -137,30 +138,119 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
                    .ValueOrDie());
 }
 
-TEST_F(InstructionFusionTest, GetTupleElementFused) {
-  HloComputation::Builder builder(TestName());
-  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
-  Shape tuple_shape = ShapeUtil::MakeTupleShape({data_shape, data_shape});
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, tuple_shape, "param"));
-  auto gte0 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape, param, 0));
-  auto gte1 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape, param, 1));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, gte0, gte1));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+// Tests that broadcasts fused into a fusion with a reduce root.
+TEST_F(InstructionFusionTest, BroadcastIntoReduce) {
+  auto module = tools::Parse(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY BroadcastIntoReduce {
+      constant = f32[] constant(1)
+      broadcast = f32[16,16,16,16]{3,2,1,0} broadcast(constant), dimensions={}
+      constant.1 = f32[] constant(0)
+      ROOT reduce = f32[] reduce(broadcast, constant.1), dimensions={0,1,2,3},
+                                                         to_apply=add
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Reduce(op::Broadcast(op::Parameter()), op::Parameter()));
+}
+
+TEST_F(InstructionFusionTest, BitcastIntoAdd) {
+  auto module = tools::Parse(R"(
+    HloModule test_module
+
+    ENTRY BroadcastIntoAdd {
+      p0 = f32[4,1,1]{2,1,0} parameter(0)
+      p1 = f32[4,1]{1,0} parameter(1)
+      bitcast = f32[4,1]{1,0} bitcast(p0)
+      ROOT add = f32[4,1] add(bitcast, p1)
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Add(op::Bitcast(op::Parameter()), op::Parameter()));
+}
+
+TEST_F(InstructionFusionTest, AddIntoBitcast) {
+  auto module = tools::Parse(R"(
+    HloModule test_module
+
+    ENTRY BroadcastIntoAdd {
+      p0 = f32[4,1,1]{2,1,0} parameter(0)
+      p1 = f32[4,1]{1,0} parameter(1)
+      add = f32[4,1] add(p0, p1)
+      ROOT bitcast = f32[4,1,1] bitcast(add)
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Bitcast(op::Add(op::Parameter(), op::Parameter())));
+}
+
+TEST_F(InstructionFusionTest, DontFuseGTE) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY DontFuseGTE {
+    p0 = (f32[10], f32[10]) parameter(0)
+    gte0 = f32[10] get-tuple-element(p0), index=0
+    gte1 = f32[10] get-tuple-element(p0), index=1
+    ROOT add = f32[10] add(gte0, gte1)
+  })")
+                    .ValueOrDie();
+
+  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
+                   .Run(module.get())
+                   .ValueOrDie());
+}
+
+TEST_F(InstructionFusionTest, DotOutputFusion) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    constant = f32[] constant(3)
+    p0 = f32[4,3]{1,0} parameter(0)
+    p1 = f32[4,3]{1,0} parameter(1)
+    transpose = f32[3,4]{1,0} transpose(p1), dimensions={1, 0}
+    dot = f32[4,4]{1,0} dot(p0, transpose)
+    ROOT mul = f32[4,4] multiply(constant, dot)
+  })")
+                    .ValueOrDie();
+
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
                   .Run(module.get())
                   .ValueOrDie());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(HloOpcode::kFusion, root->opcode());
-  HloInstruction* fused_root = root->fused_expression_root();
-  EXPECT_EQ(HloOpcode::kAdd, fused_root->opcode());
-  // Check that operands of 'fused_root' are GTE.
-  EXPECT_EQ(HloOpcode::kGetTupleElement, fused_root->operand(0)->opcode());
-  EXPECT_EQ(HloOpcode::kGetTupleElement, fused_root->operand(1)->opcode());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(
+      root->fused_expression_root(),
+      op::Multiply(op::Parameter(),
+                   op::Dot(op::Parameter(), op::Transpose(op::Parameter()))));
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 2f65edffea81db7dba1f8545f92b27ea622044e7..532d436ee82b985a4efe300f90223e1298e85765 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -49,8 +49,10 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   // The inputs and the output must
   // 1) be matrices with no padding and a non-zero number of elements,
   // 2) have an allowed element type.
-  bool type_is_allowed = (output_shape.element_type() == F32 ||
-                          output_shape.element_type() == F64);
+  PrimitiveType output_primitive_type = output_shape.element_type();
+  bool type_is_allowed =
+      (output_primitive_type == F16 || output_primitive_type == F32 ||
+       output_primitive_type == F64);
   return type_is_allowed && IsRank2WithNoPadding(lhs_shape) &&
          IsRank2WithNoPadding(rhs_shape) &&
          IsRank2WithNoPadding(output_shape) &&
@@ -87,6 +89,19 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
     return true;
   }
 
+  if (hlo.opcode() == HloOpcode::kFusion &&
+      hlo.fusion_kind() == HloInstruction::FusionKind::kOutput &&
+      hlo.fused_expression_root()->opcode() == HloOpcode::kMultiply) {
+    // Try to find the dot inside the output fusion node.
+    const HloInstruction* dot = hlo.fused_expression_root()->operand(0);
+    if (dot->opcode() != HloOpcode::kDot) {
+      dot = hlo.fused_expression_root()->operand(1);
+    }
+    if (dot->opcode() == HloOpcode::kDot) {
+      return ImplementedAsGemm(*dot);
+    }
+  }
+
   return false;
 }
 
@@ -145,14 +160,19 @@ static HloInstruction* CreateCudnnConv(
   Shape call_shape =
       ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
 
-  // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn
-  // algorithm to use.  It's up to a later pass to choose the algorithm, so to
-  // indicate that we haven't yet made a choice, we speicfy -1 for that arg.
+  // Our CustomCall takes four arguments: The conv lhs and rhs, the cudnn
+  // algorithm to use, and a boolean indicating whether to use tensor cores.
+  //
+  // It's up to a later pass to choose the algorithm and decide whether to use
+  // tensor cores, so to indicate that we haven't yet made a choice, we speicfy
+  // -1 and false for those args.
   HloInstruction* negative_one = computation->AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<int64>(-1)));
+  HloInstruction* false_constant = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloInstruction* custom_call =
       computation->AddInstruction(HloInstruction::CreateCustomCall(
-          call_shape, {lhs, rhs, negative_one}, call_target));
+          call_shape, {lhs, rhs, negative_one, false_constant}, call_target));
   custom_call->set_window(window);
   custom_call->set_convolution_dimension_numbers(dnums);
   return custom_call;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index a3df67a87344d6ece2ea9047321ad9542c13f8cf..1e0db2821a2c212d0f212ae94ab69231bc6053ea 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string>
 #include <unordered_map>
+#include <utility>
 
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
@@ -438,6 +439,32 @@ Status IrEmitter::HandleSelect(HloInstruction* select) {
   return IrEmitter::DefaultAction(select);
 }
 
+namespace {
+llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* ir_builder) {
+  return ir_builder->CreateExtractValue(x, {0});
+}
+
+llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* ir_builder) {
+  return ir_builder->CreateExtractValue(x, {1});
+}
+
+std::pair<llvm::Value*, llvm::Value*> MultiplyComplex(
+    llvm::Value* lhs_value, llvm::Value* rhs_value,
+    llvm::IRBuilder<>* ir_builder) {
+  llvm::Value* lhs_real = Real(lhs_value, ir_builder);
+  llvm::Value* lhs_imag = Imag(lhs_value, ir_builder);
+  llvm::Value* rhs_real = Real(rhs_value, ir_builder);
+  llvm::Value* rhs_imag = Imag(rhs_value, ir_builder);
+  llvm::Value* real_result1 = ir_builder->CreateFMul(lhs_real, rhs_real);
+  llvm::Value* real_result2 = ir_builder->CreateFMul(lhs_imag, rhs_imag);
+  llvm::Value* real_result = ir_builder->CreateFSub(real_result1, real_result2);
+  llvm::Value* imag_result1 = ir_builder->CreateFMul(lhs_real, rhs_imag);
+  llvm::Value* imag_result2 = ir_builder->CreateFMul(lhs_imag, rhs_real);
+  llvm::Value* imag_result = ir_builder->CreateFAdd(imag_result1, imag_result2);
+  return {real_result, imag_result};
+}
+}  // namespace
+
 Status IrEmitter::HandleDot(HloInstruction* dot) {
   auto lhs_instruction = dot->operand(0);
   auto rhs_instruction = dot->operand(1);
@@ -456,21 +483,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
         rhs_array.EmitReadArrayElement(/*index=*/{}, &ir_builder_);
     llvm::Value* result;
     if (ShapeUtil::ElementIsComplex(lhs_shape)) {
-      auto real = [&](llvm::Value* x) {
-        return ir_builder_.CreateExtractValue(x, {0});
-      };
-      auto imag = [&](llvm::Value* x) {
-        return ir_builder_.CreateExtractValue(x, {1});
-      };
-      llvm::Value* real_result = ir_builder_.CreateFSub(
-          ir_builder_.CreateFMul(real(lhs_value), real(rhs_value)),
-          ir_builder_.CreateFMul(imag(lhs_value), imag(rhs_value)));
-      llvm::Value* imag_result = ir_builder_.CreateFAdd(
-          ir_builder_.CreateFMul(real(lhs_value), imag(rhs_value)),
-          ir_builder_.CreateFMul(imag(lhs_value), real(rhs_value)));
+      auto value = MultiplyComplex(lhs_value, rhs_value, &ir_builder_);
       result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType());
-      result = ir_builder_.CreateInsertValue(result, real_result, {0});
-      result = ir_builder_.CreateInsertValue(result, imag_result, {1});
+      result = ir_builder_.CreateInsertValue(result, value.first, {0});
+      result = ir_builder_.CreateInsertValue(result, value.second, {1});
     } else {
       result = ir_builder_.CreateFMul(lhs_value, rhs_value);
     }
@@ -548,20 +564,13 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   llvm::Value* accum = ir_builder_.CreateLoad(accum_address);
   llvm::Value* updated_accum;
   if (ShapeUtil::ElementIsComplex(lhs_shape)) {
-#define REAL(x) ir_builder_.CreateExtractValue(x, {0})
-#define IMAG(x) ir_builder_.CreateExtractValue(x, {1})
-    llvm::Value* product_real = ir_builder_.CreateFSub(
-        ir_builder_.CreateFMul(REAL(lhs_element), REAL(rhs_element)),
-        ir_builder_.CreateFMul(IMAG(lhs_element), IMAG(rhs_element)));
-    llvm::Value* product_imag = ir_builder_.CreateFAdd(
-        ir_builder_.CreateFMul(REAL(lhs_element), IMAG(rhs_element)),
-        ir_builder_.CreateFMul(IMAG(lhs_element), REAL(rhs_element)));
-    updated_accum = ir_builder_.CreateInsertValue(
-        accum, ir_builder_.CreateFAdd(REAL(accum), product_real), {0});
-    updated_accum = ir_builder_.CreateInsertValue(
-        updated_accum, ir_builder_.CreateFAdd(IMAG(accum), product_imag), {1});
-#undef IMAG
-#undef REAL
+    auto value = MultiplyComplex(lhs_element, rhs_element, &ir_builder_);
+    llvm::Value* accum_real = Real(accum, &ir_builder_);
+    llvm::Value* real_sum = ir_builder_.CreateFAdd(accum_real, value.first);
+    updated_accum = ir_builder_.CreateInsertValue(accum, real_sum, {0});
+    llvm::Value* accum_imag = Imag(accum, &ir_builder_);
+    llvm::Value* imag_sum = ir_builder_.CreateFAdd(accum_imag, value.second);
+    updated_accum = ir_builder_.CreateInsertValue(updated_accum, imag_sum, {1});
   } else {
     llvm::Value* product = ir_builder_.CreateFMul(lhs_element, rhs_element);
     updated_accum = ir_builder_.CreateFAdd(accum, product);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
index 3790ed313b9d0e167185a8b12c812132ee78811f..a78b4ff83075fd7ef330bb97ce217a198d450cf8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
@@ -32,7 +32,7 @@ class IrEmitterContext {
  public:
   IrEmitterContext(const HloModule* hlo_module,
                    const BufferAssignment* buffer_assignment,
-                   const perftools::gputools::DeviceDescription* device_desc,
+                   const se::DeviceDescription* device_desc,
                    llvm::Module* llvm_module)
       : hlo_module_(hlo_module),
         buffer_assignment_(buffer_assignment),
@@ -47,7 +47,7 @@ class IrEmitterContext {
   const BufferAssignment& buffer_assignment() const {
     return *buffer_assignment_;
   }
-  const perftools::gputools::DeviceDescription& device_description() const {
+  const se::DeviceDescription& device_description() const {
     return *device_desc_;
   }
   llvm::Module* llvm_module() { return llvm_module_; }
@@ -56,7 +56,7 @@ class IrEmitterContext {
  private:
   const HloModule* hlo_module_;
   const BufferAssignment* buffer_assignment_;
-  const perftools::gputools::DeviceDescription* device_desc_;
+  const se::DeviceDescription* device_desc_;
   llvm::Module* llvm_module_;
   NameUniquer name_uniquer_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 30c88c0a5d38f6ea3f94d3b47b7b69c7122bf6ac..26e497762f2a6f23767c5b98f339eefdef0b7468 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <algorithm>
+#include <cstring>
 #include <memory>
 #include <string>
 #include <vector>
@@ -44,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/memset_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
@@ -142,37 +145,6 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk,
        llvm::ConstantAsMetadata::get(threads_per_block_ir_value)}));
 }
 
-// Tries to get a Slice for the given instruction at the given index, but
-// returns nullopt if we might not know the slice's address at runtime without
-// dereferencing a containing tuple.
-//
-// In particular, when XLA accepts a parameter of tuple type, the caller has the
-// option of telling XLA what are the values inside of the tuple, or just giving
-// XLA a pointer to the top-level tuple and letting us chase the pointers on the
-// GPU.  We therefore cannot rely having these pointers to parameter sub-buffers
-// being present when we run the program.
-optional<BufferAllocation::Slice> GetKnownAtRuntimeSlice(
-    const HloInstruction* instr, const ShapeIndex& index,
-    const BufferAssignment& buffer_assn) {
-  auto maybe_slice = buffer_assn.GetUniqueSlice(instr, index);
-  if (!maybe_slice.ok()) {
-    return nullopt;
-  }
-  // BufferAllocation gives a slice and alloc to every buffer accessed by XLA,
-  // but we don't necessarily know the runtime address of sub-buffers of input
-  // parameters.
-  const BufferAllocation::Slice& slice = maybe_slice.ValueOrDie();
-  const BufferAllocation* alloc = slice.allocation();
-  if (alloc->IsInputOrOutput() && !alloc->maybe_live_out() &&
-      !alloc->param_shape_index().empty()) {
-    return nullopt;
-  }
-
-  // Otherwise, we will know the address of this slice at runtime without having
-  // to dereference a tuple.
-  return slice;
-}
-
 }  // namespace
 
 IrEmitterUnnested::IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
@@ -203,7 +175,7 @@ bool ImplementedAsHostToDeviceMemcpy(const BufferAssignment& buffer_assignment,
   return hlo.opcode() == HloOpcode::kCopy &&
          hlo.operand(0)->opcode() == HloOpcode::kConstant &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value();
+         buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok();
 }
 
 bool ImplementedAsDeviceToDeviceMemcpy(
@@ -213,13 +185,13 @@ bool ImplementedAsDeviceToDeviceMemcpy(
   //
   // 1. `hlo` is a kCopy instruction.
   // 2. `hlo` and its operand have the same shape (thus the same layout too).
-  // 3. The operand to `hlo` has a buffer assignment (constants do not, for
-  //    instance) which means the source buffer also resides on the device.
+  // 3. `hlo` and its operand have a statically-known buffer assignment
+  //     (constants do not, for instance), which means the source buffer also
+  //     resides on the device.
   return hlo.opcode() == HloOpcode::kCopy &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value() &&
-         GetKnownAtRuntimeSlice(hlo.operand(0), {}, buffer_assignment)
-             .has_value();
+         buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok() &&
+         buffer_assignment.GetUniqueTopLevelSlice(hlo.operand(0)).ok();
 }
 }  // namespace
 
@@ -498,12 +470,11 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     switch (root->opcode()) {
       case HloOpcode::kReduce: {
         VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString();
+        TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
+                            BuildInitializerThunk(fusion));
         std::vector<std::unique_ptr<Thunk>> thunks;
-        thunks.emplace_back(BuildKernelThunk(fusion));
-        TF_RETURN_IF_ERROR(EmitInitializer(
-            fusion, static_cast<KernelThunk*>(thunks.back().get())));
-        bindings_.UnbindAllLocalIrValues();
-        thunks.emplace_back(BuildKernelThunk(fusion));
+        thunks.push_back(std::move(initializer_thunk));
+        thunks.push_back(BuildKernelThunk(fusion));
         thunk_sequence_->emplace_back(
             MakeUnique<SequentialThunk>(std::move(thunks), fusion));
         std::vector<llvm_ir::IrArray> parameter_arrays;
@@ -517,39 +488,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         TF_RETURN_IF_ERROR(root->Accept(&fused_emitter));
 
         Shape input_shape = root->operand(0)->shape();
-        // EmitReductionToVector requires the input shape to have a layout, but
-        // fused instructions don't have one. So we determine its layout from
-        // the fusion's operands. The choice of the layout only affects
-        // performance but not correctness.
-        auto choose_input_layout = [](
-            tensorflow::gtl::ArraySlice<const HloInstruction*> operands,
-            Shape* input_shape) -> Status {
-          // Prefer the layout of an operand whose shape is compatible with
-          // input_shape.
-          for (const HloInstruction* operand : operands) {
-            if (ShapeUtil::Compatible(*input_shape, operand->shape())) {
-              return LayoutUtil::CopyLayoutBetweenShapes(operand->shape(),
-                                                         input_shape);
-            }
-          }
-          // If no operand has a compatible shape, prefer an operand that has
-          // the same rank at least.
-          for (const HloInstruction* operand : operands) {
-            if (ShapeUtil::Rank(*input_shape) ==
-                ShapeUtil::Rank(operand->shape())) {
-              // Do not use CopyLayoutBetweenShapes because input_shape and
-              // operand->shape() may be incompatible.
-              *input_shape->mutable_layout() = operand->shape().layout();
-              return Status::OK();
-            }
-          }
-          // When all the above fails, which is rare, set the default layout.
-          LayoutUtil::SetToDefaultLayout(input_shape);
-          return Status::OK();
-        };
-        TF_RETURN_IF_ERROR(
-            choose_input_layout(fusion->operands(), &input_shape));
-
         return EmitReductionToVector(
             root, input_shape, fused_emitter.GetGenerator(root->operand(0)),
             fused_emitter.GetGenerator(root->operand(1)), root->dimensions(),
@@ -598,7 +536,27 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
     return Status::OK();
   }
-  thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
+
+  int max_unroll_factor = fusion->GetModule()
+                              ->config()
+                              .debug_options()
+                              .xla_gpu_max_kernel_unroll_factor();
+
+  // Find the largest possible power of two to unroll by.
+  // TODO(kramerb): Make this smarter.
+  int unroll_factor = 1;
+  if (!fusion->IsMultiOutputFusion()) {
+    CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop);
+    int64 num_elements = ShapeUtil::ElementsIn(fusion->shape());
+    for (int i = max_unroll_factor; i > 1; i /= 2) {
+      if (num_elements % i == 0) {
+        unroll_factor = i;
+        break;
+      }
+    }
+  }
+
+  thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor));
   return IrEmitter::HandleFusion(fusion);
 }
 
@@ -1668,14 +1626,14 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
   if (IsReductionToVector(*reduce) &&
       // NVPTX backend can't do atomic cmpxchg any narrower than 32 bits
       32 <= primitive_util::BitWidth(reduce->shape().element_type())) {
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
+                        BuildInitializerThunk(reduce));
     std::vector<std::unique_ptr<Thunk>> thunks;
-    thunks.emplace_back(BuildKernelThunk(reduce));
-    TF_RETURN_IF_ERROR(EmitInitializer(
-        reduce, static_cast<KernelThunk*>(thunks.back().get())));
-    bindings_.UnbindAllLocalIrValues();
-    thunks.emplace_back(BuildKernelThunk(reduce));
+    thunks.push_back(std::move(initializer_thunk));
+    thunks.push_back(BuildKernelThunk(reduce));
     thunk_sequence_->emplace_back(
         MakeUnique<SequentialThunk>(std::move(thunks), reduce));
+
     return EmitReductionToVector(
         reduce, input->shape(),
         [&](const llvm_ir::IrArray::Index& index) {
@@ -1739,16 +1697,13 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   CHECK_EQ(rank, ShapeUtil::Rank(source->shape()));
   CHECK_EQ(rank, window.dimensions_size());
 
-  {
-    std::vector<std::unique_ptr<Thunk>> thunks;
-    thunks.emplace_back(BuildKernelThunk(select_and_scatter));
-    TF_RETURN_IF_ERROR(EmitInitializer(
-        select_and_scatter, static_cast<KernelThunk*>(thunks.back().get())));
-    bindings_.UnbindAllLocalIrValues();
-    thunks.emplace_back(BuildKernelThunk(select_and_scatter));
-    thunk_sequence_->emplace_back(
-        MakeUnique<SequentialThunk>(std::move(thunks), select_and_scatter));
-  }
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
+                      BuildInitializerThunk(select_and_scatter));
+  std::vector<std::unique_ptr<Thunk>> thunks;
+  thunks.push_back(std::move(initializer_thunk));
+  thunks.push_back(BuildKernelThunk(select_and_scatter));
+  thunk_sequence_->emplace_back(
+      MakeUnique<SequentialThunk>(std::move(thunks), select_and_scatter));
 
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
@@ -1993,38 +1948,54 @@ GetHloBufferSlices(const HloInstruction* hlo,
       -> optional<std::pair<BufferAllocation::Slice, ShapeIndex>> {
     // Simple, common case: Is the buffer for instr known at runtime?  If so,
     // we're done.
-    auto slice = GetKnownAtRuntimeSlice(instr, index, buffer_assn);
-    if (slice.has_value()) {
-      return {{*slice, ShapeIndex()}};
+    auto slice = buffer_assn.GetUniqueSlice(instr, index);
+    if (slice.ok()) {
+      return {{slice.ValueOrDie(), ShapeIndex()}};
     }
 
-    // If we don't know the buffer for instr at index, see if we know the buffer
-    // for instr at index without its last element.  If so, we can dynamically
-    // find the buffer for instr by dereferencing a pointer in that buffer.
-    // Continue looking this way until we run out of elements in 'index'.
-    ShapeIndex new_index = index;
-    ShapeIndex gte_indices;
-    while (!new_index.empty()) {
-      gte_indices.push_front(new_index.back());
-      new_index.pop_back();
-      auto slice = GetKnownAtRuntimeSlice(instr, new_index, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
+    // If that didn't work, walk up any bitcasts that we might see.  These must
+    // appear before any GTE instructions, because it's illegal to bitcast to a
+    // tuple type.
+    const HloInstruction* parent = instr;
+    while (parent->opcode() == HloOpcode::kBitcast) {
+      parent = parent->operand(0);
+
+      auto slice = buffer_assn.GetUniqueSlice(parent, {});
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), ShapeIndex()}};
       }
     }
 
-    // If *that* didn't work, check whether instr is a GTE instruction.  If it
-    // is, see if we can get a buffer for its parent, and continue walking up
-    // parents until we find a defined buffer or we hit something that's not a
-    // GTE.
-    const HloInstruction* parent = instr;
+    // Check whether instr is a GTE instruction.  If it is, see if we can get a
+    // buffer for its parent, and continue walking up parents until we find a
+    // defined buffer or we hit something that's not a GTE.
+    ShapeIndex gte_indices;
     while (parent->opcode() == HloOpcode::kGetTupleElement) {
       gte_indices.push_front(parent->tuple_index());
       parent = parent->operand(0);
 
-      auto slice = GetKnownAtRuntimeSlice(parent, {}, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
+      auto slice = buffer_assn.GetUniqueSlice(parent, {});
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), gte_indices}};
+      }
+    }
+
+    // Finally, if we don't know the buffer for instr at index, see if we know
+    // the buffer for instr at index without its last element.  If so, we can
+    // dynamically find the buffer for instr by dereferencing a pointer in that
+    // buffer.  Continue looking this way until we run out of elements in
+    // 'index'.
+    //
+    // We can almost always get a buffer without resorting to this.  The only
+    // exception is for cases where the relevant sub-buffer is truly unknowable,
+    // for example the sub-buffer of a tuple-shaped select.
+    ShapeIndex new_index = index;
+    while (!new_index.empty()) {
+      gte_indices.push_front(new_index.back());
+      new_index.pop_back();
+      auto slice = buffer_assn.GetUniqueSlice(instr, new_index);
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), gte_indices}};
       }
     }
 
@@ -2069,8 +2040,8 @@ Status IrEmitterUnnested::HandleGather(HloInstruction* gather) {
   return Unimplemented("Gather is not implemented on GPUs.");
 }
 
-std::unique_ptr<Thunk> IrEmitterUnnested::BuildKernelThunk(
-    const HloInstruction* inst) {
+std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
+    const HloInstruction* inst, int unroll_factor) {
   const BufferAssignment& buffer_assn =
       ir_emitter_context_->buffer_assignment();
 
@@ -2162,7 +2133,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildKernelThunk(
   }
 
   return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
-                                 inst);
+                                 inst, unroll_factor);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -2221,31 +2192,63 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
         inst->shape(),              // The shape of the output.
         false,                      // Do not transpose LHS.
         false,                      // Do not transpose RHS.
+        1.0,                        // alpha.
         inst);
   }
 
   if (inst->opcode() == HloOpcode::kFusion) {
-    const HloInstruction* dot = inst->fused_expression_root();
-    DCHECK(dot->opcode() == HloOpcode::kDot);
-    const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0));
-    const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1));
-    DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
-           rhs_parameter->opcode() == HloOpcode::kParameter);
-    const HloInstruction* lhs =
-        inst->operand(lhs_parameter->parameter_number());
-    const HloInstruction* rhs =
-        inst->operand(rhs_parameter->parameter_number());
-
-    return MakeUnique<GemmThunk>(
-        GetAllocationSlice(*lhs),             // The buffer assigned to LHS.
-        GetAllocationSlice(*rhs),             // The buffer assigned to RHS.
-        GetAllocationSlice(*inst),            // The output buffer.
-        lhs->shape(),                         // The shape of LHS.
-        rhs->shape(),                         // The shape of RHS.
-        inst->shape(),                        // The shape of the output.
-        dot->operand(0)->IsRank2Transpose(),  // Transpose LHS.
-        dot->operand(1)->IsRank2Transpose(),  // Trasnpose RHS.
-        inst);
+    if (inst->fusion_kind() == HloInstruction::FusionKind::kOutput) {
+      const HloInstruction* mul = inst->fused_expression_root();
+      const HloInstruction* dot = mul->operand(0);
+      const HloInstruction* alpha = mul->operand(1);
+      if (dot->opcode() != HloOpcode::kDot) {
+        std::swap(dot, alpha);
+      }
+      DCHECK(dot->opcode() == HloOpcode::kDot);
+      const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0));
+      const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1));
+      DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
+             rhs_parameter->opcode() == HloOpcode::kParameter);
+      const HloInstruction* lhs =
+          inst->operand(lhs_parameter->parameter_number());
+      const HloInstruction* rhs =
+          inst->operand(rhs_parameter->parameter_number());
+
+      return MakeUnique<GemmThunk>(
+          GetAllocationSlice(*lhs),             // The buffer assigned to LHS.
+          GetAllocationSlice(*rhs),             // The buffer assigned to RHS.
+          GetAllocationSlice(*mul),             // The output buffer.
+          lhs->shape(),                         // The shape of LHS.
+          rhs->shape(),                         // The shape of RHS.
+          inst->shape(),                        // The shape of the output.
+          dot->operand(0)->IsRank2Transpose(),  // Transpose LHS.
+          dot->operand(1)->IsRank2Transpose(),  // Transpose RHS.
+          alpha->literal().Get<double>({0}),    // alpha.
+          inst);
+    } else {
+      const HloInstruction* dot = inst->fused_expression_root();
+      DCHECK(dot->opcode() == HloOpcode::kDot);
+      const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0));
+      const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1));
+      DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
+             rhs_parameter->opcode() == HloOpcode::kParameter);
+      const HloInstruction* lhs =
+          inst->operand(lhs_parameter->parameter_number());
+      const HloInstruction* rhs =
+          inst->operand(rhs_parameter->parameter_number());
+
+      return MakeUnique<GemmThunk>(
+          GetAllocationSlice(*lhs),             // The buffer assigned to LHS.
+          GetAllocationSlice(*rhs),             // The buffer assigned to RHS.
+          GetAllocationSlice(*inst),            // The output buffer.
+          lhs->shape(),                         // The shape of LHS.
+          rhs->shape(),                         // The shape of RHS.
+          inst->shape(),                        // The shape of the output.
+          dot->operand(0)->IsRank2Transpose(),  // Transpose LHS.
+          dot->operand(1)->IsRank2Transpose(),  // Transpose RHS.
+          1.0,                                  // Alpha.
+          inst);
+    }
   }
 
   LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString();
@@ -2261,37 +2264,87 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
                               /*output_shape=*/inst->shape(), inst);
 }
 
-Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
-                                          KernelThunk* thunk) {
+StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
+    const HloInstruction* hlo) {
   bool fused = HloOpcode::kFusion == hlo->opcode();
-
   const HloInstruction* inst = fused ? hlo->fused_expression_root() : hlo;
-  CHECK(inst->opcode() == HloOpcode::kSelectAndScatter ||
-        inst->opcode() == HloOpcode::kReduce);
-  const HloInstruction* init_value = nullptr;
-  switch (inst->opcode()) {
-    case HloOpcode::kSelectAndScatter:
-      init_value = inst->operand(2);
-      break;
-    case HloOpcode::kReduce:
-      init_value = inst->operand(1);
-      break;
-    default:
-      LOG(FATAL) << "Opcode " << inst->opcode()
-                 << " should not need an initializer.";
-  }
+  const HloInstruction* init_value = [&] {
+    switch (inst->opcode()) {
+      case HloOpcode::kSelectAndScatter:
+        return inst->operand(2);
+      case HloOpcode::kReduce:
+        return inst->operand(1);
+      default:
+        LOG(FATAL) << "Opcode " << inst->opcode()
+                   << " should not need an initializer.";
+    }
+  }();
 
   if (fused && init_value->opcode() == HloOpcode::kParameter) {
     init_value = hlo->operand(init_value->parameter_number());
   }
 
-  return EmitTargetElementLoopInThunk(
+  // In the common case, the initializer is a constant.  In this case, emit a
+  // device-memset call if we can.  Currently StreamExecutor only supports
+  // zeroing and 32-bit memsets.
+  if (init_value->IsConstant()) {
+    CHECK(ShapeUtil::IsScalar(init_value->shape()));
+    int64 num_bytes = ShapeUtil::ByteSizeOfElements(init_value->shape());
+    const auto& literal = init_value->literal();
+
+    // Are all the bytes of this scalar equal to 0?  If so, we can create a
+    // MemzeroThunk.
+    ArraySlice<uint8> literal_bytes(
+        reinterpret_cast<const uint8*>(literal.untyped_data()), num_bytes);
+    if (c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) {
+      return {MakeUnique<MemzeroThunk>(GetAllocationSlice(*hlo), hlo)};
+    }
+
+    // If the literal is 8 or 16 bits wide, we can emit a 32-bit memset by
+    // repeating the literal 4 or 2 times, so long as the destination buffer is
+    // an even multiple of 32 bits long.
+    if ((num_bytes == 1 || num_bytes == 2) &&
+        ShapeUtil::ByteSizeOf(hlo->shape()) % 4 == 0) {
+      uint16 pattern16;
+      if (num_bytes == 1) {
+        uint8 b = literal_bytes.front();
+        pattern16 = uint16{b} | (uint16{b} << 8);
+      } else {
+        pattern16 = literal_bytes.front();
+      }
+      uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16);
+      return {MakeUnique<Memset32BitValueThunk>(pattern32,
+                                                GetAllocationSlice(*hlo), hlo)};
+    }
+
+    // If the literal is an even multiple of 32 bits wide, we can emit a 32-bit
+    // memset so long as all 32-bit words of the scalar are equal to each other.
+    if (num_bytes >= 4 && num_bytes % 4 == 0 &&
+        memcmp(literal_bytes.data(), literal_bytes.data() + 4,
+               literal_bytes.size() - 4) == 0) {
+      uint32 word;
+      memcpy(&word, literal_bytes.data(), sizeof(word));
+      return {MakeUnique<Memset32BitValueThunk>(word, GetAllocationSlice(*hlo),
+                                                hlo)};
+    }
+  }
+
+  // Otherwise fall back to our slow initializer code.
+  std::unique_ptr<KernelThunk> kernel_thunk = BuildKernelThunk(hlo);
+  TF_RETURN_IF_ERROR(EmitTargetElementLoopInThunk(
       *hlo,
       [=](const llvm_ir::IrArray::Index& index) {
         return GetIrArray(*init_value, *hlo)
             .EmitReadArrayElement(index, &ir_builder_);
       },
-      thunk);
+      kernel_thunk.get()));
+
+  // Clean up state left behind by emitting the loop above.  (This is normally
+  // done in IrEmitterUnnested::Postprocess().)
+  bindings_.UnbindAllLocalIrValues();
+
+  // Convert unique_ptr<KernelThunk> to StatusOr<unique_ptr<Thunk>>.
+  return {std::move(kernel_thunk)};
 }
 
 namespace {
@@ -2452,21 +2505,28 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  int unroll_factor = thunk->unroll_factor();
   VLOG(3) << bindings_.ToString();
 
   const Shape& element_shape = hlo.IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo.shape(), {0})
                                    : hlo.shape();
+  VLOG(3) << "EmitTargetElementLoopInThunk "
+          << ShapeUtil::HumanStringWithLayout(hlo.shape())
+          << " for unroll_factor " << unroll_factor;
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      element_shape, ir_emitter_context_->device_description());
+      element_shape, ir_emitter_context_->device_description(), unroll_factor);
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
   if (!hlo.IsMultiOutputFusion()) {
     return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
-                               launch_dimensions, &ir_builder_)
+                               launch_dimensions, &ir_builder_, unroll_factor)
         .EmitLoop(IrName(&hlo));
   }
 
+  CHECK_EQ(unroll_factor, 1)
+      << "multi-output fusion does not support unrolling";
+
   // For multiple outputs fusion, we need to emit each operand and the root.
   std::vector<llvm_ir::IrArray> output_arrays;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b83a2337e2decd9d4fba3d40fcf33f131fca8a3c..b842f480c6257c1a8bee8cdac55e29c5db6801a0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -148,13 +148,12 @@ class IrEmitterUnnested : public IrEmitter {
       tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
       HloComputation* reducer);
 
-  // Emits code to initialize buffer of `inst` in given `thunk`.
-  Status EmitInitializer(const HloInstruction* inst, KernelThunk* thunk);
-
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
-  // Thunk object.
-  std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
+  // Thunk object. The kernel implementation will be unrolled if unroll_factor
+  // is greater than one.
+  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst,
+                                                int unroll_factor = 1);
 
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
@@ -163,6 +162,11 @@ class IrEmitterUnnested : public IrEmitter {
   // to make sure `inst` outlives the lifetime of the returned Thunk object.
   std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
 
+  // Returns a thunk that, given a reduce or select-and-scatter op, initializes
+  // its memory to the appropriate initial value.
+  StatusOr<std::unique_ptr<Thunk>> BuildInitializerThunk(
+      const HloInstruction* hlo);
+
   // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`.
   std::unique_ptr<Thunk> BuildHostToDeviceCopyThunk(const HloInstruction* inst);
 
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index c20a781a33fe89af4740ed31dd5bfb1a64473057..d376ef7a245eb9ed86939f44c611b6dde5606b23 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -23,17 +23,17 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
 KernelThunk::KernelThunk(
     tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-    const string& kernel_name, const HloInstruction* hlo_instruction)
+    const string& kernel_name, const HloInstruction* hlo_instruction,
+    int unroll_factor)
     : Thunk(Kind::kKernel, hlo_instruction),
       args_(args.begin(), args.end()),
-      kernel_name_(kernel_name) {}
+      kernel_name_(kernel_name),
+      unroll_factor_(unroll_factor) {}
 
 tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
   tensorflow::mutex_lock lock(mutex_);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index 9ae455e2fcc253a7a08ff95764721048a16b0bf7..b556befe66b6bebba1a958f553f0a9b2c4eebbe4 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -47,20 +47,21 @@ class KernelThunk : public Thunk {
   //
   // `hlo_instruction` is as in Thunk. Other arguments are as the class members.
   KernelThunk(tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-              const string& kernel_name, const HloInstruction* hlo_instruction);
+              const string& kernel_name, const HloInstruction* hlo_instruction,
+              int unroll_factor);
   KernelThunk(const KernelThunk&) = delete;
   KernelThunk& operator=(const KernelThunk&) = delete;
   ~KernelThunk() override = default;
 
   const string& kernel_name() const { return kernel_name_; }
+  int unroll_factor() const { return unroll_factor_; }
   void SetLaunchDimensions(const LaunchDimensions& launch_dims);
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
 
   // Executes the kernel for the thunk on "stream", which must be non-null.
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   // Buffers passed to the kernel as arguments.
@@ -69,6 +70,10 @@ class KernelThunk : public Thunk {
   // Entry kernel name for the computation.
   const string kernel_name_;
 
+  // The number of times this kernel should be unrolled. This works as a
+  // multiplier on the number of elements produced by a GPU thread.
+  const int unroll_factor_;
+
   // The thread and block dimension used to launch the kernel.
   // Will be set by IrEmitterUnnested.
   LaunchDimensions launch_dimensions_;
@@ -76,13 +81,11 @@ class KernelThunk : public Thunk {
   // Describes how to load this kernel. ExecuteOnStream reuses this loader
   // specification for all executions.
   mutable tensorflow::mutex mutex_;
-  std::unique_ptr<perftools::gputools::MultiKernelLoaderSpec> loader_spec_
-      GUARDED_BY(mutex_);
+  std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);
 
   // Loaded kernels for each `StreamExecutor`
-  std::unordered_map<perftools::gputools::StreamExecutor*,
-                     perftools::gputools::KernelBase>
-      kernel_cache_ GUARDED_BY(mutex_);
+  std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
+      GUARDED_BY(mutex_);
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index f4c4dcdafd6cc0cd64da5a8d1f23c8c0e7b2a9cb..86c4ac18b0501c38aaaae5a007bddcf261ca338f 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -68,17 +68,3 @@ tf_cc_test(
         "@llvm//:support",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index defd281d74bd38f7da3f268e0f55970fc1af8263..d70cb07c57d48c0faed2cdc5ea9fc5ce5fb32be0 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
@@ -491,7 +491,7 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
 
   string ptx;
   {
-    tensorflow::port::Tracing::TraceMe annotation(
+    tensorflow::tracing::ScopedActivity activity(
         "Compiling IR", llvm_ir::AsString(module->getName()),
         /*is_expensive=*/true);
     XLA_SCOPED_LOGGING_TIMER("Compile module " +
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4100a898b5bb9eec382c34932c2db104c9e985b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/memset_thunk.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace xla {
+namespace gpu {
+
+Status MemzeroThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);
+  stream->ThenMemZero(&dest_data, dest_data.size());
+  return Status::OK();
+}
+
+Status Memset32BitValueThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);
+  stream->ThenMemset32(&dest_data, value_, dest_data.size());
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..51c332d287d139335b356fc66411b5ffaa448b5a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+// This file contains thunks that set a buffer's elements to a particular value.
+// This can be faster than emitting a kernel to set the elements.
+
+namespace xla {
+namespace gpu {
+
+// Thunk that zeroes out a given chunk of memory.
+class MemzeroThunk : public Thunk {
+ public:
+  explicit MemzeroThunk(const BufferAllocation::Slice& dest,
+                        const HloInstruction* hlo)
+      : Thunk(Kind::kMemzero, hlo), dest_(dest) {}
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
+
+ private:
+  const BufferAllocation::Slice dest_;
+};
+
+// Thunk that sets a given chunk of memory to a particular 32-bit value.  The
+// destination chunk must have size divisible by 32 bits.
+class Memset32BitValueThunk : public Thunk {
+ public:
+  explicit Memset32BitValueThunk(uint32 value,
+                                 const BufferAllocation::Slice& dest,
+                                 const HloInstruction* hlo)
+      : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
+
+ private:
+  uint32 value_;
+  const BufferAllocation::Slice dest_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 25846dc6cd4633c7becb6e62d6bc9585348a6eac..7bda4e2fcd469bd430e5ef1846251c8504225383 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -68,13 +69,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     HloInstruction* padding =
         computation->AddInstruction(HloInstruction::CreateConstant(
             MakeUnique<Literal>(Literal::Zero(element_type))));
-    input = computation->AddInstruction(HloInstruction::CreatePad(
-        ShapeInference::InferPadShape(
-            /*operand_shape=*/input->shape(),
-            /*padding_value_shape=*/ShapeUtil::MakeShape(element_type, {}),
-            padding_config)
-            .ConsumeValueOrDie(),
-        input, padding, padding_config));
+    input = MakePadHlo(input, padding, padding_config).ValueOrDie();
   }
 
   if (window_util::HasNegativePadding(conv_window)) {
@@ -97,11 +92,8 @@ HloInstruction* MaybePaddedAndSlicedInput(
           std::max<int64>(0LL, -conv_window.dimensions(i).padding_high());
     }
 
-    input = computation->AddInstruction(HloInstruction::CreateSlice(
-        ShapeInference::InferSliceShape(input->shape(), start_indices,
-                                        limit_indices, strides)
-            .ConsumeValueOrDie(),
-        input, start_indices, limit_indices, strides));
+    input =
+        MakeSliceHlo(input, start_indices, limit_indices, strides).ValueOrDie();
   }
 
   return input;
@@ -134,13 +126,7 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window,
   HloInstruction* padding =
       computation->AddInstruction(HloInstruction::CreateConstant(
           MakeUnique<Literal>(Literal::Zero(element_type))));
-  return computation->AddInstruction(HloInstruction::CreatePad(
-      ShapeInference::InferPadShape(
-          /*operand_shape=*/kernel->shape(),
-          /*padding_value_shape=*/ShapeUtil::MakeShape(element_type, {}),
-          padding_config)
-          .ConsumeValueOrDie(),
-      kernel, padding, padding_config));
+  return MakePadHlo(kernel, padding, padding_config).ValueOrDie();
 }
 }  // namespace
 
@@ -252,11 +238,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
       computation->AddInstruction(HloInstruction::CreateConstant(
           MakeUnique<Literal>(Literal::Zero(input->shape().element_type()))));
   HloInstruction* padded_input =
-      computation->AddInstruction(HloInstruction::CreatePad(
-          ShapeInference::InferPadShape(input->shape(), padding->shape(),
-                                        input_padding_config)
-              .ConsumeValueOrDie(),
-          input, padding, input_padding_config));
+      MakePadHlo(input, padding, input_padding_config).ValueOrDie();
 
   // The shape of the backward_conv CustomCall is a tuple (conv_result,
   // scratch_buffer).  Extract out the shape of conv_result.
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 388dcc008b07a76ff9ed07df04181e49a8734f51..d8c07dc3119fb81a3ef22822acb11b7c4d5bbca5 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -32,25 +32,32 @@ namespace gpu {
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     BodyEmitter body_emitter, const Shape& shape,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(body_emitter, shape, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_arrays, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_array, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   // Emit the following code in LLVM IR:
   //   linear_index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -63,6 +70,9 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   //   "It is guaranteed that [...] 0  <=  %ctaid.x <  %nctaid.x"
   //
   // %nctaid.x is currently specified as 2147483647.
+  VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_;
+  std::vector<llvm_ir::IrArray::Index> array_indices;
+
   llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_);
   llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(),
@@ -81,7 +91,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   thread_id = ir_builder_->CreateZExt(thread_id, ir_builder_->getInt64Ty(),
                                       "thread_id");
 
-  llvm::Value* linear_index = ir_builder_->CreateAdd(
+  llvm::Value* linear_index_base = ir_builder_->CreateAdd(
       ir_builder_->CreateMul(
           block_id,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block()), "",
@@ -99,15 +109,30 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::assume,
       {ir_builder_->CreateICmpULT(
-          linear_index,
+          linear_index_base,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block() *
                                 launch_dimensions_.block_count()),
           "linear_index_in_range")},
       {}, ir_builder_);
 
+  if (unroll_factor_ > 1) {
+    linear_index_base = ir_builder_->CreateMul(
+        linear_index_base, ir_builder_->getInt64(unroll_factor_),
+        "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true);
+  }
+
+  array_indices.emplace_back(linear_index_base, shape_, ir_builder_);
+  for (int i = 1; i < unroll_factor_; ++i) {
+    llvm::Value* linear_index = ir_builder_->CreateAdd(
+        linear_index_base, ir_builder_->getInt64(i), "linear_index",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    array_indices.emplace_back(linear_index, shape_, ir_builder_);
+  }
+
   auto if_in_bounds = llvm_ir::EmitIfThenElse(
       ir_builder_->CreateICmpULT(
-          linear_index, ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
+          linear_index_base,
+          ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
       llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false);
 
   // Set exit_bb_ to the exit block of the if structure.
@@ -116,7 +141,8 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
 
   // Set IR builder insertion point to the body of the if structure.
   llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
-  return llvm_ir::IrArray::Index(linear_index, shape_, ir_builder_);
+
+  return array_indices;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 8ed63a854a74fc06c3c389f40fe1f5970885deac..25318b3bed8bf4a2dfe3a4a974269d0405c3bfec 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -34,13 +34,13 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   // The meanings of other parameters are the same as LoopEmitter.
   ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
   // Constructs a ParallelLoopEmitter from an element generator that generates
   // each element of the given target array.
   ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                       const llvm_ir::IrArray& target_array,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
 
   // Constructs a loop emitter for a loop that generates on element of each of N
   // arrays on each iteration.
@@ -50,18 +50,20 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder);
+      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+      int unroll_factor = 1);
 
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
   // The thread and block dimension to parallelize the loop on.
   const LaunchDimensions launch_dimensions_;
+  const int unroll_factor_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 6cf280df05496716a0780d61ded92efd9982734c..d3fd0544fb68809125e9b9f7a5e5b7eff8c6ef43 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
@@ -44,12 +42,16 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc) {
+    const Shape& shape, const se::DeviceDescription& device_desc,
+    int unroll_factor) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
+  CHECK_EQ(num_elements % unroll_factor, 0);
+  num_elements = num_elements / unroll_factor;
+
   // Since we don't do any inter-warp communication, we're free to choose any
   // block size we want, subject to hardware constraints.  We choose the
   // smallest block size that allows the GPU to reach full occupancy (assuming
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 0bf463a6ef95d5a32784838c08ad239752fd1acf..c125474edb1036090a926020f2b1e7fcf64c751a 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -57,8 +57,8 @@ std::ostream& operator<<(std::ostream& out,
                          const LaunchDimensions& launch_dims);
 
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc);
+    const Shape& shape, const se::DeviceDescription& device_desc,
+    int unroll_factor = 1);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
index d8a43091d4037a0edd125a4a1b6cb5ad7c7065f0..c8510808f10a731af90154447bd3e1e037db6348 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
@@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize(
 }
 
 tensorflow::Status SequentialThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (const auto& thunk : thunks_) {
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
   }
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
index 32c5b748aba14239d6795d14e442c1c3b43d010e..df17b8d67b80321c7088243eae46e7a723b4ede9 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
@@ -40,8 +40,7 @@ class SequentialThunk : public Thunk {
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   // The list of sub-thunks.
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 2c3032d79be221e8cacb178ffb1817459b603cc0..a0c785ed913109e987d058124c8ef49019c98500 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -51,6 +51,8 @@ class Thunk {
     kGemm,
     kInfeed,
     kKernel,
+    kMemset32BitValue,
+    kMemzero,
     kSequential,
     kTuple,
     kWhile,
@@ -83,8 +85,7 @@ class Thunk {
   // This value is not required to be constant for a given Thunk.  For example,
   // a Thunk that performs autotuning may return true for its first run and
   // false thereafter.
-  virtual bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream* /*stream*/) {
+  virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
     return false;
   }
 
@@ -102,8 +103,7 @@ class Thunk {
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
   virtual tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) = 0;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;
 
  private:
   Kind kind_;
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
index bd65e72393a59e72671ff0cc32c37eaa48856255..ecb54857ccc40ead21e5a18d79a37b545680021d 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
@@ -17,8 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/util.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
index 3b1a496328540ae69a449e7080903d31284885d1..8b459c29a136a6e7853e68a1bead7d12c0d08ad0 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
@@ -46,8 +46,7 @@ class TupleThunk : public Thunk {
   TupleThunk& operator=(const TupleThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index c21559af6d2e5dfb5aaf62afcdcaed514e0914c9..a9f3d619a3ffd6d849572355e2902375e43508fa 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) {
 }
 
 Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                                   perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase condition_result_data =
+                                   se::Stream* stream) {
+  se::DeviceMemoryBase condition_result_data =
       buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);
 
   while (true) {
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h
index 4c9f45de9e42494df58706d0a4a3eb0c4220b8b8..e589ca78a7ea00e7592d6e09ead9c270f902702f 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h
@@ -47,7 +47,7 @@ class WhileThunk : public Thunk {
 
   Status Initialize(const GpuExecutable& executable) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice condition_result_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
index 05017008e2ddbe0b9e78d06275fdec5d08d94bfa..acf661148699dab18916e3065ee647d37fda6208 100644
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ b/tensorflow/compiler/xla/service/graphviz_example.cc
@@ -82,7 +82,8 @@ HloComputation* CallForwardingComputation(HloComputation* computation,
 // instructions. Sets the computation as the entry to an HLO module and returns
 // the module.
 std::unique_ptr<HloModule> MakeBigGraph() {
-  auto module = MakeUnique<HloModule>("BigGraph");
+  HloModuleConfig config;
+  auto module = MakeUnique<HloModule>("BigGraph", config);
 
   auto builder = HloComputation::Builder("TestBigGraphvizGraph");
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index a2d13c013c56059148ccd04dba2137a5b2badc42..3dd4c4a0794e5c41b877078c4e69c6c9584ce6c0 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -27,38 +27,6 @@ namespace xla {
 using tensorflow::gtl::FlatMap;
 using tensorflow::gtl::FlatSet;
 
-namespace {
-
-// Returns the set of buffers that may be sources of all operands of the given
-// instruction.  The returned buffers are guaranteed to have no duplicates, and
-// to be sorted in a deterministic order.
-std::vector<const LogicalBuffer*> UniqueOperandSourceBuffers(
-    const HloInstruction* instruction,
-    const TuplePointsToAnalysis& points_to_analysis) {
-  std::vector<const LogicalBuffer*> buffers;
-  for (const HloInstruction* operand : instruction->operands()) {
-    points_to_analysis.GetPointsToSet(operand).ForEachElement(
-        [&](const ShapeIndex& /*index*/,
-            const PointsToSet::BufferList& points_to) {
-          buffers.insert(buffers.end(), points_to.begin(), points_to.end());
-        });
-  }
-
-  // Sort and then remove duplicates from buffers.
-  std::sort(buffers.begin(), buffers.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
-  buffers.erase(std::unique(buffers.begin(), buffers.end(),
-                            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-                              return a->id() == b->id();
-                            }),
-                buffers.end());
-  return buffers;
-}
-
-}  // namespace
-
 /*static*/
 StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
@@ -93,6 +61,7 @@ Status HeapSimulator::RunComputation(
     const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
     const TuplePointsToAnalysis& points_to_analysis) {
+  VLOG(3) << "Computation:\n" << computation.ToString();
   // The goal here is to minimize memory usage, assuming the given sequential
   // ordering of instructions.  The strategy is to walk through the instruction
   // sequence, calling Alloc and Free on the underlying heap algorithm.  The
@@ -101,7 +70,51 @@ Status HeapSimulator::RunComputation(
   // 'live_buffers' tracks the liveness of each buffer that we assign, by
   // associating it with a set of HloInstructions that need to be visited.  When
   // the set becomes empty, the buffer is no longer used, and can be freed.
+  // 'used_buffers' is the reverse map - it tracks which buffers were used by an
+  // instruction, so that we can remove the instructions from a buffer's live
+  // set after they are visited.
   FlatMap<const LogicalBuffer*, FlatSet<const HloInstruction*>> live_buffers;
+  FlatMap<const HloInstruction*, FlatSet<const LogicalBuffer*>> used_buffers;
+  auto add_user_to_buffer = [this, &live_buffers, &used_buffers](
+                                const HloInstruction* user,
+                                const LogicalBuffer* buffer) {
+    if (!IgnoreBuffer(buffer)) {
+      VLOG(4) << "  Adding user " << user->name() << " to buffer "
+              << buffer->ToString();
+      live_buffers[buffer].insert(user);
+      used_buffers[user].insert(buffer);
+    }
+  };
+
+  // Initialize live_buffers for each buffer that we're going to assign.  The
+  // set of instructions that need to be visited contains all users of all
+  // aliases, that is, all users of all instructions that have the buffer
+  // contained in their points-to set.
+  for (const HloInstruction* instruction : instruction_sequence) {
+    const PointsToSet& points_to =
+        points_to_analysis.GetPointsToSet(instruction);
+    const PointsToSet::BufferSet& buffer_set = points_to.CreateFlattenedSet();
+    for (const HloInstruction* user : instruction->users()) {
+      if (user->opcode() != HloOpcode::kGetTupleElement) {
+        for (const LogicalBuffer* buffer : buffer_set) {
+          add_user_to_buffer(user, buffer);
+        }
+      } else {
+        // A GetTupleElement doesn't need to keep all of its operand's buffers
+        // alive. It only needs the buffers that relate to the element its
+        // extracting, and the tuple it's extracting from, but not the buffers
+        // for the other elements.
+        for (const LogicalBuffer* buffer : points_to.element({})) {
+          add_user_to_buffer(user, buffer);
+        }
+        const PointsToSet& gte_points_to =
+            points_to_analysis.GetPointsToSet(user);
+        for (const LogicalBuffer* buffer : gte_points_to.CreateFlattenedSet()) {
+          add_user_to_buffer(user, buffer);
+        }
+      }
+    }
+  }
 
   const HloInstruction* root = computation.root_instruction();
   auto output_source_buffers =
@@ -114,34 +127,17 @@ Status HeapSimulator::RunComputation(
         buffers_defined_by_instruction =
             points_to_analysis.GetBuffersDefinedByInstruction(instruction);
 
-    // Initialize live_buffers for each buffer that we're going to assign.  The
-    // set of instructions that need to be visited contains all users of all
-    // aliases.  The alias itself is not necessary; if it has users, the users
-    // are necessarily scheduled after the alias.  And if it has no users, it is
-    // either a dead value or an output, both of which are handled below.
-    //
-    // We ignore control dependencies here. The reasoning is that the control
-    // dependencies have already been accounted for in the ordering of the given
-    // 'instruction_sequence', and should not otherwise artificially extend the
-    // lifetime of buffers that aren't already connected by a data dependency.
+    VLOG(3) << "Instruction: " << instruction->ToString();
+    for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
+      VLOG(4) << "  Defines: " << buffer->ToString()
+              << (IgnoreBuffer(buffer) ? " (Ignored)" : "");
+    }
+
     dead_buffers_to_free.clear();
     for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
       if (IgnoreBuffer(buffer)) {
         continue;
       }
-      FlatSet<const HloInstruction*>* live_set = nullptr;
-      for (const BufferAlias& alias :
-           points_to_analysis.GetBufferAliases(*buffer)) {
-        const std::vector<HloInstruction*>& users =
-            alias.instruction()->users();
-        if (!users.empty()) {
-          if (live_set == nullptr) {
-            live_set = &live_buffers[buffer];
-          }
-          live_set->insert(users.begin(), users.end());
-        }
-      }
-
       // Add a nullptr sentry to ensure entry parameters and output source
       // buffers are not freed until the very end.
       const bool entry_parameter =
@@ -165,11 +161,12 @@ Status HeapSimulator::RunComputation(
     // have no instructions left to visit are moved from live_buffers to
     // operand_buffers_to_free.
     operand_buffers_to_free.clear();
-    for (const LogicalBuffer* operand_buffer :
-         UniqueOperandSourceBuffers(instruction, points_to_analysis)) {
+    for (const LogicalBuffer* operand_buffer : used_buffers[instruction]) {
       if (IgnoreBuffer(operand_buffer)) {
         continue;
       }
+      VLOG(4) << "  Removing user " << instruction->name() << " from buffer "
+              << operand_buffer->ToString();
       auto it = live_buffers.find(operand_buffer);
       FlatSet<const HloInstruction*>* live_set = &it->second;
       live_set->erase(instruction);
@@ -178,6 +175,11 @@ Status HeapSimulator::RunComputation(
         operand_buffers_to_free.push_back(operand_buffer);
       }
     }
+    // Sort to get a deterministic iteration order.
+    std::sort(operand_buffers_to_free.begin(), operand_buffers_to_free.end(),
+              [](const LogicalBuffer* x, const LogicalBuffer* y) {
+                return x->id() < y->id();
+              });
 
     // Allocate buffers defined by this instruction.  This is the latest point
     // that we can allocate; right before the buffer is first used.  This must
@@ -203,6 +205,8 @@ Status HeapSimulator::RunComputation(
               CanShareOperandBufferWithUser(
                   operand_buffer->instruction(), operand_buffer->index(),
                   buffer->instruction(), buffer->index(), points_to_analysis)) {
+            VLOG(3) << "  Sharing: " << buffer->ToString() << " with "
+                    << operand_buffer->ToString();
             ShareBuffer(buffer, operand_buffer, instruction);
             shared = true;
             break;
@@ -211,6 +215,7 @@ Status HeapSimulator::RunComputation(
       }
 
       if (!shared) {
+        VLOG(3) << "  Allocating: " << buffer->ToString();
         Alloc(buffer, instruction);
       }
     }
@@ -244,20 +249,34 @@ Status HeapSimulator::RunComputation(
     // Free buffers that are no longer live.  This is the earliest point that we
     // can de-allocate; right after the last use of the buffer.
     for (const LogicalBuffer* buffer : dead_buffers_to_free) {
+      VLOG(3) << "  Freeing dead: " << buffer->ToString();
       Free(buffer, instruction);
     }
     for (const LogicalBuffer* buffer : operand_buffers_to_free) {
+      VLOG(3) << "  Freeing operand: " << buffer->ToString();
       Free(buffer, instruction);
     }
   }
 
   // Any remaining live buffers must be entry parameters or output source
-  // buffers, which had a nullptr sentry added.  Free them now.
+  // buffers, which had a nullptr sentry added.  Free them now, in a
+  // deterministic order.
+  std::vector<const LogicalBuffer*> to_free;
+  to_free.reserve(live_buffers.size());
   for (const auto& buffer_pending : live_buffers) {
     const LogicalBuffer* buffer = buffer_pending.first;
     const FlatSet<const HloInstruction*>& pending = buffer_pending.second;
     CHECK_EQ(pending.size(), 1) << *buffer;
     CHECK(*pending.begin() == nullptr) << *buffer;
+    to_free.push_back(buffer);
+  }
+
+  std::sort(to_free.begin(), to_free.end(),
+            [](const LogicalBuffer* x, const LogicalBuffer* y) {
+              return x->id() < y->id();
+            });
+  for (const LogicalBuffer* buffer : to_free) {
+    VLOG(3) << "Freeing pending: " << buffer->ToString();
     Free(buffer, root);
   }
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 387b649a731ebcbfd8307807469f39f22d192b06..e983fd11d4eefc2c4b3ba910a7d9ed50d637c2a3 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -76,7 +76,8 @@ class HeapSimulatorTracker {
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
       const std::vector<const HloInstruction*>& instruction_sequence) {
-    module_ = MakeUnique<HloModule>(name);
+    HloModuleConfig config;
+    module_ = MakeUnique<HloModule>(name, config);
     module_->AddEntryComputation(std::move(computation));
     points_to_analysis_ =
         TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
@@ -94,7 +95,8 @@ class HeapSimulatorTracker {
   }
 
   explicit HeapSimulatorTracker(const string& name) {
-    module_ = MakeUnique<HloModule>(name);
+    HloModuleConfig config;
+    module_ = MakeUnique<HloModule>(name, config);
   }
 
   // Similar to the single entry computation constructor above, but runs the
@@ -410,6 +412,56 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
   });
 }
 
+TEST_F(HeapSimulatorTest, IndependentTupleElements) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramB = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32scalar_, "paramB"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32scalar_, HloOpcode::kMultiply, paramA, paramB));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32scalar_, HloOpcode::kAdd, paramA, paramB));
+  auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({mul, add}));
+  auto element0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32scalar_, tuple, 0));
+  auto broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(f32vec4_, element0, {0}));
+  auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32scalar_, HloOpcode::kSubtract, paramA, paramB));
+  auto element1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32scalar_, tuple, 1));
+  auto output = builder.AddInstruction(
+      HloInstruction::CreateTuple({broadcast, sub, element1}));
+
+  HeapSimulatorTracker tracker(TestName(), builder.Build(),
+                               {paramA, paramB, mul, add, tuple, element0,
+                                broadcast, sub, element1, output});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramB, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(add, {})},
+      {kAlloc, tracker.BufferAt(tuple, {})},
+      {kAlloc, tracker.BufferAt(broadcast, {})},
+      // The mul can be freed right after the broadcast happens, even though
+      // The other GetTupleElement is still alive.
+      {kFree, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(sub, {})},
+      // The temporary tuple is now dead.
+      {kFree, tracker.BufferAt(tuple, {})},
+      {kAlloc, tracker.BufferAt(output, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramB, {})},
+      {kFree, tracker.BufferAt(add, {})},
+      {kFree, tracker.BufferAt(broadcast, {})},
+      {kFree, tracker.BufferAt(sub, {})},
+      {kFree, tracker.BufferAt(output, {})},
+      {kFinish, nullptr},
+  });
+}
+
 TEST_F(HeapSimulatorTest, WholeModule) {
   HeapSimulatorTracker tracker(TestName());
 
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index a43785b4a9701369ae315f67d4d64d03dc6c081d..aa6860880b7a1308d3ecabb52318daa7d2852af2 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -13,13 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// DO NOT USE THESE PROTO MESSAGES FOR ANYTHING OTHER THAN DEBUGGING.
-//
-// Don't use these protos in the real compilation or execution codepaths. The
-// data format is meant for debugging only, and may change without notice.
+// This proto file defines messages which represent the HLO module. This is a
+// full fidelity serialization of the c++ HLO constructs.
 //
 // Many of the protos below are simple 1-to-1 serializations of the
-// corresponding C++ classes.
+// corresponding C++ classes, e.g., HloModule, HloComputation, and
+// HloInstruction.
 //
 // FIELD NAMES ARE IMPORTANT
 //
@@ -38,16 +37,19 @@ option cc_enable_arenas = true;
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
+  reserved 12;
+  reserved "fused_instructions_computation";
+  reserved 4;
+  reserved "operand_names";
+  reserved 5;
+  reserved "control_predecessor_names";
+  reserved 6;
+  reserved "called_computation_names";
 
   string name = 1;
   string opcode = 2;
   xla.Shape shape = 3;
 
-  // TODO(b/67782397): Replace instruction names with HloInstruction ids.
-  repeated string operand_names = 4;
-  repeated string control_predecessor_names = 5;
-  repeated string called_computation_names = 6;
-
   xla.OpMetadata metadata = 7;
 
   // Literal, only present for kConstant.
@@ -58,7 +60,6 @@ message HloInstructionProto {
 
   // Fusion state, only present for kFusion.
   string fusion_kind = 11;
-  HloComputationProto fused_instructions_computation = 12;
 
   // Index for kGetTupleElement.
   int64 tuple_index = 13;
@@ -133,28 +134,57 @@ message HloInstructionProto {
   // Gather dimension numbers.
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
   repeated int64 gather_window_bounds = 34;
+
+  // Compute Host.
+  string channel_name = 41;
+  int64 cost_estimate_ns = 42;
+
+  // The id of this instruction.
+  int64 id = 35;
+
+  repeated int64 operand_ids = 36;
+  repeated int64 control_predecessor_ids = 37;
+  repeated int64 called_computation_ids = 38;
+
+  xla.OpSharding sharding = 40;
 }
 
 // Serialization of HloComputation.
 message HloComputationProto {
+  reserved 3;
+  reserved "root_name";
+
   string name = 1;
 
   // The array of instructions is always in a valid dependency order, where
   // operands appear before their users.
   repeated HloInstructionProto instructions = 2;
 
-  // The name of the root of the computation.
-  string root_name = 3;
+  // The program shape (with layout) of this computation.
+  xla.ProgramShape program_shape = 4;
+
+  // The id of this computation.
+  int64 id = 5;
+
+  // The id of the root of the computation.
+  int64 root_id = 6;
 }
 
 // Serialization of HloModule.
 message HloModuleProto {
   string name = 1;
   string entry_computation_name = 2;
+  int64 entry_computation_id = 6;
 
   // The array of computations is always in a valid dependency order, where
   // callees appear before their callers.
   repeated HloComputationProto computations = 3;
+
+  // The program shape (with layout) of the entry computation.
+  xla.ProgramShape program_shape = 4;
+
+  // The id of this module.
+  int64 id = 5;
 }
 
 // Serialization of HloOrdering.
@@ -266,3 +296,20 @@ message HloProto {
   HloOrderingProto hlo_ordering = 2;
   BufferAssignmentProto buffer_assignment = 3;
 }
+
+// Encapsulates HloProto together with the arguments, result, and
+// execution_platform. This message is used for purposes such as
+// analysis/replay/file-storage.
+message HloSnapshot {
+  // The hlo graph.
+  HloProto hlo = 1;
+
+  // The arguments passed to the graph.
+  repeated LiteralProto arguments = 2;
+
+  // The result of the graph.
+  LiteralProto result = 3;
+
+  // The name of the platform used to run the graph.
+  string execution_platform = 4;
+}
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 30e32a46d7dd0923f738939c33407ac7484b5bbe..a88283ed9a6459b4fa9310e160b59c77d51f1027 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -171,24 +171,21 @@ class BufferValueMap {
     return value_to_buffer_number_.at(&value);
   }
 
-  // Compute and return a vector of buffers that the given value must be
-  // contained in due to HLO aliasing rules.
-  std::vector<BufferNumber> ComputeAliasedBuffers(const HloValue& value) {
+  void ComputeWhileAliasedBuffers(const HloValue& value,
+                                  std::vector<BufferNumber>* aliased_buffers) {
+    VLOG(3) << "Compute kWhile aliases";
     // Value is init of a while (use is while).
-    std::vector<BufferNumber> aliased_buffers;
     for (const HloUse& use : value.uses()) {
-      VLOG(2) << "use of value " << value.ToShortString() << ": " << use;
       if (use.instruction->opcode() == HloOpcode::kWhile) {
         // Determine the while value that this shares a buffer with.
         const HloValue& while_value =
             dataflow_.GetUniqueValueAt(use.instruction, use.operand_index);
-        aliased_buffers.push_back(GetBufferForValue(while_value));
+        aliased_buffers->push_back(GetBufferForValue(while_value));
         VLOG(3) << "  value is init value to a while; must share buffer with "
                    "while value "
                 << while_value.ToShortString();
       }
     }
-
     // Value is a parameter of a while body/condition.
     if (value.defining_instruction()->opcode() == HloOpcode::kParameter) {
       const HloComputation* computation =
@@ -205,11 +202,10 @@ class BufferValueMap {
           VLOG(3) << "  value is parameter value of the body or condition of a "
                      "while; must share buffer with while value "
                   << while_value.ToShortString();
-          aliased_buffers.push_back(GetBufferForValue(while_value));
+          aliased_buffers->push_back(GetBufferForValue(while_value));
         }
       }
     }
-
     // Value is the root of a while body.
     for (const HloPosition& position : value.positions()) {
       const HloComputation* computation = position.instruction->parent();
@@ -224,27 +220,71 @@ class BufferValueMap {
 
             const HloValue& while_value = dataflow_.GetUniqueValueAt(
                 callsite.instruction(), position.index);
-            VLOG(3) << "  value is root the body computation of a while; must "
-                       "share buffer with while value "
+            VLOG(3) << "  value @ " << position << " is root of "
+                    << callsite.instruction()->name()
+                    << "; body root and while value root must share buffer "
+                       "among them : "
                     << while_value.ToShortString();
-            aliased_buffers.push_back(GetBufferForValue(while_value));
+            aliased_buffers->push_back(GetBufferForValue(while_value));
           }
         }
       }
     }
-
     // Value is the output of the while instruction itself.
     if (value.defining_instruction()->opcode() == HloOpcode::kWhile) {
       VLOG(3) << "  value is output of a while instruction";
-      aliased_buffers.push_back(GetBufferForValue(value));
+      aliased_buffers->push_back(GetBufferForValue(value));
+    }
+  }
+
+  void ComputeConditionalAliasedBuffers(
+      const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
+    VLOG(3) << "Compute kConditional aliases";
+    // Aliases the buffers of the true/false computations roots, with the one of
+    // the conditional.
+    for (const HloPosition& position : value.positions()) {
+      const HloComputation* computation = position.instruction->parent();
+      const CallGraphNode& call_graph_node =
+          dataflow_.call_graph().GetNode(computation);
+      if (position.instruction == computation->root_instruction()) {
+        for (const CallSite& callsite : call_graph_node.caller_callsites()) {
+          if (callsite.instruction()->opcode() == HloOpcode::kConditional) {
+            // Call graph must have been flattened.
+            CHECK_EQ(call_graph_node.caller_callsites().size(), 1);
+
+            const HloValue& cond_value = dataflow_.GetUniqueValueAt(
+                callsite.instruction(), position.index);
+            VLOG(3)
+                << "  value @ " << position << " is root of "
+                << callsite.instruction()->name()
+                << "; true/false branch roots must share buffer among them : "
+                << cond_value.ToShortString();
+            aliased_buffers->push_back(GetBufferForValue(cond_value));
+          }
+        }
+      }
+    }
+    // Value is the output of the conditional instruction itself.
+    if (value.defining_instruction()->opcode() == HloOpcode::kConditional) {
+      VLOG(3) << "  value is output of a conditional instruction";
+      aliased_buffers->push_back(GetBufferForValue(value));
     }
+  }
 
+  // Compute and return a vector of buffers that the given value must be
+  // contained in due to HLO aliasing rules.
+  std::vector<BufferNumber> ComputeAliasedBuffers(const HloValue& value) {
+    for (const HloUse& use : value.uses()) {
+      VLOG(2) << "Use of value " << value.ToShortString() << ": " << use;
+    }
+    std::vector<BufferNumber> aliased_buffers;
+    ComputeWhileAliasedBuffers(value, &aliased_buffers);
+    ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
     std::sort(aliased_buffers.begin(), aliased_buffers.end());
     aliased_buffers.erase(
         std::unique(aliased_buffers.begin(), aliased_buffers.end()),
         aliased_buffers.end());
-
     return aliased_buffers;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 21e6b2ca730f6347af902097e6496826b861e8a3..594413e88fb26e86b198d08b2e4db77fad671348 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -65,6 +65,7 @@ HloComputation::HloComputation(
     std::vector<std::unique_ptr<HloInstruction>>* instructions,
     HloInstruction* root_instruction, HloInstruction* fusion_instruction)
     : name_(name),
+      unique_id_(-1),
       root_instruction_(root_instruction),
       fusion_instruction_(fusion_instruction) {
   param_instructions_.resize(parameter_count, nullptr);
@@ -101,7 +102,7 @@ HloInstruction* HloComputation::AddInstructionInternal(
     instruction->UniquifyName(&parent()->instruction_name_uniquer());
     instruction->SetUniqueId(parent()->NewUniqueInstructionId());
   }
-  Reparent(instruction.get());
+  instruction->set_parent(this);
   HloInstruction* pinst = instruction.get();
   instruction_iterators_[pinst] =
       instructions_.insert(instructions_.end(), std::move(instruction));
@@ -158,10 +159,6 @@ Status HloComputation::RemoveParameter(int64 param_no) {
   return Status::OK();
 }
 
-void HloComputation::Reparent(HloInstruction* instruction) {
-  instruction->set_parent(this);
-}
-
 bool HloComputation::IsRemovable(const HloInstruction* instruction) {
   // If the instruction has control predecessors or successors then we cannot
   // remove the instruction without violating ordering constraints (added, for
@@ -307,19 +304,15 @@ void ComputeComputationPostOrder(
     HloComputation* computation,
     tensorflow::gtl::FlatSet<HloComputation*>* visited,
     std::list<HloComputation*>* post_order) {
-  if (visited->count(computation) > 0) {
-    return;
-  }
-
-  for (auto* instruction : computation->instructions()) {
-    for (HloComputation* called_computation :
-         instruction->called_computations()) {
-      ComputeComputationPostOrder(called_computation, visited, post_order);
+  if (visited->insert(computation).second) {
+    for (auto* instruction : computation->instructions()) {
+      for (HloComputation* called_computation :
+           instruction->called_computations()) {
+        ComputeComputationPostOrder(called_computation, visited, post_order);
+      }
     }
+    post_order->push_back(computation);
   }
-
-  visited->insert(computation);
-  post_order->push_back(computation);
 }
 
 }  // namespace
@@ -393,43 +386,46 @@ string HloComputation::ToString(const HloPrintOptions& options) const {
 
 HloComputationProto HloComputation::ToProto() const {
   HloComputationProto proto;
+  CHECK(unique_id_ != -1)
+      << "This computation does not have a valid id. Please make sure the "
+         "computation is inside a module before dumping it.";
+  proto.set_id(unique_id_);
   proto.set_name(name_);
   for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
     HloInstructionProto instruction_proto = instruction->ToProto();
     proto.add_instructions()->Swap(&instruction_proto);
   }
-  proto.set_root_name(root_instruction()->name());
+  proto.set_root_id(root_instruction()->unique_id());
+  *proto.mutable_program_shape() = ComputeProgramShape();
   return proto;
 }
 
 /* static */ StatusOr<std::unique_ptr<HloComputation>>
 HloComputation::CreateFromProto(
     HloModule* module, const HloComputationProto& proto,
-    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-    const std::function<void(std::unique_ptr<HloComputation>)>&
-        add_fused_computation,
-    HloInstruction* fusion_instruction) {
+    const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
   std::vector<std::unique_ptr<HloInstruction>> instructions;
-  tensorflow::gtl::FlatMap<string, HloInstruction*> instruction_map;
+  tensorflow::gtl::FlatMap<int64, HloInstruction*> instruction_map;
   int64 parameter_count = 0;
   for (const HloInstructionProto& instruction_proto : proto.instructions()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloInstruction> instruction,
-                        HloInstruction::CreateFromProto(
-                            module, instruction_proto, instruction_map,
-                            computation_map, add_fused_computation));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloInstruction> instruction,
+        HloInstruction::CreateFromProto(module, instruction_proto,
+                                        instruction_map, computation_map));
     if (instruction->opcode() == HloOpcode::kParameter) {
       parameter_count++;
     }
-    TF_RET_CHECK(!ContainsKey(instruction_map, instruction->name()));
-    instruction_map[instruction->name()] = instruction.get();
+    TF_RET_CHECK(!ContainsKey(instruction_map, instruction_proto.id()));
+    instruction_map[instruction_proto.id()] = instruction.get();
     instructions.push_back(std::move(instruction));
   }
 
-  TF_RET_CHECK(!proto.root_name().empty());
-  TF_RET_CHECK(ContainsKey(instruction_map, proto.root_name()));
-  HloInstruction* root = instruction_map.at(proto.root_name());
-  return WrapUnique(new HloComputation(
-      proto.name(), parameter_count, &instructions, root, fusion_instruction));
+  TF_RET_CHECK(proto.root_id() != -1);
+  TF_RET_CHECK(ContainsKey(instruction_map, proto.root_id()));
+  HloInstruction* root = instruction_map.at(proto.root_id());
+  return WrapUnique(new HloComputation(proto.name(), parameter_count,
+                                       &instructions, root,
+                                       /*fusion_instruction=*/nullptr));
 }
 
 void HloComputation::FuseInstructionsInto(
@@ -532,7 +528,6 @@ ProgramShape HloComputation::ComputeProgramShape() const {
   }
   *program_shape.mutable_result() = root_instruction_->shape();
 
-  LayoutUtil::ClearLayout(&program_shape);
   return program_shape;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 39d864efcb70382b6f8e631d7e6e452ea6410104..9d3f6e9a2c2efd97681a22b6b0f6d929afc553de 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -160,20 +160,12 @@ class HloComputation {
   //   module: the module which will contain the computation. The newly created
   //     computation is *not* added to the module, however.
   //   proto: the proto to convert from.
-  //   computation_map: a map from computation name to HloComputation*. This map
+  //   computation_map: a map from computation id to HloComputation*. This map
   //     must contain all computations which the newly constructed computation
   //     calls.
-  //   add_fused_computation: A function to call to add a fused
-  //     computation. Used only when the instruction is a fusion instruction.
-  //   fusion_instruction: if non-null then the newly created computation will
-  //     be constructed as a fused computation with this instruction as its
-  //     fusion parent.
   static StatusOr<std::unique_ptr<HloComputation>> CreateFromProto(
       HloModule* module, const HloComputationProto& proto,
-      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-      const std::function<void(std::unique_ptr<HloComputation>)>&
-          add_fused_computation,
-      HloInstruction* fusion_instruction = nullptr);
+      const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
 
   // Gets the instructions in this computation.
   //
@@ -248,7 +240,7 @@ class HloComputation {
       ShapeTree<HloInstruction*>* copies_added = nullptr);
 
   // Computes and returns the ProgramShape of this computation (shape of
-  // parameters and result without layout).
+  // parameters and result with layout).
   ProgramShape ComputeProgramShape() const;
 
   // Return whether `*this` and `other` are functionally equivalent.
@@ -342,6 +334,15 @@ class HloComputation {
     fusion_instruction_ = fusion_instruction;
   }
 
+  // The id of this computation should be unique within the module.
+  void SetUniqueId(int64 id) {
+    CHECK_EQ(unique_id_, -1);
+    CHECK_GE(id, 0);
+    unique_id_ = id;
+  }
+
+  int64 unique_id() const { return unique_id_; }
+
  private:
   explicit HloComputation(
       const string& name, int parameter_count,
@@ -352,10 +353,6 @@ class HloComputation {
   HloInstruction* AddInstructionInternal(
       std::unique_ptr<HloInstruction> instruction);
 
-  // Helper for setting the parent of instructions that are added to this
-  // computation.
-  void Reparent(HloInstruction* instruction);
-
   // Fuses HLOs in instructions_to_fuse into fusion_instruction.
   //
   // Pre-condition: fusion_instruction's opcode is kFusion.
@@ -373,6 +370,7 @@ class HloComputation {
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
 
   string name_;
+  int64 unique_id_;
   HloInstruction* root_instruction_;
 
   // If this computation is a fusion computation, this field points to the
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 53450991b6fad5b9651d9d23b55c908e6b68e5dd..35ecd4428d0dfde2de445ea34472d2c78148c6c9 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -35,7 +35,10 @@ limitations under the License.
 namespace xla {
 
 StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
-  auto evaluator = MakeUnique<HloEvaluator>();
+  // Limit the constant folding to 0 iterations to skip folding loops. This
+  // retains the behavior from before while loop support in HloEvaluator and may
+  // be revised.
+  auto evaluator = MakeUnique<HloEvaluator>(/*max_loop_iterations=*/0);
 
   XLA_VLOG_LINES(2,
                  "HloConstantFolding::Run(), before:\n" + module->ToString());
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 4ec2ef27bf59b0c877ec38e55ef5c12debeec227..44e4f75f75b275653e1a07111943843fc6f78b33 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -379,20 +380,101 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
 }
 
 Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
-  auto rhs_instruction = convolution->operand(1);
+  auto lhs = convolution->operand(0);
+  auto rhs = convolution->operand(1);
+  Window window = convolution->window();
+  const auto& result_shape = convolution->shape();
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+
   const auto& dnums = convolution->convolution_dimension_numbers();
-  const int64 output_features =
-      convolution->shape().dimensions(dnums.output_feature_dimension());
-
-  // For each output element, we do one fma per element in the kernel at some
-  // given output feature index.
-  const int64 fmas_per_output_element =
-      output_features > 0
-          ? ShapeUtil::ElementsIn(rhs_instruction->shape()) / output_features
-          : 0;
-  const int64 output_elements = ShapeUtil::ElementsIn(convolution->shape());
-  current_properties_[kFlopsKey] =
-      output_elements * fmas_per_output_element * kFmaFlops;
+
+  const int64 input_batch_dim = dnums.input_batch_dimension();
+  const int64 input_feature_dim = dnums.input_feature_dimension();
+  const int64 output_feature_dim = dnums.output_feature_dimension();
+  const int64 input_feature =
+      ShapeUtil::GetDimension(lhs_shape, input_feature_dim);
+  const int64 output_feature =
+      ShapeUtil::GetDimension(result_shape, output_feature_dim);
+  const int64 batch = ShapeUtil::GetDimension(lhs_shape, input_batch_dim);
+
+  DimensionVector kernel_limits;
+  DimensionVector output_limits;
+  DimensionVector input_limits;
+  if (window.dimensions().empty()) {
+    window = window_util::MakeWindow({1});
+    kernel_limits.push_back(1);
+    output_limits.push_back(1);
+    input_limits.push_back(1);
+  } else {
+    for (int64 spatial_dimension = 0;
+         spatial_dimension < window.dimensions_size(); ++spatial_dimension) {
+      // Spatial dimension number for kernel (rhs).
+      const int64 kernel_spatial_dim =
+          dnums.kernel_spatial_dimensions(spatial_dimension);
+      const int64 kernel_limit = rhs_shape.dimensions(kernel_spatial_dim);
+      kernel_limits.push_back(kernel_limit);
+
+      // Spatial dimension number for output.
+      const int64 output_spatial_dim =
+          dnums.output_spatial_dimensions(spatial_dimension);
+      const int64 output_limit = result_shape.dimensions(output_spatial_dim);
+      output_limits.push_back(output_limit);
+
+      // Spatial dimension number for input (lhs).
+      const int64 input_spatial_dim =
+          dnums.input_spatial_dimensions(spatial_dimension);
+      const int64 input_limit = lhs_shape.dimensions(input_spatial_dim);
+      input_limits.push_back(input_limit);
+    }
+  }
+
+  DimensionVector valid_position_counts;
+
+  // Loop over each spatial dimension.
+  for (int64 spatial_dimension = 0;
+       spatial_dimension < window.dimensions_size(); ++spatial_dimension) {
+    int64 valid_position_count = 0;
+    // Loop over each point in the kernel.
+    for (int64 kernel_idx = 0; kernel_idx < kernel_limits[spatial_dimension];
+         ++kernel_idx) {
+      // Loop over each point in the output.
+      for (int64 output_idx = 0; output_idx < output_limits[spatial_dimension];
+           ++output_idx) {
+        // Calculate lhs (input) index without taking base dilation into
+        // account.
+        const auto& window_dim = window.dimensions(spatial_dimension);
+        const int64 undilated_index = output_idx * window_dim.stride() -
+                                      window_dim.padding_low() +
+                                      kernel_idx * window_dim.window_dilation();
+
+        // Calculate the actual lhs (input) index after dilation. Avoid the
+        // division as an optimization.
+        const int64 lhs_spatial_index =
+            window_dim.base_dilation() > 1
+                ? undilated_index / window_dim.base_dilation()
+                : undilated_index;
+
+        // Skip if the lhs (input) index is to be dilated.
+        if (undilated_index != lhs_spatial_index * window_dim.base_dilation()) {
+          continue;
+        }
+
+        // Skip if input index is not in bound.
+        if (lhs_spatial_index < 0 ||
+            lhs_spatial_index >= input_limits[spatial_dimension]) {
+          continue;
+        }
+
+        valid_position_count += 1;
+      }
+    }
+    valid_position_counts.push_back(valid_position_count);
+  }
+
+  const int64 fma_count =
+      input_feature * output_feature * batch * Product(valid_position_counts);
+  current_properties_[kFlopsKey] = fma_count * kFmaFlops;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 3b289c240a45e8f3df8156ed89e879da2132d01a..81cc7c4bdc1e0092dd671c741cdc0b8adec72156 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -186,12 +186,14 @@ TEST_F(HloCostAnalysisTest, Map) {
 TEST_F(HloCostAnalysisTest, Convolution) {
   ComputationBuilder builder(client_, "convolution");
   auto input = builder.Parameter(
-      0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
-                                    /*x_dim=*/20}),
+      0,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
+                                 /*x_dim=*/20}),
       "input");
   auto kernel = builder.Parameter(
-      1, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
-                                    /*x_dim=*/3}),
+      1,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
+                                 /*x_dim=*/3}),
       "kernel");
   auto result = builder.Conv(input, kernel, {1, 1}, Padding::kValid);
 
@@ -368,8 +370,8 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
         HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp));
     auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1});
 
-    HloModule module(TestName());
-    auto* computation = module.AddEntryComputation(builder.Build());
+    auto module = CreateNewModule();
+    auto* computation = module->AddEntryComputation(builder.Build());
     auto* fusion = computation->CreateFusionInstruction(
         {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
 
@@ -410,8 +412,8 @@ TEST_F(FusionCostAnalysis, NoLayout) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       shape_with_layout, HloOpcode::kAdd, c1, broadcast));
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add, broadcast}, HloInstruction::FusionKind::kLoop);
 
@@ -440,5 +442,32 @@ TEST_F(HloCostAnalysisTest, TupleCost) {
   EXPECT_EQ(analysis.bytes_accessed(), kPointerSize * 2);
 }
 
+TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) {
+  ComputationBuilder builder(client_, "BaseDilatedConvolution");
+  auto input = builder.Parameter(
+      0,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
+                                 /*x_dim=*/20}),
+      "input");
+  auto kernel = builder.Parameter(
+      1,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
+                                 /*x_dim=*/3}),
+      "kernel");
+
+  auto result = builder.ConvGeneralDilated(
+      input, kernel, /*window_strides=*/{1, 1}, /*padding=*/{{1, 1}, {1, 1}},
+      /*lhs_dilation=*/{3, 5}, /*rhs_dilation=*/{7, 11},
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2));
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  EXPECT_EQ(analysis.flop_count(), 1472);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a89888480b8c79dfb1f79a50e9686bf45aa49b3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -0,0 +1,290 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+using tensorflow::gtl::ArraySlice;
+using tensorflow::strings::StrCat;
+
+StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
+                                        HloInstruction* rhs) {
+  HloComputation* computation = lhs->parent();
+  CHECK_EQ(computation, rhs->parent());
+  TF_ASSIGN_OR_RETURN(Shape binary_op_shape,
+                      ShapeInference::InferBinaryOpShape(opcode, lhs, rhs));
+  return computation->AddInstruction(
+      HloInstruction::CreateBinary(binary_op_shape, opcode, lhs, rhs));
+}
+
+StatusOr<HloInstruction*> MakePadHlo(HloInstruction* operand,
+                                     HloInstruction* padding_value,
+                                     const PaddingConfig& padding_config) {
+  HloComputation* computation = operand->parent();
+  CHECK_EQ(computation, padding_value->parent());
+  TF_ASSIGN_OR_RETURN(
+      Shape pad_shape,
+      ShapeInference::InferPadShape(operand->shape(), padding_value->shape(),
+                                    padding_config));
+  return computation->AddInstruction(HloInstruction::CreatePad(
+      pad_shape, operand, padding_value, padding_config));
+}
+
+StatusOr<HloInstruction*> MakeSliceHlo(HloInstruction* operand,
+                                       ArraySlice<int64> start_indices,
+                                       ArraySlice<int64> limit_indices,
+                                       ArraySlice<int64> strides) {
+  HloComputation* computation = operand->parent();
+  TF_ASSIGN_OR_RETURN(Shape slice_shape, ShapeInference::InferSliceShape(
+                                             operand->shape(), start_indices,
+                                             limit_indices, strides));
+  return computation->AddInstruction(HloInstruction::CreateSlice(
+      slice_shape, operand, start_indices, limit_indices, strides));
+}
+
+StatusOr<HloInstruction*> MakeConvolveHlo(
+    HloInstruction* lhs, HloInstruction* rhs, const Window& window,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  HloComputation* computation = lhs->parent();
+  CHECK_EQ(computation, rhs->parent());
+  TF_ASSIGN_OR_RETURN(Shape convolve_shape, ShapeInference::InferConvolveShape(
+                                                lhs->shape(), rhs->shape(),
+                                                window, dimension_numbers));
+  return computation->AddInstruction(HloInstruction::CreateConvolve(
+      convolve_shape, lhs, rhs, window, dimension_numbers));
+}
+
+StatusOr<HloInstruction*> MakeTransposeHlo(HloInstruction* operand,
+                                           ArraySlice<int64> dimensions) {
+  HloComputation* computation = operand->parent();
+  TF_ASSIGN_OR_RETURN(
+      Shape transpose_shape,
+      ShapeInference::InferTransposeShape(operand->shape(), dimensions));
+  return computation->AddInstruction(
+      HloInstruction::CreateTranspose(transpose_shape, operand, dimensions));
+}
+
+StatusOr<HloInstruction*> MakeReshapeHlo(const Shape& result_shape,
+                                         HloInstruction* operand) {
+  HloComputation* computation = operand->parent();
+  return computation->AddInstruction(
+      HloInstruction::CreateReshape(result_shape, operand));
+}
+
+StatusOr<HloInstruction*> MakeReshapeHlo(
+    ArraySlice<int64> result_shape_dim_bounds, HloInstruction* operand) {
+  Shape new_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
+                                         result_shape_dim_bounds);
+  return MakeReshapeHlo(new_shape, operand);
+}
+
+StatusOr<HloInstruction*> MakeDynamicSliceHlo(HloInstruction* operand,
+                                              HloInstruction* start_indices,
+                                              ArraySlice<int64> slice_sizes) {
+  HloComputation* computation = operand->parent();
+  CHECK_EQ(computation, start_indices->parent());
+  TF_ASSIGN_OR_RETURN(
+      Shape dynamic_slice_shape,
+      ShapeInference::InferDynamicSliceShape(
+          operand->shape(), start_indices->shape(), slice_sizes));
+  return computation->AddInstruction(HloInstruction::CreateDynamicSlice(
+      dynamic_slice_shape, operand, start_indices, slice_sizes));
+}
+
+StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
+    HloInstruction* operand, HloInstruction* update,
+    HloInstruction* start_indices) {
+  HloComputation* computation = operand->parent();
+  CHECK_EQ(computation, update->parent());
+  CHECK_EQ(computation, start_indices->parent());
+  TF_ASSIGN_OR_RETURN(
+      Shape dynamic_update_slice_shape,
+      ShapeInference::InferDynamicUpdateSliceShape(
+          operand->shape(), update->shape(), start_indices->shape()));
+  return computation->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      dynamic_update_slice_shape, operand, update, start_indices));
+}
+
+StatusOr<HloInstruction*> MakeBroadcastHlo(
+    HloInstruction* operand, ArraySlice<int64> broadcast_dimensions,
+    ArraySlice<int64> result_shape_bounds) {
+  HloComputation* computation = operand->parent();
+  Shape broadcast_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
+                                               result_shape_bounds);
+
+  return computation->AddInstruction(HloInstruction::CreateBroadcast(
+      broadcast_shape, operand, broadcast_dimensions));
+}
+
+StatusOr<HloInstruction*> MakeGetTupleElementHlo(HloInstruction* operand,
+                                                 int64 index) {
+  HloComputation* computation = operand->parent();
+
+  TF_ASSIGN_OR_RETURN(
+      Shape gte_shape,
+      ShapeInference::InferGetTupleElementShape(operand->shape(), index));
+  return computation->AddInstruction(
+      HloInstruction::CreateGetTupleElement(gte_shape, operand, index));
+}
+
+StatusOr<HloInstruction*> MakeConcatHlo(ArraySlice<HloInstruction*> operands,
+                                        int64 dimension) {
+  CHECK_GT(operands.size(), 0);
+
+  HloComputation* computation = operands[0]->parent();
+  CHECK(c_all_of(operands, [&](HloInstruction* instr) {
+    return instr->parent() == computation;
+  }));
+
+  std::vector<const Shape*> operand_shapes;
+  c_transform(operands, std::back_inserter(operand_shapes),
+              [](HloInstruction* instr) { return &instr->shape(); });
+
+  TF_ASSIGN_OR_RETURN(Shape concat_shape, ShapeInference::InferConcatOpShape(
+                                              operand_shapes, dimension));
+  return computation->AddInstruction(
+      HloInstruction::CreateConcatenate(concat_shape, operands, dimension));
+}
+
+StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
+  CHECK_GT(n, 0);
+
+  const Shape& operand_shape = operand->shape();
+  CHECK_GE(operand_shape.dimensions_size(), n);
+  int64 new_shape_leading_bound = 1;
+  for (int64 i = 0; i < n; i++) {
+    new_shape_leading_bound *= operand_shape.dimensions(i);
+  }
+
+  std::vector<int64> new_shape_dims;
+  new_shape_dims.reserve(operand_shape.dimensions_size() - n + 1);
+  new_shape_dims.push_back(new_shape_leading_bound);
+
+  std::copy(operand_shape.dimensions().begin() + n,
+            operand_shape.dimensions().end(),
+            std::back_inserter(new_shape_dims));
+
+  Shape output_shape =
+      ShapeUtil::MakeShape(operand_shape.element_type(), new_shape_dims);
+
+  return MakeReshapeHlo(output_shape, operand);
+}
+
+StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
+                                                int64 n) {
+  CHECK_GT(n, 0);
+  std::vector<int64> new_shape_dims;
+  const Shape& operand_shape = operand->shape();
+  new_shape_dims.reserve(n + operand_shape.dimensions_size());
+  new_shape_dims.insert(new_shape_dims.begin(), n, 1);
+  c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims));
+  return MakeReshapeHlo(new_shape_dims, operand);
+}
+
+StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
+    HloInstruction* operand, ArraySlice<int64> expanded_dims) {
+  CHECK_GT(operand->shape().dimensions_size(), 0);
+  CHECK_EQ(operand->shape().dimensions(0), Product(expanded_dims));
+
+  std::vector<int64> expanded_shape_dim_bounds;
+  expanded_shape_dim_bounds.reserve(expanded_dims.size() +
+                                    operand->shape().dimensions_size() - 1);
+  c_copy(expanded_dims, std::back_inserter(expanded_shape_dim_bounds));
+  std::copy(operand->shape().dimensions().begin() + 1,
+            operand->shape().dimensions().end(),
+            std::back_inserter(expanded_shape_dim_bounds));
+  Shape new_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
+                                         expanded_shape_dim_bounds);
+  return MakeReshapeHlo(new_shape, operand);
+}
+
+StatusOr<HloInstruction*> ElideDegenerateDims(HloInstruction* operand,
+                                              ArraySlice<int64> dims_to_elide) {
+  CHECK(c_is_sorted(dims_to_elide));
+
+  const Shape& input_shape = operand->shape();
+  // First accumulate in reverse
+  std::vector<int64> new_shape_dim_bounds;
+  new_shape_dim_bounds.reserve(input_shape.dimensions_size() -
+                               dims_to_elide.size());
+  int64 dims_to_elide_idx = dims_to_elide.size() - 1;
+  for (int64 i = input_shape.dimensions_size() - 1; i >= 0; i--) {
+    if (dims_to_elide_idx >= 0 && i == dims_to_elide[dims_to_elide_idx]) {
+      CHECK_EQ(input_shape.dimensions(i), 1);
+      dims_to_elide_idx--;
+    } else {
+      new_shape_dim_bounds.push_back(input_shape.dimensions(i));
+    }
+  }
+
+  c_reverse(new_shape_dim_bounds);
+  Shape output_shape =
+      ShapeUtil::MakeShape(input_shape.element_type(), new_shape_dim_bounds);
+  return MakeReshapeHlo(output_shape, operand);
+}
+
+StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
+                                             int64 zeros_to_prepend,
+                                             int64 zeros_to_append) {
+  HloComputation* computation = operand->parent();
+  CHECK_EQ(operand->shape().dimensions_size(), 1);
+  PaddingConfig padding_config;
+  PaddingConfig::PaddingConfigDimension padding_config_dim;
+  padding_config_dim.set_edge_padding_low(zeros_to_prepend);
+  padding_config_dim.set_edge_padding_high(zeros_to_append);
+  *padding_config.add_dimensions() = padding_config_dim;
+
+  HloInstruction* zero =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          MakeUnique<Literal>(Literal::Zero(operand->shape().element_type()))));
+  return MakePadHlo(operand, zero, padding_config);
+}
+
+StatusOr<HloInstruction*> BroadcastZeros(
+    HloComputation* computation, PrimitiveType element_type,
+    ArraySlice<int64> broadcast_dimensions) {
+  HloInstruction* zero =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          MakeUnique<Literal>(Literal::Zero(element_type))));
+  return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{},
+                          /*result_shape_bounds=*/broadcast_dimensions);
+}
+
+StatusOr<std::unique_ptr<HloComputation>> CreateComputationWithSignature(
+    ArraySlice<const Shape*> domain, const Shape& range,
+    tensorflow::StringPiece name) {
+  HloComputation::Builder b(name.ToString());
+  int64 param_idx = 0;
+  for (const Shape* param_shape : domain) {
+    b.AddInstruction(HloInstruction::CreateParameter(
+        param_idx, *param_shape, StrCat("param.", param_idx)));
+    param_idx++;
+  }
+
+  // We can't change the root type of a computation once it is created so create
+  // a dummy root instruction to give the computation the right root shape.  In
+  // the future we may want to use a (recursive) broadcast here to avoid
+  // creating large constants.
+  b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateFromShape(range)));
+
+  return b.Build();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9a7361a6af0c2a0839c59a0ea695ec1b9a98bd4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -0,0 +1,163 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CREATION_UTILS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CREATION_UTILS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Some lightweight utilities intended to make HLO instruction creation more
+// ergonomic.  We don't have a complete set of helpers yet -- I expect we'll
+// expand this interface as needed on an ad-hoc basis.
+
+// Creates a binary HLO instruction and adds it to the computation containing
+// `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation).
+StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
+                                        HloInstruction* rhs);
+
+// Creates a pad HLO instruction and adds it to the computation containing
+// `operand` and `padding_value` (`operand` and `padding_value` must be in the
+// same computation).
+StatusOr<HloInstruction*> MakePadHlo(HloInstruction* operand,
+                                     HloInstruction* padding_value,
+                                     const PaddingConfig& padding_config);
+
+// Creates a slice HLO instruction and adds it to the computation containing
+// `operand`.
+StatusOr<HloInstruction*> MakeSliceHlo(
+    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> start_indices,
+    tensorflow::gtl::ArraySlice<int64> limit_indices,
+    tensorflow::gtl::ArraySlice<int64> strides);
+
+// Creates a convolution HLO instruction and adds it to the computation
+// containing `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation).
+StatusOr<HloInstruction*> MakeConvolveHlo(
+    HloInstruction* lhs, HloInstruction* rhs, const Window& window,
+    const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Creates a transpose HLO instruction and adds it to the computation containing
+// `operand`.
+StatusOr<HloInstruction*> MakeTransposeHlo(
+    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dimensions);
+
+// Creates a reshape HLO instruction and adds it to the computation containing
+// `operand`.
+StatusOr<HloInstruction*> MakeReshapeHlo(const Shape& result_shape,
+                                         HloInstruction* operand);
+
+StatusOr<HloInstruction*> MakeReshapeHlo(
+    tensorflow::gtl::ArraySlice<int64> result_shape_dim_bounds,
+    HloInstruction* operand);
+
+// Creates a dynamic-slice HLO instruction and adds it to the computation
+// containing `operand` and `start_indices` (`operand` and `start_indices` must
+// be in the same computation).
+StatusOr<HloInstruction*> MakeDynamicSliceHlo(
+    HloInstruction* operand, HloInstruction* start_indices,
+    tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+// Creates a dynamic-update-slice HLO instruction and adds it to the computation
+// containing `operand`, `update` and `start_indices` (`operand`, `update` and
+// `start_indices` must be in the same computation).
+StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
+    HloInstruction* operand, HloInstruction* update,
+    HloInstruction* start_indices);
+
+// Creates a broadcast HLO instruction and adds it to the computation containing
+// `operand`.
+StatusOr<HloInstruction*> MakeBroadcastHlo(
+    HloInstruction* operand,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions,
+    tensorflow::gtl::ArraySlice<int64> result_shape_bounds);
+
+// Creates a GetTupleElement HLO instruction and adds it to the computation
+// containing `operand`.
+StatusOr<HloInstruction*> MakeGetTupleElementHlo(HloInstruction* operand,
+                                                 int64 index);
+
+// Creates a Concatenate HLO instruction and adds it to the computation
+// containing `operands` (`operands` must be non-empty and every element must be
+// contained in the same computation).
+StatusOr<HloInstruction*> MakeConcatHlo(
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands, int64 dimension);
+
+// -----------------------------------------------------------------------------
+// Some other miscellaneous helpers to generate common HLO patterns.  All of
+// these add all the instructions they generate into the computation containing
+// their operand(s).
+
+// Collapses (via reshape) the first N (logical) dimensions of `operand` into a
+// single leading dimension.  `operand` must have rank > `n` and `n` must not be
+// 0.
+//
+// For instance if `operand` has shape f32[7,8,9] and n is 2 then the output is
+// the `operand` reshaped to [56,9].
+StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n);
+
+// Prepends `n` degenerate dimensions (dimensions with bound = 1) to `operand`
+// using a reshape.
+//
+// For instance if operand has shape f32[3,4,5] then this returns the operand
+// reshaped to f32[1,3,4,5].  If the operand is a f32 scalar (i.e. has shape
+// f32[]) then this returns the operand reshaped to f32[1].
+StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
+                                                int64 n);
+
+// Expands (via reshape) the first (logical) dimension of `operand` into a
+// sequence of `expanded_dims` dimensions.  `operand` must at least be of rank 1
+// and the number of elements in its first dimension must be equal to the
+// product of `expanded_dims`.
+//
+// For instance if `operand` has shape f32[200,9,7] and expanded_dims is
+// {2,5,20} the result is `operand` reshaped to [2,5,20,9,7].
+StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
+    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> expanded_dims);
+
+// Elides (via reshape) a set of degenerate dimensions (dimensions containing
+// exactly one element), `dims_to_elide` from `operand`.  Every dimension in
+// `dims_to_elide` must be a degenerate dimension.  `dims_to_elide` must be
+// sorted and not contain duplicates.
+//
+// For example if `operand` is of shape f32[19,1,20,1,7,1,9] and dims_to_elide
+// is {1,5} then the result is `operand` reshaped to [19,20,1,7,9].
+StatusOr<HloInstruction*> ElideDegenerateDims(
+    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dims_to_elide);
+
+// Pads `operand` (which must have rank 1) with `zeros_to_prepend` zeros in the
+// front and `zeros_to_append` zeros in the back.
+StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
+                                             int64 zeros_to_prepend,
+                                             int64 zeros_to_append);
+
+// Broadcasts a zero value of type `element_type` into a tensor with element
+// type `element_type` and dimension bounds `broadcast_dimensions`.  The
+// broadcast instruction is emitted into `computation`.
+StatusOr<HloInstruction*> BroadcastZeros(
+    HloComputation* computation, PrimitiveType element_type,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+// Creates a HLO computation that takes arguments of type `domain` and produces
+// a value of type `range`.
+StatusOr<std::unique_ptr<HloComputation>> CreateComputationWithSignature(
+    tensorflow::gtl::ArraySlice<const Shape*> domain, const Shape& range,
+    tensorflow::StringPiece name);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CREATION_UTILS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7c4f95fed737f40064224717f409b934e4ff27
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -0,0 +1,239 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+using tensorflow::gtl::ArraySlice;
+
+class HloCreationUtilsTest : public HloTestBase {
+ protected:
+  static std::unique_ptr<HloModule> CreateModuleWithProgramShape(
+      PrimitiveType primitive_type, ArraySlice<int64> input_shape_dims,
+      ArraySlice<int64> output_shape_dims, HloInstruction** param,
+      HloComputation** entry_computation) {
+    Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims);
+    Shape output_shape =
+        ShapeUtil::MakeShape(primitive_type, output_shape_dims);
+    auto module = CreateNewModule("test");
+    *entry_computation = module->AddEntryComputation(
+        CreateComputationWithSignature({&input_shape}, output_shape, "entry")
+            .ValueOrDie());
+    *param = (*entry_computation)->parameter_instruction(0);
+    return module;
+  }
+};
+
+TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed,
+                          CollapseFirstNDims(param, 1));
+  entry_computation->set_root_instruction(first_1_dims_collapsed);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({3, 4})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR1<int32>({3, 4}));
+}
+
+TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed,
+                          CollapseFirstNDims(param, 2));
+  entry_computation->set_root_instruction(first_2_dims_collapsed);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(
+          *module,
+          {Literal::CreateR3<int32>(
+              {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR2<int32>(
+               {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}}));
+}
+
+TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended,
+                          PrependDegenerateDims(param, 1));
+  entry_computation->set_root_instruction(with_1_degenerate_dim_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({9, 10})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{9, 10}}));
+}
+
+TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
+                          PrependDegenerateDims(param, 2));
+  entry_computation->set_root_instruction(with_2_degenerate_dims_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({9, 10})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR3<int32>({{{9, 10}}}));
+}
+
+TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{1, 1}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
+                          PrependDegenerateDims(param, 2));
+  entry_computation->set_root_instruction(with_2_degenerate_dims_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<int32>(9)}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{9}}));
+}
+
+TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded,
+                          ExpandFirstDimIntoNDims(param, {3, 1, 2}));
+  entry_computation->set_root_instruction(first_dim_expanded);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(
+          *module, {Literal::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR3<int32>({{{1, 2}}, {{3, 4}}, {{5, 6}}}));
+}
+
+TEST_F(HloCreationUtilsTest, PadVectorWithZeros) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{6}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zero_padded_param,
+      PadVectorWithZeros(param, /*zeros_to_prepend=*/3, /*zeros_to_append=*/1));
+  entry_computation->set_root_instruction(zero_padded_param);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({3, 4})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR1<int32>({0, 0, 0, 3, 4, 0}));
+}
+
+TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zeros,
+      BroadcastZeros(module->entry_computation(), S32, {2, 2}));
+  entry_computation->set_root_instruction(zeros);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<int32>(0)}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{0, 0}, {0, 0}}));
+}
+
+TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      F32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zeros,
+      BroadcastZeros(module->entry_computation(), F32, {2, 2}));
+  entry_computation->set_root_instruction(zeros);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<float>(0.0f)}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR2<float>({{0.0f, 0.0f}, {0.0f, 0.0f}}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 279edd4ba8772a9c576f76f554de8ec68631b953..3b22c93733af293e4d73a2b1b3ac8822dec6d5f5 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -97,6 +97,10 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
   const std::function<bool(const HloComputation*, const HloComputation*)>
       eq_computations = std::equal_to<const HloComputation*>();
   for (auto* computation : module->computations()) {
+    if (only_fusion_computations_ && !computation->IsFusionComputation()) {
+      continue;
+    }
+
     changed |= CombineConstants(computation, is_layout_sensitive_);
 
     std::list<HloInstruction*> post_order =
@@ -109,6 +113,11 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
         continue;
       }
 
+      // Skip instructions which have side effects.
+      if (instruction->HasSideEffect()) {
+        continue;
+      }
+
       // An instruction is considered to be equivalent to another only if they
       // share the exact same set of operands. So to find equivalent
       // instructions, we just search among instructions which share operand(0)
@@ -118,7 +127,7 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
       tensorflow::gtl::InlinedVector<HloInstruction*, 8>
           equivalent_instructions;
       for (HloInstruction* user : operand->users()) {
-        if (user != instruction &&
+        if (user != instruction && !user->HasSideEffect() &&
             user->Identical(*instruction, eq_instructions, eq_computations,
                             is_layout_sensitive_)) {
           equivalent_instructions.push_back(user);
diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h
index 70096e07a2493763a9d4b0dc8e1c31510718c6c2..5e2b348bdda2b31556fb692e24d2bad2e4173ef5 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.h
+++ b/tensorflow/compiler/xla/service/hlo_cse.h
@@ -29,9 +29,11 @@ class HloCSE : public HloPassInterface {
  public:
   // If is_layout_sensitive is true, then the simplifier preserves layout during
   // transformation. Otherwise, layout is ignored.
-  explicit HloCSE(bool is_layout_sensitive)
-      : is_layout_sensitive_(is_layout_sensitive) {}
-  ~HloCSE() override {}
+  explicit HloCSE(bool is_layout_sensitive,
+                  bool only_fusion_computations = false)
+      : is_layout_sensitive_(is_layout_sensitive),
+        only_fusion_computations_(only_fusion_computations) {}
+  ~HloCSE() override = default;
   tensorflow::StringPiece name() const override { return "cse"; }
 
   // Run CSE on the given module. Returns whether the module was changed (common
@@ -39,7 +41,8 @@ class HloCSE : public HloPassInterface {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  bool is_layout_sensitive_;
+  const bool is_layout_sensitive_;
+  const bool only_fusion_computations_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 3601a790c4428ee39c264b217a4b9a991ad8456c..df8853f34f6a72c52d1cde7332ada3809d2f3d96 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -414,8 +414,7 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   EXPECT_THAT(root, op::Add(rng1, rng2));
 }
 
-// TODO(b/28245743): Handle impure functions correctly in CSE.
-TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
+TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   // Test that two calls to an impure function are not commoned. RNG
   // is the source of the impurity.
 
@@ -458,14 +457,16 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(op::Map(), op::Map()));
 
+  VLOG(3) << "before: " << module->ToString();
+
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+
+  VLOG(3) << "after: " << module->ToString();
 
   EXPECT_EQ(4, computation->instruction_count());
   root = computation->root_instruction();
-  auto operand = root->operand(0)->operand(0);
-  EXPECT_THAT(operand, op::Map());
-  EXPECT_THAT(root, op::Add(operand, operand));
+  EXPECT_THAT(root, op::Add(op::Map(op::Constant()), op::Map(op::Constant())));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 934e43ba4879628362009267c671ec4cb0d79c52..0c37a8d75f38dabaad886cc9d4adce8ab29ddf18 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -368,11 +368,11 @@ bool HloDataflowAnalysis::UpdateConditionalValueSet(
           conditional->true_computation()->root_instruction()),
       &GetInstructionValueSet(
           conditional->false_computation()->root_instruction())};
-  // A phi-node is not defined for a kConditional instruction even though it
-  // represents a join point. This is because the current approach is to define
-  // a phi-node only for kWhile to account for the dataflow through back-edges
-  // and deal with the ambiguity in other cases.
-  return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+  if (ssa_form_) {
+    return Phi(conditional, inputs);
+  } else {
+    return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+  }
 }
 
 bool HloDataflowAnalysis::UpdateCopyValueSet(HloInstruction* copy) {
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 7bf3a1a06045c79621d75b653bf42220705a69d4..07f69b8e1339fed636e4eb54791941b85e09fd17 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1602,11 +1602,17 @@ TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) {
   EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
               ElementsAre(HloUse{conditional, 2, {}}));
 
-  EXPECT_EQ(analysis.values().size(), 3);
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
-  EXPECT_THAT(HloValuesAt(conditional),
-              UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
-                                   analysis.GetValueDefinedAt(constant2)));
+  bool ssa_form = GetParam();
+  if (ssa_form) {
+    EXPECT_EQ(analysis.values().size(), 4);
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(conditional));
+  } else {
+    EXPECT_EQ(analysis.values().size(), 3);
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+    EXPECT_THAT(HloValuesAt(conditional),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                     analysis.GetValueDefinedAt(constant2)));
+  }
 }
 
 TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
@@ -1713,11 +1719,17 @@ TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
                   HloUse{true_x, 0, {}}, HloUse{true_y, 0, {}},
                   HloUse{false_x, 0, {}}, HloUse{false_y, 0, {}}));
 
-  EXPECT_EQ(analysis.values().size(), 6);
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
-  EXPECT_THAT(HloValuesAt(conditional),
-              UnorderedElementsAre(analysis.GetValueDefinedAt(add),
-                                   analysis.GetValueDefinedAt(sub)));
+  bool ssa_form = GetParam();
+  if (ssa_form) {
+    EXPECT_EQ(analysis.values().size(), 7);
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(conditional));
+  } else {
+    EXPECT_EQ(analysis.values().size(), 6);
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+    EXPECT_THAT(HloValuesAt(conditional),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                     analysis.GetValueDefinedAt(sub)));
+  }
 }
 
 TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
@@ -1834,20 +1846,27 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
   EXPECT_EQ(analysis.GetUniqueValueAt(false_operand_cond),
             analysis.GetValueDefinedAt(constant2));
 
-  EXPECT_EQ(analysis.values().size(), 9);
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
-  EXPECT_THAT(
-      HloValuesAt(inner_conditional),
-      UnorderedElementsAre(
-          analysis.GetValueDefinedAt(computation1->root_instruction()),
-          analysis.GetValueDefinedAt(computation2->root_instruction())));
-  EXPECT_THAT(
-      HloValuesAt(conditional),
-      UnorderedElementsAre(
-          analysis.GetValueDefinedAt(computation1->root_instruction()),
-          analysis.GetValueDefinedAt(computation2->root_instruction()),
-          analysis.GetValueDefinedAt(computation3->root_instruction())));
+  bool ssa_form = GetParam();
+  if (ssa_form) {
+    EXPECT_EQ(analysis.values().size(), 11);
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(inner_conditional));
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(conditional));
+  } else {
+    EXPECT_EQ(analysis.values().size(), 9);
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+    EXPECT_THAT(
+        HloValuesAt(inner_conditional),
+        UnorderedElementsAre(
+            analysis.GetValueDefinedAt(computation1->root_instruction()),
+            analysis.GetValueDefinedAt(computation2->root_instruction())));
+    EXPECT_THAT(
+        HloValuesAt(conditional),
+        UnorderedElementsAre(
+            analysis.GetValueDefinedAt(computation1->root_instruction()),
+            analysis.GetValueDefinedAt(computation2->root_instruction()),
+            analysis.GetValueDefinedAt(computation3->root_instruction())));
+  }
 }
 
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index 1e5f0f797a13fd7e7ce1cc934387a274a74153bc..fcd723af146e2227b8661b1a4993f1338f7de389 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -40,7 +40,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
   VLOG(2) << "Before dce:";
   XLA_VLOG_LINES(2, module->ToString());
 
-  for (auto* computation : module->MakeNonfusionComputations()) {
+  for (auto* computation : module->MakeComputationPostOrder()) {
     std::unordered_set<HloInstruction*> live_instructions;
     TF_RETURN_IF_ERROR(computation->root_instruction()->Accept(
         [&live_instructions](HloInstruction* instruction) {
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index c782d1b0add17c70e0f54826917df251d5a613e2..d236f83aeb9254b9c6e6d04629758ac2c8fd0da3 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -178,24 +178,37 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       if (hlo->shape().element_type() == eliminate_type_) {
         Shape shape =
             ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+
         new_hlo = computation->AddInstruction(
             hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
         new_hlo = ToElementType(new_hlo, eliminate_type_);
       } else if (ShapeUtil::IsTuple(hlo->shape())) {
         Shape old_shape = hlo->shape();
         Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                  replace_with_type_);
+
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             new_shape, new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
         // Convert the elements of the result of `new_hlo` to produce a new
         // tuple with shape `old_shape`.
         new_hlo = ConvertTupleElements(new_hlo, old_shape);
       } else {
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             hlo->shape(), new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
       }
 
-      TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+      TF_RETURN_IF_ERROR(hlo->DropAllControlDeps());
+
+      // NB!  We want to replace and remove side effecting instructions like Rng
+      // as well so we can't rely HloComputation::ReplaceInstruction to reliably
+      // remove the replaced instruction.
+      TF_RETURN_IF_ERROR(computation->RemoveInstruction(hlo));
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index cb94d9f19b825d1321263a4737b66a6bf198a772..5c5a059e0fd895f03bc26a975609b57333237faf 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -22,6 +22,12 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Not;
+using ::testing::ResultOf;
+
 class HloElementTypeConverterTest : public HloTestBase {
  public:
   std::unique_ptr<HloModule> CreateModuleFromHloString(
@@ -117,5 +123,65 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
                         op::Convert(op::GetTupleElement(batch_norm, 2))));
 }
 
+TEST_F(HloElementTypeConverterTest, RngIsRemoved) {
+  const string& hlo_string = R"(
+HloModule RngIsRemoved
+
+ENTRY main {
+  constant.3 = bf16[] constant(0)
+  constant.4 = bf16[] constant(1)
+  ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
+}
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+
+  std::function<bool(const HloInstruction*)> is_bf16_rng =
+      [](const HloInstruction* inst) {
+        return inst->shape().element_type() == BF16 &&
+               inst->opcode() == HloOpcode::kRng;
+      };
+
+  EXPECT_THAT(module->entry_computation()->instructions(),
+              Not(Contains(ResultOf(is_bf16_rng, Eq(true)))));
+}
+
+TEST_F(HloElementTypeConverterTest, RngCtrlDep) {
+  const string& hlo_string = R"(
+HloModule RngIsRemoved
+
+ENTRY main {
+  constant.3 = bf16[] constant(0)
+  constant.4 = bf16[] constant(1)
+  rng0 = bf16[1,2000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
+  ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform
+}
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+
+  HloInstruction *rng0, *rng1;
+  for (auto* inst : module->entry_computation()->instructions()) {
+    if (inst->opcode() == HloOpcode::kRng) {
+      const Shape& shape = inst->shape();
+      ASSERT_EQ(shape.dimensions_size(), 3);
+      ASSERT_TRUE(shape.dimensions(1) == 2000 || shape.dimensions(1) == 1000);
+      if (shape.dimensions(1) == 2000) {
+        rng0 = inst;
+      } else {
+        rng1 = inst;
+      }
+    }
+  }
+
+  EXPECT_THAT(rng0->control_successors(), ElementsAre(rng1));
+  EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 296f010a920a801ef0a4dc5e40bf0dbc07898196..8cf94123b714031455088047b4620001804c6b43 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -51,12 +51,22 @@ namespace xla {
 
 namespace {
 
+using tensorflow::gtl::ArraySlice;
+using tensorflow::gtl::FlatSet;
+using tensorflow::gtl::optional;
+
 template <typename T>
 struct is_complex_t : public std::false_type {};
 
 template <>
 struct is_complex_t<complex64> : public std::true_type {};
 
+template <typename T>
+struct is_complex64_t : public std::false_type {};
+
+template <>
+struct is_complex64_t<complex64> : public std::true_type {};
+
 template <typename OperandT>
 StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
                                            const Literal& lhs_literal,
@@ -99,11 +109,10 @@ StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
   }
 
   auto result = Literal::CreateFromShape(shape);
-  TF_RETURN_IF_ERROR(result->Populate<bool>(
-      [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-        return compare_op(lhs_literal.Get<OperandT>(multi_index),
-                          rhs_literal.Get<OperandT>(multi_index));
-      }));
+  TF_RETURN_IF_ERROR(result->Populate<bool>([&](ArraySlice<int64> multi_index) {
+    return compare_op(lhs_literal.Get<OperandT>(multi_index),
+                      rhs_literal.Get<OperandT>(multi_index));
+  }));
 
   return std::move(result);
 }
@@ -130,11 +139,10 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
   }
 
   auto result = Literal::CreateFromShape(shape);
-  TF_RETURN_IF_ERROR(result->Populate<bool>(
-      [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-        return compare_op(lhs_literal.Get<complex64>(multi_index),
-                          rhs_literal.Get<complex64>(multi_index));
-      }));
+  TF_RETURN_IF_ERROR(result->Populate<bool>([&](ArraySlice<int64> multi_index) {
+    return compare_op(lhs_literal.Get<complex64>(multi_index),
+                      rhs_literal.Get<complex64>(multi_index));
+  }));
 
   return std::move(result);
 }
@@ -159,8 +167,8 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
 
   auto result = Literal::CreateFromShape(shape);
 
-  TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-      [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+  TF_RETURN_IF_ERROR(
+      result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
         return unary_op(operand_literal.Get<NativeT>(multi_index));
       }));
   return std::move(result);
@@ -172,7 +180,7 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
 // with the base index.
 void IterateThroughWindow(
     const Shape& window_shape, const Window& window, const Shape& base_shape,
-    const tensorflow::gtl::ArraySlice<int64>& window_count_index,
+    const ArraySlice<int64>& window_count_index,
     const std::function<void(const std::vector<int64>&)>& f) {
   const int64 rank = ShapeUtil::Rank(base_shape);
   DimensionVector window_index(rank);
@@ -194,6 +202,25 @@ void IterateThroughWindow(
   } while (IndexUtil::BumpIndices(window_shape, &window_index));
 }
 
+// Creates a vector of multipliers which can be used to create a linear index
+// into shape.
+//
+// Given the multidimensional index {i1, ..., iN} and
+// M = MakeDimMultipliers(shape), the corresponding linear index LI is simply
+//
+//   LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N].
+//
+// This lets you calculate LI given the multidimensional indices in any order.
+DimensionVector MakeDimMultipliers(const Shape& shape) {
+  DimensionVector v(ShapeUtil::Rank(shape));
+  int64 scale = 1;
+  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
+    v[dim] = scale;
+    scale *= shape.dimensions(dim);
+  }
+  return v;
+}
+
 }  // namespace
 
 template <typename ReturnT, typename ElementwiseT>
@@ -248,17 +275,37 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   template <
       typename NativeT,
-      typename std::enable_if<std::is_signed<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
+      typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
   Status HandleAbs(HloInstruction* abs) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
-                        ElementWiseUnaryOp(abs, [](ElementwiseT elem_operand) {
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
                           return std::abs(elem_operand);
                         }));
     return Status::OK();
   }
 
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex64_t<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs) {
+    const Literal& operand_literal =
+        parent_->GetEvaluatedLiteralFor(abs->operand(0));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[abs],
+        (ElementWiseUnaryOpImpl<float, NativeT>(
+            abs, [](NativeT elem_operand) { return std::abs(elem_operand); },
+            operand_literal)));
+
+    return Status::OK();
+  }
+
   Status HandleAbs(HloInstruction* abs) override {
+    // If the operand is of C64 type, the return type of abs will be F32.
+    // However, ElementwiseT would still be the return type, F32, and thus
+    // specifying the ElementwiseT explicitly as C64 is needed below.
+    if (abs->operand(0)->shape().element_type() == C64) {
+      return HandleAbs<complex64>(abs);
+    }
     return HandleAbs<ElementwiseT>(abs);
   }
 
@@ -306,13 +353,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                    operand_to_broadcast.shape().dimensions(i));
     }
 
-    return output->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
-            broadcast_indices[i] = multi_index[broadcast->dimensions(i)];
-          }
-          return operand_to_broadcast.Get<ReturnT>(broadcast_indices);
-        });
+    return output->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
+      for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
+        broadcast_indices[i] = multi_index[broadcast->dimensions(i)];
+      }
+      return operand_to_broadcast.Get<ReturnT>(broadcast_indices);
+    });
   }
 
   template <
@@ -353,6 +399,22 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).BitcastConvert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
+    }
+    return Status::OK();
+  }
+
   Status HandleExp(HloInstruction* exp) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
                         ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
@@ -586,14 +648,25 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleMaximum(HloInstruction* maximum) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[maximum],
+        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
+          return std::max(lhs, rhs);
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
   Status HandleMaximum(HloInstruction* maximum) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[maximum],
         ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
-          return std::fmax(lhs, rhs);
+          return ((lhs >= rhs) || std::isnan(lhs)) ? lhs : rhs;
         }));
     return Status::OK();
   }
@@ -609,18 +682,30 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleMaximum<ElementwiseT>(maximum);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleMinimum(HloInstruction* minimum) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum],
                         ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
                                                         ElementwiseT rhs_el) {
-                          return std::fmin(lhs_el, rhs_el);
+                          return std::min(lhs_el, rhs_el);
                         }));
     return Status::OK();
   }
 
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleMinimum(HloInstruction* minimum) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[minimum],
+        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
+                                        ElementwiseT rhs_el) {
+          return ((lhs_el <= rhs_el) || std::isnan(lhs_el)) ? lhs_el : rhs_el;
+        }));
+    return Status::OK();
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
@@ -740,7 +825,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[shl],
         ElementWiseBinaryOp(shl, [](NativeT lhs_elem, NativeT rhs_elem) {
-          return lhs_elem << rhs_elem;
+          return IsShiftOutOfBounds<NativeT>(rhs_elem) ? 0
+                                                       : (lhs_elem << rhs_elem);
         }));
     return Status::OK();
   }
@@ -765,8 +851,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[shr],
         ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) {
-          return static_cast<NativeT>(static_cast<SignedT>(lhs_elem) >>
-                                      rhs_elem);
+          SignedT lhs_signed = static_cast<SignedT>(lhs_elem);
+          if (IsShiftOutOfBounds<NativeT>(rhs_elem)) {
+            return lhs_signed < 0 ? static_cast<SignedT>(-1) : 0;
+          } else {
+            return lhs_signed >> rhs_elem;
+          }
         }));
     return Status::OK();
   }
@@ -793,7 +883,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
         parent_->evaluated_[shr],
         ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) {
           // If shift amount is greater than the number of bits, then return 0.
-          if (rhs_elem >= sizeof(UnsignedT) * CHAR_BIT) {
+          if (IsShiftOutOfBounds<NativeT>(rhs_elem)) {
             return static_cast<NativeT>(0);
           }
           return static_cast<NativeT>(static_cast<UnsignedT>(lhs_elem) >>
@@ -820,7 +910,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
         clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
-          return std::fmax(low, std::fmin(value, high));
+          return std::fmin(high, std::fmax(value, low));
         };
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[clamp],
@@ -841,6 +931,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleSelect(HloInstruction* select) override {
+    CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape()));
     CHECK(!ShapeUtil::IsTuple(select->shape()));
     std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
         [](bool pred, ReturnT on_true, ReturnT on_false) {
@@ -871,8 +962,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     auto result = Literal::CreateFromShape(result_shape);
 
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> out_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> out_index) {
           std::vector<int64> from_index(out_index.begin(), out_index.end());
           for (const int64 dim : reverse_dimensions) {
             from_index[dim] = result_shape.dimensions(dim) - 1 - out_index[dim];
@@ -923,18 +1014,6 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
-    // Dimension number applicable for input (lhs).
-    const int64 input_batch_dim = dnums.input_batch_dimension();
-    const int64 input_z_dim = dnums.input_feature_dimension();
-    // Dimension number applicable for kernel (rhs).
-    const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
-    const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
-    // Dimension number applicable for output.
-    const int64 output_batch_dim = dnums.output_batch_dimension();
-    const int64 output_z_dim = dnums.output_feature_dimension();
-
-    const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
-
     std::vector<int64> window_dimension_sizes;
     for (auto i : dnums.kernel_spatial_dimensions()) {
       window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i));
@@ -943,25 +1022,43 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Shape& window_shape =
         ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes);
 
-    DimensionVector lhs_index(lhs_rank);
-    DimensionVector rhs_index(rhs_rank);
-    DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
+    DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
+    DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
 
-    auto func = [&](tensorflow::gtl::ArraySlice<int64> out_index) {
-      ElementwiseT result_val = static_cast<ElementwiseT>(0);
+    auto lhs_literal_data = lhs_literal.data<ReturnT>();
+    auto rhs_literal_data = rhs_literal.data<ReturnT>();
+
+    auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
+                 &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
+                 rhs_literal_data](ArraySlice<int64> out_index) {
+      // Dimension number applicable for input (lhs).
+      const int64 input_batch_dim = dnums.input_batch_dimension();
+      const int64 input_z_dim = dnums.input_feature_dimension();
+      // Dimension number applicable for kernel (rhs).
+      const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
+      const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
+      // Dimension number applicable for output.
+      const int64 output_batch_dim = dnums.output_batch_dimension();
+      const int64 output_z_dim = dnums.output_feature_dimension();
 
-      std::fill(lhs_index.begin(), lhs_index.end(), 0);
-      std::fill(rhs_index.begin(), rhs_index.end(), 0);
-      std::fill(rhs_spatial_index.begin(), rhs_spatial_index.end(), 0);
+      const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
 
-      lhs_index[input_batch_dim] = out_index[output_batch_dim];
-      rhs_index[kernel_output_z_dim] = out_index[output_z_dim];
+      ElementwiseT result_val = static_cast<ElementwiseT>(0);
+      DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
+                                        0);
 
       // Convolve input feature with kernel.
       do {
         for (int64 iz = 0; iz < z_size; ++iz) {
-          lhs_index[input_z_dim] = iz;
-          rhs_index[kernel_input_z_dim] = iz;
+          int64 lhs_linear_index = 0;
+          lhs_linear_index += out_index[output_batch_dim] *
+                              lhs_dim_multipliers[input_batch_dim];
+          lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
+
+          int64 rhs_linear_index = 0;
+          rhs_linear_index += out_index[output_z_dim] *
+                              rhs_dim_multipliers[kernel_output_z_dim];
+          rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim];
 
           // Find corresponding spatial dimension index for input (lhs).
           for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
@@ -986,29 +1083,32 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
             // Calculate the actual lhs (input) index after dilation.  As an
             // optimization, skip this integer divide if there's no dilation.
+            int64 lhs_spatial_index;
             if (window_dim.base_dilation() > 1) {
-              lhs_index[input_spatial_dim] =
-                  undilated_index / window_dim.base_dilation();
+              lhs_spatial_index = undilated_index / window_dim.base_dilation();
             } else {
-              lhs_index[input_spatial_dim] = undilated_index;
+              lhs_spatial_index = undilated_index;
             }
+            lhs_linear_index +=
+                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
 
-            // Skip if input index is not in bound.
-            if (!(lhs_index[input_spatial_dim] >= 0 &&
-                  lhs_index[input_spatial_dim] <
+            // Skip if input index is not in bounds.
+            if (!(lhs_spatial_index >= 0 &&
+                  lhs_spatial_index <
                       lhs_shape.dimensions(input_spatial_dim))) {
               goto cnt;
             }
 
-            rhs_index[dnums.kernel_spatial_dimensions(ki)] =
-                window_dim.window_reversal()
-                    ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                    : rhs_spatial_index[ki];
+            rhs_linear_index +=
+                (window_dim.window_reversal()
+                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                     : rhs_spatial_index[ki]) *
+                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
           }
 
           result_val +=
-              static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
-              static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+              static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
+              static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
         }
       cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
@@ -1017,7 +1117,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     };
 
     auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(func));
+    TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
 
     parent_->evaluated_[conv] = std::move(result);
     return Status::OK();
@@ -1069,9 +1169,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     }
 
     std::vector<int64> rhs_non_batch_non_contracting_dims;
-    tensorflow::gtl::FlatSet<int64> batch_dims_set(
-        dnums.rhs_batch_dimensions().begin(),
-        dnums.rhs_batch_dimensions().end());
+    FlatSet<int64> batch_dims_set(dnums.rhs_batch_dimensions().begin(),
+                                  dnums.rhs_batch_dimensions().end());
     for (int64 i = 0; i < rhs_rank; i++) {
       if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) {
         rhs_non_batch_non_contracting_dims.push_back(i);
@@ -1083,8 +1182,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     DimensionVector lhs_index(lhs_rank);
     DimensionVector rhs_index(rhs_rank);
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> result_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> result_index) {
           ElementwiseT result_val = static_cast<ElementwiseT>(0);
 
           // Find the corresponding non-contracting indices for lhs and rhs.
@@ -1178,9 +1277,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
         parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get<ReturnT>({});
     auto result = Literal::CreateFromShape(pad->shape());
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&scalar](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          return scalar;
-        }));
+        [&scalar](ArraySlice<int64> multi_index) { return scalar; }));
 
     const Literal& evaluated_operand =
         parent_->GetEvaluatedLiteralFor(pad->operand(0));
@@ -1193,7 +1290,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     // corresponding index of the resulting padded literal.
     const PaddingConfig& pad_config = pad->padding_config();
 
-    auto func = [&](const std::vector<int64>& input_index) {
+    auto func = [&](ArraySlice<int64> input_index) {
       for (auto i = 0; i < input_index.size(); ++i) {
         // Interior padding occurs logically before edge padding, so in the case
         // of negative edge padding elements are removed from the
@@ -1343,9 +1440,9 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     auto result = Literal::CreateFromShape(map->shape());
 
-    HloEvaluator embedded_evaluator;
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
           std::vector<std::unique_ptr<Literal>> arg_literals;
           arg_literals.reserve(operands.size());
 
@@ -1435,7 +1532,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleReduce(HloInstruction* reduce) override {
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
-    tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
+    ArraySlice<int64> dimensions(reduce->dimensions());
     HloComputation* function = reduce->to_apply();
     TF_RET_CHECK(ShapeUtil::Rank(reduce->shape()) ==
                  ShapeUtil::Rank(arg->shape()) - dimensions.size());
@@ -1467,21 +1564,19 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       arg_dim_counts[dim] = arg_dimensions[dim];
     }
 
-    // Create mapping from result index to arg index.
-    const int64 result_rank = ShapeUtil::Rank(result->shape());
-    int64 result_dim = 0;
-    std::vector<int64> result_to_arg_index(result_rank);
+    // Map each dimension in the result to a dimension in arg that isn't
+    // being reduced.
+    std::vector<int64> result_to_arg_index;
     for (int64 i = 0; i < arg_dimensions.size(); ++i) {
       if (arg_dim_steps[i] == 0) {
-        result_to_arg_index[result_dim] = i;
-        ++result_dim;
+        result_to_arg_index.push_back(i);
       }
     }
 
-    HloEvaluator embedded_evaluator;
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     // For each resulting dimension, calculate and assign computed value.
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
           ReturnT result_val = init_scalar;
 
           std::vector<int64> base(arg_dimensions.size());
@@ -1489,31 +1584,43 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             base[result_to_arg_index[i]] = multi_index[i];
           }
 
-          auto func = [&](const std::vector<int64>& input_index) {
+          // When the reduction is addition of floats, accumulate in a double
+          // for better precision. Also, avoid creating Literals for the
+          // intermediate results; it's much faster.
+          if (ShapeUtil::ElementIsFloating(init_literal.shape()) &&
+              IsScalarAdd(function)) {
+            double computed_result = 0;
+            auto func = [&](ArraySlice<int64> input_index) {
+              computed_result += arg_literal.Get<float>(input_index);
+              return true;
+            };
+            ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
+                                    arg_dim_steps, func);
+            return static_cast<ReturnT>(computed_result);
+          }
+          auto func = [&](ArraySlice<int64> input_index) {
             auto curr_val = arg_literal.Get<ReturnT>(input_index);
 
             // Evaluate computation with specified literal operands.
             auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
             auto result_val_literal = Literal::CreateR0<ReturnT>(result_val);
-            std::vector<const Literal*> args = {curr_val_literal.get(),
-                                                result_val_literal.get()};
+            std::vector<const Literal*> args = {result_val_literal.get(),
+                                                curr_val_literal.get()};
 
             std::unique_ptr<Literal> computed_result =
                 embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
-            // Clear visit states so that the we can use the evaluate again on
+            // Clear visit states so that we can use the evaluator again on
             // the same computation.
             embedded_evaluator.ResetVisitStates();
-
             // Assign computed result to result_val.
             result_val = computed_result->Get<ReturnT>({});
-
             return true;
           };
-
+          // Computes one element of the result, reducing all dimensions that
+          // contribute to that element.
           ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
                                   arg_dim_steps, func);
-
           return result_val;
         }));
 
@@ -1521,6 +1628,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  bool IsScalarAdd(HloComputation* computation) {
+    HloInstruction* instruction = computation->root_instruction();
+    if (instruction->opcode() == HloOpcode::kAdd &&
+        computation->num_parameters() == 2) {
+      const HloInstruction* lhs = instruction->operand(0);
+      const HloInstruction* rhs = instruction->operand(1);
+      return lhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(lhs->shape()) &&
+             rhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs;
+    }
+    return false;
+  }
+
   Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
     auto operand = select_and_scatter->operand(0);
     auto source = select_and_scatter->operand(1);
@@ -1535,9 +1656,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     // Initialize result array with the init value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
-          return init_scalar;
-        }));
+        [&](ArraySlice<int64> output_index) { return init_scalar; }));
 
     std::vector<int64> window_dimension_sizes;
     for (const auto& window_dimension : window.dimensions()) {
@@ -1554,7 +1673,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     int64 rank = ShapeUtil::Rank(operand_literal.shape());
 
-    HloEvaluator embedded_evaluator;
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     DimensionVector source_index(rank);
 
     std::fill(source_index.begin(), source_index.end(), 0);
@@ -1570,8 +1689,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       // 2. Using the selected index, scatter value from `source` to result. We
       // do this by iterating through the window, and compare each index with
       // the selected index.
-      tensorflow::gtl::optional<ReturnT> selected_val;
-      tensorflow::gtl::optional<std::vector<int64>> selected_index;
+      optional<ReturnT> selected_val;
+      optional<std::vector<int64>> selected_index;
 
       IterateThroughWindow(
           window_shape, window, operand_literal.shape(), source_index,
@@ -1586,11 +1705,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 Literal::CreateR0<ReturnT>(*selected_val);
 
             const std::vector<const Literal*> args = {
-                curr_val_literal.get(), selected_val_literal.get()};
+                selected_val_literal.get(), curr_val_literal.get()};
             std::unique_ptr<Literal> computed_result =
                 embedded_evaluator.Evaluate<const Literal*>(*select, args)
                     .ConsumeValueOrDie();
-            bool selected = computed_result->Get<bool>({});
+            bool selected = !computed_result->Get<bool>({});
             if (selected) {
               selected_val = curr_val;
               selected_index = operand_index;
@@ -1665,10 +1784,10 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector window_index(window.dimensions_size());
     DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
 
-    HloEvaluator embedded_evaluator;
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     // For each resulting dimension, calculate and assign computed value.
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> output_index) {
           ReturnT result_val = init_scalar;
 
           std::fill(window_index.begin(), window_index.end(), 0);
@@ -1685,7 +1804,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 const auto result_val_literal =
                     Literal::CreateR0<ReturnT>(result_val);
                 const std::vector<const Literal*> args = {
-                    curr_val_literal.get(), result_val_literal.get()};
+                    result_val_literal.get(), curr_val_literal.get()};
                 std::unique_ptr<Literal> computed_result =
                     embedded_evaluator.Evaluate<const Literal*>(*function, args)
                         .ConsumeValueOrDie();
@@ -1718,7 +1837,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     const int64 rank = ShapeUtil::Rank(operand->shape());
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    auto func = [&](tensorflow::gtl::ArraySlice<int64> out_index) {
+    auto func = [&](ArraySlice<int64> out_index) {
       DimensionVector operand_index(rank);
       for (int64 i = 0; i < rank; ++i) {
         operand_index[i] =
@@ -1734,6 +1853,34 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  // Enable CLZ only for int32 and uint32.
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          (std::is_floating_point<NativeT>::value ||
+           std::is_integral<NativeT>::value || is_complex_t<NativeT>::value) &&
+          !(std::is_same<NativeT, uint32>::value ||
+            std::is_same<NativeT, int32>::value)>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    return InvalidArgument("Unsupported type for Clz");
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, uint32>::value ||
+                std::is_same<NativeT, int32>::value>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz],
+                        ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) {
+                          return 31 - tensorflow::Log2Floor(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleClz(HloInstruction* clz) override {
+    return HandleClz<ElementwiseT>(clz);
+  }
+
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleSin(HloInstruction* sin) {
@@ -1899,8 +2046,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     std::vector<int64> operand_indices(start.size());
 
     auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
           for (int64 i = 0; i < operand_indices.size(); ++i) {
             CHECK_GE(multi_index[i] + start[i], 0);
             // Mod is only used here to be consistent with the existing
@@ -1920,17 +2067,26 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   StatusOr<std::unique_ptr<Literal>> DynamicUpdateSlice(
       const Literal& operand_literal, const Literal& update_literal,
       const Literal& start_indices_literal) {
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    const std::vector<int64> start(start_indices_typed.begin(),
-                                   start_indices_typed.end());
-
     auto result = operand_literal.CloneToUnique();
-    std::vector<int64> result_index(ShapeUtil::Rank(result->shape()), 0);
+    auto start_indices_typed = start_indices_literal.data<IndexT>();
+    const auto rank = ShapeUtil::Rank(result->shape());
+    std::vector<int64> start(rank, 0);
+    for (int64 i = 0; i < rank; ++i) {
+      // All other implementations currently wrap-around the index, so this
+      // should do so as well.
+      start[i] = (start_indices_typed[i] % result->shape().dimensions(i));
+      start[i] += (start[i] < 0) * result->shape().dimensions(i);
+    }
+    std::vector<int64> result_index(rank, 0);
 
-    auto func = [&](const std::vector<int64>& update_index) {
+    auto func = [&](ArraySlice<int64> update_index) {
       std::transform(update_index.begin(), update_index.end(), start.begin(),
                      result_index.begin(), std::plus<int64>());
-
+      // Same as above, wrap-around only to match other implementations'
+      // semantics.
+      std::transform(result_index.begin(), result_index.end(),
+                     result->shape().dimensions().begin(), result_index.begin(),
+                     std::modulus<int64>());
       result->Set<ReturnT>(result_index,
                            update_literal.Get<ReturnT>(update_index));
       return true;
@@ -1983,8 +2139,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     auto result = Literal::CreateFromShape(shape);
 
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
           return ConvertBinaryFunction(binary_op)(
               lhs_literal.Get<ReturnT>(multi_index),
               rhs_literal.Get<ReturnT>(multi_index));
@@ -2021,8 +2177,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     auto result = Literal::CreateFromShape(shape);
 
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+    TF_RETURN_IF_ERROR(
+        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
           return ternary_op(lhs_literal.Get<LhsType>(multi_index),
                             rhs_literal.Get<RhsType>(multi_index),
                             ehs_literal.Get<EhsType>(multi_index));
@@ -2031,20 +2187,31 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return std::move(result);
   }
 
+  template <typename NativeT>
+  static bool IsShiftOutOfBounds(NativeT rhs) {
+    typedef typename std::make_unsigned<NativeT>::type UnsignedT;
+    UnsignedT lhs_size_unsigned = sizeof(NativeT) * CHAR_BIT;
+    UnsignedT rhs_unsigned = static_cast<UnsignedT>(rhs);
+    return rhs_unsigned >= lhs_size_unsigned;
+  }
+
   HloEvaluator* parent_;
 };  // class HloEvaluator::TypedVisitor
 
-HloEvaluator::HloEvaluator() {
+HloEvaluator::HloEvaluator(int64 max_loop_iterations)
+    : max_loop_iterations_(max_loop_iterations) {
   typed_visitors_[PRED] = MakeUnique<TypedVisitor<bool>>(this);
   typed_visitors_[U8] = MakeUnique<TypedVisitor<uint8>>(this);
   typed_visitors_[U16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented("HloEvaluator: unhandled primitive type: U16.");
+    return Unimplemented(
+        "HloEvaluator::TypedVisitor: unhandled primitive type: U16.");
   });
   typed_visitors_[U32] = MakeUnique<TypedVisitor<uint32>>(this);
   typed_visitors_[U64] = MakeUnique<TypedVisitor<uint64>>(this);
   typed_visitors_[S8] = MakeUnique<TypedVisitor<int8>>(this);
   typed_visitors_[S16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented("HloEvaluator: unhandled primitive type: S16.");
+    return Unimplemented(
+        "HloEvaluator::TypedVisitor: unhandled primitive type: S16.");
   });
   typed_visitors_[S32] = MakeUnique<TypedVisitor<int32>>(this);
   typed_visitors_[S64] = MakeUnique<TypedVisitor<int64>>(this);
@@ -2058,18 +2225,20 @@ HloEvaluator::HloEvaluator() {
   // elementwise computations to be done in F32 and do BF16<->F32 conversion
   // around the input and the output of the computations.
   typed_visitors_[BF16] = MakeUnique<TypedVisitor<bfloat16, float>>(this);
+
   typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented("HloEvaluator: unhandled primitive type: TUPLE.");
+    return Unimplemented(
+        "HloEvaluator::TypedVistor: unhandled primitive type: TUPLE.");
   });
   typed_visitors_[OPAQUE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented("HloEvaluator: unhandled primitive type: OPAQUE.");
+    return Unimplemented(
+        "HloEvaluator::TypedVisitor: unhandled primitive type: OPAQUE.");
   });
 }
 
 template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
-    const HloModule& module,
-    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
+    const HloModule& module, ArraySlice<LiteralPtr> arg_literals) {
   XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString());
 
   evaluated_.clear();
@@ -2086,8 +2255,8 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
 
 template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
-    const HloComputation& computation,
-    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
+    const HloComputation& computation, ArraySlice<LiteralPtr> arg_literals) {
+  CHECK(computation.parent() != nullptr);
   XLA_VLOG_LINES(
       2, "HloEvaluator::Evaluate computation:\n" + computation.ToString());
 
@@ -2103,8 +2272,7 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
 
 template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
-    HloInstruction* instruction,
-    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
+    HloInstruction* instruction, ArraySlice<LiteralPtr> arg_literals) {
   TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape()));
 
@@ -2229,8 +2397,7 @@ Status HloEvaluator::HandleTranspose(HloInstruction* transpose) {
 }
 
 Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(
-      concatenate->operands());
+  ArraySlice<HloInstruction*> operands(concatenate->operands());
   // The result concatenate dimension is going to be the sum of all
   // concatenate dimensions of the operands taking part of the operation.
   const Shape& reference_shape = operands[0]->shape();
@@ -2369,6 +2536,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
     } break;
     case F16:
       return Unimplemented("unhandled primitive type: F16.");
+    case BF16: {
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<bfloat16>(compare->shape(), opcode,
+                                            lhs_literal, rhs_literal));
+    } break;
     case F32: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
@@ -2402,6 +2574,349 @@ Status HloEvaluator::HandleTuple(HloInstruction* tuple) {
   return Status::OK();
 }
 
+// Returns an ShapeUtil::IndexIterationSpace that iterates over the output
+// gather dimensions while keeping the rest of the output dimensions clamped to
+// 0.
+ShapeUtil::IndexIterationSpace IterationSpaceForOutputGatherIndices(
+    const Shape& output_shape, const GatherDimensionNumbers& dim_numbers) {
+  int64 output_rank = output_shape.dimensions_size();
+  std::vector<int64> index_base(output_rank, 0);
+  std::vector<int64> index_count;
+  index_count.reserve(output_rank);
+  for (int64 i = 0; i < output_rank; i++) {
+    bool is_output_gather_dim =
+        !c_binary_search(dim_numbers.output_window_dims(), i);
+    index_count.push_back(is_output_gather_dim ? output_shape.dimensions(i)
+                                               : 1);
+  }
+
+  return {std::move(index_base), std::move(index_count),
+          std::vector<int64>(output_rank, 1)};
+}
+
+// Return an ShapeUtil::IndexIterationSpace that iterates over the output window
+// dimensions while keeping the rest of the output dimensions clamped to 0.
+ShapeUtil::IndexIterationSpace IterationSpaceForOutputWindowIndices(
+    int64 output_rank, ArraySlice<int64> window_bounds,
+    const GatherDimensionNumbers& dim_numbers) {
+  std::vector<int64> index_base(output_rank, 0);
+  std::vector<int64> index_count(output_rank, 1);
+  int64 window_bounds_idx = 0;
+  for (int64 i = 0; i < output_rank; i++) {
+    bool is_output_window_dim =
+        c_binary_search(dim_numbers.output_window_dims(), i);
+    if (is_output_window_dim) {
+      while (c_binary_search(dim_numbers.elided_window_dims(),
+                             window_bounds_idx)) {
+        window_bounds_idx++;
+      }
+      index_count[i] = window_bounds[window_bounds_idx++];
+    }
+  }
+
+  return {std::move(index_base), std::move(index_count),
+          std::vector<int64>(output_rank, 1)};
+}
+
+// This functor computes the contribution of gather_indices to an input index
+// corresponding to an output index.  That is, given an output index I, it picks
+// out the gather output indices in I and uses them to look up a gather index,
+// G, from the gather indices tensor, and expands G into the input space
+// according to gather_dims_to_operand_dims.
+class OutputGatherIndexToInputIndex {
+ public:
+  // The constructor does some setup work that is amortized across all
+  // iterations.
+  explicit OutputGatherIndexToInputIndex(
+      const GatherDimensionNumbers* dim_numbers, const Shape& input_shape,
+      const Shape& output_shape, const Literal* gather_indices)
+      : dim_numbers_(*dim_numbers), gather_indices_(*gather_indices) {
+    for (int64 i = 0; i < output_shape.dimensions_size(); i++) {
+      output_dim_is_gather_dims_.push_back(
+          !c_binary_search(dim_numbers_.output_window_dims(), i));
+    }
+
+    for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
+      int64 index_of_input_dim_in_index_vector =
+          std::distance(dim_numbers_.gather_dims_to_operand_dims().begin(),
+                        c_find(dim_numbers_.gather_dims_to_operand_dims(), i));
+      if (index_of_input_dim_in_index_vector ==
+          dim_numbers_.gather_dims_to_operand_dims_size()) {
+        input_dim_value_to_index_vector_.push_back(-1);
+      } else {
+        input_dim_value_to_index_vector_.push_back(
+            index_of_input_dim_in_index_vector);
+      }
+    }
+
+    index_vector_index_.resize(gather_indices_.shape().dimensions_size());
+    input_index_.resize(input_shape.dimensions_size());
+    int64 index_vector_size =
+        gather_indices_.shape().dimensions(dim_numbers_.index_vector_dim());
+    index_vector_.resize(index_vector_size);
+  }
+
+  // Returns the contribution of gather_indices to the input index corresponding
+  // to output_index.  See gather_inner_loop_body.
+  //
+  // This is conceptually  a stateless transformation from output_index to the
+  // gather input index, but:
+  //
+  //  - Instead of allocating memory to represent the gather input index on
+  //    every invocation we reuse the same storage for the result
+  //    (input_index_), mutating it in place.
+  //  - Instead of allocating buffers for temporary values like
+  //    index_vector_index_ and index_vector on every invocation, we reuse the
+  //    same storage for all invocations.
+  //
+  // This returns an arrayslice into memory owned by the class.
+  StatusOr<ArraySlice<int64>> operator()(ArraySlice<int64> output_index) {
+    PropagateOutputIndexGatherDimsToIndexVectorIndex(output_index);
+    TF_RETURN_IF_ERROR(FetchIndexVector());
+    PropagateIndexVectorToInputIndex();
+    return ArraySlice<int64>(input_index_);
+  }
+
+ private:
+  // Propagates the gather index dimensions from the output index into
+  // index_vector_index_ by mutating index_vector_index_ in place.  Does not
+  // update the dim_numbers.index_vector_dim() dimension -- that's the dimension
+  // we iterate over in FetchIndexVector.
+  void PropagateOutputIndexGatherDimsToIndexVectorIndex(
+      ArraySlice<int64> output_index) {
+    int64 index_vector_index_i = 0;
+    for (int64 i = 0, e = output_index.size(); i < e; i++) {
+      if (!output_dim_is_gather_dims_[i]) {
+        continue;
+      }
+
+      if (index_vector_index_i == dim_numbers_.index_vector_dim()) {
+        index_vector_index_i++;
+      }
+
+      index_vector_index_[index_vector_index_i++] = output_index[i];
+    }
+  }
+
+  // Populates index_vector_ by iterating over gather_indices_ according to
+  // index_vector_index_.
+  Status FetchIndexVector() {
+    int64 index_vector_dim = dim_numbers_.index_vector_dim();
+    for (int64 i = 0, e = index_vector_.size(); i < e; i++) {
+      index_vector_index_[index_vector_dim] = i;
+      TF_ASSIGN_OR_RETURN(index_vector_[i], gather_indices_.GetIntegralAsS64(
+                                                index_vector_index_));
+    }
+    return Status::OK();
+  }
+
+  // Populates input_index_.
+  void PropagateIndexVectorToInputIndex() {
+    for (int64 i = 0, e = input_index_.size(); i < e; i++) {
+      if (input_dim_value_to_index_vector_[i] != -1) {
+        input_index_[i] = index_vector_[input_dim_value_to_index_vector_[i]];
+      }
+
+      // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i]
+      // remains 0, as set by the constructor.
+    }
+  }
+
+  // input_dim_value_to_index_vector_[i] tells us how to compute dimension i of
+  // the input index from the index vector.  See
+  // PropagateIndexVectorToInputIndex.
+  std::vector<int64> input_dim_value_to_index_vector_;
+
+  // output_dim_is_gather_dims_[i] is true iff the output index i is a gather
+  // dimension.
+  std::vector<bool> output_dim_is_gather_dims_;
+
+  // The buffer into which we construct an index into gather_indices_ to fetch
+  // the index vector.
+  std::vector<int64> index_vector_index_;
+
+  // The index vector fetched from gather_indices_.
+  std::vector<int64> index_vector_;
+
+  // The result computed by this functor.  operator() returns an ArraySlice into
+  // this vector.
+  std::vector<int64> input_index_;
+
+  const GatherDimensionNumbers& dim_numbers_;
+  const Literal& gather_indices_;
+};
+
+// This functor computes the contribution of the window indices in an output
+// index to an input index.  That is, given an output index I it picks out the
+// output window indices in I and expands it into a window index into the input
+// shape.
+class OutputWindowIndexToInputIndex {
+ public:
+  // The constructor does some setup work that is amortized across all
+  // iterations.
+  explicit OutputWindowIndexToInputIndex(
+      const GatherDimensionNumbers& dim_numbers, const Shape& input_shape,
+      const Shape& output_shape) {
+    std::vector<int64> window_index_to_output_index;
+    int64 output_index_count = 0;
+    for (int64 i = 0; i < output_shape.dimensions_size(); i++) {
+      if (c_binary_search(dim_numbers.output_window_dims(), i)) {
+        window_index_to_output_index.push_back(output_index_count++);
+      } else {
+        output_index_count++;
+      }
+    }
+
+    int64 window_dim_count = 0;
+    for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
+      if (c_binary_search(dim_numbers.elided_window_dims(), i)) {
+        input_dim_value_to_output_index_.push_back(-1);
+      } else {
+        input_dim_value_to_output_index_.push_back(
+            window_index_to_output_index[window_dim_count++]);
+      }
+    }
+
+    input_index_.resize(input_shape.dimensions_size());
+  }
+
+  // Returns the contribution of the window indices to the input index
+  // corresponding to output_index.  See gather_inner_loop_body.
+  //
+  // This is conceptually a stateless transformation from output_index to the
+  // window input index, but instead of allocating memory to represent the
+  // gather input index on every invocation we reuse the same storage for the
+  // result (input_index_), mutating it in place.
+  //
+  // This returns an arrayslice into memory owned by the class.
+  StatusOr<ArraySlice<int64>> operator()(ArraySlice<int64> output_index) {
+    PropagateOutputIndexWindowDimsToInputIndex(output_index);
+    return ArraySlice<int64>(input_index_);
+  }
+
+ private:
+  // Propagates window dimensions from the output index to input_index_ by
+  // mutating input_index_ in place.
+  void PropagateOutputIndexWindowDimsToInputIndex(
+      ArraySlice<int64> output_index) {
+    for (int64 i = 0, e = input_index_.size(); i < e; i++) {
+      if (input_dim_value_to_output_index_[i] != -1) {
+        input_index_[i] = output_index[input_dim_value_to_output_index_[i]];
+      }
+
+      // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i]
+      // remains 0, as set by the constructor.
+    }
+  }
+
+  // input_dim_value_to_index_vector_[i] tells us how to compute dimension i of
+  // the input index from the output index. See
+  // PropagateOutputIndexToInputIndex.
+  std::vector<int64> input_dim_value_to_output_index_;
+
+  // The result computed by this functor.  operator() returns an ArraySlice into
+  // this vector.
+  std::vector<int64> input_index_;
+};
+
+// Rehapes the gather indices input to have a trailing degenerate `1` dimension
+// if necessary.  Hands over the ownership of the newly created literal (if
+// there is one) to `reshaped_gather_indices`.
+static StatusOr<std::reference_wrapper<const Literal>> ReshapedGatherIndices(
+    int64 index_vector_dim, const Literal& gather_indices,
+    std::unique_ptr<Literal>* reshaped_gather_indices) {
+  if (gather_indices.shape().dimensions_size() != index_vector_dim) {
+    return std::cref(gather_indices);
+  }
+
+  std::vector<int64> new_shape(gather_indices.shape().dimensions().begin(),
+                               gather_indices.shape().dimensions().end());
+  new_shape.push_back(1);
+  TF_ASSIGN_OR_RETURN(*reshaped_gather_indices,
+                      gather_indices.Reshape(new_shape));
+  return std::cref(**reshaped_gather_indices);
+}
+
+Status HloEvaluator::HandleGather(HloInstruction* gather) {
+  std::unique_ptr<Literal> result = Literal::CreateFromShape(gather->shape());
+  const Shape& shape = gather->shape();
+  const GatherDimensionNumbers& dim_numbers =
+      gather->gather_dimension_numbers();
+  const Literal& operand = GetEvaluatedLiteralFor(gather->operand(0));
+  std::unique_ptr<Literal> reshaped_gather_indices;
+  TF_ASSIGN_OR_RETURN(
+      const Literal& gather_indices,
+      ReshapedGatherIndices(dim_numbers.index_vector_dim(),
+                            GetEvaluatedLiteralFor(gather->operand(1)),
+                            &reshaped_gather_indices));
+
+  // We iterate over the gather dimensions in the output shape in an outer loop
+  // nest, and iterate over the window dimensions in the output shape in an
+  // inner loop nest.
+
+  ShapeUtil::IndexIterationSpace gather_indices_iteration_space =
+      IterationSpaceForOutputGatherIndices(shape, dim_numbers);
+  ShapeUtil::IndexIterationSpace window_indices_iteration_space =
+      IterationSpaceForOutputWindowIndices(
+          shape.dimensions_size(), gather->gather_window_bounds(), dim_numbers);
+
+  // Scratch buffers that hold an index in the output shape and the
+  // corresponding index in the input shape.
+  std::vector<int64> input_index(operand.shape().dimensions_size());
+  std::vector<int64> output_index(gather->shape().dimensions_size());
+
+  OutputGatherIndexToInputIndex output_gather_index_to_input_index(
+      &gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
+      /*output_shape=*/shape, &gather_indices);
+  OutputWindowIndexToInputIndex output_window_index_to_input_index(
+      gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
+      /*output_shape=*/shape);
+
+  const Shape& operand_shape = operand.shape();
+
+  auto gather_inner_loop_body =
+      [&](ArraySlice<int64> output_window_index,
+          ArraySlice<int64> input_gather_index,
+          ArraySlice<int64> output_gather_index) -> StatusOr<bool> {
+    TF_ASSIGN_OR_RETURN(
+        ArraySlice<int64> input_window_index,
+        output_window_index_to_input_index(output_window_index));
+    for (int i = 0, e = output_index.size(); i < e; i++) {
+      output_index[i] = output_gather_index[i] + output_window_index[i];
+      DCHECK_LT(output_index[i], shape.dimensions(i));
+    }
+    for (int i = 0, e = input_index.size(); i < e; i++) {
+      // TODO(b/74360564): We should implement whatever out of bounds behavior
+      // we decide for dynamic-slice here as well.
+      input_index[i] = (input_gather_index[i] + input_window_index[i]) %
+                       operand_shape.dimensions(i);
+      if (input_index[i] < 0) {
+        input_index[i] += operand_shape.dimensions(i);
+      }
+    }
+    TF_RETURN_IF_ERROR(
+        result->CopyElementFrom(operand, input_index, output_index));
+    return true;
+  };
+
+  auto gather_outer_loop_body =
+      [&](ArraySlice<int64> output_gather_index) -> StatusOr<bool> {
+    TF_ASSIGN_OR_RETURN(
+        ArraySlice<int64> input_gather_index,
+        output_gather_index_to_input_index(output_gather_index));
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
+        shape, window_indices_iteration_space,
+        std::bind(gather_inner_loop_body, std::placeholders::_1,
+                  input_gather_index, output_gather_index)));
+    return true;
+  };
+
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
+      shape, gather_indices_iteration_space, gather_outer_loop_body));
+  evaluated_[gather] = std::move(result);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   const auto result_shape = get_tuple_element->shape();
   const int64 index = get_tuple_element->tuple_index();
@@ -2432,6 +2947,136 @@ Status HloEvaluator::HandleCopy(HloInstruction* copy) {
   return Status::OK();
 }
 
+Status HloEvaluator::HandleCall(HloInstruction* call) {
+  auto* computation = call->to_apply();
+  auto operands = call->operands();
+
+  std::vector<const Literal*> arg_literals;
+  arg_literals.reserve(operands.size());
+  for (auto operand : operands) {
+    const Literal& arg_literal = GetEvaluatedLiteralFor(operand);
+    arg_literals.push_back(&arg_literal);
+  }
+
+  HloEvaluator embedded_evaluator;
+  std::unique_ptr<Literal> result =
+      embedded_evaluator.Evaluate<const Literal*>(*computation, arg_literals)
+          .ConsumeValueOrDie();
+
+  evaluated_[call] = std::move(result);
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
+  HloModuleConfig config;
+  // Attach cloned computation to an empty HLO module so the existing ones are
+  // not modified.
+  HloModule empty_hlo_module("EmptyModuleForFusion", config);
+  auto cloned_fused_computation =
+      fusion->fused_instructions_computation()->Clone(
+          /*suffix=*/"clone_with_layout", &empty_hlo_module);
+  for (auto* instruction : cloned_fused_computation->instructions()) {
+    LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+  }
+  auto readded_computation =
+      empty_hlo_module.AddEntryComputation(std::move(cloned_fused_computation));
+
+  auto operands = fusion->operands();
+  std::vector<const Literal*> arg_literals;
+  arg_literals.reserve(operands.size());
+  for (auto operand : operands) {
+    const Literal& arg_literal = GetEvaluatedLiteralFor(operand);
+    arg_literals.push_back(&arg_literal);
+  }
+
+  HloEvaluator embedded_evaluator;
+  std::unique_ptr<Literal> result =
+      embedded_evaluator
+          .Evaluate<const Literal*>(*readded_computation, arg_literals)
+          .ConsumeValueOrDie();
+
+  evaluated_[fusion] = std::move(result);
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleConditional(HloInstruction* conditional) {
+  const auto& pred = GetEvaluatedLiteralFor(conditional->operand(0));
+  const auto& true_computation_arg =
+      GetEvaluatedLiteralFor(conditional->operand(1));
+  const auto& false_computation_arg =
+      GetEvaluatedLiteralFor(conditional->operand(2));
+
+  auto* true_computation = conditional->true_computation();
+  auto* false_computation = conditional->false_computation();
+
+  auto result = Literal::CreateFromShape(conditional->shape());
+  HloEvaluator embedded_evaluator;
+  if (pred.Get<bool>({})) {
+    result = embedded_evaluator
+                 .Evaluate<const Literal*>(*true_computation,
+                                           {&true_computation_arg})
+                 .ConsumeValueOrDie();
+  } else {
+    result = embedded_evaluator
+                 .Evaluate<const Literal*>(*false_computation,
+                                           {&false_computation_arg})
+                 .ConsumeValueOrDie();
+  }
+
+  evaluated_[conditional] = std::move(result);
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleSelect(HloInstruction* select) {
+  const auto& pred = GetEvaluatedLiteralFor(select->operand(0));
+  const auto& on_true = GetEvaluatedLiteralFor(select->operand(1));
+  const auto& on_false = GetEvaluatedLiteralFor(select->operand(2));
+
+  // If predicate is of scalar type, no element-wise selection would be needed.
+  // This would also handle output array of tuple types as the DefaultAction
+  // would go through the TypedVisitor which doesn't handle tuples.
+  if (ShapeUtil::IsScalar(pred.shape())) {
+    if (pred.Get<bool>({})) {
+      evaluated_[select] = on_true.CloneToUnique();
+    } else {
+      evaluated_[select] = on_false.CloneToUnique();
+    }
+    return Status::OK();
+  }
+
+  return DefaultAction(select);
+}
+
+Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
+  HloComputation* cond_comp = while_hlo->while_condition();
+  HloComputation* body_comp = while_hlo->while_body();
+  // Initialize the loop carried valued with the input to the While instruction.
+  auto lcv = GetEvaluatedLiteralFor(while_hlo->operand(0)).CloneToUnique();
+  bool keep_going = true;
+  int64 iteration_count = 0;
+  HloEvaluator cond_evaluator(max_loop_iterations_);
+  HloEvaluator loop_body_evaluator(max_loop_iterations_);
+  while (keep_going) {
+    if (max_loop_iterations_ >= 0 && iteration_count++ > max_loop_iterations_) {
+      return InvalidArgument("Loop %s exceeded loop iteration limit (%lld).",
+                             while_hlo->name().c_str(), max_loop_iterations_);
+    }
+    TF_ASSIGN_OR_RETURN(auto cond_val, cond_evaluator.Evaluate<Literal*>(
+                                           *cond_comp, {lcv.get()}));
+    keep_going = cond_val->GetFirstElement<bool>();
+    if (keep_going) {
+      TF_ASSIGN_OR_RETURN(auto body_val, loop_body_evaluator.Evaluate<Literal*>(
+                                             *body_comp, {lcv.get()}));
+      VLOG(3) << "Loop iteration result: " << body_val->ToString();
+      lcv = std::move(body_val);
+      cond_evaluator.ResetVisitStates();
+      loop_body_evaluator.ResetVisitStates();
+    }
+  }
+  evaluated_[while_hlo] = std::move(lcv);
+  return Status::OK();
+}
+
 Status HloEvaluator::Preprocess(HloInstruction* hlo) {
   VLOG(2) << "About to visit HLO: " << hlo->ToString();
   return Status::OK();
@@ -2445,28 +3090,27 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) {
 
 // Explicit instantiation of templatized Evaluate* methods.
 //
-template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
-    const Literal*>(const HloModule& module,
-                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<const Literal*>(const HloModule& module,
+                                       ArraySlice<const Literal*> arg_literals);
 template StatusOr<std::unique_ptr<Literal>>
 HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
-    const HloModule& module,
-    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+    const HloModule& module, ArraySlice<std::unique_ptr<Literal>> arg_literals);
 
-template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
-    const Literal*>(const HloComputation& computation,
-                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<const Literal*>(const HloComputation& computation,
+                                       ArraySlice<const Literal*> arg_literals);
 template StatusOr<std::unique_ptr<Literal>>
 HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
     const HloComputation& computation,
-    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+    ArraySlice<std::unique_ptr<Literal>> arg_literals);
 
-template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
-    const Literal*>(HloInstruction* instruction,
-                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<const Literal*>(HloInstruction* instruction,
+                                       ArraySlice<const Literal*> arg_literals);
 template StatusOr<std::unique_ptr<Literal>>
 HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
     HloInstruction* instruction,
-    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+    ArraySlice<std::unique_ptr<Literal>> arg_literals);
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 3b2b697e492a78a06a4e5ae6bf056ff8676f2ff5..c0dcee0c3e382f74de72a2b89f39e06f042e2b80 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -36,7 +36,10 @@ namespace xla {
 // This class is not thread-safe.
 class HloEvaluator : public DfsHloVisitorWithDefault {
  public:
-  HloEvaluator();
+  // Only evaluate up to max_loop_iterations per while-loop execution if
+  // specified.
+  explicit HloEvaluator(int64 max_loop_iterations = -1);
+
   // Evaluates an HLO module and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
   // Precondition: The indices of arg_literals correspond to the parameter
@@ -149,10 +152,22 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleTuple(HloInstruction* tuple) override;
 
+  Status HandleGather(HloInstruction* gather) override;
+
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
 
   Status HandleCopy(HloInstruction* copy) override;
 
+  Status HandleConditional(HloInstruction* conditional) override;
+
+  Status HandleCall(HloInstruction* call) override;
+
+  Status HandleFusion(HloInstruction* fusion) override;
+
+  Status HandleWhile(HloInstruction* while_hlo) override;
+
+  Status HandleSelect(HloInstruction* select) override;
+
  private:
   // Returns the already-evaluated literal result for the instruction.
   // A Constant instruction is considered evaluated and its literal will be
@@ -190,6 +205,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Must be cleared for each evaluation.
   std::vector<const Literal*> arg_literals_;
 
+  // Max loop iterations to execute with no maximum if negative.
+  int64 max_loop_iterations_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 97765d65909cee192f65069777f8f195081603b2..230147abfec10d84d704ba0d62a17d2fb031aa35 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1205,6 +1206,80 @@ TEST_P(HloEvaluatorTest,
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+
+// Tests that Reduce doesn't lose precision when adding many numbers (because
+// it accumulates its result in a double).
+TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
+  HloComputation::Builder b(TestName());
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  HloInstruction* init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module().AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  std::unique_ptr<Literal> result =
+      hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  LiteralTestUtil::ExpectR0Equal<float>(kNumElements, *result);
+}
+
+// Reducing many numbers should be fast because it doesn't create
+// intermediate Literals; the microbenchmark should finish in < 1 msec.
+void BM_ReducePrecisely(int num_iters) {
+  tensorflow::testing::StopTiming();
+  HloComputation::Builder b("BM_ReducePrecisely");
+  HloModuleConfig config;
+  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  HloModule module("BM_ReducePrecisely", VersionedComputationHandle(), config);
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  auto init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module.AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module.AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  tensorflow::testing::StartTiming();
+  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  tensorflow::testing::StopTiming();
+}
+
+BENCHMARK(BM_ReducePrecisely);
+
 TEST_P(HloEvaluatorTest, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
@@ -1729,6 +1804,223 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
                                *result.ValueOrDie());
 }
 
+TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherV1
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[2,3] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1, 3}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR2<int32>({{1, 3}, {4, 6}, {7, 9}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,3,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3, 1}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 2}, {2, 1}});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR3<int32>(
+          {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherNd
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1,2}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 0}, {1, 0}});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR2<int32>({{-1, 1}, {-4, 4}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest,
+       EvaluateGather_TensorFlowGatherNdNonDefaultIndexVectorDim) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherNd
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1,2}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 0}, {1, 0}});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR2<int32>({{-2, 2}, {-1, 1}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
+  const char* hlo_text = R"(
+HloModule DynamicSlice
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[1,1] gather(operand, indices),
+      output_window_dims={0,1},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({1, 1});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR2<int32>({{5}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
+  const char* hlo_text = R"(
+HloModule BatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,1,1] gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{2, 1}, {1, 1}});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR3<int32>({{{8}}, {{5}}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherV1
+
+ENTRY main {
+  operand = s32[3,0] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[2,0] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1, 0}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand = Literal::CreateR2<int32>({{}, {}, {}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  LiteralTestUtil::ExpectEqual(
+      *Literal::CreateR2<int32>({{}, {}}),
+      *Evaluate({operand.get(), gather_indices.get()}));
+}
+
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise comparison with 2 bfloat16 operands.
+TEST_P(HloEvaluatorTest, DoesCompareBF16) {
+  // lhs >= rhs
+  auto lhs = Literal::CreateR2<bfloat16>(
+      {{bfloat16(0.25), bfloat16(0.35), bfloat16(0.125)},
+       {bfloat16(-0.25), bfloat16(-0.35), bfloat16(-0.125)}});
+  auto rhs = Literal::CreateR2<bfloat16>(
+      {{bfloat16(0.5), bfloat16(0.125), bfloat16(0.125)},
+       {bfloat16(0.25), bfloat16(-0.375), bfloat16(-0.127)}});
+  auto expected =
+      Literal::CreateR2<bool>({{false, true, true}, {false, true, true}});
+  TestBinaryOp(HloOpcode::kGe, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+
 INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
                         ::testing::ValuesIn(use_bf16_params));
 
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index f0df93b61d29c1535d8a89fbd65e669de5b43729..c3ccbf0f0c75b569b49652807dea52faebdccc31 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -111,8 +111,8 @@ HloExecutionProfile::HloExecutionProfile(
     : hlo_profile_printer_data_(*hlo_profile_printer_data),
       hlo_profile_index_map_(*hlo_profile_index_map),
       profile_counters_(
-          /*count*/ hlo_profile_index_map_.total_count(),
-          /*value*/ 0) {}
+          /*count=*/hlo_profile_index_map_.total_count(),
+          /*value=*/0) {}
 
 void HloExecutionProfile::SetCyclesTakenBy(const HloInstruction* hlo,
                                            uint64 cycles_taken) {
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 6fb91b9bef9d1df82b8806ce79cc147823edeb3d..be989846ef5cd2645da88ac9bbfea9534dd47821 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -88,7 +88,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
 // down how much time each HLO took.
 class HloExecutionProfile {
  public:
-  using DeviceDescription = perftools::gputools::DeviceDescription;
+  using DeviceDescription = se::DeviceDescription;
 
   HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
                       const HloProfileIndexMap* hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 2861fec39ef0c92fdfbcee04584f9bd36d3cb4d8..516e14b4642ae6665a2d15c91715dc9b057ab41a 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -157,52 +157,60 @@ enum ColorScheme {
   kDashedBorder,
 };
 
+// Graphviz attributes/colors that make up a color scheme.
+struct NodeColors {
+  const char* style;
+  const char* fill_color;
+  const char* stroke_color;
+  const char* font_color;
+};
+
+NodeColors NodeColorsForScheme(ColorScheme color) {
+  switch (color) {
+    case kBlue:
+      return NodeColors{"filled", "#bbdefb", "#8aacc8", "black"};
+    case kBrown:
+      return NodeColors{"filled", "#bcaaa4", "#8c7b75", "black"};
+    case kDarkBlue:
+      return NodeColors{"filled", "#1565c0", "#003c8f", "white"};
+    case kDarkGreen:
+      return NodeColors{"filled", "#2e7d32", "#005005", "white"};
+    case kDarkRed:
+      return NodeColors{"filled", "#b71c1c", "#7f0000", "white"};
+    case kGray:
+      return NodeColors{"filled", "#cfd8dc", "#9ea7aa", "black"};
+    case kGreen:
+      return NodeColors{"filled", "#c8e6c9", "#97b498", "black"};
+    case kOrange:
+      return NodeColors{"filled", "#ffe0b2", "#cbae82", "black"};
+    case kPurple:
+      return NodeColors{"filled", "#e1bee7", "#af8eb5", "black"};
+    case kRed:
+      return NodeColors{"filled", "#ffcdd2", "#cb9ca1", "black"};
+    case kWhite:
+      return NodeColors{"filled", "white", "black", "black"};
+    case kYellow:
+      return NodeColors{"filled", "#fff9c4", "#cbc693", "black"};
+    case kDashedBorder:
+      // "filled,dashed" looks the same as "dashed", since we have a white
+      // background.  But we use "filled,dashed" so that when you hover over
+      // any part of the node (not just the text inside the node), our css
+      // :hover rule is triggered.
+      return NodeColors{"filled,dashed", "white", "#757575", "#757575"};
+  }
+}
+
 // Given a ColorScheme, returns an attribute string for a node of that color.
 // Sets the node's style and fill/stroke/text colors.
 //
 // Colors are from https://material.io/color.
 string NodeColorAttributes(ColorScheme color) {
-  using std::make_tuple;
-
-  const char *style, *fill_color, *stroke_color, *font_color;
-  std::tie(style, fill_color, stroke_color, font_color) = [color] {
-    switch (color) {
-      case kBlue:
-        return make_tuple("filled", "#bbdefb", "#8aacc8", "black");
-      case kBrown:
-        return make_tuple("filled", "#bcaaa4", "#8c7b75", "black");
-      case kDarkBlue:
-        return make_tuple("filled", "#1565c0", "#003c8f", "white");
-      case kDarkGreen:
-        return make_tuple("filled", "#2e7d32", "#005005", "white");
-      case kDarkRed:
-        return make_tuple("filled", "#b71c1c", "#7f0000", "white");
-      case kGray:
-        return make_tuple("filled", "#cfd8dc", "#9ea7aa", "black");
-      case kGreen:
-        return make_tuple("filled", "#c8e6c9", "#97b498", "black");
-      case kOrange:
-        return make_tuple("filled", "#ffe0b2", "#cbae82", "black");
-      case kPurple:
-        return make_tuple("filled", "#e1bee7", "#af8eb5", "black");
-      case kRed:
-        return make_tuple("filled", "#ffcdd2", "#cb9ca1", "black");
-      case kWhite:
-        return make_tuple("filled", "white", "black", "black");
-      case kYellow:
-        return make_tuple("filled", "#fff9c4", "#cbc693", "black");
-      case kDashedBorder:
-        // "filled,dashed" looks the same as "dashed", since we have a white
-        // background.  But we use "filled,dashed" so that when you hover over
-        // any part of the node (not just the text inside the node), our css
-        // :hover rule is triggered.
-        return make_tuple("filled,dashed", "white", "#757575", "#757575");
-    }
-  }();
+  NodeColors node_colors = NodeColorsForScheme(color);
 
   return Printf(
-      R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")", style,
-      font_color, stroke_color, fill_color);
+      R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")",
+      node_colors.style, node_colors.font_color, node_colors.stroke_color,
+      node_colors.fill_color);
 }
 
 // Replaces <> with &lt;&gt;, so that this string is safe(er) for use in a
@@ -604,11 +612,21 @@ tooltip = " ";
       StrAppend(&subcomp_label, "<br/>", extra_info);
     }
 
-    // Subcomputation's fill/stroke color is light/dark red/gray, depending on
-    // whether or not the subcomputation's fusion node is highlighted.
     bool highlight = filter_.Highlight(parent_instr);
-    const char* fillcolor = highlight ? "#ffcdd2" : "#f5f5f5";
-    const char* strokecolor = highlight ? "#b71c1c" : "#c2c2c2";
+    const char* fillcolor;
+    const char* strokecolor;
+    if (debug_options_.xla_hlo_graph_sharding_color() && !highlight) {
+      // Use the sharding color, if the node isn't highlighted.
+      NodeColors node_colors =
+          NodeColorsForScheme(GetInstructionColor(parent_instr));
+      fillcolor = node_colors.fill_color;
+      strokecolor = node_colors.stroke_color;
+    } else {
+      // Subcomputation's fill/stroke color is light/dark red/gray, depending on
+      // whether or not the subcomputation's fusion node is highlighted.
+      fillcolor = highlight ? "#ffcdd2" : "#f5f5f5";
+      strokecolor = highlight ? "#b71c1c" : "#c2c2c2";
+    }
     style =
         Printf(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s;")",
                fillcolor, strokecolor);
@@ -782,6 +800,14 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
   auto stringify_constant = [](const HloInstruction* constant) {
     const auto& shape = constant->shape();
 
+    // If the shape has a dimension of size zero, print it as e.g.
+    // "{} (f32[42, 0, 10])".  The alternative, calling Literal::ToString(),
+    // enumerates all of its empty dimensions (e.g.  "{ { {}, {} }, ..."), which
+    // is just noise.
+    if (ShapeUtil::HasZeroElements(shape)) {
+      return Printf("{} (%s)", ShapeUtil::HumanString(constant->shape()));
+    }
+
     // Print the literal value of constants with <= K elements.
     optional<int64> elem_count;
     if (!ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape)) {
@@ -797,7 +823,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Otherwise, print e.g. "%constant.42 (s32[100])".
     string constant_name;
-    if (tensorflow::StringPiece(constant->name()).starts_with("constant")) {
+    if (tensorflow::str_util::StartsWith(constant->name(), "constant")) {
       constant_name = constant->name();
     } else {
       constant_name = StrCat("constant ", constant->name());
@@ -883,6 +909,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kConvert:
     case HloOpcode::kCos:
@@ -1015,8 +1042,8 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
 
   // The HLO instruction name contains usually the opcode, e.g. "%add.42" is
   // an add instruction.  In this case we render just the name.
-  if (tensorflow::StringPiece(instr->name())
-          .starts_with(HloOpcodeString(instr->opcode()))) {
+  if (tensorflow::str_util::StartsWith(instr->name(),
+                                       HloOpcodeString(instr->opcode()))) {
     return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
   }
   string extended_opcode =
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index 1f00aa41dc783f9e5657f5fa654884a31fae0fe7..b589cd573d82930adf9c37ebfca3328f7d866e1e 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -47,7 +48,9 @@ class DotRenderer : public hlo_graph_dumper::GraphRendererInterface {
 
 XLA_REGISTER_GRAPH_RENDERER(DotRenderer);
 
-TEST(HloGraphDumperTest, NestedFusion) {
+class HloGraphDumperTest : public HloTestBase {};
+
+TEST_F(HloGraphDumperTest, NestedFusion) {
   HloComputation::Builder b("b");
 
   // Build param0 + param1 + param2 + param3 + param4.
@@ -64,10 +67,9 @@ TEST(HloGraphDumperTest, NestedFusion) {
     sums.push_back(b.AddInstruction(HloInstruction::CreateBinary(
         shape, HloOpcode::kAdd, sums[i], params[i + 2])));
   }
-
-  HloModule m(TestName());
-  m.AddEntryComputation(b.Build());
-  HloComputation* root_computation = m.entry_computation();
+  auto m = CreateNewModule();
+  m->AddEntryComputation(b.Build());
+  HloComputation* root_computation = m->entry_computation();
 
   // Fuse into fusion(param0 + param1 + param2 + param3 + param4).
   auto* outer_fusion = root_computation->CreateFusionInstruction(
@@ -117,13 +119,13 @@ TEST(HloGraphDumperTest, NestedFusion) {
       HasSubstr(inner_sum->name()));
 }
 
-TEST(HloGraphDumperTest, Constant) {
+TEST_F(HloGraphDumperTest, Constant) {
   HloComputation::Builder b("b");
   auto instruction = b.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(-42)));
   instruction->set_name("i_am_a_constant_root_instruction");
-  HloModule m(TestName());
-  HloComputation* root_computation = m.AddEntryComputation(b.Build());
+  auto m = CreateNewModule();
+  HloComputation* root_computation = m->AddEntryComputation(b.Build());
   string graph = hlo_graph_dumper::DumpGraph(
       *root_computation, /*label=*/"an_empty_graph", DebugOptions());
   EXPECT_THAT(graph, HasSubstr("an_empty_graph"));
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index b7dd055d7cd78eb759a2b24bcbbbc948159f9425..a714d0e114245021c28da26beae444dbd3d99bb5 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -51,24 +52,22 @@ using ::tensorflow::strings::StrCat;
 /* static */
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     HloModule* module, const HloInstructionProto& proto,
-    const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-    const std::function<void(std::unique_ptr<HloComputation>)>&
-        add_fused_computation) {
+    const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
+    const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
   TF_RET_CHECK(!proto.opcode().empty());
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
   TF_RET_CHECK(proto.has_shape());
 
   auto instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
-  for (const string& operand_name : proto.operand_names()) {
-    TF_RET_CHECK(ContainsKey(instruction_map, operand_name))
-        << "No instruction named " << operand_name;
-    instruction->AppendOperand(instruction_map.at(operand_name));
-  }
-  for (const string& predecessor_name : proto.control_predecessor_names()) {
-    TF_RET_CHECK(ContainsKey(instruction_map, predecessor_name))
-        << "No instruction named " << predecessor_name;
-    TF_RETURN_IF_ERROR(instruction_map.at(predecessor_name)
+  for (const int64 operand_id : proto.operand_ids()) {
+    TF_RET_CHECK(ContainsKey(instruction_map, operand_id))
+        << "No instruction with id " << operand_id;
+    instruction->AppendOperand(instruction_map.at(operand_id));
+  }
+  for (const int64 predecessor_id : proto.control_predecessor_ids()) {
+    TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id))
+        << "No instruction with id " << predecessor_id;
+    TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
                            ->AddControlDependencyTo(instruction.get()));
   }
 
@@ -76,26 +75,36 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   // HloInstructionProto and do not appear as an HloComputationProto within the
   // HloModuleProto.
   if (instruction->opcode() == HloOpcode::kFusion) {
-    TF_RET_CHECK(proto.has_fused_instructions_computation());
     TF_RET_CHECK(!proto.fusion_kind().empty());
     TF_ASSIGN_OR_RETURN(instruction->fusion_kind_,
                         StringToFusionKind(proto.fusion_kind()));
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> fused_computation,
-                        HloComputation::CreateFromProto(
-                            module, proto.fused_instructions_computation(),
-                            computation_map, add_fused_computation,
-                            /*fusion_instruction=*/instruction.get()));
-    instruction->called_computations_.push_back(fused_computation.get());
-    add_fused_computation(std::move(fused_computation));
+
+    // Find the fused computation and set its fusion instruction.
+    TF_RET_CHECK(proto.called_computation_ids_size() == 1)
+        << "Expect 1 called computation for fusion instruction, but sees "
+        << proto.called_computation_ids_size();
+    const int64 fusion_id = proto.called_computation_ids(0);
+    auto* fused_computation = FindPtrOrNull(computation_map, fusion_id);
+    TF_RET_CHECK(fused_computation != nullptr)
+        << "No fusion computation with id " << fusion_id;
+    fused_computation->SetFusionInstruction(instruction.get());
+    instruction->called_computations_.push_back(fused_computation);
   } else {
-    for (const string& computation_name : proto.called_computation_names()) {
-      TF_RET_CHECK(ContainsKey(computation_map, computation_name))
-          << "No computation named " << computation_name;
+    for (const int64 computation_id : proto.called_computation_ids()) {
+      TF_RET_CHECK(ContainsKey(computation_map, computation_id))
+          << "No computation with id " << computation_id;
       instruction->called_computations_.push_back(
-          computation_map.at(computation_name));
+          computation_map.at(computation_id));
     }
   }
 
+  if (instruction->opcode() == HloOpcode::kTrace) {
+    TF_RET_CHECK(instruction->operands().size() == 1)
+        << "Trace instruction should have 1 operand but sees "
+        << instruction->operands().size();
+    instruction->mutable_operand(0)->set_tracing(instruction.get());
+  }
+
   TF_RET_CHECK(!proto.name().empty());
   instruction->name_ = proto.name();
 
@@ -150,6 +159,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->fft_length_.push_back(fft_len);
   }
 
+  if (proto.has_sharding()) {
+    TF_ASSIGN_OR_RETURN(const auto& sharding,
+                        HloSharding::FromProto(proto.sharding()));
+    instruction->set_sharding(sharding);
+  }
+
+  if (proto.has_gather_dimension_numbers()) {
+    instruction->gather_dimension_numbers_ =
+        MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
+  }
+  for (int64 bound : proto.gather_window_bounds()) {
+    instruction->gather_window_bounds_.push_back(bound);
+  }
+
+  instruction->channel_name_ = proto.channel_name();
+  instruction->cost_estimate_ns_ = proto.cost_estimate_ns();
+
   return std::move(instruction);
 }
 
@@ -168,6 +194,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
   instruction->operands_.push_back(operand);
   instruction->literal_ = Literal::CreateR1U8(tag);
+  operand->set_tracing(instruction.get());
   return instruction;
 }
 
@@ -182,6 +209,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateGetTupleElement(const Shape& shape,
                                       HloInstruction* operand, int64 index) {
+  CHECK(ShapeUtil::IsTuple(operand->shape()));
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kGetTupleElement, shape));
   instruction->tuple_index_ = index;
@@ -226,6 +254,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kCeil:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
+    case HloOpcode::kClz:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
     case HloOpcode::kImag:
@@ -801,6 +830,16 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
   return instruction;
 }
 
+void HloInstruction::SetupDerivedInstruction(
+    HloInstruction* derived_instruction) const {
+  if (sharding_ != nullptr) {
+    derived_instruction->set_sharding(*sharding_);
+  } else {
+    derived_instruction->clear_sharding();
+  }
+  derived_instruction->set_metadata(metadata_);
+}
+
 HloInstruction* HloInstruction::AddFusionOperand(HloInstruction* new_operand) {
   CHECK_EQ(opcode(), HloOpcode::kFusion);
   CHECK_EQ(operand_count(),
@@ -1172,7 +1211,8 @@ bool HloInstruction::HasSideEffect() const {
 /* static */ GatherDimensionNumbers HloInstruction::MakeGatherDimNumbers(
     tensorflow::gtl::ArraySlice<int64> output_window_dims,
     tensorflow::gtl::ArraySlice<int64> elided_window_dims,
-    tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims) {
+    tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims,
+    int64 index_vector_dim) {
   GatherDimensionNumbers gather_dim_numbers;
   for (int64 output_window_dim : output_window_dims) {
     gather_dim_numbers.add_output_window_dims(output_window_dim);
@@ -1184,6 +1224,7 @@ bool HloInstruction::HasSideEffect() const {
     gather_dim_numbers.add_gather_dims_to_operand_dims(gather_dim_to_input_dim);
   }
 
+  gather_dim_numbers.set_index_vector_dim(index_vector_dim);
   return gather_dim_numbers;
 }
 
@@ -1208,6 +1249,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -1437,10 +1479,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
-  clone->set_metadata(metadata_);
-  if (has_sharding()) {
-    clone->set_sharding(sharding());
-  }
+  SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
   return clone;
 }
@@ -1639,14 +1678,35 @@ Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
 }
 
 Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) {
-  auto succ_it = std::find(control_successors_.begin(),
-                           control_successors_.end(), instruction);
-  TF_RET_CHECK(succ_it != control_successors_.end());
-  control_successors_.erase(succ_it);
-  auto pred_it = std::find(instruction->control_predecessors_.begin(),
-                           instruction->control_predecessors_.end(), this);
-  TF_RET_CHECK(pred_it != instruction->control_predecessors_.end());
-  instruction->control_predecessors_.erase(pred_it);
+  TF_RET_CHECK(instruction->parent() == parent());
+  TF_RETURN_IF_ERROR(EraseElementFromVector(&control_successors_, instruction));
+  TF_RETURN_IF_ERROR(
+      EraseElementFromVector(&instruction->control_predecessors_, this));
+  return Status::OK();
+}
+
+Status HloInstruction::DropAllControlDeps() {
+  for (auto* ctrl_succ : control_successors_) {
+    TF_RETURN_IF_ERROR(
+        EraseElementFromVector(&ctrl_succ->control_predecessors_, this));
+  }
+  for (auto* ctrl_pred : control_predecessors_) {
+    TF_RETURN_IF_ERROR(
+        EraseElementFromVector(&ctrl_pred->control_successors_, this));
+  }
+  control_successors_.clear();
+  control_predecessors_.clear();
+  return Status::OK();
+}
+
+Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) {
+  for (auto* ctrl_pred : inst->control_predecessors()) {
+    TF_RETURN_IF_ERROR(ctrl_pred->AddControlDependencyTo(this));
+  }
+
+  for (auto* ctrl_succ : inst->control_successors()) {
+    TF_RETURN_IF_ERROR(this->AddControlDependencyTo(ctrl_succ));
+  }
 
   return Status::OK();
 }
@@ -1691,6 +1751,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kAdd:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
@@ -2310,14 +2371,18 @@ string HloInstruction::ToShortString() const {
 
 HloInstructionProto HloInstruction::ToProto() const {
   HloInstructionProto proto;
+  CHECK(unique_id_ != -1)
+      << "This instruction does not have a valid id. Please make sure the "
+         "instruction is inside a module before dumping it.";
+  proto.set_id(unique_id_);
   proto.set_name(name_);
   proto.set_opcode(HloOpcodeString(opcode_));
   *proto.mutable_shape() = shape_;
   for (const HloInstruction* operand : operands_) {
-    *proto.add_operand_names() = operand->name();
+    proto.add_operand_ids(operand->unique_id());
   }
   for (const HloInstruction* control : control_predecessors_) {
-    *proto.add_control_predecessor_names() = control->name();
+    proto.add_control_predecessor_ids(control->unique_id());
   }
 
   *proto.mutable_metadata() = metadata_;
@@ -2327,11 +2392,11 @@ HloInstructionProto HloInstruction::ToProto() const {
   proto.set_parameter_number(parameter_number_);
   if (opcode() == HloOpcode::kFusion) {
     proto.set_fusion_kind(xla::ToString(fusion_kind()));
-    *proto.mutable_fused_instructions_computation() =
-        fused_instructions_computation()->ToProto();
+    proto.add_called_computation_ids(
+        fused_instructions_computation()->unique_id());
   } else {
     for (const HloComputation* computation : called_computations_) {
-      *proto.add_called_computation_names() = computation->name();
+      proto.add_called_computation_ids(computation->unique_id());
     }
   }
 
@@ -2386,6 +2451,9 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
+  proto.set_channel_name(channel_name_);
+  proto.set_cost_estimate_ns(cost_estimate_ns_);
+
   return proto;
 }
 
@@ -2609,6 +2677,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleFloor(this);
     case HloOpcode::kCeil:
       return visitor->HandleCeil(this);
+    case HloOpcode::kClz:
+      return visitor->HandleClz(this);
     case HloOpcode::kLog:
       return visitor->HandleLog(this);
     case HloOpcode::kTanh:
@@ -2680,8 +2750,10 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
     case HloOpcode::kTrace:
       break;
   }
-  return Unimplemented("unhandled HloOpcode for DfsHloVisitor: %s",
-                       HloOpcodeString(opcode_).c_str());
+  return InternalError(
+      "Unhandled HloOpcode for DfsHloVisitor: %s. This should not happen - "
+      "please file a bug for XLA.",
+      HloOpcodeString(opcode_).c_str());
 }
 
 // Explicit instantiations.
@@ -2948,6 +3020,7 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
@@ -3369,9 +3442,12 @@ string HloInstruction::GatherDimensionNumbersToString() const {
   string gather_dims_to_operand_dims = StrCat(
       "gather_dims_to_operand_dims={",
       Join(gather_dimension_numbers_->gather_dims_to_operand_dims(), ","), "}");
+  string index_vector_dim = StrCat(
+      "index_vector_dim=", gather_dimension_numbers_->index_vector_dim());
 
   return Join<std::initializer_list<string>>(
-      {output_window_dims, elided_window_dims, gather_dims_to_operand_dims},
+      {output_window_dims, elided_window_dims, gather_dims_to_operand_dims,
+       index_vector_dim},
       ", ");
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c4fe132d1d52d6071914869cd50a035ace3389b2..a5e9aecb9e7f5204b53186abca78033215a75828 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -179,20 +179,15 @@ class HloInstruction {
   //   module: the module which will contain the instruction. The newly created
   //     instruction is *not* added to the module or any computation, however.
   //   proto: the proto to convert from.
-  //   instruction_map: a map from instruction name to HloInstruction*. This map
+  //   instruction_map: a map from instruction id to HloInstruction*. This map
   //     must contain all operands of the newly constructed instruction.
-  //   computation_map: a map from computation name to HloComputation*. This map
+  //   computation_map: a map from computation id to HloComputation*. This map
   //     must contain all computations which the newly constructed instruction
   //     calls.
-  //   add_fused_computation: A function to call to add a fused
-  //     computation. Used (clearly) when the instruction is a fusion
-  //     instruction.
   static StatusOr<std::unique_ptr<HloInstruction>> CreateFromProto(
       HloModule* module, const HloInstructionProto& proto,
-      const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-      const std::function<void(std::unique_ptr<HloComputation>)>&
-          add_fused_computation);
+      const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
+      const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
 
   // Creates a parameter-retrieving instruction.
   static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
@@ -502,7 +497,8 @@ class HloInstruction {
   static GatherDimensionNumbers MakeGatherDimNumbers(
       tensorflow::gtl::ArraySlice<int64> output_window_dims,
       tensorflow::gtl::ArraySlice<int64> elided_window_dims,
-      tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims);
+      tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims,
+      int64 index_vector_dim);
 
   // Returns the opcode for this instruction.
   HloOpcode opcode() const { return opcode_; }
@@ -561,6 +557,18 @@ class HloInstruction {
   // 'instruction'.
   Status RemoveControlDependencyTo(HloInstruction* instruction);
 
+  // Drops all control predecessors and successors from this HLO instruction.
+  Status DropAllControlDeps();
+
+  // Copies the control predecessors and successors on this HLO instruction to
+  // `inst`.  Does not do a deep copy so this makes sense only if `inst` and
+  // this HLO are in the same module.
+  //
+  // Depending on the use cases we see in practice, in the future we may
+  // consider folding the logic here into Clone, CloneWithNewOperands and
+  // ReplaceAllUsesWith by treating control dependencies like data dependencies.
+  Status CopyAllControlDepsFrom(const HloInstruction* inst);
+
   // Returns the set of control predecessors (successors) of this
   // instruction. Control predecessors (successors) must execute before (after)
   // the current instruction.
@@ -824,6 +832,12 @@ class HloInstruction {
   // Precondition: opcode() == HloOpcode::kSend or HloOpcode::kRecv
   int64 channel_id() const { return channel_id_; }
 
+  // Returns the channel name associated with the instruction. The name is
+  // used to identify host Send/Recv operations.
+  //
+  // Precondition: opcode() == HloOpcode::kHostCompute
+  string channel_name() const { return channel_name_; }
+
   // Returns feature_index field associated with the instruction. The index
   // represents the index of the feature dimension.
   //
@@ -926,6 +940,13 @@ class HloInstruction {
   const HloSharding& sharding_or_default(const HloSharding& default_) const {
     return sharding_ ? *sharding_ : default_;
   }
+  // Returns the sharding unique device, if any.
+  tensorflow::gtl::optional<int64> sharding_unique_device() const {
+    if (sharding_ == nullptr || !sharding_->HasUniqueDevice()) {
+      return tensorflow::gtl::optional<int64>();
+    }
+    return sharding_->UniqueDevice().ValueOrDie();
+  }
   // Sets the sharding of this operator. Should only be called by HloModule or
   // HloComputation methods.
   void set_sharding(const HloSharding& sharding) {
@@ -936,6 +957,13 @@ class HloInstruction {
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
 
+  // When creating a new instruction which either replaces, or shifts up (kCopy
+  // insertion case), another instruction, we need to make sure the certain
+  // properties of the new instruction are copied into the derived one. As of
+  // today, the metadata and sharding will be propagated to the derived
+  // instruction.
+  void SetupDerivedInstruction(HloInstruction* derived_instruction) const;
+
   // Adds a new operand the fusion instruction.
   HloInstruction* AddFusionOperand(HloInstruction* new_operand);
 
@@ -1132,17 +1160,17 @@ class HloInstruction {
   // Clones the HLO instruction. The clone will have the same opcode, shape, and
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
-  // the instruction to form the name of the cloned instruction.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
+  // the instruction to form the name of the cloned instruction.  If the module
+  // pointer is not nullptr, it will be the module where the cloned computations
+  // will be added to (in order to support deep cloning).  Ignores the control
+  // predecessors and successors of this HLO instruction.
   std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone",
                                         HloModule* module = nullptr) const;
 
-  // Clones the HLO instruction as above but with new shape and operands.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
+  // Clones the HLO instruction as above but with new shape and operands.  If
+  // the module pointer is not nullptr, it will be the module where the cloned
+  // computations will be added to (in order to support deep cloning).  Ignores
+  // the control predecessors and successors of this HLO instruction.
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloModule* module = nullptr) const;
@@ -1433,7 +1461,7 @@ class HloInstruction {
   string channel_name_;
 
   // Estimate of the duration of a host computation in nanoseconds.
-  int64 cost_estimate_ns_;
+  int64 cost_estimate_ns_ = 0;
 
   // Computations called by this instruction.
   std::vector<HloComputation*> called_computations_;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 32d3ed272bd6b239918076999ecae6c1b3ded2fd..5b65b1152c8298a8954890374626ae5329dccff9 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -149,8 +149,8 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32_, "bar"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(add->operands(), UnorderedElementsAre(foo, bar));
   EXPECT_THAT(foo->users(), UnorderedElementsAre(add));
@@ -186,8 +186,8 @@ TEST_F(HloInstructionTest, MultipleUsers) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, foo->user_count());
   EXPECT_EQ(1, bar->user_count());
@@ -219,8 +219,8 @@ TEST_F(HloInstructionTest, RepeatedUser) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "foo"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(1, foo->user_count());
 
@@ -254,8 +254,8 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c0, param1));
   auto addtotal = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
   ASSERT_IS_OK(addtotal->Accept(&visitor));
@@ -303,8 +303,8 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright));
   auto neg2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, addtotal));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
   ASSERT_IS_OK(neg2->Accept(&visitor));
@@ -325,7 +325,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
   //
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   Shape f32a100x10 = ShapeUtil::MakeShape(F32, {100, 10});
-  HloModule module(TestName());
+  auto module = CreateNewModule();
 
   // Builds an x+1.0 computation to use in a Map.
   auto embedded_builder = HloComputation::Builder("f32+1");
@@ -335,7 +335,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   embedded_builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, value));
-  auto add_f32 = module.AddEmbeddedComputation(embedded_builder.Build());
+  auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build());
 
   // Builds a parameter and feeds it to the map.
   HloComputation::Builder builder(TestName());
@@ -343,7 +343,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
       HloInstruction::CreateParameter(0, f32a100x10, ""));
   auto map = builder.AddInstruction(
       HloInstruction::CreateMap(f32a100x10, {param0}, add_f32));
-  module.AddEntryComputation(builder.Build());
+  module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
   ASSERT_IS_OK(map->Accept(&visitor));
@@ -373,8 +373,8 @@ TEST_F(HloInstructionTest, TrivialReduce) {
       HloInstruction::CreateParameter(1, r0f32, "y"));
   embedded_builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, paramx, paramy));
-  HloModule module(TestName());
-  auto add_f32 = module.AddEmbeddedComputation(embedded_builder.Build());
+  auto module = CreateNewModule();
+  auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build());
 
   // Builds a parameter and an initial value and feeds them to the reduce.
   HloComputation::Builder builder(TestName());
@@ -387,7 +387,7 @@ TEST_F(HloInstructionTest, TrivialReduce) {
   auto reduce = builder.AddInstruction(
       HloInstruction::CreateReduce(f32v100, param0, const0,
                                    /*dimensions_to_reduce=*/{1}, add_f32));
-  module.AddEntryComputation(builder.Build());
+  module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
   ASSERT_IS_OK(reduce->Accept(&visitor));
@@ -414,8 +414,8 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       add_foobar, add_foofoo));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
   EXPECT_EQ(1, bar->user_count());
@@ -449,8 +449,8 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) {
       builder.AddInstruction(HloInstruction::CreateTuple({foo, bar, baz, foo}));
   auto add_foobar = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
   EXPECT_THAT(foo->users(), UnorderedElementsAre(tuple, add_foobar));
@@ -477,8 +477,8 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
   EXPECT_THAT(foo->users(), UnorderedElementsAre(exp, log));
@@ -514,8 +514,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       add_foobar, add_foofoo));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
   EXPECT_EQ(1, bar->user_count());
@@ -544,8 +544,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) {
   auto exp = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({foo, bar}));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, foo->user_count());
   EXPECT_EQ(2, bar->user_count());
@@ -609,8 +609,8 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, exp, log));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   NodeCollectorAndPostProcessor visitor;
   ASSERT_IS_OK(add->Accept(&visitor));
@@ -627,8 +627,8 @@ TEST_F(HloInstructionTest, SingletonFusionOp) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
   auto exp = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant));
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp}, HloInstruction::FusionKind::kLoop);
 
@@ -645,8 +645,8 @@ TEST_F(HloInstructionTest, BinaryFusionOp) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(42.1f)));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant2));
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add}, HloInstruction::FusionKind::kLoop);
 
@@ -667,8 +667,8 @@ TEST_F(HloInstructionTest, ChainFusionOp) {
   auto exp3 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2));
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop);
 
@@ -690,8 +690,8 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) {
   exp1->set_metadata(metadata);
   exp2->set_metadata(metadata);
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp2, exp1}, HloInstruction::FusionKind::kLoop);
 
@@ -746,13 +746,13 @@ TEST_F(HloInstructionTest, PreserveTupleShapeThroughClone) {
 TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
   // Create a fusion instruction containing a single unary operation.
   const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  HloModule module(TestName());
+  auto module = CreateNewModule();
 
   auto make_map_computation = [&]() {
     auto builder = HloComputation::Builder("FusionMap");
     builder.AddInstruction(
         HloInstruction::CreateParameter(0, scalar_shape, "param"));
-    return module.AddEmbeddedComputation(builder.Build());
+    return module->AddEmbeddedComputation(builder.Build());
   };
 
   HloComputation* computation_x = make_map_computation();
@@ -767,7 +767,7 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
       scalar_shape, {map_1_x}, computation_x, /*static_operands=*/{}));
   auto map_3_y = builder.AddInstruction(HloInstruction::CreateMap(
       scalar_shape, {map_2_x}, computation_y, /*static_operands=*/{}));
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* computation = module->AddEntryComputation(builder.Build());
 
   auto* fusion = computation->CreateFusionInstruction(
       {map_3_y}, HloInstruction::FusionKind::kLoop);
@@ -814,8 +814,8 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1}));
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
 
@@ -940,8 +940,8 @@ TEST_F(HloInstructionTest, FunctionVisitor) {
       HloInstruction::CreateUnary(f32, HloOpcode::kExp, param));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32, HloOpcode::kAdd, negate, exp));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   int visit_num = 0;
   std::unordered_map<HloInstruction*, int> visit_order;
@@ -969,8 +969,8 @@ TEST_F(HloInstructionTest, FullyElementwise) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, x, y));
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_TRUE(add->IsElementwise());
   for (int i = 0; i < add->operand_count(); ++i) {
@@ -1013,8 +1013,8 @@ TEST_F(HloInstructionTest, PartiallyElementwise) {
   HloInstruction* max = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast));
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop);
   EXPECT_FALSE(fusion->IsElementwise());
@@ -1056,8 +1056,8 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) {
   HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
       r1f32, HloOpcode::kSubtract, min, broadcast));
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {sub, broadcast, min}, HloInstruction::FusionKind::kLoop);
   EXPECT_FALSE(fusion->IsElementwise());
@@ -1099,8 +1099,8 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
   HloInstruction* dot = builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
 
@@ -1118,7 +1118,7 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
 }
 
 TEST_F(HloInstructionTest, FusionEquality) {
-  HloModule module(TestName());
+  auto module = CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create two fusion instructions containing a single unary operation.
@@ -1128,7 +1128,7 @@ TEST_F(HloInstructionTest, FusionEquality) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, parameter));
   auto neg = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, parameter));
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp}, HloInstruction::FusionKind::kLoop);
   auto* fusion2 = computation->CreateFusionInstruction(
@@ -1140,7 +1140,7 @@ TEST_F(HloInstructionTest, FusionEquality) {
 }
 
 TEST_F(HloInstructionTest, NestedFusionEquality) {
-  HloModule module(TestName());
+  auto module = CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Build a nested fusion computation.
@@ -1166,7 +1166,7 @@ TEST_F(HloInstructionTest, NestedFusionEquality) {
       data_shape, HloOpcode::kSubtract, dot, add_operand));
   builder.AddInstruction(
       HloInstruction::CreateBinary(data_shape, HloOpcode::kMultiply, add, sub));
-  auto computation = module.AddEntryComputation(builder.Build());
+  auto computation = module->AddEntryComputation(builder.Build());
 
   auto nested_fusion = computation->CreateFusionInstruction(
       {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
@@ -1244,8 +1244,8 @@ TEST_F(HloInstructionTest, Stringification) {
             "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} "
             "%transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}");
 
-  HloModule module(TestName());
-  auto* computation = module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
 
@@ -1271,7 +1271,7 @@ TEST_F(HloInstructionTest, Stringification) {
             "true_computation=%TransposeDot, false_computation=%TransposeDot");
 }
 
-TEST_F(HloInstructionTest, StringifyGather) {
+TEST_F(HloInstructionTest, StringifyGather_0) {
   Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
   Shape gather_indices_tensor_shape =
       ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 5});
@@ -1291,11 +1291,12 @@ TEST_F(HloInstructionTest, StringifyGather) {
           HloInstruction::MakeGatherDimNumbers(
               /*output_window_dims=*/{4, 5, 6, 7, 8},
               /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*index_vector_dim=*/4),
           /*window_bounds=*/{30, 29, 28, 27, 26}));
 
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gather_instruction->ToString(),
             "%gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} "
@@ -1303,7 +1304,43 @@ TEST_F(HloInstructionTest, StringifyGather) {
             "s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), "
             "output_window_dims={4,5,6,7,8}, elided_window_dims={}, "
             "gather_dims_to_operand_dims={0,1,2,3,4}, "
-            "window_bounds={30,29,28,27,26}");
+            "index_vector_dim=4, window_bounds={30,29,28,27,26}");
+}
+
+TEST_F(HloInstructionTest, StringifyGather_1) {
+  Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
+  Shape gather_indices_tensor_shape =
+      ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6});
+  Shape gather_result_shape =
+      ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26});
+
+  HloComputation::Builder builder("Gather");
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor"));
+  HloInstruction* gather_indices =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, gather_indices_tensor_shape, "gather_indices"));
+
+  HloInstruction* gather_instruction =
+      builder.AddInstruction(HloInstruction::CreateGather(
+          gather_result_shape, input, gather_indices,
+          HloInstruction::MakeGatherDimNumbers(
+              /*output_window_dims=*/{4, 5, 6, 7, 8},
+              /*elided_window_dims=*/{},
+              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*index_vector_dim=*/2),
+          /*window_bounds=*/{30, 29, 28, 27, 26}));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(gather_instruction->ToString(),
+            "%gather = f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} "
+            "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, "
+            "s64[10,9,5,7,6]{4,3,2,1,0} %gather_indices), "
+            "output_window_dims={4,5,6,7,8}, elided_window_dims={}, "
+            "gather_dims_to_operand_dims={0,1,2,3,4}, "
+            "index_vector_dim=2, window_bounds={30,29,28,27,26}");
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index bc74c4bc10cad20eab20b5caf8550b17048a5276..69deac263ee58f9e4d46987a54f09b11d650950a 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -132,6 +132,69 @@ bool HloCustomCallMatcher::MatchAndExplain(
   return result;
 }
 
+bool HloShapeMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (ShapeUtil::Compatible(instruction->shape(), shape_)) {
+    return true;
+  }
+  *listener << instruction->ToString() << " has incorrect shape (expected: "
+            << ShapeUtil::HumanString(shape_) << ")";
+  return false;
+}
+
+void HloShapeMatcher::DescribeTo(std::ostream* os) const {
+  *os << ShapeUtil::HumanString(shape_);
+}
+
+bool HloShapeAndLayoutMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (ShapeUtil::Equal(instruction->shape(), shape_)) {
+    return true;
+  }
+  *listener << instruction->ToString() << " has incorrect shape (expected: "
+            << ShapeUtil::HumanStringWithLayout(shape_) << ")";
+  return false;
+}
+
+void HloShapeAndLayoutMatcher::DescribeTo(std::ostream* os) const {
+  *os << ShapeUtil::HumanStringWithLayout(shape_);
+}
+
+bool HloShardingMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!sharding_.has_value()) {
+    if (!instruction->has_sharding()) {
+      return true;
+    }
+    *listener << instruction->ToString() << " expected to have no sharding.";
+    return false;
+  }
+  if (instruction->has_sharding()) {
+    if (instruction->sharding() == sharding_.value()) {
+      return true;
+    }
+    *listener << instruction->ToString()
+              << " has incorrect sharding (expected: " << sharding_->ToString()
+              << ")";
+    return false;
+  } else {
+    *listener << instruction->ToString()
+              << " has no sharding (expected: " << sharding_->ToString() << ")";
+    return false;
+  }
+}
+
+void HloShardingMatcher::DescribeTo(std::ostream* os) const {
+  if (sharding_.has_value()) {
+    *os << sharding_->ToString();
+  } else {
+    *os << "<no-sharding>";
+  }
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 103f04a2cb7a1a5ae877d8bf259692f7cbed3408..5175736a2506c85836577a7f2ba2359a3d5a6b18 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 namespace testing {
@@ -86,6 +87,50 @@ class HloCustomCallMatcher : public HloMatcher {
   ::testing::Matcher<string> call_target_matcher_;
 };
 
+class HloShapeMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShapeMatcher(const Shape& shape) : shape_(shape) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  Shape shape_;
+};
+
+class HloShapeAndLayoutMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShapeAndLayoutMatcher(const Shape& shape) : shape_(shape) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  Shape shape_;
+};
+
+// Verify the sharding of an instruction against the provided HloSharding. If a
+// nullopt is provided for the expected sharding then it checks that no sharding
+// is present for an instruction.
+class HloShardingMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShardingMatcher(
+      const tensorflow::gtl::optional<HloSharding>& sharding)
+      : sharding_(sharding) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  tensorflow::gtl::optional<HloSharding> sharding_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -231,6 +276,40 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
       new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
 }
 
+// Verifies the shape or the shape and the layout of an HLO instruction against
+// the provided shape object.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
+    const class Shape& shape) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
+    tensorflow::StringPiece shape) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(
+      ShapeUtil::ParseShapeString(shape).ValueOrDie()));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
+    const class Shape& shape) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShapeAndLayoutMatcher(shape));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
+    tensorflow::StringPiece shape) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloShapeAndLayoutMatcher(
+      ShapeUtil::ParseShapeString(shape).ValueOrDie()));
+}
+
+// Verifies the value of the HloSharing against the provided sharding object.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
+    const HloSharding& sharding) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShardingMatcher(sharding));
+}
+// Verifies that no HloSharding is set for an HLO instruction.
+inline ::testing::Matcher<const ::xla::HloInstruction*> NoSharding() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1c21703a45e11914854153bc14fabd85e9ea57f2..f2463060b7cd653dffb408f8df17f44fe0c1a97c 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -100,5 +100,70 @@ TEST(HloMatchersTest, CustomCallMatcher) {
               R"(custom-call with call target that is equal to "foo_target")");
 }
 
+TEST(HloMatchersTest, ShapeMatcher) {
+  auto p0 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}), "param");
+
+  EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {5, 7})));
+  EXPECT_THAT(p0.get(), op::Shape("f32[5,7]"));
+  EXPECT_THAT(
+      p0.get(),
+      ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {5, 7}))));
+  EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout("f32[5,7]")));
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))));
+  EXPECT_THAT(p0.get(), ::testing::Not(op::Shape("f32[7,5]")));
+  EXPECT_THAT(
+      p0.get(),
+      ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {7, 5}))));
+  EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout("f32[7,5]")));
+  EXPECT_THAT(p0.get(),
+              op::Shape(ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1})));
+  EXPECT_THAT(p0.get(), op::Shape("f32[5,7]{0,1}"));
+  EXPECT_THAT(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout(
+                            F32, {5, 7}, {0, 1})));
+  EXPECT_THAT(p0.get(), op::ShapeWithLayout("f32[5,7]{0,1}"));
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::ShapeWithLayout(
+                  ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {1, 0}))));
+  EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout("f32[5,7]{1,0}")));
+
+  EXPECT_THAT(Explain(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))),
+              "%param = f32[5,7]{0,1} parameter(0) has incorrect shape "
+              "(expected: f32[7,5])");
+  EXPECT_THAT(
+      Explain(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout(
+                            F32, {7, 5}, {1, 0}))),
+      "%param = f32[5,7]{0,1} parameter(0) has incorrect shape "
+      "(expected: f32[7,5]{1,0})");
+}
+
+TEST(HloMatchersTest, ShardingMatcher) {
+  auto p0 = HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {5}),
+                                            "param.0");
+  p0->clear_sharding();
+  auto p1 = HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {7}),
+                                            "param.1");
+  p1->set_sharding(HloSharding::AssignDevice(1));
+
+  EXPECT_THAT(p0.get(), op::NoSharding());
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::Sharding(HloSharding::AssignDevice(1))));
+  EXPECT_THAT(p1.get(), ::testing::Not(op::NoSharding()));
+  EXPECT_THAT(p1.get(),
+              ::testing::Not(op::Sharding(HloSharding::AssignDevice(0))));
+  EXPECT_THAT(p1.get(), op::Sharding(HloSharding::AssignDevice(1)));
+
+  EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))),
+              "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: "
+              "{maximal device=1})");
+  EXPECT_THAT(Explain(p1.get(), op::NoSharding()),
+              "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} "
+              "expected to have no sharding.");
+  EXPECT_THAT(Explain(p1.get(), op::Sharding(HloSharding::AssignDevice(0))),
+              "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} "
+              "has incorrect sharding (expected: {maximal device=0})");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index cb2fe9f874012a51e1e6cbd1dd086dbb26994bde..d4bad16f7976fc6ddc70c1497ae4004407aba94c 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -41,9 +41,6 @@ HloModule::HloModule(const string& name,
       entry_computation_handle_(entry_computation_handle),
       unique_id_(next_unique_module_id_++) {}
 
-HloModule::HloModule(const string& name)
-    : name_(NameUniquer::GetSanitizedName(name)),
-      unique_id_(next_unique_module_id_++) {}
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
     : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
@@ -83,6 +80,11 @@ HloComputation* HloModule::AddComputationInternal(
   for (auto* instruction : computation->instructions()) {
     instruction->SetUniqueId(NewUniqueInstructionId());
   }
+  // Set unique id to this computation.
+  CHECK_NE(computation->root_instruction()->unique_id(), -1)
+      << "Root has no valid id: " << computation->ToString();
+  computation->SetUniqueId(computation->root_instruction()->unique_id());
+
   computation->set_parent(this);
   computations_.push_back(std::move(computation));
   return computations_.back().get();
@@ -204,90 +206,36 @@ string HloModule::ToString(const HloPrintOptions& options) const {
 
 HloModuleProto HloModule::ToProto() const {
   HloModuleProto proto;
+  proto.set_id(unique_id_);
   proto.set_name(name_);
   proto.set_entry_computation_name(entry_computation_->name());
+  proto.set_entry_computation_id(entry_computation_->unique_id());
   for (const HloComputation* computation : MakeComputationPostOrder()) {
-    // Fusion computations are added when the fusion instructions are created by
-    // HloInstruction::CreateFromProto.
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
     HloComputationProto computation_proto = computation->ToProto();
+    if (computation->name() == entry_computation_->name()) {
+      *proto.mutable_program_shape() = computation_proto.program_shape();
+    }
     proto.add_computations()->Swap(&computation_proto);
   }
   return proto;
 }
 
-namespace {
-
-// Construct a ProgramShape matching the shape of the parameters and root of the
-// given module's entry computation.
-StatusOr<ProgramShape> ProgramShapeFromProto(const HloModuleProto& module) {
-  const HloComputationProto* entry_computation = nullptr;
-  for (const HloComputationProto& computation : module.computations()) {
-    if (computation.name() == module.entry_computation_name()) {
-      entry_computation = &computation;
-      break;
-    }
-  }
-  TF_RET_CHECK(entry_computation != nullptr)
-      << "No computation with entry computation name"
-      << module.entry_computation_name();
-
-  tensorflow::gtl::FlatMap<int64, std::pair<string, const Shape*>> parameters;
-  const HloInstructionProto* root = nullptr;
-  for (const HloInstructionProto& instruction :
-       entry_computation->instructions()) {
-    if (instruction.name() == entry_computation->root_name()) {
-      TF_RET_CHECK(root == nullptr) << "Entry computation has more than "
-                                       "one instruction with (root) name "
-                                    << instruction.name();
-      root = &instruction;
-    }
-    if (instruction.opcode() == HloOpcodeString(HloOpcode::kParameter)) {
-      TF_RET_CHECK(!ContainsKey(parameters, instruction.parameter_number()))
-          << "Entry computation has more than one parameter instruction "
-             "with parameter number "
-          << instruction.parameter_number();
-      parameters[instruction.parameter_number()] = {instruction.name(),
-                                                    &instruction.shape()};
-    }
-  }
-  TF_RET_CHECK(root != nullptr)
-      << "Entry computation is missing root instruction named "
-      << entry_computation->root_name();
-
-  ProgramShape program_shape;
-  *program_shape.mutable_result() = root->shape();
-  for (int64 i = 0; i < parameters.size(); ++i) {
-    TF_RET_CHECK(ContainsKey(parameters, i))
-        << "Entry computation missing parameter number " << i;
-    const string& name = parameters.at(i).first;
-    const Shape& shape = *parameters.at(i).second;
-    *program_shape.add_parameters() = shape;
-    program_shape.add_parameter_names(name);
-  }
-
-  return std::move(program_shape);
-}
-
-}  // namespace
-
 /* static */
 StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
     const HloModuleProto& proto, const HloModuleConfig& module_config,
     const VersionedComputationHandle& entry_computation_handle) {
   // The ProgramShape in the passed in module config must match the shapes of
   // the entry parameters and root.
-  TF_ASSIGN_OR_RETURN(ProgramShape expected_program_shape,
-                      ProgramShapeFromProto(proto));
+  TF_RET_CHECK(proto.has_program_shape())
+      << "No program shape found in the proto";
+  const auto& expected_program_shape = proto.program_shape();
   TF_RET_CHECK(expected_program_shape.parameters_size() ==
                module_config.entry_computation_layout().parameter_count());
   for (int i = 0; i < expected_program_shape.parameters_size(); ++i) {
     const Shape& parameter_shape =
         module_config.entry_computation_layout().parameter_layout(i).shape();
-    TF_RET_CHECK(
-        ShapeUtil::Equal(expected_program_shape.parameters(i), parameter_shape))
+    TF_RET_CHECK(ShapeUtil::Compatible(expected_program_shape.parameters(i),
+                                       parameter_shape))
         << "HloModuleConfig has different shape for parameter " << i
         << " than the HLO module. Expected: "
         << ShapeUtil::HumanStringWithLayout(
@@ -296,7 +244,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   }
   const Shape& result_shape =
       module_config.entry_computation_layout().result_layout().shape();
-  TF_RET_CHECK(ShapeUtil::Equal(expected_program_shape.result(), result_shape))
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(expected_program_shape.result(), result_shape))
       << "HloModuleConfig has different result shape than the HLO module. "
          "Expected: "
       << ShapeUtil::HumanStringWithLayout(expected_program_shape.result())
@@ -305,26 +254,20 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   auto module = MakeUnique<HloModule>(proto.name(), entry_computation_handle,
                                       module_config);
 
-  tensorflow::gtl::FlatMap<string, HloComputation*> computation_map;
+  tensorflow::gtl::FlatMap<int64, HloComputation*> computation_map;
   for (const HloComputationProto& computation_proto : proto.computations()) {
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloComputation> computation,
-        HloComputation::CreateFromProto(
-            module.get(), computation_proto, computation_map,
-            /*add_fused_computation=*/
-            [&module](std::unique_ptr<HloComputation> fused_computation) {
-              module->AddComputationInternal(std::move(fused_computation),
-                                             /*is_entry=*/false,
-                                             /*uniquify_names=*/false);
-            }));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> computation,
+                        HloComputation::CreateFromProto(
+                            module.get(), computation_proto, computation_map));
     CHECK_NE(computation.get(), nullptr);
-    TF_RET_CHECK(!ContainsKey(computation_map, computation->name()));
-    string computation_name = computation->name();
+    int64 computation_id = computation_proto.id();
+    TF_RET_CHECK(computation_id != -1);
+    TF_RET_CHECK(!ContainsKey(computation_map, computation_id));
     // Don't uniquify names because we want names to be stable across
     // serialization and deserialization.
-    computation_map[computation_name] = module->AddComputationInternal(
+    computation_map[computation_id] = module->AddComputationInternal(
         std::move(computation),
-        /*is_entry=*/proto.entry_computation_name() == computation_name,
+        /*is_entry=*/proto.entry_computation_id() == computation_id,
         /*uniquify_names=*/false);
   }
   TF_RET_CHECK(module->entry_computation_ != nullptr);
@@ -334,10 +277,6 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   tensorflow::gtl::FlatSet<string> computation_names;
   tensorflow::gtl::FlatSet<string> instruction_names;
   for (HloComputation* computation : module->computations()) {
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
-
     TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
         << "Computation name is not unique: " << computation->name();
     computation_names.insert(computation->name());
@@ -353,11 +292,13 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
 /* static */
 StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromProto(
-    const HloModuleProto& module) {
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      ProgramShapeFromProto(module));
+    const HloModuleProto& module, const DebugOptions& debug_options) {
+  TF_RET_CHECK(module.has_program_shape())
+      << "No program shape found in the proto";
+  const auto& program_shape = module.program_shape();
 
   HloModuleConfig module_config(program_shape);
+  module_config.set_debug_options(debug_options);
 
   // The module config is constructed with default layouts regardless of what is
   // passed in via the ProgramShape. Set the layouts to the appropriate values.
@@ -535,8 +476,7 @@ std::vector<HloComputation*> HloModule::MakeNonfusionComputations() const {
 
 std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
   VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n";
-  auto module = MakeUnique<HloModule>(name_ + "-" + suffix);
-  module->config_ = config_;
+  auto module = MakeUnique<HloModule>(name_ + "-" + suffix, config_);
   module->entry_computation_handle_ = entry_computation_handle_;
   module->has_entry_computation_handle_ = has_entry_computation_handle_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 06d92f94fd6f62162b22575e9cc341f2906cd0db..aa843ead51747902ce9536e80c2045e257b0dc2a 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -55,7 +55,6 @@ class HloModule {
   // only be used for HloModules used outside of the XLA service (eg
   // tests). The versioned handle is used by the service in the compilation
   // cache. A default configuration is created for this module.
-  explicit HloModule(const string& name);
   explicit HloModule(const string& name, const HloModuleConfig& config);
 
   // Adds an entry computation to the module. A module can only have one entry
@@ -103,7 +102,7 @@ class HloModule {
     return config_.mutable_entry_computation_layout();
   }
 
-  ComputationLayout entry_computation_layout() const {
+  const ComputationLayout& entry_computation_layout() const {
     return config_.entry_computation_layout();
   }
 
@@ -172,7 +171,7 @@ class HloModule {
   // Creates and returns an HloModuleConfig with an appropriate program shape
   // for the HLO module in the given proto.
   static StatusOr<HloModuleConfig> CreateModuleConfigFromProto(
-      const HloModuleProto& module);
+      const HloModuleProto& module, const DebugOptions& debug_options);
 
   // Outlines the given expression from the given computation.
   // instructions_to_outline contains the instructions that form the expression.
@@ -187,11 +186,6 @@ class HloModule {
   // Returns a randomly generated uint64.
   uint64 RandomNew64() const;
 
-  // Returns the unique name for a computation in this module.
-  string GetUniqueCompuationName(const string& prefix) {
-    return computation_name_uniquer_.GetUniqueName(prefix);
-  }
-
   // Returns the NameUniquer for uniquing instruction names in this module.
   NameUniquer& instruction_name_uniquer() { return instruction_name_uniquer_; }
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc
index 822e2f1f53e5ee460b88c2241ecf7f6b91ef608b..4205b0402cb8b2c31141d65be652cd84c22e7262 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_config.cc
@@ -40,7 +40,7 @@ void HloModuleConfig::SetDefaultComputationLayout(
 
 string HloModuleConfig::compilation_cache_key() const {
   string key =
-      tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled_);
+      tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled());
   StrAppend(&key, "::(");
   std::vector<string> params;
   for (const ShapeLayout& param_layout :
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index a5ee895e48448fbb8fa3879dc1b6764c1f9f6966..586a03d412681cacdd780f48e77baf4cd4c51415 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -63,9 +63,19 @@ class HloModuleConfig {
     return &(*entry_computation_layout_);
   }
 
-  // Sets/returns whether to enable HLO-level profiling.
-  bool hlo_profiling_enabled() const { return hlo_profiling_enabled_; }
-  void enable_hlo_profiling(bool enabled) { hlo_profiling_enabled_ = enabled; }
+  // Returns whether to enable HLO-level profiling.
+  bool hlo_profiling_enabled() const {
+    return debug_options_.xla_hlo_profile();
+  }
+
+  // Sets/returns whether this is a "host module".  Host modules are used to
+  // record the data- and control-flow dependencies of host side computation
+  // that communicates with compiled code.  They are used for analysis and
+  // scheduling purposes, but no code is generated.
+  bool is_host_module() const { return is_host_module_; }
+  void set_is_host_module(bool is_host_module) {
+    is_host_module_ = is_host_module;
+  }
 
   // Sets/returns the module seed set during execution.
   void set_seed(uint64 seed) { seed_ = seed; }
@@ -101,8 +111,8 @@ class HloModuleConfig {
 
   tensorflow::gtl::optional<ComputationLayout> entry_computation_layout_;
 
-  // Whether to enable HLO-level profiling.
-  bool hlo_profiling_enabled_ = false;
+  // Whether this is a 'host module'.
+  bool is_host_module_ = false;
 
   // Module/graph-level seed handle.
   uint64 seed_ = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54c34ce116651608e6d91cdcba9c708ca3a5f75e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -0,0 +1,371 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_module_group_metadata.h"
+
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+string HloModuleGroupMetadata::TrackedInstruction::ToString() const {
+  string repr =
+      (instruction_ != nullptr) ? instruction_->ToShortString() : "NULL";
+  switch (kind_) {
+    case ComputationKind::kInvalid:
+      repr += ":INVALID";
+      break;
+    case ComputationKind::kWhileCondition:
+      repr += ":WHILE_CONDITION";
+      break;
+    case ComputationKind::kWhileBody:
+      repr += ":WHILE_BODY";
+      break;
+    case ComputationKind::kConditionalTrue:
+      repr += ":CONDITIONAL_TRUE";
+      break;
+    case ComputationKind::kConditionalFalse:
+      repr += ":CONDITIONAL_FALSE";
+      break;
+  }
+  return repr;
+}
+
+/* static */ StatusOr<std::unique_ptr<HloModuleGroupMetadata>>
+HloModuleGroupMetadata::Build(const std::vector<HloModule*>& modules) {
+  auto metadata = absl::make_unique<HloModuleGroupMetadata>(modules);
+  TF_RETURN_IF_ERROR(metadata->Build());
+  return std::move(metadata);
+}
+
+Status HloModuleGroupMetadata::Build() {
+  TF_RETURN_IF_ERROR(RecordInstructions());
+  TF_RETURN_IF_ERROR(VerifyChannelInstructions());
+
+  // Record all companion while instructions.
+  const auto visitor = [this](HloInstruction* hlo) -> Status {
+    // We only need to process if the instruction is within the computation
+    // of a companion instruction, like in the condition or body computation
+    // of a While.
+    const TrackedInstruction* tracked = GetTrackedInstruction(hlo->parent());
+    if (tracked == nullptr) {
+      return Status::OK();
+    }
+    // Add the parent computation of this channel instruction and its peer
+    // computation (both must be while computations) as companions.
+    if (IsChannelInstruction(hlo)) {
+      HloComputation* peer_computation = PeerComputation(hlo);
+      const TrackedInstruction* peer_tracked =
+          GetTrackedInstruction(peer_computation);
+      TF_RET_CHECK(peer_tracked != nullptr)
+          << "Peer instruction is not a possible companion";
+      TF_RET_CHECK(*tracked == *peer_tracked)
+          << "Peer instruction does not match the computation kind";
+      TF_RETURN_IF_ERROR(
+          AddCompanion(tracked->instruction(), peer_tracked->instruction()));
+    }
+
+    // Add the parents of companion instructions (they must be all of the same
+    // kind of instructions, opcode wise) as companions.
+    if (IsCompanionInstruction(hlo)) {
+      for (HloInstruction* companion : Companions(hlo)) {
+        const TrackedInstruction* companion_tracked =
+            GetTrackedInstruction(companion->parent());
+        TF_RET_CHECK(companion_tracked != nullptr);
+        TF_RET_CHECK(*tracked == *companion_tracked);
+        TF_RETURN_IF_ERROR(AddCompanion(tracked->instruction(),
+                                        companion_tracked->instruction()));
+      }
+    }
+    return Status::OK();
+  };
+
+  // Visit the computations in postorder so that the companion information grows
+  // from inner computations to outer ones.
+  for (HloModule* module : modules_) {
+    for (HloComputation* computation : module->MakeComputationPostOrder()) {
+      TF_RETURN_IF_ERROR(computation->Accept(visitor));
+    }
+  }
+  return Status::OK();
+}
+
+bool HloModuleGroupMetadata::IsChannelInstruction(
+    const HloInstruction* instruction) const {
+  switch (instruction->opcode()) {
+    case HloOpcode::kSend:
+    case HloOpcode::kRecv:
+    case HloOpcode::kSendDone:
+    case HloOpcode::kRecvDone:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool HloModuleGroupMetadata::IsCompanionInstruction(HloInstruction* hlo) const {
+  return companion_set_index_.count(hlo) > 0;
+}
+
+bool HloModuleGroupMetadata::InstructionCommunicates(
+    HloInstruction* hlo) const {
+  return IsChannelInstruction(hlo) || IsCompanionInstruction(hlo);
+}
+
+const HloModuleGroupMetadata::Channel& HloModuleGroupMetadata::GetChannel(
+    int64 channel_id) const {
+  CHECK(channel_id_map_.find(channel_id) != channel_id_map_.end());
+  return channels_[channel_id_map_.at(channel_id)];
+}
+
+HloComputation* HloModuleGroupMetadata::PeerComputation(
+    const HloInstruction* instruction) const {
+  CHECK(IsChannelInstruction(instruction));
+  const Channel& channel = GetChannel(instruction->channel_id());
+  switch (instruction->opcode()) {
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
+      return channel.recv->parent();
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+      return channel.send->parent();
+    default:
+      LOG(FATAL) << "opcode not supported";
+  }
+}
+
+std::vector<HloModuleGroupMetadata::TrackedInstruction>
+HloModuleGroupMetadata::GetCompanionsPath(const HloInstruction* hlo) const {
+  std::vector<TrackedInstruction> path;
+  const HloComputation* parent = hlo->parent();
+  const TrackedInstruction* companion;
+  while ((companion = GetTrackedInstruction(parent)) != nullptr) {
+    parent = companion->instruction()->parent();
+    path.push_back(*companion);
+  }
+  return path;
+}
+
+bool HloModuleGroupMetadata::CheckCompanionPathsCompatibility(
+    const std::vector<TrackedInstruction>& path0,
+    const std::vector<TrackedInstruction>& path1) const {
+  if (path0.size() != path1.size()) {
+    VLOG(5) << "Companion path size do not match: " << path0.size()
+            << " != " << path1.size();
+    return false;
+  }
+  for (int64 i = 0; i < path0.size(); ++i) {
+    if (path0[i] != path1[i]) {
+      VLOG(5) << "Companion instructions at path index " << i
+              << " do not have the same opcode: " << path0[i].ToString()
+              << " vs " << path1[i].ToString();
+      return false;
+    }
+  }
+  return true;
+}
+
+int64 HloModuleGroupMetadata::GetModuleId(const HloModule* module) const {
+  for (int64 i = 0; i < modules_.size(); ++i) {
+    if (modules_[i] == module) {
+      return i;
+    }
+  }
+  LOG(FATAL) << "unknown module";
+}
+
+Status HloModuleGroupMetadata::RecordInstructions() {
+  const auto visitor = [this](HloInstruction* hlo) -> Status {
+    if (hlo->opcode() == HloOpcode::kWhile) {
+      tracked_instructions_[hlo->while_condition()] =
+          TrackedInstruction(hlo, ComputationKind::kWhileCondition);
+      tracked_instructions_[hlo->while_body()] =
+          TrackedInstruction(hlo, ComputationKind::kWhileBody);
+    } else if (hlo->opcode() == HloOpcode::kConditional) {
+      tracked_instructions_[hlo->true_computation()] =
+          TrackedInstruction(hlo, ComputationKind::kConditionalTrue);
+      tracked_instructions_[hlo->false_computation()] =
+          TrackedInstruction(hlo, ComputationKind::kConditionalFalse);
+    }
+    if (!IsChannelInstruction(hlo)) {
+      return Status::OK();
+    }
+
+    // Add a new channel if needed.
+    if (channel_id_map_.find(hlo->channel_id()) == channel_id_map_.end()) {
+      channels_.emplace_back();
+      channels_.back().id = hlo->channel_id();
+      channel_id_map_[hlo->channel_id()] = channels_.size() - 1;
+      max_channel_id_ = std::max(max_channel_id_, hlo->channel_id());
+    }
+    Channel& channel = channels_[channel_id_map_[hlo->channel_id()]];
+
+    if (hlo->opcode() == HloOpcode::kSend) {
+      TF_RET_CHECK(channel.send == nullptr)
+          << "channel id " << hlo->channel_id()
+          << " is used by multiple send instructions";
+      channel.send = hlo;
+    }
+    if (hlo->opcode() == HloOpcode::kRecv) {
+      TF_RET_CHECK(channel.recv == nullptr)
+          << "channel id " << hlo->channel_id()
+          << " is used by multiple recv instructions";
+      channel.recv = hlo;
+    }
+    if (hlo->opcode() == HloOpcode::kSendDone) {
+      TF_RET_CHECK(channel.send_done == nullptr)
+          << "channel id " << hlo->channel_id()
+          << " is used by multiple send-done instructions";
+      channel.send_done = hlo;
+    }
+    if (hlo->opcode() == HloOpcode::kRecvDone) {
+      TF_RET_CHECK(channel.recv_done == nullptr)
+          << "channel id " << hlo->channel_id()
+          << " is used by multiple recv-done instructions";
+      channel.recv_done = hlo;
+    }
+    return Status::OK();
+  };
+
+  for (HloModule* module : modules_) {
+    for (auto* computation : module->computations()) {
+      TF_RETURN_IF_ERROR(computation->Accept(visitor));
+    }
+  }
+  return Status::OK();
+}
+
+Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
+                                            HloInstruction* instruction2) {
+  TF_RET_CHECK(instruction1->opcode() == HloOpcode::kWhile ||
+               instruction1->opcode() == HloOpcode::kConditional);
+  VLOG(2) << "adding as companions:" << instruction1->ToString() << " and "
+          << instruction2->ToString();
+
+  if (!ContainsKey(companion_set_index_, instruction1) &&
+      !ContainsKey(companion_set_index_, instruction2)) {
+    companion_sets_.push_back(
+        absl::make_unique<std::unordered_set<HloInstruction*>>());
+    auto companion_set = companion_sets_.back().get();
+    companion_set->insert(instruction1);
+    companion_set->insert(instruction2);
+    companion_set_index_[instruction1] = companion_sets_.size() - 1;
+    companion_set_index_[instruction2] = companion_sets_.size() - 1;
+  } else if (!ContainsKey(companion_set_index_, instruction1)) {
+    companion_sets_[companion_set_index_[instruction2]]->insert(instruction1);
+    companion_set_index_[instruction1] = companion_set_index_[instruction2];
+  } else if (!ContainsKey(companion_set_index_, instruction2)) {
+    companion_sets_[companion_set_index_[instruction1]]->insert(instruction2);
+    companion_set_index_[instruction2] = companion_set_index_[instruction1];
+  } else if (companion_set_index_[instruction1] !=
+             companion_set_index_[instruction2]) {
+    companion_sets_[companion_set_index_[instruction1]]->insert(
+        Companions(instruction2).begin(), Companions(instruction2).end());
+    int64 index_to_remove = companion_set_index_[instruction2];
+    for (HloInstruction* hlo : Companions(instruction2)) {
+      companion_set_index_[hlo] = companion_set_index_[instruction1];
+    }
+    companion_sets_.erase(companion_sets_.begin() + index_to_remove);
+  }
+  return Status::OK();
+}
+
+Status HloModuleGroupMetadata::VerifyChannelInstructions() {
+  for (const Channel& channel : channels_) {
+    if (channel.send == nullptr) {
+      return FailedPrecondition("missing send for id : %lld", channel.id);
+    }
+    if (channel.recv == nullptr) {
+      return FailedPrecondition("missing recv for id : %lld", channel.id);
+    }
+    if (channel.send_done == nullptr) {
+      return FailedPrecondition("missing send-done for id : %lld", channel.id);
+    }
+    if (channel.recv_done == nullptr) {
+      return FailedPrecondition("missing recv-done for id : %lld", channel.id);
+    }
+  }
+
+  // Check if the shapes match for each channel.
+  for (const Channel& channel : channels_) {
+    const Shape& send_shape = channel.send->operand(0)->shape();
+    const Shape& recv_shape = channel.recv_done->shape();
+    if (!ShapeUtil::Compatible(send_shape, recv_shape)) {
+      return FailedPrecondition("send/recv shapes do not match");
+    }
+    const HloModule* send_module = channel.send->parent()->parent();
+    const HloModule* send_done_module = channel.send_done->parent()->parent();
+    if (send_module != send_done_module) {
+      return FailedPrecondition(
+          "send and send-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(send_module), GetModuleId(send_done_module));
+    }
+    const HloModule* recv_module = channel.recv->parent()->parent();
+    const HloModule* recv_done_module = channel.recv_done->parent()->parent();
+    if (recv_module != recv_done_module) {
+      return FailedPrecondition(
+          "recv and recv-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(recv_module), GetModuleId(recv_done_module));
+    }
+    if (send_module == recv_module) {
+      return FailedPrecondition(
+          "send and recv (channel=%lld) must be on different devices: %lld",
+          channel.id, GetModuleId(send_module));
+    }
+  }
+
+  // Check if channel instructions are used only in allowed computations.
+  const auto allowed = [this](HloInstruction* hlo) {
+    HloComputation* computation = hlo->parent();
+    const HloModule* module = computation->parent();
+    if (module->entry_computation() == computation ||
+        tracked_instructions_.count(computation) > 0) {
+      return true;
+    }
+    return false;
+  };
+  for (const Channel& channel : channels_) {
+    if (!allowed(channel.send) || !allowed(channel.send_done) ||
+        !allowed(channel.recv) || !allowed(channel.recv_done)) {
+      return FailedPrecondition("channel is used in disallowed computation");
+    }
+  }
+  // Check if the nest levels match for each channel.
+  for (const Channel& channel : channels_) {
+    std::vector<TrackedInstruction> path = GetCompanionsPath(channel.send);
+    if (!CheckCompanionPathsCompatibility(
+            path, GetCompanionsPath(channel.send_done)) ||
+        !CheckCompanionPathsCompatibility(path,
+                                          GetCompanionsPath(channel.recv)) ||
+        !CheckCompanionPathsCompatibility(
+            path, GetCompanionsPath(channel.recv_done))) {
+      return FailedPrecondition(
+          "Nest companion paths do not match for channel %lld", channel.id);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..c48a7ab0b59269474f7406ef24a249355528e085
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -0,0 +1,239 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_GROUP_METADATA_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_GROUP_METADATA_H_
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// Class for bookkeeping the information on the given modules, in particular on
+// the interaction between computations.
+//
+// Companion instructions are one of the information collected as we build the
+// metadata. For example, for each While instruction, companion instructions
+// refer to a set of While instructions in other computations that communicate
+// with each other.
+// In the example below with 3 modules, {While_0, While_2, While_5}, {While_1,
+// While_4}, {While_3, While_6} are companion sets.
+//
+// <Module 0>               <Module 1>                 <Module 2>
+// While_0() {              While_2() {                While_5() {
+//   While_1() { Send(0) }    While_3() { Send(1) }      While_6() { Recv(1) }
+// }                          While_4() { Recv(0) }
+//                          }
+//
+// Companion instructions are used to detect cycles in the graph and also for
+// global scheduling.
+class HloModuleGroupMetadata {
+ public:
+  // The kind of companion computation a given instruction can be within.
+  enum class ComputationKind {
+    kInvalid,
+    kWhileCondition,
+    kWhileBody,
+    kConditionalTrue,
+    kConditionalFalse,
+  };
+
+  // Tracks the instruction mapped to a given computation, and the computation
+  // kind.
+  // For example, a body computation of a while instruction, will generate a
+  // TrackedInstruction with instruction being the while instruction, and
+  // kind being ComputationKind::kWhileBody.
+  class TrackedInstruction {
+   public:
+    TrackedInstruction() = default;
+    TrackedInstruction(HloInstruction* instruction, ComputationKind kind)
+        : instruction_(instruction), kind_(kind) {}
+
+    bool operator==(const TrackedInstruction& rhs) const {
+      return instruction_->opcode() == rhs.instruction_->opcode() &&
+             kind_ == rhs.kind_;
+    }
+    bool operator!=(const TrackedInstruction& rhs) const {
+      return !operator==(rhs);
+    }
+
+    HloInstruction* instruction() const { return instruction_; }
+
+    string ToString() const;
+
+   private:
+    HloInstruction* instruction_ = nullptr;
+    ComputationKind kind_ = ComputationKind::kInvalid;
+  };
+
+  // Represents a channel and the 4 instructions that form the channel.
+  struct Channel {
+    int64 id = -1;
+    HloInstruction* send = nullptr;
+    HloInstruction* recv = nullptr;
+    HloInstruction* send_done = nullptr;
+    HloInstruction* recv_done = nullptr;
+  };
+
+  explicit HloModuleGroupMetadata(const std::vector<HloModule*>& modules)
+      : modules_(modules) {}
+
+  ~HloModuleGroupMetadata() = default;
+
+  // Build and return the metadata for the given modules.
+  static StatusOr<std::unique_ptr<HloModuleGroupMetadata>> Build(
+      const std::vector<HloModule*>& modules);
+
+  // Returns true if the instruction is one of the 4 channel instructions (Send,
+  // Recv, SendDone, RecvDone).
+  bool IsChannelInstruction(const HloInstruction* instruction) const;
+
+  // Returns true if the instruction is a companion instruction. See the class
+  // comment above on companion instructions.
+  bool IsCompanionInstruction(HloInstruction* hlo) const;
+
+  // Returns true if the instruction is either a channel instruction or a
+  // companion instruction.
+  bool InstructionCommunicates(HloInstruction* hlo) const;
+
+  // Returns the Channel instance for the given channel id.
+  const Channel& GetChannel(int64 channel_id) const;
+
+  // Returns the computation that contains the peer channel instructions for
+  // the given instruction.
+  //
+  // Precondition: IsChannelInstruction(instruction) is true.
+  HloComputation* PeerComputation(const HloInstruction* instruction) const;
+
+  // Returns the path of the nested companion instructions, in terms of HLO
+  // instructions. The path goes from inner to outer companions.
+  // The returned path does not include the input hlo instruction, in case it
+  // is a companion instruction.
+  std::vector<TrackedInstruction> GetCompanionsPath(
+      const HloInstruction* hlo) const;
+
+  // Checks whether two companion paths (as returned by the GetCompanionsPath()
+  // API) are compatible. The two paths are compatible if the sequence of
+  // opcodes, and the companion kinds, of the two paths matches.
+  bool CheckCompanionPathsCompatibility(
+      const std::vector<TrackedInstruction>& path0,
+      const std::vector<TrackedInstruction>& path1) const;
+
+  // Returns the unique integer for each module. The returned id is the index of
+  // the module in the module vector.
+  int64 GetModuleId(const HloModule* module) const;
+
+  // Returns the companion instructions for the given instruction.
+  //
+  // Precondition: IsCompanionWhile(instruction) is true.
+  const std::unordered_set<HloInstruction*>& Companions(
+      HloInstruction* instruction) const {
+    CHECK_EQ(companion_set_index_.count(instruction), 1);
+    return companion_set(companion_set_index_.at(instruction));
+  }
+
+  // Returns the companion set at the given index.
+  const std::unordered_set<HloInstruction*>& companion_set(int64 index) const {
+    CHECK_LT(index, companion_sets_.size());
+    return *companion_sets_[index];
+  }
+
+  // Returns the companion set index of the given instruction.
+  int64 companion_set_index(HloInstruction* instruction) const {
+    return companion_set_index_.at(instruction);
+  }
+
+  // Returns the list of all companion sets in the HLO module group.
+  const std::vector<std::unique_ptr<std::unordered_set<HloInstruction*>>>&
+  companion_sets() const {
+    return companion_sets_;
+  }
+
+  // Returns all channels in the module group.
+  const std::vector<Channel>& channels() const { return channels_; }
+
+  // Returns the maximum channel id used in the module group.
+  int64 max_channel_id() const { return max_channel_id_; }
+
+ private:
+  Status Build();
+
+  // Record all channel instructions and While instructions.
+  Status RecordInstructions();
+
+  // Verifies the given HloModules are well-formed and follow the specification,
+  // in particular with respect to using channel instructions.
+  //
+  // * Each channel has all 4 instructions (Send, Recv, SendDone, RecvDone).
+  // * The shape of channel instructions match.
+  // * The nest level of channel instructions match.
+  // * Channel instructions are used in allowed computations; i.e., in the
+  //   entry computation of the module or condition/body of While computations.
+  //
+  // TODO(b/62064342): Currently, HloModuleGroupScheduler checks if there is a
+  // cycle in the graph, but it would be good to verify here.
+  Status VerifyChannelInstructions();
+
+  // Adds metadata that the given two instructions are companions.
+  Status AddCompanion(HloInstruction* instruction1,
+                      HloInstruction* instruction2);
+
+  // Retrieves a pointer to the stored TrackedInstruction associated with a
+  // tracked computation, or nullptr in case such computation is not tracked.
+  const TrackedInstruction* GetTrackedInstruction(
+      const HloComputation* computation) const {
+    auto it = tracked_instructions_.find(computation);
+    return it != tracked_instructions_.end() ? &it->second : nullptr;
+  }
+
+  // List of all companion instructions sets in the module.
+  std::vector<std::unique_ptr<std::unordered_set<HloInstruction*>>>
+      companion_sets_;
+
+  // Map from each companion while instruction to the index into companion_set_.
+  tensorflow::gtl::FlatMap<HloInstruction*, int64> companion_set_index_;
+
+  // Map from computation to the instruction using it (a kWhile, kConditional).
+  tensorflow::gtl::FlatMap<const HloComputation*, TrackedInstruction>
+      tracked_instructions_;
+
+  // All channels in the module.
+  std::vector<Channel> channels_;
+
+  // Map from channel ids to the index in channels_.
+  tensorflow::gtl::FlatMap<int64, int64> channel_id_map_;
+
+  // The maximum channel id used in the module group.
+  int64 max_channel_id_ = -1;
+
+  // The modules that this metadata was built from.
+  const std::vector<HloModule*>& modules_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_GROUP_METADATA_H_
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..289c96b0a7b90c5f8a122cd3fc327a5762099106
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -0,0 +1,316 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_module_group_util.h"
+
+#include <algorithm>
+#include <list>
+#include <queue>
+#include <stack>
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+std::vector<HloInstruction*> HloModuleGroupUtil::GlobalPredecessors(
+    HloInstruction* instruction) {
+  std::vector<HloInstruction*> predecessors;
+
+  // Adds to the unique predecessors list and also add companion instructions
+  // if the given predecessor has those.
+  auto add_unique_predecessor = [&](HloInstruction* predecessor) {
+    if (std::find(predecessors.begin(), predecessors.end(), predecessor) !=
+        predecessors.end()) {
+      return;
+    }
+    if (!metadata_.IsCompanionInstruction(predecessor)) {
+      predecessors.push_back(predecessor);
+      return;
+    }
+    for (HloInstruction* companion : metadata_.Companions(predecessor)) {
+      predecessors.push_back(companion);
+    }
+  };
+
+  // If the given instruction is a companion instruction, we need to find the
+  // predecessors of all of its companion instructions.
+  std::vector<HloInstruction*> instruction_group;
+  if (metadata_.IsCompanionInstruction(instruction)) {
+    for (HloInstruction* companion : metadata_.Companions(instruction)) {
+      instruction_group.push_back(companion);
+    }
+  } else {
+    instruction_group.push_back(instruction);
+  }
+
+  for (HloInstruction* hlo : instruction_group) {
+    for (HloInstruction* operand : hlo->operands()) {
+      add_unique_predecessor(operand);
+    }
+    for (HloInstruction* control_predecessor : hlo->control_predecessors()) {
+      add_unique_predecessor(control_predecessor);
+    }
+  }
+  if (instruction->opcode() == HloOpcode::kRecvDone) {
+    // Send is a remote predecessor of RecvDone.
+    HloInstruction* send = metadata_.GetChannel(instruction->channel_id()).send;
+    add_unique_predecessor(send);
+  }
+  if (instruction->opcode() == HloOpcode::kSend) {
+    // Recv is a remote predecessor of Send.
+    HloInstruction* recv_done =
+        metadata_.GetChannel(instruction->channel_id()).recv_done;
+    CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
+    CHECK_EQ(recv_done->operand_count(), 1);
+    HloInstruction* recv = recv_done->mutable_operand(0);
+    add_unique_predecessor(recv);
+  }
+  return predecessors;
+}
+
+std::vector<HloInstruction*> HloModuleGroupUtil::GlobalSuccessors(
+    HloInstruction* instruction) {
+  std::vector<HloInstruction*> successors;
+
+  // Adds to the unique successors list and also add companion instructions
+  // if the given successor has those.
+  auto add_unique_successor = [&](HloInstruction* successor) {
+    if (std::find(successors.begin(), successors.end(), successor) !=
+        successors.end()) {
+      return;
+    }
+    if (!metadata_.IsCompanionInstruction(successor)) {
+      successors.push_back(successor);
+      return;
+    }
+    for (HloInstruction* companion : metadata_.Companions(successor)) {
+      successors.push_back(companion);
+    }
+  };
+
+  // If the given instruction is a companion instruction, we need to find the
+  // successors of all of its companion instructions.
+  std::vector<HloInstruction*> instruction_group;
+  if (metadata_.IsCompanionInstruction(instruction)) {
+    for (HloInstruction* companion : metadata_.Companions(instruction)) {
+      instruction_group.push_back(companion);
+    }
+  } else {
+    instruction_group.push_back(instruction);
+  }
+
+  for (HloInstruction* hlo : instruction_group) {
+    for (HloInstruction* user : hlo->users()) {
+      add_unique_successor(user);
+    }
+    for (HloInstruction* control_successor : hlo->control_successors()) {
+      add_unique_successor(control_successor);
+    }
+  }
+  if (instruction->opcode() == HloOpcode::kRecv) {
+    // Send is a remote successor of Recv.
+    const HloInstruction* recv_done = instruction->users().front();
+    CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
+    HloInstruction* send = metadata_.GetChannel(instruction->channel_id()).send;
+    add_unique_successor(send);
+  }
+  if (instruction->opcode() == HloOpcode::kSend) {
+    // RecvDone is a remote successor of Send.
+    HloInstruction* recv_done =
+        metadata_.GetChannel(instruction->channel_id()).recv_done;
+    add_unique_successor(recv_done);
+  }
+  return successors;
+}
+
+std::vector<HloInstruction*> HloModuleGroupUtil::RootInstructions(
+    tensorflow::gtl::ArraySlice<HloComputation*> computations) {
+  std::vector<HloInstruction*> roots;
+  for (HloComputation* computation : computations) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (GlobalSuccessors(instruction).empty()) {
+        roots.push_back(instruction);
+      }
+    }
+  }
+  return roots;
+}
+
+Status HloModuleGroupUtil::VisitTopologicalOrder(
+    VisitStates* visit_state, const VisitFunction& visit_function,
+    HloInstruction* root) {
+  // Stack of HLO instructions visited in DFS order.
+  std::stack<HloInstruction*> stack;
+  stack.push(root);
+
+  while (!stack.empty()) {
+    HloInstruction* hlo = stack.top();
+
+    // Find the instruction group of the currently visited instruction. The
+    // instruction group represents all companion instructions of the
+    // current instruction, and are considered to be a single entity for the
+    // purpose of the traversal (i.e., they must always be in the same visit
+    // state).
+    std::vector<HloInstruction*> instruction_group;
+    if (metadata_.IsCompanionInstruction(hlo)) {
+      for (HloInstruction* companion : metadata_.Companions(hlo)) {
+        instruction_group.push_back(companion);
+      }
+    } else {
+      instruction_group.push_back(hlo);
+    }
+
+    if ((*visit_state)[hlo] == VisitState::kVisited) {
+      // All instructions in the group must be in the same state.
+      for (HloInstruction* instruction : instruction_group) {
+        TF_RET_CHECK((*visit_state)[instruction] == VisitState::kVisited);
+      }
+      stack.pop();
+      continue;
+    }
+
+    if ((*visit_state)[hlo] == VisitState::kVisiting) {
+      TF_RETURN_IF_ERROR(visit_function(hlo, instruction_group));
+
+      // Set the visit state of all instructions in the group to kVisited.
+      for (HloInstruction* instruction : instruction_group) {
+        TF_RET_CHECK((*visit_state)[instruction] == VisitState::kVisiting);
+        (*visit_state)[instruction] = VisitState::kVisited;
+      }
+      stack.pop();
+      continue;
+    }
+
+    // Set the visit state of all instructions in the group to kVisiting.
+    for (HloInstruction* instruction : instruction_group) {
+      TF_RET_CHECK((*visit_state)[instruction] == VisitState::kNotVisited)
+          << instruction->ToString();
+      (*visit_state)[instruction] = VisitState::kVisiting;
+    }
+
+    // For each instruction in the group, visit its predecessors (operands,
+    // control predecessors and remote predecessors).
+    for (HloInstruction* instruction : instruction_group) {
+      for (HloInstruction* predecessor : GlobalPredecessors(instruction)) {
+        // Visiting a node that is already being visited implies that there is
+        // a cycle. Generate an error with the list of instructions in the
+        // cycle.
+        if ((*visit_state)[predecessor] == VisitState::kVisiting) {
+          string cyclic_instructions;
+          for (const auto& state : *visit_state) {
+            if (state.second == VisitState::kVisiting) {
+              tensorflow::strings::StrAppend(&cyclic_instructions,
+                                             state.first->ToString(), "\n");
+            }
+          }
+          // TODO(b/64305524): Improve the error message to print out the
+          // instructions in a deterministic order that forms the cycle.
+          return FailedPrecondition(
+              "Cross-computation cycle detected via communicating nodes. The "
+              "cycle contains the node %s. The cycle is found among the "
+              "following nodes. Note that the order of the nodes is arbitrary "
+              "and that the list may include nodes that are not part of the "
+              "cycle.\n%s",
+              predecessor->ToString().c_str(), cyclic_instructions.c_str());
+        }
+        stack.push(predecessor);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status HloModuleGroupUtil::VerifyComputations(
+    tensorflow::gtl::ArraySlice<HloComputation*> computations) {
+  auto visit_function =
+      [&](HloInstruction* instruction,
+          const std::vector<HloInstruction*>& instruction_group) {
+        return Status::OK();
+      };
+  int64 instructions_count = 0;
+  VisitStates visit_states;
+  for (HloComputation* computation : computations) {
+    // Visit all instructions, and not just from the root instruction of the
+    // computation. This allows us to detect dead cycles (i.e., cycles that
+    // are not reachable from the root) or to enforce an order for the
+    // communication instructions that are not reachable from any roots.
+    for (HloInstruction* instruction : computation->instructions()) {
+      TF_RETURN_IF_ERROR(
+          VisitTopologicalOrder(&visit_states, visit_function, instruction));
+    }
+    instructions_count += computation->instruction_count();
+  }
+
+  // Check if all instructions are visited and are in the visited state.
+  TF_RET_CHECK(visit_states.size() == instructions_count);
+  for (auto& state : visit_states) {
+    TF_RET_CHECK(state.second == VisitState::kVisited);
+  }
+
+  return Status::OK();
+}
+
+StatusOr<std::unique_ptr<HloReachabilityMap>>
+HloModuleGroupUtil::ComputeReachability(
+    tensorflow::gtl::ArraySlice<HloComputation*> computations) {
+  std::list<HloInstruction*> post_order;
+  auto visit_function =
+      [&](HloInstruction* instruction,
+          const std::vector<HloInstruction*>& instruction_group) {
+        post_order.insert(post_order.end(), instruction_group.begin(),
+                          instruction_group.end());
+        return Status::OK();
+      };
+  HloModuleGroupUtil::VisitStates visit_states;
+  for (HloInstruction* root : RootInstructions(computations)) {
+    TF_RETURN_IF_ERROR(
+        VisitTopologicalOrder(&visit_states, visit_function, root));
+  }
+  auto reachability = absl::make_unique<HloReachabilityMap>(post_order);
+  for (HloInstruction* hlo : post_order) {
+    reachability->SetReachabilityToUnion(GlobalPredecessors(hlo), hlo);
+  }
+  return std::move(reachability);
+}
+
+void HloModuleGroupUtil::UpdateReachabilityThroughInstruction(
+    HloInstruction* instruction, HloReachabilityMap* reachability_map) {
+  std::queue<HloInstruction*> worklist;
+  worklist.push(instruction);
+
+  while (!worklist.empty()) {
+    HloInstruction* item = worklist.front();
+    worklist.pop();
+    if (reachability_map->SetReachabilityToUnion(GlobalPredecessors(item),
+                                                 item)) {
+      for (HloInstruction* successor : GlobalSuccessors(item)) {
+        worklist.push(successor);
+      }
+    }
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.h b/tensorflow/compiler/xla/service/hlo_module_group_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..c25ca1aff50b288f3ac3885cbed53e7ba9768430
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.h
@@ -0,0 +1,117 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_GROUP_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_GROUP_UTIL_H_
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_group_metadata.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace xla {
+
+// Collection of utilities for handling HloModuleGroups.
+class HloModuleGroupUtil {
+ public:
+  explicit HloModuleGroupUtil(const HloModuleGroupMetadata& metadata)
+      : metadata_(metadata) {}
+
+  // Returns all unique predecessors of the instruction. This includes:
+  // * predecessors in the same computation: operands and control predecessors
+  // * Recv is a predecessor of Send
+  // * Send is a predecessor of RecvDone
+  // * predecessors of companions (if the instruction is a companion while)
+  // * predecessors' companions (for any predecessor that is a companion while)
+  std::vector<HloInstruction*> GlobalPredecessors(HloInstruction* instruction);
+
+  // Returns all unique successors of the instruction. This includes:
+  // * successors in the same computation: users and control successors
+  // * Send is a successor of Recv
+  // * RecvDone is a predecessor of Send
+  // * successors of companions (if the instruction is a companion while)
+  // * successors' companions (for any successor that is a companion while)
+  std::vector<HloInstruction*> GlobalSuccessors(HloInstruction* instruction);
+
+  // Returns the root instructions of the computations.
+  std::vector<HloInstruction*> RootInstructions(
+      tensorflow::gtl::ArraySlice<HloComputation*> computations);
+
+  // Visit state of each instruction during DFS traversal.
+  enum VisitState {
+    kNotVisited = 0,
+    kVisiting,
+    kVisited,
+  };
+
+  // Function called on each instruction group during the DFS traversal. See the
+  // comment for VisitTopologicalOrder()).
+  using VisitFunction = std::function<Status(
+      HloInstruction* hlo,
+      const std::vector<HloInstruction*>& instruction_group)>;
+
+  // Given the hlo instruction as the root, recursively visits all its
+  // predecessor instructions in DFS order to visit nodes in topological order.
+  //
+  // Note that the DFS traversal does not only visit nodes in the same
+  // computation (parent of the root instruction), but also visits nodes in
+  // different computations connected via communication instructions. During the
+  // traversal, companion While instructions (see the class comment in
+  // HloModuleGroupMetadata) are treated as a single instruction (called
+  // instruction group, which contains only a single instruction if the visiting
+  // node is not a companion while) -- visiting one of the instructions in the
+  // group effectively visits all other instructions in the group, and then all
+  // predecessor instructions of the group are visited.
+  //
+  // * visit_state: map from each instruction to its visit state.
+  // * visit_function: function called when each instruction group.
+  // * root: the root instruction of the traversal.
+  using VisitStates = tensorflow::gtl::FlatMap<HloInstruction*, VisitState>;
+  Status VisitTopologicalOrder(VisitStates* visit_state,
+                               const VisitFunction& visit_function,
+                               HloInstruction* root);
+
+  // Verifies that the computations are well-formed (e.g., no cycles).
+  Status VerifyComputations(
+      tensorflow::gtl::ArraySlice<HloComputation*> computations);
+
+  // Below Reachability utils resemble those in HloComputation, except that
+  // they can handle instructions across multiple computations.
+  //
+  // Creates the reachability map for the instructions in the computations.
+  StatusOr<std::unique_ptr<HloReachabilityMap>> ComputeReachability(
+      tensorflow::gtl::ArraySlice<HloComputation*> computations);
+
+  // Updates the reachability of the given instruction, taking the global
+  // predeccessorss and successors into account.
+  void UpdateReachabilityThroughInstruction(
+      HloInstruction* instruction, HloReachabilityMap* reachability_map);
+
+ private:
+  const HloModuleGroupMetadata& metadata_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_GROUP_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index af24604c39b554f146793594958f373999844b4c..ca763076a16af1150a8623fb7dbf22c46a5ca263 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -57,6 +57,7 @@ namespace xla {
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
+  V(kClz, "count-leading-zeros")                             \
   V(kComplex, "complex")                                     \
   V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)       \
   V(kConditional, "conditional")                             \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 1b24d8da9e832e6847cb6f405e15af3c455f695a..e89d94bede6c437ca1131a1b1b0098390d58c0d9 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -66,6 +66,28 @@ bool HloOrdering::ExecutesBefore(const HloInstruction* a,
     }
   }
 
+  // If the common ancestor is a conditional instruction, even though the true
+  // and false computations are not really ordered per-se, we define the true
+  // computation to be ordered before the false one.
+  // This ensures that buffers can still be shared among the two computations
+  // as they will forcibly have disjoint liveness.
+  if (a_ancestor == b_ancestor &&
+      a_ancestor->opcode() == HloOpcode::kConditional) {
+    const HloComputation* true_computation = a_ancestor->true_computation();
+    const HloComputation* false_computation = a_ancestor->false_computation();
+    if (call_graph_->InstructionIsNestedIn(a, true_computation) &&
+        call_graph_->InstructionIsNestedIn(b, false_computation)) {
+      return true;
+    }
+    // If 'b' is the conditional ancestor, and 'a' is within the true or false
+    // computations, 'a' executes before 'b'.
+    if (b == a_ancestor &&
+        (call_graph_->InstructionIsNestedIn(a, true_computation) ||
+         call_graph_->InstructionIsNestedIn(a, false_computation))) {
+      return true;
+    }
+  }
+
   return ExecutesBeforeInSameComputation(a_ancestor, b_ancestor);
 }
 
@@ -118,7 +140,18 @@ bool HloOrdering::IsDefinedBefore(const HloValue& a, const HloValue& b) const {
            b.defining_instruction()->while_condition()))) {
     return true;
   }
-
+  // If 'b' is a conditional phi and 'a' is in the true or false computation,
+  // then 'a' executes before 'b'.
+  if (b.is_phi() &&
+      b.defining_instruction()->opcode() == HloOpcode::kConditional &&
+      (call_graph_->InstructionIsNestedIn(
+           a.defining_instruction(),
+           b.defining_instruction()->true_computation()) ||
+       call_graph_->InstructionIsNestedIn(
+           a.defining_instruction(),
+           b.defining_instruction()->false_computation()))) {
+    return true;
+  }
   return ExecutesBefore(a.defining_instruction(), b.defining_instruction());
 }
 
@@ -212,18 +245,17 @@ bool HloOrdering::LiveRangeStrictlyBefore(
   VLOG(4) << "LiveRangeStrictlyBefore(a = " << a.ToShortString()
           << ", b = " << b.ToShortString() << ")";
   if (!IsDefinedBefore(a, b)) {
-    VLOG(4) << "a not defined before b";
+    VLOG(4) << a << " not defined before " << b;
     return false;
   }
-
   // All uses of 'a' must be before 'b' is defined.
   for (const HloUse& use : a.uses()) {
     if (!UseIsBeforeValueDefinition(use, b, dataflow)) {
-      VLOG(4) << "use of a (" << use << ") not before b is defined";
+      VLOG(4) << "use of " << a << " (" << use << ") not before " << b
+              << " is defined";
       return false;
     }
   }
-
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index a989fce63234cb860d08c48b02462e96bec879bc..37a7fbad97cea2f34798efecc2489e57d1374f35 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -34,53 +34,6 @@ namespace {
 
 class HloOrderingTest : public HloTestBase {};
 
-TEST_F(HloOrderingTest, LastUseScheduledFirst) {
-  // Tests scheduling of the following HLO code:
-  //
-  //   %ab = abs(%param)
-  //   %exp = exp(%param)
-  //   %add = add(%ab, %exp)
-  //   %negate = negate(%exp)
-  //   %sub = subtract(%add, %negate)
-  //
-  // %add should be scheduled before %negate because %add is the last (and only)
-  // use of %ab. Scheduling %add first then frees up %ab's buffer.
-  const Shape vec = ShapeUtil::MakeShape(xla::F32, {42});
-  auto builder = HloComputation::Builder(TestName());
-  auto param =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, vec, "param"));
-  auto ab = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec, HloOpcode::kAbs, param));
-  auto exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec, HloOpcode::kExp, param));
-
-  auto add = builder.AddInstruction(
-      HloInstruction::CreateBinary(vec, HloOpcode::kAdd, ab, exp));
-  auto negate = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec, HloOpcode::kNegate, exp));
-  auto sub = builder.AddInstruction(
-      HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
-
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      SequentialHloOrdering::HloModuleSequence sequence,
-      CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) {
-        return ShapeUtil::ByteSizeOf(buffer.shape());
-      }));
-  // Verify that all instructions are in the sequence.
-  EXPECT_EQ(module->entry_computation()->instruction_count(),
-            sequence.at(module->entry_computation()).size());
-
-  // The first instruction should be the parameter and the last the root "sub".
-  EXPECT_EQ(param, sequence.at(module->entry_computation()).front());
-  EXPECT_EQ(sub, sequence.at(module->entry_computation()).back());
-
-  SequentialHloOrdering ordering(module.get(), sequence);
-  EXPECT_TRUE(ordering.ExecutesBefore(add, negate));
-}
-
 TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
   // Tests the ordering of instructions in different computations using the
   // following HLO code:
@@ -362,5 +315,66 @@ ENTRY while.v11 {
   ordering.ToString();  // Shouldn't crash.
 }
 
+TEST_F(HloOrderingTest, ConditionalInstructionOrdering) {
+  const char* module_str = R"(
+HloModule test_conditional_module
+
+true_branch {
+  param.1 = (s32[], s32[]) parameter(0)
+  get-tuple-element.1 = s32[] get-tuple-element(param.1), index=0
+  get-tuple-element.2 = s32[] get-tuple-element(param.1), index=1
+  add.1 = s32[] add(get-tuple-element.1, get-tuple-element.2)
+  ROOT tuple.1 = (s32[], s32[]) tuple(add.1, get-tuple-element.1)
+}
+
+false_branch {
+  param.2 = (s32[], s32[]) parameter(0)
+  get-tuple-element.3 = s32[] get-tuple-element(param.2), index=0
+  get-tuple-element.4 = s32[] get-tuple-element(param.2), index=1
+  add.2 = s32[] add(get-tuple-element.3, get-tuple-element.4)
+  ROOT tuple.2 = (s32[], s32[]) tuple(add.2, get-tuple-element.4)
+}
+
+ENTRY root {
+  param.3 = (pred[], (s32[], s32[])) parameter(0)
+  pred.1 = pred[] get-tuple-element(param.3), index=0
+  cond_arg.1 = (s32[], s32[]) get-tuple-element(param.3), index=1
+  conditional = (s32[], s32[]) conditional(pred.1, cond_arg.1, cond_arg.1), true_computation=true_branch, false_computation=false_branch
+  cond_res.1 = s32[] get-tuple-element(conditional), index=0
+  cond_res.2 = s32[] get-tuple-element(conditional), index=1
+  add.3 = s32[] add(cond_res.1, cond_res.2)
+  ROOT result = (s32[], s32[], s32[]) tuple(add.3, cond_res.1, cond_res.2)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(auto dataflow,
+                          HloDataflowAnalysis::Run(*module, /*ssa_form=*/true));
+  DependencyHloOrdering ordering(module.get());
+
+  // Even though the true and false branches has no ordering, since they do not
+  // interfere (as they are mutually exclusive), we define the true computation
+  // to be before the false one.
+  // Similarly, any instruction in the true or false branches are considered
+  // before the conditional instruction. The roots are effectively "at the same
+  // time" WRT the conditional, but they are Phi-ed anyway.
+  HloInstruction* add_1 = FindInstruction(module.get(), "add.1");
+  HloInstruction* add_2 = FindInstruction(module.get(), "add.2");
+  HloInstruction* add_3 = FindInstruction(module.get(), "add.3");
+  HloInstruction* conditional = FindInstruction(module.get(), "conditional");
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_1),
+                                       dataflow->GetValueDefinedAt(add_2)));
+  EXPECT_TRUE(
+      ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_2),
+                               dataflow->GetValueDefinedAt(conditional)));
+  EXPECT_TRUE(
+      ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_1),
+                               dataflow->GetValueDefinedAt(conditional)));
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_1),
+                                       dataflow->GetValueDefinedAt(add_3)));
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_2),
+                                       dataflow->GetValueDefinedAt(add_3)));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc
index 78e6a101c10a1e812e3e2631d520139fd0bc425c..3460679558d185d1e022660d9a1d23176d0d96bf 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc
@@ -15,6 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 
+#include <string>
+
+#include "tensorflow/compiler/xla/util.h"
+
 namespace xla {
 
 HloProto MakeHloProto(const HloModule& module,
@@ -35,4 +39,35 @@ HloProto MakeHloProto(const HloModule& module) {
   return proto;
 }
 
+StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
+    const HloProto& hlo_proto) {
+  if (!hlo_proto.has_hlo_module()) {
+    return NotFound("HloProto missing HloModuleProto.");
+  }
+  if (!hlo_proto.hlo_module().has_program_shape()) {
+    return NotFound("HloProto missing program shape.");
+  }
+
+  std::vector<const Shape*> parameter_shapes;
+  const auto& program_shape = hlo_proto.hlo_module().program_shape();
+  for (const Shape& shape : program_shape.parameters()) {
+    parameter_shapes.push_back(&shape);
+  }
+  return parameter_shapes;
+}
+
+StatusOr<const Shape*> EntryComputationOutputShape(const HloProto& hlo_proto) {
+  if (!hlo_proto.has_hlo_module()) {
+    return NotFound("HloProto missing HloModuleProto.");
+  }
+  if (!hlo_proto.hlo_module().has_program_shape()) {
+    return NotFound("HloProto missing program shape.");
+  }
+  if (!hlo_proto.hlo_module().program_shape().has_result()) {
+    return NotFound("HloProto missing result in its program shape");
+  }
+
+  return &hlo_proto.hlo_module().program_shape().result();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.h b/tensorflow/compiler/xla/service/hlo_proto_util.h
index 320288fdb9aa0810b306b1d78bd1ff4cfc366ed2..3d9c375cd5d26f92cf8316f78789daf4fc08c927 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.h
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.h
@@ -35,6 +35,15 @@ HloProto MakeHloProto(const HloModule& module,
 // will not be included in the output.
 HloProto MakeHloProto(const HloModule& module);
 
+// Returns the shapes of the parameters of the entry computation. Shape pointers
+// refer to shapes inside of the given HloProto.
+StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
+    const HloProto& hlo_proto);
+
+// Returns the shape of the output of the entry computation. The shape pointer
+// refers to the output shape inside of the given HloProto.
+StatusOr<const Shape*> EntryComputationOutputShape(const HloProto& hlo_proto);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROTO_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util_test.cc b/tensorflow/compiler/xla/service/hlo_proto_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9cca138703c8fa61aadf69dd7304a215a9f4be2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_proto_util_test.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
+
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace {
+
+class HloProtoUtilTest : public ::testing::Test {};
+
+TEST_F(HloProtoUtilTest, ParamsAndOutputShapeMissingModule) {
+  HloProto hlo_proto;
+
+  auto status = EntryComputationParameterShapes(hlo_proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              ::testing::HasSubstr("missing HloModuleProto"));
+}
+
+TEST_F(HloProtoUtilTest, MissingProgramShape) {
+  HloProto hlo_proto;
+  HloModuleProto* module = hlo_proto.mutable_hlo_module();
+  module->set_name("entry");
+
+  auto status = EntryComputationParameterShapes(hlo_proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              ::testing::HasSubstr("missing program shape"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 98b8d34be1f331aaeac94e952deeae1e76379861..b0632448933df4b7681a0704c58d697b5ec68a1f 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -1320,7 +1320,7 @@ StatusOr<bool> HloRematerialization::Run(
 /* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
     const HloRematerialization::ShapeSizeFunction& size_function,
     int64 memory_limit_bytes, HloModule* hlo_module,
-    SchedulerAlgorithm scheduler_algorithm,
+    MemorySchedulerAlgorithm scheduler_algorithm,
     SequentialHloOrdering::HloModuleSequence* sequence,
     RematerializationSizes* sizes) {
   HloRematerialization remat(scheduler_algorithm, size_function);
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 52553439033a3bcfa4b472f13f9cd4b1ecf5ed96..2ee2dd0571ae8c6604e4ca722351fd48a913bda5 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -66,12 +66,12 @@ class HloRematerialization {
   // code generation.
   static StatusOr<bool> RematerializeAndSchedule(
       const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
-      HloModule* hlo_module, SchedulerAlgorithm scheduler_algorithm,
+      HloModule* hlo_module, MemorySchedulerAlgorithm scheduler_algorithm,
       SequentialHloOrdering::HloModuleSequence* sequence,
       RematerializationSizes* sizes = nullptr);
 
  protected:
-  HloRematerialization(SchedulerAlgorithm scheduler_algorithm,
+  HloRematerialization(MemorySchedulerAlgorithm scheduler_algorithm,
                        const ShapeSizeFunction& size_function)
       : scheduler_algorithm_(scheduler_algorithm),
         size_function_(size_function) {}
@@ -108,7 +108,7 @@ class HloRematerialization {
       const HloInstruction* instruction) const;
 
   // Selects an algorithm to use for HLO scheduling.
-  SchedulerAlgorithm scheduler_algorithm_;
+  MemorySchedulerAlgorithm scheduler_algorithm_;
 
   // Function which computes the size of the top-level buffer of a shape.
   const ShapeSizeFunction size_function_;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 1b7d26dde501a6a0955d62ea0938e0683a32d49d..83de54f3fa56ee660b79d8c366dbc0b52f9fde87 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -162,7 +162,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/14 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -195,7 +195,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/20 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
@@ -236,7 +236,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/17 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
@@ -272,7 +272,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/15 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // Both computations should have a rematerialized instruction added.
@@ -314,7 +314,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/13 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // All computations should have a rematerialized instruction added.
@@ -385,7 +385,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
       bool changed, HloRematerialization::RematerializeAndSchedule(
                         ByteSizeOf,
                         /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_),
-                        module.get(), SchedulerAlgorithm::kAuto, &sequence));
+                        module.get(), DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
   // The rng should not have been rematerialized.
   EXPECT_EQ(count_rngs(entry_computation), 1);
@@ -480,7 +480,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/22 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -577,7 +577,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/22 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 41b079eb799d06321a31f7d7ae0630dc8d58c46b..48da1a505c9bea72378aaba7824548cca0eef447 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -16,27 +16,20 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
 
-#include <set>
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
@@ -52,10 +45,9 @@ namespace {
 // Creates an HloModule from the given proto.
 StatusOr<std::unique_ptr<HloModule>> HloProtoToModule(
     const HloProto& proto, const DebugOptions& debug_options) {
-  TF_ASSIGN_OR_RETURN(
-      HloModuleConfig config,
-      HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
-  config.set_debug_options(debug_options);
+  TF_ASSIGN_OR_RETURN(HloModuleConfig config,
+                      HloModule::CreateModuleConfigFromProto(proto.hlo_module(),
+                                                             debug_options));
   TF_ASSIGN_OR_RETURN(auto module,
                       HloModule::CreateFromProto(proto.hlo_module(), config));
   return std::move(module);
@@ -92,15 +84,6 @@ HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
   return tools::Parse(hlo_string, config);
 }
 
-// Define this in .cc file to avoid having to include eigen or forward declare
-// these types in the header.
-struct HloRunner::EigenThreadPoolWrapper {
-  std::unique_ptr<EigenThreadPoolWrapper> pool;
-  std::unique_ptr<Eigen::ThreadPoolDevice> device;
-};
-
-HloRunner::HloRunner() {}
-
 HloRunner::HloRunner(se::Platform* platform) {
   BackendOptions backend_options;
   backend_options.set_platform(platform);
@@ -110,65 +93,45 @@ HloRunner::HloRunner(se::Platform* platform) {
 
 HloRunner::~HloRunner() {}
 
-StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteInternal(
+StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
     const tensorflow::gtl::ArraySlice<Literal*> arguments,
     bool run_hlo_passes) {
-  if (run_hlo_passes) {
-    TF_ASSIGN_OR_RETURN(
-        module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor(),
-                    /*device_allocator=*/nullptr));
-  }
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      backend().compiler()->RunBackend(std::move(module),
-                                       backend().default_stream_executor(),
-                                       /*device_allocator=*/nullptr));
-
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      CreateExecutable(std::move(module), run_hlo_passes));
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
 
-  ExecutableRunOptions run_options;
-  run_options.set_device_ordinal(backend().default_device_ordinal());
-  run_options.set_stream(&stream);
-  run_options.set_allocator(backend().memory_allocator());
-  run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
-  run_options.set_intra_op_thread_pool(
-      backend().eigen_intra_op_thread_pool_device());
-
-  ServiceExecutableRunOptions service_run_options(
-      run_options, backend().StreamBorrower(),
-      backend().inter_op_thread_pool());
+  ServiceExecutableRunOptions service_run_options(GetServiceRunOptionsForDevice(
+      backend().default_device_ordinal(), &stream, nullptr));
+  const ExecutableRunOptions& run_options = service_run_options.run_options();
 
   // Copy arguments to device.
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
-  std::vector<ShapedBuffer*> argument_buffer_ptrs;
+  std::vector<ScopedShapedBuffer> argument_buffers;
   for (Literal* argument : arguments) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+        ScopedShapedBuffer argument_buffer,
         backend().transfer_manager()->AllocateScopedShapedBuffer(
             argument->shape(), run_options.allocator(),
             run_options.device_ordinal()));
     TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-        stream.parent(), *argument, *argument_buffer));
+        stream.parent(), *argument, argument_buffer));
     argument_buffers.push_back(std::move(argument_buffer));
-    argument_buffer_ptrs.push_back(argument_buffers.back().get());
   }
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
-      executable->ExecuteOnStream(&service_run_options, argument_buffer_ptrs,
-                                  /*hlo_execution_profile=*/nullptr));
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
+  argument_buffer_ptrs.reserve(argument_buffers.size());
+  for (const auto& buf : argument_buffers) {
+    argument_buffer_ptrs.push_back(&buf);
+  }
 
-  // Create a ScopedShapedBuffer of the result to manage deallocation. This will
-  // deallocate all the device memory when it goes out of scope.
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ScopedShapedBuffer> scoped_result,
-      ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()));
+      ScopedShapedBuffer result,
+      executable->ExecuteOnStreamWrapper(
+          &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs));
 
   auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice(
-      stream.parent(), *scoped_result);
+      stream.parent(), result);
   if (result_literal.ok()) {
     VLOG(4) << "Executed binary and got result: "
             << result_literal.ValueOrDie()->ToString();
@@ -179,10 +142,156 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteInternal(
   return result_literal;
 }
 
+StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
+    std::unique_ptr<HloModule> module,
+    const ReplicatedExecuteOptions& options) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      CreateExecutable(std::move(module), options.run_hlo_passes));
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+  std::vector<std::unique_ptr<se::Stream>> streams;
+  std::vector<ServiceExecutableRunOptions> service_run_options;
+
+  std::vector<ScopedShapedBuffer> argument_buffers;
+  // This reserve() call is necessary for correctness, because
+  // argument_buffer_ptrs contains pointers into the elements of
+  // argument_buffers.
+  argument_buffers.reserve(options.num_replicas * options.arguments.size());
+
+  // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are
+  // no arguments.
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs(
+      options.num_replicas * options.arguments.size() + 1);
+  std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
+      argument_buffer_slices;
+  int64 index = 0;
+  for (int64 i = 0; i < options.num_replicas; ++i) {
+    int64 device = device_assignment(i, 0);
+    TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                        backend().stream_executor(device));
+    streams.push_back(absl::make_unique<se::Stream>(executor));
+    streams.back()->Init();
+    service_run_options.emplace_back(GetServiceRunOptionsForDevice(
+        device, streams.back().get(), &device_assignment));
+
+    // Copy arguments to device.
+    for (const Literal* argument : options.arguments) {
+      TF_ASSIGN_OR_RETURN(
+          ScopedShapedBuffer argument_buffer,
+          backend().transfer_manager()->AllocateScopedShapedBuffer(
+              argument->shape(), backend().memory_allocator(), device));
+      TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+          executor, *argument, argument_buffer));
+      argument_buffers.push_back(std::move(argument_buffer));
+      argument_buffer_ptrs[index++] = &argument_buffers.back();
+    }
+    argument_buffer_slices.emplace_back(
+        &argument_buffer_ptrs[index - options.arguments.size()],
+        options.arguments.size());
+  }
+
+  std::unique_ptr<tensorflow::thread::ThreadPool> pool;
+  int64 num_threads = (options.infeed != nullptr) ? options.num_replicas : 0;
+  if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
+    num_threads += options.num_replicas;
+  }
+  if (num_threads > 0) {
+    pool = absl::make_unique<tensorflow::thread::ThreadPool>(
+        tensorflow::Env::Default(), "infeed_outfeed",
+        /*num_threads=*/num_threads);
+  }
+  if (options.infeed != nullptr) {
+    for (int64 i = 0; i < options.num_replicas; ++i) {
+      int64 device = device_assignment(i, 0);
+      pool->Schedule([this, device, &options]() {
+        se::StreamExecutor* executor =
+            backend().stream_executor(device).ValueOrDie();
+        VLOG(1) << "Starting infeed on device " << device;
+        for (int64 step = 1;
+             options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
+          TF_CHECK_OK(backend().transfer_manager()->TransferLiteralToInfeed(
+              executor, *options.infeed));
+          if (step % 100 == 0) {
+            VLOG(1) << "Infeed step " << step;
+          }
+        }
+      });
+    }
+  }
+  if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
+    for (int64 i = 0; i < options.num_replicas; ++i) {
+      int64 device = device_assignment(i, 0);
+      pool->Schedule([this, device, &options]() {
+        se::StreamExecutor* executor =
+            backend().stream_executor(device).ValueOrDie();
+        VLOG(1) << "Starting outfeed on device " << device;
+        for (int64 step = 1;
+             options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
+          auto literal = absl::make_unique<Literal>();
+          TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
+              executor, options.outfeed_shape, literal.get()));
+          if (options.outfeed_values != nullptr) {
+            options.outfeed_values->push_back(std::move(literal));
+          }
+          if (step % 100 == 0) {
+            VLOG(1) << "Outfeed step " << step;
+          }
+        }
+      });
+    }
+  }
+
+  LOG(INFO) << "Replicated execution started";
+  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> results,
+                      executable->ExecuteOnStreams(service_run_options,
+                                                   argument_buffer_slices));
+  LOG(INFO) << "Replicated execution terminated";
+
+  std::vector<std::unique_ptr<Literal>> exec_results;
+  for (int64 i = 0; i < options.num_replicas; ++i) {
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                        backend().transfer_manager()->TransferLiteralFromDevice(
+                            streams[i]->parent(), results[i]));
+    exec_results.push_back(std::move(literal));
+  }
+  return std::move(exec_results);
+}
+
+StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
+    std::unique_ptr<HloModule> module, bool run_hlo_passes) {
+  if (run_hlo_passes) {
+    TF_ASSIGN_OR_RETURN(
+        module, backend().compiler()->RunHloPasses(
+                    std::move(module), backend().default_stream_executor(),
+                    backend().memory_allocator()));
+  }
+  return backend().compiler()->RunBackend(std::move(module),
+                                          backend().default_stream_executor(),
+                                          backend().memory_allocator());
+}
+
+ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice(
+    int64 device, se::Stream* stream, DeviceAssignment* device_assignment) {
+  ExecutableRunOptions run_options;
+  run_options.set_device_ordinal(device);
+  run_options.set_stream(stream);
+  run_options.set_allocator(backend().memory_allocator());
+  run_options.set_intra_op_thread_pool(
+      backend().eigen_intra_op_thread_pool_device());
+  if (device_assignment != nullptr) {
+    run_options.set_device_assignment(device_assignment);
+  }
+  return ServiceExecutableRunOptions(
+      run_options, backend().StreamBorrower(),
+      /*xla_intra_op_thread_pool=*/backend().eigen_intra_op_thread_pool());
+}
+
 Backend& HloRunner::backend() {
   if (!backend_) {
     backend_ = Backend::CreateDefaultBackend().ConsumeValueOrDie();
-    VLOG(1) << "executing on platform " << backend().platform()->Name();
+    VLOG(1) << "Executing on platform " << backend().platform()->Name();
   }
   return *backend_;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index cbaebc68bee708090b8ccb2eae19b556c4d6d453..53f7c6fe4a09111c5ee24f2290f0f4aeed0a4401 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -16,17 +16,22 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
 
+#include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -39,9 +44,43 @@ namespace xla {
 // file), or parsed from a hlo textual IR string.
 class HloRunner {
  public:
-  HloRunner();
-
-  HloRunner(::perftools::gputools::Platform* platform);
+  // The options used to configure a ExecuteReplicated() call.
+  struct ReplicatedExecuteOptions {
+    // The number of devices the HLO module should be replicated onto.
+    int64 num_replicas = 1;
+
+    // The arguments to be fed to each replica. Since this is used for a
+    // replicated execution, all the arguments are the same for all replicas.
+    std::vector<const Literal*> arguments;
+
+    // If the HLO module being run has an infeed instruction, this will be the
+    // data which will be fed to it, for as many as infeed_steps steps.
+    const Literal* infeed = nullptr;
+
+    // The number of times the infeed literal should be fed to the HLO module.
+    // For a clean exit, this should match the iterations-per-loop parameter
+    // used when generating the HLO module proto (that is usually the main
+    // while bounary counter). A value higher then iterations-per-loop would
+    // lead to infeed threads feeding to a gone computation, while a lower
+    // value would trigger a stuck ExecuteReplicated() call (the computation
+    // will be trying to infeed data which will never come).
+    int64 infeed_steps = -1;
+
+    // The shape of the outfeed operation. If empty, the HLO module does not
+    // generate any outfeed.
+    Shape outfeed_shape;
+
+    // A pointer to a vector where the outfeed values will be stored. If
+    // nullptr, the values will be read and discarded.
+    std::vector<std::unique_ptr<Literal>>* outfeed_values = nullptr;
+
+    // Whether the HLO passes should be run on the input module. Usually
+    // saved modules are coming from after the HLO pass pipeline, so triggering
+    // another run will likely cause errors.
+    bool run_hlo_passes = false;
+  };
+
+  explicit HloRunner(se::Platform* platform);
 
   ~HloRunner();
 
@@ -64,17 +103,34 @@ class HloRunner {
       const std::string& filename, const DebugOptions& debug_options);
 
   // Executes the given module with given literals as input and returns the
-  // result as a Literal. The LiteralPtr type accepts Literal* or
-  // std::unique_ptr<Literal>.
+  // result as a Literal.
   //
   // If run_hlo_passes is false, the module will be executed without Hlo
   // optimization.
-  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::ArraySlice<Literal*> arguments,
       bool run_hlo_passes = true);
 
+  StatusOr<std::unique_ptr<Literal>> Execute(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arguments,
+      bool run_hlo_passes = true) {
+    // Construct a vector of plain pointers for the arguments.
+    std::vector<Literal*> argument_pointers;
+    c_transform(
+        arguments, std::back_inserter(argument_pointers),
+        [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
+    return Execute(std::move(module), argument_pointers, run_hlo_passes);
+  }
+
+  // Executes a given HLO module into a set of replicas, and returns a map
+  // with the replica number as key, and the corresponding returned literal as
+  // value.
+  StatusOr<std::vector<std::unique_ptr<Literal>>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module,
+      const ReplicatedExecuteOptions& options);
+
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
   //
@@ -83,31 +139,21 @@ class HloRunner {
   Backend& backend();
 
  private:
-  StatusOr<std::unique_ptr<Literal>> ExecuteInternal(
-      std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<Literal*> arguments,
-      bool run_hlo_passes = true);
-
-  struct EigenThreadPoolWrapper;
-
-  std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
+  // Creates an executable object given an HLO module. If run_hlo_passes is
+  // true, the HLO passes will be run before.
+  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
+      std::unique_ptr<HloModule> module, bool run_hlo_passes);
+
+  // Creates a ServiceExecutableRunOptions object to configure a run on device,
+  // using the provided stream object. If device_assignment is not nullptr, it
+  // will be used to configure the replication parameters. Replicated executions
+  // should pass the device_assignment parameter.
+  ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
+      int64 device, se::Stream* stream, DeviceAssignment* device_assignment);
 
   std::unique_ptr<Backend> backend_;
 };
 
-template <typename LiteralPtr>
-StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
-    std::unique_ptr<HloModule> module,
-    const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
-    bool run_hlo_passes) {
-  // Construct a vector of plain pointers for the arguments.
-  std::vector<Literal*> argument_pointers;
-  for (const auto& argument : arguments) {
-    argument_pointers.push_back(&*argument);
-  }
-  return ExecuteInternal(std::move(module), argument_pointers, run_hlo_passes);
-}
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index f6e33403f538bd8492b04c34d46a458f7f06cc06..1a767628f6e2d33df353366974fb866e89f0df5a 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -103,10 +103,11 @@ class ListScheduler {
     for (auto* instruction : computation.instructions()) {
       tensorflow::gtl::FlatSet<const LogicalBuffer*> instr_uses;
       for (auto* operand : instruction->operands()) {
-        for (const LogicalBuffer* buffer :
-             points_to_analysis.GetBuffersDefinedByInstruction(operand)) {
-          instr_uses.insert(buffer);
-        }
+        points_to_analysis.GetPointsToSet(operand).ForEachElement(
+            [&](const ShapeIndex& /*index*/,
+                const PointsToSet::BufferList& buffers) {
+              instr_uses.insert(buffers.begin(), buffers.end());
+            });
       }
       buffer_uses_[instruction] = std::vector<const LogicalBuffer*>(
           instr_uses.begin(), instr_uses.end());
@@ -339,7 +340,33 @@ int64 SumLogicalBufferSizes(
   return size;
 }
 
-StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
+StatusOr<int64> MinimumMemoryForComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
+                         sequence, points_to_analysis, size_function));
+  return result.heap_size;
+}
+
+StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function,
+    const MemorySchedulerAlgorithm& algorithm) {
+  VLOG(2) << "Computation: " << computation.name();
+  if (algorithm) {
+    return algorithm(computation, points_to_analysis, size_function);
+  }
+  return DefaultMemoryScheduler(computation, points_to_analysis, size_function);
+}
+
+}  // namespace
+
+StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function) {
@@ -348,6 +375,7 @@ StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
   // simply users-1 for each instruction.  By subtracting 1, we're saying that
   // instructions with no users or a single user don't count; instructions with
   // lots of fan-out will be visited earlier.
+  int64 cumulative_total_size = 0;
   tensorflow::gtl::FlatMap<const HloInstruction*, int64> extra_users;
   tensorflow::gtl::FlatMap<const HloInstruction*, int64> total_sizes;
   for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
@@ -357,14 +385,17 @@ StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
       continue;
     }
     extra_users[hlo] = hlo->users().empty() ? 0 : hlo->users().size() - 1;
-    total_sizes[hlo] = SumLogicalBufferSizes(
+    int64 logical_buffer_size = SumLogicalBufferSizes(
         points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function);
+    total_sizes[hlo] = logical_buffer_size;
+    cumulative_total_size += logical_buffer_size;
     tensorflow::gtl::FlatSet<const HloInstruction*> unique_operands(
         hlo->operands().begin(), hlo->operands().end());
     for (const HloInstruction* operand : unique_operands) {
       extra_users[hlo] += extra_users[operand];
       total_sizes[hlo] += total_sizes[operand];
     }
+    total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size);
   }
   CHECK_EQ(extra_users.size(), computation.instruction_count());
   CHECK_EQ(total_sizes.size(), computation.instruction_count());
@@ -392,32 +423,17 @@ StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
   return sequence;
 }
 
-StatusOr<int64> MinimumMemoryForComputation(
+StatusOr<std::vector<const HloInstruction*>> ListMemoryScheduler(
     const HloComputation& computation,
-    const std::vector<const HloInstruction*>& sequence,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function) {
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
-                         sequence, points_to_analysis, size_function));
-  return result.heap_size;
+  return ListScheduler::Run(computation, points_to_analysis, size_function);
 }
 
-StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm) {
-  VLOG(2) << "Computation: " << computation.name();
-  if (algorithm == SchedulerAlgorithm::kListSchedule) {
-    return ListScheduler::Run(computation, points_to_analysis, size_function);
-  }
-  if (algorithm == SchedulerAlgorithm::kDfsSchedule) {
-    return RunDFSMemoryScheduler(computation, points_to_analysis,
-                                 size_function);
-  }
-
+    const LogicalBuffer::SizeFunction& size_function) {
   // We try both a list-scheduler based ordering and a DFS based ordering, and
   // choose whichever returns a lower min-memory, not accounting for
   // fragmentation.
@@ -427,7 +443,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   // within the caller's context. But it's good enough for now.
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
-      ListScheduler::Run(computation, points_to_analysis, size_function));
+      ListMemoryScheduler(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 list_memory,
       MinimumMemoryForComputation(computation, list_sequence,
@@ -436,7 +452,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
 
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> dfs_sequence,
-      RunDFSMemoryScheduler(computation, points_to_analysis, size_function));
+      DFSMemoryScheduler(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 dfs_memory,
       MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
@@ -454,12 +470,10 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   }
 }
 
-}  // namespace
-
 StatusOr<SequentialHloOrdering::HloModuleSequence>
 CreateMemoryMinimizingSequence(const HloModule& module,
                                const LogicalBuffer::SizeFunction& size_function,
-                               SchedulerAlgorithm algorithm) {
+                               const MemorySchedulerAlgorithm& algorithm) {
   SequentialHloOrdering::HloModuleSequence sequence;
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(&module));
@@ -475,7 +489,7 @@ CreateMemoryMinimizingSequence(const HloModule& module,
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm) {
+    const MemorySchedulerAlgorithm& algorithm) {
   CHECK(!computation.IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(computation.parent()));
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
index 1d1eb1e064f75c2220b39e84b010e720a0c37880..068e68383deb170ded1c9b09a8b7ceb8c4c0ab4b 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.h
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 
@@ -33,28 +34,48 @@ StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function);
 
-enum class SchedulerAlgorithm {
-  kListSchedule,
-  kDfsSchedule,
+// A memory scheduler computes an execution sequence for the HLO instructions in
+// 'computation' that minimizes peak memory, given a points-to analysis result
+// that describes buffer aliasing, together with a target-specific size function
+// that maps a tensor's logical size to its padded size.
+typedef std::function<StatusOr<std::vector<const HloInstruction*>>(
+    const HloComputation&, const TuplePointsToAnalysis&,
+    const LogicalBuffer::SizeFunction&)>
+    MemorySchedulerAlgorithm;
 
-  // Selects the available scheduler algorithm that had the minimum memory in
-  // the resulting sequence (a la MinimumMemoryForSequence).
-  kAuto,
-};
+// List scheduler
+StatusOr<std::vector<const HloInstruction*>> ListMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
+
+// DFS-order scheduler
+StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
+
+// The default scheduling algorithm. Runs both the list scheduler
+// and the DFS scheduler, and chooses whichever returns a lower min-memory,
+// not accounting for fragmentation.
+StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
 
 // Returns an HloModuleSequence which seeks to minimize the memory required for
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<SequentialHloOrdering::HloModuleSequence>
-CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
+CreateMemoryMinimizingSequence(const HloModule& module,
+                               const LogicalBuffer::SizeFunction& size_function,
+                               const MemorySchedulerAlgorithm& algorithm = {});
 
 // Overload of above that computes the sequence for a single computation.
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
+    const MemorySchedulerAlgorithm& algorithm = {});
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
index 7fb338e7042ce19ac9647e23719e738f3ef42c7c..74544c4a67a819d341056aba4cf6b321a5a86c0a 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -89,5 +90,105 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
             MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie());
 }
 
+class HloSchedulingTest : public HloTestBase {};
+
+TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
+  // Tests scheduling of the following HLO code:
+  //
+  //   %ab = abs(%param)
+  //   %exp = exp(%param)
+  //   %add = add(%ab, %exp)
+  //   %negate = negate(%exp)
+  //   %sub = subtract(%add, %negate)
+  //
+  // %add should be scheduled before %negate because %add is the last (and only)
+  // use of %ab. Scheduling %add first then frees up %ab's buffer.
+  const Shape vec = ShapeUtil::MakeShape(xla::F32, {42});
+  auto builder = HloComputation::Builder(TestName());
+  auto param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, vec, "param"));
+  auto ab = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec, HloOpcode::kAbs, param));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec, HloOpcode::kExp, param));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec, HloOpcode::kAdd, ab, exp));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec, HloOpcode::kNegate, exp));
+  auto sub = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape());
+      }));
+  // Verify that all instructions are in the sequence.
+  EXPECT_EQ(module->entry_computation()->instruction_count(),
+            sequence.at(module->entry_computation()).size());
+
+  // The first instruction should be the parameter and the last the root "sub".
+  EXPECT_EQ(param, sequence.at(module->entry_computation()).front());
+  EXPECT_EQ(sub, sequence.at(module->entry_computation()).back());
+
+  SequentialHloOrdering ordering(module.get(), sequence);
+  EXPECT_TRUE(ordering.ExecutesBefore(add, negate));
+}
+
+TEST_F(HloSchedulingTest, ListSchedulerHandlesAliasing) {
+  const char* module_str = R"(
+HloModule test_aliasing_module
+
+ENTRY root {
+  param = s32[1000] parameter(0)
+  p0 = s32[1000] copy(param)
+  p1 = s32[1000] copy(param)
+  t = (s32[1000], s32[1000]) tuple(p0, p1)
+  a = s32[1000] get-tuple-element(t), index=0
+  b = s32[1000] get-tuple-element(t), index=1
+  c = s32[1000] add(a, b)
+  d = s32[1000] add(c, b)
+  e = s32[1000] add(c, c)
+  f = s32[1000] add(e, e)
+  ROOT result = (s32[1000], s32[1000], s32[1000]) tuple(d, e, f)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+
+  auto size_fn = [](const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  };
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      CreateMemoryMinimizingSequence(*module, size_fn, ListMemoryScheduler));
+  // Verify that all instructions are in the sequence.
+  EXPECT_EQ(module->entry_computation()->instruction_count(),
+            sequence.at(module->entry_computation()).size());
+
+  std::unordered_map<string, const HloInstruction*> instructions_by_name;
+  for (const HloInstruction* instruction :
+       sequence.at(module->entry_computation())) {
+    instructions_by_name[instruction->name()] = instruction;
+  }
+
+  // The first instruction should be the parameter and the last the root.
+  EXPECT_EQ(instructions_by_name.at("param"),
+            sequence.at(module->entry_computation()).front());
+  EXPECT_EQ(instructions_by_name.at("result"),
+            sequence.at(module->entry_computation()).back());
+
+  // Instructions "d" and "e" will both be schedulable at the same time, but
+  // instruction "d" allows us to free the buffer of "p1", so the list scheduler
+  // should prefer it.
+  SequentialHloOrdering ordering(module.get(), sequence);
+  EXPECT_TRUE(ordering.ExecutesBefore(instructions_by_name.at("d"),
+                                      instructions_by_name.at("e")));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 447c2446668253c932b44b51b2db22bfd47f9957..994de441237493b5e2254a0a66763d6195c5ea85 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::str_util::Join;
 using ::tensorflow::strings::StrCat;
 
 HloSharding HloSharding::AssignDevice(int64 device_id) {
@@ -57,8 +58,9 @@ string HloSharding::ToString() const {
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
   } else {
-    return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ",
-                  "devices=", VectorString(tile_assignment_), "}");
+    return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ", "devices=[",
+                  Join(tile_assignment_.dimensions(), ","), "]",
+                  Join(tile_assignment_, ","), "}");
   }
 }
 
@@ -183,6 +185,10 @@ Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const {
   // shape tree.
   ShapeTree<HloSharding> shape_tree = GetAsShapeTree(shape);
   for (const auto& index_to_sharding : shape_tree.leaves()) {
+    if (index_to_sharding.first.empty()) {
+      // An empty tuple has a ShapeTree with a single leaf at the empty index.
+      continue;
+    }
     Status status = index_to_sharding.second.ValidateNonTuple(
         ShapeUtil::GetSubshape(shape, index_to_sharding.first), num_devices);
     if (!status.ok()) {
@@ -222,7 +228,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   Status status = Status::OK();
   std::set<int64> seen_cores;
   tile_assignment_.Each(
-      [&](tensorflow::gtl::ArraySlice<int64> indices, uint32 core) {
+      [&](tensorflow::gtl::ArraySlice<int64> indices, int32 core) {
         // Don't overwrite a bad status, so we report the first error.
         if (status.ok()) {
           if (core >= num_devices) {
@@ -250,37 +256,24 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
         ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
-  // The tile shape must not be the same as the input shape without maximal_
-  // also set. If this is the case, we're not actually sharded and the correct
-  // constructor should have been used.
-  if (ShapeUtil::Equal(shape, tile_shape_)) {
+  // The correct constructor have to be used to create tile maximal shardings.
+  if (tile_assignment_.num_elements() == 1) {
     return tensorflow::errors::InvalidArgument(
-        "Tile shape is the same as the input shape. If a replicated sharding "
-        "was intended, use HloSharding::Replicated(). If a device placement "
-        "was intended, use HloSharding::AssignDevice()");
-  }
-
-  // The tile shape must not be greater than the input shape in any dimension.
-  for (int64 i = 0, e = ShapeUtil::Rank(shape); i != e; ++i) {
-    auto tile_dim = tile_shape_.dimensions(i);
-    auto shape_dim = shape.dimensions(i);
-    if (tile_dim > shape_dim) {
-      return tensorflow::errors::InvalidArgument(
-          StrCat("Tile is larger than input shape (dimension ", i, ", ",
-                 tile_dim, " > ", shape_dim));
-    }
+        "Tile assignment only contains a single device. If a replicated "
+        "sharding was intended, use HloSharding::Replicated(). If a device "
+        "placement was intended, use HloSharding::AssignDevice()");
   }
 
-  // The tile assignment tensor must be exactly dimensioned to ceil(shape[dim]
-  // tile[dim]) for every dimension contained within tile.
+  // The tile assignment tensor must contain enough element to cover the full
+  // shape with tiles of the specified size.
   for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) {
-    int64 expected_dim =
-        CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i));
-    if (tile_assignment_.dimensions()[i] != expected_dim) {
+    int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i);
+    if (shape.dimensions(i) > total_tile_size) {
       return tensorflow::errors::InvalidArgument(
-          StrCat("Tile assignment tensor has incorrect shape. Dimension ", i,
-                 " expected ", expected_dim, " but got ",
-                 tile_assignment_.dimensions()[i]));
+          StrCat("Tile assignment tensor has too few element to cover the full "
+                 "shape. Dimension ",
+                 i, ", shape ", shape.dimensions(i), ", total size ",
+                 total_tile_size));
     }
   }
 
@@ -344,4 +337,45 @@ OpSharding HloSharding::ToProto() const {
   return result;
 }
 
+HloSharding HloSharding::TransformShardedTileShape(
+    const Shape& new_shape,
+    const std::function<int64(int64, int64)>& transform) const {
+  CHECK(!IsTuple());
+  if (IsTileMaximal()) {
+    return *this;
+  }
+  CHECK_EQ(ShapeUtil::Rank(new_shape), ShapeUtil::Rank(tile_shape()));
+  Shape new_tile_shape;
+  new_tile_shape.set_element_type(tile_shape().element_type());
+  for (int64 i = 0; i < ShapeUtil::Rank(new_shape); ++i) {
+    int64 dim;
+    if (tile_assignment().dim(i) == 1) {
+      dim = new_shape.dimensions(i);
+    } else if (transform) {
+      dim = transform(i, tile_shape().dimensions(i));
+    } else {
+      dim = tile_shape().dimensions(i);
+    }
+    new_tile_shape.add_dimensions(dim);
+  }
+  TF_CHECK_OK(
+      LayoutUtil::CopyLayoutBetweenShapes(tile_shape_, &new_tile_shape));
+  return HloSharding::Tile(new_tile_shape, tile_assignment());
+}
+
+HloSharding HloSharding::GetSubSharding(const Shape& shape,
+                                        const ShapeIndex& index) const {
+  CHECK(IsTuple());
+
+  ShapeTree<HloSharding> sub_shape_tree(ShapeUtil::GetSubshape(shape, index),
+                                        Replicate());
+  sub_shape_tree.CopySubtreeFrom(GetAsShapeTree(shape), index, {});
+  return Tuple(sub_shape_tree);
+}
+
+std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) {
+  out << sharding.ToString();
+  return out;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 7263198385cf0c84b1dac1e15177dcac99adaafb..2b8e757f42991f697df37d3d34bfdff6a36bc509 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -94,6 +94,10 @@ class HloSharding {
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
+  // Checks whether device is a reserved device number. A reserved device number
+  // has usually a special meaning, with dedicated handling logic.
+  static bool IsReservedDevice(int64 device) { return device < 0; }
+
   OpSharding ToProto() const;
   string ToString() const;
 
@@ -171,9 +175,13 @@ class HloSharding {
     }
   }
 
+  // Retrieves the sub sharding at a given index, out of a tuple sharding.
+  // REQUIRES: IsTuple()
+  HloSharding GetSubSharding(const Shape& shape, const ShapeIndex& index) const;
+
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
-           protobuf_util::ProtobufEquals(tile_shape_, other.tile_shape_) &&
+           ShapeUtil::Compatible(tile_shape_, other.tile_shape_) &&
            tile_assignment_ == other.tile_assignment_ &&
            tuple_elements_ == other.tuple_elements_;
   }
@@ -207,6 +215,26 @@ class HloSharding {
   // REQUIRES: !IsReplicated() && !IsTuple()
   const Array<int64>& tile_assignment() const { return tile_assignment_; }
 
+  // Returns the flattened list of all the leaf shardings in a tuple shape, by
+  // pre-order walk (ShapeTree iterator order).
+  // REQUIRES: IsTuple().
+  const std::vector<HloSharding>& tuple_elements() const {
+    return tuple_elements_;
+  }
+
+  // Return a new sharding that can apply to the given new shape.
+  // If this sharding is tile-maximal, the returned sharding will be the same as
+  // this sharding. If this sharding is not tile-maximal, the returned
+  // sharding's tile size will differ:
+  //   - Non-sharded dimensions will be adapted to be the same as `new_shape`;
+  //     tile_dimension(i) = new_shape.dimensions(i);
+  //   - Sharded dimensions will be kept the same unless `transform` is supplied
+  //     in which case tile_dimension(i) = transform(i, tile_dimension(i));
+  // REQUIRES: !IsTuple().
+  HloSharding TransformShardedTileShape(
+      const Shape& new_shape,
+      const std::function<int64(int64, int64)>& transform = nullptr) const;
+
  private:
   HloSharding()
       : replicated_(true),
@@ -249,6 +277,8 @@ class HloSharding {
   std::vector<HloSharding> tuple_elements_;
 };
 
+std::ostream& operator<<(std::ostream& out, const HloSharding& sharding);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_H_
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 0c7487b3ac77ff181d44dd55ebcf2608feaf02ea..3bf0d25efb7fad78aeccdd9269c289950b2171ab 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -88,7 +88,7 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should pass.
+    // Test should fail because of more devices used then `num_device`.
     Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
     HloSharding sharding =
         HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
@@ -97,17 +97,8 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should fail due to the tile being larger than the input space.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
-    EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {2, 2}),
-                                       /*num_devices=*/4));
-  }
-
-  {
-    // Test should fail due to the tile not dividing the input space into 4
-    // sections (even with padding).
+    // Test should fail because the total tiled size in dimension 0 is 4 but we
+    // have 6 elements along that dimensions.
     Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
     HloSharding sharding =
         HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
@@ -269,5 +260,57 @@ TEST_F(HloShardingTest, Hash) {
   }
 }
 
+TEST_F(HloShardingTest, TransformShardedTileShapeTest) {
+  HloSharding sharding =
+      HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}),
+                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
+  HloSharding result = sharding.TransformShardedTileShape(
+      ShapeUtil::MakeShape(F32, {13, 15, 17, 19}),
+      [](int dim, int value) { return dim * 111; });
+  HloSharding expected =
+      HloSharding::Tile(ShapeUtil::MakeShape(F32, {13, 15, 222, 333}),
+                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
+  EXPECT_EQ(result, expected);
+}
+
+TEST_F(HloShardingTest, ToStringReplicatedTest) {
+  HloSharding sharding = HloSharding::Replicate();
+  EXPECT_EQ(sharding.ToString(), "{replicated}");
+}
+
+TEST_F(HloShardingTest, ToStringAssignDeviceTest) {
+  HloSharding sharding = HloSharding::AssignDevice(7);
+  EXPECT_EQ(sharding.ToString(), "{maximal device=7}");
+}
+
+TEST_F(HloShardingTest, ToStringTiledTest) {
+  HloSharding sharding =
+      HloSharding::Tile(ShapeUtil::MakeShape(S32, {7, 11, 13}),
+                        Array3D<int64>({{{2, 3}}, {{5, 7}}}));
+  EXPECT_EQ(sharding.ToString(), "{s32[7,11,13] devices=[2,1,2]2,3,5,7}");
+}
+
+TEST_F(HloShardingTest, ToStringTupleTest) {
+  HloSharding sharding = HloSharding::Tuple(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5}),
+                                 ShapeUtil::MakeShape(U32, {7, 25}),
+                                 ShapeUtil::MakeShape(S32, {9, 11})}),
+      {HloSharding::Replicate(),
+       HloSharding::Tile(ShapeUtil::MakeShape(U32, {7, 13}),
+                         Array2D<int64>({{3, 5}})),
+       HloSharding::AssignDevice(3)});
+  EXPECT_EQ(sharding.ToString(),
+            "{{replicated}, {u32[7,13] devices=[1,2]3,5}, {maximal device=3}}");
+}
+
+TEST_F(HloShardingTest, OstreamTest) {
+  HloSharding sharding =
+      HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}),
+                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
+  std::ostringstream oss;
+  oss << sharding;
+  EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index b1fd068115e1d104a11d880675ef84e07d6d5602..8a30cbf9cd622ffb64d345ddaf0dc88f34850bfc 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <set>
 
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -731,6 +732,73 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   return tensorflow::Status::OK();
 }
 
+Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
+  auto* while_cond = instruction->while_condition();
+  auto* while_body = instruction->while_body();
+  if (while_cond->num_parameters() != 1) {
+    return FailedPrecondition(
+        "While condition must have exactly 1 parameter; had %lld : %s",
+        while_cond->num_parameters(), while_cond->ToString().c_str());
+  }
+  if (while_body->num_parameters() != 1) {
+    return FailedPrecondition(
+        "While body must have exactly 1 parameter; had %lld : %s",
+        while_body->num_parameters(), while_body->ToString().c_str());
+  }
+  if (instruction->operand_count() != 1) {
+    return FailedPrecondition(
+        "While loop must have exactly one operand; had %lld : %s",
+        instruction->operand_count(), instruction->ToString().c_str());
+  }
+  auto* init = instruction->operand(0);
+  auto* cond_param = while_cond->parameter_instruction(0);
+  if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) {
+    return FailedPrecondition(
+        "While condition's parameter must have the same shape as the "
+        "loop's 'init'. init: %s, param: %s",
+        init->ToString().c_str(), cond_param->ToString().c_str());
+  }
+  auto* cond_root = while_cond->root_instruction();
+  if (!ShapeUtil::Compatible(cond_root->shape(),
+                             ShapeUtil::MakeShape(PRED, {}))) {
+    return FailedPrecondition("While condition should have shape PRED: %s",
+                              cond_root->ToString().c_str());
+  }
+  auto* body_param = while_body->parameter_instruction(0);
+  if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) {
+    return FailedPrecondition(
+        "While body's parameter must have the same shape as the loop's"
+        " 'init'. init: %s, param: %s",
+        init->ToString().c_str(), body_param->ToString().c_str());
+  }
+  auto* body_root = while_body->root_instruction();
+  if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) {
+    return FailedPrecondition(
+        "While body should have same shape as the loop's 'init'."
+        "init: %s, body: %s",
+        init->ToString().c_str(), body_root->ToString().c_str());
+  }
+  return tensorflow::Status::OK();
+}
+
+Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
+  const Shape& out_shape = instruction->shape();
+  for (HloInstruction* operand : instruction->operands()) {
+    const Shape& operand_shape = operand->shape();
+    if (!ShapeUtil::IsScalar(operand_shape) &&
+        !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) {
+      return FailedPrecondition(
+          "Implicit broadcast is not allowed in HLO."
+          "Found non-compatible shapes for instruction %s.\n"
+          "output: %s\noperand: %s\n",
+          HloOpcodeString(instruction->opcode()).c_str(),
+          ShapeUtil::HumanString(out_shape).c_str(),
+          ShapeUtil::HumanString(operand_shape).c_str());
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
 
@@ -762,45 +830,18 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
       } else if (instruction->opcode() == HloOpcode::kBroadcast) {
         // If you see this failure then someone has confused the difference
         // between the HLO broadcast op, and the UserComputation broadcast
-        // op.  See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
+        // op. See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
         // or ComputationLowerer::Visit()
         TF_RET_CHECK(instruction->dimensions().size() ==
                      ShapeUtil::Rank(instruction->operand(0)->shape()))
-            << "Broadcast HLO has invalid number of dimensions.";
+            << "Broadcast HLO (" << instruction->ToShortString()
+            << ") has invalid number of dimensions: "
+            << instruction->dimensions().size()
+            << " != " << ShapeUtil::Rank(instruction->operand(0)->shape());
       } else if (instruction->opcode() == HloOpcode::kWhile) {
-        auto* while_cond = instruction->while_condition();
-        auto* while_body = instruction->while_body();
-        TF_RET_CHECK(while_cond->num_parameters() == 1)
-            << "While condition must have exactly 1 parameter; had "
-            << while_cond->num_parameters() << ": " << while_cond->ToString();
-        TF_RET_CHECK(while_body->num_parameters() == 1)
-            << "While body must have exactly 1 parameter; had "
-            << while_body->num_parameters() << ": " << while_body->ToString();
-        TF_RET_CHECK(instruction->operand_count() == 1)
-            << "While loop must have exactly one operand; had "
-            << instruction->operand_count() << ": " << instruction->ToString();
-
-        auto* init = instruction->operand(0);
-        auto* cond_param = while_cond->parameter_instruction(0);
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), cond_param->shape()))
-            << "While condition's parameter must have the same shape as the "
-               "loop's 'init'. init: "
-            << init->ToString() << ", param: " << cond_param->ToString();
-        auto* cond_root = while_cond->root_instruction();
-        TF_RET_CHECK(ShapeUtil::Compatible(cond_root->shape(),
-                                           ShapeUtil::MakeShape(PRED, {})))
-            << "While condition should have shape PRED: "
-            << cond_root->ToString();
-
-        auto* body_param = while_body->parameter_instruction(0);
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_param->shape()))
-            << "While body's parameter must have the same shape as the loop's "
-               "'init'. init: "
-            << init->ToString() << ", param: " << body_param->ToString();
-        auto* body_root = while_body->root_instruction();
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_root->shape()))
-            << "While body should have same shape as the loop's 'init'. init: "
-            << init->ToString() << ", body: " << body_root->ToString();
+        TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction));
+      } else if (instruction->IsElementwise()) {
+        TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction));
       }
 
       auto previous = instructions.find(instruction->name());
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1dd7ec3c51e18dcfe89bd478de87798ba3858119..6208887547a14d22b512ef308dd2668af2f4468d 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -102,7 +102,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status CheckTernaryShape(const HloInstruction* instruction);
   Status CheckVariadicShape(const HloInstruction* instruction);
 
-  // Checks if the given two instructions shares the same channel id.
+  // Checks if the given two instructions share the same channel id.
   Status CheckSameChannel(const HloInstruction* instr1,
                           const HloInstruction* instr2);
 
@@ -144,9 +144,15 @@ class HloVerifier : public HloPassInterface {
   // CHECKs various invariants of a fusion instruction.
   Status CheckFusionInstruction(HloInstruction* fusion) const;
 
+  Status CheckWhileInstruction(HloInstruction* instruction);
+
+  // Checks that the non-scalar operand shapes are compatible to the output
+  // shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+  Status CheckElementwiseInstruction(HloInstruction* instruction);
+
   // Creates a ShapeVerifier that checks that shapes match inferred
-  // expectations.  This is a factory function because ShapeVerifier,  Note that
-  // ShapeVerifier, being a DfsHloVisitor, is stateful.  We want a clean object
+  // expectations. This is a factory function because ShapeVerifier,
+  // being a DfsHloVisitor, is stateful. We want a clean object
   // for each run of the verifier.
   ShapeVerifierFactory shape_verifier_factory_;
 };
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index f494748e17fc2d0de74dec67f7414d4791f76a07..dc1a39e9fa9fd3ef5c55bd86309fe23f5ef51dd5 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -39,6 +39,7 @@ namespace xla {
     case HloOpcode::kBroadcast:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kConcatenate:
     case HloOpcode::kConstant:
@@ -127,11 +128,11 @@ namespace xla {
   return false;
 }
 
-// An "effectively unary" operation is one that has one "large"
+// An "effectively at most unary" operation is one that has at most one "large"
 // input with the others being negligible in terms of memory usage.
 // We use "has a smaller true rank than the output" as a heuristic
 // for "negligible" memory usage.
-bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
+bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) {
   int64 output_rank = 0;
   ShapeUtil::ForEachSubshape(
       hlo->shape(),
@@ -155,66 +156,91 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
 }
 
 bool InstructionFusion::CanFuseOnAllPaths(
-    const HloReachabilityMap& reachability_map, HloInstruction* producer,
-    HloInstruction* consumer, DoNotFuseSet* do_not_fuse) {
-  auto could_fuse_on_all_paths = [&] {
-    // First check to see if we have already marked this producer as infeasible
-    // to fuse into consumer.
-    if (do_not_fuse->count(producer) > 0) {
+    HloInstruction* producer, HloInstruction* consumer,
+    const HloReachabilityMap& reachability_map,
+    const DoNotFuseSet& do_not_fuse) {
+  if (consumer == producer) {
+    return true;
+  }
+  if (!consumer->IsFusable()) {
+    return false;
+  }
+  for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) {
+    auto* consumer_operand = consumer->mutable_operand(i);
+    // If the operand is not on a path to the producer, it doesn't matter
+    // whether it's fusable.
+    if (!reachability_map.IsReachable(producer, consumer_operand)) {
+      continue;
+    }
+    if (do_not_fuse.count(consumer_operand) > 0 || !ShouldFuse(consumer, i)) {
       return false;
     }
-    // Make sure it is possible for producer and consumer to exist in a fusion
-    // node.
-    if (!producer->IsFusable() || !consumer->IsFusable()) {
+    // The producer is reachable from consumer_operand which means we need
+    // to be able to fuse consumer_operand into consumer in order for
+    // producer to be fusable into consumer on all paths.
+    // Perform the recursive step: make sure producer can be fused into
+    // consumer_operand on all paths.
+    if (!CanFuseOnAllPaths(producer, consumer_operand, reachability_map,
+                           do_not_fuse)) {
       return false;
     }
-    // We do an upward walk of the graph from consumer towards all paths which
-    // lead to producer to find any unfusable paths.
-    for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) {
-      auto* consumer_operand = consumer->mutable_operand(i);
-      if (consumer_operand == producer) {
-        // This is the base case: our upward crawl ends but we need to make sure
-        // that fusion from consumer can happen.
-        if (!ShouldFuse(consumer, i)) {
-          return false;
-        }
-      } else if (reachability_map.IsReachable(producer, consumer_operand)) {
-        // The reachability map told us that consumer_operand is a node on the
-        // path to producer. We need to further investigate from
-        // consumer_operand.
-
-        // First check if we have already ruled out fusing producer into
-        // consumer_operand.
-        if (do_not_fuse->count(consumer_operand) > 0) {
-          return false;
-        }
-        // Make sure it is possible for consumer_operand to exist in a fusion
-        // node.
-        if (!consumer_operand->IsFusable()) {
-          return false;
-        }
-        // The producer is reachable from consumer_operand which means we need
-        // to be able to fuse consumer_operand into consumer in order for
-        // producer to be fusable into consumer on all paths.
-        if (!ShouldFuse(consumer, i)) {
-          return false;
-        }
-        // Perform the recursive step: make sure producer can be fused into
-        // consumer_operand on all paths.
-        if (!CanFuseOnAllPaths(reachability_map, producer, consumer_operand,
-                               do_not_fuse)) {
-          return false;
-        }
+  }
+  return true;
+}
+
+InstructionFusion::DoNotFuseSet InstructionFusion::ComputeGloballyUnfusable(
+    tensorflow::gtl::ArraySlice<HloInstruction*> post_order) {
+  auto reachability = computation_->ComputeReachability();
+
+  // Forbid fusion of producers that:
+  // a) Need to be duplicated, unless they can be fused into all consumers
+  //    via all paths.
+  // b) Are more than unary, that is, fusing them would likely lead to an
+  //    increase in memory bandwidth use.
+  //
+  // Note that if we allow fusion by these global rules, we may still forbid
+  // fusing operations that require duplication later depending on
+  // is_expensive_().
+  DoNotFuseSet do_not_fuse;
+  for (HloInstruction* consumer : post_order) {
+    for (HloInstruction* producer : consumer->operands()) {
+      if (do_not_fuse.count(producer) > 0) {
+        continue;
       }
+
+      // If the producer is effectively not more than unary, duplicating it
+      // will not increase the number of relevant inputs read, as the fusion
+      // node will only need to read at most 1 relevant input (the input of
+      // the producer). In that case, we do not forbid fusion of the operation
+      // here.
+      if (EffectivelyAtMostUnary(producer)) {
+        continue;
+      }
+      // Otherwise we will forbid fusing the op unless we can fuse it into
+      // all of its consumers on all paths.
+      //
+      // That means, that for:
+      // A --> B (fusable)
+      //   \-> C (non-fusable)
+      // A will be not allowed to be fused into B, as it cannot be fused into C.
+      //
+      // Similarly, for:
+      // A -------------> B
+      //   \-> C -> D -/
+      // If:
+      // - A is fusable into B and C, and D is fusable into B
+      // - C is *not* fusable into D
+      // A will be not allowed to be fused into B, as it cannot be fused via
+      // all paths.
+      if (producer->IsFusable() &&
+          CanFuseOnAllPaths(producer, consumer, *reachability, do_not_fuse)) {
+        continue;
+      }
+      do_not_fuse.insert(producer);
     }
-    return true;
-  };
-  if (could_fuse_on_all_paths()) {
-    return true;
   }
-  // We couldn't fuse on all paths, record this result.
-  do_not_fuse->insert(producer);
-  return false;
+
+  return do_not_fuse;
 }
 
 StatusOr<bool> InstructionFusion::Run(HloModule* module) {
@@ -243,36 +269,7 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
       InsertOrDie(&post_order_index, post_order[i], i);
     }
 
-    DoNotFuseSet do_not_fuse;
-    auto reachability = computation->ComputeReachability();
-
-    auto cheap_to_duplicate = [this](HloInstruction* producer) {
-      if (producer->opcode() == HloOpcode::kBroadcast) {
-        return true;
-      }
-      if (producer->opcode() == HloOpcode::kConstant &&
-          ShapeUtil::IsEffectiveScalar(producer->shape())) {
-        return true;
-      }
-      if (EffectivelyUnary(producer)) {
-        return true;
-      }
-      return false;
-    };
-
-    for (HloInstruction* consumer : post_order) {
-      for (HloInstruction* producer : consumer->operands()) {
-        if (cheap_to_duplicate(producer)) {
-          continue;
-        }
-        if (CanFuseOnAllPaths(*reachability, producer, consumer,
-                              &do_not_fuse)) {
-          CHECK_EQ(do_not_fuse.count(producer), 0);
-        } else {
-          CHECK_GT(do_not_fuse.count(producer), 0);
-        }
-      }
-    }
+    DoNotFuseSet do_not_fuse = ComputeGloballyUnfusable(post_order);
 
     // Instruction fusion effectively fuses edges in the computation graph
     // (producer instruction -> consumer instruction) so we iterate over all
@@ -302,7 +299,7 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
 
       // Consider each operand of this instruction for fusion into this
       // instruction. We want to consider the operands in a particular order to
-      // avoid created duplicate instruction clones in the fusion instruction.
+      // avoid creating duplicate instruction clones in the fusion instruction.
       // For example, consider the following expression:
       //
       //   A = ...
@@ -377,7 +374,7 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
         changed = true;
 
         if (operand->user_count() == 0) {
-          // Operand is now dead. Remove from post order by setting it's
+          // Operand is now dead. Remove from post order by setting its
           // location to nullptr.
           post_order[FindOrDie(post_order_index, operand)] = nullptr;
           post_order_index.erase(operand);
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index 152d0886ee9eda19961e092df44cb234ee2bd29d..2ea1fcf937ceaf2cce3f8ed0891399384d93dbd0 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -70,11 +70,11 @@ class InstructionFusion : public HloPassInterface {
   virtual HloInstruction* Fuse(HloInstruction* producer,
                                HloInstruction* consumer);
 
-  // An "effectively unary" operation is one that has one "large"
+  // An "effectively unary" operation is one that has at most one "large"
   // input with the others being negligible in terms of memory usage.
   // We use "has a smaller true rank than the output" as a heuristic
   // for "negligible" memory usage.
-  bool EffectivelyUnary(HloInstruction* hlo);
+  bool EffectivelyAtMostUnary(HloInstruction* hlo);
 
   // Returns true if fusing producer into consumer would cause producer to be
   // duplicated. This is the case if producer has uses other than consumer.
@@ -95,11 +95,16 @@ class InstructionFusion : public HloPassInterface {
   // The set of producers whose consumers we cannot fuse into.
   using DoNotFuseSet = std::unordered_set<HloInstruction*>;
 
-  // Whether or not we can fuse consumer into original_producer on all paths
+  // Whether or not we can fuse producer into consumer on all paths
   // from the producer to the consumer where nodes are HLOs and edges are uses.
-  bool CanFuseOnAllPaths(const HloReachabilityMap& reachability_map,
-                         HloInstruction* producer, HloInstruction* consumer,
-                         DoNotFuseSet* do_not_fuse);
+  bool CanFuseOnAllPaths(HloInstruction* producer, HloInstruction* consumer,
+                         const HloReachabilityMap& reachability_map,
+                         const DoNotFuseSet& do_not_fuse);
+
+  // Computes the set of nodes that we do not want to fuse into any of their
+  // consumers based on a global analysis of the HLO graph.
+  DoNotFuseSet ComputeGloballyUnfusable(
+      tensorflow::gtl::ArraySlice<HloInstruction*> post_order);
 
   // Used to determine if an HLO is expensive. Expensive operations will not be
   // duplicated.
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index 0fa2c95fb458f8f2b863388fd77bca5f10372a0a..e78b99a80cf41318faa1cb709428b8ba0f531944 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 
@@ -92,6 +93,161 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) {
           .ValueOrDie());
 }
 
+// Counts the number of HLO ops with a given op code in the specified module.
+static int Count(const HloModule& module, HloOpcode op) {
+  int count = 0;
+  for (const auto* computation : module.computations()) {
+    for (const auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == op) {
+        ++count;
+      }
+    }
+  }
+  return count;
+}
+
+TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    p0 = f32[4,3]{1,0} parameter(0)
+    add = f32[4,3]{1,0} add(p0, p0)
+    ROOT root = f32[4,3]{1,0} subtract(add, add)
+  })")
+                    .ValueOrDie();
+  // Expect the add and subtraction to be fused.
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+
+  // Make sure the add hasn't been duplicated.
+  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+}
+
+TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
+  // Make sure we do not duplicate the add, as we cannot fuse through the rng.
+  //
+  // p0 -> add -------------------------> sub
+  //           \-> abs1 -> rng -> abs2 -/
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    p0 = f32[4,3]{1,0} parameter(0)
+    add = f32[4,3]{1,0} add(p0, p0)
+    abs1 = f32[4,3]{1,0} abs(add)
+    rng = f32[4,3]{1,0} rng(abs1), distribution=rng_uniform
+    abs2 = f32[4,3]{1,0} abs(rng)
+    ROOT root = f32[4,3]{1,0} subtract(abs2, add)
+  })")
+                    .ValueOrDie();
+  // We expect abs2 to be fused into root.
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+
+  // Make sure the add hasn't been duplicated.
+  EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString();
+
+  // Use a log node with a second consumer to break the fusion.
+  //
+  // p0 -> add -------------------------> sub
+  //           \-> abs1 -> log -> abs2 -/
+  //                           \-> send
+  module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    p0 = f32[4,3]{1,0} parameter(0)
+    add = f32[4,3]{1,0} add(p0, p0)
+    abs1 = f32[4,3]{1,0} abs(add)
+    log = f32[4,3]{1,0} log(abs1)
+    send = f32[4,3]{1,0} send(log), channel_id=0
+    abs2 = f32[4,3]{1,0} abs(log)
+    ROOT root = f32[4,3]{1,0} subtract(abs2, add)
+  })")
+               .ValueOrDie();
+
+  // We expect abs2 to be fused into root and abs1 to be fused into log.
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 2) << module->ToString();
+
+  // Make sure the add hasn't been duplicated.
+  EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString();
+
+  // Make sure we still fuse ops where one operand in the chain to the producer
+  // can't be fused.
+  //
+  // p0 ---> add1 -----------> sub
+  //    \         \-> add2 -/
+  //     \-> log -/
+  //             \-> send
+  module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    p0 = f32[4,3]{1,0} parameter(0)
+    add1 = f32[4,3]{1,0} add(p0, p0)
+    log = f32[4,3]{1,0} log(p0)
+    send = f32[4,3]{1,0} send(log), channel_id=0
+    add2 = f32[4,3]{1,0} add(log, add1)
+    ROOT root = f32[4,3]{1,0} subtract(add1, add2)
+  })")
+               .ValueOrDie();
+
+  // Expect the add1 and add2 to be fused into root.
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+
+  // Make sure we didn't duplicate any adds.
+  EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString();
+
+  // A variant of the above that allows the algorithm to put add2 into the set
+  // of unfusable ops to short-circuit the decision whether add1 should be fused
+  // into sub2.
+  //
+  //             /---------------\
+  // p0 ---> add1 ---> add2 ------> sub2
+  //                       \------> sub1
+  //                        log -/
+  //                            \-> send
+  module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    p0 = f32[4,3]{1,0} parameter(0)
+    add1 = f32[4,3]{1,0} add(p0, p0)
+    add2 = f32[4,3]{1,0} add(add1, add1)
+    log = f32[4,3]{1,0} log(add2)
+    send = f32[4,3]{1,0} send(log), channel_id=0
+    sub1 = f32[4,3]{1,0} subtract(log, add2)
+    sub2 = f32[4,3]{1,0} subtract(add2, add1)
+    ROOT root = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub1, sub2)
+  })")
+               .ValueOrDie();
+
+  // Expect sub1 and sub2 to be fused into root.
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+
+  // Make sure we didn't duplicate any adds.
+  EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString();
+}
+
 TEST_F(InstructionFusionTest, AllowUnaryDuplication) {
   HloComputation::Builder builder(TestName());
   auto shape = ShapeUtil::MakeShape(F32, {16, 16});
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 0819ab3b90b2360c6b0b2afaa89f322afe566eb3..45505484951abfcee93a62fec7a99e86cbb9150c 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -63,10 +63,7 @@ cc_library(
     name = "platform_id",
     srcs = ["platform_id.cc"],
     hdrs = ["platform_id.h"],
-    deps = [
-        "@nsync//:nsync_headers",
-        "//tensorflow/core:stream_executor_headers_lib",
-    ] + if_static(
+    deps = ["//tensorflow/core:stream_executor_headers_lib"] + if_static(
         ["@protobuf_archive//:protobuf"],
         ["@protobuf_archive//:protobuf_headers"],
     ),
@@ -123,14 +120,3 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 9171e859c6f84ceef9664aa1eb90a07c87dfab40..76b3ecad26fe92e910fd3fe0e405c726da7e14b7 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -41,9 +41,6 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
-
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
@@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime(
 }
 
 se::Platform::Id InterpreterCompiler::PlatformId() const {
-  return sep::kInterpreterPlatformId;
+  return se::interpreter::kXlaInterpreterPlatformId;
 }
 
 HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
@@ -109,11 +106,12 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(sep::kInterpreterPlatformId, []() {
-    return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
-  });
-  xla::ComputationPlacer::RegisterComputationPlacer(sep::kInterpreterPlatformId,
-                                                    &CreateComputationPlacer);
+  xla::Compiler::RegisterCompilerFactory(
+      se::interpreter::kXlaInterpreterPlatformId, []() {
+        return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
+      });
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index c8660c04d86a82e7dfcfd1658310c2a0e4fa0083..e90ae3e818522e6e4fd9d9f5acb846800bc899ca 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler {
   ~InterpreterCompiler() override {}
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
@@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler {
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
  private:
   Status RunHloOptimization(HloModule* hlo_module);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 0cb9b5d8107cd8bf468b07d5fe2a22930d9e8b8c..61f199bc9e8f4f95a2f097af4abf9395a1e05f64 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -38,8 +38,6 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
-namespace se = ::perftools::gputools;
-
 InterpreterExecutable::InterpreterExecutable(
     std::unique_ptr<const HloModule> hlo_module)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
@@ -47,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable(
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -90,12 +88,12 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
       evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
 
   // Transform the result literal back into a ShapedBuffer.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
-                      transfer_manager->AllocateShapedBuffer(
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      transfer_manager->AllocateScopedShapedBuffer(
                           result_literal->shape(), run_options->allocator(),
-                          run_options->device_ordinal()));
+                          executor->device_ordinal()));
   TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
-      executor, *result_literal, *result));
+      executor, *result_literal, result));
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -108,8 +106,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
   return std::move(result);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>>
-InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 410110a1adf04c83001c38ed03f5d60dd203dc7e..b0b797ca7d6f449a11c662ffba7c2a0a0040e47e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
   ~InterpreterExecutable() override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 68371910d76f42c0b6d4b1adad9d6a83bdb858e6..97e9fa2c8e8ecd918ffe3df2fd4e731f3b91e6db 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -19,8 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 host::HostStream *AsExecutorStream(Stream *stream) {
@@ -28,84 +27,85 @@ host::HostStream *AsExecutorStream(Stream *stream) {
   return dynamic_cast<host::HostStream *>(stream->implementation());
 }
 
-InterpreterExecutor::InterpreterExecutor(const PluginConfig &plugin_config)
+XlaInterpreterExecutor::XlaInterpreterExecutor(
+    const PluginConfig &plugin_config)
     : plugin_config_(plugin_config) {}
 
-InterpreterExecutor::~InterpreterExecutor() {}
+XlaInterpreterExecutor::~XlaInterpreterExecutor() {}
 
-void *InterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
+void *XlaInterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
 
-void *InterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
-                                             uint64 offset_bytes,
-                                             uint64 /*size_bytes*/) {
+void *XlaInterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
+                                                uint64 offset_bytes,
+                                                uint64 /*size_bytes*/) {
   return parent + offset_bytes;
 }
 
-void InterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
+void XlaInterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
   if (!mem->is_sub_buffer()) {
     delete[] static_cast<char *>(mem->opaque());
   }
 }
 
-bool InterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
-                                 const DeviceMemoryBase &dev_src, uint64 size) {
+bool XlaInterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
+                                    const DeviceMemoryBase &dev_src,
+                                    uint64 size) {
   AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
     port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
   });
   return true;
 }
 
-bool InterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
-                                 const void *host_src, uint64 size) {
+bool XlaInterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
+                                    const void *host_src, uint64 size) {
   AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
     port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
   });
   return true;
 }
 
-port::Status InterpreterExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
-                                                    const void *host_src,
-                                                    uint64 size) {
+port::Status XlaInterpreterExecutor::SynchronousMemcpy(
+    DeviceMemoryBase *dev_dst, const void *host_src, uint64 size) {
   memcpy(dev_dst->opaque(), host_src, size);
   return port::Status::OK();
 }
 
-port::Status InterpreterExecutor::SynchronousMemcpy(
+port::Status XlaInterpreterExecutor::SynchronousMemcpy(
     void *host_dst, const DeviceMemoryBase &dev_src, uint64 size) {
   memcpy(host_dst, dev_src.opaque(), size);
   return port::Status::OK();
 }
 
-bool InterpreterExecutor::HostCallback(Stream *stream,
-                                       std::function<void()> callback) {
+bool XlaInterpreterExecutor::HostCallback(Stream *stream,
+                                          std::function<void()> callback) {
   AsExecutorStream(stream)->EnqueueTask(callback);
   return true;
 }
 
-bool InterpreterExecutor::CreateStreamDependency(Stream *dependent,
-                                                 Stream *other) {
+bool XlaInterpreterExecutor::CreateStreamDependency(Stream *dependent,
+                                                    Stream *other) {
   AsExecutorStream(dependent)->EnqueueTask(
       [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsExecutorStream(dependent)->BlockUntilDone();
   return true;
 }
 
-bool InterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
+bool XlaInterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
   dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
   return true;
 }
 
-bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
+bool XlaInterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
   dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
   return true;
 }
 
-port::Status InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status XlaInterpreterExecutor::BlockHostUntilDone(Stream *stream) {
   AsExecutorStream(stream)->BlockUntilDone();
   return port::Status::OK();
 }
 
-DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
+DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
 
   builder.set_device_address_bits(64);
@@ -118,5 +118,4 @@ DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
 }
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index c5d07e906dafb033905c50c604069e80e1ce80cd..9b109022fbfc698f7dadc678ef837da270a5e74a 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Declares the InterpreterExecutor class, which is a CPU-only implementation of
-// the StreamExecutor interface. For now, this is used for testing and to
+// Declares the XlaInterpreterExecutor class, which is a CPU-only implementation
+// of the StreamExecutor interface. For now, this is used for testing and to
 // examine the performance of host-based StreamExecutor code.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
@@ -44,16 +44,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/timer.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
 
-class InterpreterExecutor : public internal::StreamExecutorInterface {
+class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
  public:
-  explicit InterpreterExecutor(const PluginConfig &plugin_config);
-  ~InterpreterExecutor() override;
+  explicit XlaInterpreterExecutor(const PluginConfig &plugin_config);
+  ~XlaInterpreterExecutor() override;
 
   port::Status Init(int device_ordinal, DeviceOptions device_options) override {
     return port::Status::OK();
@@ -213,7 +212,6 @@ class InterpreterExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
index cf98ecd7749d61261bf072cdb1882c7603f39556..d27cd7502f10a1f615fc5b0d610acafdf55e3e43 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
@@ -21,12 +21,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 
-namespace sei = ::perftools::gputools::interpreter;
-
 namespace xla {
 
 InterpreterTransferManager::InterpreterTransferManager()
-    : GenericTransferManager(sei::kInterpreterPlatformId,
+    : GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId,
                              /*pointer_size=*/sizeof(void*)) {}
 
 }  // namespace xla
@@ -38,7 +36,8 @@ CreateInterpreterTransferManager() {
 
 static bool InitModule() {
   xla::TransferManager::RegisterTransferManager(
-      sei::kInterpreterPlatformId, &CreateInterpreterTransferManager);
+      stream_executor::interpreter::kXlaInterpreterPlatformId,
+      &CreateInterpreterTransferManager);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index a60e7fc59f7c5f0b1b24e026b34e195ca0fe5ebb..92e069a8c67c1d441ba9d396dee503c9b3bde0df 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -28,24 +28,22 @@ limitations under the License.
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
-InterpreterPlatform::InterpreterPlatform() : name_("Interpreter") {}
+XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
 
-InterpreterPlatform::~InterpreterPlatform() {}
+XlaInterpreterPlatform::~XlaInterpreterPlatform() {}
 
-Platform::Id InterpreterPlatform::id() const { return kInterpreterPlatformId; }
+Platform::Id XlaInterpreterPlatform::id() const {
+  return kXlaInterpreterPlatformId;
+}
 
-int InterpreterPlatform::VisibleDeviceCount() const { return 1; }
+int XlaInterpreterPlatform::VisibleDeviceCount() const { return 1; }
 
-const string& InterpreterPlatform::Name() const { return name_; }
+const string& XlaInterpreterPlatform::Name() const { return name_; }
 
-port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
+port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::ExecutorForDevice(
     int ordinal) {
   StreamExecutorConfig config;
   config.ordinal = ordinal;
@@ -55,7 +53,7 @@ port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
 }
 
 port::StatusOr<StreamExecutor*>
-InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
+XlaInterpreterPlatform::ExecutorForDeviceWithPluginConfig(
     int device_ordinal, const PluginConfig& plugin_config) {
   StreamExecutorConfig config;
   config.ordinal = device_ordinal;
@@ -64,16 +62,17 @@ InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
   return GetExecutor(config);
 }
 
-port::StatusOr<StreamExecutor*> InterpreterPlatform::GetExecutor(
+port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::GetExecutor(
     const StreamExecutorConfig& config) {
   return executor_cache_.GetOrCreate(
       config, [&]() { return GetUncachedExecutor(config); });
 }
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
-InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<InterpreterExecutor>(config.plugin_config));
+XlaInterpreterPlatform::GetUncachedExecutor(
+    const StreamExecutorConfig& config) {
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<XlaInterpreterExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
@@ -86,26 +85,26 @@ InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
   return std::move(executor);
 }
 
-void InterpreterPlatform::RegisterTraceListener(
+void XlaInterpreterPlatform::RegisterTraceListener(
     std::unique_ptr<TraceListener> listener) {
   LOG(FATAL) << "not yet implemented: register executor trace listener";
 }
 
-void InterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
+void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
   LOG(FATAL) << "not yet implemented: unregister executor trace listener";
 }
 
-static void InitializeInterpreterPlatform() {
-  std::unique_ptr<se::Platform> platform(new sep::InterpreterPlatform);
-  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+static void InitializeXlaInterpreterPlatform() {
+  std::unique_ptr<Platform> platform(new XlaInterpreterPlatform);
+  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(interpreter_platform,
-                            sep::InitializeInterpreterPlatform());
+REGISTER_MODULE_INITIALIZER(
+    interpreter_platform,
+    stream_executor::interpreter::InitializeXlaInterpreterPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index c66ddb907d1c5a8e99d3178a202a77a72a646ce5..d68c5aa20dda7ac246ed4aa667851e385a604c04 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -23,14 +23,13 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
-class InterpreterPlatform : public Platform {
+class XlaInterpreterPlatform : public Platform {
  public:
-  InterpreterPlatform();
-  ~InterpreterPlatform() override;
+  XlaInterpreterPlatform();
+  ~XlaInterpreterPlatform() override;
 
   Platform::Id id() const override;
 
@@ -60,11 +59,10 @@ class InterpreterPlatform : public Platform {
   // Cache of created StreamExecutors.
   ExecutorCache executor_cache_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(InterpreterPlatform);
+  SE_DISALLOW_COPY_AND_ASSIGN(XlaInterpreterPlatform);
 };
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
index 1a0373cf86e26b564e0e732e8de1a0a5d868bfa6..3272396ce5045129a7689a160ec859d11fbbe9fa 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
@@ -14,12 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
-PLATFORM_DEFINE_ID(kInterpreterPlatformId);
+PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h
index 905efef1690d3bd32461353fe32dd394eb6bca9e..a6cc10bcc1eb756a3146d4a834efa4cd3ceb2d27 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
-extern const Platform::Id kInterpreterPlatformId;
+extern const Platform::Id kXlaInterpreterPlatformId;
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 0668f66051ce96292c3c85bac7e649d89914106c..2494569db53f260b900b3d5d3d0d2da5b1fc5f73 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -57,76 +57,6 @@ namespace xla {
 // anonymous namespace, instead of three or four spread all over this file.
 namespace {
 
-// Creates and returns a copy of the given instruction with a different
-// layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
-// instruction producing the copy is returned.
-StatusOr<HloInstruction*> CreateCopyWithNewLayout(
-    const Shape& shape_with_layout, HloInstruction* instruction) {
-  TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
-  DCHECK(ShapeUtil::Compatible(shape_with_layout, instruction->shape()))
-      << ShapeUtil::HumanString(shape_with_layout) << " "
-      << ShapeUtil::HumanString(instruction->shape())
-      << " instruction: " << instruction->ToString();
-
-  if (ShapeUtil::IsTuple(instruction->shape())) {
-    // Deep-copy tuples.
-    std::vector<HloInstruction*> element_copies;
-    for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
-         ++i) {
-      HloInstruction* gte = instruction->parent()->AddInstruction(
-          HloInstruction::CreateGetTupleElement(
-              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
-              i));
-
-      // Recurse to copy each elements.
-      TF_ASSIGN_OR_RETURN(
-          HloInstruction * element_copy,
-          CreateCopyWithNewLayout(
-              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
-      element_copies.push_back(element_copy);
-    }
-    // Gather element copies into a tuple with a new Tuple instruction.
-    HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
-        HloInstruction::CreateTuple(element_copies));
-    LayoutUtil::ClearLayout(tuple_copy->mutable_shape());
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        shape_with_layout, tuple_copy->mutable_shape()));
-    return tuple_copy;
-  } else if (ShapeUtil::IsArray(instruction->shape())) {
-    HloInstruction* copy =
-        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
-            instruction->shape(), HloOpcode::kCopy, instruction));
-    LayoutUtil::ClearLayout(copy->mutable_shape());
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        shape_with_layout, copy->mutable_shape()));
-
-    return copy;
-  } else {
-    return FailedPrecondition(
-        "Can only copy array and tuple shaped instructions");
-  }
-}
-
-// Creates a copy of the given operand if the operand's layout does not match
-// the given layout. This copy replaces the use in the given instruction. Tuple
-// operands will be deep-copied.
-Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                  HloInstruction* instruction,
-                                  int64 operand_no) {
-  HloInstruction* operand = instruction->mutable_operand(operand_no);
-  TF_RET_CHECK(operand_layout.LayoutIsSet());
-  TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
-
-  if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
-    // Operand layout already matches our constraint. Nothing to do.
-    return Status::OK();
-  }
-
-  TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
-                      CreateCopyWithNewLayout(operand_layout.shape(), operand));
-
-  return instruction->ReplaceOperandWith(operand_no, operand_copy);
-}
 
 }  // namespace
 
@@ -192,17 +122,34 @@ LayoutConstraints::LayoutConstraints(
   }
 }
 
+PointsToSet::BufferSet* LayoutConstraints::GetBufferSet(
+    const HloInstruction* instruction) const {
+  auto it = buffer_sets_cache_.find(instruction);
+  if (it != buffer_sets_cache_.end()) {
+    return it->second.get();
+  }
+  auto& buffer_set =
+      buffer_sets_cache_
+          .emplace(instruction, MakeUnique<PointsToSet::BufferSet>())
+          .first->second;
+  const auto& points_to_set = points_to_analysis_.GetPointsToSet(instruction);
+  points_to_set.ForEachElement(
+      [&buffer_set](const ShapeIndex& /*index*/,
+                    const PointsToSet::BufferList& buffers) {
+        buffer_set->insert(buffers.begin(), buffers.end());
+      });
+  return buffer_set.get();
+}
+
 bool LayoutConstraints::OperandBufferForwarded(
     const HloInstruction* instruction, int64 operand_no) const {
   // The operand is potentially forwarded if the intersection of points-to sets
   // of the operand and the instruction is non-empty.
-  auto output_buffers =
-      points_to_analysis_.GetPointsToSet(instruction).CreateFlattenedSet();
-  auto operand_buffers =
-      points_to_analysis_.GetPointsToSet(instruction->operand(operand_no))
-          .CreateFlattenedSet();
-  for (const LogicalBuffer* output_buffer : output_buffers) {
-    if (operand_buffers.count(output_buffer) > 0) {
+  PointsToSet::BufferSet* output_buffers = GetBufferSet(instruction);
+  PointsToSet::BufferSet* operand_buffers =
+      GetBufferSet(instruction->operand(operand_no));
+  for (const LogicalBuffer* output_buffer : *output_buffers) {
+    if (operand_buffers->count(output_buffer) > 0) {
       return true;
     }
   }
@@ -776,6 +723,99 @@ Status CheckConstantLayout(HloInstruction* constant) {
 
 }  // namespace
 
+StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
+    const Shape& shape_with_layout, HloInstruction* instruction) {
+  TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
+  DCHECK(ShapeUtil::Compatible(shape_with_layout, instruction->shape()))
+      << ShapeUtil::HumanString(shape_with_layout) << " "
+      << ShapeUtil::HumanString(instruction->shape())
+      << " instruction: " << instruction->ToString();
+
+  if (ShapeUtil::IsTuple(instruction->shape())) {
+    // Deep-copy tuples.
+    std::vector<HloInstruction*> element_copies;
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
+         ++i) {
+      HloInstruction* gte = instruction->parent()->AddInstruction(
+          HloInstruction::CreateGetTupleElement(
+              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
+              i));
+      SetupCopiedInstruction(*instruction, gte, {i});
+      // Recurse to copy each elements.
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * element_copy,
+          CreateCopyWithNewLayout(
+              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
+      element_copies.push_back(element_copy);
+    }
+    // Gather element copies into a tuple with a new Tuple instruction.
+    HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
+        HloInstruction::CreateTuple(element_copies));
+    SetupCopiedInstruction(*instruction, tuple_copy, {});
+    LayoutUtil::ClearLayout(tuple_copy->mutable_shape());
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        shape_with_layout, tuple_copy->mutable_shape()));
+    return tuple_copy;
+  } else if (ShapeUtil::IsArray(instruction->shape())) {
+    HloInstruction* copy =
+        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
+            instruction->shape(), HloOpcode::kCopy, instruction));
+    SetupCopiedInstruction(*instruction, copy, {});
+    LayoutUtil::ClearLayout(copy->mutable_shape());
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        shape_with_layout, copy->mutable_shape()));
+
+    return copy;
+  } else {
+    return FailedPrecondition(
+        "Can only copy array and tuple shaped instructions");
+  }
+}
+
+// Creates a copy of the given operand if the operand's layout does not match
+// the given layout. This copy replaces the use in the given instruction. Tuple
+// operands will be deep-copied.
+Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
+    const ShapeLayout& operand_layout, HloInstruction* instruction,
+    int64 operand_no) {
+  HloInstruction* operand = instruction->mutable_operand(operand_no);
+  TF_RET_CHECK(operand_layout.LayoutIsSet());
+  TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
+
+  if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
+    // Operand layout already matches our constraint. Nothing to do.
+    return Status::OK();
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
+                      CreateCopyWithNewLayout(operand_layout.shape(), operand));
+
+  return instruction->ReplaceOperandWith(operand_no, operand_copy);
+}
+
+void LayoutAssignment::SetupCopiedInstruction(const HloInstruction& instruction,
+                                              HloInstruction* copy,
+                                              const ShapeIndex& index) {
+  if (instruction.has_sharding()) {
+    // If the index is empty, we want to copy the whole sharding, in case the
+    // sharding is a tuple sharding.
+    HloSharding sharding =
+        !index.empty() && instruction.sharding().IsTuple()
+            ? instruction.sharding().GetSubSharding(instruction.shape(), index)
+            : instruction.sharding();
+    // We propagate the sharding to the copied instruction only if it is a
+    // special sharding, like tiled ones, or special devices like the
+    // HostCompute module.
+    // Otherwise it is preferable to leave the new instruction without device,
+    // and let the automatic device placer to choose the best location.
+    if (!sharding.HasUniqueDevice() ||
+        HloSharding::IsReservedDevice(sharding.UniqueDevice().ValueOrDie())) {
+      copy->set_sharding(sharding);
+    }
+  }
+  copy->set_metadata(instruction.metadata());
+}
+
 Status LayoutAssignment::CheckLayouts(HloModule* module) {
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
@@ -1544,6 +1584,13 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
     // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
     // layout assignment pass that may accidently use the existing layout.
     for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kBitcast) {
+        // bitcasts are inherently layout sensitive and so a bitcast instruction
+        // present in the IR before layout assignment is a bug.
+        return InternalError(
+            "Unexpected bitcast operation seen during layout assignment: %s.",
+            instruction->ToString().c_str());
+      }
       if (instruction->opcode() != HloOpcode::kInfeed) {
         LayoutUtil::ClearLayout(instruction->mutable_shape());
       }
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 29018584487cabfd740d7914625c2a50f552d6ff..ae4986d6ad9bc3de100eab9cc38b709bb56c7813 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -199,6 +200,11 @@ class LayoutConstraints {
   string ToString() const;
 
  private:
+  // Find a bufferset in the bufferset cache. This is useful since we can
+  // currently create the flattened buffer set for the same instruction many
+  // times, which is often slow.
+  PointsToSet::BufferSet* GetBufferSet(const HloInstruction* instruction) const;
+
   // The set of BufferLayoutConstraints applied to the computation.
   std::unordered_map<const LogicalBuffer*, BufferLayoutConstraint>
       buffer_constraints_;
@@ -221,6 +227,10 @@ class LayoutConstraints {
   // Array-shaped buffers which have not yet been constrained.
   std::set<LogicalBuffer::Id> unconstrained_buffer_ids_;
 
+  mutable tensorflow::gtl::FlatMap<const HloInstruction*,
+                                   std::unique_ptr<PointsToSet::BufferSet>>
+      buffer_sets_cache_;
+
   HloComputation* computation_;
 };
 
@@ -393,14 +403,37 @@ class LayoutAssignment : public HloPassInterface {
   Status CheckLayouts(HloModule* module);
 
   ComputationLayout* entry_computation_layout_;
-  ChannelLayoutConstraints* channel_layout_constraints_;
 
  protected:
+  // Sets up the copy instruction according to the characteristic (sharding,
+  // metadata, ...) of the reference instruction. The index argument is used
+  // when the instruction is a tuple, and in such case the index represents
+  // the location from where the copy instruction was created from.
+  // If the index is empty, the whole sharding will be propagated, even in case
+  // the intruction has a tuple sharding.
+  static void SetupCopiedInstruction(const HloInstruction& instruction,
+                                     HloInstruction* copy,
+                                     const ShapeIndex& index);
+
+  // Creates and returns a copy of the given instruction with a different
+  // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
+  // instruction producing the copy is returned.
+  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+      const Shape& shape_with_layout, HloInstruction* instruction);
+
+  // Creates a copy of the given operand if the operand's layout does not match
+  // the given layout. This copy replaces the use in the given instruction.
+  // Tuple operands will be deep-copied.
+  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                           HloInstruction* instruction,
+                                           int64 operand_no);
+
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
   // instructions can be set to match the computation.
   std::map<HloComputation*, ComputationLayout> computation_layouts_;
+  ChannelLayoutConstraints* channel_layout_constraints_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index dd0fba2758f0d77e72bc55138df229b24c026677..4b1c9bad41de8030cf14bc6d1c0db21b9c56c3bf 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -590,6 +590,85 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
                                             transpose->shape(), {2, 3, 0, 1}));
 }
 
+// TransposeIsBitcast shouldn't be called without layout information.
+TEST_F(LayoutAssignmentTest, TransposeIsBitcastFail) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  Shape input_shape_with_layout(input_shape);
+  *input_shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({2, 1, 0});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape_with_layout, "param"));
+  auto hlo = builder.AddInstruction(
+      HloInstruction::CreateTranspose(input_shape, param, {0, 2, 1}));
+  // Clear the default layout assigned to the instruction.
+  LayoutUtil::ClearLayout(hlo->mutable_shape());
+  EXPECT_DEATH(ShapeUtil::TransposeIsBitcast(hlo->operand(0)->shape(),
+                                             hlo->shape(), hlo->dimensions()),
+               "LayoutUtil::HasLayout");
+}
+
+// ReshapeIsBitcast shouldn't be called without layout information.
+TEST_F(LayoutAssignmentTest, ReshapeIsBitcastFail) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  Shape input_shape_with_layout(input_shape);
+  *input_shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({2, 1, 0});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape_with_layout, "param"));
+  auto hlo =
+      builder.AddInstruction(HloInstruction::CreateReshape(input_shape, param));
+  // Clear the default layout assigned to the instruction.
+  LayoutUtil::ClearLayout(hlo->mutable_shape());
+  EXPECT_DEATH(
+      ShapeUtil::ReshapeIsBitcast(hlo->operand(0)->shape(), hlo->shape()),
+      "LayoutUtil::HasLayout");
+}
+
+// Check that the computation below doesn't crash the compiler.
+//
+// Within a fusion computation, only the parameters and result get assigned a
+// layout.  When we run the algebraic simplifier on this computation post layout
+// assignment, it should not call TransposeIsBitcast on the `transpose` node
+// inside the fusion computation as TransposeIsBitcast checks both input_shape
+// and output_shape have layouts.
+TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) {
+  const char* module_str = R"(
+    HloModule test_module
+
+    fused_computation {
+      param_1 = f32[2,2,2]{2,1,0} parameter(1)
+      transpose = f32[2,2,2]{2,1,0} transpose(param_1), dimensions={0,2,1}
+      reduce_1 = f32[] parameter(0)
+      broadcast_1 = f32[2,2,2]{2,1,0} broadcast(reduce_1), dimensions={}
+      ROOT divide_1 = f32[2,2,2]{2,1,0} divide(transpose, broadcast_1)
+    }
+
+    ENTRY entry_computation {
+      fusion.1 = f32[2,2,2]{2,1,0} parameter(1)
+      reduce.1 = f32[] parameter(0)
+      fusion.2 = f32[2,2,2]{2,1,0} fusion(reduce.1, fusion.1), kind=kLoop, calls=fused_computation
+     ROOT tuple.1 = (f32[2,2,2]{2,1,0}) tuple(fusion.2)
+    }
+  )";
+
+  auto module = tools::Parse(module_str).ValueOrDie();
+
+  module =
+      backend()
+          .compiler()
+          ->RunHloPasses(std::move(module), backend().default_stream_executor(),
+                         /*device_allocator=*/nullptr)
+          .ConsumeValueOrDie();
+
+  EXPECT_EQ(
+      ::tensorflow::Status::OK(),
+      backend()
+          .compiler()
+          ->RunBackend(std::move(module), backend().default_stream_executor(),
+                       /*device_allocator=*/nullptr)
+          .status());
+}
+
 // A GTE inside of a fusion node inherits the layout of its operand (which
 // should, if we keep following operands, eventually be a parameter).
 TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
@@ -629,33 +708,29 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
       LayoutUtil::MakeLayout({2, 1, 0}));
   AssignLayouts(module.get(), &computation_layout);
 
-  HloComputation* fused_computation = *std::find_if(
-      module->computations().begin(), module->computations().end(),
-      [](const HloComputation* c) { return c->name() == "fused_computation"; });
-
-  auto fused_instr = [&](const string& name) {
-    auto it = std::find_if(
-        fused_computation->instructions().begin(),
-        fused_computation->instructions().end(),
-        [&](const HloInstruction* i) { return i->name() == name; });
-    CHECK(it != fused_computation->instructions().end());
-    return *it;
+  auto layout_of = [&](tensorflow::StringPiece name) {
+    return FindInstruction(module.get(), name)
+        ->shape()
+        .layout()
+        .minor_to_major();
   };
 
-  EXPECT_THAT(fused_instr("gte0")->shape().layout().minor_to_major(),
-              ElementsAre(0, 1, 2));
-  EXPECT_THAT(
-      fused_instr("gte1")->shape().tuple_shapes(0).layout().minor_to_major(),
-      ElementsAre(1, 2, 0));
-  EXPECT_THAT(
-      fused_instr("gte1")->shape().tuple_shapes(1).layout().minor_to_major(),
-      ElementsAre(2, 0, 1));
-  EXPECT_THAT(fused_instr("gte1a")->shape().layout().minor_to_major(),
+  EXPECT_THAT(layout_of("gte0"), ElementsAre(0, 1, 2));
+  EXPECT_THAT(layout_of("gte1a"), ElementsAre(1, 2, 0));
+  EXPECT_THAT(layout_of("gte1b"), ElementsAre(2, 0, 1));
+  EXPECT_THAT(layout_of("fresult"), ElementsAre(2, 1, 0));
+  EXPECT_THAT(FindInstruction(module.get(), "gte1")
+                  ->shape()
+                  .tuple_shapes(0)
+                  .layout()
+                  .minor_to_major(),
               ElementsAre(1, 2, 0));
-  EXPECT_THAT(fused_instr("gte1b")->shape().layout().minor_to_major(),
+  EXPECT_THAT(FindInstruction(module.get(), "gte1")
+                  ->shape()
+                  .tuple_shapes(1)
+                  .layout()
+                  .minor_to_major(),
               ElementsAre(2, 0, 1));
-  EXPECT_THAT(fused_instr("fresult")->shape().layout().minor_to_major(),
-              ElementsAre(2, 1, 0));
 }
 
 TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
@@ -721,5 +796,26 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
   EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy);
 }
 
+TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) {
+  auto builder = HloComputation::Builder(TestName());
+  auto constant0 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      constant0->shape(), HloOpcode::kBitcast, constant0));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  ComputationLayout computation_layout(
+      module->entry_computation()->ComputeProgramShape());
+  LayoutAssignment layout_assignment(&computation_layout);
+  Status error_status = layout_assignment.Run(module.get()).status();
+  EXPECT_FALSE(error_status.ok());
+  EXPECT_THAT(
+      error_status.error_message(),
+      ::testing::HasSubstr(
+          "Unexpected bitcast operation seen during layout assignment"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 911b243fe28a5baf8a4b8ed752b892265f5388ac..b17c9d504501a907e27d5152e0082799e87443c7 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -23,7 +23,7 @@ limitations under the License.
 namespace xla {
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+    std::vector<std::vector<se::StreamExecutor*>> stream_execs,
     DeviceMemoryAllocator* device_allocator) {
   // Tensorflow tries to enable the following behaviors in all its threads:
   //
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index d74e81bb7f622ac5e89203a3d02ca5ad839da07e..f1c623508c5307f2b1c036d3ec6823b75c7eda13 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler {
   // Bring in
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       se::StreamExecutor* stream_exec,
   //       DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       se::StreamExecutor* stream_exec,
   //       DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs,
+      std::vector<std::vector<se::StreamExecutor*>> stream_execs,
       DeviceMemoryAllocator* device_allocator) override;
 
  protected:
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 37261ed1e665ebed9685751161a412ad114a9e96..f1e7fc29532ce7e6841010a5258f4000a7c70383 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -169,17 +169,3 @@ cc_library(
         "@llvm//:core",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 6384c7f46f5ebbedaeda232b40095611a5d738a4..3312a888443233139841ce7a5e3173f907605e1d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -29,18 +29,13 @@ limitations under the License.
 namespace xla {
 namespace llvm_ir {
 
-IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
-                      llvm::IRBuilder<>* ir_builder)
-    : multidim_(ShapeUtil::Rank(shape)),
-      linear_(linear),
-      layout_(shape.layout()),
-      dims_(shape.dimensions().begin(), shape.dimensions().end()) {
-  CHECK(LayoutUtil::HasLayout(shape))
-      << "Shape " << ShapeUtil::HumanStringWithLayout(shape)
-      << " should have a layout.";
+static void Delinearize(std::vector<llvm::Value*>* multidim,
+                        llvm::Value* linear, const Shape& shape,
+                        llvm::IRBuilder<>* ir_builder) {
   int64 divisor = 1;
-  for (int64 i = 0; i < layout_.minor_to_major_size(); ++i) {
-    int64 dimension = layout_.minor_to_major(i);
+  const Layout& layout = shape.layout();
+  for (int64 i = 0; i < layout.minor_to_major_size(); ++i) {
+    int64 dimension = layout.minor_to_major(i);
     int64 size_of_current_dimension = shape.dimensions(dimension);
 
     // If i is not the last dimension, compute
@@ -54,16 +49,28 @@ IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
     // memory lives in one big allocation, so cuda-memcheck can't detect
     // out-of-bounds accesses.
     auto* quot = ir_builder->CreateUDiv(linear, ir_builder->getInt64(divisor));
-    if (i < layout_.minor_to_major_size() - 1) {
-      multidim_[dimension] = ir_builder->CreateURem(
+    if (i < layout.minor_to_major_size() - 1) {
+      (*multidim)[dimension] = ir_builder->CreateURem(
           quot, ir_builder->getInt64(size_of_current_dimension));
     } else {
-      multidim_[dimension] = quot;
+      (*multidim)[dimension] = quot;
     }
     divisor *= size_of_current_dimension;
   }
 }
 
+IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
+                      llvm::IRBuilder<>* ir_builder)
+    : multidim_(ShapeUtil::Rank(shape)),
+      linear_(linear),
+      layout_(shape.layout()),
+      dims_(shape.dimensions().begin(), shape.dimensions().end()) {
+  CHECK(LayoutUtil::HasLayout(shape))
+      << "Shape " << ShapeUtil::HumanStringWithLayout(shape)
+      << " should have a layout.";
+  Delinearize(&multidim_, linear, shape, ir_builder);
+}
+
 IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
                       llvm::Value* linear, const Shape& shape)
     : multidim_(multidim.begin(), multidim.end()),
@@ -83,7 +90,6 @@ IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
       dims_(shape.dimensions().begin(), shape.dimensions().end()) {
   CHECK_EQ(shape.dimensions_size(), multidim.size());
   CHECK(LayoutUtil::HasLayout(shape));
-  linear_ = Linearize(AsInt64Slice(shape.dimensions()), ir_builder);
 }
 
 IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
@@ -106,16 +112,13 @@ IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
   }
 }
 
-// Returns whether given linear index valid on given shape.
+// Returns whether the given linear index is valid on the given shape.
 bool IrArray::Index::LinearValidOnShape(const Shape& a) const {
-  auto b = ShapeUtil::MakeShape(PRED /* irrelevant */, dims_);
+  auto b = ShapeUtil::MakeShape(a.element_type(), dims_);
   *b.mutable_layout() = layout_;
   return linear_ != nullptr &&
-         ContainersEqual(
-             ShapeUtil::StripDegenerateDimensions(a).dimensions(),
-             ShapeUtil::StripDegenerateDimensions(b).dimensions()) &&
-         LayoutUtil::Equal(ShapeUtil::StripDegenerateDimensions(a).layout(),
-                           ShapeUtil::StripDegenerateDimensions(b).layout());
+         ShapeUtil::ElementsIn(a) == ShapeUtil::ElementsIn(b) &&
+         ShapeUtil::ReshapeIsBitcast(a, b);
 }
 
 IrArray::Index IrArray::Index::SourceIndexOfReshape(
@@ -160,7 +163,8 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape(
     }
   }
 
-  if (linear() != nullptr &&
+  if (linear() != nullptr && LayoutUtil::HasLayout(input_shape) &&
+      LayoutUtil::HasLayout(output_shape) &&
       ShapeUtil::ReshapeIsBitcast(input_shape, output_shape)) {
     return Index(source_multidim_index, linear(), input_shape);
   }
@@ -195,13 +199,111 @@ IrArray::Index IrArray::Index::SourceIndexOfTranspose(
     llvm::IRBuilder<>* builder) const {
   std::vector<llvm::Value*> operand_multidim_index =
       Permute(dimension_mapping, multidim());
-  if (linear() != nullptr &&
+
+  if (linear() != nullptr && LayoutUtil::HasLayout(operand_shape) &&
+      LayoutUtil::HasLayout(shape) &&
       ShapeUtil::TransposeIsBitcast(operand_shape, shape, dimension_mapping)) {
     return Index(operand_multidim_index, linear(), operand_shape);
   }
+
   return Index(operand_multidim_index);
 }
 
+IrArray::Index IrArray::Index::SourceIndexOfBitcast(
+    const Shape& shape, const Shape& operand_shape,
+    llvm::IRBuilder<>* builder) const {
+  CHECK(LayoutUtil::HasLayout(shape) && LayoutUtil::HasLayout(operand_shape));
+  // In case the bitcast is just a reshape, we can use SourceIndexOfReshape()
+  // instead. This will reuse linear() if possible, so we don't have to build a
+  // new 'linear_index'.
+  if (ShapeUtil::ReshapeIsBitcast(operand_shape, shape)) {
+    return SourceIndexOfReshape(shape, operand_shape, builder);
+  }
+
+  // First linearize the index coming from the output of the bitcast. We want
+  // the physical index of the element in the buffer. This is like Linearize,
+  // but takes the layout into account.
+  int64 scale = 1;
+  llvm::Value* linear_index = builder->getInt64(0);
+  for (auto dimension : LayoutUtil::MinorToMajor(shape)) {
+    linear_index = builder->CreateAdd(
+        linear_index,
+        builder->CreateMul(multidim_[dimension], builder->getInt64(scale), "",
+                           /*HasNUW=*/true, /*HasNSW=*/true),
+        "", /*HasNUW=*/true, /*HasNSW=*/true);
+    scale *= shape.dimensions(dimension);
+  }
+
+  // Now delinearize it for the input of the bitcast.
+  std::vector<llvm::Value*> multi_index(operand_shape.dimensions_size());
+  Delinearize(&multi_index, linear_index, operand_shape, builder);
+
+  return Index(multi_index, linear_index, operand_shape);
+}
+
+IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
+    const Shape& shape, const Shape& operand_shape,
+    tensorflow::gtl::ArraySlice<int64> dimension_mapping,
+    llvm::IRBuilder<>* builder) const {
+  int64 rank = ShapeUtil::Rank(operand_shape);
+  std::vector<llvm::Value*> source_index(rank);
+  for (int64 i = 0; i < rank; ++i) {
+    source_index[i] = multidim_[dimension_mapping[i]];
+  }
+  if (linear_ == nullptr || !LayoutUtil::HasLayout(operand_shape) ||
+      !LayoutUtil::HasLayout(shape)) {
+    return Index(source_index);
+  }
+  // High-level idea: we can reuse the linear index if the broadcasted
+  // dimensions are contiguous, and this part of the operation is a bitcast.
+  // The other dimensions can be masked out with a div and a mod operation.
+  std::vector<int64> logical_to_physical =
+      LayoutUtil::MakeLogicalToPhysical(shape.layout());
+  int64 output_rank = ShapeUtil::Rank(shape);
+  // The minimum physical dimension that is broadcasted.
+  int64 min_broadcasted_dimension = output_rank;
+  // The maximum physical dimension that is broadcasted.
+  int64 max_broadcasted_dimension = -1;
+  for (int64 i = 0; i < rank; ++i) {
+    int64 physical_dim = logical_to_physical[dimension_mapping[i]];
+    min_broadcasted_dimension =
+        std::min(min_broadcasted_dimension, physical_dim);
+    max_broadcasted_dimension =
+        std::max(max_broadcasted_dimension, physical_dim);
+  }
+  bool contiguous_broadcast_dimensions =
+      max_broadcasted_dimension - min_broadcasted_dimension == rank - 1;
+  if (!contiguous_broadcast_dimensions) {
+    return Index(source_index);
+  }
+  // Check if the mapped dimensions are a bitcast.
+  std::vector<int64> operand_logical_to_physical =
+      LayoutUtil::MakeLogicalToPhysical(operand_shape.layout());
+  for (int64 i = 0; i < rank; ++i) {
+    if (operand_logical_to_physical[i] !=
+        logical_to_physical[dimension_mapping[i]] - min_broadcasted_dimension) {
+      return Index(source_index);
+    }
+  }
+  llvm::Value* linear = linear_;
+  int64 divisor = 1;
+  for (int64 i = max_broadcasted_dimension + 1; i < output_rank; ++i) {
+    divisor *= shape.dimensions(LayoutUtil::Major(shape.layout(), i));
+  }
+  if (divisor > 1) {
+    linear = builder->CreateUDiv(linear, builder->getInt64(divisor));
+  }
+  if (min_broadcasted_dimension > 0) {
+    int64 mod = 1;
+    for (int64 i = min_broadcasted_dimension; i <= max_broadcasted_dimension;
+         ++i) {
+      mod *= shape.dimensions(LayoutUtil::Major(shape.layout(), i));
+    }
+    linear = builder->CreateURem(linear, builder->getInt64(mod));
+  }
+  return Index(source_index, linear, operand_shape);
+}
+
 llvm::Value* IrArray::Index::Linearize(
     tensorflow::gtl::ArraySlice<int64> dimensions,
     llvm::IRBuilder<>* builder) const {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 387d4629125cbb791840e943013188d14159908a..4c3195c29c859c9eef08e3f6531b059edbebfc47 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -76,8 +76,7 @@ class IrArray {
           llvm::IRBuilder<>* ir_builder);
 
     // Constructs an index from the given multi-dimensional index and the shape
-    // that it indexes into. Also, computes the linear index according to
-    // "shape".
+    // that it indexes into.
     //
     // Precondition: "shape" has a layout.
     Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
@@ -98,6 +97,10 @@ class IrArray {
     llvm::Value*& operator[](size_t i) { return multidim()[i]; }
 
     void push_back(llvm::Value* value) { multidim().push_back(value); }
+    void InsertAt(int64 index, llvm::Value* value) {
+      CHECK_LE(index, size());
+      multidim().insert(multidim().begin() + index, value);
+    }
 
     using iterator = std::vector<llvm::Value*>::iterator;
     using const_iterator = std::vector<llvm::Value*>::const_iterator;
@@ -134,6 +137,18 @@ class IrArray {
         tensorflow::gtl::ArraySlice<int64> dimension_mapping,
         llvm::IRBuilder<>* builder) const;
 
+    // Given that "this" is the target index of a bitcast from `operand_shape`
+    // to `shape`, returns the source index.
+    Index SourceIndexOfBitcast(const Shape& shape, const Shape& operand_shape,
+                               llvm::IRBuilder<>* builder) const;
+
+    // Given that "this" is the target index of a broadcast from `operand_shape`
+    // to `shape` with the given dimension mapping, returns the source index.
+    Index SourceIndexOfBroadcast(
+        const Shape& shape, const Shape& operand_shape,
+        tensorflow::gtl::ArraySlice<int64> dimension_mapping,
+        llvm::IRBuilder<>* builder) const;
+
     // Linearizes the index into the given shape, i.e. reshapes it to rank-1 and
     // returns the index into the sole dimension 0 of the new shape.
     llvm::Value* Linearize(tensorflow::gtl::ArraySlice<int64> dimensions,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 5c1866311d1ae1e0c33ab061ee326d86d647a908..ec04239b4f9112134ba876fdfbb3905a3baf1f72 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -106,8 +107,10 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
     auto cmp = ir_builder->CreateFCmpUGE(lhs_value, rhs_value);
     return ir_builder->CreateSelect(cmp, lhs_value, rhs_value);
   } else {
-    return EmitCallToIntrinsic(llvm::Intrinsic::maxnum, {lhs_value, rhs_value},
-                               {lhs_value->getType()}, ir_builder);
+    auto cmp_ge = ir_builder->CreateFCmpOGE(lhs_value, rhs_value);
+    auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value);
+    auto sel_lhs = ir_builder->CreateOr(cmp_ge, lhs_is_nan);
+    return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value);
   }
 }
 
@@ -117,8 +120,10 @@ llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
     auto cmp = ir_builder->CreateFCmpULE(lhs_value, rhs_value);
     return ir_builder->CreateSelect(cmp, lhs_value, rhs_value);
   } else {
-    return EmitCallToIntrinsic(llvm::Intrinsic::minnum, {lhs_value, rhs_value},
-                               {lhs_value->getType()}, ir_builder);
+    auto cmp_le = ir_builder->CreateFCmpOLE(lhs_value, rhs_value);
+    auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value);
+    auto sel_lhs = ir_builder->CreateOr(cmp_le, lhs_is_nan);
+    return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value);
   }
 }
 
@@ -758,7 +763,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
     fake_argv_storage.push_back("");
     for (const auto& it : options) {
       // Skip options the XLA backend itself consumes.
-      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
+      if (!tensorflow::str_util::StartsWith(it.first, "xla_")) {
         if (it.second.empty()) {
           fake_argv_storage.push_back(it.first);
         } else {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index b6b918ec78a27b90325f72eea14b97f9aee43c54..3978acc132f34b8b195d3772ccf71d0d467984db 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -88,12 +88,12 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
   }
 }
 
-IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   if (ShapeUtil::IsScalar(shape_)) {
     // No loop needed, so set exit_bb_ to nullptr.
     exit_bb_ = nullptr;
-    return IrArray::Index();
+    return {IrArray::Index()};
   }
 
   // Create loop nest with one for-loop for each dimension of the target shape.
@@ -121,12 +121,14 @@ IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK_NOTNULL(exit_bb_);
 
-  return array_index;
+  return {array_index};
 }
 
 tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
-  IrArray::Index array_index = EmitIndexAndSetExitBasicBlock(loop_name);
-  TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  for (const IrArray::Index& array_index :
+       EmitIndexAndSetExitBasicBlock(loop_name)) {
+    TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  }
 
   // Set the insertion point of ir_builder_ to the loop exit, so that
   // code emitted for later instructions will be correctly placed.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 0fc528439a0d5bf8382dfcf2d8b3051f8900bf1d..9ff497aecd0bc964c929205c7fd410cca87d9b77 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -63,11 +63,12 @@ class LoopEmitter {
 
   // Emits a loop nest (with a yet-to-be-filled loop body) that iterates through
   // every element in the given shape. Returns the multi-dimensional index that
-  // specifies the element.
-  IrArray::Index EmitIndexAndSetExitBasicBlock() {
+  // specifies the element, will return multiple indices if the loop is
+  // unrolled.
+  std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock() {
     return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"");
   }
-  virtual IrArray::Index EmitIndexAndSetExitBasicBlock(
+  virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name);
 
   // Emits a complete loop nest for every element in the given shape.
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 07f989d4faea199e812e54d2ae74d3ff9e7fa19a..0fa4061738612df76c72a18a9353f16bf6a42677 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -43,13 +43,11 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
     const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
@@ -69,6 +67,68 @@ LocalService::LocalService(const ServiceOptions& options,
                            std::unique_ptr<Backend> execute_backend)
     : Service(options, std::move(execute_backend)) {}
 
+namespace {
+
+// Retrieves the parameter metadata for the given computation and parameter
+// number.
+//
+// If the parameter number is invalid for this computation, nullopt is
+// returned. When the return value has_value(), nullptr will never be
+// the held value.
+tensorflow::gtl::optional<const OpMetadata*> ParameterMetadata(
+    const XlaComputation& computation, int parameter_number) {
+  for (const HloComputationProto& comp : computation.proto().computations()) {
+    if (comp.id() == computation.proto().entry_computation_id()) {
+      for (const HloInstructionProto& instr : comp.instructions()) {
+        if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
+            instr.parameter_number() == parameter_number) {
+          if (!instr.has_metadata()) {
+            return tensorflow::gtl::nullopt;
+          }
+          return &instr.metadata();
+        }
+      }
+    }
+  }
+  return tensorflow::gtl::nullopt;
+}
+
+ExecutionOptions CreateExecutionOptions(
+    const ExecutableBuildOptions& build_options,
+    const ProgramShape* program_shape) {
+  ExecutionOptions execution_options = CreateDefaultExecutionOptions();
+  if (build_options.hlo_profile().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_hlo_profile(
+        *build_options.hlo_profile());
+  }
+  if (build_options.generate_hlo_graph().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
+        build_options.generate_hlo_graph().value());
+  }
+  if (build_options.dump_optimized_hlo_proto_to().has_value()) {
+    execution_options.mutable_debug_options()
+        ->set_xla_dump_optimized_hlo_proto_to(
+            build_options.dump_optimized_hlo_proto_to().value());
+  }
+  if (build_options.dump_per_pass_hlo_proto_to().has_value()) {
+    execution_options.mutable_debug_options()
+        ->set_xla_dump_per_pass_hlo_proto_to(
+            build_options.dump_per_pass_hlo_proto_to().value());
+  }
+  if (build_options.result_layout() != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *build_options.result_layout();
+  } else {
+    *execution_options.mutable_shape_with_output_layout() =
+        program_shape->result();
+    LayoutUtil::SetToDefaultLayout(
+        execution_options.mutable_shape_with_output_layout());
+  }
+  return execution_options;
+}
+
+}  // namespace
+
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
@@ -118,30 +178,78 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
         *build_options.result_layout(), program_shape->result()));
   }
 
-  ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (build_options.generate_hlo_graph().has_value()) {
-    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
-        build_options.generate_hlo_graph().value());
+  ExecutionOptions execution_options =
+      CreateExecutionOptions(build_options, program_shape.get());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
+                      CreateModuleConfig(*program_shape, argument_layouts,
+                                         &execution_options, user_computation));
+
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(build_options.device_ordinal()));
+
+  return BuildExecutable(versioned_handle, std::move(module_config),
+                         execute_backend_.get(), executor,
+                         build_options.device_allocator());
+}
+
+StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
+    const XlaComputation& computation,
+    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+    const ExecutableBuildOptions& build_options) {
+  const HloModuleProto& proto = computation.proto();
+  TF_RET_CHECK(proto.has_program_shape());
+  const ProgramShape& program_shape = proto.program_shape();
+
+  // Validate incoming layouts.
+  if (argument_layouts.size() != program_shape.parameters_size()) {
+    return InvalidArgument(
+        "Invalid number of arguments for computation: expected %d, got %zu.",
+        program_shape.parameters_size(), argument_layouts.size());
+  }
+
+  for (int i = 0; i < argument_layouts.size(); ++i) {
+    const Shape& argument_shape = *argument_layouts[i];
+    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape));
+    if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) {
+      tensorflow::gtl::optional<const OpMetadata*> metadata =
+          ParameterMetadata(computation, /*parameter_number=*/i);
+      auto metadata_string = [&metadata]() -> string {
+        if (!metadata.has_value()) {
+          return "";
+        }
+        CHECK(metadata.value() != nullptr);
+        const OpMetadata& m = *metadata.value();
+        if (!m.source_file().empty()) {
+          return tensorflow::strings::Printf(
+              " (%s:%d)", m.source_file().c_str(), m.source_line());
+        }
+        return "";
+      };
+      return InvalidArgument(
+          "Invalid argument shape for argument %d%s, expected %s, got %s.", i,
+          metadata_string().c_str(),
+          ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
+          ShapeUtil::HumanString(argument_shape).c_str());
+    }
   }
   if (build_options.result_layout() != nullptr) {
-    *execution_options.mutable_shape_with_output_layout() =
-        *build_options.result_layout();
-  } else {
-    *execution_options.mutable_shape_with_output_layout() =
-        program_shape->result();
-    LayoutUtil::SetToDefaultLayout(
-        execution_options.mutable_shape_with_output_layout());
+    TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(
+        *build_options.result_layout(), program_shape.result()));
   }
+
+  ExecutionOptions execution_options =
+      CreateExecutionOptions(build_options, &program_shape);
+
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, argument_layouts, &execution_options,
-                         *user_computation));
+      CreateModuleConfig(program_shape, argument_layouts, &execution_options));
 
   TF_ASSIGN_OR_RETURN(
       se::StreamExecutor * executor,
       execute_backend_->stream_executor(build_options.device_ordinal()));
 
-  return BuildExecutable(versioned_handle, std::move(module_config),
+  return BuildExecutable(proto, std::move(module_config),
                          execute_backend_.get(), executor,
                          build_options.device_allocator());
 }
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 15e120685e1be9190d49fdaf5ed6706bdf991a6c..06567cabd6eb28aae53881613cd6beb78e25e222 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -50,6 +51,18 @@ class LocalService : public Service {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
+  // Builds an Executable with the given XlaComputation, argument layouts and
+  // options. If result_layout is non-null, then the executable is compiled to
+  // produce a result of the given layout.  If device_allocator is non-null,
+  // then the compiler may use it to allocate temp space on the device.  The
+  // compiler is responsible for freeing any memory it allocates this way.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Executable>> CompileExecutable(
+      const XlaComputation& computation,
+      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+      const ExecutableBuildOptions& build_options);
+
   // Returns the device ordinal that corresponds to the given replica number.
   //
   // This returns an error if there is not a one-to-one correspondence of
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index 7d8c05fffa4ab11d7dbf9956d2cb7ebd5bcdd3c4..f74bcb0b79355c8e69890487266cbc5f2a4500be 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -53,17 +53,18 @@ NameUniquer::NameUniquer(const string& separator) {
 }
 
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
-  string root = prefix.empty() ? "name" : prefix.ToString();
-  root = GetSanitizedName(root);
+  string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString());
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
+  bool has_numeric_suffix = false;
+  int64 numeric_suffix = 0;
   size_t separator_index = root.rfind(separator_);
   if (separator_index != string::npos && (separator_index > 0) &&
       (separator_index < root.size() - 1)) {
     string after_suffix = root.substr(separator_index + 1);
-    int64 numeric_suffix;
     if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+      has_numeric_suffix = true;
       // Remove numeric suffix from root.
       root = root.substr(0, separator_index);
       // Update count to at least the numeric suffix value to avoid future
@@ -71,11 +72,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
       generated_names_[root] = std::max(generated_names_[root], numeric_suffix);
     }
   }
-
   int64* count = &(generated_names_[root]);
   if (*count == 0) {
     *count = 1;
-    return root;
+    return has_numeric_suffix ? tensorflow::strings::StrCat(root, separator_, 0)
+                              : root;
   } else {
     tensorflow::strings::StrAppend(&root, separator_, *count);
     // Increment lookup under old 'root' name.
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 4258cf16876ab46dce6df062ab701b1b1a4a7580..2ec255558c4ed3695ec6c824458cbedac44dc297 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -57,11 +57,18 @@ TEST_F(NameUniquerTest, NumericSuffixes) {
   EXPECT_EQ("foo.55", uniquer.GetUniqueName("foo"));
   EXPECT_EQ("foo.55.1", uniquer.GetUniqueName("foo.55.1"));
   EXPECT_EQ("foo.55.2", uniquer.GetUniqueName("foo.55.1"));
-  EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000"));
+  EXPECT_EQ("bar.0", uniquer.GetUniqueName("bar.-1000"));
   EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000"));
   EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1"));
 }
 
+TEST_F(NameUniquerTest, PrefixHasSuffix) {
+  NameUniquer uniquer(".");
+
+  EXPECT_EQ("foo.11.0", uniquer.GetUniqueName("foo.11.0"));
+  EXPECT_EQ("foo.11", uniquer.GetUniqueName("foo.11"));
+}
+
 TEST_F(NameUniquerTest, Sanitize) {
   NameUniquer uniquer("_");
 
@@ -73,7 +80,7 @@ TEST_F(NameUniquerTest, Sanitize) {
   EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo"));
 
   // Invalid characters will be replaced with '_'.
-  EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000"));
+  EXPECT_EQ("bar_0", uniquer.GetUniqueName("bar<-1000"));
   EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000"));
   EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1"));
 
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..586f6ef7a9c4f17f69340e77be17aec2f677a791
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -0,0 +1,1013 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+
+// A pattern matcher for HloInstructions, Shapes, and Layouts.
+//
+// The Match function's first argument must be HloInstruction*, Shape*, or
+// Layout*. The second argument is a pattern that will be matched against the
+// first argument, as described below.
+//
+// Patterns are constructed using the match::Op, match::Shape, or match::Layout
+// functions. By default, the returned patterns will match any HloInstruction,
+// Shape, or Layout, respectively. However the match can be made more specific
+// by using the pattern's modifier methods, for example:
+//
+//   match::Op().WithOpcode(HloOpcode::kAdd).WithOperand(
+//     0, match::Op().WithOpcode(HloOpcode::kConstant))
+//
+// This pattern will match Add instructions whose first operand is a constant.
+//
+// Each pattern type has the following modifiers:
+//
+//   Op():
+//     - WithName: match operations with the given name
+//     - WithOpcode: match operations with the given opcode
+//     - WithShape: match operations whose shape matches the given pattern
+//     - WithOperand: match operations whose operand matches the given pattern
+//
+//   Shape():
+//     - EqualTo: matches shapes that are equal to the argument
+//     - CompatibleTo: matches shapes that are compatible to the argument
+//     - IsScalar/IsArray/IsTuple: matches scalar/array/tuple shapes
+//     - IsDenseArray/IsSparseArray: matches arrays with dense/sparse format
+//     - WithLayout: match shapes whose layout matches the given pattern
+//     - WithLayoutEqualTo: matches shapes whose layouts equal the argument
+//     - WithSubshape: matches tuple shapes whose subshape matches the given
+//       pattern
+//     - WithSubshapeEqualTo: matches shapes with a subshape equal the argument
+//     - WithElementType: matches array/scalar shapes with the given element
+//       type
+//     - WithRank: matches array/scalar types with the given rank
+//
+//  Layout():
+//     - EqualTo: matches layouts that are equal to the argument
+//     - WithDenseFormat/WithSparseFormat: matches layouts with dense/sparse
+//       format
+//
+// Op(), Shape(), and Layout() may be passed an argument of type
+// HloInstruction**, Shape**, or Layout**, respectively, or const versions of
+// these pointers. If the pattern is matched, the address of the matched value
+// will be "captured" and stored at this location.
+//
+// For example:
+//   HloInstruction* foo = ...;
+//   HloInstruction* matched_operand;
+//   CHECK(Match(foo,
+//               match::Op().WithOperand(0, match::Op(&matched_operand))));
+//
+// Helpers are provided for common nullary, unary, binary, and ternary
+// instructions. These helpers can be called with no arguments, in which case
+// they will match any instruction matching the opcode. They may also be called
+// with matches for the operands and with an optional capture. (The capture must
+// be the first argument.) Some examples of these helpers and their equivalents
+// are provided below.
+//
+// Example nullary instruction:
+//   Recv()                            == Op().WithOpcode(HloOpcode::kRecv)
+//   Recv(&a)                          == Op(&a).WithOpcode(HloOpcode::kRecv)
+//
+// Example unary instruction:
+//   Abs()                             == Op().WithOpcode(HloOpcode::kAbs)
+//   Abs(Op(&a))                       == Op().WithOpcode(HloOpcode::kAbs)
+//                                            .WithOperand(0, Op(&a)))
+//   Abs(&a, Op(&b))                   == Op(&a).WithOpcode(HloOpcode::kAbs)
+//                                              .WithOperand(0, Op(&b))
+//
+// Example binary instruction:
+//   Add()                             == Op().WithOpcode(HloOpcode::kAdd)
+//   Add(Op(&a), Op(&b))               == Op().WithOpcode(HloOpcode::kAdd)
+//                                            .WithOperand(0, Op(&a))
+//                                            .WithOperand(1, Op(&b))
+//   Add(&a, Op(&b), Op(&c))           == Op(&a).WithOpcode(HloOpcode::kAdd)
+//                                              .WithOperand(0, Op(&b))
+//                                              .WithOperand(1, Op(&c))
+//
+// Example ternary instruction:
+//   Clamp()                           == Op().WithOpcode(HloOpcode::kClamp)
+//   Clamp(Op(&a), Op(&b), Op(&c))     == Op().WithOpcode(HloOpcode::kClamp)
+//                                            .WithOperand(0, Op(&a))
+//                                            .WithOperand(1, Op(&b))
+//                                            .WithOperand(2, Op(&c))
+//   Clamp(&a, Op(&b), Op(&c), Op(&d)) == Op(&a).WithOpcode(HloOpcode::kClamp)
+//                                              .WithOperand(0, Op(&b))
+//                                              .WithOperand(1, Op(&c))
+//                                              .WithOperand(2, Op(&d))
+//
+template <typename Value, typename Pattern>
+bool Match(Value* value, const Pattern& pattern) {
+  return pattern.Match(value);
+}
+
+namespace match {
+
+namespace detail {
+
+template <typename LayoutType, typename Impl>
+class LayoutPattern;
+
+// The base LayoutPattern implementation. Matches only if the layout is not
+// nullptr.
+class LayoutPatternBaseImpl {
+ public:
+  bool Match(const ::xla::Layout* layout) const { return layout != nullptr; }
+};
+
+// A LayoutPattern implementation that matches only if the layout equals a
+// Layout proto.
+template <typename Previous>
+class LayoutPatternEqualImpl {
+ public:
+  explicit constexpr LayoutPatternEqualImpl(const Previous& previous,
+                                            const ::xla::Layout* layout)
+      : previous_(previous), layout_(layout) {}
+
+  bool Match(const ::xla::Layout* layout) const {
+    return previous_.Match(layout) && LayoutUtil::Equal(*layout_, *layout);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Layout* layout_;
+};
+
+// A LayoutPattern implementation that matches only if the layout has a given
+// format.
+template <typename Previous>
+class LayoutPatternFormatImpl {
+ public:
+  explicit constexpr LayoutPatternFormatImpl(const Previous& previous,
+                                             Format format)
+      : previous_(previous), format_(format) {}
+
+  bool Match(const ::xla::Layout* layout) const {
+    return previous_.Match(layout) && layout->format() == format_;
+  }
+
+ private:
+  Previous previous_;
+  Format format_;
+};
+
+// A pattern that matches Layouts.
+template <typename LayoutType, typename Impl>
+class LayoutPattern {
+ public:
+  explicit constexpr LayoutPattern(const Impl& impl,
+                                   LayoutType** matched_layout)
+      : impl_(impl), matched_layout_(matched_layout) {}
+
+  // Returns true and captures the layout iff it matches the pattern.
+  bool Match(const ::xla::Layout* layout) const {
+    if (impl_.Match(layout)) {
+      if (matched_layout_) {
+        *matched_layout_ = layout;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the layout iff it matches the pattern.
+  bool Match(::xla::Layout* layout) const {
+    if (impl_.Match(layout)) {
+      if (matched_layout_) {
+        *matched_layout_ = layout;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the layout equals the given proto.
+  // The layout must outlive the returned pattern.
+  constexpr LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>> EqualTo(
+      const Layout* layout) const {
+    return LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>>(
+        LayoutPatternEqualImpl<Impl>(impl_, layout), matched_layout_);
+  }
+
+  // Modifies the pattern to match only if the layout has a dense format.
+  constexpr LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>
+  WithDenseFormat() const {
+    return LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>(
+        LayoutPatternFormatImpl<Impl>(impl_, DENSE), matched_layout_);
+  }
+
+  // Modifies the pattern to match only if the layout has a sparse format.
+  constexpr LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>
+  WithSparseFormat() const {
+    return LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>(
+        LayoutPatternFormatImpl<Impl>(impl_, SPARSE), matched_layout_);
+  }
+
+ private:
+  Impl impl_;
+  LayoutType** matched_layout_;
+};
+
+}  // namespace detail
+
+// Creates a layout pattern that will capture the matched layout in the
+// argument.
+inline constexpr detail::LayoutPattern<const ::xla::Layout,
+                                       detail::LayoutPatternBaseImpl>
+Layout(const ::xla::Layout** matched_layout = nullptr) {
+  return detail::LayoutPattern<const ::xla::Layout,
+                               detail::LayoutPatternBaseImpl>(
+      detail::LayoutPatternBaseImpl(), matched_layout);
+}
+
+// Creates a layout pattern that will capture the matched layout in the
+// argument.
+inline constexpr detail::LayoutPattern<::xla::Layout,
+                                       detail::LayoutPatternBaseImpl>
+Layout(::xla::Layout** matched_layout) {
+  return detail::LayoutPattern<::xla::Layout, detail::LayoutPatternBaseImpl>(
+      detail::LayoutPatternBaseImpl(), matched_layout);
+}
+
+namespace detail {
+
+template <typename ShapeType, typename Impl>
+class ShapePattern;
+
+// The base ShapePattern implementation. Matches only if the shape is not
+// nullptr.
+class ShapePatternBaseImpl {
+ public:
+  bool Match(const ::xla::Shape* shape) const { return shape != nullptr; }
+};
+
+// A ShapePattern implementation that matches only if the shape equals a Shape
+// proto.
+template <typename Previous>
+class ShapePatternEqualImpl {
+ public:
+  explicit constexpr ShapePatternEqualImpl(const Previous& previous,
+                                           const ::xla::Shape* shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Equal(*shape_, *shape);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Shape* shape_;
+};
+
+// A ShapePattern implementation that matches only if the shape is compatible to
+// a Shape proto.
+template <typename Previous>
+class ShapePatternCompatibleImpl {
+ public:
+  explicit constexpr ShapePatternCompatibleImpl(const Previous& previous,
+                                                const ::xla::Shape* shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Compatible(*shape_, *shape);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Shape* shape_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a given
+// element type.
+template <typename Previous>
+class ShapePatternElementTypeImpl {
+ public:
+  explicit constexpr ShapePatternElementTypeImpl(const Previous& previous,
+                                                 PrimitiveType element_type)
+      : previous_(previous), element_type_(element_type) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && shape->element_type() == element_type_;
+  }
+
+ private:
+  Previous previous_;
+  PrimitiveType element_type_;
+};
+
+// A ShapePattern implementation that matches only if the shape is scalar.
+template <typename Previous>
+class ShapePatternIsScalarImpl {
+ public:
+  explicit constexpr ShapePatternIsScalarImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsScalar(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape is an array
+template <typename Previous>
+class ShapePatternIsArrayImpl {
+ public:
+  explicit constexpr ShapePatternIsArrayImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsArray(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape is a tuple.
+template <typename Previous>
+class ShapePatternIsTupleImpl {
+ public:
+  explicit constexpr ShapePatternIsTupleImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsTuple(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a given
+// rank.
+template <typename Previous>
+class ShapePatternRankImpl {
+ public:
+  explicit constexpr ShapePatternRankImpl(const Previous& previous, int64 rank)
+      : previous_(previous), rank_(rank) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Rank(*shape) == rank_;
+  }
+
+ private:
+  Previous previous_;
+  int64 rank_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a layout
+// that matches a given pattern.
+template <typename Previous, typename LayoutType, typename LayoutImpl>
+class ShapePatternLayoutImpl {
+ public:
+  explicit constexpr ShapePatternLayoutImpl(
+      const Previous& previous,
+      const LayoutPattern<LayoutType, LayoutImpl>& layout)
+      : previous_(previous), layout_(layout) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && LayoutUtil::HasLayout(*shape) &&
+           layout_.Match(&shape->layout());
+  }
+
+  bool Match(Shape* shape) const {
+    return previous_.Match(shape) && LayoutUtil::HasLayout(*shape) &&
+           layout_.Match(shape->mutable_layout());
+  }
+
+ private:
+  Previous previous_;
+  LayoutPattern<LayoutType, LayoutImpl> layout_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a subshape
+// that matches a given pattern.
+template <typename Previous, typename SubshapeType, typename SubshapeImpl>
+class ShapePatternSubshapeImpl {
+ public:
+  explicit ShapePatternSubshapeImpl(
+      const Previous& previous, ShapeIndexView index,
+      const ShapePattern<SubshapeType, SubshapeImpl>& subshape)
+      : previous_(previous), index_(index), subshape_(subshape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IndexIsValid(*shape, index_) &&
+           subshape_.Match(&ShapeUtil::GetSubshape(*shape, index_));
+  }
+
+  bool Match(::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IndexIsValid(*shape, index_) &&
+           subshape_.Match(ShapeUtil::GetMutableSubshape(shape, index_));
+  }
+
+ private:
+  Previous previous_;
+  ShapeIndexView index_;
+  ShapePattern<SubshapeType, SubshapeImpl> subshape_;
+};
+
+// A pattern that matches Shapes.
+template <typename ShapeType, typename Impl>
+class ShapePattern {
+ public:
+  explicit constexpr ShapePattern(const Impl& impl, ShapeType** matched_shape)
+      : impl_(impl), matched_shape_(matched_shape) {}
+
+  // Returns true and captures the shape iff it matches the pattern.
+  bool Match(const ::xla::Shape* shape) const {
+    if (impl_.Match(shape)) {
+      if (matched_shape_) {
+        *matched_shape_ = shape;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the shape iff it matches the pattern.
+  bool Match(::xla::Shape* shape) const {
+    if (impl_.Match(shape)) {
+      if (matched_shape_) {
+        *matched_shape_ = shape;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the shape equals the given proto.
+  // The layout must outlive the returned pattern.
+  constexpr ShapePattern<ShapeType, ShapePatternEqualImpl<Impl>> EqualTo(
+      const ::xla::Shape* shape) const {
+    return ShapePattern<ShapeType, ShapePatternEqualImpl<Impl>>(
+        ShapePatternEqualImpl<Impl>(impl_, shape), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is compatible to the given
+  // proto. The layout must outlive the returned pattern.
+  constexpr ShapePattern<ShapeType, ShapePatternCompatibleImpl<Impl>>
+  CompatibleTo(const ::xla::Shape* shape) const {
+    return ShapePattern<ShapeType, ShapePatternCompatibleImpl<Impl>>(
+        ShapePatternCompatibleImpl<Impl>(impl_, shape), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has the given element type.
+  constexpr ShapePattern<ShapeType, ShapePatternElementTypeImpl<Impl>>
+  WithElementType(PrimitiveType element_type) const {
+    return ShapePattern<ShapeType, ShapePatternElementTypeImpl<Impl>>(
+        ShapePatternElementTypeImpl<Impl>(impl_, element_type), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is scalar.
+  constexpr ShapePattern<ShapeType, ShapePatternIsScalarImpl<Impl>> IsScalar()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsScalarImpl<Impl>>(
+        ShapePatternIsScalarImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is an array.
+  constexpr ShapePattern<ShapeType, ShapePatternIsArrayImpl<Impl>> IsArray()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsArrayImpl<Impl>>(
+        ShapePatternIsArrayImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is a tuple.
+  constexpr ShapePattern<ShapeType, ShapePatternIsTupleImpl<Impl>> IsTuple()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsTupleImpl<Impl>>(
+        ShapePatternIsTupleImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has the given rank.
+  constexpr ShapePattern<ShapeType, ShapePatternRankImpl<Impl>> WithRank(
+      int64 rank) const {
+    return ShapePattern<ShapeType, ShapePatternRankImpl<Impl>>(
+        ShapePatternRankImpl<Impl>(impl_, rank), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has a layout that matches
+  // the given pattern.
+  template <typename LayoutType, typename LayoutImpl>
+  constexpr ShapePattern<ShapeType,
+                         ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>>
+  WithLayout(const LayoutPattern<LayoutType, LayoutImpl>& layout) const {
+    return ShapePattern<ShapeType,
+                        ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>>(
+        ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>(impl_, layout),
+        matched_shape_);
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternEqualImpl<LayoutPatternBaseImpl>>>
+  WithLayoutEqualTo(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().EqualTo(layout));
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
+  IsDenseArray() const {
+    return WithLayout(Layout().WithDenseFormat());
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
+  IsSparseArray() const {
+    return WithLayout(Layout().WithSparseFormat());
+  }
+
+  // Modifies the pattern to match only if the shape has a subshape that matches
+  // the given pattern.
+  template <typename SubshapeType, typename SubshapeImpl>
+  ShapePattern<ShapeType,
+               ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>>
+  WithSubshape(ShapeIndexView index,
+               const ShapePattern<SubshapeType, SubshapeImpl>& subshape) const {
+    return ShapePattern<
+        ShapeType, ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>>(
+        ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>(impl_, index,
+                                                                   subshape),
+        matched_shape_);
+  }
+
+  ShapePattern<ShapeType, ShapePatternSubshapeImpl<
+                              Impl, const ::xla::Shape,
+                              ShapePatternEqualImpl<ShapePatternBaseImpl>>>
+  WithSubshapeEqualTo(ShapeIndexView index, const ::xla::Shape* shape) const {
+    return WithSubshape(index,
+                        ShapePattern<const ::xla::Shape, ShapePatternBaseImpl>(
+                            ShapePatternBaseImpl(), nullptr)
+                            .EqualTo(shape));
+  }
+
+  ShapePattern<ShapeType, ShapePatternSubshapeImpl<
+                              Impl, const ::xla::Shape,
+                              ShapePatternCompatibleImpl<ShapePatternBaseImpl>>>
+  WithSubshapeCompatibleTo(ShapeIndexView index,
+                           const ::xla::Shape* shape) const {
+    return WithSubshape(index,
+                        ShapePattern<const ::xla::Shape, ShapePatternBaseImpl>(
+                            ShapePatternBaseImpl(), nullptr)
+                            .CompatibleTo(shape));
+  }
+
+ private:
+  Impl impl_;
+  ShapeType** matched_shape_;
+};
+
+}  // namespace detail
+
+// Creates a shape pattern that will capture the matched layout in the argument.
+inline constexpr detail::ShapePattern<const ::xla::Shape,
+                                      detail::ShapePatternBaseImpl>
+Shape(const ::xla::Shape** matched_shape = nullptr) {
+  return detail::ShapePattern<const ::xla::Shape, detail::ShapePatternBaseImpl>(
+      detail::ShapePatternBaseImpl(), matched_shape);
+}
+
+// Creates a shape pattern that will capture the matched layout in the argument.
+inline constexpr detail::ShapePattern<::xla::Shape,
+                                      detail::ShapePatternBaseImpl>
+Shape(::xla::Shape** matched_shape) {
+  return detail::ShapePattern<::xla::Shape, detail::ShapePatternBaseImpl>(
+      detail::ShapePatternBaseImpl(), matched_shape);
+}
+
+namespace detail {
+
+template <typename HloInstructionType, typename Impl>
+class HloInstructionPattern;
+
+// The base HloInstructionPattern implementation. Matches only if the
+// instruction is not nullptr.
+class HloInstructionPatternBaseImpl {
+ public:
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return inst != nullptr;
+  }
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a given name.
+template <typename Previous>
+class HloInstructionPatternNameImpl {
+ public:
+  explicit HloInstructionPatternNameImpl(const Previous& previous,
+                                         tensorflow::StringPiece name)
+      : previous_(previous), name_(name) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && inst->name() == name_;
+  }
+
+ private:
+  Previous previous_;
+  tensorflow::StringPiece name_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a given opcode.
+template <typename Previous>
+class HloInstructionPatternOpcodeImpl {
+ public:
+  explicit constexpr HloInstructionPatternOpcodeImpl(const Previous& previous,
+                                                     HloOpcode opcode,
+                                                     bool invert)
+      : previous_(previous), opcode_(opcode), invert_(invert) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && (invert_ ^ (inst->opcode() == opcode_));
+  }
+
+ private:
+  Previous previous_;
+  HloOpcode opcode_;
+  bool invert_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a shape that matches a given pattern.
+template <typename Previous, typename ShapeType, typename ShapeImpl>
+class HloInstructionPatternShapeImpl {
+ public:
+  explicit constexpr HloInstructionPatternShapeImpl(
+      const Previous& previous, const ShapePattern<ShapeType, ShapeImpl>& shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && shape_.Match(&inst->shape());
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && shape_.Match(inst->mutable_shape());
+  }
+
+ private:
+  Previous previous_;
+  ShapePattern<ShapeType, ShapeImpl> shape_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has an operand that matches a given pattern.
+template <typename Previous, typename OperandType, typename OperandImpl>
+class HloInstructionPatternOperandImpl {
+ public:
+  explicit constexpr HloInstructionPatternOperandImpl(
+      const Previous& previous, int64 operand_index,
+      const HloInstructionPattern<OperandType, OperandImpl>& operand)
+      : previous_(previous), operand_index_(operand_index), operand_(operand) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && operand_index_ < inst->operand_count() &&
+           operand_.Match(inst->operand(operand_index_));
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && operand_index_ < inst->operand_count() &&
+           operand_.Match(inst->mutable_operand(operand_index_));
+  }
+
+ private:
+  Previous previous_;
+  int64 operand_index_;
+  HloInstructionPattern<OperandType, OperandImpl> operand_;
+};
+
+// A pattern that matches HloInstructions.
+template <typename HloInstructionType, typename Impl>
+class HloInstructionPattern {
+ public:
+  explicit constexpr HloInstructionPattern(const Impl& impl,
+                                           HloInstructionType** matched_inst)
+      : impl_(impl), matched_inst_(matched_inst) {}
+
+  // Returns true and captures the instruction iff it matches the pattern.
+  bool Match(const ::xla::HloInstruction* inst) const {
+    if (impl_.Match(inst)) {
+      if (matched_inst_) {
+        *matched_inst_ = inst;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the instruction iff it matches the pattern.
+  bool Match(::xla::HloInstruction* inst) const {
+    if (impl_.Match(inst)) {
+      if (matched_inst_) {
+        *matched_inst_ = inst;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the instruction has the given name.
+  HloInstructionPattern<HloInstructionType, HloInstructionPatternNameImpl<Impl>>
+  WithName(tensorflow::StringPiece name) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternNameImpl<Impl>>(
+        HloInstructionPatternNameImpl<Impl>(impl_, name), matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction has the given opcode.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  WithOpcode(HloOpcode opcode) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternOpcodeImpl<Impl>>(
+        HloInstructionPatternOpcodeImpl<Impl>(impl_, opcode, false),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction does not have the
+  // given opcode.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  WithoutOpcode(HloOpcode opcode) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternOpcodeImpl<Impl>>(
+        HloInstructionPatternOpcodeImpl<Impl>(impl_, opcode, true),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction is a constant.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  IsConstant() const {
+    return WithOpcode(HloOpcode::kConstant);
+  }
+
+  // Modifies the pattern to match only if the instruction is not a constant.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  IsNonConstant() const {
+    return WithoutOpcode(HloOpcode::kConstant);
+  }
+
+  // Modifies the pattern to match only if the instruction has a shape that
+  // matches the given pattern.
+  template <typename ShapeType, typename ShapeImpl>
+  constexpr HloInstructionPattern<
+      HloInstructionType,
+      HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>>
+  WithShape(const ShapePattern<ShapeType, ShapeImpl>& shape) const {
+    return HloInstructionPattern<
+        HloInstructionType,
+        HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>>(
+        HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>(impl_,
+                                                                   shape),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction has an operand that
+  // matches the given pattern.
+  template <typename OperandType, typename OperandImpl>
+  constexpr HloInstructionPattern<
+      HloInstructionType,
+      HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>>
+  WithOperand(
+      int64 operand_index,
+      const HloInstructionPattern<OperandType, OperandImpl>& operand) const {
+    return HloInstructionPattern<
+        HloInstructionType,
+        HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>>(
+        HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>(
+            impl_, operand_index, operand),
+        matched_inst_);
+  }
+
+ private:
+  Impl impl_;
+  HloInstructionType** matched_inst_;
+};
+
+}  // namespace detail
+
+// Creates an instruction pattern that will capture the matched instruction in
+// the argument.
+inline constexpr detail::HloInstructionPattern<
+    const ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>
+Op(const ::xla::HloInstruction** matched_inst = nullptr) {
+  return detail::HloInstructionPattern<const ::xla::HloInstruction,
+                                       detail::HloInstructionPatternBaseImpl>(
+      detail::HloInstructionPatternBaseImpl(), matched_inst);
+}
+
+// Creates an instruction pattern that will capture the matched instruction in
+// the argument.
+inline constexpr detail::HloInstructionPattern<
+    ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>
+Op(::xla::HloInstruction** matched_inst) {
+  return detail::HloInstructionPattern<::xla::HloInstruction,
+                                       detail::HloInstructionPatternBaseImpl>(
+      detail::HloInstructionPatternBaseImpl(), matched_inst);
+}
+
+// Helpers for nullary instructions.
+#define XLA_NULLOP_PATTERN(NAME)                                      \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \
+    return Op().WithOpcode(HloOpcode::k##NAME);                       \
+  }                                                                   \
+                                                                      \
+  template <typename HloInstructionType>                              \
+  inline auto NAME(HloInstructionType** matched_inst)                 \
+      ->decltype(Op(matched_inst).WithOpcode(HloOpcode::k##NAME)) {   \
+    return Op(matched_inst).WithOpcode(HloOpcode::k##NAME);           \
+  }
+XLA_NULLOP_PATTERN(Constant)
+XLA_NULLOP_PATTERN(Infeed)
+XLA_NULLOP_PATTERN(Parameter)
+XLA_NULLOP_PATTERN(Recv)
+#undef XLA_NULLOP_PATTERN
+
+// Helpers for unary instructions.
+#define XLA_UNOP_PATTERN(NAME)                                        \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \
+    return Op().WithOpcode(HloOpcode::k##NAME);                       \
+  }                                                                   \
+                                                                      \
+  template <typename Arg>                                             \
+  inline auto NAME(Arg&& arg)->decltype(                              \
+      Op().WithOpcode(HloOpcode::k##NAME)                             \
+          .WithOperand(0, std::forward<Arg>(arg))) {                  \
+    return Op()                                                       \
+        .WithOpcode(HloOpcode::k##NAME)                               \
+        .WithOperand(0, std::forward<Arg>(arg));                      \
+  }                                                                   \
+                                                                      \
+  template <typename HloInstructionType, typename Arg>                \
+  inline auto NAME(HloInstructionType** matched_inst, Arg&& arg)      \
+      ->decltype(Op(matched_inst)                                     \
+                     .WithOpcode(HloOpcode::k##NAME)                  \
+                     .WithOperand(0, std::forward<Arg>(arg))) {       \
+    return Op(matched_inst)                                           \
+        .WithOpcode(HloOpcode::k##NAME)                               \
+        .WithOperand(0, std::forward<Arg>(arg));                      \
+  }
+XLA_UNOP_PATTERN(Abs)
+XLA_UNOP_PATTERN(RoundNearestAfz)
+XLA_UNOP_PATTERN(Bitcast)
+XLA_UNOP_PATTERN(Broadcast)
+XLA_UNOP_PATTERN(Ceil)
+XLA_UNOP_PATTERN(Copy)
+XLA_UNOP_PATTERN(Cos)
+XLA_UNOP_PATTERN(Exp)
+XLA_UNOP_PATTERN(Fft)
+XLA_UNOP_PATTERN(Floor)
+XLA_UNOP_PATTERN(Imag)
+XLA_UNOP_PATTERN(IsFinite)
+XLA_UNOP_PATTERN(Log)
+XLA_UNOP_PATTERN(Not)
+XLA_UNOP_PATTERN(Negate)
+XLA_UNOP_PATTERN(Outfeed)
+XLA_UNOP_PATTERN(Real)
+XLA_UNOP_PATTERN(Reduce)
+XLA_UNOP_PATTERN(ReducePrecision)
+XLA_UNOP_PATTERN(Reshape)
+XLA_UNOP_PATTERN(Reverse)
+XLA_UNOP_PATTERN(Send)
+XLA_UNOP_PATTERN(Sign)
+XLA_UNOP_PATTERN(Sin)
+XLA_UNOP_PATTERN(Sort)
+XLA_UNOP_PATTERN(Tanh)
+XLA_UNOP_PATTERN(Transpose)
+#undef XLA_UNOP_PATTERN
+
+// Helpers for binary instructions.
+#define XLA_BINOP_PATTERN(NAME)                                             \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {       \
+    return Op().WithOpcode(HloOpcode::k##NAME);                             \
+  }                                                                         \
+                                                                            \
+  template <typename Lhs, typename Rhs>                                     \
+  inline auto NAME(Lhs&& lhs, Rhs&& rhs)                                    \
+      ->decltype(Op().WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                \
+                     .WithOperand(1, std::forward<Rhs>(rhs))) {             \
+    return Op()                                                             \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithOperand(0, std::forward<Lhs>(lhs))                             \
+        .WithOperand(1, std::forward<Rhs>(rhs));                            \
+  }                                                                         \
+                                                                            \
+  template <typename HloInstructionType, typename Lhs, typename Rhs>        \
+  inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \
+      ->decltype(Op(matched_inst)                                           \
+                     .WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                \
+                     .WithOperand(1, std::forward<Rhs>(rhs))) {             \
+    return Op(matched_inst)                                                 \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithOperand(0, std::forward<Lhs>(lhs))                             \
+        .WithOperand(1, std::forward<Rhs>(rhs));                            \
+  }
+XLA_BINOP_PATTERN(Add)
+XLA_BINOP_PATTERN(Atan2)
+XLA_BINOP_PATTERN(Divide)
+XLA_BINOP_PATTERN(Complex)
+XLA_BINOP_PATTERN(Dot)
+XLA_BINOP_PATTERN(Eq)
+XLA_BINOP_PATTERN(Gather)
+XLA_BINOP_PATTERN(Ge)
+XLA_BINOP_PATTERN(Gt)
+XLA_BINOP_PATTERN(Le)
+XLA_BINOP_PATTERN(Lt)
+XLA_BINOP_PATTERN(Maximum)
+XLA_BINOP_PATTERN(Minimum)
+XLA_BINOP_PATTERN(Multiply)
+XLA_BINOP_PATTERN(Ne)
+XLA_BINOP_PATTERN(Power)
+XLA_BINOP_PATTERN(Remainder)
+XLA_BINOP_PATTERN(Subtract)
+XLA_BINOP_PATTERN(And)
+XLA_BINOP_PATTERN(Or)
+XLA_BINOP_PATTERN(ShiftLeft)
+XLA_BINOP_PATTERN(ShiftRightArithmetic)
+XLA_BINOP_PATTERN(ShiftRightLogical)
+#undef XLA_BINOP_PATTERN
+
+// Helpers for ternary instructions.
+#define XLA_TERNOP_PATTERN(NAME)                                       \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {  \
+    return Op().WithOpcode(HloOpcode::k##NAME);                        \
+  }                                                                    \
+                                                                       \
+  template <typename Arg0, typename Arg1, typename Arg2>               \
+  inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2)              \
+      ->decltype(Op().WithOpcode(HloOpcode::k##NAME)                   \
+                     .WithOperand(0, std::forward<Arg0>(arg0))         \
+                     .WithOperand(1, std::forward<Arg1>(arg1))         \
+                     .WithOperand(2, std::forward<Arg2>(arg2))) {      \
+    return Op()                                                        \
+        .WithOpcode(HloOpcode::k##NAME)                                \
+        .WithOperand(0, std::forward<Arg0>(arg0))                      \
+        .WithOperand(1, std::forward<Arg1>(arg1))                      \
+        .WithOperand(2, std::forward<Arg2>(arg2));                     \
+  }                                                                    \
+                                                                       \
+  template <typename HloInstructionType, typename Arg0, typename Arg1, \
+            typename Arg2>                                             \
+  inline auto NAME(HloInstructionType** matched_inst, Arg0&& arg0,     \
+                   Arg1&& arg1, Arg2&& arg2)                           \
+      ->decltype(Op(matched_inst)                                      \
+                     .WithOpcode(HloOpcode::k##NAME)                   \
+                     .WithOperand(0, std::forward<Arg0>(arg0))         \
+                     .WithOperand(1, std::forward<Arg1>(arg1))         \
+                     .WithOperand(2, std::forward<Arg2>(arg2))) {      \
+    return Op(matched_inst)                                            \
+        .WithOpcode(HloOpcode::k##NAME)                                \
+        .WithOperand(0, std::forward<Arg0>(arg0))                      \
+        .WithOperand(1, std::forward<Arg1>(arg1))                      \
+        .WithOperand(2, std::forward<Arg2>(arg2));                     \
+  }
+XLA_TERNOP_PATTERN(Clamp);
+XLA_TERNOP_PATTERN(Select);
+#undef XLA_TERNOP_PATTERN
+
+// Helpers for matching non-constant instructions.
+inline auto NonConstant() -> decltype(Op().IsNonConstant()) {
+  return Op().IsNonConstant();
+}
+
+template <typename HloInstructionType>
+inline auto NonConstant(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsNonConstant()) {
+  return Op(matched_inst).IsNonConstant();
+}
+
+}  // namespace match
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c88157c312524fb273e6df368d2ef61d679d1d8b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -0,0 +1,174 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+TEST(PatternMatcherTest, AddOp) {
+  constexpr char kModuleStr[] = R"(HloModule two_plus_two_module
+    ENTRY %two_plus_two_computation () -> f32[] {
+      %two = f32[] constant(2)
+      ROOT %two_plus_two = f32[] add(f32[] %two, f32[] %two)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr));
+
+  const HloInstruction* matched_inst;
+  HloInstruction* matched_operand;
+  Shape* matched_shape;
+  Layout* matched_layout;
+
+  ASSERT_TRUE(Match(
+      hlo_module->entry_computation()->root_instruction(),
+      match::Op(&matched_inst)
+          .WithName("two_plus_two")
+          .WithOpcode(HloOpcode::kAdd)
+          .WithShape(
+              match::Shape(&matched_shape)
+                  .WithLayout(match::Layout(&matched_layout).WithDenseFormat()))
+          .WithOperand(
+              0,
+              match::Op(&matched_operand).WithOpcode(HloOpcode::kConstant))));
+  ASSERT_NE(matched_inst, nullptr);
+  EXPECT_EQ(matched_inst->name(), "two_plus_two");
+  EXPECT_EQ(matched_inst->opcode(), HloOpcode::kAdd);
+
+  EXPECT_TRUE(Match(hlo_module->entry_computation()->root_instruction(),
+                    match::Add(match::Constant(), match::Constant())));
+
+  EXPECT_FALSE(Match(hlo_module->entry_computation()->root_instruction(),
+                     match::Op().WithName("bad_name")));
+  matched_inst = nullptr;
+  EXPECT_FALSE(Match(hlo_module->entry_computation()->root_instruction(),
+                     match::Multiply(&matched_inst, match::Op(), match::Op())));
+}
+
+TEST(PatternMatcherTest, ScalarShape) {
+  auto scalar_shape = ShapeUtil::MakeShape(F32, {});
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape(&matched_shape).IsScalar()));
+  EXPECT_EQ(matched_shape, &scalar_shape);
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsArray()));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsDenseArray()));
+  EXPECT_FALSE(Match(&scalar_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithRank(0)));
+  EXPECT_FALSE(Match(
+      &scalar_shape,
+      match::Shape().WithSubshape({0}, match::Shape()).WithElementType(F32)));
+}
+
+TEST(PatternMatcherTest, DenseArrayShape) {
+  auto array_shape = ShapeUtil::MakeShape(F32, {2, 3, 4});
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
+  EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_TRUE(Match(&array_shape, match::Shape().IsDenseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsSparseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3)));
+  EXPECT_FALSE(
+      Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape())));
+  Layout* matched_layout;
+  EXPECT_FALSE(Match(&array_shape,
+                     match::Shape().WithLayout(
+                         match::Layout(&matched_layout).WithSparseFormat())));
+  EXPECT_TRUE(Match(&array_shape,
+                    match::Shape().WithLayout(
+                        match::Layout(&matched_layout).WithDenseFormat())));
+  EXPECT_EQ(matched_layout, &array_shape.layout());
+}
+
+TEST(PatternMatcherTest, SparseArrayShape) {
+  auto array_shape = ShapeUtil::MakeShapeWithSparseLayout(F32, {2, 3, 4}, 10);
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
+  EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsDenseArray()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().IsSparseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3)));
+  EXPECT_FALSE(
+      Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape())));
+  Layout* matched_layout;
+  EXPECT_FALSE(Match(&array_shape,
+                     match::Shape().WithLayout(
+                         match::Layout(&matched_layout).WithDenseFormat())));
+  EXPECT_TRUE(Match(&array_shape,
+                    match::Shape().WithLayout(
+                        match::Layout(&matched_layout).WithSparseFormat())));
+  EXPECT_EQ(matched_layout, &array_shape.layout());
+}
+
+TEST(PatternMatcherTest, TupleShape) {
+  auto tuple_shape = ShapeUtil::MakeTupleShape({
+      ShapeUtil::MakeShape(F32, {1, 2, 3}),
+      ShapeUtil::MakeShape(S32, {4, 5}),
+  });
+  EXPECT_TRUE(Match(&tuple_shape, match::Shape().IsTuple()));
+  EXPECT_FALSE(Match(&tuple_shape, match::Shape().IsArray()));
+  EXPECT_FALSE(Match(&tuple_shape, match::Shape().IsScalar()));
+
+  Shape* subshape;
+  ASSERT_TRUE(Match(
+      &tuple_shape,
+      match::Shape().WithSubshape(
+          {0}, match::Shape(&subshape).WithElementType(F32).WithRank(3))));
+  ASSERT_NE(subshape, nullptr);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(*subshape, ShapeUtil::GetSubshape(tuple_shape, {0})));
+  EXPECT_TRUE(Match(&tuple_shape,
+                    match::Shape().WithSubshape(
+                        {0}, match::Shape().EqualTo(
+                                 &ShapeUtil::GetSubshape(tuple_shape, {0})))));
+  EXPECT_FALSE(Match(&tuple_shape,
+                     match::Shape().WithSubshape(
+                         {0}, match::Shape().EqualTo(
+                                  &ShapeUtil::GetSubshape(tuple_shape, {1})))));
+
+  ASSERT_TRUE(Match(
+      &tuple_shape,
+      match::Shape().WithSubshape(
+          {1}, match::Shape(&subshape).WithElementType(S32).WithRank(2))));
+  ASSERT_NE(subshape, nullptr);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(*subshape, ShapeUtil::GetSubshape(tuple_shape, {1})));
+  EXPECT_TRUE(Match(&tuple_shape,
+                    match::Shape().WithSubshape(
+                        {1}, match::Shape().EqualTo(
+                                 &ShapeUtil::GetSubshape(tuple_shape, {1})))));
+  EXPECT_FALSE(Match(&tuple_shape,
+                     match::Shape().WithSubshape(
+                         {1}, match::Shape().EqualTo(
+                                  &ShapeUtil::GetSubshape(tuple_shape, {0})))));
+
+  EXPECT_FALSE(
+      Match(&tuple_shape, match::Shape().WithSubshape({2}, match::Shape())));
+  EXPECT_FALSE(
+      Match(&tuple_shape, match::Shape().WithSubshape({0, 0}, match::Shape())));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index aa974ee61a27de9c19e97d8a6eb48f9261ce4bd9..7c63c0acc7764d558b2151190f0fa79fac355cbf 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 using tensorflow::str_util::Lowercase;
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index 69188820a70707d9c9be10b20fb7de92ad4d9873..571451ba43a81d19b70e4954e45d3447f15dcedc 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -34,29 +34,27 @@ class PlatformUtil {
   //
   // Note that, even if a platform is present with zero devices, if we *do* have
   // compilation support for it, it will be returned in this sequence.
-  static StatusOr<std::vector<perftools::gputools::Platform*>>
-  GetSupportedPlatforms();
+  static StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms();
 
   // Convenience function which returns the default supported platform for
   // tests. If exactly one supported platform is present, then this platform is
   // the default platform. If exactly two platforms are present and one of them
   // is the interpreter platform, then the other platform is the default
   // platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+  static StatusOr<se::Platform*> GetDefaultPlatform();
 
   // Convenience function which returns the sole supported platform. If
   // exactly one supported platform is present, then this platform is the
   // default platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
+  static StatusOr<se::Platform*> GetSolePlatform();
 
   // Returns the platform according to the given name. Returns error if there is
   // no such platform.
-  static StatusOr<perftools::gputools::Platform*> GetPlatform(
-      const string& platform_name);
+  static StatusOr<se::Platform*> GetPlatform(const string& platform_name);
 
   // Returns exactly one platform that does not have given name. Returns error
   // if there is no such platform, or there are multiple such platforms.
-  static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
+  static StatusOr<se::Platform*> GetPlatformExceptFor(
       const string& platform_name);
 
   // Returns a vector of StreamExecutors for the given platform. The vector is
@@ -64,8 +62,8 @@ class PlatformUtil {
   // element is nullptr, then the device is present by not supported by XLA.
   //
   // If the platform has no visible devices, a not-found error is returned.
-  static StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
-  GetStreamExecutors(perftools::gputools::Platform* platform);
+  static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
+      se::Platform* platform);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index e62bafc50b0e1270702621c9ea7b2ee43e001fe0..0f26a025bf125f70199637894741540f89eae7e5 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -53,8 +53,8 @@ bool IsReshapeOrTranspose(const HloInstruction* instruction) {
          instruction->opcode() == HloOpcode::kTranspose;
 }
 
-// Returns true iff `instruction` can change its shape simply by adjusting
-// metadata.
+// Returns true if `instruction` can change its shape simply by adjusting
+// metadata or if `instruction` is a broadcast of a scalar value.
 bool CanTriviallyChangeShape(const HloInstruction* instruction) {
   // NOTE: Technically a sequence of reshape(reshape(constant)) is also
   // trivially reshapable, so we might be tempted to simply recurse if
@@ -88,19 +88,31 @@ bool CanTriviallyChangeShape(const HloInstruction* instruction) {
       instruction->user_count() == 1) {
     return true;
   }
+
+  // A broadcase of scalar can trivially change its shape.
+  if (instruction->opcode() == HloOpcode::kBroadcast &&
+      ShapeUtil::IsScalar(instruction->operand(0)->shape())) {
+    return true;
+  }
+
   return false;
 }
 
-// Finds the first non-scalar operand of an instruction that is a non-trivial
-// reshape or transpose. Returns the operand if it is found or nullptr if not
-// found.
+// Returns true iff `instruction` is a reshape/transpose instruction for which
+// a shape change is nontrivial.
+bool IsNontrivialReshape(const HloInstruction* instruction) {
+  return !ShapeUtil::IsScalar(instruction->shape()) &&
+         IsReshapeOrTranspose(instruction) &&
+         !CanTriviallyChangeShape(instruction->operand(0));
+}
+
+// Finds the first operand of an instruction that is a non-trivial reshape or
+// transpose. Returns such an operand or nullptr if not found.
 HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
     const HloInstruction* hlo) {
   for (HloInstruction* operand : hlo->operands()) {
-    if (!ShapeUtil::IsScalar(operand->shape()) &&
-        IsReshapeOrTranspose(operand) &&
-        !CanTriviallyChangeShape(operand->operand(0))) {
-      VLOG(5) << "Found first non-scalar and non-trivial reshape operand of "
+    if (IsNontrivialReshape(operand)) {
+      VLOG(5) << "Found first non-trivial reshape operand of "
               << hlo->ToString(HloPrintOptions().set_print_metadata(false))
               << ":\n\t"
               << operand->ToString(HloPrintOptions().set_print_metadata(false));
@@ -110,7 +122,7 @@ HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
   return nullptr;
 }
 
-// Returns whether `a` and `b` are equivalent for the purposes of this pass.
+// Returns whether `a` and `b` are equivalent reshapes/transposes.
 bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   if (a->opcode() != b->opcode() ||
       !ShapeUtil::SameDimensions(a->shape(), b->shape())) {
@@ -127,71 +139,14 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   }
 }
 
-// Returns true if all operands of `instruction` can easily change shape.
-// Operands can easily change shape if they are all reshapes/transposes to and
-// from the same shape. Additionally, operands like constant, rng, and any
-// scalar change shape with only an adjustment of metadata.
-bool AllOperandsHaveEasyShapeChanges(
-    const HloInstruction* instruction,
-    const HloInstruction* first_reshape_operand) {
-  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
-  VLOG(3) << "** Checking whether all operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  // Check whether all operands:
-  //    0. Have the same dimensions as the output -- if not, it may be
-  //       implicitly broadcast, which can confound the movement's
-  //       correctness.
-  //
-  // And one of the following:
-  //    1. Are reshapes or transposes that have the same input and
-  //       output shapes as all other reshaped or transposed operands.
-  //     or
-  //    2. Are one of kConstant, kRng, and scalars that can change shape
-  //    trivially,
-  for (const HloInstruction* operand : instruction->operands()) {
-    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
-      VLOG(5) << "Operand shape differs from output shape; may be "
-                 "implicitly broadcast, so preventing "
-                 "movement\n\toperand: "
-              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
-              << instruction->ToString(print_no_metadata);
-      return false;
-    }
-
-    if (AreEquivalentReshapes(first_reshape_operand, operand)) {
-      VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
-              << first_reshape_operand->ToString(print_no_metadata)
-              << "\n\toperand: " << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    if (CanTriviallyChangeShape(operand)) {
-      VLOG(5) << "Operand can trivially change shape: "
-              << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    // TODO(someone): Look into supporting general ops for the operands as
-    // well.
-    VLOG(5) << "Operand is neither equalivant to the first Reshape operand"
-               "nor can trivially change shape: "
-            << operand->ToString(print_no_metadata);
-    return false;
-  }
-
-  VLOG(3) << "All operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  return true;
-}
-
 // This function is called once we've decided to sink reshape/transpose operands
 // across an instruction. It returns an updated `operand` with a shape that
 // plays nicely with `new_operand_shape`; either it has the same shape (of the
 // correct type), or it is a scalar that may be implicitly broadcast.
-HloInstruction* UpdateOperand(HloComputation* computation,
-                              const HloInstruction* first_reshape_operand,
+HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
                               const Shape& new_operand_shape,
                               HloInstruction* operand) {
+  HloComputation* computation = operand->parent();
   const PrimitiveType element_type = operand->shape().element_type();
   const Shape new_shape =
       ShapeUtil::ChangeElementType(new_operand_shape, element_type);
@@ -200,15 +155,20 @@ HloInstruction* UpdateOperand(HloComputation* computation,
     case HloOpcode::kConstant: {
       if (first_reshape_operand->opcode() == HloOpcode::kReshape) {
         VLOG(5) << "Adding reshape to kConstant operand";
-        return computation->AddInstruction(
+        HloInstruction* reshape = computation->AddInstruction(
             HloInstruction::CreateReshape(new_shape, operand));
+        operand->SetupDerivedInstruction(reshape);
+        return reshape;
       } else {
         CHECK(first_reshape_operand->opcode() == HloOpcode::kTranspose);
         VLOG(5) << "Adding transpose to kConstant operand";
         std::vector<int64> inverse_permutation =
             InversePermutation(first_reshape_operand->dimensions());
-        return computation->AddInstruction(HloInstruction::CreateTranspose(
-            new_shape, operand, inverse_permutation));
+        HloInstruction* transpose =
+            computation->AddInstruction(HloInstruction::CreateTranspose(
+                new_shape, operand, inverse_permutation));
+        operand->SetupDerivedInstruction(transpose);
+        return transpose;
       }
     }
     case HloOpcode::kRng: {
@@ -222,36 +182,24 @@ HloInstruction* UpdateOperand(HloComputation* computation,
       VLOG(5) << "Using existing operand of kReshape or kTranspose";
       return operand->mutable_operand(0);
     }
+    case HloOpcode::kBroadcast: {
+      CHECK(ShapeUtil::IsScalar(operand->operand(0)->shape()));
+      HloInstruction* inst = computation->AddInstruction(
+          operand->CloneWithNewOperands(new_shape, operand->operands()));
+      VLOG(5) << "Changing broadcast from " << operand->ToString() << " to "
+              << inst->ToString();
+      return inst;
+    }
+
     default:
       LOG(FATAL) << "Unexpected operand opcode during update: " << operand;
   }
 }
 
-// Try to sink any reshape or transpose operands of `instruction` across it. We
-// do so if `instruction` is elementwise and all operands are either equivalent
-// reshapes/transposes or are trivially reshapable.
-StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
-                                         HloInstruction* instruction) {
-  // Only perform sinks for live elementwise instructions with operands.
-  const bool is_dead = instruction->user_count() == 0 &&
-                       instruction != computation->root_instruction();
-  if (!instruction->IsElementwise() || instruction->operands().empty() ||
-      is_dead) {
-    return false;
-  }
-
-  // Only perform sinks if there are any nontrivial reshape/transpose operands.
-  const HloInstruction* first_reshape_operand =
-      FirstNonScalarAndNonTrivialReshapeOperand(instruction);
-  if (!first_reshape_operand) {
-    return false;
-  }
-
-  // Only perform sinks if all operands can easily change shape.
-  if (!AllOperandsHaveEasyShapeChanges(instruction, first_reshape_operand)) {
-    return false;
-  }
-
+// Actually performs the reshape-move transformation -- that is, sinks the
+// reshape or transpose operands of `instruction` across it.
+StatusOr<bool> PerformSinkReshapeOrTranspose(
+    HloInstruction* instruction, const HloInstruction* first_reshape_operand) {
   auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   // At this point we've decided to sink reshape/transpose operands.
   const Shape& new_operand_shape = first_reshape_operand->operand(0)->shape();
@@ -272,8 +220,8 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
     }
     VLOG(3) << "Updating operand #" << i << ": "
             << operands[i]->ToString(print_no_metadata);
-    operands[i] = UpdateOperand(computation, first_reshape_operand,
-                                new_operand_shape, operands[i]);
+    operands[i] =
+        UpdateOperand(first_reshape_operand, new_operand_shape, operands[i]);
   }
   if (HloOpcode::kFusion == instruction->opcode()) {
     // Here we already know `instruction` is elementwise, and no operand is
@@ -285,6 +233,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
       *shape->mutable_layout() = new_operand_shape.layout();
     }
   }
+  HloComputation* computation = instruction->parent();
   HloInstruction* new_elementwise =
       computation->AddInstruction(instruction->CloneWithNewOperands(
           // `instruction` may change the element type, e.g., from
@@ -319,6 +268,141 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
   return true;
 }
 
+// Returns true if the instruction is a reshape-move candidate.
+//
+// An instruction is a reshape-move candidate if the instruction is elementwise,
+// has at least one nontrivial reshape/transpose operand, and its operands are
+// either trivially reshapable or are equivalent nontrivial reshapes/transposes.
+bool IsReshapeMoveCandidate(HloInstruction* instruction) {
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
+  VLOG(5) << "** Checking instruction: "
+          << instruction->ToString(print_no_metadata);
+
+  // Only perform reshape-move for live elementwise instructions with operands.
+  const bool is_dead = instruction->user_count() == 0 &&
+                       instruction != instruction->parent()->root_instruction();
+  if (!instruction->IsElementwise() || instruction->operands().empty() ||
+      is_dead) {
+    return false;
+  }
+
+  // Check whether all operands:
+  //    0. Have the same dimensions as the output -- if not, they may be
+  //       implicitly broadcast, which can confound the movement's
+  //       correctness.
+  //
+  // And one of the following:
+  //    1. Are reshapes or transposes that have the same input and
+  //       output shapes as all other reshaped or transposed operands.
+  //     or
+  //    2. Are one of kConstant, kRng, broadcast of a scalar value, and scalars
+  //     that can change shape trivially.
+  const HloInstruction* first_reshape_operand = nullptr;
+  for (const HloInstruction* operand : instruction->operands()) {
+    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+      VLOG(5) << "Operand shape differs from output shape; may be "
+                 "implicitly broadcast, so preventing "
+                 "movement\n\toperand: "
+              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
+              << instruction->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (CanTriviallyChangeShape(operand)) {
+      VLOG(5) << "Operand can trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      continue;
+    }
+
+    if (!IsNontrivialReshape(operand)) {
+      VLOG(5) << "Operand can't trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (first_reshape_operand == nullptr) {
+      first_reshape_operand = operand;
+      VLOG(5) << "First reshape operand "
+              << operand->ToString(print_no_metadata);
+    } else if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+      VLOG(5)
+          << "Operand is an equivalent reshape of the first reshape operand "
+          << operand->ToString(print_no_metadata);
+    } else {
+      // TODO(someone): Look into supporting general ops for the operands as
+      // well.
+      VLOG(5) << "Operand is a reshape but is not equivalent to the first "
+                 "Reshape operand"
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+  }
+
+  if (first_reshape_operand) {
+    VLOG(5) << "All operands have easy shape changes: "
+            << instruction->ToString(print_no_metadata);
+  }
+
+  return first_reshape_operand != nullptr;
+}
+
+// Reshape-moves all qualifying instructions in reshape_candidates.  Returns
+// true if it makes changes.
+//
+// `reshape_candidates` is a set of HloInstructions with nontrivial reshape
+// operands, and a instruction in the set can be reshape-moved iff all the users
+// of its nontrivial reshape operands can also be reshaped-moved.
+//
+// The algorithm here iteratively finds the nontrivial operands with users that
+// are outside the set of `reshape_candidates`, and removes their users from
+// `reshape_candidates`, until either `reshape_candidates` becomes empty or none
+// of the remaining nontrivial operands have users outside `reshape_candidates`.
+// In the later case, all the remaining instructions in `reshape_candidates`
+// are reshape-moved and the routine returns true.
+StatusOr<bool> TryReshapeMoveOnCandidates(
+    HloInstructionSet* reshape_candidates) {
+  bool removed = true;
+  while (!reshape_candidates->empty() && removed) {
+    if (VLOG_IS_ON(5)) {
+      for (const HloInstruction* instruction : *reshape_candidates) {
+        VLOG(5) << "candidate " << instruction->ToString();
+      }
+    }
+    ConstHloInstructionSet nontrivial_operands;
+    for (const HloInstruction* instruction : *reshape_candidates) {
+      for (const auto* operand : instruction->operands()) {
+        if (IsNontrivialReshape(operand)) {
+          nontrivial_operands.insert(operand);
+        }
+      }
+    }
+
+    removed = false;
+    for (auto operand : nontrivial_operands) {
+      if (c_any_of(operand->users(), [&](HloInstruction* user) {
+            return !reshape_candidates->count(user);
+          })) {
+        for (auto* user : operand->users()) {
+          removed |= reshape_candidates->erase(user) > 0;
+        }
+      }
+    }
+  }
+
+  if (reshape_candidates->empty()) {
+    return false;
+  }
+  for (HloInstruction* instruction : *reshape_candidates) {
+    const HloInstruction* first_reshape_operand =
+        FirstNonScalarAndNonTrivialReshapeOperand(instruction);
+    TF_ASSIGN_OR_RETURN(
+        bool did_change,
+        PerformSinkReshapeOrTranspose(instruction, first_reshape_operand));
+    CHECK(did_change);
+  }
+  return true;
+}
+
 }  // namespace
 
 StatusOr<bool> ReshapeMover::Run(HloModule* module) {
@@ -326,11 +410,15 @@ StatusOr<bool> ReshapeMover::Run(HloModule* module) {
   VLOG(2) << "Pre ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
   for (auto* comp : module->MakeNonfusionComputations()) {
-    for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      TF_ASSIGN_OR_RETURN(bool did_change,
-                          TrySinkReshapeOrTranspose(comp, instruction));
-      changed |= did_change;
+    HloInstructionSet reshape_candidates;
+    for (HloInstruction* instruction : comp->instructions()) {
+      if (IsReshapeMoveCandidate(instruction)) {
+        reshape_candidates.insert(instruction);
+      }
     }
+    TF_ASSIGN_OR_RETURN(bool did_change,
+                        TryReshapeMoveOnCandidates(&reshape_candidates));
+    changed |= did_change;
   }
   VLOG(2) << "Post ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index aac8638a54f744f0c230ec6c5ca071c1daf45ab2..13e2d3258e3b92f52320201c382594962c0e3b2b 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -458,57 +458,6 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
   EXPECT_EQ(select, computation->root_instruction());
 }
 
-// Tree looks like:
-//
-// param0 [1,128,1]
-//  |
-// reshape [128,1]          constant [128,1024]
-//   \                         /
-//     multiply w/implicit broadcast [128,1024]
-//
-// The reshape mover would like to sink the reshape below the multiply.
-//
-// Previously we would attempt to insert a reshape of the constant to [1,128,1]
-// (which is unsound, because it has a different number of elements) as
-// preparation for sinking the reshape.
-//
-// To eliminate the unsoundness, we outlaw reshape sinking when one of the
-// operands is implicitly broadcast in the elementwise consumer.
-//
-// TODO(b/37799338) However, it would be possible in this case to do a more
-// in-depth analysis to get reshape movement to occur:
-//
-// 1. Note that the broadcast dimension (logical dimension 1) in the operands
-//    would map back to logical dimension 2 in the param0 node.
-// 2. Match rank of the constant to the param0 node (by prepending a trivial 1
-//    dimension).
-// 3. Reshape to [128,1024] at the root.
-//
-// But this is not currently done.
-TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
-  HloComputation::Builder builder(TestName());
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0"));
-  auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
-      ShapeUtil::MakeShape(F32, {128, 1}), param0));
-  Array2D<float> a(128, 1024);
-  auto literal = Literal::CreateR2FromArray2D<float>(a);
-  auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(std::move(literal)));
-  auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
-      constant->shape(), HloOpcode::kMultiply, constant, reshape));
-
-  auto computation = module().AddEntryComputation(builder.Build());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Constant(), op::Reshape(param0)));
-
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
-
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Constant(), op::Reshape(param0)));
-  EXPECT_EQ(multiply, computation->root_instruction());
-}
-
 // Tree looks like this:
 //
 // add1
@@ -560,5 +509,95 @@ TEST_F(ReshapeMoverTest, MultiplePasses) {
       op::Reshape(op::Add(param2, op::Reshape(op::Add(param0, param1)))));
 }
 
+TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
+  const string hlo_string = R"(
+    HloModule TransposeMulInversedTransposeModule
+    ENTRY TransposeMulInversedTranspose {
+      src0 = f32[20,8]{1,0} parameter(0)
+      transpose0 = f32[8,20]{1,0} transpose(src0), dimensions={1,0}
+      src1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(src1), dimensions={}
+      ROOT multiply0 = f32[8,20]{1,0} multiply(transpose0, broadcast0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Transpose(op::Multiply()));
+}
+
+TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) {
+  const string hlo_string = R"(
+    HloModule ReshapeWithUsersOutsideCandidates
+    ENTRY ReshapeWithMultipleUsers {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      param4 = f32[8,20]{1,0} parameter(4)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, param4)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates1
+    ENTRY ReshapeWithMultipleUsers1 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, reshape2)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Tuple(op::Reshape(), op::Reshape(), op::Reshape()));
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates2
+    ENTRY ReshapeWithMultipleUsers2 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      ROOT add0 = f32[8,20]{1,0} add(reshape0, reshape0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Reshape(op::Add()));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index e278eab69088d3031b1d951734b7dcad6f8afc77..6e0d07a12f906b4b95d521e957ac28c84dd28774 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -54,8 +54,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
 using ::xla::source_map_util::InvalidParameterArgument;
@@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result,
 
 }  // namespace
 
-ServiceOptions& ServiceOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* ServiceOptions::platform() const {
-  return platform_;
-}
+se::Platform* ServiceOptions::platform() const { return platform_; }
 
 ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
   number_of_replicas_ = number_of_replicas;
@@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 }
 
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   ServiceOptions default_options;
   default_options.set_platform(platform);
   return NewService(default_options);
@@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
     const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   std::unique_ptr<Backend> execute_backend;
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
@@ -232,10 +227,13 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
   return ShapeUtil::ValidateShape(shape_with_layout);
 }
 
-StatusOr<std::vector<const ShapedBuffer*>> Service::ResolveAndValidateArguments(
+StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
+Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    int device_ordinal) {
-  std::vector<const ShapedBuffer*> shaped_buffers;
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors) {
+  CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
+  std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
+  replicated_arguments.resize(options_.number_of_replicas());
   for (size_t i = 0; i < arguments.size(); ++i) {
     auto buffer_status = allocation_tracker_.Resolve(*arguments[i]);
     if (!buffer_status.ok()) {
@@ -243,29 +241,32 @@ StatusOr<std::vector<const ShapedBuffer*>> Service::ResolveAndValidateArguments(
                     StrCat(buffer_status.status().error_message(), ", ",
                            "failed to resolve allocation for parameter ", i));
     }
-    const ShapedBuffer* shaped_buffer = buffer_status.ValueOrDie();
-
-    // Verify allocation is same platform and device as the execution.
-    if (shaped_buffer->platform() != execute_backend_->platform() ||
-        shaped_buffer->device_ordinal() != device_ordinal) {
-      return InvalidArgument(
-          "argument %lu is on device %s:%d but computation will be executed "
-          "on device %s",
-          i, shaped_buffer->platform()->Name().c_str(),
-          shaped_buffer->device_ordinal(),
-          execute_backend_->device_name(device_ordinal).c_str());
+    auto replicated_buffers = buffer_status.ValueOrDie();
+    CHECK_EQ(options_.number_of_replicas(), replicated_buffers.size());
+    for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
+      const ShapedBuffer* shaped_buffer = replicated_buffers[replica];
+      int replica_device_ordinal = stream_executors[replica]->device_ordinal();
+      // Verify allocation is same platform and device as the execution.
+      if (shaped_buffer->platform() != execute_backend_->platform() ||
+          shaped_buffer->device_ordinal() != replica_device_ordinal) {
+        return InvalidArgument(
+            "argument %lu is on device %s:%d but computation will be executed "
+            "on device %s",
+            i, shaped_buffer->platform()->Name().c_str(),
+            shaped_buffer->device_ordinal(),
+            execute_backend_->device_name(replica_device_ordinal).c_str());
+      }
+      replicated_arguments[replica].push_back(shaped_buffer);
     }
-
-    shaped_buffers.push_back(shaped_buffer);
   }
-  return shaped_buffers;
+  return replicated_arguments;
 }
 
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
     const ExecutionOptions* execution_options,
-    const UserComputation& user_computation) {
+    const UserComputation* user_computation) {
   auto config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = config->mutable_entry_computation_layout();
 
@@ -279,8 +280,15 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // ProgramShape.
     if (!ShapeUtil::Compatible(*argument_shapes[i],
                                program_shape.parameters(i))) {
+      if (user_computation == nullptr) {
+        return InvalidArgument(
+            "Argument does not match shape of computation parameter %d: want "
+            "%s, got %s",
+            i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
+            ShapeUtil::HumanString(*argument_shapes[i]).c_str());
+      }
       return InvalidParameterArgument(
-          *user_computation.ParameterMetadata(i).value(),
+          *user_computation->ParameterMetadata(i).value(),
           "Argument does not match shape of computation parameter %d: want %s, "
           "got %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
@@ -307,8 +315,6 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
   if (execution_options != nullptr) {
     config->set_seed(execution_options->seed());
     config->set_debug_options(execution_options->debug_options());
-    config->enable_hlo_profiling(
-        execution_options->debug_options().xla_hlo_profile());
   } else {
     config->set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
   }
@@ -325,7 +331,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutionOptions& execution_options,
-    const UserComputation& user_computation) {
+    const UserComputation* user_computation) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
     argument_shapes.push_back(&arg->on_host_shape());
@@ -337,8 +343,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<VersionedComputationHandle> versioned_handles,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
     DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
@@ -397,6 +402,36 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
   return std::move(executables);
 }
 
+StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
+    const std::vector<const HloModuleProto*>& module_protos,
+    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
+  VLOG(1) << Printf("BuildExecutable on service %p", this);
+
+  VLOG(1) << "Computations:";
+  for (const HloModuleProto* proto : module_protos) {
+    VLOG(1) << proto->name();
+  }
+
+  CHECK_EQ(module_protos.size(), module_configs.size());
+  std::vector<std::unique_ptr<HloModule>> modules;
+  for (int64 i = 0; i < module_protos.size(); ++i) {
+    const HloModuleProto* proto = module_protos[i];
+    const HloModuleConfig& config = *module_configs[i];
+    TF_ASSIGN_OR_RETURN(auto module,
+                        HloModule::CreateFromProto(*proto, config));
+    modules.push_back(std::move(module));
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<Executable>> executables,
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
+
+  return std::move(executables);
+}
+
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -450,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+    se::StreamExecutor* executor, ExecutionProfile* profile,
     DeviceMemoryAllocator* device_allocator) {
   std::shared_ptr<Executable> executable =
       compilation_cache_.LookUp(versioned_handle, *module_config);
@@ -490,14 +525,15 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
 StatusOr<std::vector<GlobalDataHandle>>
 Service::ExecuteParallelAndRegisterResult(
     tensorflow::gtl::ArraySlice<Executable*> executables,
-    tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>> arguments,
+    tensorflow::gtl::ArraySlice<std::vector<std::vector<const ShapedBuffer*>>>
+        arguments,
     Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
     tensorflow::gtl::ArraySlice<string> result_tags,
     ExecutionProfile* profile) {
   // Streams where the computation are launched, so we can wait on the streams
   // to complete.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
-  std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
+  std::vector<std::unique_ptr<se::Timer>> timers;
 
   // Global data handles for the computation results, one for each computation.
   std::vector<GlobalDataHandle> result_handles;
@@ -513,14 +549,15 @@ Service::ExecuteParallelAndRegisterResult(
   for (int64 i = 0; i < executables.size(); i++) {
     // Stream executors for the replicas of the current computation.
     TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
+    CHECK_EQ(replicas.size(), arguments[i].size());
+    std::vector<ScopedShapedBuffer> result_buffers;
     for (int64 replica = 0; replica < replicas.size(); ++replica) {
       TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                           backend->BorrowStream(replicas[replica]));
       streams.push_back(std::move(stream));
 
       if (replica == 0 && profile != nullptr) {
-        timers.emplace_back(
-            new perftools::gputools::Timer(streams.back()->parent()));
+        timers.emplace_back(new se::Timer(streams.back()->parent()));
         streams.back()
             ->InitTimer(timers.back().get())
             .ThenStartTimer(timers.back().get());
@@ -537,7 +574,6 @@ Service::ExecuteParallelAndRegisterResult(
       ExecutableRunOptions options;
       options.set_stream(streams.back().get());
       options.set_allocator(backend->memory_allocator());
-      options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
       options.set_intra_op_thread_pool(
           backend->eigen_intra_op_thread_pool_device());
       options.set_device_assignment(&device_assignment);
@@ -545,23 +581,20 @@ Service::ExecuteParallelAndRegisterResult(
                                               backend->StreamBorrower());
 
       // Asynchronously launch the computation.
-      TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<ShapedBuffer> result,
-          executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
+      TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                          executables[i]->ExecuteAsyncOnStream(
+                              &run_options, arguments[i][replica]));
 
       if (replica == 0 && profile != nullptr) {
         streams.back()->ThenStopTimer(timers.back().get());
       }
 
-      // All replicas share the same device address for the result allocation,
-      // so only one of the replicas need to register the result handle.
-      if (replica == 0) {
-        TF_ASSIGN_OR_RETURN(
-            GlobalDataHandle handle,
-            allocation_tracker_.Register(std::move(result), result_tags[i]));
-        result_handles.push_back(handle);
-      }
+      result_buffers.emplace_back(std::move(result));
     }
+    TF_ASSIGN_OR_RETURN(GlobalDataHandle handle,
+                        allocation_tracker_.RegisterReplicatedBuffers(
+                            std::move(result_buffers), result_tags[i]));
+    result_handles.push_back(handle);
   }
 
   // Wait for all executions to complete.
@@ -627,9 +660,9 @@ Service::ExecuteParallelAndRegisterResult(
 
 StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     Executable* executable,
-    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    Backend* backend, perftools::gputools::StreamExecutor* executor,
-    const string& result_tag, ExecutionProfile* profile) {
+    const tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>>
+        arguments,
+    Backend* backend, const string& result_tag, ExecutionProfile* profile) {
   // Set up streams.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
 
@@ -654,29 +687,34 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_stream(stream.get());
     options.set_device_ordinal(stream->parent()->device_ordinal());
     options.set_allocator(backend->memory_allocator());
-    options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
     options.set_device_assignment(&device_assignment);
-    run_options.emplace_back(options, backend->StreamBorrower(),
-                             backend->inter_op_thread_pool());
+    run_options.emplace_back(
+        options, backend->StreamBorrower(),
+        /*xla_intra_op_thread_pool=*/backend->eigen_intra_op_thread_pool());
   }
 
-  std::unique_ptr<ShapedBuffer> result;
   if (options_.number_of_replicas() == 1) {
-    TF_ASSIGN_OR_RETURN(result, executable->ExecuteOnStreamWrapper(
-                                    &run_options[0], profile, arguments));
-  } else {
-    // TODO(b/69985541): Support profiling also on this path.
-    std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
-        repeated_arguments(options_.number_of_replicas(), arguments);
+    TF_ASSIGN_OR_RETURN(
+        auto result, executable->ExecuteOnStreamWrapper(&run_options[0],
+                                                        profile, arguments[0]));
+    return allocation_tracker_.Register(std::move(result), result_tag);
+  }
 
-    TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
-                                          run_options, repeated_arguments));
-    TF_RET_CHECK(!results.empty());
-    result = std::move(results[0]);
+  // TODO(b/69985541): Support profiling also on this path.
+
+  std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
+      replicated_arguments;
+  for (const auto& arg : arguments) {
+    replicated_arguments.emplace_back(arg);
   }
-  return allocation_tracker_.Register(std::move(result), result_tag);
+
+  TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
+                                        run_options, replicated_arguments));
+  TF_RET_CHECK(!results.empty());
+  return allocation_tracker_.RegisterReplicatedBuffers(std::move(results),
+                                                       result_tag);
 }
 
 tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
@@ -686,12 +724,53 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
   return computation->SetReturnValue(arg->operand());
 }
 
+StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
+    const ExecutionOptions& execution_options, int64 requests_size,
+    int64 request_index) const {
+  if (execution_options.device_handles().empty()) {
+    return FailedPrecondition(
+        "device handles must be given to execute parallel computations");
+  }
+  if (requests_size > 1 && execution_options.device_handles_size() > 1) {
+    return InvalidArgument(
+        "Parallel requests with multiple device handles is not supported. "
+        "Found %lld parallel requests, with request %lld containing %d device "
+        "handles.",
+        requests_size, request_index, execution_options.device_handles_size());
+  }
+  std::vector<se::StreamExecutor*> executors;
+  for (const auto& device_handle : execution_options.device_handles()) {
+    TF_ASSIGN_OR_RETURN(auto replicas,
+                        Replicas(*execute_backend_, device_handle));
+    se::StreamExecutor* executor = replicas[0];
+    CHECK(executor != nullptr);
+    executors.push_back(executor);
+  }
+  return executors;
+}
+
+StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
+    const ExecutionOptions& execution_options,
+    tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments) {
+  // Resolve the allocations for the arguments of the computation, and create
+  // a vector of device memory offsets for the arguments from the allocations.
+  // In the case of partitioned computations, assume all arguments go on the
+  // zeroth core.
+  TF_ASSIGN_OR_RETURN(
+      auto replicas,
+      Replicas(*execute_backend_, execution_options.device_handles(0)));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arguments, replicas));
+  return replicated_arguments;
+}
+
 tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
                                             ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
 
-  std::vector<std::vector<const ShapedBuffer*>> all_arguments;
-  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
+  std::vector<std::vector<se::StreamExecutor*>> all_executors;
   std::vector<VersionedComputationHandle> versioned_handles;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
   std::vector<string> computation_names;
@@ -714,18 +793,10 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // is one of the executors to run the replicated computation.
     const ExecutionOptions& execution_options =
         arg->requests(i).execution_options();
-    if (execution_options.device_handles().empty()) {
-      return FailedPrecondition(
-          "device handles must be given to execute parallel computations");
-    }
-    std::vector<perftools::gputools::StreamExecutor*> executors;
-    for (const auto& device_handle : execution_options.device_handles()) {
-      TF_ASSIGN_OR_RETURN(auto replicas,
-                          Replicas(*execute_backend_, device_handle));
-      se::StreamExecutor* executor = replicas[0];
-      CHECK(executor != nullptr);
-      executors.push_back(executor);
-    }
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
 
     // Resolve the UserComputation object associated with the requested
     // computation and compute the program shape.
@@ -742,27 +813,24 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
         std::shared_ptr<const ProgramShape> program_shape,
         user_computation->ComputeProgramShape(versioned_handle.version));
 
-    // Resolve the allocations for the arguments of the computation, and create
-    // a vector of device memory offsets for the arguments from the allocations.
-    // In the case of partitioned computations, assume all arguments go on the
-    // zeroth core.
-    TF_ASSIGN_OR_RETURN(
-        std::vector<const ShapedBuffer*> arguments,
-        ResolveAndValidateArguments(request.arguments(),
-                                    executors[0]->device_ordinal()));
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
 
     // Create an HloModuleConfig object for the computation, given the shape of
-    // the program and the argument allocations.
+    // the program and the argument allocations. Here, we care only about the
+    // shapes of the arguments, so, it is sufficient to use the arguments of
+    // replica 0.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(*program_shape, arguments,
-                           request.execution_options(), *user_computation));
+        CreateModuleConfig(*program_shape, replicated_arguments.front(),
+                           request.execution_options(), user_computation));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
     // Adds to the vectors to build and execute the computations after the loop.
-    all_arguments.push_back(arguments);
-    all_arguments.insert(all_arguments.end(), executors.size() - 1, {});
+    all_arguments.push_back(replicated_arguments);
+    all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}});
     versioned_handles.push_back(versioned_handle);
     module_configs.push_back(std::move(module_config));
     computation_names.insert(computation_names.end(), executors.size(),
@@ -808,6 +876,107 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) {
+  VLOG(1) << "running execute-graph-parallel request";
+
+  std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
+  std::vector<std::vector<se::StreamExecutor*>> all_executors;
+  std::vector<const HloModuleProto*> module_protos;
+  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
+  std::vector<string> computation_names;
+  std::vector<DeviceHandle> device_handles;
+
+  int num_requested_devices =
+      std::accumulate(arg->requests().begin(), arg->requests().end(), 0,
+                      [](int a, const ExecuteGraphRequest& r) -> int {
+                        return a + r.execution_options().device_handles_size();
+                      });
+  if (num_requested_devices * options_.number_of_replicas() >
+      execute_backend_->device_count()) {
+    return FailedPrecondition(
+        "there are not enough stream executors to execute %d computations",
+        num_requested_devices);
+  }
+
+  for (int64 i = 0; i < arg->requests_size(); ++i) {
+    // Get the stream executor for the i'th computation. This stream executor
+    // is one of the executors to run the replicated computation.
+    const ExecutionOptions& execution_options =
+        arg->requests(i).execution_options();
+    const ExecuteGraphRequest& request = arg->requests(i);
+    TF_RET_CHECK(request.has_computation()) << "computations may not be empty";
+    TF_RET_CHECK(request.computation().has_program_shape())
+        << "programe shape may not be empty";
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
+
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
+
+    // Create an HloModuleConfig object for the computation, given the shape of
+    // the program and the argument allocations. Here, we care only about the
+    // shapes of the arguments, so, it is sufficient to use the arguments of
+    // replica 0.
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(request.computation().program_shape(),
+                           replicated_arguments.front(),
+                           request.execution_options(),
+                           /*user_computation=*/nullptr));
+    VLOG(3)
+        << "ExecuteGraphParallel created HloModuleConfig computation layout: "
+        << module_config->entry_computation_layout().ToString();
+
+    // Adds to the vectors to build and execute the computations after the loop.
+    all_arguments.push_back(replicated_arguments);
+    all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}});
+    module_protos.push_back(&request.computation());
+    module_configs.push_back(std::move(module_config));
+    computation_names.insert(computation_names.end(), executors.size(),
+                             request.computation().name());
+    all_executors.push_back(executors);
+    device_handles.insert(device_handles.end(),
+                          execution_options.device_handles().begin(),
+                          execution_options.device_handles().end());
+  }
+
+  // Build the HloModules and compile to generate the executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteGraphParallel, so we have to pass a null device_allocator below.
+  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
+                      BuildExecutables(module_protos, std::move(module_configs),
+                                       execute_backend_.get(), all_executors,
+                                       /*device_allocator=*/nullptr));
+  std::vector<Executable*> executable_ptrs;
+  executable_ptrs.reserve(executables.size());
+  for (const auto& executable : executables) {
+    executable_ptrs.push_back(executable.get());
+  }
+
+  // Execute the generated executables in parallel and return the device
+  // handles for each computation's output.
+  ExecutionProfile profile;
+  TF_ASSIGN_OR_RETURN(
+      std::vector<GlobalDataHandle> outputs,
+      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
+                                       execute_backend_.get(), device_handles,
+                                       computation_names, &profile));
+  for (const GlobalDataHandle& output : outputs) {
+    ExecuteResponse response;
+    *response.mutable_output() = output;
+    *response.mutable_profile() = profile;
+    *result->add_responses() = response;
+  }
+
+  VLOG(1) << "successfully completed 'execute-graph-parallel' request";
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                                              GetDeviceHandlesResponse* result) {
   const int64 available_device_count = execute_backend_->device_count();
@@ -832,6 +1001,47 @@ tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg,
+                                          ExecuteResponse* result) {
+  ExecuteParallelRequest parallel_arg;
+  *parallel_arg.add_requests() = *arg;
+  ExecuteParallelResponse parallel_result;
+  TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                          ExecuteResponse* result) {
+  ExecuteGraphParallelRequest parallel_arg;
+  *parallel_arg.add_requests() = *arg;
+  ExecuteParallelResponse parallel_result;
+  TF_RETURN_IF_ERROR(ExecuteGraphParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::PickParallelResponse(
+    const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
+  // The "result device" selection is a bit hacky, but better than assuming it
+  // is device 0. We have b/76035356 for restructuring the client API to clean
+  // up the current asymmetries and support more functionalities.
+  for (int64 i = 0; i < parallel_result.responses_size(); ++i) {
+    TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
+                        allocation_tracker_.ResolveForReplica(
+                            parallel_result.responses(i).output(), 0));
+    const Shape& shape = buffer->on_host_shape();
+    if (!ShapeUtil::IsEmptyTuple(shape)) {
+      *result = parallel_result.responses(i);
+      VLOG(3) << "Fetching result from device " << i << ": "
+              << ShapeUtil::HumanString(shape);
+      return Status::OK();
+    }
+  }
+  TF_RET_CHECK(parallel_result.responses_size() > 0);
+  *result = parallel_result.responses(0);
+  VLOG(1) << "Defaulting to device 0 result";
+  return Status::OK();
+}
+
 tensorflow::Status Service::Execute(const ExecuteRequest* arg,
                                     ExecuteResponse* result) {
   VLOG(1) << "running execute request: " << arg->ShortDebugString();
@@ -848,28 +1058,25 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
 
   // If we received multiple device handles, we must partition the module.
   if (arg->execution_options().device_handles_size() > 1) {
-    ExecuteParallelRequest parallel_arg;
-    *parallel_arg.add_requests() = *arg;
-    ExecuteParallelResponse parallel_result;
-    TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
-    TF_RET_CHECK(parallel_result.responses_size() > 0);
-    *result = parallel_result.responses(0);
-    return Status::OK();
+    return ExecuteOneToN(arg, result);
   }
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<const ProgramShape> program_shape,
       user_computation->ComputeProgramShape(versioned_handle.version));
 
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
   TF_ASSIGN_OR_RETURN(
-      std::vector<const ShapedBuffer*> arguments,
-      ResolveAndValidateArguments(arg->arguments(),
-                                  execute_backend_->default_device_ordinal()));
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arg->arguments(), replicas));
 
+  // Since we care only about the shapes of the arguments, it is sufficient to
+  // use the arguments of replica 0.
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
-                         *user_computation));
+      CreateModuleConfig(*program_shape, replicated_arguments.front(),
+                         arg->execution_options(), user_computation));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -885,20 +1092,21 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
     executable->session_module()->set_execution_platform(
         execute_backend_->platform()->Name());
     TF_RETURN_IF_ERROR(RecordArguments(
-        arguments, execute_backend_->default_stream_executor(),
+        replicated_arguments.front(),
+        execute_backend_->default_stream_executor(),
         execute_backend_->transfer_manager(), executable->session_module()));
   }
 
   TF_ASSIGN_OR_RETURN(
       *result->mutable_output(),
       ExecuteAndRegisterResult(
-          executable.get(), arguments, execute_backend_.get(),
-          execute_backend_->default_stream_executor(),
+          executable.get(), replicated_arguments, execute_backend_.get(),
           "result of " + user_computation->name(), result->mutable_profile()));
 
   if (executable->dumping()) {
-    TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer,
-                        allocation_tracker_.Resolve(result->output()));
+    TF_ASSIGN_OR_RETURN(
+        const ShapedBuffer* result_buffer,
+        allocation_tracker_.ResolveForReplica(result->output(), 0));
     TF_RETURN_IF_ERROR(RecordResult(
         *result_buffer, execute_backend_->default_stream_executor(),
         execute_backend_->transfer_manager(), executable->session_module()));
@@ -909,6 +1117,74 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
   return tensorflow::Status::OK();
 }
 
+StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
+    const HloModuleProto& module_proto,
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
+  VLOG(1) << Printf(
+      "BuildExecutable on service %p with serialized module proto: %s", this,
+      module_proto.name().c_str());
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(module_proto, *module_config));
+
+  TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
+
+  TF_ASSIGN_OR_RETURN(
+      module, backend->compiler()->RunHloPasses(std::move(module), executor,
+                                                device_allocator));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      backend->compiler()->RunBackend(
+                          std::move(module), executor, device_allocator));
+
+  return std::move(executable);
+}
+
+tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
+                                         ExecuteResponse* result) {
+  VLOG(1) << "running execute-graph request";
+
+  if (!arg->has_computation()) {
+    return InvalidArgument("computations may not be empty");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("programe shape may not be empty");
+  }
+
+  // If we received multiple device handles, we must partition the module.
+  if (arg->execution_options().device_handles_size() > 1) {
+    return ExecuteOneToN(arg, result);
+  }
+
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arg->arguments(), replicas));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
+                      CreateModuleConfig(arg->computation().program_shape(),
+                                         replicated_arguments.front(),
+                                         arg->execution_options()));
+
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      BuildExecutable(arg->computation(), std::move(module_config),
+                      execute_backend_.get(),
+                      execute_backend_->default_stream_executor(),
+                      /*device_allocator=*/nullptr));
+
+  TF_ASSIGN_OR_RETURN(
+      *result->mutable_output(),
+      ExecuteAndRegisterResult(
+          executable.get(), replicated_arguments, execute_backend_.get(),
+          "result of " + arg->computation().name(), result->mutable_profile()));
+
+  VLOG(1) << "successfully completed 'execute-graph' request";
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
                                          ExecuteAsyncResponse* result) {
   VLOG(1) << "running execute-async request: " << arg->ShortDebugString();
@@ -926,15 +1202,17 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       std::shared_ptr<const ProgramShape> program_shape,
       user_computation->ComputeProgramShape(versioned_handle.version));
 
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
+  TF_RET_CHECK(!replicas.empty());
   TF_ASSIGN_OR_RETURN(
-      std::vector<const ShapedBuffer*> arguments,
-      ResolveAndValidateArguments(arg->arguments(),
-                                  execute_backend_->default_device_ordinal()));
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arg->arguments(), replicas));
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
-                         *user_computation));
+      CreateModuleConfig(*program_shape, replicated_arguments.front(),
+                         arg->execution_options(), user_computation));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -947,45 +1225,37 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
           versioned_handle, std::move(module_config), execute_backend_.get(),
           execute_backend_->default_stream_executor(), &profile));
 
-  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
-                                              SingleComputationDeviceHandle()));
-  TF_RET_CHECK(!replicas.empty());
-
   // Set up streams.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
-
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                         execute_backend_->BorrowStream(executor));
     streams.push_back(std::move(stream));
   }
 
-  std::unique_ptr<ShapedBuffer> result_buffer;
-  for (const Pool<se::Stream>::SmartPtr& stream : streams) {
+  std::vector<ScopedShapedBuffer> result_buffers;
+  for (size_t i = 0; i < streams.size(); ++i) {
+    const auto& stream = streams[i];
     ExecutableRunOptions options;
     options.set_stream(stream.get());
     options.set_allocator(execute_backend_->memory_allocator());
-    options.set_inter_op_thread_pool(execute_backend_->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         execute_backend_->eigen_intra_op_thread_pool_device());
 
     ServiceExecutableRunOptions service_options(
         options, execute_backend_->StreamBorrower());
 
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ShapedBuffer> this_result_buffer,
-        executable->ExecuteAsyncOnStream(&service_options, arguments));
+    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer,
+                        executable->ExecuteAsyncOnStream(
+                            &service_options, replicated_arguments[i]));
 
-    // Take the first result.
-    if (result_buffer == nullptr) {
-      result_buffer = std::move(this_result_buffer);
-    }
+    result_buffers.emplace_back(std::move(this_result_buffer));
   }
 
   TF_ASSIGN_OR_RETURN(
       GlobalDataHandle output,
-      allocation_tracker_.Register(std::move(result_buffer),
-                                   "result of " + user_computation->name()));
+      allocation_tracker_.RegisterReplicatedBuffers(
+          std::move(result_buffers), "result of " + user_computation->name()));
 
   *result->mutable_execution() = execution_tracker_.Register(
       execute_backend_.get(), std::move(streams), profile, output);
@@ -1013,7 +1283,7 @@ tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
 tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
                                              TransferToClientResponse* result) {
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer,
-                      allocation_tracker_.Resolve(arg->data()));
+                      allocation_tracker_.ResolveForReplica(arg->data(), 0));
 
   const Shape* return_shape;
   if (arg->has_shape_with_layout()) {
@@ -1074,37 +1344,24 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
         replicas, Replicas(*execute_backend_, SingleComputationDeviceHandle()));
   }
 
-  // All memory allocation is done on the first replica. The allocations in all
-  // other replicas mirror the firsts'.
-  int master_device_ordinal = replicas[0]->device_ordinal();
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> shaped_buffer,
-      execute_backend_->transfer_manager()->AllocateShapedBuffer(
-          shape, execute_backend_->memory_allocator(), master_device_ordinal));
-
-  // Transfer the data to the replicas.
+  // Allocate memory in each replica and transfer the data to all replicas.
+  std::vector<ScopedShapedBuffer> replicated_buffers;
   for (se::StreamExecutor* executor : replicas) {
-    if (executor->device_ordinal() == master_device_ordinal) {
-      TF_RETURN_IF_ERROR(
-          execute_backend_->transfer_manager()->TransferLiteralToDevice(
-              executor, *literal, *shaped_buffer));
-    } else {
-      // The replica is not the master. Create an cloned shaped buffer with
-      // the replica's device ordinal. This is required because
-      // TransferLiteralToDevice verifies that the device ordinal of the shaped
-      // buffer matches that of the executor.
-      std::unique_ptr<ShapedBuffer> clone =
-          CloneShapedBufferOnDevice(*shaped_buffer, executor->device_ordinal());
-      TF_RETURN_IF_ERROR(
-          execute_backend_->transfer_manager()->TransferLiteralToDevice(
-              executor, *literal, *clone));
-    }
+    TF_ASSIGN_OR_RETURN(
+        ScopedShapedBuffer shaped_buffer,
+        execute_backend_->transfer_manager()->AllocateScopedShapedBuffer(
+            shape, execute_backend_->memory_allocator(),
+            executor->device_ordinal()));
+    TF_RETURN_IF_ERROR(
+        execute_backend_->transfer_manager()->TransferLiteralToDevice(
+            executor, *literal, shaped_buffer));
+    replicated_buffers.emplace_back(std::move(shaped_buffer));
   }
-  TF_ASSIGN_OR_RETURN(
-      *result->mutable_data(),
-      allocation_tracker_.Register(std::move(shaped_buffer),
-                                   StrCat("TransferToServer literal of shape ",
-                                          ShapeUtil::HumanString(shape))));
+  TF_ASSIGN_OR_RETURN(*result->mutable_data(),
+                      allocation_tracker_.RegisterReplicatedBuffers(
+                          std::move(replicated_buffers),
+                          StrCat("TransferToServer literal of shape ",
+                                 ShapeUtil::HumanString(shape))));
 
   return tensorflow::Status::OK();
 }
@@ -1255,7 +1512,7 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                       CreateModuleConfig(program_shape, {}, execution_options,
-                                         *user_computation));
+                                         user_computation));
 
   // Exclude dead parameter instructions for the purpose of computing constants.
   TF_ASSIGN_OR_RETURN(
@@ -1276,6 +1533,50 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
+  if (arg->has_output_layout()) {
+    result_literal = result_literal->Relayout(arg->output_layout());
+  }
+  *result->mutable_literal() = result_literal->ToProto();
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Service::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) {
+  if (!arg->has_computation()) {
+    return InvalidArgument("computations may not be empty");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("program shape may not be empty");
+  }
+  if (arg->computation().program_shape().parameters_size() != 0) {
+    return InvalidArgument(
+        "constant computation may not depend on any parameters.");
+  }
+
+  ProgramShape program_shape = arg->computation().program_shape();
+  TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
+  if (arg->has_output_layout()) {
+    TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
+        arg->output_layout(), program_shape.result()));
+  }
+
+  HloModuleConfig config(program_shape);
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  HloEvaluator evaluator;
+  TF_ASSIGN_OR_RETURN(auto result_literal,
+                      evaluator.Evaluate<std::unique_ptr<Literal>>(
+                          *module, /*arg_literals=*/{}));
+
+  // Since the result layout is non-effective to the Evaluator results, explicit
+  // relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
   if (arg->has_output_layout()) {
     result_literal = result_literal->Relayout(arg->output_layout());
   }
@@ -1287,7 +1588,7 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 tensorflow::Status Service::GetShape(const GetShapeRequest* arg,
                                      GetShapeResponse* result) {
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
-                      allocation_tracker_.Resolve(arg->data()));
+                      allocation_tracker_.ResolveForReplica(arg->data(), 0));
   *result->mutable_shape() = buffer->on_host_shape();
   return tensorflow::Status::OK();
 }
@@ -1347,6 +1648,36 @@ tensorflow::Status Service::GetComputationStats(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
+  if (!arg->has_computation()) {
+    return InvalidArgument("Computations may not be empty.");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("Program shape may not be empty.");
+  }
+
+  HloModuleConfig config(arg->computation().program_shape());
+  config.set_debug_options(arg->debug_options());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  hlo_graph_dumper::MaybeDumpHloModule(*module,
+                                       "computation statistics subject");
+
+  // Run HLO analysis to get the computation statistics.
+  HloCostAnalysis analysis(
+      execute_backend_->compiler()->ShapeSizeBytesFunction());
+
+  TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&analysis));
+
+  ComputationStats stats;
+  stats.set_flop_count(analysis.flop_count());
+  stats.set_transcendental_count(analysis.transcendental_count());
+  *result->mutable_stats() = stats;
+  return tensorflow::Status::OK();
+}
+
 template <typename RequestT, typename ResponseT>
 tensorflow::Status Service::AddInstruction(
     const RequestT* arg, ResponseT* result,
@@ -1556,8 +1887,10 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
     case OpRequest::kSendRequest: {
       TF_RETURN_IF_ERROR(
           channel_tracker_.RegisterSend(arg->send_request().channel_handle()));
-      TF_RETURN_IF_ERROR(computation->AddSendInstruction(arg->send_request()));
-      return tensorflow::Status::OK();
+      // Send does not return a value, but we need a handle to be able to
+      // set OpMetadata and OpSharding (device assignment).
+      handle_status = computation->AddSendInstruction(arg->send_request());
+      break;
     }
     case OpRequest::kRecvRequest: {
       TF_RETURN_IF_ERROR(
@@ -1609,9 +1942,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const {
   return device_handle;
 }
 
-StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
+StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
     const Backend& backend, const DeviceHandle& device_handle) const {
-  std::vector<perftools::gputools::StreamExecutor*> replicas;
+  std::vector<se::StreamExecutor*> replicas;
   for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
     // From the computation placer, find out the device ids of the replicas for
     // the given device handle.
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 6ce241971156599aaa25aea1b0caac0e1bd5379c..476bd0597de735a9f777be78f5ab01dac1188525 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -53,8 +53,8 @@ namespace xla {
 class ServiceOptions {
  public:
   // Set the platform backing the service, or nullptr for the default platform.
-  ServiceOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  ServiceOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
   // programs.
@@ -66,7 +66,7 @@ class ServiceOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
+  se::Platform* platform_ = nullptr;
   int number_of_replicas_ = 1;
   int intra_op_parallelism_threads_ = -1;
 };
@@ -79,7 +79,7 @@ class Service : public ServiceInterface {
  public:
   // Factory method for creating a new Service.
   static StatusOr<std::unique_ptr<Service>> NewService(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
   static StatusOr<std::unique_ptr<Service>> NewService(
       const ServiceOptions& options);
 
@@ -112,12 +112,29 @@ class Service : public ServiceInterface {
   tensorflow::Status Execute(const ExecuteRequest* arg,
                              ExecuteResponse* result) override;
 
+  // Executes a computation with the provided global data passed as
+  // immutable arguments. The request contains the whole computation graph.
+  // Returns global data output and execution timing.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg,
+                                  ExecuteResponse* result) override;
+
   // Executes one or more computations in parallel with the provided global data
   // passed as immutable arguments. Returns global data output for each
   // computation.
   tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
                                      ExecuteParallelResponse* result) override;
 
+  // Executes one or more computations in parallel with the provided global data
+  // passed as immutable arguments. Returns global data output for each
+  // computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* arg,
+      ExecuteParallelResponse* result) override;
+
   // Requests one or more device handles from the target.
   //
   // When N device handles are requested and the number of replicas is R, at
@@ -189,6 +206,9 @@ class Service : public ServiceInterface {
   // Computes the value of a constant expression.
   tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
                                      ComputeConstantResponse* result) override;
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
 
   // Returns the shape (with layout) of an array associated with a given data
   // handle.
@@ -216,6 +236,13 @@ class Service : public ServiceInterface {
       const ComputationStatsRequest* arg,
       ComputationStatsResponse* result) override;
 
+  // Retrieves the statistics of a computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
   // Snapshots the current state of a computation handle into a serializable
   // protocol buffer form, so it can be loaded via
   // LoadComputationSnapshot.
@@ -252,7 +279,21 @@ class Service : public ServiceInterface {
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutionOptions& execution_options,
-      const UserComputation& user_computation);
+      const UserComputation* user_computation = nullptr);
+
+  // Picks a parallel response and fills the result.
+  Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
+                              ExecuteResponse* result);
+
+  // Prepare the executors for executing parallel.
+  StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
+      const ExecutionOptions& execution_options, int64 requests_size,
+      int64 request_index) const;
+
+  // Prepare the arguments for executing parallel.
+  StatusOr<std::vector<std::vector<const ShapedBuffer*>>> GetArguments(
+      const ExecutionOptions& execution_options,
+      tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments);
 
  protected:
   friend class LocalExecutable;
@@ -262,14 +303,14 @@ class Service : public ServiceInterface {
   Service(const ServiceOptions& options,
           std::unique_ptr<Backend> execute_backend);
 
-  static StatusOr<std::unique_ptr<Backend>> CreateComputeConstantBackend();
-
   // Resolves the given argument handles in the allocation tracker and returns
-  // the corresponding allocations. The function also verifies that each
-  // allocation matches the execution platform and device ordinal.
-  StatusOr<std::vector<const ShapedBuffer*>> ResolveAndValidateArguments(
+  // the corresponding allocations for every replica. The function also verifies
+  // that each allocation matches the execution platform and device ordinal of
+  // the corresponding replica.
+  StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
+  ResolveAndValidateArguments(
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      int device_ordinal);
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
@@ -277,7 +318,7 @@ class Service : public ServiceInterface {
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
       const ExecutionOptions* execution_options,
-      const UserComputation& user_computation);
+      const UserComputation* user_computation = nullptr);
 
   // Builds an Executable for the given parameters.
   //
@@ -287,7 +328,16 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator = nullptr);
+
+  // Builds an Executable for the given HLO module proto.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Executable>> BuildExecutable(
+      const HloModuleProto& module_proto,
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
@@ -295,8 +345,12 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
       std::vector<VersionedComputationHandle> versioned_handles,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-      Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
+  StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
+      const std::vector<const HloModuleProto*>& module_protos,
+      std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
@@ -305,7 +359,7 @@ class Service : public ServiceInterface {
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+      se::StreamExecutor* executor, ExecutionProfile* profile,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Runs the given executable with the given arguments and register the result
@@ -314,16 +368,17 @@ class Service : public ServiceInterface {
   // ExecutionProfile object which will be filled in with profile data.
   StatusOr<GlobalDataHandle> ExecuteAndRegisterResult(
       Executable* executable,
-      const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      Backend* backend, perftools::gputools::StreamExecutor* executor,
-      const string& result_tag, ExecutionProfile* profile);
+      const tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>>
+          arguments,
+      Backend* backend, const string& result_tag, ExecutionProfile* profile);
 
   // Runs the given executables with the given arguments and register the result
   // from each executable in the allocation tracker. The handles of the result
   // from the tracker are returned.
   StatusOr<std::vector<GlobalDataHandle>> ExecuteParallelAndRegisterResult(
       tensorflow::gtl::ArraySlice<Executable*> executables,
-      tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>> arguments,
+      tensorflow::gtl::ArraySlice<std::vector<std::vector<const ShapedBuffer*>>>
+          arguments,
       Backend* backend,
       tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
       tensorflow::gtl::ArraySlice<string> result_tags,
@@ -336,6 +391,14 @@ class Service : public ServiceInterface {
       const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
           adder);
 
+  // Executes a single computation which has more than one target device.
+  // The N devices are expected to all return an empty tuple, but one, which
+  // will be the result of this computation.
+  tensorflow::Status ExecuteOneToN(const ExecuteRequest* arg,
+                                   ExecuteResponse* result);
+  tensorflow::Status ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                   ExecuteResponse* result);
+
   // Convenience function which checks whether the given shape_with_layout
   // (presumably passed by the client to set the result layout) is valid for the
   // given computation result shape.
@@ -345,7 +408,7 @@ class Service : public ServiceInterface {
   // Returns the stream executors assigned to the replicas represented by the
   // given device handle. Each device_handle is a virtual replicated device that
   // represents a set of physical devices for the replicas.
-  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
+  StatusOr<std::vector<se::StreamExecutor*>> Replicas(
       const Backend& backend, const DeviceHandle& device_handle) const;
 
   Status MaybeDumpHloModule(const HloModule& module) const;
@@ -372,8 +435,6 @@ class Service : public ServiceInterface {
   CompilationCache compilation_cache_;
 
   // Backend to compile and execute computations on.
-  //
-  // TODO(b/28616830): Support multiple backends for execution.
   std::unique_ptr<Backend> execute_backend_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Service);
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 6c1f8feac7ed4423051cf2737be57dcfab508671..7f3910cdb0366078b97fb5f6a2dc498b37570926 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -28,7 +28,7 @@ namespace xla {
 class ServiceExecutableRunOptions {
  public:
   using StreamBorrower =
-      std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
+      std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;
 
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@@ -45,14 +45,13 @@ class ServiceExecutableRunOptions {
   ExecutableRunOptions* mutable_run_options() { return &run_options_; }
 
   // Delegate to `ExecutableRunOptions` member.
-  perftools::gputools::Stream* stream() const { return run_options_.stream(); }
+  se::Stream* stream() const { return run_options_.stream(); }
   DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
   int device_ordinal() const { return run_options_.device_ordinal(); }
 
   // Borrows a stream and returns a smart pointer which returns the stream on
   // destruction.
-  StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
-      int device_ordinal) const {
+  StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
     return borrow_stream_
                ? borrow_stream_(device_ordinal)
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index c9692757b27980b10a5ca562223c3d0f6462d820..48b2922e77b78719e5d3469cbaa4fc15969de91b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -52,6 +52,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) {
       return UNOP_ABS;
     case HloOpcode::kCeil:
       return UNOP_CEIL;
+    case HloOpcode::kClz:
+      return UNOP_CLZ;
     case HloOpcode::kCos:
       return UNOP_COS;
     case HloOpcode::kExp:
@@ -169,11 +171,11 @@ bool AllUnique(tensorflow::gtl::ArraySlice<int64> slice) {
 tensorflow::Status ExpectNotTupleOrOpaque(const Shape& shape,
                                           tensorflow::StringPiece op_type) {
   if (ShapeUtil::IsTuple(shape)) {
-    return InvalidArgument("Expected non-tuple argument for %s. Got: %s",
+    return InvalidArgument("Expected non-tuple argument for %s, but got %s.",
                            op_type.ToString().c_str(),
                            ShapeUtil::HumanString(shape).c_str());
   } else if (ShapeUtil::IsOpaque(shape)) {
-    return InvalidArgument("Expected non-opaque argument for %s. Got: %s",
+    return InvalidArgument("Expected non-opaque argument for %s, but got %s.",
                            op_type.ToString().c_str(),
                            ShapeUtil::HumanString(shape).c_str());
   } else {
@@ -193,8 +195,10 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
 
   const Shape& accumulator_shape = reducer_shape.result();
   if (ShapeUtil::Rank(accumulator_shape) != 0) {
-    return Unimplemented(
-        "Reduction function currently must have rank-0 result.");
+    return InvalidArgument(
+        "Reduction function must have rank 0 (rank %lld reduction function "
+        "given).",
+        ShapeUtil::Rank(accumulator_shape));
   }
 
   // Check that the accumulator can be passed in as the first argument.
@@ -235,8 +239,8 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
   if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape,
                                                 reducer_shape.parameters(1))) {
     return InvalidArgument(
-        "Reduction function's second parameter shape currently must "
-        "match the result shape. Got %s vs %s",
+        "Reduction function's second parameter shape must "
+        "match the result shape, but got %s vs %s.",
         ShapeUtil::HumanString(reducer_shape.parameters(1)).c_str(),
         ShapeUtil::HumanString(accumulator_shape).c_str());
   }
@@ -258,29 +262,29 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   for (int64 i = 0; i < window.dimensions_size(); ++i) {
     const auto& dim = window.dimensions(i);
     if (dim.size() <= 0) {
-      return InvalidArgument("Window has a non-positive dimension. Window: %s",
+      return InvalidArgument("Window %s has a non-positive dimension.",
                              window.DebugString().c_str());
     }
     if (dim.stride() <= 0) {
-      return InvalidArgument("Window has a non-positive stride. Window: %s",
+      return InvalidArgument("Window %s has a non-positive stride.",
                              window.DebugString().c_str());
     }
     if (!allow_negative_padding && dim.padding_low() < 0) {
-      return InvalidArgument("Window has a negative low padding. Window: %s",
+      return InvalidArgument("Window %s has a negative low padding.",
                              window.DebugString().c_str());
     }
     if (!allow_negative_padding && dim.padding_high() < 0) {
-      return InvalidArgument("Window has a negative high padding. Window: %s",
+      return InvalidArgument("Window %s has a negative high padding.",
                              window.DebugString().c_str());
     }
     if (dim.base_dilation() < 1) {
       return InvalidArgument(
-          "Window has a non-positive base area dilation factor. Window: %s",
+          "Window %s has a non-positive base area dilation factor.",
           window.DebugString().c_str());
     }
     if (dim.window_dilation() < 1) {
       return InvalidArgument(
-          "Window has a non-positive window dilation factor. Window: %s",
+          "Window %s has a non-positive window dilation factor.",
           window.DebugString().c_str());
     }
 
@@ -302,12 +306,17 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
 /* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
     HloOpcode opcode, const HloInstruction* operand) {
+  return InferUnaryOpShape(opcode, operand->shape());
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
+    HloOpcode opcode, const Shape& shape) {
   // There is no copy operation at the proto level, so handle copy explicitly.
   if (opcode == HloOpcode::kCopy) {
-    return operand->shape();
+    return shape;
   }
 
-  return InferUnaryOpShape(OpcodeToUnaryOperation(opcode), operand->shape());
+  return InferUnaryOpShape(OpcodeToUnaryOperation(opcode), shape);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
@@ -320,8 +329,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     case UNOP_CEIL:
       if (!ShapeUtil::ElementIsFloating(arg)) {
         return InvalidArgument(
-            "expected element type in shape to be floating for floor/ceil "
-            "operation; got %s",
+            "Expected element type in shape to be floating for floor/ceil "
+            "operation; got %s.",
             PrimitiveType_Name(arg.element_type()).c_str());
       }
       return arg;
@@ -333,8 +342,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
       if (!ShapeUtil::ElementIsFloating(arg) &&
           !ShapeUtil::ElementIsComplex(arg)) {
         return InvalidArgument(
-            "expected element type in shape to be floating or complex for "
-            "sin/cos/exp/log/tanh operation; got %s",
+            "Expected element type in shape to be floating or complex for "
+            "sin/cos/exp/log/tanh operation; got %s.",
             PrimitiveType_Name(arg.element_type()).c_str());
       }
       return arg;
@@ -342,8 +351,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     case UNOP_IMAG:
       if (!ShapeUtil::ElementIsComplex(arg)) {
         return InvalidArgument(
-            "expected element type in shape to be complex for real/imag "
-            "operation; got %s",
+            "Expected element type in shape to be complex for real/imag "
+            "operation; got %s.",
             PrimitiveType_Name(arg.element_type()).c_str());
       }
       return ShapeUtil::ChangeElementType(arg, F32);
@@ -353,6 +362,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
             arg, primitive_util::ComplexComponentType(arg.element_type()));
       }
       return arg;
+    case UNOP_CLZ:
     case UNOP_NEGATE:
     case UNOP_ROUND_NEAREST_AFZ:
     case UNOP_SIGN:
@@ -363,8 +373,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
       if (arg.element_type() != PRED &&
           !primitive_util::IsIntegralType(arg.element_type())) {
         return InvalidArgument(
-            "expected pred or an integral element type in argument to not "
-            "operation; got %s",
+            "Expected pred or an integral element type in argument to Not "
+            "operation; got %s.",
             PrimitiveType_Name(arg.element_type()).c_str());
       }
       return arg;
@@ -372,8 +382,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     case UNOP_IS_FINITE:
       if (!ShapeUtil::ElementIsFloating(arg)) {
         return InvalidArgument(
-            "expected element type in shape to be floating point for IsFinite "
-            "operation; got %s",
+            "Expected element type in shape to be floating point for IsFinite "
+            "operation; got %s.",
             PrimitiveType_Name(arg.element_type()).c_str());
       }
       return ShapeUtil::ChangeElementType(arg, PRED);
@@ -389,10 +399,10 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
     const int64 dimension) {
   if (arg_shapes.empty()) {
-    return InvalidArgument("Concatenate expects at least one argument");
+    return InvalidArgument("Concatenate expects at least one argument.");
   }
   if (dimension < 0 || dimension >= ShapeUtil::Rank(*arg_shapes[0])) {
-    return InvalidArgument("dimension to concatenate along out of bounds: %lld",
+    return InvalidArgument("Concatenate dimension out of bounds: %lld.",
                            dimension);
   }
   const Shape* arg_shape = nullptr;
@@ -408,14 +418,14 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
       return InvalidArgument(
           "Cannot concatenate arrays with different ranks: %lld (%s) vs %lld "
-          "(%s)",
+          "(%s).",
           ShapeUtil::Rank(*arg_shape),
           ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape),
           ShapeUtil::HumanString(*shape).c_str());
     }
     if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shape, *shape)) {
       return InvalidArgument(
-          "cannot concatenate arrays with different element types: %s vs %s",
+          "Cannot concatenate arrays with different element types: %s vs %s.",
           PrimitiveType_Name(arg_shape->element_type()).c_str(),
           PrimitiveType_Name(shape->element_type()).c_str());
     }
@@ -428,9 +438,9 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                      // concatenating.
         }
         return InvalidArgument(
-            "cannot concatenate arrays that differ in dimensions other than "
+            "Cannot concatenate arrays that differ in dimensions other than "
             "the one being concatenated (the other array dimensions must be "
-            "the same): %s vs %s in dimension %lld",
+            "the same): %s vs %s in dimension %lld.",
             ShapeUtil::HumanString(*arg_shape).c_str(),
             ShapeUtil::HumanString(*shape).c_str(), dimension);
       }
@@ -452,7 +462,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   if (primitive_util::IsComplexType(old_element_type) &&
       !primitive_util::IsComplexType(new_element_type)) {
     return Unimplemented(
-        "Unsupported conversion from complex to real type: %s => %s",
+        "Conversion from complex to real type %s => %s is not implemented.",
         ShapeUtil::HumanString(operand_shape).c_str(),
         PrimitiveType_Name(new_element_type).c_str());
   }
@@ -461,7 +471,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     // future, by recursing into the tuple elements to check all sub-conversions
     // are valid. For now we just reject them, though.
     return InvalidArgument(
-        "cannot convert from or to tuple type; requested conversion: %s => %s",
+        "Convert does not allow tuples, so cannot convert from %s to %s.",
         ShapeUtil::HumanString(operand_shape).c_str(),
         PrimitiveType_Name(new_element_type).c_str());
   }
@@ -474,24 +484,23 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   auto old_element_type = operand_shape.element_type();
   if (primitive_util::IsComplexType(old_element_type) !=
       primitive_util::IsComplexType(new_element_type)) {
-    return Unimplemented(
-        "Unsupported conversion between real and complex types: %s => %s",
-        ShapeUtil::HumanString(operand_shape).c_str(),
-        PrimitiveType_Name(new_element_type).c_str());
+    return InvalidArgument("Conversion from complex to real type %s => %s.",
+                           ShapeUtil::HumanString(operand_shape).c_str(),
+                           PrimitiveType_Name(new_element_type).c_str());
   }
   if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
     // are valid. For now we just reject them, though.
     return InvalidArgument(
-        "cannot convert from or to tuple type; requested conversion: %s => %s",
+        "Cannot convert from or to tuple type; requested conversion: %s => %s.",
         ShapeUtil::HumanString(operand_shape).c_str(),
         PrimitiveType_Name(new_element_type).c_str());
   }
   if (primitive_util::BitWidth(old_element_type) !=
       primitive_util::BitWidth(new_element_type)) {
     return InvalidArgument(
-        "cannot bitcast types with different bit-widths: %s => %s",
+        "Cannot bitcast types with different bit-widths: %s => %s.",
         PrimitiveType_Name(old_element_type).c_str(),
         PrimitiveType_Name(new_element_type).c_str());
   }
@@ -504,20 +513,20 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     const int mantissa_bits) {
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
     return InvalidArgument(
-        "expected element type in shape to be floating point for "
-        "ReducePrecision operation; got %s",
+        "Expected element type in shape to be floating point for "
+        "ReducePrecision operation; got %s.",
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
   if (exponent_bits < 1) {
     // One exponent bit is necessary to distinguish 0 from infinity.  Having
     // no exponent bits doesn't produce a sensible number, so we require at
     // least one.
-    return InvalidArgument("expected exponent_bits >= 1; got %d",
+    return InvalidArgument("Expected exponent_bits >= 1; got %d.",
                            exponent_bits);
   }
   if (mantissa_bits < 0) {
     // A number with no mantissa bits is still meaningful, however.
-    return InvalidArgument("expected non-negative mantissa_bits; got %d",
+    return InvalidArgument("Expected non-negative mantissa_bits; got %d.",
                            mantissa_bits);
   }
   return operand_shape;
@@ -528,23 +537,23 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     const PaddingConfig& padding_config) {
   if (ShapeUtil::IsTuple(operand_shape)) {
     return InvalidArgument(
-        "pad operation does not support tuple-shape operands");
+        "Pad operation does not support tuple-shape operands.");
   }
   if (!ShapeUtil::IsScalar(padding_value_shape)) {
     return InvalidArgument(
-        "pad operation does not support non-scalar padding values");
+        "Pad operation does not support non-scalar padding values.");
   }
   if (ShapeUtil::Rank(operand_shape) != padding_config.dimensions_size()) {
     return InvalidArgument(
         "The rank of the operand and the padding configuration do not match: "
-        "%s vs %s",
+        "%s vs %s.",
         ShapeUtil::HumanString(operand_shape).c_str(),
         padding_config.ShortDebugString().c_str());
   }
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape,
                                                      padding_value_shape)) {
     return InvalidArgument(
-        "the element types of the operands to pad do not match");
+        "The element types of the operands to Pad do not match.");
   }
   std::vector<int64> dimensions(ShapeUtil::Rank(operand_shape));
   for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) {
@@ -605,7 +614,7 @@ Status ValidateDotDimensionNumbers(
                      lhs_batch_dimensions) ||
       !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions,
                      rhs_batch_dimensions)) {
-    return InvalidArgument("A dimension number is out of range in dot: %s",
+    return InvalidArgument("A dimension number is out of range in Dot: %s.",
                            dimension_numbers.DebugString().c_str());
   }
 
@@ -623,7 +632,7 @@ Status ValidateDotDimensionNumbers(
 
   if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
       !dims_unique(rhs_contracting_dimensions, rhs_batch_dimensions)) {
-    return InvalidArgument("A dimension number is not unique in dot: %s",
+    return InvalidArgument("A dimension number is not unique in Dot: %s.",
                            dimension_numbers.DebugString().c_str());
   }
 
@@ -641,8 +650,7 @@ Status ValidateDotDimensionNumbers(
       rhs_non_contracting_non_batch_dims < 0 ||
       rhs_non_contracting_non_batch_dims > 1) {
     return InvalidArgument(
-        "batch and contracting dimension number mismatch "
-        "with rank ");
+        "Batch and contracting dimension number mismatch with rank.");
   }
 
   // Check that batch dimension numbers are ordered before all others, and
@@ -654,7 +662,7 @@ Status ValidateDotDimensionNumbers(
       !std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
                   rhs_batch_dimensions.begin())) {
     return InvalidArgument(
-        "batch dimension numbers must precede non-batch dimensions and be"
+        "Batch dimension numbers must precede non-batch dimensions and be"
         "monotonically increasing.");
   }
 
@@ -671,22 +679,22 @@ Status ValidateDotDimensionNumbers(
 
   auto fail = [lhs, rhs](const string& addendum) -> Status {
     string message = tensorflow::strings::Printf(
-        "cannot infer shape for dot operation: %s <dot> %s",
+        "Cannot infer shape for dot operation: %s <dot> %s.",
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str());
     if (!addendum.empty()) {
-      message += ": " + addendum;
+      message += " " + addendum;
     }
     return InvalidArgument("%s", message.c_str());
   };
 
   // Check if both element types are the same.
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
-    return fail("element types do not match");
+    return fail("Element types do not match.");
   }
 
   if ((ShapeUtil::Rank(lhs) < 1) || (ShapeUtil::Rank(rhs) < 1)) {
-    return fail("dot only supports rank 1 or above.");
+    return fail("Dot only supports rank 1 or above.");
   }
 
   // Validate basic properties of dot dimension numbers.
@@ -696,7 +704,7 @@ Status ValidateDotDimensionNumbers(
   if (dimension_numbers.lhs_contracting_dimensions_size() !=
           dimension_numbers.rhs_contracting_dimensions_size() ||
       dimension_numbers.lhs_contracting_dimensions_size() != 1) {
-    return fail("must specify one contracting dimension for both lhs and rhs.");
+    return fail("Must specify one contracting dimension for both lhs and rhs.");
   }
 
   // Check that contracting dimension sizes match.
@@ -706,13 +714,13 @@ Status ValidateDotDimensionNumbers(
       dimension_numbers.rhs_contracting_dimensions(0);
   if (lhs.dimensions(lhs_contracting_dimension) !=
       rhs.dimensions(rhs_contracting_dimension)) {
-    return fail("contracting dimension sizes do not match.");
+    return fail("Contracting dimension sizes do not match.");
   }
 
   // Check that number of batch dimensions match.
   if (dimension_numbers.lhs_batch_dimensions_size() !=
       dimension_numbers.rhs_batch_dimensions_size()) {
-    return fail("must the same number of batch dimensions for lhs and rhs.");
+    return fail("Must the same number of batch dimensions for lhs and rhs.");
   }
 
   // Check that batch dimension numbers and sizes match.
@@ -721,7 +729,7 @@ Status ValidateDotDimensionNumbers(
             dimension_numbers.rhs_batch_dimensions(i) ||
         lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
             rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
-      return fail("batch dimension numbers and sizes must match for lhs/rhs.");
+      return fail("Batch dimension numbers and sizes must match for lhs/rhs.");
     }
   }
 
@@ -770,10 +778,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     } else if (rhs.dimensions(i) == 1) {
       output_dimensions[i] = lhs.dimensions(i);
     } else {
-      return InvalidArgument("binary op %s with incompatible shapes: %s and %s",
-                             BinaryOperation_Name(operation).c_str(),
-                             ShapeUtil::HumanString(lhs).c_str(),
-                             ShapeUtil::HumanString(rhs).c_str());
+      return InvalidArgument(
+          "Binary op %s with incompatible shapes: %s and %s.",
+          BinaryOperation_Name(operation).c_str(),
+          ShapeUtil::HumanString(lhs).c_str(),
+          ShapeUtil::HumanString(rhs).c_str());
     }
   }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
@@ -788,15 +797,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     // Reject "magic" inference for binops on different shapes, requiring
     // the user to provide an explicit broadcast dimension in this case.
     // See b/25177275 for more details.
-    return InvalidArgument("automatic shape inference not supported: %s and %s",
+    return InvalidArgument("Automatic shape inference not supported: %s and %s",
                            ShapeUtil::HumanString(smaller_shape).c_str(),
                            ShapeUtil::HumanString(larger_shape).c_str());
   } else if (broadcast_dimensions.size() != ShapeUtil::Rank(smaller_shape)) {
     return InvalidArgument(
-        "size of broadcast_dimensions has to match lower-rank operand's "
+        "Size of broadcast_dimensions has to match lower-rank operand's "
         "rank; "
         " lower-rank operand's rank is %lld, size of broadcast_dimensions is "
-        "%zu",
+        "%zu.",
         ShapeUtil::Rank(smaller_shape), broadcast_dimensions.size());
   }
 
@@ -846,13 +855,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     int64 dimension_to_match = broadcast_dimensions.at(i);
     if (dimension_to_match < 0) {
       return InvalidArgument(
-          "broadcast dimension number (%lld) cannot be negative",
+          "Broadcast dimension number (%lld) cannot be negative.",
           dimension_to_match);
     }
     if (dimension_to_match >= larger_shape.dimensions_size()) {
       return InvalidArgument(
-          "broadcast dimension number (%lld) too large; higher-rank "
-          "operand has rank %d",
+          "Broadcast dimension number (%lld) too large; higher-rank "
+          "operand has rank %d.",
           dimension_to_match, larger_shape.dimensions_size());
     }
     int64 small_dimension_size = smaller_shape.dimensions(i);
@@ -863,7 +872,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (small_dimension_size != large_dimension_size &&
         small_dimension_size != 1 && large_dimension_size != 1) {
       return InvalidArgument(
-          "broadcast dimension %d mismatch: %lld != %lld; %s and %s", i,
+          "Broadcast dimension %d mismatch: %lld != %lld; %s and %s.", i,
           small_dimension_size, large_dimension_size,
           ShapeUtil::HumanString(smaller_shape).c_str(),
           ShapeUtil::HumanString(larger_shape).c_str());
@@ -872,7 +881,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     // order.
     if (i > 0 && broadcast_dimensions.at(i - 1) >= dimension_to_match) {
       return InvalidArgument(
-          "broadcast dimensions order is wrong: %lld comes after %lld",
+          "Broadcast dimensions order is wrong: %lld comes after %lld.",
           dimension_to_match, broadcast_dimensions.at(i - 1));
     }
 
@@ -892,7 +901,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
-        "binary op %s with different element types: %s and %s",
+        "Binary op %s with different element types: %s and %s.",
         BinaryOperation_Name(operation).c_str(),
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str());
@@ -904,8 +913,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (!broadcast_dimensions.empty() &&
         broadcast_dimensions != identity_dims) {
       return InvalidArgument(
-          "broadcast dimensions field must either be not set or be the "
-          "identity on binary operations with operands of the same rank");
+          "Broadcast dimensions field must either be not set or be the "
+          "identity on binary operations with operands of the same rank.");
     }
   }
 
@@ -943,6 +952,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                             rhs->shape(), /*broadcast_dimensions=*/{});
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferBinaryOpShape(
+    HloOpcode opcode, const Shape& lhs, const Shape& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return InferBinaryOpShape(OpcodeToBinaryOperation(opcode), lhs, rhs,
+                            broadcast_dimensions);
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferBinaryOpShape(
     BinaryOperation operation, const Shape& lhs, const Shape& rhs,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
@@ -979,8 +995,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     case BINOP_COMPLEX: {
       if (!ShapeUtil::ElementIsFloating(lhs)) {
         return InvalidArgument(
-            "expected element type in shape to be floating for complex compose "
-            "operation; got %s",
+            "Expected element type in shape to be floating for complex compose "
+            "operation; got %s.",
             PrimitiveType_Name(lhs.element_type()).c_str());
       }
       TF_ASSIGN_OR_RETURN(const Shape& shape,
@@ -989,7 +1005,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       if (lhs.element_type() == F32 && rhs.element_type() == F32) {
         return ShapeUtil::ChangeElementType(shape, C64);
       } else {
-        return Unimplemented("complex component type not supported");
+        return Unimplemented("Complex component type is not implemented.");
       }
     }
     case BINOP_AND:
@@ -997,8 +1013,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       if (lhs.element_type() != PRED &&
           !primitive_util::IsIntegralType(lhs.element_type())) {
         return InvalidArgument(
-            "expected pred or integral type in argument to and/or operation; "
-            "got %s",
+            "Expected pred or integral type in argument to and/or operation; "
+            "got %s.",
             PrimitiveType_Name(lhs.element_type()).c_str());
       }
       return InferElementwiseBinaryOpShape(operation, lhs, rhs,
@@ -1016,7 +1032,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     }
     default:
       return Unimplemented(
-          "not yet implemented; infer binary op shape: %s; lhs: %s; rhs: %s",
+          "Binary op shape inference: %s; lhs: %s; rhs: %s is not implemented.",
           BinaryOperation_Name(operation).c_str(),
           lhs.ShortDebugString().c_str(), rhs.ShortDebugString().c_str());
   }
@@ -1025,8 +1041,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 /* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
     HloOpcode opcode, const HloInstruction* lhs, const HloInstruction* rhs,
     const HloInstruction* ehs) {
-  return InferTernaryOpShape(OpcodeToTernaryOperation(opcode), lhs->shape(),
-                             rhs->shape(), ehs->shape());
+  return InferTernaryOpShape(opcode, lhs->shape(), rhs->shape(), ehs->shape());
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
+    HloOpcode opcode, const Shape& lhs, const Shape& rhs, const Shape& ehs) {
+  return InferTernaryOpShape(OpcodeToTernaryOperation(opcode), lhs, rhs, ehs);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
@@ -1041,7 +1061,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     case TRIOP_SELECT:
       return InferSelectShape(lhs, rhs, ehs);
     default:
-      return InvalidArgument("unknown operation %s",
+      return InvalidArgument("Unknown operation %s.",
                              TernaryOperation_Name(operation).c_str());
   }
 }
@@ -1053,6 +1073,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (const HloInstruction* operand : operands) {
     operand_shapes.push_back(&operand->shape());
   }
+  return InferVariadicOpShape(opcode, operand_shapes);
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferVariadicOpShape(
+    HloOpcode opcode,
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
   return InferVariadicOpShape(OpcodeToVariadicOperation(opcode),
                               operand_shapes);
 }
@@ -1072,7 +1098,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       return result;
     }
     default:
-      return InvalidArgument("unknown operation %s",
+      return InvalidArgument("Unknown operation %s.",
                              VariadicOperation_Name(operation).c_str());
   }
 }
@@ -1082,7 +1108,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const ProgramShape& to_apply,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
   if (arg_shapes.empty()) {
-    return InvalidArgument("Map expects at least one argument");
+    return InvalidArgument("Map expects at least one argument.");
   }
 
   // All arguments must have the same shape.
@@ -1113,7 +1139,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     }
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
-        "%s",
+        "%s.",
         Join(pieces, ", ").c_str());
   }
 
@@ -1122,7 +1148,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (dimensions.size() != arg_shape->dimensions_size()) {
     return InvalidArgument(
         "Map applied to a subset of dimensions currently not supported: "
-        "arg_dimension_size: %d, requested_map_dimensions_size: %zu",
+        "arg_dimension_size: %d, requested_map_dimensions_size: %zu.",
         arg_shape->dimensions_size(), dimensions.size());
   }
 
@@ -1130,7 +1156,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int i = 0; i < dimensions.size(); ++i) {
     if (dimensions[i] != i) {
       return InvalidArgument(
-          "Map requires monotonically increasing dimension numbers, found: %s ",
+          "Map requires monotonically increasing dimension numbers; got: %s.",
           Join(dimensions, ", ").c_str());
     }
   }
@@ -1139,7 +1165,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (arg_shapes.size() != to_apply.parameters_size()) {
     return InvalidArgument(
         "Map applied function arity must match number of arguments; got: "
-        "arity: %d, arguments: %zu",
+        "arity: %d, arguments: %zu.",
         to_apply.parameters_size(), arg_shapes.size());
   }
 
@@ -1147,8 +1173,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   const Shape& output_shape = to_apply.result();
   if (!ShapeUtil::IsScalar(output_shape)) {
     return InvalidArgument(
-        "mapped computation's result has to be a scalar; "
-        "got: %s",
+        "Mapped computation's result has to be a scalar; got: %s.",
         ShapeUtil::HumanString(output_shape).c_str());
   }
 
@@ -1157,16 +1182,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
     if (!ShapeUtil::IsScalar(parameter_shape)) {
       return InvalidArgument(
-          "mapped computation's parameter has to be a scalar; "
-          "got parameter %d shape: %s",
+          "Mapped computation's parameter has to be a scalar; "
+          "got parameter %d shape: %s.",
           i, ShapeUtil::HumanString(parameter_shape).c_str());
     }
 
     if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(parameter_shape,
                                                        *arg_shape)) {
       return InvalidArgument(
-          "mapped computation's parameter type has to match argument element "
-          "type; got parameter %d shape: %s, argument shape: %s",
+          "Mapped computation's parameter type has to match argument element "
+          "type; got parameter %d shape: %s, argument shape: %s.",
           i, ShapeUtil::HumanString(parameter_shape).c_str(),
           ShapeUtil::HumanString(*arg_shape).c_str());
     }
@@ -1197,21 +1222,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Expected feature_index of batch-norm-training to be "
         "smaller than the rank of operand_shape; "
-        "got feature_index %lld, and rank %lld",
+        "got feature_index %lld, and rank %lld.",
         feature_index, ShapeUtil::Rank(operand_shape));
   }
 
   if (feature_index < 0) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-training to "
-        "be a non-negative number, got %lld",
+        "be a non-negative number, got %lld.",
         feature_index);
   }
 
   if (ShapeUtil::Rank(operand_shape) < 1) {
     return InvalidArgument(
         "Expected the rank of operand to "
-        "batch-norm-training to be at least 1; got %lld",
+        "batch-norm-training to be at least 1; got %lld.",
         ShapeUtil::Rank(operand_shape));
   }
 
@@ -1232,7 +1257,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
     return InvalidArgument(
         "The operand to batch-norm-training must have a floating point "
-        "element type, but the shape is %s",
+        "element type, but the shape is %s.",
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
@@ -1241,7 +1266,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-training, "
         "but the shape of offset factor is %s "
-        "and the shape of operand is %s",
+        "and the shape of operand is %s.",
         PrimitiveType_Name(offset_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1251,7 +1276,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-training, "
         "but the shape of scale factor is %s "
-        "and the shape of operand is %s",
+        "and the shape of operand is %s.",
         PrimitiveType_Name(scale_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1264,7 +1289,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of offset factor should be the same as feature count,"
         "but the size of offset factor is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(offset_shape, 0), feature_count);
   }
 
@@ -1272,7 +1297,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of scale factor should be the same as feature count,"
         "but the size of scale factor is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(scale_shape, 0), feature_count);
   }
 
@@ -1307,21 +1332,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Expected feature_index of batch-norm-inference to be "
         "smaller than the rank of operand_shape; "
-        "got feature_index %lld, and rank %lld",
+        "got feature_index %lld, and rank %lld.",
         feature_index, ShapeUtil::Rank(operand_shape));
   }
 
   if (feature_index < 0) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-inference to "
-        "be a non-negative number, got %lld",
+        "be a non-negative number, got %lld.",
         feature_index);
   }
 
   if (ShapeUtil::Rank(operand_shape) < 1) {
     return InvalidArgument(
         "Expected the rank of operand to "
-        "batch-norm-inference to be at least 1; got %lld",
+        "batch-norm-inference to be at least 1; got %lld.",
         ShapeUtil::Rank(operand_shape));
   }
 
@@ -1342,7 +1367,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
     return InvalidArgument(
         "The operand to batch-norm-inference must have a floating point "
-        "element type, but the shape is %s",
+        "element type, but the shape is %s.",
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
@@ -1352,7 +1377,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
         "but the shape of offset factor is %s "
-        "and the shape of operand is %s",
+        "and the shape of operand is %s.",
         PrimitiveType_Name(offset_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1363,7 +1388,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
         "but the shape of scale factor is %s "
-        "and the shape of operand is %s",
+        "and the shape of operand is %s.",
         PrimitiveType_Name(scale_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1374,7 +1399,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
         "but the shape of mean is %s "
-        "and the shape of operand is %s",
+        "and the shape of operand is %s.",
         PrimitiveType_Name(mean_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1385,7 +1410,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
         "but the shape of variance is %s "
-        "and the shape of operand is %s",
+        "and the shape of operand is %s.",
         PrimitiveType_Name(mean_shape.element_type()).c_str(),
         PrimitiveType_Name(variance_shape.element_type()).c_str());
   }
@@ -1398,7 +1423,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of offset factor should be the same as feature count,"
         "but the size of offset factor is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(offset_shape, 0), feature_count);
   }
 
@@ -1406,7 +1431,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of scale factor should be the same as feature count,"
         "but the size of scale factor is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(scale_shape, 0), feature_count);
   }
 
@@ -1414,7 +1439,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of mean should be the same as feature count,"
         "but the size of mean is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(mean_shape, 0), feature_count);
   }
 
@@ -1422,7 +1447,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of variance should be the same as feature count,"
         "but the size of variance is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(variance_shape, 0), feature_count);
   }
 
@@ -1455,7 +1480,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Expected feature_index of batch-norm-grad to be "
         "smaller than the rank of operand_shape; "
-        "got feature_index %lld, and rank %lld",
+        "got feature_index %lld, and rank %lld.",
         feature_index, ShapeUtil::Rank(operand_shape));
   }
 
@@ -1463,7 +1488,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Expected operand_shape of batch-norm-grad to have the same rank as"
         " output_grad_shape; got rank(oprand_shape) %lld, and"
-        " rank(output_grad_shape) %lld",
+        " rank(output_grad_shape) %lld.",
         ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(output_grad_shape));
   }
 
@@ -1491,14 +1516,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
     return InvalidArgument(
         "The operand to batch-norm-grad must have a floating point "
-        "element type, but the shape is %s",
+        "element type, but the shape is %s.",
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
   if (!ShapeUtil::ElementIsFloating(output_grad_shape)) {
     return InvalidArgument(
         "The output_grad to batch-norm-grad must have a floating point "
-        "element type, but the shape is %s",
+        "element type, but the shape is %s.",
         PrimitiveType_Name(output_grad_shape.element_type()).c_str());
   }
 
@@ -1507,7 +1532,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of output_grad is %s "
-        "and the element type of operand is %s",
+        "and the element type of operand is %s.",
         PrimitiveType_Name(output_grad_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1517,7 +1542,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of scale factor is %s "
-        "and the element type of operand is %s",
+        "and the element type of operand is %s.",
         PrimitiveType_Name(scale_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1527,7 +1552,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of mean is %s "
-        "and the element type of operand is %s",
+        "and the element type of operand is %s.",
         PrimitiveType_Name(mean_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1537,7 +1562,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of mean is %s "
-        "and the element type of operand is %s",
+        "and the element type of operand is %s.",
         PrimitiveType_Name(mean_shape.element_type()).c_str(),
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
@@ -1551,7 +1576,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of mean should be the same as feature count,"
         "but the size of offset factor is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(mean_shape, 0), feature_count);
   }
 
@@ -1559,7 +1584,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of scale factor should be the same as feature count,"
         "but the size of scale factor is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(scale_shape, 0), feature_count);
   }
 
@@ -1567,7 +1592,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "The size of variance should be the same as feature count,"
         "but the size of variance is %lld "
-        "and the feature count is %lld",
+        "and the feature count is %lld.",
         ShapeUtil::GetDimension(var_shape, 0), feature_count);
   }
 
@@ -1578,7 +1603,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       return InvalidArgument(
           "The bounds of operand shape should be the same as output_grad's,"
           "but the bound of operand_shape at dimension %lld is %lld "
-          "and the bound of output_grad_shape is %lld",
+          "and the bound of output_grad_shape is %lld.",
           i, ShapeUtil::GetDimension(operand_shape, i),
           ShapeUtil::GetDimension(output_grad_shape, i));
     }
@@ -1596,7 +1621,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
-        "Convolution with different element types: %s and %s",
+        "Convolution with different element types: %s and %s.",
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str());
   }
@@ -1612,21 +1637,19 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (window.dimensions_size() != num_spatial_dims) {
     return InvalidArgument(
         "Window must have same number of dimensions as dimension numbers.\n"
-        "Window: %s\nDimension numbers: %s",
+        "Window: %s\nDimension numbers: %s.",
         window.DebugString().c_str(), dnums.DebugString().c_str());
   }
 
   const int num_dims = num_spatial_dims + 2;
   if (ShapeUtil::Rank(lhs) != num_dims) {
     return InvalidArgument(
-        "The LHS argument to a convolution should have rank %d.\n"
-        "lhs: %s",
+        "The LHS argument to a convolution should have rank %d; lhs: %s.",
         num_dims, ShapeUtil::HumanString(lhs).c_str());
   }
   if (ShapeUtil::Rank(rhs) != num_dims) {
     return InvalidArgument(
-        "The RHS argument to a convolution should have rank %d.\n"
-        "lhs: %s",
+        "The RHS argument to a convolution should have rank %d; lhs: %s.",
         num_dims, ShapeUtil::HumanString(lhs).c_str());
   }
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
@@ -1663,26 +1686,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       !std::all_of(window_dnums.begin(), window_dnums.end(), in_range) ||
       !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) {
     return InvalidArgument(
-        "A dimension number is out of range in convolution: %s",
+        "A dimension number is out of range in convolution: %s.",
         dnums.DebugString().c_str());
   }
 
   if (input_dnums != expected_dnums) {
     return InvalidArgument(
         "Input dimensions of convolution must contain each dimension exactly "
-        "once: %s",
+        "once: %s.",
         dnums.DebugString().c_str());
   }
   if (window_dnums != expected_dnums) {
     return InvalidArgument(
         "Window dimensions of convolution must contain each dimension exactly "
-        "once: %s",
+        "once: %s.",
         dnums.DebugString().c_str());
   }
   if (output_dnums != expected_dnums) {
     return InvalidArgument(
         "Output dimensions of convolution must contain each dimension exactly "
-        "once: %s",
+        "once: %s.",
         dnums.DebugString().c_str());
   }
 
@@ -1706,7 +1729,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Expected LHS feature dimension (value %lld) to match RHS "
         "input feature dimension (value %lld); got <conv>(%s, %s)\n"
-        "Dimension numbers: {%s}",
+        "Dimension numbers: {%s}.",
         input_features, kernel_input_features,
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str(), dnums.DebugString().c_str());
@@ -1720,7 +1743,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "Window dimensions do not match RHS shape:\n\t"
         "RHS shape: %s\n\t"
         "Window: {%s}\n\t"
-        "Dimension numbers: {%s}",
+        "Dimension numbers: {%s}.",
         ShapeUtil::HumanString(rhs).c_str(), window.ShortDebugString().c_str(),
         dnums.ShortDebugString().c_str());
   }
@@ -1748,8 +1771,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const tensorflow::gtl::ArraySlice<int64> fft_length) {
   const int64 fft_rank = fft_length.size();
   if (fft_rank < 1 || fft_rank > 3) {
-    return InvalidArgument("FFT only supports ranks 1-3, but got %lld",
-                           fft_rank);
+    return InvalidArgument("FFT only supports ranks 1-3; got %lld.", fft_rank);
   }
 #define RET_CHECK_RANK(x)                              \
   if (x.dimensions_size() < fft_rank) {                \
@@ -1762,7 +1784,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     case FFT:
     case IFFT:
       if (in.element_type() != C64) {
-        return InvalidArgument("%s requires C64 input type, found %s",
+        return InvalidArgument("%s requires C64 input type, found %s.",
                                FftType_Name(fft_type).c_str(),
                                PrimitiveType_Name(in.element_type()).c_str());
       }
@@ -1770,7 +1792,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       return in;
     case RFFT: {
       if (in.element_type() != F32) {
-        return InvalidArgument("RFFT requires F32 input type, found %s",
+        return InvalidArgument("RFFT requires F32 input type, found %s.",
                                PrimitiveType_Name(in.element_type()).c_str());
       }
       RET_CHECK_RANK(in);
@@ -1779,7 +1801,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
             fft_length[i]) {
           return InvalidArgument(
               "RFFT requires innermost dimensions match fft_length but "
-              "dimension %lld is %lld and should be %lld",
+              "dimension %lld is %lld and should be %lld.",
               in.dimensions_size() - fft_rank + i,
               in.dimensions(in.dimensions_size() - fft_rank + i),
               fft_length[i]);
@@ -1792,7 +1814,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     }
     case IRFFT: {
       if (in.element_type() != C64) {
-        return InvalidArgument("IRFFT requires C64 input type, found %s",
+        return InvalidArgument("IRFFT requires C64 input type, found %s.",
                                PrimitiveType_Name(in.element_type()).c_str());
       }
       RET_CHECK_RANK(in);
@@ -1802,7 +1824,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
             fft_length[i]) {
           return InvalidArgument(
               "IRFFT requires all but one innermost dimensions match "
-              "fft_length, but dimension %lld is %lld and should be %lld",
+              "fft_length, but dimension %lld is %lld and should be %lld.",
               in.dimensions_size() - fft_rank + i,
               in.dimensions(in.dimensions_size() - fft_rank + i),
               fft_length[i]);
@@ -1812,7 +1834,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
           fft_length[fft_rank - 1] / 2 + 1) {
         return InvalidArgument(
             "IRFFT requires innermost dimension matches fft_length/2+1, but "
-            "dimension %d is %lld and should be %lld",
+            "dimension %d is %lld and should be %lld.",
             in.dimensions_size() - 1, in.dimensions(in.dimensions_size() - 1),
             fft_length[fft_rank - 1] / 2 + 1);
       }
@@ -1850,8 +1872,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int64 dimension : dimensions_to_reduce) {
     if (dimension >= ShapeUtil::Rank(arg) || dimension < 0) {
       return InvalidArgument(
-          "attempting to reduce out-of-bounds dimension %lld in shape %s",
-          dimension, ShapeUtil::HumanString(arg).c_str());
+          "Reducing out-of-bounds dimension %lld in shape %s.", dimension,
+          ShapeUtil::HumanString(arg).c_str());
     }
   }
   TF_RETURN_IF_ERROR(
@@ -1891,30 +1913,30 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // Check if the select function has a proper shape of (T,T) -> PRED.
   if (select_shape.parameters_size() != 2) {
     return InvalidArgument(
-        "select function must take 2 parameters, but "
+        "Select function must take 2 parameters, but "
         "takes %d parameter(s).",
         select_shape.parameters_size());
   }
   const Shape& select_result_shape = select_shape.result();
   if (!ShapeUtil::Compatible(select_result_shape,
                              ShapeUtil::MakeShape(PRED, {}))) {
-    return Unimplemented("select function must have rank-0 PRED result.");
+    return InvalidArgument("Select function must have rank-0 PRED result.");
   }
   const Shape& operand_element_shape =
       ShapeUtil::MakeShape(operand_shape.element_type(), {});
   if (!ShapeUtil::CompatibleIgnoringFpPrecision(operand_element_shape,
                                                 select_shape.parameters(0))) {
     return InvalidArgument(
-        "select function's first parameter shape currently must "
-        "match the operand element shape. Got %s vs %s",
+        "Select function's first parameter shape currently must "
+        "match the operand element shape, but got %s vs %s.",
         ShapeUtil::HumanString(select_shape.parameters(0)).c_str(),
         ShapeUtil::HumanString(operand_element_shape).c_str());
   }
   if (!ShapeUtil::CompatibleIgnoringFpPrecision(operand_element_shape,
                                                 select_shape.parameters(1))) {
     return InvalidArgument(
-        "select function's second parameter shape currently must "
-        "match the operand element shape. Got %s vs %s",
+        "Select function's second parameter shape currently must "
+        "match the operand element shape, but got %s vs %s.",
         ShapeUtil::HumanString(select_shape.parameters(1)).c_str(),
         ShapeUtil::HumanString(operand_element_shape).c_str());
   }
@@ -1931,8 +1953,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (!ShapeUtil::CompatibleIgnoringFpPrecision(source_shape,
                                                 window_result_shape)) {
     return InvalidArgument(
-        "source shape does not match the shape of window-reduced operand: "
-        "source(%s), window-reduced operand(%s)",
+        "Source shape does not match the shape of window-reduced operand: "
+        "source(%s), window-reduced operand(%s).",
         ShapeUtil::HumanString(source_shape).c_str(),
         ShapeUtil::HumanString(window_result_shape).c_str());
   }
@@ -1946,7 +1968,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   auto error = [&](const string& message) {
     return InvalidArgument(
         "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
-        "{%s}; strides: {%s}",
+        "{%s}; strides: {%s}.",
         message.c_str(), ShapeUtil::HumanString(arg).c_str(),
         Join(starts, ",").c_str(), Join(limits, ",").c_str(),
         Join(strides, ",").c_str());
@@ -1969,7 +1991,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (starts.size() != ShapeUtil::Rank(arg)) {
     return InvalidArgument(
-        "slice index count does not match argument rank: %zu vs %lld",
+        "Slice index count does not match argument rank: %zu vs %lld.",
         starts.size(), ShapeUtil::Rank(arg));
   }
 
@@ -1979,7 +2001,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     int64 limit_index = limits[dimension];
     int64 stride = strides[dimension];
     if (start_index < 0) {
-      return InvalidArgument("negative start index to slice: %lld",
+      return InvalidArgument("Negative start index to slice: %lld.",
                              start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
@@ -1999,7 +2021,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                  limit_index, start_index));
     }
     if (stride <= 0) {
-      return InvalidArgument("stride (%lld) must be positive", stride);
+      return InvalidArgument("Stride (%lld) must be positive.", stride);
     }
     sizes.push_back((limit_index - start_index + stride - 1) / stride);
   }
@@ -2023,20 +2045,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
-        "dynamic slice start indices of rank %lld must be rank1.",
+        "Dynamic slice start indices of rank %lld must be rank1.",
         ShapeUtil::Rank(start_indices_shape));
   }
 
   if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
     return InvalidArgument(
-        "dynamic slice start indices must be of integral type.");
+        "Dynamic slice start indices must be of integral type.");
   }
 
   const int64 start_num_dims = start_indices_shape.dimensions(0);
   if (ShapeUtil::Rank(operand_shape) != start_num_dims) {
     return InvalidArgument(
-        "dynamic slice start number of dimensions %lld (%s) must match rank "
-        "%lld of slice input (%s)",
+        "Dynamic slice start number of dimensions %lld (%s) must match rank "
+        "%lld of slice input (%s).",
         start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(),
         ShapeUtil::Rank(operand_shape),
         ShapeUtil::HumanString(operand_shape).c_str());
@@ -2044,7 +2066,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (slice_sizes.size() != ShapeUtil::Rank(operand_shape)) {
     return InvalidArgument(
-        "dynamic slice index count does not match argument rank: %zu vs %lld",
+        "Dynamic slice index count does not match argument rank: %zu vs %lld.",
         slice_sizes.size(), ShapeUtil::Rank(operand_shape));
   }
 
@@ -2052,12 +2074,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const int64 input_dim_size = operand_shape.dimensions(dim);
     const int64 slice_dim_size = slice_sizes[dim];
     if (slice_dim_size < 0) {
-      return InvalidArgument("negative size index to dynamic slice: %lld",
+      return InvalidArgument("Negative size index to dynamic slice: %lld.",
                              slice_dim_size);
     }
     if (slice_dim_size > input_dim_size) {
       return InvalidArgument(
-          "slice dim size %lld greater than dynamic slice dimension: %lld",
+          "Slice dim size %lld greater than dynamic slice dimension: %lld.",
           slice_dim_size, input_dim_size);
     }
     VLOG(2) << tensorflow::strings::Printf("slice_sizes[%lld] = %lld", dim,
@@ -2086,20 +2108,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
-        "dynamic update slice start indices of rank %lld must be rank1.",
+        "Dynamic update slice start indices of rank %lld must be rank1.",
         ShapeUtil::Rank(start_indices_shape));
   }
 
   if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
     return InvalidArgument(
-        "dynamic update slice start indices must be of integral type.");
+        "Dynamic update slice start indices must be of integral type.");
   }
 
   const int64 start_num_dims = start_indices_shape.dimensions(0);
   if (ShapeUtil::Rank(operand_shape) != start_num_dims) {
     return InvalidArgument(
-        "dynamic slice start number of dimensions %lld (%s) must match rank "
-        "%lld of slice input (%s)",
+        "Dynamic update slice start number of dimensions %lld (%s) must match "
+        "rank %lld of slice input (%s).",
         start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(),
         ShapeUtil::Rank(operand_shape),
         ShapeUtil::HumanString(operand_shape).c_str());
@@ -2107,16 +2129,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (ShapeUtil::Rank(update_shape) != ShapeUtil::Rank(operand_shape)) {
     return InvalidArgument(
-        "dynamic update slice update rank does not match argument rank: "
-        "%lld vs %lld",
+        "Dynamic update slice update rank does not match argument rank: "
+        "%lld vs %lld.",
         ShapeUtil::Rank(update_shape), ShapeUtil::Rank(operand_shape));
   }
 
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape,
                                                      update_shape)) {
     return InvalidArgument(
-        "dynamic update slice update element type does not match argument. "
-        "operand.element_type: %s vs update.element_type: %s",
+        "Dynamic update slice update element type does not match argument. "
+        "operand.element_type: %s vs update.element_type: %s.",
         PrimitiveType_Name(operand_shape.element_type()).c_str(),
         PrimitiveType_Name(update_shape.element_type()).c_str());
   }
@@ -2126,12 +2148,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const int64 update_dim_size = update_shape.dimensions(dim);
     if (update_dim_size < 0) {
       return InvalidArgument(
-          "size index %lld to dynamic update slice must be >= 0",
+          "Size index %lld to dynamic update slice must be >= 0.",
           update_dim_size);
     }
     if (update_dim_size > input_dim_size) {
       return InvalidArgument(
-          "update dim size %lld greater than dynamic slice dimension: %lld",
+          "Update dim size %lld greater than dynamic slice dimension: %lld.",
           update_dim_size, input_dim_size);
     }
     VLOG(2) << tensorflow::strings::Printf("update_sizes[%lld] = %lld", dim,
@@ -2151,7 +2173,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int64 dimension : dimensions) {
     if (dimension >= ShapeUtil::Rank(operand_shape) || dimension < 0) {
       return InvalidArgument(
-          "one of the reverse dimensions (%lld) is out-of-bounds in shape %s",
+          "One of the reverse dimensions (%lld) is out-of-bounds in shape %s.",
           dimension, ShapeUtil::HumanString(operand_shape).c_str());
     }
   }
@@ -2162,14 +2184,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, int64 index) {
   if (!ShapeUtil::IsTuple(arg)) {
     return InvalidArgument(
-        "cannot infer shape: attempting to index into non-tuple: %s",
+        "Cannot infer shape: attempting to index into non-tuple: %s.",
         ShapeUtil::HumanString(arg).c_str());
   }
 
   if (index >= arg.tuple_shapes_size()) {
     return InvalidArgument(
-        "cannot infer shape: attempt to index out of tuple bounds: %lld "
-        ">= %d in shape %s",
+        "Cannot infer shape: attempt to index out of tuple bounds: %lld "
+        ">= %d in shape %s.",
         index, arg.tuple_shapes_size(), ShapeUtil::HumanString(arg).c_str());
   }
 
@@ -2181,17 +2203,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& init) {
   // Check the number of parameters for given computations.
   if (condition.parameters_size() != 1) {
-    return InvalidArgument("condition must take 1 arguments; got %d",
+    return InvalidArgument("Condition must take 1 arguments; got %d.",
                            condition.parameters_size());
   }
   if (body.parameters_size() != 1) {
-    return InvalidArgument("body must take 1 arguments; got %d",
+    return InvalidArgument("Body must take 1 arguments; got %d.",
                            body.parameters_size());
   }
 
   auto shape_string = [&]() {
     return tensorflow::strings::Printf(
-        "condition: %s; body: %s; init: %s",
+        "Condition: %s; body: %s; init: %s.",
         ShapeUtil::HumanString(condition).c_str(),
         ShapeUtil::HumanString(body).c_str(),
         ShapeUtil::HumanString(init).c_str());
@@ -2199,15 +2221,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   // Check the shapes of computation parameters and return types.
   if (!ShapeUtil::ShapeIs(condition.result(), PRED, {})) {
-    return InvalidArgument("condition must return a boolean; got %s",
+    return InvalidArgument("Condition must return a boolean; got %s.",
                            shape_string().c_str());
   }
   if (!ShapeUtil::Compatible(body.result(), condition.parameters(0)) ||
       !ShapeUtil::Compatible(body.result(), body.parameters(0)) ||
       !ShapeUtil::Compatible(body.result(), init)) {
     return InvalidArgument(
-        "the parameter of condition and body, the result of the body, and init "
-        "must all have the same shape; got %s",
+        "The parameter of condition and body, the result of the body, and init "
+        "must all have the same shape; got %s.",
         shape_string().c_str());
   }
 
@@ -2219,7 +2241,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& false_operand, const ProgramShape& true_computation,
     const ProgramShape& false_computation) {
   if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
-    return InvalidArgument("predicate must be a boolean; got %s.",
+    return InvalidArgument("Predicate must be a boolean; got %s.",
                            ShapeUtil::HumanString(predicate).c_str());
   }
 
@@ -2302,8 +2324,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
   if (ShapeUtil::ElementsIn(operand) != ShapeUtil::ElementsIn(inferred_shape)) {
     return InvalidArgument(
-        "reshape operation has mismatched element counts: from=%lld (%s) "
-        "to=%lld (%s)",
+        "Reshape operation has mismatched element counts: from=%lld (%s) "
+        "to=%lld (%s).",
         ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand).c_str(),
         ShapeUtil::ElementsIn(inferred_shape),
         ShapeUtil::HumanString(inferred_shape).c_str());
@@ -2351,7 +2373,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(max, "clamp max"));
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(min, operand) ||
       !ShapeUtil::SameElementTypeIgnoringFpPrecision(max, operand)) {
-    return InvalidArgument("clamp op with different operand types: %s, %s, %s",
+    return InvalidArgument("Clamp with different operand types: %s, %s, %s.",
                            ShapeUtil::HumanString(min).c_str(),
                            ShapeUtil::HumanString(operand).c_str(),
                            ShapeUtil::HumanString(max).c_str());
@@ -2372,7 +2394,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     }
   }
   return Unimplemented(
-      "not yet implemented: %s, %s <clamp> %s", min.ShortDebugString().c_str(),
+      "%s, %s <clamp> %s is not implemented.", min.ShortDebugString().c_str(),
       max.ShortDebugString().c_str(), operand.ShortDebugString().c_str());
 }
 
@@ -2391,25 +2413,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   }
   if (!compatible) {
     return InvalidArgument(
-        "operands to select must be the same shape; got %s and %s",
+        "Operands to select must be the same shape; got %s and %s.",
         ShapeUtil::HumanString(on_true).c_str(),
         ShapeUtil::HumanString(on_false).c_str());
   }
   if (pred.element_type() != PRED) {
     return InvalidArgument(
-        "select's pred operand must have PRED element type; got %s",
+        "Select's pred operand must have PRED element type; got %s.",
         ShapeUtil::HumanString(pred).c_str());
   }
-  if (ShapeUtil::SameDimensions(pred, on_true) || ShapeUtil::Rank(pred) == 0) {
+  if (ShapeUtil::CompatibleIgnoringElementType(pred, on_true) ||
+      ShapeUtil::Rank(pred) == 0) {
     // By this stage we know that pred's element type is PRED. Therefore, this
     // check restricts pred to be a PRED scalar, or a PRED array with the same
     // dimensions as on_true and on_false.
     return ShapeUtil::ChangeElementType(
         on_true, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
   } else {
-    return Unimplemented(
-        "select operation with non-scalar predicate with dimensionality "
-        " different from the other operands: %s",
+    return InvalidArgument(
+        "Select operation with non-scalar predicate with dimensionality "
+        " different from the other operands: %s.",
         ShapeUtil::HumanString(pred).c_str());
   }
 }
@@ -2427,7 +2450,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Call applied function arity must match number of arguments; got: "
         "arity: %d, arguments: %zu; computation signature: %s; argument "
-        "shapes: [%s]",
+        "shapes: [%s].",
         to_apply.parameters_size(), arg_shapes.size(),
         computation_signature.c_str(), argument_shapes.c_str());
   }
@@ -2439,7 +2462,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (!ShapeUtil::Compatible(arg_shape, param_shape)) {
       return InvalidArgument(
           "Call parameter must match argument; got parameter %d shape: %s, "
-          "argument shape: %s",
+          "argument shape: %s.",
           i, ShapeUtil::HumanString(param_shape).c_str(),
           ShapeUtil::HumanString(arg_shape).c_str());
     }
@@ -2454,40 +2477,40 @@ static Status ValidateGatherDimensionNumbers(
     const GatherDimensionNumbers& dim_numbers) {
   if (!c_is_sorted(dim_numbers.output_window_dims())) {
     return InvalidArgument(
-        "Output window dimensions in gather op must be ascending; got: %s",
+        "Output window dimensions in gather op must be ascending; got: %s.",
         Join(dim_numbers.output_window_dims(), ", ").c_str());
   }
 
   if (c_adjacent_find(dim_numbers.output_window_dims()) !=
       dim_numbers.output_window_dims().end()) {
     return InvalidArgument(
-        "Output window dimensions in gather op must not repeat; got: %s",
+        "Output window dimensions in gather op must not repeat; got: %s.",
         Join(dim_numbers.output_window_dims(), ", ").c_str());
   }
 
   const int64 output_window_dim_count = dim_numbers.output_window_dims_size();
   const int64 output_shape_rank =
-      output_window_dim_count + gather_indices_shape.size();
+      output_window_dim_count + gather_indices_shape.size() - 1;
 
   for (int i = 0; i < dim_numbers.output_window_dims_size(); ++i) {
     int64 window_index = dim_numbers.output_window_dims(i);
     if (window_index < 0 || window_index >= output_shape_rank) {
       return InvalidArgument(
           "Window index %d in gather op is out of bounds; got %lld, but should "
-          "have been in"
-          "[0,%lld)",
+          "have been in [0,%lld).",
           i, window_index, output_shape_rank);
     }
   }
 
   if (dim_numbers.gather_dims_to_operand_dims_size() !=
-      gather_indices_shape.back()) {
+      gather_indices_shape[dim_numbers.index_vector_dim()]) {
     return InvalidArgument(
-        "There must be exactly as many elements in gather_dims_to_operand_dims "
-        "as there are elements in the last dimension of %%gather_indices; got: "
-        "%d, expected %lld",
+        "Gather op has %d elements in gather_dims_to_operand_dims and the "
+        "bound of dimension index_vector_dim=%lld of gather_indices is "
+        "%lld. These two numbers must be equal.",
         dim_numbers.gather_dims_to_operand_dims_size(),
-        gather_indices_shape.back());
+        dim_numbers.index_vector_dim(),
+        gather_indices_shape[dim_numbers.index_vector_dim()]);
   }
 
   for (int i = 0; i < dim_numbers.gather_dims_to_operand_dims_size(); i++) {
@@ -2496,7 +2519,7 @@ static Status ValidateGatherDimensionNumbers(
         gather_dim_to_input_dim >= input_shape.dimensions_size()) {
       return InvalidArgument(
           "Invalid gather_dims_to_operand_dims mapping; domain is [0, %d), "
-          "got: %d->%lld",
+          "got: %d->%lld.",
           input_shape.dimensions_size(), i, gather_dim_to_input_dim);
     }
   }
@@ -2511,7 +2534,7 @@ static Status ValidateGatherDimensionNumbers(
       sorted_gather_dims_to_operand_dims.end()) {
     return InvalidArgument(
         "Repeated dimensions are not allowed in gather_dims_to_operand_dims; "
-        "got: %s",
+        "got: %s.",
         Join(dim_numbers.gather_dims_to_operand_dims(), ", ").c_str());
   }
 
@@ -2519,7 +2542,7 @@ static Status ValidateGatherDimensionNumbers(
     if (elided_dim < 0 || elided_dim >= input_shape.dimensions_size()) {
       return InvalidArgument(
           "Invalid elided_window_dims set in gather op; valid range is [0, "
-          "%d), got: %lld",
+          "%d), got: %lld.",
           input_shape.dimensions_size(), elided_dim);
     }
   }
@@ -2534,7 +2557,7 @@ static Status ValidateGatherDimensionNumbers(
       dim_numbers.elided_window_dims().end()) {
     return InvalidArgument(
         "Repeated dimensions not allowed in elided_window_dims in gather op; "
-        "got: %s",
+        "got: %s.",
         Join(dim_numbers.elided_window_dims(), ", ").c_str());
   }
 
@@ -2550,24 +2573,33 @@ static Status ValidateGatherDimensionNumbers(
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
       gather_indices_shape, "gather indices operand of gather op"));
 
-  if (gather_indices_shape.dimensions_size() < 1) {
+  if (!ShapeUtil::ElementIsIntegral(gather_indices_shape)) {
     return InvalidArgument(
-        "Gather indices parameter must at least of rank 1; got %s",
+        "Gather indices parameter must be an integral tensor; got %s.",
         ShapeUtil::HumanString(gather_indices_shape).c_str());
   }
 
-  if (!ShapeUtil::ElementIsIntegral(gather_indices_shape)) {
+  // We implicitly reshape gather indices of shape P[A,B,C] to P[A,B,C,1] if
+  // index_vector_dim is rank(P).  The bounds of this expanded shape is
+  // stored in expanded_gather_indices_shape.
+
+  if (gather_indices_shape.dimensions_size() <
+          gather_dim_numbers.index_vector_dim() ||
+      gather_dim_numbers.index_vector_dim() < 0) {
     return InvalidArgument(
-        "Gather indices parameter must be an integral tensor; got %s",
-        ShapeUtil::HumanString(gather_indices_shape).c_str());
+        "Gather index leaf dimension must be within [0, rank(gather_indices) + "
+        "1). rank(gather_indices) is %d and gather index leaf dimension is "
+        "%lld.",
+        gather_indices_shape.dimensions_size(),
+        gather_dim_numbers.index_vector_dim());
   }
 
   std::vector<int64> expanded_gather_indices_shape;
-  // We implicitly reshape gather indices of shape P[N] to P[N,1].
   expanded_gather_indices_shape.reserve(gather_indices_shape.dimensions_size());
   c_copy(gather_indices_shape.dimensions(),
          std::back_inserter(expanded_gather_indices_shape));
-  if (expanded_gather_indices_shape.size() == 1) {
+  if (expanded_gather_indices_shape.size() ==
+      gather_dim_numbers.index_vector_dim()) {
     expanded_gather_indices_shape.push_back(1);
   }
 
@@ -2577,7 +2609,7 @@ static Status ValidateGatherDimensionNumbers(
   if (window_bounds.size() != input_shape.dimensions_size()) {
     return InvalidArgument(
         "Gather op must have one window bound for every input dimension; got: "
-        "len(window_bounds)=%lu, input_shape.rank=%d",
+        "len(window_bounds)=%lu, input_shape.rank=%d.",
         window_bounds.size(), input_shape.dimensions_size());
   }
 
@@ -2587,7 +2619,7 @@ static Status ValidateGatherDimensionNumbers(
     return InvalidArgument(
         "All components of the window index in a gather op must either be a "
         "output window index or explicitly elided; got len(window_bounds)=%lu, "
-        "output_window_bounds=%s, elided_window_bounds=%s",
+        "output_window_bounds=%s, elided_window_bounds=%s.",
         window_bounds.size(),
         Join(gather_dim_numbers.output_window_dims(), ",").c_str(),
         Join(gather_dim_numbers.elided_window_dims(), ",").c_str());
@@ -2600,7 +2632,7 @@ static Status ValidateGatherDimensionNumbers(
       return InvalidArgument(
           "Window bound at index %d in gather op is out of range, must be "
           "within "
-          "[0, %lld), got %lld",
+          "[0, %lld), got %lld.",
           i, corresponding_input_bound + 1, window_bound);
     }
   }
@@ -2609,7 +2641,7 @@ static Status ValidateGatherDimensionNumbers(
     if (window_bounds[gather_dim_numbers.elided_window_dims(i)] != 1) {
       return InvalidArgument(
           "Gather op can only elide window indices with bound 1, but bound is "
-          "%lld for index %lld at position %d",
+          "%lld for index %lld at position %d.",
           window_bounds[gather_dim_numbers.elided_window_dims(i)],
           gather_dim_numbers.elided_window_dims(i), i);
     }
@@ -2632,6 +2664,9 @@ static Status ValidateGatherDimensionNumbers(
       }
       current_bound = window_bounds[window_dims_seen++];
     } else {
+      if (gather_dims_seen == gather_dim_numbers.index_vector_dim()) {
+        gather_dims_seen++;
+      }
       current_bound = expanded_gather_indices_shape[gather_dims_seen++];
     }
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 0d3045213db2230da3e18ffcb1a9923250560b64..9da2c99b4177f08ece8daabaf2922ddd7e947a1b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -48,6 +48,8 @@ class ShapeInference {
   // given input shape.
   static StatusOr<Shape> InferUnaryOpShape(UnaryOperation operation,
                                            const Shape& arg);
+  static StatusOr<Shape> InferUnaryOpShape(HloOpcode opcode,
+                                           const Shape& shape);
   static StatusOr<Shape> InferUnaryOpShape(HloOpcode opcode,
                                            const HloInstruction* operand);
 
@@ -56,6 +58,9 @@ class ShapeInference {
   static StatusOr<Shape> InferBinaryOpShape(
       BinaryOperation operation, const Shape& lhs, const Shape& rhs,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  static StatusOr<Shape> InferBinaryOpShape(
+      HloOpcode opcode, const Shape& lhs, const Shape& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
   static StatusOr<Shape> InferBinaryOpShape(HloOpcode opcode,
                                             const HloInstruction* lhs,
                                             const HloInstruction* rhs);
@@ -65,6 +70,9 @@ class ShapeInference {
   static StatusOr<Shape> InferTernaryOpShape(TernaryOperation operation,
                                              const Shape& lhs, const Shape& rhs,
                                              const Shape& ehs);
+  static StatusOr<Shape> InferTernaryOpShape(HloOpcode opcode, const Shape& lhs,
+                                             const Shape& rhs,
+                                             const Shape& ehs);
   static StatusOr<Shape> InferTernaryOpShape(HloOpcode opcode,
                                              const HloInstruction* lhs,
                                              const HloInstruction* rhs,
@@ -75,6 +83,9 @@ class ShapeInference {
   static StatusOr<Shape> InferVariadicOpShape(
       VariadicOperation operation,
       tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
+  static StatusOr<Shape> InferVariadicOpShape(
+      HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
   static StatusOr<Shape> InferVariadicOpShape(
       HloOpcode opcode,
       tensorflow::gtl::ArraySlice<const HloInstruction*> operands);
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 7eb120843fd841d841048eeaefd895fde96d133c..0e61994a786b53a295ef9c9c2287b28fbf754d9b 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -135,7 +135,7 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) {
       TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_32_64_);
   ASSERT_FALSE(inferred_status_error1.ok());
   ASSERT_THAT(inferred_status_error1.status().error_message(),
-              HasSubstr("operands to select must be the same shape"));
+              HasSubstr("Operands to select must be the same shape"));
 
   auto inferred_status_error2 = ShapeInference::InferTernaryOpShape(
       TernaryOperation::TRIOP_SELECT, s32_, matrix_64_48_, matrix_64_48_);
@@ -340,7 +340,7 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSourceShape) {
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
   ASSERT_THAT(inferred_status_fail.status().error_message(),
-              HasSubstr("source shape does not match"));
+              HasSubstr("Source shape does not match"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) {
@@ -351,7 +351,7 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) {
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
   ASSERT_THAT(inferred_status_fail.status().error_message(),
-              HasSubstr("select function must take 2 parameters"));
+              HasSubstr("Select function must take 2 parameters"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) {
@@ -362,7 +362,7 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) {
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
   ASSERT_THAT(inferred_status_fail.status().error_message(),
-              HasSubstr("select function must have rank-0 PRED"));
+              HasSubstr("Select function must have rank-0 PRED"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) {
@@ -373,7 +373,7 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) {
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
   ASSERT_THAT(inferred_status_fail.status().error_message(),
-              HasSubstr("select function's first parameter"));
+              HasSubstr("Select function's first parameter"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) {
@@ -384,7 +384,7 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) {
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
   ASSERT_THAT(inferred_status_fail.status().error_message(),
-              HasSubstr("select function's second parameter"));
+              HasSubstr("Select function's second parameter"));
 }
 
 TEST_F(ShapeInferenceTest, Convolve) {
@@ -906,7 +906,7 @@ TEST_F(ShapeInferenceTest, ScalarDotVector) {
       ShapeInference::InferDotOpShape(f32_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("dot only supports rank"));
+              HasSubstr("Dot only supports rank"));
 }
 
 // 3D <dot> 2D: error
@@ -918,7 +918,7 @@ TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
       ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("batch and contracting dimension number mismatch"));
+              HasSubstr("Batch and contracting dimension number mismatch"));
 }
 
 // vector <dot> vector -> scalar
@@ -1024,7 +1024,7 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("must specify one contracting dimension for both "
+              HasSubstr("Must specify one contracting dimension for both "
                         "lhs and rhs"));
 }
 
@@ -1044,7 +1044,7 @@ TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimSizesFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("batch dimension numbers and sizes must match"));
+              HasSubstr("Batch dimension numbers and sizes must match"));
 }
 
 // BatchMatMul with different batch dimension numbers fails.
@@ -1063,7 +1063,7 @@ TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("batch dimension numbers must precede non-batch"));
+              HasSubstr("Batch dimension numbers must precede non-batch"));
 }
 
 // BatchMatMul with out-of-range dimension numbers fails.
@@ -1166,42 +1166,42 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
       BinaryOperation::BINOP_ADD, tensor, vec8, {});
   ASSERT_FALSE(inferred_status_error1.ok());
   ASSERT_THAT(inferred_status_error1.status().error_message(),
-              HasSubstr("automatic"));
+              HasSubstr("Automatic"));
 
   // broadcast_dimension out of bounds for tensor's rank
   auto inferred_status_error2 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, vec8, {3});
   ASSERT_FALSE(inferred_status_error2.ok());
   ASSERT_THAT(inferred_status_error2.status().error_message(),
-              ContainsRegex("broadcast dimension number .* too large"));
+              ContainsRegex("Broadcast dimension number .* too large"));
 
   // broadcast_dimension doesn't match corresponding dimension
   auto inferred_status_error3 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, vec8, {0});
   ASSERT_FALSE(inferred_status_error3.ok());
   ASSERT_THAT(inferred_status_error3.status().error_message(),
-              HasSubstr("broadcast dimension 0 mismatch"));
+              HasSubstr("Broadcast dimension 0 mismatch"));
 
   // broadcast_dimensions list too long
   auto inferred_status_error4 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, matrix8_4, {0, 1, 2});
   ASSERT_FALSE(inferred_status_error4.ok());
   ASSERT_THAT(inferred_status_error4.status().error_message(),
-              HasSubstr("size of broadcast_dimensions has to match"));
+              HasSubstr("broadcast_dimensions has to match"));
 
   // there's a dimension above the rank of the tensor
   auto inferred_status_error5 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, matrix8_4, {3, 0});
   ASSERT_FALSE(inferred_status_error5.ok());
   ASSERT_THAT(inferred_status_error5.status().error_message(),
-              ContainsRegex("broadcast dimension number .* too large"));
+              ContainsRegex("dimension number .* too large"));
 
   // broadcasting dimensions don't match in this order
   auto inferred_status_error6 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, matrix8_4, {2, 1});
   ASSERT_FALSE(inferred_status_error6.ok());
   ASSERT_THAT(inferred_status_error6.status().error_message(),
-              HasSubstr("broadcast dimension 0 mismatch"));
+              HasSubstr("dimension 0 mismatch"));
 
   // The following two tests make sure that broadcasting dimensions are listed
   // in a proper (strictly increasing) order, even if the lower-rank array
@@ -1210,13 +1210,13 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
       BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {0, 0});
   ASSERT_FALSE(inferred_status_error7.ok());
   ASSERT_THAT(inferred_status_error7.status().error_message(),
-              HasSubstr("broadcast dimensions order is wrong"));
+              HasSubstr("dimensions order is wrong"));
 
   auto inferred_status_error8 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {1, 0});
   ASSERT_FALSE(inferred_status_error8.ok());
   ASSERT_THAT(inferred_status_error8.status().error_message(),
-              HasSubstr("broadcast dimensions order is wrong"));
+              HasSubstr("dimensions order is wrong"));
 }
 
 // Tests for the while instruction with proper shapes.
@@ -1242,7 +1242,7 @@ TEST_F(ShapeInferenceTest, WhileWithBadShapes) {
       ShapeInference::InferWhileShape(bad_shape_1, body, result_shape);
   ASSERT_FALSE(inferred_status_error1.ok());
   ASSERT_THAT(inferred_status_error1.status().error_message(),
-              HasSubstr("condition must take 1 arguments"));
+              HasSubstr("Condition must take 1 arguments"));
 
   auto bad_shape_2 =
       ShapeUtil::MakeProgramShape({s32_, result_shape}, result_shape);
@@ -1250,14 +1250,14 @@ TEST_F(ShapeInferenceTest, WhileWithBadShapes) {
       ShapeInference::InferWhileShape(cond, bad_shape_2, result_shape);
   ASSERT_FALSE(inferred_status_error2.ok());
   ASSERT_THAT(inferred_status_error2.status().error_message(),
-              HasSubstr("body must take 1 arguments"));
+              HasSubstr("Body must take 1 arguments"));
 
   auto bad_shape_3 = ShapeUtil::MakeProgramShape({result_shape}, s32_);
   auto inferred_status_error3 =
       ShapeInference::InferWhileShape(bad_shape_3, body, result_shape);
   ASSERT_FALSE(inferred_status_error3.ok());
   ASSERT_THAT(inferred_status_error3.status().error_message(),
-              HasSubstr("condition must return a boolean"));
+              HasSubstr("Condition must return a boolean"));
 
   auto bad_shape_4 = ShapeUtil::MakeProgramShape({result_shape}, vector_32_);
   auto inferred_status_error4 =
@@ -1301,13 +1301,13 @@ TEST_F(ShapeInferenceTest, ConcatenateWithBadShapes) {
       ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/-1);
   ASSERT_FALSE(inferred_status_error2.ok());
   ASSERT_THAT(inferred_status_error2.status().error_message(),
-              HasSubstr("dimension to concatenate along out of bounds: -1"));
+              HasSubstr("dimension out of bounds: -1"));
 
   auto inferred_status_error3 =
       ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/1);
   ASSERT_FALSE(inferred_status_error3.ok());
   ASSERT_THAT(inferred_status_error3.status().error_message(),
-              HasSubstr("dimension to concatenate along out of bounds: 1"));
+              HasSubstr("dimension out of bounds: 1"));
 
   Shape tuple = ShapeUtil::MakeTupleShape({vector_32_});
   auto inferred_status_error4 = ShapeInference::InferConcatOpShape(
@@ -1315,21 +1315,20 @@ TEST_F(ShapeInferenceTest, ConcatenateWithBadShapes) {
   ASSERT_FALSE(inferred_status_error4.ok());
   ASSERT_THAT(
       inferred_status_error4.status().error_message(),
-      HasSubstr("Expected non-tuple argument for operand of concatenation."));
+      HasSubstr("Expected non-tuple argument for operand of concatenation"));
 
   const Shape vector_s32 = ShapeUtil::MakeShape(S32, {32});
   auto inferred_status_error5 = ShapeInference::InferConcatOpShape(
       {&vector_32_, &vector_s32}, /*dimension=*/0);
   ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_THAT(
-      inferred_status_error5.status().error_message(),
-      HasSubstr("cannot concatenate arrays with different element types"));
+  ASSERT_THAT(inferred_status_error5.status().error_message(),
+              HasSubstr("concatenate arrays with different element types"));
 
   auto inferred_status_error6 = ShapeInference::InferConcatOpShape(
       {&matrix_32_48_, &matrix_32_64_}, /*dimension=*/0);
   ASSERT_FALSE(inferred_status_error6.ok());
   ASSERT_THAT(inferred_status_error6.status().error_message(),
-              HasSubstr("cannot concatenate arrays that differ in "
+              HasSubstr("concatenate arrays that differ in "
                         "dimensions other than the one being "
                         "concatenated"));
 }
@@ -1467,7 +1466,7 @@ TEST_F(ShapeInferenceTest, Conditional) {
       ShapeUtil::MakeProgramShape({vector_64_}, f32_));
   EXPECT_FALSE(inferred_status_error0.ok());
   EXPECT_THAT(inferred_status_error0.status().error_message(),
-              HasSubstr("predicate must be a boolean"));
+              HasSubstr("Predicate must be a boolean"));
 
   auto inferred_status_error1 = ShapeInference::InferConditionalShape(
       pred_, ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_,
@@ -1530,11 +1529,17 @@ TEST_F(ShapeInferenceTest, BadSlice) {
 
 class GatherShapeInferenceTest : public ShapeInferenceTest {
  protected:
+  const Shape s64_scalar_ = ShapeUtil::MakeShape(S64, {});
+  const Shape s64_vector_5_ = ShapeUtil::MakeShape(S64, {5});
   const Shape s64_vector_32_ = ShapeUtil::MakeShape(S64, {32});
   const Shape s64_4d_tensor_10_9_8_7_1_ =
       ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1});
   const Shape s64_4d_tensor_10_9_8_7_5_ =
       ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 5});
+  const Shape s64_4d_tensor_5_10_9_7_6_ =
+      ShapeUtil::MakeShape(S64, {5, 10, 9, 7, 6});
+  const Shape s64_4d_tensor_10_9_5_7_6_ =
+      ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6});
   const Shape f32_5d_tensor_50_49_48_47_46_ =
       ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
   const Shape tuple_shape_ = ShapeUtil::MakeTupleShape(
@@ -1548,7 +1553,8 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGather) {
                                        HloInstruction::MakeGatherDimNumbers(
                                            /*output_window_dims=*/{0},
                                            /*elided_window_dims=*/{1},
-                                           /*gather_dims_to_operand_dims=*/{1}),
+                                           /*gather_dims_to_operand_dims=*/{1},
+                                           /*index_vector_dim=*/1),
                                        /*window_bounds=*/{64, 1}));
   EXPECT_TRUE(
       ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {64, 32})))
@@ -1562,7 +1568,8 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGatherV2) {
                                        HloInstruction::MakeGatherDimNumbers(
                                            /*output_window_dims=*/{1},
                                            /*elided_window_dims=*/{0},
-                                           /*gather_dims_to_operand_dims=*/{0}),
+                                           /*gather_dims_to_operand_dims=*/{0},
+                                           /*index_vector_dim=*/1),
                                        /*window_bounds=*/{1, 48}));
   EXPECT_TRUE(
       ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {32, 48})))
@@ -1576,7 +1583,8 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGatherNd) {
                                        HloInstruction::MakeGatherDimNumbers(
                                            /*output_window_dims=*/{4},
                                            /*elided_window_dims=*/{0},
-                                           /*gather_dims_to_operand_dims=*/{0}),
+                                           /*gather_dims_to_operand_dims=*/{0},
+                                           /*index_vector_dim=*/4),
                                        /*window_bounds=*/{1, 48}));
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 48})))
@@ -1591,7 +1599,8 @@ TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) {
           HloInstruction::MakeGatherDimNumbers(
               /*output_window_dims=*/{4, 5, 6, 7, 8},
               /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*index_vector_dim=*/4),
           /*window_bounds=*/{30, 29, 28, 27, 26}));
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
@@ -1599,12 +1608,85 @@ TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) {
       << ShapeUtil::HumanString(gather_shape);
 }
 
+TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) {
+  TF_ASSERT_OK_AND_ASSIGN(
+      Shape gather_shape,
+      ShapeInference::InferGatherShape(
+          f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
+          HloInstruction::MakeGatherDimNumbers(
+              /*output_window_dims=*/{4, 5, 6, 7, 8},
+              /*elided_window_dims=*/{},
+              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*index_vector_dim=*/2),
+          /*window_bounds=*/{30, 29, 28, 27, 26}));
+
+  EXPECT_TRUE(ShapeUtil::Equal(
+      gather_shape,
+      ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26})))
+      << ShapeUtil::HumanString(gather_shape);
+}
+
+TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) {
+  TF_ASSERT_OK_AND_ASSIGN(
+      Shape gather_shape,
+      ShapeInference::InferGatherShape(
+          f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_,
+          HloInstruction::MakeGatherDimNumbers(
+              /*output_window_dims=*/{4, 5, 6, 7, 8},
+              /*elided_window_dims=*/{},
+              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*index_vector_dim=*/0),
+          /*window_bounds=*/{30, 29, 28, 27, 26}));
+
+  EXPECT_TRUE(ShapeUtil::Equal(
+      gather_shape,
+      ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26})))
+      << ShapeUtil::HumanString(gather_shape);
+}
+
+TEST_F(GatherShapeInferenceTest, NoOutputGatherDims) {
+  // This is equivalent to a dynamic slice.
+  TF_ASSERT_OK_AND_ASSIGN(
+      Shape gather_shape,
+      ShapeInference::InferGatherShape(
+          f32_5d_tensor_50_49_48_47_46_, s64_vector_5_,
+          HloInstruction::MakeGatherDimNumbers(
+              /*output_window_dims=*/{0, 1, 2, 3, 4},
+              /*elided_window_dims=*/{},
+              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*index_vector_dim=*/0),
+          /*window_bounds=*/{30, 29, 28, 27, 26}));
+
+  EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
+                               ShapeUtil::MakeShape(F32, {30, 29, 28, 27, 26})))
+      << ShapeUtil::HumanString(gather_shape);
+}
+
+TEST_F(GatherShapeInferenceTest, ScalarGatherIndices) {
+  // The gather indices "tensor" is a scalar S here that's used to slice out
+  // [S,0,0,0,0]..[S,30,29,28,27] into a [30,29,28,27] shaped result.
+  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+                          ShapeInference::InferGatherShape(
+                              f32_5d_tensor_50_49_48_47_46_, s64_scalar_,
+                              HloInstruction::MakeGatherDimNumbers(
+                                  /*output_window_dims=*/{0, 1, 2, 3},
+                                  /*elided_window_dims=*/{0},
+                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*index_vector_dim=*/0),
+                              /*window_bounds=*/{1, 30, 29, 28, 27}));
+
+  EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
+                               ShapeUtil::MakeShape(F32, {30, 29, 28, 27})))
+      << ShapeUtil::HumanString(gather_shape);
+}
+
 TEST_F(GatherShapeInferenceTest, TupleShapedTensorInput) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       tuple_shape_, s64_vector_32_,
       HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0},
                                            /*elided_window_dims=*/{1},
-                                           /*gather_dims_to_operand_dims=*/{1}),
+                                           /*gather_dims_to_operand_dims=*/{1},
+                                           /*index_vector_dim=*/1),
       /*window_bounds=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1617,7 +1699,8 @@ TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) {
       s64_vector_32_, tuple_shape_,
       HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0},
                                            /*elided_window_dims=*/{1},
-                                           /*gather_dims_to_operand_dims=*/{1}),
+                                           /*gather_dims_to_operand_dims=*/{1},
+                                           /*index_vector_dim=*/0),
       /*window_bounds=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1625,25 +1708,13 @@ TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) {
       << statusor.status();
 }
 
-TEST_F(GatherShapeInferenceTest, ScalarGatherIndicesInput) {
-  StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
-      s64_vector_32_, s32_,
-      HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0},
-                                           /*elided_window_dims=*/{1},
-                                           /*gather_dims_to_operand_dims=*/{1}),
-      /*window_bounds=*/{64, 1});
-  ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Gather indices parameter must at least of rank 1"))
-      << statusor.status();
-}
-
 TEST_F(GatherShapeInferenceTest, FloatingPointGatherIndicesInput) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, vector_32_,
       HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0},
                                            /*elided_window_dims=*/{1},
-                                           /*gather_dims_to_operand_dims=*/{1}),
+                                           /*gather_dims_to_operand_dims=*/{1},
+                                           /*index_vector_dim=*/0),
       /*window_bounds=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1658,7 +1729,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 8, 7},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1674,7 +1746,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 7},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1690,7 +1763,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 99, 100, 101},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1698,6 +1772,22 @@ TEST_F(GatherShapeInferenceTest,
       << statusor.status();
 }
 
+TEST_F(GatherShapeInferenceTest,
+       InvalidGatherDimNumbers_WindowIndexBarelyOutOfBounds) {
+  StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+      f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
+      HloInstruction::MakeGatherDimNumbers(
+          /*output_window_dims=*/{4, 5, 6, 7, 9},
+          /*elided_window_dims=*/{},
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
+      /*window_bounds=*/{30, 29, 28, 27, 26});
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Window index 4 in gather op is out of bounds"))
+      << statusor.status();
+}
+
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_MismatchingElidedWindowDims) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
@@ -1705,7 +1795,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{4},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1722,7 +1813,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{0, 1, 2, 3, 19},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1738,7 +1830,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{0, 1, 2, 3, 3},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1755,15 +1848,15 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr(
-          "There must be exactly as many elements in "
-          "gather_dims_to_operand_dims "
-          "as there are elements in the last dimension of %gather_indices"))
+      HasSubstr("Gather op has 4 elements in gather_dims_to_operand_dims and "
+                "the bound of dimension index_vector_dim=4 of "
+                "gather_indices is 5. These two numbers must be equal."))
       << statusor.status();
 }
 
@@ -1774,7 +1867,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 7}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 7},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1791,7 +1885,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 3}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 3},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1808,7 +1903,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{2, 1},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{1, 1, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1822,7 +1918,8 @@ TEST_F(GatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsTooLarge) {
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7},
           /*elided_window_dims=*/{2},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 1, 300, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1838,7 +1935,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7, 8},
           /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
@@ -1855,7 +1953,8 @@ TEST_F(GatherShapeInferenceTest,
       HloInstruction::MakeGatherDimNumbers(
           /*output_window_dims=*/{4, 5, 6, 7},
           /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}),
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/4),
       /*window_bounds=*/{30, 29, 28, 26, 20});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
@@ -1864,5 +1963,22 @@ TEST_F(GatherShapeInferenceTest,
       << statusor.status();
 }
 
+TEST_F(GatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) {
+  StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+      f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
+      HloInstruction::MakeGatherDimNumbers(
+          /*output_window_dims=*/{4, 5, 6, 7, 8},
+          /*elided_window_dims=*/{},
+          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*index_vector_dim=*/32),
+      /*window_bounds=*/{30, 29, 28, 27, 26});
+
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Gather index leaf dimension must be within [0, "
+                        "rank(gather_indices) + 1)"))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 6e9986165f7eaf71a964b42b734a5ae5db5e45d7..fb3b5f06dad67b4305aed0305c9f6441e666db53 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 using ::tensorflow::strings::Appendf;
@@ -68,6 +66,8 @@ ShapedBuffer& ShapedBuffer::operator=(ShapedBuffer&& s) {
   return *this;
 }
 
+ShapedBuffer::~ShapedBuffer() {}
+
 void ShapedBuffer::clear() {
   for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
@@ -104,18 +104,6 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
   return out;
 }
 
-/* static */
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> ScopedShapedBuffer::MakeScoped(
-    ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) {
-  auto scoped_buffer = WrapUnique(new ScopedShapedBuffer(
-      shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(),
-      allocator, shaped_buffer->device_ordinal()));
-  scoped_buffer->buffers_ = shaped_buffer->buffers();
-  shaped_buffer->clear();
-
-  return std::move(scoped_buffer);
-}
-
 ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
                                        const Shape& on_device_shape,
                                        DeviceMemoryAllocator* allocator,
@@ -128,7 +116,25 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
                                        DeviceMemoryAllocator* allocator)
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}
 
+ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
+    : ShapedBuffer(static_cast<ShapedBuffer&&>(s)), allocator_(s.allocator_) {
+  // Null out s.allocator_ so it doesn't try to free anything in its destructor.
+  s.allocator_ = nullptr;
+}
+
+ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) {
+  *static_cast<ShapedBuffer*>(this) = std::move(static_cast<ShapedBuffer&>(s));
+  allocator_ = s.allocator_;
+  // Null out s.allocator_ so it doesn't try to free anything in its destructor.
+  s.allocator_ = nullptr;
+  return *this;
+}
+
 ScopedShapedBuffer::~ScopedShapedBuffer() {
+  // allocator_ will be null if we were moved-from.
+  if (allocator_ == nullptr) {
+    return;
+  }
   // Deallocate all non-null buffers. A buffer may appear in more than one spot
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
@@ -144,9 +150,9 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   }
 }
 
-std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
-  buffers_ = ShapeTree<perftools::gputools::DeviceMemoryBase>();
+ShapedBuffer ScopedShapedBuffer::release() {
+  ShapedBuffer shaped_buffer(static_cast<ShapedBuffer&&>(*this));
+  buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index b816df8385ef65b0b69ede1d6e65a1991b4bd7c6..e10fca9e9466c018f6cb4da2f5618e4db4977307 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -30,6 +30,8 @@ limitations under the License.
 
 namespace xla {
 
+class ScopedShapedBuffer;
+
 // Class which encapsulates a buffer or set of buffers containing data of a
 // particular XLA shape.
 class ShapedBuffer {
@@ -41,8 +43,19 @@ class ShapedBuffer {
   // determines the number of device allocations (DeviceMemoryBase) held by the
   // ShapedBuffer.
   ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-               const perftools::gputools::Platform* platform,
-               int device_ordinal);
+               const se::Platform* platform, int device_ordinal);
+
+  // Movable, but not copyable.
+  ShapedBuffer(ShapedBuffer&& s);
+  ShapedBuffer& operator=(ShapedBuffer&&);
+  ShapedBuffer(const ShapedBuffer&) = delete;
+  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
+
+  // Prevent (some forms of) accidental object slicing.
+  ShapedBuffer(const ScopedShapedBuffer&) = delete;
+  ShapedBuffer& operator=(const ScopedShapedBuffer&) = delete;
+
+  virtual ~ShapedBuffer();
 
   // Returns the shape of the on-host representation of the data held by this
   // ShapedBuffer.
@@ -52,48 +65,36 @@ class ShapedBuffer {
   // ShapedBuffer.
   const Shape& on_device_shape() const { return on_device_shape_; }
 
-  const perftools::gputools::Platform* platform() const { return platform_; }
+  const se::Platform* platform() const { return platform_; }
   int device_ordinal() const { return device_ordinal_; }
 
   // Return the root buffer of the shape (shape index {}).
-  const perftools::gputools::DeviceMemoryBase& root_buffer() const {
+  const se::DeviceMemoryBase& root_buffer() const {
     return buffer(/*index=*/{});
   }
 
   // Returns the buffer at the given shape index where index is defined as in
   // ShapeUtil::GetSubshape.
-  const perftools::gputools::DeviceMemoryBase& buffer(
-      const ShapeIndex& index) const {
+  const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const {
     return buffers_.element(index);
   }
 
   // Sets the device memory buffer at the given index.
-  void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
-                  const ShapeIndex& index) {
+  void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) {
     *buffers_.mutable_element(index) = buffer;
   }
 
   // Returns the underlying ShapeTree containing all the device addresses in the
   // ShapedBuffer.
-  const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
-    return buffers_;
-  }
-  ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
-    return buffers_;
-  }
+  const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
+  ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }
 
   // Set all device memory pointers in the object to null.
   void clear();
 
   string ToString() const;
 
-  ShapedBuffer(ShapedBuffer&& s);
-  ShapedBuffer& operator=(ShapedBuffer&&);
-
  protected:
-  ShapedBuffer(const ShapedBuffer&) = delete;
-  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
-
   // The shape of the data when represented on the host.
   Shape on_host_shape_;
 
@@ -101,13 +102,13 @@ class ShapedBuffer {
   Shape on_device_shape_;
 
   // The platform the memory is allocated on.
-  const perftools::gputools::Platform* platform_;
+  const se::Platform* platform_;
 
   // The device the memory is allocated on.
   int device_ordinal_;
 
   // The tree of device buffers. Its shape is on_device_shape().
-  ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
+  ShapeTree<se::DeviceMemoryBase> buffers_;
 };
 
 std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
@@ -115,41 +116,45 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
 // ShapedBuffer derived class which allocates all internal buffers on
 // construction and deallocates the memory when the object is
 // destructed.
+//
+// TODO(timshen): Remove inheritance between ScopedShapedBuffer and
+// ShapedBuffer.  There should never be a need to consider a ScopedShapedBuffer
+// as a ShapedBuffer, because in that case we should just be able to pass around
+// our ShapeTree<DeviceMemoryBase>.  Inheritance only adds complexity.  See
+// discussion in cl/192849370.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
-  // deallocation of the device memory held in the shaped buffer. All device
-  // memory pointers in the given ShapedBuffer are set to null.
-  static StatusOr<std::unique_ptr<ScopedShapedBuffer>> MakeScoped(
-      ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator);
-
-  // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index.
-  ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-                     DeviceMemoryAllocator* allocator, int device_ordinal);
+  // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
+  explicit ScopedShapedBuffer(const Shape& on_host_shape,
+                              const Shape& on_device_shape,
+                              DeviceMemoryAllocator* allocator,
+                              int device_ordinal);
 
   // Create a ScopedShapedBuffer by taking over the memory from the incoming
   // ShapedBuffer.
-  ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                     DeviceMemoryAllocator* allocator);
+  explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
+                              DeviceMemoryAllocator* allocator);
+
+  // Movable, but not copyable.
+  ScopedShapedBuffer(ScopedShapedBuffer&& s);
+  ScopedShapedBuffer& operator=(ScopedShapedBuffer&&);
+  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
+  ScopedShapedBuffer& operator=(const ScopedShapedBuffer&) = delete;
+
+  // All buffers in the shape are deallocated on destruction.
+  ~ScopedShapedBuffer() override;
 
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
 
-  // Release all device memory owned by this ScopedShapedBuffer and
-  // return the device memory pointers in the form of a
-  // ShapedBuffer. The returned ShapedBuffer takes over the memory
-  // from the ScopedShapedBuffer. The resulting ScopedShapedBuffer can
-  // only be destroyed.
-  std::unique_ptr<ShapedBuffer> release();
-
-  // All buffers in the shape are deallocated on destruction.
-  virtual ~ScopedShapedBuffer();
+  // Releases all device memory owned by this ScopedShapedBuffer and returns the
+  // device memory pointers in the form of a ShapedBuffer. The returned
+  // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The
+  // resulting ScopedShapedBuffer can only be destroyed.
+  ShapedBuffer release();
 
  protected:
-  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
-  void operator=(const ScopedShapedBuffer&) = delete;
-
   DeviceMemoryAllocator* allocator_;
 };
 
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 2f36e2b16e0f2eed10aef811dd3cceeba6a5b8a9..8b71a415091f028b3167cddb2583754e72ba17c8 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -25,24 +25,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 /* static */ tensorflow::mutex
     TransferManager::platform_transfer_manager_mutex_(
         tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      TransferManager::State>*
+/* static */ std::map<se::Platform::Id, TransferManager::State>*
 TransferManager::GetPlatformTransferManagers() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, TransferManager::State>;
+  static auto* r = new std::map<se::Platform::Id, TransferManager::State>;
   return r;
 }
 
 Status TransferManager::TransferArrayToDevice(
-    perftools::gputools::StreamExecutor* executor, const Literal& literal,
-    const perftools::gputools::DeviceMemoryBase& dest) {
+    se::StreamExecutor* executor, const Literal& literal,
+    const se::DeviceMemoryBase& dest) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
   TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
       << "On-device representation of "
@@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice(
 }
 
 StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
-    perftools::gputools::StreamExecutor* executor, const Shape& shape,
-    const perftools::gputools::DeviceMemoryBase& source) {
+    se::StreamExecutor* executor, const Shape& shape,
+    const se::DeviceMemoryBase& source) {
   TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
       << "Shape " << ShapeUtil::HumanString(shape)
       << " has a differently shaped representation on-device: "
@@ -112,8 +108,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
 }
 
 Status TransferManager::WriteTupleIndexTables(
-    perftools::gputools::StreamExecutor* executor,
-    const ShapedBuffer& device_buffer) {
+    se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
   VLOG(2) << "Writing tuple index tables for " << device_buffer;
 
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
@@ -180,7 +175,7 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
+StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
     const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
@@ -192,31 +187,21 @@ StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
   const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
   TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));
 
-  auto shaped_buffer = WrapUnique(new ShapedBuffer(
-      on_host_shape, on_device_shape, allocator->platform(), device_ordinal));
+  ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape, allocator,
+                                   device_ordinal);
 
   // Allocate an appropriate sized buffer for each element in the shape
   // including the tuple pointer arrays.
-  for (auto& pair : shaped_buffer->buffers()) {
+  for (auto& pair : shaped_buffer.buffers()) {
     const ShapeIndex& index = pair.first;
     se::DeviceMemoryBase& memory_base = pair.second;
     const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
     TF_ASSIGN_OR_RETURN(memory_base,
-                        allocator->Allocate(shaped_buffer->device_ordinal(),
+                        allocator->Allocate(shaped_buffer.device_ordinal(),
                                             GetByteSizeRequirement(subshape)));
   }
 
   return std::move(shaped_buffer);
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape,
-                                            DeviceMemoryAllocator* allocator,
-                                            int device_ordinal) {
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> unscoped_buffer,
-      AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
-  return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator);
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 9f2b5c4aecf0b52f610171e0c2755de577b2bd9e..d82b4f0f81b5da38c1caf80bddefa0d3f7842463 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -42,7 +42,7 @@ class TransferManager {
   virtual ~TransferManager() {}
 
   // Returns the ID of the platform that this transfer manager acts on.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Returns the shape of the on-device representation for the given shape on
   // the host. This is intended for use with ShapedBuffer where buffers are
@@ -58,48 +58,45 @@ class TransferManager {
   // DeviceShape(literal_shape) must be compatible, but need not have the same
   // layout.
   virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const ShapedBuffer& device_buffer) = 0;
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) = 0;
 
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
   // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
   // but need not have the same layout
-  virtual Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      const ShapedBuffer& device_buffer) = 0;
+  virtual Status TransferLiteralToDevice(se::StreamExecutor* executor,
+                                         const Literal& literal,
+                                         const ShapedBuffer& device_buffer) = 0;
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
   // transfer an array at a known address.
-  Status TransferArrayToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      const perftools::gputools::DeviceMemoryBase& dest);
+  Status TransferArrayToDevice(se::StreamExecutor* executor,
+                               const Literal& literal,
+                               const se::DeviceMemoryBase& dest);
   StatusOr<std::unique_ptr<Literal>> TransferArrayFromDevice(
-      perftools::gputools::StreamExecutor* executor, const Shape& shape,
-      const perftools::gputools::DeviceMemoryBase& source);
+      se::StreamExecutor* executor, const Shape& shape,
+      const se::DeviceMemoryBase& source);
 
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralToInfeed(
-      perftools::gputools::StreamExecutor* executor,
-      const Literal& literal) = 0;
+  virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor,
+                                         const Literal& literal) = 0;
 
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) = 0;
+  virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                            const Shape& literal_shape,
+                                            Literal* literal) = 0;
 
   // Resets the devices associated with this transfer manager.
   virtual Status ResetDevices(
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executor) = 0;
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executor) = 0;
 
   // Given an allocated ShapedBuffer, constructs the tuple index table(s) in
   // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the
   // ShapedBuffer is array-shaped this method does nothing.
-  Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor,
+  Status WriteTupleIndexTables(se::StreamExecutor* executor,
                                const ShapedBuffer& device_buffer);
 
   // Determines the byte size requirement for the given shape on the underlying
@@ -107,13 +104,10 @@ class TransferManager {
   // region for a host-to-device transfer.
   virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0;
 
-  // Allocate a ShapedBuffer which can hold data with the given on-host
+  // Allocates a ScopedShapedBuffer which can hold data with the given on-host
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
-  StatusOr<std::unique_ptr<ShapedBuffer>> AllocateShapedBuffer(
-      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
-      int device_ordinal);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> AllocateScopedShapedBuffer(
+  StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
       const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
       int device_ordinal);
 
@@ -127,13 +121,13 @@ class TransferManager {
   // Precondition: a platform kind must not be registered more than once.
   typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
   static void RegisterTransferManager(
-      perftools::gputools::Platform::Id platform_id,
+      se::Platform::Id platform_id,
       TransferManagerCreationFunction transfer_manager);
 
   // Returns the transfer manager singleton pointer if it is available for the
   // given platform, or an error status if it is not.
   static StatusOr<TransferManager*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+      const se::Platform* platform);
 
  protected:
   // Transfer a memory block of the given size from 'source' buffer to the
@@ -143,35 +137,32 @@ class TransferManager {
   //
   // source is the source data that must be in the target-dependent layout that
   // the Infeed HLO used in the computation expects.
-  virtual Status TransferBufferToInfeed(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source) = 0;
+  virtual Status TransferBufferToInfeed(se::StreamExecutor* executor,
+                                        int64 size, const void* source) = 0;
 
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
   // size is the size to transfer to destination in bytes.
-  virtual Status TransferBufferFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source, int64 size,
-      void* destination);
+  virtual Status TransferBufferFromDevice(se::StreamExecutor* executor,
+                                          const se::DeviceMemoryBase& source,
+                                          int64 size, void* destination);
 
   // Transfer a memory block of the given size from 'source' buffer to the given
   // destination of the device.
   //
   // size is the size to transfer from source in bytes.
-  virtual Status TransferBufferToDevice(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source, perftools::gputools::DeviceMemoryBase* destination);
+  virtual Status TransferBufferToDevice(se::StreamExecutor* executor,
+                                        int64 size, const void* source,
+                                        se::DeviceMemoryBase* destination);
 
   // Writes the given device-memory pointers in 'elements' to the given region
   // to construct a tuple index table in the platform-specific tuple
   // representation.
   virtual Status WriteSingleTupleIndexTable(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0;
+      se::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
+      const Shape& shape, se::DeviceMemoryBase* region) = 0;
 
  private:
   // The mutex that guards the platform-to-transfer manager map.
@@ -186,8 +177,7 @@ class TransferManager {
   };
 
   // Map from platform kind to transfer manager singleton.
-  static std::map<perftools::gputools::Platform::Id, State>*
-  GetPlatformTransferManagers();
+  static std::map<se::Platform::Id, State>* GetPlatformTransferManagers();
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 83185ac49e9b7c386d10d1cbc4e20dcdfdfd6cae..3efd38ce0daa3e3f3398b32463019df6cd10a009 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -159,6 +159,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   auto new_conv = HloInstruction::CreateConvolve(
       convolution.shape(), new_lhs, new_rhs, convolution.window(), new_dnums);
+  convolution.SetupDerivedInstruction(new_conv.get());
   TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
       &convolution, std::move(new_conv)));
 
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index caa1a111ad880b9dee62c1c94e32e8275c196fbf..c7c41603459189d11a0399740bf3df2507e621a5 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -71,10 +71,10 @@ TEST_F(TransposeFoldingTest, FoldDotTranspose) {
       HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
                                 /*rhs=*/transpose_y, dot_dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(dot));
-  FoldTranspose(&module);
+      module->AddEntryComputation(builder.Build(dot));
+  FoldTranspose(module.get());
 
   // Instructions after folding: x, y, and the fusion.
   std::unordered_set<HloInstruction*> instruction_set(
@@ -114,10 +114,10 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
       ShapeUtil::MakeShape(F32, {1, 3}),
       /*lhs=*/transpose0, /*rhs=*/transpose1, dot_dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(dot));
-  FoldTranspose(&module);
+      module->AddEntryComputation(builder.Build(dot));
+  FoldTranspose(module.get());
 
   for (auto* instruction : entry_computation->instructions()) {
     if (instruction->opcode() == HloOpcode::kFusion) {
@@ -149,10 +149,10 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary(
       add->shape(), HloOpcode::kMultiply, add, sub));
 
-  HloModule module("fuse_with_constant_operands");
+  auto module = CreateNewModule("fuse_with_constant_operands");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(mul));
-  HloInstruction* call = module.OutlineExpressionFromComputation(
+      module->AddEntryComputation(builder.Build(mul));
+  HloInstruction* call = module->OutlineExpressionFromComputation(
       {add, sub, mul}, "", entry_computation);
   EXPECT_EQ(call, entry_computation->root_instruction());
   HloComputation* callee_computation = call->to_apply();
@@ -182,14 +182,14 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
       HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
                                 /*rhs=*/transpose_y, dot_dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(dot));
+      module->AddEntryComputation(builder.Build(dot));
 
-  HloInstruction* call = module.OutlineExpressionFromComputation(
+  HloInstruction* call = module->OutlineExpressionFromComputation(
       {transpose_y, dot}, "outlined", entry_computation);
 
-  FoldTranspose(&module);
+  FoldTranspose(module.get());
 
   // Instructions after folding: x, y, and the fusion.
   std::unordered_set<HloInstruction*> instruction_set(
@@ -240,10 +240,10 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), x, transpose_y, window, dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(conv));
-  FoldTranspose(&module);
+      module->AddEntryComputation(builder.Build(conv));
+  FoldTranspose(module.get());
 
   // Instructions after folding: x, y, and the convolution.
   std::unordered_set<HloInstruction*> instruction_set(
@@ -293,10 +293,10 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) {
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), x, transpose_y, window, dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(conv));
-  FoldTranspose(&module);
+      module->AddEntryComputation(builder.Build(conv));
+  FoldTranspose(module.get());
 
   // Instructions after folding: x, y, and the convolution.
   std::unordered_set<HloInstruction*> instruction_set(
@@ -351,10 +351,10 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), transpose_x, y, window, dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(conv));
-  FoldTranspose(&module);
+      module->AddEntryComputation(builder.Build(conv));
+  FoldTranspose(module.get());
 
   // Instructions after folding: x, y, and the convolution.
   std::unordered_set<HloInstruction*> instruction_set(
@@ -415,10 +415,10 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), transpose_x, y, window, dnums));
 
-  HloModule module("test_module");
+  auto module = CreateNewModule("test_module");
   HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build(conv));
-  FoldTranspose(&module);
+      module->AddEntryComputation(builder.Build(conv));
+  FoldTranspose(module.get());
 
   // Instructions after folding: x, y, and the convolution.
   std::unordered_set<HloInstruction*> instruction_set(
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 4a55e4095aa92cbdcd1bcb585dc851b2c5e9a32c..0f16a592b68e20f5dbd1e4655ad5720ecce5a7bd 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -49,6 +49,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kAbs;
     case UNOP_CEIL:
       return HloOpcode::kCeil;
+    case UNOP_CLZ:
+      return HloOpcode::kClz;
     case UNOP_COS:
       return HloOpcode::kCos;
     case UNOP_EXP:
@@ -226,7 +228,8 @@ StatusOr<ComputationDataHandle> UserComputation::AddParameterInstruction(
   return handle;
 }
 
-Status UserComputation::AddSendInstruction(const SendRequest& send_request) {
+StatusOr<ComputationDataHandle> UserComputation::AddSendInstruction(
+    const SendRequest& send_request) {
   tensorflow::mutex_lock lock(mutex_);
 
   // Check if the operand of the instruction is valid.
@@ -244,7 +247,7 @@ Status UserComputation::AddSendInstruction(const SendRequest& send_request) {
   VLOG(1) << "AddSendInstruction (" << GetVersionedHandleInternal()
           << "), data handle " << handle.handle() << ": "
           << send_request.ShortDebugString();
-  return Status::OK();
+  return handle;
 }
 
 StatusOr<ComputationDataHandle> UserComputation::AddRecvInstruction(
@@ -1283,8 +1286,8 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
     TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
   }
 
-  if (tensorflow::StringPiece(custom_call_request.call_target_name())
-          .starts_with("$")) {
+  if (tensorflow::str_util::StartsWith(custom_call_request.call_target_name(),
+                                       "$")) {
     return InvalidArgument(
         "Invalid custom_call_target \"%s\": Call targets that start with '$' "
         "are reserved for internal use.",
@@ -3314,20 +3317,23 @@ void ComputationLowerer::Visit(
       HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs());
       HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs());
       auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop());
-
-      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
-        if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast() &&
+          !ShapeUtil::IsTuple(request.output_shape())) {
+        if (!ShapeUtil::IsTuple(lhs->shape()) &&
+            !ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
           // lhs side is being implicitly broadcast. Change to explicit.
           lhs =
               ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape());
         }
 
-        if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
+        if (!ShapeUtil::IsTuple(rhs->shape()) &&
+            !ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
           rhs =
               ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape());
         }
 
-        if (!ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) {
+        if (!ShapeUtil::IsTuple(ehs->shape()) &&
+            !ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) {
           ehs =
               ImplicitBroadcastToExplicitBroadcast(ehs, request.output_shape());
         }
@@ -3487,7 +3493,6 @@ void ComputationLowerer::Visit(
       HloInstruction* operand = lookup_instruction(trace_request.operand());
       hlo_instruction = add_instruction(
           HloInstruction::CreateTrace(trace_request.tag(), operand));
-      operand->set_tracing(hlo_instruction);
       break;
     }
 
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index fd5a2ace9bacf66727dc91b6d96305424771a99b..5544c868fe905c1ca7e6cab32738440add2e3b4f 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -236,7 +236,8 @@ class UserComputation {
       const UserComputation& false_computation);
 
   // Enqueues a Send instruction onto this user computation.
-  Status AddSendInstruction(const SendRequest& send_request);
+  StatusOr<ComputationDataHandle> AddSendInstruction(
+      const SendRequest& send_request);
 
   // Enqueues a Recv instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddRecvInstruction(
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index a5f9b01f011ce04f1114c74391a967c62f015221..3ef0cdff6751258e4489ce350deb0931fdf69ef9 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -106,20 +106,12 @@ static bool NotWorthHoistingIndividually(const HloInstruction& instruction) {
     case HloOpcode::kBitcast:
     case HloOpcode::kBroadcast:
     case HloOpcode::kConstant:
+    case HloOpcode::kReshape:
     case HloOpcode::kReverse:
     case HloOpcode::kSlice:
+    case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
       return true;
-
-    case HloOpcode::kTranspose:
-      return ShapeUtil::TransposeIsBitcast(
-          /*input_shape=*/instruction.operand(0)->shape(),
-          /*output_shape=*/instruction.shape(), instruction.dimensions());
-
-    case HloOpcode::kReshape:
-      return ShapeUtil::ReshapeIsBitcast(
-          /*input_shape=*/instruction.operand(0)->shape(),
-          /*output_shape=*/instruction.shape());
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 981de9b2200a9ae8938db21299580f510834d2f0..ec05a74e286c89dd8db5ae07580e461938d7c087 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -212,7 +213,7 @@ static optional<int64> GetLoopTripCount(HloInstruction* while_op) {
   // Now that we know the index of the induction variable, we can we can try to
   // compute how many times the loop executes.  Start by computing the induction
   // variable's initial value.
-  HloEvaluator evaluator;
+  HloEvaluator evaluator(/*max_loop_iterations=*/0);
   auto* while_init = while_op->mutable_operand(0);
   auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx);
   StatusOr<std::unique_ptr<Literal>> indvar_init_result =
@@ -605,6 +606,78 @@ static StatusOr<bool> TryRemoveWhileLoop(HloInstruction* while_op) {
   return false;
 }
 
+static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
+  auto while_init = while_op->operand(0);
+  if (while_init->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  auto while_body = while_op->while_body();
+  auto while_body_root = while_body->root_instruction();
+  if (while_body_root->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  auto while_body_param = while_body->parameter_instruction(0);
+  const HloInstruction::InstructionVector& root_operands =
+      while_body_root->operands();
+
+  // Find the loop invariant tuple elements with scalar constant init value and
+  // build a map from the tuple element index to the constant value. Limit this
+  // to scalar constant values because propagating array constants can regress
+  // performance by forcing us to copy constants.
+  tensorflow::gtl::FlatMap<int, const HloInstruction*> index_to_constant;
+  for (int i = 0; i < root_operands.size(); i++) {
+    HloInstruction* instr = root_operands[i];
+    if (instr->opcode() == HloOpcode::kGetTupleElement &&
+        instr->tuple_index() == i && instr->operand(0) == while_body_param &&
+        ShapeUtil::IsScalar(instr->shape())) {
+      auto tuple_element = while_init->operand(i);
+      if (tuple_element->IsConstant()) {
+        VLOG(3) << "Found loop invariant tuple element " << i << " "
+                << tuple_element->ToString();
+        index_to_constant[i] = tuple_element;
+      }
+    }
+  }
+
+  if (index_to_constant.empty()) {
+    return false;
+  }
+
+  // Replace the use of each constant tuple element in the loop_condition and
+  // loop_body with the corresponding constant value.
+  auto propagate_constant = [&](HloComputation* computation) -> StatusOr<bool> {
+    HloInstruction* param = computation->parameter_instruction(0);
+    bool changed = false;
+    for (auto instr : param->users()) {
+      // Since only a while-loop with a tuple result reaches here, we can safely
+      // assume that `param` is a tuple and the first operand of the
+      // GetTupleElement instruction is a use of `param`.
+      if (instr->opcode() == HloOpcode::kGetTupleElement) {
+        VLOG(3) << "tuple index " << instr->tuple_index() << " "
+                << instr->ToString();
+        auto iter = index_to_constant.find(instr->tuple_index());
+        if (iter != index_to_constant.end()) {
+          const HloInstruction* hlo_constant = (*iter).second;
+          VLOG(3) << "Replace use of " << instr->ToString() << " with "
+                  << hlo_constant->ToString();
+          TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(
+              computation->AddInstruction(hlo_constant->Clone())));
+          changed = true;
+        }
+      }
+    }
+    return changed;
+  };
+
+  TF_ASSIGN_OR_RETURN(bool changed_cond,
+                      propagate_constant(while_op->while_condition()));
+  TF_ASSIGN_OR_RETURN(bool changed_body, propagate_constant(while_body));
+
+  return changed_cond || changed_body;
+}
+
 StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
   XLA_VLOG_LINES(3,
                  "WhileLoopSimplifier::Run(), before:\n" + module->ToString());
@@ -635,7 +708,11 @@ StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
       continue;
     }
 
-    StatusOr<bool> result = TryRemoveWhileLoop(while_op);
+    StatusOr<bool> result = TryPropagateConstant(while_op);
+    TF_RETURN_IF_ERROR(result.status());
+    changed |= result.ValueOrDie();
+
+    result = TryRemoveWhileLoop(while_op);
     TF_RETURN_IF_ERROR(result.status());
     if (result.ValueOrDie()) {
       changed = true;
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index d3d55634c97bbdf3f81321d8089bb808c411340b..3d3e1d60f294c3a2574513c1c2f071805a341ad1 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -25,7 +25,7 @@ namespace xla {
 // HLO pass that makes the following transformations on while loops:
 //
 //  - A while loop with static trip count of 0 is deleted.
-//  - A while loops with static trip count of 1 is replaced by its body (sans
+//  - A while loop with static trip count of 1 is replaced by its body (sans
 //    loop).
 //  - Elements of a while loop's tuple that the loop doesn't use are removed
 //    from the tuple.
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index c5183f8d3aee99696ed4114c3f7e451888222137..619e87caa5b6d0f6ec3c3b1489b0d4f50ef29963 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace {
@@ -26,112 +27,140 @@ namespace {
 namespace op = xla::testing::opcode_matchers;
 
 class WhileLoopSimplifierTest : public HloVerifiedTestBase {
- public:
-  // Makes a computation that contains a loop that runs num_iters times.
-  HloComputation* MakeSimpleLoop(int num_iters, HloModule* module);
-
-  // Makes a computation which has one parameter, of the given shape, and always
-  // returns PRED[]{true}.  This is useful as a dummy loop condition.
-  HloComputation* MakeAlwaysTrueComputation(const Shape& param_shape,
-                                            HloModule* module);
+ protected:
+  // Makes an HloModule that contains a loop with `num_iters` iteration.
+  void MakeModuleWithSimpleLoop(int num_iters);
+
+  // Similar to MakeModuleWithSimpleLoop except that the loop bound is passed to
+  // the loop-condition through an element of a tuple which is the
+  // loop-condition parameter.
+  void MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters);
 };
 
-HloComputation* WhileLoopSimplifierTest::MakeSimpleLoop(int num_iters,
-                                                        HloModule* module) {
-  HloComputation::Builder builder(TestName());
-
-  auto loop_iter_init = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
-  auto loop_data_init = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR1<int32>({0, 1, 2})));
-  auto loop_init = builder.AddInstruction(
-      HloInstruction::CreateTuple({loop_iter_init, loop_data_init}));
-
-  HloComputation* condition;
-  {
-    HloComputation::Builder cond_builder(TestName() + ".condition");
-    auto loop_var = cond_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    auto loop_induction_var =
-        cond_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            ShapeUtil::MakeShape(S32, {}), loop_var, 0));
-    auto limit = cond_builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<int32>(42 + num_iters)));
-    cond_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, loop_induction_var,
-        limit));
-    condition = module->AddEmbeddedComputation(cond_builder.Build());
+void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
+  string hlo_string_template = R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply)
   }
-
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    auto loop_var = body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    auto loop_induction_var =
-        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            ShapeUtil::MakeShape(S32, {}), loop_var, 0));
-    auto new_loop_induction_var =
-        body_builder.AddInstruction(HloInstruction::CreateBinary(
-            loop_induction_var->shape(), HloOpcode::kAdd, loop_induction_var,
-            body_builder.AddInstruction(
-                HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)))));
-    auto loop_data =
-        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            loop_data_init->shape(), loop_var, 1));
-    auto new_loop_data =
-        body_builder.AddInstruction(HloInstruction::CreateBinary(
-            loop_data_init->shape(), HloOpcode::kMultiply, loop_data,
-            loop_data));
-    body_builder.AddInstruction(
-        HloInstruction::CreateTuple({new_loop_induction_var, new_loop_data}));
-    body = module->AddEmbeddedComputation(body_builder.Build());
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant({{LOOP_BOUND}})
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
   }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(42)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4)
+    ROOT while = (s32[], s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+  }
+  )";
 
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-
-  return module->AddEntryComputation(builder.Build());
+  string hlo_string = tensorflow::str_util::StringReplace(
+      hlo_string_template, "{{LOOP_BOUND}}",
+      tensorflow::strings::StrCat(42 + num_iters),
+      /*replace_all=*/true);
+  ParseAndVerifyModule(hlo_string);
 }
 
-HloComputation* WhileLoopSimplifierTest::MakeAlwaysTrueComputation(
-    const Shape& param_shape, HloModule* module) {
-  HloComputation::Builder builder(TestName() + ".always_true");
-  builder.AddInstruction(
-      HloInstruction::CreateParameter(0, param_shape, "param"));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
-  return module->AddEmbeddedComputation(builder.Build());
+void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
+    int num_iters) {
+  string hlo_string_template = R"(
+  HloModule SimpleLoopWithIndirectLoopBound
+  SimpleLoopWithIndirectLoopBound.body {
+    loop_var.1 = (s32[], s32[3]{0}, s32[]) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    limit = s32[] get-tuple-element(loop_var.1), index=2
+    ROOT tuple = (s32[], s32[3]{0}, s32[]) tuple(add, multiply, limit)
+  }
+  SimpleLoopWithIndirectLoopBound.condition {
+    loop_var.2 = (s32[], s32[3]{0}, s32[]) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=2
+    ROOT less-than = pred[] less-than(get-tuple-element.3, get-tuple-element.4)
+  }
+  ENTRY SimpleLoopWithIndirectLoopBound {
+    constant.3 = s32[] constant(42)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    constant.2 = s32[] constant({{LOOP_BOUND}})
+    tuple.1 = (s32[], s32[3]{0}, s32[]) tuple(constant.3, constant.4,
+      constant.2)
+    ROOT while = (s32[], s32[3]{0}, s32[]) while(tuple.1),
+      condition=SimpleLoopWithIndirectLoopBound.condition,
+      body=SimpleLoopWithIndirectLoopBound.body
+  }
+  )";
+
+  string hlo_string = tensorflow::str_util::StringReplace(
+      hlo_string_template, "{{LOOP_BOUND}}",
+      tensorflow::strings::StrCat(42 + num_iters),
+      /*replace_all=*/true);
+  ParseAndVerifyModule(hlo_string);
 }
 
-TEST_F(WhileLoopSimplifierTest, WhileLoopWithZeroIterations) {
-  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/0, &module());
-  ASSERT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
+TEST_F(WhileLoopSimplifierTest, LoopWithZeroIterationSimiplified) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/0);
+  HloModule* the_module = &module();
+  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
               op::Tuple(op::Constant(), op::Constant()));
 }
 
-TEST_F(WhileLoopSimplifierTest, WhileLoopWithOneIteration) {
-  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
-  ASSERT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
+TEST_F(WhileLoopSimplifierTest,
+       LoopWithZeroIterationTupleElementLoopBoundSimplified) {
+  MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0);
+  HloModule* the_module = &module();
+  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+              op::Tuple(op::Constant(), op::Constant(), op::Constant()));
+}
+
+TEST_F(WhileLoopSimplifierTest, LoopWithOneIterationSimplified) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloModule* the_module = &module();
+  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
               op::Tuple(op::Add(), op::Multiply()));
 }
 
-TEST_F(WhileLoopSimplifierTest, WhileLoopWithTwoIterations) {
-  MakeSimpleLoop(/*num_iters=*/2, &module());
+TEST_F(WhileLoopSimplifierTest,
+       LoopWithOneIterationTupleELementLoopBoundSimplified) {
+  MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1);
+  HloModule* the_module = &module();
+  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+              op::Tuple(op::Add(), op::Multiply(), op::Constant()));
+}
+
+TEST_F(WhileLoopSimplifierTest, LoopWithTwoIterationsNotSimplified) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/2);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
-TEST_F(WhileLoopSimplifierTest, WhileLoopWithControlDependency) {
-  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+TEST_F(WhileLoopSimplifierTest,
+       LoopWithControlDependencySimplifiedDependencyPreserved) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloModule* the_module = &module();
+  HloComputation* computation = the_module->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* true_op = while_op->while_body()->AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
   TF_ASSERT_OK(true_op->AddControlDependencyTo(
       while_op->while_body()->root_instruction()));
-  ASSERT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
   EXPECT_THAT(computation->root_instruction()->control_predecessors(),
               ElementsAre(op::Constant()))
       << computation->ToString();
@@ -139,8 +168,10 @@ TEST_F(WhileLoopSimplifierTest, WhileLoopWithControlDependency) {
 
 // Loops that contain send/recv nodes can't be simplified; the loop structure
 // around send/recv nodes must be preserved.
-TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsSend) {
-  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloModule* the_module = &module();
+  HloComputation* computation = the_module->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
@@ -149,11 +180,13 @@ TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsSend) {
           HloInstruction::CreateConstant(Literal::CreateR0<bool>(true))),
       /*channel_id=*/0));
   while_body->AddInstruction(HloInstruction::CreateSendDone(send));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
 }
 
-TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsRecv) {
-  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloModule* the_module = &module();
+  HloComputation* computation = the_module->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
@@ -161,247 +194,217 @@ TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsRecv) {
       HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}),
                                  /*channel_id=*/0));
   while_body->AddInstruction(HloInstruction::CreateRecvDone(recv));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
 }
 
 // The limitation on not being able to simplify loops that contain infeeds (and
 // other non-removable instructions) isn't fundamental -- it just stems from the
 // fact that our infrastructure sees simplifying such a loop as tantamount to
 // removing the non-removable instruction.
-TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) {
-  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+TEST_F(WhileLoopSimplifierTest, LoopWithInfeedNotSimplified) {
+  MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloModule* the_module = &module();
+  HloComputation* computation = the_module->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
   while_body->AddInstruction(
       HloInstruction::CreateInfeed(ShapeUtil::MakeShape(F32, {1}), "config"));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
 }
 
-// Check that we don't crash when given a loop whose shape is not a tuple.
-TEST_F(WhileLoopSimplifierTest, IgnoreNonTupleShapedLoop) {
-  HloComputation::Builder builder(TestName());
-  auto loop_init = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
-
-  HloComputation* condition;
-  {
-    HloComputation::Builder cond_builder(TestName() + ".condition");
-    auto param = cond_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    cond_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param,
-        cond_builder.AddInstruction(
-            HloInstruction::CreateConstant(Literal::CreateR0<int32>(100)))));
-    condition = module().AddEmbeddedComputation(cond_builder.Build());
+// A non-tuple shaped loop shouldn't be simplified or crash the compiler.
+TEST_F(WhileLoopSimplifierTest, NonTupleShapedLoopNotSimplified) {
+  const string hlo_string = R"(
+ HloModule NonTupleShapedLoop
+ NonTupleShapedLoop.body {
+   loop_var.1 = s32[] parameter(0)
+   constant.1 = s32[] constant(-1)
+   ROOT add = s32[] add(s32[] loop_var.1, s32[] constant.1)
+ }
+ NonTupleShapedLoop.condition {
+   loop_var = s32[] parameter(0)
+   constant = s32[] constant(100)
+   ROOT less-than = pred[] less-than(s32[] loop_var, s32[] constant)
+ }
+ ENTRY INonTupleShapedLoop {
+   constant.2 = s32[] constant(42)
+   ROOT while = s32[] while(s32[] constant.2),
+     condition=NonTupleShapedLoop.condition,
+     body=NonTupleShapedLoop.body
   }
+  )";
 
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    auto param = body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    body_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, param,
-        body_builder.AddInstruction(
-            HloInstruction::CreateConstant(Literal::CreateR0<int32>(-1)))));
-    body = module().AddEmbeddedComputation(body_builder.Build());
-  }
-
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-
-  module().AddEntryComputation(builder.Build());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
-// Construct a loop where we swap the tuple elements in each iteration.
-// Although the tuple elements aren't used in the loop, we don't eliminate them,
-// because the swapping side-effect is visible to users of the loop.
-TEST_F(WhileLoopSimplifierTest, SwapTupleIndices) {
-  HloComputation::Builder builder(TestName());
-  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(1))),
-  }));
-
-  HloComputation* condition =
-      MakeAlwaysTrueComputation(loop_init->shape(), &module());
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    auto param = body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-    body_builder.AddInstruction(HloInstruction::CreateTuple({
-        body_builder.AddInstruction(
-            HloInstruction::CreateGetTupleElement(scalar_s32, param, 1)),
-        body_builder.AddInstruction(
-            HloInstruction::CreateGetTupleElement(scalar_s32, param, 0)),
-    }));
-    body = module().AddEmbeddedComputation(body_builder.Build());
+// A while loop that does nothing else besides swapping tuple elements
+// can't be simplified as the result of the swapping is visible to users of the
+// loop.
+TEST_F(WhileLoopSimplifierTest, LoopSwappingTupleElementsNotSimplified) {
+  const string hlo_string = R"(
+  HloModule SwappingTupleElements
+  SwappingTupleElements.body {
+    loop_var = (s32[], s32[]) parameter(0)
+    get-tuple-element = s32[] get-tuple-element((s32[], s32[]) loop_var),index=1
+    get-tuple-element.1 = s32[] get-tuple-element((s32[], s32[]) loop_var),
+      index=0
+    ROOT tuple = (s32[], s32[]) tuple(s32[] get-tuple-element,
+      s32[] get-tuple-element.1)
   }
+  SwappingTupleElements.always_true {
+   param = (s32[], s32[]) parameter(0)
+   ROOT constant = pred[] constant(true)
+  }
+  ENTRY SwappingTupleElements {
+   x = s32[] parameter(0)
+   y = s32[] parameter(1)
+   tuple.1 = (s32[], s32[]) tuple(s32[] x, s32[] y)
+   ROOT while = (s32[], s32[]) while((s32[], s32[]) tuple.1),
+     condition=SwappingTupleElements.always_true,
+     body=SwappingTupleElements.body
+  }
+  )";
 
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-
-  module().AddEntryComputation(builder.Build());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
 // Construct a loop where we assign a constant to tuple element 0 in each
 // iteration.  We can't eliminate tuple element 0, even though we never use its
 // value.
-TEST_F(WhileLoopSimplifierTest, UnusedButModifiedTupleElement) {
-  HloComputation::Builder builder(TestName());
-  auto loop_init = builder.AddInstruction(
-      HloInstruction::CreateTuple({builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)))}));
-
-  HloComputation* condition =
-      MakeAlwaysTrueComputation(loop_init->shape(), &module());
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    body_builder.AddInstruction(HloInstruction::CreateTuple({
-        body_builder.AddInstruction(
-            HloInstruction::CreateConstant(Literal::CreateR0<int32>(1))),
-    }));
-    body = module().AddEmbeddedComputation(body_builder.Build());
+TEST_F(WhileLoopSimplifierTest,
+       LoopWithUnusedButModifiedTupleElementNotSimplified) {
+  const string hlo_string = R"(
+  HloModule UnusedButModifiedTupleElement
+  UnusedButModifiedTupleElement.body {
+    loop_var = (s32[]) parameter(0)
+    constant.1 = s32[] constant(1)
+    ROOT tuple = (s32[]) tuple(s32[] constant.1)
   }
+  UnusedButModifiedTupleElement.always_true {
+    param = (s32[]) parameter(0)
+   ROOT  constant = pred[] constant(true)
+  }
+  ENTRY  UnusedButModifiedTupleElement {
+    constant.2 = s32[] constant(0)
+    tuple.1 = (s32[]) tuple(s32[]  constant.2)
+    ROOT while = (s32[]) while((s32[]) tuple.1),
+      condition=UnusedButModifiedTupleElement.always_true,
+      body=UnusedButModifiedTupleElement.body
+  }
+  )";
 
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-
-  module().AddEntryComputation(builder.Build());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
 // Nothing to simplify in a while loop whose tuple has 0 elements.
-TEST_F(WhileLoopSimplifierTest, EmptyTuple) {
-  HloComputation::Builder builder(TestName());
-  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({}));
-
-  HloComputation* condition =
-      MakeAlwaysTrueComputation(loop_init->shape(), &module());
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    body_builder.AddInstruction(HloInstruction::CreateTuple({}));
-    body = module().AddEmbeddedComputation(body_builder.Build());
+TEST_F(WhileLoopSimplifierTest, LoopWithEmptyTupleNotSimplified) {
+  const string hlo_string = R"(
+  HloModule EmptyTuple
+  EmptyTuple.body {
+    loop_var = () parameter(0)
+    ROOT  tuple = () tuple()
+  }
+  EmptyTuple.always_true {
+   param = () parameter(0)
+   ROOT constant = pred[] constant(true)
+  }
+  ENTRY EmptyTuple {
+   tuple.1 = () tuple()
+   ROOT while = () while(() tuple.1), condition=EmptyTuple.always_true,
+     body=EmptyTuple.body
   }
+  )";
 
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-  module().AddEntryComputation(builder.Build());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
 // While loop where one tuple element is used twice in the body, and thus can't
 // be simplified away.
-TEST_F(WhileLoopSimplifierTest, ElemUsedTwice) {
-  HloComputation::Builder builder(TestName());
-  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(1))),
-  }));
-
-  HloComputation* condition =
-      MakeAlwaysTrueComputation(loop_init->shape(), &module());
-
-  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    auto* param = body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "param0"));
-    auto* gte0 = body_builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_s32, param, /*index=*/0));
-    // get0 is used twice in the loop body's tuple.
-    body_builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte0}));
-    body = module().AddEmbeddedComputation(body_builder.Build());
+TEST_F(WhileLoopSimplifierTest, LoopWithElemUsedTwiceNotSimplified) {
+  const string hlo_string = R"(
+  HloModule ElemUsedTwice
+  ElemUsedTwice.body {
+    param0 = (s32[], s32[]) parameter(0)
+    get-tuple-element = s32[] get-tuple-element((s32[], s32[]) param0), index=0
+    ROOT tuple = (s32[], s32[]) tuple(s32[] get-tuple-element,
+      s32[] get-tuple-element)
+  }
+  ElemUsedTwice.always_true {
+    param = (s32[], s32[]) parameter(0)
+    ROOT constant = pred[] constant(true)
   }
+  ENTRY ElemUsedTwice {
+   x = s32[] parameter(0)
+   y = s32[] parameter(1)
+   tuple.1 = (s32[], s32[]) tuple(s32[] x, s32[] y)
+   ROOT while = (s32[], s32[]) while((s32[], s32[]) tuple.1),
+     condition=ElemUsedTwice.always_true, body=ElemUsedTwice.body
+  }
+  )";
 
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-  module().AddEntryComputation(builder.Build());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
 // This while loop has three tuple elements.  Element 0 is unused and should be
 // removed. Element 1 is used by the loop body, and element 2 is used by the
 // loop condition; these two should stay.
-TEST_F(WhileLoopSimplifierTest, RemoveUnusedOperand) {
-  HloComputation::Builder builder(TestName());
-  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
-  }));
-  auto loop_shape = loop_init->shape();
-  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-
-  HloComputation* condition;
-  {
-    HloComputation::Builder cond_builder(TestName() + ".loop_condition");
-    auto param = cond_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_shape, "param0"));
-    cond_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq,
-        cond_builder.AddInstruction(
-            HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
-        cond_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            scalar_s32, param, /*index=*/2))));
-    condition = module().AddEmbeddedComputation(cond_builder.Build());
+TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
+  const string hlo_string = R"(
+  HloModule RemoveUnusedOperands
+  RemoveUnusedOperands.body {
+    loop_var = (s32[], s32[], s32[]) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element((s32[], s32[],
+      s32[]) loop_var), index=0
+    get-tuple-element.2 = s32[] get-tuple-element((s32[], s32[],
+      s32[]) loop_var), index=1
+    constant.1 = s32[] constant(1)
+    add = s32[] add(s32[] get-tuple-element.2, s32[] constant.1)
+    get-tuple-element.3 = s32[] get-tuple-element((s32[], s32[], s32[])
+      loop_var), index=2
+    ROOT tuple = (s32[], s32[], s32[]) tuple(s32[] get-tuple-element.1,
+      s32[] add, s32[] get-tuple-element.3)
   }
-
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    auto* param = body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_shape, "loop_var"));
-
-    auto* tuple0 = body_builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_s32, param, /*index=*/0));
-    auto* tuple1 = body_builder.AddInstruction(HloInstruction::CreateBinary(
-        scalar_s32, HloOpcode::kAdd,
-        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            scalar_s32, param, /*index=*/1)),
-        body_builder.AddInstruction(
-            HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)))));
-    auto* tuple2 = body_builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_s32, param, /*index=*/2));
-    body_builder.AddInstruction(
-        HloInstruction::CreateTuple({tuple0, tuple1, tuple2}));
-
-    body = module().AddEmbeddedComputation(body_builder.Build());
+  RemoveUnusedOperands.loop_condition {
+    constant.2 = s32[] constant(0)
+    param0 = (s32[], s32[], s32[]) parameter(0)
+    get-tuple-element = s32[] get-tuple-element((s32[], s32[], s32[]) param0),
+      index=2
+    ROOT equal-to = pred[] equal-to(s32[] constant.2, s32[] get-tuple-element)
   }
+  ENTRY RemoveUnusedOperands {
+    x = s32[] parameter(0)
+    constant.3 = s32[] constant(0)
+    y = s32[] parameter(1)
+    tuple.1 = (s32[], s32[], s32[]) tuple(s32[] x, s32[] constant.3,
+      s32[] y)
+    ROOT while = (s32[], s32[], s32[]) while((s32[], s32[], s32[]) tuple.1),
+      condition=RemoveUnusedOperands.loop_condition,
+      body=RemoveUnusedOperands.body
+  }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  HloModule* the_module = &module();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+
+  // The original while instruction is still left in the module as a dead
+  // instruction, find a while instruction with a different name as the new
+  // while instruction.
+  HloInstruction* new_while_op =
+      *std::find_if(the_module->entry_computation()->instructions().begin(),
+                    the_module->entry_computation()->instructions().end(),
+                    [&](const HloInstruction* instr) {
+                      return (instr->opcode() == HloOpcode::kWhile &&
+                              instr->name() != "while");
+                    });
 
-  auto* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-
-  module().AddEntryComputation(builder.Build());
-  EXPECT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
-
-  // We leave most of the checking to HloVerifiedTestBase, which runs the
-  // verifier on module() at the end of this test.
-  HloInstruction* new_while_op = *std::find_if(
-      module().entry_computation()->instructions().begin(),
-      module().entry_computation()->instructions().end(),
-      [&](const HloInstruction* instr) {
-        return instr != while_op && instr->opcode() == HloOpcode::kWhile;
-      });
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   EXPECT_TRUE(
       ShapeUtil::Equal(new_while_op->shape(),
                        ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32})))
@@ -418,31 +421,91 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedOperand) {
                      op::GetTupleElement(op::Parameter(0), /*tuple_index=*/1)));
 }
 
-TEST_F(WhileLoopSimplifierTest, BodyHasNonTupleRoot) {
-  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
-
-  HloComputation* while_body = [&]() {
-    HloComputation::Builder builder(TestName() + ".passthrough");
-    HloInstruction* param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, while_shape, "param"));
-    HloComputation* result = module().AddEmbeddedComputation(builder.Build());
-
-    result->AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
-    return result;
-  }();
-
-  HloComputation::Builder builder(TestName());
-  auto* init_value = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, while_shape, "init_value"));
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  module().AddEntryComputation(builder.Build());
-  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopSimplifier{}.Run(&module()));
-  EXPECT_FALSE(simplified_loop);
+TEST_F(WhileLoopSimplifierTest, LoopWithNonTupleBodyShapeNotSimplified) {
+  const string hlo_string = R"(
+  HloModule BodyHasNonTupleRoot
+  BodyHasNonTupleRoot.passthrough {
+    ROOT param = (s32[], s32[]) parameter(0)
+  }
+  BodyHasNonTupleRoot.always_true {
+    param.1 = (s32[], s32[]) parameter(0)
+    ROOT constant = pred[] constant(true)
+  }
+  ENTRY BodyHasNonTupleRoot {
+    init_value = (s32[], s32[]) parameter(0)
+    ROOT while = (s32[], s32[]) while((s32[], s32[]) init_value),
+      condition=BodyHasNonTupleRoot.always_true,
+      body=BodyHasNonTupleRoot.passthrough
+  }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(WhileLoopSimplifierTest,
+       LoopWithNonTupleBodyRootInstructionNotSimplified) {
+  const string hlo_string = R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT custom-call = (s32[], s32[3]{0}) custom-call(add, multiply),
+      custom_call_target="x"
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant(44)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(42)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4)
+    ROOT while = (s32[], s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+  }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
+  const string hlo_string = R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}, s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    get-tuple-element.3 = s32[3]{0} get-tuple-element(loop_var.1), index=2
+    add.2 = s32[3]{0} add(get-tuple-element.2, get-tuple-element.3)
+    ROOT tuple = (s32[], s32[3]{0}) tuple(add, add.2, get-tuple-element.3)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}, s32[3]{0}) parameter(0)
+    get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant(47)
+    ROOT less-than = pred[] less-than(get-tuple-element.4, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(42)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4, constant.4)
+    ROOT while = (s32[], s32[3]{0}, s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+  }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index e20b25e4a08a946f6b58575a4d4e557744f8035c..bd0794184328b7926543c4275b3b915f51e7b812 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -15,18 +15,21 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
+using tensorflow::strings::StrCat;
+
 static StatusOr<HloComputation*> WidenWhileCondition(
     HloComputation* narrow_condition, const Shape& wide_shape) {
   const Shape& narrow_shape =
       narrow_condition->parameter_instruction(0)->shape();
 
   HloComputation* wide_while_cond = [&]() {
-    HloComputation::Builder builder(
-        tensorflow::strings::StrCat("wide.", narrow_condition->name()));
+    HloComputation::Builder builder(StrCat("wide.", narrow_condition->name()));
     builder.AddInstruction(
         HloInstruction::CreateParameter(0, wide_shape, "wide_param"));
 
@@ -57,8 +60,7 @@ WidenWhileBody(HloComputation* narrow_body, const Shape& wide_shape) {
   const Shape& narrow_shape = narrow_body->parameter_instruction(0)->shape();
 
   HloComputation* wide_while_body = [&]() {
-    HloComputation::Builder builder(
-        tensorflow::strings::StrCat("wide.", narrow_body->name()));
+    HloComputation::Builder builder(StrCat("wide.", narrow_body->name()));
     builder.AddInstruction(
         HloInstruction::CreateParameter(0, wide_shape, "wide_param"));
     return narrow_body->parent()->AddEmbeddedComputation(builder.Build());
@@ -137,4 +139,109 @@ WhileUtil::MakeInstructionsLiveIn(
 
   return std::move(result);
 }
+
+static StatusOr<std::unique_ptr<HloComputation>>
+MakeCountedLoopConditionComputation(const Shape& loop_state_shape,
+                                    int32 trip_count) {
+  Shape scalar_pred = ShapeUtil::MakeShape(PRED, {});
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> cond_computation,
+                      CreateComputationWithSignature(
+                          {&loop_state_shape}, scalar_pred, "while_cond"));
+
+  HloInstruction* trip_count_constant = cond_computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(trip_count)));
+
+  HloInstruction* param = cond_computation->parameter_instruction(0);
+  TF_ASSIGN_OR_RETURN(HloInstruction * indvar,
+                      MakeGetTupleElementHlo(param, 0));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * compare,
+      MakeBinaryHlo(HloOpcode::kLt, indvar, trip_count_constant));
+  cond_computation->set_root_instruction(compare);
+  return std::move(cond_computation);
+}
+
+static StatusOr<std::unique_ptr<HloComputation>> MakeCountedLoopBodyComputation(
+    const Shape& loop_state_shape,
+    const std::function<StatusOr<WhileUtil::LoopStateTy>(
+        HloInstruction*, const WhileUtil::LoopStateTy&)>& loop_body_generator) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> body_computation,
+                      CreateComputationWithSignature(
+                          {&loop_state_shape}, loop_state_shape, "while_body"));
+  HloInstruction* one = body_computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
+  HloInstruction* param = body_computation->parameter_instruction(0);
+  TF_ASSIGN_OR_RETURN(HloInstruction * indvar,
+                      MakeGetTupleElementHlo(param, 0));
+  TF_ASSIGN_OR_RETURN(HloInstruction * next_indvar,
+                      MakeBinaryHlo(HloOpcode::kAdd, indvar, one));
+
+  std::vector<HloInstruction*> loop_body_generator_args;
+  for (int64 i = 1, e = loop_state_shape.tuple_shapes_size(); i < e; i++) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * tuple_element,
+                        MakeGetTupleElementHlo(param, i));
+    loop_body_generator_args.push_back(tuple_element);
+  }
+  TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> next_state,
+                      loop_body_generator(indvar, loop_body_generator_args));
+  next_state.insert(next_state.begin(), next_indvar);
+  HloInstruction* next_state_tuple =
+      body_computation->AddInstruction(HloInstruction::CreateTuple(next_state));
+  body_computation->set_root_instruction(next_state_tuple);
+
+  return std::move(body_computation);
+}
+
+static StatusOr<HloInstruction*> MakeInitTupleFromInitValues(
+    HloComputation* computation, const WhileUtil::LoopStateTy& init_values) {
+  std::vector<HloInstruction*> init_values_with_indvar;
+  init_values_with_indvar.reserve(init_values.size() + 1);
+  HloInstruction* zero = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
+  init_values_with_indvar.push_back(zero);
+  c_copy(init_values, std::back_inserter(init_values_with_indvar));
+  return computation->AddInstruction(
+      HloInstruction::CreateTuple(init_values_with_indvar));
+}
+
+static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
+  std::vector<Shape> loop_state_shape_components;
+  loop_state_shape_components.reserve(init_values.size() + 1);
+  loop_state_shape_components.push_back(ShapeUtil::MakeShape(S32, {}));
+  c_transform(init_values, std::back_inserter(loop_state_shape_components),
+              [](HloInstruction* instr) { return instr->shape(); });
+  return ShapeUtil::MakeTupleShape(loop_state_shape_components);
+}
+
+/*static*/ StatusOr<WhileUtil::LoopStateTy> WhileUtil::MakeCountedLoop(
+    HloComputation* computation, int32 trip_count,
+    const WhileUtil::LoopStateTy& init_values,
+    const WhileUtil::LoopBodyGeneratorTy& loop_body_generator) {
+  CHECK_GE(trip_count, 0);
+
+  Shape loop_state_shape = MakeLoopStateShape(init_values);
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloComputation> cond,
+      MakeCountedLoopConditionComputation(loop_state_shape, trip_count));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloComputation> body,
+      MakeCountedLoopBodyComputation(loop_state_shape, loop_body_generator));
+  TF_ASSIGN_OR_RETURN(HloInstruction * init_tuple,
+                      MakeInitTupleFromInitValues(computation, init_values));
+  HloModule* module = computation->parent();
+  HloInstruction* while_instr =
+      computation->AddInstruction(HloInstruction::CreateWhile(
+          loop_state_shape, module->AddEmbeddedComputation(std::move(cond)),
+          module->AddEmbeddedComputation(std::move(body)), init_tuple));
+
+  std::vector<HloInstruction*> result;
+  for (int64 i = 0, e = init_values.size(); i < e; i++) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * user_state,
+                        MakeGetTupleElementHlo(while_instr, i + 1));
+    result.push_back(user_state);
+  }
+  return result;
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h
index 3600b5a80d26e37fdb7d5173c3b8743734306390..1688d4674269c36c5b356f262dbd5d958572e101 100644
--- a/tensorflow/compiler/xla/service/while_util.h
+++ b/tensorflow/compiler/xla/service/while_util.h
@@ -52,6 +52,28 @@ class WhileUtil {
   static StatusOr<MakeInstructionsLiveInResult> MakeInstructionsLiveIn(
       HloInstruction* while_instr,
       tensorflow::gtl::ArraySlice<HloInstruction*> instructions);
+
+  using LoopStateTy = std::vector<HloInstruction*>;
+  using LoopBodyGeneratorTy = std::function<StatusOr<LoopStateTy>(
+      HloInstruction* /*induction_var*/,
+      const LoopStateTy& /*current_values*/)>;
+
+  // Creates a while loop in `computation` that runs for `trip_count`
+  // iterations.  The structure of the while loop is as follows, in pseudocode:
+  //
+  //  loop_state while_loop() {
+  //    indvar = 0;
+  //    loop_state = init_values
+  //    while (indvar < trip_count) {
+  //      loop_state = loop_body_generator(loop_state)
+  //      indvar++;
+  //    }
+  //    return loop_state;
+  //  }
+  static StatusOr<LoopStateTy> MakeCountedLoop(
+      HloComputation* computation, int32 trip_count,
+      const LoopStateTy& init_values,
+      const LoopBodyGeneratorTy& loop_body_generator);
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
index 063e312df66ce9cba0fa9f49c2fc6026ba6b74aa..8763e588c484011ba2ccbc7cad8f29817347a605 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
-// HLO pass that replaces zero sized Hlos with an zero sized constant literal.
+// HLO pass that replaces zero sized Hlos with a zero sized constant literal.
 namespace xla {
 class ZeroSizedHloElimination : public HloPassInterface {
  public:
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
index 4f8cdc1e0e73cdaa8675fc945ba3dbe19ce3da7d..a4e67cc9d9b8eea195ec409e8c502c5c2fbe1352 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -46,9 +46,9 @@ class ZeroSizedHloEliminationTest : public HloTestBase {
                 0, ShapeUtil::MakeShape(F32, {3, 0}), "zero sized param"))) {}
 
   StatusOr<bool> RunZeroSizedElimination() {
-    HloModule module("zero_sized_elimination_test_module");
-    module.AddEntryComputation(builder_.Build());
-    return ZeroSizedHloElimination{}.Run(&module);
+    auto module = CreateNewModule("zero_sized_elimination_test_module");
+    module->AddEntryComputation(builder_.Build());
+    return ZeroSizedHloElimination{}.Run(module.get());
   }
 
   HloComputation::Builder builder_;
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 809941d8fe1f63d66bf104e66eea66167a0f509d..5b44c26b7c7b082556d9533cf3b3b1b98e5e4b09 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -54,9 +54,16 @@ class ServiceInterface {
   virtual tensorflow::Status Execute(const ExecuteRequest* arg,
                                      ExecuteResponse* result) = 0;
 
+  virtual tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg,
+                                          ExecuteResponse* result) = 0;
+
   virtual tensorflow::Status ExecuteParallel(
       const ExecuteParallelRequest* arg, ExecuteParallelResponse* result) = 0;
 
+  virtual tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* arg,
+      ExecuteParallelResponse* result) = 0;
+
   virtual tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
                                           ExecuteAsyncResponse* result) = 0;
 
@@ -69,6 +76,10 @@ class ServiceInterface {
   virtual tensorflow::Status GetComputationStats(
       const ComputationStatsRequest* arg, ComputationStatsResponse* result) = 0;
 
+  virtual tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* arg,
+      ComputationStatsResponse* result) = 0;
+
   virtual tensorflow::Status GetComputationShape(
       const GetComputationShapeRequest* arg,
       GetComputationShapeResponse* result) = 0;
@@ -101,6 +112,10 @@ class ServiceInterface {
   virtual tensorflow::Status ComputeConstant(
       const ComputeConstantRequest* arg, ComputeConstantResponse* result) = 0;
 
+  virtual tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) = 0;
+
   // Methods used by Computation.
   virtual tensorflow::Status SnapshotComputation(
       const SnapshotComputationRequest* ag,
diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h
index 4c83750f3e6f3c735db66d8e0b86ae3f43e5ca11..a1dce758cd3ab3f204ce330fca2a7d2bdf57a2be 100644
--- a/tensorflow/compiler/xla/shape_layout.h
+++ b/tensorflow/compiler/xla/shape_layout.h
@@ -48,8 +48,7 @@ class ShapeLayout {
   bool MatchesLayoutInShape(const Shape& shape) const;
 
   // Copies the layout from the given shape into this ShapeLayout. 'other_shape'
-  // must be compatible with the ShapeLayout's shape, and 'other_shape' must
-  // have a layout (LayoutUtil::HasLayout).
+  // must be compatible with the ShapeLayout's shape.
   tensorflow::Status CopyLayoutFromShape(const Shape& other_shape);
 
   // Clears (Layout::Clear) all the Layouts stored in this object.
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 280f02e88675381bd75108bfae0dd22c462ba718..ffaa40c2d673a2365342371ed8dab59565d1d08f 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -53,7 +53,7 @@ struct ShapeTreeNode {
   ShapeTreeNode(const ShapeTreeNode& other)
       : data(other.data), children(other.children.size()) {
     for (size_t i = 0; i < children.size(); ++i) {
-      children[i] = MakeUnique<ShapeTreeNode>(*other.children[i]);
+      children[i] = ::xla::MakeUnique<ShapeTreeNode>(*other.children[i]);
     }
   }
 
@@ -62,7 +62,7 @@ struct ShapeTreeNode {
       data = other.data;
       children.resize(other.children.size());
       for (size_t i = 0; i < children.size(); ++i) {
-        children[i] = MakeUnique<ShapeTreeNode>(*other.children[i]);
+        children[i] = ::xla::MakeUnique<ShapeTreeNode>(*other.children[i]);
       }
     }
     return *this;
@@ -445,7 +445,7 @@ class ShapeTreeIterator : public std::iterator<std::forward_iterator_tag,
     for (auto& node_and_index : stack_) {
       index.push_back(node_and_index.second);
     }
-    current_ = MakeUnique<value_type>(index, node_->data);
+    current_ = ::xla::MakeUnique<value_type>(index, node_->data);
     return *current_;
   }
 
@@ -492,7 +492,7 @@ void ShapeTree<T>::InitChildren(const Shape& shape, Node* node) {
 template <typename T>
 ShapeTree<T>::ShapeTree(Shape shape)
     : root_(),
-      shape_storage_(MakeUnique<Shape>(std::move(shape))),
+      shape_storage_(::xla::MakeUnique<Shape>(std::move(shape))),
       shape_(shape_storage_.get()) {
   // The shape_ field is just used to hold the structure of the shape.
   // It should not be relied upon to store layout information.
@@ -508,7 +508,7 @@ ShapeTree<T>::ShapeTree(const Shape* shape) : root_(), shape_(shape) {
 template <typename T>
 ShapeTree<T>::ShapeTree(Shape shape, const T& init_value)
     : root_(init_value),
-      shape_storage_(MakeUnique<Shape>(std::move(shape))),
+      shape_storage_(::xla::MakeUnique<Shape>(std::move(shape))),
       shape_(shape_storage_.get()) {
   // The shape_ field is just used to hold the structure of the shape.
   // It should not be relied upon to store layout information.
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 604e0173e789348923316174873f58058eaf2815..d58baa3220a73fcb09354bd16465629bc73fd428 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -502,11 +502,11 @@ namespace {
 StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   tensorflow::str_util::RemoveLeadingWhitespace(s);
 
-  if (s->Consume("(")) {  // Tuple.
+  if (tensorflow::str_util::ConsumePrefix(s, "(")) {  // Tuple.
     std::vector<Shape> shapes;
     bool must_end = false;
     while (true) {
-      if (s->Consume(")")) {
+      if (tensorflow::str_util::ConsumePrefix(s, ")")) {
         break;
       } else if (must_end) {
         return InvalidArgument("Expected end of tuple; got: \"%s\"",
@@ -515,7 +515,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
       shapes.emplace_back();
       TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
       tensorflow::str_util::RemoveLeadingWhitespace(s);
-      must_end = !s->Consume(",");
+      must_end = !tensorflow::str_util::ConsumePrefix(s, ",");
     }
     return ShapeUtil::MakeTupleShape(shapes);
   }
@@ -609,6 +609,8 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
 
 /* static */ bool ShapeUtil::SameDimensions(const Shape& lhs,
                                             const Shape& rhs) {
+  CHECK(ShapeUtil::IsArray(lhs));
+  CHECK(ShapeUtil::IsArray(rhs));
   return ContainersEqual(lhs.dimensions(), rhs.dimensions());
 }
 
@@ -617,7 +619,10 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
     return rhs.element_type() == TUPLE &&
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), Compatible);
   }
-  return SameDimensions(lhs, rhs) && SameElementType(lhs, rhs);
+  if (lhs.element_type() == OPAQUE) {
+    return rhs.element_type() == OPAQUE;
+  }
+  return SameElementType(lhs, rhs) && SameDimensions(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
@@ -627,7 +632,10 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            CompatibleIgnoringElementType);
   }
-  return SameDimensions(lhs, rhs);
+  if (lhs.element_type() == OPAQUE) {
+    return rhs.element_type() == OPAQUE;
+  }
+  return ShapeUtil::IsArray(rhs) && SameDimensions(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
@@ -637,6 +645,9 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            CompatibleIgnoringFpPrecision);
   }
+  if (lhs.element_type() == OPAQUE) {
+    return rhs.element_type() == OPAQUE;
+  }
   if (SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return CompatibleIgnoringElementType(lhs, rhs);
   }
@@ -813,6 +824,18 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return new_shape;
 }
 
+/* static */ bool ShapeUtil::IndexIsValid(const Shape& shape,
+                                          ShapeIndexView index) {
+  const Shape* subshape = &shape;
+  for (auto i : index) {
+    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size()) {
+      return false;
+    }
+    subshape = &subshape->tuple_shapes(i);
+  }
+  return true;
+}
+
 /* static */ const Shape& ShapeUtil::GetSubshape(const Shape& shape,
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
@@ -882,10 +905,17 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
            std::is_permutation(minor_to_major.begin(), minor_to_major.end(),
                                dims.begin()));
   }
-  Shape stripped_shape =
-      shape.has_layout() ? MakeShapeWithLayout(shape.element_type(),
-                                               dimension_sizes, minor_to_major)
-                         : MakeShape(shape.element_type(), dimension_sizes);
+  Shape stripped_shape;
+  if (LayoutUtil::IsDenseArray(shape)) {
+    stripped_shape = MakeShapeWithLayout(shape.element_type(), dimension_sizes,
+                                         minor_to_major);
+  } else if (LayoutUtil::IsSparseArray(shape)) {
+    stripped_shape =
+        MakeShapeWithSparseLayout(shape.element_type(), dimension_sizes,
+                                  shape.layout().max_sparse_elements());
+  } else {
+    stripped_shape = MakeShape(shape.element_type(), dimension_sizes);
+  }
 
   VLOG(10) << "Original_shape: " << HumanStringWithLayout(shape);
   VLOG(10) << "Stripped_shape: " << HumanStringWithLayout(stripped_shape);
@@ -1073,9 +1103,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 /* static */ bool ShapeUtil::TransposeIsBitcast(
     const Shape& input_shape, const Shape& output_shape,
     tensorflow::gtl::ArraySlice<int64> dimension_mapping) {
-  // Can't insert bitcasts without layout information.
-  if (!LayoutUtil::HasLayout(input_shape) &&
-      !LayoutUtil::HasLayout(output_shape)) {
+  CHECK(LayoutUtil::HasLayout(input_shape) &&
+        LayoutUtil::HasLayout(output_shape));
+
+  if (!SameElementType(input_shape, output_shape)) {
     return false;
   }
 
@@ -1106,9 +1137,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& input_shape,
                                               const Shape& output_shape) {
-  // Can't convert reshapes into bitcasts without layout information.
-  if (!LayoutUtil::HasLayout(input_shape) ||
-      !LayoutUtil::HasLayout(output_shape)) {
+  CHECK(LayoutUtil::HasLayout(input_shape) &&
+        LayoutUtil::HasLayout(output_shape));
+
+  if (!SameElementType(input_shape, output_shape)) {
     return false;
   }
 
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 19b1aa93bd373ebd5f502d0dca56c9b31ab4fd7f..5fa728e7c2fa5faf6ba347198fdc99e56ca4c324 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -24,11 +24,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -208,6 +212,7 @@ class ShapeUtil {
 
   // Returns whether the LHS and RHS shapes have the same dimensions; note: does
   // not check element type.
+  // Precondition: IsArray(lhs) && IsArray(rhs)
   static bool SameDimensions(const Shape& lhs, const Shape& rhs);
 
   // Returns whether the lhs and rhs shapes have the same element type.
@@ -315,11 +320,25 @@ class ShapeUtil {
   // Returns an empty tuple shape. Can be used to indicate side-effects.
   static Shape MakeNil() { return MakeTupleShape({}); }
 
+  // Checks whether the shape is initialized.
+  static bool IsInitialized(const Shape& shape) {
+    return shape.element_type() != PRIMITIVE_TYPE_INVALID;
+  }
+
   // Constructs a new shape with the given element type and sequence of
   // dimensions.
   static Shape MakeShape(PrimitiveType element_type,
                          tensorflow::gtl::ArraySlice<int64> dimensions);
 
+  // Creates a Shape with element type corresponding to T and the given
+  // dimensions
+  template <typename T>
+  static Shape MakeShapeWithType(
+      tensorflow::gtl::ArraySlice<int64> dimensions) {
+    return ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<T>(),
+                                dimensions);
+  }
+
   // Constructs a new shape with the given minor_to_major order in its Layout.
   // Returns a value shape such that shape.has_layout().
   static Shape MakeShapeWithLayout(
@@ -430,6 +449,9 @@ class ShapeUtil {
   static bool ShapeIs(const Shape& shape, PrimitiveType element_type,
                       std::initializer_list<int64> dimensions);
 
+  // Returns true if the given shape has a subshape at the given index.
+  static bool IndexIsValid(const Shape& shape, ShapeIndexView index);
+
   // GetSubshape and GetMutableSubshape return a particular nested Shape within
   // the given Shape argument.
   static const Shape& GetSubshape(const Shape& shape, ShapeIndexView index);
@@ -522,12 +544,16 @@ class ShapeUtil {
   // Returns whether a transpose from input_shape to output_shape with dimension
   // mapping "dimension_mapping" produces a result which is bit-wise identical
   // to its input and thus may be replaced with a bitcast.
+  //
+  // Precondition: Both input_shape and output_shape have explicit layouts.
   static bool TransposeIsBitcast(
       const Shape& input_shape, const Shape& output_shape,
       tensorflow::gtl::ArraySlice<int64> dimension_mapping);
 
   // Returns whether a reshape from "input_shape" to "output_shape" is a
   // bitcast.
+  //
+  // Precondition: Both input_shape and output_shape have explicit layouts.
   static bool ReshapeIsBitcast(const Shape& input_shape,
                                const Shape& output_shape);
 
@@ -560,16 +586,84 @@ class ShapeUtil {
   // The visitor_function visitor function should return true if it wants to
   // continue, or false otherwise.
   //
-  // visitor_function must be a callable of type bool(const std::vector<int64>&)
-  // or compatible.
+  // visitor_function must be a callable of type
+  // StatusOr<bool>(ArraySlice<int64>) or compatible.
+  template <typename FnType>
+  static Status ForEachIndexWithStatus(const Shape& shape,
+                                       tensorflow::gtl::ArraySlice<int64> base,
+                                       tensorflow::gtl::ArraySlice<int64> count,
+                                       tensorflow::gtl::ArraySlice<int64> incr,
+                                       const FnType& visitor_function) {
+    return ForEachIndexInternal(shape, base, count, incr, visitor_function);
+  }
+
+  // Simple ergonomic wrapper around ShapeUtil::ForEachIndexWithStatus.
+  struct IndexIterationSpace {
+    std::vector<int64> index_base;
+    std::vector<int64> index_count;
+    std::vector<int64> index_incr;
+  };
+
+  template <typename FnTy>
+  static Status ForEachIndexWithStatus(
+      const Shape& shape, const IndexIterationSpace& iteration_space,
+      FnTy&& function) {
+    return ShapeUtil::ForEachIndexWithStatus(
+        shape, iteration_space.index_base, iteration_space.index_count,
+        iteration_space.index_incr, std::forward<FnTy>(function));
+  }
+
   template <typename FnType>
   static void ForEachIndex(const Shape& shape,
                            tensorflow::gtl::ArraySlice<int64> base,
                            tensorflow::gtl::ArraySlice<int64> count,
                            tensorflow::gtl::ArraySlice<int64> incr,
                            const FnType& visitor_function) {
+    ForEachIndexWithStatus(shape, base, count, incr,
+                           [&](tensorflow::gtl::ArraySlice<int64> indices) {
+                             return StatusOr<bool>(visitor_function(indices));
+                           })
+        .IgnoreError();
+  }
+
+  // A parallel version of ForEachIndex(WithStatus). This can only be used if
+  // the visitor_function is thread-safe and the order of iteration does not
+  // matter.
+  //
+  // visitor_function must be a callable of type
+  // void(ArraySlice<int64>) or compatible.
+  template <typename FnType>
+  static void ForEachIndexParallel(const Shape& shape,
+                                   tensorflow::gtl::ArraySlice<int64> base,
+                                   tensorflow::gtl::ArraySlice<int64> count,
+                                   tensorflow::gtl::ArraySlice<int64> incr,
+                                   const FnType& visitor_function) {
+    // The parallel version of ForEachIndexInternal can never fail.
+    CHECK(ForEachIndexInternal(
+              shape, base, count, incr,
+              [&visitor_function](tensorflow::gtl::ArraySlice<int64> indexes)
+                  -> StatusOr<bool> {
+                visitor_function(indexes);
+                return true;
+              },
+              /*parallel=*/true)
+              .ok());
+  }
+
+ private:
+  // Validates all of the non-layout properties of the shape -- this is a helper
+  // used by both the layout-optional and layout-required public method.
+  static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
+
+  template <typename FnType>
+  static Status ForEachIndexInternal(const Shape& shape,
+                                     tensorflow::gtl::ArraySlice<int64> base,
+                                     tensorflow::gtl::ArraySlice<int64> count,
+                                     tensorflow::gtl::ArraySlice<int64> incr,
+                                     const FnType& visitor_function,
+                                     bool parallel = false) {
     if (ShapeUtil::HasZeroElements(shape)) {
-      return;
+      return Status::OK();
     }
     CHECK_EQ(Rank(shape), base.size());
     CHECK_EQ(incr.size(), base.size());
@@ -579,7 +673,22 @@ class ShapeUtil {
     // once with the proper empty indexes.
     int64 n = -1;
     std::vector<int64> indexes(base.begin(), base.end());
-    while (n < rank && visitor_function(indexes)) {
+    const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
+    tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
+    if (parallel) {
+      pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
+    }
+
+    while (n < rank) {
+      if (pool != tensorflow::gtl::nullopt) {
+        pool->Schedule(
+            [indexes, &visitor_function] { visitor_function(indexes); });
+      } else {
+        TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
+        if (!should_continue) {
+          break;
+        }
+      }
       // Increments dimensions in minor to major order.
       for (n = 0; n < rank; ++n) {
         int64 dim = LayoutUtil::Minor(shape.layout(), n);
@@ -590,12 +699,9 @@ class ShapeUtil {
         indexes[dim] = base[dim];
       }
     }
-  }
 
- private:
-  // Validates all of the non-layout properties of the shape -- this is a helper
-  // used by both the layout-optional and layout-required public method.
-  static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
+    return Status::OK();
+  }
 
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 4db97d45b20b86dc60531845c6e28a223203ff7f..f7675e97da7b061bde063e5093256c2288f99c98 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -238,6 +238,18 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentDimensions) {
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
 }
 
+TEST(ShapeUtilTest, IncompatibleScalarVsTuple) {
+  Shape shape1 = ShapeUtil::MakeShape(F32, {});
+  Shape shape2 = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(U32, {})});
+  EXPECT_FALSE(ShapeUtil::Compatible(shape1, shape2));
+  EXPECT_FALSE(ShapeUtil::Compatible(shape2, shape1));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape1, shape2));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape2, shape1));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape1, shape2));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1));
+}
+
 TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
   shape1.mutable_layout()->add_padded_dimensions(10);
@@ -573,10 +585,11 @@ TEST(ShapeUtilTest, ForEachIndex) {
     Shape shape = ShapeUtil::MakeShape(F32, data.dimensions);
     // Increments at every invocation.
     int invocations = 0;
-    auto increment_func = [&invocations](const std::vector<int64>& indexes) {
-      invocations++;
-      return true;
-    };
+    auto increment_func =
+        [&invocations](tensorflow::gtl::ArraySlice<int64> indexes) {
+          invocations++;
+          return true;
+        };
 
     std::vector<int64> zero_base(data.dimensions.size(), 0);
     std::vector<int64> step(data.dimensions.size(), 1);
@@ -588,6 +601,47 @@ TEST(ShapeUtilTest, ForEachIndex) {
   }
 }
 
+TEST(ShapeUtilTest, ForEachIndexWithStatus) {
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 10});
+  // Increments at every invocation.
+  int invocations = 0;
+  auto increment_func =
+      [&invocations](
+          tensorflow::gtl::ArraySlice<int64> indexes) -> StatusOr<bool> {
+    if (++invocations == 5) {
+      return Unimplemented("Cannot increment beyond 5.");
+    }
+    return true;
+  };
+
+  Status error_status = ShapeUtil::ForEachIndexWithStatus(
+      shape, /*base=*/{0, 0}, /*count=*/{10, 10}, /*incr=*/{0, 1},
+      increment_func);
+
+  EXPECT_FALSE(error_status.ok());
+  EXPECT_THAT(error_status.error_message(),
+              ::testing::HasSubstr("Cannot increment beyond 5."));
+  EXPECT_EQ(invocations, 5);
+}
+
+TEST(ShapeUtilTest, ForEachIndexParallel) {
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 10});
+  int64 output[10][10];
+  int init = 5;
+  auto set_func = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+    output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
+  };
+
+  ShapeUtil::ForEachIndexParallel(shape, /*base=*/{0, 0}, /*count=*/{10, 10},
+                                  /*incr=*/{1, 1}, set_func);
+
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(output[i][j], init + i + j);
+    }
+  }
+}
+
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) {
   // All output dimensions should be unmodified. One of the input dimensions is
   // modified because the input rank is larger by one.
@@ -659,6 +713,16 @@ TEST(ShapeUtilTest, ReshapeIsBitcast_3x2x2_6x2_Dim1IsMostMinor) {
       ShapeUtil::MakeShapeWithLayout(F32, {6, 2}, {0, 1})));
 }
 
+TEST(ShapeUtilTest, StripDegenerateDimensions) {
+  EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::StripDegenerateDimensions(
+                                   ShapeUtil::MakeShape(F32, {3, 1, 2})),
+                               ShapeUtil::MakeShape(F32, {3, 2})));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      ShapeUtil::StripDegenerateDimensions(
+          ShapeUtil::MakeShapeWithSparseLayout(F32, {3, 1, 2}, 10)),
+      ShapeUtil::MakeShapeWithSparseLayout(F32, {3, 2}, 10)));
+}
+
 TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) {
   EXPECT_FALSE(ShapeUtil::ReshapeIsBitcast(
       ShapeUtil::MakeShapeWithLayout(F32, {3, 2, 2}, {0, 1, 2}),
diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h
index 641b5e9a6accc0a2e7737f79bcd485d317e4e521..cccbce5fc83af87396f4d51eb9e785cea93aba0b 100644
--- a/tensorflow/compiler/xla/statusor.h
+++ b/tensorflow/compiler/xla/statusor.h
@@ -113,17 +113,19 @@ class StatusOr : private internal_statusor::StatusOrData<T>,
   StatusOr& operator=(StatusOr&&) = default;
 
   // Conversion copy/move constructor, T must be convertible from U.
-  // TODO(b/62186717): These should not participate in overload resolution if U
-  // is not convertible to T.
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr(const StatusOr<U>& other);
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr(StatusOr<U>&& other);
 
   // Conversion copy/move assignment operator, T must be convertible from U.
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr& operator=(const StatusOr<U>& other);
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr& operator=(StatusOr<U>&& other);
 
   // Constructs a new StatusOr with the given value. After calling this
@@ -233,12 +235,14 @@ StatusOr<T>& StatusOr<T>::operator=(Status&& status) {
 }
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>::StatusOr(const StatusOr<U>& other)
     : Base(static_cast<const typename StatusOr<U>::Base&>(other)) {}
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
   if (other.ok())
     this->Assign(other.ValueOrDie());
@@ -248,12 +252,14 @@ inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
 }
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>::StatusOr(StatusOr<U>&& other)
     : Base(static_cast<typename StatusOr<U>::Base&&>(other)) {}
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>& StatusOr<T>::operator=(StatusOr<U>&& other) {
   if (other.ok()) {
     this->Assign(std::move(other).ValueOrDie());
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 8339d08ef4d7455f9739b80074ab0405a404e8e8..840292010d50fde3d36983de9f6f4f0e4cfc7ed6 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -44,6 +44,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
+    alwayslink = True,
 )
 
 cc_library(
@@ -138,6 +139,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -151,6 +153,8 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
@@ -188,6 +192,10 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -252,8 +260,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
@@ -271,6 +279,9 @@ cc_library(
 xla_test(
     name = "bad_rng_shape_validation_test",
     srcs = ["bad_rng_shape_validation_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
@@ -280,6 +291,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -290,6 +303,9 @@ xla_test(
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -300,6 +316,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -309,6 +327,9 @@ xla_test(
 xla_test(
     name = "query_inferred_shape_test",
     srcs = ["query_inferred_shape_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -316,6 +337,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -325,6 +348,9 @@ xla_test(
 xla_test(
     name = "while_test",
     srcs = ["while_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -332,10 +358,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -354,6 +380,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -366,9 +394,14 @@ xla_test(
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -394,6 +427,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -419,6 +454,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -430,11 +467,16 @@ xla_test(
 xla_test(
     name = "pred_test",
     srcs = ["pred_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -444,11 +486,16 @@ xla_test(
 xla_test(
     name = "select_test",
     srcs = ["select_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -460,11 +507,13 @@ xla_test(
 xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -476,12 +525,15 @@ xla_test(
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -503,6 +555,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -522,6 +576,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -532,6 +588,9 @@ xla_test(
 xla_test(
     name = "deconstruct_tuple_test",
     srcs = ["deconstruct_tuple_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -543,6 +602,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -568,6 +629,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -591,6 +654,7 @@ xla_test(
     deps = [
         ":client_library_test_base",
         ":literal_test_util",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
@@ -608,9 +672,9 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -622,8 +686,10 @@ xla_test(
 xla_test(
     name = "dot_operation_test",
     srcs = ["dot_operation_test.cc"],
+    shard_count = 20,
     tags = [
         "enable_for_xla_interpreter",
+        "optonly",
     ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
@@ -632,6 +698,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -642,32 +710,21 @@ xla_test(
     ],
 )
 
-# Tests the dot operation in some cases that can be performed via a
-# runtime call on some backends - e.g. a runtime call to Eigen.
 xla_test(
-    name = "dot_operation_runtime_test",
-    srcs = ["dot_operation_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
+    name = "gather_operation_test",
+    srcs = ["gather_operation_test.cc"],
     deps = [
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:array3d",
-        "//tensorflow/compiler/xla:reference_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:test_utils",
+        ":client_library_test_base",
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
-# Repeat dot_operation_runtime_test with single-threded eigen.
+# Repeat dot_operation_runtime_test with single-threaded eigen.
 xla_test(
     name = "dot_operation_single_threaded_runtime_test",
     srcs = ["dot_operation_test.cc"],
@@ -675,10 +732,9 @@ xla_test(
         "cpu": [
             "--xla_cpu_multi_thread_eigen=false",
         ],
-        "cpu_parallel": [
-            "--xla_cpu_multi_thread_eigen=false",
-        ],
     },
+    shard_count = 20,
+    tags = ["optonly"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -686,6 +742,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -699,6 +757,9 @@ xla_test(
 xla_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -706,6 +767,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -717,6 +780,9 @@ xla_test(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -725,6 +791,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -747,10 +815,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -766,7 +834,6 @@ xla_test(
     backend_tags = {
         # TODO(b/31436974): Fix msan failure. Failed on 2016-09-12.
         "cpu": ["nomsan"],
-        "cpu_parallel": ["nomsan"],
     },
     shard_count = 30,
     deps = [
@@ -778,6 +845,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -801,6 +870,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -826,11 +897,11 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -863,6 +934,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -877,8 +950,7 @@ xla_test(
     name = "half_test",
     srcs = ["half_test.cc"],
     backends = [
-        # TODO(b/72509305): Flaky (fails with SEGV) as of 2018-01-25
-        # "cpu",
+        "cpu",
         "gpu",
     ],
     deps = [
@@ -889,6 +961,8 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -902,11 +976,14 @@ xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
     shard_count = 40,
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -918,11 +995,16 @@ xla_test(
 xla_test(
     name = "multidimensional_slice_test",
     srcs = ["multidimensional_slice_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -934,14 +1016,16 @@ xla_test(
     name = "dynamic_ops_test",
     timeout = "moderate",
     srcs = ["dynamic_ops_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
@@ -960,6 +1044,9 @@ xla_test(
 xla_test(
     name = "tuple_test",
     srcs = ["tuple_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -970,7 +1057,10 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -980,6 +1070,9 @@ xla_test(
 xla_test(
     name = "vector_ops_reduce_test",
     srcs = ["vector_ops_reduce_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -987,6 +1080,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -998,6 +1093,10 @@ xla_test(
     name = "reduce_test",
     srcs = ["reduce_test.cc"],
     shard_count = 40,
+    tags = [
+        "enable_for_xla_interpreter",
+        "optonly",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1013,6 +1112,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1035,10 +1136,11 @@ xla_test_library(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1075,11 +1177,11 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1091,12 +1193,16 @@ xla_test(
 xla_test(
     name = "copy_test",
     srcs = ["copy_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1109,6 +1215,9 @@ xla_test(
 xla_test(
     name = "reduce_hlo_test",
     srcs = ["reduce_hlo_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1122,6 +1231,9 @@ xla_test(
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -1129,6 +1241,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1159,12 +1273,17 @@ xla_test(
 xla_test(
     name = "binop_scaling_test",
     srcs = ["binop_scaling_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1175,6 +1294,9 @@ xla_test(
 xla_test(
     name = "broadcast_simple_test",
     srcs = ["broadcast_simple_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1183,6 +1305,8 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1192,15 +1316,18 @@ xla_test(
 xla_test(
     name = "pad_test",
     srcs = ["pad_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1212,9 +1339,14 @@ xla_test(
 xla_test(
     name = "fmax_test",
     srcs = ["fmax_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1225,9 +1357,14 @@ xla_test(
 xla_test(
     name = "log_test",
     srcs = ["log_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1238,6 +1375,9 @@ xla_test(
 xla_test(
     name = "matrix_ops_simple_test",
     srcs = ["matrix_ops_simple_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -1250,8 +1390,11 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1269,6 +1412,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1280,6 +1425,9 @@ xla_test(
     name = "reshape_test",
     srcs = ["reshape_test.cc"],
     shard_count = 30,
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1291,10 +1439,10 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1306,11 +1454,14 @@ xla_test(
 xla_test(
     name = "reverse_test",
     srcs = ["reverse_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1322,6 +1473,9 @@ xla_test(
 xla_test(
     name = "vector_ops_simple_test",
     srcs = ["vector_ops_simple_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:shape_util",
@@ -1333,6 +1487,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1345,6 +1501,9 @@ xla_test(
 xla_test(
     name = "concat_test",
     srcs = ["concat_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1352,9 +1511,9 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1365,11 +1524,17 @@ xla_test(
 xla_test(
     name = "convert_test",
     srcs = ["convert_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1382,11 +1547,14 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1409,6 +1577,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1421,9 +1591,14 @@ xla_test(
 xla_test(
     name = "floor_ceil_test",
     srcs = ["floor_ceil_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1447,6 +1622,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1467,6 +1644,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1483,6 +1662,8 @@ xla_test(
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1496,6 +1677,8 @@ xla_test(
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1504,17 +1687,20 @@ xla_test(
 xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1526,6 +1712,9 @@ xla_test(
 xla_test(
     name = "broadcast_test",
     srcs = ["broadcast_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -1582,6 +1771,9 @@ xla_test(
 xla_test(
     name = "fusion_test",
     srcs = ["fusion_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -1592,6 +1784,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1619,6 +1813,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1656,6 +1852,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1681,6 +1879,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1704,9 +1904,8 @@ tf_cc_test(
     deps = [
         ":local_client_test_base",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:computation_tracker",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/core:test_main",
@@ -1740,6 +1939,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1766,6 +1967,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1778,6 +1981,7 @@ xla_test(
     name = "deep_graph_test",
     srcs = ["deep_graph_test.cc"],
     deps = [
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -1854,16 +2058,17 @@ tf_cc_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+xla_test(
+    name = "test_utils_test",
+    srcs = ["test_utils_test.cc"],
+    deps = [
+        ":local_client_test_base",
+        ":test_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
 )
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 7e9005001db34d403ea923eb9c152d114bf32803..e8a5efe796a9209307ecfa343b89f66ff2a34e0f 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -22,9 +22,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -50,28 +50,28 @@ class ArrayElementwiseOpTestParamCount
       public ::testing::WithParamInterface<int> {};
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<float>(&builder, {2.5f, -3.14f, -2.25f, 10.0f, -6.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 1, 324,
                                       std::numeric_limits<int32>::min(),
                                       std::numeric_limits<int32>::max()});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   // -min == min for int32 due to an overflow. In C++ it is undefined behavior
   // to do this calculation. For XLA we have not specified that, so it
@@ -83,28 +83,55 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementC64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantC64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 1.0f}, {0.0f, 3.14f}, {2.25f, -1.0f}, {-10.0f, 0.0f}});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{2.5f, -1.0f}, {0.0f, -3.14f}, {-2.25f, 1.0f}, {10.0f, 0.0f}},
       {}, error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS64) {
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<int64>({
+      -1,
+      1,
+      0,
+      0x12345678,
+      static_cast<int64>(0xffffffff12345678l),
+      static_cast<int64>(0x8000000000000000LL),
+      static_cast<int64>(0x8000000000000001LL),
+  });
+  builder.Neg(a);
+  LOG(INFO) << -static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
+
+  ComputeAndCompareR1<int64>(&builder,
+                             {
+                                 1,
+                                 -1,
+                                 0,
+                                 -0x12345678,
+                                 0xedcba988,
+                                 static_cast<int64>(0x8000000000000000LL),
+                                 -static_cast<int64>(0x8000000000000001LL),
+                             },
+                             {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto result = builder.IsFinite(a);
+  builder.IsFinite(a);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
@@ -113,64 +140,63 @@ XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) {
 static const float kNonCanonicalNaN = tensorflow::bit_cast<float>(0x7FD01234);
 
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteScalarF32) {
-  ComputationBuilder builder(client_, TestName());
-  auto result = builder.IsFinite(builder.ConstantR0<float>(NAN));
+  XlaBuilder builder(TestName());
+  builder.IsFinite(builder.ConstantR0<float>(NAN));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
   EXPECT_TRUE(std::isnan(kNonCanonicalNaN));
-  auto result_non_canonical =
-      builder.IsFinite(builder.ConstantR0<float>(kNonCanonicalNaN));
+  builder.IsFinite(builder.ConstantR0<float>(kNonCanonicalNaN));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
   const float inf = std::numeric_limits<float>::infinity();
-  auto result_inf = builder.IsFinite(builder.ConstantR0<float>(inf));
+  builder.IsFinite(builder.ConstantR0<float>(inf));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
-  auto result_neg_inf = builder.IsFinite(builder.ConstantR0<float>(-inf));
+  builder.IsFinite(builder.ConstantR0<float>(-inf));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
-  auto result_zero = builder.IsFinite(builder.ConstantR0<float>(0.0f));
+  builder.IsFinite(builder.ConstantR0<float>(0.0f));
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteR1F32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const float inf = std::numeric_limits<float>::infinity();
   EXPECT_TRUE(std::isnan(kNonCanonicalNaN));
   auto a = builder.ConstantR1<float>(
       {{NAN, 7.0f, kNonCanonicalNaN, -1.0f, inf, -inf}});
-  auto result = builder.IsFinite(a);
+  builder.IsFinite(a);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, false, true, false, false},
                             {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({100.0f, 3.13f, 2.75f, 10.5f, -999.0f});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<float>(&builder, {97.5f, 6.27f, 5.0f, 0.5f, -993.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 0.0f}, {0.0f, 3.14f}, {2.25f, 0.0f}, {1.0f, -10.0f}});
   auto b = builder.ConstantR1<complex64>(
       {{100.0f, 0.0f}, {3.13f, 0.0f}, {2.75f, 1.0f}, {-2.0f, 10.5f}});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {97.5f, {3.13f, 3.14f}, {5.0f, 1.0f}, {-1.0f, 0.5f}}, {},
@@ -178,17 +204,97 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) {
+  XlaBuilder b(TestName());
+
+  std::vector<uint64> lhs{0xFFFFFFFF,
+                          static_cast<uint64>(-1),
+                          0,
+                          0,
+                          0x7FFFFFFFFFFFFFFFLL,
+                          0x7FFFFFFFFFFFFFFLL,
+                          0x8000000000000000LL,
+                          0x8000000000000000LL,
+                          1};
+  std::unique_ptr<Literal> lhs_literal = Literal::CreateR1<uint64>({lhs});
+  auto lhs_param = b.Parameter(0, lhs_literal->shape(), "lhs_param");
+  std::unique_ptr<GlobalData> lhs_data =
+      client_->TransferToServer(*lhs_literal).ConsumeValueOrDie();
+
+  std::vector<uint64> rhs{1,
+                          0x7FFFFFFFFFFFFFFLL,
+                          0x7FFFFFFFFFFFFFFFLL,
+                          0x8000000000000000LL,
+                          0,
+                          static_cast<uint64>(-1),
+                          0,
+                          1,
+                          0x8000000000000000LL};
+  std::unique_ptr<Literal> rhs_literal = Literal::CreateR1<uint64>({rhs});
+  auto rhs_param = b.Parameter(1, rhs_literal->shape(), "rhs_param");
+  std::unique_ptr<GlobalData> rhs_data =
+      client_->TransferToServer(*rhs_literal).ConsumeValueOrDie();
+
+  b.Add(lhs_param, rhs_param);
+
+  std::vector<uint64> expected(lhs.size());
+  for (int64 i = 0; i < lhs.size(); ++i) {
+    expected[i] = lhs[i] + rhs[i];
+  }
+
+  ComputeAndCompareR1<uint64>(&b, expected, {lhs_data.get(), rhs_data.get()});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) {
+  XlaBuilder b(TestName());
+
+  std::vector<int64> lhs{static_cast<int64>(0x8000000000000000LL),
+                         static_cast<int64>(0x8000000000000000LL),
+                         -1,
+                         0x7FFFFFFFFFFFFFFLL,
+                         0x7FFFFFFFFFFFFFFFLL,
+                         1,
+                         0,
+                         -1};
+  std::unique_ptr<Literal> lhs_literal = Literal::CreateR1<int64>({lhs});
+  auto lhs_param = b.Parameter(0, lhs_literal->shape(), "lhs_param");
+  std::unique_ptr<GlobalData> lhs_data =
+      client_->TransferToServer(*lhs_literal).ConsumeValueOrDie();
+
+  std::vector<int64> rhs{-1,
+                         0,
+                         static_cast<int64>(0x8000000000000000LL),
+                         1,
+                         0,
+                         0x7FFFFFFFFFFFFFFLL,
+                         0x7FFFFFFFFFFFFFFFLL,
+                         0x7FFFFFFFFFFFFFFFLL};
+  std::unique_ptr<Literal> rhs_literal = Literal::CreateR1<int64>({rhs});
+  auto rhs_param = b.Parameter(1, rhs_literal->shape(), "rhs_param");
+  std::unique_ptr<GlobalData> rhs_data =
+      client_->TransferToServer(*rhs_literal).ConsumeValueOrDie();
+
+  auto sub = b.Sub(lhs_param, rhs_param);
+
+  std::vector<int64> expected(lhs.size());
+  for (int64 i = 0; i < lhs.size(); ++i) {
+    expected[i] = lhs[i] - rhs[i];
+  }
+
+  ComputeAndCompareR1<int64>(&b, expected, {lhs_data.get(), rhs_data.get()});
+}
+
 TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   const int count = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<float> a_values;
   std::vector<float> b_values;
   for (int i = 0; i < count; ++i) {
@@ -227,49 +333,49 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({100.0f, 3.13f, 2.75f, 10.5f, -999.0f});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<float>(&builder, {-102.5f, 0.01f, -0.5f, -20.5f, 1005.0f},
                              {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 2, 1000000000});
   auto b = builder.ConstantR1<int32>({-1, 2, 1, -1});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {0, -2, 1, 1000000001}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 0.0f}, {0.0f, 3.14f}, {3.0f, 2.25f}});
   auto b = builder.ConstantR1<complex64>(
       {{0.0f, 10.0f}, {3.13f, 0.0f}, {2.75f, -0.25f}});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{-2.5f, -10.0f}, {-3.13f, 3.14f}, {0.25f, 2.5f}}, {},
@@ -277,29 +383,29 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({10.0f, 5.1f, 1.0f, 10.0f, -6.0f});
-  auto add = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<float>(&builder, {-0.25f, 5.0f, 2.25f, -1.0f, -1.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
@@ -329,9 +435,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     auto divisor_data =
@@ -344,8 +450,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
 
   // Test with a compile-time constant divisor.
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     builder.Div(dividend, builder.ConstantR1<int32>(divisors));
@@ -354,9 +460,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     auto divisor_data =
@@ -369,8 +475,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
 
   // Test with a compile-time constant divisor.
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     builder.Rem(dividend, builder.ConstantR1<int32>(divisors));
@@ -400,9 +506,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     auto divisor_data =
@@ -414,8 +520,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     builder.Div(dividend, builder.ConstantR1<uint32>(divisors));
@@ -424,9 +530,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     auto divisor_data =
@@ -438,8 +544,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     builder.Rem(dividend, builder.ConstantR1<uint32>(divisors));
@@ -449,33 +555,33 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 1.0f}, {-25.5f, 0.0f}, {2.0f, -1.0f}});
   auto b = builder.ConstantR1<complex64>(
       {{10.0f, 0.0f}, {0.0f, 1.0f}, {2.0f, -1.0f}});
-  auto div = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{-0.25f, 0.1f}, {0.0f, 25.5f}, {1.0f, 0.0f}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto div = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>(
       {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f, 3.0f, 3.0f, -1.0f, -8.0f});
   auto b = builder.ConstantR1<float>(
       {10.0f, 5.1f, 1.0f, 10.0f, -6.0f, 2.0f, -2.0f, 7.0f, -4.0f});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<float>(
       &builder, {-2.5f, 0.0f, 0.25f, 0.0f, -0.0f, 1.0f, 1.0f, -1.0f, -0.0f}, {},
@@ -483,21 +589,21 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<double>(
       {-2.5, 25.5, 2.25, -10.0, 6.0, 3.0, 3.0, -1.0, -8.0});
   auto b = builder.ConstantR1<double>(
       {10.0, 5.1, 1.0, 10.0, -6.0, 2.0, -2.0, 7.0, -4.0});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<double>(
       &builder, {-2.5, 0.0, 0.25, 0.0, -0.0, 1.0, 1.0, -1.0, -0.0}, {},
@@ -505,20 +611,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, -6.0f});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<float>(&builder, {-25.0f, 127.5f, 2.25f, -100.0f, -36.0f},
                              {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
@@ -541,19 +647,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantS32s) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(a_data);
   auto b = builder.ConstantR1<int32>(b_data);
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
@@ -572,21 +678,21 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>(a_data);
   auto b = builder.ConstantR1<uint32>(b_data);
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 0.0f}, {0.0f, 25.5f}, {2.0f, -10.0f}});
   auto b = builder.ConstantR1<complex64>(
       {{0.0f, 10.0f}, {5.0f, 1.0f}, {10.0f, -6.0f}});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{0.0f, -25.0f}, {-25.5f, 127.5f}, {-40.0f, -112.0}}, {},
@@ -594,378 +700,386 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
   auto b = builder.ConstantR1<bool>({false, true, false, true});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {false, false, false, true}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndPredR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({{false, false}, {true, true}});
   auto b = builder.ConstantR2<bool>({{false, true}, {false, true}});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   Array2D<bool> expected_array({{false, false}, {false, true}});
   ComputeAndCompareR2<bool>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
   auto b = builder.ConstantR1<bool>({});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({0, -1, -8});
   auto b = builder.ConstantR1<int32>({5, -7, 12});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {0, -7, 8}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndS32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{0, -5}, {-1, 5}});
   auto b = builder.ConstantR2<int32>({{1, -6}, {4, 5}});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   Array2D<int32> expected_array({{0, -6}, {4, 5}});
   ComputeAndCompareR2<int32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({0, 1, 8});
   auto b = builder.ConstantR1<int32>({5, 7, 12});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {0, 1, 8}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndU32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<uint32>({{0, 1}, {3, 8}});
   auto b = builder.ConstantR2<uint32>({{1, 0}, {7, 6}});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   Array2D<uint32> expected_array({{0, 0}, {3, 0}});
   ComputeAndCompareR2<uint32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({});
   auto b = builder.ConstantR1<uint32>({});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
   auto b = builder.ConstantR1<bool>({false, true, false, true});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, true, true}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrPredR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({{false, false}, {true, true}});
   auto b = builder.ConstantR2<bool>({{false, true}, {false, true}});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   Array2D<bool> expected_array({{false, true}, {true, true}});
   ComputeAndCompareR2<bool>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
   auto b = builder.ConstantR1<bool>({});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({0, -1, 8});
   auto b = builder.ConstantR1<int32>({5, -7, 4});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {5, -1, 12}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrS32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{0, -1}, {8, 8}});
   auto b = builder.ConstantR2<int32>({{5, -7}, {4, 1}});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   Array2D<int32> expected_array({{5, -1}, {12, 9}});
   ComputeAndCompareR2<int32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({0, 1, 8});
   auto b = builder.ConstantR1<uint32>({5, 7, 4});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, {5, 7, 12}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrU32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<uint32>({{0, 1}, {8, 8}});
   auto b = builder.ConstantR2<uint32>({{5, 7}, {4, 1}});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   Array2D<uint32> expected_array({{5, 7}, {12, 9}});
   ComputeAndCompareR2<uint32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({});
   auto b = builder.ConstantR1<uint32>({});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, true, true, false});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<bool>(&builder, {true, false, false, true}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotPredR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({{false, true}, {true, false}});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   Array2D<bool> expected_array({{true, false}, {false, true}});
   ComputeAndCompareR2<bool>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 1});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<int32>(&builder, {0, -1, -2}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotS32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{-1, 0}, {1, 8}});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   Array2D<int32> expected_array({{0, -1}, {-2, -9}});
   ComputeAndCompareR2<int32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({0, 4294967295});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<uint32>(&builder, {4294967295, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotU32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<uint32>({{0, 4294967295}, {1, 4294967294}});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   Array2D<uint32> expected_array({{4294967295, 0}, {4294967294, 1}});
   ComputeAndCompareR2<uint32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<uint32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftS32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a =
-      builder.ConstantR1<int32>({static_cast<int32>(0x12345678),
-                                 static_cast<int32>(0xF0001000), 1, 3, 77});
-  auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 15});
-  auto out = builder.ShiftLeft(a, b);
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<int32>({static_cast<int32>(0x12345678),
+                                      static_cast<int32>(0xF0001000), 1, 3, 77,
+                                      1, -3, 77});
+  auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 15, 32, 100, -1});
+  builder.ShiftLeft(a, b);
 
-  ComputeAndCompareR1<int32>(
-      &builder,
-      {static_cast<int32>(0x23456780), 0x00100000, 0x4, 0x180, 2523136}, {});
+  ComputeAndCompareR1<int32>(&builder,
+                             {static_cast<int32>(0x23456780), 0x00100000, 0x4,
+                              0x180, 2523136, 0, 0, 0},
+                             {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticS32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a =
-      builder.ConstantR1<int32>({static_cast<int32>(0x92345678),
-                                 static_cast<int32>(0x10001000), 1, 3, 77});
-  auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 2});
-  auto out = builder.ShiftRightArithmetic(a, b);
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<int32>({static_cast<int32>(0x92345678),
+                                      static_cast<int32>(0x10001000), 1, 3, 77,
+                                      1, -3, 77});
+  auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 2, 32, 100, -1});
+  builder.ShiftRightArithmetic(a, b);
 
-  ComputeAndCompareR1<int32>(&builder,
-                             {static_cast<int32>(0xF9234567),
-                              static_cast<int32>(0x00100010), 0, 0, 19},
-                             {});
+  ComputeAndCompareR1<int32>(
+      &builder,
+      {static_cast<int32>(0xF9234567), static_cast<int32>(0x00100010), 0, 0, 19,
+       0, -1, 0},
+      {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalS32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a =
-      builder.ConstantR1<int32>({static_cast<int32>(0x92345678),
-                                 static_cast<int32>(0x10001000), 1, 3, 77});
-  auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 5});
-  auto out = builder.ShiftRightLogical(a, b);
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<int32>({static_cast<int32>(0x92345678),
+                                      static_cast<int32>(0x10001000), 1, 3, 77,
+                                      1, -3, 77});
+  auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 5, 32, 100, -1});
+  builder.ShiftRightLogical(a, b);
 
-  ComputeAndCompareR1<int32>(&builder, {0x09234567, 0x00100010, 0, 0, 2}, {});
+  ComputeAndCompareR1<int32>(&builder,
+                             {0x09234567, 0x00100010, 0, 0, 2, 0, 0, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftU32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<uint32>({0x12345678, 0xF0001000, 1, 3, 77});
-  auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 15});
-  auto out = builder.ShiftLeft(a, b);
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<uint32>(
+      {0x12345678, 0xF0001000, 1, 3, 77, 1, ~3u, 77});
+  auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 15, 32, 100, ~0u});
+  builder.ShiftLeft(a, b);
 
   ComputeAndCompareR1<uint32>(
-      &builder, {0x23456780, 0x00100000, 0x4, 0x180, 2523136}, {});
+      &builder, {0x23456780, 0x00100000, 0x4, 0x180, 2523136, 0, 0, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticU32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<uint32>({0x92345678, 0x10001000, 1, 3, 77});
-  auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 2});
-  auto out = builder.ShiftRightArithmetic(a, b);
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<uint32>(
+      {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77});
+  auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 2, 32, 100, ~0u});
+  builder.ShiftRightArithmetic(a, b);
 
-  ComputeAndCompareR1<uint32>(&builder, {0xF9234567, 0x00100010, 0, 0, 19}, {});
+  ComputeAndCompareR1<uint32>(
+      &builder, {0xF9234567, 0x00100010, 0, 0, 19, 0, ~0u, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalU32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<uint32>({0x92345678, 0x10001000, 1, 3, 77});
-  auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 5});
-  auto out = builder.ShiftRightLogical(a, b);
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<uint32>(
+      {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77});
+  auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 5, 32, 100, ~0u});
+  builder.ShiftRightLogical(a, b);
 
-  ComputeAndCompareR1<uint32>(&builder, {0x09234567, 0x00100010, 0, 0, 2}, {});
+  ComputeAndCompareR1<uint32>(&builder,
+                              {0x09234567, 0x00100010, 0, 0, 2, 0, 0, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 2.25f, 10.0f, NAN});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, false, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Ge(lhs, rhs);
+  builder.Ge(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Gt(lhs, rhs);
+  builder.Gt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLeF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 5.0f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Le(lhs, rhs);
+  builder.Le(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, true, false, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Lt(lhs, rhs);
+  builder.Lt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, false, false, false, false}, {});
 }
@@ -973,10 +1087,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, false, true, false, false, false, true},
@@ -984,17 +1098,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({});
   auto rhs = builder.ConstantR1<int32>({});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqC64s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<complex64>({{-2.5f, 10.0f},
                                             {1.0f, 25.5f},
                                             {2.25f, -3.0f},
@@ -1005,16 +1119,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqC64s) {
                                             {2.25f, -3.0f},
                                             {10.0f, 0.0f},
                                             {1.0f, NAN}});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, false, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<complex64>({});
   auto rhs = builder.ConstantR1<complex64>({});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
@@ -1023,7 +1137,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeC64s) {
   // Disable fast-math because we're operating on NaNs.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<complex64>({{-2.5f, 10.0f},
                                             {1.0f, 25.5f},
                                             {2.25f, -3.0f},
@@ -1034,7 +1148,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeC64s) {
                                             {2.25f, -3.0f},
                                             {10.0f, 0.0f},
                                             {1.0f, NAN}});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, true, false, true, true}, {});
 }
@@ -1043,10 +1157,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
   // Disable fast-math because we're operating on NaNs.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 25.5f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, false, true, true, true}, {});
 }
@@ -1054,10 +1168,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, true, false, true, true, true, false}, {});
@@ -1066,10 +1180,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Ge(lhs, rhs);
+  builder.Ge(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, true, true, false, true, true, true}, {});
@@ -1078,10 +1192,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Gt(lhs, rhs);
+  builder.Gt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, false, false, true, false, false, true, true, false},
@@ -1091,10 +1205,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Le(lhs, rhs);
+  builder.Le(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, true, true, false, true, true, false, false, true}, {});
@@ -1103,10 +1217,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Lt(lhs, rhs);
+  builder.Lt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, false, false, true, false, false, false},
@@ -1115,10 +1229,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, false, true, false, false, false, true},
@@ -1127,10 +1241,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, true, false, true, true, true, false}, {});
@@ -1138,10 +1252,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Ge(lhs, rhs);
+  builder.Ge(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, true, true, false, true, true, true}, {});
@@ -1149,10 +1263,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Gt(lhs, rhs);
+  builder.Gt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, false, false, true, false, false, true, true, false},
@@ -1161,10 +1275,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Le(lhs, rhs);
+  builder.Le(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, true, true, false, true, true, false, false, true}, {});
@@ -1172,10 +1286,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Lt(lhs, rhs);
+  builder.Lt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, false, false, true, false, false, false},
@@ -1184,12 +1298,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs =
       builder.ConstantR1<float>({4.0f, 2.0f, 2.0f, NAN, 6.0f, -2.0f, -2.0f});
   auto rhs =
       builder.ConstantR1<float>({2.0f, -2.0f, 3.0f, 10.0f, NAN, 3.0f, 4.0f});
-  auto minimum = builder.Pow(lhs, rhs);
+  builder.Pow(lhs, rhs);
 
   ComputeAndCompareR1<float>(
       &builder, {16.0f, 0.25f, 8.0f, NAN, NAN, -8.0f, 16.0f}, {}, error_spec_);
@@ -1197,27 +1311,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.0f, -0.6f, -0.6f, 0.0f});
   auto rhs = builder.ConstantR1<float>({0.5f, 0.6f, -0.6f, -0.6f});
-  auto minimum = builder.Pow(lhs, rhs);
+  builder.Pow(lhs, rhs);
 
   ComputeAndCompareR1<float>(&builder, {NAN, NAN, NAN, INFINITY}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto minimum = builder.Pow(lhs, rhs);
+  builder.Pow(lhs, rhs);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 // Some Pow cases that can be implemented more efficiently.
 XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values = {1.0f, 2.0f, 3.2f, -4.0f};
   std::vector<float> exponents = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1245,7 +1359,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1270,7 +1384,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1295,7 +1409,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1320,7 +1434,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1345,7 +1459,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1377,7 +1491,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1410,7 +1524,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f};
@@ -1443,7 +1557,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1484,14 +1598,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) {
 
 TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
   const int count = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<float> values;
   values.reserve(count);
   for (int i = 0; i < count; ++i) {
     values.push_back(i / static_cast<float>(count));
   }
   auto x = builder.ConstantR1<float>(values);
-  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
+  builder.Pow(x, builder.ConstantR0<float>(2.0f));
 
   std::vector<float> expected;
   expected.reserve(values.size());
@@ -1503,7 +1617,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> values(2, 2, 2, 2);
 
   std::vector<float> values_vector;
@@ -1517,140 +1631,86 @@ XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4D) {
   Array4D<float> expected(2, 2, 2, 2, expected_vector);
 
   auto x = builder.ConstantR4FromArray4D<float>(values);
-  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
+  builder.Pow(x, builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4DZeroElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> values(2, 2, 0, 2);
   Array4D<float> expected(2, 2, 0, 2);
 
   auto x = builder.ConstantR4FromArray4D<float>(values);
-  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
+  builder.Pow(x, builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
-// GPU backend emits nvvm intrinsic for fmin and fmax, whose semantics is NOT
-// such
-// * fmin(NaN, x) = x
-// * fmax(NaN, x) = x
-// so we only test NAN on CPU.
-//
-// TODO(b/28180546): Make this compile in a way that is consistent
-// among backends.
 XLA_TEST_F(ArrayElementwiseOpTest, MinF32s) {
-  ComputationBuilder builder(client_, TestName());
-#if !defined(XLA_TEST_BACKEND_CPU)
-  auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f});
-  auto rhs = builder.ConstantR1<float>({2.0f, -5.0f, 1.0f});
-#else
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({2.0f, -5.0f, 1.0f, 10.0f, NAN});
-#endif
-  auto minimum = builder.Min(lhs, rhs);
-
-  ComputeAndCompareR1<float>(&builder,
-#if !defined(XLA_TEST_BACKEND_CPU)
-                             {1.0f, -5.0f, 1.0f},
-#else
-                             {1.0f, -5.0f, 1.0f, 10.0f, 6.0f},
-#endif
-                             {}, error_spec_);
+  builder.Min(lhs, rhs);
+
+  ComputeAndCompareR1<float>(&builder, {1.0f, -5.0f, 1.0f, NAN, NAN}, {},
+                             error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto minimum = builder.Min(lhs, rhs);
+  builder.Min(lhs, rhs);
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-// TODO(b/28180546): Make this compile in a way that is consistent
-// among backends. See comment on MinF32s test above.
 XLA_TEST_F(ArrayElementwiseOpTest, MinF64s) {
-  ComputationBuilder builder(client_, TestName());
-#if !defined(XLA_TEST_BACKEND_CPU)
-  auto lhs = builder.ConstantR1<double>({1.0, 1.0, 2.25});
-  auto rhs = builder.ConstantR1<double>({2.0, -5.0, 1.0});
-#else
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<double>({1.0, 1.0, 2.25, NAN, 6.0});
   auto rhs = builder.ConstantR1<double>({2.0, -5.0, 1.0, 10.0, NAN});
-#endif
-  auto minimum = builder.Min(lhs, rhs);
+  builder.Min(lhs, rhs);
 
-  ComputeAndCompareR1<double>(&builder,
-#if !defined(XLA_TEST_BACKEND_CPU)
-                              {1.0, -5.0, 1.0},
-#else
-                              {1.0, -5.0, 1.0, 10.0, 6.0},
-#endif
-                              {}, error_spec_);
+  ComputeAndCompareR1<double>(&builder, {1.0, -5.0, 1.0, NAN, NAN}, {},
+                              error_spec_);
 }
 
-// TODO(b/28180546): Make this compile in a way that is consistent
-// among backends. See comment on MinF32s test above.
 XLA_TEST_F(ArrayElementwiseOpTest, MaxF32s) {
-  ComputationBuilder builder(client_, TestName());
-#if !defined(XLA_TEST_BACKEND_CPU)
-  auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f});
-  auto rhs = builder.ConstantR1<float>({2.0f, -5.0f, 1.0f});
-#else
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({2.0f, -5.0f, 1.0f, 10.0f, NAN});
-#endif
-  auto maximum = builder.Max(lhs, rhs);
-
-  ComputeAndCompareR1<float>(&builder,
-#if !defined(XLA_TEST_BACKEND_CPU)
-                             {2.0f, 1.0f, 2.25f},
-#else
-                             {2.0f, 1.0f, 2.25f, 10.0f, 6.0f},
-#endif
-                             {}, error_spec_);
+  builder.Max(lhs, rhs);
+
+  ComputeAndCompareR1<float>(&builder, {2.0f, 1.0f, 2.25f, NAN, NAN}, {},
+                             error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto minimum = builder.Max(lhs, rhs);
+  builder.Max(lhs, rhs);
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-// TODO(b/28180546): Make this compile in a way that is consistent
-// among backends. See comment on MinF32s test above.
 XLA_TEST_F(ArrayElementwiseOpTest, MaxF64s) {
-  ComputationBuilder builder(client_, TestName());
-#if !defined(XLA_TEST_BACKEND_CPU)
-  auto lhs = builder.ConstantR1<double>({1.0, 1.0, 2.25});
-  auto rhs = builder.ConstantR1<double>({2.0, -5.0, 1.0});
-#else
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<double>({1.0, 1.0, 2.25, NAN, 6.0});
   auto rhs = builder.ConstantR1<double>({2.0, -5.0, 1.0, 10.0, NAN});
-#endif
-  auto maximum = builder.Max(lhs, rhs);
+  builder.Max(lhs, rhs);
 
-  ComputeAndCompareR1<double>(&builder,
-#if !defined(XLA_TEST_BACKEND_CPU)
-                              {2.0, 1.0, 2.25},
-#else
-                              {2.0, 1.0, 2.25, 10.0, 6.0},
-#endif
-                              {}, error_spec_);
+  ComputeAndCompareR1<double>(&builder, {2.0, 1.0, 2.25, NAN, NAN}, {},
+                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>(
       {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max});
   auto y = builder.ConstantR1<int32>(
@@ -1665,7 +1725,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>(
       {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max});
   auto y = builder.ConstantR1<int32>(
@@ -1679,7 +1739,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<uint32>({0, 0, 1, 1, 1, max, max, max});
   auto y = builder.ConstantR1<uint32>({0, 1, 0, 1, 10, 0, 234234, max});
   builder.Max(x, y);
@@ -1690,7 +1750,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<uint32>({0, 0, 1, 1, 1, max, max, max});
   auto y = builder.ConstantR1<uint32>({0, 1, 0, 1, 10, 0, 234234, max});
   builder.Min(x, y);
@@ -1700,7 +1760,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
   auto y = builder.ConstantR1<float>(
@@ -1713,7 +1773,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S1AndR1S0F32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto u = builder.ConstantR1<float>({3.5});
   auto v = builder.ConstantR1<float>({});
   builder.Max(u, v);
@@ -1723,7 +1783,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S1AndR1S0F32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) {
   for (int broadcast_dim : {0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto u = builder.ConstantR1<float>({3.5});
     auto v = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
     builder.Max(u, v, /*broadcast_dimensions=*/{broadcast_dim});
@@ -1733,7 +1793,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   auto m =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
@@ -1744,7 +1804,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({});
   auto m = builder.ConstantR2<float>({{}, {}});
   builder.Max(v, m, /*broadcast_dimensions=*/{1});
@@ -1754,7 +1814,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto scalar = builder.ConstantR0<int32>(2);
   Array3D<int32> a_3d({{{3, 9, -1}, {2, -10, 3}}, {{-2, 2, 8}, {12, 10, 4}}});
   auto array = builder.ConstantR3FromArray3D<int32>(a_3d);
@@ -1765,7 +1825,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto scalar = builder.ConstantR0<int32>(2);
   Array3D<int32> a_3d(2, 0, 3);
   auto array = builder.ConstantR3FromArray3D<int32>(a_3d);
@@ -1776,7 +1836,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m =
       builder.ConstantR2<float>({{-10.4f, 64.0f, 6.0f}, {0.1f, 32.0f, 16.1f}});
   auto v = builder.ConstantR1<float>({-10.2f, 16.4f});
@@ -1787,7 +1847,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantR2<float>({{}, {}});
   auto v = builder.ConstantR1<float>({-10.2f, 16.4f});
   builder.Min(m, v, /*broadcast_dimensions=*/{0});
@@ -1797,7 +1857,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto array2d =
       builder.ConstantR2<float>({{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}});
   auto array4d = builder.ConstantR4FromArray4D<float>(
@@ -1812,7 +1872,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto array2d =
       builder.ConstantR2<float>({{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}});
   Array4D<float> arg(2, 2, 0, 3);
@@ -1824,7 +1884,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinTenS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto y = builder.ConstantR1<int32>({9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
   builder.Min(x, y);
@@ -1834,7 +1894,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinTenS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxTenS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto y = builder.ConstantR1<int32>({9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
   builder.Max(x, y);
@@ -1844,110 +1904,107 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxTenS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemTwoConstantS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-3, 26, 2, -1, 1});
   auto b = builder.ConstantR1<int32>({10, 5, 1, 10, -10});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {-3, 1, 0, -1, 1}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto minimum = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto argument = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 10.0f});
   auto maximum = builder.ConstantR1<float>({3.0f, 0.5f, 25.5f, 5.0f, 123.0});
-  auto clamp = builder.Clamp(minimum, argument, maximum);
+  builder.Clamp(minimum, argument, maximum);
 
   ComputeAndCompareR1<float>(&builder, {2.0f, 0.5f, 1.0f, 2.25f, 10.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto minimum = builder.ConstantR0<float>(0.0f);
   auto argument = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
   auto maximum = builder.ConstantR0<float>(5.0f);
-  auto clamp = builder.Clamp(minimum, argument, maximum);
+  builder.Clamp(minimum, argument, maximum);
 
   ComputeAndCompareR1<float>(&builder, {2.0f, 5.0f, 0.0f, 1.0f, 4.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_scalar = builder.ConstantR0<float>(0.0f);
   auto min_vector = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto arg_vector = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
   auto max_scalar = builder.ConstantR0<float>(3.0f);
   auto max_vector = builder.ConstantR1<float>({3.0f, 0.5f, 25.5f, 5.0f, 123.0});
   // Perform clamp with broadcasted scalar and vector.
-  auto clamp = builder.Add(
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
-                  builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
-                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+  builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                          builder.Clamp(min_scalar, arg_vector, max_vector)),
+              builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                          builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
   ComputeAndCompareR1<float>(&builder, {8.0f, 7.0f, 2.0f, 6.5f, 14.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0, -5});
   auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4, 10});
   auto max_vector = builder.ConstantR1<int32>({3, 0, 25, 5, 123, -1});
-  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+  builder.Clamp(min_vector, arg_vector, max_vector);
 
   ComputeAndCompareR1<int32>(&builder, {2, 0, 1, 2, 4, -1}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampS32ScalarVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_scalar = builder.ConstantR0<int32>(0);
   auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0});
   auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4});
   auto max_scalar = builder.ConstantR0<int32>(3);
   auto max_vector = builder.ConstantR1<int32>({3, 1, 25, 5, 123});
   // Perform clamp with broadcasted scalar and vector.
-  auto clamp = builder.Add(
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
-                  builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
-                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+  builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                          builder.Clamp(min_scalar, arg_vector, max_vector)),
+              builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                          builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
   ComputeAndCompareR1<int32>(&builder, {8, 8, 2, 6, 14}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_vector = builder.ConstantR1<uint32>({1, 2, 1, 2, 0, ~0u - 4});
   auto arg_vector = builder.ConstantR1<uint32>({2, 10, 5, 1, 4, 10});
   auto max_vector = builder.ConstantR1<uint32>({3, 5, 25, 5, 123, ~0u});
-  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+  builder.Clamp(min_vector, arg_vector, max_vector);
 
   ComputeAndCompareR1<uint32>(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampU32ScalarVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_scalar = builder.ConstantR0<uint32>(0);
   auto min_vector = builder.ConstantR1<uint32>({1, 0, 1, 2, 0});
   auto arg_vector = builder.ConstantR1<uint32>({2, 10, 0, 1, 4});
   auto max_scalar = builder.ConstantR0<uint32>(3);
   auto max_vector = builder.ConstantR1<uint32>({3, 1, 25, 5, 123});
   // Perform clamp with broadcasted scalar and vector.
-  auto clamp = builder.Add(
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
-                  builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
-                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+  builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                          builder.Clamp(min_scalar, arg_vector, max_vector)),
+              builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                          builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
   ComputeAndCompareR1<uint32>(&builder, {8, 8, 2, 6, 14}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
@@ -1961,7 +2018,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
 
   auto p0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto p1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto add = builder.Add(p0, p1);
+  builder.Add(p0, p1);
 
   ComputeAndCompareR1<float>(&builder, {8.3f, 4.5f, 6.7f, 11.1f},
                              {param0_data.get(), param1_data.get()},
@@ -1969,7 +2026,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR3FromArray3D<float>(Array3D<float>(0, 7, 0));
@@ -1983,7 +2040,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
 
   auto p0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto p1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto add = builder.Add(p0, p1);
+  builder.Add(p0, p1);
 
   Array3D<float> expected(0, 7, 0);
   ComputeAndCompareR3<float>(
@@ -1991,7 +2048,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
@@ -2000,35 +2057,35 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
 
   auto a = builder.ConstantR1<float>({1.1f, 2.2f, 3.3f, 4.4f});
   auto p = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto add = builder.Add(a, p);
+  builder.Add(a, p);
 
   ComputeAndCompareR1<float>(&builder, {2.2f, 4.4f, 6.6f, 9.9f},
                              {param0_data.get()}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CosF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({3.14159f, 0.0f, 1.570796f, -0.78539f});
-  auto result = builder.Cos(a);
+  builder.Cos(a);
 
   ComputeAndCompareR1<float>(&builder, {-1.0f, 1.0f, 0.0f, 0.707107f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({3.14159f, 0.0f, 1.570796f, -0.78539f});
-  auto result = builder.Sin(a);
+  builder.Sin(a);
 
   ComputeAndCompareR1<float>(&builder, {0.0f, 0.0f, 1.0f, -0.707107f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({0.0f, 5.0f, 0.0f, -3.0f, 2.0f, -8.0f});
   auto b = builder.ConstantR1<float>({6.0f, 0.0f, -4.0f, 0.0f, 2.0f, 8.0f});
-  auto atan = builder.Atan2(a, b);
+  builder.Atan2(a, b);
 
   ComputeAndCompareR1<float>(
       &builder,
@@ -2037,9 +2094,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f});
-  auto result = builder.Tanh(a);
+  builder.Tanh(a);
 
   ComputeAndCompareR1<float>(&builder, {-0.986614f, 0.996260f, 0.978026}, {},
                              error_spec_);
@@ -2049,7 +2106,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
   // This is like the test ArrayElementwiseOpTest.TanhF32s above, except that
   // the input tensor is large enough to exercise the vectorized tanh
   // implementation on XLA CPU.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>(
       {1.02,  -0.32, 0.85,  0.90,  1.23,  -0.91, -0.49, 0.80,  -0.67, 0.16,
        -0.07, 0.39,  -0.41, 0.04,  1.36,  1.25,  0.41,  0.65,  -1.08, 0.32,
@@ -2088,7 +2145,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
 XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
   // The input tensor is large enough to exercise the vectorized exp
   // implementation on XLA CPU.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Just to help make sense of the scales here -- exp(89) saturates float32 and
   // exp(-10) is smaller than our error spec.
@@ -2124,7 +2181,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
 XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) {
   // The input tensor is large enough to exercise the vectorized exp
   // implementation on XLA CPU.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(
       {-1.29,    -1.41,    -1.25,    -13.5,    -11.7,    -17.9,    -198,
@@ -2159,19 +2216,28 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) {
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<uint32>(
+      {0, 1, 0x10, 0x10000, 0x700000, 0x12345678, 0xF2345678});
+  builder.Clz(a);
+
+  ComputeAndCompareR1<uint32>(&builder, {32, 31, 27, 15, 9, 3, 0}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
   // b -----/               /
   // c---------------------/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({1.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
   auto c = builder.ConstantR1<float>({-3.3f, -15.5f, -7.7f, -29.9f});
 
   auto add = builder.Add(a, b);
-  auto add2 = builder.Add(add, c);
+  builder.Add(add, c);
 
   ComputeAndCompareR1<float>(&builder, {-0.1f, -10.1f, -0.1f, -20.1f}, {},
                              error_spec_);
@@ -2182,14 +2248,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldRight) {
   //         /               /
   // c -----/               /
   // a---------------------/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({91.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
   auto c = builder.ConstantR1<float>({-3.3f, -15.5f, -7.7f, -29.9f});
 
   auto add = builder.Add(b, c);
-  auto add2 = builder.Add(a, add);
+  builder.Add(a, add);
 
   ComputeAndCompareR1<float>(&builder, {89.9f, -10.1f, -0.1f, -20.1f}, {},
                              error_spec_);
@@ -2199,14 +2265,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddWithNeg) {
   // a ----- (neg) ----- (add)
   //                    /
   // b ----- (neg) ----/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({91.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
 
   auto neg_a = builder.Neg(a);
   auto neg_b = builder.Neg(b);
-  auto result = builder.Add(neg_a, neg_b);
+  builder.Add(neg_a, neg_b);
 
   ComputeAndCompareR1<float>(&builder, {-93.2f, -5.4f, -7.6f, -9.8f}, {},
                              error_spec_);
@@ -2220,7 +2286,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
   // c ------ (add) ------------/
   //         /
   // d -----/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({91.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
@@ -2229,19 +2295,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
 
   auto add_ab = builder.Add(a, b);
   auto add_cd = builder.Add(c, d);
-  auto add_all = builder.Add(add_ab, add_cd);
+  builder.Add(add_ab, add_cd);
 
   ComputeAndCompareR1<float>(&builder, {70.9f, -0.1f, -40.1f, 0.1f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto b =
       builder.ConstantR2<float>({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   Array2D<float> expected_array(
       {{-4.0f, 11.28f, 43.0f}, {1.25f, -14.0f, 8.88f}});
@@ -2250,11 +2316,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) {
   // Add a scalar + matrix.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto scalar = builder.ConstantR0<float>(3.0f);
-  auto add = builder.Add(scalar, a);
+  builder.Add(scalar, a);
 
   Array2D<float> expected_array({{0.5f, 6.14f, 4.0f}, {5.25f, -7.0f, 6.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2262,11 +2328,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) {
   // Add a matrix + scalar.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto scalar = builder.ConstantR0<float>(3.0f);
-  auto add = builder.Add(a, scalar);
+  builder.Add(a, scalar);
 
   Array2D<float> expected_array({{0.5f, 6.14f, 4.0f}, {5.25f, -7.0f, 6.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2275,14 +2341,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) {
   // Test simple broadcasting of a R1F32 over R2F32. The vector's size matches
   // only dim 0 of the matrix.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({20.0f, 40.0f, 60.0f});
   // clang-format off
   auto m = builder.ConstantR2<float>({
     {-2.5f, 3.14f, 1.0f},
     {2.25f, -10.0f, 3.33f}});
   // clang-format on
-  auto add = builder.Add(v, m, /*broadcast_dimensions=*/{1});
+  builder.Add(v, m, /*broadcast_dimensions=*/{1});
   Array2D<float> expected_array(
       {{17.5f, 43.14f, 61.0f}, {22.25f, 30.0f, 63.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2290,7 +2356,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) {
   // Test broadcasting in Eq comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({42, 73});
   auto m = builder.ConstantR2<int32>({{42, 73}, {42, 52}});
 
@@ -2308,10 +2374,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
   // Test broadcasting in Ne comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({42, 73});
   auto m = builder.ConstantR2<int32>({{42, 73}, {42, 52}});
-  auto cmp = builder.Ne(v, m, /*broadcast_dimensions=*/{1});
+  builder.Ne(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,2] {
   { 00 },
@@ -2322,10 +2388,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
   // Test broadcasting in Ge comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Ge(v, m, /*broadcast_dimensions=*/{1});
+  builder.Ge(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 1100 },
@@ -2336,10 +2402,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
   // Test broadcasting in Gt comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Gt(v, m, /*broadcast_dimensions=*/{1});
+  builder.Gt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 0100 },
@@ -2350,10 +2416,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
   // Test broadcasting in Le comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Le(v, m, /*broadcast_dimensions=*/{1});
+  builder.Le(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 1011 },
@@ -2364,10 +2430,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
   // Test broadcasting in Lt comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Lt(v, m, /*broadcast_dimensions=*/{1});
+  builder.Lt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 0011 },
@@ -2379,24 +2445,24 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
 XLA_TEST_F(ArrayElementwiseOpTest, Mul2Dby1DF32) {
   // Test simple broadcasting of a R1F32 over R2F32 when the order of binary op
   // arguments is reversed.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantR2<float>({{1.5f, 2.5f, 3.5f}, {4.5f, 5.5f, 6.5f}});
   auto v = builder.ConstantR1<float>({2.0f, 4.0f, 6.0f});
-  auto add = builder.Mul(m, v, /*broadcast_dimensions=*/{1});
+  builder.Mul(m, v, /*broadcast_dimensions=*/{1});
   Array2D<float> expected_array({{3.0f, 10.0f, 21.0f}, {9.0f, 22.0f, 39.0f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) {
   // Tests broadcasting for arrays with degenerate (size == 1) dimensions.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // m's shape in XLA notation is {3, 2}
   // md's shape in XLA notation is {3, 1}
   // The result has shape {3, 2}, where md is broadcast over m
   auto m =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto md = builder.ConstantR2<float>({{10.0f, 20.0f, 30.0f}});
-  auto add = builder.Add(m, md);
+  builder.Add(m, md);
   Array2D<float> expected_array(
       {{7.5f, 23.14f, 31.0f}, {12.25f, 10.0f, 33.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2404,14 +2470,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim0) {
   // Tests broadcasting for arrays with degenerate (size == 1) dimensions.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // m's shape in XLA notation is {3, 2}
   // md's shape in XLA notation is {1, 2}
   // The result has shape {3, 2}, where md is broadcast over m
   auto m =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto md = builder.ConstantR2<float>({{10.0f}, {20.0f}});
-  auto add = builder.Add(m, md);
+  builder.Add(m, md);
   Array2D<float> expected_array(
       {{7.5f, 13.14f, 11.0f}, {22.25f, 10.0f, 23.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2422,13 +2488,13 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DsWithDegenerateDimsOuterProduct) {
   // effectively creates an "outer product" operation.
   // This is taken from the Numpy docs example at:
   // http://docs.scipy.org/doc/numpy-1.10.1/user/basics.broadcasting.html
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // a's shape in XLA notation is {1, 4}
   // b's shape in XLA notation is {3, 1}
   // The result has shape {3, 4}.
   auto a = builder.ConstantR2<float>({{0.0f}, {10.0f}, {20.0f}, {30.0f}});
   auto b = builder.ConstantR2<float>({{1.0f, 2.0f, 3.0f}});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
   Array2D<float> expected_array({{1.0f, 2.0f, 3.0f},
                                  {11.0f, 12.0f, 13.0f},
                                  {21.0f, 22.0f, 23.0f},
@@ -2439,10 +2505,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DsWithDegenerateDimsOuterProduct) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver1) {
   // Add together a (2,2) array and a (2) array, using dimension 0 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({20.0f, 40.0f});
   auto m = builder.ConstantR2<float>({{10.0f, 50.0f}, {77.0f, 88.0f}});
-  auto add = builder.Add(v, m, /*broadcast_dimensions=*/{1});
+  builder.Add(v, m, /*broadcast_dimensions=*/{1});
   Array2D<float> expected_array({{30.0f, 90.0f}, {97.0f, 128.0f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
@@ -2450,17 +2516,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver1) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver0) {
   // Add together a (2,2) array and a (2) array, using dimension 1 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({20.0f, 40.0f});
   auto m = builder.ConstantR2<float>({{10.0f, 50.0f}, {77.0f, 88.0f}});
-  auto add = builder.Add(v, m, /*broadcast_dimensions=*/{0});
+  builder.Add(v, m, /*broadcast_dimensions=*/{0});
   Array2D<float> expected_array({{30.0f, 70.0f}, {117.0f, 128.0f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
   // Binary add of two R3s together
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
                        {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}});
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
@@ -2468,7 +2534,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
   Array3D<float> b_3d({{{2.0f, 4.0f}, {6.0f, 8.0f}, {10.0f, 12.0f}},
                        {{14.0f, 16.0f}, {18.0f, 20.0f}, {22.0f, 24.0f}}});
   auto b = builder.ConstantR3FromArray3D<float>(b_3d);
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   Array3D<float> expected_3d(
       {{{3.0f, 6.0f}, {9.0f, 12.0f}, {15.0f, 18.0f}},
@@ -2479,7 +2545,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) {
   // Add together a (2, 3, 2) array with a (2) array, using dimension 0 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array3D<float> a_3d({
     {{1.0f, 2.0f},
@@ -2492,7 +2558,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) {
   // clang-format on
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
   auto v = builder.ConstantR1<float>({10.0f, 20.0f});
-  auto add = builder.Add(a, v, /*broadcast_dimensions=*/{2});
+  builder.Add(a, v, /*broadcast_dimensions=*/{2});
 
   Array3D<float> expected_3d(
       {{{11.0f, 22.0f}, {13.0f, 24.0f}, {15.0f, 26.0f}},
@@ -2503,7 +2569,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) {
   // Add together a (2, 3, 2) array with a (2) array, using dimension 2 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array3D<float> a_3d({
     {{1.0f, 2.0f},
@@ -2516,7 +2582,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) {
   // clang-format on
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
   auto v = builder.ConstantR1<float>({10.0f, 20.0f});
-  auto add = builder.Add(a, v, /*broadcast_dimensions=*/{0});
+  builder.Add(a, v, /*broadcast_dimensions=*/{0});
 
   // clang-format off
   Array3D<float> expected_3d({
@@ -2534,7 +2600,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) {
   // Add together a (2, 3, 2) array with a (3, 2) array, using dimensions {1,2}
   // for broadcasting.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array3D<float> a_3d({
     {{1.0f, 2.0f},
@@ -2549,7 +2615,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) {
     {10.0f, 20.0f, 30.0f},
     {40.0f, 50.0f, 60.0f},
   });
-  auto add = builder.Add(a, m, /*broadcast_dimensions=*/{0, 1});
+  builder.Add(a, m, /*broadcast_dimensions=*/{0, 1});
 
   Array3D<float> expected_3d({
     {{11.0f, 12.0f},
@@ -2566,7 +2632,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   // Comparison between two 3D arrays of compatible shapes:
   // (2, 3, 2) and (2, 3, 1): expected to produce a (2, 3, 2) shape of PREDs.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
                        {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}});
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
@@ -2574,7 +2640,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   Array3D<float> b_3d({{{7.0f, 1.0f}, {3.0f, 10.0f}, {15.0f, 6.0f}}});
   auto b = builder.ConstantR3FromArray3D<float>(b_3d);
 
-  auto compare = builder.Gt(a, b);
+  builder.Gt(a, b);
 
   Array3D<int> expected_3d(
       {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}});
@@ -2590,7 +2656,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Array4D<float>> operand_a_4d(new Array4D<float>(2, 3, 4, 5));
   std::unique_ptr<Array4D<float>> operand_b_4d(new Array4D<float>(2, 3, 4, 5));
@@ -2611,13 +2677,13 @@ XLA_TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
 
   auto a = builder.ConstantR4FromArray4D<float>(*operand_a_4d);
   auto b = builder.ConstantR4FromArray4D<float>(*operand_b_4d);
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Array4D<float>> operand_a_4d(new Array4D<float>(2, 3, 4, 5));
   std::unique_ptr<Array4D<float>> expected_4d(new Array4D<float>(2, 3, 4, 5));
@@ -2639,7 +2705,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
 
   auto a = builder.ConstantR4FromArray4D<float>(*operand_a_4d);
   auto b = builder.ConstantR1<float>(operand_b_1d);
-  auto add = builder.Add(a, b, {1});
+  builder.Add(a, b, {1});
 
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }
@@ -2654,7 +2720,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
   std::vector<float> r1(d1);
   std::iota(r1.begin(), r1.end(), 1.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> a_literal = Literal::CreateR4FromArray4DWithLayout(
       r4, LayoutUtil::MakeLayout({0, 1, 2, 3}));
   auto a = builder.ConstantLiteral(*a_literal);
@@ -2675,11 +2741,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
 
 // Show that we can't add two opaques.
 XLA_TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto shape = ShapeUtil::MakeOpaqueShape();
   auto x = builder.Parameter(0, shape, "x");
-  auto concatenated = builder.Add(x, x);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.Add(x, x);
+  auto computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               ::testing::ContainsRegex(
@@ -2687,12 +2753,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, IdentityBroadcastOfSameRankIsAllowed) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto b =
       builder.ConstantR2<float>({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}});
-  auto add = builder.Add(a, b, /*broadcast_dimensions=*/{0, 1});
+  builder.Add(a, b, /*broadcast_dimensions=*/{0, 1});
 
   Array2D<float> expected_array(
       {{-4.0f, 11.28f, 43.0f}, {1.25f, -14.0f, 8.88f}});
@@ -2700,14 +2766,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, IdentityBroadcastOfSameRankIsAllowed) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto b =
       builder.ConstantR2<float>({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}});
-  auto add = builder.Add(a, b, /*broadcast_dimensions=*/{1, 0});
+  builder.Add(a, b, /*broadcast_dimensions=*/{1, 0});
 
-  StatusOr<Computation> computation_status = builder.Build();
+  auto computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().error_message(),
               ::testing::ContainsRegex("must.*be the identity"));
@@ -2716,7 +2782,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) {
 // Regression test for b/31927799. "slice - y" is fused and requires implicit
 // broadcast.
 XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x_literal = Literal::CreateR1<float>({1, 2, 3});
   auto y_literal = Literal::CreateR1<float>({4, 5});
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index 3f6fd7c65d3360a622dbf754833009fb20410535..fcd9ff55e393f64476ddd4754e0fa74427f1cb51 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -28,11 +28,11 @@ namespace {
 class AxpySimpleTest : public ClientLibraryTestBase {};
 
 TEST_F(AxpySimpleTest, AxTenValues) {
-  ComputationBuilder builder(client_, "ax_10");
+  XlaBuilder builder("ax_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>(
       {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
-  auto ax = builder.Mul(alpha, x);
+  builder.Mul(alpha, x);
 
   std::vector<float> expected = {
       -3.14159265, 3.14159265,  6.28318531,   -6.28318531,  -9.42477796,
@@ -41,26 +41,26 @@ TEST_F(AxpySimpleTest, AxTenValues) {
 }
 
 XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) {
-  ComputationBuilder builder(client_, "axpy_10");
+  XlaBuilder builder("axpy_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>({});
   auto y = builder.ConstantR1<float>({});
   auto ax = builder.Mul(alpha, x);
-  auto axpy = builder.Add(ax, y);
+  builder.Add(ax, y);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 TEST_F(AxpySimpleTest, AxpyTenValues) {
-  ComputationBuilder builder(client_, "axpy_10");
+  XlaBuilder builder("axpy_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>(
       {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
   auto y = builder.ConstantR1<float>(
       {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0});
   auto ax = builder.Mul(alpha, x);
-  auto axpy = builder.Add(ax, y);
+  builder.Add(ax, y);
 
   TF_ASSERT_OK_AND_ASSIGN(ProgramShape shape, builder.GetProgramShape());
 
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index e4bf1827acf24bcdbfe20fe39e794a0265ab89e3..22c3394e6f34bd018ffaaaa4d9d68339673c3764 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -34,13 +34,13 @@ namespace {
 class BadRngShapeValidationTest : public ClientLibraryTestBase {};
 
 TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0.0);
   auto one = builder.ConstantR0<float>(1.0);
   Shape default_constructed;
   builder.RngUniform(zero, one, default_constructed);
 
-  StatusOr<Computation> computation = builder.Build();
+  StatusOr<XlaComputation> computation = builder.Build();
   EXPECT_FALSE(computation.ok());
   LOG(INFO) << "status received: " << computation.status();
   EXPECT_THAT(computation.status().error_message(),
@@ -48,7 +48,7 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
 }
 
 TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0.0);
   auto one = builder.ConstantR0<float>(1.0);
   Shape sans_layout;
@@ -57,7 +57,7 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
 
   builder.RngUniform(zero, one, sans_layout);
 
-  StatusOr<Computation> computation = builder.Build();
+  StatusOr<XlaComputation> computation = builder.Build();
   ASSERT_TRUE(computation.ok());
   LOG(INFO) << computation.status();
 }
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 28ab9654997728fbafd6610af840e721e72cce5a..f3dac75a44b948c4b45b80b93e7462073010979e 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -69,6 +69,15 @@ class BatchNormalizationTest
     CHECK_EQ(kY, input_array_.width());
   }
 
+  XlaOp CheckShape(XlaBuilder* b, const XlaOp& operand,
+                   const Shape& expected_shape) const {
+    Shape actual_shape = b->GetShape(operand).ConsumeValueOrDie();
+    CHECK(ShapeUtil::Equal(expected_shape, actual_shape))
+        << "want " << ShapeUtil::HumanString(expected_shape) << " got "
+        << ShapeUtil::HumanString(actual_shape);
+    return operand;
+  }
+
   static constexpr int64 kSamples = 3;
   static constexpr int64 kX = 1;
   static constexpr int64 kY = 1;
@@ -91,7 +100,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest,
 #endif
 
 XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
-  ComputationBuilder builder(client_, "subtract_in_z_one_sample");
+  XlaBuilder builder("subtract_in_z_one_sample");
   auto x = builder.ConstantLiteral(input_literal_);
   auto y = builder.ConstantR1<float>({3.14, 4.25});
   builder.Sub(x, y, /*broadcast_dimensions=*/{1});
@@ -107,7 +116,7 @@ XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
-  ComputationBuilder builder(client_, "square_tesseract_elementwise");
+  XlaBuilder builder("square_tesseract_elementwise");
   auto x = builder.ConstantLiteral(input_literal_);
   builder.SquareF32(x);
 
@@ -124,9 +133,9 @@ XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SumToZ) {
-  ComputationBuilder builder(client_, "sum_to_z");
+  XlaBuilder builder("sum_to_z");
   auto input_activations = builder.ConstantLiteral(input_literal_);
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all but the Z dimension.
   builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                  {0, 2, 3});
@@ -136,24 +145,23 @@ XLA_TEST_P(BatchNormalizationTest, SumToZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) {
-  ComputationBuilder builder(client_, "square_and_reduce");
+  XlaBuilder builder("square_and_reduce");
   auto input_activations = builder.ConstantLiteral(input_literal_);
   auto set_means = builder.ConstantR1<float>({2.f, 4.2f});
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.Reduce(
-      dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
+  builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
 
   std::vector<float> expected = {18, 0.06};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
-  ComputationBuilder builder(client_, "variance_to_stddev");
+  XlaBuilder builder("variance_to_stddev");
   auto variance = builder.ConstantR1<float>({6.f, .02f});
-  auto sqrt = builder.SqrtF32(variance);
+  builder.SqrtF32(variance);
 
   std::vector<float> expected = {2.44948974f, 0.14142136f};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
@@ -162,23 +170,24 @@ XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
 // Compare against a forward batch normalization example in the NN spec
 // reference.
 XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
-  ComputationBuilder builder(client_, "batch_normalize_per_spec");
+  XlaBuilder builder("batch_normalize_per_spec");
   auto input_activations =
-      builder.CheckShape(builder.ConstantLiteral(input_literal_),
-                         ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
+      CheckShape(&builder, builder.ConstantLiteral(input_literal_),
+                 ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
   auto gamma = builder.ConstantR1<float>({1.0, 1.0});
   auto beta = builder.ConstantR1<float>({0.0, 0.0});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all dimensions except dimension 1.
   Shape TwoElementVectorF32 = ShapeUtil::MakeShape(F32, {2});
-  auto sum = builder.CheckShape(
+  auto sum = CheckShape(
+      &builder,
       builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                      /*dimensions_to_reduce=*/{0, 2, 3}),
       TwoElementVectorF32);
   auto input_shape = builder.GetShape(input_activations).ConsumeValueOrDie();
   auto sum_shape = builder.GetShape(sum).ConsumeValueOrDie();
-  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(*input_shape) /
-                                         ShapeUtil::ElementsIn(*sum_shape));
+  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(input_shape) /
+                                         ShapeUtil::ElementsIn(sum_shape));
   auto set_means = builder.Div(sum, count);
 
   const float kEpsilon = 1e-9f;
@@ -187,14 +196,16 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.CheckShape(
+  auto sum_of_squares = CheckShape(
+      &builder,
       builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add,
                      /*dimensions_to_reduce=*/{0, 2, 3}),
       TwoElementVectorF32);
   auto variance = builder.Div(sum_of_squares, count);
   auto standard_deviation = builder.SqrtF32(variance);
-  auto standard_deviation_above_epsilon = builder.CheckShape(
-      builder.Gt(standard_deviation, epsilon), ShapeUtil::MakeShape(PRED, {2}));
+  auto standard_deviation_above_epsilon =
+      CheckShape(&builder, builder.Gt(standard_deviation, epsilon),
+                 ShapeUtil::MakeShape(PRED, {2}));
   auto gt_eps = builder.Select(standard_deviation_above_epsilon,
                                standard_deviation, epsilon2);
   auto normalization_factors = builder.ReciprocalF32(gt_eps);
@@ -219,7 +230,7 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
   const int kFeatureIndex = 3;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
@@ -228,8 +239,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
@@ -243,7 +254,7 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
@@ -252,8 +263,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
@@ -268,23 +279,23 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
   // Use 0 dimension as feature, tests layout analyzer.
   const int kFeatureIndex = 0;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/1, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/1, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
@@ -300,24 +311,24 @@ XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
 XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
   // Test the correctness of choosing a large epsilon value.
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
   // var = 125, mean = 15, epsilon = -100
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/-100, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/-100, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
@@ -332,7 +343,7 @@ XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
 
 XLA_TEST_P(BatchNormalizationTest, BatchNormGradBasic) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand =
       builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
@@ -439,7 +450,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormTest_Instantiation, BatchNormTestManySizes,
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -539,7 +550,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -647,7 +658,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -814,9 +825,9 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   std::unique_ptr<GlobalData> grad_output_data =
       client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie();
 
-  auto t = builder.BatchNormGrad(input_parameter, scale_parameter,
-                                 mean_parameter, var_parameter,
-                                 grad_output_parameter, epsilon, feature_index);
+  builder.BatchNormGrad(input_parameter, scale_parameter, mean_parameter,
+                        var_parameter, grad_output_parameter, epsilon,
+                        feature_index);
 
   auto expected =
       Literal::MakeTuple({expected_grad_activation.get(),
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index b853dfaa15d7ff2e21048a5a6a486d22c5a05416..4e65cf11f3f1a027e1adc5a89930caba28958fea 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -19,10 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -52,7 +51,7 @@ class Bfloat16Test : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(Bfloat16Test, ScalarOperation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.0f));
   auto y = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(1.0f));
   builder.Add(x, y);
@@ -62,7 +61,7 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) {
 }
 
 XLA_TEST_F(Bfloat16Test, LogOperation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(4.0f));
   builder.Log(x);
 
@@ -71,7 +70,7 @@ XLA_TEST_F(Bfloat16Test, LogOperation) {
 }
 
 XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Neg(builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.1f)));
 
   ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(-2.1f), {},
@@ -80,7 +79,7 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
 
 XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<bfloat16>(
       {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
@@ -117,7 +116,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
 
 XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<bfloat16>(
       Array4D<bfloat16>(2, 2, 2, 1, static_cast<bfloat16>(0.0f)));
diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
index 97fec89b63fb8d3a4264275f3253a91e1ea2ce68..48203b1d40ea69ff00a57c2c9e42620739b23d59 100644
--- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc
+++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -32,7 +32,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_32x4) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 32, 4);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 4);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -48,7 +48,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_129x129) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 129);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 129);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -64,7 +64,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_9x5) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 9, 5);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 9, 1);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -80,7 +80,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 257);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 129, 1);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -93,7 +93,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) {
 }
 
 TEST_F(BinopScalingTest, R0PlusR2F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR0<float>(42.0);
   auto rhs = builder.ConstantR2<float>({
       {1.0, 2.0}, {3.0, 4.0},
@@ -109,7 +109,7 @@ TEST_F(BinopScalingTest, R0PlusR2F32) {
 }
 
 TEST_F(BinopScalingTest, R4PlusR0S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array4D<int> lhs_array({
     {{{1, 2},
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index 0d94d65c1015fb54ada3fdfc95d0c31d0a0f158b..bff60f25ec8f15d372d251ac313200301a04f20f 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -34,7 +34,7 @@ namespace {
 
 class BitcastConvertTest : public ClientLibraryTestBase {
  public:
-  explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr)
+  explicit BitcastConvertTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
@@ -42,7 +42,7 @@ class BitcastConvertTest : public ClientLibraryTestBase {
 };
 
 TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({42, 64});
   builder.BitcastConvertType(a, S32);
 
@@ -51,7 +51,7 @@ TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0f, 64.0f});
   builder.BitcastConvertType(a, F32);
 
@@ -60,7 +60,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
 }
 
 TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR1<int32>({0, static_cast<int32>(0x80000000), 0x3F800000,
                                  static_cast<int32>(0xBF800000), 0x3F000000,
@@ -72,7 +72,7 @@ TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
 }
 
 XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   builder.BitcastConvertType(a, F32);
 
@@ -81,7 +81,7 @@ XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.6, 64.4});
   builder.BitcastConvertType(a, S32);
 
@@ -90,7 +90,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertS32Extremes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(
       {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
   builder.BitcastConvertType(a, F32);
@@ -100,7 +100,7 @@ TEST_F(BitcastConvertTest, ConvertS32Extremes) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
   b->BitcastConvertType(param, S32);
@@ -112,7 +112,7 @@ TEST_F(BitcastConvertTest, ConvertMapToS32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
   b->BitcastConvertType(param, F32);
@@ -129,7 +129,7 @@ TEST_F(BitcastConvertTest, ConvertMapToF32) {
 //   input -> convert -> reshape
 // the new convert should have the same element type as the old convert.
 TEST_F(BitcastConvertTest, ConvertReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR1<int32>({0x42280000});
   auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
   builder.BitcastConvertType(reshape, F32);
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 03f5e08315bfed2bcb43ebb7098aaa0b97228605..34c86e007beea1cbac04641bdbdab62dc567f13e 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -33,10 +33,8 @@ namespace {
 
 class BroadcastSimpleTest : public ClientLibraryTestBase {
  public:
-  ComputationDataHandle BuildBinOp(HloOpcode op,
-                                   const ComputationDataHandle& lhs,
-                                   const ComputationDataHandle& rhs,
-                                   ComputationBuilder* builder) {
+  XlaOp BuildBinOp(HloOpcode op, const XlaOp& lhs, const XlaOp& rhs,
+                   XlaBuilder* builder) {
     switch (op) {
       case HloOpcode::kMinimum: {
         return builder->Min(lhs, rhs);
@@ -105,21 +103,21 @@ class BroadcastSimpleTest : public ClientLibraryTestBase {
 using ::testing::HasSubstr;
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(1.5), {});
   ComputeAndCompareR0<float>(&b, 1.5, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {2, 3});
   Array2D<float> expected(2, 3, 2.25);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle src;
+  XlaBuilder b(TestName());
+  XlaOp src;
   std::unique_ptr<GlobalData> param_data =
       CreateR0Parameter<float>(2.25f, /*parameter_number=*/0, /*name=*/"src",
                                /*builder=*/&b, /*data_handle=*/&src);
@@ -131,21 +129,21 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {2, 0});
   Array2D<float> expected(2, 0);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_0x2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {0, 2});
   Array2D<float> expected(0, 2);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({1, 2, 3}), {2});
 
   Array2D<float> expected(2, 3);
@@ -160,7 +158,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
 
 // Tests implicit broadcasting of PREDs.
 XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   Array2D<bool> x_vals(2, 1);
   x_vals(0, 0) = true;
@@ -171,7 +169,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
   y_vals(1, 0, 0) = true;
   y_vals(1, 1, 0) = true;
 
-  ComputationDataHandle x, y;
+  XlaOp x, y;
   auto x_data = CreateR2Parameter<bool>(x_vals, 0, "x", &b, &x);
   auto y_data = CreateR3Parameter<bool>(y_vals, 1, "y", &b, &y);
   b.And(x, y, /*broadcast_dimensions=*/{1, 2});
@@ -186,7 +184,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({}), {2});
 
   Array2D<float> expected(2, 0);
@@ -194,7 +192,7 @@ XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, 1DToZeroElement2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({1, 2, 3}), {0});
 
   Array2D<float> expected(0, 3);
@@ -209,7 +207,7 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) {
   // broadcasting (broadcast_dimensions {1, 2}), then is added to the rhs shape
   // [2, 3, 1]. Degenerate dimension broadcasting then broadcasts the size one
   // dimensions.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}}),
         b.ConstantLiteral(*Literal::CreateR3<float>(
@@ -247,7 +245,7 @@ class BroadcastR3ImplicitTest
 
 XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
   const R3ImplicitBroadcastSpec& spec = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape r3_shape, r3_implicit_shape;
   Array3D<float> r3_array(spec.output_bounds[0], spec.output_bounds[1],
@@ -264,8 +262,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
 
   auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input");
   auto r3_parameter = builder.Parameter(1, r3_shape, "input");
-  ComputationDataHandle op =
-      BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder);
+  XlaOp op = BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder);
 
   Array3D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1],
                                 spec.output_bounds[2]);
@@ -300,9 +297,9 @@ INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances,
 
 // r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1:
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle r1h;
-  ComputationDataHandle r3h;
+  XlaBuilder b(TestName());
+  XlaOp r1h;
+  XlaOp r3h;
 
   Array3D<float> r1d = {{{1}}, {{2}}};
   Array3D<float> r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}};
@@ -319,7 +316,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -332,7 +329,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -345,7 +342,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}, {3, 4}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -358,7 +355,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -371,7 +368,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 =
       b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
   auto r3 = b.ConstantLiteral(
@@ -385,7 +382,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -491,7 +488,7 @@ class BroadcastR2ImplicitTest
 XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
   const R2ImplicitBroadcastSpec& spec = GetParam();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Operands with degenerate dimensions require implicit broadcasting:
   Shape r2_shape, r2_implicit_shape1, r2_implicit_shape2;
@@ -517,10 +514,9 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
   auto r2_implicit_parameter2 =
       builder.Parameter(2, r2_implicit_shape2, "input2");
 
-  ComputationDataHandle op1 =
+  XlaOp op1 =
       BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder);
-  ComputationDataHandle op2 =
-      BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder);
+  XlaOp op2 = BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder);
 
   Array2D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1]);
 
@@ -547,7 +543,7 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances,
                         ::testing::ValuesIn(kR2ImplicitBroadcastTestCases));
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}}));
   auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
@@ -558,7 +554,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1}, {2}}));
   auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
@@ -569,7 +565,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -582,7 +578,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -595,7 +591,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -608,7 +604,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1_0 = b.ConstantR1<float>({1000, 2000});
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
@@ -629,7 +625,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1_0 = b.ConstantR1<float>({1000, 2000});
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
@@ -652,7 +648,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
 XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
   // Binary dimension broadcasting of the smaller lhs ([2, 2] up to [2, 2, 2])
   // results in a shape incompatible with the lhs [2, 3, 1].
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}, {1.0, 5.0}}),
         b.ConstantLiteral(*Literal::CreateR3<float>(
@@ -662,12 +658,12 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
   EXPECT_THAT(result_status.status().error_message(),
-              HasSubstr("broadcast dimension 0 mismatch"));
+              HasSubstr("dimension 0 mismatch"));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
   // Test invalid broadcasting with [1, 2] and [2, 3] inputs.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 2.0}}),
         b.ConstantR2<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
@@ -675,12 +671,12 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
   EXPECT_THAT(result_status.status().error_message(),
-              HasSubstr("binary op BINOP_ADD with incompatible shapes"));
+              HasSubstr("op BINOP_ADD with incompatible shapes"));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
   // Test invalid broadcasting with [1, 2] and [2, 3] inputs.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 2.0}}),
         b.ConstantR2<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
@@ -688,7 +684,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
   EXPECT_THAT(result_status.status().error_message(),
-              HasSubstr("binary op BINOP_ADD with incompatible shapes"));
+              HasSubstr("op BINOP_ADD with incompatible shapes"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 610302ac1256a57db6ed6e18016a4136973e3891..53f2c3bfbfce9585cb68f103a495ce2f1ad8432e 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
 load("//tensorflow/compiler/xla/tests:plugin.bzl", "plugins")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-all_backends = ["cpu", "cpu_parallel", "gpu"] + plugins.keys()
+all_backends = ["cpu", "gpu"] + plugins.keys()
 
 def filter_backends(backends):
   """Removes "gpu" from a backend list if CUDA is not enabled.
@@ -39,10 +39,10 @@ def xla_test(name,
              **kwargs):
   """Generates cc_test targets for the given XLA backends.
 
-  This rule generates a cc_test target for one or more XLA backends and also
-  a platform-agnostic cc_library rule. The arguments are identical to cc_test
-  with two additions: 'backends' and 'backend_args'. 'backends' specifies the
-  backends to generate tests for ("cpu", "cpu_parallel", "gpu"), and
+  This rule generates a cc_test target for one or more XLA backends and also a
+  platform-agnostic cc_library rule. The arguments are identical to cc_test with
+  two additions: 'backends' and 'backend_args'. 'backends' specifies the
+  backends to generate tests for ("cpu", "gpu"), and
   'backend_args'/'backend_tags' specifies backend-specific args parameters to
   use when generating the cc_test.
 
@@ -90,9 +90,9 @@ def xla_test(name,
     deps: Dependencies of the target.
     xla_test_library_deps: If set, the generated test targets will depend on the
       respective cc_libraries generated by the xla_test_library rule.
-    backends: A list of backends to generate tests for. Supported
-      values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will
-      be generated for all supported backends.
+    backends: A list of backends to generate tests for. Supported values: "cpu",
+      "gpu". If this list is empty, the test will be generated for all supported
+      backends.
     blacklisted_backends: A list of backends to NOT generate tests for.
     args: Test arguments for the target.
     tags: Tags for the target.
@@ -128,16 +128,13 @@ def xla_test(name,
     if backend == "cpu":
       backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"]
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"]
-    elif backend == "cpu_parallel":
-      backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"]
-      backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"]
-      this_backend_args += ["--xla_backend_extra_options=\"xla_cpu_parallel\""]
     elif backend == "gpu":
       backend_deps = ["//tensorflow/compiler/xla/service:gpu_plugin"]
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
       this_backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
-      backend_deps = plugins[backend]["deps"]
+      backend_deps = []
+      backend_deps += plugins[backend]["deps"]
       this_backend_copts += plugins[backend]["copts"]
       this_backend_tags += plugins[backend]["tags"]
       this_backend_args += plugins[backend]["args"]
@@ -200,7 +197,7 @@ def xla_test_library(name,
     hdrs: Headers for the target.
     deps: Dependencies of the target.
     backends: A list of backends to generate libraries for.
-      Supported values: "cpu", "cpu_parallel", "gpu". If this list is empty, the
+      Supported values: "cpu", "gpu". If this list is empty, the
       library will be generated for all supported backends.
   """
 
@@ -209,7 +206,7 @@ def xla_test_library(name,
 
   for backend in filter_backends(backends):
     this_backend_copts = []
-    if backend in ["cpu", "cpu_parallel", "gpu"]:
+    if backend in ["cpu", "gpu"]:
       backend_deps = ["//tensorflow/compiler/xla/tests:test_macros_%s" % backend]
     elif backend in plugins:
       backend_deps = plugins[backend]["deps"]
diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc
index 5e42365ae38dcc770bc2f1c9cb2c088fe02241a3..a43ca3d5ca2ba39ba9c16213e985e50bf39c0b7d 100644
--- a/tensorflow/compiler/xla/tests/call_test.cc
+++ b/tensorflow/compiler/xla/tests/call_test.cc
@@ -17,7 +17,8 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -32,16 +33,16 @@ namespace {
 
 class CallOpTest : public ClientLibraryTestBase {
  protected:
-  Computation CreateR0F32IdentityComputation() {
-    ComputationBuilder builder(client_, "Identity");
+  XlaComputation CreateR0F32IdentityComputation() {
+    XlaBuilder builder("Identity");
     builder.Parameter(0, r0f32_, "x");
     auto build_status = builder.Build();
     EXPECT_IS_OK(build_status.status());
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR1S0F32AdditionComputation() {
-    ComputationBuilder builder(client_, "Addition");
+  XlaComputation CreateR1S0F32AdditionComputation() {
+    XlaBuilder builder("Addition");
     auto x = builder.Parameter(0, r1s0f32_, "x");
     auto y = builder.Parameter(1, r1s0f32_, "y");
     builder.Add(x, y);
@@ -50,8 +51,8 @@ class CallOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR1S2F32AdditionComputation() {
-    ComputationBuilder builder(client_, "Addition");
+  XlaComputation CreateR1S2F32AdditionComputation() {
+    XlaBuilder builder("Addition");
     auto x = builder.Parameter(0, r1s2f32_, "x");
     auto y = builder.Parameter(1, r1s2f32_, "y");
     builder.Add(x, y);
@@ -60,8 +61,8 @@ class CallOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0F32TupleComputation() {
-    ComputationBuilder builder(client_, "Tuple");
+  XlaComputation CreateR0F32TupleComputation() {
+    XlaBuilder builder("Tuple");
     builder.Tuple({builder.Parameter(0, r0f32_, "x")});
     auto build_status = builder.Build();
     EXPECT_IS_OK(build_status.status());
@@ -74,8 +75,8 @@ class CallOpTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(CallOpTest, CallR0F32IdentityScalar) {
-  ComputationBuilder builder(client_, TestName());
-  Computation callee = CreateR0F32IdentityComputation();
+  XlaBuilder builder(TestName());
+  XlaComputation callee = CreateR0F32IdentityComputation();
   auto constant = builder.ConstantLiteral(*Literal::CreateR0<float>(42.0));
   builder.Call(callee, {constant});
 
@@ -83,8 +84,8 @@ XLA_TEST_F(CallOpTest, CallR0F32IdentityScalar) {
 }
 
 XLA_TEST_F(CallOpTest, CallR1S0F32AddArray) {
-  ComputationBuilder builder(client_, TestName());
-  Computation callee = CreateR1S0F32AdditionComputation();
+  XlaBuilder builder(TestName());
+  XlaComputation callee = CreateR1S0F32AdditionComputation();
   auto x = builder.ConstantLiteral(*Literal::CreateR1<float>({}));
   auto y = builder.ConstantLiteral(*Literal::CreateR1<float>({}));
   builder.Call(callee, {x, y});
@@ -93,8 +94,8 @@ XLA_TEST_F(CallOpTest, CallR1S0F32AddArray) {
 }
 
 XLA_TEST_F(CallOpTest, CallR1S2F32AddArray) {
-  ComputationBuilder builder(client_, TestName());
-  Computation callee = CreateR1S2F32AdditionComputation();
+  XlaBuilder builder(TestName());
+  XlaComputation callee = CreateR1S2F32AdditionComputation();
   auto x = builder.ConstantLiteral(*Literal::CreateR1<float>({1.0f, 2.0f}));
   auto y = builder.ConstantLiteral(*Literal::CreateR1<float>({2.0f, 3.0f}));
   builder.Call(callee, {x, y});
@@ -103,23 +104,23 @@ XLA_TEST_F(CallOpTest, CallR1S2F32AddArray) {
 }
 
 XLA_TEST_F(CallOpTest, CallTreeTwoDeepBranchFactorThree) {
-  ComputationBuilder builder(client_, "inner");
+  XlaBuilder builder("inner");
   {
     auto x = builder.Parameter(0, r0f32_, "x");
     builder.Add(x, builder.ConstantR0<float>(1.0));
   }
-  TF_ASSERT_OK_AND_ASSIGN(Computation inner, builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation inner, builder.Build());
 
-  ComputationBuilder builder2(client_, "outer");
+  XlaBuilder builder2("outer");
   {
     auto x = builder2.Parameter(0, r0f32_, "x");
     x = builder2.Call(inner, {x});
     x = builder2.Call(inner, {x});
     x = builder2.Call(inner, {x});
   }
-  TF_ASSERT_OK_AND_ASSIGN(Computation outer, builder2.Build());
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation outer, builder2.Build());
 
-  ComputationBuilder builder3(client_, "outermost");
+  XlaBuilder builder3("outermost");
   {
     auto x = builder3.Parameter(0, r0f32_, "x");
     x = builder3.Call(outer, {x});
@@ -134,8 +135,8 @@ XLA_TEST_F(CallOpTest, CallTreeTwoDeepBranchFactorThree) {
 }
 
 XLA_TEST_F(CallOpTest, CallR0F32Tuple) {
-  ComputationBuilder builder(client_, TestName());
-  Computation callee = CreateR0F32TupleComputation();
+  XlaBuilder builder(TestName());
+  XlaComputation callee = CreateR0F32TupleComputation();
   auto elem = Literal::CreateR0<float>(42.0);
   auto tuple = Literal::MakeTuple({elem.get()});
   builder.Call(callee, {builder.ConstantLiteral(*elem)});
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index f594cc10ac6496f710d03f0b0b134e6dd3b6d38f..660ff0cad5666219a4a7cb1eedbed03f06e651ba 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,7 +35,7 @@ using ::testing::ContainsRegex;
 class CheckExecutionArityTest : public ClientLibraryTestBase {};
 
 TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) {
-  ComputationBuilder builder(client_, "add_two_params");
+  XlaBuilder builder("add_two_params");
   auto param_literal = Literal::CreateR1<float>({1.1f, 2.2f});
 
   auto p0 = builder.Parameter(0, param_literal->shape(), "param0");
@@ -75,7 +75,7 @@ TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) {
 }
 
 XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
-  ComputationBuilder builder(client_, "add_two_params");
+  XlaBuilder builder("add_two_params");
 
   auto p0 = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
   auto p1 = builder.Parameter(1, ShapeUtil::MakeShape(F32, {4}), "param1");
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index a677986cd926cc0054d8f36abc98ccac33dc043d..22660c35dcaa0ebbb553aa2d5e2412043a2bb300 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -31,10 +32,12 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
+
+// Name of the interpreter backend.
+constexpr char kInterpreter[] = "interpreter";
+
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
 Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
@@ -43,14 +46,26 @@ Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
   TF_CHECK_OK(result.status()) << " could not create local client for testing";
   return result.ValueOrDie();
 }
+
+// Helper functions to get the reference platform.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
 }  // namespace
 
 ClientLibraryTestBase::ClientLibraryTestBase(
-    perftools::gputools::Platform* platform,
-    const LocalClientOptions& client_options)
+    se::Platform* platform, const LocalClientOptions& client_options)
     : client_(GetOrCreateLocalClientOrDie(client_options)),
       execution_options_(CreateDefaultExecutionOptions()) {
   CHECK_EQ(platform, client_options.platform());
+
+  LocalClientOptions ref_options;
+  ref_options.set_platform(GetReferencePlatform());
+  ref_client_ = GetOrCreateLocalClientOrDie(ref_options);
+
   // Disabling constant_folding so that tests (usually written using Constants)
   // will exercise the intended code paths, instead of being constant folded.
   //
@@ -66,6 +81,11 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   client_ = GetOrCreateLocalClientOrDie(default_options);
+
+  LocalClientOptions ref_options;
+  ref_options.set_platform(GetReferencePlatform());
+  ref_client_ = GetOrCreateLocalClientOrDie(ref_options);
+
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
 }
@@ -74,9 +94,9 @@ string ClientLibraryTestBase::TestName() const {
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
 
+template <typename BuilderT>
 StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
-    ComputationBuilder* builder,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   // Build the computation, as a convenience.
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
   return client_->Execute(computation, arguments, &execution_options_);
@@ -95,6 +115,20 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
                                      &execution_options);
 }
 
+StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  ExecutionOptions execution_options = execution_options_;
+  if (shape_with_output_layout != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *shape_with_output_layout;
+  }
+  return client_->ExecuteAndTransfer(computation, arguments,
+                                     &execution_options);
+}
+
+template <>
 StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -104,6 +138,30 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
   return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
 }
 
+template <>
+StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  // Build the computation, as a convenience.
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
+}
+
+StatusOr<std::unique_ptr<Literal>>
+ClientLibraryTestBase::ExecuteAndTransferReference(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  ExecutionOptions execution_options = execution_options_;
+  if (shape_with_output_layout != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *shape_with_output_layout;
+  }
+  execution_options.clear_device_handles();
+  return ref_client_->ExecuteAndTransfer(computation, arguments,
+                                         &execution_options);
+}
+
 std::unique_ptr<GlobalData> ClientLibraryTestBase::ExecuteOrDie(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
@@ -116,14 +174,31 @@ std::unique_ptr<Literal> ClientLibraryTestBase::ExecuteAndTransferOrDie(
   return ExecuteAndTransfer(builder, arguments).ConsumeValueOrDie();
 }
 
+string ClientLibraryTestBase::ExecuteToString(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+  auto computation_status = builder->Build();
+  if (!computation_status.ok()) {
+    return computation_status.status().ToString();
+  }
+  auto computation = computation_status.ConsumeValueOrDie();
+
+  auto result =
+      client_->ExecuteAndTransfer(computation, arguments, &execution_options_);
+  if (!result.ok()) {
+    return result.status().ToString();
+  } else {
+    return result.ValueOrDie()->ToString();
+  }
+}
+
 string ClientLibraryTestBase::ExecuteToString(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
-  StatusOr<Computation> computation_status = builder->Build();
+  auto computation_status = builder->Build();
   if (!computation_status.ok()) {
     return computation_status.status().ToString();
   }
-  Computation computation = computation_status.ConsumeValueOrDie();
+  auto computation = computation_status.ConsumeValueOrDie();
 
   auto result =
       client_->ExecuteAndTransfer(computation, arguments, &execution_options_);
@@ -142,16 +217,26 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments);
 }
 
+void ClientLibraryTestBase::ComputeAndCompareR1(
+    XlaBuilder* builder, const tensorflow::core::Bitmap& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+  std::unique_ptr<Literal> expected_literal = Literal::CreateR1(expected);
+  ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
+                                                  arguments);
+}
+
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareLiteral(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
     const Shape* shape_with_layout) {
   EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments,
                                                   shape_with_layout));
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareLiteral(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
     const Shape* shape_with_layout) {
   EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments,
@@ -249,8 +334,28 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
   return choose(0);
 }
 
+tensorflow::Status
+ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
+    const xla::XlaComputation& /*computation*/, const Literal& /*expected*/,
+    tensorflow::gtl::ArraySlice<GlobalData*> /*arguments*/,
+    const std::function<void(const Literal& actual,
+                             const string& error_message)>& /*verify_output*/) {
+  return Unimplemented("not yet implemented for XlaComputation");
+}
+
+tensorflow::Status
+ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
+    const xla::XlaComputation& /*computation*/, const Literal& /*expected*/,
+    tensorflow::gtl::ArraySlice<GlobalData*> /*arguments*/,
+    const std::function<void(const Literal& actual,
+                             const string& error_message)>& /*verify_output*/,
+    const Shape* /*output_with_layout*/) {
+  return Unimplemented("not yet implemented for XlaComputation");
+}
+
+template <typename BuilderT>
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     const Shape* shape_with_layout) {
   std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
@@ -307,8 +412,9 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   return tensorflow::Status::OK();
 }
 
+template <typename BuilderT>
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     ErrorSpec error, const Shape* shape_with_layout) {
   std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
@@ -360,7 +466,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
 }
 
 void ClientLibraryTestBase::ComputeAndCompareR1U8(
-    ComputationBuilder* builder, tensorflow::StringPiece expected,
+    XlaBuilder* builder, tensorflow::StringPiece expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -378,8 +484,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8(
   EXPECT_EQ(expected, actual->GetR1U8AsString());
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareTuple(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -390,8 +497,9 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
   LiteralTestUtil::ExpectEqual(expected, *actual);
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareTuple(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -456,8 +564,71 @@ ClientLibraryTestBase::ComputeValueAndReference(
   return std::make_pair(std::move(reference), std::move(result));
 }
 
-Computation ClientLibraryTestBase::CreateScalarRelu() {
-  ComputationBuilder builder(client_, "relu");
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*reference, *result);
+}
+
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments,
+    ErrorSpec error) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectNear(*reference, *result, error);
+}
+
+StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+ClientLibraryTestBase::ComputeValueAndReference(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  // Transfer the arguments to the executor service. We put the unique_ptr's
+  // into a vector to keep the data alive on the service until the end of this
+  // function.
+  std::vector<std::unique_ptr<GlobalData>> argument_data;
+  std::vector<std::unique_ptr<GlobalData>> ref_argument_data;
+  for (const auto& arg : arguments) {
+    TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone()));
+    TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg));
+    argument_data.push_back(std::move(data));
+    ref_argument_data.push_back(std::move(ref_data));
+  }
+
+  // Create raw pointers to the GlobalData for the rest of the call stack.
+  std::vector<GlobalData*> argument_data_ptr;
+  std::transform(
+      argument_data.begin(), argument_data.end(),
+      std::back_inserter(argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+  std::vector<GlobalData*> ref_argument_data_ptr;
+  std::transform(
+      ref_argument_data.begin(), ref_argument_data.end(),
+      std::back_inserter(ref_argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+
+  TF_ASSIGN_OR_RETURN(auto result,
+                      ExecuteAndTransfer(computation, argument_data_ptr));
+
+  TF_ASSIGN_OR_RETURN(auto reference, ExecuteAndTransferReference(
+                                          computation, ref_argument_data_ptr));
+
+  return std::make_pair(std::move(reference), std::move(result));
+}
+
+XlaComputation ClientLibraryTestBase::CreateScalarRelu() {
+  XlaBuilder builder("relu");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
   auto z_value = builder.Parameter(0, shape, "z_value");
   auto zero = use_bfloat16_
@@ -469,8 +640,8 @@ Computation ClientLibraryTestBase::CreateScalarRelu() {
   return computation_status.ConsumeValueOrDie();
 }
 
-Computation ClientLibraryTestBase::CreateScalarMax() {
-  ComputationBuilder builder(client_, "max");
+XlaComputation ClientLibraryTestBase::CreateScalarMax() {
+  XlaBuilder builder("max");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
   auto x = builder.Parameter(0, shape, "x");
   auto y = builder.Parameter(1, shape, "y");
@@ -522,33 +693,6 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
   return array;
 }
 
-std::unique_ptr<GlobalData>
-ClientLibraryTestBase::CreateParameterAndTransferLiteral(
-    int64 parameter_number, const Literal& literal, const string& name,
-    ComputationBuilder* builder, ComputationDataHandle* data_handle) {
-  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
-                                           nullptr, builder, data_handle);
-}
-
-std::unique_ptr<GlobalData>
-ClientLibraryTestBase::CreateParameterAndTransferLiteral(
-    int64 parameter_number, const Literal& literal, const string& name,
-    const DeviceHandle* device_handle, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
-  const Literal* param_literal = &literal;
-  std::unique_ptr<Literal> converted_literal;
-  if (use_bfloat16_) {
-    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
-    param_literal = converted_literal.get();
-  }
-  std::unique_ptr<GlobalData> data =
-      client_->TransferToServer(*param_literal, device_handle)
-          .ConsumeValueOrDie();
-  *data_handle =
-      builder->Parameter(parameter_number, param_literal->shape(), name);
-  return data;
-}
-
 ComputationDataHandle ClientLibraryTestBase::AddParam(
     const Literal& argument, ComputationBuilder* builder) {
   ComputationDataHandle data_handle;
@@ -557,10 +701,67 @@ ComputationDataHandle ClientLibraryTestBase::AddParam(
   return data_handle;
 }
 
+XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
+                                      XlaBuilder* builder) {
+  XlaOp data_handle;
+  arguments_.push_back(CreateParameterAndTransferLiteral(
+      arguments_.size(), argument, "", builder, &data_handle));
+  return data_handle;
+}
+
 ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
     const Literal& literal, ComputationBuilder* builder) {
   return builder->ConstantLiteral(
       use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
 }
 
+XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
+                                                       XlaBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    ComputationBuilder* builder,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index ba0319990bc04196386e6812b0a03671676698ec..32eea7c2f3a65d2b4a83435ec6258ea9cf6aaf6a 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -63,11 +64,10 @@ std::vector<TestCase> ExpandUseBfloat16(
 // A client library test establishes an in-process XLA client connection.
 class ClientLibraryTestBase : public ::testing::Test {
  protected:
-  explicit ClientLibraryTestBase(
-      perftools::gputools::Platform* platform = nullptr);
+  explicit ClientLibraryTestBase(se::Platform* platform = nullptr);
 
   // Creates a new ClientLibraryTestBase with custom client options.
-  ClientLibraryTestBase(perftools::gputools::Platform* platform,
+  ClientLibraryTestBase(se::Platform* platform,
                         const LocalClientOptions& client_options);
 
   // Returns the name of the test currently being run.
@@ -91,18 +91,36 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Convenience methods for building and running a computation with the member
   // execution options. Modify execution_options_ in your test if you want to
   // customize the options.
+  template <typename BuilderT>
   StatusOr<std::unique_ptr<GlobalData>> Execute(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+      BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+  // TODO(b/74197823): Remove the template type 'BuilderT' in all methods once
+  // the migration to XlaBuilder is complete.
+
+  template <typename BuilderT>
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
+
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
 
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
+
+  // This executes the computation via the reference client (which connects a
+  // interpreter backend). The result is used as the expected values of the
+  // computation.
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransferReference(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
+
   // Convenience OrDie variants of above methods.
   std::unique_ptr<GlobalData> ExecuteOrDie(
       ComputationBuilder* builder,
@@ -113,29 +131,31 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   // Run a computation and return its value as a string. If an error
   // occurs, then instead return the error as a string.
+  string ExecuteToString(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<GlobalData*> arguments);
   string ExecuteToString(ComputationBuilder* builder,
                          tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // Convenience methods for building and running a computation, transferring
   // the result, and comparing it to the expected value(s). Methods are
   // templated on the native host type which maps to specific XLA types (See
-  // ComputationBuilder for details). For each rank, two forms are provided: one
-  // for floating point types with an ErrorSpec parameter, and one for integral
-  // types without the ErrorSpec parameter.
-  template <typename NativeT>
-  void ComputeAndCompareR0(ComputationBuilder* builder, NativeT expected,
+  // ComputationBuilder/XlaBuilder for details). For each rank, two forms are
+  // provided: one for floating point types with an ErrorSpec parameter, and one
+  // for integral types without the ErrorSpec parameter.
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR0(BuilderT* builder, NativeT expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR0(ComputationBuilder* builder, NativeT expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR0(BuilderT* builder, NativeT expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
-  template <typename NativeT>
-  void ComputeAndCompareR1(ComputationBuilder* builder,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR1(BuilderT* builder,
                            tensorflow::gtl::ArraySlice<NativeT> expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR1(ComputationBuilder* builder,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR1(BuilderT* builder,
                            tensorflow::gtl::ArraySlice<NativeT> expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
@@ -145,72 +165,75 @@ class ClientLibraryTestBase : public ::testing::Test {
   void ComputeAndCompareR1(ComputationBuilder* builder,
                            const tensorflow::core::Bitmap& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  void ComputeAndCompareR1(XlaBuilder* builder,
+                           const tensorflow::core::Bitmap& expected,
+                           tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
-  template <typename NativeT>
-  void ComputeAndCompareR2(ComputationBuilder* builder,
-                           const Array2D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR2(BuilderT* builder, const Array2D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR2(ComputationBuilder* builder,
-                           const Array2D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR2(BuilderT* builder, const Array2D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
-  template <typename NativeT>
-  void ComputeAndCompareR3(ComputationBuilder* builder,
-                           const Array3D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR3(BuilderT* builder, const Array3D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR3(ComputationBuilder* builder,
-                           const Array3D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR3(BuilderT* builder, const Array3D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
-  template <typename NativeT>
-  void ComputeAndCompareR4(ComputationBuilder* builder,
-                           const Array4D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR4(BuilderT* builder, const Array4D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR4(ComputationBuilder* builder,
-                           const Array4D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR4(BuilderT* builder, const Array4D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
   // Build and run the computation and compare the result with the given
   // literal. shape_with_layout indicates the result layout to request when
   // calling Execute.
+  template <typename BuilderT>
   void ComputeAndCompareLiteral(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_layout = nullptr);
+  template <typename BuilderT>
   void ComputeAndCompareLiteral(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
       const Shape* shape_with_layout = nullptr);
 
   // ComputeAndCompare variant which returns an error status.
+  template <typename BuilderT>
   tensorflow::Status ComputeAndCompareLiteralWithStatus(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_layout = nullptr);
+  template <typename BuilderT>
   tensorflow::Status ComputeAndCompareLiteralWithStatus(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
       const Shape* shape_with_layout = nullptr);
 
   // Compare the result of the computation to a strings. In XLA strings are
   // represented using rank-1 U8 shapes.
   void ComputeAndCompareR1U8(
-      ComputationBuilder* builder, tensorflow::StringPiece expected,
+      XlaBuilder* builder, tensorflow::StringPiece expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // Convenience method for running a built computation, transferring the
   // result, and comparing it to the expected tuple literal.
+  template <typename BuilderT>
   void ComputeAndCompareTuple(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  template <typename BuilderT>
   void ComputeAndCompareTuple(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
   // Convenience method for running a built computation and comparing the result
@@ -223,9 +246,17 @@ class ClientLibraryTestBase : public ::testing::Test {
                          tensorflow::gtl::ArraySlice<Literal> arguments,
                          ErrorSpec error);
 
+  // Convenience method for running a built computation and comparing the result
+  // with the reference result.
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments);
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments,
+                         ErrorSpec error);
+
   // Create scalar operations for use in reductions.
-  Computation CreateScalarRelu();
-  Computation CreateScalarMax();
+  XlaComputation CreateScalarRelu();
+  XlaComputation CreateScalarMax();
   Computation CreateScalarReluSensitivity();
 
   // Special case convenience functions for creating filled arrays.
@@ -266,17 +297,19 @@ class ClientLibraryTestBase : public ::testing::Test {
   // server, then stores into "data_handle" the global handle for that
   // parameter. When the use_bfloat16 flag is set but the literal has F32
   // elements, the literal will be converted to BF16 before being transferred.
+  template <typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
       int64 parameter_number, const Literal& literal, const string& name,
-      ComputationBuilder* builder, ComputationDataHandle* data_handle);
+      BuilderT* builder, HandleT* data_handle);
 
   // As above, but the caller can specify the device that the literal is
   // transferred to. If device_handle is nullptr, the literal will be
   // transferred to the default device.
+  template <typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
       int64 parameter_number, const Literal& literal, const string& name,
-      const DeviceHandle* device_handle, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const DeviceHandle* device_handle, BuilderT* builder,
+      HandleT* data_handle);
 
   // Creates a parameter instruction and sets the value that will be passed to
   // the computation as specified. This function must be used for all parameters
@@ -285,18 +318,24 @@ class ClientLibraryTestBase : public ::testing::Test {
   // set exactly once. The first added parameter gets index 0, then 1 and so on.
   ComputationDataHandle AddParam(const Literal& argument,
                                  ComputationBuilder* builder);
+  XlaOp AddParam(const Literal& argument, XlaBuilder* builder);
 
   template <class T>
   ComputationDataHandle AddParam(const Array<T>& argument,
                                  ComputationBuilder* builder) {
     return AddParam(*Literal::CreateFromArray(argument), builder);
   }
+  template <class T>
+  XlaOp AddParam(const Array<T>& argument, XlaBuilder* builder) {
+    return AddParam(*Literal::CreateFromArray(argument), builder);
+  }
 
   // Creates a constant instruction with the given literal. When the
   // use_bfloat16 flag is set but the literal has F32 elements, the elements
   // will be converted to BF16s.
   ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
                                                   ComputationBuilder* builder);
+  XlaOp CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder);
 
   // Creates a constant instruction with the given array. When the use_bfloat16
   // flag is set but the array has float elements, the elements will be
@@ -307,6 +346,12 @@ class ClientLibraryTestBase : public ::testing::Test {
     return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromArray(const Array<NativeT>& array,
+                                XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
   // Same as CreateConstantFromArray, but for scalars.
   template <typename NativeT>
   ComputationDataHandle CreateConstantFromScalar(NativeT value,
@@ -315,6 +360,12 @@ class ClientLibraryTestBase : public ::testing::Test {
                                      builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromScalar(NativeT value, XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
   // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
@@ -323,10 +374,12 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
-  std::unique_ptr<GlobalData> CreateR0Parameter(
-      NativeT value, int64 parameter_number, const string& name,
-      ComputationBuilder* builder, ComputationDataHandle* data_handle);
+  template <typename NativeT, typename BuilderT, typename HandleT>
+  std::unique_ptr<GlobalData> CreateR0Parameter(NativeT value,
+                                                int64 parameter_number,
+                                                const string& name,
+                                                BuilderT* builder,
+                                                HandleT* data_handle);
 
   // Creates a parameter instruction that wraps the given values and then stores
   // into "data_handle" the global handle for that parameter.
@@ -336,11 +389,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
+  template <typename NativeT, typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateR1Parameter(
       tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
-      const string& name, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const string& name, BuilderT* builder, HandleT* data_handle);
 
   // Creates a parameter instruction that wraps the given constant array
   // "array_2d" and then stores to "data_handle" the global handle for that
@@ -351,11 +403,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
+  template <typename NativeT, typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateR2Parameter(
       const Array2D<NativeT>& array_2d, int64 parameter_number,
-      const string& name, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const string& name, BuilderT* builder, HandleT* data_handle);
 
   // Creates a parameter instruction that wraps the given constant array
   // "array_3d" and then stores to "data_handle" the global handle for that
@@ -366,11 +417,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
+  template <typename NativeT, typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateR3Parameter(
       const Array3D<NativeT>& array_3d, int64 parameter_number,
-      const string& name, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const string& name, BuilderT* builder, HandleT* data_handle);
 
   // Getter and setter for the use_bfloat16 flag, which indicates whether to run
   // tests with all float-type input/output converted to bfloat16.
@@ -381,6 +431,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
 
   Client* client_;
+  Client* ref_client_;  // To compute reference result.
   ExecutionOptions execution_options_;
 
  private:
@@ -399,13 +450,32 @@ class ClientLibraryTestBase : public ::testing::Test {
                                const string& error_message)>& verify_output,
       const Shape* output_with_layout = nullptr);
 
+  tensorflow::Status ComputeAndCompareLiteralWithAllOutputLayouts(
+      const xla::XlaComputation& computation, const Literal& expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const std::function<void(const Literal& actual,
+                               const string& error_message)>& verify_output);
+  tensorflow::Status ComputeAndCompareLiteralWithAllInputLayouts(
+      const xla::XlaComputation& computation, const Literal& expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const std::function<void(const Literal& actual,
+                               const string& error_message)>& verify_output,
+      const Shape* output_with_layout = nullptr);
+
   // Executes the computation and calculates the expected reference value using
-  // the HloEvaluator. Returns two literal in the order of (expected, actual).
+  // the HloEvaluator. Returns two literals in the order of (expected, actual).
   StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
   ComputeValueAndReference(ComputationBuilder* builder,
                            const ComputationDataHandle& operand,
                            tensorflow::gtl::ArraySlice<Literal> arguments);
 
+  // Executes the computation and calculates the expected reference value using
+  // the reference client. Returns two literals in the order of (expected,
+  // actual).
+  StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+  ComputeValueAndReference(XlaBuilder* builder,
+                           tensorflow::gtl::ArraySlice<Literal> arguments);
+
   // Whether to run tests with all float-type input/output converted to
   // bfloat16.
   bool use_bfloat16_ = false;
@@ -414,9 +484,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   std::vector<std::unique_ptr<GlobalData>> arguments_;
 };
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR0(
-    ComputationBuilder* builder, NativeT expected,
+    BuilderT* builder, NativeT expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR0<NativeT>(expected);
@@ -424,9 +494,9 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR0(
-    ComputationBuilder* builder, NativeT expected,
+    BuilderT* builder, NativeT expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -440,9 +510,9 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR1(
-    ComputationBuilder* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
+    BuilderT* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR1<NativeT>(expected);
@@ -450,9 +520,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR1(
-    ComputationBuilder* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
+    BuilderT* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -466,9 +536,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR2(
-    ComputationBuilder* builder, const Array2D<NativeT>& expected,
+    BuilderT* builder, const Array2D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR2FromArray2D<NativeT>(expected);
@@ -476,9 +546,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR2(
-    ComputationBuilder* builder, const Array2D<NativeT>& expected,
+    BuilderT* builder, const Array2D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -492,9 +562,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR3(
-    ComputationBuilder* builder, const Array3D<NativeT>& expected,
+    BuilderT* builder, const Array3D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR3FromArray3D<NativeT>(expected);
@@ -502,9 +572,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR3(
-    ComputationBuilder* builder, const Array3D<NativeT>& expected,
+    BuilderT* builder, const Array3D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -518,9 +588,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR4(
-    ComputationBuilder* builder, const Array4D<NativeT>& expected,
+    BuilderT* builder, const Array4D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR4FromArray4D<NativeT>(expected);
@@ -528,9 +598,9 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR4(
-    ComputationBuilder* builder, const Array4D<NativeT>& expected,
+    BuilderT* builder, const Array4D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -544,10 +614,10 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
     NativeT value, int64 parameter_number, const string& name,
-    ComputationBuilder* builder, ComputationDataHandle* data_handle) {
+    BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR0(value);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -558,11 +628,10 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
   return data;
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
     tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
-    const string& name, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
+    const string& name, BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR1(values);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -573,11 +642,10 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
   return data;
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
     const Array2D<NativeT>& array_2d, int64 parameter_number,
-    const string& name, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
+    const string& name, BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR2FromArray2D(array_2d);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -588,11 +656,10 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
   return data;
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR3Parameter(
     const Array3D<NativeT>& array_3d, int64 parameter_number,
-    const string& name, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
+    const string& name, BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(array_3d);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -628,6 +695,37 @@ std::unique_ptr<Array2D<NativeT>> ClientLibraryTestBase::CreatePseudorandomR2(
   return result;
 }
 
+template <typename BuilderT, typename HandleT>
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number,
+                                                         const Literal& literal,
+                                                         const string& name,
+                                                         BuilderT* builder,
+                                                         HandleT* data_handle) {
+  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
+                                           nullptr, builder, data_handle);
+}
+
+template <typename BuilderT, typename HandleT>
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(
+    int64 parameter_number, const Literal& literal, const string& name,
+    const DeviceHandle* device_handle, BuilderT* builder,
+    HandleT* data_handle) {
+  const Literal* param_literal = &literal;
+  std::unique_ptr<Literal> converted_literal;
+  if (use_bfloat16_) {
+    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
+    param_literal = converted_literal.get();
+  }
+  std::unique_ptr<GlobalData> data =
+      client_->TransferToServer(*param_literal, device_handle)
+          .ConsumeValueOrDie();
+  *data_handle =
+      builder->Parameter(parameter_number, param_literal->shape(), name);
+  return data;
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_CLIENT_LIBRARY_TEST_BASE_H_
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 045148cdd11da94ae4789a753efca95c6aaa1f27..0b425b93bb144e395baef2bcf074fd6e7991630b 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -16,9 +16,10 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -37,7 +38,7 @@ namespace {
 class ClientTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(ClientTest, ExecuteWithLayout) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
   for (const std::vector<int64>& execute_layout : layouts) {
@@ -69,7 +70,7 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) {
 }
 
 XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Tuple({b.ConstantR2<int32>({{1, 2}, {3, 4}}),
            b.ConstantR2<int32>({{10, 20}, {30, 40}})});
@@ -107,16 +108,15 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
-XLA_TEST_F(ClientTest,
-        DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
-  Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
+XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) {
+  XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GlobalData> const_arg,
       client_->TransferToServer(*Literal::CreateR2<int32>({{5, 6}, {7, 8}})));
 
-  ComputationBuilder b(client_, TestName() + ".add");
+  XlaBuilder b(TestName() + ".add");
   b.Add(b.Parameter(0, shape, "param_0"),
         b.ConstantR2<int32>({{1, 2}, {3, 4}}));
   TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build());
@@ -124,14 +124,14 @@ XLA_TEST_F(ClientTest,
   // We can't really test parallel execution on CPU since all of the cores in a
   // CPU are presented as a single device.  So for now we test "parallel"
   // execution on a single device.
-  std::vector<Client::ComputationInstance> computation_instances;
+  std::vector<Client::XlaComputationInstance> computation_instances;
   TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
                           client_->GetDeviceHandles(1));
   ASSERT_EQ(devices.size(), 1);
 
   ExecutionOptions options = execution_options_;
   *options.add_device_handles() = devices[0];
-  computation_instances.push_back(Client::ComputationInstance(
+  computation_instances.push_back(Client::XlaComputationInstance(
       add_with_one_arg, {const_arg.get()}, options, nullptr));
 
   TF_ASSERT_OK_AND_ASSIGN(auto results,
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index 0f780fa87ef98fd5c48726ef83fa8efc1e90fbf7..ecce599a8a3bd588c11d6bb9ba461b5a917197db 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -18,9 +18,10 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -39,7 +40,7 @@ namespace {
 class CompilationCacheTest : public ClientLibraryTestBase {
  public:
   void ExecuteComputationR0F32(
-      const Computation& computation,
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, float expected_result,
       bool expect_cache_hit) {
     ExecutionProfile execution_profile;
@@ -55,7 +56,7 @@ class CompilationCacheTest : public ClientLibraryTestBase {
   }
 
   void ExecuteComputationR2F32(
-      const Computation& computation,
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       std::initializer_list<std::initializer_list<float>> expected_result,
       bool expect_cache_hit) {
@@ -74,17 +75,20 @@ class CompilationCacheTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.0001};
 };
 
-XLA_TEST_F(CompilationCacheTest, ComputationCalledMultipleTimes) {
-  ComputationBuilder builder(client_, TestName());
+// TODO(b/74197823): Disabled because there is no cache in the new design.
+XLA_TEST_F(CompilationCacheTest, DISABLED_ComputationCalledMultipleTimes) {
+  XlaBuilder builder(TestName());
   builder.Neg(builder.ConstantR0<float>(42.0));
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/false);
   ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true);
   ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true);
 }
 
-XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) {
+// TODO(b/74197823): Disabled because there is no cache in the new design.
+XLA_TEST_F(CompilationCacheTest,
+           DISABLED_ComputationCalledWithDifferentParameters) {
   std::unique_ptr<GlobalData> data_42 =
       client_->TransferToServer(*Literal::CreateR0<float>(42.0f))
           .ConsumeValueOrDie();
@@ -95,9 +99,9 @@ XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) {
       client_->TransferToServer(*Literal::CreateR0<float>(456.0f))
           .ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Neg(builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecuteComputationR0F32(computation, {data_42.get()}, -42.0,
                           /*expect_cache_hit=*/false);
@@ -109,19 +113,20 @@ XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) {
                           /*expect_cache_hit=*/true);
 }
 
-XLA_TEST_F(CompilationCacheTest, MultipleComputations) {
-  ComputationBuilder builder_neg(client_, TestName() + "_neg");
+// TODO(b/74197823): Disabled because there is no cache in the new design.
+XLA_TEST_F(CompilationCacheTest, DISABLED_MultipleComputations) {
+  XlaBuilder builder_neg(TestName() + "_neg");
   builder_neg.Neg(builder_neg.ConstantR0<float>(42.0));
-  Computation computation_neg = builder_neg.Build().ConsumeValueOrDie();
+  XlaComputation computation_neg = builder_neg.Build().ConsumeValueOrDie();
 
-  ComputationBuilder builder_exp(client_, TestName() + "_exp");
+  XlaBuilder builder_exp(TestName() + "_exp");
   builder_exp.Exp(builder_exp.ConstantR0<float>(1.0));
-  Computation computation_exp = builder_exp.Build().ConsumeValueOrDie();
+  XlaComputation computation_exp = builder_exp.Build().ConsumeValueOrDie();
 
-  ComputationBuilder builder_add(client_, TestName() + "_add");
+  XlaBuilder builder_add(TestName() + "_add");
   builder_add.Add(builder_add.ConstantR0<float>(2.0),
                   builder_add.ConstantR0<float>(3.0));
-  Computation computation_add = builder_add.Build().ConsumeValueOrDie();
+  XlaComputation computation_add = builder_add.Build().ConsumeValueOrDie();
 
   ExecuteComputationR0F32(computation_neg, {}, -42.0,
                           /*expect_cache_hit=*/false);
@@ -133,7 +138,8 @@ XLA_TEST_F(CompilationCacheTest, MultipleComputations) {
                           /*expect_cache_hit=*/true);
 }
 
-XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) {
+// TODO(b/74197823): Disabled because there is no cache in the new design.
+XLA_TEST_F(CompilationCacheTest, DISABLED_DifferentParameterLayouts) {
   // Create two GlobalData arrays with the same shape but different
   // layouts. Use these arrays as parameters to a simple computation. If the
   // layout of the array changes then computation should be recompiled (cache
@@ -148,9 +154,9 @@ XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) {
   auto colmaj_handle =
       client_->TransferToServer(*colmaj_array).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "param0");
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecuteComputationR2F32(computation, {colmaj_handle.get()},
                           {{1.0f, 2.0f}, {3.0f, 4.0f}},
@@ -169,32 +175,5 @@ XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) {
                           /*expect_cache_hit=*/true);
 }
 
-XLA_TEST_F(CompilationCacheTest, MutatedComputation) {
-  // Build a computation, execute it, then mutate it. The mutated computation
-  // should not be in the cache until it is run once. This must be done through
-  // the stub interface because Computations built from ComputationBuilder are
-  // immutable.
-  ComputationBuilder builder(client_, TestName());
-  auto neg = builder.Neg(builder.ConstantR0<float>(42.0));
-  Computation computation = builder.Build().ConsumeValueOrDie();
-
-  ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true);
-
-  BinaryOpRequest request;
-  request.set_binop(BINOP_ADD);
-  *request.mutable_lhs() = neg;
-  *request.mutable_rhs() = neg;
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation.handle();
-  *op_request.mutable_binary_op_request() = request;
-  OpResponse response;
-  tensorflow::Status s = client_->stub()->Op(&op_request, &response);
-  ASSERT_TRUE(s.ok());
-
-  ExecuteComputationR0F32(computation, {}, -84.0, /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation, {}, -84.0, /*expect_cache_hit=*/true);
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index ec2c580670cfac14ba42e8c9a836c86551af4b89..bf4b8fb0bcf229b4e8649b3920dcba1ae0579831 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -31,6 +31,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -43,16 +45,14 @@ ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly};
 
 class ComputeConstantTest : public ::testing::Test {
  public:
-  explicit ComputeConstantTest(
-      perftools::gputools::Platform* platform = nullptr)
+  explicit ComputeConstantTest(se::Platform* platform = nullptr)
       : platform_(platform) {}
 
   string TestName() const {
     return ::testing::UnitTest::GetInstance()->current_test_info()->name();
   }
 
-  Client* ClientOrDie(::perftools::gputools::Platform* platform,
-                      ClientType client_type) {
+  Client* ClientOrDie(se::Platform* platform, ClientType client_type) {
     if (client_type == ClientType::kLocal) {
       StatusOr<Client*> result =
           ClientLibrary::GetOrCreateLocalClient(platform);
@@ -70,39 +70,35 @@ class ComputeConstantTest : public ::testing::Test {
   }
 
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
-      Client* client, const ComputationDataHandle& operand,
-      ComputationBuilder* builder, Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(auto computed, builder->ComputeConstant(
-                                           operand, output_layout, parameters));
+      Client* client, const XlaOp& operand, XlaBuilder* builder,
+      Layout* output_layout = nullptr) {
+    TF_ASSIGN_OR_RETURN(auto subgraph, builder->BuildConstantSubGraph(operand));
+    TF_ASSIGN_OR_RETURN(auto computed,
+                        client->ComputeConstant(subgraph, output_layout));
     return std::move(computed);
   }
 
   template <class Scalar>
-  StatusOr<Scalar> ComputeConstantScalar(
-      Client* client, const ComputationDataHandle& operand,
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(
-        auto literal,
-        ComputeConstantLiteral(client, operand, builder, nullptr, parameters));
+  StatusOr<Scalar> ComputeConstantScalar(Client* client, const XlaOp& operand,
+                                         XlaBuilder* builder) {
+    TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(client, operand,
+                                                             builder, nullptr));
     return literal->Get<Scalar>({});
   }
 
-  bool IsConstant(const ComputationDataHandle& operand,
-                  ComputationBuilder* builder, int64 num_parameters = 0) {
-    StatusOr<bool> result = builder->IsConstant(operand, num_parameters);
+  bool IsConstant(const XlaOp& operand, XlaBuilder* builder) {
+    StatusOr<bool> result = builder->IsConstant(operand);
     EXPECT_TRUE(result.ok()) << result.status();
     return result.ok() ? result.ValueOrDie() : false;
   }
 
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
 };
 
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.ConstantR0<int32>(42);
     EXPECT_TRUE(IsConstant(computation, &b));
 
@@ -115,7 +111,7 @@ TEST_F(ComputeConstantTest, ScalarInt32Literal) {
 TEST_F(ComputeConstantTest, ScalarFloatAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
     EXPECT_TRUE(IsConstant(computation, &b));
@@ -129,7 +125,7 @@ TEST_F(ComputeConstantTest, ScalarFloatAdd) {
 TEST_F(ComputeConstantTest, ScalarRng) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
                      ShapeUtil::MakeShape(F32, {}));
@@ -141,34 +137,16 @@ TEST_F(ComputeConstantTest, ScalarRng) {
   }
 }
 
-TEST_F(ComputeConstantTest, Param) {
-  for (ClientType client_type : client_types) {
-    Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
-    auto param = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "lhs");
-    auto computation = b.Add(param, b.ConstantR0<float>(1.5f));
-
-    std::vector<Literal> arguments;
-    arguments.push_back(std::move(*Literal::CreateR0(42.5f)));
-    EXPECT_TRUE(IsConstant(computation, &b, arguments.size()));
-
-    auto value =
-        ComputeConstantScalar<float>(client, computation, &b, arguments);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 44.0f);
-  }
-}
-
 TEST_F(ComputeConstantTest, DirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on a parameter"))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
+                                                  "depends on a parameter"))
         << value.status();
   }
 }
@@ -176,15 +154,15 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
 TEST_F(ComputeConstantTest, IndirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(1.0f),
               b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on a parameter"))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
+                                                  "depends on a parameter"))
         << value.status();
   }
 }
@@ -194,7 +172,7 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
 TEST_F(ComputeConstantTest, UnrelatedParam) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
     auto constant_4 =
@@ -211,64 +189,64 @@ TEST_F(ComputeConstantTest, UnrelatedParam) {
 
     EXPECT_TRUE(IsConstant(constant_13, &b));
 
-    auto value = ComputeConstantScalar<float>(client, constant_13, &b);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 13.0f);
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value, ComputeConstantScalar<float>(client, constant_13, &b));
+    EXPECT_EQ(value, 13.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, NonScalarAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto computation =
         b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal =
         Literal::CreateR1<int32>({4, 6});
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 TEST_F(ComputeConstantTest, IntegerDivide) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal = Literal::CreateR0<int32>(5);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 XLA_TEST_F(ComputeConstantTest, Layout) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
     for (const std::vector<int64>& layout : layouts) {
       auto layout_proto = LayoutUtil::MakeLayout(layout);
-      auto computed = ComputeConstantLiteral(
-          client,
-          b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
-                b.ConstantR2<int32>({{10, 20}, {30, 40}})),
-          &b, &layout_proto);
-      ASSERT_TRUE(computed.ok()) << computed.status();
+      TF_ASSERT_OK_AND_ASSIGN(
+          auto computed, ComputeConstantLiteral(
+                             client,
+                             b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
+                                   b.ConstantR2<int32>({{10, 20}, {30, 40}})),
+                             &b, &layout_proto));
 
       std::unique_ptr<Literal> expected_literal =
           Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
                                              LayoutUtil::MakeLayout(layout));
-      LiteralTestUtil::AssertEqualShapesAndLayouts(
-          expected_literal->shape(), computed.ValueOrDie()->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
+                                                   computed->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
     }
   }
 }
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 1bcad5a3f37a37c9d482f3a5a899ac527666cca3..a4c8a83eb15f7cc279b6c8f1bf1394c0afb9f7cf 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -38,9 +38,9 @@ using ::testing::HasSubstr;
 
 // Concatenate expects at least one argument.
 XLA_TEST_F(ConcatTest, Concat_Nothing) {
-  ComputationBuilder builder(client_, TestName());
-  auto concatenated = builder.ConcatInDim({}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  XlaBuilder builder(TestName());
+  builder.ConcatInDim({}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("Concatenate expects at least one argument"));
@@ -48,18 +48,18 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) {
 
 // Concatenate with one argument works.
 XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -68,51 +68,51 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
 // Show that we can't concatenate R0 with R0 because we can't name the dimension
 // to concatenate on.
 XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<float>(42.0);
   auto b = builder.ConstantR0<float>(64.0);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({a, b}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
-              HasSubstr("dimension to concatenate along out of bounds: 0"));
+              HasSubstr("out of bounds: 0"));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -129,20 +129,20 @@ XLA_TEST_F(ConcatTest, Concat_R1_L253_With_R1_L7) {
     expected[253 + i] = rhs[i] = 253 + i + 1;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>(lhs);
   auto b = builder.ConstantR1<float>(rhs);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
   for (int dim : {0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto a = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
     auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
-    auto concatenated = builder.ConcatInDim({a, b}, dim);
+    builder.ConcatInDim({a, b}, dim);
 
     ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {},
                                ErrorSpec(0.0001));
@@ -150,26 +150,27 @@ XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
-      {0}, {64},
+      {0},
+      {64},
   });
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 64},
@@ -178,22 +179,22 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
 }
 
 XLA_TEST_F(ConcatTest, Concat2x0With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(Array2D<float>(2, 0));
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   ComputeAndCompareR2<float>(&builder, *b_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(2, 3);
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 1, 2, 64, 65, 66, 67, 68},
@@ -203,22 +204,22 @@ XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 2));
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR2<float>(&builder, *a_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto b_array = CreatePatternedMatrix(5, 2, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
       {0, 1},
@@ -234,16 +235,16 @@ XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x0x2_3x0x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 2));
   auto b = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 1));
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
   ComputeAndCompareR3<float>(&builder, Array3D<float>(3, 0, 3), {},
                              ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -258,27 +259,29 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
   });
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 6}}, {{2, 3, 7}}, {{4, 5, 8}},
+      {{0, 1, 6}},
+      {{2, 3, 7}},
+      {{4, 5, 8}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_1x1_1x1_1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b, c}, 0);
+  builder.ConcatInDim({a, b, c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -300,35 +303,35 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
   auto c = builder.ConstantR3FromArray3D(c_array);
-  auto concatenated = builder.ConcatInDim({a, b, c}, 2);
+  builder.ConcatInDim({a, b, c}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 2, 3}}, {{4, 5, 6, 7}}, {{8, 9, 10, 11}},
+      {{0, 1, 2, 3}},
+      {{4, 5, 6, 7}},
+      {{8, 9, 10, 11}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatLeftAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = (a concat b) concat c
-  auto concatenated =
-      builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
+  builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatRightAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = a concat (b concat c)
-  auto concatenated =
-      builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
+  builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -342,7 +345,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim0) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 0);
@@ -363,7 +366,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim1) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -388,7 +391,7 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -404,13 +407,13 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
 
 // Show that we can't concatenate with an opaques.
 XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto opaque_shape = ShapeUtil::MakeOpaqueShape();
   auto r1f32 = xla::ShapeUtil::MakeShape(xla::F32, {1});
   auto x = builder.Parameter(0, r1f32, "x");
   auto y = builder.Parameter(1, opaque_shape, "y");
-  auto concatenated = builder.ConcatInDim({x, y}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({x, y}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -418,23 +421,23 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto p0 = builder.ConstantR1<bool>({true});
   auto p1 = builder.ConstantR1<bool>({false});
   auto p2 = builder.ConstantR1<bool>({true});
-  auto concatenated = builder.ConcatInDim({p0, p1, p2}, 0);
+  builder.ConcatInDim({p0, p1, p2}, 0);
 
   bool expected[] = {true, false, true};
   ComputeAndCompareR1<bool>(&builder, expected, {});
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR1<int32>({1});
   auto a1 = builder.ConstantR1<int32>({2, 3});
   auto a2 = builder.ConstantR1<int32>({4, 5, 6});
   auto a3 = builder.ConstantR1<int32>({7, 8, 9, 10});
-  auto concatenated = builder.ConcatInDim({a0, a1, a2, a3}, 0);
+  builder.ConcatInDim({a0, a1, a2, a3}, 0);
 
   std::vector<int32> expected(10);
   std::iota(expected.begin(), expected.end(), 1);
@@ -442,7 +445,7 @@ XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array3D<float> arr0(9, 17, 1);
   arr0.Fill(1);
@@ -462,14 +465,14 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
     }
   }
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto p0 = CreateR3Parameter<float>(arr0, /*parameter_number=*/0, "p0",
                                      &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto p1 = CreateR3Parameter<float>(arr1, /*parameter_number=*/1, "p1",
                                      &builder, &h1);
 
-  auto concatenated = builder.ConcatInDim({h0, h1}, 2);
+  builder.ConcatInDim({h0, h1}, 2);
 
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
@@ -495,7 +498,7 @@ TEST_P(ConcatR2BinaryTest, DoIt) {
   Array2D<int32> rhs(spec.rhs_dim0, spec.rhs_dim1);
   rhs.FillUnique(1000);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR2FromArray2D<int32>(lhs);
   auto a1 = builder.ConstantR2FromArray2D<int32>(rhs);
   builder.ConcatInDim({a0, a1}, spec.concat_dimension);
@@ -521,7 +524,7 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) {
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, f32_scalar, "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto mul = builder.Mul(x, y);
@@ -545,7 +548,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "z");
@@ -573,7 +576,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "y");
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index bc821674820fb128823786d7149037fc59b22ab6..7ff6706935740c7d76ee5cd03eae292386760397 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -23,8 +24,8 @@ namespace {
 
 class ConditionalOpTest : public ClientLibraryTestBase {
  protected:
-  Computation CreateR0ConstantComputation(float value) {
-    ComputationBuilder builder(client_, "Constant");
+  XlaComputation CreateR0ConstantComputation(float value) {
+    XlaBuilder builder("Constant");
     builder.Parameter(0, empty_tuple_, "tuple");
     builder.ConstantR0<float>(value);
     auto build_status = builder.Build();
@@ -32,16 +33,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0IdentityComputation() {
-    ComputationBuilder builder(client_, "Identity");
+  XlaComputation CreateR0IdentityComputation() {
+    XlaBuilder builder("Identity");
     builder.Parameter(0, r0f32_, "x");
     auto build_status = builder.Build();
     EXPECT_IS_OK(build_status.status());
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateCeilComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Ceil");
+  XlaComputation CreateCeilComputation(const Shape& shape) {
+    XlaBuilder builder("Ceil");
     auto param = builder.Parameter(0, shape, "param");
     builder.Ceil(param);
     auto build_status = builder.Build();
@@ -49,16 +50,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0CeilComputation() {
+  XlaComputation CreateR0CeilComputation() {
     return CreateCeilComputation(r0f32_);
   }
 
-  Computation CreateR1CeilComputation() {
+  XlaComputation CreateR1CeilComputation() {
     return CreateCeilComputation(r1s2f32_);
   }
 
-  Computation CreateFloorComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Floor");
+  XlaComputation CreateFloorComputation(const Shape& shape) {
+    XlaBuilder builder("Floor");
     auto param = builder.Parameter(0, shape, "param");
     builder.Floor(param);
     auto build_status = builder.Build();
@@ -66,17 +67,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0FloorComputation() {
+  XlaComputation CreateR0FloorComputation() {
     return CreateFloorComputation(r0f32_);
   }
 
-  Computation CreateR1FloorComputation() {
+  XlaComputation CreateR1FloorComputation() {
     return CreateFloorComputation(r1s2f32_);
   }
 
-  Computation CreateTupleCeilComputation(const string& computation_name,
-                                         const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleCeilComputation(const string& computation_name,
+                                            const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -88,17 +89,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleCeilComputation() {
+  XlaComputation CreateR0TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleCeilComputation() {
+  XlaComputation CreateR1TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleFloorComputation(const string& computation_name,
-                                          const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleFloorComputation(const string& computation_name,
+                                             const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -110,17 +111,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleFloorComputation() {
+  XlaComputation CreateR0TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleFloorComputation() {
+  XlaComputation CreateR1TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleAddComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleAddComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -130,17 +131,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleAddComputation() {
+  XlaComputation CreateR0TupleAddComputation() {
     return CreateTupleAddComputation("AddR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleAddComputation() {
+  XlaComputation CreateR1TupleAddComputation() {
     return CreateTupleAddComputation("AddR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleSubComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleSubComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -150,11 +151,11 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleSubComputation() {
+  XlaComputation CreateR0TupleSubComputation() {
     return CreateTupleSubComputation("SubR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleSubComputation() {
+  XlaComputation CreateR1TupleSubComputation() {
     return CreateTupleSubComputation("SubR1", tuple_2_r1s2f32_);
   }
 
@@ -170,26 +171,25 @@ class ConditionalOpTest : public ClientLibraryTestBase {
 
 // Test true and false computations that do not take any parameters.
 XLA_TEST_F(ConditionalOpTest, Parameters0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   auto true_computation = CreateR0ConstantComputation(56.0f);
   auto false_computation = CreateR0ConstantComputation(12.0f);
-  auto result = builder.Conditional(pred, operands, true_computation, operands,
-                                    false_computation);
+  builder.Conditional(pred, operands, true_computation, operands,
+                      false_computation);
 
   ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
 }
 
 // Test true and false computations that take in 1 parameter.
 XLA_TEST_F(ConditionalOpTest, Parameters1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto identity = CreateR0IdentityComputation();
-  auto result =
-      builder.Conditional(pred, operand1, identity, operand2, identity);
+  builder.Conditional(pred, operand1, identity, operand2, identity);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -197,12 +197,12 @@ XLA_TEST_F(ConditionalOpTest, Parameters1) {
 // Test conditional with two different computations in the true and false cases
 // that take in different arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -210,11 +210,11 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
 // Test conditional with two different computations in the true and false cases
 // that take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand, CreateR0CeilComputation(),
-                                    operand, CreateR0FloorComputation());
+  builder.Conditional(pred, operand, CreateR0CeilComputation(), operand,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -222,12 +222,12 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
 // Test conditional with the same computation in the true and false cases but
 // take in different arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand1, floor, operand2, floor);
+  builder.Conditional(pred, operand1, floor, operand2, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -235,11 +235,11 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
 // Test conditional with the same computation in the true and false cases that
 // take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand, floor, operand, floor);
+  builder.Conditional(pred, operand, floor, operand, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -247,12 +247,12 @@ XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
 // Test conditional with different instances of the same computation in the true
 // and false cases.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0FloorComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0FloorComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -260,7 +260,7 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
 // Test the case when a call invokes a computation that contains a conditional.
 XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   auto pred_cond = inner_builder.Parameter(0, r0bool, "param0");
   auto true_operand = inner_builder.Parameter(1, r0f32_, "param1");
   auto false_operand = inner_builder.Parameter(2, r0f32_, "param2");
@@ -268,7 +268,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
                             false_operand, CreateR0FloorComputation());
   auto inner_builder_result = inner_builder.Build();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
@@ -281,14 +281,13 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
 // Test true and false computations that take in 2 parameters and predicate is
 // true.
 XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
 }
@@ -296,14 +295,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
 // Test true and false computations that take in 2 parameters and predicate is
 // false.
 XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
 }
@@ -311,14 +309,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is true.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
 }
@@ -326,21 +323,20 @@ XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is false.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
 }
 
 // Test true and false computations that return a tuple of scalars.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple(
       {builder.ConstantR0<float>(12.2f), builder.ConstantR0<float>(25.6f)});
@@ -356,7 +352,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
 
 // Test true and false computations that return a tuple of arrays.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
                                  builder.ConstantR1<float>({25.6f, 29.2f})});
@@ -373,7 +369,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
 // Test true and false computations that return a tuple of a predicate, a
 // scalar, and an array.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_pred = true_builder.ConstantR0<bool>(true);
@@ -384,7 +380,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_pred = false_builder.ConstantR0<bool>(false);
@@ -395,7 +391,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -411,7 +407,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
 
 // Test true and false computations that return a nested tuple.
 XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_constant1 = true_builder.ConstantR0<float>(12.2f);
@@ -424,7 +420,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_constant1 = false_builder.ConstantR0<float>(46.6f);
@@ -438,7 +434,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -460,16 +456,16 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
 // params.
 XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(true, 0, "pred", &builder, &pred);
   auto operand1_param =
       CreateR0Parameter<float>(56.3f, 1, "operand1", &builder, &operand1);
   auto operand2_param =
       CreateR0Parameter<float>(12.7f, 2, "operand2", &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(
       &builder, 57.0f,
@@ -480,16 +476,16 @@ XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
 // Test conditional that takes in array operands in the form of external params.
 XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
   auto operand1_param = CreateR1Parameter<float>({24.3f, 56.7f}, 1, "operand1",
                                                  &builder, &operand1);
   auto operand2_param = CreateR1Parameter<float>({10.2f, 11.6f}, 2, "operand2",
                                                  &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR1CeilComputation(),
-                                    operand2, CreateR1FloorComputation());
+  builder.Conditional(pred, operand1, CreateR1CeilComputation(), operand2,
+                      CreateR1FloorComputation());
 
   ComputeAndCompareR1<float>(
       &builder, {10.0f, 11.0f},
@@ -499,7 +495,7 @@ XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
 
 // Test the case where one conditional is nested within another.
 XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -514,7 +510,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred1 = builder.ConstantR0<bool>(true);
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
@@ -529,7 +525,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
 }
 
 XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -544,7 +540,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
   auto operand2 = builder.ConstantR0<float>(12.2f);
@@ -556,7 +552,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
 
 // Test a mismatch in the shape of the true operand and true computation.
 XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
@@ -571,5 +567,56 @@ XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
                                    "only parameter of true_computation"));
 }
 
+XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({r0f32_, r0f32_});
+  XlaComputation swapper;
+  {
+    XlaBuilder builder(TestName() + ".swapper");
+    auto param0 = builder.Parameter(0, tuple_shape, "sp0");
+    auto x = builder.GetTupleElement(param0, 0);
+    auto y = builder.GetTupleElement(param0, 1);
+    builder.Tuple({y, x});
+    swapper = builder.Build().ConsumeValueOrDie();
+  }
+  XlaComputation forwarder;
+  {
+    XlaBuilder builder(TestName() + ".forwarder");
+    auto param0 = builder.Parameter(0, tuple_shape, "fp0");
+    auto x = builder.GetTupleElement(param0, 0);
+    auto y = builder.GetTupleElement(param0, 1);
+    builder.Tuple({x, y});
+    forwarder = builder.Build().ConsumeValueOrDie();
+  }
+  XlaComputation main;
+  {
+    XlaBuilder builder(TestName() + ".main");
+    auto param0 = builder.Parameter(0, tuple_shape, "mp0");
+    auto x = builder.GetTupleElement(param0, 0);
+    auto y = builder.GetTupleElement(param0, 1);
+    auto lt_pred = builder.Lt(x, y);
+    auto res = builder.Conditional(lt_pred, param0, forwarder, param0, swapper);
+    auto ge_pred = builder.Ge(x, y);
+    builder.Conditional(ge_pred, res, swapper, res, forwarder);
+    main = builder.Build().ConsumeValueOrDie();
+  }
+
+  auto test_swap = [&](float a, float b) {
+    XlaBuilder builder(TestName());
+    auto x = builder.ConstantR0<float>(a);
+    auto y = builder.ConstantR0<float>(b);
+    auto tuple_operand = builder.Tuple({x, y});
+    builder.Call(main, {tuple_operand});
+
+    ComputeAndCompareTuple(
+        &builder,
+        *Literal::MakeTuple({Literal::CreateR0<float>(a).get(),
+                             Literal::CreateR0<float>(b).get()}),
+        {}, error_spec_);
+  };
+
+  test_swap(3.11f, 9.4f);
+  test_swap(11.24f, 5.55f);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 35aa3f6d696297efb7d95d826ed75a504a24529d..4743673561a665ca8670a56bf15d85a74073e472 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -39,7 +40,7 @@ class ConstantsTest : public ClientLibraryTestBase {
 };
 
 TEST_F(ConstantsTest, ZeroCellF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>({});
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
@@ -48,7 +49,7 @@ TEST_F(ConstantsTest, ZeroCellF32) {
 TEST_F(ConstantsTest, OneCellF32) {
   std::vector<float> constant = {2.0};
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>(constant);
 
   ComputeAndCompareR1<float>(&builder, constant, {}, error_spec_);
@@ -57,7 +58,7 @@ TEST_F(ConstantsTest, OneCellF32) {
 TEST_F(ConstantsTest, OneCellS32) {
   std::vector<int32> constant = {2};
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<int32>(constant);
 
   ComputeAndCompareR1<int32>(&builder, constant, {});
@@ -66,7 +67,7 @@ TEST_F(ConstantsTest, OneCellS32) {
 TEST_F(ConstantsTest, OneCellU32) {
   std::vector<uint32> constant = {2};
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<uint32>(constant);
 
   ComputeAndCompareR1<uint32>(&builder, constant, {});
@@ -75,7 +76,7 @@ TEST_F(ConstantsTest, OneCellU32) {
 TEST_F(ConstantsTest, EightCells) {
   std::vector<float> constant = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>(constant);
 
   ComputeAndCompareR1<float>(&builder, constant, {}, error_spec_);
@@ -85,14 +86,14 @@ TEST_F(ConstantsTest, SixteenCells) {
   std::vector<float> constant = {0.0, 1.0, 2.0,  3.0,  4.0,  5.0,  6.0,  7.0,
                                  8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>(constant);
 
   ComputeAndCompareR1<float>(&builder, constant, {}, error_spec_);
 }
 
 TEST_F(ConstantsTest, Empty_0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 2), {}, error_spec_);
@@ -102,14 +103,14 @@ TEST_F(ConstantsTest, Small_2x2) {
   std::unique_ptr<Array2D<float>> constant =
       MakeLinspaceArray2D(100.0, 200.0, 2, 2);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR2FromArray2D<float>(*constant);
 
   ComputeAndCompareR2<float>(&builder, *constant, {}, error_spec_);
 }
 
 TEST_F(ConstantsTest, Empty_3x0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto constant = builder.ConstantLiteral(
       *Literal::CreateR3FromArray3D<float>(Array3D<float>(3, 0, 2)));
 
@@ -117,7 +118,7 @@ TEST_F(ConstantsTest, Empty_3x0x2) {
 }
 
 TEST_F(ConstantsTest, Small_2x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> array3d({
       // x0  x1
       {{1.f, 2.f},   // y0
@@ -145,13 +146,13 @@ TEST_F(ConstantsTest, Small_3x2x1x1) {
       Literal::CreateR4FromArray4D(input_array);
 
   {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.ConstantLiteral(*input_literal);
     ComputeAndCompareR4<float>(&builder, input_array, {}, error_spec_);
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.ConstantR4FromArray4D<float>(input_array);
     ComputeAndCompareR4<float>(&builder, input_array, {}, error_spec_);
   }
@@ -159,12 +160,13 @@ TEST_F(ConstantsTest, Small_3x2x1x1) {
 
 // TODO(b/29263943): Support tuple constants.
 TEST_F(ConstantsTest, DISABLED_TupleConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantLiteral(
       *Literal::MakeTuple({Literal::CreateR2<float>({{1.0}, {2.0}}).get(),
                            Literal::CreateR1<float>({2.0, 42}).get()}));
 
-  std::unique_ptr<Literal> result = ExecuteAndTransferOrDie(&builder, {});
+  std::unique_ptr<Literal> result =
+      ExecuteAndTransfer(&builder, {}).ConsumeValueOrDie();
 
   LiteralTestUtil::ExpectR2Near<float>(
       {{1.0}, {2.0}}, LiteralView::Create(*result, {0}), error_spec_);
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index f66e3b57bf45fbc9f8ea786146d6fffe5d55a262..4ef0a77884c90b9fe32f96d3361fa3d80bde623b 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -18,13 +18,15 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -34,7 +36,7 @@ namespace {
 
 class ConvertTest : public ClientLibraryTestBase {
  public:
-  explicit ConvertTest(perftools::gputools::Platform* platform = nullptr)
+  explicit ConvertTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
@@ -42,7 +44,7 @@ class ConvertTest : public ClientLibraryTestBase {
 };
 
 TEST_F(ConvertTest, ConvertR1S32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({42, 64});
   builder.ConvertElementType(a, S32);
 
@@ -51,7 +53,7 @@ TEST_F(ConvertTest, ConvertR1S32ToR1S32) {
 }
 
 TEST_F(ConvertTest, ConvertR1F32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0f, 64.0f});
   builder.ConvertElementType(a, F32);
 
@@ -60,7 +62,7 @@ TEST_F(ConvertTest, ConvertR1F32ToR1F32) {
 }
 
 TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({42, 64});
   builder.ConvertElementType(a, F32);
 
@@ -69,7 +71,7 @@ TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
 }
 
 TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({true, false, true});
   builder.ConvertElementType(a, S32);
 
@@ -78,7 +80,7 @@ TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
 }
 
 TEST_F(ConvertTest, ConvertR1PREDToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({true, false, true});
   builder.ConvertElementType(a, F32);
 
@@ -87,7 +89,7 @@ TEST_F(ConvertTest, ConvertR1PREDToR1F32) {
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   builder.ConvertElementType(a, F32);
 
@@ -96,7 +98,7 @@ XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) {
 }
 
 TEST_F(ConvertTest, ConvertR1F32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.6, 64.4});
   builder.ConvertElementType(a, S32);
 
@@ -105,16 +107,168 @@ TEST_F(ConvertTest, ConvertR1F32ToR1S32) {
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1S64ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<int64>({32, 64});
-  builder.ConvertElementType(a, F32);
+  XlaBuilder builder(TestName());
+  std::vector<int64> arg{
+      -9223371216516022272,
+      -2,
+      -1,
+      -0x7FFFFFFF,
+      -0x80000000,
+      0,
+      1,
+      2,
+      1073742145,
+      1073742656,
+      0x7FFFFFFF,
+      0x80000000,
+      826720496944058148,
+      4296062029846194332,
+      0x0007FB72E4000000LL,
+      0x0007FB72E4000001LL,
+      0x0007FB72E6000000LL,
+      0x0007FB72E7000000LL,
+      0x0007FB72E7FFFFFFLL,
+      0x0007FB72E8000000LL,
+      0x0007FB72E8000001LL,
+      0x0007FB72EA000000LL,
+      0x0007FB72EB000000LL,
+      0x0007FB72EBFFFFFFLL,
+      0x0007FB72EC000000LL,
+      0x7FFFFF0000000000LL,
+      0x7FFFFF8000000000LL,
+      0x7FFFFFFFFFFFFF00,
+      static_cast<int64>(0xFFFFFFFFFFFFFFFF),
+      static_cast<int64>(0x0000f234e67e0001LL),
+      static_cast<int64>(0x8000000000000000),
+      static_cast<int64>(0x8000000000000000LL),
+      static_cast<int64>(0x8000000000000001LL),
+      static_cast<int64>(0x8000008000000000LL),
+      static_cast<int64>(0x8000010000000000LL),
+  };
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<int64>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, F32);
+
+  std::vector<float> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<float>(arg[i]);
+  }
+  ComputeAndCompareR1<float>(&builder, expected, {arg_data.get()});
+}
 
-  std::vector<float> expected = {32.0, 64.0};
-  ComputeAndCompareR1<float>(&builder, expected, {});
+XLA_TEST_F(ConvertTest, ConvertR1U32ToR1F32) {
+  XlaBuilder builder(TestName());
+  std::vector<uint32> arg{0,          1,          0x1000,     0x7fffffff,
+                          0x80000000, 0x80000001, 0x80000002, 0x80000003,
+                          0x80000080, 0x80000081, 0x80000082, 0xFFFFFFFF};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<uint32>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, F32);
+
+  std::vector<float> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<float>(arg[i]);
+  }
+  ComputeAndCompareR1<float>(&builder, expected, {arg_data.get()});
+}
+
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1U32) {
+  XlaBuilder builder(TestName());
+  std::vector<float> arg{0.0f,        1.0f,          16777216.0f,
+                         16777218.0f, 2147483647.0f, 4294967040.0f};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<float>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, U32);
+
+  std::vector<uint32> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<uint32>(arg[i]);
+  }
+  ComputeAndCompareR1<uint32>(&builder, expected, {arg_data.get()});
+}
+
+XLA_TEST_F(ConvertTest, ConvertR1U32ToR1S64) {
+  XlaBuilder builder(TestName());
+  std::vector<uint32> arg{0, 1, 0x1000, 0x7fffffff, 0x80000082, 0xFFFFFFFF};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<uint32>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, S64);
+
+  std::vector<int64> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<int64>(arg[i]);
+  }
+  ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
+}
+
+XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) {
+  XlaBuilder builder(TestName());
+  std::vector<int32> arg{0, 1, 0x1000, -1, -0x1000};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<int32>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, S64);
+
+  std::vector<int64> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<int64>(arg[i]);
+  }
+  ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
+}
+
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) {
+  XlaBuilder builder(TestName());
+  // Test cases from compiler_rt library.
+  std::vector<float> arg{0.0f,
+                         0.5f,
+                         0.99f,
+                         1.0f,
+                         1.5f,
+                         1.99f,
+                         2.0f,
+                         2.01f,
+                         2147483648.f,
+                         -0.5f,
+                         -0.99f,
+                         -1.0f,
+                         -1.5f,
+                         -1.99f,
+                         -2.0f,
+                         -2.01f,
+                         0x1.FFFFFEp+62F,
+                         0x1.FFFFFCp+62F,
+                         -0x1.FFFFFEp+62F,
+                         -0x1.FFFFFCp+62F};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<float>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, S64);
+
+  std::vector<int64> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<int64>(arg[i]);
+  }
+  ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint8_t>({32, 64});
   builder.ConvertElementType(a, F32);
 
@@ -123,7 +277,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) {
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1U8ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint8_t>({32, 64});
   builder.ConvertElementType(a, S32);
 
@@ -132,7 +286,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1S32) {
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1U8ToR1U32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint8_t>({32, 64});
   builder.ConvertElementType(a, U32);
 
@@ -141,7 +295,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1U32) {
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({32.0f, 64.0f});
   builder.ConvertElementType(a, F64);
 
@@ -150,7 +304,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F64) {
 }
 
 XLA_TEST_F(ConvertTest, ConvertR1F64ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<double>({32.0, 64.0});
   builder.ConvertElementType(a, F32);
 
@@ -159,7 +313,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F64ToR1F32) {
 }
 
 TEST_F(ConvertTest, ConvertS32Extremes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(
       {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
   builder.ConvertElementType(a, F32);
@@ -171,7 +325,7 @@ TEST_F(ConvertTest, ConvertS32Extremes) {
 }
 
 TEST_F(ConvertTest, ConvertMapToS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
   b->ConvertElementType(param, S32);
@@ -183,7 +337,7 @@ TEST_F(ConvertTest, ConvertMapToS32) {
 }
 
 TEST_F(ConvertTest, ConvertMapToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
   b->ConvertElementType(param, F32);
@@ -200,7 +354,7 @@ TEST_F(ConvertTest, ConvertMapToF32) {
 //   input -> convert -> reshape
 // the new convert should have the same element type as the old convert.
 TEST_F(ConvertTest, ConvertReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR1<int32>({42});
   auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
   builder.ConvertElementType(reshape, F32);
@@ -208,5 +362,104 @@ TEST_F(ConvertTest, ConvertReshape) {
   ComputeAndCompareR0<float>(&builder, 42.0f, {}, ErrorSpec(0.0001));
 }
 
+std::vector<float> GetInterestingF16ConversionTestCases() {
+  float infinity = std::numeric_limits<float>::infinity();
+  float half_min_positive_normal =
+      tensorflow::bit_cast<float, uint32>(0x38800000);
+  float half_max_subnormal = tensorflow::bit_cast<float, uint32>(0x387fc000);
+  float half_min_positive_subnormal =
+      tensorflow::bit_cast<float, uint32>(0x33800000);
+  float half_max = 65504.0f;
+
+  std::vector<float> test_cases(
+      {-infinity, -(half_max * 2 + 1), -half_max, -42.0f, -1.0f,
+       -half_min_positive_subnormal, -half_max_subnormal,
+       -half_min_positive_normal, -0.0f, 0.0f, half_min_positive_subnormal,
+       half_max_subnormal, half_min_positive_normal, 1.0f, 42.0f, half_max,
+       (half_max * 2 + 1), infinity});
+  return test_cases;
+}
+
+XLA_TEST_F(ConvertTest, ConvertR1F16ToR1F32) {
+  std::vector<float> test_cases = GetInterestingF16ConversionTestCases();
+  std::vector<half> input;
+  c_transform(test_cases, std::back_inserter(input),
+              [](float f) { return Eigen::half(f); });
+  std::vector<float> expected_output;
+  c_transform(input, std::back_inserter(expected_output),
+              [](Eigen::half h) { return static_cast<float>(h); });
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> dot_lhs_handle,
+      client_->TransferToServer(*Literal::CreateR1<half>(input)));
+
+  XlaBuilder builder(TestName());
+  builder.ConvertElementType(
+      builder.Parameter(
+          0, ShapeUtil::MakeShape(F16, {static_cast<int64>(input.size())}),
+          "param"),
+      F32);
+
+  ComputeAndCompareR1<float>(&builder, expected_output, {dot_lhs_handle.get()});
+}
+
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) {
+  std::vector<float> input = GetInterestingF16ConversionTestCases();
+  std::vector<half> expected_output;
+  c_transform(input, std::back_inserter(expected_output),
+              [](float f) { return Eigen::half(f); });
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> dot_lhs_handle,
+      client_->TransferToServer(*Literal::CreateR1<float>(input)));
+
+  XlaBuilder builder(TestName());
+  builder.ConvertElementType(
+      builder.Parameter(
+          0, ShapeUtil::MakeShape(F32, {static_cast<int64>(input.size())}),
+          "param"),
+      F16);
+
+  ComputeAndCompareR1<half>(&builder, expected_output, {dot_lhs_handle.get()});
+}
+
+XLA_TEST_F(ConvertTest, ConvertC64ToC64) {
+  XlaBuilder builder(TestName());
+  std::vector<complex64> x = {{42.0f, 64.0f}};
+  builder.ConvertElementType(builder.ConstantR1<complex64>(x), C64);
+  ComputeAndCompareR1<complex64>(&builder, x, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(ConvertTest, ConvertS64S64) {
+  XlaBuilder builder(TestName());
+  std::vector<int64> x = {{-42, 64}};
+  builder.ConvertElementType(builder.ConstantR1<int64>(x), S64);
+  ComputeAndCompareR1<int64>(&builder, x, {});
+}
+
+XLA_TEST_F(ConvertTest, ConvertU64U64) {
+  XlaBuilder builder(TestName());
+  std::vector<uint64> x = {{42, 64}};
+  builder.ConvertElementType(builder.ConstantR1<uint64>(x), U64);
+  ComputeAndCompareR1<uint64>(&builder, x, {});
+}
+
+XLA_TEST_F(ConvertTest, ConvertU64S64) {
+  XlaBuilder builder(TestName());
+  std::vector<uint64> unsigned_x = {{42, UINT64_MAX}};
+  builder.ConvertElementType(builder.ConstantR1<uint64>(unsigned_x), S64);
+  std::vector<int64> signed_x = {{42, -1}};
+  ComputeAndCompareR1<int64>(&builder, signed_x, {});
+}
+
+XLA_TEST_F(ConvertTest, ConvertS64U64) {
+  XlaBuilder builder(TestName());
+  std::vector<int64> signed_x = {{42, -1, INT64_MIN}};
+  builder.ConvertElementType(builder.ConstantR1<int64>(signed_x), U64);
+  std::vector<uint64> unsigned_x = {
+      {42, UINT64_MAX, tensorflow::MathUtil::IPow<uint64>(2, 63)}};
+  ComputeAndCompareR1<uint64>(&builder, unsigned_x, {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 896b34fb6e2762c14bd9ec2bf1ba13c548d4cf60..b5a42e305987df030c15d089f5877f73bb61de1b 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,13 +34,35 @@ limitations under the License.
 namespace xla {
 namespace {
 
+StatusOr<ConvolutionDimensionNumbers> CreateConvDimensionNumbers(
+    int64 input_batch, int64 input_feature, int64 input_first_spatial,
+    int64 input_second_spatial, int64 output_batch, int64 output_feature,
+    int64 output_first_spatial, int64 output_second_spatial,
+    int64 kernel_output_feature, int64 kernel_input_feature,
+    int64 kernel_first_spatial, int64 kernel_second_spatial) {
+  ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(input_batch);
+  dimension_numbers.set_input_feature_dimension(input_feature);
+  dimension_numbers.add_input_spatial_dimensions(input_first_spatial);
+  dimension_numbers.add_input_spatial_dimensions(input_second_spatial);
+  dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature);
+  dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature);
+  dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial);
+  dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial);
+  dimension_numbers.set_output_batch_dimension(output_batch);
+  dimension_numbers.set_output_feature_dimension(output_feature);
+  dimension_numbers.add_output_spatial_dimensions(output_first_spatial);
+  dimension_numbers.add_output_spatial_dimensions(output_second_spatial);
+  TF_RETURN_IF_ERROR(XlaBuilder::Validate(dimension_numbers));
+  return dimension_numbers;
+}
+
 class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {};
 
 // Tests the convolution operation with invalid input dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0,
-                                                     1, 2, 3);
+      CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("input are not unique"));
@@ -49,8 +71,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
 // Tests the convolution operation with invalid weight dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0,
-                                                     2, 2, 3);
+      CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("weight are not unique"));
@@ -59,8 +80,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
 // Tests the convolution operation with invalid output dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0,
-                                                     1, 2, 3);
+      CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("output are not unique"));
@@ -76,14 +96,14 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest,
       client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array))
           .ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(*input_array);
   auto weight =
       builder.Parameter(0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight");
   auto conv1 = builder.Conv(input, weight, {1, 1}, Padding::kValid);
 
   ConvolutionDimensionNumbers dim_nums =
-      ComputationBuilder::CreateDefaultConvDimensionNumbers();
+      XlaBuilder::CreateDefaultConvDimensionNumbers();
   // Swap batch_dimension and feature_dimension.
   int64 old_input_batch_dim = dim_nums.input_batch_dimension();
   int64 old_output_batch_dim = dim_nums.output_batch_dimension();
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 1385b437fc47fe5289c401581fab8b5278872382..947959beb144e1509a77ad2f94b8493de46ba6f2 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -53,27 +53,12 @@ class ConvolutionTest : public ClientLibraryTestBase {
 #endif
 };
 
-// TODO(b/72509305): Enable half data type tests for CPU
-#if (XLA_TEST_BACKEND_GPU)
-using TestTypes = ::testing::Types<float, Eigen::half>;
-#else
+#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
 using TestTypes = ::testing::Types<float>;
+#else
+using TestTypes = ::testing::Types<float, Eigen::half>;
 #endif
 
-template <typename T>
-Shape MakeShapeWrapper(tensorflow::gtl::ArraySlice<int64> dimensions);
-
-template <>
-Shape MakeShapeWrapper<float>(tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return ShapeUtil::MakeShape(F32, dimensions);
-}
-
-template <>
-Shape MakeShapeWrapper<Eigen::half>(
-    tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return ShapeUtil::MakeShape(F16, dimensions);
-}
-
 template <typename T>
 class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
  public:
@@ -103,12 +88,12 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     ASSERT_EQ(2, arhs->width());
     ASSERT_EQ(2, arhs->height());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto lhs = builder.ConstantR4FromArray4D<T>(*alhs);
     auto rhs = builder.ConstantR4FromArray4D<T>(*arhs);
-    auto conv = builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
+    builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
 
-    ComputeAndCompare(&builder, conv, {}, error_spec_);
+    ComputeAndCompare(&builder, {}, error_spec_);
   }
 };
 
@@ -121,12 +106,12 @@ template <typename T>
 class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
-    Shape input_shape = MakeShapeWrapper<T>({1, 1, 1, 2});
-    Shape filter_shape = MakeShapeWrapper<T>({1, 1, 1, 2});
+    XlaBuilder builder(TestName());
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 1, 2);
     input_data.FillWithYX(Array2D<T>({
@@ -137,7 +122,7 @@ class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -152,12 +137,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
-    Shape input_shape = MakeShapeWrapper<T>({1, 1, 4, 4});
-    Shape filter_shape = MakeShapeWrapper<T>({1, 1, 2, 2});
+    XlaBuilder builder(TestName());
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -171,7 +156,7 @@ class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
         {7.0f, 8.0f},
     }));
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -186,12 +171,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
-    Shape input_shape = MakeShapeWrapper<T>({1, 1, 4, 4});
-    Shape filter_shape = MakeShapeWrapper<T>({1, 1, 2, 2});
+    XlaBuilder builder(TestName());
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -206,7 +191,7 @@ class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
         {7.0f, 8.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -222,12 +207,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
-    Shape input_shape = MakeShapeWrapper<T>({1, 1, 4, 4});
-    Shape filter_shape = MakeShapeWrapper<T>({1, 1, 3, 3});
+    XlaBuilder builder(TestName());
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 3, 3});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({{1.0f, 2.0f, 3.0f, 4.0f},
@@ -238,7 +223,7 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
     filter_data.FillWithYX(Array2D<T>(
         {{5.0f, 6.0f, 7.0f}, {8.0f, 9.0f, 10.0f}, {11.0f, 12.0f, 13.0f}}));
     // clang-format on
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -249,7 +234,7 @@ TYPED_TEST_CASE(Convolve_1x1x4x4_1x1x3x3_Same, TestTypes);
 TYPED_TEST(Convolve_1x1x4x4_1x1x3x3_Same, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -279,10 +264,10 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
-      Shape input_shape = MakeShapeWrapper<T>({1, 2, 5});
-      Shape filter_shape = MakeShapeWrapper<T>({1, 2, 2});
+      Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
+      Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
       auto input = builder.Parameter(0, input_shape, "input");
       auto filter = builder.Parameter(1, filter_shape, "filter");
       // Convolution dimensions are bf0_oi0->bo0.
@@ -315,7 +300,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithRHSDilation, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithRHSDilation, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -346,7 +331,7 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
 }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -380,10 +365,10 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
-      Shape input_shape = MakeShapeWrapper<T>({1, 2, 5});
-      Shape filter_shape = MakeShapeWrapper<T>({1, 2, 2});
+      Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
+      Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
       auto input = builder.Parameter(0, input_shape, "input");
       auto filter = builder.Parameter(1, filter_shape, "filter");
       // Convolution dimensions are bf0_oi0->bo0.
@@ -417,7 +402,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<int64> input_dims = {1, 4, 2, 3, 3};
   std::vector<int64> filter_dims = {2, 2, 2, 3, 3};
   Shape input_shape = ShapeUtil::MakeShape(F32, input_dims);
@@ -484,11 +469,11 @@ template <typename T>
 class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     std::vector<int64> input_dims = {1, 3, 3, 5};
     std::vector<int64> filter_dims = {3, 3, 5, 3};
-    Shape input_shape = MakeShapeWrapper<T>(input_dims);
-    Shape filter_shape = MakeShapeWrapper<T>(filter_dims);
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
     {
       auto input = builder.Parameter(0, input_shape, "input");
       auto filter = builder.Parameter(1, filter_shape, "filter");
@@ -552,7 +537,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
     execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
         "convolution-canonicalization");
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29});
   Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10});
 
@@ -566,8 +551,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   dnums.set_kernel_output_feature_dimension(1);
   dnums.set_output_batch_dimension(0);
   dnums.set_output_feature_dimension(1);
-  auto conv = builder.ConvWithGeneralDimensions(input, filter, {},
-                                                Padding::kValid, dnums);
+  builder.ConvWithGeneralDimensions(input, filter, {}, Padding::kValid, dnums);
 
   Array2D<float> param0(4, 29);
   param0.FillUnique();
@@ -578,7 +562,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   Array2D<float> expected_result(29, 10);
   expected_result.Fill(0);
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(param0)),
                      std::move(*Literal::CreateFromArray(param1))},
                     error_spec_);
@@ -602,7 +586,7 @@ class Convolve1D1WindowTestBase
  protected:
   template <typename T>
   void TestImpl() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     int64 input_feature = GetParam().input_feature;
     int64 output_feature = GetParam().output_feature;
     int64 batch = GetParam().batch;
@@ -612,8 +596,8 @@ class Convolve1D1WindowTestBase
                                      input_feature};
     std::vector<int64> filter_dims = {window_size, input_feature,
                                       output_feature};
-    Shape input_shape = MakeShapeWrapper<T>(input_dims);
-    Shape filter_shape = MakeShapeWrapper<T>(filter_dims);
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
     {
       auto input = builder.Parameter(0, input_shape, "input");
       auto filter = builder.Parameter(1, filter_shape, "filter");
@@ -699,9 +683,7 @@ INSTANTIATE_TEST_CASE_P(
 #if (XLA_TEST_BACKEND_GPU || XLA_TEST_BACKEND_CPU)
 class Convolve1D1WindowTestHalf : public Convolve1D1WindowTestBase {};
 
-// TODO(b/72509305): Enable half data type tests for CPU.
-XLA_TEST_P(Convolve1D1WindowTestHalf,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(Convolve1D1Window))) {
+XLA_TEST_P(Convolve1D1WindowTestHalf, Convolve1D1Window) {
   TestImpl<Eigen::half>();
 }
 
@@ -719,14 +701,16 @@ INSTANTIATE_TEST_CASE_P(
                       Convolve1DTestParam{130, 1, 1, 1, 3},
                       Convolve1DTestParam{64, 1, 1, 1, 1},
                       Convolve1DTestParam{128, 1, 1, 1, 1},
-                      // TODO(b/72566306): the following three tests fail on CPU
-                      // backend due to result miscompare.
+// TODO(b/72566306): The following five tests failed on CPU with unreasonable
+// relative errors.  Last ran on 2018-02-22.
+#if XLA_TEST_BACKEND_GPU
                       Convolve1DTestParam{139, 1, 1, 128, 1},
                       Convolve1DTestParam{640, 3, 3, 128, 1},
                       Convolve1DTestParam{900, 1, 1, 10, 1},
                       Convolve1DTestParam{1, 10, 10, 1, 10},
-                      Convolve1DTestParam{1, 10, 130, 1, 2},
                       Convolve1DTestParam{1, 10, 130, 1, 1},
+#endif
+                      Convolve1DTestParam{1, 10, 130, 1, 2},
                       Convolve1DTestParam{1, 64, 64, 1, 10},
                       Convolve1DTestParam{1, 65, 65, 1, 1},
                       Convolve1DTestParam{1, 128, 128, 1, 1},
@@ -738,13 +722,13 @@ INSTANTIATE_TEST_CASE_P(
 );
 #endif
 
-TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   auto input = builder.Parameter(0, input_shape, "input");
   auto filter = builder.Parameter(1, filter_shape, "filter");
-  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
   Array4D<bfloat16> input_data(1, 1, 1, 2);
   input_data.FillWithYX(Array2D<bfloat16>({
@@ -755,11 +739,34 @@ TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
       {bfloat16(5), bfloat16(6)},
   }));
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(input_data)),
                      std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
 }
 
+// Check that GPU convs still work if the CudnnAlgorithmPicker pass is disabled.
+// (We run this test on all platforms, because, what the heck.)
+XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) {
+  execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+      "cudnn-convolution-algorithm-picker");
+
+  XlaBuilder builder(TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
+
+  Array4D<float> input_data(1, 1, 1, 2);
+  input_data.FillIota(0);
+  Array4D<float> filter_data(1, 1, 1, 2);
+  filter_data.FillIota(10);
+
+  ComputeAndCompare(&builder,
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 9c1145def8c11f1222c63adf006102887d49f00d..50d6e25d868c4964ff35023b43a3734ed115bbb8 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -52,7 +53,7 @@ class ConvolutionVariantsTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(ConvolutionVariantsTest, Minimal) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const Array4D<float> input_array(1, 1, 1, 1, {2});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -67,7 +68,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Minimal) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const Array4D<float> input_array(5, 1, 1, 1, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -82,7 +83,7 @@ XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(2, 1, 3, 4);
   input_array.FillWithMultiples(1);
@@ -99,7 +100,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 2, 1, 1, {10, 1});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -114,7 +115,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 2, {1, 2});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -129,7 +130,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -144,7 +145,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -159,7 +160,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -174,7 +175,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -189,7 +190,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(
       2, 2, 2, 3, {0, 1, 2, 3, 4, 5,  6,  7,  8,  9,  0, 0,    // plane 0
@@ -210,7 +211,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 4, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -225,7 +226,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 5, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -240,7 +241,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 4, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -255,7 +256,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 5, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -270,7 +271,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -285,7 +286,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 1, {1});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -300,7 +301,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -315,7 +316,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -333,7 +334,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 2, 1, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -348,7 +349,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -363,7 +364,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -378,7 +379,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(64);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -398,7 +399,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(16 * 1 * 1 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -419,7 +420,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   constexpr int bs = 16;
   constexpr int kx = 2;
@@ -450,7 +451,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   constexpr int kx = 2;
   constexpr int ky = 2;
@@ -482,7 +483,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(16, 1, 8, 8);
   for (int i0 = 0; i0 < 16; ++i0) {
@@ -510,7 +511,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -536,7 +537,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(2 * 2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -562,7 +563,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(32 * 2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -602,7 +603,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(16, 16, 1, 1);
   Array4D<float> filter_array(16, 16, 1, 1);
@@ -628,7 +629,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 4 * 6);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -640,14 +641,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{2, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 2, 2, {3924, 4257, 5922, 6255});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -659,14 +660,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 8, {10, 2, 20, 3, 30, 4, 40, 5});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 3 * 4);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -682,8 +683,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{2, 1},
       /*padding=*/{{1, 0}, {0, 0}}, /*lhs_dilation=*/{3, 2},
-      /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 3, 5,
                           {204, 40, 406, 60, 608,       //
@@ -693,7 +693,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -705,14 +705,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-1, -1}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 2, {23, 34});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -724,14 +724,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-1, 2}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 5, {23, 34, 45, 50, 0});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -743,14 +743,14 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {2, -1}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 5, {0, 1, 12, 23, 34});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -763,7 +763,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {3, 2}},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   // input:
   //   [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5]
@@ -775,7 +775,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -788,7 +788,7 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) {
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-3, -2}},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   // input:
   //   [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5]
@@ -821,7 +821,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x1x2x3_Filter2x1x1x2) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -854,7 +854,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x16x1x1_Filter1x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -887,7 +887,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter1x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -920,7 +920,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -954,7 +954,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -966,7 +966,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1010,7 +1010,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1054,7 +1054,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1095,7 +1095,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 2);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1147,7 +1147,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
 //   BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1)
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardInputLowPaddingLessThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 2, 3}));
@@ -1166,19 +1166,18 @@ XLA_TEST_F(ConvolutionVariantsTest,
 //   BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1))
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardInputLowPaddingGreaterThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 1, /*values=*/{1}));
   auto weights = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 10, 100}));
   auto mirrored_weights = builder.Rev(weights, {2, 3});
-  builder.ConvGeneralDilated(
-      gradients, mirrored_weights,
-      /*window_strides=*/{1, 1},
-      /*padding=*/{{0, 0}, {0, 3}},
-      /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+  builder.ConvGeneralDilated(gradients, mirrored_weights,
+                             /*window_strides=*/{1, 1},
+                             /*padding=*/{{0, 0}, {0, 3}},
+                             /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{},
+                             XlaBuilder::CreateDefaultConvDimensionNumbers());
   ComputeAndCompareR4<float>(&builder, {{{{100, 0}}}}, {}, error_spec_);
 }
 
@@ -1187,7 +1186,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 // into
 //   BackwardInputConv([1], [1,10,100], padding=(1,1))
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 1, /*values=*/{1}));
@@ -1208,7 +1207,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) {
 // However, XLA:GPU doesn't actually fuse it because PadInsertion doesn't
 // support negative padding on backward convolution yet (b/32744257).
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 2, 3}));
@@ -1224,7 +1223,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
 
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardFilterLowPaddingLessThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,1,2,3,4,0,0
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1240,7 +1239,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {1, 2}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{24, 130, 240}}}}, {}, error_spec_);
@@ -1248,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardFilterLowPaddingGreaterThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1266,14 +1265,14 @@ XLA_TEST_F(ConvolutionVariantsTest,
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {2, 0}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{13, 24}}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4,0
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1293,14 +1292,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) {
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {2, 1}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{13, 24, 130}}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR3FromArray3D<float>(
       Array3D<float>(1, 1, 1, /*value=*/1));
@@ -1314,26 +1313,26 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto activations =
       builder.ConstantR3FromArray3D<float>(Array3D<float>({{{1, 2, 3, 4}}}));
   auto gradients =
       builder.ConstantR3FromArray3D<float>(Array3D<float>({{{100, 10, 1}}}));
-  auto forward_conv = builder.ConvGeneralDilated(
-      activations, gradients,
-      /*window_strides=*/{1},
-      /*padding=*/{{2, 1}},
-      /*lhs_dilation=*/{}, /*rhs_dilation=*/{2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(
-          /*num_spatial_dims=*/1));
+  auto forward_conv =
+      builder.ConvGeneralDilated(activations, gradients,
+                                 /*window_strides=*/{1},
+                                 /*padding=*/{{2, 1}},
+                                 /*lhs_dilation=*/{}, /*rhs_dilation=*/{2},
+                                 XlaBuilder::CreateDefaultConvDimensionNumbers(
+                                     /*num_spatial_dims=*/1));
   builder.Transpose(forward_conv, {0, 1, 2});
 
   ComputeAndCompareR3<float>(&builder, {{{13, 24, 130}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients_flat = Literal::CreateR1<float>({1});
   auto gradients_literal =
@@ -1357,7 +1356,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto activations_flat = Literal::CreateR1<float>({1, 2, 3, 4});
   auto activations_literal =
@@ -1378,7 +1377,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
       /*window_strides=*/{1, 1, 1},
       /*padding=*/{{0, 0}, {0, 0}, {2, 1}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(
+      XlaBuilder::CreateDefaultConvDimensionNumbers(
           /*num_spatial_dims=*/3));
   builder.Transpose(forward_conv, {0, 1, 2, 3, 4});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_);
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index ece7c3b05e7fafa299db7f9cbf50610c8204f95e..155fbacf58d81cff27939c142c8f30158cef4e00 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -246,7 +247,7 @@ XLA_TEST_F(CopyOpClientTest, Copy0x0) {
   Shape out_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {1, 0});
   auto empty = Literal::CreateFromShape(in_shape);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto param0 = builder.Parameter(0, in_shape, "input");
   auto input_data = client_->TransferToServer(*empty).ConsumeValueOrDie();
 
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 2d847a66b0ae7c8f09fa0cb181a4c84ea99be5b1..b43d5c9ff5d75ee0e1b3c9ceb2bc295e631ac107 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -134,9 +134,9 @@ class CustomCallClientAPITest : public ClientLibraryTestBase {};
 // When using the client API, CustomCall targets can't begin with '$' -- these
 // are reserved for internal use.
 XLA_TEST_F(CustomCallClientAPITest, IllegalCustomCallTarget) {
-  ComputationBuilder builder(client_, TestName());
-  auto call = builder.CustomCall("$illegal", /*operands=*/{},
-                                 ShapeUtil::MakeShape(F32, {1}));
+  XlaBuilder builder(TestName());
+  builder.CustomCall("$illegal", /*operands=*/{},
+                     ShapeUtil::MakeShape(F32, {1}));
 
   StatusOr<std::unique_ptr<GlobalData>> result =
       Execute(&builder, /*arguments=*/{});
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index fe5621e8dc209d6113e74030444c198716d355dc..c76e5aabf4b8a3463b2971654d0a6cf0dd594626 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -16,9 +16,10 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -36,9 +37,8 @@ class DeallocationTest : public ClientLibraryTestBase {
   // Build and execute the given computation then verify the results can be
   // transferred from the device successfully.
   std::unique_ptr<GlobalData> ExecuteAndCheckTransfer(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
-    Computation computation = builder->Build().ConsumeValueOrDie();
+      XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    XlaComputation computation = builder->Build().ConsumeValueOrDie();
     auto global_data =
         client_->Execute(computation, arguments, &execution_options_)
             .ConsumeValueOrDie();
@@ -48,7 +48,7 @@ class DeallocationTest : public ClientLibraryTestBase {
 };
 
 TEST_F(DeallocationTest, DeallocateScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0);
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
 
@@ -66,7 +66,7 @@ TEST_F(DeallocationTest, DeallocateScalar) {
 }
 
 TEST_F(DeallocationTest, DeallocateVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
 
@@ -79,7 +79,7 @@ TEST_F(DeallocationTest, DeallocateVector) {
 }
 
 TEST_F(DeallocationTest, DeallocateEmptyVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>({});
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
 
@@ -92,7 +92,7 @@ TEST_F(DeallocationTest, DeallocateEmptyVector) {
 }
 
 XLA_TEST_F(DeallocationTest, DeallocateTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Tuple({builder.ConstantR0<float>(42.0),
                  builder.ConstantR1<float>({1.0, 2.0, 3.0})});
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
@@ -106,7 +106,7 @@ XLA_TEST_F(DeallocationTest, DeallocateTuple) {
 }
 
 XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto element = builder.ConstantR0<float>(42.0);
   auto inner_tuple = builder.Tuple({builder.ConstantR0<float>(42.0), element});
   builder.Tuple({element, inner_tuple, element});
@@ -121,7 +121,7 @@ XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) {
 }
 
 XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto inner_tuple =
       builder.Tuple({builder.ConstantR0<float>(42.0),
                      builder.ConstantR1<float>({1.0, 2.0, 3.0})});
diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
index 032c06cd3c9f872f57674d3d7b5adc201c91ea77..d0ada2474830390e50a90c4c41aa42166d6e8ea5 100644
--- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
@@ -17,9 +17,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -42,9 +43,8 @@ class DeconstructTupleTest : public ClientLibraryTestBase {
   // Build and execute the given computation then verify the results can be
   // transferred from the device successfully.
   std::unique_ptr<GlobalData> ExecuteAndCheckTransfer(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
-    Computation computation = builder->Build().ConsumeValueOrDie();
+      XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    XlaComputation computation = builder->Build().ConsumeValueOrDie();
     auto global_data =
         client_->Execute(computation, arguments, &execution_options_)
             .ConsumeValueOrDie();
@@ -54,7 +54,7 @@ class DeconstructTupleTest : public ClientLibraryTestBase {
 };
 
 TEST_F(DeconstructTupleTest, DeconstructTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto const1 = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto const2 = builder.ConstantR1<float>({2.0, 4.0, 6.0, 8.0});
   builder.Tuple({const1, const2});
@@ -73,7 +73,7 @@ TEST_F(DeconstructTupleTest, DeconstructTuple) {
 }
 
 TEST_F(DeconstructTupleTest, DeconstructTupleTwice) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto const1 = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto const2 = builder.ConstantR1<float>({2.0, 4.0, 6.0, 8.0});
   builder.Tuple({const1, const2});
@@ -103,7 +103,7 @@ TEST_F(DeconstructTupleTest, DeconstructTupleTwice) {
 }
 
 XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto const1 = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto const2 = builder.ConstantR1<float>({2.0, 4.0, 6.0, 8.0});
   builder.Tuple({const1, const2, const2, const1});
@@ -129,7 +129,7 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) {
 }
 
 TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto const1 = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto const2 = builder.ConstantR1<float>({2.0, 4.0, 6.0, 8.0});
   builder.Tuple({const1, const2, const1});
@@ -159,7 +159,7 @@ TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) {
 }
 
 TEST_F(DeconstructTupleTest, DeconstructNonTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
 
@@ -170,7 +170,7 @@ TEST_F(DeconstructTupleTest, DeconstructNonTuple) {
 }
 
 XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({3.14f, -100.25f});
   std::unique_ptr<GlobalData> param0_data =
@@ -186,7 +186,7 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) {
 }
 
 XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto const1 = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
   auto const2 = builder.ConstantR1<float>({2.0, 4.0, 6.0, 8.0});
   builder.Tuple({builder.Tuple({const1, const2}), const1});
@@ -195,7 +195,7 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) {
   auto result_status = client_->DeconstructTuple(*global_data);
   EXPECT_FALSE(result_status.ok());
   EXPECT_THAT(result_status.status().error_message(),
-              HasSubstr("deconstructing nested tuples not yet supported"));
+              HasSubstr("Deconstructing nested tuples is not implemented"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/deep_graph_test.cc b/tensorflow/compiler/xla/tests/deep_graph_test.cc
index 1da7a96fe2388eabd647a72aac81bdf2ef5bb6c6..085a5105aca1c173a7cbc211aebbeb5b254b0753 100644
--- a/tensorflow/compiler/xla/tests/deep_graph_test.cc
+++ b/tensorflow/compiler/xla/tests/deep_graph_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 
 namespace xla {
@@ -22,12 +23,12 @@ TEST_F(ClientLibraryTestBase, DeepGraph) {
   // intended to track, we need to set kDepth to 20000.
   // Unfortunately, setting it that high causes the test to time out.
   const int kDepth = 200;
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle x;
-  ComputationDataHandle y;
+  XlaBuilder b(TestName());
+  XlaOp x;
+  XlaOp y;
   auto x_data = CreateR0Parameter<int32>(3, 0, "x", &b, &x);
   auto y_data = CreateR0Parameter<int32>(1, 1, "y", &b, &y);
-  ComputationDataHandle z = x;
+  XlaOp z = x;
   for (int i = 0; i < kDepth; ++i) {
     z = b.Add(z, y);
   }
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 6b0c04c2c083bbfce267dd92d24ef15c06186d26..6b3efba4f80e45d230d3df9274d0fd40c6fb8c42 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -34,169 +34,219 @@ limitations under the License.
 namespace xla {
 namespace {
 
-// TODO(b/34468543): use GUnit typed tests when we can do all tests on all
-// backends.
 class DotOperationTest : public ClientLibraryTestBase {
  public:
   ErrorSpec error_spec_{0.0001, 1e-5};
-
- protected:
-  template <typename Element>
-  void TestOneElementVectorDot();
-  template <typename Element>
-  void TestVectorDot();
-  template <typename Element>
-  void TestSquareMatrixDot(bool lhs_row_major = false,
-                           bool rhs_row_major = false);
-  template <typename Element>
-  void TestNonsquareMatrixDot(bool lhs_row_major = false,
-                              bool rhs_row_major = false);
 };
 
-XLA_TEST_F(DotOperationTest, ZeroElementVectorDotF32) {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR1<float>({});
-  auto rhs = builder.ConstantR1<float>({});
+#if defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64)
+using TypesF16F32 = ::testing::Types<float>;
+using TypesF16F32F64 = ::testing::Types<float>;
+using TypesF16F32F64CF64 = ::testing::Types<float>;
+#elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64)
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64 = ::testing::Types<Eigen::half, float, double>;
+using TypesF16F32F64CF64 =
+    ::testing::Types<Eigen::half, float, double, complex64>;
+#elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) &&    \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX)
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64CF64 = ::testing::Types<Eigen::half, float>;
+#else
+#error "Situation not handled yet"
+#endif
+
+// Check that we can safely pass an input tuple's elements to a dot operation.
+TEST_F(DotOperationTest, DotOfInputTupleElem) {
+  XlaBuilder builder(TestName());
+
+  XlaOp param;
+  auto param_data = CreateParameterAndTransferLiteral(
+      0,
+      *Literal::MakeTuple({Literal::CreateR2<float>({{1, 2}, {3, 4}}).get(),
+                           Literal::CreateR2<float>({{5, 6}, {7, 8}}).get()}),
+      "arg0", &builder, &param);
+  auto lhs = builder.GetTupleElement(param, 0);
+  auto rhs = builder.GetTupleElement(param, 1);
+  builder.Dot(lhs, rhs);
+
+  ComputeAndCompareLiteral(&builder,
+                           *Literal::CreateR2<float>({{19, 22}, {43, 50}}),
+                           {param_data.get()});
+}
+
+template <typename T>
+class DotOperationTest_F16F32F64CF64 : public DotOperationTest {};
+TYPED_TEST_CASE(DotOperationTest_F16F32F64CF64, TypesF16F32F64CF64);
+
+XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, ZeroElementVectorDot) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+
+  auto lhs = builder.ConstantR1<T>({});
+  auto rhs = builder.ConstantR1<T>({});
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR0<float>(&builder, 0.0, {}, error_spec_);
+  this->template ComputeAndCompareR0<T>(&builder, static_cast<T>(0.0), {},
+                                        this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, TrivialMatrixVectorDotF32) {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR2<float>({{3.0, 4.0}});
-  auto rhs = builder.ConstantR1<float>({3.0, 4.0});
-  auto result = builder.Dot(lhs, rhs);
-
-  ComputeAndCompareR1<float>(&builder, {25.0}, {}, error_spec_);
-}
+template <typename T>
+class DotOperationTest_F16F32F64 : public DotOperationTest {};
+TYPED_TEST_CASE(DotOperationTest_F16F32F64, TypesF16F32F64);
 
-template <typename Element>
-void DotOperationTest::TestOneElementVectorDot() {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR1<Element>({2.0});
-  auto rhs = builder.ConstantR1<Element>({3.0});
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, TrivialMatrixVectorDot) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantR2FromArray2D<T>({{3.0f, 4.0f}});
+  auto rhs = builder.ConstantFromArray<T>({3.0f, 4.0f});
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR0<Element>(&builder, 6.0, {}, error_spec_);
+  this->template ComputeAndCompareR1<T>(&builder, {static_cast<T>(25.0f)}, {},
+                                        this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, OneElementVectorDotF32) {
-  TestOneElementVectorDot<float>();
-}
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, OneElementVectorDot) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantR1<T>({static_cast<T>(2.0f)});
+  auto rhs = builder.ConstantR1<T>({static_cast<T>(3.0f)});
+  auto result = builder.Dot(lhs, rhs);
 
-XLA_TEST_F(DotOperationTest, OneElementVectorDotF64) {
-  TestOneElementVectorDot<double>();
+  this->template ComputeAndCompareR0<T>(&builder, static_cast<T>(6.0f), {},
+                                        this->error_spec_);
 }
 
-template <typename Element>
-void DotOperationTest::TestVectorDot() {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR1<Element>({1.0, 2.5, 42.0});
-  auto rhs = builder.ConstantR1<Element>({11.0, -1.0, 0.5});
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, VectorDot) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantFromArray<T>({1.0f, 2.5f, 42.0f});
+  auto rhs = builder.ConstantFromArray<T>({11.0f, -1.0f, 0.5f});
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR0<Element>(&builder, 29.5, {}, error_spec_);
+  this->template ComputeAndCompareR0<T>(&builder, static_cast<T>(29.5f), {},
+                                        this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, VectorDotF32) { TestVectorDot<float>(); }
-
-XLA_TEST_F(DotOperationTest, VectorDotF64) { TestVectorDot<double>(); }
-
-namespace {
-
 std::vector<int64> MinorToMajorForIsRowMajor(bool row_major) {
   return {row_major ? 1 : 0, row_major ? 0 : 1};
 }
 
-}  // namespace
-
-XLA_TEST_F(DotOperationTest, Dot_0x2_2x0) {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
-  auto rhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(2, 0));
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x0) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantR2FromArray2D<T>(Array2D<T>(0, 2));
+  auto rhs = builder.ConstantR2FromArray2D<T>(Array2D<T>(2, 0));
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {}, error_spec_);
+  this->template ComputeAndCompareR2<T>(&builder, Array2D<T>(0, 0), {},
+                                        this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, Dot_0x2_2x3) {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
-  auto rhs = builder.ConstantR2<float>({{7.0, 8.0, 9.0}, {42.0, 77.0, 101.0}});
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x3) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantR2FromArray2D<T>(Array2D<T>(0, 2));
+  auto rhs = builder.ConstantR2FromArray2D<T>(
+      {{7.0f, 8.0f, 9.0f}, {42.0f, 77.0f, 101.0f}});
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 3), {}, error_spec_);
+  this->template ComputeAndCompareR2<T>(&builder, Array2D<T>(0, 3), {},
+                                        this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, Dot_3x2_2x0) {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs =
-      builder.ConstantR2<float>({{7.0, 8.0}, {9.0, 42.0}, {77.0, 101.0}});
-  auto rhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(2, 0));
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_3x2_2x0) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantR2FromArray2D<T>(
+      {{7.0f, 8.0f}, {9.0f, 42.0f}, {77.0f, 101.0f}});
+  auto rhs = builder.ConstantR2FromArray2D<T>(Array2D<T>(2, 0));
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(3, 0), {}, error_spec_);
+  this->template ComputeAndCompareR2<T>(&builder, Array2D<T>(3, 0), {},
+                                        this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, Dot_2x0_0x2) {
-  ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(2, 0));
-  auto rhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_2x0_0x2) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto lhs = builder.ConstantR2FromArray2D<T>(Array2D<T>(2, 0));
+  auto rhs = builder.ConstantR2FromArray2D<T>(Array2D<T>(0, 2));
   auto result = builder.Dot(lhs, rhs);
 
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 2, 0.0f), {},
-                             error_spec_);
+  this->template ComputeAndCompareR2<T>(
+      &builder, Array2D<T>(2, 2, static_cast<T>(0.0f)), {}, this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, FusedDot) {
-  ComputationBuilder builder(client_, TestName());
-  auto param0 = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 4}), "arg0");
-  auto param1 = builder.Parameter(1, ShapeUtil::MakeShape(F32, {4, 1}), "arg1");
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, FusedDot) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto param0 =
+      builder.Parameter(0, ShapeUtil::MakeShapeWithType<T>({2, 4}), "arg0");
+  auto param1 =
+      builder.Parameter(1, ShapeUtil::MakeShapeWithType<T>({4, 1}), "arg1");
   auto exp0 = builder.Exp(param0);
   auto result = builder.Dot(exp0, param1);
 
-  auto lhs_handle = client_
-                        ->TransferToServer(*Literal::CreateR2<float>(
-                            {{1.0, 2.0, 3.0, 4.0}, {-1.0, -2.0, -3.0, -4.0}}))
-                        .ConsumeValueOrDie();
-  auto rhs_handle = client_
-                        ->TransferToServer(*Literal::CreateR2<float>(
-                            {{1.0}, {2.0}, {3.0}, {4.0}}))
-                        .ConsumeValueOrDie();
-
-  ComputeAndCompareR2<float>(
-      &builder, Array2D<float>({{296.14560492846033}, {0.8611737683031964}}),
-      {lhs_handle.get(), rhs_handle.get()}, error_spec_);
-}
-
-template <typename Element>
-void DotOperationTest::TestSquareMatrixDot(bool lhs_row_major,
-                                           bool rhs_row_major) {
   auto lhs_handle =
-      client_
-          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
-              {{1.0, 2.0}, {3.0, -4.0}},
-              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major))))
-          .ConsumeValueOrDie();
-  auto rhs_handle =
-      client_
-          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
-              {{1.0, 6.0}, {7.0, -4.0}},
-              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major))))
+      this->client_
+          ->TransferToServer(*Literal::CreateR2FromArray2D<T>(
+              {{1.0f, 2.0f, 3.0f, 4.0f}, {-1.0f, -2.0f, -3.0f, -4.0f}}))
           .ConsumeValueOrDie();
+  auto rhs_handle = this->client_
+                        ->TransferToServer(*Literal::CreateR2FromArray2D<T>(
+                            {{1.0f}, {2.0f}, {3.0f}, {4.0f}}))
+                        .ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
-  auto prim_type = primitive_util::NativeToPrimitiveType<Element>();
-  auto result = builder.Dot(
-      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}), "lhs"),
-      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {2, 2}), "rhs"));
+  if (std::is_same<Eigen::half, T>::value) {
+    this->error_spec_ = ErrorSpec{0.0001, 1e-3};
+  }
 
-  Array2D<Element> expected({{15.0, -2.0}, {-25.0, 34.0}});
-  ComputeAndCompareR2<Element>(
-      &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
+  this->template ComputeAndCompareR2<T>(
+      &builder, Array2D<T>({{296.14560492846033f}, {0.8611737683031964f}}),
+      {lhs_handle.get(), rhs_handle.get()}, this->error_spec_);
 }
 
+template <typename T>
+class SquareMatrixDot : public DotOperationTest {
+ public:
+  void TestImpl(bool lhs_row_major, bool rhs_row_major) {
+    auto lhs_handle =
+        client_
+            ->TransferToServer(*Literal::CreateFromArrayWithLayout<T>(
+                {{1.0f, 2.0f}, {3.0f, -4.0f}},
+                LayoutUtil::MakeLayout(
+                    MinorToMajorForIsRowMajor(lhs_row_major))))
+            .ConsumeValueOrDie();
+    auto rhs_handle =
+        client_
+            ->TransferToServer(*Literal::CreateFromArrayWithLayout<T>(
+                {{1.0f, 6.0f}, {7.0f, -4.0f}},
+                LayoutUtil::MakeLayout(
+                    MinorToMajorForIsRowMajor(rhs_row_major))))
+            .ConsumeValueOrDie();
+    XlaBuilder builder(TestName());
+    auto prim_type = primitive_util::NativeToPrimitiveType<T>();
+    auto result = builder.Dot(
+        builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}), "lhs"),
+        builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {2, 2}), "rhs"));
+
+    Array2D<T> expected({{15.0f, -2.0f}, {-25.0f, 34.0f}});
+    ComputeAndCompareR2<T>(&builder, expected,
+                           {lhs_handle.get(), rhs_handle.get()}, error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(SquareMatrixDot, TypesF16F32F64CF64);
+XLA_TYPED_TEST(SquareMatrixDot, TypesFF) { this->TestImpl(false, false); }
+XLA_TYPED_TEST(SquareMatrixDot, TypesFT) { this->TestImpl(false, true); }
+XLA_TYPED_TEST(SquareMatrixDot, TypesTF) { this->TestImpl(true, false); }
+XLA_TYPED_TEST(SquareMatrixDot, TypesTT) { this->TestImpl(true, true); }
+
 struct DotTestParam {
   int m;
   int k;
@@ -225,58 +275,73 @@ string PrintDotTestParam(
 }
 
 class ParametricDotTest : public DotOperationTest,
-                          public ::testing::WithParamInterface<DotTestParam> {};
+                          public ::testing::WithParamInterface<DotTestParam> {
+ protected:
+  template <typename NativeT>
+  void TestImpl();
+};
 
-XLA_TEST_P(ParametricDotTest, TestF32) {
+template <typename NativeT>
+void ParametricDotTest::TestImpl() {
   DotTestParam param = GetParam();
 
-  std::unique_ptr<Array2D<float>> dot_lhs_data =
-      MakeLinspaceArray2D(0.0, 1.0, param.m, param.k);
+  std::unique_ptr<Array2D<NativeT>> dot_lhs_data =
+      MakeLinspaceArray2D<NativeT>(0.0, 1.0, param.m, param.k);
   std::unique_ptr<Literal> dot_lhs_lit = Literal::CreateR2FromArray2DWithLayout(
       *dot_lhs_data, LayoutUtil::MakeLayout(
                          MinorToMajorForIsRowMajor(param.dot_lhs_row_major)));
   std::unique_ptr<GlobalData> dot_lhs_handle =
       client_->TransferToServer(*dot_lhs_lit).ConsumeValueOrDie();
 
-  std::unique_ptr<Array2D<float>> dot_rhs_data =
-      MakeLinspaceArray2D(0.0, 1.0, param.k, param.n);
-  std::unique_ptr<Literal> dot_rhs_lit = Literal::CreateR2FromArray2DWithLayout(
-      *dot_rhs_data, LayoutUtil::MakeLayout(
-                         MinorToMajorForIsRowMajor(param.dot_rhs_row_major)));
+  std::unique_ptr<Array2D<NativeT>> dot_rhs_data =
+      MakeLinspaceArray2D<NativeT>(0.0, 1.0, param.k, param.n);
+  Layout rhs_layout = LayoutUtil::MakeLayout(
+      MinorToMajorForIsRowMajor(param.dot_rhs_row_major));
+  std::unique_ptr<Literal> dot_rhs_lit =
+      Literal::CreateR2FromArray2DWithLayout(*dot_rhs_data, rhs_layout);
   std::unique_ptr<GlobalData> dot_rhs_handle =
       client_->TransferToServer(*dot_rhs_lit).ConsumeValueOrDie();
 
-  std::unique_ptr<Array2D<float>> addend_data;
+  std::unique_ptr<Array2D<NativeT>> addend_data;
   std::unique_ptr<Literal> addend_lit;
   std::unique_ptr<GlobalData> addend_handle;
 
   if (param.has_addend) {
-    addend_data = MakeLinspaceArray2D(0.0, 1.0, param.m, param.n);
+    addend_data = MakeLinspaceArray2D<NativeT>(0.0, 1.0, param.m, param.n);
     addend_lit = Literal::CreateR2FromArray2DWithLayout(
         *addend_data, LayoutUtil::MakeLayout(
                           MinorToMajorForIsRowMajor(param.addend_row_major)));
     addend_handle = client_->TransferToServer(*addend_lit).ConsumeValueOrDie();
   }
 
-  ComputationBuilder builder(client_, TestName());
-  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+  XlaBuilder builder(TestName());
+  auto prim_type = primitive_util::NativeToPrimitiveType<NativeT>();
   auto result = builder.Dot(
-      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {param.m, param.k}),
+      builder.Parameter(0,
+                        ShapeUtil::MakeShapeWithLayout(
+                            prim_type, {param.m, param.k},
+                            MinorToMajorForIsRowMajor(param.dot_lhs_row_major)),
                         "dot_lhs"),
-      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {param.k, param.n}),
+      builder.Parameter(1,
+                        ShapeUtil::MakeShapeWithLayout(
+                            prim_type, {param.k, param.n},
+                            MinorToMajorForIsRowMajor(param.dot_rhs_row_major)),
                         "dot_rhs"));
 
   if (param.has_addend) {
     result = builder.Add(
-        result,
-        builder.Parameter(
-            2, ShapeUtil::MakeShape(prim_type, {param.m, param.n}), "addend"));
+        result, builder.Parameter(
+                    2,
+                    ShapeUtil::MakeShapeWithLayout(
+                        prim_type, {param.m, param.n},
+                        MinorToMajorForIsRowMajor(param.addend_row_major)),
+                    "addend"));
   }
 
-  std::unique_ptr<Array2D<float>> expected;
+  std::unique_ptr<Array2D<NativeT>> expected;
   if (param.has_addend) {
     expected = ReferenceUtil::ApplyElementwise2D(
-        std::plus<float>(),
+        std::plus<NativeT>(),
         *ReferenceUtil::MatmulArray2D(*dot_lhs_data, *dot_rhs_data),
         *addend_data);
   } else {
@@ -287,8 +352,11 @@ XLA_TEST_P(ParametricDotTest, TestF32) {
   if (param.has_addend) {
     args.push_back(addend_handle.get());
   }
-
-  ComputeAndCompareR2<float>(&builder, *expected, args, ErrorSpec(0.3, 3e-3));
+  ErrorSpec error_spec(0.3, 3e-3);
+  if (std::is_same<Eigen::half, NativeT>::value) {
+    error_spec = ErrorSpec(0.3, 5e-3);
+  }
+  ComputeAndCompareR2<NativeT>(&builder, *expected, args, error_spec);
 }
 
 std::vector<DotTestParam> CreateDotTestParameters() {
@@ -305,30 +373,77 @@ std::vector<DotTestParam> CreateDotTestParameters() {
     }
   };
 
+  add_matrix_matrix_dot_test(/*m=*/12, /*k=*/117, /*n=*/7);
+  add_matrix_matrix_dot_test(/*m=*/270, /*k=*/270, /*n=*/520);
+  add_matrix_matrix_dot_test(/*m=*/260, /*k=*/3, /*n=*/520);
+
+  return params;
+}
+
+#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+XLA_TEST_P(ParametricDotTest, TestF16) { TestImpl<Eigen::half>(); }
+#endif
+XLA_TEST_P(ParametricDotTest, TestF32) { TestImpl<float>(); }
+XLA_TEST_P(ParametricDotTest, TestF64) { TestImpl<double>(); }
+
+INSTANTIATE_TEST_CASE_P(DotTests, ParametricDotTest,
+                        ::testing::ValuesIn(CreateDotTestParameters()),
+                        PrintDotTestParam);
+
+class ParametricDotTestWithoutLayoutAssignment : public ParametricDotTest {
+ public:
+  ParametricDotTestWithoutLayoutAssignment() {
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "layout-assignment");
+  }
+};
+
+std::vector<DotTestParam> CreateNoLayoutAssignmentDotTestParameters() {
+  std::vector<DotTestParam> params;
+
   auto add_matrix_vector_dot_test = [&](int k, int n) {
-    for (bool has_addend : {false, true}) {
-      params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
-                        /*dot_lhs_row_major=*/true, /*dot_rhs_row_major=*/true,
-                        /*has_addend=*/has_addend, /*addend_row_major=*/true});
-      if (n != 1) {
-        params.push_back(
-            {/*m=*/n, /*k=*/k, /*n=*/1,
-             /*dot_lhs_row_major=*/true, /*dot_rhs_row_major=*/true,
-             /*has_addend=*/has_addend, /*addend_row_major=*/true});
+    for (bool lhs_row_major : {true, false}) {
+      for (bool rhs_row_major : {true, false}) {
+        for (bool has_addend : {true, false}) {
+          params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
+                            /*dot_lhs_row_major=*/lhs_row_major,
+                            /*dot_rhs_row_major=*/rhs_row_major,
+                            /*has_addend=*/has_addend,
+                            /*addend_row_major=*/true});
+          if (has_addend) {
+            params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
+                              /*dot_lhs_row_major=*/lhs_row_major,
+                              /*dot_rhs_row_major=*/rhs_row_major,
+                              /*has_addend=*/has_addend,
+                              /*addend_row_major=*/false});
+          }
+          if (n != 1) {
+            params.push_back({/*m=*/n, /*k=*/k, /*n=*/1,
+                              /*dot_lhs_row_major=*/lhs_row_major,
+                              /*dot_rhs_row_major=*/rhs_row_major,
+                              /*has_addend=*/has_addend,
+                              /*addend_row_major=*/true});
+            if (has_addend) {
+              params.push_back({/*m=*/n, /*k=*/k, /*n=*/1,
+                                /*dot_lhs_row_major=*/lhs_row_major,
+                                /*dot_rhs_row_major=*/rhs_row_major,
+                                /*has_addend=*/has_addend,
+                                /*addend_row_major=*/false});
+            }
+          }
+        }
       }
     }
   };
 
-  add_matrix_matrix_dot_test(/*m=*/12, /*k=*/117, /*n=*/7);
-  add_matrix_matrix_dot_test(/*m=*/270, /*k=*/270, /*n=*/520);
-  add_matrix_matrix_dot_test(/*m=*/260, /*k=*/3, /*n=*/520);
-
   add_matrix_vector_dot_test(/*k=*/8, /*n=*/8);
   add_matrix_vector_dot_test(/*k=*/130, /*n=*/8);
   add_matrix_vector_dot_test(/*k=*/8, /*n=*/130);
   add_matrix_vector_dot_test(/*k=*/290, /*n=*/130);
   add_matrix_vector_dot_test(/*k=*/1, /*n=*/1);
   add_matrix_vector_dot_test(/*k=*/1, /*n=*/16);
+  add_matrix_vector_dot_test(/*k=*/1, /*n=*/4);
+  add_matrix_vector_dot_test(/*k=*/1, /*n=*/3);
   add_matrix_vector_dot_test(/*k=*/3, /*n=*/16);
   add_matrix_vector_dot_test(/*k=*/3, /*n=*/3);
   add_matrix_vector_dot_test(/*k=*/29, /*n=*/29);
@@ -339,109 +454,60 @@ std::vector<DotTestParam> CreateDotTestParameters() {
   return params;
 }
 
-INSTANTIATE_TEST_CASE_P(DotTests, ParametricDotTest,
-                        ::testing::ValuesIn(CreateDotTestParameters()),
-                        PrintDotTestParam);
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
-  TestSquareMatrixDot<float>(false, false);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFT) {
-  TestSquareMatrixDot<float>(false, true);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTF) {
-  TestSquareMatrixDot<float>(true, false);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
-  TestSquareMatrixDot<float>(true, true);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFF) {
-  TestSquareMatrixDot<complex64>(false, false);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFT) {
-  TestSquareMatrixDot<complex64>(false, true);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTF) {
-  TestSquareMatrixDot<complex64>(true, false);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTT) {
-  TestSquareMatrixDot<complex64>(true, true);
-}
-
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF64) {
-  TestSquareMatrixDot<double>();
-}
-
-template <typename Element>
-void DotOperationTest::TestNonsquareMatrixDot(bool lhs_row_major,
-                                              bool rhs_row_major) {
-  auto lhs_handle =
-      client_
-          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
-              {{1.0, 2.0, 3.0}, {3.0, -4.0, -1.0}},
-              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major))))
-          .ConsumeValueOrDie();
-  auto rhs_handle =
-      client_
-          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
-              {{1.0, 6.0}, {2.0, 3.0}, {7.0, -4.0}},
-              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major))))
-          .ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto prim_type = primitive_util::NativeToPrimitiveType<Element>();
-  auto result = builder.Dot(
-      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 3}), "lhs"),
-      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {3, 2}), "rhs"));
-
-  Array2D<Element> expected({{26.0, 0.0}, {-12.0, 10.0}});
-
-  ComputeAndCompareR2<Element>(
-      &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
-}
-
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorFF) {
-  TestNonsquareMatrixDot<float>(false, false);
-}
-
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorFT) {
-  TestNonsquareMatrixDot<float>(false, true);
-}
-
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorTF) {
-  TestNonsquareMatrixDot<float>(true, false);
-}
-
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorTT) {
-  TestNonsquareMatrixDot<float>(true, true);
+#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+XLA_TEST_P(ParametricDotTestWithoutLayoutAssignment, TestF16) {
+  TestImpl<Eigen::half>();
 }
-
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF64) {
-  TestNonsquareMatrixDot<double>();
+#endif
+XLA_TEST_P(ParametricDotTestWithoutLayoutAssignment, TestF32) {
+  TestImpl<float>();
 }
-
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorFF) {
-  TestNonsquareMatrixDot<complex64>(false, false);
+XLA_TEST_P(ParametricDotTestWithoutLayoutAssignment, TestF64) {
+  TestImpl<double>();
 }
 
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorFT) {
-  TestNonsquareMatrixDot<complex64>(false, true);
-}
+INSTANTIATE_TEST_CASE_P(
+    DotTests, ParametricDotTestWithoutLayoutAssignment,
+    ::testing::ValuesIn(CreateNoLayoutAssignmentDotTestParameters()),
+    PrintDotTestParam);
 
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorTF) {
-  TestNonsquareMatrixDot<complex64>(true, false);
-}
+template <typename T>
+class NonsquareMatrixDot : public DotOperationTest {
+ public:
+  void TestImpl(bool lhs_row_major, bool rhs_row_major) {
+    auto lhs_handle =
+        client_
+            ->TransferToServer(*Literal::CreateFromArrayWithLayout<T>(
+                {{1.0f, 2.0f, 3.0f}, {3.0f, -4.0f, -1.0f}},
+                LayoutUtil::MakeLayout(
+                    MinorToMajorForIsRowMajor(lhs_row_major))))
+            .ConsumeValueOrDie();
+    auto rhs_handle =
+        client_
+            ->TransferToServer(*Literal::CreateFromArrayWithLayout<T>(
+                {{1.0f, 6.0f}, {2.0f, 3.0f}, {7.0f, -4.0f}},
+                LayoutUtil::MakeLayout(
+                    MinorToMajorForIsRowMajor(rhs_row_major))))
+            .ConsumeValueOrDie();
+
+    XlaBuilder builder(TestName());
+    auto prim_type = primitive_util::NativeToPrimitiveType<T>();
+    auto result = builder.Dot(
+        builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 3}), "lhs"),
+        builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {3, 2}), "rhs"));
+
+    Array2D<T> expected({{26.0f, 0.0f}, {-12.0f, 10.0f}});
+
+    ComputeAndCompareR2<T>(&builder, expected,
+                           {lhs_handle.get(), rhs_handle.get()}, error_spec_);
+  }
+};
 
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorTT) {
-  TestNonsquareMatrixDot<complex64>(true, true);
-}
+TYPED_TEST_CASE(NonsquareMatrixDot, TypesF16F32F64CF64);
+XLA_TYPED_TEST(NonsquareMatrixDot, TestFF) { this->TestImpl(false, false); }
+XLA_TYPED_TEST(NonsquareMatrixDot, TestFT) { this->TestImpl(false, true); }
+XLA_TYPED_TEST(NonsquareMatrixDot, TestTF) { this->TestImpl(true, false); }
+XLA_TYPED_TEST(NonsquareMatrixDot, TestTT) { this->TestImpl(true, true); }
 
 XLA_TEST_F(DotOperationTest, MatrixVectorC64) {
   auto lhs_handle =
@@ -456,7 +522,7 @@ XLA_TEST_F(DotOperationTest, MatrixVectorC64) {
               LayoutUtil::MakeLayout({1, 0})))
           .ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto prim_type = primitive_util::NativeToPrimitiveType<complex64>();
   auto result = builder.Dot(
       builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {1, 4}), "lhs"),
@@ -468,31 +534,41 @@ XLA_TEST_F(DotOperationTest, MatrixVectorC64) {
       &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, ConcurrentMatMul) {
-  ComputationBuilder builder(client_, TestName());
-  auto matrix1 = builder.ConstantR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  auto matrix2 = builder.ConstantR2<float>({{5.0, 6.0}, {7.0, 8.0}});
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, ConcurrentMatMult) {
+  using T = TypeParam;
+
+  XlaBuilder builder(this->TestName());
+  auto matrix1 = builder.ConstantR2FromArray2D<T>({{1.0f, 2.0f}, {3.0f, 4.0f}});
+  auto matrix2 = builder.ConstantR2FromArray2D<T>({{5.0f, 6.0f}, {7.0f, 8.0f}});
   auto matrix12 = builder.Dot(matrix1, matrix2);
   auto matrix21 = builder.Dot(matrix2, matrix1);
   builder.Add(matrix12, matrix21);
 
-  Array2D<float> expected({{42.0, 56.0}, {74.0, 96.0}});
-  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+  Array2D<T> expected({{42.0f, 56.0f}, {74.0f, 96.0f}});
+  this->template ComputeAndCompareR2<T>(&builder, expected, {},
+                                        this->error_spec_);
 }
 
+template <typename T>
+class DotOperationTestForBatchMatMul : public DotOperationTest {};
+TYPED_TEST_CASE(DotOperationTestForBatchMatMul, TypesF16F32F64);
+
 // Regression test for b/32055648. The root of the graph is a kFusion of 4
 // bitcasts. Although bitcasts don't map to thunks, the root should still be
 // sync-dependent on bitcasts' operands.
-XLA_TEST_F(DotOperationTest, BatchMatMul) {
-  ComputationBuilder builder(client_, TestName());
-  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "x");
-  auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "y");
+XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) {
+  using T = TypeParam;
+  XlaBuilder builder(this->TestName());
+  auto x =
+      builder.Parameter(0, ShapeUtil::MakeShapeWithType<T>({2, 2, 2, 2}), "x");
+  auto y =
+      builder.Parameter(1, ShapeUtil::MakeShapeWithType<T>({2, 2, 2, 2}), "y");
 
   auto x_flat = builder.Reshape(x, {0, 1, 2, 3}, {4, 2, 2});
   auto y_flat = builder.Reshape(y, {0, 1, 2, 3}, {4, 2, 2});
 
   // Slice batches into individual matrices and multiply them.
-  std::vector<xla::ComputationDataHandle> out_slices;
+  std::vector<XlaOp> out_slices;
   for (int i = 0; i < 4; ++i) {
     // Slice off individual matrices and reshape to 2D tensors.
     auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
@@ -507,29 +583,42 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
   auto out_flat = builder.ConcatInDim(out_slices, 0);
   builder.Reshape(out_flat, {0, 1, 2}, {2, 2, 2, 2});
 
-  auto x_data = client_
-                    ->TransferToServer(*Literal::CreateR4<float>(
-                        {{{{1000, 100}, {10, 1}}, {{2000, 200}, {20, 2}}},
-                         {{{3000, 300}, {30, 3}}, {{4000, 400}, {40, 4}}}}))
-                    .ConsumeValueOrDie();
-  auto y_data = client_
-                    ->TransferToServer(*Literal::CreateR4<float>(
-                        {{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}},
-                         {{{11, 22}, {33, 44}}, {{55, 66}, {77, 88}}}}))
+  auto x_data = this->client_
+                    ->TransferToServer(*Literal::CreateR4FromArray4D<T>(
+                        {{{{1000.0f, 100.0f}, {10.0f, 1.0f}},
+                          {{2000.0f, 200.0f}, {20.0f, 2.0f}}},
+                         {{{3000.0f, 300.0f}, {30.0f, 3.0f}},
+                          {{4000.0f, 400.0f}, {40.0f, 4.0f}}}}))
                     .ConsumeValueOrDie();
+  auto y_data =
+      this->client_
+          ->TransferToServer(*Literal::CreateR4FromArray4D<T>(
+              {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}},
+               {{{11.0f, 22.0f}, {33.0f, 44.0f}},
+                {{55.0f, 66.0f}, {77.0f, 88.0f}}}}))
+          .ConsumeValueOrDie();
 
-  ComputeAndCompareR4<float>(
+  if (std::is_same<Eigen::half, T>::value) {
+    this->error_spec_ = ErrorSpec{0.0001, 1e-3};
+  }
+  this->template ComputeAndCompareR4<T>(
       &builder,
       /*expected=*/
-      {{{{1300, 2400}, {13, 24}}, {{11400, 13600}, {114, 136}}},
-       {{{42900, 79200}, {429, 792}}, {{250800, 299200}, {2508, 2992}}}},
-      {x_data.get(), y_data.get()}, error_spec_);
+      {{{{1300.0f, 2400.0f}, {13.0f, 24.0f}},
+        {{11400.0f, 13600.0f}, {114.0f, 136.0f}}},
+       {{{42900.0f, 79200.0f}, {429.0f, 792.0f}},
+        {{250800.0f, 299200.0f}, {2508.0f, 2992.0f}}}},
+      {x_data.get(), y_data.get()}, this->error_spec_);
 }
 
-XLA_TEST_F(DotOperationTest, GeneralMatMul) {
-  ComputationBuilder builder(client_, TestName());
-  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2}), "x");
-  auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2, 2}), "y");
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) {
+  using T = TypeParam;
+
+  XlaBuilder builder(this->TestName());
+  auto x =
+      builder.Parameter(0, ShapeUtil::MakeShapeWithType<T>({2, 2, 2}), "x");
+  auto y =
+      builder.Parameter(1, ShapeUtil::MakeShapeWithType<T>({2, 2, 2}), "y");
 
   DotDimensionNumbers dnums;
   dnums.add_lhs_contracting_dimensions(2);
@@ -539,31 +628,34 @@ XLA_TEST_F(DotOperationTest, GeneralMatMul) {
 
   auto out = builder.DotGeneral(x, y, dnums);
 
-  auto x_data = client_
-                    ->TransferToServer(*Literal::CreateR3<float>(
-                        {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}}))
-                    .ConsumeValueOrDie();
+  auto x_data =
+      this->client_
+          ->TransferToServer(*Literal::CreateR3FromArray3D<T>(
+              {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}))
+          .ConsumeValueOrDie();
 
-  auto y_data = client_
-                    ->TransferToServer(*Literal::CreateR3<float>(
-                        {{{1.0, 0.0}, {0.0, 1.0}}, {{1.0, 0.0}, {0.0, 1.0}}}))
-                    .ConsumeValueOrDie();
+  auto y_data =
+      this->client_
+          ->TransferToServer(*Literal::CreateR3FromArray3D<T>(
+              {{{1.0f, 0.0f}, {0.0f, 1.0f}}, {{1.0f, 0.0f}, {0.0f, 1.0f}}}))
+          .ConsumeValueOrDie();
 
-  ComputeAndCompareR3<float>(
+  this->template ComputeAndCompareR3<T>(
       &builder,
       /*expected=*/
-      {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}},
-      {x_data.get(), y_data.get()}, error_spec_);
+      {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}},
+      {x_data.get(), y_data.get()}, this->error_spec_);
 }
 
-TEST_F(DotOperationTest, TransposeFolding) {
+XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) {
+  using T = TypeParam;
   for (bool transpose_lhs : {false, true}) {
     for (bool transpose_rhs : {false, true}) {
       for (bool row_major : {false, true}) {
-        std::unique_ptr<Array2D<float>> lhs(
-            new Array2D<float>({{1.0, 2.0, 3.0}, {3.0, -4.0, -1.0}}));
-        std::unique_ptr<Array2D<float>> rhs(
-            new Array2D<float>({{1.0, 6.0}, {2.0, 3.0}, {7.0, -4.0}}));
+        std::unique_ptr<Array2D<T>> lhs(
+            new Array2D<T>({{1.0f, 2.0f, 3.0f}, {3.0f, -4.0f, -1.0f}}));
+        std::unique_ptr<Array2D<T>> rhs(
+            new Array2D<T>({{1.0f, 6.0f}, {2.0f, 3.0f}, {7.0f, -4.0f}}));
 
         if (transpose_lhs) {
           lhs = ReferenceUtil::TransposeArray2D(*lhs);
@@ -572,22 +664,20 @@ TEST_F(DotOperationTest, TransposeFolding) {
           rhs = ReferenceUtil::TransposeArray2D(*rhs);
         }
         auto lhs_handle =
-            client_
-                ->TransferToServer(
-                    *Literal::CreateR2FromArray2DWithLayout<float>(
-                        *lhs, LayoutUtil::MakeLayout(
-                                  MinorToMajorForIsRowMajor(row_major))))
+            this->client_
+                ->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<T>(
+                    *lhs, LayoutUtil::MakeLayout(
+                              MinorToMajorForIsRowMajor(row_major))))
                 .ConsumeValueOrDie();
         auto rhs_handle =
-            client_
-                ->TransferToServer(
-                    *Literal::CreateR2FromArray2DWithLayout<float>(
-                        *rhs, LayoutUtil::MakeLayout(
-                                  MinorToMajorForIsRowMajor(row_major))))
+            this->client_
+                ->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<T>(
+                    *rhs, LayoutUtil::MakeLayout(
+                              MinorToMajorForIsRowMajor(row_major))))
                 .ConsumeValueOrDie();
 
-        ComputationBuilder builder(client_, TestName());
-        auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+        XlaBuilder builder(this->TestName());
+        auto prim_type = primitive_util::NativeToPrimitiveType<T>();
         auto lhs_arg = builder.Parameter(
             0, ShapeUtil::MakeShape(prim_type, {lhs->height(), lhs->width()}),
             "lhs");
@@ -602,24 +692,27 @@ TEST_F(DotOperationTest, TransposeFolding) {
         }
         auto result = builder.Dot(lhs_arg, rhs_arg);
 
-        Array2D<float> expected({{26.0, 0.0}, {-12.0, 10.0}});
+        Array2D<T> expected({{26.0f, 0.0f}, {-12.0f, 10.0f}});
         VLOG(1) << "TestTransposeFolding " << transpose_lhs << " "
                 << transpose_rhs << " " << row_major;
-        ComputeAndCompareR2<float>(&builder, expected,
-                                   {lhs_handle.get(), rhs_handle.get()},
-                                   error_spec_);
+        this->template ComputeAndCompareR2<T>(
+            &builder, expected, {lhs_handle.get(), rhs_handle.get()},
+            this->error_spec_);
       }
     }
   }
 }
 
-TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstLHS) {
-  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+XLA_TYPED_TEST(DotOperationTest_F16F32F64,
+               DotOfConcatOptimizationWithConstLHS) {
+  using T = TypeParam;
+  auto prim_type = primitive_util::NativeToPrimitiveType<T>();
 
-  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
-      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<T>> constant_lhs_array(
+      new Array2D<T>({{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
+                      {6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}}));
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(this->TestName());
   auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
   auto rhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}),
                                      "rhs_arg_0");
@@ -630,78 +723,80 @@ TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstLHS) {
   auto result = builder.Dot(
       lhs_constant, builder.ConcatInDim({rhs_arg_0, rhs_arg_1, rhs_arg_2}, 0));
 
-  std::unique_ptr<Array2D<float>> arg_0_value_array(
-      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}}));
-  std::unique_ptr<Array2D<float>> arg_1_value_array(
-      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}));
-  std::unique_ptr<Array2D<float>> arg_2_value_array(
-      new Array2D<float>({{1.0, 2.0}}));
+  std::unique_ptr<Array2D<T>> arg_0_value_array(
+      new Array2D<T>({{1.0f, 2.0f}, {3.0f, 4.0f}}));
+  std::unique_ptr<Array2D<T>> arg_1_value_array(
+      new Array2D<T>({{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}));
+  std::unique_ptr<Array2D<T>> arg_2_value_array(new Array2D<T>({{1.0f, 2.0f}}));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto arg_0_value,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2D<float>(*arg_0_value_array)));
+      this->client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<T>(*arg_0_value_array)));
   TF_ASSERT_OK_AND_ASSIGN(
       auto arg_1_value,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2D<float>(*arg_1_value_array)));
+      this->client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<T>(*arg_1_value_array)));
   TF_ASSERT_OK_AND_ASSIGN(
       auto arg_2_value,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2D<float>(*arg_2_value_array)));
+      this->client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<T>(*arg_2_value_array)));
 
-  Array2D<float> expected({{53.0, 74.0}, {45.0, 66.0}});
-  ComputeAndCompareR2<float>(
+  Array2D<T> expected({{53.0f, 74.0f}, {45.0f, 66.0f}});
+  this->template ComputeAndCompareR2<T>(
       &builder, expected,
-      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()}, error_spec_);
-}
-
-TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstRHS) {
-  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
-
-  std::unique_ptr<Array2D<float>> constant_rhs_array(
-      new Array2D<float>({{1.0, 2.0},
-                          {3.0, 4.0},
-                          {5.0, 6.0},
-                          {6.0, 5.0},
-                          {4.0, 3.0},
-                          {2.0, 1.0}}));
-
-  ComputationBuilder builder(client_, TestName());
+      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()},
+      this->error_spec_);
+}
+
+XLA_TYPED_TEST(DotOperationTest_F16F32F64,
+               DotOfConcatOptimizationWithConstRHS) {
+  using T = TypeParam;
+  std::unique_ptr<Array2D<T>> constant_rhs_array(
+      new Array2D<T>({{1.0f, 2.0f},
+                      {3.0f, 4.0f},
+                      {5.0f, 6.0f},
+                      {6.0f, 5.0f},
+                      {4.0f, 3.0f},
+                      {2.0f, 1.0f}}));
+
+  XlaBuilder builder(this->TestName());
   auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
-  auto lhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}),
+  auto lhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShapeWithType<T>({2, 2}),
                                      "lhs_arg_0");
-  auto lhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {2, 3}),
+  auto lhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShapeWithType<T>({2, 3}),
                                      "lhs_arg_1");
-  auto lhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShape(prim_type, {2, 1}),
+  auto lhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShapeWithType<T>({2, 1}),
                                      "lhs_arg_2");
   auto result = builder.Dot(
       builder.ConcatInDim({lhs_arg_0, lhs_arg_1, lhs_arg_2}, 1), rhs_constant);
 
-  std::unique_ptr<Array2D<float>> arg_0_value_array(
-      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}}));
-  std::unique_ptr<Array2D<float>> arg_1_value_array(
-      new Array2D<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
-  std::unique_ptr<Array2D<float>> arg_2_value_array(
-      new Array2D<float>({{1.0}, {2.0}}));
+  std::unique_ptr<Array2D<T>> arg_0_value_array(
+      new Array2D<T>({{1.0f, 2.0f}, {3.0f, 4.0f}}));
+  std::unique_ptr<Array2D<T>> arg_1_value_array(
+      new Array2D<T>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}));
+  std::unique_ptr<Array2D<T>> arg_2_value_array(
+      new Array2D<T>({{1.0f}, {2.0f}}));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto arg_0_value,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2D<float>(*arg_0_value_array)));
+      this->client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<T>(*arg_0_value_array)));
   TF_ASSERT_OK_AND_ASSIGN(
       auto arg_1_value,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2D<float>(*arg_1_value_array)));
+      this->client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<T>(*arg_1_value_array)));
   TF_ASSERT_OK_AND_ASSIGN(
       auto arg_2_value,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2D<float>(*arg_2_value_array)));
+      this->client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<T>(*arg_2_value_array)));
 
-  Array2D<float> expected({{38.0, 36.0}, {93.0, 91.0}});
-  ComputeAndCompareR2<float>(
+  Array2D<T> expected({{38.0f, 36.0f}, {93.0f, 91.0f}});
+  this->template ComputeAndCompareR2<T>(
       &builder, expected,
-      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()}, error_spec_);
+      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()},
+      this->error_spec_);
 }
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 877dc7db0eec229a7119b3627f177a33ed0d971b..bfb83faf5222b8ca5ceceebf7f2f976ec803245e 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -36,8 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
@@ -112,10 +109,8 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   void TestR3Wrap() {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
     RunR3<IndexT, DataT>(
-      {{{1, 2}, {3, 4}, {5, 6}},
-       {{7, 8}, {9, 10}, {11, 12}}},
-      {0, 2, 1}, {2, 1, 2},
-      {{{6, 5}}, {{12, 11}}});
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {0, 2, 1},
+        {2, 1, 2}, {{{6, 5}}, {{12, 11}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -137,9 +132,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -163,9 +158,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -189,9 +184,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -206,19 +201,19 @@ XLA_TEST_F(DynamicSliceTest, Int32R1BF16) { TestR1<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R1Wrap) { TestR1Wrap<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1<int64, float>(); }
-XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1<uint64, double>(); }
+XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1<uint64, float>(); }
 
 XLA_TEST_F(DynamicSliceTest, Int32R2BF16) { TestR2<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R2Wrap) { TestR2Wrap<int32, int32>(); }
-XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2<int64, double>(); }
+XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2<int64, float>(); }
 XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
 XLA_TEST_F(DynamicSliceTest, Int32R3BF16) { TestR3<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3<int32, float>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R3Wrap) { TestR3Wrap<int32, float>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3<int64, float>(); }
-XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3<uint64, double>(); }
+XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3<uint64, float>(); }
 
 XLA_TEST_F(DynamicSliceTest, Int32R1Pred) {
   // Slice at dimension start.
@@ -281,6 +276,15 @@ XLA_TEST_F(DynamicSliceTest, Int32R3Pred) {
 
 class DynamicUpdateSliceTest : public ClientLibraryTestBase {
  protected:
+  template <typename IndexT, typename DataT>
+  void TestR0() {
+    // Disable algebraic simplifier, otherwise the op will be replaced by a
+    // constant.
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "algsimp");
+    RunR0<IndexT, DataT>(0, 123, {}, 123);
+  }
+
   template <typename IndexT, typename DataT>
   void TestR1() {
     // Slice at dimension start.
@@ -341,6 +345,35 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
         {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}});
   }
 
+  template <typename IndexT, typename DataT>
+  void RunR0(int input_value_int, int update_value_int,
+             const std::vector<IndexT> slice_starts, int expected_value_int) {
+    Literal input_value =
+        std::move(*Literal::CreateR0(input_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_value =
+        std::move(*Literal::CreateR0(update_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_value =
+        std::move(*Literal::CreateR0(expected_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
+    XlaBuilder builder(TestName());
+    // Initialize and transfer dynamic slice start indices parameter.
+    XlaOp starts;
+    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
+        slice_starts, 0, "slice_starts", &builder, &starts);
+    // Build dynamic slice computation.
+    auto input = builder.ConstantLiteral(input_value);
+    auto update = builder.ConstantLiteral(update_value);
+    builder.DynamicUpdateSlice(input, update, starts);
+    // Run computation and compare against expected values.
+    ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()});
+  }
+
   template <typename IndexT, typename DataT>
   void RunR1(tensorflow::gtl::ArraySlice<int> input_values_int,
              tensorflow::gtl::ArraySlice<int> update_values_int,
@@ -359,9 +392,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -390,9 +423,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -421,9 +454,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -437,13 +470,6 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   template <class T>
   void RunR3Contiguous(std::vector<int32> operand_shape, int32 index,
                        int32 size) {
-#ifdef XLA_TEST_BACKEND_CPU_PARALLEL
-    // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-    if (std::is_same<bfloat16, T>::value) {
-      return;
-    }
-#endif
-
     const int32 kSeq = operand_shape[0];
     const int32 kBatch = operand_shape[1];
     const int32 kDim = operand_shape[2];
@@ -474,13 +500,13 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     }
 
     // Build dynamic slice computation.
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer input parameter.
-    ComputationDataHandle input;
+    XlaOp input;
     std::unique_ptr<GlobalData> input_data =
         CreateR3Parameter<T>(input_values, 0, "input_values", &builder, &input);
     // Initialize and transfer update parameter.
-    ComputationDataHandle update;
+    XlaOp update;
     std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
     auto starts = builder.ConstantR1<int32>({index, 0, 0});
@@ -500,31 +526,28 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 };
 
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0BF16) { TestR0<int32, bfloat16>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0) { TestR0<int32, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0<int64, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0<uint64, float>(); }
+
 // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) {
-  TestR1<int32, bfloat16>();
-}
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R1BF16) { TestR1<int32, bfloat16>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1<int64, float>(); }
-XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1<uint64, double>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1<uint64, float>(); }
 
-// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R2BF16)) {
-  TestR2<int32, bfloat16>();
-}
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R2BF16) { TestR2<int32, bfloat16>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2<int64, int64>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
-// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R3BF16)) {
-  TestR3<int32, bfloat16>();
-}
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R3BF16) { TestR3<int32, bfloat16>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64, int64>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64, uint64>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32WrapBF16)) {
+XLA_TEST_F(DynamicUpdateSliceTest, Int32WrapBF16) {
   TestWrap<int32, bfloat16>();
 }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap<int32, float>(); }
@@ -672,7 +695,7 @@ void BM_DynamicSlice(int num_iters) {
       TransferManager::GetForPlatform(platform).ValueOrDie();
   int device_ordinal = client->default_device_ordinal();
 
-  ComputationBuilder builder(client, "DynamicSlice");
+  XlaBuilder builder("DynamicSlice");
 
   // Create input as a constant: shape [1, 2, 3, 4]
   auto input_literal = Literal::CreateR4(
@@ -697,11 +720,11 @@ void BM_DynamicSlice(int num_iters) {
 
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *start_indices_literal, *buffer));
+      executors[device_ordinal], *start_indices_literal, buffer));
 
   std::unique_ptr<LocalExecutable> executable =
       client
-          ->Compile(computation, {&buffer->on_host_shape()},
+          ->Compile(computation, {&buffer.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
@@ -710,14 +733,14 @@ void BM_DynamicSlice(int num_iters) {
   options.set_allocator(&allocator);
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 
   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
index 644cbbf40f296eb2a574ae568b4f32aa3d0bd12f..a6ba6db5d3bf86de91f6fda022c46afee01281c2 100644
--- a/tensorflow/compiler/xla/tests/execution_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/platform/test.h"
@@ -24,8 +25,7 @@ namespace {
 
 class ExecutionProfileTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(ExecutionProfileTest,
-           DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+XLA_TEST_F(ExecutionProfileTest, ExecuteWithExecutionProfile) {
   Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
 
   TF_ASSERT_OK_AND_ASSIGN(
@@ -33,9 +33,9 @@ XLA_TEST_F(ExecutionProfileTest,
       client_->TransferToServer(
           *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256)));
 
-  ComputationBuilder b(client_, TestName() + ".add");
+  XlaBuilder b(TestName() + ".add");
   b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1"));
-  TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build());
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation dot_product, b.Build());
 
   ExecutionProfile execution_profile;
   TF_ASSERT_OK_AND_ASSIGN(
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 6fe7737de7af349dca2931b52d62dbc03b14e0b3..0a37e4d423620122f2e109343a86a964f46d778f 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -35,7 +36,7 @@ class ExhaustiveF32ElementwiseOpTest
     int64 input_size = end - begin;
     LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
 
     std::unique_ptr<Literal> input_literal =
         Literal::CreateFromDimensions(F32, {input_size});
@@ -71,16 +72,14 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
 #ifdef XLA_TEST_BACKEND_CPU
   // TODO(b/73141998): The vectorized Log implementation gives results outside
   // our error spec in this range (these numbers are bitwise representations of
-  // floats expressed as a zero extended int64):
-  std::pair<int64, int64> known_incorrect_range = {1, 8315654};
+  // floats expressed as a zero extended int64).
+  std::pair<int64, int64> known_incorrect_range = {1, 8388608};
 #else
   std::pair<int64, int64> known_incorrect_range = {0, 0};
 #endif
 
   ExhaustivelyTestF32Op(
-      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
-        builder->Log(input);
-      },
+      [](XlaBuilder* builder, const XlaOp& input) { builder->Log(input); },
       std::log, known_incorrect_range);
 }
 
@@ -96,17 +95,13 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) {
 #endif
 
   ExhaustivelyTestF32Op(
-      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
-        builder->Exp(input);
-      },
+      [](XlaBuilder* builder, const XlaOp& input) { builder->Exp(input); },
       std::exp, known_incorrect_range);
 }
 
 XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) {
   ExhaustivelyTestF32Op(
-      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
-        builder->Tanh(input);
-      },
+      [](XlaBuilder* builder, const XlaOp& input) { builder->Tanh(input); },
       std::tanh, /*known_incorrect_range=*/{0, 0});
 }
 
diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
index e75a41acacc3aaad770f8bba78b43d8bf99b911b..71eb914a8e5eaef2e38b9e6e7d45b8a10ce1bd7a 100644
--- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc
+++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <limits>
 #include <string>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -41,7 +41,7 @@ class FloorCeilTest : public ClientLibraryTestBase {
                  tensorflow::gtl::ArraySlice<float> expected, Function f) {
     LOG(INFO) << "input: {" << tensorflow::str_util::Join(expected, ", ")
               << "}";
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto c = builder.ConstantR1<float>(input);
     if (f == kCeil) {
       builder.Ceil(c);
@@ -54,7 +54,7 @@ class FloorCeilTest : public ClientLibraryTestBase {
 
   void TestR0F32(float input, float expected, Function f) {
     LOG(INFO) << "input: " << expected;
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto c = builder.ConstantR0<float>(input);
     if (f == kCeil) {
       builder.Ceil(c);
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
index f2aaf6621c1f0d7a7d1bc29b845859579d8e8d9d..73f029b59bc56aa6c3e86200a49fcae0fd177101 100644
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ b/tensorflow/compiler/xla/tests/fmax_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -27,7 +27,7 @@ namespace {
 class FmaxSimpleTest : public ClientLibraryTestBase {};
 
 TEST_F(FmaxSimpleTest, FmaxTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
   auto y = builder.ConstantR1<float>(
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index a292eab1d198fbf69c6dc81c780487ea46756f72..b947f8208a5fa3f5a396ebc7a234afbf7ac3d900 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -25,8 +25,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -50,8 +49,6 @@ limitations under the License.
 
 using tensorflow::gtl::ArraySlice;
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
@@ -677,21 +674,20 @@ XLA_TEST_F(FusionTest, SharedConstant) {
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR1<int32>({0})));
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({0})));
   auto const1 = builder.AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
   auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
-          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0));
+      ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0));
   auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
-          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add1));
+      ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add1));
   auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
-          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add2));
+      ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add2));
   auto add4 = builder.AddInstruction(HloInstruction::CreateBinary(
-          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add3));
+      ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add3));
   hlo_module->AddEntryComputation(builder.Build())
-      ->CreateFusionInstruction(
-        {add4, add3, add2, add1, const1},
-        HloInstruction::FusionKind::kLoop);
+      ->CreateFusionInstruction({add4, add3, add2, add1, const1},
+                                HloInstruction::FusionKind::kLoop);
 
   HloComputation* entry_comp = hlo_module->entry_computation();
 
@@ -702,7 +698,7 @@ XLA_TEST_F(FusionTest, SharedConstant) {
   EXPECT_EQ(entry_comp->root_instruction()->fused_instruction_count(), 6);
 
   LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({8}),
-          *ExecuteAndTransfer(std::move(hlo_module), {}));
+                               *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
 XLA_TEST_F(FusionTest, Add2D) { TestElementwise2D<float, 2>(HloOpcode::kAdd); }
@@ -781,7 +777,7 @@ void BM_ParallelFusion(int num_iters) {
   const int64 param2_dim1 = 1024;
 
   // Create computation.
-  ComputationBuilder builder(client, "ParallelFusion");
+  XlaBuilder builder("ParallelFusion");
   Shape shape0 = ShapeUtil::MakeShape(F32, {param0_dim0, param0_dim1});
   auto param0 = builder.Parameter(0, shape0, "param0");
   Shape shape1 = ShapeUtil::MakeShape(F32, {param1_dim0, param1_dim1});
@@ -796,19 +792,19 @@ void BM_ParallelFusion(int num_iters) {
   // Transfer literals to device.
   auto param0_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
-  std::unique_ptr<ShapedBuffer> buffer0 =
+  ScopedShapedBuffer buffer0 =
       client->LiteralToShapedBuffer(*param0_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param1_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
-  std::unique_ptr<ShapedBuffer> buffer1 =
+  ScopedShapedBuffer buffer1 =
       client->LiteralToShapedBuffer(*param1_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param2_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
-  std::unique_ptr<ShapedBuffer> buffer2 =
+  ScopedShapedBuffer buffer2 =
       client->LiteralToShapedBuffer(*param2_literal, device_ordinal)
           .ConsumeValueOrDie();
 
@@ -816,8 +812,8 @@ void BM_ParallelFusion(int num_iters) {
   std::unique_ptr<LocalExecutable> executable =
       client
           ->Compile(computation,
-                    {&buffer0->on_host_shape(), &buffer1->on_host_shape(),
-                     &buffer2->on_host_shape()},
+                    {&buffer0.on_host_shape(), &buffer1.on_host_shape(),
+                     &buffer2.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
@@ -838,8 +834,7 @@ void BM_ParallelFusion(int num_iters) {
   // Run some warm-up executions.
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 
@@ -852,8 +847,7 @@ void BM_ParallelFusion(int num_iters) {
   tensorflow::testing::UseRealTime();
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..130456e61ca8a217e903d2ddecc487f29a098ce1
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -0,0 +1,636 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+// NB!  TODO(b/74360564): These tests do not test out of bounds behavior since
+// that hasn't been specced yet.
+
+namespace xla {
+namespace {
+
+using tensorflow::gtl::nullopt;
+
+class GatherOperationTest : public HloTestBase {
+ protected:
+  void RunTest(const string& hlo_text, Literal* operand,
+               Literal* gather_indices) {
+    RunTest(hlo_text, {operand, gather_indices});
+  }
+
+  void RunTest(const string& hlo_text,
+               tensorflow::gtl::ArraySlice<Literal*> args) {
+    HloModuleConfig config;
+    config.set_debug_options(GetDebugOptionsForTest());
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                            tools::Parse(hlo_text, config));
+    EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt));
+  }
+};
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherV1) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherV1
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[2,3] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1, 3}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherV2) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherMultipleBatchDims) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,3,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3, 1}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 2}, {2, 1}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_0) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherNdMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2,2] parameter(1)
+  ROOT gather = s32[2,2] gather(operand, indices),
+      output_window_dims={},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=2,
+      window_bounds={1, 1}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_1) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherNdMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2,2] parameter(1)
+  ROOT gather = s32[2,1,1,2] gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=2,
+      window_bounds={1, 1}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherNd) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherNd
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1,2}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 0}, {1, 0}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdNonDefaultIndexVectorDim) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherNd
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1,2}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 0}, {1, 0}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, DynamicSlice) {
+  const char* hlo_text = R"(
+HloModule DynamicSlice
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[1,1] gather(operand, indices),
+      output_window_dims={0,1},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({1, 1});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, BatchDynamicSlice) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  ROOT gather = s32[2,1,1] gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{2, 1}, {1, 1}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, ZeroDimBounds) {
+  const char* hlo_text = R"(
+HloModule TensorFlowGatherV1
+
+ENTRY main {
+  operand = s32[3,0] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[2,0] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1, 0}
+}
+)";
+  std::unique_ptr<Literal> operand = Literal::CreateR2<int32>({{}, {}, {}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, OutOfBoundsIndex) {
+  // Out of bounds indices must not crash, and the indices in range should
+  // produce the same values across all backends.
+  //
+  // TODO(b/74360564): Once we have a well defined semantics for OOB accesses,
+  // we should get rid of the mask and check that backends produce the same
+  // value for OOB indices too.
+
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  gather = s32[6,1,1]{2,1,0} gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1}
+  gather_reshaped = s32[6]{0} reshape(gather)
+  in_bounds_mask = s32[6]{0} parameter(2)
+  ROOT result = s32[6]{0} multiply(gather_reshaped, in_bounds_mask)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR2<int32>(
+      {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}});
+  std::unique_ptr<Literal> in_bounds_mask =
+      Literal::CreateR1<int32>({0, 1, 1, 0, 0, 1});
+
+  RunTest(hlo_text,
+          {operand.get(), gather_indices.get(), in_bounds_mask.get()});
+}
+
+XLA_TEST_F(GatherOperationTest, NegativeIndex) {
+  // Negative indices must not crash, and the indices in range should produce
+  // the same values across all backends.
+  //
+  // TODO(b/74360564): Once we have a well defined semantics for negative
+  // accesses, we should get rid of the mask and check that backends produce the
+  // same value for negative indices too.
+
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  gather = s32[6,1,1]{2,1,0} gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1}
+  gather_reshaped = s32[6]{0} reshape(gather)
+  in_bounds_mask = s32[6]{0} parameter(2)
+  ROOT result = s32[6]{0} multiply(gather_reshaped, in_bounds_mask)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR2<int32>(
+      {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
+  std::unique_ptr<Literal> in_bounds_mask =
+      Literal::CreateR1<int32>({0, 1, 1, 0, 0, 1});
+
+  RunTest(hlo_text,
+          {operand.get(), gather_indices.get(), in_bounds_mask.get()});
+}
+
+XLA_TEST_F(GatherOperationTest, OneScalarIndex) {
+  const char* hlo_text = R"(
+HloModule OneScalarIndex
+
+ENTRY main {
+  operand = s32[2,3,2]{2,1,0} parameter(0)
+  index = s32[] parameter(1)
+  ROOT gather = s32[1,3,2]{2,1,0} gather(operand, index),
+      output_window_dims={0,1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=0,
+      window_bounds={1,3,2}
+}
+)";
+  std::unique_ptr<Literal> operand = Literal::CreateR3<int32>(
+      {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, ScalarResult) {
+  const char* hlo_text = R"(
+HloModule ScalarResult
+
+ENTRY main {
+  operand = s32[4]{0} parameter(0)
+  index = s32[] parameter(1)
+  ROOT gather = s32[] gather(operand, index),
+      output_window_dims={},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=0,
+      window_bounds={1}
+}
+)";
+  std::unique_ptr<Literal> operand = Literal::CreateR1<int32>({1, 2, 3, 4});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, ZeroSizedResult) {
+  const string hlo_text = R"(
+HloModule ZeroSizedResult
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[0] parameter(1)
+  ROOT gather = s32[0,3] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1, 3}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherV2) {
+  const string hlo_text = R"(
+HloModule FusedTensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[3,2] broadcast(one), dimensions={}
+  ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherMultipleBatchDims) {
+  const string hlo_text = R"(
+HloModule FusedTensorFlowGatherMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,3,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3, 1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,3,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 2}, {2, 1}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNdMultipleBatchDims) {
+  const string hlo_text = R"(
+HloModule FusedTensorFlowGatherNdMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2,2] parameter(1)
+  gather = s32[2,2] gather(operand, indices),
+      output_window_dims={},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=2,
+      window_bounds={1, 1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNd) {
+  const string hlo_text = R"(
+HloModule FusedTensorFlowGatherNd
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1,2}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 0}, {1, 0}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest,
+           FusedTensorFlowGatherNdNonDefaultIndexVectorDim) {
+  const string hlo_text = R"(
+HloModule FusedTensorFlowGatherNd
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,2] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0,1},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1,2}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,2] broadcast(one), dimensions={}
+  ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{0, 0}, {1, 0}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, FusedDynamicSlice) {
+  const char* hlo_text = R"(
+HloModule FusedDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  gather = s32[1,1] gather(operand, indices),
+      output_window_dims={0,1},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[1,1] broadcast(one), dimensions={}
+  ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({1, 1});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, FusedBatchDynamicSlice) {
+  const string hlo_text = R"(
+HloModule FusedBatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  gather = s32[2,1,1] gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=0,
+      window_bounds={1,1}
+  one = s32[] constant(1)
+  one_broadcasted = s32[2,1,1] broadcast(one), dimensions={}
+  ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR2<int32>({{2, 1}, {1, 1}});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+class GatherClientLibraryTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
+  // We create this HLO, but using the XlaBuilder API.
+  //
+  // ENTRY main {
+  //   operand = s32[3,3] parameter(0)
+  //   indices = s32[2] parameter(1)
+  //   ROOT gather = s32[2,3] gather(operand, indices),
+  //       output_window_dims={1},
+  //       elided_window_dims={0},
+  //       gather_dims_to_operand_dims={0},
+  //       index_vector_dim=1,
+  //       window_bounds={1, 3}
+  // }
+
+  XlaBuilder builder("gather_basic");
+
+  Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3});
+  Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
+
+  auto operand = builder.Parameter(0, operand_shape, "operand");
+  auto indices = builder.Parameter(1, indices_shape, "indices");
+  GatherDimensionNumbers dim_numbers;
+  dim_numbers.add_output_window_dims(1);
+  dim_numbers.add_elided_window_dims(0);
+  dim_numbers.add_gather_dims_to_operand_dims(0);
+  dim_numbers.set_index_vector_dim(1);
+  builder.Gather(operand, indices, dim_numbers, {1, 3});
+
+  std::vector<int32> expected = {};
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> operand_arg,
+                          client_->TransferToServer(*Literal::CreateR2<int32>(
+                              {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> indices_arg,
+      client_->TransferToServer(*Literal::CreateR1<int32>({0, 2})));
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
+                          client_->GetDeviceHandles(1));
+  xla::ExecutionOptions execution_options = CreateDefaultExecutionOptions();
+  *execution_options.add_device_handles() = devices[0];
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());
+  std::vector<xla::Client::XlaComputationInstance> computation_instances = {
+      {computation,
+       {operand_arg.get(), indices_arg.get()},
+       execution_options,
+       /*execution_profile=*/nullptr}};
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::unique_ptr<xla::GlobalData>> result_data,
+      client_->ExecuteParallel(computation_instances));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          client_->Transfer(*(result_data[0])));
+  LiteralTestUtil::ExpectEqual(
+      *result_literal, *Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}));
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc
index ec2f49d43bd8cee84c6b0abe1892e8b2278eefeb..76bf47845ca045b4eede9a3b47ae5c2ce93ce577 100644
--- a/tensorflow/compiler/xla/tests/half_test.cc
+++ b/tensorflow/compiler/xla/tests/half_test.cc
@@ -16,8 +16,7 @@ limitations under the License.
 #include <cmath>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -39,7 +38,7 @@ class HalfTestBase : public ClientLibraryTestBase {
 };
 
 using UnaryBuildFuncTy =
-    std::function<void(ComputationBuilder*, const ComputationDataHandle& src)>;
+    std::function<void(xla::XlaBuilder*, const xla::XlaOp& src)>;
 
 struct UnaryOpTestParam {
   std::function<half(half)> compute_func;
@@ -51,8 +50,8 @@ class UnaryOpTest : public HalfTestBase,
 
 XLA_TEST_P(UnaryOpTest, Ops) {
   std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle x_opnd;
+  XlaBuilder builder(TestName());
+  XlaOp x_opnd;
   auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
                                         &builder, &x_opnd);
 
@@ -79,30 +78,21 @@ half round_imp(half value) {
 
 INSTANTIATE_TEST_CASE_P(
     half, UnaryOpTest,
-    ::testing::Values(UnaryOpTestParam{[](half x) { return abs(x); },
-                                       &ComputationBuilder::Abs},
-                      UnaryOpTestParam{[](half x) { return round_imp(x); },
-                                       &ComputationBuilder::Round},
-                      UnaryOpTestParam{[](half x) { return ceil(x); },
-                                       &ComputationBuilder::Ceil},
-                      UnaryOpTestParam{[](half x) { return cos(x); },
-                                       &ComputationBuilder::Cos},
-                      UnaryOpTestParam{[](half x) { return exp(x); },
-                                       &ComputationBuilder::Exp},
-                      UnaryOpTestParam{[](half x) { return floor(x); },
-                                       &ComputationBuilder::Floor},
-                      UnaryOpTestParam{[](half x) { return log(x); },
-                                       &ComputationBuilder::Log},
-                      UnaryOpTestParam{[](half x) { return -x; },
-                                       &ComputationBuilder::Neg},
-                      UnaryOpTestParam{[](half x) { return sign_imp(x); },
-                                       &ComputationBuilder::Sign},
-                      UnaryOpTestParam{[](half x) { return sin(x); },
-                                       &ComputationBuilder::Sin},
-                      UnaryOpTestParam{[](half x) { return tanh(x); },
-                                       &ComputationBuilder::Tanh}
+    ::testing::Values(
+        UnaryOpTestParam{[](half x) { return abs(x); }, &XlaBuilder::Abs},
+        UnaryOpTestParam{[](half x) { return round_imp(x); },
+                         &XlaBuilder::Round},
+        UnaryOpTestParam{[](half x) { return ceil(x); }, &XlaBuilder::Ceil},
+        UnaryOpTestParam{[](half x) { return cos(x); }, &XlaBuilder::Cos},
+        UnaryOpTestParam{[](half x) { return exp(x); }, &XlaBuilder::Exp},
+        UnaryOpTestParam{[](half x) { return floor(x); }, &XlaBuilder::Floor},
+        UnaryOpTestParam{[](half x) { return log(x); }, &XlaBuilder::Log},
+        UnaryOpTestParam{[](half x) { return -x; }, &XlaBuilder::Neg},
+        UnaryOpTestParam{[](half x) { return sign_imp(x); }, &XlaBuilder::Sign},
+        UnaryOpTestParam{[](half x) { return sin(x); }, &XlaBuilder::Sin},
+        UnaryOpTestParam{[](half x) { return tanh(x); }, &XlaBuilder::Tanh}
 
-                      ));
+        ));
 
 struct UnaryPredTestParam {
   std::function<bool(half)> compute_func;
@@ -115,8 +105,8 @@ class UnaryPredTest : public HalfTestBase,
 
 XLA_TEST_P(UnaryPredTest, Ops) {
   std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle x_opnd;
+  XlaBuilder builder(TestName());
+  XlaOp x_opnd;
   auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
                                         &builder, &x_opnd);
 
@@ -136,11 +126,11 @@ XLA_TEST_P(UnaryPredTest, Ops) {
 INSTANTIATE_TEST_CASE_P(half, UnaryPredTest,
                         ::testing::Values(UnaryPredTestParam{
                             [](half x) { return isfinite(x); },
-                            &ComputationBuilder::IsFinite}));
+                            &XlaBuilder::IsFinite}));
 
 using BinaryBuildFuncTy = std::function<void(
-    ComputationBuilder*, const ComputationDataHandle& x,
-    const ComputationDataHandle& y, tensorflow::gtl::ArraySlice<int64>)>;
+    xla::XlaBuilder*, const xla::XlaOp& x, const xla::XlaOp& y,
+    tensorflow::gtl::ArraySlice<int64>)>;
 
 struct BinaryOpTestParam {
   std::function<half(half, half)> compute_func;
@@ -153,12 +143,12 @@ class BinaryOpTest : public HalfTestBase,
 XLA_TEST_P(BinaryOpTest, Ops) {
   std::vector<half> x({half(1.0), half(2.0), half(3.0), half(-4.0)});
   std::vector<half> y({half(0.4), half(-0.3), half(0.2), half(0.1)});
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle x_opnd;
+  XlaBuilder builder(TestName());
+  XlaOp x_opnd;
   auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
                                         &builder, &x_opnd);
 
-  ComputationDataHandle y_opnd;
+  XlaOp y_opnd;
   auto y_data = CreateR1Parameter<half>(y, /*parameter_number=*/1, "y",
                                         &builder, &y_opnd);
 
@@ -184,21 +174,21 @@ INSTANTIATE_TEST_CASE_P(
     half, BinaryOpTest,
     ::testing::Values(
         BinaryOpTestParam{[](half x, half y) { return x + y; },
-                          &ComputationBuilder::Add},
+                          &XlaBuilder::Add},
         BinaryOpTestParam{[](half x, half y) { return atan2_imp(x, y); },
-                          &ComputationBuilder::Atan2},
+                          &XlaBuilder::Atan2},
         BinaryOpTestParam{[](half x, half y) { return x / y; },
-                          &ComputationBuilder::Div},
+                          &XlaBuilder::Div},
         BinaryOpTestParam{[](half x, half y) { return max(x, y); },
-                          &ComputationBuilder::Max},
+                          &XlaBuilder::Max},
         BinaryOpTestParam{[](half x, half y) { return min(x, y); },
-                          &ComputationBuilder::Min},
+                          &XlaBuilder::Min},
         BinaryOpTestParam{[](half x, half y) { return x * y; },
-                          &ComputationBuilder::Mul},
+                          &XlaBuilder::Mul},
         BinaryOpTestParam{[](half x, half y) { return pow(x, y); },
-                          &ComputationBuilder::Pow},
+                          &XlaBuilder::Pow},
         BinaryOpTestParam{[](half x, half y) { return x - y; },
-                          &ComputationBuilder::Sub}
+                          &XlaBuilder::Sub}
 
         ));
 
@@ -214,12 +204,12 @@ class BinaryPredTest
 XLA_TEST_P(BinaryPredTest, Ops) {
   std::vector<half> x({half(1.0), half(2.0), half(0.2), half(-4.0)});
   std::vector<half> y({half(0.4), half(-0.3), half(0.2), half(0.1)});
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle x_opnd;
+  XlaBuilder builder(TestName());
+  XlaOp x_opnd;
   auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
                                         &builder, &x_opnd);
 
-  ComputationDataHandle y_opnd;
+  XlaOp y_opnd;
   auto y_data = CreateR1Parameter<half>(y, /*parameter_number=*/1, "y",
                                         &builder, &y_opnd);
 
@@ -239,17 +229,17 @@ XLA_TEST_P(BinaryPredTest, Ops) {
 INSTANTIATE_TEST_CASE_P(
     half, BinaryPredTest,
     ::testing::Values(BinaryPredTestParam{[](half x, half y) { return x == y; },
-                                          &ComputationBuilder::Eq},
+                                          &XlaBuilder::Eq},
                       BinaryPredTestParam{[](half x, half y) { return x != y; },
-                                          &ComputationBuilder::Ne},
+                                          &XlaBuilder::Ne},
                       BinaryPredTestParam{[](half x, half y) { return x >= y; },
-                                          &ComputationBuilder::Ge},
+                                          &XlaBuilder::Ge},
                       BinaryPredTestParam{[](half x, half y) { return x > y; },
-                                          &ComputationBuilder::Gt},
+                                          &XlaBuilder::Gt},
                       BinaryPredTestParam{[](half x, half y) { return x <= y; },
-                                          &ComputationBuilder::Le},
+                                          &XlaBuilder::Le},
                       BinaryPredTestParam{[](half x, half y) { return x < y; },
-                                          &ComputationBuilder::Lt}
+                                          &XlaBuilder::Lt}
 
                       ));
 
diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
index eded2077fce965ab1c729c610764afa2228ca128..cf971dd61b71ad329b20b0bb7c16166126562681 100644
--- a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
+++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
@@ -30,7 +29,7 @@ class HloMetadataTest : public LocalClientTestBase {
     metadata_.set_op_name("my_sum_op");
   }
 
-  void BuildAddComputation(ComputationBuilder* builder) {
+  void BuildAddComputation(XlaBuilder* builder) {
     auto x = builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder->Add(x, y);
@@ -40,7 +39,7 @@ class HloMetadataTest : public LocalClientTestBase {
 };
 
 TEST_F(HloMetadataTest, MetadataPropagation) {
-  ComputationBuilder builder(local_client_, "add");
+  XlaBuilder builder("add");
   builder.SetOpMetadata(metadata_);
   BuildAddComputation(&builder);
   builder.ClearOpMetadata();
@@ -61,7 +60,7 @@ TEST_F(HloMetadataTest, MetadataPropagation) {
 }
 
 TEST_F(HloMetadataTest, MetadataClearing) {
-  ComputationBuilder builder(local_client_, "add");
+  XlaBuilder builder("add");
   builder.SetOpMetadata(metadata_);
   // Some other pretend computation here.
   builder.ClearOpMetadata();
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 9f5806c5e16c30cf198027cffab5f78c315cb957..8b64f2e6315bc424b4a0eeb266e2e5ff16efdf6e 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 namespace {
@@ -91,15 +89,14 @@ HloTestBase::HloTestBase()
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
-  hlo_verifier_ = MakeUnique<HloVerifier>();
+  hlo_verifier_ = MakeUnique<HloVerifier>(/*allow_mixed_precision=*/true);
 }
 
 /* static */
-std::unique_ptr<HloModule> HloTestBase::CreateNewModule() {
+std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
   HloModuleConfig config;
   config.set_debug_options(GetDebugOptionsForTest());
-  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
-                               config);
+  return MakeUnique<HloModule>(name, VersionedComputationHandle(), config);
 }
 
 /*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
@@ -115,6 +112,15 @@ StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
   return test_runner_.Execute(std::move(module), arguments);
 }
 
+std::unique_ptr<Literal> HloTestBase::ExecuteNoHloPasses(
+    std::unique_ptr<HloModule> module,
+    tensorflow::gtl::ArraySlice<Literal*> arguments) {
+  return test_runner_
+      .Execute(std::move(module), arguments,
+               /*run_hlo_passes=*/false)
+      .ValueOrDie();
+}
+
 std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<Literal*> arguments) {
@@ -135,22 +141,15 @@ StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
           "reference preprocessor must not modify the program shape");
     }
   }
-  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
-                                     reference_module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(reference_module.get()).status());
   return std::move(reference_module);
 }
 
-template <typename LiteralPtr>
 StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
-    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    std::unique_ptr<HloModule> module, const ArraySlice<Literal*> arguments,
     const optional<ErrorSpec>& error, bool run_hlo_passes,
     const std::function<void(HloModule*)>& reference_preprocessor) {
-  static_assert(
-      std::is_same<Literal*, LiteralPtr>::value ||
-          std::is_same<std::unique_ptr<Literal>, LiteralPtr>::value,
-      "The LiteralPtr type only accepts Literal* or std::unique_ptr<Literal>.");
-  TF_RETURN_IF_ERROR(
-      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(module.get()).status());
   TF_ASSIGN_OR_RETURN(auto reference_module,
                       MakeReferenceModule(*module, reference_preprocessor));
 
@@ -165,9 +164,8 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
                                       error);
 }
 
-template <typename LiteralPtr>
 ::testing::AssertionResult HloTestBase::RunAndCompare(
-    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    std::unique_ptr<HloModule> module, const ArraySlice<Literal*> arguments,
     const optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto result =
@@ -179,9 +177,8 @@ template <typename LiteralPtr>
   return result.ValueOrDie();
 }
 
-template <typename LiteralPtr>
 ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
-    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    std::unique_ptr<HloModule> module, const ArraySlice<Literal*> arguments,
     const optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto result =
@@ -198,8 +195,14 @@ template <typename LiteralPtr>
     const std::function<void(HloModule*)>& reference_preprocessor) {
   const auto& fake_arguments =
       MakeFakeArguments(module.get()).ConsumeValueOrDie();
-  return RunAndCompare<std::unique_ptr<Literal>>(
-      std::move(module), fake_arguments, error, reference_preprocessor);
+
+  std::vector<Literal*> fake_argument_ptrs;
+  c_transform(
+      fake_arguments, std::back_inserter(fake_argument_ptrs),
+      [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
+
+  return RunAndCompare(std::move(module), fake_argument_ptrs, error,
+                       reference_preprocessor);
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
@@ -207,8 +210,13 @@ template <typename LiteralPtr>
     const std::function<void(HloModule*)>& reference_preprocessor) {
   const auto& fake_arguments =
       MakeFakeArguments(module.get()).ConsumeValueOrDie();
-  return RunAndCompareNoHloPasses<std::unique_ptr<Literal>>(
-      std::move(module), fake_arguments, error, reference_preprocessor);
+  std::vector<Literal*> fake_argument_ptrs;
+  c_transform(
+      fake_arguments, std::back_inserter(fake_argument_ptrs),
+      [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
+
+  return RunAndCompareNoHloPasses(std::move(module), fake_argument_ptrs, error,
+                                  reference_preprocessor);
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompare(
@@ -267,6 +275,28 @@ template <typename LiteralPtr>
                                   reference_preprocessor);
 }
 
+HloComputation* HloTestBase::FindComputation(HloModule* module,
+                                             tensorflow::StringPiece name) {
+  auto it = c_find_if(module->computations(),
+                      [&](HloComputation* c) { return c->name() == name; });
+  if (it == module->computations().end()) {
+    return nullptr;
+  }
+  return *it;
+}
+
+HloInstruction* HloTestBase::FindInstruction(HloModule* module,
+                                             tensorflow::StringPiece name) {
+  for (const HloComputation* c : module->computations()) {
+    auto it = c_find_if(c->instructions(),
+                        [&](HloInstruction* i) { return i->name() == name; });
+    if (it != c->instructions().end()) {
+      return *it;
+    }
+  }
+  return nullptr;
+}
+
 Backend& HloTestBase::backend() { return test_runner_.backend(); }
 
 /* static */
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 4aea9fc9fd027231106e529eb16bcd43f23fbe1c..6491208895f9ecf15c938ed480f90bce0114ac30 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -44,7 +44,7 @@ namespace xla {
 // enables, for one, explicitly building a graph of HLO instructions to run.
 //
 // This can also be used to write text/file-based test cases. Note that the test
-// target is responsible for linking the needed backends. A covenient way to do
+// target is responsible for linking the needed backends. A convenient way to do
 // this is to make it an xla_test: it will generate test targets linking with
 // the respective backends, which will be used as the test backend; the
 // interpreter backend is already linked with hlo_test_base so it will be the
@@ -76,8 +76,7 @@ class HloTestBase : public ::testing::Test {
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
-  HloTestBase(::perftools::gputools::Platform* test_platform,
-              ::perftools::gputools::Platform* reference_platform);
+  HloTestBase(se::Platform* test_platform, se::Platform* reference_platform);
 
   ~HloTestBase() override {}
 
@@ -86,7 +85,8 @@ class HloTestBase : public ::testing::Test {
   // options from command-line flags. If you want a fresh HloModule object and
   // then add HloComputations to it, it's recommended to use this method in your
   // tests.
-  static std::unique_ptr<HloModule> CreateNewModule();
+  static std::unique_ptr<HloModule> CreateNewModule(
+      const string& name = TestName());
 
   // Populates debug options from command-line flags and adjusts the options for
   // testing. It is recommended to use this when you need to pass in
@@ -98,14 +98,19 @@ class HloTestBase : public ::testing::Test {
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
 
+  // Same as above, except the module will be executed without running any HLO
+  // passes on it.
+  std::unique_ptr<Literal> ExecuteNoHloPasses(
+      std::unique_ptr<HloModule> module,
+      tensorflow::gtl::ArraySlice<Literal*> arguments);
+
   std::unique_ptr<Literal> ExecuteAndTransfer(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
 
   // Executes the given hlo module on two backends and compares results.
   //
-  // 'arguments': the input of the hlo module. The LiteralPtr type accepts
-  // Literal* or std::unique_ptr<Literal>.
+  // 'arguments': the input of the hlo module.
   //
   // 'error': if has value, expects the results to be near (within the error
   // bound). Otherwise, expects the results to be equal.
@@ -114,20 +119,18 @@ class HloTestBase : public ::testing::Test {
   // backend, but it might need to be tailored so that it is able to run on the
   // reference backend. Note that the program shape of the module must not be
   // modified.
-  template <typename LiteralPtr>
   ::testing::AssertionResult RunAndCompare(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::ArraySlice<Literal*> arguments,
       const tensorflow::gtl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
 
   // Same as above, except that the module will be executed without Hlo
   // optimization.
-  template <typename LiteralPtr>
   ::testing::AssertionResult RunAndCompareNoHloPasses(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::ArraySlice<Literal*> arguments,
       const tensorflow::gtl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
@@ -197,6 +200,15 @@ class HloTestBase : public ::testing::Test {
         ->Clear();
   }
 
+  // Gets the computation/instruction from the given module with the given name.
+  //
+  // This is useful for tests which create HLOs from a string and then want to
+  // inspect a particular computation or instruction.
+  HloComputation* FindComputation(HloModule* module,
+                                  tensorflow::StringPiece name);
+  HloInstruction* FindInstruction(HloModule* module,
+                                  tensorflow::StringPiece name);
+
   // Return an HLO verifier constructed for the test backend.
   HloVerifier& verifier() const { return *hlo_verifier_; }
 
@@ -223,10 +235,9 @@ class HloTestBase : public ::testing::Test {
   // Runs the module on two platforms with or without running hlo passes and
   // compares the results. Returns whether the results are near or equal. If any
   // error happens before the results are computed, returns the error status.
-  template <typename LiteralPtr>
   StatusOr<::testing::AssertionResult> RunAndCompareInternal(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::ArraySlice<Literal*> arguments,
       const tensorflow::gtl::optional<ErrorSpec>& error, bool run_hlo_passes,
       const std::function<void(HloModule*)>& reference_preprocessor);
 };
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index 506091ddd8d1d8e6519525bb7031f4e8b296b5fb..da4cf4ae0c31bc194cd2ec9b845df36afbde69b0 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -40,18 +41,22 @@ void HloVerifiedTestBase::TearDown() {
       << "TearDown called more than once; it should be called exactly once.";
   tear_down_called_ = true;
   if (module_) {
-    HloVerifier verifier;
-    xla::StatusOr<bool> mutated = verifier.Run(module_.get());
-    if (!mutated.ok()) {
-      ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
-    } else {
-      EXPECT_FALSE(mutated.ValueOrDie())
-          << "HloVerifier should never mutate the HloModule";
-    }
+    VerifyModule();
   }
   HloTestBase::TearDown();
 }
 
+void HloVerifiedTestBase::VerifyModule() {
+  HloVerifier verifier;
+  xla::StatusOr<bool> mutated = verifier.Run(module_.get());
+  if (!mutated.ok()) {
+    ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
+  } else {
+    EXPECT_FALSE(mutated.ValueOrDie())
+        << "HloVerifier should never mutate the HloModule";
+  }
+}
+
 HloModule& HloVerifiedTestBase::module() {
   if (!module_) {
     module_ = CreateNewModule();
@@ -59,4 +64,10 @@ HloModule& HloVerifiedTestBase::module() {
   return *module_;
 }
 
+void HloVerifiedTestBase::ParseAndVerifyModule(
+    tensorflow::StringPiece hlo_text) {
+  CHECK(!module_) << "Called ParseModule when test already has a module.";
+  TF_ASSERT_OK_AND_ASSIGN(module_, tools::Parse(hlo_text));
+  VerifyModule();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index 492688bf7d682cf991cb8c09399492a0437f651b..e5bb14a8839acbdef8fd2b79bb0f574c46ea3d40 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -44,6 +44,7 @@ class HloVerifiedTestBase : public HloTestBase {
   // Returns the default HloModule, lazily creating it if necessary via
   // HloTestBase::CreateNewModule().
   HloModule& module();
+  void ParseAndVerifyModule(tensorflow::StringPiece hlo_text);
 
   // Sets the shape-size function used during hlo verification. If this isn't
   // called, a default ShapeVerifier is used instead.
@@ -55,6 +56,7 @@ class HloVerifiedTestBase : public HloTestBase {
   std::unique_ptr<HloModule> module_;  // Lazily populated. Access via module().
   std::unique_ptr<ShapeVerifier> shape_verifier_;
   bool tear_down_called_ = false;
+  void VerifyModule();
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 5aa71a9261dbd414d1499f15c9b83cd63b634b49..c28f79ae386670ca80d603a42f6629dfd30e0bc9 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -39,6 +39,11 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::strings::Appendf;
+using ::tensorflow::strings::Printf;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
 /* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes(
     const Shape& expected, const Shape& actual) {
   if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) {
@@ -173,14 +178,11 @@ template <typename FloatT, typename UnsignedT>
   auto lhs_double = static_cast<double>(lhs);
   auto rhs_double = static_cast<double>(rhs);
   if (ulhs != urhs) {
-    return ::testing::AssertionFailure() << tensorflow::strings::Printf(
+    return ::testing::AssertionFailure() << Printf(
                "floating values are not bitwise-equal; and equality testing "
                "was requested: %s=%g=%a vs %s=%g=%a",
-               tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
-                   .c_str(),
-               lhs_double, lhs_double,
-               tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs))
-                   .c_str(),
+               StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double,
+               lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(),
                rhs_double, rhs_double);
   }
   return ::testing::AssertionSuccess();
@@ -209,6 +211,11 @@ template <>
   return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
 }
 template <>
+::testing::AssertionResult CompareEqual<Eigen::half>(Eigen::half lhs,
+                                                     Eigen::half rhs) {
+  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs);
+}
+template <>
 ::testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
   return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
 }
@@ -259,9 +266,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
       << "expected:\n"
       << expected.ToString() << "\n\tvs actual:\n"
       << actual.ToString()
-      << (message.empty()
-              ? ""
-              : tensorflow::strings::StrCat("\nmessage: ", message));
+      << (message.empty() ? "" : StrCat("\nmessage: ", message));
 }
 
 /* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected,
@@ -316,9 +321,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case TUPLE: {
       bool tuple_match = true;
       for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-        SCOPED_TRACE(tensorflow::strings::StrCat(
-            "Tuple index ", i, " in ",
-            ShapeUtil::HumanString(expected.shape())));
+        SCOPED_TRACE(StrCat("Tuple index ", i, " in ",
+                            ShapeUtil::HumanString(expected.shape())));
 
         // Create LiteralViews of the expected and actual elements.
         auto result = Equal(LiteralView::Create(expected, {i}),
@@ -345,227 +349,301 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
 
 namespace {
 
+// Gets the total element count.  For tuples, this is not the count of tuple
+// elements, but the sum of elements of each tuple element.
+int64 RecursiveElementCount(const Shape& shape) {
+  if (ShapeUtil::IsTuple(shape)) {
+    const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+    int64 total = 0;
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+    }
+    return total;
+  } else {
+    return ShapeUtil::ElementsIn(shape);
+  }
+}
+
+// Calling ToString on a literal with over 100 million elements takes around
+// 3 minutes.  The utility of printing a literal with >1000 elements is
+// questionable, especially when writing the Literal proto to disk is orders
+// of magnitude faster.
+string TruncateHugeLiteral(const Literal& literal) {
+  return RecursiveElementCount(literal.shape()) < 1000
+             ? literal.ToString()
+             : "[TRUNCATED, Literal with more than 1000 values]";
+}
+
+// Returns whether the actual and expected values are mismatched with respect to
+// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+template <typename NativeT>
+bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
+  if (relaxed_nans) {
+    return !std::isnan(expected) && std::isnan(actual);
+  } else {
+    return std::isnan(expected) != std::isnan(actual);
+  }
+}
+
+template <>
+bool NanMismatch<complex64>(complex64 expected, complex64 actual,
+                            bool relaxed_nans) {
+  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
+         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
+}
+
+template <>
+bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
+  return NanMismatch<float>(static_cast<float>(expected),
+                            static_cast<float>(actual), relaxed_nans);
+}
+
+// Converts the given floating-point value to a string.
+template <typename NativeT>
+string FpValueToString(NativeT value) {
+  return Printf("%8.4g", static_cast<double>(value));
+}
+
+template <>
+string FpValueToString<complex64>(complex64 value) {
+  return Printf("%8.4g + %8.4fi", value.real(), value.imag());
+}
+
+// Returns the absolute value of the given floating point value. This function
+// is used instead of std::abs directly in order to allow type-dependent
+// implementations for NearComparator.
+template <typename NativeT>
+float FpAbsoluteValue(NativeT value) {
+  return std::abs(value);
+}
+
+template <>
+float FpAbsoluteValue(bfloat16 value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
+template <>
+float FpAbsoluteValue(half value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
 // Helper class for comparing floating-point literals within an error bound.
+template <typename NativeT>
 class NearComparator {
  public:
-  explicit NearComparator(ErrorSpec error) : error_(error) {}
+  // Compares the two array literals elementwise and returns an assertion
+  // result. The assertion result is successful if all actual and expected
+  // elements are within the given error bound. In case of error, the assertion
+  // result contains a detailed error message in case of failure.
+  static ::testing::AssertionResult Compare(const Literal& expected,
+                                            const Literal& actual,
+                                            ErrorSpec error,
+                                            bool detailed_message) {
+    NearComparator<NativeT> comparator(expected, actual, error,
+                                       detailed_message);
+    return comparator.Run();
+  }
 
-  // Compares the two literals elementwise. EXPECTs each pair of elements to be
-  // within the error bound. Emits useful log messages and dumps literals to
-  // temporary files on failure. Returns true if  literals match.
-  bool ExpectNear(const Literal& expected, const Literal& actual) {
+ private:
+  // Data structure encapsulating metadata about a single element mismatch.
+  struct Mismatch {
+    NativeT actual;
+    NativeT expected;
+    float rel_error;
+    float abs_error;
+
+    // The linear index of the failure within the shape. This linear index is
+    // from the 'actual' literal.
+    int64 linear_index;
+
+    bool operator<(const Mismatch& other) const {
+      return rel_error < other.rel_error;
+    }
+
+    string ToString(const Shape& shape) const {
+      return Printf(
+          "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g",
+          FpValueToString(actual).c_str(), FpValueToString(expected).c_str(),
+          LiteralTestUtil::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          rel_error, abs_error);
+    }
+  };
+
+  explicit NearComparator(const Literal& expected, const Literal& actual,
+                          ErrorSpec error, bool detailed_message)
+      : expected_(expected),
+        actual_(actual),
+        error_(error),
+        detailed_message_(detailed_message),
+        abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}),
+        abs_error_buckets_(kErrorBucketBounds.size(), 0),
+        rel_error_buckets_(kErrorBucketBounds.size(), 0) {}
+
+  // Runs the comparison between expected and actual literals.
+  ::testing::AssertionResult Run() {
     VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected));
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_));
     VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual));
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_));
 
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
     ::testing::AssertionResult equal_shapes =
-        LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
+        LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape());
     if (!equal_shapes) {
-      EXPECT_TRUE(equal_shapes);
-      return false;
+      return equal_shapes;
     }
-
-    // Set up members used during the comparison.
-    num_miscompares_ = 0;
-    abs_diff_sum_ = 0.0;
-    abs_expected_sum_ = 0.0;
-    abs_diff_miscompare_sum_ = 0.0;
-    abs_expected_miscompare_sum_ = 0.0;
-    max_rel_err_ = 0.0;
-    max_abs_err_ = 0.0;
-    first_linear_index_ = -1;
-    last_linear_index_ = -1;
-    max_rel_linear_index_ = -1;
-    max_abs_linear_index_ = -1;
-    miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
-    miscompares_.PopulateWithValue(false);
-    multi_index_.resize(expected.shape().dimensions_size(), 0);
-
-    switch (expected.shape().element_type()) {
-      case BF16:
-        ExpectLiteralsNear<bfloat16>(expected, actual, 0);
-        break;
-      case F16:
-        ExpectLiteralsNear<half>(expected, actual, 0);
-        break;
-      case F32:
-        ExpectLiteralsNear<float>(expected, actual, 0);
-        break;
-      case F64:
-        ExpectLiteralsNear<double>(expected, actual, 0);
-        break;
-      case C64:
-        ExpectLiteralsNear<complex64>(expected, actual, 0);
-        break;
-      default:
-        LOG(FATAL) << "Unsupported primitive type in near comparator: "
-                   << PrimitiveType_Name(expected.shape().element_type())
-                   << ". Must be floating-point type.";
+    if (!ShapeUtil::IsArray(expected_.shape())) {
+      return ::testing::AssertionFailure() << "Expected array shape";
     }
 
-    if (num_miscompares_ > 0) {
-      if (!VLOG_IS_ON(1)) {
-        LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << TruncateHugeLiteral(expected);
-        LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << TruncateHugeLiteral(actual);
-        LOG(INFO) << "Dumping literals to temp files...";
-        WriteLiteralToTempFile(expected, "expected");
-        WriteLiteralToTempFile(actual, "actual");
-        WriteLiteralToTempFile(miscompares_, "miscompares");
-      }
-      EXPECT_TRUE(num_miscompares_ == 0)
-          << "\nmax relative mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), max_rel_linear_index_))
-          << "\nmaximum relative error " << max_rel_err_
-          << "\nmax absolute mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), max_abs_linear_index_))
-          << "\nmaximum absolute error " << max_abs_err_
-          << "\nfirst mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), first_linear_index_))
-          << "\nlast mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), last_linear_index_))
-          << "\ntotal absolute error " << abs_diff_sum_
-          << "\ntotal absolute error of miscompares "
-          << abs_diff_miscompare_sum_ << "\ntotal relative error "
-          << (abs_diff_sum_ / abs_expected_sum_)
-          << "\ntotal relative error of miscompares "
-          << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_)
-          << "\nfailure count " << num_miscompares_;
+    mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED));
+    mismatches_.PopulateWithValue(false);
+
+    CompareLiterals();
+
+    if (num_mismatches_ == 0) {
+      return ::testing::AssertionSuccess();
+    } else if (!VLOG_IS_ON(1)) {
+      LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape())
+                << " " << TruncateHugeLiteral(expected_);
+      LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual_.shape())
+                << " " << TruncateHugeLiteral(actual_);
+      LOG(INFO) << "Dumping literals to temp files...";
+      WriteLiteralToTempFile(expected_, "expected");
+      WriteLiteralToTempFile(actual_, "actual");
+      WriteLiteralToTempFile(mismatches_, "mismatches");
     }
-    return num_miscompares_ == 0;
+    return ::testing::AssertionFailure() << ErrorMessage();
   }
 
- private:
-  template <typename NativeT>
-  bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-    if (relaxed_nans) {
-      return !std::isnan(expected) && std::isnan(actual);
-    } else {
-      return std::isnan(expected) != std::isnan(actual);
+  // Insert the given absolute value into the absolute value bucket vector. The
+  // bounds of the buckets are given by kAbsValueBucketBounds.
+  void UpdateAbsValueBucket(NativeT value, bool is_mismatch) {
+    // Adjust the bucket containing the absolute values of the 'actual'
+    // elements.
+    const float abs_value = FpAbsoluteValue(value);
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      if (i == abs_value_buckets_.size() - 1 ||
+          (abs_value >= kAbsValueBucketBounds[i] &&
+           abs_value < kAbsValueBucketBounds[i + 1])) {
+        // The first value of the pair is the count of elements in the bucket,
+        // the second is the count of mismatches in the bucket.
+        abs_value_buckets_[i].first++;
+        if (is_mismatch) {
+          abs_value_buckets_[i].second++;
+        }
+        return;
+      }
     }
   }
 
-  template <typename NativeT>
-  void ExpectNear(NativeT expected, NativeT actual,
-                  const ::testing::Message& message) {
-    EXPECT_NEAR(expected, actual, error_.abs)
-        << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-        << message;
-  }
-
-  // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occurred to keep the size of the output
-  // manageable.
-  template <typename NativeT>
-  bool ExpectValuesNear(NativeT expected, NativeT actual) {
-    if (expected == actual) {
-      return true;
+  // Insert the given error into the given error bucket vector.
+  void UpdateErrorBucket(
+      float error, tensorflow::gtl::MutableArraySlice<int64> error_buckets) {
+    CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size());
+    for (int i = 0; i < error_buckets.size(); ++i) {
+      if (error >= kErrorBucketBounds[i]) {
+        error_buckets[i]++;
+      }
     }
-
-    const float abs_diff = std::abs(actual - expected);
-    const float rel_err = abs_diff / std::abs(expected);
-    const bool nan_mismatch =
-        NanMismatch<NativeT>(expected, actual, error_.relaxed_nans);
-    const bool mismatch =
-        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
-    return !mismatch;
   }
 
-  // Assumes that expected vs actual fail ExpectValuesNear.
-  template <typename NativeT>
-  void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual,
-                               const Shape& shape, const int64 linear_index) {
-    const float abs_diff = std::abs(actual - expected);
-    const float rel_err = abs_diff / std::abs(expected);
-    abs_diff_sum_ += abs_diff;
-    abs_expected_sum_ += std::abs(expected);
-    if (rel_err > max_rel_err_ || std::isnan(rel_err)) {
-      max_rel_err_ = rel_err;
-      max_rel_linear_index_ = linear_index;
+  // Compares the two given elements from the expected and actual literals at
+  // the given literal_index and keeps track of various mismatch statistics.
+  void CompareValues(NativeT expected, NativeT actual, int64 linear_index) {
+    const bool is_nan_mismatch =
+        NanMismatch(expected, actual, error_.relaxed_nans);
+    float abs_error;
+    float rel_error;
+    if (actual == expected) {
+      abs_error = 0;
+      rel_error = 0;
+    } else if (is_nan_mismatch) {
+      num_nan_mismatches_++;
+      // A nan mismatch is considered to have infinite error. rel_error is used
+      // for sorting a std::set of the top mismatchs, and a nan value here will
+      // result in undefined behavior because nan's do not satisfy the strict
+      // weak ordering requirement of std containers.
+      abs_error = std::numeric_limits<float>::infinity();
+      rel_error = std::numeric_limits<float>::infinity();
+    } else {
+      abs_error = FpAbsoluteValue(actual - expected);
+      rel_error = abs_error / FpAbsoluteValue(expected);
     }
-    if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) {
-      max_abs_err_ = abs_diff;
-      max_abs_linear_index_ = linear_index;
+    const bool is_abs_mismatch = abs_error > error_.abs;
+    const bool is_rel_mismatch = rel_error > error_.rel;
+    const bool is_mismatch =
+        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+
+    // Update the error of the relative bucket only if the *absolute* error
+    // bound is exceeded and vice versa.
+    if (is_abs_mismatch) {
+      num_abs_mismatches_++;
+      UpdateErrorBucket(rel_error, &rel_error_buckets_);
     }
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << tensorflow::strings::Printf(
-          "index %s abs_diff %f rel_err %f",
-          LiteralTestUtil::MultiIndexAsString(
-              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
-                                                            linear_index))
-              .c_str(),
-          abs_diff, rel_err);
+    if (is_rel_mismatch) {
+      num_rel_mismatches_++;
+      UpdateErrorBucket(abs_error, &abs_error_buckets_);
     }
-    abs_diff_miscompare_sum_ += abs_diff;
-    abs_expected_miscompare_sum_ += std::abs(expected);
-    const int64 kMaxFailures = 2;
-    if (num_miscompares_ < kMaxFailures) {
-      const auto multi_index =
-          IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index);
-      ::testing::Message msg;
-      msg << "mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff "
-          << abs_diff << " rel err " << rel_err << " failure #"
-          << num_miscompares_;
-      ExpectNear<NativeT>(expected, actual, msg);
-    } else if (num_miscompares_ == kMaxFailures) {
-      LOG(ERROR) << "reached max 'loud' failure count; silently proceeding...";
+
+    UpdateAbsValueBucket(actual, is_mismatch);
+
+    if (!is_mismatch) {
+      return;
     }
-    if (num_miscompares_ == 0) {
-      first_linear_index_ = linear_index;
+
+    num_mismatches_++;
+
+    // Keep track of the kTopRelativeErrorCount relative error mismatches.
+    if (top_rel_mismatches_.size() < kTopRelativeErrorCount ||
+        rel_error > top_rel_mismatches_.begin()->rel_error) {
+      Mismatch mismatch = {actual, expected, rel_error, abs_error,
+                           linear_index};
+      top_rel_mismatches_.insert(mismatch);
+      if (top_rel_mismatches_.size() > kTopRelativeErrorCount) {
+        top_rel_mismatches_.erase(top_rel_mismatches_.begin());
+      }
     }
-    num_miscompares_++;
-    last_linear_index_ = linear_index;
-    miscompares_.data<bool>()[linear_index] = true;
+
+    mismatches_.data<bool>()[linear_index] = true;
   }
 
-  // Recursive function which compares the two given literals elementwise.
-  template <typename NativeT>
-  void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
-                          int64 dimension) {
+  // Compares the two literals elementwise.
+  void CompareLiterals() {
     // Fast path optimization for the case were layouts match.
-    if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) {
+    if (LayoutUtil::Equal(actual_.shape().layout(),
+                          expected_.shape().layout())) {
       tensorflow::gtl::ArraySlice<const NativeT> expected_data =
-          expected.data<NativeT>();
+          expected_.data<NativeT>();
       tensorflow::gtl::ArraySlice<const NativeT> actual_data =
-          actual.data<NativeT>();
+          actual_.data<NativeT>();
       const int64 len = expected_data.size();
       for (int64 i = 0; i < len; ++i) {
-        const bool near = ExpectValuesNear(expected_data[i], actual_data[i]);
-        if (!near) {
-          UpdateAndLogMiscompares<NativeT>(expected_data[i], actual_data[i],
-                                           actual.shape(), i);
-        }
+        CompareValues(expected_data[i], actual_data[i], i);
       }
       return;
     }
+    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    CompareLiteralsSlow(0, &multi_index);
+  }
 
-    if (dimension == expected.shape().dimensions_size()) {
-      bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
-                                   actual.Get<NativeT>(multi_index_));
-      if (!near) {
-        UpdateAndLogMiscompares<NativeT>(
-            expected.Get<NativeT>(multi_index_),
-            actual.Get<NativeT>(multi_index_), actual.shape(),
-            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
-                                                          multi_index_));
-      }
+  // Slow path for CompareLiterals when 'actual' and 'expected' literals have
+  // different layouts. In this case, multidimensional indices are constructed
+  // and indexed for each element.
+  void CompareLiteralsSlow(int64 dimension, std::vector<int64>* multi_index) {
+    if (dimension == multi_index->size()) {
+      CompareValues(expected_.Get<NativeT>(*multi_index),
+                    actual_.Get<NativeT>(*multi_index),
+                    IndexUtil::MultidimensionalIndexToLinearIndex(
+                        actual_.shape(), *multi_index));
     } else {
-      for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
-        multi_index_[dimension] = i;
-        ExpectLiteralsNear<NativeT>(expected, actual, dimension + 1);
+      for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) {
+        (*multi_index)[dimension] = i;
+        CompareLiteralsSlow(dimension + 1, multi_index);
       }
     }
   }
@@ -575,159 +653,247 @@ class NearComparator {
     int64 now_usec = tensorflow::Env::Default()->NowMicros();
     string filename = tensorflow::io::JoinPath(
         tensorflow::testing::TmpDir(),
-        tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(),
-                                    now_usec, name.c_str()));
+        Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec,
+               name.c_str()));
     TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(),
                                              filename, literal.ToProto()));
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
-  // Gets the total element count.  For tuples, this is not the count of tuple
-  // elements, but the sum of elements of each tuple element.
-  int64 RecursiveElementCount(const Shape& shape) {
-    if (ShapeUtil::IsTuple(shape)) {
-      const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
-      int64 total = 0;
-      for (int64 i = 0; i < tuple_elements; ++i) {
-        total +=
-            RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
-      }
-      return total;
-    } else {
-      return ShapeUtil::ElementsIn(shape);
+  // Returns an error message string with a detailed breakdown of the
+  // mismatches. Called after calling Run().
+  string ErrorMessage() {
+    string out;
+    int64 element_count = ShapeUtil::ElementsIn(actual_.shape());
+
+    auto percent_string = [](float a, float b) {
+      float pct = b == 0.0 ? 0.0 : 100.0 * a / b;
+      return Printf("%0.4f%%", pct);
+    };
+
+    Appendf(&out,
+            "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound "
+            "%g, rel bound %g\n",
+            num_mismatches_,
+            percent_string(num_mismatches_, element_count).c_str(),
+            ShapeUtil::HumanString(actual_.shape()).c_str(),
+            ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel);
+    if (num_nan_mismatches_ > 0) {
+      StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n");
+    }
+    Appendf(&out, "Top relative error mismatches:\n");
+    for (auto it = top_rel_mismatches_.rbegin();
+         it != top_rel_mismatches_.rend(); ++it) {
+      StrAppend(&out, "  ", it->ToString(actual_.shape()).c_str(), "\n");
     }
-  }
-
-  // Calling ToString on a literal with over 100 million elements takes around
-  // 3 minutes.  The utility of printing a literal with >1000 elements is
-  // questionable, especially when writing the Literal proto to disk is orders
-  // of magnitude faster.
-  string TruncateHugeLiteral(const Literal& literal) {
-    return RecursiveElementCount(literal.shape()) < 1000
-               ? literal.ToString()
-               : "[TRUNCATED, Literal with more than 1000 values]";
-  }
 
-  ErrorSpec error_;
+    if (!detailed_message_) {
+      return out;
+    }
 
-  // Number of element miscomparisons encountered so far.
-  int64 num_miscompares_;
+    StrAppend(&out, "Absolute magnitude breakdown of actual values:\n");
+    CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size());
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      const int64 bucket_size = abs_value_buckets_[i].first;
+      const int64 bucket_mismatches = abs_value_buckets_[i].second;
+      string mismatch_str = bucket_mismatches > 0
+                                ? Printf(", mismatches %lld", bucket_mismatches)
+                                : "";
+      Appendf(&out, "  %-6g <= x < %-6g : %7lld (%9s)%s\n",
+              kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1],
+              bucket_size, percent_string(bucket_size, element_count).c_str(),
+              mismatch_str.c_str());
+    }
 
-  // A Literal containing which elements did not match in the expected and
-  // actual literals. miscompares_ contains PREDs and is of the same sizes as
-  // the comparison literals.
-  Literal miscompares_;
-
-  // A multidimensional index used when performing the recursive comparison.
-  std::vector<int64> multi_index_;
-
-  // Aggregated Statistics on input.
-  double abs_diff_sum_;
-  double abs_expected_sum_;
-  double abs_diff_miscompare_sum_;
-  double abs_expected_miscompare_sum_;
-  float max_rel_err_;
-  float max_abs_err_;
-  int64 first_linear_index_;
-  int64 last_linear_index_;
-  int64 max_rel_linear_index_;
-  int64 max_abs_linear_index_;
-};
+    auto print_accum_buckets = [&](const string& header, int64 total,
+                                   tensorflow::gtl::ArraySlice<int64> buckets) {
+      StrAppend(&out, header, ":\n");
+      Appendf(&out, "  <  %-6g : %7lld (%s)\n", kErrorBucketBounds[0],
+              total - buckets[0],
+              percent_string(total - buckets[0], total).c_str());
+      CHECK_EQ(buckets.size(), kErrorBucketBounds.size());
+      for (int i = 0; i < kErrorBucketBounds.size(); ++i) {
+        Appendf(&out, "  >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i],
+                buckets[i], percent_string(buckets[i], total).c_str());
+      }
+    };
+    Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n",
+            error_.abs, num_abs_mismatches_,
+            percent_string(num_abs_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Relative error breakdown of elements exceeding abs error bound",
+        num_abs_mismatches_, rel_error_buckets_);
+    Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n",
+            error_.rel, num_rel_mismatches_,
+            percent_string(num_rel_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Absolute error breakdown of elements exceeding rel error bound",
+        num_rel_mismatches_, abs_error_buckets_);
+    return out;
+  }
 
-template <>
-bool NearComparator::NanMismatch<complex64>(complex64 expected,
-                                            complex64 actual,
-                                            bool relaxed_nans) {
-  return NanMismatch(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch(expected.imag(), actual.imag(), relaxed_nans);
-}
+  // 'actual' and 'expected' literals being compared.
+  const Literal& expected_;
+  const Literal& actual_;
 
-template <>
-void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
-                                           const ::testing::Message& message) {
-  EXPECT_NEAR(expected.real(), actual.real(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-  EXPECT_NEAR(expected.imag(), actual.imag(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-}
+  // The error bounds of the comparison.
+  ErrorSpec error_;
 
-template <>
-bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
-                                                bfloat16 actual) {
-  return ExpectValuesNear(static_cast<float>(expected),
-                          static_cast<float>(actual));
-}
+  // Whether to include detailed breakdown of mismatches in the error message.
+  bool detailed_message_;
 
-template <>
-bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
-  return ExpectValuesNear(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)));
-}
+  // Number of element element mismatches encountered so far.
+  int64 num_mismatches_ = 0;
 
-template <>
-void NearComparator::UpdateAndLogMiscompares<bfloat16>(
-    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
-    const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(expected),
-                          static_cast<float>(actual), shape, linear_index);
-}
+  // Number of elements with a nan mismatch.
+  int64 num_nan_mismatches_ = 0;
 
-template <>
-void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
-                                                   const Shape& shape,
-                                                   const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)), shape,
-                          linear_index);
-}
+  // Number of elements which exceed the absolute/relative error bound.
+  int64 num_abs_mismatches_ = 0;
+  int64 num_rel_mismatches_ = 0;
 
-}  // namespace
+  // A Literal containing which elements did not match in the expected and
+  // actual literals. mismatches_ contains PREDs and is of the same sizes as
+  // the comparison literals.
+  Literal mismatches_;
+
+  // The number of mismatches to report in the output, sorted by relative error
+  // magnitude.
+  static constexpr int64 kTopRelativeErrorCount = 5;
+
+  // The set of mismatches with the largest relative error. The size of this set
+  // is bounded by kTopRelativeErrorCount.
+  std::multiset<Mismatch> top_rel_mismatches_;
+
+  // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the
+  // bounds of these buckets. abs_value_buckets_ contains a pair for each
+  // bucket: the element count and failure count.
+  static constexpr std::array<float, 7> kAbsValueBucketBounds = {
+      0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits<float>::infinity()};
+  std::vector<std::pair<int64, int64>> abs_value_buckets_;
+
+  // Buckets for relative and absolute errors. The relative error buckets only
+  // contains those elements which exceed the *absolute* error bound, and vice
+  // versa. This makes it easy to see the effect of adjusting the relative (or
+  // absolute) error bound on the success of the comparison. kErrorBucketBounds
+  // are the lower bounds of the buckets in both vectors. The error buckets are
+  // a cumulative distribution so an error value may appear in more than one
+  // bucket. For example an error value of 0.003 may appear in the buckets
+  // bounded by 0.01, 0.1, and 1.0.
+  static constexpr std::array<float, 5> kErrorBucketBounds = {0.0001, 0.001,
+                                                              0.01, 0.1, 1};
+  std::vector<int64> abs_error_buckets_;
+  std::vector<int64> rel_error_buckets_;
+};
 
-/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
+template <typename NativeT>
+constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
+template <typename NativeT>
+constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;
+
+// Helper function for comparing two literals for nearness. Handles tuple-shapes
+// via recursion. shape_index is the ShapeIndex of expected (or actual)
+// currently being compared.
+::testing::AssertionResult NearHelper(const Literal& expected,
+                                      const Literal& actual,
+                                      const ErrorSpec& error,
+                                      bool detailed_message,
+                                      const ShapeIndex& shape_index) {
   ::testing::AssertionResult err =
-      EqualShapes(expected.shape(), actual.shape());
+      LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
   if (!err) {
     return err;
   }
 
   if (ShapeUtil::IsTuple(expected.shape())) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-      SCOPED_TRACE(tensorflow::strings::StrCat(
-          "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
       const auto expected_element = LiteralView::Create(expected, {i});
       const auto actual_element = LiteralView::Create(actual, {i});
-
+      ShapeIndex element_index = shape_index;
+      element_index.push_back(i);
       ::testing::AssertionResult res =
-          Near(expected_element, actual_element, error);
-      if (err && !res) {
-        err = res;
+          NearHelper(expected_element, actual_element, error, detailed_message,
+                     element_index);
+      if (!res) {
+        string err_message =
+            Printf("\nArray at shape index %s%s",
+                   element_index.ToString().c_str(), res.message());
+        if (err) {
+          err = ::testing::AssertionFailure() << err_message;
+        } else {
+          err << err_message;
+        }
       }
     }
+    if (!err && shape_index.empty()) {
+      // Emit a top-level error message containing the top-level shape in case
+      // of mismatch.
+      int64 total_elements = RecursiveElementCount(actual.shape());
+      err = ::testing::AssertionFailure()
+            << Printf("\nMismatches in shape %s (%lld elements):\n%s",
+                      ShapeUtil::HumanString(actual.shape()).c_str(),
+                      total_elements, err.message());
+    }
     return err;
   }
 
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
-    NearComparator comparator(error);
-    return comparator.ExpectNear(expected, actual)
-               ? ::testing::AssertionSuccess()
-               : ::testing::AssertionFailure() << "values were not near";
+    switch (expected.shape().element_type()) {
+      case BF16:
+        return NearComparator<bfloat16>::Compare(expected, actual, error,
+                                                 detailed_message);
+        break;
+      case F16:
+        return NearComparator<half>::Compare(expected, actual, error,
+                                             detailed_message);
+        break;
+      case F32:
+        return NearComparator<float>::Compare(expected, actual, error,
+                                              detailed_message);
+        break;
+      case F64:
+        return NearComparator<double>::Compare(expected, actual, error,
+                                               detailed_message);
+        break;
+      case C64:
+        return NearComparator<complex64>::Compare(expected, actual, error,
+                                                  detailed_message);
+        break;
+      default:
+        LOG(FATAL) << "Unsupported primitive type in near comparator: "
+                   << PrimitiveType_Name(expected.shape().element_type())
+                   << ". Must be floating-point type.";
+    }
   }
 
-  return Equal(expected, actual);
+  // Non-floating point literal.
+  return LiteralTestUtil::Equal(expected, actual);
+}
+
+}  // namespace
+
+/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
+    const Literal& expected, const Literal& actual, const ErrorSpec& error,
+    bool detailed_message) {
+  return NearHelper(expected, actual, error, detailed_message,
+                    /*shape_index=*/{});
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
                                               const Literal& actual,
                                               const ErrorSpec& error,
                                               const string& message) {
-  EXPECT_TRUE(Near(expected, actual, error))
-      << (message.empty()
-              ? ""
-              : tensorflow::strings::StrCat("\nmessage: ", message));
+  ::testing::AssertionResult res =
+      Near(expected, actual, error, /*detailed_message=*/false);
+  if (!res) {
+    res << "Expected: " << TruncateHugeLiteral(expected) << "\n";
+    res << "Actual: " << TruncateHugeLiteral(actual) << "\n";
+    if (!message.empty()) {
+      res << StrCat("\nmessage: ", message);
+    }
+  }
+  EXPECT_TRUE(res);
 }
 
 /*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
@@ -749,8 +915,7 @@ void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
 
 /* static */ string LiteralTestUtil::MultiIndexAsString(
     tensorflow::gtl::ArraySlice<int64> multi_index) {
-  return tensorflow::strings::StrCat(
-      "{", tensorflow::str_util::Join(multi_index, ","), "}");
+  return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}");
 }
 
 /* static */ std::unique_ptr<Literal> LiteralTestUtil::Reshape(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 7b757a4bd7e7592583b7596b4305ddb7e6c52d75..a755568c0f098e15512bd1d3720269c867bc9c49 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -122,16 +122,19 @@ class LiteralTestUtil {
   // bounds are equivalent.
   //
   // Tuples are matched recursively.  When comparing tensors of
-  // non-floating-point type, checks for exact equality, ignoring the ErroSpec.
+  // non-floating-point type, checks for exact equality, ignoring the ErrorSpec.
   //
   // If the shape of the literals is neither a complex/floating-point tensor nor
   // a tuple which contains a complex/floating-point tensor, Near() is
   // equivalent to Equal().  We don't raise an error in this case, because we
   // want to allow callers to call Near() even if they have no preconceptions
   // about the shapes being compared.
+  //
+  // If detailed_message is true, then the error message in the assertion result
+  // will contain a more detailed breakdown of mismatches.
   static ::testing::AssertionResult Near(
-      const Literal& expected, const Literal& actual,
-      const ErrorSpec& error) TF_MUST_USE_RESULT;
+      const Literal& expected, const Literal& actual, const ErrorSpec& error,
+      bool detailed_message = false) TF_MUST_USE_RESULT;
 
   // Expects expected and actual to be Near with the given error.
   static void ExpectNear(const Literal& expected, const Literal& actual,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index 3a421f8458268a14dcdd84889bcae4990c095ea4..9d619a77c7e8d6398b559e8f562cd7f8194e0811 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -89,7 +89,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
       EXPECT_EQ("2", literal->ToString());
     } else if (result.find("actual") != string::npos) {
       EXPECT_EQ("4", literal->ToString());
-    } else if (result.find("miscompares") != string::npos) {
+    } else if (result.find("mismatches") != string::npos) {
       EXPECT_EQ("true", literal->ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 7e92439c494b677f718a63c71c20828d65bebef4..2f46ee0be216d7dabf1c476d3cfb7d528f8ab6a4 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -43,7 +43,7 @@ class LLVMCompilerTest : public ::testing::Test {
   ~LLVMCompilerTest() override {}
 
  protected:
-  using Platform = ::perftools::gputools::Platform;
+  using Platform = se::Platform;
 
   explicit LLVMCompilerTest(string platform_name)
       : platform_name_(std::move(platform_name)) {}
@@ -95,7 +95,7 @@ class LLVMCompilerTest : public ::testing::Test {
     modules.push_back(hlo_module->Clone());
     modules.push_back(std::move(hlo_module));
 
-    std::vector<std::vector<perftools::gputools::StreamExecutor *>> executors;
+    std::vector<std::vector<se::StreamExecutor *>> executors;
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
index 99514baf23cafe61adc28a30dfdfe2691ab82d32..3023df47cda33f5d11abc921fd0355d48f761107 100644
--- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -49,11 +50,11 @@ void LLVMIRGenTestBase::CompileAndVerifyIr(
     std::unique_ptr<HloModule> hlo_module, const string& pattern,
     bool match_optimized_ir) {
   SetIrHook(match_optimized_ir);
-  ASSERT_TRUE(CompileToExecutable(std::move(hlo_module)).ok());
+  TF_ASSERT_OK(CompileToExecutable(std::move(hlo_module)).status());
   ResetIrHook();
 
   StatusOr<bool> filecheck_result = RunFileCheck(ir_, pattern);
-  ASSERT_TRUE(filecheck_result.ok());
+  TF_ASSERT_OK(filecheck_result.status());
   EXPECT_TRUE(filecheck_result.ValueOrDie());
 }
 
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 3d30ceeaf1b0369b6fdc0cd9620c04aae287941c..f21f83992ffb7c07dff31c68a7e9e3f7944bf512 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -15,9 +15,8 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -25,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -37,7 +37,7 @@ class LocalClientAllocationTest : public LocalClientTestBase {
 };
 
 XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({0.0f, 1.0f, 2.0f});
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -53,7 +53,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
   // deallocation happen on the right allocator.
   ExecutableRunOptions options;
   options.set_allocator(allocator);
-  std::unique_ptr<ScopedShapedBuffer> result =
+  tensorflow::gtl::optional<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {},
                           DefaultExecutableBuildOptions(), options);
 
@@ -66,14 +66,14 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
 
   // Deallocate result and verify that deallocate was called once.
   int64 deallocation_count_before = allocator_->deallocation_count();
-  result = nullptr;
+  result.reset();
   EXPECT_EQ(deallocation_count_before + 1, allocator_->deallocation_count());
 }
 
 XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) {
   // Run a computation on every device on the system. Verify that allocation
   // occurs on the proper device.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({0.0f, 1.0f, 2.0f});
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -92,7 +92,7 @@ XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) {
         computation, {}, ExecutableBuildOptions().set_device_ordinal(d),
         ExecutableRunOptions().set_device_ordinal(d).set_allocator(allocator));
     LiteralTestUtil::ExpectR1Near<float>(
-        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 
     // At least one allocation should have been performed when executing the
     // computation.
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 2462ea39f914b1dbb525ea777a48d9ce66035638..44c6811df84f49b6c1b24c11449939e2d375a9d1 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -43,8 +42,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
@@ -56,61 +53,57 @@ class LocalClientExecuteTest : public LocalClientTestBase {
 };
 
 XLA_TEST_F(LocalClientExecuteTest, Constant) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto y = builder.ConstantR0<float>(123.0f);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-
-  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(*result),
+  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddScalars) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = builder.ConstantR0<float>(123.0f);
   builder.Add(x, y);
 
   auto x_value = LiteralToShapedBuffer(*Literal::CreateR0<float>(42.0f));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_value.get()});
-
-  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_value});
+  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0}), "x");
   auto y = builder.ConstantR1<float>({});
   builder.Add(x, y);
 
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR1<float>({}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
-  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
+  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddVectors) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
 
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -118,18 +111,17 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
   ExecutionProfile profile;
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {x_array.get()},
-      DefaultExecutableBuildOptions(),
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(
+      builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions(),
       DefaultExecutableRunOptions().set_execution_profile(&profile));
 
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
   EXPECT_GT(profile.compute_and_transfer_time_ns(), 0);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Add(x, y);
@@ -138,31 +130,31 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
 
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result_colmaj =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
 
   // Run with the parameter values in a different order.
-  std::unique_ptr<ScopedShapedBuffer> result_param_swap =
-      ExecuteLocallyOrDie(computation, {y_array.get(), x_array.get()});
+  ScopedShapedBuffer result_param_swap =
+      ExecuteLocallyOrDie(computation, {&y_array, &x_array});
   LiteralTestUtil::ExpectR2Near<float>(
       {{11.0f, 22.0f}, {33.0f, 44.0f}},
-      *ShapedBufferToLiteral(*result_param_swap), error_spec_);
+      *ShapedBufferToLiteral(result_param_swap), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Add(x, y);
@@ -174,32 +166,32 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
   // Run with col-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_colmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
 
   // Run with row-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_rowmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_rowmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_rowmaj),
+                                       *ShapedBufferToLiteral(result_rowmaj),
                                        error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Tuple({x, y, x});
@@ -210,13 +202,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -227,7 +219,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   auto inner_tuple = builder.Tuple({x, y, x});
@@ -239,13 +231,13 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -261,7 +253,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
 
 XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
   // Verify setting the result layout of a computation with a tuple output.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Tuple({x, y});
@@ -276,11 +268,11 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
        ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2},
                                       /*minor_to_major=*/{1, 0})});
   options.set_result_layout(shape_with_layout);
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {array.get(), array.get()}, options,
-      DefaultExecutableRunOptions());
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&array, &array},
+                          options, DefaultExecutableRunOptions());
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -298,7 +290,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
 
   // Computation adds the respective array and vector elements from each tuple
   // argument and returns the results as a tuple.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, tuple_shape0, "x");
   auto y = builder.Parameter(1, tuple_shape1, "y");
   auto x_0 = builder.GetTupleElement(x, 0);
@@ -320,13 +312,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   auto x_buffer = LiteralToShapedBuffer(*x_literal);
   auto y_buffer = LiteralToShapedBuffer(*y_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{56.0f, 46.0f}, {36.0f, 26.0f}},
       LiteralView::Create(*result_literal, {0}));
@@ -345,7 +337,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
 
   // Computation negates the array element and sums the two vector elements in
   // the nested tuple. The resulting array and vector are returned as a tuple.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, nested_tuple_shape, "param");
   auto inner_tuple = builder.GetTupleElement(param, 0);
   auto inner_array = builder.GetTupleElement(inner_tuple, 0);
@@ -365,10 +357,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
        Literal::CreateR1<float>({222.0, -2.0, 10.0}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -384,7 +375,7 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({array_shape, array_shape});
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, tuple_shape, "param");
   auto element_0 = builder.GetTupleElement(param, 0);
   auto element_1 = builder.GetTupleElement(param, 1);
@@ -396,18 +387,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
        Literal::CreateR2<float>({{11.0, 3.0}, {4.0, 5.0}}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result_0 =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(*result_0);
+  ScopedShapedBuffer result_0 = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(result_0);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4.0}},
       LiteralView::Create(*result_0_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1}));
 
-  std::unique_ptr<ScopedShapedBuffer> result_1 =
-      ExecuteLocallyOrDie(computation, {result_0.get()});
-  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(*result_1);
+  ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0});
+  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(result_1);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -430,11 +419,11 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
   std::vector<Shape> element_shapes(kElementCount, element_shape);
   const Shape tuple_shape = ShapeUtil::MakeTupleShape(element_shapes);
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, tuple_shape, "param");
 
   // Add each element's tuple index value to every element.
-  std::vector<ComputationDataHandle> result_elements;
+  std::vector<XlaOp> result_elements;
   for (int i = 0; i < kElementCount; ++i) {
     auto element = builder.GetTupleElement(param, i);
     result_elements.push_back(
@@ -453,10 +442,8 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
       Literal::MakeTupleOwned(std::move(arg_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   for (int i = 0; i < kElementCount; ++i) {
     LiteralTestUtil::ExpectR1Near<float>(
@@ -465,9 +452,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
   }
 }
 
-// TODO(b/66968986): Test times out on CPU parallel backend. Disabled
-// 2017-09-26.
-XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
+XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) {
   // Construct and run a computation which takes a two-level nested tuple
   // parameter with a large fanout.
   const int kFanout = 40;
@@ -479,15 +464,15 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
   std::vector<Shape> inner_tuple_shapes(kFanout, inner_tuple_shape);
   const Shape tuple_shape = ShapeUtil::MakeTupleShape(inner_tuple_shapes);
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, tuple_shape, "param");
 
   // The computation increments each leaf value by an amount equal to the leaf's
   // ordinal position in a traversal of the tuple.
-  std::vector<ComputationDataHandle> result_elements;
+  std::vector<XlaOp> result_elements;
   for (int i = 0; i < kFanout; ++i) {
     auto outer_element = builder.GetTupleElement(param, i);
-    std::vector<ComputationDataHandle> inner_result_elements;
+    std::vector<XlaOp> inner_result_elements;
     for (int j = 0; j < kFanout; ++j) {
       auto inner_element = builder.GetTupleElement(outer_element, j);
       inner_result_elements.push_back(builder.Add(
@@ -511,9 +496,8 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
   auto arg_literal = Literal::MakeTupleOwned(std::move(outer_tuple_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   for (int i = 0; i < kFanout; ++i) {
     for (int j = 0; j < kFanout; ++j) {
@@ -535,7 +519,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
     shape = ShapeUtil::MakeTupleShape({shape});
   }
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto element = builder.Parameter(0, shape, "param");
   for (int i = 0; i < kTupleDepth; ++i) {
     element = builder.GetTupleElement(element, 0);
@@ -556,9 +540,8 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
   }
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   ShapeIndex index;
   for (int i = 0; i < kTupleDepth; ++i) {
@@ -570,7 +553,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
 
 XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
   // Test passing in an invalid number of arguments.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {3}), "y");
   builder.Add(x, y);
@@ -578,7 +561,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({1.0f, 2.0f, 3.0f}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -587,14 +570,14 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
 
 XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
   // Test passing in an argument with the wrong shape.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   builder.Neg(x);
 
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -604,14 +587,14 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
 
 XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
   // Test passing in an invalid result layout parameter.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   builder.Neg(x);
 
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status = ExecuteLocally(
-      builder.Build().ValueOrDie(), {x_array.get()},
+      builder.Build().ValueOrDie(), {&x_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32,
                                          /*dimensions=*/{1, 2, 3, 4},
@@ -627,7 +610,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
 XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
   // Try to run a trivial computation on every device on the system. If a
   // specific device is not supported, check that the right error is returned.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto computation = builder.Build().ConsumeValueOrDie();
   for (int d = 0; d < local_client_->device_count(); ++d) {
@@ -644,9 +627,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
           computation, {},
           DefaultExecutableBuildOptions().set_device_ordinal(d),
           DefaultExecutableRunOptions().set_device_ordinal(d));
-      EXPECT_EQ(d, result->device_ordinal());
+      EXPECT_EQ(d, result.device_ordinal());
       LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                            *ShapedBufferToLiteral(*result));
+                                            *ShapedBufferToLiteral(result));
     }
   }
 }
@@ -654,7 +637,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
 XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) {
   // Try running computations on devices with device ordinal values which do not
   // exist.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto computation = builder.Build().ConsumeValueOrDie();
 
@@ -671,7 +654,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) {
 
 XLA_TEST_F(LocalClientExecuteTest, RunOnStream) {
   // Run a computation on a specific stream on each device on the system.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto computation = builder.Build().ConsumeValueOrDie();
 
@@ -689,9 +672,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnStream) {
                             DefaultExecutableRunOptions().set_stream(&stream));
     // As a check to verify that the computation ran of the device associated
     // with the stream. This is a weak check, but stronger verification is hard.
-    EXPECT_EQ(d, result->device_ordinal());
+    EXPECT_EQ(d, result.device_ordinal());
     LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                          *ShapedBufferToLiteral(*result));
+                                          *ShapedBufferToLiteral(result));
   }
 }
 
@@ -707,7 +690,7 @@ XLA_TEST_F(LocalClientExecuteTest,
   se::Stream wrong_stream(wrong_platform->ExecutorForDevice(0).ValueOrDie());
   wrong_stream.Init();
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto execute_status = ExecuteLocally(
       builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(),
@@ -724,7 +707,7 @@ XLA_TEST_F(LocalClientExecuteTest,
           .ValueOrDie();
   TestAllocator allocator(wrong_platform);
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto y = builder.ConstantR0<float>(123.0f);
 
   auto execute_status = ExecuteLocally(
@@ -737,7 +720,7 @@ XLA_TEST_F(LocalClientExecuteTest,
 
 XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) {
   // Try to run a computation on a stream that has not been initialized.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
 
   LOG(INFO) << "default device = " << local_client_->default_device_ordinal();
@@ -757,7 +740,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -767,9 +750,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
   builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR1Equal<float>(
       {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -777,7 +760,7 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -793,12 +776,12 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
 
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      executable->Run({x_array.get()}, DefaultExecutableRunOptions())
+  ScopedShapedBuffer result =
+      executable->Run({&x_array}, DefaultExecutableRunOptions())
           .ConsumeValueOrDie();
 
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
@@ -811,7 +794,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
             literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };
 
@@ -851,7 +834,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
             literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };
 
@@ -867,9 +850,8 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
 
 // TODO(b/34359662): Support infeed/outfeed on GPU and CPU parallel.
 // 2017-10-18.
-XLA_TEST_F(LocalClientExecuteTest,
-           DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(InfeedOutfeedTest))) {
-  ComputationBuilder builder(local_client_, TestName());
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) {
+  XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = builder.Infeed(shape);
   auto constant = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f});
@@ -907,7 +889,7 @@ void BM_LocalClientOverhead(int num_iters) {
   int device_ordinal = client->default_device_ordinal();
 
   // Use a tiny add operation as the computation.
-  ComputationBuilder builder(client, "Add");
+  XlaBuilder builder("Add");
   auto shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto x = builder.Parameter(0, shape, "x");
   builder.Add(x, x);
@@ -919,12 +901,12 @@ void BM_LocalClientOverhead(int num_iters) {
           .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<float>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *literal, *buffer));
+      executors[device_ordinal], *literal, buffer));
 
   const int kWarmups = 2;
 
   auto executable_status = client->Compile(
-      computation, {&buffer->on_host_shape()}, ExecutableBuildOptions());
+      computation, {&buffer.on_host_shape()}, ExecutableBuildOptions());
   ASSERT_IS_OK(executable_status);
   std::unique_ptr<LocalExecutable> executable =
       executable_status.ConsumeValueOrDie();
@@ -936,13 +918,13 @@ void BM_LocalClientOverhead(int num_iters) {
   run_options.set_allocator(&allocator).set_stream(&stream);
 
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 96b976d25d75d35f46adfd104a03aceb363661eb..e859b3059eea86b362443c3269f99ccae941dfe2 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -35,19 +35,21 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-StatusOr<perftools::gputools::DeviceMemoryBase> TestAllocator::Allocate(
-    int device_ordinal, uint64 size, bool retry_on_failure) {
+StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
+                                                       uint64 size,
+                                                       bool retry_on_failure) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size);
+  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
+                                                 retry_on_failure);
 }
 
-tensorflow::Status TestAllocator::Deallocate(
-    int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
+tensorflow::Status TestAllocator::Deallocate(int device_ordinal,
+                                             se::DeviceMemoryBase* mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
@@ -88,7 +90,7 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const {
 }
 
 /* static */ TestAllocator* LocalClientTestBase::GetOrCreateAllocator(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
   tensorflow::mutex_lock lock(mu);
 
@@ -115,8 +117,7 @@ struct LocalClientTestBase::EigenThreadPoolWrapper {
   std::unique_ptr<Eigen::ThreadPoolDevice> device;
 };
 
-LocalClientTestBase::LocalClientTestBase(
-    perftools::gputools::Platform* platform)
+LocalClientTestBase::LocalClientTestBase(se::Platform* platform)
     : local_client_(
           ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()),
       thread_pool_wrapper_(new EigenThreadPoolWrapper()) {
@@ -128,7 +129,7 @@ LocalClientTestBase::LocalClientTestBase(
 
 LocalClientTestBase::~LocalClientTestBase() {}
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::LiteralToShapedBuffer(
+ScopedShapedBuffer LocalClientTestBase::LiteralToShapedBuffer(
     const Literal& literal) {
   return local_client_
       ->LiteralToShapedBuffer(literal, local_client_->default_device_ordinal())
@@ -148,23 +149,21 @@ ExecutableBuildOptions LocalClientTestBase::DefaultExecutableBuildOptions()
 
 ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
   ExecutableRunOptions run_options;
-  run_options.set_inter_op_thread_pool(
-      local_client_->backend().inter_op_thread_pool());
   run_options.set_intra_op_thread_pool(thread_pool_wrapper_->device.get());
   run_options.set_allocator(GetOrCreateAllocator(local_client_->platform()));
   return run_options;
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
-    const Computation& computation,
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions())
       .ConsumeValueOrDie();
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
-    const Computation& computation,
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
     const ExecutableRunOptions& run_options) {
@@ -172,17 +171,15 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
-    const Computation& computation,
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions());
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
-    const Computation& computation,
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
     const ExecutableRunOptions& run_options) {
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index f0c73f04f6eb67b2e9cb5e111eccdc3818059b2b..3bbb760c806412a671bc2502846e123e2582fd16 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -21,8 +21,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -41,15 +41,15 @@ namespace xla {
 
 class TestAllocator : public StreamExecutorMemoryAllocator {
  public:
-  explicit TestAllocator(perftools::gputools::Platform* platform)
+  explicit TestAllocator(se::Platform* platform)
       : StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
   }
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure) override;
-  tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+  tensorflow::Status Deallocate(int device_ordinal,
+                                se::DeviceMemoryBase* mem) override;
 
   // Return the number of allocations that have been performed.
   int64 allocation_count() const;
@@ -75,18 +75,15 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
 class LocalClientTestBase : public ::testing::Test {
  protected:
   struct EigenThreadPoolWrapper;
-  explicit LocalClientTestBase(
-      perftools::gputools::Platform* platform = nullptr);
+  explicit LocalClientTestBase(se::Platform* platform = nullptr);
   virtual ~LocalClientTestBase();
 
-  static TestAllocator* GetOrCreateAllocator(
-      perftools::gputools::Platform* platform);
+  static TestAllocator* GetOrCreateAllocator(se::Platform* platform);
 
   // Copy the given literal onto the default device and return a
   // ScopedShapedBuffer. Convenience wrapper around
   // LocalClient::LiteralToShapedBuffer.
-  std::unique_ptr<ScopedShapedBuffer> LiteralToShapedBuffer(
-      const Literal& literal);
+  ScopedShapedBuffer LiteralToShapedBuffer(const Literal& literal);
 
   // Construct and return a literal containing the array represented by
   // shaped_buffer.
@@ -95,20 +92,20 @@ class LocalClientTestBase : public ::testing::Test {
 
   // Execute the given computation on the local client. With and without
   // options.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
-      const Computation& computation,
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
-      const Computation& computation,
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
       const ExecutableRunOptions& run_options);
 
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
-      const Computation& computation,
+  ScopedShapedBuffer ExecuteLocallyOrDie(
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
-      const Computation& computation,
+  ScopedShapedBuffer ExecuteLocallyOrDie(
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
       const ExecutableRunOptions& run_options);
@@ -128,7 +125,7 @@ class LocalClientTestBase : public ::testing::Test {
   // of the process. So make the allocator static.
   static TestAllocator* allocator_;
 
-  perftools::gputools::StreamExecutor* stream_executor_;
+  se::StreamExecutor* stream_executor_;
   TransferManager* transfer_manager_;
 
   LocalClient* local_client_;
diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc
index 174d433a9e17312c3548668feeeb2e92712c87f8..c0c02e584c2348f64a9d7d0800038f5ca67a2171 100644
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <cmath>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -29,7 +29,7 @@ namespace {
 class LogTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(LogTest, LogZeroValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR3FromArray3D<float>(Array3D<float>(3, 0, 0));
   builder.Log(x);
 
@@ -41,7 +41,7 @@ TEST_F(LogTest, LogTenValues) {
   std::vector<float> input = {-0.0, 1.0, 2.0,  -3.0, -4.0,
                               5.0,  6.0, -7.0, -8.0, 9.0};
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(input);
   builder.Log(x);
 
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 2b0f7e6e80c48435ca55432a2afa3b6d69162625..7df45bebebdd3eb2e71f27d831a8e2ac9e3b5f7c 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -39,7 +39,7 @@ namespace {
 
 class MapTest : public ClientLibraryTestBase {
  public:
-  explicit MapTest(perftools::gputools::Platform* platform = nullptr)
+  explicit MapTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
@@ -50,18 +50,18 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOne() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOne() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
-    auto adder_to_one = mapped_builder.Add(x, one);
+    mapped_builder.Add(x, one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
   }
 
-  Computation CreateMax() {
-    ComputationBuilder b(client_, TestName());
+  XlaComputation CreateMax() {
+    XlaBuilder b(TestName());
     auto lhs = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto rhs = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     b.Max(lhs, rhs);
@@ -73,8 +73,8 @@ class MapTest : public ClientLibraryTestBase {
   // Creates a computation that accepts an F32 and returns T(1) (ignoring the
   // argument).
   template <class T>
-  Computation CreateScalarOne() {
-    ComputationBuilder mapped_builder(client_, "scalar_one");
+  XlaComputation CreateScalarOne() {
+    XlaBuilder mapped_builder("scalar_one");
     (void)mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     mapped_builder.ConstantR0<T>(1);
     auto computation_status = mapped_builder.Build();
@@ -87,11 +87,11 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (mul)
   //                /
   // 2.0f ---------/
-  Computation CreateMulByTwo() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateMulByTwo() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto two = mapped_builder.ConstantR0<float>(2.0);
-    auto mul_by_two = mapped_builder.Mul(x, two);
+    mapped_builder.Mul(x, two);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -105,12 +105,12 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add) ----> (mul)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOneTimesItself() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOneTimesItself() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
     auto adder_to_one = mapped_builder.Add(x, one);
-    auto result = mapped_builder.Mul(x, adder_to_one);
+    mapped_builder.Mul(x, adder_to_one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -122,12 +122,13 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} -----------> (map) ----> (add)
   //                         /           /
   // embedded_computation --/       n --/
-  Computation CreateMapPlusN(const Computation& embedded_computation, float n) {
-    ComputationBuilder builder(client_, TestName());
+  XlaComputation CreateMapPlusN(const XlaComputation& embedded_computation,
+                                float n) {
+    XlaBuilder builder(TestName());
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto map = builder.Map({x}, embedded_computation, {});
     auto constant_n = builder.ConstantR0<float>(n);
-    auto add = builder.Add(map, constant_n);
+    builder.Add(map, constant_n);
     auto computation_status = builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -135,11 +136,11 @@ class MapTest : public ClientLibraryTestBase {
 
   // Creates a binary function with signature (F32, F32) -> Pred
   // defined by (x, y) -> x > y.
-  Computation CreateGt() {
-    ComputationBuilder b(client_, "Gt");
+  XlaComputation CreateGt() {
+    XlaBuilder b("Gt");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
-    auto gt = b.Gt(x, y);
+    b.Gt(x, y);
     auto computation_status = b.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -152,13 +153,13 @@ class MapTest : public ClientLibraryTestBase {
   // y {R0F32} ----> (add) ---> (add)
   //                           /
   // z {R0F32} ---------------/
-  Computation CreateTernaryAdder() {
-    ComputationBuilder mapped_builder(client_, "TernaryAdder");
+  XlaComputation CreateTernaryAdder() {
+    XlaBuilder mapped_builder("TernaryAdder");
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = mapped_builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     auto z = mapped_builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "z");
     auto xy = mapped_builder.Add(x, y);
-    auto xyz = mapped_builder.Add(xy, z);
+    mapped_builder.Add(xy, z);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -167,13 +168,13 @@ class MapTest : public ClientLibraryTestBase {
 
 TEST_F(MapTest, MapEachElemPlusOneR0) {
   // Applies lambda (x) (+ x 1)) to an input scalar.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(42.0);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {});
+  builder.Map({param}, CreateAdderToOne(), {});
 
   ComputeAndCompareR0<float>(&builder, 43.0, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -181,13 +182,13 @@ TEST_F(MapTest, MapEachElemPlusOneR0) {
 
 XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -195,55 +196,55 @@ XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
 
 TEST_F(MapTest, MapEachElemPlusOneR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {3.2f, 4.3f, 5.4f, 6.5f},
                              {param0_data.get()}, ErrorSpec(0.01f));
 }
 
 TEST_F(MapTest, MapEachF32ElementToS32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<int32>(), {0});
+  builder.Map({param}, CreateScalarOne<int32>(), {0});
 
   ComputeAndCompareR1<int32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachF32ElementToU32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<uint32>(), {0});
+  builder.Map({param}, CreateScalarOne<uint32>(), {0});
 
   ComputeAndCompareR1<uint32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachElemLongerChainR1) {
   // Maps (lambda (x) (* (+ x 1) x)) onto an input R1F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOneTimesItself(), {0});
+  builder.Map({param}, CreateAdderToOneTimesItself(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {9.36f, 20.91f, 0.11f, 0.24f, 999000.0f, 65535.75f},
@@ -253,14 +254,14 @@ TEST_F(MapTest, MapEachElemLongerChainR1) {
 XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -269,7 +270,7 @@ XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
 TEST_F(MapTest, MapMultipleMapsR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -277,7 +278,7 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {6.4f, 8.6f, 10.8f, 13.0f},
                              {param0_data.get()}, ErrorSpec(0.01f));
@@ -285,14 +286,14 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
 TEST_F(MapTest, MapEachElemPlusOneR2) {
   // Maps (lambda (x) (+ x 1)) onto an input R2F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{13.25f, 14.0f}, {-7.1f, -7.2f}, {-8.8f, 8.8f}});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0, 1});
+  builder.Map({param}, CreateAdderToOne(), {0, 1});
 
   Array2D<float> expected_array(
       {{14.25f, 15.0f}, {-6.1f, -6.2f}, {-7.8f, 9.8f}});
@@ -317,18 +318,18 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) {
   auto embed2 = CreateMapPlusN(embed1, 2.0);
   auto embed3 = CreateMapPlusN(embed1, 4.0);
 
-  ComputationBuilder embed4_builder(client_, "embed4");
+  XlaBuilder embed4_builder("embed4");
   auto embed4_param = embed4_builder.Parameter(0, scalar_shape, "x");
   auto embed4_map_lhs = embed4_builder.Map({embed4_param}, embed2, {});
   auto embed4_map_rhs = embed4_builder.Map({embed4_param}, embed3, {});
-  auto embed4_add = embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
+  embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
   auto embed4_status = embed4_builder.Build();
   ASSERT_IS_OK(embed4_status.status());
   auto embed4 = embed4_status.ConsumeValueOrDie();
 
   auto embed5 = CreateMapPlusN(embed2, 6.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto constant_42 = builder.ConstantR0<float>(42.0);
   auto constant_7 = builder.ConstantR0<float>(7.0);
   auto map_42 = builder.Map({constant_42}, embed5, {});
@@ -338,50 +339,9 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) {
   ComputeAndCompareR0<float>(&builder, 73.0, {}, ErrorSpec(0.01f));
 }
 
-TEST_F(MapTest, VersionedEmbeddedComputation) {
-  // Build a computation X, use it in a map, then add an additional operation to
-  // computation X and use it again in a different map. Verify that the proper
-  // versions of computation X are used in each of the maps.
-
-  // Create a (embedded) computation which adds one to its parameter argument.
-  ComputationBuilder embedded_builder(client_, "EmbeddedComputation");
-  auto param_0 =
-      embedded_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
-  auto constant_one = embedded_builder.ConstantR0<float>(1.0);
-  auto adder_to_one = embedded_builder.Add(param_0, constant_one);
-  auto computation_status = embedded_builder.Build();
-  ASSERT_IS_OK(computation_status.status());
-  auto embedded_computation = computation_status.ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto constant_vector = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
-  auto map_plus_1 = builder.Map({constant_vector}, embedded_computation, {0});
-
-  // Add another Add(1) operation to the existing embedded computation. This
-  // requires using the stub interface because the ComputationBuilder does not
-  // allow modification to the Computation objects after they have been built.
-  BinaryOpRequest request;
-  request.set_binop(BINOP_ADD);
-  *request.mutable_lhs() = adder_to_one;
-  *request.mutable_rhs() = constant_one;
-  OpRequest op_request;
-  *op_request.mutable_computation() = embedded_computation.handle();
-  *op_request.mutable_binary_op_request() = request;
-  OpResponse response;
-  tensorflow::Status s = client_->stub()->Op(&op_request, &response);
-  ASSERT_TRUE(s.ok());
-
-  auto map_plus_2 = builder.Map({map_plus_1}, embedded_computation, {0});
-
-  // The original vector has Add(1) applied to it with a map, followed by
-  // Add(1+1) resulting in a net Add(3).
-  ComputeAndCompareR1<float>(&builder, {4.0, 5.0, 6.0, 7.0}, {},
-                             ErrorSpec(0.01f));
-}
-
 TEST_F(MapTest, MapBinaryAdder) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -393,8 +353,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(F32, &builder), {0});
+  builder.Map({param0, param1}, CreateScalarAddComputation(F32, &builder), {0});
 
   ComputeAndCompareR1<float>(&builder, {7.3f, 7.7, 4.3f, 0},
                              {param0_data.get(), param1_data.get()},
@@ -404,7 +363,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 // Adds two rank-2 arrays with different layouts. This test exercises a path
 // for Map that used to fail in shape inference (b/28989438).
 XLA_TEST_F(MapTest, AddWithMixedLayouts) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2WithLayout(
       {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0}));
   std::unique_ptr<GlobalData> param0_data =
@@ -417,8 +376,8 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1});
 
   Array2D<int32> expected(2, 2);
   expected(0, 0) = 11;
@@ -430,7 +389,7 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 }
 
 XLA_TEST_F(MapTest, AddR3_3x0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
   std::unique_ptr<GlobalData> param0_data =
@@ -443,8 +402,8 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1, 2});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1, 2});
 
   ComputeAndCompareR3<int32>(&builder, Array3D<int32>(3, 0, 2),
                              {param0_data.get(), param1_data.get()});
@@ -452,7 +411,7 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
 TEST_F(MapTest, MapTernaryAdder) {
   // Maps (lambda (x y z) (+ x y z)) onto three R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -469,7 +428,7 @@ TEST_F(MapTest, MapTernaryAdder) {
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
   auto param2 = builder.Parameter(2, param2_literal->shape(), "param2");
-  auto map = builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
+  builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {-2.7f, -92.3f, -895.7f, -400.0f},
@@ -479,24 +438,24 @@ TEST_F(MapTest, MapTernaryAdder) {
 
 TEST_F(MapTest, MapGt) {
   // Maps (x,y) -> x > y onto two R1F32 vectors.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto gt = CreateGt();
   b.Map({b.ConstantR1<float>({1, 20}), b.ConstantR1<float>({10, 2})}, gt, {0});
   ComputeAndCompareR1<bool>(&b, {false, true}, {});
 }
 
 TEST_F(MapTest, NestedBinaryMap) {
-  Computation max_with_square;
+  XlaComputation max_with_square;
   {
     // max_with_square(x) = do max(x, x^2) via a map.
-    ComputationBuilder b(client_, "max_with_square");
+    XlaBuilder b("max_with_square");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     b.Map({x, b.Mul(x, x)}, CreateMax(), {});
     auto computation_status = b.Build();
     ASSERT_IS_OK(computation_status.status());
     max_with_square = computation_status.ConsumeValueOrDie();
   }
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = b.ConstantR1<float>({0.1f, 0.5f, -0.5f, 1.0f, 2.0f});
   b.Map({input}, max_with_square, {0});
   ComputeAndCompareR1<float>(&b, {0.1f, 0.5f, 0.25f, 1.0f, 4.0f}, {});
@@ -505,13 +464,13 @@ TEST_F(MapTest, NestedBinaryMap) {
 TEST_F(MapTest, MapOperantionWithBuildError) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors but uses an unsupported
   // type combination (F32 + U16) to test that the error is reported to the
-  // outermost ComputationBuilder.
-  ComputationBuilder builder(client_, TestName());
+  // outermost XlaBuilder.
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("ErrorAdd");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(U16, {}), "y");
-  auto adder = sub_builder->Add(x, y);
+  sub_builder->Add(x, y);
   auto error_add = sub_builder->BuildAndNoteError();
 
   std::unique_ptr<Literal> param0_literal =
@@ -525,13 +484,13 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1}, error_add, {0});
+  builder.Map({param0, param1}, error_add, {0});
 
-  StatusOr<Computation> computation_status = builder.Build();
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
-      ::testing::HasSubstr("error from: ErrorAdd: binary op BINOP_ADD with "
+      ::testing::HasSubstr("error from: ErrorAdd: Binary op BINOP_ADD with "
                            "different element types: f32[] and u16[]"));
 }
 
@@ -545,7 +504,7 @@ using MapTestWithFullOpt = ClientLibraryTestBase;
 // to have issues with such patterns and maybe invalidate the pointer to entry
 // computation.
 TEST_F(MapTestWithFullOpt, MapScalarPower) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -572,7 +531,7 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) {
 // Regression test for b/35786417, where the inliner would not notice the change
 // of parameter order inside the map.
 TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -598,7 +557,7 @@ TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
 // Regression test for b/35786417, where the inliner would CHECK-fail due to the
 // mul inside the map having more parameters than the map does.
 TEST_F(MapTestWithFullOpt, MapSquare) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 6c86dd5b9ef673c9facffafa37e00a859ce82010..7fa61eb33c2930ac8192ac965a71122001f808d3 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -19,8 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -29,6 +30,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
@@ -38,258 +41,222 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class MatOpsSimpleTest : public ClientLibraryTestBase {
- protected:
-  Computation BuildSum() {
-    // sum(x, y) = x + y
-    ComputationBuilder builder(client_, "sum");
-    auto x_value =
-        builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value");
-    auto y_value =
-        builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y_value");
-    builder.Add(x_value, y_value);
-    auto computation_status = builder.Build();
-    TF_CHECK_OK(computation_status.status());
-    return computation_status.ConsumeValueOrDie();
-  }
-
-  void TestLinspaceMax(int64 rows, int64 cols) {
-    float from = -128.0, to = 256.0;
-    std::unique_ptr<Array2D<float>> alhs =
-        MakeLinspaceArray2D(from, to, rows, cols);
-    auto arhs = MakeUnique<Array2D<float>>(rows, cols, 1.0);
-
-    ComputationBuilder builder(
-        client_,
-        tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols));
-    auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
-    auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
-    auto max = builder.Max(lhs, rhs);
-
-    Array2D<float> aexpected(rows, cols);
-    for (int row = 0; row < rows; ++row) {
-      for (int col = 0; col < cols; ++col) {
-        aexpected(row, col) = std::max((*alhs)(row, col), (*arhs)(row, col));
-      }
-    }
-
-    ComputeAndCompareR2<float>(&builder, aexpected, {}, ErrorSpec(1e-6));
-  }
-};
-
-TEST_F(MatOpsSimpleTest, ExpTwoByTwoValues) {
-  ComputationBuilder builder(client_, "exp_2x2");
-  auto data = builder.ConstantR2<float>({
-      {1.0, 0.0},   // row 0
-      {-1.0, 0.5},  // row 1
+#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+using TypesF16F32 = ::testing::Types<float>;
+#else
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+#endif
+
+class MatOpsSimpleTest : public ClientLibraryTestBase {};
+
+template <typename T>
+class MatOpsSimpleTest_F16F32 : public MatOpsSimpleTest {};
+
+// TODO(bixia): This test for F16 failed on GPU 02-25-2018.
+#ifdef XLA_TEST_BACKEND_GPU
+TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, ::testing::Types<float>);
+#else
+TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, TypesF16F32);
+#endif
+
+XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) {
+  using T = TypeParam;
+  XlaBuilder builder("exp_2x2");
+  auto data = builder.ConstantR2FromArray2D<T>({
+      {1.0f, 0.0f},   // row 0
+      {-1.0f, 0.5f},  // row 1
   });
   builder.Exp(data);
 
   std::unique_ptr<Literal> expected =
-      Literal::CreateR2<float>({{2.71828, 1.00000},    // row 0
-                                {0.36788, 1.64872}});  // row 1
+      Literal::CreateR2FromArray2D<T>({{2.71828f, 1.00000f},    // row 0
+                                       {0.36788f, 1.64872f}});  // row 1
 
-  ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5));
+  this->template ComputeAndCompareLiteral(&builder, *expected, {},
+                                          ErrorSpec(1e-5));
 }
 
-TEST_F(MatOpsSimpleTest, MapTwoByTwo) {
-  Computation add_half;
+XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) {
+  using T = TypeParam;
+  XlaComputation add_half;
   {
     // add_half(x) = x + 0.5
-    ComputationBuilder builder(client_, "add_half");
+    XlaBuilder builder("add_half");
     auto x_value =
-        builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value");
-    auto half = builder.ConstantR0<float>(0.5);
+        builder.Parameter(0, ShapeUtil::MakeShapeWithType<T>({}), "x_value");
+    auto half = builder.ConstantR0<T>(static_cast<T>(0.5));
     builder.Add(x_value, half);
     auto computation_status = builder.Build();
     ASSERT_IS_OK(computation_status.status());
     add_half = computation_status.ConsumeValueOrDie();
   }
 
-  ComputationBuilder builder(client_, "map_2x2");
-  auto data = builder.ConstantR2<float>({
-      {1.0, 0.0},   // row 0
-      {-1.0, 0.5},  // row 1
+  XlaBuilder builder("map_2x2");
+  auto data = builder.ConstantR2FromArray2D<T>({
+      {1.0f, 0.0f},   // row 0
+      {-1.0f, 0.5f},  // row 1
   });
   auto map = builder.Map({data}, add_half, {0, 1});
 
   std::unique_ptr<Literal> expected =
-      Literal::CreateR2<float>({{1.5, 0.5},     // row 0
-                                {-0.5, 1.0}});  // row 1
-  ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5));
+      Literal::CreateR2FromArray2D<T>({{1.5f, 0.5f},     // row 0
+                                       {-0.5f, 1.0f}});  // row 1
+  this->template ComputeAndCompareLiteral(&builder, *expected, {},
+                                          ErrorSpec(1e-5));
 }
 
-TEST_F(MatOpsSimpleTest, MaxTwoByTwoValues) {
-  ComputationBuilder builder(client_, "max_2x2");
-  auto lhs = builder.ConstantR2<float>({
-      {7.0, 2.0},   // row 0
-      {3.0, -4.0},  // row 1
+XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) {
+  using T = TypeParam;
+  XlaBuilder builder("max_2x2");
+  auto lhs = builder.ConstantR2FromArray2D<T>({
+      {7.0f, 2.0f},   // row 0
+      {3.0f, -4.0f},  // row 1
   });
-  auto rhs = builder.ConstantR2<float>({
-      {5.0, 6.0},   // row 0
-      {1.0, -8.0},  // row 1
+  auto rhs = builder.ConstantR2FromArray2D<T>({
+      {5.0f, 6.0f},   // row 0
+      {1.0f, -8.0f},  // row 1
   });
   auto max = builder.Max(lhs, rhs);
 
   std::unique_ptr<Literal> expected =
-      Literal::CreateR2<float>({{7.0, 6.0},     // row 0
-                                {3.0, -4.0}});  // row 1
-  ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6));
+      Literal::CreateR2FromArray2D<T>({{7.0f, 6.0f},     // row 0
+                                       {3.0f, -4.0f}});  // row 1
+  this->template ComputeAndCompareLiteral(&builder, *expected, {},
+                                          ErrorSpec(1e-6));
 }
 
-TEST_F(MatOpsSimpleTest, Max1x1Linspace) { TestLinspaceMax(1, 1); }
-
-TEST_F(MatOpsSimpleTest, Max2x2Linspace) { TestLinspaceMax(2, 2); }
-
-TEST_F(MatOpsSimpleTest, Max3x3Linspace) { TestLinspaceMax(3, 3); }
-
-TEST_F(MatOpsSimpleTest, Max4x4Linspace) { TestLinspaceMax(4, 4); }
-
-TEST_F(MatOpsSimpleTest, Max6x6Linspace) { TestLinspaceMax(6, 6); }
-
-TEST_F(MatOpsSimpleTest, Max8x8Linspace) { TestLinspaceMax(8, 8); }
-
-TEST_F(MatOpsSimpleTest, Max12x12Linspace) { TestLinspaceMax(12, 12); }
-
-TEST_F(MatOpsSimpleTest, Max16x16Linspace) { TestLinspaceMax(16, 16); }
+struct TestLinspaceMaxParam {
+  int64 rows;
+  int64 cols;
+};
 
-TEST_F(MatOpsSimpleTest, Max32x8Linspace) { TestLinspaceMax(32, 8); }
+class TestLinspaceMaxParametric
+    : public MatOpsSimpleTest,
+      public ::testing::WithParamInterface<TestLinspaceMaxParam> {
+ public:
+  template <typename T>
+  void TestImpl() {
+    TestLinspaceMaxParam param = GetParam();
+    int64 rows = param.rows;
+    int64 cols = param.cols;
+    float from = -128.0, to = 256.0;
+    std::unique_ptr<Array2D<T>> alhs =
+        MakeLinspaceArray2D<T>(from, to, rows, cols);
+    auto arhs = MakeUnique<Array2D<T>>(rows, cols, static_cast<T>(1.0f));
 
-TEST_F(MatOpsSimpleTest, Max64x8Linspace) { TestLinspaceMax(64, 8); }
+    XlaBuilder builder(
+        tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols));
+    auto lhs = builder.ConstantR2FromArray2D<T>(*alhs);
+    auto rhs = builder.ConstantR2FromArray2D<T>(*arhs);
+    auto max = builder.Max(lhs, rhs);
 
-class MatOpsDotAddTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {};
-
-TEST_P(MatOpsDotAddTest, Dot_Add_2x2_2x2) {
-  bool row_major = std::get<0>(GetParam());
-  bool add_lhs = std::get<1>(GetParam());
-  bool transpose = std::get<2>(GetParam());
-  Array2D<float> lhs({{1.0, 2.0}, {3.0, 4.0}});
-  Array2D<float> rhs({{10.0, 11.0}, {12.0, 13.0}});
-
-  auto minor_to_major = [](bool row_major) -> std::vector<int64> {
-    return {row_major ? 1 : 0, row_major ? 0 : 1};
-  };
-
-  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
-  Shape lhs_shape =
-      ShapeUtil::MakeShape(prim_type, {lhs.height(), lhs.width()});
-  Shape rhs_shape =
-      ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto lhs_handle,
-      client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<float>(
-          lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto rhs_handle,
-      client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<float>(
-          rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
-
-  ComputationBuilder builder(client_, TestName());
-  auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
-  auto lhs_mat_arg = lhs_arg;
-  if (transpose) {
-    lhs_mat_arg = builder.Transpose(lhs_mat_arg, {1, 0});
-  }
-  auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs");
-  auto result = builder.Dot(lhs_mat_arg, rhs_arg);
-  Array2D<float> expected;
-  if (add_lhs) {
-    result = builder.Add(result, lhs_arg);
-    if (transpose) {
-      expected = Array2D<float>({{47, 52}, {71, 78}});
-    } else {
-      expected = Array2D<float>({{35, 39}, {81, 89}});
+    Array2D<T> expected(rows, cols);
+    for (int row = 0; row < rows; ++row) {
+      for (int col = 0; col < cols; ++col) {
+        expected(row, col) = std::max<T>((*alhs)(row, col), (*arhs)(row, col));
+      }
     }
-  } else {
-    result = builder.Add(result, rhs_arg);
-    if (transpose) {
-      expected = Array2D<float>({{56, 61}, {80, 87}});
-    } else {
-      expected = Array2D<float>({{44, 48}, {90, 98}});
+    ErrorSpec error_spec(1e-6);
+    if (std::is_same<Eigen::half, T>::value) {
+      error_spec = ErrorSpec(1e-6, 2e-4);
     }
+    ComputeAndCompareR2<T>(&builder, expected, {}, error_spec);
   }
+};
 
-  ComputeAndCompareR2<float>(&builder, expected,
-                             {lhs_handle.get(), rhs_handle.get()},
-                             ErrorSpec(1e-6));
+string PrintTestLinspaceMaxParam(
+    const ::testing::TestParamInfo<TestLinspaceMaxParam>& test_param) {
+  const TestLinspaceMaxParam& param = test_param.param;
+  return tensorflow::strings::StrCat(param.rows, "r", param.cols, "c");
 }
 
-INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest,
-                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
-                                           ::testing::Bool()));
+#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+// TODO(bixia): This test failed on GPU 02-25-2018
+#ifdef XLA_TEST_BACKEND_CPU
+XLA_TEST_P(TestLinspaceMaxParametric, TestF16) { TestImpl<Eigen::half>(); }
+#endif
+#endif
+XLA_TEST_P(TestLinspaceMaxParametric, TestF32) { TestImpl<float>(); }
+
+INSTANTIATE_TEST_CASE_P(
+    TestLinspaceMax, TestLinspaceMaxParametric,
+    ::testing::Values(TestLinspaceMaxParam{1, 1}, TestLinspaceMaxParam{2, 2},
+                      TestLinspaceMaxParam{3, 3}, TestLinspaceMaxParam{4, 4},
+                      TestLinspaceMaxParam{6, 6}, TestLinspaceMaxParam{8, 8},
+                      TestLinspaceMaxParam{12, 12},
+                      TestLinspaceMaxParam{16, 16}, TestLinspaceMaxParam{32, 8},
+                      TestLinspaceMaxParam{64, 8}),
+    PrintTestLinspaceMaxParam);
 
-class MatOpsDotAddTest_bf16
+class MatOpsDotAddTest
     : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {};
-
-TEST_P(MatOpsDotAddTest_bf16, Dot_Add_2x2_2x2) {
-  bool row_major = std::get<0>(GetParam());
-  bool add_lhs = std::get<1>(GetParam());
-  bool transpose = std::get<2>(GetParam());
-  Array2D<bfloat16> lhs(
-      {{bfloat16(1.0f), bfloat16(2.0f)}, {bfloat16(3.0), bfloat16(4.0)}});
-  Array2D<bfloat16> rhs(
-      {{bfloat16(10.0f), bfloat16(11.0f)}, {bfloat16(12.0f), bfloat16(13.0f)}});
-
-  auto minor_to_major = [](bool row_major) -> std::vector<int64> {
-    return {row_major ? 1 : 0, row_major ? 0 : 1};
-  };
-
-  auto prim_type = primitive_util::NativeToPrimitiveType<bfloat16>();
-  Shape lhs_shape =
-      ShapeUtil::MakeShape(prim_type, {lhs.height(), lhs.width()});
-  Shape rhs_shape =
-      ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto lhs_handle,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2DWithLayout<bfloat16>(
-              lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto rhs_handle,
-      client_->TransferToServer(
-          *Literal::CreateR2FromArray2DWithLayout<bfloat16>(
-              rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
-
-  ComputationBuilder builder(client_, TestName());
-  auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
-  auto lhs_mat_arg = lhs_arg;
-  if (transpose) {
-    lhs_mat_arg = builder.Transpose(lhs_mat_arg, {1, 0});
-  }
-  auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs");
-  auto result = builder.Dot(lhs_mat_arg, rhs_arg);
-  Array2D<bfloat16> expected;
-  if (add_lhs) {
-    result = builder.Add(result, lhs_arg);
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+  template <typename T>
+  void TestImpl() {
+    bool row_major = std::get<0>(GetParam());
+    bool add_lhs = std::get<1>(GetParam());
+    bool transpose = std::get<2>(GetParam());
+    Array2D<T> lhs({{1.0f, 2.0f}, {3.0f, 4.0f}});
+    Array2D<T> rhs({{10.0f, 11.0f}, {12.0f, 13.0f}});
+
+    auto minor_to_major = [](bool row_major) -> std::vector<int64> {
+      return {row_major ? 1 : 0, row_major ? 0 : 1};
+    };
+
+    auto prim_type = primitive_util::NativeToPrimitiveType<T>();
+    Shape lhs_shape =
+        ShapeUtil::MakeShape(prim_type, {lhs.height(), lhs.width()});
+    Shape rhs_shape =
+        ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
+
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto lhs_handle,
+        client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<T>(
+            lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto rhs_handle,
+        client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<T>(
+            rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+
+    XlaBuilder builder(TestName());
+    auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
+    auto lhs_mat_arg = lhs_arg;
     if (transpose) {
-      expected = Array2D<bfloat16>(
-          {{bfloat16(47), bfloat16(52)}, {bfloat16(71), bfloat16(78)}});
-    } else {
-      expected = Array2D<bfloat16>(
-          {{bfloat16(35), bfloat16(39)}, {bfloat16(81), bfloat16(89)}});
+      lhs_mat_arg = builder.Transpose(lhs_mat_arg, {1, 0});
     }
-  } else {
-    result = builder.Add(result, rhs_arg);
-    if (transpose) {
-      expected = Array2D<bfloat16>(
-          {{bfloat16(56), bfloat16(61)}, {bfloat16(80), bfloat16(87)}});
+    auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs");
+    auto result = builder.Dot(lhs_mat_arg, rhs_arg);
+    Array2D<T> expected;
+    if (add_lhs) {
+      result = builder.Add(result, lhs_arg);
+      if (transpose) {
+        expected = Array2D<T>({{47.0f, 52.0f}, {71.0f, 78.0f}});
+      } else {
+        expected = Array2D<T>({{35.0f, 39.0f}, {81.0f, 89.0f}});
+      }
     } else {
-      expected = Array2D<bfloat16>(
-          {{bfloat16(44), bfloat16(48)}, {bfloat16(90), bfloat16(98)}});
+      result = builder.Add(result, rhs_arg);
+      if (transpose) {
+        expected = Array2D<T>({{56.0f, 61.0f}, {80.0f, 87.0f}});
+      } else {
+        expected = Array2D<T>({{44.0f, 48.0f}, {90.0f, 98.0f}});
+      }
     }
+
+    ComputeAndCompareR2<T>(&builder, expected,
+                           {lhs_handle.get(), rhs_handle.get()},
+                           ErrorSpec(1e-6));
   }
+};
 
-  ComputeAndCompareR2<bfloat16>(&builder, expected,
-                                {lhs_handle.get(), rhs_handle.get()},
-                                ErrorSpec(1e-6));
-}
+XLA_TEST_P(MatOpsDotAddTest, Dot_Add_2x2_2x2BF16) { TestImpl<bfloat16>(); }
+#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+XLA_TEST_P(MatOpsDotAddTest, Dot_Add_2x2_2x2F16) { TestImpl<Eigen::half>(); }
+#endif
+XLA_TEST_P(MatOpsDotAddTest, Dot_Add_2x2_2x2F32) { TestImpl<float>(); }
 
-INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest_bf16,
+INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest,
                         ::testing::Combine(::testing::Bool(), ::testing::Bool(),
                                            ::testing::Bool()));
 
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index 11c0bf7a5a5bde9edcfb7f76a5c10ac4dd77bcee..0791a71aacf7614286fe964623a3172a174d4722 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -32,7 +32,7 @@ namespace {
 class SliceTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(SliceTest, Slice2D) {
-  ComputationBuilder builder(client_, "slice_2d");
+  XlaBuilder builder("slice_2d");
   auto original = builder.ConstantR2<float>(
       {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}});
   builder.Slice(original, {2, 1}, {4, 3}, {1, 1});
@@ -42,7 +42,7 @@ XLA_TEST_F(SliceTest, Slice2D) {
 }
 
 XLA_TEST_F(SliceTest, Slice3D) {
-  ComputationBuilder builder(client_, "slice_3d");
+  XlaBuilder builder("slice_3d");
   Array3D<float> array_3d(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}});
   auto original = builder.ConstantR3FromArray3D<float>(array_3d);
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 8cef8dd34dc7b16b1e58ded67d6b6a4ba79f20db..ce295b832d79e4f00656f2893c2ba1162693dd73 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -85,7 +85,7 @@ class PadTestFloat : public PadTest,
 
 // Tests a Pad() with a zero-element input and output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -100,7 +100,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -115,7 +115,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -130,7 +130,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
         AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
@@ -138,7 +138,7 @@ XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
       {1.0f, 2.0f},  // row 0
@@ -162,7 +162,7 @@ TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -181,7 +181,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -223,7 +223,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -266,7 +266,7 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
 }
 
 XLA_TEST_F(PadTest, Pad4DU8Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<uint8>>(1, 1, 3, 2);
   Array2D<uint8> input_xy({
       {1, 2},  // row 0
@@ -290,7 +290,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
 }
 
 XLA_TEST_F(PadTest, Pad4DPredArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   // Since bool is currently not well supported, use Broadcast operation to
   // create the operand for Pad.
@@ -317,7 +317,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
 }
 
 XLA_TEST_P(PadTestFloat, Large2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   auto ones = MakeUnique<Array2D<float>>(4, 4);
   ones->Fill(1.0f);
@@ -329,15 +329,14 @@ XLA_TEST_P(PadTestFloat, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
@@ -352,15 +351,14 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, High2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -378,8 +376,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -387,7 +384,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativePadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -406,8 +403,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -415,7 +411,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 8;
   constexpr int64 in_cols = 11;
@@ -434,8 +430,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -444,20 +439,19 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
 
 // Regression test for b/31827337.
 XLA_TEST_P(PadTestFloat, ReducePad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
   ones->Fill(1.0);
   auto input = AddParam(*ones, &b);
 
-  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  XlaComputation add = CreateScalarAddComputation(FloatType(), &b);
   auto reduce =
       b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index bb7e800df84121f2045141bc366c34b94ba694ea..97dab860c06bddb2a0ffd45e48c4912c5f55d574 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -20,9 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -41,7 +42,7 @@ namespace {
 class ParamsTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(ParamsTest, ConstantR0F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(3.14159f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
@@ -53,7 +54,7 @@ XLA_TEST_F(ParamsTest, ConstantR0F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
@@ -65,7 +66,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({3.14f, -100.25f});
   std::unique_ptr<GlobalData> param0_data =
@@ -78,7 +79,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR1U8Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   string str("hello world");
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1U8(str);
   std::unique_ptr<GlobalData> param0_data =
@@ -91,7 +92,7 @@ XLA_TEST_F(ParamsTest, ConstantR1U8Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(3, 0));
   std::unique_ptr<GlobalData> param0_data =
@@ -104,7 +105,7 @@ XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR2F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}});
   std::unique_ptr<GlobalData> param0_data =
@@ -119,7 +120,7 @@ XLA_TEST_F(ParamsTest, ConstantR2F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, TwoParameters) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
@@ -156,19 +157,15 @@ XLA_TEST_F(ParamsTest, MissingParameter) {
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2");
-  auto computation = builder.Build().ConsumeValueOrDie();
+  auto computation_status = builder.Build();
 
-  auto execute_status = client_->Execute(computation, {data.get(), data.get()},
-                                         /*execution_options=*/nullptr,
-                                         /*execution_profile=*/nullptr);
-  ASSERT_EQ(execute_status.status().code(),
-            tensorflow::error::FAILED_PRECONDITION);
+  ASSERT_NE(computation_status.status(), tensorflow::Status::OK());
 }
 
 XLA_TEST_F(ParamsTest, UnusedParameter) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
@@ -188,7 +185,7 @@ XLA_TEST_F(ParamsTest, UnusedParameter) {
 XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) {
   // Build a computation with a couple unused parameters which are used in an
   // unused expression.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
@@ -214,12 +211,12 @@ XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) {
 }
 
 XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   constexpr int size = 8 * 128 * 2;
 
   std::vector<float> init_value = {{0, 1}};
   init_value.resize(size);
-  ComputationDataHandle sum_handle = builder.ConstantR1<float>(init_value);
+  XlaOp sum_handle = builder.ConstantR1<float>(init_value);
   std::vector<float> sum = {{0, 1}};
   sum.resize(size);
 
@@ -237,8 +234,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
     std::unique_ptr<Literal> literal = Literal::CreateR1<float>(sum_value);
     param_data_owner.push_back(
         client_->TransferToServer(*literal).ConsumeValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     sum_handle = builder.Add(sum_handle, param);
   }
 
@@ -262,10 +258,10 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
 // compilation.
 XLA_TEST_F(ParamsTest,
            DISABLED_ON_CPU(DISABLED_ON_GPU(ThreeThousandParameters))) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<std::unique_ptr<GlobalData>> param_data_owner;
-  ComputationDataHandle sum_handle = builder.ConstantR0<float>(0.0f);
+  XlaOp sum_handle = builder.ConstantR0<float>(0.0f);
   float target = 0.0;
   constexpr int kParamCount = 3000;
   for (int i = 0; i < kParamCount; ++i) {
@@ -273,8 +269,7 @@ XLA_TEST_F(ParamsTest,
     std::unique_ptr<Literal> literal = Literal::CreateR0<float>(i);
     param_data_owner.push_back(
         std::move(client_->TransferToServer(*literal)).ValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     sum_handle = builder.Add(sum_handle, param);
   }
 
@@ -294,25 +289,24 @@ XLA_TEST_F(ParamsTest,
 // compilation.
 XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
                            ThreeThousandParametersAndOutputElements))) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<std::unique_ptr<GlobalData>> param_data_owner;
-  ComputationDataHandle sum_handle = builder.ConstantR1<int32>({0, 0});
+  XlaOp sum_handle = builder.ConstantR1<int32>({0, 0});
   int32 target = 0;
   constexpr int kParamCount = 3000;
-  std::vector<ComputationDataHandle> params;
+  std::vector<XlaOp> params;
   for (int i = 0; i < kParamCount; ++i) {
     target += i;
     std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
     param_data_owner.push_back(
         std::move(client_->TransferToServer(*literal)).ValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     params.push_back(param);
     sum_handle = builder.Add(sum_handle, param);
   }
 
-  std::vector<ComputationDataHandle> outputs;
+  std::vector<XlaOp> outputs;
   for (int i = 0; i < kParamCount; ++i) {
     outputs.push_back(builder.Add(params[i], sum_handle));
   }
@@ -353,18 +347,17 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
 // 2017-12-12.
 XLA_TEST_F(ParamsTest,
            DISABLED_ON_CPU(DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<std::unique_ptr<GlobalData>> param_data_owner;
   constexpr int kParamCount = 1900;
-  std::vector<ComputationDataHandle> params;
+  std::vector<XlaOp> params;
   std::vector<Shape> parameter_shapes;
   for (int i = 0; i < kParamCount; ++i) {
     std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
     param_data_owner.push_back(
         std::move(client_->TransferToServer(*literal)).ValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     params.push_back(param);
     parameter_shapes.push_back(literal->shape());
   }
@@ -374,7 +367,7 @@ XLA_TEST_F(ParamsTest,
   std::unique_ptr<Literal> bool_literal = Literal::CreateR0<bool>(false);
   param_data_owner.push_back(
       std::move(client_->TransferToServer(*bool_literal)).ValueOrDie());
-  ComputationDataHandle bool_param =
+  XlaOp bool_param =
       builder.Parameter(kParamCount, bool_literal->shape(), "bool_param");
   params.push_back(bool_param);
   parameter_shapes.push_back(bool_literal->shape());
@@ -383,9 +376,9 @@ XLA_TEST_F(ParamsTest,
 
   // Create a computation for the condition: while(bool_param).
   Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes);
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto condition_parameter =
         builder.Parameter(0, while_shape, "condition_parameter");
     builder.GetTupleElement(condition_parameter, kParamCount);
@@ -394,11 +387,11 @@ XLA_TEST_F(ParamsTest,
 
   // Create a computation for the body.
   // Add {1, 1} to the each tuple element.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto body_parameter = builder.Parameter(0, while_shape, "body_parameter");
-    std::vector<ComputationDataHandle> updates;
+    std::vector<XlaOp> updates;
     for (int i = 0; i < kParamCount; ++i) {
       auto add = builder.Add(builder.GetTupleElement(body_parameter, i),
                              builder.ConstantR1<int32>({1, 1}));
@@ -413,7 +406,7 @@ XLA_TEST_F(ParamsTest,
 
   auto loop = builder.While(condition, body, init);
 
-  std::vector<ComputationDataHandle> outputs;
+  std::vector<XlaOp> outputs;
   for (int i = 0; i < kParamCount; ++i) {
     outputs.push_back(builder.GetTupleElement(loop, i));
   }
@@ -437,7 +430,7 @@ XLA_TEST_F(ParamsTest,
 #endif
 
 XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape r1f32_3 = ShapeUtil::MakeShape(F32, {3});
   Shape tuple_shape = ShapeUtil::MakeTupleShape({r1f32_3, r1f32_3});
@@ -464,7 +457,7 @@ XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) {
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
   std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
       {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Parameter(0, literal->shape(), "input");
 
   std::unique_ptr<GlobalData> data =
@@ -476,7 +469,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) {
   std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
       {{1, 3}, {2, 4}}, LayoutUtil::MakeLayout({1, 0}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Parameter(0, literal->shape(), "input");
 
   std::unique_ptr<GlobalData> data =
@@ -501,7 +494,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
     ASSERT_EQ(2, literal->Get<float>({0, 1}));
   }
   // Use the original shape in building the computation.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.Parameter(0, original, "input");
   // Use the slice operator to get an off-diagonal element.
   builder.Slice(input, {0, 1}, {1, 2}, {1, 1});
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 10e44b274a8a9f3ac28dc40d7b1938d24a9ee40c..77159efb26f3b7dd4918f24305f7269a2d6ff647 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -29,63 +29,62 @@ namespace {
 
 class PredTest : public ClientLibraryTestBase {
  protected:
-  void TestCompare(bool lhs, bool rhs, bool expected,
-                   ComputationDataHandle (ComputationBuilder::*op)(
-                       const ComputationDataHandle&,
-                       const ComputationDataHandle&,
-                       tensorflow::gtl::ArraySlice<int64>)) {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle lhs_op = builder.ConstantR0<bool>(lhs);
-    ComputationDataHandle rhs_op = builder.ConstantR0<bool>(rhs);
-    ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {});
+  void TestCompare(
+      bool lhs, bool rhs, bool expected,
+      XlaOp (XlaBuilder::*op)(const xla::XlaOp&, const xla::XlaOp&,
+                              tensorflow::gtl::ArraySlice<int64>)) {
+    XlaBuilder builder(TestName());
+    XlaOp lhs_op = builder.ConstantR0<bool>(lhs);
+    XlaOp rhs_op = builder.ConstantR0<bool>(rhs);
+    XlaOp result = (builder.*op)(lhs_op, rhs_op, {});
     ComputeAndCompareR0<bool>(&builder, expected, {});
   }
 };
 
 TEST_F(PredTest, ConstantR0PredTrue) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<bool>(true);
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
 
 TEST_F(PredTest, ConstantR0PredFalse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<bool>(false);
   ComputeAndCompareR0<bool>(&builder, false, {});
 }
 
 TEST_F(PredTest, ConstantR0PredCompareEq) {
-  TestCompare(true, false, false, &ComputationBuilder::Eq);
+  TestCompare(true, false, false, &XlaBuilder::Eq);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareNe) {
-  TestCompare(true, false, true, &ComputationBuilder::Ne);
+  TestCompare(true, false, true, &XlaBuilder::Ne);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareLe) {
-  TestCompare(true, false, false, &ComputationBuilder::Le);
+  TestCompare(true, false, false, &XlaBuilder::Le);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareLt) {
-  TestCompare(true, false, false, &ComputationBuilder::Lt);
+  TestCompare(true, false, false, &XlaBuilder::Lt);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareGe) {
-  TestCompare(true, false, true, &ComputationBuilder::Ge);
+  TestCompare(true, false, true, &XlaBuilder::Ge);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareGt) {
-  TestCompare(true, false, true, &ComputationBuilder::Gt);
+  TestCompare(true, false, true, &XlaBuilder::Gt);
 }
 
 TEST_F(PredTest, ConstantR1Pred) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({true, false, false, true});
   ComputeAndCompareR1<bool>(&builder, {true, false, false, true}, {});
 }
 
 TEST_F(PredTest, ConstantR2Pred) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<bool>({{false, true, true}, {true, false, false}});
   const string expected = R"(pred[2,3] {
@@ -96,28 +95,28 @@ TEST_F(PredTest, ConstantR2Pred) {
 }
 
 TEST_F(PredTest, AnyR1True) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({true, false});
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
 
 TEST_F(PredTest, AnyR1False) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, false});
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, false, {});
 }
 
 TEST_F(PredTest, AnyR1VacuouslyFalse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, false, {});
 }
 
 TEST_F(PredTest, AnyR2True) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({
       {false, false, false},
       {false, false, false},
@@ -128,7 +127,7 @@ TEST_F(PredTest, AnyR2True) {
 }
 
 TEST_F(PredTest, AnyR2False) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({
       {false, false, false},
       {false, false, false},
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 6aafb9fa6cb2175c478f0e9a5e16f5808cbea590..29a4f75001c688f2215745ab913df68bf2f62b76 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -52,13 +52,14 @@ class PrngTest : public ClientLibraryTestBase {
 template <typename T>
 std::unique_ptr<Literal> PrngTest::UniformTest(
     T a, T b, tensorflow::gtl::ArraySlice<int64> dims, int64 seed) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.RngUniform(
       builder.ConstantR0<T>(a), builder.ConstantR0<T>(b),
       ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<T>(), dims));
 
   SetSeed(seed);
-  auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  auto actual =
+      ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
   EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   actual->EachCell<T>([=](tensorflow::gtl::ArraySlice<int64>, T value) {
     EXPECT_LE(a, value);
@@ -81,8 +82,7 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(
-                         DISABLED_ON_CPU(ScalarBF16Tests)))) {
+XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
   for (int64 seed = 0; seed < 100; ++seed) {
     // The largest negative number smaller than zero in bf16 that's not
     // denormalized.
@@ -105,8 +105,7 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(
 }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(
-                         DISABLED_ON_CPU_PARALLEL(ScalarBF16CountTests)))) {
+XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) {
   // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75,
   // they should get similar counts.
   bfloat16 low = static_cast<bfloat16>(32.25);
@@ -141,13 +140,14 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count,
                                    int64 seed) {
   int32 sample_size = range_size * expected_count;
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.RngUniform(builder.ConstantR0<int32>(0),
                      builder.ConstantR0<int32>(range_size),
                      ShapeUtil::MakeShape(S32, {sample_size}));
 
   SetSeed(seed);
-  auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  auto actual =
+      ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
   std::vector<int32> counts(range_size, 0);
   actual->EachCell<int32>([&counts](tensorflow::gtl::ArraySlice<int64>,
                                     int32 value) { ++counts[value]; });
@@ -182,16 +182,15 @@ XLA_TEST_F(PrngTest, Uniformity256) {
 
 XLA_TEST_F(PrngTest, MapUsingRng) {
   // Build a x -> (x + U[0,1)) computation.
-  auto build_sum_rng = [this](ComputationBuilder& builder) {
+  auto build_sum_rng = [this](XlaBuilder& builder) {
     auto b = builder.CreateSubBuilder("sum_with_rng");
     auto x = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "input");
-    b->Add(x,
-           b->RngUniform(b->ConstantR0<float>(0), b->ConstantR0<float>(1),
-                         ShapeUtil::MakeShape(F32, {})));
+    b->Add(x, b->RngUniform(b->ConstantR0<float>(0), b->ConstantR0<float>(1),
+                            ShapeUtil::MakeShape(F32, {})));
     return b->BuildAndNoteError();
   };
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 5.3f, 4.4f, 5.5f});
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> param0_data,
@@ -226,7 +225,7 @@ XLA_TEST_F(PrngTest, MapUsingRng) {
 XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
   // Build a U[0,1) computation.
   auto build_computation = [this]() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.RngUniform(builder.ConstantR0<float>(0),
                        builder.ConstantR0<float>(1),
                        ShapeUtil::MakeShape(F32, {10}));
@@ -282,24 +281,24 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
 }
 
 XLA_TEST_F(PrngTest, TenValuesN01) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.RngNormal(builder.ConstantR0<float>(0), builder.ConstantR0<float>(1),
                     ShapeUtil::MakeShape(F32, {10}));
 
   SetSeed(42);
-  ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
   // TODO(b/25995601): Test that resultant values are reasonable
 }
 
 XLA_TEST_F(PrngTest, RngUniformCrash) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // This used to crash XLA during LLVM IR generation for CPUs.
   auto rng_uniform = builder.RngUniform(builder.ConstantR0<int32>(0),
                                         builder.ConstantR0<int32>(1000 * 1000),
                                         ShapeUtil::MakeShape(S32, {}));
   SetSeed(0);
-  ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
index 212512207cfdc4d2ebdc4e7fd8f5794852cc6a79..f95e75648343aa88bd7c39de4ee9f387f2b60506 100644
--- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
+++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -30,13 +30,13 @@ namespace {
 class QueryInferredShapeTest : public ClientLibraryTestBase {};
 
 TEST_F(QueryInferredShapeTest, OnePlusOneShape) {
-  ComputationBuilder builder(client_, "one_plus_one");
+  XlaBuilder builder("one_plus_one");
   auto one = builder.ConstantR0<float>(1.0);
   auto result = builder.Add(one, one);
-  StatusOr<std::unique_ptr<Shape>> shape_status = builder.GetShape(result);
+  StatusOr<Shape> shape_status = builder.GetShape(result);
   ASSERT_IS_OK(shape_status.status());
   auto shape = shape_status.ConsumeValueOrDie();
-  ASSERT_TRUE(ShapeUtil::Equal(*shape, ShapeUtil::MakeShape(F32, {})));
+  ASSERT_TRUE(ShapeUtil::Equal(shape, ShapeUtil::MakeShape(F32, {})));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index dc7ce3253cee255a7949326fa5b49fc8917432b8..b311785449f1774c3bc1e4d7ad35c2866e3b4061 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
@@ -228,15 +228,14 @@ XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
   // This is required for proper handling of NaN values.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({input_values});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
   auto a = builder.Parameter(0, a_literal->shape(), "a");
 
-  auto reduce_precision =
-      builder.ReducePrecision(a, exponent_bits, mantissa_bits);
+  builder.ReducePrecision(a, exponent_bits, mantissa_bits);
 
   ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()});
 }
@@ -252,7 +251,7 @@ class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -265,7 +264,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
   // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the
   // reduce-precision operation showed up in the correct place in the
   // graph.
-  auto log = builder.Log(abs);
+  builder.Log(abs);
 
   // Insert precision-reduction after the Abs(x) operation, rounding that
   // result to exactly 1.0f.
@@ -281,7 +280,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -290,7 +289,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kAbs operations.  This
   // should not see into the fusion nodes and thus should not affect the
@@ -307,7 +306,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -316,7 +315,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kFusion operations.
   auto reduce_precision_pass = execution_options_.mutable_debug_options()
@@ -331,7 +330,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -340,7 +339,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kCos operations.  This
   // should have no effect.
@@ -356,7 +355,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -365,7 +364,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kAbs operations.  This
   // should see the kAbs operation within the above fusion node.
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 50d7b5074d201d2292cf90224ef4cd37efdbb8d3..bcc05c2d41d8439b021cdf6533b5ca87c19aec1f 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -35,10 +35,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -50,6 +51,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -57,6 +59,10 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using FuncGeneratorForType = XlaComputation (*)(PrimitiveType, XlaBuilder*);
+
+using FuncGenerator = XlaComputation (*)(XlaBuilder*);
+
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
@@ -81,8 +87,8 @@ class ReduceTest : public ClientLibraryTestBase {
 
   // Runs an R1 => R0 reduction test with the given number of elements.
   void RunR1ToR0Test(int64 element_count) {
-    ComputationBuilder builder(client_, TestName());
-    Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+    XlaBuilder builder(TestName());
+    XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
     const Shape input_shape = ShapeUtil::MakeShape(F32, {element_count});
     auto input = builder.Parameter(0, input_shape, "input");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -111,13 +117,13 @@ class ReduceTest : public ClientLibraryTestBase {
   void RunR1ToR0PredTest(bool and_reduce,
                          tensorflow::gtl::ArraySlice<int> input_data) {
     const int element_count = input_data.size();
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     const Shape input_shape = ShapeUtil::MakeShape(S32, {element_count});
     auto input_par = builder.Parameter(0, input_shape, "input");
     auto pred_values =
         builder.Eq(input_par, builder.ConstantR1<int>(element_count, 1));
-    ComputationDataHandle init_value;
-    Computation reduce;
+    XlaOp init_value;
+    XlaComputation reduce;
     if (and_reduce) {
       init_value = builder.ConstantR0<bool>(true);
       reduce = CreateScalarAndComputation(&builder);
@@ -149,13 +155,13 @@ class ReduceTest : public ClientLibraryTestBase {
   template <int64 cols>
   void RunR2ToR1PredTest(bool and_reduce, int64 rows, int64 minor = 1,
                          int64 major = 0) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     const Shape input_shape = ShapeUtil::MakeShape(U8, {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
     auto input_pred = builder.Eq(input, builder.ConstantR0<uint8>(1));
 
-    ComputationDataHandle init_value;
-    Computation reduce_op;
+    XlaOp init_value;
+    XlaComputation reduce_op;
     if (and_reduce) {
       init_value = builder.ConstantR0<bool>(true);
       reduce_op = CreateScalarAndComputation(&builder);
@@ -194,8 +200,8 @@ class ReduceTest : public ClientLibraryTestBase {
 
   // Runs an R2 => R0 reduction test with the given number of (rows, cols).
   void RunR2ToR0Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) {
-    ComputationBuilder builder(client_, TestName());
-    Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+    XlaBuilder builder(TestName());
+    XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
     const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -222,8 +228,8 @@ class ReduceTest : public ClientLibraryTestBase {
 
   // Runs an R2 => R1 reduction test with the given number of (rows, cols).
   void RunR2ToR1Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) {
-    ComputationBuilder builder(client_, TestName());
-    Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+    XlaBuilder builder(TestName());
+    XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
     const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -253,7 +259,7 @@ class ReduceTest : public ClientLibraryTestBase {
   template <typename NativeT>
   void ComputeAndCompareGeneric(
       typename std::enable_if<std::is_floating_point<NativeT>::value,
-                              ComputationBuilder>::type* builder,
+                              XlaBuilder>::type* builder,
       tensorflow::gtl::ArraySlice<NativeT> expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
     ComputeAndCompareR1<NativeT>(builder, expected, arguments,
@@ -263,7 +269,7 @@ class ReduceTest : public ClientLibraryTestBase {
   template <typename NativeT>
   void ComputeAndCompareGeneric(
       typename std::enable_if<std::is_integral<NativeT>::value,
-                              ComputationBuilder>::type* builder,
+                              XlaBuilder>::type* builder,
       tensorflow::gtl::ArraySlice<NativeT> expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
     ComputeAndCompareR1<NativeT>(builder, expected, arguments);
@@ -271,15 +277,15 @@ class ReduceTest : public ClientLibraryTestBase {
 
   template <typename NativeT>
   void RunVectorizedReduceTestForType(
-      const std::function<Computation(ComputationBuilder*)>&
+      const std::function<XlaComputation(XlaBuilder*)>&
           reduction_function_generator,
       const std::function<NativeT(NativeT, NativeT)>&
           reference_reduction_function,
       const NativeT& initial_value) {
     const int rows = 64, cols = 128;
     const int minor = 1, major = 0;
-    ComputationBuilder builder(client_, TestName());
-    Computation reduction_function = reduction_function_generator(&builder);
+    XlaBuilder builder(TestName());
+    XlaComputation reduction_function = reduction_function_generator(&builder);
     const Shape input_shape = ShapeUtil::MakeShape(
         xla::primitive_util::NativeToPrimitiveType<NativeT>(), {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
@@ -314,7 +320,7 @@ class ReduceTest : public ClientLibraryTestBase {
   }
 
   void RunVectorizedReduceTest(
-      const std::function<Computation(PrimitiveType, ComputationBuilder*)>&
+      const std::function<XlaComputation(PrimitiveType, XlaBuilder*)>&
           reduction_function_generator_for_type,
       const std::function<float(float, float)>&
           reference_reduction_function_for_floats,
@@ -326,21 +332,21 @@ class ReduceTest : public ClientLibraryTestBase {
       uint32 unsigned_int_identity) {
     // Float version
     RunVectorizedReduceTestForType<float>(
-        [&](ComputationBuilder* builder) {
+        [&](XlaBuilder* builder) {
           return reduction_function_generator_for_type(F32, builder);
         },
         reference_reduction_function_for_floats, floating_point_identity);
 
     // Signed int version
     RunVectorizedReduceTestForType<int32>(
-        [&](ComputationBuilder* builder) {
+        [&](XlaBuilder* builder) {
           return reduction_function_generator_for_type(S32, builder);
         },
         reference_reduction_function_for_ints, signed_int_identity);
 
     // Unsigned int version
     RunVectorizedReduceTestForType<uint32>(
-        [&](ComputationBuilder* builder) {
+        [&](XlaBuilder* builder) {
           return reduction_function_generator_for_type(U32, builder);
         },
         reference_reduction_function_for_uints, unsigned_int_identity);
@@ -434,8 +440,8 @@ XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) {
 XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
   auto input = builder.Parameter(0, input_shape, "input");
   auto zero = builder.ConstantR0<float>(0.0);
@@ -465,8 +471,8 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
 XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
   auto input = builder.Parameter(0, input_shape, "input");
   auto zero = builder.ConstantR0<float>(0.0);
@@ -497,28 +503,25 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
 // Test that algebraic simplifier does not incorrectly fold a transpose into a
 // reduction operation.
 XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
-  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
-  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
-  ComputationDataHandle transpose =
-      builder.Transpose(input, /*permutation=*/{1, 0, 2});
-  ComputationDataHandle reduce =
-      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+  XlaOp input = builder.Parameter(0, input_shape, "input");
+  XlaOp zero = builder.ConstantR0<float>(0.0);
+  XlaOp transpose = builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
                           MakeFakeLiteral(input_shape));
 
-  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
-                    ErrorSpec(0.01, 1e-4));
+  ComputeAndCompare(&builder, {std::move(*input_data)}, ErrorSpec(0.01, 1e-4));
 }
 
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, 2, cols / 2});
   auto input = builder.Parameter(0, input_shape, "input");
   auto zero = builder.ConstantR0<float>(0.0);
@@ -564,7 +567,7 @@ void PrintTo(const BoundsLayout& spec, std::ostream* os) {
 
 // Add-reduces a broadcasted scalar matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto add = CreateScalarAddComputation(F32, &builder);
   auto scalar = builder.ConstantR0<float>(42.0);
   auto broadcasted = builder.Broadcast(scalar, {500, 500});
@@ -576,7 +579,7 @@ XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) {
 
 // Max-reduces a broadcasted scalar matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto max = CreateScalarMaxComputation(F32, &builder);
   auto scalar = builder.ConstantR0<float>(42.0);
   auto broadcasted = builder.Broadcast(scalar, {500, 500});
@@ -588,7 +591,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) {
 
 // Max-reduces a matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, MaxReduce2DToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto max = CreateScalarMaxComputation(F32, &builder);
   Array2D<float> input(300, 250);
   input.FillRandom(214.0f);
@@ -603,7 +606,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DToR0) {
 
 // Min-reduces matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, MinReduce2DToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min = CreateScalarMinComputation(F32, &builder);
   Array2D<float> input(150, 130);
   input.FillRandom(214.0f);
@@ -618,7 +621,7 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) {
 }
 
 XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<uint32> input({{1}, {2}});
   auto min = CreateScalarMinComputation(U32, &builder);
   auto input_literal = Literal::CreateR2FromArray2D(input);
@@ -631,7 +634,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) {
 }
 
 XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<uint32> input({{1}, {2}});
   auto max = CreateScalarMaxComputation(U32, &builder);
   auto input_literal = Literal::CreateR2FromArray2D(input);
@@ -645,7 +648,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) {
 
 // Reduces a matrix among dimension 1.
 XLA_TEST_F(ReduceTest, Reduce2DAmong1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_2d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {1});
@@ -656,7 +659,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong1) {
 
 XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) {
   // Reduce a matrix among dimensions 0 and 1 (sum it up to a scalar).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_2d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0, 1});
@@ -666,7 +669,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) {
 
 // Tests 2D matrix ReduceToRow operation.
 XLA_TEST_F(ReduceTest, Reduce2DAmongY) {
-  ComputationBuilder builder(client_, "reduce_among_y");
+  XlaBuilder builder("reduce_among_y");
   auto m = builder.ConstantLiteral(*literal_2d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0});
@@ -676,7 +679,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmongY) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {1, 2});
@@ -686,7 +689,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0, 1});
@@ -696,7 +699,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3ToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0, 1, 2});
@@ -706,7 +709,7 @@ XLA_TEST_F(ReduceTest, ReduceR3ToR0) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0});
@@ -721,7 +724,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {1});
@@ -738,7 +741,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {2});
@@ -755,60 +758,64 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Add) {
-  RunVectorizedReduceTest(CreateScalarAddComputation,
-                          [](float a, float b) { return a + b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) +
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarAddComputation),
+      [](float a, float b) { return a + b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) +
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Multiply) {
-  RunVectorizedReduceTest(CreateScalarMultiplyComputation,
-                          [](float a, float b) { return a * b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) *
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMultiplyComputation),
+      [](float a, float b) { return a * b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) *
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Max) {
-  RunVectorizedReduceTest(CreateScalarMaxComputation,
-                          [](float a, float b) { return std::max(a, b); },
-                          [](int32 a, int32 b) { return std::max(a, b); },
-                          [](uint32 a, uint32 b) { return std::max(a, b); },
-                          std::numeric_limits<float>::min(),
-                          std::numeric_limits<int32>::min(),
-                          std::numeric_limits<uint32>::min());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMaxComputation),
+      [](float a, float b) { return std::max(a, b); },
+      [](int32 a, int32 b) { return std::max(a, b); },
+      [](uint32 a, uint32 b) { return std::max(a, b); },
+      std::numeric_limits<float>::min(), std::numeric_limits<int32>::min(),
+      std::numeric_limits<uint32>::min());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Min) {
-  RunVectorizedReduceTest(CreateScalarMinComputation,
-                          [](float a, float b) { return std::min(a, b); },
-                          [](int32 a, int32 b) { return std::min(a, b); },
-                          [](uint32 a, uint32 b) { return std::min(a, b); },
-                          std::numeric_limits<float>::max(),
-                          std::numeric_limits<int32>::max(),
-                          std::numeric_limits<uint32>::max());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMinComputation),
+      [](float a, float b) { return std::min(a, b); },
+      [](int32 a, int32 b) { return std::min(a, b); },
+      [](uint32 a, uint32 b) { return std::min(a, b); },
+      std::numeric_limits<float>::max(), std::numeric_limits<int32>::max(),
+      std::numeric_limits<uint32>::max());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanAnd) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarAndComputation, [](bool a, bool b) { return a && b; }, true);
+      static_cast<FuncGenerator>(CreateScalarAndComputation),
+      [](bool a, bool b) { return a && b; }, true);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanOr) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarOrComputation, [](bool a, bool b) { return a || b; }, false);
+      static_cast<FuncGenerator>(CreateScalarOrComputation),
+      [](bool a, bool b) { return a || b; }, false);
 }
 
 class ReduceR3ToR2Test : public ReduceTest,
                          public ::testing::WithParamInterface<BoundsLayout> {};
 
 XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const auto& bounds = GetParam().bounds;
   Array3D<float> input_array(bounds[0], bounds[1], bounds[2]);
   //  input_array.FillRandom(3.14f, 0.05);
@@ -822,7 +829,7 @@ XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) {
 
   auto input_activations =
       builder.Parameter(0, input_literal->shape(), "input");
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   auto sum = builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f),
                             add, GetParam().reduce_dims);
 
@@ -862,8 +869,8 @@ INSTANTIATE_TEST_CASE_P(
 // IrEmitterUnnested::EmitInitializer() for the Reduce operator.  Failed on
 // 2017-07-26.
 XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OperationOnConstantAsInitValue)) {
-  ComputationBuilder builder(client_, TestName());
-  Computation max_f32 = CreateScalarMaxComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation max_f32 = CreateScalarMaxComputation(F32, &builder);
 
   auto a = builder.ConstantR0<float>(2.0f);
   auto a2 = builder.Abs(a);
@@ -884,5 +891,78 @@ XLA_TEST_F(ReduceTest, ReduceOrPredR2_64x32_To_R1) {
   RunR2ToR1PredTest</*cols=32*/ 32>(/*and_reduce=false*/ false, /*rows=64*/ 64);
 }
 
+// Tests reductions with different initial values.  There's no test macro that
+// combines TYPED_TEST and TYPED_P, so we have to do it manually.
+class ReduceInitializerTest : public ReduceTest {
+ protected:
+  template <typename T>
+  void DoTest(T initializer, int num_elems) {
+    XlaBuilder builder(TestName());
+    XlaComputation max_fn = CreateScalarMaxComputation(
+        primitive_util::NativeToPrimitiveType<T>(), &builder);
+
+    auto init = builder.ConstantR0<T>(initializer);
+    std::vector<T> input_arr(num_elems, std::numeric_limits<T>::lowest());
+    auto input_literal = Literal::CreateR1<T>(input_arr);
+    auto input_data =
+        client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+    builder.Reduce(builder.Parameter(0, input_literal->shape(), "input"), init,
+                   max_fn, {0});
+
+    ComputeAndCompareR0<T>(&builder, initializer, {input_data.get()});
+  }
+};
+
+XLA_TEST_F(ReduceInitializerTest, U8Small) { DoTest<uint8>(42, 2); }
+
+XLA_TEST_F(ReduceInitializerTest, U8BigPowerOf2) { DoTest<uint8>(42, 4096); }
+
+XLA_TEST_F(ReduceInitializerTest, U8InitializerBigNonPowerOf2) {
+  DoTest<uint8>(42, 4095);
+}
+
+XLA_TEST_F(ReduceInitializerTest, U64InitializerZero) {
+  DoTest<uint64>(0, 1024);
+}
+
+XLA_TEST_F(ReduceInitializerTest, U64InitializerOne) {
+  DoTest<uint64>(1, 1024);
+}
+
+XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) {
+  DoTest<uint64>(1234556789123, 1024);
+}
+
+// Test the operational semantic that the init value is passed on the lhs for
+// reduces. Can be tested by performing an "identity" reduce (that simply
+// returns one of the parameters). In this case, we return the rhs, which for
+// a 1D array with one element, should not be the init value.
+XLA_TEST_F(ReduceTest, ReduceIdentity) {
+  XlaBuilder builder(TestName());
+  Shape single_float = ShapeUtil::MakeShape(F32, {});
+  builder.Parameter(0, single_float, "lhs-unused");
+  builder.Parameter(1, single_float, "rhs-used");
+  auto computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  Shape operand_shape = ShapeUtil::MakeShape(F32, {1});
+  builder.Reduce(builder.Parameter(0, operand_shape, "operand"),
+                 builder.Parameter(1, single_float, "init"),
+                 computation_status.ValueOrDie(), {0});
+
+  float operand[] = {42.0f};
+  float init = 58.5f;
+  float expected = 42.0f;
+  std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(operand);
+  std::unique_ptr<GlobalData> input_global_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  std::unique_ptr<Literal> input_literal2 = Literal::CreateR0<float>(init);
+  std::unique_ptr<GlobalData> input_global_data2 =
+      client_->TransferToServer(*input_literal2).ConsumeValueOrDie();
+  ComputeAndCompareR0<float>(
+      &builder, expected, {input_global_data.get(), input_global_data2.get()},
+      ErrorSpec(0.0001));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index b11b64e40a582150d6adf29e915cd70b4bcb982b..10a3da3a387641ec45baf02d15790e32371601fa 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -21,10 +21,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -63,11 +64,9 @@ class ReduceWindowTestBase : public ClientLibraryTestBase {
 class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                          public ReduceWindowTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {
-    set_use_bfloat16(GetParam());
-  }
+  ReduceWindowTest() : builder_(TestName()) { set_use_bfloat16(GetParam()); }
 
-  void ReduceWindowAdd(const ComputationDataHandle& input,
+  void ReduceWindowAdd(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -78,16 +77,17 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMax(const ComputationDataHandle& input,
+  void ReduceWindowMax(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
     auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
-    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
-                          window_strides, padding);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMaxComputation(FloatType(), &builder_),
+                          window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMin(const ComputationDataHandle& input,
+  void ReduceWindowMin(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -97,7 +97,7 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
 };
 
 TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
@@ -252,6 +252,48 @@ TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
                            DefaultErrorSpec());
 }
 
+// Tests the super windowing logic w.r.t handling prime number of windows in a
+// major dimension with reduction.
+TEST_P(ReduceWindowTest, PrimeWindowsInReductionDimension) {
+  Array4D<float> input_array(15, 15, 4, 128);
+  input_array.FillRandom(2.f, 4.f);
+
+  int win_len = 3;
+  int win_stride = 2;
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  // Reduce only along the x and y dimensions, according to the win_len.
+  ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
+                  {win_stride, win_stride, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {win_len, win_len, 1, 1},
+      {win_stride, win_stride, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
+TEST_P(ReduceWindowTest, ReduceAlongLaneDimension) {
+  Array4D<float> input_array(19, 17, 8, 256);
+  input_array.FillWithMinorDimNum();
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input_data_handle, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
 // Tests a reduction function that is not a simple add/min/max/etc.
 XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
@@ -268,7 +310,7 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   auto rhs = b->Parameter(1, scalar, "rhs");
   b->Min(b->Add(lhs, rhs),
          CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
-  Computation reduce_fn = b->BuildAndNoteError();
+  XlaComputation reduce_fn = b->BuildAndNoteError();
 
   builder_.ReduceWindow(
       input,
@@ -296,7 +338,7 @@ TEST_P(ReduceWindowTest, R4UnitWindow) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -364,7 +406,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -386,7 +428,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -408,7 +450,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -509,7 +551,7 @@ TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
 
 TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input = builder_.Broadcast(
+  XlaOp input = builder_.Broadcast(
       CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
@@ -568,7 +610,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
   R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
@@ -579,7 +621,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
     std::unique_ptr<Literal> input_literal =
         Literal::CreateR4FromArray4DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
 
@@ -819,8 +861,7 @@ INSTANTIATE_TEST_CASE_P(
 class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {};
 
 // TODO(b/72234705): Fix the test cases failed on CPU and GPU.
-XLA_TEST_P(R4ReduceWindowAnyDimsTest,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+XLA_TEST_P(R4ReduceWindowAnyDimsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) {
   DoIt();
 }
 
@@ -920,7 +961,7 @@ class R3ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R3ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd);
 
@@ -931,7 +972,7 @@ TEST_P(R3ReduceWindowTest, Add) {
       Literal::CreateR3FromArray3DWithLayout(
           input, LayoutUtil::MakeLayout(param.layout));
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
   auto init_value =
@@ -960,45 +1001,73 @@ struct R2ReduceWindowTestData {
   int64 base_bounds[2];
   int64 window_bounds[2];
   int64 strides[2];
+  int64 pad_low[2];
+  int64 pad_high[2];
   int64 layout[2];
-  Padding padding;
   Reducer reducer;
 } kR2TestCases[] = {
     {/*base_bounds=*/{4, 18}, /*window_bounds=*/{2, 4},
-     /*strides=*/{1, 2}, /*layout=*/{0, 1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{1, 2}, /*pad_low=*/{0, 1}, /*pad_high=*/{1, 1},
+     /*layout=*/{0, 1},
+     /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{2, 5}, /*window_bounds=*/{2, 4},
-     /*strides=*/{1, 1}, /*layout=*/{0, 1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 1}, /*pad_high=*/{1, 2},
+     /*layout=*/{0, 1},
+     /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{1, 3}, /*window_bounds=*/{2, 3},
-     /*strides=*/{1, 1}, /*layout=*/{0, 1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 1}, /*pad_high=*/{1, 1},
+     /*layout=*/{0, 1},
+     /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{3, 129}, /*window_bounds=*/{1, 100},
-     /*strides=*/{2, 99}, /*layout=*/{0, 1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{2, 99}, /*pad_low=*/{0, 0}, /*pad_high=*/{35, 35},
+     /*layout=*/{0, 1},
+     /*reducer=*/Reducer::kAdd},
+// TODO(b/74260408): This test last failed on GPU on 2018-03-08, likely due to a
+// ptxas bug.
+#ifndef XLA_TEST_BACKEND_GPU
     {/*base_bounds=*/{6, 152}, /*window_bounds=*/{2, 25},
-     /*strides=*/{5, 4}, /*layout=*/{0, 1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{5, 4}, /*pad_low=*/{0, 1}, /*pad_high=*/{10, 11},
+     /*layout=*/{0, 1},
+     /*reducer=*/Reducer::kAdd},
+#endif
     {/*base_bounds=*/{6, 4}, /*window_bounds=*/{4, 2},
-     /*strides=*/{3, 3}, /*layout=*/{0, 1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{3, 3}, /*pad_low=*/{0, 1}, /*pad_high=*/{0, 1},
+     /*layout=*/{0, 1},
+     /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{5, 147}, /*window_bounds=*/{1, 36},
-     /*strides=*/{4, 5}, /*layout=*/{1, 0},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{4, 5}, /*pad_low=*/{0, 0}, /*pad_high=*/{17, 17},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{4, 153}, /*window_bounds=*/{2, 93},
-     /*strides=*/{1, 1}, /*layout=*/{1, 0},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 1}, /*pad_high=*/{46, 46},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
     // Regression test for a bug that appeared in Inception (b/34784899).
     {/*base_bounds=*/{28, 28}, /*window_bounds=*/{3, 3},
-     /*strides=*/{1, 1}, /*layout=*/{1, 0},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{1, 1}, /*pad_low=*/{1, 1}, /*pad_high=*/{1, 1},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
     // Regression test for a bug that appeared in Inception (b/34784899).
     {/*base_bounds=*/{4, 32}, /*window_bounds=*/{2, 2},
-     /*strides=*/{2, 2}, /*layout=*/{1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-    {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2},
-     /*strides=*/{1, 1}, /*layout=*/{1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{2, 2}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
+    // Regression test for b/73903312: bf16 lacks precision to store result of
+    // very large windows. Testing with a reasonable window larger than 128.
+    {/*base_bounds=*/{8, 130}, /*window_bounds=*/{1, 130},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 130}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4},
+     /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
+     /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
 };
 
 string R2ReduceWindowTestDataToString(
@@ -1008,10 +1077,11 @@ string R2ReduceWindowTestDataToString(
   string str = tensorflow::strings::StrCat(
       "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(param.strides, "x"),      //
-      "__padding_", param.padding == Padding::kSame ? "same" : "valid",  //
-      "__layout_", param.layout[0], "_", param.layout[1],                //
+      tensorflow::str_util::Join(param.window_bounds, "x"),          //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),  //
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),
+      "__layout_", param.layout[0], "_", param.layout[1],  //
       "__reducer_", param.reducer == kAdd ? "add" : "max");
   if (::testing::get<1>(data.param)) {
     str = tensorflow::strings::StrCat(str, "_bfloat16");
@@ -1026,7 +1096,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
   R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
     CHECK(param.reducer == kAdd);
 
@@ -1036,20 +1106,32 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
         Literal::CreateR2FromArray2DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
+    std::vector<std::pair<int64, int64>> padding(2);
+    for (int i = 0; i < 2; ++i) {
+      padding[i] = {param.pad_low[i], param.pad_high[i]};
+    }
+    auto computation = param.reducer == kAdd
+                           ? CreateScalarAddComputation(FloatType(), &b)
+                           : CreateScalarMaxComputation(FloatType(), &b);
     auto init_value =
         CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
-    b.ReduceWindow(/*operand=*/parameter,
-                   /*init_value=*/init_value,
-                   /*computation=*/CreateScalarAddComputation(FloatType(), &b),
-                   /*window_dimensions=*/param.window_bounds,
-                   /*window_strides=*/param.strides, /*padding=*/param.padding);
+    b.ReduceWindowWithGeneralPadding(
+        /*operand=*/parameter,
+        /*init_value=*/init_value,
+        /*computation=*/computation,
+        /*window_dimensions=*/param.window_bounds,
+        /*window_strides=*/param.strides, /*padding=*/padding);
 
-    auto expected = ReferenceUtil::ReduceWindow2DAdd(
-        /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
-        /*stride=*/param.strides, /*padding=*/param.padding);
+    auto reduce_func = param.reducer == kAdd
+                           ? +[](float a, float b) { return a + b; }
+                           : +[](float a, float b) { return std::max(a, b); };
+    auto expected = ReferenceUtil::ReduceWindow2DGeneric(
+        /*operand=*/input, /*init=*/kInitValue, /*reduce_func=*/reduce_func,
+        /*window=*/param.window_bounds,
+        /*stride=*/param.strides, /*padding=*/padding);
 
     ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
                              {input_arg.get()}, DefaultErrorSpec());
@@ -1068,14 +1150,15 @@ class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {};
 
 // TODO(b/72234705): Fix the test cases failed on CPU and GPU.
 XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+           DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) {
   DoIt();
 }
 
 const R2ReduceWindowTestData kR2FailingValuesCpuGpuBf16Test[] = {
     {/*base_bounds=*/{8, 128}, /*window_bounds=*/{8, 128},
-     /*strides=*/{1, 1}, /*layout=*/{1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0},
+     /*reducer=*/Reducer::kAdd},
 };
 
 INSTANTIATE_TEST_CASE_P(
@@ -1211,7 +1294,7 @@ class R1ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R1ReduceWindowTest, DoIt) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
@@ -1220,7 +1303,7 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
 
@@ -1315,5 +1398,58 @@ ENTRY R2Window {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
+TEST_F(ReduceWindowTextTest, R2EffectiveScalar) {
+  const string& hlo_string = R"(
+HloModule R2Window
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R2Window {
+  operand = f32[1,1]{1,0} parameter(0)
+  negate = f32[1,1]{1,0} negate(operand)
+  constant = f32[] constant(1)
+  ROOT reduce-window = f32[1,1]{1,0} reduce-window(negate, constant), window={size=1x1 pad=0_0x0_0}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
+}
+
+TEST_F(ReduceWindowTextTest, R3EffectiveScalar) {
+  const string& hlo_string = R"(
+HloModule R3Window
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R3Window {
+  operand = f32[1,1,1]{2,1,0} parameter(0)
+  negate = f32[1,1,1]{2,1,0} negate(operand)
+  constant = f32[] constant(1)
+  ROOT reduce-window = f32[1,1,1]{2,1,0} reduce-window(negate, constant), window={size=1x1x1 pad=0_0x0_0x0_0}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
+}
+
+TEST_F(HloTestBase, ReduceWindowIdentity) {
+  const string& hlo_string = R"(
+HloModule ReduceWindowIdentity
+identity.pad_to_reduce_window {
+  param0 = f32[] parameter(0)
+  ROOT param1 = f32[] parameter(1)
+}
+ENTRY reduce-window-identity {
+  operand = f32[1,32,64]{2,1,0} parameter(0)
+  constant.4466 = f32[] constant(0)
+  ROOT reduce-window = f32[1,33,64]{2,1,0} reduce-window(operand, constant.4466),     window={size=1x1x1 pad=0_0x1_0x0_0}, to_apply=identity.pad_to_reduce_window
+}
+
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index 6d063ffc363c092a1fbc40cbc22e87181d0c2502..36d763b0f7f4267ede076c0b25cfaf9654e96e0d 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -38,17 +38,17 @@ class ReplayTest : public ClientLibraryTestBase {};
 
 TEST_F(ReplayTest, TwoPlusTwoReplay) {
   // Make 2+2 computation.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto two = builder.ConstantR0<int32>(2);
   builder.Add(two, two);
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   // Serialize it out.
-  std::unique_ptr<SessionModule> module =
+  std::unique_ptr<HloSnapshot> module =
       computation.Snapshot().ConsumeValueOrDie();
 
   // Replay it.
-  Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
+  XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
 
   // Check signature is the same.
   std::unique_ptr<ProgramShape> original_shape =
@@ -69,18 +69,18 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) {
 
 XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
   // Make computation.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(S32, {}), "y");
   builder.Add(x, y);
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   // Serialize it out.
-  std::unique_ptr<SessionModule> module =
+  std::unique_ptr<HloSnapshot> module =
       computation.Snapshot().ConsumeValueOrDie();
 
   // Replay it.
-  Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
+  XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
 
   // Check signature is the same.
   std::unique_ptr<ProgramShape> original_shape =
@@ -109,24 +109,24 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
 
 TEST_F(ReplayTest, MapPlusTwoOverR1) {
   // As above, but with map(+2) over some constant array.
-  ComputationBuilder plus_two_builder(client_, "plus two");
+  XlaBuilder plus_two_builder("plus two");
   auto input =
       plus_two_builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "input");
   plus_two_builder.Add(input, plus_two_builder.ConstantR0<int32>(2));
-  Computation plus_two = plus_two_builder.Build().ConsumeValueOrDie();
+  XlaComputation plus_two = plus_two_builder.Build().ConsumeValueOrDie();
 
-  ComputationBuilder mapper_builder(client_, TestName());
+  XlaBuilder mapper_builder(TestName());
   auto original = mapper_builder.ConstantR1<int32>({1, 2, 3});
   mapper_builder.Map({original}, plus_two, {0});
 
-  Computation computation = mapper_builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = mapper_builder.Build().ConsumeValueOrDie();
 
   // Serialize it out.
-  std::unique_ptr<SessionModule> module =
+  std::unique_ptr<HloSnapshot> module =
       computation.Snapshot().ConsumeValueOrDie();
 
   // Replay it.
-  Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
+  XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
 
   // Check signature is the same.
   std::unique_ptr<ProgramShape> original_shape =
@@ -135,10 +135,6 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) {
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
   ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
 
-  // Destroy the originals.
-  computation.Reset();
-  plus_two.Reset();
-
   // Run it.
   std::unique_ptr<Literal> literal =
       client_
diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
index e045e164e2e2db7d3480e7c2d1e20f461820ae67..5ebd5268992846e80dcce2675f8e92038e190ecf 100644
--- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
@@ -20,10 +20,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -45,7 +44,7 @@ namespace {
 using ReshapeMotionTest = ClientLibraryTestBase;
 
 TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{2, 3, 5}, {7, 11, 13}});
   auto b = builder.ConstantR2<int32>({{17, 19}, {23, 29}, {31, 37}});
   auto c = builder.Reshape(a, {6});
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index f7b04debd4f5c40a904e32c832b6fc384a03c33b..d7462d581b8596dc43b81b0162b3f5020cebb546 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -52,11 +52,11 @@ class ReshapeTest : public ::testing::WithParamInterface<bool>,
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
 XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -67,9 +67,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
@@ -80,9 +80,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
@@ -94,11 +94,11 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
 XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -111,15 +111,14 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   auto a = builder.Neg(parameter);
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
+  builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
   auto expected_literal = Literal::CreateR1<float>({-1.0f});
   ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
@@ -130,10 +129,10 @@ XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(0, 3);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -146,11 +145,11 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -163,10 +162,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(3, 0);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -177,9 +176,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
 
 // Collapses a 2-dimensional row vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -190,9 +189,9 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) {
 
 // Collapses a 2-dimensional column vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial3x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -207,9 +206,9 @@ XLA_TEST_P(ReshapeTest, Trivial3x1) {
 //
 // Splits an empty vector into an empty matrix.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
@@ -221,10 +220,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
 
 // Splits a vector into a matrix.
 XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal =
       Literal::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
@@ -241,9 +240,9 @@ XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
 //
 // Transposes a 2x0 array to a 0x2 array.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 2));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -255,10 +254,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
 
 // Transposes a 2-dimensional row vector to a column vector.
 XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3);
   auto input_literal = Literal::CreateFromArray(*simple);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -272,10 +271,10 @@ XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
 
 // Transposes a 2-dimensional array.
 XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -291,11 +290,11 @@ XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 //
-// Transposes a 0x4 array with ComputationBuilder::Trans.
+// Transposes a 0x4 array with XlaBuilder::Transpose.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 4));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Transpose(parameter, {1, 0});
@@ -306,10 +305,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
 
 // Transposes a 2-dimensional array with ComputationBuilder::Trans.
 XLA_TEST_P(ReshapeTest, Transpose4x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Transpose(parameter, {1, 0});
@@ -327,9 +326,9 @@ XLA_TEST_P(ReshapeTest, Transpose4x3) {
 // Reshapes an empty 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(6, 0));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -343,9 +342,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -358,10 +357,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -378,9 +377,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
 // with an incorrect result rank.
 //
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -393,10 +392,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -420,9 +419,9 @@ static Array3D<float> ArrayForDocR3Tests() {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -435,9 +434,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -455,9 +454,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -470,9 +469,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -490,9 +489,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -520,12 +519,12 @@ XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
   auto input_literal = Literal::CreateFromArray(t2x2x2x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
@@ -539,7 +538,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
 
 // As above, but uses reshape directly.
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
   t(0, 0, 0, 1) = 1;
@@ -550,7 +549,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
   auto input_literal = Literal::CreateFromArray(t);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -565,7 +564,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
 // Reshape various ranks to a scalar.
 XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
@@ -573,7 +572,7 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
     std::vector<int64> zeros(rank, 0);  // this is {0, ..., 0}.
     input_literal.Set<float>(zeros, 83.0f);
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input = CreateParameterAndTransferLiteral(0, input_literal, "input",
                                                    &b, &parameter);
     b.Reshape(parameter, dimensions, {});
@@ -585,9 +584,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, BadDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {}, {});
@@ -597,9 +596,9 @@ XLA_TEST_P(ReshapeTest, BadDimensions) {
 }
 
 XLA_TEST_P(ReshapeTest, BadNewSizes) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {1}, {});
@@ -608,7 +607,7 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) {
 }
 
 XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
@@ -634,7 +633,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
 
@@ -645,7 +644,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
       {222, 333, 444, 555, 666, 777, 888, 999},
   });
 
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
@@ -663,13 +662,13 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
 }
 
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
@@ -690,13 +689,13 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
@@ -716,7 +715,7 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -726,7 +725,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
@@ -738,7 +737,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -748,7 +747,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
@@ -761,7 +760,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
 XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -771,7 +770,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
@@ -788,7 +787,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
 }
 
 XLA_TEST_P(ReshapeTest, NoopReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -798,12 +797,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
@@ -826,12 +825,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -845,8 +844,8 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
@@ -879,8 +878,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -908,8 +907,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -937,8 +936,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -967,8 +966,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -996,8 +995,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 8fc841f14087cdea02fe44cdaea521ff92122aec..e7bd142dc9ddefbd8bebfb77d72218d662645c31 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -85,7 +85,7 @@ TEST_P(FloatReverseTest, Reverses) {
   auto r1_literal = Literal::CreateR1<float>(input_vector);
   auto input_literal = r1_literal->Reshape(spec.input_dims).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = AddParam(*input_literal, &builder);
   builder.Rev(a, spec.reversal);
 
@@ -114,7 +114,7 @@ class ReverseTest : public ClientLibraryTestBase {};
 
 // Tests the reverse operation on a 4D U8 array on dimension 0 and 3.
 XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Input shape is U8[1x2x3x4].
   // clang-format off
   Array4D<uint8> input({{
@@ -144,7 +144,7 @@ XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) {
 
 // Tests the reverse operation on a 4D float array on dimension 0 and 1.
 TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Input shape is float[4x3x2x1].
   // clang-format off
   Array4D<float> input({
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 4da6ee91607941b395b00befc98a10e7c17746ed..f35bc43a4952137b4b6c94c771819e0514d4228f 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -17,9 +17,10 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -43,83 +44,80 @@ class ScalarComputationsTest : public ClientLibraryTestBase {
  protected:
   // A template for building and running a binary comparison test.
   template <typename NativeT>
-  void TestCompare(NativeT lhs, NativeT rhs, bool expected,
-                   ComputationDataHandle (ComputationBuilder::*op)(
-                       const ComputationDataHandle&,
-                       const ComputationDataHandle&,
-                       tensorflow::gtl::ArraySlice<int64>)) {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle lhs_op = builder.ConstantR0<NativeT>(lhs);
-    ComputationDataHandle rhs_op = builder.ConstantR0<NativeT>(rhs);
-    ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {});
+  void TestCompare(
+      NativeT lhs, NativeT rhs, bool expected,
+      XlaOp (XlaBuilder::*op)(const XlaOp&, const XlaOp&,
+                              tensorflow::gtl::ArraySlice<int64>)) {
+    XlaBuilder builder(TestName());
+    XlaOp lhs_op = builder.ConstantR0<NativeT>(lhs);
+    XlaOp rhs_op = builder.ConstantR0<NativeT>(rhs);
+    XlaOp result = (builder.*op)(lhs_op, rhs_op, {});
     ComputeAndCompareR0<bool>(&builder, expected, {});
   }
 
   template <typename NativeT>
   void TestMinMax(NativeT lhs, NativeT rhs, NativeT expected,
-                  ComputationDataHandle (ComputationBuilder::*op)(
-                      const ComputationDataHandle&,
-                      const ComputationDataHandle&,
-                      tensorflow::gtl::ArraySlice<int64>)) {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle lhs_op = builder.ConstantR0<NativeT>(lhs);
-    ComputationDataHandle rhs_op = builder.ConstantR0<NativeT>(rhs);
-    ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {});
+                  XlaOp (XlaBuilder::*op)(const XlaOp&, const XlaOp&,
+                                          tensorflow::gtl::ArraySlice<int64>)) {
+    XlaBuilder builder(TestName());
+    XlaOp lhs_op = builder.ConstantR0<NativeT>(lhs);
+    XlaOp rhs_op = builder.ConstantR0<NativeT>(rhs);
+    XlaOp result = (builder.*op)(lhs_op, rhs_op, {});
     ComputeAndCompareR0<NativeT>(&builder, expected, {});
   }
 };
 
 XLA_TEST_F(ScalarComputationsTest, ReturnScalarF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(2.1f);
 
   ComputeAndCompareR0<float>(&builder, 2.1f, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, NegateScalarF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Neg(builder.ConstantR0<float>(2.1f));
 
   ComputeAndCompareR0<float>(&builder, -2.1f, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, NegateScalarS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Neg(builder.ConstantR0<int32>(2));
 
   ComputeAndCompareR0<int32>(&builder, -2, {});
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Add(builder.ConstantR0<float>(2.1f), builder.ConstantR0<float>(5.5f));
 
   ComputeAndCompareR0<float>(&builder, 7.6f, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Add(builder.ConstantR0<int32>(2), builder.ConstantR0<int32>(5));
 
   ComputeAndCompareR0<int32>(&builder, 7, {});
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Add(builder.ConstantR0<uint32>(35), builder.ConstantR0<uint32>(57));
 
   ComputeAndCompareR0<uint32>(&builder, 92, {});
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Add(builder.ConstantR0<uint8>(35), builder.ConstantR0<uint8>(57));
 
   ComputeAndCompareR0<uint8>(&builder, 92, {});
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const uint64 a = static_cast<uint64>(1) << 63;
   const uint64 b = a + 1;
   builder.Add(builder.ConstantR0<uint64>(a), builder.ConstantR0<uint64>(b));
@@ -128,7 +126,7 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU64) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const int64 a = static_cast<int64>(1) << 62;
   const int64 b = a - 1;
   builder.Add(builder.ConstantR0<int64>(a), builder.ConstantR0<int64>(b));
@@ -137,7 +135,7 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS64) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Add(builder.ConstantR0<double>(0.25),
               builder.ConstantR0<double>(3.5));
 
@@ -145,25 +143,25 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF64) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Sub(builder.ConstantR0<float>(2.1f), builder.ConstantR0<float>(5.5f));
 
   ComputeAndCompareR0<float>(&builder, -3.4f, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Sub(builder.ConstantR0<int32>(2), builder.ConstantR0<int32>(5));
 
   ComputeAndCompareR0<int32>(&builder, -3, {});
 }
 
 XLA_TEST_F(ScalarComputationsTest, CastS64ToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.Parameter(0, ShapeUtil::MakeShape(S64, {}), "a");
   builder.ConvertElementType(a, F32);
 
-  int64 value = 3LL << 32;
+  int64 value = 3LL << 35;
   std::unique_ptr<Literal> a_literal = Literal::CreateR0<int64>(value);
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
@@ -172,7 +170,7 @@ XLA_TEST_F(ScalarComputationsTest, CastS64ToF32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Mul(builder.Mul(builder.ConstantR0<float>(2.1f),
                           builder.ConstantR0<float>(5.5f)),
               builder.ConstantR0<float>(0.5f));
@@ -191,7 +189,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsS32) {
 
   for (int32 x : data) {
     for (int32 y : data) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.Mul(builder.ConstantR0<int32>(x), builder.ConstantR0<int32>(y));
 
       // Signed integer overflow is undefined behavior in C++. Convert the input
@@ -210,7 +208,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) {
 
   for (uint32 x : data) {
     for (uint32 y : data) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.Mul(builder.ConstantR0<uint32>(x), builder.ConstantR0<uint32>(y));
 
       uint32 expected = x * y;
@@ -220,7 +218,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Mul(
       builder.Mul(builder.ConstantR0<int32>(2), builder.ConstantR0<int32>(5)),
       builder.ConstantR0<int32>(1));
@@ -229,7 +227,7 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsS32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> a_literal = Literal::CreateR0<float>(2.1f);
   std::unique_ptr<Literal> b_literal = Literal::CreateR0<float>(5.5f);
   std::unique_ptr<Literal> c_literal = Literal::CreateR0<float>(0.5f);
@@ -241,9 +239,9 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
   std::unique_ptr<GlobalData> c_data =
       client_->TransferToServer(*c_literal).ConsumeValueOrDie();
 
-  ComputationDataHandle a = builder.Parameter(0, a_literal->shape(), "a");
-  ComputationDataHandle b = builder.Parameter(1, b_literal->shape(), "b");
-  ComputationDataHandle c = builder.Parameter(2, c_literal->shape(), "c");
+  XlaOp a = builder.Parameter(0, a_literal->shape(), "a");
+  XlaOp b = builder.Parameter(1, b_literal->shape(), "b");
+  XlaOp c = builder.Parameter(2, c_literal->shape(), "c");
   builder.Mul(builder.Mul(a, b), c);
 
   ComputeAndCompareR0<float>(&builder, 5.775f,
@@ -252,14 +250,14 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Div(builder.ConstantR0<float>(5.0f), builder.ConstantR0<float>(2.5f));
 
   ComputeAndCompareR0<float>(&builder, 2.0f, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Rem(builder.ConstantR0<float>(2.5f), builder.ConstantR0<float>(5.0f));
 
   ComputeAndCompareR0<float>(&builder, 2.5f, {}, error_spec_);
@@ -282,7 +280,7 @@ class DivS32Test : public ClientLibraryTestBase,
 
 XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) {
   DivS32Params p = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Div(builder.ConstantR0<int32>(p.dividend),
               builder.ConstantR0<int32>(p.divisor));
 
@@ -291,7 +289,7 @@ XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) {
 
 XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) {
   DivS32Params p = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Rem(builder.ConstantR0<int32>(p.dividend),
               builder.ConstantR0<int32>(p.divisor));
 
@@ -300,9 +298,9 @@ XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) {
 
 XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) {
   DivS32Params p = GetParam();
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle dividend;
-  ComputationDataHandle divisor;
+  XlaBuilder builder(TestName());
+  XlaOp dividend;
+  XlaOp divisor;
   auto dividendd =
       CreateR0Parameter<int32>(p.dividend, 0, "dividend", &builder, &dividend);
   auto divisord =
@@ -315,9 +313,9 @@ XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) {
 
 XLA_TEST_P(DivS32Test, RemainderTwoScalarsNonConstDivisorS32) {
   DivS32Params p = GetParam();
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle dividend;
-  ComputationDataHandle divisor;
+  XlaBuilder builder(TestName());
+  XlaOp dividend;
+  XlaOp divisor;
   auto dividendd =
       CreateR0Parameter<int32>(p.dividend, 0, "dividend", &builder, &dividend);
   auto divisord =
@@ -364,13 +362,13 @@ XLA_TEST_F(ScalarComputationsTest, DivU32s) {
     0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX};
   // clang-format on
 
-  Computation div_computation;
+  XlaComputation div_computation;
   {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
 
-    ComputationDataHandle dividend =
+    XlaOp dividend =
         builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend");
-    ComputationDataHandle divisor =
+    XlaOp divisor =
         builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor");
     builder.Div(dividend, divisor);
     TF_ASSERT_OK_AND_ASSIGN(div_computation, builder.Build());
@@ -405,13 +403,13 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) {
     0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX};
   // clang-format on
 
-  Computation rem_computation;
+  XlaComputation rem_computation;
   {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
 
-    ComputationDataHandle dividend =
+    XlaOp dividend =
         builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend");
-    ComputationDataHandle divisor =
+    XlaOp divisor =
         builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor");
     builder.Rem(dividend, divisor);
     TF_ASSERT_OK_AND_ASSIGN(rem_computation, builder.Build());
@@ -440,7 +438,7 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x");
   builder.Rem(x, builder.ConstantR0<int32>(80000));
 
@@ -450,7 +448,7 @@ XLA_TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // This verifies 0xFFFFFFFE / 2 = 0x7FFFFFFF. If XLA incorrectly treated U32
   // as S32, it would output -2 / 2 = -1 (0xFFFFFFFF).
   builder.Div(builder.ConstantR0<uint32>(0xFFFFFFFE),
@@ -460,7 +458,7 @@ XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsU32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Rem(builder.ConstantR0<uint32>(11), builder.ConstantR0<uint32>(3));
 
   ComputeAndCompareR0<uint32>(&builder, 2, {});
@@ -469,7 +467,7 @@ XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) {
 XLA_TEST_F(ScalarComputationsTest, AndBool) {
   for (bool x : {false, true}) {
     for (bool y : {false, true}) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.And(builder.ConstantR0<bool>(x), builder.ConstantR0<bool>(y));
 
       ComputeAndCompareR0<bool>(&builder, x && y, {});
@@ -480,7 +478,7 @@ XLA_TEST_F(ScalarComputationsTest, AndBool) {
 XLA_TEST_F(ScalarComputationsTest, AndS32) {
   for (int32 x : {0, 8}) {
     for (int32 y : {1, -16}) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.And(builder.ConstantR0<int32>(x), builder.ConstantR0<int32>(y));
 
       ComputeAndCompareR0<int32>(&builder, x & y, {});
@@ -491,7 +489,7 @@ XLA_TEST_F(ScalarComputationsTest, AndS32) {
 XLA_TEST_F(ScalarComputationsTest, AndU32) {
   for (uint32 x : {0, 8}) {
     for (uint32 y : {1, 16}) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.And(builder.ConstantR0<uint32>(x), builder.ConstantR0<uint32>(y));
 
       ComputeAndCompareR0<uint32>(&builder, x & y, {});
@@ -502,7 +500,7 @@ XLA_TEST_F(ScalarComputationsTest, AndU32) {
 XLA_TEST_F(ScalarComputationsTest, OrBool) {
   for (bool x : {false, true}) {
     for (bool y : {false, true}) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.Or(builder.ConstantR0<bool>(x), builder.ConstantR0<bool>(y));
 
       ComputeAndCompareR0<bool>(&builder, x || y, {});
@@ -513,7 +511,7 @@ XLA_TEST_F(ScalarComputationsTest, OrBool) {
 XLA_TEST_F(ScalarComputationsTest, OrS32) {
   for (int32 x : {0, 8}) {
     for (int32 y : {1, -16}) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.Or(builder.ConstantR0<int32>(x), builder.ConstantR0<int32>(y));
 
       ComputeAndCompareR0<int32>(&builder, x | y, {});
@@ -524,7 +522,7 @@ XLA_TEST_F(ScalarComputationsTest, OrS32) {
 XLA_TEST_F(ScalarComputationsTest, OrU32) {
   for (uint32 x : {0, 8}) {
     for (uint32 y : {1, 16}) {
-      ComputationBuilder builder(client_, TestName());
+      XlaBuilder builder(TestName());
       builder.Or(builder.ConstantR0<uint32>(x), builder.ConstantR0<uint32>(y));
 
       ComputeAndCompareR0<uint32>(&builder, x | y, {});
@@ -534,7 +532,7 @@ XLA_TEST_F(ScalarComputationsTest, OrU32) {
 
 XLA_TEST_F(ScalarComputationsTest, NotBool) {
   for (bool x : {false, true}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.Not(builder.ConstantR0<bool>(x));
 
     ComputeAndCompareR0<bool>(&builder, !x, {});
@@ -543,7 +541,7 @@ XLA_TEST_F(ScalarComputationsTest, NotBool) {
 
 XLA_TEST_F(ScalarComputationsTest, NotS32) {
   for (int32 x : {-1, 0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.Not(builder.ConstantR0<int32>(x));
 
     ComputeAndCompareR0<int32>(&builder, ~x, {});
@@ -552,7 +550,7 @@ XLA_TEST_F(ScalarComputationsTest, NotS32) {
 
 XLA_TEST_F(ScalarComputationsTest, NotU32) {
   for (uint32 x : {0, 1, 2}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.Not(builder.ConstantR0<uint32>(x));
 
     ComputeAndCompareR0<uint32>(&builder, ~x, {});
@@ -560,7 +558,7 @@ XLA_TEST_F(ScalarComputationsTest, NotU32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, SelectScalarTrue) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Select(builder.ConstantR0<bool>(true),     // The predicate.
                  builder.ConstantR0<float>(123.0f),  // The value on true.
                  builder.ConstantR0<float>(42.0f));  // The value on false.
@@ -569,7 +567,7 @@ XLA_TEST_F(ScalarComputationsTest, SelectScalarTrue) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Select(builder.ConstantR0<bool>(false),    // The predicate.
                  builder.ConstantR0<float>(123.0f),  // The value on true.
                  builder.ConstantR0<float>(42.0f));  // The value on false.
@@ -580,7 +578,7 @@ XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) {
 // This test is an explicit version of what is happening in the following
 // templatized comparison tests.
 XLA_TEST_F(ScalarComputationsTest, CompareGtScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Gt(builder.ConstantR0<float>(2.0f), builder.ConstantR0<float>(1.0f));
 
   ComputeAndCompareR0<bool>(&builder, true, {});
@@ -588,157 +586,156 @@ XLA_TEST_F(ScalarComputationsTest, CompareGtScalar) {
 
 // S32 comparisons.
 XLA_TEST_F(ScalarComputationsTest, CompareEqS32Greater) {
-  TestCompare<int32>(2, 1, false, &ComputationBuilder::Eq);
+  TestCompare<int32>(2, 1, false, &XlaBuilder::Eq);
 }
 XLA_TEST_F(ScalarComputationsTest, CompareEqS32Equal) {
-  TestCompare<int32>(3, 3, true, &ComputationBuilder::Eq);
+  TestCompare<int32>(3, 3, true, &XlaBuilder::Eq);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareNeS32) {
-  TestCompare<int32>(2, 1, true, &ComputationBuilder::Ne);
+  TestCompare<int32>(2, 1, true, &XlaBuilder::Ne);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGeS32) {
-  TestCompare<int32>(2, 1, true, &ComputationBuilder::Ge);
+  TestCompare<int32>(2, 1, true, &XlaBuilder::Ge);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGtS32) {
-  TestCompare<int32>(1, 5, false, &ComputationBuilder::Gt);
+  TestCompare<int32>(1, 5, false, &XlaBuilder::Gt);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareLeS32) {
-  TestCompare<int32>(2, 1, false, &ComputationBuilder::Le);
+  TestCompare<int32>(2, 1, false, &XlaBuilder::Le);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareLtS32) {
-  TestCompare<int32>(9, 7, false, &ComputationBuilder::Lt);
+  TestCompare<int32>(9, 7, false, &XlaBuilder::Lt);
   TestCompare<int32>(std::numeric_limits<int32>::min(),
-                     std::numeric_limits<int32>::max(), true,
-                     &ComputationBuilder::Lt);
+                     std::numeric_limits<int32>::max(), true, &XlaBuilder::Lt);
 }
 
 // U32 comparisons.
 XLA_TEST_F(ScalarComputationsTest, CompareEqU32False) {
-  TestCompare<uint32>(2, 1, false, &ComputationBuilder::Eq);
+  TestCompare<uint32>(2, 1, false, &XlaBuilder::Eq);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareNeU32) {
-  TestCompare<uint32>(2, 1, true, &ComputationBuilder::Ne);
+  TestCompare<uint32>(2, 1, true, &XlaBuilder::Ne);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGeU32Greater) {
-  TestCompare<uint32>(2, 1, true, &ComputationBuilder::Ge);
+  TestCompare<uint32>(2, 1, true, &XlaBuilder::Ge);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGeU32Equal) {
-  TestCompare<uint32>(3, 3, true, &ComputationBuilder::Ge);
+  TestCompare<uint32>(3, 3, true, &XlaBuilder::Ge);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGtU32) {
-  TestCompare<uint32>(1, 5, false, &ComputationBuilder::Gt);
-  TestCompare<uint32>(5, 5, false, &ComputationBuilder::Gt);
-  TestCompare<uint32>(5, 1, true, &ComputationBuilder::Gt);
+  TestCompare<uint32>(1, 5, false, &XlaBuilder::Gt);
+  TestCompare<uint32>(5, 5, false, &XlaBuilder::Gt);
+  TestCompare<uint32>(5, 1, true, &XlaBuilder::Gt);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareLeU32) {
-  TestCompare<uint32>(2, 1, false, &ComputationBuilder::Le);
+  TestCompare<uint32>(2, 1, false, &XlaBuilder::Le);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareLtU32) {
-  TestCompare<uint32>(9, 7, false, &ComputationBuilder::Lt);
+  TestCompare<uint32>(9, 7, false, &XlaBuilder::Lt);
   TestCompare<uint32>(0, std::numeric_limits<uint32>::max(), true,
-                      &ComputationBuilder::Lt);
+                      &XlaBuilder::Lt);
 }
 
 // F32 comparisons.
 XLA_TEST_F(ScalarComputationsTest, CompareEqF32False) {
-  TestCompare<float>(2.0, 1.3, false, &ComputationBuilder::Eq);
+  TestCompare<float>(2.0, 1.3, false, &XlaBuilder::Eq);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareNeF32) {
-  TestCompare<float>(2.0, 1.3, true, &ComputationBuilder::Ne);
+  TestCompare<float>(2.0, 1.3, true, &XlaBuilder::Ne);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGeF32Greater) {
-  TestCompare<float>(2.0, 1.9, true, &ComputationBuilder::Ge);
+  TestCompare<float>(2.0, 1.9, true, &XlaBuilder::Ge);
 }
 XLA_TEST_F(ScalarComputationsTest, CompareGeF32Equal) {
-  TestCompare<float>(3.5, 3.5, true, &ComputationBuilder::Ge);
+  TestCompare<float>(3.5, 3.5, true, &XlaBuilder::Ge);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGtF32) {
-  TestCompare<float>(1.0, 5.2, false, &ComputationBuilder::Gt);
+  TestCompare<float>(1.0, 5.2, false, &XlaBuilder::Gt);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareLeF32) {
-  TestCompare<float>(2.0, 1.2, false, &ComputationBuilder::Le);
+  TestCompare<float>(2.0, 1.2, false, &XlaBuilder::Le);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareLtF32) {
-  TestCompare<float>(9.0, 7.2, false, &ComputationBuilder::Lt);
+  TestCompare<float>(9.0, 7.2, false, &XlaBuilder::Lt);
 }
 
 // F32 comparisons with exceptional values.  The test names encode the
 // left/right operands at the end, and use Minf and Mzero for -inf and -0.0.
 XLA_TEST_F(ScalarComputationsTest, CompareLtF32MinfMzero) {
-  TestCompare<float>(-INFINITY, -0.0, true, &ComputationBuilder::Lt);
+  TestCompare<float>(-INFINITY, -0.0, true, &XlaBuilder::Lt);
 }
 XLA_TEST_F(ScalarComputationsTest, CompareLtF32MzeroZero) {
   // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754.
-  TestCompare<float>(-0.0, 0.0, false, &ComputationBuilder::Lt);
+  TestCompare<float>(-0.0, 0.0, false, &XlaBuilder::Lt);
 }
 XLA_TEST_F(ScalarComputationsTest, CompareLtF32ZeroInf) {
-  TestCompare<float>(0.0, INFINITY, true, &ComputationBuilder::Lt);
+  TestCompare<float>(0.0, INFINITY, true, &XlaBuilder::Lt);
 }
 
 XLA_TEST_F(ScalarComputationsTest, CompareGeF32MinfMzero) {
-  TestCompare<float>(-INFINITY, -0.0, false, &ComputationBuilder::Ge);
+  TestCompare<float>(-INFINITY, -0.0, false, &XlaBuilder::Ge);
 }
 XLA_TEST_F(ScalarComputationsTest, CompareGeF32MzeroZero) {
   // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754.
-  TestCompare<float>(-0.0, 0.0, true, &ComputationBuilder::Ge);
+  TestCompare<float>(-0.0, 0.0, true, &XlaBuilder::Ge);
 }
 XLA_TEST_F(ScalarComputationsTest, CompareGeF32ZeroInf) {
-  TestCompare<float>(0.0, INFINITY, false, &ComputationBuilder::Ge);
+  TestCompare<float>(0.0, INFINITY, false, &XlaBuilder::Ge);
 }
 
 XLA_TEST_F(ScalarComputationsTest, ExpScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Exp(builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR0<float>(&builder, 7.3890562, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, LogScalar) {
-  ComputationBuilder builder(client_, "log");
+  XlaBuilder builder("log");
   builder.Log(builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR0<float>(&builder, 0.6931471, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, TanhScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Tanh(builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR0<float>(&builder, 0.96402758, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, TanhDoubleScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Tanh(builder.ConstantR0<double>(2.0));
 
   ComputeAndCompareR0<double>(&builder, 0.96402758, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, PowScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Pow(builder.ConstantR0<float>(2.0f), builder.ConstantR0<float>(3.0f));
 
   ComputeAndCompareR0<float>(&builder, 8.0, {}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
                 builder.ConstantR0<int32>(5),   // The operand to be clamped.
                 builder.ConstantR0<int32>(3));  // The upper bound.
@@ -747,7 +744,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
                 builder.ConstantR0<int32>(2),   // The operand to be clamped.
                 builder.ConstantR0<int32>(3));  // The upper bound.
@@ -756,7 +753,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
                 builder.ConstantR0<int32>(-5),  // The operand to be clamped.
                 builder.ConstantR0<int32>(3));  // The upper bound.
@@ -765,7 +762,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
                 builder.ConstantR0<uint32>(5),   // The operand to be clamped.
                 builder.ConstantR0<uint32>(3));  // The upper bound.
@@ -774,7 +771,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
                 builder.ConstantR0<uint32>(2),   // The operand to be clamped.
                 builder.ConstantR0<uint32>(3));  // The upper bound.
@@ -783,7 +780,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
                 builder.ConstantR0<uint32>(0),   // The operand to be clamped.
                 builder.ConstantR0<uint32>(3));  // The upper bound.
@@ -792,7 +789,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(5.0f),   // The operand to be clamped.
                 builder.ConstantR0<float>(3.0f));  // The upper bound.
@@ -801,7 +798,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(2.5f),   // The operand to be clamped.
                 builder.ConstantR0<float>(3.0f));  // The upper bound.
@@ -810,7 +807,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(-5.0f),  // The operand to be clamped.
                 builder.ConstantR0<float>(3.0f));  // The upper bound.
@@ -819,58 +816,70 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, MinS32Above) {
-  TestMinMax<int32>(10, 3, 3, &ComputationBuilder::Min);
+  TestMinMax<int32>(10, 3, 3, &XlaBuilder::Min);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MinS32Below) {
-  TestMinMax<int32>(-100, 3, -100, &ComputationBuilder::Min);
+  TestMinMax<int32>(-100, 3, -100, &XlaBuilder::Min);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MaxS32Above) {
-  TestMinMax<int32>(10, 3, 10, &ComputationBuilder::Max);
+  TestMinMax<int32>(10, 3, 10, &XlaBuilder::Max);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MaxS32Below) {
-  TestMinMax<int32>(-100, 3, 3, &ComputationBuilder::Max);
+  TestMinMax<int32>(-100, 3, 3, &XlaBuilder::Max);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MinU32Above) {
   const uint32 large = std::numeric_limits<int32>::max();
-  TestMinMax<uint32>(large, 3, 3, &ComputationBuilder::Min);
+  TestMinMax<uint32>(large, 3, 3, &XlaBuilder::Min);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MinU32Below) {
-  TestMinMax<uint32>(0, 5, 0, &ComputationBuilder::Min);
+  TestMinMax<uint32>(0, 5, 0, &XlaBuilder::Min);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MaxU32Above) {
   const uint32 large = std::numeric_limits<int32>::max();
-  TestMinMax<uint32>(large, 3, large, &ComputationBuilder::Max);
+  TestMinMax<uint32>(large, 3, large, &XlaBuilder::Max);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MaxU32Below) {
-  TestMinMax<uint32>(0, 5, 5, &ComputationBuilder::Max);
+  TestMinMax<uint32>(0, 5, 5, &XlaBuilder::Max);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MinF32Above) {
-  TestMinMax<float>(10.1f, 3.1f, 3.1f, &ComputationBuilder::Min);
+  TestMinMax<float>(10.1f, 3.1f, 3.1f, &XlaBuilder::Min);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MinF32Below) {
-  TestMinMax<float>(-100.1f, 3.1f, -100.1f, &ComputationBuilder::Min);
+  TestMinMax<float>(-100.1f, 3.1f, -100.1f, &XlaBuilder::Min);
+}
+
+XLA_TEST_F(ScalarComputationsTest, MinPropagatesNan) {
+  SetFastMathDisabled(true);
+  TestMinMax<float>(NAN, 3.1f, NAN, &XlaBuilder::Min);
+  TestMinMax<float>(-3.1f, NAN, NAN, &XlaBuilder::Min);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MaxF32Above) {
-  TestMinMax<float>(10.1f, 3.1f, 10.1f, &ComputationBuilder::Max);
+  TestMinMax<float>(10.1f, 3.1f, 10.1f, &XlaBuilder::Max);
 }
 
 XLA_TEST_F(ScalarComputationsTest, MaxF32Below) {
-  TestMinMax<float>(-100.1f, 3.1f, 3.1f, &ComputationBuilder::Max);
+  TestMinMax<float>(-100.1f, 3.1f, 3.1f, &XlaBuilder::Max);
+}
+
+XLA_TEST_F(ScalarComputationsTest, MaxPropagatesNan) {
+  SetFastMathDisabled(true);
+  TestMinMax<float>(NAN, 3.1f, NAN, &XlaBuilder::Max);
+  TestMinMax<float>(-3.1f, NAN, NAN, &XlaBuilder::Max);
 }
 
 XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) {
   // Compute the expression (1 * (3 - 1) * (7 + 0) - 4) / 20.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Div(
       b.Sub(b.Mul(b.ConstantR0<float>(1),
                   b.Mul(b.Sub(b.ConstantR0<float>(3), b.ConstantR0<float>(1)),
@@ -883,7 +892,7 @@ XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) {
 
 XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) {
   // Compute the expression 1 * (3 - 1) * (7 + 0) - 4.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Sub(b.Mul(b.ConstantR0<int32>(1),
               b.Mul(b.Sub(b.ConstantR0<int32>(3), b.ConstantR0<int32>(1)),
                     b.Add(b.ConstantR0<int32>(7), b.ConstantR0<int32>(0)))),
@@ -893,21 +902,20 @@ XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) {
 }
 
 XLA_TEST_F(ScalarComputationsTest, SqrtF320) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Literal zero_literal = Literal::Zero(PrimitiveType::F32);
 
   std::unique_ptr<GlobalData> zero_data =
       client_->TransferToServer(zero_literal).ConsumeValueOrDie();
 
-  ComputationDataHandle zero =
-      builder.Parameter(0, zero_literal.shape(), "zero");
+  XlaOp zero = builder.Parameter(0, zero_literal.shape(), "zero");
   builder.SqrtF32(zero);
 
   ComputeAndCompareR0<float>(&builder, 0.0f, {zero_data.get()}, error_spec_);
 }
 
 XLA_TEST_F(ScalarComputationsTest, RoundScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Round(builder.ConstantR0<float>(1.4f));
 
   ComputeAndCompareR0<float>(&builder, 1.0f, {}, error_spec_);
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index 9ee94b8571e5fc8789b60501462986967ce909a0..7015e5a6a31f506d30c2629d7735482cf354455a 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -50,7 +50,7 @@ class SelectAndScatterTest
     : public ClientLibraryTestBase,
       public ::testing::WithParamInterface<SelectAndScatterTestParam> {
  public:
-  SelectAndScatterTest() : builder_(client_, TestName()) {
+  SelectAndScatterTest() : builder_(TestName()) {
     // Create S32 GE and ADD computations for select and scatter respectively.
     ge_s32_ = CreateScalarGeComputation(S32, &builder_);
     add_s32_ = CreateScalarAddComputation(S32, &builder_);
@@ -60,13 +60,13 @@ class SelectAndScatterTest
     min_f32_ = CreateScalarMinComputation(F32, &builder_);
   }
 
-  ComputationBuilder builder_;
-  Computation ge_s32_;
-  Computation add_s32_;
-  Computation ge_f32_;
-  Computation add_f32_;
-  Computation max_f32_;
-  Computation min_f32_;
+  XlaBuilder builder_;
+  XlaComputation ge_s32_;
+  XlaComputation add_s32_;
+  XlaComputation ge_f32_;
+  XlaComputation add_f32_;
+  XlaComputation max_f32_;
+  XlaComputation min_f32_;
 };
 
 XLA_TEST_P(SelectAndScatterTest, ParamTest) {
@@ -80,12 +80,11 @@ XLA_TEST_P(SelectAndScatterTest, ParamTest) {
   s.FillRandom(12.0f);
   auto source = builder_.ConstantFromArray(s);
 
-  auto select_and_scatter = builder_.SelectAndScatter(
-      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
-      add_f32_);
+  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
+                            GetParam().window_strides, GetParam().padding_type,
+                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
 
-  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -252,6 +251,21 @@ XLA_TEST_F(SelectAndScatterTest, R2S32) {
   ComputeAndCompareR2<int32>(&builder_, expected, {});
 }
 
+// Test for tie breaking rule in ge_f32_. When a tie is present, the operand
+// that has the lower lexicographical order (smaller index) should be chosen.
+XLA_TEST_F(SelectAndScatterTest, R2F32Tie) {
+  const auto operand = builder_.ConstantR2<float>(
+      {{0.f, 0.f, 0.f}, {0.f, 0.f, 0.f}, {0.f, 0.f, 0.f}});
+  const auto source = builder_.ConstantR2<float>(
+      {{1.0f, 2.0f, 3.0f}, {4.f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}});
+  Array2D<float> expected(
+      {{12.f, 9.f, 0.f}, {15.f, 9.f, 0.f}, {0.f, 0.f, 0.f}});
+  builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3, 3},
+                            /*window_strides=*/{1, 1}, Padding::kSame, source,
+                            builder_.ConstantR0<float>(0.0f), add_f32_);
+  ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(1e-7));
+}
+
 // Similar to SelectAndScatterTest.R2S32 but the input is transposed.
 XLA_TEST_F(SelectAndScatterTest, ReshapeR2S32) {
   const auto operand = builder_.ConstantR2<int32>(
diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc
index 009e7d24c5cbface4da910e2366db1ff749d5d68..3d694a9c3fe894107c3b0a8fc2e5d07310cb476c 100644
--- a/tensorflow/compiler/xla/tests/select_test.cc
+++ b/tensorflow/compiler/xla/tests/select_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -35,7 +35,7 @@ class SelectTest : public ClientLibraryTestBase {
 };
 
 TEST_F(SelectTest, SelectScalarF32True) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto on_true = builder.ConstantR0<float>(123.0f);
   auto on_false = builder.ConstantR0<float>(42.0f);
@@ -45,7 +45,7 @@ TEST_F(SelectTest, SelectScalarF32True) {
 }
 
 TEST_F(SelectTest, SelectScalarS32True) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto on_true = builder.ConstantR0<int32>(-42);
   auto on_false = builder.ConstantR0<int32>(42);
@@ -55,7 +55,7 @@ TEST_F(SelectTest, SelectScalarS32True) {
 }
 
 TEST_F(SelectTest, SelectScalarF32False) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto on_true = builder.ConstantR0<float>(123.0f);
   auto on_false = builder.ConstantR0<float>(42.0f);
@@ -65,7 +65,7 @@ TEST_F(SelectTest, SelectScalarF32False) {
 }
 
 XLA_TEST_F(SelectTest, SelectR1S0F32WithConstantR1S0PRED) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR1<bool>({});
   auto on_true = builder.ConstantR1<float>({});
   auto on_false = builder.ConstantR1<float>({});
@@ -75,7 +75,7 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithConstantR1S0PRED) {
 }
 
 TEST_F(SelectTest, SelectR1F32WithConstantR1PRED) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR1<bool>({false, true, false, true, false});
   auto on_true = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto on_false = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, -6.0f});
@@ -88,7 +88,7 @@ TEST_F(SelectTest, SelectR1F32WithConstantR1PRED) {
 XLA_TEST_F(SelectTest, SelectR1S0F32WithCmpR1S0S32s) {
   // Similar to SelectR1S0F32WithConstantR1S0PRED, except that the pred vector
   // is not a constant, but rather the result of comparing two other vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v1 = builder.ConstantR1<int32>({});
   auto v2 = builder.ConstantR1<int32>({});
   auto cmp = builder.Eq(v1, v2);
@@ -102,7 +102,7 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithCmpR1S0S32s) {
 TEST_F(SelectTest, SelectR1F32WithCmpR1S32s) {
   // Similar to SelectR1F32WithConstantR1PRED, except that the pred vector is
   // not a constant, but rather the result of comparing two other vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v1 = builder.ConstantR1<int32>({1, 2, 3, 4, 5});
   auto v2 = builder.ConstantR1<int32>({9, 2, 9, 4, 9});
   auto cmp = builder.Eq(v1, v2);
@@ -116,7 +116,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32s) {
 
 TEST_F(SelectTest, SelectR1F32WithCmpR1F32s) {
   // Similar to SelectR1F32WithCmpR1S32s, except "gt"-comparing two R1F32s.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v1 = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f});
   auto v2 = builder.ConstantR1<float>({-1.0f, -2.0f, 13.0f, 14.0f, 4.4f});
   auto cmp = builder.Gt(v1, v2);
@@ -131,9 +131,9 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32s) {
 TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsSmall) {
   // Selects among two R1F32s, which come from parameters. v1 and v2 are
   // compared, and selection between them happens based on a gt-comparison mask.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle v1, v2;
+  XlaOp v1, v2;
   std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
       {41.0f, 2.0f, 3.0f, 84.0f}, /*parameter_number=*/0, /*name=*/"v1",
       /*builder=*/&builder, /*data_handle=*/&v1);
@@ -151,7 +151,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsSmall) {
 TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) {
   // Similar to SelectR1F32WithCmpR1F32sFromParamsSmall, except that the
   // data size passed in and out is large.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Number of floats in the data passed into and out of the computation.
   constexpr int datalen = 15 * 1000;
@@ -174,7 +174,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) {
     expected_vec.push_back(larger);
   }
 
-  ComputationDataHandle v1, v2;
+  XlaOp v1, v2;
   std::unique_ptr<GlobalData> param0_data =
       CreateR1Parameter<float>(v1vec, /*parameter_number=*/0, /*name=*/"v1",
                                /*builder=*/&builder, /*data_handle=*/&v1);
@@ -192,7 +192,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) {
 TEST_F(SelectTest, SelectR1F32WithCmpR1S32ToScalar) {
   // "gt"-compares a R1S32 with a S32 scalar, and uses the resulting R1PRED to
   // select between two R1F32s.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, -1, 2, -2});
   auto s = builder.ConstantR0<int32>(0);
   auto cmp = builder.Gt(v, s);
@@ -209,7 +209,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32ToScalar) {
 TEST_F(SelectTest, SelectR1F32WithCmpR1F32ToScalar) {
   // "gt"-compares a R1F32 with a F32 scalar, and uses the resulting R1PRED to
   // select between two R1F32s.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f, 4.0f});
   auto s = builder.ConstantR0<float>(2.5f);
   auto cmp = builder.Gt(v, s);
@@ -225,7 +225,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32ToScalar) {
 
 XLA_TEST_F(SelectTest, SelectR1S0F32WithScalarPredicate) {
   for (bool which : {false, true}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto pred = builder.ConstantR0<bool>(which);
     auto on_true = builder.ConstantR1<float>({});
     auto on_false = builder.ConstantR1<float>({});
@@ -236,7 +236,7 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithScalarPredicate) {
 }
 
 TEST_F(SelectTest, SelectR1F32WithScalarPredicateTrue) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto on_true = builder.ConstantR1<float>({-2.5f, 25.5f});
   auto on_false = builder.ConstantR1<float>({10.0f, 5.0f});
@@ -246,7 +246,7 @@ TEST_F(SelectTest, SelectR1F32WithScalarPredicateTrue) {
 }
 
 TEST_F(SelectTest, SelectR1F32WithScalarPredicateFalse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto on_true = builder.ConstantR1<float>({-2.5f, 25.5f});
   auto on_false = builder.ConstantR1<float>({10.0f, 5.0f});
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index ac163df127e0087c02777fa3d5ce7970c51b97b9..52195db2aa74710b901dd7744a670764a034e96b 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -41,7 +41,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1});
 
@@ -54,7 +54,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x1x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1});
 
@@ -67,7 +67,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1});
 
@@ -77,7 +77,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
   builder.Slice(original, {0, 0}, {0, 0}, {1, 1});
 
@@ -85,7 +85,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
   builder.Slice(original, {0, 15}, {0, 20}, {1, 1});
 
@@ -93,7 +93,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
   builder.Slice(original, {1, 0}, {3, 0}, {1, 1});
 
@@ -108,7 +108,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {128, 128}, {256, 256}, {1, 1});
 
@@ -126,7 +126,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {
   Array2D<float> values(1, 4096);
   std::iota(values.data(), values.data() + 4096, 0.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});
 
@@ -147,7 +147,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
       }
     }
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -159,7 +159,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   values.FillRandom(3.14f);
   auto expected = ReferenceUtil::Slice4D(
       values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}, /*strides=*/{{1, 1, 1, 1}});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
@@ -172,7 +172,7 @@ XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) {
                                          /*strides=*/{{1, 1, 2, 1}});
   auto expected_literal = Literal::CreateR4FromArray4DWithLayout(
       *expected, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {0, 0, 0, 0}, {2, 4, 6, 8}, {1, 1, 2, 1});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, ErrorSpec(0.000001),
@@ -193,15 +193,18 @@ class SliceR1Test : public ClientLibraryTestBase,
  protected:
   template <typename NativeT>
   void Run(const R1Spec& spec) {
-    std::vector<NativeT> input(spec.input_dim0);
+    // This can't be an std::vector, since you can't grab an ArraySlice of a
+    // vector<bool>.
+    tensorflow::gtl::InlinedVector<NativeT, 1> input(spec.input_dim0);
     std::iota(input.begin(), input.end(), NativeT());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto original = builder.ConstantR1<NativeT>(input);
     builder.Slice(original, {spec.slice_start}, {spec.slice_limit},
                   {spec.slice_stride});
 
-    std::vector<NativeT> expected;
+    // Ditto.
+    tensorflow::gtl::InlinedVector<NativeT, 1> expected;
     for (int i = spec.slice_start; i < spec.slice_limit;
          i += spec.slice_stride) {
       expected.push_back(i);
@@ -211,6 +214,9 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+// A version of SliceR1Test used to label and disable 'large' tests
+class SliceR1LargeTest : public SliceR1Test {};
+
 string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
   const R1Spec& spec = data.param;
   return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
@@ -230,6 +236,21 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
+XLA_TEST_P(SliceR1LargeTest, DoIt_F32) { Run<float>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_F64) { Run<double>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U32) { Run<uint32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S32) { Run<int32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U64) { Run<uint64>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
+
+XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run<bool>(GetParam()); }
+
+
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
@@ -267,13 +288,32 @@ INSTANTIATE_TEST_CASE_P(
         R1Spec{64 * 1024, 1024 + 1, 63 * 1024 - 1, 1},
         R1Spec{64 * 1024, 32 * 1024, 33 * 1024, 1},
         R1Spec{64 * 1024, 32 * 1024 + 1, 33 * 1024 - 1, 1},
-        R1Spec{64 * 1024, 32 * 1024 - 17, 36 * 1024 - 18, 1},
+        R1Spec{64 * 1024, 32 * 1024 - 17, 36 * 1024 - 18, 1}
+    ),
+    SliceR1TestDataToString
+);
+
 // TODO(b/69425338): This uses too much memory on GPU.
 #ifndef XLA_TEST_BACKEND_GPU
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestBigSlicesInstantiation,
+    SliceR1LargeTest,
+    ::testing::Values(
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
+    ),
+    SliceR1TestDataToString
+);
 #endif
+
+INSTANTIATE_TEST_CASE_P(
+    SliceStridedR1TestInstantiation,
+    SliceR1Test,
+    ::testing::Values(
         R1Spec{10, 2, 4, 2},
         R1Spec{10, 0, 10, 2},
         R1Spec{10, 0, 10, 3},
@@ -285,8 +325,24 @@ INSTANTIATE_TEST_CASE_P(
         R1Spec{2047, 1024 - 24, 1024 + 160, 31},
         R1Spec{2047, 1, 2046, 3 * 128},
         R1Spec{4096, 1024 + 3, 4095, 500},
-        R1Spec{8192, 0, 8192, 1024 * 3 + 400}
-        ),
+        R1Spec{8192, 0, 8192, 1024 * 3 + 400},
+        R1Spec{1024 * 1024, 0, 1024 * 1024, 2},
+        R1Spec{1024 * 1024, 0, 1024 * 1024, 8},
+        R1Spec{1024 * 1024, 0, 1024 * 1024, 7},
+        R1Spec{1024 * 1024, 0, 1024 * 1024, 125},
+        R1Spec{1024 * 1024, 3, 1024 - 9, 2},
+        R1Spec{1024 * 1024, 3, 1024 - 9, 8},
+        R1Spec{1024 * 1024, 3, 1024 - 9, 7},
+        R1Spec{1024 * 1024, 3, 1024 - 9, 125},
+        R1Spec{1024 * 1024, 3, 1024 * 512 - 9, 2},
+        R1Spec{1024 * 1024, 3, 1024 * 512 - 9, 8},
+        R1Spec{1024 * 1024, 3, 1024 * 512 - 9, 7},
+        R1Spec{1024 * 1024, 3, 1024 * 512 - 9, 125},
+        R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 2},
+        R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 8},
+        R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 7},
+        R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 125}
+    ),
     SliceR1TestDataToString
 );
 // clang-format on
@@ -310,7 +366,7 @@ XLA_TEST_P(SliceR2Test, DoIt) {
   Array2D<int32> input(spec.input_dim0, spec.input_dim1);
   input.FillUnique();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2DWithLayout<int32>(
       input, LayoutUtil::MakeLayout(spec.layout));
   builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
@@ -400,7 +456,7 @@ class SliceR4Test : public ClientLibraryTestBase,
     values.FillRandom(3.14f);
     auto expected = ReferenceUtil::Slice4D(
         values, spec.slice_starts, spec.slice_limits, spec.slice_strides);
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto literal = Literal::CreateR4FromArray4DWithLayout(
         values, LayoutUtil::MakeLayout(spec.input_layout));
     auto parameter = builder.Parameter(0, literal->shape(), "p0");
diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc
index 978a669bcab720bddec5c4bcd0144810ba3c8477..be35ec6c6ee4c015755622b2dc9bb92e23af7c85 100644
--- a/tensorflow/compiler/xla/tests/test_macros.cc
+++ b/tensorflow/compiler/xla/tests/test_macros.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index e2d406f66d94f8ec76faa5b7d2d2e84dcaf6db57..7ca99a91635e85cd0888e59ecde31e47fec21844 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 #define DISABLED_ON_CPU(X) X
-#define DISABLED_ON_CPU_PARALLEL(X) X
 #define DISABLED_ON_GPU(X) X
 #define DISABLED_ON_INTERPRETER(X) X
 
@@ -51,13 +50,6 @@ limitations under the License.
 # define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X)
 #endif  // XLA_TEST_BACKEND_CPU
 
-#ifdef XLA_TEST_BACKEND_CPU_PARALLEL
-# undef DISABLED_ON_CPU
-# define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X)
-# undef DISABLED_ON_CPU_PARALLEL
-# define DISABLED_ON_CPU_PARALLEL(X) XLA_TEST_PASTE(DISABLED_, X)
-#endif  // XLA_TEST_BACKEND_CPU_PARALLEL
-
 #ifdef XLA_TEST_BACKEND_GPU
 # undef DISABLED_ON_GPU
 # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X)
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0bc7df2a65b44a76f877b6513e6bf93b99fbc1a3..997a1d8273736af31994ebbd07ff3857d1e8e0b5 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -23,14 +23,14 @@ namespace xla {
 
 namespace {
 
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal,
-                                         std::minstd_rand0* engine) {
+template <typename FloatT, typename GeneratorT>
+void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
+                                             std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   // Create uniform numbers between 1 and 1.125 to avoid creating denormal
   // numbers.
-  std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  std::uniform_real_distribution<GeneratorT> generator(1.0f, 1.125f);
   const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
@@ -52,10 +52,22 @@ void PopulateWithRandomFloatingPointData(Literal* literal,
         FloatT index_bias =
             static_cast<FloatT>(index_product % 113 - negative_bias) /
             static_cast<FloatT>(256.0f);
-        return (generator(*engine) - 1.0625) + index_bias;
+        return static_cast<FloatT>(generator(*engine) - 1.0625f) + index_bias;
       }));
 }
 
+template <typename FloatT>
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
+}
+
+template <>
+void PopulateWithRandomFloatingPointData<half>(Literal* literal,
+                                               std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
+}
+
 // The standard library does not have a case for bfloat16, unsurprisingly, so we
 // handle that one specially.
 template <>
@@ -100,6 +112,9 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
     case BF16:
       PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
       break;
+    case F16:
+      PopulateWithRandomFloatingPointData<half>(literal.get(), engine);
+      break;
     case F32:
       PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
       break;
@@ -145,27 +160,38 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
   return std::move(literal);
 }
 
-// Matches binary addition computations.
-bool LooksLikeSum(const HloComputation& computation) {
+enum class ConstantType { kUnknown, kZero, kOne };
+
+// Return the constant type required by this computation, if known.
+ConstantType GetInitValue(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
-  return root->opcode() == HloOpcode::kAdd &&
-         computation.num_parameters() == 2 &&
-         root->operand(0)->opcode() == HloOpcode::kParameter &&
-         root->operand(1)->opcode() == HloOpcode::kParameter &&
-         root->operand(0) != root->operand(1);
+  if (computation.num_parameters() != 2 || root->operand_count() != 2 ||
+      root->operand(0)->opcode() != HloOpcode::kParameter ||
+      root->operand(1)->opcode() != HloOpcode::kParameter ||
+      root->operand(0) == root->operand(1)) {
+    return ConstantType::kUnknown;
+  }
+
+  switch (root->opcode()) {
+    case HloOpcode::kAdd:
+      return ConstantType::kZero;
+    case HloOpcode::kMultiply:
+      return ConstantType::kOne;
+    default:
+      return ConstantType::kUnknown;
+  }
 }
 
-// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
-// which requires an init_value of 0 rather than a random value.
-bool NeedsZeroInitValue(const HloUse& use) {
+// Reduce, ReduceWindow, and SelectAndScatter ops may need a non-random
+// initialization value.
+bool NeedsInitValue(const HloUse& use) {
   const HloInstruction* const instruction = use.instruction;
   const HloOpcode opcode = instruction->opcode();
   const int64 op_num = use.operand_number;
   return (
       ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
-       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
-      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
-       LooksLikeSum(*instruction->scatter())));
+       op_num == 1) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2));
 }
 
 // Generate random values that are constrained to the input_shape minus the
@@ -207,7 +233,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
         auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
         constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
                                 fused_uses.end());
-      } else if (NeedsZeroInitValue(use)) {
+      } else if (NeedsInitValue(use)) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kConvert ||
                  opcode == HloOpcode::kReducePrecision) {
@@ -228,7 +254,8 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
   HloInstruction* needs_index = nullptr;
-  HloInstruction* needs_zero = nullptr;
+  HloInstruction* needs_constant = nullptr;
+  ConstantType constant_type = ConstantType::kUnknown;
   for (HloInstruction* use : constrained_uses) {
     switch (use->opcode()) {
       case HloOpcode::kDynamicSlice:
@@ -243,8 +270,13 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
 
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
+        needs_constant = use;
+        constant_type = GetInitValue(*use->to_apply());
+        break;
+
       case HloOpcode::kSelectAndScatter:
-        needs_zero = use;
+        needs_constant = use;
+        constant_type = GetInitValue(*use->scatter());
         break;
 
       default:
@@ -253,17 +285,26 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
             use->ToString().c_str());
     }
   }
-  if (needs_index != nullptr && needs_zero != nullptr) {
+  if (needs_index != nullptr && needs_constant != nullptr) {
     return Unimplemented(
         "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
-        "zero: %s\n",
-        needs_index->ToString().c_str(), needs_zero->ToString().c_str());
+        "constant: %s\n",
+        needs_index->ToString().c_str(), needs_constant->ToString().c_str());
   }
   if (needs_index != nullptr) {
     return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
                                            needs_index->shape(), engine);
-  } else if (needs_zero != nullptr) {
-    return Literal::CreateFromShape(param.shape());
+  } else if (needs_constant != nullptr) {
+    switch (constant_type) {
+      case ConstantType::kZero:
+        return Literal::Zero(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kOne:
+        return Literal::One(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kUnknown:
+        // We want the identity element for the computation, but we don't really
+        // know what it is - so any value we generate will be just as wrong.
+        return MakeFakeLiteralInternal(param.shape(), engine);
+    }
   } else {
     return MakeFakeLiteralInternal(param.shape(), engine);
   }
@@ -298,9 +339,9 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
   return std::move(arguments);
 }
 
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module) {
-  return HloVerifier().Run(module).status();
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
+                       bool allow_mixed_precision) {
+  return HloVerifier(allow_mixed_precision).Run(module).status();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 0fb024ffb074f1c90b75022bc7f5a8b58b03c0c2..30c147910cae85e1ebdddc22e637a6c1fd577c20 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -68,8 +68,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 
 // Check that a given module satisfies various constraints before trying to
 // execute it.
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module);
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
+                       bool allow_mixed_precision = false);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59afd28a80c0fbf3df38457cd05961c883769856
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+// A test fixture is used because we need a client for our computation builder.
+class TestUtilsTest : public LocalClientTestBase {};
+
+XLA_TEST_F(TestUtilsTest, UnusedParam) {
+  XlaBuilder builder(TestName());
+  // Make the reduction lambda.
+  Shape single_float = ShapeUtil::MakeShape(F32, {});
+  builder.Parameter(0, single_float, "unused");
+  builder.Parameter(1, single_float, "used");
+  auto computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  // Make the reduction.
+  Shape pair_float = ShapeUtil::MakeShape(F32, {2});
+  builder.Reduce(builder.Parameter(0, pair_float, "operand"),
+                 builder.Parameter(1, single_float, "init"),
+                 computation_status.ValueOrDie(), {0});
+  computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  auto executable_status = local_client_->Compile(
+      computation_status.ValueOrDie(), {&pair_float, &single_float},
+      ExecutableBuildOptions());
+  TF_ASSERT_OK(executable_status.status());
+  HloModule& module = const_cast<HloModule&>(
+      executable_status.ValueOrDie()->executable()->module());
+  TF_ASSERT_OK(MakeFakeArguments(&module).status());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 268ba338f2e6740a1d1a046d5a85494f3cf2e9f8..e2067bc1b835a946fc56801cbf227e05ef0686b4 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -45,7 +45,7 @@ class TransferManagerTest : public LocalClientTestBase {
 
   ~TransferManagerTest() override = default;
 
-  std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
+  ScopedShapedBuffer AllocateDeviceBuffer(const Shape& shape) {
     return transfer_manager_
         ->AllocateScopedShapedBuffer(
             shape, GetOrCreateAllocator(local_client_->platform()),
@@ -64,10 +64,10 @@ XLA_TEST_F(TransferManagerTest, TransferR0U32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR0Equal<uint32>(42, *result);
 }
@@ -80,10 +80,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR1Equal<float>({1.25f, 2.5f, -17.0f, -20.125f},
                                         *result);
@@ -98,10 +98,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR1Equal<float>(test_vector, *result);
 }
@@ -114,10 +114,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1U8) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   EXPECT_EQ(result->GetR1U8AsString(), test_string);
 }
@@ -130,10 +130,10 @@ XLA_TEST_F(TransferManagerTest, TransferR2F32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result);
@@ -150,10 +150,10 @@ XLA_TEST_F(TransferManagerTest,
   // Round trip literal through device. Set the on-device layout to something
   // different than the literal layout.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   EXPECT_FALSE(
       LayoutUtil::Equal(result->shape().layout(), literal->shape().layout()));
@@ -170,10 +170,10 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -184,10 +184,10 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -204,10 +204,10 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -219,10 +219,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -238,10 +238,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc
index fe5a1778a2cecff0121cee4d8b406c5b23a13e40..59ce23d0247b58c6aebc2b5a65453157c1ca15ff 100644
--- a/tensorflow/compiler/xla/tests/transpose_test.cc
+++ b/tensorflow/compiler/xla/tests/transpose_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -38,7 +38,7 @@ class TransposeTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(TransposeTest, Transpose0x0) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto lhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
   auto result = builder.Transpose(lhs, {1, 0});
 
@@ -46,7 +46,7 @@ XLA_TEST_F(TransposeTest, Transpose0x0) {
 }
 
 XLA_TEST_F(TransposeTest, Transpose0x42) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto lhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 42));
   auto result = builder.Transpose(lhs, {1, 0});
 
@@ -54,7 +54,7 @@ XLA_TEST_F(TransposeTest, Transpose0x42) {
 }
 
 XLA_TEST_F(TransposeTest, Transpose7x0) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto lhs = builder.ConstantR2FromArray2D<float>(Array2D<float>(7, 0));
   auto result = builder.Transpose(lhs, {1, 0});
 
@@ -62,7 +62,7 @@ XLA_TEST_F(TransposeTest, Transpose7x0) {
 }
 
 TEST_F(TransposeTest, Transpose2x2) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto lhs = builder.ConstantR2<float>({
       {1.0, 2.0}, {3.0, 4.0},
   });
@@ -74,7 +74,7 @@ TEST_F(TransposeTest, Transpose2x2) {
 }
 
 XLA_TEST_F(TransposeTest, Transpose0x2x3_2x3x0) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto operand = builder.ConstantR3FromArray3D<int32>(Array3D<int32>(0, 2, 3));
   auto result = builder.Transpose(operand, {1, 2, 0});
 
@@ -82,7 +82,7 @@ XLA_TEST_F(TransposeTest, Transpose0x2x3_2x3x0) {
 }
 
 TEST_F(TransposeTest, Transpose1x2x3_2x3x1) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto operand = builder.ConstantR3FromArray3D<int32>({{{1, 2, 3}, {4, 5, 6}}});
   auto result = builder.Transpose(operand, {1, 2, 0});
 
@@ -92,7 +92,7 @@ TEST_F(TransposeTest, Transpose1x2x3_2x3x1) {
 }
 
 TEST_F(TransposeTest, Transpose1x2x3_3x2x1) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto operand = builder.ConstantR3FromArray3D<int32>({{{1, 2, 3}, {4, 5, 6}}});
   auto result = builder.Transpose(operand, {2, 1, 0});
 
@@ -102,7 +102,7 @@ TEST_F(TransposeTest, Transpose1x2x3_3x2x1) {
 }
 
 TEST_F(TransposeTest, Transpose1x2x3_1x2x3) {
-  ComputationBuilder builder(client_, "Transpose");
+  XlaBuilder builder("Transpose");
   auto operand = builder.ConstantR3FromArray3D<int32>({{{1, 2, 3}, {4, 5, 6}}});
   auto result = builder.Transpose(operand, {0, 1, 2});
 
@@ -116,7 +116,7 @@ TEST_F(TransposeTest, MultiTranspose3x2) {
   Array2D<float> transposed({{1.0f, 3.0f, 5.0f}, {2.0f, 4.0f, 6.0f}});
 
   for (int transposes = 0; transposes <= 10; ++transposes) {
-    ComputationBuilder builder(client_, "Transpose");
+    XlaBuilder builder("Transpose");
     auto computed = builder.ConstantR2FromArray2D<float>(input);
     for (int i = 0; i < transposes; ++i) {
       computed = builder.Transpose(computed, {1, 0});
@@ -130,7 +130,7 @@ TEST_F(TransposeTest, MultiTranspose3x2) {
 TEST_F(TransposeTest, Small_1x1) {
   auto aoperand = MakeLinspaceArray2D(0.0, 1.0, 1, 1);
 
-  ComputationBuilder builder(client_, "transpose_1x1");
+  XlaBuilder builder("transpose_1x1");
   auto operand = builder.ConstantR2FromArray2D<float>(*aoperand);
   builder.Transpose(operand, {1, 0});
 
@@ -142,7 +142,7 @@ TEST_F(TransposeTest, Small_1x1) {
 TEST_F(TransposeTest, Small_2x2) {
   auto aoperand = MakeLinspaceArray2D(0.0, 4.0, 2, 2);
 
-  ComputationBuilder builder(client_, "transpose_2x2");
+  XlaBuilder builder("transpose_2x2");
   auto operand = builder.ConstantR2FromArray2D<float>(*aoperand);
   builder.Transpose(operand, {1, 0});
 
@@ -162,7 +162,7 @@ void TransposeTest::TestTransposeConstant021(size_t n1, size_t n2, size_t n3) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto operand = builder.ConstantR3FromArray3D(aoperand);
   builder.Transpose(operand, {0, 2, 1});
 
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 2029312f94a14bc81706368b9ecfc2727fd9fe4c..5c287bac6a7cab5a3c2642971a5a67070ee56c72 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -18,13 +18,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -40,7 +42,7 @@ class TupleTest : public ClientLibraryTestBase {
 
 // Tests a tuple-shaped constant.
 XLA_TEST_F(TupleTest, TupleConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -53,13 +55,13 @@ XLA_TEST_F(TupleTest, TupleConstant) {
                           Literal::CreateR1<float>(constant_vector).get(),
                           Literal::CreateR2<float>(constant_matrix).get()});
 
-  auto result = builder.ConstantLiteral(*value);
+  builder.ConstantLiteral(*value);
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
 // Tests a tuple made of scalar constants.
 XLA_TEST_F(TupleTest, TupleScalarConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar1 = 7.3f;
   const float constant_scalar2 = 1.2f;
@@ -67,13 +69,13 @@ XLA_TEST_F(TupleTest, TupleScalarConstant) {
       Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar1).get(),
                           Literal::CreateR0<float>(constant_scalar2).get()});
 
-  auto result = builder.ConstantLiteral(*value);
+  builder.ConstantLiteral(*value);
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreate) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -81,9 +83,9 @@ XLA_TEST_F(TupleTest, TupleCreate) {
       {1.1f, 2.2f, 3.5f},  // row 0
       {4.8f, 5.0f, 6.7f},  // row 1
   };
-  auto result = builder.Tuple({builder.ConstantR0<float>(constant_scalar),
-                               builder.ConstantR1<float>(constant_vector),
-                               builder.ConstantR2<float>(constant_matrix)});
+  builder.Tuple({builder.ConstantR0<float>(constant_scalar),
+                 builder.ConstantR1<float>(constant_vector),
+                 builder.ConstantR2<float>(constant_matrix)});
 
   auto expected =
       Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar).get(),
@@ -94,9 +96,9 @@ XLA_TEST_F(TupleTest, TupleCreate) {
 
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  auto result = builder.Tuple(
+  builder.Tuple(
       {builder.ConstantR0<float>(7.0), builder.ConstantR1<float>({})});
 
   auto expected = Literal::MakeTuple({Literal::CreateR0<float>(7.0).get(),
@@ -106,15 +108,15 @@ XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
 
 // Tests the creation of an empty tuple.
 XLA_TEST_F(TupleTest, EmptyTupleCreate) {
-  ComputationBuilder builder(client_, TestName());
-  auto result = builder.Tuple({});
+  XlaBuilder builder(TestName());
+  builder.Tuple({});
   auto expected = Literal::MakeTuple({});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 // Trivial test for extracting a tuple element with GetTupleElement.
 XLA_TEST_F(TupleTest, GetTupleElement) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -122,23 +124,23 @@ XLA_TEST_F(TupleTest, GetTupleElement) {
   };
   auto tuple_data = builder.Tuple({builder.ConstantR1<float>(constant_vector),
                                    builder.ConstantR2<float>(constant_matrix)});
-  auto matrix_element = builder.GetTupleElement(tuple_data, 1);
+  builder.GetTupleElement(tuple_data, 1);
   ComputeAndCompareR2<float>(&builder, Array2D<float>(constant_matrix), {},
                              error_spec_);
 }
 
 // Trivial test for extracting a tuple element with GetTupleElement.
 XLA_TEST_F(TupleTest, GetTupleElementWithZeroElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto tuple_data = builder.Tuple(
       {builder.ConstantR1<float>({}),
        builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 101))});
-  auto matrix_element = builder.GetTupleElement(tuple_data, 1);
+  builder.GetTupleElement(tuple_data, 1);
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 101), {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto value = builder.ConstantR1<float>({4.5f});
   builder.GetTupleElement(value, 1);
   auto result_status = builder.Build();
@@ -151,7 +153,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) {
 // Extracts both elements from a tuple with GetTupleElement and then adds them
 // together.
 XLA_TEST_F(TupleTest, AddTupleElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -163,22 +165,22 @@ XLA_TEST_F(TupleTest, AddTupleElements) {
   auto matrix_element = builder.GetTupleElement(tuple_data, 1);
   auto vector_shape = builder.GetShape(vector_element).ConsumeValueOrDie();
   auto matrix_shape = builder.GetShape(matrix_element).ConsumeValueOrDie();
-  auto result = builder.Add(matrix_element, vector_element,
-                            /*broadcast_dimensions=*/{1});
+  builder.Add(matrix_element, vector_element,
+              /*broadcast_dimensions=*/{1});
 
   Array2D<float> expected({
       {2.f, 4.f, 6.f},  // row 0
       {5.f, 7.f, 9.f},  // row 1
   });
-  ASSERT_TRUE(ShapeUtil::ShapeIs(*vector_shape, F32, {3}));
-  ASSERT_TRUE(ShapeUtil::ShapeIs(*matrix_shape, F32, {/*y=*/2, /*x=*/3}));
+  ASSERT_TRUE(ShapeUtil::ShapeIs(vector_shape, F32, {3}));
+  ASSERT_TRUE(ShapeUtil::ShapeIs(matrix_shape, F32, {/*y=*/2, /*x=*/3}));
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
 // Extracts both elements from a tuple and then puts them into a new tuple in
 // the opposite order.
 XLA_TEST_F(TupleTest, TupleGTEToTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -186,8 +188,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
   };
   auto tuple_data = builder.Tuple({builder.ConstantR1<float>(constant_vector),
                                    builder.ConstantR2<float>(constant_matrix)});
-  auto new_tuple = builder.Tuple({builder.GetTupleElement(tuple_data, 1),
-                                  builder.GetTupleElement(tuple_data, 0)});
+  builder.Tuple({builder.GetTupleElement(tuple_data, 1),
+                 builder.GetTupleElement(tuple_data, 0)});
   auto expected =
       Literal::MakeTuple({Literal::CreateR2<float>(constant_matrix).get(),
                           Literal::CreateR1<float>(constant_vector).get()});
@@ -195,8 +197,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
 }
 
 XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle v1, v2;
+  XlaBuilder b(TestName());
+  XlaOp v1, v2;
 
   for (bool direction : {false, true}) {
     std::unique_ptr<GlobalData> v1_data =
@@ -209,7 +211,7 @@ XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
     auto v2_gt = b.Gt(v2, v1);             // true
     auto v1_v2 = b.Tuple({v1_gt, v2_gt});  // {false, true}
     auto v2_v1 = b.Tuple({v2_gt, v1_gt});  // {true, false}
-    auto select = b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
+    b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
     auto expected =
         Literal::MakeTuple({Literal::CreateR0<bool>(direction).get(),
                             Literal::CreateR0<bool>(!direction).get()});
@@ -236,7 +238,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   //              \                (tuple10)--                     /
   //               \              /           \                   /
   //                -----(GTE 0)--             --(GTE 1)----------
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -256,8 +258,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   auto addvectors = builder.Add(vector_from_01, vector_from_10);
   auto addmatrices = builder.Add(matrix_from_01, matrix_from_10);
 
-  auto result = builder.Add(addmatrices, addvectors,
-                            /*broadcast_dimensions=*/{1});
+  builder.Add(addmatrices, addvectors,
+              /*broadcast_dimensions=*/{1});
 
   Array2D<float> expected({
       {4.f, 8.f, 12.f},    // row 0
@@ -266,9 +268,9 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesOnFalse) {
   // Tests a selection between tuples with "false" path taken.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -277,21 +279,20 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
   auto tuple21 = builder.Tuple(
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
                                       Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, TuplesInAMap) {
-  Computation tuple_computation;
+  XlaComputation tuple_computation;
   {
     // tuple_computation(x) = 100 * min(x, x^2) + max(x, x^2) using tuples.
     //
     // Need to put a select in there to prevent HLO-level optimizations from
     // optimizing out the tuples.
-    ComputationBuilder b(client_, "sort_square");
+    XlaBuilder b("sort_square");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto x2 = b.Mul(x, x);
     auto x_smaller_tuple = b.Tuple({x, x2});
@@ -305,15 +306,15 @@ XLA_TEST_F(TupleTest, TuplesInAMap) {
     tuple_computation = computation_status.ConsumeValueOrDie();
   }
 
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = b.ConstantR1<float>({-1.0f, 1.0f, 2.1f});
   b.Map({input}, tuple_computation, {0});
   ComputeAndCompareR1<float>(&b, {-99.0f, 101.0f, 214.41f}, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesOnTrue) {
   // Tests a selection between tuples with "true" path taken.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -322,8 +323,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
   auto tuple21 = builder.Tuple(
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec1).get(),
                                       Literal::CreateR1<float>(vec2).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
@@ -332,7 +332,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
 XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
   // Tests a selection between tuples but the final result is an element of the
   // tuple, not the whole tuple.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -343,13 +343,13 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
 
   auto select =
       builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
-  auto element = builder.GetTupleElement(select, 0);
+  builder.GetTupleElement(select, 0);
 
   ComputeAndCompareR1<float>(&builder, vec2, {}, error_spec_);
 }
 
 // Cascaded selects between tuple types.
-XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesCascaded) {
   //
   //                       vec1     vec2   vec2     vec1
   //                        |        |      |        |
@@ -367,7 +367,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
   //                                /             --(GTE 1)--
   //                               /
   //                          (tuple 21)
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -383,17 +383,16 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
       builder.Select(builder.GetTupleElement(pred_tuple, 0), tuple12, tuple21);
   auto select2 =
       builder.Select(builder.GetTupleElement(pred_tuple, 1), tuple21, select1);
-  auto result = builder.Add(builder.GetTupleElement(select2, 0),
-                            builder.GetTupleElement(select2, 1));
+  builder.Add(builder.GetTupleElement(select2, 0),
+              builder.GetTupleElement(select2, 1));
 
   ComputeAndCompareR1<float>(&builder, {3.f, 6.f, 9.f}, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest,
-           DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesReuseConstants)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesReuseConstants) {
   // Similar to SelectBetweenTuples, but the constants are shared between the
   // input tuples.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -402,19 +401,18 @@ XLA_TEST_F(TupleTest,
   auto tuple12 = builder.Tuple({c1, c2});
   auto tuple21 = builder.Tuple({c2, c1});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
                                       Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, NestedTuples) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto inner_tuple = builder.Tuple(
       {builder.ConstantR1<float>({1.0, 2.0}), builder.ConstantR0<float>(42.0)});
-  auto outer_tuple =
-      builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
+  builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
 
   auto expected_v1 = Literal::CreateR1<float>({1.0, 2.0});
   auto expected_s = Literal::CreateR0<float>(42.0);
@@ -428,7 +426,7 @@ XLA_TEST_F(TupleTest, NestedTuples) {
 }
 
 XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape data_shape = ShapeUtil::MakeShape(F32, {3});
   Shape inner_tuple_shape = ShapeUtil::MakeTupleShape({data_shape, data_shape});
@@ -459,7 +457,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
 }
 
 XLA_TEST_F(TupleTest, ComplexTuples) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape c64r0 = ShapeUtil::MakeShape(C64, {});
     Shape c64r1 = ShapeUtil::MakeShape(C64, {2});
@@ -514,5 +512,30 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
                          error_spec_);
 }
 
+class TupleHloTest : public HloTestBase {};
+
+// Disabled on the interpreter because bitcast doesn't exist on the interpreter.
+TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
+  const char* testcase = R"(
+    HloModule m
+
+    ENTRY test {
+      name.1 = (f32[3]{0}) parameter(0)
+      get-tuple-element.1 = f32[3]{0} get-tuple-element(name.1), index=0
+      bitcast = f32[1,3]{1,0} bitcast(get-tuple-element.1)
+      copy = f32[1,3]{1,0} copy(bitcast)
+      ROOT tuple.4 = (f32[1,3]{1,0}) tuple(copy)
+    }
+  )";
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::MakeTupleOwned(Literal::CreateR1<float>({1, 2, 3}));
+  auto result = ExecuteNoHloPasses(std::move(module), {param.get()});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result,
+      *Literal::MakeTupleOwned(Literal::CreateR2<float>({{1, 2, 3}}))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 835e2d7e5594d7c8c6e523f9806e32dce23a87e9..50c8766f2e3976c7077046283ab3b3e762622fc5 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -37,7 +37,7 @@ class UnaryOpTest : public ClientLibraryTestBase {
   }
   template <typename T>
   void AbsSize0TestHelper() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto arg = builder.ConstantR1<T>({});
     auto abs = builder.Abs(arg);
 
@@ -50,7 +50,7 @@ class UnaryOpTest : public ClientLibraryTestBase {
 
   template <typename T>
   void AbsTestHelper() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto arg = builder.ConstantR1<T>({-2, 25, 0, -123, inf<T>(), -inf<T>()});
     auto abs = builder.Abs(arg);
 
@@ -59,7 +59,7 @@ class UnaryOpTest : public ClientLibraryTestBase {
 
   template <typename T>
   void SignTestHelper() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto arg = builder.ConstantR1<T>(
         {-2, 25, 0, static_cast<T>(-0.0), -123, inf<T>(), -inf<T>()});
     auto sign = builder.Sign(arg);
@@ -69,7 +69,7 @@ class UnaryOpTest : public ClientLibraryTestBase {
 
   template <typename T>
   void SignAbsTestHelper() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto arg = builder.ConstantR1<T>({-2, 25, 0, -123});
     auto sign = builder.Sign(arg);
     auto abs = builder.Abs(arg);
@@ -86,7 +86,7 @@ int UnaryOpTest::inf<int>() {
 
 template <>
 void UnaryOpTest::AbsTestHelper<complex64>() {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto arg = builder.ConstantR1<complex64>({{-2, 0},
                                             {0, 25},
                                             {0, 0},
@@ -102,7 +102,7 @@ void UnaryOpTest::AbsTestHelper<complex64>() {
 
 template <>
 void UnaryOpTest::SignTestHelper<complex64>() {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto arg = builder.ConstantR1<complex64>(
       {{-2, 0}, {0, 25}, {0, 0}, {static_cast<float>(-0.0), 0}, {-1, 1}});
   auto sign = builder.Sign(arg);
@@ -114,7 +114,7 @@ void UnaryOpTest::SignTestHelper<complex64>() {
 
 template <>
 void UnaryOpTest::SignAbsTestHelper<complex64>() {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto arg =
       builder.ConstantR1<complex64>({{-2, 0}, {0, 25}, {0, 0}, {-0.4, 0.3}});
   auto sign = builder.Sign(arg);
@@ -139,7 +139,7 @@ XLA_TEST_F(UnaryOpTest, AbsTestR1) {
 }
 
 XLA_TEST_F(UnaryOpTest, AbsTestR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto argi = builder.ConstantR0<int>(-5);
   auto absi = builder.Abs(argi);
   auto argf = builder.ConstantR0<float>(-3.0f);
@@ -155,7 +155,7 @@ XLA_TEST_F(UnaryOpTest, AbsTestR0) {
 }
 
 XLA_TEST_F(UnaryOpTest, SignTestR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto argi = builder.ConstantR0<int>(-5);
   auto sgni = builder.Sign(argi);  // -1
   auto argf = builder.ConstantR0<float>(-4.0f);
@@ -187,7 +187,7 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR1) {
 }
 
 XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto arg = builder.ConstantR1<unsigned int>(
       {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()});
   auto abs = builder.Abs(arg);
@@ -197,7 +197,7 @@ XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) {
 }
 
 XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto arg = builder.ConstantR1<unsigned int>(
       {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()});
   auto sign = builder.Sign(arg);
@@ -206,7 +206,7 @@ XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) {
 }
 
 XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto arg = builder.ConstantR2<float>({{1.0, -2.0}, {-3.0, 4.0}});
   auto sign = builder.Sign(arg);
   auto abs = builder.Abs(arg);
@@ -216,7 +216,7 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
 }
 
 XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({0, 1});
   auto rhs = builder.ConstantR1<int32>({1, 1});
   builder.ConvertElementType(builder.Eq(lhs, rhs), S32);
@@ -225,7 +225,7 @@ XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) {
 }
 
 XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({0, 1});
   auto rhs = builder.ConstantR1<int32>({1, 1});
   builder.ConvertElementType(builder.Eq(lhs, rhs), F32);
diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
index 32ba067a10df6c15348344da813e6a960f05491c..82d301983fc7885ef5c1c1ed05b74fc017bb7727 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -33,9 +33,9 @@ namespace {
 
 class VecOpsReduceTest : public ClientLibraryTestBase {
  public:
-  VecOpsReduceTest() : builder_(client_, TestName()) {}
+  VecOpsReduceTest() : builder_(TestName()) {}
 
-  ComputationDataHandle BuildSampleConstantCube() {
+  XlaOp BuildSampleConstantCube() {
     // clang-format off
     Array3D<float> x3d({
           {{1.0, 2.0, 3.0},   // | dim 1    // } plane 0 in dim 0
@@ -49,7 +49,7 @@ class VecOpsReduceTest : public ClientLibraryTestBase {
     return builder_.ConstantR3FromArray3D<float>(x3d);
   }
 
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
   ErrorSpec errspec_{1e-3, 0};
 };
 
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index b52c718814d4ffeff68c60588a6637a2159d57e5..3dded3f7157195b2c7aaac2ff9aac79ca4611d05 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -19,10 +19,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -39,7 +40,7 @@ namespace {
 
 class VecOpsSimpleTest : public ClientLibraryTestBase {
  public:
-  explicit VecOpsSimpleTest(perftools::gputools::Platform* platform = nullptr)
+  explicit VecOpsSimpleTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
@@ -49,7 +50,7 @@ class VecOpsSimpleTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto exp = builder.Exp(x);
@@ -63,7 +64,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) {
 
 XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) {
   for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     std::vector<float> exponents;
     exponents.reserve(count);
     for (int i = 0; i < count; ++i) {
@@ -84,7 +85,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> exponents(2, 2, 2, 2);
 
   std::vector<float> exponents_vector;
@@ -106,7 +107,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   builder.Neg(x);
@@ -117,7 +118,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({2, -2, 12, -4, 5, 20, -15, 0, -2, 1});
   builder.Neg(x);
 
@@ -126,7 +127,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<uint32>(
       {0, 1, 42, static_cast<uint32>(-1), static_cast<uint32>(-12)});
   builder.Neg(x);
@@ -136,7 +137,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   builder.SquareF32(x);
@@ -147,7 +148,7 @@ XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   builder.ReciprocalF32(x);
@@ -159,7 +160,7 @@ XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({0.0, -0.0});
   auto exp = builder.SqrtF32(x);
 
@@ -167,7 +168,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({16.0, 1.0, 1024.0, 0.16, 0.2, 12345});
   auto exp = builder.SqrtF32(x);
 
@@ -176,7 +177,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x =
       builder.ConstantR1<float>({16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345});
   auto exp = builder.Pow(x, builder.ConstantR0<float>(-.5f));
@@ -188,7 +189,7 @@ XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto add = CreateScalarAddComputation(F32, &builder);
 
   auto x = builder.ConstantR1<float>(
@@ -203,7 +204,7 @@ XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto y = builder.ConstantR1<float>(
@@ -218,8 +219,8 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) {
 XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
   // Similar to MaxTenValues, except that the inputs come from params rather
   // than constants.
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle v1, v2;
+  XlaBuilder builder(TestName());
+  XlaOp v1, v2;
   std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
       {41.0f, 2.0f, 3.0f, 84.0f}, /*parameter_number=*/0, /*name=*/"v1",
       /*builder=*/&builder, /*data_handle=*/&v1);
@@ -236,7 +237,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
 XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
   // Similar to MaxTenValuesFromParams, except that the data size passed in and
   // out is large.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Number of floats in the data passed into and out of the computation.
   constexpr int datalen = 15 * 1000;
@@ -259,7 +260,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
     expected_vec.push_back(larger);
   }
 
-  ComputationDataHandle v1, v2;
+  XlaOp v1, v2;
   std::unique_ptr<GlobalData> param0_data =
       CreateR1Parameter<float>(v1vec, /*parameter_number=*/0, /*name=*/"v1",
                                /*builder=*/&builder, /*data_handle=*/&v1);
@@ -274,7 +275,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto y = builder.ConstantR0<float>(0);
@@ -286,7 +287,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MinTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto y = builder.ConstantR1<float>(
@@ -299,7 +300,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0);
   auto one = builder.ConstantR0<float>(1);
   auto x = builder.ConstantR1<float>(
@@ -312,7 +313,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0);
   auto one = builder.ConstantR0<float>(1);
   auto x = builder.ConstantR1<float>(
@@ -325,7 +326,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR1<float>({0.0f, 0.0f});
   auto one = builder.ConstantR1<float>({1.0f, 1.0f});
   auto x = builder.ConstantR1<float>({2.1, -2.6});
@@ -336,7 +337,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto one = builder.ConstantR0<float>(1);
   auto two = builder.ConstantR0<float>(2);
   auto x = builder.ConstantR1<float>(
@@ -348,11 +349,22 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
+XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) {
+  ComputationBuilder builder(client_, TestName());
+  auto zero = builder.ConstantR0<int64>(0);
+  auto one = builder.ConstantR0<int64>(10);
+  auto x = builder.ConstantR1<int64>({-3, 3, 9, 13});
+  auto clamp = builder.Clamp(zero, x, one);
+
+  std::vector<int64> expected = {0, 3, 9, 10};
+  ComputeAndCompareR1<int64>(&builder, expected, {});
+}
+
 XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
-  Computation add_half;
+  XlaComputation add_half;
   {
     // add_half(x) = x + 0.5
-    ComputationBuilder builder(client_, "add_half");
+    XlaBuilder builder("add_half");
     auto x_value =
         builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value");
     auto half = builder.ConstantR0<float>(0.5);
@@ -362,10 +374,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
     add_half = computation_status.ConsumeValueOrDie();
   }
 
-  Computation clamp;
+  XlaComputation clamp;
   {
     // clamp(y) = clamp<0,5>(y)
-    ComputationBuilder builder(client_, "clamp");
+    XlaBuilder builder("clamp");
     auto y_value =
         builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y_value");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -375,10 +387,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
     clamp = computation_status.ConsumeValueOrDie();
   }
 
-  Computation mult_relu_add;
+  XlaComputation mult_relu_add;
   {
     // mult_relu_add(z) = clamp(add_half(2 * max(z, 0)))
-    ComputationBuilder builder(client_, "mult_relu_add");
+    XlaBuilder builder("mult_relu_add");
     auto z_value =
         builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -392,7 +404,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
     mult_relu_add = computation_status.ConsumeValueOrDie();
   }
 
-  ComputationBuilder builder(client_, "map10");
+  XlaBuilder builder("map10");
   {
     auto x = builder.ConstantR1<float>(
         {2.1, -21.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -405,7 +417,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({-5, -4, -3, -2, -1, 0, 1, 2, 3, 4});
   auto y = builder.ConstantR0<int32>(3);
   builder.Rem(x, y);
@@ -415,7 +427,7 @@ XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<bool>({false, true});
   auto y = builder.ConstantR1<bool>({true, false});
   builder.Eq(x, y);
@@ -425,7 +437,7 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<bool>({false, true});
   auto y = builder.ConstantR1<bool>({true, false});
   builder.Ne(x, y);
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 52157b837c383205f77a030ef98b2fd03a41aff5..c463f3eac55e5b8ab32dc52d5a38e7840241bc58 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -37,8 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
@@ -54,29 +52,28 @@ TEST_F(WhileTest, WhileWithScalarS32Result) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -91,29 +88,28 @@ TEST_F(WhileTest, WhileWithScalarS64Result) {
   auto result_shape = ShapeUtil::MakeShape(S64, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int64>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int64>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int64>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int64>(&builder, 5, {});
 }
@@ -123,31 +119,30 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
   auto orig_shape = ShapeUtil::MakeShape(S32, {2});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Reduce(builder.ConstantR1<int32>(2, 1),
                              builder.ConstantR0<int32>(0),
                              CreateScalarAddComputation(S32, &builder), {0});
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -156,28 +151,28 @@ TEST_F(WhileTest, WhileWithPredicateResult) {
   auto result_shape = ShapeUtil::MakeShape(PRED, {});
 
   // Create a computation for the condition: run until condition is true.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Ne(builder.ConstantR0<bool>(true), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: or condition with true.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
-    auto result = builder.Or(prev, builder.ConstantR0<bool>(true));
+    builder.Or(prev, builder.ConstantR0<bool>(true));
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Ne(builder.ConstantR0<bool>(false),
                          builder.ConstantR0<bool>(true));
-  auto result = builder.While(condition, body, init);
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
@@ -194,9 +189,9 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -205,33 +200,34 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 15.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>({});
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>({});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   ComputeAndCompareR1<float>(&builder, {}, {}, ErrorSpec(0.0001));
 }
@@ -247,9 +243,9 @@ TEST_F(WhileTest, WhileWithVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -258,33 +254,34 @@ TEST_F(WhileTest, WhileWithVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   // Individual elements with increase by 1/8 each time through the loop, so
   // the sum will increase by 1.0.  It will first be >15.5 when the elements
@@ -306,9 +303,9 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -317,34 +314,34 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   builder.Tuple({result});
 
   // Individual elements with increase by 1/8 each time through the loop, so
@@ -366,9 +363,9 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -377,28 +374,28 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(N);
   auto expected_w1 = Literal::CreateR1<float>({1.0f, 1.0f, 1.0f});
@@ -419,9 +416,9 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -430,21 +427,21 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
@@ -455,7 +452,7 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   auto result = builder.Add(add12, builder.GetTupleElement(xla_while, 3));
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   std::vector<float> expected = {6.f, 6.f, 6.f};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
@@ -474,9 +471,9 @@ TEST_F(WhileTest, WhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -486,26 +483,27 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -523,9 +521,9 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -534,27 +532,27 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and or the predicate with true
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto pred = builder.GetTupleElement(prev, 1);
     auto new_pred = builder.Or(pred, builder.ConstantR0<bool>(true));
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_pred});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple({builder.ConstantR0<int32>(0),
                              builder.Ne(builder.ConstantR0<bool>(false),
                                         builder.ConstantR0<bool>(true))});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_predicate = Literal::CreateR0<bool>(true);
@@ -570,9 +568,9 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -582,25 +580,24 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and set the other tuple element to a
   // constant.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
-    auto result =
-        builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
-                       builder.ConstantR0<int32>(7)});
+    builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
+                   builder.ConstantR0<int32>(7)});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR0<int32>(7)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR0<int32>(7);
@@ -631,20 +628,20 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -654,34 +651,34 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  Computation body2;
+  XlaComputation body2;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body2, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -692,11 +689,11 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -710,20 +707,20 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -733,21 +730,21 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -758,11 +755,11 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -777,20 +774,20 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -800,21 +797,21 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -824,11 +821,11 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -844,9 +841,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -856,9 +853,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -873,18 +870,18 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
 
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -910,23 +907,23 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 // Per backend the values generated can be different as the different backends
 // use different random number generators.
 // TODO(b/32240857): Extend test to verify outputs.
-TEST_F(WhileTest, WhileWithPrngScalarResult) {
+TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
   auto v6s32 = ShapeUtil::MakeShape(S32, {6});
 
   // Create a computation for the condition: repeat for count iterations.
   auto build_condition = [this, v6s32](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto prev = builder.Reshape(
         builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
-          {});
+        {});
     builder.Gt(builder.ConstantR0<int32>(count), prev);
     return builder.Build().ConsumeValueOrDie();
   };
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, v6s32, "prev");
     auto inc = builder.ConcatInDim(
         {builder.ConstantR1<int32>({1}),
@@ -934,16 +931,15 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
                             builder.ConstantR0<int32>(100),
                             ShapeUtil::MakeShape(S32, {5}))},
         0);
-    auto result = builder.Add(inc, prev);
+    builder.Add(inc, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
   auto while_loop = [this, &body, build_condition](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto init = builder.ConstantR1<int32>({0, 0, 0, 0, 0, 0});
-    auto result = builder.While(build_condition(count), body, init);
-    auto shape = builder.GetShape(result).ConsumeValueOrDie();
+    builder.While(build_condition(count), body, init);
     return builder.Build();
   };
 
@@ -961,22 +957,21 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
 TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
-  ComputationBuilder outer(client_, "outer");
+  XlaBuilder outer("outer");
   auto p = outer.Parameter(0, element_shape, "param");
   auto t = outer.Tuple({p, outer.ConstantR1<float>({1, 1})});
 
-  TF_ASSERT_OK_AND_ASSIGN(const std::unique_ptr<Shape> tuple_shape,
-                          outer.GetShape(t));
+  TF_ASSERT_OK_AND_ASSIGN(Shape tuple_shape, outer.GetShape(t));
 
-  ComputationBuilder cond(client_, "cond");
-  auto cond_t = cond.Parameter(0, *tuple_shape, "t");
+  XlaBuilder cond("cond");
+  auto cond_t = cond.Parameter(0, tuple_shape, "t");
   TF_ASSERT_OK(Any(cond.Eq(cond.GetTupleElement(cond_t, 0),
                            cond.ConstantR1<float>({42, 42})),
                    &cond)
                    .status());
 
-  ComputationBuilder body(client_, "body");
-  auto body_t = body.Parameter(0, *tuple_shape, "t");
+  XlaBuilder body("body");
+  auto body_t = body.Parameter(0, tuple_shape, "t");
   auto e = body.GetTupleElement(body_t, 1);
   body.Tuple({e, e});
 
@@ -997,15 +992,15 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
 TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
-  ComputationBuilder outer(client_, "outer");
+  XlaBuilder outer("outer");
   auto p = outer.Parameter(0, element_shape, "param");
 
-  ComputationBuilder cond(client_, "cond");
+  XlaBuilder cond("cond");
   auto cond_t = cond.Parameter(0, element_shape, "t");
   TF_ASSERT_OK(
       Any(cond.Eq(cond_t, cond.ConstantR1<float>({42, 42})), &cond).status());
 
-  ComputationBuilder body(client_, "body");
+  XlaBuilder body("body");
   auto body_t = body.Parameter(0, element_shape, "t");
   auto e = body.Broadcast(body.ConstantR0<float>(1.0), {2});
 
@@ -1023,14 +1018,14 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
 TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {});
 
-  ComputationBuilder outer(client_, "outer");
+  XlaBuilder outer("outer");
   auto p = outer.Parameter(0, element_shape, "param");
 
-  ComputationBuilder cond(client_, "cond");
+  XlaBuilder cond("cond");
   auto cond_t = cond.Parameter(0, element_shape, "t");
   cond.Eq(cond_t, cond.ConstantR0<float>(42));
 
-  ComputationBuilder body(client_, "body");
+  XlaBuilder body("body");
   auto body_t = body.Parameter(0, element_shape, "t");
   auto tuple =
       body.Tuple({body_t, body.Add(body_t, body.ConstantR0<float>(1))});
@@ -1059,23 +1054,23 @@ TEST_F(WhileTest, WhileWithMixedTupleElements) {
   auto result_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
 
-  ComputationBuilder outer(client_, "outer");
+  XlaBuilder outer("outer");
   auto p =
       outer.Tuple({outer.ConstantR0<int32>(0),
                    outer.Parameter(0, ShapeUtil::MakeShape(S32, {}), "t")});
 
-  ComputationBuilder cond(client_, "cond");
+  XlaBuilder cond("cond");
   auto params = cond.Parameter(0, result_shape, "prev");
   auto cond_t = cond.Add(cond.GetTupleElement(params, 1),
                          cond.GetTupleElement(params, 0));
   cond.Lt(cond_t, cond.ConstantR0<int32>(30));
 
-  ComputationBuilder body(client_, "body");
+  XlaBuilder body("body");
   auto body_t = body.Parameter(0, result_shape, "t");
 
   auto tuple = body.Tuple(
-      {body.Add(body.GetTupleElement(params, 0), body.ConstantR0<int32>(1)),
-       body.Add(body.GetTupleElement(params, 1), body.ConstantR0<int32>(1))});
+      {body.Add(body.GetTupleElement(body_t, 0), body.ConstantR0<int32>(1)),
+       body.Add(body.GetTupleElement(body_t, 1), body.ConstantR0<int32>(1))});
 
   TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build());
   TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build());
@@ -1107,9 +1102,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
   auto inner_result_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
 
-  Computation inner_condition;
+  XlaComputation inner_condition;
   {
-    ComputationBuilder builder(client_, "inner_condition");
+    XlaBuilder builder("inner_condition");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     builder.Lt(i, builder.ConstantR0<int32>(7));
@@ -1118,9 +1113,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the outer loop condition:
   // repeat while result < 30.
-  Computation outer_condition;
+  XlaComputation outer_condition;
   {
-    ComputationBuilder builder(client_, "outer_condition");
+    XlaBuilder builder("outer_condition");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     builder.Lt(prev, builder.ConstantR0<int32>(30));
     outer_condition = builder.Build().ConsumeValueOrDie();
@@ -1128,34 +1123,33 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the inner loop body: add 1 to `i`, and add 2 to
   // `result`.
-  Computation inner_body;
+  XlaComputation inner_body;
   {
-    ComputationBuilder builder(client_, "inner_body");
+    XlaBuilder builder("inner_body");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     auto result = builder.GetTupleElement(params, 1);
     i = builder.Add(builder.ConstantR0<int32>(1), i);
     result = builder.Add(builder.ConstantR0<int32>(2), result);
-    auto output = builder.Tuple({i, result});
+    builder.Tuple({i, result});
     inner_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Creates a computation for the outer loop: run the inner loop with i = 0.
-  Computation outer_body;
+  XlaComputation outer_body;
   {
-    ComputationBuilder builder(client_, "outer_body");
+    XlaBuilder builder("outer_body");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     auto init = builder.Tuple({builder.ConstantR0<int32>(0), prev});
     auto result = builder.While(inner_condition, inner_body, init);
-    auto output = builder.GetTupleElement(result, 1);
+    builder.GetTupleElement(result, 1);
     outer_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(outer_condition, outer_body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(outer_condition, outer_body, init);
 
   ComputeAndCompareR0<int32>(&builder, 42, {});
 }
@@ -1166,22 +1160,22 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 // while (f(result).get<0>()) {
 //   result = result + 1;
 // }
-TEST_F(WhileTest, WhileWithCallInsideCondition) {
+TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition_callee;
+  XlaComputation condition_callee;
   {
-    ComputationBuilder builder(client_, "condition_callee");
+    XlaBuilder builder("condition_callee");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Tuple({builder.Gt(builder.ConstantR0<int32>(5), prev)});
 
     condition_callee = builder.Build().ConsumeValueOrDie();
   }
 
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto result = builder.Call(condition_callee, {prev});
     builder.GetTupleElement(result, 0);
@@ -1189,20 +1183,19 @@ TEST_F(WhileTest, WhileWithCallInsideCondition) {
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -1214,28 +1207,28 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
       {scalar_s32, matrix_shape, matrix_shape, matrix_shape});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto state = builder.Parameter(0, while_shape, "state");
     builder.Gt(builder.ConstantR0<int32>(5), builder.GetTupleElement(state, 0));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto state = builder.Parameter(0, while_shape, "state");
     auto indvar = builder.GetTupleElement(state, 0);
     auto input_0 = builder.GetTupleElement(state, 1);
     auto input_1 = builder.GetTupleElement(state, 2);
     auto output = builder.Tanh(builder.Dot(input_0, input_1));
     auto indvar_next = builder.Add(indvar, builder.ConstantR0<int32>(1));
-    auto tuple_result = builder.Tuple({indvar_next, input_0, input_1, output});
+    builder.Tuple({indvar_next, input_0, input_1, output});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto matrix_input = builder.Parameter(0, matrix_shape, "matrix");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), matrix_input, matrix_input, matrix_input});
@@ -1268,9 +1261,9 @@ void BM_WhileLoop(int num_iters) {
 
   // Create while condition computation with 'loop_limit'.
   const int32 loop_limit = 100;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(loop_limit));
@@ -1278,9 +1271,9 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Create while body computation with unit loop increment.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -1294,12 +1287,12 @@ void BM_WhileLoop(int num_iters) {
     auto starts = builder.ConstantR1<int32>({0, 0, 0});
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While instruction.
-  ComputationBuilder builder(client, "while");
+  XlaBuilder builder("while");
   auto zero = builder.ConstantR0<float>(0.0);
   auto input = builder.Broadcast(zero, {seq_len, 1024, 1024});
   auto init = builder.Tuple({builder.ConstantR0<int32>(0), input});
@@ -1327,10 +1320,6 @@ void BM_WhileLoop(int num_iters) {
   }
 }
 
-// TODO(b/32470510): Benchmark fails on parallel CPU backend.
-#ifndef XLA_TEST_BACKEND_CPU_PARALLEL
 BENCHMARK(BM_WhileLoop);
-#endif
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 9ad2a1985331b80625dd0687ea052300bc99e440..7944b5132f3d11cf84488acbd920cc98c084072a 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -17,8 +17,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -27,13 +28,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
-namespace se = ::perftools::gputools;
+
 namespace gtl = ::tensorflow::gtl;
 
 class HloProfileTest : public ClientLibraryTestBase {};
@@ -118,7 +120,7 @@ Status ParseOneProfileOutputLine(
 
 // Returns void so that we can ASSERT.
 void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
-                            const Computation& computation,
+                            const XlaComputation& computation,
                             const Shape& lhs_arg_shape,
                             const Shape& rhs_arg_shape) {
   LocalService* service = ClientLibrary::GetXlaService(client->platform());
@@ -128,23 +130,23 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   auto* transfer_manager = backend->transfer_manager();
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<ScopedShapedBuffer> lhs_arg,
+      ScopedShapedBuffer lhs_arg,
       transfer_manager->AllocateScopedShapedBuffer(
           lhs_arg_shape, allocator, backend->default_device_ordinal()));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));
+      executor, *Literal::CreateFromShape(lhs_arg_shape), lhs_arg));
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<ScopedShapedBuffer> rhs_arg,
+      ScopedShapedBuffer rhs_arg,
       transfer_manager->AllocateScopedShapedBuffer(
           rhs_arg_shape, allocator, backend->default_device_ordinal()));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));
+      executor, *Literal::CreateFromShape(rhs_arg_shape), rhs_arg));
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> local_executable,
       client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
-                      ExecutableBuildOptions()));
+                      ExecutableBuildOptions().set_hlo_profile(true)));
 
   Executable* executable = local_executable->executable();
   HloExecutionProfile hlo_execution_profile(
@@ -164,7 +166,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
       backend->eigen_intra_op_thread_pool());
   TF_ASSERT_OK_AND_ASSIGN(
       auto execution_result,
-      executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
+      executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg},
                                   &hlo_execution_profile));
   (void)execution_result;
 
@@ -174,8 +176,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   XLA_VLOG_LINES(4, *profile_output);
 }
 
-// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
-XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
+XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
   const int64 m = 256, k = 256, n = 256;
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});
@@ -185,7 +186,7 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
   TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
                           ClientLibrary::GetOrCreateLocalClient(platform));
 
-  ComputationBuilder builder(client, TestName());
+  XlaBuilder builder(TestName());
   auto result = builder.Tanh(builder.Add(
       builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
       builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
@@ -238,12 +239,9 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
   EXPECT_TRUE(HasTrops(tanh_profile));
 }
 
-// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
-//
 // TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo
 // instructions "interior" to while nodes.
-XLA_TEST_F(HloProfileTest,
-           DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) {
+XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) {
   const int64 size = 256;
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size});
   Shape while_result_shape =
@@ -254,18 +252,18 @@ XLA_TEST_F(HloProfileTest,
   TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
                           ClientLibrary::GetOrCreateLocalClient(platform));
 
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client, "condition");
+    XlaBuilder builder("condition");
     auto state = builder.Parameter(0, while_result_shape, "state");
     auto iteration = builder.GetTupleElement(state, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client, "body");
+    XlaBuilder builder("body");
     auto state = builder.Parameter(0, while_result_shape, "state");
     auto matrix = builder.GetTupleElement(state, 1);
     auto next_iteration = builder.Add(builder.GetTupleElement(state, 0),
@@ -274,7 +272,7 @@ XLA_TEST_F(HloProfileTest,
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  ComputationBuilder builder(client, TestName());
+  XlaBuilder builder(TestName());
   auto initial_while_state =
       builder.Tuple({builder.ConstantR0<int32>(0),
                      builder.Parameter(0, matrix_shape, "initial_value")});
@@ -294,7 +292,8 @@ XLA_TEST_F(HloProfileTest,
   auto while_body_profile_start =
       std::find_if(profile_output_lines.begin(), profile_output_lines.end(),
                    [](tensorflow::StringPiece s) {
-                     return s.starts_with("Execution profile for body");
+                     return tensorflow::str_util::StartsWith(
+                         s, "Execution profile for body");
                    });
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.end());
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 92b2b1ee778f8b0f8104e7d7ff27a5c11db59768..a9f2915b458b1816926de727b3da21982d06f6c0 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
@@ -25,7 +29,38 @@ GTEST_API_ int main(int argc, char** argv) {
     return 2;
   }
 
+  // If the --benchmarks flag is passed in then only run the benchmarks, not the
+  // tests.
+  for (int i = 1; i < argc; i++) {
+    tensorflow::StringPiece arg(argv[i]);
+    if (arg == "--benchmarks" ||
+        tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
+      const char* pattern = nullptr;
+      if (tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
+        pattern = argv[i] + strlen("--benchmarks=");
+      } else {
+        // Handle flag of the form '--benchmarks foo' (no '=').
+        if (i + 1 >= argc ||
+            tensorflow::str_util::StartsWith(argv[i + 1], "--")) {
+          LOG(ERROR) << "--benchmarks flag requires an argument.";
+          return 2;
+        }
+        pattern = argv[i + 1];
+      }
+      // Unfortunately Google's internal benchmark infrastructure has a
+      // different API than Tensorflow's.
+#if defined(PLATFORM_GOOGLE)
+      base::SetFlag(&FLAGS_benchmarks, pattern);
+      RunSpecifiedBenchmarks();
+#else
+      tensorflow::testing::Benchmark::Run(pattern);
+#endif
+      return 0;
+    }
+  }
+
   testing::InitGoogleTest(&argc, argv);
+
   if (argc > 1) {
     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
     return 2;
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index 6fa4c48e11d1102367b21bc21d4734466495ef0e..44f874cd2ae8e6f65dc282b8675f195ec9c09415 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -38,7 +38,7 @@ namespace xla {
 
 StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadPath(
     tensorflow::StringPiece path) {
-  CHECK(!path.ends_with(".gz"))
+  CHECK(!tensorflow::str_util::EndsWith(path, ".gz"))
       << "TextLiteralReader no longer supports reading .gz files";
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   Status s =
@@ -115,7 +115,7 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
     tensorflow::StringPiece value_string = pieces[1];
     tensorflow::str_util::RemoveWhitespaceContext(&coordinates_string);
     tensorflow::str_util::RemoveWhitespaceContext(&value_string);
-    if (!coordinates_string.Consume("(")) {
+    if (!tensorflow::str_util::ConsumePrefix(&coordinates_string, "(")) {
       return InvalidArgument(
           "expected '(' at the beginning of coordinates: \"%s\"", line.c_str());
     }
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 091fa0c3ec807a66449eca0bfbb141285b8eb532..0bc4045a5490319994b6cf24daf99fe856167507 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -75,6 +75,7 @@ cc_library(
     name = "replay_computation_library",
     srcs = ["replay_computation.cc"],
     deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -222,17 +223,3 @@ tf_cc_binary(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index 97aacf6b39f83978e732060817cd93ede81ca782..0fa4b98d0a41a1e7c681bb2302da3b752315867b 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -70,17 +70,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index cd2b843ad36013ae83818ecbc184fb823093f037..fdbfc0210ea63ac4350ba48ac3354d23c53c69a7 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -303,18 +303,14 @@ bool HloParser::ParseComputations() {
     // set the layouts to what the hlo text says.
     for (int p = 0; p < computation->num_parameters(); p++) {
       const Shape& param_shape = computation->parameter_instruction(p)->shape();
-      if (param_shape.has_layout()) {
-        module_->mutable_entry_computation_layout()
-            ->mutable_parameter_layout(p)
-            ->ResetLayout(param_shape.layout());
-      }
+      TF_CHECK_OK(module_->mutable_entry_computation_layout()
+                      ->mutable_parameter_layout(p)
+                      ->CopyLayoutFromShape(param_shape));
     }
     const Shape& result_shape = computation->root_instruction()->shape();
-    if (result_shape.has_layout()) {
-      module_->mutable_entry_computation_layout()
-          ->mutable_result_layout()
-          ->ResetLayout(result_shape.layout());
-    }
+    TF_CHECK_OK(module_->mutable_entry_computation_layout()
+                    ->mutable_result_layout()
+                    ->CopyLayoutFromShape(result_shape));
   }
 
   return true;
@@ -470,6 +466,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -1049,9 +1046,40 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           HloInstruction::CreateDot(shape, operands[0], operands[1], dnum));
       break;
     }
-    case HloOpcode::kGather:
-      // TODO(b/72710576): HLO parsing is not implemented for Gather.
-      return TokenError("HLO parsing is not implemented for Gather");
+    case HloOpcode::kGather: {
+      optional<std::vector<int64>> output_window_dims;
+      attrs["output_window_dims"] = {
+          /*required=*/true, AttrTy::kBracedInt64List, &output_window_dims};
+      optional<std::vector<int64>> elided_window_dims;
+      attrs["elided_window_dims"] = {
+          /*required=*/true, AttrTy::kBracedInt64List, &elided_window_dims};
+      optional<std::vector<int64>> gather_dims_to_operand_dims;
+      attrs["gather_dims_to_operand_dims"] = {/*required=*/true,
+                                              AttrTy::kBracedInt64List,
+                                              &gather_dims_to_operand_dims};
+      optional<int64> index_vector_dim;
+      attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
+                                   &index_vector_dim};
+      optional<std::vector<int64>> window_bounds;
+      attrs["window_bounds"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                                &window_bounds};
+
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+
+      GatherDimensionNumbers dim_numbers = HloInstruction::MakeGatherDimNumbers(
+          /*output_window_dims=*/*output_window_dims,
+          /*elided_window_dims=*/*elided_window_dims,
+          /*gather_dims_to_operand_dims=*/*gather_dims_to_operand_dims,
+          /*index_vector_dim=*/*index_vector_dim);
+
+      instruction = builder->AddInstruction(HloInstruction::CreateGather(
+          shape, /*operand=*/operands[0], /*gather_indices=*/operands[1],
+          dim_numbers, *window_bounds));
+      break;
+    }
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index b8c6b59204f897c7dc07b846370b5b776a19a808..adc8b1d620eb65fdca19072831360b71847abf9e 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -716,6 +717,18 @@ ENTRY %sparse_f32_r1 () -> f32[9] {
   ROOT %foo = f32[9]sparse{10} constant(f32[9]{1: 2, 3: 4, 5: 6})
 }
 
+)"
+},
+{
+"gather",
+R"(HloModule StringifyGather
+
+ENTRY %Gather (input_tensor: f32[50,49,48,47,46], gather_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] {
+  %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0)
+  %gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
+  ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
+}
+
 )"
 },
   });
@@ -860,6 +873,18 @@ ENTRY dot {
   ROOT dot = f32[2,3]{1,0} dot(a, b), lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 
+)"
+},
+{
+"gather",
+R"(HloModule gather
+
+ENTRY Gather {
+  input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0)
+  gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
+  ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
+}
+
 )"
 },
   });
@@ -870,7 +895,7 @@ class HloParserTest : public ::testing::Test,
                       public ::testing::WithParamInterface<TestData> {
  protected:
   static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-    EXPECT_TRUE(StringPiece(s).contains(expected))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(s, expected))
         << "'" << s << "' does not contain '" << expected << "'";
   }
 
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index eda5effbb92db92c9317a956497a00c0ec15c27c..62a353ad09af009e4abf47664a5c5f7bd70a049e 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -66,6 +67,7 @@ struct Options {
   bool use_fake_data = false;
   bool print_result = true;
   int num_runs = 1;
+  bool xla_hlo_profile_last_run = false;
 };
 
 // Invokes the given computation passing arbitrary data for every (unbound)
@@ -122,16 +124,21 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   std::unique_ptr<Literal> result;
   for (int i = 0; i < opts.num_runs; ++i) {
     ExecutionProfile profile;
+    ExecutionOptions execution_options = CreateDefaultExecutionOptions();
+    if (opts.xla_hlo_profile_last_run && i == opts.num_runs - 1) {
+      execution_options.mutable_debug_options()->set_xla_hlo_profile(true);
+    }
+
     if (opts.print_result) {
-      TF_ASSIGN_OR_RETURN(result, client->ExecuteAndTransfer(
-                                      computation, execute_arguments,
-                                      /*execution_options=*/nullptr, &profile));
+      TF_ASSIGN_OR_RETURN(
+          result, client->ExecuteAndTransfer(computation, execute_arguments,
+                                             &execution_options, &profile));
     } else {
       // If we're not printing the result, execute the computation but don't
       // bother retrieving the result.  This can be a significant speedup.
       TF_RETURN_IF_ERROR(client
                              ->Execute(computation, execute_arguments,
-                                       /*execution_options=*/nullptr, &profile)
+                                       &execution_options, &profile)
                              .status());
     }
     LOG(INFO) << "Execution took "
@@ -191,6 +198,9 @@ int main(int argc, char** argv) {
                        "Number of times to run each computation"),
       tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
+      tensorflow::Flag(
+          "xla_hlo_profile_last_run", &opts.xla_hlo_profile_last_run,
+          "Pass --xla_hlo_profile the last time we run the computation."),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 9fa4297523bab0748863479be52dff1b7b523a8b..b645acb700b0f168112a40c9c72b4669435f717d 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -46,4 +46,10 @@ using ::Eigen::half;
 
 }  // namespace xla
 
+// Alias namespace ::stream_executor as ::xla::se.
+namespace stream_executor {}
+namespace xla {
+namespace se = ::stream_executor;
+}  // namespace xla
+
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 1f0c626bbb2d64ef4e67c9ec51485ae96ae73d04..e43498e381b8e63543e2ddda08ca7c0df91817e4 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/util.h"
 
-#include <numeric>
 #include <stdarg.h>
 #include <numeric>
 
@@ -244,8 +243,8 @@ string HumanReadableNumOps(double flops, double nanoseconds,
       static_cast<int64>(nano_flops * 1e9));
   tensorflow::StringPiece sp(throughput);
   // Use the more common "G(FLOPS)", rather than "B(FLOPS)"
-  if (sp.ends_with("B") ||  // Ends in 'B', ignoring case
-      sp.ends_with("b")) {
+  if (tensorflow::str_util::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case
+      tensorflow::str_util::EndsWith(sp, "b")) {
     *throughput.rbegin() = 'G';
   }
   throughput += tensorflow::strings::StrCat(op_prefix, "OP/s");
@@ -292,7 +291,8 @@ void LogLines(int sev, tensorflow::StringPiece text, const char* fname,
 }
 
 int64 Product(tensorflow::gtl::ArraySlice<int64> xs) {
-  return std::accumulate(xs.begin(), xs.end(), 1, std::multiplies<int64>());
+  return std::accumulate(xs.begin(), xs.end(), static_cast<int64>(1),
+                         std::multiplies<int64>());
 }
 
 std::vector<std::pair<int64, int64>> CommonFactors(
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 46ec7af54290f40dfac1e4627801eab4dabb8aa5..be33bd6dd1304fa8fc6e5aed1d4c4d65bf97e692 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #include "tensorflow/compiler/xla/status.h"
@@ -427,29 +428,37 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 string SanitizeFileName(string file_name);
 
 template <typename Container, typename Predicate>
-bool c_all_of(Container container, Predicate predicate) {
-  return std::all_of(std::begin(container), std::end(container), predicate);
+bool c_all_of(const Container& container, Predicate&& predicate) {
+  return std::all_of(std::begin(container), std::end(container),
+                     std::forward<Predicate>(predicate));
+}
+
+template <typename Container, typename Predicate>
+bool c_any_of(const Container& container, Predicate&& predicate) {
+  return std::any_of(std::begin(container), std::end(container),
+                     std::forward<Predicate>(predicate));
 }
 
 template <typename InputContainer, typename OutputIterator,
           typename UnaryOperation>
-OutputIterator c_transform(InputContainer input_container,
+OutputIterator c_transform(const InputContainer& input_container,
                            OutputIterator output_iterator,
-                           UnaryOperation unary_op) {
+                           UnaryOperation&& unary_op) {
   return std::transform(std::begin(input_container), std::end(input_container),
-                        output_iterator, unary_op);
+                        output_iterator,
+                        std::forward<UnaryOperation>(unary_op));
 }
 
 template <class InputContainer, class OutputIterator, class UnaryPredicate>
-OutputIterator c_copy_if(InputContainer input_container,
+OutputIterator c_copy_if(const InputContainer& input_container,
                          OutputIterator output_iterator,
-                         UnaryPredicate predicate) {
+                         UnaryPredicate&& predicate) {
   return std::copy_if(std::begin(input_container), std::end(input_container),
-                      output_iterator, predicate);
+                      output_iterator, std::forward<UnaryPredicate>(predicate));
 }
 
 template <class InputContainer, class OutputIterator>
-OutputIterator c_copy(InputContainer input_container,
+OutputIterator c_copy(const InputContainer& input_container,
                       OutputIterator output_iterator) {
   return std::copy(std::begin(input_container), std::end(input_container),
                    output_iterator);
@@ -461,12 +470,13 @@ void c_sort(InputContainer& input_container) {
 }
 
 template <class InputContainer, class Comparator>
-void c_sort(InputContainer& input_container, Comparator comparator) {
-  std::sort(std::begin(input_container), std::end(input_container), comparator);
+void c_sort(InputContainer& input_container, Comparator&& comparator) {
+  std::sort(std::begin(input_container), std::end(input_container),
+            std::forward<Comparator>(comparator));
 }
 
 template <typename Sequence, typename T>
-bool c_binary_search(Sequence& sequence, T&& value) {
+bool c_binary_search(const Sequence& sequence, T&& value) {
   return std::binary_search(std::begin(sequence), std::end(sequence),
                             std::forward<T>(value));
 }
@@ -480,6 +490,54 @@ template <typename C>
 auto c_adjacent_find(const C& c) -> decltype(std::begin(c)) {
   return std::adjacent_find(std::begin(c), std::end(c));
 }
+
+template <typename C, typename Pred>
+auto c_find_if(const C& c, Pred&& pred) -> decltype(std::begin(c)) {
+  return std::find_if(std::begin(c), std::end(c), std::forward<Pred>(pred));
+}
+
+template <typename C, typename Value>
+auto c_find(const C& c, Value&& value) -> decltype(std::begin(c)) {
+  return std::find(std::begin(c), std::end(c), std::forward<Value>(value));
+}
+
+template <typename Sequence>
+void c_reverse(Sequence& sequence) {
+  std::reverse(std::begin(sequence), std::end(sequence));
+}
+
+template <typename Sequence, typename T, typename BinaryOp>
+typename std::decay<T>::type c_accumulate(const Sequence& sequence, T&& init,
+                                          BinaryOp&& binary_op) {
+  return std::accumulate(std::begin(sequence), std::end(sequence),
+                         std::forward<T>(init),
+                         std::forward<BinaryOp>(binary_op));
+}
+
+template <typename C, typename Value>
+int64 FindIndex(const C& c, Value&& value) {
+  auto it = c_find(c, std::forward<Value>(value));
+  return std::distance(c.begin(), it);
+}
+
+// Returns true if `x` fits in 32-bits.
+template <typename T>
+bool IsInt32(T x) {
+  // Following conversion rules: "the value is unchanged if it can be
+  // represented in the destination type (and bit-field width); otherwise, the
+  // value is implementation-defined."
+  return static_cast<int32>(x) == x;
+}
+
+template <typename T>
+Status EraseElementFromVector(std::vector<T>* container, const T& value) {
+  // c_find returns a const_iterator which does not seem to work on gcc 4.8.4,
+  // and this breaks the ubuntu/xla_gpu build bot.
+  auto it = std::find(container->begin(), container->end(), value);
+  TF_RET_CHECK(it != container->end());
+  container->erase(it);
+  return Status::OK();
+}
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 93284b80f9e1f82c4b18dc7388754d5c01a7740c..f11123ca24849af1d9c4fd49809a986eb7202bd5 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -199,6 +199,9 @@ bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) {
 int64 DilatedBound(int64 bound, int64 dilation) {
   CHECK_GE(bound, 0);
   CHECK_GE(dilation, 1);
+  if (bound == 0) {
+    return 0;
+  }
 
   // Suppose the array has three entries 123 and the dilation factor is 4. Then
   // the dilated array has 9 entries 1xxx2xxx3. Here, each original entry except
@@ -212,7 +215,7 @@ int64 StridedBound(int64 bound, int64 window_size, int64 stride) {
   CHECK_GE(bound, 0);
   CHECK_GE(stride, 1);
 
-  if (window_size > bound) {
+  if (bound == 0 || window_size > bound) {
     return 0;
   }
 
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 6b136d333bbf079efd314833f46fe3b98743fbac..1439f1bcc5cec39203a7cb4b1f8604e7349382c6 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -6,7 +6,9 @@ load("//tensorflow/core:platform/default/build_config_root.bzl",
      "if_static")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
+def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
+  if kwargs.get('use_grpc_plugin'):
+    kwargs['use_grpc_namespace'] = True
   cc_proto_library(name=name,
                    srcs=srcs,
                    deps=deps,
@@ -16,6 +18,13 @@ def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
                    ),
                    protoc="@protobuf_archive//:protoc",
                    testonly=testonly,
-                   visibility=visibility,)
+                   visibility=visibility,
+                   **kwargs)
+
+def xla_py_grpc_library(**kwargs):
+  # Note: we don't currently define any special targets for Python GRPC in OSS.
+  _ignore = kwargs
+  pass
+
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 56162ab44e2e0e3e4478fe631888f243332dc1d8..f619b8dc24038af64a27fc0565c74447ca9d09cf 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -16,6 +16,7 @@ limitations under the License.
 syntax = "proto3";
 
 import "tensorflow/compiler/xla/xla_data.proto";
+import "tensorflow/compiler/xla/service/hlo.proto";
 import "tensorflow/compiler/xla/service/session.proto";
 
 package xla;
@@ -188,6 +189,12 @@ message DebugOptions {
   // directory.
   string xla_dump_per_pass_hlo_proto_to = 96;
 
+  // Generate calls to MKL-DNN in the CPU backend.
+  bool xla_cpu_use_mkl_dnn = 97;
+
+  // Maximum kernel unroll factor for the GPU backend.
+  int32 xla_gpu_max_kernel_unroll_factor = 98;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -298,6 +305,11 @@ message ComputationStatsRequest {
   DebugOptions debug_options = 2;
 }
 
+message ComputationGraphStatsRequest {
+  HloModuleProto computation = 1;
+  DebugOptions debug_options = 2;
+}
+
 message ComputationStatsResponse {
   ComputationStats stats = 1;
 }
@@ -342,10 +354,22 @@ message ExecuteRequest {
   ExecutionOptions execution_options = 5;
 }
 
+message ExecuteGraphRequest {
+  HloModuleProto computation = 1;
+  repeated GlobalDataHandle arguments = 2;
+
+  // Options that affect how XLA compiles and runs code to service this request.
+  ExecutionOptions execution_options = 3;
+}
+
 message ExecuteParallelRequest {
   repeated ExecuteRequest requests = 1;
 }
 
+message ExecuteGraphParallelRequest {
+  repeated ExecuteGraphRequest requests = 1;
+}
+
 message ExecuteResponse {
   GlobalDataHandle output = 1;
   ExecutionProfile profile = 2;
@@ -396,6 +420,11 @@ message ComputeConstantRequest {
   repeated LiteralProto parameters = 4;
 }
 
+message ComputeConstantGraphRequest {
+  HloModuleProto computation = 1;
+  Layout output_layout = 2;
+}
+
 message ComputeConstantResponse {
   // A LiteralProto is returned directly for this request, instead of a
   // ComputationDataHandle.
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 28620c3b86349281573eaf57d2838bee1488d838..d23f9e5918f54c4f385f3b16fd84bbee51ed5a95 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -355,17 +355,19 @@ message WindowDimension {
   // positions of the window in this dimension.
   int64 stride = 2;
 
-  // If positive, means the amount of padding with zeroes to add to the base
-  // area at the low end of this dimension; if negative, its negative means the
-  // number of elements removed from the low end of this dimension. For example,
-  // in the horizontal dimension of a rectangle, this would be the number of
-  // zeroes to pad on the left, given that indices increase when going right.
+  // If positive, means the amount of padding to add to the base area at the low
+  // end of this dimension; if negative, its negative means the number of
+  // elements removed from the low end of this dimension. For example, in the
+  // horizontal dimension of a rectangle, this would be the number of padding
+  // values to pad on the left, given that indices increase when going right.
+  // The actual padding value depends upon the context. Convolution pads with
+  // zeros. ReduceWindow and SelectAndScatter pads with the reduce function's
+  // init value.
   int64 padding_low = 3;
 
-  // As padding_low, but on the high end of this dimension. For
-  // example, in the horizontal dimension of a rectangle, this would
-  // be the number of zeroes to pad on the right, given that indices
-  // increase when going right.
+  // As padding_low, but on the high end of this dimension. For example, in the
+  // horizontal dimension of a rectangle, this would be the number of values to
+  // pad on the right, given that indices increase when going right.
   int64 padding_high = 4;
 
   // Dilation factor of the sliding window in this dimension. A dilation factor
@@ -418,6 +420,10 @@ message GatherDimensionNumbers {
   // transforms the gather index looked up from the gather_indices tensor into
   // the starting index in the input space.
   repeated int64 gather_dims_to_operand_dims = 3;
+
+  // The dimension in the gather_indices input that contains the starting
+  // indices.
+  int64 index_vector_dim = 4;
 }
 
 // Operation requests that are all collected as a tagged union with a oneof
@@ -795,6 +801,9 @@ enum UnaryOperation {
 
   // Elementwise, extract real component of complex x.
   UNOP_IMAG = 16;
+
+  // Elementwise, computes clz(x).
+  UNOP_CLZ = 17;
 }
 
 message UnaryOpRequest {
diff --git a/tensorflow/compiler/xla/xlalogo.png b/tensorflow/compiler/xla/xlalogo.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a0a295953d0c47b23718197dcbab1677b337455
Binary files /dev/null and b/tensorflow/compiler/xla/xlalogo.png differ
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index bab37e8906e5c648acdc1556da7e5f4601776ff5..abdbdb4cd22ff38a0fae89af10c600a178d9a3d4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -8,6 +8,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
     name = "contrib_py",
@@ -24,22 +25,24 @@ py_library(
         "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/boosted_trees:init_py",
+        "//tensorflow/contrib/checkpoint/python:checkpoint",
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
-        "//tensorflow/contrib/coder:coder_ops_py",
+        "//tensorflow/contrib/coder:coder_py",
         "//tensorflow/contrib/compiler:compiler_py",
+        "//tensorflow/contrib/constrained_optimization",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
         "//tensorflow/contrib/data",
+        "//tensorflow/contrib/distribute:distribute",
         "//tensorflow/contrib/deprecated:deprecated_py",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/contrib/estimator:estimator_py",
         "//tensorflow/contrib/factorization:factorization_py",
         "//tensorflow/contrib/feature_column:feature_column_py",
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/fused_conv:fused_conv_py",
         "//tensorflow/contrib/gan",
@@ -51,7 +54,6 @@ py_library(
         "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
-        "//tensorflow/contrib/kafka",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
         "//tensorflow/contrib/kfac",
@@ -63,7 +65,6 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
-        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -75,16 +76,20 @@ py_library(
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
+        "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
-        "//tensorflow/contrib/py2tf",
+        "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
+        "//tensorflow/contrib/recurrent:recurrent_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/rpc",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
         "//tensorflow/contrib/signal:signal_py",
@@ -110,6 +115,15 @@ py_library(
         "//tensorflow/python:util",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
+    ]) + select({
+        "//tensorflow:with_kafka_support_windows_override": [],
+        "//tensorflow:with_kafka_support": [
+            "//tensorflow/contrib/kafka",
+        ],
+        "//conditions:default": [],
+    }) + if_not_windows([
+        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
+        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
     ]),
 )
 
@@ -119,7 +133,6 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
         "//tensorflow/contrib/coder:all_kernels",
-        "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_kernels",
         "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
@@ -133,7 +146,13 @@ cc_library(
         "//tensorflow/contrib/text:all_kernels",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_cuda([
         "//tensorflow/contrib/nccl:nccl_kernels",
-    ]),
+    ]) + select({
+        "//tensorflow:with_kafka_support_windows_override": [],
+        "//tensorflow:with_kafka_support": [
+            "//tensorflow/contrib/kafka:dataset_kernels",
+        ],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -142,12 +161,10 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/coder:all_ops",
-        "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_ops_op_lib",
         "//tensorflow/contrib/data:dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
-        "//tensorflow/contrib/kafka:kafka_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
@@ -158,17 +175,11 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
         "//tensorflow/contrib/text:all_ops",
         "//tensorflow/contrib/tpu:all_ops",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
+    ] + select({
+        "//tensorflow:with_kafka_support_windows_override": [],
+        "//tensorflow:with_kafka_support": [
+            "//tensorflow/contrib/kafka:dataset_ops_op_lib",
         ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+        "//conditions:default": [],
+    }),
 )
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 4f6f539027b040de7554d09fe9118ff97aa006f8..9f5459f41da3e5a13286f7002e4b519978bc189b 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -1,3 +1,4 @@
+# pylint: disable=g-import-not-at-top
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,18 +19,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
+from tensorflow.contrib import checkpoint
 from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
 from tensorflow.contrib import compiler
+from tensorflow.contrib import constrained_optimization
 from tensorflow.contrib import copy_graph
 from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
 from tensorflow.contrib import data
 from tensorflow.contrib import deprecated
+from tensorflow.contrib import distribute
 from tensorflow.contrib import distributions
 from tensorflow.contrib import estimator
 from tensorflow.contrib import factorization
@@ -60,11 +66,13 @@ from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
+from tensorflow.contrib import proto
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
+from tensorflow.contrib import rpc
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
 from tensorflow.contrib import signal
@@ -83,8 +91,11 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-from tensorflow.contrib.lite.python import lite
+if os.name != "nt":
+  from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
+from tensorflow.contrib.recurrent.python import recurrent_api as recurrent
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
 from tensorflow.contrib.summary import summary
@@ -92,6 +103,7 @@ from tensorflow.contrib.summary import summary
 from tensorflow.python.util.lazy_loader import LazyLoader
 ffmpeg = LazyLoader("ffmpeg", globals(),
                     "tensorflow.contrib.ffmpeg")
+del os
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index 8dff93b4f825277dcf0a64aa3b96bd809d36e1e9..62d1b1cf079d04d50e4899cfd9ba1d405ee1efb9 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -45,16 +45,3 @@ tf_py_test(
         "//tensorflow/python:state_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 6658f0d9c13f6db17b25354cde2593d57f104f17..159d985db5c48f8fe1a26350255f8d8f68482473 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -18,10 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import math
-import re
 
 from tensorflow.contrib import nccl
+from tensorflow.python.framework import device as device_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -38,16 +39,15 @@ def _flatten_tensors(tensors):
     shape: the original shape of each element of input tensors
 
   Raises:
-    ValueError: tensors are empty or non-isomorphic.
+    ValueError: tensors are empty or non-isomorphic or have unknown shape.
   """
   if not tensors:
     raise ValueError("tensors cannot be empty")
   shape = tensors[0].shape
   for tensor in tensors:
     shape = shape.merge_with(tensor.shape)
-  if shape.ndims is None:
-    raise ValueError("At least one of the tensors in 'tensors' must have "
-                     "statically known rank.")
+  if not shape.is_fully_defined():
+    raise ValueError("Tensors must have statically known shape.")
   if len(shape) != 1:
     reshaped = []
     for t in tensors:
@@ -660,21 +660,20 @@ def _split_by_task(devices, values):
   num_devices = len(devices)
   if num_devices != len(values):
     raise ValueError("len(devices) must equal len(values)")
-  pattern = re.compile(r"/task:(\d+)/")
-  per_task_devices = []
-  per_task_values = []
+  per_task_devices = collections.OrderedDict()
+  per_task_values = collections.OrderedDict()
   for d in range(num_devices):
-    m = pattern.search(devices[d])
-    if m:
-      index = int(m.group(1))
-      while index >= len(per_task_devices):
-        per_task_devices.append([])
-        per_task_values.append([])
-      per_task_devices[index].append(devices[d])
-      per_task_values[index].append(values[d])
-    else:
+    d_spec = device_lib.DeviceSpec.from_string(devices[d])
+    if not hasattr(d_spec, "task") or d_spec.task is None:
       assert False, "failed to parse device %s" % devices[d]
-  return (per_task_devices, per_task_values)
+    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
+    if index not in per_task_devices:
+      per_task_devices[index] = []
+      per_task_values[index] = []
+    per_task_devices[index].append(devices[d])
+    per_task_values[index].append(values[d])
+
+  return (list(per_task_devices.values()), list(per_task_values.values()))
 
 
 def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
index 47bab0a3670a90644972b2c961954a3036b8ecba..b3f5d92259df8475b205110dd3f0cee1cb5bde6f 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
@@ -36,6 +36,12 @@ from tensorflow.python.platform import tf_logging
 
 class AllReduceTest(test_util.TensorFlowTestCase):
 
+  def testFlattenTensorsShapesDefined(self):
+    x = array_ops.placeholder(types_pb2.DT_FLOAT, [None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "must have statically known shape"):
+      ar._flatten_tensors([x, x])
+
   def testRingPermutations(self):
     # 0 devices
     pred_by_c_d, rank_by_c_d = ar._ring_permutations(1, 0, [])
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index 4bff3c27d22c4550747a651a59909bdef80e8285..60306ebdc6cddb04e8807bfd495fa92a56e55ecd 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -38,20 +38,6 @@ cc_library(
     alwayslink = 1,
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # JAR with Java bindings to TF.
 android_library(
     name = "android_tensorflow_inference_java",
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc
index 380a652435ad089f46f3ca80e4fd43097fd96e10..513d519eabbd54f46fde9ec0f004247c02277732 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.cc
+++ b/tensorflow/contrib/android/asset_manager_filesystem.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 
 namespace tensorflow {
 namespace {
@@ -228,9 +229,8 @@ string AssetManagerFileSystem::NormalizeDirectoryPath(const string& fname) {
 }
 
 string AssetManagerFileSystem::RemoveAssetPrefix(const string& name) {
-  string output(name);
-  StringPiece piece(output);
-  piece.Consume(prefix_);
+  StringPiece piece(name);
+  str_util::ConsumePrefix(&piece, prefix_);
   return piece.ToString();
 }
 
@@ -243,6 +243,11 @@ bool AssetManagerFileSystem::DirectoryExists(const std::string& fname) {
   return AAssetDir_getNextFileName(dir.get()) != NULL;
 }
 
+Status AssetManagerFileSystem::GetMatchingPaths(const string& pattern,
+                                                std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status AssetManagerFileSystem::NewWritableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   return errors::Unimplemented("Asset storage is read only.");
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.h b/tensorflow/contrib/android/asset_manager_filesystem.h
index 665304b5eef1f8a3633c8c522259e20d744b1808..a87ff42ae217c429ecf5d2458b88b3431551ad97 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.h
+++ b/tensorflow/contrib/android/asset_manager_filesystem.h
@@ -66,6 +66,9 @@ class AssetManagerFileSystem : public FileSystem {
   Status DeleteDir(const string& d) override;
   Status RenameFile(const string& s, const string& t) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
  private:
   string RemoveAssetPrefix(const string& name);
 
diff --git a/tensorflow/contrib/android/cmake/CMakeLists.txt b/tensorflow/contrib/android/cmake/CMakeLists.txt
index a115d1610e2334a6626f29674f3dd195e3a3c648..ecf1a103d2981f409a4598d762fb26100217f779 100644
--- a/tensorflow/contrib/android/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/android/cmake/CMakeLists.txt
@@ -75,7 +75,6 @@ target_link_libraries(tensorflow_inference
 include_directories(
     ${PREBUILT_DIR}/proto
     ${PREBUILT_DIR}/protobuf/include
-    ${PREBUILT_DIR}/nsync/public
     ${TENSORFLOW_ROOT_DIR}/tensorflow/contrib/makefile/downloads/eigen
     ${TENSORFLOW_ROOT_DIR}
     ${CMAKE_CURRENT_SOURCE_DIR}/..)
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/autograph/BUILD
similarity index 75%
rename from tensorflow/contrib/py2tf/BUILD
rename to tensorflow/contrib/autograph/BUILD
index d91220f6ddb859ff52d4e5853948cb667981009b..30dd846893c30b9205972bd5216cc1871ab03d76 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/autograph/BUILD
@@ -15,16 +15,16 @@ filegroup(
 )
 
 py_library(
-    name = "py2tf",
+    name = "autograph",
     srcs = [
         "__init__.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/impl",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/impl",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ba99c396fc1c8ee1e12fbb4fe0293ee52ed9bc9
--- /dev/null
+++ b/tensorflow/contrib/autograph/README.md
@@ -0,0 +1,122 @@
+# AutoGraph
+
+IMPORTANT: AutoGraph is pre-alpha, under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback!
+
+AutoGraph is a Python to TensorFlow compiler.
+
+With AutoGraph, you can write [Eager style](https://www.tensorflow.org/programmers_guide/eager) code in a concise manner, and run it as a TensorFlow graph. AutoGraph uses source code transformation and partial evaluation to generate Python code that builds an equivalent TensorFlow subgraph. The result is code that behaves like ops and can be freely combined with other TensorFlow ops.
+
+For example, this Python function:
+
+```
+def f(x):
+  if x < 0:
+    x = -x
+  return x
+```
+
+would be converted to this:
+
+```
+def graph_mode_f(x):
+  with tf.name_scope('f'):
+
+    def if_true():
+      with tf.name_scope('if_true'):
+        x_1, = x,
+        x_1 = tf.negative(x_1)
+        return x_1,
+
+    def if_false():
+      with tf.name_scope('if_false'):
+        x_1, = x,
+        return x_1,
+    x = ag__.utils.run_cond(tf.greater(x, 0), if_true, if_false)
+    return x
+```
+
+so you can use it like an op:
+
+```
+with tf.Graph().as_default():
+  x = tf.constant(-1.0)
+
+  converted_f = autograph.to_graph(f)
+  y = converted_f(x)
+
+  with tf.Session() as sess:
+    print(sess.run(y))
+    # Output: 1
+```
+
+# Getting started
+
+Use AutoGraph in one of the following ways, described below:
+
+ 1. Annotations (simpler)
+ 2. Functional API (more flexible)
+
+To get started, install the latest nightly TensorFlow build:
+
+```shell
+pip install -U tf-nightly
+```
+
+Then import the `autograph` module from `tf.contrib`:
+
+```
+from tensorflow.contrib import autograph as ag
+```
+
+### Interactive demo notebooks
+
+For more extensive examples, check out these interactive notebooks:
+
+ * [RNN trained using Keras and Estimators](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb)
+ * [Demo from the TF Dev Summit 2018](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb)
+
+## Using with annotations
+
+Annotating a function or class with `@convert` converts it in place:
+
+```
+@ag.convert()
+def f(x):
+  if x < 0:
+    x = -x
+  return x
+```
+
+... so that it always outputs TensorFlow code:
+
+```
+with tf.Graph().as_default():
+  x = tf.constant(-1)
+
+  y = f(x)
+
+  with tf.Session() as sess:
+    print(sess.run(y))
+    # Output: 1
+```
+
+## Using the functional API
+
+The functional API allows you to convert an existing function, class or object after it was defined:
+
+```
+converted_f = ag.to_graph(f)
+
+print(converted_f(tf.constant(-1)))
+# Output: Tensor
+
+print(f(-1))
+# Output: 1
+```
+
+You can use the functional API to inspect the generated code as well:
+
+```
+print(ag.to_code(f))
+# Output: <Python and TensorFlow code>
+```
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/autograph/__init__.py
similarity index 57%
rename from tensorflow/contrib/py2tf/__init__.py
rename to tensorflow/contrib/autograph/__init__.py
index 379fa7fd5c2a22b5b16a21cca8c2ea8afdcaeefa..3386c4eca4b93e850f6fe3c6239d29c61d787ece 100644
--- a/tensorflow/contrib/py2tf/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Py2TF compiles Python code into equivalent TensorFlow code.
+"""Autograph compiles Python code into equivalent TensorFlow code.
 
 Equivalent here means that they have the same effect when executed.
 """
@@ -21,16 +21,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.impl.api import convert
-from tensorflow.contrib.py2tf.impl.api import graph_ready
-from tensorflow.contrib.py2tf.impl.api import to_code
-from tensorflow.contrib.py2tf.impl.api import to_graph
-from tensorflow.contrib.py2tf.pyct.transformer import PyFlowParseError
+# TODO(mdan): Bring only the relevant symbols to the top level.
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl.api import convert
+from tensorflow.contrib.autograph.impl.api import converted_call
+from tensorflow.contrib.autograph.impl.api import do_not_convert
+from tensorflow.contrib.autograph.impl.api import RunMode
+from tensorflow.contrib.autograph.impl.api import to_code
+from tensorflow.contrib.autograph.impl.api import to_graph
+from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'to_graph', 'to_code', 'convert', 'graph_ready', 'utils', 'PyFlowParseError'
+    'utils', 'convert', 'converted_call', 'do_not_convert', 'RunMode',
+    'to_code', 'to_graph', 'AutographParseError'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
similarity index 71%
rename from tensorflow/contrib/py2tf/converters/BUILD
rename to tensorflow/contrib/autograph/converters/BUILD
index e9a96ec8d1dfc01ff6bc3b1fcaaef8e9b71a14a8..8f9bffa55e44e4942bb3845945b3d440c7957cc9 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -24,10 +24,13 @@ py_library(
         "continue_statements.py",
         "control_flow.py",
         "decorators.py",
-        "for_loops.py",
+        "ifexp.py",
         "list_comprehension.py",
+        "lists.py",
         "logical_expressions.py",
+        "name_scopes.py",
         "side_effect_guards.py",
+        "single_return.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -45,8 +48,10 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":converters",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/operators",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/pyct/static_analysis",
+        "//tensorflow/contrib/autograph/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
@@ -56,9 +61,9 @@ py_test(
     name = "asserts_test",
     srcs = ["asserts_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -69,7 +74,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -78,20 +82,22 @@ py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
 
 py_test(
     name = "call_trees_test",
+    size = "large",
     srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/impl",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -102,7 +108,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -113,7 +118,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -124,18 +128,16 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
 
 py_test(
-    name = "for_loops_test",
-    srcs = ["for_loops_test.py"],
-    srcs_version = "PY2AND3",
+    name = "name_scopes_test",
+    srcs = ["name_scopes_test.py"],
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -146,7 +148,16 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "lists_test",
+    srcs = ["lists_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -157,7 +168,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -166,9 +176,35 @@ py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        # TODO(mdan): Fix.
+        "flaky",
+        "notap",
+    ],
+    deps = [
+        ":test_lib",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "single_return_test",
+    srcs = ["single_return_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "ifexp_test",
+    srcs = ["ifexp_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/converters/__init__.py b/tensorflow/contrib/autograph/converters/__init__.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/__init__.py
rename to tensorflow/contrib/autograph/converters/__init__.py
index ca10896ee5c6c23d9b20ff23add9945de68e5bf9..e4e8eda42f655e204310eaa9defdd5c90bf06e15 100644
--- a/tensorflow/contrib/py2tf/converters/__init__.py
+++ b/tensorflow/contrib/autograph/converters/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Code converters used by Py2TF."""
+"""Code converters used by Autograph."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/autograph/converters/asserts.py
similarity index 86%
rename from tensorflow/contrib/py2tf/converters/asserts.py
rename to tensorflow/contrib/autograph/converters/asserts.py
index 5b9b8e772bed82df2429fd6cb94dbf7b565e22b3..2d9e2c58e3afcef5c18f477a7a29e518e98e672e 100644
--- a/tensorflow/contrib/py2tf/converters/asserts.py
+++ b/tensorflow/contrib/autograph/converters/asserts.py
@@ -20,15 +20,13 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class AssertsTransformer(transformer.Base):
   """Transforms Print nodes to Call so they can be handled as functions."""
 
-  # pylint:disable=invalid-name
-
   def visit_Assert(self, node):
     self.generic_visit(node)
 
@@ -44,9 +42,7 @@ class AssertsTransformer(transformer.Base):
     elif isinstance(node.msg, gast.Str):
       return templates.replace(template, test=node.test, msg=node.msg)
     else:
-      raise NotImplementedError('Can only convert string messages for now.')
-
-  # pylint:enable=invalid-name
+      raise NotImplementedError('can only convert string messages for now.')
 
 
 def transform(node, context):
diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/autograph/converters/asserts_test.py
similarity index 90%
rename from tensorflow/contrib/py2tf/converters/asserts_test.py
rename to tensorflow/contrib/autograph/converters/asserts_test.py
index 6611f2777a93a7e819c8becfa06a09b27f4e6aaf..cc913febe8d0f411588af69b87ec52ce58f4469c 100644
--- a/tensorflow/contrib/py2tf/converters/asserts_test.py
+++ b/tensorflow/contrib/autograph/converters/asserts_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.converters import asserts
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import asserts
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
new file mode 100644
index 0000000000000000000000000000000000000000..91de82f0a78ccae711298d78364810dd099a5c38
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -0,0 +1,125 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Canonicalizes break statements by de-sugaring into a control boolean."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
+
+
+# Tags for local state.
+BREAK_USED = 'break_used'
+CONTROL_VAR_NAME = 'control_var_name'
+
+
+class BreakStatementTransformer(transformer.Base):
+  """Canonicalizes break statements into additional conditionals."""
+
+  def _track_body(self, nodes, break_var):
+    self.enter_local_scope()
+    self.set_local(CONTROL_VAR_NAME, break_var)
+    nodes = self.visit_block(nodes)
+    break_used = self.get_local(BREAK_USED, False)
+    self.exit_local_scope()
+    return nodes, break_used
+
+  def visit_Break(self, node):
+    self.set_local(BREAK_USED, True)
+    var_name = self.get_local(CONTROL_VAR_NAME)
+    # TODO(mdan): This will fail when expanded inside a top-level else block.
+    template = """
+      var_name = True
+      continue
+    """
+    return templates.replace(template, var_name=var_name)
+
+  def _guard_if_present(self, block, var_name):
+    """Prevents the block from executing if var_name is set."""
+    if not block:
+      return block
+    template = """
+        if not var_name:
+          block
+      """
+    node = templates.replace(
+        template,
+        var_name=var_name,
+        block=block)
+    return node
+
+  def visit_While(self, node):
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    break_var = self.context.namer.new_symbol('break__', scope.referenced)
+
+    node.test = self.visit(node.test)
+    node.body, break_used = self._track_body(node.body, break_var)
+    # A break in the else clause applies to the containing scope.
+    node.orelse = self.visit_block(node.orelse)
+
+    if break_used:
+      template = """
+        var_name = False
+        while test and not var_name:
+          body
+        else:
+          orelse
+      """
+      # Python's else clause only triggers if the loop exited cleanly (e.g.
+      # break did not trigger).
+      node = templates.replace(
+          template,
+          var_name=break_var,
+          test=node.test,
+          body=node.body,
+          orelse=self._guard_if_present(node.orelse, break_var))
+
+    return node
+
+  def visit_For(self, node):
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    break_var = self.context.namer.new_symbol('break__', scope.referenced)
+
+    node.target = self.visit(node.target)
+    node.iter = self.visit(node.iter)
+    node.body, break_used = self._track_body(node.body, break_var)
+    # A break in the else clause applies to the containing scope.
+    node.orelse = self.visit_block(node.orelse)
+
+    if break_used:
+      node.orelse = self._guard_if_present(node.orelse, break_var)
+      template = """
+        var_name = False
+        for_stmt
+      """
+      # Python's else clause only triggers if the loop exited cleanly (e.g.
+      # break did not trigger).
+      node = templates.replace(
+          template,
+          var_name=break_var,
+          for_stmt=node)
+      extra_cond = templates.replace_as_expression(
+          'not var_name', var_name=break_var)
+      anno.setanno(node[1], 'extra_cond', extra_cond)
+
+    return node
+
+
+def transform(node, context):
+  return BreakStatementTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/break_statements_test.py b/tensorflow/contrib/autograph/converters/break_statements_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1af59e9b5260fe0d3a3ef72c7a003dc451e230f3
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/break_statements_test.py
@@ -0,0 +1,147 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for break_statements module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.converters import break_statements
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.python.platform import test
+
+
+class BreakCanonicalizationTest(converter_test_base.TestCase):
+
+  def test_basic_while(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        if x % 2 == 0:
+          break
+        v.append(x)
+      return v
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual([], result.test_fn(0))
+      self.assertEqual([], result.test_fn(1))
+      self.assertEqual([3], result.test_fn(4))
+
+  def test_basic_for(self):
+
+    def test_fn(a):
+      v = []
+      for x in a:
+        x -= 1
+        if x % 2 == 0:
+          break
+        v.append(x)
+      return v
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      # The break is incompletely canonicalized. The loop will not interrupt,
+      # but the section following the break will be skipped.
+      self.assertEqual([], result.test_fn([]))
+      self.assertEqual([3, 3], result.test_fn([4, 4]))
+      self.assertEqual([3], result.test_fn([4, 5]))
+      self.assertEqual([3], result.test_fn([5, 4]))
+
+  def test_deeply_nested(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      w = []
+      while x > 0:
+        x -= 1
+        if x % 2 == 0:
+          if x % 3 != 0:
+            u.append(x)
+          else:
+            w.append(x)
+            break
+        v.append(x)
+      return v, u, w
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], [], []), result.test_fn(0))
+      self.assertEqual(([2, 1], [2], [0]), result.test_fn(3))
+      self.assertEqual(([10, 9, 8, 7], [10, 8], [6]), result.test_fn(11))
+
+  def test_nested_loops(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      while x > 0:
+        x -= 1
+        y = x
+        while y > 0:
+          y -= 1
+          if y % 2 == 0:
+            break
+          u.append(y)
+        if x == 0:
+          break
+        v.append(x)
+      return v, u
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], []), result.test_fn(0))
+      self.assertEqual(([1], []), result.test_fn(2))
+      self.assertEqual(([2, 1], [1]), result.test_fn(3))
+      self.assertEqual(([4, 3, 2, 1], [3, 1]), result.test_fn(5))
+
+  def test_loop_else(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      while x > 0:
+        x -= 1
+        y = x
+        while y > 1:
+          break
+        else:
+          u.append(y)
+          break
+        v.append(x)
+      return v, u
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], []), result.test_fn(0))
+      self.assertEqual(([], [1]), result.test_fn(2))
+      self.assertEqual(([2], [1]), result.test_fn(3))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions.py b/tensorflow/contrib/autograph/converters/builtin_functions.py
similarity index 76%
rename from tensorflow/contrib/py2tf/converters/builtin_functions.py
rename to tensorflow/contrib/autograph/converters/builtin_functions.py
index 2eb00f90575920ac948e799b0e97a9cfccb42fad..317711a866f731de1b497295a2752dee0eb544f5 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class BuiltinFunctionTransformer(transformer.Base):
@@ -34,25 +34,26 @@ class BuiltinFunctionTransformer(transformer.Base):
   def __init__(self, context):
     super(BuiltinFunctionTransformer, self).__init__(context)
 
-  # pylint:disable=invalid-name
-
-  def _convert_len(self, node):
+  def _convert_builtin(self, node):
     template = """
-      tf.shape(args)[0]
+      ag__.utils.dynamic_builtin(func, args)
     """
-    return templates.replace(template, args=node.args)[0].value
+    return templates.replace(template, func=node.func, args=node.args)[0].value
 
   def _convert_print(self, node):
     template = """
-      py2tf_utils.call_print(args)
+      ag__.utils.dynamic_print(args)
     """
     return templates.replace(template, args=node.args)[0].value
 
   def visit_Call(self, node):
     self.generic_visit(node)
     # TODO(mdan): This won't work if the function was hidden.
-    if isinstance(node.func, gast.Name) and node.func.id == 'len':
-      return self._convert_len(node)
+    # TODO(mdan): Rely on the live_val and use inspect_utils.is_builtin instead.
+    if (isinstance(node.func, gast.Name) and
+        node.func.id in ('len', 'range', 'xrange')):
+      return self._convert_builtin(node)
+    # Print needs to be handled separately because it can be read as statement.
     if isinstance(node.func, gast.Name) and node.func.id == 'print':
       return self._convert_print(node)
     return node
@@ -69,8 +70,6 @@ class BuiltinFunctionTransformer(transformer.Base):
     function_call = templates.replace(template, fname='print', args=args)[0]
     return self.visit(function_call)
 
-  # pylint:enable=invalid-name
-
 
 def transform(node, context):
   return BuiltinFunctionTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
similarity index 64%
rename from tensorflow/contrib/py2tf/converters/builtin_functions_test.py
rename to tensorflow/contrib/autograph/converters/builtin_functions_test.py
index b279ff77ef10b96586d3d68585adb0d5424afb90..30272409df322560b04ba75b3e1cb6f9ad5ff0af 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
@@ -22,12 +22,10 @@ import sys
 
 import six
 
-from tensorflow.contrib.py2tf.converters import builtin_functions
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import builtin_functions
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
@@ -47,7 +45,9 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
                          sess.run(
                              result.test_fn(constant_op.constant([0, 0, 0]))))
 
-  def test_print_with_op(self):
+        self.assertEqual(3, result.test_fn([0, 0, 0]))
+
+  def test_print(self):
 
     def test_fn(a):
       print(a)
@@ -55,14 +55,12 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {'print': print})
     node = builtin_functions.transform(node, self.ctx)
 
-    # Note: it's relevant not to include script_ops.py_func here, to verify
-    # that tf.Print is used.
-    with self.compiled(node, logging_ops.Print) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         try:
           out_capturer = six.StringIO()
           sys.stdout = out_capturer
-          result.test_fn('a')
+          result.test_fn(constant_op.constant('a'))
           sess.run(sess.graph.get_operations())
           self.assertEqual(out_capturer.getvalue(), 'a\n')
         finally:
@@ -70,41 +68,19 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
 
   def test_print_with_op_multiple_values(self):
 
-    def test_fn(a, b):
-      print(a, b)
-
-    node = self.parse_and_analyze(test_fn, {'print': print})
-    node = builtin_functions.transform(node, self.ctx)
-
-    # Note: it's relevant not to include script_ops.py_func here, to verify
-    # that tf.Print is used.
-    with self.compiled(node, logging_ops.Print) as result:
-      with self.test_session() as sess:
-        try:
-          out_capturer = six.StringIO()
-          sys.stdout = out_capturer
-          result.test_fn('a', 1)
-          sess.run(sess.graph.get_operations())
-          self.assertEqual(out_capturer.getvalue(), 'a 1\n')
-        finally:
-          sys.stdout = sys.__stdout__
-
-  def test_print_with_py_func(self):
-
     def test_fn(a, b, c):
       print(a, b, c)
 
     node = self.parse_and_analyze(test_fn, {'print': print})
     node = builtin_functions.transform(node, self.ctx)
 
-    # Note: it's relevant not to include logging_ops.Print here, to verify
-    # that py_func is used.
-    with self.compiled(node, script_ops.py_func) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         try:
           out_capturer = six.StringIO()
           sys.stdout = out_capturer
-          result.test_fn('a', 1, [2, 3])
+          result.test_fn(
+              constant_op.constant('a'), constant_op.constant(1), [2, 3])
           sess.run(sess.graph.get_operations())
           self.assertEqual(out_capturer.getvalue(), 'a 1 [2, 3]\n')
         finally:
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
similarity index 68%
rename from tensorflow/contrib/py2tf/converters/call_trees.py
rename to tensorflow/contrib/autograph/converters/call_trees.py
index 1050ba654c63bb52c1c5e71c981a6a0baa3fc987..554f0471d44d54194c45c3855b1483796ae65a6a 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -22,17 +22,29 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import types
+from collections import namedtuple
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
 
+class FunctionInfo(namedtuple('FunctionInfo', ('dtype',))):
+  pass
+
+
+# TODO(mdan): Move this to config.py.
+KNOWN_NUMPY_FUNCTIONS = {
+    ('numpy', 'random', 'binomial'): FunctionInfo(dtype='tf.int64'),
+}
+
+
 class FunctionNamer(object):
   """Describes the interface for CallTreeTransformer's namer."""
 
@@ -72,9 +84,8 @@ class CallTreeTransformer(transformer.Base):
     self.uncompiled_modules = uncompiled_modules
     self.nocompile_decorators = nocompile_decorators
 
-  # pylint:disable=invalid-name
-
   def _resolve_name(self, node):
+    """Used to resolve decorator info."""
     if isinstance(node, gast.Call):
       return self._resolve_name(node.func)
     if isinstance(node, gast.Name):
@@ -99,7 +110,19 @@ class CallTreeTransformer(transformer.Base):
                          (owner_type, node.attr))
     return None
 
+  def _function_is_compilable(self, target_entity):
+    """Determines whether an entity can be compiled at all."""
+    # TODO(mdan): This is just a placeholder. Implement.
+    return not inspect_utils.isbuiltin(target_entity)
+
   def _should_compile(self, node, fqn):
+    """Determines whether an entity should be compiled in the context."""
+    # TODO(mdan): Needs cleanup. We should remove the use of fqn altogether.
+    module_name = fqn[0]
+    for mod in self.uncompiled_modules:
+      if module_name.startswith(mod[0] + '.'):
+        return False
+
     for i in range(1, len(fqn)):
       if fqn[:i] in self.uncompiled_modules:
         return False
@@ -123,7 +146,7 @@ class CallTreeTransformer(transformer.Base):
       # Inspect the target function decorators. If any include a @convert
       # or @graph_ready annotation, then they must be called as they are.
       # TODO(mdan): This may be quite heavy.
-      # To parse and re-analize each function for every call site could be quite
+      # To parse and re-analyze each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
         target_node, _ = parser.parse_entity(target_entity)
@@ -141,33 +164,6 @@ class CallTreeTransformer(transformer.Base):
 
     return True
 
-  def _determine_function_owner(self, m):
-    # TODO(mdan): The parent type should be known at analysis. Use that instead.
-    if hasattr(m, 'im_class'):  # Python 2
-      return m.im_class
-    if hasattr(m, '__qualname__'):  # Python 3
-      # Object attributes: should be bound to "self".
-      if hasattr(m, '__self__'):
-        return type(m.__self__)
-
-      # Class attributes: should have the owner name in their namespace.
-      qn = m.__qualname__.split('.')
-      if len(qn) < 2:
-        return None
-      owner_name, func_name = qn[-2:]
-      if func_name != m.__name__:
-        raise ValueError('Inconsistent names detected '
-                         '(__qualname__[1] = "%s", __name__ = "%s") for %s.' %
-                         (func_name, m.__name__, m))
-      if owner_name == '<locals>':
-        return None
-      if owner_name not in self.context.namespace:
-        raise ValueError(
-            'Could not resolve name "%s" while analyzing %s. Namespace:\n%s' %
-            (owner_name, m, self.context.namespace))
-      return self.context.namespace[owner_name]
-    return None
-
   def _rename_compilable_function(self, node):
     assert anno.hasanno(node.func, 'live_val')
     assert anno.hasanno(node.func, 'fqn')
@@ -182,7 +178,11 @@ class CallTreeTransformer(transformer.Base):
           target_fqn, live_entity=target_entity)
       do_rename = True
     else:
-      owner_type = self._determine_function_owner(target_entity)
+      if anno.hasanno(node.func, 'parent_type'):
+        owner_type = anno.getanno(node.func, 'parent_type')
+      else:
+        # Fallback - not reliable.
+        owner_type = inspect_utils.getmethodclass(target_entity)
       new_name, do_rename = self.context.namer.compiled_function_name(
           target_fqn, live_entity=target_entity, owner_type=owner_type)
 
@@ -196,15 +196,54 @@ class CallTreeTransformer(transformer.Base):
     return node
 
   def _wrap_to_py_func_no_return(self, node):
-    # TODO(mdan): Properly handle varargs, kwargs, etc.
+    # TODO(mdan): Properly handle varargs, etc.
     template = """
-      py2tf_utils.wrap_py_func(func, None, (original_args,), True)
+      ag__.utils.wrap_py_func(func, None, (args,), kwargs, True)
     """
-    return templates.replace(template, func=node.func, original_args=node.args)
-
-  def _function_is_compilable(self, target_entity):
-    # TODO(mdan): This is just a placeholder. Implement.
-    return not isinstance(target_entity, types.BuiltinFunctionType)
+    return templates.replace(
+        template,
+        func=node.func,
+        args=node.args,
+        kwargs=ast_util.keywords_to_dict(node.keywords))
+
+  def _wrap_to_py_func_single_return(self, node, dtype):
+    # TODO(mdan): Properly handle varargs, etc.
+    template = """
+      ag__.utils.wrap_py_func(func, dtype, (args,), kwargs, False)
+    """
+    return templates.replace_as_expression(
+        template,
+        func=node.func,
+        dtype=parser.parse_expression(dtype),
+        args=node.args,
+        kwargs=ast_util.keywords_to_dict(node.keywords))
+
+  def _insert_dynamic_conversion(self, node):
+    """Inlines a dynamic conversion for a dynamic function."""
+    # TODO(mdan): Pass information on the statically compiled functions.
+    # Having access to the statically compiled functions can help avoid
+    # unnecessary compilation.
+    # For example, this would lead to function `a` being compiled twice:
+    #
+    #   def a():
+    #     v = b
+    #     b()
+    #   def b():
+    #     a()
+    #
+    # This is really a problem with recursive calls, which currently can
+    # only be gated by a static condition, and should be rare.
+    # TODO(mdan): It probably makes sense to use dynamic conversion every time.
+    # Before we could convert all the time though, we'd need a reasonable
+    # caching mechanism.
+    template = """
+      ag__.converted_call(func, True, False, {}, args)
+    """
+    call_expr = templates.replace(template, func=node.func, args=node.args)
+    new_call = call_expr[0].value
+    # TODO(mdan): Improve the template mechanism to better support this.
+    new_call.keywords = node.keywords
+    return new_call
 
   def visit_Expr(self, node):
     if isinstance(node.value, gast.Call):
@@ -239,20 +278,31 @@ class CallTreeTransformer(transformer.Base):
     self.generic_visit(node)
     if anno.hasanno(node.func, 'live_val'):
       target_entity = anno.getanno(node.func, 'live_val')
+      if anno.hasanno(node.func, 'fqn'):
+        target_fqn = anno.getanno(node.func, 'fqn')
+      else:
+        target_fqn = None
       if self._function_is_compilable(target_entity):
         node = self._rename_compilable_function(node)
+      elif target_fqn and target_fqn in KNOWN_NUMPY_FUNCTIONS:
+        # TODO(mdan): Should we replace these with equivalent TF ops instead?
+        node = self._wrap_to_py_func_single_return(
+            node, KNOWN_NUMPY_FUNCTIONS[target_fqn].dtype)
       else:
-        raise NotImplementedError('py_func with return values')
+        raise NotImplementedError(
+            'py_func with return values (unknown function)')
     else:
-      if self.context.recursive:
-        raise NotImplementedError('Could not resolve target function.')
+      if ast_util.matches(node, 'super(_)'):
+        # super() calls are preserved. The class conversion mechanism will
+        # ensure that they return the correct value.
+        pass
+      elif self.context.recursive:
+        node = self._insert_dynamic_conversion(node)
       else:
-        # TODO(mdan): Double check. Is this reachable code?
+        # Unresolved functions are allowed in non-recursive mode.
         pass
     return node
 
-  # pylint:enable=invalid-name
-
 
 def transform(node, context, uncompiled_modules, nocompile_decorators):
   """Transform function call to the compiled counterparts.
diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
similarity index 75%
rename from tensorflow/contrib/py2tf/converters/call_trees_test.py
rename to tensorflow/contrib/autograph/converters/call_trees_test.py
index 777648dc0b31863227262fbf931aba680bb4ed98..303dd54a4ee49de27fad0c5cdc2d6274abfe0fa8 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -18,9 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import call_trees
-from tensorflow.contrib.py2tf.converters import converter_test_base
+import numpy as np
+
+from tensorflow.contrib.autograph.converters import call_trees
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -30,7 +34,7 @@ class CallTreesTest(converter_test_base.TestCase):
   def test_basic(self):
 
     def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled verison.')
+      raise ValueError('This should not be called in the compiled version.')
 
     def renamed_test_fn_1(a):
       return a + 1
@@ -47,6 +51,21 @@ class CallTreesTest(converter_test_base.TestCase):
       result.renamed_test_fn_1 = renamed_test_fn_1
       self.assertEquals(3, result.test_fn_2(1))
 
+  def test_dynamic_function(self):
+
+    def test_fn_1():
+      raise ValueError('This should be masked by the mock.')
+
+    def test_fn_2(f):
+      return f() + 3
+
+    node = self.parse_and_analyze(test_fn_2, {})
+    node = call_trees.transform(node, self.ctx, (), ())
+
+    with self.compiled(node) as result:
+      # 10 = 7 (from the mock) + 3 (from test_fn_2)
+      self.assertEquals(10, result.test_fn_2(test_fn_1))
+
   def test_simple_methods(self):
 
     class TestClass(object):
@@ -59,6 +78,7 @@ class CallTreesTest(converter_test_base.TestCase):
 
     node = self.parse_and_analyze(
         TestClass.test_fn_2, {'TestClass': TestClass},
+        namer=converter_test_base.FakeNoRenameNamer(),
         arg_types={'self': (TestClass.__name__, TestClass)})
     node = call_trees.transform(node, self.ctx, (), ())
 
@@ -89,6 +109,20 @@ class CallTreesTest(converter_test_base.TestCase):
         sess.run(sess.graph.get_operations()[0])
         self.assertEquals('bar', a.foo)
 
+  def test_py_func_wrap_known_function(self):
+
+    def test_fn():
+      return np.random.binomial(2, 0.5)
+
+    node = self.parse_and_analyze(test_fn, {'np': np})
+    node = call_trees.transform(node, self.ctx, (), ())
+
+    with self.compiled(node, dtypes.int64) as result:
+      result.np = np
+      with self.test_session() as sess:
+        self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
+        self.assertIn(sess.run(result.test_fn()), (0, 1, 2))
+
   def test_uncompiled_modules(self):
 
     def test_fn(a):
diff --git a/tensorflow/contrib/py2tf/converters/continue_statements.py b/tensorflow/contrib/autograph/converters/continue_statements.py
similarity index 94%
rename from tensorflow/contrib/py2tf/converters/continue_statements.py
rename to tensorflow/contrib/autograph/converters/continue_statements.py
index 4069a678b118b56b59d2e5491bb80cf52efd8143..4299a8a9d59715d032222c47794bbb4393f34ce6 100644
--- a/tensorflow/contrib/py2tf/converters/continue_statements.py
+++ b/tensorflow/contrib/autograph/converters/continue_statements.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class ContinueCanonicalizationTransformer(transformer.Base):
diff --git a/tensorflow/contrib/py2tf/converters/continue_statements_test.py b/tensorflow/contrib/autograph/converters/continue_statements_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/continue_statements_test.py
rename to tensorflow/contrib/autograph/converters/continue_statements_test.py
index a598dcd1aed29478b7e3fe27e3c1b20010247dd9..bcbb316d7459aa5a25bb0bd128cd6e359a393288 100644
--- a/tensorflow/contrib/py2tf/converters/continue_statements_test.py
+++ b/tensorflow/contrib/autograph/converters/continue_statements_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import continue_statements
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import continue_statements
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
similarity index 69%
rename from tensorflow/contrib/py2tf/converters/control_flow.py
rename to tensorflow/contrib/autograph/converters/control_flow.py
index d53e3e4fd6d87004cbe55bd430346ad263e898ea..2e26cdb3d9387d358e0225555506f199e9945d0b 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class SymbolNamer(object):
@@ -49,11 +50,6 @@ class ControlFlowTransformer(transformer.Base):
   def __init__(self, context):
     super(ControlFlowTransformer, self).__init__(context)
 
-  # pylint:disable=invalid-name
-
-  def visit_For(self, node):
-    assert False, 'for statement should have been canonicalized at this point'
-
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
     if aliased_orig_names:
@@ -82,7 +78,7 @@ class ControlFlowTransformer(transformer.Base):
   def _create_cond_expr(self, results, test, body_name, orelse_name):
     if results is not None:
       template = """
-        results = py2tf_utils.run_cond(test, body_name, orelse_name)
+        results = ag__.utils.run_cond(test, body_name, orelse_name)
       """
       return templates.replace(
           template,
@@ -92,7 +88,7 @@ class ControlFlowTransformer(transformer.Base):
           orelse_name=orelse_name)
     else:
       template = """
-        py2tf_utils.run_cond(test, body_name, orelse_name)
+        ag__.utils.run_cond(test, body_name, orelse_name)
       """
       return templates.replace(
           template, test=test, body_name=body_name, orelse_name=orelse_name)
@@ -170,7 +166,22 @@ class ControlFlowTransformer(transformer.Base):
     body_closure = body_scope.modified - body_scope.created
     all_referenced = body_scope.referenced
 
+    cond_scope = anno.getanno(node, NodeAnno.COND_SCOPE)
+    cond_closure = set()
+    for s in cond_scope.referenced:
+      for root in s.support_set:
+        if root not in body_scope.created:
+          cond_closure.add(root)
+
     state = list(body_closure)
+    if not state:
+      # TODO(mdan): Implement this properly.
+      # To complete this statement, we need to check whether any variable
+      # created inside the body scope is used before being modified outside the
+      # scope. This should be done during activity analysis, and in general
+      # should cover the case where variables may not be initialized.
+      raise ValueError('cannot convert while loop: no outputs')
+
     state_ssf = [
         self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
     ]
@@ -196,7 +207,8 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = py2tf_utils.run_while(test_name, body_name, [state])
+      state_ast_tuple = ag__.while_loop(
+          test_name, body_name, (state,), (extra_deps,))
     """
     node = templates.replace(
         template,
@@ -208,11 +220,67 @@ class ControlFlowTransformer(transformer.Base):
         test=test,
         body_name=self.context.namer.new_symbol('loop_body',
                                                 body_scope.referenced),
-        body=node_body)
+        body=node_body,
+        extra_deps=tuple(s.ast() for s in cond_closure),
+    )
 
     return node
 
-  # pylint:enable=invalid-name
+  def visit_For(self, node):
+    self.generic_visit(node)
+
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    body_closure = body_scope.modified - body_scope.created
+    all_referenced = body_scope.referenced
+
+    state = list(body_closure)
+
+    state_ssf = [
+        self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
+    ]
+    ssf_map = {
+        name: ssf
+        for name, ssf in zip(state, state_ssf)
+        if str(name) != ssf
+    }
+
+    if len(state) == 1:
+      state = state[0]
+      state_ssf = state_ssf[0]
+      state_ast_tuple = state
+    else:
+      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    if anno.hasanno(node, 'extra_cond'):
+      extra_cond = anno.getanno(node, 'extra_cond')
+      extra_cond = ast_util.rename_symbols(extra_cond, ssf_map)
+    else:
+      extra_cond = parser.parse_expression('True')
+
+    template = """
+      def extra_cond_name(state_ssf):
+        return extra_cond_expr
+      def body_name(iterate, state_ssf):
+        body
+        return state_ssf,
+      state_ast_tuple = ag__.for_loop(
+          iterated, extra_cond_name, body_name, (state,))
+    """
+    node = templates.replace(
+        template,
+        state=state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iterated=node.iter,
+        iterate=node.target,
+        extra_cond_name=self.context.namer.new_symbol('extra_cond',
+                                                      all_referenced),
+        extra_cond_expr=extra_cond,
+        body_name=self.context.namer.new_symbol('loop_body', all_referenced),
+        body=node_body)
+
+    return node
 
 
 def transform(node, context):
diff --git a/tensorflow/contrib/py2tf/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
similarity index 58%
rename from tensorflow/contrib/py2tf/converters/control_flow_test.py
rename to tensorflow/contrib/autograph/converters/control_flow_test.py
index b785b284a7fb7a0257551326c88b44a341b295ba..c5610b16b4e5de374f404307d3583660707d5e0b 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -18,9 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import control_flow
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import control_flow
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
@@ -94,6 +95,77 @@ class ControlFlowTest(converter_test_base.TestCase):
       with self.test_session() as sess:
         self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
+  def test_simple_for(self):
+
+    def test_fn(l):
+      s1 = 0
+      s2 = 0
+      for e in l:
+        s1 += e
+        s2 += e * e
+      return s1, s2
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_single_var(self):
+
+    def test_fn(l):
+      s = 0
+      for e in l:
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_with_iterated_expression(self):
+
+    eval_count = [0]
+
+    def count_evals(x):
+      eval_count[0] += 1
+      return x
+
+    def test_fn(n):
+      s = 0
+      for e in count_evals(range(n)):
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      result.count_evals = count_evals
+      self.assertEqual(test_fn(5), result.test_fn(5))
+      # count_evals ran twice, once for test_fn and another for result.test_fn
+      self.assertEqual(eval_count[0], 2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
similarity index 55%
rename from tensorflow/contrib/py2tf/converters/converter_test_base.py
rename to tensorflow/contrib/autograph/converters/converter_test_base.py
index 67747183dd323a799a04943ce4c7fe8c4093d002..41c2e71702e7e3ee3811a2cbee27c8c988eb3a5c 100644
--- a/tensorflow/contrib/py2tf/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -21,26 +21,31 @@ from __future__ import print_function
 import contextlib
 import imp
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.contrib.autograph import operators
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
 from tensorflow.python.platform import test
 
 
 class FakeNamer(object):
+  """A fake namer that uses a global counter to generate unique names."""
+
+  def __init__(self):
+    self.i = 0
 
   def new_symbol(self, name_root, used):
-    i = 0
     while True:
-      name = '%s%d' % (name_root, i)
+      self.i += 1
+      name = '%s%d' % (name_root, self.i)
       if name not in used:
         return name
-      i += 1
 
   def compiled_function_name(self,
                              original_fqn,
@@ -52,26 +57,51 @@ class FakeNamer(object):
     return ('renamed_%s' % '_'.join(original_fqn)), True
 
 
+class FakeNoRenameNamer(FakeNamer):
+
+  def compiled_function_name(self, original_fqn, **_):
+    return str(original_fqn), False
+
+
 class TestCase(test.TestCase):
   """Base class for unit tests in this module. Contains relevant utilities."""
 
   @contextlib.contextmanager
   def compiled(self, node, *symbols):
-    source = '<compile failed>'
+    source = None
+
+    self.dynamic_calls = []
+    def converted_call(*args):
+      """Mock version of api.converted_call."""
+      self.dynamic_calls.append(args)
+      return 7
+
     try:
       result, source = compiler.ast_to_object(node)
-      result.tf = self.make_fake_tf(*symbols)
-      result.py2tf_utils = utils
+      result.tf = self.make_fake_mod('fake_tf', *symbols)
+      fake_ag = self.make_fake_mod('fake_ag', converted_call)
+      fake_ag.__dict__.update(operators.__dict__)
+      fake_ag.__dict__['utils'] = utils
+      result.__dict__['ag__'] = fake_ag
       yield result
     except Exception:  # pylint:disable=broad-except
-      print('Offending compiled code:\n%s' % source)
+      if source is None:
+        print('Offending AST:\n%s' % pretty_printer.fmt(node, color=False))
+      else:
+        print('Offending compiled code:\n%s' % source)
       raise
 
-  def make_fake_tf(self, *symbols):
-    fake_tf = imp.new_module('fake_tf')
+  def make_fake_mod(self, name, *symbols):
+    fake_mod = imp.new_module(name)
     for s in symbols:
-      setattr(fake_tf, s.__name__, s)
-    return fake_tf
+      if hasattr(s, '__name__'):
+        setattr(fake_mod, s.__name__, s)
+      elif hasattr(s, 'name'):
+        # This is a bit of a hack, but works for things like tf.int32
+        setattr(fake_mod, s.name, s)
+      else:
+        raise ValueError('can not attach %s - what should be its name?' % s)
+    return fake_mod
 
   def attach_namespace(self, module, **ns):
     for k, v in ns.items():
@@ -83,6 +113,7 @@ class TestCase(test.TestCase):
                         namer=None,
                         arg_types=None,
                         include_type_analysis=True,
+                        owner_type=None,
                         recursive=True):
     node, source = parser.parse_entity(test_fn)
     ctx = context.EntityContext(
@@ -92,7 +123,9 @@ class TestCase(test.TestCase):
         namespace=namespace,
         arg_values=None,
         arg_types=arg_types,
-        recursive=recursive)
+        owner_type=owner_type,
+        recursive=recursive,
+        type_annotation_func=utils.set_element_type)
     node = qual_names.resolve(node)
     node = activity.resolve(node, ctx)
     node = live_values.resolve(node, ctx, {})
diff --git a/tensorflow/contrib/py2tf/converters/decorators.py b/tensorflow/contrib/autograph/converters/decorators.py
similarity index 57%
rename from tensorflow/contrib/py2tf/converters/decorators.py
rename to tensorflow/contrib/autograph/converters/decorators.py
index 3f620c1cd2d9b75f82410754a7e812e13eabe3ae..92445f31746cf94856ea43893f99a2ba60355fb5 100644
--- a/tensorflow/contrib/py2tf/converters/decorators.py
+++ b/tensorflow/contrib/autograph/converters/decorators.py
@@ -24,8 +24,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import pretty_printer
 
 
 class DecoratorsTransformer(gast.NodeTransformer):
@@ -33,6 +33,7 @@ class DecoratorsTransformer(gast.NodeTransformer):
 
   def __init__(self, remove_decorators):
     self.remove_decorators = remove_decorators
+    self.additional_dependencies = set()
 
   # pylint:disable=invalid-name
 
@@ -44,13 +45,38 @@ class DecoratorsTransformer(gast.NodeTransformer):
         dec_func = dec.func
       else:
         dec_func = dec
+
+      # Special cases.
+      # TODO(mdan): Is there any way we can treat these more generically?
+      # We may want to forego using decorators altogether if we can't
+      # properly support them.
+      if isinstance(dec_func, gast.Name) and dec_func.id in ('classmethod',):
+        # Assumption: decorators are only visible in the AST when converting
+        # a function inline (via another decorator).
+        # In that case, the converted function is no longer part of the
+        # original object that it was declared into.
+        # This is currently verified by tests.
+        continue
+
       if not anno.hasanno(dec_func, 'live_val'):
         raise ValueError(
             'Could not resolve decorator: %s' % pretty_printer.fmt(dec_func))
+
       dec_value = anno.getanno(dec_func, 'live_val')
       if dec_value not in self.remove_decorators:
-        kept_decorators.append(dec)
-    node.decorator_list = kept_decorators
+        kept_decorators.append((dec, dec_value))
+
+    for _, dec_value in kept_decorators:
+      if dec_value.__module__ == '__main__':
+        raise ValueError(
+            'decorator "%s" was not allowed because it is declared '
+            'in the module "%s". To fix this, declare it in a separate '
+            'module that we can import it from.' % (dec_value,
+                                                    dec_value.__module__))
+      else:
+        self.additional_dependencies.add(dec_value)
+
+    node.decorator_list = [dec for dec, _ in kept_decorators]
     return node
 
   # pylint:enable=invalid-name
@@ -59,4 +85,4 @@ class DecoratorsTransformer(gast.NodeTransformer):
 def transform(node, remove_decorators):
   transformer = DecoratorsTransformer(remove_decorators)
   node = transformer.visit(node)
-  return node
+  return node, transformer.additional_dependencies
diff --git a/tensorflow/contrib/autograph/converters/decorators_test.py b/tensorflow/contrib/autograph/converters/decorators_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c01f689127dbedad7669c65b03e7da071b2d64d
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/decorators_test.py
@@ -0,0 +1,139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for decorators module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import wraps
+
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import decorators
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.python.platform import test
+
+
+# The Python parser only briefly captures decorators into the AST.
+# The interpreter desugars them on load, and the decorated function loses any
+# trace of the decorator (which is normally what you would expect, since
+# they are meant to be transparent).
+# However, decorators are still visible when you analyze the function
+# from inside a decorator, before it was applied - as is the case
+# with our conversion decorators.
+
+
+def simple_decorator(f):
+  return lambda a: f(a) + 1
+
+
+def self_removing_decorator(removing_wrapper):
+  def decorator(f):
+    @wraps(f)
+    def wrapper(*args):
+      # This removing wrapper is defined in the test below. This setup is so
+      # intricate just to simulate how we use the transformer in practice.
+      transformed_f = removing_wrapper(f, (self_removing_decorator,))
+      return transformed_f(*args) + 1
+    return wrapper
+  return decorator
+
+
+class DecoratorsTest(converter_test_base.TestCase):
+
+  def _remover_wrapper(self, f, remove_decorators):
+    namespace = {
+        'self_removing_decorator': self_removing_decorator,
+        'simple_decorator': simple_decorator
+    }
+    node = self.parse_and_analyze(f, namespace)
+    node, _ = decorators.transform(node, remove_decorators=remove_decorators)
+    result, _ = compiler.ast_to_object(node)
+    return getattr(result, f.__name__)
+
+  def test_noop(self):
+
+    def test_fn(a):
+      return a
+
+    node = self.parse_and_analyze(test_fn, {})
+    node, deps = decorators.transform(node, remove_decorators=())
+    result, _ = compiler.ast_to_object(node)
+
+    self.assertFalse(deps)
+    self.assertEqual(1, result.test_fn(1))
+
+  def test_function(self):
+
+    @self_removing_decorator(self._remover_wrapper)
+    def test_fn(a):
+      return a
+
+    # 2 = 1 (a) + 1 (decorator applied exactly once)
+    self.assertEqual(2, test_fn(1))
+
+  def test_method(self):
+
+    class TestClass(object):
+
+      @self_removing_decorator(self._remover_wrapper)
+      def test_fn(self, a):
+        return a
+
+    # 2 = 1 (a) + 1 (decorator applied exactly once)
+    self.assertEqual(2, TestClass().test_fn(1))
+
+  def test_multiple_decorators(self):
+
+    class TestClass(object):
+
+      # Note that reversing the order of this two doesn't work.
+      @classmethod
+      @self_removing_decorator(self._remover_wrapper)
+      def test_fn(cls, a):
+        return a
+
+    # 2 = 1 (a) + 1 (decorator applied exactly once)
+    self.assertEqual(2, TestClass.test_fn(1))
+
+  def test_nested_decorators(self):
+
+    @self_removing_decorator(self._remover_wrapper)
+    def test_fn(a):
+      @simple_decorator
+      def inner_fn(b):
+        return b + 11
+      return inner_fn(a)
+
+    with self.assertRaises(ValueError):
+      test_fn(1)
+
+  # TODO(mdan): Uncomment this test once converter_test_base is updated.
+  # (can't do it now because it has unrelated pending changes)
+  # def test_nested_decorators(self):
+  #
+  #   @self_removing_decorator(self._remover_wrapper)
+  #   def test_fn(a):
+  #     @imported_decorator
+  #     def inner_fn(b):
+  #       return b + 11
+  #     return inner_fn(a)
+  #
+  #   # 14 = 1 (a) + 1 (simple_decorator) + 11 (inner_fn)
+  #   self.assertEqual(14, test_fn(1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/printing.py b/tensorflow/contrib/autograph/converters/ifexp.py
similarity index 50%
rename from tensorflow/contrib/py2tf/utils/printing.py
rename to tensorflow/contrib/autograph/converters/ifexp.py
index 95a62bd80b5f4854e6a062df18d882f7bd495555..616d222762e09feeba1809f119d915dfbe522283 100644
--- a/tensorflow/contrib/py2tf/utils/printing.py
+++ b/tensorflow/contrib/autograph/converters/ifexp.py
@@ -12,36 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorFlow printing support utilities."""
+"""Canonicalizes the ternary conditional operator."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils import py_func
-from tensorflow.python.ops import logging_ops
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
-def is_tf_print_compatible(value):
-  # TODO(mdan): Enable once we can reliably test this.
-  # This is currently disabled because we can't capture the output of
-  # op kernels from Python.
-  del value
-  return False
+class IfExp(transformer.Base):
+  """Canonicalizes all IfExp nodes into plain conditionals."""
 
+  def visit_IfExp(self, node):
+    template = """
+        ag__.utils.run_cond(test, lambda: (body,), lambda: (orelse,))
+    """
+    desugared_ifexp = templates.replace_as_expression(
+        template, test=node.test, body=node.body, orelse=node.orelse)
+    return desugared_ifexp
 
-def call_print(*values):
-  """Compiled counterpart of the print builtin.
 
-  The function attempts to use tf.Print if all the values are compatible.
-  Otherwise, it will fall back to py_func.
+def transform(node, context):
+  """Desugar IfExp nodes into plain conditionals.
 
   Args:
-    *values: values to print
+     node: an AST node to transform
+     context: a context object
+
   Returns:
-    A dummy value indicating the print completed. If tf.
+     new_node: an AST with no IfExp nodes, only conditionals.
   """
 
-  if all(map(is_tf_print_compatible, values)):
-    return logging_ops.Print(1, values)
-  return py_func.wrap_py_func(print, None, values, use_dummy_return=True)
+  node = IfExp(context).visit(node)
+  return node
diff --git a/tensorflow/contrib/autograph/converters/ifexp_test.py b/tensorflow/contrib/autograph/converters/ifexp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac6849dcb4bd7dacd84bb205f5c65395d8c2f51e
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/ifexp_test.py
@@ -0,0 +1,106 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ifexp module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import ifexp
+from tensorflow.python.platform import test
+
+
+class IfExpTest(converter_test_base.TestCase):
+
+  def compiled_fn(self, test_fn, *args):
+    node = self.parse_and_analyze(test_fn, {})
+    node = ifexp.transform(node, self.ctx)
+    module = self.compiled(node, *args)
+    return module
+
+  def test_simple(self):
+
+    def test_fn(x):
+      return 1 if x else 0
+
+    with self.compiled_fn(test_fn) as result:
+      result.autograph_util = utils
+      for x in [0, 1]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_fn(self):
+
+    def f(x):
+      return 3 * x
+
+    def test_fn(x):
+      y = f(x * x if x > 0 else x)
+      return y
+
+    with self.compiled_fn(test_fn) as result:
+      result.autograph_util = utils
+      result.f = f
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_exp(self):
+
+    def test_fn(x):
+      return x * x if x > 0 else x
+
+    with self.compiled_fn(test_fn) as result:
+      result.autograph_util = utils
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_nested(self):
+
+    def test_fn(x):
+      return x * x if x > 0 else x if x else 1
+
+    with self.compiled_fn(test_fn) as result:
+      result.autograph_util = utils
+      for x in [-2, 0, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_in_cond(self):
+
+    def test_fn(x):
+      if x > 0:
+        return x * x if x < 5 else x * x * x
+      return -x
+
+    with self.compiled_fn(test_fn) as result:
+      result.autograph_util = utils
+      for x in [-2, 2, 5]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_assign_in_cond(self):
+
+    def test_fn(x):
+      if x > 0:
+        x = -x if x < 5 else x
+      return x
+
+    with self.compiled_fn(test_fn) as result:
+      result.autograph_util = utils
+      for x in [-2, 2, 5]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/list_comprehension.py b/tensorflow/contrib/autograph/converters/list_comprehension.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/list_comprehension.py
rename to tensorflow/contrib/autograph/converters/list_comprehension.py
index e8744831100e4852919b5cd1253b74acea4d790d..d7f292015164e047d054c5d1fb0b391e960bb73d 100644
--- a/tensorflow/contrib/py2tf/converters/list_comprehension.py
+++ b/tensorflow/contrib/autograph/converters/list_comprehension.py
@@ -31,9 +31,9 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class ListCompCanonicalizationTransformer(transformer.Base):
diff --git a/tensorflow/contrib/py2tf/converters/list_comprehension_test.py b/tensorflow/contrib/autograph/converters/list_comprehension_test.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/list_comprehension_test.py
rename to tensorflow/contrib/autograph/converters/list_comprehension_test.py
index 025fac11e41e6771fbb9b80ff3da70dc3ceec73e..4758671f5ec83c26cfa54be0ef68f5f564094f6c 100644
--- a/tensorflow/contrib/py2tf/converters/list_comprehension_test.py
+++ b/tensorflow/contrib/autograph/converters/list_comprehension_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import list_comprehension
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import list_comprehension
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/autograph/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py
new file mode 100644
index 0000000000000000000000000000000000000000..b49521b2c328f418828a5e92890aa1b169384b70
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/lists.py
@@ -0,0 +1,116 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converter for list operations.
+
+This includes converting Python lists to TensorArray/TensorList.
+"""
+
+# TODO(mdan): Elaborate the logic here.
+# TODO(mdan): Does it even make sense to attempt to try to use TAs?
+# The current rule (always convert to TensorArray) is naive and insufficient.
+# In general, a better mechanism could look like:
+#   * convert to TensorList by default
+#   * leave as Python list if the user explicitly forbids it
+#   * convert to TensorArray only when complete write once behavior can be
+#     guaranteed (e.g. list comprehensions)
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.python.framework import dtypes
+
+
+class ListTransformer(transformer.Base):
+  """Converts lists and related operations to their TF counterpart."""
+
+  def _empty_list(self, node):
+    if not anno.hasanno(node, 'element_type'):
+      raise NotImplementedError(
+          'type inference for empty lists is not yet supported; '
+          'use set_element_type(<list>, <dtype>) to continue')
+    dtype = anno.getanno(node, 'element_type')
+    if not isinstance(dtype, dtypes.DType):
+      # TODO(mdan): Allow non-TF dtypes?
+      # That would be consistent with the dynamic dispatch pattern, but
+      # we must make sure that doesn't become confusing.
+      raise NotImplementedError('element type "%s" not yet supported' % dtype)
+
+    dtype_name = dtype.name
+    # TODO(mdan): Does it ever make sense not to use tensor lists?
+    template = """
+      tf.TensorArray(tf.dtype_name, size=0, dynamic_size=True)
+    """
+    return templates.replace_as_expression(template, dtype_name=dtype_name)
+
+  def _pre_populated_list(self, node):
+    raise NotImplementedError('pre-populated lists')
+
+  def visit_Expr(self, node):
+    node = self.generic_visit(node)
+    if isinstance(node.value, gast.Call):
+      call_node = node.value
+
+      if not anno.hasanno(call_node.func, anno.Basic.QN):
+        return node
+      qn = anno.getanno(call_node.func, anno.Basic.QN)
+
+      if qn.qn[-1] == 'append' and (len(call_node.args) == 1):
+        template = """
+          target = ag__.utils.dynamic_list_append(target, element)
+        """
+        node = templates.replace(
+            template,
+            target=qn.parent.ast(),
+            element=call_node.args[0])
+    return node
+
+  def _replace_list_constructors(self, targets, values):
+    for target in targets:
+      if (isinstance(target, (gast.Tuple, gast.List)) and
+          isinstance(values, (gast.Tuple, gast.List))):
+        n_targets = len(target.elts)
+        for i in range(n_targets):
+          target_el, value_el = target.elts[i], values.elts[i]
+          values.elts[i] = self._replace_list_constructors(
+              (target_el,), value_el)
+        return values
+      if isinstance(values, gast.List):
+        if values.elts:
+          return self._pre_populated_list(values)
+        else:
+          return self._empty_list(values)
+    return values
+
+  def visit_Assign(self, node):
+    node = self.generic_visit(node)
+
+    # Only convert lists when they are assigned to a variable, e.g.:
+    #   l = []
+    # TODO(mdan): A similar pattern exists in type_info.py
+    # We should add a generic "unpack_assignment" function to the base
+    # transformer, that has the same effect as applying some logic to the SSA
+    # form.
+    node.value = self._replace_list_constructors(node.targets, node.value)
+    return node
+
+
+def transform(node, context):
+  return ListTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74c6dc64f197f75eb3e66c01fb078467e8e8ea89
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lists module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import lists
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.platform import test
+
+
+class ListTest(converter_test_base.TestCase):
+
+  def test_empty_annotated_list(self):
+
+    def test_fn():
+      l = []
+      utils.set_element_type(l, dtypes.int32)
+      l.append(1)
+      return l
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      # TODO(mdan): Attach these additional modules automatically.
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        self.assertAllEqual([1], sess.run(result.test_fn().stack()))
+
+  def test_empty_annotated_lists_unpacked(self):
+
+    def test_fn():
+      l, m = [], []
+      utils.set_element_type(l, dtypes.int32)
+      utils.set_element_type(m, dtypes.int32)
+      l.append(1)
+      m.append(2)
+      return l, m
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        res_l, res_m = result.test_fn()
+        self.assertEqual([1], sess.run(res_l.stack()))
+        self.assertEqual([2], sess.run(res_m.stack()))
+
+  def test_empty_annotated_lists_list_unpacked(self):
+
+    def test_fn():
+      [l, m] = [], []
+      utils.set_element_type(l, dtypes.int32)
+      utils.set_element_type(m, dtypes.int32)
+      l.append(1)
+      m.append(2)
+      return l, m
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        res_l, res_m = result.test_fn()
+        self.assertEqual([1], sess.run(res_l.stack()))
+        self.assertEqual([2], sess.run(res_m.stack()))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/converters/logical_expressions.py b/tensorflow/contrib/autograph/converters/logical_expressions.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a795a315a3c2aa08ac1577a204102755b6e849c
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/logical_expressions.py
@@ -0,0 +1,132 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converter for logical expressions.
+
+e.g. `a and b -> tf.logical_and(a, b)`. This is not done automatically in TF.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+
+
+# TODO(mdan): Properly extrack boolean ops according to lazy eval rules.
+# Note that this isn't completely safe either, because tensors may have control
+# dependencies.
+# Note that for loops that should be done after the loop was converted to
+# tf.while_loop so that the expanded conditionals are properly scoped.
+
+# Used to signal that an operand is safe for non-lazy evaluation.
+SAFE_BOOLEAN_OPERAND = 'SAFE_BOOLEAN_OPERAND'
+
+
+class LogicalExpressionTransformer(transformer.Base):
+  """Converts logical expressions to corresponding TF calls."""
+
+  def __init__(self, context):
+    super(LogicalExpressionTransformer, self).__init__(context)
+    # TODO(mdan): Look into replacing with bitwise operators instead.
+    # TODO(mdan): Skip replacing if the function is trivial.
+    self.op_mapping = {
+        gast.And: 'tf.logical_and',
+        gast.Eq: 'tf.equal',
+        gast.Gt: 'tf.greater',
+        gast.GtE: 'tf.greater_equal',
+        gast.Lt: 'tf.less',
+        gast.LtE: 'tf.less_equal',
+        gast.Not: 'tf.logical_not',
+        gast.NotEq: 'tf.not_equal',
+        gast.Or: 'tf.logical_or',
+        gast.USub: 'tf.negative',
+        gast.Is: 'autograph_utils.dynamic_is',
+        gast.IsNot: 'autograph_utils.dynamic_is_not'
+    }
+
+  def _expect_simple_symbol(self, operand):
+    if isinstance(operand, gast.Name):
+      return
+    if anno.hasanno(operand, SAFE_BOOLEAN_OPERAND):
+      return
+    raise NotImplementedError(
+        'only simple local variables are supported in logical and compound '
+        'comparison expressions; for example, we support "a or b" but not '
+        '"a.x or b"; for a workaround, assign the expression to a local '
+        'variable and use that instead, for example "tmp = a.x", "tmp or b"')
+
+  def _matching_func(self, operator):
+    op_type = type(operator)
+    mapped_op = self.op_mapping.get(op_type)
+    if not mapped_op:
+      raise NotImplementedError('operator %s is not yet supported' % op_type)
+    return mapped_op
+
+  def _as_function(self, func_name, args):
+    template = """
+      func_name(args)
+    """
+    replacement = templates.replace_as_expression(
+        template, func_name=parser.parse_expression(func_name), args=args)
+    anno.setanno(replacement, SAFE_BOOLEAN_OPERAND, True)
+    return replacement
+
+  def visit_Compare(self, node):
+    node = self.generic_visit(node)
+    ops_and_comps = list(zip(node.ops, node.comparators))
+    left = node.left
+    op_tree = None
+
+    # Repeated comparisons are converted to conjunctions:
+    #   a < b < c   ->   a < b and b < c
+    while ops_and_comps:
+      op, right = ops_and_comps.pop(0)
+      binary_comparison = self._as_function(
+          self._matching_func(op), (left, right))
+      if isinstance(left, gast.Name) and isinstance(right, gast.Name):
+        anno.setanno(binary_comparison, SAFE_BOOLEAN_OPERAND, True)
+      if op_tree:
+        self._expect_simple_symbol(right)
+        op_tree = self._as_function('tf.logical_and',
+                                    (binary_comparison, op_tree))
+      else:
+        op_tree = binary_comparison
+      left = right
+    assert op_tree is not None
+    return op_tree
+
+  def visit_UnaryOp(self, node):
+    node = self.generic_visit(node)
+    return self._as_function(self._matching_func(node.op), node.operand)
+
+  def visit_BoolOp(self, node):
+    node = self.generic_visit(node)
+    node_values = node.values
+    right = node.values.pop()
+    self._expect_simple_symbol(right)
+    while node_values:
+      left = node_values.pop()
+      self._expect_simple_symbol(left)
+      right = self._as_function(self._matching_func(node.op), (left, right))
+    return right
+
+
+def transform(node, context):
+  return LogicalExpressionTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py b/tensorflow/contrib/autograph/converters/logical_expressions_test.py
similarity index 86%
rename from tensorflow/contrib/py2tf/converters/logical_expressions_test.py
rename to tensorflow/contrib/autograph/converters/logical_expressions_test.py
index a28326c517d468230f35e45f0fbfe5257d769895..2814060c4d831e4dddacb3dcbcbe1db42160db20 100644
--- a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
+++ b/tensorflow/contrib/autograph/converters/logical_expressions_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import logical_expressions
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import logical_expressions
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -32,7 +32,7 @@ class GradientsFunctionTest(converter_test_base.TestCase):
       return a == b
 
     node = self.parse_and_analyze(test_fn, {})
-    node = logical_expressions.transform(node)
+    node = logical_expressions.transform(node, self.ctx)
 
     with self.compiled(node, math_ops.equal) as result:
       with self.test_session() as sess:
@@ -45,7 +45,7 @@ class GradientsFunctionTest(converter_test_base.TestCase):
       return (a or b) and (a or b or c)
 
     node = self.parse_and_analyze(test_fn, {})
-    node = logical_expressions.transform(node)
+    node = logical_expressions.transform(node, self.ctx)
 
     with self.compiled(node, math_ops.logical_or,
                        math_ops.logical_and) as result:
diff --git a/tensorflow/contrib/autograph/converters/name_scopes.py b/tensorflow/contrib/autograph/converters/name_scopes.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfee529abaa8c14d9b408819b32c5199500a2c2f
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/name_scopes.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wraps a function body with a `name_scope` of the function name."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+
+
+class FunctionNameScopeTransformer(transformer.Base):
+  """Wrap a function body with a `name_scope` of the function name."""
+
+  def _name_for_current_scope(self):
+    innermost = self.enclosing_entities[-1]
+    if len(self.enclosing_entities) > 1:
+      parent = self.enclosing_entities[-2]
+      if isinstance(parent, gast.ClassDef):
+        # Methods also take the name of their class.
+        name = '%s/%s' % (parent.name, innermost.name)
+      else:
+        name = innermost.name
+    else:
+      name = innermost.name
+
+    # Sanitize the name.
+    # See https://www.tensorflow.org/api_docs/python/tf/Graph#name_scope
+    # TensorFlow doesn't like leading underscores at the top level.
+    while name[0] == '_':
+      name = name[1:]
+    return name
+
+  def visit_FunctionDef(self, node):
+    node = self.generic_visit(node)
+
+    unscoped_body = []
+    scoped_body = node.body
+    if scoped_body:
+      first = scoped_body[0]
+      if isinstance(first, gast.Expr) and isinstance(first.value, gast.Str):
+        # Skip any docstring.
+        unscoped_body = scoped_body[:1]
+        scoped_body = scoped_body[1:]
+
+    template = """
+      with tf.name_scope(scope_name):
+        body
+    """
+    scoped_body = templates.replace(
+        template,
+        scope_name=gast.Str(self._name_for_current_scope()),
+        body=scoped_body)
+    node.body = unscoped_body + scoped_body
+    return node
+
+
+def transform(node, context):
+  return FunctionNameScopeTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/name_scopes_test.py b/tensorflow/contrib/autograph/converters/name_scopes_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..17692cbd880dbc1db4bb40ad7345e27907499f9d
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/name_scopes_test.py
@@ -0,0 +1,139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for for_canonicalization module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import name_scopes
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class FunctionNameScopeTransformer(converter_test_base.TestCase):
+
+  def test_basic(self):
+
+    def test_fn(l):
+      """This should stay here."""
+      a = 5
+      l += a
+      return l
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      result_op = result.test_fn(constant_op.constant(1))
+      self.assertIn('test_fn/', result_op.op.name)
+
+      self.assertEqual('This should stay here.', result.test_fn.__doc__)
+
+  def test_long_docstring(self):
+
+    def test_fn(l):
+      """Multi-line docstring.
+
+      Args:
+        l: A thing.
+      Returns:
+        l
+      """
+      return l
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      self.assertIn('Multi-line', result.test_fn.__doc__)
+      self.assertIn('Returns:', result.test_fn.__doc__)
+
+  def test_nested_functions(self):
+
+    def test_fn(l):
+
+      def inner_fn(i):
+        return i ** 2
+
+      l += 4
+      return inner_fn(l)
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      result_op = result.test_fn(constant_op.constant(1))
+      first_result_input_name = result_op.op.inputs[0].name
+      second_result_input_name = result_op.op.inputs[1].name
+      self.assertIn('test_fn/', first_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('test_fn/inner_fn/', second_result_input_name)
+
+  def test_method(self):
+
+    class TestClass(object):
+
+      def test_fn(self, l):
+
+        def inner_fn(i):
+          return i ** 2
+
+        l += 4
+        return inner_fn(l)
+
+    # Note that 'TestClass' was needed in the namespace here.
+    node = self.parse_and_analyze(
+        TestClass, {'TestClass': TestClass}, owner_type=TestClass)
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      result_op = result.TestClass().test_fn(constant_op.constant(1))
+      first_result_input_name = result_op.op.inputs[0].name
+      second_result_input_name = result_op.op.inputs[1].name
+      self.assertIn('TestClass/test_fn/', first_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('TestClass/test_fn/inner_fn/', second_result_input_name)
+
+  def test_operator(self):
+
+    class TestClass(object):
+
+      def __call__(self, l):
+
+        def inner_fn(i):
+          return i ** 2
+
+        l += 4
+        return inner_fn(l)
+
+    # Note that 'TestClass' was needed in the namespace here.
+    node = self.parse_and_analyze(
+        TestClass.__call__, {'TestClass': TestClass}, owner_type=TestClass)
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      result_op = result.__call__(TestClass(), constant_op.constant(1))
+      first_result_input_name = result_op.op.inputs[0].name
+      second_result_input_name = result_op.op.inputs[1].name
+      self.assertIn('call__/', first_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('call__/inner_fn/', second_result_input_name)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/autograph/converters/side_effect_guards.py
similarity index 92%
rename from tensorflow/contrib/py2tf/converters/side_effect_guards.py
rename to tensorflow/contrib/autograph/converters/side_effect_guards.py
index 30976b3ec6db5a6607023ac804d9d54cfb296190..3bcb2d3c42c6e0663c8f78523199a364b6ac231f 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards.py
@@ -36,12 +36,12 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class SymbolNamer(object):
@@ -160,8 +160,8 @@ class SideEffectGuardTransformer(transformer.Base):
               [alias_map.get(s, s).ast() for s in guarded_args], None)
 
         template = """
-          with py2tf_utils.control_dependency_on_returns(call):
-            aliased_guarded_args = py2tf_utils.alias_tensors(guarded_args)
+          with ag__.utils.control_dependency_on_returns(call):
+            aliased_guarded_args = ag__.utils.alias_tensors(guarded_args)
         """
         control_deps_guard = templates.replace(
             template,
@@ -172,7 +172,7 @@ class SideEffectGuardTransformer(transformer.Base):
         alias_map = {}
 
         template = """
-          with py2tf_utils.control_dependency_on_returns(call):
+          with ag__.utils.control_dependency_on_returns(call):
             pass
         """
         control_deps_guard = templates.replace(template, call=node.value)[-1]
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
similarity index 97%
rename from tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
rename to tensorflow/contrib/autograph/converters/side_effect_guards_test.py
index 463db2e770213ba9636d2537b095a77dece5d8f6..ce0ce33243a1352107eb8121050ee76474869809 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import side_effect_guards
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/autograph/converters/single_return.py b/tensorflow/contrib/autograph/converters/single_return.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc9ca9dfeb00ef2d2e60edf6a1abfba19a1bad7
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/single_return.py
@@ -0,0 +1,317 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Canonicalizes functions with multiple returns to use just one."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
+
+
+# TODO(mdan): Move this logic into transformer_base.
+class BodyVisitor(transformer.Base):
+  """Walks breadth- or depth-first the list-of-nodes bodies of AST nodes."""
+
+  def __init__(self, context, depth_first=False):
+    self.depth_first = depth_first
+    self.changes_made = False
+    super(BodyVisitor, self).__init__(context)
+
+  def visit_nodelist(self, nodelist):
+    for node in nodelist:
+      if isinstance(node, list):
+        node = self.visit_nodelist(node)
+      else:
+        node = self.generic_visit(node)
+    return nodelist
+
+  def visit_If(self, node):
+    if self.depth_first:
+      node = self.generic_visit(node)
+    node.body = self.visit_nodelist(node.body)
+    node.orelse = self.visit_nodelist(node.orelse)
+    if not self.depth_first:
+      node = self.generic_visit(node)
+    return node
+
+  def visit_For(self, node):
+    if self.depth_first:
+      node = self.generic_visit(node)
+    node.body = self.visit_nodelist(node.body)
+    node.orelse = self.visit_nodelist(node.orelse)
+    if not self.depth_first:
+      node = self.generic_visit(node)
+    return node
+
+  def visit_While(self, node):
+    if self.depth_first:
+      node = self.generic_visit(node)
+    node.body = self.visit_nodelist(node.body)
+    node.orelse = self.visit_nodelist(node.orelse)
+    if not self.depth_first:
+      node = self.generic_visit(node)
+    return node
+
+  def visit_Try(self, node):
+    if self.depth_first:
+      node = self.generic_visit(node)
+    node.body = self.visit_nodelist(node.body)
+    node.orelse = self.visit_nodelist(node.orelse)
+    node.finalbody = self.visit_nodelist(node.finalbody)
+    for i in range(len(node.handlers)):
+      node.handlers[i].body = self.visit_nodelist(node.handlers[i].body)
+    if not self.depth_first:
+      node = self.generic_visit(node)
+    return node
+
+  def visit_With(self, node):
+    if self.depth_first:
+      node = self.generic_visit(node)
+    node.body = self.visit_nodelist(node.body)
+    if not self.depth_first:
+      node = self.generic_visit(node)
+    return node
+
+  def visit_FunctionDef(self, node):
+    if self.depth_first:
+      node = self.generic_visit(node)
+    node.body = self.visit_nodelist(node.body)
+    self.generic_visit(node)
+    if not self.depth_first:
+      node = self.generic_visit(node)
+    return node
+
+
+class FoldElse(BodyVisitor):
+
+  def visit_nodelist(self, nodelist):
+    for i in range(len(nodelist)):
+      node = nodelist[i]
+      if isinstance(node, gast.If):
+        true_branch_returns = isinstance(node.body[-1], gast.Return)
+        false_branch_returns = len(node.orelse) and isinstance(
+            node.orelse[-1], gast.Return)
+        # If the last node in the if body is a return,
+        # then every line after this if statement effectively
+        # belongs in the else.
+        if true_branch_returns and not false_branch_returns:
+          for j in range(i + 1, len(nodelist)):
+            nodelist[i].orelse.append(ast_util.copy_clean(nodelist[j]))
+          if nodelist[i + 1:]:
+            self.changes_made = True
+          return nodelist[:i + 1]
+        elif not true_branch_returns and false_branch_returns:
+          for j in range(i + 1, len(nodelist)):
+            nodelist[i].body.append(ast_util.copy_clean(nodelist[j]))
+          if nodelist[i + 1:]:
+            self.changes_made = True
+          return nodelist[:i + 1]
+        elif true_branch_returns and false_branch_returns:
+          if nodelist[i + 1:]:
+            raise ValueError(
+                'Unreachable code after conditional where both branches return.'
+            )
+          return nodelist
+      elif isinstance(node, gast.Return) and nodelist[i + 1:]:
+        raise ValueError(
+            'Cannot have statements after a return in the same basic block')
+    return nodelist
+
+
+def contains_return(node):
+  for n in gast.walk(node):
+    if isinstance(n, gast.Return):
+      return True
+  return False
+
+
+class LiftReturn(transformer.Base):
+  """Move return statements out of If and With blocks."""
+
+  def __init__(self, context):
+    self.changes_made = False
+    self.common_return_name = None
+    super(LiftReturn, self).__init__(context)
+
+  def visit_If(self, node):
+    # Depth-first traversal of if statements
+    node = self.generic_visit(node)
+
+    # We check if both branches return, and if so, lift the return out of the
+    # conditional. We don't enforce that the true and false branches either
+    # both return or both do not, because FoldElse might move a return
+    # into a branch after this transform completes. FoldElse and LiftReturn
+    # are alternately run until the code reaches a fixed point.
+    true_branch_returns = isinstance(node.body[-1], gast.Return)
+    false_branch_returns = len(node.orelse) and isinstance(
+        node.orelse[-1], gast.Return)
+    if true_branch_returns and false_branch_returns:
+      node.body[-1] = templates.replace(
+          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
+      node.orelse[-1] = templates.replace(
+          'a = b', a=self.common_return_name, b=node.orelse[-1].value)[0]
+      return_node = templates.replace('return a', a=self.common_return_name)[0]
+      self.changes_made = True
+      return [node, return_node]
+    else:
+      return node
+
+  def visit_With(self, node):
+    # Depth-first traversal of syntax
+    node = self.generic_visit(node)
+
+    # If the with statement returns, lift the return
+    if isinstance(node.body[-1], gast.Return):
+      node.body[-1] = templates.replace(
+          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
+      return_node = templates.replace('return a', a=self.common_return_name)[0]
+      node = self.generic_visit(node)
+      self.changes_made = True
+      return [node, return_node]
+    else:
+      return node
+
+  def visit_FunctionDef(self, node):
+    # Ensure we're doing depth-first traversal
+    last_return_name = self.common_return_name
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    referenced_names = body_scope.referenced
+    self.common_return_name = self.context.namer.new_symbol(
+        'return_', referenced_names)
+    node = self.generic_visit(node)
+    self.common_return_name = last_return_name
+    return node
+
+
+class DetectReturnInUnsupportedControlFlow(gast.NodeVisitor):
+  """Throws an error if code returns inside loops or try/except."""
+
+  # First, throw an error if we detect a return statement in a loop.
+  # TODO(alexbw): we need to learn to handle returns inside a loop,
+  # but don't currently have the TF constructs to do so (need something
+  # that looks vaguely like a goto).
+
+  def __init__(self):
+    self.cant_return = False
+    super(DetectReturnInUnsupportedControlFlow, self).__init__()
+
+  def visit_While(self, node):
+    self.cant_return = True
+    self.generic_visit(node)
+    self.cant_return = False
+
+  def visit_For(self, node):
+    self.cant_return = True
+    self.generic_visit(node)
+    self.cant_return = False
+
+  def visit_Try(self, node):
+    self.cant_return = True
+    self.generic_visit(node)
+    self.cant_return = False
+
+  def visit_Return(self, node):
+    if self.cant_return:
+      raise ValueError(
+          '`return` statements are not supported in loops. '
+          'Try assigning to a variable in the while loop, and returning '
+          'outside of the loop')
+
+
+class DetectReturnInConditional(gast.NodeVisitor):
+  """Assert that no return statements are present in conditionals."""
+
+  def __init__(self):
+    self.cant_return = False
+    super(DetectReturnInConditional, self).__init__()
+
+  def visit_If(self, node):
+    self.cant_return = True
+    self.generic_visit(node)
+    self.cant_return = False
+
+  def visit_Return(self, node):
+    if self.cant_return:
+      raise ValueError(
+          'After transforms, a conditional contained a `return `statement, '
+          'which is not allowed. This is a bug, and should not happen.')
+
+
+class DetectReturnInFunctionDef(gast.NodeVisitor):
+
+  def visit_FunctionDef(self, node):
+    self.generic_visit(node)
+    if not contains_return(node):
+      raise ValueError(
+          'Each function definition should contain at least one return.')
+
+
+def transform(node, context):
+  """Ensure a function has only a single return.
+
+  This transforms an AST node with multiple returns successively into containing
+  only a single return node.
+  There are a few restrictions on what we can handle:
+   - An AST being transformed must contain at least one return.
+   - No returns allowed in loops. We have to know the type of the return value,
+   and we currently don't have either a type inference system to discover it,
+   nor do we have a mechanism for late type binding in TensorFlow.
+   - After all transformations are finished, a Return node is not allowed inside
+   control flow. If we were unable to move a return outside of control flow,
+   this is an error.
+
+  Args:
+     node: an AST node to transform
+     context: a context object
+
+  Returns:
+     new_node: an AST with a single return value
+
+  Raises:
+    ValueError: if the AST is structured so that we can't perform the
+   transform.
+  """
+  # Make sure that the function has at least one return statement
+  # TODO(alexbw): turning off this assertion for now --
+  # we need to not require this in e.g. class constructors.
+  # DetectReturnInFunctionDef().visit(node)
+
+  # Make sure there's no returns in unsupported locations (loops, try/except)
+  DetectReturnInUnsupportedControlFlow().visit(node)
+
+  while True:
+
+    # Try to lift all returns out of if statements and with blocks
+    lr = LiftReturn(context)
+    node = lr.visit(node)
+    changes_made = lr.changes_made
+    fe = FoldElse(context)
+    node = fe.visit(node)
+    changes_made = changes_made or fe.changes_made
+
+    if not changes_made:
+      break
+
+  # Make sure we've scrubbed all returns from conditionals
+  DetectReturnInConditional().visit(node)
+
+  return node
diff --git a/tensorflow/contrib/autograph/converters/single_return_test.py b/tensorflow/contrib/autograph/converters/single_return_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d483005a09537ea8227814f65aa7e6402c853f60
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/single_return_test.py
@@ -0,0 +1,189 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for single_return module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import single_return
+from tensorflow.python.framework.ops import name_scope
+from tensorflow.python.platform import test
+
+
+class SingleReturnTest(converter_test_base.TestCase):
+
+  def compiled_fn(self, test_fn, *args):
+    node = self.parse_and_analyze(test_fn, {})
+    node = single_return.transform(node, self.ctx)
+    module = self.compiled(node, *args)
+    return module
+
+  def test_noop(self):
+    # Noop
+    def test_fn(x):
+      return x
+
+    with self.compiled_fn(test_fn) as result:
+      self.assertEqual(test_fn(2.0), result.test_fn(2.0))
+
+  def test_return_expression(self):
+    # ANF
+    def test_fn(x):
+      return x * x
+
+    with self.compiled_fn(test_fn) as result:
+      x = 2
+      self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_merge(self):
+    # Simple merge
+    def test_fn(x):
+      if x > 0:
+        return x
+      else:
+        return x * x
+
+    with self.compiled_fn(test_fn) as result:
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_orphan_branch(self):
+
+    def test_fn(x):
+      if x > 0:
+        return x
+
+    with self.assertRaises(ValueError):
+      self.compiled_fn(test_fn)
+
+  def test_lift_body_into_false_branch(self):
+
+    def test_fn(x):
+      if x > 0:
+        return x
+      return x * x
+
+    with self.compiled_fn(test_fn) as result:
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_lift_body_into_true_branch(self):
+
+    def test_fn(x):
+      if x < 0:
+        x *= x
+      else:
+        # TODO(alexbw): linter bug here that requires us suppress this warning.
+        return x  # pylint: disable=undefined-loop-variable
+      return x
+
+    with self.compiled_fn(test_fn) as result:
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_nested_if(self):
+
+    def test_fn(x):
+      if x > 0:
+        if x < 5:
+          return x
+        else:
+          return x * x
+      else:
+        return x * x * x
+
+    with self.compiled_fn(test_fn) as result:
+      for x in [-2, 2, 5]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_context_manager(self):
+
+    def test_fn(x):
+
+      with name_scope(''):
+        return x * x
+
+    with self.compiled_fn(test_fn) as result:
+      result.name_scope = name_scope
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_context_manager_in_conditional(self):
+
+    def test_fn(x):
+      if x > 0:
+        with name_scope(''):
+          return x * x
+      else:
+        return x
+
+    with self.compiled_fn(test_fn, name_scope) as result:
+      result.name_scope = name_scope
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def text_conditional_in_context_manager(self):
+
+    def test_fn(x):
+      with name_scope(''):
+        if x > 0:
+          return x * x
+        else:
+          return x
+
+    with self.compiled_fn(test_fn) as result:
+      result.name_scope = name_scope
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_no_return(self):
+
+    def test_fn(x):
+      x *= x
+
+    with self.compiled_fn(test_fn) as result:
+      self.assertEqual(test_fn(2), result.test_fn(2))
+
+  def test_nested_functiondefs(self):
+
+    def test_fn(x):
+
+      def inner_fn(y):
+        if y > 0:
+          return y * y
+        else:
+          return y
+
+      return inner_fn(x)
+
+    with self.compiled_fn(test_fn) as result:
+      for x in [-2, 2]:
+        self.assertEqual(test_fn(x), result.test_fn(x))
+
+  def test_loop(self):
+
+    def test_fn(x):
+      for _ in range(10):
+        return x
+      return x
+
+    with self.assertRaises(ValueError):
+      self.compiled_fn(test_fn)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d62390494b78c415212ba91ac914cdfee324f971
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
@@ -0,0 +1,1919 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Dev Summit 2018 - Autograph",
+      "version": "0.3.2",
+      "views": {},
+      "default_view": {},
+      "provenance": [
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python2",
+      "display_name": "Python 2"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "id": "g7nGs4mzVUHP",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Experimental: TF Autograph\n",
+        "**TensorFlow Dev Summit, 2018.**\n",
+        "\n",
+        "This interactive notebook demonstrates **autograph**, an experimental source-code transformation library to automatically convert TF.Eager and Python code to TensorFlow graphs.\n",
+        "\n",
+        "**Note: this is pre-alpha software!** The notebook works best with Python 2, for now.\n",
+        "\n",
+        "> ![alt text](https://lh3.googleusercontent.com/QOvy0clmg7siaVKzwmSPAjicWWNQ0OeyaB16plDjSJMf35WD3vLjF6mz4CGrhSHw60HnlZPJjkyDCBzw5XOI0oBGSewyYw=s688)\n",
+        "\n",
+        "### Table of Contents\n",
+        "1. _Write Eager code that is fast and scalable._\n",
+        "2. _Case study: complex control flow._\n",
+        "3. _Case study: training MNIST with Keras._\n",
+        "4. _Case study: building an RNN._"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "uFcgBENZqkB2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# Install TensorFlow; note that Colab notebooks run remotely, on virtual\n",
+        "# instances provided by Google.\n",
+        "!pip install -U -q tf-nightly"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Pa2qpEmoVOGe",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "ZVKfj5ttVkqz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 1. Write Eager code that is fast and scalable\n",
+        "\n",
+        "TF.Eager gives you more flexibility while coding, but at the cost of losing the benefits of TensorFlow graphs. For example, Eager does not currently support distributed training, exporting models, and a variety of memory and computation optimizations.\n",
+        "\n",
+        "Autograph gives you the best of both worlds: write your code in an Eager style, and we will automatically transform it into the equivalent TF graph code. The graph code can be executed eagerly (as a single op), included as part of a larger graph, or exported."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "snaZRFdWd9ym",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For example, autograph can convert a function like this:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9__n8cSIeDnD",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def g(x):\n",
+        "  if x > 0:\n",
+        "    x = x * x\n",
+        "  else:\n",
+        "    x = 0\n",
+        "  return x"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "gq0eQcuReHET",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "... into a TF graph-building function:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sELSn599ePUF",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 413
+        },
+        "outputId": "bb0c7216-1ca3-4da1-d1fb-589902cdcd1a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737505,
+          "user_tz": 240,
+          "elapsed": 243,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "print(autograph.to_code(g))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "from tensorflow.contrib.autograph.impl import api as autograph_api\n",
+            "from tensorflow.contrib.autograph import utils as autograph_utils\n",
+            "\n",
+            "def tf__g(x):\n",
+            "  with tf.name_scope('g'):\n",
+            "\n",
+            "    def if_true():\n",
+            "      with tf.name_scope('if_true'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = x_1 * x_1\n",
+            "        return x_1,\n",
+            "\n",
+            "    def if_false():\n",
+            "      with tf.name_scope('if_false'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = 0\n",
+            "        return x_1,\n",
+            "    x = autograph_utils.run_cond(tf.greater(x, 0), if_true, if_false)\n",
+            "    return x\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "j74n-8hEe6dk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can then use the converted function as you would any regular TF op -- you can pass `Tensor` arguments and it will return `Tensor`s:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AkVaY0-dfEbH",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "4ffe3757-c44d-424c-c2a8-7ddc973bfcce",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737841,
+          "user_tz": 240,
+          "elapsed": 257,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tf_g = autograph.to_graph(g)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "\n",
+        "  g_ops = tf_g(tf.constant(9))\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    tf_g_result = sess.run(g_ops)\n",
+        "\n",
+        "  print('g(9) = %s' % g(9))\n",
+        "  print('tf_g(9) = %s' % tf_g_result)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "g(9) = 81\n",
+            "tf_g(9) = 81\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "trrHQBM1VnD0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 2. Case study: complex control flow\n",
+        "\n",
+        "Autograph can convert a large chunk of the Python language into graph-equivalent code, and we're adding new supported language features all the time. In this section, we'll give you a taste of some of the functionality in autograph.\n",
+        "Autograph will automatically convert most Python control flow statements into their correct graph equivalent.\n",
+        "  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "u0YG3DPgZxoW",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We support common statements like `while`, `for`, `if`, `break`, `return` and more. You can even nest them as much as you like. Imagine trying to write the graph version of this code by hand:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xJYDzOcrZ8pI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "6c244ee4-b141-4ad6-eefa-cfffa71f33c6",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738402,
+          "user_tz": 240,
+          "elapsed": 483,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def sum_even(numbers):\n",
+        "  s = 0\n",
+        "  for n in numbers:\n",
+        "    if n % 2 > 0:\n",
+        "      continue\n",
+        "    s += n\n",
+        "  return s\n",
+        "\n",
+        "\n",
+        "tf_sum_even = autograph.to_graph(sum_even)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    result = sess.run(tf_sum_even(tf.constant([10, 12, 15, 20])))\n",
+        "\n",
+        "  print('Sum of even numbers: %s' % result)\n",
+        "  \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(sum_even))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Sum of even numbers: 42\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "_YXo4KOcbKrn",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Try replacing the `continue` in the above code with `break` -- Autograph supports that as well!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xHmC0rBIavW_",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The Python code above is much more readable than the matching graph code. Autograph takes care of tediously converting every piece of Python code into the matching TensorFlow graph version for you, so that you can quickly write maintainable code, but still benefit from the optimizations and deployment benefits of graphs."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UEHWGpBXbS7g",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's try some other useful Python constructs, like `print` and `assert`. We automatically convert Python `assert` statements into the equivalent `tf.Assert` code.  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qUU57xlEbauI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "add3db4a-2077-4dd5-f7a7-a5b5a4529c26",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738697,
+          "user_tz": 240,
+          "elapsed": 253,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(x):\n",
+        "  assert x != 0, 'Do not pass zero!'\n",
+        "  return x * x\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    try:\n",
+        "      print(sess.run(tf_f(tf.constant(0))))\n",
+        "    except tf.errors.InvalidArgumentError as e:\n",
+        "      print('Got error message: %s' % e.message)\n",
+        "      \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Got error message: assertion failed: [Do not pass zero!]\n",
+            "\t [[Node: f/Assert/Assert = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "w5hBZaVJbck4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can also use `print` functions in-graph:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "6NdzRKLEboRv",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "fb82dfc3-790f-4127-87f6-361805be9e9b",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345739013,
+          "user_tz": 240,
+          "elapsed": 247,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_sign(n):\n",
+        "  if n >= 0:\n",
+        "    print(n, 'is positive!')\n",
+        "  else:\n",
+        "    print(n, 'is negative!')\n",
+        "  return n\n",
+        "\n",
+        "\n",
+        "tf_print_sign = autograph.to_graph(print_sign)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf_print_sign(tf.constant(1)))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_sign))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1 is positive!\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9u_Z3i3AivLA",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We can convert lists to TensorArray, so appending to lists also works, with a few modifications:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "MjhCQJVuiTNR",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "dc320b87-595b-4392-d29c-994486fd8a0a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345744470,
+          "user_tz": 240,
+          "elapsed": 5391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(n):\n",
+        "  numbers = []\n",
+        "  # We ask you to tell us about the element dtype.\n",
+        "  autograph.utils.set_element_type(numbers, tf.int32)\n",
+        "  for i in range(n):\n",
+        "    numbers.append(i)\n",
+        "  return numbers.stack() # Stack the list so that it can be used as a Tensor\n",
+        "\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    print(sess.run(tf_f(tf.constant(5))))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[0 1 2 3 4]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UdG8ZFrkTAF2",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "And all of these functionalities, and more, can be composed into more complicated code:\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DVs6wt8NKaGQ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "cellView": "code",
+        "outputId": "0a4b8d08-8f65-4bbc-85ba-dc4c60563519",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345745186,
+          "user_tz": 240,
+          "elapsed": 658,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_primes(n):\n",
+        "  \"\"\"Returns all the prime numbers less than n.\"\"\"\n",
+        "  assert n > 0\n",
+        "  \n",
+        "  primes = []\n",
+        "  autograph.utils.set_element_type(primes, tf.int32)\n",
+        "  for i in range(2, n):\n",
+        "    is_prime = True\n",
+        "    for k in range(2, i):\n",
+        "      if i % k == 0:\n",
+        "        is_prime = False\n",
+        "        break\n",
+        "    if not is_prime:\n",
+        "      continue\n",
+        "    primes.append(i)\n",
+        "  all_primes = primes.stack()\n",
+        "\n",
+        "  print('The prime numbers less than', n, 'are:')\n",
+        "  print(all_primes)\n",
+        "  return tf.no_op()\n",
+        "\n",
+        "    \n",
+        "tf_print_primes = autograph.to_graph(print_primes)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    n = tf.constant(50)\n",
+        "    sess.run(tf_print_primes(n))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_primes))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "The prime numbers less than 50 are:\n",
+            "[ 2  3  5  7 11 13 17 19 23 29 31 37 41 43 47]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "JQ8kQT99VqDk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 3. Case study: training MNIST with Keras\n",
+        "\n",
+        "As we've seen, writing control flow in Autograph is easy. So running a training loop in graph should be easy as well!\n",
+        "\n",
+        "Here, we show an example of such a training loop for a simple Keras model that trains on MNIST."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0CrtGWgwuLJr",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import gzip\n",
+        "import shutil\n",
+        "\n",
+        "from six.moves import urllib\n",
+        "\n",
+        "\n",
+        "def download(directory, filename):\n",
+        "  filepath = os.path.join(directory, filename)\n",
+        "  if tf.gfile.Exists(filepath):\n",
+        "    return filepath\n",
+        "  if not tf.gfile.Exists(directory):\n",
+        "    tf.gfile.MakeDirs(directory)\n",
+        "  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'\n",
+        "  zipped_filepath = filepath + '.gz'\n",
+        "  print('Downloading %s to %s' % (url, zipped_filepath))\n",
+        "  urllib.request.urlretrieve(url, zipped_filepath)\n",
+        "  with gzip.open(zipped_filepath, 'rb') as f_in, open(filepath, 'wb') as f_out:\n",
+        "    shutil.copyfileobj(f_in, f_out)\n",
+        "  os.remove(zipped_filepath)\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def dataset(directory, images_file, labels_file):\n",
+        "  images_file = download(directory, images_file)\n",
+        "  labels_file = download(directory, labels_file)\n",
+        "\n",
+        "  def decode_image(image):\n",
+        "    # Normalize from [0, 255] to [0.0, 1.0]\n",
+        "    image = tf.decode_raw(image, tf.uint8)\n",
+        "    image = tf.cast(image, tf.float32)\n",
+        "    image = tf.reshape(image, [784])\n",
+        "    return image / 255.0\n",
+        "\n",
+        "  def decode_label(label):\n",
+        "    label = tf.decode_raw(label, tf.uint8)\n",
+        "    label = tf.reshape(label, [])\n",
+        "    return tf.to_int32(label)\n",
+        "\n",
+        "  images = tf.data.FixedLengthRecordDataset(\n",
+        "      images_file, 28 * 28, header_bytes=16).map(decode_image)\n",
+        "  labels = tf.data.FixedLengthRecordDataset(\n",
+        "      labels_file, 1, header_bytes=8).map(decode_label)\n",
+        "  return tf.data.Dataset.zip((images, labels))\n",
+        "\n",
+        "\n",
+        "def mnist_train(directory):\n",
+        "  return dataset(directory, 'train-images-idx3-ubyte',\n",
+        "                 'train-labels-idx1-ubyte')\n",
+        "\n",
+        "def mnist_test(directory):\n",
+        "  return dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zu1U9Nqir6L",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "First, we'll define a small three-layer neural network using the Keras API"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "x_MU13boiok2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def mlp_model(input_shape):\n",
+        "  model = tf.keras.Sequential([\n",
+        "      tf.keras.layers.Dense(100, activation='relu', input_shape=input_shape),\n",
+        "      tf.keras.layers.Dense(100, activation='relu'),\n",
+        "      tf.keras.layers.Dense(10, activation='softmax')])\n",
+        "  model.build()\n",
+        "  return model"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Wuqg3H8mi0Xj",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's connect the model definition (here abbreviated as `m`) to a loss function, so that we can train our model."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "W51sfbONiz_5",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def predict(m, x, y):\n",
+        "  y_p = m(x)\n",
+        "  losses = tf.keras.losses.categorical_crossentropy(y, y_p)\n",
+        "  l = tf.reduce_mean(losses)\n",
+        "  accuracies = tf.keras.metrics.categorical_accuracy(y, y_p)\n",
+        "  accuracy = tf.reduce_mean(accuracies)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "035tNWQki9tr",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Now the final piece of the problem specification (before loading data, and clicking everything together) is backpropagating the loss through the model, and optimizing the weights using the gradient."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "CsAD0ajbi9iZ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def fit(m, x, y, opt):\n",
+        "  l, accuracy = predict(m, x, y)\n",
+        "  opt.minimize(l)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "PcVRIacKjSwb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "These are some utility functions to download data and generate batches for training"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "RVw57HdTjPzi",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def setup_mnist_data(is_training, hp, batch_size):\n",
+        "  if is_training:\n",
+        "    ds = mnist_train('/tmp/autograph_mnist_data')\n",
+        "    ds = ds.shuffle(batch_size * 10)\n",
+        "  else:\n",
+        "    ds = mnist_test('/tmp/autograph_mnist_data')\n",
+        "  ds = ds.repeat()\n",
+        "  ds = ds.batch(batch_size)\n",
+        "  return ds\n",
+        "\n",
+        "def get_next_batch(ds):\n",
+        "  itr = ds.make_one_shot_iterator()\n",
+        "  image, label = itr.get_next()\n",
+        "  x = tf.to_float(tf.reshape(image, (-1, 28 * 28)))\n",
+        "  y = tf.one_hot(tf.squeeze(label), 10)\n",
+        "  return x, y"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zEJH5XNjgFz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "This function specifies the main training loop. We instantiate the model (using the code above), instantiate an optimizer (here we'll use SGD with momentum, nothing too fancy), and we'll instantiate some lists to keep track of training and test loss and accuracy over time.\n",
+        "\n",
+        "In the loop inside this function, we'll grab a batch of data, apply an update to the weights of our model to improve its performance, and then record its current training loss and accuracy. Every so often, we'll log some information about training as well."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UUI0566FjZPx",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(train_ds, test_ds, hp):\n",
+        "  m = mlp_model((28 * 28,))\n",
+        "  opt = tf.train.MomentumOptimizer(hp.learning_rate, 0.9)\n",
+        "  train_losses = []\n",
+        "  train_losses = autograph.utils.set_element_type(train_losses, tf.float32)\n",
+        "  test_losses = []\n",
+        "  test_losses = autograph.utils.set_element_type(test_losses, tf.float32)\n",
+        "  train_accuracies = []\n",
+        "  train_accuracies = autograph.utils.set_element_type(train_accuracies,\n",
+        "                                                      tf.float32)\n",
+        "  test_accuracies = []\n",
+        "  test_accuracies = autograph.utils.set_element_type(test_accuracies,\n",
+        "                                                     tf.float32)\n",
+        "  i = tf.constant(0)\n",
+        "  while i < hp.max_steps:\n",
+        "    train_x, train_y = get_next_batch(train_ds)\n",
+        "    test_x, test_y = get_next_batch(test_ds)\n",
+        "    step_train_loss, step_train_accuracy = fit(m, train_x, train_y, opt)\n",
+        "    step_test_loss, step_test_accuracy = predict(m, test_x, test_y)\n",
+        "    if i % (hp.max_steps // 10) == 0:\n",
+        "      print('Step', i, 'train loss:', step_train_loss, 'test loss:',\n",
+        "            step_test_loss, 'train accuracy:', step_train_accuracy,\n",
+        "            'test accuracy:', step_test_accuracy)\n",
+        "    train_losses.append(step_train_loss)\n",
+        "    test_losses.append(step_test_loss)\n",
+        "    train_accuracies.append(step_train_accuracy)\n",
+        "    test_accuracies.append(step_test_accuracy)\n",
+        "    i += 1\n",
+        "  return (train_losses.stack(), test_losses.stack(),  train_accuracies.stack(),\n",
+        "          test_accuracies.stack())"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "cYiUQ1ppkHzk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Everything is ready to go, let's train the model and plot its performance!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "K1m8TwOKjdNd",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 988
+        },
+        "outputId": "f9d3eef3-5bea-45c1-ddf9-4edee73e4436",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345800262,
+          "user_tz": 240,
+          "elapsed": 52391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "with tf.Graph().as_default():\n",
+        "  hp = tf.contrib.training.HParams(\n",
+        "      learning_rate=0.05,\n",
+        "      max_steps=500,\n",
+        "  )\n",
+        "  train_ds = setup_mnist_data(True, hp, 50)\n",
+        "  test_ds = setup_mnist_data(False, hp, 1000)\n",
+        "  tf_train = autograph.to_graph(train)\n",
+        "  (train_losses, test_losses, train_accuracies,\n",
+        "   test_accuracies) = tf_train(train_ds, test_ds, hp)\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    (train_losses, test_losses, train_accuracies,\n",
+        "     test_accuracies) = sess.run([train_losses, test_losses, train_accuracies,\n",
+        "                                  test_accuracies])\n",
+        "    plt.title('MNIST train/test losses')\n",
+        "    plt.plot(train_losses, label='train loss')\n",
+        "    plt.plot(test_losses, label='test loss')\n",
+        "    plt.legend()\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Loss')\n",
+        "    plt.show()\n",
+        "    plt.title('MNIST train/test accuracies')\n",
+        "    plt.plot(train_accuracies, label='train accuracy')\n",
+        "    plt.plot(test_accuracies, label='test accuracy')\n",
+        "    plt.legend(loc='lower right')\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Accuracy')\n",
+        "    plt.show()"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/train-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/train-labels-idx1-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/t10k-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/t10k-labels-idx1-ubyte.gz\n",
+            "Step 0 train loss: 2.244329 test loss: 2.2499208 train accuracy: 0.12 test accuracy: 0.161\n",
+            "Step 50 train loss: 0.64771986 test loss: 0.56013924 train accuracy: 0.82 test accuracy: 0.836\n",
+            "Step 100 train loss: 0.49011207 test loss: 0.42143965 train accuracy: 0.84 test accuracy: 0.879\n",
+            "Step 150 train loss: 0.3768609 test loss: 0.39319593 train accuracy: 0.88 test accuracy: 0.883\n",
+            "Step 200 train loss: 0.36007702 test loss: 0.37089333 train accuracy: 0.9 test accuracy: 0.881\n",
+            "Step 250 train loss: 0.182115 test loss: 0.28543878 train accuracy: 0.94 test accuracy: 0.915\n",
+            "Step 300 train loss: 0.2119576 test loss: 0.22305593 train accuracy: 0.92 test accuracy: 0.93\n",
+            "Step 350 train loss: 0.12932214 test loss: 0.29057172 train accuracy: 0.96 test accuracy: 0.906\n",
+            "Step 400 train loss: 0.22937602 test loss: 0.2200287 train accuracy: 0.92 test accuracy: 0.925\n",
+            "Step 450 train loss: 0.23444137 test loss: 0.19857481 train accuracy: 0.94 test accuracy: 0.94\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3XmAFNW9Pvynlt5mYdhmQMHggnGN\nS9zCD0ElKug1edUY9ZoQTYze3GuiRk1uYjRqRHNj4n5NrhKjiUYlbihGQFRUFDSoKIvgICAO6+xL\n711V5/2jlq7qZaZnpnumZ3g+/zjTXV1dXSP91PecU+dIQggBIiIiGjLkwT4AIiIi6h2GNxER0RDD\n8CYiIhpiGN5ERERDDMObiIhoiGF4ExERDTEMb6JeOOigg3DllVdmPf6rX/0KBx10kGe766+/3rPN\ne++9h9mzZwMAtm3bhkMPPdR57osvvsCPfvQjzJw5EzNnzsTZZ5+NV199FQBw0003YdasWZg1axYO\nO+wwnHLKKc7v4XDY8x7JZBLz58/v9edavXo1Lr300oK2XbBgAebMmdPn97J19/rZs2fjhRde6PO+\niYY7hjdRL3366aee0Ewmk1izZk3WditXrsQnn3xS0D6vu+46TJs2DYsXL8bixYtxyy234LrrrsPO\nnTtxyy23YNGiRVi0aBHGjRuH3//+987vVVVVnv188sknfQrUI444Ag8//HBB2y5fvhxTpkzp83vZ\n+vt6oj0Zw5uol0444QQsWbLE+f3tt9/GV77ylaztrrnmGtx+++0F7bO+vh5HHnmk8/uRRx6JxYsX\nY/z48QUfV3NzM3784x/jo48+wkUXXQTAbAF48MEHMXPmTOi6jlWrVuHcc8/FrFmzcOaZZ2L58uUA\nzFaB0047DQBw//334ze/+Q2uuOIKfP3rX8d5552HxsZG533ee+89HHzwwVnv9cEHH+Bb3/oWTjvt\nNJx//vloaGgAAOzevRsXX3wxzjzzTJx66qm4++67cx5rPu+99x7OOecczJo1C9/+9redC6Vc++3u\ncSEE/vd//xczZ87EKaecgjlz5kDXdQDAwoULcdZZZ+GMM87AN77xDbz33nsFn3eiwcDwJuqlM844\nAy+99JLz+z//+U/MmjUr53ZCCCxatKjHfU6fPh1XXnkl/va3v2HTpk0AgHHjxkGSpIKPa+zYsbjm\nmmtw1FFH4YknnnAeF0Jg8eLFUBQFv/71r3HppZdi0aJFuPzyy3HTTTfl3NeiRYtw/fXX49VXX8WY\nMWPw7LPPAgA2bdqE2tpaTJgwwfNe4XAY//mf/4lrrrkGS5Yswfe+9z1cddVVAIBHH30Uxx13HF5+\n+WUsWLAADQ0NMAwj57FmikQiuOqqq3DDDTdg0aJF+OEPf4jrrrsOhmHk3G9jY2Pex1944QUsWrQI\nzzzzDJYsWYKGhgY8+eSTAIBbbrkFDz74IBYuXIibbroJr7/+esHnnWgwMLyJeun444/Hxo0b0dLS\nglgshlWrVmHKlCk5t73++uvxhz/8AYlEott9/v73v8d3vvMdLFiwAGeddRZmzJjhBEt/nXzyyc7P\n8+fPxxlnnAEAOOaYY5zqONOxxx6LCRMmQJIkHHLIIdi5cycAYMWKFTk/6wcffIBx48Zh6tSpAICz\nzjoLX3zxBXbs2IExY8bg7bffxvvvvw+/34+77roLdXV1BR376tWrMX78eBxzzDEAgJkzZ6KtrQ3b\nt2/Pu998jy9duhTf+ta3UF1dDVVV8e1vfxuvvPIKAGDMmDF46qmnsH37dhx77LH45S9/WdjJJRok\n6mAfANFQoygKTj/9dCxcuBCjR4/GiSeeCFXN/U/psMMOw3HHHYdHHnkERx99dN59BgIBXHrppbj0\n0kvR2dmJRYsW4fbbb8fEiRMxbdq0fh3vyJEjnZ8XLFiAv/3tb4hEIjAMA/mWNqiurnZ+VhTFaV5+\n5513cMkll2Rt39nZiYaGBk8LhN/vR2trKy655BIYhoFbbrkFjY2N+M53voOf/OQnBR17a2srRowY\nkXVsLS0tefeb7/Guri48/PDDmDdvHgBA13WMHj0aAPCnP/0Jf/rTn3Duuedir732wvXXX4/jjz++\noGMkGgwMb6I+OPPMM3H33Xdj1KhRPfbZ/vSnP8W5556LiRMn5ny+tbUV69evd6rWESNG4Pzzz8ey\nZctQX1/f7/C27d69GzfccAOefvppHHLIIfj8888xc+bMgl+vaRrWrFmT8yKkrq4O+++/P5577rmc\nr7388stx+eWXY8uWLbjsssucSronY8aMQXt7u/O7EAIdHR0YM2YMVFXNud+pU6fmfLyurg4zZszA\nd7/73az3+dKXvoTf/va3MAwD8+fPx7XXXotly5YVeGaIBh6bzYn64Oijj0ZjYyM2btzYY4VWV1eH\n73znO7j//vtzPh+Px3HllVd6wmLr1q34+OOPceyxx/bquFRVRTgczllRt7a2oqKiAvvvvz80TXMq\n0EgkUtC+V69ejYMOOgh+vz/rvY488kg0NTXh448/BgA0NDTgZz/7GYQQ+PWvf4133nkHgBmSY8eO\nhSRJ3R6r7YgjjkBzczNWrVoFwBxfMH78eEycODHvfvM9/vWvfx0vvPACYrEYAOCpp57C888/j9bW\nVnz/+99HOByGLMs48sgjezXWgGgwsPIm6gNJknDaaachFotBlnu+Bv7BD36Ap59+Oudze++9N/70\npz/hvvvuw5w5cyCEQFVVFX75y196RqAX4phjjsEf/vAHTJs2DW+++abnuYMPPhjTp0/HzJkzMWbM\nGPziF7/Ahx9+iNmzZ+O///u/e9y3fYtYvve67777cOuttyISicDn8+Gqq66CJEm48MIL8etf/xq3\n3norhBCYMWMGpkyZgh07dnheryhK1ntWVFTgnnvuwa233opoNIrRo0fjrrvu6na/I0eOzPk4AGzc\nuBHnnHMOADPYb7vtNowePRrTpk3Dt771LSiKAp/Ph9tuu61X551ooElcz5uIiGhoYbM5ERHREMPw\nJiIiGmIY3kREREMMw5uIiGiIYXgTERENMUPmVrGmpq6i7m/UqAq0tUWLus89Ec9j//Ec9h/PYXHw\nPPZfsc9hbW11zsf32MpbVbPvKaXe43nsP57D/uM5LA6ex/4bqHO4x4Y3ERHRUMXwJiIiGmIY3kRE\nREMMw5uIiGiIYXgTERENMQxvIiKiIYbhTURENMQwvImIaNh6443XCt723nvvxI4d23vc7sMP38cN\nN/y8P4fVbwxvIiIalnbu3IFXX11c8PZXXXUt9t57QgmPqHiGzPSoREREvXHXXb/D+vXr8Mgjc2EY\nBnbs2I6dO3fgnnv+iN/+9jdoampELBbDD35wOaZOnYYf//hyXHPNz7F06WuIRML44out2L59G668\n8lpMmTI153u89toSzJv3dyiKgoMOOgS33XYL6us34M47fwefzwe/349bbvktdu7cnvVYdXXuqU8L\nsceGd0c4gfc3NOLYg+sG+1CIiIa9f7z+GVZuaCzqPo87uA7nz5ic9/l///fZeO65f+D7378MDz/8\nIDQthT/+8c9oa2vF8cd/DWeccRa2b9+GG2/8BaZOneZ5bWPjbvzhD/fh3XeX44UXns0Z3tFoFA89\n9AAeeeQJVFRU4Oc//yneffddvPzyyzjnnPMwa9a/4YMPVqK1tQUvv7wg6zGGdx9ceecbaO2M46ZL\njsOk8X0/gURENDQccshhAIDq6hFYv34dXnzxOUiSjM7OjqxtjzjiKABAXV0dwuFwzv01NHyBiRO/\nhIqKCgDA0Ucfg/Xr1+PEE0/CH/7wP2ho+AJf//ppmDRp35yP9cceGd5b23YiPOFNSMnD0dwRZ3gT\nEZXY+TMmd1slDwSfzwcAWLJkETo7O/HAA39GZ2cnfvjD2VnbKkp6gREhRM79SZL3OU1LQZJCOPbY\n4/HnP/8Ny5cvw5w5N+PHP74652Nf/eqxff4se2R4f7ztCyjVbTBG70RLZ3ywD4eIiEpAlmXoup71\neHt7O/baa2/Isow333wdqVSqT/vfZ59J2LbtC0SjEVRUVGLVqg9x1VU/xrPPzsOUKSfi9NPPgBAC\n9fUbsGXLpqzHGN69dPykA7G4CZArO9DSwfAmIhqOJk3aD59+ugH33XcnKiurnMdPPnkGfvGLa/DJ\nJ2vxb//2TdTV1eGRR+b2ev+hUAhXXHEVrr32J5AkGUcccRSOPfZY7NzZghtv/AWqqqrg8/lw/fU3\nob7+06zH+kMS+doDykxTU1dR93fjit+ipTOCQyLn4yfnHlHUfe9Jamuri/632dPwHPYfz2Fx8Dz2\nX7HPYW1t7m7dPfY+7y+P2Q+SL4mmcOtgHwoREVGv7LHhPbFmPACgLdk2yEdCRETUO3tseI8JjQIA\nxBFGPKkN8tEQEREVbs8N74rRAADJH+egNSIiGlL22PAeW2FW3pI/xtvFiIhoSNljw3uME96svImI\naGjZY8M75AvCLwcg+eNoZuVNRDQs9WZJUNtHH32ItjbvnUjlsAyo2x4b3gAwMlDDypuIaJjq7ZKg\ntn/+88Ws8C43e+QMa7a6ijFojDWiqSt7UnoiIhra3EuCXnDBRbj99lvQ1dUFXddx9dU/w+TJB+Lx\nxx/Fm28uhSzLmDp1Gg455FAsW/YGtmzZjDlz7sD48eOz9pu5DOjVV1/nLANaWRkCIJdkGVC3PTy8\nxwItQKfWPtiHQkQ0rD332UtY1bimqPs8uu4rOHfyWXmfdy8J+uijf8YJJ/w/fOMbZ2PLls24994/\n4J57/oinnnoc8+cvgqIomD//WRx33NcwefKXcc01P88Z3LmWAf3ww/fx1ltLcc4552H27AuxaNHr\nJVkG1G2PDu/a0FgAQAysvImIhrM1a1ajvb0Nixe/DABIJMzu0pNP/jquvvq/cNpps3D66bN63E+u\nZUDr6zc4S362tOzClCknlWQZULc9OrzrKszwTildMISALEmDfERERMPTuZPP6rZKLjWfT8VPf/oz\nHH64dy2L6677JbZu/Ryvv74EP/nJf+Chh/7a7X5yLQMaCAScJT/XrFlZsmVA3fboAWt25Y1gFNE4\nZ1kjIhpO3EuCHnro4XjrrTcAAFu2bMZTTz2OcDiMRx6Zi0mT9sX3v38ZqqtrEI1G8i4lCniXAQWA\nVas+xEEHHYpnn52Hzs4OfPOb38QFF1yE+voNzmOnn36G81ix7NGV96hgDSQhQw5EEYmnUBXyDfYh\nERFRkbiXBP3hD3+E2267Gf/1Xz+EYRi4+urrUFVVhfb2Nlx22fcQClXg8MOPwIgRNTjqqK/ihhv+\nG7/97Z3Yf/8DPPvMtQzokUcehVgsihtv/AVGjaoBIJdkGVC3PXZJUHvZtp++fgvicQM/O+pa7L/3\niKK+x56ASwj2H89h//EcFgfPY/9xSdABEpCCkNQUIvHUYB8KERFRQfb48A4qIUiqhs4oJ2ohIqKh\nYY8P7wo1BABoj4YH+UiIiIgKs8eHd6XPvFevIxEZ5CMhIiIqzB4f3iMClQCAjjjDm4iIhoY9PrxH\nVZgj+Xa1tw3ykRARERVmjw/v0RXm7WE7OjrQHk4M8tEQERH1bI8P70qfOWBNUpNYvallkI+GiIio\nZwxvn9nnDSWFpvbY4B4MERFRAUo6Peodd9yBDz74AJqm4T/+4z9w+umnO88tX74cd911FxRFwfTp\n03HFFVeU8lDysm8Vk9QUWjvZbE5EROWvZOH97rvvYuPGjZg3bx7a2tpwzjnneMJ7zpw5ePjhhzFu\n3Dh897vfxcyZMzF58uRSHU5eITVo/qBoaOviRC1ERFT+Shbexx13HI44wlx6bcSIEYjFYtB1HYqi\noKGhATU1Ndhrr70AACeddBJWrFgxKOHtV/wAAJ9foK2NlTcREZW/koW3oijOYuXPPPMMpk+fDkVR\nAABNTU0YPXq0s+3o0aPR0NDQ7f5GjaqAqipFPcba2mqM1M3K2+8XaI8kMXZsFSSu690r+SbOp8Lx\nHPYfz2Fx8Dz230Ccw5IvCfrqq6/imWeewV/+8pd+7aetLVqkIzLZK78IISBLMiTFQCKpY+u2NlQG\nuTRoobgKUf/xHPYfz2Fx8Dz237BYVWzZsmX4v//7P8ydOxfV1ekDqKurQ3Nzs/P77t27UVdXV8pD\nyUuSJPhlP2TVXHi9jYPWiIiozJUsvLu6unDHHXfgwQcfxMiRIz3PTZw4EeFwGNu2bYOmaVi6dCmm\nTp1aqkPpkV/xAbIZ3h2R5KAdBxERUSFK1mz+8ssvo62tDVdffbXz2AknnICDDjoIp512Gm6++WZc\ne+21AIAzzzwT++23X6kOpUd+xY9kyhxpHo5xXW8iIipvJQvvCy64ABdccEHe54877jjMmzevVG/f\nKwHFjw6YS4IyvImIqNzt8TOsAYBf9kMXZmhHGN5ERFTmGN4w+7wNGIBksPImIqKyx/BGeqIWyDrC\ncYY3ERGVN4Y3zD5vAGZ4s/ImIqIyx/CG2ecNAKrPYJ83ERGVPYY3rPu8AYRCEitvIiIqewxvpPu8\nQyEgHNMG+WiIiIi6x/BGus87GABiCQ26YQzyEREREeXH8Ea68g5YS3tH46y+iYiofDG8Afhls89b\nVc2KO57UB/NwiIiIusXwRrryllUBwGw6JyIiKlcMbwABJQAAzrKgrLyJiKicMbwBhFQzvCXVrLjj\nSVbeRERUvhjeAIKKNVJNNu/xjiVYeRMRUflieAMIWpW3IZkVd4yVNxERlTGGN4CQGgIAGJJZecdZ\neRMRURljeAMIWgPWdCQBsM+biIjKG8MbgCqrUCQFmhXe7PMmIqJyxvAGIEkSgmoAKWGFNytvIiIq\nYwxvS1AJImkkAABxTtJCRERljOFtCaoBJHQrvDlJCxERlTGGtyWkBpHQk1BkNpsTEVF5Y3hbgkoQ\nAgKBoOCtYkREVNYY3hZ7opZgCIiyz5uIiMoYw9sSVM0pUitCQCSWGuSjISIiyo/hbQlZ85sHQwJJ\nzUAixaZzIiIqTwxvi115B4IGAFbfRERUvhjeFrvP2+c3wzvM8CYiojLF8LbYzeYqw5uIiMocw9ti\nV96yz+zrZngTEVG5YnhbglblLavmbWIMbyIiKlcMb4tdeUNheBMRUXljeFtC1mhzQzJDm+FNRETl\niuFtCTK8iYhoiGB4W+w+b3tNb85vTkRE5YrhbfHJKmRJdtb0TunGIB8RERFRbgxviyRJCClBZ01v\nneFNRERliuHtElQDiGlxKLLEypuIiMoWw9slqAYR1xJQFRmaJgb7cIiIiHJieLsErWZzRQE0Vt5E\nRFSmGN4uITUAAQHVLxjeRERUthjeLj7Fb/5XNRjeRERUthjeLn7ZBwCQVYGUzj5vIiIqTwxvF5+s\nAgAU1YCm9b/ybutK4MEX16G5I9bvfREREdkY3i4+xay8FaU4fd5PvFqP9z7Zjb8u3NDvfREREdkY\n3i4+u9ncZ0ArQrN5PKl7/ktERFQMDG8Xu89bUQwYQsAw2O9NRETlh+HtYjebS4rZZM5Z1oiIqBwx\nvF2c0eayGdq8XYyIiMoRw9vF7vO2K+9i9HsTEREVG8PbxWk2tyvvItwuRkREVGwlDe/6+nqceuqp\nePzxx7OemzFjBi666CLMnj0bs2fPxu7du0t5KAWxK2/I5ujwfjebC1buRERUfGqpdhyNRnHrrbdi\nypQpebeZO3cuKisrS3UIvebPCG8OWCMionJUssrb7/dj7ty5qKurK9VbFF1Ws3mxwlsqzm6IiIiA\nElbeqqpCVbvf/U033YTt27fjmGOOwbXXXgtJGtyUs6dHFZLdbM5mbyIiKj8lC++eXHnllZg2bRpq\nampwxRVXYPHixZg1a1be7UeNqoCqKkU9htraas/vcf9IAIBqLi6Gqqpg1ja94fObp9enKv3aT7kb\nzp9toPAc9h/PYXHwPPbfQJzDQQvvs88+2/l5+vTpqK+v7za829qiRX3/2tpqNDV1eR4Lx1IAgJSW\nBAA0t4TRVBPo83ukkpq1Pz3rvYaLXOeReofnsP94DouD57H/in0O810IDMqtYl1dXbj00kuRTJoh\nuXLlShx44IGDcSge9mhzQ+KANSIiKl8lq7zXrl2L3/3ud9i+fTtUVcXixYsxY8YMTJw4Eaeddhqm\nT5+OCy64AIFAAIceemi3VfdA8St2n7dZMevs8yYiojJUsvA+/PDD8dhjj+V9/uKLL8bFF19cqrfv\nE6fyBitvIiIqX5xhzUWRFEiQYMCsvDnDGhERlSOGt4skSfApPqfy7uk+7x3hXXjsk38grsUH4vCI\niIgADOJo83Lll33QhTVKvIc+7/s+eghdyTDGVdTi9H1PGYjDIyIiYuWdKagEkDQSAAC9m8p7W2MY\nXckwACBpJAfk2IiIiACGd5bairGIGREEv/oqtic3593ulfcbnJ8lzn9KREQDiOGdYXyFORe7pGpY\nrb2af0N3i/ogT+tKRER7FoZ3hnGV6YVUVPjzbifAe8CJiGhwMLwzjK+oTf8iCquoZTabExHRAGJ4\nZxhfOc75OYEINEPLvaGn8GZ4ExHRwGF4Z6j2V+EHX/4h9I4xgCTQGm/r8TXs8iYiooHE8M5h/5pJ\nMLpGAQCaYq05t/GMV8tTebNXnIiISoHhnYOqSBApc7BaLJV7KVLhSmbeKkZERAOJ4Z2DqsiAYU4+\nl8g7AYsnvYmIiAYMwzsHVZEhDAUAkNBzh3chzeZERESlwPDOQVUkQDfDO5knvHuD4U5ERMXE8M5B\nkiQo1pot21o6cm8kvNsTERENFIZ3Hgp8AICV9TuwsyWS9TxHkhMR0WBheOdhhzdkHZ2R7pvO2SxO\nREQDieGdhyqlwzsX4bpXzBD5lw4lIiIqNoZ3HnZ4S0qe6VFd3EFORERUagzvPHyKz5yIRdaR1Lqv\nrA2w8iYiooHD8M5DlRXAUCApOpKp7KZzd7HNZnMiIhpIDO88fKp1r7esI5nqofJmszkREQ0ghnce\n5ixrKiRFQ0LLUXm7f2blTUREA4jhnYeqyAVX3nqe8GZBTkREpcDwzkOWJXN+c1lHIpljxLn7VjEO\nWCMiogHE8M7DMIQ5YE0WSGiprOe9zeY9lNicw4WIiIqI4Z2HYQhAN+c3j2mJ7rdlnzcREQ0ghnce\nuiGcZUFjqXj2Bp5bxdi5TUREA6eg8F67di2WLl0KALj77rtx8cUX4/333y/pgQ023RAQyQAAIKKH\nu92Wfd5ERDSQCgrvOXPmYL/99sP777+PNWvW4MYbb8R9991X6mMbVIYhIBIhAEBMdGU9z1vFiIho\nsBQU3oFAAPvuuy9ee+01nH/++Zg8eTJkeXi3uJuVdzfh7VmYhM3mREQ0cApK4FgshoULF+LVV1/F\niSeeiPb2dnR2dpb62AaVIQREMggASCLXet7pwK7f1pZzxDkXLCEiolIoKLyvueYaLFiwAD/96U9R\nVVWFxx57DJdcckmJD21w6a5m86Scq8873VS+uy2CpvZY9hZ2djPDiYioiNRCNvra176Gww8/HFVV\nVWhubsaUKVPw1a9+tdTHNqgMwwAMFUJToSvRrOfdlTckkQ5q9zZW5c0KnIiIiqmgyvvWW2/FwoUL\n0d7ejgsvvBCPP/44br755hIf2uD60rhqAIBIhKCrkawAzry3O3ezub1taY6RiIj2TAWF9yeffIJv\nf/vbWLhwIc455xzcc8892Lp1a6mPbVBdcsbB+N7Mg+DTqwFZR0cyo49fSieyxMqbiIgGUEHhbYfP\nG2+8gRkzZgAAkslk6Y6qDFQGfTj56AkIiBEAgMZok+d54b63WxI5VyGxA53ZTURExVRQeO+33344\n88wzEYlEcMghh2D+/Pmoqakp9bGVhZAwP+fOsDe8vROziJwBzcqbiIhKoaABa3PmzEF9fT0OOOAA\nAMDkyZNxxx13lPTAykW1MgotALZ37fY8nll557rXO+lrARSJfd5ERFRUBYV3PB7H66+/jnvvvReS\nJOGoo47C5MmTS31sZWGkbzSAXM3mmaPNvQm9qf1ztO31OvwVYyBaTy71YRIR0R6koGbzG2+8EeFw\nGBdeeCHOP/98NDc344Ybbij1sZWFmmAVhACi1uIkH3zaiBfe3gJkNJvruje869s2AQCUmhb2eRMR\nUVEVVHk3Nzfjrrvucn4/5ZRTMHv27JIdVDmpCKpAVIJm6ACAB55fCwA4cLLrukcS6EqGcdt7D+Oc\nyf+GQ8cchNZ4KwBApHzs8yYioqIqeHrUWCw9g1g0GkUi0f0a18NFZVAFhAxN1z2Pp9y/S8DqjlXY\nEdmFBz5+GADQEm8DAIhkiH3eRERUVAVV3hdccAHOOOMMHH744QCAdevW4aqrrirpgZWLiqAPEBI0\n4Q3vpK65fhMQGQndaoe3prLyJiKioioovM877zxMnToV69atgyRJuPHGG/HYY4+V+tjKgll5S9AN\n74xqKS0d3lLGgDUhhFN5QzYY3kREVFQFhTcA7LXXXthrr72c31evXl2SAyo3duWti+6azYUnoBN6\n0pk+VZJ1DlgjIqKi6vOi3HtKNVkZVCGEbC5U4pIy3GEunAFtABDX4+mnFH2POVdERDQw+hzekiQV\n8zjKVkVQBSBBhze8tYzKO2GkB/DFtHR4S7LOAWtERFRU3Tabn3TSSTlDWgiBtra2kh1UOamw+ryF\n8PZdp3QNAdd2yTzhzT5vIiIqtm7D+4knnhio4yhbiixDEjIMpKC5J2KR3TOsGXkrb7DPm4iIiqzb\n8J4wYcJAHUdZkyUJAgZSWrqpXJJdt4pJQMod3qmoazsDBpjeRERUPH3u8y5EfX09Tj31VDz++ONZ\nzy1fvhznnXceLrjgAjzwwAOlPIx+kyADEEhprn5v1R3eAkmRDu+2RIfn9ULSQEREVCwlC+9oNIpb\nb70VU6ZMyfn8nDlzcP/99+PJJ5/EO++8g88++6xUh9JviiRDSAaSrvD2VN4QSBnp9c2d8DbM0ysk\n721mRERE/VGy8Pb7/Zg7dy7q6uqynmtoaEBNTQ322msvyLKMk046CStWrCjVofSbLCkABOJJVwgr\n3klaUsIV3vF28wctCAAQYOVNRETFU7LwVlUVwWAw53NNTU0YPXq08/vo0aPR1NSUc9tyoMgyJFmg\nI5JuGpdUb+Wtwd1sboV3yhwf+2w3AAAgAElEQVSPzsqbiIiKqeAZ1gbbqFEVUFWlqPusra0uaDtV\nMU+TUFzXOlblLTQVkj/puQu8PWk1m1vhDVkv+L2GouH82QYKz2H/8RwWB89j/w3EORyU8K6rq0Nz\nc7Pz++7du3M2r7u1tUW7fb63amur0dTUVdC2spABCdi2M31vu2SHt+5zqvB9qvZGQ3gHuhJh87lU\nABIAQ9IKfq+hpjfnkXLjOew/nsPi4Hnsv2Kfw3wXAiUdbZ7PxIkTEQ6HsW3bNmiahqVLl2Lq1KmD\ncSgFUWTzNHVE0/3akDXz/m3NvP6pwlhMm+AdnCecZnP2eRMRUfGUrPJeu3Ytfve732H79u1QVRWL\nFy/GjBkzMHHiRJx22mm4+eabce211wIAzjzzTOy3336lOpR+UxUF0IHOqGvaU1UDdBWQzHu4fQhA\nldOn0y/7ENet5nb2eRMRURGVLLwPP/zwbpcNPe644zBv3rxSvX1RqbIV3rH0oDQoGoSuOn3fighA\nkdN98j7Fh6iumE0bDG8iIiqiQWk2H2pUK5S7oq7R5opZeUtOePuhSunwViU1fZ+3zGZzIiIqHoZ3\nAXyKGcrhuN3nLZzK2x6spghvs7kqqxCaFeYyK28iIioehncB/NatYl0xK7xlA5IkzD5v2A/5PM3m\nqqxA6NbvisaVxYiIqGgY3gXwWfeX68KqoJ3bxNzh7Tebyi2qpMIwrN9lnUuTEBFR0TC8C1Dh91k/\nmRHszGvuCm9J+Jy+cQBmFW5V3pJVeXdGk7j/2dXY1hgekOMmIqLhieFdAL/PCm/JmkdNsSpwwzXj\nm65mNJur6cpcMdf0/ufyrVi1sRn3Pbt6AI6aiIiGK4Z3ARTJOk2SgCSlK293szkMNavZ3A53STYr\nb3s98GSKA9iIiKjvGN4FUOxbwCSByqAPvoDVg+2qvCXd22yuuprNoegw2OlNRERFMmQWJhlMslV5\nS5JAZciHsGrAACB0BYmNR0EZ2QRZqvbcKqZIKgAZQpedypuIiKgYWHkXIN1sbuDEr4yHL2D1fRsq\njLbxSG35CoRhB7b9GsXZxu7zdkjSwBw4ERENSwzvAshWEP/7qZNx5tcmweczk9i5jxuArouMZnPV\n2UbKvM+bVTgREfUDw7sAduU9fmwIkiRB8dmVtyu8hchoNndV3jL7vImIqHgY3gWQrSVBDWGGtqxa\no8Vdo811XXjmNreb0IWuAIoGwzDSO2SzORER9QPDuwB2Fa1b4S1Z93m7m80NQzgD2wCkg1xXIUlA\nyuDiJEREVBwcbV4AO5S/6NyGz9o3A0rKfMJwVd6GAclVUctOs7n537iWXguceuetj3dgQm0lDti7\nZrAPhYioLDC8C2D3eS/e+joAQLZWGfMMWDPSk7CYr/Fuc+fqe3AELhqQ4x1OYgkNjy7cAAD4yy9m\nDPLREBGVBzabF0B29WUDgJCyJ2nRDYHHXql3frfDW1LNKj2hJyBghntnJIkHnlsDg6POe6TpRs8b\nERHtYRjeBVAk72kSMMy7vQxvn/fGhnbXa8zntN2T0tsgXZl/UN+Enc2REh0xERENZwzvAmSGNwAr\nuNN93PGk7unztkebG51jobWMN3+Gd05znfeP9YhniIgoG8O7ALKsZD+oe4cLRGIpz+8KXK+xKnQD\n3hHnDO+esWeBiCgbw7sAco7KWxgZ/eAwB1c5r5HdK45Z94lL3srbYHj3iOdoePvX+t247I6l2N0a\nHexDIRpSGN4FyN9s7hV2Vd+K69TaQS/YbN5rDO/h7c8vfQLdEFi2eudgHwrRkMLwLkDmaHMATjXt\n5g7j5vZk+glhbtssbYLkT1cYKY6k7hFH5A9v/PMS9Q3DuwC5Ku9RVaFuX7P4vW3pX6yg362uQ/Co\nt5yHUymGd08Y3kRE2RjeBcjV5z1+VBWqQj4AQCjQw1w3OZrYAVbehWCzORFRNoZ3AZQczeaqrDrL\nfFZX+Lp9vcjRxA4AyZSe83FKY3jvGbhWD1HvMLwLIOf4ZnEv/1kdyg7vQyeNxrUXHoWvHTouu/KW\nNQACb3e8jHe2vwcAWPDOFsxd8ElRj3s4YHYTEWVjeBcgksq+jcW9/Gd1hT/r+ZHVfhy272jzOeE9\nzZI/DskfxxfJT/HEp88CAJ5ftgUr1u0q8pEPnLWbW7BibfGPn5U3EVE2hncBJo3YBwDw5VGTncfM\nZnPz5xGV6fAWmlmFj6kYCQCQ5exmcykQg+RPrzJmrxMOwGmKz2f+ss34+LPmPnyK0rrrHx9j7kvF\nbznggDUiomwM7wJU+6vwwIw7cOa+pzqPqa5Z1/yq7Axei6+ZisTGo3DUhP0BABUBNavZXArEIAVi\nzu+NkXQYdxdWndEkXnznc9z7zOr+faAS6unio7fKObxffGeLs+IZEdFAYnj3guIKbFVWPfNu71NX\nBQCoCYzAdbPOwKTx1QCAiqAv655wSUlB8qfD+4GP/+KsEa7p3YR3JJn3uXKR1Io7gr6cm83nL9uC\ntz7eMdiHMaSV8bUZUVljePeCu59blVQ4y2ZIwMGTRgEAamtCzs8AUBlSs/q8IQlP5d2aaIW692YA\ngN7N7WPhaCrvc+Wi2CPoyzm8iYgGC8O7F7Iqbye7JZxxwpfwzan74rJvHOp5TWXQlzUPOmTDCe+v\n7zPd3F9tAyBrWZV3Uk9i4ZZX0Z7oQGe0/CvvRLHD23U6/ufvH6KxPZZ/40HCC4y+4y1iRH3D8O4F\n9/3e7j5vSQJURcbZ0/ZH7UjvzGuVOZrNIRmQ/HEoIoBzDzwLB4a+AknVIPnj0DIq73/Uv4CXtryC\nFzctQke4/MM7WeRZ49x93vUN7Zj32sai7r8YONlO37HZnKhvGN69oHbT551PZUjN7vOWDEi+JFQj\nCAAwdOt5SUBzVXHtiQ6s2LnS+b0jo897xY6VWLBpUS8/RWkVu/IWGVVtOS7mknnBRURUagzvXvBU\n3pKCQtK7MuiDENnN5lBSkI0AAEDT0o+7+7zvWzU3/RJJdgashQLm/h7f8DQWbX0dulE+M7UVu887\nM6zLsYk6VeRBekREPWF490L2aHMzSLrrtzNvFcucpCUBSQIk3QzvlDUOTZIM6Fafd2ckieZYi/Oa\nqBZzKu+qjBndolr59AMnSthsnuv3cqAxvPuNfd9EvcPw7gXPaHO5h8VILLIsZd/n7TMnaJE0c3IX\n3S5WJQOaYQbBtX98C7rQceDIAwAAsVQMHZEEAMCvKp77qbuS4d5/mCJyH0vxR5tn/l4e4a27Dox9\n3n0nCup8IqJMDO9eUFyBrcqq606xHsoG4X1e8pshbM/GJgzreUkgkdTxxqrt0GWzyq5UK+BX/Ihq\nMcQTZjDqhkBcT8/QFklF+vyZisHdtF30Pu/MyrtMwlvT0sfR3b35ROXglZUNWL+1bbAPg4qI4d0L\n7nW9PQPWemzyywhvnxnMwqq8Dd16Xjbw7Fub8bfFn0JSzI5wvxxEhRpCTIs5FZ4hBLqS6cAO55h7\nfSC5w9tdeacMDXd/+Ces2LEy18sKkt1s3uddFZW72uaANSpniZSOp17biN8/uWqwD4WKiOHdC1kD\n1iw9ZXfdqFDOx42kWXk74S0Z2N5kNoFLqtkRHrDCO6rFnYFRhiEQTrnDe5Arb1d4ufu8t3Y24LP2\nLXh8w9N933eZjjZ3BzYHrFE5K5fWKiouhncvSK5RNYprkpae3Pz94zC+60QkPj3G83gqYTbD233e\nkiTgXApY06X65QBCaghxLY6UZm5oCOFpKh/sZnMtT+WtGVquzXsl84unXAasuQepsc+7H6w/Z+bY\nBiLqHsO7j3yyAvf0qN0J+lVMCnwZRkcthKv/OxaVYRgCuqvytkmqGXw+KYAKXxACAklh9pUbRmaz\n+WBX3q4+by0d3rmWUu2tzLDOvO97sLgvWDjavP/K5aJsOOK5HZ4Y3n2UOT1qTxTFOtVGeluR8iMS\nT0HX0n3e6RdYlbcUQIVaYT1vPmYIIJxKjzAPJ7NDMvMfbCKl4911u5zqvZjczebJZPrnLtcxLnrv\niz7tO/N7Rx/gLyJNN7BkZQOice+88u7AZp93/7Fpt3TKpauJiovh3UfmwiSmQu5R9dnh7VqkROgq\nWjsTcPLUGm0OpPu8VRFASA1ab2pW45l93pnN5l/s7sIPf7cUb3603XnsuTc346EFn2D+si0FfT63\ndVta8doH2/I+7xlt7ro4CLtuYVv4Xu/fF8jRbD7AX0TPv7UZT762EX9fUu953N1Uzmbz/mN4l065\ntFZRcTG8+6jQ+7xtimIlvHuFMV3BLY+uRDhid3obTsVsh7cCPypUc8CbZFXjhiE8TdI7Irs8s6wt\nX7sLAPDU6585jzU0dgEANu3o7NVxA8Cd8z7C35fU521+y9fn7b7/3JD7tiJa1gxrA/w9tHFbBwCg\ntTPheZwD1orD/nOyabd0mN3DE8O7j1RZ6dWiCnblLQz7vxKc028FuiS5m83NKlsRfgTUgPVYesBa\nXDPv8z669itoT3Rgbct656V2FSP7EqhvMwPcp5qj4/vTbJ6vOvKMNk+6wtvVIiAUb/gV/J5Z93kP\nbFC2dZnHPbI64Hnc22wuEI2nsO7z1gE9tuGEAVM6bNUYnhjefaRIhU2PalNVO6itjQ1X5S7Sk7TY\n7CpbNgLpJnopfatYzArv0yedAgB4a9sK57VOv/A+a3DvqofwcdNa+K33T/ajSszXt5tvkhZ35d3X\n8M5s8hvoUcntYfO4R1T4PY+nXIP0NM3AnfM+xp1PfYT6hvYBPb6hzv6nM9AXZW5CCCz9cBt2tQ7u\nfAmlwlaN4Ynh3UeqrOLkoycAAA760qgCtvc2m8vCHd7Wn8E1YE3yJyAMCbLwwWc10UtyepKWmBaD\nLBQ89sIuHDhyf2xo24hdkd3m7uzAC5lBMn/Ty/D5zPdI9WPu8XwDX9yjzd39v+5BdYbct+VMM9/S\nEAKabmTNvFYq9mfOfD8tY5KWLTvN7oimMlxvvJw5zeaD2POweWcnHnulHr+a++7gHUQJsfIenhje\nfeSTFXzntC/jjh9NwWH7ju5xe6fytprNZeSqvN3hHYNIhqDrrv512Wo2N4CYFofQfdi8vRMnjDfv\nH/+0bZP5vN1vnjJHqTdGmyGpZgWZ7EezuZ5jGlAhhGeeb/e0oRHXKHhD6Wt4Zw9Yu/z3b2DO3z7o\n0/56w90FkDkoTcszYI0LbPRNb6vD9zc04sEX1xWlqozEzC6q4VqgsvIenhjefaTKKmRJwtiRuWdP\ny9o+Y7S5e7S63Q9uN5srqg7Jn4RIhKDpBnyKtYqY5K6844BuTtEaUioBAM+/vRHN7bF0FaOkB4nF\nVXOFMvfgqriW7hMvRGbl3RZvx/ee+ylWtb0PqGY4u0MtYbgCW+1bs3n2gDXzd7vSLaXWrvT88cmM\nFgv3eXT/LGekdyyhIZbo/2Q1A03TDWzd1TVg79fb6vCP89fivU92Y3cBTd2vf7itV5/FMATunPeR\n526NoYyV9/DE8O4j91SphVCt0eb2JC2q5FrW0x6wZjWLT5xo7lskQkhpRlazOWCGt6GZj8swt4+m\n4nhx+efpK20lHRqblXcANenp835iwzO4d9VD+KhpbUGfQc+oPjd1fI6ElsCyliUIHvkG4Is74W0I\nA5qhQYHVV9zHyjuzz3sgFwGx108Huq+8NU/l7Q3vK+5+C1fc/VaJjrB0HlrwCW55dOWA9eH3tTp0\n5k/Io7E9hsdfqcctj+afXz+ztaSxPYZ1W1rx10Wf9umYyg2ze3gqaXjffvvtuOCCC3DhhRdi9erV\nnudmzJiBiy66CLNnz8bs2bOxe/fuUh5K0Vz+le/hrP1metb2LoRdeUtWda1KKn71PWu61IxmcyVo\n9puKRAU03Ug3m9vN6rIBXegwUnZ4p5/fvKPTudIWcgp1FWMBAElEoY773FMl2iPUN1rN7T3pbrIH\nSTGgjGx0giypm8EXRJV1Avp2q1jSSHpaEIq95Gh3uqLp982cRU3zDFhzDTTsY7N5S6wVN7xzOza0\nbuzbDors/Q2NAFBQZVsMfa0OMy8oMxXy/0vmn2y4dX2w8h6eShbe//rXv7B161bMmzcPt912G267\n7basbebOnYvHHnsMjz32GMaNG1eqQymqI2sPxxn7fb3Xr3Oaza3wliDjgL1roMiS0w/ujDb3m1+Y\nduWdsjPErrytMBO6VZFb64VLio4dzRGzipEMQNYxOjAKFx30LfP5jACt9JnN7YVOr6plfAnYI95t\nysgmZxR2QrdmiDMqrffuW+W9UnseoWNegz20qbezRdW3fYZH1j2BVB/mWe+KuirvjLECqTxzm/e1\ngnz1izfRlmjH3DWPFbS9EAJPLKnHui2lvT2tKuTreaMi6Gu+9HSPfUE5LHX765DHPu/hqWThvWLF\nCpx66qkAgAMOOAAdHR0Ih8M9vGr4UuzR5s7tZVbftyJD2KPN7T5t1aq8k0GkdAML3ramFrUGrNnL\nhcIKbwjF83wkrjkBH1KDOGj0gZ7nbVU+c0BbOJk7vNvi7Xh03ZOQrIuJzCrHvtf8lJFnw4hXQK5u\ncyrUlNXfrYgghC47y6D2VhhmOMk1zX16/b2rHsL7uz/CxwV2Dbh1uirvzJDwDNJzN6FrfWz+tVpy\nNFHYRcb2pghe/WAb7pz3UZ/erzvukfWZF2yl4q4OOyNJrN3ckndbz/H11I3ShzJ6uGXdnlh5G0Lg\nd3//EP9c8flgH0rJ9G6asF5obm7GYYcd5vw+evRoNDU1oaqqynnspptuwvbt23HMMcfg2muvzeov\ndBs1qgKq2rum6p7U1lYXdX/dGdlsNT/azeaKitraavhUGYmUt887FJKBlFlZ+/wqWtpTwCjXJC5W\neAvdrIpGjrA+hxXOmiGchU1GVY/AXnXWrWzW/u3PXREIAl1AzIjmPBfLPnkbK3evQuAIGfH3T0f1\niJBnu+QXZrhVh6ogkgHIwSg0w0BtbTVi7eaAMkX2QST9gJrM+R7vbVuFUcEafHns/p7Ho/EUKoLp\nqk8Zux1GR61nm978/Xyh3v+9U64vPUOSPK/3B9LHJrv6XYMVfmc7dytBT+8dCJj/FDVDK+g4O+Lp\nC7Fi/3/c1pluUQm5Pk8pqT7FeZ9fPLQEja1R3Hftydhv75qsbSOx9EVVVXXQeV2u44y7rrnyfY6a\n1phnm5he+N9tKGgKpy+cC/k8w+Ezh6NJfNrQjk8b2nHJN78y4O8/IP9mSv4Olsz7ZK+88kpMmzYN\nNTU1uOKKK7B48WLMmjUr7+vb2orb91ZbW42mpoEbTdvVZX1BWAEqdKCpqQuyLOWYpMX6YhYyOrvi\nUCUFCddrneZva8BaW6s5ktsO/65Iup9Y0hR0tVnPWzO0NTZ2QpIkdMbMintXuAlf7GyCX/Z5+vI7\nwzFnv1IwjJaWCJpC5nsmkjpefOdTqOOARFQ4rQApI4mmpi7s6jAHOukpCdD8kIKRrPNtCAN3vvMQ\nAOCBGXc4j2/Y2oY7nlyF807e32yokAClphkpyfBML9ubv19LR5dn+22NYUgSMKG2yrPdZ9s7cO/T\nH+Pq849EY0u6RSIWT3le3+EKuIireb2tPepsF0+mq+jujvWtxmVY9Nkbzu/23ydTMqXj3U9247iD\n69DWnv73UKz/j1es24VdLVEctl/61sfWtuiA/DuJu85vo9XPvnFLC6p82Y2Dja576Ztawmiq9uf9\n99zSkm7ty/c5OjLOZVNzz68ZSlpb0/8f9/R5Bvp7sVQiroWEBvrzFPsc5rsQKFmzeV1dHZqb002d\njY2NqK1NV05nn302xowZA1VVMX36dNTX1+fazbDh3ELk6vMGkNHnbQ1YU+1FjmVougG/Yo3Ylg0E\nfIrTbG73ecPwNpvHk5qzTYUagk/2eZ5/7q3NeGfNTqfPOqEncd1bv8b1Cx/CZ9s7nGN2z58uBSOe\npuKuWNJpAZAMn3MsQtagG4bTbA5dgdB8kBQdCc3bdJ7Qc98+ttIaLLVw5RanA1JSNchVfR/5HE15\nJ0/59V/+hRsf/lfWds8s/QyRuIZnlm5yms1HVPg8zea6YeCL3el/nO4+b/e98O6R/d01Xc5bu8Dz\ne0TLfaG6YPnneHThBjz56saSNO3OXfAJFiz/HM0d6XM1UPO25+qXzddk7668e1qOtZAxEpnvM9xW\n4ervx+mIJPHBp03FOZgBMtz+hrmULLynTp2KxYsXAwDWrVuHuro6p8m8q6sLl156KZJJ88t85cqV\nOPDAA0t1KGUhff+vNe847D5vKV1NWuEtK1Z1bshIaQYCavo+74BPTt8CZjWb6xrM6t0K51hCd6rz\nkBqCIiuQhAzJev6fK7bi4X+uzxpwFg5twd3/SPehulcFU0Y1oiWRHhxlGMK5QJCFz6m8JUWDpgkk\nrNHmwlAgUubFR1vcezUaTaXf372wiv1FLqvmY0I3L07kEd5+UPMiQcsK5lw6k7mvhA1hYHc0/cVk\nT6aj6Qa6oklUhXwI+BVPiK1YuxtrXQPFtDyD19yz2el5phAzRPbjLbHcg9B2tZih/vmu0t7j3tKR\n/ruUMrzdrXG5Lm7yjST3hHcPo80LGayV+d7D7YvffQ76MjPh//z9Qzzw/JohNfVvrgmlhpuShfdX\nv/pVHHbYYbjwwgsxZ84c3HTTTXjuueewZMkSVFdXY/r06c5tZKNHj+62yXw4kK0Ba3a/tV15T6yt\nAmA1nctWVW6FNwwFKV0gqKbv8w74FSek7cldNN2AJBSn2Tye0JyAt5cTlaB41wuHQEJPoNJeKxyA\n0FTPVbq78lZrt+PvDQ8imdJR39BuTlpih7er8oaiIaUbSOr2iHgZ0Mzwrm/x3pIW1dKh61772/4y\ntS8OjC6zGVcOeQc8aprAw2sfw8+W3YRIKuoJccMQePHtLc5kOJ3J3IG3cMur+M27v3fudbfvCtB0\nga5oCtUVPvhUb3hv3tHh2UfKM2DNtba5PUJd0pHQcg9Ei2vZrQ/5Rv/b/w/phijpl1OLq0uglMud\nunMkZ3jnCdGwK7x7Or5CgjgrvHvY519eXo///r/lPe63XLg/X19Gntu3Cw6lqX/zXSwPJyXt877u\nuus8vx988MHOzxdffDEuvvjiUr59WXEqbzugrdHml5xxMPbfewdeiSsw7AFp9n+FDE0zYOj23Oe6\n2Wwup8MdsKoj4Qp1pG/NspcTlYXqHW2uaBAQ2K9mknO/t4hXpudgR+4Q+cvL6/Gv9Y046/9NgqRo\nELoC3YCn8tZ1A0nDWr5UV2AkzGOYt+kZHD/hSAStVdJirvDuSHRiZMAcnGR/v8jWoDsRr4DQFUhB\n7/GkdANrms1j//mymwEAPzjsIhwz7ij8a/1uzH97C0LHCEABOhPp4HdXa+/sMCfvWLV7DY6qPdwJ\n70RKRziWwoSxlYgndU94B/2utdxlAy0j/gUlPgJSMIIW3Q/AHHyXTBmAZCB45FuYv6kT3z3sW1nn\nM7P1AzAHreVi37FgGKLHirM/8lXeiZSOrmgSY2sKm1WwJ+4gyZWx+YI3Ek+fn54uYgoZaZ35Pj0F\n/turd1rbGVDk8p/nyhPehkAP89rkNZRaJIbSsfZV+f+fN0yMqQlaP3mbzasr/Pi3KftClc1wkkc2\nIpwKQ7Kq8ZRuIJGy/keUzD5vJ9ytyjulGea93q5wloLm1fLY0Bjzd6E4zeZAetWyoBLElV/5sfmg\nrHtmrAqnIhgd9C668q/1Zn/0xoYOc1CcrkI3BISRWXlbzea6DL3xSxBJM7Cjrv5cd3gv3rrUqdad\nudntixFdhYhXWp/JfZtQdoDtipjHZ37BC2cZ1Q5X5e2e6tReS317s9msbs+E12atJmZW3rInxGLW\nQLTbLjsBoTFtSFR/Dv8Bq+GbsAmrxItOU3hS0yH545D8CWzsyD0NbVw3gzKoBHDKPicCQM570qOp\nmPP31Q0BrciVhbs5tdm1drm7JeGOJ1bh539a4Zl5rj/0HirCfBcoUdd0sz1V3oWFd+ZtgIV98Wd2\nKQgh8PFnzWU3Ha773PYn1IbSLWdsNqeiGVUdwG2XneBUywq8k18okgJJ1RD48ofYEdllzaomIaUZ\nSKYEhCFDkg34fa6QFq5lPo2McA5EASFhTMhscpZyVN4AsGZjJ3738GdQjRAgG051J4RAOBVBlTWR\nS6akpluVt2p++en2RDEaNF044W1oMiBk6O3mYEU7oAHvILKPm9bipS3mGIms6V11FUas0hz17k+/\nRtMMyJL3f2FP8Lmmh+1IdDrv51621J4AJ2m9zl533V6UpLrSD58qw3AtwBK3ngv61Zwz7dmfMakZ\nkHxmOLfEWz2f3WZX3idNnIqJVXtnfwaYs9XNee9ObAq+CkBYK6sV98vJHUSeytsVjvZ88u5m6/5w\nh0GuUMn3GfU83RQ5t+1L5a27jyv//jOPb/naXbj3mdX466INPb7nQHJ/hP5c8w2lanYoXWj0FcN7\nAO01phIHpk6D3joOU+r+n+e5zBHGftkHVTFHmyeSulllywZURXaazYWr2VwYCiRfylkARA5GIeuh\n9LzoIqMyt5qk7XlzzIsD3ak8E3oSmqGhUq1AcrN5n6Q9hzoAJDTdDEddhaYJT5/3X15ej664GZS6\nZq+mZr7WDnXAW3kDwEeNa8xNnT5vb+UNAFIo3XSe0DSnYjyq1jzGlGEHp56ezAaAgMD7uz92nks/\nYU9ba430z2hTHFFhhjeQDji7sgr6FShq9rdh0khCNwxsbwxD8iec998VzZ4C2J7oJqQGnb9VKiPk\nP2hcjY5kJ8LyLsjVbVafd3Er77hrBTXPimnWZ3avsFasL3F3tZ85h735Prk/o2dq2j40my98dytW\nb2rJu02+VfIyZVbe9iDGTdtLv2hOb3gGBvbjNoVi/z9XSkPpQqOvGN4D7IpZU3H1cT/AtMO+1O12\nqqzCp5qVdyJlhndNtYq6kaH0wDO72Vw3zIFhAEJfXQqoCUj+BJRUFYQQ+OPzaxCNCUiygNPsbM8X\n7p5iVU734dn93RVqJTsCVIYAACAASURBVPTmCdA7R8OADsBuEk5Bks3Qjic1T5/3Z9s6sGqTGVS6\nZlXyVmVu94UDQDSjv7cl3obGaHO6/9OpvBWIlNns7p5mNZyMQkDgyNrD8c39Z5rnwqpaw7GU83q9\nrQ5CAG82LEdjW8QTRPY99kKyBt9l3F49wmo2B9Jf1vGEBglAwK84I+LdknoKTy/dhKde/wzwpZug\nd4R3ZW1rV95BNQjVuqVPM7zhvXLXh87PytjtVp934V9OWzq+wJ/XPo5wMpJ3tHE8zxzg9mduaDSv\n8tQJG/HytpcKfu/u9NRsnm+kuztce2w2z9hvNK7h6Tc24Z6nP05v002fd+b+3ecvc8rcZmtA11in\ni6w86D20cBS8nyE09Vyxu5XKEcN7gPl9Cg6eNKrb2eQAwCer8Cmy1WyuQ5FVCDkJQ04BkvWl4fR5\n604VDgDKSPP+eilZiVhCw/ufNmXdCy75zdCwQ1FYfeZ25b0zYgbNCL81QYDzeiu8EXFeH4lrnsob\nSFfYuqZ4Xp9wVd6fN5mVynXH/Bhn7GtOpdueaE9/Qcqu+9l17/EDQJc1rWuVr8IJPrvyjsRS6dHq\nsSroLXthV2wXfvXss3jh7S3pE23tLwnzizfzy7raVXl/vH0zVu/YjFhSRzCgQJYkyEqu8E5imTWo\nya68ge7DO6QE0pV3RrN5S7wN1b4qyEKFXNmRNWBtV2Q33tuZf33zf9TPx6rG1bjx5b/i9sdzbxfP\n009rn4+GRnNMgG/CJqxu/xDhWApPLKnvVxO6O0dyZUrmMqw276IwvWs2D8ey++u7azbPvIBwH1Pm\nc01Wd8OoEYFuj2mgGT3cklfwfoZQNbsn9HkP2Axr1DuqYt5fHEtqSGoGAlAQTnXhXflvgGwu4iKE\nq9lcl5wFFeRqs0lQSoUQtkfmWkHv+9IGpD4/DFLAXrnMHDls6GZzvH070spdq8ztIxMAtDmVM2Qd\nMFTEpU4oMEeoR42Uq/K2mrqtCwwtZVW2OZrN12zdBXUsMMJf5dzSFtPi6S8J2T52NT0JnSss7TnZ\nK32V8Cne4AvHNE+fubbjAKhjd0Ie0YpVG9OTB9n3w8eMMO54/37ElNEA9nKe19ROrK94BsrY/fBk\nwyLz/RJnOyPOJVflbUSrIFeEkdCTqAgoiCU0p88bAHZEssM77qq884V3OBnBmNAoSMlKdIR2Q0fK\nEzi3vncnAKAmMAL71UxCwJ7Uxz4uawBdomIbNm34ctYxAN5mczc7HKMJLf33ADD3pbVYs6kNmiHw\nvZkH5XxtT/L1ecuSBEOIvCuCefq8ezlgzT1ffa73BrxVW+b+3bPmuS/0hBDOQL5yC7nM0eZ9NZQC\nkc3mNGh8soqAT0VXxPyySfc3Cydw7C/7ZMqAcC2bKVeYVZIwZIStLys7PNW6bZBrmiFb4W3Ezfu8\nDatvWlENGMLA6uZPEMIIvPCKNWGIYc+/bo149pnNqCJe4am81boGyKN2Oc3Qeko2mxFzhLd7xLs7\nvJ2Kxj2TnD0Lnavytu9Dr/RVOLPI2f3F4VjK2b/QVAgtPdGNh7WNhhS2djag0f+x5+k2YxeSUgT+\n/dMLm8STOkKBdDcBACTWHwe9dbzzGYP281blXaFU5q68dbvPO2Teiw/vrWIpQ0Ncj6PaV4VqqRaS\nBBiBzpxNyvd/NBd3rLwPQgi8/uE2ayY2gdZ4m3ksqubchZApka/Z3AooTRee127aaf5/YfSjedId\nJO4+b7v1J6nluaDoplk7U+aXeFeOkfLdNptrmeGt53zOfftavhYDIQSefXOTZxbDgeAZbd6Ppu+B\nWqSmGPaE+7wZ3mXi9Emn4OBRBzr3OvtkFUG/4vzDE3L6S8eerGREyAy8aELzNM/KlWZ4G7qcbtbU\nXaOiJQEpEDOraWsCFfteclk2oBk6UkYKWjQEZ35Su9ncqnxl64vciFeiPZxIr3AGwDfhMwgrZFMp\nCaOqA1AlMzyT1rSpumFO8iKEOUNb0BXedsUl5HSfd2azPwBENKvyViucCxk7+CKxFKCmK3e4lk1N\nnwcjPSFOPkp2c3IslUDQnx5dDwBC8zvvsaO1AxV2ePviEJoPtYFx6Eh2eia+AbwD1pZ+YIb79tb0\ngCd7lrsqfyWqYN72JwKdeQcP7Yo2YmP7Jjz+Sj2WvN+AjmSXZzIcuTJ7MFVcS2Bly/KsVeeAdEBp\nugHZdZ99NGVdlAT7vmSonmcglWKHd54Q9FbehQ9YE0KgM9q7ZvPsyts1sM89IY/r4iffRcfnu7rw\nzxVbcftj+bs4SsGd17kGBvbEPb/AUFGs1oZyxvAuE//fAWfgJ0dfZt0iBqiy2WxuS0npL2A7qGsq\nrfCOa5B82TN1CUN2+vjcfeKSrEMKRK0mc8nZFgBk1XACUNcl1768fd72hCkiXoH2cDIdrjCb0u3K\nW+gKVEXGiKDZPG/fLhWOpszPofmh6cKpvONaHAnNACCQ8rUDwgxGu9nefTucHUqVvgrzVjtIzoC4\ncCzlDG4TKX+Oyl3Af9BKz2fPJQVr/vdPj4HeYYYnanYh4LcvqlyD6qxz8MTrG8zKXElBCkZhRKsw\n2mfeKpdZfcdc4b16o1khN7anJ5SxZ56r9lVBMsygFJLebRW0rvlT5+ftHebAwUlV5gBJKZQ9Teyz\nG1/EB13L4Nvn06zn7PBKaYZ5+6HFvmjpzz3NIk+zuT1oMpmnP7uvfd66Yc6alykz4LuvvHM3m7tb\nLqJGFz5sXJ31Pok8XROl1t8Ba3Z4r9ncUvKpeYuluwuw7vx10QZnEp5yx/AuM4p137JPVhH0eatl\nN2HIGGmFdyyhQW8dl7Uvs/K2vmxEOoilQBSSqkEkXTNluSpbu8/VHinuft4OTykQMydesSZnCfpV\nnDzCmkFMNiAk3ZqaVIJPlVFTYb5XJGmGVWs4BikQgxGvQDJleJrNkykdck0z9EA7KhP7mHO4Z1T+\nABC1Ku8qfyUkSYJPVhFJxLHgnS1WeNvN5n4AMoQhpcPfl4AywgxLvWW8s09ZT48U3m+vaucWPpEM\nQsTMufn9B6xG20irerIH1bmqe8jmjGxyVbvZzN01GmN85t9nc8fnnr+RHd6bGiLOHPbuPm+7X7/K\nX+lZtz39hZT9ZRxOpPvZd0fN/v0vjzBnN5QrO7NGnO+KmhPb5Ap2O7xSuuFtclfSa8c/99YmvPbB\ntqzX9iTfaPN05e0Nu2ff3ISXln+OsNTsdH/0ps9b13NX3lpGuOVbqx0AYnmazd2tBLvHvYSH1z6O\n19au97z2b1/8H/wHZy+GUyhDCM/FQ8Gv62cVav89GhrD+M2j72PTjg7c98xqRON9v3ArplxjI/py\nwZJM6Xjzox34y8vre964DDC8y4w9baoqKZ7KO4sho6bKbPKOJjSkPj8MX459A0YsPamK0CWn8naW\nEQUgWc3u9qxn9v4As9nZvlXJXXlnjVZXXCPMAVQEVewd2MfaRoMOzemHVhUZFX4zFCNWsOzobIYk\nCYh4BZKajpCSEd6VZr9gZXQ/81hzNJt3psyFEsYEzYrYp/iwszWM55dtMf/B+lyVN2BeaNjH75rn\nXa4I43jpAnNbpB8/cOJIp5lbaH7rIsDU5WswH5fTg+Ls1gF17834rKUBcrV5cWCER2Kczzw3G9q8\nM63t6mqF0GX88dkNCPnM/bv7vO1b9qp9Va7WAyNdfarZYbR5d5vzc7u1GEyNOhpGIgg51JX1ZaZI\n9oWZAXWfDVDqvjB/l1zN5prhad2RrM8djafw0vKt+PuSeuyM7Ma8T5/POV97Lkae+7ztqYTXbmnF\nR5+lBxf+c8VWvLBuGT6vfhnq3uY8+T32eXtmFzN6rLx1XXQ72txTeWu5K2+7p+mJ1z51LpQMYaAj\n1QZlRO5FZwpx3zOr8V93vdVta4emG57lMM337t993plTwP7+yVX46LNmvPnR9l7vq9iefXMTfnTn\nm9jZ4p06ubtBh24bt7Wjrcv8/zVfyLd0xEs6HXFfMbzLjF15GxCe8E6PJbcYCmoqrfCOpwChoEau\n9TRf64bkDFjzfPFat4l5mrqtn9uqV6fvv3Y1J2eFp6x7Xl8RUOFTFXMOckWH4QlvCRX/P3vfGW9H\nVa/9TN/19H5OzknvIR0SEjpEulIFiShYLyI2BEQR9PpD5aJX5d5XQbHAtYAIypULWABpIXRIg5De\nc0pO3XXKej+sMmv2npOQkJAE5vlAOHvKXrNm9jzr356/wfTMWax0xyDt5EUKSdiOhxjTYM+5eRSY\nJjgdgxEYq+w2H3D7YGkmKkxqERuqESB33eR9z/k51OD4GZztI2F5FehIjxDu/mRMxylzRvgxascA\nsf34rgGLndIG8VhnODZGNZaFPvlpkZvgZSphKnG0pVqwrm+DiGl7xENPoQskT5vT8LwDuXXqoBTz\nlhdQPO6rsAXKmMqRmBc/EwCwTYqZ9xeY7CuJg2QroJhF9GT7AxKnQo42MQijeQN0Rt5xU4fteMg5\nebylPwa10idSYXnnfCL58Su348mtS/D0tucwHAghIg8j2DDD30f+/Cf3Bd3PWgO18DU2Fsfx8Pra\nnmFL1uRzOR4JlXYtzXrfXZ33cAlroXFuhYhEtqCG/b4RAReWkRvHlOJbv34Bn//RUwGyGS488Xah\nlogfcC/Du9Uudnd4aMlGAMCK9cFFUVAlL/yaO3uz+O7/vIyb734RQDjJ9/Tn8dWfPov/vPe1sm0H\nGxF5H2Lgcp8e8QJu85OSl+CyKR8RWeeEqEjFDWiqItxXhq4G4rfE8RPWvLxvkYsabznWy+uwY9vx\n5zUPBT4D4Mufqi4AAqjB2vKERevS4eo0EU3xyds0NCTMIHl35ujLl1reXiDmXXRcEVsnpCRhTvXd\nxUNuH+pitejpz+P7v30Zjh20qDXTptYwczcTT2rqwv51drbD3dXC+qYbgOohldBx2xePRW1lDBk7\nA0MxAaIGLG9DYeSt+AI1gfkC/GQ3x4DjemhPt8EhDr551xMAgK5cD4jiwcumEDM1DGR4jbwjXLfC\n8jZTUtzft7x5XH989RjUaC1iO8cga4X6qwfXw8vSmv2/L1+BL972NJ5bSePv/YX+wHGinaylwXE9\nPLVlCfr1jZClCbjl3ZcpAIoL64h/iYXGa10rUIqubA8Gi0P4zSNv4KofP4Wt3ZlhNbdLX7YD2SJW\n71oLrWETVOba91gI47W1PfjRH1/Dd38d7o4W51IdvLTzFQxk/UUst4qD3+1hlf009DYa/y+zvAsS\nebthbnP/M0X1hFUnt9flHqF9xe6M561d9HmRPQHDLYzeLrRS5aJDEKW6GYFF2zBW86ad9J70MC3/\nsORH3tt+1cbesm0HGxF5H2KQyduSyLs53YA5jTNgKiwm66mIWzoqkqZI7DE0FfUVPkm7riLI29ky\nDsUNkwFIwiEy2Uj//0bvWwDoAsHfTv/fHLNMxHJlyzwRM6DrKrW8VQdEcaEpPB6uIcXc5isHX8fD\n6/+B3iJzKRcSKNgutnXmoCoqNnX3omh7Qq5UfAdhMWtOiEYBLhxk+k1c87MleHNzHwaGXMhtTxW9\n6LvMAboA4W5zVodOHJal7hIYGvMU6P6POGNnYaksN0Amb7Yw8VAU4QNSQt6KXqTKdVDgekR0U4Pq\nghCC7Sx5jeTSKBRdZLJ+jTx/6QvL20j6fd+lmDe3vFNGCiopDy2IVquOKch7xY4NAIA7HlwJQgj6\nCiWlSxqXf6WWt0tCrErNRUXSxMBQEUosCzVGX3KqomJd/4aApekRD//x4m345Yrf4cnXaDLQ+m0D\nw8a8S8l73dYB/PjV22GOXClkfVHiiXpdcq/L4C9xo/0N3Lf+TxhM+fFM/rIujXFvJstgtKwXf3O8\n2rUcAwU/L6DUba5YGcSP/Jv/5ap/H4ekKgO5MmRf8HZ6cpcuSDj25DbfuGMA/3X/soDrfTjyPpRy\nuEuH+HZi3p0lLU7DKjhMQyv77FBBRN6HGHi3MZd4Abd5OkGJQybvmKkFpBh1XUVrbYX4m3gaeofY\nKp9ocDtHiAYn/Bz+viGPQpjbHIA1eSn9n5KYNz1GBzQXRHVE85WYqSNp+eP86/q/+brmtonnV+7E\nt3/zIlxbw2CBveTYGF23xDvAPuelal2d0o+LaDSBTKUdxVy1ECBcriInn58vWlzPg6XSfXVDJu8M\nYiodu6gVB+ApLC9AsUXSXqAcDzSWbjHCdlxPiKcomgvXI9gyRInMy6XYi1AR96efuXeHbE7eKSGB\nC9XzrT5meadNSu6EoKScboiGDYgKkqXPhpyYtq2/F45EzqpnsnI6D6ahwnY9IfIiQzc8NFbHA+1n\nTxt5MmY3TAfgl8ABtAFNxsliTd86keCn6wrk05JhyAYA1m4rr4tWNV8NcHcQOvkshGEnt4ltnHxl\nK01uHSvvs7p3LX6+7C68YD8ItXon9Kb1cFwPHgsDFG0XetPGkkF6tIwS/n0EEBDuKRsv8fZIzm8n\nbC1n4e+N5X3lfzyOl1d34bkV5Tr85eMoP9ef1/zfbtX+9hV7Gnep5e0GLO/wY3mf8mSMl5mW73co\nl5lF5H2IgVvepJS848wFzcmbqIgZQfI2NEVYhAAATxUPKIXix39RalmHrDBD3OoyFATd5lTpTYei\nuVAU1gwFQNzUUBEP9oC2XZ6lreI1Fssjju5b1oxcbVt+80iWM3P9J9W0fz2uCkUliM/5BxQzD6J4\nAVc37bxGaDy9pDOb6xKYjFw1g373uv4NsD0HMS3BxufPnY0C8k4BLmxhvYfNkaHQc7oegcoFDVUX\nRdtFX44nDkpa2GyBMsAWXZt6ekA8BU7RD4koiivkTLmLO2kkqTEqhwYA5NwMVI/OPfdCFIlv+e0c\npLHCZHYU8svno4KwzHvdEfK8A5LL18vTcyUTCpK8xpuPQU+IOZRlcLn17xFPJPFpqor/emCZf97d\nSHgOZotlOR+q7ore67sDf4nzygoS8/MBuFXtegSKlYU17Um83hOMsfMXf3+BHpdVemGNewVG+5so\n2g4efHo9rvrxU1i5cRfUip7AsUrA8vYTqoazvPNOAf/+3K349crf7/aa9qTbrVZ24U3mPQPefsxb\nnvdk3F+YD7eYKP246Nr4+6YncNeqe3Y7vr2F63n42h1LcO9j4W11AZQ6YgJW9HCaCNt76LuxtoL+\n/sLc6283Uc0jBKs29r6jxi97i4i8DzHwhDW3JObNLW9D5a5XD5apo7bSJ0VdV8vIuxQyAQXIhoQ8\nCnsgd/m7EjFK3rL1yWO0MVNDMhaU7LSJLb6DJxEpngHD8jCiIeWXAknhQSK3PWX/5vNAR2MaJ8xs\nDYxXYdnqPMnMMjV/PjSnrDOb43rCbc47hf34lTsAUPUzOugY7E1UCtT2CljeQ12w3mBN4FwyTGbN\nP7NsOx54YpMYe9HxROa9fJ9UaIDioj9TxKadg+jNDQKOibVbB0RCG1RPlCxx8t64Nc+6z0neBcVD\nkRRgEDZ+fq3En1SejY5CEqZdA5PF8hXNgc403Tlx0fmk280YEd4WbnnHtLjwLshKev1539LnBNfd\nnwsmj9VtwD82/QuATzAXnzyOnstxxaLW3jyOnch5W+QtkvGYkp6iEtF5T7a89Za1UONZPLL1kcDx\nolQupMd63inikefpPX3hra1Q48GMZyhyzFsi72Es78c2P4nOXDde3PkqVvS8OSxp7l6m1IM14SX8\nz9q7sXGAVkS83WzzjTv9+/R2Er5K8wGyUmfEMG/NviKTc9DVl8fGnYPIOTlsGiwvS1R3Y3kPN/4u\nFs/mW12XAEYe8SMfwWObnsTTr2/H7//xVuixpXjxjU6ahf9WePjmQCAi70MMgZh3wG1OicVQfOvN\nKnWba6qwfADfsm6o9gl+WPIOURIjw7jNOUbUVfqHqwomtlcFysc42cRMXciJcnDxEz6GuKVjYmsD\nHGLjcxeNRnUFHWdBTiJ2Zbc3V3BTkU4YaKyOB65HJOUxy7syYYpriM96TKjQnbuQkoHjEphsMdLX\n9Dh6cr2iZGt2zZHivM6OUXD7a2ATGy/upPrvXBa11G0OACZbbK3fPuhnzGsOirYrVMrkudVVHVA9\nPL9qJ2761QtQDBq394h0P1TXLxdi5L1lexH5okv34d4JVmGQGeT3UQFxNbjwJ7UvT4nZKRiIW5oY\nLzQbpk7H1S/FefkCSTdcn7wNej5L8cm74PrWZU/Od3trjLxL1dOMjlV4YM1DcDwXhAAT26tw1GRa\nG19winCJC7evDs72MaKigTeMCUPfUAGPv7JVkJDcjc4cuRJqRXfA8pZDQNLFis5hgYQzBqphH1zA\naH3t0gLDldzmEnlb4eQtJ/r9v9fuxLLulaH7DVceV3RtaHV+WODxzc8AKPdqEELws9d/jf9dG1yo\n9PTnWRc8EiDm4cgvb7t4+LmNWL2ZlmxmbT+GvCvfF3rMvoCX5xUdF//96p34/gs/KRM7KvXWBGr3\nh1ns8NACfw4c1xNVDH9a81f86onnsKnTv++7s8K3sERBfr/fDUTkfYhBVcOzzbmVoTGZUUV1ETM0\n1ErkPba1UsiE0pPQ40c2paXPJHeYbJmXan5Lx7O9yzaPb6kVC4yBrI3KlIXjjmgX23lTkpipiZcc\nh6uxlxnLJm+pTeDoFkqSd6+6V4xHdpvLMWthgbsa0gkDDdWJwPWUlsNVJM3A9ei1NN5cnaIuccfz\nhCeBqDbuXf0AvcaqMWhPjQheuEv3W9e3EaYSA8nR+f33y+eXzRG3vAH4Cxtmeedt+sKvTib8/TUD\niurhjU19gOJC0VwQx4TrefA8iJg4J29uUaowaRmT7DYXjVmkBZur0wx5Bp4QZ+cNxC1dkLeiOeLe\n9hcly7tAnzdV84TbnBOXqcZD3ea9WXo8IUy6Vy+KEkYK/9nryXK3ugKTkXPe4wsxQ1wDNCcQ81ZT\nvbjnzT8LBb8f3PMq7n70TSxdxWK3hpSAVbMT1sQXBQm6HvGrGmRIuQWDdjl5F11byMNyD4ilxv3K\nDtUT3gWZvLmGQSl6MoMwSAIN8ToAwxPgcKpyj21+MqDBz/NKZPL9xV9XYe2urVjWvRKPbHxMfH7/\nk2tx9zPPIT7zceitawLqdjIxquke4bnY3p3BH59Yi+/9lraslWV4d2a7Qse4L8ixDP9C0cP6Aerp\nKG3y44sJ2VjbtwHLi0/AGE1DII7r4anXt+FP/1obOIaHRGQJYPkdEZu6BErCf/YzuxGl6WFW/HCS\nvgcCEXkfYjihbSEA4JSO42GZ5daAcFUzy7u1jr4oJnVUY1RzhbAeAQh3LHe5A74rm26XasI7R8Dp\nbIPT2eZvl9zQJJeCN1SJRNHvuGVqJs4/kVoZU0ZS17HIqAZQLPjkHbf0gIAMjAIjW7pPU20Ccxpn\nYFrdJKzr34AhbTs7h3TxcsyaK615GtIJk1qBsopcSUb9rPH1qEun/HMxxboYUzVzXSL01wFfxtXQ\nDJhG8GfCSSTjZGEp/uKptT6FUpiaLITj66sXbQ95FhNorvGTDGO6IVnOvshMNu/QlzBbwMiWNyGA\n6rG+6l65d0K27ImniVp2wE+kKuboPbKYWA50h1U7EAzZQ1DsONA5GvYW1pVMc0SiD0+aMxCDxa5X\nJm/umvcGqJiOVrsNg3JrTql0atsQJVtVVYVlXfAYKTAvCl3EObAMSU9/1HI8ufVZPLLhn3A9V5RM\n9bA2nUqImI3sNlfCyrcUV7yMB/dgeXMPSEz1PUCK6olqD368O1gFNZ4RLm0ZWTuHQk7DB0efAcDv\nA5ArOPjt31eL/Rw3PKntjV1BFy8PXcge7KGcjd89/1TZsX99diNyBiVEo3VtoFc5J38lNgRr0guw\nJtM6/lK1uqyUUf+/ax8WvyEZL+54Bc9L/elLUXSL5fr/kuXN8asVv8OPX75dhMe4Vfzwhn/ihy//\nP2zxVkKv2waoDhzPw6/+7w08tGRjwAshpH+55e2RMg+kKpP3btrfdrPnbLgGPwcCEXkfYphcOwE/\nOf67mNVwBCyj/PYYnGBUDzFTQ1XKwq1XHI0vf5hm+fK4LQBBvgH3ouwelC1vosHeMBVeVs5WD24v\nrJyPKUnfhWxqBj588nh89zPzMGMctRZiElnlmfEbs6jbvLBsIYobJ4rtinR+njRycvvx9OsUjyXE\nlMfdrSnPCsubeNTytgzNT3aDH1fk1xC3NMwd7y88eDcxUzOggCa1aFKHXMI8Dbyvugw5NGCqscC2\ns0afiqq+2eLvQHvOgHyqi6JXBCEKWup8z0jcNH0vCCccx0Qmb9MsbE+lMe+Cr3QH1wAhCLjNdU3y\nTngl91+aJy7/6hQMxE0NMS3OzmvT5iu6DZe4UPKViO+a5hOo6kiWN53LJ1/y+8bLMW+e8ObsGAli\nGzDa30Rfwbcqq6sly3Dlb2BNfRquPgRNVaEqCoqk3PImqitCSfLcPrrxMVz1xNegN9FSL9cjwoPh\nZYOLq0DCWgi5Q/WEBSqTN/GYfKtr+6ED9jwljARkHf3BnA3Xc7FhYBNMLyUWMLe8eBs2D/ou7oJt\nU8liR0c2xyxCRn6PLN0kyc8SbM9tx5cfvwlPrH8hMNwEy81wulqhQRM6/4E4t+KhR6WJX2rp619a\n5PXYdBHVP1TwNQXYgpiXBZaSmWx5bx7ahqU7yrPOf7Xy9/jNyj+Iv4u2G9B8v/Wl/8Y1T92EXfle\n/HrFH7Az0yme9VIZ1NV9a6HX00UQJ+A1fesD+6iJwYDbnH+XJ2nYc0+G63riPnLIev6lynUyIvKO\nAADQVPYjUspdedzyVlRPuNJrKmJCwjBgeTOrNpDMEaKqxlGZNANxW0teCDA0VfrkbmoGFEVBY7Xv\n9pXJm7+EYqbGXKBKILNazlavTtPjWpK+znhpkhx3hauJId8t7lLL2zTUACnpsWLgHKauBRc2jNhM\nzYSmqXC8oOXNyVtX9fLYqpQ3YKlWYNOpI09ERW6cv13zr5d7PbTa7XijawOKjg14KtobZcvbpN4F\nkDLL2/OISNrjpMFnvQAAIABJREFULwlFt0EcAx4jb3gaFIVlC3P3eWkSIRfaAc1GB2huQNzS/fun\nOYiZmp87UIhTS5yoILaBIjJSzLsI4mpYsqwbdz9MXZM524/r8nixl6mE09kORSEYcCh5X/Ghqaiu\nCU6vmhhCwaRuV8NQy8ibXoODdFJ+PoOWqNEuNVlhiwsu7AJQFz63vF3J8k5qKcQd2kRGUT3YbJ4H\n7SHEtBgaN58HZ/toAIBDbL8Gmn1Hykj4vyvFQ6Ho4q3e9cg5eaSdVnj9dWIMXUO+KtiWHuZKdw0M\nZei4RJMdRhp6y1rE5vwdD+/6LYrI4c8rngxc85Cdpde1fip01RALKNntrbe+BcegnhAufyyseOn3\ns9T5E17dvAFf+q9nxCLHigWJqbQ3Oq9l93rpb3ht34bAdtlb4HgObNfG13/+HP7th/8Sn29l5ZM3\nPPtdvLDzZfxr6xLkmOVdCHNJMw8cH2NNrDqwWUkMBkrF1vZuxuceuwbLu94Qn8ltb6EHr4mrJAI0\ncY4QgnvefACvdvnhCdvx0McSE4frQX8gEJH3IYzm2gROmNWKL15whPgskE0eAqOEcDVVCcgbkuEs\nbwC1lbEAoafiQWICgNYaP0lNjudyWJLb3M821/06TEk0hYu4AEBVih4XsFRLM+Cl97OaYGVWsuWt\ny+QddJsbuio0vGWYqgFNU6h2t7SY4Mk3pmqUkbec9CeTsxibNN+xEMtbjWXxt/7f04Q1TxPudtNQ\nYeq+Z0V0RXNMZPIOtSCY5e1fqA04BmzH893mAGJxlLnNP3bqBEHufFvOy9JnytMRs3RhvampPjqn\njLy9giU8EKQYR8YblFzGRX9O2Hf15XwrLONkaEzZMcR+3EqLWRo8zd83zix/T6X3z9BUOGD3UnwH\n/d5kXAqTlHTVcwerpG1sIWeb8DIVYpz8he95HqAX4RXi+FjHlTAddqzqsg531PJOm0nYDoSHpugV\nxQKAex8qrKT4XXHPx7Iu6vKOF5vhDVXD3joGAPDcm742+NZeupghjo6BQWZpMsubX6Wa3hUoAyxk\ng7+/jJ1hc6RAgyFCF778bT/05vVAMQEvkxZiQaVqfRxLNvtlc+NHVOH8U/zcDzW1C/IP8ub/eQld\ng9TFbO8YgYQex/qSJjy25xPj7ct+gy8/eQN2ubQpztbuTKjVammmkKQNI0au9Oc4dCxyxjtA3d6y\nbsCDq/8BAPjjW38Wn/FjHdcLvEMAQElI5J230Vvow5Nbl+Dny+4Sn+8azIuZiCzvCACoxfzRRRNw\nxBh/tT4hdQTcwSoU3pwdekwpucdMLaiQJJM3CZKZkDdlkBOpONpqq6T9yxcSlaZvRfKXWMyULT//\n/LpE3tzy1lTNJ9mSxUVx3RHwWMIUb0nKY96moQlXOOCX9nDi0jU1IBwiX4OuKtjUOYTfPbpOfM7L\no/RQ8vYXKLy0SoYWIG95MVOSw6C6UIkuLNiKhCnlNDjCCiCOQd3mhJM3LwVzoai0tj5XcFDgbnMA\nlim7zTUoACxDCyTNAUCB5JDQaC5CwtIRN+j86rU70KmsgWKypKd8TMwDKcThEgdEp5nJil70xXDY\ngi3LuscRQtDv7PLbz7Lvz7t0e6ezCTuTVNr0kxMvw1ktF9Ahs/71pqHCUYKVA2JRKB5PAhhFjEx3\n4OjmuXRqpC588iKosHIezGKNyDsAaLKiYhQB2/QXSACgUMvbIx6G7AzSZgq27Sc2Op4jkTf9jsp4\nWlLCo9v6cixbv0jnmeTpwAekBc6OPhZGcA08vIS6yGWyo99B5X7zrx1L50hx8M+XttA+5ZkiuocG\nxBzpii5i5tzw1iq7oShAYeN4EMeEogDZYkHEkkvj/tttP8FLU5WA0Iw1+XmYE18AJ/A1W/rx6rrt\nbJ4NjKrsQHd+FwaKfqWCHMte2fMmrftnv+MbfrEUP/uLb81yFN2i0DRwPSJCFv7AWNWJ68sJK0RF\n7oVF9JqsbMBtvnEbnfNdhV6YE16A3voWc6F71I3O3iFpvQLENpnbnC0M8g5eWeu3C/WIhy2dQ3h+\nVac/3ihhLcJwiGtxFFfNg9dfH7q9lLwtUyuxvOWENXr7501uxPc/O5+2/pMs7/GtJf5MMMuCn1sr\nt7wbEv5Cg1tIPGv5e5+Zh7PmjQ0dK7e8AYiM5dIMYJJP4SOTzg1+oWR5u92tZePxX8R+9q0MQzWk\nemH/+3gs2ND0snri0fX+NRoh5C3PtxJI6C8JA6gudMUQZXQVSRNtKRqX16q6AuSbZZa3r89OAuSe\nLTjCbQ4Ahkl8kvc0aBpLAJOS5gACm+TAeSNmakjr/uIrhz5R1uQWLDEPXoH1ZlcGac9yzRPhEL5Y\n4q1fu3O7YKMgLF5ueXsKJYo3B/3yqE1bbdz10Dq2ncdXM7Br3gKgCNLjC4BYjL2U9SIUBUjoSVw0\n4TwoTgyKbqOphu4vXP/FGEBUmAodP7f+O6uepIsgT4Preb4YDot5d2W74REP9fE62K4nFp02sSWl\nO2Z5mwkpt4Fu4yEEl1Vf8DnK2QUUbRe/fGgVVm1hjXpcXWyX8wb4dRLHFGI7ikoT2VZv7sMdf10B\nWymI+VWhS25ztsBgdegkmxZz2DkwhKLtQW9eC60mqKrW7/o1y6qqBKRhAdAOaZK1nnPZ78s10Jyg\nZX49OT80kA35/cmu+tfWlau6FdyicJtD8VhIyQfPc+GLqEwxA89mioKuCkVzgqV10m9Qq+wRTXgc\nhzDLm97Hj7Z/Bl6mIuClsl0Pv3/CL9+7c/n/4DtLbsMDT/niMZHlHWFYqHu4Y2aJNWwZWlD3N1Aq\nRh/kptoE6qvi0FQ1QO6TO2rF/08fU4svXzjdj8ejNL5OURvzCZ94Kl08MJd5Q3UCR030CVa4iAGk\nErIrmrfwLL/YMXV+TJx386pImNA1Bc7WsdQqIeUxfuIRVFoVpaeDqRmi5EgJqXU3VCMgvfjZD07B\n5R+YIf7Ww8hb2t+TpEcntNUFd9RtqIqOuGR5z2+eCxAFesNmP6Pe1ZHJ2zR2yaw6a+ozfptXx0Au\nHyTvbGIjI2h6nzVNoeTLXtqJlEvdpooHz6afxS0dST2Jwlv0+lzFFpa3V4gL0RauVpbxBkUSk8hl\nYN+/dscu/Owvy/HEm5ScSYaFW3jZGrdwTN+789yrg4J8XOY2R7IXUF2MVuaCFBOB73hsgJYUcosx\nriawoycLt6hDt1xRiREgb0BkxOftIjziIR9jFmM+IfIKAFoOt2pjL/64lNbzt6aaqQyqwlu32uVu\n81hSkD/XyOdVBbativtJPy/gsZe34ull27F5Fy2Rq02m/aQ/j7vNFYgcCFsqeWT3dzBrY8122mKX\ne0Dyeep2J4SAe43VWAbEU0AKcXGN37l7KdZvH4AxgvUzcHTkXliEpNMIBzZ4GZ+mKgErmkP+zdhM\nuY84hig5zEreroxdImID3+1N57A8abDgFkTCGkqSyVJGUhCrIyzvrK+q6OqA7gT7juvhSWe268F1\nCfNuqMgXJE8Zu8ai7QbG8GrXcmjpXvEbEfu8S4jI+z2GcLe5VPIVEvMOWJYSudek/Bfr/KlNmDra\nJ3MAAUEYDpnc4WkBlzk9p2+5VyV88RiZ8MQChBGVSGarS6IuLnkDPA26ptDEKkUBoIAUEtAhuarZ\nS8ojBIs6TkBV/wwa72OQyZk37pARqJsHVZKrkhYBCb085i27zT2pfj5VojKnKABcFQ3VCUwbXYu5\nExtQHauC5VZBiQ8FMuppqZhfh6omhkQmLHENDOVtFGzfbd6XXI7p09l99VToqsK6zrH5GPMMVNZb\nmj8TBduFoijwhmhoxEYOipmHQhTAlmLezPLut/tgxIPkzc/Vn83h+VWd+PsKSt6lljePLTpMMCb/\n+kLs7CmIuHavthHfff5HUHU6B5br51qIlynJQzFzIt5tKQlk8g6Ia8BVCjBYtUYpefNcjbybD1iD\nzrYxrByPC9FQ8n19K81gbk01U8ubu80JJe+EpQuXdtKK+d4PI0jeXPdAdPDzCsiKen3672lzx4rf\nnS2XWqksROKYoGI7qng+sgUnQJwAkMl6ICBwPMePeceyIIUEAFXyDri49wnfclR0ByAqFM8MzLWm\nKqHlcmD3UYkPQq3sogtqT4NC6Dhkb1fGLre8lVjWJz+jnFjzbkGUivE5cnc14gOpy2CqlvjMcT04\nnoO8mxeqisTVoaiOmGN6fSXfwd3ujkcXAJoNuAZts+zySgJ/n7LjpTkChkmqO0CIyPsww5566IZa\n3oGYd3mdNycbz/MCCWvyQqBUfrB0eygUglhJrbpsrVt6+PG8QQh/iR49tQkfP20irr5oBkzNpCtu\nNv50wixrSiCTN38RVqUs6KqO6sKkQHtUUzVEOdCYmjZ888jrMK5q9LDXaGhqoJZ9XHMdPnDkCNx0\n2Vz/slUFhVVzoearsKDZF27hgh4yKhMJ6JqKL104HfOnUq+CiQR9YRh+0h0tFSOB8h7enAWOgf6h\nYHY9APSxzm10kaMGLG8A0OtYwhT7bOG0ZurZYQRAyTEPnSQAKNA1BQumNWFEFQ3Z7Mr3wUqweHKJ\n5a0YBVimImWrJ6Brip/YptlQFF+qlQghGVVoxW8Z2ibmwHN8qV23zw8ZKUZRWN4WErSch32HbrJY\nrsVkMJnHIGXRf9/Y0oVfPkL7NDudbSDFOAtNcPJmI0pQi7Ml2URj3sxtTsnbRdzSka70UBmjrV1L\nyZ+7r4u8RxD7DRb1XRhwugHFhcZEgyqsBGrScRAiuc0VOW4vJe0x0ti4Y1C4r0lJ7kHRsyl560W6\nwGChB+Fh01ykkiE04AYXWSqzvHUvjuKGSbC3jaLbmSWqN9MFDsnR3vSKS8chk3e2pH4bAPSGLYjN\noNnmhuWTYFKpggIFBafot2FlY/EKcShOnC7CJLc5j6kHLG/NoUTMUGrdK6oHKJS4HZewcj0ahvIX\nOPQ7MnlbkLfT3QJnRzubA7o9bmmR5R1heBT3QN56iaVoGcGENeLJ2xXpv1wmskSqkyGsLWCY5Q0A\nY6voD5vYFsa0BF3VMtEamoFvf+JI3HrF0SXnpeTI5V1jpoZjp7eIuHiVVcmuRRMNW2QYEnlfcfZ0\nfP7caRjTSo/RNZW9YPh1aSJO1VSTQGOqBhWmbJkH57M0/p00E/jwiePQ3ugfo6kKvMFaJDedgOqY\n/7nc7IGjpabc2rcU+oJVOem4GnIFF7miCy3mZ1VziyWQxyBZ+l051vCFuc0NPRgWEYlVnobPnD0F\nNRUxen+IBuJqKHg5KLotXsS6ruITZ0zGNefT+9WT3wU9zsnbEucCAK1yF5qmv+W77l0dLbVJsVDQ\n67Yj1rLJF3MJkZYFAM9gCnBF+twkYjq83iZUDbIKDL0o9NKTajWyeceP+zJLTjGZNeZpuOXf5iPN\nyHv11h68vtFPsgJo8ppX4vZWrBwsNYZ7/rYJBP4C1CU0YU2zCsg4Q+iobGGeJhWEKFA1Rt6eDVM1\nYLOsZrHAqejBC7gPWsNmaJXsGow4aiuo0EvOLhey4fFuIkkFr9vZi9gUKpzCFy5y3NzziEgMEwtX\nISTjoChJ2Y6qYhnlJeENVSXoLw4grqThdnb4izUtaBUX3jgyMA5ZMjUjZYIHcmMAQPEEeRc3TMIM\n71xYmomiWxAxb1GD7eooFF0YqinKHh2X+Cp2IrFRh6J5yBSkeWTkrQ42wB1gZWWqC9vxqKdDs0Fc\ng3lwuOVNv38wa4v5cLvaUB1jybvsGY9behTzjjA89mR5lwovWKaGie30ITthVmvoS5KngHgEQQlR\nibiUEPIu7fTEccX0T6Bi2wkgmUqcefTIYcdqqDra6lOoqYiVfS6j1Hr33dYkKNTBj5dUz2pSScwc\n71tqhqYG6n0BX7iBZ30nDD9cUGZ5l2Sex8JKxbgbnhDELR1nzO/AvMmNWDituWxfSyuPmXPyVrhl\nzd2sRRfElPpCM3KXQx2Vdf6LWGQrexp0VaVubzlhx2KuVtfXntfZfSaOgSFniMqzshc5d5vH9TgS\nehy7cr2iJI+/zD+4YIw4f6eyRri947qJz35oqug0BwBoXYmCW4ACJZDMJ5frOBq1evM5Rt5snKpD\nCXj2tCTMup3wCnGkvUYqYcleuq/iL4DqUPJm46urjPueE83P6OcvfM8jovWqxvu6aw5yWQVLWJtM\ngy1aHTh0MZ2gNdod6XamSgfAU6FoLJud2DA1U2RNBxfQ/iKNz21N2gI8Ddtz23HHsrsgMvqlccLT\nxMKoM+tnO8ulcIBP3rwlqli4Sm5z3oa3Wm3CN46/KuCh4ffC1bLwiIdxDS04YVYrxrXUse22P5eA\nOG57Fx1vgLyZZXzVjE/jqhmfDswBvRd8gRLDUMaFpZk0YU3EvNn8cfJWTJFQ5rgeduWpp8kX86H/\nDhX8MSi6DS+TRmbVLH8Bwo7f5qyHogDeUGXAbS5yC3K2mA/iGJg7voWek2kimHpkeUfYDfbU67fU\nhZyMGWitT+G2Lx6DxaeMLy9XAkRrPyrmIFnGEonK33v6yJNRH68NxH5lWJqJq886ATd8bI7I+JXB\nm6+UeglKt/NyH8sILjgqmeWt6DbSyZBac0XOXA9+h6YpActbBpf7TOp+LH5PlndY9yQeo+eqcecd\nNwafPnsKmmuTqDQqA/uGldvFWekWfzEeN82vr1WkF7/O483SgqxdnVZ2PuL6lrdcIy7I3/W158e3\nV2H+lEY0pqtEaRBPaNOlhUtNrBo9+V7U1zOyZy/CU+YEdeCJ4oB4KmaOa0RTTQLfuuyowPa8W4Cl\nmaiuKF8EAUBRoyV7OUbeosENK9dzk53wFAfurkbs7M0hm7eF29RGHlrtdroAkcSBYixPQdHcMnf0\n8nW70DfAXMUa70jmBBZIwuOkuFTVLk5Jo6OizV9oen5M2iU2DCk8M5yXAQDqE3WoqYgJ1/1rXcuR\nJf2wJlBJUd/y1oU7l3sv7O0j4Q0wi5aR8zPbn4dLiBAb4QtX/syYY19FhtDx1+ktSFspGLoKj7e5\nZZamrdHjm5J1+OiiCWirpgaBYmVhjnsZWsUudk56n555lWaqb+rpFdfG3ea5IR1JI/heUHQbnsEs\nZ9tAf6YIS7NQCIl5wzGQLzq+qJLqoug6uH/NX+k1MhU7fo0i1q7QOm6xuJcaBdmOh60OFW5xu1tD\nLe+hrB2o8EjH6Hvig8eOwHc/PQ+WoUUx7wjDY97kJswaX4+vLZ4Vup1nVNdZ9Zg2uhZnL6Qu7GSM\nJWaFkTezvUvbBcqiJvKmM0Yvwk3zrw0mp5WgtjKGUc3h5M7JebiYOd8u9MdLkt7SJn0BKbqDdLyc\nvGXLu1RIxtBUv+SoBJwYZMtbLyFXTmAfnXQhxlaNwsiKkqYlAM6Y34HT5rXjU2dNLtv2hWlXBWr0\nwzL2E5rk1lc0jGzyCb+m+xhfI54n+kj3tEFvxw+O/ffgCT1VinlL99RgbnfPz3jXVBWfOmsK6lL+\nvXOKLAFLWrjUxqphezZ67R7qvuS92y3//GkjhYq0CsXTce6xNI9A04KLy135PliaiY+dOhFnzO8o\nmwvCwgCdPVRHnN8jTmI7MszqtC1s685Qy1tWA+WhBdvCSbOobn/CYG1Nx7wu+otzwn9pdZcIJ2ga\nK8nTnMACSSgPcnI26QJjRLoVlsmS+jwNHmhfe9uz0dMnJToNoxxYvXURLM2kSoeyVCk2+vMhW95C\n598RcyD2Y8f/c9OT6NPXQIlTD0ap5a1oHvQxNJuee5Fk8halWCo9vi5OibE6Sc9jtKyHVs3ugfQc\ncm8N11bo7suhM0sJ/Sf3vFn221fTPSC1G0CKFrxsBdZvH0BPn4OcUxAiLVzbgdgx5G0XGgwxxgIZ\nQme2G9PrpooWvdzyzhTZ74Qt1OIaj/v7mgeO6yGDXpCiBZJL0wx1fs/ZImkoZ0uuewOVTGggFgcq\nUxZMQ0XRdvdoYO0vhJs+EQ5ZWKaGK88tt644UkYS35p/HSrMVHhMOqS1J3/Zlbb+k6340pZ77wS+\n5R1O/pogb/riLiXvuJThHeY214gpvqd0gUHJV8XE+Cx0NFQFtnELf3duc+5Wntc8B/Oa54SOP27p\nuOD4saHbUrFYwPIPu0dJPQk4/vbqtH+9llcJe/0UWJOfB1GZFSBZhZahIaZbSOoJP8bo6dBUBTFL\ng9vTgqJuw+zw5SHh6mVd32TLyCkyYpeItyZO44V9hX5U6JXg7RsURUFh9UxY41+B7dlIWAZqrKQI\njWglnouck0M6UYdpo2sxbXQtHlqyEYWVR6FuyhoMEhazJ4Bjq+hoTQjPByfvXqaRbqoxbO/JImbq\nouc44MtbLpzSjounUNnadExanNWzpL1Aq1z6Ha45CKj1rCpAmmON11mzeD57oSeNhL/wJCo8uEhY\nOlzVLbG2gwsYTt5xRp7phBH4neaJn+XtDXBi8suYeLlVIJ9B+v+COgjVGoKXjwsPguyB4z9z/rsy\nNBW9/QRWo+/9Kap0DPUsVl2TKM/VCHw/I/8iyeP1tT348f8+g9gR6+EO1ACuIQiZQ6vugqIAxS3j\nAU9HvujCLChQzaLoC6ym+kEIdWsXii7i4HF5FzZT4RvoKxeEGsxnAZjQ0rS6okKvQhcQ0DywHQ8O\n8ZUCs3lb/K54eCJrboNVxWrfPRWVcbqIzjssVBUbgDbiDeSKp5XNzYFAZHm/B1EXrxk2mexblx9V\n9hmnZbIbgi61yt8JRFx+mFOqSnncXkZCcmvL7U55rJlnm4dZtfzlf3TNSTh7zKmh35PYjdv8ncLU\ntYALN8z7kNb9a7JUU7jhAZoMN29C0NqX3eomW4BUxZi1ThSAKNBUBcmYgfqqONydI6ES2UrSAhYz\nAD+jHxAvYpng5Xr+ilgSs8bXi0XloglzUUmakXcLyDm5gJiPripCHpQjVhL394aqMdM8xf/A1QEo\naK5Jipp8txict7pUBTp7cxjMFuF2t+KktuMB+PKWDekKUXWRNMt/G7LkbYJtH4qvg9FOFzky2QkJ\nYJ6Mp9iIaVbwuXV05L0cYpVDVMSmpLJDBifvGHvuUnED8o8jD2r1FlbPFORbEaf7ajXbpQ575RoO\nAPWsKUYRhCWr3fjxuThnQfnikqvrmYYq7jl3mxdUujyrZ5Z3Q0U5eZeqNxJPRc7JY9naHiEA43bS\nZ3XZup7gHHDPgOwV83QoCi+1I1CT/SC5FFRCyZ9b3takpbAVapWv3iDVkrt8AcF6rTdSQZZRFvOI\nSZZ10XHhoCg8BnLuBPds8GeBjRhxg3fQo+SdSa6F0bwBm3qD7UoPFCLyfp9hREMKN867BjcvuEF8\nxt08Lvt3XP+5uOGoqwPH7U/y5pa1S8KTO9QSy7s05t2UpOpN9bHaQO05fzlzyzuMGPnLP6xjG/+e\nZIjlffHJ4zB+RFVACW5fQL9fETHN/kJ5b2ceFgAASw+St6oquPC4oDu+QrIkeQ9s/pKloQefMK5f\nPBvzpjQibUrk7Oplcyxn3E8f2YxPnDEJx83wBXZ4xj9A5+vKc6dhFksMvPCEsRjZQL8/5+QDSXma\npsLZOg7FdVP9awxJ2otp/vg4cTbX+Za36ypI6v51N1VVwiME67cPQFVUnNi+AIBfTiff0zE1I/zs\neAauugbQOefQG1g3L4mYDFWjiyJmeXuqLeLoHPa2MSDwUGxaRj/gmvNmubdJMQsgRKEd5cDIW/N/\nG0VlqGwMHfXU82GOXAU11ReYJ/n7AKDIiJfY1LXb0ZTG+NagZgPgaxYQ4ru9eYJWERkYqiEWdfXp\n8pBYaSIeHAMFL49ETIfCeoDzkM+ytUHyVpmSn+w14Za8Yuap7oHmwstUwjI1arm7vuVcTG1mx0jN\nhQIxawIt3Y8RqTbUWLWB8Sqai6ydp78VtmjpzxQDx1tSCaCzg4Z3+D1/bPNTeHzz037mPQn3KO5v\nROT9PkRDog6VVvnKmbvGLSWFpmRDYFtIXtY+QxXkHX5SlcfaWcy71FoZXdmBzx7xcXxp9hXB47ik\nNCPv0pp3gFs1VIq0FMJtHmJ5nzJnBK67ZFawZn4foCgKrr5oBmbWURWz0sQdAEiYMVHrbGkmkjFd\nkLKmBUkLAJKmFONn19DMFjgcfOlVmbLw6bOmoDImddjytLJEx3qplOeoiW1YMK1ZzB2AwPOTCLsG\naQ5ly5vfS/klHUbeimv6nhNO3jVJcbzreUhLY2itoZ6ATN5BIqajwkoHqiHkMVZYaehvLkL+9YXi\nM/k+xIzdh5scj8BQLKiJQSjxAXhKOXl7fY1I6Wm4Zl/g+NLKCQFXF2JKybgRVC5TWaxXImdu9QF+\nXH+4RLiiTscwsq4ON1xKQz1hiZIJk96zwWxRsjqZ2xw5VJgp8ZzIz5x8DYCfsEk8DXllANvct0SN\nNo9Db9hRrtYG0JJD/qyLOTviaRH+IPkkYqaGfNHBUM6fI0/3NQ9Kx2N0rGJzRFATr0KMe5mE5e0K\nsR5O/rmCFPPWHDom1QOxTdibJtFxSc/tfW89KMJYils+twcCEXm/j/GlC6ejrT6JY46gJQ+cvGWC\nGsvqoxtq4uUn2EfwOHRYpjbgW+Ya++2kQmq5p9VNLluAcLe5RuiPKszyPml2G7568UyMaPDJ6+On\nTUR7QwpjWqk1EbS8939ayOSRNbjsiPNxycTzsajjxLLtluRaNzUqQiMatygKNFULvDiC5E3nroy8\nSxwnSSNoeZeiPu6Tt0zEHBVSA5rQBUiAvP2xcs+HHDoI08h3XCI8LPwl3taQFDK6iZiBasn676j3\n3fiJmA5VUZHQ/WssXfAYugqST6Hw5mwU101FXaU/3ngIecvEWSi6mKgfDUVzoTdtgIci4iELkLRR\nCaIEO7uVhif880slmpoaIG9P9ZOkOJRA1QDvXS/FsaUua45OiW/e+A7RwS6szDNlMNlbqVaeyt8S\n2MghLXljShd79BqY5rzJa8jpta8k//TVANl5t3aXS6USxwCIhoZqdi8kmeOWDno9Zx45HjGTajP0\nZf2ySRKKQnq/AAAbb0lEQVQbCJxfHo+i29CbN7BrTIjx8Xtijl6OdblV9CBXFzkAckKbodN7Egif\nlNxzy6I/suaaYEXJgUJE3u9jTBtdi29/4ihhhXLXuKym9qULp+Nri2dhTMv+eyC55T0cefPt6YSO\nb19+JCrfpqv6iDHUHdZWzVyKIdZFzNQxqaM68PI5dnoLbrr8SBh6iOUdco79AV3VcXTLkaFjNAzV\n713NSJeTN19Y8aoCUzUQtyTVOmF5S33R6ZkCf6UMyfIOJW/frZowysm7UnqRlxJj6THyNXLrkkus\nAggo1vE6esfxfCEPRkS1FTGcMa8DC6Y14XPnTEW15Sccjm32a/m5cE9ausbSaxC1+P31cLvbxLMD\nhJO3vMAp2C7aY+MB0HI7opAyyxsAKqQmL9yKa2sIL1NEqbWmlv825PvUW/Sbhvglf/52N6RxkRyO\naU+34YNjTgsQZMqS480avEwaWsUuaPVbQBQvcDwAjKroQEqpFp2+xjbV4rR57SK0YW8eL/bV0n1C\nOnU48FBGQzUTKUr6IaW8QRu3jKqvQ8LSkc07yG0ZAQyy6xS96/05mDLC9x7yTPWkkfS9H9K+y7LP\n0jE4BkawBQ4kt7mha9QL4egY1ZzGly+cXrbodPUcTM1EOvHOQmtvFxF5RxDglrfspo5bOsa1VQ13\nyD5B3UPMW1jm8IZ/2YXg46dNxFXnHYGFU6hs4R7lW4eBoRnCZbuv53gnMHVNvMy5vCTPOOfZ2jUx\npg6lKIhJ8WruNi9VsCq1vNvSkmBMyAtVJtQwy1te1ISRe3wYy1tkrEtWolySyMvRbMcT18jjoYqi\nIBEz8IkzJqO5NonqmL+gbK3zn9EjxtJrr5LIPVGywCgNxUwf689XmEuYuJqwVfNFF0nTAnE1EVMP\nI++URN7cyjv32NE4//gxZfsSV99ziZFENos6ThL/z/UQZOudZCqRe/HkQGy/QiJfRVGwqOMEJGzf\nQ5OWyRsKiutoAqLesLnseAC4es7nME+7UCwARjdX4YLjx4pyPrenFflXj5PGT3uNDwfujeGWt73N\nn6ch0Bh52kxhVEsFXI+gp9dD88CxwXOwRe8lp4zHF844DpW5CQAANUkt85SRRJznHYQtJFzd98rx\nksGqbrg1a2jioWvgklMmYOro2jLvQ09uV6gH5kAhIu8IAsfPpAlJs8aHtxvdXxhTORJAmHVIMb1u\nCh1P24K9Om/M1DFjXB3SZhKWZgaSrvYWPEZ6INzme4IpWd5claqmgvc7py8MTmxFtwhLiqNazHug\nqzqumvFptPfRspVSWhhd6ddUf2NxeQWCjHgIecsItbyHiXnLXp20Qq1dXv8L+Ja37bpoSNDnkBBg\nzsRgDgYAVPMFDILiOVzJri3V4o+x1PKWyPu4GS1oqfWvIWUl8K3510IfkhY4ro6PnEItyUVzR9De\n6LYpuqrFJaW9tnrqrm9IBpvoANSDcvq8Dnz9yC/jxBHHBM4vo3YoqONAXE2QCQBMrh+Lr5TkfJSF\nPzxdlNQBCP09aNLzLSc+AsBXPngs4GqC+NIhxxMoYlw8h+Wy0yYKWWRSjAnLXHZpA0DlllNwztgz\n/HOxPIiKBPME9jYF8hIA6k2Z2O7f95baCmiuf2/5d+SLDlRFRYfH+ruzkreUZHmHClY5BkYIqWMF\nbj99RovJbfQjVy+rfvHnItwDc6AQ1XlHEDh9XgcWTmt+227qfcWF4z+ICdVjMatxeuj2SbXj8b2F\n3wyWK+0FNFXDNXM+H3AN7y0Sehx9hf6DYnkbuubXm7JabRHzZqRTKxGX/DIxpSz6CTVjYbkZAD1l\n7D0i5WeOj24O96x8ceZnsHFwS5m7tBRhZYmyNR6WkAYA7eZErCg8g+aUb/0J8nY8zG2cgXV9GxDL\njMTZx00qO77GCo77psvmYiBbFHM1qmoEsCV8DHweZ42vx8dOnRgcu6WjLl6LOSNH47lupn3u6Zg/\npQknzaZCL6+s7mJSpdQzInsqvrZ4Nrr6cuhVN4nvT5oWvvrJo0TYoCXVhIUtR+GxzU/R87s6IHHC\nCGUatrxYg/icf9APSohGVYKJi6RE2lh87vj3Jox8PdZ61elpQoKFX266bC5Wb+7D5JE1UFdUwovv\nYseXPwfHTm/GP5dyOWBK3o01CXz90jlYuWEX/vPe10DsGHXtl1xDtVGHk9tnYkzlKNz94t+xsZMu\ndqaMqsH0TbUY0ZjCX5esByGK8C6kzBTGj/AXXifPbsPmt6rQyYVYuDgMlzw2LZCiJRZZKTOJmDK8\n5U1cHUdPbcIf/klbpBbfnIPYnL/BNQbE9pgxvOt/uGf9QCCyvCMIKIpywIkboC7Go5pn79aqTUuZ\nrfuCpmQjUua+kT9A3c5pI1VWc/5uwNJV1roRqGSJYaUxb9EUAcGyt9KSLz6DpIS9Dc3AhOqxGF9V\n7sLlGFc9Bie3Hzfsdo6w8IdcBx6WkAYAU5Nz8NkjPo6zR/v19kdNpkQ+sb0auqrjkknn47w5c0Q+\nggzZbQ4A7Y1pTB3lx65HVUqysiXPkio66ZW7qrkVP76+zf/Q1f0sZQCmqQUy5mXhoLilo70xHcgb\nOG56G1rqgs9jXPYGuDpOnOV/XzpBVdZ4YlmYlRhIOvR8adLW+iTOOYY1B5LqpsPCHzWDM+H2NqCt\nOE/McXtjGiczmVut6Lv+wyz3uso4KhL02r2S52DyyBpceMJYv1lKyTV091PCHVXZjqnG8aLne1NN\nAl+4YDqV2iWqyI8wVB2WZiIVN/CJMybhyxdOR3tjGpMa/C6AHz91Mlrrk2KR1TdUCCRHUstbCx0P\nAMAxkIobuP6js3Hy7DYACohtwVN5A52g5X3JxPOFpxAID58cKESWd4QIIfjY5ItQcIvvaAGxr9B1\nFW5nG2yjgCvPOx8AUJP21a+AYDKWXH5klpK3SJ0t/56rZn66/MO9wHnjzsIDax7CxJpxZdsaE37o\nZTjytkwd0+qCNevnHDMacyc2BKoBhgOPaZdm1nPwpD7e31lGW30SmzuHUFc1/MtWvi7iagGXv6Vr\nAZd02Eu7OdmI5mQjtmd2oqmy3LshW84nzhyB8SP8fToa0wAUmEocBZIVVutpR7ULyeOEEYcChS7M\nJCI6YWYrTpzVhhNnt+Hz/1WEmuqDSozQZ/mCo+bi0aWNWPyhCaFzoBerwIVd08N4ssZVjcZLna/5\n1QESqtIWyFZekkUt81HNVP50guT+lhdn3PuSZImH3kAN1FgWtudn4C+QmvzMbZyJf215BgBNPj12\nuh8uaapJYHl3HGqKJr8ljSRivN+BY+L85o/h3rcegJryLWuAVtmMba3EkZMace+W5diapS4U4hgB\nsaKjW47ElNqJeK2b9q1/N2PeEXlHiBACUzOHVak70DCYhKuzdRzqE9R6a29M4ZxjRokOaTweXB+v\nDVjbZoj4DDCsmN07wokjjgnGbSXIRCFaNZYgbKyqqgTaq+4Opmbg5gXfGHZxAAC5l06iL9sPBD//\n6AcmoK0hxayr0rHTfyulxjtfuXB2YB/TUAPkHQ/pLqcoCq6Z83m81rUC0+unlm3XVA03zbsWf1n3\nMOY1B88/aSQlNiPXgEJsA5QEJRdNU8X9VhUVcT2GrJMrkTv1O7DpJIbCiqNRlQ4nlTEtlbjinOHl\nluOZDgwZW2FU9pZpP3AsnnQBptVNxuyQMFgypkvtR6llfvyMFpxz7KhABYvrlmfX88WS21/ni+WE\nYGTFCDQmGtCSKs+h+dAxo5B5eRJeGKKqZykjAVPKjxhd3Q4vlxbkXZr1P7atEvW91YK8DcUs03pI\nmynoqg7HcyLLO0KE9zOSMQOXfmBCwPpUFAVnLRgl/q6NV+PauVehxqoWtdMAy1QPwbvUKyGARR0n\n4G8bH0d7upwggXIvwb6gcpjOdhy3f+kUhDlPYqaO044qb4RSio9Nvggv7HwFExpaA5/HTC0QTx7u\npW1qJuY2zRz2/PWJWnxy6uKyz6tSFlrrkujcUgN97Aa/R3XJjeTiIh111Th/8Ww8+vwmHD2Fkpii\nKHBcD4CCUU27n6fhoCkGiqtnY+KY6mFzH3Z3jRPaq1D7Vgp96BYKZJapBcIbANA7SGPSFSG9Cnij\nkXFVo8u2AfQ6bzjqK6GeBUPXcMmcU/DCE4/T79aswH6WqQXaKB83pTyMJHdPTJrhXRJrrCp05roD\nuQ8HGhF5R4hwCIJn/u8OYaSol3Tt8t9T7z57nz36VBzVNCvUnQr4mfEHEqX913eHay6eiQeeWheY\n+yObZuHIpvIOfg3VCZw4bQyeHqB61/EDYHGdd/wY/PTPWRQ3TII3SC3x0kXYpJrxWLVrNU4ffRLG\n1ldibFu4FT1hxL6Ve/L5U/YxPUpTVZw//Rj8YvkGuF10XsMWmJUpujiZPKqmbBscE1+c/CWMqKsu\n38awu/CWoer4ztHXI+fky/bTVQWktxVesg+jnWNw6YfKEyPlBWKlVU7eAK3+6Mx1v6sJaxF5R4jw\nHkLpy4n/fRAMbyiKMixxA8O7+A8WJnZU42sds/e8I8MHpx8JrO2GSzyMZuWP+xMzxtZhbGsVVm30\nPQSliYeXT/kICm4xkMAYhpHDtOfdEy47bRLufvRNXHRSeV7D28XMhmn45lFfxdeefx1AeF+Bs44e\nicqkhWOOaA58fvnpk/Dy6i6MaWh6R9LE1bEqhFG/rqtQMrUoLF+A6qnhYYFKSU2wKhHufeClm2Hh\nkwOFiLwjRHg/4GCw9x5Qmhl/uCFhxHHxxPMO6Hc01iSwamOv+LvU8k4YiVBteY5vXDoHb27uxbi2\nfVNIbKlL4tpLyj0Pe4vGZD14NnxYuMTQNZEhLmPhEc1YWELo+xO6poqSjPgwuvNT6yahVZuADVuK\n+MAJ4eWtnLwjt3mECBH2Cj/43AK4XnnSz26SzQ869kfM+72OxupgedecCeHW4XAY3VKB0S37ZnUf\nKBxKizZFAbhBP5znPWkkcN2xl9Me4lY4ZY6pot6RltSBW2iUIiLvCBHeA6geJpt4yqgavPRmF2aN\nrwvdfjBxqLnND0U0VvtW9e1XH79XMfxDFeYwCmXvJj555iS8sbGPlX3tObSkKsqwxA0A46vH4gfH\n/ntkeUeIEGH/4NjpLRjVVPG26qbfbZjvASI60JgyqhqTOqpx9NSm9wRxA76lezBx9NRmHD2VWsn7\nK6fz3SRuICLvCBHe01AVBR1N+67xfiDwqbMmY/32gVDVtAhBGLqGr148fKnZ4YTT53Xg2eXbUfUu\nqDjuDfzQ0qEYXBoeB3Qpd/PNN+PDH/4wLrroIrz++uuBbc8++yzOP/98fPjDH8Z///d/H8hhRIgQ\n4RDC/ClN+MjJ4/e8Y4T3FM4/fgx+eOXCQBOZQwEXn0wz6WVltsMBB8zyfv7557Fx40bcc889WLt2\nLa6//nrcc889Yvt3vvMd3HnnnWhsbMTixYvxgQ98AGPHjj1Qw4kQIUKECBHKILvQDyccsCXQkiVL\ncPLJJwMAxowZg/7+fgwNDQEANm/ejMrKSjQ3N0NVVRx33HFYsmTJgRpKhAgRIkSI8J7CAbO8u7u7\nMWWK322lpqYGXV1dSKVS6OrqQk1NTWDb5s2bd3u+6uoE9P0cI6uvP7RigYcronl854jm8J0jmsP9\ng2ge3znejTl81xLWSjV59xa9vdn9NBKK+vo0uroG9+s534+I5vGdI5rDd45oDvcPonl859jfczjc\nQuCAuc0bGhrQ3d0t/u7s7ER9fX3otp07d6KhYe/EByJEiBAhQoT3Kw4YeS9YsACPPvooAGDFihVo\naGhAKkVrTdva2jA0NIQtW7bAcRw8/vjjWLBgwYEaSoQIESJEiPCewgFzm8+aNQtTpkzBRRddBEVR\ncOONN+L+++9HOp3GKaecgptuuglf+cpXAACnn346Ro0atYczRogQIUKECBEAQCHvNBj9LmF/x2Gi\n2M7+QTSP7xzRHL5zRHO4fxDN4zvHYR/zjhAhQoQIESIcGETkHSFChAgRIhxmiMg7QoQIESJEOMwQ\nkXeECBEiRIhwmCEi7wgRIkSIEOEww2GTbR4hQoQIESJEoIgs7wgRIkSIEOEwQ0TeESJEiBAhwmGG\niLwjRIgQIUKEwwwReUeIECFChAiHGSLyjhAhQoQIEQ4zROQdIUKECBEiHGY4YF3FDmXcfPPNeO21\n16AoCq6//nocccQRB3tIhzRWr16NK664Ah//+MexePFibN++Hddccw1c10V9fT3+4z/+A6Zp4sEH\nH8RvfvMbqKqKCy+8EBdccMHBHvohg1tuuQUvvfQSHMfBZz7zGUybNi2aw71ALpfDddddh56eHhQK\nBVxxxRWYOHFiNIf7iHw+jzPPPBNXXHEF5s+fH83jXmDp0qX4whe+gHHjxgEAxo8fj09+8pPv/hyS\n9xmWLl1KPv3pTxNCCFmzZg258MILD/KIDm1kMhmyePFi8o1vfIPcfffdhBBCrrvuOvJ///d/hBBC\nfvCDH5Df/va3JJPJkEWLFpGBgQGSy+XIGWecQXp7ew/m0A8ZLFmyhHzyk58khBCya9cuctxxx0Vz\nuJd46KGHyB133EEIIWTLli1k0aJF0Ry+A/zwhz8k5557LvnTn/4UzeNe4rnnniOf//znA58djDl8\n37nNlyxZgpNPPhkAMGbMGPT392NoaOggj+rQhWma+PnPf46Ghgbx2dKlS3HSSScBAE444QQsWbIE\nr732GqZNm4Z0Oo1YLIZZs2bh5ZdfPljDPqQwd+5c/PjHPwYAVFRUIJfLRXO4lzj99NPxqU99CgCw\nfft2NDY2RnO4j1i7di3WrFmD448/HkD0e94fOBhz+L4j7+7ublRXV4u/a2pq0NXVdRBHdGhD13XE\nYrHAZ7lcDqZpAgBqa2vR1dWF7u5u1NTUiH2iefWhaRoSiQQA4L777sOxxx4bzeE+4qKLLsLVV1+N\n66+/PprDfcT3v/99XHfddeLvaB73HmvWrMFnP/tZXHzxxXjmmWcOyhy+L2PeMkikDvuOMNz8RfNa\njn/84x+477778Mtf/hKLFi0Sn0dz+Pbxhz/8AatWrcJXv/rVwPxEc/j28Oc//xkzZszAiBEjQrdH\n87hnjBw5EldeeSVOO+00bN68GZdeeilc1xXb3605fN+Rd0NDA7q7u8XfnZ2dqK+vP4gjOvyQSCSQ\nz+cRi8Wwc+dONDQ0hM7rjBkzDuIoDy089dRT+NnPfoZf/OIXSKfT0RzuJZYvX47a2lo0Nzdj0qRJ\ncF0XyWQymsO9xBNPPIHNmzfjiSeewI4dO2CaZvQs7iUaGxtx+umnAwDa29tRV1eHZcuWvetz+L5z\nmy9YsACPPvooAGDFihVoaGhAKpU6yKM6vHD00UeLOfzb3/6GY445BtOnT8eyZcswMDCATCaDl19+\nGXPmzDnIIz00MDg4iFtuuQW33347qqqqAERzuLd48cUX8ctf/hIADX1ls9loDvcBP/rRj/CnP/0J\n9957Ly644AJcccUV0TzuJR588EHceeedAICuri709PTg3HPPfdfn8H3ZVezWW2/Fiy++CEVRcOON\nN2LixIkHe0iHLJYvX47vf//72Lp1K3RdR2NjI2699VZcd911KBQKaGlpwXe/+10YhoFHHnkEd955\nJxRFweLFi3H22Wcf7OEfErjnnntw2223YdSoUeKz733ve/jGN74RzeHbRD6fx9e//nVs374d+Xwe\nV155JaZOnYprr702msN9xG233YbW1lYsXLgwmse9wNDQEK6++moMDAzAtm1ceeWVmDRp0rs+h+9L\n8o4QIUKECBEOZ7zv3OYRIkSIECHC4Y6IvCNEiBAhQoTDDBF5R4gQIUKECIcZIvKOECFChAgRDjNE\n5B0hQoQIESIcZnjfibREiHC44ZZbbsGyZctQKBSwcuVKzJw5EwBw3nnn4UMf+tDbOscdd9yB8ePH\nCz3rMHz0ox/Fr3/9a2iatj+GHcDOnTuxbt06zJ8/f7+fO0KE9yOiUrEIEQ4TbNmyBR/5yEfw5JNP\nHuyh7DUefPBBrF27Fl/60pcO9lAiRHhPILK8I0Q4jHHbbbdhy5Yt2LZtG6699lrk83nceuutME0T\n+XweN954I6ZMmYLrrrsOs2fPxvz58/Fv//ZvWLhwIV5//XVkMhncfvvtaGxsxIQJE7BixQr89Kc/\nRV9fH3bs2IGNGzfiqKOOwg033IBCoYBrr70WW7duRVNTEzRNw4IFCwI9ijOZDL7yla9gYGAAjuPg\nhBNOwJlnnokf/ehHIISgqqoKl1xyCb797W9j48aNyGQyOPPMM3H55Zfj/vvvx9///ncoioKdO3di\n9OjRuPnmm2EYxkGc4QgRDk1EMe8IEQ5zbNmyBXfddRemTp2Kvr4+3HTTTbjrrrtw6aWX4vbbby/b\nf+3atTj33HPx29/+FpMmTcLDDz9cts/KlSvxk5/8BPfddx/uv/9+9Pf348EHH4TjOPjjH/+Ib37z\nm3jmmWfKjnv22WfhOA5+97vf4Q9/+AMSiQRaW1txzjnn4Oyzz8Zll12Gu+66Cw0NDbj77rvxxz/+\nEQ899BDeeOMNAMCyZctw66234r777sO2bdsOSy9DhAjvBiLLO0KEwxzTp0+HoigAgLq6Otxyyy0o\nFAoYHBxEZWVl2f7V1dUYN24cAKClpQV9fX1l+8yePRuapkHTNFRXV6O/vx+rVq3CkUceCQCor6/H\n7Nmzy46bNWsWfvKTn+ALX/gCjjvuOFxwwQVQ1aCNsHTpUuzYsQMvvPACAKBYLGLTpk3ieN4+debM\nmVi7dq3okxwhQgQfEXlHiHCYQ3YrX3PNNfjWt76F+fPn4/HHHxfNPGSUJqSFpb2E7eN5XoCIS0kZ\noL2M//KXv+CVV17BP//5T5x33nl44IEHAvuYponPfe5zOPXUUwOf33///fA8b7fjihAhAkXkNo8Q\n4T2E7u5ujBs3Dq7r4pFHHkGxWNxv5x49ejReeeUVAEBPTw9eeun/t3eHOAoDYRTHHyGYJlwAMAjg\nAFROSC0STCWCIJCYBhwOwxEqegIkuqLBbRN0LQaBxkBZsdkaDJutmeb/05PJ517eZCbz9bYmSRLF\ncazhcKggCOQ4jm63m2q1mh6Ph6SfVv97VJ/nuXa7XdH+z+ez7ve7Xq+X0jTVYDAobX6gSmjeQIUs\nFgvNZjO1Wi3N53MFQaAoikrZezqdKo5j+b6vTqcj13XfGnq329V6vVYYhqrX6zLGqN1uy3VdrVYr\nNRoNLZdLZVkm3/f1fD7leV7xVWq/39dms9HlclGv15MxppTZgarhqRiAj1yvV6VpqvF4rDzPNZlM\ntN1ui3fn/3U4HHQ6nbTf70vZD6gymjeAjzSbTR2Px+J/4tFoVFpwA/gbmjcAAJbhwhoAAJYhvAEA\nsAzhDQCAZQhvAAAsQ3gDAGAZwhsAAMt8AxJ5C+54P8QOAAAAAElFTkSuQmCC\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72fab5e290>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzsvXe8XVWZ///e5dTba3pCQiAJCSWE\nIJGmoSSgjsg4gmCb4Tf+dCwURUdEQXGs41gYFQvDiIyIiKIIJIAgEBJCgJBKertpt59z76m7fv9Y\nu55zboiQBCL783rllXt2WXvttfden6et55Fs27aJECFChAgRIhw1kF/vDkSIECFChAgR/jZE5B0h\nQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8IESJEiBDhKENE3hEiRIgQIcJRhoi8I7yp\nMW3aND796U9Xbf/iF7/ItGnTQsfdcMMNoWOWL1/OBz/4QQB2797NCSec4O3btWsXH/vYx1iwYAEL\nFizgkksu4bHHHgPgpptuYuHChSxcuJCZM2fy9re/3fudy+VC19A0jfvvv/9vvq/Vq1dz1VVXHdSx\nDzzwAF/72tde9bVcvNbz3wi46667+P73v/96dyNChFeE+np3IEKE1xsbN24kl8tRX18PCBJas2ZN\n1XErVqxg/fr1IZIeCZ/97Gd597vfzW233QbAqlWr+PCHP8zDDz/MV77yFe+4+fPn8+1vf5vTTjut\nZjvr16/n/vvv55JLLvmb7umkk07i9ttvP6hjly5dyvnnn/+qr+XitZ7/RsAHPvCB17sLESIcFCLN\nO8KbHm95y1t49NFHvd9LlizhxBNPrDruuuuu4+tf//pBtblp0yZOPvlk7/fJJ5/M4sWLGT169EH3\nq6+vj09+8pO89NJLXHHFFYCwAPz0pz9lwYIFmKbJypUrufTSS1m4cCEXX3wxS5cuBYRV4IILLgDg\n1ltv5atf/Sqf+MQnOO+883jve99LT0+Pd53ly5czffr0qmu98MIL/OM//iMXXHAB73vf++jq6gKg\nu7ubD3/4w1x88cWcf/75fO9736vZ18p7ueqqq1i4cCHz58/njjvu8PatXbuWSy+9lAULFvCBD3zA\nu85I26dNm8b+/fu9893fy5cv5/LLL+fqq6/mM5/5DAD33nsvF110ERdeeCFXXnkle/bsAcC2bb7x\njW8wf/58FixYwC9+8QtvrL74xS8CsH///pD15MknnwTAMAy++MUvsmDBAi644AI++clPVllMIkQ4\n3IjIO8KbHhdddBF//vOfvd8PPvggCxcurHmcbdssWrToFds855xz+PSnP82dd97J1q1bARg1ahSS\nJB10v9rb27nuuus45ZRT+PWvf+1tt22bxYsXoygKX/7yl7nqqqtYtGgRH/3oR7nppptqtrVo0SJu\nuOEGHnvsMdra2rjvvvsA2Lp1Kx0dHYwbNy50rVwux8c//nGuu+46Hn30UT70oQ9x9dVXA/C///u/\nzJ07l4ceeogHHniArq4uLMuq2VcXP/nJTxg/fjyLFi3il7/8Jd/97nfZt28fIISiq6++msWLF3P+\n+edzyy23HHD7gbB+/Xouv/xyvvvd79Lf389Xv/pV7rjjDh555BEmTpzIj3/8YwD+9Kc/sXr1ahYv\nXsx9993HXXfdxerVq0Ntff7zn2f69OksXryYn/3sZ3zuc59jcHCQJUuWsHv3bhYtWsQjjzzC1KlT\nWbly5Sv2LUKEQ4mIvCO86XH66aezefNm+vv7KRaLrFy5knnz5tU89oYbbuA///M/KZfLB2zzO9/5\nDldeeSUPPPAA73znO5k/fz533333Ienv2972Nu/v+++/n4suugiAOXPmeNppJU477TTGjRuHJEnM\nmDHDI85ly5bVvNcXXniBUaNGceaZZwLwzne+k127drF3717a2tpYsmQJzz//PPF4nP/6r/+is7Pz\ngH2+8cYb+dKXvgTAhAkT6OjoYPfu3Wzfvp3BwUHOPfdcQJitb7311hG3vxKSyaR3P21tbbzwwgue\nteO0007zxuepp55iwYIFxGIx6uvreeihh0LWlkKhwPLly/nIRz4CwKRJk5gzZw5PPvkkra2tbN26\nlUcffZRiscg111zD2Wef/Yp9ixDhUCLyeUd400NRFC688EIefvhhWltbOeuss1DV2p/GzJkzmTt3\nLnfccQezZ88esc1EIsFVV13FVVddxdDQEIsWLeLrX/8648ePf80TfXNzs/f3Aw88wJ133kk+n8ey\nLEYqVdDQ0OD9rSgKpmkC8Mwzz3gEFcTQ0BBdXV0hC0Q8HmdgYICPfOQjWJbFV77yFXp6erjyyiv5\n1Kc+dcA+r1mzxtO2ZVmmt7cXy7IYHBwM9U1VVVRVHXH7K6Gpqcn72zRNfvjDH/L4449jmib5fJ7J\nkycDMDg4SGNjo3dsOp0OtTM8PIxt21x++eXetkKhwBlnnMFJJ53EjTfeyK9+9Ss+//nPM3/+fG66\n6aZQexEiHG5E5B0hAnDxxRfzve99j5aWlpo+2yCuvfZaLr30UsaPH19z/8DAAC+//LKntTY2NvK+\n972Pp59+mk2bNh0yLa27u5sbb7yRe++9lxkzZrBjxw4WLFhw0OcbhsGaNWtqCiGdnZ1MmTKF3//+\n9zXP/ehHP8pHP/pRtm/fzr/+678yZ86cA17r+uuv58Mf/jDvf//7kSTJG4OWlhYymQyWZSHLMrqu\n093dPeL28ePHI8uyJ3xks9kRr/nQQw/x+OOPc9ddd9Ha2spvf/tbHnjgAe+6g4OD3rF9fX0kk0nv\nd1tbG4qicN9991FXV1fVtrs6IJPJcMMNN3D77bdz7bXXHnAMIkQ4lIjM5hEiALNnz6anp4fNmzdz\n+umnH/DYzs5OrrzyyhHNuKVSiU9/+tM8/fTT3radO3eyatWqEaPKR4KqquRyuZoa9cDAAOl0milT\npmAYBvfccw8A+Xz+oNpevXo106ZNIx6PV13r5JNPpre3l1WrVgHQ1dXF9ddfj23bfPnLX+aZZ54B\nYOLEibS3tyNJ0gH72t/fz6xZs5AkiT/84Q8Ui0UKhQLHHHMMo0eP5pFHHgHgd7/7HV/+8pdH3A7Q\n0dHBhg0bALjvvvuQ5drTWH9/P+PGjaO1tZXBwUEefvhhb2zmz5/Pgw8+iKZpFAoFrrjiCjZt2hQa\n93PPPZff/OY3ABSLRb7whS+wb98+7rvvPn70ox8BwgoyZcqUgxrvCBEOJSLyjhABkCSJCy64gLe+\n9a0jkkEQ//Iv/4Ku6zX3jR07lp/85CdeVPiFF17Itddeyxe+8IVQBPrBYM6cOfT09HD22Wd72qaL\n6dOnc84557BgwQIuu+wy5s+fzymnnOKtPX8lLF26NOTvDl4rFovxwx/+kFtuuYWLLrqIT3ziEyxc\nuBBJkrj88sv53ve+50W4z549m3nz5h2wr1dffTWf+MQneNe73kWhUOCyyy7jS1/6El1dXfzgBz/g\ntttu48ILL+TPf/4zN998M5Ik1dwOwvJx88038+53v5tUKuUt8avEO9/5TjKZDBdccAGf+cxnuOaa\na9i/fz/f/OY3ufjiiznrrLO48MILec973sN73/teTj311ND5N998MytWrGDhwoW85z3vYcKECYwZ\nM4bzzjuPdevWceGFF3LRRRexZcsW/vmf//mgxjxChEMFKarnHSFChAgRIhxdiDTvCBEiRIgQ4ShD\nRN4RIkSIECHCUYaIvCNEiBAhQoSjDBF5R4gQIUKECEcZIvKOECFChAgRjjIcNUlaenuHD2l7LS1p\nBgcLh7TNNyOicXztiMbwtSMaw0ODaBxfOw71GHZ0NNTc/qbVvFVVeb278HeBaBxfO6IxfO2IxvDQ\nIBrH144jNYZvWvKOECFChAgRjlZE5B0hQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8I\nESJEiBDhKENE3hEiRIgQIcJRhoi8I0SIECFChKMMEXlHiBAhQoQIRxkOK3lv2rSJ888/n7vuuqtq\n39KlS3nve9/LZZddxo9+9KPD2Y0IESJEiBDh7wqHjbwLhQK33HIL8+bNq7n/a1/7Grfeeit33303\nzzzzDFu2bDlcXYkQIUKECBH+rnDYyDsej/Pzn/+czs7Oqn1dXV00NTUxZswYZFnm3HPPZdmyZYer\nKxEivGmhGxZL1+6jWDZe76542NuXZ822/te7G0cNXtjYy879wyxduw/Lsl/v7rxq9GWKrN8x8Hp3\nA4D9AwVWbekDoKyZPPdyN7Y98tjmSzovbOw54DFHGoetMImqqqhq7eZ7e3tpbW31fre2ttLV1XXA\n9lpa0oc8Z+xICd8j/G2IxvG143CN4d2PbOTXizdw3twc11x+6mG5xt+Kf/nm4wDc/+13oSiHTn/4\ne3wP9/Tm+NEf1ni/48k4F8075rBe83CNo/vcf3XzQpobEoflGn9rX+79+jv4+d0vsmzNPmRV4aK3\nTq55/I9/8SzPv9zNdVecytvnTHjF9o/Eu3jUVBU71JVuOjoaDnmlsjcjonF87TicY7hhu9BwN+wY\neMM9p737syTjh2YK+nt9D7dWaKobt/dz2tS2w3a9IzGOXXsz6K3pw3qNg0V3zzArN/YAsGnnAKcd\n117zuA3Oc3hh/X5mTWw+YJuHegzfUFXFOjs76evr8353d3fXNK9HiBDhtcE180lIr3NPqqEZ1uvd\nhTc8SroZ+m2aR/+YvZFcOJZtY5jiG1EPYAVqrheWgsHh8hHp18HgdSHv8ePHk8vl2L17N4Zh8MQT\nT3DmmWe+Hl2JEOHvGq6LTnrjcTdGRN6viHIFeRtHsc/bRb6kv95d8GBaticQqcrIH0mLY+bP5N44\n5H3YzOZr167lW9/6Fnv27EFVVRYvXsz8+fMZP348F1xwATfffDOf+cxnALj44ouZPLm2ryFChAiv\nHW9E8tYj8n5FaHp4jEzz6CfvQun11byDQWeWZeP+UuSRddn6VAyAzAE072x5iKZE4yHp48HgsJH3\nrFmz+NWvfjXi/rlz53LPPfccrstHiPCGwf6BAo3pOOmk+Nx6MkXSCdWbEGqhe6BAQzpGOukf0z1Y\noLk+QSJWHbiZzZUxLZvWxmRou+Wazd+A7H0kzOYDQyUUWaKp/rUHSFm2TVd3jgmj6pEliZ7BAk11\nCRLx8PMoayZ9QyXGtde9pusVSjq7e3OhbYPDJbJ5jaa6uLetN1MkGVdoSMcrm6BYNtiyJ8u49rqq\ndwOEANWXLTKmrbqvA0Ml4jGF/mzJu+dK2LZNV0+Ose11ntnZtm329OUZ116H5IxTXeBdz5cM9vbl\n6WxJeedYts2W3VniMZljRjfSkynSmI6FYiJ2dQ8zpq2OmFqbZGudUwslzbdmmJZV9bdl2WzenSGV\nUEknVOJxxfuOhgo6fZkidakYqYR/naV7V/B/G+7lIye8n4s7zjng9Q8VjpqAtQgRjkaUNZMbfvYs\njXVxvv+pswD499uWIQG3//v8mufohsXNd6xg9nHtfPQfZgLQny1x48+X8455k7jk7ClV51z7388A\n8D8VbbpKhvw6cLdpmWzL7mBq85SawsOR0Lw/++OlQPW4vBosfm4X9z6xlcvPO45Tj2vn33/6LBOm\nZ7A7N3LdnH+jOdEEwDf/70V2dg/z7Y/No7059aqvd/MdK+jLlkLbNuzKcO2tS0L3c8Mf7sYup/nF\nxy6vauOexzezZNdKmuoVvnvlZVX7n1i5h3v+spmvXHU64zvqve2mZXljB3DlBcdz3pzxVeev2TbA\n9+9dxZknjuaqd5wAwOMv7uH/Ht3E+88/jtOmdfLvP32W9iZfcFi5uZdfLd7I208dxwcvnAbAqi19\n3HqfiKq//v2z+c7dK5k+sZnPXSFWSGzcNci3fr2SudM7+fgls6r6kc2V+ffbljF1XBM3fHBOjdH0\nEdT8g0vvXFJ/YVMvP7l/rbddSuZomLUSuXkaVqaTz922jHHtddzy/73FO+avu5cAsKJ7JRefeGTI\nO0qPGiHCYYRmiAlhKK8BYDj+tQMZP4tlg7Juhvxre/pymJb9N/vcfBPhkWfvezf/ie+v/CkrulfW\n3K8bZs3tbyT8cevDfGHJLWimxsrNIsh21ZY+9vYXIFair/FZ+kuDdA3v8c7Z2S0ijQcOMrhJt2qb\nkSuJuxZ2De0hPmkDieNfrLm/qzdH4riXKI15ofY1MkVsoKsnrOFXmrbXjrAuf9veLADPrNnvbXtx\nUy8AK17uYSivIaWHyE+7H7mpx2lLRG4/8aI/Zv2Be3VzAGzYlfHb3LsRKV5gxYaemv3oHiwCsGVP\ntuZ+0zIxLfG+BX3uZoC8NSe+oC9bDJ0rN/Wjy3kxxoo4d09fPnRM2RDPOqkcuSVwEXlHiHAYURlf\ndDDapghSssnGtzGsiUm1NyMmt6DPc1XvWnoLB0524pL366F5P71HJF7al+/2tlkBf+PR4PN+ZOcT\nDGnD9BbD45zJlVEa/WVcRaOaaJUDBEC52DW8m2v+egNL9jxb+wDJIjZlNXJzd2iz+1yf6FpywPZ7\nC/6qHsuuHm83IK43EyasSvIeyVRdy6Li3rdhWpiWTWzsVtHGpA0j9jN4vf4KoaW/OMCSwu+Jz3hu\nxPMrz6nEf734E7723HeBcLR7Tisi1WWIH/8Cw8ZwVV8AJFXz/pbragsHZVMck1CqXReHCxF5R4hw\nGFG5tEc/iKU+Zd1Ead9LpvU5/mfdrwF/cnU1hf35Hn625k7+47n/8jQGoCoDl6d315hkNw5sYU3f\n+oO+l1eLxri/TjVI2G/0pWLuhAygW+EI6d5MESnuE0ZBD5MfgHIQEtOjO/8KwIPbH625X2ndj9q+\nl8TxYeuFu7xpW3YnALZR7QEtaQZFxSfvWn0cibzzAQKT6rIMpNbXJH+5xj2qTuCXYdqifcVpq0Yf\na12vp6Ivq3qFCVtOjEzQwf6XtDD5dg3vYcfQLnoKfWim7l9L0fn+y98mOfNZlOZetiUfreoLgBQr\nB/7WqIWyeeSj0CPyjvC64I2UZvBwwqwgU10/OPKWG4RWtze3D/AnJ3epUG9RTMq6pYcmm0rh4EBW\n8x++9DNuW/2/r9ifkfBiz2q+8dz3KRrVpBCc6IPEFyTv16p5W7bFrSt/7hFg9X4bpW0vcsv+mvtf\nCbuGdnt/l4zw5NybKSLFAuRtVCeRMi2bIW2Y32/+MzktX7U/eI2JDeOq+g4gN4nnbNvhB1jWTWzb\nJlN2TMuKUUWufZkScr2vKeZr9NGNZu/LhImx4JiWpbosyZnL2Bd/gd3De6vOryWfuEuuTMuirJtI\nqmjLNqsDNF3BsxAwZe9zTNKphAgEfMkhb9saWRjakFvtPefKe1m+z3cZ5PScZzavJGJNyZLT86G+\nACEhDTV8zu83/5lfrL0LzXnHa1lgDhci8n6TwPUvWrZdMYGaNY97pW2vBat61/LJJz7PtuyOmvst\n2+Kl3rUU9dIBr23VEAAOVV9/uf433LT0m6/6fLcfQfI2TCtEriMJMJpmIsUFIbYmW4Cg2VycP1jy\n/YHByaaSED2zeeC3bdvkdJ9MKv3ouiGIwbKtmtqWi9vX3sXu3F5W9673zgHxXILm/JAGG+hfppzh\nzvX30F3oxbQsBofL2LZd9QxFX6rHqrfYz4bBzdy/9aGq4yzbxjBMYsesIzaxtrm2ss18SQ+ZTHcN\n++RdOSn3ZkpIcX/cCjUEGNO0+c7z/81fup7imb3Lvevphskftz7MZ578Mn0lIaTJkh+xXiwb7O4f\nRG7sQ2l0yLscDnzTdJMhLYdhi/5KEhR1v49lzWT7/iGkhE/Y+RoCxLDdR+yYtfQMD4W254o6iZlL\nSc70a04MlAZDxzyzdzm7zZeRm3tInb6IPbl92LbtRZAXk7tZMfCMr3lXQB2zleuXfImCXgwJoK5F\nJp2IYds2e/Ou8CXh2pJ0w2JgqESuqDNYyrIz/gyJ414C/JgDwzK4e+PveaFntX9fWt5/xnJ1v7Kl\noWqzeby25m1ZNn/peoqVgfaPJHlH0eZvAnQPFvjCT5/l4jMm8fLOQbbvG+J//n0+Dy7bwX1PbuPm\nf57LxFEN/PWlPdy5aCPXv382MyYJ0vjNXzbzyIouvvWxeXS8hsjZIO7fIibbv3Y9w5SmY6r2L937\nHHdv/D2N+jF0r5zOj649J7QsA2BTV4Zv/t+LfPySWcydLrLzPfp8F3c/tpkbPjgHPdVNS6KZ0XWv\nLnPfc/tFAJBhGajy3/aZvLxzkO/cvZIPLZzGceP9VIr5khEycRumTUyt1iZKuomUEGSwfVeZNW39\nXhCNaxbvCfgyQ5p3gBwf2LaY4eQQ0IYkSZR1k49/90nOOGEU557lB9Zc96On+MAFM5h/6niG8hrX\n3LqEc04eiz3xRV7u38R/nHUjsQOMgaZb/P//+SRnnTiGf3nHDD73k6VkpN0kRCBxyKToE7PFnwZv\nB6A50cSG5Z1s2JUhEVcoayafuewUZk5u5cWe1WzdrPDw091V0dvd+XDw0n/d8xLb9g3xb+85ke/+\n5iWuuGgikmKCbGFaJorsE+SqLX384Herue59JzNrShvb9w1xyy+fB+CbH5tHZ3OKVV27vON/8dBq\nxqnT/WsnXkJp9f3QtUzSmfKgR3j3P72de34Dn79iNt/69UpSpz8ROvalbfvZ2TlMc32cz922DOnY\n5SSm+wKQVKHxLVu3n98/v5LkTH9btpynLp6mpBlc/+Ol5EsGiVk+mQxr1Zp3X/ol1NR+8oqBbvhR\n0kPlAnJdmNAHy2F/76833AdAfLLQqH+y5EFSPafQ0SKeUXncc7yUAzlZ+x5iEzZj2LBpcAu5cnXf\nDNPi9kWrKDrmckm2QBZC4pduX06PE6TWcEwXeJ+5ze0PvsykUQ3stzZXxRIM6znyJdFfqYZQ8Z3f\nPUeb7FtB5MZ+5PQw2BJIdugeilr1+bWEuMOFSPN+E2CjE7X50LM72b5PfJCWbXPfk9sAPzr0waXC\nf7Z07T7v3EdWiIIxm7p8Te+1wtXmRlp77EbuZhFmuv6hamn2ryvFMfc+4ZeSve+vIjBm+cbd/PdL\nv+CW5f/5mvsa1BoPFktWi34/tGxnyOddKOkhzbsye5aLkmYguf492eLhZ3d6y1hcTb7HMZvHlXhI\nU3DJ0bAMFu34CwPNKwDxvIediPdn13fTlfMjfVEML3p2l6O1PLVqN893v0TeKJAp1Q7ScbEvI/Yv\nWbMPy7YZGCqHTI1lI0jezrNP+pO1bulelHDZuc+la/ezYWAzt6+9i8cHBUm8vCus+e2vIO91OwYp\nlk1+/dgG1NHbWbRGmFslSZivg1i0XBDzn5buAMS6eq/d/gLL973Attxmb5tml0NLBOzOTeJ/UwgE\ntczmPaVe729TEtaRxc+NUIBJ0dm0O0NXTw7dsFCawgFykmqA5L87f1ixKqQVA2RLeedehCY7arRN\nLOW/vzm9uo+u8UFp2093xo84HypWa+lBzTtoNZJi4t4yWYOd3TnvGVYimbb454unM2/mKN4+2yfI\nn6/9FXvG/L7KvVHWTZZt3hHaJsU0hgs6PYNFL9+BlvYtJA2NYoy27xtClqvzIeS0PANDzvtYg7yF\nZu5bshLTxfcjmXHn+v67nCtWZ4orRWbzCIcSlZGiUrzAQCHrmbfcCdUNsKn000LtwJRXCzenkSzV\nfv08U63j56t15ZST8CS47MM1t9nqa5N+g6biVxOIogdyJQfHslAyQj5vbQTyHtZySJK7QNsMCS8e\neRd8Yqg1BtlymKzcyF8Xe3P+RCkpJprmulWcyzb4wlqmXE3ewTEKEpebgSpE3gEBSDMsUDXkQKT2\nYDHnBWC5SMQVT4iT04JUKpPT7A1EsQ+X/D70JlcTm7iR4lh/nfJAKSx8uglz+lIr+dnqX4aEqoHC\nEHe+fA92zH+PJMW3mtiYge2Oz7aW5q0NBI4TRDE40lI/xaA3U/RiG1yhIISA1hcb5QsBVkEEBA47\nhNubKSI39DM0cTGmFCDvcjUhm5Lfn64B35ozXK6+n6DmXWt5m6aJ96w0AnnbisbZJ43lX981k7NO\nGlO1X2kJC2NlKUd8imOSdueCWJk9TuKaGcc0ITf1hPz6l54nhILebBHd9L8LN2ZgWM95Yyyp1fcg\nqToZR8gNCktSsQXbkpDrhjyXVqYQHs/GeEOkeUc4tIhVJNxPnvIUNy3/ukfq7oTvEnStmsG1siu9\nWnjBOCO8fqZDDF6QTsW1C3qRVervUNp3UyxXTxSGUjs46GAR/AC1V6F5uzm7K8k7XzIOSvMeChCv\npBj0Z/0J1jQtbNv2JlLN1MgXq33Kg+UwWelGONZhMKhNK4bXF89H3uATT7YGebtL2AAKZoA43Ykx\noKGEzeYWyROXED/Gj3LPFKsrMCVisqfpuYFKlQUt9hd88t7W7U/87uQaxEDRHw/TMulveB65foBy\n0xZW9a1DM/yJfqhUYwJWDIYdTasQ8C3bloRqJ8g774zhPV+bQS2gPTvrg0dapy8pOn2ZkhfbYBuB\n4C6XuFS/j5Ll77cKIrmKaxbvzRRDhKZaooJXvkLz3jS4FSvhH7c34z/zXMDEbmbbkGyZTDDOooal\nwRVkhgvV38yY5Hh0S/e+p2DSFu/8eFhrjU3Y4AluFB33k6qxs1tsax6TJTFNuLdkW4yHkhTj25sp\nUTQDz0lLOPeV9yPTlWrNWVI133IQ0MylvTPBjCHFyyROehqAgYL/DZzVeiGtyRaKRumIBeNG5P1m\nQ0CajKsVmrdyZDXvkczmXiCRM2lVLrfaNbybItmQ9haEJudqbg9ix9CuUDRxELkAMb0as7k7gcdU\nudpsHiDQkTSUIT3Qf8UMBVaZlk1eL2AENJ+hgJbktl+pLZtWOFguUwoICLJBWQ8njwlOpBkt7PuE\nsHBQNH1hyU0sEgzyqQxYq4zyHa6hESZiCrudSHsMYbKsDCQKmnF39PmWCKxqrTWoea8f2EivuoHE\nCc+BLO47GImdr0HekmJ4VoVcYLy1DXNRiFN0iNHVzmPHrmJDYZV/vqPlDeVqvE+2BIpBT7ZAr5sg\nJBCZHdNF/Elw3EzE3+ZQC9aQKBE6rPmad5D8k5YgviB59xT6+MHKn3r3D9CdC0SmOwKKOdSKtuUU\nVCsVGsOagVmOcDGU10LzjLZsY2voAAAgAElEQVRtFi0J0Qc3ULJWamC5gryDEfbGUKM3Bjv2i/dR\nSfnHj5VEPIIuFVBkib5MMWzCdsZzqJwj4zyDWj5vggKSs9/oHYdeSHrjLzlj1p0V42V0T6TdmEZK\nTWLaZkjjP5yIyPvvEAOlQTQzaEoNkETg5Yx55C32u2ZzzSpXSemHMsmHa3IdSZu3bLe/Yn/l8ifX\nZOx+XJU+tqLtE9NI0dI/X/Mrfvly7dz6w4Go3FdjNvfIW5FCVox8yXCehY0yagd7ctVLbwAKhk/e\nUkVErGnZVcQ8FCC/2uQttO6gmX446ANWzCrNO6g51zKbBzX3vOFf39e8S9imgm0qlMxqn3cQtZaa\nxWOyPz6KDtgVS+L00Du6dzAgyFnV01qwv7XKoxYD1oOc5vfX1WpRDE+wcTXvuvxUrFwrshX3rDWu\nEKS2+W4J28bT8mwIERtA0m5Ckm36snl6B4uOUO2/N0nTIe9gwJfzHWtbT8Z2hJu8JvrQmy2FiClh\nC7N60KKU06sF3IGCP0Yll7wHRoMZQzHTDGnDXpayWm4C1zIwVNA9rdUcGIXZN576mMidviWzXRxb\n49sXAl8wsMA/xsoJ8pcSBU/zjiXEOOp7jmVS8ngAslqWtqYkvZli2IK2XUT2DRQDgmhgjLRts5x7\nEGOcSqj+flMV30dIKLTodSL0bSNGb6ZIWhWBevkaY3M4EEWbH4XY1T3ML/68nk9eeiKdLeGi9nm9\nwJeWfoPx9WP5wunX8H+PbOIvL/oaZnACiFVq3g5Db2m+l889bdO89VLv2B/9YS3nzxnPFRccf1B9\nfOCZ7by8c5Dr3z/b+1DveXwz2ZxGLqWBAiDxg3tXsdkpSPCpfzyJyWMasQhr3pWlI7tdf6/zcXUP\nFkKBa3nTn4SeXtPFI8v3ceOHTvMi1otGkUw5S4MzET350h4ef3EPLQ0Jpk9soXNyteb98PKdrHi5\nhxs/dNorWiFcYUOpMJs//uJu9vUXkFI54pM2cHfXBo4ZfQ0dHdNC5xfMvC9WKz7hqopMMbGXb6y4\nN3R8vkLzfu7lbh7ZvAncVNWySV+2xH/+5iVQyySmr6Bo+WSlxk1PAHIzuAXJ+4k1W2nMdHHh3An8\n/qmtDA6VGTfL13SDxPenZ3aI8+MlbC2JpOrsGxxC003iMSUsSOL4CZ3+j++oZ3dvDqV1L08Ul1G2\nXQ1JRBkPFzRu+eXzDAyVuPyiseLWJBnLtugeGgScSHTJH3NbjyPFNJ7esI1ZiX5mTWmrGVQUFCCW\nb9hLYgZItoK2+VSSJz8VIkPN0kgACScVpmzF0S2DsqlVuUJsSwJLDWt5FRpfPpNAaRVBcXv6ZEa1\npukPHJM0WxlmK/Gpqyi91Iytpfz2jJj4B+zoHeCWX66gN1MiMcrC/WpiOLWoy4M8u24/v35sM23j\nstDqdlJEUvfkMlzzvb/S2ZRkz2CWeDNgim9GNtPY2GTKQ7SlWnjmZT8S37+voNbqru0W53em2wG4\nc/09nNwxq3a8i2yKNtzgMMdaUVpzJraewDZU1NE76Fk3FmjwrmFmOmlKNEFJBLt2NI9l3fYBVm7d\nBzEorT4Lu1SHjEJ/UVhrmuvj5J0xHNf/DrYMlGHKWi/4rqkuTlmvuIf++fR0OMl0VIP+vAZpMUZ9\nmRId44UrIK8XSODniT9ciDTvoxD//fs17O7Nc/+S7VX7smUhDe52tJYgcYP/UUE1eXuk5Ex++/rD\n2vdjL4i2Hti6iJuWfjNkuq3EH57ezoZdmdBktvi5Lp5d3+1V7zEsg1Vb+ymUDTI5jXXbhfZkOaTq\nkXeFGd9dJuVOYNv2DbFuh29CzZm+VnnnY2vZ11/w2ga8NchlS5DDLxdtpKsnx+qt/fz2iS0hs7mr\nzdz7xFZ27B9mYFhM/I/tepIvLf1GTbO6YYj+xlQ51Hd3PIPEuLXGWveiJTRZ24gJE51kkUooJOMK\nQx3LveMkXZBVIUBGmmFy2x/XMRQ0dct+pLrascf3IzqIxy3vOXmacUwTpk5bwlSKvLBR+JT/vHQn\nz6zd7yWPUWWVklXh/5QNpJiOrSWxTRXd0ti0W5hcS3p4vNpTbRhogM2oVnE/8amrKdhhbV+KldnX\nX2D7viGyeY2N+4VmO8FJbtJfEM9/zrQOkulAycfhZmxLwlIK/OB3Ivgp6Au1ikIjdCO1g+M1Sj8Z\nu5wS5tsa5OvmsVY1oRWu69/gCUFuxrPyunnYhhr2V1eQt2vilhQD07LpaEqScuSQs8fOo8mY6B3b\n0C76Kak6tiWDrXiad1e+i+37hsgVdU8rbUk0M8Y6EXO4ha58F89u3k6uqLN70DeB1xtCECrbBbbu\nzrJsXbfXx6ljBOm675rrLtm63/fne/0P3CMO8br7zh53BhMbxmNjM1QerhKgZEMoIak5j3uBeZKq\nYVsSdrEejDj67uORZBu5LksqoVK2xHc0a2In582ayrFNk9kwuJlp08XzH8yL91wkh5EYk5jAgN6L\nlMgzujXtPceGRJpPvWc2tiV5yk1bU9IXnB3yTlvtTFBO8PqWLfrfaaGkM7vzJE5sn0FnXTtHAhF5\nH4UYcgJC6pPVfiP7gCUvCJnN3UxIru9VqTRlSbVNzot2Pk5faaAqorkWatbudYSDsiHuo9NZu+ul\nAK2INh9Z8xb34i6BclG2AxOxc7/BW3Ozk2mmVm1Wl02yAZPycCk8ybhc/IctDzJQGqyZdcowLZAN\nCsmuqr7Hj3+exPTnvd+1AuLKtiBDu5T2+pSIKSiyhGwEAn1KQrovB8hINyykZA65MbBGOKC9h9Jo\n6qItJRYgb9MCbKRYmeZEI7aeQIqXqtJn7sntJ6HEmdQwAc0uhd6Vjsni2tZwC5gKyCb5ongPKrN8\n1cXS4n1QjFCZSxcJSbwbUkwjmw8kRTHFxDyrbTqqpNDPDuIxiX+7ZBbHT/K1HqtUJywA8ZJnBXH9\ntbGhiRyfOA2A/ny1sCNZCiCBWaE5O/uTagJFlogPC3J9dt/zvrAqmyT0Nuxio/C31iB/M9sqtErX\nv+1sb29OolllpjQdw+XT30NCrqO8+RQAzjhFVC5D1T2N2y6lMTPtKI0DyM3i25Ad8rzm1I+RUJKY\nvULI2Ws6Firn2zH6RzPJnOe0GXgXnb5ceubxpBMqaOJdcZMDlZx3Ttt6EqUX345VrAuR97hRTh4B\nh/iS8RjHNYtqeHkjXxWVPaV9lD+8DQNCaFUMMGOeddF2+iCpGsm44rXxrxefQjKhcsGkc0UDDb1M\n7KzHkp3+OO/8xLiwcClt+xjVmvb6m5QTzD6uQ5i9nW1j2tJV1gNVkZkxfpTTB92L1bDNGLppM731\nOD520j8TV0Yu9XsoEZH3UQg3pWFDuka6wVcIsAp+YO5yD9eXqCgyECAb+cDZygyrOjBDMzVRLEEO\ntx3KmuWQd8kh77FO3WOXICqXihkBn7duGV6gkrf8JlS9yaZsB5f4iD4G/es9gexfwSUvUqJA6rRH\nWbTjL962XLmCvKtyh9fI8mZaxI9byZ66p9haeDm0T2nuC/2u5VPXKGDbYDlZtSTFEOStSEiaX3fZ\ndLRGzfKfeX+5j8TMZUiq4S83CvrNAwFKdllMikrM94drugmqjiTbNMQbsEsppHiJTD6Q7U6y6Cn2\nMrZuNC1JQSZBa4LWuB3bkjB6JmBbigjGGhSknXeimG1TYczwOYK8Ee9lMmVWvXMtsrOkKFYmGxDS\nipYg77H1Yzix/QSM2BAtHWUkSfKIxb1HW0tCrOwJGG4msobisXTUif5nQwF8jvbs+DhtUw2Rr/ve\nJeQE8ZiCVaynM93OjuwuQd6ShSTbWIZ/vhCgrND5Vq4Fu9jgkYvSIN7rlqYYNjZJ1dHsFckjLtcq\nIyl6IChNwth/DBAonOFcI6UmUWQJa0jYyPP0e+MNYPaOp0FtAFsKPUM1bnrnx2Iylkve5QyWbVGS\nRTu2HgdkQdJObAKAGjP8sUMQn/usl+xZ7uVkd3FSu59tJnHcS6RmPYuk6khmjNaGROBaQEwnHlMo\nODEPKUX0rTMlNN5sOYuqytiyLtwWtqC5MbFjkWwFdew28vWbkRQD25KIqWIcG2PNSIkCclOvqG8e\n8Hm7z8H1a8tNfdhjnRUThloVVHskEJH3UYxaUeFBM26tJQth8hbHFsoOwcmSZ+6CEaIxAyjVIJ4l\ne5fzu81/Iu6UKHQTHlQm+we8oLr6VIzm+rgXqeznwq4OWOst9PmEqRiA7ZVelOoyJE56GjsogDj3\nIAX81K7mDWHylOurE9G4ZnMXlZp0Lf+pYfpJNoaMAye3qWV216WiiLB2J+eA5m1LgQxteYe8bf8e\n9pZ3ICkmetfxGN2TgLDmHXz+linGRFZNz/pSMjSSJz0FQL1aj1VOIUli+ZUXSZ7MY9kWY+pGe+lb\ng9HphprHLtWBkRBaqwQ9Q4Jsi6azpGr/MSQK47wJXU6UeEL/XxLTnwtZB1plx7edyntCK0DRsa40\nJxqZ2ijiMFIteeceAuRddDRvyRcw3ICidCzFqAZB3iUr8Jwd8rZ0550xYk6kcfC9EwlyknGFsm7S\nnmwjbxTIlYre+YYhuwMi/ne/rQpSsDVBCLGJGyFWoqlRXNclJUWWQBcEVrByoh+q7hEj+OlTvWVy\nAdO+KsvYWgoZGSvmm91BmHx1A5JyGjk9jNK2l8TMZ5AdQSKlJokpMmbJ1byz/HHrwxjNwuftWg2E\ni8dGaXfW5scCPnkH7rNetm8Fd738W4KYN2Yu7516iX8/ySFQdFRJCEiiLdcXrpGIyRSMIkkl4WXO\na3LqqWfKQyKHhaO5e5kiTJVYYTSSbLFOe1ospzNVYoo4/8KxFyFJoI7ewdi2tDf/ueMcU2XqnMC7\n2Litfl/N2EEVHDrUiMj7KEN/IUPylCeQW/bXXCccJCOj1gtVg7xdYrVtO+QTr6V5u/5qqC7WAHjL\nJJTGAZANr22fvG1PA3LN5om4Qkdziv6hEoZp+ZHyjoYeLIPZEyBeSRZtuZp3/NhVyE7mLjcgxtMw\nAm0EyTtoqQhOhi4KevgeNcMK+fprJWUIErxsB9s8sLAFIg7AUHJYpTS25ZyrGCTiCrIsYztadHnT\nqZ42plt+H12t08o3CpO1c76LEHk7yT0kxcS0bAzTIqsNeoFCti152rmUKLK/wmffnGyixZkw3XSu\nSBaWpHt+WDdCtzebc8bL0byNGLph0eFoS7HxIpuZXJ8N9bfDnoptg9wUWAoGDNvid0uiBVsT10qk\nxHlFowSWjLZtltBuXXOrI2C4qTjr4knGNAvhQx29E6VjlzceAGXNyXtQSiPJlne+uz+pCGIp6xYt\nSeH3HigP+uTtnO8SnEsGleZYs38Mck6YY+Vkgfp6cZ6vecvYegJsWJ9ZizJqJ5IEiuW7GWwthW0H\nnoOsE1fiKLLiLAGVSMuNSIkCx09o8oPLzBjDeY3ZTWcgqQbxY1cj1w1jJ7NOH5IidqMk+rJjaCeP\n7XrSfxCGK4CIMY5PWRsao+A35Uac10JKTXJK58zQNkm2ScpJLzmPa2lQO/YwNGoJBb1ISvXT5SbV\nBEklSaacJaZIQrM2VRJxcb5hWpT2jQ9dwzZjnvvw2JaJIj4hVqalIeG9h+51Fdm3HoTa0OO159rD\njIi8jzI8vnOZSBRw3Es10xAGyeBHf1hbtT84eWu2+LusmTy8fKfIchUMOlGq28+X/PaD5kkXwQpS\nUsqv4OOlHJRsQbrgVeJJxAR52zbc9sd1dGcdE6ZD8oWSwc/+tI7t+4ZYvlVIvHaAmAbcDGSBicI1\nobkfYFk3uOuRDdz57OPszPjpX3/1WKAkZg0ff7DYA8AdD7/MMxt2+PsDWt7zG3q4/+ltXoY1gBUv\nB7JG1bBkDBULWJbN/zz0Mt/9zUrW7u0CyRZBOs49SorQvFVZwpZ1UnIdVqbTm1SCfmQ3iMc2Yz75\nB4Uwl0D2noDpaeZim6abFAMa6JT6qb5Glyiw28ls5b5DxbzM48sdM6yreTt+U9fE6fZxy0AXS9fu\n8zVcI0ZXT47lS2JYxXqkej/gUJIgZTfz2TmfADOOlWsRVhGnbbmhnyFpPzNaj6cp0YBWEtey4jl+\nt/lPDJYzKHoDZt94QAqR992PbWbzfiG8NSTqGN/a4l03Ptl5FxzXgusxsYvChy4lnVgKR4BKqkkS\nMZmybnoWiKyWDWhsYdLxfMoBUhjVmgYkFIe8pXiJtMNHSVX0W5Yc06/zPONOXeykEqg1YMvCwuCQ\ntyXrYc0dyA6oSDGd3WN+i5zyg62GCjpnj52HbVbTQVJJEFcV8jlQibNreE9ovyuY6Hum+hslv5JY\ncL16LeKb0Xo8lxx7MZIk0ZRoqNqfjqX9zHqm6iXsKSf3M1jOkI6F6y00JxrZm9/PQMOLYi4zVS+W\n4p7Ht1AeaGFy9p3e8dZwi5dpsi6pYhtxJFVHVWTf8uhp3lLoHqxcE+WNp4KexDBtNnVl+PkD6w95\nIaeREJH3UQYvwtGSR9C8fXJdvbWvan9wOYcZINpHV3RhmFY4KrZm1R0/GKyW5h3URCXVr9Lk19AN\nrNX1yFtm1mThk3txU6+fwcohnadW7eXZ9d3c8svneXHnDsDRLBHLSTzLgeFrIp3pDq8PAC9s7OXJ\nrmdZXlgUShm5bmcgM1egb66/2fXLu9jTm+dXj/vJN4JLjH58/1r+9MyOUCYwSxJ/nzdnfDga10Hf\ncJ6+TJElq/exbscgK7YJ4cQq1vtai2z4ZnPZICY596m7+Zb9PnqWF1MNkH+15l3YO9Zr31svr1ue\nyVnfNY2J6SkhzXt3r/PsHRLq2qvRtdsJ7nK1UvcenWdh9ApNRx23lb+8sMeL5G9K1lPWTdZvz2L2\nj64al3pzFJObJqEbFtZwC5IEckoIdW5ynrdPOFvcS05MY7uNjSLeApDxScMn7yKPPt/FcKmIbcP0\n8e00pfzJ2DYVLjhtgvfeubGK9YrQqpV0nqb6uDdeKVVohZpmehaIgVLGF5Zcn7kTeCincsSOXYXa\nIVZttNbVMXl0g9NH8b6NHS15edBd8vVQIfx1NjaGfttaCjlRIjZlFSYaKZf8HfK2HdO7jS0sHDbU\nxVNccf5xjG2vJ2ZWL29SZMVZlSJR6vOjqMubT0HvOs57zu3pZibEnWWkqk5RFXPP5I4OLxVqXQ3N\n+x+mLOSCSW8T/ayxfGx0U6OnOYPvv3aRVivJ2zGdpzYiyRa2odJQkRBmzqQp3t9mpsPLQJlMqCTk\nJHJcFwKPa4Gq4bcHMHomYGU7aUzHMEyLp1ftZdm6/fRnj0x+84i8jzJ4ZGHEvIQQQYQCoCo0ye99\n8kzGjvJfZMM2mDymkUmjGiiUDQzDqliPWi0cZEv+MqNaPu9g6khJMTzy9uoDy7XIW+GMmaOZM80h\nXOe6biajYPUeOT0szLmOyTc4oU0d3eH9PaqCvLsHi6GsX36DZu2/HXPgLv1lnt+/MnxOILCndi7j\ngHnc6d+0Cc186rJpVUdqZjkkhA3oTgnIYr0n8UuqCNBRFAkUHQXXzxj0AYoJzrUE2IbqTToN9YHP\nXNHF0idLEe3bEpYsyLikGb7mbsQo66YnxMjJgle8xBUWTC0WIka3L+75AP/90X9gUuMElPpBhu0e\n9sbEWH78nbO5/v2zAbDyTVXjIluOS8AwPdLxVg441291TNXZrF1V79pSfWuEbz1wBQwDhRjzZo5B\nkiTU3aeKZUKKiT12LRPGiOuVyqKm9LX/cBYA8+c1M3/2uEAwWIJETMEGGmMueQ/6AW+u5u1o7kpr\nN2rbPuQ6IYTc8E9v83IPWI5PefrxKe+7cjVvL6eMHo7GnzVugvf3tz8+jwmdghzV9n0YUtkjb1fz\ndp+Vi3Qsxa1Xn8Ox45qIqTIzx4r2bEOl05jBW8ecDvhLSs0+EX9QrzZgDY5G6fdzPnz742+lNS2+\nydiY7QzYuzm+ZSpffN+5/MvFM4BqzbshVs/ExrAZ28qFBZJxLS2hnPZSxZyUrqHNB2FrqVBFwpnH\ntPD2U8czPjlZXG+oDdW5P1mSOG5MBzYW31/zQz/4L0DeDTFfwDH7x5KMKzTVJzBMS9R4l6Ct6dBU\nX3wlROR9lMElC3dyrUTIh1rxosdUORSJbdg6MVUmnVTRdKeggHJgzXsoQN7lGpp3KFtWgLx9zdvv\nk0fejmTtfaTudR3ydgPzpGQOuW4IK9vmE1egv2ogAdKYOmfpiUMm/dlSyKzuJVEIEHZQcLED2ZTu\nWH936B6DwVmVZnWlfTdKu798zG1TUYTJOwjbktCscMrUYVMEuNmlOo+0pFiZZFxBloVA4+ZxxlKw\nLRkppnnpJstWwIXg3INhB56pE8ErGEEiIdWhSeKZarqFZrmFMWLifdATyCjIyYDP2yFRvaSCGcM2\nFc9c6xKrazaPqTJtyRaQID9KVGhS+o9lcvNEjxRcK0pojB0TsW5YHmkpDYNI8aInILgTaX8mXPEL\nwFQC5K255F1EqsuKwCzbJ8J0cRJG9zEAPLN/GT3SJm98VUVmVLodWZJZ2buGPnmrFxOQiiW9dzet\nOMVB9Kz/jjvjbzlL+jwyQBBZc6LJS0lslcWzzpQy3mqKSh+xvP2tnNJxovd7Zsdx3t8xVWFWy6zQ\n8UmPvMU4G/smc0LsLG99eqXw3ZRwnoNkc4w1jytnvFcc5xatGWrj7JaFvHvM+wHoqCCphrgjPIze\nSVxOcOnUd4b2B8n7golv4wunX0MlyhtOp7R2nve7M91BIjYyTbkCigvTDs95drE+RN5u8NtFoy6l\n+OLbwYyhBoJZ61RxDz3FXuRkwUmyI85RFZn6eB0fPuFyJmbeAbZMQzqGqsjohk1vtkRrQ7KqENTh\nQkTebzDolsEL3atGXPJVMv3JtbbPO1A4voJ8Y6rirSEGQLZEBKVTYSlb0MKm3Rqad1+gwENNzTto\nNlcM8k4ke9Eh76CJ17CdJTfOB5WIK0h1Wc8n7loO3OVZSpvwVZt943yTslJtogY/o5N7P2U9LJgk\nqHP6GPQHB9ZDl/2JqXISDd5DZWrP+JS1xKesCbTpkLcsoVNhTrNUdFsLZR3TLFdzjnvFFKR4mURM\n8QOeLH+JkK3HQfXJW7c1Z3mM4pnNTcIJQkLEJTWiSQWQRIpUL3LdUB3BSyItNSKlh4jPWYSUzHkC\nUbkozKlWoQEplUNp24OccM+Pe/fdFHdIIZHHzHQwpnwasiT7BXMMv7b4jBZhnVBNJ5LesDxBQB29\nk+QpTwrLhy15/s7eTMl7Z+aOOhWAMaXT/HE2Y9iGitLc65fRDFilEjEFK+dr/xa+5qwqMnElzsJj\nzmNYy/F88REUZy11OuYHU6WoR5UUitKQJxDalkIqoYAR9zK9uVAlX5sDMHSFhBJnsJxlq5NCdErT\nJIKQyg28a8qF3u+JjX5lrrgqc+boed56cPDJ0hUQsFSmpWbzDqeNyY1+8hcARXIEViksCfnr6yVa\n9anETfE8O5rDxOmSN8C5o89hQsPY0H41UBP+H45d6AsLQVgqdqGJy6dcwTWzP8bcUbOrqskFMXfU\n7NDv9x1/ifftg1jn71aQA0g6wlZSjXvvnRog2/p4WJMXFj4xfm5g2+mjT0XVxfuSTsaIKRKGKQJn\nK8fkcCIi7zcYFu94nP9Z93/cv/Xhmvu9IDFbqql5B3OaV5KvLNuUrTC5xlWZdDIG2BhNO/xoVaiK\nNpcb+1jc/cfqvgQQIrMKn7c6ertXHxfAQiz18j5OtRSuUSyHydsNGDKHW3yTskNoqiJj4pN3a7KV\nhJIIpYN1NSZzqIVOyzFhBwQc19xp5ZrQd87wtlea+4Jthgs01Fiap7h542VPsDGHm/nQ8R/ANhVM\nWw/lHNesssiFbckhzTsekz1BwF0/DIARR1I16p01/yaaFyTkBqwFyRtVJyb5ZFkvi0lIbhhk7eAa\nj7xtM0beqaLVoDoR5bKN2tnlkVCx4JiF841IEsSPXYM6QQRTeZYRSQpN0ma2jQ4nKU88oFG17lnI\nl97yWT4y/QOUN84hVRJJRXTDCsUyiPHQUOwEsiRjWlaoZOqxzZP477d/i3GcGDonKIyBsxzPQTyu\nYA2OQt80h45Um3+Q5QsYFx9zPh+Y8T5/V66JZEz1a0obNu2pdqz4MI0Nzn2ZKumEeBZuJjcXHzxB\ntOUSgmHatCRb6C8OsGlwK82JJi8ILvhadaTamdQwgYXHnIcs++MXU2WSCVUkxnFwaufJ4hoBzTKm\nysyfcDafnfMJPjDjn0J9mtYqgs7M3rApOzPsv++9mZIXhNpWURmsMemblMc3d3IgjFQO2MVJ7TM4\nrmUKkiQRj/vvu7Z9JlY5yWzlHXz5LZ9leutxofPG1o/m+jmf8n7bxbqQ5u0+r2CKYzVQddHVvF2Y\nw63e30GN2nUDphNqiPzbm4+MyRwi8j6ieOz5Lrp6wqkphwoaDyz1g5y2O8kLdgyJZSuPrOjinsc3\ne8uhvCQNstCUlq3bz12PbOS3T2xhqKCFfd4V5Js3CuGkIgHNW27qJT55HWpnIA96KEDGJhYo4wjV\nAWuWbYfK5EmKEYo2V8eJ7E5mthUz60ySkuV9nHvl1aH2RE1rvw61p7kYcZ/YHD92XVL1lr5JG9/G\njq4yacXPmAR45KdvO9ExHVdq3k7U9daTwIxTeukcZDPBkBZ+ZkENasPgZm596o/c+9ctxBM1stsF\nNG+3kIax5zjmjj0RLAXN0vnLCr82s47uCCaSuE9bglhZWCVc4UMPrO/V40iK5UUoIxu+VcJdR+ya\n6yUTSbZIyP6k26AKv3HsmHX8pe8BhmNOX4yY9+wanWPASTiiashWjGLJoqUhUdNnbet+bEWQvO1S\n2iPvYKnaJI2MruskEVOxsh1YTlSxHtC8XcjJAoolnv/9T2/HtGxithCwOlLtSJJUpa25goxVrMPM\ntnGccoa3TxwrYWY7mBYkA0vxtFZJkpg35jRa4+K9NbonElMV7zovbupF1uqRFJN0Y9k737VqeX57\nQNtyMjNahb/YNWnbNqT0jnwAACAASURBVExrOZaSWaZgFJnaPLlm8Q5FVvjc3E/xrikLKrZLwrxs\nJLC1BAoqJ7YLAVRRwiQPMLlpkhfU6eLE9hOIbTsXfdf00PZgVbvebJGCM1e11CdCxzUHyLsj3Uot\n/MeZX+Rrb72h5r4g4oHnlwz8bfZOoLzqbYxPTmZUXW0BIRiBbpdTNc3mSlCgCZJ3haBuF/0I+CDJ\nu0pJXSoW2t4RkfffH/b05vj1Y5u56X+eC23/5cMb+MNT2/ijk6fc9dmokkJfpshv/rKZxc+JZTa6\nqfumV4e871y0kcdf3MOi5bt4fkNPyOddGdzh1om2yk6QkWx6Pu9ahelD/uBE0VtD7aLSbL5++wAl\no+RPtorOcN5J0lLWQRITsbb5VH8Nsmx6H+cwvdiWRGnVOZhZ5+OXA+StaiKBhy37EbxOn+rTMXRL\nx9bjFLJJfvC71aTUdCi5hr+EJ4ahy1X36Js7nWIMRh2y1iisCZIFWMSnP4fS0uMtWQHYYDzDw8/u\n8pa+ubBtKeTzdjNC2UYMWZbEGnDZ4PHnffK2EMk3Jo6qByTQ48LnHTCbG5r/2bpaaSJtihSwiu6T\ntvMcOjqcNe+uoBPQLprjzc44OlYBxe+jm9K0PuaTr6Tqjt88TqFk0NaYrE3eAW3ZM5sjfPluLedY\nYFJWHRLz/LPOulndsJDNMEkAyGaSkmbw4DIh7F7UcQVXTv8nprUI7TFoKhX37rgjSmm0jXOZnvTN\n6u77Z9swJu2n6cSWQxM7wPunXIm2YwZm/1hiqkzKuc79T29n5y4np3Z6nTjdUkgnVc49ZayXZAXC\na59PmSpMvP9w5jGcMdrv07njzwx0vur2PbQ42cckSfJIpLT2TK4c93FPuw1mF4yrI5ugAS47ay7Y\nMmec4I/DhXP9wLjeTNEjrvGdgqynTxTvUFOAvNuStcm7OdHkrYmvhfEd4t0MCl+1zOYH8oMDTJFP\nQ983GZBFeteKtoKat+dWIOxDNzPtmAP+OAQ17wWnC5fD204ZGyJvNxvckUBUVewIoViuvfZv/4CY\nLF3Tn0veiqzSE8gnPVzQ6Q/UL0a2KGtmyHReLBuUpaDmHSbkYUeDtMtpSJRANompCnXJcO7lifHj\n2aVtCpO/G6S07xhGG7Pon/DnqoC1fFlDUkyxbjemISkG/UMlLMumZJSRZJvpbZP5+GfO56tPbKef\nHpAt74PSEZnF7HLaXx8qW77ZPKb564fLKSRkSAhLREdTim5LCwWaqXZSRKzLplgj6yWmUCmXbVER\nyCHsKWMbSU2qZ1sOT7CoS6pCY0oBqoYkWSL5DAifbkX0uhtjEJfjDL94BvETlgc0b9kb/1s+IqKX\nZTuGpYhc4u4MLSkGthHnuHHNfPby2Vz/2FKkZE5MHE77Wjkwm7sJJGI6LQ0xioqF5Wb0slQScoJU\nyuCmj8zllj8+AAjTopsfqjXRChUp6t1o9JyjeadUn4ileAlUDb2QwrJt0kmVn3ziHfz48TgbSi+i\nNGRoUBspBsgqpHlrqZqatxfxK0tIkh+kqBsWsRqEI5kJT7iYPKaBK+efSl+fbyFprwim0ndNI3Hc\nS+h7BbknAqbYoJY3OqTNSSGTKMC4pk7MHuGLjqkyHQHTsV1hGscU39aHFkyjdetuFu0Sgkaw1vak\n0Q386NpzhGUFeNv4M2lPtYX93QcoV/Ctj83zAh49Td2Ih4g0SE6V91OJd501hVMmt4a01ffNn8q7\nz5rMt3+9kr39ec+d0tqQ4NZrziYVF8cGY0Nqrek+GHz5I3PRdCtErkGzubftAH5wgOPUuazrEgpR\nkLxdn/dImncwT4W2KRA3QVggPPeUsZw+YxTppMpTq/wA1WT8yFFqpHkfIVg1UpUCVaYxw3KLhMih\nYhCFkkFf0c/JLSt+SktX8ivr1oE1b6fghuf/U0zH5+1XParPT+WczvnO/mCqVD/pQn3MCc6p8Hkv\nyQo/va0lhd9WFVWSBoZLXtnIpkQ9qiLTXu8kvohp3sdZtovexGa7NZklV/O2QdWwveAmmTq5Ednx\ng7c3J0WQn+l/1JYernYkMi4JE2nZ4V3h47dJxhUM1zfsCACphIrlraUuh0zwthFnsiYKIXjJLZzx\nPnPs6SiWWKftEroiS2TKQ0hIjKpvce6gVhIVE0yVeFymPhUThUEUC1s2sJVgoJjTDyeoTZcLtLW4\n5nKfHBpiDWS1Idqakshp8fyntvqaVGstDckQZnt3kp7deiqzW+eIcUgNI8m2l9WsLqkSjym0SZPR\nd8yEgQl8aMpVBNXFUGCSLdf0eYeIXJFFwiBElbRa0buSmfDM+lPGNFV9R5WBQ9bgaN7ffg22YyUI\nam5BIvdWKQT6EkRdYAKPq3LIxxn0N4MTsJZUkSSJ9nTAOmGE1x2nEiqyJCFJEv90/Lt5+4Szqu53\nJKiKHCLaWvcUJKr4K5C3JElV7cnOto7mJLphsddZMphOxqhLxjyiDa7jrmXyPxioilxlNamteR+Y\nvINCyiuZzYPHnth+AnVqmium/2NVm3WBQlCSJHn9DL67ifiRo9SIvI8QauUZr7Xd07wlhd6MT475\nUrXm7aIuJV4iTTdDRSrCmrfN5sw28Zdjcg6bzZ3lN6UpXuIKKWg293Ihq6QSKkk1GdK8dctgW2GD\nc7AFZswj/L5Myat85Urnk5pEQJJclyURU9AtQ0RKuyZ3JxmDu9YbVRdm4YD/s0FpEfV3FZ2OppQg\n74DmrZXcQLhAZivHZFly5CK1bT9Kx26x3MPSPHJXFYlEXMEoOwJATAv5usHGGhjDhORkp9604Res\nUBNiQgsUtFBkiaw2RH28zsvFbLqme1dIkiyRWMJUfFOuQ85lCuiKIF+9GCAMxyeXNftobnL8pwGz\nbGO8kbxeYG+xy8vHPGOUr9U1JeqIyeIeZdstpOFkbnPMo+lEgg/NfC9WOemZ122nIlk66QpbNnax\nAXXvKbSkwlHESSXBuPix6LunoioyTfV+JLoLNaAdKrLkFXoQmncN8jYSnvm2crKH2r7HukCyjrBZ\n1m+/MR7O8hXsl/gd9h8Hr2NraYrPBXzRAZ93Q9zXhG0zTN6viFfBg0HNVKkIbHu1cO91pxO3U1dJ\nskqck9pnVvnjXytqEXWlUHWg/alEtQl+pIC1hng93z7nZs4c+5aqNmu9ZxAm/1cSKg4lIvI+QhiB\nu6vglsNUZYW+rK9590gb+O2m+/0DA8TqlgYtaWaIUIOat9zcy7J9ItLbKjnLpGJlYorsmM2dNddy\niua0Y/JSqoO9MEVO6sZ4AwOlQTQnA1le9zOvGb0TsE0VJSau35sp8v/Yu/P4qMqzf/yfs81MJpls\nkAAJ+yabICgo4i5Qt69WWxUXcKlaRVu1daFUpbUPuFT9Wbva1trqQ12hllddeLpp1YLWlcUVtAjI\nkkD2zHaW3x9nmXMmM5mQZCYZ5vP+h8xkZnLmJMx1rvu+7uuOWevL7eA9uXqMeVyl+/CrDx7Gqzv+\nbZ6npJ7Y/knrzKBmNUZxFy/ZhVRCoB0DyvxQDc0zbG533rKDrjkkbZ6rcLsrWJTvRWNwM3a173Iy\nd0kU4VckaBFX8PZUrsdR3xSBz96yUo45vxO/5IMkCOb6Z8mcKxdFc7cjuwMUAGiqfYFiPq+4OLGk\nx+nnbL3fiNaGmGiNnESCieYeVrOaFr0egZB1fK7gXR4wA+nKj58xHx8NYGBxYs4x4JfNoXMAJdoQ\nCLGg01TEzmz9PslsEqO5A5V5UWF/gNt75IiC0GGeWBAEnFJ9DtQvx6KqPODMwbqzMzkp8/YOm4uY\nV3YhYlumIbLpaMS3j4PSNNK5uEgOIgBQXtJx7tGdOaWbUxUEAQtGXYzohzM7HFcyRRZR2mFnP8HZ\nIcuI+Z2LG89FQYoe+r3NfUHiHjbvjeAdjWnmErqkQCUIAr459RKcMvLkbv+MVFLOb2e4oHFfdLmH\nsv0phs2TL9DSCaYY4QAS9RoAg/dByT1s/t6n9dANsxfuftd2llu/bEJMtTJcw0BdY9jJAPeXJ/aA\n1ttLrAIq8zXNDy8De/WtqI/sT/xQV/C1h5dlQYbeWG0uMSpuhqJ4h819QgClwSLo4SDEUKPzGu7M\ne2d9G0q1oYjpcTyx/jWs/2C3s7etumeY9fqJIri6pjBiVqFdsbWOcnRlrbmOdsBufNGyHau2/MU8\nUCt428PmghL3NOZwF0KVyFaLVCWC0lJ7eU7iP09Ts13oZm1VKCUqsdvaBGCH+SErVdShrug980nW\n/2NZMiuW7QsdqWq7N/OW4tjXHIEMa3jWmuMHzEzTzLytD3YljrgRQVxXUe4aQraXfCnDzKYgpSF7\nIwvZmUqwq5TbjVZExCYYutnD2mn5GPfDiPuwH19ik/oP8xQ0JqqI7S077SmX2JbEOmDAXPs/sMgM\n3pE2H9o3HO08xh42d9bhC4lhUfu47OBk/32bc9YdPwztAJuuGtcdJCVR8BSs+WQRgwODoe0fAqO9\nFOquMdBVn7MbXjDFvvbuzMo5Blfm7Q48/qQ51YmV46C3mFXlyRciboospXyvV0y5GPH35gGaL2Xm\nndziMxv8nmJAd/DufnAZ6JqKSHXBlC2ZsuxMz3FPz9gXAuky784Up/g7AwBZTrwWg/dByJ15P7Rq\nA155dyfuXvmOp9HK8sfeRn2zOTcc0+PY1xxFZWkAJQEFQsRd9GP9J7IztiIFUtUObCsyd/s5acDp\nAOBtB2oF4UvGLwIMEXpbGUR/GIZoNfiQzbaZPsmHoF+GVl8LQdQhVe72PB+agoaWKN57y/xD/vN7\nr+PXaz7Axm3m4+zgamgydMEMmvubo4gLVvC2Mm9REJ0Mz3OeUvTrdg9ZuzPvErnEeZ8lQSvwuTJv\nLe7q2Caaeyy7s57wl0M9VePun2suvZGgt1ZgQukkSKFGSFWJZXSxz6bCMIC4nZl7Mm+/uYey9f7E\nYDPaNHsLy0TWO7bYWspTuQcQVZSUJC5A7A+BgUHz8a/sfx5hcb815SGgusIOgmaTFA0xRIw2xHeM\nxfTBiTXqdvAGzPXtRpv5evaccNAvozpoBqq2Fsks7LOCS0u7+Tu3i3wG+BJroO3gbVfXjq01f85h\nYwc6owLuoFhZGoAAYGhVx9854B16lCUhkXlrZuadnDFqmp5YrpNuODPpQzlV4RLQ8QPXPUfaWYGX\nnbFVliay/AGl5haVMsy/02CK4J343XXN6CHm//3Dxg3M8MhERul+T6LYu5k3kH4IORvsn+WTRQyz\nKtyryjpvhuK+6PLMSSuJkbVU3+/KcSSTPXPeuQverDbPESczKWmAPPQTfLjTu7ymTdgLsbzOmeON\najFEYqq5jlY30KaJEACE6meiWbaWFllV1MGA7GzacP74ryKyx9zooXaIjMtPnAXdMPBKXQPW7/3M\nyXy11lKIZXUIi/sQ9I81s1PV3NtWFAV8e958/PKjTzF1qoh3/55ocFLiC6IZifWPdr/o3U1WW0+7\nGMfOOiUVMVWHjihEeCtSJ9YOxsdNiZaR5vPNDz17pACAOd/tt+daXQ1GrOB95IwifN76mfVzXWug\nnUYuGgTr/BiajOKAbA25CmZBmL9jsxnJNSw4o2w2Pmr+wNmJKbLhWHO/agDtLQJQYgV915y3JEWc\nJVRicTNaVfPnuzPvb596PJa/vBX75E8hKDFUlgWxwzpGe8574UmH4uebXI1rrO5XQwYU47yTxiIU\n9OGdLwdgXcM/UOoL4ZRDvo5h1SFcEDYb5OwT/us89fCRI3HWCWbryTsunYn9zebWh9VNZlCwh8JP\nnjEUf39nBzTdQHFAdoYd506agj98bG7KcsnJhyEkDMCU0WbWfszUIRhUUYTRNWaf7GWXzkSFK6hV\nlRfh9kuPwODK1FXInjlvSUQsrsEwDKfaPDljVDU9MSef5kP1vmuPxusbduGZl825fk+Rmiu4JQc0\nd5CXU2Tw9187B63huJN1L7t0JprazIs+ewcrO4ja2Zq7u9hti7xVzJnMnjIYA8sCGF2ToiNZkvsW\nH43G1ljSnHfXC9Y6M6DU3BfdMHIbvAM+GcsunYnykB+KJODLfe2oTXMRaHNfdCkpRlnSFay53X/t\nHLz18V488Tdzu9p0GXqqi4NcYPDOEbswTSzdB6m0Ac2tewAkrh63la6Fuyg3psUQi5vLqEQB2Cuo\nKJaLIDQMhVi1y1xcJOowAAQUGYJsZn2TB0zA3z7ZZ3bv8oedtZhavVWQ5jOvnu250jZhPwRBgCDH\nYaiJhgMTBtdC/FhE3JpntTPvimAJmvfHzbXWmuQUpe1vbzH/muxqcSdwxs0GNIpVze5aQlJRFAK8\nsdvJrGPbJiIweb35GnIMorVlpN6ayFxDivke3t3/Nt7d/7Z5pyvzdobQ5Rj8483vG9EihII+TB0z\nEOs273aCfak+BOMGV2P9f8zzJEuCk50FjUrobaUQi5s9xwgATc0wg7ccd4oI/ZIPoiA4PbvF4ia0\nqOZzy1xz3n6fhOpQGfaFzfqD8jLRXLalJ4bNq0PeavD4DrOJSHFAdrLYE8dNw4mY5nmcX5FQWQpU\nxkc79w0vH4RqK3sqtiqFAWDW4MPx7KsfQ9s/BJNHVngyQ3e2NW5AotBt+qihngsxURBwyPBEtfWI\nwR23dxw5OH3w6ThsbjhD58mZt98nQdONRJerNMOZpUGf50PeHdB8nmHljnP0smQeQ6oP9oqQ31lf\nDQChoA+hoLeRjP1+3BcCK+bcDkkUUaIcWMFa8rntTFmJH2VJ8/2pmrR0hyyJqAz5sa85mnYIOVvc\nf0/2KE9n5DRLwVIWrKW4QAPM3/PIFH/Hydw1BRw2Pwg5w+ZWT+WInmKHKxd7yZdfkcwPJ1GDIvoQ\njWuQkpYYybIASUlsU1jXGIERC6BdS6x7tVtzhvzmB669WUMM7TAMwwrePkSsHbxkUUaFvxx14X0Q\ny/dCHmAPi7u2WlQVZ/mUvduYMyft7GGsojUchVhsZuYhV+FOyrWg9rB7WzmiH1vLk5QoxFAj9EgR\nEHd1B/OluPp27fhlX0BIFXsgKHFIrYOh7jSDn7OUyPp9+EQ/Lp9yEcT95m5DdsEaYFZdq9aOSoYu\neLL7BmsBgFi6z5mXtzd+QDwAI+aHEGzGjlZzyL0maSlSib1LkRxDaYld7Z0YNncXOk0UToTeYI6q\ndDXzce+6lG7tbUD2I/blKECXMLC8KG27R3exXbHcvXW86aQqWItZ65cVyRu8Az4JqmZkHDYHktY4\np8mQUs2P2wVv7u1dD4Q9kuD+PZX5Qx365OdCb2XeQOJiLpeZd3d4Mu8U1eBdybzNx2U+X+6Lg1R/\nS9nC4J0jTsGatYGCs/uTLWlLQ7tHud9ndmkSJA2KYG4Dam9q4ARvSXSGtQNyAPWNYQhqAO1qO1Td\nvD+shiEKIoKKGbTsIdKI0WbuYiQYgKZ4CuiqigagOdYC//h3nPvawq41yZriFGm1WNXmRofMW0Wj\n8hnE4hZUxMd4AkiqYOLeYcoZQg81QJDjHdbRFrn28lWsYUnPPLp1DGKRtca8bZIzn+tklFa2LMIq\nHrP+I8qS4BS6tEfi0PbVmIFb9cFd6mq0l0JvLYNUXg+p2mxp65f8zsWa3h6C6I9g475NKJKLMCxU\n63kPIcVV+e/XneO2P2R8kmvNtpgYdTiQzOeiCeeiSC7C5AET0j4mZm0vW1bs82Qi7vXSgiDg4gnn\n4uyxp3d7HW86SoqlYnbzEZ8ieoJOQJGg6ZmHzYH0WZV7Pa6U4jF2Zt3Y0vlFdjr2h36uM9RUpG4U\nZ6VjX8wV+/v+fXXGezHoyox9nS8VS9aF2J2x8U22MHhnQTiqoqXduyuY0yXMyvTiRlJ3LtVbgOEE\nb8Xa9UtUAV1GLK5Bttbl2vPjih28NRkCBNQ1heEXzMBoN2Zpj4dRJAcgSaK5VCfuh2EIaFWb8ceP\nVgEAtP2DPB9WA4OuTRosEVenOMOqKPcrIiL2Bh3OnLcVOBUVMdnMumvh3bIwOXifNOxYs2DKZjVZ\nEcvMPa6T23C650EXTVqAG2dcA3XXqMTx6e75bxHlYiLrtT+c7WAfMMzXtocYJUl0/qO3RVRA9SH+\n3ymIb0/sya3IImCIiG01h6ztna38kh/2SgB7HXZYi2B8+egOGzKU+q3aASXmbCBiaHLK5TElUuLi\npegAMp+ja2bivuN+2GlbSltxkeL5MEquDp9dMxNzhx/f5Z/dZUnLxlQtfebt90nQNHPY3C4sTCdd\n5uS+v7Pg3dDazeCdIvPuK+5h855edOVL5q2kec+pMu/OCtbc2/Wm09MLou5i8M6C6x96Ddc/9Jrn\nPrt6Fk7w9gZ3Q9CgR4LmGlZRcTbZ8CsSSopkCJKOPfti5iYMgt061GroIgnOhhRtERXhqIZia3/h\npqg51xpWwwhamar5QWj2zd7eth0fNXyKQfIIaPW1GDwgEVA9OyxZhg8yX3dAqd8pShs62Oc0QnEy\nb6dtpwpVtIbsFe/8kXsI8VuHXdlh/99xQ8xWlfb/PfcmAYD3P+DQkhqMLR8FGCnmvGF1RLOqdodV\nlzgZROyzqYhvH4ehhrk8ys4AZVFwisbs4VmtvhbaPnP4fNSQUqdHtxEtcvrFA4Bf9jkdLY32xEhA\nqsy3LJAI3s7fhC55AtL/G30Kjhx8OAJS9pbq2PPcQyqDnkrsQTnaaMHdrEiWBOiG4azEUBTJO2yu\nmHPebRHVHJXqJCBJXVjDW2o1jXFn92NqzIu5IQO6N8ztVyQU+eU++2B3S3Vx0l2DrL+T0mJfhkf2\nrXS/d/v3ka63eTK7F7zYyd9YV9eJ97b+ffmUp+xCG8MwnA+WRPA2/9XQMXhDC0DdNQbV46LYGfkC\nsLbLnDV5IF54C04wcipXreCtSCIgxWFEfE5L1XJ/KRoANESbMArmnLddLFUe8mPP/nZz/tgXRZEc\nwHdnX463yxow3bUcZXz5mMTxqTK+MuJknHTUZLz7aT3iqo6nt5hrzysrJWyPxc1kU1NwzNQhEMsF\n/CeyCZKswfBFYBhCh0zbfXt8xRgIgoAbzp0Gnyxi9/52zJo4CDe/9ifnfert3jluWRKwaOL52NL4\neYcLjbISH5paDRiGGfyrS8px+lEjURr0YfaUwSgOKLj27EPx8z9thLprDIQae7g8kXnbRU32nuTj\nhpbhzGNGYV9TBDPGV+FnqzbA3GFcgN40EKK1I5tn2Nx1wTFj0FQkq7CaqMiDvsCGfebPOe+YyZ6i\no1NGmu1qX3rjC+e+AaW9u2/wLRdMx0dfNGDK6AGIxTV8/YQxB1Qo1VPupZR2sLH/litCfs8oix3I\nW9oT+5ink/yhe+uF0xGNe7OpMTVluPTUCc4GGwBw8hFD4fdJmDHeu/NWV10wd5wzrN/XejN4H35I\nFRbOH4+jJg/utdfMhuRs+vuLDkdTa+Iz1/130dn5GTE4hEtPnYBDhqcfteqrCzQG7yxStcSmCppm\nz3mbHxya4A3eEPREYxIkdtzy+yRnq0l7DbOdeTt7RUsCdCEOQwtiZ521UUdxJT4PA/sjDeZuZLrq\nZN5V5QEzeFsXEjXFQ1CsFOG4ad4sa3jpUAwOVmN3+15EP5yFY4+ag1DQh+Om1WD9B4lK7WDQgICw\ntbm9gLOPHY09cRn/ec8cNheUKBD3IVDi/aAt9lQrm+996hgzCE8YYfX/1v3QxXarUKxjRe+RVYfj\nyCGHdzj35SV+8z+rIQCCgepQGfyKhLlHJPp6H35IFYJ+Ge1R1cmU7Yst93CsnXkfOnoAJo9MVH+7\nq5zj28cDhoihlRVQRNmpcTDCJdAjRThhzHTPHL1znEWJC5KdrealwNETRnV4HODNEFJ1EOuJytIA\njp4yBIBZiX3aUSMyPKN3uTNve5jX3rSnqjzgyYrt77dFVFRXdF44l5xVpbsYOW5ajee2KAgd7jsQ\n44ZmnqLIld4M3pIo4sQZQzM/sI8lz0PbIympZJpKyPR30JWitmzo+zGdg5j7Cl/Tra+tYXNnj2Xz\nljlfavfztpc7WTtuOXt0W8HSZ+/HbDdOkTSn4GxHnVn1XVtqZtANkUa0W/PRRdY+t1X2jkuy+bo1\nJemvom8+4jpEP5wJI1zq3bQ+oCSGyJUwRH8EmpUZK7LobK0nyCoEXxRGzO/Zlxfo2s5DJa3mHLPe\n2DED6uxDKRS06wLMoFDiSz386QzJG4bntuyZ844797l55v00H+LbJmGYPsN6Qet+Q0R0w3E4b/xZ\nKX9+kc/nFA8CZuFdukpud/FVLqtac8Gdedvnede+xI5x7mFz9+890/RBbwaufCX1g6H7XMvlUHa6\nTaeyrfB+qzlkL7sCEsPmguBu2WmxN9+wMm87wxZE1QreiblQAAiIAc9rGFYWb2gydlrBe0SlOV+8\nP9Jo7kcNIGgFVLvNYeyzQzEiNAynjpyb9j0E5IDTKtL9HyIYkJ3g/Z+ItZtYuGPwhq/dXI8eD3To\nhdyV4F0ePgSRjXMQ+++UDt/r7EMped/idEt07Kvu5P9/dntUIJF5JweCVEU79lW49+VStwwFzGVP\n0Q+OcgJ4mb/jDlm2rhTP5Bv7nRquM2af50TmXeQ59+7fe6bCqT76XO1XCvECJpdD2U5ilmMcNu9F\numF45lI8mbfmLVhzb7cJwargtoKz0SHztrqLWXPefsneDMMM3ppgXQioMnZY2/UNG1AJn6hgQ/1m\nJxjYa4ZDRebws948ELfMPK/L7y8580bS7kh24xdFFlFkWM1gfFZns5i/Q1WwLMo4YegcDAqmn1eU\nZbFDoZqts0KT5Cvv9MHb/Dd52FwUEsHbnitLfs1Uy4DsD8p0u8h1PE4RRqzIXG5WuRdiJzsu2Mv4\netJoo78RBAGGYSRl3lbw3tcOvyIhFFSSNjFxZ96dz3l39fdwMGPwzi7nsz3HDp5PgT62e387rrjn\nn/j7267+1/HEsqrkgjVB6ph5G9awuW7vNuWLwO9zZ97mtZZTdWy9hu7KvJvaYigt9iHgk+GTzCD9\nft0mAMCospEA9OeBHQAAIABJREFUgPJQ9ypFk5tcOHtuW/RwCLIkmPv/Wpm37rOat8T9Kfv+njv+\nLBw39Oi0P7OzZRzJnbHcyoq9c8IBOXWBlz13XGQdWyITTPS/brcadSRn+ikzbyl1Jp+JfcEW19MX\nOdmvOXxQ560h84m9JMtd4S675rQHlgc6jES4f++ZMu+DbXqhOwrxHOTygqWvLqaZefeStz7aCwBY\n+ddPnPvcm444QyuiO/M2AAiJPautYFgSrwX8m6AM+wQ++WRnztvOvINyEIh3zLxrKspR7h/gVMi2\nurbpBIDRZWYR0qSRlTh99ghMH9e1Stprzz4UX9a3ej4EKkJ+nHroNLypbcWYkrF4+8P9MNpKofgT\nFfHmkjd7NzJft1oHdjZ3lep7t1wwHe9vrcfXjh8DUQT+ZT9WTP2zrz17Cl5Y/wVOn20VaLlesqqi\nCKOGhPD5LnP0IPkDIdV8qzNsbkVaWRJxwdxxad8DAFx51hSsa9iBrZFdnuHjZGcdMwqqpuOsY1IX\ntOWj75w/Df/3n+04+fBEEdSx02rQ0h6Hbhg4ekqiHuPCueMQ8MnY+mWip26m4D24MohTjhyOKaMq\nO33cwcyvSDhzzshO29MebIr8svmeh6R/zxfPH9/lTUk6M2N8FU6cXotjpw3p8WsdCAbvA7S18b+o\nC9fjqCHezQXswCYEWiGW1UPbMwLRlJm3GagF0XA2FnGG0q3g7YsOQoV/KBqKdwBSvMOcd5EcgBAX\nnNakLaq5DehXpo/F7JpEj+sLJ3wNL3z+NzRGm5znAeaQ8NeOTywDy+TwQ6pw+CEdA/3Xjp6Mq6uO\nwsdb67B+rbkft/sqNCgH0BSzh/SVbgZvMem22XMaSD1sPmFEhVOpfv5J4/Avc5dMKGLq4dXqiiAu\nPTWx/lpAYthbFARcd85UfPfnr6c8lmCKLlPJAf74aTU4cXpth8e5nXncGEzYfiZ+uWEfFhxydtrH\nBQMyFn7lkLTfz0dDBhTjklO869/H1pbh21/vuKzOXimwbXeLc1+mYXNBEHDeiWN74Ujz21ePHZ35\nQQeZTO/5pF6qmpclsU/+XzJ4H6AH3vkFAGDW4Bmebln2XHdgqtmcJdJa4Q3emrdgDYCZfetyIhu3\nhs1jcQ2KHgREQBOirszbqjZXJBTFi9BqBe9P2z+CKIiYPND7ITin5kjMqTkSb+95DwNTNFzpLe4P\nUPeSnqASRFMssZtXd7bLSw6YPlmCqplDy501TrAdWzsbr+5ch9HWlEEmiepz89+yksQUQ3Kmn7pg\nzTts3tWGVhWBciyddWPXHlzg3Bdt7o0/iAoJ//K7SdVVZ04ZAJKnWARRSxo2TypYgznsbcQDEKwm\nJPYcciSmQdB9ZvAWI4iq1lIxe523LCIoF6FNaoXgb8OeyJeYWDnes4mF2+GDDuvRe83Ep4hmP2rd\n8GTe7mpyo7uZd9J8kt8nOXPQXWn1eN74s/DVMachIHdvXbS3mYP3WFIOm9tz3vbwd+FNN2ad5ClY\n40cYFSYWrHWTmlRY1CGQGELSsLm9zjuRedubeiSGzc3gFo1rEKyGJHEjirC1TttemuWTRQSVIkCO\nQRpgNvaYOWh6z99UNwmC4HyIuueQPOuVXZttHIjkbPdAd0USBfGAAnfyum+3mKp5bqfaijIx523/\nfEbv3uYtWOvfG2QQZQuDdzfFda3zBwg6/vi3T7Hps30AXJm36FoTaFecJw2bb9vdgi92mvPcUSOM\ntri53tVu0qLIEkqUIATRgFS1A7IgY2rV5J6/qR6wP0QVxTtsbjNUxbOTU1clD5tne79cZ847xfda\n2uOe26kL1rpXbU5dx8ybiMG725Izb7ufucMKyA88/T6AdMPmKgZVFGFghbWHtWvplb0dZtQIO3tx\n25m3IotOYBT9EQwtHppoitJHjpo0CANKAzh8fLVzX1BJtAOVoXSrjaA7eI8cHMKF88b37EAzSZEo\nf+/iGZgwvByzJ3v34lZkEbMmVnsKopKHzZl4976JIyowqDKISSMrUFHau21iifIFL1u7SdW9WVgs\nufuVNY9tf3inK1i7Y9FMPP3uK3izHYAuosgvIRzVnK0129VE8LaboiiyiGI90XQk5Ov7db9nHjMK\nZyYtYXIPm/vl7q0td+/zfPslR2R9swdnnbfr1zRuaDluuXBGx8cKAq4+y+z89vQ/twBwVZsbicdQ\n7xo3tBx3XXVUXx8GUZ9i5t1NquEdNjdbV7rms63MuzJkZsTJvc0Bs1GLTxFRVGT9GgzRqdy2s+zW\nWBva4+3wiT5nWN0niyj3JdYvhtIUqvU1d8FadyrNgUTBmiSaLUaz3Xwh0XGte+Pe9pJBnfVqRJRF\nWc28V6xYgffffx+CIGDp0qWYOjWxdnPlypVYs2YNRFHElClT8P3vfz+bh9LrkofN46ruZNsAnK8H\nWMN6qea8BVmFJIoIBqzgrZutIOubIs6weVu8De1qGAEpALs1hSKLKJMSwbs0zaYbfc09593duWq7\nOMkO2tnecEBI7pd6gOSkJi3MvIkoG7KWeb/55pvYtm0bnnrqKSxfvhzLly93vtfa2opHHnkEK1eu\nxBNPPIGtW7fivffey9ahZEVyG8u4qnn7lVvBu9Rqv6m72qMaqnnNJPniePHzv6MdjQDMOe9ia39i\nSfdBgIDWeBva42FnO0/ALFgr8yeCd1mgf3ZOch9z8qYkXWVn3nZGm+3t9xLD5t2M3slLBhm7iSgL\nsvZJuG7dOsyda+5WNWbMGDQ1NaG11exzrSgKFEVBe3s7VFVFOBxGWVn6/Vb7Ql1jGI+t/djZDjJZ\nqszbvVOY0/LUCgLujUnsrFoYsAN/+XwtXtn5uvVY0Wk6IUkiipUgGqNNiGgRTxaryKI3ePeDOe9U\nZDExsNPtzFtK7K8N5K5Pc7eLxa0n9tU2gURUGLI2bF5fX4/JkxPLlyorK1FXV4eSkhL4/X5ce+21\nmDt3Lvx+P04//XSMGtV5v+aKiiBkuXeXCVVVpZ8rXrHyHWzZ3oiyUABXnNVxO8rikOJ5viCJiXXb\ngJN5y4qEqqoQJFkCYEAQAD3uAwLtHY+nrBhl1hy5IokYXl6DD+o+BQBUliSC9eDqEAJFiYA9fNAg\nVA3su3nvdOfRCNYC7wB6OIjSEn+n5zudygpzSkCWRef5AZ+E8cMruvV6mVx46kT86JE3cN68Q7r1\n+iWhAKqqQrj6nKn45aoNOHXO6C69TjbeS6HhOewdPI89l4tzmLNqc/cwZGtrKx5++GG89NJLKCkp\nwSWXXIKPPvoIEyZMSPv8hoaOwa4nqqpCqKtrSfv9fY1h69/2lI/b19CCOiVxf2tbzDNsPn54CB/s\nBMLhOOrqWtAeiSWK1TQZhi4msnPLVacfin+tM3+uKAoY5B+ED2AGb9lINKNoaQ5DiyZ+dVq72Ol7\nyabOzqMAH04oPh8vvl0HjDO6dYzhtqj1WnCe/7MbjoMgICvveVRVMX57y4kQRaFbr9/cHEZdXQtm\njhuIw7v4Opn+FikznsPewfPYc719DtNdCGRt2Ly6uhr19fXO7b1796KqytzcYuvWrRg2bBgqKyvh\n8/lwxBFHYNOmTdk6lG6xLzbSjdJ2GDbX9KRtPs3MW7NeJ2q0QvBHrBcXALXjdZMsys7wuiyJqA3V\nON9zL7tSJNFTCFWi9M9hcwCo8g0GNB/8Svf+1Ox13u4qc9GqPM+W3hqaL8StGIkoN7IWvOfMmYO1\na9cCADZv3ozq6mqUlJhBpra2Flu3bkUkYgazTZs2YeTIkdk6lG4xUqzTdY8edChYi2uA7B42N7Nq\nOxjvqFqDwNRXrRcSYcQ7NpdQRBmqtaRMEgUMK0kEb/dabrt/+IiQuctSd/t254IdfANK9wZ5ZDm3\nc91ERPkga8PmM2bMwOTJk7FgwQIIgoBly5Zh9erVCIVCmDdvHr7xjW9g0aJFkCQJ06dPxxFHHJH5\nRXPIvdRH1VX88aNVmO3aBlQ1OmbeYlFiqMQQzO87Veae1xZgtJVBLDYff+GEr+HThs9RVTQQmlYH\nwCxYqykZjONqZ0MWZcypmYUnsN45JgD47uGLu70eOVfsgjNfN1qjAole6WKWq8x7C+vUiCgXsjrn\nfdNNN3luu+e0FyxYgAULFmTzx/eIu8nGxvoP8cbut/HG7red76tJvc1jqg6xsinxfGgQBQGaYeCL\nvc3eFzdE6K3lQPUOAImtO4FEm1VZFCAKIs7vZH9nScxun+/eYGfe3a02l6zny3mSeff3iykiOjjk\nRzrTBxKZd+pGG8lz3jEtBiHYAr3NrArXoEIUBei6gR/8YZ33yboIvS310rhZE83+2ccfVpPy+/mm\nImQO6Q8o7V7v9UTm3b+D9xGHmPUcIwf3zzX3RHRwYW/zDARBgF/s2Jc7ntzbXG6EIBjQWiogBJuh\nG6qzx7Wn8xoAASLu/8ZXsOaLCMaVj/Z878hJgzC2tgyVKTZc+NkNxyY6teWJMbVluPvq2RhY1r3g\nbQ+79/fg/c2zJuO8ligGlhVlfjARUQ8xeKfhDJsLqbt6JQ+bq4K5lE2PBiHpkpN5R2Kad/03zD2m\ny0sCWDTp/JQ/e0CaQJevexdXl3c/oLl7m/dnkigycBNRznDYPI3EUjEBmqF3+H6HLUFFa9vOmB/Q\nRWiGBkkUsLehvUPmDZ2nvavsXuH9PfMmIsolRpE03FXDWlKWDXirzQ3DgC5Za7jjPhi6BNWIQxIF\nGAY6ZN727mCUmZ1550vBGhFRLjCKpJEp845riYC8e387oJidwIy4HzBEqIaayBalpODP4N1lSp7M\neRMR5RKjSBruOW/N6Dzz/vmfNkFQYgCAUn8I0BKZNwAIHQrWGIi6yqdIkCUBRT6WZxAR2Ri803A3\nadFTDZu75rwjMRWCEoUiKvjhJbNRO6AUcT0OwT67ycPmev9fn91fyJKI755/GM4/aWxfHwoRUb/B\ndCYDM/NOVbCWCOiabkDyx1DmC6G02I/yYDF2RXRIkpW+JxesaflZNd5XDhle0deHQETUrzDzTkN3\nNWlJOeftWuet6ToMKWoOmQPwS+YabdGa6xaS57w1XjMREVH3MXin47RHFVLPebuGzXUhBggGQtbu\nXgEreAv2RiVi0rC51rHpCxERUVcxeKdhrxRLV7AW01QnOzdEs9K8WDG37fRbu3wJaTNvDpsTEVH3\nMXhnIKYpWPt8dyN+9Zy5B7kmmpXmQTt4S1ZmbQftDpk3h82JiKj7GLy7INWcN0QNb31sbt9pWMG7\nWDaDtzNszsybiIiygME7A90wUg6bu7umGZJZvNZh2FxUARgQ/OGkF2XmTURE3cfgnYFupMm8reBt\nGAYMKXnY3NoRTFQhDdoGsbjZ05iFTVqIiKgnGLwzMAwj5Zy3ORRuQDcMCLKdeZu7StnD5pBUSKX7\nAQCXTlqQk+MlIqKDH4N3BuaweYrMGwAkFf/e+SaU2q0AgGDSnLchqBCKWmDEFVQFB+bkeImI6ODH\n4J2BYaRYKmavAZdUPPnpaufu5DlvTYpADIRhREKQRc5zExFR72DwzkDXOxasybDntL33FyctFYvI\n9eY3wqEO+38TERF1F4N3BobRcT9vUU/MaeuRIud+RTSXgNnD5mFxHwBAiIYwpHgQJMOH+M4xEFiv\nRkREPcDgnYFhGNCT5rwF3cysBUmFEU0Eb8GKyvawuV1ULmh++CQfZukLoe4cl/2DJiKigxqDdwYp\nC9bsJiuSCgjmBPjo8Hzn285SMYtgPV4wmHITEVHPMXin8NH+TyH4zMYqqQrWjLgVjK3gbRgCSvUa\n5/uKKENxFagJujeYExER9QSDd5KWWCt++t5v4J/2CgAz8/7vnibPY1S79kxSIQgGYAiQRG9WHfKF\nnK9FnbuIERFR72HwTtIWbwcAp6hM1XTsaWjzPMauX7MzbxgCxKQzGfKVOF+LOnuZExFR72HwThLT\nY57bcVV35rVtumadNsnsXW4Gb++pLHUFbwHmELoB7+sQERF1B4N3koga8dyOxXVAMAvWoh8dgaJw\nLbR6c37bnXlLSeu/Qkpi2Dz5e0RERD3B4J2kPSl4x1XNybz15gEI7T0aajQAABCUqBO8haQzWepP\nBG8hKXgn3yYiIjoQ7NmZJBz3bt8Zs4bNDQMABLRH44Dqg6EqEAJt1lruFAVrimvOW2SwJiKi3sPM\nO0lY6zhsLgg6YJinqj1ilprr4WIIgTAEUYNhCB0CtGepGDNtIiLqRQzeSbyZt4G4pjtD4wAQjpql\n5kakGIJgQPBFAUPskHlLouR8bX+L5WpERNQbGLyTeDJvwUAsrgGCDlmUMLqmFLo5fg4jXJx4nCFA\nTMquJw+YABgCYtsmdPgeERFRTzB4JwnHXcFbVJ05b1EQ4VcS2bQeDSYel2LYPOQrwYTGi6DtGclh\ncyIi6lUM3knCqmvYXNSdanMR3uAN3fV1ig5rAKwit8SwORERUW9g8E4Sdi0VEyTVWectQoLf5w7Y\n7lPXMfMG4AyxC4zeRETUixi8k3gzbw2abkBwhs1dp8u9Q1iKOW8gEbyd2M2KNSIi6gUM3kncTVoE\n0W5ibgZvn2vY3NATpy7VUjEAmDKyEgBw2NiBnvs5BU5ERD3BJi1JYpqrt7lkB28doiBBkdJn3qnm\nvOfOHIZDhldgWHVJh+8RERF1F4N3kqh7Y5KkzFv2BG9vIE+VeYuCgBGDQx3uJyIi6gkGbxfDMBDX\n4s7txLC5DkmQkoK3O1innvMmIiLKBs55u8R11bttp5TIvCVBhCy5h8q9WXiqYfNkrFcjIqLewODt\nYs93S4JZmCaIGiCqEATAJ/o9mbe7YC3dsHk6zNGJiKgnGLxdolbwDohW9zRRg+Azq89LlNABF6wl\nG1xpvu7omrLeOWAiIipInPN2iVvFakVSEG1aCyCp5sYjAEqVEGQxdcGakWadd7KTZtSiOCBj+riB\nGR9LRESUDoO3i5N5C2aGLEgaBMXMvEt9pZCTsm33110ZNpclEXMOHdJ7B0xERAWJw+YuMavS3O8M\nm6vOsHmZrzT9UrE07VGJiIiyIWPw3rp1ay6Oo1+IWcPmPqMIgJV5W8Pm5f4yyHLP5ryJiIh6Q8bg\n/e1vfxsXXHABVq1ahXA4nOnhec0eNldgBm9zztvMvCuKyrwFa8jc25yIiCgbMs55P//88/jkk0/w\n4osvYuHChZg4cSLOPfdcTJ06NRfHl1N2gxZBV2BoIgRJBXwRGLqIkFKMqBRJ/URD5LA5ERHlTJfm\nvMePH4/rr78eS5YswdatW7F48WJcdNFF+O9//5vlw8stO/OGIQG6DLG4GWKgHdq+wVCUpA5rbhw2\nJyKiHMqYee/cuRN/+tOf8Je//AVjx47F1VdfjWOPPRYbN27EzTffjGeeeSYXx5kT9py3oUkwNAmC\nYt6v7hwHRfL2Nvd0WwOYeRMRUc5kDN4LFy7E17/+dfzhD3/AoEGDnPunTp2aceh8xYoVeP/99yEI\nApYuXep5/K5du/Cd73wH8XgckyZNwp133tmDt9E77A5rhi4CmnlqDEOAEQtAlrztURXZvc5b5Jw3\nERHlTMZh8zVr1mDkyJFO4H7iiSfQ1tYGALj99tvTPu/NN9/Etm3b8NRTT2H58uVYvny55/t33303\nLr/8cjz77LOQJAlffvllT95Hr7CXihmqBEO39u6OKwAESJLgqTZP7rbGYXMiIsqVjMH7e9/7Hurr\n653bkUgEt9xyS8YXXrduHebOnQsAGDNmDJqamtDa2goA0HUdb7/9Nk466SQAwLJly1BTU9OtN9Cb\n7DlvXXNl3roMSTSryd0BO3nZGIfNiYgoVzIG78bGRixatMi5fdlll6G5uTnjC9fX16OiosK5XVlZ\nibq6OgDA/v37UVxcjLvuugsXXHAB7r///u4ce6+z57x1VYSzFEyTnEDtnvNOzrwZvImIKFcyznnH\n43Fs3boVY8aMAQBs2rQJ8Xg8w7M6MgzD8/WePXuwaNEi1NbW4qqrrsLLL7+ME044Ie3zKyqCkGXp\ngH9uZ6qqQp7bwqfmMUqiD7D28jZ0CQFFQlVVCEUlifddFFDgXMIYAgYOKO7weoWiUN93b+I57Dme\nw97B89hzuTiHGYP39773PSxevBgtLS3QNA2VlZW49957M75wdXW1Z7h97969qKqqAgBUVFSgpqYG\nw4cPBwDMnj0bn376aafBu6GhPePPPBBVVSHU1bV47mtpN39GuN2A4Lf28tYlSKKAuroWxOJa4sGu\nixEYApoa2+EvwOQ71XmkA8Nz2HM8h72D57HnevscprsQyDhsPm3aNKxduxbPP/881q5dixdffLFL\nmfecOXOwdu1aAMDmzZtRXV2NkpISAIAsyxg2bJizTnzz5s0YNWpUV99L1tjV5mocTuYNXXIqy93z\n3JJnqRg7rBERUe5kzLxbW1vx5z//GQ0NDQDMYfRVq1bhtdde6/R5M2bMwOTJk7FgwQIIgoBly5Zh\n9erVCIVCmDdvHpYuXYolS5bAMAyMHz/eKV7rS1E9BlmUoaqAPedtqDJ8VtB2B2jJ9bVhCDBARESU\nGxmD9w033ICamhq89tpr+MpXvoLXX38dP/jBD7r04jfddJPn9oQJE5yvR4wYgSeeeOLAjjbL4loc\nPlFBXNUhfjEdyvCPEd5+CJSqjgMU7gI1QTAYvImIKGcyDptHo1HceeedqK2txa233orHHnsML774\nYi6OLeeiWgw+yYeYqkNRy1C291hA9UNJUSgnJbdKNRi+iYgoNzIG73g8jvb2dui6joaGBpSXl2P7\n9u25OLaci2kx+CQz81ZkCbpuBmR3NzWbuymLJAuoCAVydpxERFTYMg6bn3XWWXj66adx7rnn4rTT\nTkNlZSVGjBiRi2PLuZgeQ7lYigZVQzCgQNV0AHDmvN3cwfvsY0alDPBERETZkDF42wVngLmka9++\nfZg4cWLWDyzXDMNATIvDJ/kQ13Qosoj2iAogc+bNGW8iIsqljOmiu7vaoEGDMGnSJCeYH0xUXYUB\nwwzeqg6fLELVzcxbSbEVqOgJ3kRERLmTMfOeOHEifvKTn2D69OlQFMW5f/bs2Vk9sFyLWq1RFVGB\nqhlQZBGaZs15KykK1kT3rmIM30RElDsZg/eHH34IAHjrrbec+wRBOOiCt92gRbY28VZkyZnzTpV5\ne4fN9RwcIRERkSlj8H788cdzcRx9zt4ONBG8RSd4y3IiUFeVB1DXGPEOmzPzJiKiHMoYvC+88MKU\nc9wrV67MygH1leTM2yeLUK1hc9k1RL78yqMQi2t4+p9bnftYsEZERLnUpQ5rtng8jvXr1yMYDGb1\noPqCvZe3CHN+293HXHb1MZcl0dkaVI8UQQyEUSQX5fBIiYio0GUM3rNmzfLcnjNnDq688sqsHVBf\nienmsLmExLC5rUM3Nfs5Hx+B4NCdOPb4g2v+n4iI+reMwTu5m9quXbvw+eefZ+2A+krMybzNU+Ju\nzCKLqZbGGTCixZD3TIFPUlJ8n4iIKDsyBu9LLrnE+VoQBJSUlOC6667L6kH1BSd4GzIA3ZN5y510\nTzv4VrwTEVF/lzF4/+Mf/4Cu6xCtoq14PO5Z732wiOnu4B3zbEYipxk2JyIi6gsZo9LatWuxePFi\n5/ZFF12El156KasH1RfsgjUY5ilxr+2WUgybc3UYERH1lYzB+9FHH8WPf/xj5/bvfvc7PProo1k9\nqL4Qt9Z5Q7fmvBV3wVr6wfGDsVUsERH1bxmDt2EYCIVCzu2SkpKDMmBFtCgAQLCCtzvzdq/ztjHx\nJiKivpJxznvKlCm44YYbMGvWLBiGgVdffRVTpkzJxbHllB287cxbUdzrvDnnTURE/UfG4H3bbbdh\nzZo12LBhAwRBwJlnnolTTjklF8eWU1HVCt6anXm7C9YOvpEGIiLKXxmDdzgchqIouP322wEATzzx\nBMLhMIqLi7N+cLlkZ96GZgZt91KxUNDX4fFVZQEAQG3VwXUeiIio/8s4Hnzrrbeivr7euR2JRHDL\nLbdk9aD6gp1566oZvH2yiOVXHolLTjkEIwaHOjz+lCOH44KTx+HKMybl9DiJiIgyBu/GxkYsWrTI\nuX3ZZZehubk5qwfVFyJaBIqoQNPM24osYsiAYhx/WG3KxyuyhHkzh6XMyomIiLIpY/COx+PYujWx\ng9bGjRsRj8ezelB9IaJFEZD8iKvWHt6ddFUjIiLqSxnnvL/3ve9h8eLFaGlpga7rqKiowL333puL\nY8upqBpFQPYjxuBNRET9XMYINW3aNKxduxarVq3CkiVLUF1djWuuuSYXx5ZTyZm3z9UelYiIqD/J\nmHm/9957WL16NV544QXouo4f/ehHmD9/fi6OLWd0Q0dUi8Ev+xHXmHkTEVH/ljZC/eY3v8Fpp52G\nG2+8EZWVlVi1ahWGDx+O008//aDbmMTuax6Q/IjHzYo1Bm8iIuqv0mbeDz74IMaOHYs77rgDRx11\nFICDt4931FrjHZADaGPmTURE/Vza4P3yyy/jT3/6E5YtWwZd13H22WcflFXmABCx1nj7JbNgTRBS\n7yRGRETUH6RNL6uqqnDVVVdh7dq1WLFiBb744gvs3LkTV199NV555ZVcHmPWOZm3VbDmk6WDdpSB\niIjyX5fGhmfOnIm7774br776Kk444QT8/Oc/z/Zx5VRYjQCAWbCm6hwyJyKifu2AolRJSQkWLFiA\np59+OlvH0ye8mbfG4E1ERP0aoxSA9ngYAFAkB9DcHkdx4OCqpiciooMLgzeAdtUM3qLuRzSmoao8\n0MdHRERElB6DN4D2eDsAIBoxT0dVeVFfHg4REVGnGLwBtFmZd6SNwZuIiPo/Bm8kMu+WVvM2gzcR\nEfVnDN5IzHk3NZnd1TjnTURE/RmDN4C2eDsUUUFLmxm8K0L+Pj4iIiKi9Bi8YQ6bFytBRGPmpiQ+\nhduBEhFR/8XgDXPYPCgXIRLX4FNEiGyNSkRE/VjBB2/d0BFWIwgqRYjFNfiZdRMRUT9X8ME7rEZg\nwECxHEQ6NoGoAAAYmElEQVSUwZuIiPIAg7ddad6so6E5Cr+PwZuIiPq3gg/eMc3co3zL9jYYADNv\nIiLq9wo+eMd1M3gbunkqGLyJiKi/Y/DWVfMLBm8iIsoTDN5W5g3dDNo+peBPCRER9XMFH6ni1pw3\nDPNUBFiwRkRE/RyDtzPnbWfeDN5ERNS/MXhzzpuIiPIMg7cz583gTURE+YHBW/MOmzN4ExFRf5fV\n4L1ixQqcf/75WLBgATZs2JDyMffffz8WLlyYzcPolDNsbhWsiSI3JSEiov4ta8H7zTffxLZt2/DU\nU09h+fLlWL58eYfHbNmyBf/5z3+ydQhdkrxUTNP0PjwaIiKizLIWvNetW4e5c+cCAMaMGYOmpia0\ntrZ6HnP33XfjxhtvzNYhdEksqcOapht9eThEREQZZS1419fXo6KiwrldWVmJuro65/bq1asxa9Ys\n1NbWZusQukR1qs3NzLu4SOnDoyEiIspMztUPMoxERtvY2IjVq1fj0UcfxZ49e7r0/IqKIGS5d4vJ\nqqpCkD63bugi5h85Al89aTwkznsfkKqqUF8fQt7jOew5nsPewfPYc7k4h1kL3tXV1aivr3du7927\nF1VVVQCA9evXY//+/bjooosQi8XwxRdfYMWKFVi6dGna12toaO/V46uqCqGurgXN7ebrGrqEuTNq\nsH9fa4Znkpt9Hqn7eA57juewd/A89lxvn8N0FwJZGzafM2cO1q5dCwDYvHkzqqurUVJSAgA45ZRT\n8MILL+Dpp5/Gz372M0yePLnTwJ1NqqvaXBILfuUcERHlgaxl3jNmzMDkyZOxYMECCIKAZcuWYfXq\n1QiFQpg3b162fuwBi7matMgSh8uJiKj/y+qc90033eS5PWHChA6PGTp0KB5//PFsHkannI1JdAmy\nxMybiIj6v4KPVqquWg1aBBaqERFRXij44B3T4xAMs4qdmTcREeWDgo9WcSt4CwJboxIRUX5g8NZU\nCKw0JyKiPFLwESuuxwFDYqU5ERHlDQZvPW4tEyv4U0FERHmioCOWYRiIaXFAl1lpTkREeaOgg7dq\naDBgwGCDFiIiyiMFHbxjWsz8QpcgcdiciIjyREFHLDt4G5rEYXMiIsobhR28rb7mhsaCNSIiyh8F\nHbHszFtn5k1ERHmkwIM3M28iIso/BR2xYnoi82a1ORER5YvCDt4sWCMiojxU4MHb3stb5FIxIiLK\nGwUdsRLrvGXOeRMRUd4o6IjlLBXTRQ6bExFR3ijs4O3qsMaCNSIiyhcM3gCgSdzPm4iI8kZBR6zE\nsLkERSnoU0FERHmkoCOWe9hcYcEaERHliYKOWFFnqZgEHzNvIiLKEwUdseJWhzWDmTcREeWRgo5Y\nMVfmrchS3x4MERFRFxV08I46c94iFLmgTwUREeWRgo5Yqq5CggRAgI/Bm4iI8kRBRyzVUCEKMgAw\n8yYiorxR0BErrschwpzrZvAmIqJ8UdARK66pruDNgjUiIsoPBR28VUOFYJingJk3ERHli4KOWKqu\nQrAybxasERFRvijoiBXXVQgG57yJiCi/FGzEMgzDzLw5bE5ERHmmYCOWqqvmF8y8iYgozxRsxIpr\nVvDW7cyb1eZERJQfCjd423t5W8PmLFgjIqJ8UbARy868DZ1z3kRElF8KNmLFrMwbmghBACRR6NsD\nIiIi6qKCDd6qlXnrugBFFiEIDN5ERJQfCjZ423t5G5oIRSrY00BERHmoYKOWXbCmaQJ8CivNiYgo\nfxRu8LaHzTWBmTcREeWVgo1acd0O3iIUpWBPAxER5aGCjVpxa85bUwXIYsGeBiIiykMFG7Xcw+ay\nzEpzIiLKH4UbvK2CNV0TITHzJiKiPFKwUcvpbW6IkCVm3kRElD8KN3jbvc11ETKrzYmIKI8UbNSy\nm7TAENkalYiI8krBBm9nP29dhMTMm4iI8kjBRq2Ys6uYBJmZNxER5ZGCDd5x97A5C9aIiCiPyNl8\n8RUrVuD999+HIAhYunQppk6d6nxv/fr1eOCBByCKIkaNGoXly5dDzOGSrYgaNb/QJBasERFRXsla\n1HrzzTexbds2PPXUU1i+fDmWL1/u+f4dd9yBhx56CE8++STa2trw6quvZutQUgqrEQCAocssWCMi\norySteC9bt06zJ07FwAwZswYNDU1obW11fn+6tWrMXjwYABAZWUlGhoasnUoKUXiZvCGJjPzJiKi\nvJK1qFVfX4+KigrndmVlJerq6pzbJSUlAIC9e/fi9ddfx/HHH5+tQ0kprEYhQLCqzZl5ExFR/sjq\nnLebYRgd7tu3bx+uvvpqLFu2zBPoU6moCEKWe2/f7XA8Ap/kRzsElJYEUFUV6rXXLjQ8dz3Hc9hz\nPIe9g+ex53JxDrMWvKurq1FfX+/c3rt3L6qqqpzbra2tuPLKK3HDDTfgmGOOyfh6DQ3tvXp8YTUC\nBQoAIBqNo66upVdfv1BUVYV47nqI57DneA57B89jz/X2OUx3IZC1YfM5c+Zg7dq1AIDNmzejurra\nGSoHgLvvvhuXXHIJjjvuuGwdQqci8QgU0QcALFgjIqK8krXMe8aMGZg8eTIWLFgAQRCwbNkyrF69\nGqFQCMcccwyee+45bNu2Dc8++ywA4IwzzsD555+frcPpIKxGUSGXAgAL1oiIKK9kdc77pptu8tye\nMGGC8/WmTZuy+aM7FddVqLoKQTffPoM3EVHfevnlv+OEE07u0mN/8pP7ce65C1BTU5vlo+q/CjJq\nRa0GLbv2xgBw2JyIqC/t2vUl/va3tV1+/PXXf7egAzeQw2rz/iSimcHb0MzqdS4VIyLqOw88cA8+\n/HAzHn30N9B1HV9+uRO7dn2JBx/8Be66607U1e1FOBzG5ZdfhTlzjsV1112F73znFvzzn39HW1sr\nvvhiG3bu3IFvf/u7mD17jvO6qqpi+fIfdHj+J598hPvvvweiKGDKlGm49trrU95n/5zRo8di1aqn\n0NjYiOnTD8eTT/4v2tvbcd11N+Ldd9/Gyy//HbquY/bsObj11u+ipaUFd955G9ra2lBSUoI77vgf\nXH75Rfj9759AMBjEhg3v4cknV2LFih93+5wVZPCOWsEb9rB5DtuyEhH1Z0//Ywv+89HeXn3NmROq\ncd5JY9N+/4ILFmL16qdx2WVX4pFHHoaqxvGLX/wWDQ37MWvWUTj11DOwc+cO3H77EsyZc6znuXv3\n7sF99z2E9ev/jT//eZUneLe0NKd8/oMP3oebb16KsWPH4Uc/ugO7d+9KeV86W7duwRNPrIbP58O7\n776NX/zitxBFEeeddxauvfabeOKJxzFr1myce+4CPPXUSrzzzls47rgT8dpr/8L8+afgtddewbx5\nX+nROS3I4G33NTc08+0z8yYi6j8mTpwMAAiFSvHhh5uxZs1qCIKI5uamDo+dOvUwAObyZHcXz86e\n/8UX2zB27DgAwO2335n2vnTGjh0Hn89crRQIBHDddVdBkiQ0NjaisbERn3zyEa644hoAwPnnXwQA\nqKmpxW9/+0vMn38K3n33bXzjG1cf+IlxKczgrSU2JQFYsEZEZDvvpLGdZsm5oChmD46//vUlNDc3\n4+c//y2am5txxRULOzxWkhLNu5KbgaV7fqpNsFLdJwiJxE5V1Q7Ht3v3Ljz11Er87ncrEQwGsXDh\nedZrSTAM3fNaY8eOw759+/Dhh5sxatQY+P3+zk9CBgUZtSL2piR25s2CNSKiPiOKIjRN63B/Y2Mj\nhgypgSiKeOWVfyAejx/Q66Z7/siRo7B5s7ni6a677sR///t5yvuKi4uxb5/ZbGzjxvdTvn5FRQWC\nwSA+/vgj7N69G/F4HBMnTsLbb/8HAPDcc6vw4ot/AQCcdNI8PPDAPZg375QDeh+pFGTwthlx88qH\nmTcRUd8ZMWIUPv74Izz00P2e+0844ST8+9+v4vrrr0FRURGqq6vx6KO/6fLrpnv+9dffhJ/97P/D\nNdd8A6FQKUaOHJXyvjPPPAf3338vbr75egwcWNXh9ceNG4+ioiCuueZy/P3v/4ezzjoHP/zhD3Hu\nuRdg06YNuO66q/Dvf7+G448/EQBw8snzsHfvXhx++MyenTAAgpGq6Xg/1Jvt5uJaHNf89hnojdWA\nIeKWC6ZjwojOe6tTamyn2HM8hz3Hc9g7eB57rrNz+Pzza7B79y584xvfPKDXS6Ug57wVSYHeMNi5\nzcybiIiy6Z57/gdffrkTd911X6+8XkEG72SsNiciomy69dbbevX1CjLl1HXvTAEL1oiIKJ8UZPCO\nxr1VjRw2JyKifFKQUSvWIXgz8yYiovxRkME7OfOW2B6ViIjySEFGrWjc2/mGmTcRUd96+eW/H/Bz\n3nvvHTQ07M/C0fR/hRm8Y0mZN+e8iYj6zIFuCWp7/vk1BRu8C3KpWMdhc2beRER9xb0l6PnnX4gV\nK36IlpYWaJqGG264GWPHjsP//u/v8cor/4Qoipgz51hMnDgJr776Mj7//DP8z//ci8GDzd4dfbEN\n6OWXX+VsAxqLReD3F2VlG1A3Bm+w2pyIyLZ6y1/w7t6Nvfqa06sPxTljz0j7ffeWoL///W9x5JFH\n4//9v6/i888/w09+ch8efPAXePLJ/8Vzz70ESZLw3HOrMHPmURg7djy+851bnMAN9M02oOeff6Gz\nDejixVfiZz/7VVa2AXVj8AabtBAR9RcbN25AY2MD1q59AQAQjZobSZ1wwsm44YbFmDfvFMyfn35j\nj77YBrS5uTkn24C6FWTwrgz54ZNF6IYBVTMgCgzeREQAcM7YMzrNkrNNUWTceOPNmDJlquf+m276\nHrZt+y/+8Y+/4lvf+iZ+/es/pHz+wbwNqOfYe+2V8sghwyvw1IrT8fBNJ+DXN5/Q14dDRFTQ3FuC\nTpo0Bf/618sAgM8//wxPPvm/aG1txaOP/gYjRozEZZddiVCoDO3tbSm3Ej2YtwH1nLNefbU8Iksi\nBEHgfDcRUR9zbwn69a+fj507t2Px4itwzz3/g8MOm4GSkhI0NjbgyisX4dvfvhqTJ09BaWkZDjts\nBm677VZ89tlW57X6YhvQ+++/x9kGdOHChVnbBtStILcEBbj1XW/heew5nsOe4znsHTyPPZd8Druz\nDWjy66VSkHPeRERE2dbb24C6MXgTERFlQW9vA+rGCV8iIqI8w+BNRESUZxi8iYiI8gyDNxERUZ5h\n8CYiIsozDN5ERER5hsGbiIgozzB4ExER5Zm8aY9KREREJmbeREREeYbBm4iIKM8weBMREeUZBm8i\nIqI8w+BNRESUZxi8iYiI8kxB7ue9YsUKvP/++xAEAUuXLsXUqVP7+pD6tU8++QSLFy/GpZdeiosv\nvhi7du3CLbfcAk3TUFVVhR//+Mfw+XxYs2YN/vCHP0AURZx33nk499xz+/rQ+417770Xb7/9NlRV\nxTe/+U0ceuihPIcHIBwOY8mSJdi3bx+i0SgWL16MCRMm8Bx2UyQSwRlnnIHFixdj9uzZPI8H4I03\n3sD111+PcePGAQDGjx+PK664Ivfn0Cgwb7zxhnHVVVcZhmEYW7ZsMc4777w+PqL+ra2tzbj44ouN\n2267zXj88ccNwzCMJUuWGC+88IJhGIZx//33GytXrjTa2tqM+fPnG83NzUY4HDZOP/10o6GhoS8P\nvd9Yt26dccUVVxiGYRj79+83jj/+eJ7DA/T8888bv/71rw3DMIwdO3YY8+fP5znsgQceeMA455xz\njFWrVvE8HqD169cb3/rWtzz39cU5LLhh83Xr1mHu3LkAgDFjxqCpqQmtra19fFT9l8/nw29+8xtU\nV1c7973xxhs4+eSTAQAnnngi1q1bh/fffx+HHnooQqEQAoEAZsyYgXfeeaevDrtfmTlzJn7yk58A\nAEpLSxEOh3kOD9Bpp52GK6+8EgCwa9cuDBo0iOewm7Zu3YotW7bghBNOAMD/z72hL85hwQXv+vp6\nVFRUOLcrKytRV1fXh0fUv8myjEAg4LkvHA7D5/MBAAYMGIC6ujrU19ejsrLSeQzPa4IkSQgGgwCA\nZ599FscddxzPYTctWLAAN910E5YuXcpz2E333HMPlixZ4tzmeTxwW7ZswdVXX40LLrgAr7/+ep+c\nw4Kc83Yz2B22R9KdP57Xjv72t7/h2Wefxe9+9zvMnz/fuZ/nsOuefPJJfPjhh7j55ps954fnsGue\ne+45HHbYYRg2bFjK7/M8ZjZy5Ehcd911OPXUU7F9+3YsWrQImqY538/VOSy44F1dXY36+nrn9t69\ne1FVVdWHR5R/gsEgIpEIAoEA9uzZg+rq6pTn9bDDDuvDo+xfXn31VfzqV7/Cb3/7W4RCIZ7DA7Rp\n0yYMGDAAQ4YMwcSJE6FpGoqLi3kOD9DLL7+M7du34+WXX8bu3bvh8/n4t3iABg0ahNNOOw0AMHz4\ncAwcOBAbN27M+TksuGHzOXPmYO3atQCAzZs3o7q6GiUlJX18VPnl6KOPds7h//3f/+HYY4/FtGnT\nsHHjRjQ3N6OtrQ3vvPMOjjjiiD4+0v6hpaUF9957Lx5++GGUl5cD4Dk8UG+99RZ+97vfATCnvtrb\n23kOu+HBBx/EqlWr8PTTT+Pcc8/F4sWLeR4P0Jo1a/DII48AAOrq6rBv3z6cc845OT+HBbmr2H33\n3Ye33noLgiBg2bJlmDBhQl8fUr+1adMm3HPPPdi5cydkWcagQYNw3333YcmSJYhGo6ipqcFdd90F\nRVHw0ksv4ZFHHoEgCLj44otx5pln9vXh9wtPPfUUfvrTn2LUqFHOfXfffTduu+02nsMuikQi+P73\nv49du3YhEonguuuuw5QpU3DrrbfyHHbTT3/6U9TW1uKYY47heTwAra2tuOmmm9Dc3Ix4PI7rrrsO\nEydOzPk5LMjgTURElM8KbticiIgo3zF4ExER5RkGbyIiojzD4E1ERJRnGLyJiIjyTME1aSHKN/fe\ney82btyIaDSKDz74ANOnTwcAfO1rX8NXv/rVLr3Gr3/9a4wfP97pZ53KwoUL8fvf/x6SJPXGYXvs\n2bMHn332GWbPnt3rr01UiLhUjChP7NixAxdeeCH+9a9/9fWhHLA1a9Zg69atuPHGG/v6UIgOCsy8\nifLYT3/6U+zYsQNffvklbr31VkQiEdx3333w+XyIRCJYtmwZJk+ejCVLluDwww/H7Nmzcc011+CY\nY47Bhg0b0NbWhocffhiDBg3CIYccgs2bN+OXv/wlGhsbsXv3bmzbtg1HHnkkbr/9dkSjUdx6663Y\nuXMnBg8eDEmSMGfOHM8exW1tbfjud7+L5uZmqKqKE088EWeccQYefPBBGIaB8vJyXHTRRbjzzjux\nbds2tLW14YwzzsDll1+O1atX469//SsEQcCePXswevRorFixAoqi9OEZJuqfOOdNlOd27NiBxx57\nDFOmTEFjYyN+8IMf4LHHHsOiRYvw8MMPd3j81q1bcc4552DlypWYOHEiXnzxxQ6P+eCDD/DQQw/h\n2WefxerVq9HU1IQ1a9ZAVVU888wzuOOOO/D66693eN6///1vqKqKP/7xj3jyyScRDAZRW1uLs88+\nG2eeeSYuu+wyPPbYY6iursbjjz+OZ555Bs8//zw++ugjAMDGjRv///bu2CW1MIzj+NcONQQRQi3W\nYnBsjDoSBFKNOVaEo0M4REO4HGyrKQin5ob+gDBaoiVyECEipakhWkKkQKFoiERPd5DOzYxLlysX\njvw+4+F5X97tx/PyHh7S6TSHh4eUy2VP3jKI/A/qvEU8bmJiAp/PB8DQ0BC7u7u8vb3x8vLC4OBg\nW73f78c0TQACgQBPT09tNZZlYRgGhmHg9/t5fn7m5uaG6elpAIaHh7Esq23d1NQUe3t7bGxsMDc3\nx8rKCj09rT3CxcUFDw8PXF5eAlCr1bi/v3fXf4xPnZyc5O7uzp2TLCK/KbxFPO7ztbJt22xvbzMz\nM8P5+bk7zOOzrw/Svnv28l2N4zgtQfw1lKE5y/j4+JhiscjZ2RnLy8scHR211PT19bG+vs7CwkLL\n90wmg+M4fzyXiDTp2lyki1QqFUzTpNFocHp6Sq1W69jeY2NjFItFAKrVKldXV201uVyObDaLZVnY\ntk1/fz/VahWfz0e9XgeaXf3HVb3jOOzs7Ljd//X1Na+vr7y/v1MoFBgfH+/Y+UW6iTpvkS6SSCSI\nx+MEAgFWV1exbZuDg4OO7L20tEQ2myUWizE6Oko4HG7r0IPBIKlUiv39fQzDIBKJMDIyQjgcJplM\n0tvby9raGre3t8RiMRqNBvPz8+6o1FAoxObmJqVSCdM0iUQiHTm7SLfRr2Ii8iOPj48UCgWi0SiO\n47C4uMjW1pb73/m/ymQy5PN50ul0R/YT6WbqvEXkRwYGBjg5OXHnE8/OznYsuEXk76jzFhER8Rg9\nWBMREfEYhbeIiIjHKLxFREQ8RuEtIiLiMQpvERERj1F4i4iIeMwvRph4T/csGFUAAAAASUVORK5C\nYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f867ef90>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "HNqUFL4deCsL",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 4. Case study: building an RNN\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "YkC1k4HEQ7rw",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "In this exercise we build and train a model similar to the RNNColorbot model that was used in the main Eager notebook. The model is adapted for converting and training in graph mode."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "7nkPDl5CTCNb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To get started, we load the colorbot dataset. The code is identical to that used in the other exercise and its details are unimportant."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "A0uREmVXCQEw",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\n",
+        "  \n",
+        "  Args:\n",
+        "    line: A comma-separated string containing four items:\n",
+        "        color_name, red, green, and blue, representing the name and\n",
+        "        respectively the RGB value of the color, as an integer\n",
+        "        between 0 and 255.\n",
+        "\n",
+        "  Returns:\n",
+        "    A tuple of three tensors (rgb, chars, length), of shapes: (batch_size, 3),\n",
+        "    (batch_size, max_sequence_length, 256) and respectively (batch_size).\n",
+        "  \"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "\n",
+        "def maybe_download(filename, work_directory, source_url):\n",
+        "  \"\"\"Downloads the data from source url.\"\"\"\n",
+        "  if not tf.gfile.Exists(work_directory):\n",
+        "    tf.gfile.MakeDirs(work_directory)\n",
+        "  filepath = os.path.join(work_directory, filename)\n",
+        "  if not tf.gfile.Exists(filepath):\n",
+        "    temp_file_name, _ = six.moves.urllib.request.urlretrieve(source_url)\n",
+        "    tf.gfile.Copy(temp_file_name, filepath)\n",
+        "    with tf.gfile.GFile(filepath) as f:\n",
+        "      size = f.size()\n",
+        "    print('Successfully downloaded', filename, size, 'bytes.')\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = maybe_download(os.path.basename(url), data_dir, url)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None, None], []))\n",
+        "  return dataset\n",
+        "\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\""
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "waZ89t3DTUla",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we set up the RNNColobot model, which is very similar to the one we used in the main exercise.\n",
+        "\n",
+        "Autograph doesn't fully support classes yet (but it will soon!), so we'll write the model using simple functions."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9v8AJouiC44V",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def model_components():\n",
+        "  lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "  lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "  upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "  upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "  relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "  relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "  return lower_cell, upper_cell, relu_layer\n",
+        "\n",
+        "\n",
+        "def rnn_layer(chars, cell, batch_size, training):\n",
+        "  \"\"\"A simple RNN layer.\n",
+        "  \n",
+        "  Args:\n",
+        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "  Returns:\n",
+        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "  \"\"\"\n",
+        "  hidden_outputs = []\n",
+        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "  n = tf.shape(chars)[0]\n",
+        "  i = 0\n",
+        "  while i < n:\n",
+        "    ch = chars[i]\n",
+        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "    hidden_outputs.append(cell_output)\n",
+        "    i += 1\n",
+        "  hidden_outputs = hidden_outputs.stack()\n",
+        "  if training:\n",
+        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "  return hidden_outputs\n",
+        "\n",
+        "\n",
+        "def model(inputs, lower_cell, upper_cell, relu_layer, batch_size, training):\n",
+        "  \"\"\"RNNColorbot model.\n",
+        "  \n",
+        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "  followed by a fully connected layer with ReLU activation.\n",
+        "  \n",
+        "  Args:\n",
+        "    inputs: A tuple (chars, length)\n",
+        "    lower_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    upper_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    relu_layer: An object of type tf.layers.Dense\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "    \n",
+        "  Returns:\n",
+        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "  \"\"\"\n",
+        "  (chars, length) = inputs\n",
+        "  chars_time_major = tf.transpose(chars, [1, 0, 2])\n",
+        "  chars_time_major.set_shape((None, batch_size, 256))\n",
+        "\n",
+        "  hidden_outputs = rnn_layer(chars_time_major, lower_cell, batch_size, training)\n",
+        "  final_outputs = rnn_layer(hidden_outputs, upper_cell, batch_size, training)\n",
+        "\n",
+        "  # Grab just the end-of-sequence from each output.\n",
+        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "  sequence_ends = tf.gather_nd(final_outputs, indices)\n",
+        "  return relu_layer(sequence_ends)\n",
+        "\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "JjK4gXFvFsf4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The train and test functions are also similar to the ones used in the Eager notebook. Since the network requires a fixed batch size, we'll train in a single shot, rather than by epoch."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "ZWQMExk0S6X6",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  iterator = train_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "    optimizer.minimize(loss)\n",
+        "    if step % (num_steps // 10) == 0:\n",
+        "      print('Step', step, 'train loss', loss)\n",
+        "    step += 1\n",
+        "  return step\n",
+        "\n",
+        "\n",
+        "def test(eval_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  total_loss = 0.0\n",
+        "  iterator = eval_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=False)\n",
+        "    total_loss += loss_fn(labels, predictions)\n",
+        "    step += 1\n",
+        "  print('Test loss', total_loss)\n",
+        "  return total_loss\n",
+        "\n",
+        "\n",
+        "def train_model(train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps):\n",
+        "  optimizer = tf.train.AdamOptimizer(learning_rate=0.01)\n",
+        "\n",
+        "  train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps=tf.constant(train_steps))\n",
+        "  test(eval_data, lower_cell, upper_cell, relu_layer, 50, num_steps=tf.constant(2))\n",
+        "\n",
+        "  print('Colorbot is ready to generate colors!\\n\\n')\n",
+        "  \n",
+        "  # In graph mode, every op needs to be a dependent of another op.\n",
+        "  # Here, we create a no_op that will drive the execution of all other code in\n",
+        "  # this function. Autograph will add the necessary control dependencies.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "iopcs5hXG2od",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we add code to run inference on a single input, which we'll read from the input.\n",
+        "\n",
+        "Note the `do_not_convert` annotation that lets us disable conversion for certain functions and run them as a `py_func` instead, so you can still call them from compiled code."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DyU0wnnAFEYj",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "@autograph.do_not_convert(run_as=autograph.RunMode.PY_FUNC)\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def inference(color_name, lower_cell, upper_cell, relu_layer):\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "  chars = tf.expand_dims(chars, 0)\n",
+        "  sequence_length = tf.expand_dims(sequence_length, 0)\n",
+        "  pred = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, 1, training=False)\n",
+        "  pred = tf.minimum(pred, 1.0)\n",
+        "  pred = tf.expand_dims(pred, 0)\n",
+        "  draw_prediction(color_name, pred)\n",
+        "  # Create an op that will drive the entire function.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Nt0Kv5OCHip0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we put everything together.\n",
+        "\n",
+        "Note that the entire training and testing code is all compiled into a single op (`tf_train_model`) that you only execute once! We also still use a `sess.run` loop for the inference part, because that requires keyboard input."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "-GmWa0GtYWdh",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 668
+        },
+        "outputId": "61f4af1d-c81e-44db-9079-1a7b8ed8ce58",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345877153,
+          "user_tz": 240,
+          "elapsed": 75500,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def run_input_loop(sess, inference_ops, color_name_placeholder):\n",
+        "  \"\"\"Helper function that reads from input and calls the inference ops in a loop.\"\"\"\n",
+        "\n",
+        "  tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "  while True:\n",
+        "    with tb.output_to(0):\n",
+        "      try:\n",
+        "        color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "      except (EOFError, KeyboardInterrupt):\n",
+        "        break\n",
+        "    if not color_name:\n",
+        "      break\n",
+        "    with tb.output_to(0):\n",
+        "      tb.clear_tab()\n",
+        "      sess.run(inference_ops, {color_name_placeholder: color_name})\n",
+        "      plt.show()\n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  # Read the data.\n",
+        "  batch_size = 64\n",
+        "  train_data = load_dataset(data_dir, train_url, batch_size)\n",
+        "  eval_data = load_dataset(data_dir, test_url, 50, training=False)\n",
+        "  \n",
+        "  # Create the model components.\n",
+        "  lower_cell, upper_cell, relu_layer = model_components()\n",
+        "  # Create the helper placeholder for inference.\n",
+        "  color_name_placeholder = tf.placeholder(tf.string, shape=())\n",
+        "  \n",
+        "  # Compile the train / test code.\n",
+        "  tf_train_model = autograph.to_graph(train_model)\n",
+        "  train_model_ops = tf_train_model(\n",
+        "      train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps=100)\n",
+        "  \n",
+        "  # Compile the inference code.\n",
+        "  tf_inference = autograph.to_graph(inference)\n",
+        "  inference_ops = tf_inference(color_name_placeholder, lower_cell, upper_cell, relu_layer)\n",
+        "  \n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    \n",
+        "    # Run training and testing.\n",
+        "    sess.run(train_model_ops)\n",
+        "     \n",
+        "    # Run the inference loop.\n",
+        "    run_input_loop(sess, inference_ops, color_name_placeholder)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "('Successfully downloaded', 'train.csv', 28010L, 'bytes.')\n",
+            "('Successfully downloaded', 'test.csv', 2414L, 'bytes.')\n",
+            "Step 0 train loss 0.37890616\n",
+            "Step 10 train loss 0.18515904\n",
+            "Step 20 train loss 0.0892782\n",
+            "Step 30 train loss 0.07883155\n",
+            "Step 40 train loss 0.08585831\n",
+            "Step 50 train loss 0.09302989\n",
+            "Step 60 train loss 0.089012615\n",
+            "Step 70 train loss 0.07275697\n",
+            "Step 80 train loss 0.06644974\n",
+            "Step 90 train loss 0.0854013\n",
+            "Test loss 0.13216865Colorbot is ready to generate colors!\n",
+            "\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<link rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'></link>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<script src='/nbextensions/google.colab/tabbar_main.min.js'></script>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<div id=\"id1\"></div>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b102d936-3379-11e8-ac70-0242ac110002\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"borderColor\": [\"#a7a7a7\"], \"tabNames\": [\"RNN Colorbot\"], \"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"elementId\": \"id1\"});\n",
+              "//# sourceURL=js_e223a56194"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b103532a-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_b8c6a821fb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b105b28c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_44805e254b"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b106197a-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_a63d3c6c47"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1069f44-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b106197a-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7e203b8bce"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1070f38-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_d53293d4a7"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6d90d5c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b105b28c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_3000dc2c05"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6da872c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_4136f669a3"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dac868-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_2f70dd9aee"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6db07d8-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6dac868-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7226726048"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dcc6fe-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_72e7709865"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAFZCAYAAADHDNdrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAB9JJREFUeJzt3E1Lle0ax+HTF4jeEAyMBhE0DawI\nwsCH0AIlaGBWNJBo0CDoA0TQhmDXuKAGDioiCA2KlEAlnl05FD9Co8BeaGCQoBDa2jPZsXt4Bvu/\n0+o4Rmvd1zW4rsmP84bFamo0Go0C4H/WvNYHAPhVCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKDy\nUxgeHq5Dhw7V4OBgPXz4sHp7e+vWrVt15cqVOnnyZN2/f78ajUbdvn27+vr6qqenp65du1YrKytV\nVfXhw4e6cOFC9fX1VV9fX01PT1dV1dzcXHV3d9eDBw/q+PHj9ccff9TExMRaXpWfWOtaHwD+zuvX\nr+vOnTs1MTFRbW1tdf78+dW16enpGh8fr/b29hobG6upqal6/Phxbdy4sS5evFgjIyM1NDRUly5d\nqv3799fw8HC9efOmTp8+XVNTU1VV9enTp2pubq5nz57V5ORk3bhxo44dO7ZW1+UnZkJl3Zudna2D\nBw9WR0dHbdiwoQYHB1fX9u7dW+3t7VVV9fLlyxocHKytW7dWa2trnTp1qp4/f16Li4s1MzNT586d\nq6qqXbt21YEDB1an1OXl5Tpx4kRVVe3Zs6fevXv3Yy/IL8OEyrr3+fPnamtrW/2+ffv21c//+Xxh\nYaHu3r1bjx49qqqqlZWVam9vr4WFhWo0GnXmzJnVvYuLi9XV1VVVVS0tLbVp06aqqmpubq6vX7/+\nX+/Dr0tQWfe2bNlSi4uLq98/fvz43X0dHR3V29tbQ0ND3zxfXl6ulpaWevLkSW3evPmbtbm5ufyB\n+W155Wfd6+zsrJmZmZqfn68vX77U2NjYd/cdOXKkxsfHa2lpqaqqRkdH6+nTp9Xa2lqHDx+u0dHR\nqqpaWlqqy5cv1/v373/YHfg9CCrrXmdnZw0MDNTAwECdPXu2enp6vrvv6NGj1dPTUwMDA9Xf318v\nXryo7u7uqqq6evVqzc7OVn9/fw0MDNTOnTtrx44dP/Ia/Aaa/B8qP4NGo1FNTU1VVfXq1au6efPm\nX06qsFZMqKx78/Pz1dXVVW/fvq1Go1GTk5O1b9++tT4W/BcTKj+FkZGRunfvXjU1NdXu3bvr+vXr\ntW3btrU+FnxDUAFCvPIDhAgqQMi6+WH/kX8eXesjAPytf/3jz79cM6EChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkBI\nU6PRaKz1IQB+BSZUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBB\nBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAkH8D1Aj8lNhhe7QAAAAASUVORK5CYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f402e850>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70592aa-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6da872c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_25c3aaf79a"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70842c0-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_984c56b816"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c708dec4-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_e0451a1217"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7092726-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c708dec4-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7aa23d7385"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7099044-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_5722756ddb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "stream",
+          "text": [
+            "Give me a color name (or press 'enter' to exit): \n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7baac12-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c70842c0-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_cdd622e58f"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AHJ2c47U-A5W",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Where do we go next?\n",
+        "\n",
+        "Autograph is available in tensorflow.contrib, but it's still in its early stages. We're excited about the possibilities it brings — write your machine learning code in the flexible Eager style, but still enjoy all the benefits that come with running in graph mode. A beta version will be available soon -- stay tuned!"
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..324b23c24b5a7970d7f20ed955839ba1cf1774fc
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
@@ -0,0 +1,1078 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "LqNpENf-ec0X",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -U tf-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "Pa2qpEmoVOGe",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HNqUFL4deCsL",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "# Case study: training a custom RNN, using Keras and Estimators\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "YkC1k4HEQ7rw",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "In this section, we show how you can use AutoGraph to build RNNColorbot, an RNN that takes as input names of colors and predicts their corresponding RGB tuples. The model will be trained by a [custom Estimator](https://www.tensorflow.org/get_started/custom_estimators)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "7nkPDl5CTCNb",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "To get started, set up the dataset. The following cells defines methods that download and format the data needed for RNNColorbot; the details aren't important (read them in the privacy of your own home if you so wish), but make sure to run the cells before proceeding."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "A0uREmVXCQEw",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "\n",
+        "def set_static_batch_shape(batch_size):\n",
+        "  def apply(rgb, chars, length):\n",
+        "    rgb.set_shape((batch_size, None))\n",
+        "    chars.set_shape((batch_size, None, 256))\n",
+        "    length.set_shape((batch_size,))\n",
+        "    return rgb, chars, length\n",
+        "  return apply\n",
+        "\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(\n",
+        "      batch_size, padded_shapes=((None,), (None, 256), ()))\n",
+        "  # To simplify the model code, we statically set as many of the shapes that we\n",
+        "  # know.\n",
+        "  dataset = dataset.map(set_static_batch_shape(batch_size))\n",
+        "  return dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "waZ89t3DTUla",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n",
+        "\n",
+        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode.\n",
+        "We use Keras to define the model, and we will train it using Estimators."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "9v8AJouiC44V",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "@autograph.convert()\n",
+        "class RnnColorbot(tf.keras.Model):\n",
+        "  \"\"\"RNN Colorbot model.\"\"\"\n",
+        "\n",
+        "  def __init__(self):\n",
+        "    super(RnnColorbot, self).__init__()\n",
+        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "\n",
+        "\n",
+        "  def _rnn_layer(self, chars, cell, batch_size, training):\n",
+        "    \"\"\"A single RNN layer.\n",
+        "\n",
+        "    Args:\n",
+        "      chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "      cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "      batch_size: Int, the batch size to use\n",
+        "      training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "    Returns:\n",
+        "      A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "    \"\"\"\n",
+        "    hidden_outputs = []\n",
+        "    autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "    state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "    for ch in chars:\n",
+        "      cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "      hidden_outputs.append(cell_output)\n",
+        "    hidden_outputs = hidden_outputs.stack()\n",
+        "    if training:\n",
+        "      hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "    return hidden_outputs\n",
+        "\n",
+        "  def build(self, _):\n",
+        "    \"\"\"Creates the model variables. See keras.Model.build().\"\"\"\n",
+        "    self.lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "    self.upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "    self.relu_layer.build(tf.TensorShape((None, 128)))    \n",
+        "    self.built = True\n",
+        "\n",
+        "\n",
+        "  def call(self, inputs, training=False):\n",
+        "    \"\"\"The RNN model code. Uses Eager and \n",
+        "\n",
+        "    The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "    followed by a fully connected layer with ReLU activation.\n",
+        "\n",
+        "    Args:\n",
+        "      inputs: A tuple (chars, length)\n",
+        "      training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "    Returns:\n",
+        "      A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "    \"\"\"\n",
+        "    chars, length = inputs\n",
+        "    batch_size = chars.shape[0]\n",
+        "    seq = tf.transpose(chars, (1, 0, 2))\n",
+        "\n",
+        "    seq = self._rnn_layer(seq, self.lower_cell, batch_size, training)\n",
+        "    seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n",
+        "\n",
+        "    # Grab just the end-of-sequence from each output.\n",
+        "    indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "    sequence_ends = tf.gather_nd(seq, indices)\n",
+        "    return self.relu_layer(sequence_ends)\n",
+        "\n",
+        "@autograph.convert()\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "JjK4gXFvFsf4",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "We will now create the model function for the custom Estimator.\n",
+        "\n",
+        "In the model function, we simply use the model class we defined above - that's it!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "-yso_Nx23Gy1",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def model_fn(features, labels, mode, params):\n",
+        "  \"\"\"Estimator model function.\"\"\"\n",
+        "  chars = features['chars']\n",
+        "  sequence_length = features['sequence_length']\n",
+        "  inputs = (chars, sequence_length)\n",
+        "\n",
+        "  # Create the model. Simply using the AutoGraph-ed class just works!\n",
+        "  colorbot = RnnColorbot()\n",
+        "  colorbot.build(None)\n",
+        "\n",
+        "  if mode == tf.estimator.ModeKeys.TRAIN:\n",
+        "    predictions = colorbot(inputs, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "\n",
+        "    learning_rate = params['learning_rate']\n",
+        "    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
+        "    global_step = tf.train.get_global_step()\n",
+        "    train_op = optimizer.minimize(loss, global_step=global_step)\n",
+        "    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
+        "\n",
+        "  elif mode == tf.estimator.ModeKeys.EVAL:\n",
+        "    predictions = colorbot(inputs)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "\n",
+        "    return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
+        "\n",
+        "  elif mode == tf.estimator.ModeKeys.PREDICT:\n",
+        "    predictions = colorbot(inputs)\n",
+        "\n",
+        "    predictions = tf.minimum(predictions, 1.0)\n",
+        "    return tf.estimator.EstimatorSpec(mode, predictions=predictions)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HOQfoBnHC9CP",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "We'll create an input function that will feed our training and eval data."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "FJZlx7yG2MP0",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def input_fn(data_dir, data_url, params, training=True):\n",
+        "  \"\"\"An input function for training\"\"\"\n",
+        "  batch_size = params['batch_size']\n",
+        "  \n",
+        "  # load_dataset defined above\n",
+        "  dataset = load_dataset(data_dir, data_url, batch_size, training=training)\n",
+        "\n",
+        "  # Package the pipeline end in a format suitable for the estimator.\n",
+        "  labels, chars, sequence_length = dataset.make_one_shot_iterator().get_next()\n",
+        "  features = {\n",
+        "      'chars': chars,\n",
+        "      'sequence_length': sequence_length\n",
+        "  }\n",
+        "\n",
+        "  return features, labels"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "qsvv-lzbDqXd",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "We now have everything in place to build our custom estimator and use it for training and eval!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 35
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 10604,
+          "status": "ok",
+          "timestamp": 1524095272039,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "2pg1AfbxBJQq",
+        "outputId": "9c924b4f-06e1-4538-976c-a3e1ddac5660",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Eval loss at step 100: 0.0674834\n"
+          ]
+        }
+      ],
+      "source": [
+        "params = {\n",
+        "    'batch_size': 64,\n",
+        "    'learning_rate': 0.01,\n",
+        "}\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\"\n",
+        "\n",
+        "regressor = tf.estimator.Estimator(\n",
+        "    model_fn=model_fn,\n",
+        "    params=params)\n",
+        "\n",
+        "regressor.train(\n",
+        "    input_fn=lambda: input_fn(data_dir, train_url, params),\n",
+        "    steps=100)\n",
+        "eval_results = regressor.evaluate(\n",
+        "    input_fn=lambda: input_fn(data_dir, test_url, params, training=False),\n",
+        "    steps=2\n",
+        ")\n",
+        "\n",
+        "print('Eval loss at step %d: %s' % (eval_results['global_step'], eval_results['loss']))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "zG1YAjB_cUnQ",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "And here's the same estimator used for inference."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 343
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 7990,
+          "status": "ok",
+          "timestamp": 1524095280105,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "dxHex2tUN_10",
+        "outputId": "2b889e5a-b9ed-4645-bf03-d98f26c72101",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "\u003clink rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'\u003e\u003c/link\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f3f36aa6cd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cscript src='/nbextensions/google.colab/tabbar_main.min.js'\u003e\u003c/script\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f3eca67f7d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cdiv id=\"id1\"\u003e\u003c/div\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f3eca67f8d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"e8ddfa22-4362-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"elementId\": \"id1\", \"borderColor\": [\"#a7a7a7\"], \"contentHeight\": [\"initial\"], \"tabNames\": [\"RNN Colorbot\"], \"location\": \"top\", \"initialSelection\": 0});\n",
+              "//# sourceURL=js_71b9087b6d"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f950\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"e8ddfa23-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_e390445f33"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_241dd76d85"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_60c64e3d50"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"e8ddfa26-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_14ea437cbd"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"e8ddfa27-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_09294c2226"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fcd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec965514-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_e5e8266997"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_07a097f0ee"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_790d669ca8"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f8d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec965517-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_d30df771f0"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec965518-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_8a43a2da4b"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMBJREFUeJzt3F+I1XX+x/G32zjiFERUpgaFd2JBzOg5joX4h0SiMgmM\n/uhVGIlgFBlERGB3hUEkhkRdtDfRP1ACL6KpLBqcguxCjEAkmGamQcSohFHzsxe7O6zssvsydtff\n+ns8rs758j3f8z7fiyef7/k3o7XWCiDwh4s9APC/QzCAmGAAMcEAYoIBxAQDiAkGF8XTTz9d3W63\n7rvvvhoZGakVK1Zc7JEICMYlbvXq1TU8PHyxxzjPV199VcPDw/XZZ5/V22+/XVVVM2bMuMhTkRAM\n/qt+++23+uGHH+r666+vWbNmXexxuECCcQl76qmnanx8vLZs2VIDAwP1+uuv1zfffFP3339/dTqd\nWr9+fY2MjEzvv2nTpnr55ZfrgQceqIGBgXr44Yfr5MmTVVV1+vTp2r59ey1durQ6nU5t2LChTpw4\nUVVVk5OTtWXLllq6dGmtXbu23nnnnelj7tq1q7Zt21bbt2+vJUuW1HvvvVfPPvtsHTp0qAYGBmrX\nrl1/N/fRo0dr06ZN1el06u67766hoaGqqhodHa1OpzO93zPPPFO33nrr9P3t27fXm2+++e89iZyv\ncUlbtWpVGx4ebq21NjEx0brdbjtw4EBrrbUvvviidbvdduLEidZaaxs3bmxr1qxp33//fZuammob\nN25sO3fubK219tZbb7VHH320TU1NtXPnzrXDhw+3X375pbXW2kMPPdR27NjRTp8+3Y4cOdIGBwen\nn/OVV15pN910U/voo49aa61NTU21999/vz344IPTMx48eLCtWLGitdbamTNn2po1a9qePXvamTNn\n2vDwcOvv72/Hjh2bfj2HDx9urbW2du3advvtt7ejR4+21lpbuXJlO3LkyH/qVNJas8L4f6D95edC\n+/btq5UrV9by5curqmrZsmV1880316effjq977333ls33HBD9fb21h133FFHjhypqqqenp46efJk\nHTt2rGbMmFGLFi2qyy+/vCYmJurrr7+uJ598smbOnFkLFy6sDRs21N69e6eP2d/fX6tXr66qqt7e\n3n8666FDh+rUqVP1yCOPVE9PTw0ODtaqVavqgw8+qKqqJUuW1MjISB0/fryqqtauXVtffvlljY6O\n1q+//loLFy78N501/pGeiz0A/z1jY2O1f//++vjjj6vqzyE5e/ZsLVu2bHqfa665Zvr27Nmz69Sp\nU1VVdc8999TExEQ98cQT9fPPP9e6devq8ccfr8nJybryyitr9uzZ04+bP39+HT58ePr+3Llz4xkn\nJydr3rx5522bP39+TU5OVlVVp9OpoaGhuu6666rb7Va32629e/dWb29vLV68+ALOBr+HYFzi/vbT\nh3nz5tX69etrx44dF3ycnp6e2rp1a23durXGxsZq8+bNtWDBgrrtttvqp59+qlOnTlVfX19VVY2P\nj9ecOXP+4Qz/ypw5c2p8fPy8bWNjY7VgwYKqqup2u/Xiiy/WvHnzqtPp1MDAQD333HPV29tb3W73\ngl8XF8YlySXu2muvrdHR0aqqWrduXQ0NDdXnn39e586dq6mpqRoZGakff/zxXx7n4MGD9d1339W5\nc+eqr6+venp66rLLLqu5c+dWf39/vfTSS3X69On69ttv6913361169b9rnlvueWW6uvrq9dee63O\nnj1bBw8erE8++aTuvPPOqqq68cYba9asWbVv377qdDp1xRVX1NVXX10ffvjheW+I8p8hGJe4zZs3\n1+7du6vb7db+/ftr9+7dtWfPnlq2bFmtWrWq3njjjen3OP7ZSuD48eO1bdu2Wrx4cd111121dOnS\n6Sjs3LmzRkdHa/ny5bVt27Z67LHHzrvMuRAzZ86sV199tQ4cOFCDg4P1/PPP1wsvvDC9wqj68yrj\nqquumr7U+WsoFi1a9Luek9yM1vyBDpCxwgBiggHEBAOICQYQ+z/7PYzjf/QRGVxM12z68u+2WWEA\nMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE\nBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhAT\nDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMIDajtdYu9hDA/wYrDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4j9CY2LTAbbRbWuAAAAAElFTkSuQmCC\n",
+            "text/plain": [
+              "\u003cmatplotlib.figure.Figure at 0x7f3ecc00bf10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec965519-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_893ad561f4"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55c90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_2d99e0ac17"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_5c19462e32"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55dd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec96551c-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_b9c8b7567b"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55a50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec96551d-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_fd05186348"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55810\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cdiv class=id_888646481 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f3f32414810\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n",
+              "//# sourceURL=js_efef96e882"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ec96551f-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_6eca889864"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 input\");\n",
+              "//# sourceURL=js_f02070cc60"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b553d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ed8ea973-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
+              "//# sourceURL=js_ed9faba660"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a95450\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n",
+              "//# sourceURL=js_f3458d7074"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a95250\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ed8ea975-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_3ffd97bd6f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a953d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"ed8ea976-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_7f73e8bcca"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "def predict_input_fn(color_name):\n",
+        "  \"\"\"An input function for prediction.\"\"\"\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "\n",
+        "  # We create a batch of a single element.\n",
+        "  features = {\n",
+        "      'chars': tf.expand_dims(chars, 0),\n",
+        "      'sequence_length': tf.expand_dims(sequence_length, 0)\n",
+        "  }\n",
+        "  return features, None\n",
+        "\n",
+        "\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def predict_with_estimator(color_name, regressor):\n",
+        "  predictions = regressor.predict(\n",
+        "      input_fn=lambda:predict_input_fn(color_name))\n",
+        "  pred = next(predictions)\n",
+        "  predictions.close()\n",
+        "  pred = np.minimum(pred, 1.0)\n",
+        "  pred = np.expand_dims(np.expand_dims(pred, 0), 0)\n",
+        "\n",
+        "  draw_prediction(color_name, pred)\n",
+        "\n",
+        "tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "while True:\n",
+        "  with tb.output_to(0):\n",
+        "    try:\n",
+        "      color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "    except (EOFError, KeyboardInterrupt):\n",
+        "      break\n",
+        "  if not color_name:\n",
+        "    break\n",
+        "  with tb.output_to(0):\n",
+        "    tb.clear_tab()\n",
+        "    predict_with_estimator(color_name, regressor)\n",
+        "  "
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "last_runtime": {
+        "build_target": "",
+        "kind": "local"
+      },
+      "name": "RNN Colorbot using Keras and Estimators",
+      "provenance": [
+        {
+          "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl",
+          "timestamp": 1523579810961
+        },
+        {
+          "file_id": "1DcfimonWU11tmyivKBGVrbpAl3BIOaRG",
+          "timestamp": 1523016192637
+        },
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "version": "0.3.2",
+      "views": {}
+    },
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
similarity index 74%
rename from tensorflow/contrib/py2tf/impl/BUILD
rename to tensorflow/contrib/autograph/impl/BUILD
index 90ffabbc9bf4524ec2ebf54b6dd847bd8768a486..54424e26472b8466b8fe68ea848b5463c10224c9 100644
--- a/tensorflow/contrib/py2tf/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -25,10 +25,11 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
-        "//tensorflow/contrib/py2tf/converters",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/converters",
+        "//tensorflow/contrib/autograph/operators",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/pyct/static_analysis",
+        "//tensorflow/contrib/autograph/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
@@ -38,10 +39,12 @@ py_test(
     name = "api_test",
     srcs = ["api_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -49,6 +52,7 @@ py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f87b2c14da4a3523f1e580d4362cbd3679a2cd
--- /dev/null
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -0,0 +1,296 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Public API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import wraps
+
+from enum import Enum
+
+# pylint:disable=g-bad-import-order
+import gast
+import six
+# pylint:enable=g-bad-import-order
+
+from tensorflow.contrib.autograph.impl import config
+from tensorflow.contrib.autograph.impl import conversion
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.contrib.autograph.utils import py_func
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_inspect
+
+# TODO(mdan): Properly document the type hints.
+# TODO(mdan): Reduce the type hint information to (module, type).
+# (currently we require (module + class name, type))
+
+
+def convert(recursive=False, verbose=False, arg_types=None):
+  """Decorator that compiles a function to graph mode.
+
+  The decorator is dynamic - invoking compilation whenever the decorated
+  function is called. This means the parameter values are known at compilation.
+
+  Args:
+    recursive: Whether to recursively convert any functions that the decorator
+        function may call.
+    verbose: Whether to output the compiled code in the logs.
+    arg_types: See to_graph.
+
+  Returns:
+    A decorator that compiles the given function to graph mode.
+
+  Raises:
+    ValueError: If any of the arguments are illegal.
+  """
+  if arg_types is None:
+    arg_types = {}
+
+  def decorator(f):
+    """Decorator implementation."""
+
+    @wraps(f)
+    def wrapper(*args, **kwargs):
+      return converted_call(f, recursive, verbose, arg_types, *args, **kwargs)
+
+    # Sometimes the decorator is just desugared, making it impossible to detect.
+    # This attribute makes detection easier.
+    setattr(wrapper, '__pyct_is_compile_decorator', True)
+    return wrapper
+
+  return decorator
+
+
+class RunMode(Enum):
+  GRAPH = 1
+  PY_FUNC = 2
+
+
+def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
+  """Decorator that suppresses compilation of a function.
+
+  Args:
+    run_as: RunMode value. Whether to run the function as-is, or wrap it into
+        a py_func.
+    return_dtypes: See autograph.utils.py_func.wrap_py_func. Setting to None or
+        empty list or tuple will create a dummy return value that can be used
+        to set control dependencies.
+
+  Returns:
+    A decorator that wraps the original function.
+  """
+  def decorator(f):
+    """Decorator implementation."""
+
+    @wraps(f)
+    def graph_wrapper(*args, **kwargs):
+      return f(*args, **kwargs)
+
+    @wraps(f)
+    def py_func_wrapper(*args, **kwargs):
+      if kwargs:
+        raise NotImplementedError(
+            'RunMode.PY_FUNC does not yet support kwargs')
+      # TODO(mdan): Add support for kwargs.
+      return py_func.wrap_py_func(
+          f, return_dtypes, args, kwargs, use_dummy_return=not return_dtypes)
+
+    if run_as == RunMode.GRAPH:
+      wrapper = graph_wrapper
+    elif run_as == RunMode.PY_FUNC:
+      wrapper = py_func_wrapper
+    else:
+      raise ValueError('unknown value for run_as: %s' % run_as)
+
+    # Sometimes the decorator is just desugared, making it impossible to detect.
+    # This attribute makes detection easier.
+    setattr(wrapper, '__pyct_is_compile_decorator', True)
+    return wrapper
+
+  return decorator
+
+
+def converted_call(f, recursive, verbose, arg_types, *args, **kwargs):
+  """Compiles a function call inline."""
+  # TODO(mdan): This needs cleanup.
+  # In particular, we may want to avoid renaming functions altogether.
+
+  if conversion.is_whitelisted_for_graph(f):
+    return f(*args, **kwargs)
+
+  unknown_arg_value = object()  # Sentinel for arguments of unknown value
+
+  if inspect_utils.isbuiltin(f):
+    return builtins.dynamic_builtin(f, *args, **kwargs)
+
+  if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
+    # Regular functions
+    target_entity = f
+    arg_map_target = f
+    effective_args = args
+    f_class = inspect_utils.getmethodclass(f)
+
+    if f_class is not None:
+      partial_types = (f_class,)
+    else:
+      partial_types = ()
+
+  elif tf_inspect.isclass(f):
+    # Constructors
+    target_entity = f
+    arg_map_target = f.__init__
+    effective_args = args
+    partial_types = ()
+
+  elif hasattr(f, '__call__') and hasattr(f, '__class__'):
+    # Callable objects
+    target_entity = f.__call__
+    arg_map_target = f.__call__
+    effective_args = (f,) + args
+    partial_types = (f.__class__,)
+
+  else:
+    NotImplementedError('unknown callable type "%s"' % type(f))
+
+  arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
+  for name, arg in arg_values.items():
+    if arg is unknown_arg_value:
+      continue
+    arg_class = arg.__class__
+    # If arg_value_hints specifies any name, use that instead.
+    if name not in arg_types:
+      arg_types[name] = (arg_class.__name__, arg_class)
+
+  # When called from within a decorator, this is the only indication that
+  # the function is a method - it appears that the decorator is applied
+  # before the method is bound.
+  if not partial_types:
+    if 'self' in arg_values:
+      if tf_inspect.isclass(arg_values['self'].__class__):
+        partial_types = (arg_values['self'].__class__,)
+    elif 'cls' in arg_values:
+      if tf_inspect.isclass(arg_values['cls']):
+        partial_types = (arg_values['cls'],)
+
+  converted_f = to_graph(
+      target_entity,
+      recursive=recursive,
+      verbose=verbose,
+      arg_values=arg_values,
+      arg_types=arg_types,
+      partial_types=partial_types)
+  return converted_f(*effective_args, **kwargs)
+
+
+def to_graph(e,
+             recursive=True,
+             verbose=False,
+             arg_values=None,
+             arg_types=None,
+             partial_types=None):
+  """Compile a Python entity into equivalent TensorFlow code.
+
+  Currently supported entities:
+    * functions
+    * classes
+
+  Classes are handled by converting all their methods into a new class.
+
+  Args:
+    e: A Python entity.
+    recursive: Whether to recursively convert any functions that the decorator
+        function may call.
+    verbose: Whether to output the compiled code in the logs.
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
+        parameters.
+    partial_types: A set of types (e.g. classes) that will not be converted
+        entirely. Calls to member functions for these types will be renamed
+        independently.
+
+  Returns:
+    A function with a signature identical to `o`, but which when executed it
+  creates TF a graph that has the same functionality as the original entity.
+  """
+  conversion_map = conversion.ConversionMap(
+      recursive=recursive,
+      nocompile_decorators=(convert, do_not_convert, converted_call),
+      partial_types=partial_types,
+      api_module=tf_inspect.getmodule(to_graph))
+  _, name, namespace = conversion.entity_to_graph(e, conversion_map, arg_values,
+                                                  arg_types)
+
+  module = gast.Module([])
+  for import_line in config.COMPILED_IMPORT_STATEMENTS:
+    module.body.extend(parser.parse_str(import_line).body)
+  for dep in reversed(conversion_map.dependency_cache.values()):
+    module.body.append(dep)
+  compiled_node, compiled_src = compiler.ast_to_object(module)
+
+  # The compiled code should see everything the entry entity saw.
+  # TODO(mdan): This might not work well if the call tree spans modules?
+  for key, val in namespace.items():
+    # Avoid overwriting entities that have been transformed.
+    if key not in compiled_node.__dict__:
+      compiled_node.__dict__[key] = val
+  compiled_fn = getattr(compiled_node, name)
+
+  if verbose:
+    logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
+
+  return compiled_fn
+
+
+def to_code(e,
+            recursive=True,
+            arg_values=None,
+            arg_types=None,
+            partial_types=None,
+            indentation='  '):
+  """Return the equivalent of an entity in TensorFlow code.
+
+  See `to_graph` for more details.
+
+  Args:
+    e: A Python entity.
+    recursive: See to_graph.
+    arg_values: See to_graph.
+    arg_types: See to_graph.
+    partial_types: See to_graph.
+    indentation: String, when to use for each level of indentation.
+
+  Returns:
+    String.
+  """
+  conversion_map = conversion.ConversionMap(
+      recursive=recursive,
+      nocompile_decorators=(convert, do_not_convert, converted_call),
+      partial_types=partial_types,
+      api_module=tf_inspect.getmodule(to_graph))
+  conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
+
+  imports = '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
+  code = '\n'.join(
+      compiler.ast_to_source(dep, indentation)
+      for dep in reversed(tuple(
+          six.itervalues(conversion_map.dependency_cache))))
+
+  return imports + '\n\n' + code
diff --git a/tensorflow/contrib/py2tf/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
similarity index 55%
rename from tensorflow/contrib/py2tf/impl/api_test.py
rename to tensorflow/contrib/autograph/impl/api_test.py
index 02cd8ed2d0ffee8ef2d31ea65902d2b493df9d64..a7737b7f448131b1c54951efa719b481e1f4d0c9 100644
--- a/tensorflow/contrib/py2tf/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -18,23 +18,29 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.impl import api
-from tensorflow.contrib.py2tf.impl import config
-from tensorflow.contrib.py2tf.pyct import parser
+import numpy as np
+
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl import api
+from tensorflow.contrib.autograph.impl import config
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+tf = utils.fake_tf()
+
+
 class ApiTest(test.TestCase):
 
   def setUp(self):
-    config.DEFAULT_UNCOMPILED_MODULES.add((math_ops.__name__,))
     config.COMPILED_IMPORT_STATEMENTS = (
-        'from tensorflow.python.ops '
-        'import control_flow_ops as tf',
-        'from tensorflow.contrib.py2tf import utils as '
-        'py2tf_utils')
+        'from __future__ import print_function',
+        'from tensorflow.contrib.autograph import utils'
+        ' as autograph_utils',
+        'tf = autograph_utils.fake_tf()',
+    )
 
   def test_decorator_recurses(self):
 
@@ -47,7 +53,7 @@ class ApiTest(test.TestCase):
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
-        while math_ops.reduce_sum(x) > s:
+        while tf.reduce_sum(x) > s:
           x //= self.called_member(a)
         return x
 
@@ -63,11 +69,11 @@ class ApiTest(test.TestCase):
     class TestClass(object):
 
       def called_member(self, a):
-        return math_ops.negative(a)
+        return tf.negative(a)
 
       @api.convert(recursive=False)
       def test_method(self, x, s, a):
-        while math_ops.reduce_sum(x) > s:
+        while tf.reduce_sum(x) > s:
           x //= self.called_member(a)
         return x
 
@@ -78,17 +84,17 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_decorator_calls_converted(self):
+  def test_decorator_calls_unconverted_graph(self):
 
     class TestClass(object):
 
-      @api.graph_ready
+      @api.do_not_convert(api.RunMode.GRAPH)
       def called_member(self, a):
-        return math_ops.negative(a)
+        return tf.negative(a)
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
-        while math_ops.reduce_sum(x) > s:
+        while tf.reduce_sum(x) > s:
           x //= self.called_member(a)
         return x
 
@@ -99,20 +105,23 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_decorator_calls_decorated(self):
+  def test_decorator_calls_unconverted_py_func(self):
 
     class TestClass(object):
 
-      @api.convert()
+      @api.do_not_convert(
+          api.RunMode.PY_FUNC, return_dtypes=py_func.MatchDType(1))
       def called_member(self, a):
-        if a < 0:
-          a = -a
-        return a
+        return np.negative(a)
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
-        while math_ops.reduce_sum(x) > s:
-          x //= self.called_member(a)
+        while tf.reduce_sum(x) > s:
+          y = self.called_member(a)
+          # set_shape works around while_loop's limitations.
+          # TODO(mdan): Allow specifying shapes (or ShapeLike) instead.
+          y.set_shape(a.shape)
+          x //= y
         return x
 
     tc = TestClass()
@@ -122,10 +131,11 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_convert_call_site_decorator(self):
+  def test_decorator_calls_decorated(self):
 
     class TestClass(object):
 
+      @api.convert()
       def called_member(self, a):
         if a < 0:
           a = -a
@@ -133,8 +143,8 @@ class ApiTest(test.TestCase):
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
-        while math_ops.reduce_sum(x) > s:
-          x //= api.convert_inline(self.called_member, a)
+        while tf.reduce_sum(x) > s:
+          x //= self.called_member(a)
         return x
 
     tc = TestClass()
@@ -144,17 +154,20 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_graph_ready_call_site_decorator(self):
+  def test_convert_call_site_decorator(self):
 
     class TestClass(object):
 
       def called_member(self, a):
-        return math_ops.negative(a)
+        if a < 0:
+          a = -a
+        return a
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
-        while math_ops.reduce_sum(x) > s:
-          x //= api.graph_ready(self.called_member(a))
+        while tf.reduce_sum(x) > s:
+          x //= api.converted_call(self.called_member, False, False, {}, self,
+                                   a)
         return x
 
     tc = TestClass()
@@ -164,9 +177,96 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
+  def test_converted_call_builtin(self):
+    x = api.converted_call(range, False, False, {}, 3)
+    self.assertEqual((0, 1, 2), tuple(x))
+
+  def test_converted_call_function(self):
+
+    def test_fn(x):
+      if x < 0:
+        return -x
+      return x
+
+    with self.test_session() as sess:
+      x = api.converted_call(
+          test_fn, False, False, {}, constant_op.constant(-1))
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_method(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(tc.test_method, False, False, {}, tc)
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_method_by_class(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(TestClass.test_method, False, False, {}, tc)
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_callable_object(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def __call__(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(tc, False, False, {})
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_constructor(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = api.converted_call(
+          TestClass, False, False, {}, constant_op.constant(-1))
+      # tc is now a converted object.
+      x = tc.test_method()
+      self.assertEqual(1, sess.run(x))
+
   def test_to_graph_basic(self):
+
     def test_fn(x, s):
-      while math_ops.reduce_sum(x) > s:
+      while tf.reduce_sum(x) > s:
         x //= 2
       return x
 
@@ -177,15 +277,15 @@ class ApiTest(test.TestCase):
       self.assertListEqual([1, 2], sess.run(x).tolist())
 
   def test_to_code_basic(self):
+
     def test_fn(x, s):
-      while math_ops.reduce_sum(x) > s:
+      while tf.reduce_sum(x) > s:
         x /= 2
       return x
 
     compiled_code = api.to_code(test_fn)
 
-    # Just check for some key words and that it is parseable Python code.
-    self.assertRegexpMatches(compiled_code, 'py2tf_utils\\.run_while')
+    # Just check that it is parseable Python code.
     self.assertIsNotNone(parser.parse_str(compiled_code))
 
 
diff --git a/tensorflow/contrib/py2tf/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
similarity index 73%
rename from tensorflow/contrib/py2tf/impl/config.py
rename to tensorflow/contrib/autograph/impl/config.py
index c90e85c96b690b7781358b173e5d83fe60e29c00..2600088595a12761b1138c4649c06882bd8fd000 100644
--- a/tensorflow/contrib/py2tf/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.autograph import utils
 
 
 PYTHON_LITERALS = {
@@ -31,16 +31,19 @@ PYTHON_LITERALS = {
 DEFAULT_UNCOMPILED_MODULES = set((
     ('tensorflow',),
     (utils.__name__,),
+
+    # All of tensorflow's subpackages. Unlike the root tf module, they don't
+    # have well-known names. Not refering to the module directly to avoid
+    # circular imports.
+    (
+        utils.__name__[:-len('.contrib.autograph.utils')],),
 ))
 
 NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names (for testability).
-# TODO(mdan): Make sure copybara renames the reference below.
+# TODO(mdan): Also allow controlling the generated names.
+# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
     'import tensorflow as tf',
-    'from tensorflow.contrib.py2tf.impl import api as '
-    'py2tf_api',
-    'from tensorflow.contrib.py2tf import utils as '
-    'py2tf_utils')
+)
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..55a30dc127957b2a9caa053db843380c94bacfbf
--- /dev/null
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -0,0 +1,409 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""High level conversion support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import imp
+
+import gast
+
+from tensorflow.contrib.autograph import operators
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import asserts
+from tensorflow.contrib.autograph.converters import break_statements
+from tensorflow.contrib.autograph.converters import builtin_functions
+from tensorflow.contrib.autograph.converters import call_trees
+from tensorflow.contrib.autograph.converters import continue_statements
+from tensorflow.contrib.autograph.converters import control_flow
+from tensorflow.contrib.autograph.converters import decorators
+from tensorflow.contrib.autograph.converters import ifexp
+from tensorflow.contrib.autograph.converters import lists
+from tensorflow.contrib.autograph.converters import logical_expressions
+from tensorflow.contrib.autograph.converters import name_scopes
+from tensorflow.contrib.autograph.converters import side_effect_guards
+from tensorflow.contrib.autograph.converters import single_return
+from tensorflow.contrib.autograph.impl import config
+from tensorflow.contrib.autograph.impl import naming
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
+from tensorflow.contrib.autograph.utils import type_hints
+from tensorflow.python.util import tf_inspect
+
+
+# TODO(mdan): Might we not need any renaming at all?
+
+
+class ConversionMap(object):
+  """ConversionMap keeps track of converting function hierarchies.
+
+  This object is mutable, and is updated as functions are converted.
+
+  Attributes:
+    recursive: Whether to recursively convert any functions that the decorator
+        function may call.
+    nocompile_decorators: tuple of decorator functions that toggle compilation
+        off.
+    dependency_cache: dict[object]: ast; maps original entities to their
+        converted AST
+    additional_imports: set(object); additional entities which for any reason
+        cannot be attached after loading and need to be explicitly imported
+        in the generated code
+    name_map: dict[string]: string; maps original entities to the name of
+        their converted counterparts
+    api_module: A reference to the api module. The reference needs to be passed
+        to avoid circular dependencies.
+  """
+
+  # TODO(mdan): Rename to ConversionContext, and pull in additional flags.
+
+  def __init__(self, recursive, nocompile_decorators, partial_types,
+               api_module):
+    self.recursive = recursive
+    self.nocompile_decorators = nocompile_decorators
+    self.partial_types = partial_types if partial_types else ()
+    # Required to output dependencies in discovery order, which should match
+    # the reverse dependency order.
+    self.dependency_cache = collections.OrderedDict()
+    self.additional_imports = set()
+    self.name_map = {}
+    self.api_module = api_module
+
+  def new_namer(self, namespace):
+    return naming.Namer(namespace, self.recursive, self.name_map,
+                        self.partial_types)
+
+  def update_name_map(self, namer):
+    for o, name in namer.renamed_calls.items():
+      if o in self.name_map:
+        if self.name_map[o] != name:
+          raise ValueError(
+              'Calls to %s were converted using multiple names (%s). This is '
+              'possible when an entity with one of these names already '
+              'existed. To fix, avoid using any of these names.')
+      else:
+        self.name_map[o] = name
+
+  def add_to_cache(self, original_entity, converted_ast):
+    self.dependency_cache[original_entity] = converted_ast
+
+
+def is_whitelisted_for_graph(o):
+  """Check whether an entity is whitelisted for use in graph mode.
+
+  Examples of whitelisted entities include all members of the tensorflow
+  package.
+
+  Args:
+    o: A Python entity.
+  Returns:
+    Boolean
+  """
+  m = tf_inspect.getmodule(o)
+  for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
+    if m.__name__.startswith(prefix):
+      return True
+  return False
+
+
+def entity_to_graph(o, conversion_map, arg_values, arg_types):
+  """Compile a Python entity into equivalent TensorFlow.
+
+  The function will also recursively compile all the entities that `o`
+  references, updating `dependency_cache`.
+
+  This function is reentrant, and relies on dependency_cache to avoid
+  generating duplicate code.
+
+  Args:
+    o: A Python entity.
+    conversion_map: A ConversionMap object.
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
+        parameters.
+
+  Returns:
+    A tuple (ast, new_name, namespace):
+        * ast: An AST representing an entity with interface equivalent to `o`,
+            but which when executed it creates TF a graph.
+        * new_name: The symbol name under which the new entity can be found.
+        * namespace: A dict mapping all symbols visible to the converted entity,
+            keyed by their symbol name.
+
+  Raises:
+    ValueError: if the entity type is not supported.
+  """
+  if tf_inspect.isclass(o):
+    node, name, ns = class_to_graph(o, conversion_map)
+  elif tf_inspect.isfunction(o):
+    # TODO(mdan): This is not a reliable mechanism.
+    # The most reliable way is to check the source code, the AST will contain
+    # a Lambda node instead of a FunctionDef
+    if o.__name__ == '<lambda>':
+      raise NotImplementedError(
+          'lambda functions are not yet supported; declare the function'
+          ' using def instead: %s' % o)
+    else:
+      node, name, ns = function_to_graph(o, conversion_map, arg_values,
+                                         arg_types)
+  elif tf_inspect.ismethod(o):
+    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
+  else:
+    raise ValueError(
+        'Entity "%s" has unsupported type "%s". Only functions and classes are '
+        'supported for now.' % (o, type(o)))
+
+  conversion_map.add_to_cache(o, node)
+  if conversion_map.recursive:
+    while True:
+      candidate = None
+      for obj in conversion_map.name_map.keys():
+        if obj not in conversion_map.dependency_cache:
+          candidate = obj
+          break
+      if candidate is None:
+        break
+      if (hasattr(candidate, 'im_class') and
+          getattr(candidate, 'im_class') not in conversion_map.partial_types):
+        # Class members are converted with their objects, unless they're
+        # only converted partially.
+        continue
+      entity_to_graph(candidate, conversion_map, {}, {})
+
+  return node, name, ns
+
+
+def class_to_graph(c, conversion_map):
+  """Specialization of `entity_to_graph` for classes."""
+  converted_members = {}
+  method_filter = lambda m: tf_inspect.isfunction(m) or tf_inspect.ismethod(m)
+  members = tf_inspect.getmembers(c, predicate=method_filter)
+  if not members:
+    raise ValueError('Cannot convert %s: it has no member methods.' % c)
+
+  class_namespace = {}
+  for _, m in members:
+    # Only convert the members that are directly defined by the class.
+    if inspect_utils.getdefiningclass(m, c) is not c:
+      continue
+    node, _, namespace = function_to_graph(
+        m,
+        conversion_map=conversion_map,
+        arg_values={},
+        arg_types={'self': (c.__name__, c)},
+        owner_type=c)
+    if class_namespace is None:
+      class_namespace = namespace
+    else:
+      class_namespace.update(namespace)
+    converted_members[m] = node
+  namer = conversion_map.new_namer(class_namespace)
+  class_name = namer.compiled_class_name(c.__name__, c)
+
+  # TODO(mdan): This needs to be explained more thoroughly.
+  # Process any base classes: if the sueprclass if of a whitelisted type, an
+  # absolute import line is generated. Otherwise, it is marked for conversion
+  # (as a side effect of the call to namer.compiled_class_name() followed by
+  # conversion_map.update_name_map(namer)).
+  output_nodes = []
+  renames = {}
+  bases = []
+  for base in c.__bases__:
+    if isinstance(object, base):
+      bases.append('object')
+      continue
+    if is_whitelisted_for_graph(base):
+      alias = namer.new_symbol(base.__name__, ())
+      output_nodes.append(
+          gast.ImportFrom(
+              module=base.__module__,
+              names=[gast.alias(name=base.__name__, asname=alias)],
+              level=0))
+    else:
+      # This will trigger a conversion into a class with this name.
+      alias = namer.compiled_class_name(base.__name__, base)
+    bases.append(alias)
+    renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
+  conversion_map.update_name_map(namer)
+
+  # Generate the definition of the converted class.
+  output_nodes.append(
+      gast.ClassDef(
+          class_name,
+          bases=bases,
+          keywords=[],
+          body=list(converted_members.values()),
+          decorator_list=[]))
+  node = gast.Module(output_nodes)
+
+  # Make a final pass to replace references to the class or its base classes.
+  # Most commonly, this occurs when making super().__init__() calls.
+  # TODO(mdan): Making direct references to superclass' superclass will fail.
+  node = qual_names.resolve(node)
+  renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name)
+  node = ast_util.rename_symbols(node, renames)
+
+  return node, class_name, class_namespace
+
+
+def _add_reserved_symbol(namespace, name, entity):
+  if name not in namespace:
+    namespace[name] = entity
+  elif namespace[name] != entity:
+    raise ValueError('The name "%s" is reserved and may not be used.' % name)
+
+
+ag_internal = None
+
+
+def _add_self_references(namespace, api_module):
+  """Adds namespace references to the module that exposes the api itself."""
+  global ag_internal
+  if ag_internal is None:
+    # Craft a module that exposes parts of the external API as well as certain
+    # internal modules.
+    ag_internal = imp.new_module('autograph')
+    ag_internal.converted_call = api_module.converted_call
+    ag_internal.utils = utils
+    # TODO(mdan): Add safeguards against name clashes.
+    # We don't want to create a submodule because we want the operators to be
+    # accessible as ag__.<operator>
+    ag_internal.__dict__.update(operators.__dict__)
+
+  _add_reserved_symbol(namespace, 'ag__', ag_internal)
+
+
+def function_to_graph(f, conversion_map, arg_values, arg_types,
+                      owner_type=None):
+  """Specialization of `entity_to_graph` for callable functions."""
+  node, source = parser.parse_entity(f)
+  node = node.body[0]
+
+  namespace = inspect_utils.getnamespace(f)
+  _add_self_references(namespace, conversion_map.api_module)
+  namer = conversion_map.new_namer(namespace)
+
+  ctx = context.EntityContext(
+      namer=namer,
+      source_code=source,
+      source_file='<fragment>',
+      namespace=namespace,
+      arg_values=arg_values,
+      arg_types=arg_types,
+      owner_type=owner_type,
+      recursive=conversion_map.recursive,
+      type_annotation_func=type_hints.set_element_type)
+  node, deps = node_to_graph(node, ctx, conversion_map.nocompile_decorators)
+
+  # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py
+  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
+  if not did_rename:
+    new_name = f.__name__
+    if node.name != f.__name__:
+      raise NotImplementedError('Strange corner case. Send us offending code!')
+
+  node.name = new_name
+  conversion_map.update_name_map(namer)
+  # TODO(mdan): Use this at compilation.
+  conversion_map.additional_imports.update(deps)
+
+  return node, new_name, namespace
+
+
+def _static_analysis_pass(node, ctx):
+  node = qual_names.resolve(node)
+  node = activity.resolve(node, ctx, None)
+  node = live_values.resolve(node, ctx, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, ctx)
+  return node
+
+
+def node_to_graph(node, ctx, nocompile_decorators):
+  """Convert Python code to equivalent TF graph mode code.
+
+  Args:
+    node: A Python AST node representing the code to convert.
+    ctx: An EntityContext object.
+    nocompile_decorators: A tuple containing decorators to be stripped from
+        functions during conversion.
+
+  Returns:
+    A tuple (node, deps):
+        * node: A Python ast node, representing the converted code.
+        * deps: A set of strings, the fully qualified names of entity
+            dependencies that this node has.
+  """
+  # TODO(mdan): Verify arguments for correctness.
+
+  # TODO(mdan): Factor out common elements.
+  # These include:
+  #   * code move between blocks
+  #   * visiting blocks in transformers
+
+  # Certain steps, especially canonicalization, insert new symbols into the
+  # tree, which must be accounted. Although less efficient, it is most robust
+  # to re-run the analysis.
+
+  node = _static_analysis_pass(node, ctx)
+
+  # TODO(mdan): Clean this up.
+  # Some intermediate analyses are not required, and some comments got orphaned.
+
+  # Past this point, line numbers are no longer accurate so we ignore the
+  # source.
+  # TODO(mdan): Is it feasible to reconstruct intermediate source code?
+  ctx.source_code = None
+  node = ifexp.transform(node, ctx)
+  node, deps = decorators.transform(node, nocompile_decorators)
+  node = break_statements.transform(node, ctx)
+  node = _static_analysis_pass(node, ctx)
+
+  node = asserts.transform(node, ctx)
+
+  # Note: sequencing continue canonicalization before for loop one avoids
+  # dealing with the extra loop increment operation that the for
+  # canonicalization creates.
+  node = continue_statements.transform(node, ctx)
+  ctx.namespace['len'] = len
+
+  node = _static_analysis_pass(node, ctx)
+  node = single_return.transform(node, ctx)
+
+  node = _static_analysis_pass(node, ctx)
+  node = lists.transform(node, ctx)
+  node = builtin_functions.transform(node, ctx)
+
+  node = _static_analysis_pass(node, ctx)
+  node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES,
+                              nocompile_decorators)
+  node = control_flow.transform(node, ctx)
+
+  # control_flow may create new symbols and change scopes.
+  node = _static_analysis_pass(node, ctx)
+  node = logical_expressions.transform(node, ctx)
+  node = side_effect_guards.transform(node, ctx)
+  node = name_scopes.transform(node, ctx)
+
+  return node, deps
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5edd8e74a8899a25fb51e2a4e133f3cb7933fa26
--- /dev/null
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for conversion module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl import api
+from tensorflow.contrib.autograph.impl import conversion
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.platform import test
+
+
+class ConversionTest(test.TestCase):
+
+  def _simple_conversion_map(self):
+    return conversion.ConversionMap(True, (), (), api)
+
+  def test_is_whitelisted_for_graph(self):
+
+    def test_fn():
+      return constant_op.constant(1)
+
+    self.assertFalse(conversion.is_whitelisted_for_graph(test_fn))
+    self.assertTrue(conversion.is_whitelisted_for_graph(utils))
+    self.assertTrue(conversion.is_whitelisted_for_graph(constant_op.constant))
+
+  def test_entity_to_graph_unsupported_types(self):
+    with self.assertRaises(ValueError):
+      conversion_map = self._simple_conversion_map()
+      conversion.entity_to_graph('dummy', conversion_map, None, None)
+
+  def test_entity_to_graph_callable(self):
+    b = 2
+    def f(a):
+      return a + b
+
+    conversion_map = self._simple_conversion_map()
+    ast, name, ns = conversion.entity_to_graph(f, conversion_map, None, None)
+    self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
+    self.assertEqual('tf__f', name)
+    self.assertTrue(ns['b'] is b)
+
+  def test_entity_to_graph_call_tree(self):
+
+    def g(a):
+      return a
+
+    def f(a):
+      return g(a)
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(f, conversion_map, None, None)
+
+    self.assertTrue(f in conversion_map.dependency_cache)
+    self.assertTrue(g in conversion_map.dependency_cache)
+    self.assertEqual('tf__f', conversion_map.dependency_cache[f].name)
+    # need the extra .body[0] in order to step past the with tf.name_scope('f')
+    # that is added automatically
+    self.assertEqual(
+        'tf__g',
+        conversion_map.dependency_cache[f].body[0].body[0].value.func.id)
+    self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
+
+  def test_entity_to_graph_class_hierarchy(self):
+
+    class TestBase(object):
+
+      def __init__(self, x='base'):
+        self.x = x
+
+      def foo(self):
+        return self.x
+
+      def bar(self):
+        return self.x
+
+    class TestSubclass(TestBase):
+
+      def __init__(self, y):
+        super(TestSubclass, self).__init__('sub')
+        self.y = y
+
+      def foo(self):
+        return self.y
+
+      def baz(self):
+        return self.y
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(TestSubclass, conversion_map, None, None)
+
+    self.assertTrue(TestBase in conversion_map.dependency_cache)
+    self.assertTrue(TestSubclass in conversion_map.dependency_cache)
+    self.assertEqual('TfTestBase',
+                     conversion_map.dependency_cache[TestBase].body[-1].name)
+    self.assertEqual(
+        'TfTestSubclass',
+        conversion_map.dependency_cache[TestSubclass].body[-1].name)
+
+  def test_entity_to_graph_class_hierarchy_whitelisted(self):
+
+    class TestSubclass(training.Model):
+
+      def __init__(self, y):
+        super(TestSubclass, self).__init__()
+        self.built = False
+
+      def call(self, x):
+        return 3 * x
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(TestSubclass, conversion_map, None, None)
+
+    self.assertTrue(TestSubclass in conversion_map.dependency_cache)
+    self.assertFalse(training.Model in conversion_map.dependency_cache)
+    self.assertEqual(
+        'Model',
+        conversion_map.dependency_cache[TestSubclass].body[0].names[0].name)
+    self.assertEqual(
+        'TfTestSubclass',
+        conversion_map.dependency_cache[TestSubclass].body[-1].name)
+
+  def test_entity_to_graph_lambda(self):
+    f = lambda a: a
+
+    with self.assertRaises(NotImplementedError):
+      conversion_map = self._simple_conversion_map()
+      conversion.entity_to_graph(f, conversion_map, None, None)
+
+  def test_ag_module_cached(self):
+    def callee():
+      return range(3)
+
+    def caller(a):
+      return a()
+
+    conversion_map = self._simple_conversion_map()
+    _, _, callee_ns = conversion.entity_to_graph(
+        callee, conversion_map, None, None)
+    _, _, caller_ns = conversion.entity_to_graph(
+        caller, conversion_map, None, None)
+
+    self.assertTrue(callee_ns['ag__'] is caller_ns['ag__'])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/impl/naming.py b/tensorflow/contrib/autograph/impl/naming.py
similarity index 96%
rename from tensorflow/contrib/py2tf/impl/naming.py
rename to tensorflow/contrib/autograph/impl/naming.py
index 51326091de13715c32d0a79279f1d3274e48ad10..b1d3f76be7763fada88fd0a1da9d3aa43b67ddfa 100644
--- a/tensorflow/contrib/py2tf/impl/naming.py
+++ b/tensorflow/contrib/autograph/impl/naming.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import qual_names
 
 
 class Namer(object):
@@ -62,8 +62,6 @@ class Namer(object):
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
 
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
     if live_entity is not None:
       self.renamed_calls[live_entity] = new_name
diff --git a/tensorflow/contrib/py2tf/impl/naming_test.py b/tensorflow/contrib/autograph/impl/naming_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/impl/naming_test.py
rename to tensorflow/contrib/autograph/impl/naming_test.py
index beb4e54937bbb91b19157c9b9e3c528353206c62..73fc0894655cb49e4f61bf8ca51995b06feb3072 100644
--- a/tensorflow/contrib/py2tf/impl/naming_test.py
+++ b/tensorflow/contrib/autograph/impl/naming_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.impl import naming
+from tensorflow.contrib.autograph.impl import naming
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..18bfec5d9c69912f90414c51ac63ba540cf4d5fc
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -0,0 +1,54 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "operators",
+    srcs = [
+        "__init__.py",
+        "control_flow.py",
+        "data_structures.py",
+        "dispatch_context.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/contrib/autograph/utils",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
+    name = "data_structures_test",
+    srcs = ["data_structures_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "control_flow_test",
+    srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..04b4734551d3227a1c611d668f006a157c2c2dd3
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This module implements operators that we overload.
+
+Note that "operator" is used loosely here, and includes control structures like
+conditionals and loops, implemented in functional form, using for example
+closures for the body.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(mdan): Add a container for implementation-specific toggles (throughout).
+
+from tensorflow.contrib.autograph.operators.control_flow import for_loop
+from tensorflow.contrib.autograph.operators.control_flow import while_loop
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9d8b0d593e5372942ca6423d10022f0f56d78ce
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -0,0 +1,216 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Control flow statements: loops, conditionals, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+
+# TODO(mdan): Rename _loop to _stmt to follow Python nomenclature.
+# TODO(mdan): Rename arguments to match the AST names.
+
+
+def for_loop(iterated, extra_cond, loop_body, init_state):
+  """Functional form of a for statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations, excluding the iterate. In what follows we
+  refer to state as either a tuple of entities that represent an actual state,
+  or a list of arguments of the corresponding types.
+
+  Args:
+    iterated: The entity being iterated over.
+    extra_cond: Callable with the state as arguments, and boolean return type.
+        An additionnal loop condition.
+    loop_body: Callable with the iterate and the state as arguments, and
+        state as return type. The actual loop body.
+    init_state: Tuple containing the initial state.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  if tensor_util.is_tensor(iterated):
+    return _known_len_for_loop(iterated, extra_cond, loop_body, init_state)
+  elif isinstance(iterated, dataset_ops.Dataset):
+    return _dataset_for_loop(iterated, extra_cond, loop_body, init_state)
+  else:
+    return _py_for_loop(iterated, extra_cond, loop_body, init_state)
+
+
+def _py_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that executes a Python for loop."""
+  state = init_state
+  for iterate in iterated:
+    if not extra_cond(*state):
+      break
+    state = loop_body(iterate, *state)
+
+  # TODO(mdan): Remove this special case.
+  if len(state) == 1:
+    return state[0]
+  return state
+
+
+def _known_len_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over objects that define a length."""
+  n = builtins.dynamic_len(iterated)
+
+  def while_body(iterate_index, *state):
+    iterate = iterated[iterate_index]
+    new_state = loop_body(iterate, *state)
+    return (iterate_index + 1,) + new_state
+
+  def while_cond(iterate_index, *state):
+    return gen_math_ops.logical_and(iterate_index < n, extra_cond(*state))
+
+  results = while_loop(
+      while_cond,
+      while_body,
+      init_state=(0,) + init_state,
+      extra_deps=(iterated,),
+      opts=dict(maximum_iterations=n))
+  # Dropping the iteration index because it's not syntactically visible.
+  results = results[1:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def _dataset_for_loop(ds, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over TF Datasets."""
+  # Because Datsets only expose get_next, in the style of Python iterators,
+  # we are forced to unpack the loop as:
+  #
+  # epoch_number, iterate = ds.get_next()
+  # while epoch_number < 2:
+  #   <body>
+  #   epoch_number, iterate = ds.get_next()
+  epoch_numbers = dataset_ops.Dataset.range(2)
+  def tag_with(ds, tag):
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
+  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
+
+  iterator = ds_with_epoch.make_initializable_iterator()
+  with ops.control_dependencies((iterator.initializer,)):
+    epoch_number, iterate = iterator.get_next()
+
+    def while_body(epoch_number, iterate, *state):
+      new_state = loop_body(iterate, *state)
+      epoch_number, iterate = iterator.get_next()
+      return (epoch_number, iterate) + new_state
+
+    def while_cond(epoch_number, iterate, *state):
+      del iterate
+      return gen_math_ops.logical_and(epoch_number < 1, extra_cond(*state))
+
+    results = while_loop(
+        while_cond,
+        while_body,
+        init_state=(epoch_number, iterate) + init_state,
+        extra_deps=())
+  # Dropping the epoch number and iterate because they are not not syntactically
+  # visible.
+  results = results[2:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None):
+  """Functional form of a while statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations. In what follows we refer to state as either
+  a tuple of entities that represent an actual state, or a list of arguments
+  of the corresponding types.
+
+  Args:
+    loop_cond: Callable with the state as arguments, and boolean return type.
+        The loop condition.
+    loop_body: Callable with the state as arguments, and state as return type.
+        The actual loop body.
+    init_state: Tuple containing the initial state.
+    extra_deps: Tuple containing additional entities on which the loop may
+        depend, such as loop invariants referenced by loop_cond. Used
+        exclusively for dispatch control.
+    opts: Optional dict of extra loop parameters.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
+  # That could be somethins as simple as a collection of dispatch rules, with
+  # some prioritization.
+  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+    return _tf_while_loop(loop_cond, loop_body, init_state, opts)
+  else:
+    return _py_while_loop(loop_cond, loop_body, init_state, opts)
+
+
+def _tf_while_loop(loop_cond, loop_body, init_state, opts):
+  """Overload of while_loop that stages a TF while_loop."""
+  if opts is None:
+    opts = {}
+  return control_flow_ops.while_loop(loop_cond, loop_body, init_state, **opts)
+
+
+def _py_while_loop(loop_cond, loop_body, init_state, opts):
+  """Overload of while_loop that executes a Python while loop."""
+  del opts
+  state = init_state
+  while loop_cond(*state):
+    state = loop_body(*state)
+  return state
+
+
+def if_stmt(cond, body, orelse):
+  """Functional form of an if statement.
+
+  Args:
+    cond: Boolean.
+    body: Callable with no arguments, and outputs of the positive (if) branch
+        as return type.
+    orelse: Callable with no arguments, and outputs of the negative (else)
+        branch as return type.
+
+  Returns:
+    Tuple containing the statement outputs.
+  """
+  if tensor_util.is_tensor(cond):
+    return _tf_if_stmt(cond, body, orelse)
+  else:
+    return _py_if_stmt(cond, body, orelse)
+
+
+def _tf_if_stmt(cond, body, orelse):
+  """Overload of if_stmt that stages a TF cond."""
+  return control_flow_ops.cond(cond, body, orelse)
+
+
+def _py_if_stmt(cond, body, orelse):
+  """Overload of if_stmt that executes a Python if statement."""
+  return body() if cond else orelse()
diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0cd0bfa82bb052d55dfe30f8700fc33a794a59f
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow_test.py
@@ -0,0 +1,99 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for control_flow module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.operators import control_flow
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ForLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    s = control_flow.for_loop(
+        constant_op.constant([1, 2, 3, 4]),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+  def test_python(self):
+    s = control_flow.for_loop(
+        range(5),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    self.assertEqual(10, s)
+
+  def test_dataset(self):
+    to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
+    s = control_flow.for_loop(
+        dataset_ops.Dataset.range(5).map(to_int32),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+
+class WhileLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    n = constant_op.constant(5)
+    results = control_flow.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i,),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    with self.test_session() as sess:
+      self.assertEqual((5, 10), sess.run(results))
+
+  def test_python(self):
+    n = 5
+    results = control_flow.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    self.assertEqual((5, 10), results)
+
+
+class IfStmtTest(test.TestCase):
+
+  def test_tensor(self):
+    def test_if_stmt(cond):
+      return control_flow.if_stmt(
+          cond=cond,
+          body=lambda: 1,
+          orelse=lambda: -1)
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(test_if_stmt(constant_op.constant(True))))
+      self.assertEqual(-1, sess.run(test_if_stmt(constant_op.constant(False))))
+
+  def test_python(self):
+    self.assertEqual(1, control_flow.if_stmt(True, lambda: 1, lambda: -1))
+    self.assertEqual(-1, control_flow.if_stmt(False, lambda: 1, lambda: -1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/operators/data_structures.py b/tensorflow/contrib/autograph/operators/data_structures.py
new file mode 100644
index 0000000000000000000000000000000000000000..c862306baa9e8114a71a26323ddcbd35c8592c55
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/data_structures.py
@@ -0,0 +1,56 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators specific to data structures: list append, subscripts, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import tensor_array_ops
+
+# TODO(mdan): Add support for TensorList once functional.
+# TODO(mdan): Add primitives for empty list, list with elements.
+
+
+def append(target, element):
+  """The list append function.
+
+  Note: it is unspecified where target will be mutated or not. If target is
+  a TensorFlow entity, it will not be typically mutated. If target is a plain
+  list, it will be. In general, if the target is mutated then the return value
+  should point to the original entity.
+
+  Args:
+    target: An entity that supports append semantics.
+    element: The element to append.
+
+  Returns:
+    Same as target, after the append was performed.
+  """
+  if isinstance(target, tensor_array_ops.TensorArray):
+    return _tf_tensorarray_append(target, element)
+  else:
+    return _py_append(target, element)
+
+
+def _tf_tensorarray_append(target, element):
+  """Overload of append that stages a TensorArray write at the last position."""
+  return target.write(target.size(), element)
+
+
+def _py_append(target, element):
+  """Overload of append that executes a Python list append."""
+  target.append(element)
+  return target
diff --git a/tensorflow/contrib/py2tf/converters/for_loops_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py
similarity index 54%
rename from tensorflow/contrib/py2tf/converters/for_loops_test.py
rename to tensorflow/contrib/autograph/operators/data_structures_test.py
index 70a367d3b517e528b67f260d607431d324d2ab7d..577d28c34da39f1216669513c29a00ac07bec126 100644
--- a/tensorflow/contrib/py2tf/converters/for_loops_test.py
+++ b/tensorflow/contrib/autograph/operators/data_structures_test.py
@@ -12,35 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for for_loops module."""
+"""Tests for data_structures module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import for_loops
+from tensorflow.contrib.autograph.operators import data_structures
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
 
 
-class ControlFlowTest(converter_test_base.TestCase):
+class AppendTest(test.TestCase):
 
-  def test_basic_for(self):
+  def test_tf_tensorarray(self):
+    l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
+    l1 = data_structures.append(l, 1)
+    l2 = data_structures.append(l1, 2)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(l1.stack()), [1])
+      self.assertAllEqual(sess.run(l2.stack()), [1, 2])
 
-    def test_fn(l):
-      s = 0
-      for e in l:
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      l = [1, 2, 3]
-      self.assertEqual(test_fn(l), result.test_fn(l))
-      l = []
-      self.assertEqual(test_fn(l), result.test_fn(l))
+  def test_python(self):
+    l = []
+    self.assertAllEqual(data_structures.append(l, 1), [1])
+    self.assertAllEqual(data_structures.append(l, 2), [1, 2])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/operators/dispatch_context.py b/tensorflow/contrib/autograph/operators/dispatch_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..097002465bd140eb92ee65b9dcd4e3643a0357d2
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/dispatch_context.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Structures that allow uniform control over the dispatch process."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+
+# TODO(mdan): This is where macro override controls fit.
+
+
+class DispatchContext(collections.namedtuple(
+    'DispatchContext',
+    ('options',))):
+  """Allows passing additional parameters to the specific implementations.
+
+  Attributes:
+    options: Optional dict of extra arguments that may be required by specific
+      implementations.
+  """
+
+  def option(self, name):
+    return self.options[name]
+
+
+NO_CTX = DispatchContext(options={})
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
similarity index 81%
rename from tensorflow/contrib/py2tf/pyct/BUILD
rename to tensorflow/contrib/autograph/pyct/BUILD
index e3c0da4b10f9ffbee1b2a906b64d4762f41d97b4..796ab445c74128e1123e24b67c288e0e3c5ca24c 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -24,6 +24,7 @@ py_library(
         "ast_util.py",
         "compiler.py",
         "context.py",
+        "inspect_utils.py",
         "parser.py",
         "pretty_printer.py",
         "qual_names.py",
@@ -65,6 +66,18 @@ py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "inspect_utils_test",
+    srcs = ["inspect_utils_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -112,3 +125,14 @@ py_test(
         "@gast_archive//:gast",
     ],
 )
+
+py_test(
+    name = "transformer_test",
+    srcs = ["transformer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/pyct/__init__.py b/tensorflow/contrib/autograph/pyct/__init__.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/__init__.py
rename to tensorflow/contrib/autograph/pyct/__init__.py
diff --git a/tensorflow/contrib/py2tf/pyct/anno.py b/tensorflow/contrib/autograph/pyct/anno.py
similarity index 92%
rename from tensorflow/contrib/py2tf/pyct/anno.py
rename to tensorflow/contrib/autograph/pyct/anno.py
index 7a0528b6d0b65b6604930b7a13d8493af9d61f02..cc4a7edf02ed7556c9a552d8730e4c7875038c83 100644
--- a/tensorflow/contrib/py2tf/pyct/anno.py
+++ b/tensorflow/contrib/autograph/pyct/anno.py
@@ -70,3 +70,8 @@ def delanno(node, key, field_name='___pyct_anno'):
   if not annotations:
     delattr(node, field_name)
     node._fields = tuple(f for f in node._fields if f != field_name)
+
+
+def copyanno(from_node, to_node, key, field_name='___pyct_anno'):
+  if hasanno(from_node, key, field_name):
+    setanno(to_node, key, getanno(from_node, key, field_name), field_name)
diff --git a/tensorflow/contrib/py2tf/pyct/anno_test.py b/tensorflow/contrib/autograph/pyct/anno_test.py
similarity index 77%
rename from tensorflow/contrib/py2tf/pyct/anno_test.py
rename to tensorflow/contrib/autograph/pyct/anno_test.py
index ff40bfe1f50ae731648afdf509c26c3a70d3f6cb..1d4d9d119e0c45c4bf9dd4e5b8156766489a2e4d 100644
--- a/tensorflow/contrib/py2tf/pyct/anno_test.py
+++ b/tensorflow/contrib/autograph/pyct/anno_test.py
@@ -20,10 +20,13 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.python.platform import test
 
 
+# TODO(mdan): Consider strong types instead of primitives.
+
+
 class AnnoTest(test.TestCase):
 
   def test_basic(self):
@@ -42,6 +45,17 @@ class AnnoTest(test.TestCase):
     with self.assertRaises(AttributeError):
       anno.getanno(node, 'foo')
 
+  def test_copyanno(self):
+    node_1 = ast.Name()
+    anno.setanno(node_1, 'foo', 3)
+
+    node_2 = ast.Name()
+    anno.copyanno(node_1, node_2, 'foo')
+    anno.copyanno(node_1, node_2, 'bar')
+
+    self.assertTrue(anno.hasanno(node_2, 'foo'))
+    self.assertFalse(anno.hasanno(node_2, 'bar'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
similarity index 51%
rename from tensorflow/contrib/py2tf/pyct/ast_util.py
rename to tensorflow/contrib/autograph/pyct/ast_util.py
index f916775b9cf3cec960ec2896c334f1d737862205..c4f82d11708393a6029d3f17be428b47eb9342ff 100644
--- a/tensorflow/contrib/py2tf/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -22,13 +22,14 @@ import ast
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
 
 
 class CleanCopier(gast.NodeVisitor):
-  """Copy AST nodes.
+  """Copies AST nodes.
 
-  The copied nodes will ignore almost all fields that prefixed by '__'.
+  The copied nodes will ignore almost all fields that are prefixed by '__'.
   Exceptions make some annotations.
   """
 
@@ -84,7 +85,10 @@ class SymbolRenamer(gast.NodeTransformer):
     return self._process(node)
 
   def visit_Attribute(self, node):
-    return self._process(node)
+    if anno.hasanno(node, anno.Basic.QN):
+      return self._process(node)
+    # Attributes of dynamic objects will not have a QN.
+    return self.generic_visit(node)
 
 
 def rename_symbols(node, name_map):
@@ -94,3 +98,88 @@ def rename_symbols(node, name_map):
   elif isinstance(node, tuple):
     return tuple(renamer.visit(n) for n in node)
   return renamer.visit(node)
+
+
+def keywords_to_dict(keywords):
+  keys = []
+  values = []
+  for kw in keywords:
+    keys.append(gast.Str(kw.arg))
+    values.append(kw.value)
+  return gast.Dict(keys=keys, values=values)
+
+
+class PatternMatcher(gast.NodeVisitor):
+  """Matches a node against a pattern represented by a node.
+
+  The pattern may contain wildcards represented by the symbol '_'.
+  """
+
+  def __init__(self, pattern):
+    self.pattern = pattern
+    self.pattern_stack = []
+    self.matches = True
+
+  def compare_and_visit(self, node, pattern):
+    self.pattern_stack.append(self.pattern)
+    self.pattern = pattern
+    self.generic_visit(node)
+    self.pattern = self.pattern_stack.pop()
+
+  def no_match(self):
+    self.matches = False
+    return False
+
+  def is_wildcard(self, p):
+    if isinstance(p, (list, tuple)) and len(p) == 1:
+      p, = p
+    if isinstance(p, gast.Name) and p.id == '_':
+      return True
+    if p == '_':
+      return True
+    return False
+
+  def generic_visit(self, node):
+    if not self.matches:
+      return
+
+    pattern = self.pattern
+    for f in node._fields:
+      if f.startswith('__'):
+        continue
+
+      if not hasattr(node, f):
+        if hasattr(pattern, f) and getattr(pattern, f):
+          return self.no_match()
+        else:
+          continue
+      if not hasattr(pattern, f):
+        return self.no_match()
+
+      v = getattr(node, f)
+      p = getattr(pattern, f)
+
+      if self.is_wildcard(p):
+        continue
+      if isinstance(v, (list, tuple)):
+        if not isinstance(p, (list, tuple)) or len(v) != len(p):
+          return self.no_match()
+        for v_item, p_item in zip(v, p):
+          self.compare_and_visit(v_item, p_item)
+      elif isinstance(v, (gast.AST, ast.AST)):
+        if not isinstance(v, type(p)) and not isinstance(p, type(v)):
+          return self.no_match()
+        self.compare_and_visit(v, p)
+      else:
+        # Assume everything else is a value type.
+        if v != p:
+          return self.no_match()
+
+
+def matches(node, pattern):
+  if isinstance(pattern, str):
+    pattern = parser.parse_expression(pattern)
+  matcher = PatternMatcher(pattern)
+  matcher.visit(node)
+  return matcher.matches
+
diff --git a/tensorflow/contrib/py2tf/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py
similarity index 53%
rename from tensorflow/contrib/py2tf/pyct/ast_util_test.py
rename to tensorflow/contrib/autograph/pyct/ast_util_test.py
index e0b00c178168f96e656c57cc75a76e6da8af1d8a..3afa04a50685d19c90944c14ed39f9d3ad35e486 100644
--- a/tensorflow/contrib/py2tf/pyct/ast_util_test.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
 from tensorflow.python.platform import test
 
 
@@ -33,15 +35,15 @@ class AstUtilTest(test.TestCase):
         ast.Name('b', ast.Load()),
         ast.Attribute(ast.Name('b', None), 'c', ast.Store()),
         ast.Attribute(
-            ast.Attribute(ast.Name('b', None), 'c', ast.Load()), 'd',
-            None)
+            ast.Attribute(ast.Name('b', None), 'c', ast.Load()), 'd', None)
     ], None)
     node = qual_names.resolve(node)
     node = ast_util.rename_symbols(
-        node,
-        {
-            qual_names.QN('a'): qual_names.QN('renamed_a'),
-            qual_names.QN('b.c'): qual_names.QN('renamed_b_c'),
+        node, {
+            qual_names.QN('a'):
+                qual_names.QN('renamed_a'),
+            qual_names.QN(qual_names.QN('b'), attr='c'):
+                qual_names.QN('renamed_b_c'),
         })
 
     self.assertEqual(node.elts[0].id, 'renamed_a')
@@ -74,6 +76,43 @@ class AstUtilTest(test.TestCase):
     self.assertFalse(ret is new_node.body[0])
     self.assertFalse(hasattr(new_node.body[0], '__foo'))
 
+  def test_keywords_to_dict(self):
+    keywords = parser.parse_expression('f(a=b, c=1, d=\'e\')').keywords
+    d = ast_util.keywords_to_dict(keywords)
+    # Make sure we generate a usable dict node by attaching it to a variable and
+    # compiling everything.
+    output = parser.parse_str('b = 3')
+    output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),)
+    result, _ = compiler.ast_to_object(output)
+    self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'})
+
+  def assertMatch(self, target_str, pattern_str):
+    node = parser.parse_expression(target_str)
+    pattern = parser.parse_expression(pattern_str)
+    self.assertTrue(ast_util.matches(node, pattern))
+
+  def assertNoMatch(self, target_str, pattern_str):
+    node = parser.parse_expression(target_str)
+    pattern = parser.parse_expression(pattern_str)
+    self.assertFalse(ast_util.matches(node, pattern))
+
+  def test_matches_symbols(self):
+    self.assertMatch('foo', '_')
+    self.assertNoMatch('foo()', '_')
+    self.assertMatch('foo + bar', 'foo + _')
+    self.assertNoMatch('bar + bar', 'foo + _')
+    self.assertNoMatch('foo - bar', 'foo + _')
+
+  def test_matches_function_args(self):
+    self.assertMatch('super(Foo, self).__init__(arg1, arg2)',
+                     'super(_).__init__(_)')
+    self.assertMatch('super().__init__()', 'super(_).__init__(_)')
+    self.assertNoMatch('super(Foo, self).bar(arg1, arg2)',
+                       'super(_).__init__(_)')
+    self.assertMatch('super(Foo, self).__init__()', 'super(Foo, _).__init__(_)')
+    self.assertNoMatch('super(Foo, self).__init__()',
+                       'super(Bar, _).__init__(_)')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/autograph/pyct/compiler.py
similarity index 90%
rename from tensorflow/contrib/py2tf/pyct/compiler.py
rename to tensorflow/contrib/autograph/pyct/compiler.py
index 51cf6930e8bcb3728ee55bf5d4781f01a5ef73bd..24c4517afa89147101f80af3ef60237132c1144c 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler.py
+++ b/tensorflow/contrib/autograph/pyct/compiler.py
@@ -31,7 +31,7 @@ import astor
 import gast
 
 
-def ast_to_source(node, indentation):
+def ast_to_source(node, indentation='  '):
   """Return the source code of given AST."""
   if isinstance(node, gast.AST):
     node = gast.gast_to_ast(node)
@@ -39,7 +39,10 @@ def ast_to_source(node, indentation):
                                             astor.string_repr.pretty_string)
   generator.visit(node)
   generator.result.append('\n')
-  return astor.source_repr.pretty_source(generator.result).lstrip()
+  # In some versions of Python, literals may appear as actual values. This
+  # ensures everything is string.
+  code = map(str, generator.result)
+  return astor.source_repr.pretty_source(code).lstrip()
 
 
 def ast_to_object(
diff --git a/tensorflow/contrib/py2tf/pyct/compiler_test.py b/tensorflow/contrib/autograph/pyct/compiler_test.py
similarity index 83%
rename from tensorflow/contrib/py2tf/pyct/compiler_test.py
rename to tensorflow/contrib/autograph/pyct/compiler_test.py
index c1f84238efa7dd6fc0748748a2cb4f074572b4c6..98cdc1506b6aced603df99662f1468687a55f92c 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler_test.py
+++ b/tensorflow/contrib/autograph/pyct/compiler_test.py
@@ -22,12 +22,29 @@ import textwrap
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
 
 
 class CompilerTest(test.TestCase):
 
+  def test_parser_compile_idempotent(self):
+
+    def test_fn(x):
+      a = True
+      b = ''
+      if a:
+        b = x + 1
+      return b
+
+    self.assertEqual(
+        textwrap.dedent(tf_inspect.getsource(test_fn)),
+        tf_inspect.getsource(
+            compiler.ast_to_object(
+                parser.parse_entity(test_fn)[0].body[0])[0].test_fn))
+
   def test_ast_to_source(self):
     node = gast.If(
         test=gast.Num(1),
diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/autograph/pyct/context.py
similarity index 82%
rename from tensorflow/contrib/py2tf/pyct/context.py
rename to tensorflow/contrib/autograph/pyct/context.py
index fef74ebefa290369c7310af6d7e4faeef44d9aee..b34015cfd2888f0dbeb6492b9e7335d561bf4763 100644
--- a/tensorflow/contrib/py2tf/pyct/context.py
+++ b/tensorflow/contrib/autograph/pyct/context.py
@@ -22,6 +22,8 @@ from __future__ import print_function
 class EntityContext(object):
   """Contains information about an entity, like source code.
 
+  In general, objects of this class should be considered immutable.
+
   Attributes:
     namer: Namer that matches the contract of all converters.
     source_code: The entity's source code.
@@ -30,14 +32,18 @@ class EntityContext(object):
         (excluding parameters).
     arg_values: Dict[str->*], containing parameter values, if known.
     arg_types: Dict[str->*], containing parameter types, if known.
+    owner_type: The surrounding class type of the function, if present.
   """
 
+  # TODO(mdan): Remove the default and update tests.
   def __init__(self, namer, source_code, source_file, namespace, arg_values,
-               arg_types, recursive):
+               arg_types, owner_type, recursive, type_annotation_func=None):
     self.namer = namer
     self.source_code = source_code
     self.source_file = source_file
     self.namespace = namespace
     self.arg_values = {} if arg_values is None else arg_values
     self.arg_types = {} if arg_types is None else arg_types
+    self.owner_type = owner_type
     self.recursive = recursive
+    self.type_annotation_func = type_annotation_func
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..eef74599a7d5415b4b05d2f05fb094b1dcd33323
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -0,0 +1,161 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Live entity inspection utilities.
+
+This module contains whatever inspect doesn't offer out of the box.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+import types
+
+import six
+
+from tensorflow.python.util import tf_inspect
+
+
+def isbuiltin(f):
+  # Note these return false for isinstance(f, types.BuiltinFunctionType) so we
+  # need to specifically check for them.
+  if f in (range, int, float):
+    return True
+  if isinstance(f, types.BuiltinFunctionType):
+    return True
+  if tf_inspect.isbuiltin(f):
+    return True
+  return False
+
+
+def getnamespace(f):
+  """Returns the complete namespace of a function.
+
+  Namespace is defined here as the mapping of all non-local variables to values.
+  This includes the globals and the closure variables. Note that this captures
+  the entire globals collection of the function, and may contain extra symbols
+  that it does not actually use.
+
+  Args:
+    f: User defined function.
+  Returns:
+    A dict mapping symbol names to values.
+  """
+  namespace = dict(six.get_function_globals(f))
+  closure = six.get_function_closure(f)
+  freevars = six.get_function_code(f).co_freevars
+  if freevars and closure:
+    for name, cell in zip(freevars, closure):
+      namespace[name] = cell.cell_contents
+  return namespace
+
+
+def _get_unbound_function(m):
+  # TODO(mdan): Figure out why six.get_unbound_function fails in some cases.
+  # The failure case is for tf.keras.Model.
+  if hasattr(m, 'im_func'):
+    return m.im_func
+  return m
+
+
+def getdefiningclass(m, owner_class):
+  """Resolves the class (e.g. one of the superclasses) that defined a method."""
+  # Normalize bound functions to their respective unbound versions.
+  m = _get_unbound_function(m)
+  for superclass in owner_class.__bases__:
+    if hasattr(superclass, m.__name__):
+      superclass_m = getattr(superclass, m.__name__)
+      if _get_unbound_function(superclass_m) is m:
+        return superclass
+      elif hasattr(m, '__self__') and m.__self__ == owner_class:
+        # Python 3 class methods only work this way it seems :S
+        return superclass
+  return owner_class
+
+
+def getmethodclass(m):
+  """Resolves a function's owner, e.g. a method's class.
+
+  Note that this returns the object that the function was retrieved from, not
+  necessarily the class where it was defined.
+
+  This function relies on Python stack frame support in the interpreter, and
+  has the same limitations that inspect.currentframe.
+
+  Limitations. This function will only work correctly if the owned class is
+  visible in the caller's global or local variables.
+
+  Args:
+    m: A user defined function
+
+  Returns:
+    The class that this function was retrieved from, or None if the function
+    is not an object or class method, or the class that owns the object or
+    method is not visible to m.
+
+  Raises:
+    ValueError: if the class could not be resolved for any unexpected reason.
+  """
+
+  # Callable objects: return their own class.
+  if (not hasattr(m, '__name__') and hasattr(m, '__class__') and
+      hasattr(m, '__call__')):
+    if isinstance(m.__class__, six.class_types):
+      return m.__class__
+
+  # Instance method and class methods: should be bound to a non-null "self".
+  # If self is a class, then it's a class method.
+  if hasattr(m, '__self__'):
+    if m.__self__:
+      if tf_inspect.isclass(m.__self__):
+        return m.__self__
+      return type(m.__self__)
+
+  # Class, static and unbound methods: search all defined classes in any
+  # namespace. This is inefficient but more robust method.
+  owners = []
+  caller_frame = tf_inspect.currentframe().f_back
+  try:
+    # TODO(mdan): This doesn't consider cell variables.
+    # TODO(mdan): This won't work if the owner is hidden inside a container.
+    # Cell variables may be pulled using co_freevars and the closure.
+    for v in itertools.chain(caller_frame.f_locals.values(),
+                             caller_frame.f_globals.values()):
+      if hasattr(v, m.__name__):
+        candidate = getattr(v, m.__name__)
+        # Py2 methods may be bound or unbound, extract im_func to get the
+        # underlying function.
+        if hasattr(candidate, 'im_func'):
+          candidate = candidate.im_func
+        if hasattr(m, 'im_func'):
+          m = m.im_func
+        if candidate is m:
+          owners.append(v)
+  finally:
+    del caller_frame
+
+  if owners:
+    if len(owners) == 1:
+      return owners[0]
+
+    # If multiple owners are found, and are not subclasses, raise an error.
+    owner_types = tuple(o if tf_inspect.isclass(o) else type(o) for o in owners)
+    for o in owner_types:
+      if tf_inspect.isclass(o) and issubclass(o, tuple(owner_types)):
+        return o
+    raise ValueError('Found too many owners of %s: %s' % (m, owners))
+
+  return None
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a212f676a616307b41feafafda9d1d794ba3d2d
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -0,0 +1,277 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for unspect_utils module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import wraps
+
+import six
+
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.python.platform import test
+
+
+def decorator(f):
+  return f
+
+
+def function_decorator():
+  def dec(f):
+    return f
+  return dec
+
+
+def wrapping_decorator():
+  def dec(f):
+    def replacement(*_):
+      return None
+
+    @wraps(f)
+    def wrapper(*args, **kwargs):
+      return replacement(*args, **kwargs)
+    return wrapper
+  return dec
+
+
+class TestClass(object):
+
+  def member_function(self):
+    pass
+
+  @decorator
+  def decorated_member(self):
+    pass
+
+  @function_decorator()
+  def fn_decorated_member(self):
+    pass
+
+  @wrapping_decorator()
+  def wrap_decorated_member(self):
+    pass
+
+  @staticmethod
+  def static_method():
+    pass
+
+  @classmethod
+  def class_method(cls):
+    pass
+
+
+def free_function():
+  pass
+
+
+def factory():
+  return free_function
+
+
+def free_factory():
+  def local_function():
+    pass
+  return local_function
+
+
+class InspectUtilsTest(test.TestCase):
+
+  def test_getnamespace_globals(self):
+    ns = inspect_utils.getnamespace(factory)
+    self.assertEqual(ns['free_function'], free_function)
+
+  def test_getnamespace_hermetic(self):
+
+    # Intentionally hiding the global function to make sure we don't overwrite
+    # it in the global namespace.
+    free_function = object()  # pylint:disable=redefined-outer-name
+
+    def test_fn():
+      return free_function
+
+    ns = inspect_utils.getnamespace(test_fn)
+    globs = six.get_function_globals(test_fn)
+    self.assertTrue(ns['free_function'] is free_function)
+    self.assertFalse(globs['free_function'] is free_function)
+
+  def test_getnamespace_locals(self):
+
+    def called_fn():
+      return 0
+
+    closed_over_list = []
+    closed_over_primitive = 1
+
+    def local_fn():
+      closed_over_list.append(1)
+      local_var = 1
+      return called_fn() + local_var + closed_over_primitive
+
+    ns = inspect_utils.getnamespace(local_fn)
+    self.assertEqual(ns['called_fn'], called_fn)
+    self.assertEqual(ns['closed_over_list'], closed_over_list)
+    self.assertEqual(ns['closed_over_primitive'], closed_over_primitive)
+    self.assertTrue('local_var' not in ns)
+
+  def test_getmethodclass(self):
+
+    self.assertEqual(
+        inspect_utils.getmethodclass(free_function), None)
+    self.assertEqual(
+        inspect_utils.getmethodclass(free_factory()), None)
+
+    self.assertEqual(
+        inspect_utils.getmethodclass(TestClass.member_function),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(TestClass.decorated_member),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(TestClass.fn_decorated_member),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(TestClass.wrap_decorated_member),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(TestClass.static_method),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(TestClass.class_method),
+        TestClass)
+
+    test_obj = TestClass()
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.member_function),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.decorated_member),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.fn_decorated_member),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.static_method),
+        TestClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.class_method),
+        TestClass)
+
+  def test_getmethodclass_locals(self):
+
+    def local_function():
+      pass
+
+    class LocalClass(object):
+
+      def member_function(self):
+        pass
+
+      @decorator
+      def decorated_member(self):
+        pass
+
+      @function_decorator()
+      def fn_decorated_member(self):
+        pass
+
+      @wrapping_decorator()
+      def wrap_decorated_member(self):
+        pass
+
+    self.assertEqual(
+        inspect_utils.getmethodclass(local_function), None)
+
+    self.assertEqual(
+        inspect_utils.getmethodclass(LocalClass.member_function),
+        LocalClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(LocalClass.decorated_member),
+        LocalClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(LocalClass.fn_decorated_member),
+        LocalClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(LocalClass.wrap_decorated_member),
+        LocalClass)
+
+    test_obj = LocalClass()
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.member_function),
+        LocalClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.decorated_member),
+        LocalClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.fn_decorated_member),
+        LocalClass)
+    self.assertEqual(
+        inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
+        LocalClass)
+
+  def test_getmethodclass_callables(self):
+    class TestCallable(object):
+
+      def __call__(self):
+        pass
+
+    c = TestCallable()
+    self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
+
+  def test_getdefiningclass(self):
+    class Superclass(object):
+
+      def foo(self):
+        pass
+
+      def bar(self):
+        pass
+
+      @classmethod
+      def class_method(cls):
+        pass
+
+    class Subclass(Superclass):
+
+      def foo(self):
+        pass
+
+      def baz(self):
+        pass
+
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.foo, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.class_method, Subclass) is
+        Superclass)
+
+  def test_isbuiltin(self):
+    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(float))
+    self.assertTrue(inspect_utils.isbuiltin(int))
+    self.assertTrue(inspect_utils.isbuiltin(len))
+    self.assertFalse(inspect_utils.isbuiltin(function_decorator))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/pyct/parser.py b/tensorflow/contrib/autograph/pyct/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..c961efa892df6a21804dae8f52ef64bf99cd409e
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/parser.py
@@ -0,0 +1,58 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converting code to AST.
+
+Adapted from Tangent.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+import gast
+
+from tensorflow.python.util import tf_inspect
+
+
+def parse_entity(entity):
+  """Returns the AST of given entity."""
+  source = tf_inspect.getsource(entity)
+  source = textwrap.dedent(source)
+  return parse_str(source), source
+
+
+def parse_str(src):
+  """Returns the AST of given piece of code."""
+  return gast.parse(src)
+
+
+def parse_expression(src):
+  """Returns the AST of given identifier.
+
+  Args:
+    src: A piece of code that represents a single Python expression
+  Returns:
+    A gast.AST object.
+  Raises:
+    ValueError: if src does not consist of a single Expression.
+  """
+  node = parse_str(src)
+  assert isinstance(node, gast.Module)
+  if len(node.body) != 1 and not isinstance(node.body[0], gast.Expr):
+    raise ValueError(
+        'Expected a single expression, found instead %s' % node.body)
+  return node.body[0].value
diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/autograph/pyct/parser_test.py
similarity index 80%
rename from tensorflow/contrib/py2tf/pyct/parser_test.py
rename to tensorflow/contrib/autograph/pyct/parser_test.py
index f35dfa04c70dc191078248c32f9a04d28133129a..007a4c6fb0393b7235808478d55b3ffa469f85d0 100644
--- a/tensorflow/contrib/py2tf/pyct/parser_test.py
+++ b/tensorflow/contrib/autograph/pyct/parser_test.py
@@ -20,28 +20,33 @@ from __future__ import print_function
 
 import textwrap
 
-from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.python.platform import test
 
 
-def f(x):
-  return x + 1
-
-
 class ParserTest(test.TestCase):
 
   def test_parse_entity(self):
+
+    def f(x):
+      return x + 1
+
     mod, _ = parser.parse_entity(f)
     self.assertEqual('f', mod.body[0].name)
 
   def test_parse_str(self):
     mod = parser.parse_str(
         textwrap.dedent("""
-        def f(x):
-          return x + 1
+            def f(x):
+              return x + 1
     """))
     self.assertEqual('f', mod.body[0].name)
 
+  def test_parse_expression(self):
+    node = parser.parse_expression('a.b')
+    self.assertEqual('a', node.value.id)
+    self.assertEqual('b', node.attr)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer.py b/tensorflow/contrib/autograph/pyct/pretty_printer.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/pretty_printer.py
rename to tensorflow/contrib/autograph/pyct/pretty_printer.py
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py b/tensorflow/contrib/autograph/pyct/pretty_printer_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
rename to tensorflow/contrib/autograph/pyct/pretty_printer_test.py
index 81e3f47b80b6cb3bb7ba9f4a1787d03df4151a99..0cb48f35760b7b2655eb5cf73017b70e28dae219 100644
--- a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
+++ b/tensorflow/contrib/autograph/pyct/pretty_printer_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import pretty_printer
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/autograph/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py
new file mode 100644
index 0000000000000000000000000000000000000000..583cf7ecd7bce31c55de58361ab5295abb5d6707
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/qual_names.py
@@ -0,0 +1,228 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for manipulating qualified names.
+
+A qualified name is a uniform way to refer to simple (e.g. 'foo') and composite
+(e.g. 'foo.bar') syntactic symbols.
+
+This is *not* related to the __qualname__ attribute used by inspect, which
+refers to scopes.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+
+
+class Symbol(collections.namedtuple('Symbol', ['name'])):
+  """Represents a Python symbol."""
+
+
+class StringLiteral(collections.namedtuple('StringLiteral', ['value'])):
+  """Represents a Python string literal."""
+
+  def __str__(self):
+    return '\'%s\'' % self.value
+
+  def __repr__(self):
+    return str(self)
+
+
+class NumberLiteral(collections.namedtuple('NumberLiteral', ['value'])):
+  """Represents a Python numeric literal."""
+
+  def __str__(self):
+    return '%s' % self.value
+
+  def __repr__(self):
+    return str(self)
+
+
+# TODO(mdan): Use subclasses to remove the has_attr has_subscript booleans.
+class QN(object):
+  """Represents a qualified name."""
+
+  def __init__(self, base, attr=None, subscript=None):
+    if attr is not None and subscript is not None:
+      raise ValueError('A QN can only be either an attr or a subscript, not '
+                       'both: attr={}, subscript={}.'.format(attr, subscript))
+    self._has_attr = False
+    self._has_subscript = False
+
+    if attr is not None:
+      if not isinstance(base, QN):
+        raise ValueError(
+            'for attribute QNs, base must be a QN; got instead "%s"' % base)
+      if not isinstance(attr, str):
+        raise ValueError('attr may only be a string; got instead "%s"' % attr)
+      self._parent = base
+      # TODO(mdan): Get rid of the tuple - it can only have 1 or 2 elements now.
+      self.qn = (base, attr)
+      self._has_attr = True
+
+    elif subscript is not None:
+      if not isinstance(base, QN):
+        raise ValueError('For subscript QNs, base must be a QN.')
+      self._parent = base
+      self.qn = (base, subscript)
+      self._has_subscript = True
+
+    else:
+      if not isinstance(base, (str, StringLiteral, NumberLiteral)):
+        # TODO(mdan): Require Symbol instead of string.
+        raise ValueError(
+            'For simple QNs, base must be a string or a Literal object.')
+      assert '.' not in base and '[' not in base and ']' not in base
+      self._parent = None
+      self.qn = (base,)
+
+  def is_symbol(self):
+    return isinstance(self.qn[0], str)
+
+  def is_composite(self):
+    return len(self.qn) > 1
+
+  def has_subscript(self):
+    return self._has_subscript
+
+  def has_attr(self):
+    return self._has_attr
+
+  @property
+  def parent(self):
+    if self._parent is None:
+      raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
+    return self._parent
+
+  @property
+  def support_set(self):
+    """Returns the set of simple symbols that this QN relies on.
+
+    This would be the smallest set of symbols necessary for the QN to
+    statically resolve (assuming properties and index ranges are verified
+    at runtime).
+
+    Examples:
+      'a.b' has only one support symbol, 'a'
+      'a[i]' has two roots, 'a' and 'i'
+    """
+    # TODO(mdan): This might be the set of Name nodes in the AST. Track those?
+    roots = set()
+    if self.has_attr():
+      roots.update(self.parent.support_set)
+    elif self.has_subscript():
+      roots.update(self.parent.support_set)
+      roots.update(self.qn[1].support_set)
+    else:
+      roots.add(self)
+    return roots
+
+  def __hash__(self):
+    return hash(self.qn + (self._has_attr, self._has_subscript))
+
+  def __eq__(self, other):
+    return (isinstance(other, QN) and self.qn == other.qn and
+            self.has_subscript() == other.has_subscript() and
+            self.has_attr() == other.has_attr())
+
+  def __str__(self):
+    if self.has_subscript():
+      return str(self.qn[0]) + '[' + str(self.qn[1]) + ']'
+    if self.has_attr():
+      return '.'.join(map(str, self.qn))
+    else:
+      return str(self.qn[0])
+
+  def __repr__(self):
+    return str(self)
+
+  def ssf(self):
+    """Simple symbol form."""
+    ssfs = [n.ssf() if isinstance(n, QN) else n for n in self.qn]
+    ssf_string = ''
+    for i in range(0, len(self.qn) - 1):
+      if self.has_subscript():
+        delimiter = '_sub_'
+      else:
+        delimiter = '_'
+      ssf_string += ssfs[i] + delimiter
+    return ssf_string + ssfs[-1]
+
+  def ast(self):
+    # The caller must adjust the context appropriately.
+    if self.has_subscript():
+      return gast.Subscript(self.parent.ast(), gast.Index(self.qn[-1].ast()),
+                            None)
+    if self.has_attr():
+      return gast.Attribute(self.parent.ast(), self.qn[-1], None)
+
+    base = self.qn[0]
+    if isinstance(base, str):
+      return gast.Name(base, None, None)
+    elif isinstance(base, StringLiteral):
+      return gast.Str(base.value)
+    elif isinstance(base, NumberLiteral):
+      return gast.Num(base.value)
+    else:
+      assert False, ('the constructor should prevent types other than '
+                     'str, StringLiteral and NumberLiteral')
+
+
+class QnResolver(gast.NodeTransformer):
+  """Annotates nodes with QN information.
+
+  Note: Not using NodeAnnos to avoid circular dependencies.
+  """
+
+  def visit_Name(self, node):
+    node = self.generic_visit(node)
+    anno.setanno(node, anno.Basic.QN, QN(node.id))
+    return node
+
+  def visit_Attribute(self, node):
+    node = self.generic_visit(node)
+    if anno.hasanno(node.value, anno.Basic.QN):
+      anno.setanno(node, anno.Basic.QN,
+                   QN(anno.getanno(node.value, anno.Basic.QN), attr=node.attr))
+    return node
+
+  def visit_Subscript(self, node):
+    node = self.generic_visit(node)
+    s = node.slice
+    if not isinstance(s, gast.Index):
+      # TODO(mdan): Support range and multi-dimensional indices.
+      # Continuing silently because some demos use these.
+      return node
+    if isinstance(s.value, gast.Num):
+      subscript = QN(NumberLiteral(s.value.n))
+    elif isinstance(s.value, gast.Str):
+      subscript = QN(StringLiteral(s.value.s))
+    else:
+      subscript = anno.getanno(node.slice.value, anno.Basic.QN)
+    if anno.hasanno(node.value, anno.Basic.QN):
+      anno.setanno(node, anno.Basic.QN,
+                   QN(anno.getanno(node.value, anno.Basic.QN),
+                      subscript=subscript))
+    return node
+
+
+def resolve(node):
+  return QnResolver().visit(node)
diff --git a/tensorflow/contrib/autograph/pyct/qual_names_test.py b/tensorflow/contrib/autograph/pyct/qual_names_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..264afd508cdb847315c486806b531dc1483ef622
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/qual_names_test.py
@@ -0,0 +1,246 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for qual_names module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.qual_names import QN
+from tensorflow.contrib.autograph.pyct.qual_names import resolve
+from tensorflow.python.platform import test
+
+
+class QNTest(test.TestCase):
+
+  def test_basic(self):
+    a = QN('a')
+    self.assertEqual(a.qn, ('a',))
+    self.assertEqual(str(a), 'a')
+    self.assertEqual(a.ssf(), 'a')
+    self.assertEqual(a.ast().id, 'a')
+    self.assertFalse(a.is_composite())
+    with self.assertRaises(ValueError):
+      _ = a.parent
+
+    a_b = QN(a, attr='b')
+    self.assertEqual(a_b.qn, (a, 'b'))
+    self.assertEqual(str(a_b), 'a.b')
+    self.assertEqual(a_b.ssf(), 'a_b')
+    self.assertEqual(a_b.ast().value.id, 'a')
+    self.assertEqual(a_b.ast().attr, 'b')
+    self.assertTrue(a_b.is_composite())
+    self.assertEqual(a_b.parent.qn, ('a',))
+
+  def test_subscripts(self):
+    a = QN('a')
+    b = QN('b')
+    a_sub_b = QN(a, subscript=b)
+    self.assertEqual(a_sub_b.qn, (a, b))
+    self.assertEqual(str(a_sub_b), 'a[b]')
+    self.assertEqual(a_sub_b.ssf(), 'a_sub_b')
+    self.assertEqual(a_sub_b.ast().value.id, 'a')
+    self.assertEqual(a_sub_b.ast().slice.value.id, 'b')
+    self.assertTrue(a_sub_b.is_composite())
+    self.assertTrue(a_sub_b.has_subscript())
+    self.assertEqual(a_sub_b.parent.qn, ('a',))
+
+    c = QN('c')
+    b_sub_c = QN(b, subscript=c)
+    a_sub_b_sub_c = QN(a, subscript=b_sub_c)
+    self.assertEqual(a_sub_b_sub_c.qn, (a, b_sub_c))
+    self.assertTrue(a_sub_b.is_composite())
+    self.assertTrue(a_sub_b_sub_c.is_composite())
+    self.assertTrue(a_sub_b.has_subscript())
+    self.assertTrue(a_sub_b_sub_c.has_subscript())
+    self.assertEqual(b_sub_c.qn, (b, c))
+    self.assertEqual(str(a_sub_b_sub_c), 'a[b[c]]')
+    self.assertEqual(a_sub_b_sub_c.ssf(), 'a_sub_b_sub_c')
+    self.assertEqual(a_sub_b_sub_c.ast().value.id, 'a')
+    self.assertEqual(a_sub_b_sub_c.ast().slice.value.value.id, 'b')
+    self.assertEqual(a_sub_b_sub_c.ast().slice.value.slice.value.id, 'c')
+    self.assertEqual(b_sub_c.ast().slice.value.id, 'c')
+    self.assertEqual(a_sub_b_sub_c.parent.qn, ('a',))
+    with self.assertRaises(ValueError):
+      QN('a', 'b')
+
+  def test_equality(self):
+    a = QN('a')
+    a2 = QN('a')
+    a_b = QN(a, attr='b')
+    self.assertEqual(a2.qn, ('a',))
+    with self.assertRaises(ValueError):
+      _ = a.parent
+
+    a_b2 = QN(a, attr='b')
+    self.assertEqual(a_b2.qn, (a, 'b'))
+    self.assertEqual(a_b2.parent.qn, ('a',))
+
+    self.assertTrue(a2 == a)
+    self.assertFalse(a2 is a)
+
+    self.assertTrue(a_b.parent == a)
+    self.assertTrue(a_b2.parent == a)
+
+    self.assertTrue(a_b2 == a_b)
+    self.assertFalse(a_b2 is a_b)
+    self.assertFalse(a_b2 == a)
+    a_sub_b = QN(a, subscript='b')
+    a_sub_b2 = QN(a, subscript='b')
+    self.assertTrue(a_sub_b == a_sub_b2)
+    self.assertFalse(a_sub_b == a_b)
+
+  def test_nested_attrs_subscripts(self):
+    a = QN('a')
+    b = QN('b')
+    c = QN('c')
+    b_sub_c = QN(b, subscript=c)
+    a_sub_b_sub_c = QN(a, subscript=b_sub_c)
+
+    b_dot_c = QN(b, attr='c')
+    a_sub__b_dot_c = QN(a, subscript=b_dot_c)
+
+    a_sub_b = QN(a, subscript=b)
+    a_sub_b__dot_c = QN(a_sub_b, attr='c')
+
+    a_dot_b = QN(a, attr='b')
+    a_dot_b_sub_c = QN(a_dot_b, subscript=c)
+
+    self.assertEqual(str(a_sub_b_sub_c), 'a[b[c]]')
+    self.assertEqual(str(a_sub__b_dot_c), 'a[b.c]')
+    self.assertEqual(str(a_sub_b__dot_c), 'a[b].c')
+    self.assertEqual(str(a_dot_b_sub_c), 'a.b[c]')
+
+    self.assertNotEqual(a_sub_b_sub_c, a_sub__b_dot_c)
+    self.assertNotEqual(a_sub_b_sub_c, a_sub_b__dot_c)
+    self.assertNotEqual(a_sub_b_sub_c, a_dot_b_sub_c)
+
+    self.assertNotEqual(a_sub__b_dot_c, a_sub_b__dot_c)
+    self.assertNotEqual(a_sub__b_dot_c, a_dot_b_sub_c)
+
+    self.assertNotEqual(a_sub_b__dot_c, a_dot_b_sub_c)
+
+  def test_hashable(self):
+    d = {QN('a'): 'a', QN('b'): 'b'}
+    self.assertEqual(d[QN('a')], 'a')
+    self.assertEqual(d[QN('b')], 'b')
+    self.assertTrue(QN('c') not in d)
+
+  def test_literals(self):
+    a = QN('a')
+    a_sub_str_b = QN(a, subscript=QN(qual_names.StringLiteral('b')))
+    a_sub_b = QN(a, subscript=QN('b'))
+
+    self.assertNotEqual(a_sub_str_b, a_sub_b)
+    self.assertNotEqual(hash(a_sub_str_b), hash(a_sub_b))
+
+    a_sub_three = QN(a, subscript=QN(qual_names.NumberLiteral(3)))
+    self.assertEqual(a_sub_three.ast().slice.value.n, 3)
+
+  def test_support_set(self):
+    a = QN('a')
+    b = QN('b')
+    c = QN('c')
+    a_sub_b = QN(a, subscript=b)
+    a_dot_b = QN(a, attr='b')
+    a_dot_b_dot_c = QN(a_dot_b, attr='c')
+    a_dot_b_sub_c = QN(a_dot_b, subscript=c)
+
+    self.assertSetEqual(a.support_set, set((a,)))
+    self.assertSetEqual(a_sub_b.support_set, set((a, b)))
+    self.assertSetEqual(a_dot_b.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_dot_c.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_sub_c.support_set, set((a, c)))
+
+
+class QNResolverTest(test.TestCase):
+
+  def assertQNStringIs(self, node, qn_str):
+    self.assertEqual(str(anno.getanno(node, anno.Basic.QN)), qn_str)
+
+  def test_resolve(self):
+    samples = """
+      a
+      a.b
+      (c, d.e)
+      [f, (g.h.i)]
+      j(k, l)
+    """
+    nodes = resolve(parser.parse_str(textwrap.dedent(samples)))
+    nodes = tuple(n.value for n in nodes.body)
+
+    self.assertQNStringIs(nodes[0], 'a')
+    self.assertQNStringIs(nodes[1], 'a.b')
+    self.assertQNStringIs(nodes[2].elts[0], 'c')
+    self.assertQNStringIs(nodes[2].elts[1], 'd.e')
+    self.assertQNStringIs(nodes[3].elts[0], 'f')
+    self.assertQNStringIs(nodes[3].elts[1], 'g.h.i')
+    self.assertQNStringIs(nodes[4].func, 'j')
+    self.assertQNStringIs(nodes[4].args[0], 'k')
+    self.assertQNStringIs(nodes[4].args[1], 'l')
+
+  def test_subscript_resolve(self):
+    samples = """
+      x[i]
+      x[i.b]
+      a.b[c]
+      a.b[x.y]
+      a[z[c]]
+      a[b[c[d]]]
+      a[b].c
+      a.b.c[d].e.f
+      a.b[c[d]].e.f
+      a.b[c[d.e.f].g].h
+    """
+    nodes = resolve(parser.parse_str(textwrap.dedent(samples)))
+    nodes = tuple(n.value for n in nodes.body)
+
+    self.assertQNStringIs(nodes[0], 'x[i]')
+    self.assertQNStringIs(nodes[1], 'x[i.b]')
+    self.assertQNStringIs(nodes[2], 'a.b[c]')
+    self.assertQNStringIs(nodes[3], 'a.b[x.y]')
+    self.assertQNStringIs(nodes[4], 'a[z[c]]')
+    self.assertQNStringIs(nodes[5], 'a[b[c[d]]]')
+    self.assertQNStringIs(nodes[6], 'a[b].c')
+    self.assertQNStringIs(nodes[7], 'a.b.c[d].e.f')
+    self.assertQNStringIs(nodes[8], 'a.b[c[d]].e.f')
+    self.assertQNStringIs(nodes[9], 'a.b[c[d.e.f].g].h')
+
+  def test_function_calls(self):
+    samples = """
+      a.b
+      a.b()
+      a().b
+      z[i]
+      z[i]()
+      z()[i]
+    """
+    nodes = resolve(parser.parse_str(textwrap.dedent(samples)))
+    nodes = tuple(n.value for n in nodes.body)
+    self.assertQNStringIs(nodes[0], 'a.b')
+    self.assertQNStringIs(nodes[1].func, 'a.b')
+    self.assertQNStringIs(nodes[2].value.func, 'a')
+    self.assertQNStringIs(nodes[3], 'z[i]')
+    self.assertQNStringIs(nodes[4].func, 'z[i]')
+    self.assertQNStringIs(nodes[5].value.func, 'z')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
similarity index 80%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
rename to tensorflow/contrib/autograph/pyct/static_analysis/BUILD
index fbfce18c60cca4b105e7de3c3ea7b9c3438f6b2a..83f3bafc4217649db6499566d548c1657428ad0b 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
@@ -25,7 +25,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "@gast_archive//:gast",
     ],
 )
@@ -34,9 +34,10 @@ py_test(
     name = "activity_test",
     srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
     ],
@@ -46,9 +47,10 @@ py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -59,7 +61,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/utils",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py b/tensorflow/contrib/autograph/pyct/static_analysis/__init__.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/__init__.py
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
similarity index 66%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index 1c93e1603113d48176af7a97a0f37321e6f67586..2c14c2c8c23810c64446eb9e7ffc5402ce9a2298 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -22,9 +22,10 @@ import copy
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.qual_names import QN
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 # TODO(mdan): Add support for PY3 (e.g. Param vs arg).
 
@@ -70,13 +71,33 @@ class Scope(object):
                                         tuple(self.modified))
 
   def copy_from(self, other):
+    """Recursively copies the contents of this scope from another scope."""
+    if (self.parent is None) != (other.parent is None):
+      raise ValueError('cannot copy scopes of different structures')
+    if other.parent is not None:
+      self.parent.copy_from(other.parent)
+    self.isolated = other.isolated
     self.modified = copy.copy(other.modified)
     self.created = copy.copy(other.created)
     self.used = copy.copy(other.used)
     self.params = copy.copy(other.params)
     self.returned = copy.copy(other.returned)
 
+  @classmethod
+  def copy_of(cls, other):
+    if other.parent is not None:
+      parent = cls.copy_of(other.parent)
+    else:
+      parent = None
+    new_copy = cls(parent)
+    new_copy.copy_from(other)
+    return new_copy
+
   def merge_from(self, other):
+    if (self.parent is None) != (other.parent is None):
+      raise ValueError('cannot merge scopes of different structures')
+    if other.parent is not None:
+      self.parent.merge_from(other.parent)
     self.modified |= other.modified
     self.created |= other.created
     self.used |= other.used
@@ -112,18 +133,18 @@ class Scope(object):
   def mark_param(self, name):
     self.params.add(name)
 
-  def mark_creation(self, name):
+  def mark_creation(self, name, writes_create_symbol=False):
     if name.is_composite():
       parent = name.parent
       if self.has(parent):
-        # This is considered mutation of the parent, not creation.
-        # TODO(mdan): Is that really so?
-        return
+        if not writes_create_symbol:
+          return
       else:
         raise ValueError('Unknown symbol "%s".' % parent)
     self.created.add(name)
 
   def mark_write(self, name):
+    """Marks the given symbol as modified in the current scope."""
     self.modified.add(name)
     if self.isolated:
       self.mark_creation(name)
@@ -141,19 +162,45 @@ class Scope(object):
       self.parent.mark_returned(name)
 
 
-class ActivityAnalizer(transformer.Base):
+class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
   def __init__(self, context, parent_scope):
-    super(ActivityAnalizer, self).__init__(context)
+    super(ActivityAnalyzer, self).__init__(context)
     self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
-  def _track_symbol(self, node):
+  @property
+  def _in_constructor(self):
+    innermost = self.enclosing_entities[-1]
+    if len(self.enclosing_entities) > 1:
+      parent = self.enclosing_entities[-2]
+      return isinstance(parent, gast.ClassDef) and innermost.name == '__init__'
+    return False
+
+  def _node_sets_self_attribute(self, node):
+    if anno.hasanno(node, anno.Basic.QN):
+      qn = anno.getanno(node, anno.Basic.QN)
+      # TODO(mdan): The 'self' argument is not guaranteed to be called 'self'.
+      if qn.has_attr and qn.parent.qn == ('self',):
+        return True
+
+  def _track_symbol(self,
+                    node,
+                    composite_writes_alter_parent=False,
+                    writes_create_symbol=False):
+    # A QN may be missing when we have an attribute (or subscript) on a function
+    # call. Example: a().b
+    if not anno.hasanno(node, anno.Basic.QN):
+      return
     qn = anno.getanno(node, anno.Basic.QN)
 
     if isinstance(node.ctx, gast.Store):
       self.scope.mark_write(qn)
+      if qn.is_composite and composite_writes_alter_parent:
+        self.scope.mark_write(qn.parent)
+      if writes_create_symbol:
+        self.scope.mark_creation(qn, writes_create_symbol=True)
     elif isinstance(node.ctx, gast.Load):
       self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Param):
@@ -182,7 +229,18 @@ class ActivityAnalizer(transformer.Base):
 
   def visit_Attribute(self, node):
     self.generic_visit(node)
-    self._track_symbol(node)
+    if self._in_constructor and self._node_sets_self_attribute(node):
+      self._track_symbol(
+          node, composite_writes_alter_parent=True, writes_create_symbol=True)
+    else:
+      self._track_symbol(node)
+    return node
+
+  def visit_Subscript(self, node):
+    self.generic_visit(node)
+    # Subscript writes (e.g. a[b] = "value") are considered to modify
+    # both the element itself (a[b]) and its parent (a).
+    self._track_symbol(node, composite_writes_alter_parent=True)
     return node
 
   def visit_Print(self, node):
@@ -224,21 +282,46 @@ class ActivityAnalizer(transformer.Base):
     # modifies the parent state causing the other child blocks to be
     # processed incorrectly. So we need to checkpoint the parent scope so that
     # each child sees the same context.
-    before_parent = Scope(None)
-    before_parent.copy_from(self.scope)
+    before_parent = Scope.copy_of(self.scope)
     after_children = []
     for child, scope_name in children:
       self.scope.copy_from(before_parent)
       parent = self._process_block_node(parent, child, scope_name)
-      after_child = Scope(None)
-      after_child.copy_from(self.scope)
+      after_child = Scope.copy_of(self.scope)
       after_children.append(after_child)
     for after_child in after_children:
       self.scope.merge_from(after_child)
     return parent
 
+  def visit_FunctionDef(self, node):
+    if self.scope:
+      qn = QN(node.name)
+      self.scope.mark_write(qn)
+    current_scope = self.scope
+    body_scope = Scope(current_scope, isolated=True)
+    self.scope = body_scope
+    self.generic_visit(node)
+    anno.setanno(node, NodeAnno.BODY_SCOPE, body_scope)
+    self.scope = current_scope
+    return node
+
+  def visit_With(self, node):
+    current_scope = self.scope
+    with_scope = Scope(current_scope, isolated=False)
+    self.scope = with_scope
+    self.generic_visit(node)
+    anno.setanno(node, NodeAnno.BODY_SCOPE, with_scope)
+    self.scope = current_scope
+    return node
+
   def visit_If(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
@@ -253,7 +336,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_While(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
@@ -267,4 +356,4 @@ class ActivityAnalizer(transformer.Base):
 
 
 def resolve(node, context, parent_scope=None):
-  return ActivityAnalizer(context, parent_scope).visit(node)
+  return ActivityAnalyzer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
similarity index 51%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index e1eb954a5efef4d6a00ac492e7c85394d54e28c9..ef79a295bfa3940705d2f341edd4eda74d7d7068 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.qual_names import QN
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.qual_names import QN
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 from tensorflow.python.platform import test
 
 
@@ -45,7 +45,7 @@ class ScopeTest(test.TestCase):
     scope.mark_read(QN('bar'))
     self.assertFalse(scope.has(QN('bar')))
 
-  def test_copy(self):
+  def test_copy_from(self):
     scope = activity.Scope(None)
     scope.mark_write(QN('foo'))
 
@@ -65,6 +65,17 @@ class ScopeTest(test.TestCase):
     self.assertTrue(QN('bar') in scope.created)
     self.assertFalse(QN('bar') in other.created)
 
+  def test_copy_of(self):
+    scope = activity.Scope(None)
+    scope.mark_read(QN('foo'))
+
+    self.assertTrue(QN('foo') in activity.Scope.copy_of(scope).used)
+
+    child_scope = activity.Scope(scope)
+    child_scope.mark_read(QN('bar'))
+
+    self.assertTrue(QN('bar') in activity.Scope.copy_of(child_scope).used)
+
   def test_nesting(self):
     scope = activity.Scope(None)
     scope.mark_write(QN('foo'))
@@ -97,7 +108,7 @@ class ScopeTest(test.TestCase):
     self.assertFalse(QN('a') in child.referenced)
 
 
-class ActivityAnalizerTest(test.TestCase):
+class ActivityAnalyzerTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
     node, source = parser.parse_entity(test_fn)
@@ -108,6 +119,7 @@ class ActivityAnalizerTest(test.TestCase):
         namespace={},
         arg_values=None,
         arg_types=None,
+        owner_type=None,
         recursive=True)
     node = qual_names.resolve(node)
     node = activity.resolve(node, ctx)
@@ -132,10 +144,21 @@ class ActivityAnalizerTest(test.TestCase):
         anno.getanno(node.body[0].body[2].value,
                      NodeAnno.IS_LOCAL))  # b in return b
 
-  def assertScopeIs(self, scope, used, modified, created):
-    self.assertItemsEqual(used, tuple(str(s) for s in scope.used))
-    self.assertItemsEqual(modified, tuple(str(s) for s in scope.modified))
-    self.assertItemsEqual(created, tuple(str(s) for s in scope.created))
+  def assertSymbolSetsAre(self, expected, actual, name):
+    expected = set(expected)
+    actual = set(str(s) for s in actual)
+    self.assertSetEqual(
+        expected, actual, 'for symbol set: %s\n'
+        '  Expected: %s\n'
+        '  Got:      %s\n'
+        '  Missing:  %s\n'
+        '  Extra:    %s\n' % (name.upper(), expected, actual,
+                              expected - actual, actual - expected))
+
+  def assertScopeIsRmc(self, scope, used, modified, created):
+    self.assertSymbolSetsAre(used, scope.used, 'read')
+    self.assertSymbolSetsAre(modified, scope.modified, 'modified')
+    self.assertSymbolSetsAre(created, scope.created, 'created')
 
   def test_print_statement(self):
 
@@ -158,9 +181,9 @@ class ActivityAnalizerTest(test.TestCase):
       print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
     # We basically need to detect which variables are captured by the call
     # arguments.
-    self.assertScopeIs(print_args_scope, ('a', 'b'), (), ())
+    self.assertScopeIsRmc(print_args_scope, ('a', 'b'), (), ())
 
-  def test_call(self):
+  def test_call_args(self):
 
     def test_fn(a):
       b = 0
@@ -172,9 +195,60 @@ class ActivityAnalizerTest(test.TestCase):
     call_node = node.body[0].body[2].value
     # We basically need to detect which variables are captured by the call
     # arguments.
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'b'), (), ())
 
+  def test_call_args_attributes(self):
+
+    def foo(*_):
+      pass
+
+    def test_fn(a):
+      a.c = 0
+      foo(a.b, a.c)
+      return a.d
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[1].value
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
+        ('a', 'a.b', 'a.c'),
+        (),
+        (),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE).parent,
+        ('a', 'a.b', 'a.c', 'a.d', 'foo'),
+        ('a.c',),
+        ('a',),
+    )
+
+  def test_call_args_subscripts(self):
+
+    def foo(*_):
+      pass
+
+    def test_fn(a):
+      b = 1
+      c = 2
+      foo(a[0], a[b])
+      return a[c]
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[2].value
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
+        ('a', 'a[0]', 'a[b]', 'b'),
+        (),
+        (),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE).parent,
+        ('a', 'a[0]', 'a[b]', 'a[c]', 'b', 'c', 'foo'),
+        ('b', 'c'),
+        ('a', 'b', 'c'),
+    )
+
   def test_while(self):
 
     def test_fn(a):
@@ -186,12 +260,14 @@ class ActivityAnalizerTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn)
     while_node = node.body[0].body[1]
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'),
         ('c',))
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
         ('b', 'c'), ('a', 'b', 'c'))
+    self.assertScopeIsRmc(
+        anno.getanno(while_node, NodeAnno.COND_SCOPE), ('b',), (), ())
 
   def test_for(self):
 
@@ -204,9 +280,9 @@ class ActivityAnalizerTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn)
     for_node = node.body[0].body[1]
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'), ('c',))
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(for_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
         ('b', 'c', '_'), ('a', 'b', 'c', '_'))
 
@@ -225,46 +301,161 @@ class ActivityAnalizerTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn)
     if_node = node.body[0].body[0]
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'),
         ('y', 'z'))
     # TODO(mdan): Double check: is it ok to not mark a local symbol as not read?
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent, ('x', 'z', 'u'),
         ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(if_node, NodeAnno.ORELSE_SCOPE), ('x', 'y'),
         ('x', 'y', 'u'), ('y', 'u'))
-    self.assertScopeIs(
+    self.assertScopeIsRmc(
         anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent, ('x', 'z', 'u'),
         ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
 
-  def test_call_with_composite_names(self):
-
-    def foo(*_):
-      pass
+  def test_if_attributes(self):
 
     def test_fn(a):
-      foo(a.b, a.c)
       if a > 0:
-        a.b = 2
+        a.b = -a.c
+        d = 2 * a
+      else:
+        a.b = a.c
+        d = 1
+      return d
+
+    node = self._parse_and_analyze(test_fn)
+    if_node = node.body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE),
+        ('a', 'a.c'),
+        ('a.b', 'd'),
+        ('d',),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
+        ('a', 'a.c'),
+        ('a.b', 'd'),
+        ('d',),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent,
+        ('a', 'a.c', 'd'),
+        ('a.b', 'd'),
+        ('a', 'd'),
+    )
+
+  def test_if_subscripts(self):
+
+    def test_fn(a, b, c, e):
+      if a > 0:
+        a[b] = -a[c]
+        d = 2 * a
       else:
-        d = 2
-        d.e = a.c
-        f = d.e + 1
-        a.c = f
+        a[0] = e
+        d = 1
+      return d
 
     node = self._parse_and_analyze(test_fn)
-    call_node = node.body[0].body[0].value
-    self.assertScopeIs(
-        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'a.b', 'a.c'), (),
-        ())
-    if_node = node.body[0].body[1]
-    self.assertScopeIs(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a',), ('a.b',), ())
-    self.assertScopeIs(
+    if_node = node.body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE),
+        ('a', 'b', 'c', 'a[c]'),
+        ('a', 'a[b]', 'd'),
+        ('d',),
+    )
+    # TODO(mdan): Should subscript writes (a[0] = 1) be considered to read "a"?
+    self.assertScopeIsRmc(
         anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
-        ('a', 'a.c', 'd', 'd.e', 'f'), ('a.c', 'd', 'd.e', 'f'), ('d', 'f'))
+        ('a', 'e'),
+        ('a', 'a[0]', 'd'),
+        ('d',),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent,
+        ('a', 'b', 'c', 'd', 'e', 'a[c]'),
+        ('a', 'd', 'a[b]', 'a[0]'),
+        ('a', 'b', 'c', 'd', 'e'),
+    )
+
+  def test_nested_if(self):
+
+    def test_fn(b):
+      if b > 0:
+        if b < 5:
+          a = b
+        else:
+          a = b * b
+      return a
+
+    node = self._parse_and_analyze(test_fn)
+    inner_if_node = node.body[0].body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(inner_if_node, NodeAnno.BODY_SCOPE), ('b',), ('a',),
+        ('a',))
+    self.assertScopeIsRmc(
+        anno.getanno(inner_if_node, NodeAnno.ORELSE_SCOPE), ('b',), ('a',),
+        ('a',))
+
+  def test_nested_function(self):
+
+    def test_fn(a):
+
+      def f(x):
+        y = x * x
+        return y
+
+      b = a
+      for i in a:
+        c = b
+        b -= f(i)
+      return b, c
+
+    node = self._parse_and_analyze(test_fn)
+    fn_def_node = node.body[0].body[0]
+
+    self.assertScopeIsRmc(
+        anno.getanno(fn_def_node,
+                     NodeAnno.BODY_SCOPE).parent, ('b', 'i', 'f', 'c', 'a'),
+        ('f', 'b', 'c', 'i'), ('f', 'a', 'b', 'c', 'i'))
+    self.assertScopeIsRmc(
+        anno.getanno(fn_def_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('y',), (
+            'x',
+            'y',
+        ))
+
+  def test_constructor_attributes(self):
+
+    class TestClass(object):
+
+      def __init__(self, a):
+        self.b = a
+        self.b.c = 1
+
+    node = self._parse_and_analyze(TestClass)
+    init_node = node.body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(init_node, NodeAnno.BODY_SCOPE),
+        ('self', 'a', 'self.b'),
+        ('self', 'self.b', 'self.b.c'),
+        ('self', 'a', 'self.b'),
+    )
+
+  def test_aug_assign_subscripts(self):
+
+    def test_fn(a):
+      a[0] += 1
+
+    node = self._parse_and_analyze(test_fn)
+    fn_node = node.body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(fn_node, NodeAnno.BODY_SCOPE),
+        ('a',),
+        ('a', 'a[0]'),
+        ('a',),
+    )
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
similarity index 67%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index 2d8e49442364fdd4a4752c8a83a5f3b76117fe57..b929b35b79200b0968c9c4f26b10cda28763773a 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Annotations used by the static analizer."""
+"""Annotations used by the static analyzer."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,23 +28,32 @@ class NoValue(Enum):
 
 
 class NodeAnno(NoValue):
-  """Additionnal annotations used by the static analyzer.
+  """Additional annotations used by the static analyzer.
 
   These are in addition to the basic annotations declared in anno.py.
   """
 
   # Symbols
-
-  IS_LOCAL = 'Symbol is local to the function scope being analized.'
-  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  # These flags are boolean.
+  IS_LOCAL = 'Symbol is local to the function scope being analyzed.'
+  IS_PARAM = 'Symbol is a parameter to the function being analyzed.'
   IS_MODIFIED_SINCE_ENTRY = (
       'Symbol has been explicitly replaced in the current function scope.')
 
   # Scopes
+  # Scopes are represented by objects of type activity.Scope.
   ARGS_SCOPE = 'The scope for the argument list of a function call.'
+  COND_SCOPE = 'The scope for the test node of a conditional statement.'
   BODY_SCOPE = (
       'The scope for the main body of a statement (True branch for if '
       'statements, main body for loops).')
   ORELSE_SCOPE = (
       'The scope for the orelse body of a statement (False branch for if '
       'statements, orelse body for loops).')
+
+  # Type and Value annotations
+  # Type annotations are represented by objects of type type_info.Type.
+  STATIC_INFO = (
+      'The type or value information that should be asserted about the entity '
+      'referenced by the symbol holding this annotation, irrespective of the '
+      'execution context.')
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
similarity index 85%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
index 9c0a9a9e74eccb3d22840032e8f0c2b81e051e7e..53ae15459097baff918432a493edd7360ebf209d 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
@@ -25,9 +25,9 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class LiveValueResolver(transformer.Base):
@@ -55,11 +55,19 @@ class LiveValueResolver(transformer.Base):
       if not symbol_is_local and not symbol_is_param:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
-          # TODO(mdan): Could live values have FQNs? i.e. 'a'.join()
         elif node.id in self.context.namespace:
           obj = self.context.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
-          anno.setanno(node, 'fqn', (obj.__name__,))
+          if hasattr(obj, '__name__'):
+            anno.setanno(node, 'fqn', (obj.__name__,))
+          elif hasattr(obj, '__class__'):
+            obj_class = obj.__class__
+            anno.setanno(node, 'fqn',
+                         (obj_class.__module__, obj_class.__name__))
+          else:
+            # If the symbol value is for example a primitive, then it will not
+            # have a name.
+            pass
         else:
           pass
           # TODO(mdan): Should we raise an error here?
@@ -86,6 +94,7 @@ class LiveValueResolver(transformer.Base):
       if not hasattr(parent_object, node.attr):
         raise AttributeError('%s has no attribute %s' % (parent_object,
                                                          node.attr))
+      anno.setanno(node, 'parent_type', type(parent_object))
       anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
       anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,))
     # TODO(mdan): Investigate the role built-in annotations can play here.
@@ -96,6 +105,7 @@ class LiveValueResolver(transformer.Base):
         # This would not hold for dynamic members like function attributes.
         # For the dynamic case, we simply leave the node without an annotation,
         # and let downstream consumers figure out what to do.
+        anno.setanno(node, 'parent_type', parent_type)
         anno.setanno(node, 'live_val', getattr(parent_type, node.attr))
         anno.setanno(node, 'fqn',
                      anno.getanno(node.value, 'type_fqn') + (node.attr,))
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/live_values_test.py
similarity index 75%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/live_values_test.py
index 9f64689401e3594a77fbdd7b6f02880bd6e90492..69e428bde109ed43c3cdda1a94970a832dc47852 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/live_values_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+import six
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
@@ -46,6 +48,7 @@ class LiveValuesResolverTest(test.TestCase):
         namespace=namespace,
         arg_values=None,
         arg_types=arg_types,
+        owner_type=None,
         recursive=True)
     node = qual_names.resolve(node)
     node = activity.resolve(node, ctx)
@@ -56,13 +59,30 @@ class LiveValuesResolverTest(test.TestCase):
 
   def test_literals(self):
 
+    a = None
+
     def test_fn():
-      return Foo  # pylint: disable=undefined-variable
+      return a
 
-    node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'})
+    node = self._parse_and_analyze(test_fn, {}, literals={'a': 'bar'})
     retval_node = node.body[0].body[0].value
     self.assertEquals('bar', anno.getanno(retval_node, 'live_val'))
 
+  def test_primitive_values(self):
+
+    a = None
+
+    def test_fn():
+      return a
+
+    node = self._parse_and_analyze(test_fn, {'a': True})
+    retval_node = node.body[0].body[0].value
+    if six.PY2:
+      self.assertEqual(
+          anno.getanno(retval_node, 'fqn'), ('__builtin__', 'bool'))
+    else:
+      self.assertEqual(anno.getanno(retval_node, 'fqn'), ('builtins', 'bool'))
+
   def test_namespace(self):
 
     def foo():
@@ -102,6 +122,7 @@ class LiveValuesResolverTest(test.TestCase):
         arg_types={'self': (TestClass.__name__, TestClass)})
     func_node = node.body[0].body[0].value.func
     self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val'))
+    self.assertEquals(TestClass, anno.getanno(func_node, 'parent_type'))
     self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn'))
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
similarity index 52%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 8203bda0f9a792a5b24b9abb25d8f39b61625748..c00946f9c41bc68d5c638d71f356b484db1286d1 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -14,22 +14,45 @@
 # ==============================================================================
 """Type resolution.
 
+This analyzer uses known live values to further infer object types. This
+may include for instance constructed objects and object member functions.
+
+In addition, the analyzer will also process annotations for TF (staged) type
+annotations.
+
 Requires annotations generated by LiveValuesResolver.
 """
 
+# TODO(mdan): This would be more robust with a CFG.
+# Situations with multiple reaching modifications (e.g. modified inside and
+# outside a control flow statement) should be more robustly detected and
+# analyzed.
+
+# TODO(mdan): Look into using Python AST's type annotation fields instead.
+# It would be desirable to use that mechanism if we can.
+# Some caveats to consider: We may need to annotate other nodes like
+# Attribute. It may also not be feasible for us to faithfully to replicate
+# PY3's type annotations where it isn't available. It would also require us
+# to design rigorous type definitions that can accommodate Python types
+# as well as TensorFLow dtypes and shapes.
+
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
 
+# TODO(mdan): Remove the duplication between this and activity.py.
+# In particular, the symbol definitions we track here could as well be tracked
+# there because they follow the same rules for visibility.
 class Scope(object):
-  """Encloses symbol value references.
+  """Tracks symbol value references.
 
   Attributes:
     values: A dict mapping string to gast.Node, containing the value that was
@@ -79,20 +102,16 @@ class TypeInfoResolver(transformer.Base):
   def __init__(self, context):
     super(TypeInfoResolver, self).__init__(context)
     self.scope = Scope(None)
-    self.function_level = 0
 
   def visit_FunctionDef(self, node):
     self.scope = Scope(self.scope)
-    self.function_level += 1
-    self.generic_visit(node)
-    self.function_level -= 1
+    node = self.generic_visit(node)
     self.scope = self.scope.parent
     return node
 
   def _visit_block(self, block):
     self.scope = Scope(self.scope)
-    for i, n in enumerate(block):
-      block[i] = self.generic_visit(n)
+    block = self.visit_block(block)
     self.scope = self.scope.parent
     return block
 
@@ -117,7 +136,7 @@ class TypeInfoResolver(transformer.Base):
 
   def _process_function_arg(self, arg_name):
     str_name = str(arg_name)
-    if self.function_level == 1 and str_name in self.context.arg_types:
+    if len(self.enclosing_entities) == 1 and str_name in self.context.arg_types:
       # Forge a node to hold the type information, so that method calls on
       # it can resolve the type.
       type_holder = arg_name.ast()
@@ -138,14 +157,18 @@ class TypeInfoResolver(transformer.Base):
     elif isinstance(node.ctx, gast.Load) and self.scope.hasval(qn):
       # E.g. if we had
       # a = b
-      # then for future references to `a` we should have traced_source = `b`
-      traced_source = self.scope.getval(qn)
-      if anno.hasanno(traced_source, 'type'):
-        anno.setanno(node, 'type', anno.getanno(traced_source, 'type'))
-        anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn'))
+      # then for future references to `a` we should have definition = `b`
+      definition = self.scope.getval(qn)
+      if anno.hasanno(definition, 'type'):
+        anno.setanno(node, 'type', anno.getanno(definition, 'type'))
+        anno.setanno(node, 'type_fqn', anno.getanno(definition, 'type_fqn'))
+      if anno.hasanno(definition, 'element_type'):
+        anno.setanno(node, 'element_type',
+                     anno.getanno(definition, 'element_type'))
     return node
 
   def _process_variable_assignment(self, source, targets):
+    # Special case: constructors.
     if isinstance(source, gast.Call):
       func = source.func
       if anno.hasanno(func, 'live_val'):
@@ -158,16 +181,25 @@ class TypeInfoResolver(transformer.Base):
           # We can have a whitelist of no-side-effects constructors.
           # We can also step inside the constructor and further analyze.
 
-    for t in targets:
-      if isinstance(t, gast.Tuple):
-        for i, e in enumerate(t.elts):
-          self.scope.setval(
-              anno.getanno(e, anno.Basic.QN),
-              gast.Subscript(source, gast.Index(i), ctx=gast.Store()))
-      elif isinstance(t, (gast.Name, gast.Attribute)):
-        self.scope.setval(anno.getanno(t, anno.Basic.QN), source)
+    # Multiple targets mean multiple assignment.
+    for target in targets:
+      # Tuple target means unpacking.
+      if isinstance(target, (gast.Tuple, gast.List)):
+        for i, target_item in enumerate(target.elts):
+          # Two cases here:
+          #   1. Static unpacking, e.g. a, b = c, d
+          #   2. Dynamic unpacking, e.g. a, b = c
+          # The former case is optimized away.
+          if isinstance(source, (gast.Tuple, gast.List)):
+            source_item = source.elts[i]
+          else:
+            source_item = gast.Subscript(source, gast.Index(i), ctx=None)
+          self._process_variable_assignment(source_item, (target_item,))
+      elif isinstance(target, (gast.Name, gast.Attribute)):
+        target_symbol = anno.getanno(target, anno.Basic.QN)
+        self.scope.setval(target_symbol, source)
       else:
-        raise ValueError('Dont know how to handle assignment to %s' % t)
+        raise ValueError('assignment target has unknown type: %s' % target)
 
   def visit_With(self, node):
     for wi in node.items:
@@ -181,6 +213,41 @@ class TypeInfoResolver(transformer.Base):
     self._process_variable_assignment(node.value, node.targets)
     return node
 
+  def visit_Call(self, node):
+    if anno.hasanno(node.func, 'live_val'):
+      # Symbols targeted by the "set_type" marker function are assigned the data
+      # type that it specified.
+      if (anno.getanno(node.func, 'live_val') is
+          self.context.type_annotation_func):
+
+        if len(node.args) != 2:
+          raise ValueError('"%s" must have exactly two parameters'
+                           % self.context.type_annotation_func)
+        target_arg, type_arg = node.args
+        if not anno.hasanno(target_arg, anno.Basic.QN):
+          raise ValueError('the first argument of "%s" must by a symbol'
+                           % self.context.type_annotation_func)
+        if isinstance(type_arg, gast.Str):
+          element_type = type_arg.s
+        elif isinstance(type_arg, gast.Num):
+          element_type = type_arg.n
+        else:
+          if not anno.hasanno(type_arg, 'live_val'):
+            raise ValueError(
+                'the second argument of "%s" must be statically resolvable' %
+                self.context.type_annotation_func)
+          element_type = anno.getanno(type_arg, 'live_val')
+
+        target_symbol = anno.getanno(target_arg, anno.Basic.QN)
+        # Find the definition of this symbol and annotate it with the given
+        # data type. That in turn will cause future uses of the symbol
+        # to receive the same type annotation.
+        definition = self.scope.getval(target_symbol)
+        anno.setanno(node, 'element_type', element_type)
+        anno.setanno(definition, 'element_type', element_type)
+        # TODO(mdan): Should we update references between definition and here?
+    return self.generic_visit(node)
+
 
 def resolve(node, context):
   return TypeInfoResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
similarity index 69%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
index 3659f949db9910534870d8dd9e42fd4ee8297253..46b7701624a43073fb7cc612d2678ab851513d91 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
@@ -18,13 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
 from tensorflow.python.client import session
 from tensorflow.python.platform import test
 from tensorflow.python.training import training
@@ -56,7 +57,10 @@ class ScopeTest(test.TestCase):
 
 class TypeInfoResolverTest(test.TestCase):
 
-  def _parse_and_analyze(self, test_fn, namespace, arg_types=None):
+  def _parse_and_analyze(self,
+                         test_fn,
+                         namespace,
+                         arg_types=None):
     node, source = parser.parse_entity(test_fn)
     ctx = context.EntityContext(
         namer=None,
@@ -65,7 +69,9 @@ class TypeInfoResolverTest(test.TestCase):
         namespace=namespace,
         arg_values=None,
         arg_types=arg_types,
-        recursive=True)
+        owner_type=None,
+        recursive=True,
+        type_annotation_func=utils.set_element_type)
     node = qual_names.resolve(node)
     node = activity.resolve(node, ctx)
     node = live_values.resolve(node, ctx, {})
@@ -174,6 +180,62 @@ class TypeInfoResolverTest(test.TestCase):
     method_call = node.body[0].body[1].value.func
     self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
+  def test_type_annotation(self):
+
+    class Foo(object):
+      pass
+
+    def test_fn():
+      f = []
+      f = utils.set_element_type(f, Foo)
+      return f
+
+    node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'utils': utils})
+    f_def = node.body[0].body[0].value
+    self.assertEqual(anno.getanno(f_def, 'element_type'), Foo)
+    f_ref = node.body[0].body[1].value
+    self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo)
+
+  def test_nested_unpacking(self):
+
+    class Foo(object):
+      pass
+
+    class Bar(object):
+      pass
+
+    def test_fn():
+      a, (b, c) = (Foo(), (Bar(), Foo()))
+      return a, b, c
+
+    node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'Bar': Bar})
+    a, b, c = node.body[0].body[1].value.elts
+    self.assertEquals(Foo, anno.getanno(a, 'type'))
+    self.assertEquals(Bar, anno.getanno(b, 'type'))
+    self.assertEquals(Foo, anno.getanno(c, 'type'))
+    self.assertFalse(anno.hasanno(a, 'live_val'))
+    self.assertFalse(anno.hasanno(b, 'live_val'))
+    self.assertFalse(anno.hasanno(c, 'live_val'))
+
+  def test_inner_scope(self):
+
+    def test_fn():
+      a = []
+      utils.set_element_type(a, 1)
+      for _ in a:
+        b = []
+        utils.set_element_type(b, 2)
+        return a, b
+
+    node = self._parse_and_analyze(test_fn, {'utils': utils})
+    a, b = node.body[0].body[2].body[2].value.elts
+    self.assertEquals(1, anno.getanno(a, 'element_type'))
+    self.assertEquals(2, anno.getanno(b, 'element_type'))
+    self.assertFalse(anno.hasanno(a, 'type'))
+    self.assertFalse(anno.hasanno(b, 'type'))
+    self.assertFalse(anno.hasanno(a, 'live_val'))
+    self.assertFalse(anno.hasanno(b, 'live_val'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/autograph/pyct/templates.py
similarity index 56%
rename from tensorflow/contrib/py2tf/pyct/templates.py
rename to tensorflow/contrib/autograph/pyct/templates.py
index 6ee6c0c5ceb70d87779ee313670135cadc5214b5..baf7923fff7c786c1abd05e11fa6ffdb8c8f0912 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/autograph/pyct/templates.py
@@ -26,9 +26,9 @@ import textwrap
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
 
 
 class ReplaceTransformer(gast.NodeTransformer):
@@ -44,8 +44,6 @@ class ReplaceTransformer(gast.NodeTransformer):
     self.replacements = replacements
     self.in_replacements = False
 
-  # TODO(mdan): Make a more detailed pass and clean up if needed.
-
   def visit_Expr(self, node):
     if (isinstance(node.value, gast.Name) and
         node.value.id in self.replacements):
@@ -53,17 +51,66 @@ class ReplaceTransformer(gast.NodeTransformer):
     self.generic_visit(node)
     return node
 
+  def visit_keyword(self, node):
+    if node.arg in self.replacements:
+      repl = self.replacements[node.arg]
+      if isinstance(repl, gast.keyword):
+        return repl
+      elif (isinstance(repl, (list, tuple)) and repl and
+            all(isinstance(r, gast.keyword) for r in repl)):
+        return repl
+      # TODO(mdan): We may allow replacing with a string as well.
+      # For example, if one wanted to replace foo with bar in foo=baz, then
+      # we could allow changing just node arg, so that we end up with bar=baz.
+      raise ValueError(
+          'a keyword argument may only be replaced by another keyword or a '
+          'non-empty list of keywords. Found: %s' % repl)
+    return self.generic_visit(node)
+
   def visit_FunctionDef(self, node):
     node = self.generic_visit(node)
     if node.name in self.replacements:
       repl = self.replacements[node.name]
       if not isinstance(repl, (gast.Name, ast.Name)):
         raise ValueError(
-            'A function name can only be replaced by a Name node. Found: %s' %
+            'a function name can only be replaced by a Name node. Found: %s' %
             repl)
       node.name = repl.id
     return node
 
+  def _check_has_context(self, node):
+    if not node.ctx:
+      raise ValueError('node %s is missing ctx value' % node)
+
+  def _check_inner_children_have_context(self, node):
+    if isinstance(node, gast.Attribute):
+      self._check_inner_children_have_context(node.value)
+      self._check_has_context(node)
+    elif isinstance(node, gast.Tuple):
+      for e in node.elts:
+        self._check_inner_children_have_context(e)
+      self._check_has_context(node)
+    elif isinstance(node, gast.Dict):
+      for e in node.keys:
+        self._check_inner_children_have_context(e)
+      for e in node.values:
+        self._check_inner_children_have_context(e)
+    elif isinstance(node, gast.Subscript):
+      self._check_inner_children_have_context(node.value)
+      self._check_inner_children_have_context(node.slice)
+    elif isinstance(node, gast.Slice):
+      self._check_inner_children_have_context(node.lower)
+      if node.upper:
+        self._check_inner_children_have_context(node.upper)
+      if node.step:
+        self._check_inner_children_have_context(node.step)
+    elif isinstance(node, gast.Name):
+      self._check_has_context(node)
+    elif isinstance(node, (gast.Str, gast.Num)):
+      pass
+    else:
+      raise ValueError('unexpected node type "%s"' % node)
+
   def _set_inner_child_context(self, node, ctx):
     if isinstance(node, gast.Attribute):
       self._set_inner_child_context(node.value, ctx)
@@ -74,11 +121,40 @@ class ReplaceTransformer(gast.NodeTransformer):
       node.ctx = ctx
     elif isinstance(node, gast.Name):
       node.ctx = ctx
+    elif isinstance(node, gast.Call):
+      self._set_inner_child_context(node.func, ctx)
+      # We may be able to override these to Load(), but for now it's simpler
+      # to just assert that they're set.
+      for a in node.args:
+        self._check_inner_children_have_context(a)
+      for k in node.keywords:
+        self._check_inner_children_have_context(k.value)
+    elif isinstance(node, gast.Dict):
+      # We may be able to override these to Load(), but for now it's simpler
+      # to just assert that they're set.
+      for e in node.keys:
+        self._check_inner_children_have_context(e)
+      for e in node.values:
+        self._check_inner_children_have_context(e)
+    elif isinstance(node, gast.Subscript):
+      self._set_inner_child_context(node.value, ctx)
+      self._check_inner_children_have_context(node.slice)
     elif isinstance(node, (gast.Str, gast.Num)):
       pass
     else:
       raise ValueError('unexpected node type "%s"' % node)
 
+  def visit_Attribute(self, node):
+    node = self.generic_visit(node)
+    if node.attr not in self.replacements:
+      return node
+    repl = self.replacements[node.attr]
+    if not isinstance(repl, gast.Name):
+      raise ValueError(
+          'An attribute can only be replaced by a Name node. Found: %s' % repl)
+    node.attr = repl.id
+    return node
+
   def visit_Name(self, node):
     if node.id not in self.replacements:
       return node
@@ -154,3 +230,17 @@ def replace(template, **replacements):
   if isinstance(results, list):
     return [qual_names.resolve(r) for r in results]
   return qual_names.resolve(results)
+
+
+def replace_as_expression(template, **replacements):
+  """Variant of replace that generates expressions, instead of code blocks."""
+  replacement = replace(template, **replacements)
+  if len(replacement) != 1:
+    raise ValueError(
+        'single expression expected; for more general templates use replace')
+  node = replacement[0]
+  if not isinstance(node, gast.Expr):
+    raise ValueError(
+        'the template is expected to generate an expression node; instead '
+        'found %s' % node)
+  return node.value
diff --git a/tensorflow/contrib/autograph/pyct/templates_test.py b/tensorflow/contrib/autograph/pyct/templates_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a01f8bf04c4faa6ec1779e0fb306155d99f5bd09
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/templates_test.py
@@ -0,0 +1,168 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for templates module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import imp
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.python.platform import test
+
+
+class TemplatesTest(test.TestCase):
+
+  def test_replace_tuple(self):
+    template = """
+      def test_fn(a, c):
+        return b,
+    """
+
+    node = templates.replace(template, b=('a', 'c'))[0]
+    result, _ = compiler.ast_to_object(node)
+
+    self.assertEquals((2, 3), result.test_fn(2, 3))
+
+  def test_replace_variable(self):
+    template = """
+      def test_fn(a):
+        a += 1
+        a = 2 * a + 1
+        return b
+    """
+
+    node = templates.replace(template, a='b')[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(7, result.test_fn(2))
+
+  def test_replace_function_name(self):
+    template = """
+      def fname(a):
+        a += 1
+        a = 2 * a + 1
+        return a
+    """
+
+    node = templates.replace(template, fname='test_fn')[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(7, result.test_fn(2))
+
+  def test_replace_code_block(self):
+    template = """
+      def test_fn(a):
+        block
+        return a
+    """
+
+    node = templates.replace(
+        template,
+        block=[
+            gast.Assign([
+                gast.Name('a', None, None)
+            ], gast.BinOp(gast.Name('a', None, None), gast.Add(), gast.Num(1))),
+        ] * 2)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(3, result.test_fn(1))
+
+  def test_replace_attribute(self):
+    template = """
+      def test_fn(a):
+        return a.foo
+    """
+
+    node = templates.replace(template, foo='b')[0]
+    result, _ = compiler.ast_to_object(node)
+    mod = imp.new_module('test')
+    mod.b = 3
+    self.assertEquals(3, result.test_fn(mod))
+
+    with self.assertRaises(ValueError):
+      templates.replace(template, foo=1)
+
+  def test_replace_call_keyword(self):
+    template = """
+      def test_fn():
+        def f(a, d, f):
+          return a + d + f
+        return f(1, kws=None)
+    """
+
+    source = parser.parse_expression('f(d=3, f=5)')
+    node = templates.replace(template, kws=source.keywords)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(9, result.test_fn())
+
+    with self.assertRaises(ValueError):
+      templates.replace(template, kws=[])
+      templates.replace(template, kws=1)
+
+  def test_replace_name_with_call(self):
+    template = """
+      def test_fn():
+        b = 5
+        def g(a):
+          return 3 * a
+        def f():
+          return g
+        return foo
+    """
+
+    source = parser.parse_expression('f()(b)')
+    node = templates.replace(template, foo=source)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(15, result.test_fn())
+
+  def test_replace_name_with_dict(self):
+    template = """
+      def test_fn():
+        return foo['bar']
+    """
+
+    source = parser.parse_expression('{\'bar\': 3}')
+    node = templates.replace(template, foo=source)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(3, result.test_fn())
+
+  def replace_as_expression(self):
+    template = """
+      foo(a)
+    """
+
+    node = templates.replace(template, foo='bar', a='baz')
+    self.assertTrue(node is gast.Call)
+    self.assertEqual(node.func.id, 'bar')
+    self.assertEqual(node.func.args[0].id, 'baz')
+
+  def replace_as_expression_restrictions(self):
+    template = """
+      foo(a)
+      bar(b)
+    """
+    with self.assertRaises(ValueError):
+      templates.replace_as_expression(template)
+    with self.assertRaises(ValueError):
+      templates.replace('')
+    with self.assertRaises(ValueError):
+      templates.replace('a = b')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db6cc0adfad90ffc1a6bbcadfc80215688d271e
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -0,0 +1,147 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A node transformer that includes utilities for SCT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import gast
+import six
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import pretty_printer
+
+
+class AutographParseError(SyntaxError):
+  pass
+
+
+def try_ast_to_source(node):
+  try:
+    return compiler.ast_to_source(node)
+  except AssertionError:
+    return '<could not convert AST to source>'
+
+
+class Base(gast.NodeTransformer):
+  """Base class for specialized transformers.
+
+  Scope-local state tracking: to keep state across nodes, at the level of
+  (possibly nested) scopes, use enter/exit_local_scope and set/get_local.
+  You must call enter/exit_local_scope manually, but the transformer detects
+  when they are not properly paired.
+  """
+
+  def __init__(self, context):
+    """Initialize the transformer. Subclasses should call this.
+
+    Args:
+      context: An EntityContext.
+    """
+    self._lineno = 0
+    self._col_offset = 0
+    self.context = context
+    self._enclosing_entities = []
+
+    # A stack that allows keeping mutable, scope-local state where scopes may be
+    # nested. For example, it can be used to track the usage of break
+    # statements in each loop, where loops may be nested.
+    self._local_scope_state = []
+    self.enter_local_scope()
+
+  @property
+  def enclosing_entities(self):
+    return tuple(self._enclosing_entities)
+
+  @property
+  def locel_scope_level(self):
+    return len(self._local_scope_state)
+
+  def enter_local_scope(self):
+    self._local_scope_state.append({})
+
+  def exit_local_scope(self):
+    return self._local_scope_state.pop()
+
+  def set_local(self, name, value):
+    self._local_scope_state[-1][name] = value
+
+  def get_local(self, name, default=None):
+    return self._local_scope_state[-1].get(name, default)
+
+  def debug_print(self, node):
+    """Helper method useful for debugging."""
+    if __debug__:
+      print(pretty_printer.fmt(node))
+    return node
+
+  def visit_block(self, nodes):
+    """Helper equivalent to generic_visit, but for node lists."""
+    results = []
+    for node in nodes:
+      replacement = self.visit(node)
+      if replacement:
+        if isinstance(replacement, (list, tuple)):
+          results.extend(replacement)
+        else:
+          results.append(replacement)
+    return results
+
+  def visit(self, node):
+    source_code = self.context.source_code
+    source_file = self.context.source_file
+    did_enter_function = False
+    local_scope_state_size = len(self._local_scope_state)
+
+    try:
+      if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
+        self._enclosing_entities.append(node)
+        did_enter_function = True
+
+      if source_code and hasattr(node, 'lineno'):
+        self._lineno = node.lineno
+        self._col_offset = node.col_offset
+      if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+        return node
+      return super(Base, self).visit(node)
+
+    except (ValueError, AttributeError, KeyError, NotImplementedError,
+            AssertionError) as e:
+      msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % (
+          e.__class__.__name__, str(e), try_ast_to_source(node),
+          pretty_printer.fmt(node, color=False))
+      if source_code:
+        line = source_code.splitlines()[self._lineno - 1]
+      else:
+        line = '<no source available>'
+      six.reraise(AutographParseError,
+                  AutographParseError(
+                      msg,
+                      (source_file, self._lineno, self._col_offset + 1, line)),
+                  sys.exc_info()[2])
+    finally:
+      if did_enter_function:
+        self._enclosing_entities.pop()
+
+      if local_scope_state_size != len(self._local_scope_state):
+        raise AssertionError(
+            'Inconsistent local scope stack. Before entering node %s, the'
+            ' stack had length %d, after exit it has length %d. This'
+            ' indicates enter_local_scope and exit_local_scope are not'
+            ' well paired.')
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f96b0dc377521a482d347436caa98633a0a32c8a
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -0,0 +1,179 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for templates module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.python.platform import test
+
+
+class TransformerTest(test.TestCase):
+
+  def _context_for_nodetesting(self):
+    return context.EntityContext(
+        namer=None,
+        source_code=None,
+        source_file=None,
+        namespace=None,
+        arg_values=None,
+        arg_types=None,
+        owner_type=None,
+        recursive=False)
+
+  def test_entity_scope_tracking(self):
+
+    class TestTransformer(transformer.Base):
+
+      # The choice of note to assign to is arbitrary. Using Assign because it's
+      # easy to find in the tree.
+      def visit_Assign(self, node):
+        anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
+        return self.generic_visit(node)
+
+      # This will show up in the lambda function.
+      def visit_BinOp(self, node):
+        anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
+        return self.generic_visit(node)
+
+    tr = TestTransformer(self._context_for_nodetesting())
+
+    def test_function():
+      a = 0
+
+      class TestClass(object):
+
+        def test_method(self):
+          b = 0
+          def inner_function(x):
+            c = 0
+            d = lambda y: (x + y)
+            return c, d
+          return b, inner_function
+      return a, TestClass
+
+    node, _ = parser.parse_entity(test_function)
+    node = tr.visit(node)
+
+    test_function_node = node.body[0]
+    test_class = test_function_node.body[1]
+    test_method = test_class.body[0]
+    inner_function = test_method.body[1]
+    lambda_node = inner_function.body[1].value
+
+    a = test_function_node.body[0]
+    b = test_method.body[0]
+    c = inner_function.body[0]
+    lambda_expr = lambda_node.body
+
+    self.assertEqual(
+        (test_function_node,), anno.getanno(a, 'enclosing_entities'))
+    self.assertEqual((test_function_node, test_class, test_method),
+                     anno.getanno(b, 'enclosing_entities'))
+    self.assertEqual(
+        (test_function_node, test_class, test_method, inner_function),
+        anno.getanno(c, 'enclosing_entities'))
+    self.assertEqual((test_function_node, test_class, test_method,
+                      inner_function, lambda_node),
+                     anno.getanno(lambda_expr, 'enclosing_entities'))
+
+  def test_statement_info_stack(self):
+
+    class TestTransformer(transformer.Base):
+
+      # Extract all string constants from the block.
+      def visit_Str(self, node):
+        self.set_local('string', self.get_local('string', default='') + node.s)
+        return self.generic_visit(node)
+
+      def _annotate_result(self, node):
+        self.enter_local_scope()
+        node = self.generic_visit(node)
+        anno.setanno(node, 'test', self.get_local('string'))
+        self.exit_local_scope()
+        return node
+
+      def visit_While(self, node):
+        return self._annotate_result(node)
+
+      def visit_For(self, node):
+        return self._annotate_result(node)
+
+    tr = TestTransformer(self._context_for_nodetesting())
+
+    def test_function(a):
+      """Docstring."""
+      assert a == 'This should not be counted'
+      for i in range(3):
+        _ = 'a'
+        if i > 2:
+          return 'b'
+        else:
+          _ = 'c'
+          while True:
+            raise '1'
+      return 'nor this'
+
+    node, _ = parser.parse_entity(test_function)
+    node = tr.visit(node)
+
+    for_node = node.body[0].body[2]
+    while_node = for_node.body[1].orelse[1]
+
+    self.assertFalse(anno.hasanno(for_node, 'string'))
+    self.assertEqual('abc', anno.getanno(for_node, 'test'))
+    self.assertFalse(anno.hasanno(while_node, 'string'))
+    self.assertEqual('1', anno.getanno(while_node, 'test'))
+
+  def test_statement_info_stack_checks_integrity(self):
+
+    class TestTransformer(transformer.Base):
+
+      def visit_If(self, node):
+        self.enter_local_scope()
+        return self.generic_visit(node)
+
+      def visit_For(self, node):
+        node = self.generic_visit(node)
+        self.exit_local_scope()
+        return node
+
+    tr = TestTransformer(self._context_for_nodetesting())
+
+    def no_exit(a):
+      if a > 0:
+        print(a)
+      return None
+
+    node, _ = parser.parse_entity(no_exit)
+    with self.assertRaises(AssertionError):
+      tr.visit(node)
+
+    def no_entry(a):
+      for _ in a:
+        print(a)
+
+    node, _ = parser.parse_entity(no_entry)
+    with self.assertRaises(AssertionError):
+      tr.visit(node)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/autograph/utils/BUILD
similarity index 88%
rename from tensorflow/contrib/py2tf/utils/BUILD
rename to tensorflow/contrib/autograph/utils/BUILD
index c2fdd40707775783140390e4b5c0186c9c3e562e..d3a1b9468892531cbc51bc13de66ef595f1a95f8 100644
--- a/tensorflow/contrib/py2tf/utils/BUILD
+++ b/tensorflow/contrib/autograph/utils/BUILD
@@ -20,26 +20,31 @@ py_library(
     name = "utils",
     srcs = [
         "__init__.py",
+        "builtins.py",
         "context_managers.py",
         "misc.py",
         "multiple_dispatch.py",
-        "printing.py",
         "py_func.py",
         "tensor_list.py",
+        "testing.py",
         "type_check.py",
+        "type_hints.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
+        "//tensorflow/python:list_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "@six_archive//:six",
     ],
 )
 
 py_test(
-    name = "context_managers_test",
-    srcs = ["context_managers_test.py"],
+    name = "builtins_test",
+    srcs = ["builtins_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
@@ -47,8 +52,8 @@ py_test(
 )
 
 py_test(
-    name = "misc_test",
-    srcs = ["misc_test.py"],
+    name = "context_managers_test",
+    srcs = ["context_managers_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":utils",
@@ -57,8 +62,8 @@ py_test(
 )
 
 py_test(
-    name = "multiple_dispatch_test",
-    srcs = ["multiple_dispatch_test.py"],
+    name = "misc_test",
+    srcs = ["misc_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":utils",
@@ -67,8 +72,8 @@ py_test(
 )
 
 py_test(
-    name = "py_func_test",
-    srcs = ["py_func_test.py"],
+    name = "multiple_dispatch_test",
+    srcs = ["multiple_dispatch_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":utils",
@@ -77,9 +82,10 @@ py_test(
 )
 
 py_test(
-    name = "printing_test",
-    srcs = ["printing_test.py"],
+    name = "py_func_test",
+    srcs = ["py_func_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/utils/__init__.py b/tensorflow/contrib/autograph/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..817d4126d106487e1fea3e442712a69bbfccd7f3
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility module that contains APIs usable in the generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils.builtins import dynamic_builtin
+from tensorflow.contrib.autograph.utils.builtins import dynamic_print
+from tensorflow.contrib.autograph.utils.builtins import dynamic_range
+from tensorflow.contrib.autograph.utils.context_managers import control_dependency_on_returns
+from tensorflow.contrib.autograph.utils.misc import alias_tensors
+from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is
+from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is_not
+from tensorflow.contrib.autograph.utils.multiple_dispatch import run_cond
+from tensorflow.contrib.autograph.utils.py_func import wrap_py_func
+from tensorflow.contrib.autograph.utils.tensor_list import dynamic_list_append
+from tensorflow.contrib.autograph.utils.testing import fake_tf
+from tensorflow.contrib.autograph.utils.type_check import is_tensor
+from tensorflow.contrib.autograph.utils.type_hints import set_element_type
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
new file mode 100644
index 0000000000000000000000000000000000000000..211e8eaee9082dd3e4f035e4379871cd2e154a39
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -0,0 +1,105 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Builtin conversion utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import six
+
+from tensorflow.contrib.autograph.utils import py_func
+from tensorflow.contrib.autograph.utils import type_check
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+
+
+def dynamic_builtin(f, *args, **kwargs):
+  """Converts a builtin function call inline."""
+  if f is len:
+    return dynamic_len(*args, **kwargs)
+  if six.PY2 and f is xrange:
+    return dynamic_range(*args, **kwargs)
+  if f is range:
+    return dynamic_range(*args, **kwargs)
+  raise ValueError('%s is not supported' % f)
+
+
+def dynamic_len(list_or_tensor):
+  """Implementation of len using dynamic dispatch."""
+  if tensor_util.is_tensor(list_or_tensor):
+    shape = list_or_tensor.shape
+    if not shape:
+      raise ValueError(
+          'len requires non-zero rank for tensor "%s"' % list_or_tensor)
+    return array_ops.shape(list_or_tensor)[0]
+  return len(list_or_tensor)
+
+
+def dynamic_range(start_or_stop, stop=None, step=None):
+  """Implementation of range using dynamic dispatch."""
+  if type_check.is_tensor(start_or_stop, stop, step):
+    if step is not None:
+      return math_ops.range(start_or_stop, stop, step)
+    if stop is not None:
+      return math_ops.range(start_or_stop, stop)
+    return math_ops.range(start_or_stop)
+
+  if step is not None:
+    return range(start_or_stop, stop, step)
+  elif stop is not None:
+    return range(start_or_stop, stop)
+  return range(start_or_stop)
+
+
+def is_tf_print_compatible(value):
+  # TODO(mdan): Enable once we can reliably test this.
+  # This is currently disabled because we can't capture the output of
+  # op kernels from Python.
+  del value
+  return False
+
+
+def dynamic_print(*values):
+  """Implementation of print using dynamic dispatch.
+
+  The function attempts to use tf.Print if all the values are compatible.
+  Otherwise, it will fall back to py_func.
+
+  Args:
+    *values: values to print
+  Returns:
+    A dummy value indicating the print completed. If tf.
+  """
+
+  if all(map(is_tf_print_compatible, values)):
+    return logging_ops.Print(1, values)
+
+  def print_wrapper(*vals):
+    if six.PY3:
+      # TensorFlow doesn't seem to generate Unicode when passing strings to
+      # py_func. This causes the print to add a "b'" wrapper to the output,
+      # which is probably never what you want.
+      vals = tuple(v.decode() if isinstance(v, bytes) else v for v in vals)
+    print(*vals)
+    # The flush helps avoid garbled output in IPython.
+    sys.stdout.flush()
+
+  return py_func.wrap_py_func(
+      print_wrapper, None, values, use_dummy_return=True)
diff --git a/tensorflow/contrib/autograph/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..163e6984079fea5c3b3d9aeda0ec8048d651686f
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/builtins_test.py
@@ -0,0 +1,112 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for builtins module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import six
+
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.python.framework import constant_op
+from tensorflow.python.platform import test
+
+
+class BuiltinsTest(test.TestCase):
+
+  def test_dynamic_len_tf_scalar(self):
+    a = constant_op.constant(1)
+
+    with self.assertRaises(ValueError):
+      with self.test_session() as sess:
+        sess.run(builtins.dynamic_builtin(len, a))
+
+  def test_dynamic_len_tf_array(self):
+    a = constant_op.constant([1, 2, 3])
+
+    with self.test_session() as sess:
+      self.assertEqual(3, sess.run(builtins.dynamic_builtin(len, a)))
+
+  def test_dynamic_len_tf_matrix(self):
+    a = constant_op.constant([[1, 2], [3, 4]])
+
+    with self.test_session() as sess:
+      self.assertEqual(2, sess.run(builtins.dynamic_builtin(len, a)))
+
+  def test_dynamic_len_py_list(self):
+    a = [3] * 5
+
+    self.assertEqual(5, builtins.dynamic_builtin(len, a))
+
+  def test_dynamic_range_all_python(self):
+    self.assertListEqual(list(builtins.dynamic_builtin(range, 3)), [0, 1, 2])
+    self.assertListEqual(list(builtins.dynamic_builtin(range, 1, 3)), [1, 2])
+    self.assertListEqual(
+        list(builtins.dynamic_builtin(range, 2, 0, -1)), [2, 1])
+
+  def test_dynamic_range_tf(self):
+    with self.test_session() as sess:
+      self.assertAllEqual(
+          sess.run(builtins.dynamic_builtin(range, constant_op.constant(3))),
+          [0, 1, 2])
+      self.assertAllEqual(
+          sess.run(builtins.dynamic_builtin(range, 1, constant_op.constant(3))),
+          [1, 2])
+      self.assertAllEqual(
+          sess.run(
+              builtins.dynamic_builtin(range, 2, 0, constant_op.constant(-1))),
+          [2, 1])
+
+  def test_dynamic_range_detection(self):
+    def range(x):  # pylint:disable=redefined-builtin
+      return x
+
+    # Functions that just have the names of builtins are rejected.
+    with self.assertRaises(ValueError):
+      self.assertEqual(builtins.dynamic_builtin(range, 1), 1)
+    if six.PY2:
+      self.assertListEqual(
+          list(builtins.dynamic_builtin(xrange, 3)), [0, 1, 2])
+    self.assertListEqual(
+        list(builtins.dynamic_builtin(six.moves.range, 3)), [0, 1, 2])
+    self.assertListEqual(
+        list(builtins.dynamic_builtin(six.moves.xrange, 3)), [0, 1, 2])
+
+  def test_dynamic_print_tf(self):
+    try:
+      out_capturer = six.StringIO()
+      sys.stdout = out_capturer
+      with self.test_session() as sess:
+        sess.run(builtins.dynamic_print('test message', 1))
+        self.assertEqual(out_capturer.getvalue(), 'test message 1\n')
+    finally:
+      sys.stdout = sys.__stdout__
+
+  def test_dynamic_print_complex(self):
+    try:
+      out_capturer = six.StringIO()
+      sys.stdout = out_capturer
+      with self.test_session() as sess:
+        sess.run(builtins.dynamic_print('test message', [1, 2]))
+        self.assertEqual(out_capturer.getvalue(), 'test message [1, 2]\n')
+    finally:
+      sys.stdout = sys.__stdout__
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/autograph/utils/context_managers.py
similarity index 85%
rename from tensorflow/contrib/py2tf/utils/context_managers.py
rename to tensorflow/contrib/autograph/utils/context_managers.py
index 38d9e11fe9069722b9023fee848bf53e1f72de6a..3d150a95817b83c4d7aaa78dc250092dcc4c5a9b 100644
--- a/tensorflow/contrib/py2tf/utils/context_managers.py
+++ b/tensorflow/contrib/autograph/utils/context_managers.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import contextlib
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import tensor_array_ops
 
 
 def control_dependency_on_returns(return_value):
@@ -34,9 +35,15 @@ def control_dependency_on_returns(return_value):
   Returns:
     A context manager.
   """
+  def control_dependency_handle(t):
+    if isinstance(t, tensor_array_ops.TensorArray):
+      return t.flow
+    return t
+
   if return_value is None:
     return contextlib.contextmanager(lambda: (yield))()
   # TODO(mdan): Filter to tensor objects.
   if not isinstance(return_value, (list, tuple)):
     return_value = (return_value,)
+  return_value = tuple(control_dependency_handle(t) for t in return_value)
   return ops.control_dependencies(return_value)
diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/autograph/utils/context_managers_test.py
similarity index 82%
rename from tensorflow/contrib/py2tf/utils/context_managers_test.py
rename to tensorflow/contrib/autograph/utils/context_managers_test.py
index 633ba93540e696889a6b2b71b40b999da39d48ff..42e27724b9856f715b524cdd7539897851715638 100644
--- a/tensorflow/contrib/py2tf/utils/context_managers_test.py
+++ b/tensorflow/contrib/autograph/utils/context_managers_test.py
@@ -18,8 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils import context_managers
+from tensorflow.contrib.autograph.utils import context_managers
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
 
 
@@ -32,6 +34,9 @@ class ContextManagersTest(test.TestCase):
     with context_managers.control_dependency_on_returns(
         constant_op.constant(1)):
       pass
+    with context_managers.control_dependency_on_returns(
+        tensor_array_ops.TensorArray(dtypes.int32, size=1)):
+      pass
     with context_managers.control_dependency_on_returns(
         [constant_op.constant(1),
          constant_op.constant(2)]):
diff --git a/tensorflow/contrib/py2tf/utils/misc.py b/tensorflow/contrib/autograph/utils/misc.py
similarity index 100%
rename from tensorflow/contrib/py2tf/utils/misc.py
rename to tensorflow/contrib/autograph/utils/misc.py
diff --git a/tensorflow/contrib/py2tf/utils/misc_test.py b/tensorflow/contrib/autograph/utils/misc_test.py
similarity index 78%
rename from tensorflow/contrib/py2tf/utils/misc_test.py
rename to tensorflow/contrib/autograph/utils/misc_test.py
index bfcb304c838df69e9e3961907362c7939c065117..71e358c33e1ea9887d267c67bc80362bac26c3a6 100644
--- a/tensorflow/contrib/py2tf/utils/misc_test.py
+++ b/tensorflow/contrib/autograph/utils/misc_test.py
@@ -18,29 +18,29 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils import misc
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import variables
+from tensorflow.contrib.autograph.utils.misc import alias_tensors
+from tensorflow.python.framework.constant_op import constant
+from tensorflow.python.ops.variables import Variable
 from tensorflow.python.platform import test
 
 
-class ContextManagersTest(test.TestCase):
+class MiscTest(test.TestCase):
 
   def test_alias_single_tensor(self):
-    a = constant_op.constant(1)
+    a = constant(1)
 
-    new_a = misc.alias_tensors(a)
+    new_a = alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.test_session() as sess:
       self.assertEqual(1, sess.run(new_a))
 
   def test_alias_tensors(self):
-    a = constant_op.constant(1)
-    v = variables.Variable(2)
+    a = constant(1)
+    v = Variable(2)
     s = 'a'
     l = [1, 2, 3]
 
-    new_a, new_v, new_s, new_l = misc.alias_tensors(a, v, s, l)
+    new_a, new_v, new_s, new_l = alias_tensors(a, v, s, l)
 
     self.assertFalse(new_a is a)
     self.assertTrue(new_v is v)
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch.py b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..70eef5676f61bcd978ea53260f0b86a817f2bd7c
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
@@ -0,0 +1,66 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for type-dependent behavior used in autograph-generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils.type_check import is_tensor
+from tensorflow.python.ops import control_flow_ops
+
+
+def dynamic_is(left, right):
+  # TODO(alexbw) if we're sure we should leave 'is' in place,
+  # then change the semantics in converters/logical_expressions.py
+  return left is right
+
+
+def dynamic_is_not(left, right):
+  return left is not right
+
+
+def run_cond(condition, true_fn, false_fn):
+  """Type-dependent functional conditional.
+
+  Args:
+    condition: A Tensor or Python bool.
+    true_fn: A Python callable implementing the true branch of the conditional.
+    false_fn: A Python callable implementing the false branch of the
+      conditional.
+
+  Returns:
+    result: The result of calling the appropriate branch. If condition is a
+    Tensor, tf.cond will be used. Otherwise, a standard Python if statement will
+    be ran.
+  """
+  if is_tensor(condition):
+    return control_flow_ops.cond(condition, true_fn, false_fn)
+  else:
+    return py_cond(condition, true_fn, false_fn)
+
+
+def py_cond(condition, true_fn, false_fn):
+  """Functional version of Python's conditional."""
+  if condition:
+    results = true_fn()
+  else:
+    results = false_fn()
+
+  # The contract for the branch functions is to return tuples, but they should
+  # be collapsed to a single element when there is only one output.
+  if len(results) == 1:
+    return results[0]
+  return results
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
similarity index 50%
rename from tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
rename to tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
index 5bb4d4086b002211eebb86783bb7212c707a1418..f72f8e94a0df815f7d517e2b81ffc86c5c545f07 100644
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
@@ -17,7 +17,10 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from tensorflow.contrib.py2tf.utils import multiple_dispatch
+
+import numpy as np
+
+from tensorflow.contrib.autograph.utils import multiple_dispatch
 from tensorflow.python.client.session import Session
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.platform import test
@@ -25,44 +28,47 @@ from tensorflow.python.platform import test
 
 class MultipleDispatchTest(test.TestCase):
 
+  def test_dynamic_is_python(self):
+    a = np.eye(3)
+    also_a = a
+    not_actually_a = np.eye(3)
+    should_be_true1 = multiple_dispatch.dynamic_is(a, also_a)
+    should_be_false1 = multiple_dispatch.dynamic_is_not(a, also_a)
+    should_be_true2 = multiple_dispatch.dynamic_is_not(a, not_actually_a)
+    should_be_false2 = multiple_dispatch.dynamic_is(a, not_actually_a)
+    self.assertTrue(should_be_true1)
+    self.assertTrue(should_be_true2)
+    self.assertFalse(should_be_false1)
+    self.assertFalse(should_be_false2)
+
+  def test_dynamic_is_tf(self):
+    with Session().as_default():
+      a = constant([2.0])
+      also_a = a
+      not_actually_a = constant([2.0])
+      should_be_true1 = multiple_dispatch.dynamic_is(a, also_a)
+      should_be_false1 = multiple_dispatch.dynamic_is_not(a, also_a)
+      should_be_true2 = multiple_dispatch.dynamic_is_not(a, not_actually_a)
+      should_be_false2 = multiple_dispatch.dynamic_is(a, not_actually_a)
+      self.assertTrue(should_be_true1)
+      self.assertTrue(should_be_true2)
+      self.assertFalse(should_be_false1)
+      self.assertFalse(should_be_false2)
+
   def test_run_cond_python(self):
-    true_fn = lambda: 2.0
-    false_fn = lambda: 3.0
-    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2.0)
-    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3.0)
+    true_fn = lambda: (2,)
+    false_fn = lambda: (3,)
+    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2)
+    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3)
 
   def test_run_cond_tf(self):
-
-    true_fn = lambda: constant([2.0])
-    false_fn = lambda: constant([3.0])
+    true_fn = lambda: (constant(2),)
+    false_fn = lambda: (constant(3),)
     with Session() as sess:
       out = multiple_dispatch.run_cond(constant(True), true_fn, false_fn)
-      self.assertEqual(sess.run(out), 2.0)
+      self.assertEqual(sess.run(out), 2)
       out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
-      self.assertEqual(sess.run(out), 3.0)
-
-  def test_run_while_python(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 1.0, 0.5])
-    self.assertEqual(x, 0.75)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 4.0, 0.5])
-    self.assertEqual(x, 3.0)
-
-  def test_run_while_tf(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    with Session() as sess:
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 1.0, 0.5])
-      self.assertEqual(sess.run(x), 0.75)
-
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 4.0, 0.5])
-      self.assertEqual(sess.run(x), 3.0)
+      self.assertEqual(sess.run(out), 3)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/utils/py_func.py b/tensorflow/contrib/autograph/utils/py_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..11ebfb2e49f0e762b56ae2cde2b76d2e24032d72
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/py_func.py
@@ -0,0 +1,131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pyfunc creation utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import script_ops
+
+
+class MatchDType(namedtuple('MatchDType', ('arg_number',))):
+  """Allows matching the dtype of an argument.
+
+  Used in conjunction with function calls. For example, MatchDType(0) will
+  match the DType of the first argument.
+  """
+
+  pass
+
+
+def wrap_py_func(f, return_dtypes, args, kwargs=None, use_dummy_return=False):
+  """Helper that wraps a callable to py_func.
+
+  The helper passes tensor arguments through the py_func interface. Non-tensor
+  arguments are allowed, and will be passed to f directly. Note that non-tensor
+  arguments are captured by f will not update every time the wrapper is
+  called (this is consistent with its argument list, which only includes
+  the tensor arguments). In general, it's safest not to reuse this wrapper.
+
+  Args:
+    f: Callable
+    return_dtypes: None, individual of tuple/list of DType or MatchDType, the
+        data type for each of f's return value(s). Set to None if f has no
+        return values or use_dummy_return is True. Use MatchDType to define a
+        dtype identical to that of `i`th argument (argument 0 is the first);
+        an argument must of Tensor type if it is to be used with MatchDType.
+    args: Positional arguments for f, as list or tuple.
+    kwargs: Keyword arguments for f, as dict with string keys. May be None.
+    use_dummy_return: If True, the function will return a dummy value of 1
+        and discard its actual return value.
+  Returns:
+    The return values of f converted to tensor.
+  Raises:
+    ValueError: if any of the arguments are incorrect.
+  """
+
+  if return_dtypes and use_dummy_return:
+    raise ValueError('if use_dummy_return is True, return_dtypes must be empty')
+
+  tensor_args = []
+  tensor_args_idx = {}
+
+  # Of the positional arguments, only grab the tensor ones to be passed through
+  # the py_func.
+  n_args = len(args)
+  arg_is_tensor = tuple(map(tensor_util.is_tensor, args))
+  for i in range(n_args):
+    if arg_is_tensor[i]:
+      tensor_args_idx[i] = len(tensor_args)
+      tensor_args.append(args[i])
+
+  # We essentially take the tensor kwargs, if any, and add them to the list of
+  # positional arguments. The kwargs are then reconstructed inside the py_func.
+  #
+  # For example, if
+  #
+  #     args = [Tensor(1), 'foo']
+  #     kwargs = {'a': Tensor(2), 'b': 'bar'}
+  #
+  # Then
+  #
+  #     tensor_args = (Tensor(1), Tensor(2))
+  #     kwarg_keys = ('a', 'b')
+  if kwargs:
+    kwarg_keys = tuple(kwargs.keys())
+    kwarg_is_tensor = {k: tensor_util.is_tensor(kwargs[k]) for k in kwarg_keys}
+    for k in kwarg_keys:
+      if kwarg_is_tensor[k]:
+        tensor_args_idx[k] = len(tensor_args)
+        tensor_args.append(kwargs[k])
+  else:
+    kwarg_keys = ()
+
+  # Set up return dtypes.
+  def match_arg_dtype(arg_number):
+    arg = args[arg_number]
+    if not arg_is_tensor[arg_number]:
+      raise ValueError(
+          'argument %d was used with MatchDType and must be a tf.Tensor, but '
+          'was %s instead' % (arg_number, type(arg)))
+    return arg.dtype
+
+  if return_dtypes:
+    if isinstance(return_dtypes, MatchDType):
+      return_dtypes = match_arg_dtype(return_dtypes.arg_number)
+    elif isinstance(return_dtypes, (list, tuple)):
+      return_dtypes = tuple(
+          match_arg_dtype(a.arg_number) if isinstance(a, MatchDType) else a
+          for a in return_dtypes)
+    else:
+      assert isinstance(return_dtypes, dtypes.DType)
+
+  def f_wrapper(*tensor_args):
+    f_args = tuple(tensor_args[tensor_args_idx[i]] if arg_is_tensor[i] else a
+                   for i, a in enumerate(args))
+    f_kwargs = {
+        k: tensor_args[tensor_args_idx[k]] if kwarg_is_tensor[k] else kwargs[k]
+        for i, k in enumerate(kwarg_keys)
+    }
+    retval = f(*f_args, **f_kwargs)
+    return 1 if use_dummy_return else retval
+
+  return script_ops.py_func(f_wrapper, tensor_args, dtypes.int64
+                            if use_dummy_return else return_dtypes)
diff --git a/tensorflow/contrib/autograph/utils/py_func_test.py b/tensorflow/contrib/autograph/utils/py_func_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2468263142f14332e86db99d198ba0f5c633dc69
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/py_func_test.py
@@ -0,0 +1,103 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for wrap_py_func module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils import py_func
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class PyFuncTest(test.TestCase):
+
+  def test_wrap_py_func_simple(self):
+
+    def test_fn(a, b, c):
+      return a + b + c
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+                                    (1, constant_op.constant(1), 1))
+      self.assertEqual(3, sess.run(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int64, (1, 1, 1))
+      self.assertEqual(3, sess.run(result))
+      result = py_func.wrap_py_func(
+          test_fn, dtypes.int64,
+          (constant_op.constant(1), 1, constant_op.constant(1)))
+      self.assertEqual(3, sess.run(result))
+
+  def test_wrap_py_func_complex_args(self):
+
+    class TestClass(object):
+
+      def __init__(self):
+        self.foo = 5
+
+    def test_fn(a, b):
+      return a * b.foo
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass()))
+      self.assertEqual(35, sess.run(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+                                    (constant_op.constant(7), TestClass()))
+      self.assertEqual(35, sess.run(result))
+
+  def test_wrap_py_func_kwargs(self):
+
+    class TestClass(object):
+
+      def __init__(self, foo):
+        self.foo = foo
+
+    def test_fn(a, b, c, d):
+      return a * b.foo + c * d.foo
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass(5)), {
+          'c': 11,
+          'd': TestClass(13)
+      })
+      self.assertEqual(178, sess.run(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+                                    (constant_op.constant(7), TestClass(5)), {
+                                        'c': constant_op.constant(11),
+                                        'd': TestClass(13)
+                                    })
+      self.assertEqual(178, sess.run(result))
+
+  def test_wrap_py_func_dummy_return(self):
+
+    side_counter = [0]
+
+    def test_fn(_):
+      side_counter[0] += 1
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, None, (5,), use_dummy_return=True)
+      self.assertEqual(1, sess.run(result))
+      self.assertEqual([1], side_counter)
+      result = py_func.wrap_py_func(
+          test_fn, None, (constant_op.constant(5),), use_dummy_return=True)
+      self.assertEqual(1, sess.run(result))
+      self.assertEqual([2], side_counter)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/tensor_list.py b/tensorflow/contrib/autograph/utils/tensor_list.py
similarity index 67%
rename from tensorflow/contrib/py2tf/utils/tensor_list.py
rename to tensorflow/contrib/autograph/utils/tensor_list.py
index b6ff49e2a0eff384f10903e12212ab929e267804..2556f412891b4f0b954af5a6f0193341a6a5020a 100644
--- a/tensorflow/contrib/py2tf/utils/tensor_list.py
+++ b/tensorflow/contrib/autograph/utils/tensor_list.py
@@ -18,7 +18,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import tensor_array_ops
+
+
+def dynamic_list_append(target, element):
+  """Converts a list append call inline."""
+  if isinstance(target, tensor_array_ops.TensorArray):
+    return target.write(target.size(), element)
+  # TODO(mdan): What's the right way to check this?
+  # TODO(mdan): We may not need this branch.
+  # It may be possible to use TensorList alone if the loop body will not
+  # require wrapping it, although we'd have to think about an autoboxing
+  # mechanism for lists received as parameter.
+  if isinstance(target, ops.Tensor):
+    return list_ops.tensor_list_push_back(target, element)
+
+  # Python targets (including TensorList): fallback to their original append.
+  target.append(element)
+  return target
 
 
 class TensorList(object):
diff --git a/tensorflow/contrib/py2tf/utils/tensor_list_test.py b/tensorflow/contrib/autograph/utils/tensor_list_test.py
similarity index 71%
rename from tensorflow/contrib/py2tf/utils/tensor_list_test.py
rename to tensorflow/contrib/autograph/utils/tensor_list_test.py
index b5e554a162674e08da21785dcbe193c54647f128..d58489eb68b6b949a4276520605c62b7c2825558 100644
--- a/tensorflow/contrib/py2tf/utils/tensor_list_test.py
+++ b/tensorflow/contrib/autograph/utils/tensor_list_test.py
@@ -12,22 +12,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for PyFlow list."""
+"""Tests for Autograph lists."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils import tensor_list as tl
+from tensorflow.contrib.autograph.utils import tensor_list as tl
 from tensorflow.python.client.session import Session
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework.constant_op import constant
+from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
 
 
 class TensorListTest(test.TestCase):
 
+  def _shape(self, shape_tuple):
+    return constant(shape_tuple, dtypes.int32)
+
+  def test_dynamic_list_append(self):
+    l = []
+    l = tl.dynamic_list_append(l, 1)
+    self.assertListEqual(l, [1])
+
+    l = list_ops.empty_tensor_list(self._shape(()), dtypes.int32)
+    l = tl.dynamic_list_append(l, 1)
+    s = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(s), [1])
+
+    l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
+    l = tl.dynamic_list_append(l, 1)
+    s = l.stack()
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(s), [1])
+
+    l = tl.TensorList(self._shape(()), dtypes.int32)
+    l = tl.dynamic_list_append(l, 1)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(l[0]), 1)
+
   def test_list_append_python(self):
     with context.eager_mode():
       a = constant(3.0)
diff --git a/tensorflow/contrib/py2tf/pyct/parser.py b/tensorflow/contrib/autograph/utils/testing.py
similarity index 65%
rename from tensorflow/contrib/py2tf/pyct/parser.py
rename to tensorflow/contrib/autograph/utils/testing.py
index dc7df883b349becd860bb0dbceab22cb39c750b5..cb4785d0dc0f4674b3560418daeb6733364b21e7 100644
--- a/tensorflow/contrib/py2tf/pyct/parser.py
+++ b/tensorflow/contrib/autograph/utils/testing.py
@@ -12,29 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Converting code to AST.
-
-Adapted from Tangent.
-"""
+"""Testing utilities."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import textwrap
-
-import gast
-
-from tensorflow.python.util import tf_inspect
-
+import imp
 
-def parse_entity(entity):
-  """Return the AST of given entity."""
-  source = tf_inspect.getsource(entity)
-  source = textwrap.dedent(source)
-  return parse_str(source), source
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
 
 
-def parse_str(src):
-  """Return the AST of given piece of code."""
-  return gast.parse(src)
+def fake_tf():
+  """Creates a fake module that looks like TensorFlow, for testing."""
+  mod = imp.new_module('tensorflow')
+  mod_contents = dict()
+  mod_contents.update(math_ops.__dict__)
+  mod_contents.update(ops.__dict__)
+  mod_contents.update(mod.__dict__)
+  mod.__dict__.update(mod_contents)
+  return mod
diff --git a/tensorflow/contrib/py2tf/utils/type_check.py b/tensorflow/contrib/autograph/utils/type_check.py
similarity index 86%
rename from tensorflow/contrib/py2tf/utils/type_check.py
rename to tensorflow/contrib/autograph/utils/type_check.py
index 9ca2dec872c8a9ca7bedaa8603f70e3214a3e24a..8748abc47bcfb55b4d0b11178a46816249732da9 100644
--- a/tensorflow/contrib/py2tf/utils/type_check.py
+++ b/tensorflow/contrib/autograph/utils/type_check.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used in py2tf-generated code."""
+"""Utilities used in autograph-generated code."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,12 +22,12 @@ from tensorflow.python.framework import tensor_util
 
 
 def is_tensor(*args):
-  """Check if all arguments are tensors.
+  """Check if any arguments are tensors.
 
   Args:
     *args: Python objects that may or may not be tensors.
 
   Returns:
-    True if all *args are TensorFlow types, False if one or more are not.
+    True if any *args are TensorFlow types, False if none are.
   """
   return any([tensor_util.is_tensor(a) for a in args])
diff --git a/tensorflow/contrib/py2tf/utils/type_check_test.py b/tensorflow/contrib/autograph/utils/type_check_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/utils/type_check_test.py
rename to tensorflow/contrib/autograph/utils/type_check_test.py
index 7d0428e9cccecdc67511e236bc00655a055aea29..3b67b7194c5656b193d47860f93986a985cb1aef 100644
--- a/tensorflow/contrib/py2tf/utils/type_check_test.py
+++ b/tensorflow/contrib/autograph/utils/type_check_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy
 
-from tensorflow.contrib.py2tf.utils import type_check
+from tensorflow.contrib.autograph.utils import type_check
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/bayesflow/python/ops/optimizers.py b/tensorflow/contrib/autograph/utils/type_hints.py
similarity index 54%
rename from tensorflow/contrib/bayesflow/python/ops/optimizers.py
rename to tensorflow/contrib/autograph/utils/type_hints.py
index fb70628d1083836281e9327e83e109493276c64f..aeb9e545610460afbe364dfcfc7a54b9aede29fe 100644
--- a/tensorflow/contrib/bayesflow/python/ops/optimizers.py
+++ b/tensorflow/contrib/autograph/utils/type_hints.py
@@ -12,25 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Probabilistic optimizer modules.
+"""No-op utilities that provide static type hints.
 
-See ${python/contrib.bayesflow.optimizers}.
+These are used when the data type is not known at creation, for instance in the
+case of empty lists.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.sgld_optimizer import *
-from tensorflow.contrib.bayesflow.python.ops.variational_sgd_optimizer import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
-    'SGLDOptimizer',
-    'VariationalSGDOptimizer',
-]
+def set_element_type(entity, dtype, shape=None):
+  """Indicates that the entity is expected hold items of specified type.
 
-remove_undocumented(__name__, _allowed_symbols)
+  This function is a no-op. Its presence merely marks the data type of its
+  argument. The staged TensorFlow ops will reflect and assert this data type.
+
+  Args:
+    entity: A Tensor or TensorArray.
+    dtype: TensorFlow dtype value to assert for entity.
+    shape: Optional shape to assert for entity.
+  Returns:
+    The value of entity, unchanged.
+  """
+  del dtype
+  del shape
+  return entity
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index ee67909133fc26ba98355db05a4b90d3dfa6b97b..d65c990c87cbc316472237d183c03765416501e7 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -112,14 +112,3 @@ py_test(
         "//tensorflow/python:script_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index fac7aff29f79fa18fa5f7e596db8afedabaa8993..e22f978dde6f1b7febc771d526201579c20292c7 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -250,7 +250,7 @@ class BatchOpsTest(test.TestCase):
   def testUnbatchGrad(self):
     """Tests that batch and unbatch are differentiable."""
     with self.test_session() as sess:
-      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      inp = array_ops.placeholder(dtype=dtypes.float32, shape=[1])
       batched, index, id_t = batch_ops.batch(
           [inp], num_batch_threads=1, max_batch_size=2,
           batch_timeout_micros=36000000, grad_timeout_micros=1000000,
diff --git a/tensorflow/contrib/batching/test_util/BUILD b/tensorflow/contrib/batching/test_util/BUILD
index 6db627faad1df4a4b73082e74e7754829ff2b514..7cb2d8079bd18660f72eab92654629434ce4d6a5 100644
--- a/tensorflow/contrib/batching/test_util/BUILD
+++ b/tensorflow/contrib/batching/test_util/BUILD
@@ -8,17 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 cc_library(
     name = "fake_clock_env",
     testonly = 1,
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index 2a84a7712a8fa66e89db41ff4e7ebe4f620029ca..8f81b6702f2807d7da7e72190ce2d86b28e52113 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/google_*",
-        ],
-    ),
-)
-
 cc_library(
     name = "periodic_function_dynamic",
     hdrs = ["periodic_function.h"],
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 74712aeb67c3f0a31def78f25a0298f9c02c9590..5a2d7f6a3c0ba233299a5790fa80488786712f3c 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -37,126 +37,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "metropolis_hastings_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/metropolis_hastings_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "csiszar_divergence_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/csiszar_divergence_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-    ],
-    tags = [
-        "manual",  # b/64490288
-        "notap",
-    ],
-)
-
-cuda_py_test(
-    name = "custom_grad_test",
-    size = "small",
-    srcs = ["python/kernel_tests/custom_grad_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "layers_conv_variational_test",
-    size = "small",
-    srcs = ["python/kernel_tests/layers_conv_variational_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-    ],
-)
-
-cuda_py_test(
-    name = "layers_dense_variational_test",
-    size = "small",
-    srcs = ["python/kernel_tests/layers_dense_variational_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-    ],
-)
-
-cuda_py_test(
-    name = "mcmc_diagnostics_test",
-    size = "small",
-    srcs = ["python/kernel_tests/mcmc_diagnostics_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:spectral_ops_test_util",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_seed",
-    ],
-)
-
 cuda_py_test(
     name = "monte_carlo_test",
     size = "small",
@@ -177,117 +57,3 @@ cuda_py_test(
         "//tensorflow/python:random_seed",
     ],
 )
-
-cuda_py_test(
-    name = "halton_sequence_test",
-    size = "small",
-    srcs = ["python/kernel_tests/halton_sequence_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-    tags = ["no_mac"],  # b/73192243
-)
-
-cuda_py_test(
-    name = "hmc_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/hmc_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_seed",
-    ],
-)
-
-cuda_py_test(
-    name = "sgld_optimizer_test",
-    size = "small",
-    srcs = ["python/kernel_tests/sgld_optimizer_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_seed",
-    ],
-    tags = ["notsan"],
-)
-
-cuda_py_test(
-    name = "variable_utils_test",
-    size = "small",
-    srcs = ["python/kernel_tests/variable_utils_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "variational_sgd_optimizer_test",
-    size = "small",
-    srcs = ["python/kernel_tests/variational_sgd_optimizer_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_seed",
-    ],
-    tags = ["notsan"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/bayesflow/README.md b/tensorflow/contrib/bayesflow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10323dc6d59918a9f8cf1840d06dcd219dfe3568
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/README.md
@@ -0,0 +1,17 @@
+# Notice
+
+`tf.contrib.bayesflow` has moved!
+
+See new code at [github.com/tensorflow/probability](
+https://github.com/tensorflow/probability).
+
+Switch imports with:
+
+```python
+# old
+import tensorflow as tf
+tfp = tf.contrib.bayesflow
+
+# new
+import tensorflow_probability as tfp
+```
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index 528c4fbacd06c7b0defa0e32bd24a98b2bc07b64..41a8c920fc4e81af90f4c94a149d8c404c58b747 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -21,36 +21,14 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence
-from tensorflow.contrib.bayesflow.python.ops import custom_grad
-from tensorflow.contrib.bayesflow.python.ops import halton_sequence
-from tensorflow.contrib.bayesflow.python.ops import hmc
-from tensorflow.contrib.bayesflow.python.ops import layers
-from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics
-from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
-from tensorflow.contrib.bayesflow.python.ops import optimizers
-from tensorflow.contrib.bayesflow.python.ops import variable_utils
 # pylint: enable=unused-import,line-too-long
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
-    'csiszar_divergence',
-    'custom_grad',
-    'entropy',
-    'halton_sequence',
-    'hmc',
-    'layers',
-    'metropolis_hastings',
-    'mcmc_diagnostics',
     'monte_carlo',
-    'optimizers',
-    'special_math',
-    'stochastic_variables',
-    'variable_utils',
-    'variational_inference',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
deleted file mode 100644
index 2e94b7206de4f7c40c89f083f3bfa2a22bb7b917..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
+++ /dev/null
@@ -1,1004 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Csiszar Divergence Ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence_impl
-from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib
-from tensorflow.contrib.distributions.python.ops import mvn_full_covariance as mvn_full_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import kullback_leibler
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.platform import test
-
-
-cd = csiszar_divergence_impl
-
-
-def tridiag(d, diag_value, offdiag_value):
-  """d x d matrix with given value on diag, and one super/sub diag."""
-  diag_mat = linalg_ops.eye(d) * (diag_value - offdiag_value)
-  three_bands = array_ops.matrix_band_part(
-      array_ops.fill([d, d], offdiag_value), 1, 1)
-  return diag_mat + three_bands
-
-
-class AmariAlphaTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    for alpha in [-1., 0., 1., 2.]:
-      for normalized in [True, False]:
-        with self.test_session(graph=ops.Graph()):
-          self.assertAllClose(
-              cd.amari_alpha(0., alpha=alpha,
-                             self_normalized=normalized).eval(),
-              0.)
-
-  def test_correct_when_alpha0(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.amari_alpha(self._logu, alpha=0.).eval(),
-          -self._logu)
-
-      self.assertAllClose(
-          cd.amari_alpha(self._logu, alpha=0., self_normalized=True).eval(),
-          -self._logu + (self._u - 1.))
-
-  def test_correct_when_alpha1(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.amari_alpha(self._logu, alpha=1.).eval(),
-          self._u * self._logu)
-
-      self.assertAllClose(
-          cd.amari_alpha(self._logu, alpha=1., self_normalized=True).eval(),
-          self._u * self._logu - (self._u - 1.))
-
-  def test_correct_when_alpha_not_01(self):
-    for alpha in [-2, -1., -0.5, 0.5, 2.]:
-      with self.test_session(graph=ops.Graph()):
-        self.assertAllClose(
-            cd.amari_alpha(self._logu,
-                           alpha=alpha,
-                           self_normalized=False).eval(),
-            ((self._u**alpha - 1)) / (alpha * (alpha - 1.)))
-
-        self.assertAllClose(
-            cd.amari_alpha(self._logu,
-                           alpha=alpha,
-                           self_normalized=True).eval(),
-            ((self._u**alpha - 1.)
-             - alpha * (self._u - 1)) / (alpha * (alpha - 1.)))
-
-
-class KLReverseTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    for normalized in [True, False]:
-      with self.test_session(graph=ops.Graph()):
-        self.assertAllClose(
-            cd.kl_reverse(0., self_normalized=normalized).eval(),
-            0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.kl_reverse(self._logu).eval(),
-          -self._logu)
-
-      self.assertAllClose(
-          cd.kl_reverse(self._logu, self_normalized=True).eval(),
-          -self._logu + (self._u - 1.))
-
-
-class KLForwardTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    for normalized in [True, False]:
-      with self.test_session(graph=ops.Graph()):
-        self.assertAllClose(
-            cd.kl_forward(0., self_normalized=normalized).eval(),
-            0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.kl_forward(self._logu).eval(),
-          self._u * self._logu)
-
-      self.assertAllClose(
-          cd.kl_forward(self._logu, self_normalized=True).eval(),
-          self._u * self._logu - (self._u - 1.))
-
-
-class JensenShannonTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.jensen_shannon(0.).eval(), np.log(0.25))
-
-  def test_symmetric(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.jensen_shannon(self._logu).eval(),
-          cd.symmetrized_csiszar_function(
-              self._logu, cd.jensen_shannon).eval())
-
-      self.assertAllClose(
-          cd.jensen_shannon(self._logu, self_normalized=True).eval(),
-          cd.symmetrized_csiszar_function(
-              self._logu,
-              lambda x: cd.jensen_shannon(x, self_normalized=True)).eval())
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.jensen_shannon(self._logu).eval(),
-          (self._u * self._logu
-           - (1 + self._u) * np.log1p(self._u)))
-
-      self.assertAllClose(
-          cd.jensen_shannon(self._logu, self_normalized=True).eval(),
-          (self._u * self._logu
-           - (1 + self._u) * np.log((1 + self._u) / 2)))
-
-
-class ArithmeticGeometricMeanTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.arithmetic_geometric(0.).eval(), np.log(4))
-      self.assertAllClose(
-          cd.arithmetic_geometric(0., self_normalized=True).eval(), 0.)
-
-  def test_symmetric(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.arithmetic_geometric(self._logu).eval(),
-          cd.symmetrized_csiszar_function(
-              self._logu, cd.arithmetic_geometric).eval())
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.arithmetic_geometric(self._logu).eval(),
-          (1. + self._u) * np.log((1. + self._u) / np.sqrt(self._u)))
-
-      self.assertAllClose(
-          cd.arithmetic_geometric(self._logu, self_normalized=True).eval(),
-          (1. + self._u) * np.log(0.5 * (1. + self._u) / np.sqrt(self._u)))
-
-
-class TotalVariationTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.total_variation(0.).eval(), 0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.total_variation(self._logu).eval(),
-          0.5 * np.abs(self._u - 1))
-
-
-class PearsonTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.pearson(0.).eval(), 0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.pearson(self._logu).eval(),
-          np.square(self._u - 1))
-
-
-class SquaredHellingerTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.squared_hellinger(0.).eval(), 0.)
-
-  def test_symmetric(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.squared_hellinger(self._logu).eval(),
-          cd.symmetrized_csiszar_function(
-              self._logu, cd.squared_hellinger).eval())
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.squared_hellinger(self._logu).eval(),
-          np.square(np.sqrt(self._u) - 1))
-
-
-class TriangularTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.triangular(0.).eval(), 0.)
-
-  def test_symmetric(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.triangular(self._logu).eval(),
-          cd.symmetrized_csiszar_function(
-              self._logu, cd.triangular).eval())
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.triangular(self._logu).eval(),
-          np.square(self._u - 1) / (1 + self._u))
-
-
-class TPowerTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.t_power(0., t=-0.1).eval(), 0.)
-      self.assertAllClose(cd.t_power(0., t=0.5).eval(), 0.)
-      self.assertAllClose(cd.t_power(0., t=1.1).eval(), 0.)
-      self.assertAllClose(
-          cd.t_power(0., t=-0.1, self_normalized=True).eval(), 0.)
-      self.assertAllClose(
-          cd.t_power(0., t=0.5, self_normalized=True).eval(), 0.)
-      self.assertAllClose(
-          cd.t_power(0., t=1.1, self_normalized=True).eval(), 0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.t_power(self._logu, t=np.float64(-0.1)).eval(),
-          self._u ** -0.1 - 1.)
-      self.assertAllClose(
-          cd.t_power(self._logu, t=np.float64(0.5)).eval(),
-          -self._u ** 0.5 + 1.)
-      self.assertAllClose(
-          cd.t_power(self._logu, t=np.float64(1.1)).eval(),
-          self._u ** 1.1 - 1.)
-
-  def test_correct_self_normalized(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.t_power(self._logu, t=np.float64(-0.1),
-                     self_normalized=True).eval(),
-          self._u ** -0.1 - 1. + 0.1 * (self._u - 1.))
-      self.assertAllClose(
-          cd.t_power(self._logu, t=np.float64(0.5),
-                     self_normalized=True).eval(),
-          -self._u ** 0.5 + 1. + 0.5 * (self._u - 1.))
-      self.assertAllClose(
-          cd.t_power(self._logu, t=np.float64(1.1),
-                     self_normalized=True).eval(),
-          self._u ** 1.1 - 1. - 1.1 * (self._u - 1.))
-
-
-class Log1pAbsTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.log1p_abs(0.).eval(), 0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.log1p_abs(self._logu).eval(),
-          self._u**(np.sign(self._u - 1)) - 1)
-
-
-class JeffreysTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.jeffreys(0.).eval(), 0.)
-
-  def test_symmetric(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.jeffreys(self._logu).eval(),
-          cd.symmetrized_csiszar_function(
-              self._logu, cd.jeffreys).eval())
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.jeffreys(self._logu).eval(),
-          0.5 * (self._u * self._logu - self._logu))
-
-
-class ChiSquareTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(cd.chi_square(0.).eval(), 0.)
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.chi_square(self._logu).eval(),
-          self._u**2 - 1)
-
-
-class ModifiedGanTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10, 100)
-    self._u = np.exp(self._logu)
-
-  def test_at_zero(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.modified_gan(0.).eval(), np.log(2))
-      self.assertAllClose(
-          cd.modified_gan(0., self_normalized=True).eval(), np.log(2))
-
-  def test_correct(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.modified_gan(self._logu).eval(),
-          np.log1p(self._u) - self._logu)
-
-      self.assertAllClose(
-          cd.modified_gan(self._logu, self_normalized=True).eval(),
-          np.log1p(self._u) - self._logu + 0.5 * (self._u - 1))
-
-
-class SymmetrizedCsiszarFunctionTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10., 100)
-    self._u = np.exp(self._logu)
-
-  def test_jensen_shannon(self):
-    with self.test_session():
-
-      # The following functions come from the claim made in the
-      # symmetrized_csiszar_function docstring.
-      def js1(logu):
-        return (-logu
-                - (1. + math_ops.exp(logu)) * (
-                    nn_ops.softplus(logu)))
-
-      def js2(logu):
-        return 2. * (math_ops.exp(logu) * (
-            logu - nn_ops.softplus(logu)))
-
-      self.assertAllClose(
-          cd.symmetrized_csiszar_function(self._logu, js1).eval(),
-          cd.jensen_shannon(self._logu).eval())
-
-      self.assertAllClose(
-          cd.symmetrized_csiszar_function(self._logu, js2).eval(),
-          cd.jensen_shannon(self._logu).eval())
-
-  def test_jeffreys(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.symmetrized_csiszar_function(self._logu, cd.kl_reverse).eval(),
-          cd.jeffreys(self._logu).eval())
-
-      self.assertAllClose(
-          cd.symmetrized_csiszar_function(self._logu, cd.kl_forward).eval(),
-          cd.jeffreys(self._logu).eval())
-
-
-class DualCsiszarFunctionTest(test.TestCase):
-
-  def setUp(self):
-    self._logu = np.linspace(-10., 10., 100)
-    self._u = np.exp(self._logu)
-
-  def test_kl_forward(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.dual_csiszar_function(self._logu, cd.kl_forward).eval(),
-          cd.kl_reverse(self._logu).eval())
-
-  def test_kl_reverse(self):
-    with self.test_session():
-      self.assertAllClose(
-          cd.dual_csiszar_function(self._logu, cd.kl_reverse).eval(),
-          cd.kl_forward(self._logu).eval())
-
-
-class MonteCarloCsiszarFDivergenceTest(test.TestCase):
-
-  def test_kl_forward(self):
-    with self.test_session() as sess:
-      q = normal_lib.Normal(
-          loc=np.ones(6),
-          scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))
-
-      p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)
-
-      approx_kl = cd.monte_carlo_csiszar_f_divergence(
-          f=cd.kl_forward,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
-          f=lambda logu: cd.kl_forward(logu, self_normalized=True),
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      exact_kl = kullback_leibler.kl_divergence(p, q)
-
-      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
-          approx_kl, approx_kl_self_normalized, exact_kl])
-
-      self.assertAllClose(approx_kl_, exact_kl_,
-                          rtol=0.08, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
-                          rtol=0.02, atol=0.)
-
-  def test_kl_reverse(self):
-    with self.test_session() as sess:
-
-      q = normal_lib.Normal(
-          loc=np.ones(6),
-          scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))
-
-      p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)
-
-      approx_kl = cd.monte_carlo_csiszar_f_divergence(
-          f=cd.kl_reverse,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
-          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      exact_kl = kullback_leibler.kl_divergence(q, p)
-
-      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
-          approx_kl, approx_kl_self_normalized, exact_kl])
-
-      self.assertAllClose(approx_kl_, exact_kl_,
-                          rtol=0.07, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
-                          rtol=0.02, atol=0.)
-
-  def test_kl_reverse_multidim(self):
-
-    with self.test_session() as sess:
-      d = 5  # Dimension
-
-      p = mvn_full_lib.MultivariateNormalFullCovariance(
-          covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))
-
-      q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[0.5]*d)
-
-      approx_kl = cd.monte_carlo_csiszar_f_divergence(
-          f=cd.kl_reverse,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
-          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      exact_kl = kullback_leibler.kl_divergence(q, p)
-
-      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
-          approx_kl, approx_kl_self_normalized, exact_kl])
-
-      self.assertAllClose(approx_kl_, exact_kl_,
-                          rtol=0.02, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
-                          rtol=0.08, atol=0.)
-
-  def test_kl_forward_multidim(self):
-
-    with self.test_session() as sess:
-      d = 5  # Dimension
-
-      p = mvn_full_lib.MultivariateNormalFullCovariance(
-          covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))
-
-      # Variance is very high when approximating Forward KL, so we make
-      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
-      # "covers" p and thus Var_q[p/q] is smaller.
-      q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[1.]*d)
-
-      approx_kl = cd.monte_carlo_csiszar_f_divergence(
-          f=cd.kl_forward,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
-          f=lambda logu: cd.kl_forward(logu, self_normalized=True),
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=int(1e5),
-          seed=1)
-
-      exact_kl = kullback_leibler.kl_divergence(p, q)
-
-      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
-          approx_kl, approx_kl_self_normalized, exact_kl])
-
-      self.assertAllClose(approx_kl_, exact_kl_,
-                          rtol=0.06, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
-                          rtol=0.05, atol=0.)
-
-  def test_score_trick(self):
-
-    with self.test_session() as sess:
-      d = 5  # Dimension
-      num_draws = int(1e5)
-      seed = 1
-
-      p = mvn_full_lib.MultivariateNormalFullCovariance(
-          covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))
-
-      # Variance is very high when approximating Forward KL, so we make
-      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
-      # "covers" p and thus Var_q[p/q] is smaller.
-      s = array_ops.constant(1.)
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          scale_diag=array_ops.tile([s], [d]))
-
-      approx_kl = cd.monte_carlo_csiszar_f_divergence(
-          f=cd.kl_reverse,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=num_draws,
-          seed=seed)
-
-      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
-          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=num_draws,
-          seed=seed)
-
-      approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
-          f=cd.kl_reverse,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=num_draws,
-          use_reparametrization=False,
-          seed=seed)
-
-      approx_kl_self_normalized_score_trick = (
-          cd.monte_carlo_csiszar_f_divergence(
-              f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
-              p_log_prob=p.log_prob,
-              q=q,
-              num_draws=num_draws,
-              use_reparametrization=False,
-              seed=seed))
-
-      exact_kl = kullback_leibler.kl_divergence(q, p)
-
-      grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]
-
-      [
-          approx_kl_grad_,
-          approx_kl_self_normalized_grad_,
-          approx_kl_score_trick_grad_,
-          approx_kl_self_normalized_score_trick_grad_,
-          exact_kl_grad_,
-          approx_kl_,
-          approx_kl_self_normalized_,
-          approx_kl_score_trick_,
-          approx_kl_self_normalized_score_trick_,
-          exact_kl_,
-      ] = sess.run([
-          grad_sum(approx_kl),
-          grad_sum(approx_kl_self_normalized),
-          grad_sum(approx_kl_score_trick),
-          grad_sum(approx_kl_self_normalized_score_trick),
-          grad_sum(exact_kl),
-          approx_kl,
-          approx_kl_self_normalized,
-          approx_kl_score_trick,
-          approx_kl_self_normalized_score_trick,
-          exact_kl,
-      ])
-
-      # Test average divergence.
-      self.assertAllClose(approx_kl_, exact_kl_,
-                          rtol=0.02, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
-                          rtol=0.08, atol=0.)
-
-      self.assertAllClose(approx_kl_score_trick_, exact_kl_,
-                          rtol=0.02, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_score_trick_, exact_kl_,
-                          rtol=0.08, atol=0.)
-
-      # Test average gradient-divergence.
-      self.assertAllClose(approx_kl_grad_, exact_kl_grad_,
-                          rtol=0.007, atol=0.)
-
-      self.assertAllClose(approx_kl_self_normalized_grad_, exact_kl_grad_,
-                          rtol=0.011, atol=0.)
-
-      self.assertAllClose(approx_kl_score_trick_grad_, exact_kl_grad_,
-                          rtol=0.018, atol=0.)
-
-      self.assertAllClose(
-          approx_kl_self_normalized_score_trick_grad_, exact_kl_grad_,
-          rtol=0.017, atol=0.)
-
-
-class CsiszarVIMCOTest(test.TestCase):
-
-  def _csiszar_vimco_helper(self, logu):
-    """Numpy implementation of `csiszar_vimco_helper`."""
-
-    # Since this is a naive/intuitive implementation, we compensate by using the
-    # highest precision we can.
-    logu = np.float128(logu)
-    n = logu.shape[0]
-    u = np.exp(logu)
-    loogeoavg_u = []  # Leave-one-out geometric-average of exp(logu).
-    for j in range(n):
-      loogeoavg_u.append(np.exp(np.mean(
-          [logu[i, ...] for i in range(n) if i != j],
-          axis=0)))
-    loogeoavg_u = np.array(loogeoavg_u)
-
-    loosum_u = []  # Leave-one-out sum of exp(logu).
-    for j in range(n):
-      loosum_u.append(np.sum(
-          [u[i, ...] for i in range(n) if i != j],
-          axis=0))
-    loosum_u = np.array(loosum_u)
-
-    # Natural log of the average u except each is swapped-out for its
-    # leave-`i`-th-out Geometric average.
-    log_sooavg_u = np.log(loosum_u + loogeoavg_u) - np.log(n)
-
-    log_avg_u = np.log(np.mean(u, axis=0))
-    return log_avg_u, log_sooavg_u
-
-  def _csiszar_vimco_helper_grad(self, logu, delta):
-    """Finite difference approximation of `grad(csiszar_vimco_helper, logu)`."""
-
-    # This code actually estimates the sum of the Jacobiab because that's what
-    # TF's `gradients` does.
-    np_log_avg_u1, np_log_sooavg_u1 = self._csiszar_vimco_helper(
-        logu[..., None] + np.diag([delta]*len(logu)))
-    np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(
-        logu[..., None])
-    return [
-        (np_log_avg_u1 - np_log_avg_u) / delta,
-        np.sum(np_log_sooavg_u1 - np_log_sooavg_u, axis=0) / delta,
-    ]
-
-  def test_vimco_helper_1(self):
-    """Tests that function calculation correctly handles batches."""
-
-    logu = np.linspace(-100., 100., 100).reshape([10, 2, 5])
-    with self.test_session() as sess:
-      np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu)
-      [log_avg_u, log_sooavg_u] = sess.run(cd.csiszar_vimco_helper(logu))
-      self.assertAllClose(np_log_avg_u, log_avg_u,
-                          rtol=1e-8, atol=0.)
-      self.assertAllClose(np_log_sooavg_u, log_sooavg_u,
-                          rtol=1e-8, atol=0.)
-
-  def test_vimco_helper_2(self):
-    """Tests that function calculation correctly handles overflow."""
-
-    # Using 700 (rather than 1e3) since naive numpy version can't handle higher.
-    logu = np.float32([0., 700, -1, 1])
-    with self.test_session() as sess:
-      np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu)
-      [log_avg_u, log_sooavg_u] = sess.run(cd.csiszar_vimco_helper(logu))
-      self.assertAllClose(np_log_avg_u, log_avg_u,
-                          rtol=1e-6, atol=0.)
-      self.assertAllClose(np_log_sooavg_u, log_sooavg_u,
-                          rtol=1e-5, atol=0.)
-
-  def test_vimco_helper_3(self):
-    """Tests that function calculation correctly handles underlow."""
-
-    logu = np.float32([0., -1000, -1, 1])
-    with self.test_session() as sess:
-      np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu)
-      [log_avg_u, log_sooavg_u] = sess.run(cd.csiszar_vimco_helper(logu))
-      self.assertAllClose(np_log_avg_u, log_avg_u,
-                          rtol=1e-5, atol=0.)
-      self.assertAllClose(np_log_sooavg_u, log_sooavg_u,
-                          rtol=1e-4, atol=1e-15)
-
-  def test_vimco_helper_gradient_using_finite_difference_1(self):
-    """Tests that gradient calculation correctly handles batches."""
-
-    logu_ = np.linspace(-100., 100., 100).reshape([10, 2, 5])
-    with self.test_session() as sess:
-      logu = array_ops.constant(logu_)
-
-      grad = lambda flogu: gradients_impl.gradients(flogu, logu)[0]
-      log_avg_u, log_sooavg_u = cd.csiszar_vimco_helper(logu)
-
-      [
-          grad_log_avg_u,
-          grad_log_sooavg_u,
-      ] = sess.run([grad(log_avg_u), grad(log_sooavg_u)])
-
-      # We skip checking against finite-difference approximation since it
-      # doesn't support batches.
-
-      # Verify claim in docstring.
-      self.assertAllClose(
-          np.ones_like(grad_log_avg_u.sum(axis=0)),
-          grad_log_avg_u.sum(axis=0))
-      self.assertAllClose(
-          np.ones_like(grad_log_sooavg_u.mean(axis=0)),
-          grad_log_sooavg_u.mean(axis=0))
-
-  def test_vimco_helper_gradient_using_finite_difference_2(self):
-    """Tests that gradient calculation correctly handles overflow."""
-
-    delta = 1e-3
-    logu_ = np.float32([0., 1000, -1, 1])
-    with self.test_session() as sess:
-      logu = array_ops.constant(logu_)
-
-      [
-          np_grad_log_avg_u,
-          np_grad_log_sooavg_u,
-      ] = self._csiszar_vimco_helper_grad(logu_, delta)
-
-      grad = lambda flogu: gradients_impl.gradients(flogu, logu)[0]
-      log_avg_u, log_sooavg_u = cd.csiszar_vimco_helper(logu)
-
-      [
-          grad_log_avg_u,
-          grad_log_sooavg_u,
-      ] = sess.run([grad(log_avg_u), grad(log_sooavg_u)])
-
-      self.assertAllClose(np_grad_log_avg_u, grad_log_avg_u,
-                          rtol=delta, atol=0.)
-      self.assertAllClose(np_grad_log_sooavg_u, grad_log_sooavg_u,
-                          rtol=delta, atol=0.)
-      # Verify claim in docstring.
-      self.assertAllClose(
-          np.ones_like(grad_log_avg_u.sum(axis=0)),
-          grad_log_avg_u.sum(axis=0))
-      self.assertAllClose(
-          np.ones_like(grad_log_sooavg_u.mean(axis=0)),
-          grad_log_sooavg_u.mean(axis=0))
-
-  def test_vimco_helper_gradient_using_finite_difference_3(self):
-    """Tests that gradient calculation correctly handles underlow."""
-
-    delta = 1e-3
-    logu_ = np.float32([0., -1000, -1, 1])
-    with self.test_session() as sess:
-      logu = array_ops.constant(logu_)
-
-      [
-          np_grad_log_avg_u,
-          np_grad_log_sooavg_u,
-      ] = self._csiszar_vimco_helper_grad(logu_, delta)
-
-      grad = lambda flogu: gradients_impl.gradients(flogu, logu)[0]
-      log_avg_u, log_sooavg_u = cd.csiszar_vimco_helper(logu)
-
-      [
-          grad_log_avg_u,
-          grad_log_sooavg_u,
-      ] = sess.run([grad(log_avg_u), grad(log_sooavg_u)])
-
-      self.assertAllClose(np_grad_log_avg_u, grad_log_avg_u,
-                          rtol=delta, atol=0.)
-      self.assertAllClose(np_grad_log_sooavg_u, grad_log_sooavg_u,
-                          rtol=delta, atol=0.)
-      # Verify claim in docstring.
-      self.assertAllClose(
-          np.ones_like(grad_log_avg_u.sum(axis=0)),
-          grad_log_avg_u.sum(axis=0))
-      self.assertAllClose(
-          np.ones_like(grad_log_sooavg_u.mean(axis=0)),
-          grad_log_sooavg_u.mean(axis=0))
-
-  def test_vimco_and_gradient(self):
-
-    with self.test_session() as sess:
-      dims = 5  # Dimension
-      num_draws = int(20)
-      num_batch_draws = int(3)
-      seed = 1
-
-      f = lambda logu: cd.kl_reverse(logu, self_normalized=False)
-      np_f = lambda logu: -logu
-
-      p = mvn_full_lib.MultivariateNormalFullCovariance(
-          covariance_matrix=tridiag(dims, diag_value=1, offdiag_value=0.5))
-
-      # Variance is very high when approximating Forward KL, so we make
-      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
-      # "covers" p and thus Var_q[p/q] is smaller.
-      s = array_ops.constant(1.)
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          scale_diag=array_ops.tile([s], [dims]))
-
-      vimco = cd.csiszar_vimco(
-          f=f,
-          p_log_prob=p.log_prob,
-          q=q,
-          num_draws=num_draws,
-          num_batch_draws=num_batch_draws,
-          seed=seed)
-
-      x = q.sample(sample_shape=[num_draws, num_batch_draws],
-                   seed=seed)
-      x = array_ops.stop_gradient(x)
-      logu = p.log_prob(x) - q.log_prob(x)
-      f_log_sum_u = f(cd.csiszar_vimco_helper(logu)[0])
-
-      grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]
-
-      def jacobian(x):
-        # Warning: this function is slow and may not even finish if prod(shape)
-        # is larger than, say, 100.
-        shape = x.shape.as_list()
-        assert all(s is not None for s in shape)
-        x = array_ops.reshape(x, shape=[-1])
-        r = [grad_sum(x[i]) for i in range(np.prod(shape))]
-        return array_ops.reshape(array_ops.stack(r), shape=shape)
-
-      [
-          logu_,
-          jacobian_logqx_,
-          vimco_,
-          grad_vimco_,
-          f_log_sum_u_,
-          grad_mean_f_log_sum_u_,
-      ] = sess.run([
-          logu,
-          jacobian(q.log_prob(x)),
-          vimco,
-          grad_sum(vimco),
-          f_log_sum_u,
-          grad_sum(f_log_sum_u) / num_batch_draws,
-      ])
-
-      np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu_)
-
-      # Test VIMCO loss is correct.
-      self.assertAllClose(np_f(np_log_avg_u).mean(axis=0), vimco_,
-                          rtol=1e-5, atol=0.)
-
-      # Test gradient of VIMCO loss is correct.
-      #
-      # To make this computation we'll inject two gradients from TF:
-      # - grad[mean(f(log(sum(p(x)/q(x)))))]
-      # - jacobian[log(q(x))].
-      #
-      # We now justify why using these (and only these) TF values for
-      # ground-truth does not undermine the completeness of this test.
-      #
-      # Regarding `grad_mean_f_log_sum_u_`, note that we validate the
-      # correctness of the zero-th order derivative (for each batch member).
-      # Since `cd.csiszar_vimco_helper` itself does not manipulate any gradient
-      # information, we can safely rely on TF.
-      self.assertAllClose(np_f(np_log_avg_u), f_log_sum_u_, rtol=1e-4, atol=0.)
-      #
-      # Regarding `jacobian_logqx_`, note that testing the gradient of
-      # `q.log_prob` is outside the scope of this unit-test thus we may safely
-      # use TF to find it.
-
-      # The `mean` is across batches and the `sum` is across iid samples.
-      np_grad_vimco = (
-          grad_mean_f_log_sum_u_
-          + np.mean(
-              np.sum(
-                  jacobian_logqx_ * (np_f(np_log_avg_u)
-                                     - np_f(np_log_sooavg_u)),
-                  axis=0),
-              axis=0))
-
-      self.assertAllClose(np_grad_vimco, grad_vimco_,
-                          rtol=1e-5, atol=0.)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/custom_grad_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/custom_grad_test.py
deleted file mode 100644
index a95df31ac1fd9f5038abe779391ccba5f7fe408d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/custom_grad_test.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Custom Gradient Ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import custom_grad_impl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-cg = custom_grad_impl
-
-
-class CustomGradientTest(test.TestCase):
-
-  def test_works_correctly(self):
-    with self.test_session() as sess:
-      f = lambda x: x**2 / 2
-      g = lambda x: (x - 1)**3 / 3
-      x_ = np.linspace(-100, 100, int(1e4)) + [0.]
-
-      x = constant_op.constant(x_)
-      fx = cg.custom_gradient(f(x), g(x), x)
-      gx = gradients_impl.gradients(fx, x)[0]
-      [fx_, gx_] = sess.run([fx, gx])
-
-      self.assertAllClose(f(x_), fx_)
-      self.assertAllClose(g(x_), gx_)
-
-  def test_works_correctly_both_f_g_zero(self):
-    with self.test_session() as sess:
-      f = lambda x: x**2 / 2
-      g = lambda x: x**3 / 3
-      x_ = np.linspace(-100, 100, int(1e4)) + [0.]
-
-      x = constant_op.constant(x_)
-      fx = cg.custom_gradient(f(x), g(x), x)
-      gx = gradients_impl.gradients(fx, x)[0]
-      [fx_, gx_] = sess.run([fx, gx])
-
-      self.assertAllClose(f(x_), fx_)
-      self.assertAllClose(g(x_), gx_)
-
-  def test_works_correctly_vector_of_vars(self):
-    with self.test_session() as sess:
-      x = variable_scope.get_variable(
-          name="x",
-          shape=[],
-          dtype=dtypes.float32,
-          initializer=init_ops.constant_initializer(2))
-      y = variable_scope.get_variable(
-          name="y",
-          shape=[],
-          dtype=dtypes.float32,
-          initializer=init_ops.constant_initializer(3))
-      sess.run([variables.global_variables_initializer()])
-
-      f = lambda z: z[0] * z[1]
-      g = lambda z: z[0]**2 * z[1]**2 / 2
-
-      z = array_ops.stack([x, y])
-      fz = cg.custom_gradient(f(z), g(z), z, axis=0)
-      gz = gradients_impl.gradients(fz, variables.trainable_variables())
-      [z_, fz_, gx_, gy_] = sess.run([z, fz, gz[0], gz[1]])
-
-      self.assertEqual(f(z_), fz_)
-      self.assertEqual(g(z_), gx_)
-      self.assertEqual(g(z_), gy_)
-
-  def test_works_correctly_side_vars(self):
-    with self.test_session() as sess:
-      x_ = np.float32(2.1)  # Adding extra tenth to force imprecision.
-      y_ = np.float32(3.1)
-      x = variable_scope.get_variable(
-          name="x",
-          shape=[],
-          dtype=dtypes.float32,
-          initializer=init_ops.constant_initializer(x_))
-      y = variable_scope.get_variable(
-          name="y",
-          shape=[],
-          dtype=dtypes.float32,
-          initializer=init_ops.constant_initializer(y_))
-      sess.run([variables.global_variables_initializer()])
-
-      f = lambda x: x * y
-      g = lambda z: math_ops.square(x) * y
-
-      fx = cg.custom_gradient(f(x), g(x), x)
-      gx = gradients_impl.gradients(fx, variables.trainable_variables())
-      [x_, fx_, gx_] = sess.run([x, fx, gx[0]])
-      gy_ = gx[1]
-
-      self.assertEqual(x_ * y_, fx_)
-      self.assertEqual(np.square(x_) * y_, gx_)
-      self.assertEqual(None, gy_)
-
-  def test_works_correctly_fx_gx_manually_stopped(self):
-    with self.test_session() as sess:
-      x_ = np.float32(2.1)  # Adding extra tenth to force imprecision.
-      y_ = np.float32(3.1)
-      x = variable_scope.get_variable(
-          name="x",
-          shape=[],
-          dtype=dtypes.float32,
-          initializer=init_ops.constant_initializer(x_))
-      y = variable_scope.get_variable(
-          name="y",
-          shape=[],
-          dtype=dtypes.float32,
-          initializer=init_ops.constant_initializer(y_))
-      sess.run([variables.global_variables_initializer()])
-
-      stop = array_ops.stop_gradient  # For readability.
-
-      # Basically we need to stop the `x` portion of `f`. And when we supply the
-      # arg to `custom_gradient` we need to stop the complement, i.e., the `y`
-      # part.
-      f = lambda x: stop(x) * y
-      g = lambda x: stop(math_ops.square(x)) * y
-      fx = cg.custom_gradient(f(x), g(x), x + stop(y),
-                              fx_gx_manually_stopped=True)
-
-      gx = gradients_impl.gradients(fx, variables.trainable_variables())
-      [x_, fx_, gx_, gy_] = sess.run([x, fx, gx[0], gx[1]])
-
-      self.assertEqual(x_ * y_, fx_)
-      self.assertEqual(np.square(x_) * y_, gx_)
-      self.assertEqual(x_, gy_)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
deleted file mode 100644
index 0a85862abfd744a86b9a38e10dbb5b985d0a0e94..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for halton_sequence.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import halton_sequence as halton
-from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.platform import test
-
-
-mc = monte_carlo_lib
-
-
-class HaltonSequenceTest(test.TestCase):
-
-  def test_known_values_small_bases(self):
-    with self.test_session():
-      # The first five elements of the Halton sequence with base 2 and 3
-      expected = np.array(((1. / 2, 1. / 3),
-                           (1. / 4, 2. / 3),
-                           (3. / 4, 1. / 9),
-                           (1. / 8, 4. / 9),
-                           (5. / 8, 7. / 9)), dtype=np.float32)
-      sample = halton.sample(2, num_samples=5)
-      self.assertAllClose(expected, sample.eval(), rtol=1e-6)
-
-  def test_sample_indices(self):
-    with self.test_session():
-      dim = 5
-      indices = math_ops.range(10, dtype=dtypes.int32)
-      sample_direct = halton.sample(dim, num_samples=10)
-      sample_from_indices = halton.sample(dim, sample_indices=indices)
-      self.assertAllClose(sample_direct.eval(), sample_from_indices.eval(),
-                          rtol=1e-6)
-
-  def test_dtypes_works_correctly(self):
-    with self.test_session():
-      dim = 3
-      sample_float32 = halton.sample(dim, num_samples=10, dtype=dtypes.float32)
-      sample_float64 = halton.sample(dim, num_samples=10, dtype=dtypes.float64)
-      self.assertEqual(sample_float32.eval().dtype, np.float32)
-      self.assertEqual(sample_float64.eval().dtype, np.float64)
-
-  def test_normal_integral_mean_and_var_correctly_estimated(self):
-    n = int(1000)
-    # This test is almost identical to the similarly named test in
-    # monte_carlo_test.py. The only difference is that we use the Halton
-    # samples instead of the random samples to evaluate the expectations.
-    # MC with pseudo random numbers converges at the rate of 1/ Sqrt(N)
-    # (N=number of samples). For QMC in low dimensions, the expected convergence
-    # rate is ~ 1/N. Hence we should only need 1e3 samples as compared to the
-    # 1e6 samples used in the pseudo-random monte carlo.
-    with self.test_session():
-      mu_p = array_ops.constant([-1.0, 1.0], dtype=dtypes.float64)
-      mu_q = array_ops.constant([0.0, 0.0], dtype=dtypes.float64)
-      sigma_p = array_ops.constant([0.5, 0.5], dtype=dtypes.float64)
-      sigma_q = array_ops.constant([1.0, 1.0], dtype=dtypes.float64)
-      p = normal_lib.Normal(loc=mu_p, scale=sigma_p)
-      q = normal_lib.Normal(loc=mu_q, scale=sigma_q)
-
-      cdf_sample = halton.sample(2, num_samples=n, dtype=dtypes.float64)
-      q_sample = q.quantile(cdf_sample)
-
-      # Compute E_p[X].
-      e_x = mc.expectation_importance_sampler(
-          f=lambda x: x, log_p=p.log_prob, sampling_dist_q=q, z=q_sample,
-          seed=42)
-
-      # Compute E_p[X^2].
-      e_x2 = mc.expectation_importance_sampler(
-          f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, z=q_sample,
-          seed=42)
-
-      stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x))
-      # Keep the tolerance levels the same as in monte_carlo_test.py.
-      self.assertEqual(p.batch_shape, e_x.get_shape())
-      self.assertAllClose(p.mean().eval(), e_x.eval(), rtol=0.01)
-      self.assertAllClose(p.stddev().eval(), stddev.eval(), rtol=0.02)
-
-  def test_docstring_example(self):
-    # Produce the first 1000 members of the Halton sequence in 3 dimensions.
-    num_samples = 1000
-    dim = 3
-    with self.test_session():
-      sample = halton.sample(dim, num_samples=num_samples)
-
-      # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
-      # hypercube.
-      powers = math_ops.range(1.0, limit=dim + 1)
-      integral = math_ops.reduce_mean(
-          math_ops.reduce_prod(sample ** powers, axis=-1))
-      true_value = 1.0 / math_ops.reduce_prod(powers + 1.0)
-
-      # Produces a relative absolute error of 1.7%.
-      self.assertAllClose(integral.eval(), true_value.eval(), rtol=0.02)
-
-    # Now skip the first 1000 samples and recompute the integral with the next
-    # thousand samples. The sample_indices argument can be used to do this.
-
-      sample_indices = math_ops.range(start=1000, limit=1000 + num_samples,
-                                      dtype=dtypes.int32)
-      sample_leaped = halton.sample(dim, sample_indices=sample_indices)
-
-      integral_leaped = math_ops.reduce_mean(
-          math_ops.reduce_prod(sample_leaped ** powers, axis=-1))
-      self.assertAllClose(integral_leaped.eval(), true_value.eval(), rtol=0.001)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
deleted file mode 100644
index 5bd834e56245ab4d874544cfd014fe59ae521ea8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ /dev/null
@@ -1,863 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Hamiltonian Monte Carlo."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-import numpy as np
-from scipy import stats
-
-from tensorflow.contrib.bayesflow.python.ops import hmc
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator
-
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_linalg_ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import gamma as gamma_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging_ops
-
-
-def _reduce_variance(x, axis=None, keepdims=False):
-  sample_mean = math_ops.reduce_mean(x, axis, keepdims=True)
-  return math_ops.reduce_mean(
-      math_ops.squared_difference(x, sample_mean), axis, keepdims)
-
-
-class HMCTest(test.TestCase):
-
-  def setUp(self):
-    self._shape_param = 5.
-    self._rate_param = 10.
-
-    random_seed.set_random_seed(10003)
-    np.random.seed(10003)
-
-  def assertAllFinite(self, x):
-    self.assertAllEqual(np.ones_like(x).astype(bool), np.isfinite(x))
-
-  def _log_gamma_log_prob(self, x, event_dims=()):
-    """Computes log-pdf of a log-gamma random variable.
-
-    Args:
-      x: Value of the random variable.
-      event_dims: Dimensions not to treat as independent.
-
-    Returns:
-      log_prob: The log-pdf up to a normalizing constant.
-    """
-    return math_ops.reduce_sum(self._shape_param * x -
-                               self._rate_param * math_ops.exp(x),
-                               event_dims)
-
-  def _integrator_conserves_energy(self, x, independent_chain_ndims, sess,
-                                   feed_dict=None):
-    step_size = array_ops.placeholder(np.float32, [], name="step_size")
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps")
-
-    if feed_dict is None:
-      feed_dict = {}
-    feed_dict[hmc_lf_steps] = 1000
-
-    event_dims = math_ops.range(independent_chain_ndims,
-                                array_ops.rank(x))
-
-    m = random_ops.random_normal(array_ops.shape(x))
-    log_prob_0 = self._log_gamma_log_prob(x, event_dims)
-    grad_0 = gradients_ops.gradients(log_prob_0, x)
-    old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims)
-
-    new_m, _, log_prob_1, _ = _leapfrog_integrator(
-        current_momentums=[m],
-        target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims),
-        current_state_parts=[x],
-        step_sizes=[step_size],
-        num_leapfrog_steps=hmc_lf_steps,
-        current_target_log_prob=log_prob_0,
-        current_grads_target_log_prob=grad_0)
-    new_m = new_m[0]
-
-    new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
-                                                         event_dims)
-
-    x_shape = sess.run(x, feed_dict).shape
-    event_size = np.prod(x_shape[independent_chain_ndims:])
-    feed_dict[step_size] = 0.1 / event_size
-    old_energy_, new_energy_ = sess.run([old_energy, new_energy],
-                                        feed_dict)
-    logging_ops.vlog(1, "average energy relative change: {}".format(
-        (1. - new_energy_ / old_energy_).mean()))
-    self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02)
-
-  def _integrator_conserves_energy_wrapper(self, independent_chain_ndims):
-    """Tests the long-term energy conservation of the leapfrog integrator.
-
-    The leapfrog integrator is symplectic, so for sufficiently small step
-    sizes it should be possible to run it more or less indefinitely without
-    the energy of the system blowing up or collapsing.
-
-    Args:
-      independent_chain_ndims: Python `int` scalar representing the number of
-        dims associated with independent chains.
-    """
-    with self.test_session(graph=ops.Graph()) as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
-      self._integrator_conserves_energy(x_ph, independent_chain_ndims,
-                                        sess, feed_dict)
-
-  def testIntegratorEnergyConservationNullShape(self):
-    self._integrator_conserves_energy_wrapper(0)
-
-  def testIntegratorEnergyConservation1(self):
-    self._integrator_conserves_energy_wrapper(1)
-
-  def testIntegratorEnergyConservation2(self):
-    self._integrator_conserves_energy_wrapper(2)
-
-  def testIntegratorEnergyConservation3(self):
-    self._integrator_conserves_energy_wrapper(3)
-
-  def testSampleChainSeedReproducibleWorksCorrectly(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      num_results = 10
-      independent_chain_ndims = 1
-
-      def log_gamma_log_prob(x):
-        event_dims = math_ops.range(independent_chain_ndims,
-                                    array_ops.rank(x))
-        return self._log_gamma_log_prob(x, event_dims)
-
-      kwargs = dict(
-          target_log_prob_fn=log_gamma_log_prob,
-          current_state=np.random.rand(4, 3, 2),
-          step_size=0.1,
-          num_leapfrog_steps=2,
-          num_burnin_steps=150,
-          seed=52,
-      )
-
-      samples0, kernel_results0 = hmc.sample_chain(
-          **dict(list(kwargs.items()) + list(dict(
-              num_results=2 * num_results,
-              num_steps_between_results=0).items())))
-
-      samples1, kernel_results1 = hmc.sample_chain(
-          **dict(list(kwargs.items()) + list(dict(
-              num_results=num_results,
-              num_steps_between_results=1).items())))
-
-      [
-          samples0_,
-          samples1_,
-          target_log_prob0_,
-          target_log_prob1_,
-      ] = sess.run([
-          samples0,
-          samples1,
-          kernel_results0.current_target_log_prob,
-          kernel_results1.current_target_log_prob,
-      ])
-      self.assertAllClose(samples0_[::2], samples1_,
-                          atol=1e-5, rtol=1e-5)
-      self.assertAllClose(target_log_prob0_[::2], target_log_prob1_,
-                          atol=1e-5, rtol=1e-5)
-
-  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
-                                       sess, feed_dict=None):
-    counter = collections.Counter()
-    def log_gamma_log_prob(x):
-      counter["target_calls"] += 1
-      event_dims = math_ops.range(independent_chain_ndims,
-                                  array_ops.rank(x))
-      return self._log_gamma_log_prob(x, event_dims)
-
-    num_results = array_ops.placeholder(
-        np.int32, [], name="num_results")
-    step_size = array_ops.placeholder(
-        np.float32, [], name="step_size")
-    num_leapfrog_steps = array_ops.placeholder(
-        np.int32, [], name="num_leapfrog_steps")
-
-    if feed_dict is None:
-      feed_dict = {}
-    feed_dict.update({num_results: 150,
-                      step_size: 0.05,
-                      num_leapfrog_steps: 2})
-
-    samples, kernel_results = hmc.sample_chain(
-        num_results=num_results,
-        target_log_prob_fn=log_gamma_log_prob,
-        current_state=x,
-        step_size=step_size,
-        num_leapfrog_steps=num_leapfrog_steps,
-        num_burnin_steps=150,
-        seed=42)
-
-    self.assertAllEqual(dict(target_calls=2), counter)
-
-    expected_x = (math_ops.digamma(self._shape_param)
-                  - np.log(self._rate_param))
-
-    expected_exp_x = self._shape_param / self._rate_param
-
-    acceptance_probs_, samples_, expected_x_ = sess.run(
-        [kernel_results.acceptance_probs, samples, expected_x],
-        feed_dict)
-
-    actual_x = samples_.mean()
-    actual_exp_x = np.exp(samples_).mean()
-
-    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
-        expected_x_, expected_exp_x))
-    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
-        actual_x, actual_exp_x))
-    self.assertNear(actual_x, expected_x_, 2e-2)
-    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
-    self.assertAllEqual(np.ones_like(acceptance_probs_, np.bool),
-                        acceptance_probs_ > 0.5)
-    self.assertAllEqual(np.ones_like(acceptance_probs_, np.bool),
-                        acceptance_probs_ <= 1.)
-
-  def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims):
-    with self.test_session(graph=ops.Graph()) as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
-      self._chain_gets_correct_expectations(x_ph, independent_chain_ndims,
-                                            sess, feed_dict)
-
-  def testHMCChainExpectationsNullShape(self):
-    self._chain_gets_correct_expectations_wrapper(0)
-
-  def testHMCChainExpectations1(self):
-    self._chain_gets_correct_expectations_wrapper(1)
-
-  def testHMCChainExpectations2(self):
-    self._chain_gets_correct_expectations_wrapper(2)
-
-  def testKernelResultsUsingTruncatedDistribution(self):
-    def log_prob(x):
-      return array_ops.where(
-          x >= 0.,
-          -x - x**2,  # Non-constant gradient.
-          array_ops.fill(x.shape, math_ops.cast(-np.inf, x.dtype)))
-    # This log_prob has the property that it is likely to attract
-    # the HMC flow toward, and below, zero...but for x <=0,
-    # log_prob(x) = -inf, which should result in rejection, as well
-    # as a non-finite log_prob.  Thus, this distribution gives us an opportunity
-    # to test out the kernel results ability to correctly capture rejections due
-    # to finite AND non-finite reasons.
-    # Why use a non-constant gradient?  This ensures the leapfrog integrator
-    # will not be exact.
-
-    num_results = 1000
-    # Large step size, will give rejections due to integration error in addition
-    # to rejection due to going into a region of log_prob = -inf.
-    step_size = 0.1
-    num_leapfrog_steps = 5
-    num_chains = 2
-
-    with self.test_session(graph=ops.Graph()) as sess:
-
-      # Start multiple independent chains.
-      initial_state = ops.convert_to_tensor([0.1] * num_chains)
-
-      states, kernel_results = hmc.sample_chain(
-          num_results=num_results,
-          target_log_prob_fn=log_prob,
-          current_state=initial_state,
-          step_size=step_size,
-          num_leapfrog_steps=num_leapfrog_steps,
-          seed=42)
-
-      states_, kernel_results_ = sess.run([states, kernel_results])
-      pstates_ = kernel_results_.proposed_state
-
-      neg_inf_mask = np.isneginf(kernel_results_.proposed_target_log_prob)
-
-      # First:  Test that the mathematical properties of the above log prob
-      # function in conjunction with HMC show up as expected in kernel_results_.
-
-      # We better have log_prob = -inf some of the time.
-      self.assertLess(0, neg_inf_mask.sum())
-      # We better have some rejections due to something other than -inf.
-      self.assertLess(neg_inf_mask.sum(), (~kernel_results_.is_accepted).sum())
-      # We better have been accepted a decent amount, even near the end of the
-      # chain, or else this HMC run just got stuck at some point.
-      self.assertLess(
-          0.1, kernel_results_.is_accepted[int(0.9 * num_results):].mean())
-      # We better not have any NaNs in proposed state or log_prob.
-      # We may have some NaN in grads, which involve multiplication/addition due
-      # to gradient rules.  This is the known "NaN grad issue with tf.where."
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isnan(kernel_results_.proposed_target_log_prob))
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isnan(states_))
-      # We better not have any +inf in states, grads, or log_prob.
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isposinf(kernel_results_.proposed_target_log_prob))
-      self.assertAllEqual(
-          np.zeros_like(states_),
-          np.isposinf(kernel_results_.proposed_grads_target_log_prob[0]))
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isposinf(states_))
-
-      # Second:  Test that kernel_results is congruent with itself and
-      # acceptance/rejection of states.
-
-      # Proposed state is negative iff proposed target log prob is -inf.
-      np.testing.assert_array_less(pstates_[neg_inf_mask], 0.)
-      np.testing.assert_array_less(0., pstates_[~neg_inf_mask])
-
-      # Acceptance probs are zero whenever proposed state is negative.
-      self.assertAllEqual(
-          np.zeros_like(pstates_[neg_inf_mask]),
-          kernel_results_.acceptance_probs[neg_inf_mask])
-
-      # The move is accepted ==> state = proposed state.
-      self.assertAllEqual(
-          states_[kernel_results_.is_accepted],
-          pstates_[kernel_results_.is_accepted],
-      )
-      # The move was rejected <==> state[t] == state[t - 1].
-      for t in range(1, num_results):
-        for i in range(num_chains):
-          if kernel_results_.is_accepted[t, i]:
-            self.assertNotEqual(states_[t, i], states_[t - 1, i])
-          else:
-            self.assertEqual(states_[t, i], states_[t - 1, i])
-
-  def _kernel_leaves_target_invariant(self, initial_draws,
-                                      independent_chain_ndims,
-                                      sess, feed_dict=None):
-    def log_gamma_log_prob(x):
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-      return self._log_gamma_log_prob(x, event_dims)
-
-    def fake_log_prob(x):
-      """Cooled version of the target distribution."""
-      return 1.1 * log_gamma_log_prob(x)
-
-    step_size = array_ops.placeholder(np.float32, [], name="step_size")
-
-    if feed_dict is None:
-      feed_dict = {}
-
-    feed_dict[step_size] = 0.4
-
-    sample, kernel_results = hmc.kernel(
-        target_log_prob_fn=log_gamma_log_prob,
-        current_state=initial_draws,
-        step_size=step_size,
-        num_leapfrog_steps=5,
-        seed=43)
-
-    bad_sample, bad_kernel_results = hmc.kernel(
-        target_log_prob_fn=fake_log_prob,
-        current_state=initial_draws,
-        step_size=step_size,
-        num_leapfrog_steps=5,
-        seed=44)
-
-    [
-        acceptance_probs_,
-        bad_acceptance_probs_,
-        initial_draws_,
-        updated_draws_,
-        fake_draws_,
-    ] = sess.run([
-        kernel_results.acceptance_probs,
-        bad_kernel_results.acceptance_probs,
-        initial_draws,
-        sample,
-        bad_sample,
-    ], feed_dict)
-
-    # Confirm step size is small enough that we usually accept.
-    self.assertGreater(acceptance_probs_.mean(), 0.5)
-    self.assertGreater(bad_acceptance_probs_.mean(), 0.5)
-
-    # Confirm step size is large enough that we sometimes reject.
-    self.assertLess(acceptance_probs_.mean(), 0.99)
-    self.assertLess(bad_acceptance_probs_.mean(), 0.99)
-
-    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
-                                        updated_draws_.flatten())
-    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
-                                        fake_draws_.flatten())
-
-    logging_ops.vlog(1, "acceptance rate for true target: {}".format(
-        acceptance_probs_.mean()))
-    logging_ops.vlog(1, "acceptance rate for fake target: {}".format(
-        bad_acceptance_probs_.mean()))
-    logging_ops.vlog(1, "K-S p-value for true target: {}".format(
-        ks_p_value_true))
-    logging_ops.vlog(1, "K-S p-value for fake target: {}".format(
-        ks_p_value_fake))
-    # Make sure that the MCMC update hasn't changed the empirical CDF much.
-    self.assertGreater(ks_p_value_true, 1e-3)
-    # Confirm that targeting the wrong distribution does
-    # significantly change the empirical CDF.
-    self.assertLess(ks_p_value_fake, 1e-6)
-
-  def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims):
-    """Tests that the kernel leaves the target distribution invariant.
-
-    Draws some independent samples from the target distribution,
-    applies an iteration of the MCMC kernel, then runs a
-    Kolmogorov-Smirnov test to determine if the distribution of the
-    MCMC-updated samples has changed.
-
-    We also confirm that running the kernel with a different log-pdf
-    does change the target distribution. (And that we can detect that.)
-
-    Args:
-      independent_chain_ndims: Python `int` scalar representing the number of
-        dims associated with independent chains.
-    """
-    with self.test_session(graph=ops.Graph()) as sess:
-      initial_draws = np.log(np.random.gamma(self._shape_param,
-                                             size=[50000, 2, 2]))
-      initial_draws -= np.log(self._rate_param)
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-
-      feed_dict = {x_ph: initial_draws}
-
-      self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims,
-                                           sess, feed_dict)
-
-  def testKernelLeavesTargetInvariant1(self):
-    self._kernel_leaves_target_invariant_wrapper(1)
-
-  def testKernelLeavesTargetInvariant2(self):
-    self._kernel_leaves_target_invariant_wrapper(2)
-
-  def testKernelLeavesTargetInvariant3(self):
-    self._kernel_leaves_target_invariant_wrapper(3)
-
-  def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims,
-                                       sess, feed_dict=None):
-    counter = collections.Counter()
-
-    def proposal_log_prob(x):
-      counter["proposal_calls"] += 1
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-      return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
-                                        axis=event_dims)
-
-    def target_log_prob(x):
-      counter["target_calls"] += 1
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-      return self._log_gamma_log_prob(x, event_dims)
-
-    if feed_dict is None:
-      feed_dict = {}
-
-    num_steps = 200
-
-    _, ais_weights, _ = hmc.sample_annealed_importance_chain(
-        proposal_log_prob_fn=proposal_log_prob,
-        num_steps=num_steps,
-        target_log_prob_fn=target_log_prob,
-        step_size=0.5,
-        current_state=init,
-        num_leapfrog_steps=2,
-        seed=45)
-
-    # We have three calls because the calculation of `ais_weights` entails
-    # another call to the `convex_combined_log_prob_fn`. We could refactor
-    # things to avoid this, if needed (eg, b/72994218).
-    self.assertAllEqual(dict(target_calls=3, proposal_calls=3), counter)
-
-    event_shape = array_ops.shape(init)[independent_chain_ndims:]
-    event_size = math_ops.reduce_prod(event_shape)
-
-    log_true_normalizer = (
-        -self._shape_param * math_ops.log(self._rate_param)
-        + math_ops.lgamma(self._shape_param))
-    log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype)
-
-    log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights)
-                                - np.log(num_steps))
-
-    ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer)
-    ais_weights_size = array_ops.size(ais_weights)
-    standard_error = math_ops.sqrt(
-        _reduce_variance(ratio_estimate_true)
-        / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype))
-
-    [
-        ratio_estimate_true_,
-        log_true_normalizer_,
-        log_estimated_normalizer_,
-        standard_error_,
-        ais_weights_size_,
-        event_size_,
-    ] = sess.run([
-        ratio_estimate_true,
-        log_true_normalizer,
-        log_estimated_normalizer,
-        standard_error,
-        ais_weights_size,
-        event_size,
-    ], feed_dict)
-
-    logging_ops.vlog(1, "        log_true_normalizer: {}\n"
-                        "   log_estimated_normalizer: {}\n"
-                        "           ais_weights_size: {}\n"
-                        "                 event_size: {}\n".format(
-                            log_true_normalizer_,
-                            log_estimated_normalizer_,
-                            ais_weights_size_,
-                            event_size_))
-    self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_)
-
-  def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims):
-    """Tests that AIS yields reasonable estimates of normalizers."""
-    with self.test_session(graph=ops.Graph()) as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      initial_draws = np.random.normal(size=[30, 2, 1])
-      self._ais_gets_correct_log_normalizer(
-          x_ph,
-          independent_chain_ndims,
-          sess,
-          feed_dict={x_ph: initial_draws})
-
-  def testAIS1(self):
-    self._ais_gets_correct_log_normalizer_wrapper(1)
-
-  def testAIS2(self):
-    self._ais_gets_correct_log_normalizer_wrapper(2)
-
-  def testAIS3(self):
-    self._ais_gets_correct_log_normalizer_wrapper(3)
-
-  def testSampleAIChainSeedReproducibleWorksCorrectly(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      independent_chain_ndims = 1
-      x = np.random.rand(4, 3, 2)
-
-      def proposal_log_prob(x):
-        event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-        return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
-                                          axis=event_dims)
-
-      def target_log_prob(x):
-        event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-        return self._log_gamma_log_prob(x, event_dims)
-
-      ais_kwargs = dict(
-          proposal_log_prob_fn=proposal_log_prob,
-          num_steps=200,
-          target_log_prob_fn=target_log_prob,
-          step_size=0.5,
-          current_state=x,
-          num_leapfrog_steps=2,
-          seed=53)
-
-      _, ais_weights0, _ = hmc.sample_annealed_importance_chain(
-          **ais_kwargs)
-
-      _, ais_weights1, _ = hmc.sample_annealed_importance_chain(
-          **ais_kwargs)
-
-      [ais_weights0_, ais_weights1_] = sess.run([
-          ais_weights0, ais_weights1])
-
-      self.assertAllClose(ais_weights0_, ais_weights1_,
-                          atol=1e-5, rtol=1e-5)
-
-  def testNanRejection(self):
-    """Tests that an update that yields NaN potentials gets rejected.
-
-    We run HMC with a target distribution that returns NaN
-    log-likelihoods if any element of x < 0, and unit-scale
-    exponential log-likelihoods otherwise. The exponential potential
-    pushes x towards 0, ensuring that any reasonably large update will
-    push us over the edge into NaN territory.
-    """
-    def _unbounded_exponential_log_prob(x):
-      """An exponential distribution with log-likelihood NaN for x < 0."""
-      per_element_potentials = array_ops.where(
-          x < 0.,
-          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
-          -x)
-      return math_ops.reduce_sum(per_element_potentials)
-
-    with self.test_session(graph=ops.Graph()) as sess:
-      initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, kernel_results = hmc.kernel(
-          target_log_prob_fn=_unbounded_exponential_log_prob,
-          current_state=initial_x,
-          step_size=2.,
-          num_leapfrog_steps=5,
-          seed=46)
-      initial_x_, updated_x_, acceptance_probs_ = sess.run(
-          [initial_x, updated_x, kernel_results.acceptance_probs])
-
-      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
-      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
-      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
-
-      self.assertAllEqual(initial_x_, updated_x_)
-      self.assertEqual(acceptance_probs_, 0.)
-
-  def testNanFromGradsDontPropagate(self):
-    """Test that update with NaN gradients does not cause NaN in results."""
-    def _nan_log_prob_with_nan_gradient(x):
-      return np.nan * math_ops.reduce_sum(x)
-
-    with self.test_session(graph=ops.Graph()) as sess:
-      initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, kernel_results = hmc.kernel(
-          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
-          current_state=initial_x,
-          step_size=2.,
-          num_leapfrog_steps=5,
-          seed=47)
-      initial_x_, updated_x_, acceptance_probs_ = sess.run(
-          [initial_x, updated_x, kernel_results.acceptance_probs])
-
-      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
-      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
-      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
-
-      self.assertAllEqual(initial_x_, updated_x_)
-      self.assertEqual(acceptance_probs_, 0.)
-
-      self.assertAllFinite(
-          gradients_ops.gradients(updated_x, initial_x)[0].eval())
-      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
-          kernel_results.proposed_grads_target_log_prob, initial_x)])
-      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
-          kernel_results.proposed_grads_target_log_prob,
-          kernel_results.proposed_state)])
-
-      # Gradients of the acceptance probs and new log prob are not finite.
-      # self.assertAllFinite(
-      #     gradients_ops.gradients(acceptance_probs, initial_x)[0].eval())
-      # self.assertAllFinite(
-      #     gradients_ops.gradients(new_log_prob, initial_x)[0].eval())
-
-  def _testChainWorksDtype(self, dtype):
-    with self.test_session(graph=ops.Graph()) as sess:
-      states, kernel_results = hmc.sample_chain(
-          num_results=10,
-          target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1),
-          current_state=np.zeros(5).astype(dtype),
-          step_size=0.01,
-          num_leapfrog_steps=10,
-          seed=48)
-      states_, acceptance_probs_ = sess.run(
-          [states, kernel_results.acceptance_probs])
-      self.assertEqual(dtype, states_.dtype)
-      self.assertEqual(dtype, acceptance_probs_.dtype)
-
-  def testChainWorksIn64Bit(self):
-    self._testChainWorksDtype(np.float64)
-
-  def testChainWorksIn16Bit(self):
-    self._testChainWorksDtype(np.float16)
-
-  def testChainWorksCorrelatedMultivariate(self):
-    dtype = np.float32
-    true_mean = dtype([0, 0])
-    true_cov = dtype([[1, 0.5],
-                      [0.5, 1]])
-    num_results = 2000
-    counter = collections.Counter()
-    with self.test_session(graph=ops.Graph()) as sess:
-      def target_log_prob(x, y):
-        counter["target_calls"] += 1
-        # Corresponds to unnormalized MVN.
-        # z = matmul(inv(chol(true_cov)), [x, y] - true_mean)
-        z = array_ops.stack([x, y], axis=-1) - true_mean
-        z = array_ops.squeeze(
-            gen_linalg_ops.matrix_triangular_solve(
-                np.linalg.cholesky(true_cov),
-                z[..., array_ops.newaxis]),
-            axis=-1)
-        return -0.5 * math_ops.reduce_sum(z**2., axis=-1)
-      states, _ = hmc.sample_chain(
-          num_results=num_results,
-          target_log_prob_fn=target_log_prob,
-          current_state=[dtype(-2), dtype(2)],
-          step_size=[0.5, 0.5],
-          num_leapfrog_steps=2,
-          num_burnin_steps=200,
-          num_steps_between_results=1,
-          seed=54)
-      self.assertAllEqual(dict(target_calls=2), counter)
-      states = array_ops.stack(states, axis=-1)
-      self.assertEqual(num_results, states.shape[0].value)
-      sample_mean = math_ops.reduce_mean(states, axis=0)
-      x = states - sample_mean
-      sample_cov = math_ops.matmul(x, x, transpose_a=True) / dtype(num_results)
-      [sample_mean_, sample_cov_] = sess.run([
-          sample_mean, sample_cov])
-      self.assertAllClose(true_mean, sample_mean_,
-                          atol=0.05, rtol=0.)
-      self.assertAllClose(true_cov, sample_cov_,
-                          atol=0., rtol=0.1)
-
-
-class _EnergyComputationTest(object):
-
-  def testHandlesNanFromPotential(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      x = [1, np.inf, -np.inf, np.nan]
-      target_log_prob, proposed_target_log_prob = [
-          self.dtype(x.flatten()) for x in np.meshgrid(x, x)]
-      num_chains = len(target_log_prob)
-      dummy_momentums = [-1, 1]
-      momentums = [self.dtype([dummy_momentums] * num_chains)]
-      proposed_momentums = [self.dtype([dummy_momentums] * num_chains)]
-
-      target_log_prob = ops.convert_to_tensor(target_log_prob)
-      momentums = [ops.convert_to_tensor(momentums[0])]
-      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
-      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
-
-      energy = _compute_energy_change(
-          target_log_prob,
-          momentums,
-          proposed_target_log_prob,
-          proposed_momentums,
-          independent_chain_ndims=1)
-      grads = gradients_ops.gradients(energy, momentums)
-
-      [actual_energy, grads_] = sess.run([energy, grads])
-
-      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
-      # finite otherwise.
-      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
-      self.assertAllEqual(expected_energy, actual_energy)
-
-      # Ensure gradient is finite.
-      self.assertAllEqual(np.ones_like(grads_).astype(np.bool),
-                          np.isfinite(grads_))
-
-  def testHandlesNanFromKinetic(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      x = [1, np.inf, -np.inf, np.nan]
-      momentums, proposed_momentums = [
-          [np.reshape(self.dtype(x), [-1, 1])]
-          for x in np.meshgrid(x, x)]
-      num_chains = len(momentums[0])
-      target_log_prob = np.ones(num_chains, self.dtype)
-      proposed_target_log_prob = np.ones(num_chains, self.dtype)
-
-      target_log_prob = ops.convert_to_tensor(target_log_prob)
-      momentums = [ops.convert_to_tensor(momentums[0])]
-      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
-      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
-
-      energy = _compute_energy_change(
-          target_log_prob,
-          momentums,
-          proposed_target_log_prob,
-          proposed_momentums,
-          independent_chain_ndims=1)
-      grads = gradients_ops.gradients(energy, momentums)
-
-      [actual_energy, grads_] = sess.run([energy, grads])
-
-      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
-      # finite otherwise.
-      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
-      self.assertAllEqual(expected_energy, actual_energy)
-
-      # Ensure gradient is finite.
-      g = grads_[0].reshape([len(x), len(x)])[:, 0]
-      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g))
-
-      # The remaining gradients are nan because the momentum was itself nan or
-      # inf.
-      g = grads_[0].reshape([len(x), len(x)])[:, 1:]
-      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g))
-
-
-class EnergyComputationTest16(test.TestCase, _EnergyComputationTest):
-  dtype = np.float16
-
-
-class EnergyComputationTest32(test.TestCase, _EnergyComputationTest):
-  dtype = np.float32
-
-
-class EnergyComputationTest64(test.TestCase, _EnergyComputationTest):
-  dtype = np.float64
-
-
-class _HMCHandlesLists(object):
-
-  def testStateParts(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1))
-      dist_y = independent_lib.Independent(
-          gamma_lib.Gamma(concentration=self.dtype([1, 2]),
-                          rate=self.dtype([0.5, 0.75])),
-          reinterpreted_batch_ndims=1)
-      def target_log_prob(x, y):
-        return dist_x.log_prob(x) + dist_y.log_prob(y)
-      x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)]
-      samples, _ = hmc.sample_chain(
-          num_results=int(2e3),
-          target_log_prob_fn=target_log_prob,
-          current_state=x0,
-          step_size=0.85,
-          num_leapfrog_steps=3,
-          num_burnin_steps=int(250),
-          seed=49)
-      actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples]
-      actual_vars = [_reduce_variance(s, axis=0) for s in samples]
-      expected_means = [dist_x.mean(), dist_y.mean()]
-      expected_vars = [dist_x.variance(), dist_y.variance()]
-      [
-          actual_means_,
-          actual_vars_,
-          expected_means_,
-          expected_vars_,
-      ] = sess.run([
-          actual_means,
-          actual_vars,
-          expected_means,
-          expected_vars,
-      ])
-      self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
-      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.25)
-
-
-class HMCHandlesLists32(_HMCHandlesLists, test.TestCase):
-  dtype = np.float32
-
-
-class HMCHandlesLists64(_HMCHandlesLists, test.TestCase):
-  dtype = np.float64
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py
deleted file mode 100644
index 750afb6654311fea30a1dc6b31b20aa3b4160ae2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py
+++ /dev/null
@@ -1,521 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for convolutional Bayesian layers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import layers_conv_variational as prob_layers_lib
-from tensorflow.contrib.bayesflow.python.ops import layers_util as prob_layers_util
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.ops.distributions import util as distribution_util
-from tensorflow.python.platform import test
-
-
-class Counter(object):
-  """Helper class to manage incrementing a counting `int`."""
-
-  def __init__(self):
-    self._value = -1
-
-  @property
-  def value(self):
-    return self._value
-
-  def __call__(self):
-    self._value += 1
-    return self._value
-
-
-class MockDistribution(independent_lib.Independent):
-  """Monitors layer calls to the underlying distribution."""
-
-  def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
-    self.result_sample = result_sample
-    self.result_log_prob = result_log_prob
-    self.result_loc = loc
-    self.result_scale = scale
-    self.result_distribution = normal_lib.Normal(loc=0.0, scale=1.0)
-    if loc is not None and scale is not None:
-      self.result_distribution = normal_lib.Normal(loc=self.result_loc,
-                                                   scale=self.result_scale)
-    self.called_log_prob = Counter()
-    self.called_sample = Counter()
-    self.called_loc = Counter()
-    self.called_scale = Counter()
-
-  def log_prob(self, *args, **kwargs):
-    self.called_log_prob()
-    return self.result_log_prob
-
-  def sample(self, *args, **kwargs):
-    self.called_sample()
-    return self.result_sample
-
-  @property
-  def distribution(self):  # for dummy check on Independent(Normal)
-    return self.result_distribution
-
-  @property
-  def loc(self):
-    self.called_loc()
-    return self.result_loc
-
-  @property
-  def scale(self):
-    self.called_scale()
-    return self.result_scale
-
-
-class MockKLDivergence(object):
-  """Monitors layer calls to the divergence implementation."""
-
-  def __init__(self, result):
-    self.result = result
-    self.args = []
-    self.called = Counter()
-
-  def __call__(self, *args, **kwargs):
-    self.called()
-    self.args.append(args)
-    return self.result
-
-
-class ConvVariational(test.TestCase):
-
-  def _testKLPenaltyKernel(self, layer_class):
-    with self.test_session():
-      layer = layer_class(filters=2, kernel_size=3)
-      if layer_class in (prob_layers_lib.Conv1DReparameterization,
-                         prob_layers_lib.Conv1DFlipout):
-        inputs = random_ops.random_uniform([2, 3, 1], seed=1)
-      elif layer_class in (prob_layers_lib.Conv2DReparameterization,
-                           prob_layers_lib.Conv2DFlipout):
-        inputs = random_ops.random_uniform([2, 3, 3, 1], seed=1)
-      elif layer_class in (prob_layers_lib.Conv3DReparameterization,
-                           prob_layers_lib.Conv3DFlipout):
-        inputs = random_ops.random_uniform([2, 3, 3, 3, 1], seed=1)
-
-      # No keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 0)
-      self.assertListEqual(layer.losses, losses)
-
-      _ = layer(inputs)
-
-      # Yes keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 1)
-      self.assertListEqual(layer.losses, losses)
-
-  def _testKLPenaltyBoth(self, layer_class):
-    def _make_normal(dtype, *args):  # pylint: disable=unused-argument
-      return normal_lib.Normal(
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
-    with self.test_session():
-      layer = layer_class(
-          filters=2,
-          kernel_size=3,
-          bias_posterior_fn=prob_layers_util.default_mean_field_normal_fn(),
-          bias_prior_fn=_make_normal)
-      if layer_class in (prob_layers_lib.Conv1DReparameterization,
-                         prob_layers_lib.Conv1DFlipout):
-        inputs = random_ops.random_uniform([2, 3, 1], seed=1)
-      elif layer_class in (prob_layers_lib.Conv2DReparameterization,
-                           prob_layers_lib.Conv2DFlipout):
-        inputs = random_ops.random_uniform([2, 3, 3, 1], seed=1)
-      elif layer_class in (prob_layers_lib.Conv3DReparameterization,
-                           prob_layers_lib.Conv3DFlipout):
-        inputs = random_ops.random_uniform([2, 3, 3, 3, 1], seed=1)
-
-      # No keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 0)
-      self.assertListEqual(layer.losses, losses)
-
-      _ = layer(inputs)
-
-      # Yes keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 2)
-      self.assertListEqual(layer.losses, losses)
-
-  def _testConvSetUp(self, layer_class, batch_size, depth=None,
-                     height=None, width=None, channels=None, filters=None,
-                     **kwargs):
-    seed = Counter()
-    if layer_class in (prob_layers_lib.Conv1DReparameterization,
-                       prob_layers_lib.Conv1DFlipout):
-      inputs = random_ops.random_uniform(
-          [batch_size, width, channels], seed=seed())
-      kernel_size = (2,)
-    elif layer_class in (prob_layers_lib.Conv2DReparameterization,
-                         prob_layers_lib.Conv2DFlipout):
-      inputs = random_ops.random_uniform(
-          [batch_size, height, width, channels], seed=seed())
-      kernel_size = (2, 2)
-    elif layer_class in (prob_layers_lib.Conv3DReparameterization,
-                         prob_layers_lib.Conv3DFlipout):
-      inputs = random_ops.random_uniform(
-          [batch_size, depth, height, width, channels], seed=seed())
-      kernel_size = (2, 2, 2)
-
-    kernel_shape = kernel_size + (channels, filters)
-    kernel_posterior = MockDistribution(
-        loc=random_ops.random_uniform(kernel_shape, seed=seed()),
-        scale=random_ops.random_uniform(kernel_shape, seed=seed()),
-        result_log_prob=random_ops.random_uniform(kernel_shape, seed=seed()),
-        result_sample=random_ops.random_uniform(kernel_shape, seed=seed()))
-    kernel_prior = MockDistribution(
-        result_log_prob=random_ops.random_uniform(kernel_shape, seed=seed()),
-        result_sample=random_ops.random_uniform(kernel_shape, seed=seed()))
-    kernel_divergence = MockKLDivergence(
-        result=random_ops.random_uniform(kernel_shape, seed=seed()))
-
-    bias_size = (filters,)
-    bias_posterior = MockDistribution(
-        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-    bias_prior = MockDistribution(
-        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-    bias_divergence = MockKLDivergence(
-        result=random_ops.random_uniform(bias_size, seed=seed()))
-
-    layer = layer_class(
-        filters=filters,
-        kernel_size=kernel_size,
-        padding="SAME",
-        kernel_posterior_fn=lambda *args: kernel_posterior,
-        kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-        kernel_prior_fn=lambda *args: kernel_prior,
-        kernel_divergence_fn=kernel_divergence,
-        bias_posterior_fn=lambda *args: bias_posterior,
-        bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-        bias_prior_fn=lambda *args: bias_prior,
-        bias_divergence_fn=bias_divergence,
-        **kwargs)
-
-    outputs = layer(inputs)
-
-    kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-    return (kernel_posterior, kernel_prior, kernel_divergence,
-            bias_posterior, bias_prior, bias_divergence,
-            layer, inputs, outputs, kl_penalty, kernel_shape)
-
-  def _testConvReparameterization(self, layer_class):
-    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
-    with self.test_session() as sess:
-      (kernel_posterior, kernel_prior, kernel_divergence,
-       bias_posterior, bias_prior, bias_divergence, layer, inputs,
-       outputs, kl_penalty, kernel_shape) = self._testConvSetUp(
-           layer_class, batch_size,
-           depth=depth, height=height, width=width, channels=channels,
-           filters=filters)
-
-      convolution_op = nn_ops.Convolution(
-          tensor_shape.TensorShape(inputs.shape),
-          filter_shape=tensor_shape.TensorShape(kernel_shape),
-          padding="SAME")
-      expected_outputs = convolution_op(inputs, kernel_posterior.result_sample)
-      expected_outputs = nn.bias_add(expected_outputs,
-                                     bias_posterior.result_sample,
-                                     data_format="NHWC")
-
-      [
-          expected_outputs_, actual_outputs_,
-          expected_kernel_, actual_kernel_,
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          expected_bias_, actual_bias_,
-          expected_bias_divergence_, actual_bias_divergence_,
-      ] = sess.run([
-          expected_outputs, outputs,
-          kernel_posterior.result_sample, layer.kernel_posterior_tensor,
-          kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, layer.bias_posterior_tensor,
-          bias_divergence.result, kl_penalty[1],
-      ])
-
-      self.assertAllClose(
-          expected_kernel_, actual_kernel_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_, actual_bias_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_outputs_, actual_outputs_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_divergence_, actual_bias_divergence_,
-          rtol=1e-6, atol=0.)
-
-      self.assertAllEqual(
-          [[kernel_posterior.distribution,
-            kernel_prior.distribution,
-            kernel_posterior.result_sample]],
-          kernel_divergence.args)
-
-      self.assertAllEqual(
-          [[bias_posterior.distribution,
-            bias_prior.distribution,
-            bias_posterior.result_sample]],
-          bias_divergence.args)
-
-  def _testConvFlipout(self, layer_class):
-    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
-    with self.test_session() as sess:
-      (kernel_posterior, kernel_prior, kernel_divergence,
-       bias_posterior, bias_prior, bias_divergence, layer, inputs,
-       outputs, kl_penalty, kernel_shape) = self._testConvSetUp(
-           layer_class, batch_size,
-           depth=depth, height=height, width=width, channels=channels,
-           filters=filters, seed=44)
-
-      convolution_op = nn_ops.Convolution(
-          tensor_shape.TensorShape(inputs.shape),
-          filter_shape=tensor_shape.TensorShape(kernel_shape),
-          padding="SAME")
-
-      expected_kernel_posterior_affine = normal_lib.Normal(
-          loc=array_ops.zeros_like(kernel_posterior.result_loc),
-          scale=kernel_posterior.result_scale)
-      expected_kernel_posterior_affine_tensor = (
-          expected_kernel_posterior_affine.sample(seed=42))
-
-      expected_outputs = convolution_op(
-          inputs, kernel_posterior.distribution.loc)
-
-      input_shape = array_ops.shape(inputs)
-      output_shape = array_ops.shape(expected_outputs)
-      batch_shape = array_ops.expand_dims(input_shape[0], 0)
-      channels = input_shape[-1]
-      rank = len(inputs.get_shape()) - 2
-
-      sign_input = random_ops.random_uniform(
-          array_ops.concat([batch_shape,
-                            array_ops.expand_dims(channels, 0)], 0),
-          minval=0,
-          maxval=2,
-          dtype=dtypes.int32,
-          seed=layer.seed)
-      sign_input = math_ops.cast(2 * sign_input - 1, inputs.dtype)
-      sign_output = random_ops.random_uniform(
-          array_ops.concat([batch_shape,
-                            array_ops.expand_dims(filters, 0)], 0),
-          minval=0,
-          maxval=2,
-          dtype=dtypes.int32,
-          seed=distribution_util.gen_new_seed(
-              layer.seed, salt="conv_flipout"))
-      sign_output = math_ops.cast(2 * sign_output - 1, inputs.dtype)
-      for _ in range(rank):
-        sign_input = array_ops.expand_dims(sign_input, 1)  # 2D ex: (B, 1, 1, C)
-        sign_output = array_ops.expand_dims(sign_output, 1)
-
-      sign_input = array_ops.tile(  # tile for element-wise op broadcasting
-          sign_input,
-          [1] + [input_shape[i + 1] for i in range(rank)] + [1])
-      sign_output = array_ops.tile(
-          sign_output,
-          [1] + [output_shape[i + 1] for i in range(rank)] + [1])
-
-      perturbed_inputs = convolution_op(
-          inputs * sign_input, expected_kernel_posterior_affine_tensor)
-      perturbed_inputs *= sign_output
-
-      expected_outputs += perturbed_inputs
-      expected_outputs = nn.bias_add(expected_outputs,
-                                     bias_posterior.result_sample,
-                                     data_format="NHWC")
-
-      [
-          expected_outputs_, actual_outputs_,
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          expected_bias_, actual_bias_,
-          expected_bias_divergence_, actual_bias_divergence_,
-      ] = sess.run([
-          expected_outputs, outputs,
-          kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, layer.bias_posterior_tensor,
-          bias_divergence.result, kl_penalty[1],
-      ])
-
-      self.assertAllClose(
-          expected_bias_, actual_bias_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_outputs_, actual_outputs_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_divergence_, actual_bias_divergence_,
-          rtol=1e-6, atol=0.)
-
-      self.assertAllEqual(
-          [[kernel_posterior.distribution, kernel_prior.distribution, None]],
-          kernel_divergence.args)
-
-      self.assertAllEqual(
-          [[bias_posterior.distribution,
-            bias_prior.distribution,
-            bias_posterior.result_sample]],
-          bias_divergence.args)
-
-  def _testRandomConvFlipout(self, layer_class):
-    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
-    with self.test_session() as sess:
-      seed = Counter()
-      if layer_class in (prob_layers_lib.Conv1DReparameterization,
-                         prob_layers_lib.Conv1DFlipout):
-        inputs = random_ops.random_uniform(
-            [batch_size, width, channels], seed=seed())
-        kernel_size = (2,)
-      elif layer_class in (prob_layers_lib.Conv2DReparameterization,
-                           prob_layers_lib.Conv2DFlipout):
-        inputs = random_ops.random_uniform(
-            [batch_size, height, width, channels], seed=seed())
-        kernel_size = (2, 2)
-      elif layer_class in (prob_layers_lib.Conv3DReparameterization,
-                           prob_layers_lib.Conv3DFlipout):
-        inputs = random_ops.random_uniform(
-            [batch_size, depth, height, width, channels], seed=seed())
-        kernel_size = (2, 2, 2)
-
-      kernel_shape = kernel_size + (channels, filters)
-      bias_size = (filters,)
-
-      kernel_posterior = MockDistribution(
-          loc=random_ops.random_uniform(
-              kernel_shape, seed=seed()),
-          scale=random_ops.random_uniform(
-              kernel_shape, seed=seed()),
-          result_log_prob=random_ops.random_uniform(
-              kernel_shape, seed=seed()),
-          result_sample=random_ops.random_uniform(
-              kernel_shape, seed=seed()))
-      bias_posterior = MockDistribution(
-          loc=random_ops.random_uniform(
-              bias_size, seed=seed()),
-          scale=random_ops.random_uniform(
-              bias_size, seed=seed()),
-          result_log_prob=random_ops.random_uniform(
-              bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(
-              bias_size, seed=seed()))
-      layer_one = layer_class(
-          filters=filters,
-          kernel_size=kernel_size,
-          padding="SAME",
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          seed=44)
-      layer_two = layer_class(
-          filters=filters,
-          kernel_size=kernel_size,
-          padding="SAME",
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          seed=45)
-
-      outputs_one = layer_one(inputs)
-      outputs_two = layer_two(inputs)
-
-      outputs_one_, outputs_two_ = sess.run([
-          outputs_one, outputs_two])
-
-      self.assertLess(np.sum(np.isclose(outputs_one_, outputs_two_)),
-                      np.prod(outputs_one_.shape))
-
-  def testKLPenaltyKernelConv1DReparameterization(self):
-    self._testKLPenaltyKernel(prob_layers_lib.Conv1DReparameterization)
-
-  def testKLPenaltyKernelConv2DReparameterization(self):
-    self._testKLPenaltyKernel(prob_layers_lib.Conv2DReparameterization)
-
-  def testKLPenaltyKernelConv3DReparameterization(self):
-    self._testKLPenaltyKernel(prob_layers_lib.Conv3DReparameterization)
-
-  def testKLPenaltyKernelConv1DFlipout(self):
-    self._testKLPenaltyKernel(prob_layers_lib.Conv1DFlipout)
-
-  def testKLPenaltyKernelConv2DFlipout(self):
-    self._testKLPenaltyKernel(prob_layers_lib.Conv2DFlipout)
-
-  def testKLPenaltyKernelConv3DFlipout(self):
-    self._testKLPenaltyKernel(prob_layers_lib.Conv3DFlipout)
-
-  def testKLPenaltyBothConv1DReparameterization(self):
-    self._testKLPenaltyBoth(prob_layers_lib.Conv1DReparameterization)
-
-  def testKLPenaltyBothConv2DReparameterization(self):
-    self._testKLPenaltyBoth(prob_layers_lib.Conv2DReparameterization)
-
-  def testKLPenaltyBothConv3DReparameterization(self):
-    self._testKLPenaltyBoth(prob_layers_lib.Conv3DReparameterization)
-
-  def testKLPenaltyBothConv1DFlipout(self):
-    self._testKLPenaltyBoth(prob_layers_lib.Conv1DFlipout)
-
-  def testKLPenaltyBothConv2DFlipout(self):
-    self._testKLPenaltyBoth(prob_layers_lib.Conv2DFlipout)
-
-  def testKLPenaltyBothConv3DFlipout(self):
-    self._testKLPenaltyBoth(prob_layers_lib.Conv3DFlipout)
-
-  def testConv1DReparameterization(self):
-    self._testConvReparameterization(prob_layers_lib.Conv1DReparameterization)
-
-  def testConv2DReparameterization(self):
-    self._testConvReparameterization(prob_layers_lib.Conv2DReparameterization)
-
-  def testConv3DReparameterization(self):
-    self._testConvReparameterization(prob_layers_lib.Conv3DReparameterization)
-
-  def testConv1DFlipout(self):
-    self._testConvFlipout(prob_layers_lib.Conv1DFlipout)
-
-  def testConv2DFlipout(self):
-    self._testConvFlipout(prob_layers_lib.Conv2DFlipout)
-
-  def testConv3DFlipout(self):
-    self._testConvFlipout(prob_layers_lib.Conv3DFlipout)
-
-  def testRandomConv1DFlipout(self):
-    self._testRandomConvFlipout(prob_layers_lib.Conv1DFlipout)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
deleted file mode 100644
index 342f38ccec7ec74db1b393d6cdc22300205cc547..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
+++ /dev/null
@@ -1,443 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for dense Bayesian layers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import layers_dense_variational as prob_layers_lib
-from tensorflow.contrib.bayesflow.python.ops import layers_util as prob_layers_util
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.ops.distributions import util as distribution_util
-from tensorflow.python.platform import test
-
-
-class Counter(object):
-  """Helper class to manage incrementing a counting `int`."""
-
-  def __init__(self):
-    self._value = -1
-
-  @property
-  def value(self):
-    return self._value
-
-  def __call__(self):
-    self._value += 1
-    return self._value
-
-
-class MockDistribution(independent_lib.Independent):
-  """Monitors layer calls to the underlying distribution."""
-
-  def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
-    self.result_sample = result_sample
-    self.result_log_prob = result_log_prob
-    self.result_loc = loc
-    self.result_scale = scale
-    self.result_distribution = normal_lib.Normal(loc=0.0, scale=1.0)
-    if loc is not None and scale is not None:
-      self.result_distribution = normal_lib.Normal(loc=self.result_loc,
-                                                   scale=self.result_scale)
-    self.called_log_prob = Counter()
-    self.called_sample = Counter()
-    self.called_loc = Counter()
-    self.called_scale = Counter()
-
-  def log_prob(self, *args, **kwargs):
-    self.called_log_prob()
-    return self.result_log_prob
-
-  def sample(self, *args, **kwargs):
-    self.called_sample()
-    return self.result_sample
-
-  @property
-  def distribution(self):  # for dummy check on Independent(Normal)
-    return self.result_distribution
-
-  @property
-  def loc(self):
-    self.called_loc()
-    return self.result_loc
-
-  @property
-  def scale(self):
-    self.called_scale()
-    return self.result_scale
-
-
-class MockKLDivergence(object):
-  """Monitors layer calls to the divergence implementation."""
-
-  def __init__(self, result):
-    self.result = result
-    self.args = []
-    self.called = Counter()
-
-  def __call__(self, *args, **kwargs):
-    self.called()
-    self.args.append(args)
-    return self.result
-
-
-class DenseVariational(test.TestCase):
-
-  def _testKLPenaltyKernel(self, layer_class):
-    with self.test_session():
-      layer = layer_class(units=2)
-      inputs = random_ops.random_uniform([2, 3], seed=1)
-
-      # No keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 0)
-      self.assertListEqual(layer.losses, losses)
-
-      _ = layer(inputs)
-
-      # Yes keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 1)
-      self.assertListEqual(layer.losses, losses)
-
-  def _testKLPenaltyBoth(self, layer_class):
-    def _make_normal(dtype, *args):  # pylint: disable=unused-argument
-      return normal_lib.Normal(
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
-    with self.test_session():
-      layer = layer_class(
-          units=2,
-          bias_posterior_fn=prob_layers_util.default_mean_field_normal_fn(),
-          bias_prior_fn=_make_normal)
-      inputs = random_ops.random_uniform([2, 3], seed=1)
-
-      # No keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 0)
-      self.assertListEqual(layer.losses, losses)
-
-      _ = layer(inputs)
-
-      # Yes keys.
-      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(losses), 2)
-      self.assertListEqual(layer.losses, losses)
-
-  def _testDenseSetUp(self, layer_class, batch_size, in_size, out_size,
-                      **kwargs):
-    seed = Counter()
-    inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
-
-    kernel_size = [in_size, out_size]
-    kernel_posterior = MockDistribution(
-        loc=random_ops.random_uniform(kernel_size, seed=seed()),
-        scale=random_ops.random_uniform(kernel_size, seed=seed()),
-        result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-        result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-    kernel_prior = MockDistribution(
-        result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-        result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-    kernel_divergence = MockKLDivergence(
-        result=random_ops.random_uniform(kernel_size, seed=seed()))
-
-    bias_size = [out_size]
-    bias_posterior = MockDistribution(
-        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-    bias_prior = MockDistribution(
-        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-    bias_divergence = MockKLDivergence(
-        result=random_ops.random_uniform(bias_size, seed=seed()))
-
-    layer = layer_class(
-        units=out_size,
-        kernel_posterior_fn=lambda *args: kernel_posterior,
-        kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-        kernel_prior_fn=lambda *args: kernel_prior,
-        kernel_divergence_fn=kernel_divergence,
-        bias_posterior_fn=lambda *args: bias_posterior,
-        bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-        bias_prior_fn=lambda *args: bias_prior,
-        bias_divergence_fn=bias_divergence,
-        **kwargs)
-
-    outputs = layer(inputs)
-
-    kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-    return (kernel_posterior, kernel_prior, kernel_divergence,
-            bias_posterior, bias_prior, bias_divergence,
-            layer, inputs, outputs, kl_penalty)
-
-  def testKLPenaltyKernelReparameterization(self):
-    self._testKLPenaltyKernel(prob_layers_lib.DenseReparameterization)
-
-  def testKLPenaltyKernelLocalReparameterization(self):
-    self._testKLPenaltyKernel(prob_layers_lib.DenseLocalReparameterization)
-
-  def testKLPenaltyKernelFlipout(self):
-    self._testKLPenaltyKernel(prob_layers_lib.DenseFlipout)
-
-  def testKLPenaltyBothReparameterization(self):
-    self._testKLPenaltyBoth(prob_layers_lib.DenseReparameterization)
-
-  def testKLPenaltyBothLocalReparameterization(self):
-    self._testKLPenaltyBoth(prob_layers_lib.DenseLocalReparameterization)
-
-  def testKLPenaltyBothFlipout(self):
-    self._testKLPenaltyBoth(prob_layers_lib.DenseFlipout)
-
-  def testDenseReparameterization(self):
-    batch_size, in_size, out_size = 2, 3, 4
-    with self.test_session() as sess:
-      (kernel_posterior, kernel_prior, kernel_divergence,
-       bias_posterior, bias_prior, bias_divergence, layer, inputs,
-       outputs, kl_penalty) = self._testDenseSetUp(
-           prob_layers_lib.DenseReparameterization,
-           batch_size, in_size, out_size)
-
-      expected_outputs = (
-          math_ops.matmul(inputs, kernel_posterior.result_sample) +
-          bias_posterior.result_sample)
-
-      [
-          expected_outputs_, actual_outputs_,
-          expected_kernel_, actual_kernel_,
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          expected_bias_, actual_bias_,
-          expected_bias_divergence_, actual_bias_divergence_,
-      ] = sess.run([
-          expected_outputs, outputs,
-          kernel_posterior.result_sample, layer.kernel_posterior_tensor,
-          kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, layer.bias_posterior_tensor,
-          bias_divergence.result, kl_penalty[1],
-      ])
-
-      self.assertAllClose(
-          expected_kernel_, actual_kernel_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_, actual_bias_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_outputs_, actual_outputs_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_divergence_, actual_bias_divergence_,
-          rtol=1e-6, atol=0.)
-
-      self.assertAllEqual(
-          [[kernel_posterior.distribution,
-            kernel_prior.distribution,
-            kernel_posterior.result_sample]],
-          kernel_divergence.args)
-
-      self.assertAllEqual(
-          [[bias_posterior.distribution,
-            bias_prior.distribution,
-            bias_posterior.result_sample]],
-          bias_divergence.args)
-
-  def testDenseLocalReparameterization(self):
-    batch_size, in_size, out_size = 2, 3, 4
-    with self.test_session() as sess:
-      (kernel_posterior, kernel_prior, kernel_divergence,
-       bias_posterior, bias_prior, bias_divergence, layer, inputs,
-       outputs, kl_penalty) = self._testDenseSetUp(
-           prob_layers_lib.DenseLocalReparameterization,
-           batch_size, in_size, out_size)
-
-      expected_kernel_posterior_affine = normal_lib.Normal(
-          loc=math_ops.matmul(inputs, kernel_posterior.result_loc),
-          scale=math_ops.matmul(
-              inputs**2., kernel_posterior.result_scale**2)**0.5)
-      expected_kernel_posterior_affine_tensor = (
-          expected_kernel_posterior_affine.sample(seed=42))
-      expected_outputs = (expected_kernel_posterior_affine_tensor +
-                          bias_posterior.result_sample)
-
-      [
-          expected_outputs_, actual_outputs_,
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          expected_bias_, actual_bias_,
-          expected_bias_divergence_, actual_bias_divergence_,
-      ] = sess.run([
-          expected_outputs, outputs,
-          kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, layer.bias_posterior_tensor,
-          bias_divergence.result, kl_penalty[1],
-      ])
-
-      self.assertAllClose(
-          expected_bias_, actual_bias_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_outputs_, actual_outputs_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_divergence_, actual_bias_divergence_,
-          rtol=1e-6, atol=0.)
-
-      self.assertAllEqual(
-          [[kernel_posterior.distribution,
-            kernel_prior.distribution,
-            None]],
-          kernel_divergence.args)
-
-      self.assertAllEqual(
-          [[bias_posterior.distribution,
-            bias_prior.distribution,
-            bias_posterior.result_sample]],
-          bias_divergence.args)
-
-  def testDenseFlipout(self):
-    batch_size, in_size, out_size = 2, 3, 4
-    with self.test_session() as sess:
-      (kernel_posterior, kernel_prior, kernel_divergence,
-       bias_posterior, bias_prior, bias_divergence, layer, inputs,
-       outputs, kl_penalty) = self._testDenseSetUp(
-           prob_layers_lib.DenseFlipout,
-           batch_size, in_size, out_size, seed=44)
-
-      expected_kernel_posterior_affine = normal_lib.Normal(
-          loc=array_ops.zeros_like(kernel_posterior.result_loc),
-          scale=kernel_posterior.result_scale)
-      expected_kernel_posterior_affine_tensor = (
-          expected_kernel_posterior_affine.sample(seed=42))
-
-      sign_input = random_ops.random_uniform(
-          [batch_size, in_size],
-          minval=0,
-          maxval=2,
-          dtype=dtypes.int32,
-          seed=layer.seed)
-      sign_input = math_ops.cast(2 * sign_input - 1, inputs.dtype)
-      sign_output = random_ops.random_uniform(
-          [batch_size, out_size],
-          minval=0,
-          maxval=2,
-          dtype=dtypes.int32,
-          seed=distribution_util.gen_new_seed(
-              layer.seed, salt="dense_flipout"))
-      sign_output = math_ops.cast(2 * sign_output - 1, inputs.dtype)
-      perturbed_inputs = math_ops.matmul(
-          inputs * sign_input, expected_kernel_posterior_affine_tensor)
-      perturbed_inputs *= sign_output
-
-      expected_outputs = math_ops.matmul(inputs, kernel_posterior.result_loc)
-      expected_outputs += perturbed_inputs
-      expected_outputs += bias_posterior.result_sample
-
-      [
-          expected_outputs_, actual_outputs_,
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          expected_bias_, actual_bias_,
-          expected_bias_divergence_, actual_bias_divergence_,
-      ] = sess.run([
-          expected_outputs, outputs,
-          kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, layer.bias_posterior_tensor,
-          bias_divergence.result, kl_penalty[1],
-      ])
-
-      self.assertAllClose(
-          expected_bias_, actual_bias_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_outputs_, actual_outputs_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_kernel_divergence_, actual_kernel_divergence_,
-          rtol=1e-6, atol=0.)
-      self.assertAllClose(
-          expected_bias_divergence_, actual_bias_divergence_,
-          rtol=1e-6, atol=0.)
-
-      self.assertAllEqual(
-          [[kernel_posterior.distribution, kernel_prior.distribution, None]],
-          kernel_divergence.args)
-
-      self.assertAllEqual(
-          [[bias_posterior.distribution,
-            bias_prior.distribution,
-            bias_posterior.result_sample]],
-          bias_divergence.args)
-
-  def testRandomDenseFlipout(self):
-    batch_size, in_size, out_size = 2, 3, 4
-    with self.test_session() as sess:
-      seed = Counter()
-      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
-
-      kernel_posterior = MockDistribution(
-          loc=random_ops.random_uniform(
-              [in_size, out_size], seed=seed()),
-          scale=random_ops.random_uniform(
-              [in_size, out_size], seed=seed()),
-          result_log_prob=random_ops.random_uniform(
-              [in_size, out_size], seed=seed()),
-          result_sample=random_ops.random_uniform(
-              [in_size, out_size], seed=seed()))
-      bias_posterior = MockDistribution(
-          loc=random_ops.random_uniform(
-              [out_size], seed=seed()),
-          scale=random_ops.random_uniform(
-              [out_size], seed=seed()),
-          result_log_prob=random_ops.random_uniform(
-              [out_size], seed=seed()),
-          result_sample=random_ops.random_uniform(
-              [out_size], seed=seed()))
-      layer_one = prob_layers_lib.DenseFlipout(
-          units=out_size,
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          seed=44)
-      layer_two = prob_layers_lib.DenseFlipout(
-          units=out_size,
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          seed=45)
-
-      outputs_one = layer_one(inputs)
-      outputs_two = layer_two(inputs)
-
-      outputs_one_, outputs_two_ = sess.run([
-          outputs_one, outputs_two])
-
-      self.assertLess(np.sum(np.isclose(outputs_one_, outputs_two_)), out_size)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
deleted file mode 100644
index 52e36e135d95c1ec919c710f35d59073c2134d05..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
+++ /dev/null
@@ -1,445 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for MCMC diagnostic utilities."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics_impl as mcmc_diagnostics
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import spectral_ops_test_util
-from tensorflow.python.platform import test
-
-rng = np.random.RandomState(42)
-
-
-class _EffectiveSampleSizeTest(object):
-
-  @property
-  def use_static_shape(self):
-    raise NotImplementedError(
-        "Subclass failed to implement `use_static_shape`.")
-
-  def _check_versus_expected_effective_sample_size(self,
-                                                   x_,
-                                                   expected_ess,
-                                                   sess,
-                                                   atol=1e-2,
-                                                   rtol=1e-2,
-                                                   filter_threshold=None,
-                                                   filter_beyond_lag=None):
-    x = array_ops.placeholder_with_default(
-        input=x_, shape=x_.shape if self.use_static_shape else None)
-    ess = mcmc_diagnostics.effective_sample_size(
-        x,
-        filter_threshold=filter_threshold,
-        filter_beyond_lag=filter_beyond_lag)
-    if self.use_static_shape:
-      self.assertAllEqual(x.shape[1:], ess.shape)
-
-    ess_ = sess.run(ess)
-
-    self.assertAllClose(
-        np.ones_like(ess_) * expected_ess, ess_, atol=atol, rtol=rtol)
-
-  def testIidRank1NormalHasFullEssMaxLags10(self):
-    # With a length 5000 iid normal sequence, and filter_beyond_lag = 10, we
-    # should have a good estimate of ESS, and it should be close to the full
-    # sequence length of 5000.
-    # The choice of filter_beyond_lag = 10 is a short cutoff, reasonable only
-    # since we know the correlation length should be zero right away.
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        self._check_versus_expected_effective_sample_size(
-            x_=rng.randn(5000).astype(np.float32),
-            expected_ess=5000,
-            sess=sess,
-            filter_beyond_lag=10,
-            filter_threshold=None,
-            rtol=0.3)
-
-  def testIidRank2NormalHasFullEssMaxLags10(self):
-    # See similar test for Rank1Normal for reasoning.
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        self._check_versus_expected_effective_sample_size(
-            x_=rng.randn(5000, 2).astype(np.float32),
-            expected_ess=5000,
-            sess=sess,
-            filter_beyond_lag=10,
-            filter_threshold=None,
-            rtol=0.3)
-
-  def testIidRank1NormalHasFullEssMaxLagThresholdZero(self):
-    # With a length 5000 iid normal sequence, and filter_threshold = 0,
-    # we should have a super-duper estimate of ESS, and it should be very close
-    # to the full sequence length of 5000.
-    # The choice of filter_beyond_lag = 0 means we cutoff as soon as the
-    # auto-corris below zero.  This should happen very quickly, due to the fact
-    # that the theoretical auto-corr is [1, 0, 0,...]
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        self._check_versus_expected_effective_sample_size(
-            x_=rng.randn(5000).astype(np.float32),
-            expected_ess=5000,
-            sess=sess,
-            filter_beyond_lag=None,
-            filter_threshold=0.,
-            rtol=0.1)
-
-  def testIidRank2NormalHasFullEssMaxLagThresholdZero(self):
-    # See similar test for Rank1Normal for reasoning.
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        self._check_versus_expected_effective_sample_size(
-            x_=rng.randn(5000, 2).astype(np.float32),
-            expected_ess=5000,
-            sess=sess,
-            filter_beyond_lag=None,
-            filter_threshold=0.,
-            rtol=0.1)
-
-  def testLength10CorrelationHasEssOneTenthTotalLengthUsingMaxLags50(self):
-    # Create x_, such that
-    #   x_[i] = iid_x_[0], i = 0,...,9
-    #   x_[i] = iid_x_[1], i = 10,..., 19,
-    #   and so on.
-    iid_x_ = rng.randn(5000, 1).astype(np.float32)
-    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        self._check_versus_expected_effective_sample_size(
-            x_=x_,
-            expected_ess=50000 // 10,
-            sess=sess,
-            filter_beyond_lag=50,
-            filter_threshold=None,
-            rtol=0.2)
-
-  def testLength10CorrelationHasEssOneTenthTotalLengthUsingMaxLagsThresholdZero(
-      self):
-    # Create x_, such that
-    #   x_[i] = iid_x_[0], i = 0,...,9
-    #   x_[i] = iid_x_[1], i = 10,..., 19,
-    #   and so on.
-    iid_x_ = rng.randn(5000, 1).astype(np.float32)
-    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        self._check_versus_expected_effective_sample_size(
-            x_=x_,
-            expected_ess=50000 // 10,
-            sess=sess,
-            filter_beyond_lag=None,
-            filter_threshold=0.,
-            rtol=0.1)
-
-  def testListArgs(self):
-    # x_ has correlation length 10 ==> ESS = N / 10
-    # y_ has correlation length 1  ==> ESS = N
-    iid_x_ = rng.randn(5000, 1).astype(np.float32)
-    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
-    y_ = rng.randn(50000).astype(np.float32)
-    states = [x_, x_, y_, y_]
-    filter_threshold = [0., None, 0., None]
-    filter_beyond_lag = [None, 5, None, 5]
-
-    # See other tests for reasoning on tolerance.
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        ess = mcmc_diagnostics.effective_sample_size(
-            states,
-            filter_threshold=filter_threshold,
-            filter_beyond_lag=filter_beyond_lag)
-        ess_ = sess.run(ess)
-    self.assertAllEqual(4, len(ess_))
-
-    self.assertAllClose(50000 // 10, ess_[0], rtol=0.3)
-    self.assertAllClose(50000 // 10, ess_[1], rtol=0.3)
-    self.assertAllClose(50000, ess_[2], rtol=0.1)
-    self.assertAllClose(50000, ess_[3], rtol=0.1)
-
-  def testMaxLagsThresholdLessThanNeg1SameAsNone(self):
-    # Setting both means we filter out items R_k from the auto-correlation
-    # sequence if k > filter_beyond_lag OR k >= j where R_j < filter_threshold.
-
-    # x_ has correlation length 10.
-    iid_x_ = rng.randn(500, 1).astype(np.float32)
-    x_ = (iid_x_ * np.ones((500, 10)).astype(np.float32)).reshape((5000,))
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        x = array_ops.placeholder_with_default(
-            input=x_, shape=x_.shape if self.use_static_shape else None)
-
-        ess_none_none = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=None, filter_beyond_lag=None)
-        ess_none_200 = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=None, filter_beyond_lag=200)
-        ess_neg2_200 = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=-2., filter_beyond_lag=200)
-        ess_neg2_none = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=-2., filter_beyond_lag=None)
-        ess_none_none_, ess_none_200_, ess_neg2_200_, ess_neg2_none_ = sess.run(
-            [ess_none_none, ess_none_200, ess_neg2_200, ess_neg2_none])
-
-        # filter_threshold=-2 <==> filter_threshold=None.
-        self.assertAllClose(ess_none_none_, ess_neg2_none_)
-        self.assertAllClose(ess_none_200_, ess_neg2_200_)
-
-  def testMaxLagsArgsAddInAnOrManner(self):
-    # Setting both means we filter out items R_k from the auto-correlation
-    # sequence if k > filter_beyond_lag OR k >= j where R_j < filter_threshold.
-
-    # x_ has correlation length 10.
-    iid_x_ = rng.randn(500, 1).astype(np.float32)
-    x_ = (iid_x_ * np.ones((500, 10)).astype(np.float32)).reshape((5000,))
-    with self.test_session() as sess:
-      with spectral_ops_test_util.fft_kernel_label_map():
-        x = array_ops.placeholder_with_default(
-            input=x_, shape=x_.shape if self.use_static_shape else None)
-
-        ess_1_9 = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=1., filter_beyond_lag=9)
-        ess_1_none = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=1., filter_beyond_lag=None)
-        ess_none_9 = mcmc_diagnostics.effective_sample_size(
-            x, filter_threshold=1., filter_beyond_lag=9)
-        ess_1_9_, ess_1_none_, ess_none_9_ = sess.run(
-            [ess_1_9, ess_1_none, ess_none_9])
-
-        # Since R_k = 1 for k < 10, and R_k < 1 for k >= 10,
-        # filter_threshold = 1 <==> filter_beyond_lag = 9.
-        self.assertAllClose(ess_1_9_, ess_1_none_)
-        self.assertAllClose(ess_1_9_, ess_none_9_)
-
-
-class EffectiveSampleSizeStaticTest(test.TestCase, _EffectiveSampleSizeTest):
-
-  @property
-  def use_static_shape(self):
-    return True
-
-
-class EffectiveSampleSizeDynamicTest(test.TestCase, _EffectiveSampleSizeTest):
-
-  @property
-  def use_static_shape(self):
-    return False
-
-
-class _PotentialScaleReductionTest(object):
-
-  @property
-  def use_static_shape(self):
-    raise NotImplementedError(
-        "Subclass failed to impliment `use_static_shape`.")
-
-  def testListOfStatesWhereFirstPassesSecondFails(self):
-    """Simple test showing API with two states.  Read first!."""
-    n_samples = 1000
-
-    # state_0 is two scalar chains taken from iid Normal(0, 1).  Will pass.
-    state_0 = rng.randn(n_samples, 2)
-
-    # state_1 is three 4-variate chains taken from Normal(0, 1) that have been
-    # shifted.  Since every chain is shifted, they are not the same, and the
-    # test should fail.
-    offset = np.array([1., -1., 2.]).reshape(3, 1)
-    state_1 = rng.randn(n_samples, 3, 4) + offset
-
-    rhat = mcmc_diagnostics.potential_scale_reduction(
-        chains_states=[state_0, state_1], independent_chain_ndims=1)
-
-    self.assertIsInstance(rhat, list)
-    with self.test_session() as sess:
-      rhat_0_, rhat_1_ = sess.run(rhat)
-
-    # r_hat_0 should be close to 1, meaning test is passed.
-    self.assertAllEqual((), rhat_0_.shape)
-    self.assertAllClose(1., rhat_0_, rtol=0.02)
-
-    # r_hat_1 should be greater than 1.2, meaning test has failed.
-    self.assertAllEqual((4,), rhat_1_.shape)
-    self.assertAllEqual(np.ones_like(rhat_1_).astype(bool), rhat_1_ > 1.2)
-
-  def check_results(self, state_, independent_chain_shape, should_pass):
-    sample_ndims = 1
-    independent_chain_ndims = len(independent_chain_shape)
-    with self.test_session():
-      state = array_ops.placeholder_with_default(
-          input=state_, shape=state_.shape if self.use_static_shape else None)
-
-      rhat = mcmc_diagnostics.potential_scale_reduction(
-          state, independent_chain_ndims=independent_chain_ndims)
-
-      if self.use_static_shape:
-        self.assertAllEqual(
-            state_.shape[sample_ndims + independent_chain_ndims:], rhat.shape)
-
-      rhat_ = rhat.eval()
-      if should_pass:
-        self.assertAllClose(np.ones_like(rhat_), rhat_, atol=0, rtol=0.02)
-      else:
-        self.assertAllEqual(np.ones_like(rhat_).astype(bool), rhat_ > 1.2)
-
-  def iid_normal_chains_should_pass_wrapper(self,
-                                            sample_shape,
-                                            independent_chain_shape,
-                                            other_shape,
-                                            dtype=np.float32):
-    """Check results with iid normal chains."""
-
-    state_shape = sample_shape + independent_chain_shape + other_shape
-    state_ = rng.randn(*state_shape).astype(dtype)
-
-    # The "other" dimensions do not have to be identical, just independent, so
-    # force them to not be identical.
-    if other_shape:
-      state_ *= rng.rand(*other_shape).astype(dtype)
-
-    self.check_results(state_, independent_chain_shape, should_pass=True)
-
-  def testPassingIIDNdimsAreIndependentOneOtherZero(self):
-    self.iid_normal_chains_should_pass_wrapper(
-        sample_shape=[10000], independent_chain_shape=[4], other_shape=[])
-
-  def testPassingIIDNdimsAreIndependentOneOtherOne(self):
-    self.iid_normal_chains_should_pass_wrapper(
-        sample_shape=[10000], independent_chain_shape=[3], other_shape=[7])
-
-  def testPassingIIDNdimsAreIndependentOneOtherTwo(self):
-    self.iid_normal_chains_should_pass_wrapper(
-        sample_shape=[10000], independent_chain_shape=[2], other_shape=[5, 7])
-
-  def testPassingIIDNdimsAreIndependentTwoOtherTwo64Bit(self):
-    self.iid_normal_chains_should_pass_wrapper(
-        sample_shape=[10000],
-        independent_chain_shape=[2, 3],
-        other_shape=[5, 7],
-        dtype=np.float64)
-
-  def offset_normal_chains_should_fail_wrapper(
-      self, sample_shape, independent_chain_shape, other_shape):
-    """Check results with normal chains that are offset from each other."""
-
-    state_shape = sample_shape + independent_chain_shape + other_shape
-    state_ = rng.randn(*state_shape)
-
-    # Add a significant offset to the different (formerly iid) chains.
-    offset = np.linspace(
-        0, 2, num=np.prod(independent_chain_shape)).reshape([1] * len(
-            sample_shape) + independent_chain_shape + [1] * len(other_shape))
-    state_ += offset
-
-    self.check_results(state_, independent_chain_shape, should_pass=False)
-
-  def testFailingOffsetNdimsAreSampleOneIndependentOneOtherOne(self):
-    self.offset_normal_chains_should_fail_wrapper(
-        sample_shape=[10000], independent_chain_shape=[2], other_shape=[5])
-
-
-class PotentialScaleReductionStaticTest(test.TestCase,
-                                        _PotentialScaleReductionTest):
-
-  @property
-  def use_static_shape(self):
-    return True
-
-  def testIndependentNdimsLessThanOneRaises(self):
-    with self.assertRaisesRegexp(ValueError, "independent_chain_ndims"):
-      mcmc_diagnostics.potential_scale_reduction(
-          rng.rand(2, 3, 4), independent_chain_ndims=0)
-
-
-class PotentialScaleReductionDynamicTest(test.TestCase,
-                                         _PotentialScaleReductionTest):
-
-  @property
-  def use_static_shape(self):
-    return False
-
-
-class _ReduceVarianceTest(object):
-
-  @property
-  def use_static_shape(self):
-    raise NotImplementedError(
-        "Subclass failed to impliment `use_static_shape`.")
-
-  def check_versus_numpy(self, x_, axis, biased, keepdims):
-    with self.test_session():
-      x_ = np.asarray(x_)
-      x = array_ops.placeholder_with_default(
-          input=x_, shape=x_.shape if self.use_static_shape else None)
-      var = mcmc_diagnostics._reduce_variance(
-          x, axis=axis, biased=biased, keepdims=keepdims)
-      np_var = np.var(x_, axis=axis, ddof=0 if biased else 1, keepdims=keepdims)
-
-      if self.use_static_shape:
-        self.assertAllEqual(np_var.shape, var.shape)
-
-      var_ = var.eval()
-      # We will mask below, which changes shape, so check shape explicitly here.
-      self.assertAllEqual(np_var.shape, var_.shape)
-
-      # We get NaN when we divide by zero due to the size being the same as ddof
-      nan_mask = np.isnan(np_var)
-      if nan_mask.any():
-        self.assertTrue(np.isnan(var_[nan_mask]).all())
-      self.assertAllClose(np_var[~nan_mask], var_[~nan_mask], atol=0, rtol=0.02)
-
-  def testScalarBiasedTrue(self):
-    self.check_versus_numpy(x_=-1.234, axis=None, biased=True, keepdims=False)
-
-  def testScalarBiasedFalse(self):
-    # This should result in NaN.
-    self.check_versus_numpy(x_=-1.234, axis=None, biased=False, keepdims=False)
-
-  def testShape2x3x4AxisNoneBiasedFalseKeepdimsFalse(self):
-    self.check_versus_numpy(
-        x_=rng.randn(2, 3, 4), axis=None, biased=True, keepdims=False)
-
-  def testShape2x3x4Axis1BiasedFalseKeepdimsTrue(self):
-    self.check_versus_numpy(
-        x_=rng.randn(2, 3, 4), axis=1, biased=True, keepdims=True)
-
-  def testShape2x3x4x5Axis13BiasedFalseKeepdimsTrue(self):
-    self.check_versus_numpy(
-        x_=rng.randn(2, 3, 4, 5), axis=1, biased=True, keepdims=True)
-
-  def testShape2x3x4x5Axis13BiasedFalseKeepdimsFalse(self):
-    self.check_versus_numpy(
-        x_=rng.randn(2, 3, 4, 5), axis=1, biased=False, keepdims=False)
-
-
-class ReduceVarianceTestStaticShape(test.TestCase, _ReduceVarianceTest):
-
-  @property
-  def use_static_shape(self):
-    return True
-
-
-class ReduceVarianceTestDynamicShape(test.TestCase, _ReduceVarianceTest):
-
-  @property
-  def use_static_shape(self):
-    return False
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/metropolis_hastings_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/metropolis_hastings_test.py
deleted file mode 100644
index 63d93fad64d077aa385b72428665e841b6784b90..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/metropolis_hastings_test.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for metropolis_hastings.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings_impl as mh
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class McmcStepTest(test.TestCase):
-
-  def test_density_increasing_step_accepted(self):
-    """Tests that if a transition increases density, it is always accepted."""
-    target_log_density = lambda x: - x * x
-    state = variable_scope.get_variable('state', initializer=10.)
-    state_log_density = variable_scope.get_variable(
-        'state_log_density',
-        initializer=target_log_density(state.initialized_value()))
-    log_accept_ratio = variable_scope.get_variable(
-        'log_accept_ratio', initializer=0.)
-
-    get_next_proposal = lambda x: (x - 1., None)
-    step = mh.evolve(state, state_log_density, log_accept_ratio,
-                     target_log_density, get_next_proposal, seed=1234)
-    init = variables.initialize_all_variables()
-    with self.test_session() as sess:
-      sess.run(init)
-      for j in range(9):
-        sess.run(step)
-        sample = sess.run(state)
-        sample_log_density = sess.run(state_log_density)
-        self.assertAlmostEqual(sample, 9 - j)
-        self.assertAlmostEqual(sample_log_density, - (9 - j) * (9 - j))
-
-  def test_sample_properties(self):
-    """Tests that the samples converge to the target distribution."""
-
-    def target_log_density(x):
-      """Log-density corresponding to a normal distribution with mean = 4."""
-      return - (x - 2.0) * (x - 2.0) * 0.5
-
-    # Use the uniform random walker to generate proposals.
-    proposal_fn = mh.uniform_random_proposal(
-        step_size=1.0, seed=1234)
-
-    state = variable_scope.get_variable('state', initializer=0.0)
-    state_log_density = variable_scope.get_variable(
-        'state_log_density',
-        initializer=target_log_density(state.initialized_value()))
-
-    log_accept_ratio = variable_scope.get_variable(
-        'log_accept_ratio', initializer=0.)
-    # Random walk MCMC converges slowly so need to put in enough iterations.
-    num_iterations = 5000
-    step = mh.evolve(state, state_log_density, log_accept_ratio,
-                     target_log_density, proposal_fn, seed=4321)
-
-    init = variables.global_variables_initializer()
-
-    sample_sum, sample_sq_sum = 0.0, 0.0
-    with self.test_session() as sess:
-      sess.run(init)
-      for _ in np.arange(num_iterations):
-        # Allow for the mixing of the chain and discard these samples.
-        sess.run(step)
-      for _ in np.arange(num_iterations):
-        sess.run(step)
-        sample = sess.run(state)
-        sample_sum += sample
-        sample_sq_sum += sample * sample
-
-    sample_mean = sample_sum / num_iterations
-    sample_variance = sample_sq_sum / num_iterations - sample_mean * sample_mean
-    # The samples have large autocorrelation which reduces the effective sample
-    # size.
-    self.assertAlmostEqual(sample_mean, 2.0, delta=0.1)
-    self.assertAlmostEqual(sample_variance, 1.0, delta=0.1)
-
-  def test_normal_proposals(self):
-    """Tests that the normal proposals are correctly distributed."""
-
-    initial_points = array_ops.ones([10000], dtype=dtypes.float32)
-    proposal_fn = mh.normal_random_proposal(
-        scale=2.0, seed=1234)
-    proposal_points, _ = proposal_fn(initial_points)
-
-    with self.test_session() as sess:
-      sample = sess.run(proposal_points)
-
-    # It is expected that the elements in proposal_points have the same mean as
-    # initial_points and have the standard deviation that was supplied to the
-    # proposal scheme.
-    self.assertAlmostEqual(np.mean(sample), 1.0, delta=0.1)
-    self.assertAlmostEqual(np.std(sample), 2.0, delta=0.1)
-
-  def test_docstring_example(self):
-    """Tests the simplified docstring example with multiple chains."""
-
-    n = 2  # dimension of the problem
-
-    # Generate 300 initial values randomly. Each of these would be an
-    # independent starting point for a Markov chain.
-    state = variable_scope.get_variable(
-        'state', initializer=random_ops.random_normal(
-            [300, n], mean=3.0, dtype=dtypes.float32, seed=42))
-
-    # Computes the log(p(x)) for the unit normal density and ignores the
-    # normalization constant.
-    def log_density(x):
-      return  - math_ops.reduce_sum(x * x, reduction_indices=-1) / 2.0
-
-    # Initial log-density value
-    state_log_density = variable_scope.get_variable(
-        'state_log_density',
-        initializer=log_density(state.initialized_value()))
-
-    # A variable to store the log_acceptance_ratio:
-    log_acceptance_ratio = variable_scope.get_variable(
-        'log_acceptance_ratio',
-        initializer=array_ops.zeros([300], dtype=dtypes.float32))
-
-    # Generates random proposals by moving each coordinate uniformly and
-    # independently in a box of size 2 centered around the current value.
-    # Returns the new point and also the log of the Hastings ratio (the
-    # ratio of the probability of going from the proposal to origin and the
-    # probability of the reverse transition). When this ratio is 1, the value
-    # may be omitted and replaced by None.
-    def random_proposal(x):
-      return (x + random_ops.random_uniform(
-          array_ops.shape(x), minval=-1, maxval=1,
-          dtype=x.dtype, seed=12)), None
-
-    #  Create the op to propagate the chain for 100 steps.
-    stepper = mh.evolve(
-        state, state_log_density, log_acceptance_ratio,
-        log_density, random_proposal, n_steps=100, seed=123)
-    init = variables.initialize_all_variables()
-    with self.test_session() as sess:
-      sess.run(init)
-      # Run the chains for a total of 1000 steps.
-      for _ in range(10):
-        sess.run(stepper)
-      samples = sess.run(state)
-      covariance = np.eye(n)
-      # Verify that the estimated mean and covariance are close to the true
-      # values.
-      self.assertAlmostEqual(
-          np.max(np.abs(np.mean(samples, 0)
-                        - np.zeros(n))), 0,
-          delta=0.1)
-      self.assertAlmostEqual(
-          np.max(np.abs(np.reshape(np.cov(samples, rowvar=False), [n**2])
-                        - np.reshape(covariance, [n**2]))), 0,
-          delta=0.2)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
deleted file mode 100644
index 756c25683bd4b0c8c77e9e28485ca2a85582999c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functional test for GradientDescent."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import math
-from tensorflow.contrib.bayesflow.python.ops.optimizers import SGLDOptimizer
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class SGLDOptimizerTest(test.TestCase):
-
-  def testBasic(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        decay_rate = 0.53
-        sgd_optimizer = SGLDOptimizer(3.0, preconditioner_decay_rate=decay_rate)
-        sgd_op = sgd_optimizer.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
-                                              (1 - decay_rate) * 0.1**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
-        grads_scaled = (0.5 * 0.01 / math.sqrt(
-            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
-        self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
-
-  def testBasicMultiInstance(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        vara = variables.Variable([1.1, 2.1], dtype=dtype)
-        varb = variables.Variable([3.0, 4.0], dtype=dtype)
-        gradsa = constant_op.constant([0.1, 0.1], dtype=dtype)
-        gradsb = constant_op.constant([0.01, 0.01], dtype=dtype)
-        decay_rate = 0.5
-        sgd_optimizer = SGLDOptimizer(3.0, preconditioner_decay_rate=decay_rate)
-        sgd_op = sgd_optimizer.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        sgd_optimizer2 = SGLDOptimizer(
-            3.0, preconditioner_decay_rate=decay_rate)
-        sgd_op2 = sgd_optimizer2.apply_gradients(
-            zip([gradsa, gradsb], [vara, varb]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        self.assertAllCloseAccordingToType([1.1, 2.1], vara.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], varb.eval())
-
-        # Run 1 step of sgd
-        sgd_op.run()
-        sgd_op2.run()
-        # Validate updated params
-        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
-                                              (1 - decay_rate) * 0.1**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
-        self.assertAllCloseAccordingToType(
-            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], vara.eval())
-
-        grads_scaled = (0.5 * 0.01 / math.sqrt(
-            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], varb.eval())
-        self.assertNotEqual(sgd_optimizer.variable_scope,
-                            sgd_optimizer2.variable_scope)
-        self.assertNotEqual(sgd_optimizer.variable_scope.name,
-                            sgd_optimizer2.variable_scope.name)
-        self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
-        self.assertAllCloseAccordingToType(1, sgd_optimizer2._counter.eval())
-
-  def testTensorLearningRate(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        lrate = constant_op.constant(3.0)
-        decay_rate = 0.5
-        sgd_op = SGLDOptimizer(
-            lrate, preconditioner_decay_rate=constant_op.constant(
-                decay_rate)).apply_gradients(
-                    zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
-                                              (1 - decay_rate) * 0.1**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
-        grads_scaled = (0.5 * 0.01 / math.sqrt(
-            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
-
-  def testGradWrtRef(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        opt = SGLDOptimizer(3.0)
-        values = [1.0, 3.0]
-        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
-        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
-        variables.global_variables_initializer().run()
-        for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], grad.eval())
-
-  def testWithGlobalStep(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        global_step = variables.Variable(0, trainable=False)
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        decay_rate = 0.1
-        sgd_op = SGLDOptimizer(
-            3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
-                zip([grads0, grads1], [var0, var1]), global_step=global_step)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-
-        # Validate updated params and global_step
-        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
-                                              (1 - decay_rate) * 0.1**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
-        grads_scaled = (0.5 * 0.01 / math.sqrt(
-            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
-        self.assertAllCloseAccordingToType(1, global_step.eval())
-
-  def testSparseBasic(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([[1.1], [2.1]], dtype=dtype)
-        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
-        grads0 = ops.IndexedSlices(
-            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
-            constant_op.constant([0]), constant_op.constant([2, 1]))
-        grads1 = ops.IndexedSlices(
-            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
-            constant_op.constant([1]), constant_op.constant([2, 1]))
-        decay_rate = 0.9
-        sgd_op = SGLDOptimizer(
-            3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
-                zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.1], [2.1]], var0.eval())
-        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
-                                              (1 - decay_rate) * 0.1**2 + 1e-8))
-        self.assertAllCloseAccordingToType([[1.1 - 3.0 * grads_scaled], [2.1]],
-                                           var0.eval())
-        grads_scaled = (0.5 * 0.01 / math.sqrt(
-            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
-        self.assertAllCloseAccordingToType(
-            [[3.0 - 3.0 * 0], [4.0 - 3.0 * grads_scaled]], var1.eval())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
deleted file mode 100644
index f978cf86417dc5ff5412a3eee584330a266e0964..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for utility functions related to managing `tf.Variable`s."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import warnings
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import variable_utils
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import variable_scope as varscope_ops
-from tensorflow.python.ops import variables as variables_ops
-from tensorflow.python.platform import test
-
-
-def test_fn(x):
-  x = ops.convert_to_tensor(x, name="x")
-  dtype = x.dtype.as_numpy_dtype
-  s = x.shape.as_list()
-  z = varscope_ops.get_variable(
-      name="z",
-      dtype=dtype,
-      initializer=np.arange(np.prod(s)).reshape(s).astype(dtype))
-  y = varscope_ops.get_variable(
-      name="y",
-      dtype=dtype,
-      initializer=np.arange(np.prod(s)).reshape(s).astype(dtype)**2)
-  return x + y + z
-
-
-class _WrapCallableTest(object):
-
-  def testDefaultArgsWorkCorrectly(self):
-    with self.test_session():
-      x = constant_op.constant(self.dtype([0.1, 0.2]))
-      wrapped_fn, vars_args = variable_utils.externalize_variables_as_args(
-          test_fn, [x])
-
-      varscope_ops.get_variable_scope().reuse_variables()
-
-      result = wrapped_fn(self.dtype(2), [3, 4, 5], 0.5)
-
-      y_actual = varscope_ops.get_variable("y", dtype=self.dtype)
-      z_actual = varscope_ops.get_variable("z", dtype=self.dtype)
-
-      variables_ops.global_variables_initializer().run()
-      result_ = result.eval()
-
-      self.assertEqual(self.dtype, result_.dtype)
-      self.assertAllEqual([5.5, 6.5, 7.5], result_)
-      self.assertAllEqual([y_actual, z_actual], vars_args)
-
-  def testNonDefaultArgsWorkCorrectly(self):
-    with self.test_session():
-      x = constant_op.constant(self.dtype([0.1, 0.2]))
-
-      _ = test_fn(self.dtype([0., 0.]))   # Needed to create vars.
-      varscope_ops.get_variable_scope().reuse_variables()
-
-      y_actual = varscope_ops.get_variable("y", dtype=self.dtype)
-
-      wrapped_fn, vars_args = variable_utils.externalize_variables_as_args(
-          test_fn, [x], possible_ancestor_vars=[y_actual])
-
-      result = wrapped_fn(self.dtype([2, 3]), 0.5)  # x, y
-
-      variables_ops.global_variables_initializer().run()
-      result_ = result.eval()
-
-      self.assertEqual(self.dtype, result_.dtype)
-      self.assertAllEqual([2.5, 4.5], result_)
-      self.assertAllEqual([y_actual], vars_args)
-
-  def testWarnings(self):
-    with self.test_session():
-      x = constant_op.constant(self.dtype([0.1, 0.2]))
-      wrapped_fn, _ = variable_utils.externalize_variables_as_args(
-          test_fn, [x], possible_ancestor_vars=[])
-      varscope_ops.get_variable_scope().reuse_variables()
-      with warnings.catch_warnings(record=True) as w:
-        wrapped_fn(self.dtype(2))
-      w = sorted(w, key=lambda w: str(w.message))
-      self.assertEqual(2, len(w))
-      self.assertRegexpMatches(
-          str(w[0].message),
-          r"Variable .* 'y:0' .* not found in bypass dict.")
-      self.assertRegexpMatches(
-          str(w[1].message),
-          r"Variable .* 'z:0' .* not found in bypass dict.")
-
-  def testExceptions(self):
-    with self.test_session():
-      x = constant_op.constant(self.dtype([0.1, 0.2]))
-      wrapped_fn, _ = variable_utils.externalize_variables_as_args(
-          test_fn,
-          [x],
-          possible_ancestor_vars=[],
-          assert_variable_override=True)
-      varscope_ops.get_variable_scope().reuse_variables()
-      with self.assertRaisesRegexp(ValueError, r"not found"):
-        wrapped_fn(self.dtype(2))
-
-
-class WrapCallableTest16(test.TestCase, _WrapCallableTest):
-  dtype = np.float16
-
-
-class WrapCallableTest32(test.TestCase, _WrapCallableTest):
-  dtype = np.float32
-
-
-class WrapCallableTest64(test.TestCase, _WrapCallableTest):
-  dtype = np.float64
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py
deleted file mode 100644
index 83c64dbe0fd586edcb784a5c09a4c133aaa99cff..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functional test for GradientDescent."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from tensorflow.contrib.bayesflow.python.ops.optimizers import VariationalSGDOptimizer
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class VariationalSGDOptimizerTest(test.TestCase):
-
-  def testBasic(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        decay_rate = 0.53
-        sgd_op = VariationalSGDOptimizer(
-            1,
-            1,
-            preconditioner_decay_rate=decay_rate,
-            max_learning_rate=3.0,
-            burnin_max_learning_rate=3.0,
-            use_single_learning_rate=True).apply_gradients(
-                zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-
-  def testBasicMultiInstance(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        vara = variables.Variable([1.1, 2.1], dtype=dtype)
-        varb = variables.Variable([3.0, 4.0], dtype=dtype)
-        gradsa = constant_op.constant([0.1, 0.1], dtype=dtype)
-        gradsb = constant_op.constant([0.01, 0.01], dtype=dtype)
-        decay_rate = 0.5
-        batch_size = 2
-        total_num_examples = 10
-        optimizer = VariationalSGDOptimizer(
-            batch_size,
-            total_num_examples,
-            max_learning_rate=1.0,
-            burnin_max_learning_rate=3.0,
-            preconditioner_decay_rate=decay_rate)
-        sgd_op = optimizer.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        optimizer2 = VariationalSGDOptimizer(
-            batch_size,
-            total_num_examples,
-            max_learning_rate=1.0,
-            burnin_max_learning_rate=10.0,
-            burnin=0,
-            preconditioner_decay_rate=decay_rate)
-        sgd_op2 = optimizer2.apply_gradients(
-            zip([gradsa, gradsb], [vara, varb]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        self.assertAllCloseAccordingToType([1.1, 2.1], vara.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], varb.eval())
-
-        # Run 1 step of sgd
-        sgd_op.run()
-        sgd_op2.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.1 - 3. * 0.1, 2.1 - 3. * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([1.1 - 0.1, 2.1 - 0.1], vara.eval())
-
-        self.assertAllCloseAccordingToType([3.0 - 3. * 0.01, 4.0 - 3. * 0.01],
-                                           var1.eval())
-        self.assertAllCloseAccordingToType([3.0 - 0.01, 4.0 - 0.01],
-                                           varb.eval())
-        self.assertNotEqual(optimizer.variable_scope,
-                            optimizer2.variable_scope)
-        self.assertNotEqual(optimizer.variable_scope.name,
-                            optimizer2.variable_scope.name)
-        self.assertAllCloseAccordingToType(1, optimizer._counter.eval())
-        self.assertAllCloseAccordingToType(1, optimizer2._counter.eval())
-
-  def testTensorLearningRate(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        lrate = constant_op.constant(3.0)
-        decay_rate = 0.5
-        batch_size = 2
-        total_num_examples = 10
-        sgd_op = VariationalSGDOptimizer(
-            batch_size,
-            total_num_examples,
-            max_learning_rate=lrate,
-            burnin=0,
-            preconditioner_decay_rate=decay_rate).apply_gradients(
-                zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-
-  def testTensorDecayLearningRate(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        lrate = variables.Variable(3.0)
-        lrate_decay_op = lrate.assign_add(-3.)
-        decay_rate = 0.5
-        batch_size = 2
-        total_num_examples = 10
-        optimizer = VariationalSGDOptimizer(
-            batch_size,
-            total_num_examples,
-            max_learning_rate=lrate,
-            burnin=0,
-            preconditioner_decay_rate=decay_rate)
-        sgd_op = optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        # Update learning rate to 0
-        lrate_decay_op.eval()
-        sgd_op.run()
-        # Validate params haven't changed
-        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        lrate_decay_op.eval()
-
-        with self.assertRaises(errors.InvalidArgumentError):
-          sgd_op.run()
-
-  def testGradWrtRef(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        opt = VariationalSGDOptimizer(1, 1, max_learning_rate=1.0)
-        values = [1.0, 3.0]
-        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
-        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
-        variables.global_variables_initializer().run()
-        for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], grad.eval())
-
-  def testWithGlobalStep(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        global_step = variables.Variable(0, trainable=False)
-        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        decay_rate = 0.1
-        batch_size = 2
-        total_num_examples = 10
-        sgd_optimizer = VariationalSGDOptimizer(
-            batch_size,
-            total_num_examples,
-            max_learning_rate=3.0,
-            burnin=0,
-            preconditioner_decay_rate=decay_rate)
-        sgd_op = sgd_optimizer.apply_gradients(
-            zip([grads0, grads1], [var0, var1]), global_step=global_step)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-
-        # Validate updated params and global_step
-        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        self.assertAllCloseAccordingToType(1, global_step.eval())
-        self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
-
-  def testSparseBasic(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = variables.Variable([[1.1], [2.1]], dtype=dtype)
-        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
-        grads0 = ops.IndexedSlices(
-            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
-            constant_op.constant([0]), constant_op.constant([2, 1]))
-        grads1 = ops.IndexedSlices(
-            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
-            constant_op.constant([1]), constant_op.constant([2, 1]))
-        decay_rate = 0.1
-        batch_size = 2
-        total_num_examples = 10
-        sgd_op = VariationalSGDOptimizer(
-            batch_size,
-            total_num_examples,
-            max_learning_rate=3.0,
-            burnin=0,
-            preconditioner_decay_rate=decay_rate).apply_gradients(
-                zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.1], [2.1]], var0.eval())
-        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType([[1.1 - 3.0 * 0.1], [2.1]],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType(
-            [[3.0 - 3.0 * 0], [4.0 - 3.0 * 0.01]], var1.eval())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
deleted file mode 100644
index 8efd59d6516924bea538717d45bb4ae303583421..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
+++ /dev/null
@@ -1,1105 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Csiszar f-Divergence and helpers.
-
-@@amari_alpha
-@@arithmetic_geometric
-@@chi_square
-@@csiszar_vimco
-@@dual_csiszar_function
-@@jeffreys
-@@jensen_shannon
-@@kl_forward
-@@kl_reverse
-@@log1p_abs
-@@modified_gan
-@@monte_carlo_csiszar_f_divergence
-@@pearson
-@@squared_hellinger
-@@symmetrized_csiszar_function
-@@total_variation
-@@triangular
-
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import framework as contrib_framework
-from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import distribution
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-def amari_alpha(logu, alpha=1., self_normalized=False, name=None):
-  """The Amari-alpha Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True`, the Amari-alpha Csiszar-function is:
-
-  ```none
-  f(u) = { -log(u) + (u - 1),     alpha = 0
-         { u log(u) - (u - 1),    alpha = 1
-         { [(u**alpha - 1) - alpha (u - 1)] / (alpha (alpha - 1)),    otherwise
-  ```
-
-  When `self_normalized = False` the `(u - 1)` terms are omitted.
-
-  Warning: when `alpha != 0` and/or `self_normalized = True` this function makes
-  non-log-space calculations and may therefore be numerically unstable for
-  `|logu| >> 0`.
-
-  For more information, see:
-    A. Cichocki and S. Amari. "Families of Alpha-Beta-and GammaDivergences:
-    Flexible and Robust Measures of Similarities." Entropy, vol. 12, no. 6, pp.
-    1532-1568, 2010.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    alpha: `float`-like Python scalar. (See Mathematical Details for meaning.)
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
-      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
-      when `p, q` are unnormalized measures.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    amari_alpha_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-
-  Raises:
-    TypeError: if `alpha` is `None` or a `Tensor`.
-    TypeError: if `self_normalized` is `None` or a `Tensor`.
-  """
-  with ops.name_scope(name, "amari_alpha", [logu]):
-    if alpha is None or contrib_framework.is_tensor(alpha):
-      raise TypeError("`alpha` cannot be `None` or `Tensor` type.")
-    if self_normalized is None or contrib_framework.is_tensor(self_normalized):
-      raise TypeError("`self_normalized` cannot be `None` or `Tensor` type.")
-
-    logu = ops.convert_to_tensor(logu, name="logu")
-
-    if alpha == 0.:
-      f = -logu
-    elif alpha == 1.:
-      f = math_ops.exp(logu) * logu
-    else:
-      f = math_ops.expm1(alpha * logu) / (alpha * (alpha - 1.))
-
-    if not self_normalized:
-      return f
-
-    if alpha == 0.:
-      return f + math_ops.expm1(logu)
-    elif alpha == 1.:
-      return f - math_ops.expm1(logu)
-    else:
-      return f - math_ops.expm1(logu) / (alpha - 1.)
-
-
-def kl_reverse(logu, self_normalized=False, name=None):
-  """The reverse Kullback-Leibler Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True`, the KL-reverse Csiszar-function is:
-
-  ```none
-  f(u) = -log(u) + (u - 1)
-  ```
-
-  When `self_normalized = False` the `(u - 1)` term is omitted.
-
-  Observe that as an f-Divergence, this Csiszar-function implies:
-
-  ```none
-  D_f[p, q] = KL[q, p]
-  ```
-
-  The KL is "reverse" because in maximum likelihood we think of minimizing `q`
-  as in `KL[p, q]`.
-
-  Warning: when self_normalized = True` this function makes non-log-space
-  calculations and may therefore be numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
-      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
-      when `p, q` are unnormalized measures.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    kl_reverse_of_u: `float`-like `Tensor` of the Csiszar-function evaluated at
-      `u = exp(logu)`.
-
-  Raises:
-    TypeError: if `self_normalized` is `None` or a `Tensor`.
-  """
-
-  with ops.name_scope(name, "kl_reverse", [logu]):
-    return amari_alpha(logu, alpha=0., self_normalized=self_normalized)
-
-
-def kl_forward(logu, self_normalized=False, name=None):
-  """The forward Kullback-Leibler Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True`, the KL-forward Csiszar-function is:
-
-  ```none
-  f(u) = u log(u) - (u - 1)
-  ```
-
-  When `self_normalized = False` the `(u - 1)` term is omitted.
-
-  Observe that as an f-Divergence, this Csiszar-function implies:
-
-  ```none
-  D_f[p, q] = KL[p, q]
-  ```
-
-  The KL is "forward" because in maximum likelihood we think of minimizing `q`
-  as in `KL[p, q]`.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
-      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
-      when `p, q` are unnormalized measures.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    kl_forward_of_u: `float`-like `Tensor` of the Csiszar-function evaluated at
-      `u = exp(logu)`.
-
-  Raises:
-    TypeError: if `self_normalized` is `None` or a `Tensor`.
-  """
-
-  with ops.name_scope(name, "kl_forward", [logu]):
-    return amari_alpha(logu, alpha=1., self_normalized=self_normalized)
-
-
-def jensen_shannon(logu, self_normalized=False, name=None):
-  """The Jensen-Shannon Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True`, the Jensen-Shannon Csiszar-function is:
-
-  ```none
-  f(u) = u log(u) - (1 + u) log(1 + u) + (u + 1) log(2)
-  ```
-
-  When `self_normalized = False` the `(u + 1) log(2)` term is omitted.
-
-  Observe that as an f-Divergence, this Csiszar-function implies:
-
-  ```none
-  D_f[p, q] = KL[p, m] + KL[q, m]
-  m(x) = 0.5 p(x) + 0.5 q(x)
-  ```
-
-  In a sense, this divergence is the "reverse" of the Arithmetic-Geometric
-  f-Divergence.
-
-  This Csiszar-function induces a symmetric f-Divergence, i.e.,
-  `D_f[p, q] = D_f[q, p]`.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  For more information, see:
-    Lin, J. "Divergence measures based on the Shannon entropy." IEEE Trans.
-    Inf. Th., 37, 145-151, 1991.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
-      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
-      when `p, q` are unnormalized measures.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    jensen_shannon_of_u: `float`-like `Tensor` of the Csiszar-function
-      evaluated at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "jensen_shannon", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    npdt = logu.dtype.as_numpy_dtype
-    y = nn_ops.softplus(logu)
-    if self_normalized:
-      y -= np.log(2).astype(npdt)
-    return math_ops.exp(logu) * logu - (1. + math_ops.exp(logu)) * y
-
-
-def arithmetic_geometric(logu, self_normalized=False, name=None):
-  """The Arithmetic-Geometric Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True` the Arithmetic-Geometric Csiszar-function is:
-
-  ```none
-  f(u) = (1 + u) log( (1 + u) / sqrt(u) ) - (1 + u) log(2)
-  ```
-
-  When `self_normalized = False` the `(1 + u) log(2)` term is omitted.
-
-  Observe that as an f-Divergence, this Csiszar-function implies:
-
-  ```none
-  D_f[p, q] = KL[m, p] + KL[m, q]
-  m(x) = 0.5 p(x) + 0.5 q(x)
-  ```
-
-  In a sense, this divergence is the "reverse" of the Jensen-Shannon
-  f-Divergence.
-
-  This Csiszar-function induces a symmetric f-Divergence, i.e.,
-  `D_f[p, q] = D_f[q, p]`.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
-      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
-      when `p, q` are unnormalized measures.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    arithmetic_geometric_of_u: `float`-like `Tensor` of the
-      Csiszar-function evaluated at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "arithmetic_geometric", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    y = nn_ops.softplus(logu) - 0.5 * logu
-    if self_normalized:
-      y -= np.log(2.).astype(logu.dtype.as_numpy_dtype)
-    return (1. + math_ops.exp(logu)) * y
-
-
-def total_variation(logu, name=None):
-  """The Total Variation Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Total-Variation Csiszar-function is:
-
-  ```none
-  f(u) = 0.5 |u - 1|
-  ```
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    total_variation_of_u: `float`-like `Tensor` of the Csiszar-function
-      evaluated at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "total_variation", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return 0.5 * math_ops.abs(math_ops.expm1(logu))
-
-
-def pearson(logu, name=None):
-  """The Pearson Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Pearson Csiszar-function is:
-
-  ```none
-  f(u) = (u - 1)**2
-  ```
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    pearson_of_u: `float`-like `Tensor` of the Csiszar-function evaluated at
-      `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "pearson", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return math_ops.square(math_ops.expm1(logu))
-
-
-def squared_hellinger(logu, name=None):
-  """The Squared-Hellinger Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Squared-Hellinger Csiszar-function is:
-
-  ```none
-  f(u) = (sqrt(u) - 1)**2
-  ```
-
-  This Csiszar-function induces a symmetric f-Divergence, i.e.,
-  `D_f[p, q] = D_f[q, p]`.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    squared_hellinger_of_u: `float`-like `Tensor` of the Csiszar-function
-      evaluated at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "squared_hellinger", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return pearson(0.5 * logu)
-
-
-def triangular(logu, name=None):
-  """The Triangular Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Triangular Csiszar-function is:
-
-  ```none
-  f(u) = (u - 1)**2 / (1 + u)
-  ```
-
-  This Csiszar-function induces a symmetric f-Divergence, i.e.,
-  `D_f[p, q] = D_f[q, p]`.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    triangular_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "triangular", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return pearson(logu) / (1. + math_ops.exp(logu))
-
-
-def t_power(logu, t, self_normalized=False, name=None):
-  """The T-Power Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True` the T-Power Csiszar-function is:
-
-  ```none
-  f(u) = s [ u**t - 1 - t(u - 1) ]
-  s = { -1   0 < t < 1
-      { +1   otherwise
-  ```
-
-  When `self_normalized = False` the `- t(u - 1)` term is omitted.
-
-  This is similar to the `amari_alpha` Csiszar-function, with the associated
-  divergence being the same up to factors depending only on `t`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    t:  `Tensor` of same `dtype` as `logu` and broadcastable shape.
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    t_power_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-  """
-  with ops.name_scope(name, "t_power", [logu, t]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    t = ops.convert_to_tensor(t, dtype=logu.dtype.base_dtype, name="t")
-    fu = math_ops.expm1(t * logu)
-    if self_normalized:
-      fu -= t * math_ops.expm1(logu)
-    fu *= array_ops.where(math_ops.logical_and(0. < t, t < 1.),
-                          -array_ops.ones_like(t),
-                          array_ops.ones_like(t))
-    return fu
-
-
-def log1p_abs(logu, name=None):
-  """The log1p-abs Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Log1p-Abs Csiszar-function is:
-
-  ```none
-  f(u) = u**(sign(u-1)) - 1
-  ```
-
-  This function is so-named because it was invented from the following recipe.
-  Choose a convex function g such that g(0)=0 and solve for f:
-
-  ```none
-  log(1 + f(u)) = g(log(u)).
-    <=>
-  f(u) = exp(g(log(u))) - 1
-  ```
-
-  That is, the graph is identically `g` when y-axis is `log1p`-domain and x-axis
-  is `log`-domain.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    log1p_abs_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "log1p_abs", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return math_ops.expm1(math_ops.abs(logu))
-
-
-def jeffreys(logu, name=None):
-  """The Jeffreys Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Jeffreys Csiszar-function is:
-
-  ```none
-  f(u) = 0.5 ( u log(u) - log(u) )
-       = 0.5 kl_forward + 0.5 kl_reverse
-       = symmetrized_csiszar_function(kl_reverse)
-       = symmetrized_csiszar_function(kl_forward)
-  ```
-
-  This Csiszar-function induces a symmetric f-Divergence, i.e.,
-  `D_f[p, q] = D_f[q, p]`.
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    jeffreys_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "jeffreys", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return 0.5 * math_ops.expm1(logu) * logu
-
-
-def chi_square(logu, name=None):
-  """The chi-Square Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Chi-square Csiszar-function is:
-
-  ```none
-  f(u) = u**2 - 1
-  ```
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    chi_square_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "chi_square", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return math_ops.expm1(2. * logu)
-
-
-def modified_gan(logu, self_normalized=False, name=None):
-  """The Modified-GAN Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  When `self_normalized = True` the modified-GAN (Generative/Adversarial
-  Network) Csiszar-function is:
-
-  ```none
-  f(u) = log(1 + u) - log(u) + 0.5 (u - 1)
-  ```
-
-  When `self_normalized = False` the `0.5 (u - 1)` is omitted.
-
-  The unmodified GAN Csiszar-function is identical to Jensen-Shannon (with
-  `self_normalized = False`).
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
-      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
-      when `p, q` are unnormalized measures.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    chi_square_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
-      at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "chi_square", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    y = nn_ops.softplus(logu) - logu
-    if self_normalized:
-      y += 0.5 * math_ops.expm1(logu)
-    return y
-
-
-def dual_csiszar_function(logu, csiszar_function, name=None):
-  """Calculates the dual Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Csiszar-dual is defined as:
-
-  ```none
-  f^*(u) = u f(1 / u)
-  ```
-
-  where `f` is some other Csiszar-function.
-
-  For example, the dual of `kl_reverse` is `kl_forward`, i.e.,
-
-  ```none
-  f(u) = -log(u)
-  f^*(u) = u f(1 / u) = -u log(1 / u) = u log(u)
-  ```
-
-  The dual of the dual is the original function:
-
-  ```none
-  f^**(u) = {u f(1/u)}^*(u) = u (1/u) f(1/(1/u)) = f(u)
-  ```
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    csiszar_function: Python `callable` representing a Csiszar-function over
-      log-domain.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    dual_f_of_u: `float`-like `Tensor` of the result of calculating the dual of
-      `f` at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "dual_csiszar_function", [logu]):
-    return math_ops.exp(logu) * csiszar_function(-logu)
-
-
-def symmetrized_csiszar_function(logu, csiszar_function, name=None):
-  """Symmetrizes a Csiszar-function in log-space.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The symmetrized Csiszar-function is defined as:
-
-  ```none
-  f_g(u) = 0.5 g(u) + 0.5 u g (1 / u)
-  ```
-
-  where `g` is some other Csiszar-function.
-
-  We say the function is "symmetrized" because:
-
-  ```none
-  D_{f_g}[p, q] = D_{f_g}[q, p]
-  ```
-
-  for all `p << >> q` (i.e., `support(p) = support(q)`).
-
-  There exists alternatives for symmetrizing a Csiszar-function. For example,
-
-  ```none
-  f_g(u) = max(f(u), f^*(u)),
-  ```
-
-  where `f^*` is the dual Csiszar-function, also implies a symmetric
-  f-Divergence.
-
-  Example:
-
-  When either of the following functions are symmetrized, we obtain the
-  Jensen-Shannon Csiszar-function, i.e.,
-
-  ```none
-  g(u) = -log(u) - (1 + u) log((1 + u) / 2) + u - 1
-  h(u) = log(4) + 2 u log(u / (1 + u))
-  ```
-
-  implies,
-
-  ```none
-  f_g(u) = f_h(u) = u log(u) - (1 + u) log((1 + u) / 2)
-         = jensen_shannon(log(u)).
-  ```
-
-  Warning: this function makes non-log-space calculations and may therefore be
-  numerically unstable for `|logu| >> 0`.
-
-  Args:
-    logu: `float`-like `Tensor` representing `log(u)` from above.
-    csiszar_function: Python `callable` representing a Csiszar-function over
-      log-domain.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    symmetrized_g_of_u: `float`-like `Tensor` of the result of applying the
-      symmetrization of `g` evaluated at `u = exp(logu)`.
-  """
-
-  with ops.name_scope(name, "symmetrized_csiszar_function", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-    return 0.5 * (csiszar_function(logu)
-                  + dual_csiszar_function(logu, csiszar_function))
-
-
-def monte_carlo_csiszar_f_divergence(
-    f,
-    p_log_prob,
-    q,
-    num_draws,
-    use_reparametrization=None,
-    seed=None,
-    name=None):
-  """Monte-Carlo approximation of the Csiszar f-Divergence.
-
-  A Csiszar-function is a member of,
-
-  ```none
-  F = { f:R_+ to R : f convex }.
-  ```
-
-  The Csiszar f-Divergence for Csiszar-function f is given by:
-
-  ```none
-  D_f[p(X), q(X)] := E_{q(X)}[ f( p(X) / q(X) ) ]
-                  ~= m**-1 sum_j^m f( p(x_j) / q(x_j) ),
-                             where x_j ~iid q(X)
-  ```
-
-  Tricks: Reparameterization and Score-Gradient
-
-  When q is "reparameterized", i.e., a diffeomorphic transformation of a
-  parameterless distribution (e.g.,
-  `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
-  expectation, i.e.,
-  `grad[Avg{ s_i : i=1...n }] = Avg{ grad[s_i] : i=1...n }` where `S_n=Avg{s_i}`
-  and `s_i = f(x_i), x_i ~iid q(X)`.
-
-  However, if q is not reparameterized, TensorFlow's gradient will be incorrect
-  since the chain-rule stops at samples of unreparameterized distributions. In
-  this circumstance using the Score-Gradient trick results in an unbiased
-  gradient, i.e.,
-
-  ```none
-  grad[ E_q[f(X)] ]
-  = grad[ int dx q(x) f(x) ]
-  = int dx grad[ q(x) f(x) ]
-  = int dx [ q'(x) f(x) + q(x) f'(x) ]
-  = int dx q(x) [q'(x) / q(x) f(x) + f'(x) ]
-  = int dx q(x) grad[ f(x) q(x) / stop_grad[q(x)] ]
-  = E_q[ grad[ f(x) q(x) / stop_grad[q(x)] ] ]
-  ```
-
-  Unless `q.reparameterization_type != distribution.FULLY_REPARAMETERIZED` it is
-  usually preferable to set `use_reparametrization = True`.
-
-  Example Application:
-
-  The Csiszar f-Divergence is a useful framework for variational inference.
-  I.e., observe that,
-
-  ```none
-  f(p(x)) =  f( E_{q(Z | x)}[ p(x, Z) / q(Z | x) ] )
-          <= E_{q(Z | x)}[ f( p(x, Z) / q(Z | x) ) ]
-          := D_f[p(x, Z), q(Z | x)]
-  ```
-
-  The inequality follows from the fact that the "perspective" of `f`, i.e.,
-  `(s, t) |-> t f(s / t))`, is convex in `(s, t)` when `s/t in domain(f)` and
-  `t` is a real. Since the above framework includes the popular Evidence Lower
-  BOund (ELBO) as a special case, i.e., `f(u) = -log(u)`, we call this framework
-  "Evidence Divergence Bound Optimization" (EDBO).
-
-  Args:
-    f: Python `callable` representing a Csiszar-function in log-space, i.e.,
-      takes `p_log_prob(q_samples) - q.log_prob(q_samples)`.
-    p_log_prob: Python `callable` taking (a batch of) samples from `q` and
-      returning the natural-log of the probability under distribution `p`.
-      (In variational inference `p` is the joint distribution.)
-    q: `tf.Distribution`-like instance; must implement:
-      `reparameterization_type`, `sample(n, seed)`, and `log_prob(x)`.
-      (In variational inference `q` is the approximate posterior distribution.)
-    num_draws: Integer scalar number of draws used to approximate the
-      f-Divergence expectation.
-    use_reparametrization: Python `bool`. When `None` (the default),
-      automatically set to:
-      `q.reparameterization_type == distribution.FULLY_REPARAMETERIZED`.
-      When `True` uses the standard Monte-Carlo average. When `False` uses the
-      score-gradient trick. (See above for details.)  When `False`, consider
-      using `csiszar_vimco`.
-    seed: Python `int` seed for `q.sample`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    monte_carlo_csiszar_f_divergence: `float`-like `Tensor` Monte Carlo
-      approximation of the Csiszar f-Divergence.
-
-  Raises:
-    ValueError: if `q` is not a reparameterized distribution and
-      `use_reparametrization = True`. A distribution `q` is said to be
-      "reparameterized" when its samples are generated by transforming the
-      samples of another distribution which does not depend on the
-      parameterization of `q`. This property ensures the gradient (with respect
-      to parameters) is valid.
-    TypeError: if `p_log_prob` is not a Python `callable`.
-  """
-  with ops.name_scope(name, "monte_carlo_csiszar_f_divergence", [num_draws]):
-    if use_reparametrization is None:
-      use_reparametrization = (q.reparameterization_type
-                               == distribution.FULLY_REPARAMETERIZED)
-    elif (use_reparametrization and
-          q.reparameterization_type != distribution.FULLY_REPARAMETERIZED):
-      # TODO(jvdillon): Consider only raising an exception if the gradient is
-      # requested.
-      raise ValueError(
-          "Distribution `q` must be reparameterized, i.e., a diffeomorphic "
-          "transformation of a parameterless distribution. (Otherwise this "
-          "function has a biased gradient.)")
-    if not callable(p_log_prob):
-      raise TypeError("`p_log_prob` must be a Python `callable` function.")
-    return monte_carlo.expectation(
-        f=lambda q_samples: f(p_log_prob(q_samples) - q.log_prob(q_samples)),
-        samples=q.sample(num_draws, seed=seed),
-        log_prob=q.log_prob,  # Only used if use_reparametrization=False.
-        use_reparametrization=use_reparametrization)
-
-
-def csiszar_vimco(f,
-                  p_log_prob,
-                  q,
-                  num_draws,
-                  num_batch_draws=1,
-                  seed=None,
-                  name=None):
-  """Use VIMCO to lower the variance of gradient[csiszar_function(Avg(logu))].
-
-  This function generalizes "Variational Inference for Monte Carlo Objectives"
-  (VIMCO), i.e., https://arxiv.org/abs/1602.06725, to Csiszar f-Divergences.
-
-  Note: if `q.reparameterization_type = distribution.FULLY_REPARAMETERIZED`,
-  consider using `monte_carlo_csiszar_f_divergence`.
-
-  The VIMCO loss is:
-
-  ```none
-  vimco = f(Avg{logu[i] : i=0,...,m-1})
-  where,
-    logu[i] = log( p(x, h[i]) / q(h[i] | x) )
-    h[i] iid~ q(H | x)
-  ```
-
-  Interestingly, the VIMCO gradient is not the naive gradient of `vimco`.
-  Rather, it is characterized by:
-
-  ```none
-  grad[vimco] - variance_reducing_term
-  where,
-    variance_reducing_term = Sum{ grad[log q(h[i] | x)] *
-                                    (vimco - f(log Avg{h[j;i] : j=0,...,m-1}))
-                                 : i=0, ..., m-1 }
-    h[j;i] = { u[j]                             j!=i
-             { GeometricAverage{ u[k] : k!=i}   j==i
-  ```
-
-  (We omitted `stop_gradient` for brevity. See implementation for more details.)
-
-  The `Avg{h[j;i] : j}` term is a kind of "swap-out average" where the `i`-th
-  element has been replaced by the leave-`i`-out Geometric-average.
-
-  This implementation prefers numerical precision over efficiency, i.e.,
-  `O(num_draws * num_batch_draws * prod(batch_shape) * prod(event_shape))`.
-  (The constant may be fairly large, perhaps around 12.)
-
-  Args:
-    f: Python `callable` representing a Csiszar-function in log-space.
-    p_log_prob: Python `callable` representing the natural-log of the
-      probability under distribution `p`. (In variational inference `p` is the
-      joint distribution.)
-    q: `tf.Distribution`-like instance; must implement: `sample(n, seed)`, and
-      `log_prob(x)`. (In variational inference `q` is the approximate posterior
-      distribution.)
-    num_draws: Integer scalar number of draws used to approximate the
-      f-Divergence expectation.
-    num_batch_draws: Integer scalar number of draws used to approximate the
-      f-Divergence expectation.
-    seed: Python `int` seed for `q.sample`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    vimco: The Csiszar f-Divergence generalized VIMCO objective.
-
-  Raises:
-    ValueError: if `num_draws < 2`.
-  """
-  with ops.name_scope(name, "csiszar_vimco", [num_draws, num_batch_draws]):
-    if num_draws < 2:
-      raise ValueError("Must specify num_draws > 1.")
-    stop = array_ops.stop_gradient  # For readability.
-    x = stop(q.sample(sample_shape=[num_draws, num_batch_draws],
-                      seed=seed))
-    logqx = q.log_prob(x)
-    logu = p_log_prob(x) - logqx
-    f_log_avg_u, f_log_sooavg_u = [f(r) for r in csiszar_vimco_helper(logu)]
-    dotprod = math_ops.reduce_sum(
-        logqx * stop(f_log_avg_u - f_log_sooavg_u),
-        axis=0)  # Sum over iid samples.
-    # We now rewrite f_log_avg_u so that:
-    #   `grad[f_log_avg_u] := grad[f_log_avg_u + dotprod]`.
-    # To achieve this, we use a trick that
-    #   `f(x) - stop(f(x)) == zeros_like(f(x))`
-    # but its gradient is grad[f(x)].
-    # Note that IEEE754 specifies that `x - x == 0.` and `x + 0. == x`, hence
-    # this trick loses no precision. For more discussion regarding the relevant
-    # portions of the IEEE754 standard, see the StackOverflow question,
-    # "Is there a floating point value of x, for which x-x == 0 is false?"
-    # http://stackoverflow.com/q/2686644
-    f_log_avg_u += dotprod - stop(dotprod)  # Add zeros_like(dot_prod).
-    return math_ops.reduce_mean(f_log_avg_u, axis=0)  # Avg over batches.
-
-
-def csiszar_vimco_helper(logu, name=None):
-  """Helper to `csiszar_vimco`; computes `log_avg_u`, `log_sooavg_u`.
-
-  `axis = 0` of `logu` is presumed to correspond to iid samples from `q`, i.e.,
-
-  ```none
-  logu[j] = log(u[j])
-  u[j] = p(x, h[j]) / q(h[j] | x)
-  h[j] iid~ q(H | x)
-  ```
-
-  Args:
-    logu: Floating-type `Tensor` representing `log(p(x, h) / q(h | x))`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    log_avg_u: `logu.dtype` `Tensor` corresponding to the natural-log of the
-      average of `u`. The sum of the gradient of `log_avg_u` is `1`.
-    log_sooavg_u: `logu.dtype` `Tensor` characterized by the natural-log of the
-      average of `u`` except that the average swaps-out `u[i]` for the
-      leave-`i`-out Geometric-average. The mean of the gradient of
-      `log_sooavg_u` is `1`. Mathematically `log_sooavg_u` is,
-      ```none
-      log_sooavg_u[i] = log(Avg{h[j ; i] : j=0, ..., m-1})
-      h[j ; i] = { u[j]                              j!=i
-                 { GeometricAverage{u[k] : k != i}   j==i
-      ```
-
-  """
-  with ops.name_scope(name, "csiszar_vimco_helper", [logu]):
-    logu = ops.convert_to_tensor(logu, name="logu")
-
-    n = logu.shape.with_rank_at_least(1)[0].value
-    if n is None:
-      n = array_ops.shape(logu)[0]
-      log_n = math_ops.log(math_ops.cast(n, dtype=logu.dtype))
-      nm1 = math_ops.cast(n - 1, dtype=logu.dtype)
-    else:
-      log_n = np.log(n).astype(logu.dtype.as_numpy_dtype)
-      nm1 = np.asarray(n - 1, dtype=logu.dtype.as_numpy_dtype)
-
-    # Throughout we reduce across axis=0 since this is presumed to be iid
-    # samples.
-
-    log_max_u = math_ops.reduce_max(logu, axis=0)
-    log_sum_u_minus_log_max_u = math_ops.reduce_logsumexp(
-        logu - log_max_u, axis=0)
-
-    # log_loosum_u[i] =
-    # = logsumexp(logu[j] : j != i)
-    # = log( exp(logsumexp(logu)) - exp(logu[i]) )
-    # = log( exp(logsumexp(logu - logu[i])) exp(logu[i])  - exp(logu[i]))
-    # = logu[i] + log(exp(logsumexp(logu - logu[i])) - 1)
-    # = logu[i] + log(exp(logsumexp(logu) - logu[i]) - 1)
-    # = logu[i] + softplus_inverse(logsumexp(logu) - logu[i])
-    d = log_sum_u_minus_log_max_u + (log_max_u - logu)
-    # We use `d != 0` rather than `d > 0.` because `d < 0.` should never
-    # happens; if it does we want to complain loudly (which `softplus_inverse`
-    # will).
-    d_ok = math_ops.not_equal(d, 0.)
-    safe_d = array_ops.where(d_ok, d, array_ops.ones_like(d))
-    d_ok_result = logu + distribution_util.softplus_inverse(safe_d)
-
-    inf = np.array(np.inf, dtype=logu.dtype.as_numpy_dtype)
-
-    # When not(d_ok) and is_positive_and_largest then we manually compute the
-    # log_loosum_u. (We can efficiently do this for any one point but not all,
-    # hence we still need the above calculation.) This is good because when
-    # this condition is met, we cannot use the above calculation; its -inf.
-    # We now compute the log-leave-out-max-sum, replicate it to every
-    # point and make sure to select it only when we need to.
-    is_positive_and_largest = math_ops.logical_and(
-        logu > 0.,
-        math_ops.equal(logu, log_max_u[array_ops.newaxis, ...]))
-    log_lomsum_u = math_ops.reduce_logsumexp(
-        array_ops.where(is_positive_and_largest,
-                        array_ops.fill(array_ops.shape(logu), -inf),
-                        logu),
-        axis=0, keep_dims=True)
-    log_lomsum_u = array_ops.tile(
-        log_lomsum_u,
-        multiples=1 + array_ops.pad([n-1], [[0, array_ops.rank(logu)-1]]))
-
-    d_not_ok_result = array_ops.where(
-        is_positive_and_largest,
-        log_lomsum_u,
-        array_ops.fill(array_ops.shape(d), -inf))
-
-    log_loosum_u = array_ops.where(d_ok, d_ok_result, d_not_ok_result)
-
-    # The swap-one-out-sum ("soosum") is n different sums, each of which
-    # replaces the i-th item with the i-th-left-out average, i.e.,
-    # soo_sum_u[i] = [exp(logu) - exp(logu[i])] + exp(mean(logu[!=i]))
-    #              =  exp(log_loosum_u[i])      + exp(looavg_logu[i])
-    looavg_logu = (math_ops.reduce_sum(logu, axis=0) - logu) / nm1
-    log_soosum_u = math_ops.reduce_logsumexp(
-        array_ops.stack([log_loosum_u, looavg_logu]),
-        axis=0)
-
-    log_avg_u = log_sum_u_minus_log_max_u + log_max_u - log_n
-    log_sooavg_u = log_soosum_u - log_n
-
-    log_avg_u.set_shape(logu.shape.with_rank_at_least(1)[1:])
-    log_sooavg_u.set_shape(logu.shape)
-
-    return log_avg_u, log_sooavg_u
diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
deleted file mode 100644
index d44fe6529a7ff0da0c6747e193fdb98a272a8da3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions for specifying custom gradients.
-
-@@custom_gradient
-
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-
-__all__ = [
-    "custom_gradient",
-]
-
-
-def custom_gradient(fx, gx, x, axis=(), fx_gx_manually_stopped=False,
-                    name=None):
-  """Enables specifying a custom gradient.
-
-  This function works by clever application of `stop_gradient`. I.e., observe
-  that:
-
-  ```none
-  h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x))
-  ```
-
-  is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] =
-  stop_gradient(g(x)).`
-
-  In addition to scalar-domain/scalar-range functions, this function also
-  supports tensor-domain/scalar-range functions. However, in the latter case it
-  is necessary to reduce `x` to a scalar. This can be done by indicating the
-  `axis` over which `f` operates or by appropriately `reduce_sum`-ing `x`, prior
-  to calling this function.
-
-  Partial Custom Gradient:
-
-  Suppose `h(x) = htilde(x, y)`. Note that `dh/dx = stop(g(x))` but `dh/dy =
-  None`. This is because a `Tensor` cannot have only a portion of its gradient
-  stopped. To circumvent this issue, one must manually `stop_gradient` the
-  relevant portions of `f`, `g`. For example see the unit-test,
-  `test_works_correctly_fx_gx_manually_stopped`.
-
-  Args:
-    fx: `Tensor`. Output of function evaluated at `x`.
-    gx: `Tensor`. Gradient of function evaluated at `x`.
-    x: `Tensor`. Point of evaluation for `f, g`.
-    axis: 1D `int` `Tensor` representing dimensions of `x` which are the domain
-      of `f`. If `()` (the default), `f` is assumed scalar-domain/scalar-range.
-      If `None` `f` is assumed to render one scalar given all of `x`. Otherwise
-      `f` is assumed to output one scalar for each of `axis` dimensions of `x`.
-    fx_gx_manually_stopped: Python `bool` indicating that `fx`, `gx` manually
-      have `stop_gradient` applied.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    fx: Floating-type `Tensor` equal to `f(x)` but which has gradient
-      `stop_gradient(g(x))`.
-  """
-  with ops.name_scope(name, "custom_gradient", [fx, gx, x]):
-    fx = ops.convert_to_tensor(fx, name="fx")
-    # We don't want to bother eagerly computing `gx` since we may not even need
-    # it.
-    with ops.control_dependencies([fx]):
-      gx = ops.convert_to_tensor(gx, dtype=fx.dtype, name="gx")
-      gx = array_ops.identity(gx, name="gx")
-    # Proof of correctness:
-    #
-    #  f(x) = x * stop[gx] + stop[fx - x * gx]
-    #       = stop[fx]
-    #
-    #  g(x) = grad[fx]
-    #       = stop[gx] + grad[stop[fx - x * gx]]
-    #       = stop[gx] + 0
-    #
-    # Notice that when x is zero it still works:
-    # grad[x * stop(gx) + stop(fx - x * gx)] = 1 * stop[gx] + 0 = stop[gx]
-    #
-    # The proof is similar for the tensor-domain case, except that `x` is
-    # replaced by `reduce_sum(x)`.
-    sum_x = math_ops.reduce_sum(x, axis=axis, name="sum_x")
-    if not fx_gx_manually_stopped:
-      fx = array_ops.stop_gradient(fx)
-      gx = array_ops.stop_gradient(gx)
-    # IEEE754 ensures `(x-x)==0.` and that `0.*x==0.` so we make sure to write
-    # the code this way, rather than, e.g.,
-    # `sum_x * stop(gx) + stop(fx - sum_x * gx)`.
-    # For more discussion regarding the relevant portions of the IEEE754
-    # standard, see the StackOverflow question,
-    # "Is there a floating point value of x, for which x-x == 0 is false?"
-    # http://stackoverflow.com/q/2686644
-    return (sum_x - array_ops.stop_gradient(sum_x)) * gx + fx
diff --git a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
deleted file mode 100644
index 8cabf18903b5f15002470acdfb8fdd3ec31a7413..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Quasi Monte Carlo support: Halton sequence.
-
-@@sample
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-
-
-__all__ = [
-    'sample',
-]
-
-
-# The maximum dimension we support. This is limited by the number of primes
-# in the _PRIMES array.
-_MAX_DIMENSION = 1000
-
-
-def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
-  r"""Returns a sample from the `m` dimensional Halton sequence.
-
-  Warning: The sequence elements take values only between 0 and 1. Care must be
-  taken to appropriately transform the domain of a function if it differs from
-  the unit cube before evaluating integrals using Halton samples. It is also
-  important to remember that quasi-random numbers are not a replacement for
-  pseudo-random numbers in every context. Quasi random numbers are completely
-  deterministic and typically have significant negative autocorrelation (unless
-  randomized).
-
-  Computes the members of the low discrepancy Halton sequence in dimension
-  `dim`. The d-dimensional sequence takes values in the unit hypercube in d
-  dimensions. Currently, only dimensions up to 1000 are supported. The prime
-  base for the `k`-th axes is the k-th prime starting from 2. For example,
-  if dim = 3, then the bases will be [2, 3, 5] respectively and the first
-  element of the sequence will be: [0.5, 0.333, 0.2]. For a more complete
-  description of the Halton sequences see:
-  https://en.wikipedia.org/wiki/Halton_sequence. For low discrepancy sequences
-  and their applications see:
-  https://en.wikipedia.org/wiki/Low-discrepancy_sequence.
-
-  The user must supply either `num_samples` or `sample_indices` but not both.
-  The former is the number of samples to produce starting from the first
-  element. If `sample_indices` is given instead, the specified elements of
-  the sequence are generated. For example, sample_indices=tf.range(10) is
-  equivalent to specifying n=10.
-
-  Example Use:
-
-  ```python
-  bf = tf.contrib.bayesflow
-
-  # Produce the first 1000 members of the Halton sequence in 3 dimensions.
-  num_samples = 1000
-  dim = 3
-  sample = bf.halton_sequence.sample(dim, num_samples=num_samples)
-
-  # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
-  # hypercube.
-  powers = tf.range(1.0, limit=dim + 1)
-  integral = tf.reduce_mean(tf.reduce_prod(sample ** powers, axis=-1))
-  true_value = 1.0 / tf.reduce_prod(powers + 1.0)
-  with tf.Session() as session:
-    values = session.run((integral, true_value))
-
-  # Produces a relative absolute error of 1.7%.
-  print ("Estimated: %f, True Value: %f" % values)
-
-  # Now skip the first 1000 samples and recompute the integral with the next
-  # thousand samples. The sample_indices argument can be used to do this.
-
-
-  sample_indices = tf.range(start=1000, limit=1000 + num_samples,
-                            dtype=tf.int32)
-  sample_leaped = halton.sample(dim, sample_indices=sample_indices)
-
-  integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
-                                                  axis=-1))
-  with tf.Session() as session:
-    values = session.run((integral_leaped, true_value))
-  # Now produces a relative absolute error of 0.05%.
-  print ("Leaped Estimated: %f, True Value: %f" % values)
-  ```
-
-  Args:
-    dim: Positive Python `int` representing each sample's `event_size.` Must
-      not be greater than 1000.
-    num_samples: (Optional) positive Python `int`. The number of samples to
-      generate. Either this parameter or sample_indices must be specified but
-      not both. If this parameter is None, then the behaviour is determined by
-      the `sample_indices`.
-    sample_indices: (Optional) `Tensor` of dtype int32 and rank 1. The elements
-      of the sequence to compute specified by their position in the sequence.
-      The entries index into the Halton sequence starting with 0 and hence,
-      must be whole numbers. For example, sample_indices=[0, 5, 6] will produce
-      the first, sixth and seventh elements of the sequence. If this parameter
-      is None, then the `num_samples` parameter must be specified which gives
-      the number of desired samples starting from the first sample.
-    dtype: (Optional) The dtype of the sample. One of `float32` or `float64`.
-      Default is `float32`.
-    name:  (Optional) Python `str` describing ops managed by this function. If
-    not supplied the name of this function is used.
-
-  Returns:
-    halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
-    and `shape` `[num_samples, dim]` if `num_samples` was specified or shape
-    `[s, dim]` where s is the size of `sample_indices` if `sample_indices`
-    were specified.
-
-  Raises:
-    ValueError: if both `sample_indices` and `num_samples` were specified or
-    if dimension `dim` is less than 1 or greater than 1000.
-  """
-  if dim < 1 or dim > _MAX_DIMENSION:
-    raise ValueError(
-        'Dimension must be between 1 and {}. Supplied {}'.format(_MAX_DIMENSION,
-                                                                 dim))
-  if (num_samples is None) == (sample_indices is None):
-    raise ValueError('Either `num_samples` or `sample_indices` must be'
-                     ' specified but not both.')
-
-  dtype = dtype or dtypes.float32
-  if not dtype.is_floating:
-    raise ValueError('dtype must be of `float`-type')
-
-  with ops.name_scope(name, 'sample', values=[sample_indices]):
-    # Here and in the following, the shape layout is as follows:
-    # [sample dimension, event dimension, coefficient dimension].
-    # The coefficient dimension is an intermediate axes which will hold the
-    # weights of the starting integer when expressed in the (prime) base for
-    # an event dimension.
-    indices = _get_indices(num_samples, sample_indices, dtype)
-    radixes = array_ops.constant(_PRIMES[0:dim], dtype=dtype, shape=[dim, 1])
-
-    max_sizes_by_axes = _base_expansion_size(math_ops.reduce_max(indices),
-                                             radixes)
-
-    max_size = math_ops.reduce_max(max_sizes_by_axes)
-
-    # The powers of the radixes that we will need. Note that there is a bit
-    # of an excess here. Suppose we need the place value coefficients of 7
-    # in base 2 and 3. For 2, we will have 3 digits but we only need 2 digits
-    # for base 3. However, we can only create rectangular tensors so we
-    # store both expansions in a [2, 3] tensor. This leads to the problem that
-    # we might end up attempting to raise large numbers to large powers. For
-    # example, base 2 expansion of 1024 has 10 digits. If we were in 10
-    # dimensions, then the 10th prime (29) we will end up computing 29^10 even
-    # though we don't need it. We avoid this by setting the exponents for each
-    # axes to 0 beyond the maximum value needed for that dimension.
-    exponents_by_axes = array_ops.tile([math_ops.range(max_size)], [dim, 1])
-    weight_mask = exponents_by_axes > max_sizes_by_axes
-    capped_exponents = array_ops.where(
-        weight_mask, array_ops.zeros_like(exponents_by_axes), exponents_by_axes)
-    weights = radixes ** capped_exponents
-    coeffs = math_ops.floor_div(indices, weights)
-    coeffs *= 1 - math_ops.cast(weight_mask, dtype)
-    coeffs = (coeffs % radixes) / radixes
-    return math_ops.reduce_sum(coeffs / weights, axis=-1)
-
-
-def _get_indices(n, sample_indices, dtype, name=None):
-  """Generates starting points for the Halton sequence procedure.
-
-  The k'th element of the sequence is generated starting from a positive integer
-  which must be distinct for each `k`. It is conventional to choose the starting
-  point as `k` itself (or `k+1` if k is zero based). This function generates
-  the starting integers for the required elements and reshapes the result for
-  later use.
-
-  Args:
-    n: Positive `int`. The number of samples to generate. If this
-      parameter is supplied, then `sample_indices` should be None.
-    sample_indices: `Tensor` of dtype int32 and rank 1. The entries
-      index into the Halton sequence starting with 0 and hence, must be whole
-      numbers. For example, sample_indices=[0, 5, 6] will produce the first,
-      sixth and seventh elements of the sequence. If this parameter is not None
-      then `n` must be None.
-    dtype: The dtype of the sample. One of `float32` or `float64`.
-      Default is `float32`.
-    name: Python `str` name which describes ops created by this function.
-
-  Returns:
-    indices: `Tensor` of dtype `dtype` and shape = `[n, 1, 1]`.
-  """
-  with ops.name_scope(name, 'get_indices', [n, sample_indices]):
-    if sample_indices is None:
-      sample_indices = math_ops.range(n, dtype=dtype)
-    else:
-      sample_indices = math_ops.cast(sample_indices, dtype)
-
-    # Shift the indices so they are 1 based.
-    indices = sample_indices + 1
-
-    # Reshape to make space for the event dimension and the place value
-    # coefficients.
-    return array_ops.reshape(indices, [-1, 1, 1])
-
-
-def _base_expansion_size(num, bases):
-  """Computes the number of terms in the place value expansion.
-
-  Let num = a0 + a1 b + a2 b^2 + ... ak b^k be the place value expansion of
-  `num` in base b (ak <> 0). This function computes and returns `k` for each
-  base `b` specified in `bases`.
-
-  This can be inferred from the base `b` logarithm of `num` as follows:
-    $$k = Floor(log_b (num)) + 1  = Floor( log(num) / log(b)) + 1$$
-
-  Args:
-    num: Scalar `Tensor` of dtype either `float32` or `float64`. The number to
-      compute the base expansion size of.
-    bases: `Tensor` of the same dtype as num. The bases to compute the size
-      against.
-
-  Returns:
-    Tensor of same dtype and shape as `bases` containing the size of num when
-    written in that base.
-  """
-  return math_ops.floor(math_ops.log(num) / math_ops.log(bases)) + 1
-
-
-def _primes_less_than(n):
-  # Based on
-  # https://stackoverflow.com/questions/2068372/fastest-way-to-list-all-primes-below-n-in-python/3035188#3035188
-  """Returns sorted array of primes such that `2 <= prime < n`."""
-  small_primes = np.array((2, 3, 5))
-  if n <= 6:
-    return small_primes[small_primes < n]
-  sieve = np.ones(n // 3 + (n % 6 == 2), dtype=np.bool)
-  sieve[0] = False
-  m = int(n ** 0.5) // 3 + 1
-  for i in range(m):
-    if not sieve[i]:
-      continue
-    k = 3 * i + 1 | 1
-    sieve[k ** 2 // 3::2 * k] = False
-    sieve[(k ** 2 + 4 * k - 2 * k * (i & 1)) // 3::2 * k] = False
-  return np.r_[2, 3, 3 * np.nonzero(sieve)[0] + 1 | 1]
-
-_PRIMES = _primes_less_than(7919+1)
-
-assert len(_PRIMES) == _MAX_DIMENSION
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py
deleted file mode 100644
index 7fd5652c5c3e085b23c05baef6e3a42b7a42e08f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/hmc.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import *  # pylint: disable=wildcard-import,unused-wildcard-import,g-importing-member
-from tensorflow.python.util import all_util
-
-_allowed_symbols = [
-    "sample_chain",
-    "sample_annealed_importance_chain",
-    "kernel",
-]
-
-all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
deleted file mode 100644
index f724910c59315867a42a56fab3deb36f5d3adb7a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ /dev/null
@@ -1,1185 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
-
-@@sample_chain
-@@sample_annealed_importance_chain
-@@kernel
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import util as distributions_util
-
-__all__ = [
-    "sample_chain",
-    "sample_annealed_importance_chain",
-    "kernel",
-]
-
-
-KernelResults = collections.namedtuple(
-    "KernelResults",
-    [
-        "acceptance_probs",
-        "current_grads_target_log_prob",  # "Current result" means "accepted".
-        "current_target_log_prob",  # "Current result" means "accepted".
-        "energy_change",
-        "is_accepted",
-        "proposed_grads_target_log_prob",
-        "proposed_state",
-        "proposed_target_log_prob",
-        "random_positive",
-    ])
-
-
-def _make_dummy_kernel_results(
-    dummy_state,
-    dummy_target_log_prob,
-    dummy_grads_target_log_prob):
-  return KernelResults(
-      acceptance_probs=dummy_target_log_prob,
-      current_grads_target_log_prob=dummy_grads_target_log_prob,
-      current_target_log_prob=dummy_target_log_prob,
-      energy_change=dummy_target_log_prob,
-      is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool),
-      proposed_grads_target_log_prob=dummy_grads_target_log_prob,
-      proposed_state=dummy_state,
-      proposed_target_log_prob=dummy_target_log_prob,
-      random_positive=dummy_target_log_prob,
-  )
-
-
-def sample_chain(
-    num_results,
-    target_log_prob_fn,
-    current_state,
-    step_size,
-    num_leapfrog_steps,
-    num_burnin_steps=0,
-    num_steps_between_results=0,
-    seed=None,
-    current_target_log_prob=None,
-    current_grads_target_log_prob=None,
-    name=None):
-  """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.
-
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm
-  that takes a series of gradient-informed steps to produce a Metropolis
-  proposal. This function samples from an HMC Markov chain at `current_state`
-  and whose stationary distribution has log-unnormalized-density
-  `target_log_prob_fn()`.
-
-  This function samples from multiple chains in parallel. It assumes that the
-  the leftmost dimensions of (each) `current_state` (part) index an independent
-  chain.  The function `target_log_prob_fn()` sums log-probabilities across
-  event dimensions (i.e., current state (part) rightmost dimensions). Each
-  element of the output of `target_log_prob_fn()` represents the (possibly
-  unnormalized) log-probability of the joint distribution over (all) the current
-  state (parts).
-
-  The `current_state` can be represented as a single `Tensor` or a `list` of
-  `Tensors` which collectively represent the current state. When specifying a
-  `list`, one must also specify a list of `step_size`s.
-
-  Note: `target_log_prob_fn` is called exactly twice.
-
-  Only one out of every `num_steps_between_samples + 1` steps is included in the
-  returned results. This "thinning" comes at a cost of reduced statistical
-  power, while reducing memory requirements and autocorrelation. For more
-  discussion see [1].
-
-  [1]: "Statistically efficient thinning of a Markov chain sampler."
-       Art B. Owen. April 2017.
-       http://statweb.stanford.edu/~owen/reports/bestthinning.pdf
-
-  #### Examples:
-
-  ##### Sample from a diagonal-variance Gaussian.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_likelihood(true_variances):
-    return tfd.MultivariateNormalDiag(
-        scale_diag=tf.sqrt(true_variances))
-
-  dims = 10
-  dtype = np.float32
-  true_variances = tf.linspace(dtype(1), dtype(3), dims)
-  likelihood = make_likelihood(true_variances)
-
-  states, kernel_results = hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=likelihood.log_prob,
-      current_state=tf.zeros(dims),
-      step_size=0.5,
-      num_leapfrog_steps=2,
-      num_burnin_steps=500)
-
-  # Compute sample stats.
-  sample_mean = tf.reduce_mean(states, axis=0)
-  sample_var = tf.reduce_mean(
-      tf.squared_difference(states, sample_mean),
-      axis=0)
-  ```
-
-  ##### Sampling from factor-analysis posteriors with known factors.
-
-  I.e.,
-
-  ```none
-  for i=1..n:
-    w[i] ~ Normal(0, eye(d))            # prior
-    x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
-  ```
-
-  where `F` denotes factors.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_prior(dims, dtype):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros(dims, dtype))
-
-  def make_likelihood(weights, factors):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))
-
-  # Setup data.
-  num_weights = 10
-  num_factors = 4
-  num_chains = 100
-  dtype = np.float32
-
-  prior = make_prior(num_weights, dtype)
-  weights = prior.sample(num_chains)
-  factors = np.random.randn(num_factors, num_weights).astype(dtype)
-  x = make_likelihood(weights, factors).sample(num_chains)
-
-  def target_log_prob(w):
-    # Target joint is: `f(w) = p(w, x | factors)`.
-    return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)
-
-  # Get `num_results` samples from `num_chains` independent chains.
-  chains_states, kernels_results = hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=target_log_prob,
-      current_state=tf.zeros([num_chains, dims], dtype),
-      step_size=0.1,
-      num_leapfrog_steps=2,
-      num_burnin_steps=500)
-
-  # Compute sample stats.
-  sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
-  sample_var = tf.reduce_mean(
-      tf.squared_difference(chains_states, sample_mean),
-      axis=[0, 1])
-  ```
-
-  Args:
-    num_results: Integer number of Markov chain draws.
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    num_burnin_steps: Integer number of chain steps to take before starting to
-      collect results.
-      Default value: 0 (i.e., no burn-in).
-    num_steps_between_results: Integer number of chain steps between collecting
-      a result. Only one out of every `num_steps_between_samples + 1` steps is
-      included in the returned results. This "thinning" comes at a cost of
-      reduced statistical power, while reducing memory requirements and
-      autocorrelation. For more discussion see [1].
-      Default value: 0 (i.e., no subsampling).
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to specify
-      this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `target_log_prob` at the `current_state` and wrt
-      the `current_state`. Must have same shape as `current_state`. The only
-      reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_sample_chain").
-
-  Returns:
-    accepted_states: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state` but with a prepended `num_results`-size dimension.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-  """
-  with ops.name_scope(
-      name, "hmc_sample_chain",
-      [num_results, current_state, step_size, num_leapfrog_steps,
-       num_burnin_steps, num_steps_between_results, seed,
-       current_target_log_prob, current_grads_target_log_prob]):
-    with ops.name_scope("initialize"):
-      [
-          current_state,
-          step_size,
-          current_target_log_prob,
-          current_grads_target_log_prob,
-      ] = _prepare_args(
-          target_log_prob_fn,
-          current_state,
-          step_size,
-          current_target_log_prob,
-          current_grads_target_log_prob)
-      num_results = ops.convert_to_tensor(
-          num_results,
-          dtype=dtypes.int32,
-          name="num_results")
-      num_leapfrog_steps = ops.convert_to_tensor(
-          num_leapfrog_steps,
-          dtype=dtypes.int32,
-          name="num_leapfrog_steps")
-      num_burnin_steps = ops.convert_to_tensor(
-          num_burnin_steps,
-          dtype=dtypes.int32,
-          name="num_burnin_steps")
-      num_steps_between_results = ops.convert_to_tensor(
-          num_steps_between_results,
-          dtype=dtypes.int32,
-          name="num_steps_between_results")
-
-    def _run_chain(num_steps, current_state, kernel_results):
-      """Runs the chain(s) for `num_steps`."""
-      def _loop_body(iter_, current_state, kernel_results):
-        return [iter_ + 1] + list(kernel(
-            target_log_prob_fn,
-            current_state,
-            step_size,
-            num_leapfrog_steps,
-            seed,
-            kernel_results.current_target_log_prob,
-            kernel_results.current_grads_target_log_prob))
-      while_loop_kwargs = dict(
-          cond=lambda iter_, *args: iter_ < num_steps,
-          body=_loop_body,
-          loop_vars=[
-              np.int32(0),
-              current_state,
-              kernel_results,
-          ],
-      )
-      if seed is not None:
-        while_loop_kwargs["parallel_iterations"] = 1
-      return control_flow_ops.while_loop(
-          **while_loop_kwargs)[1:]  # Lop-off "iter_".
-
-    def _scan_body(args_list, iter_):
-      """Closure which implements `tf.scan` body."""
-      current_state, kernel_results = args_list
-      return _run_chain(
-          1 + array_ops.where(math_ops.equal(iter_, 0),
-                              num_burnin_steps,
-                              num_steps_between_results),
-          current_state,
-          kernel_results)
-
-    scan_kwargs = dict(
-        fn=_scan_body,
-        elems=math_ops.range(num_results),  # iter_: used to choose burnin.
-        initializer=[
-            current_state,
-            _make_dummy_kernel_results(
-                current_state,
-                current_target_log_prob,
-                current_grads_target_log_prob),
-        ])
-    if seed is not None:
-      scan_kwargs["parallel_iterations"] = 1
-    return functional_ops.scan(**scan_kwargs)
-
-
-def sample_annealed_importance_chain(
-    proposal_log_prob_fn,
-    num_steps,
-    target_log_prob_fn,
-    current_state,
-    step_size,
-    num_leapfrog_steps,
-    seed=None,
-    name=None):
-  """Runs annealed importance sampling (AIS) to estimate normalizing constants.
-
-  This function uses Hamiltonian Monte Carlo to sample from a series of
-  distributions that slowly interpolates between an initial "proposal"
-  distribution:
-
-  `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)`
-
-  and the target distribution:
-
-  `exp(target_log_prob_fn(x) - target_log_normalizer)`,
-
-  accumulating importance weights along the way. The product of these
-  importance weights gives an unbiased estimate of the ratio of the
-  normalizing constants of the initial distribution and the target
-  distribution:
-
-  `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`.
-
-  Note: `proposal_log_prob_fn` and `target_log_prob_fn` are called exactly three
-  times (although this may be reduced to two times, in the future).
-
-  #### Examples:
-
-  ##### Estimate the normalizing constant of a log-gamma distribution.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  # Run 100 AIS chains in parallel
-  num_chains = 100
-  dims = 20
-  dtype = np.float32
-
-  proposal = tfd.MultivatiateNormalDiag(
-     loc=tf.zeros([dims], dtype=dtype))
-
-  target = tfd.TransformedDistribution(
-    distribution=tfd.Gamma(concentration=dtype(2),
-                           rate=dtype(3)),
-    bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()),
-    event_shape=[dims])
-
-  chains_state, ais_weights, kernels_results = (
-      hmc.sample_annealed_importance_chain(
-          proposal_log_prob_fn=proposal.log_prob,
-          num_steps=1000,
-          target_log_prob_fn=target.log_prob,
-          step_size=0.2,
-          current_state=proposal.sample(num_chains),
-          num_leapfrog_steps=2))
-
-  log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights)
-                              - np.log(num_chains))
-  log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.)
-  ```
-
-  ##### Estimate marginal likelihood of a Bayesian regression model.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_prior(dims, dtype):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros(dims, dtype))
-
-  def make_likelihood(weights, x):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(weights, x, axes=[[0], [-1]]))
-
-  # Run 100 AIS chains in parallel
-  num_chains = 100
-  dims = 10
-  dtype = np.float32
-
-  # Make training data.
-  x = np.random.randn(num_chains, dims).astype(dtype)
-  true_weights = np.random.randn(dims).astype(dtype)
-  y = np.dot(x, true_weights) + np.random.randn(num_chains)
-
-  # Setup model.
-  prior = make_prior(dims, dtype)
-  def target_log_prob_fn(weights):
-    return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y)
-
-  proposal = tfd.MultivariateNormalDiag(
-      loc=tf.zeros(dims, dtype))
-
-  weight_samples, ais_weights, kernel_results = (
-      hmc.sample_annealed_importance_chain(
-        num_steps=1000,
-        proposal_log_prob_fn=proposal.log_prob,
-        target_log_prob_fn=target_log_prob_fn
-        current_state=tf.zeros([num_chains, dims], dtype),
-        step_size=0.1,
-        num_leapfrog_steps=2))
-  log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights)
-                             - np.log(num_chains))
-  ```
-
-  Args:
-    proposal_log_prob_fn: Python callable that returns the log density of the
-      initial distribution.
-    num_steps: Integer number of Markov chain updates to run. More
-      iterations means more expense, but smoother annealing between q
-      and p, which in turn means exponentially lower variance for the
-      normalizing constant estimator.
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    seed: Python integer to seed the random number generator.
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_sample_annealed_importance_chain").
-
-  Returns:
-    accepted_state: `Tensor` or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at the final iteration. Has same shape as
-      input `current_state`.
-    ais_weights: Tensor with the estimated weight(s). Has shape matching
-      `target_log_prob_fn(current_state)`.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-  """
-  def make_convex_combined_log_prob_fn(iter_):
-    def _fn(*args):
-      p = proposal_log_prob_fn(*args)
-      t = target_log_prob_fn(*args)
-      dtype = p.dtype.base_dtype
-      beta = (math_ops.cast(iter_ + 1, dtype)
-              / math_ops.cast(num_steps, dtype))
-      return (1. - beta) * p + beta * t
-    return _fn
-
-  with ops.name_scope(
-      name, "hmc_sample_annealed_importance_chain",
-      [num_steps, current_state, step_size, num_leapfrog_steps, seed]):
-    with ops.name_scope("initialize"):
-      [
-          current_state,
-          step_size,
-          current_log_prob,
-          current_grads_log_prob,
-      ] = _prepare_args(
-          make_convex_combined_log_prob_fn(iter_=0),
-          current_state,
-          step_size,
-          description="convex_combined_log_prob")
-      num_steps = ops.convert_to_tensor(
-          num_steps,
-          dtype=dtypes.int32,
-          name="num_steps")
-      num_leapfrog_steps = ops.convert_to_tensor(
-          num_leapfrog_steps,
-          dtype=dtypes.int32,
-          name="num_leapfrog_steps")
-    def _loop_body(iter_, ais_weights, current_state, kernel_results):
-      """Closure which implements `tf.while_loop` body."""
-      current_state_parts = (list(current_state)
-                             if _is_list_like(current_state)
-                             else [current_state])
-      # TODO(b/72994218): Consider refactoring things to avoid this unecessary
-      # call.
-      ais_weights += ((target_log_prob_fn(*current_state_parts)
-                       - proposal_log_prob_fn(*current_state_parts))
-                      / math_ops.cast(num_steps, ais_weights.dtype))
-      return [iter_ + 1, ais_weights] + list(kernel(
-          make_convex_combined_log_prob_fn(iter_),
-          current_state,
-          step_size,
-          num_leapfrog_steps,
-          seed,
-          kernel_results.current_target_log_prob,
-          kernel_results.current_grads_target_log_prob))
-
-    while_loop_kwargs = dict(
-        cond=lambda iter_, *args: iter_ < num_steps,
-        body=_loop_body,
-        loop_vars=[
-            np.int32(0),  # iter_
-            array_ops.zeros_like(current_log_prob),  # ais_weights
-            current_state,
-            _make_dummy_kernel_results(current_state,
-                                       current_log_prob,
-                                       current_grads_log_prob),
-        ])
-    if seed is not None:
-      while_loop_kwargs["parallel_iterations"] = 1
-
-    [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop(
-        **while_loop_kwargs)[1:]  # Lop-off "iter_".
-
-    return [current_state, ais_weights, kernel_results]
-
-
-def kernel(target_log_prob_fn,
-           current_state,
-           step_size,
-           num_leapfrog_steps,
-           seed=None,
-           current_target_log_prob=None,
-           current_grads_target_log_prob=None,
-           name=None):
-  """Runs one iteration of Hamiltonian Monte Carlo.
-
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
-  algorithm that takes a series of gradient-informed steps to produce
-  a Metropolis proposal. This function applies one step of HMC to
-  randomly update the variable `x`.
-
-  This function can update multiple chains in parallel. It assumes that all
-  leftmost dimensions of `current_state` index independent chain states (and are
-  therefore updated independently). The output of `target_log_prob_fn()` should
-  sum log-probabilities across all event dimensions. Slices along the rightmost
-  dimensions may have different target distributions; for example,
-  `current_state[0, :]` could have a different target distribution from
-  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
-  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
-
-  #### Examples:
-
-  ##### Simple chain with warm-up.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  # Tuning acceptance rates:
-  dtype = np.float32
-  target_accept_rate = 0.631
-  num_warmup_iter = 500
-  num_chain_iter = 500
-
-  x = tf.get_variable(name="x", initializer=dtype(1))
-  step_size = tf.get_variable(name="step_size", initializer=dtype(1))
-
-  target = tfd.Normal(loc=dtype(0), scale=dtype(1))
-
-  new_x, other_results = hmc.kernel(
-      target_log_prob_fn=target.log_prob,
-      current_state=x,
-      step_size=step_size,
-      num_leapfrog_steps=3)[:4]
-
-  x_update = x.assign(new_x)
-
-  step_size_update = step_size.assign_add(
-      step_size * tf.where(
-        other_results.acceptance_probs > target_accept_rate,
-        0.01, -0.01))
-
-  warmup = tf.group([x_update, step_size_update])
-
-  tf.global_variables_initializer().run()
-
-  sess.graph.finalize()  # No more graph building.
-
-  # Warm up the sampler and adapt the step size
-  for _ in xrange(num_warmup_iter):
-    sess.run(warmup)
-
-  # Collect samples without adapting step size
-  samples = np.zeros([num_chain_iter])
-  for i in xrange(num_chain_iter):
-    _, x_, target_log_prob_, grad_ = sess.run([
-        x_update,
-        x,
-        other_results.target_log_prob,
-        other_results.grads_target_log_prob])
-    samples[i] = x_
-
-  print(samples.mean(), samples.std())
-  ```
-
-  ##### Sample from more complicated posterior.
-
-  I.e.,
-
-  ```none
-    W ~ MVN(loc=0, scale=sigma * eye(dims))
-    for i=1...num_samples:
-        X[i] ~ MVN(loc=0, scale=eye(dims))
-      eps[i] ~ Normal(loc=0, scale=1)
-        Y[i] = X[i].T * W + eps[i]
-  ```
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_training_data(num_samples, dims, sigma):
-    dt = np.asarray(sigma).dtype
-    zeros = tf.zeros(dims, dtype=dt)
-    x = tfd.MultivariateNormalDiag(
-        loc=zeros).sample(num_samples, seed=1)
-    w = tfd.MultivariateNormalDiag(
-        loc=zeros,
-        scale_identity_multiplier=sigma).sample(seed=2)
-    noise = tfd.Normal(
-        loc=dt(0),
-        scale=dt(1)).sample(num_samples, seed=3)
-    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
-    return y, x, w
-
-  def make_prior(sigma, dims):
-    # p(w | sigma)
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros([dims], dtype=sigma.dtype),
-        scale_identity_multiplier=sigma)
-
-  def make_likelihood(x, w):
-    # p(y | x, w)
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(x, w, axes=[[1], [0]]))
-
-  # Setup assumptions.
-  dtype = np.float32
-  num_samples = 150
-  dims = 10
-  num_iters = int(5e3)
-
-  true_sigma = dtype(0.5)
-  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)
-
-  # Estimate of `log(true_sigma)`.
-  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
-  sigma = tf.exp(log_sigma)
-
-  # State of the Markov chain.
-  weights = tf.get_variable(
-      name="weights",
-      initializer=np.random.randn(dims).astype(dtype))
-
-  prior = make_prior(sigma, dims)
-
-  def joint_log_prob_fn(w):
-    # f(w) = log p(w, y | x)
-    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)
-
-  weights_update = weights.assign(
-      hmc.kernel(target_log_prob_fn=joint_log_prob,
-                 current_state=weights,
-                 step_size=0.1,
-                 num_leapfrog_steps=5)[0])
-
-  with tf.control_dependencies([weights_update]):
-    loss = -prior.log_prob(weights)
-
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])
-
-  sess.graph.finalize()  # No more graph building.
-
-  tf.global_variables_initializer().run()
-
-  sigma_history = np.zeros(num_iters, dtype)
-  weights_history = np.zeros([num_iters, dims], dtype)
-
-  for i in xrange(num_iters):
-    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
-    weights_history[i, :] = weights_
-    sigma_history[i] = sigma_
-
-  true_weights_ = sess.run(true_weights)
-
-  # Should converge to something close to true_sigma.
-  plt.plot(sigma_history);
-  plt.ylabel("sigma");
-  plt.xlabel("iteration");
-  ```
-
-  Args:
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to
-      specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `current_target_log_prob` at the `current_state`
-      and wrt the `current_state`. Must have same shape as `current_state`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_kernel").
-
-  Returns:
-    accepted_state: Tensor or Python list of `Tensor`s representing the state(s)
-      of the Markov chain(s) at each result step. Has same shape as
-      `current_state`.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-
-  Raises:
-    ValueError: if there isn't one `step_size` or a list with same length as
-      `current_state`.
-  """
-  with ops.name_scope(
-      name, "hmc_kernel",
-      [current_state, step_size, num_leapfrog_steps, seed,
-       current_target_log_prob, current_grads_target_log_prob]):
-    with ops.name_scope("initialize"):
-      [current_state_parts, step_sizes, current_target_log_prob,
-       current_grads_target_log_prob] = _prepare_args(
-           target_log_prob_fn, current_state, step_size,
-           current_target_log_prob, current_grads_target_log_prob,
-           maybe_expand=True)
-      independent_chain_ndims = distributions_util.prefer_static_rank(
-          current_target_log_prob)
-      current_momentums = []
-      for s in current_state_parts:
-        current_momentums.append(random_ops.random_normal(
-            shape=array_ops.shape(s),
-            dtype=s.dtype.base_dtype,
-            seed=seed))
-        seed = distributions_util.gen_new_seed(
-            seed, salt="hmc_kernel_momentums")
-
-      num_leapfrog_steps = ops.convert_to_tensor(
-          num_leapfrog_steps,
-          dtype=dtypes.int32,
-          name="num_leapfrog_steps")
-    [
-        proposed_momentums,
-        proposed_state_parts,
-        proposed_target_log_prob,
-        proposed_grads_target_log_prob,
-    ] = _leapfrog_integrator(current_momentums,
-                             target_log_prob_fn,
-                             current_state_parts,
-                             step_sizes,
-                             num_leapfrog_steps,
-                             current_target_log_prob,
-                             current_grads_target_log_prob)
-
-    energy_change = _compute_energy_change(current_target_log_prob,
-                                           current_momentums,
-                                           proposed_target_log_prob,
-                                           proposed_momentums,
-                                           independent_chain_ndims)
-
-    # u < exp(min(-energy, 0)),  where u~Uniform[0,1)
-    # ==> -log(u) >= max(e, 0)
-    # ==> -log(u) >= e
-    # (Perhaps surprisingly, we don't have a better way to obtain a random
-    # uniform from positive reals, i.e., `tf.random_uniform(minval=0,
-    # maxval=np.inf)` won't work.)
-    random_uniform = random_ops.random_uniform(
-        shape=array_ops.shape(energy_change),
-        dtype=energy_change.dtype,
-        seed=seed)
-    random_positive = -math_ops.log(random_uniform)
-    is_accepted = random_positive >= energy_change
-
-    accepted_target_log_prob = array_ops.where(is_accepted,
-                                               proposed_target_log_prob,
-                                               current_target_log_prob)
-
-    accepted_state_parts = [_choose(is_accepted,
-                                    proposed_state_part,
-                                    current_state_part,
-                                    independent_chain_ndims)
-                            for current_state_part, proposed_state_part
-                            in zip(current_state_parts, proposed_state_parts)]
-
-    accepted_grads_target_log_prob = [
-        _choose(is_accepted,
-                proposed_grad,
-                grad,
-                independent_chain_ndims)
-        for proposed_grad, grad
-        in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)]
-
-    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
-    return [
-        maybe_flatten(accepted_state_parts),
-        KernelResults(
-            acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)),
-            current_grads_target_log_prob=accepted_grads_target_log_prob,
-            current_target_log_prob=accepted_target_log_prob,
-            energy_change=energy_change,
-            is_accepted=is_accepted,
-            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
-            proposed_state=maybe_flatten(proposed_state_parts),
-            proposed_target_log_prob=proposed_target_log_prob,
-            random_positive=random_positive,
-        ),
-    ]
-
-
-def _leapfrog_integrator(current_momentums,
-                         target_log_prob_fn,
-                         current_state_parts,
-                         step_sizes,
-                         num_leapfrog_steps,
-                         current_target_log_prob=None,
-                         current_grads_target_log_prob=None,
-                         name=None):
-  """Applies `num_leapfrog_steps` of the leapfrog integrator.
-
-  Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`.
-
-  #### Examples:
-
-  ##### Simple quadratic potential.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  dims = 10
-  num_iter = int(1e3)
-  dtype = np.float32
-
-  position = tf.placeholder(np.float32)
-  momentum = tf.placeholder(np.float32)
-
-  [
-      new_momentums,
-      new_positions,
-  ] = hmc._leapfrog_integrator(
-      current_momentums=[momentum],
-      target_log_prob_fn=tfd.MultivariateNormalDiag(
-          loc=tf.zeros(dims, dtype)).log_prob,
-      current_state_parts=[position],
-      step_sizes=0.1,
-      num_leapfrog_steps=3)[:2]
-
-  sess.graph.finalize()  # No more graph building.
-
-  momentum_ = np.random.randn(dims).astype(dtype)
-  position_ = np.random.randn(dims).astype(dtype)
-
-  positions = np.zeros([num_iter, dims], dtype)
-  for i in xrange(num_iter):
-    position_, momentum_ = sess.run(
-        [new_momentums[0], new_position[0]],
-        feed_dict={position: position_, momentum: momentum_})
-    positions[i] = position_
-
-  plt.plot(positions[:, 0]);  # Sinusoidal.
-  ```
-
-  Args:
-    current_momentums: Tensor containing the value(s) of the momentum
-      variable(s) to update.
-    target_log_prob_fn: Python callable which takes an argument like
-      `*current_state_parts` and returns its (possibly unnormalized) log-density
-      under the target distribution.
-    current_state_parts: Python `list` of `Tensor`s representing the current
-      state(s) of the Markov chain(s). The first `independent_chain_ndims` of
-      the `Tensor`(s) index different chains.
-    step_sizes: Python `list` of `Tensor`s representing the step size for the
-      leapfrog integrator. Must broadcast with the shape of
-      `current_state_parts`.  Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely. When
-      possible, it's often helpful to match per-variable step sizes to the
-      standard deviations of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn(*current_state_parts)`. The only reason to specify
-      this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `target_log_prob_fn(*current_state_parts`) wrt
-      `current_state_parts`. Must have same shape as `current_state_parts`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_leapfrog_integrator").
-
-  Returns:
-    proposed_momentums: Updated value of the momentum.
-    proposed_state_parts: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state_parts`.
-    proposed_target_log_prob: `Tensor` representing the value of
-      `target_log_prob_fn` at `accepted_state`.
-    proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt
-      `accepted_state`.
-
-  Raises:
-    ValueError: if `len(momentums) != len(state_parts)`.
-    ValueError: if `len(state_parts) != len(step_sizes)`.
-    ValueError: if `len(state_parts) != len(grads_target_log_prob)`.
-    TypeError: if `not target_log_prob.dtype.is_floating`.
-  """
-  def _loop_body(step,
-                 current_momentums,
-                 current_state_parts,
-                 ignore_current_target_log_prob,  # pylint: disable=unused-argument
-                 current_grads_target_log_prob):
-    return [step + 1] + list(_leapfrog_step(current_momentums,
-                                            target_log_prob_fn,
-                                            current_state_parts,
-                                            step_sizes,
-                                            current_grads_target_log_prob))
-
-  with ops.name_scope(
-      name, "hmc_leapfrog_integrator",
-      [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps,
-       current_target_log_prob, current_grads_target_log_prob]):
-    if len(current_momentums) != len(current_state_parts):
-      raise ValueError("`momentums` must be in one-to-one correspondence "
-                       "with `state_parts`")
-    num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps,
-                                               name="num_leapfrog_steps")
-    current_target_log_prob, current_grads_target_log_prob = (
-        _maybe_call_fn_and_grads(
-            target_log_prob_fn,
-            current_state_parts,
-            current_target_log_prob,
-            current_grads_target_log_prob))
-    return control_flow_ops.while_loop(
-        cond=lambda iter_, *args: iter_ < num_leapfrog_steps,
-        body=_loop_body,
-        loop_vars=[
-            np.int32(0),  # iter_
-            current_momentums,
-            current_state_parts,
-            current_target_log_prob,
-            current_grads_target_log_prob,
-        ],
-        back_prop=False)[1:]  # Lop-off "iter_".
-
-
-def _leapfrog_step(current_momentums,
-                   target_log_prob_fn,
-                   current_state_parts,
-                   step_sizes,
-                   current_grads_target_log_prob,
-                   name=None):
-  """Applies one step of the leapfrog integrator."""
-  with ops.name_scope(
-      name, "_leapfrog_step",
-      [current_momentums, current_state_parts, step_sizes,
-       current_grads_target_log_prob]):
-    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
-                          in zip(current_momentums,
-                                 step_sizes,
-                                 current_grads_target_log_prob)]
-    proposed_state_parts = [x + ss * m for x, ss, m
-                            in zip(current_state_parts,
-                                   step_sizes,
-                                   proposed_momentums)]
-    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
-    if not proposed_target_log_prob.dtype.is_floating:
-      raise TypeError("`target_log_prob_fn` must produce a `Tensor` "
-                      "with `float` `dtype`.")
-    proposed_grads_target_log_prob = gradients_ops.gradients(
-        proposed_target_log_prob, proposed_state_parts)
-    if any(g is None for g in proposed_grads_target_log_prob):
-      raise ValueError(
-          "Encountered `None` gradient. Does your target `target_log_prob_fn` "
-          "access all `tf.Variable`s via `tf.get_variable`?\n"
-          "  current_state_parts: {}\n"
-          "  proposed_state_parts: {}\n"
-          "  proposed_grads_target_log_prob: {}".format(
-              current_state_parts,
-              proposed_state_parts,
-              proposed_grads_target_log_prob))
-    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
-                          in zip(proposed_momentums,
-                                 step_sizes,
-                                 proposed_grads_target_log_prob)]
-    return [
-        proposed_momentums,
-        proposed_state_parts,
-        proposed_target_log_prob,
-        proposed_grads_target_log_prob,
-    ]
-
-
-def _compute_energy_change(current_target_log_prob,
-                           current_momentums,
-                           proposed_target_log_prob,
-                           proposed_momentums,
-                           independent_chain_ndims,
-                           name=None):
-  """Helper to `kernel` which computes the energy change."""
-  with ops.name_scope(
-      name, "compute_energy_change",
-      ([current_target_log_prob, proposed_target_log_prob,
-        independent_chain_ndims] +
-       current_momentums + proposed_momentums)):
-    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
-    # since they're a mouthful and lets us inline more.
-    lk0, lk1 = [], []
-    for current_momentum, proposed_momentum in zip(current_momentums,
-                                                   proposed_momentums):
-      axis = math_ops.range(independent_chain_ndims,
-                            array_ops.rank(current_momentum))
-      lk0.append(_log_sum_sq(current_momentum, axis))
-      lk1.append(_log_sum_sq(proposed_momentum, axis))
-
-    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
-                                                  axis=-1)
-    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
-                                                  axis=-1)
-    lp0 = -current_target_log_prob   # log_potential
-    lp1 = -proposed_target_log_prob  # proposed_log_potential
-    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
-                        axis=-1)
-
-    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
-    # Thus we will replace such rows with infinite energy change which implies
-    # rejection. Recall that float-comparisons with NaN are always False.
-    is_sum_determinate = (
-        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
-        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
-    is_sum_determinate = array_ops.tile(
-        is_sum_determinate[..., array_ops.newaxis],
-        multiples=array_ops.concat([
-            array_ops.ones(array_ops.rank(is_sum_determinate),
-                           dtype=dtypes.int32),
-            [4],
-        ], axis=0))
-    x = array_ops.where(is_sum_determinate,
-                        x,
-                        array_ops.fill(array_ops.shape(x),
-                                       value=x.dtype.as_numpy_dtype(np.inf)))
-
-    return math_ops.reduce_sum(x, axis=-1)
-
-
-def _choose(is_accepted,
-            accepted,
-            rejected,
-            independent_chain_ndims,
-            name=None):
-  """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where."""
-  def _expand_is_accepted_like(x):
-    with ops.name_scope("_choose"):
-      expand_shape = array_ops.concat([
-          array_ops.shape(is_accepted),
-          array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)],
-                         dtype=dtypes.int32),
-      ], axis=0)
-      multiples = array_ops.concat([
-          array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32),
-          array_ops.shape(x)[independent_chain_ndims:],
-      ], axis=0)
-      m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape),
-                         multiples)
-      m.set_shape(x.shape)
-      return m
-  with ops.name_scope(name, "_choose", values=[
-      is_accepted, accepted, rejected, independent_chain_ndims]):
-    return array_ops.where(_expand_is_accepted_like(accepted),
-                           accepted,
-                           rejected)
-
-
-def _maybe_call_fn_and_grads(fn,
-                             fn_arg_list,
-                             fn_result=None,
-                             grads_fn_result=None,
-                             description="target_log_prob"):
-  """Helper which computes `fn_result` and `grads` if needed."""
-  fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list)
-                 else [fn_arg_list])
-  if fn_result is None:
-    fn_result = fn(*fn_arg_list)
-  if not fn_result.dtype.is_floating:
-    raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format(
-        description))
-  if grads_fn_result is None:
-    grads_fn_result = gradients_ops.gradients(
-        fn_result, fn_arg_list)
-  if len(fn_arg_list) != len(grads_fn_result):
-    raise ValueError("`{}` must be in one-to-one correspondence with "
-                     "`grads_{}`".format(*[description]*2))
-  if any(g is None for g in grads_fn_result):
-    raise ValueError("Encountered `None` gradient.")
-  return fn_result, grads_fn_result
-
-
-def _prepare_args(target_log_prob_fn, state, step_size,
-                  target_log_prob=None, grads_target_log_prob=None,
-                  maybe_expand=False, description="target_log_prob"):
-  """Helper which processes input args to meet list-like assumptions."""
-  state_parts = list(state) if _is_list_like(state) else [state]
-  state_parts = [ops.convert_to_tensor(s, name="state")
-                 for s in state_parts]
-  target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads(
-      target_log_prob_fn,
-      state_parts,
-      target_log_prob,
-      grads_target_log_prob,
-      description)
-  step_sizes = list(step_size) if _is_list_like(step_size) else [step_size]
-  step_sizes = [
-      ops.convert_to_tensor(
-          s, name="step_size", dtype=target_log_prob.dtype)
-      for s in step_sizes]
-  if len(step_sizes) == 1:
-    step_sizes *= len(state_parts)
-  if len(state_parts) != len(step_sizes):
-    raise ValueError("There should be exactly one `step_size` or it should "
-                     "have same length as `current_state`.")
-  maybe_flatten = lambda x: x if maybe_expand or _is_list_like(state) else x[0]
-  return [
-      maybe_flatten(state_parts),
-      maybe_flatten(step_sizes),
-      target_log_prob,
-      grads_target_log_prob,
-  ]
-
-
-def _is_list_like(x):
-  """Helper which returns `True` if input is `list`-like."""
-  return isinstance(x, (tuple, list))
-
-
-def _log_sum_sq(x, axis=None):
-  """Computes log(sum(x**2))."""
-  return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers.py b/tensorflow/contrib/bayesflow/python/ops/layers.py
deleted file mode 100644
index a742b7c1aa593d6c08bf9d8d597c99c9fc4e7aed..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/layers.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Probabilistic neural layers.
-
-See ${python/contrib.bayesflow.layers}.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.layers_conv_variational import *
-from tensorflow.contrib.bayesflow.python.ops.layers_dense_variational import *
-from tensorflow.contrib.bayesflow.python.ops.layers_util import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'Convolution1DReparameterization',
-    'Convolution2DReparameterization',
-    'Convolution3DReparameterization',
-    'Convolution1DFlipout',
-    'Convolution2DFlipout',
-    'Convolution3DFlipout',
-    'Conv1DReparameterization',
-    'Conv2DReparameterization',
-    'Conv3DReparameterization',
-    'Conv1DFlipout',
-    'Conv2DFlipout',
-    'Conv3DFlipout',
-    'convolution1d_reparameterization',
-    'convolution2d_reparameterization',
-    'convolution3d_reparameterization',
-    'convolution1d_flipout',
-    'convolution2d_flipout',
-    'convolution3d_flipout',
-    'conv1d_reparameterization',
-    'conv2d_reparameterization',
-    'conv3d_reparameterization',
-    'conv1d_flipout',
-    'conv2d_flipout',
-    'conv3d_flipout',
-    'DenseReparameterization',
-    'DenseLocalReparameterization',
-    'DenseFlipout',
-    'dense_reparameterization',
-    'dense_local_reparameterization',
-    'dense_flipout',
-    'default_loc_scale_fn',
-    'default_mean_field_normal_fn',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py b/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py
deleted file mode 100644
index 7723cfb442712626ff415f1412e3362f2392ce9f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py
+++ /dev/null
@@ -1,2943 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Convolutional variational layer classes and their functional aliases.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.bayesflow.python.ops import layers_util
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import base as layers_lib
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import standard_ops
-from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-class _ConvVariational(layers_lib.Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: A string, the name of the layer.
-
-  Properties:
-    rank: Python integer, dimensionality of convolution.
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-  """
-
-  def __init__(
-      self,
-      rank,
-      filters,
-      kernel_size,
-      strides=1,
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=1,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(_ConvVariational, self).__init__(
-        trainable=trainable,
-        name=name,
-        activity_regularizer=activity_regularizer,
-        **kwargs)
-    self.rank = rank
-    self.filters = filters
-    self.kernel_size = utils.normalize_tuple(kernel_size, rank, "kernel_size")
-    self.strides = utils.normalize_tuple(strides, rank, "strides")
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.dilation_rate = utils.normalize_tuple(
-        dilation_rate, rank, "dilation_rate")
-    self.activation = activation
-    self.input_spec = layers_lib.InputSpec(ndim=self.rank + 2)
-    self.kernel_posterior_fn = kernel_posterior_fn
-    self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
-    self.kernel_prior_fn = kernel_prior_fn
-    self.kernel_divergence_fn = kernel_divergence_fn
-    self.bias_posterior_fn = bias_posterior_fn
-    self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
-    self.bias_prior_fn = bias_prior_fn
-    self.bias_divergence_fn = bias_divergence_fn
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == "channels_first":
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError("The channel dimension of the inputs "
-                       "should be defined. Found `None`.")
-    input_dim = input_shape[channel_axis].value
-    kernel_shape = self.kernel_size + (input_dim, self.filters)
-    dtype = dtypes.as_dtype(self.dtype)
-
-    # Must have a posterior kernel.
-    self.kernel_posterior = self.kernel_posterior_fn(
-        dtype, kernel_shape, "kernel_posterior",
-        self.trainable, self.add_variable)
-
-    if self.kernel_prior_fn is None:
-      self.kernel_prior = None
-    else:
-      self.kernel_prior = self.kernel_prior_fn(
-          dtype, kernel_shape, "kernel_prior",
-          self.trainable, self.add_variable)
-    self._built_kernel_divergence = False
-
-    if self.bias_posterior_fn is None:
-      self.bias_posterior = None
-    else:
-      self.bias_posterior = self.bias_posterior_fn(
-          dtype, (self.filters,), "bias_posterior",
-          self.trainable, self.add_variable)
-
-    if self.bias_prior_fn is None:
-      self.bias_prior = None
-    else:
-      self.bias_prior = self.bias_prior_fn(
-          dtype, (self.filters,), "bias_prior",
-          self.trainable, self.add_variable)
-    self._built_bias_divergence = False
-
-    self.input_spec = layers_lib.InputSpec(ndim=self.rank + 2,
-                                           axes={channel_axis: input_dim})
-    self._convolution_op = nn_ops.Convolution(
-        input_shape,
-        filter_shape=tensor_shape.TensorShape(kernel_shape),
-        dilation_rate=self.dilation_rate,
-        strides=self.strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format,
-                                              self.rank + 2))
-
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-
-    outputs = self._apply_variational_kernel(inputs)
-    outputs = self._apply_variational_bias(outputs)
-    if self.activation is not None:
-      outputs = self.activation(outputs)
-    if not self._built_kernel_divergence:
-      kernel_posterior = self.kernel_posterior
-      kernel_prior = self.kernel_prior
-      if isinstance(self.kernel_posterior, independent_lib.Independent):
-        kernel_posterior = kernel_posterior.distribution
-      if isinstance(self.kernel_prior, independent_lib.Independent):
-        kernel_prior = kernel_prior.distribution
-      self._apply_divergence(self.kernel_divergence_fn,
-                             kernel_posterior,
-                             kernel_prior,
-                             self.kernel_posterior_tensor,
-                             name="divergence_kernel")
-      self._built_kernel_divergence = True
-    if not self._built_bias_divergence:
-      bias_posterior = self.bias_posterior
-      bias_prior = self.bias_prior
-      if isinstance(self.bias_posterior, independent_lib.Independent):
-        bias_posterior = bias_posterior.distribution
-      if isinstance(self.bias_prior, independent_lib.Independent):
-        bias_prior = bias_prior.distribution
-      self._apply_divergence(self.bias_divergence_fn,
-                             bias_posterior,
-                             bias_prior,
-                             self.bias_posterior_tensor,
-                             name="divergence_bias")
-      self._built_bias_divergence = True
-    return outputs
-
-  def _apply_variational_bias(self, inputs):
-    if self.bias_posterior is None:
-      self.bias_posterior_tensor = None
-      return inputs
-    self.bias_posterior_tensor = self.bias_posterior_tensor_fn(
-        self.bias_posterior)
-    outputs = inputs
-    if self.data_format == "channels_first":
-      if self.rank == 1:
-        # nn.bias_add does not accept a 1D input tensor.
-        bias = array_ops.reshape(self.bias_posterior_tensor,
-                                 (1, self.filters, 1))
-        outputs += bias
-      if self.rank == 2:
-        outputs = nn.bias_add(outputs,
-                              self.bias_posterior_tensor,
-                              data_format="NCHW")
-      if self.rank == 3:
-        # As of Mar 2017, direct addition is significantly slower than
-        # bias_add when computing gradients. To use bias_add, we collapse Z
-        # and Y into a single dimension to obtain a 4D input tensor.
-        outputs_shape = outputs.shape.as_list()
-        outputs_4d = array_ops.reshape(outputs,
-                                       [outputs_shape[0], outputs_shape[1],
-                                        outputs_shape[2] * outputs_shape[3],
-                                        outputs_shape[4]])
-        outputs_4d = nn.bias_add(outputs_4d,
-                                 self.bias_posterior_tensor,
-                                 data_format="NCHW")
-        outputs = array_ops.reshape(outputs_4d, outputs_shape)
-    else:
-      outputs = nn.bias_add(outputs,
-                            self.bias_posterior_tensor,
-                            data_format="NHWC")
-    return outputs
-
-  def _apply_divergence(self, divergence_fn, posterior, prior,
-                        posterior_tensor, name):
-    if (divergence_fn is None or
-        posterior is None or
-        prior is None):
-      divergence = None
-      return
-    divergence = standard_ops.identity(
-        divergence_fn(
-            posterior, prior, posterior_tensor),
-        name=name)
-    self.add_loss(divergence)
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == "channels_last":
-      space = input_shape[1:-1]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0]] + new_space +
-                                      [self.filters])
-    else:
-      space = input_shape[2:]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0], self.filters] +
-                                      new_space)
-
-
-class _ConvReparameterization(_ConvVariational):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: A string, the name of the layer.
-
-  Properties:
-    rank: Python integer, dimensionality of convolution.
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-
-  def __init__(
-      self,
-      rank,
-      filters,
-      kernel_size,
-      strides=1,
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=1,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(_ConvReparameterization, self).__init__(
-        rank=rank,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name, **kwargs)
-
-  def _apply_variational_kernel(self, inputs):
-    self.kernel_posterior_tensor = self.kernel_posterior_tensor_fn(
-        self.kernel_posterior)
-    self.kernel_posterior_affine = None
-    self.kernel_posterior_affine_tensor = None
-    outputs = self._convolution_op(inputs, self.kernel_posterior_tensor)
-    return outputs
-
-
-class Conv1DReparameterization(_ConvReparameterization):
-  """1D convolution layer (e.g. temporal convolution).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of a single integer, specifying the
-      length of the 1D convolution window.
-    strides: An integer or tuple/list of a single integer,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    dilation_rate: An integer or tuple/list of a single integer, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    name: A string, the name of the layer.
-
-  Properties:
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 128, 1])
-  net = tfp.layers.Conv1DReparameterization(64,
-                                            kernel_size=5,
-                                            padding="SAME",
-                                            activation=tf.nn.relu)(net)
-  net = tf.reshape(net, [-1, 128 * 64])
-  logits = tfp.layers.DenseReparameterization(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-
-  def __init__(
-      self,
-      filters,
-      kernel_size,
-      strides=1,
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=1,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(Conv1DReparameterization, self).__init__(
-        rank=1,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name, **kwargs)
-
-
-def conv1d_reparameterization(
-    inputs,
-    filters,
-    kernel_size,
-    strides=1,
-    padding="valid",
-    data_format="channels_last",
-    dilation_rate=1,
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    name=None,
-    reuse=None):
-  """Functional interface for 1D convolution layer (e.g. temporal convolution).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    inputs: Tensor input.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of a single integer, specifying the
-      length of the 1D convolution window.
-    strides: An integer or tuple/list of a single integer,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    dilation_rate: An integer or tuple/list of a single integer, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    name: A string, the name of the layer.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    ValueError: if eager execution is enabled.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 128, 1])
-  net = tfp.layers.conv1d_reparameterization(net,
-                                             filters=64,
-                                             kernel_size=5,
-                                             padding="SAME",
-                                             activation=tf.nn.relu)
-  net = tf.reshape(net, [-1, 128 * 64])
-  logits = tfp.layers.dense_reparameterization(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-  layer = Conv1DReparameterization(
-      filters=filters,
-      kernel_size=kernel_size,
-      strides=strides,
-      padding=padding,
-      data_format=data_format,
-      dilation_rate=dilation_rate,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class Conv2DReparameterization(_ConvReparameterization):
-  """2D convolution layer (e.g. spatial convolution over images).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 2 integers, specifying the
-      height and width of the 2D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the convolution along the height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    name: A string, the name of the layer.
-
-  Properties:
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 32, 32, 3])
-  net = tfp.layers.Conv2DReparameterization(64,
-                                            kernel_size=5,
-                                            padding="SAME",
-                                            activation=tf.nn.relu)(net)
-  net = tf.layers.MaxPooling2D(pool_size=2,
-                               strides=2,
-                               padding="SAME")(net)
-  net = tf.reshape(net, [-1, 8 * 8 * 64])
-  logits = tfp.layers.DenseReparameterization(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-
-  def __init__(
-      self,
-      filters,
-      kernel_size,
-      strides=(1, 1),
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=(1, 1),
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(Conv2DReparameterization, self).__init__(
-        rank=2,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name, **kwargs)
-
-
-def conv2d_reparameterization(
-    inputs,
-    filters,
-    kernel_size,
-    strides=(1, 1),
-    padding="valid",
-    data_format="channels_last",
-    dilation_rate=(1, 1),
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    name=None,
-    reuse=None):
-  """Functional interface for the 2D convolution layer.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    inputs: Tensor input.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 2 integers, specifying the
-      height and width of the 2D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the convolution along the height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    name: A string, the name of the layer.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    ValueError: if eager execution is enabled.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 32, 32, 3])
-  net = tfp.layers.conv2d_reparameterization(net,
-                                             filters=64,
-                                             kernel_size=5,
-                                             padding="SAME",
-                                             activation=tf.nn.relu)
-  net = tf.layers.max_pooling2d(net,
-                                pool_size=2,
-                                strides=2,
-                                padding="SAME")
-  net = tf.reshape(net, [-1, 8 * 8 * 64])
-  logits = tfp.layers.dense_reparameterization(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-  layer = Conv2DReparameterization(
-      filters=filters,
-      kernel_size=kernel_size,
-      strides=strides,
-      padding=padding,
-      data_format=data_format,
-      dilation_rate=dilation_rate,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class Conv3DReparameterization(_ConvReparameterization):
-  """3D convolution layer (e.g. spatial convolution over volumes).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 3 integers, specifying the
-      depth, height and width of the 3D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the convolution along the depth,
-      height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, channels, depth, height, width)`.
-    dilation_rate: An integer or tuple/list of 3 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    name: A string, the name of the layer.
-
-  Properties:
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 256, 32, 32, 3])
-  net = tfp.layers.Conv3DReparameterization(64,
-                                            kernel_size=5,
-                                            padding="SAME",
-                                            activation=tf.nn.relu)(net)
-  net = tf.layers.MaxPooling2D(pool_size=2,
-                               strides=2,
-                               padding="SAME")(net)
-  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
-  logits = tfp.layers.DenseReparameterization(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-
-  def __init__(
-      self,
-      filters,
-      kernel_size,
-      strides=(1, 1, 1),
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=(1, 1, 1),
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(Conv3DReparameterization, self).__init__(
-        rank=3,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name, **kwargs)
-
-
-def conv3d_reparameterization(
-    inputs,
-    filters,
-    kernel_size,
-    strides=(1, 1, 1),
-    padding="valid",
-    data_format="channels_last",
-    dilation_rate=(1, 1, 1),
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    name=None,
-    reuse=None):
-  """Functional interface for the 3D convolution layer.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the reparameterization
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    inputs: Tensor input.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 3 integers, specifying the
-      depth, height and width of the 3D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the convolution along the depth,
-      height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, channels, depth, height, width)`.
-    dilation_rate: An integer or tuple/list of 3 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    name: A string, the name of the layer.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    ValueError: if eager execution is enabled.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 256, 32, 32, 3])
-  net = tfp.layers.conv3d_reparameterization(net,
-                                             filters=64,
-                                             kernel_size=5,
-                                             padding="SAME",
-                                             activation=tf.nn.relu)
-  net = tf.layers.max_pooling2d(net,
-                                pool_size=2,
-                                strides=2,
-                                padding="SAME")
-  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
-  logits = tfp.layers.dense_reparameterization(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-  layer = Conv3DReparameterization(
-      filters=filters,
-      kernel_size=kernel_size,
-      strides=strides,
-      padding=padding,
-      data_format=data_format,
-      dilation_rate=dilation_rate,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class _ConvFlipout(_ConvVariational):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-
-  Properties:
-    rank: Python integer, dimensionality of convolution.
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-    seed: Python integer, used to create random seeds.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-
-  def __init__(
-      self,
-      rank,
-      filters,
-      kernel_size,
-      strides=1,
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=1,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      seed=None,
-      name=None,
-      **kwargs):
-    super(_ConvFlipout, self).__init__(
-        rank=rank,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name, **kwargs)
-    self.seed = seed
-
-  def _apply_variational_kernel(self, inputs):
-    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
-        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
-      raise TypeError(
-          "`{}` requires "
-          "`kernel_posterior_fn` produce an instance of "
-          "`tf.distributions.Independent(tf.distributions.Normal)` "
-          "(saw: \"{}\").".format(
-              type(self).__name__, self.kernel_posterior.name))
-    self.kernel_posterior_affine = normal_lib.Normal(
-        loc=array_ops.zeros_like(self.kernel_posterior.distribution.loc),
-        scale=self.kernel_posterior.distribution.scale)
-    self.kernel_posterior_affine_tensor = (
-        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
-    self.kernel_posterior_tensor = None
-
-    outputs = self._convolution_op(
-        inputs, self.kernel_posterior.distribution.loc)
-
-    input_shape = array_ops.shape(inputs)
-    output_shape = array_ops.shape(outputs)
-    batch_shape = array_ops.expand_dims(input_shape[0], 0)
-    channels = input_shape[-1]
-
-    sign_input = layers_util.random_sign(
-        array_ops.concat([batch_shape,
-                          array_ops.expand_dims(channels, 0)], 0),
-        dtype=inputs.dtype,
-        seed=self.seed)
-    sign_output = layers_util.random_sign(
-        array_ops.concat([batch_shape,
-                          array_ops.expand_dims(self.filters, 0)], 0),
-        dtype=inputs.dtype,
-        seed=distribution_util.gen_new_seed(
-            self.seed, salt="conv_flipout"))
-    for _ in range(self.rank):
-      sign_input = array_ops.expand_dims(sign_input, 1)  # 2D ex: (B, 1, 1, C)
-      sign_output = array_ops.expand_dims(sign_output, 1)
-
-    sign_input = array_ops.tile(  # tile for element-wise op broadcasting
-        sign_input,
-        [1] + [input_shape[i + 1] for i in range(self.rank)] + [1])
-    sign_output = array_ops.tile(
-        sign_output,
-        [1] + [output_shape[i + 1] for i in range(self.rank)] + [1])
-
-    perturbed_inputs = self._convolution_op(
-        inputs * sign_input, self.kernel_posterior_affine_tensor) * sign_output
-
-    outputs += perturbed_inputs
-    return outputs
-
-
-class Conv1DFlipout(_ConvFlipout):
-  """1D convolution layer (e.g. temporal convolution) with Flipout.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of a single integer, specifying the
-      length of the 1D convolution window.
-    strides: An integer or tuple/list of a single integer,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    dilation_rate: An integer or tuple/list of a single integer, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-
-  Properties:
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-    seed: Python integer, used to create random seeds.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 128, 1])
-  net = tfp.layers.Conv1DFlipout(64,
-                                 kernel_size=5,
-                                 padding="SAME",
-                                 activation=tf.nn.relu)(net)
-  net = tf.reshape(net, [-1, 128 * 64])
-  logits = tfp.layers.DenseFlipout(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-
-  def __init__(
-      self,
-      filters,
-      kernel_size,
-      strides=1,
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=1,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      seed=None,
-      name=None,
-      **kwargs):
-    super(Conv1DFlipout, self).__init__(
-        rank=1,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        seed=seed,
-        name=name, **kwargs)
-
-
-def conv1d_flipout(
-    inputs,
-    filters,
-    kernel_size,
-    strides=1,
-    padding="valid",
-    data_format="channels_last",
-    dilation_rate=1,
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    seed=None,
-    name=None,
-    reuse=None):
-  """Functional interface for 1D convolution layer (e.g. temporal convolution).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    inputs: Tensor input.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of a single integer, specifying the
-      length of the 1D convolution window.
-    strides: An integer or tuple/list of a single integer,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    dilation_rate: An integer or tuple/list of a single integer, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    ValueError: if eager execution is enabled.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 128, 1])
-  net = tfp.layers.conv1d_flipout(net,
-                                  filters=64,
-                                  kernel_size=5,
-                                  padding="SAME",
-                                  activation=tf.nn.relu)
-  net = tf.reshape(net, [-1, 128 * 64])
-  logits = tfp.layers.dense_flipout(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-  layer = Conv1DFlipout(
-      filters=filters,
-      kernel_size=kernel_size,
-      strides=strides,
-      padding=padding,
-      data_format=data_format,
-      dilation_rate=dilation_rate,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      seed=seed,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class Conv2DFlipout(_ConvFlipout):
-  """2D convolution layer (e.g. spatial convolution over images) with Flipout.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 2 integers, specifying the
-      height and width of the 2D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the convolution along the height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-
-  Properties:
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-    seed: Python integer, used to create random seeds.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 32, 32, 3])
-  net = tfp.layers.Conv2DFlipout(64,
-                                 kernel_size=5,
-                                 padding="SAME",
-                                 activation=tf.nn.relu)(net)
-  net = tf.layers.MaxPooling2D(pool_size=2,
-                               strides=2,
-                               padding="SAME")(net)
-  net = tf.reshape(net, [-1, 8 * 8 * 64])
-  logits = tfp.layers.DenseFlipout(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-
-  def __init__(
-      self,
-      filters,
-      kernel_size,
-      strides=(1, 1),
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=(1, 1),
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      seed=None,
-      name=None,
-      **kwargs):
-    super(Conv2DFlipout, self).__init__(
-        rank=2,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        seed=seed,
-        name=name, **kwargs)
-
-
-def conv2d_flipout(
-    inputs,
-    filters,
-    kernel_size,
-    strides=(1, 1),
-    padding="valid",
-    data_format="channels_last",
-    dilation_rate=(1, 1),
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    seed=None,
-    name=None,
-    reuse=None):
-  """Functional interface for the 2D convolution layer.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    inputs: Tensor input.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 2 integers, specifying the
-      height and width of the 2D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the convolution along the height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    ValueError: if eager execution is enabled.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 32, 32, 3])
-  net = tfp.layers.conv2d_flipout(net,
-                                  filters=64,
-                                  kernel_size=5,
-                                  padding="SAME",
-                                  activation=tf.nn.relu)
-  net = tf.layers.max_pooling2d(net,
-                                pool_size=2,
-                                strides=2,
-                                padding="SAME")
-  net = tf.reshape(net, [-1, 8 * 8 * 64])
-  logits = tfp.layers.dense_flipout(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-  layer = Conv2DFlipout(
-      filters=filters,
-      kernel_size=kernel_size,
-      strides=strides,
-      padding=padding,
-      data_format=data_format,
-      dilation_rate=dilation_rate,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      seed=seed,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class Conv3DFlipout(_ConvFlipout):
-  """3D convolution layer (e.g. spatial convolution over volumes) with Flipout.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 3 integers, specifying the
-      depth, height and width of the 3D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the convolution along the depth,
-      height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, channels, depth, height, width)`.
-    dilation_rate: An integer or tuple/list of 3 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-
-  Properties:
-    filters: Python integer, dimensionality of the output space.
-    kernel_size: Size of the convolution window.
-    strides: Stride length of convolution.
-    padding: Python string describing padding approach.
-    data_format: Python string describing input data's dimensions.
-    dilation_rate: Dilation rate for an atrous convolution.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-    seed: Python integer, used to create random seeds.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 256, 32, 32, 3])
-  net = tfp.layers.Conv3DFlipout(64,
-                                 kernel_size=5,
-                                 padding="SAME",
-                                 activation=tf.nn.relu)(net)
-  net = tf.layers.MaxPooling2D(pool_size=2,
-                               strides=2,
-                               padding="SAME")(net)
-  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
-  logits = tfp.layers.DenseFlipout(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-
-  def __init__(
-      self,
-      filters,
-      kernel_size,
-      strides=(1, 1, 1),
-      padding="valid",
-      data_format="channels_last",
-      dilation_rate=(1, 1, 1),
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      seed=None,
-      name=None,
-      **kwargs):
-    super(Conv3DFlipout, self).__init__(
-        rank=3,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        seed=seed,
-        name=name, **kwargs)
-
-
-def conv3d_flipout(
-    inputs,
-    filters,
-    kernel_size,
-    strides=(1, 1, 1),
-    padding="valid",
-    data_format="channels_last",
-    dilation_rate=(1, 1, 1),
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    seed=None,
-    name=None,
-    reuse=None):
-  """Functional interface for the 3D convolution layer.
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. It may also include a bias addition and activation function
-  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
-  distributions.
-
-  By default, the layer implements a stochastic forward pass via
-  sampling from the kernel and bias posteriors,
-  ```none
-  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
-  ```
-  where f denotes the layer's calculation. It uses the Flipout
-  estimator [1], which performs a Monte Carlo approximation of the
-  distribution integrating over the `kernel` and `bias`. Flipout uses
-  roughly twice as many floating point operations as the
-  reparameterization estimator but has the advantage of significantly
-  lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Arguments:
-    inputs: Tensor input.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of 3 integers, specifying the
-      depth, height and width of the 3D convolution window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the convolution along the depth,
-      height and width.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, channels, depth, height, width)`.
-    dilation_rate: An integer or tuple/list of 3 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Optional regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: A string, the name of the layer.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    ValueError: if eager execution is enabled.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tf.reshape(features, [-1, 256, 32, 32, 3])
-  net = tfp.layers.conv3d_flipout(net,
-                                  filters=64,
-                                  kernel_size=5,
-                                  padding="SAME",
-                                  activation=tf.nn.relu)
-  net = tf.layers.max_pooling2d(net,
-                                pool_size=2,
-                                strides=2,
-                                padding="SAME")
-  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
-  logits = tfp.layers.dense_flipout(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-  layer = Conv3DFlipout(
-      filters=filters,
-      kernel_size=kernel_size,
-      strides=strides,
-      padding=padding,
-      data_format=data_format,
-      dilation_rate=dilation_rate,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      seed=seed,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-# Aliases
-
-Convolution1DReparameterization = Conv1DReparameterization
-Convolution2DReparameterization = Conv2DReparameterization
-Convolution3DReparameterization = Conv3DReparameterization
-convolution1d_reparameterization = conv1d_reparameterization
-convolution2d_reparameterization = conv2d_reparameterization
-convolution3d_reparameterization = conv3d_reparameterization
-Convolution1DFlipout = Conv1DFlipout
-Convolution2DFlipout = Conv2DFlipout
-Convolution3DFlipout = Conv3DFlipout
-convolution1d_flipout = conv1d_flipout
-convolution2d_flipout = conv2d_flipout
-convolution3d_flipout = conv3d_flipout
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational.py b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational.py
deleted file mode 100644
index 591a8e553de0c194786c7ee8693665f762711b2d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational.py
+++ /dev/null
@@ -1,1176 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Dense Bayesian layer using KL-divergence based variational inference.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.bayesflow.python.ops import layers_util
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import base as layers_lib
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import standard_ops
-from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-class _DenseVariational(layers_lib.Layer):
-  """Abstract densely-connected class (private, used as implementation base).
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Properties:
-    units: Python integer, dimensionality of the output space.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-  """
-
-  def __init__(
-      self,
-      units,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(_DenseVariational, self).__init__(
-        trainable=trainable,
-        name=name,
-        activity_regularizer=activity_regularizer,
-        **kwargs)
-    self.units = units
-    self.activation = activation
-    self.input_spec = layers_lib.InputSpec(min_ndim=2)
-    self.kernel_posterior_fn = kernel_posterior_fn
-    self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
-    self.kernel_prior_fn = kernel_prior_fn
-    self.kernel_divergence_fn = kernel_divergence_fn
-    self.bias_posterior_fn = bias_posterior_fn
-    self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
-    self.bias_prior_fn = bias_prior_fn
-    self.bias_divergence_fn = bias_divergence_fn
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    in_size = input_shape.with_rank_at_least(2)[-1].value
-    if in_size is None:
-      raise ValueError("The last dimension of the inputs to `Dense` "
-                       "should be defined. Found `None`.")
-    self._input_spec = layers_lib.InputSpec(min_ndim=2, axes={-1: in_size})
-    dtype = dtypes.as_dtype(self.dtype)
-
-    # Must have a posterior kernel.
-    self.kernel_posterior = self.kernel_posterior_fn(
-        dtype, [in_size, self.units], "kernel_posterior",
-        self.trainable, self.add_variable)
-
-    if self.kernel_prior_fn is None:
-      self.kernel_prior = None
-    else:
-      self.kernel_prior = self.kernel_prior_fn(
-          dtype, [in_size, self.units], "kernel_prior",
-          self.trainable, self.add_variable)
-    self._built_kernel_divergence = False
-
-    if self.bias_posterior_fn is None:
-      self.bias_posterior = None
-    else:
-      self.bias_posterior = self.bias_posterior_fn(
-          dtype, [self.units], "bias_posterior",
-          self.trainable, self.add_variable)
-
-    if self.bias_prior_fn is None:
-      self.bias_prior = None
-    else:
-      self.bias_prior = self.bias_prior_fn(
-          dtype, [self.units], "bias_prior",
-          self.trainable, self.add_variable)
-    self._built_bias_divergence = False
-
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-
-    outputs = self._apply_variational_kernel(inputs)
-    outputs = self._apply_variational_bias(outputs)
-    if self.activation is not None:
-      outputs = self.activation(outputs)  # pylint: disable=not-callable
-    if not self._built_kernel_divergence:
-      kernel_posterior = self.kernel_posterior
-      kernel_prior = self.kernel_prior
-      if isinstance(self.kernel_posterior, independent_lib.Independent):
-        kernel_posterior = kernel_posterior.distribution
-      if isinstance(self.kernel_prior, independent_lib.Independent):
-        kernel_prior = kernel_prior.distribution
-      self._apply_divergence(self.kernel_divergence_fn,
-                             kernel_posterior,
-                             kernel_prior,
-                             self.kernel_posterior_tensor,
-                             name="divergence_kernel")
-      self._built_kernel_divergence = True
-    if not self._built_bias_divergence:
-      bias_posterior = self.bias_posterior
-      bias_prior = self.bias_prior
-      if isinstance(self.bias_posterior, independent_lib.Independent):
-        bias_posterior = bias_posterior.distribution
-      if isinstance(self.bias_prior, independent_lib.Independent):
-        bias_prior = bias_prior.distribution
-      self._apply_divergence(self.bias_divergence_fn,
-                             bias_posterior,
-                             bias_prior,
-                             self.bias_posterior_tensor,
-                             name="divergence_bias")
-      self._built_bias_divergence = True
-    return outputs
-
-  def _apply_variational_bias(self, inputs):
-    if self.bias_posterior is None:
-      self.bias_posterior_tensor = None
-      return inputs
-    self.bias_posterior_tensor = self.bias_posterior_tensor_fn(
-        self.bias_posterior)
-    return nn.bias_add(inputs, self.bias_posterior_tensor)
-
-  def _apply_divergence(self, divergence_fn, posterior, prior,
-                        posterior_tensor, name):
-    if (divergence_fn is None or
-        posterior is None or
-        prior is None):
-      divergence = None
-      return
-    divergence = standard_ops.identity(
-        divergence_fn(
-            posterior, prior, posterior_tensor),
-        name=name)
-    self.add_loss(divergence)
-
-  def _matmul(self, inputs, kernel):
-    if inputs.shape.ndims <= 2:
-      return standard_ops.matmul(inputs, kernel)
-    # To handle broadcasting, we must use `tensordot`.
-    return standard_ops.tensordot(inputs, kernel, axes=[[-1], [0]])
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).with_rank_at_least(2)
-    if input_shape[-1].value is None:
-      raise ValueError(
-          "The innermost dimension of input_shape must be defined, "
-          "but saw: {}".format(input_shape))
-    return input_shape[:-1].concatenate(self.units)
-
-
-class DenseReparameterization(_DenseVariational):
-  """Densely-connected layer class with reparameterization estimator.
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  It uses the reparameterization estimator [1], which performs a Monte Carlo
-  approximation of the distribution integrating over the `kernel` and
-  `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Properties:
-    units: Python integer, dimensionality of the output space.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tfp.layers.DenseReparameterization(
-      512, activation=tf.nn.relu)(features)
-  logits = tfp.layers.DenseReparameterization(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-
-  def __init__(
-      self,
-      units,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
-          is_singular=True),
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(DenseReparameterization, self).__init__(
-        units=units,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name,
-        **kwargs)
-
-  def _apply_variational_kernel(self, inputs):
-    self.kernel_posterior_tensor = self.kernel_posterior_tensor_fn(
-        self.kernel_posterior)
-    self.kernel_posterior_affine = None
-    self.kernel_posterior_affine_tensor = None
-    return self._matmul(inputs, self.kernel_posterior_tensor)
-
-
-def dense_reparameterization(
-    inputs,
-    units,
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    name=None,
-    reuse=None):
-  """Densely-connected layer with reparameterization estimator.
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  It uses the reparameterization estimator [1], which performs a Monte Carlo
-  approximation of the distribution integrating over the `kernel` and
-  `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    inputs: Tensor input.
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Returns:
-    output: `Tensor` representing a the affine transformed input under a random
-      draw from the surrogate posterior distribution.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tfp.layers.dense_reparameterization(
-      features, 512, activation=tf.nn.relu)
-  logits = tfp.layers.dense_reparameterization(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Auto-Encoding Variational Bayes."
-        Diederik P. Kingma, Max Welling.
-        International Conference on Learning Representations, 2014.
-  """
-  layer = DenseReparameterization(
-      units,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class DenseLocalReparameterization(_DenseVariational):
-  """Densely-connected layer class with local reparameterization estimator.
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  It uses the local reparameterization estimator [1], which performs a
-  Monte Carlo approximation of the distribution on the hidden units
-  induced by the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Properties:
-    units: Python integer, dimensionality of the output space.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tfp.layers.DenseLocalReparameterization(
-      512, activation=tf.nn.relu)(features)
-  logits = tfp.layers.DenseLocalReparameterization(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses local reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Variational Dropout and the Local Reparameterization Trick."
-        Diederik P. Kingma, Tim Salimans, Max Welling.
-        Neural Information Processing Systems, 2015.
-  """
-
-  def __init__(
-      self,
-      units,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
-          is_singular=True),
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(DenseLocalReparameterization, self).__init__(
-        units=units,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name,
-        **kwargs)
-
-  def _apply_variational_kernel(self, inputs):
-    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
-        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
-      raise TypeError(
-          "`DenseLocalReparameterization` requires "
-          "`kernel_posterior_fn` produce an instance of "
-          "`tf.distributions.Independent(tf.distributions.Normal)` "
-          "(saw: \"{}\").".format(self.kernel_posterior.name))
-    self.kernel_posterior_affine = normal_lib.Normal(
-        loc=self._matmul(inputs, self.kernel_posterior.distribution.loc),
-        scale=standard_ops.sqrt(self._matmul(
-            standard_ops.square(inputs),
-            standard_ops.square(self.kernel_posterior.distribution.scale))))
-    self.kernel_posterior_affine_tensor = (
-        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
-    self.kernel_posterior_tensor = None
-    return self.kernel_posterior_affine_tensor
-
-
-def dense_local_reparameterization(
-    inputs,
-    units,
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
-        is_singular=True),
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    name=None,
-    reuse=None):
-  """Densely-connected layer with local reparameterization estimator.
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  It uses the local reparameterization estimator [1], which performs a
-  Monte Carlo approximation of the distribution on the hidden units
-  induced by the `kernel` and `bias`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    inputs: Tensor input.
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Returns:
-    output: `Tensor` representing a the affine transformed input under a random
-      draw from the surrogate posterior distribution.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tfp.layers.dense_local_reparameterization(
-      features, 512, activation=tf.nn.relu)
-  logits = tfp.layers.dense_local_reparameterization(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses local reparameterization gradients to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Variational Dropout and the Local Reparameterization Trick."
-        Diederik P. Kingma, Tim Salimans, Max Welling.
-        Neural Information Processing Systems, 2015.
-  """
-  layer = DenseLocalReparameterization(
-      units,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class DenseFlipout(_DenseVariational):
-  """Densely-connected layer class with Flipout estimator.
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  It uses the Flipout estimator [1], which performs a Monte Carlo
-  approximation of the distribution integrating over the `kernel` and
-  `bias`. Flipout uses roughly twice as many floating point operations
-  as the reparameterization estimator but has the advantage of
-  significantly lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Properties:
-    units: Python integer, dimensionality of the output space.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_posterior_fn: `callable` returning posterior.
-    kernel_posterior_tensor_fn: `callable` operating on posterior.
-    kernel_prior_fn: `callable` returning prior.
-    kernel_divergence_fn: `callable` returning divergence.
-    bias_posterior_fn: `callable` returning posterior.
-    bias_posterior_tensor_fn: `callable` operating on posterior.
-    bias_prior_fn: `callable` returning prior.
-    bias_divergence_fn: `callable` returning divergence.
-    seed: Python integer, used to create random seeds.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tfp.layers.DenseFlipout(
-      512, activation=tf.nn.relu)(features)
-  logits = tfp.layers.DenseFlipout(10)(net)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-
-  def __init__(
-      self,
-      units,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
-          is_singular=True),
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      seed=None,
-      name=None,
-      **kwargs):
-    super(DenseFlipout, self).__init__(
-        units=units,
-        activation=activation,
-        activity_regularizer=activity_regularizer,
-        trainable=trainable,
-        kernel_posterior_fn=kernel_posterior_fn,
-        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-        kernel_prior_fn=kernel_prior_fn,
-        kernel_divergence_fn=kernel_divergence_fn,
-        bias_posterior_fn=bias_posterior_fn,
-        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-        bias_prior_fn=bias_prior_fn,
-        bias_divergence_fn=bias_divergence_fn,
-        name=name,
-        **kwargs)
-    self.seed = seed
-
-  def _apply_variational_kernel(self, inputs):
-    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
-        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
-      raise TypeError(
-          "`DenseFlipout` requires "
-          "`kernel_posterior_fn` produce an instance of "
-          "`tf.distributions.Independent(tf.distributions.Normal)` "
-          "(saw: \"{}\").".format(self.kernel_posterior.name))
-    self.kernel_posterior_affine = normal_lib.Normal(
-        loc=array_ops.zeros_like(self.kernel_posterior.distribution.loc),
-        scale=self.kernel_posterior.distribution.scale)
-    self.kernel_posterior_affine_tensor = (
-        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
-    self.kernel_posterior_tensor = None
-
-    input_shape = array_ops.shape(inputs)
-    batch_shape = input_shape[:-1]
-
-    sign_input = layers_util.random_sign(
-        input_shape,
-        dtype=inputs.dtype,
-        seed=self.seed)
-    sign_output = layers_util.random_sign(
-        array_ops.concat([batch_shape,
-                          array_ops.expand_dims(self.units, 0)], 0),
-        dtype=inputs.dtype,
-        seed=distribution_util.gen_new_seed(
-            self.seed, salt="dense_flipout"))
-    perturbed_inputs = self._matmul(
-        inputs * sign_input, self.kernel_posterior_affine_tensor) * sign_output
-
-    outputs = self._matmul(inputs, self.kernel_posterior.distribution.loc)
-    outputs += perturbed_inputs
-    return outputs
-
-
-def dense_flipout(
-    inputs,
-    units,
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
-        is_singular=True),
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    seed=None,
-    name=None,
-    reuse=None):
-  """Densely-connected layer with Flipout estimator.
-
-  This layer implements the Bayesian variational inference analogue to
-  a dense layer by assuming the `kernel` and/or the `bias` are drawn
-  from distributions. By default, the layer implements a stochastic
-  forward pass via sampling from the kernel and bias posteriors,
-
-  ```none
-  kernel, bias ~ posterior
-  outputs = activation(matmul(inputs, kernel) + bias)
-  ```
-
-  It uses the Flipout estimator [1], which performs a Monte Carlo
-  approximation of the distribution integrating over the `kernel` and
-  `bias`. Flipout uses roughly twice as many floating point operations
-  as the reparameterization estimator but has the advantage of
-  significantly lower variance.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  distributions.
-
-  Args:
-    inputs: Tensor input.
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    seed: Python scalar `int` which initializes the random number
-      generator. Default value: `None` (i.e., use global seed).
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Returns:
-    output: `Tensor` representing a the affine transformed input under a random
-      draw from the surrogate posterior distribution.
-
-  #### Examples
-
-  We illustrate a Bayesian neural network with [variational inference](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
-  assuming a dataset of `features` and `labels`.
-
-  ```python
-  tfp = tf.contrib.bayesflow
-
-  net = tfp.layers.dense_flipout(
-      features, 512, activation=tf.nn.relu)
-  logits = tfp.layers.dense_flipout(net, 10)
-  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits)
-  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-  loss = neg_log_likelihood + kl
-  train_op = tf.train.AdamOptimizer().minimize(loss)
-  ```
-
-  It uses the Flipout gradient estimator to minimize the
-  Kullback-Leibler divergence up to a constant, also known as the
-  negative Evidence Lower Bound. It consists of the sum of two terms:
-  the expected negative log-likelihood, which we approximate via
-  Monte Carlo; and the KL divergence, which is added via regularizer
-  terms which are arguments to the layer.
-
-  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
-        Mini-Batches."
-        Anonymous. OpenReview, 2017.
-        https://openreview.net/forum?id=rJnpifWAb
-  """
-  layer = DenseFlipout(
-      units,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      seed=seed,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_util.py b/tensorflow/contrib/bayesflow/python/ops/layers_util.py
deleted file mode 100644
index 8c1fb203f7328e8260e49b4326d813fbe133613e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/layers_util.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for probabilistic layers.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import normal as normal_lib
-
-
-def default_loc_scale_fn(
-    is_singular=False,
-    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
-    untransformed_scale_initializer=init_ops.random_normal_initializer(
-        mean=-3., stddev=0.1),
-    loc_regularizer=None,
-    untransformed_scale_regularizer=None,
-    loc_constraint=None,
-    untransformed_scale_constraint=None):
-  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.
-
-  This function produces a closure which produces `loc`, `scale` using
-  `tf.get_variable`. The closure accepts the following arguments:
-
-    dtype: Type of parameter's event.
-    shape: Python `list`-like representing the parameter's event shape.
-    name: Python `str` name prepended to any created (or existing)
-      `tf.Variable`s.
-    trainable: Python `bool` indicating all created `tf.Variable`s should be
-      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-      access existing) `tf.Variable`s.
-
-  Args:
-    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
-    loc_initializer: Initializer function for the `loc` parameters.
-      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
-    untransformed_scale_initializer: Initializer function for the `scale`
-      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
-      stddev=0.1)`. This implies the softplus transformed result has mean
-      approximately `0.05` and std. deviation approximately `0.005`.
-    loc_regularizer: Regularizer function for the `loc` parameters.
-      The default (`None`) is to use the `tf.get_variable` default.
-    untransformed_scale_regularizer: Regularizer function for the `scale`
-      parameters. The default (`None`) is to use the `tf.get_variable` default.
-    loc_constraint: An optional projection function to be applied to the
-      loc after being updated by an `Optimizer`. The function must take as input
-      the unprojected variable and must return the projected variable (which
-      must have the same shape). Constraints are not safe to use when doing
-      asynchronous distributed training.
-      The default (`None`) is to use the `tf.get_variable` default.
-    untransformed_scale_constraint: An optional projection function to be
-      applied to the `scale` parameters after being updated by an `Optimizer`
-      (e.g. used to implement norm constraints or value constraints). The
-      function must take as input the unprojected variable and must return the
-      projected variable (which must have the same shape). Constraints are not
-      safe to use when doing asynchronous distributed training. The default
-      (`None`) is to use the `tf.get_variable` default.
-
-  Returns:
-    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
-    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
-  """
-  def _fn(dtype, shape, name, trainable, add_variable_fn):
-    """Creates `loc`, `scale` parameters."""
-    loc = add_variable_fn(
-        name=name + "_loc",
-        shape=shape,
-        initializer=loc_initializer,
-        regularizer=loc_regularizer,
-        constraint=loc_constraint,
-        dtype=dtype,
-        trainable=trainable)
-    if is_singular:
-      return loc, None
-    untransformed_scale = add_variable_fn(
-        name=name + "_untransformed_scale",
-        shape=shape,
-        initializer=untransformed_scale_initializer,
-        regularizer=untransformed_scale_regularizer,
-        constraint=untransformed_scale_constraint,
-        dtype=dtype,
-        trainable=trainable)
-    scale = (np.finfo(dtype.as_numpy_dtype).eps +
-             nn_ops.softplus(untransformed_scale))
-    return loc, scale
-  return _fn
-
-
-def default_mean_field_normal_fn(
-    is_singular=False,
-    loc_initializer=None,
-    untransformed_scale_initializer=None,
-    loc_regularizer=None,
-    untransformed_scale_regularizer=None,
-    loc_constraint=None,
-    untransformed_scale_constraint=None):
-  """Creates a function to build Normal distributions with trainable params.
-
-  This function produces a closure which produces `tf.distributions.Normal`
-  parameterized by a loc` and `scale` each created using `tf.get_variable`. The
-  produced closure accepts the following arguments:
-
-    name: Python `str` name prepended to any created (or existing)
-      `tf.Variable`s.
-    shape: Python `list`-like representing the parameter's event shape.
-    dtype: Type of parameter's event.
-    trainable: Python `bool` indicating all created `tf.Variable`s should be
-      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-      access existing) `tf.Variable`s.
-
-  Args:
-    is_singular: Python `bool` if `True`, forces the special case limit of
-      `scale->0`, i.e., a `Deterministic` distribution.
-    loc_initializer: Initializer function for the `loc` parameters.
-      If `None` (default), values are initialized using the default
-      initializer used by `tf.get_variable`.
-    untransformed_scale_initializer: Initializer function for the `scale`
-      parameters. If `None` (default), values are initialized using the default
-      initializer used by `tf.get_variable`.
-    loc_regularizer: Regularizer function for the `loc` parameters.
-    untransformed_scale_regularizer: Regularizer function for the `scale`
-      parameters.
-    loc_constraint: An optional projection function to be applied to the
-      loc after being updated by an `Optimizer`. The function must take as input
-      the unprojected variable and must return the projected variable (which
-      must have the same shape). Constraints are not safe to use when doing
-      asynchronous distributed training.
-    untransformed_scale_constraint: An optional projection function to be
-      applied to the `scale` parameters after being updated by an `Optimizer`
-      (e.g. used to implement norm constraints or value constraints). The
-      function must take as input the unprojected variable and must return the
-      projected variable (which must have the same shape). Constraints are not
-      safe to use when doing asynchronous distributed training.
-
-  Returns:
-    make_normal_fn: Python `callable` which creates a `tf.distributions.Normal`
-      using from args: `dtype, shape, name, trainable, add_variable_fn`.
-  """
-  loc_scale_fn_ = default_loc_scale_fn(
-      is_singular,
-      loc_initializer,
-      untransformed_scale_initializer,
-      loc_regularizer,
-      untransformed_scale_regularizer,
-      loc_constraint,
-      untransformed_scale_constraint)
-  def _fn(dtype, shape, name, trainable, add_variable_fn):
-    """Creates multivariate `Deterministic` or `Normal` distribution."""
-    loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn)
-    if scale is None:
-      dist = deterministic_lib.Deterministic(loc=loc)
-    else:
-      dist = normal_lib.Normal(loc=loc, scale=scale)
-    reinterpreted_batch_ndims = array_ops.shape(dist.batch_shape_tensor())[0]
-    return independent_lib.Independent(
-        dist, reinterpreted_batch_ndims=reinterpreted_batch_ndims)
-  return _fn
-
-
-def random_sign(shape, dtype=dtypes.float32, seed=None):
-  """Draw values from {-1, 1} uniformly, i.e., Rademacher distribution."""
-  random_bernoulli = random_ops.random_uniform(shape, minval=0, maxval=2,
-                                               dtype=dtypes.int32,
-                                               seed=seed)
-  return math_ops.cast(2 * random_bernoulli - 1, dtype)
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
deleted file mode 100644
index 0424b6952bc89ce7fe5b00b0135c9a5fe1faa8cf..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
+++ /dev/null
@@ -1,400 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for Markov Chain Monte Carlo (MCMC) sampling.
-
-@@effective_sample_size
-@@potential_scale_reduction
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops import sample_stats
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-
-__all__ = [
-    "effective_sample_size",
-    "potential_scale_reduction",
-]
-
-
-def effective_sample_size(states,
-                          filter_threshold=0.,
-                          filter_beyond_lag=None,
-                          name=None):
-  """Estimate a lower bound on effective sample size for each independent chain.
-
-  Roughly speaking, "effective sample size" (ESS) is the size of an iid sample
-  with the same variance as `state`.
-
-  More precisely, given a stationary sequence of possibly correlated random
-  variables `X_1, X_2,...,X_N`, each identically distributed ESS is the number
-  such that
-
-  ```Variance{ N**-1 * Sum{X_i} } = ESS**-1 * Variance{ X_1 }.```
-
-  If the sequence is uncorrelated, `ESS = N`.  In general, one should expect
-  `ESS <= N`, with more highly correlated sequences having smaller `ESS`.
-
-  #### Example of using ESS to estimate standard error.
-
-  ```
-  tfd = tf.contrib.distributions
-  tfb = tf.contrib.bayesflow
-
-  target = tfd.MultivariateNormalDiag(scale_diag=[1., 2.])
-
-  # Get 1000 states from one chain.
-  states = tfb.hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=target.log_prob,
-      current_state=tf.constant([0., 0.]),
-      step_size=0.05,
-      num_leapfrog_steps=20,
-      num_burnin_steps=200)
-  states.shape
-  ==> (1000, 2)
-
-  ess = effective_sample_size(states)
-  ==> Shape (2,) Tensor
-
-  mean, variance = tf.nn.moments(states, axis=0)
-  standard_error = tf.sqrt(variance / ess)
-  ```
-
-  Some math shows that, with `R_k` the auto-correlation sequence,
-  `R_k := Covariance{X_1, X_{1+k}} / Variance{X_1}`, we have
-
-  ```ESS(N) =  N / [ 1 + 2 * ( (N - 1) / N * R_1 + ... + 1 / N * R_{N-1}  ) ]```
-
-  This function estimates the above by first estimating the auto-correlation.
-  Since `R_k` must be estimated using only `N - k` samples, it becomes
-  progressively noisier for larger `k`.  For this reason, the summation over
-  `R_k` should be truncated at some number `filter_beyond_lag < N`.  Since many
-  MCMC methods generate chains where `R_k > 0`, a reasonable critera is to
-  truncate at the first index where the estimated auto-correlation becomes
-  negative.
-
-  The arguments `filter_beyond_lag`, `filter_threshold` are filters intended to
-  remove noisy tail terms from `R_k`.  They combine in an "OR" manner meaning
-  terms are removed if they were to be filtered under the `filter_beyond_lag` OR
-  `filter_threshold` criteria.
-
-  Args:
-    states:  `Tensor` or list of `Tensor` objects.  Dimension zero should index
-      identically distributed states.
-    filter_threshold:  `Tensor` or list of `Tensor` objects.
-      Must broadcast with `state`.  The auto-correlation sequence is truncated
-      after the first appearance of a term less than `filter_threshold`.
-      Setting to `None` means we use no threshold filter.  Since `|R_k| <= 1`,
-      setting to any number less than `-1` has the same effect.
-    filter_beyond_lag:  `Tensor` or list of `Tensor` objects.  Must be
-      `int`-like and scalar valued.  The auto-correlation sequence is truncated
-      to this length.  Setting to `None` means we do not filter based on number
-      of lags.
-    name:  `String` name to prepend to created ops.
-
-  Returns:
-    ess:  `Tensor` or list of `Tensor` objects.  The effective sample size of
-      each component of `states`.  Shape will be `states.shape[1:]`.
-
-  Raises:
-    ValueError:  If `states` and `filter_threshold` or `states` and
-      `filter_beyond_lag` are both lists with different lengths.
-  """
-  states_was_list = _is_list_like(states)
-
-  # Convert all args to lists.
-  if not states_was_list:
-    states = [states]
-
-  filter_beyond_lag = _broadcast_maybelist_arg(states, filter_beyond_lag,
-                                               "filter_beyond_lag")
-  filter_threshold = _broadcast_maybelist_arg(states, filter_threshold,
-                                              "filter_threshold")
-
-  # Process items, one at a time.
-  with ops.name_scope(name, "effective_sample_size"):
-    ess_list = [
-        _effective_sample_size_single_state(s, ml, mlt)
-        for (s, ml, mlt) in zip(states, filter_beyond_lag, filter_threshold)
-    ]
-
-  if states_was_list:
-    return ess_list
-  return ess_list[0]
-
-
-def _effective_sample_size_single_state(states, filter_beyond_lag,
-                                        filter_threshold):
-  """ESS computation for one single Tensor argument."""
-
-  with ops.name_scope(
-      "effective_sample_size_single_state",
-      values=[states, filter_beyond_lag, filter_threshold]):
-
-    states = ops.convert_to_tensor(states, name="states")
-    dt = states.dtype
-
-    # filter_beyond_lag == None ==> auto_corr is the full sequence.
-    auto_corr = sample_stats.auto_correlation(
-        states, axis=0, max_lags=filter_beyond_lag)
-    if filter_threshold is not None:
-      filter_threshold = ops.convert_to_tensor(
-          filter_threshold, dtype=dt, name="filter_threshold")
-      # Get a binary mask to zero out values of auto_corr below the threshold.
-      #   mask[i, ...] = 1 if auto_corr[j, ...] > threshold for all j <= i,
-      #   mask[i, ...] = 0, otherwise.
-      # So, along dimension zero, the mask will look like [1, 1, ..., 0, 0,...]
-      # Building step by step,
-      #   Assume auto_corr = [1, 0.5, 0.0, 0.3], and filter_threshold = 0.2.
-      # Step 1:  mask = [False, False, True, False]
-      mask = auto_corr < filter_threshold
-      # Step 2:  mask = [0, 0, 1, 1]
-      mask = math_ops.cast(mask, dtype=dt)
-      # Step 3:  mask = [0, 0, 1, 2]
-      mask = math_ops.cumsum(mask, axis=0)
-      # Step 4:  mask = [1, 1, 0, 0]
-      mask = math_ops.maximum(1. - mask, 0.)
-      auto_corr *= mask
-
-    # With R[k] := auto_corr[k, ...],
-    # ESS = N / {1 + 2 * Sum_{k=1}^N (N - k) / N * R[k]}
-    #     = N / {-1 + 2 * Sum_{k=0}^N (N - k) / N * R[k]} (since R[0] = 1)
-    #     approx N / {-1 + 2 * Sum_{k=0}^M (N - k) / N * R[k]}
-    # where M is the filter_beyond_lag truncation point chosen above.
-
-    # Get the factor (N - k) / N, and give it shape [M, 1,...,1], having total
-    # ndims the same as auto_corr
-    n = _axis_size(states, axis=0)
-    k = math_ops.range(0., _axis_size(auto_corr, axis=0))
-    nk_factor = (n - k) / n
-    if auto_corr.shape.ndims is not None:
-      new_shape = [-1] + [1] * (auto_corr.shape.ndims - 1)
-    else:
-      new_shape = array_ops.concat(
-          ([-1],
-           array_ops.ones([array_ops.rank(auto_corr) - 1], dtype=dtypes.int32)),
-          axis=0)
-    nk_factor = array_ops.reshape(nk_factor, new_shape)
-
-    return n / (-1 + 2 * math_ops.reduce_sum(nk_factor * auto_corr, axis=0))
-
-
-def potential_scale_reduction(chains_states,
-                              independent_chain_ndims=1,
-                              name=None):
-  """Gelman and Rubin's potential scale reduction factor for chain convergence.
-
-  Given `N > 1` states from each of `C > 1` independent chains, the potential
-  scale reduction factor, commonly referred to as R-hat, measures convergence of
-  the chains (to the same target) by testing for equality of means.
-  Specifically, R-hat measures the degree to which variance (of the means)
-  between chains exceeds what one would expect if the chains were identically
-  distributed.  See [1], [2].
-
-  Some guidelines:
-
-  * The initial state of the chains should be drawn from a distribution
-    overdispersed with respect to the target.
-  * If all chains converge to the target, then as `N --> infinity`, R-hat --> 1.
-    Before that, R-hat > 1 (except in pathological cases, e.g. if the chain
-    paths were identical).
-  * The above holds for any number of chains `C > 1`.  Increasing `C` does
-    improves effectiveness of the diagnostic.
-  * Sometimes, R-hat < 1.2 is used to indicate approximate convergence, but of
-    course this is problem depedendent.  See [2].
-  * R-hat only measures non-convergence of the mean. If higher moments, or other
-    statistics are desired, a different diagnostic should be used.  See [2].
-
-  #### Examples
-
-  Diagnosing convergence by monitoring 10 chains that each attempt to
-  sample from a 2-variate normal.
-
-  ```python
-  tfd = tf.contrib.distributions
-  tfb = tf.contrib.bayesflow
-
-  target = tfd.MultivariateNormalDiag(scale_diag=[1., 2.])
-
-  # Get 10 (2x) overdispersed initial states.
-  initial_state = target.sample(10) * 2.
-  ==> (10, 2)
-
-  # Get 1000 samples from the 10 independent chains.
-  chains_states, _ = tfb.hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=target.log_prob,
-      current_state=initial_state,
-      step_size=0.05,
-      num_leapfrog_steps=20,
-      num_burnin_steps=200)
-  chains_states.shape
-  ==> (1000, 10, 2)
-
-  rhat = tfb.mcmc_diagnostics.potential_scale_reduction(
-      chains_states, independent_chain_ndims=1)
-
-  # The second dimension needed a longer burn-in.
-  rhat.eval()
-  ==> [1.05, 1.3]
-  ```
-
-  To see why R-hat is reasonable, let `X` be a random variable drawn uniformly
-  from the combined states (combined over all chains).  Then, in the limit
-  `N, C --> infinity`, with `E`, `Var` denoting expectation and variance,
-
-  ```R-hat = ( E[Var[X | chain]] + Var[E[X | chain]] ) / E[Var[X | chain]].```
-
-  Using the law of total variance, the numerator is the variance of the combined
-  states, and the denominator is the total variance minus the variance of the
-  the individual chain means.  If the chains are all drawing from the same
-  distribution, they will have the same mean, and thus the ratio should be one.
-
-  [1] "Inference from Iterative Simulation Using Multiple Sequences"
-      Andrew Gelman and Donald B. Rubin
-      Statist. Sci. Volume 7, Number 4 (1992), 457-472.
-  [2] "General Methods for Monitoring Convergence of Iterative Simulations"
-      Stephen P. Brooks and Andrew Gelman
-      Journal of Computational and Graphical Statistics, 1998. Vol 7, No. 4.
-
-  Args:
-    chains_states:  `Tensor` or Python `list` of `Tensor`s representing the
-      state(s) of a Markov Chain at each result step.  The `ith` state is
-      assumed to have shape `[Ni, Ci1, Ci2,...,CiD] + A`.
-      Dimension `0` indexes the `Ni > 1` result steps of the Markov Chain.
-      Dimensions `1` through `D` index the `Ci1 x ... x CiD` independent
-      chains to be tested for convergence to the same target.
-      The remaining dimensions, `A`, can have any shape (even empty).
-    independent_chain_ndims: Integer type `Tensor` with value `>= 1` giving the
-      number of giving the number of dimensions, from `dim = 1` to `dim = D`,
-      holding independent chain results to be tested for convergence.
-    name: `String` name to prepend to created ops.  Default:
-      `potential_scale_reduction`.
-
-  Returns:
-    `Tensor` or Python `list` of `Tensor`s representing the R-hat statistic for
-    the state(s).  Same `dtype` as `state`, and shape equal to
-    `state.shape[1 + independent_chain_ndims:]`.
-
-  Raises:
-    ValueError:  If `independent_chain_ndims < 1`.
-  """
-  chains_states_was_list = _is_list_like(chains_states)
-  if not chains_states_was_list:
-    chains_states = [chains_states]
-
-  # tensor_util.constant_value returns None iff a constant value (as a numpy
-  # array) is not efficiently computable.  Therefore, we try constant_value then
-  # check for None.
-  icn_const_ = tensor_util.constant_value(
-      ops.convert_to_tensor(independent_chain_ndims))
-  if icn_const_ is not None:
-    independent_chain_ndims = icn_const_
-    if icn_const_ < 1:
-      raise ValueError(
-          "Argument `independent_chain_ndims` must be `>= 1`, found: {}".format(
-              independent_chain_ndims))
-
-  with ops.name_scope(name, "potential_scale_reduction"):
-    rhat_list = [
-        _potential_scale_reduction_single_state(s, independent_chain_ndims)
-        for s in chains_states
-    ]
-
-  if chains_states_was_list:
-    return rhat_list
-  return rhat_list[0]
-
-
-def _potential_scale_reduction_single_state(state, independent_chain_ndims):
-  """potential_scale_reduction for one single state `Tensor`."""
-  with ops.name_scope(
-      "potential_scale_reduction_single_state",
-      values=[state, independent_chain_ndims]):
-    # We assume exactly one leading dimension indexes e.g. correlated samples
-    # from each Markov chain.
-    state = ops.convert_to_tensor(state, name="state")
-    sample_ndims = 1
-
-    sample_axis = math_ops.range(0, sample_ndims)
-    chain_axis = math_ops.range(sample_ndims,
-                                sample_ndims + independent_chain_ndims)
-    sample_and_chain_axis = math_ops.range(
-        0, sample_ndims + independent_chain_ndims)
-
-    n = _axis_size(state, sample_axis)
-    m = _axis_size(state, chain_axis)
-
-    # In the language of [2],
-    # B / n is the between chain variance, the variance of the chain means.
-    # W is the within sequence variance, the mean of the chain variances.
-    b_div_n = _reduce_variance(
-        math_ops.reduce_mean(state, sample_axis, keepdims=True),
-        sample_and_chain_axis,
-        biased=False)
-    w = math_ops.reduce_mean(
-        _reduce_variance(state, sample_axis, keepdims=True, biased=True),
-        sample_and_chain_axis)
-
-    # sigma^2_+ is an estimate of the true variance, which would be unbiased if
-    # each chain was drawn from the target.  c.f. "law of total variance."
-    sigma_2_plus = w + b_div_n
-
-    return ((m + 1.) / m) * sigma_2_plus / w - (n - 1.) / (m * n)
-
-
-# TODO(b/72873233) Move some variant of this to sample_stats.
-def _reduce_variance(x, axis=None, biased=True, keepdims=False):
-  with ops.name_scope("reduce_variance"):
-    x = ops.convert_to_tensor(x, name="x")
-    mean = math_ops.reduce_mean(x, axis=axis, keepdims=True)
-    biased_var = math_ops.reduce_mean(
-        math_ops.squared_difference(x, mean), axis=axis, keepdims=keepdims)
-    if biased:
-      return biased_var
-    n = _axis_size(x, axis)
-    return (n / (n - 1.)) * biased_var
-
-
-def _axis_size(x, axis=None):
-  """Get number of elements of `x` in `axis`, as type `x.dtype`."""
-  if axis is None:
-    return math_ops.cast(array_ops.size(x), x.dtype)
-  return math_ops.cast(
-      math_ops.reduce_prod(array_ops.gather(array_ops.shape(x), axis)), x.dtype)
-
-
-def _is_list_like(x):
-  """Helper which returns `True` if input is `list`-like."""
-  return isinstance(x, (tuple, list))
-
-
-def _broadcast_maybelist_arg(states, secondary_arg, name):
-  """Broadcast a listable secondary_arg to that of states."""
-  if _is_list_like(secondary_arg):
-    if len(secondary_arg) != len(states):
-      raise ValueError("Argument `%s` was a list of different length ({}) than "
-                       "`states` ({})".format(name, len(states)))
-  else:
-    secondary_arg = [secondary_arg] * len(states)
-
-  return secondary_arg
diff --git a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings.py b/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings.py
deleted file mode 100644
index 7bdeaa862d5bb64fa8940df453c7aa2b66023eda..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions to create a Markov Chain Monte Carlo Metropolis step."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.metropolis_hastings_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'evolve',
-    'uniform_random_proposal',
-    'normal_random_proposal',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings_impl.py b/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings_impl.py
deleted file mode 100644
index dc1ac68ce009fa46d6c05a3200a29d9fdf245707..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings_impl.py
+++ /dev/null
@@ -1,426 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions to create a Markov Chain Monte Carlo Metropolis step.
-
-@@evolve
-@@uniform_random_proposal
-@@normal_random_proposal
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import state_ops
-
-__all__ = [
-    'evolve',
-    'uniform_random_proposal',
-    'normal_random_proposal',
-]
-
-
-def _single_iteration(current_state, current_log_density,
-                      log_unnormalized_prob_fn, proposal_fn, seed=None,
-                      name='None'):
-  """Performs a single Metropolis-Hastings step.
-
-  Args:
-    current_state: Float-like `Tensor` (i.e., `dtype` is either
-      `tf.float16`, `tf.float32` or `tf.float64`) of any shape that can
-      be consumed by the `log_unnormalized_prob_fn` and `proposal_fn`
-      callables.
-    current_log_density: Float-like `Tensor` with `dtype` and shape equivalent
-      to `log_unnormalized_prob_fn(current_state)`, i.e., matching the result of
-      `log_unnormalized_prob_fn` invoked at `current_state`.
-    log_unnormalized_prob_fn: A Python callable evaluated at
-      `current_state` and returning a float-like `Tensor` of log target-density
-      up to a normalizing constant. In other words,
-      `log_unnormalized_prob_fn(x) = log(g(x))`, where
-      `target_density = g(x)/Z` for some constant `A`. The shape of the input
-      tensor is the same as the shape of the `current_state`. The shape of the
-      output tensor is either
-        (a). Same as the input shape if the density being sampled is one
-          dimensional, or
-        (b). If the density is defined for `events` of shape
-          `event_shape = [E1, E2, ... Ee]`, then the input tensor should be of
-          shape `batch_shape + event_shape`, where `batch_shape = [B1, ..., Bb]`
-          and the result must be of shape [B1, ..., Bb]. For example, if the
-          distribution that is being sampled is a 10 dimensional normal,
-          then the input tensor may be of shape [100, 10] or [30, 20, 10]. The
-          last dimension will then be 'consumed' by `log_unnormalized_prob_fn`
-          and it should return tensors of shape [100] and [30, 20] respectively.
-    proposal_fn: A callable accepting a real valued `Tensor` of current sample
-      points and returning a tuple of two `Tensors`. The first element of the
-      pair is a `Tensor` containing the proposal state and should have
-      the same shape as the input `Tensor`. The second element of the pair gives
-      the log of the ratio of the probability of transitioning from the
-      proposal points to the input points and the probability of transitioning
-      from the input points to the proposal points. If the proposal is
-      symmetric (e.g., random walk, where the proposal is either
-      normal or uniform centered at `current_state`), i.e.,
-      Probability(Proposal -> Current) = Probability(Current -> Proposal)
-      the second value should be set to `None` instead of explicitly supplying a
-      tensor of zeros. In addition to being convenient, this also leads to a
-      more efficient graph.
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: Python `str` name prefix for ops managed by this function.
-
-  Returns:
-    next_state: `Tensor` with `dtype` and shape matching `current_state`.
-      Created by propagating the chain by one step, starting from
-      `current_state`.
-    next_log_density: `Tensor` with `dtype` and shape matching
-      `current_log_density`, which is equal to the value of the unnormalized
-      `log_unnormalized_prob_fn` computed at `next_state`.
-    log_accept_ratio: `Tensor` with `dtype` and shape matching
-      `current_log_density`. Stands for the log of Metropolis-Hastings
-      acceptance ratio used in generating the `next_state`.
-  """
-
-  with ops.name_scope(name, 'single_iteration', [current_state]):
-    # The proposed state and the log of the corresponding Hastings ratio.
-    proposal_state, log_transit_ratio = proposal_fn(current_state)
-
-    # If the log ratio is None, assume that the transitions are symmetric,
-    # i.e., Prob(Current -> Proposed) = Prob(Proposed -> Current).
-    if log_transit_ratio is None:
-      log_transit_ratio = 0.
-
-    # Log-density of the proposal state.
-    proposal_log_density = log_unnormalized_prob_fn(proposal_state)
-
-    # Ops to compute the log of the acceptance ratio. Recall that the
-    # acceptance ratio is: [Prob(Proposed) / Prob(Current)] *
-    # [Prob(Proposed -> Current) / Prob(Current -> Proposed)]. The log of the
-    # second term is the log_transit_ratio.
-    with ops.name_scope('accept_reject'):
-      # The log of the acceptance ratio.
-      log_accept_ratio = (proposal_log_density - current_log_density
-                          + log_transit_ratio)
-
-      # A proposal is accepted or rejected depending on the acceptance ratio.
-      # If the acceptance ratio is greater than 1 then it is always accepted.
-      # If the acceptance ratio is less than 1 then the proposal is accepted
-      # with probability = acceptance ratio. As we are working in log space to
-      # prevent over/underflows, this logic is expressed in log terms below.
-      # If a proposal is accepted we place a True in the acceptance state
-      # tensor and if it is to be rejected we place a False.
-      # The log_draws below have to be compared to the log_accept_ratio so we
-      # make sure that they have the same data type.
-      log_draws = math_ops.log(random_ops.random_uniform(
-          array_ops.shape(current_log_density), seed=seed,
-          dtype=log_accept_ratio.dtype))
-      is_proposal_accepted = log_draws < log_accept_ratio
-
-    # The acceptance state decides which elements of the current state are to
-    # be replaced with the corresponding elements in the proposal state.
-    with ops.name_scope(name, 'metropolis_single_step',
-                        [current_state, current_log_density]):
-      next_log_density = array_ops.where(is_proposal_accepted,
-                                         proposal_log_density,
-                                         current_log_density)
-      next_state = array_ops.where(is_proposal_accepted, proposal_state,
-                                   current_state)
-
-    return next_state, next_log_density, log_accept_ratio
-
-
-def evolve(initial_sample,
-           initial_log_density,
-           initial_log_accept_ratio,
-           log_unnormalized_prob_fn,
-           proposal_fn,
-           n_steps=1,
-           seed=None,
-           name=None):
-  """Performs `n_steps` of the Metropolis-Hastings update.
-
-  Given a probability density function, `f(x)` and a proposal scheme which
-  generates new points from old, this `Op` returns a tensor
-  which may be used to generate approximate samples from the target distribution
-  using the Metropolis-Hastings algorithm. These samples are from a Markov chain
-  whose equilibrium distribution matches the target distribution.
-
-  The probability distribution may have an unknown normalization constan.
-  We parameterize the probability density as follows:
-    ```
-      f(x) = exp(L(x) + constant)
-    ```
-  Here `L(x)` is any continuous function with an (possibly unknown but finite)
-  upper bound, i.e. there exists a number beta such that
-  `L(x)< beta < infinity` for all x. The constant is the normalization needed
-  to make `f(x)` a probability density (as opposed to just a finite measure).
-
-  Although `initial_sample` can be arbitrary, a poor choice may result in a
-  slow-to-mix chain. In many cases the best choice is the one that maximizes
-  the target density, i.e., choose `initial_sample` such that
-  `f(initial_sample) >= f(x)` for all `x`.
-
-
-  If the support of the distribution is a strict subset of R^n (but of non zero
-  measure), then the unnormalized log-density `L(x)` should return `-infinity`
-  outside the support domain. This effectively forces the sampler to only
-  explore points in the regions of finite support.
-
-  Usage:
-  This function is meant to be wrapped up with some of the common proposal
-  schemes (e.g. random walk, Langevin diffusion etc) to produce a more user
-  friendly interface. However, it may also be used to create bespoke samplers.
-
-  The following example, demonstrates the use to generate a 1000 uniform random
-  walk Metropolis samplers run in parallel for the normal target distribution.
-  ```python
-    n = 3  # dimension of the problem
-
-    # Generate 1000 initial values randomly. Each of these would be an
-    # independent starting point for a Markov chain.
-    state = tf.get_variable(
-        'state',initializer=tf.random_normal([1000, n], mean=3.0,
-                                             dtype=tf.float64, seed=42))
-
-    # Computes the log(p(x)) for the unit normal density and ignores the
-    # normalization constant.
-    def log_density(x):
-      return  - tf.reduce_sum(x * x, reduction_indices=-1) / 2.0
-
-    # Initial log-density value
-    state_log_density = tf.get_variable(
-        'state_log_density', initializer=log_density(state.initialized_value()))
-
-    # A variable to store the log_acceptance_ratio:
-    log_acceptance_ratio = tf.get_variable(
-        'log_acceptance_ratio', initializer=tf.zeros([1000], dtype=tf.float64))
-
-    # Generates random proposals by moving each coordinate uniformly and
-    # independently in a box of size 2 centered around the current value.
-    # Returns the new point and also the log of the Hastings ratio (the
-    # ratio of the probability of going from the proposal to origin and the
-    # probability of the reverse transition). When this ratio is 1, the value
-    # may be omitted and replaced by None.
-    def random_proposal(x):
-      return (x + tf.random_uniform(tf.shape(x), minval=-1, maxval=1,
-                                    dtype=x.dtype, seed=12)), None
-
-    #  Create the op to propagate the chain for 100 steps.
-    stepper = mh.evolve(
-        state, state_log_density, log_acceptance_ratio,
-        log_density, random_proposal, n_steps=100, seed=123)
-    init = tf.initialize_all_variables()
-    with tf.Session() as sess:
-      sess.run(init)
-      # Run the chains for a total of 1000 steps and print out the mean across
-      # the chains every 100 iterations.
-      for n_iter in range(10):
-        # Executing the stepper advances the chain to the next state.
-        sess.run(stepper)
-        # Print out the current value of the mean(sample) for every dimension.
-        print(np.mean(sess.run(state), 0))
-      # Estimated covariance matrix
-      samples = sess.run(state)
-      print('')
-      print(np.cov(samples, rowvar=False))
-  ```
-
-  Args:
-    initial_sample: A float-like `tf.Variable` of any shape that can
-      be consumed by the `log_unnormalized_prob_fn` and `proposal_fn`
-      callables.
-    initial_log_density: Float-like `tf.Variable` with `dtype` and shape
-      equivalent  to `log_unnormalized_prob_fn(initial_sample)`, i.e., matching
-        the result of `log_unnormalized_prob_fn` invoked at `current_state`.
-    initial_log_accept_ratio: A `tf.Variable` with `dtype` and shape matching
-      `initial_log_density`. Stands for the log of Metropolis-Hastings
-      acceptance ratio after propagating the chain for `n_steps`.
-    log_unnormalized_prob_fn: A Python callable evaluated at
-      `current_state` and returning a float-like `Tensor` of log target-density
-      up to a normalizing constant. In other words,
-      `log_unnormalized_prob_fn(x) = log(g(x))`, where
-      `target_density = g(x)/Z` for some constant `A`. The shape of the input
-      tensor is the same as the shape of the `current_state`. The shape of the
-      output tensor is either
-        (a). Same as the input shape if the density being sampled is one
-          dimensional, or
-        (b). If the density is defined for `events` of shape
-          `event_shape = [E1, E2, ... Ee]`, then the input tensor should be of
-          shape `batch_shape + event_shape`, here `batch_shape = [B1, ..., Bb]`
-          and the result must be of shape [B1, ..., Bb]. For example, if the
-          distribution that is being sampled is a 10 dimensional normal,
-          then the input tensor may be of shape [100, 10] or [30, 20, 10]. The
-          last dimension will then be 'consumed' by `log_unnormalized_prob_fn`
-          and it should return tensors of shape [100] and [30, 20] respectively.
-    proposal_fn: A callable accepting a real valued `Tensor` of current sample
-      points and returning a tuple of two `Tensors`. The first element of the
-      pair should be a `Tensor` containing the proposal state and should have
-      the same shape as the input `Tensor`. The second element of the pair gives
-      the log of the ratio of the probability of transitioning from the
-      proposal points to the input points and the probability of transitioning
-      from the input points to the proposal points. If the proposal is
-      symmetric, i.e.
-      Probability(Proposal -> Current) = Probability(Current -> Proposal)
-      the second value should be set to None instead of explicitly supplying a
-      tensor of zeros. In addition to being convenient, this also leads to a
-      more efficient graph.
-    n_steps: A positive `int` or a scalar `int32` tensor. Sets the number of
-      iterations of the chain.
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: A string that sets the name for this `Op`.
-
-  Returns:
-    forward_step: an `Op` to step the Markov chain forward for `n_steps`.
-  """
-
-  with ops.name_scope(name, 'metropolis_hastings', [initial_sample]):
-    current_state = initial_sample
-    current_log_density = initial_log_density
-    log_accept_ratio = initial_log_accept_ratio
-
-    # Stop condition for the while_loop
-    def stop_condition(i, _):
-      return i < n_steps
-
-    def step(i, loop_vars):
-      """Wrap `_single_iteration` for `while_loop`."""
-      state = loop_vars[0]
-      state_log_density = loop_vars[1]
-      return i + 1, list(_single_iteration(state, state_log_density,
-                                           log_unnormalized_prob_fn,
-                                           proposal_fn, seed=seed))
-
-    loop_vars = [current_state, current_log_density, log_accept_ratio]
-    # Build an `Op` to evolve the Markov chain for `n_steps`
-    (_, [end_state, end_log_density, end_log_acceptance]) = (
-        control_flow_ops.while_loop(
-            stop_condition, step,
-            (0, loop_vars),
-            parallel_iterations=1, swap_memory=1))
-
-    forward_step = control_flow_ops.group(
-        state_ops.assign(current_log_density, end_log_density),
-        state_ops.assign(current_state, end_state),
-        state_ops.assign(log_accept_ratio, end_log_acceptance))
-
-    return forward_step
-
-
-def uniform_random_proposal(step_size=1.,
-                            seed=None,
-                            name=None):
-  """Returns a callable that adds a random uniform tensor to the input.
-
-  This function returns a callable that accepts one `Tensor` argument of any
-  shape and a real data type (i.e. `tf.float32` or `tf.float64`). It adds a
-  sample from a random uniform distribution drawn from [-stepsize, stepsize]
-  to its input. It also returns the log of the ratio of the probability of
-  moving from the input point to the proposed point, but since this log ratio is
-  identically equal to 0 (because the probability of drawing a value `x` from
-  the symmetric uniform distribution is the same as the probability of drawing
-  `-x`), it simply returns None for the second element of the returned tuple.
-
-  Args:
-    step_size: A positive `float` or a scalar tensor of real dtype
-      controlling the scale of the uniform distribution.
-      If step_size = a, then draws are made uniformly from [-a, a].
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: A string that sets the name for this `Op`.
-
-  Returns:
-    proposal_fn:  A callable accepting one float-like `Tensor` and returning a
-    2-tuple. The first value in the tuple is a `Tensor` of the same shape and
-    dtype as the input argument and the second element of the tuple is None.
-  """
-
-  with ops.name_scope(name, 'uniform_random_proposal', [step_size]):
-    def proposal_fn(input_state, name=None):
-      """Adds a uniform perturbation to the input state.
-
-      Args:
-        input_state: A `Tensor` of any shape and real dtype.
-        name: A string that sets the name for this `Op`.
-
-      Returns:
-        proposal_state:  A float-like `Tensot` with `dtype` and shape matching
-          `input_state`.
-        log_transit_ratio: `None`. Proposal is symmetric.
-      """
-      with ops.name_scope(name, 'proposer', [input_state]):
-        input_state = ops.convert_to_tensor(input_state, name='input_state')
-        return input_state + random_ops.random_uniform(
-            array_ops.shape(input_state),
-            minval=-step_size,
-            maxval=step_size,
-            seed=seed), None
-    return proposal_fn
-
-
-def normal_random_proposal(scale=1.,
-                           seed=None,
-                           name=None):
-  """Returns a callable that adds a random normal tensor to the input.
-
-  This function returns a callable that accepts one `Tensor` argument of any
-  shape and a real data type (i.e. `tf.float32` or `tf.float64`). The callable
-  adds a sample from a normal distribution with the supplied standard deviation
-  and zero mean to its input argument (called the proposal point).
-  The callable returns a tuple with the proposal point as the first element.
-  The second element is identically `None`. It is included so the callable is
-  compatible with the expected signature of the proposal scheme argument in the
-  `metropolis_hastings` function. A value of `None` indicates that the
-  probability of going from the input point to the proposal point is equal to
-  the probability of going from the proposal point to the input point.
-
-  Args:
-    scale: A positive `float` or a scalar tensor of any real dtype controlling
-      the scale of the normal distribution.
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: A string that sets the name for this `Op`.
-
-  Returns:
-    proposal_fn: A callable accepting one float-like `Tensor` and returning a
-    2-tuple. The first value in the tuple is a `Tensor` of the same shape and
-    dtype as the input argument and the second element of the tuple is None.
-  """
-
-  with ops.name_scope(name, 'normal_random_proposal', [scale]):
-    def proposal_fn(input_state, name=None):
-      """Adds a normal perturbation to the input state.
-
-      Args:
-        input_state: A `Tensor` of any shape and real dtype.
-        name: A string that sets the name for this `Op`.
-
-      Returns:
-        proposal_state:  A float-like `Tensot` with `dtype` and shape matching
-          `input_state`.
-        log_transit_ratio: `None`. Proposal is symmetric.
-      """
-
-      with ops.name_scope(name, 'proposer', [input_state]):
-        input_state = ops.convert_to_tensor(input_state, name='input_state')
-        return input_state + random_ops.random_normal(
-            array_ops.shape(input_state),
-            mean=0.,
-            stddev=scale,
-            seed=seed), None
-    return proposal_fn
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 985177e897f443989e466d1a498c461a30aeb5cb..032b859d469ee5039e08e4af4c2f4ebf35c2ff19 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,15 +44,13 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.
+  r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\).
 
-  With `p(z) := exp{log_p(z)}`, this `Op` returns
+  With \\(p(z) := exp^{log_p(z)}\\), this `Op` returns
 
-  ```
-  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
-  \approx E_q[ f(Z) p(Z) / q(Z) ]
-  =       E_p[f(Z)]
-  ```
+  \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
+  \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
+  \\(=       E_p[f(Z)]\\)
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -95,9 +93,9 @@ def expectation_importance_sampler(f,
       log_values = log_f_z + log_p_z - q_log_prob_z
       return _logspace_mean(log_values)
 
-    # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
-    # E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
-    #           = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
+    # With \\(f_{plus}(z) = max(0, f(z)), f_{minus}(z) = max(0, -f(z))\\),
+    # \\(E_p[f(Z)] = E_p[f_{plus}(Z)] - E_p[f_{minus}(Z)]\\)
+    # \\(          = E_p[f_{plus}(Z) + 1] - E_p[f_{minus}(Z) + 1]\\)
     # Without incurring bias, 1 is added to each to prevent zeros in logspace.
     # The logarithm is approximately linear around 1 + epsilon, so this is good
     # for small values of 'z' as well.
@@ -121,14 +119,12 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `p(z) := exp{log_p(z)}`, and `f(z) = exp{log_f(z)}`, this `Op`
-  returns
+  With \\(p(z) := exp^{log_p(z)}\\), and \\(f(z) = exp{log_f(z)}\\),
+  this `Op` returns
 
-  ```
-  Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,
-  \approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]
-  =       Log[E_p[f(Z)]]
-  ```
+  \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
+  \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
+  \\(=       Log[E_p[f(Z)]]\\)
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -196,13 +192,11 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `E_p[f(X)]`.
+  """Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\).
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
-  ```none
-  E_p[f(X)] approx= m**-1 sum_i^m f(x_j),  x_j ~iid p(X)
-  ```
+  \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
 
   where:
 
@@ -216,8 +210,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ s_i : i=1...n } ] = Avg{ grad[s_i] : i=1...n }` where
-  `S_n = Avg{s_i}` and `s_i = f(x_i), x_i ~ p`.
+  grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n } where
+  S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\).
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +290,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `E_p[f(X)]`.  A batch of samples should be indexed by `axis` dimensions.
+      \\(E_p[f(X)]\\).  A batch of samples should be indexed by `axis`
+      dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
       required/used if `use_reparametrization=False`.
@@ -316,7 +311,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `E_p[f(X)]`.
+      of \\(E_p[f(X)]\\).
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
@@ -328,7 +323,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
     if not callable(f):
       raise ValueError('`f` must be a callable function.')
     if use_reparametrization:
-      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
     else:
       if not callable(log_prob):
         raise ValueError('`log_prob` must be a callable function.')
@@ -348,7 +343,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
       # "Is there a floating point value of x, for which x-x == 0 is false?"
       # http://stackoverflow.com/q/2686644
       fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
-      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)
 
 
 def _sample_mean(values):
diff --git a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
deleted file mode 100644
index 7786656398e3c87704227be95b3cd23a38785249..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""An optimizer module for stochastic gradient Langevin dynamics."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope as varscope_ops
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import training_ops
-
-
-class SGLDOptimizer(optimizer.Optimizer):
-  """An optimizer module for stochastic gradient Langevin dynamics.
-
-  This implements the preconditioned Stochastic Gradient Langevin Dynamics
-  optimizer [1]. The optimization variable is regarded as a sample from the
-  posterior under Stochastic Gradient Langevin Dynamics with noise rescaled in
-  each dimension according to RMSProp [2].
-
-  Note: If a prior is included in the loss, it should be scaled by
-  `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
-  in the data.  I.e., it should be divided by the `num_pseudo_batches` term
-  described below.
-
-  [1]: "Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural
-       Networks." Chunyuan Li, Changyou Chen, David Carlson, Lawrence Carin.
-       ArXiv:1512.07666, 2015. https://arxiv.org/abs/1512.07666
-  [2]: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-
-  Args:
-    learning_rate: Scalar `float`-like `Tensor`. The base learning rate for the
-      optimizer. Must be tuned to the specific function being minimized.
-    preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
-      decay rate of the rescaling of the preconditioner (RMSprop). (This is
-      "alpha" in [1]). Should be smaller than but nearly `1` to approximate
-      sampling from the posterior. (Default: `0.95`)
-    num_pseudo_batches: Scalar `int`-like `Tensor`. The effective number of
-      minibatches in the data set.  Trades off noise and prior with the SGD
-      likelihood term. Note: Assumes the loss is taken as the mean over a
-      minibatch. Otherwise if the sum was taken, divide this number by the
-      batch size.  (Default: `1`)
-    burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
-      gradient statistics to update the preconditioner before starting to draw
-      noisy samples. (Default: `25`)
-    diagonal_bias: Scalar `float`-like `Tensor`. Term added to the diagonal of
-      the preconditioner to prevent the preconditioner from degenerating.
-      (Default: `1e-8`)
-    name: Python `str` describing ops managed by this function.
-      (Default: `"SGLDOptimizer"`)
-    variable_scope: Variable scope used for calls to `tf.get_variable`.
-      If `None`, a new variable scope is created using name
-      `ops.get_default_graph().unique_name(name or default_name)`.
-
-  Raises:
-    InvalidArgumentError: If preconditioner_decay_rate is a `Tensor` not in
-      `(0,1]`.
-  """
-
-  def __init__(self,
-               learning_rate,
-               preconditioner_decay_rate=0.95,
-               num_pseudo_batches=1,
-               burnin=25,
-               diagonal_bias=1e-8,
-               name=None,
-               variable_scope=None):
-    default_name = 'SGLDOptimizer'
-    with ops.name_scope(name, default_name, [
-        learning_rate, preconditioner_decay_rate, num_pseudo_batches, burnin,
-        diagonal_bias
-    ]):
-      if variable_scope is None:
-        var_scope_name = ops.get_default_graph().unique_name(
-            name or default_name)
-        with varscope_ops.variable_scope(var_scope_name) as scope:
-          self._variable_scope = scope
-      else:
-        self._variable_scope = variable_scope
-
-      self._preconditioner_decay_rate = ops.convert_to_tensor(
-          preconditioner_decay_rate, name='preconditioner_decay_rate')
-      self._num_pseudo_batches = ops.convert_to_tensor(
-          num_pseudo_batches, name='num_pseudo_batches')
-      self._burnin = ops.convert_to_tensor(burnin, name='burnin')
-      self._diagonal_bias = ops.convert_to_tensor(
-          diagonal_bias, name='diagonal_bias')
-      self._learning_rate = ops.convert_to_tensor(
-          learning_rate, name='learning_rate')
-
-      with varscope_ops.variable_scope(self._variable_scope):
-        self._counter = varscope_ops.get_variable(
-            'counter', initializer=0, trainable=False)
-
-      self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._preconditioner_decay_rate,
-              message='`preconditioner_decay_rate` must be non-negative'),
-          check_ops.assert_less_equal(
-              self._preconditioner_decay_rate,
-              1.,
-              message='`preconditioner_decay_rate` must be at most 1.'),
-      ], self._preconditioner_decay_rate)
-
-      self._num_pseudo_batches = control_flow_ops.with_dependencies([
-          check_ops.assert_greater(
-              self._num_pseudo_batches,
-              0,
-              message='`num_pseudo_batches` must be greater than zero')
-      ], self._num_pseudo_batches)
-
-      self._burnin = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._burnin, message='`burnin` must be non-negative'),
-          check_ops.assert_integer(
-              self._burnin, message='`burnin` must be an integer')
-      ], self._burnin)
-
-      self._diagonal_bias = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._diagonal_bias,
-              message='`diagonal_bias` must be non-negative')
-      ], self._diagonal_bias)
-
-      super(SGLDOptimizer, self).__init__(use_locking=False,
-                                          name=name or default_name)
-
-  def _create_slots(self, var_list):
-    for v in var_list:
-      init_rms = init_ops.ones_initializer(dtype=v.dtype)
-      self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
-                                              v.dtype, 'rms', self._name)
-
-  def _prepare(self):
-    # We need to put the conversion and check here because a user will likely
-    # want to decay the learning rate dynamically.
-    self._learning_rate_tensor = control_flow_ops.with_dependencies([
-        check_ops.assert_non_negative(
-            self._learning_rate, message='`learning_rate` must be non-negative')
-    ], ops.convert_to_tensor(self._learning_rate, name='learning_rate_tensor'))
-    self._decay_tensor = ops.convert_to_tensor(
-        self._preconditioner_decay_rate, name='preconditioner_decay_rate')
-
-    super(SGLDOptimizer, self)._prepare()
-
-  def _apply_dense(self, grad, var):
-    rms = self.get_slot(var, 'rms')
-
-    with ops.control_dependencies([
-        self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
-                                                       var.dtype.base_dtype))]):
-      new_grad = self._apply_noisy_update(rms, grad)
-
-    return training_ops.apply_gradient_descent(
-        var,
-        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
-        new_grad,
-        use_locking=self._use_locking).op
-
-  def _apply_sparse(self, grad, var):
-    rms = self.get_slot(var, 'rms')
-
-    with ops.control_dependencies([
-        self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
-                                                       var.dtype.base_dtype))]):
-      new_grad = self._apply_noisy_update(rms, grad)
-
-    return training_ops.apply_gradient_descent(
-        var,
-        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
-        new_grad,
-        use_locking=self._use_locking).op
-
-  def _finish(self, update_ops, name_scope):
-    update_ops.append([self._counter.assign_add(1)])
-    return control_flow_ops.group(*update_ops, name=name_scope)
-
-  @property
-  def variable_scope(self):
-    """Variable scope of all calls to `tf.get_variable`."""
-    return self._variable_scope
-
-  def _apply_noisy_update(self, mom, grad):
-    # Compute and apply the gradient update following
-    # preconditioned Langevin dynamics
-    stddev = array_ops.where(
-        array_ops.squeeze(self._counter > self._burnin),
-        math_ops.cast(math_ops.rsqrt(self._learning_rate), grad.dtype),
-        array_ops.zeros([], grad.dtype))
-
-    preconditioner = math_ops.rsqrt(
-        mom + math_ops.cast(self._diagonal_bias, grad.dtype))
-    return (
-        0.5 * preconditioner * grad * math_ops.cast(self._num_pseudo_batches,
-                                                    grad.dtype) +
-        random_ops.random_normal(array_ops.shape(grad), 1.0, dtype=grad.dtype) *
-        stddev * math_ops.sqrt(preconditioner))
-
-  def _update_momentum(self, mom, grad, decay):
-    # Keep an exponentially weighted moving average of squared gradients.
-    # Not thread safe
-    return mom.assign_add((1.0 - decay) * (math_ops.square(grad) - mom))
diff --git a/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py b/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py
deleted file mode 100644
index ca3d75b5bfee093449026c7d1d62e3bdeff6b096..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility functions related to managing `tf.Variable`s.
-
-@@externalize_variables_as_args
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import warnings
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
-from tensorflow.python.ops import variable_scope as varscope_ops
-from tensorflow.python.ops import variables as variables_ops
-
-__all__ = [
-    "externalize_variables_as_args",
-]
-
-
-# Cause all warnings to always be triggered.
-# Not having this means subsequent calls wont trigger the warning.
-warnings.simplefilter("always")
-
-
-def externalize_variables_as_args(fn,
-                                  fn_args=(),
-                                  ancestor_variables=None,
-                                  possible_ancestor_vars=None,
-                                  assert_variable_override=False,
-                                  name=None):
-  """"Converts variables within a callable into explicit args.
-
-  Makes a new callable from `fn` which has arguments `list(fn_args) +
-  list(ancestor_variables)`. If `ancestor_variables` is not specified, it is
-  inferred by checking which of `possible_ancestor_vars` actually influences the
-  return value of `fn` (concretely, gradient of `fn(*fn_args)` is not `None`).
-  By default `possible_ancestor_vars` is `tf.trainable_variables() +
-  tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)`.
-
-  #### Examples:
-
-  ```python
-  num_samples = 2
-  num_dims = 1
-  dtype = np.float32
-
-  def foo(x):
-    x = tf.convert_to_tensor(x, dtype=dtype, name="x")
-    s = x.shape.as_list()
-    y = tf.get_variable(
-        name="y",
-        dtype=dtype,
-        initializer=np.arange(np.prod(s)).reshape(s).astype(dtype))
-    return x + y
-
-  x = tf.constant(dtype([0.1, 0.2]))
-
-  wrapped_foo, discovered_ancestor_variables = (
-      externalize_variables_as_args(foo, [x]))
-
-  new_x = dtype([[1.], [2.]])
-  new_y = dtype([[3.], [4.]])
-  new_result = wrapped_foo(new_x, new_y)
-  # ==> [[4.], [6.]]
-
-  discovered_ancestor_variables == [tf.get_variable("y", dtype)]
-  # ==> [True]
-  ```
-
-  Args:
-    fn: Python callable which returns a `Tensor` and accepts `*fn_args`.
-    fn_args: Python list of args to `fn`. Represents dummy arguments passed to
-      `fn` to trace its execution; actual values are unimportant. These args are
-      only used to construct the output of `fn` and to resolve the ancestor
-      `tf.Variable`s.
-      Default value: `()` (i.e., `fn` takes no args).
-    ancestor_variables: Python list of `tf.Variable`s. When `None` the list is
-      expanded to non-`None` gradients of `fn(*fn_args)`. By directly providing
-      the `ancestor_variables` the internal call to `fn` is avoided.
-      Default value: `None` (i.e., `tf.Variable` dependencies are discovered).
-    possible_ancestor_vars: Python list of possible `tf.Variable`s which might
-      be a dependency of computing `fn(*fn_args)`.
-      Default value: `None` (i.e., expanded as described above).
-    assert_variable_override: Python `bool` indicating that not finding a
-      `tf.Variable` in the override list is an exception.
-      Default value: `False` (i.e., missing a `Variable` triggers a `warning`).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "externalize_variables_as_args").
-
-  Returns:
-    wrapped_fn: Python callable taking arguments like
-      `*(list(fn_args) + discovered_ancestor_variables)`.
-    discovered_ancestor_variables: Python list of `tf.Variable`s known to be a
-      dependency of `fn(*fn_args)`.
-
-  Raises:
-    ValueError: if `assert_variable_override` is `True` and `Variable` is
-      requested but not overridden.
-  """
-  def _make_bypassing_custom_getter_fn(new_var_dict):
-    """Return dict value rather than what would otherwise be dict key."""
-    def _custom_getter(getter, *args, **kwargs):
-      v = getter(*args, **kwargs)
-      new_v = new_var_dict.get(v, None)
-      if new_v is None:
-        msg = "Variable \"{}\" not found in bypass dict.".format(v)
-        if assert_variable_override:
-          raise ValueError(msg)
-        warnings.warn(msg)
-        return v
-      return new_v
-    return _custom_getter
-
-  with ops.name_scope(name, "externalize_variables_as_args"):
-    if ancestor_variables is not None and not ancestor_variables:
-      return fn, ()
-    if ancestor_variables is None:
-      y = fn(*fn_args)  # Side-effect: adds trainable vars.
-      if possible_ancestor_vars is None:
-        possible_ancestor_vars = (
-            variables_ops.trainable_variables() +
-            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
-      # TODO(b/72873296): Add a dedicated op for identifying ancestors.
-      ancestors = [v for g, v
-                   in zip(gradients_ops.gradients(y, possible_ancestor_vars),
-                          possible_ancestor_vars)
-                   if g is not None]
-      ancestor_variables = sorted(ancestors, key=lambda v: v.name)
-  n = len(fn_args)
-  def _fn(*args):
-    with ops.name_scope("wrapped_fn"):
-      vars_dict = dict(
-          (k, ops.convert_to_tensor(
-              v, dtype=k.dtype.base_dtype, name=k.op.name))
-          for k, v in zip(ancestor_variables, args[n:]))
-      with varscope_ops.variable_scope(
-          varscope_ops.get_variable_scope(),
-          reuse=True,
-          custom_getter=_make_bypassing_custom_getter_fn(vars_dict)):
-        return fn(*args[:n])
-  return _fn, ancestor_variables
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py
deleted file mode 100644
index 4d5f0cfe9713a011b32c5aba8d429847d81f33e2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""An optimizer module for constant stochastic gradient descent."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope as varscope_ops
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import training_ops
-
-
-class VariationalSGDOptimizer(optimizer.Optimizer):
-  """An optimizer module for constant stochastic gradient descent.
-
-  This implements an optimizer module for the constant stochastic gradient
-  descent algorithm [1].  The optimization variable is regarded as an
-  approximate sample from the posterior .
-
-  Note: If a prior is included in the loss, it should be scaled by
-  `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
-  in the data.  I.e., it should be divided by the `num_pseudo_batches` term
-  described below.
-
-  [1]: "Stochastic Gradient Descent as Approximate Bayesian Inference
-       Stephan Mandt, Matthew D. Hoffman, David M. Blei.
-       ArXiv:1704.04289, 2017. https://arxiv.org/abs/1704.04289
-
-  Args:
-    batch_size: Scalar `int`-like `Tensor`. The number of examples in a
-      minibatch in the data set. Note: Assumes the loss is taken as the mean
-      over a minibatch. Otherwise if the sum was taken set this to 1.
-    total_num_examples: Scalar `int`-like `Tensor`. The total number of examples
-      in the data set.
-    max_learning_rate: Scalar `float`-like `Tensor`. A maximum allowable
-      effective coordinate-wise learning rate. The algorithm scales down any
-      effective learning rate (i.e. after preconditioning) that is larger than
-      this. (Default: `1`)
-    preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
-      decay rate of the rescaling of the preconditioner (RMSprop). (This is
-      "alpha" in [1]). Should be smaller than but nearly `1` to approximate
-      sampling from the posterior. (Default: `0.95`)
-    burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
-      gradient statistics to update the preconditioner before starting to draw
-      noisy samples. (Default: `25`)
-    burnin_max_learning_rate: Scalar `float`-like `Tensor`. Maximum learning
-      rate to use during the burnin period.
-      (Default: `1e-8`)
-    use_single_learning_rate: Boolean Indicates whether one single learning
-      rate is used or coordinate_wise learning rates are used.
-      (Default: `False`)
-    name: Python `str` describing ops managed by this function.
-      (Default: `"VariationalSGDOptimizer"`)
-    variable_scope: Variable scope used for calls to `tf.get_variable`.
-      If `None`, a new variable scope is created using name
-      `ops.get_default_graph().unique_name(name or default_name)`.
-
-  Raises:
-    InvalidArgumentError: If preconditioner_decay_rate is a `Tensor` not in
-      `(0,1]`.
-  """
-
-  def __init__(self,
-               batch_size,
-               total_num_examples,
-               max_learning_rate=1.0,
-               preconditioner_decay_rate=0.95,
-               burnin=25,
-               burnin_max_learning_rate=1e-6,
-               use_single_learning_rate=False,
-               name=None,
-               variable_scope=None):
-    default_name = 'VariationalSGDOptimizer'
-    with ops.name_scope(name, default_name, [
-        max_learning_rate, preconditioner_decay_rate, batch_size, burnin,
-        burnin_max_learning_rate
-    ]):
-      if variable_scope is None:
-        var_scope_name = ops.get_default_graph().unique_name(
-            name or default_name)
-        with varscope_ops.variable_scope(var_scope_name) as scope:
-          self._variable_scope = scope
-      else:
-        self._variable_scope = variable_scope
-
-      self._preconditioner_decay_rate = ops.convert_to_tensor(
-          preconditioner_decay_rate, name='preconditioner_decay_rate')
-      self._batch_size = ops.convert_to_tensor(batch_size, name='batch_size')
-      self._total_num_examples = ops.convert_to_tensor(
-          total_num_examples, name='total_num_examples')
-      self._burnin = ops.convert_to_tensor(burnin, name='burnin')
-      self._burnin_max_learning_rate = ops.convert_to_tensor(
-          burnin_max_learning_rate, name='burnin_max_learning_rate')
-      self._max_learning_rate = ops.convert_to_tensor(
-          max_learning_rate, name='max_learning_rate')
-      self._use_single_learning_rate = use_single_learning_rate
-
-      with varscope_ops.variable_scope(self._variable_scope):
-        self._counter = varscope_ops.get_variable(
-            'counter', initializer=0, trainable=False)
-
-      self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._preconditioner_decay_rate,
-              message='`preconditioner_decay_rate` must be non-negative'),
-          check_ops.assert_less_equal(
-              self._preconditioner_decay_rate,
-              1.,
-              message='`preconditioner_decay_rate` must be at most 1.'),
-      ], self._preconditioner_decay_rate)
-
-      self._batch_size = control_flow_ops.with_dependencies([
-          check_ops.assert_greater(
-              self._batch_size,
-              0,
-              message='`batch_size` must be greater than zero')
-      ], self._batch_size)
-
-      self._total_num_examples = control_flow_ops.with_dependencies([
-          check_ops.assert_greater(
-              self._total_num_examples,
-              0,
-              message='`total_num_examples` must be greater than zero')
-      ], self._total_num_examples)
-
-      self._burnin = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._burnin, message='`burnin` must be non-negative'),
-          check_ops.assert_integer(
-              self._burnin, message='`burnin` must be an integer')
-      ], self._burnin)
-
-      self._burnin_max_learning_rate = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._burnin_max_learning_rate,
-              message='`burnin_max_learning_rate` must be non-negative')
-      ], self._burnin_max_learning_rate)
-
-      self._max_learning_rate = control_flow_ops.with_dependencies([
-          check_ops.assert_non_negative(
-              self._max_learning_rate,
-              message='`max_learning_rate` must be non-negative')
-      ], self._max_learning_rate)
-
-      super(VariationalSGDOptimizer, self).__init__(
-          use_locking=False, name=name or default_name)
-
-  def _create_slots(self, var_list):
-    for v in var_list:
-      init_moment = init_ops.zeros_initializer(dtype=v.dtype)
-      self._get_or_make_slot_with_initializer(
-          v, init_moment, v.get_shape(), v.dtype, 'first_moment', self._name)
-      self._get_or_make_slot_with_initializer(
-          v, init_moment, v.get_shape(), v.dtype, 'second_moment', self._name)
-
-  def _prepare(self):
-    self._decay_tensor = ops.convert_to_tensor(
-        self._preconditioner_decay_rate, name='preconditioner_decay_rate')
-    self._batch_size_tensor = ops.convert_to_tensor(
-        self._batch_size, name='batch_size_tensor')
-
-    super(VariationalSGDOptimizer, self)._prepare()
-
-  def _get_coordinatewise_learning_rate(self, grad, var):
-    # Compute the learning rate using a moving average for the diagonal of BB^T
-    avg_first = self.get_slot(var, 'first_moment')
-    avg_second = self.get_slot(var, 'second_moment')
-    decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
-    batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)
-
-    # Create an estimator for the moving average of gradient mean and variance
-    # via Welford's algorithm
-    if isinstance(grad, ops.Tensor):
-      delta = grad - avg_first
-      first_moment_update = avg_first.assign_add(
-          array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
-                          1. - decay_tensor) * delta)
-
-      with ops.control_dependencies([first_moment_update]):
-        second_moment_update = avg_second.assign_add(
-            math_ops.cast(self._counter < 1, var.dtype) *
-            -(1. - decay_tensor) * (
-                avg_second - decay_tensor  * math_ops.square(delta)))
-      diag_preconditioner = control_flow_ops.with_dependencies(
-          [second_moment_update],
-          clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
-    elif isinstance(grad, ops.IndexedSlices):
-      delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
-      first_moment_update = state_ops.scatter_add(
-          avg_first,
-          grad.indices,
-          array_ops.where(self._counter < 1,
-                          math_ops.cast(1., var.dtype),
-                          1. - decay_tensor) * delta)
-
-      with ops.control_dependencies([first_moment_update]):
-        avg_second = state_ops.scatter_add(
-            avg_second,
-            grad.indices,
-            math_ops.cast(self._counter < 1, var.dtype) *
-            -(1. - decay_tensor) * (
-                array_ops.gather_nd(avg_second, grad.indices) - decay_tensor *
-                math_ops.square(delta)))
-        avg_second = array_ops.gather_nd(avg_second, grad.indices)
-        # TODO(b/70783772)
-        diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
-    else:
-      raise errors.InvalidArgumentError(
-          None, None, 'grad must of type Tensor or IndexedSlice')
-
-    diag_preconditioner *= batch_size
-
-    if self._use_single_learning_rate:
-      diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)
-
-    # From Theorem 2 Corollary 1 of Mandt et al. 2017
-    return 2. * batch_size / (
-        math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
-        diag_preconditioner)
-
-  def _apply_dense(self, grad, var):
-
-    max_learning_rate = array_ops.where(self._counter < self._burnin,
-                                        self._burnin_max_learning_rate,
-                                        self._max_learning_rate)
-
-    learn_rates = clip_ops.clip_by_value(
-        self._get_coordinatewise_learning_rate(grad, var), 0.0,
-        math_ops.cast(max_learning_rate, var.dtype.base_dtype))
-
-    newgrad = grad * learn_rates
-    return training_ops.apply_gradient_descent(
-        var,
-        math_ops.cast(1.0, var.dtype),
-        newgrad,
-        use_locking=self._use_locking).op
-
-  def _apply_sparse(self, grad, var):
-
-    max_learning_rate = array_ops.where(self._counter < self._burnin,
-                                        self._burnin_max_learning_rate,
-                                        self._max_learning_rate)
-
-    learn_rate = clip_ops.clip_by_value(
-        self._get_coordinatewise_learning_rate(grad, var), 0.0,
-        math_ops.cast(max_learning_rate, var.dtype))
-    delta = grad.values * learn_rate
-
-    return state_ops.scatter_sub(var, grad.indices, delta,
-                                 use_locking=self._use_locking)
-
-  def _finish(self, update_ops, name_scope):
-    update_ops.append([self._counter.assign_add(1)])
-    return control_flow_ops.group(*update_ops, name=name_scope)
-
-  @property
-  def variable_scope(self):
-    """Variable scope of all calls to `tf.get_variable`."""
-    return self._variable_scope
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 6fdcd0f996ee011842a5add79f06264a28a2145c..8eac1243ef63dd09c5c5dad4bcd9bd7a15f58900 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -14,15 +14,6 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 package_group(name = "friends")
 
 cc_library(
@@ -128,7 +119,7 @@ py_library(
 
 py_test(
     name = "gbdt_batch_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/training/functions/gbdt_batch_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 289f5bb3140974d8c37f4938ceef27275b099f9a..8cff1a3bb1d11aff6a264636291a7149b40de516 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -10,23 +10,17 @@ package(
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "init_py",
-    srcs = [
-        "__init__.py",
-    ],
+    srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "custom_export_strategy",
+        ":custom_loss_head",
+        ":estimator",
+        ":model",
+        ":trainer_hooks",
+    ],
 )
 
 py_library(
@@ -34,12 +28,13 @@ py_library(
     srcs = ["model.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
     ],
 )
 
@@ -57,6 +52,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "estimator_utils",
+    srcs = ["estimator_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/learn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
 py_test(
     name = "trainer_hooks_test",
     size = "small",
@@ -124,6 +131,7 @@ py_library(
     srcs = ["estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":model",
         "//tensorflow/contrib/boosted_trees:losses",
         "//tensorflow/contrib/learn",
@@ -136,6 +144,7 @@ py_library(
     srcs = ["dnn_tree_combined_estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
@@ -149,7 +158,7 @@ py_library(
 
 py_test(
     name = "dnn_tree_combined_estimator_test",
-    size = "small",
+    size = "medium",
     srcs = ["dnn_tree_combined_estimator_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -165,3 +174,22 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
+
+py_test(
+    name = "estimator_test",
+    size = "medium",
+    srcs = ["estimator_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":estimator",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index 31f5c444817b9b82723c86bea3504d4934e57eb8..62f1f4122b05b56a708823df4246d618bd3fa5d4 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -39,7 +39,8 @@ _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE = "%s_%d"
 def make_custom_export_strategy(name,
                                 convert_fn,
                                 feature_columns,
-                                export_input_fn):
+                                export_input_fn,
+                                use_core_columns=False):
   """Makes custom exporter of GTFlow tree format.
 
   Args:
@@ -54,11 +55,11 @@ def make_custom_export_strategy(name,
     An `ExportStrategy`.
   """
   base_strategy = saved_model_export_utils.make_export_strategy(
-      serving_input_fn=export_input_fn)
+      serving_input_fn=export_input_fn, strip_default_attrs=True)
   input_fn = export_input_fn()
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
-       input_fn.features, feature_columns)
+       input_fn.features, feature_columns, use_core_columns)
 
   def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None):
     """A wrapper to export to SavedModel, and convert it to other formats."""
@@ -93,7 +94,9 @@ def make_custom_export_strategy(name,
                          "w") as f:
           f.write("\n".join("%s, %f" % (k, v) for k, v in sorted_by_importance))
     return result_dir
-  return export_strategy.ExportStrategy(name, export_fn)
+
+  return export_strategy.ExportStrategy(
+      name, export_fn, strip_default_attrs=True)
 
 
 def convert_to_universal_format(dtec, sorted_feature_names,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index cec3892b57655dc967b4e7926f7f5a6a30084487..9994c84ebdb930eea0818188225488eb5eca84eb 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -19,14 +19,13 @@ logits of the DNN. The input layer of the DNN (including the embeddings learned
 over sparse features) can optionally be provided to the boosted trees as
 an additional input feature.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import six
-
 from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
@@ -34,6 +33,7 @@ from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.python.feature_column import feature_column as feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
@@ -43,10 +43,8 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
 
-
 _DNN_LEARNING_RATE = 0.001
 
-
 def _get_optimizer(optimizer):
   if callable(optimizer):
     return optimizer()
@@ -59,16 +57,25 @@ def _add_hidden_layer_summary(value, tag):
   summary.histogram("%s_activation" % tag, value)
 
 
-def _dnn_tree_combined_model_fn(
-    features, labels, mode, head, dnn_hidden_units,
-    dnn_feature_columns, tree_learner_config, num_trees,
-    tree_examples_per_layer,
-    config=None, dnn_optimizer="Adagrad",
-    dnn_activation_fn=nn.relu, dnn_dropout=None,
-    dnn_input_layer_partitioner=None,
-    dnn_input_layer_to_tree=True, dnn_steps_to_train=10000,
-    tree_feature_columns=None,
-    tree_center_bias=True):
+def _dnn_tree_combined_model_fn(features,
+                                labels,
+                                mode,
+                                head,
+                                dnn_hidden_units,
+                                dnn_feature_columns,
+                                tree_learner_config,
+                                num_trees,
+                                tree_examples_per_layer,
+                                config=None,
+                                dnn_optimizer="Adagrad",
+                                dnn_activation_fn=nn.relu,
+                                dnn_dropout=None,
+                                dnn_input_layer_partitioner=None,
+                                dnn_input_layer_to_tree=True,
+                                dnn_steps_to_train=10000,
+                                tree_feature_columns=None,
+                                tree_center_bias=False,
+                                use_core_versions=False):
   """DNN and GBDT combined model_fn.
 
   Args:
@@ -106,6 +113,8 @@ def _dnn_tree_combined_model_fn(
       set to True, these features are in addition to dnn_feature_columns.
     tree_center_bias: Whether a separate tree should be created for
       first fitting the bias.
+    use_core_versions: Whether feature columns and loss are from the core (as
+      opposed to contrib) version of tensorflow.
 
   Returns:
     A `ModelFnOps` object.
@@ -135,11 +144,17 @@ def _dnn_tree_combined_model_fn(
         "input_from_feature_columns",
         values=tuple(six.itervalues(features)),
         partitioner=dnn_partitioner) as input_layer_scope:
-      input_layer = layers.input_from_feature_columns(
-          columns_to_tensors=features,
-          feature_columns=dnn_feature_columns,
-          weight_collections=[dnn_parent_scope],
-          scope=input_layer_scope)
+      if use_core_versions:
+        input_layer = feature_column_lib.input_layer(
+            features=features,
+            feature_columns=dnn_feature_columns,
+            weight_collections=[dnn_parent_scope])
+      else:
+        input_layer = layers.input_from_feature_columns(
+            columns_to_tensors=features,
+            feature_columns=dnn_feature_columns,
+            weight_collections=[dnn_parent_scope],
+            scope=input_layer_scope)
     previous_layer = input_layer
     for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
       with variable_scope.variable_scope(
@@ -222,24 +237,51 @@ def _dnn_tree_combined_model_fn(
     del loss
     return control_flow_ops.no_op()
 
-  model_fn_ops = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_no_train_op_fn,
-      logits=tree_train_logits)
-  dnn_train_op = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_dnn_train_op_fn,
-      logits=dnn_logits).train_op
-  tree_train_op = head.create_model_fn_ops(
-      features=tree_features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_tree_train_op_fn,
-      logits=tree_train_logits).train_op
+  if use_core_versions:
+    model_fn_ops = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_no_train_op_fn,
+        logits=tree_train_logits)
+    dnn_train_op = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_dnn_train_op_fn,
+        logits=dnn_logits)
+    dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+        dnn_train_op).train_op
+
+    tree_train_op = head.create_estimator_spec(
+        features=tree_features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_tree_train_op_fn,
+        logits=tree_train_logits)
+    tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+        tree_train_op).train_op
+
+    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
+  else:
+    model_fn_ops = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_no_train_op_fn,
+        logits=tree_train_logits)
+    dnn_train_op = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_dnn_train_op_fn,
+        logits=dnn_logits).train_op
+    tree_train_op = head.create_model_fn_ops(
+        features=tree_features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_tree_train_op_fn,
+        logits=tree_train_logits).train_op
 
   if tree_center_bias:
     num_trees += 1
@@ -277,7 +319,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
                tree_feature_columns=None,
-               tree_center_bias=True):
+               tree_center_bias=False,
+               use_core_versions=False):
     """Initializes a DNNBoostedTreeCombinedClassifier instance.
 
     Args:
@@ -322,6 +365,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
         set to True, these features are in addition to dnn_feature_columns.
       tree_center_bias: Whether a separate tree should be created for
         first fitting the bias.
+      use_core_versions: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.multi_class_head(
         n_classes=n_classes,
@@ -336,8 +381,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
           tree_learner_config, num_trees, tree_examples_per_layer, config,
           dnn_optimizer, dnn_activation_fn, dnn_dropout,
           dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train,
-          tree_feature_columns, tree_center_bias)
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedClassifier, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
@@ -366,7 +411,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
                tree_feature_columns=None,
-               tree_center_bias=True):
+               tree_center_bias=False,
+               use_core_versions=False):
     """Initializes a DNNBoostedTreeCombinedRegressor instance.
 
     Args:
@@ -411,6 +457,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
         set to True, these features are in addition to dnn_feature_columns.
       tree_center_bias: Whether a separate tree should be created for
         first fitting the bias.
+      use_core_versions: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.regression_head(
         label_name=label_name,
@@ -430,7 +478,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
           tree_learner_config, num_trees, tree_examples_per_layer, config,
           dnn_optimizer, dnn_activation_fn, dnn_dropout,
           dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train, tree_feature_columns, tree_center_bias)
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedRegressor, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
@@ -460,7 +509,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
                tree_feature_columns=None,
-               tree_center_bias=True):
+               tree_center_bias=False,
+               use_core_versions=False):
     """Initializes a DNNBoostedTreeCombinedEstimator instance.
 
     Args:
@@ -500,6 +550,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
         set to True, these features are in addition to dnn_feature_columns.
       tree_center_bias: Whether a separate tree should be created for
         first fitting the bias.
+      use_core_versions: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
@@ -507,8 +559,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
           tree_learner_config, num_trees, tree_examples_per_layer, config,
           dnn_optimizer, dnn_activation_fn, dnn_dropout,
           dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train,
-          tree_feature_columns, tree_center_bias)
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedEstimator, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 83d58c561008e8a5a69eb503d1605bb9e940f281..f495edc62f0909880c170ccb4cf5d11e3f20f55c 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -19,15 +19,17 @@ from __future__ import division
 from __future__ import print_function
 
 import tempfile
-
 from tensorflow.contrib.boosted_trees.estimator_batch import dnn_tree_combined_estimator as estimator
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
 from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 
 
@@ -100,6 +102,35 @@ class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     classifier.fit(input_fn=_train_input_fn, steps=15)
     classifier.evaluate(input_fn=_eval_input_fn, steps=1)
 
+  def testFitAndEvaluateDontThrowExceptionWithCore(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    # Use core head
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    classifier = estimator.DNNBoostedTreeCombinedEstimator(
+        head=head_fn,
+        dnn_hidden_units=[1],
+        # Use core feature columns
+        dnn_feature_columns=[core_feature_column.numeric_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=True,
+        tree_feature_columns=[],
+        use_core_versions=True)
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 01752416b347dd0a5e646283b6b5572592df4690..89d0d611d2905492cec09e033b8cbc238ec7fac6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -40,7 +40,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
                label_keys=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeClassifier estimator instance.
 
     Args:
@@ -63,7 +64,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
-
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     Raises:
       ValueError: If learner_config is not valid.
     """
@@ -81,7 +83,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
         n_classes=n_classes,
         weight_column_name=weight_column_name,
         enable_centered_bias=False,
-        loss_fn=loss_fn)
+        loss_fn=loss_fn,
+        label_keys=label_keys)
     if learner_config.num_classes == 0:
       learner_config.num_classes = n_classes
     elif learner_config.num_classes != n_classes:
@@ -98,6 +101,7 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'center_bias': center_bias,
             'logits_modifier_function': logits_modifier_function,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
@@ -119,7 +123,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
                config=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeRegressor estimator instance.
 
     Args:
@@ -144,6 +149,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.regression_head(
         label_name=label_name,
@@ -165,6 +172,7 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
@@ -188,7 +196,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
                config=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeEstimator estimator instance.
 
     Args:
@@ -209,6 +218,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     super(GradientBoostedDecisionTreeEstimator, self).__init__(
         model_fn=model.model_builder,
@@ -221,6 +232,7 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d58317bd59331cfcde0e12aeb3a3a03fc45d89b
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for GBDT estimator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+
+
+def _train_input_fn():
+  features = {"x": constant_op.constant([[2.], [1.], [1.]])}
+  label = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+  return features, label
+
+
+def _eval_input_fn():
+  features = {"x": constant_op.constant([[1.], [2.], [2.]])}
+  label = constant_op.constant([[0], [1], [1]], dtype=dtypes.int32)
+  return features, label
+
+
+class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._export_dir_base = tempfile.mkdtemp() + "export/"
+    gfile.MkDir(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowException(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[contrib_feature_column.real_valued_column("x")])
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+    classifier.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForEstimator(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    # Use core head
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    model = estimator.GradientBoostedDecisionTreeEstimator(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    model.fit(input_fn=_train_input_fn, steps=15)
+    model.evaluate(input_fn=_eval_input_fn, steps=1)
+    model.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForClassifier(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+    classifier.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForRegressor(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    regressor = estimator.GradientBoostedDecisionTreeRegressor(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    regressor.fit(input_fn=_train_input_fn, steps=15)
+    regressor.evaluate(input_fn=_eval_input_fn, steps=1)
+    regressor.export(self._export_dir_base)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a7f85eada8c72de83b814af2f00e97a62a073e
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for converting between core and contrib feature columns."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
+
+_CORE_MODE_TO_CONTRIB_MODE_ = {
+    model_fn_lib.ModeKeys.TRAIN: contrib_model_fn_lib.ModeKeys.TRAIN,
+    model_fn_lib.ModeKeys.EVAL: contrib_model_fn_lib.ModeKeys.EVAL,
+    model_fn_lib.ModeKeys.PREDICT: contrib_model_fn_lib.ModeKeys.INFER
+}
+
+
+def _core_mode_to_contrib_mode(mode):
+  return _CORE_MODE_TO_CONTRIB_MODE_[mode]
+
+
+def _export_outputs_to_output_alternatives(export_outputs):
+  """Converts EstimatorSpec.export_outputs to output_alternatives.
+
+  Args:
+    export_outputs: export_outputs created by create_estimator_spec.
+  Returns:
+    converted output_alternatives.
+  """
+  output = dict()
+  if export_outputs is not None:
+    for key, value in export_outputs.items():
+      if isinstance(value, export_output.ClassificationOutput):
+        exported_predictions = {
+            prediction_key.PredictionKey.SCORES: value.scores,
+            prediction_key.PredictionKey.CLASSES: value.classes
+        }
+        output[key] = (constants.ProblemType.CLASSIFICATION,
+                       exported_predictions)
+    return output
+  return None
+
+
+def estimator_spec_to_model_fn_ops(estimator_spec, export_alternatives=False):
+  if export_alternatives:
+    alternatives = _export_outputs_to_output_alternatives(
+        estimator_spec.export_outputs)
+  else:
+    alternatives = []
+
+  return model_fn.ModelFnOps(
+      mode=_core_mode_to_contrib_mode(estimator_spec.mode),
+      predictions=estimator_spec.predictions,
+      loss=estimator_spec.loss,
+      train_op=estimator_spec.train_op,
+      eval_metric_ops=estimator_spec.eval_metric_ops,
+      output_alternatives=alternatives)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index c6455a7ea3d18eb358edee034cee58b2bed21024..15ab6d814522ab1dee58dcd71246354fc4d8a483 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import copy
 
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
@@ -60,6 +61,7 @@ def model_builder(features, labels, mode, params, config):
   feature_columns = params["feature_columns"]
   weight_column_name = params["weight_column_name"]
   num_trees = params["num_trees"]
+  use_core_libs = params["use_core_libs"]
   logits_modifier_function = params["logits_modifier_function"]
   if features is None:
     raise ValueError("At least one feature must be specified.")
@@ -93,7 +95,8 @@ def model_builder(features, labels, mode, params, config):
       learner_config=learner_config,
       feature_columns=feature_columns,
       logits_dimension=head.logits_dimension,
-      features=training_features)
+      features=training_features,
+      use_core_columns=use_core_libs)
   with ops.name_scope("gbdt", "gbdt_optimizer"):
     predictions_dict = gbdt_model.predict(mode)
     logits = predictions_dict["predictions"]
@@ -108,12 +111,22 @@ def model_builder(features, labels, mode, params, config):
         update_op = state_ops.assign_add(global_step, 1).op
         return update_op
 
-  model_fn_ops = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_train_op_fn,
-      logits=logits)
+  create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
+  if use_core_libs and callable(create_estimator_spec_op):
+    model_fn_ops = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
+  else:
+    model_fn_ops = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
   if num_trees:
     if center_bias:
       num_trees += 1
diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
index 754b7bc3270d647fc381033b769eadd7b791771e..3bf33186ec13f5ff991db938d59849c0124a30a0 100644
--- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
@@ -137,6 +137,61 @@ class TreeEnsembleDeserializeOp : public OpKernel {
   }
 };
 
+class TreeEnsembleUsedHandlersOp : public OpKernel {
+ public:
+  explicit TreeEnsembleUsedHandlersOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("num_all_handlers", &num_handlers_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    boosted_trees::models::DecisionTreeEnsembleResource* ensemble_resource;
+
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &ensemble_resource));
+    tf_shared_lock l(*ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(ensemble_resource);
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Only the Chief should run this Op and it is guaranteed to be in
+    // a consistent state so the stamps must always match.
+    CHECK(ensemble_resource->is_stamp_valid(stamp_token));
+
+    Tensor* output_used_handlers_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output("used_handlers_mask", {num_handlers_},
+                                          &output_used_handlers_t));
+    auto output_used_handlers = output_used_handlers_t->vec<bool>();
+
+    Tensor* output_num_used_handlers_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("num_used_handlers", {},
+                                            &output_num_used_handlers_t));
+    int handler_idx = 0;
+    std::vector<int64> used_handlers = ensemble_resource->GetUsedHandlers();
+    output_num_used_handlers_t->scalar<int64>()() = used_handlers.size();
+    for (int64 i = 0; i < num_handlers_; ++i) {
+      if (handler_idx >= used_handlers.size() ||
+          used_handlers[handler_idx] > i) {
+        output_used_handlers(i) = false;
+      } else {
+        OP_REQUIRES(context, used_handlers[handler_idx] == i,
+                    errors::InvalidArgument("Handler IDs should be sorted."));
+        ++handler_idx;
+        output_used_handlers(i) = true;
+      }
+    }
+  }
+
+ private:
+  int64 num_handlers_;
+};
+
 REGISTER_RESOURCE_HANDLE_KERNEL(DecisionTreeEnsembleResource);
 
 REGISTER_KERNEL_BUILDER(
@@ -155,5 +210,7 @@ REGISTER_KERNEL_BUILDER(Name("TreeEnsembleSerialize").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("TreeEnsembleDeserialize").Device(DEVICE_CPU),
                         TreeEnsembleDeserializeOp);
 
+REGISTER_KERNEL_BUILDER(Name("TreeEnsembleUsedHandlers").Device(DEVICE_CPU),
+                        TreeEnsembleUsedHandlersOp);
 }  // namespace boosted_trees
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 0f4c2298f56be48bb32f52d5d44cff8afe284f1e..0b28f81e7ca9a1228adc5bde19c429265e0aa9b8 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -253,7 +253,7 @@ class CreateQuantileAccumulatorOp : public OpKernel {
  private:
   float epsilon_;
   int32 num_quantiles_;
-  // An upperbound on the number of enteries that the summaries might have
+  // An upper bound on the number of entries that the summaries might have
   // for a feature.
   int64 max_elements_;
   bool generate_quantiles_;
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
index 7f8dea1d3c2a04b725843f6e2932a0cdfbc7733c..1bfeed306641111718984b2097512e5ec3fa8630 100644
--- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -361,27 +361,10 @@ class GrowTreeEnsembleOp : public OpKernel {
     // Increment attempt stats.
     ensemble_resource->IncrementAttempts();
 
-    // In case we want to do feature selection and we have reached the limit,
-    // build a list of handlers used so far to avoid adding new features.
-    std::vector<int64> allowed_handlers;
-    if (learner_config_.constraints().max_number_of_unique_feature_columns() >
-        0) {
-      allowed_handlers = ensemble_resource->GetUsedHandlers();
-      // TODO(soroush): We can disable handlers that are not going to be used to
-      // avoid unnecessary computations.
-      if (allowed_handlers.size() <
-          learner_config_.constraints()
-              .max_number_of_unique_feature_columns()) {
-        // We have not reached the limit yet. Empty the list of allow features
-        // which means we can keep adding new features.
-        allowed_handlers.clear();
-      }
-    }
-
     // Find best splits for each active partition.
     std::map<int32, SplitCandidate> best_splits;
-    FindBestSplitsPerPartition(context, allowed_handlers, partition_ids_list,
-                               gains_list, splits_list, &best_splits);
+    FindBestSplitsPerPartition(context, partition_ids_list, gains_list,
+                               splits_list, &best_splits);
 
     // No-op if no new splits can be considered.
     if (best_splits.empty()) {
@@ -422,19 +405,12 @@ class GrowTreeEnsembleOp : public OpKernel {
   // and finds the best split for each partition.
   void FindBestSplitsPerPartition(
       OpKernelContext* const context,
-      const std::vector<int64>& allowed_handlers,  // Empty means all handlers.
       const OpInputList& partition_ids_list, const OpInputList& gains_list,
       const OpInputList& splits_list,
       std::map<int32, SplitCandidate>* best_splits) {
     // Find best split per partition going through every feature candidate.
     // TODO(salehay): Is this worth parallelizing?
     for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) {
-      if (!allowed_handlers.empty()) {
-        if (!std::binary_search(allowed_handlers.begin(),
-                                allowed_handlers.end(), handler_id)) {
-          continue;
-        }
-      }
       const auto& partition_ids = partition_ids_list[handler_id].vec<int32>();
       const auto& gains = gains_list[handler_id].vec<float>();
       const auto& splits = splits_list[handler_id].vec<string>();
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 131bd48562a55a08981ac73277e93024db0d85d3..3028c2281705bd7e34b212332160d25386559d4e 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -15,17 +15,6 @@ load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # Utils
 
 cc_library(
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 7df514cd207c5e781f3b4abaa2020016b197669d..9d6cc9245aa463d0c8cfc7ad209736357b6c0323 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -417,9 +417,18 @@ class SparseSplitHandler(InequalitySplitHandler):
     return (are_splits_ready, partition_ids, gains, split_infos)
 
 
-@function.Defun(dtypes.bool, dtypes.bool, dtypes.float32, dtypes.float32,
-                dtypes.int32, dtypes.float32, dtypes.float32, dtypes.float32,
-                dtypes.float32, dtypes.float32)
+@function.Defun(
+    dtypes.bool,
+    dtypes.bool,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.int32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    noinline=True)
 def dense_make_stats_update(is_active, are_buckets_ready, float_column,
                             quantile_buckets, example_partition_ids, gradients,
                             hessians, weights, empty_gradients, empty_hessians):
@@ -452,9 +461,20 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
           gradients, hessians)
 
 
-@function.Defun(dtypes.bool, dtypes.bool, dtypes.int64, dtypes.float32,
-                dtypes.int64, dtypes.float32, dtypes.int32, dtypes.float32,
-                dtypes.float32, dtypes.float32, dtypes.float32, dtypes.float32)
+@function.Defun(
+    dtypes.bool,
+    dtypes.bool,
+    dtypes.int64,
+    dtypes.float32,
+    dtypes.int64,
+    dtypes.float32,
+    dtypes.int32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    noinline=True)
 def sparse_make_stats_update(
     is_active, are_buckets_ready, sparse_column_indices, sparse_column_values,
     sparse_column_shape, quantile_buckets, example_partition_ids, gradients,
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
index cd925f6b65e569538212e9c26aef0abc8482960b..794ba2bcb0aafa26c5e1c90fcd66caf9dd5bf7d5 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
@@ -137,7 +137,7 @@ struct NodeStats {
         Eigen::MatrixXf hessian =
             TensorToEigenMatrix(grad_stats.second.t, grad_dim, grad_dim);
         // I is an identity matrix.
-        // The gain in general form is -g^T (H+l2 I)^-1 g.
+        // The gain in general form is g^T (H+l2 I)^-1 g.
         // The node weights are -(H+l2 I)^-1 g.
         Eigen::MatrixXf identity;
         identity.setIdentity(grad_dim, grad_dim);
@@ -240,7 +240,7 @@ struct NodeStats {
   // given regularized Hessian and gradient vector g.
   void CalculateWeightAndGain(const Eigen::MatrixXf& hessian_and_reg,
                               const Eigen::VectorXf& g) {
-    // The gain in general form is -g^T (Hessian_and_regularization)^-1 g.
+    // The gain in general form is g^T (Hessian_and_regularization)^-1 g.
     // The node weights are -(Hessian_and_regularization)^-1 g.
     Eigen::VectorXf weight;
     // If we want to calculate x = K^-1 v, instead of explicitly calculating
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
index 4481c0d0e4400acd93c9a277de185db7aaf9bcb0..67ac9bf387ae9b3ca29e610c2c4138c28302ca33 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
@@ -138,6 +138,12 @@ void GenerateOneValue(int32 worker_id, int64 max_elements, double *total_weight,
   stream->Finalize();
 }
 
+void GenerateOneZeroWeightedValue(int32 worker_id, int64 max_elements,
+                                  double *total_weight, Stream *stream) {
+  stream->PushEntry(10, 0);
+  stream->Finalize();
+}
+
 TEST(WeightedQuantilesStreamTest, OneValue) {
   const double eps = 0.01;
   const int64 max_elements = 1 << 16;
@@ -145,6 +151,13 @@ TEST(WeightedQuantilesStreamTest, OneValue) {
                           {10.0, 10.0, 10.0, 10.0, 10.0}, 1e-2);
 }
 
+TEST(WeightedQuantilesStreamTest, OneZeroWeightValue) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateOneZeroWeightedValue, {},
+                          1e-2);
+}
+
 TEST(WeightedQuantilesStreamTest, FixedUniform) {
   const double eps = 0.01;
   const int64 max_elements = 1 << 16;
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index aec232f3cbb096f0aa51e4362a821882391f8027..7576856dc3a6d0b6681ee9745c875cf46d1e2960 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -235,6 +235,11 @@ class WeightedQuantilesSummary {
   // The resulting boundaries are guaranteed to both contain at least
   // num_boundaries unique elements and maintain approximation bounds.
   std::vector<ValueType> GenerateBoundaries(int64 num_boundaries) const {
+    std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
+
     // Generate soft compressed summary.
     WeightedQuantilesSummary<ValueType, WeightType, CompareFn>
         compressed_summary;
@@ -246,7 +251,6 @@ class WeightedQuantilesSummary {
     compressed_summary.Compress(num_boundaries, compression_eps);
 
     // Return boundaries.
-    std::vector<ValueType> output;
     output.reserve(compressed_summary.entries_.size());
     for (const auto& entry : compressed_summary.entries_) {
       output.push_back(entry.value);
@@ -260,6 +264,9 @@ class WeightedQuantilesSummary {
   // full rank queries O(nlogn).
   std::vector<ValueType> GenerateQuantiles(int64 num_quantiles) const {
     std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
     num_quantiles = std::max(num_quantiles, 2LL);
     output.reserve(num_quantiles + 1);
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc
index cf4f9a097a3368465fd4d9afb981bbaa68b4df49..35b059f3496dbc8fb2b3d4fe6ec6b55a9d73dd0c 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc
@@ -54,7 +54,7 @@ Status BatchFeatures::Initialize(
     TF_CHECK_AND_RETURN_IF_ERROR(
         dense_float_feature.dim_size(1) == 1,
         errors::InvalidArgument(
-            "Dense float features may not be multi-valent: dim_size(1) = ",
+            "Dense float features may not be multivalent: dim_size(1) = ",
             dense_float_feature.dim_size(1)));
     dense_float_feature_columns_.emplace_back(dense_float_feature);
   }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index da5e7448519cb7f4092f7bbbe1b526271008ec22..a3b1b013e3a40116f74d6ed2df78d87ed3a11ac7 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -48,9 +48,9 @@ class BatchFeatures {
   Status GetFeatureColumnSizes(int64* const num_dense_float_features,
                                int64* const num_sparse_float_features,
                                int64* const num_sparse_int_features) const {
-    QCHECK_NE(num_dense_float_features, nullptr);
-    QCHECK_NE(num_sparse_float_features, nullptr);
-    QCHECK_NE(num_sparse_int_features, nullptr);
+    QCHECK_NE(num_dense_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_int_features, static_cast<int64*>(nullptr));
     *num_dense_float_features = dense_float_feature_columns_.size();
     *num_sparse_float_features = sparse_float_feature_columns_.size();
     *num_sparse_int_features = sparse_int_feature_columns_.size();
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
index 609519e8b1153a27d987c5f9ca9bfcc9ee6717d6..cfe9101e7435cd798569f3e52a87fc8ed7b6a239 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
@@ -59,7 +59,7 @@ TEST_F(BatchFeaturesTest, DenseFloatFeatures_Multivalent) {
   BatchFeatures batch_features(1);
   auto dense_vec = AsTensor<float>({3.0f, 7.0f}, {1, 2});
   auto expected_error = InvalidArgument(
-      "Dense float features may not be multi-valent: dim_size(1) = 2");
+      "Dense float features may not be multivalent: dim_size(1) = 2");
   EXPECT_EQ(expected_error,
             batch_features.Initialize({dense_vec}, {}, {}, {}, {}, {}, {}));
 }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
index db34db998a7442c69f2ab468f4557d991429f4ee..ce67db797ded54f5023eaa89369d4781aad31a7c 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
@@ -54,7 +54,7 @@ Status DropoutUtils::DropOutTrees(
   if (probability_of_skipping_dropout < 0 ||
       probability_of_skipping_dropout > 1) {
     return errors::InvalidArgument(
-        "Probability of skiping dropout must be in [0,1] range");
+        "Probability of skipping dropout must be in [0,1] range");
   }
   const auto num_trees = weights.size();
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
index 928bfbfe5c9394ab4083aabced4c8e1149bb10aa..77c16da5410fe65b20839c7b6bc677067d7ff297 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
@@ -66,7 +66,7 @@ class DropoutUtils {
       // Current weights and num_updates will be updated as a result of this
       // func
       std::vector<float>* current_weights,
-      // How many weight assignements have been done for each tree already.
+      // How many weight assignments have been done for each tree already.
       std::vector<int32>* num_updates);
 };
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc
index 0138aae3dbd3773241cb6644db625b99f9bf1372..cc7604745e6bb90837eeca1123faa88dc914e4fc 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc
@@ -34,7 +34,7 @@ TEST_F(SparseColumnIterableTest, Empty) {
 }
 
 TEST_F(SparseColumnIterableTest, Iterate) {
-  // 8 examples having 7 sparse features with the 3rd and 7th multi-valent.
+  // 8 examples having 7 sparse features with the 3rd and 7th multivalent.
   // This can be visualized like the following:
   // Instance | Sparse |
   // 0        |  x     |
diff --git a/tensorflow/contrib/boosted_trees/ops/model_ops.cc b/tensorflow/contrib/boosted_trees/ops/model_ops.cc
index 0786c4166410720e8d4d70960e5747ff111076d8..9d6343c7e80f369bf6a5465821c5f4bacb984cd0 100644
--- a/tensorflow/contrib/boosted_trees/ops/model_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/model_ops.cc
@@ -110,5 +110,32 @@ stamp_token: Token to use as the new value of the resource stamp.
 tree_ensemble_config: Serialized proto of the ensemble.
 )doc");
 
+REGISTER_OP("TreeEnsembleUsedHandlers")
+    .Attr("num_all_handlers: int >= 0")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Output("num_used_handlers: int64")
+    .Output("used_handlers_mask: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      int num_all_handlers;
+      c->GetAttr("num_all_handlers", &num_all_handlers).IgnoreError();
+      c->set_output(1, {c->Vector(num_all_handlers)});
+
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the mask of used handlers along with the number of non-zero elements in 
+this mask. Used in feature selection.
+
+tree_ensemble_handle: Handle to the tree ensemble.
+stamp_token: Token to use as the new value of the resource stamp.
+num_used_handlers: number of feature column handlers used in the model.
+used_handlers_mask: A boolean vector of showing which handlers are used in the
+                    model.
+)doc");
+
 }  // namespace boosted_trees
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index ae99d53a2cf805d70d60746cd44f73f7fd9dc6e2..6aa52463987b55a54b7308765920cbe94c15b8d1 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -272,6 +272,20 @@ REGISTER_OP("Quantiles")
     .Input("sparse_indices: num_sparse_features * int64")
     .Output("dense_quantiles: num_dense_features * int32")
     .Output("sparse_quantiles: num_sparse_features * int32")
+    .SetShapeFn([](InferenceContext* c) {
+      int num_dense_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_dense_features", &num_dense_features));
+      int num_sparse_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_sparse_features", &num_sparse_features));
+      // Set output shapes (dense_quantiles and sparse_quantiles) by the
+      // relevant inputs (dense_values and sparse_values). Note that the output
+      // has an additional dimension for dimension_ids.
+      for (int i = 0; i < num_dense_features + num_sparse_features; ++i) {
+        c->set_output(i, c->MakeShape({c->Dim(c->input(i), 0), 2}));
+      }
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes quantile for each a given list of dense and sparse feature values using
 the given buckets.
diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD
index 9a61e163eb5ff51dc75de4e40e0f43b090d03c0c..b07f0a4314246eea63764bb6d5e166dd720644fb 100644
--- a/tensorflow/contrib/boosted_trees/proto/BUILD
+++ b/tensorflow/contrib/boosted_trees/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "learner_proto",
     srcs = [
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index 4407c4d981785a279b6296f4726a221cacb4c5b1..81411aa84ae848cfaa1392e82a1e38c3df19cdb6 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -53,7 +53,7 @@ message DenseFloatBinarySplit {
   // Float feature column and split threshold describing
   // the rule feature <= threshold.
   int32 feature_column = 1;
-  // If feature column is multivalent, this holds the index of the dimensiong
+  // If feature column is multivalent, this holds the index of the dimension
   // for the split. Defaults to 0.
   int32 dimension_id = 5;
   float threshold = 2;
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
index 27c288bbf78b3b593d0807e92ac7fd9afc4d2725..63b9c5fddf0d9967d53077608664b59d9ae00481 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
@@ -310,6 +310,22 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
         # The third tree was added after the save.
         self.assertAllClose(result.eval(), [[-1.1], [-1.1]])
 
+  def testUsedHandlers(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_config.growing_metadata.used_handler_ids.append(1)
+      tree_ensemble_config.growing_metadata.used_handler_ids.append(5)
+      stamp_token = 3
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=stamp_token,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="create_tree")
+      resources.initialize_resources(resources.shared_resources()).run()
+      result = model_ops.tree_ensemble_used_handlers(
+          tree_ensemble_handle, stamp_token, num_all_handlers=6)
+      self.assertAllEqual([0, 1, 0, 0, 0, 1], result.used_handlers_mask.eval())
+      self.assertEqual(2, result.num_used_handlers.eval())
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
index c1acf351603dd80c2d14c7ee0a5b4c89706bc1bf..cf55759aaabfb265466f4bbf8b2806d4347ca0b1 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -120,8 +120,8 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     """Sets up the prediction tests.
 
     Create a batch of two examples having one dense float, two sparse float
-    single valued, one sparse float multidimensionl and one sparse int features.
-    The data looks like the following:
+    single valued, one sparse float multidimensional and one sparse int
+    features.  The data looks like the following:
     | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 | SparseM
     | 0        |  7     |    -3    |          |    9,1   | __, 5.0
     | 1        | -2     |          | 4        |          |  3, ___
@@ -810,7 +810,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     # building. This tree should never be dropped.
     num_trees = 10
     with self.test_session():
-      # Empty tree ensenble.
+      # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 10 trees with some weights.
       for i in range(0, num_trees):
@@ -951,7 +951,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
   def testDropOutZeroProb(self):
     with self.test_session():
-      # Empty tree ensenble.
+      # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 1000 trees with some weights.
       for i in range(0, 999):
@@ -994,7 +994,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
   def testAveragingAllTrees(self):
     with self.test_session():
-      # Empty tree ensenble.
+      # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       adjusted_tree_ensemble_config = (
           tree_config_pb2.DecisionTreeEnsembleConfig())
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 81f58de28cbe98bb996c6665114eeb0030ee52f9..074623699d9d82f999c9cbc483ddcd8a959f4bad 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -482,7 +482,7 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
     """Sets up the quantile op tests.
 
     Create a batch of 4 examples having 2 dense and 4 sparse features.
-    Forth sparse feature is multivalent (3 dimensional)
+    Fourth sparse feature is multivalent (3 dimensional)
     The data looks like this
     | Instance | Dense 0 | Dense 1 | Sparse 0 | Sparse 1 |Sparse 2| SparseM
     | 0        |   -0.1  |  -1     |   -2     |   0.1    |        |_ ,1,_
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
index 8ca1aabacaf53b66aaba184962922294427d6803..3e524efbeac74ff754d63cae92b3e194411cb2de 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
@@ -1588,7 +1588,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(
           2, tree_ensemble_config.tree_metadata[2].num_tree_weight_updates)
 
-  def testGrowExistingEnsembleTreeWithFeatureSelectionCanStillGrow(self):
+  def testGrowExistingEnsembleTreeWithFeatureSelectionUsedHandlers(self):
     """Test growing a tree with feature selection."""
     with self.test_session() as session:
       # Create existing ensemble with one root split and one bias tree.
@@ -1649,7 +1649,6 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           num_trees_attempted: 2
           num_layers_attempted: 2
           used_handler_ids: 2
-          used_handler_ids: 5
         }
       """, tree_ensemble_config)
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -1668,183 +1667,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
           growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
-      # There are 2 handler_ids in used_handler_ids already but one of them
-      # is handler 2, so we can still grow trees.
-      learner_config.constraints.max_number_of_unique_feature_columns = 2
-      learner_config = learner_config.SerializeToString()
-      # Prepare handler inputs.
-      handler1_partitions = np.array([0], dtype=np.int32)
-      handler1_gains = np.array([7.62], dtype=np.float32)
-      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
-      handler2_partitions = np.array([0], dtype=np.int32)
-      handler2_gains = np.array([0.63], dtype=np.float32)
-      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
-      handler3_partitions = np.array([0], dtype=np.int32)
-      handler3_gains = np.array([7.62], dtype=np.float32)
-      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
-
-      # Grow tree ensemble.
-      grow_op = training_ops.grow_tree_ensemble(
-          tree_ensemble_handle,
-          stamp_token=0,
-          next_stamp_token=1,
-          learning_rate=1,
-          partition_ids=[
-              handler1_partitions, handler2_partitions, handler3_partitions
-          ],
-          gains=[handler1_gains, handler2_gains, handler3_gains],
-          splits=[handler1_split, handler2_split, handler3_split],
-          learner_config=learner_config,
-          dropout_seed=123,
-          center_bias=True)
-      session.run(grow_op)
-
-      # Expect a new tree to be added with the split from handler 1.
-      _, serialized = session.run(
-          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
-      tree_ensemble_config.ParseFromString(serialized)
-      self.assertEqual(3, len(tree_ensemble_config.trees))
-      self.assertEqual(
-          2, len(tree_ensemble_config.growing_metadata.used_handler_ids))
-
-  def testGrowExistingEnsembleTreeWithFeatureSelectionEmptyEnsemble(self):
-    """Test growing a tree with feature selection with empty ensemble."""
-    with self.test_session() as session:
-      # Create existing ensemble with one root split and one bias tree.
-      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
-      tree_ensemble_handle = model_ops.tree_ensemble_variable(
-          stamp_token=0,
-          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
-          name="tree_ensemble")
-      resources.initialize_resources(resources.shared_resources()).run()
-
-      # Prepare learner config.
-      learner_config = _gen_learner_config(
-          num_classes=2,
-          l1_reg=0,
-          l2_reg=0,
-          tree_complexity=0,
-          max_depth=1,
-          min_node_weight=0,
-          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
-      learner_config.constraints.max_number_of_unique_feature_columns = 2
-      learner_config = learner_config.SerializeToString()
-      # Prepare handler inputs.
-      handler1_partitions = np.array([0], dtype=np.int32)
-      handler1_gains = np.array([7.62], dtype=np.float32)
-      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
-      handler2_partitions = np.array([0], dtype=np.int32)
-      handler2_gains = np.array([0.63], dtype=np.float32)
-      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
-      handler3_partitions = np.array([0], dtype=np.int32)
-      handler3_gains = np.array([7.62], dtype=np.float32)
-      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
-
-      # Grow tree ensemble.
-      grow_op = training_ops.grow_tree_ensemble(
-          tree_ensemble_handle,
-          stamp_token=0,
-          next_stamp_token=1,
-          learning_rate=1,
-          partition_ids=[
-              handler1_partitions, handler2_partitions, handler3_partitions
-          ],
-          gains=[handler1_gains, handler2_gains, handler3_gains],
-          splits=[handler1_split, handler2_split, handler3_split],
-          learner_config=learner_config,
-          dropout_seed=123,
-          center_bias=True)
-      session.run(grow_op)
-
-      _, serialized = session.run(
-          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
-      tree_ensemble_config.ParseFromString(serialized)
-      self.assertEqual(1, len(tree_ensemble_config.trees))
-      self.assertEqual(
-          1, len(tree_ensemble_config.growing_metadata.used_handler_ids))
-
-  def testGrowExistingEnsembleTreeWithFeatureSelectionCantGrow(self):
-    """Test growing a tree with feature selection with empty ensemble."""
-    with self.test_session() as session:
-      # Create existing ensemble with one root split and one bias tree.
-      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
-      text_format.Merge("""
-        trees {
-          nodes {
-            leaf {
-              vector {
-                value: -0.32
-                value: 0.28
-              }
-            }
-          }
-        }
-        trees {
-          nodes {
-            categorical_id_binary_split {
-              feature_column: 3
-              feature_id: 7
-              left_id: 1
-              right_id: 2
-            }
-            node_metadata {
-              gain: 1.3
-            }
-          }
-          nodes {
-            leaf {
-              sparse_vector {
-                index: 0
-                value: 2.3
-              }
-            }
-          }
-          nodes {
-            leaf {
-              sparse_vector {
-                index: 0
-                value: -0.9
-              }
-            }
-          }
-        }
-        tree_weights: 0.7
-        tree_weights: 1
-        tree_metadata {
-          num_tree_weight_updates: 1
-          num_layers_grown: 1
-          is_finalized: true
-        }
-        tree_metadata {
-          num_tree_weight_updates: 5
-          num_layers_grown: 1
-          is_finalized: true
-        }
-        growing_metadata {
-          num_trees_attempted: 2
-          num_layers_attempted: 2
-          used_handler_ids: 4
-          used_handler_ids: 5
-        }
-      """, tree_ensemble_config)
-      tree_ensemble_handle = model_ops.tree_ensemble_variable(
-          stamp_token=0,
-          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
-          name="tree_ensemble")
-      resources.initialize_resources(resources.shared_resources()).run()
 
-      # Prepare learner config.
-      learner_config = _gen_learner_config(
-          num_classes=2,
-          l1_reg=0,
-          l2_reg=0,
-          tree_complexity=0,
-          max_depth=1,
-          min_node_weight=0,
-          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
-      learner_config.constraints.max_number_of_unique_feature_columns = 2
+      learner_config.constraints.max_number_of_unique_feature_columns = 3
       learner_config = learner_config.SerializeToString()
       # Prepare handler inputs.
       handler1_partitions = np.array([0], dtype=np.int32)
@@ -1876,12 +1700,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       _, serialized = session.run(
           model_ops.tree_ensemble_serialize(tree_ensemble_handle))
       tree_ensemble_config.ParseFromString(serialized)
-      # We can't grow a tree since we have reached the limit of 2 unique
-      # features [4, 5] and the only available splits are from
-      # handlers [0, 1, 2].
-      self.assertEqual(2, len(tree_ensemble_config.trees))
-      self.assertEqual(
-          2, len(tree_ensemble_config.growing_metadata.used_handler_ids))
+      self.assertEqual(3, len(tree_ensemble_config.trees))
+      # 2 was already used. handler 0 is being added in this tree.
+      self.assertAllEqual(
+          [0, 2], tree_ensemble_config.growing_metadata.used_handler_ids)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index 7a5f509047d46549ba81039a23d29ec987ca7920..25b2c9e2fd72bd018717e8a87fce726f26bad968 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem
 from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensemble_serialize
 # pylint: disable=unused-import
 from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensemble_stamp_token
+from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensemble_used_handlers
 # pylint: enable=unused-import
 
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 97d57e8b23608d4c3a8719426a75056fc6417d1d..1b184d296b329cee481db67992e77d1e33e18035 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -184,7 +184,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     """Finalizes quantile summary stream and resets it for next iteration.
 
     Args:
-      stamp_token: Exepcted current token.
+      stamp_token: Expected current token.
       next_stamp_token: Next value for the token.
     Returns:
       A list of quantiles or approximate boundaries.
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index f0b66dcbbe1c5167b9993e66b30b1dc8a839c380..08c1dcdd028829e6ef290965347d184ed42f416d 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -23,7 +23,6 @@ import copy
 
 from tensorflow.contrib import learn
 from tensorflow.contrib import stateless
-
 from tensorflow.contrib.boosted_trees.lib.learner.batch import categorical_split_handler
 from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
@@ -57,6 +56,8 @@ PREDICTIONS = "predictions"
 PARTITION_IDS = "partition_ids"
 NUM_LAYERS_ATTEMPTED = "num_layers"
 NUM_TREES_ATTEMPTED = "num_trees"
+NUM_USED_HANDLERS = "num_used_handlers"
+USED_HANDLERS_MASK = "used_handlers_mask"
 _FEATURE_NAME_TEMPLATE = "%s_%d"
 
 
@@ -70,7 +71,8 @@ def _get_column_by_index(tensor, indices):
   return array_ops.reshape(array_ops.gather(p_flat, i_flat), [shape[0], -1])
 
 
-def _make_predictions_dict(stamp, logits, partition_ids, ensemble_stats):
+def _make_predictions_dict(stamp, logits, partition_ids, ensemble_stats,
+                           used_handlers):
   """Returns predictions for the given logits and n_classes.
 
   Args:
@@ -79,6 +81,8 @@ def _make_predictions_dict(stamp, logits, partition_ids, ensemble_stats):
         that contains predictions when no dropout was applied.
     partition_ids: A rank 1 `Tensor` with shape [batch_size].
     ensemble_stats: A TreeEnsembleStatsOp result tuple.
+    used_handlers: A TreeEnsembleUsedHandlerOp result tuple of an int and a
+        boolean mask..
 
   Returns:
     A dict of predictions.
@@ -89,6 +93,8 @@ def _make_predictions_dict(stamp, logits, partition_ids, ensemble_stats):
   result[PARTITION_IDS] = partition_ids
   result[NUM_LAYERS_ATTEMPTED] = ensemble_stats.attempted_layers
   result[NUM_TREES_ATTEMPTED] = ensemble_stats.attempted_trees
+  result[NUM_USED_HANDLERS] = used_handlers.num_used_handlers
+  result[USED_HANDLERS_MASK] = used_handlers.used_handlers_mask
   return result
 
 
@@ -134,7 +140,7 @@ class _OpRoundRobinStrategy(object):
     return task
 
 
-def extract_features(features, feature_columns):
+def extract_features(features, feature_columns, use_core_columns):
   """Extracts columns from a dictionary of features.
 
   Args:
@@ -167,7 +173,11 @@ def extract_features(features, feature_columns):
       transformed_features = collections.OrderedDict()
       for fc in feature_columns:
         # pylint: disable=protected-access
-        if isinstance(fc, feature_column_lib._EmbeddingColumn):
+        if use_core_columns:
+          # pylint: disable=protected-access
+          tensor = fc_core._transform_features(features, [fc])[fc]
+          transformed_features[fc.name] = tensor
+        elif isinstance(fc, feature_column_lib._EmbeddingColumn):
           # pylint: enable=protected-access
           transformed_features[fc.name] = fc_core.input_layer(
               features, [fc],
@@ -258,7 +268,8 @@ class GradientBoostedDecisionTreeModel(object):
                learner_config,
                features,
                logits_dimension,
-               feature_columns=None):
+               feature_columns=None,
+               use_core_columns=False):
     """Construct a new GradientBoostedDecisionTreeModel function.
 
     Args:
@@ -331,8 +342,9 @@ class GradientBoostedDecisionTreeModel(object):
     if not features:
       raise ValueError("Features dictionary must be specified.")
     (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
-     sparse_float_shapes, sparse_int_indices, sparse_int_values,
-     sparse_int_shapes) = extract_features(features, self._feature_columns)
+     sparse_float_shapes, sparse_int_indices,
+     sparse_int_values, sparse_int_shapes) = extract_features(
+         features, self._feature_columns, use_core_columns)
     logging.info("Active Feature Columns: " + str(fc_names))
     self._fc_names = fc_names
     self._dense_floats = dense_floats
@@ -361,6 +373,13 @@ class GradientBoostedDecisionTreeModel(object):
     """
     ensemble_stats = training_ops.tree_ensemble_stats(ensemble_handle,
                                                       ensemble_stamp)
+    num_handlers = (
+        len(self._dense_floats) + len(self._sparse_float_shapes) +
+        len(self._sparse_int_shapes))
+    # Used during feature selection.
+    used_handlers = model_ops.tree_ensemble_used_handlers(
+        ensemble_handle, ensemble_stamp, num_all_handlers=num_handlers)
+
     # We don't need dropout info - we can always restore it based on the
     # seed.
     apply_dropout, seed = _dropout_params(mode, ensemble_stats)
@@ -395,7 +414,7 @@ class GradientBoostedDecisionTreeModel(object):
           use_locking=True)
 
     return _make_predictions_dict(ensemble_stamp, predictions, partition_ids,
-                                  ensemble_stats)
+                                  ensemble_stats, used_handlers)
 
   def predict(self, mode):
     """Returns predictions given the features and mode.
@@ -710,12 +729,28 @@ class GradientBoostedDecisionTreeModel(object):
       active_handlers_current_layer = (
           active_handlers_current_layer <
           self._learner_config.feature_fraction_per_tree)
-      active_handlers = array_ops.stack(active_handlers_current_layer,
-                                        array_ops.ones(
-                                            [len(handlers)], dtype=dtypes.bool))
+      active_handlers = array_ops.stack([
+          active_handlers_current_layer,
+          array_ops.ones([len(handlers)], dtype=dtypes.bool)], axis=1)
     else:
       active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool)
 
+    if self._learner_config.constraints.max_number_of_unique_feature_columns:
+      target = (
+          self._learner_config.constraints.max_number_of_unique_feature_columns)
+
+      def _feature_selection_active_handlers():
+        # The active list for current and the next iteration.
+        used_handlers = array_ops.reshape(predictions_dict[USED_HANDLERS_MASK],
+                                          [-1, 1])
+        used_handlers = array_ops.concat([used_handlers, used_handlers], axis=1)
+        return math_ops.logical_and(used_handlers, active_handlers)
+
+      active_handlers = (
+          control_flow_ops.cond(predictions_dict[NUM_USED_HANDLERS] >= target,
+                                _feature_selection_active_handlers,
+                                lambda: active_handlers))
+
     # Prepare empty gradients and hessians when handlers are not ready.
     empty_hess_shape = [1] + hessian_shape.as_list()
     empty_grad_shape = [1] + gradient_shape.as_list()
@@ -935,10 +970,8 @@ class GradientBoostedDecisionTreeModel(object):
       # Stack all the inputs to one tensor per type.
       # This is a workaround for the slowness of graph building in tf.cond.
       # See (b/36554864).
-      split_sizes = array_ops.stack([
-          array_ops.shape(partition_id)[0]
-          for partition_id in partition_ids_list
-      ])
+      split_sizes = array_ops.reshape(
+          array_ops.shape_n(partition_ids_list), [-1])
       partition_ids = array_ops.concat(partition_ids_list, axis=0)
       gains = array_ops.concat(gains_list, axis=0)
       split_infos = array_ops.concat(split_info_list, axis=0)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index dba51d4f527792d2a8dedc693f74c07119fd231d..f9c22283b7f5136777bfa60a12c94974adfbd245 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -27,9 +27,11 @@ from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.boosted_trees.python.utils import losses
 
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -43,10 +45,42 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keep_dims=True)
+      math_ops.square(predictions - label), 1, keepdims=True)
   return loss
 
 
+def _append_to_leaf(leaf, c_id, w):
+  """Helper method for building tree leaves.
+
+  Appends weight contributions for the given class index to a leaf node.
+
+  Args:
+    leaf: leaf node to append to.
+    c_id: class Id for the weight update.
+    w: weight contribution value.
+  """
+  leaf.sparse_vector.index.append(c_id)
+  leaf.sparse_vector.value.append(w)
+
+
+def _set_float_split(split, feat_col, thresh, l_id, r_id):
+  """Helper method for building tree float splits.
+
+  Sets split feature column, threshold and children.
+
+  Args:
+    split: split node to update.
+    feat_col: feature column for the split.
+    thresh: threshold to split on forming rule x <= thresh.
+    l_id: left child Id.
+    r_id: right child Id.
+  """
+  split.feature_column = feat_col
+  split.threshold = thresh
+  split.left_id = l_id
+  split.right_id = r_id
+
+
 class GbdtTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -67,7 +101,8 @@ class GbdtTest(test_util.TensorFlowTestCase):
           array_ops.zeros([2], dtypes.int64))
       (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
        sparse_float_shapes, sparse_int_indices, sparse_int_values,
-       sparse_int_shapes) = (gbdt_batch.extract_features(features, None))
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(features, None, use_core_columns=False))
       self.assertEqual(len(fc_names), 3)
       self.assertAllEqual(fc_names,
                           ["dense_float", "sparse_float", "sparse_int"])
@@ -116,8 +151,9 @@ class GbdtTest(test_util.TensorFlowTestCase):
               "sparse_categorical", hash_bucket_size=1000000))
       (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
        sparse_float_shapes, sparse_int_indices, sparse_int_values,
-       sparse_int_shapes) = (gbdt_batch.extract_features(
-           features, feature_columns))
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(
+               features, feature_columns, use_core_columns=False))
       self.assertEqual(len(fc_names), 3)
       self.assertAllEqual(fc_names,
                           ["dense_float", "sparse_float", "sparse_categorical"])
@@ -142,6 +178,41 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sparse_int_shapes[0].eval(),
                           features["sparse_categorical"].dense_shape.eval())
 
+  def testExtractFeaturesFromCoreFeatureColumns(self):
+    """Tests feature extraction when using core columns."""
+    with self.test_session():
+      features = {}
+      # Sparse float column does not exist in core, so only dense numeric and
+      # categorical.
+      features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32)
+      features["sparse_categorical"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros([2], dtypes.string), array_ops.zeros([2],
+                                                               dtypes.int64))
+
+      feature_columns = set()
+      feature_columns.add(core_feature_column.numeric_column("dense_float"))
+      feature_columns.add(
+          core_feature_column.categorical_column_with_hash_bucket(
+              "sparse_categorical", hash_bucket_size=1000000))
+      (fc_names, dense_floats, _, _, _, sparse_int_indices, sparse_int_values,
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(
+               features, feature_columns, use_core_columns=True))
+      self.assertEqual(len(fc_names), 2)
+      self.assertAllEqual(fc_names, ["dense_float", "sparse_categorical"])
+      self.assertEqual(len(dense_floats), 1)
+      self.assertEqual(len(sparse_int_indices), 1)
+      self.assertEqual(len(sparse_int_values), 1)
+      self.assertEqual(len(sparse_int_shapes), 1)
+      self.assertAllEqual(dense_floats[0].eval(),
+                          features["dense_float"].eval())
+      self.assertAllEqual(sparse_int_indices[0].eval(),
+                          features["sparse_categorical"].indices.eval())
+      self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
+      self.assertAllEqual(sparse_int_shapes[0].eval(),
+                          features["sparse_categorical"].dense_shape.eval())
+
   def testTrainFnChiefNoBiasCentering(self):
     """Tests the train function running on chief without bias centering."""
     with self.test_session() as sess:
@@ -917,6 +988,350 @@ class GbdtTest(test_util.TensorFlowTestCase):
           output.trees[0].nodes[2].leaf.sparse_vector.value[0],
           atol=1e-4, rtol=1e-4)
 
+  def testTrainFnChiefFeatureSelectionReachedLimitNoGoodSplit(self):
+    """Tests the train function running on chief with feature selection."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.max_number_of_unique_feature_columns = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float_0"] = array_ops.ones([4, 1], dtypes.float32)
+      # Feature 1 is predictive but it won't be used because we have reached the
+      # limit of num_used_handlers >= max_number_of_unique_feature_columns
+      features["dense_float_1"] = array_ops.constant([0, 0, 1, 1],
+                                                     dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          logits_dimension=1,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions":
+              predictions,
+          "predictions_no_dropout":
+              predictions,
+          "partition_ids":
+              partition_ids,
+          "ensemble_stamp":
+              ensemble_stamp,
+          "num_trees":
+              12,
+          "num_used_handlers":
+              array_ops.constant(1, dtype=dtypes.int64),
+          "used_handlers_mask":
+              array_ops.constant([True, False], dtype=dtypes.bool),
+      }
+
+      labels = array_ops.constant([0, 0, 1, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+
+      # On second run, expect a trivial split to be chosen to basically
+      # predict the average.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [0.1])
+      self.assertEquals(stamp_token.eval(), 2)
+      expected_tree = """
+          nodes {
+            dense_float_binary_split {
+              feature_column: 0
+              threshold: 1.0
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 0
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -0.25
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.0
+              }
+            }
+          }"""
+      self.assertProtoEquals(expected_tree, output.trees[0])
+
+  def testTrainFnChiefFeatureSelectionWithGoodSplits(self):
+    """Tests the train function running on chief with feature selection."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.max_number_of_unique_feature_columns = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float_0"] = array_ops.ones([4, 1], dtypes.float32)
+      # Feature 1 is predictive and is in our selected features so it will be
+      # used even when we're at the limit.
+      features["dense_float_1"] = array_ops.constant([0, 0, 1, 1],
+                                                     dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          logits_dimension=1,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions":
+              predictions,
+          "predictions_no_dropout":
+              predictions,
+          "partition_ids":
+              partition_ids,
+          "ensemble_stamp":
+              ensemble_stamp,
+          "num_trees":
+              12,
+          "num_used_handlers":
+              array_ops.constant(1, dtype=dtypes.int64),
+          "used_handlers_mask":
+              array_ops.constant([False, True], dtype=dtypes.bool),
+      }
+
+      labels = array_ops.constant([0, 0, 1, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+
+      self.assertEquals(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [0.1])
+      self.assertEquals(stamp_token.eval(), 2)
+      expected_tree = """
+          nodes {
+            dense_float_binary_split {
+              feature_column: 1
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 0.5
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -0.5
+              }
+            }
+          }"""
+      self.assertProtoEquals(expected_tree, output.trees[0])
+
+  def testTrainFnChiefFeatureSelectionReachedLimitIncrementAttemptedLayer(self):
+    """Tests the train function running on chief with feature selection."""
+    with self.test_session() as sess:
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree = tree_ensemble_config.trees.add()
+
+      _set_float_split(tree.nodes.add()
+                       .sparse_float_binary_split_default_right.split, 2, 4.0,
+                       1, 2)
+      _append_to_leaf(tree.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree.nodes.add().leaf, 1, 1.2)
+      tree_ensemble_config.tree_weights.append(1.0)
+      metadata = tree_ensemble_config.tree_metadata.add()
+      metadata.is_finalized = False
+      metadata.num_layers_grown = 1
+      tree_ensemble_config = tree_ensemble_config.SerializeToString()
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config=tree_ensemble_config,
+          name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.max_number_of_unique_feature_columns = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      # Both features will be disabled since the feature selection limit is
+      # already reached.
+      features["dense_float_0"] = array_ops.ones([4, 1], dtypes.float32)
+      features["dense_float_1"] = array_ops.constant([0, 0, 1, 1],
+                                                     dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          logits_dimension=1,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions":
+              predictions,
+          "predictions_no_dropout":
+              predictions,
+          "partition_ids":
+              partition_ids,
+          "ensemble_stamp":
+              ensemble_stamp,
+          "num_trees":
+              12,
+          # We have somehow reached our limit 1. Both of the handlers will be
+          # disabled.
+          "num_used_handlers":
+              array_ops.constant(1, dtype=dtypes.int64),
+          "used_handlers_mask":
+              array_ops.constant([False, False], dtype=dtypes.bool),
+      }
+
+      labels = array_ops.constant([0, 0, 1, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 1)
+      self.assertEquals(output.growing_metadata.num_layers_attempted, 1)
+      self.assertEquals(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      # Make sure the trees are not modified, but the num_layers_attempted is
+      # incremented so that eventually the training stops.
+      self.assertEquals(len(output.trees), 1)
+      self.assertEquals(len(output.trees[0].nodes), 3)
+
+      self.assertEquals(output.growing_metadata.num_layers_attempted, 2)
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/resources/BUILD b/tensorflow/contrib/boosted_trees/resources/BUILD
index 9fc101612f1e2a6bf6c5d86ea8c7199936dbb069..c0651868453d40d57e842862855f89e6845c507f 100644
--- a/tensorflow/contrib/boosted_trees/resources/BUILD
+++ b/tensorflow/contrib/boosted_trees/resources/BUILD
@@ -9,17 +9,6 @@ package(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "stamped_resource",
     hdrs = ["stamped_resource.h"],
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
index 3ebf28ea442edf87815c39971ae9e01a2a8aae9a..94aeb2c7bb48c6eddb6c7894f8bf6f1567470113 100644
--- a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -126,7 +126,8 @@ class DecisionTreeEnsembleResource : public StampedResource {
       return;
     }
     used_ids->Add(handler_id);
-    std::rotate(first, used_ids->end() - 1, used_ids->end());
+    // Keep the list of used handlers sorted.
+    std::sort(used_ids->begin(), used_ids->end());
   }
 
   std::vector<int64> GetUsedHandlers() const {
diff --git a/tensorflow/contrib/checkpoint/README.md b/tensorflow/contrib/checkpoint/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d35c5bae3b702c0fea5194e5e653660e319e38c5
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/README.md
@@ -0,0 +1,2 @@
+Tools for working with object-based checkpoints produced by
+`tf.train.Checkpoint`.
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py b/tensorflow/contrib/checkpoint/__init__.py
similarity index 70%
rename from tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
rename to tensorflow/contrib/checkpoint/__init__.py
index f3a645eafc249d1c39e0d4a238ae7ec8755c78d8..1192cc44a17823f69db28947308a8b839a83e57e 100644
--- a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -12,21 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for Markov Chain Monte Carlo (MCMC) sampling."""
+"""Tools for working with object-based checkpoints.
+
+
+For creating and managing dependencies:
+@@dot_graph_from_checkpoint
+@@split_dependency
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.mcmc_diagnostics_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
+from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
 
-_allowed_symbols = [
-    "effective_sample_size",
-    "potential_scale_reduction",
-]
+from tensorflow.python.util.all_util import remove_undocumented
 
-remove_undocumented(__name__, _allowed_symbols)
+remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a5681ffa61d07ef29d0a0862db9736a210c8e26e
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -0,0 +1,61 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "checkpoint",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":split_dependency",
+        ":visualize",
+    ],
+)
+
+py_library(
+    name = "split_dependency",
+    srcs = ["split_dependency.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "split_dependency_test",
+    srcs = ["split_dependency_test.py"],
+    deps = [
+        ":split_dependency",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "visualize",
+    srcs = ["visualize.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
+py_test(
+    name = "visualize_test",
+    srcs = ["visualize_test.py"],
+    deps = [
+        ":visualize",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+    ],
+)
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aec8c96e90440d6da00d95cffc34bd53ec7164f
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -0,0 +1,136 @@
+"""Utility for creating multiple dependencies with synchronized save/restore."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.training import checkpointable as checkpointable
+from tensorflow.python.training import saver as saver_lib
+
+
+class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+  """Wraps save and restore callbacks as a `SaveableObject`."""
+
+  def __init__(self, name, dtype, save_callback, restore_callback):
+    self._restore_callback = restore_callback
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=save_callback,
+        slice_spec="",
+        name=name,
+        dtype=dtype)
+    super(_CallbackSaveable, self).__init__(
+        save_callback, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return self._restore_callback(tensor)
+
+
+class _SplitDependency(checkpointable.CheckpointableBase):
+  """Looks like a regular variable while synchronizing save/restores."""
+
+  def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
+               fill_save_buffer_fn, consume_restore_buffer_fn):
+    self._save_buffer = save_buffer
+    self._restore_buffer = restore_buffer
+    self._name = name
+    self._dtype = dtype
+    self._num_components = num_components
+    self._fill_save_buffer_fn = fill_save_buffer_fn
+    self._consume_restore_buffer_fn = consume_restore_buffer_fn
+
+  def _save(self):
+    """Pull from the shared buffer, populating it if necessary."""
+    if self._name not in self._save_buffer:
+      if self._save_buffer:
+        raise AssertionError(
+            ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+             "be saved together.") % (self._name, self))
+      self._fill_save_buffer_fn(self._save_buffer)
+    return self._save_buffer.pop(self._name)
+
+  def _restore(self, tensor):
+    """Push into the shared buffer, flushing it if necessary."""
+    if self._name in self._restore_buffer:
+      raise AssertionError(
+          ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+           "be restored together.") % (self._name, self))
+    self._restore_buffer[self._name] = tensor
+    if len(self._restore_buffer) == self._num_components:
+      op = self._consume_restore_buffer_fn(self._restore_buffer)
+      self._restore_buffer.clear()
+      return op
+    else:
+      return control_flow_ops.no_op()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Looks to Checkpointable like a regular variable."""
+    return {
+        checkpointable.VARIABLE_VALUE_KEY:
+        functools.partial(_CallbackSaveable,
+                          dtype=self._dtype,
+                          save_callback=self._save,
+                          restore_callback=self._restore)
+    }
+
+
+def split_dependency(component_names, component_dtypes,
+                     fill_save_buffer_fn, consume_restore_buffer_fn):
+  """Creates multiple dependencies with a synchronized save/restore.
+
+  Useful when a single op produces `Tensor`s which should each be saved under
+  different objects, or when `Tensor`s saved with many different objects need to
+  be restored together as inputs to a single op (i.e. an object which uses a
+  single fused op may be swapped out for a subgraph of objects, and these two
+  programs are checkpoint compatible).
+
+  Args:
+    component_names: A sequence of names for the split
+      dependencies. `fill_save_buffer_fn` must add these keys to the dictionary
+      it is passed, and `consume_restore_buffer_fn` will receive a dictionary
+      with these keys.
+    component_dtypes: Data types for the `Tensor`s being saved and restored, a
+      sequence corresponding to `component_names`.
+    fill_save_buffer_fn: A function which takes an empty dictionary as an
+      argument and adds `Tensor`s with `component_names` as keys. These
+      `Tensor`s will be saved as if they were individual variables.
+    consume_restore_buffer_fn: A function which takes a dictionary with
+      `component_names` as keys mapping to restored individual `Tensor`s and
+      returns a restore op (or if executing eagerly, runs the restoration and
+      may return `None`).
+
+  Returns:
+    A dictionary mapping from names to Checkpointable objects. If one is
+    reachable from an object as a dependency, the others should be too; adding
+    dependencies on some but not all of the objects will result in errors.
+  """
+  save_buffer = {}
+  restore_buffer = {}
+  split_dependencies = {}
+  for name, dtype in zip(component_names, component_dtypes):
+    split_dependencies[name] = _SplitDependency(
+        save_buffer=save_buffer,
+        restore_buffer=restore_buffer,
+        name=name,
+        dtype=dtype,
+        num_components=len(component_names),
+        fill_save_buffer_fn=fill_save_buffer_fn,
+        consume_restore_buffer_fn=consume_restore_buffer_fn)
+  return split_dependencies
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1d9d19b047ee69281cf8bdba38a28dc87947e38
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -0,0 +1,112 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.checkpoint.python import split_dependency
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
+
+
+def _split_variable_closure(variable):
+  def _fill_save_buffer_fn(save_buffer):
+    save_buffer["first_half"] = variable[:2]
+    save_buffer["second_half"] = variable[2:]
+  return _fill_save_buffer_fn
+
+
+def _combine_variable_closure(variable):
+  def _consume_restore_buffer_fn(restore_buffer):
+    return variable.assign(
+        array_ops.concat([restore_buffer["first_half"],
+                          restore_buffer["second_half"]],
+                         axis=0))
+  return _consume_restore_buffer_fn
+
+
+class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
+
+  def __init__(self):
+    self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
+    split_dependencies = split_dependency.split_dependency(
+        component_names=("first_half", "second_half"),
+        component_dtypes=(self.combined.dtype,) * 2,
+        fill_save_buffer_fn=_split_variable_closure(
+            self.combined),
+        consume_restore_buffer_fn=_combine_variable_closure(
+            self.combined))
+    for name, dep in split_dependencies.items():
+      self._track_checkpointable(dep, name=name)
+
+
+class HasRegularDeps(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+    self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class OnlyOneDep(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class SplitTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestoreSplitDep(self):
+    save_checkpoint = checkpointable_utils.Checkpoint(
+        dep=SaveTensorSlicesAsDeps())
+    self.evaluate(save_checkpoint.dep.combined.assign([1., 2., 3., 4.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_checkpoint.save(checkpoint_prefix)
+
+    regular_deps = HasRegularDeps()
+    regular_restore_checkpoint = checkpointable_utils.Checkpoint(
+        dep=regular_deps)
+    regular_restore_checkpoint.restore(
+        save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(regular_deps.first_half))
+    self.assertAllEqual([3., 4.], self.evaluate(regular_deps.second_half))
+
+    one_dep = OnlyOneDep()
+    one_dep_restore_checkpoint = checkpointable_utils.Checkpoint(dep=one_dep)
+    status = one_dep_restore_checkpoint.restore(save_path)
+    with self.assertRaises(AssertionError):
+      # Missing the second dependency.
+      status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(one_dep.first_half))
+
+    restore_checkpoint = checkpointable_utils.Checkpoint()
+    status = restore_checkpoint.restore(save_path)
+    restore_checkpoint.dep = SaveTensorSlicesAsDeps()
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual(
+        [1., 2., 3., 4.],
+        self.evaluate(restore_checkpoint.dep.combined))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..86fbdb41d2c37803f2bd71b5aa2f72845c87d448
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -0,0 +1,111 @@
+"""Utilities for visualizing dependency graphs."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.training import checkpointable
+
+
+def dot_graph_from_checkpoint(save_path):
+  r"""Visualizes an object-based checkpoint (from `tf.train.Checkpoint`).
+
+  Useful for inspecting checkpoints and debugging loading issues.
+
+  Example usage from Python (requires pydot):
+  ```python
+  import tensorflow as tf
+  import pydot
+
+  dot_string = tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt')
+  parsed, = pydot.graph_from_dot_data(dot_string)
+  parsed.write_svg('/tmp/tensorflow/visualized_checkpoint.svg')
+  ```
+
+  Example command line usage:
+  ```sh
+  python -c "import tensorflow as tf;\
+    print(tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt'))"\
+    | dot -Tsvg > /tmp/tensorflow/checkpoint_viz.svg
+  ```
+
+  Args:
+    save_path: The checkpoint prefix, as returned by `tf.train.Checkpoint.save`
+      or `tf.train.latest_checkpoint`.
+  Returns:
+    A graph in DOT format as a string.
+  """
+  reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+  try:
+    object_graph_string = reader.get_tensor(
+        checkpointable.OBJECT_GRAPH_PROTO_KEY)
+  except errors_impl.NotFoundError:
+    raise ValueError(
+        ('The specified checkpoint "%s" does not appear to be object-based (it '
+         'is missing the key "%s"). Likely it was created with a name-based '
+         'saver and does not contain an object dependency graph.') % (
+             save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY))
+  shape_map = reader.get_variable_to_shape_map()
+  dtype_map = reader.get_variable_to_dtype_map()
+  object_graph = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  object_graph.ParseFromString(object_graph_string)
+  graph = 'digraph {\n'
+  def _escape(name):
+    return name.replace('"', '\\"')
+  slot_ids = set()
+  for node in object_graph.nodes:
+    for slot_reference in node.slot_variables:
+      slot_ids.add(slot_reference.slot_variable_node_id)
+  for node_id, node in enumerate(object_graph.nodes):
+    if (len(node.attributes) == 1
+        and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY):
+      if node_id in slot_ids:
+        color = 'orange'
+        tooltip_prefix = 'Slot variable'
+      else:
+        color = 'blue'
+        tooltip_prefix = 'Variable'
+      attribute = node.attributes[0]
+      graph += ('N_%d [shape=point label="" color=%s width=.25'
+                ' tooltip="%s %s shape=%s %s"]\n') % (
+                    node_id,
+                    color,
+                    tooltip_prefix,
+                    _escape(attribute.full_name),
+                    shape_map[attribute.checkpoint_key],
+                    dtype_map[attribute.checkpoint_key].name)
+    elif node.slot_variables:
+      graph += ('N_%d [shape=point label="" width=.25 color=red,'
+                'tooltip="Optimizer"]\n') % node_id
+    else:
+      graph += 'N_%d [shape=point label="" width=.25]\n' % node_id
+    for reference in node.children:
+      graph += 'N_%d -> N_%d [label="%s"]\n' % (
+          node_id, reference.node_id, _escape(reference.local_name))
+    for slot_reference in node.slot_variables:
+      graph += 'N_%d -> N_%d [label="%s" style=dotted]\n' % (
+          node_id,
+          slot_reference.slot_variable_node_id,
+          _escape(slot_reference.slot_name))
+      graph += 'N_%d -> N_%d [style=dotted]\n' % (
+          slot_reference.original_variable_node_id,
+          slot_reference.slot_variable_node_id)
+  graph += '}\n'
+  return graph
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d9ab789235cb964521315b4864563f89745ae75
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.contrib.checkpoint.python import visualize
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable_utils
+
+try:
+  import pydot  # pylint: disable=g-import-not-at-top
+except ImportError:
+  pydot = None
+
+
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class DotGraphTests(test.TestCase):
+
+  def testMakeDotGraph(self):
+    with context.eager_mode():
+      input_value = constant_op.constant([[3.]])
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      optimizer_step = resource_variable_ops.ResourceVariable(12)
+      save_checkpoint = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+      optimizer.minimize(functools.partial(model, input_value))
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+      save_path = save_checkpoint.save(checkpoint_prefix)
+      prefix = save_checkpoint.save(save_path)
+
+    dot_graph_string = visualize.dot_graph_from_checkpoint(prefix)
+
+    # The remainder of this test is more-or-less optional since it's so
+    # dependent on pydot/platform/Python versions.
+    if pydot is None:
+      self.skipTest('pydot is required for the remainder of this test.')
+    try:
+      parsed, = pydot.graph_from_dot_data(dot_graph_string)
+    except NameError as e:
+      if "name 'dot_parser' is not defined" in str(e):
+        self.skipTest("pydot isn't working")
+      else:
+        raise
+    # Check that the graph isn't completely trivial
+    self.assertEqual(
+        '"model"',
+        parsed.obj_dict['edges'][('N_0', 'N_1')][0]['attributes']['label'])
+    image_path = os.path.join(self.get_temp_dir(), 'saved.svg')
+    try:
+      parsed.write_svg(image_path)
+    except Exception as e:  # pylint: disable=broad-except
+      # For some reason PyDot's "dot not available" error is an Exception, not
+      # something more specific.
+      if '"dot" not found in path' in str(e):
+        self.skipTest("pydot won't save SVGs (dot not available)")
+      else:
+        raise
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index fe8bd072afd43a64fa62a65bd8900b5a98dbe761..f3a75e8688ece19a6e6fd53ee9faf7f4144d76cf 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -14,18 +14,6 @@ load(
     "tf_py_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_gen_op_libs(
     op_lib_names = ["bigquery_reader_ops"],
     deps = [
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index 56f930a9a8d32c5c3a025163ef56c9562f17d864..ff46f0daa80a70badedf73e15bfaf4dca85fdd89 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -20,20 +20,6 @@ load(
     "tf_proto_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_kernel_library(
     name = "bigquery_reader_ops",
     srcs = ["bigquery_reader_ops.cc"],
@@ -73,6 +59,7 @@ tf_cc_test(
     ],
     deps = [
         ":bigquery_table_accessor",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
index e9b79a066def566096d6c3f3745974423e3371d1..7416eb19d3324fad84876cde5353bc25bac8f648 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -28,8 +29,8 @@ constexpr char kTestProject[] = "test-project";
 constexpr char kTestDataset[] = "test-dataset";
 constexpr char kTestTable[] = "test-table";
 
-bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 80e18a43a71cc9d6c9e2ccf5836e50c6427a30f6..c239e6f8f960910cee14e1df7c4678c643496f54 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -10,19 +10,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 py_library(
     name = "cluster_resolver_pip",
     srcs = [
@@ -30,6 +17,7 @@ py_library(
         "python/training/__init__.py",
     ],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":cluster_resolver_py",
         ":gce_cluster_resolver_py",
@@ -109,5 +97,6 @@ tf_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
     ],
+    grpc_enabled = True,
     main = "python/training/tpu_cluster_resolver_test.py",
 )
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
index b04822fa9d66465e34a545d3b00c399bbb196514..1c480b25134b1e54200e0ddb780bd7bb0f122341 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
@@ -53,11 +53,16 @@ class ClusterResolver(object):
     raise NotImplementedError(
         'cluster_spec is not implemented for {}.'.format(self))
 
+  @abc.abstractmethod
+  def master(self):
+    """..."""
+    raise NotImplementedError('master is not implemented for {}.'.format(self))
+
 
 class SimpleClusterResolver(ClusterResolver):
   """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
 
-  def __init__(self, cluster_spec):
+  def __init__(self, cluster_spec, master=''):
     """Creates a SimpleClusterResolver from a ClusterSpec."""
     super(SimpleClusterResolver, self).__init__()
 
@@ -65,10 +70,18 @@ class SimpleClusterResolver(ClusterResolver):
       raise TypeError('cluster_spec must be a ClusterSpec.')
     self._cluster_spec = cluster_spec
 
+    if not isinstance(master, str):
+      raise TypeError('master must be a string.')
+    self._master = master
+
   def cluster_spec(self):
     """Returns the ClusterSpec passed into the constructor."""
     return self._cluster_spec
 
+  def master(self):
+    """Returns the master address to use when creating a session."""
+    return self._master
+
 
 class UnionClusterResolver(ClusterResolver):
   """Performs a union on underlying ClusterResolvers.
@@ -87,9 +100,13 @@ class UnionClusterResolver(ClusterResolver):
 
     Raises:
       TypeError: If any argument is not a subclass of `ClusterResolvers`.
+      ValueError: If there are no arguments passed.
     """
     super(UnionClusterResolver, self).__init__()
 
+    if not args:
+      raise ValueError('At least one ClusterResolver is required.')
+
     for cluster_resolver in args:
       if not isinstance(cluster_resolver, ClusterResolver):
         raise TypeError('All arguments must be a sub-class of '
@@ -169,3 +186,7 @@ class UnionClusterResolver(ClusterResolver):
           merged_cluster[job_name].update(task_dict)
 
     return ClusterSpec(merged_cluster)
+
+  def master(self):
+    """master returns the master address from the first cluster resolver."""
+    return self._cluster_resolvers[0].master()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
index dbfb77723cdaab66e29bb41b764593bb5fd61b35..d9c97d53eb3663f6ab2f7b40395592dc7638b896 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
@@ -234,5 +234,7 @@ class UnionClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(cluster_spec, expected_proto)
 
 
+# TODO(saeta): Include tests for master resolution
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
index d6f2eced93ba4fda5ac27f9412b6f729981f4f40..3f5824128948453634bc5e5a7d6fdeedae60f5bd 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -134,3 +134,6 @@ class GceClusterResolver(ClusterResolver):
 
     worker_list.sort()
     return ClusterSpec({self._job_name: worker_list})
+
+  def master(self):
+    return ''
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index a6a6e642e4e4c721b94821a70d55d6fe931347d6..1403483d287041b02dfbf538f7e7ddee11662f47 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -18,12 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 
 from six.moves.urllib.request import Request
 from six.moves.urllib.request import urlopen
 
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
 
 _GOOGLE_API_CLIENT_INSTALLED = True
 try:
@@ -33,6 +35,9 @@ except ImportError:
   _GOOGLE_API_CLIENT_INSTALLED = False
 
 
+_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
+
+
 class TPUClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Cloud TPUs.
 
@@ -46,13 +51,32 @@ class TPUClusterResolver(ClusterResolver):
     req = Request('http://metadata/computeMetadata/v1/%s' % path,
                   headers={'Metadata-Flavor': 'Google'})
     resp = urlopen(req)
-    return resp.read()
+    return compat.as_bytes(resp.read())
+
+  def _shouldResolve(self):
+    if (self._tpu == compat.as_bytes('') or
+        self._tpu == compat.as_bytes('local') or
+        self._tpu.startswith(compat.as_bytes('/bns')) or
+        self._tpu.startswith(compat.as_bytes('grpc://'))):
+      return False
+    return True
+
+  @staticmethod
+  def _inGke():
+    """When running in GKE, the environment variable will be set."""
+    return _GKE_ENV_VARIABLE in os.environ
+
+  @staticmethod
+  def _gkeMaster():
+    return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
 
   def __init__(self,
-               tpu_names,
+               tpu=None,
                zone=None,
                project=None,
-               job_name='tpu_worker',
+               job_name='worker',
+               coordinator_name=None,
+               coordinator_address=None,
                credentials='default',
                service=None):
     """Creates a new TPUClusterResolver object.
@@ -61,7 +85,11 @@ class TPUClusterResolver(ClusterResolver):
     for the IP addresses and ports of each Cloud TPU listed.
 
     Args:
-      tpu_names: A list of names of the target Cloud TPUs.
+      tpu: Either a string, or a list of strings corresponding to the TPUs to
+        use. If the single string is the empty string, the string 'local', or a
+        string that begins with 'grpc://' or '/bns', then it is assumed to not
+        correspond with a Cloud TPU and will instead be passed as the session
+        master and no ClusterSpec propagation will be done.
       zone: Zone where the TPUs are located. If omitted or empty, we will assume
         that the zone of the TPU is the same as the zone of the GCE VM, which we
         will try to discover from the GCE metadata service.
@@ -69,6 +97,12 @@ class TPUClusterResolver(ClusterResolver):
         empty, we will try to discover the project name of the GCE VM from the
         GCE metadata service.
       job_name: Name of the TensorFlow job the TPUs belong to.
+      coordinator_name: The name to use for the coordinator. Set to None if the
+        coordinator should not be included in the computed ClusterSpec.
+      coordinator_address: The address of the coordinator (typically an ip:port
+        pair). If set to None, a TF server will be started. If coordinator_name
+        is None, a TF server will not be started even if coordinator_address is
+        None.
       credentials: GCE Credentials. If None, then we use default credentials
         from the oauth2client
       service: The GCE API object returned by the googleapiclient.discovery
@@ -77,29 +111,48 @@ class TPUClusterResolver(ClusterResolver):
 
     Raises:
       ImportError: If the googleapiclient is not installed.
+      ValueError: If no TPUs are specified.
     """
+    if isinstance(tpu, list):
+      if not tpu:
+        raise ValueError('At least one TPU must be specified.')
+      if len(tpu) != 1:
+        raise NotImplementedError(
+            'Using multiple TPUs in a single session is not yet implemented')
+      tpu = tpu[0]
+
+    in_gke = self._inGke()
+    # When using GKE with Cloud TPUs, the env variable will be set.
+    if tpu is None and in_gke:
+      tpu = self._gkeMaster()
+
+    self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
+    self._job_name = job_name
+    self._credentials = credentials
 
-    if not project:
-      project = self._requestComputeMetadata('/project/project-id')
+    should_resolve = self._shouldResolve()
 
-    if not zone:
-      zone_path = self._requestComputeMetadata('/instance/zone')
+    if not project and should_resolve:
+      project = compat.as_str(
+          self._requestComputeMetadata('project/project-id'))
+
+    if not zone and should_resolve:
+      zone_path = compat.as_str(self._requestComputeMetadata('instance/zone'))
       zone = zone_path.split('/')[-1]
 
     self._project = project
     self._zone = zone
-    self._tpu_names = tpu_names
-    self._job_name = job_name
-    self._credentials = credentials
 
-    if credentials == 'default':
+    if credentials == 'default' and should_resolve:
       if _GOOGLE_API_CLIENT_INSTALLED:
         self._credentials = GoogleCredentials.get_application_default()
 
-    if service is None:
+    if service is None and should_resolve:
       if not _GOOGLE_API_CLIENT_INSTALLED:
         raise ImportError('googleapiclient must be installed before using the '
-                          'TPU cluster resolver')
+                          'TPU cluster resolver. Execute: `pip install '
+                          '--upgrade google-api-python-client` to install with '
+                          'pip.')
 
       self._service = discovery.build(
           'tpu', 'v1alpha1',
@@ -107,25 +160,42 @@ class TPUClusterResolver(ClusterResolver):
     else:
       self._service = service
 
-  def get_master(self):
-    """Get the ClusterSpec grpc master path.
+    self._coordinator_name = coordinator_name
+    if coordinator_name and not coordinator_address and (should_resolve or
+                                                         in_gke):
+      self._start_local_server()
+    else:
+      self._coordinator_address = coordinator_address
 
-    This returns the grpc path (grpc://1.2.3.4:8470) of first instance in the
-    ClusterSpec returned by the cluster_spec function. This is suitable for use
-    for the `master` argument in tf.Session() when you are using one TPU.
+  def master(self):
+    """Get the Master string to be used for the session.
+
+    In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
+    first instance in the ClusterSpec returned by the cluster_spec function.
+
+    If a non-TPU name is used when constructing a TPUClusterResolver, that will
+    be returned instead (e.g. If the tpus argument's value when constructing
+    this TPUClusterResolver was 'grpc://10.240.1.2:8470',
+    'grpc://10.240.1.2:8470' will be returned).
 
     Returns:
-      string, the grpc path of the first instance in the ClusterSpec.
+      string, the connection string to use when creating a session.
 
     Raises:
       ValueError: If none of the TPUs specified exists.
     """
+    if not self._shouldResolve():
+      return self._tpu
+
     job_tasks = self.cluster_spec().job_tasks(self._job_name)
     if not job_tasks:
       raise ValueError('No TPUs exists with the specified names exist.')
 
     return 'grpc://' + job_tasks[0]
 
+  def get_master(self):
+    return self.master()
+
   def cluster_spec(self):
     """Returns a ClusterSpec object based on the latest TPU information.
 
@@ -134,17 +204,73 @@ class TPUClusterResolver(ClusterResolver):
 
     Returns:
       A ClusterSpec containing host information returned from Cloud TPUs.
+
+    Raises:
+      RuntimeError: If the provided TPU is not healthy.
     """
-    worker_list = []
+    ############################################################################
+    # There are 5 potential cases this code must handle:
+    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
+    #      a. Create a ClusterSpec that includes the coordinator job
+    #      b. Create a ClusterSpec without the coordinator job.
+    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
+    #     tasks and
+    #      a. Create a ClusterSpec with the coordinator
+    #      b. Create a ClusterSpec without the coordinator
+    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
+    ############################################################################
 
-    for tpu_name in self._tpu_names:
+    if self._shouldResolve():
+      # Case 1.
       full_name = 'projects/%s/locations/%s/nodes/%s' % (
-          self._project, self._zone, tpu_name)
+          self._project, self._zone, compat.as_text(self._tpu))
       request = self._service.projects().locations().nodes().get(name=full_name)
       response = request.execute()
 
-      if 'health' in response and response['health'] == 'HEALTHY':
+      if 'health' in response and response['health'] != 'HEALTHY':
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
+                                                            response['health']))
+
+      if 'networkEndpoints' in response:
+        worker_list = [
+            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
+            for endpoint in response['networkEndpoints']
+        ]
+      else:
+        # Fall back to the deprecated response format
         instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-        worker_list.append(instance_url)
+        worker_list = [instance_url]
+
+      cluster_spec = {self._job_name: worker_list}
+    else:
+      if not self._tpu.startswith(compat.as_bytes('grpc://')):
+        # Case 3.
+        return None
+      # Case 2.
+      cluster_spec = {self._job_name: [self._tpu[len(
+          compat.as_bytes('grpc://')):]]}
+
+    if self._coordinator_address:
+      # {1, 2}.a
+      cluster_spec[self._coordinator_name] = [self._coordinator_address]
+
+    return server_lib.ClusterSpec(cluster_spec)
+
+  def _start_local_server(self):
+    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
+    self._server = server_lib.Server(
+        {
+            'local': ['0.0.0.0:0']
+        }, protocol='grpc', config=None, start=True)
+    # self._server.target is of the form: grpc://ipaddress:port
+    target = compat.as_bytes(self._server.target)
+    splits = target.split(compat.as_bytes(':'))
+    assert len(splits) == 3, self._server.target
+    assert splits[0] == compat.as_bytes('grpc'), self._server.target
+    self._coordinator_port = compat.as_text(splits[2])
+    self._coordinator_address = '%s:%s' % (
+        address, compat.as_text(self._coordinator_port))
 
-    return ClusterSpec({self._job_name: worker_list})
+  def __deepcopy__(self, memo):
+    # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy.
+    return self
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 4fd34629cf74f90869c77b8cb098d3c585a49404..5b3f9be5a11237f9dceebefa1db294efaf7e482d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -18,10 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
-
+from tensorflow.python.util import compat
 
 mock = test.mock
 
@@ -50,10 +52,12 @@ class MockNodeClass(object):
 
 def mock_request_compute_metadata(cls, *args, **kwargs):
   del cls, kwargs  # Unused.
-  if args[0] == '/project/project-id':
+  if args[0] == 'project/project-id':
     return 'test-project'
-  elif args[0] == '/instance/zone':
+  elif args[0] == 'instance/zone':
     return 'projects/test-project/locations/us-central1-c'
+  elif args[0] == 'instance/network-interfaces/0/ip':
+    return '10.128.1.2'
   return ''
 
 
@@ -71,18 +75,17 @@ class TPUClusterResolverTest(test.TestCase):
       expected_proto: Expected protobuf
     """
     self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
-    self.assertProtoEquals(
-        expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
-    self.assertProtoEquals(
-        expected_proto,
-        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
     self.assertProtoEquals(
         expected_proto,
-        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(expected_proto,
+                           server_lib.ClusterSpec(
+                               cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(expected_proto,
+                           server_lib.ClusterSpec(
+                               cluster_spec.as_dict()).as_cluster_def())
 
-  def mock_service_client(
-      self,
-      tpu_map=None):
+  def mock_service_client(self, tpu_map=None):
 
     if tpu_map is None:
       tpu_map = {}
@@ -98,8 +101,7 @@ class TPUClusterResolverTest(test.TestCase):
 
     return mock_client
 
-  @mock.patch.object(TPUClusterResolver,
-                     '_requestComputeMetadata',
+  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadata(self):
     tpu_map = {
@@ -113,17 +115,27 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(
         project=None,
         zone=None,
-        tpu_names=['test-tpu-1'],
+        tpu=['test-tpu-1'],
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
-    job { name: 'tpu_worker' tasks { key: 0 value: '10.1.2.3:8470' } }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    job {
+      name: 'coordinator'
+      tasks { key: 0 value: '10.128.1.2:%s' }
+    }
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.1.2.3:8470' }
+    }
+    """ % tpu_cluster_resolver._coordinator_port
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
 
-  def testSimpleSuccessfulRetrieval(self):
+  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
@@ -133,116 +145,228 @@ class TPUClusterResolverTest(test.TestCase):
     }
 
     tpu_cluster_resolver = TPUClusterResolver(
-        project='test-project',
-        zone='us-central1-c',
-        tpu_names=['test-tpu-1'],
+        project=None,
+        zone=None,
+        tpu=['test-tpu-1'],
+        coordinator_name=None,
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
-    job { name: 'tpu_worker' tasks { key: 0 value: '10.1.2.3:8470' } }
+    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
-  def testMultipleSuccessfulRetrieval(self):
+  def testSimpleSuccessfulRetrieval(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
             'port': '8470',
             'health': 'HEALTHY'
-        },
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
-            'ipAddress': '10.4.5.6',
-            'port': '8470',
-            'health': 'HEALTHY'
         }
     }
 
     tpu_cluster_resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
-        tpu_names=['test-tpu-2', 'test-tpu-1'],
+        tpu=['test-tpu-1'],
+        coordinator_name='coordinator',
+        coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
-    job { name: 'tpu_worker' tasks { key: 0 value: '10.4.5.6:8470' }
-                             tasks { key: 1 value: '10.1.2.3:8470' } }
+    job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
+    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
-  def testHealthyTpuNodeRetrieval(self):
+  def testNewNetworkEndpointFormat(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'HEALTHY'
-        },
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
-            'ipAddress': '10.4.5.6',
-            'port': '8470',
-        },
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-3': {
-            'ipAddress': '10.7.8.9',
-            'port': '8470',
-            'health': 'UNHEALTHY'
+            'health': 'HEALTHY',
+            'networkEndpoints': [{
+                'ipAddress': '10.2.3.4',
+                'port': 8470,
+            }]
         }
     }
 
     tpu_cluster_resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
-        tpu_names=['test-tpu-2', 'test-tpu-1', 'test-tpu-3'],
+        tpu='test-tpu-1',
+        coordinator_name='coordinator',
+        coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
-    job {
-      name: 'tpu_worker'
-      tasks {
-        key: 0
-        value: '10.1.2.3:8470'
-      }
-    }
+    job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
+    job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual('grpc://10.2.3.4:8470', tpu_cluster_resolver.master())
 
-  def testGetMasterMultipleEntries(self):
+  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testPodResolution(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'HEALTHY'
-        },
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
-            'ipAddress': '10.4.5.6',
-            'port': '8470',
-            'health': 'HEALTHY'
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        tpu='test-tpu-1',
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
+
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'coordinator',
+      tasks { key: 0 value: '10.128.1.2:%s'}
+    }
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.2.3.4:8470' }
+      tasks { key: 1 value: '10.2.3.5:8470' }
+      tasks { key: 2 value: '10.2.3.6:8470' }
+      tasks { key: 3 value: '10.2.3.7:8470' }
+    }
+    """ % tpu_cluster_resolver._coordinator_port
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+
+  def testPodResolutionNoCoordinator(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
         }
     }
 
     tpu_cluster_resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
-        tpu_names=['test-tpu-2', 'test-tpu-1'],
+        tpu='test-tpu-1',
+        coordinator_name=None,
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
-    self.assertEqual('grpc://10.4.5.6:8470', tpu_cluster_resolver.get_master())
+
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.2.3.4:8470' }
+      tasks { key: 1 value: '10.2.3.5:8470' }
+      tasks { key: 2 value: '10.2.3.6:8470' }
+      tasks { key: 3 value: '10.2.3.7:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
   def testGetMasterNoEntries(self):
     tpu_map = {}
 
+    with self.assertRaises(ValueError):
+      TPUClusterResolver(
+          project='test-project',
+          zone='us-central1-c',
+          tpu=[],
+          coordinator_name=None,
+          credentials=None,
+          service=self.mock_service_client(tpu_map=tpu_map))
+
+  # TODO(saeta): Convert to parameterized test when included in OSS TF.
+  def verifyShouldResolve(self, tpu, should_resolve):
     tpu_cluster_resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
-        tpu_names=[],
+        tpu=tpu,
+        coordinator_name=None,
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-    with self.assertRaises(ValueError):
-      tpu_cluster_resolver.get_master()
+        service=self.mock_service_client(tpu_map={}))
+    self.assertEqual(should_resolve, tpu_cluster_resolver._shouldResolve(),
+                     "TPU: '%s'" % tpu)
+
+  def testShouldResolveNoName(self):
+    self.verifyShouldResolve('', False)
+
+  def testShouldResolveLocal(self):
+    self.verifyShouldResolve('local', False)
+
+  def testShouldResolveGrpc(self):
+    self.verifyShouldResolve('grpc://10.1.2.3:8470', False)
+
+  def testShouldResolveBns(self):
+    self.verifyShouldResolve('/bns/foo/bar', False)
+
+  def testShouldResolveName(self):
+    self.verifyShouldResolve('mytpu', True)
+
+  def testShouldResolveList(self):
+    self.verifyShouldResolve(['myothertpu'], True)
+
+  def testShouldResolveGrpcPrefix(self):
+    self.verifyShouldResolve('grpctpu', True)
+
+  def testNoCallComputeMetadata(self):
+    tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
+    self.assertEqual(
+        compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
+    self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
+
+  def testGkeEnvironment(self):
+    os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
+    self.assertTrue('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' in os.environ)
+    self.assertTrue(TPUClusterResolver._inGke())
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470'),
+        compat.as_bytes(TPUClusterResolver._gkeMaster()))
+    del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 23b31ae1dcc83d8a7152354ac147de9ada320429..0708d6b7b9f0ba549aea091a265f42890e50d223 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -31,10 +31,14 @@ option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
 option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for contrib packages" OFF)
 option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
-option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
 
+# SIMD, MKL and MKLDNN options
+option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions" OFF)
+option(tensorflow_ENABLE_MKL_SUPPORT "Enable Intel MKL support" OFF)
+option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires MKL enabled" OFF)
+
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
@@ -80,7 +84,7 @@ if (NOT WIN32)
 
   option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
   if (systemlib_ALL)
-    set (systmelib_ZLIB ON)
+    set (systemlib_ZLIB ON)
   endif (systemlib_ALL)
 endif()
 
@@ -124,8 +128,16 @@ endif()
 
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
-  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      # 64 bits
+      add_definitions(-DWIN64)
+  elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+      # 32 bits
+      # temporary fix for #18241
+      add_definitions(-DEIGEN_DEFAULT_DENSE_INDEX_TYPE=std::int64_t)
+  endif()
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11)
+  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
   add_definitions(/bigobj /nologo /EHsc /GF /MP /Gm-)
@@ -160,14 +172,24 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
   endif()
 endif()
 
+include(CheckCXXCompilerFlag)
+
+# OpenMP Support
+CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
+if (GCC_OPENMP_SUPPORT)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+endif()
+CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
+if (MSVC_OPENMP_SUPPORT)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+endif()
+
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
   if (WIN32)
-    CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+    CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
     if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
-    else()
-      message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
     endif()
   endif()
 endif()
@@ -193,6 +215,7 @@ include(protobuf)
 include(re2)
 include(cub)
 include(sqlite)
+include(double_conversion)
 if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
@@ -213,6 +236,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${protobuf_STATIC_LIBRARIES}
     ${re2_STATIC_LIBRARIES}
     ${sqlite_STATIC_LIBRARIES}
+    ${double_conversion_STATIC_LIBRARIES}
 )
 
 if (systemlib_ZLIB)
@@ -240,6 +264,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     fft2d
     re2
     sqlite_copy_headers_to_destination
+    double_conversion
 )
 
 include_directories(
@@ -262,6 +287,7 @@ include_directories(
     ${PROTOBUF_INCLUDE_DIRS}
     ${re2_INCLUDE_DIR}
     ${sqlite_INCLUDE_DIR}
+    ${double_conversion_INCLUDE_DIR}
 )
 
 if(tensorflow_ENABLE_SSL_SUPPORT)
@@ -298,6 +324,49 @@ if(HAIKU)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
 endif()
 
+# MKL Support
+if (tensorflow_ENABLE_MKL_SUPPORT)
+  add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
+  if (WIN32)
+    find_path(MKL_HOME_PLATFORM mkl
+      PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
+      PATH_SUFFIXES windows)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS
+      ${MKL_HOME_PLATFORM}/mkl/lib/intel64
+      ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt
+      ${MKL_HOME_PLATFORM}/compiler/lib/intel64
+      ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib)
+    set(MKL_REDIST_DLL_DIRS
+      ${MKL_HOME_PLATFORM}/redist/intel64/mkl
+      ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt
+      ${MKL_HOME_PLATFORM}/redist/intel64/compiler)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES
+      mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64)
+  endif()
+  if (UNIX)
+    # Fix me: complete the path on linux
+    find_path(MKL_HOME_PLATFORM mkl
+      HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
+      PATH_SUFFIXES linux)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS) # incompleted
+    set(MKL_REDIST_SO_DIRS) # incompleted
+  endif()
+  include_directories(${MKL_INCLUDE_DIRS})
+  link_directories(${MKL_LINK_DIRS})
+  if (tensorflow_ENABLE_MKLDNN_SUPPORT)
+    include(mkldnn)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
+    list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
+    include_directories(${mkldnn_INCLUDE_DIRS})
+  else (tensorflow_ENABLE_MKLDNN_SUPPORT)
+    add_definitions(-DINTEL_MKL_ML)
+  endif()
+endif (tensorflow_ENABLE_MKL_SUPPORT)
+
 if (tensorflow_ENABLE_GPU)
   if (NOT WIN32)
     # Default install paths for cuda libraries in Linux
@@ -409,6 +478,10 @@ if (tensorflow_ENABLE_GPU)
   include_directories(${tensorflow_source_dir}/third_party/gpus)
   # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+  if(NOT WIN32)
+    # add gomp to tensorflow_EXTERNAL_LIBRARIES, needed by libcusolver.so
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES gomp)
+  endif()
 
   # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
   # in the default build is upgraded.
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 8f85a75ee466dbac524a1266dc2522109ca77cd5..0b79f718d4823a987e02804f59a432ee46d0ada3 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -26,7 +26,7 @@ The CMake files in this directory can build the core TensorFlow runtime, an
 example C++ binary, and a PIP package containing the runtime and Python
 bindings.
 
-### Pre-requisites
+### Prerequisites
 
 * CMake version 3.5 or later.
 
@@ -34,14 +34,16 @@ bindings.
 
 * [SWIG](http://www.swig.org/download.html)
 
-* Additional pre-requisites for Microsoft Windows:
+* Additional prerequisites for Microsoft Windows:
   - Visual Studio 2015
   - Python 3.5
-  - NumPy 1.11.0 or later
 
-* Additional pre-requisites for Linux:
+* Additional prerequisites for Linux:
   - Python 2.7 or later
   - [Docker](https://www.docker.com/) (for automated testing)
+
+* Python dependencies:
+  - wheel
   - NumPy 1.11.0 or later
 
 ### Known-good configurations
@@ -102,7 +104,7 @@ ops or APIs.
 Step-by-step Windows build
 ==========================
 
-1. Install the pre-requisites detailed above, and set up your environment.
+1. Install the prerequisites detailed above, and set up your environment.
 
    * The following commands assume that you are using the Windows Command
      Prompt (`cmd.exe`). You will need to set up your environment to use the
@@ -126,6 +128,18 @@ Step-by-step Windows build
      D:\local\cuda\bin
      ```
 
+   * When building with MKL support after installing [MKL](https://software.intel.com/en-us/mkl) from INTEL, append its bin directories to your PATH environment variable.
+
+     In case TensorFlow fails to find the MKL dll's during initialization, check your PATH environment variable.
+     It should contain the directory of the MKL dlls. For example:
+
+     ```
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\mkl
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\compiler
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\tbb\vc_mt
+     ```
+
+
    * We assume that `cmake` and `git` are installed and in your `%PATH%`. If
      for example `cmake` is not in your path and it is installed in
      `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
@@ -164,7 +178,15 @@ Step-by-step Windows build
    More? -Dtensorflow_ENABLE_GPU=ON ^
    More? -DCUDNN_HOME="D:\...\cudnn"
    ```
+   To build with MKL support add "^" at the end of the last line above following with:
+
+   ```
+   More? -Dtensorflow_ENABLE_MKL_SUPPORT=ON ^
+   More? -DMKL_HOME="D:\...\compilers_and_libraries"
+   ```
+
    To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
+
    ```
    More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
    ```
@@ -224,6 +246,7 @@ Step-by-step Windows build
      ```
      ctest -C RelWithDebInfo
      ```
+
    * `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python tests on
      serveral major packages. This option is only valid if this and tensorflow_BUILD_PYTHON_TESTS are both set as `ON`.
      After building the python wheel, you need to install the new wheel before running the tests.
@@ -232,6 +255,12 @@ Step-by-step Windows build
      ctest -C RelWithDebInfo
      ```
 
+   * `-Dtensorflow_ENABLE_MKL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL support. If MKL is enabled you need to install the [Intel Math Kernal Library](https://software.intel.com/en-us/mkl).
+     CMake will expect the location of MKL in -MKL_HOME=path_you_install_mkl.
+
+   * `-Dtensorflow_ENABLE_MKLDNN_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL DNN support. MKL DNN is [Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](https://github.com/intel/mkl-dnn). You have to add `-Dtensorflow_ENABLE_MKL_SUPPORT=ON` before including MKL DNN support.
+
+
 4. Invoke MSBuild to build TensorFlow.
 
    To build the C++ example program, which will be created as a `.exe`
@@ -249,6 +278,7 @@ Step-by-step Windows build
    D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
    ```
 
+
 Linux Continuous Integration build
 ==================================
 
diff --git a/tensorflow/contrib/cmake/external/cub.cmake b/tensorflow/contrib/cmake/external/cub.cmake
index 836889895567f679d9960e29ece1600d1a7a58eb..98a8c7e736e5c8c407b90e8eac440cdc7ab21579 100644
--- a/tensorflow/contrib/cmake/external/cub.cmake
+++ b/tensorflow/contrib/cmake/external/cub.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(cub_URL https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.4.zip)
-set(cub_HASH SHA256=20a1a39fd97e5da7f40f5f2e7fd73fd2ea59f9dc4bb8a6c5f228aa543e727e31)
+set(cub_URL https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.8.0.zip)
+set(cub_HASH SHA256=6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3)
 set(cub_BUILD ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub)
 set(cub_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub)
 set(cub_ARCHIVE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/cub_archive)
diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..527ccdc8d887cb4c2e7d2412c99a8bc682568472
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/double_conversion.cmake
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion)
+set(double_conversion_URL https://github.com/google/double-conversion.git)
+set(double_conversion_TAG 5664746)
+set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR})
+set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so)
+set(double_conversion_INCLUDES ${double_conversion_BUILD})
+
+if(WIN32)
+  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib)
+else()
+  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a)
+endif()
+
+set(double_conversion_HEADERS
+    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h"
+)
+
+ExternalProject_Add(double_conversion
+    PREFIX double_conversion
+    GIT_REPOSITORY ${double_conversion_URL}
+    GIT_TAG ${double_conversion_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND ""
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index a235442dc5c0a07e249653381436eeae81575883..cdaa6b73b93666d272faacb869e8272561a2c74c 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
-set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip)
+set(gemmlowp_HASH SHA256=b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index a9f43a3ecba4830533efcc13f8c4c1c61fe1ef78..693dc7cd673233b889b35a3f3170b57581da9a9f 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 730b778632e79cc3c96ad237f282d687ee325ce7)
+set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -35,6 +35,8 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
 
diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..a639fdee367f060d4c8a79267803da6ffe3dc503
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/mkldnn.cmake
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(mkldnn_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/include)
+set(mkldnn_URL https://github.com/01org/mkl-dnn.git)
+set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src)
+set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291)
+
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib)
+  else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib)
+  endif()
+else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a)
+endif()
+
+ExternalProject_Add(mkldnn
+    PREFIX mkldnn
+    GIT_REPOSITORY ${mkldnn_URL}
+    GIT_TAG ${mkldnn_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${mkldnn_STATIC_LIBRARIES}
+    INSTALL_COMMAND ""
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DMKLINC:STRING=${MKL_INCLUDE_DIRS}
+)
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index f3a37ff5088e3f9e54e38c0edb5777c27b26969f..b9d1dd88d4c2d3c9141ba56e14911e06b4d33f7c 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 8502189abfa44c249c01c2cad64e6ed660a9a668)
+set(nsync_TAG 0559ce013feac8db639ee1bf776aca0325d28777)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 6cd66a65990e7a2b963b52b310061b551752cd4d..ad2af01bc002555ce48f8b9bfb7d8d724a1a7dc8 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -15,32 +15,33 @@
 include (ExternalProject)
 
 set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive)
-set(png_URL https://storage.googleapis.com/libpng-public-archive/libpng-1.2.53.tar.gz)
-set(png_HASH SHA256=e05c9056d7f323088fd7824d8c6acc03a4a758c4b4916715924edc5dd3223a72)
+set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz)
+set(png_HASH SHA256=e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef)
 set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(png_STATIC_LIBRARIES 
-      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
   else()
     if(CMAKE_BUILD_TYPE EQUAL Debug)
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib)
     else()
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
     endif()
   endif()
 else()
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
+  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng16.a)
 endif()
 
 set(png_HEADERS
-    "${png_INSTALL}/include/libpng12/png.h"
-    "${png_INSTALL}/include/libpng12/pngconf.h"
+    "${png_INSTALL}/include/libpng16/png.h"
+    "${png_INSTALL}/include/libpng16/pngconf.h"
+    "${png_INSTALL}/include/libpng16/pnglibconf.h"
 )
 
 ExternalProject_Add(png
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index aba8a5244e17d717293deec6d9b6e8e725ef010e..ab464bc99a43138130bb2758ae28ecef29805c31 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a)
+set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 57c4ae76517e4d7247093edd5e5bd95a83258d87..7f835d2d519273a6d52d12f92ed585a4ddbeb973 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(sqlite_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/sqlite)
-set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
-set(sqlite_HASH SHA256=208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4)
+set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip)
+set(sqlite_HASH SHA256=4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc)
 set(sqlite_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sqlite/src/sqlite)
 set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
 
diff --git a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
index aaae18a313dd082b428654091c9411600c981ec9..6f059c7225dd0938b758e8f9c28ec36fcff6db4c 100644
--- a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
@@ -42,7 +42,6 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
   include_directories ("${PROJECT_SOURCE_DIR}/platform/c++11")
   add_definitions ("-DNSYNC_USE_CPP11_TIMEPOINT -DNSYNC_ATOMIC_CPP11")
   set (NSYNC_OS_CPP_SRC
-    "platform/c++11/src/nsync_semaphore_mutex.cc"
     "platform/c++11/src/per_thread_waiter.cc"
     "platform/c++11/src/yield.cc"
     "platform/c++11/src/time_rep_timespec.cc"
@@ -52,6 +51,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/win32")
     add_compile_options ("/TP")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       "platform/win32/src/clock_gettime.c"
       "platform/win32/src/pthread_key_win32.cc"
       ${NSYNC_OS_CPP_SRC}
@@ -68,6 +68,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
       ${NSYNC_OS_CPP_SRC}
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       "platform/posix/src/clock_gettime.c"
       "platform/posix/src/nsync_semaphore_mutex.c"
     )
@@ -75,9 +76,11 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
       "platform/posix/src/start_thread.c"
     )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "LinuxX")
+    include_directories (BEFORE "${PROJECT_SOURCE_DIR}/platform/c++11.futex")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/linux/src/nsync_semaphore_futex.c"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
@@ -87,6 +90,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
@@ -96,6 +100,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
@@ -105,6 +110,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index bfe53c01b3b5fb9db8a5d8fa280d1d7f98974882..6468bed4979253be5c20666d26bf24fa479d64a0 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -79,9 +79,11 @@ tensorflow/python/keras/_impl/keras/preprocessing
 tensorflow/python/keras/_impl/keras/utils
 tensorflow/python/keras/_impl/keras/wrappers
 tensorflow/python/kernel_tests
+tensorflow/python/kernel_tests/boosted_trees
 tensorflow/python/kernel_tests/distributions
 tensorflow/python/kernel_tests/linalg
 tensorflow/python/kernel_tests/random
+tensorflow/python/kernel_tests/testdata
 tensorflow/python/layers
 tensorflow/python/lib
 tensorflow/python/lib/core
@@ -102,6 +104,8 @@ tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
 tensorflow/tools
+tensorflow/tools/api
+tensorflow/tools/api/generator
 tensorflow/tools/graph_transforms
 tensorflow/contrib
 tensorflow/contrib/all_reduce
@@ -125,7 +129,13 @@ tensorflow/contrib/boosted_trees/kernels
 tensorflow/contrib/boosted_trees/ops
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/boosted_trees/python
+tensorflow/contrib/boosted_trees/python/kernel_tests
 tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/boosted_trees/python/training
+tensorflow/contrib/boosted_trees/python/training/functions
+tensorflow/contrib/boosted_trees/python/utils
+tensorflow/contrib/checkpoint
+tensorflow/contrib/checkpoint/python
 tensorflow/contrib/cloud
 tensorflow/contrib/cloud/kernels
 tensorflow/contrib/cloud/ops
@@ -138,8 +148,11 @@ tensorflow/contrib/coder
 tensorflow/contrib/coder/kernels
 tensorflow/contrib/coder/ops
 tensorflow/contrib/coder/python
+tensorflow/contrib/coder/python/layers
 tensorflow/contrib/coder/python/ops
 tensorflow/contrib/compiler
+tensorflow/contrib/constrained_optimization
+tensorflow/contrib/constrained_optimization/python
 tensorflow/contrib/copy_graph
 tensorflow/contrib/copy_graph/python
 tensorflow/contrib/copy_graph/python/util
@@ -147,8 +160,6 @@ tensorflow/contrib/crf
 tensorflow/contrib/crf/python
 tensorflow/contrib/crf/python/ops
 tensorflow/contrib/cudnn_rnn
-tensorflow/contrib/cudnn_rnn/kernels
-tensorflow/contrib/cudnn_rnn/ops
 tensorflow/contrib/cudnn_rnn/python
 tensorflow/contrib/cudnn_rnn/python/layers
 tensorflow/contrib/cudnn_rnn/python/ops
@@ -160,6 +171,9 @@ tensorflow/contrib/data/python/ops
 tensorflow/contrib/decision_trees
 tensorflow/contrib/decision_trees/proto
 tensorflow/contrib/deprecated
+tensorflow/contrib/distribute
+tensorflow/contrib/distribute/python
+tensorflow/contrib/distribute/python/examples
 tensorflow/contrib/distributions
 tensorflow/contrib/distributions/python
 tensorflow/contrib/distributions/python/ops
@@ -331,6 +345,7 @@ tensorflow/contrib/nccl/kernels
 tensorflow/contrib/nccl/ops
 tensorflow/contrib/nccl/python
 tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
 tensorflow/contrib/nearest_neighbor/python
@@ -341,6 +356,7 @@ tensorflow/contrib/nn/python/ops
 tensorflow/contrib/opt
 tensorflow/contrib/opt/python
 tensorflow/contrib/opt/python/training
+tensorflow/contrib/optimizer_v2
 tensorflow/contrib/pi_examples
 tensorflow/contrib/pi_examples/camera
 tensorflow/contrib/pi_examples/label_image
@@ -349,6 +365,9 @@ tensorflow/contrib/periodic_resample
 tensorflow/contrib/periodic_resample/python
 tensorflow/contrib/periodic_resample/python/ops
 tensorflow/contrib/predictor
+tensorflow/contrib/proto
+tensorflow/contrib/proto/python
+tensorflow/contrib/proto/python/ops
 tensorflow/contrib/quantization
 tensorflow/contrib/quantization/python
 tensorflow/contrib/quantize
@@ -357,6 +376,10 @@ tensorflow/contrib/receptive_field
 tensorflow/contrib/receptive_field/python
 tensorflow/contrib/receptive_field/python/util
 tensorflow/contrib/receptive_field/python/util/examples
+tensorflow/contrib/recurrent
+tensorflow/contrib/recurrent/python
+tensorflow/contrib/recurrent/python/ops
+tensorflow/contrib/recurrent/python/kernel_tests
 tensorflow/contrib/reduce_slice_ops
 tensorflow/contrib/reduce_slice_ops/kernels
 tensorflow/contrib/reduce_slice_ops/ops
@@ -377,6 +400,9 @@ tensorflow/contrib/rnn/ops
 tensorflow/contrib/rnn/python
 tensorflow/contrib/rnn/python/kernel_tests
 tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/rpc
+tensorflow/contrib/rpc/python
+tensorflow/contrib/rpc/python/ops
 tensorflow/contrib/saved_model
 tensorflow/contrib/saved_model/python
 tensorflow/contrib/saved_model/python/saved_model
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 8a9c406d8b118c10ddcaafb0e4fc242aa79cdb57..d63c41db844af243f0c6600b1565635ac9b91cac 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -1,4 +1,5 @@
 tensorflow/core
+tensorflow/core/kernels/boosted_trees
 tensorflow/core/profiler
 tensorflow/python
 tensorflow/contrib/boosted_trees/proto
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 96ac60d095dbc84470ff1be92f4bf52bb420fc52..a54cbff33b66d63d7229fa2f50b8a4ca962111ed 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -63,6 +63,12 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
     "${tensorflow_source_dir}/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc"
 )
+file(GLOB_RECURSE tf_core_cpu_whitelisted_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_id.h"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_id.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc"
+)
+list(REMOVE_ITEM tf_core_cpu_exclude_srcs ${tf_core_cpu_whitelisted_srcs})
 list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs})
 
 if (tensorflow_ENABLE_GPU)
@@ -79,6 +85,7 @@ if (tensorflow_ENABLE_GPU)
      "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
   )
   list(REMOVE_ITEM tf_core_gpu_srcs ${tf_core_gpu_exclude_srcs})
+  list(REMOVE_ITEM tf_core_gpu_srcs ${tf_core_cpu_whitelisted_srcs})
   list(APPEND tf_core_cpu_srcs ${tf_core_gpu_srcs})
 endif()
 
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347fe60f87806736befc677541a93e7e93..b47c32f1c48b3d42fe5b4ba115cc2a511b7ee5f4 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
@@ -341,9 +341,3 @@ add_dependencies(tf_core_framework
     tf_core_lib
     proto_text
 )
-
-if(WIN32)
-  # Cmake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on.
-  # Instead of defining this global, limit it to tf_core_framework where its used.
-  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
-endif()
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index f219d5eb577afa9edaadca09aef9869c81d2bd87..f38c9e05135f9f8d2fb3e2efedb7223e06e4983a 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -63,14 +63,15 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/unique_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
@@ -176,6 +177,16 @@ if(WIN32)
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
   )
   list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
+else(WIN32)
+  if(tensorflow_ENABLE_GPU)
+    file(GLOB_RECURSE tf_core_kernels_gpu_exclude_srcs
+        # temporarily disable nccl as it needs to be ported with gpu
+        "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
+    )
+    list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_gpu_exclude_srcs})
+  endif(tensorflow_ENABLE_GPU)
 endif(WIN32)
 
 file(GLOB_RECURSE tf_core_gpu_kernels_srcs
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 59e094812aaf4da2549d96314fc550e5635f9de8..e558691de4b74988031f7b2204aad92e8c7af68b 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -15,19 +15,23 @@
 set(tf_op_lib_names
     "audio_ops"
     "array_ops"
-		"batch_ops"
+    "batch_ops"
     "bitwise_ops"
+    "boosted_trees_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
     "control_flow_ops"
     "ctc_ops"
+    "cudnn_rnn_ops"
     "data_flow_ops"
     "dataset_ops"
+    "decode_proto_ops"
+    "encode_proto_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
     "linalg_ops"
-		"list_ops"
+    "list_ops"
     "lookup_ops"
     "logging_ops"
     "manip_ops"
@@ -38,6 +42,7 @@ set(tf_op_lib_names
     "random_ops"
     "remote_fused_graph_ops"
     "resource_variable_ops"
+    "rpc_ops"
     "script_ops"
     "sdca_ops"
     "set_ops"
@@ -47,7 +52,7 @@ set(tf_op_lib_names
     "state_ops"
     "stateless_random_ops"
     "string_ops"
-		"summary_ops"
+    "summary_ops"
     "training_ops"
 )
 
@@ -84,7 +89,6 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(cudnn_rnn "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index b730ebd3baacafe8ae401e8987104f3062372954..c4bdb69d828b269e6246777e74c3756ba1c4b96f 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -319,6 +319,7 @@ GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
 GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
+GENERATE_PYTHON_OP_LIB("boosted_trees_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
 GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("candidate_sampling_ops")
@@ -326,8 +327,13 @@ GENERATE_PYTHON_OP_LIB("checkpoint_ops")
 GENERATE_PYTHON_OP_LIB("control_flow_ops"
   ADDITIONAL_LIBRARIES $<TARGET_OBJECTS:tf_no_op>)
 GENERATE_PYTHON_OP_LIB("ctc_ops")
+GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("encode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
@@ -341,6 +347,8 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
@@ -348,6 +356,7 @@ GENERATE_PYTHON_OP_LIB("state_ops")
 GENERATE_PYTHON_OP_LIB("sparse_ops")
 GENERATE_PYTHON_OP_LIB("spectral_ops")
 GENERATE_PYTHON_OP_LIB("string_ops")
+GENERATE_PYTHON_OP_LIB("summary_ops")
 GENERATE_PYTHON_OP_LIB("user_ops")
 GENERATE_PYTHON_OP_LIB("training_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/training/gen_training_ops.py)
@@ -366,8 +375,6 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_coder_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_cudnn_rnn_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
@@ -419,8 +426,6 @@ GENERATE_PYTHON_OP_LIB("stateless_random_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
 GENERATE_PYTHON_OP_LIB("debug_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/debug/ops/gen_debug_ops.py)
-GENERATE_PYTHON_OP_LIB("summary_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/summary/gen_summary_ops.py)
 
 add_custom_target(tf_python_ops SOURCES ${tf_python_ops_generated_files} ${PYTHON_PROTO_GENFILES})
 add_dependencies(tf_python_ops tf_python_op_gen_main)
@@ -475,6 +480,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor_bridge.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
@@ -547,12 +554,13 @@ if(WIN32)
         set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.def")
     endif()
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+    math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
             --input "${pywrap_tensorflow_internal_static_dependencies}"
             --output "${pywrap_tensorflow_deffile}"
             --target _pywrap_tensorflow_internal.pyd
+            --bitness "${tensorflow_target_bitness}"
         BYPRODUCTS ${pywrap_tensorflow_deffile} # Required for Ninja
     )
 endif(WIN32)
@@ -582,6 +590,12 @@ add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_deffile}
 )
 
+# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Adding the following libraries fixes it.
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(pywrap_tensorflow_internal PRIVATE gcc_s gcc)
+endif()
+
 if(WIN32)
     add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
 endif(WIN32)
@@ -685,6 +699,77 @@ AddUserOps(TARGET _beam_search_ops
     DEPENDS pywrap_tensorflow_internal tf_python_ops
     DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
+else()
+  add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
+endif()
+
+
+########################################################
+# Generate API __init__.py files.
+########################################################
+
+# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
+
+set(api_init_files "")
+foreach(api_init_file ${api_init_files_list})
+    string(STRIP "${api_init_file}" api_init_file)
+    if(api_init_file)
+        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
+    endif()
+endforeach(api_init_file)
+set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
+file(WRITE "${api_init_list_file}" "${api_init_files}")
+
+# Run create_python_api.py to generate __init__.py files.
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+      # this step is running since the files aren't there yet.
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
+
+      # Re-add tensorflow/__init__.py back.
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+)
+
+add_custom_target(tf_python_api SOURCES ${api_init_files})
+add_dependencies(tf_python_api tf_python_ops)
+
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -694,6 +779,7 @@ add_dependencies(tf_python_build_pip_package
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
+    tf_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
@@ -706,25 +792,6 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  else()
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  endif()
-else()
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
-endif()
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 6d36d5fc5c2854b2d7d2542a3cb12e033e193b88..38f40452b533fdc0dba6ac686a0ff43a2ef13cb8 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -52,12 +52,13 @@ if(WIN32)
     set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/tensorflow.def")
   endif()
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+  math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
   add_custom_command(TARGET tensorflow_static POST_BUILD
       COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
           --input "${tensorflow_static_dependencies}"
           --output "${tensorflow_deffile}"
           --target tensorflow.dll
+          --bitness "${tensorflow_target_bitness}"
   )
 endif(WIN32)
 
@@ -100,8 +101,7 @@ if(WIN32)
 endif(WIN32)
 
 target_include_directories(tensorflow PUBLIC 
-    $<INSTALL_INTERFACE:include/>
-    $<INSTALL_INTERFACE:include/external/nsync/public>)
+    $<INSTALL_INTERFACE:include/>)
 
 install(TARGETS tensorflow EXPORT tensorflow_export
         RUNTIME DESTINATION bin
@@ -133,10 +133,6 @@ install(DIRECTORY ${tensorflow_source_dir}/tensorflow/stream_executor/
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src/google/
         DESTINATION include/google
         FILES_MATCHING PATTERN "*.h")
-# nsync headers
-install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/
-        DESTINATION include/external/nsync
-        FILES_MATCHING PATTERN "*.h")
 # Eigen directory
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/
         DESTINATION include/Eigen)
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 91ca33f4c4d5f6c822f45b0676e6e46d2e4c2860..9a37b681194d4ef82b27a0160dd969f733ecad67 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -64,7 +64,15 @@ file(GLOB tf_stream_executor_srcs
 if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
+        "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.h"
+        "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.cc"
     )
+    if (NOT tensorflow_BUILD_CC_TESTS)
+        file(GLOB tf_stream_executor_gpu_tests
+            "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
+        )
+        list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
+    endif()
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
 endif()
 
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 1c4ebd7f0c1113bcd0857fb0858df2248499f920..92f2ab6dea8e7da5dd8481639eda24e31c06848f 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -195,9 +195,11 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/profiler/model_analyzer_test.py"
     # Fails because uses data dependencies with bazel
     "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py"
     # requires scipy
     "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py"
     # Takes very long to run without sharding (defined in bazel build file).
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/cwise_ops_test.py"
     # Loading resources in contrib doesn't seem to work on Windows
@@ -208,6 +210,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py"
     # Test is flaky on Windows GPU builds (b/38283730).
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/gmm_test.py"
+    # Disable following manual tag in BUILD.
+    "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py"
+
   )
   if (WIN32)
     set(tf_test_src_py_exclude
@@ -222,6 +227,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/debug/cli/curses_ui_test.py"
       # TFDBG grpc:// mode is not yet available on Windows.
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/debug/lib/grpc_large_data_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/source_remote_test.py"
       # stl on windows handles overflows different
@@ -278,6 +284,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py"  # Deadlocks
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/sloppy_transformation_dataset_op_test.py"  # b/65430561
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py"  # Segfaults on Windows.
       # tensor_forest tests (also note that we exclude the hybrid tests for now)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py"  # Results in wrong order.
@@ -475,6 +482,10 @@ if (tensorflow_BUILD_CC_TESTS)
     "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*_test.cc"
   )
 
+  list(REMOVE_ITEM tf_test_src_simple
+    ${tf_core_profiler_test_srcs}
+  )
+
   set(tf_test_lib tf_test_lib)
   add_library(${tf_test_lib} STATIC ${tf_src_testlib})
 
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index 53c2285699a6ca94e1e6b147080338b507f4d768..cffe069aa352f8a6f2c436bc70b62f54e2336ac6 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -63,7 +63,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"^(TFE_\w*)$|"
                         r"tensorflow::|"
                         r"functor::|"
-                        r"nsync_|"
+                        r"\?nsync_|"
                         r"perftools::gputools")
 
 # We want to identify data members explicitly in the DEF file, so that no one
@@ -87,6 +87,7 @@ def get_args():
                       required=True)
   parser.add_argument("--output", help="output deffile", required=True)
   parser.add_argument("--target", help="name of the target", required=True)
+  parser.add_argument("--bitness", help="build target bitness", required=True)
   args = parser.parse_args()
   return args
 
@@ -125,7 +126,10 @@ def main():
     # Header for the def file.
     def_fp.write("LIBRARY " + args.target + "\n")
     def_fp.write("EXPORTS\n")
-    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+    if args.bitness == "64":
+      def_fp.write("\t??1OpDef@tensorflow@@UEAA@XZ\n")
+    else:
+      def_fp.write("\t??1OpDef@tensorflow@@UAE@XZ\n")
 
     # Each symbols returned by undname matches the same position in candidates.
     # We compare on undname but use the decorated name from candidates.
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index ec3d550b70d2aaa23b989c44f3d86fa87cffb335..a2c6e413039ee3b5af3cb53d1af3325037536d36 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -1,5 +1,5 @@
 # Description:
-#   Contains entropy coding related modules.
+#   Contains tools related to data compression.
 
 package(default_visibility = [
     "//learning/brain:__subpackages__",
@@ -54,19 +54,27 @@ tf_gen_op_libs(
     ],
 )
 
+cc_library(
+    name = "range_coder_ops_util",
+    srcs = ["kernels/range_coder_ops_util.cc"],
+    hdrs = ["kernels/range_coder_ops_util.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_kernel_library(
     name = "range_coder_ops",
     srcs = [
         "kernels/range_coder_ops.cc",
-        "kernels/range_coder_ops_util.cc",
-    ],
-    hdrs = [
-        "kernels/range_coder_ops_util.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
         ":coder_ops_op_lib",
         ":range_coder",
+        ":range_coder_ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -92,6 +100,34 @@ tf_cc_test(
     ],
 )
 
+tf_kernel_library(
+    name = "pmf_to_cdf_op",
+    srcs = ["kernels/pmf_to_cdf_op.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":coder_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "pmf_to_cdf_op_test",
+    size = "small",
+    srcs = ["kernels/pmf_to_cdf_op_test.cc"],
+    deps = [
+        ":pmf_to_cdf_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
 cc_library(
     name = "all_ops",
     deps = [":coder_ops_op_lib"],
@@ -99,12 +135,16 @@ cc_library(
 
 cc_library(
     name = "all_kernels",
-    deps = [":range_coder_ops"],
+    deps = [
+        ":pmf_to_cdf_op",
+        ":range_coder_ops",
+    ],
 )
 
 tf_custom_op_library(
     name = "python/ops/_coder_ops.so",
     srcs = [
+        "kernels/pmf_to_cdf_op.cc",
         "kernels/range_coder.cc",
         "kernels/range_coder.h",
         "kernels/range_coder_ops.cc",
@@ -120,10 +160,21 @@ tf_gen_op_wrapper_py(
     deps = [":coder_ops_op_lib"],
 )
 
+py_library(
+    name = "coder_py",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":coder_ops_py",
+        ":entropybottleneck_py",
+    ],
+)
+
 tf_custom_op_py_library(
     name = "coder_ops_py",
     srcs = [
-        "__init__.py",
         "python/ops/coder_ops.py",
     ],
     dso = [
@@ -155,13 +206,43 @@ tf_py_test(
     main = "python/ops/coder_ops_test.py",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
+py_library(
+    name = "entropybottleneck_py",
+    srcs = [
+        "python/layers/entropybottleneck.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":coder_ops_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras:engine",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "entropybottleneck_py_test",
+    srcs = [
+        "python/layers/entropybottleneck_test.py",
+    ],
+    additional_deps = [
+        ":entropybottleneck_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:training",
+    ],
+    main = "python/layers/entropybottleneck_test.py",
 )
diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py
index b7e663e6f1359f399cdaa80e037635a8f7546b37..99b8ac7595ec632b2918e6b7ca22c06dd7f0a8b3 100644
--- a/tensorflow/contrib/coder/__init__.py
+++ b/tensorflow/contrib/coder/__init__.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Entropy code operations."""
+"""Data compression tools."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.coder.python.layers.entropybottleneck import *
 from tensorflow.contrib.coder.python.ops.coder_ops import *
 # pylint: enable=wildcard-import
 
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd5272ee6f20ac3537a2e378225ede5ee90782c5
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
@@ -0,0 +1,196 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+using errors::InvalidArgument;
+
+class PmfToCdfOp : public OpKernel {
+ public:
+  explicit PmfToCdfOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("precision", &precision_));
+    OP_REQUIRES(
+        context, 0 < precision_ && precision_ <= 16,
+        InvalidArgument("`precision` must be in [1, 16]: ", precision_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& pmf_tensor = context->input(0);
+
+    TensorShape shape = pmf_tensor.shape();
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(shape),
+                InvalidArgument("`pmf` should be at least 1-D."));
+    OP_REQUIRES(
+        context, shape.dim_size(shape.dims() - 1) > 1,
+        InvalidArgument("`pmf` size should be at least 2 in the last axis."));
+    shape.set_dim(shape.dims() - 1, shape.dim_size(shape.dims() - 1) + 1);
+
+    Tensor* cdf_tensor;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &cdf_tensor));
+
+    auto pmf = pmf_tensor.flat_inner_dims<float, 2>();
+    auto cdf = cdf_tensor->flat_inner_dims<int32, 2>();
+    CHECK_EQ(pmf.dimension(0), cdf.dimension(0));
+    CHECK_EQ(pmf.dimension(1) + 1, cdf.dimension(1));
+
+    const double n = pmf.dimension(1);
+    const int64 cost_per_unit = static_cast<int64>(50.0 * n * std::log2(n));
+    thread::ThreadPool* thread_pool =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    thread_pool->ParallelFor(
+        pmf.dimension(0), cost_per_unit,
+        [this, pmf, &cdf](int64 start, int64 limit) {
+          const gtl::ArraySlice<float>::size_type pmf_size = pmf.dimension(1);
+          for (int64 i = start; i < limit; ++i) {
+            cdf(i, 0) = 0;
+            PerShard({&pmf(i, 0), pmf_size}, {&cdf(i, 1), pmf_size});
+          }
+        });
+  }
+
+ private:
+  struct PenaltyItem {
+    PenaltyItem(int32* p, double mass) : pointer(p), mass(mass) {
+      penalty = ComputeNextPenalty();
+    }
+
+    void Decrease() {
+      CHECK_GT(*pointer, 1);
+      --*pointer;
+      penalty = ComputeNextPenalty();
+    }
+
+    friend bool operator<(const PenaltyItem& lhs, const PenaltyItem& rhs) {
+      return lhs.penalty < rhs.penalty;
+    }
+
+    double ComputeNextPenalty() {
+      if (*pointer <= 1) {
+        return std::numeric_limits<double>::infinity();
+      }
+      return mass * (std::log2(*pointer) - std::log2(*pointer - 1));
+    }
+
+    int32* pointer;
+    double mass;
+    double penalty;
+  };
+
+  struct GainItem {
+    GainItem(int32* p, double mass) : pointer(p), mass(mass) {
+      gain = ComputeNextGain();
+    }
+
+    void Increase() {
+      CHECK_GT(*pointer, 0);
+      ++*pointer;
+      gain = ComputeNextGain();
+    }
+
+    friend bool operator>(const GainItem& lhs, const GainItem& rhs) {
+      return lhs.gain > rhs.gain;
+    }
+
+    double ComputeNextGain() {
+      // Never increment zero value to non-zero value.
+      if (*pointer < 1) {
+        return -std::numeric_limits<double>::infinity();
+      }
+      return mass * (std::log2(*pointer + 1) - std::log2(*pointer));
+    }
+
+    int32* pointer;
+    double mass;
+    double gain;
+  };
+
+  void PerShard(gtl::ArraySlice<float> pmf,
+                gtl::MutableArraySlice<int32> cdf) const {
+    CHECK_EQ(pmf.size(), cdf.size());
+
+    const int32 normalizer = 1 << precision_;
+    std::transform(pmf.begin(), pmf.end(), cdf.begin(),
+                   [normalizer](float mass) {
+                     int32 value = std::rint(mass * normalizer);
+                     // NOTE: Consider checking if mass > 0.
+                     value = std::max(value, 1);
+                     return value;
+                   });
+
+    int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0);
+    if (sum > normalizer) {
+      std::vector<PenaltyItem> queue;
+      queue.reserve(cdf.size());
+      for (int i = 0; i < cdf.size(); ++i) {
+        queue.emplace_back(&cdf[i], pmf[i]);
+      }
+
+      std::sort(queue.begin(), queue.end());
+      while (sum-- > normalizer) {
+        queue[0].Decrease();
+        // Performs a linear search because this find_if is likely to return
+        // iterator very close to the begin.
+        auto iter = std::find_if(
+            std::next(queue.begin()), queue.end(),
+            [&queue](const PenaltyItem& rhs) { return queue[0] < rhs; });
+        std::rotate(queue.begin(), std::next(queue.begin()), iter);
+      }
+    } else if (sum < normalizer) {
+      std::vector<GainItem> queue;
+      queue.reserve(cdf.size());
+      for (int i = 0; i < cdf.size(); ++i) {
+        queue.emplace_back(&cdf[i], pmf[i]);
+      }
+
+      std::sort(queue.begin(), queue.end(), std::greater<GainItem>());
+      while (sum++ < normalizer) {
+        queue[0].Increase();
+        // Performs a linear search because this find_if is likely to return
+        // iterator very close to the begin.
+        auto iter = std::find_if(
+            std::next(queue.begin()), queue.end(),
+            [&queue](const GainItem& rhs) { return queue[0] > rhs; });
+        std::rotate(queue.begin(), std::next(queue.begin()), iter);
+      }
+    }
+    std::partial_sum(cdf.begin(), cdf.end(), cdf.begin());
+  }
+
+  int precision_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("PmfToQuantizedCdf").Device(DEVICE_CPU),
+                        PmfToCdfOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3408f6b519a33fbb8f23d19c16bc7138fc34c121
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <limits>
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+class PmfToQuantizedCdfOpTest : public OpsTestBase {
+ protected:
+  void SetupOp(int precision, Tensor* input) {
+    TF_ASSERT_OK(NodeDefBuilder("pmf_to_cdf", "PmfToQuantizedCdf")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("precision", precision)
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+
+    inputs_.clear();
+    inputs_.emplace_back(input);
+  }
+
+  void GenerateData(random::SimplePhilox* rand,
+                    gtl::MutableArraySlice<float> slice) {
+    constexpr float minimum = std::numeric_limits<float>::epsilon();
+    float sum = 0;
+    for (float& value : slice) {
+      value = std::max(rand->RandFloat(), minimum);
+      sum += value;
+    }
+    for (float& value : slice) {
+      value /= sum;
+    }
+  }
+
+  void Verify(int precision, const Tensor& pmf_tensor,
+              const Tensor& cdf_tensor) {
+    ASSERT_EQ(pmf_tensor.dims(), cdf_tensor.dims());
+    const int n = pmf_tensor.dims();
+
+    for (int i = 0; i < n - 1; ++i) {
+      EXPECT_EQ(pmf_tensor.dim_size(i), cdf_tensor.dim_size(i));
+    }
+
+    auto pmf = pmf_tensor.flat_inner_dims<float, 2>();
+    auto cdf = cdf_tensor.flat_inner_dims<int32, 2>();
+    EXPECT_EQ(pmf.dimension(1) + 1, cdf.dimension(1));
+
+    const int normalizer = 1 << precision;
+    for (int i = 0; i < pmf.dimension(0); ++i) {
+      EXPECT_EQ(0, cdf(i, 0));
+
+      TTypes<int32>::UnalignedConstVec cdf_slice(&cdf(i, 0), cdf.dimension(1));
+
+      for (int j = 1; j < cdf_slice.size(); ++j) {
+        const int32 diff = cdf_slice(j) - cdf_slice(j - 1);
+        EXPECT_GT(diff, 0);
+      }
+
+      EXPECT_EQ(cdf_slice(cdf_slice.size() - 1), normalizer);
+    }
+  }
+};
+
+TEST_F(PmfToQuantizedCdfOpTest, UnderSum) {
+  Tensor pmf(DT_FLOAT, {1, 10, 1, 32});
+  auto matrix = pmf.flat_inner_dims<float, 2>();
+  const std::size_t n = matrix.dimension(1);
+
+  random::PhiloxRandom gen(random::New64(), random::New64());
+  random::SimplePhilox rand(&gen);
+  for (int64 i = 0; i < matrix.dimension(0); ++i) {
+    GenerateData(&rand, {&matrix(i, 0), n});
+  }
+
+  pmf.flat<float>() = pmf.flat<float>() * 0.85f;
+
+  constexpr int kPrecision = 10;
+  SetupOp(kPrecision, &pmf);
+  TF_ASSERT_OK(RunOpKernel());
+
+  Verify(kPrecision, pmf, *GetOutput(0));
+}
+
+TEST_F(PmfToQuantizedCdfOpTest, OverSum) {
+  Tensor pmf(DT_FLOAT, {10, 1, 1, 100});
+  auto matrix = pmf.flat_inner_dims<float, 2>();
+
+  // Half of each PMF is filled with zeros. The op will round up zeros to ones,
+  // post quantization. These round ups are likely to make the sum over
+  // normalizer value.
+  matrix.setZero();
+  const std::size_t n = matrix.dimension(1) / 2;
+
+  random::PhiloxRandom gen(random::New64(), random::New64());
+  random::SimplePhilox rand(&gen);
+  for (int64 i = 0; i < matrix.dimension(0); ++i) {
+    GenerateData(&rand, {&matrix(i, 0), n});
+  }
+
+  constexpr int kPrecision = 7;
+  SetupOp(kPrecision, &pmf);
+  TF_ASSERT_OK(RunOpKernel());
+
+  Verify(kPrecision, pmf, *GetOutput(0));
+}
+
+TEST_F(PmfToQuantizedCdfOpTest, ShapeFn) {
+  ShapeInferenceTestOp op("PmfToQuantizedCdf");
+
+  INFER_OK(op, "?", "?");
+  INFER_OK(op, "[3]", "[4]");
+  INFER_OK(op, "[3,4]", "[d0_0,5]");
+  INFER_OK(op, "[3,4,5]", "[d0_0,d0_1,6]");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc
index 9056d1a6963d7be92f499db31385fb6afe2dc515..a185e07913f84a813d76a8c63741bd22a832c8b9 100644
--- a/tensorflow/contrib/coder/ops/coder_ops.cc
+++ b/tensorflow/contrib/coder/ops/coder_ops.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
@@ -76,7 +77,7 @@ are incorrect. For this reason, the range coder uses integer arithmetics and
 avoids using any floating point operations internally, and `cdf` should contain
 integers representing quantized probability mass rather than floating points. 
 
-data: An int32 tensor.
+data: An int16 tensor.
 cdf: An int32 tensor representing the CDF's of `data`. Each integer is divided
   by `2^precision` to represent a fraction.
 encoded: A range-coded scalar string.
@@ -111,9 +112,38 @@ potential performance issues, the decoder does not return error status.
 encoded: A scalar string tensor from RangeEncode.
 shape: An int32 1-D tensor representing the shape of the data encoded by
   RangeEncode.
-decoded: An int32 tensor with shape equal to `shape`.
+decoded: An int16 tensor with shape equal to `shape`.
 precision: The number of bits for probability quantization. Must be <= 16, and
   must match the precision used by RangeEncode that produced `encoded`.
 )doc");
+
+REGISTER_OP("PmfToQuantizedCdf")
+    .Input("pmf: float")
+    .Output("cdf: int32")
+    .Attr("precision: int >= 1")
+    .SetShapeFn([] (InferenceContext* c) {
+      ShapeHandle in;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &in));
+      DimensionHandle last;
+      TF_RETURN_IF_ERROR(c->Add(c->Dim(in, -1), 1, &last));
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->ReplaceDim(in, -1, last, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Converts PMF to quantized CDF. This op uses floating-point operations
+internally. Therefore the quantized output may not be consistent across multiple
+platforms. For entropy encoders and decoders to have the same quantized CDF on
+different platforms, the quantized CDF should be produced once and saved, then
+the saved quantized CDF should be used everywhere.
+
+After quantization, if PMF does not sum to 2^precision, then some values of PMF
+are increased or decreased to adjust the sum to equal to 2^precision.
+
+Note that the input PMF is pre-quantization. The input PMF is not normalized
+by this op prior to quantization. Therefore the user is responsible for
+normalizing PMF if necessary.
+)doc");
 // clang-format on
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py
new file mode 100644
index 0000000000000000000000000000000000000000..f039cb0f5265b920200f63c5bd5ebeb4e23826be
--- /dev/null
+++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py
@@ -0,0 +1,697 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Entropy bottleneck layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.coder.python.ops import coder_ops
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import engine
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+
+
+class EntropyBottleneck(engine.Layer):
+  """Entropy bottleneck layer.
+
+  This layer can be used to model the entropy (the amount of information
+  conveyed) of the tensor passing through it. During training, this can be used
+  to impose a (soft) entropy constraint on its activations, limiting the amount
+  of information flowing through the layer. Note that this is distinct from
+  other types of bottlenecks, which reduce the dimensionality of the space, for
+  example. Dimensionality reduction does not limit the amount of information,
+  and does not enable efficient data compression per se.
+
+  After training, this layer can be used to compress any input tensor to a
+  string, which may be written to a file, and to decompress a file which it
+  previously generated back to a reconstructed tensor (possibly on a different
+  machine having access to the same model checkpoint). The entropies estimated
+  during training or evaluation are approximately equal to the average length of
+  the strings in bits.
+
+  The layer implements a flexible probability density model to estimate entropy,
+  which is described in the appendix of the paper (please cite the paper if you
+  use this code for scientific work):
+
+  "Variational image compression with a scale hyperprior"
+
+  Johannes Ballé, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston
+
+  https://arxiv.org/abs/1802.01436
+
+  The layer assumes that the input tensor is at least 2D, with a batch dimension
+  at the beginning and a channel dimension as specified by `data_format`. The
+  layer trains an independent probability density model for each channel, but
+  assumes that across all other dimensions, the inputs are i.i.d. (independent
+  and identically distributed). Because the entropy (and hence, average
+  codelength) is a function of the densities, this assumption may have a direct
+  effect on the compression performance.
+
+  Because data compression always involves discretization, the outputs of the
+  layer are generally only approximations of its inputs. During training,
+  discretization is modeled using additive uniform noise to ensure
+  differentiability. The entropies computed during training are differential
+  entropies. During evaluation, the data is actually quantized, and the
+  entropies are discrete (Shannon entropies). To make sure the approximated
+  tensor values are good enough for practical purposes, the training phase must
+  be used to balance the quality of the approximation with the entropy, by
+  adding an entropy term to the training loss, as in the following example.
+
+  Here, we use the entropy bottleneck to compress the latent representation of
+  an autoencoder. The data vectors `x` in this case are 4D tensors in
+  `'channels_last'` format (for example, 16x16 pixel grayscale images).
+
+  The layer always produces exactly one auxiliary loss and one update op which
+  are only significant for compression and decompression. To use the compression
+  feature, the auxiliary loss must be minimized during or after training. After
+  that, the update op must be executed at least once. Here, we simply attach
+  them to the main training step.
+
+  Training:
+  ```
+  # Build autoencoder.
+  x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])
+  y = forward_transform(x)
+  entropy_bottleneck = EntropyBottleneck()
+  y_, likelihoods = entropy_bottleneck(y, training=True)
+  x_ = backward_transform(y_)
+
+  # Information content (= predicted codelength) in bits of each batch element
+  # (note that taking the natural logarithm and dividing by `log(2)` is
+  # equivalent to taking base-2 logarithms):
+  bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2)
+
+  # Squared difference of each batch element:
+  squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3))
+
+  # The loss is a weighted sum of mean squared error and entropy (average
+  # information content), where the weight controls the trade-off between
+  # approximation error and entropy.
+  main_loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits)
+
+  # Minimize loss and auxiliary loss, and execute update op.
+  main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
+  main_step = optimizer.minimize(main_loss)
+  # 1e-2 is a good starting point for the learning rate of the auxiliary loss,
+  # assuming Adam is used.
+  aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
+  aux_step = optimizer.minimize(entropy_bottleneck.losses[0])
+  step = tf.group(main_step, aux_step, entropy_bottleneck.updates[0])
+  ```
+
+  Evaluation:
+  ```
+  # Build autoencoder.
+  x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])
+  y = forward_transform(x)
+  y_, likelihoods = EntropyBottleneck()(y, training=False)
+  x_ = backward_transform(y_)
+
+  # Information content (= predicted codelength) in bits of each batch element:
+  bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2)
+
+  # Squared difference of each batch element:
+  squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3))
+
+  # The loss is a weighted sum of mean squared error and entropy (average
+  # information content), where the weight controls the trade-off between
+  # approximation error and entropy.
+  loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits)
+  ```
+
+  To be able to compress the bottleneck tensor and decompress it in a different
+  session, or on a different machine, you need three items:
+  - The compressed representations stored as strings.
+  - The shape of the bottleneck for these string representations as a `Tensor`,
+    as well as the number of channels of the bottleneck at graph construction
+    time.
+  - The checkpoint of the trained model that was used for compression. Note:
+    It is crucial that the auxiliary loss produced by this layer is minimized
+    during or after training, and that the update op is run after training and
+    minimization of the auxiliary loss, but *before* the checkpoint is saved.
+
+  Compression:
+  ```
+  x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])
+  y = forward_transform(x)
+  strings = EntropyBottleneck().compress(y)
+  shape = tf.shape(y)[1:]
+  ```
+
+  Decompression:
+  ```
+  strings = tf.placeholder(tf.string, shape=[None])
+  shape = tf.placeholder(tf.int32, shape=[3])
+  entropy_bottleneck = EntropyBottleneck(dtype=tf.float32)
+  y_ = entropy_bottleneck.decompress(strings, shape, channels=5)
+  x_ = backward_transform(y_)
+  ```
+  Here, we assumed that the tensor produced by the forward transform has 5
+  channels.
+
+  The above four use cases can also be implemented within the same session (i.e.
+  on the same `EntropyBottleneck` instance), for testing purposes, etc., by
+  calling the object more than once.
+
+  Arguments:
+    init_scale: Float. A scaling factor determining the initial width of the
+      probability densities. This should be chosen big enough so that the
+      range of values of the layer inputs roughly falls within the interval
+      [`-init_scale`, `init_scale`] at the beginning of training.
+    filters: An iterable of ints, giving the number of filters at each layer of
+      the density model. Generally, the more filters and layers, the more
+      expressive is the density model in terms of modeling more complicated
+      distributions of the layer inputs. For details, refer to the paper
+      referenced above. The default is `[3, 3, 3]`, which should be sufficient
+      for most practical purposes.
+    tail_mass: Float, between 0 and 1. The bottleneck layer automatically
+      determines the range of input values that should be represented based on
+      their frequency of occurrence. Values occurring in the tails of the
+      distributions will be clipped to that range during compression.
+      `tail_mass` determines the amount of probability mass in the tails which
+      is cut off in the worst case. For example, the default value of `1e-9`
+      means that at most 1 in a billion input samples will be clipped to the
+      range.
+    optimize_integer_offset: Boolean. Typically, the input values of this layer
+      are floats, which means that quantization during evaluation can be
+      performed with an arbitrary offset. By default, the layer determines that
+      offset automatically. In special situations, such as when it is known that
+      the layer will receive only full integer values during evaluation, it can
+      be desirable to set this argument to `False` instead, in order to always
+      quantize to full integer values.
+    likelihood_bound: Float. If positive, the returned likelihood values are
+      ensured to be greater than or equal to this value. This prevents very
+      large gradients with a typical entropy loss (defaults to 1e-9).
+    range_coder_precision: Integer, between 1 and 16. The precision of the range
+      coder used for compression and decompression. This trades off computation
+      speed with compression efficiency, where 16 is the slowest but most
+      efficient setting. Choosing lower values may increase the average
+      codelength slightly compared to the estimated entropies.
+    data_format: Either `'channels_first'` or `'channels_last'` (default).
+    trainable: Boolean. Whether the layer should be trained.
+    name: String. The name of the layer.
+    dtype: Default dtype of the layer's parameters (default of `None` means use
+      the type of the first input).
+
+  Read-only properties:
+    init_scale: See above.
+    filters: See above.
+    tail_mass: See above.
+    optimize_integer_offset: See above.
+    likelihood_bound: See above.
+    range_coder_precision: See above.
+    data_format: See above.
+    name: String. See above.
+    dtype: See above.
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and non-trainable.
+    updates: List of update ops of this layer. Always contains exactly one
+      update op, which must be run once after the last training step, before
+      `compress` or `decompress` is used.
+    losses: List of losses added by this layer. Always contains exactly one
+      auxiliary loss, which must be added to the training loss.
+
+  Mutable properties:
+    trainable: Boolean. Whether the layer should be trained.
+    input_spec: Optional `InputSpec` object specifying the constraints on inputs
+      that can be accepted by the layer.
+  """
+
+  def __init__(self, init_scale=10, filters=(3, 3, 3), tail_mass=1e-9,
+               optimize_integer_offset=True, likelihood_bound=1e-9,
+               range_coder_precision=16, data_format="channels_last", **kwargs):
+    super(EntropyBottleneck, self).__init__(**kwargs)
+    self._init_scale = float(init_scale)
+    self._filters = tuple(int(f) for f in filters)
+    self._tail_mass = float(tail_mass)
+    if not 0 < self.tail_mass < 1:
+      raise ValueError(
+          "`tail_mass` must be between 0 and 1, got {}.".format(self.tail_mass))
+    self._optimize_integer_offset = bool(optimize_integer_offset)
+    self._likelihood_bound = float(likelihood_bound)
+    self._range_coder_precision = int(range_coder_precision)
+    self._data_format = data_format
+    self._channel_axis(2)  # trigger ValueError early
+    self.input_spec = engine.InputSpec(min_ndim=2)
+
+  @property
+  def init_scale(self):
+    return self._init_scale
+
+  @property
+  def filters(self):
+    return self._filters
+
+  @property
+  def tail_mass(self):
+    return self._tail_mass
+
+  @property
+  def optimize_integer_offset(self):
+    return self._optimize_integer_offset
+
+  @property
+  def likelihood_bound(self):
+    return self._likelihood_bound
+
+  @property
+  def range_coder_precision(self):
+    return self._range_coder_precision
+
+  @property
+  def data_format(self):
+    return self._data_format
+
+  def _channel_axis(self, ndim):
+    try:
+      return {"channels_first": 1, "channels_last": ndim - 1}[self.data_format]
+    except KeyError:
+      raise ValueError("Unsupported `data_format` for {} layer: {}.".format(
+          self.__class__.__name__, self.data_format))
+
+  def _logits_cumulative(self, inputs, stop_gradient):
+    """Evaluate logits of the cumulative densities.
+
+    Args:
+      inputs: The values at which to evaluate the cumulative densities, expected
+        to be a `Tensor` of shape `(channels, 1, batch)`.
+      stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so
+        that the gradient of the output with respect to the density model
+        parameters is disconnected (the gradient with respect to `inputs` is
+        left untouched).
+
+    Returns:
+      A `Tensor` of the same shape as `inputs`, containing the logits of the
+      cumulative densities evaluated at the given inputs.
+    """
+    logits = inputs
+
+    for i in range(len(self.filters) + 1):
+      matrix = self._matrices[i]
+      if stop_gradient:
+        matrix = array_ops.stop_gradient(matrix)
+      logits = math_ops.matmul(matrix, logits)
+
+      bias = self._biases[i]
+      if stop_gradient:
+        bias = array_ops.stop_gradient(bias)
+      logits += bias
+
+      if i < len(self._factors):
+        factor = self._factors[i]
+        if stop_gradient:
+          factor = array_ops.stop_gradient(factor)
+        logits += factor * math_ops.tanh(logits)
+
+    return logits
+
+  def build(self, input_shape):
+    """Builds the layer.
+
+    Creates the variables for the network modeling the densities, creates the
+    auxiliary loss estimating the median and tail quantiles of the densities,
+    and then uses that to create the probability mass functions and the update
+    op that produces the discrete cumulative density functions used by the range
+    coder.
+
+    Args:
+      input_shape: Shape of the input tensor, used to get the number of
+        channels.
+
+    Raises:
+      ValueError: if `input_shape` doesn't specify the length of the channel
+        dimension.
+    """
+    input_shape = tensor_shape.TensorShape(input_shape)
+    channel_axis = self._channel_axis(input_shape.ndims)
+    channels = input_shape[channel_axis].value
+    if channels is None:
+      raise ValueError("The channel dimension of the inputs must be defined.")
+    self.input_spec = engine.InputSpec(
+        ndim=input_shape.ndims, axes={channel_axis: channels})
+    filters = (1,) + self.filters + (1,)
+    scale = self.init_scale ** (1 / (len(self.filters) + 1))
+
+    # Create variables.
+    self._matrices = []
+    self._biases = []
+    self._factors = []
+    for i in range(len(self.filters) + 1):
+      init = np.log(np.expm1(1 / scale / filters[i + 1]))
+      matrix = self.add_variable(
+          "matrix_{}".format(i), dtype=self.dtype,
+          shape=(channels, filters[i + 1], filters[i]),
+          initializer=init_ops.Constant(init))
+      matrix = nn.softplus(matrix)
+      self._matrices.append(matrix)
+
+      bias = self.add_variable(
+          "bias_{}".format(i), dtype=self.dtype,
+          shape=(channels, filters[i + 1], 1),
+          initializer=init_ops.RandomUniform(-.5, .5))
+      self._biases.append(bias)
+
+      if i < len(self.filters):
+        factor = self.add_variable(
+            "factor_{}".format(i), dtype=self.dtype,
+            shape=(channels, filters[i + 1], 1),
+            initializer=init_ops.Zeros())
+        factor = math_ops.tanh(factor)
+        self._factors.append(factor)
+
+    # To figure out what range of the densities to sample, we need to compute
+    # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we
+    # can't take inverses of the cumulative directly, we make it an optimization
+    # problem:
+    # `quantiles = argmin(|logit(cumulative) - target|)`
+    # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`.
+    # Taking the logit (inverse of sigmoid) of the cumulative makes the
+    # representation of the right target more numerically stable.
+
+    # Numerically stable way of computing logits of `tail_mass / 2`
+    # and `1 - tail_mass / 2`.
+    target = np.log(2 / self.tail_mass - 1)
+    # Compute lower and upper tail quantile as well as median.
+    target = constant_op.constant([-target, 0, target], dtype=self.dtype)
+
+    def quantiles_initializer(shape, dtype=None, partition_info=None):
+      del partition_info  # unused
+      assert tuple(shape[1:]) == (1, 3)
+      init = constant_op.constant(
+          [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype)
+      return array_ops.tile(init, (shape[0], 1, 1))
+
+    quantiles = self.add_variable(
+        "quantiles", shape=(channels, 1, 3), dtype=self.dtype,
+        initializer=quantiles_initializer)
+    logits = self._logits_cumulative(quantiles, stop_gradient=True)
+    loss = math_ops.reduce_sum(abs(logits - target))
+    self.add_loss(loss, inputs=None)
+
+    # Save medians for `call`, `compress`, and `decompress`.
+    self._medians = quantiles[:, :, 1:2]
+    if not self.optimize_integer_offset:
+      self._medians = math_ops.round(self._medians)
+
+    # Largest distance observed between lower tail quantile and median,
+    # or between median and upper tail quantile.
+    minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1])
+    maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians)
+    minmax = math_ops.maximum(minima, maxima)
+    minmax = math_ops.ceil(minmax)
+    minmax = math_ops.maximum(minmax, 1)
+
+    # Sample the density up to `minmax` around the median.
+    samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype)
+    samples += self._medians
+
+    half = constant_op.constant(.5, dtype=self.dtype)
+    # We strip the sigmoid from the end here, so we can use the special rule
+    # below to only compute differences in the left tail of the sigmoid.
+    # This increases numerical stability (see explanation in `call`).
+    lower = self._logits_cumulative(samples - half, stop_gradient=True)
+    upper = self._logits_cumulative(samples + half, stop_gradient=True)
+    # Flip signs if we can move more towards the left tail of the sigmoid.
+    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
+    pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
+    # Add tail masses to first and last bin of pmf, as we clip values for
+    # compression, meaning that out-of-range values get mapped to these bins.
+    pmf = array_ops.concat([
+        math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]),
+        pmf[:, 0, 1:-1],
+        math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]),
+        ], axis=-1)
+    self._pmf = pmf
+
+    cdf = coder_ops.pmf_to_quantized_cdf(
+        pmf, precision=self.range_coder_precision)
+    def cdf_getter(*args, **kwargs):
+      del args, kwargs  # ignored
+      return variable_scope.get_variable(
+          "quantized_cdf", dtype=dtypes.int32, initializer=cdf,
+          trainable=False, validate_shape=False, collections=())
+    # Need to provide a fake shape here since add_variable insists on it.
+    self._quantized_cdf = self.add_variable(
+        "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32,
+        getter=cdf_getter, trainable=False)
+
+    update_op = state_ops.assign(
+        self._quantized_cdf, cdf, validate_shape=False)
+    self.add_update(update_op, inputs=None)
+
+    super(EntropyBottleneck, self).build(input_shape)
+
+  def call(self, inputs, training):
+    """Pass a tensor through the bottleneck.
+
+    Args:
+      inputs: The tensor to be passed through the bottleneck.
+      training: Boolean. If `True`, returns a differentiable approximation of
+        the inputs, and their likelihoods under the modeled probability
+        densities. If `False`, returns the quantized inputs and their
+        likelihoods under the corresponding probability mass function. These
+        quantities can't be used for training, as they are not differentiable,
+        but represent actual compression more closely.
+
+    Returns:
+      values: `Tensor` with the same shape as `inputs` containing the perturbed
+        or quantized input values.
+      likelihood: `Tensor` with the same shape as `inputs` containing the
+        likelihood of `values` under the modeled probability distributions.
+
+    Raises:
+      ValueError: if `inputs` has different `dtype` or number of channels than
+        a previous set of inputs the model was invoked with earlier.
+    """
+    inputs = ops.convert_to_tensor(inputs)
+    ndim = self.input_spec.ndim
+    channel_axis = self._channel_axis(ndim)
+    half = constant_op.constant(.5, dtype=self.dtype)
+
+    # Convert to (channels, 1, batch) format by commuting channels to front
+    # and then collapsing.
+    order = list(range(ndim))
+    order.pop(channel_axis)
+    order.insert(0, channel_axis)
+    values = array_ops.transpose(inputs, order)
+    shape = array_ops.shape(values)
+    values = array_ops.reshape(values, (shape[0], 1, -1))
+
+    # Add noise or quantize.
+    if training:
+      noise = random_ops.random_uniform(array_ops.shape(values), -half, half)
+      values = math_ops.add_n([values, noise])
+    elif self.optimize_integer_offset:
+      values = math_ops.round(values - self._medians) + self._medians
+    else:
+      values = math_ops.round(values)
+
+    # Evaluate densities.
+    # We can use the special rule below to only compute differences in the left
+    # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1
+    # for large x, 0 for small x. Subtracting two numbers close to 0 can be done
+    # with much higher precision than subtracting two numbers close to 1.
+    lower = self._logits_cumulative(values - half, stop_gradient=False)
+    upper = self._logits_cumulative(values + half, stop_gradient=False)
+    # Flip signs if we can move more towards the left tail of the sigmoid.
+    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
+    sign = array_ops.stop_gradient(sign)
+    likelihood = abs(
+        math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
+    if self.likelihood_bound > 0:
+      likelihood_bound = constant_op.constant(
+          self.likelihood_bound, dtype=self.dtype)
+      # TODO(jballe): Override gradients.
+      likelihood = math_ops.maximum(likelihood, likelihood_bound)
+
+    # Convert back to input tensor shape.
+    order = list(range(1, ndim))
+    order.insert(channel_axis, 0)
+    values = array_ops.reshape(values, shape)
+    values = array_ops.transpose(values, order)
+    likelihood = array_ops.reshape(likelihood, shape)
+    likelihood = array_ops.transpose(likelihood, order)
+
+    if not context.executing_eagerly():
+      values_shape, likelihood_shape = self.compute_output_shape(inputs.shape)
+      values.set_shape(values_shape)
+      likelihood.set_shape(likelihood_shape)
+
+    return values, likelihood
+
+  def compress(self, inputs):
+    """Compress inputs and store their binary representations into strings.
+
+    Args:
+      inputs: `Tensor` with values to be compressed.
+
+    Returns:
+      String `Tensor` vector containing the compressed representation of each
+      batch element of `inputs`.
+    """
+    with ops.name_scope(self._name_scope()):
+      inputs = ops.convert_to_tensor(inputs)
+      if not self.built:
+        # Check input assumptions set before layer building, e.g. input rank.
+        self._assert_input_compatibility(inputs)
+        if self.dtype is None:
+          self._dtype = inputs.dtype.base_dtype.name
+        self.build(inputs.shape)
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if not context.executing_eagerly():
+        self._assert_input_compatibility(inputs)
+
+      ndim = self.input_spec.ndim
+      channel_axis = self._channel_axis(ndim)
+      # Tuple of slices for expanding dimensions of tensors below.
+      slices = ndim * [None] + [slice(None)]
+      slices[channel_axis] = slice(None)
+      slices = tuple(slices)
+
+      # Expand dimensions of CDF to input dimensions, keeping the channels along
+      # the right dimension.
+      cdf = self._quantized_cdf[slices[1:]]
+      num_levels = array_ops.shape(cdf)[-1] - 1
+
+      # Bring inputs to the right range by centering the range on the medians.
+      half = constant_op.constant(.5, dtype=self.dtype)
+      medians = array_ops.squeeze(self._medians, [1, 2])
+      offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians
+      # Expand offsets to input dimensions and add to inputs.
+      values = inputs + offsets[slices[:-1]]
+
+      # Clip to range and cast to integers. Because we have added .5 above, and
+      # all values are positive, the cast effectively implements rounding.
+      values = math_ops.maximum(values, half)
+      values = math_ops.minimum(
+          values, math_ops.cast(num_levels, self.dtype) - half)
+      values = math_ops.cast(values, dtypes.int16)
+
+      def loop_body(tensor):
+        return coder_ops.range_encode(
+            tensor, cdf, precision=self.range_coder_precision)
+      strings = functional_ops.map_fn(
+          loop_body, values, dtype=dtypes.string, back_prop=False)
+
+      if not context.executing_eagerly():
+        strings.set_shape(inputs.shape[:1])
+
+      return strings
+
+  def decompress(self, strings, shape, channels=None):
+    """Decompress values from their compressed string representations.
+
+    Args:
+      strings: A string `Tensor` vector containing the compressed data.
+      shape: A `Tensor` vector of int32 type. Contains the shape of the tensor
+        to be decompressed, excluding the batch dimension.
+      channels: Integer. Specifies the number of channels statically. Needs only
+        be set if the layer hasn't been built yet (i.e., this is the first input
+        it receives).
+
+    Returns:
+      The decompressed `Tensor`. Its shape will be equal to `shape` prepended
+      with the batch dimension from `strings`.
+
+    Raises:
+      ValueError: If the length of `shape` isn't available at graph construction
+        time.
+    """
+    with ops.name_scope(self._name_scope()):
+      strings = ops.convert_to_tensor(strings)
+      shape = ops.convert_to_tensor(shape)
+      if self.built:
+        ndim = self.input_spec.ndim
+        channel_axis = self._channel_axis(ndim)
+        if channels is None:
+          channels = self.input_spec.axes[channel_axis]
+      else:
+        if not (shape.shape.is_fully_defined() and shape.shape.ndims == 1):
+          raise ValueError("`shape` must be a vector with known length.")
+        ndim = shape.shape[0].value + 1
+        channel_axis = self._channel_axis(ndim)
+        input_shape = ndim * [None]
+        input_shape[channel_axis] = channels
+        self.build(input_shape)
+
+      # Tuple of slices for expanding dimensions of tensors below.
+      slices = ndim * [None] + [slice(None)]
+      slices[channel_axis] = slice(None)
+      slices = tuple(slices)
+
+      # Expand dimensions of CDF to input dimensions, keeping the channels along
+      # the right dimension.
+      cdf = self._quantized_cdf[slices[1:]]
+      num_levels = array_ops.shape(cdf)[-1] - 1
+
+      def loop_body(string):
+        return coder_ops.range_decode(
+            string, shape, cdf, precision=self.range_coder_precision)
+      outputs = functional_ops.map_fn(
+          loop_body, strings, dtype=dtypes.int16, back_prop=False)
+      outputs = math_ops.cast(outputs, self.dtype)
+
+      medians = array_ops.squeeze(self._medians, [1, 2])
+      offsets = math_ops.cast(num_levels // 2, self.dtype) - medians
+      outputs -= offsets[slices[:-1]]
+
+      if not context.executing_eagerly():
+        outputs_shape = ndim * [None]
+        outputs_shape[0] = strings.shape[0]
+        outputs_shape[channel_axis] = channels
+        outputs.set_shape(outputs_shape)
+
+      return outputs
+
+  def visualize(self):
+    """Multi-channel visualization of densities as images.
+
+    Creates and returns an image summary visualizing the current probabilty
+    density estimates. The image contains one row for each channel. Within each
+    row, the pixel intensities are proportional to probability values, and each
+    row is centered on the median of the corresponding distribution.
+
+    Returns:
+      The created image summary.
+    """
+    with ops.name_scope(self._name_scope()):
+      image = self._pmf
+      image *= 255 / math_ops.reduce_max(image, axis=1, keepdims=True)
+      image = math_ops.cast(image + .5, dtypes.uint8)
+      image = image[None, :, :, None]
+    return summary.image("pmf", image, max_outputs=1)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    return input_shape, input_shape
diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..798b0234ebcce7df108a0da65d1305502ce0253a
--- /dev/null
+++ b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py
@@ -0,0 +1,315 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests of EntropyBottleneck class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.coder.python.layers import entropybottleneck
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class EntropyBottleneckTest(test.TestCase):
+
+  def test_noise(self):
+    # Tests that the noise added is uniform noise between -0.5 and 0.5.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck()
+    noisy, _ = layer(inputs, training=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      values = np.linspace(-50, 50, 100)[:, None]
+      noisy, = sess.run([noisy], {inputs: values})
+      self.assertFalse(np.allclose(values, noisy, rtol=0, atol=.49))
+      self.assertAllClose(values, noisy, rtol=0, atol=.5)
+
+  def test_quantization(self):
+    # Tests that inputs are quantized to full integer values, even after
+    # quantiles have been updated.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=False)
+    quantized, _ = layer(inputs, training=False)
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertTrue(len(layer.losses) == 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      values = np.linspace(-50, 50, 100)[:, None]
+      quantized, = sess.run([quantized], {inputs: values})
+      self.assertAllClose(np.around(values), quantized, rtol=0, atol=1e-6)
+
+  def test_quantization_optimized_offset(self):
+    # Tests that inputs are not quantized to full integer values after quantiles
+    # have been updated. However, the difference between input and output should
+    # be between -0.5 and 0.5, and the offset must be consistent.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=True)
+    quantized, _ = layer(inputs, training=False)
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertTrue(len(layer.losses) == 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      values = np.linspace(-50, 50, 100)[:, None]
+      quantized, = sess.run([quantized], {inputs: values})
+      self.assertAllClose(values, quantized, rtol=0, atol=.5)
+      diff = np.ravel(np.around(values) - quantized) % 1
+      self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6)
+      self.assertNotEqual(diff[0], 0)
+
+  def test_codec(self):
+    # Tests that inputs are compressed and decompressed correctly, and quantized
+    # to full integer values, even after quantiles have been updated.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=60,
+        optimize_integer_offset=False)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertTrue(len(layer.losses) == 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      values = np.linspace(-50, 50, 100)[None, :, None]
+      decoded, = sess.run([decoded], {inputs: values})
+      self.assertAllClose(np.around(values), decoded, rtol=0, atol=1e-6)
+
+  def test_codec_optimized_offset(self):
+    # Tests that inputs are compressed and decompressed correctly, and not
+    # quantized to full integer values after quantiles have been updated.
+    # However, the difference between input and output should be between -0.5
+    # and 0.5, and the offset must be consistent.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=60,
+        optimize_integer_offset=True)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertTrue(len(layer.losses) == 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      values = np.linspace(-50, 50, 100)[None, :, None]
+      decoded, = sess.run([decoded], {inputs: values})
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+      diff = np.ravel(np.around(values) - decoded) % 1
+      self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6)
+      self.assertNotEqual(diff[0], 0)
+
+  def test_codec_clipping(self):
+    # Tests that inputs are compressed and decompressed correctly, and clipped
+    # to the expected range.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=40)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      values = np.linspace(-50, 50, 100)[None, :, None]
+      decoded, = sess.run([decoded], {inputs: values})
+      expected = np.clip(np.around(values), -40, 40)
+      self.assertAllClose(expected, decoded, rtol=0, atol=1e-6)
+
+  def test_channels_last(self):
+    # Test the layer with more than one channel and multiple input dimensions,
+    # with the channels in the last dimension.
+    inputs = array_ops.placeholder(dtypes.float32, (None, None, None, 2))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=50)
+    noisy, _ = layer(inputs, training=True)
+    quantized, _ = layer(inputs, training=False)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      values = 5 * np.random.normal(size=(7, 5, 3, 2))
+      noisy, quantized, decoded = sess.run(
+          [noisy, quantized, decoded], {inputs: values})
+      self.assertAllClose(values, noisy, rtol=0, atol=.5)
+      self.assertAllClose(values, quantized, rtol=0, atol=.5)
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+
+  def test_channels_first(self):
+    # Test the layer with more than one channel and multiple input dimensions,
+    # with the channel dimension right after the batch dimension.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 3, None, None))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_first", init_scale=50)
+    noisy, _ = layer(inputs, training=True)
+    quantized, _ = layer(inputs, training=False)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      values = 5 * np.random.normal(size=(2, 3, 5, 7))
+      noisy, quantized, decoded = sess.run(
+          [noisy, quantized, decoded], {inputs: values})
+      self.assertAllClose(values, noisy, rtol=0, atol=.5)
+      self.assertAllClose(values, quantized, rtol=0, atol=.5)
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+
+  def test_compress(self):
+    # Test compression and decompression, and produce test data for
+    # `test_decompress`. If you set the constant at the end to `True`, this test
+    # will fail and the log will contain the new test data.
+    inputs = array_ops.placeholder(dtypes.float32, (2, 3, 10))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_first", filters=(), init_scale=2)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      values = 5 * np.random.uniform(size=(2, 3, 10)) - 2.5
+      bitstrings, quantized_cdf, decoded = sess.run(
+          [bitstrings, layer._quantized_cdf, decoded], {inputs: values})
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+      # Set this constant to `True` to log new test data for `test_decompress`.
+      if False:  # pylint:disable=using-constant-test
+        assert False, (bitstrings, quantized_cdf, decoded)
+
+  # Data generated by `test_compress`.
+  # pylint:disable=g-inconsistent-quotes,bad-whitespace
+  bitstrings = np.array([
+      b'\x1e\xbag}\xc2\xdaN\x8b\xbd.',
+      b'\x8dF\xf0%\x1cv\xccllW'
+  ], dtype=object)
+
+  quantized_cdf = np.array([
+      [    0, 15636, 22324, 30145, 38278, 65536],
+      [    0, 19482, 26927, 35052, 42904, 65535],
+      [    0, 21093, 28769, 36919, 44578, 65536]
+  ], dtype=np.int32)
+
+  expected = np.array([
+      [[-2.,  1.,  0., -2., -1., -2., -2., -2.,  2., -1.],
+       [ 1.,  2.,  1.,  0., -2., -2.,  1.,  2.,  0.,  1.],
+       [ 2.,  0., -2.,  2.,  0., -1., -2.,  0.,  2.,  0.]],
+      [[ 1.,  2.,  0., -1.,  1.,  2.,  1.,  1.,  2., -2.],
+       [ 2., -1., -1.,  0., -1.,  2.,  0.,  2., -2.,  2.],
+       [ 2., -2., -2., -1., -2.,  1., -2.,  0.,  0.,  0.]]
+  ], dtype=np.float32)
+  # pylint:enable=g-inconsistent-quotes,bad-whitespace
+
+  def test_decompress(self):
+    # Test that decompression of values compressed with a previous version
+    # works, i.e. that the file format doesn't change across revisions.
+    bitstrings = array_ops.placeholder(dtypes.string)
+    input_shape = array_ops.placeholder(dtypes.int32)
+    quantized_cdf = array_ops.placeholder(dtypes.int32)
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_first", filters=(), dtype=dtypes.float32)
+    layer.build(self.expected.shape)
+    layer._quantized_cdf = quantized_cdf
+    decoded = layer.decompress(bitstrings, input_shape[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      decoded, = sess.run([decoded], {
+          bitstrings: self.bitstrings, input_shape: self.expected.shape,
+          quantized_cdf: self.quantized_cdf})
+      self.assertAllClose(self.expected, decoded, rtol=0, atol=1e-6)
+
+  def test_build_decompress(self):
+    # Test that layer can be built when `decompress` is the first call to it.
+    bitstrings = array_ops.placeholder(dtypes.string)
+    input_shape = array_ops.placeholder(dtypes.int32, shape=[3])
+    layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32)
+    layer.decompress(bitstrings, input_shape[1:], channels=5)
+    self.assertTrue(layer.built)
+
+  def test_pmf_normalization(self):
+    # Test that probability mass functions are normalized correctly.
+    layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32)
+    layer.build((None, 10))
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      pmf, = sess.run([layer._pmf])
+      self.assertAllClose(np.ones(10), np.sum(pmf, axis=-1), rtol=0, atol=1e-6)
+
+  def test_visualize(self):
+    # Test that summary op can be constructed.
+    layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32)
+    layer.build((None, 10))
+    summary = layer.visualize()
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run([summary])
+
+  def test_normalization(self):
+    # Test that densities are normalized correctly.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck(filters=(2,))
+    _, likelihood = layer(inputs, training=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      x = np.repeat(np.arange(-200, 201), 1000)[:, None]
+      likelihood, = sess.run([likelihood], {inputs: x})
+      self.assertEqual(x.shape, likelihood.shape)
+      integral = np.sum(likelihood) * .001
+      self.assertAllClose(1, integral, rtol=0, atol=1e-4)
+
+  def test_entropy_estimates(self):
+    # Test that entropy estimates match actual range coding.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        filters=(2, 3), data_format="channels_last")
+    _, likelihood = layer(inputs, training=True)
+    diff_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2)
+    _, likelihood = layer(inputs, training=False)
+    disc_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2)
+    bitstrings = layer.compress(inputs)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertTrue(len(layer.updates) == 1)
+      sess.run(layer.updates[0])
+      diff_entropy, disc_entropy, bitstrings = sess.run(
+          [diff_entropy, disc_entropy, bitstrings],
+          {inputs: np.random.normal(size=(1, 10000, 1))})
+      codelength = 8 * sum(len(bitstring) for bitstring in bitstrings)
+      self.assertAllClose(diff_entropy, disc_entropy, rtol=5e-3, atol=0)
+      self.assertAllClose(disc_entropy, codelength, rtol=5e-3, atol=0)
+      self.assertGreater(codelength, disc_entropy)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index 388d8e6ed6d9cb9400b0bfbe8e3f50b80149ea1a..bcee0b04c8430588c2dcbc199504bede0436f8f1 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -46,15 +46,3 @@ cuda_py_test(
     ],
     xla_enabled = True,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py
index 29a593f6bcfa05dcafcdb2f94087380ad720dba1..b2f678fb29cedd3ec32f0460354cc4ac18fb63d3 100644
--- a/tensorflow/contrib/compiler/jit_test.py
+++ b/tensorflow/contrib/compiler/jit_test.py
@@ -175,7 +175,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
 
   def testCompilationInGradient(self):
     with self.test_session():
-      x = constant_op.constant([[3]])
+      x = constant_op.constant([[3.]])
       y_nc = math_ops.matmul(x, x, name="not_compiled")
       with jit.experimental_jit_scope():
         y_c = math_ops.matmul(y_nc, y_nc, name="compiled")
@@ -200,11 +200,11 @@ class CompilationEnabledInGradientTest(test.TestCase):
     with self.test_session(graph=ops.Graph()):
       with jit.experimental_jit_scope():
         # XlaScope 0
-        a1 = constant_op.constant([[1]])
+        a1 = constant_op.constant([[1.]])
         a1t = math_ops.matmul(a1, a1)
       with jit.experimental_jit_scope():
         # XlaScope 1
-        a2 = constant_op.constant([[1]])
+        a2 = constant_op.constant([[1.]])
         a2t = math_ops.matmul(a2, a2)
 
       self.assertEqual(b"jit_scope_0", a1.op.get_attr("_XlaScope"))
@@ -222,11 +222,11 @@ class CompilationEnabledInGradientTest(test.TestCase):
     with self.test_session(graph=ops.Graph()):
       with jit.experimental_jit_scope(True, separate_compiled_gradients=True):
         # XlaScope 0
-        a1 = constant_op.constant([[1]])
+        a1 = constant_op.constant([[1.]])
         a1t = math_ops.matmul(a1, a1)
       with jit.experimental_jit_scope(True, separate_compiled_gradients=True):
         # XlaScope 1
-        a2 = constant_op.constant([[1]])
+        a2 = constant_op.constant([[1.]])
         a2t = math_ops.matmul(a2, a2)
 
       self.assertEqual(b"jit_scope_0", a1.op.get_attr("_XlaScope"))
diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..619153df67c90cea5a5082a411972948bac5fe90
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/BUILD
@@ -0,0 +1,91 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+# Transitive dependencies of this target will be included in the pip package.
+py_library(
+    name = "constrained_optimization_pip",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+    ],
+)
+
+py_library(
+    name = "constrained_optimization",
+    srcs = [
+        "__init__.py",
+        "python/candidates.py",
+        "python/constrained_minimization_problem.py",
+        "python/constrained_optimizer.py",
+        "python/external_regret_optimizer.py",
+        "python/swap_regret_optimizer.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "candidates_test",
+    srcs = ["python/candidates_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+# NOTE: This library can't be "testonly" since it needs to be included in the
+# pip package.
+py_library(
+    name = "test_util",
+    srcs = ["python/test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+py_test(
+    name = "external_regret_optimizer_test",
+    srcs = ["python/external_regret_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "swap_regret_optimizer_test",
+    srcs = ["python/swap_regret_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c65a150464efc1e77419040f66f36fc6756325aa
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/README.md
@@ -0,0 +1,345 @@
+<!-- TODO(acotter): Add usage example of non-convex optimization and stochastic classification. -->
+
+# ConstrainedOptimization (TFCO)
+
+TFCO is a library for optimizing inequality-constrained problems in TensorFlow.
+Both the objective function and the constraints are represented as Tensors,
+giving users the maximum amount of flexibility in specifying their optimization
+problems.
+
+This flexibility makes optimization considerably more difficult: on a non-convex
+problem, if one uses the "standard" approach of introducing a Lagrange
+multiplier for each constraint, and then jointly maximizing over the Lagrange
+multipliers and minimizing over the model parameters, then a stable stationary
+point might not even *exist*. Hence, in some cases, oscillation, instead of
+convergence, is inevitable.
+
+Thankfully, it turns out that even if, over the course of optimization, no
+*particular* iterate does a good job of minimizing the objective while
+satisfying the constraints, the *sequence* of iterates, on average, usually
+will. This observation suggests the following approach: at training time, we'll
+periodically snapshot the model state during optimization; then, at evaluation
+time, each time we're given a new example to evaluate, we'll sample one of the
+saved snapshots uniformly at random, and apply it to the example. This
+*stochastic model* will generally perform well, both with respect to the
+objective function, and the constraints.
+
+In fact, we can do better: it's possible to post-process the set of snapshots to
+find a distribution over at most $$m+1$$ snapshots, where $$m$$ is the number of
+constraints, that will be at least as good (and will usually be much better)
+than the (much larger) uniform distribution described above. If you're unable or
+unwilling to use a stochastic model at all, then you can instead use a heuristic
+to choose the single best snapshot.
+
+For full details, motivation, and theoretical results on the approach taken by
+this library, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+which will be referred to as [CoJiSr18] throughout the remainder of this
+document.
+
+### Proxy Constraints
+
+Imagine that we want to constrain the recall of a binary classifier to be at
+least 90%. Since the recall is proportional to the number of true positive
+classifications, which itself is a sum of indicator functions, this constraint
+is non-differentible, and therefore cannot be used in a problem that will be
+optimized using a (stochastic) gradient-based algorithm.
+
+For this and similar problems, TFCO supports so-called *proxy constraints*,
+which are (at least semi-differentiable) approximations of the original
+constraints. For example, one could create a proxy recall function by replacing
+the indicator functions with sigmoids. During optimization, each proxy
+constraint function will be penalized, with the magnitude of the penalty being
+chosen to satisfy the corresponding *original* (non-proxy) constraint.
+
+On a problem including proxy constraints&mdash;even a convex problem&mdash;the
+Lagrangian approach discussed above isn't guaranteed to work. However, a
+different algorithm, based on minimizing *swap regret*, does work. Aside from
+this difference, the recommended procedure for optimizing a proxy-constrained
+problem remains the same: periodically snapshot the model during optimization,
+and then either find the best $$m+1$$-sized distribution, or heuristically
+choose the single best snapshot.
+
+## Components
+
+*   [constrained_minimization_problem](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py):
+    contains the `ConstrainedMinimizationProblem` interface. Your own
+    constrained optimization problems should be represented using
+    implementations of this interface.
+
+*   [constrained_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py):
+    contains the `ConstrainedOptimizer` interface, which is similar to (but
+    different from) `tf.train.Optimizer`, with the main difference being that
+    `ConstrainedOptimizer`s are given `ConstrainedMinimizationProblem`s to
+    optimize, and perform constrained optimization.
+
+    *   [external_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py):
+        contains the `AdditiveExternalRegretOptimizer` implementation, which is
+        a `ConstrainedOptimizer` implementing the Lagrangian approach discussed
+        above (with additive updates to the Lagrange multipliers). You should
+        use this optimizer for problems *without* proxy constraints. It may also
+        work for problems with proxy constraints, but we recommend using a swap
+        regret optimizer, instead.
+
+        This optimizer is most similar to Algorithm 3 in Appendix C.3 of
+        [CoJiSr18], and is discussed in Section 3. The two differences are that
+        it uses proxy constraints (if they're provided) in the update of the
+        model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for
+        the "inner" updates.
+
+    *   [swap_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py):
+        contains the `AdditiveSwapRegretOptimizer` and
+        `MultiplicativeSwapRegretOptimizer` implementations, which are
+        `ConstrainedOptimizer`s implementing the swap-regret minimization
+        approach mentioned above (with additive or multiplicative updates,
+        respectively, to the parameters associated with the
+        constraints&mdash;these parameters are not Lagrange multipliers, but
+        play a similar role). You should use one of these optimizers (we suggest
+        `MultiplicativeSwapRegretOptimizer`) for problems *with* proxy
+        constraints.
+
+        The `MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2
+        in Section 4 of [CoJiSr18], with the difference being that it uses
+        `tf.train.Optimizer`s, instead of SGD, for the "inner" updates. The
+        `AdditiveSwapRegretOptimizer` differs further in that it performs
+        additive (instead of multiplicative) updates of the stochastic matrix.
+
+*   [candidates](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/candidates.py):
+    contains two functions, `find_best_candidate_distribution` and
+    `find_best_candidate_index`. Both of these functions are given a set of
+    candidate solutions to a constrained optimization problem, from which the
+    former finds the best distribution over at most $$m+1$$ candidates, and the
+    latter heuristically finds the single best candidate. As discussed above,
+    the set of candidates will typically be model snapshots saved periodically
+    during optimization. Both of these functions require that scipy be
+    installed.
+
+    The `find_best_candidate_distribution` function implements the approach
+    described in Lemma 3 of [CoJiSr18], while `find_best_candidate_index`
+    implements the heuristic used for hyperparameter search in the experiments
+    of Section 5.2.
+
+## Convex Example with Proxy Constraints
+
+This is a simple example of recall-constrained optimization on simulated data:
+we will try to find a classifier that minimizes the average hinge loss while
+constraining recall to be at least 90%.
+
+We'll start with the required imports&mdash;notice the definition of `tfco`:
+
+```python
+import math
+import numpy as np
+import tensorflow as tf
+
+tfco = tf.contrib.constrained_optimization
+```
+
+We'll now create an implementation of the `ConstrainedMinimizationProblem` class
+for this problem. The constructor takes three parameters: a Tensor containing
+the classification labels (0 or 1) for every training example, another Tensor
+containing the model's predictions on every training example (sometimes called
+the "logits"), and the lower bound on recall that will be enforced using a
+constraint.
+
+This implementation will contain both constraints *and* proxy constraints: the
+former represents the constraint that the true recall (defined in terms of the
+*number* of true positives) be at least `recall_lower_bound`, while the latter
+represents the same constraint, but on a hinge approximation of the recall.
+
+```python
+class ExampleProblem(tfco.ConstrainedMinimizationProblem):
+
+  def __init__(self, labels, predictions, recall_lower_bound):
+    self._labels = labels
+    self._predictions = predictions
+    self._recall_lower_bound = recall_lower_bound
+    # The number of positively-labeled examples.
+    self._positive_count = tf.reduce_sum(self._labels)
+
+  @property
+  def objective(self):
+    return tf.losses.hinge_loss(labels=self._labels, logits=self._predictions)
+
+  @property
+  def constraints(self):
+    true_positives = self._labels * tf.to_float(self._predictions > 0)
+    true_positive_count = tf.reduce_sum(true_positives)
+    recall = true_positive_count / self._positive_count
+    # The constraint is (recall >= self._recall_lower_bound), which we convert
+    # to (self._recall_lower_bound - recall <= 0) because
+    # ConstrainedMinimizationProblems must always provide their constraints in
+    # the form (tensor <= 0).
+    #
+    # The result of this function should be a tensor, with each element being
+    # a quantity that is constrained to be nonpositive. We only have one
+    # constraint, so we return a one-element tensor.
+    return self._recall_lower_bound - recall
+
+  @property
+  def proxy_constraints(self):
+    # Use 1 - hinge since we're SUBTRACTING recall in the constraint function,
+    # and we want the proxy constraint function to be convex.
+    true_positives = self._labels * tf.minimum(1.0, self._predictions)
+    true_positive_count = tf.reduce_sum(true_positives)
+    recall = true_positive_count / self._positive_count
+    # Please see the corresponding comment in the constraints property.
+    return self._recall_lower_bound - recall
+```
+
+We'll now create a simple simulated dataset by sampling 1000 random
+10-dimensional feature vectors from a Gaussian, finding their labels using a
+random "ground truth" linear model, and then adding noise by randomly flipping
+200 labels.
+
+```python
+# Create a simulated 10-dimensional training dataset consisting of 1000 labeled
+# examples, of which 800 are labeled correctly and 200 are mislabeled.
+num_examples = 1000
+num_mislabeled_examples = 200
+dimension = 10
+# We will constrain the recall to be at least 90%.
+recall_lower_bound = 0.9
+
+# Create random "ground truth" parameters to a linear model.
+ground_truth_weights = np.random.normal(size=dimension) / math.sqrt(dimension)
+ground_truth_threshold = 0
+
+# Generate a random set of features for each example.
+features = np.random.normal(size=(num_examples, dimension)).astype(
+    np.float32) / math.sqrt(dimension)
+# Compute the labels from these features given the ground truth linear model.
+labels = (np.matmul(features, ground_truth_weights) >
+          ground_truth_threshold).astype(np.float32)
+# Add noise by randomly flipping num_mislabeled_examples labels.
+mislabeled_indices = np.random.choice(
+    num_examples, num_mislabeled_examples, replace=False)
+labels[mislabeled_indices] = 1 - labels[mislabeled_indices]
+```
+
+We're now ready to construct our model, and the corresponding optimization
+problem. We'll use a linear model of the form $$f(x) = w^T x - t$$, where $$w$$
+is the `weights`, and $$t$$ is the `threshold`. The `problem` variable will hold
+an instance of the `ExampleProblem` class we created earlier.
+
+```python
+# Create variables containing the model parameters.
+weights = tf.Variable(tf.zeros(dimension), dtype=tf.float32, name="weights")
+threshold = tf.Variable(0.0, dtype=tf.float32, name="threshold")
+
+# Create the optimization problem.
+constant_labels = tf.constant(labels, dtype=tf.float32)
+constant_features = tf.constant(features, dtype=tf.float32)
+predictions = tf.tensordot(constant_features, weights, axes=(1, 0)) - threshold
+problem = ExampleProblem(
+    labels=constant_labels,
+    predictions=predictions,
+    recall_lower_bound=recall_lower_bound,
+)
+```
+
+We're almost ready to train our model, but first we'll create a couple of
+functions to measure its performance. We're interested in two quantities: the
+average hinge loss (which we seek to minimize), and the recall (which we
+constrain).
+
+```python
+def average_hinge_loss(labels, predictions):
+  num_examples, = np.shape(labels)
+  signed_labels = (labels * 2) - 1
+  total_hinge_loss = np.sum(np.maximum(0.0, 1.0 - signed_labels * predictions))
+  return total_hinge_loss / num_examples
+
+def recall(labels, predictions):
+  positive_count = np.sum(labels)
+  true_positives = labels * (predictions > 0)
+  true_positive_count = np.sum(true_positives)
+  return true_positive_count / positive_count
+```
+
+As was mentioned earlier, external regret optimizers suffice for problems
+without proxy constraints, but swap regret optimizers are recommended for
+problems *with* proxy constraints. Since this problem contains proxy
+constraints, we use the `MultiplicativeSwapRegretOptimizer`.
+
+For this problem, the constraint is fairly easy to satisfy, so we can use the
+same "inner" optimizer (an `AdagradOptimizer` with a learning rate of 1) for
+optimization of both the model parameters (`weights` and `threshold`), and the
+internal parameters associated with the constraints (these are the analogues of
+the Lagrange multipliers used by the `MultiplicativeSwapRegretOptimizer`). For
+more difficult problems, it will often be necessary to use different optimizers,
+with different learning rates (presumably found via a hyperparameter search): to
+accomplish this, pass *both* the `optimizer` and `constraint_optimizer`
+parameters to `MultiplicativeSwapRegretOptimizer`'s constructor.
+
+Since this is a convex problem (both the objective and proxy constraint
+functions are convex), we can just take the last iterate. Periodic snapshotting,
+and the use of the `find_best_candidate_distribution` or
+`find_best_candidate_index` functions, is generally only necessary for
+non-convex problems (and even then, it isn't *always* necessary).
+
+```python
+with tf.Session() as session:
+  optimizer = tfco.MultiplicativeSwapRegretOptimizer(
+      optimizer=tf.train.AdagradOptimizer(learning_rate=1.0))
+  train_op = optimizer.minimize(problem)
+
+  session.run(tf.global_variables_initializer())
+  for ii in xrange(1000):
+    session.run(train_op)
+
+  trained_weights, trained_threshold = session.run((weights, threshold))
+
+trained_predictions = np.matmul(features, trained_weights) - trained_threshold
+print("Constrained average hinge loss = %f" % average_hinge_loss(
+    labels, trained_predictions))
+print("Constrained recall = %f" % recall(labels, trained_predictions))
+```
+
+Running the above code gives the following output (due to the randomness of the
+dataset, you'll get a different result when you run it):
+
+```none
+Constrained average hinge loss = 0.710019
+Constrained recall = 0.899811
+```
+
+As we hoped, the recall is extremely close to 90%&mdash;and, thanks to the use
+of proxy constraints, this is the *true* recall, not a hinge approximation.
+
+For comparison, let's try optimizing the same problem *without* the recall
+constraint:
+
+```python
+with tf.Session() as session:
+  optimizer = tf.train.AdagradOptimizer(learning_rate=1.0)
+  # For optimizing the unconstrained problem, we just minimize the "objective"
+  # portion of the minimization problem.
+  train_op = optimizer.minimize(problem.objective)
+
+  session.run(tf.global_variables_initializer())
+  for ii in xrange(1000):
+    session.run(train_op)
+
+  trained_weights, trained_threshold = session.run((weights, threshold))
+
+trained_predictions = np.matmul(features, trained_weights) - trained_threshold
+print("Unconstrained average hinge loss = %f" % average_hinge_loss(
+    labels, trained_predictions))
+print("Unconstrained recall = %f" % recall(labels, trained_predictions))
+```
+
+This code gives the following output (again, you'll get a different answer,
+since the dataset is random):
+
+```none
+Unconstrained average hinge loss = 0.627271
+Unconstrained recall = 0.793951
+```
+
+Because there is no constraint, the unconstrained problem does a better job of
+minimizing the average hinge loss, but naturally doesn't approach 90% recall.
diff --git a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence.py b/tensorflow/contrib/constrained_optimization/__init__.py
similarity index 53%
rename from tensorflow/contrib/bayesflow/python/ops/csiszar_divergence.py
rename to tensorflow/contrib/constrained_optimization/__init__.py
index 9f7a95f138f7fd3e726f095dc16f41abb6182e17..1e49ba9f179ea98aaa9c35f79787605b53a1ec53 100644
--- a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence.py
+++ b/tensorflow/contrib/constrained_optimization/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,40 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Csiszar f-Divergence and helpers.
-
-See ${python/contrib.bayesflow.csiszar_divergence}.
-"""
+"""A library for performing constrained optimization in TensorFlow."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.csiszar_divergence_impl import *
+from tensorflow.contrib.constrained_optimization.python.candidates import *
+from tensorflow.contrib.constrained_optimization.python.constrained_minimization_problem import *
+from tensorflow.contrib.constrained_optimization.python.constrained_optimizer import *
+from tensorflow.contrib.constrained_optimization.python.external_regret_optimizer import *
+from tensorflow.contrib.constrained_optimization.python.swap_regret_optimizer import *
 # pylint: enable=wildcard-import
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'amari_alpha',
-    'arithmetic_geometric',
-    'chi_square',
-    'csiszar_vimco',
-    'dual_csiszar_function',
-    'jeffreys',
-    'jensen_shannon',
-    'kl_forward',
-    'kl_reverse',
-    'log1p_abs',
-    'modified_gan',
-    'monte_carlo_csiszar_f_divergence',
-    'pearson',
-    'squared_hellinger',
-    'symmetrized_csiszar_function',
-    'total_variation',
-    't_power',
-    'triangular',
+    "AdditiveExternalRegretOptimizer",
+    "AdditiveSwapRegretOptimizer",
+    "ConstrainedMinimizationProblem",
+    "ConstrainedOptimizer",
+    "find_best_candidate_distribution",
+    "find_best_candidate_index",
+    "MultiplicativeSwapRegretOptimizer",
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac86a6741be1f244476f917d0e151166db65524b
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/candidates.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code for optimizing over a set of candidate solutions.
+
+The functions in this file deal with the constrained problem:
+
+> minimize f(w)
+> s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+Here, f(w) is the "objective function", and g_i(w) is the ith (of m) "constraint
+function". Given the values of the objective and constraint functions for a set
+of n "candidate solutions" {w_0,w_1,...,w_{n-1}} (for a total of n objective
+function values, and n*m constraint function values), the
+`find_best_candidate_distribution` function finds the best DISTRIBUTION over
+these candidates, while `find_best_candidate_index' heuristically finds the
+single best candidate.
+
+Both of these functions have dependencies on `scipy`, so if you want to call
+them, then you must make sure that `scipy` is available. The imports are
+performed inside the functions themselves, so if they're not actually called,
+then `scipy` is not needed.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The `find_best_candidate_distribution` function implements the approach
+described in Lemma 3, while `find_best_candidate_index` implements the heuristic
+used for hyperparameter search in the experiments of Section 5.2.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+def _find_best_candidate_distribution_helper(objective_vector,
+                                             constraints_matrix,
+                                             maximum_violation=0.0):
+  """Finds a distribution minimizing an objective subject to constraints.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n
+  candidates that, in expectation, minimizes the objective while violating
+  the constraints by no more than `maximum_violation`. If no such distribution
+  exists, it returns an error (using Go-style error reporting).
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  This function will return a distribution for which at most m+1 probabilities,
+  and often fewer, are nonzero.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    maximum_violation: nonnegative float, the maximum amount by which any
+      constraint may be violated, in expectation.
+
+  Returns:
+    A pair (`result`, `message`), exactly one of which is None. If `message` is
+      None, then the `result` contains the optimal distribution as a numpy array
+      of shape (n,). If `result` is None, then `message` contains an error
+      message.
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes, or if `maximum_violation` is negative.
+    ImportError: If we're unable to import `scipy.optimize`.
+  """
+  if maximum_violation < 0.0:
+    raise ValueError("maximum_violation must be nonnegative")
+
+  mm, nn = np.shape(constraints_matrix)
+  if (nn,) != np.shape(objective_vector):
+    raise ValueError(
+        "objective_vector must have shape (n,), and constraints_matrix (m, n),"
+        " where n is the number of candidates, and m is the number of "
+        "constraints")
+
+  # We import scipy inline, instead of at the top of the file, so that a scipy
+  # dependency is only introduced if either find_best_candidate_distribution()
+  # or find_best_candidate_index() are actually called.
+  import scipy.optimize  # pylint: disable=g-import-not-at-top
+
+  # Feasibility (within maximum_violation) constraints.
+  a_ub = constraints_matrix
+  b_ub = np.full((mm, 1), maximum_violation)
+  # Sum-to-one constraint.
+  a_eq = np.ones((1, nn))
+  b_eq = np.ones((1, 1))
+  # Nonnegativity constraints.
+  bounds = (0, None)
+
+  result = scipy.optimize.linprog(
+      objective_vector,
+      A_ub=a_ub,
+      b_ub=b_ub,
+      A_eq=a_eq,
+      b_eq=b_eq,
+      bounds=bounds)
+  # Go-style error reporting. We don't raise on error, since
+  # find_best_candidate_distribution() needs to handle the failure case, and we
+  # shouldn't use exceptions as flow-control.
+  if not result.success:
+    return (None, result.message)
+  else:
+    return (result.x, None)
+
+
+def find_best_candidate_distribution(objective_vector,
+                                     constraints_matrix,
+                                     epsilon=0.0):
+  """Finds a distribution minimizing an objective subject to constraints.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n
+  candidates that, in expectation, minimizes the objective while violating
+  the constraints by the smallest possible amount (with the amount being found
+  via bisection search).
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  This function will return a distribution for which at most m+1 probabilities,
+  and often fewer, are nonzero.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  This function implements the approach described in Lemma 3.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    epsilon: nonnegative float, the threshold at which to terminate the binary
+      search while searching for the minimal expected constraint violation
+      magnitude.
+
+  Returns:
+    The optimal distribution, as a numpy array of shape (n,).
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes, or if `epsilon` is negative.
+    ImportError: If we're unable to import `scipy.optimize`.
+  """
+  if epsilon < 0.0:
+    raise ValueError("epsilon must be nonnegative")
+
+  # If there is a feasible solution (i.e. with maximum_violation=0), then that's
+  # what we'll return.
+  pp, _ = _find_best_candidate_distribution_helper(objective_vector,
+                                                   constraints_matrix)
+  if pp is not None:
+    return pp
+
+  # The bound is the minimum over all candidates, of the maximum per-candidate
+  # constraint violation.
+  lower = 0.0
+  upper = np.min(np.amax(constraints_matrix, axis=0))
+  best_pp, _ = _find_best_candidate_distribution_helper(
+      objective_vector, constraints_matrix, maximum_violation=upper)
+  assert best_pp is not None
+
+  # Throughout this loop, a maximum_violation of "lower" is not achievable,
+  # but a maximum_violation of "upper" is achiveable.
+  while True:
+    middle = 0.5 * (lower + upper)
+    if (middle - lower <= epsilon) or (upper - middle <= epsilon):
+      break
+    else:
+      pp, _ = _find_best_candidate_distribution_helper(
+          objective_vector, constraints_matrix, maximum_violation=middle)
+      if pp is None:
+        lower = middle
+      else:
+        best_pp = pp
+        upper = middle
+
+  return best_pp
+
+
+def find_best_candidate_index(objective_vector,
+                              constraints_matrix,
+                              rank_objectives=False):
+  """Heuristically finds the best candidate solution to a constrained problem.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds the "best" solution according
+  to the following heuristic:
+
+    1. Across all models, the ith constraint violations (i.e. max{0, g_i(0)})
+       are ranked, as are the objectives (if rank_objectives=True).
+    2. Each model is then associated its MAXIMUM rank across all m constraints
+       (and the objective, if rank_objectives=True).
+    3. The model with the minimal maximum rank is then identified. Ties are
+       broken using the objective function value.
+    4. The index of this "best" model is returned.
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  This function implements the heuristic used for hyperparameter search in the
+  experiments of Section 5.2.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    rank_objectives: bool, whether the objective function values should be
+      included in the initial ranking step. If True, both the objective and
+      constraints will be ranked. If False, only the constraints will be ranked.
+      In either case, the objective function values will be used for
+      tiebreaking.
+
+  Returns:
+    The index (in {0,1,...,n-1}) of the "best" model according to the above
+      heuristic.
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes.
+    ImportError: If we're unable to import `scipy.stats`.
+  """
+  mm, nn = np.shape(constraints_matrix)
+  if (nn,) != np.shape(objective_vector):
+    raise ValueError(
+        "objective_vector must have shape (n,), and constraints_matrix (m, n),"
+        " where n is the number of candidates, and m is the number of "
+        "constraints")
+
+  # We import scipy inline, instead of at the top of the file, so that a scipy
+  # dependency is only introduced if either find_best_candidate_distribution()
+  # or find_best_candidate_index() are actually called.
+  import scipy.stats  # pylint: disable=g-import-not-at-top
+
+  if rank_objectives:
+    maximum_ranks = scipy.stats.rankdata(objective_vector, method="min")
+  else:
+    maximum_ranks = np.zeros(nn, dtype=np.int64)
+  for ii in xrange(mm):
+    # Take the maximum of the constraint functions with zero, since we want to
+    # rank the magnitude of constraint *violations*. If the constraint is
+    # satisfied, then we don't care how much it's satisfied by (as a result, we
+    # we expect all models satisfying a constraint to be tied at rank 1).
+    ranks = scipy.stats.rankdata(
+        np.maximum(0.0, constraints_matrix[ii, :]), method="min")
+    maximum_ranks = np.maximum(maximum_ranks, ranks)
+
+  best_index = None
+  best_rank = float("Inf")
+  best_objective = float("Inf")
+  for ii in xrange(nn):
+    if maximum_ranks[ii] < best_rank:
+      best_index = ii
+      best_rank = maximum_ranks[ii]
+      best_objective = objective_vector[ii]
+    elif (maximum_ranks[ii] == best_rank) and (objective_vector[ii] <=
+                                               best_objective):
+      best_index = ii
+      best_objective = objective_vector[ii]
+
+  return best_index
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4c49d48bc5c763489215261a909573af0f19055
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.candidates."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import candidates
+from tensorflow.python.platform import test
+
+
+class CandidatesTest(test.TestCase):
+
+  def test_inconsistent_shapes_for_best_distribution(self):
+    """An error is raised when parameters have inconsistent shapes."""
+    objective_vector = np.array([1, 2, 3])
+    constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    with self.assertRaises(ValueError):
+      _ = candidates.find_best_candidate_distribution(objective_vector,
+                                                      constraints_matrix)
+
+  def test_inconsistent_shapes_for_best_index(self):
+    """An error is raised when parameters have inconsistent shapes."""
+    objective_vector = np.array([1, 2, 3])
+    constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    with self.assertRaises(ValueError):
+      _ = candidates.find_best_candidate_index(objective_vector,
+                                               constraints_matrix)
+
+  def test_best_distribution(self):
+    """Distribution should match known solution."""
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    distribution = candidates.find_best_candidate_distribution(
+        objective_vector, constraints_matrix)
+    # Verify that the solution is a probability distribution.
+    self.assertTrue(np.all(distribution >= 0))
+    self.assertAlmostEqual(np.sum(distribution), 1.0)
+    # Verify that the solution satisfies the constraints.
+    maximum_constraint_violation = np.amax(
+        np.dot(constraints_matrix, distribution))
+    self.assertLessEqual(maximum_constraint_violation, 0)
+    # Verify that the solution matches that which we expect.
+    expected_distribution = np.array([0.37872711, 0.62127289, 0, 0])
+    self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6)
+
+  def test_best_index_rank_objectives_true(self):
+    """Index should match known solution."""
+    # Objective ranks = [2, 1, 4, 3].
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]].
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    # Maximum ranks = [4, 3, 4, 3].
+    index = candidates.find_best_candidate_index(
+        objective_vector, constraints_matrix, rank_objectives=True)
+    self.assertEqual(1, index)
+
+  def test_best_index_rank_objectives_false(self):
+    """Index should match known solution."""
+    # Objective ranks = [2, 1, 4, 3].
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]].
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    # Maximum ranks = [4, 3, 4, 1].
+    index = candidates.find_best_candidate_index(
+        objective_vector, constraints_matrix, rank_objectives=False)
+    self.assertEqual(3, index)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
new file mode 100644
index 0000000000000000000000000000000000000000..70813fb217956b167b80a7e1d555c8ba79088fdb
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -0,0 +1,123 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines abstract class for `ConstrainedMinimizationProblem`s.
+
+A ConstrainedMinimizationProblem consists of an objective function to minimize,
+and a set of constraint functions that are constrained to be nonpositive.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ConstrainedMinimizationProblem(object):
+  """Abstract class representing a `ConstrainedMinimizationProblem`.
+
+  A ConstrainedMinimizationProblem consists of an objective function to
+  minimize, and a set of constraint functions that are constrained to be
+  nonpositive.
+
+  In addition to the constraint functions, there may (optionally) be proxy
+  constraint functions: a ConstrainedOptimizer will attempt to penalize these
+  proxy constraint functions so as to satisfy the (non-proxy) constraints. Proxy
+  constraints could be used if the constraints functions are difficult or
+  impossible to optimize (e.g. if they're piecewise constant), in which case the
+  proxy constraints should be some approximation of the original constraints
+  that is well-enough behaved to permit successful optimization.
+  """
+
+  @abc.abstractproperty
+  def objective(self):
+    """Returns the objective function.
+
+    Returns:
+      A 0d tensor that should be minimized.
+    """
+    pass
+
+  @property
+  def num_constraints(self):
+    """Returns the number of constraints.
+
+    Returns:
+      An int containing the number of constraints.
+
+    Raises:
+      ValueError: If the constraints (or proxy_constraints, if present) do not
+        have fully-known shapes, OR if proxy_constraints are present, and the
+        shapes of constraints and proxy_constraints are fully-known, but they're
+        different.
+    """
+    constraints_shape = self.constraints.get_shape()
+    if self.proxy_constraints is None:
+      proxy_constraints_shape = constraints_shape
+    else:
+      proxy_constraints_shape = self.proxy_constraints.get_shape()
+
+    if (constraints_shape is None or proxy_constraints_shape is None or
+        any([ii is None for ii in constraints_shape.as_list()]) or
+        any([ii is None for ii in proxy_constraints_shape.as_list()])):
+      raise ValueError(
+          "constraints and proxy_constraints must have fully-known shapes")
+    if constraints_shape != proxy_constraints_shape:
+      raise ValueError(
+          "constraints and proxy_constraints must have the same shape")
+
+    size = 1
+    for ii in constraints_shape.as_list():
+      size *= ii
+    return int(size)
+
+  @abc.abstractproperty
+  def constraints(self):
+    """Returns the vector of constraint functions.
+
+    Letting g_i be the ith element of the constraints vector, the ith constraint
+    will be g_i <= 0.
+
+    Returns:
+      A tensor of constraint functions.
+    """
+    pass
+
+  # This is a property, instead of an abstract property, since it doesn't need
+  # to be overridden: if proxy_constraints returns None, then there are no
+  # proxy constraints.
+  @property
+  def proxy_constraints(self):
+    """Returns the optional vector of proxy constraint functions.
+
+    The difference between `constraints` and `proxy_constraints` is that, when
+    proxy constraints are present, the `constraints` are merely EVALUATED during
+    optimization, whereas the `proxy_constraints` are DIFFERENTIATED. If there
+    are no proxy constraints, then the `constraints` are both evaluated and
+    differentiated.
+
+    For example, if we want to impose constraints on step functions, then we
+    could use these functions for `constraints`. However, because a step
+    function has zero gradient almost everywhere, we can't differentiate these
+    functions, so we would take `proxy_constraints` to be some differentiable
+    approximation of `constraints`.
+
+    Returns:
+      A tensor of proxy constraint functions.
+    """
+    return None
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..805554536610a5e2cc650ff0b47185f4fbd6fac5
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
@@ -0,0 +1,208 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines base class for `ConstrainedOptimizer`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ConstrainedOptimizer(object):
+  """Base class representing a constrained optimizer.
+
+  A ConstrainedOptimizer wraps a tf.train.Optimizer (or more than one), and
+  applies it to a ConstrainedMinimizationProblem. Unlike a tf.train.Optimizer,
+  which takes a tensor to minimize as a parameter to its minimize() method, a
+  constrained optimizer instead takes a ConstrainedMinimizationProblem.
+  """
+
+  def __init__(self, optimizer):
+    """Constructs a new `ConstrainedOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the
+        ConstraintedMinimizationProblem.
+
+    Returns:
+      A new `ConstrainedOptimizer`.
+    """
+    self._optimizer = optimizer
+
+  @property
+  def optimizer(self):
+    """Returns the `tf.train.Optimizer` used for optimization."""
+    return self._optimizer
+
+  def minimize_unconstrained(self,
+                             minimization_problem,
+                             global_step=None,
+                             var_list=None,
+                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                             aggregation_method=None,
+                             colocate_gradients_with_ops=False,
+                             name=None,
+                             grad_loss=None):
+    """Returns an `Op` for minimizing the unconstrained problem.
+
+    Unlike `minimize_constrained`, this function ignores the `constraints` (and
+    `proxy_constraints`) portion of the minimization problem entirely, and only
+    minimizes `objective`.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    return self.optimizer.minimize(
+        minimization_problem.objective,
+        global_step=global_step,
+        var_list=var_list,
+        gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        name=name,
+        grad_loss=grad_loss)
+
+  @abc.abstractmethod
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    Unlike `minimize_unconstrained`, this function attempts to find a solution
+    that minimizes the `objective` portion of the minimization problem while
+    satisfying the `constraints` portion.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    pass
+
+  def minimize(self,
+               minimization_problem,
+               unconstrained_steps=None,
+               global_step=None,
+               var_list=None,
+               gate_gradients=train_optimizer.Optimizer.GATE_OP,
+               aggregation_method=None,
+               colocate_gradients_with_ops=False,
+               name=None,
+               grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    This method combines the functionality of `minimize_unconstrained` and
+    `minimize_constrained`. If global_step < unconstrained_steps, it will
+    perform an unconstrained update, and if global_step >= unconstrained_steps,
+    it will perform a constrained update.
+
+    The reason for this functionality is that it may be best to initialize the
+    constrained optimizer with an approximate optimum of the unconstrained
+    problem.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      unconstrained_steps: int, number of steps for which we should perform
+        unconstrained updates, before transitioning to constrained updates.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+
+    Raises:
+      ValueError: If unconstrained_steps is provided, but global_step is not.
+    """
+
+    def unconstrained_fn():
+      """Returns an `Op` for minimizing the unconstrained problem."""
+      return self.minimize_unconstrained(
+          minimization_problem=minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    def constrained_fn():
+      """Returns an `Op` for minimizing the constrained problem."""
+      return self.minimize_constrained(
+          minimization_problem=minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    if unconstrained_steps is not None:
+      if global_step is None:
+        raise ValueError(
+            "global_step cannot be None if unconstrained_steps is provided")
+      unconstrained_steps_tensor = ops.convert_to_tensor(unconstrained_steps)
+      dtype = unconstrained_steps_tensor.dtype
+      return control_flow_ops.cond(
+          standard_ops.cast(global_step, dtype) < unconstrained_steps_tensor,
+          true_fn=unconstrained_fn,
+          false_fn=constrained_fn)
+    else:
+      return constrained_fn()
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..01c6e4f08afb93e37aa124f31ca7faa10b07d4d6
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -0,0 +1,375 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines `AdditiveExternalRegretOptimizer`.
+
+This optimizer minimizes a `ConstrainedMinimizationProblem` by introducing
+Lagrange multipliers, and using `tf.train.Optimizer`s to jointly optimize over
+the model parameters and Lagrange multipliers.
+
+For the purposes of constrained optimization, at least in theory,
+external-regret minimization suffices if the `ConstrainedMinimizationProblem`
+we're optimizing doesn't have any `proxy_constraints`, while swap-regret
+minimization should be used if `proxy_constraints` are present.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The formulation used by the AdditiveExternalRegretOptimizer--which is simply the
+usual Lagrangian formulation--can be found in Definition 1, and is discussed in
+Section 3. This optimizer is most similar to Algorithm 3 in Appendix C.3, with
+the two differences being that it uses proxy constraints (if they're provided)
+in the update of the model parameters, and uses `tf.train.Optimizer`s, instead
+of SGD, for the "inner" updates.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.contrib.constrained_optimization.python import constrained_optimizer
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
+  """Projects its argument onto the feasible region.
+
+  The feasible region is the set of all vectors with nonnegative elements that
+  sum to at most `radius`.
+
+  Args:
+    multipliers: 1d tensor, the Lagrange multipliers to project.
+    radius: float, the radius of the feasible region.
+
+  Returns:
+    The 1d tensor that results from projecting `multipliers` onto the feasible
+      region w.r.t. the Euclidean norm.
+
+  Raises:
+    ValueError: if the `multipliers` tensor does not have a fully-known shape,
+      or is not one-dimensional.
+  """
+  multipliers_shape = multipliers.get_shape()
+  if multipliers_shape is None:
+    raise ValueError("multipliers must have known shape")
+  if multipliers_shape.ndims != 1:
+    raise ValueError(
+        "multipliers must be one dimensional (instead is %d-dimensional)" %
+        multipliers_shape.ndims)
+  dimension = multipliers_shape[0].value
+  if dimension is None:
+    raise ValueError("multipliers must have fully-known shape")
+
+  def while_loop_condition(iteration, multipliers, inactive, old_inactive):
+    """Returns false if the while loop should terminate."""
+    del multipliers  # Needed by the body, but not the condition.
+    not_done = (iteration < dimension)
+    not_converged = standard_ops.reduce_any(
+        standard_ops.not_equal(inactive, old_inactive))
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, multipliers, inactive, old_inactive):
+    """Performs one iteration of the projection."""
+    del old_inactive  # Needed by the condition, but not the body.
+    iteration += 1
+    scale = standard_ops.minimum(
+        0.0,
+        (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum(
+            1.0, standard_ops.reduce_sum(inactive)))
+    multipliers += scale * inactive
+    new_inactive = standard_ops.to_float(multipliers > 0)
+    multipliers *= new_inactive
+    return (iteration, multipliers, new_inactive, inactive)
+
+  iteration = standard_ops.constant(0)
+  inactive = standard_ops.ones_like(multipliers)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, multipliers, inactive, old_inactive = while_loop_body(
+      iteration, multipliers, inactive, inactive)
+  iteration, multipliers, inactive, old_inactive = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, multipliers, inactive, old_inactive),
+      name="euclidean_projection")
+
+  return multipliers
+
+
+@six.add_metaclass(abc.ABCMeta)
+class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
+  """Base class representing an `_ExternalRegretOptimizer`.
+
+  This class contains most of the logic for performing constrained
+  optimization, minimizing external regret for the constraints player. What it
+  *doesn't* do is keep track of the internal state (the Lagrange multipliers).
+  Instead, the state is accessed via the _initial_state(),
+  _lagrange_multipliers(), _constraint_grad_and_var() and _projection_op()
+  methods.
+
+  The reason for this is that we want to make it easy to implement different
+  representations of the internal state.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by `_ExternalRegretOptimizer`s--which is simply the usual
+  Lagrangian formulation--can be found in Definition 1, and is discussed in
+  Section 3. Such optimizers are most similar to Algorithm 3 in Appendix C.3.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `_ExternalRegretOptimizer`.
+
+    The difference between `optimizer` and `constraint_optimizer` (if the latter
+    is provided) is that the former is used for learning the model parameters,
+    while the latter us used for the Lagrange multipliers. If no
+    `constraint_optimizer` is provided, then `optimizer` is used for both.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of the ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multipliers.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multipliers.
+
+    Returns:
+      A new `_ExternalRegretOptimizer`.
+    """
+    super(_ExternalRegretOptimizer, self).__init__(optimizer=optimizer)
+    self._constraint_optimizer = constraint_optimizer
+
+  @property
+  def constraint_optimizer(self):
+    """Returns the `tf.train.Optimizer` used for the Lagrange multipliers."""
+    return self._constraint_optimizer
+
+  @abc.abstractmethod
+  def _initial_state(self, num_constraints):
+    pass
+
+  @abc.abstractmethod
+  def _lagrange_multipliers(self, state):
+    pass
+
+  @abc.abstractmethod
+  def _constraint_grad_and_var(self, state, gradient):
+    pass
+
+  @abc.abstractmethod
+  def _projection_op(self, state, name=None):
+    pass
+
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    The `optimizer` constructor parameter will be used to update the model
+    parameters, while the Lagrange multipliers will be updated using
+    `constrained_optimizer` (if provided) or `optimizer` (if not).
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    objective = minimization_problem.objective
+
+    constraints = minimization_problem.constraints
+    proxy_constraints = minimization_problem.proxy_constraints
+    if proxy_constraints is None:
+      proxy_constraints = constraints
+    # Flatten both constraints tensors to 1d.
+    num_constraints = minimization_problem.num_constraints
+    constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
+    proxy_constraints = standard_ops.reshape(
+        proxy_constraints, shape=(num_constraints,))
+
+    # We use a lambda to initialize the state so that, if this function call is
+    # inside the scope of a tf.control_dependencies() block, the dependencies
+    # will not be applied to the initializer.
+    state = standard_ops.Variable(
+        lambda: self._initial_state(num_constraints),
+        trainable=False,
+        name="external_regret_optimizer_state")
+
+    multipliers = self._lagrange_multipliers(state)
+    loss = (
+        objective + standard_ops.tensordot(multipliers, proxy_constraints, 1))
+    multipliers_gradient = constraints
+
+    update_ops = []
+    if self.constraint_optimizer is None:
+      # If we don't have a separate constraint_optimizer, then we use
+      # self._optimizer for both the update of the model parameters, and that of
+      # the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      grads_and_vars.append(
+          self._constraint_grad_and_var(state, multipliers_gradient))
+      update_ops.append(
+          self.optimizer.apply_gradients(grads_and_vars, name="update"))
+    else:
+      # If we have a separate constraint_optimizer, then we use self._optimizer
+      # for the update of the model parameters, and self._constraint_optimizer
+      # for that of the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      multiplier_grads_and_vars = [
+          self._constraint_grad_and_var(state, multipliers_gradient)
+      ]
+
+      gradients = [
+          gradient for gradient, _ in grads_and_vars + multiplier_grads_and_vars
+          if gradient is not None
+      ]
+      with ops.control_dependencies(gradients):
+        update_ops.append(
+            self.optimizer.apply_gradients(grads_and_vars, name="update"))
+        update_ops.append(
+            self.constraint_optimizer.apply_gradients(
+                multiplier_grads_and_vars, name="optimizer_state_update"))
+
+    with ops.control_dependencies(update_ops):
+      if global_step is None:
+        # If we don't have a global step, just project, and we're done.
+        return self._projection_op(state, name=name)
+      else:
+        # If we have a global step, then we need to increment it in addition to
+        # projecting.
+        projection_op = self._projection_op(state, name="project")
+        with ops.colocate_with(global_step):
+          global_step_op = state_ops.assign_add(
+              global_step, 1, name="global_step_increment")
+        return control_flow_ops.group(projection_op, global_step_op, name=name)
+
+
+class AdditiveExternalRegretOptimizer(_ExternalRegretOptimizer):
+  """A `ConstrainedOptimizer` based on external-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over Lagrange multipliers,
+  with the latter maximization using additive updates and an algorithm that
+  minimizes external regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer--which is simply the usual Lagrangian
+  formulation--can be found in Definition 1, and is discussed in Section 3. It
+  is most similar to Algorithm 3 in Appendix C.3, with the two differences being
+  that it uses proxy constraints (if they're provided) in the update of the
+  model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for the
+  "inner" updates.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               maximum_multiplier_radius=None):
+    """Constructs a new `AdditiveExternalRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multipliers.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multipliers.
+      maximum_multiplier_radius: float, an optional upper bound to impose on the
+        sum of the Lagrange multipliers.
+
+    Returns:
+      A new `AdditiveExternalRegretOptimizer`.
+
+    Raises:
+      ValueError: If the maximum_multiplier_radius parameter is nonpositive.
+    """
+    super(AdditiveExternalRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+    if maximum_multiplier_radius and (maximum_multiplier_radius <= 0.0):
+      raise ValueError("maximum_multiplier_radius must be strictly positive")
+
+    self._maximum_multiplier_radius = maximum_multiplier_radius
+
+  def _initial_state(self, num_constraints):
+    # For an AdditiveExternalRegretOptimizer, the internal state is simply a
+    # tensor of Lagrange multipliers with shape (m,), where m is the number of
+    # constraints.
+    return standard_ops.zeros((num_constraints,), dtype=dtypes.float32)
+
+  def _lagrange_multipliers(self, state):
+    return state
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      if self._maximum_multiplier_radius:
+        projected_multipliers = _project_multipliers_wrt_euclidean_norm(
+            state, self._maximum_multiplier_radius)
+      else:
+        projected_multipliers = standard_ops.maximum(state, 0.0)
+      return state_ops.assign(state, projected_multipliers, name=name)
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b4bf6271009161c4c449cd9c3cdab9fba90aa59
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
@@ -0,0 +1,136 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.external_regret_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import external_regret_optimizer
+from tensorflow.contrib.constrained_optimization.python import test_util
+
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class AdditiveExternalRegretOptimizerWrapper(
+    external_regret_optimizer.AdditiveExternalRegretOptimizer):
+  """Testing wrapper class around AdditiveExternalRegretOptimizer.
+
+  This class is identical to AdditiveExternalRegretOptimizer, except that it
+  caches the internal optimization state when _lagrange_multipliers() is called,
+  so that we can test that the Lagrange multipliers take on their expected
+  values.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               maximum_multiplier_radius=None):
+    """Same as AdditiveExternalRegretOptimizer.__init__."""
+    super(AdditiveExternalRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer,
+        constraint_optimizer=constraint_optimizer,
+        maximum_multiplier_radius=maximum_multiplier_radius)
+    self._cached_lagrange_multipliers = None
+
+  @property
+  def lagrange_multipliers(self):
+    """Returns the cached Lagrange multipliers."""
+    return self._cached_lagrange_multipliers
+
+  def _lagrange_multipliers(self, state):
+    """Caches the internal state for testing."""
+    self._cached_lagrange_multipliers = super(
+        AdditiveExternalRegretOptimizerWrapper,
+        self)._lagrange_multipliers(state)
+    return self._cached_lagrange_multipliers
+
+
+class ExternalRegretOptimizerTest(test.TestCase):
+
+  def test_project_multipliers_wrt_euclidean_norm(self):
+    """Tests Euclidean projection routine on some known values."""
+    multipliers1 = standard_ops.constant([-0.1, -0.6, -0.3])
+    expected_projected_multipliers1 = np.array([0.0, 0.0, 0.0])
+
+    multipliers2 = standard_ops.constant([-0.1, 0.6, 0.3])
+    expected_projected_multipliers2 = np.array([0.0, 0.6, 0.3])
+
+    multipliers3 = standard_ops.constant([0.4, 0.7, -0.2, 0.5, 0.1])
+    expected_projected_multipliers3 = np.array([0.2, 0.5, 0.0, 0.3, 0.0])
+
+    with self.test_session() as session:
+      projected_multipliers1 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers1, 1.0))
+      projected_multipliers2 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers2, 1.0))
+      projected_multipliers3 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers3, 1.0))
+
+    self.assertAllClose(
+        expected_projected_multipliers1,
+        projected_multipliers1,
+        rtol=0,
+        atol=1e-6)
+    self.assertAllClose(
+        expected_projected_multipliers2,
+        projected_multipliers2,
+        rtol=0,
+        atol=1e-6)
+    self.assertAllClose(
+        expected_projected_multipliers3,
+        projected_multipliers3,
+        rtol=0,
+        atol=1e-6)
+
+  def test_additive_external_regret_optimizer(self):
+    """Tests that the Lagrange multipliers update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = AdditiveExternalRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0),
+        maximum_multiplier_radius=1.0)
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    expected_multipliers = [
+        np.array([0.0, 0.0, 0.0]),
+        np.array([0.6, 0.0, 0.4]),
+        np.array([0.7, 0.0, 0.3]),
+        np.array([0.8, 0.0, 0.2]),
+        np.array([0.9, 0.0, 0.1]),
+        np.array([1.0, 0.0, 0.0]),
+        np.array([1.0, 0.0, 0.0]),
+    ]
+
+    multipliers = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(multipliers) < len(expected_multipliers):
+        multipliers.append(session.run(optimizer.lagrange_multipliers))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_multipliers, multipliers):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..04014ab4aebd6d9cd70653c53f9361320e803329
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -0,0 +1,595 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines `{Additive,Multiplicative}SwapRegretOptimizer`s.
+
+These optimizers minimize a `ConstrainedMinimizationProblem` by using a
+swap-regret minimizing algorithm (either SGD or multiplicative weights) to learn
+what weights should be associated with the objective function and constraints.
+These algorithms do *not* use Lagrange multipliers, but the idea is similar.
+The main differences between the formulation used here, and the standard
+Lagrangian formulation, are that (i) the objective function is weighted, in
+addition to the constraints, and (ii) we learn a matrix of weights, instead of a
+vector.
+
+For the purposes of constrained optimization, at least in theory,
+external-regret minimization suffices if the `ConstrainedMinimizationProblem`
+we're optimizing doesn't have any `proxy_constraints`, while swap-regret
+minimization should be used if `proxy_constraints` are present.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The formulation used by both of the SwapRegretOptimizers can be found in
+Definition 2, and is discussed in Section 4. The
+`MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 in Section 4,
+with the difference being that it uses `tf.train.Optimizer`s, instead of SGD,
+for the "inner" updates. The `AdditiveSwapRegretOptimizer` differs further in
+that it performs additive (instead of multiplicative) updates of the stochastic
+matrix.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import math
+
+import six
+
+from tensorflow.contrib.constrained_optimization.python import constrained_optimizer
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+def _maximal_eigenvector_power_method(matrix,
+                                      epsilon=1e-6,
+                                      maximum_iterations=100):
+  """Returns the maximal right-eigenvector of `matrix` using the power method.
+
+  Args:
+    matrix: 2D Tensor, the matrix of which we will find the maximal
+      right-eigenvector.
+    epsilon: nonnegative float, if two iterations of the power method differ (in
+      L2 norm) by no more than epsilon, we will terminate.
+    maximum_iterations: nonnegative int, if we perform this many iterations, we
+      will terminate.
+
+  Result:
+    The maximal right-eigenvector of `matrix`.
+
+  Raises:
+    ValueError: If the epsilon or maximum_iterations parameters violate their
+      bounds.
+  """
+  if epsilon <= 0.0:
+    raise ValueError("epsilon must be strictly positive")
+  if maximum_iterations <= 0:
+    raise ValueError("maximum_iterations must be strictly positive")
+
+  def while_loop_condition(iteration, eigenvector, old_eigenvector):
+    """Returns false if the while loop should terminate."""
+    not_done = (iteration < maximum_iterations)
+    not_converged = (standard_ops.norm(eigenvector - old_eigenvector) > epsilon)
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, eigenvector, old_eigenvector):
+    """Performs one iteration of the power method."""
+    del old_eigenvector  # Needed by the condition, but not the body.
+    iteration += 1
+    # We need to use tf.matmul() and tf.expand_dims(), instead of
+    # tf.tensordot(), since the former will infer the shape of the result, while
+    # the latter will not (tf.while_loop() needs the shapes).
+    new_eigenvector = standard_ops.matmul(
+        matrix, standard_ops.expand_dims(eigenvector, 1))[:, 0]
+    new_eigenvector /= standard_ops.norm(new_eigenvector)
+    return (iteration, new_eigenvector, eigenvector)
+
+  iteration = standard_ops.constant(0)
+  eigenvector = standard_ops.ones_like(matrix[:, 0])
+  eigenvector /= standard_ops.norm(eigenvector)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, eigenvector, old_eigenvector = while_loop_body(
+      iteration, eigenvector, eigenvector)
+  iteration, eigenvector, old_eigenvector = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, eigenvector, old_eigenvector),
+      name="power_method")
+
+  return eigenvector
+
+
+def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
+  """Projects its argument onto the set of left-stochastic matrices.
+
+  This algorithm is O(n^3) at worst, where `matrix` is n*n. It can be done in
+  O(n^2 * log(n)) time by sorting each column (and maybe better with a different
+  algorithm), but the algorithm implemented here is easier to implement in
+  TensorFlow.
+
+  Args:
+    matrix: 2d square tensor, the matrix to project.
+
+  Returns:
+    The 2d square tensor that results from projecting `matrix` onto the set of
+      left-stochastic matrices w.r.t. the Euclidean norm applied column-wise
+      (i.e. the Frobenius norm).
+
+  Raises:
+    ValueError: if the `matrix` tensor does not have a fully-known shape, or is
+      not two-dimensional and square.
+  """
+  matrix_shape = matrix.get_shape()
+  if matrix_shape is None:
+    raise ValueError("matrix must have known shape")
+  if matrix_shape.ndims != 2:
+    raise ValueError(
+        "matrix must be two dimensional (instead is %d-dimensional)" %
+        matrix_shape.ndims)
+  if matrix_shape[0] != matrix_shape[1]:
+    raise ValueError("matrix must be be square (instead has shape (%d,%d))" %
+                     (matrix_shape[0], matrix_shape[1]))
+  dimension = matrix_shape[0].value
+  if dimension is None:
+    raise ValueError("matrix must have fully-known shape")
+
+  def while_loop_condition(iteration, matrix, inactive, old_inactive):
+    """Returns false if the while loop should terminate."""
+    del matrix  # Needed by the body, but not the condition.
+    not_done = (iteration < dimension)
+    not_converged = standard_ops.reduce_any(
+        standard_ops.not_equal(inactive, old_inactive))
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, matrix, inactive, old_inactive):
+    """Performs one iteration of the projection."""
+    del old_inactive  # Needed by the condition, but not the body.
+    iteration += 1
+    scale = (1.0 - standard_ops.reduce_sum(
+        matrix, axis=0, keep_dims=True)) / standard_ops.maximum(
+            1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True))
+    matrix += scale * inactive
+    new_inactive = standard_ops.to_float(matrix > 0)
+    matrix *= new_inactive
+    return (iteration, matrix, new_inactive, inactive)
+
+  iteration = standard_ops.constant(0)
+  inactive = standard_ops.ones_like(matrix)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, matrix, inactive, old_inactive = while_loop_body(
+      iteration, matrix, inactive, inactive)
+  iteration, matrix, inactive, old_inactive = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, matrix, inactive, old_inactive),
+      name="euclidean_projection")
+
+  return matrix
+
+
+def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix):
+  """Projects its argument onto the set of log-left-stochastic matrices.
+
+  Args:
+    log_matrix: 2d square tensor, the element-wise logarithm of the matrix to
+      project.
+
+  Returns:
+    The 2d square tensor that results from projecting exp(`matrix`) onto the set
+      of left-stochastic matrices w.r.t. the KL-divergence applied column-wise.
+  """
+
+  # For numerical reasons, make sure that the largest matrix element is zero
+  # before exponentiating.
+  log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True)
+  log_matrix -= standard_ops.log(
+      standard_ops.reduce_sum(
+          standard_ops.exp(log_matrix), axis=0, keep_dims=True))
+  return log_matrix
+
+
+@six.add_metaclass(abc.ABCMeta)
+class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
+  """Base class representing a `_SwapRegretOptimizer`.
+
+  This class contains most of the logic for performing constrained optimization,
+  minimizing external regret for the constraints player. What it *doesn't* do is
+  keep track of the internal state (the stochastic matrix).  Instead, the state
+  is accessed via the _initial_state(), _stochastic_matrix(),
+  _constraint_grad_and_var() and _projection_op() methods.
+
+  The reason for this is that we want to make it easy to implement different
+  representations of the internal state. For example, for additive updates, it's
+  most natural to store the stochastic matrix directly, whereas for
+  multiplicative updates, it's most natural to store its element-wise logarithm.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by `_SwapRegretOptimizer`s can be found in Definition 2,
+  and is discussed in Section 4. Such optimizers are most similar to Algorithm
+  2 in Section 4. Most notably, the internal state is a left-stochastic matrix
+  of shape (m+1,m+1), where m is the number of constraints.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `_SwapRegretOptimizer`.
+
+    The difference between `optimizer` and `constraint_optimizer` (if the latter
+    is provided) is that the former is used for learning the model parameters,
+    while the latter us used for the update to the constraint/objective weight
+    matrix (the analogue of Lagrange multipliers). If no `constraint_optimizer`
+    is provided, then `optimizer` is used for both.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+
+    Returns:
+      A new `_SwapRegretOptimizer`.
+    """
+    super(_SwapRegretOptimizer, self).__init__(optimizer=optimizer)
+    self._constraint_optimizer = constraint_optimizer
+
+  @property
+  def constraint_optimizer(self):
+    """Returns the `tf.train.Optimizer` used for the matrix."""
+    return self._constraint_optimizer
+
+  @abc.abstractmethod
+  def _initial_state(self, num_constraints):
+    pass
+
+  @abc.abstractmethod
+  def _stochastic_matrix(self, state):
+    pass
+
+  def _distribution(self, state):
+    distribution = _maximal_eigenvector_power_method(
+        self._stochastic_matrix(state))
+    distribution = standard_ops.abs(distribution)
+    distribution /= standard_ops.reduce_sum(distribution)
+    return distribution
+
+  @abc.abstractmethod
+  def _constraint_grad_and_var(self, state, gradient):
+    pass
+
+  @abc.abstractmethod
+  def _projection_op(self, state, name=None):
+    pass
+
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    The `optimizer` constructor parameter will be used to update the model
+    parameters, while the constraint/objective weight matrix (the analogue of
+    Lagrange multipliers) will be updated using `constrained_optimizer` (if
+    provided) or `optimizer` (if not). Whether the matrix updates are additive
+    or multiplicative depends on the derived class.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    objective = minimization_problem.objective
+
+    constraints = minimization_problem.constraints
+    proxy_constraints = minimization_problem.proxy_constraints
+    if proxy_constraints is None:
+      proxy_constraints = constraints
+    # Flatten both constraints tensors to 1d.
+    num_constraints = minimization_problem.num_constraints
+    constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
+    proxy_constraints = standard_ops.reshape(
+        proxy_constraints, shape=(num_constraints,))
+
+    # We use a lambda to initialize the state so that, if this function call is
+    # inside the scope of a tf.control_dependencies() block, the dependencies
+    # will not be applied to the initializer.
+    state = standard_ops.Variable(
+        lambda: self._initial_state(num_constraints),
+        trainable=False,
+        name="swap_regret_optimizer_state")
+
+    zero_and_constraints = standard_ops.concat(
+        (standard_ops.zeros((1,)), constraints), axis=0)
+    objective_and_proxy_constraints = standard_ops.concat(
+        (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0)
+
+    distribution = self._distribution(state)
+    loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints,
+                                  1)
+    matrix_gradient = standard_ops.matmul(
+        standard_ops.expand_dims(zero_and_constraints, 1),
+        standard_ops.expand_dims(distribution, 0))
+
+    update_ops = []
+    if self.constraint_optimizer is None:
+      # If we don't have a separate constraint_optimizer, then we use
+      # self._optimizer for both the update of the model parameters, and that of
+      # the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      grads_and_vars.append(
+          self._constraint_grad_and_var(state, matrix_gradient))
+      update_ops.append(
+          self.optimizer.apply_gradients(grads_and_vars, name="update"))
+    else:
+      # If we have a separate constraint_optimizer, then we use self._optimizer
+      # for the update of the model parameters, and self._constraint_optimizer
+      # for that of the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      matrix_grads_and_vars = [
+          self._constraint_grad_and_var(state, matrix_gradient)
+      ]
+
+      gradients = [
+          gradient for gradient, _ in grads_and_vars + matrix_grads_and_vars
+          if gradient is not None
+      ]
+      with ops.control_dependencies(gradients):
+        update_ops.append(
+            self.optimizer.apply_gradients(grads_and_vars, name="update"))
+        update_ops.append(
+            self.constraint_optimizer.apply_gradients(
+                matrix_grads_and_vars, name="optimizer_state_update"))
+
+    with ops.control_dependencies(update_ops):
+      if global_step is None:
+        # If we don't have a global step, just project, and we're done.
+        return self._projection_op(state, name=name)
+      else:
+        # If we have a global step, then we need to increment it in addition to
+        # projecting.
+        projection_op = self._projection_op(state, name="project")
+        with ops.colocate_with(global_step):
+          global_step_op = state_ops.assign_add(
+              global_step, 1, name="global_step_increment")
+        return control_flow_ops.group(projection_op, global_step_op, name=name)
+
+
+class AdditiveSwapRegretOptimizer(_SwapRegretOptimizer):
+  """A `ConstrainedOptimizer` based on swap-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over constraint/objective
+  weight matrix (the analogue of Lagrange multipliers), with the latter
+  maximization using additive updates and an algorithm that minimizes swap
+  regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer can be found in Definition 2, and is
+  discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with
+  the differences being that it uses `tf.train.Optimizer`s, instead of SGD, for
+  the "inner" updates, and performs additive (instead of multiplicative) updates
+  of the stochastic matrix.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `AdditiveSwapRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+
+    Returns:
+      A new `AdditiveSwapRegretOptimizer`.
+    """
+    # TODO(acotter): add a parameter determining the initial values of the
+    # matrix elements (like initial_multiplier_radius in
+    # MultiplicativeSwapRegretOptimizer).
+    super(AdditiveSwapRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+  def _initial_state(self, num_constraints):
+    # For an AdditiveSwapRegretOptimizer, the internal state is a tensor of
+    # shape (m+1,m+1), where m is the number of constraints, representing a
+    # left-stochastic matrix.
+    dimension = num_constraints + 1
+    # Initialize by putting all weight on the objective, and none on the
+    # constraints.
+    return standard_ops.concat(
+        (standard_ops.ones(
+            (1, dimension)), standard_ops.zeros((dimension - 1, dimension))),
+        axis=0)
+
+  def _stochastic_matrix(self, state):
+    return state
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      return state_ops.assign(
+          state,
+          _project_stochastic_matrix_wrt_euclidean_norm(state),
+          name=name)
+
+
+class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer):
+  """A `ConstrainedOptimizer` based on swap-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over constraint/objective
+  weight matrix (the analogue of Lagrange multipliers), with the latter
+  maximization using multiplicative updates and an algorithm that minimizes swap
+  regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer can be found in Definition 2, and is
+  discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with
+  the difference being that it uses `tf.train.Optimizer`s, instead of SGD, for
+  the "inner" updates.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               minimum_multiplier_radius=1e-3,
+               initial_multiplier_radius=None):
+    """Constructs a new `MultiplicativeSwapRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+      minimum_multiplier_radius: float, each element of the matrix will be lower
+        bounded by `minimum_multiplier_radius` divided by one plus the number of
+        constraints.
+      initial_multiplier_radius: float, the initial value of each element of the
+        matrix associated with a constraint (i.e. excluding those elements
+        associated with the objective) will be `initial_multiplier_radius`
+        divided by one plus the number of constraints. Defaults to the value of
+        `minimum_multiplier_radius`.
+
+    Returns:
+      A new `MultiplicativeSwapRegretOptimizer`.
+
+    Raises:
+      ValueError: If the two radius parameters are inconsistent.
+    """
+    super(MultiplicativeSwapRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+    if (minimum_multiplier_radius <= 0.0) or (minimum_multiplier_radius >= 1.0):
+      raise ValueError("minimum_multiplier_radius must be in the range (0,1)")
+    if initial_multiplier_radius is None:
+      initial_multiplier_radius = minimum_multiplier_radius
+    elif (initial_multiplier_radius <
+          minimum_multiplier_radius) or (minimum_multiplier_radius > 1.0):
+      raise ValueError("initial_multiplier_radius must be in the range "
+                       "[minimum_multiplier_radius,1]")
+
+    self._minimum_multiplier_radius = minimum_multiplier_radius
+    self._initial_multiplier_radius = initial_multiplier_radius
+
+  def _initial_state(self, num_constraints):
+    # For a MultiplicativeSwapRegretOptimizer, the internal state is a tensor of
+    # shape (m+1,m+1), where m is the number of constraints, representing the
+    # element-wise logarithm of a left-stochastic matrix.
+    dimension = num_constraints + 1
+    # Initialize by putting as much weight as possible on the objective, and as
+    # little as possible on the constraints.
+    log_initial_one = math.log(1.0 - (self._initial_multiplier_radius *
+                                      (dimension - 1) / (dimension)))
+    log_initial_zero = math.log(self._initial_multiplier_radius / dimension)
+    return standard_ops.concat(
+        (standard_ops.constant(
+            log_initial_one, dtype=dtypes.float32, shape=(1, dimension)),
+         standard_ops.constant(
+             log_initial_zero,
+             dtype=dtypes.float32,
+             shape=(dimension - 1, dimension))),
+        axis=0)
+
+  def _stochastic_matrix(self, state):
+    return standard_ops.exp(state)
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      # Gets the dimension of the state (num_constraints + 1)--all of these
+      # assertions are of things that should be impossible, since the state
+      # passed into this method will have the same shape as that returned by
+      # _initial_state().
+      state_shape = state.get_shape()
+      assert state_shape is not None
+      assert state_shape.ndims == 2
+      assert state_shape[0] == state_shape[1]
+      dimension = state_shape[0].value
+      assert dimension is not None
+
+      minimum_log_multiplier = standard_ops.log(
+          self._minimum_multiplier_radius / standard_ops.to_float(dimension))
+
+      return state_ops.assign(
+          state,
+          standard_ops.maximum(
+              _project_log_stochastic_matrix_wrt_kl_divergence(state),
+              minimum_log_multiplier),
+          name=name)
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..34c4543dca97e12c8335e4c90b849820edaefa81
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
@@ -0,0 +1,212 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.swap_regret_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import swap_regret_optimizer
+from tensorflow.contrib.constrained_optimization.python import test_util
+
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class AdditiveSwapRegretOptimizerWrapper(
+    swap_regret_optimizer.AdditiveSwapRegretOptimizer):
+  """Testing wrapper class around AdditiveSwapRegretOptimizer.
+
+  This class is identical to AdditiveSwapRegretOptimizer, except that it caches
+  the internal optimization state when _stochastic_matrix() is called, so that
+  we can test that the stochastic matrices take on their expected values.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Same as AdditiveSwapRegretOptimizer.__init__()."""
+    super(AdditiveSwapRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+    self._cached_stochastic_matrix = None
+
+  @property
+  def stochastic_matrix(self):
+    """Returns the cached stochastic matrix."""
+    return self._cached_stochastic_matrix
+
+  def _stochastic_matrix(self, state):
+    """Caches the internal state for testing."""
+    self._cached_stochastic_matrix = super(AdditiveSwapRegretOptimizerWrapper,
+                                           self)._stochastic_matrix(state)
+    return self._cached_stochastic_matrix
+
+
+class MultiplicativeSwapRegretOptimizerWrapper(
+    swap_regret_optimizer.MultiplicativeSwapRegretOptimizer):
+  """Testing wrapper class around MultiplicativeSwapRegretOptimizer.
+
+  This class is identical to MultiplicativeSwapRegretOptimizer, except that it
+  caches the internal optimization state when _stochastic_matrix() is called, so
+  that we can test that the stochastic matrices take on their expected values.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               minimum_multiplier_radius=None,
+               initial_multiplier_radius=None):
+    """Same as MultiplicativeSwapRegretOptimizer.__init__()."""
+    super(MultiplicativeSwapRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer,
+        constraint_optimizer=constraint_optimizer,
+        minimum_multiplier_radius=1e-3,
+        initial_multiplier_radius=initial_multiplier_radius)
+    self._cached_stochastic_matrix = None
+
+  @property
+  def stochastic_matrix(self):
+    """Returns the cached stochastic matrix."""
+    return self._cached_stochastic_matrix
+
+  def _stochastic_matrix(self, state):
+    """Caches the internal state for testing."""
+    self._cached_stochastic_matrix = super(
+        MultiplicativeSwapRegretOptimizerWrapper,
+        self)._stochastic_matrix(state)
+    return self._cached_stochastic_matrix
+
+
+class SwapRegretOptimizerTest(test.TestCase):
+
+  def test_maximum_eigenvector_power_method(self):
+    """Tests power method routine on some known left-stochastic matrices."""
+    matrix1 = np.matrix([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], [0.4, 0.3, 0.0]])
+    matrix2 = np.matrix([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], [0.4, 0.5, 0.3]])
+
+    with self.test_session() as session:
+      eigenvector1 = session.run(
+          swap_regret_optimizer._maximal_eigenvector_power_method(
+              standard_ops.constant(matrix1)))
+      eigenvector2 = session.run(
+          swap_regret_optimizer._maximal_eigenvector_power_method(
+              standard_ops.constant(matrix2)))
+
+    # Check that eigenvector1 and eigenvector2 are eigenvectors of matrix1 and
+    # matrix2 (respectively) with associated eigenvalue 1.
+    matrix_eigenvector1 = np.tensordot(matrix1, eigenvector1, axes=1)
+    matrix_eigenvector2 = np.tensordot(matrix2, eigenvector2, axes=1)
+    self.assertAllClose(eigenvector1, matrix_eigenvector1, rtol=0, atol=1e-6)
+    self.assertAllClose(eigenvector2, matrix_eigenvector2, rtol=0, atol=1e-6)
+
+  def test_project_stochastic_matrix_wrt_euclidean_norm(self):
+    """Tests Euclidean projection routine on some known values."""
+    matrix = standard_ops.constant([[-0.1, -0.1, 0.4], [-0.8, 0.4, 1.2],
+                                    [-0.3, 0.1, 0.2]])
+    expected_projected_matrix = np.array([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9],
+                                          [0.4, 0.3, 0.0]])
+
+    with self.test_session() as session:
+      projected_matrix = session.run(
+          swap_regret_optimizer._project_stochastic_matrix_wrt_euclidean_norm(
+              matrix))
+
+    self.assertAllClose(
+        expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6)
+
+  def test_project_log_stochastic_matrix_wrt_kl_divergence(self):
+    """Tests KL-divergence projection routine on some known values."""
+    matrix = standard_ops.constant([[0.2, 0.8, 0.6], [0.1, 0.2, 1.5],
+                                    [0.2, 1.0, 0.9]])
+    expected_projected_matrix = np.array([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5],
+                                          [0.4, 0.5, 0.3]])
+
+    with self.test_session() as session:
+      projected_matrix = session.run(
+          standard_ops.exp(
+              swap_regret_optimizer.
+              _project_log_stochastic_matrix_wrt_kl_divergence(
+                  standard_ops.log(matrix))))
+
+    self.assertAllClose(
+        expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6)
+
+  def test_additive_swap_regret_optimizer(self):
+    """Tests that the stochastic matrices update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = AdditiveSwapRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0))
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    # Calculated using a numpy+python implementation of the algorithm.
+    expected_matrices = [
+        np.array([[1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]),
+        np.array([[0.66666667, 1.0, 1.0, 1.0], [0.26666667, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 0.0, 0.0], [0.06666667, 0.0, 0.0, 0.0]]),
+        np.array([[0.41666667, 0.93333333, 1.0,
+                   0.98333333], [0.46666667, 0.05333333, 0.0,
+                                 0.01333333], [0.0, 0.0, 0.0, 0.0],
+                  [0.11666667, 0.01333333, 0.0, 0.00333333]]),
+    ]
+
+    matrices = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(matrices) < len(expected_matrices):
+        matrices.append(session.run(optimizer.stochastic_matrix))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_matrices, matrices):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+  def test_multiplicative_swap_regret_optimizer(self):
+    """Tests that the stochastic matrices update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = MultiplicativeSwapRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0),
+        initial_multiplier_radius=0.8)
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    # Calculated using a numpy+python implementation of the algorithm.
+    expected_matrices = [
+        np.array([[0.4, 0.4, 0.4, 0.4], [0.2, 0.2, 0.2, 0.2],
+                  [0.2, 0.2, 0.2, 0.2], [0.2, 0.2, 0.2, 0.2]]),
+        np.array([[0.36999014, 0.38528351, 0.38528351, 0.38528351], [
+            0.23517483, 0.21720297, 0.21720297, 0.21720297
+        ], [0.17774131, 0.18882719, 0.18882719, 0.18882719],
+                  [0.21709373, 0.20868632, 0.20868632, 0.20868632]]),
+        np.array([[0.33972109, 0.36811863, 0.37118462, 0.36906575], [
+            0.27114826, 0.23738228, 0.23376693, 0.23626491
+        ], [0.15712313, 0.17641793, 0.17858959, 0.17708679],
+                  [0.23200752, 0.21808115, 0.21645886, 0.21758255]]),
+    ]
+
+    matrices = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(matrices) < len(expected_matrices):
+        matrices.append(session.run(optimizer.stochastic_matrix))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_matrices, matrices):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/test_util.py b/tensorflow/contrib/constrained_optimization/python/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..704b36ca4c9cf94e7c304f9bed4f6ac7ca275deb
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/test_util.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains helpers used by tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.constrained_optimization.python import constrained_minimization_problem
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import standard_ops
+
+
+class ConstantMinimizationProblem(
+    constrained_minimization_problem.ConstrainedMinimizationProblem):
+  """A `ConstrainedMinimizationProblem` with constant constraint violations.
+
+  This minimization problem is intended for use in performing simple tests of
+  the Lagrange multiplier (or equivalent) update in the optimizers. There is a
+  one-element "dummy" model parameter, but it should be ignored.
+  """
+
+  def __init__(self, constraints):
+    """Constructs a new `ConstantMinimizationProblem'.
+
+    Args:
+      constraints: 1d numpy array, the constant constraint violations.
+
+    Returns:
+      A new `ConstantMinimizationProblem'.
+    """
+    # We make an fake 1-parameter linear objective so that we don't get a "no
+    # variables to optimize" error.
+    self._objective = standard_ops.Variable(0.0, dtype=dtypes.float32)
+    self._constraints = standard_ops.constant(constraints, dtype=dtypes.float32)
+
+  @property
+  def objective(self):
+    """Returns the objective function."""
+    return self._objective
+
+  @property
+  def constraints(self):
+    """Returns the constant constraint violations."""
+    return self._constraints
diff --git a/tensorflow/contrib/copy_graph/BUILD b/tensorflow/contrib/copy_graph/BUILD
index 8ec706df74e2c91345c4bf7a506fdb424a996773..fa44c4d54e1ee871feb425115525b1cf8b732214 100644
--- a/tensorflow/contrib/copy_graph/BUILD
+++ b/tensorflow/contrib/copy_graph/BUILD
@@ -41,15 +41,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index b806799202bff4f2f6dbf717fbeea74a04b8cd6e..102bc460fdadb0ad5dc9a2960b8655c55357108e 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
     #stores String-based info such as name, device and type of the op.
     #Unique to every Operation instance.
-    new_node_def = deepcopy(op._node_def)
+    new_node_def = deepcopy(op.node_def)
     #Change the name
     new_node_def.name = new_name
 
@@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
 
     #Make a copy of the op_def too.
     #Its unique to every _type_ of Operation.
-    op_def = deepcopy(op._op_def)
+    op_def = deepcopy(op.op_def)
 
     #Initialize a new Operation instance
     new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
diff --git a/tensorflow/contrib/crf/BUILD b/tensorflow/contrib/crf/BUILD
index 7aad4abdb908d0284b85137bff842bd0f38d09c6..5c1a17df4f95f3c4d05b286de0e3d7b009a76bd7 100644
--- a/tensorflow/contrib/crf/BUILD
+++ b/tensorflow/contrib/crf/BUILD
@@ -40,15 +40,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 721dc4d0801d1f0e116921888e3851a95e0b72b0..74f2ec22ffaab1654e5cd38169258fb87d307ad4 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -152,6 +152,22 @@ class CrfTest(test.TestCase):
 
         self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)
 
+  def testCrfLogNormZeroSeqLength(self):
+    """
+    Test `crf_log_norm` when `sequence_lengths` contains one or more zeros.
+    """
+    with self.test_session() as sess:
+      inputs = constant_op.constant(np.ones([2, 10, 5],
+                                            dtype=np.float32))
+      transition_params = constant_op.constant(np.ones([5, 5],
+                                                       dtype=np.float32))
+      sequence_lengths = constant_op.constant(np.zeros([2],
+                                                       dtype=np.int32))
+      expected_log_norm = np.zeros([2], dtype=np.float32)
+      log_norm = crf.crf_log_norm(inputs, sequence_lengths, transition_params)
+      tf_log_norm = sess.run(log_norm)
+      self.assertAllClose(tf_log_norm, expected_log_norm)
+
   def testCrfLogLikelihood(self):
     inputs = np.array(
         [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
@@ -281,6 +297,21 @@ class CrfTest(test.TestCase):
         self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
                          expected_max_sequence[:sequence_lengths])
 
+  def testCrfDecodeZeroSeqLength(self):
+    """
+    Test that crf_decode works when sequence_length contains one or more zeros.
+    """
+    with self.test_session() as sess:
+      inputs = constant_op.constant(np.ones([2, 10, 5],
+                                            dtype=np.float32))
+      transition_params = constant_op.constant(np.ones([5, 5],
+                                                       dtype=np.float32))
+      sequence_lengths = constant_op.constant(np.zeros([2],
+                                                       dtype=np.int32))
+      tags, scores = crf.crf_decode(inputs, transition_params, sequence_lengths)
+      tf_tags, tf_scores = sess.run([tags, scores])
+      self.assertEqual(len(tf_tags.shape), 2)
+      self.assertEqual(len(tf_scores.shape), 1)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index a30bf06396117034f7cfe461d5e365b8a4a38a3f..d2beff849eb8d177b9a4e8e6fea1943a17748fd3 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -90,9 +90,13 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
     batch_size = array_ops.shape(inputs, out_type=tag_indices.dtype)[0]
     example_inds = array_ops.reshape(
         math_ops.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
-    return array_ops.gather_nd(
+    sequence_scores = array_ops.gather_nd(
         array_ops.squeeze(inputs, [1]),
         array_ops.concat([example_inds, tag_indices], axis=1))
+    sequence_scores = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
+                                      array_ops.zeros_like(sequence_scores),
+                                      sequence_scores)
+    return sequence_scores
 
   def _multi_seq_fn():
     # Compute the scores of the given tag sequence.
@@ -128,7 +132,12 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
   # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over
   # the "initial state" (the unary potentials).
   def _single_seq_fn():
-    return math_ops.reduce_logsumexp(first_input, [1])
+    log_norm = math_ops.reduce_logsumexp(first_input, [1])
+    # Mask `log_norm` of the sequences with length <= zero.
+    log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
+                               array_ops.zeros_like(log_norm),
+                               log_norm)
+    return log_norm
 
   def _multi_seq_fn():
     """Forward computation of alpha values."""
@@ -137,13 +146,19 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
     # Compute the alpha values in the forward algorithm in order to get the
     # partition function.
     forward_cell = CrfForwardRnnCell(transition_params)
+    # Sequence length is not allowed to be less than zero.
+    sequence_lengths_less_one = math_ops.maximum(0, sequence_lengths - 1)
     _, alphas = rnn.dynamic_rnn(
         cell=forward_cell,
         inputs=rest_of_input,
-        sequence_length=sequence_lengths - 1,
+        sequence_length=sequence_lengths_less_one,
         initial_state=first_input,
         dtype=dtypes.float32)
     log_norm = math_ops.reduce_logsumexp(alphas, [1])
+    # Mask `log_norm` of the sequences with length <= zero.
+    log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
+                               array_ops.zeros_like(log_norm),
+                               log_norm)
     return log_norm
 
   max_seq_len = array_ops.shape(inputs)[1]
@@ -479,15 +494,17 @@ def crf_decode(potentials, transition_params, sequence_length):
     initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
     initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
     inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    # Sequence length is not allowed to be less than zero.
+    sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
     backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
         crf_fwd_cell,
         inputs=inputs,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
     backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
-        backpointers, sequence_length - 1, seq_dim=1)
+        backpointers, sequence_length_less_one, seq_dim=1)
 
     # Computes backward decoding. Extract tag indices from backpointers.
     crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
@@ -497,7 +514,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
         crf_bwd_cell,
         inputs=backpointers,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
@@ -511,7 +528,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     return decode_tags, best_score
 
   return utils.smart_cond(
-      pred=math_ops.equal(
-          potentials.shape[1].value or array_ops.shape(potentials)[1], 1),
+      pred=math_ops.equal(potentials.shape[1].value or
+                          array_ops.shape(potentials)[1], 1),
       true_fn=_single_seq_fn,
       false_fn=_multi_seq_fn)
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index fec358c4e1067dc8dc8173d1b9d05dc90b90ca05..aeefa3cee62281c74388765ea5e2cbc7f16ff927 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -9,52 +9,10 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-tf_custom_op_library(
-    name = "python/ops/_cudnn_rnn_ops.so",
-    srcs = [
-        "kernels/cudnn_rnn_ops.cc",
-        "ops/cudnn_rnn_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/core/kernels:bounds_check_lib",
-        "@farmhash_archive//:farmhash",
-    ],
-)
-
-tf_kernel_library(
-    name = "cudnn_rnn_kernels",
-    srcs = ["kernels/cudnn_rnn_ops.cc"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:stream_executor",
-        "//tensorflow/core/kernels:bounds_check_lib",
-        "//third_party/eigen3",
-        "@farmhash_archive//:farmhash",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["cudnn_rnn_ops"],
-    deps = [
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "cudnn_rnn_ops",
-    deps = [":cudnn_rnn_ops_op_lib"],
-)
 
 tf_custom_op_py_library(
     name = "cudnn_rnn_py",
@@ -64,20 +22,14 @@ tf_custom_op_py_library(
         "python/layers/cudnn_rnn.py",
         "python/ops/cudnn_rnn_ops.py",
     ],
-    dso = [
-        ":python/ops/_cudnn_rnn_ops.so",
-    ],
-    kernels = [
-        ":cudnn_rnn_kernels",
-        ":cudnn_rnn_ops_op_lib",
-    ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":cudnn_rnn_ops",
+        "//tensorflow/contrib/checkpoint/python:split_dependency",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:cudnn_rnn_ops_gen",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
@@ -172,32 +124,3 @@ cuda_py_test(
         "requires_cudnn5",
     ],
 )
-
-tf_cc_test(
-    name = "cudnn_rnn_ops_test_cc",
-    size = "small",
-    srcs = [
-        "ops/cudnn_rnn_ops_test.cc",
-    ],
-    deps = [
-        ":cudnn_rnn_ops_op_lib",
-        "//tensorflow/core",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
deleted file mode 100644
index ba9686e94ee7072cc485c955decb2287bd4a56f3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ /dev/null
@@ -1,1145 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#define EIGEN_USE_THREADS
-
-#include <stddef.h>
-#include <atomic>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include <string>
-#include <unordered_set>
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/device_base.h"
-#include "tensorflow/core/framework/kernel_def_builder.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def_builder.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/fingerprint.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/env_var.h"
-
-#if GOOGLE_CUDA
-#include "tensorflow/core/platform/stream_executor.h"
-#include "tensorflow/core/util/stream_executor_util.h"
-#endif  // GOOGLE_CUDA
-
-/*
- * This module implements ops that fuse a multi-layer multi-step RNN/LSTM model
- * using the underlying Cudnn library.
- *
- * Cudnn RNN library exposes an opaque parameter buffer with unknown layout and
- * format. And it is very likely that if saved, they cannot be used across
- * different GPUs. So users need to first query the size of the opaque
- * parameter buffer, and convert it to and from its canonical forms. But each
- * actual training step is carried out with the parameter buffer.
- *
- * Similar to many other ops, the forward op has two flavors: training and
- * inference. When training is specified, additional data in reserve_space will
- * be produced for the backward pass. So there is a performance penalty.
- *
- * In addition to the actual data and reserve_space, Cudnn also needs more
- * memory as temporary workspace. The memory management to and from
- * stream-executor is done through ScratchAllocator. In general,
- * stream-executor is responsible for creating the memory of proper size. And
- * TensorFlow is responsible for making sure the memory is alive long enough
- * and recycles afterwards.
- *
- */
-namespace tensorflow {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-#if GOOGLE_CUDA
-
-using GPUDevice = Eigen::GpuDevice;
-
-template <typename Device, typename T, typename Index>
-class CudnnRNNParamsSizeOp;
-
-template <typename Device, typename T>
-class CudnnRNNParamsToCanonical;
-
-template <typename Device, typename T>
-class CudnnRNNCanonicalToParams;
-
-template <typename Device, typename T>
-class CudnnRNNForwardOp;
-
-template <typename Device, typename T>
-class CudnnRNNBackwardOp;
-
-enum class TFRNNInputMode {
-  kRNNLinearInput = 0,
-  kRNNSkipInput = 1,
-  kAutoSelect = 9999999
-};
-
-namespace {
-using perftools::gputools::DeviceMemory;
-using perftools::gputools::DeviceMemoryBase;
-using perftools::gputools::ScratchAllocator;
-using perftools::gputools::dnn::RnnDirectionMode;
-using perftools::gputools::dnn::RnnInputMode;
-using perftools::gputools::dnn::RnnMode;
-using perftools::gputools::dnn::ToDataType;
-using perftools::gputools::port::StatusOr;
-
-Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
-  if (str == "rnn_relu") {
-    *rnn_mode = RnnMode::kRnnRelu;
-    return Status::OK();
-  } else if (str == "rnn_tanh") {
-    *rnn_mode = RnnMode::kRnnTanh;
-    return Status::OK();
-  } else if (str == "lstm") {
-    *rnn_mode = RnnMode::kRnnLstm;
-    return Status::OK();
-  } else if (str == "gru") {
-    *rnn_mode = RnnMode::kRnnGru;
-    return Status::OK();
-  }
-  return errors::InvalidArgument("Invalid RNN mode: ", str);
-}
-
-Status ParseTFRNNInputMode(const string& str, TFRNNInputMode* rnn_input_mode) {
-  if (str == "linear_input") {
-    *rnn_input_mode = TFRNNInputMode::kRNNLinearInput;
-    return Status::OK();
-  } else if (str == "skip_input") {
-    *rnn_input_mode = TFRNNInputMode::kRNNSkipInput;
-    return Status::OK();
-  } else if (str == "auto_select") {
-    *rnn_input_mode = TFRNNInputMode::kAutoSelect;
-    return Status::OK();
-  }
-  return errors::InvalidArgument("Invalid RNN input mode: ", str);
-}
-
-Status ParseRNNDirectionMode(const string& str,
-                             RnnDirectionMode* rnn_dir_mode) {
-  if (str == "unidirectional") {
-    *rnn_dir_mode = RnnDirectionMode::kRnnUnidirectional;
-    return Status::OK();
-  } else if (str == "bidirectional") {
-    *rnn_dir_mode = RnnDirectionMode::kRnnBidirectional;
-    return Status::OK();
-  }
-  return errors::InvalidArgument("Invalid RNN direction mode: ", str);
-}
-
-Status ToRNNInputMode(TFRNNInputMode tf_input_mode, int num_units,
-                      int input_size, RnnInputMode* input_mode) {
-  switch (tf_input_mode) {
-    case TFRNNInputMode::kRNNLinearInput:
-      *input_mode = RnnInputMode::kRnnLinearSkip;
-      break;
-    case TFRNNInputMode::kRNNSkipInput:
-      *input_mode = RnnInputMode::kRnnSkipInput;
-      break;
-    case TFRNNInputMode::kAutoSelect:
-      *input_mode = (input_size == num_units) ? RnnInputMode::kRnnSkipInput
-                                              : RnnInputMode::kRnnLinearSkip;
-      break;
-    default:
-      return errors::InvalidArgument("Invalid TF input mode: ",
-                                     static_cast<int>(tf_input_mode));
-  }
-  return Status::OK();
-}
-
-// TODO(zhengxq): Merge those into stream_executor_util.h.
-template <typename T>
-const DeviceMemory<T> AsDeviceMemory(const Tensor* tensor) {
-  return DeviceMemory<T>::MakeFromByteSize(
-      const_cast<T*>(tensor->template flat<T>().data()),
-      tensor->template flat<T>().size() * sizeof(T));
-}
-
-template <typename T>
-DeviceMemory<T> AsDeviceMemory(Tensor* tensor) {
-  return DeviceMemory<T>::MakeFromByteSize(
-      tensor->template flat<T>().data(),
-      tensor->template flat<T>().size() * sizeof(T));
-}
-
-template <typename U, typename T>
-DeviceMemory<U> CastDeviceMemory(Tensor* tensor) {
-  return DeviceMemory<U>::MakeFromByteSize(
-      tensor->template flat<T>().data(),
-      tensor->template flat<T>().size() * sizeof(T));
-}
-
-DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
-                                   int64 offset, int64 size) {
-  const void* base_ptr = device_memory.opaque();
-  void* offset_ptr =
-      const_cast<char*>(reinterpret_cast<const char*>(base_ptr) + offset);
-  CHECK(offset + size <= device_memory.size())
-      << "The slice is not within the region of DeviceMemory.";
-  return DeviceMemoryBase(offset_ptr, size);
-}
-
-inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
-  return s.ok() ? Status::OK()
-                : Status(static_cast<tensorflow::error::Code>(
-                             static_cast<int>(s.code())),
-                         s.error_message());
-}
-
-template <typename T>
-inline Status FromExecutorStatus(
-    const perftools::gputools::port::StatusOr<T>& s) {
-  return FromExecutorStatus(s.status());
-}
-
-inline perftools::gputools::port::Status ToExecutorStatus(const Status& s) {
-  return s.ok() ? perftools::gputools::port::Status::OK()
-                : perftools::gputools::port::Status(
-                      static_cast<perftools::gputools::port::error::Code>(
-                          static_cast<int>(s.code())),
-                      s.error_message());
-}
-
-// A helper to allocate temporary scratch memory for Cudnn RNN models. It takes
-// the ownership of the underlying memory. The expectation is that the memory
-// should be alive for the span of the Cudnn RNN itself.
-class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
- public:
-  ~CudnnRNNWorkspaceAllocator() override {}
-  explicit CudnnRNNWorkspaceAllocator(OpKernelContext* context)
-      : context_(context) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
-    return std::numeric_limits<int64>::max();
-  }
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      perftools::gputools::Stream* stream, int64 byte_size) override {
-    Tensor temporary_memory;
-    Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return ToExecutorStatus(allocation_status);
-    }
-    // Hold the reference of the allocated tensors until the end of the
-    // allocator.
-    allocated_tensors_.push_back(temporary_memory);
-    total_byte_size_ += byte_size;
-    return StatusOr<DeviceMemory<uint8>>(
-        AsDeviceMemory<uint8>(&temporary_memory));
-  }
-  int64 TotalByteSize() { return total_byte_size_; }
-
- private:
-  int64 total_byte_size_ = 0;
-  OpKernelContext* context_;  // not owned
-  std::vector<Tensor> allocated_tensors_;
-};
-
-// A helper to allocate reserve-space memory for Cudnn RNN models. The tensors
-// are allocated as a kernel output, and will be fed into the backward pass.
-// The memory is expected to live long enough after the backward pass is
-// finished.
-template <typename T>
-class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
- public:
-  ~CudnnRNNReserveSpaceAllocator() override {}
-  CudnnRNNReserveSpaceAllocator(OpKernelContext* context, int output_index)
-      : context_(context), output_index_(output_index) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
-    return std::numeric_limits<int64>::max();
-  }
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      perftools::gputools::Stream* stream, int64 byte_size) override {
-    CHECK(total_byte_size_ == 0)
-        << "Reserve space allocator can only be called once";
-    int64 allocate_count =
-        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
-
-    Tensor* temporary_memory = nullptr;
-    Status allocation_status(context_->allocate_output(
-        output_index_, TensorShape({allocate_count}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return ToExecutorStatus(allocation_status);
-    }
-    total_byte_size_ += byte_size;
-    auto memory_uint8 = DeviceMemory<uint8>::MakeFromByteSize(
-        temporary_memory->template flat<T>().data(),
-        temporary_memory->template flat<T>().size() * sizeof(T));
-    return StatusOr<DeviceMemory<uint8>>(memory_uint8);
-  }
-  int64 TotalByteSize() { return total_byte_size_; }
-
- private:
-  int64 total_byte_size_ = 0;
-  OpKernelContext* context_;  // not owned
-  int output_index_;
-};
-
-// A helper to allocate persistent memory for Cudnn RNN models, which is
-// expected to live between kernel invocations.
-// This class is not thread-safe.
-class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
- public:
-  explicit CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
-      : context_(context) {}
-
-  ~CudnnRNNPersistentSpaceAllocator() override {}
-
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
-    return std::numeric_limits<int64>::max();
-  }
-
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      perftools::gputools::Stream* stream, int64 byte_size) override {
-    if (total_byte_size_ != 0) {
-      return Status(error::FAILED_PRECONDITION,
-                    "Persistent space allocator can only be called once");
-    }
-
-    Status allocation_status = context_->allocate_persistent(
-        DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
-    if (!allocation_status.ok()) {
-      return ToExecutorStatus(allocation_status);
-    }
-    total_byte_size_ += byte_size;
-    return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
-  }
-  int64 TotalByteSize() { return total_byte_size_; }
-
- private:
-  int64 total_byte_size_ = 0;
-  PersistentTensor handle_;
-  OpKernelContext* context_;  // not owned
-};
-
-struct CudnnModelTypes {
-  RnnMode rnn_mode;
-  TFRNNInputMode rnn_input_mode;
-  RnnDirectionMode rnn_direction_mode;
-  bool HasInputC() const {
-    // For Cudnn 5.0, only LSTM has input-c. All other models use only input-h.
-    return rnn_mode == RnnMode::kRnnLstm;
-  }
-};
-
-// A helper class that collects the shapes to describe a RNN model.
-struct CudnnModelShapes {
-  int num_layers;
-  int input_size;
-  int num_units;
-  int seq_length;
-  int batch_size;
-  int dir_count;
-  TensorShape input_shape;
-  TensorShape output_shape;
-  TensorShape hidden_state_shape;
-  // At present only fields related to cached RnnDescriptor are concerned.
-  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
-    return num_layers == rhs.num_layers && input_size == rhs.input_size &&
-           num_units == rhs.num_units && dir_count == rhs.dir_count;
-  }
-  string RnnDescDebugString() {
-    return strings::Printf(
-        "[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
-        num_layers, input_size, num_units, dir_count);
-  }
-};
-
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesHasher {
-  uint64 operator()(const CudnnModelShapes& to_hash) const {
-    uint64 hash = static_cast<uint64>(to_hash.num_layers);
-    hash = tensorflow::FingerprintCat64(
-        hash, static_cast<uint64>(to_hash.input_size));
-    hash = tensorflow::FingerprintCat64(hash,
-                                        static_cast<uint64>(to_hash.num_units));
-    return tensorflow::FingerprintCat64(hash,
-                                        static_cast<uint64>(to_hash.dir_count));
-  }
-};
-
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesComparator {
-  bool operator()(const CudnnModelShapes& first,
-                  const CudnnModelShapes& second) const {
-    return first.IsCompatibleWith(second);
-  }
-};
-
-// Extract and checks the forward input tensors, parameters, and shapes from the
-// OpKernelContext.
-Status ExtractForwardInput(OpKernelContext* context,
-                           const CudnnModelTypes& model_types,
-                           const Tensor** input, const Tensor** input_h,
-                           const Tensor** input_c, const Tensor** params,
-                           CudnnModelShapes* model_shapes) {
-  TF_RETURN_IF_ERROR(context->input("input", input));
-  TF_RETURN_IF_ERROR(context->input("input_h", input_h));
-  if (model_types.HasInputC()) {
-    TF_RETURN_IF_ERROR(context->input("input_c", input_c));
-  }
-  TF_RETURN_IF_ERROR(context->input("params", params));
-
-  if ((*input)->dims() != 3) {
-    return errors::InvalidArgument("RNN input must be a 3-D vector.");
-  }
-  model_shapes->seq_length = (*input)->dim_size(0);
-  model_shapes->batch_size = (*input)->dim_size(1);
-  model_shapes->input_size = (*input)->dim_size(2);
-  model_shapes->input_shape = (*input)->shape();
-  model_shapes->dir_count =
-      (model_types.rnn_direction_mode == RnnDirectionMode::kRnnBidirectional)
-          ? 2
-          : 1;
-
-  if ((*input_h)->dims() != 3) {
-    return errors::InvalidArgument("RNN input must be a 3-D vector.");
-  }
-  model_shapes->num_layers = (*input_h)->dim_size(0) / model_shapes->dir_count;
-  model_shapes->num_units = (*input_h)->dim_size(2);
-
-  model_shapes->hidden_state_shape =
-      TensorShape({model_shapes->dir_count * model_shapes->num_layers,
-                   model_shapes->batch_size, model_shapes->num_units});
-  if ((*input_h)->shape() != model_shapes->hidden_state_shape) {
-    return errors::InvalidArgument(
-        "Invalid input_h shape: ", (*input_h)->shape().DebugString(), " ",
-        model_shapes->hidden_state_shape.DebugString());
-  }
-  if (model_types.HasInputC()) {
-    if ((*input_h)->shape() != (*input_c)->shape()) {
-      return errors::InvalidArgument(
-          "input_h and input_c must have the same shape: ",
-          (*input_h)->shape().DebugString(), " ",
-          (*input_c)->shape().DebugString());
-    }
-  }
-  model_shapes->output_shape =
-      TensorShape({model_shapes->seq_length, model_shapes->batch_size,
-                   model_shapes->dir_count * model_shapes->num_units});
-  return Status::OK();
-}
-
-using perftools::gputools::dnn::RnnDescriptor;
-
-template <typename T>
-void RestoreParams(const OpInputList params_input,
-                   const std::vector<RnnDescriptor::ParamsRegion>& params,
-                   DeviceMemoryBase* data_dst,
-                   perftools::gputools::Stream* stream) {
-  int num_params = params.size();
-  CHECK(params_input.size() == num_params)
-      << "Number of params mismatch. Expected " << params_input.size()
-      << ", got " << num_params;
-  for (int i = 0; i < params.size(); i++) {
-    int64 size_in_bytes = params[i].size;
-    int64 size = size_in_bytes / sizeof(T);
-    CHECK(size == params_input[i].NumElements())
-        << "Params size mismatch. Expected " << size << ", got "
-        << params_input[i].NumElements();
-    auto data_src_ptr = StreamExecutorUtil::AsDeviceMemory<T>(params_input[i]);
-    DeviceMemoryBase data_dst_ptr =
-        SliceDeviceMemory(*data_dst, params[i].offset, size_in_bytes);
-    stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
-  }
-}
-
-}  // namespace
-
-// Note: all following kernels depend on a RnnDescriptor instance, which
-// according to Cudnn official doc should be kept around and reused across all
-// Cudnn kernels in the same model.
-// In Tensorflow, we don't pass the reference across different OpKernels,
-// rather, recreate it separately in each OpKernel, which does no cause issue:
-// CudnnDropoutDescriptor keeps a reference to a memory for
-// random number generator state. During recreation, this state is lost.
-// However, only forward-pass Cudnn APIs make use of the state.
-
-// A common base class for RNN kernels. It extracts common attributes and
-// shape validations.
-class CudnnRNNKernelCommon : public OpKernel {
- protected:
-  explicit CudnnRNNKernelCommon(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
-    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
-    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
-    string str;
-    OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
-    OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
-    OP_REQUIRES_OK(context, context->GetAttr("input_mode", &str));
-    OP_REQUIRES_OK(context,
-                   ParseTFRNNInputMode(str, &model_types_.rnn_input_mode));
-    OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
-    OP_REQUIRES_OK(
-        context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
-    // Reset CudnnRnnDescriptor and related random number generate states in
-    // every Compute() call.
-    OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
-                                               false, &reset_rnd_gen_state_));
-  }
-
-  bool HasInputC() const { return model_types_.HasInputC(); }
-  RnnMode rnn_mode() const { return model_types_.rnn_mode; }
-  TFRNNInputMode rnn_input_mode() const { return model_types_.rnn_input_mode; }
-  RnnDirectionMode rnn_direction_mode() const {
-    return model_types_.rnn_direction_mode;
-  }
-  CudnnModelTypes model_types() const { return model_types_; }
-  float dropout() const { return dropout_; }
-  uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
-  bool ResetRndGenState() { return reset_rnd_gen_state_; }
-
-  template <typename T>
-  Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
-                                   std::unique_ptr<RnnDescriptor>* rnn_desc) {
-    const Tensor* num_layers_t = nullptr;
-    TF_RETURN_IF_ERROR(context->input("num_layers", &num_layers_t));
-    if (!TensorShapeUtils::IsScalar(num_layers_t->shape())) {
-      return errors::InvalidArgument("num_layers is not a scalar");
-    }
-    int num_layers = num_layers_t->scalar<int>()();
-    const Tensor* num_units_t = nullptr;
-    TF_RETURN_IF_ERROR(context->input("num_units", &num_units_t));
-    if (!TensorShapeUtils::IsScalar(num_units_t->shape())) {
-      return errors::InvalidArgument("num_units is not a scalar");
-    }
-    int num_units = num_units_t->scalar<int>()();
-    const Tensor* input_size_t = nullptr;
-    TF_RETURN_IF_ERROR(context->input("input_size", &input_size_t));
-    if (!TensorShapeUtils::IsScalar(input_size_t->shape())) {
-      return errors::InvalidArgument("input_size is not a scalar");
-    }
-    int input_size = input_size_t->scalar<int>()();
-
-    RnnInputMode input_mode;
-    TF_RETURN_IF_ERROR(
-        ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
-
-    auto* stream = context->op_device_context()->stream();
-    // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
-    // random number generator, therefore set state_allocator to nullptr.
-    auto rnn_desc_s = stream->parent()->createRnnDescriptor(
-        num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
-        nullptr /* state_allocator */);
-    if (!rnn_desc_s.ok()) {
-      return FromExecutorStatus(rnn_desc_s);
-    }
-    *rnn_desc = rnn_desc_s.ConsumeValueOrDie();
-    return Status::OK();
-  }
-
- private:
-  int seed_;
-  int seed2_;
-  float dropout_;
-  bool reset_rnd_gen_state_;
-
-  CudnnModelTypes model_types_;
-};
-
-// A class that returns the size of the opaque parameter buffer. The user should
-// use that to create the actual parameter buffer for training. However, it
-// should not be used for saving and restoring.
-template <typename T, typename Index>
-class CudnnRNNParamsSizeOp<GPUDevice, T, Index> : public CudnnRNNKernelCommon {
- public:
-  typedef GPUDevice Device;
-  explicit CudnnRNNParamsSizeOp(OpKernelConstruction* context)
-      : CudnnRNNKernelCommon(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    std::unique_ptr<RnnDescriptor> rnn_desc;
-    OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
-    int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes();
-    CHECK(params_size_in_bytes % sizeof(T) == 0)
-        << "params_size_in_bytes must be multiple of element size";
-    int64 params_size = params_size_in_bytes / sizeof(T);
-
-    Tensor* output_t = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, {1}, &output_t));
-    *output_t->template flat<Index>().data() = params_size;
-  }
-};
-
-#define REGISTER_GPU(T)                                    \
-  REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsSize")       \
-                              .Device(DEVICE_GPU)          \
-                              .HostMemory("num_layers")    \
-                              .HostMemory("num_units")     \
-                              .HostMemory("input_size")    \
-                              .HostMemory("params_size")   \
-                              .TypeConstraint<T>("T")      \
-                              .TypeConstraint<int32>("S"), \
-                          CudnnRNNParamsSizeOp<GPUDevice, T, int32>);
-
-TF_CALL_half(REGISTER_GPU);
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-#undef REGISTER_GPU
-
-// Convert weight and bias params from a platform-specific layout to the
-// canonical form.
-template <typename T>
-class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
- public:
-  typedef GPUDevice Device;
-  explicit CudnnRNNParamsToCanonical(OpKernelConstruction* context)
-      : CudnnRNNKernelCommon(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("num_params", &num_params_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(3);
-    auto input_ptr = StreamExecutorUtil::AsDeviceMemory<T>(input);
-    auto* stream = context->op_device_context()->stream();
-
-    std::unique_ptr<RnnDescriptor> rnn_desc;
-    OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
-    int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes();
-    CHECK(params_size_in_bytes % sizeof(T) == 0)
-        << "params_size_in_bytes must be multiple of element size";
-
-    const Tensor* num_units_t = nullptr;
-    OP_REQUIRES_OK(context, context->input("num_units", &num_units_t));
-    CHECK(TensorShapeUtils::IsScalar(num_units_t->shape()))
-        << "num_units is not a scalar";
-    int num_units = num_units_t->scalar<int>()();
-
-    const Tensor* input_size_t = nullptr;
-    OP_REQUIRES_OK(context, context->input("input_size", &input_size_t));
-    CHECK(TensorShapeUtils::IsScalar(input_size_t->shape()))
-        << "input_size is not a scalar";
-    int input_size = input_size_t->scalar<int>()();
-
-    const Tensor* num_layers_t = nullptr;
-    OP_REQUIRES_OK(context, context->input("num_layers", &num_layers_t));
-    CHECK(TensorShapeUtils::IsScalar(num_layers_t->shape()))
-        << "num_layers is not a scalar";
-    int num_layers = num_layers_t->scalar<int>()();
-    int num_dirs = 1;
-    if (rnn_direction_mode() == RnnDirectionMode::kRnnBidirectional) {
-      num_dirs = 2;
-    }
-    const int num_params_per_layer = num_params_ / num_layers / num_dirs;
-    // Number of params applied on inputs. The rest are applied on recurrent
-    // hidden states.
-    const int num_params_input_state = num_params_per_layer / 2;
-    CHECK(num_params_ % (num_layers * num_dirs) == 0)
-        << "Number of params is not a multiple of num_layers * num_dirs.";
-    CHECK(num_params_per_layer % 2 == 0)
-        << "Number of params per layer is not a even number.";
-
-    CHECK(num_params_ == rnn_desc->ParamsWeightRegions().size())
-        << "Number of params mismatch. Expected " << num_params_ << ", got "
-        << rnn_desc->ParamsWeightRegions().size();
-    for (int i = 0; i < rnn_desc->ParamsWeightRegions().size(); i++) {
-      int64 size_in_bytes = rnn_desc->ParamsWeightRegions()[i].size;
-      int64 size = size_in_bytes / sizeof(T);
-      const int layer_idx = i / num_params_per_layer;
-      const int index_within_layer = i % num_params_per_layer;
-      int width = 0, height = num_units;
-      // In CuDNN layout, each layer has num_params_per_layer params, with the
-      // first half a.k.a num_params_input_state params applied on the inputs,
-      // and the second half on the recurrent hidden states.
-      bool apply_on_input_state = index_within_layer < num_params_input_state;
-      if (rnn_direction_mode() == RnnDirectionMode::kRnnUnidirectional) {
-        if (layer_idx == 0 && apply_on_input_state) {
-          width = input_size;
-        } else {
-          width = num_units;
-        }
-      } else {
-        if (apply_on_input_state) {
-          if (layer_idx <= 1) {
-            // First fwd or bak layer.
-            width = input_size;
-          } else {
-            // Following layers, cell inputs are concatenated outputs of
-            // its prior layer.
-            width = 2 * num_units;
-          }
-        } else {
-          width = num_units;
-        }
-      }
-      CHECK(size == width * height) << "Params size mismatch. Expected "
-                                    << width * height << ", got " << size;
-      Tensor* output = nullptr;
-      OP_REQUIRES_OK(context, context->allocate_output(
-                                  i, TensorShape({height, width}), &output));
-      DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
-          input_ptr, rnn_desc->ParamsWeightRegions()[i].offset, size_in_bytes);
-      auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
-      stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
-    }
-
-    OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
-                errors::InvalidArgument("Number of params mismatch. Expected ",
-                                        num_params_, ", got ",
-                                        rnn_desc->ParamsBiasRegions().size()));
-    for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
-      int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
-      int64 size = size_in_bytes / sizeof(T);
-      OP_REQUIRES(context, size == num_units,
-                  errors::InvalidArgument("Params size mismatch. Expected ",
-                                          num_units, ", got ", size));
-
-      Tensor* output = nullptr;
-      OP_REQUIRES_OK(context,
-                     context->allocate_output(num_params_ + i,
-                                              TensorShape({size}), &output));
-      DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
-          input_ptr, rnn_desc->ParamsBiasRegions()[i].offset, size_in_bytes);
-      auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
-      stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
-    }
-  }
-
- private:
-  int num_params_;
-};
-
-#define REGISTER_GPU(T)                                     \
-  REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsToCanonical") \
-                              .Device(DEVICE_GPU)           \
-                              .HostMemory("num_layers")     \
-                              .HostMemory("num_units")      \
-                              .HostMemory("input_size")     \
-                              .TypeConstraint<T>("T"),      \
-                          CudnnRNNParamsToCanonical<GPUDevice, T>);
-TF_CALL_half(REGISTER_GPU);
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-#undef REGISTER_GPU
-
-// Convert weight and bias params from the canonical form to a
-// platform-specific layout.
-template <typename T>
-class CudnnRNNCanonicalToParams<GPUDevice, T> : public CudnnRNNKernelCommon {
- public:
-  typedef GPUDevice Device;
-  explicit CudnnRNNCanonicalToParams(OpKernelConstruction* context)
-      : CudnnRNNKernelCommon(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    std::unique_ptr<RnnDescriptor> rnn_desc;
-    OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
-    int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes();
-    CHECK(params_size_in_bytes % sizeof(T) == 0)
-        << "params_size_in_bytes must be multiple of element size";
-    Tensor* output = nullptr;
-    int params_size = params_size_in_bytes / sizeof(T);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, {params_size}, &output));
-    auto output_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
-    auto* stream = context->op_device_context()->stream();
-
-    OpInputList weights;
-    OP_REQUIRES_OK(context, context->input_list("weights", &weights));
-    RestoreParams<T>(weights, rnn_desc->ParamsWeightRegions(), &output_ptr,
-                     stream);
-
-    OpInputList biases;
-    OP_REQUIRES_OK(context, context->input_list("biases", &biases));
-    RestoreParams<T>(biases, rnn_desc->ParamsBiasRegions(), &output_ptr,
-                     stream);
-  }
-};
-
-#define REGISTER_GPU(T)                                     \
-  REGISTER_KERNEL_BUILDER(Name("CudnnRNNCanonicalToParams") \
-                              .Device(DEVICE_GPU)           \
-                              .HostMemory("num_layers")     \
-                              .HostMemory("num_units")      \
-                              .HostMemory("input_size")     \
-                              .TypeConstraint<T>("T"),      \
-                          CudnnRNNCanonicalToParams<GPUDevice, T>);
-TF_CALL_half(REGISTER_GPU);
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-#undef REGISTER_GPU
-
-// Pointers to RNN scratch space for a specific set of shape parameters (used as
-// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp).
-struct RnnScratchSpace {
-  std::unique_ptr<RnnDescriptor> rnn_desc;
-  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator;
-};
-
-// Run the forward operation of the RNN model.
-template <typename T>
-class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
- public:
-  typedef GPUDevice Device;
-  explicit CudnnRNNForwardOp(OpKernelConstruction* context)
-      : CudnnRNNKernelCommon(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor* input = nullptr;
-    const Tensor* input_h = nullptr;
-    const Tensor* input_c = nullptr;
-    const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
-    OP_REQUIRES_OK(context,
-                   ExtractForwardInput(context, model_types(), &input, &input_h,
-                                       &input_c, &params, &model_shapes));
-    const auto& input_shape = model_shapes.input_shape;
-    const auto& hidden_state_shape = model_shapes.hidden_state_shape;
-    const auto& output_shape = model_shapes.output_shape;
-
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-    Tensor* output_h = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, hidden_state_shape, &output_h));
-    Tensor* output_c = nullptr;
-    if (HasInputC()) {
-      // Only LSTM uses input_c and output_c. So for all other models, we only
-      // need to create dummy outputs.
-      OP_REQUIRES_OK(
-          context, context->allocate_output(2, hidden_state_shape, &output_c));
-    } else {
-      OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_c));
-    }
-
-    auto* stream = context->op_device_context()->stream();
-    auto* executor = stream->parent();
-    RnnInputMode input_mode;
-    OP_REQUIRES_OK(context,
-                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
-                                  model_shapes.input_size, &input_mode));
-    auto data_type = ToDataType<T>::value;
-
-    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
-        input_shape.dim_size(0), input_shape.dim_size(1),
-        input_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(input_desc_s));
-    auto input_desc = input_desc_s.ConsumeValueOrDie();
-
-    auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
-        hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
-        hidden_state_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(hidden_state_desc_s));
-    auto hidden_state_desc = hidden_state_desc_s.ConsumeValueOrDie();
-
-    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
-        output_shape.dim_size(0), output_shape.dim_size(1),
-        output_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(output_desc_s));
-    auto output_desc = output_desc_s.ConsumeValueOrDie();
-
-    auto input_data = AsDeviceMemory<T>(input);
-    auto input_h_data = AsDeviceMemory<T>(input_h);
-    DeviceMemory<T> input_c_data;
-    if (HasInputC()) {
-      input_c_data = AsDeviceMemory<T>(input_c);
-    }
-    auto params_data = AsDeviceMemory<T>(params);
-    auto output_data = AsDeviceMemory<T>(output);
-    auto output_h_data = AsDeviceMemory<T>(output_h);
-    DeviceMemory<T> output_c_data;
-    if (HasInputC()) {
-      output_c_data = AsDeviceMemory<T>(output_c);
-    }
-
-    // Creates a memory callback for the reserve_space. The memory lives in the
-    // output of this kernel. And it will be fed into the backward pass when
-    // needed.
-    CudnnRNNReserveSpaceAllocator<T> reserve_space_allocator(context, 3);
-    if (!is_training_) {
-      Tensor* dummy_reserve_space = nullptr;
-      OP_REQUIRES_OK(context,
-                     context->allocate_output(3, {}, &dummy_reserve_space));
-    }
-    // Creates a memory callback for the workspace. The memory lives to the end
-    // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status = false;
-    {
-      mutex_lock l(mu_);
-      RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes];
-      if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
-        CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
-            new CudnnRNNPersistentSpaceAllocator(context);
-        rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
-        auto rnn_desc_s = executor->createRnnDescriptor(
-            model_shapes.num_layers, model_shapes.num_units,
-            model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
-        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-        rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
-      }
-      launch_status =
-          stream
-              ->ThenRnnForward(*rnn_state.rnn_desc, *input_desc, input_data,
-                               *hidden_state_desc, input_h_data,
-                               *hidden_state_desc, input_c_data, params_data,
-                               *output_desc, &output_data, *hidden_state_desc,
-                               &output_h_data, *hidden_state_desc,
-                               &output_c_data, is_training_,
-                               &reserve_space_allocator, &workspace_allocator)
-              .ok();
-    }
-    OP_REQUIRES(context, launch_status,
-                errors::Internal("Failed to call ThenRnnForward"));
-  }
-
- private:
-  mutex mu_;
-  bool is_training_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
-      rnn_state_cache_ GUARDED_BY(mu_);
-};
-
-#define REGISTER_GPU(T)                                           \
-  REGISTER_KERNEL_BUILDER(                                        \
-      Name("CudnnRNN").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-      CudnnRNNForwardOp<GPUDevice, T>);
-
-TF_CALL_half(REGISTER_GPU);
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-#undef REGISTER_GPU
-
-// Run the backward operation of the RNN model.
-template <typename T>
-class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
- public:
-  typedef GPUDevice Device;
-
-  explicit CudnnRNNBackwardOp(OpKernelConstruction* context)
-      : CudnnRNNKernelCommon(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor* input = nullptr;
-    const Tensor* input_h = nullptr;
-    const Tensor* input_c = nullptr;
-    const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
-    OP_REQUIRES_OK(context,
-                   ExtractForwardInput(context, model_types(), &input, &input_h,
-                                       &input_c, &params, &model_shapes));
-
-    const auto& input_shape = model_shapes.input_shape;
-    const auto& hidden_state_shape = model_shapes.hidden_state_shape;
-    const auto& output_shape = model_shapes.output_shape;
-
-    auto data_type = ToDataType<T>::value;
-    const Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->input("output", &output));
-    OP_REQUIRES(context, output_shape == output->shape(),
-                errors::InvalidArgument(
-                    "input_h and input_c must have the same shape: ",
-                    input_h->shape().DebugString(), " ",
-                    input_c->shape().DebugString()));
-    const Tensor* output_h = nullptr;
-    OP_REQUIRES_OK(context, context->input("output_h", &output_h));
-    OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
-                errors::InvalidArgument(
-                    "Invalid output_h shape: ", output_h->shape().DebugString(),
-                    " ", hidden_state_shape.DebugString()));
-    const Tensor* output_c = nullptr;
-    if (HasInputC()) {
-      // Only LSTM uses input_c and output_c. So for all other models, we only
-      // need to create dummy outputs.
-      OP_REQUIRES_OK(context, context->input("output_c", &output_c));
-      OP_REQUIRES(context, output_c->shape() == hidden_state_shape,
-                  errors::InvalidArgument("Invalid output_c shape: ",
-                                          output_c->shape().DebugString(), " ",
-                                          hidden_state_shape.DebugString()));
-    }
-
-    const Tensor* output_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->input("output_backprop", &output_backprop));
-    OP_REQUIRES(context, output_backprop->shape() == output_shape,
-                errors::InvalidArgument("Invalid output_backprop shapes: ",
-                                        output_backprop->shape().DebugString(),
-                                        " ", output_shape.DebugString()));
-
-    const Tensor* output_h_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->input("output_h_backprop", &output_h_backprop));
-    OP_REQUIRES(
-        context, output_h_backprop->shape() == hidden_state_shape,
-        errors::InvalidArgument("Invalid output_h_backprop shapes: ",
-                                output_h_backprop->shape().DebugString(), " ",
-                                hidden_state_shape.DebugString()));
-    const Tensor* output_c_backprop = nullptr;
-    if (HasInputC()) {
-      OP_REQUIRES_OK(context,
-                     context->input("output_c_backprop", &output_c_backprop));
-      OP_REQUIRES(
-          context, output_c_backprop->shape() == hidden_state_shape,
-          errors::InvalidArgument("Invalid output_c_backprop shapes: ",
-                                  output_c_backprop->shape().DebugString(), " ",
-                                  hidden_state_shape.DebugString()));
-    }
-    const Tensor* reserve_space_const = nullptr;
-    // This is the same "reserve_space" created by the forward op.
-    // It can also be modified by this backward operation.
-    OP_REQUIRES_OK(context,
-                   context->input("reserve_space", &reserve_space_const));
-    // Cudnn needs the reserve space to be writeable. This is fine because they
-    // are opaque.
-    Tensor* reserve_space = const_cast<Tensor*>(reserve_space_const);
-
-    Tensor* input_backprop = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output(0, input->shape(), &input_backprop));
-    Tensor* input_h_backprop = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(1, input_h->shape(),
-                                                     &input_h_backprop));
-    Tensor* input_c_backprop = nullptr;
-    if (HasInputC()) {
-      OP_REQUIRES_OK(context, context->allocate_output(2, input_c->shape(),
-                                                       &input_c_backprop));
-    } else {
-      OP_REQUIRES_OK(context,
-                     context->allocate_output(2, {}, &input_c_backprop));
-    }
-    Tensor* params_backprop = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(3, params->shape(),
-                                                     &params_backprop));
-
-    auto* stream = context->op_device_context()->stream();
-    auto* executor = stream->parent();
-    RnnInputMode input_mode;
-    OP_REQUIRES_OK(context,
-                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
-                                  model_shapes.input_size, &input_mode));
-
-    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
-        input_shape.dim_size(0), input_shape.dim_size(1),
-        input_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(input_desc_s));
-    auto input_desc = input_desc_s.ConsumeValueOrDie();
-
-    auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
-        hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
-        hidden_state_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(hidden_state_desc_s));
-    auto hidden_state_desc = hidden_state_desc_s.ConsumeValueOrDie();
-
-    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
-        output_shape.dim_size(0), output_shape.dim_size(1),
-        output_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(output_desc_s));
-    auto output_desc = output_desc_s.ConsumeValueOrDie();
-
-    auto input_data = AsDeviceMemory<T>(input);
-    auto input_h_data = AsDeviceMemory<T>(input_h);
-    DeviceMemory<T> input_c_data;
-    if (HasInputC()) {
-      input_c_data = AsDeviceMemory<T>(input_c);
-    }
-    auto params_data = AsDeviceMemory<T>(params);
-    auto output_data = AsDeviceMemory<T>(output);
-    auto output_h_data = AsDeviceMemory<T>(output_h);
-    DeviceMemory<T> output_c_data;
-    if (HasInputC()) {
-      output_c_data = AsDeviceMemory<T>(output_c);
-    }
-    auto output_backprop_data = AsDeviceMemory<T>(output_backprop);
-    auto output_h_backprop_data = AsDeviceMemory<T>(output_h_backprop);
-    DeviceMemory<T> output_c_backprop_data;
-    if (HasInputC()) {
-      output_c_backprop_data = AsDeviceMemory<T>(output_c_backprop);
-    }
-    auto input_backprop_data = AsDeviceMemory<T>(input_backprop);
-    auto input_h_backprop_data = AsDeviceMemory<T>(input_h_backprop);
-    DeviceMemory<T> input_c_backprop_data;
-    if (HasInputC()) {
-      input_c_backprop_data = AsDeviceMemory<T>(input_c_backprop);
-    }
-    auto params_backprop_data = AsDeviceMemory<T>(params_backprop);
-    auto reserve_space_uint8 = CastDeviceMemory<uint8, T>(reserve_space);
-    // Creates a memory callback for the workspace. The memory lives to the end
-    // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status = false;
-    {
-      mutex_lock l(mu_);
-      RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes];
-      if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
-        CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
-            new CudnnRNNPersistentSpaceAllocator(context);
-        rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
-        auto rnn_desc_s = executor->createRnnDescriptor(
-            model_shapes.num_layers, model_shapes.num_units,
-            model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
-        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-        rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
-      }
-      launch_status =
-          stream
-              ->ThenRnnBackward(*rnn_state.rnn_desc, *input_desc, input_data,
-                                *hidden_state_desc, input_h_data,
-                                *hidden_state_desc, input_c_data, params_data,
-                                *output_desc, output_data, *hidden_state_desc,
-                                output_h_data, *hidden_state_desc,
-                                output_c_data, output_backprop_data,
-                                output_h_backprop_data, output_c_backprop_data,
-                                &input_backprop_data, &input_h_backprop_data,
-                                &input_c_backprop_data, &params_backprop_data,
-                                &reserve_space_uint8, &workspace_allocator)
-              .ok();
-    }
-    OP_REQUIRES(context, launch_status,
-                errors::Internal("Failed to call ThenRnnBackward"));
-  }
-
- private:
-  mutex mu_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
-      rnn_state_cache_ GUARDED_BY(mu_);
-};
-
-#define REGISTER_GPU(T)                                                   \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("CudnnRNNBackprop").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-      CudnnRNNBackwardOp<GPUDevice, T>);
-
-TF_CALL_half(REGISTER_GPU);
-TF_CALL_float(REGISTER_GPU);
-TF_CALL_double(REGISTER_GPU);
-#undef REGISTER_GPU
-
-// TODO(zhengxq): Add the conversion of Cudnn RNN Params from and to
-// its canonical form.
-
-#endif  // GOOGLE_CUDA
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 9897c31a98e0b335c18a84825fc518ed1fc310a2..33ddfb8dee1c446f22c7d0071f9a0e2bbac6bdad 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import argparse
 import collections
+import functools
 import itertools
 import os
 import sys
@@ -34,7 +35,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework.test_util import TensorFlowTestCase
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -53,6 +54,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
@@ -265,7 +267,7 @@ def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
     return outputs, (output_state_fw, output_state_bw)
 
 
-class CudnnRNNTestBasic(TensorFlowTestCase):
+class CudnnRNNTestBasic(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -467,7 +469,7 @@ class CudnnRNNTestBasic(TensorFlowTestCase):
 
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
+class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
 
   def _CompareWeights(self, lhs, rhs):
     self.assertEqual(len(lhs), len(rhs))
@@ -701,9 +703,146 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
+class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+
+  def _VerifyCheckpoint(
+      self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
+      num_layers, input_size, expected_variable_values, num_applications=3):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with ops.device("gpu:0"):
+      cudnn_layer = cudnn_cell_fn()
+      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      status = cudnn_checkpoint.restore(checkpoint_path)
+      inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
+                                   dtype=dtypes.float32)
+      cudnn_output, _ = cudnn_layer(inputs)
+      status.run_restore_ops()
+    second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
+    restore_layer = compatible_cell_fn()
+    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+        cell=restore_layer)
+    status = restore_layer_checkpoint.restore(second_save_path)
+    current_state = restore_layer.zero_state(1, dtypes.float32)
+    for _ in range(num_applications):
+      restore_layer_output, current_state = restore_layer(
+          inputs=3. * array_ops.ones([1, input_size]),
+          state=current_state)
+    status.run_restore_ops()
+    self.assertTrue(restore_layer.variables)
+    for variable, expected_value in zip(
+        restore_layer.variables, expected_variable_values):
+      self.assertAllClose(expected_value, self.evaluate(variable))
+    self.assertAllClose(self.evaluate(restore_layer_output),
+                        self.evaluate(cudnn_output)[-1, -1:, ...])
+
+  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn):
+    # Single-layer cuDNN cells with object-based checkpointing should be
+    # checkpoint compatible with either single CudnnCompatible cells or
+    # MultiRnnCells with one cell.
+    input_size = 3
+    save_cell_layer = single_cell_fn()
+    save_cell_layer(
+        inputs=array_ops.ones([1, input_size]),
+        state=save_cell_layer.zero_state(1, dtypes.float32))
+    self.assertTrue(save_cell_layer.variables)
+    expected_values = []
+    np.random.seed(10)
+    for variable in save_cell_layer.variables:
+      value = np.random.normal(size=variable.shape)
+      expected_values.append(value)
+      self.evaluate(variable.assign(value))
+    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=
+        lambda: rnn_cell_impl.MultiRNNCell([single_cell_fn()]),
+        cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=1,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testLSTMCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=1, num_units=num_units,
+            direction=direction, name="awesome_lstm"))
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testGRUCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    with self.assertRaises(NotImplementedError):
+      # TODO(allenl): Implement object-based saving for GRUs and other cells.
+      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+          single_cell_fn=functools.partial(
+              cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
+          cudnn_cell_fn=functools.partial(
+              cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
+              direction=direction, name="awesome_gru"))
+
+  def _CheckpointableMultiLayerTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn, num_layers):
+
+    def _MultiCellFn():
+      return rnn_cell_impl.MultiRNNCell(
+          [single_cell_fn() for _ in range(num_layers)])
+    input_size = 3
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      save_layer = _MultiCellFn()
+      save_layer(inputs=array_ops.ones([1, input_size]),
+                 state=save_layer.zero_state(1, dtypes.float32))
+      self.assertTrue(save_layer.variables)
+      expected_values = []
+      np.random.seed(10)
+      for variable in save_layer.variables:
+        value = np.random.normal(size=variable.shape)
+        expected_values.append(value)
+        self.evaluate(variable.assign(value))
+      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=_MultiCellFn, cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=num_layers,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testCudnnCompatibleLSTMCheckpointablMultiLayer(self):
+    num_units = 2
+    num_layers = 3
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableMultiLayerTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=num_layers, num_units=num_units,
+            direction=direction, name="awesome_lstm"),
+        num_layers=num_layers)
+
+
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
+class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -884,7 +1023,7 @@ class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
                               rtol=2e-5)
 
 
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
+class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
   def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size,
                             dtype, direction):
@@ -931,7 +1070,18 @@ class CudnnRNNTestParamsSize(TensorFlowTestCase):
                                    dtype, direction)
 
 
-class CudnnRNNTestTraining(TensorFlowTestCase):
+class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    super(CudnnRNNTestTraining, self).setUp()
+    self._reset_rnd_gen_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE",
+                                               str(False))
+    self._rnn_use_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0")
+
+  def tearDown(self):
+    super(CudnnRNNTestTraining, self).tearDown()
+    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = self._reset_rnd_gen_state
+    os.environ["TF_CUDNN_RNN_USE_V2"] = self._rnn_use_v2
 
   def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1):
     """Compute the numeric gradient of y wrt to x.
@@ -1045,11 +1195,10 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                              batch_size, seq_length, dir_count, dropout, dtype,
-                             delta, tolerance):
+                             use_v2, delta, tolerance):
     # Gradient checking runs two forward ops with almost the same input. Need to
     # make sure the drop patterns across the two runs are the same.
     logging.info("Training test with config: %s", locals())
-    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
     os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
 
     np.random.seed(1234)
@@ -1057,6 +1206,10 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
     has_input_c = (rnn_mode == CUDNN_LSTM)
     direction = (CUDNN_RNN_UNIDIRECTION
                  if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
+    if use_v2:
+      os.environ["TF_CUDNN_RNN_USE_V2"] = "1"
+    else:
+      os.environ["TF_CUDNN_RNN_USE_V2"] = "0"
     model = CudnnTestModel(
         rnn_mode,
         num_layers,
@@ -1106,22 +1259,22 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
           self._GradientCheck(
               sess, total_sum, all_inputs,
               tolerance=tolerance, delta=delta)
-      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
 
   def _TestSimpleTrainingHelper(self, rnn_mode, test_configs):
     dropouts = [0, 0.5, 1.]
-    for config, dropout in itertools.product(test_configs, dropouts):
+    v2_options = [str(False), str(True)]
+    for config, dropout, use_v2 in itertools.product(test_configs, dropouts,
+                                                     v2_options):
       dtype = config.get("dtype", dtypes.float32)
       delta = config.get("delta", 1e-4)
       tolerance = config.get("tolerance", 1e-6)
       dir_count = config.get("dir_count", 1)
       shape = config["shape"]
       with ops.Graph().as_default():
-        self._TestOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    dir_count, dropout, dtype, delta,
-                                    tolerance)
+        self._TestOneSimpleTraining(
+            rnn_mode, shape["num_layers"], shape["num_units"],
+            shape["input_size"], shape["batch_size"], shape["seq_length"],
+            dir_count, dropout, dtype, use_v2, delta, tolerance)
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 36fba917a8f56c26fd5b4c3468d1d980a8ba2ba5..d58198faf353aab68430d2fa153a18de359112de 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -142,6 +142,9 @@ class _CudnnRNN(base_layer.Layer):
   """
   # pylint:enable=line-too-long
 
+  # TODO(allenl): Document object-based saving and checkpoint compatibility once
+  # it's implemented for more cuDNN Layers.
+
   # The following are constants defined by subclasses.
   # Type of RNN cell.
   _rnn_mode = None
@@ -355,7 +358,8 @@ class _CudnnRNN(base_layer.Layer):
             "CUDA/CuDNN generations.")
       # Initialize opaque params with a tensor.
       self.kernel = vs.get_variable(
-          "opaque_kernel", initializer=opaque_params_t, validate_shape=False)
+          "opaque_kernel", dtype=self._plain_dtype,
+          initializer=opaque_params_t, validate_shape=False)
     # Create saveable in the outer scope of the cudnn subgraph, such that
     # alternative subgraph with platform-independent rnn cells can load the
     # checkpoints directly.
@@ -363,6 +367,11 @@ class _CudnnRNN(base_layer.Layer):
       self._create_saveable()
     self.built = True
 
+  def _gather_saveables_for_checkpoint(self):
+    raise NotImplementedError(
+        "This cell does not yet support object-based saving. File a feature "
+        "request if this limitation bothers you.")
+
   def call(self, inputs, initial_state=None, training=True):
     """Runs the forward step for the RNN model.
 
@@ -499,6 +508,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
+    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
+        checkpointable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
@@ -521,6 +532,16 @@ class CudnnLSTM(_CudnnRNN):
     return ([self.num_layers * self.num_dirs, batch_size, self.num_units],
             [self.num_layers * self.num_dirs, batch_size, self.num_units])
 
+  @property
+  def _gather_saveables_for_checkpoint(self):
+    if self._direction == CUDNN_RNN_UNIDIRECTION:
+      # Skip one inheritance level to avoid NotImplementedError.
+      return super(_CudnnRNN, self)._gather_saveables_for_checkpoint
+    else:
+      raise NotImplementedError(
+          "Object-based saving does not currently support bidirectional LSTM "
+          "cells. File a feature request if this limitation bothers you.")
+
 
 class _CudnnRNNNoInputC(_CudnnRNN):
   """Abstract simple CudnnRNN layer without input_c."""
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index e87162f0ee9cc4eed795555171f55a93639e83cf..73a961992e19fabec5d0f75be1b52dbba20eb7af 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,27 +17,25 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cudnn_rnn.ops import gen_cudnn_rnn_ops
+import os
+from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.contrib.rnn.python.ops import lstm_ops
-from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import resource_loader
+from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import saver
 
-_cudnn_rnn_ops_so = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
-
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
 CUDNN_LSTM = "lstm"
@@ -91,19 +89,23 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
 
   Cudnn compatible GRU (from Cudnn library user guide):
   ```python
-  r_t = sigma(x_t * W_r + h_t-1 * R_h + b_Wr + b_Rr)  # reset gate
-  u_t = sigma(x_t * W_u + h_t-1 * R_u + b_Wu + b_Ru)  # update gate
-  h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_Rh) + b_Wh)  # new memory gate
-  h_t = (1 - u_t) .* h'_t + u_t .* h_t-1
+  # reset gate
+  $$r_t = \sigma(x_t * W_r + h_t-1 * R_h + b_{Wr} + b_{Rr})$$
+  # update gate
+  $$u_t = \sigma(x_t * W_u + h_t-1 * R_u + b_{Wu} + b_{Ru})$$
+  # new memory gate
+  $$h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_{Rh}) + b_{Wh})$$
+  $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$
   ```
 
   Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
   ```python
-  h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_Wh)  # new memory gate
+  # new memory gate
+  \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\)
   ```
   which is not equivalent to Cudnn GRU: in addition to the extra bias term b_Rh,
   ```python
-  r .* (h * R) != (r .* h) * R
+  \\(r .* (h * R) != (r .* h) * R\\)
   ```
   """
 
@@ -267,13 +269,16 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     # instead of having the master pull all slices and then save them.
     slice_spec = ""
     params = weights + biases
-    param_names = weight_names + bias_names
+    self._weight_names = weight_names
+    self._bias_names = bias_names
+    self._param_names = weight_names + bias_names
+    prefixed_param_names = weight_names + bias_names
     if self._scope:
-      param_names = ["%s/%s" % (self._scope, pn) for pn in param_names]
-
+      prefixed_param_names = [
+          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names]
     specs = [
         saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
-        for param, param_name in zip(params, param_names)
+        for param, param_name in zip(params, prefixed_param_names)
     ]
     super(CudnnOpaqueParamsSaveable, self).__init__(
         array_ops.identity(self._variables), specs, name)
@@ -286,6 +291,45 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
+  def _checkpointable_save(self, save_buffer):
+    weights, biases = self._OpaqueParamsToCanonical()
+    with ops.device("gpu:0"):
+      (weights, _), (biases, _) = self._TransformCanonical(
+          weights, biases)
+    for name, tensor in zip(self._param_names, weights + biases):
+      save_buffer[name] = array_ops.identity(tensor)
+
+  def _checkpointable_restore(self, restore_buffer):
+    tensors = [array_ops.identity(restore_buffer[name])
+               for name in self._param_names]
+    return self.restore(
+        restored_tensors=tensors,
+        restored_shapes=None  # Unused
+    )
+
+  def _add_checkpointable_dependencies(self, checkpointable, dtype):
+    """Add canonical weight dependencies to `checkpointable`.
+
+    When saving or restoring, converts to or from the opaque buffer
+    format. Weights are saved and loaded in the configuration expected by
+    cuDNN-compatible cells.
+
+    Args:
+      checkpointable: An object inheriting from `CheckpointableBase` to add
+        dependencies too (typically the cuDNN `Layer`).
+      dtype: The dtype for the canonical parameter Tensors.
+    """
+    split_dependencies = split_dependency.split_dependency(
+        component_names=self._param_names,
+        component_dtypes=(dtype,) * len(self._param_names),
+        fill_save_buffer_fn=self._checkpointable_save,
+        consume_restore_buffer_fn=self._checkpointable_restore)
+    self._checkpointable_track_params(checkpointable, split_dependencies)
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Tracks parameters in a canonical configuration."""
+    return  # NotImplementedError raised by the Layer.
+
   def _TFCanonicalNamePrefix(self, layer, is_fwd=True):
     if self._direction == CUDNN_RNN_UNIDIRECTION:
       return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
@@ -481,10 +525,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_LSTM
   _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleLSTMCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
 
   def _cudnn_to_tf_gate_params(self, *cu_gate_order):
     i_g, f_g, c_g, o_g = cu_gate_order
@@ -575,6 +616,29 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_biases.append(b)
     tf_bias_names.append(prefix + "/bias")
 
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
+    biases = []
+    weights = []
+    for name in self._weight_names:
+      weights.append(params[name])
+    for name in self._bias_names:
+      biases.append(params[name])
+    assert len(params) == len(weights) + len(biases)
+    if len(weights) == 1 and len(biases) == 1:
+      # For single-layer cells, allow substituting a cell with no MultiRNNCell
+      # wrapping.
+      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
+      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
+      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
+      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+    assert len(biases) == len(weights)
+    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
+      cell = checkpointable_lib.Checkpointable()
+      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell.bias = bias
+      cell.kernel = kernel
+
 
 class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   """SaveableObject implementation handling Cudnn GRU opaque params."""
@@ -582,10 +646,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_GRU
   _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleGRUCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -664,11 +725,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
 class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
   """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(
-      rnn_cell_impl.BasicRNNCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -845,19 +902,27 @@ def _cudnn_rnn(inputs,
   check_direction(direction)
   check_input_mode(input_mode)
   seed, seed2 = random_seed.get_seed(seed)
-  outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      input=inputs,
-      input_h=input_h,
-      input_c=input_c,
-      params=params,
-      is_training=is_training,
-      rnn_mode=rnn_mode,
-      input_mode=input_mode,
-      direction=direction,
-      dropout=dropout,
-      seed=seed,
-      seed2=seed2,
-      name=name)
+  # TODO(jamesqin): switch default value to "1" on May 25th 2018, and get rid
+  # of V1 ops.
+  use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0")
+  args = {
+      "input": inputs,
+      "input_h": input_h,
+      "input_c": input_c,
+      "params": params,
+      "is_training": is_training,
+      "rnn_mode": rnn_mode,
+      "input_mode": input_mode,
+      "direction": direction,
+      "dropout": dropout,
+      "seed": seed,
+      "seed2": seed2,
+      "name": name
+  }
+  if use_cudnn_v2 is not "1":
+    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
+  else:
+    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args)
   return (outputs, output_h, output_c)
 
 
@@ -1584,31 +1649,6 @@ class CudnnRNNRelu(_CudnnRNNNoInputC):
   _NUM_PARAMS_PER_LAYER = CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
-@ops.RegisterGradient("CudnnRNN")
-def _cudnn_rnn_backward(op, *grad):
-  if not op.get_attr("is_training"):
-    raise ValueError(
-        "CudnnRNN must set is_training to True to be used in gradients")
-  return gen_cudnn_rnn_ops.cudnn_rnn_backprop(
-      input=op.inputs[0],
-      input_h=op.inputs[1],
-      input_c=op.inputs[2],
-      params=op.inputs[3],
-      output=op.outputs[0],
-      output_h=op.outputs[1],
-      output_c=op.outputs[2],
-      output_backprop=grad[0],
-      output_h_backprop=grad[1],
-      output_c_backprop=grad[2],
-      reserve_space=op.outputs[3],
-      dropout=op.get_attr("dropout"),
-      seed=op.get_attr("seed"),
-      seed2=op.get_attr("seed2"),
-      rnn_mode=op.get_attr("rnn_mode"),
-      input_mode=op.get_attr("input_mode"),
-      direction=op.get_attr("direction"))
-
-
 ops.RegisterShape("CudnnRNNParamsSize")(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape("CudnnRNNParamsToCanonical")(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape("CudnnRNNCanonicalToParams")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 0458199ff771bc45603106411550a39448e515b8..8bdbba83ef6a8541158d956e36caf6a9be435c5b 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -8,6 +8,11 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
     "tf_gen_op_libs",
+    "if_not_windows",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
 )
 
 py_library(
@@ -17,35 +22,25 @@ py_library(
     deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/contrib/data/python/ops:shuffle_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
+cc_library(
+    name = "lib_proto_parsing_for_dataset_ops",
+    deps = if_not_windows(["//tensorflow/core:lib_proto_parsing"]),
+)
+
 tf_custom_op_library(
     name = "_dataset_ops.so",
     srcs = ["ops/dataset_ops.cc"],
-    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"],
+    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"] +
+           if_static(
+               extra_deps = [":lib_proto_parsing_for_dataset_ops"],
+               otherwise = [],
+           ),
 )
 
 tf_gen_op_libs(
     op_lib_names = ["dataset_ops"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index fcdccdd26ca1824bf13f1fd0cfd80b20ca8a10c3..077cbba9d2ae41a83f6c358a63ae27aec5741e2c 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -23,20 +23,28 @@ removing existing functionality.
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
 @@Counter
+@@SqlDataset
 
+@@assert_element_shape
 @@batch_and_drop_remainder
+@@bucket_by_sequence_length
 @@dense_to_sparse_batch
 @@enumerate_dataset
 @@group_by_window
 @@ignore_errors
+@@make_batched_features_dataset
+@@make_csv_dataset
 @@make_saveable_from_iterator
 @@map_and_batch
 @@padded_batch_and_drop_remainder
 @@parallel_interleave
+@@prefetch_to_device
 @@read_batch_features
 @@rejection_resample
+@@sample_from_datasets
 @@scan
 @@shuffle_and_repeat
+@@sliding_window_batch
 @@sloppy_interleave
 @@unbatch
 
@@ -49,6 +57,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 
+from tensorflow.contrib.data.python.ops.batching import assert_element_shape
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import map_and_batch
@@ -58,16 +67,25 @@ from tensorflow.contrib.data.python.ops.counter import Counter
 from tensorflow.contrib.data.python.ops.enumerate_ops import enumerate_dataset
 from tensorflow.contrib.data.python.ops.error_ops import ignore_errors
 from tensorflow.contrib.data.python.ops.get_single_element import get_single_element
+from tensorflow.contrib.data.python.ops.grouping import bucket_by_sequence_length
 from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
+from tensorflow.contrib.data.python.ops.interleave_ops import sample_from_datasets
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
+from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
+from tensorflow.contrib.data.python.ops.readers import make_batched_features_dataset
+from tensorflow.contrib.data.python.ops.readers import make_csv_dataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
 from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
+from tensorflow.contrib.data.python.ops.sliding import sliding_window_batch
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 remove_undocumented(__name__)
+
+# A constant that can be used to enable auto-tuning.
+AUTOTUNE = -1
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index 56471911c5c0d1c1825955c67997b5bbc0786463..c56910c7833d4c54fa8db27cd061b404013f3f54 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -9,6 +9,18 @@ exports_files(["LICENSE"])
 cc_library(
     name = "prefetching_kernels",
     srcs = ["prefetching_kernels.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_headers_lib",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "directed_interleave_dataset_op",
+    srcs = ["directed_interleave_dataset_op.cc"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
@@ -28,24 +40,36 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "threadpool_dataset_op",
+    srcs = ["threadpool_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+cc_library(
+    name = "unique_dataset_op",
+    srcs = ["unique_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
 cc_library(
     name = "dataset_kernels",
     deps = [
+        ":directed_interleave_dataset_op",
         ":ignore_errors_dataset_op",
         ":prefetching_kernels",
+        ":threadpool_dataset_op",
+        ":unique_dataset_op",
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48d3734162525ffc6ace076e4f0523c1d0cae511
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
@@ -0,0 +1,274 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/hash/hash.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class DirectedInterleaveDatasetOp : public DatasetOpKernel {
+ public:
+  explicit DirectedInterleaveDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    DatasetBase* selector_input;
+    OP_REQUIRES_OK(ctx,
+                   GetDatasetFromVariantTensor(ctx->input(0), &selector_input));
+
+    OP_REQUIRES(
+        ctx,
+        selector_input->output_dtypes().size() == 1 &&
+            selector_input->output_dtypes()[0] == DT_INT64 &&
+            selector_input->output_shapes().size() == 1 &&
+            selector_input->output_shapes()[0].IsCompatibleWith(
+                PartialTensorShape({})),
+        errors::InvalidArgument(
+            "The selector input must be a dataset of scalar int64 elements."));
+
+    std::vector<DatasetBase*> data_inputs;
+    for (size_t i = 1; i < ctx->num_inputs(); ++i) {
+      DatasetBase* input;
+      OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(i), &input));
+      data_inputs.push_back(input);
+
+      OP_REQUIRES(
+          ctx, data_inputs[0]->output_dtypes() == input->output_dtypes(),
+          errors::InvalidArgument(
+              "All inputs must have the same output_dtypes. First input "
+              "has types ",
+              DataTypeVectorString(data_inputs[0]->output_dtypes()),
+              ", and input ", i - 1, " has types ",
+              DataTypeVectorString(input->output_dtypes())));
+    }
+    *output = new Dataset(ctx, selector_input, std::move(data_inputs));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* selector_input,
+            std::vector<DatasetBase*> data_inputs)
+        : GraphDatasetBase(ctx),
+          selector_input_(selector_input),
+          data_inputs_(std::move(data_inputs)) {
+      selector_input_->Ref();
+
+      output_shapes_ = data_inputs_[0]->output_shapes();
+      data_inputs_[0]->Ref();
+      for (size_t i = 1; i < data_inputs_.size(); ++i) {
+        const DatasetBase* data_input = data_inputs_[i];
+        data_input->Ref();
+        for (size_t j = 0; j < output_shapes_.size(); ++j) {
+          output_shapes_[j] = MostSpecificCompatibleShape(
+              output_shapes_[j], data_input->output_shapes()[j]);
+        }
+      }
+    }
+
+    ~Dataset() override {
+      selector_input_->Unref();
+      for (DatasetBase* data_input : data_inputs_) {
+        data_input->Unref();
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::DirectedInterleave")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return data_inputs_[0]->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("DirectedInterleaveDatasetOp::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* selector_input_node;
+      TF_RETURN_IF_ERROR(
+          b->AddParentDataset(ctx, selector_input_, &selector_input_node));
+      std::vector<Node*> data_input_nodes(data_inputs_.size());
+      for (size_t i = 0; i < data_inputs_.size(); ++i) {
+        TF_RETURN_IF_ERROR(
+            b->AddParentDataset(ctx, data_inputs_[i], &data_input_nodes[i]));
+      }
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, selector_input_node}},
+                                       {{1, data_input_nodes}}, {}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            selector_input_impl_(params.dataset->selector_input_->MakeIterator(
+                params.prefix + ".selector")),
+            num_active_inputs_(params.dataset->data_inputs_.size()) {
+        data_input_impls_.reserve(params.dataset->data_inputs_.size());
+        for (size_t i = 0; i < params.dataset->data_inputs_.size(); ++i) {
+          const DatasetBase* data_input = params.dataset->data_inputs_[i];
+          data_input_impls_.push_back(data_input->MakeIterator(
+              strings::StrCat(params.prefix, "[", i, "]")));
+        }
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!selector_input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        while (true) {
+          std::vector<Tensor> selector_result;
+          *end_of_sequence = false;
+          TF_RETURN_IF_ERROR(selector_input_impl_->GetNext(
+              ctx, &selector_result, end_of_sequence));
+          if (*end_of_sequence) {
+            selector_input_impl_.reset();
+            for (auto& data_input_impl : data_input_impls_) {
+              data_input_impl.reset();
+            }
+            return Status::OK();
+          }
+
+          int64 selected_input = selector_result[0].scalar<int64>()();
+          if (selected_input < 0 || selected_input > data_input_impls_.size()) {
+            return errors::InvalidArgument(
+                "Selector index out of range: ", selected_input,
+                " >= ", data_input_impls_.size());
+          }
+
+          if (data_input_impls_[selected_input]) {
+            bool end_of_selected_input = false;
+            TF_RETURN_IF_ERROR(data_input_impls_[selected_input]->GetNext(
+                ctx, out_tensors, &end_of_selected_input));
+
+            if (!end_of_selected_input) {
+              return Status::OK();
+            }
+
+            data_input_impls_[selected_input].reset();
+            --num_active_inputs_;
+
+            if (num_active_inputs_ == 0) {
+              selector_input_impl_.reset();
+              *end_of_sequence = true;
+              return Status::OK();
+            }
+          }
+
+          LOG(WARNING) << "DirectedInterleave selected an exhausted input: "
+                       << selected_input;
+        }
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (selector_input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, selector_input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("selector_input_impl_empty"), ""));
+        }
+        for (size_t i = 0; i < data_input_impls_.size(); ++i) {
+          const auto& data_input_impl = data_input_impls_[i];
+          if (data_input_impl) {
+            TF_RETURN_IF_ERROR(SaveParent(writer, data_input_impl));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("data_input_impl_empty[", i, "]")),
+                ""));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("selector_input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, selector_input_impl_));
+        } else {
+          selector_input_impl_.reset();
+        }
+        for (size_t i = 0; i < data_input_impls_.size(); ++i) {
+          if (!reader->Contains(full_name(
+                  strings::StrCat("data_input_impl_empty[", i, "]")))) {
+            TF_RETURN_IF_ERROR(
+                RestoreParent(ctx, reader, data_input_impls_[i]));
+          } else {
+            data_input_impls_[i].reset();
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> selector_input_impl_ GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<IteratorBase>> data_input_impls_
+          GUARDED_BY(mu_);
+      int64 num_active_inputs_ GUARDED_BY(mu_);
+    };
+
+    static PartialTensorShape MostSpecificCompatibleShape(
+        const PartialTensorShape& ts1, const PartialTensorShape& ts2) {
+      PartialTensorShape output_tensorshape;
+      if (ts1.dims() != ts2.dims() || ts1.unknown_rank() || ts2.unknown_rank())
+        return output_tensorshape;
+      auto dims1 = ts1.dim_sizes();
+      auto dims2 = ts2.dim_sizes();
+      for (int d = 0; d < ts1.dims(); d++) {
+        if (dims1[d] == dims2[d])
+          output_tensorshape.Concatenate(dims1[d]);
+        else
+          output_tensorshape.Concatenate(-1);
+      }
+      return output_tensorshape;
+    }
+
+    const DatasetBase* const selector_input_;
+    const std::vector<DatasetBase*> data_inputs_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("DirectedInterleaveDataset").Device(DEVICE_CPU),
+                        DirectedInterleaveDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index d3df14bdd03476e9ee4015b374512e5bb9893a63..a2bfce03620a1482f5b21cbf23c66833bc5cd480 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include <deque>
 
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
@@ -35,38 +36,25 @@ using FunctionBufferCallback = std::function<void(const BufferElement&)>;
 class FunctionBufferingResource : public ResourceBase {
  public:
   FunctionBufferingResource(FunctionLibraryRuntime* lib,
+                            std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                             const NameAttrList& func, int64 buffer_size,
                             const string& source_device,
                             const string& target_device,
-                            const std::vector<Tensor>& func_args,
-                            int64 thread_pool_size)
+                            const std::vector<Tensor>& func_args)
       : lib_(lib),
+        pflr_(std::move(pflr)),
         func_(func),
         buffer_size_(buffer_size),
         source_device_(source_device),
         target_device_(target_device),
         func_args_(func_args),
-        thread_pool_(new thread::ThreadPool(Env::Default(), ThreadOptions(),
-                                            "buffer_resource", thread_pool_size,
-                                            false /* low_latency_hint */)),
         handle_(kInvalidHandle),
         is_buffering_(false),
         end_of_sequence_(false),
-        cancelled_(false) {
-    runner_ = [this](std::function<void()> c) {
-      thread_pool_->Schedule(std::move(c));
-    };
-  }
+        cancelled_(false) {}
 
   ~FunctionBufferingResource() override {
     Cancel();
-    {
-      mutex_lock l(mu_);
-      while (is_buffering_) {
-        cond_var_.wait(l);
-      }
-    }
-    delete thread_pool_;
   }
 
   string DebugString() override {
@@ -100,6 +88,20 @@ class FunctionBufferingResource : public ResourceBase {
   void Cancel() LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
     cancelled_ = true;
+    while (is_buffering_) {
+      cond_var_.wait(l);
+    }
+  }
+
+  // Cancels all pending operations and then clears out the state.
+  void Reset() LOCKS_EXCLUDED(mu_) {
+    Cancel();
+    mutex_lock l(mu_);
+    buffer_.clear();
+    requests_.clear();
+    is_buffering_ = false;
+    end_of_sequence_ = false;
+    cancelled_ = false;
   }
 
   // If the buffer has anything, runs `callback` on the first element in the
@@ -164,15 +166,12 @@ class FunctionBufferingResource : public ResourceBase {
       for (int i = 0; i < cancellation_callbacks.size(); ++i) {
         cancellation_callbacks[i](cancellation_buffer_elements[i]);
       }
-      // We only wait on cond_var_ in the destructor, so there would atmost be
-      // one waiter to notify.
-      cond_var_.notify_one();
+      cond_var_.notify_all();
       return;
     }
     FunctionLibraryRuntime::Options opts;
     // Copied from CapturedFunction::generate_step_id();
     opts.step_id = -std::abs(static_cast<int64>(random::New64()));
-    opts.runner = &runner_;
     opts.source_device = source_device_;
     AllocatorAttributes arg_alloc_attr;
     arg_alloc_attr.set_on_host(true);
@@ -191,13 +190,12 @@ class FunctionBufferingResource : public ResourceBase {
                   mutex_lock l(mu_);
                   BufferElement buffer_element;
                   buffer_element.status = status;
-                  if (!status.ok()) {
+                  if (status.ok()) {
+                    buffer_element.value.swap(*rets);
+                  } else {
                     end_of_sequence_ = true;
                     is_buffering_ = false;
-                    buffer_.push_back(std::move(buffer_element));
-                    return;
                   }
-                  buffer_element.value.swap(*rets);
                   buffer_.push_back(std::move(buffer_element));
                   if (!requests_.empty()) {
                     buffer_front = std::move(buffer_.front());
@@ -205,9 +203,16 @@ class FunctionBufferingResource : public ResourceBase {
                     callback = std::move(requests_.front());
                     requests_.pop_front();
                   }
-                  if (buffer_.size() < buffer_size_) {
+                  if (buffer_.size() < buffer_size_ && !end_of_sequence_) {
                     restart_buffering = true;
                   } else {
+                    // When the buffer is full, we don't want to call
+                    // FillBuffer() unless we're in cancellation phase in which
+                    // case FillBuffer() will do the final cleanup post
+                    // cancellation.
+                    if (cancelled_) {
+                      restart_buffering = true;
+                    }
                     is_buffering_ = false;
                   }
                 }
@@ -222,16 +227,15 @@ class FunctionBufferingResource : public ResourceBase {
 
   mutex mu_;
   FunctionLibraryRuntime* lib_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
   NameAttrList func_;
   const int64 buffer_size_;
   const string source_device_;
   const string target_device_;
   const std::vector<Tensor> func_args_;
-  thread::ThreadPool* thread_pool_;
   FunctionLibraryRuntime::Handle handle_ GUARDED_BY(mu_);
   std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
   std::deque<FunctionBufferCallback> requests_ GUARDED_BY(mu_);
-  std::function<void(std::function<void()>)> runner_ = nullptr;
   bool is_buffering_ GUARDED_BY(mu_);
   bool end_of_sequence_ GUARDED_BY(mu_);
   bool cancelled_ GUARDED_BY(mu_);
@@ -241,12 +245,22 @@ class FunctionBufferingResource : public ResourceBase {
 class FunctionBufferResourceHandleOp : public OpKernel {
  public:
   explicit FunctionBufferResourceHandleOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {
+      : OpKernel(ctx), flib_def_(nullptr) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &buffer_size_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("thread_pool_size", &thread_pool_size_));
+  }
+
+  ~FunctionBufferResourceHandleOp() override {
+    if (cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->Delete<FunctionBufferingResource>(cinfo_.container(),
+                                                   cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource can have been deleted by session resets.
+      }
+    }
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -267,33 +281,44 @@ class FunctionBufferResourceHandleOp : public OpKernel {
 
     const string& source_device = ctx->device()->name();
 
-    ContainerInfo cinfo;
-    OP_REQUIRES_OK(ctx, cinfo.Init(ctx->resource_manager(), def()));
-    // Create the resource.
-    FunctionBufferingResource* buffer;
-    OP_REQUIRES_OK(
-        ctx, ctx->resource_manager()->LookupOrCreate<FunctionBufferingResource>(
-                 cinfo.container(), cinfo.name(), &buffer,
-                 [lib, &source_device, &target_device, func_args,
-                  this](FunctionBufferingResource** ptr) {
-                   *ptr = new FunctionBufferingResource(
-                       lib, func_, buffer_size_, source_device, target_device,
-                       func_args, thread_pool_size_);
-                   return Status::OK();
-                 }));
-    OP_REQUIRES_OK(ctx, buffer->Instantiate());
+    mutex_lock l(mu_);
+    if (!initialized_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def()));
+      FunctionLibraryRuntime* clone_lib;
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr;
+      OP_REQUIRES_OK(ctx, lib->Clone(&flib_def_, &pflr, &clone_lib));
+      // Create the resource.
+      FunctionBufferingResource* buffer;
+      OP_REQUIRES_OK(
+          ctx,
+          ctx->resource_manager()->LookupOrCreate<FunctionBufferingResource>(
+              cinfo_.container(), cinfo_.name(), &buffer,
+              [clone_lib, &pflr, &source_device, &target_device, func_args,
+               this](FunctionBufferingResource** ptr) {
+                *ptr = new FunctionBufferingResource(
+                    clone_lib, std::move(pflr), func_, buffer_size_,
+                    source_device, target_device, func_args);
+                return Status::OK();
+              }));
+      core::ScopedUnref s(buffer);
+      OP_REQUIRES_OK(ctx, buffer->Instantiate());
+      initialized_ = true;
+    }
 
     OP_REQUIRES_OK(ctx, MakeResourceHandleToOutput(
-                            ctx, 0, cinfo.container(), cinfo.name(),
+                            ctx, 0, cinfo_.container(), cinfo_.name(),
                             MakeTypeIndex<FunctionBufferingResource>()));
   }
 
  private:
+  mutex mu_;
+  ContainerInfo cinfo_ GUARDED_BY(mu_);
+  bool initialized_ GUARDED_BY(mu_) = false;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
   NameAttrList func_;
   int64 buffer_size_;
   string container_;
   string name_;
-  int64 thread_pool_size_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResource")
@@ -334,25 +359,27 @@ class FunctionBufferingResourceGetNextOp : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(
         ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer),
         done);
-    core::ScopedUnref s(buffer);
 
     if (buffer->Finished()) {
+      buffer->Unref();
       ctx->SetStatus(errors::OutOfRange("end_of_sequence"));
       done();
       return;
     }
 
     FunctionBufferCallback callback =
-        [ctx, done](const BufferElement& buffer_element) {
+        [ctx, buffer, done](const BufferElement& buffer_element) {
           Status s = buffer_element.status;
           if (!s.ok()) {
             ctx->SetStatus(s);
+            buffer->Unref();
             done();
             return;
           }
           for (size_t i = 0; i < buffer_element.value.size(); ++i) {
             ctx->set_output(i, buffer_element.value[i]);
           }
+          buffer->Unref();
           done();
         };
     buffer->MaybeGet(std::move(callback));
@@ -374,4 +401,62 @@ REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceGetNext")
                         FunctionBufferingResourceGetNextOp);
 #endif  // TENSORFLOW_USE_SYCL
 
+// Resets the FunctionBufferingResource, cancelling all pending requests and
+// clearing out the buffer.
+class FunctionBufferingResourceResetOp : public OpKernel {
+ public:
+  explicit FunctionBufferingResourceResetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  ~FunctionBufferingResourceResetOp() override {}
+
+  void Compute(OpKernelContext* ctx) override {
+    ResourceHandle handle;
+    OP_REQUIRES_OK(ctx,
+                   HandleFromInput(ctx, "function_buffer_resource", &handle));
+    FunctionBufferingResource* buffer = nullptr;
+    OP_REQUIRES_OK(
+        ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer));
+    core::ScopedUnref s(buffer);
+
+    buffer->Reset();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("function_buffer_resource"),
+                        FunctionBufferingResourceResetOp);
+REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("function_buffer_resource"),
+                        FunctionBufferingResourceResetOp);
+#if TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("function_buffer_resource"),
+                        FunctionBufferingResourceResetOp);
+#endif  // TENSORFLOW_USE_SYCL
+
+class IteratorGetDeviceOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* ctx) override {
+    // NOTE(mrry): We do not currently Validate that the handle
+    // corresponds to a real IteratorResource, because that symbol is
+    // not exposed from the framework library.
+    Tensor* device_name_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &device_name_t));
+    // NOTE(mrry): Since the operation's input is a resource, we must be
+    // colocated with it, and so we can simply return the current device's
+    // name without looking at the input.
+    device_name_t->scalar<string>()() = ctx->device()->name();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IteratorGetDevice").Device(DEVICE_CPU),
+                        IteratorGetDeviceOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..63e19ae3f837c9d3cfb1221df64360ee74117f13
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
@@ -0,0 +1,193 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+namespace {
+
+class ThreadPoolResource : public ResourceBase {
+ public:
+  ThreadPoolResource(Env* env, const ThreadOptions& thread_options,
+                     const string& name, int num_threads, bool low_latency_hint)
+      : thread_pool_(env, thread_options, name, num_threads, low_latency_hint) {
+  }
+
+  // Schedules fn() for execution in the pool of threads.
+  void Schedule(std::function<void()> fn) {
+    thread_pool_.Schedule(std::move(fn));
+  }
+
+  string DebugString() override { return "ThreadPoolResource"; }
+
+ private:
+  thread::ThreadPool thread_pool_;
+};
+
+// Creates a handle to a ThreadPool resource. Note that we don't use
+// ResourceOpKernel here because the ThreadPoolResource constructor requires
+// access to `OpKernelContext::env()`, which isn't provided by
+// `ResourceOpKernel<T>::CreateResource()`.
+class ThreadPoolHandleOp : public OpKernel {
+ public:
+  explicit ThreadPoolHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("display_name", &display_name_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_threads", &num_threads_));
+    OP_REQUIRES(
+        ctx, num_threads_ > 0,
+        errors::InvalidArgument("`num_threads` must be greater than zero."));
+  }
+
+  // The resource is deleted from the resource manager only when it is private
+  // to kernel. Ideally the resource should be deleted when it is no longer held
+  // by anyone, but it would break backward compatibility.
+  ~ThreadPoolHandleOp() override {
+    if (cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->Delete<ThreadPoolResource>(cinfo_.container(), cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource can have been deleted by session resets.
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* ctx) override LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    if (!initialized_) {
+      ResourceMgr* mgr = ctx->resource_manager();
+      OP_REQUIRES_OK(ctx, cinfo_.Init(mgr, def()));
+      ThreadPoolResource* resource;
+      OP_REQUIRES_OK(ctx, mgr->LookupOrCreate<ThreadPoolResource>(
+                              cinfo_.container(), cinfo_.name(), &resource,
+                              [this, ctx](ThreadPoolResource** ret)
+                                  EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                    *ret = new ThreadPoolResource(
+                                        ctx->env(), {}, display_name_,
+                                        num_threads_,
+                                        false /* low_latency_hint */);
+                                    return Status::OK();
+                                  }));
+      initialized_ = true;
+    }
+    OP_REQUIRES_OK(ctx, MakeResourceHandleToOutput(
+                            ctx, 0, cinfo_.container(), cinfo_.name(),
+                            MakeTypeIndex<ThreadPoolResource>()));
+  }
+
+ private:
+  mutex mu_;
+  ContainerInfo cinfo_ GUARDED_BY(mu_);
+  bool initialized_ GUARDED_BY(mu_) = false;
+  string display_name_;
+  int num_threads_;
+};
+
+class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ThreadPoolDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    ThreadPoolResource* threadpool_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &threadpool_resource));
+    core::ScopedUnref unref_iterator(threadpool_resource);
+
+    *output = new Dataset(ctx, input, threadpool_resource);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            ThreadPoolResource* threadpool)
+        : GraphDatasetBase(ctx), input_(input), threadpool_(threadpool) {
+      input_->Ref();
+      threadpool_->Ref();
+    }
+
+    ~Dataset() override {
+      input_->Unref();
+      threadpool_->Unref();
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::ThreadPool")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "ThreadPoolDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(
+          "Cannot currently serialize the thread pool for a "
+          "ThreadPoolDataset.");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        ThreadPoolResource* pool = dataset()->threadpool_;
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = [pool](std::function<void()> c) {
+          pool->Schedule(std::move(c));
+        };
+        params.stats_aggregator_getter = ctx->stats_aggregator_getter();
+        params.lib = ctx->lib();
+        params.function_library = ctx->function_library();
+        params.allocator_getter = ctx->allocator_getter();
+        IteratorContext threadpool_ctx(params);
+        return input_impl_->GetNext(&threadpool_ctx, out_tensors,
+                                    end_of_sequence);
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    ThreadPoolResource* const threadpool_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ThreadPoolHandle").Device(DEVICE_CPU),
+                        ThreadPoolHandleOp);
+REGISTER_KERNEL_BUILDER(Name("ThreadPoolDataset").Device(DEVICE_CPU),
+                        ThreadPoolDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/unique_dataset_op.cc b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/data/unique_dataset_op.cc
rename to tensorflow/contrib/data/kernels/unique_dataset_op.cc
index 7726ee0edf71b34cb65fe5fceb2b60dd30bb58e2..69fbb0fcdcce87951d2c9b84210fda378081b103 100644
--- a/tensorflow/core/kernels/data/unique_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index 289ffa1d9c29092cdf434e86ed5553ff9644d43e..137deb63527f0bdde7da8d5be83ed038f430e581 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -17,6 +17,23 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("DirectedInterleaveDataset")
+    .Input("selector_input_dataset: variant")
+    .Input("data_input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+
+selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines
+  which of the `N` data inputs should produce the next output element.
+data_input_datasets: `N` datasets with the same type that will be interleaved
+  according to the values of `selector_input_dataset`.
+)doc");
+
 REGISTER_OP("IgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -27,6 +44,24 @@ REGISTER_OP("IgnoreErrorsDataset")
 Creates a dataset that contains the elements of `input_dataset` ignoring errors.
 )doc");
 
+REGISTER_OP("UniqueDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that contains the unique elements of `input_dataset`.
+)doc");
+
+REGISTER_OP("IteratorGetDevice")
+    .Input("resource: resource")
+    .Output("device: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Returns the name of the device on which `resource` has been placed.
+)doc");
+
 REGISTER_OP("FunctionBufferingResource")
     .Input("string_arg: string")
     .Input("target_device: string")
@@ -35,7 +70,6 @@ REGISTER_OP("FunctionBufferingResource")
     .Attr("container: string")
     .Attr("f: func")
     .Attr("buffer_size: int")
-    .Attr("thread_pool_size: int")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Creates a resource that fills up a buffer by making function calls.
@@ -45,7 +79,6 @@ target_device: Target device to execute the function on.
 resource: Handle to the resource created.
 f: Function to be executed.
 buffer_size: Size of the buffer.
-thread_pool_size: Size of the threadpool doing the prefetching.
 container: If non-empty, this resource is placed in the given container.
   Otherwise, a default container is used.
 shared_name: If non-empty, this resource will be shared under the given name
@@ -65,4 +98,42 @@ output: A list of return values.
 output_types: The type list for the return values.
 )doc");
 
+REGISTER_OP("FunctionBufferingResourceReset")
+    .Input("function_buffer_resource: resource")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Resets the FunctionBufferingResource.
+
+function_buffer_resource: The FunctionBufferingResource handle.
+)doc");
+
+REGISTER_OP("ThreadPoolDataset")
+    .Input("input_dataset: variant")
+    .Input("thread_pool: resource")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+
+handle: A resource produced by the ThreadPoolHandle op.
+)doc");
+
+REGISTER_OP("ThreadPoolHandle")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Attr("num_threads: int")
+    .Attr("display_name: string")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Doc(R"doc(
+Creates a custom thread pool with the given number of threads.
+
+handle: A resource that can be consumed by one or more ThreadPoolDataset ops.
+num_threads: The number of threads in the thread pool.
+display_name: A human-readable name for the threads that may be visible in
+  some visualizations.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index e51d57cc896dc32d8e11912cd89f34a04a858c78..d59dd17aea42618075e69516bcfa4ee2b9eafc81 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,24 +4,24 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
@@ -37,8 +37,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -59,10 +58,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
@@ -79,8 +78,7 @@ py_test(
     ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -124,16 +122,19 @@ py_test(
     size = "small",
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -145,7 +146,7 @@ tf_py_test(
     additional_deps = [
         ":dataset_serialization_test",
         "//third_party/py/numpy",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -168,13 +169,14 @@ py_test(
     srcs = ["interleave_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "manual",
         "no_oss",
         "no_pip",
+        "notap",
     ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -185,6 +187,7 @@ py_test(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -195,7 +198,8 @@ tf_py_test(
     srcs = ["get_single_element_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:get_single_element",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -210,11 +214,14 @@ py_test(
     size = "medium",
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "noasan",  # times out
+        "optonly",
+    ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -259,8 +266,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:counter",
+        "//tensorflow/contrib/data/python/ops:enumerate_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -272,6 +279,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -295,6 +303,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -304,14 +313,17 @@ py_test(
     srcs = ["resample_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -324,13 +336,14 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:scan_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
     ],
 )
@@ -343,11 +356,11 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -375,7 +388,6 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -412,10 +424,25 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:stats_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
+    name = "threadpool_dataset_ops_test",
+    size = "small",
+    srcs = ["threadpool_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:threadpool",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -427,13 +454,13 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/contrib/stateless",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -446,25 +473,20 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
     srcs = ["prefetching_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",  # b/68785503
-    ],
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -479,16 +501,39 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+tf_py_test(
+    name = "slide_dataset_op_test",
+    size = "small",
+    srcs = ["slide_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:sliding",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "writer_ops_test",
+    size = "small",
+    srcs = ["writer_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/contrib/data/python/ops:writers",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+    ],
 )
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 71dc1c1172c9d515d4c85f85257c952135098329..a4a0ce79b6013d8813f2d8d294168ea8189d53ef 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -18,20 +18,26 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import time
 
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 class BatchDatasetTest(test.TestCase):
@@ -149,6 +155,69 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
 
+  def testUnbatchDatasetWithStrings(self):
+    data = tuple([math_ops.range(10) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
+    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchDatasetWithSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors(st)
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        st_row = sess.run(next_element)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchDatasetWithDenseAndSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        dense_elem, st_row = sess.run(next_element)
+        self.assertEqual(i, dense_elem)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -189,6 +258,53 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
 
+  def testUnbatchEmpty(self):
+    data = dataset_ops.Dataset.from_tensors(
+        (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
+         constant_op.constant([], shape=[0, 4, 0])))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchStaticShapeMismatch(self):
+    data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
+                                             np.arange(9)))
+    with self.assertRaises(ValueError):
+      data.apply(batching.unbatch())
+
+  def testUnbatchDynamicShapeMismatch(self):
+    ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
+    ph2 = array_ops.placeholder(dtypes.int32, shape=None)
+    data = dataset_ops.Dataset.from_tensors((ph1, ph2))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Mismatch in the 0th dimension.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: np.arange(8).astype(np.int32)
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(next_element))
+
+      # No 0th dimension (i.e. scalar value) for one component.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: 7
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(next_element))
+
   def testBatchAndDropRemainder(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -311,10 +427,10 @@ class BatchDatasetTest(test.TestCase):
     self.assertEqual([None], dataset.output_shapes[1][0].as_list())
     self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
 
-  def _testBatchAndMapDatasetHelper(self, num_parallel_batches=1):
+  def _testMapAndBatchDatasetHelper(self, num_parallel_batches=1):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
-    # RepeatDataset(count) -> BatchAndMapDataset(square_3, batch_size).
+    # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
@@ -381,11 +497,51 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
-  def testBatchAndMapDataset(self):
-    return self._testBatchAndMapDatasetHelper()
+  def testMapAndBatchDataset(self):
+    return self._testMapAndBatchDatasetHelper()
+
+  def testMapAndBatchDatasetWithParallelBatching(self):
+    return self._testMapAndBatchDatasetHelper(num_parallel_batches=10)
+
+  def _testMapAndBatchPartialBatchHelper(self, drop_remainder=False):
+    iterator = (
+        dataset_ops.Dataset.range(10).apply(
+            batching.map_and_batch(
+                lambda x: array_ops.reshape(x * x, [1]),
+                batch_size=4,
+                drop_remainder=drop_remainder)).make_one_shot_iterator())
+    if drop_remainder:
+      self.assertEqual([4, 1], iterator.output_shapes.as_list())
+    else:
+      self.assertEqual([None, 1], iterator.output_shapes.as_list())
+    next_element = iterator.get_next()
+    with self.test_session() as sess:
+      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      if not drop_remainder:
+        self.assertAllEqual([[64], [81]], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testMapAndBatchPartialBatch(self):
+    return self._testMapAndBatchPartialBatchHelper()
+
+  def testMapAndBatchPartialBatchDropRemainder(self):
+    return self._testMapAndBatchPartialBatchHelper(drop_remainder=True)
 
-  def testBatchAndMapDatasetWithParallelBatching(self):
-    return self._testBatchAndMapDatasetHelper(num_parallel_batches=10)
+  def testMapAndBatchYieldsPartialBatch(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .apply(batching.map_and_batch(
+                    lambda x: array_ops.reshape(x * x, [1]), 4))
+                .make_one_shot_iterator())
+    self.assertEqual([None, 1], iterator.output_shapes.as_list())
+    next_element = iterator.get_next()
+    with self.test_session() as sess:
+      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      self.assertAllEqual([[64], [81]], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
 
   def testMapAndBatchSparse(self):
 
@@ -411,7 +567,7 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testBatchAndMapDatasetFails(self):
+  def testMapAndBatchDatasetFails(self):
     """Test a dataset that maps a TF function across its input elements."""
     dataset = dataset_ops.Dataset.from_tensors(
         array_ops.check_numerics(
@@ -425,7 +581,7 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
         sess.run(init_op, feed_dict={batch_size: 14})
 
-  def testBatchAndMapDatasetShapeMismatch(self):
+  def testMapAndBatchDatasetShapeMismatch(self):
     """Test a dataset that maps a TF function across its input elements."""
 
     def generator():
@@ -503,6 +659,59 @@ class BatchDatasetSerializationTest(
     self.run_core_tests(self._build_dataset_nested_sparse, None, 1)
 
 
+class UnbatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def build_dataset(self, multiplier=15.0, tensor_slice_len=2, batch_size=2):
+    components = (
+        np.arange(tensor_slice_len),
+        np.array([[1, 2, 3]]) * np.arange(tensor_slice_len)[:, np.newaxis],
+        np.array(multiplier) * np.arange(tensor_slice_len))
+
+    return dataset_ops.Dataset.from_tensor_slices(components).batch(
+        batch_size).apply(batching.unbatch())
+
+  def testCore(self):
+    tensor_slice_len = 8
+    batch_size = 2
+    num_outputs = tensor_slice_len
+    self.run_core_tests(
+        lambda: self.build_dataset(15.0, tensor_slice_len, batch_size),
+        lambda: self.build_dataset(20.0, tensor_slice_len, batch_size),
+        num_outputs)
+
+
+class MapAndBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testSerializationCore(self):
+    range_size = 11
+    num_repeats = 2
+    batch_size = 5
+    total_outputs = range_size * num_repeats
+    num_outputs_drop_remainder = total_outputs // batch_size
+    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
+    num_parallel_batches = 2
+
+    def build_ds(range_start, drop_remainder=False):
+
+      def _map_fn(x):
+        return math_ops.square(x)
+
+      return dataset_ops.Dataset.range(
+          range_start, range_start + range_size).repeat(num_repeats).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_batches=num_parallel_batches,
+                  drop_remainder=drop_remainder))
+
+    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
+                        num_outputs_keep_remainder)
+    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
+                        num_outputs_drop_remainder)
+
+
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
@@ -539,5 +748,149 @@ class PaddedBatchDatasetSerializationTest(
                         lambda: build_dataset(seq_lens2), 8)
 
 
+class RestructuredDatasetTest(test.TestCase):
+
+  def test_assert_element_shape(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+
+class UnbatchDatasetBenchmark(test.Benchmark):
+
+  def benchmarkNativeUnbatch(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.apply(batching.unbatch())
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (native) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_native_batch_size_%d" %
+              batch_size)
+
+  # Include a benchmark of the previous `unbatch()` implementation that uses
+  # a composition of more primitive ops. Eventually we'd hope to generate code
+  # that is as good in both cases.
+  def benchmarkOldUnbatchImplementation(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (unfused) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
+              batch_size)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index f1b494e1a620992365ed75613b508e32f94b40a4..55a56b83a8efba899c6b296264d766839a824da5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import random
+
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
@@ -59,7 +61,7 @@ class GroupByWindowTest(test.TestCase):
 
       self.assertEqual(len(components), sum(counts))
       num_full_batches = len([c for c in counts if c == 4])
-      self.assertGreaterEqual(num_full_batches, 23)
+      self.assertGreaterEqual(num_full_batches, 24)
       self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
 
   def testImmediateOutput(self):
@@ -102,6 +104,21 @@ class GroupByWindowTest(test.TestCase):
       self.assertAllEqual([0, 0, 0], sess.run(get_next))
       self.assertAllEqual([1], sess.run(get_next))
 
+  def testEmpty(self):
+    iterator = (
+        dataset_ops.Dataset.range(4).apply(
+            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "Window size must be greater than zero, but got 0."):
+        print(sess.run(get_next))
+
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
@@ -379,5 +396,118 @@ class BucketTest(test.TestCase):
       self.assertEqual(batches, 15)
 
 
+class BucketBySequenceLength(test.TestCase):
+
+  def testBucket(self):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25, 35]
+
+    def element_gen():
+      # Produce 1 batch for each bucket
+      elements = []
+      for batch_size, length in zip(batch_sizes, lengths):
+        for _ in range(batch_size):
+          elements.append([1] * length)
+      random.shuffle(elements)
+      for el in elements:
+        yield (el,)
+
+    element_len = lambda el: array_ops.shape(el)[0]
+    dataset = dataset_ops.Dataset.from_generator(
+        element_gen, (dtypes.int64,), ([None],)).apply(
+            grouping.bucket_by_sequence_length(
+                element_len, boundaries, batch_sizes))
+    batch, = dataset.make_one_shot_iterator().get_next()
+
+    with self.test_session() as sess:
+      batches = []
+      for _ in range(4):
+        batches.append(sess.run(batch))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(batch)
+    batch_sizes_val = []
+    lengths_val = []
+    for batch in batches:
+      batch_size = batch.shape[0]
+      length = batch.shape[1]
+      batch_sizes_val.append(batch_size)
+      lengths_val.append(length)
+    self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
+    self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
+    self.assertEqual(sorted(lengths), sorted(lengths_val))
+
+  def testPadToBoundary(self):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25]
+
+    def element_gen():
+      # Produce 1 batch for each bucket
+      elements = []
+      for batch_size, length in zip(batch_sizes[:-1], lengths):
+        for _ in range(batch_size):
+          elements.append([1] * length)
+      random.shuffle(elements)
+      for el in elements:
+        yield (el,)
+      for _ in range(batch_sizes[-1]):
+        el = [1] * (boundaries[-1] + 5)
+        yield (el,)
+
+    element_len = lambda el: array_ops.shape(el)[0]
+    dataset = dataset_ops.Dataset.from_generator(
+        element_gen, (dtypes.int64,), ([None],)).apply(
+            grouping.bucket_by_sequence_length(
+                element_len, boundaries, batch_sizes,
+                pad_to_bucket_boundary=True))
+    batch, = dataset.make_one_shot_iterator().get_next()
+
+    with self.test_session() as sess:
+      batches = []
+      for _ in range(3):
+        batches.append(sess.run(batch))
+      with self.assertRaisesOpError("bucket_boundaries"):
+        sess.run(batch)
+    batch_sizes_val = []
+    lengths_val = []
+    for batch in batches:
+      batch_size = batch.shape[0]
+      length = batch.shape[1]
+      batch_sizes_val.append(batch_size)
+      lengths_val.append(length)
+    batch_sizes = batch_sizes[:-1]
+    self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
+    self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
+    self.assertEqual(sorted(boundaries), sorted(lengths_val))
+
+  def testTupleElements(self):
+
+    def elements_gen():
+      text = [[1, 2, 3], [3, 4, 5, 6, 7], [1, 2], [8, 9, 0, 2, 3]]
+      label = [1, 2, 1, 2]
+      for x, y in zip(text, label):
+        yield (x, y)
+
+    def element_length_fn(x, y):
+      del y
+      return array_ops.shape(x)[0]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator=elements_gen,
+        output_shapes=(tensor_shape.TensorShape([None]),
+                       tensor_shape.TensorShape([])),
+        output_types=(dtypes.int32, dtypes.int32))
+    dataset = dataset.apply(grouping.bucket_by_sequence_length(
+        element_length_func=element_length_fn,
+        bucket_batch_sizes=[2, 2, 2],
+        bucket_boundaries=[0, 8]))
+    shapes = dataset.output_shapes
+    self.assertEqual([None, None], shapes[0].as_list())
+    self.assertEqual([None], shapes[1].as_list())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index dbc35097ddda9f0375060d43aeb43efa8107f929..78ecce8f7daaf84002ae78d8d77820755b967d89 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -163,7 +163,7 @@ class DatasetSerializationTestBase(test.TestCase):
                                  num_outputs,
                                  sparse_tensors=False,
                                  verify_exhausted=True):
-    """Verifies that restoring into an already initilized iterator works.
+    """Verifies that restoring into an already initialized iterator works.
 
     Args:
       ds_fn: See `run_core_tests`.
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
index 32ea44f7c7ba329dc253bb9fbbcac0a1ed16aec7..87b7c6ddb7afcbaaf8fe97cd8be87e6f5af8cd4d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -33,17 +34,25 @@ class GetSingleElementTest(test.TestCase):
     take_value = array_ops.placeholder_with_default(
         constant_op.constant(1, dtype=dtypes.int64), shape=[])
 
+    def make_sparse(x):
+      x_1d = array_ops.reshape(x, [1])
+      x_2d = array_ops.reshape(x, [1, 1])
+      return sparse_tensor.SparseTensor(x_2d, x_1d, x_1d)
+
     dataset = (dataset_ops.Dataset.range(100)
                .skip(skip_value)
-               .map(lambda x: x * x)
+               .map(lambda x: (x * x, make_sparse(x)))
                .take(take_value))
 
     element = get_single_element.get_single_element(dataset)
 
     with self.test_session() as sess:
-      self.assertEqual(0, sess.run(element, feed_dict={skip_value: 0}))
-      self.assertEqual(25, sess.run(element, feed_dict={skip_value: 5}))
-      self.assertEqual(100, sess.run(element, feed_dict={skip_value: 10}))
+      for x in [0, 5, 10]:
+        dense_val, sparse_val = sess.run(element, feed_dict={skip_value: x})
+        self.assertEqual(x * x, dense_val)
+        self.assertAllEqual([[x]], sparse_val.indices)
+        self.assertAllEqual([x], sparse_val.values)
+        self.assertAllEqual([x], sparse_val.dense_shape)
 
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "Dataset was empty."):
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 256ad8d94dc1a7c2b26df3f1ebf8e8e321882c15..43aa4b1bd02791ff304a990c0bbe8e45534c0c77 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -94,6 +95,76 @@ class InterleaveDatasetSerializationTest(
     self.run_core_tests(_build_dataset, None, 20)
 
 
+class ParallelInterleaveDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def setUp(self):
+    self.input_values = np.array([4, 5, 6], dtype=np.int64)
+    self.num_repeats = 2
+    self.num_outputs = np.sum(self.input_values) * 2
+
+  def _build_ds(self, cycle_length, block_length, sloppy=False):
+    return (dataset_ops.Dataset.from_tensor_slices(
+        self.input_values).repeat(self.num_repeats).apply(
+            interleave_ops.parallel_interleave(
+                lambda x: dataset_ops.Dataset.range(10 * x, 11 * x),
+                cycle_length, block_length, sloppy)))
+
+  def testSerializationCore(self):
+    # cycle_length > 1, block_length > 1
+    cycle_length = 2
+    block_length = 3
+    self.run_core_tests(
+        lambda: self._build_ds(cycle_length, block_length),
+        lambda: self._build_ds(cycle_length * 2, block_length * 1),
+        self.num_outputs)
+    # cycle_length = 1
+    cycle_length = 1
+    block_length = 3
+    self.run_core_tests(lambda: self._build_ds(cycle_length, block_length),
+                        None, self.num_outputs)
+    # block_length = 1
+    cycle_length = 2
+    block_length = 1
+    self.run_core_tests(lambda: self._build_ds(cycle_length, block_length),
+                        None, self.num_outputs)
+
+  def testSerializationWithSloppy(self):
+    break_points = self.gen_break_points(self.num_outputs, 10)
+    expected_outputs = np.repeat(
+        np.concatenate([np.arange(10 * x, 11 * x) for x in self.input_values]),
+        self.num_repeats).tolist()
+
+    def run_test(cycle_length, block_length):
+      actual = self.gen_outputs(
+          lambda: self._build_ds(cycle_length, block_length, True),
+          break_points, self.num_outputs)
+      self.assertSequenceEqual(sorted(actual), expected_outputs)
+
+    # cycle_length > 1, block_length > 1
+    run_test(2, 3)
+    # cycle_length = 1
+    run_test(1, 3)
+    # block_length = 1
+    run_test(2, 1)
+
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_dataset():
+      return dataset_ops.Dataset.range(10).map(_map_fn).apply(
+          interleave_ops.parallel_interleave(_interleave_fn, 1))
+
+    self.run_core_tests(_build_dataset, None, 20)
+
+
 class ParallelInterleaveDatasetTest(test.TestCase):
 
   def setUp(self):
@@ -338,7 +409,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRaces(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
     Args:
@@ -424,7 +495,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRacesAndBlocking(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
 
@@ -836,5 +907,114 @@ class ParallelInterleaveDatasetTest(test.TestCase):
         sess.run(self.next_element)
 
 
+class DirectedInterleaveDatasetTest(test.TestCase):
+
+  def testBasic(self):
+    selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
+    input_datasets = [
+        dataset_ops.Dataset.from_tensors(i).repeat(100) for i in range(10)
+    ]
+    dataset = interleave_ops.DirectedInterleaveDataset(selector_dataset,
+                                                       input_datasets)
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(100):
+        for i in range(10):
+          self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def _normalize(self, vec):
+    return vec / vec.sum()
+
+  def _chi2(self, expected, actual):
+    actual = np.asarray(actual)
+    expected = np.asarray(expected)
+    diff = actual - expected
+    chi2 = np.sum(diff * diff / expected, axis=0)
+    return chi2
+
+  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
+    # Create a dataset that samples each integer in `[0, num_datasets)`
+    # with probability given by `weights[i]`.
+    dataset = interleave_ops.sample_from_datasets([
+        dataset_ops.Dataset.from_tensors(i).repeat(None)
+        for i in range(num_datasets)
+    ], weights)
+    dataset = dataset.take(num_samples)
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      freqs = np.zeros([num_datasets])
+      for _ in range(num_samples):
+        freqs[sess.run(next_element)] += 1
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+    return freqs
+
+  def testSampleFromDatasets(self):
+    random_seed.set_random_seed(1619)
+    num_samples = 10000
+    rand_probs = self._normalize(np.random.random_sample((15,)))
+
+    # Use chi-squared test to assert that the observed distribution matches the
+    # expected distribution. Based on the implementation in
+    # "tensorflow/python/kernel_tests/multinomial_op_test.py".
+    for probs in [[.85, .05, .1], rand_probs]:
+      probs = np.asarray(probs)
+      classes = len(probs)
+      freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples)
+      self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
+
+      # Also check that `weights` as a dataset samples correctly.
+      probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat()
+      freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
+      self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
+
+  def testErrors(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"vector of length `len\(datasets\)`"):
+      interleave_ops.sample_from_datasets(
+          [dataset_ops.Dataset.range(10),
+           dataset_ops.Dataset.range(20)],
+          weights=[0.25, 0.25, 0.25, 0.25])
+
+    with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"):
+      interleave_ops.sample_from_datasets(
+          [dataset_ops.Dataset.range(10),
+           dataset_ops.Dataset.range(20)],
+          weights=[1, 1])
+
+    with self.assertRaisesRegexp(TypeError, "must have the same type"):
+      interleave_ops.sample_from_datasets([
+          dataset_ops.Dataset.from_tensors(0),
+          dataset_ops.Dataset.from_tensors(0.0)
+      ])
+
+
+class SampleFromDatasetsSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset(self, probs, num_samples):
+    dataset = interleave_ops.sample_from_datasets(
+        [
+            dataset_ops.Dataset.from_tensors(i).repeat(None)
+            for i in range(len(probs))
+        ],
+        probs,
+        seed=1813)
+    return dataset.take(num_samples)
+
+  def testSerializationCore(self):
+    self.run_core_tests(
+        lambda: self._build_dataset([0.5, 0.5], 100),
+        lambda: self._build_dataset([0.25, 0.25, 0.25, 0.25], 1000), 100)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index dc3e38db59301bf1819999f479171af35930e9d2..b08132cd72254326d965907a1fdafb8a820926a1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
 import threading
 
 from tensorflow.contrib.data.python.ops import prefetching_ops
@@ -26,37 +25,43 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-class StagingAreaOpsTest(test.TestCase):
+class PrefetchingKernelsOpsTest(test.TestCase):
 
   def setUp(self):
     self._event = threading.Event()
 
-  def _prefetch_fn_helper(self, buffer_name, device0, device1):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
+  def _create_ds_and_iterator(self, device0, initializable=False):
 
     def gen():
-      for i in itertools.count(start=1, step=1):
-        yield [i + 0.0]
+      for i in range(1, 10):
+        yield [float(i)]
         if i == 6:
           self._event.set()
 
     with ops.device(device0):
-      dataset_3 = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
+      ds = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
+      if initializable:
+        ds_iterator = ds.make_initializable_iterator()
+      else:
+        ds_iterator = ds.make_one_shot_iterator()
+      return (ds, ds_iterator)
+
+  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
+    ds_iterator_handle = ds_iterator.string_handle()
 
     @function.Defun(dtypes.string)
     def _remote_fn(h):
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
+          h, ds.output_types, ds.output_shapes)
       return remote_iterator.get_next()
 
     target = constant_op.constant(device0)
@@ -64,15 +69,28 @@ class StagingAreaOpsTest(test.TestCase):
       buffer_resource_handle = prefetching_ops.function_buffering_resource(
           f=_remote_fn,
           target_device=target,
-          string_arg=iterator_3_handle,
+          string_arg=ds_iterator_handle,
           buffer_size=3,
-          thread_pool_size=2,
           shared_name=buffer_name)
 
     with ops.device(device1):
       prefetch_op = prefetching_ops.function_buffering_resource_get_next(
           function_buffer_resource=buffer_resource_handle,
           output_types=[dtypes.float32])
+      reset_op = prefetching_ops.function_buffering_resource_reset(
+          function_buffer_resource=buffer_resource_handle)
+      destroy_op = resource_variable_ops.destroy_resource_op(
+          buffer_resource_handle, ignore_lookup_error=True)
+
+    return (prefetch_op, reset_op, destroy_op)
+
+  def _prefetch_fn_helper_one_shot(self, buffer_name, device0, device1):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=False)
+    prefetch_op, _, destroy_op = self._create_ops(ds, ds_iterator, buffer_name,
+                                                  device0, device1)
 
     with self.test_session(config=worker_config) as sess:
       elem = sess.run(prefetch_op)
@@ -86,26 +104,277 @@ class StagingAreaOpsTest(test.TestCase):
       self._event.wait()
       elem = sess.run(prefetch_op)
       self.assertEqual(elem, [5.0])
-      sess.run(
-          resource_variable_ops.destroy_resource_op(
-              buffer_resource_handle, ignore_lookup_error=True))
+      sess.run(destroy_op)
 
   def testSameDeviceCPU(self):
-    self._prefetch_fn_helper("same_device_cpu",
-                             "/job:localhost/replica:0/task:0/cpu:0",
-                             "/job:localhost/replica:0/task:0/cpu:0")
+    self._prefetch_fn_helper_one_shot("same_device_cpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/cpu:0")
 
   def testDifferentDeviceCPU(self):
-    self._prefetch_fn_helper("diff_device_cpu",
-                             "/job:localhost/replica:0/task:0/cpu:0",
-                             "/job:localhost/replica:0/task:0/cpu:1")
+    self._prefetch_fn_helper_one_shot("diff_device_cpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/cpu:1")
 
   def testDifferentDeviceCPUGPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
 
-    self._prefetch_fn_helper("cpu_gpu", "/job:localhost/replica:0/task:0/cpu:0",
-                             "/job:localhost/replica:0/task:0/gpu:0")
+    self._prefetch_fn_helper_one_shot("cpu_gpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/gpu:0")
+
+  def testReinitialization(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/cpu:1"
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
+    prefetch_op, reset_op, destroy_op = self._create_ops(
+        ds, ds_iterator, "reinit", device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      sess.run(ds_iterator.initializer)
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      # Lets reset the function buffering resource and reinitialize the
+      # iterator. Should be able to go through this again.
+      self._event.clear()
+      sess.run(reset_op)
+      sess.run(ds_iterator.initializer)
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      sess.run(destroy_op)
+
+  def testReinitializationOutOfRange(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/cpu:1"
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
+    prefetch_op, reset_op, destroy_op = self._create_ops(
+        ds, ds_iterator, "reinit", device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      sess.run(ds_iterator.initializer)
+      for i in range(1, 10):
+        elem = sess.run(prefetch_op)
+        self.assertEqual(elem, [float(i)])
+      # Try fetching after its over twice to test out end of sequence.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      # Now reset everything and try it out again.
+      self._event.clear()
+      sess.run(reset_op)
+      sess.run(ds_iterator.initializer)
+      for i in range(1, 10):
+        elem = sess.run(prefetch_op)
+        self.assertEqual(elem, [float(i)])
+      # Try fetching after its over twice to test out end of sequence.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      sess.run(destroy_op)
+
+
+class PrefetchToDeviceTest(test.TestCase):
+
+  def testPrefetchToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchDictToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element["a"].dtype)
+    self.assertEqual([], next_element["a"].shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual({"a": i}, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchSparseTensorsToDevice(self):
+    def make_tensor(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])
+    host_dataset = dataset_ops.Dataset.range(10).map(make_tensor)
+
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        actual = sess.run(next_element)
+        self.assertAllEqual([i], actual.values)
+        self.assertAllEqual([[0, 0]], actual.indices)
+        self.assertAllEqual([2, 2], actual.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceWithReInit(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_initializable_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpuWithReInit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 6efe97444a375febc550ff3a3ea04bcd9330a3a5..1075302bae96ca2e0111efbacdf5e919ea76897d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -21,6 +21,8 @@ import gzip
 import os
 import zlib
 
+import numpy as np
+
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.example import example_pb2
@@ -262,12 +264,20 @@ class ReadBatchFeaturesTest(test.TestCase):
     self._num_records = 7
     self.test_filenames = self._createFiles()
 
-  def _read_batch_features(self, filenames, num_epochs, batch_size):
+  def _read_batch_features(self,
+                           filenames,
+                           num_epochs,
+                           batch_size,
+                           reader_num_threads=1,
+                           parser_num_threads=1,
+                           shuffle=False,
+                           shuffle_seed=None,
+                           drop_final_batch=False):
     self.filenames = filenames
     self.num_epochs = num_epochs
     self.batch_size = batch_size
 
-    return readers.read_batch_features(
+    return readers.make_batched_features_dataset(
         file_pattern=self.filenames,
         batch_size=self.batch_size,
         features={
@@ -276,22 +286,29 @@ class ReadBatchFeaturesTest(test.TestCase):
             "keywords": parsing_ops.VarLenFeature(dtypes.string)
         },
         reader=core_readers.TFRecordDataset,
-        randomize_input=False,
-        num_epochs=self.num_epochs)
+        num_epochs=self.num_epochs,
+        shuffle=shuffle,
+        shuffle_seed=shuffle_seed,
+        reader_num_threads=reader_num_threads,
+        parser_num_threads=parser_num_threads,
+        drop_final_batch=drop_final_batch).make_one_shot_iterator(
+        ).get_next()
 
   def _record(self, f, r):
-    example = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            "file":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[f])),
-            "record":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[r])),
-            "keywords":
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=self._get_keywords(f, r)))
-        }))
+    example = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                "file":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[f])),
+                "record":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[r])),
+                "keywords":
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=self._get_keywords(f, r)))
+            }))
     return example.SerializeToString()
 
   def _get_keywords(self, f, r):
@@ -312,24 +329,35 @@ class ReadBatchFeaturesTest(test.TestCase):
       writer.close()
     return filenames
 
-  def _next_actual_batch(self, sess):
-    file_op = self.outputs["file"]
-    keywords_indices_op = self.outputs["keywords"].indices
-    keywords_values_op = self.outputs["keywords"].values
-    keywords_dense_shape_op = self.outputs["keywords"].dense_shape
-    record_op = self.outputs["record"]
+  def _run_actual_batch(self, outputs, sess):
+    file_op = outputs["file"]
+    keywords_indices_op = outputs["keywords"].indices
+    keywords_values_op = outputs["keywords"].values
+    keywords_dense_shape_op = outputs["keywords"].dense_shape
+    record_op = outputs["record"]
     return sess.run([
         file_op, keywords_indices_op, keywords_values_op,
         keywords_dense_shape_op, record_op
     ])
 
-  def _next_expected_batch(self, file_indices, batch_size, num_epochs):
+  def _next_actual_batch(self, sess):
+    return self._run_actual_batch(self.outputs, sess)
+
+  def _next_expected_batch(self,
+                           file_indices,
+                           batch_size,
+                           num_epochs,
+                           cycle_length=1):
 
     def _next_record(file_indices):
       for j in file_indices:
         for i in range(self._num_records):
           yield j, i
 
+    def _next_record_interleaved(file_indices, cycle_length):
+      return self._interleave([_next_record([i]) for i in file_indices],
+                              cycle_length)
+
     file_batch = []
     keywords_batch_indices = []
     keywords_batch_values = []
@@ -337,15 +365,19 @@ class ReadBatchFeaturesTest(test.TestCase):
     record_batch = []
     batch_index = 0
     for _ in range(num_epochs):
-      for record in _next_record(file_indices):
+      if cycle_length == 1:
+        next_records = _next_record(file_indices)
+      else:
+        next_records = _next_record_interleaved(file_indices, cycle_length)
+      for record in next_records:
         f = record[0]
         r = record[1]
         file_batch.append(f)
         record_batch.append(r)
         keywords = self._get_keywords(f, r)
         keywords_batch_values.extend(keywords)
-        keywords_batch_indices.extend([[batch_index, i]
-                                       for i in range(len(keywords))])
+        keywords_batch_indices.extend(
+            [[batch_index, i] for i in range(len(keywords))])
         batch_index += 1
         keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
         if len(file_batch) == batch_size:
@@ -365,14 +397,41 @@ class ReadBatchFeaturesTest(test.TestCase):
           [len(file_batch), keywords_batch_max_len], record_batch
       ]
 
-  def _verify_records(self, sess, batch_size, file_index=None, num_epochs=1):
+  def _interleave(self, iterators, cycle_length):
+    pending_iterators = iterators
+    open_iterators = []
+    num_open = 0
+    for i in range(cycle_length):
+      if pending_iterators:
+        open_iterators.append(pending_iterators.pop(0))
+        num_open += 1
+
+    while num_open:
+      for i in range(min(cycle_length, len(open_iterators))):
+        if open_iterators[i] is None:
+          continue
+        try:
+          yield next(open_iterators[i])
+        except StopIteration:
+          if pending_iterators:
+            open_iterators[i] = pending_iterators.pop(0)
+          else:
+            open_iterators[i] = None
+            num_open -= 1
+
+  def _verify_records(self,
+                      sess,
+                      batch_size,
+                      file_index=None,
+                      num_epochs=1,
+                      interleave_cycle_length=1):
     if file_index is not None:
       file_indices = [file_index]
     else:
       file_indices = range(self._num_files)
 
-    for expected_batch in self._next_expected_batch(file_indices, batch_size,
-                                                    num_epochs):
+    for expected_batch in self._next_expected_batch(
+        file_indices, batch_size, num_epochs, interleave_cycle_length):
       actual_batch = self._next_actual_batch(sess)
       for i in range(len(expected_batch)):
         self.assertAllEqual(expected_batch[i], actual_batch[i])
@@ -418,9 +477,10 @@ class ReadBatchFeaturesTest(test.TestCase):
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
         "record": parsing_ops.FixedLenFeature([], dtypes.int64),
     }
-    dataset = (core_readers.TFRecordDataset(self.test_filenames)
-               .map(lambda x: parsing_ops.parse_single_example(x, features))
-               .repeat(10).batch(2))
+    dataset = (
+        core_readers.TFRecordDataset(self.test_filenames)
+        .map(lambda x: parsing_ops.parse_single_example(x, features))
+        .repeat(10).batch(2))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     next_element = iterator.get_next()
@@ -435,6 +495,596 @@ class ReadBatchFeaturesTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testReadWithFusedShuffleRepeatDataset(self):
+    num_epochs = 5
+    total_records = num_epochs * self._num_records
+    for batch_size in [1, 2]:
+      # Test that shuffling with same seed produces the same result.
+      with ops.Graph().as_default() as g:
+        with self.test_session(graph=g) as sess:
+          outputs1 = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5)
+          outputs2 = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5)
+          for _ in range(total_records // batch_size):
+            batch1 = self._run_actual_batch(outputs1, sess)
+            batch2 = self._run_actual_batch(outputs2, sess)
+            for i in range(len(batch1)):
+              self.assertAllEqual(batch1[i], batch2[i])
+
+      # Test that shuffling with different seeds produces a different order.
+      with ops.Graph().as_default() as g:
+        with self.test_session(graph=g) as sess:
+          outputs1 = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5)
+          outputs2 = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=15)
+          all_equal = True
+          for _ in range(total_records // batch_size):
+            batch1 = self._run_actual_batch(outputs1, sess)
+            batch2 = self._run_actual_batch(outputs2, sess)
+            for i in range(len(batch1)):
+              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+          self.assertFalse(all_equal)
+
+  def testParallelReadersAndParsers(self):
+    num_epochs = 5
+    for batch_size in [1, 2]:
+      for reader_num_threads in [2, 4]:
+        for parser_num_threads in [2, 4]:
+          with ops.Graph().as_default() as g:
+            with self.test_session(graph=g) as sess:
+              self.outputs = self._read_batch_features(
+                  filenames=self.test_filenames,
+                  num_epochs=num_epochs,
+                  batch_size=batch_size,
+                  reader_num_threads=reader_num_threads,
+                  parser_num_threads=parser_num_threads)
+              self._verify_records(
+                  sess,
+                  batch_size,
+                  num_epochs=num_epochs,
+                  interleave_cycle_length=reader_num_threads)
+              with self.assertRaises(errors.OutOfRangeError):
+                self._next_actual_batch(sess)
+
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default():
+          # Basic test: read from file 0.
+          self.outputs = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              drop_final_batch=True)
+          for _, tensor in self.outputs.items():
+            if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
+              self.assertEqual(tensor.shape[0], batch_size)
+
+
+class MakeCsvDatasetTest(test.TestCase):
+
+  COLUMN_TYPES = [
+      dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string
+  ]
+  COLUMNS = ["col%d" % i for i in range(len(COLUMN_TYPES))]
+  DEFAULT_VALS = [[], [], [], [], ["NULL"]]
+  DEFAULTS = [
+      constant_op.constant([], dtype=dtypes.int32),
+      constant_op.constant([], dtype=dtypes.int64),
+      constant_op.constant([], dtype=dtypes.float32),
+      constant_op.constant([], dtype=dtypes.float64),
+      constant_op.constant(["NULL"], dtype=dtypes.string)
+  ]
+  LABEL = COLUMNS[0]
+
+  def setUp(self):
+    super(MakeCsvDatasetTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 11
+    self._test_filenames = self._create_files()
+
+  def _csv_values(self, fileno, recordno):
+    return [
+        fileno,
+        recordno,
+        fileno * recordno * 0.5,
+        fileno * recordno + 0.5,
+        "record %d" % recordno if recordno % 2 == 1 else "",
+    ]
+
+  def _write_file(self, filename, rows):
+    for i in range(len(rows)):
+      if isinstance(rows[i], list):
+        rows[i] = ",".join(str(v) if v is not None else "" for v in rows[i])
+    fn = os.path.join(self.get_temp_dir(), filename)
+    f = open(fn, "w")
+    f.write("\n".join(rows))
+    f.close()
+    return fn
+
+  def _create_file(self, fileno, header=True, comment=True):
+    rows = []
+    if header:
+      rows.append(self.COLUMNS)
+    for recno in range(self._num_records):
+      rows.append(self._csv_values(fileno, recno))
+      if comment:
+        rows.append("# Some comment goes here. Ignore me.")
+    return self._write_file("csv_file%d.csv" % fileno, rows)
+
+  def _create_files(self):
+    filenames = []
+    for i in range(self._num_files):
+      filenames.append(self._create_file(i))
+    return filenames
+
+  def _make_csv_dataset(
+      self,
+      filenames,
+      defaults,
+      column_names=COLUMNS,
+      label_name=LABEL,
+      select_cols=None,
+      batch_size=1,
+      num_epochs=1,
+      shuffle=False,
+      shuffle_seed=None,
+      header=True,
+      comment="#",
+      na_value="",
+      default_float_type=dtypes.float32,
+  ):
+    return readers.make_csv_dataset(
+        filenames,
+        batch_size=batch_size,
+        column_names=column_names,
+        column_defaults=defaults,
+        label_name=label_name,
+        num_epochs=num_epochs,
+        shuffle=shuffle,
+        shuffle_seed=shuffle_seed,
+        header=header,
+        comment=comment,
+        na_value=na_value,
+        default_float_type=default_float_type,
+        select_columns=select_cols,
+    )
+
+  def _next_actual_batch(self, file_indices, batch_size, num_epochs, defaults):
+    features = {col: list() for col in self.COLUMNS}
+    for _ in range(num_epochs):
+      for i in file_indices:
+        for j in range(self._num_records):
+          values = self._csv_values(i, j)
+          for n, v in enumerate(values):
+            if v == "":  # pylint: disable=g-explicit-bool-comparison
+              values[n] = defaults[n][0]
+          values[-1] = values[-1].encode("utf-8")
+
+          # Regroup lists by column instead of row
+          for n, col in enumerate(self.COLUMNS):
+            features[col].append(values[n])
+          if len(list(features.values())[0]) == batch_size:
+            yield features
+            features = {col: list() for col in self.COLUMNS}
+
+  def _run_actual_batch(self, outputs, sess):
+    features, labels = sess.run(outputs)
+    batch = [features[k] for k in self.COLUMNS if k != self.LABEL]
+    batch.append(labels)
+    return batch
+
+  def _verify_records(
+      self,
+      sess,
+      dataset,
+      file_indices,
+      defaults=tuple(DEFAULT_VALS),
+      label_name=LABEL,
+      batch_size=1,
+      num_epochs=1,
+  ):
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+
+    for expected_features in self._next_actual_batch(file_indices, batch_size,
+                                                     num_epochs, defaults):
+      actual_features = sess.run(get_next)
+
+      if label_name is not None:
+        expected_labels = expected_features.pop(label_name)
+        # Compare labels
+        self.assertAllEqual(expected_labels, actual_features[1])
+        actual_features = actual_features[0]  # Extract features dict from tuple
+
+      for k in expected_features.keys():
+        # Compare features
+        self.assertAllEqual(expected_features[k], actual_features[k])
+
+    with self.assertRaises(errors.OutOfRangeError):
+      sess.run(get_next)
+
+  def testMakeCSVDataset(self):
+    defaults = self.DEFAULTS
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Basic test: read from file 0.
+        dataset = self._make_csv_dataset(self._test_filenames[0], defaults)
+        self._verify_records(sess, dataset, [0])
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Basic test: read from file 1.
+        dataset = self._make_csv_dataset(self._test_filenames[1], defaults)
+        self._verify_records(sess, dataset, [1])
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Read from both files.
+        dataset = self._make_csv_dataset(self._test_filenames, defaults)
+        self._verify_records(sess, dataset, range(self._num_files))
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Read from both files. Exercise the `batch` and `num_epochs` parameters
+        # of make_csv_dataset and make sure they work.
+        dataset = self._make_csv_dataset(
+            self._test_filenames, defaults, batch_size=2, num_epochs=10)
+        self._verify_records(
+            sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
+
+  def testMakeCSVDataset_withBadColumns(self):
+    """Tests that exception is raised when input is malformed.
+    """
+    dupe_columns = self.COLUMNS[:-1] + self.COLUMNS[:1]
+    defaults = self.DEFAULTS
+
+    # Duplicate column names
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames, defaults, column_names=dupe_columns)
+
+    # Label key not one of column names
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames, defaults, label_name="not_a_real_label")
+
+  def testMakeCSVDataset_withNoLabel(self):
+    """Tests that CSV datasets can be created when no label is specified.
+    """
+    defaults = self.DEFAULTS
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Read from both files. Make sure this works with no label key supplied.
+        dataset = self._make_csv_dataset(
+            self._test_filenames,
+            defaults,
+            batch_size=2,
+            num_epochs=10,
+            label_name=None)
+        self._verify_records(
+            sess,
+            dataset,
+            range(self._num_files),
+            batch_size=2,
+            num_epochs=10,
+            label_name=None)
+
+  def testMakeCSVDataset_withNoComments(self):
+    """Tests that datasets can be created from CSV files with no header line.
+    """
+    defaults = self.DEFAULTS
+    file_without_header = self._create_file(
+        len(self._test_filenames), comment=False)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            file_without_header,
+            defaults,
+            batch_size=2,
+            num_epochs=10,
+            comment=None,
+        )
+        self._verify_records(
+            sess,
+            dataset,
+            [len(self._test_filenames)],
+            batch_size=2,
+            num_epochs=10,
+        )
+
+  def testMakeCSVDataset_withNoHeader(self):
+    """Tests that datasets can be created from CSV files with no header line.
+    """
+    defaults = self.DEFAULTS
+    file_without_header = self._create_file(
+        len(self._test_filenames), header=False)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            file_without_header,
+            defaults,
+            batch_size=2,
+            num_epochs=10,
+            header=False,
+        )
+        self._verify_records(
+            sess,
+            dataset,
+            [len(self._test_filenames)],
+            batch_size=2,
+            num_epochs=10,
+        )
+
+  def testMakeCSVDataset_withTypes(self):
+    """Tests that defaults can be a dtype instead of a Tensor for required vals.
+    """
+    defaults = [d for d in self.COLUMN_TYPES[:-1]]
+    defaults.append(constant_op.constant(["NULL"], dtype=dtypes.string))
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(self._test_filenames, defaults)
+        self._verify_records(sess, dataset, range(self._num_files))
+
+  def testMakeCSVDataset_withNoColNames(self):
+    """Tests that datasets can be created when column names are not specified.
+
+    In that case, we should infer the column names from the header lines.
+    """
+    defaults = self.DEFAULTS
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Read from both files. Exercise the `batch` and `num_epochs` parameters
+        # of make_csv_dataset and make sure they work.
+        dataset = self._make_csv_dataset(
+            self._test_filenames,
+            defaults,
+            column_names=None,
+            batch_size=2,
+            num_epochs=10)
+        self._verify_records(
+            sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
+
+  def testMakeCSVDataset_withTypeInferenceMismatch(self):
+    # Test that error is thrown when num fields doesn't match columns
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames,
+          column_names=self.COLUMNS + ["extra_name"],
+          defaults=None,
+          batch_size=2,
+          num_epochs=10)
+
+  def testMakeCSVDataset_withTypeInference(self):
+    """Tests that datasets can be created when no defaults are specified.
+
+    In that case, we should infer the types from the first N records.
+    """
+    # Test that it works with standard test files (with comments, header, etc)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            self._test_filenames, defaults=None, batch_size=2, num_epochs=10)
+        self._verify_records(
+            sess,
+            dataset,
+            range(self._num_files),
+            batch_size=2,
+            num_epochs=10,
+            defaults=[[], [], [], [], [""]])
+
+    # Test on a deliberately tricky file
+    fn = os.path.join(self.get_temp_dir(), "file.csv")
+    expected_dtypes = [
+        dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32,
+        dtypes.string, dtypes.string
+    ]
+    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
+    rows = [[None, None, None, "NAN", "",
+             "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
+            ['"123"', 2, 2**64, 123.4, "NAN", '"cd,efg"']]
+    expected = [[0, 0, 0, 0, "", "a"], [1, 2**31 + 1, 2**64, 123, "", ""],
+                [123, 2, 2**64, 123.4, "", "cd,efg"]]
+    for row in expected:
+      row[-1] = row[-1].encode("utf-8")  # py3 expects byte strings
+      row[-2] = row[-2].encode("utf-8")  # py3 expects byte strings
+    self._write_file("file.csv", [col_names] + rows)
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            na_value="NAN",
+            default_float_type=dtypes.float32,
+        )
+        features = dataset.make_one_shot_iterator().get_next()
+        # Check that types match
+        for i in range(len(expected_dtypes)):
+          assert features["col%d" % i].dtype == expected_dtypes[i]
+        for i in range(len(rows)):
+          assert sess.run(features) == dict(zip(col_names, expected[i]))
+
+    # With float64 as default type for floats
+    expected_dtypes = [
+        dtypes.int32, dtypes.int64, dtypes.float64, dtypes.float64,
+        dtypes.string, dtypes.string
+    ]
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            na_value="NAN",
+            default_float_type=dtypes.float64,
+        )
+        features = dataset.make_one_shot_iterator().get_next()
+        # Check that types match
+        for i in range(len(expected_dtypes)):
+          self.assertAllEqual(features["col%d" % i].dtype, expected_dtypes[i])
+        for i in range(len(rows)):
+          self.assertAllEqual(
+              sess.run(features), dict(zip(col_names, expected[i])))
+
+  def testMakeCSVDataset_withSelectColsError(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    with self.assertRaises(ValueError):
+      # Mismatch in number of defaults and number of columns selected,
+      # should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]] * 5,
+          column_names=col_names,
+          label_name=None,
+          select_cols=[1, 3])
+    with self.assertRaises(ValueError):
+      # Invalid column name should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]],
+          column_names=col_names,
+          label_name=None,
+          select_cols=["invalid_col_name"])
+
+  def testMakeCSVDataset_withSelectCols(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    # If select_cols is specified, should only yield a subset of columns
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=[[0], [0]],
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do default inference with select_cols
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do column name inference
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can specify column names instead of indices
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[col_names[1], col_names[3]])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+
+  def testMakeCSVDataset_withShuffle(self):
+    total_records = self._num_files * self._num_records
+    defaults = self.DEFAULTS
+    for batch_size in [1, 2]:
+      with ops.Graph().as_default() as g:
+        with self.test_session(graph=g) as sess:
+          # Test that shuffling with the same seed produces the same result
+          dataset1 = self._make_csv_dataset(
+              self._test_filenames,
+              defaults,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5)
+          dataset2 = self._make_csv_dataset(
+              self._test_filenames,
+              defaults,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5)
+          outputs1 = dataset1.make_one_shot_iterator().get_next()
+          outputs2 = dataset2.make_one_shot_iterator().get_next()
+          for _ in range(total_records // batch_size):
+            batch1 = self._run_actual_batch(outputs1, sess)
+            batch2 = self._run_actual_batch(outputs2, sess)
+            for i in range(len(batch1)):
+              self.assertAllEqual(batch1[i], batch2[i])
+
+      with ops.Graph().as_default() as g:
+        with self.test_session(graph=g) as sess:
+          # Test that shuffling with a different seed produces different results
+          dataset1 = self._make_csv_dataset(
+              self._test_filenames,
+              defaults,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=5)
+          dataset2 = self._make_csv_dataset(
+              self._test_filenames,
+              defaults,
+              batch_size=batch_size,
+              shuffle=True,
+              shuffle_seed=6)
+          outputs1 = dataset1.make_one_shot_iterator().get_next()
+          outputs2 = dataset2.make_one_shot_iterator().get_next()
+          all_equal = False
+          for _ in range(total_records // batch_size):
+            batch1 = self._run_actual_batch(outputs1, sess)
+            batch2 = self._run_actual_batch(outputs2, sess)
+            for i in range(len(batch1)):
+              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+          self.assertFalse(all_equal)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 3c7b46629edb13459766b5ef3f392e8d00ad4db8..5f47dcb33999119a690bd633f0c97a12a1ae1c84 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -21,7 +21,10 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import resampling
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -45,12 +48,10 @@ class ResampleTest(test.TestCase):
                 target_dist=target_dist,
                 initial_dist=initial_dist,
                 class_func=lambda c, _: c,
-                seed=27)).make_initializable_iterator())
-    init_op = iterator.initializer
+                seed=27)).make_one_shot_iterator())
     get_next = iterator.get_next()
 
     with self.test_session() as sess:
-      sess.run(init_op)
       returned = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
@@ -70,6 +71,43 @@ class ResampleTest(test.TestCase):
     returned_dist = class_counts / total_returned
     self.assertAllClose(target_dist, returned_dist, atol=1e-2)
 
+  def testRandomClasses(self):
+    init_dist = [0.25, 0.25, 0.25, 0.25]
+    target_dist = [0.0, 0.0, 0.0, 1.0]
+    num_classes = len(init_dist)
+    # We don't need many samples to test a dirac-delta target distribution
+    num_samples = 100
+    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(data_np)
+
+    # Apply a random mapping that preserves the data distribution.
+    def _remap_fn(_):
+      return math_ops.cast(random_ops.random_uniform([1]) * num_classes,
+                           dtypes.int32)[0]
+    dataset = dataset.map(_remap_fn)
+
+    # Reshape distribution.
+    dataset = dataset.apply(
+        resampling.rejection_resample(
+            class_func=lambda x: x,
+            target_dist=target_dist,
+            initial_dist=init_dist))
+
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.test_session() as sess:
+      returned = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          returned.append(sess.run(get_next))
+
+    classes, _ = zip(*returned)
+    bincount = np.bincount(
+        np.array(classes),
+        minlength=num_classes).astype(np.float32) / len(classes)
+
+    self.assertAllClose(target_dist, bincount, atol=1e-2)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
index e0494736b72ae52f586cb80d42a5c1e50ac17a61..1a97a84b2cba13e82c8af9c4c8ee413ee8264a5e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
@@ -24,9 +24,11 @@ import numpy as np
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import scan_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -57,19 +59,24 @@ class ScanDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFibonacci(self):
     iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
         scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))
     ).make_one_shot_iterator()
-    next_element = iterator.get_next()
 
-    with self.test_session() as sess:
-      self.assertEqual(1, sess.run(next_element))
-      self.assertEqual(1, sess.run(next_element))
-      self.assertEqual(2, sess.run(next_element))
-      self.assertEqual(3, sess.run(next_element))
-      self.assertEqual(5, sess.run(next_element))
-      self.assertEqual(8, sess.run(next_element))
+    if context.executing_eagerly():
+      next_element = iterator.get_next
+    else:
+      get_next = iterator.get_next()
+      next_element = lambda: get_next
+
+    self.assertEqual(1, self.evaluate(next_element()))
+    self.assertEqual(1, self.evaluate(next_element()))
+    self.assertEqual(2, self.evaluate(next_element()))
+    self.assertEqual(3, self.evaluate(next_element()))
+    self.assertEqual(5, self.evaluate(next_element()))
+    self.assertEqual(8, self.evaluate(next_element()))
 
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 36ddf3004237ed042f21d691d83eafbaa20621e6..d0cb203a3afd2775756c8542a1e86faedc5cee53 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest(
     # Skip nothing
     self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
 
+  def testInvalidSkip(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0)
+
   def _build_take_dataset(self, count):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(count)
@@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest(
     # Take nothing
     self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
 
+  def testInvalidTake(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0)
+
   def _build_repeat_dataset(self, count, take_count=3):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(
@@ -100,6 +110,12 @@ class SequenceDatasetSerializationTest(
     # Test repeat empty dataset
     self.run_core_tests(lambda: self._build_repeat_dataset(-1, 0), None, 0)
 
+  def testInvalidRepeat(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_repeat_dataset([1, 2], 0),
+                          None, 0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..33c48e20bea53b88d69a59e715af38b22dd2cbd4
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -0,0 +1,242 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import sliding
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class SlideDatasetTest(test.TestCase):
+
+  def testSlideDataset(self):
+    """Test an dataset that maps a TF function across its input elements."""
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+    window_size = array_ops.placeholder(dtypes.int64, shape=[])
+    stride = array_ops.placeholder(dtypes.int64, shape=[])
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(count) -> _SlideDataset(window_size, stride).
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+                .map(_map_fn)
+                .repeat(count)
+                .apply(sliding.sliding_window_batch(window_size, stride))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
+                     [t.shape.as_list() for t in get_next])
+
+    with self.test_session() as sess:
+      # Slide over a finite input, where the window_size divides the
+      # total number of elements.
+      sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 7})
+      # Same formula with convolution layer.
+      num_batches = (20 * 7 - 14) // 7 + 1
+      for i in range(num_batches):
+        result = sess.run(get_next)
+        for component, result_component in zip(components, result):
+          for j in range(14):
+            self.assertAllEqual(component[(i*7 + j) % 7]**2,
+                                result_component[j])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Slide over a finite input, where the window_size does not
+      # divide the total number of elements.
+      sess.run(init_op, feed_dict={count: 20, window_size: 17, stride: 9})
+
+      num_batches = (20 * 7 - 17) // 9 + 1
+      for i in range(num_batches):
+        result = sess.run(get_next)
+        for component, result_component in zip(components, result):
+          for j in range(17):
+            self.assertAllEqual(component[(i*9 + j) % 7]**2,
+                                result_component[j])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Slide over a finite input, which is less than window_size,
+      # should fail straight away.
+      sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 4})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 8})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Slide over an empty input should fail straight away.
+      sess.run(init_op, feed_dict={count: 0, window_size: 8, stride: 4})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Empty window_size should be an initialization time error.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={count: 14, window_size: 0, stride: 0})
+
+      # Invalid stride should be an initialization time error.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 0})
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 3})
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 5})
+
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testSlideSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
+        sliding.sliding_window_batch(5, 3)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      num_batches = (10 - 5) // 3 + 1
+      for i in range(num_batches):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSlideSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
+        sliding.sliding_window_batch(5, 3)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      num_batches = (10 - 5) // 3 + 1
+      for i in range(num_batches):
+        actual = sess.run(get_next)
+        expected_indices = []
+        expected_values = []
+        for j in range(5):
+          for k in range(i * 3 + j):
+            expected_indices.append([j, k])
+            expected_values.append(i * 3 + j)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=expected_indices,
+            values=expected_values,
+            dense_shape=[5, i * 3 + 5 - 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedSlideSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(_sparse)
+                .apply(sliding.sliding_window_batch(4, 2))
+                .apply(sliding.sliding_window_batch(3, 1))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      # Slide: 1st batch.
+      actual = sess.run(get_next)
+      expected = sparse_tensor.SparseTensorValue(
+          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0],
+                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0],
+                   [2, 0, 0], [2, 1, 0], [2, 2, 0], [2, 3, 0]],
+          values=[0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7],
+          dense_shape=[3, 4, 1])
+      self.assertTrue(sparse_tensor.is_sparse(actual))
+      self.assertSparseValuesEqual(actual, expected)
+      # Slide: 2nd batch.
+      actual = sess.run(get_next)
+      expected = sparse_tensor.SparseTensorValue(
+          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0],
+                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0],
+                   [2, 0, 0], [2, 1, 0], [2, 2, 0], [2, 3, 0]],
+          values=[2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9],
+          dense_shape=[3, 4, 1])
+      self.assertTrue(sparse_tensor.is_sparse(actual))
+      self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSlideShapeError(self):
+
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    iterator = (dataset_ops.Dataset.from_generator(generator, dtypes.float32,
+                                                   output_shapes=[None])
+                .apply(sliding.sliding_window_batch(3, 1))
+                .make_initializable_iterator())
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Cannot batch tensors with different shapes in component 0. "
+          r"First element had shape \[3\] and element 2 had shape \[4\]."):
+        sess.run(next_element)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 07bdf920446e953c2a1abaf495d2e9e1256106fd..5c74ed6ae7210e8e22efb6e8fdb773397459ce1e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -50,17 +50,17 @@ class StatsDatasetTest(test.TestCase):
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
   def testBytesProduced(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
-            stats_ops.bytes_produced_stats("bytes_produced"))
+            stats_ops.bytes_produced_stats("bytes_produced")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       expected_sum = 0.0
       for i in range(100):
         self.assertAllEqual(
@@ -76,16 +76,16 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
   def testLatencyStats(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -95,16 +95,15 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
 
   def testReinitialize(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(stats_aggregator_subscriber)
       for j in range(5):
         sess.run(iterator.initializer)
         for i in range(100):
@@ -130,17 +129,17 @@ class StatsDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def testMultipleTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency_2"))
+            stats_ops.latency_stats("record_latency_2")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -154,17 +153,17 @@ class StatsDatasetTest(test.TestCase):
           sess.run(summary_t), "record_latency_2", 100.0)
 
   def testRepeatedTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency"))
+            stats_ops.latency_stats("record_latency")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -174,19 +173,17 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
   def testMultipleIteratorsSameAggregator(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator_0 = dataset.make_initializable_iterator()
     iterator_1 = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscribers = [stats_aggregator.subscribe(iterator_0),
-                                    stats_aggregator.subscribe(iterator_1)]
     next_element = iterator_0.get_next() + iterator_1.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer,
-                stats_aggregator_subscribers])
+      sess.run([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
         self.assertEqual(i * 2, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -195,20 +192,6 @@ class StatsDatasetTest(test.TestCase):
         sess.run(next_element)
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
-  def testMultipleStatsAggregatorsSameIteratorFail(self):
-    dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
-    iterator = dataset.make_initializable_iterator()
-    stats_aggregator_0 = stats_ops.StatsAggregator()
-    stats_aggregator_1 = stats_ops.StatsAggregator()
-
-    with self.test_session() as sess:
-      sess.run(stats_aggregator_0.subscribe(iterator))
-      # TODO(mrry): Consider making this allowable (and also allowing
-      # aggregators to unsubscribe).
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(stats_aggregator_1.subscribe(iterator))
-
 
 class StatsDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -218,6 +201,14 @@ class StatsDatasetSerializationTest(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
 
+  def test_bytes_produced_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.bytes_produced_stats(["bytes_produced"])),
+          None, 100)
+
   def testBytesStatsDatasetSaveableCore(self):
     num_outputs = 100
     self.run_core_tests(
@@ -235,6 +226,14 @@ class StatsDatasetSerializationTest(
     return dataset_ops.Dataset.range(num_elements).apply(
         stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
 
+  def test_latency_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.latency_stats(["record_latency", "record_latency_2"])),
+          None, 100)
+
   def testLatencyStatsDatasetSaveableCore(self):
     num_outputs = 100
 
@@ -253,5 +252,9 @@ class StatsDatasetSerializationTest(
         None, num_outputs)
 
 
+# TODO(shivaniagrawal): Can not checkpoint input_pipeline with the
+# transformation `stats_ops.set_stats_aggregator`, since we don't support
+# serializing StatsAggregator yet.
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9167cb3379bba5cb1ba76a96549395c45dca9e35
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/threadpool_dataset_ops_test.py
@@ -0,0 +1,77 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline statistics gathering ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import threadpool
+from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+class OverrideThreadpoolDatasetTest(test.TestCase):
+
+  def testNumThreads(self):
+
+    def get_thread_id(_):
+      # Python creates a dummy thread object to represent the current
+      # thread when called from an "alien" thread (such as a
+      # `PrivateThreadPool` thread in this case). It does not include
+      # the TensorFlow-given display name, but it has a unique
+      # identifier that maps one-to-one with the underlying OS thread.
+      return np.array(threading.current_thread().ident).astype(np.int64)
+
+    for num_threads in [1, 2, 4, 8, 16]:
+
+      dataset = (
+          dataset_ops.Dataset.range(1000).map(
+              lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
+              num_parallel_calls=32).apply(unique.unique()))
+
+      dataset = threadpool.override_threadpool(
+          dataset,
+          threadpool.PrivateThreadPool(
+              num_threads, display_name="private_thread_pool_%d" % num_threads))
+
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with self.test_session() as sess:
+        sess.run(iterator.initializer)
+        thread_ids = []
+        try:
+          while True:
+            thread_ids.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+        self.assertEqual(len(thread_ids), len(set(thread_ids)))
+        self.assertGreater(len(thread_ids), 0)
+        # NOTE(mrry): We don't control the thread pool scheduling, and
+        # so cannot guarantee that all of the threads in the pool will
+        # perform work.
+        self.assertLessEqual(len(thread_ids), num_threads)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c603ecc5ab27a711557376246b093fd5f80f8aec
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.data.python.ops import writers
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class TFRecordWriterTest(test.TestCase):
+
+  def setUp(self):
+    super(TFRecordWriterTest, self).setUp()
+    self._num_records = 7
+    self.filename = array_ops.placeholder(dtypes.string, shape=[])
+    self.compression_type = array_ops.placeholder_with_default("", shape=[])
+
+    input_dataset = readers.TFRecordDataset([self.filename],
+                                            self.compression_type)
+    self.writer = writers.TFRecordWriter(
+        self._outputFilename(), self.compression_type).write(input_dataset)
+
+  def _record(self, i):
+    return compat.as_bytes("Record %d" % (i))
+
+  def _createFile(self, options=None):
+    filename = self._inputFilename()
+    writer = python_io.TFRecordWriter(filename, options)
+    for i in range(self._num_records):
+      writer.write(self._record(i))
+    writer.close()
+    return filename
+
+  def _inputFilename(self):
+    return os.path.join(self.get_temp_dir(), "tf_record.in.txt")
+
+  def _outputFilename(self):
+    return os.path.join(self.get_temp_dir(), "tf_record.out.txt")
+
+  def testWrite(self):
+    with self.test_session() as sess:
+      sess.run(
+          self.writer, feed_dict={
+              self.filename: self._createFile(),
+          })
+    for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
+      self.assertAllEqual(self._record(i), r)
+
+  def testWriteZLIB(self):
+    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
+    with self.test_session() as sess:
+      sess.run(
+          self.writer,
+          feed_dict={
+              self.filename: self._createFile(options),
+              self.compression_type: "ZLIB",
+          })
+    for i, r in enumerate(
+        tf_record.tf_record_iterator(self._outputFilename(), options=options)):
+      self.assertAllEqual(self._record(i), r)
+
+  def testWriteGZIP(self):
+    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
+    with self.test_session() as sess:
+      sess.run(
+          self.writer,
+          feed_dict={
+              self.filename: self._createFile(options),
+              self.compression_type: "GZIP",
+          })
+    for i, r in enumerate(
+        tf_record.tf_record_iterator(self._outputFilename(), options=options)):
+      self.assertAllEqual(self._record(i), r)
+
+  def testFailDataset(self):
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write("whoops")
+
+  def testFailDType(self):
+    input_dataset = dataset_ops.Dataset.from_tensors(10)
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write(input_dataset)
+
+  def testFailShape(self):
+    input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]])
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write(input_dataset)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index b488357f226d0922bba3799cc1f4b5c75e2e8328..5b04c5316cfbb7577b3f8b3b6d364fc665d14c21 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -12,18 +12,26 @@ load(
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 py_library(
-    name = "dataset_ops",
-    srcs = [
-        "counter.py",
-        "get_single_element.py",
+    name = "counter",
+    srcs = ["counter.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":scan_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+)
+
+py_library(
+    name = "get_single_element",
+    srcs = ["get_single_element.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":transformation_ops",
         "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
@@ -66,18 +74,25 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":dataset_ops",
+        ":batching",
+        ":interleave_ops",
+        ":shuffle_ops",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -88,46 +103,192 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":random_ops",
-        ":transformation_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 py_library(
-    name = "transformation_ops",
-    srcs = [
-        "batching.py",
-        "enumerate_ops.py",
-        "error_ops.py",
-        "grouping.py",
-        "interleave_ops.py",
-        "resampling.py",
-        "scan_ops.py",
-        "stats_ops.py",
-        "unique.py",
+    name = "batching",
+    srcs = ["batching.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
+)
+
+py_library(
+    name = "enumerate_ops",
+    srcs = ["enumerate_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "error_ops",
+    srcs = ["error_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":contrib_op_loader",
         ":gen_dataset_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "grouping",
+    srcs = ["grouping.py"],
+    srcs_version = "PY2AND3",
+    deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
-        "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "interleave_ops",
+    srcs = ["interleave_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        ":random_ops",
+        "//tensorflow/contrib/stateless",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "resampling",
+    srcs = ["resampling.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batching",
+        ":scan_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "scan_ops",
+    srcs = ["scan_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
-        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "sliding",
+    srcs = ["sliding.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "threadpool",
+    srcs = ["threadpool.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "unique",
+    srcs = [
+        "unique.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "writers",
+    srcs = [
+        "writers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -167,17 +328,36 @@ py_library(
     srcs = ["prefetching_ops.py"],
     deps = [
         ":contrib_op_loader",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+py_library(
+    name = "dataset_ops",
+    deps = [
+        ":batching",
+        ":counter",
+        ":enumerate_ops",
+        ":error_ops",
+        ":get_single_element",
+        ":grouping",
+        ":interleave_ops",
+        ":prefetching_ops",
+        ":readers",
+        ":resampling",
+        ":scan_ops",
+        ":shuffle_ops",
+        ":sliding",
+        ":stats_ops",
+        ":threadpool",
+        ":unique",
+        ":writers",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
 )
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 6eb512dec67cb7b9c8c4518d03aee0b436205f9a..2152bcde84aae6b0c2b368e43750aafab3a04bf2 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@@ -79,28 +80,98 @@ def dense_to_sparse_batch(batch_size, row_shape):
   return _apply_fn
 
 
+class UnbatchDataset(dataset_ops.Dataset):
+  """A dataset that splits the elements of its input into multiple elements."""
+
+  def __init__(self, input_dataset):
+    """See `unbatch()` for more details."""
+    super(UnbatchDataset, self).__init__()
+    flat_shapes = nest.flatten(input_dataset.output_shapes)
+    if any(s.ndims == 0 for s in flat_shapes):
+      raise ValueError("Cannot unbatch an input with scalar components.")
+    known_batch_dim = tensor_shape.Dimension(None)
+    for s in flat_shapes:
+      try:
+        known_batch_dim = known_batch_dim.merge_with(s[0])
+      except ValueError:
+        raise ValueError("Cannot unbatch an input whose components have "
+                         "different batch sizes.")
+    self._input_dataset = input_dataset
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.unbatch_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return nest.map_structure(lambda s: s[1:],
+                              self._input_dataset.output_shapes)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
 def unbatch():
-  """A Transformation which splits the elements of a dataset.
+  """Splits elements of a dataset into multiple elements on the batch dimension.
 
   For example, if elements of the dataset are shaped `[B, a0, a1, ...]`,
-  where `B` may vary from element to element, then for each element in
-  the dataset, the unbatched dataset will contain `B` consecutive elements
+  where `B` may vary for each input element, then for each element in the
+  dataset, the unbatched dataset will contain `B` consecutive elements
   of shape `[a0, a1, ...]`.
 
+  ```python
+  # NOTE: The following example uses `{ ... }` to represent the contents
+  # of a dataset.
+  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+  a.apply(tf.contrib.data.unbatch()) == {
+      'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
+  ```
+
   Returns:
     A `Dataset` transformation function, which can be passed to
     @{tf.data.Dataset.apply}.
   """
 
   def _apply_fn(dataset):
-
-    def unbatch_map(arg, *rest):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    if not sparse.any_sparse(dataset.output_classes):
+      return UnbatchDataset(dataset)
+
+    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
+    # are normalized to the rank-1 dense representation, so that the
+    # sparse-oblivious unbatching logic will slice them
+    # appropriately. This leads to a somewhat inefficient re-encoding step
+    # for all SparseTensor components.
+    # TODO(mrry): Consider optimizing this in future
+    # if it turns out to be a bottleneck.
+    def normalize(arg, *rest):
       if rest:
-        return dataset_ops.Dataset.from_tensor_slices((arg,) + rest)
+        return sparse.serialize_many_sparse_tensors((arg,) + rest)
       else:
-        return dataset_ops.Dataset.from_tensor_slices(arg)
+        return sparse.serialize_many_sparse_tensors(arg)
+
+    normalized_dataset = dataset.map(normalize)
 
-    return dataset.flat_map(map_func=unbatch_map)
+    # NOTE(mrry): Our `map()` has lost information about the sparseness
+    # of any SparseTensor components, so re-apply the structure of the
+    # original dataset.
+    restructured_dataset = _RestructuredDataset(
+        normalized_dataset,
+        dataset.output_types,
+        dataset.output_shapes,
+        dataset.output_classes,
+        allow_unsafe_cast=True)
+    return UnbatchDataset(restructured_dataset)
 
   return _apply_fn
 
@@ -264,7 +335,8 @@ class _RestructuredDataset(dataset_ops.Dataset):
                dataset,
                output_types,
                output_shapes=None,
-               output_classes=None):
+               output_classes=None,
+               allow_unsafe_cast=False):
     """Creates a new dataset with the given output types and shapes.
 
     The given `dataset` must have a structure that is convertible:
@@ -282,6 +354,10 @@ class _RestructuredDataset(dataset_ops.Dataset):
         If omitted, the shapes will be inherited from `dataset`.
       output_classes: (Optional.) A nested structure of class types.
         If omitted, the class types will be inherited from `dataset`.
+      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
+        reported output types and shapes of the restructured dataset, e.g. to
+        switch a sparse tensor represented as `tf.variant` to its user-visible
+        type and shape.
 
     Raises:
       ValueError: If either `output_types` or `output_shapes` is not compatible
@@ -290,14 +366,15 @@ class _RestructuredDataset(dataset_ops.Dataset):
     super(_RestructuredDataset, self).__init__()
     self._dataset = dataset
 
-    # Validate that the types are compatible.
-    output_types = nest.map_structure(dtypes.as_dtype, output_types)
-    flat_original_types = nest.flatten(dataset.output_types)
-    flat_new_types = nest.flatten(output_types)
-    if flat_original_types != flat_new_types:
-      raise ValueError(
-          "Dataset with output types %r cannot be restructured to have output "
-          "types %r" % (dataset.output_types, output_types))
+    if not allow_unsafe_cast:
+      # Validate that the types are compatible.
+      output_types = nest.map_structure(dtypes.as_dtype, output_types)
+      flat_original_types = nest.flatten(dataset.output_types)
+      flat_new_types = nest.flatten(output_types)
+      if flat_original_types != flat_new_types:
+        raise ValueError(
+            "Dataset with output types %r cannot be restructured to have "
+            "output types %r" % (dataset.output_types, output_types))
 
     self._output_types = output_types
 
@@ -307,18 +384,19 @@ class _RestructuredDataset(dataset_ops.Dataset):
                                                   nest.flatten(
                                                       dataset.output_shapes))
     else:
-      # Validate that the shapes are compatible.
-      nest.assert_same_structure(output_types, output_shapes)
-      flat_original_shapes = nest.flatten(dataset.output_shapes)
-      flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
-
-      for original_shape, new_shape in zip(flat_original_shapes,
-                                           flat_new_shapes):
-        if not original_shape.is_compatible_with(new_shape):
-          raise ValueError(
-              "Dataset with output shapes %r cannot be restructured to have "
-              "incompatible output shapes %r" % (dataset.output_shapes,
-                                                 output_shapes))
+      if not allow_unsafe_cast:
+        # Validate that the shapes are compatible.
+        nest.assert_same_structure(output_types, output_shapes)
+        flat_original_shapes = nest.flatten(dataset.output_shapes)
+        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
+
+        for original_shape, new_shape in zip(flat_original_shapes,
+                                             flat_new_shapes):
+          if not original_shape.is_compatible_with(new_shape):
+            raise ValueError(
+                "Dataset with output shapes %r cannot be restructured to have "
+                "incompatible output shapes %r" % (dataset.output_shapes,
+                                                   output_shapes))
       self._output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
     if output_classes is None:
@@ -345,16 +423,62 @@ class _RestructuredDataset(dataset_ops.Dataset):
     return self._output_shapes
 
 
+def assert_element_shape(expected_shapes):
+  """Assert the shape of this `Dataset`.
+
+  ```python
+  shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
+  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+  print(result.output_shapes)  # ==> "((16, 256), <unknown>)"
+  ```
+
+  If dataset shapes and expected_shape, are fully defined, assert they match.
+  Otherwise, add assert op that will validate the shapes when tensors are
+  evaluated, and set shapes on tensors, respectively.
+
+  Args:
+    expected_shapes: A nested structure of `tf.TensorShape` objects.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}
+  """
+
+  def _check_shape(*elements):
+    flatten_tensors = nest.flatten(elements)
+    flatten_shapes = nest.flatten(expected_shapes)
+    checked_tensors = [
+        with_shape(shape, tensor)
+        for shape, tensor in zip(flatten_shapes, flatten_tensors)
+    ]
+    return nest.pack_sequence_as(elements, checked_tensors)
+
+  def _apply_fn(dataset):
+    return _RestructuredDataset(
+        dataset.map(_check_shape),
+        dataset.output_types,
+        output_shapes=expected_shapes,
+        output_classes=dataset.output_classes)
+
+  return _apply_fn
+
+
 class _MapAndBatchDataset(dataset_ops.MapDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
-  def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches):
+  def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches,
+               drop_remainder):
     """See `Dataset.map()` for details."""
     super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
-    self._batch_size = ops.convert_to_tensor(
+    self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
-    self._num_parallel_batches = ops.convert_to_tensor(
+    self._num_parallel_batches_t = ops.convert_to_tensor(
         num_parallel_batches, dtype=dtypes.int64, name="num_parallel_batches")
+    self._drop_remainder_t = ops.convert_to_tensor(
+        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
+
+    self._batch_size = batch_size
+    self._drop_remainder = drop_remainder
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -363,8 +487,9 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
         input_resource,
         self._map_func.captured_inputs,
         f=self._map_func,
-        batch_size=self._batch_size,
-        num_parallel_batches=self._num_parallel_batches,
+        batch_size=self._batch_size_t,
+        num_parallel_batches=self._num_parallel_batches_t,
+        drop_remainder=self._drop_remainder_t,
         output_types=nest.flatten(
             sparse.as_dense_types(self.output_types, self.output_classes)),
         output_shapes=nest.flatten(
@@ -373,9 +498,9 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
 
   @property
   def output_shapes(self):
+    dim = self._batch_size if self._drop_remainder else None
     return nest.pack_sequence_as(self._output_shapes, [
-        tensor_shape.vector(tensor_util.constant_value(
-            self._batch_size)).concatenate(s)
+        tensor_shape.vector(dim).concatenate(s)
         for s in nest.flatten(self._output_shapes)
     ])
 
@@ -384,7 +509,10 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
     return self._output_types
 
 
-def map_and_batch(map_func, batch_size, num_parallel_batches=1):
+def map_and_batch(map_func,
+                  batch_size,
+                  num_parallel_batches=1,
+                  drop_remainder=False):
   """Fused implementation of `map` and `batch`.
 
   Maps `map_func` across `batch_size` consecutive elements of this dataset
@@ -404,6 +532,9 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
       number of batches to create in parallel. On one hand, higher values can
       help mitigate the effect of stragglers. On the other hand, higher values
       can increase contention if CPU is scarce.
+    drop_remainder: A `tf.bool` scalar `tf.Tensor`, representing whether the
+      last batch should be dropped in case its size is smaller than desired;
+      the default behavior is not to drop the smaller batch.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -412,6 +543,6 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
 
   def _apply_fn(dataset):
     return _MapAndBatchDataset(dataset, map_func, batch_size,
-                               num_parallel_batches)
+                               num_parallel_batches, drop_remainder)
 
   return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/counter.py b/tensorflow/contrib/data/python/ops/counter.py
index 63226fe78163c59025623a362d17c400fbe57c67..6ef65f9624601286691505a795a86dd6226eead1 100644
--- a/tensorflow/contrib/data/python/ops/counter.py
+++ b/tensorflow/contrib/data/python/ops/counter.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import ops
 
 
 def Counter(start=0, step=1, dtype=dtypes.int64):
-  """Creates a `Dataset` of a `step`-separated count startin from `start`.
+  """Creates a `Dataset` that counts from `start` in steps of size `step`.
 
   For example:
 
@@ -38,12 +38,13 @@ def Counter(start=0, step=1, dtype=dtypes.int64):
   ```
 
   Args:
-    start: starting value for count.
-    step: step size.
-    dtype: counter data type.
+    start: (Optional.) The starting value for the counter. Defaults to 0.
+    step: (Optional.) The step size for the counter. Defaults to 1.
+    dtype: (Optional.) The data type for counter elements. Defaults to
+      `tf.int64`.
 
   Returns:
-    A `Dataset` of scalar elements.
+    A `Dataset` of scalar `dtype` elements.
   """
   with ops.name_scope("counter"):
     start = ops.convert_to_tensor(start, dtype=dtype, name="start")
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
deleted file mode 100644
index 214641bb9a62e6cbdece07b511864a5cff20944d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ /dev/null
@@ -1,690 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python wrappers for Datasets and Iterators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import enumerate_ops
-from tensorflow.contrib.data.python.ops import error_ops
-from tensorflow.contrib.data.python.ops import grouping
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import gen_io_ops
-from tensorflow.python.util import deprecation
-
-
-class Dataset(dataset_ops.Dataset):
-  """Represents a potentially large set of elements.
-
-  A `Dataset` can be used to represent an input pipeline as a
-  collection of elements (nested structures of tensors) and a "logical
-  plan" of transformations that act on those elements.
-  """
-
-  def __init__(self, dataset):
-    super(Dataset, self).__init__()
-    self._dataset = dataset
-
-  @deprecation.deprecated(None, "Use `ds._as_variant_tensor()`.")
-  def make_dataset_resource(self):
-    return self._as_variant_tensor()
-
-  def _as_variant_tensor(self):
-    return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-  @property
-  def output_classes(self):
-    return self._dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._dataset.output_types
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensors()`.")
-  def from_tensors(tensors):
-    """Creates a `Dataset` with a single element, comprising the given tensors.
-
-    Args:
-      tensors: A nested structure of tensors.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.TensorDataset(tensors))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
-  def from_tensor_slices(tensors):
-    """Creates a `Dataset` whose elements are slices of the given tensors.
-
-    Args:
-      tensors: A nested structure of tensors, each having the same size in the
-        0th dimension.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.TensorSliceDataset(tensors))
-
-  @staticmethod
-  @deprecation.deprecated(None,
-                          "Use `tf.data.Dataset.from_sparse_tensor_slices()`.")
-  def from_sparse_tensor_slices(sparse_tensor):
-    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
-
-    Args:
-      sparse_tensor: A `tf.SparseTensor`.
-
-    Returns:
-      A `Dataset` of rank-(N-1) sparse tensors.
-    """
-    return Dataset(dataset_ops.SparseTensorSliceDataset(sparse_tensor))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_generator()`.")
-  def from_generator(generator, output_types, output_shapes=None):
-    """Creates a `Dataset` whose elements are generated by `generator`.
-
-    The `generator` argument must be a callable object that returns
-    an object that support the `iter()` protocol (e.g. a generator function).
-    The elements generated by `generator` must be compatible with the given
-    `output_types` and (optional) `output_shapes` arguments.
-
-    For example:
-
-    ```python
-    import itertools
-
-    def gen():
-      for i in itertools.count(1):
-        yield (i, [1] * i)
-
-    ds = Dataset.from_generator(
-        gen, (tf.int64, tf.int64), (tf.TensorShape([]), tf.TensorShape([None])))
-    value = ds.make_one_shot_iterator().get_next()
-
-    sess.run(value)  # (1, array([1]))
-    sess.run(value)  # (2, array([1, 1]))
-    ```
-
-    Args:
-      generator: A callable object that takes no arguments and returns an
-        object that supports the `iter()` protocol.
-      output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of an element yielded by `generator`.
-      output_shapes: (Optional.) A nested structure of `tf.TensorShape`
-        objects corresponding to each component of an element yielded by
-        `generator`.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.Dataset.from_generator(
-        generator, output_types, output_shapes))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.range()`.")
-  def range(*args):
-    """Creates a `Dataset` of a step-separated range of values.
-
-    For example:
-
-    ```python
-    Dataset.range(5) == [0, 1, 2, 3, 4]
-    Dataset.range(2, 5) == [2, 3, 4]
-    Dataset.range(1, 5, 2) == [1, 3]
-    Dataset.range(1, 5, -2) == []
-    Dataset.range(5, 1) == []
-    Dataset.range(5, 1, -2) == [5, 3]
-    ```
-
-    Args:
-      *args: follow same semantics as python's xrange.
-        len(args) == 1 -> start = 0, stop = args[0], step = 1
-        len(args) == 2 -> start = args[0], stop = args[1], step = 1
-        len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
-
-    Returns:
-      A `RangeDataset`.
-
-    Raises:
-      ValueError: if len(args) == 0.
-    """
-    return Dataset(dataset_ops.RangeDataset(*args))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.zip()`.")
-  def zip(datasets):
-    """Creates a `Dataset` by zipping together the given datasets.
-
-    This method has similar semantics to the built-in `zip()` function
-    in Python, with the main difference being that the `datasets`
-    argument can be an arbitrary nested structure of `Dataset` objects.
-    For example:
-
-    ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3 }
-    b = { 4, 5, 6 }
-    c = { (7, 8), (9, 10), (11, 12) }
-    d = { 13, 14 }
-
-    # The nested structure of the `datasets` argument determines the
-    # structure of elements in the resulting dataset.
-    Dataset.zip((a, b)) == { (1, 4), (2, 5), (3, 6) }
-    Dataset.zip((b, a)) == { (4, 1), (5, 2), (6, 3) }
-
-    # The `datasets` argument may contain an arbitrary number of
-    # datasets.
-    Dataset.zip((a, b, c)) == { (1, 4, (7, 8)),
-                                (2, 5, (9, 10)),
-                                (3, 6, (11, 12)) }
-
-    # The number of elements in the resulting dataset is the same as
-    # the size of the smallest dataset in `datasets`.
-    Dataset.zip((a, d)) == { (1, 13), (2, 14) }
-    ```
-
-    Args:
-      datasets: A nested structure of datasets.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.ZipDataset(datasets))
-
-  def concatenate(self, dataset):
-    """Creates a `Dataset` by concatenating given dataset with this dataset.
-
-    ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3 }
-    b = { 4, 5, 6, 7 }
-
-    # Input dataset and dataset to be concatenated should have same
-    # nested structures and output types.
-    # c = { (8, 9), (10, 11), (12, 13) }
-    # d = { 14.0, 15.0, 16.0 }
-    # a.concatenate(c) and a.concatenate(d) would result in error.
-
-    a.concatenate(b) == { 1, 2, 3, 4, 5, 6, 7 }
-    ```
-
-    Args:
-      dataset: `Dataset` to be concatenated.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.ConcatenateDataset(self._dataset, dataset))
-
-  def prefetch(self, buffer_size):
-    """Creates a `Dataset` that prefetches elements from this dataset.
-
-    Args:
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        maximum number elements that will be buffered when prefetching.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.PrefetchDataset(self._dataset, buffer_size))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.list_files()`.")
-  def list_files(file_pattern):
-    """A dataset of all files matching a pattern.
-
-    Example:
-      If we had the following files on our filesystem:
-        - /path/to/dir/a.txt
-        - /path/to/dir/b.py
-        - /path/to/dir/c.py
-      If we pass "/path/to/dir/*.py" as the directory, the dataset would
-      produce:
-        - /path/to/dir/b.py
-        - /path/to/dir/c.py
-
-    Args:
-      file_pattern: A string or scalar string `tf.Tensor`, representing
-        the filename pattern that will be matched.
-
-    Returns:
-     A `Dataset` of strings corresponding to file names.
-    """
-    return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
-
-  def repeat(self, count=None):
-    """Repeats this dataset `count` times.
-
-    Args:
-      count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        number of times the elements of this dataset should be repeated. The
-        default behavior (if `count` is `None` or `-1`) is for the elements to
-        be repeated indefinitely.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.RepeatDataset(self._dataset, count))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.enumerate_dataset())`.")
-  def enumerate(self, start=0):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.enumerate_dataset(..)`."""
-
-    return self.apply(enumerate_ops.enumerate_dataset(start))
-
-  def shuffle(self, buffer_size, seed=None):
-    """Randomly shuffles the elements of this dataset.
-
-    Args:
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        number of elements from this dataset from which the new
-        dataset will sample.
-      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        random seed that will be used to create the distribution. See
-        @{tf.set_random_seed} for behavior.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.ShuffleDataset(self._dataset, buffer_size, seed))
-
-  def cache(self, filename=""):
-    """Caches the elements in this dataset.
-
-    Args:
-      filename: A `tf.string` scalar `tf.Tensor`, representing the name of a
-        directory on the filesystem to use for caching tensors in this Dataset.
-        If a filename is not provided, the dataset will be cached in memory.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.CacheDataset(self._dataset, filename))
-
-  def take(self, count):
-    """Creates a `Dataset` with at most `count` elements from this dataset.
-
-    Args:
-      count: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        elements of this dataset that should be taken to form the new dataset.
-        If `count` is -1, or if `count` is greater than the size of this
-        dataset, the new dataset will contain all elements of this dataset.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.TakeDataset(self._dataset, count))
-
-  def skip(self, count):
-    """Creates a `Dataset` that skips `count` elements from this dataset.
-
-    Args:
-      count: A `tf.int64` scalar `tf.Tensor`, representing the number
-        of elements of this dataset that should be skipped to form the
-        new dataset.  If `count` is greater than the size of this
-        dataset, the new dataset will contain no elements.  If `count`
-        is -1, skips the entire dataset.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.SkipDataset(self._dataset, count))
-
-  def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = tf.data.Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and aren't guaranteed to be
-        caught upon dataset creation. (e.g. providing in a placeholder tensor
-        bypasses the early checking, and will instead result in an error during
-        a session.run call.)
-    """
-    return Dataset(self._dataset.shard(num_shards, index))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.ignore_errors())`.")
-  def ignore_errors(self):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.ignore_errors())`."""
-
-    return self.apply(error_ops.ignore_errors())
-
-  def batch(self, batch_size):
-    """Combines consecutive elements of this dataset into batches.
-
-    Args:
-      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        consecutive elements of this dataset to combine in a single batch.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.BatchDataset(self._dataset, batch_size))
-
-  def padded_batch(self, batch_size, padded_shapes, padding_values=None):
-    """Combines consecutive elements of this dataset into padded batches.
-
-    Like `Dataset.dense_to_sparse_batch()`, this method combines
-    multiple consecutive elements of this dataset, which might have
-    different shapes, into a single element. The tensors in the
-    resulting element have an additional outer dimension, and are
-    padded to the respective shape in `padded_shapes`.
-
-    Args:
-      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        consecutive elements of this dataset to combine in a single batch.
-      padded_shapes: A nested structure of `tf.TensorShape` or
-        `tf.int64` vector tensor-like objects representing the shape
-        to which the respective component of each input element should
-        be padded prior to batching. Any unknown dimensions
-        (e.g. `tf.Dimension(None)` in a `tf.TensorShape` or `-1` in a
-        tensor-like object) will be padded to the maximum size of that
-        dimension in each batch.
-      padding_values: (Optional.) A nested structure of scalar-shaped
-        `tf.Tensor`, representing the padding values to use for the
-        respective components.  Defaults are `0` for numeric types and
-        the empty string for string types.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(
-        dataset_ops.PaddedBatchDataset(self._dataset, batch_size, padded_shapes,
-                                       padding_values))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.dense_to_sparse_batch())`.")
-  def dense_to_sparse_batch(self, batch_size, row_shape):
-    """Use: `Dataset.apply(tf.contrib.data.dense_to_sparse_batch(...))`."""
-
-    return self.apply(batching.dense_to_sparse_batch(batch_size, row_shape))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.group_by_window())`.")
-  def group_by_window(self, key_func, reduce_func, window_size):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.group_by_window(...))`."""
-
-    return self.apply(
-        grouping.group_by_window(key_func, reduce_func, window_size))
-
-  @deprecation.deprecated_args(
-      None,
-      "`output_buffer_size=N` with `ds.prefetch(N)` on the returned dataset.",
-      "num_threads", "output_buffer_size")
-  def map(self,
-          map_func,
-          num_threads=None,
-          output_buffer_size=None,
-          num_parallel_calls=None):
-    """Maps `map_func` across this dataset.
-
-    Args:
-      map_func: A function mapping a nested structure of tensors (having
-        shapes and types defined by `self.output_shapes` and
-       `self.output_types`) to another nested structure of tensors.
-      num_threads: (Optional.) Deprecated, use `num_parallel_calls` instead.
-      output_buffer_size: (Optional.) A `tf.int64` scalar `tf.Tensor`,
-        representing the maximum number of processed elements that will be
-        buffered.
-      num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially.
-
-    Returns:
-      A `Dataset`.
-    """
-    if num_threads is None and num_parallel_calls is None:
-      ret = Dataset(dataset_ops.MapDataset(self._dataset, map_func))
-    else:
-      if num_threads is None:
-        ret = Dataset(
-            dataset_ops.ParallelMapDataset(self._dataset, map_func,
-                                           num_parallel_calls))
-      else:
-        ret = Dataset(
-            dataset_ops.ParallelMapDataset(self._dataset, map_func,
-                                           num_threads))
-    if output_buffer_size is not None:
-      ret = ret.prefetch(output_buffer_size)
-    return ret
-
-  def flat_map(self, map_func):
-    """Maps `map_func` across this dataset and flattens the result.
-
-    Args:
-      map_func: A function mapping a nested structure of tensors (having shapes
-        and types defined by `self.output_shapes` and `self.output_types`) to a
-        `Dataset`.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.FlatMapDataset(self._dataset, map_func))
-
-  def interleave(self, map_func, cycle_length, block_length=1):
-    """Maps `map_func` across this dataset, and interleaves the results.
-
-    For example, you can use `Dataset.interleave()` to process many input files
-    concurrently:
-
-    ```python
-    # Preprocess 4 files concurrently, and interleave blocks of 16 records from
-    # each file.
-    filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...]
-    dataset = (Dataset.from_tensor_slices(filenames)
-               .interleave(lambda x:
-                   TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
-                   cycle_length=4, block_length=16))
-    ```
-
-    The `cycle_length` and `block_length` arguments control the order in which
-    elements are produced. `cycle_length` controls the number of input elements
-    that are processed concurrently. If you set `cycle_length` to 1, this
-    transformation will handle one input element at a time, and will produce
-    identical results = to @{tf.data.Dataset.flat_map}. In general,
-    this transformation will apply `map_func` to `cycle_length` input elements,
-    open iterators on the returned `Dataset` objects, and cycle through them
-    producing `block_length` consecutive elements from each iterator, and
-    consuming the next input element each time it reaches the end of an
-    iterator.
-
-    For example:
-
-    ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3, 4, 5 }
-
-    # NOTE: New lines indicate "block" boundaries.
-    a.interleave(lambda x: Dataset.from_tensors(x).repeat(6),
-                 cycle_length=2, block_length=4) == {
-        1, 1, 1, 1,
-        2, 2, 2, 2,
-        1, 1,
-        2, 2,
-        3, 3, 3, 3,
-        4, 4, 4, 4,
-        3, 3,
-        4, 4,
-        5, 5, 5, 5,
-        5, 5,
-    }
-    ```
-
-    NOTE: The order of elements yielded by this transformation is
-    deterministic, as long as `map_func` is a pure function. If
-    `map_func` contains any stateful operations, the order in which
-    that state is accessed is undefined.
-
-    Args:
-      map_func: A function mapping a nested structure of tensors (having shapes
-        and types defined by `self.output_shapes` and `self.output_types`) to a
-        `Dataset`.
-      cycle_length: The number of elements from this dataset that will be
-        processed concurrently.
-      block_length: The number of consecutive elements to produce from each
-        input element before cycling to another input element.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(
-        dataset_ops.InterleaveDataset(self._dataset, map_func, cycle_length,
-                                      block_length))
-
-  @deprecation.deprecated(None, "Use `ds.apply(tf.contrib.data.unbatch())`.")
-  def unbatch(self):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.unbatch()`."""
-
-    return self.apply(batching.unbatch())
-
-  def filter(self, predicate):
-    """Filters this dataset according to `predicate`.
-
-    Args:
-      predicate: A function mapping a nested structure of tensors (having shapes
-        and types defined by `self.output_shapes` and `self.output_types`) to a
-        scalar `tf.bool` tensor.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.FilterDataset(self._dataset, predicate))
-
-  def apply(self, transformation_func):
-    """Apply a transformation function to this dataset.
-
-    `apply` enables chaining of custom `Dataset` transformations, which are
-    represented as functions that take one `Dataset` argument and return a
-    transformed `Dataset`.
-
-    For example:
-
-    ```
-    dataset = (dataset.map(lambda x: x ** 2)
-               .(group_by_window(key_func, reduce_func, window_size))
-               .map(lambda x: x ** 3))
-    ```
-
-    Args:
-      transformation_func: A function that takes one `Dataset` argument and
-        returns a `Dataset`.
-
-    Returns:
-      The `Dataset` returned by applying `transformation_func` to this dataset.
-    """
-    dataset = transformation_func(self)
-    if not isinstance(dataset, dataset_ops.Dataset):
-      raise TypeError("`transformation_func` must return a Dataset.")
-    return Dataset(dataset)
-
-
-def get_single_element(dataset):
-  """Returns the single element in `dataset` as a nested structure of tensors.
-
-  This function enables you to use a @{tf.data.Dataset} in a stateless
-  "tensor-in tensor-out" expression, without creating a @{tf.data.Iterator}.
-  This can be useful when your preprocessing transformations are expressed
-  as a `Dataset`, and you want to use the transformation at serving time.
-  For example:
-
-  ```python
-  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
-
-  def preprocessing_fn(input_str):
-    # ...
-    return image, label
-
-  dataset = (tf.data.Dataset.from_tensor_slices(input_batch)
-             .map(preprocessing_fn, num_parallel_calls=BATCH_SIZE)
-             .batch(BATCH_SIZE))
-
-  image_batch, label_batch = tf.contrib.data.get_single_element(dataset)
-  ```
-
-  Args:
-    dataset: A @{tf.data.Dataset} object containing a single element.
-
-  Returns:
-    A nested structure of @{tf.Tensor} objects, corresponding to the single
-    element of `dataset`.
-
-  Raises:
-    TypeError: if `dataset` is not a `tf.data.Dataset` object.
-    InvalidArgumentError (at runtime): if `dataset` does not contain exactly
-      one element.
-  """
-  if not isinstance(dataset, dataset_ops.Dataset):
-    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
-  return nest.pack_sequence_as(
-      dataset.output_types,
-      gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          output_types=nest.flatten(dataset.output_types),
-          output_shapes=nest.flatten(dataset.output_shapes)))
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
index a817b45b71b608810a9d7536ec123ab84f7cdc3b..3a07df572748e464284f580d67e3a664e71acdfe 100644
--- a/tensorflow/contrib/data/python/ops/get_single_element.py
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.ops import gen_dataset_ops
 
 
@@ -59,9 +60,14 @@ def get_single_element(dataset):
   """
   if not isinstance(dataset, dataset_ops.Dataset):
     raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
-  return nest.pack_sequence_as(
-      dataset.output_types,
-      gen_dataset_ops.dataset_to_single_element(
+
+  nested_ret = nest.pack_sequence_as(
+      dataset.output_types, gen_dataset_ops.dataset_to_single_element(
           dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          output_types=nest.flatten(dataset.output_types),
-          output_shapes=nest.flatten(dataset.output_shapes)))
+          output_types=nest.flatten(sparse.as_dense_types(
+              dataset.output_types, dataset.output_classes)),
+          output_shapes=nest.flatten(sparse.as_dense_shapes(
+              dataset.output_shapes, dataset.output_classes))))
+  return sparse.deserialize_sparse_tensors(
+      nested_ret, dataset.output_types, dataset.output_shapes,
+      dataset.output_classes)
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 67b085002aa7797d858837fea4646fb968ad5d97..0531f9cbb9da6e6df85fa46940ab1661ad742eb4 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -17,13 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
 
 
 def group_by_window(key_func,
@@ -35,7 +42,7 @@ def group_by_window(key_func,
   This transformation maps each consecutive element in a dataset to a key
   using `key_func` and groups the elements by key. It then applies
   `reduce_func` to at most `window_size_func(key)` elements matching the same
-  key. All execpt the final window for each key will contain
+  key. All except the final window for each key will contain
   `window_size_func(key)` elements; the final window may be smaller.
 
   You may provide either a constant `window_size` or a window size determined by
@@ -85,6 +92,114 @@ def group_by_window(key_func,
   return _apply_fn
 
 
+def bucket_by_sequence_length(element_length_func,
+                              bucket_boundaries,
+                              bucket_batch_sizes,
+                              padded_shapes=None,
+                              padding_values=None,
+                              pad_to_bucket_boundary=False):
+  """A transformation that buckets elements in a `Dataset` by length.
+
+  Elements of the `Dataset` are grouped together by length and then are padded
+  and batched.
+
+  This is useful for sequence tasks in which the elements have variable length.
+  Grouping together elements that have similar lengths reduces the total
+  fraction of padding in a batch which increases training step efficiency.
+
+  Args:
+    element_length_func: function from element in `Dataset` to `tf.int32`,
+      determines the length of the element, which will determine the bucket it
+      goes into.
+    bucket_boundaries: `list<int>`, upper length boundaries of the buckets.
+    bucket_batch_sizes: `list<int>`, batch size per bucket. Length should be
+      `len(bucket_boundaries) + 1`.
+    padded_shapes: Nested structure of `tf.TensorShape` to pass to
+      @{tf.data.Dataset.padded_batch}. If not provided, will use
+      `dataset.output_shapes`, which will result in variable length dimensions
+      being padded out to the maximum length in each batch.
+    padding_values: Values to pad with, passed to
+      @{tf.data.Dataset.padded_batch}. Defaults to padding with 0.
+    pad_to_bucket_boundary: bool, if `False`, will pad dimensions with unknown
+      size to maximum length in batch. If `True`, will pad dimensions with
+      unknown size to bucket boundary, and caller must ensure that the source
+      `Dataset` does not contain any elements with length longer than
+      `max(bucket_boundaries)`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+
+  Raises:
+    ValueError: if `len(bucket_batch_sizes) != len(bucket_boundaries) + 1`.
+  """
+  with ops.name_scope("bucket_by_seq_length"):
+    if len(bucket_batch_sizes) != (len(bucket_boundaries) + 1):
+      raise ValueError(
+          "len(bucket_batch_sizes) must equal len(bucket_boundaries) + 1")
+
+    batch_sizes = constant_op.constant(bucket_batch_sizes, dtype=dtypes.int64)
+
+    def element_to_bucket_id(*args):
+      """Return int64 id of the length bucket for this element."""
+      seq_length = element_length_func(*args)
+
+      boundaries = list(bucket_boundaries)
+      buckets_min = [np.iinfo(np.int32).min] + boundaries
+      buckets_max = boundaries + [np.iinfo(np.int32).max]
+      conditions_c = math_ops.logical_and(
+          math_ops.less_equal(buckets_min, seq_length),
+          math_ops.less(seq_length, buckets_max))
+      bucket_id = math_ops.reduce_min(array_ops.where(conditions_c))
+
+      return bucket_id
+
+    def window_size_fn(bucket_id):
+      # The window size is set to the batch size for this bucket
+      window_size = batch_sizes[bucket_id]
+      return window_size
+
+    def make_padded_shapes(shapes, none_filler=None):
+      padded = []
+      for shape in nest.flatten(shapes):
+        shape = tensor_shape.TensorShape(shape)
+        shape = [
+            none_filler if d.value is None else d
+            for d in shape
+        ]
+        padded.append(shape)
+      return nest.pack_sequence_as(shapes, padded)
+
+    def batching_fn(bucket_id, grouped_dataset):
+      """Batch elements in dataset."""
+      batch_size = batch_sizes[bucket_id]
+      none_filler = None
+      if pad_to_bucket_boundary:
+        err_msg = ("When pad_to_bucket_boundary=True, elements must have "
+                   "length <= max(bucket_boundaries).")
+        check = check_ops.assert_less(
+            bucket_id,
+            constant_op.constant(len(bucket_batch_sizes) - 1,
+                                 dtype=dtypes.int64),
+            message=err_msg)
+        with ops.control_dependencies([check]):
+          boundaries = constant_op.constant(bucket_boundaries,
+                                            dtype=dtypes.int64)
+          bucket_boundary = boundaries[bucket_id]
+          none_filler = bucket_boundary
+      shapes = make_padded_shapes(
+          padded_shapes or grouped_dataset.output_shapes,
+          none_filler=none_filler)
+      return grouped_dataset.padded_batch(batch_size, shapes, padding_values)
+
+    def _apply_fn(dataset):
+      return dataset.apply(
+          group_by_window(element_to_bucket_id, batching_fn,
+                          window_size_func=window_size_fn))
+
+    return _apply_fn
+
+
 class _VariantDataset(dataset_ops.Dataset):
   """A Dataset wrapper for a tf.variant-typed function argument."""
 
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 3124ca1d1540e12d949dded88ce1c66181be3595..812a50ecbf105393f7e422edbbdf5c87311d72c1 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -17,101 +17,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib import stateless
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.contrib.data.python.ops import random_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import convert
+from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util import deprecation
 
 
-class ParallelInterleaveDataset(dataset_ops.Dataset):
-  """A `Dataset` that maps a function over its input and flattens the result."""
-
-  def __init__(self, input_dataset, map_func, cycle_length, block_length,
-               sloppy, buffer_output_elements, prefetch_input_elements):
-    """See `tf.contrib.data.parallel_interleave()` for details."""
-    super(ParallelInterleaveDataset, self).__init__()
-    self._input_dataset = input_dataset
-
-    @function.Defun(*nest.flatten(
-        sparse.as_dense_types(input_dataset.output_types,
-                              input_dataset.output_classes)))
-    def tf_map_func(*args):
-      """A wrapper for Defun that facilitates shape inference."""
-      # Pass in shape information from the input_dataset.
-      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
-                                            input_dataset.output_classes)
-      for arg, shape in zip(args, nest.flatten(dense_shapes)):
-        arg.set_shape(shape)
-
-      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-      nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types, input_dataset.output_shapes,
-          input_dataset.output_classes)
-      if dataset_ops._should_unpack_args(nested_args):  # pylint: disable=protected-access
-        dataset = map_func(*nested_args)
-      else:
-        dataset = map_func(nested_args)
-
-      if not isinstance(dataset, dataset_ops.Dataset):
-        raise TypeError("`map_func` must return a `Dataset` object.")
-
-      self._output_classes = dataset.output_classes
-      self._output_types = dataset.output_types
-      self._output_shapes = dataset.output_shapes
-
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    self._map_func = tf_map_func
-    self._map_func.add_to_graph(ops.get_default_graph())
-
-    self._cycle_length = ops.convert_to_tensor(
-        cycle_length, dtype=dtypes.int64, name="cycle_length")
-    self._block_length = ops.convert_to_tensor(
-        block_length, dtype=dtypes.int64, name="block_length")
-    self._sloppy = ops.convert_to_tensor(
-        sloppy, dtype=dtypes.bool, name="sloppy")
-    self._buffer_output_elements = convert.optional_param_to_tensor(
-        "buffer_output_elements",
-        buffer_output_elements,
-        argument_default=2 * block_length)
-    self._prefetch_input_elements = convert.optional_param_to_tensor(
-        "prefetch_input_elements",
-        prefetch_input_elements,
-        argument_default=2 * cycle_length)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.parallel_interleave_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,
-        self._cycle_length,
-        self._block_length,
-        self._sloppy,
-        self._buffer_output_elements,
-        self._prefetch_input_elements,
-        f=self._map_func,
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-
 def parallel_interleave(map_func,
                         cycle_length,
                         block_length=1,
@@ -162,7 +82,7 @@ def parallel_interleave(map_func,
     @{tf.data.Dataset.apply}.
   """
   def _apply_fn(dataset):
-    return ParallelInterleaveDataset(
+    return readers.ParallelInterleaveDataset(
         dataset, map_func, cycle_length, block_length, sloppy,
         buffer_output_elements, prefetch_input_elements)
 
@@ -221,7 +141,7 @@ def sloppy_interleave(map_func, cycle_length, block_length=1):
     @{tf.data.Dataset.apply}.
   """
   def _apply_fn(dataset):
-    return ParallelInterleaveDataset(
+    return readers.ParallelInterleaveDataset(
         dataset,
         map_func,
         cycle_length,
@@ -231,3 +151,92 @@ def sloppy_interleave(map_func, cycle_length, block_length=1):
         prefetch_input_elements=None)
 
   return _apply_fn
+
+
+class DirectedInterleaveDataset(dataset_ops.Dataset):
+  """A substitute for `Dataset.interleave()` on a fixed list of datasets."""
+
+  def __init__(self, selector_input, data_inputs):
+    self._selector_input = selector_input
+    self._data_inputs = list(data_inputs)
+
+    for data_input in data_inputs[1:]:
+      if (data_input.output_types != data_inputs[0].output_types or
+          data_input.output_classes != data_inputs[0].output_classes):
+        raise TypeError("All datasets must have the same type.")
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return gen_dataset_ops.directed_interleave_dataset(
+        self._selector_input._as_variant_tensor(),
+        [data_input._as_variant_tensor() for data_input in self._data_inputs],
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._data_inputs[0].output_classes
+
+  @property
+  def output_shapes(self):
+    ret = self._data_inputs[0].output_shapes
+    for data_input in self._data_inputs[1:]:
+      ret = nest.pack_sequence_as(ret, [
+          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
+              nest.flatten(ret), nest.flatten(data_input.output_shapes))
+      ])
+    return ret
+
+  @property
+  def output_types(self):
+    return self._data_inputs[0].output_types
+
+
+def sample_from_datasets(datasets, weights=None, seed=None):
+  """Samples elements at random from the datasets in `datasets`.
+
+  Args:
+    datasets: A list of @{tf.data.Dataset} objects with compatible structure.
+    weights: (Optional.) A list of `len(datasets)` floating-point values where
+      `weights[i]` represents the probability with which an element should be
+      sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each
+      element is such a list. Defaults to a uniform distribution across
+      `datasets`.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      @{tf.set_random_seed} for behavior.
+
+  Returns:
+    A dataset that interleaves elements from `datasets` at random, according to
+    `weights` if provided, otherwise with uniform probability.
+
+  Raises:
+    TypeError: If the `datasets` or `weights` arguments have the wrong type.
+    ValueError: If the `weights` argument is specified and does not match the
+      length of the `datasets` element.
+  """
+  num_datasets = len(datasets)
+  if weights is None:
+    weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat()
+  elif not isinstance(weights, dataset_ops.Dataset):
+    weights = ops.convert_to_tensor(weights, name="weights")
+    if weights.dtype not in (dtypes.float32, dtypes.float64):
+      raise TypeError("`weights` must be convertible to a tensor of "
+                      "`tf.float32` or `tf.float64` elements.")
+    if not weights.shape.is_compatible_with([num_datasets]):
+      raise ValueError("`weights` must be a vector of length `len(datasets)`.")
+    weights = dataset_ops.Dataset.from_tensors(weights).repeat()
+
+  # The `stateless_multinomial()` op expects log-probabilities, as opposed to
+  # weights.
+  logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
+  def select_dataset(logits, seed):
+    return array_ops.squeeze(
+        stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
+  selector_input = dataset_ops.Dataset.zip(
+      (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset)
+
+  return DirectedInterleaveDataset(selector_input, datasets)
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 96a9e9ed6649444dac5e56d7dd2fcdb62fc56459..e4c9f8b58a2a4390004b0ad318163526b443d44f 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -17,27 +17,38 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
 from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
 # method and provides a get_next() that calls the prefetch op.
 def function_buffering_resource(string_arg,
                                 target_device,
-                                shared_name,
                                 f,
                                 buffer_size,
-                                thread_pool_size=1,
                                 container="",
+                                shared_name=None,
                                 name=None):
+  if shared_name is None:
+    shared_name = ""
   return gen_dataset_ops.function_buffering_resource(
       string_arg=string_arg,
       target_device=target_device,
       shared_name=shared_name,
       f=f,
       buffer_size=buffer_size,
-      thread_pool_size=thread_pool_size,
       container=container,
       name=name)
 
@@ -49,3 +60,266 @@ def function_buffering_resource_get_next(function_buffer_resource,
       function_buffer_resource=function_buffer_resource,
       output_types=output_types,
       name=name)
+
+
+def function_buffering_resource_reset(function_buffer_resource, name=None):
+  return gen_dataset_ops.function_buffering_resource_reset(
+      function_buffer_resource=function_buffer_resource, name=name)
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               device,
+               buffer_size,
+               shared_name=None):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    iterator_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=iterator_device,
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=shared_name)
+
+    if not self._one_shot:
+      reset_op = function_buffering_resource_reset(self._buffering_resource)
+      with ops.control_dependencies([reset_op]):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
+  def get_next(self, name=None):
+    """See @{tf.data.Iterator.get_next}."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_ret = gen_dataset_ops.function_buffering_resource_get_next(
+        self._buffering_resource,
+        output_types=nest.flatten(sparse.as_dense_types(
+            self.output_types, self.output_classes)), name=name)
+
+    ret = sparse.deserialize_sparse_tensors(
+        nest.pack_sequence_as(self.output_types, flat_ret),
+        self.output_types, self.output_shapes, self.output_classes)
+
+    for tensor, shape in zip(
+        nest.flatten(ret), nest.flatten(self.output_shapes)):
+      if isinstance(tensor, ops.Tensor):
+        tensor.set_shape(shape)
+
+    return ret
+
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               device,
+               buffer_size):
+    with ops.device("/device:CPU:0"):
+      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
+      input_iterator_handle = core_gen_dataset_ops.iterator_to_string_handle(
+          self._resource)
+
+    self._device = device
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self.output_types, self.output_shapes, self.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    _prefetch_fn.add_to_graph(None)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=gen_dataset_ops.iterator_get_device(self._resource),
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=iterator_ops._generate_shared_name(
+              "function_buffer_resource"))
+
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        ret = gen_dataset_ops.function_buffering_resource_get_next(
+            function_buffer_resource=self._buffering_resource,
+            output_types=self._flat_output_types)
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.Dataset):
+  """A `Dataset` whose iterator prefetches elements to another device."""
+
+  def __init__(self, input_dataset, device, buffer_size):
+    self._input_dataset = input_dataset
+    self._device = device
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  # The static analysis cannot tell that the eager iterator's superclass has
+  # a `next()` method.
+  # pylint: disable=non-iterator-returned
+  def __iter__(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    The returned iterator implements the Python iterator protocol and therefore
+    can only be used in eager mode.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      raise RuntimeError("dataset.__iter__() is only supported when eager "
+                         "execution is enabled.")
+  # pylint: enable=non-iterator-returned
+
+  def make_one_shot_iterator(self):
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
+                                       device=self._device,
+                                       buffer_size=self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        device=self._device,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_device()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def prefetch_to_device(device, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `device`.
+
+  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    device: A string. The name of a device to which elements will be prefetched.
+    buffer_size: (Optional.) The number of elements to buffer on `device`.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, device, buffer_size)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/random_ops.py b/tensorflow/contrib/data/python/ops/random_ops.py
index 7d727165feabb101549567f28a2dfa07083de244..28ef5e50f39dd7d1b6f124e58e068fc968ddd6dc 100644
--- a/tensorflow/contrib/data/python/ops/random_ops.py
+++ b/tensorflow/contrib/data/python/ops/random_ops.py
@@ -19,11 +19,10 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 
@@ -34,16 +33,7 @@ class RandomDataset(dataset_ops.Dataset):
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
     super(RandomDataset, self).__init__()
-    seed, seed2 = random_seed.get_seed(seed)
-    if seed is None:
-      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
-    else:
-      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
-    if seed2 is None:
-      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
-    else:
-      self._seed2 = ops.convert_to_tensor(
-          seed2, dtype=dtypes.int64, name="seed2")
+    self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.random_dataset(
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 57f30102778f3bac47580f9bdf94e411dfe1b621..bbb808fbd7730002e48cab47fa8d0fe09e2124d2 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -17,20 +17,560 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import csv
+from math import ceil
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import gfile
+from tensorflow.python.util import deprecation
+
+_ACCEPTABLE_CSV_TYPES = (dtypes.float32, dtypes.float64, dtypes.int32,
+                         dtypes.int64, dtypes.string)
+
+
+def _is_valid_int32(str_val):
+  try:
+    # Checks equality to prevent int32 overflow
+    return dtypes.int32.as_numpy_dtype(str_val) == dtypes.int64.as_numpy_dtype(
+        str_val)
+  except (ValueError, OverflowError):
+    return False
+
+
+def _is_valid_int64(str_val):
+  try:
+    dtypes.int64.as_numpy_dtype(str_val)
+    return True
+  except (ValueError, OverflowError):
+    return False
+
+
+def _is_valid_float(str_val, float_dtype):
+  try:
+    return float_dtype.as_numpy_dtype(str_val) < np.inf
+  except ValueError:
+    return False
+
+
+def _infer_type(str_val, na_value, prev_type, float_dtype):
+  """Given a string, infers its tensor type.
+
+  Infers the type of a value by picking the least 'permissive' type possible,
+  while still allowing the previous type inference for this column to be valid.
+
+  Args:
+    str_val: String value to infer the type of.
+    na_value: Additional string to recognize as a NA/NaN CSV value.
+    prev_type: Type previously inferred based on values of this column that
+      we've seen up till now.
+    float_dtype: Either `tf.float32` or `tf.float64`. Denotes what float type
+      to parse float strings as.
+  Returns:
+    Inferred dtype.
+  """
+  if str_val in ("", na_value):
+    return prev_type
+
+  if _is_valid_int32(str_val) and prev_type in (None, dtypes.int32):
+    return dtypes.int32
+
+  if _is_valid_int64(str_val) and prev_type in (None, dtypes.int32,
+                                                dtypes.int64):
+    return dtypes.int64
+
+  if _is_valid_float(str_val, float_dtype) and prev_type != dtypes.string:
+    return float_dtype
+
+  return dtypes.string
+
+
+def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
+                  comment):
+  for fn in filenames:
+    with file_io.FileIO(fn, "r") as f:
+      rdr = csv.reader(
+          f,
+          delimiter=field_delim,
+          quoting=csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE)
+      if header:
+        next(rdr)  # Skip header lines
+
+      for csv_row in rdr:
+        if comment is not None and csv_row[0].startswith(comment):
+          continue  # Skip comment lines
+
+        if len(csv_row) != num_cols:
+          raise ValueError(
+              "Problem inferring types: CSV row has different number of fields "
+              "than expected.")
+        yield csv_row
+
+
+def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
+                           na_value, header, comment, float_dtype,
+                           num_rows_for_inference, select_columns):
+  """Infers column types from the first N valid CSV records of files."""
+  if select_columns is None:
+    select_columns = range(num_cols)
+  inferred_types = [None] * len(select_columns)
+
+  for i, csv_row in enumerate(
+      _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
+                    comment)):
+    if num_rows_for_inference is not None and i >= num_rows_for_inference:
+      break
+
+    for j, col_index in enumerate(select_columns):
+      inferred_types[j] = _infer_type(csv_row[col_index], na_value,
+                                      inferred_types[j], float_dtype)
+
+  # Replace None's with a default type
+  inferred_types = [t or dtypes.string for t in inferred_types]
+  # Default to 0 or '' for null values
+  return [
+      constant_op.constant([0 if t is not dtypes.string else ""], dtype=t)
+      for t in inferred_types
+  ]
+
+
+def _infer_column_names(filenames, field_delim, use_quote_delim):
+  """Infers column names from first rows of files."""
+  csv_kwargs = {
+      "delimiter": field_delim,
+      "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE
+  }
+  with file_io.FileIO(filenames[0], "r") as f:
+    try:
+      column_names = next(csv.reader(f, **csv_kwargs))
+    except StopIteration:
+      raise ValueError(("Received StopIteration when reading the header line "
+                        "of %s.  Empty file?") % filenames[0])
+
+  for name in filenames[1:]:
+    with file_io.FileIO(name, "r") as f:
+      try:
+        if next(csv.reader(f, **csv_kwargs)) != column_names:
+          raise ValueError(
+              "Files have different column names in the header row.")
+      except StopIteration:
+        raise ValueError(("Received StopIteration when reading the header line "
+                          "of %s.  Empty file?") % filenames[0])
+  return column_names
 
 
+def _get_sorted_col_indices(select_columns, column_names):
+  """Transforms select_columns argument into sorted column indices."""
+  names_to_indices = {n: i for i, n in enumerate(column_names)}
+  num_cols = len(column_names)
+  for i, v in enumerate(select_columns):
+    if isinstance(v, int):
+      if v < 0 or v >= num_cols:
+        raise ValueError(
+            "Column index %d specified in select_columns out of valid range." %
+            v)
+      continue
+    if v not in names_to_indices:
+      raise ValueError(
+          "Value '%s' specified in select_columns not a valid column index or "
+          "name." % v)
+    select_columns[i] = names_to_indices[v]
+
+  # Sort and ensure there are no duplicates
+  result = sorted(set(select_columns))
+  if len(result) != len(select_columns):
+    raise ValueError("select_columns contains duplicate columns")
+  return result
+
+
+def make_csv_dataset(
+    file_pattern,
+    batch_size,
+    column_names=None,
+    column_defaults=None,
+    label_name=None,
+    select_columns=None,
+    field_delim=",",
+    use_quote_delim=True,
+    na_value="",
+    header=True,
+    comment=None,
+    num_epochs=None,
+    shuffle=True,
+    shuffle_buffer_size=10000,
+    shuffle_seed=None,
+    prefetch_buffer_size=1,
+    num_parallel_reads=1,
+    num_parallel_parser_calls=2,
+    sloppy=False,
+    default_float_type=dtypes.float32,
+    num_rows_for_inference=100,
+):
+  """Reads CSV files into a dataset.
+
+  Reads CSV files into a dataset, where each element is a (features, labels)
+  tuple that corresponds to a batch of CSV rows. The features dictionary
+  maps feature column names to `Tensor`s containing the corresponding
+  feature data, and labels is a `Tensor` containing the batch's label data.
+
+  Args:
+    file_pattern: List of files or patterns of file paths containing CSV
+      records. See @{tf.gfile.Glob} for pattern rules.
+    batch_size: An int representing the number of consecutive elements of this
+      dataset to combine in a single batch.
+    column_names: An optional list of strings that corresponds to the CSV
+      columns, in order. One per column of the input record. If this is not
+      provided, infers the column names from the first row of the records.
+      These names will be the keys of the features dict of each dataset element.
+    column_defaults: A optional list of default values for the CSV fields. One
+      item per selected column of the input record. Each item in the list is
+      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
+      `Tensor` with one of the aforementioned types. The tensor can either be
+      a scalar default value (if the column is optional), or an empty tensor (if
+      the column is required). If a dtype is provided instead of a tensor, the
+      column is also treated as required. If this list is not provided, tries
+      to infer types based on reading the first num_rows_for_inference rows of
+      files specified, and assumes all columns are optional, defaulting to `0`
+      for numeric values and `""` for string values. If both this and
+      `select_columns` are specified, these must have the same lengths, and
+      `column_defaults` is assumed to be sorted in order of increasing column
+      index.
+    label_name: A optional string corresponding to the label column. If
+      provided, the data for this column is returned as a separate `Tensor` from
+      the features dictionary, so that the dataset complies with the format
+      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
+      function.
+    select_columns: An optional list of integer indices or string column
+      names, that specifies a subset of columns of CSV data to select. If
+      column names are provided, these must correspond to names provided in
+      `column_names` or inferred from the file header lines. When this argument
+      is specified, only a subset of CSV columns will be parsed and returned,
+      corresponding to the columns specified. Using this results in faster
+      parsing and lower memory usage. If both this and `column_defaults` are
+      specified, these must have the same lengths, and `column_defaults` is
+      assumed to be sorted in order of increasing column index.
+    field_delim: An optional `string`. Defaults to `","`. Char delimiter to
+      separate fields in a record.
+    use_quote_delim: An optional bool. Defaults to `True`. If false, treats
+      double quotation marks as regular characters inside of the string fields.
+    na_value: Additional string to recognize as NA/NaN.
+    header: A bool that indicates whether the first rows of provided CSV files
+      correspond to header lines with column names, and should not be included
+      in the data.
+    comment: An optional character string that marks lines that should not be
+      parsed as csv records. If this is provided, all lines that start with
+      this character will not be parsed.
+    num_epochs: An int specifying the number of times this dataset is repeated.
+      If None, cycles through the dataset forever.
+    shuffle: A bool that indicates whether the input should be shuffled.
+    shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
+      ensures better shuffling, but would increase memory usage and startup
+      time.
+    shuffle_seed: Randomization seed to use for shuffling.
+    prefetch_buffer_size: An int specifying the number of feature batches to
+      prefetch for performance improvement. Recommended value is the number of
+      batches consumed per training step.
+    num_parallel_reads: Number of threads used to read CSV records from files.
+      If >1, the results will be interleaved.
+    num_parallel_parser_calls: Number of parallel invocations of the CSV parsing
+      function on CSV records.
+    sloppy: If `True`, reading performance will be improved at
+      the cost of non-deterministic ordering. If `False`, the order of elements
+      produced is deterministic prior to shuffling (elements are still
+      randomized if `shuffle=True`. Note that if the seed is set, then order
+      of elements after shuffling is deterministic). Defaults to `False`.
+    default_float_type: Either `tf.float32` or `tf.float64`. If defaults are
+      not provided, float-like strings are interpreted to be this type.
+    num_rows_for_inference: Number of rows of a file to use for type inference
+      if record_defaults is not provided. If None, reads all the rows of all
+      the files. Defaults to 100.
+
+  Returns:
+    A dataset, where each element is a (features, labels) tuple that corresponds
+    to a batch of `batch_size` CSV rows. The features dictionary maps feature
+    column names to `Tensor`s containing the corresponding column data, and
+    labels is a `Tensor` containing the column data for the label column
+    specified by `label_name`.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
+  """
+  # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+  if shuffle:
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
+
+  # Clean arguments; figure out column names and defaults
+  if comment is not None and len(comment) != 1:
+    raise ValueError("`comment` arg must be a single-character string or None")
+
+  if column_names is None:
+    if not header:
+      raise ValueError("Cannot infer column names without a header line.")
+    # If column names are not provided, infer from the header lines
+    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
+  if len(column_names) != len(set(column_names)):
+    raise ValueError("Cannot have duplicate column names.")
+
+  if select_columns is not None:
+    select_columns = _get_sorted_col_indices(select_columns, column_names)
+
+  if column_defaults is not None:
+    column_defaults = [
+        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
+        for x in column_defaults
+    ]
+  else:
+    # If column defaults are not provided, infer from records at graph
+    # construction time
+    column_defaults = _infer_column_defaults(
+        filenames, len(column_names), field_delim, use_quote_delim, na_value,
+        header, comment, default_float_type, num_rows_for_inference,
+        select_columns)
+
+  if select_columns is not None and len(column_defaults) != len(select_columns):
+    raise ValueError(
+        "If specified, column_defaults and select_columns must have same "
+        "length."
+    )
+  if select_columns is not None and len(column_names) > len(select_columns):
+    # Pick the relevant subset of column names
+    column_names = [column_names[i] for i in select_columns]
+
+  if label_name is not None and label_name not in column_names:
+    raise ValueError("`label_name` provided must be one of the columns.")
+
+  # Define map and filter functions
+  def filter_fn(line):
+    return math_ops.not_equal(string_ops.substr(line, 0, 1), comment)
+
+  def filename_to_dataset(filename):
+    ds = core_readers.TextLineDataset(filename)
+    if header:
+      ds = ds.skip(1)
+    if comment is not None:
+      ds = ds.filter(filter_fn)
+    return ds
+
+  def decode_csv(line):
+    """Decodes CSV line into features.
+
+    Args:
+      line: String tensor corresponding to one csv record.
+    Returns:
+      A dictionary of feature names to values for that particular record. If
+      label_name is provided, extracts the label feature to be returned as the
+      second element of the tuple.
+    """
+    columns = parsing_ops.decode_csv(
+        line,
+        column_defaults,
+        field_delim=field_delim,
+        use_quote_delim=use_quote_delim,
+        na_value=na_value,
+        select_cols=select_columns,
+    )
+    features = dict(zip(column_names, columns))
+    if label_name is not None:
+      label = features.pop(label_name)
+      return features, label
+    return features
+
+  # Read files sequentially or in parallel
+  dataset = dataset.apply(
+      interleave_ops.parallel_interleave(
+          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+  if num_epochs != 1 and shuffle:
+    # Use shuffle_and_repeat for perf
+    dataset = dataset.apply(
+        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
+                                       shuffle_seed))
+  elif shuffle:
+    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
+  elif num_epochs != 1:
+    dataset = dataset.repeat(num_epochs)
+
+  # Use map_and_batch for perf
+  # TODO(b/76425672): use num_parallel_calls for better performance tuning when
+  # that is added
+  dataset = dataset.apply(
+      batching.map_and_batch(
+          map_func=decode_csv,
+          batch_size=batch_size,
+          num_parallel_batches=int(
+              ceil(num_parallel_parser_calls / batch_size))))
+
+  dataset = dataset.prefetch(prefetch_buffer_size)
+  return dataset
+
+
+def make_batched_features_dataset(file_pattern,
+                                  batch_size,
+                                  features,
+                                  reader=core_readers.TFRecordDataset,
+                                  reader_args=None,
+                                  num_epochs=None,
+                                  shuffle=True,
+                                  shuffle_buffer_size=10000,
+                                  shuffle_seed=None,
+                                  prefetch_buffer_size=1,
+                                  reader_num_threads=1,
+                                  parser_num_threads=2,
+                                  sloppy_ordering=False,
+                                  drop_final_batch=False):
+  """Returns a `Dataset` of feature dictionaries from `Example` protos.
+
+  Example:
+
+  ```
+  serialized_examples = [
+    features {
+      feature { key: "age" value { int64_list { value: [ 0 ] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
+    },
+    features {
+      feature { key: "age" value { int64_list { value: [] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
+    }
+  ]
+  ```
+
+  We can use arguments:
+
+  ```
+  features: {
+    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
+    "gender": FixedLenFeature([], dtype=tf.string),
+    "kws": VarLenFeature(dtype=tf.string),
+  }
+  ```
+
+  And the expected output is:
+
+  ```python
+  {
+    "age": [[0], [-1]],
+    "gender": [["f"], ["f"]],
+    "kws": SparseTensor(
+      indices=[[0, 0], [0, 1], [1, 0]],
+      values=["code", "art", "sports"]
+      dense_shape=[2, 2]),
+  }
+  ```
+
+  Args:
+    file_pattern: List of files or patterns of file paths containing
+      `Example` records. See `tf.gfile.Glob` for pattern rules.
+    batch_size: An int representing the number of consecutive elements of this
+      dataset to combine in a single batch.
+    features: A `dict` mapping feature keys to `FixedLenFeature` or
+      `VarLenFeature` values. See `tf.parse_example`.
+    reader: A function or class that can be
+      called with a `filenames` tensor and (optional) `reader_args` and returns
+      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
+    reader_args: Additional arguments to pass to the reader class.
+    num_epochs: Integer specifying the number of times to read through the
+      dataset. If None, cycles through the dataset forever. Defaults to `None`.
+    shuffle: A boolean, indicates whether the input should be shuffled. Defaults
+      to `True`.
+    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
+      ensures better shuffling but would increase memory usage and startup time.
+    shuffle_seed: Randomization seed to use for shuffling.
+    prefetch_buffer_size: Number of feature batches to prefetch in order to
+      improve performance. Recommended value is the number of batches consumed
+      per training step (default is 1).
+    reader_num_threads: Number of threads used to read `Example` records. If >1,
+      the results will be interleaved.
+    parser_num_threads: Number of threads to use for parsing `Example` tensors
+      into a dictionary of `Feature` tensors.
+    sloppy_ordering: If `True`, reading performance will be improved at
+      the cost of non-deterministic ordering. If `False`, the order of elements
+      produced is deterministic prior to shuffling (elements are still
+      randomized if `shuffle=True`. Note that if the seed is set, then order
+      of elements after shuffling is deterministic). Defaults to `False`.
+    drop_final_batch: If `True`, and the batch size does not evenly divide the
+      input dataset size, the final smaller batch will be dropped. Defaults to
+      `False`.
+
+  Returns:
+    A dataset of `dict` elements. Each `dict` maps feature keys to
+    `Tensor` or `SparseTensor` objects.
+  """
+  # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+  if shuffle:
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
+
+  # Read `Example` records from files as tensor objects.
+  if reader_args is None:
+    reader_args = []
+
+  # Read files sequentially (if reader_num_threads=1) or in parallel
+  dataset = dataset.apply(
+      interleave_ops.parallel_interleave(
+          lambda filename: reader(filename, *reader_args),
+          cycle_length=reader_num_threads,
+          sloppy=sloppy_ordering))
+
+  # Extract values if the `Example` tensors are stored as key-value tuples.
+  if dataset.output_types == (dtypes.string, dtypes.string):
+    dataset = dataset.map(lambda _, v: v)
+
+  # Apply dataset repeat and shuffle transformations.
+  repeat_dataset = (num_epochs != 1)
+  if repeat_dataset and shuffle:
+    # Used fused shuffle_and_repeat operation for better performance
+    dataset = dataset.apply(
+        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
+                                       shuffle_seed))
+  elif repeat_dataset:
+    dataset = dataset.repeat(num_epochs)
+  elif shuffle:
+    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
+
+  if drop_final_batch:
+    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
+  else:
+    dataset = dataset.batch(batch_size)
+
+  # Parse `Example` tensors to a dictionary of `Feature` tensors.
+  dataset = dataset.map(
+      lambda x: parsing_ops.parse_example(x, features),
+      num_parallel_calls=parser_num_threads)
+
+  # TODO(rachelim): Add an optional label_name argument for extracting the label
+  # from the features dictionary, to comply with the type expected by the
+  # input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate` function.
+  dataset = dataset.prefetch(prefetch_buffer_size)
+  return dataset
+
+
+@deprecation.deprecated(None,
+                        "Use `tf.contrib.data.make_batched_features_dataset`")
 def read_batch_features(file_pattern,
                         batch_size,
                         features,
-                        reader,
+                        reader=core_readers.TFRecordDataset,
                         reader_args=None,
                         randomize_input=True,
                         num_epochs=None,
@@ -84,43 +624,38 @@ def read_batch_features(file_pattern,
       dataset to combine in a single batch.
     features: A `dict` mapping feature keys to `FixedLenFeature` or
       `VarLenFeature` values. See `tf.parse_example`.
-    reader: A function or class that can be called with a `filenames` tensor
-      and (optional) `reader_args` and returns a `Dataset` of Examples.
+    reader: A function or class that can be
+      called with a `filenames` tensor and (optional) `reader_args` and returns
+      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
     reader_args: Additional arguments to pass to the reader class.
     randomize_input: Whether the input should be randomized.
     num_epochs: Integer specifying the number of times to read through the
       dataset. If None, cycles through the dataset forever.
-    capacity: Capacity of the ShuffleDataset. A large capacity ensures better
+    capacity: Buffer size of the ShuffleDataset. A large capacity ensures better
       shuffling but would increase memory usage and startup time.
-
   Returns:
     A dict from keys in features to `Tensor` or `SparseTensor` objects.
   """
-  filenames = _get_file_names(file_pattern, randomize_input)
-  if reader_args:
-    dataset = reader(filenames, *reader_args)
-  else:
-    dataset = reader(filenames)
-  if dataset.output_types == (dtypes.string, dtypes.string):
-    dataset = dataset.map(lambda _, v: v)
-  if num_epochs != 1:
-    dataset = dataset.repeat(num_epochs)
-  if randomize_input:
-    dataset = dataset.shuffle(capacity)
-  dataset = dataset.batch(batch_size)
-  dataset = dataset.map(lambda x: parsing_ops.parse_example(x, features))
-  dataset = dataset.prefetch(1)
+  dataset = make_batched_features_dataset(
+      file_pattern,
+      batch_size,
+      features,
+      reader=reader,
+      reader_args=reader_args,
+      shuffle=randomize_input,
+      num_epochs=num_epochs,
+      shuffle_buffer_size=capacity)
   iterator = dataset.make_one_shot_iterator()
   outputs = iterator.get_next()
   return outputs
 
 
-def _get_file_names(file_pattern, randomize_input):
+def _get_file_names(file_pattern, shuffle):
   """Parse list of file names from pattern, optionally shuffled.
 
   Args:
     file_pattern: File glob pattern, or list of glob patterns.
-    randomize_input: Whether to shuffle the order of file names.
+    shuffle: Whether to shuffle the order of file names.
 
   Returns:
     List of file names matching `file_pattern`.
@@ -141,7 +676,7 @@ def _get_file_names(file_pattern, randomize_input):
     raise ValueError("No files match %s." % file_pattern)
 
   # Sort files so it will be deterministic for unit tests.
-  if not randomize_input:
+  if not shuffle:
     file_names = sorted(file_names)
   return file_names
 
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index 56f526a330bfbea7305b0754bfd114c5e97db506..a182dddd38d23d096979eebb8de29f07573833dd 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -54,7 +54,7 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
     dist_estimation_batch_size = 32
-    target_dist_t = ops.convert_to_tensor(target_dist, name="initial_dist")
+    target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
     class_values_ds = dataset.map(class_func)
     if initial_dist is not None:
       initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
@@ -101,11 +101,12 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
                                                    initial_dist_ds))
                           .map(maybe_warn_on_large_rejection))
 
-    current_probabilities_ds = dataset_ops.Dataset.zip(
-        (acceptance_dist_ds, class_values_ds)).map(array_ops.gather)
+    def _gather_and_copy(class_val, acceptance_prob, data):
+      return (class_val, array_ops.gather(acceptance_prob, class_val), data)
+    current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
+        (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
     filtered_ds = (
-        dataset_ops.Dataset.zip((class_values_ds, current_probabilities_ds,
-                                 dataset))
+        current_probabilities_and_class_and_data_ds
         .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
     return filtered_ds.map(lambda class_value, _, data: (class_value, data))
 
@@ -151,7 +152,7 @@ def _calculate_acceptance_probs(initial_probs, target_probs):
   ```
 
 
-  A solution for a_i in terms of the other variabes is the following:
+  A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
   """
   # Add tiny to initial_probs to avoid divide by zero.
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 1c88366273f5d186509454188e02350d4ea9f66b..60ef7efba4bb2bc281bc624ec3f58117ffa9a824 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -57,7 +57,7 @@ class _ScanDataset(dataset_ops.Dataset):
     self._output_shapes = None
     self._output_types = None
 
-    # Iteratively rerun the scan function until reaching a fixed pont on
+    # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
     need_to_rerun = True
     while need_to_rerun:
@@ -144,6 +144,7 @@ class _ScanDataset(dataset_ops.Dataset):
                                                    weakened_state_shapes)
 
     self._scan_func = tf_scan_func
+    self._scan_func.add_to_graph(ops.get_default_graph())
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
index 99bb79bc06a421f811869ca9169aaa11deaca2f3..f35795abd38000b13cec0f08596e2ff66e86286c 100644
--- a/tensorflow/contrib/data/python/ops/shuffle_ops.py
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -19,11 +19,11 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import gen_dataset_ops
 
 
@@ -45,17 +45,7 @@ class _ShuffleAndRepeatDataset(dataset_ops.Dataset):
     else:
       self._count = ops.convert_to_tensor(
           count, dtype=dtypes.int64, name="count")
-
-    seed, seed2 = random_seed.get_seed(seed)
-    if seed is None:
-      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
-    else:
-      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
-    if seed2 is None:
-      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
-    else:
-      self._seed2 = ops.convert_to_tensor(
-          seed2, dtype=dtypes.int64, name="seed2")
+    self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
new file mode 100644
index 0000000000000000000000000000000000000000..19cc3cb89fc5c494f79ce1d25ed57c92099c8bd2
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sliding dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class _SlideDataset(dataset_ops.Dataset):
+  """A `Dataset` that passes a sliding window over its input."""
+
+  def __init__(self, input_dataset, window_size, stride=1):
+    """See `sliding_window_batch` for details."""
+    super(_SlideDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._window_size = ops.convert_to_tensor(
+        window_size, dtype=dtypes.int64, name="window_size")
+    self._stride = ops.convert_to_tensor(
+        stride, dtype=dtypes.int64, name="stride")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.slide_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        window_size=self._window_size,
+        stride=self._stride,
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    input_shapes = self._input_dataset.output_shapes
+    return nest.pack_sequence_as(input_shapes, [
+        tensor_shape.vector(None).concatenate(s)
+        for s in nest.flatten(self._input_dataset.output_shapes)
+    ])
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+def sliding_window_batch(window_size, stride=1):
+  """A sliding window with size of `window_size` and step of `stride`.
+
+  This transformation passes a sliding window over this dataset. The
+  window size is `window_size` and step size is `stride`. If the left
+  elements cannot fill up the sliding window, this transformation will
+  drop the final smaller element. For example:
+
+  ```python
+  # NOTE: The following examples use `{ ... }` to represent the
+  # contents of a dataset.
+  a = { [1], [2], [3], [4], [5], [6] }
+
+  a.apply(tf.contrib.data.sliding_window_batch(window_size=3, stride=2)) ==
+  {
+      [[1], [2], [3]],
+      [[3], [4], [5]],
+  }
+  ```
+
+  Args:
+    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      elements in the sliding window.
+    stride: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      steps moving the sliding window forward for one iteration. The default
+      is `1`. It must be in `[1, window_size)`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):
+    return _SlideDataset(dataset, window_size, stride)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index 9cd1701c397b5a0bf5cc47c1bcab033704794d80..3cbaab5affd7397213b0fbb6b0682db92b99d591 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -47,7 +46,7 @@ class StatsAggregator(object):
   dataset = ...
   iterator = dataset.make_one_shot_iterator()
   stats_aggregator = stats_ops.StatsAggregator()
-  set_op = stats_op.set_stats_aggregator_op(iterator, stats_aggregator)
+  set_op = stats_aggregator.subscribe(iterator)
 
   with tf.Session() as sess:
     # Running `set_op` will associate `iterator` with `stats_aggregator`.
@@ -85,32 +84,60 @@ class StatsAggregator(object):
     """
     return gen_dataset_ops.stats_aggregator_summary(self._resource)
 
-  def subscribe(self, iterator):
-    """Returns a @{tf.Operation} to associate this aggregator with `iterator`.
 
-    Note: Each @{tf.data.Iterator} can be associated with at most one
-    `StatsAggregator`. After running the operation that this function
-    returns, all statistics recorded in the iteration of `iterator`
-    will be stored in `stats_aggregator`.
+class _SetStatsAggregatorDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and sets given stats_aggregator."""
 
-    Args:
-      iterator: A @{tf.data.Iterator} object.
+  def __init__(self, input_dataset, stats_aggregator):
+    super(_SetStatsAggregatorDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._stats_aggregator = stats_aggregator
 
-    Returns:
-      A @{tf.Operation} that, when run, associates this aggregator with
-      `iterator`.
-    """
-    if not isinstance(iterator, iterator_ops.Iterator):
-      raise TypeError("`iterator` must be a `tf.data.Iterator` object.")
-    return gen_dataset_ops.iterator_set_stats_aggregator(
-        iterator._iterator_resource, self._resource)  # pylint: disable=protected-access
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.set_stats_aggregator_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._stats_aggregator._resource,  # pylint: disable=protected-access
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+# TODO(shivaniagrawal): Expose these methods in `tf.contrib.data`.
+def set_stats_aggregator(stats_aggregator):
+  """Set the given stats_aggregator for aggregating the input dataset stats.
+
+  Args:
+    stats_aggregator: A `StatsAggregator` object.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _SetStatsAggregatorDataset(dataset, stats_aggregator)
+
+  return _apply_fn
 
 
 def bytes_produced_stats(tag):
   """Records the number of bytes produced by each element of the input dataset.
 
-  To consume the statistics, associate a `StatsAggregator` with an iterator
-  over the output dataset.
+  To consume the statistics, associate a `StatsAggregator` with the output
+  dataset.
 
   Args:
     tag: String. All statistics recorded by the returned transformation will
@@ -131,8 +158,8 @@ def bytes_produced_stats(tag):
 def latency_stats(tag):
   """Records the latency of producing each element of the input dataset.
 
-  To consume the statistics, associate a `StatsAggregator` with an iterator
-  over the output dataset.
+  To consume the statistics, associate a `StatsAggregator` with the output
+  dataset.
 
   Args:
     tag: String. All statistics recorded by the returned transformation will
diff --git a/tensorflow/contrib/data/python/ops/threadpool.py b/tensorflow/contrib/data/python/ops/threadpool.py
new file mode 100644
index 0000000000000000000000000000000000000000..56f67e1766bbaff680bdff6b939df0c3ba68c679
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/threadpool.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling threading in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
+from tensorflow.python.ops import resource_variable_ops
+
+_uid_counter = 0
+_uid_lock = threading.Lock()
+
+
+def _generate_shared_name(prefix):
+  with _uid_lock:
+    global _uid_counter
+    uid = _uid_counter
+    _uid_counter += 1
+  return "{}{}".format(prefix, uid)
+
+
+class PrivateThreadPool(object):
+  """A stateful resource that represents a private thread pool."""
+
+  def __init__(self, num_threads, display_name=None):
+    """Creates a `PrivateThreadPool` with the given number of threads."""
+    if context.executing_eagerly():
+      shared_name = _generate_shared_name("privatethreadpool")
+      self._resource = gen_dataset_ops.thread_pool_handle(
+          num_threads=num_threads,
+          display_name=display_name,
+          shared_name=shared_name)
+      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+          handle=self._resource, handle_device=context.context().device_name)
+    else:
+      self._resource = gen_dataset_ops.thread_pool_handle(
+          num_threads=num_threads, display_name=display_name)
+
+
+class _ThreadPoolDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and sets a custom threadpool."""
+
+  def __init__(self, input_dataset, thread_pool):
+    super(_ThreadPoolDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._thread_pool = thread_pool
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.thread_pool_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._thread_pool._resource,  # pylint: disable=protected-access
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def override_threadpool(dataset, thread_pool):
+  """Returns a new dataset that uses the given thread pool for its operations.
+
+  Args:
+    dataset: A `tf.data.Dataset` object.
+    thread_pool: A `PrivateThreadPool` object.
+
+  Returns:
+    A dataset containing the same values as `dataset`, but which uses
+    `thread_pool` to compute any of its parallel operations (such as
+    @{tf.data.Dataset.map}).
+  """
+  return _ThreadPoolDataset(dataset, thread_pool)
diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py
index 133e17d20d0fc4c8d52cef3c95c132374e927a0b..765ef3f9b6d42c9d7af3ce4916731d37d65c9260 100644
--- a/tensorflow/contrib/data/python/ops/unique.py
+++ b/tensorflow/contrib/data/python/ops/unique.py
@@ -17,11 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import gen_dataset_ops
 
 
 def unique():
diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f53bd3f7383950d6cfdb35e12811fb1daf24b320
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/writers.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for tf.data writers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class TFRecordWriter(object):
+  """Writes data to a TFRecord file."""
+
+  def __init__(self, filename, compression_type=None):
+    self._filename = ops.convert_to_tensor(
+        filename, dtypes.string, name="filename")
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
+
+  def write(self, dataset):
+    """Returns a @{tf.Operation} to write a dataset to a file.
+
+    Args:
+      dataset: a @{tf.data.Dataset} whose elements are to be written to a file
+
+    Returns:
+      A @{tf.Operation} that, when run, writes contents of `dataset` to a file.
+    """
+    if not isinstance(dataset, dataset_ops.Dataset):
+      raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+    if (dataset.output_types != dtypes.string or
+        dataset.output_shapes != tensor_shape.scalar()):
+      raise TypeError(
+          "`dataset` must produce scalar `DT_STRING` tensors whereas it "
+          "produces shape {0} and types {1}".format(dataset.output_shapes,
+                                                    dataset.output_types))
+    return gen_dataset_ops.dataset_to_tf_record(
+        dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index f6de5998d73a4869d2444cd90c9b64d1a2c889ac..3b50a48336d77ebd9327fa24e5612a95d5d0c372 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -13,19 +13,10 @@ load(
     "tf_pyclif_proto_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "generic_tree_model",
     srcs = ["generic_tree_model.proto"],
     cc_api_version = 2,
-    go_api_version = 2,
     java_api_version = 2,
     visibility = ["//visibility:public"],
 )
@@ -34,7 +25,6 @@ tf_proto_library(
     name = "generic_tree_model_extensions",
     srcs = ["generic_tree_model_extensions.proto"],
     cc_api_version = 2,
-    go_api_version = 2,
     protodeps = [":generic_tree_model"],
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD
index 3dfbbf55273848afb8ad74ad444f0d85b45610bd..401527f1e74f7725d02a3b92a2c661d8ffc11e21 100644
--- a/tensorflow/contrib/deprecated/BUILD
+++ b/tensorflow/contrib/deprecated/BUILD
@@ -30,15 +30,3 @@ py_test(
         "//tensorflow/python:logging_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..74b2cd90a187159fd2da8ce236c14e813cc43c49
--- /dev/null
+++ b/tensorflow/contrib/distribute/BUILD
@@ -0,0 +1,36 @@
+# Implementation of a prototype TF distributed computation library.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "distribute",
+    srcs = ["__init__.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/contrib/distribute/python:cross_tower_ops",
+        "//tensorflow/contrib/distribute/python:mirrored_strategy",
+        "//tensorflow/contrib/distribute/python:monitor",
+        "//tensorflow/contrib/distribute/python:one_device_strategy",
+        "//tensorflow/contrib/distribute/python:step_fn",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..44a4481021c380e72b535cf0aca39df2bf04d3b7
--- /dev/null
+++ b/tensorflow/contrib/distribute/README.md
@@ -0,0 +1,141 @@
+# Distribution Strategy
+
+> *NOTE*: This is a experimental feature. The API and performance
+> characteristics are subject to change.
+
+## Overview
+
+[`DistributionStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/DistributionStrategy)
+API is an easy way to distribute your training
+across multiple devices/machines. Our goal is to allow users to use existing
+models and training code with minimal changes to enable distributed training.
+Moreover, we've design the API in such a way that it works with both eager and
+graph execution.
+
+Currently we support one type of strategy, called
+[`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy).
+It does in-graph replication with synchronous training
+on many GPUs on one machine. Essentially, we create copies of all variables in
+the model's layers on each device. We then use all-reduce to combine gradients
+across the devices before applying them to the variables to keep them in sync.
+In the future, we intend to support other kinds of training configurations such
+as multi-node, synchronous,
+[asynchronous](https://www.tensorflow.org/deploy/distributed#putting_it_all_together_example_trainer_program),
+parameter servers and model parallelism.
+
+## Example
+
+Let's demonstrate how to use this API with a simple example. We will use the
+[`Estimator`](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
+approach, and show you how to scale your model to run on multiple GPUs on one
+machine using `MirroredStrategy`.
+
+Let's consider a very simple model function which tries to learn a simple
+function.
+
+```python
+def model_fn(features, labels, mode):
+  layer = tf.layers.Dense(1)
+  logits = layer(features)
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {"logits": logits}
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  loss = tf.losses.mean_squared_error(
+      labels=labels, predictions=tf.reshape(logits, []))
+
+  if mode == tf.estimator.ModeKeys.EVAL:
+    return tf.estimator.EstimatorSpec(mode, loss=loss)
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+```
+
+Let's also define a simple input function to feed data for training this model.
+Note that we require using
+[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
+with `DistributionStrategy`.
+
+
+```python
+def input_fn():
+  features = tf.data.Dataset.from_tensors([[1.]]).repeat(100)
+  labels = tf.data.Dataset.from_tensors(1.).repeat(100)
+  return dataset_ops.Dataset.zip((features, labels))
+```
+
+Now that we have a model function and input function defined, we can define the
+estimator. To use `MirroredStrategy`, all we need to do is:
+
+* Create an instance of the `MirroredStrategy` class.
+* Pass it to the
+[`RunConfig`](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig)
+parameter of `Estimator`.
+
+
+```python
+distribution = tf.contrib.distribute.MirroredStrategy()
+config = tf.estimator.RunConfig(train_distribute=distribution)
+classifier = tf.estimator.Estimator(model_fn=model_fn, config=config)
+classifier.train(input_fn=input_fn)
+```
+
+That's it! This change will now configure estimator to run on all GPUs on your
+machine, with the `MirroredStrategy` approach. It will take care of distributing
+the input dataset, replicating layers and variables on each device, and
+combining and applying gradients.
+
+The model and input functions do not have to change because we have changed the
+underlying components of TensorFlow (such as
+optimizer, batch norm and summaries) to become distribution-aware.
+That means those components know how to
+combine their state across devices. Further, saving and checkpointing works
+seamlessly, so you can save with one or no distribution strategy and resume with
+another.
+
+Above, we showed the easiest way to use [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy#__init__).
+There are few things you can customize in practice:
+
+* You can specify a list of specific GPUs (using param `devices`) or the number
+of GPUs (using param `num_gpus`), in case you don't want auto detection.
+* You can specify various parameters for all reduce with the `cross_tower_ops`
+param, such as the all reduce algorithm to use, and gradient repacking.
+
+## Performance Tips
+
+We've tried to make it such that you get the best performance for your existing
+model. We also recommend you follow the tips from
+[Input Pipeline Performance Guide](https://www.tensorflow.org/performance/datasets_performance).
+Specifically, we found using [`map_and_batch`](https://www.tensorflow.org/performance/datasets_performance#map_and_batch)
+and [`dataset.prefetch`](https://www.tensorflow.org/performance/datasets_performance#pipelining)
+in the input function gives a solid boost in performance. When using
+`dataset.prefetch`, use `buffer_size=None` to let it detect optimal buffer size.
+
+## Caveats
+This feature is in early stages and there are a lot of improvements forthcoming:
+
+* Metrics are not yet supported during distributed training. They are still
+supported during the evaluation.
+* Summaries are only computed in the first tower in `MirroredStrategy`.
+* Evaluation is not yet distributed.
+* Eager support is in the works; performance can be more challenging with eager
+execution.
+* As mentioned earlier, multi-node and other distributed strategies will be
+introduced in the future.
+* If you are [`batching`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch)
+your input data, we will place one batch on each GPU in each step. So your
+effective batch size will be `num_gpus * batch_size`. Therefore, consider
+adjusting your learning rate or batch size according to the number of GPUs.
+We are working on addressing this limitation by splitting each batch across GPUs
+instead.
+* PartitionedVariables are not supported yet.
+
+## What's next?
+
+Please give distribution strategies a try. This feature is in early stages and
+is evolving, so we welcome your feedback via
+[issues on GitHub](https://github.com/tensorflow/tensorflow/issues/new).
+
+
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..76711baf3a11c8978fbb5770ec173ff74a153158
--- /dev/null
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -0,0 +1,52 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Prototype of a distributed computation library for TF."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.distribute.python.cross_tower_ops import *
+from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
+from tensorflow.contrib.distribute.python.monitor import Monitor
+from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
+from tensorflow.contrib.distribute.python.step_fn import *
+from tensorflow.python.training.distribute import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+_allowed_symbols = [
+    'AllReduceCrossTowerOps',
+    'CrossTowerOps',
+    'DistributionStrategy',
+    'MirroredStrategy',
+    'Monitor',
+    'OneDeviceStrategy',
+    'ReductionToOneDeviceCrossTowerOps',
+    'Step',
+    'StandardInputStep',
+    'StandardSingleLossStep',
+    'TowerContext',
+    'get_cross_tower_context',
+    'get_distribution_strategy',
+    'get_loss_reduction',
+    'get_tower_context',
+    'has_distribution_strategy',
+    'require_tower_context',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c2834d822664b9d60690c5d5dd527bbbd01a106f
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -0,0 +1,481 @@
+# Implementation of a prototype TF distributed computation library.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+# TODO(priyag): Figure out testonly issues that are preventing us from
+# including our tests in pip for now.
+
+py_library(
+    name = "values",
+    srcs = ["values.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/contrib/data/python/ops:batching",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpointable",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+cuda_py_test(
+    name = "values_test",
+    srcs = ["values_test.py"],
+    additional_deps = [
+        ":mirrored_strategy",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:model_fn",
+    ],
+)
+
+py_library(
+    name = "mirrored_strategy",
+    srcs = ["mirrored_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":cross_tower_ops",
+        ":shared_variable_creator",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "one_device_strategy",
+    srcs = ["one_device_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":values",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "strategy_test_lib",
+    testonly = 1,
+    srcs = ["strategy_test_lib.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "combinations",
+    testonly = 1,
+    srcs = ["combinations.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":mirrored_strategy",
+        ":one_device_strategy",
+        ":tpu_strategy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "combinations_test",
+    srcs = ["combinations_test.py"],
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_test(
+    name = "mirrored_strategy_test",
+    srcs = ["mirrored_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":mirrored_strategy",
+        ":strategy_test_lib",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_test(
+    name = "one_device_strategy_test",
+    srcs = ["one_device_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":one_device_strategy",
+        ":strategy_test_lib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+cuda_py_test(
+    name = "mirrored_strategy_multigpu_test",
+    srcs = ["mirrored_strategy_multigpu_test.py"],
+    additional_deps = [
+        ":mirrored_strategy",
+        ":values",
+        ":strategy_test_lib",
+        "//tensorflow/python:distribute",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "guitar",
+        "no_pip",
+        "multi_and_single_gpu",
+        # Do not perform the extra analysis on this test, because it is already
+        # performed for the `:mirrored_strategy_test` target.
+        "no_oss",
+        "noasan",
+        "notap",
+        "notsan",
+    ],
+)
+
+py_library(
+    name = "step_fn",
+    srcs = ["step_fn.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:backprop",
+    ],
+)
+
+py_library(
+    name = "tpu_strategy",
+    srcs = ["tpu_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":one_device_strategy",
+        ":values",
+        "//tensorflow/contrib/tpu",
+        "//tensorflow/contrib/tpu:tpu_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "minimize_loss_test_lib",
+    testonly = 1,
+    srcs = ["minimize_loss_test.py"],
+    deps = [
+        ":combinations",
+        ":mirrored_strategy",
+        ":single_loss_example",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/ops/losses",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "minimize_loss_test",
+    srcs = ["minimize_loss_test.py"],
+    additional_deps = [
+        ":minimize_loss_test_lib",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    srcs = ["optimizer_v2_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+cuda_py_test(
+    name = "estimator_integration_test",
+    srcs = ["estimator_integration_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:dnn_linear_combined",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "single_loss_example",
+    srcs = ["single_loss_example.py"],
+    deps = [
+        ":step_fn",
+        "//tensorflow/contrib/data/python/ops:batching",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "step_fn_test",
+    srcs = ["step_fn_test.py"],
+    additional_deps = [
+        ":single_loss_example",
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "monitor",
+    srcs = ["monitor.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "monitor_test",
+    srcs = ["monitor_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":monitor",
+        ":one_device_strategy",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "shared_variable_creator",
+    srcs = ["shared_variable_creator.py"],
+    visibility = ["//tensorflow:internal"],
+)
+
+py_test(
+    name = "shared_variable_creator_test",
+    srcs = ["shared_variable_creator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":shared_variable_creator",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "cross_tower_utils",
+    srcs = ["cross_tower_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/nccl:nccl_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "cross_tower_ops",
+    srcs = ["cross_tower_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cross_tower_utils",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "cross_tower_ops_test",
+    srcs = ["cross_tower_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        ":cross_tower_ops",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
+    name = "prefetching_ops_v2",
+    srcs = ["prefetching_ops_v2.py"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+cuda_py_test(
+    name = "prefetching_ops_v2_test",
+    srcs = ["prefetching_ops_v2_test.py"],
+    additional_deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
new file mode 100644
index 0000000000000000000000000000000000000000..946310aa6fc2101d75e86d3ff2e9f3284e6c6625
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -0,0 +1,312 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Facilities for creating multiple test combinations.
+
+Here is an example of testing various optimizers in Eager and Graph mode:
+
+class AdditionExample(test.TestCase, parameterized.TestCase):
+  @combinations.generate(
+     combinations.combine(mode=["graph", "eager"],
+                          optimizer=[AdamOptimizer(),
+                                     GradientDescentOptimizer()]))
+  def testOptimizer(self, optimizer):
+    ... f(optimizer)...
+
+This will run `testOptimizer` 4 times with the specified optimizers: 2 in
+Eager and 2 in Graph mode.
+The test will be provided with arguments that match the arguments of combine
+by name.  It is necessary to request all arguments, except for `mode`, which is
+optional.
+
+`combine()` function is available for creating a cross product of various
+options.  `times()` function exists for creating a product of N `combine()`-ed
+results.  See below.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+import sys
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.contrib.optimizer_v2 import adam as adam_v2
+from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.util import tf_inspect
+
+
+GPU_TEST = "test_gpu" in sys.argv[0]
+TPU_TEST = "test_tpu" in sys.argv[0]
+
+
+def generate(combinations):
+  """A decorator for generating test cases of a test method or a test class.
+
+  Args:
+    combinations: a list of dictionaries created using combine() and times().
+
+  Restrictions:
+   -- there should always be a "mode" argument.  Accepted values are "eager"
+      and "graph".
+   -- arguments of the test method must match by name to get the corresponding
+      value of the combination.  Tests must accept all arguments (except "mode",
+      which is optional).
+   -- distribution argument is special.  It is meant for passing instances of
+      DistributionStrategy.  Each instance is to be passed as `(<int>,
+      <DistributionStrategy>)` tuple, where <int> is the number of required
+      GPUs.  If the required number of GPUs for the DistributionStrategy isn't
+      available then the test case is going to be skipped.
+
+  Returns:
+    a decorator that will cause the test method to be run under the specified
+    conditions.
+
+  Raises:
+    ValueError - if "mode" argument wasn't either "eager" or "graph.
+  """
+
+  def decorator(test_function):
+    """The decorator to be returned."""
+
+    # Generate good test names that can be used with --test_filter.
+    for combination in combinations:
+      # We use OrderedDicts in `combine()` and `times()` to ensure stable
+      # order of keys in each dictionary.
+      assert isinstance(combination, OrderedDict)
+      name = "".join([
+          "_{}_{}".format(
+              "".join(filter(str.isalnum, key)),
+              "".join(filter(str.isalnum, str(value))))
+          for key, value in combination.items()
+      ])
+      combination.update({"testcase_name": "_test{}".format(name)})
+
+    @parameterized.named_parameters(*combinations)
+    def decorated(self, **kwargs):
+      """A wrapped test method that sets up `test_function`."""
+      assert "mode" in kwargs
+      mode = kwargs["mode"]
+
+      if "distribution" in kwargs:
+        distribution = kwargs["distribution"]
+        kwargs["distribution"] = distribution.strategy
+        if distribution.required_tpu and not TPU_TEST:
+          self.skipTest("Test requires a TPU, but it's not available.")
+        if not distribution.required_tpu and TPU_TEST:
+          self.skipTest("Test that doesn't require a TPU.")
+
+        if not distribution.required_gpus:
+          if GPU_TEST:
+            self.skipTest("Test that doesn't require GPUs.")
+        elif context.num_gpus() < distribution.required_gpus:
+          self.skipTest(
+              "{} GPUs are not available for this test. {} GPUs are available".
+              format(distribution.required_gpus, context.num_gpus()))
+
+      requested_arguments = tf_inspect.getfullargspec(test_function).args
+      missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
+          set(requested_arguments + ["mode"]))
+      if missing_arguments:
+        raise ValueError("The test is missing arguments {} .".format(
+            missing_arguments))
+
+      kwargs_to_pass = {}
+      for arg in requested_arguments:
+        if arg == "self":
+          kwargs_to_pass[arg] = self
+        else:
+          kwargs_to_pass[arg] = kwargs[arg]
+
+      if mode == "eager":
+        with context.eager_mode(), ops.Graph().as_default():
+          test_function(**kwargs_to_pass)
+      elif mode == "graph":
+        with context.graph_mode(), ops.Graph().as_default():
+          test_function(**kwargs_to_pass)
+      else:
+        raise ValueError(
+            "'mode' has to be either 'eager' or 'graph' and not {}".format(
+                mode))
+
+    return decorated
+  return decorator
+
+
+def combine(**kwargs):
+  """Generate combinations based on its keyword arguments.
+
+  Two sets of returned combinations can be concatenated using +.  Their product
+  can be computed using `times()`.
+
+  Args:
+    **kwargs: keyword arguments of form `option=[possibilities, ...]`.
+
+  Returns:
+    a list of dictionaries for each combination. Keys in the dictionaries are
+    the keyword argument names.  Each key has one value - one of the
+    corresponding keyword argument values.
+  """
+  if not kwargs:
+    return [OrderedDict()]
+
+  sort_by_key = lambda k: k[0][0]
+  kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
+  first = list(kwargs.items())[0]
+
+  rest = dict(list(kwargs.items())[1:])
+  rest_combined = combine(**rest)
+
+  key = first[0]
+  values = first[1]
+
+  return [
+      OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
+      for v in values
+      for combined in rest_combined
+  ]
+
+
+def times(*combined):
+  """Generate a product of N sets of combinations.
+
+  times(combine(a=[1,2]), combine(b=[3,4])) == combine(a=[1,2], b=[3,4])
+
+  Args:
+    *combined: N lists of dictionaries that specify combinations.
+
+  Returns:
+    a list of dictionaries for each combination.
+
+  Raises:
+    ValueError: if some of the inputs have overlapping keys.
+  """
+  assert combined
+
+  if len(combined) == 1:
+    return combined[0]
+
+  first = combined[0]
+  rest_combined = times(*combined[1:])
+
+  combined_results = []
+  for a in first:
+    for b in rest_combined:
+      if set(a.keys()).intersection(set(b.keys())):
+        raise ValueError("Keys need to not overlap: {} vs {}".format(
+            a.keys(), b.keys()))
+
+      combined_results.append(OrderedDict(list(a.items()) + list(b.items())))
+  return combined_results
+
+
+class NamedObject(object):
+  """A class that translates an object into a good test name."""
+
+  def __init__(self, name, obj):
+    self._name = name
+    self._obj = obj
+
+  def __getattr__(self, name):
+    return getattr(self._obj, name)
+
+  def __call__(self, *args, **kwargs):
+    return self._obj(*args, **kwargs)
+
+  def __repr__(self):
+    return self._name
+
+
+class NamedDistribution(object):
+  """Translates DistributionStrategy and its data into a good name."""
+
+  def __init__(self, name, distribution, required_gpus=None,
+               required_tpu=False):
+    self._distribution = distribution
+    self._name = name
+    self._required_gpus = required_gpus
+    self._required_tpu = required_tpu
+
+  def __repr__(self):
+    return self._name
+
+  @property
+  def strategy(self):
+    return self._distribution
+
+  @property
+  def required_gpus(self):
+    return self._required_gpus
+
+  @property
+  def required_tpu(self):
+    return self._required_tpu
+
+
+one_device_strategy = NamedDistribution(
+    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
+    None)
+tpu_strategy = NamedDistribution(
+    "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
+mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
+    "MirroredCPUAndGPU",
+    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
+mirrored_strategy_without_prefetch = NamedDistribution(
+    "MirroredCPUAndGPUNoPrefetch",
+    mirrored_strategy.MirroredStrategy(
+        ["/gpu:0", "/cpu:0"], prefetch_on_device=False), 1)
+mirrored_strategy_with_two_gpus = NamedDistribution(
+    "Mirrored2GPUs",
+    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]), 2)
+
+adam_optimizer_v1_fn = NamedObject(
+    "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
+gradient_descent_optimizer_v1_fn = NamedObject(
+    "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
+
+adam_optimizer_v2_fn = NamedObject(
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.2, epsilon=1))
+gradient_descent_optimizer_v2_fn = NamedObject(
+    "GradientDescentV2",
+    lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
+
+graph_and_eager_modes = ["graph", "eager"]
+
+
+def distributions_and_v1_optimizers():
+  """A common set of combination with DistributionStrategies and Optimizers."""
+  return combine(
+      distribution=[
+          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus
+      ],
+      optimizer_fn=[adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn])
+
+
+def distributions_and_v2_optimizers():
+  """DistributionStrategies and V2 Optimizers."""
+  return combine(
+      distribution=[
+          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus
+      ],
+      optimizer_fn=[adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn])
diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..219b24160f3902fcfa5363cc39a8fc5b30d00308
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/combinations_test.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for some testing utils from strategy_test_lib."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.eager import test
+
+
+class TestingCombinationsTest(test.TestCase):
+
+  def test_combine(self):
+    self.assertEqual([{
+        "a": 1,
+        "b": 2
+    }, {
+        "a": 1,
+        "b": 3
+    }, {
+        "a": 2,
+        "b": 2
+    }, {
+        "a": 2,
+        "b": 3
+    }], combinations.combine(a=[1, 2], b=[2, 3]))
+
+  def test_add(self):
+    self.assertEqual(
+        [{
+            "a": 1
+        }, {
+            "a": 2
+        }, {
+            "b": 2
+        }, {
+            "b": 3
+        }],
+        combinations.combine(a=[1, 2]) +
+        combinations.combine(b=[2, 3]))
+
+  def test_times(self):
+    c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
+    c2 = combinations.combine(mode=["eager"], loss=["callable"])
+    c3 = combinations.combine(distribution=["d1", "d2"])
+    c4 = combinations.times(c3, c1 + c2)
+    self.assertEqual([
+        OrderedDict([("distribution", "d1"), ("loss", "callable"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d1"), ("loss", "tensor"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d1"), ("loss", "callable"),
+                     ("mode", "eager")]),
+        OrderedDict([("distribution", "d2"), ("loss", "callable"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d2"), ("loss", "tensor"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d2"), ("loss", "callable"),
+                     ("mode", "eager")])
+    ], c4)
+
+  def test_times_variable_arguments(self):
+    c1 = combinations.combine(mode=["graph", "eager"])
+    c2 = combinations.combine(optimizer=["adam", "gd"])
+    c3 = combinations.combine(distribution=["d1", "d2"])
+    c4 = combinations.times(c3, c1, c2)
+    self.assertEqual([
+        OrderedDict([("distribution", "d1"), ("mode", "graph"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d1"), ("mode", "graph"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d1"), ("mode", "eager"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d1"), ("mode", "eager"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d2"), ("mode", "graph"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d2"), ("mode", "graph"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d2"), ("mode", "eager"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d2"), ("mode", "eager"),
+                     ("optimizer", "gd")])
+    ], c4)
+    self.assertEqual(
+        combinations.combine(
+            mode=["graph", "eager"],
+            optimizer=["adam", "gd"],
+            distribution=["d1", "d2"]), c4)
+
+  def test_overlapping_keys(self):
+    c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
+    c2 = combinations.combine(mode=["eager"], loss=["callable"])
+    with self.assertRaisesRegexp(ValueError, ".*Keys.+overlap.+"):
+      _ = combinations.times(c1, c2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..cff717db80f0bdd377b3c9c7e8ca3578ff273930
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -0,0 +1,586 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for different algorithms of reduction and broadcasting."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.distribute.python import cross_tower_utils
+from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.client import device_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import device_util
+
+
+def _validate_destinations(destinations):
+  if not isinstance(destinations,
+                    (value_lib.DistributedValues, six.string_types, list)):
+    raise ValueError("destinations must be one of a `DistributedValues` object,"
+                     " a device string, a list of device strings or None")
+
+  if not destinations:
+    raise ValueError("destinations can not be empty")
+
+
+def _validate_value_destination_pairs(value_destination_pairs):
+  # pylint: disable=g-missing-docstring
+  if not value_destination_pairs: return False
+  if not isinstance(value_destination_pairs, (list, tuple)): return False
+  if not all([isinstance(pair, tuple) for pair in value_destination_pairs]):
+    return False
+  if not all([isinstance(v[0], value_lib.PerDevice)
+              for v in value_destination_pairs]):
+    return False
+  return True
+
+
+def _get_devices_from(destinations):
+  if isinstance(destinations, value_lib.DistributedValues):
+    return list(destinations.devices)
+  elif isinstance(destinations, six.string_types):
+    return [device_util.canonicalize(destinations)]
+  else:
+    return [
+        device_util.canonicalize(destination) for destination in destinations
+    ]
+
+
+def _devices_match(left, right):
+  return set(_get_devices_from(left)) == set(_get_devices_from(right))
+
+
+def _all_devices_match(value_destination_pairs):
+  if not all([d is None or _devices_match(v, d)
+              for v, d in value_destination_pairs]):
+    return False
+  if not all([_devices_match(v, value_destination_pairs[0][0])
+              for v, _ in value_destination_pairs[1:]]):
+    return False
+  return True
+
+
+def _simple_broadcast(tensor, destinations):
+  index = {}
+  devices = _get_devices_from(destinations)
+  for d in devices:
+    with ops.device(d):
+      index[d] = array_ops.identity(tensor)
+  return value_lib.Mirrored(index)
+
+
+def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn,
+                   method_string):
+  # pylint: disable=g-missing-docstring
+  all_values = []
+  count = 0
+  for v in per_device_value._index.values():  # pylint: disable=protected-access
+    if isinstance(v, value_lib.MapOutput):
+      v_list = v.get()
+      if not v_list:
+        continue
+      count += len(v_list)
+      # Sum within each device before aggregating across devices.
+      v = math_ops.add_n(v_list)
+    else:
+      count += 1
+    all_values.append(v)
+  if not all_values:
+    raise ValueError("`per_device_value` must be non-empty")
+
+  with ops.device(reduce_to_device):
+    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+      if method_string == "sum":
+        reduced = accumulation_fn(all_values)
+      elif method_string == "mean":
+        reduced = accumulation_fn(all_values) / count
+      else:
+        raise ValueError("`method_string` must be 'sum' or 'mean'")
+  return reduced
+
+
+class CrossTowerOps(object):
+  """Base class for cross-tower reduction and broadcasting algorithms."""
+
+  def __init__(self):
+    pass
+
+  def reduce(self, method_string, per_device_value, destinations=None):
+    """Reduce `per_device_value` to `destinations`.
+
+    It runs the reduction operation defined by `method_string` and put the
+    result on `destinations`.
+
+    Args:
+      method_string: either 'sum' or 'mean' specifying the reduction method.
+      per_device_value: a PerDevice object.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
+
+    Raises:
+      ValueError: if per_device_value is not a PerDevice object.
+    """
+    if not isinstance(per_device_value, value_lib.PerDevice):
+      raise ValueError("`per_device_value` must be a `PerDevice` object.")
+    if destinations is not None:
+      _validate_destinations(destinations)
+    return self._reduce(method_string, per_device_value, destinations)
+
+  def batch_reduce(self, method_string, value_destination_pairs):
+    """Reduce PerDevice objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      method_string: either 'sum' or 'mean' specifying the reduction method.
+      value_destination_pairs: a list or a tuple of tuples of PerDevice objects
+        and destinations. If a destination is None, then the destinations
+        are set to match the devices of the input PerDevice object.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerDevice objects and destinations
+    """
+    if not _validate_value_destination_pairs(value_destination_pairs):
+      raise ValueError("`value_destination_pairs` must be a list or a tuple of "
+                       "tuples of PerDevice objects and destinations")
+    for _, d in value_destination_pairs:
+      if d is not None:
+        _validate_destinations(d)
+
+    return self._batch_reduce(method_string, value_destination_pairs)
+
+  def broadcast(self, tensor, destinations):
+    """Broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
+    _validate_destinations(destinations)
+    return self._broadcast(tensor, destinations)
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    raise NotImplementedError(
+        "_reduce method must be implemented in descendants.")
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    raise NotImplementedError(
+        "_batch_reduce method must be implemented in descendants.")
+
+  def _broadcast(self, tensor, destinations):
+    return _simple_broadcast(tensor, destinations)
+
+
+class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
+  """Always do reduction to one device first and then do broadcasting.
+
+    Batch reduction is done by reduction on each element one by one.
+  """
+
+  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+    """Constructor.
+
+    Args:
+      reduce_to_device: the intermediate device to reduce to. If None, reduce
+        to the first device in `destinations` of the reduce() method.
+      accumulation_fn: a function that does accumulation.
+    """
+    self.reduce_to_device = reduce_to_device
+    self.accumulation_fn = accumulation_fn
+    super(ReductionToOneDeviceCrossTowerOps, self).__init__()
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    devices = _get_devices_from(destinations or per_device_value)
+    reduce_to_device = self.reduce_to_device or devices[0]
+    reduced = _simple_reduce(per_device_value, reduce_to_device,
+                             self.accumulation_fn, method_string)
+    return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return [self._reduce(method_string, t, destinations=v)
+            for t, v in value_destination_pairs]
+
+
+def _group_value_by_device(per_device_values):
+  """Group values into sublists by their devices.
+
+  This grouping is needed to call the all-reduce library.
+
+  Args:
+    per_device_values: a list of PerDevice obejcts.
+
+  Returns:
+    a list of lists, each sublist has components for its corresponding device of
+      PerDevice objects, paired with a None.
+  """
+  destinations = per_device_values[0].devices
+  grouped = [[] for _ in range(len(destinations))]
+  for per_device_value in per_device_values:
+    # pylint: disable=protected-access
+    for i, v in enumerate(per_device_value._index.values()):
+      assert per_device_value.devices == destinations
+      grouped[i].append((v, None))
+  return grouped
+
+
+def _ungroup_and_make_mirrored(grouped_reduced, destinations, method_string):
+  """Ungroup results from all-reduce and make Mirrored objects.
+
+  Each all-reduce result will be divided by the number of destinations before
+  Mirrored objects are created if method_string is "mean".
+
+  Args:
+    grouped_reduced: a list of lists, each sublist has components for each
+      device, paired with a None. It is the result from
+      cross_tower_utils.aggregate_gradients_using*.
+    destinations: a list of device strings for returned Mirrored objects.
+    method_string: "mean" or "sum".
+
+  Returns:
+    a list of Mirrored objects.
+  """
+  index = [{} for _ in range(len(grouped_reduced[0]))]
+  for d, per_device_reduced in enumerate(grouped_reduced):
+    for i, (v, _) in enumerate(per_device_reduced):
+      if method_string == "mean":
+        index[i][destinations[d]] = v / len(destinations)
+      else:
+        index[i][destinations[d]] = v
+  return [value_lib.Mirrored(v) for v in index]
+
+
+class ConcatAndSplitPacker(object):
+  """Concatenate and split tensors for reduction."""
+
+  def __init__(self, num_packs=1):
+    """Initialize the ConcatAndSplitPacker object.
+
+    Args:
+      num_packs: specifies the number of split packs that will be
+        formed.
+
+    Raises:
+      ValueError: if num_packs is not greater than 0.
+    """
+    if num_packs <= 0:
+      raise ValueError("num_packs must be greater than zero.")
+    self.num_packs = num_packs
+
+  def pack(self, grouped_grads_and_vars):
+    """Pack tensors."""
+    self.grouped_grads_and_vars = grouped_grads_and_vars
+    self.all_tower_shapes = []
+    self.all_tower_sizes = []
+
+    device_grad_packs = []
+    for tower_grads_and_vars in grouped_grads_and_vars:
+      with ops.colocate_with(tower_grads_and_vars[0][0]):
+        # Flatten all the grads.
+        flat_grads = [
+            array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars
+        ]
+        # Remember the original shape of all the grads.
+        tower_shapes = [array_ops.shape(g) for g, _ in tower_grads_and_vars]
+        # Remember the original sizes of all the grads.
+        tower_sizes = [array_ops.size(g) for g, _ in tower_grads_and_vars]
+        # Concat all the flat grads into a big flat tensor.
+        concat_grads = array_ops.concat(flat_grads, 0)
+
+        # Split the big tensor into num_splits packs. In cases where the
+        # total size is not divisible num_splits, the last pack gets
+        # more elements.
+        # TODO(zhengxq): it is also possible to optimize away all the concat
+        # as well.
+        num_splits = self.num_packs
+        total_grad_size = array_ops.size(concat_grads)
+        split_size = total_grad_size // num_splits
+        split_size_last = total_grad_size - split_size * (num_splits - 1)
+        split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
+        grad_packs = array_ops.split(concat_grads, split_sizes)
+
+        # Ready to aggregate the repacked gradients, with fake variables.
+        # TODO(zhengxq): It is hacky to have to use fake variables.
+        # We should remove the need for variables in
+        # aggregate_gradients_using*.
+        device_grad_packs.append(zip(grad_packs, [None] * num_splits))
+        self.all_tower_shapes.append(tower_shapes)
+        self.all_tower_sizes.append(tower_sizes)
+
+    return device_grad_packs
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the pack."""
+    aggregated_device_grads = []
+    for (summed_tower_grad_packs,
+         tower_grads_and_vars, tower_shapes, tower_sizes) in zip(
+             summed_device_grad_packs, self.grouped_grads_and_vars,
+             self.all_tower_shapes, self.all_tower_sizes):
+      # pylint: enable=line-too-long
+      # Reverse the packing operations in the previous steps. Form the
+      # summed gradients back into their original shapes.
+      with ops.colocate_with(summed_tower_grad_packs[0][0]):
+        # Form a list of the summed grad packs.
+        device_grad_packs = [g for g, _ in summed_tower_grad_packs]
+
+        # Concat them back into a big flat tensor.
+        device_grads_concat = array_ops.concat(device_grad_packs, 0)
+
+        # Split the tensors back into their original sizes.
+        grads_with_sizes = array_ops.split(device_grads_concat, tower_sizes)
+
+        # Reshape the tensors back into their original shapes.
+        grads_with_shapes = [
+            array_ops.reshape(grad, shape)
+            for shape, grad in zip(tower_shapes, grads_with_sizes)
+        ]
+
+        # Form the list with the original list of variables.
+        summed_tower_grads = [
+            (g, v) for g, (_, v) in zip(grads_with_shapes, tower_grads_and_vars)
+        ]
+        aggregated_device_grads.append(summed_tower_grads)
+    return aggregated_device_grads
+
+
+class AggregateSmallTensorPacker(object):
+  """Concatenate small gradient tensors together for reduction."""
+
+  def __init__(self,
+               agg_small_grads_max_bytes=1048576,
+               agg_small_grads_max_group=16):
+    """Initialize the AggregateSmallTensorPacker object.
+
+    Args:
+      agg_small_grads_max_bytes: largest tensor eligible for aggregation,
+        in number of bytes.
+      agg_small_grads_max_group: largest permitted aggregation of small
+        tensors.
+
+    Raises:
+      ValueError: if `agg_small_grads_max_bytes` or `agg_small_grads_max_group`
+        is not greater than 0.
+    """
+    if agg_small_grads_max_bytes <= 0 or agg_small_grads_max_group <= 0:
+      raise ValueError("agg_small_grads_max_bytes and agg_small_grads_max_group"
+                       " should both be greater than zero.")
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+
+  def pack(self, grouped_grads_and_vars):
+    """Aggregate small tensors."""
+    if (self.agg_small_grads_max_bytes > 0 and
+        self.agg_small_grads_max_group > 0):
+      tower_grads, self.packing = cross_tower_utils.pack_small_tensors(
+          grouped_grads_and_vars,
+          max_bytes=self.agg_small_grads_max_bytes,
+          max_group=self.agg_small_grads_max_group)
+    return tower_grads
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the aggregation process."""
+    return cross_tower_utils.unpack_small_tensors(summed_device_grad_packs,
+                                                  self.packing)
+
+
+class AllReduceCrossTowerOps(CrossTowerOps):
+  """Reduction using all reduce."""
+
+  def __init__(self,
+               all_reduce_alg="nccl",
+               num_packs=1,
+               agg_small_grads_max_bytes=0,
+               agg_small_grads_max_group=10):
+    """All-reduce implementation of CrossTowerOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation:
+      1) If `num_packs` is non-zero, pack values into
+        `num_packs` splits.
+      2) Otherwise, if `agg_small_grads_max_bytes` > 0 and
+        `agg_small_grads_max_group` > 0, aggregate values smaller than
+        `agg_small_grads_max_bytes` into groups with at most
+        `agg_small_grads_max_group` values.
+      3) Otherwise, no repacking or grouping will happen.
+
+    Args:
+      all_reduce_alg: the all-reduce algorithm to use, currently only "nccl" or
+        "hierarchical_copy" are supported.
+      num_packs: see above.
+      agg_small_grads_max_bytes: see above.
+      agg_small_grads_max_group: see above.
+        tensors.
+    """
+    self.all_reduce_alg = all_reduce_alg
+    self.num_packs = num_packs
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+    super(AllReduceCrossTowerOps, self).__init__()
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    if ((destinations is None or _devices_match(per_device_value, destinations))
+        and not context.executing_eagerly()):
+      return self._batch_all_reduce(method_string, [per_device_value])[0]
+    else:
+      devices = _get_devices_from(destinations or per_device_value)
+      reduce_to_device = devices[0]
+      reduced = _simple_reduce(per_device_value, reduce_to_device,
+                               math_ops.add_n, method_string)
+      return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    if (_all_devices_match(value_destination_pairs) and
+        not context.executing_eagerly()):
+      return self._batch_all_reduce(method_string,
+                                    [v[0] for v in value_destination_pairs])
+    else:
+      if not context.executing_eagerly():
+        logging.warning("Efficient batch_reduce is not supported if "
+                        "destinations are different.")
+      return [
+          self._reduce(method_string, t, destinations=v)
+          for t, v in value_destination_pairs
+      ]
+
+  def _batch_all_reduce(self, method_string, per_device_values):
+    """All reduce algorithm in a batch."""
+    destinations = per_device_values[0].devices
+    grouped = _group_value_by_device(per_device_values)
+    if self.num_packs > 0:
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with "
+          "algorithm = %s and num_packs = %d", len(per_device_values),
+          self.all_reduce_alg, self.num_packs)
+      tensor_packer = ConcatAndSplitPacker(self.num_packs)
+      device_grad_packs = tensor_packer.pack(grouped)
+    elif (self.agg_small_grads_max_bytes > 0 and
+          self.agg_small_grads_max_group > 0):
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with "
+          "algorithm = %s, agg_small_grads_max_bytes = %d and "
+          "agg_small_grads_max_group = %d", len(per_device_values),
+          self.all_reduce_alg, self.agg_small_grads_max_bytes,
+          self.agg_small_grads_max_group)
+      tensor_packer = AggregateSmallTensorPacker(
+          self.agg_small_grads_max_bytes, self.agg_small_grads_max_group)
+      device_grad_packs = tensor_packer.pack(grouped)
+    else:
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with algorithm = %s",
+          len(per_device_values), self.all_reduce_alg)
+      tensor_packer = None
+      device_grad_packs = grouped
+
+    # The actual aggregation of the repacked gradients. Note that they are
+    # sharded among different aggregation trees. So it is important to strike
+    # the balance on num_splits.
+    if self.all_reduce_alg == "nccl":
+      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
+          device_grad_packs)
+    else:
+      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
+      # order.
+      reduced = (
+          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
+              destinations, device_grad_packs))
+
+    if tensor_packer:
+      reduced = tensor_packer.unpack(reduced)
+
+    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
+                                      method_string)
+
+
+_dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+               [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+
+
+def _has_dgx1_like_links(gpu_links):
+  if not gpu_links:
+    return False
+  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # number of gpus are less than 8.
+  if len(gpu_links) < 8:
+    return False
+  for i, (gpu_link, dgx1_link) in enumerate(zip(gpu_links, _dgx1_links)):
+    if (set(gpu_link) != set(dgx1_link) and
+        set(gpu_link) != set(dgx1_link + [i])):
+      return False
+  return True
+
+
+def _choose_all_reduce_algorithm(device_links):
+  if _has_dgx1_like_links(device_links):
+    logging.info("Configured hierarchical_copy with num_packs=%d",
+                 len(device_links))
+    return AllReduceCrossTowerOps(
+        "hierarchical_copy", num_packs=len(device_links))
+  else:
+    logging.info("Configured nccl all-reduce.")
+    return AllReduceCrossTowerOps("nccl", num_packs=1)
+
+
+def choose_the_best(devices, session_config=None):
+  """Find the best subclass of CrossTowerOps given a tensorflow session.
+
+  Args:
+    devices: a list of devices passed for distribute strategy.
+    session_config: a tensorflow session config or None. If None, it will make
+      deciesion based on all local devices.
+
+  Returns:
+    a subclass of CrossTowerOps.
+  """
+  requested_devices = set([device_util.canonicalize(d) for d in devices])
+  machine_devices = device_lib.list_local_devices(session_config=session_config)
+  using_devices = []
+  for d in machine_devices:
+    if device_util.canonicalize(d.name) in requested_devices:
+      using_devices.append(d)
+    else:
+      logging.info(
+          "Device is available but not used by distribute strategy: %s", d.name)
+
+  if len(using_devices) != len(requested_devices):
+    logging.warning("Not all devices in distribute strategy are visible by "
+                    "TensorFlow sessions.")
+    return ReductionToOneDeviceCrossTowerOps()
+
+  if any([d.device_type.lower() != "gpu" for d in using_devices]):
+    logging.warning("Not all devices in DistributionStrategy are visible to "
+                    "TensorFlow session.")
+    return ReductionToOneDeviceCrossTowerOps()
+
+  device_links = [[] for _ in range(len(using_devices))]
+  for i, device in enumerate(using_devices):
+    for link in device.locality.links.link:
+      device_links[i].append(link.device_id)
+
+  return _choose_all_reduce_algorithm(device_links)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c7b0870887465ec2fe40007695d099277db38bf
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
@@ -0,0 +1,221 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for CrossTowerOps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _make_per_device(values, devices):
+  devices = cross_tower_ops_lib._get_devices_from(devices)
+  assert len(values) == len(devices)
+  index = {}
+  for d, v in zip(devices, values):
+    with ops.device(d):
+      placed_v = array_ops.identity(v)
+    index[d] = placed_v
+  return value_lib.PerDevice(index)
+
+
+# pylint: disable=g-doc-args,g-doc-return-or-yield
+def _fake_mirrored(value, devices):
+  """Create a faked Mirrored object for testing.
+
+  All components of the returned Mirrored have the same objects, which is not
+  true in reality.
+  """
+  devices = cross_tower_ops_lib._get_devices_from(devices)
+  return value_lib.Mirrored(
+      {d: v for d, v in zip(devices, [value] * len(devices))})
+
+
+_cpu_device = "/device:CPU:0"
+
+
+class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_value_equal(self, left, right):
+    if isinstance(left, list):
+      for l, r in zip(left, right):
+        self._assert_value_equal(l, r)
+    else:
+      self.assertEqual(type(left), type(right))
+      self.assertEqual(left.devices, right.devices)
+      if context.executing_eagerly():
+        self.assertEqual([v.numpy() for v in left._index.values()],
+                         list(right._index.values()))
+      else:
+        with self.test_session() as sess:
+          self.assertEqual(
+              sess.run(list(left._index.values())), list(right._index.values()))
+
+  # TODO(yuefengz): decouple the num_gpus check from distribution in
+  # combinations module so that we can pass in devices instead of a distribution
+  # strategy.
+  reduction_to_one_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "DefaultReductionToOneDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
+          combinations.NamedObject(
+              "ReductionToCPUDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  reduce_to_device=_cpu_device)),
+          combinations.NamedObject(
+              "AccumulateNCrossTowerOp",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  accumulation_fn=math_ops.accumulate_n)),
+      ],
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus
+      ],
+      mode=["graph", "eager"])
+  allreduce_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "AllReduce",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 1, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 8, 0, 0)),
+          combinations.NamedObject(
+              "AllReduceNoGradientRepacking",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 0, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopyAggregateSmallTensors",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 0, 100, 10))
+      ],
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=["graph", "eager"])
+
+  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+    devices = distribution.worker_devices
+
+    values = [constant_op.constant(float(d)) for d in range(len(devices))]
+    per_device = _make_per_device(values, devices)
+    mean = (len(devices) - 1.) / 2.
+
+    values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
+    per_device_2 = _make_per_device(values_2, devices)
+    mean_2 = mean + 1.
+
+    destination_mirrored = _fake_mirrored(1., devices)
+    destination_different = _fake_mirrored(1., _cpu_device)
+    destination_str = _cpu_device
+    destination_list = devices
+
+    all_destinations = [
+        None, destination_mirrored, destination_different, destination_str,
+        destination_list
+    ]
+
+    # test reduce()
+    for destinations in all_destinations:
+      self._assert_value_equal(
+          cross_tower_ops.reduce("mean", per_device, destinations=destinations),
+          _fake_mirrored(mean, destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce(
+              "mean", per_device_2, destinations=destinations),
+          _fake_mirrored(mean_2, destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce("sum", per_device, destinations=destinations),
+          _fake_mirrored(mean * len(devices), destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce(
+              "sum", per_device_2, destinations=destinations),
+          _fake_mirrored(mean_2 * len(devices), destinations or per_device))
+
+    # test batch_reduce()
+    for d1, d2 in itertools.product(all_destinations, all_destinations):
+      self._assert_value_equal(
+          cross_tower_ops.batch_reduce(
+              "mean", [(per_device, d1), (per_device_2, d2)]),
+          [_fake_mirrored(mean, d1 or per_device),
+           _fake_mirrored(mean_2, d2 or per_device_2)])
+      self._assert_value_equal(
+          cross_tower_ops.batch_reduce(
+              "sum", [(per_device, d1), (per_device_2, d2)]),
+          [_fake_mirrored(mean * len(devices), d1 or per_device),
+           _fake_mirrored(mean_2 * len(devices), d2 or per_device_2)])
+
+    # test broadcast()
+    for destinations in all_destinations:
+      if destinations is None:
+        continue
+      else:
+        self._assert_value_equal(
+            cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
+            _fake_mirrored(1., destinations))
+
+  def testChooseAlgorithm(self):
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result.num_packs, 8)
+
+    # if there are only 4 devices
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "nccl")
+    self.assertEqual(result.num_packs, 1)
+
+    # if devices links contain each device itself
+    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
+                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
+                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result.num_packs, 8)
+
+    # if not dgx1-like links
+    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
+                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "nccl")
+    self.assertEqual(result.num_packs, 1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc04e2195f6d305e0f7c642f24c355286f1a8cfa
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py
@@ -0,0 +1,339 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for cross_tower_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as pycoll
+
+from tensorflow.contrib import nccl
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def aggregate_gradients_using_nccl(tower_grads):
+  """Aggregate gradients using nccl allreduce."""
+  agg_all_g_and_v = []
+  for single_g_and_v in zip(*tower_grads):
+    single_grads = [g for g, _ in single_g_and_v]
+    agg_grads = nccl.all_sum(single_grads)
+    agg_all_g_and_v.append(
+        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])
+
+  agg_all_g_and_v = list(zip(*agg_all_g_and_v))
+
+  return agg_all_g_and_v
+
+
+def aggregate_gradients_using_hierarchical_copy(avail_devices, tower_grads):
+  """Aggregate gradients using hierarchical copies.
+
+  Args:
+    avail_devices: available GPU devices.
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over individual gradients.
+
+  Returns:
+    The list of (aggregated_gradient, variable), where the gradient has been
+      summed across all towers and the variable is chosen from the first tower.
+  """
+  # This only works for DGX-1 type of machine topology
+  # Device peer to peer matrix
+  # DMA: 0 1 2 3 4 5 6 7
+  # 0:   Y Y Y Y Y N N N
+  # 1:   Y Y Y Y N Y N N
+  # 2:   Y Y Y Y N N Y N
+  # 3:   Y Y Y Y N N N Y
+  # 4:   Y N N N Y Y Y Y
+  # 5:   N Y N N Y Y Y Y
+  # 6:   N N Y N Y Y Y Y
+  # 7:   N N N Y Y Y Y Y
+  agg_grads = []
+  num_devices = len(avail_devices)
+  # In the special case of DGX-1 machine topology, the two groups have equal
+  # size.
+  group_size = num_devices // 2
+  for i, single_grads in enumerate(zip(*tower_grads)):
+    group_0_main_device = i % num_devices
+    group_1_main_device = (group_0_main_device + group_size) % num_devices
+    if group_0_main_device < group_size:
+      group_0_begin = 0
+      group_1_begin = group_size
+    else:
+      group_0_begin = group_size
+      group_1_begin = 0
+
+    # Aggregate the first group.
+    group_0_device_grads = single_grads[group_0_begin:
+                                        group_0_begin + group_size]
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_0_device_grads, False, False)
+
+    # Aggregate the second group.
+    group_1_device_grads = single_grads[group_1_begin:
+                                        group_1_begin + group_size]
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_1_device_grads, False, False)
+
+    # Aggregate between the groups.
+    with ops.device(avail_devices[group_0_main_device]):
+      (agg_total_grads, _), _ = aggregate_single_gradient_using_copy(
+          [group_0_agg_grads, group_1_agg_grads], False, False)
+
+    # Broadcast the result back into the root of each group.
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads_bcast = array_ops.identity(agg_total_grads)
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads_bcast = array_ops.identity(agg_total_grads)
+
+    agg_grads_bcast = []
+    for j in range(len(single_grads)):
+      with ops.device(avail_devices[j]):
+        # Broadcast the result back to each member in the group from the root.
+        if (group_0_main_device < group_size) == (j < group_size):
+          src_device_grad = group_0_agg_grads_bcast
+        else:
+          src_device_grad = group_1_agg_grads_bcast
+        agg_grads_bcast.append(array_ops.identity(src_device_grad))
+
+    agg_grads.append(
+        [(g, v) for g, (_, v) in zip(agg_grads_bcast, single_grads)])
+
+  agg_grads = list(zip(*agg_grads))
+
+  return agg_grads
+
+
+def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
+                                         check_inf_nan):
+  """Calculate the average gradient for a shared variable across all towers.
+
+  Note that this function provides a synchronization point across all towers.
+
+  Args:
+    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
+      (gradient, variable) pair within the outer list represents the gradient
+      of the variable calculated for a single tower, and the number of pairs
+      equals the number of towers.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  grads = [g for g, _ in grad_and_vars]
+  grad = math_ops.add_n(grads)
+
+  if use_mean and len(grads) > 1:
+    grad = array_ops.multiply(grad, 1.0 / len(grads))
+
+  v = grad_and_vars[0][1]
+  if check_inf_nan:
+    has_nan_or_inf = array_ops.logical_not(
+        array_ops.reduce_all(array_ops.is_finite(grads)))
+    return (grad, v), has_nan_or_inf
+  else:
+    return (grad, v), None
+
+
+def extract_ranges(index_list, range_size_limit=32):
+  """Extract consecutive ranges and singles from index_list.
+
+  Args:
+    index_list: List of monotone increasing non-negative integers.
+    range_size_limit: Largest size range to return.  If a larger
+      consecutive range exists, it will be returned as multiple
+      ranges.
+
+  Returns:
+    (ranges, singles) where ranges is a list of [first, last] pairs of
+      consecutive elements in index_list, and singles is all of the
+      other elements, in original order.
+  """
+  if not index_list:
+    return [], []
+  first = index_list[0]
+  last = first
+  ranges = []
+  singles = []
+  for i in index_list[1:]:
+    if i == last + 1 and (last - first) <= range_size_limit:
+      last = i
+    else:
+      if last > first:
+        ranges.append([first, last])
+      else:
+        singles.append(first)
+      first = i
+      last = i
+  if last > first:
+    ranges.append([first, last])
+  else:
+    singles.append(first)
+  return ranges, singles
+
+
+GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
+
+
+def pack_range(key, packing, grad_vars, rng):
+  """Form the concatenation of a specified range of gradient tensors.
+
+  Args:
+    key: Value under which to store meta-data in packing that will be used
+      later to restore the grad_var list structure.
+    packing: Dict holding data describing packed ranges of small tensors.
+    grad_vars: List of (grad, var) pairs for one tower.
+    rng: A pair of integers giving the first, last indices of a consecutive
+      range of tensors to be packed.
+
+  Returns:
+    A tensor that is the concatenation of all the specified small tensors.
+  """
+  to_pack = grad_vars[rng[0]:rng[1] + 1]
+  members = []
+  variables = []
+  restore_shapes = []
+  with ops.name_scope('pack'):
+    for g, v in to_pack:
+      variables.append(v)
+      restore_shapes.append(g.shape)
+      with ops.device(g.device):
+        members.append(array_ops.reshape(g, [-1]))
+    packing[key] = GradPackTuple(
+        indices=range(rng[0], rng[1] + 1),
+        vars=variables,
+        shapes=restore_shapes)
+    with ops.device(members[0].device):
+      return array_ops.concat(members, 0)
+
+
+def unpack_grad_tuple(gv, gpt):
+  """Unpack a previously packed collection of gradient tensors.
+
+  Args:
+    gv: A (grad, var) pair to be unpacked.
+    gpt: A GradPackTuple describing the packing operation that produced gv.
+
+  Returns:
+    A list of (grad, var) pairs corresponding to the values that were
+     originally packed into gv, maybe following subsequent operations like
+     reduction.
+  """
+  elt_widths = [x.num_elements() for x in gpt.shapes]
+  with ops.device(gv[0][0].device):
+    with ops.name_scope('unpack'):
+      splits = array_ops.split(gv[0], elt_widths)
+      unpacked_gv = []
+      for idx, s in enumerate(splits):
+        unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
+                            gpt.vars[idx]))
+  return unpacked_gv
+
+
+def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
+  """Concatenate small gradient tensors together for reduction.
+
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples.
+    max_bytes: Int giving max number of bytes in a tensor that
+      may be considered small.
+    max_group: Int giving max number of small tensors that may be
+      concatenated into one new tensor.
+
+  Returns:
+    new_tower_grads, packing where new_tower_grads is identical to
+      tower_grads except that all feasible small_tensors have been removed
+      from their places and concatenated into larger tensors that are
+      now in the front of the list for each tower, and packing contains
+      the data necessary to restore the tower_grads structure.
+
+  Look through the first tower for gradients of the same type (float),
+  and small size, that are all sequential.  For each such group,
+  replace by a new tensor that is a flattened concatenation.  Note
+  that the corresponding variable will be absent, which doesn't matter
+  because it isn't used during all-reduce.
+
+  Requires:
+    Every gv_list in towers must have isomorphic structure including identical
+      tensor sizes and types.
+  """
+  small_indices = []
+  large_indices = []
+  for idx, (g, _) in enumerate(tower_grads[0]):
+    if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
+      small_indices.append(idx)
+    else:
+      large_indices.append(idx)
+  small_ranges, small_singles = extract_ranges(
+      small_indices, range_size_limit=max_group)
+  large_indices = sorted(large_indices + small_singles)
+  num_gv = len(tower_grads[0])
+  packing = {}
+  if small_ranges:
+    new_tower_grads = []
+    for dev_idx, gv_list in enumerate(tower_grads):
+      assert len(gv_list) == num_gv
+      new_gv_list = []
+      for r in small_ranges:
+        key = '%d:%d' % (dev_idx, len(new_gv_list))
+        new_gv_list.append((pack_range(key, packing, gv_list, r),
+                            'packing_var_placeholder'))
+      for i in large_indices:
+        new_gv_list.append(gv_list[i])
+      new_tower_grads.append(new_gv_list)
+    return new_tower_grads, packing
+  else:
+    return tower_grads, None
+
+
+def unpack_small_tensors(tower_grads, packing):
+  """Undo the structure alterations to tower_grads done by pack_small_tensors.
+
+  Args:
+    tower_grads: List of List of (grad, var) tuples.
+    packing: A dict generated by pack_small_tensors describing the changes
+      it made to tower_grads.
+
+  Returns:
+    new_tower_grads: identical to tower_grads except that concatenations
+      of small tensors have been split apart and returned to their original
+      positions, paired with their original variables.
+  """
+  if not packing:
+    return tower_grads
+  new_tower_grads = []
+  num_devices = len(tower_grads)
+  num_packed = len(packing.keys()) // num_devices
+  for dev_idx, gv_list in enumerate(tower_grads):
+    gv_list = list(gv_list)
+    new_gv_list = gv_list[num_packed:]
+    for i in xrange(0, num_packed):
+      k = '%d:%d' % (dev_idx, i)
+      gpt = packing[k]
+      gv = unpack_grad_tuple(gv_list[i], gpt)
+      for gi, idx in enumerate(gpt.indices):
+        assert idx == gpt.indices[gi]
+        new_gv_list.insert(idx, gv[gi])
+    new_tower_grads.append(new_gv_list)
+  return new_tower_grads
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..34410a6470185ac2821bc6a59de9230ff478aeb6
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -0,0 +1,128 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that show that DistributionStrategy works with canned Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import test
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import dnn_linear_combined
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.summary.writer import writer_cache
+
+
+class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
+                                                 parameterized.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def dataset_input_fn(self, x, y, batch_size, shuffle):
+
+    def input_fn():
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      if shuffle:
+        dataset = dataset.shuffle(batch_size)
+      dataset = dataset.repeat(10).batch(batch_size)
+      return dataset
+
+    return input_fn
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          distribution=[
+              combinations.one_device_strategy,
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus
+          ]))
+  def test_complete_flow_with_mode(self, distribution):
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    train_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // len(distribution.worker_devices),
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, y=data, batch_size=batch_size, shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, batch_size=batch_size, shuffle=False)
+
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=linear_feature_columns,
+        dnn_hidden_units=(2, 2),
+        dnn_feature_columns=dnn_feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir,
+        # TODO(isaprykin): Work around the colocate_with error.
+        dnn_optimizer=adagrad.AdagradOptimizer(0.001),
+        linear_optimizer=adagrad.AdagradOptimizer(0.001),
+        config=run_config.RunConfig(train_distribute=distribution))
+
+    num_steps = 10
+    estimator.train(train_input_fn, steps=num_steps)
+
+    scores = estimator.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
+                                             serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cbfd17850212a1c007e2edb9dd3986b3109f040d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -0,0 +1,30 @@
+# Example TensorFlow models that use DistributionStrategy for training.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_binary(
+    name = "simple_estimator_example",
+    srcs = ["simple_estimator_example.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "simple_tfkeras_example",
+    srcs = [
+        "simple_tfkeras_example.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00c25c7a2482a559c8b94ff3be86c4961dfb439f
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
@@ -0,0 +1,87 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple example to test the a DistributionStrategy with Estimators.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+def build_model_fn_optimizer():
+  """Simple model_fn with optimizer."""
+  # TODO(anjalisridhar): Move this inside the model_fn once OptimizerV2 is
+  # done?
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
+
+  def model_fn(features, labels, mode):  # pylint: disable=unused-argument
+    """model_fn which uses a single unit Dense layer."""
+    # You can also use the Flatten layer if you want to test a model without any
+    # weights.
+    layer = tf.layers.Dense(1, use_bias=True)
+    logits = layer(features)
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      predictions = {"logits": logits}
+      return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+    def loss_fn():
+      y = tf.reshape(logits, []) - tf.constant(1.)
+      return y * y
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      return tf.estimator.EstimatorSpec(mode, loss=loss_fn())
+
+    assert mode == tf.estimator.ModeKeys.TRAIN
+
+    global_step = tf.train.get_global_step()
+    train_op = optimizer.minimize(loss_fn(), global_step=global_step)
+    return tf.estimator.EstimatorSpec(mode, loss=loss_fn(), train_op=train_op)
+
+  return model_fn
+
+
+def main(_):
+  distribution = tf.contrib.distribute.MirroredStrategy(
+      ["/device:GPU:0", "/device:GPU:1"])
+  config = tf.estimator.RunConfig(train_distribute=distribution)
+
+  def input_fn():
+    features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
+    labels = tf.data.Dataset.from_tensors([1.]).repeat(10)
+    return tf.data.Dataset.zip((features, labels))
+
+  estimator = tf.estimator.Estimator(
+      model_fn=build_model_fn_optimizer(), config=config)
+  estimator.train(input_fn=input_fn, steps=10)
+
+  eval_result = estimator.evaluate(input_fn=input_fn)
+  print("Eval result: {}".format(eval_result))
+
+  def predict_input_fn():
+    predict_features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
+    return predict_features
+
+  predictions = estimator.predict(input_fn=predict_input_fn)
+  # TODO(anjalsridhar): This returns a generator object, figure out how to get
+  # meaningful results here.
+  print("Prediction results: {}".format(predictions))
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b87224251ca3844fc81c6f32a893d2c71664a955
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An example tf.keras model that is trained using MirroredStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from sys import argv
+import numpy as np
+import tensorflow as tf
+
+
+def input_fn():
+  x = np.random.random((1024, 10))
+  y = np.random.randint(2, size=(1024, 1))
+  x = tf.cast(x, tf.float32)
+  dataset = tf.data.Dataset.from_tensor_slices((x, y))
+  dataset = dataset.repeat(10)
+  dataset = dataset.batch(32)
+  return dataset
+
+
+def main(args):
+  if len(args) < 2:
+    print('You must specify  model_dir for checkpoints such as'
+          ' /tmp/tfkeras_example./')
+    return
+
+  print('Using %s to store checkpoints.' % args[1])
+
+  strategy = tf.contrib.distribute.MirroredStrategy(
+      ['/device:GPU:0', '/device:GPU:1'])
+  config = tf.estimator.RunConfig(train_distribute=strategy)
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
+
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
+  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+
+  model.compile(loss='binary_crossentropy', optimizer=optimizer)
+  model.summary()
+  tf.keras.backend.set_learning_phase(True)
+  keras_estimator = tf.keras.estimator.model_to_estimator(
+      keras_model=model, config=config, model_dir=args[1])
+
+  keras_estimator.train(input_fn=input_fn, steps=10)
+  eval_result = keras_estimator.evaluate(input_fn=input_fn)
+  print('Eval result: {}'.format(eval_result))
+
+if __name__ == '__main__':
+  tf.app.run(argv=argv)
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e134fe34e10be402f028db986b8cbf14222db07f
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -0,0 +1,317 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for running legacy optimizer code with DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
+from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.ops.losses import losses_impl
+
+
+class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True]),
+          combinations.combine(is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=[combinations.adam_optimizer_v1_fn],
+          mode=["graph"],
+          use_callable_loss=[False],
+          is_tpu=[True]))
+  def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
+                       is_tpu):
+    with distribution.scope():
+      model_fn, dataset_fn, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
+
+      # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
+      # `DistributionStrategy.create_monitor` so that each DistributionStrategy
+      # could influence its training loop. That method would return an instance
+      # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
+      # tpu.shutdown_system().
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers() +
+          combinations.distributions_and_v2_optimizers(),
+          combinations.combine(mode=["graph", "eager"], is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=[
+              combinations.adam_optimizer_v1_fn,
+              combinations.gradient_descent_optimizer_v1_fn
+          ],
+          mode=["graph"],
+          is_tpu=[True]))
+
+  def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
+    created_variables = []
+    trainable_variables = []
+
+    def appending_creator(next_creator, *args, **kwargs):
+      v = next_creator(*args, **kwargs)
+      created_variables.append(v.name)
+      if "trainable" in kwargs and kwargs["trainable"]:
+        trainable_variables.append(v.name)
+      return v
+
+    # Creator scope needs to be set before it's used inside
+    # `distribution.scope`.
+    with variable_scope.variable_creator_scope(
+        appending_creator), distribution.scope():
+      model_fn, dataset_fn, layer = minimize_loss_example(
+          optimizer_fn,
+          use_bias=True,
+          use_callable_loss=True,
+          create_optimizer_inside_model_fn=True)
+
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      run_step()
+
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
+      def get_expected_variables(optimizer_fn, num_parameter_devices):
+        variables_map = {
+            "GradientDescent": ["dense/kernel", "dense/bias"],
+            "Adam": [
+                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
+                "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
+                "dense/bias/Adam_1"
+            ]
+        }
+        variables = variables_map[optimizer_fn().get_name()]
+        variables.extend([
+            v + "/replica_{}".format(replica)
+            for v in variables
+            for replica in range(1, num_parameter_devices)
+        ])
+        return set([v + ":0" for v in variables])
+
+      self.assertEqual(
+          get_expected_variables(optimizer_fn,
+                                 len(distribution.parameter_devices)),
+          set(created_variables))
+
+  @combinations.generate(
+      combinations.times(combinations.distributions_and_v1_optimizers(),
+                         combinations.combine(
+                             mode=["graph", "eager"],
+                             momentum=[0.8, 0.9, 0.99],
+                             renorm=[False, True])))
+  def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
+                                    renorm):
+    """Verifies that moving mean updates are reduced across towers."""
+    with distribution.scope():
+      num_towers = len(distribution.worker_devices)
+      model_fn, dataset_fn, batchnorm = batchnorm_example(
+          optimizer_fn,
+          batch_per_epoch=num_towers,
+          momentum=momentum,
+          renorm=renorm)
+
+      # Disable prefetching since that makes the specific input on each device
+      # to be non deterministic, and this test relies on specific input being
+      # on each device.
+      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
+        distribution._prefetch_on_device = False
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return control_flow_ops.group(
+            distribution.unwrap(
+                distribution.call_for_each_tower(
+                    model_fn,
+                    iterator.get_next(),
+                    run_concurrently=batchnorm.built)) +
+            ops.get_collection(ops.GraphKeys.UPDATE_OPS))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      expected_moving_means = [0.] * 8
+
+      def averaged_batch_mean(i):
+        # Each batch has shape [16, 8] where the ith element in jth list is
+        # (8 * j + i + tower_id * 100). So the batch mean in each tower is
+        # (60 + i + tower_id * 100). So here comes its batch mean over all
+        # towers:
+        return 60. + i + (num_towers - 1.) / 2. * 100.
+
+      for _ in range(10):
+        run_step()
+        moving_means = self.evaluate(distribution.fetch(batchnorm.moving_mean))
+
+        # We make sure that the moving_mean is updated as if the sample mean is
+        # calculated over all towers.
+        for i, expected_moving_mean in enumerate(expected_moving_means):
+          expected_moving_means[i] -= ((
+              expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
+          self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.combine(
+              distribution=[combinations.one_device_strategy,
+                            combinations.mirrored_strategy_with_gpu_and_cpu,
+                            combinations.mirrored_strategy_with_two_gpus],
+              optimizer_fn=[combinations.gradient_descent_optimizer_v1_fn,
+                            combinations.gradient_descent_optimizer_v2_fn],
+              loss_reduction=[losses_impl.Reduction.SUM,
+                              losses_impl.Reduction.MEAN,
+                              losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
+                              losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS]),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
+                    use_callable_loss):
+    with distribution.scope():
+      all_vars = []
+
+      def model_fn(x, y):
+
+        def loss_fn():
+          # Use fixed initialization to make the steps deterministic.
+          w = variable_scope.get_variable("w", initializer=[[2.]])
+          all_vars.append(w)
+          predict = math_ops.matmul(x, w)
+          return losses_impl.mean_squared_error(
+              y, predict, reduction=loss_reduction)
+
+        optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate
+
+        if use_callable_loss:
+          return optimizer.minimize(loss_fn)
+        else:
+          return optimizer.minimize(loss_fn())
+
+      def dataset_fn():
+        features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
+        labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
+        return dataset_ops.Dataset.zip((features, labels)).repeat()
+
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, *iterator.get_next(), run_concurrently=False))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      run_step()
+
+      self.assertEqual(distribution.num_towers, len(all_vars))
+      v = all_vars[0]
+      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
+      weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
+      # Our model is:
+      #   predict = x * w
+      #   loss = (predict - y)^2
+      #   dloss/dpredict = 2*(predict - y)
+      #   dloss/dw = 2 * x^T @ (predict - y)
+      # For our batch size of 2, assuming sum loss reduction:
+      #   x = [2, 7]
+      #   y = [6, 21]
+      #   w_initial = 2
+      #   predict = [4, 14]
+      #   predict - y = [-2, -7]
+      #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
+      # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
+      # with sum loss reduction, or 10.6 with mean.
+      if loss_reduction == losses_impl.Reduction.SUM:
+        # Note that the "distribution.num_towers" factor will go away once
+        # we split the input across towers, instead of pulling a complete
+        # batch of input per tower.
+        self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
+      else:
+        # One of the mean loss reductions.
+        self.assertNear(weight, 2 + 10.6, 0.0001)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..6efd578a775da7bf326826289bd5bd50a57be892
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -0,0 +1,497 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class MirroredStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+import six
+
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.contrib.distribute.python import values
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+def _cpu_device(device):
+  cpu_device = tf_device.DeviceSpec.from_string(device)
+  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
+  return cpu_device.to_string()
+
+
+class _RequestedStop(Exception):
+  pass
+
+
+class MirroredStrategy(distribute_lib.DistributionStrategy):
+  """Mirrors vars to distribute across multiple devices on a single machine.
+
+  This strategy uses one tower per device and sync replication.
+  """
+
+  def __init__(self,
+               devices=None,
+               num_gpus=None,
+               cross_tower_ops=None,
+               prefetch_on_device=None):
+    super(MirroredStrategy, self).__init__()
+    # Convert `num_gpus` into `devices`, shouldn't specify both.
+    if devices is None:
+      if num_gpus is None:
+        num_gpus = context.num_gpus()
+      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
+    elif num_gpus is not None:
+      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = devices
+    self._canonical_device_set = set(
+        [device_util.canonicalize(d) for d in devices])
+    self._device_index = values.PerDevice(
+        dict((d, i) for i, d in enumerate(devices)))
+    self._cross_tower_ops = cross_tower_ops
+    self._prefetch_on_device = prefetch_on_device
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a mirrored variable. See `DistributionStrategy.scope`."""
+    # Figure out what collections this variable should be added to.
+    # We'll add the MirroredVariable to those collections instead.
+    collections = kwargs.pop("collections", None)
+    if collections is None:
+      collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+    kwargs["collections"] = []
+
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    tower_local = kwargs.pop("tower_local_reduce_method", None)
+    if tower_local is not None:
+      kwargs["trainable"] = False
+
+    # TODO(josh11b,apassos): It would be better if variable initialization
+    # was never recorded on the tape instead of having to do this manually
+    # here.
+    with tape.stop_recording():
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            kwargs["name"] = "%s/replica_%d" % (var0name, i)
+            # Initialize replicas with the same value:
+            if context.executing_eagerly():
+              initial_value = index[devices[0]].value()
+            else:
+              initial_value = index[devices[0]].initial_value
+            kwargs["initial_value"] = array_ops.identity(initial_value)
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.DistributedVariable)
+          index[d] = v
+
+      if tower_local is None:
+        result = values.MirroredVariable(index, index[devices[0]])
+      else:
+        result = values.TowerLocalVariable(
+            index, index[devices[0]], tower_local)
+
+    if not context.executing_eagerly():
+      g = ops.get_default_graph()
+      # If "trainable" is True, next_creator() will add the member variables
+      # to the TRAINABLE_VARIABLES collection, so we manually remove
+      # them and replace with the MirroredVariable. We can't set
+      # "trainable" to False for next_creator() since that causes functions
+      # like implicit_gradients to skip those variables.
+      if kwargs.get("trainable", True):
+        collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+        l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+        for v in index.values():
+          l.remove(v)
+      g.add_to_collections(collections, result)
+    return result
+
+  def distribute_dataset(self, dataset_fn):
+    return values.PerDeviceDataset(
+        self._call_dataset_fn(dataset_fn), self._devices,
+        self._prefetch_on_device)
+
+  def _broadcast(self, tensor, destinations):
+    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
+    return self._get_cross_tower_ops().broadcast(tensor, destinations or
+                                                 self._devices)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    """Run `fn` in separate threads, once per tower/worker device.
+
+    Args:
+      fn: function to run (will be run once per device, each in its own thread).
+      *args: positional arguments for `fn`
+      **kwargs: keyword arguments for `fn`.
+          `"run_concurrently"`: Boolean indicating whether executions of `fn`
+             can be run concurrently (under eager execution only), defaults to
+             `True`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+
+    Raises:
+      RuntimeError: If fn() calls get_tower_context().merge_call() a different
+          number of times for when called for different devices.
+    """
+    run_concurrently = kwargs.pop("run_concurrently", True)
+    if not context.executing_eagerly():
+      # Lots of TF library code isn't thread-safe in graph mode, and
+      # there is little to be gained by turning on multithreading when
+      # constructing a graph.
+      run_concurrently = False
+      # Needed for per-thread device, etc. contexts in graph mode.
+      ops.get_default_graph().switch_to_thread_local()
+    elif run_concurrently is None:
+      run_concurrently = True
+
+    coord = coordinator.Coordinator(
+        clean_stop_exception_types=(_RequestedStop,))
+
+    shared_variable_store = {}
+
+    # TODO(isaprykin): Create these threads once instead of during every run()
+    # call.
+    threads = []
+    for index, d in enumerate(self._devices):
+      variable_creator_fn = shared_variable_creator.make_fn(
+          shared_variable_store, index)
+      t = MirroredStrategy._MirroredTowerThread(
+          self, coord, d, variable_creator_fn, fn,
+          *values.select_device(d, args), **values.select_device(d, kwargs))
+      threads.append(t)
+
+    for t in threads:
+      t.start()
+
+    # When `fn` starts `should_run` event is set on _MirroredTowerThread
+    # (`MTT`) threads. The execution waits until
+    # `MTT.has_paused` is set, which indicates that either `fn` is
+    # complete or a `get_tower_context().merge_call()` is called.  If `fn` is
+    # complete, then `MTT.done` is set to True.  Otherwise, arguments
+    # of `get_tower_context().merge_call` from all paused threads are grouped
+    # and the `merge_fn` is performed.  Results of the
+    # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
+    # Each such `get_tower_context().merge_call` call returns the
+    # `MTT.merge_result` for that thread when `MTT.should_run` event
+    # is reset again. Execution of `fn` resumes.
+
+    try:
+      with coord.stop_on_exception():
+        all_done = False
+        while not all_done and not coord.should_stop():
+          done = []
+          if run_concurrently:
+            for t in threads:
+              t.should_run.set()
+            for t in threads:
+              t.has_paused.wait()
+              t.has_paused.clear()
+              if coord.should_stop():
+                return None
+              done.append(t.done)
+          else:
+            for t in threads:
+              t.should_run.set()
+              t.has_paused.wait()
+              t.has_paused.clear()
+              if coord.should_stop():
+                return None
+              done.append(t.done)
+          if coord.should_stop():
+            return None
+          all_done = all(done)
+          if not all_done:
+            if any(done):
+              raise RuntimeError("Some towers made a different number of "
+                                 "tower_context().merge_call() calls.")
+            # get_tower_context().merge_call() case
+            merge_args = values.regroup(
+                {t.device: t.merge_args for t in threads})
+            merge_kwargs = values.regroup(
+                {t.device: t.merge_kwargs for t in threads})
+            merge_result = threads[0].merge_fn(
+                self, *merge_args, **merge_kwargs)
+            for t in threads:
+              t.merge_result = values.select_device(t.device, merge_result)
+    finally:
+      for t in threads:
+        t.should_run.set()
+      coord.join(threads)
+
+    return values.regroup({t.device: t.main_result for t in threads})
+
+  def map(self, map_over, fn, *args, **kwargs):
+    # TODO(josh11b): In eager mode, use one thread per device.
+    index = {}
+    i = 0
+    for m in map_over:
+      d = self._devices[i % len(self._devices)]
+      with ops.device(d):
+        l = index.get(d, [])
+        l.append(fn(m,
+                    *values.select_device_mirrored(d, args),
+                    **values.select_device_mirrored(d, kwargs)))
+        index[d] = l
+    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
+    # in addition to PerDevice data.
+    return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
+
+  def configure(self, session_config=None):
+    if self._cross_tower_ops is None:
+      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
+          self._devices, session_config=session_config)
+
+  def _get_cross_tower_ops(self):
+    if self._cross_tower_ops is None:
+      self._cross_tower_ops = (
+          cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps())
+    return self._cross_tower_ops
+
+  def _reduce(self, method_string, value, destinations):
+    if len(self._devices) == 1 and not isinstance(value, values.PerDevice):
+      value = values.PerDevice({self._devices[0]: value})
+    assert isinstance(value, values.PerDevice)
+
+    return self._get_cross_tower_ops().reduce(
+        method_string, value, destinations=destinations)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return self._get_cross_tower_ops().batch_reduce(method_string,
+                                                    value_destination_pairs)
+
+  def _update(self, var, fn, *args, **kwargs):
+    # TODO(josh11b): Also support TowerLocalVariables here? If so, args and
+    # kwargs don't need to be mirrored.
+    assert isinstance(var, values.MirroredVariable)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.regroup(updates, values.Mirrored)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    assert isinstance(colocate_with, list)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d in colocate_with:
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(*values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.regroup(updates, values.Mirrored)
+
+  def _fetch(self, val, destination, fn):
+    """Return a copy of `val` or `fn(val)` on `destination`."""
+    assert isinstance(destination, six.string_types)
+    if isinstance(val, values.TowerLocalVariable):
+      val = self.reduce(val.reduce_method, val, destinations=destination)
+      with ops.device(destination):
+        return fn(self.unwrap(val)[0])
+
+    assert isinstance(val, values.Mirrored), (
+        "val = %s (type %s)" % (val, val.__class__.__name__))
+    if val.on_device(destination):
+      with ops.device(destination):
+        # Use an identity here to make sure we are returning a tensor
+        # instead of e.g. a variable object.
+        return array_ops.identity(fn(val.get(destination)))
+    device = None
+    for d in self._devices:
+      if val.on_device(d):
+        device = d
+        break
+    assert device is not None, (
+        "Could not find destination %s in list of devices %s." %
+        (destination, val.devices))
+    with ops.device(device):
+      v = fn(val.get(device))
+    with ops.device(destination):
+      return array_ops.identity(v)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_device_set:
+        return [val.get(device=d) for d in self._devices]
+      return [val.get(device=d) for d in sorted(val.devices)]
+    return [val]
+
+  @property
+  def is_single_tower(self):
+    return len(self._devices) == 1
+
+  @property
+  def num_towers(self):
+    return len(self._devices)
+
+  def _worker_device_index(self):
+    return self._device_index
+
+  @property
+  def worker_devices(self):
+    # Make a copy to prevent users from accidentally mutating our copy.
+    return list(self._devices)
+
+  @property
+  def parameter_devices(self):
+    return list(self._devices)
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return list(self._devices)
+
+  def _get_devices_from(self, colocate_with=None):
+    if colocate_with is None:
+      return self._devices
+    elif isinstance(colocate_with, values.DistributedValues):
+      # pylint: disable=protected-access
+      return list(colocate_with._index.keys())
+    elif isinstance(colocate_with, six.string_types):
+      return [colocate_with]
+    else:
+      return colocate_with
+
+  class _MirroredTowerThread(threading.Thread):
+    """A thread that runs() a function on a device."""
+
+    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
+                 **kwargs):
+      super(MirroredStrategy._MirroredTowerThread, self).__init__()  # pylint: disable=protected-access
+      self.coord = coord
+      self.distribution = dist
+      self.device = device
+      self.tower_id = dist.worker_devices.index(device)
+      self.variable_creator_fn = variable_creator_fn
+      # State needed to run and return the results of `fn`.
+      self.main_fn = fn
+      self.main_args = args
+      self.main_kwargs = kwargs
+      self.main_result = None
+      self.done = False
+      # State needed to run the next merge_call() (if any) requested via
+      # TowerContext.
+      self.merge_fn = None
+      self.merge_args = None
+      self.merge_kwargs = None
+      self.merge_result = None
+      # We use a thread.Event for the main thread to signal when this
+      # thread should start running (`should_run`), and another for
+      # this thread to transfer control back to the main thread
+      # (`has_paused`, either when it gets to a
+      # `get_tower_context().merge_call` or when `fn` returns). In
+      # either case the event starts cleared, is signaled by calling
+      # set(). The receiving thread waits for the signal by calling
+      # wait() and then immediately clearing the event using clear().
+      self.should_run = threading.Event()
+      self.has_paused = threading.Event()
+      # These fields have to do with inheriting various contexts from the
+      # parent thread:
+      # pylint: disable=protected-access
+      self.context_mode = context.context()._eager_context.mode
+      if not context.context()._context_handle:
+        context.context()._initialize_handle_and_devices()
+      self.context_device_policy = (
+          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+              context.context()._context_handle))
+      self.graph = ops.get_default_graph()
+      self._variable_creator_stack = self.graph._variable_creator_stack[:]
+      self._captured_var_scope = variable_scope.get_variable_scope()
+      # Adding a "/" at end lets us re-enter this scope later.
+      self._captured_name_scope = self.graph.get_name_scope()
+      if self._captured_name_scope:
+        self._captured_name_scope += "/"
+      if self.tower_id > 0:
+        if not self._captured_name_scope:
+          self._captured_name_scope = ""
+        self._captured_name_scope += "tower_%d/" % self.tower_id
+
+    def run(self):
+      # pylint: disable=protected-access
+      self.graph._variable_creator_stack = self._variable_creator_stack
+      self.should_run.wait()
+      self.should_run.clear()
+      try:
+        if self.coord.should_stop():
+          return
+        with self.coord.stop_on_exception(), \
+            context.context()._mode(self.context_mode), \
+            context.context().device_policy(self.context_device_policy), \
+            self.graph.as_default(), \
+            MirroredTowerContext(self.distribution, self.tower_id), \
+            ops.device(self.device), \
+            ops.name_scope(self._captured_name_scope), \
+            variable_scope.variable_scope(
+                self._captured_var_scope, reuse=self.tower_id > 0), \
+            variable_scope.variable_creator_scope(self.variable_creator_fn):
+          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+          self.done = True
+      finally:
+        self.has_paused.set()
+
+
+class MirroredTowerContext(distribute_lib.TowerContext):
+  """TowerContext used in MirroredStrategy.call_for_each_tower().
+
+  Opened in `_MirroredTowerThread`, to allow the user to invoke
+  `MirroredStrategy`'s specific implementation of `merge_call()`,
+  which works by delegating the function and its arguments to
+  the main thread (the one that invoked
+  `MirroredStrategy.call_for_each_tower()`).
+  """
+
+  def _merge_call(self, fn, *args, **kwargs):
+    """Delegate to the main thread to actually perform merge_call()."""
+    t = threading.current_thread()  # a _MirroredTowerThread
+    t.merge_fn = fn
+    t.merge_args = args
+    t.merge_kwargs = kwargs
+    t.has_paused.set()
+    t.should_run.wait()
+    t.should_run.clear()
+    if t.coord.should_stop():
+      raise _RequestedStop()
+    return t.merge_result
+
+  @property
+  def device(self):
+    distribute_lib.require_tower_context(self)
+    return self._distribution_strategy.worker_devices[self._tower_id]
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c5c055070c0fc88ed8f3a459e3f346596f077a6
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -0,0 +1,436 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Multi-GPU tests for MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+
+GPU_TEST = "test_gpu" in sys.argv[0]
+
+
+class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    if GPU_TEST:
+      self.assertGreater(context.num_gpus(), 0)
+      if context.num_gpus() > 1:
+        devices = ["/device:GPU:0", "/device:GPU:1"]
+    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
+    return mirrored_strategy.MirroredStrategy(devices)
+
+  def testMinimizeLossEager(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    soft_placement = not GPU_TEST
+    print("testMinimizeLossGraph soft_placement:", soft_placement)
+    self._test_minimize_loss_graph(
+        self._get_distribution_strategy(), soft_placement=soft_placement)
+
+  def testMapReduce(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_tower_id(self._get_distribution_strategy())
+
+  def testNumTowers(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self.assertEqual(2, self._get_distribution_strategy().num_towers)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRunRegroupError(self):
+
+    def run_fn(device_id):
+      # Generates a list with different lengths on different devices.
+      # Will fail in _regroup() (if more than one device).
+      return list(range(device_id))
+
+    dist = self._get_distribution_strategy()
+    with dist.scope(), self.assertRaises(AssertionError):
+      dist.call_for_each_tower(run_fn, dist.worker_device_index)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testReduceToCpu(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+
+    def run_fn(device_id):
+      return device_id
+
+    dist = self._get_distribution_strategy()
+    with dist.scope():
+      result = dist.call_for_each_tower(run_fn, dist.worker_device_index)
+      reduced = dist.reduce("sum", result, destinations="/device:CPU:0")
+      unwrapped = dist.unwrap(reduced)
+      self.assertEqual(1, len(unwrapped))
+      expected = sum(range(len(dist.worker_devices)))
+      self.assertEqual(expected, self.evaluate(unwrapped[0]))
+
+
+@test_util.with_c_api
+class MirroredStrategyVariableCreationTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _skip_eager_if_gpus_less_than(self, num_gpus):
+    if context.num_gpus() < num_gpus and context.executing_eagerly():
+      self.skipTest("Enough GPUs not available for this test in eager mode.")
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSingleVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      # This variable should be created only once across the threads because of
+      # special variable_creator functions used by `dist.call_for_each_tower`.
+      v = variable_scope.variable(1.0, name="foo")
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEquals("foo:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnnamedVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      v = variable_scope.variable(1.0)
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      # Default name of "Variable" will be used.
+      self.assertEquals("Variable:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleVariables(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      vs = []
+      for i in range(5):
+        vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return vs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      for i, v in enumerate(result):
+        self.assertIsInstance(v, values.MirroredVariable)
+        self.assertEquals("foo" + str(i) + ":0", v.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleVariablesWithSameCanonicalName(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      vs = []
+      vs.append(variable_scope.variable(1.0, name="foo/bar"))
+      vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
+      vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
+      vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return vs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      for v in result:
+        self.assertIsInstance(v, values.MirroredVariable)
+      self.assertEquals(4, len(result))
+      self.assertEquals("foo/bar:0", result[0].name)
+      self.assertEquals("foo_1/bar:0", result[1].name)
+      self.assertEquals("foo_1/bar_1:0", result[2].name)
+      self.assertEquals("foo/bar_1:0", result[3].name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableWithSameCanonicalNameAcrossThreads(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn(device_id):
+      v = variable_scope.variable(1.0, name="foo_" + str(device_id))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(
+          model_fn, dist.worker_device_index, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      # The resulting mirrored variable will use the name from the first device.
+      self.assertEquals("foo_0:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testWithLayers(self):
+    self._skip_eager_if_gpus_less_than(1)
+    def model_fn(features):
+      with variable_scope.variable_scope("common"):
+        layer1 = core.Dense(1)
+        layer1(features)
+        layer2 = core.Dense(1)
+        layer2(features)
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        layer3 = core.Dense(1)
+        layer3(features)
+        return [(layer1.kernel, layer1.bias),
+                (layer2.kernel, layer2.bias),
+                (layer3.kernel, layer3.bias)]
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+    features = dist.distribute_dataset(
+        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
+    ).make_one_shot_iterator().get_next()
+
+    with dist.scope():
+      result = dist.call_for_each_tower(
+          model_fn, features, run_concurrently=False)
+      suffixes = ["", "_1", "_2"]
+      for (kernel, bias), suffix in zip(result, suffixes):
+        self.assertIsInstance(kernel, values.MirroredVariable)
+        self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name)
+        self.assertIsInstance(bias, values.MirroredVariable)
+        self.assertEquals("common/dense" + suffix + "/bias:0", bias.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testWithGetVariableAndVariableScope(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      v0 = variable_scope.get_variable("var-thread0", [1])
+      with variable_scope.variable_scope("common"):
+        v1 = variable_scope.get_variable("var-thread1", [1])
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        v2 = variable_scope.get_variable("var-thread2", [1])
+
+      return v0, v1, v2
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dist = mirrored_strategy.MirroredStrategy(devices)
+    with dist.scope():
+      with variable_scope.variable_scope("main"):
+        v = variable_scope.get_variable("var-main0", [1])
+        self.assertEquals("main/var-main0:0", v.name)
+
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        self.assertEquals(3, len(result))
+        v0, v1, v2 = result
+        self.assertIsInstance(v0, values.MirroredVariable)
+        self.assertEquals("main/var-thread0:0", v0.name)
+        self.assertIsInstance(v1, values.MirroredVariable)
+        self.assertEquals("main/common/var-thread1:0", v1.name)
+        self.assertIsInstance(v2, values.MirroredVariable)
+        self.assertEquals("main/common/var-thread2:0", v2.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testThreeDevices(self):
+    self._skip_eager_if_gpus_less_than(2)
+
+    def model_fn():
+      v = variable_scope.variable(1.0, name="foo")
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEquals("foo:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testNonMatchingVariableCreation(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn(name):
+      v = variable_scope.variable(1.0, name=name)
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      names = values.DistributedValues({
+          "/device:CPU:0": "foo",
+          "/device:GPU:0": "bar"
+      })
+      with self.assertRaises(RuntimeError):
+        _ = dist.call_for_each_tower(model_fn, names, run_concurrently=False)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTowerLocalVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    all_v_sum = {}
+    all_v_mean = {}
+
+    def model_fn(device_id):
+      tower_context = distribute_lib.get_tower_context()
+      with tower_context.tower_local_var_scope("sum"):
+        v_sum = variable_scope.variable(1.0)
+      with tower_context.tower_local_var_scope("mean"):
+        v_mean = variable_scope.variable(4.0)
+      self.assertTrue(isinstance(v_sum, values.TowerLocalVariable))
+      self.assertTrue(isinstance(v_mean, values.TowerLocalVariable))
+      updates = [v_sum.assign_add(2.0 + device_id),
+                 v_mean.assign(6.0 * device_id)]
+      all_v_sum[device_id] = v_sum
+      all_v_mean[device_id] = v_mean
+      return updates, v_sum, v_mean
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      # Create "sum" and "mean" versions of TowerLocalVariables.
+      ret_ops, ret_v_sum, ret_v_mean = dist.call_for_each_tower(
+          model_fn, dist.worker_device_index, run_concurrently=False)
+      # Should see the same wrapping instance in all towers.
+      self.assertIs(all_v_sum[0], ret_v_sum)
+      self.assertIs(all_v_mean[0], ret_v_mean)
+      for i in range(1, dist.num_towers):
+        self.assertIs(all_v_sum[0], all_v_sum[1])
+        self.assertIs(all_v_mean[0], all_v_mean[1])
+
+      # Apply updates
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate([y for x in ret_ops for y in dist.unwrap(x)])
+      expected_sum = 0.0
+      expected_mean = 0.0
+      for i, d in enumerate(dist.worker_devices):
+        # Test access within a device scope, should see different values.
+        with ops.device(d):
+          v_sum_value = self.evaluate(ret_v_sum.read_value())
+          v_mean_value = self.evaluate(ret_v_mean.read_value())
+          expected = i + 3.0
+          self.assertEqual(expected, v_sum_value)
+          expected_sum += expected
+          expected = i * 6.0
+          self.assertEqual(expected, v_mean_value)
+          expected_mean += expected
+
+      # fetch() should return the value you get by applying the
+      # reduction across all towers.
+      self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum)))
+      expected_mean /= len(dist.worker_devices)
+      self.assertEqual(expected_mean, self.evaluate(dist.fetch(ret_v_mean)))
+
+  # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
+  # testing this in eager mode.
+
+  def testNameScope(self):
+    def model_fn():
+      with ops.name_scope("foo"):
+        a = constant_op.constant(1.0, name="a")
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        b = constant_op.constant(1.0, name="b")
+      return a, b
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      with ops.name_scope("main"):
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        self.assertEquals(2, len(result))
+        for v, name in zip(result, ["a", "b"]):
+          self.assertIsInstance(v, values.DistributedValues)
+          v0, v1 = dist.unwrap(v)
+          self.assertEquals("main/foo/" + name + ":0", v0.name)
+          self.assertEquals("main/tower_1/foo/" + name + ":0", v1.name)
+
+  def testWithDefaultName(self):
+    def model_fn():
+      with ops.name_scope(None, "foo"):
+        a = constant_op.constant(1.0, name="a")
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        b = constant_op.constant(2.0, name="b")
+      return a, b
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertEquals(2, len(result))
+      for v, name in zip(result, ["a", "b"]):
+        self.assertIsInstance(v, values.DistributedValues)
+        v0, v1 = dist.unwrap(v)
+        self.assertEquals("foo/" + name + ":0", v0.name)
+        self.assertEquals("tower_1/foo/" + name + ":0", v1.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1ef0ecc77a8e8432dfa4eb6da7c324b371dab70
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
@@ -0,0 +1,91 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import distribute as distribute_lib
+
+
+@test_util.with_c_api
+class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return mirrored_strategy.MirroredStrategy(["/device:CPU:0"])
+
+  def testMinimizeLossEager(self):
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testMapReduce(self):
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    self._test_tower_id(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+
+@test_util.with_c_api
+class VariableCreatorStackTest(test.TestCase):
+
+  def testCreatorStacksAreThreadLocal(self):
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dist = mirrored_strategy.MirroredStrategy(devices)
+
+    def model_fn(device_id):
+      assert isinstance(device_id, int)
+      def thread_creator_fn(next_creator, *args, **kwargs):
+        return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
+
+      with variable_scope.variable_creator_scope(thread_creator_fn):
+        # Create a variable in this scope.
+        v = variable_scope.variable(1.0)
+
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    def main_thread_creator(next_creator, *args, **kwargs):
+      # We are not using the underlying next_creator for test purposes.
+      del next_creator, args, kwargs
+      return "main_thread"
+
+    with context.graph_mode(), \
+        dist.scope(), \
+        variable_scope.variable_creator_scope(main_thread_creator):
+      result = dist.call_for_each_tower(model_fn, dist.worker_device_index)
+      result = dist.unwrap(result)
+      expected = ["main_thread:thread_0", "main_thread:thread_1"]
+      self.assertEquals(expected, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7644acedc99361d7287a91832d76bc68cbc6ac0a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Monitor is responsible for training, checkpointing and recovery."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import variables
+
+
+class Monitor(object):
+  """Executes training steps, recovers and checkpoints.
+
+  Note that this class is particularly preliminary, experimental, and
+  expected to change.
+  """
+  # TODO(isaprykin): Support step functions that need multiple session calls.
+  # TODO(isaprykin): Support extra arguments to the step function.
+  # TODO(isaprykin): Support recovery, checkpointing and summaries.
+
+  def __init__(self, step_callable, session=None):
+    """Initialize the Monitor with components for executing training steps.
+
+    Args:
+      step_callable: a training `Step` that's capable of signaling when done.
+      session: a `Session` instance that's needed for graph mode.
+
+    Raises:
+      ValueError: if `session` was provided for eager mode or not provided for
+        graph mode.
+    """
+    if context.executing_eagerly():
+      if session is not None:
+        raise ValueError("Should not provide a `session` in Eager mode.")
+      self._run_step = step_callable
+    else:
+      if session is None:
+        raise ValueError("Should provide a `session` in Graph mode.")
+      self._run_step = session.make_callable(step_callable())
+      session.run(variables.global_variables_initializer())
+
+  def run_steps(self, num_steps=None):
+    step = 0
+    while num_steps is None or step < num_steps:
+      try:
+        self._run_step()
+        step += 1
+      except errors.OutOfRangeError:
+        break
diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8277e1e7919e86ef616b31d0986589dcc9c49bbd
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/monitor_test.py
@@ -0,0 +1,84 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class Monitor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import monitor as monitor_lib
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.training import gradient_descent
+
+
+class MonitorTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=combinations.graph_and_eager_modes)))
+  def testTrainNetwork(self, distribution, optimizer_fn):
+    with distribution.scope():
+      single_loss_step, layer = single_loss_example(optimizer_fn, distribution)
+
+      if context.executing_eagerly():
+        monitor = monitor_lib.Monitor(single_loss_step, None)
+      else:
+        with self.test_session() as sess:
+          monitor = monitor_lib.Monitor(single_loss_step, sess)
+
+      monitor.run_steps(1)
+
+      self.assertEqual(1, len(layer.trainable_variables))
+      mirrored_weight_variable = layer.trainable_variables[0]
+      start_error = self.evaluate(distribution.fetch(mirrored_weight_variable))
+      start_error = abs(numpy.array(start_error) - 1)
+
+      monitor.run_steps(9)
+      end_error = self.evaluate(distribution.fetch(mirrored_weight_variable))
+      end_error = abs(numpy.array(end_error) - 1)
+      self.assertGreaterEqual(start_error, end_error)
+
+  def testPassingASessionInEager(self):
+    distribution = one_device_strategy.OneDeviceStrategy(
+        "/device:CPU:0")
+    step_function, _ = single_loss_example(
+        lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution)
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(ValueError, "Should not provide"):
+        _ = monitor_lib.Monitor(step_function, sess)
+
+  def testNotPassingASessionInGraph(self):
+    distribution = one_device_strategy.OneDeviceStrategy(
+        "/device:CPU:0")
+    step_function, _ = single_loss_example(
+        lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution)
+
+    with context.graph_mode(), ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError, "Should provide"):
+        _ = monitor_lib.Monitor(step_function, session=None)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..646d2a5c3b3b0bfcce6f89be0e588baacc6b9237
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -0,0 +1,143 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class OneDeviceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.distribute.python import values
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import distribute as distribute_lib
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+class OneDeviceStrategy(distribute_lib.DistributionStrategy):
+  """A distribution strategy for running on a single device."""
+  # TODO(josh11b): Do we wrap values in types to generate errors if you are
+  # doing something that won't work with other DistributionStrategy
+  # implementations?
+
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__()
+    self._device = device
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # No need to distinguish tower-local variables when not mirroring,
+    # we just enforce that they are not trainable.
+    if kwargs.pop("tower_local_reduce_method", None) is not None:
+      kwargs["trainable"] = False
+
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      with ops.device(self._device):
+        return next_creator(*args, **kwargs)
+    if isinstance(colocate_with, six.string_types):
+      with ops.device(colocate_with):
+        return next_creator(*args, **kwargs)
+    if (isinstance(colocate_with, list) and len(colocate_with) == 1 and
+        isinstance(colocate_with[0], six.string_types)):
+      with ops.device(colocate_with[0]):
+        return next_creator(*args, **kwargs)
+    with ops.colocate_with(colocate_with):
+      return next_creator(*args, **kwargs)
+
+  def distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
+
+  def _broadcast(self, tensor, destinations):
+    return tensor
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    # We don't run `fn` in multiple threads in OneDeviceStrategy.
+    kwargs.pop("run_concurrently", None)
+    with ops.device(self._device), _OneDeviceTowerContext(self):
+      return fn(*args, **kwargs)
+
+  def map(self, map_over, fn, *args, **kwargs):
+    with ops.device(self._device):
+      return values.MapOutput([fn(m, *args, **kwargs) for m in map_over])
+
+  def _reduce(self, method_string, value, destinations):
+    if not isinstance(value, values.MapOutput):
+      return value
+    l = value.get()
+    assert l
+    with ops.device(self._device):
+      if method_string == "sum":
+        return math_ops.add_n(l)
+      elif method_string == "mean":
+        return math_ops.add_n(l) / len(l)
+      else:
+        assert False
+
+  def _update(self, var, fn, *args, **kwargs):
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      return fn(var, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    del colocate_with
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      return fn(*args, **kwargs)
+
+  def _fetch(self, val, destination, fn):
+    """Return a copy of `val` or `fn(val)` on `destination`."""
+    with ops.device(self._device):
+      v = fn(val)
+    with ops.device(destination):
+      return array_ops.identity(v)
+
+  def _unwrap(self, value):
+    return [value]
+
+  @property
+  def is_single_tower(self):
+    return True
+
+  @property
+  def num_towers(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    return [self._device]
+
+  @property
+  def parameter_devices(self):
+    return [self._device]
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return [self._device]
+
+  def _worker_device_index(self):
+    return 0
+
+
+class _OneDeviceTowerContext(distribute_lib.TowerContext):
+
+  def __init__(self, distribution_strategy):
+    distribute_lib.TowerContext.__init__(
+        self, distribution_strategy, tower_id=0)
+
+  @property
+  def device(self):
+    return self._distribution_strategy.worker_devices[0]
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7101ed0756f44b846f10ddc6d429afe005a2f196
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class OneDeviceStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+
+
+@test_util.with_c_api
+class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+
+  def testMinimizeLossEager(self):
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testMapReduce(self):
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    self._test_tower_id(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..abd3a65ac4e19ece6b69b9834f4218fde55b60c2
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for running legacy optimizer code with DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+
+
+class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v2_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testTrainNetwork(self, distribution, optimizer_fn,
+                       use_callable_loss=True):
+    with distribution.scope():
+      model_fn, dataset_fn, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
+
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return control_flow_ops.group(distribution.unwrap(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built)))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3670b45aba801cf8c18e04bfea03e23eb67184
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -0,0 +1,225 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Extension of prefetching_ops to support more than one device."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.util import nest
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset.
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    devices: Devices on which to prefetch.
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server). Only used if one_shot
+        is False.
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               devices,
+               buffer_size,
+               shared_name=None):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+    self._devices = devices
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    target_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+    self._buffering_resources = []
+    for device in nest.flatten(self._devices):
+      with ops.device(device):
+        buffer_resource_handle = prefetching_ops.function_buffering_resource(
+            f=_prefetch_fn,
+            target_device=target_device,
+            string_arg=input_iterator_handle,
+            buffer_size=buffer_size,
+            shared_name=shared_name)
+        self._buffering_resources.append(buffer_resource_handle)
+
+    if not self._one_shot:
+      reset_ops = []
+      for buffer_resource in self._buffering_resources:
+        reset_ops.append(
+            prefetching_ops.function_buffering_resource_reset(buffer_resource))
+      with ops.control_dependencies(reset_ops):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
+  def get_next(self, name=None):
+    """See @{tf.data.Iterator.get_next}."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_result = []
+    # TODO(priyag): This will fail if the input size (typically number of
+    # batches) is not divisible by number of devices.
+    # How do we handle that more gracefully / let the user know?
+    for buffer_resource in self._buffering_resources:
+      flat_ret = gen_dataset_ops.function_buffering_resource_get_next(
+          buffer_resource,
+          output_types=data_nest.flatten(sparse.as_dense_types(
+              self.output_types, self.output_classes)), name=name)
+
+      ret = sparse.deserialize_sparse_tensors(
+          data_nest.pack_sequence_as(self.output_types, flat_ret),
+          self.output_types, self.output_shapes, self.output_classes)
+
+      for tensor, shape in zip(
+          data_nest.flatten(ret), data_nest.flatten(self.output_shapes)):
+        if isinstance(tensor, ops.Tensor):
+          tensor.set_shape(shape)
+      flat_result.append(ret)
+
+    return nest.pack_sequence_as(self._devices, flat_result)
+
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.Dataset):
+  """A `Dataset` whose iterator prefetches elements to other device(s)."""
+
+  def __init__(self, input_dataset, devices, buffer_size):
+    self._input_dataset = input_dataset
+    self._devices = devices
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  def make_one_shot_iterator(self):
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=True,
+        devices=self._devices,
+        buffer_size=self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        devices=self._devices,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_devices()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  # TODO(priyag): Fix the output types, shapes and classes to match the result
+  # of get_next (which has the additional nesting layer of devices now).
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def prefetch_to_devices(devices, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `devices`.
+
+  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    devices: A nested structure of devices on which to prefetch the data. It can
+      be a single device name, or a tuple or list of device names.
+    buffer_size: (Optional.) The number of elements to buffer on each device.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, devices, buffer_size)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a68dbce6c7d03f6a1695ebfcd00178e21ac1cda0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -0,0 +1,90 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for prefetching_ops_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class PrefetchingOpsV2Test(test.TestCase):
+
+  def testPrefetchToOneDevice(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToTwoDevicesInAList(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    output = []
+    with self.test_session() as sess:
+      for _ in range(5):
+        result = sess.run(next_element)
+        self.assertEqual(2, len(result))
+        output.extend(result)
+      self.assertEquals(set(range(10)), set(output))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToTwoDevicesWithReinit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(5):
+        sess.run(next_element)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      sess.run(iterator.initializer)
+      for _ in range(5):
+        sess.run(next_element)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/contrib/distribute/python/shared_variable_creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7083e279f20803b227dcd52f6420ae832aa2df4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to re-use variables created on first device on subsequent devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+_VARIABLE_UNIQUIFYING_REGEX = re.compile(r"_\d/")
+_VARIABLE_UNIQUIFYING_REGEX_AT_END = re.compile(r"_\d$")
+
+
+def _canonicalize_variable_name(name):
+  # If no name is specified, uses default name "Variable".
+  if name is None:
+    return "Variable"
+  # Replace all instances of "_<num>/" with "/"
+  name = _VARIABLE_UNIQUIFYING_REGEX.sub("/", name)
+  # Replace any instances of "_<num>" at the end of the string with ""
+  name = _VARIABLE_UNIQUIFYING_REGEX_AT_END.sub("", name)
+  return name
+
+
+def make_fn(shared_variable_store, device_id):
+  """Construct the variable creator function for device `device_id`.
+
+  Constructs custom variable creator functions for the given device.
+  On first device (device_id == 0), it creates the variable using the
+  `next_creator`, and stores it in the provided `shared_variable_store`.
+  On all other devices (device_id > 0), it tries to re-use the variable
+  already created with the same name. If no such variable exists, it throws an
+  error.
+  Additionally, we de-uniquify variable names before checking for matches. This
+  helps re-use variables which are intended to be the same but have different
+  names due to variable uniquification happening upstream. Since this might
+  mean we may have multiple variables with the same canonical name, we store
+  them in a list per canonical name and return them in the same order as well.
+
+  Args:
+    shared_variable_store: A dictionary that we will use to store variables
+      created on the first device, and re-used by creators for other devices.
+    device_id: Integer index of the device whose creator should be
+      constructed.
+
+  Returns:
+    An appropriate creator function based on device_id.
+
+  """
+  variable_scope_access_index = {}
+  assert isinstance(device_id, int)
+
+  def create_new_variable(next_creator, *args, **kwargs):
+    """Create the variable using `next_creator` and store it."""
+    canonical_name = _canonicalize_variable_name(kwargs.get("name"))
+    v = next_creator(*args, **kwargs)
+
+    if canonical_name not in shared_variable_store:
+      shared_variable_store[canonical_name] = []
+    shared_variable_store[canonical_name].append(v)
+    return v
+
+  def reuse_variable(next_creator, *args, **kwargs):
+    """Re-use existing variable from store with same name (in order)."""
+    del next_creator, args
+    name = kwargs.get("name")
+    canonical_name = _canonicalize_variable_name(name)
+
+    try:
+      variable_index = variable_scope_access_index.get(canonical_name, 0)
+      v = shared_variable_store[canonical_name][variable_index]
+      # TODO(priyag): Make this variable re-use more robust by adding checks
+      # that the requested shape and dtype match the existing variable.
+      variable_scope_access_index[canonical_name] = variable_index + 1
+      return v
+    except (KeyError, IndexError):
+      raise RuntimeError(
+          "Tried to create variable {} with mismatching name on device {}".
+          format(name, device_id))
+
+  if device_id == 0:
+    return create_new_variable
+  else:
+    return reuse_variable
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..713494d603b855be2863af9f24ab98d4cf048042
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SharedVariableCreator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+
+
+class CanonicalizeVariableNameTest(test.TestCase):
+
+  def _canonicalize(self, name):
+    return shared_variable_creator._canonicalize_variable_name(name)
+
+  def testNoName(self):
+    self.assertEquals("Variable", self._canonicalize(None))
+
+  def testPatternInMiddle(self):
+    self.assertEquals("foo/bar/baz", self._canonicalize("foo_1/bar_1/baz"))
+
+  def testPatternAtEnd(self):
+    self.assertEquals("foo", self._canonicalize("foo_1"))
+
+  def testWrongPatterns(self):
+    self.assertEquals("foo_1:0", self._canonicalize("foo_1:0"))
+    self.assertEquals("foo1", self._canonicalize("foo1"))
+    self.assertEquals("foo_a", self._canonicalize("foo_a"))
+
+
+@test_util.with_c_api
+class SharedVariableCreatorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSharedVariable(self):
+
+    shared_variable_store = {}
+    num_devices = 3
+    creator_fns = []
+    for i in range(num_devices):
+      creator_fn = shared_variable_creator.make_fn(shared_variable_store, i)
+      creator_fns.append(creator_fn)
+
+    with variable_scope.variable_creator_scope(creator_fns[0]):
+      v0 = variable_scope.variable(1.0, name="foo")
+
+    with variable_scope.variable_creator_scope(creator_fns[1]):
+      v1 = variable_scope.variable(1.0, name="foo")
+
+    with variable_scope.variable_creator_scope(creator_fns[2]):
+      v2 = variable_scope.variable(1.0, name="foo")
+
+    # v1 and v2 should be same as v0
+    self.assertIs(v1, v0)
+    self.assertIs(v2, v0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..0db0b59fcacee2785eb8191bb84ed5216a79b081
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple network to use in tests and examples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.distribute.python import step_fn
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.layers import core
+from tensorflow.python.layers import normalization
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def single_loss_example(optimizer_fn, distribution, use_bias=False):
+  """Build a very simple network to use in tests and examples."""
+
+  def dataset_fn():
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
+  optimizer = optimizer_fn()
+  layer = core.Dense(1, use_bias=use_bias)
+
+  def loss_fn(x):
+    y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+    return y * y
+
+  single_loss_step = step_fn.StandardSingleLossStep(dataset_fn, loss_fn,
+                                                    optimizer, distribution)
+
+  # Layer is returned for inspecting the kernels in tests.
+  return single_loss_step, layer
+
+
+def minimize_loss_example(optimizer_fn,
+                          use_bias=False,
+                          use_callable_loss=True,
+                          create_optimizer_inside_model_fn=False):
+  """Example of non-distribution-aware legacy code."""
+
+  def dataset_fn():
+    dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+    # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be
+    # fully defined for TPU.  Remove this when XLA supports dynamic shapes.
+    return dataset.apply(
+        batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True))
+
+  # An Optimizer instance is created either outside or inside model_fn.
+  outer_optimizer = None
+  if not create_optimizer_inside_model_fn:
+    outer_optimizer = optimizer_fn()
+
+  layer = core.Dense(1, use_bias=use_bias)
+
+  def model_fn(xs):
+    """A very simple model written by the user."""
+
+    def loss_fn():
+      x = math_ops.reduce_mean(xs, keepdims=True)
+      y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+      return y * y
+
+    optimizer = outer_optimizer or optimizer_fn()
+
+    if use_callable_loss:
+      return optimizer.minimize(loss_fn)
+    else:
+      return optimizer.minimize(loss_fn())
+
+  return model_fn, dataset_fn, layer
+
+
+def batchnorm_example(optimizer_fn,
+                      batch_per_epoch=1,
+                      momentum=0.9,
+                      renorm=False):
+  """Example of non-distribution-aware legacy code with batch normalization."""
+
+  def dataset_fn():
+    # input shape is [16, 8], input values are increasing in both dimensions.
+    return dataset_ops.Dataset.from_tensor_slices(
+        [[[float(x * 8 + y + z * 100)
+           for y in range(8)]
+          for x in range(16)]
+         for z in range(batch_per_epoch)]).repeat()
+
+  optimizer = optimizer_fn()
+  batchnorm = normalization.BatchNormalization(
+      renorm=renorm, momentum=momentum, fused=False)
+
+  def model_fn(x):
+
+    def loss_fn():
+      y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1)
+      loss = math_ops.reduce_mean(y - constant_op.constant(1.))
+      return loss
+
+    # Callable loss.
+    return optimizer.minimize(loss_fn)
+
+  return model_fn, dataset_fn, batchnorm
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1910622b38c748fc5a814f9e83c2294850d5d12
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The step function abstraction represents a single training step."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.training import optimizer as optimizer_lib
+
+
+class Step(object):
+  """Interface for performing each step of a training algorithm."""
+
+  def __init__(self, distribution):
+    self._distribution = distribution
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  def __call__(self):
+    """Perform one step of this training algorithm."""
+    return self.step(self.inputs())
+
+  def inputs(self):
+    """For the generating the input to be passed to `step()`."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def step(self, inputs):
+    """Perform the main computation of this training algorithm."""
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class StandardInputStep(Step):
+  """Step with a standard implementation of input handling.
+
+  Args:
+    dataset_fn: a function that returns a tf.data Dataset that produces the
+      input for the model.
+  """
+
+  def __init__(self, dataset_fn, distribution):
+    Step.__init__(self, distribution)
+    self._distributed_input = distribution.distribute_dataset(
+        dataset_fn).make_one_shot_iterator()
+
+  def inputs(self):
+    return self._distributed_input.get_next()
+
+
+class StandardSingleLossStep(StandardInputStep):
+  """A step function that implements a training step for a feed forward network.
+
+  An instance of this class is intended to be used as a callable:
+
+  ```python
+  ...
+  step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer)
+  step.initialize(distribution)
+
+  # Run a single training step on a given DistributionStrategy:
+  step(distribution)
+  ...
+  ```
+
+  Args:
+    dataset_fn: a function that returns a tf.data Dataset that produces the
+      input for the model.
+    loss_fn: a function that returns loss.
+    optimizer: an optimizer that implements an update rule.
+    distribution: a `DistributionStrategy` object.
+  """
+
+  def __init__(self, dataset_fn, loss_fn, optimizer, distribution):
+    StandardInputStep.__init__(self, dataset_fn, distribution)
+    self._loss_fn = loss_fn
+    self._optimizer = optimizer
+    self._is_run_concurrently = False
+
+  def step(self, inputs):
+    with self._distribution.scope():
+      gradients_fn = backprop.implicit_grad(self._loss_fn)
+      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
+
+      grads_and_vars = self.distribution.call_for_each_tower(
+          gradients_fn, inputs, run_concurrently=self._is_run_concurrently)
+      # If threads use layers, then we need to run the first step sequentially,
+      # so that layers.build() is not executed in parallel.  Otherwise, multiple
+      # sets of mirrored variables are going to be created.
+      self._is_run_concurrently = True
+      return self._optimizer._distributed_apply(  # pylint: disable=protected-access
+          self.distribution, grads_and_vars)
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..75c5ec9659d193e77d219ba79977615d58841d64
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class Step."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.ops import variables
+
+
+class SingleLossStepTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=combinations.graph_and_eager_modes)))
+  def testTrainNetwork(self, distribution, optimizer_fn):
+    with distribution.scope():
+      single_loss_step, layer = single_loss_example(
+          optimizer_fn, distribution, use_bias=True)
+
+      if context.executing_eagerly():
+        run_step = single_loss_step
+      else:
+        with self.test_session() as sess:
+          run_step = sess.make_callable(single_loss_step())
+      self.evaluate(variables.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b4ad9f146bc1d6a987fbeecbb05122946137154
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -0,0 +1,225 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for testing DistributionStrategy descendants."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer
+
+
+class _TestException(Exception):
+  pass
+
+
+# May be the argument to either distribution.call_for_each_tower() or
+# get_tower_context().merge_call()
+def _raise_exception_fn(_=None):
+  raise _TestException()
+
+
+# Must be the argument to a distribution.call_for_each_tower() call, calls a
+# get_tower_context().merge_call() that raises an exception.
+def _merge_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_raise_exception_fn)
+
+
+# Must be the argument to a get_tower_context().merge_call() call, calls
+# dist.call_for_each_tower() with a function that raises an exception.
+def _call_raises_fn(dist):
+  dist.call_for_each_tower(_raise_exception_fn)
+
+
+# Must be the argument to a distribution.call_for_each_tower() call,
+# calls a get_tower_context().merge_call() that calls a
+# call_for_each_tower() that raises an exception.
+def _merge_call_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_call_raises_fn)
+
+
+# Must be the argument to a get_tower_context().merge_call() call, calls
+# dist.call_for_each_tower() with a function that calls a
+# get_tower_context().merge_call() that raises an exception.
+def _call_merge_raises_fn(dist):
+  dist.call_for_each_tower(_merge_raises_fn)
+
+
+# Must be the argument to a distribution.call_for_each_tower() call, calls a
+# get_tower_context().merge_call() that calls a call_for_each_tower() that
+# calls a get_tower_context().merge_call() that raises an exception.
+def _merge_call_merge_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_call_merge_raises_fn)
+
+
+class DistributionTestBase(test.TestCase):
+  """Some tests that should work with any DistributionStrategy."""
+
+  def _test_minimize_loss_eager(self, d):
+    with d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss(x):
+        # TODO(josh11b): What if this constant was instead a captured
+        # value?  Would it need to be a value that has been passed
+        # through d.broadcast()?
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
+      # common `implicit_grad` function and put it in DistributionStrategy.
+      grad_fn = backprop.implicit_grad(loss)
+      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)
+
+      def update(v, g):
+        return v.assign_sub(0.2 * g)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)
+
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.fetch(v)
+          before_list.append(fetched)
+          # control_dependencies irrelevant but harmless in eager execution
+          with ops.control_dependencies([fetched]):
+            g = d.reduce("sum", g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.fetch(v))
+        return before_list, after_list
+
+      for i in range(10):
+        b, a = step()
+        if i == 0:
+          before, = b  # pylint: disable=unbalanced-tuple-unpacking
+        after, = a  # pylint: disable=unbalanced-tuple-unpacking
+
+      error_before = abs(before.numpy() - 1)
+      error_after = abs(after.numpy() - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+
+  def _test_minimize_loss_graph(self, d, soft_placement=False):
+    config = config_pb2.ConfigProto()
+    config.allow_soft_placement = soft_placement
+    config.gpu_options.per_process_gpu_memory_fraction = 0.3
+    with context.graph_mode(), \
+         ops.Graph().as_default(), \
+         self.test_session(config=config) as sess, \
+         d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss(x):
+        # TODO(josh11b): What if this constant was instead a captured
+        # value?  Would it need to be a value that has been passed
+        # through d.broadcast()?
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+
+      grad_fn = backprop.implicit_grad(loss)
+
+      def update(v, g):
+        return v.assign_sub(0.2 * g)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one)
+
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.fetch(v)
+          before_list.append(fetched)
+          with ops.control_dependencies([fetched]):
+            g = d.reduce("sum", g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.fetch(v))
+        return before_list, after_list
+
+      before_out, after_out = step()
+      variables.global_variables_initializer().run()
+      for i in range(10):
+        b, a = sess.run((before_out, after_out))
+        if i == 0:
+          before, = b
+        after, = a
+
+      error_before = abs(before - 1)
+      error_after = abs(after - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+
+  def _test_map_reduce(self, d, in_graph=None):
+    with d.scope():
+      map_in = [constant_op.constant(i) for i in range(10)]
+      map_out = d.map(map_in, lambda x, y: x * y, 2)
+      observed = d.fetch(d.reduce("sum", map_out))
+      expected = 90  # 2 * (0 + 1 + ... + 9)
+      self.assertEqual(expected, observed.numpy())
+
+  def _test_device_index(self, d):
+    with d.scope():
+      expected_devices = [False] * len(d.worker_devices)
+
+      def mark_devices_fn(device_id):
+        self.assertLess(device_id, len(d.worker_devices))
+        self.assertFalse(expected_devices[device_id])
+        expected_devices[device_id] = True
+
+      d.call_for_each_tower(mark_devices_fn, d.worker_device_index)
+      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+
+  def _test_tower_id(self, d):
+    with d.scope():
+      expected_devices = [False] * len(d.worker_devices)
+
+      def mark_devices_fn():
+        tower_id = distribute_lib.get_tower_context().tower_id
+        self.assertLess(tower_id, len(d.worker_devices))
+        self.assertFalse(expected_devices[tower_id])
+        expected_devices[tower_id] = True
+
+      d.call_for_each_tower(mark_devices_fn)
+      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+
+  def _test_call_and_merge_exceptions(self, dist):
+    with dist.scope():
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_raise_exception_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_raises_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_call_raises_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_call_merge_raises_fn)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7e4fe80f3e65907fa4b48c5fe0fcfd422ba033f
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -0,0 +1,119 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU Distribution Strategy.
+
+This is experimental.  It's not ready for general use.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.contrib import tpu
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
+
+
+# TODO(isaprykin):  Consider whether inheriting is really appropriate.
+class TPUStrategy(one_device_strategy.OneDeviceStrategy):
+  """Experimental TPU distribution strategy implementation."""
+
+  def __init__(self,
+               num_cores_per_host=2,
+               iterations_per_step=2):
+    # TODO(isaprykin): Generalize the defaults.  They are currently tailored for
+    # the unit test.
+    super(TPUStrategy, self).__init__('/cpu:0')
+    # TODO(isaprykin): Auto-detect number of cores and hosts.
+    self._num_cores_per_host = num_cores_per_host
+    # TODO(isaprykin): This might have to be per-call.
+    self._iterations_per_step = iterations_per_step
+
+  def distribute_dataset(self, dataset_fn):
+    return values.PerIterationDataset(
+        self._call_dataset_fn(dataset_fn), self._iterations_per_step,
+        self._num_cores_per_host)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    kwargs.pop('run_concurrently', None)
+
+    inputs = {'args': args, 'kwargs': kwargs}
+    flat_inputs = nest.flatten(inputs)
+
+    feed_mask = [isinstance(f, values.PerIteration) for f in flat_inputs]
+
+    feeds = lambda: itertools.compress(flat_inputs, feed_mask)
+    shapes = [f.get_shape() for f in feeds()]
+    if any([not s.is_fully_defined() for s in shapes]):
+      raise ValueError(
+          'TPU currently requires fully defined shapes. Either use '
+          'set_shape() on the input tensors or use '
+          'dataset.apply(map_and_batch(..., drop_remainder=True)).')
+    types = [f.get_dtype() for f in feeds()]
+
+    def infeed_input(i):
+      """Get input, split it and then enqueue."""
+      iteration_inputs = [f.get(i) for f in feeds()]
+
+      infeed_inputs = [[inputs_per_core[core_id]
+                        for inputs_per_core in iteration_inputs]
+                       for core_id in range(self._num_cores_per_host)]
+
+      infeed_ops = []
+      for core_id, infeed_input in enumerate(infeed_inputs):
+        infeed_ops.append(
+            tpu_ops.infeed_enqueue_tuple(
+                inputs=infeed_input, shapes=shapes, device_ordinal=core_id))
+
+      with ops.control_dependencies(infeed_ops):
+        return i + 1
+
+    with ops.device('/task:0/device:CPU:0'):
+      enqueue_ops = control_flow_ops.while_loop(
+          lambda i: i < self._iterations_per_step,
+          infeed_input, [constant_op.constant(0)],
+          parallel_iterations=1)
+
+    def dequeueing_fn(*args, **kwargs):
+      """Dequeue input arguments and supply them to `fn`."""
+      del args, kwargs
+      dequeued = tpu.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
+      dequeued = iter(dequeued)
+
+      fn_inputs = []
+      for inp, is_feed in zip(flat_inputs, feed_mask):
+        if is_feed:
+          fn_inputs.append(next(dequeued))
+        else:
+          fn_inputs.append(inp)
+
+      fn_inputs = nest.pack_sequence_as(inputs, fn_inputs)
+      return fn(*fn_inputs['args'], **fn_inputs['kwargs'])
+
+    def iterate_on_tpu():
+      return tpu.repeat(self._iterations_per_step, dequeueing_fn, [])
+
+    with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
+      tpu_result = tpu.batch_parallel(
+          iterate_on_tpu, [], num_shards=self._num_cores_per_host)
+
+    return control_flow_ops.group(tpu_result, enqueue_ops)
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cb5276579f48f9ea5781c5351cbf9bf3db16e6c
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -0,0 +1,635 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed values.
+
+See go/tf-distribution-strategy.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+import six
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import saver
+from tensorflow.python.util import nest
+
+
+# pylint: disable=line-too-long
+# TODO(josh11b): Should device values be strings or DeviceSpec objects
+# Not sure DeviceSpec objects are usable as a dict key.
+class DistributedValues(object):
+  """Holds a map from device to values. Either PerDevice or Mirrored."""
+
+  def __init__(self, index):
+    self._index = {device_util.canonicalize(key): value
+                   for key, value in six.iteritems(index)}
+
+  def get(self, device=None):
+    """Returns the value for the current device or raises a ValueError."""
+    if device is None:
+      tower_context = distribute_lib.get_tower_context()
+      if tower_context:
+        device = tower_context.device
+      else:
+        device = distribute_lib.get_update_device()
+        if device is None:
+          device = device_util.current()
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device]
+    except KeyError:
+      raise ValueError("Device %s not found in %s (current device %s)" %
+                       (device, self._index.keys(), device_util.current()))
+
+  def on_device(self, device):
+    device = device_util.canonicalize(device)
+    return device in self._index
+
+  @property
+  def devices(self):
+    return list(self._index.keys())
+
+  def __str__(self):
+    return "%s:%s" % (self.__class__.__name__, self._index)
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._index)
+
+  # TODO(josh11b): Possibly make an accessor for _index for use by
+  # DistributionStrategy implementations.
+
+
+class DistributedDelegate(DistributedValues):
+  """A map from device to values; acts as the same type as the values."""
+
+  def __init__(self, index):
+    super(DistributedDelegate, self).__init__(index)
+
+  def __getattr__(self, name):
+    return getattr(self.get(), name)
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self.get() + o
+  def __radd__(self, o): return o + self.get()
+  def __sub__(self, o): return self.get() - o
+  def __rsub__(self, o): return o - self.get()
+  def __mul__(self, o): return self.get() * o
+  def __rmul__(self, o): return o * self.get()
+  def __truediv__(self, o): return self.get() / o
+  def __rtruediv__(self, o): return o / self.get()
+  def __floordiv__(self, o): return self.get() // o
+  def __rfloordiv__(self, o): return o // self.get()
+  def __mod__(self, o): return self.get() % o
+  def __rmod__(self, o): return o % self.get()
+  def __lt__(self, o): return self.get() < o
+  def __le__(self, o): return self.get() <= o
+  def __gt__(self, o): return self.get() > o
+  def __ge__(self, o): return self.get() >= o
+  def __and__(self, o): return self.get() & o
+  def __rand__(self, o): return o & self.get()
+  def __or__(self, o): return self.get() | o
+  def __ror__(self, o): return o | self.get()
+  def __xor__(self, o): return self.get() ^ o
+  def __rxor__(self, o): return o ^ self.get()
+  def __getitem__(self, o): return self.get()[o]
+  def __pow__(self, o, modulo=None): return pow(self.get(), o, modulo)
+  def __rpow__(self, o): return pow(o, self.get())
+  def __invert__(self): return ~self.get()
+  def __neg__(self): return -self.get()
+  def __abs__(self): return abs(self.get())
+
+  def __div__(self, o):
+    try:
+      return self.get().__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self.get().__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self.get().__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self.get().__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  # TODO(josh11b): Even more operator overloads.
+
+
+class PerDevice(DistributedValues):
+  """Holds a map from device to unsynchronized values."""
+  pass
+
+
+class Mirrored(DistributedValues):
+  """Holds a map from device to values which are kept in sync."""
+  pass
+
+
+def _assign_on_device(device, variable, tensor):
+  with ops.device(device):
+    return variable.assign(array_ops.identity(tensor))
+
+
+DistributedVarOp = collections.namedtuple(
+    "DistributedVarOp", ["name", "graph", "type"])
+
+
+class DistributedVariable(DistributedDelegate):
+  """Holds a map from device to variables."""
+  # TODO(josh11b): Support changing the set of variables if e.g. if new
+  # devices are joining or a device is to leave.
+
+  def __init__(self, index):
+    # Child class must set self._primary_var before calling
+    # super(...).__init__(index).
+    self._common_name = self._primary_var.name.split(":")[0]
+    super(DistributedVariable, self).__init__(index)
+
+  @property
+  def initializer(self):
+    return control_flow_ops.group([v.initializer for v in self._index.values()])
+
+  @property
+  def graph(self):
+    return self._primary_var.graph
+
+  @property
+  def _shared_name(self):
+    return self._common_name
+
+  @property
+  def _unique_id(self):
+    return self._primary_var._unique_id   # pylint: disable=protected-access
+
+  @property
+  def name(self):
+    return self._primary_var.name
+
+  @property
+  def dtype(self):
+    return self._primary_var.dtype
+
+  @property
+  def shape(self):
+    return self._primary_var.shape
+
+  def get_shape(self):
+    return self._primary_var.get_shape()
+
+  def to_proto(self, export_scope=None):
+    return self._primary_var.to_proto(export_scope=export_scope)
+
+  @property
+  def op(self):
+    # We want cross-tower code that does some var.op.X calls
+    # to work (even if the current device isn't in self.devices), but
+    # other uses of var.op in a cross-tower context to fail.
+    if distribute_lib.get_cross_tower_context():
+      return DistributedVarOp(self._primary_var.op.name,
+                              self._primary_var.op.graph,
+                              self._primary_var.op.type)
+    return self.get().op
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion(var, dtype=None, name=None, as_ref=False):
+  # Try to avoid assignments to and other mutations of MirroredVariable
+  # state except through a DistributionStrategy.update() call.
+  assert not as_ref
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(DistributedVariable, _tensor_conversion)
+ops.register_dense_tensor_like_type(DistributedVariable)
+
+
+class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
+  """Class for defining how to restore a MirroredVariable."""
+
+  def __init__(self, mirrored_variable, primary_variable, name):
+    self._mirrored_variable = mirrored_variable
+    super(_MirroredSaveable, self).__init__(primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._mirrored_variable._index)])  # pylint: disable=protected-access
+
+
+def _get_update_device():
+  """Validate we are in update/update_non_slot() and return current device.
+
+  This is used in MirroredVariable.assign* members, to make sure they
+  are only called via an update method, to make sure all components of the
+  variable are being updated in a consistent way.
+
+  Returns:
+    A string device.
+
+  Raises:
+    RuntimeError: If not in distribution.update()/.update_non_slot().
+  """
+  device = distribute_lib.get_update_device()
+  if device is None:
+    raise RuntimeError(
+        "Use DistributionStrategy.update() to modify a MirroredVariable.")
+  return device
+
+
+class MirroredVariable(DistributedVariable, Mirrored,
+                       checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are kept in sync."""
+
+  def __init__(self, index, primary_var):
+    # Use a weakref to make it easy to map from the contained values
+    # to the container without introducing a reference cycle.
+    for v in six.itervalues(index):
+      v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._primary_var = primary_var
+    super(MirroredVariable, self).__init__(index)
+
+  # We use _get_update_device() for the assign* methods to enforce
+  # that we are in an update() function. The arguments to update() are
+  # automatically unwrapped so the update() function would normally
+  # see regular variables, not MirroredVariables. However, the update
+  # function can still operate on wrapped MirroredVariables through
+  # object members, captured arguments, etc. This is more likely in an
+  # update_non_slot() function (like OptimizerV2._finish), which can
+  # update several non-slot variables in one call.
+  def assign_sub(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign(*args, **kwargs)
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    MirroredVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _MirroredSaveable(self, self._primary_var, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Class for defining how to restore a TowerLocalVariable."""
+
+  def __init__(self, tower_local_variable, name):
+    self._tower_local_variable = tower_local_variable
+    # We use a callable so that we don't have to evaluate this expression
+    # in the case where we are trying to restore instead of save.
+    def tensor():
+      return distribute_lib.get_distribution_strategy().fetch(
+          tower_local_variable)
+    spec = saver.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name,
+        dtype=tower_local_variable.dtype)
+    super(_TowerLocalSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    # To preserve the sum across save and restore, we have to divide the
+    # total across all devices when restoring a variable that was summed
+    # when saving.
+    if self._tower_local_variable.reduce_method == "sum":
+      tensor *= 1. / len(self._tower_local_variable.devices)
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._tower_local_variable._index)])  # pylint: disable=protected-access
+
+
+class TowerLocalVariable(DistributedVariable, PerDevice,
+                         checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are reduced on save."""
+
+  def __init__(self, index, primary_var, reduce_method):
+    self._primary_var = primary_var
+    self._reduce_method = reduce_method
+    super(TowerLocalVariable, self).__init__(index)
+
+  def assign_sub(self, *args, **kwargs):
+    return self.get().assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    return self.get().assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    return self.get().assign(*args, **kwargs)
+
+  @property
+  def reduce_method(self):
+    return self._reduce_method
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    TowerLocalVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _TowerLocalSaveable(self, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+def _devices_match(d1, d2):
+  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
+
+
+def regroup(per_device, wrap_class=PerDevice):
+  """Makes device->nest map into a nest of PerDevice/Mirrored values."""
+  items = list(per_device.items())
+  assert items
+  v0 = items[0][1]  # First value
+
+  if isinstance(v0, list):
+    for _, v in items[1:]:
+      assert isinstance(v, list)
+      assert len(v) == len(v0), ("len(v) == %d, len(v0) == %d, v: %s, v0: %s" %
+                                 (len(v), len(v0), v, v0))
+    return [regroup({k: v[i] for k, v in items}, wrap_class)
+            for i in range(len(v0))]
+
+  if isinstance(v0, tuple):
+    for _, v in items[1:]:
+      assert isinstance(v, tuple)
+      assert len(v) == len(v0)
+    regrouped_tuple = tuple(regroup({k: v[i] for k, v in items}, wrap_class)
+                            for i in range(len(v0)))
+    if hasattr(v0, "_fields"):
+      # This tuple is in fact a namedtuple! Create a new namedtuple instance
+      # and initialize it with the regrouped values:
+      assert hasattr(type(v0), "_make")
+      return type(v0)._make(regrouped_tuple)
+    else:
+      return regrouped_tuple
+
+  if isinstance(v0, dict):
+    v0keys = set(v0.keys())
+    for _, v in items[1:]:
+      assert isinstance(v, dict)
+      assert set(v.keys()) == v0keys
+    return {key: regroup({k: v[key] for k, v in items}, wrap_class)
+            for key in v0keys}
+
+  # If exactly the same object across all devices, return it unwrapped.
+  same_id = True
+  for _, v in items[1:]:
+    if v is not v0:
+      same_id = False
+      break
+  # Consider three cases where same_id is true:
+  # * If v0 is a MirroredVariable (and same_id means it is the same
+  #   across all devices), we want to return it. We check
+  #   MirroredVariable specifically since it can look like it
+  #   has a _mirrored_container member since its members do.
+  # * If v0 is a member of a mirrored variable, in which case
+  #   hasattr(v0, "_mirrored_container") is true, we want to
+  #   return the MirroredVariable that contains it using the
+  #   _mirrored_container logic below. This case can trigger
+  #   same_id when there is only one device.
+  # * In any other situation, same_id means we return v0.
+  if same_id and (isinstance(v0, MirroredVariable) or
+                  not hasattr(v0, "_mirrored_container")):
+    return v0
+
+  # Detect the case where each device has a parallel component of the
+  # same MirroredVariable. In this case we want to return the
+  # containing MirroredVariable, after a bunch of sanity checking.
+  # In particular, each component should have the same container,
+  # and the devices of the variables should match the keys of the
+  # per-device dictionary.
+  # TODO(josh11b): Do we need similar logic for TowerLocalVariables?
+  if hasattr(v0, "_mirrored_container"):
+    # pylint: disable=protected-access
+    assert not isinstance(v0, MirroredVariable), (
+        "ids = %s, items = %s" % ([id(v[1]) for v in items], items))
+    assert _devices_match(v0.device, items[0][0]), (
+        "v0.device = %s, items = %s" % (v0.device, items))
+    mirrored_container = v0._mirrored_container()
+    assert mirrored_container is not None
+    for d, v in items[1:]:
+      assert _devices_match(v.device, d), (
+          "v.device = %s, d = %s, items = %s" % (v.device, d, items))
+      assert mirrored_container is v._mirrored_container()
+    return mirrored_container
+  # pylint: enable=protected-access
+
+  return wrap_class(per_device)
+
+
+def select_device(device, structured):
+  """Specialize a nest of regular & per-device values for one device."""
+  def _get(x):
+    return x.get(device) if isinstance(x, DistributedValues) else x
+
+  return nest.map_structure(_get, structured)
+
+
+def select_device_mirrored(device, structured):
+  """Specialize a nest of regular & mirrored values for one device."""
+  def _get_mirrored(x):
+    if isinstance(x, DistributedValues):
+      if not isinstance(x, Mirrored):
+        raise TypeError(
+            "Expected value to be mirrored across towers: %s in %s." %
+            (x, structured))
+      return x.get(device)
+    else:
+      return x
+
+  return nest.map_structure(_get_mirrored, structured)
+
+
+class PerDeviceDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`."""
+
+  def __init__(self, iterator, devices, prefetch_on_device=None):
+    self._iterator = iterator
+    self._devices = devices
+    self._prefetch_on_device = prefetch_on_device
+
+  @property
+  def initializer(self):
+    return self._iterator.initializer
+
+  def get_next(self, name=None):
+    """Scatter the input across devices."""
+    if self._prefetch_on_device:
+      data_list = self._iterator.get_next(name=name)
+      index = dict(zip(self._devices, data_list))
+    else:
+      batch = self._iterator.get_next(name=name)
+      index = {}
+      def get_ith(i):
+        return lambda x: x[i]
+
+      for i, d in enumerate(self._devices):
+        index[d] = nest.map_structure(get_ith(i), batch)
+        if context.executing_eagerly():
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+
+    return regroup(index)
+
+
+class PerDeviceDataset(object):
+  """Like `tf.data.Dataset` split devices, producing `PerDevice` data."""
+
+  def __init__(self, dataset, devices, prefetch_on_device=None):
+    self._devices = devices
+
+    # Default to using prefetching in graph mode, unless specified.
+    # TODO(priyag): Enable prefetching in eager mode.
+    self._prefetch_on_device = prefetch_on_device
+    if self._prefetch_on_device is None:
+      self._prefetch_on_device = not context.executing_eagerly()
+    assert not (self._prefetch_on_device and context.executing_eagerly()), (
+        "Prefetching is only supported in graph mode currently")
+
+    if self._prefetch_on_device:
+      self._dataset = dataset.apply(
+          prefetching_ops_v2.prefetch_to_devices(self._devices))
+    else:
+      # TODO(priyag): If dropping remainder is not appropriate, find another
+      # approach to distributing the dataset when not possible to divide evenly.
+      # Possibly not an issue when we start using PartitionedDataset.
+      self._dataset = dataset.apply(
+          batching.batch_and_drop_remainder(len(devices)))
+
+  def make_one_shot_iterator(self):
+    """Get a one time use iterator for the distributed PerDeviceDataset."""
+    dataset_iterator = self._dataset.make_one_shot_iterator()
+    return PerDeviceDataIterator(
+        dataset_iterator, self._devices, self._prefetch_on_device)
+
+  def make_initializable_iterator(self):
+    """Get an initializable iterator for the distributed PerDeviceDataset."""
+    dataset_iterator = self._dataset.make_initializable_iterator()
+    return PerDeviceDataIterator(
+        dataset_iterator, self._devices, self._prefetch_on_device)
+
+
+class PerIteration(object):
+  """Holds input for multiple iterations at once."""
+
+  def __init__(self, index):
+    self._index = index
+
+  def get(self, iteration):
+    return array_ops.gather(self._index, iteration)
+
+  def get_shape(self):
+    return self._index[-1][-1].get_shape()
+
+  def get_dtype(self):
+    return self._index[-1][-1].dtype
+
+
+class MultiIterator(object):
+  """Iterator that returns results of multiple get_next()s."""
+
+  def __init__(self, dataset_iterator, iterations, batches_per_iteration):
+    self._dataset_iterator = dataset_iterator
+    self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
+
+  def get_next(self, name=None):
+    return PerIteration([[
+        self._dataset_iterator.get_next(name=name)
+        for _ in range(self._batches_per_iteration)
+    ]
+                         for _ in range(self._iterations)])
+
+  @property
+  def initializer(self):
+    return self._dataset_iterator.initializer
+
+
+class PerIterationDataset(object):
+  """A dataset that returns MultiIterators."""
+
+  def __init__(self, dataset, iterations, batches_per_iteration):
+    self._dataset = dataset
+    self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
+
+  def make_one_shot_iterator(self):
+    iterator = self._dataset.make_one_shot_iterator()
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
+
+  def make_initializable_iterator(self):
+    iterator = self._dataset.make_initializable_iterator()
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
+
+
+class MapOutput(object):
+  """Map can result in multiple outputs per device."""
+
+  def __init__(self, l):
+    self._l = l
+
+  def get(self):
+    return self._l
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e96ce547415fcb2bf3da8b6085ee11f51717db8d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -0,0 +1,834 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the distributed values library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_util
+from tensorflow.python.training import saver as saver_lib
+
+
+@test_util.with_c_api
+class DistributedValuesTest(test.TestCase):
+
+  def testGetEager(self):
+    with ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      with self.assertRaises(ValueError):
+        self.assertIsNone(v.get("/device:GPU:2"))
+
+  def testGetGraph(self):
+    with context.graph_mode(), \
+        ops.Graph().as_default(), \
+        ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      with self.assertRaises(ValueError):
+        self.assertIsNone(v.get("/device:GPU:2"))
+
+  def testCanonicalization(self):
+    canonical_cpu = ["/job:localhost/replica:0/task:0/device:CPU:0"]
+    v = values.DistributedValues({"": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/device:CPU:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/cpu:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/CPU:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    with self.assertRaises(AssertionError):
+      v = values.DistributedValues({"/device:cpu:0": 42})
+
+
+@test_util.with_c_api
+class DistributedDelegateTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGetAttr(self):
+    with ops.device("/device:CPU:0"):
+
+      class Foo(object):
+
+        def __init__(self, x):
+          self.x = x
+
+      v = values.DistributedDelegate(
+          {"/device:CPU:0": Foo(7), "/device:GPU:0": Foo(8)})
+      self.assertEqual(7, v.x)
+      with self.assertRaises(AttributeError):
+        _ = v.y
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOperatorOverride(self):
+    with ops.device("/device:CPU:0"):
+      v = values.DistributedDelegate({"/device:CPU:0": 7, "/device:GPU:0": 8})
+      # v should act like int(7).
+      self.assertEqual(8, v + 1)
+      self.assertEqual(10, 3 + v)
+      self.assertEqual(14, v + v)
+      self.assertEqual(5, v - 2)
+      self.assertEqual(6, 13 - v)
+      self.assertEqual(0, v - v)
+      self.assertEqual(14, v * 2)
+      self.assertEqual(21, 3 * v)
+      self.assertEqual(49, v * v)
+      self.assertEqual(3.5, v / 2)
+      self.assertEqual(1.5, 10.5 / v)
+      self.assertEqual(3, v // 2)
+      self.assertEqual(2, 15 // v)
+      self.assertEqual(1, v % 2)
+      self.assertEqual(2, 16 % v)
+      self.assertTrue(v < 12)
+      self.assertTrue(v <= 12)
+      self.assertFalse(v > 12)
+      self.assertFalse(v >= 12)
+      self.assertFalse(12 < v)
+      self.assertFalse(12 <= v)
+      self.assertTrue(12 > v)
+      self.assertTrue(12 >= v)
+      self.assertEqual(3, v & 3)
+      self.assertEqual(3, 11 & v)
+      self.assertEqual(15, v | 8)
+      self.assertEqual(23, 16 | v)
+      self.assertEqual(4, v ^ 3)
+      self.assertEqual(12, 11 ^ v)
+      self.assertEqual(343, pow(v, 3))
+      self.assertEqual(3, pow(v, 3, 10))
+      self.assertEqual(128, pow(2, v))
+      self.assertEqual(-7, -v)
+      self.assertEqual(~7, ~v)
+      self.assertEqual(7, abs(v))
+      with self.assertRaises(TypeError):
+        _ = v[2]
+
+
+def _device_str(d):
+  return "/device:GPU:" + str(d)
+
+
+def _nested_value(d):
+  return ("a" + d, ["b" + d, {"c": "d" + d, "e": "f" + d}, "g" + d], "h" + d)
+
+
+def _make_mirrored():
+  v = []
+  index = {}
+  devices = ["/device:GPU:0", "/device:CPU:0"]
+  for d, n, init in zip(devices, ["v", "v/replica"], [1., 2.]):
+    with ops.device(d):
+      v.append(variable_scope.get_variable(
+          name=n, initializer=init, use_resource=True))
+      index[d] = v[-1]
+  mirrored = values.MirroredVariable(index, v[0])
+  return v, devices, mirrored
+
+
+@test_util.with_c_api
+class RegroupAndSelectDeviceTest(test.TestCase):
+
+  def _is_per_device(self, result, expected, klass=values.PerDevice):
+    self.assertIsInstance(result, klass)
+    # We canonicalize the devices to match the device strings returned
+    # by PerDevice, which also does device string canonicalization.
+    devices = [device_util.canonicalize(_device_str(i))
+               for i in range(len(expected))]
+    self.assertEqual(set(devices), set(result.devices))
+    for i, d in enumerate(devices):
+      self.assertEqual(expected[i], result.get(d))
+      self.assertEqual(expected[i], result.get(_device_str(i)))
+
+  def testNested(self):
+    result = values.regroup({_device_str(0): _nested_value("1"),
+                             _device_str(1): _nested_value("2")})
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(3, len(result))
+    self._is_per_device(result[0], ["a1", "a2"])
+    self._is_per_device(result[2], ["h1", "h2"])
+
+    self.assertIsInstance(result[1], list)
+    self.assertEqual(3, len(result[1]))
+    self._is_per_device(result[1][0], ["b1", "b2"])
+    self._is_per_device(result[1][2], ["g1", "g2"])
+
+    self.assertIsInstance(result[1][1], dict)
+    self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
+    self._is_per_device(result[1][1]["c"], ["d1", "d2"])
+    self._is_per_device(result[1][1]["e"], ["f1", "f2"])
+
+    # Also test that we can undo the merge using select_device()
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device(_device_str(1), result))
+    # select_device_mirrored() should fail due to non-mirrored values
+    with self.assertRaises(TypeError):
+      values.select_device_mirrored(_device_str(0), result)
+    with self.assertRaises(TypeError):
+      values.select_device_mirrored(_device_str(1), result)
+
+  def testWrapClass(self):
+    # Normally a mirrored value would be the same across devices, but
+    # for a test it is convenient to be able to tell the values apart.
+    result = values.regroup({_device_str(0): _nested_value("1"),
+                             _device_str(1): _nested_value("2")},
+                            values.Mirrored)
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(3, len(result))
+    self._is_per_device(result[0], ["a1", "a2"], values.Mirrored)
+    self._is_per_device(result[2], ["h1", "h2"], values.Mirrored)
+
+    self.assertIsInstance(result[1], list)
+    self.assertEqual(3, len(result[1]))
+    self._is_per_device(result[1][0], ["b1", "b2"], values.Mirrored)
+    self._is_per_device(result[1][2], ["g1", "g2"], values.Mirrored)
+
+    self.assertIsInstance(result[1][1], dict)
+    self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
+    self._is_per_device(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
+    self._is_per_device(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
+
+    # Also test that we can undo the merge using select_device()
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device(_device_str(1), result))
+    # Values are marked as mirrored, so select_device_mirrored() is allowed.
+    self.assertEqual(_nested_value("1"),
+                     values.select_device_mirrored(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device_mirrored(_device_str(1), result))
+
+  def testMirroredContainer(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+    v, devices, mirrored = _make_mirrored()
+    result = values.regroup(dict(zip(devices, v)))
+    self.assertIs(mirrored, result)
+
+  def testSameId(self):
+    foo = object()
+    result = values.regroup({_device_str(0): ("a", foo),
+                             _device_str(1): ("b", foo)})
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(2, len(result))
+    self._is_per_device(result[0], ["a", "b"])
+    self.assertIs(foo, result[1])
+
+    # Test select_device(), should undo the merge done by regroup().
+    result_0 = values.select_device(_device_str(0), result)
+    self.assertIsInstance(result_0, tuple)
+    self.assertEqual(2, len(result_0))
+    self.assertEqual("a", result_0[0])
+    self.assertIs(foo, result_0[1])
+    result_1 = values.select_device(_device_str(1), result)
+    self.assertIsInstance(result_1, tuple)
+    self.assertEqual(2, len(result_1))
+    self.assertEqual("b", result_1[0])
+    self.assertIs(foo, result_1[1])
+
+  def testOneDevice(self):
+    result = values.regroup({_device_str(0): _nested_value("1")})
+    # On one device regroup() and select_device() are basically identity.
+    self.assertEqual(_nested_value("1"), result)
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+
+    # The one exception has to do with MirroredVariables.
+    d = "/device:CPU:0"
+    with ops.device(d):
+      v = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+      index = {d: v}
+    mirrored = values.MirroredVariable(index, v)
+    result = values.regroup(index)
+    self.assertIs(mirrored, result)
+
+  def testNamedTupleEstimatorSpec(self):
+    with context.graph_mode(), ops.Graph().as_default():
+      created_estimator_specs = {}
+      to_regroup = {}
+
+      for device_id in range(3):
+        spec = model_fn_lib.EstimatorSpec(
+            mode=model_fn_lib.ModeKeys.TRAIN,
+            loss=constant_op.constant(device_id / 2),
+            train_op=array_ops.identity(constant_op.constant(device_id)))
+        created_estimator_specs[device_id] = spec
+        to_regroup[_device_str(device_id)] = spec
+
+      merged_estimator_spec = values.regroup(to_regroup)
+
+      self.assertTrue(
+          isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
+      self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
+      for device_id in range(3):
+        d = _device_str(device_id)
+        self.assertEquals(created_estimator_specs[device_id].loss,
+                          merged_estimator_spec.loss.get(d))
+        self.assertEquals(created_estimator_specs[device_id].train_op,
+                          merged_estimator_spec.train_op.get(d))
+        # Scaffold is populated by `EstimatorSpec.__new__`.
+        self.assertEquals(created_estimator_specs[device_id].scaffold,
+                          merged_estimator_spec.scaffold.get(d))
+        # Also test that we can undo the merge using select_device()
+        self.assertEquals(created_estimator_specs[device_id],
+                          values.select_device(_device_str(device_id),
+                                               merged_estimator_spec))
+
+
+@test_util.with_c_api
+class PerDeviceDatasetTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _test_iterator_no_prefetch(self, devices, dataset, expected_values):
+    per_device_dataset = values.PerDeviceDataset(
+        dataset, devices, prefetch_on_device=False)
+    iterator = per_device_dataset.make_one_shot_iterator()
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      actual = self.evaluate([
+          values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, actual)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      self.evaluate([
+          values.select_device(d, next_element) for d in devices])
+
+  def _test_iterator_with_prefetch(self, devices, dataset, expected_values):
+    if not context.executing_eagerly():
+      per_device_dataset = values.PerDeviceDataset(
+          dataset, devices, prefetch_on_device=True)
+      iterator = per_device_dataset.make_one_shot_iterator()
+
+      # With prefetching, we cannot guarantee which input ends up on which
+      # device, so we verify that the complete set seen on all devices is
+      # correct, and equal numbers are distributed to each device.
+      combined_actual = []
+      combined_expected = []
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        combined_actual.extend(self.evaluate([
+            values.select_device(d, next_element) for d in devices]))
+        combined_expected.extend(expected_value)
+
+      self.assertEqual(set(combined_expected), set(combined_actual))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        self.evaluate([
+            values.select_device(d, next_element) for d in devices])
+
+  def _test_iterator(self, devices, dataset, expected_values):
+    self._test_iterator_no_prefetch(devices, dataset, expected_values)
+    self._test_iterator_with_prefetch(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOneDevice(self):
+    devices = ["/device:CPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleDevices(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTupleDataset(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset1 = dataset_ops.Dataset.range(10)
+    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnevenDatasetBatches(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(devices, dataset, expected_values)
+
+  def testInitializableIterator(self):
+    with context.graph_mode():
+      devices = ["/device:CPU:0"]
+      # Using random input since that is only allowed with initializable
+      # iterator.
+      dataset = dataset_ops.Dataset.from_tensor_slices(
+          random_ops.random_uniform((10,)))
+
+      per_device_dataset = values.PerDeviceDataset(
+          dataset, devices, prefetch_on_device=False)
+      iterator = per_device_dataset.make_initializable_iterator()
+
+      self.evaluate(iterator.initializer)
+      next_element = iterator.get_next()
+      for _ in range(10):
+        self.evaluate(next_element)
+
+      # Should fail after the input is finished.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
+
+      # After re-initializing the iterator, should be able to iterate again.
+      self.evaluate(iterator.initializer)
+      for _ in range(10):
+        self.evaluate(next_element)
+
+
+@test_util.with_c_api
+class MirroredVariableTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testProperties(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    v, _, mirrored = _make_mirrored()
+
+    self.assertEquals(v[0].name, mirrored.name)
+    self.assertEquals(v[0].dtype, mirrored.dtype)
+    self.assertEquals(v[0].shape, mirrored.shape)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableOnAnotherDevice(self):
+    v = variable_scope.get_variable(
+        name="v", initializer=[1.], use_resource=True)
+    index = {"/job:foo/device:CPU:0": v}
+    mirrored = values.MirroredVariable(index, v)
+
+    self.assertEquals(v.name, mirrored.name)
+    self.assertEquals(v.dtype, mirrored.dtype)
+    self.assertEquals(v.shape, mirrored.shape)
+
+  def _assign_mirrored(self, devices, v, new):
+    for d, var, n in zip(devices, v, new):
+      with ops.device(d):
+        self.evaluate(var.assign(n))
+
+  def _save_return_saver(self, sess, var):
+    saver = saver_lib.Saver(var_list=[var])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    return saver.save(sess, prefix), saver
+
+  def _save(self, sess, var):
+    save_path, _ = self._save_return_saver(sess, var)
+    return save_path
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreMirroredOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [3., 4.])
+
+      # Saves the current value of v[0], 3.
+      save_path, saver = self._save_return_saver(sess, mirrored)
+
+      # Change the values between save and restore.
+      self._assign_mirrored(devices, v, [5., 6.])
+
+      # Restores the saved value of 3. to both variables.
+      saver.restore(sess, save_path)
+      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
+
+  def _save_mirrored(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [3., 4.])
+
+      # Saves the current value of v[0], 3.
+      save_path = self._save(sess, mirrored)
+
+      # Change the values between save and restore.
+      self._assign_mirrored(devices, v, [5., 6.])
+    return save_path
+
+  def _save_normal(self):
+    """Save variables without mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(3.))
+
+      # Saves the current value of var, 3.
+      save_path = self._save(sess, var)
+
+      # Change the values between save and restore.
+      self.evaluate(var.assign(5.))
+    return save_path
+
+  def _restore_normal(self, save_path):
+    """Restore to variables without mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=7., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(8.))
+
+      # Restores the saved value of 3. to `var`.
+      saver = saver_lib.Saver(var_list=[var])
+      saver.restore(sess, save_path)
+      self.assertEqual(3., self.evaluate(var))
+
+  def _restore_mirrored(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [7., 8.])
+
+      # Restores the saved value of 3. to both variables.
+      saver = saver_lib.Saver(var_list=[mirrored])
+      saver.restore(sess, save_path)
+      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveMirroredRestoreMirrored(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_mirrored()
+    self._restore_mirrored(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveMirroredRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_mirrored()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreMirrored(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_mirrored(save_path)
+
+
+_devices = ["/device:GPU:0", "/device:CPU:0"]
+
+
+def _make_tower_local(method):
+  v = []
+  index = {}
+  for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
+    with ops.device(d):
+      v.append(variable_scope.get_variable(
+          name=n, initializer=init, use_resource=True))
+      index[d] = v[-1]
+  tower_local = values.TowerLocalVariable(index, v[0], method)
+  return v, tower_local
+
+
+@test_util.with_c_api
+class TowerLocalVariableTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testProperties(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    v, tower_local = _make_tower_local("sum")
+
+    self.assertEquals(v[0].name, tower_local.name)
+    self.assertEquals(v[0].dtype, tower_local.dtype)
+    self.assertEquals(v[0].shape, tower_local.shape)
+    self.assertEquals("sum", tower_local.reduce_method)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableOnAnotherDevice(self):
+    v = variable_scope.get_variable(
+        name="v", initializer=[1.], use_resource=True)
+    index = {"/job:foo/device:CPU:0": v}
+    tower_local = values.TowerLocalVariable(index, v, "mean")
+
+    self.assertEquals(v.name, tower_local.name)
+    self.assertEquals(v.dtype, tower_local.dtype)
+    self.assertEquals(v.shape, tower_local.shape)
+    self.assertEquals("mean", tower_local.reduce_method)
+
+  def _assign_tower_local(self, devices, v, new):
+    for d, var, n in zip(devices, v, new):
+      with ops.device(d):
+        self.evaluate(var.assign(n))
+
+  def _save_return_saver(self, sess, var):
+    saver = saver_lib.Saver(var_list=[var])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    return saver.save(sess, prefix), saver
+
+  def _save(self, sess, var):
+    save_path, _ = self._save_return_saver(sess, var)
+    return save_path
+
+  def _dist_scope(self):
+    return mirrored_strategy.MirroredStrategy(_devices).scope()
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreTowerLocalSumOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of v[0] + v[1], 7.
+        save_path, saver = self._save_return_saver(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+
+        # Restores the saved value of 7. which gets divided equally
+        # between the variables.
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreTowerLocalMeanOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of (v[0] + v[1])/2, 3.5.
+        save_path, saver = self._save_return_saver(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+
+        # Restores the saved value of 3.5 to both variables.
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  def _save_tower_local_mean(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of (v[0] + v[1])/2, 3.5
+        save_path = self._save(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+    return save_path
+
+  def _save_tower_local_sum(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [1.5, 2.])
+
+      with self._dist_scope():
+        # Saves the current value of v[0] + v[1], 3.5
+        save_path = self._save(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+    return save_path
+
+  def _save_normal(self):
+    """Save variables without mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(3.5))
+
+      # Saves the current value of var, 3.5.
+      save_path = self._save(sess, var)
+
+      # Change the values between save and restore.
+      self.evaluate(var.assign(5.))
+    return save_path
+
+  def _restore_normal(self, save_path):
+    """Restore to variables without mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=7., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(8.))
+
+      # Restores the saved value of 3.5 to `var`.
+      saver = saver_lib.Saver(var_list=[var])
+      saver.restore(sess, save_path)
+      self.assertEqual(3.5, self.evaluate(var))
+
+  def _restore_tower_local_mean(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [7., 8.])
+
+      with self._dist_scope():
+        # Restores the saved value of 3.5 to both variables.
+        saver = saver_lib.Saver(var_list=[tower_local])
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  def _restore_tower_local_sum(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [7., 8.])
+
+      with self._dist_scope():
+        # Restores the saved value of 3.5 to both variables.
+        saver = saver_lib.Saver(var_list=[tower_local])
+        saver.restore(sess, save_path)
+        self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalRestoreTowerLocalMean(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_mean()
+    self._restore_tower_local_mean(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalRestoreTowerLocalSum(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_sum()
+    self._restore_tower_local_sum(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalMeanRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_mean()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalSumRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_sum()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreTowerLocalMean(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_tower_local_mean(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreTowerLocalSum(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_tower_local_sum(save_path)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 7f510c42215f48a9e795eb81bd9f66b0a2108335..fad613155d8861a2508fb7aca752b10ff85d35eb 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -251,6 +251,21 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "kumaraswamy_test",
+    srcs = ["python/kernel_tests/kumaraswamy_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "moving_stats_test",
     size = "small",
@@ -335,6 +350,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["nomsan"],
 )
 
 cuda_py_test(
@@ -403,7 +419,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "poisson_lognormal_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/poisson_lognormal_test.py"],
     additional_deps = [
         ":distributions_py",
@@ -438,6 +454,21 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+)
+
+cuda_py_test(
+    name = "batch_reshape_test",
+    size = "small",
+    srcs = ["python/kernel_tests/batch_reshape_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
 )
 
 cuda_py_test(
@@ -459,6 +490,30 @@ cuda_py_test(
     tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
+cuda_py_test(
+    name = "seed_stream_test",
+    size = "small",
+    srcs = ["python/kernel_tests/seed_stream_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+cuda_py_test(
+    name = "statistical_testing_test",
+    size = "medium",
+    srcs = [
+        "python/kernel_tests/statistical_testing_test.py",
+    ],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
 cuda_py_test(
     name = "vector_sinh_arcsinh_diag_test",
     size = "medium",
@@ -654,6 +709,7 @@ cuda_py_test(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/python:client_testlib",
     ],
+    tags = ["noasan"],  # times out, http://b/78588814
 )
 
 cuda_py_test(
@@ -710,18 +766,6 @@ cuda_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # === Bijector Tests ==========================================================
 
 cuda_py_test(
@@ -782,6 +826,25 @@ cuda_py_test(
     tags = ["noasan"],  # times out b/63678675
 )
 
+cuda_py_test(
+    name = "affine_scalar_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/affine_scalar_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "affine_linear_operator_test",
     size = "small",
@@ -801,6 +864,23 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "batch_normalization_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/batch_normalization_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = ["optonly"],
+)
+
 cuda_py_test(
     name = "chain_test",
     size = "small",
@@ -915,6 +995,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "kumaraswamy_bijector_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/kumaraswamy_bijector_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "masked_autoregressive_test",
     size = "small",
@@ -984,7 +1083,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "reshape_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/bijectors/reshape_test.py"],
     additional_deps = [
         ":bijectors_py",
@@ -1017,10 +1116,12 @@ cuda_py_test(
     ],
 )
 
+# Tests for SinhArcSinh bijector.  The file name has the extra "_bijector" to
+# avoid BUILD rule name conflicts with the distribution by the same name.
 cuda_py_test(
-    name = "sigmoid_centered_test",
+    name = "sinh_arcsinh_bijector_test",
     size = "small",
-    srcs = ["python/kernel_tests/bijectors/sigmoid_centered_test.py"],
+    srcs = ["python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py"],
     additional_deps = [
         ":bijectors_py",
         ":distributions_py",
@@ -1034,14 +1135,13 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
-# Tests for SinhArcSinh bijector.  The file name has the extra "_bijector" to
-# avoid BUILD rule name conflicts with the distribution by the same name.
 cuda_py_test(
-    name = "sinh_arcsinh_bijector_test",
+    name = "softmax_centered_test",
     size = "small",
-    srcs = ["python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py"],
+    srcs = ["python/kernel_tests/bijectors/softmax_centered_test.py"],
     additional_deps = [
         ":bijectors_py",
         ":distributions_py",
@@ -1058,9 +1158,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "softmax_centered_test",
+    name = "softplus_test",
     size = "small",
-    srcs = ["python/kernel_tests/bijectors/softmax_centered_test.py"],
+    srcs = ["python/kernel_tests/bijectors/softplus_test.py"],
     additional_deps = [
         ":bijectors_py",
         ":distributions_py",
@@ -1077,9 +1177,28 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "softplus_test",
+    name = "softsign_test",
     size = "small",
-    srcs = ["python/kernel_tests/bijectors/softplus_test.py"],
+    srcs = ["python/kernel_tests/bijectors/softsign_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "square_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/square_test.py"],
     additional_deps = [
         ":bijectors_py",
         ":distributions_py",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 61c411271d0bb8d7b4cc3b14992b82ec1e5674ed..ddf59891e626a85e6c917ac74b3cfaabf16eb15d 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops.autoregressive import *
+from tensorflow.contrib.distributions.python.ops.batch_reshape import *
 from tensorflow.contrib.distributions.python.ops.binomial import *
 from tensorflow.contrib.distributions.python.ops.cauchy import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
@@ -58,6 +59,7 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
+from tensorflow.contrib.distributions.python.ops.seed_stream import *
 from tensorflow.contrib.distributions.python.ops.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.test_util import *
 from tensorflow.contrib.distributions.python.ops.vector_diffeomixture import *
@@ -96,9 +98,10 @@ _allowed_symbols = [
     'ReparameterizationType',
     'Distribution',
     'Autoregressive',
-    'Binomial',
+    'BatchReshape',
     'Bernoulli',
     'Beta',
+    'Binomial',
     'BetaWithSoftplusConcentration',
     'Categorical',
     'Chi2',
@@ -124,6 +127,7 @@ _allowed_symbols = [
     'NormalWithSoftplusScale',
     'Poisson',
     'PoissonLogNormalQuadratureCompound',
+    'SeedStream',
     'SinhArcsinh',
     'StudentT',
     'StudentTWithAbsDfSoftplusScale',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59d549b7b80a3d80d0b8409542eb6583f645bdaa
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -0,0 +1,568 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for BatchReshape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import batch_reshape as batch_reshape_lib
+from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_lib
+from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
+from tensorflow.contrib.distributions.python.ops import wishart as wishart_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class _BatchReshapeTest(object):
+
+  def make_wishart(self, dims, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = self.dtype([
+        [[1., 0.5],
+         [0.5, 1.]],
+        [[0.5, 0.25],
+         [0.25, 0.75]],
+    ])
+    scale = np.reshape(np.concatenate([scale, scale], axis=0),
+                       old_batch_shape + [dims, dims])
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    wishart = wishart_lib.WishartFull(df=5, scale=scale_ph)
+    reshape_wishart = batch_reshape_lib.BatchReshape(
+        distribution=wishart,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+
+    return wishart, reshape_wishart
+
+  def test_matrix_variate_sample_and_log_prob(self):
+    dims = 2
+    new_batch_shape = [4]
+    old_batch_shape = [2, 2]
+    wishart, reshape_wishart = self.make_wishart(
+        dims, new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_wishart.batch_shape_tensor()
+    event_shape = reshape_wishart.event_shape_tensor()
+
+    expected_sample_shape = [3, 1] + new_batch_shape + [dims, dims]
+    x = wishart.sample([3, 1], seed=42)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_wishart.sample([3, 1], seed=42)
+
+    expected_log_prob_shape = [3, 1] + new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        wishart.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_wishart.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([dims, dims], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_wishart.batch_shape)
+    self.assertAllEqual([dims, dims], reshape_wishart.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_matrix_variate_stats(self):
+    dims = 2
+    new_batch_shape = [4]
+    old_batch_shape = [2, 2]
+    wishart, reshape_wishart = self.make_wishart(
+        dims, new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+    expected_matrix_stat_shape = new_batch_shape + [dims, dims]
+
+    expected_entropy = array_ops.reshape(
+        wishart.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_wishart.entropy()
+
+    expected_mean = array_ops.reshape(
+        wishart.mean(), expected_matrix_stat_shape)
+    actual_mean = reshape_wishart.mean()
+
+    expected_mode = array_ops.reshape(
+        wishart.mode(), expected_matrix_stat_shape)
+    actual_mode = reshape_wishart.mode()
+
+    expected_stddev = array_ops.reshape(
+        wishart.stddev(), expected_matrix_stat_shape)
+    actual_stddev = reshape_wishart.stddev()
+
+    expected_variance = array_ops.reshape(
+        wishart.variance(), expected_matrix_stat_shape)
+    actual_variance = reshape_wishart.variance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+      ])
+
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_variance.shape)
+
+  def make_normal(self, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = self.dtype(0.5 + np.arange(
+        np.prod(old_batch_shape)).reshape(old_batch_shape))
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    normal = normal_lib.Normal(loc=self.dtype(0), scale=scale_ph)
+    reshape_normal = batch_reshape_lib.BatchReshape(
+        distribution=normal,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+    return normal, reshape_normal
+
+  def test_scalar_variate_sample_and_log_prob(self):
+    new_batch_shape = [2, 2]
+    old_batch_shape = [4]
+
+    normal, reshape_normal = self.make_normal(
+        new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_normal.batch_shape_tensor()
+    event_shape = reshape_normal.event_shape_tensor()
+
+    expected_sample_shape = new_batch_shape
+    x = normal.sample(seed=52)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_normal.sample(seed=52)
+
+    expected_log_prob_shape = new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        normal.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_normal.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_normal.batch_shape)
+    self.assertAllEqual([], reshape_normal.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_scalar_variate_stats(self):
+    new_batch_shape = [2, 2]
+    old_batch_shape = [4]
+
+    normal, reshape_normal = self.make_normal(new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+
+    expected_entropy = array_ops.reshape(
+        normal.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_normal.entropy()
+
+    expected_mean = array_ops.reshape(
+        normal.mean(), expected_scalar_stat_shape)
+    actual_mean = reshape_normal.mean()
+
+    expected_mode = array_ops.reshape(
+        normal.mode(), expected_scalar_stat_shape)
+    actual_mode = reshape_normal.mode()
+
+    expected_stddev = array_ops.reshape(
+        normal.stddev(), expected_scalar_stat_shape)
+    actual_stddev = reshape_normal.stddev()
+
+    expected_variance = array_ops.reshape(
+        normal.variance(), expected_scalar_stat_shape)
+    actual_variance = reshape_normal.variance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+      ])
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_variance.shape)
+
+  def make_mvn(self, dims, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+    reshape_mvn = batch_reshape_lib.BatchReshape(
+        distribution=mvn,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+    return mvn, reshape_mvn
+
+  def test_vector_variate_sample_and_log_prob(self):
+    dims = 3
+    new_batch_shape = [2, 1]
+    old_batch_shape = [2]
+    mvn, reshape_mvn = self.make_mvn(
+        dims, new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_mvn.batch_shape_tensor()
+    event_shape = reshape_mvn.event_shape_tensor()
+
+    expected_sample_shape = [3] + new_batch_shape + [dims]
+    x = mvn.sample(3, seed=62)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_mvn.sample(3, seed=62)
+
+    expected_log_prob_shape = [3] + new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        mvn.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_mvn.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([dims], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_mvn.batch_shape)
+    self.assertAllEqual([dims], reshape_mvn.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_vector_variate_stats(self):
+    dims = 3
+    new_batch_shape = [2, 1]
+    old_batch_shape = [2]
+    mvn, reshape_mvn = self.make_mvn(
+        dims, new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+
+    expected_entropy = array_ops.reshape(
+        mvn.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_mvn.entropy()
+
+    expected_vector_stat_shape = new_batch_shape + [dims]
+
+    expected_mean = array_ops.reshape(
+        mvn.mean(), expected_vector_stat_shape)
+    actual_mean = reshape_mvn.mean()
+
+    expected_mode = array_ops.reshape(
+        mvn.mode(), expected_vector_stat_shape)
+    actual_mode = reshape_mvn.mode()
+
+    expected_stddev = array_ops.reshape(
+        mvn.stddev(), expected_vector_stat_shape)
+    actual_stddev = reshape_mvn.stddev()
+
+    expected_variance = array_ops.reshape(
+        mvn.variance(), expected_vector_stat_shape)
+    actual_variance = reshape_mvn.variance()
+
+    expected_matrix_stat_shape = new_batch_shape + [dims, dims]
+
+    expected_covariance = array_ops.reshape(
+        mvn.covariance(), expected_matrix_stat_shape)
+    actual_covariance = reshape_mvn.covariance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+          expected_covariance_, actual_covariance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+          expected_covariance, actual_covariance,
+      ])
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_covariance_, actual_covariance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_variance.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_covariance.shape)
+
+  def test_bad_reshape_size(self):
+    dims = 2
+    new_batch_shape = [2, 3]
+    old_batch_shape = [2]   # 2 != 2*3
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(
+          ValueError, (r"`batch_shape` size \(6\) must match "
+                       r"`distribution\.batch_shape` size \(2\)")):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r"`batch_shape` size must match "
+                                      r"`distributions.batch_shape` size"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_non_positive_shape(self):
+    dims = 2
+    new_batch_shape = [-1, -2]   # -1*-2=2 so will pass size check.
+    old_batch_shape = [2]
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(ValueError, r".*must be positive.*"):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r".*must be positive.*"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_non_vector_shape(self):
+    dims = 2
+    new_batch_shape = 2
+    old_batch_shape = [2]
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(ValueError, r".*must be a vector.*"):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r".*must be a vector.*"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_broadcasting_explicitly_unsupported(self):
+    old_batch_shape = [4]
+    new_batch_shape = [1, 4, 1]
+    rate_ = self.dtype([1, 10, 2, 20])
+
+    rate = array_ops.placeholder_with_default(
+        rate_,
+        shape=old_batch_shape if self.is_static_shape else None)
+    poisson_4 = poisson_lib.Poisson(rate)
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+    poisson_141_reshaped = batch_reshape_lib.BatchReshape(
+        poisson_4, new_batch_shape_ph, validate_args=True)
+
+    x_4 = self.dtype([2, 12, 3, 23])
+    x_114 = self.dtype([2, 12, 3, 23]).reshape(1, 1, 4)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "too few batch and event dims"):
+        poisson_141_reshaped.log_prob(x_4)
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "unexpected batch and event shape"):
+        poisson_141_reshaped.log_prob(x_114)
+      return
+
+    with self.assertRaisesOpError("too few batch and event dims"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_4).eval()
+
+    with self.assertRaisesOpError("unexpected batch and event shape"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_114).eval()
+
+
+class BatchReshapeStaticTest(_BatchReshapeTest, test.TestCase):
+
+  dtype = np.float32
+  is_static_shape = True
+
+
+class BatchReshapeDynamicTest(_BatchReshapeTest, test.TestCase):
+
+  dtype = np.float64
+  is_static_shape = False
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
index e0d65c79b2654c2949de161d6317f218d11cab43..042c8ebd51c47facfc5c942cae56bd56be9df7c5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
@@ -18,11 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 # pylint: disable=g-importing-member
 from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value import AbsoluteValue
-from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -35,50 +32,38 @@ class AbsoluteValueTest(test.TestCase):
 
   def testBijectorVersusNumpyRewriteOfBasicFunctionsEventNdims0(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       self.assertEqual("absolute_value", bijector.name)
       x = array_ops.constant([[0., 1., -1], [0., -5., 3.]])  # Shape [2, 3]
       y = math_ops.abs(x)
 
       y_ = y.eval()
-      zeros = np.zeros((2, 3))
 
       self.assertAllClose(y_, bijector.forward(x).eval())
       self.assertAllClose((-y_, y_), sess.run(bijector.inverse(y)))
-      self.assertAllClose((zeros, zeros),
-                          sess.run(bijector.inverse_log_det_jacobian(y)))
+      self.assertAllClose((0., 0.),
+                          sess.run(bijector.inverse_log_det_jacobian(
+                              y, event_ndims=0)))
 
       # Run things twice to make sure there are no issues in caching the tuples
       # returned by .inverse*
       self.assertAllClose(y_, bijector.forward(x).eval())
       self.assertAllClose((-y_, y_), sess.run(bijector.inverse(y)))
-      self.assertAllClose((zeros, zeros),
-                          sess.run(bijector.inverse_log_det_jacobian(y)))
-
-  def testEventNdimsMustBeZeroOrRaiseStatic(self):
-    with self.test_session():
-      with self.assertRaisesRegexp(ValueError, "event_ndims.*was not 0"):
-        AbsoluteValue(event_ndims=1)
-
-  def testEventNdimsMustBeZeroOrRaiseDynamic(self):
-    with self.test_session() as sess:
-      event_ndims = array_ops.placeholder(dtypes.int32)
-      abs_bijector = AbsoluteValue(event_ndims=event_ndims, validate_args=True)
-      with self.assertRaisesOpError("event_ndims was not 0"):
-        sess.run(abs_bijector.inverse_log_det_jacobian([1.]),
-                 feed_dict={event_ndims: 1})
+      self.assertAllClose((0., 0.),
+                          sess.run(bijector.inverse_log_det_jacobian(
+                              y, event_ndims=0)))
 
   def testNegativeYRaisesForInverseIfValidateArgs(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
         sess.run(bijector.inverse(-1.))
 
   def testNegativeYRaisesForILDJIfValidateArgs(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
-        sess.run(bijector.inverse_log_det_jacobian(-1.))
+        sess.run(bijector.inverse_log_det_jacobian(-1., event_ndims=0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
index 405ddd292cacd8ace87d6caeebf3e8cfc347c22d..1e4ad724d00f751a55370ef9aa6dde0003a2098c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
@@ -38,9 +38,11 @@ class AffineLinearOperatorTest(test.TestCase):
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(
+          y, event_ndims=2).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
   def testDiag(self):
     with self.test_session():
@@ -58,14 +60,16 @@ class AffineLinearOperatorTest(test.TestCase):
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          ildj, affine.inverse_log_det_jacobian(y, event_ndims=1).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testTriL(self):
     with self.test_session():
       shift = np.array([-1, 0, 1], dtype=np.float32)
-      tril = np.array([[[1, 0, 0],
+      tril = np.array([[[3, 0, 0],
                         [2, -1, 0],
                         [3, 2, 1]],
                        [[2, 0, 0],
@@ -85,15 +89,17 @@ class AffineLinearOperatorTest(test.TestCase):
       # y = np.matmul(x, tril) + shift.
       y = np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
       ildj = -np.sum(np.log(np.abs(np.diagonal(
-          tril, axis1=-2, axis2=-1))),
-                     axis=-1)
+          tril, axis1=-2, axis2=-1))))
 
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          ildj, affine.inverse_log_det_jacobian(
+              y, event_ndims=2).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2533620bebeb0400b6d4a6346e8315c7e37c5c6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
@@ -0,0 +1,160 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Affine Scalar Tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.affine_scalar import AffineScalar
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+
+class AffineScalarBijectorTest(test.TestCase):
+  """Tests correctness of the Y = scale @ x + shift transformation."""
+
+  def testProperties(self):
+    with self.test_session():
+      mu = -1.
+      # scale corresponds to 1.
+      bijector = AffineScalar(shift=mu)
+      self.assertEqual("affine_scalar", bijector.name)
+
+  def testNoBatchScalar(self):
+    with self.test_session() as sess:
+
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
+
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        x = array_ops.placeholder(dtypes.float32, name="x")
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
+
+      for run in (static_run, dynamic_run):
+        mu = -1.
+        # Corresponds to scale = 2
+        bijector = AffineScalar(shift=mu, scale=2.)
+        x = [1., 2, 3]  # Three scalar samples (no batches).
+        self.assertAllClose([1., 3, 5], run(bijector.forward, x))
+        self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
+
+  def testOneBatchScalarViaIdentityIn64BitUserProvidesShiftOnly(self):
+    with self.test_session() as sess:
+
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
+
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float64)
+        x = array_ops.placeholder(dtypes.float64, name="x")
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
+
+      for run in (static_run, dynamic_run):
+        mu = np.float64([1.])
+        # One batch, scalar.
+        # Corresponds to scale = 1.
+        bijector = AffineScalar(shift=mu)
+        x = np.float64([1.])  # One sample from one batches.
+        self.assertAllClose([2.], run(bijector.forward, x))
+        self.assertAllClose([0.], run(bijector.inverse, x))
+        self.assertAllClose(
+            0.,
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
+
+  def testOneBatchScalarViaIdentityIn64BitUserProvidesScaleOnly(self):
+    with self.test_session() as sess:
+
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
+
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float64)
+        x = array_ops.placeholder(dtypes.float64, name="x")
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
+
+      for run in (static_run, dynamic_run):
+        multiplier = np.float64([2.])
+        # One batch, scalar.
+        # Corresponds to scale = 2, shift = 0.
+        bijector = AffineScalar(scale=multiplier)
+        x = np.float64([1.])  # One sample from one batches.
+        self.assertAllClose([2.], run(bijector.forward, x))
+        self.assertAllClose([0.5], run(bijector.inverse, x))
+        self.assertAllClose(
+            [np.log(0.5)],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
+
+  def testTwoBatchScalarIdentityViaIdentity(self):
+    with self.test_session() as sess:
+
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
+
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float32)
+        x = array_ops.placeholder(dtypes.float32, name="x")
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
+
+      for run in (static_run, dynamic_run):
+        mu = [1., -1]
+        # Univariate, two batches.
+        # Corresponds to scale = 1.
+        bijector = AffineScalar(shift=mu)
+        x = [1., 1]  # One sample from each of two batches.
+        self.assertAllClose([2., 0], run(bijector.forward, x))
+        self.assertAllClose([0., 2], run(bijector.inverse, x))
+        self.assertAllClose(
+            0.,
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
+
+  def testTwoBatchScalarIdentityViaScale(self):
+    with self.test_session() as sess:
+
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
+
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float32)
+        x = array_ops.placeholder(dtypes.float32, name="x")
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
+
+      for run in (static_run, dynamic_run):
+        mu = [1., -1]
+        # Univariate, two batches.
+        # Corresponds to scale = 1.
+        bijector = AffineScalar(shift=mu, scale=[2., 1])
+        x = [1., 1]  # One sample from each of two batches.
+        self.assertAllClose([3., 0], run(bijector.forward, x))
+        self.assertAllClose([0., 2], run(bijector.inverse, x))
+        self.assertAllClose(
+            [-np.log(2), 0.],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
+
+  def testScalarCongruency(self):
+    with self.test_session():
+      bijector = AffineScalar(shift=3.6, scale=0.42)
+      assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index c9158117f7a982e37047e8dd2b534a30040a87d9..9e14b9a53e6c63876478d876030c476c5d77dbbb 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -25,7 +25,6 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -36,209 +35,26 @@ class AffineBijectorTest(test.TestCase):
     with self.test_session():
       mu = -1.
       # scale corresponds to 1.
-      bijector = Affine(shift=mu, event_ndims=0)
+      bijector = Affine(shift=mu)
       self.assertEqual("affine", bijector.name)
 
-  def testNoBatchScalarViaIdentity(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = -1.
-        # Corresponds to scale = 2
-        bijector = Affine(
-            shift=mu, scale_identity_multiplier=2., event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [1., 2, 3]  # Three scalar samples (no batches).
-        self.assertAllClose([1., 3, 5], run(bijector.forward, x))
-        self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
-
-  def testNoBatchScalarViaDiag(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = -1.
-        # Corresponds to scale = 2
-        bijector = Affine(shift=mu, scale_identity_multiplier=2., event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [1., 2, 3]  # Three scalar samples (no batches).
-        self.assertAllClose([1., 3, 5], run(bijector.forward, x))
-        self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
-
-  def testWeirdSampleNoBatchScalarViaDiagMultiplier(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = -1.
-        # Corresponds to scale = 2.
-        bijector = Affine(
-            shift=mu, scale_identity_multiplier=2., event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [[1., 2, 3], [4, 5, 6]]  # Weird sample shape.
-        self.assertAllClose([[1., 3, 5],
-                             [7, 9, 11]],
-                            run(bijector.forward, x))
-        self.assertAllClose([[1., 1.5, 2.],
-                             [2.5, 3, 3.5]],
-                            run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
-
-  def testOneBatchScalarViaIdentityIn64BitUserProvidesShiftOnly(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value).astype(np.float64)
-        x = array_ops.placeholder(dtypes.float64, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = np.float64([1.])
-        # One batch, scalar.
-        # Corresponds to scale = 1.
-        bijector = Affine(shift=mu, event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = np.float64([1.])  # One sample from one batches.
-        self.assertAllClose([2.], run(bijector.forward, x))
-        self.assertAllClose([0.], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
-
-  def testOneBatchScalarViaIdentityIn64BitUserProvidesMultiplierOnly(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value).astype(np.float64)
-        x = array_ops.placeholder(dtypes.float64, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        multiplier = np.float64([2.])
-        # One batch, scalar.
-        # Corresponds to scale = 2, shift = 0.
-        bijector = Affine(scale_identity_multiplier=multiplier, event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = np.float64([1.])  # One sample from one batches.
-        self.assertAllClose([2.], run(bijector.forward, x))
-        self.assertAllClose([0.5], run(bijector.inverse, x))
-        self.assertAllClose([np.log(0.5)],
-                            run(bijector.inverse_log_det_jacobian, x))
-
-  def testOneBatchScalarViaDiagMultiplier(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = [1.]
-        # One batch, scalar.
-        # Corresponds to scale = 1.
-        bijector = Affine(shift=mu, scale_identity_multiplier=1., event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [1.]  # One sample from one batches.
-        self.assertAllClose([2.], run(bijector.forward, x))
-        self.assertAllClose([0.], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
-
-  def testTwoBatchScalarIdentityViaIdentity(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = [1., -1]
-        # Univariate, two batches.
-        # Corresponds to scale = 1.
-        bijector = Affine(shift=mu, event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [1., 1]  # One sample from each of two batches.
-        self.assertAllClose([2., 0], run(bijector.forward, x))
-        self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
-
-  def testTwoBatchScalarIdentityViaDiagMultiplier(self):
-    with self.test_session() as sess:
-
-      def static_run(fun, x):
-        return fun(x).eval()
-
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
-
-      for run in (static_run, dynamic_run):
-        mu = [1., -1]
-        # Univariate, two batches.
-        # Corresponds to scale = 1.
-        bijector = Affine(shift=mu, scale_identity_multiplier=1., event_ndims=0)
-        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [1., 1]  # One sample from each of two batches.
-        self.assertAllClose([2., 0], run(bijector.forward, x))
-        self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
-
   def testNoBatchMultivariateIdentity(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
         # Multivariate
         # Corresponds to scale = [[1., 0], [0, 1.]]
         bijector = Affine(shift=mu)
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 1]
         # matmul(sigma, x) + shift
         # = [-1, -1] + [1, -1]
@@ -251,33 +67,37 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 1], [-1., -1]]
         self.assertAllClose([[2., 0], [0., -2]], run(bijector.forward, x))
         self.assertAllClose([[0., 2], [-2., 0]], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0., run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateDiag(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
         # Multivariate
         # Corresponds to scale = [[2., 0], [0, 1.]]
         bijector = Affine(shift=mu, scale_diag=[2., 1])
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 1]
         # matmul(sigma, x) + shift
         # = [-1, -1] + [1, -1]
         self.assertAllClose([3., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
+        # Reset bijector.
+        bijector = Affine(shift=mu, scale_diag=[2., 1])
         # x is a 2-batch of 2-vectors.
         # The first vector is [1, 1], the second is [-1, -1].
         # Each undergoes matmul(sigma, x) + shift.
@@ -289,120 +109,116 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([[0., 2],
                              [-1., 0]],
                             run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateFullDynamic(self):
     with self.test_session() as sess:
       x = array_ops.placeholder(dtypes.float32, name="x")
       mu = array_ops.placeholder(dtypes.float32, name="mu")
       scale_diag = array_ops.placeholder(dtypes.float32, name="scale_diag")
-      event_ndims = array_ops.placeholder(dtypes.int32, name="event_ndims")
 
       x_value = np.array([[1., 1]], dtype=np.float32)
       mu_value = np.array([1., -1], dtype=np.float32)
       scale_diag_value = np.array([2., 2], dtype=np.float32)
-      event_ndims_value = np.array(1, dtype=np.int32)
       feed_dict = {
           x: x_value,
           mu: mu_value,
           scale_diag: scale_diag_value,
-          event_ndims: event_ndims_value
       }
 
-      bijector = Affine(
-          shift=mu, scale_diag=scale_diag, event_ndims=event_ndims)
-      self.assertEqual(1, sess.run(bijector.event_ndims, feed_dict))
+      bijector = Affine(shift=mu, scale_diag=scale_diag)
       self.assertAllClose([[3., 1]], sess.run(bijector.forward(x), feed_dict))
       self.assertAllClose([[0., 1]], sess.run(bijector.inverse(x), feed_dict))
       self.assertAllClose(
           -np.log(4),
-          sess.run(bijector.inverse_log_det_jacobian(x), feed_dict))
+          sess.run(bijector.inverse_log_det_jacobian(x, event_ndims=1),
+                   feed_dict))
 
   def testBatchMultivariateIdentity(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value, dtype=np.float32)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [[1., -1]]
         # Corresponds to 1 2x2 matrix, with twos on the diagonal.
         scale = 2.
         bijector = Affine(shift=mu, scale_identity_multiplier=scale)
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
         self.assertAllClose([[[0., 1]]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(4),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(4),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateDiag(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value, dtype=np.float32)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [[1., -1]]
         # Corresponds to 1 2x2 matrix, with twos on the diagonal.
         scale_diag = [[2., 2]]
         bijector = Affine(shift=mu, scale_diag=scale_diag)
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
         self.assertAllClose([[[0., 1]]], run(bijector.inverse, x))
-        self.assertAllClose([-np.log(4)],
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            [-np.log(4)],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateFullDynamic(self):
     with self.test_session() as sess:
       x = array_ops.placeholder(dtypes.float32, name="x")
       mu = array_ops.placeholder(dtypes.float32, name="mu")
       scale_diag = array_ops.placeholder(dtypes.float32, name="scale_diag")
-      event_ndims = array_ops.placeholder(dtypes.int32, name="event_ndims")
 
       x_value = np.array([[[1., 1]]], dtype=np.float32)
       mu_value = np.array([[1., -1]], dtype=np.float32)
       scale_diag_value = np.array([[2., 2]], dtype=np.float32)
-      event_ndims_value = 1
 
       feed_dict = {
           x: x_value,
           mu: mu_value,
           scale_diag: scale_diag_value,
-          event_ndims: event_ndims_value
       }
 
-      bijector = Affine(
-          shift=mu, scale_diag=scale_diag, event_ndims=event_ndims)
-      self.assertEqual(1, sess.run(bijector.event_ndims, feed_dict))
+      bijector = Affine(shift=mu, scale_diag=scale_diag)
       self.assertAllClose([[[3., 1]]], sess.run(bijector.forward(x), feed_dict))
       self.assertAllClose([[[0., 1]]], sess.run(bijector.inverse(x), feed_dict))
-      self.assertAllClose([-np.log(4)],
-                          sess.run(
-                              bijector.inverse_log_det_jacobian(x), feed_dict))
+      self.assertAllClose(
+          [-np.log(4)],
+          sess.run(bijector.inverse_log_det_jacobian(
+              x, event_ndims=1), feed_dict))
 
   def testIdentityWithDiagUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -410,25 +226,25 @@ class AffineBijectorTest(test.TestCase):
         bijector = Affine(
             shift=mu,
             scale_identity_multiplier=1.,
-            scale_diag=[1., 1., 1.],
-            event_ndims=1)
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
+            scale_diag=[1., 1., 1.])
         x = [1., 2, 3]  # Three scalar samples (no batches).
         self.assertAllClose([1., 3, 5], run(bijector.forward, x))
         self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.**3),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.**3),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -437,46 +253,48 @@ class AffineBijectorTest(test.TestCase):
             shift=mu,
             scale_identity_multiplier=1.,
             scale_tril=[[1., 0], [2., 1]])
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[1., 5]], run(bijector.forward, x))
         self.assertAllClose([[1., 0.5]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(4.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(4.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
         # scale = [[2., 0], [2, 3]]
         bijector = Affine(
             shift=mu, scale_diag=[1., 2.], scale_tril=[[1., 0], [2., 1]])
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[1., 7]], run(bijector.forward, x))
         self.assertAllClose([[1., 1 / 3.]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(6.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(6.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityAndDiagWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -486,23 +304,24 @@ class AffineBijectorTest(test.TestCase):
             scale_identity_multiplier=1.0,
             scale_diag=[1., 2.],
             scale_tril=[[1., 0], [2., 1]])
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[2., 9]], run(bijector.forward, x))
         self.assertAllClose([[2 / 3., 5 / 12.]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(12.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(12.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -514,7 +333,6 @@ class AffineBijectorTest(test.TestCase):
             scale_perturb_factor=[[2., 0], [0., 0], [0, 1]])
         bijector_ref = Affine(shift=mu, scale_diag=[10., 2, 3])
 
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
         self.assertAllClose([9., 3, 8], run(bijector.forward, x))
         self.assertAllClose(
@@ -523,22 +341,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 1.5, 4 / 3.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(60.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(60.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -550,7 +370,6 @@ class AffineBijectorTest(test.TestCase):
             scale_perturb_factor=[[2., 0], [0., 0], [0, 1]])
         bijector_ref = Affine(shift=mu, scale_diag=[10., 3, 5])
 
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
         self.assertAllClose([9., 5, 14], run(bijector.forward, x))
         self.assertAllClose(
@@ -558,22 +377,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 1., 0.8], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(150.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(150.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -586,7 +407,6 @@ class AffineBijectorTest(test.TestCase):
         bijector_ref = Affine(
             shift=mu, scale_tril=[[10., 0, 0], [1, 3, 0], [2, 3, 5]])
 
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
         self.assertAllClose([9., 6, 22], run(bijector.forward, x))
         self.assertAllClose(
@@ -594,22 +414,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 14 / 15., 4 / 25.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(150.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(150.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdateNoDiagonal(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -622,7 +444,6 @@ class AffineBijectorTest(test.TestCase):
         bijector_ref = Affine(
             shift=mu, scale_tril=[[6., 0, 0], [1, 3, 0], [2, 3, 5]])
 
-        self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
         self.assertAllClose([5., 6, 22], run(bijector.forward, x))
         self.assertAllClose(
@@ -630,11 +451,12 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([1 / 3., 8 / 9., 4 / 30.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(90.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(90.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateRaisesWhenSingular(self):
     with self.test_session():
@@ -647,38 +469,6 @@ class AffineBijectorTest(test.TestCase):
       with self.assertRaisesOpError("diagonal part must be non-zero"):
         bijector.forward([1., 1.]).eval()
 
-  def testEventNdimsLargerThanOneRaises(self):
-    with self.test_session():
-      mu = [1., -1]
-      with self.assertRaisesRegexp(
-          ValueError, (r"event_ndims\(2\) was not 0 or 1")):
-        # Scale corresponds to 2x2 identity matrix.
-        bijector = Affine(shift=mu, event_ndims=2, validate_args=True)
-        bijector.forward([1., 1.]).eval()
-
-  def testScaleZeroScalarRaises(self):
-    with self.test_session():
-      mu = -1.
-      # Check Identity matrix with zero scaling.
-      bijector = Affine(
-          shift=mu,
-          scale_identity_multiplier=0.,
-          event_ndims=0,
-          validate_args=True)
-      with self.assertRaisesOpError("identity_multiplier should be non-zero"):
-        bijector.forward(1.).eval()
-
-  def testScaleDiagAndEventNdimsZeroRaises(self):
-    # Check Diag matrix with zero scaling.
-    with self.assertRaisesRegexp(ValueError, "only scale argument"):
-      Affine(shift=None, scale_diag=[0.0], event_ndims=0, validate_args=True)
-
-  def testScalarCongruency(self):
-    with self.test_session():
-      bijector = Affine(
-          shift=3.6, scale_identity_multiplier=0.42, event_ndims=0)
-      assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.)
-
   def _makeScale(self,
                  x,
                  scale_identity_multiplier=None,
@@ -747,14 +537,12 @@ class AffineBijectorTest(test.TestCase):
         scale_args = dict({"x": x}, **args)
         scale = self._makeScale(**scale_args)
 
-        bijector_args = dict({"event_ndims": 1}, **args)
-
         # We haven't specified enough information for the scale.
         if scale is None:
           with self.assertRaisesRegexp(ValueError, ("must be specified.")):
-            bijector = Affine(shift=shift, **bijector_args)
+            bijector = Affine(shift=shift, **args)
         else:
-          bijector = Affine(shift=shift, **bijector_args)
+          bijector = Affine(shift=shift, **args)
           np_x = x
           # For the case a vector is passed in, we need to make the shape
           # match the matrix for matmul to work.
@@ -771,6 +559,7 @@ class AffineBijectorTest(test.TestCase):
             backward = np.squeeze(backward, axis=-1)
           self.assertAllClose(backward, bijector.inverse(x).eval())
 
+          scale *= np.ones(shape=x.shape[:-1], dtype=scale.dtype)
           ildj = -np.log(np.abs(np.linalg.det(scale)))
           # TODO(jvdillon): We need to make it so the scale_identity_multiplier
           # case does not deviate in expected shape. Fixing this will get rid of
@@ -781,7 +570,8 @@ class AffineBijectorTest(test.TestCase):
             ildj = np.squeeze(ildj[0])
           elif ildj.ndim < scale.ndim - 2:
             ildj = np.reshape(ildj, scale.shape[0:-2])
-          self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(x).eval())
+          self.assertAllClose(
+              ildj, bijector.inverse_log_det_jacobian(x, event_ndims=1).eval())
 
   def testLegalInputs(self):
     self._testLegalInputs(
@@ -829,15 +619,5 @@ class AffineBijectorTest(test.TestCase):
         x=np.array(
             [1., 2], dtype=np.float32))
 
-  def testScalarEventIdentityScale(self):
-    with self.test_session() as sess:
-      doubler = Affine(
-          scale_identity_multiplier=2.,
-          event_ndims=0)
-      doubler2 = doubler.inverse_log_det_jacobian(2.)
-      doubler2_ildj_ = sess.run([doubler2])
-      self.assertAllClose([-np.log(2.)], doubler2_ildj_)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c832fcaa686c92f83810e4f99ca3b23ae694b723
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
@@ -0,0 +1,237 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for BatchNorm Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import distributions
+from tensorflow.contrib.distributions.python.ops import test_util
+from tensorflow.contrib.distributions.python.ops.bijectors.batch_normalization import BatchNormalization
+from tensorflow.contrib.distributions.python.ops.bijectors.invert import Invert
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import normalization
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+class BatchNormTest(test_util.VectorDistributionTestHelpers,
+                    test.TestCase):
+
+  def _reduction_axes(self, input_shape, event_dims):
+    if isinstance(event_dims, int):
+      event_dims = [event_dims]
+    ndims = len(input_shape)
+    # Convert event_dims to non-negative indexing.
+    event_dims = list(event_dims)
+    for idx, x in enumerate(event_dims):
+      if x < 0:
+        event_dims[idx] = ndims + x
+    return tuple(i for i in range(ndims) if i not in event_dims)
+
+  def testForwardInverse(self):
+    """Tests forward and backward passes with different event shapes.
+
+    input_shape: Tuple of shapes for input tensor.
+    event_dims: Tuple of dimension indices that will be normalized.
+    training: Boolean of whether bijector runs in training or inference mode.
+    """
+    params = [
+        ((5*2, 4), [-1], False),
+        ((5, 2, 4), [-1], False),
+        ((5, 2, 4), [1, 2], False),
+        ((5, 2, 4), [0, 1], False),
+        ((5*2, 4), [-1], True),
+        ((5, 2, 4), [-1], True),
+        ((5, 2, 4), [1, 2], True),
+        ((5, 2, 4), [0, 1], True)
+    ]
+    for input_shape, event_dims, training in params:
+      x_ = np.arange(5 * 4 * 2).astype(np.float32).reshape(input_shape)
+      with self.test_session() as sess:
+        x = constant_op.constant(x_)
+        # When training, memorize the exact mean of the last
+        # minibatch that it normalized (instead of moving average assignment).
+        layer = normalization.BatchNormalization(
+            axis=event_dims, momentum=0., epsilon=0.)
+        batch_norm = BatchNormalization(
+            batchnorm_layer=layer, training=training)
+        # Minibatch statistics are saved only after norm_x has been computed.
+        norm_x = batch_norm.inverse(x)
+        with ops.control_dependencies(batch_norm.batchnorm.updates):
+          moving_mean = array_ops.identity(batch_norm.batchnorm.moving_mean)
+          moving_var = array_ops.identity(batch_norm.batchnorm.moving_variance)
+          denorm_x = batch_norm.forward(array_ops.identity(norm_x))
+          fldj = batch_norm.forward_log_det_jacobian(
+              x, event_ndims=len(event_dims))
+          # Use identity to invalidate cache.
+          ildj = batch_norm.inverse_log_det_jacobian(
+              array_ops.identity(denorm_x), event_ndims=len(event_dims))
+        variables.global_variables_initializer().run()
+        # Update variables.
+        norm_x_ = sess.run(norm_x)
+        [
+            norm_x_,
+            moving_mean_,
+            moving_var_,
+            denorm_x_,
+            ildj_,
+            fldj_,
+        ] = sess.run([
+            norm_x,
+            moving_mean,
+            moving_var,
+            denorm_x,
+            ildj,
+            fldj,
+        ])
+        self.assertEqual("batch_normalization", batch_norm.name)
+
+        reduction_axes = self._reduction_axes(input_shape, event_dims)
+        keepdims = len(event_dims) > 1
+
+        expected_batch_mean = np.mean(
+            x_, axis=reduction_axes, keepdims=keepdims)
+        expected_batch_var = np.var(x_, axis=reduction_axes, keepdims=keepdims)
+
+        if training:
+          # When training=True, values become normalized across batch dim and
+          # original values are recovered after de-normalizing.
+          zeros = np.zeros_like(norm_x_)
+          self.assertAllClose(np.mean(zeros, axis=reduction_axes),
+                              np.mean(norm_x_, axis=reduction_axes))
+
+          self.assertAllClose(expected_batch_mean, moving_mean_)
+          self.assertAllClose(expected_batch_var, moving_var_)
+          self.assertAllClose(x_, denorm_x_, atol=1e-5)
+          # Since moving statistics are set to batch statistics after
+          # normalization, ildj and -fldj should match.
+          self.assertAllClose(ildj_, -fldj_)
+          # ildj is computed with minibatch statistics.
+          expected_ildj = np.sum(np.log(1.) - .5 * np.log(
+              expected_batch_var + batch_norm.batchnorm.epsilon))
+          self.assertAllClose(expected_ildj, ildj_)
+        else:
+          # When training=False, moving_mean, moving_var remain at their
+          # initialized values (0., 1.), resulting in no scale/shift (a small
+          # shift occurs if epsilon > 0.)
+          self.assertAllClose(x_, norm_x_)
+          self.assertAllClose(x_, denorm_x_, atol=1e-5)
+          # ildj is computed with saved statistics.
+          expected_ildj = np.sum(
+              np.log(1.) - .5 * np.log(1. + batch_norm.batchnorm.epsilon))
+          self.assertAllClose(expected_ildj, ildj_)
+
+  def testMaximumLikelihoodTraining(self):
+    # Test Maximum Likelihood training with default bijector.
+    with self.test_session() as sess:
+      base_dist = distributions.MultivariateNormalDiag(loc=[0., 0.])
+      batch_norm = BatchNormalization(training=True)
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=base_dist,
+          bijector=batch_norm)
+      target_dist = distributions.MultivariateNormalDiag(loc=[1., 2.])
+      target_samples = target_dist.sample(100)
+      dist_samples = dist.sample(3000)
+      loss = -math_ops.reduce_mean(dist.log_prob(target_samples))
+      with ops.control_dependencies(batch_norm.batchnorm.updates):
+        train_op = adam.AdamOptimizer(1e-2).minimize(loss)
+        moving_mean = array_ops.identity(batch_norm.batchnorm.moving_mean)
+        moving_var = array_ops.identity(batch_norm.batchnorm.moving_variance)
+      variables.global_variables_initializer().run()
+      for _ in range(3000):
+        sess.run(train_op)
+      [
+          dist_samples_,
+          moving_mean_,
+          moving_var_
+      ] = sess.run([
+          dist_samples,
+          moving_mean,
+          moving_var
+      ])
+      self.assertAllClose([1., 2.], np.mean(dist_samples_, axis=0), atol=5e-2)
+      self.assertAllClose([1., 2.], moving_mean_, atol=5e-2)
+      self.assertAllClose([1., 1.], moving_var_, atol=5e-2)
+
+  def testLogProb(self):
+    with self.test_session() as sess:
+      layer = normalization.BatchNormalization(epsilon=0.)
+      batch_norm = BatchNormalization(batchnorm_layer=layer, training=False)
+      base_dist = distributions.MultivariateNormalDiag(loc=[0., 0.])
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=base_dist,
+          bijector=batch_norm,
+          validate_args=True)
+      samples = dist.sample(int(1e5))
+      # No volume distortion since training=False, bijector is initialized
+      # to the identity transformation.
+      base_log_prob = base_dist.log_prob(samples)
+      dist_log_prob = dist.log_prob(samples)
+      variables.global_variables_initializer().run()
+      base_log_prob_, dist_log_prob_ = sess.run([base_log_prob, dist_log_prob])
+      self.assertAllClose(base_log_prob_, dist_log_prob_)
+
+  def testMutuallyConsistent(self):
+    # BatchNorm bijector is only mutually consistent when training=False.
+    dims = 4
+    with self.test_session() as sess:
+      layer = normalization.BatchNormalization(epsilon=0.)
+      batch_norm = BatchNormalization(batchnorm_layer=layer, training=False)
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=batch_norm,
+          event_shape=[dims],
+          validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess_run_fn=sess.run,
+          dist=dist,
+          num_samples=int(1e5),
+          radius=2.,
+          center=0.,
+          rtol=0.02)
+
+  def testInvertMutuallyConsistent(self):
+    # BatchNorm bijector is only mutually consistent when training=False.
+    dims = 4
+    with self.test_session() as sess:
+      layer = normalization.BatchNormalization(epsilon=0.)
+      batch_norm = Invert(
+          BatchNormalization(batchnorm_layer=layer, training=False))
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=batch_norm,
+          event_shape=[dims],
+          validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess_run_fn=sess.run,
+          dist=dist,
+          num_samples=int(1e5),
+          radius=2.,
+          center=0.,
+          rtol=0.02)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index 20e754308449af3f0399101f4ea1bb47b3356424..ca20442c3940664feab7526110229872a6cdc41f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -20,21 +20,33 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import Chain
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
+class ShapeChanging(bijector.Bijector):
+  """Only used for op_ndims manipulation."""
+
+  def __init__(self, forward_min_event_ndims=0, inverse_min_event_ndims=3):
+    super(ShapeChanging, self).__init__(
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
+        validate_args=False, name="shape_changer")
+
+
 class ChainBijectorTest(test.TestCase):
   """Tests the correctness of the Y = Chain(bij1, bij2, bij3) transformation."""
 
   def testBijector(self):
     with self.test_session():
-      chain = Chain((Exp(event_ndims=1), Softplus(event_ndims=1)))
+      chain = Chain((Exp(), Softplus()))
       self.assertEqual("chain_of_exp_of_softplus", chain.name)
       x = np.asarray([[[1., 2.],
                        [2., 3.]]])
@@ -42,9 +54,10 @@ class ChainBijectorTest(test.TestCase):
       self.assertAllClose(np.log(x - 1.), chain.inverse(x).eval())
       self.assertAllClose(
           -np.sum(np.log(x - 1.), axis=2),
-          chain.inverse_log_det_jacobian(x).eval())
+          chain.inverse_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          np.sum(x, axis=2), chain.forward_log_det_jacobian(x).eval())
+          np.sum(x, axis=2),
+          chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testBijectorIdentity(self):
     with self.test_session():
@@ -54,33 +67,126 @@ class ChainBijectorTest(test.TestCase):
                        [2., 3.]]])
       self.assertAllClose(x, chain.forward(x).eval())
       self.assertAllClose(x, chain.inverse(x).eval())
-      self.assertAllClose(0., chain.inverse_log_det_jacobian(x).eval())
-      self.assertAllClose(0., chain.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          0., chain.inverse_log_det_jacobian(x, event_ndims=1).eval())
+      self.assertAllClose(
+          0., chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Chain((Exp(), Softplus()))
+      chain = Chain((Exp(), Softplus()))
       assert_scalar_congruency(
-          bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
+          chain, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = Chain([
-          SoftmaxCentered(
-              event_ndims=1, validate_args=True),
-          SoftmaxCentered(
-              event_ndims=0, validate_args=True)
+      chain = Chain([
+          SoftmaxCentered(validate_args=True),
+          SoftmaxCentered(validate_args=True),
       ])
-      x = tensor_shape.TensorShape([])
+      x = tensor_shape.TensorShape([1])
       y = tensor_shape.TensorShape([2 + 1])
-      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y, chain.forward_event_shape(x))
       self.assertAllEqual(
           y.as_list(),
-          bijector.forward_event_shape_tensor(x.as_list()).eval())
-      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+          chain.forward_event_shape_tensor(x.as_list()).eval())
+      self.assertAllEqual(x, chain.inverse_event_shape(y))
       self.assertAllEqual(
           x.as_list(),
-          bijector.inverse_event_shape_tensor(y.as_list()).eval())
+          chain.inverse_event_shape_tensor(y.as_list()).eval())
+
+  def testMinEventNdimsChain(self):
+    chain = Chain([Exp(), Exp(), Exp()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Affine(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Exp(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Exp()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Exp(), Softplus(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingAddDims(self):
+    chain = Chain([ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(3, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(4, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(3, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(), ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(6, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingRemoveDims(self):
+    chain = Chain([ShapeChanging(3, 0)])
+    self.assertEqual(3, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(3, 0), Affine()])
+    self.assertEqual(3, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), ShapeChanging(3, 0)])
+    self.assertEqual(4, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(3, 0), ShapeChanging(3, 0)])
+    self.assertEqual(6, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingAddRemoveDims(self):
+    chain = Chain([
+        ShapeChanging(2, 1),
+        ShapeChanging(3, 0),
+        ShapeChanging(1, 2)])
+    self.assertEqual(4, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+  def testChainExpAffine(self):
+    scale_diag = np.array([1., 2., 3.], dtype=np.float32)
+    chain = Chain([Exp(), Affine(scale_diag=scale_diag)])
+    x = [0., np.log(2., dtype=np.float32), np.log(3., dtype=np.float32)]
+    y = [1., 4., 27.]
+    self.assertAllClose(y, self.evaluate(chain.forward(x)))
+    self.assertAllClose(x, self.evaluate(chain.inverse(y)))
+    self.assertAllClose(
+        np.log(6, dtype=np.float32) + np.sum(scale_diag * x),
+        self.evaluate(chain.forward_log_det_jacobian(x, event_ndims=1)))
+
+    self.assertAllClose(
+        -np.log(6, dtype=np.float32) - np.sum(scale_diag * x),
+        self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
+
+  def testChainAffineExp(self):
+    scale_diag = np.array([1., 2., 3.], dtype=np.float32)
+    chain = Chain([Affine(scale_diag=scale_diag), Exp()])
+    x = [0., np.log(2., dtype=np.float32), np.log(3., dtype=np.float32)]
+    y = [1., 4., 9.]
+    self.assertAllClose(y, self.evaluate(chain.forward(x)))
+    self.assertAllClose(x, self.evaluate(chain.inverse(y)))
+    self.assertAllClose(
+        np.log(6, dtype=np.float32) + np.sum(x),
+        self.evaluate(chain.forward_log_det_jacobian(x, event_ndims=1)))
+
+    self.assertAllClose(
+        -np.log(6, dtype=np.float32) - np.sum(x),
+        self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
index 0ff35304283fce9ce3f9e5d31b1258394e384d7b..e281e81bdf0698c1f7b2f60fb27783dd1351773f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
@@ -18,70 +18,114 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.distributions import gamma as gamma_lib
-from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
-from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
-class InvertBijectorTest(test.TestCase):
-  """Tests the correctness of the Y = Invert(bij) transformation."""
+class CholeskyOuterProductBijectorTest(test.TestCase):
+  """Tests the correctness of the Y = X @ X.T transformation."""
 
-  def testBijector(self):
+  def testBijectorMatrix(self):
     with self.test_session():
-      for fwd in [
-          bijectors.Identity(),
-          bijectors.Exp(event_ndims=1),
-          bijectors.Affine(
-              shift=[0., 1.], scale_diag=[2., 3.], event_ndims=1),
-          bijectors.Softplus(event_ndims=1),
-          bijectors.SoftmaxCentered(event_ndims=1),
-          bijectors.SigmoidCentered(),
-      ]:
-        rev = bijectors.Invert(fwd)
-        self.assertEqual("_".join(["invert", fwd.name]), rev.name)
-        x = [[[1., 2.],
-              [2., 3.]]]
-        self.assertAllClose(fwd.inverse(x).eval(), rev.forward(x).eval())
-        self.assertAllClose(fwd.forward(x).eval(), rev.inverse(x).eval())
-        self.assertAllClose(
-            fwd.forward_log_det_jacobian(x).eval(),
-            rev.inverse_log_det_jacobian(x).eval())
-        self.assertAllClose(
-            fwd.inverse_log_det_jacobian(x).eval(),
-            rev.forward_log_det_jacobian(x).eval())
+      bijector = bijectors.CholeskyOuterProduct(validate_args=True)
+      self.assertEqual("cholesky_outer_product", bijector.name)
+      x = [[[1., 0], [2, 1]], [[np.sqrt(2.), 0], [np.sqrt(8.), 1]]]
+      y = np.matmul(x, np.transpose(x, axes=(0, 2, 1)))
+      # Fairly easy to compute differentials since we have 2x2.
+      dx_dy = [[[2. * 1, 0, 0],
+                [2, 1, 0],
+                [0, 2 * 2, 2 * 1]],
+               [[2 * np.sqrt(2.), 0, 0],
+                [np.sqrt(8.), np.sqrt(2.), 0],
+                [0, 2 * np.sqrt(8.), 2 * 1]]]
+      ildj = -np.sum(
+          np.log(np.asarray(dx_dy).diagonal(
+              offset=0, axis1=1, axis2=2)),
+          axis=1)
+      self.assertAllEqual((2, 2, 2), bijector.forward(x).get_shape())
+      self.assertAllEqual((2, 2, 2), bijector.inverse(y).get_shape())
+      self.assertAllClose(y, bijector.forward(x).eval())
+      self.assertAllClose(x, bijector.inverse(y).eval())
+      self.assertAllClose(
+          ildj, bijector.inverse_log_det_jacobian(
+              y, event_ndims=2).eval(), atol=0., rtol=1e-7)
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(
+              y, event_ndims=2).eval(),
+          bijector.forward_log_det_jacobian(
+              x, event_ndims=2).eval(),
+          atol=0.,
+          rtol=1e-7)
 
-  def testScalarCongruency(self):
-    with self.test_session():
-      bijector = bijectors.Invert(bijectors.Exp())
-      assert_scalar_congruency(
-          bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
+  def testNoBatchStatic(self):
+    x = np.array([[1., 0], [2, 1]])  # np.linalg.cholesky(y)
+    y = np.array([[1., 2], [2, 5]])  # np.matmul(x, x.T)
+    with self.test_session() as sess:
+      y_actual = bijectors.CholeskyOuterProduct().forward(x=x)
+      x_actual = bijectors.CholeskyOuterProduct().inverse(y=y)
+    [y_actual_, x_actual_] = sess.run([y_actual, x_actual])
+    self.assertAllEqual([2, 2], y_actual.get_shape())
+    self.assertAllEqual([2, 2], x_actual.get_shape())
+    self.assertAllClose(y, y_actual_)
+    self.assertAllClose(x, x_actual_)
 
-  def testShapeGetters(self):
-    with self.test_session():
-      bijector = bijectors.Invert(bijectors.SigmoidCentered(validate_args=True))
-      x = tensor_shape.TensorShape([2])
-      y = tensor_shape.TensorShape([])
-      self.assertAllEqual(y, bijector.forward_event_shape(x))
-      self.assertAllEqual(
-          y.as_list(),
-          bijector.forward_event_shape_tensor(x.as_list()).eval())
-      self.assertAllEqual(x, bijector.inverse_event_shape(y))
-      self.assertAllEqual(
-          x.as_list(),
-          bijector.inverse_event_shape_tensor(y.as_list()).eval())
+  def testNoBatchDeferred(self):
+    x = np.array([[1., 0], [2, 1]])  # np.linalg.cholesky(y)
+    y = np.array([[1., 2], [2, 5]])  # np.matmul(x, x.T)
+    with self.test_session() as sess:
+      x_pl = array_ops.placeholder(dtypes.float32)
+      y_pl = array_ops.placeholder(dtypes.float32)
+      y_actual = bijectors.CholeskyOuterProduct().forward(x=x_pl)
+      x_actual = bijectors.CholeskyOuterProduct().inverse(y=y_pl)
+    [y_actual_, x_actual_] = sess.run([y_actual, x_actual],
+                                      feed_dict={x_pl: x, y_pl: y})
+    self.assertEqual(None, y_actual.get_shape())
+    self.assertEqual(None, x_actual.get_shape())
+    self.assertAllClose(y, y_actual_)
+    self.assertAllClose(x, x_actual_)
 
-  def testDocstringExample(self):
-    with self.test_session():
-      exp_gamma_distribution = (
-          transformed_distribution_lib.TransformedDistribution(
-              distribution=gamma_lib.Gamma(concentration=1., rate=2.),
-              bijector=bijectors.Invert(bijectors.Exp())))
-      self.assertAllEqual(
-          [], array_ops.shape(exp_gamma_distribution.sample()).eval())
+  def testBatchStatic(self):
+    x = np.array([[[1., 0],
+                   [2, 1]],
+                  [[3., 0],
+                   [1, 2]]])  # np.linalg.cholesky(y)
+    y = np.array([[[1., 2],
+                   [2, 5]],
+                  [[9., 3],
+                   [3, 5]]])  # np.matmul(x, x.T)
+    with self.test_session() as sess:
+      y_actual = bijectors.CholeskyOuterProduct().forward(x=x)
+      x_actual = bijectors.CholeskyOuterProduct().inverse(y=y)
+    [y_actual_, x_actual_] = sess.run([y_actual, x_actual])
+    self.assertEqual([2, 2, 2], y_actual.get_shape())
+    self.assertEqual([2, 2, 2], x_actual.get_shape())
+    self.assertAllClose(y, y_actual_)
+    self.assertAllClose(x, x_actual_)
+
+  def testBatchDeferred(self):
+    x = np.array([[[1., 0],
+                   [2, 1]],
+                  [[3., 0],
+                   [1, 2]]])  # np.linalg.cholesky(y)
+    y = np.array([[[1., 2],
+                   [2, 5]],
+                  [[9., 3],
+                   [3, 5]]])  # np.matmul(x, x.T)
+    with self.test_session() as sess:
+      x_pl = array_ops.placeholder(dtypes.float32)
+      y_pl = array_ops.placeholder(dtypes.float32)
+      y_actual = bijectors.CholeskyOuterProduct().forward(x=x_pl)
+      x_actual = bijectors.CholeskyOuterProduct().inverse(y=y_pl)
+    [y_actual_, x_actual_] = sess.run([y_actual, x_actual],
+                                      feed_dict={x_pl: x, y_pl: y})
+    self.assertEqual(None, y_actual.get_shape())
+    self.assertEqual(None, x_actual.get_shape())
+    self.assertAllClose(y, y_actual_)
+    self.assertAllClose(x, x_actual_)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
index 26e0d2a539c78540603281ae0f361987a7bf8d90..8b279ebcd908b6f375b35594ac5f3db9228a1e31 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
@@ -27,7 +27,7 @@ class _TestBijector(ConditionalBijector):
 
   def __init__(self):
     super(_TestBijector, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         graph_parents=[],
         is_constant_jacobian=True,
         validate_args=False,
@@ -51,11 +51,15 @@ class ConditionalBijectorTest(test.TestCase):
 
   def testConditionalBijector(self):
     b = _TestBijector()
-    for name in ["forward", "inverse", "inverse_log_det_jacobian",
-                 "forward_log_det_jacobian"]:
+    for name in ["forward", "inverse"]:
       method = getattr(b, name)
       with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"):
-        method(1.0, arg1="b1", arg2="b2")
+        method(1., arg1="b1", arg2="b2")
+
+    for name in ["inverse_log_det_jacobian", "forward_log_det_jacobian"]:
+      method = getattr(b, name)
+      with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"):
+        method(1., event_ndims=0., arg1="b1", arg2="b2")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
index 9970c0b4d86afda188d9401ebaf3c98d3fffbfdf..7be939cd274e6f0e33c9b01c82494755db2caa73 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
@@ -31,17 +31,21 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      bijector = Exp(event_ndims=1)
+      bijector = Exp()
       self.assertEqual("exp", bijector.name)
       x = [[[1.], [2.]]]
       y = np.exp(x)
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          -np.sum(np.log(y), axis=-1),
-          bijector.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-bijector.inverse_log_det_jacobian(np.exp(x)).eval(),
-                          bijector.forward_log_det_jacobian(x).eval())
+          -np.squeeze(np.log(y), axis=-1),
+          bijector.inverse_log_det_jacobian(
+              y, event_ndims=1).eval())
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(
+              np.exp(x), event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(
+              x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -51,10 +55,10 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = Exp(event_ndims=0)
+      bijector = Exp()
       x = np.linspace(-10, 10, num=10).astype(np.float32)
       y = np.logspace(-10, 10, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
index 9a905980c7581a86bbcda8c6c726da57c09fe4f8..54e54c3296a89a4fe29a3cce971760502b65e784 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
@@ -34,7 +34,7 @@ class GumbelBijectorTest(test.TestCase):
     with self.test_session():
       loc = 0.3
       scale = 5.
-      bijector = Gumbel(loc=loc, scale=scale, event_ndims=1, validate_args=True)
+      bijector = Gumbel(loc=loc, scale=scale, validate_args=True)
       self.assertEqual("gumbel", bijector.name)
       x = np.array([[[-3.], [0.], [0.5], [4.2], [12.]]], dtype=np.float32)
       # Gumbel distribution
@@ -43,13 +43,11 @@ class GumbelBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          np.squeeze(gumbel_dist.logpdf(x), axis=2),
-          bijector.forward_log_det_jacobian(x).eval())
+          np.squeeze(gumbel_dist.logpdf(x), axis=-1),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -60,10 +58,10 @@ class GumbelBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = Gumbel(loc=0., scale=3.0, event_ndims=0, validate_args=True)
+      bijector = Gumbel(loc=0., scale=3.0, validate_args=True)
       x = np.linspace(-10., 10., num=10).astype(np.float32)
       y = np.linspace(0.01, 0.99, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
index 739fa6d439a8bce993ab1b4601489d9bbcd69bee..7d3bd758cd2db307f95d2d934923ea2133dc1217 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
@@ -33,15 +33,13 @@ class InlineBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      exp = Exp(event_ndims=1)
+      exp = Exp()
       inline = Inline(
           forward_fn=math_ops.exp,
           inverse_fn=math_ops.log,
-          inverse_log_det_jacobian_fn=(
-              lambda y: -math_ops.reduce_sum(  # pylint: disable=g-long-lambda
-                  math_ops.log(y), reduction_indices=-1)),
-          forward_log_det_jacobian_fn=(
-              lambda x: math_ops.reduce_sum(x, reduction_indices=-1)),
+          inverse_log_det_jacobian_fn=lambda y: -math_ops.log(y),
+          forward_log_det_jacobian_fn=lambda x: x,
+          forward_min_event_ndims=0,
           name="exp")
 
       self.assertEqual(exp.name, inline.name)
@@ -51,9 +49,10 @@ class InlineBijectorTest(test.TestCase):
       self.assertAllClose(x, inline.inverse(y).eval())
       self.assertAllClose(
           -np.sum(np.log(y), axis=-1),
-          inline.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-inline.inverse_log_det_jacobian(y).eval(),
-                          inline.forward_log_det_jacobian(x).eval())
+          inline.inverse_log_det_jacobian(y, event_ndims=1).eval())
+      self.assertAllClose(
+          -inline.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          inline.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testShapeGetters(self):
     with self.test_session():
@@ -62,6 +61,7 @@ class InlineBijectorTest(test.TestCase):
           forward_event_shape_fn=lambda x: x.as_list() + [1],
           inverse_event_shape_tensor_fn=lambda x: x[:-1],
           inverse_event_shape_fn=lambda x: x[:-1],
+          forward_min_event_ndims=0,
           name="shape_only")
       x = tensor_shape.TensorShape([1, 2, 3])
       y = tensor_shape.TensorShape([1, 2, 3, 1])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 0ff35304283fce9ce3f9e5d31b1258394e384d7b..8b14c8327f08902044f50483f9f8dfe67b58cd70 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -34,12 +34,10 @@ class InvertBijectorTest(test.TestCase):
     with self.test_session():
       for fwd in [
           bijectors.Identity(),
-          bijectors.Exp(event_ndims=1),
-          bijectors.Affine(
-              shift=[0., 1.], scale_diag=[2., 3.], event_ndims=1),
-          bijectors.Softplus(event_ndims=1),
-          bijectors.SoftmaxCentered(event_ndims=1),
-          bijectors.SigmoidCentered(),
+          bijectors.Exp(),
+          bijectors.Affine(shift=[0., 1.], scale_diag=[2., 3.]),
+          bijectors.Softplus(),
+          bijectors.SoftmaxCentered(),
       ]:
         rev = bijectors.Invert(fwd)
         self.assertEqual("_".join(["invert", fwd.name]), rev.name)
@@ -48,11 +46,11 @@ class InvertBijectorTest(test.TestCase):
         self.assertAllClose(fwd.inverse(x).eval(), rev.forward(x).eval())
         self.assertAllClose(fwd.forward(x).eval(), rev.inverse(x).eval())
         self.assertAllClose(
-            fwd.forward_log_det_jacobian(x).eval(),
-            rev.inverse_log_det_jacobian(x).eval())
+            fwd.forward_log_det_jacobian(x, event_ndims=1).eval(),
+            rev.inverse_log_det_jacobian(x, event_ndims=1).eval())
         self.assertAllClose(
-            fwd.inverse_log_det_jacobian(x).eval(),
-            rev.forward_log_det_jacobian(x).eval())
+            fwd.inverse_log_det_jacobian(x, event_ndims=1).eval(),
+            rev.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -62,9 +60,9 @@ class InvertBijectorTest(test.TestCase):
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = bijectors.Invert(bijectors.SigmoidCentered(validate_args=True))
+      bijector = bijectors.Invert(bijectors.SoftmaxCentered(validate_args=True))
       x = tensor_shape.TensorShape([2])
-      y = tensor_shape.TensorShape([])
+      y = tensor_shape.TensorShape([1])
       self.assertAllEqual(y, bijector.forward_event_shape(x))
       self.assertAllEqual(
           y.as_list(),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8089881f684db9f8876d6dd738e52bf2f1f7606
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -0,0 +1,77 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Kumaraswamy Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import Kumaraswamy
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+
+class KumaraswamyBijectorTest(test.TestCase):
+  """Tests correctness of the Kumaraswamy bijector."""
+
+  def testBijector(self):
+    with self.test_session():
+      a = 2.
+      b = 0.3
+      bijector = Kumaraswamy(
+          concentration1=a, concentration0=b, validate_args=True)
+      self.assertEqual("kumaraswamy", bijector.name)
+      x = np.array([[[0.1], [0.2], [0.3], [0.4], [0.5]]], dtype=np.float32)
+      # Kumaraswamy cdf. This is the same as inverse(x).
+      y = 1. - (1. - x ** a) ** b
+      self.assertAllClose(y, bijector.inverse(x).eval())
+      self.assertAllClose(x, bijector.forward(y).eval())
+      kumaraswamy_log_pdf = (np.log(a) + np.log(b) + (a - 1) * np.log(x) +
+                             (b - 1) * np.log1p(-x ** a))
+
+      self.assertAllClose(
+          np.squeeze(kumaraswamy_log_pdf, axis=-1),
+          bijector.inverse_log_det_jacobian(x, event_ndims=1).eval())
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(x, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(y, event_ndims=1).eval(),
+          rtol=1e-4,
+          atol=0.)
+
+  def testScalarCongruency(self):
+    with self.test_session():
+      assert_scalar_congruency(
+          Kumaraswamy(concentration1=0.5, concentration0=1.1),
+          lower_x=0., upper_x=1., n=int(10e3), rtol=0.02)
+
+  def testBijectiveAndFinite(self):
+    with self.test_session():
+      concentration1 = 1.2
+      concentration0 = 2.
+      bijector = Kumaraswamy(
+          concentration1=concentration1,
+          concentration0=concentration0, validate_args=True)
+      # Omitting the endpoints 0 and 1, since idlj will be infinity at these
+      # endpoints.
+      y = np.linspace(.01, 0.99, num=10).astype(np.float32)
+      x = 1 - (1 - y ** concentration1) ** concentration0
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index dcfb0eb05185d36d96947905c2eb91b2201aece1..5ba5a2083bf11791d7d58146dc2e6283b524d241 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -79,9 +79,10 @@ class MaskedAutoregressiveFlowTest(test_util.VectorDistributionTestHelpers,
       forward_x = ma.forward(x)
       # Use identity to invalidate cache.
       inverse_y = ma.inverse(array_ops.identity(forward_x))
-      fldj = ma.forward_log_det_jacobian(x)
+      fldj = ma.forward_log_det_jacobian(x, event_ndims=1)
       # Use identity to invalidate cache.
-      ildj = ma.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      ildj = ma.inverse_log_det_jacobian(
+          array_ops.identity(forward_x), event_ndims=1)
       variables.global_variables_initializer().run()
       [
           forward_x_,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f5219588fb3be67beb797ba68ed8148e9e9fd2
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -0,0 +1,109 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.platform import test
+
+
+
+class OrderedBijectorTest(test.TestCase):
+  """Tests correctness of the ordered transformation."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorVector(self):
+    with self.test_session():
+      ordered = Ordered()
+      self.assertEqual("ordered", ordered.name)
+      x = np.asarray([[2., 3, 4], [4., 8, 13]])
+      y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
+      self.assertAllClose(y, self.evaluate(ordered.forward(x)))
+      self.assertAllClose(x, self.evaluate(ordered.inverse(y)))
+      self.assertAllClose(
+          np.sum(np.asarray(y)[..., 1:], axis=-1),
+          self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)),
+          atol=0.,
+          rtol=1e-7)
+      self.assertAllClose(
+          self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)),
+          self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)),
+          atol=0.,
+          rtol=1e-7)
+
+  def testBijectorUnknownShape(self):
+    with self.test_session():
+      ordered = Ordered()
+      self.assertEqual("ordered", ordered.name)
+      x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
+      real_x = np.asarray([[2., 3, 4], [4., 8, 13]])
+      y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
+      real_y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
+      self.assertAllClose(real_y, ordered.forward(x).eval(
+          feed_dict={x: real_x}))
+      self.assertAllClose(real_x, ordered.inverse(y).eval(
+          feed_dict={y: real_y}))
+      self.assertAllClose(
+          np.sum(np.asarray(real_y)[..., 1:], axis=-1),
+          ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
+              feed_dict={y: real_y}),
+          atol=0.,
+          rtol=1e-7)
+      self.assertAllClose(
+          -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
+              feed_dict={y: real_y}),
+          ordered.forward_log_det_jacobian(x, event_ndims=1).eval(
+              feed_dict={x: real_x}),
+          atol=0.,
+          rtol=1e-7)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testShapeGetters(self):
+    with self.test_session():
+      x = tensor_shape.TensorShape([4])
+      y = tensor_shape.TensorShape([4])
+      bijector = Ordered(validate_args=True)
+      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y.as_list(),
+                          self.evaluate(bijector.forward_event_shape_tensor(
+                              x.as_list())))
+      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+      self.assertAllEqual(x.as_list(),
+                          self.evaluate(bijector.inverse_event_shape_tensor(
+                              y.as_list())))
+
+  def testBijectiveAndFinite(self):
+    with self.test_session():
+      ordered = Ordered()
+      x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32)
+      y = (self._rng.randn(3, 10)).astype(np.float32)
+      assert_bijective_and_finite(ordered, x, y, event_ndims=1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
index 54590de373441c32cc3214cb04d45cfc2d1807ed..7eef4ab599951bbb624652f13a0091363b36b93d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
@@ -53,8 +53,8 @@ class PermuteBijectorTest(test.TestCase):
           bijector.permutation,
           bijector.inverse(expected_y),
           bijector.forward(expected_x),
-          bijector.forward_log_det_jacobian(expected_x),
-          bijector.inverse_log_det_jacobian(expected_y),
+          bijector.forward_log_det_jacobian(expected_x, event_ndims=1),
+          bijector.inverse_log_det_jacobian(expected_y, event_ndims=1),
       ], feed_dict={permutation_ph: expected_permutation})
       self.assertEqual("permute", bijector.name)
       self.assertAllEqual(expected_permutation, permutation_)
@@ -78,10 +78,9 @@ class PermuteBijectorTest(test.TestCase):
     x = np.random.randn(4, 2, 3)
     y = x[..., permutation]
     with self.test_session():
-      bijector = Permute(
-          permutation=permutation,
-          validate_args=True)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
+      bijector = Permute(permutation=permutation, validate_args=True)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=1, rtol=1e-6, atol=0)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
index de1659aa9f4d0f7d19ec2e8185715573b78eaf2b..85d22830132816cd6c77cd0b07870f3a22ae9798 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
@@ -32,8 +32,7 @@ class PowerTransformBijectorTest(test.TestCase):
   def testBijector(self):
     with self.test_session():
       c = 0.2
-      bijector = PowerTransform(
-          power=c, event_ndims=1, validate_args=True)
+      bijector = PowerTransform(power=c, validate_args=True)
       self.assertEqual("power_transform", bijector.name)
       x = np.array([[[-1.], [2.], [-5. + 1e-4]]])
       y = (1. + x * c)**(1. / c)
@@ -41,27 +40,25 @@ class PowerTransformBijectorTest(test.TestCase):
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
           (c - 1.) * np.sum(np.log(y), axis=-1),
-          bijector.inverse_log_det_jacobian(y).eval())
+          bijector.inverse_log_det_jacobian(y, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = PowerTransform(
-          power=0.2, validate_args=True)
+      bijector = PowerTransform(power=0.2, validate_args=True)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = PowerTransform(
-          power=0.2, event_ndims=0, validate_args=True)
+      bijector = PowerTransform(power=0.2, validate_args=True)
       x = np.linspace(-4.999, 10, num=10).astype(np.float32)
       y = np.logspace(0.001, 10, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
index 46fe7797419a9906ecdad60dd0dfe1e9d7c743ed..2d52895fbe0967cdd2260d6d298a291286858d09 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
@@ -52,24 +52,28 @@ class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
       forward_x = nvp.forward(x)
       # Use identity to invalidate cache.
       inverse_y = nvp.inverse(array_ops.identity(forward_x))
-      fldj = nvp.forward_log_det_jacobian(x)
+      forward_inverse_y = nvp.forward(inverse_y)
+      fldj = nvp.forward_log_det_jacobian(x, event_ndims=1)
       # Use identity to invalidate cache.
-      ildj = nvp.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      ildj = nvp.inverse_log_det_jacobian(
+          array_ops.identity(forward_x), event_ndims=1)
       variables.global_variables_initializer().run()
       [
           forward_x_,
           inverse_y_,
+          forward_inverse_y_,
           ildj_,
           fldj_,
       ] = sess.run([
           forward_x,
           inverse_y,
+          forward_inverse_y,
           ildj,
           fldj,
       ])
       self.assertEqual("real_nvp", nvp.name)
-      self.assertAllClose(forward_x_, forward_x_, rtol=1e-6, atol=0.)
-      self.assertAllClose(x_, inverse_y_, rtol=1e-5, atol=0.)
+      self.assertAllClose(forward_x_, forward_inverse_y_, rtol=1e-1, atol=0.)
+      self.assertAllClose(x_, inverse_y_, rtol=1e-1, atol=0.)
       self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=0.)
 
   def testMutuallyConsistent(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index e216d88cb190dc16fc0056186f80817d6f2d7c67..46f2c63f9b0f78b25bb1948e6ea55ab20c5cfa6e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -65,8 +65,8 @@ class _ReshapeBijectorTest(object):
        ildj_) = sess.run((
            bijector.inverse(expected_y),
            bijector.forward(expected_x),
-           bijector.forward_log_det_jacobian(expected_x),
-           bijector.inverse_log_det_jacobian(expected_y),
+           bijector.forward_log_det_jacobian(expected_x, event_ndims=2),
+           bijector.inverse_log_det_jacobian(expected_y, event_ndims=2),
        ), feed_dict=feed_dict)
       self.assertEqual("reshape", bijector.name)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
@@ -301,7 +301,8 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
           event_shape_in=[2, 3],
           event_shape_out=[1, 2, 3],
           validate_args=True)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=2, rtol=1e-6, atol=0)
 
   def testInvalidDimensionsOpError(self):
     if ops._USE_C_API:
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
deleted file mode 100644
index 4ff3f334ccb59f1c117b3d35032d9e799cfd79bb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import SigmoidCentered
-from tensorflow.python.platform import test
-
-
-class SigmoidCenteredBijectorTest(test.TestCase):
-  """Tests correctness of the Y = g(X) = (1 + exp(-X))^-1 transformation."""
-
-  def testBijector(self):
-    with self.test_session():
-      sigmoid = SigmoidCentered()
-      self.assertEqual("sigmoid_centered", sigmoid.name)
-      x = np.log([[2., 3, 4],
-                  [4., 8, 12]])
-      y = [[[2. / 3, 1. / 3],
-            [3. / 4, 1. / 4],
-            [4. / 5, 1. / 5]],
-           [[4. / 5, 1. / 5],
-            [8. / 9, 1. / 9],
-            [12. / 13, 1. / 13]]]
-      self.assertAllClose(y, sigmoid.forward(x).eval())
-      self.assertAllClose(x, sigmoid.inverse(y).eval())
-      self.assertAllClose(
-          -np.sum(np.log(y), axis=2),
-          sigmoid.inverse_log_det_jacobian(y).eval(),
-          atol=0.,
-          rtol=1e-7)
-      self.assertAllClose(
-          -sigmoid.inverse_log_det_jacobian(y).eval(),
-          sigmoid.forward_log_det_jacobian(x).eval(),
-          atol=0.,
-          rtol=1e-7)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
index e4f9d72785c301284812a48c0a67614ca439ffae..cea4a62c22af5d98d38ee881b29c773e6a27a4b4 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
@@ -36,12 +36,13 @@ class SigmoidBijectorTest(test.TestCase):
       x = np.linspace(-10., 10., 100).reshape([2, 5, 10]).astype(np.float32)
       y = special.expit(x)
       ildj = -np.log(y) - np.log1p(-y)
-      self.assertAllClose(y, Sigmoid().forward(x).eval(), atol=0., rtol=1e-2)
-      self.assertAllClose(x, Sigmoid().inverse(y).eval(), atol=0., rtol=1e-4)
-      self.assertAllClose(ildj, Sigmoid().inverse_log_det_jacobian(y).eval(),
-                          atol=0., rtol=1e-6)
-      self.assertAllClose(-ildj, Sigmoid().forward_log_det_jacobian(x).eval(),
-                          atol=0., rtol=1e-4)
+      bijector = Sigmoid()
+      self.assertAllClose(y, bijector.forward(x).eval(), atol=0., rtol=1e-2)
+      self.assertAllClose(x, bijector.inverse(y).eval(), atol=0., rtol=1e-4)
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=0).eval(), atol=0., rtol=1e-6)
+      self.assertAllClose(-ildj, bijector.forward_log_det_jacobian(
+          x, event_ndims=0).eval(), atol=0., rtol=1e-4)
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -52,7 +53,8 @@ class SigmoidBijectorTest(test.TestCase):
       x = np.linspace(-7., 7., 100).astype(np.float32)
       eps = 1e-3
       y = np.linspace(eps, 1. - eps, 100).astype(np.float32)
-      assert_bijective_and_finite(Sigmoid(), x, y, atol=0., rtol=1e-4)
+      assert_bijective_and_finite(
+          Sigmoid(), x, y, event_ndims=0, atol=0., rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
index 172c180a44229089f06f250a872bc47a89991cf0..45760a29ee42835da69ef63803ccec7ce82a5a8f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
@@ -39,7 +39,6 @@ class SinhArcsinhBijectorTest(test.TestCase):
       bijector = SinhArcsinh(
           skewness=skewness,
           tailweight=tailweight,
-          event_ndims=1,
           validate_args=True)
       self.assertEqual("SinhArcsinh", bijector.name)
       x = np.array([[[-2.01], [2.], [1e-4]]]).astype(np.float32)
@@ -50,10 +49,11 @@ class SinhArcsinhBijectorTest(test.TestCase):
           np.sum(
               np.log(np.cosh(np.arcsinh(y) / tailweight - skewness)) -
               np.log(tailweight) - np.log(np.sqrt(y**2 + 1)),
-              axis=-1), bijector.inverse_log_det_jacobian(y).eval())
+              axis=-1),
+          bijector.inverse_log_det_jacobian(y, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -106,14 +106,15 @@ class SinhArcsinhBijectorTest(test.TestCase):
       bijector = SinhArcsinh(skewness=-1., tailweight=0.5, validate_args=True)
       x = np.concatenate((-np.logspace(-2, 10, 1000), [0], np.logspace(
           -2, 10, 1000))).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectiveAndFiniteSkewness1Tailweight3(self):
     with self.test_session():
       bijector = SinhArcsinh(skewness=1., tailweight=3., validate_args=True)
       x = np.concatenate((-np.logspace(-2, 5, 1000), [0], np.logspace(
           -2, 5, 1000))).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+      assert_bijective_and_finite(
+          bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectorEndpoints(self):
     with self.test_session():
@@ -124,7 +125,8 @@ class SinhArcsinhBijectorTest(test.TestCase):
             [np.finfo(dtype).min, np.finfo(dtype).max], dtype=dtype)
         # Note that the above bijector is the identity bijector. Hence, the
         # log_det_jacobian will be 0. Because of this we use atol.
-        assert_bijective_and_finite(bijector, bounds, bounds, atol=2e-6)
+        assert_bijective_and_finite(
+            bijector, bounds, bounds, event_ndims=0, atol=2e-6)
 
   def testBijectorOverRange(self):
     with self.test_session():
@@ -156,12 +158,12 @@ class SinhArcsinhBijectorTest(test.TestCase):
                 np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
                     y_float128**2 + 1)) -
             np.log(tailweight),
-            bijector.inverse_log_det_jacobian(y).eval(),
+            bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
             rtol=1e-4,
             atol=0.)
         self.assertAllClose(
-            -bijector.inverse_log_det_jacobian(y).eval(),
-            bijector.forward_log_det_jacobian(x).eval(),
+            -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+            bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
             rtol=1e-4,
             atol=0.)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index 62e3869db090e9c9327bc552d10234ff76ba28fd..0f0a2fa531a0585a709df4c2c3e2631e5c275986 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -21,7 +21,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
@@ -32,70 +34,68 @@ rng = np.random.RandomState(42)
 class SoftmaxCenteredBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = exp(X) / sum(exp(X)) transformation."""
 
-  def testBijectorScalar(self):
+  def testBijectorVector(self):
     with self.test_session():
-      softmax = SoftmaxCentered()  # scalar by default
+      softmax = SoftmaxCentered()
       self.assertEqual("softmax_centered", softmax.name)
-      x = np.log([[2., 3, 4],
-                  [4., 8, 12]])
-      y = [[[2. / 3, 1. / 3],
-            [3. / 4, 1. / 4],
-            [4. / 5, 1. / 5]],
-           [[4. / 5, 1. / 5],
-            [8. / 9, 1. / 9],
-            [12. / 13, 1. / 13]]]
+      x = np.log([[2., 3, 4], [4., 8, 12]])
+      y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
       self.assertAllClose(y, softmax.forward(x).eval())
       self.assertAllClose(x, softmax.inverse(y).eval())
       self.assertAllClose(
-          -np.sum(np.log(y), axis=2),
-          softmax.inverse_log_det_jacobian(y).eval(),
+          -np.sum(np.log(y), axis=1),
+          softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(),
-          softmax.forward_log_det_jacobian(x).eval(),
+          -softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          softmax.forward_log_det_jacobian(x, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
 
-  def testBijectorVector(self):
+  def testBijectorUnknownShape(self):
     with self.test_session():
-      softmax = SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered()
       self.assertEqual("softmax_centered", softmax.name)
-      x = np.log([[2., 3, 4], [4., 8, 12]])
-      y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
-      self.assertAllClose(y, softmax.forward(x).eval())
-      self.assertAllClose(x, softmax.inverse(y).eval())
+      x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
+      real_x = np.log([[2., 3, 4], [4., 8, 12]])
+      y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
+      real_y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
+      self.assertAllClose(real_y, softmax.forward(x).eval(
+          feed_dict={x: real_x}))
+      self.assertAllClose(real_x, softmax.inverse(y).eval(
+          feed_dict={y: real_y}))
       self.assertAllClose(
-          -np.sum(np.log(y), axis=1),
-          softmax.inverse_log_det_jacobian(y).eval(),
+          -np.sum(np.log(real_y), axis=1),
+          softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(
+              feed_dict={y: real_y}),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(),
-          softmax.forward_log_det_jacobian(x).eval(),
+          -softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(
+              feed_dict={y: real_y}),
+          softmax.forward_log_det_jacobian(x, event_ndims=1).eval(
+              feed_dict={x: real_x}),
           atol=0.,
           rtol=1e-7)
 
   def testShapeGetters(self):
     with self.test_session():
-      for x, y, b in ((tensor_shape.TensorShape([]),
-                       tensor_shape.TensorShape([2]),
-                       SoftmaxCentered(
-                           event_ndims=0, validate_args=True)),
-                      (tensor_shape.TensorShape([4]),
-                       tensor_shape.TensorShape([5]),
-                       SoftmaxCentered(
-                           event_ndims=1, validate_args=True))):
-        self.assertAllEqual(y, b.forward_event_shape(x))
-        self.assertAllEqual(y.as_list(),
-                            b.forward_event_shape_tensor(x.as_list()).eval())
-        self.assertAllEqual(x, b.inverse_event_shape(y))
-        self.assertAllEqual(x.as_list(),
-                            b.inverse_event_shape_tensor(y.as_list()).eval())
+      x = tensor_shape.TensorShape([4])
+      y = tensor_shape.TensorShape([5])
+      bijector = SoftmaxCentered(validate_args=True)
+      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y.as_list(),
+                          bijector.forward_event_shape_tensor(
+                              x.as_list()).eval())
+      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+      self.assertAllEqual(x.as_list(),
+                          bijector.inverse_event_shape_tensor(
+                              y.as_list()).eval())
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      softmax = SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered()
       x = np.linspace(-50, 50, num=10).reshape(5, 2).astype(np.float32)
       # Make y values on the simplex with a wide range.
       y_0 = np.ones(5).astype(np.float32)
@@ -104,7 +104,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
       y = np.array([y_0, y_1, y_2])
       y /= y.sum(axis=0)
       y = y.T  # y.shape = [5, 3]
-      assert_bijective_and_finite(softmax, x, y)
+      assert_bijective_and_finite(softmax, x, y, event_ndims=1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
index d9af9aec50d3d69bb10f69f2ffd6ca3a24c316f8..3d8a0a32bba3539f732140e8eb7ebeb532d73ff5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
@@ -43,13 +43,13 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testHingeSoftnessZeroRaises(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=0., validate_args=True)
+      bijector = Softplus(hinge_softness=0., validate_args=True)
       with self.assertRaisesOpError("must be non-zero"):
         bijector.forward([1., 1.]).eval()
 
   def testBijectorForwardInverseEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -59,7 +59,7 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorForwardInverseWithHingeSoftnessEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.5)
+      bijector = Softplus(hinge_softness=1.5)
       x = 2 * rng.randn(2, 10)
       y = 1.5 * self._softplus(x / 1.5)
 
@@ -68,16 +68,17 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       # No reduction needed if event_dims = 0.
       ildj = self._softplus_ildj_before_reduction(y)
 
-      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=0).eval())
 
   def testBijectorForwardInverseEventDimsOne(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=1)
+      bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -87,58 +88,59 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsOne(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=1)
+      bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       ildj_before = self._softplus_ildj_before_reduction(y)
       ildj = np.sum(ildj_before, axis=1)
 
-      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithPositiveHingeSoftness(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.3)
+      bijector = Softplus(hinge_softness=1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithNegativeHingeSoftness(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=-1.3)
+      bijector = Softplus(hinge_softness=-1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testBijectiveAndFinite32bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithPositiveHingeSoftness32Bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.23)
+      bijector = Softplus(hinge_softness=1.23)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithNegativeHingeSoftness32Bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=-0.7)
+      bijector = Softplus(hinge_softness=-0.7)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = -np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFinite16bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       # softplus(-20) is zero, so we can't use such a large range as in 32bit.
       x = np.linspace(-10., 20., 100).astype(np.float16)
       # Note that float16 is only in the open set (0, inf) for a smaller
@@ -146,7 +148,7 @@ class SoftplusBijectorTest(test.TestCase):
       # for the test.
       y = np.logspace(-6, 3, 100).astype(np.float16)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-1, atol=1e-3)
+          bijector, x, y, event_ndims=0, rtol=1e-1, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ac06fce55b448a5f3da7ccb7f8766b5b1404ad7
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
@@ -0,0 +1,111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.softsign import Softsign
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+
+class SoftsignBijectorTest(test.TestCase):
+  """Tests the correctness of the Y = g(X) = X / (1 + |X|) transformation."""
+
+  def _softsign(self, x):
+    return x / (1. + np.abs(x))
+
+  def _softsign_ildj_before_reduction(self, y):
+    """Inverse log det jacobian, before being reduced."""
+    return -2. * np.log1p(-np.abs(y))
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorBounds(self):
+    bijector = Softsign(validate_args=True)
+    with self.test_session():
+      with self.assertRaisesOpError("greater than -1"):
+        bijector.inverse(-3.).eval()
+      with self.assertRaisesOpError("greater than -1"):
+        bijector.inverse_log_det_jacobian(-3., event_ndims=0).eval()
+
+      with self.assertRaisesOpError("less than 1"):
+        bijector.inverse(3.).eval()
+      with self.assertRaisesOpError("less than 1"):
+        bijector.inverse_log_det_jacobian(3., event_ndims=0).eval()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorForwardInverse(self):
+    bijector = Softsign(validate_args=True)
+    self.assertEqual("softsign", bijector.name)
+    x = 2. * self._rng.randn(2, 10)
+    y = self._softsign(x)
+
+    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
+    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorLogDetJacobianEventDimsZero(self):
+    bijector = Softsign(validate_args=True)
+    y = self._rng.rand(2, 10)
+    # No reduction needed if event_dims = 0.
+    ildj = self._softsign_ildj_before_reduction(y)
+
+    self.assertAllClose(ildj, self.evaluate(
+        bijector.inverse_log_det_jacobian(y, event_ndims=0)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorForwardInverseEventDimsOne(self):
+    bijector = Softsign(validate_args=True)
+    self.assertEqual("softsign", bijector.name)
+    x = 2. * self._rng.randn(2, 10)
+    y = self._softsign(x)
+    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
+    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorLogDetJacobianEventDimsOne(self):
+    bijector = Softsign(validate_args=True)
+    y = self._rng.rand(2, 10)
+    ildj_before = self._softsign_ildj_before_reduction(y)
+    ildj = np.sum(ildj_before, axis=1)
+    self.assertAllClose(
+        ildj, self.evaluate(
+            bijector.inverse_log_det_jacobian(y, event_ndims=1)))
+
+  def testScalarCongruency(self):
+    with self.test_session():
+      bijector = Softsign(validate_args=True)
+      assert_scalar_congruency(bijector, lower_x=-20., upper_x=20.)
+
+  def testBijectiveAndFinite(self):
+    with self.test_session():
+      bijector = Softsign(validate_args=True)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = np.linspace(-0.99, 0.99, 100).astype(np.float32)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=0, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..30c7a738c320b609ce90685512e6b8344dffc9dc
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
@@ -0,0 +1,59 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+
+class SquareBijectorTest(test.TestCase):
+  """Tests the correctness of the Y = X ** 2 transformation."""
+
+  def testBijectorScalar(self):
+    with self.test_session():
+      bijector = bijectors.Square(validate_args=True)
+      self.assertEqual("square", bijector.name)
+      x = [[[1., 5],
+            [2, 1]],
+           [[np.sqrt(2.), 3],
+            [np.sqrt(8.), 1]]]
+      y = np.square(x)
+      ildj = -np.log(2.) - np.log(x)
+      self.assertAllClose(y, bijector.forward(x).eval())
+      self.assertAllClose(x, bijector.inverse(y).eval())
+      self.assertAllClose(
+          ildj, bijector.inverse_log_det_jacobian(
+              y, event_ndims=0).eval(), atol=0., rtol=1e-7)
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
+          atol=0.,
+          rtol=1e-7)
+
+  def testScalarCongruency(self):
+    with self.test_session():
+      bijector = bijectors.Square(validate_args=True)
+      assert_scalar_congruency(bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
index 7a31228d1ade55ce32b511dca073657d3bab53ae..f57adcda898a1fdb18aacbb0804411db1bb4e4c8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
@@ -36,7 +36,7 @@ class WeibullBijectorTest(test.TestCase):
       concentration = 0.3
       bijector = Weibull(
           scale=scale, concentration=concentration,
-          event_ndims=1, validate_args=True)
+          validate_args=True)
       self.assertEqual("weibull", bijector.name)
       x = np.array([[[0.], [1.], [14.], [20.], [100.]]], dtype=np.float32)
       # Weibull distribution
@@ -45,13 +45,11 @@ class WeibullBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          np.squeeze(weibull_dist.logpdf(x), axis=2),
-          bijector.forward_log_det_jacobian(x).eval())
+          weibull_dist.logpdf(x),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -64,12 +62,12 @@ class WeibullBijectorTest(test.TestCase):
   def testBijectiveAndFinite(self):
     with self.test_session():
       bijector = Weibull(
-          scale=20., concentration=2., event_ndims=0, validate_args=True)
+          scale=20., concentration=2., validate_args=True)
       x = np.linspace(1., 8., num=10).astype(np.float32)
       y = np.linspace(
           -np.expm1(-1 / 400.),
           -np.expm1(-16), num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
index 545471907f1eabc822b3d28ea9c57e183a09ff50..4e8989b6c2f93560b1fccbc99491d7809f494263 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
@@ -44,6 +44,7 @@ class _ChooseLocation(ConditionalBijector):
           graph_parents=[self._loc],
           is_constant_jacobian=True,
           validate_args=False,
+          forward_min_event_ndims=0,
           name=name)
 
   def _forward(self, x, z):
@@ -52,7 +53,7 @@ class _ChooseLocation(ConditionalBijector):
   def _inverse(self, x, z):
     return x - self._gather_loc(z)
 
-  def _inverse_log_det_jacobian(self, x, z=None):
+  def _inverse_log_det_jacobian(self, x, event_ndims, z=None):
     return 0.
 
   def _gather_loc(self, z):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
index 507ceb35853ebe0a996d789b3bdf8a5f2284549c..f42feae25d851eb9ae0bf48649fc3bbe2a221be0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
@@ -16,6 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib import distributions
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -25,23 +27,23 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
-ds = distributions
+tfd = distributions
 
 
 class DistributionTest(test.TestCase):
 
   def testParamShapesAndFromParams(self):
     classes = [
-        ds.Normal,
-        ds.Bernoulli,
-        ds.Beta,
-        ds.Chi2,
-        ds.Exponential,
-        ds.Gamma,
-        ds.InverseGamma,
-        ds.Laplace,
-        ds.StudentT,
-        ds.Uniform,
+        tfd.Normal,
+        tfd.Bernoulli,
+        tfd.Beta,
+        tfd.Chi2,
+        tfd.Exponential,
+        tfd.Gamma,
+        tfd.InverseGamma,
+        tfd.Laplace,
+        tfd.StudentT,
+        tfd.Uniform,
     ]
 
     sample_shapes = [(), (10,), (10, 20, 30)]
@@ -63,15 +65,15 @@ class DistributionTest(test.TestCase):
     with self.test_session():
       # Note: we cannot easily test all distributions since each requires
       # different initialization arguments. We therefore spot test a few.
-      normal = ds.Normal(loc=1., scale=2., validate_args=True)
+      normal = tfd.Normal(loc=1., scale=2., validate_args=True)
       self.assertEqual(normal.parameters, normal.copy().parameters)
-      wishart = ds.WishartFull(df=2, scale=[[1., 2], [2, 5]],
-                               validate_args=True)
+      wishart = tfd.WishartFull(df=2, scale=[[1., 2], [2, 5]],
+                                validate_args=True)
       self.assertEqual(wishart.parameters, wishart.copy().parameters)
 
   def testCopyOverride(self):
     with self.test_session():
-      normal = ds.Normal(loc=1., scale=2., validate_args=True)
+      normal = tfd.Normal(loc=1., scale=2., validate_args=True)
       unused_normal_copy = normal.copy(validate_args=False)
       base_params = normal.parameters.copy()
       copy_params = normal.copy(validate_args=False).parameters.copy()
@@ -84,19 +86,19 @@ class DistributionTest(test.TestCase):
       mu = 1.
       sigma = 2.
 
-      normal = ds.Normal(mu, sigma, validate_args=True)
+      normal = tfd.Normal(mu, sigma, validate_args=True)
       self.assertTrue(tensor_util.constant_value(normal.is_scalar_event()))
       self.assertTrue(tensor_util.constant_value(normal.is_scalar_batch()))
 
-      normal = ds.Normal([mu], [sigma], validate_args=True)
+      normal = tfd.Normal([mu], [sigma], validate_args=True)
       self.assertTrue(tensor_util.constant_value(normal.is_scalar_event()))
       self.assertFalse(tensor_util.constant_value(normal.is_scalar_batch()))
 
-      mvn = ds.MultivariateNormalDiag([mu], [sigma], validate_args=True)
+      mvn = tfd.MultivariateNormalDiag([mu], [sigma], validate_args=True)
       self.assertFalse(tensor_util.constant_value(mvn.is_scalar_event()))
       self.assertTrue(tensor_util.constant_value(mvn.is_scalar_batch()))
 
-      mvn = ds.MultivariateNormalDiag([[mu]], [[sigma]], validate_args=True)
+      mvn = tfd.MultivariateNormalDiag([[mu]], [[sigma]], validate_args=True)
       self.assertFalse(tensor_util.constant_value(mvn.is_scalar_event()))
       self.assertFalse(tensor_util.constant_value(mvn.is_scalar_batch()))
 
@@ -126,7 +128,7 @@ class DistributionTest(test.TestCase):
       self.assertFalse(is_scalar.eval(feed_dict={x: [1]}))
 
   def _GetFakeDistribution(self):
-    class FakeDistribution(ds.Distribution):
+    class FakeDistribution(tfd.Distribution):
       """Fake Distribution for testing _set_sample_static_shape."""
 
       def __init__(self, batch_shape=None, event_shape=None):
@@ -188,6 +190,124 @@ class DistributionTest(test.TestCase):
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertTrue(y.get_shape().ndims is None)
 
+  def testNameScopeWorksCorrectly(self):
+    x = tfd.Normal(loc=0., scale=1., name="x")
+    x_duplicate = tfd.Normal(loc=0., scale=1., name="x")
+    with ops.name_scope("y") as name:
+      y = tfd.Bernoulli(logits=0., name=name)
+    x_sample = x.sample(name="custom_sample")
+    x_sample_duplicate = x.sample(name="custom_sample")
+    x_log_prob = x.log_prob(0., name="custom_log_prob")
+    x_duplicate_sample = x_duplicate.sample(name="custom_sample")
+
+    self.assertEqual(x.name, "x/")
+    self.assertEqual(x_duplicate.name, "x_1/")
+    self.assertEqual(y.name, "y/")
+    self.assertTrue(x_sample.name.startswith("x/custom_sample"))
+    self.assertTrue(x_sample_duplicate.name.startswith("x/custom_sample_1"))
+    self.assertTrue(x_log_prob.name.startswith("x/custom_log_prob"))
+    self.assertTrue(x_duplicate_sample.name.startswith(
+        "x_1/custom_sample"))
+
+  def testStrWorksCorrectlyScalar(self):
+    normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
+    self.assertEqual(
+        ("tf.distributions.Normal("
+         "\"Normal/\", "
+         "batch_shape=(), "
+         "event_shape=(), "
+         "dtype=float16)"),  # Got the dtype right.
+        str(normal))
+
+    chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
+    self.assertEqual(
+        ("tf.distributions.Chi2("
+         "\"silly/\", "  # What a silly name that is!
+         "batch_shape=(2,), "
+         "event_shape=(), "
+         "dtype=float32)"),
+        str(chi2))
+
+    exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
+    self.assertEqual(
+        ("tf.distributions.Exponential(\"Exponential/\", "
+         # No batch shape.
+         "event_shape=(), "
+         "dtype=float32)"),
+        str(exp))
+
+  def testStrWorksCorrectlyMultivariate(self):
+    mvn_static = tfd.MultivariateNormalDiag(
+        loc=np.zeros([2, 2]), name="MVN")
+    self.assertEqual(
+        ("tf.distributions.MultivariateNormalDiag("
+         "\"MVN/\", "
+         "batch_shape=(2,), "
+         "event_shape=(2,), "
+         "dtype=float64)"),
+        str(mvn_static))
+
+    mvn_dynamic = tfd.MultivariateNormalDiag(
+        loc=array_ops.placeholder(shape=[None, 3], dtype=dtypes.float32),
+        name="MVN2")
+    self.assertEqual(
+        ("tf.distributions.MultivariateNormalDiag("
+         "\"MVN2/\", "
+         "batch_shape=(?,), "  # Partially known.
+         "event_shape=(3,), "
+         "dtype=float32)"),
+        str(mvn_dynamic))
+
+  def testReprWorksCorrectlyScalar(self):
+    normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
+    self.assertEqual(
+        ("<tf.distributions.Normal"
+         " 'Normal/'"
+         " batch_shape=()"
+         " event_shape=()"
+         " dtype=float16>"),  # Got the dtype right.
+        repr(normal))
+
+    chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
+    self.assertEqual(
+        ("<tf.distributions.Chi2"
+         " 'silly/'"  # What a silly name that is!
+         " batch_shape=(2,)"
+         " event_shape=()"
+         " dtype=float32>"),
+        repr(chi2))
+
+    exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
+    self.assertEqual(
+        ("<tf.distributions.Exponential"
+         " 'Exponential/'"
+         " batch_shape=<unknown>"
+         " event_shape=()"
+         " dtype=float32>"),
+        repr(exp))
+
+  def testReprWorksCorrectlyMultivariate(self):
+    mvn_static = tfd.MultivariateNormalDiag(
+        loc=np.zeros([2, 2]), name="MVN")
+    self.assertEqual(
+        ("<tf.distributions.MultivariateNormalDiag"
+         " 'MVN/'"
+         " batch_shape=(2,)"
+         " event_shape=(2,)"
+         " dtype=float64>"),
+        repr(mvn_static))
+
+    mvn_dynamic = tfd.MultivariateNormalDiag(
+        loc=array_ops.placeholder(shape=[None, 3], dtype=dtypes.float32),
+        name="MVN2")
+    self.assertEqual(
+        ("<tf.distributions.MultivariateNormalDiag"
+         " 'MVN2/'"
+         " batch_shape=(?,)"  # Partially known.
+         " event_shape=(3,)"
+         " dtype=float32>"),
+        repr(mvn_dynamic))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py b/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py
index 06318ca09dec851cf025fa35c83732b85824cbee..6a69f9e60b99a17c657f074597a075890265a93b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import bernoulli as bernoulli_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
@@ -126,6 +127,100 @@ class ProductDistributionTest(test.TestCase):
       self.assertAllClose(sample_entropy_, actual_entropy_, rtol=0.01, atol=0.)
       self.assertAllClose(loc, actual_mode_, rtol=1e-6, atol=0.)
 
+  def testKLRaises(self):
+    ind1 = independent_lib.Independent(
+        distribution=normal_lib.Normal(
+            loc=np.float32([-1., 1]),
+            scale=np.float32([0.1, 0.5])),
+        reinterpreted_batch_ndims=1)
+    ind2 = independent_lib.Independent(
+        distribution=normal_lib.Normal(
+            loc=np.float32(-1),
+            scale=np.float32(0.5)),
+        reinterpreted_batch_ndims=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Event shapes do not match"):
+      kullback_leibler.kl_divergence(ind1, ind2)
+
+    ind1 = independent_lib.Independent(
+        distribution=normal_lib.Normal(
+            loc=np.float32([-1., 1]),
+            scale=np.float32([0.1, 0.5])),
+        reinterpreted_batch_ndims=1)
+    ind2 = independent_lib.Independent(
+        distribution=mvn_diag_lib.MultivariateNormalDiag(
+            loc=np.float32([-1., 1]),
+            scale_diag=np.float32([0.1, 0.5])),
+        reinterpreted_batch_ndims=0)
+
+    with self.assertRaisesRegexp(
+        NotImplementedError, "different event shapes"):
+      kullback_leibler.kl_divergence(ind1, ind2)
+
+  def testKLScalarToMultivariate(self):
+    normal1 = normal_lib.Normal(
+        loc=np.float32([-1., 1]),
+        scale=np.float32([0.1, 0.5]))
+    ind1 = independent_lib.Independent(
+        distribution=normal1, reinterpreted_batch_ndims=1)
+
+    normal2 = normal_lib.Normal(
+        loc=np.float32([-3., 3]),
+        scale=np.float32([0.3, 0.3]))
+    ind2 = independent_lib.Independent(
+        distribution=normal2, reinterpreted_batch_ndims=1)
+
+    normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
+    ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
+    self.assertAllClose(
+        self.evaluate(math_ops.reduce_sum(normal_kl, axis=-1)),
+        self.evaluate(ind_kl))
+
+  def testKLIdentity(self):
+    normal1 = normal_lib.Normal(
+        loc=np.float32([-1., 1]),
+        scale=np.float32([0.1, 0.5]))
+    # This is functionally just a wrapper around normal1,
+    # and doesn't change any outputs.
+    ind1 = independent_lib.Independent(
+        distribution=normal1, reinterpreted_batch_ndims=0)
+
+    normal2 = normal_lib.Normal(
+        loc=np.float32([-3., 3]),
+        scale=np.float32([0.3, 0.3]))
+    # This is functionally just a wrapper around normal2,
+    # and doesn't change any outputs.
+    ind2 = independent_lib.Independent(
+        distribution=normal2, reinterpreted_batch_ndims=0)
+
+    normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
+    ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
+    self.assertAllClose(
+        self.evaluate(normal_kl), self.evaluate(ind_kl))
+
+  def testKLMultivariateToMultivariate(self):
+    # (1, 1, 2) batch of MVNDiag
+    mvn1 = mvn_diag_lib.MultivariateNormalDiag(
+        loc=np.float32([[[[-1., 1, 3.], [2., 4., 3.]]]]),
+        scale_diag=np.float32([[[0.2, 0.1, 5.], [2., 3., 4.]]]))
+    ind1 = independent_lib.Independent(
+        distribution=mvn1, reinterpreted_batch_ndims=2)
+
+    # (1, 1, 2) batch of MVNDiag
+    mvn2 = mvn_diag_lib.MultivariateNormalDiag(
+        loc=np.float32([[[[-2., 3, 2.], [1., 3., 2.]]]]),
+        scale_diag=np.float32([[[0.1, 0.5, 3.], [1., 2., 1.]]]))
+
+    ind2 = independent_lib.Independent(
+        distribution=mvn2, reinterpreted_batch_ndims=2)
+
+    mvn_kl = kullback_leibler.kl_divergence(mvn1, mvn2)
+    ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
+    self.assertAllClose(
+        self.evaluate(math_ops.reduce_sum(mvn_kl, axis=[-1, -2])),
+        self.evaluate(ind_kl))
+
   def _testMnistLike(self, static_shape):
     sample_shape = [4, 5]
     batch_shape = [10]
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
index ea3c86b5c0f42b64fc6e4e362cbcc162bccf74a2..2980e2bfe93b2e2aa01d38fc9fa4650a015efc06 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
@@ -130,10 +130,8 @@ class KumaraswamyTest(test.TestCase):
       dist.prob([.1, .3, .6]).eval()
       dist.prob([.2, .3, .5]).eval()
       # Either condition can trigger.
-      with self.assertRaisesOpError("sample must be positive"):
+      with self.assertRaisesOpError("sample must be non-negative"):
         dist.prob([-1., 0.1, 0.5]).eval()
-      with self.assertRaisesOpError("sample must be positive"):
-        dist.prob([0., 0.1, 0.5]).eval()
       with self.assertRaisesOpError("sample must be no larger than `1`"):
         dist.prob([.1, .2, 1.2]).eval()
 
@@ -249,13 +247,13 @@ class KumaraswamyTest(test.TestCase):
       a = np.array([1., 2, 3])
       b = np.array([2., 4, 1.2])
       dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
-      with self.assertRaisesOpError("Condition x < y.*"):
+      with self.assertRaisesOpError("Mode undefined for concentration1 <= 1."):
         dist.mode().eval()
 
       a = np.array([2., 2, 3])
       b = np.array([1., 4, 1.2])
       dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
-      with self.assertRaisesOpError("Condition x < y.*"):
+      with self.assertRaisesOpError("Mode undefined for concentration0 <= 1."):
         dist.mode().eval()
 
   def testKumaraswamyModeEnableAllowNanStats(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index 933756aa8e12cca4c42eb98d9193512bbf2ad585..9635134b08db47a47a17c869fe813e0376ae6f1e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -68,7 +68,7 @@ class MultivariateNormalDiagTest(test.TestCase):
       dist = ds.TransformedDistribution(
           base_dist,
           validate_args=True,
-          bijector=bijectors.Softplus(event_ndims=1))
+          bijector=bijectors.Softplus())
       samps = dist.sample(5)  # Shape [5, 1, 3].
       self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
index 1a02fbefb8e88599f5fedeb38fb06f5a09036439..7435bcbc684c1660a648cef4ab30c888723853f8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -52,7 +52,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       mu = [1., 2.]
       sigma = [[1., 0.], [0., 1.]]
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, name="Billy")
-      self.assertEqual(mvn.name, "Billy")
+      self.assertEqual(mvn.name, "Billy/")
 
   def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py b/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
index d9c9008417cdb20b62390630cf887d3bd888a0d3..19a7472d91758a2dbd00c4d918853d7bae33685d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from scipy import special
 from scipy import stats
 from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
 from tensorflow.python.framework import constant_op
@@ -110,7 +111,7 @@ class PoissonTest(test.TestCase):
       batch_size = 6
       lam = constant_op.constant([3.0] * batch_size)
       lam_v = 3.0
-      x = [2.2, 3.1, 4., 5.5, 6., 7.]
+      x = [2., 3., 4., 5., 6., 7.]
 
       poisson = self._make_poisson(rate=lam)
       log_cdf = poisson.log_cdf(x)
@@ -121,12 +122,31 @@ class PoissonTest(test.TestCase):
       self.assertEqual(cdf.get_shape(), (6,))
       self.assertAllClose(cdf.eval(), stats.poisson.cdf(x, lam_v))
 
+  def testPoissonCDFNonIntegerValues(self):
+    with self.test_session():
+      batch_size = 6
+      lam = constant_op.constant([3.0] * batch_size)
+      lam_v = 3.0
+      x = np.array([2.2, 3.1, 4., 5.5, 6., 7.], dtype=np.float32)
+
+      poisson = self._make_poisson(rate=lam)
+      cdf = poisson.cdf(x)
+      self.assertEqual(cdf.get_shape(), (6,))
+
+      # The Poisson CDF should be valid on these non-integer values, and
+      # equal to igammac(1 + x, rate).
+      self.assertAllClose(cdf.eval(), special.gammaincc(1. + x, lam_v))
+
+      with self.assertRaisesOpError("cannot contain fractional components"):
+        poisson_validate = self._make_poisson(rate=lam, validate_args=True)
+        poisson_validate.cdf(x).eval()
+
   def testPoissonCdfMultidimensional(self):
     with self.test_session():
       batch_size = 6
       lam = constant_op.constant([[2.0, 4.0, 5.0]] * batch_size)
       lam_v = [2.0, 4.0, 5.0]
-      x = np.array([[2.2, 3.1, 4., 5.5, 6., 7.]], dtype=np.float32).T
+      x = np.array([[2., 3., 4., 5., 6., 7.]], dtype=np.float32).T
 
       poisson = self._make_poisson(rate=lam)
       log_cdf = poisson.log_cdf(x)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
index 4186cf129dbf31724c84133734da3f226817c71a..ea04e8c29a2c94d4939bad277afa380401067ff2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops import sample_stats
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
@@ -455,6 +456,16 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
       with self.assertRaisesOpError("rank"):
         pct.eval(feed_dict={q_ph: [0.5]})
 
+  def test_finds_max_of_long_array(self):
+    # d - 1 == d in float32 and d = 3e7.
+    # So this test only passes if we use double for the percentile indices.
+    # If float is used, it fails with InvalidArgumentError about an index out of
+    # bounds.
+    x = math_ops.linspace(0., 3e7, num=int(3e7))
+    with self.test_session():
+      minval = sample_stats.percentile(x, q=0, validate_args=True)
+      self.assertAllEqual(0, minval.eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..968057331787059240110b90545f70c0ab128aa8
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the SeedStream class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import seed_stream
+from tensorflow.python.platform import test
+
+
+class SeedStreamTest(test.TestCase):
+
+  def assertAllUnique(self, items):
+    self.assertEqual(len(items), len(set(items)))
+
+  def testNonRepetition(self):
+    # The probability of repetitions in a short stream from a correct
+    # PRNG is negligible; this test catches bugs that prevent state
+    # updates.
+    strm = seed_stream.SeedStream(seed=4, salt="salt")
+    output = [strm() for _ in range(50)]
+    self.assertEqual(sorted(output), sorted(list(set(output))))
+
+  def testReproducibility(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm3 = seed_stream.SeedStream(seed=4, salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertEqual(outputs, [strm2() for _ in range(50)])
+    self.assertEqual(outputs, [strm3() for _ in range(50)])
+
+  def testSeededDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=5, salt="salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testSaltedDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="another salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testNestingRobustness(self):
+    # SeedStreams started from generated seeds should not collide with
+    # the master or with each other, even if the salts are the same.
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(strm1(), salt="salt")
+    strm3 = seed_stream.SeedStream(strm1(), salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertAllUnique(
+        outputs + [strm2() for _ in range(50)] + [strm3() for _ in range(50)])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
index c8d795c3f6afbec5b41755951174439f7703efb9..243b5a034859288b0e2e120f09258cfee77fbdea 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
@@ -584,7 +584,6 @@ class DistributionShapeTest(test.TestCase):
 
   def testDistributionShapeGetDimsStatic(self):
     with self.test_session():
-      shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       x = 1
       self.assertAllEqual((_empty_shape, _empty_shape, _empty_shape),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce6cf702d522792f1ad26066a3d9be42003a0e3c
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
@@ -0,0 +1,243 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the statistical testing library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import statistical_testing as st
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class StatisticalTestingTest(test.TestCase):
+
+  def test_dkwm_design_mean_one_sample_soundness(self):
+    thresholds = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
+    rates = [1e-6, 1e-3, 1e-2, 1.1e-1, 0.2, 0.5, 0.7, 1.]
+    false_fail_rates, false_pass_rates = np.meshgrid(rates, rates)
+    false_fail_rates = false_fail_rates.flatten().astype(np.float32)
+    false_pass_rates = false_pass_rates.flatten().astype(np.float32)
+
+    detectable_discrepancies = []
+    for false_pass_rate, false_fail_rate in zip(
+        false_pass_rates, false_fail_rates):
+      sufficient_n = st.min_num_samples_for_dkwm_mean_test(
+          thresholds, low=0., high=1., false_fail_rate=false_fail_rate,
+          false_pass_rate=false_pass_rate)
+      detectable_discrepancies.append(
+          st.min_discrepancy_of_true_means_detectable_by_dkwm(
+              sufficient_n, low=0., high=1., false_fail_rate=false_fail_rate,
+              false_pass_rate=false_pass_rate))
+
+    detectable_discrepancies_ = self.evaluate(detectable_discrepancies)
+    for discrepancies, false_pass_rate, false_fail_rate in zip(
+        detectable_discrepancies_, false_pass_rates, false_fail_rates):
+      below_threshold = discrepancies <= thresholds
+      self.assertAllEqual(
+          np.ones_like(below_threshold, np.bool), below_threshold,
+          msg='false_pass_rate({}), false_fail_rate({})'.format(
+              false_pass_rate, false_fail_rate))
+
+  def test_dkwm_design_mean_two_sample_soundness(self):
+    thresholds = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
+    rates = [1e-6, 1e-3, 1e-2, 1.1e-1, 0.2, 0.5, 0.7, 1.]
+    false_fail_rates, false_pass_rates = np.meshgrid(rates, rates)
+    false_fail_rates = false_fail_rates.flatten().astype(np.float32)
+    false_pass_rates = false_pass_rates.flatten().astype(np.float32)
+
+    detectable_discrepancies = []
+    for false_pass_rate, false_fail_rate in zip(
+        false_pass_rates, false_fail_rates):
+      [
+          sufficient_n1,
+          sufficient_n2
+      ] = st.min_num_samples_for_dkwm_mean_two_sample_test(
+          thresholds, low1=0., high1=1., low2=0., high2=1.,
+          false_fail_rate=false_fail_rate,
+          false_pass_rate=false_pass_rate)
+
+      detectable_discrepancies.append(
+          st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
+              n1=sufficient_n1,
+              low1=0.,
+              high1=1.,
+              n2=sufficient_n2,
+              low2=0.,
+              high2=1.,
+              false_fail_rate=false_fail_rate,
+              false_pass_rate=false_pass_rate))
+
+    detectable_discrepancies_ = self.evaluate(detectable_discrepancies)
+    for discrepancies, false_pass_rate, false_fail_rate in zip(
+        detectable_discrepancies_, false_pass_rates, false_fail_rates):
+      below_threshold = discrepancies <= thresholds
+      self.assertAllEqual(
+          np.ones_like(below_threshold, np.bool), below_threshold,
+          msg='false_pass_rate({}), false_fail_rate({})'.format(
+              false_pass_rate, false_fail_rate))
+
+  def test_true_mean_confidence_interval_by_dkwm_one_sample(self):
+    rng = np.random.RandomState(seed=0)
+
+    num_samples = 5000
+    # 5000 samples is chosen to be enough to find discrepancies of
+    # size 0.1 or more with assurance 1e-6, as confirmed here:
+    with self.test_session() as sess:
+      d = st.min_discrepancy_of_true_means_detectable_by_dkwm(
+          num_samples, 0., 1., false_fail_rate=1e-6, false_pass_rate=1e-6)
+      d = sess.run(d)
+      self.assertLess(d, 0.1)
+
+    # Test that the confidence interval computed for the mean includes
+    # 0.5 and excludes 0.4 and 0.6.
+    with self.test_session() as sess:
+      samples = rng.uniform(size=num_samples).astype(np.float32)
+      (low, high) = st.true_mean_confidence_interval_by_dkwm(
+          samples, 0., 1., error_rate=1e-6)
+      low, high = sess.run([low, high])
+      self.assertGreater(low, 0.4)
+      self.assertLess(low, 0.5)
+      self.assertGreater(high, 0.5)
+      self.assertLess(high, 0.6)
+
+  def test_dkwm_mean_one_sample_assertion(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 5000
+
+    # Test that the test assertion agrees that the mean of the standard
+    # uniform distribution is 0.5.
+    samples = rng.uniform(size=num_samples).astype(np.float32)
+    with self.test_session() as sess:
+      sess.run(st.assert_true_mean_equal_by_dkwm(
+          samples, 0., 1., 0.5, false_fail_rate=1e-6))
+
+      # Test that the test assertion confirms that the mean of the
+      # standard uniform distribution is not 0.4.
+      with self.assertRaisesOpError("Mean confidence interval too high"):
+        sess.run(st.assert_true_mean_equal_by_dkwm(
+            samples, 0., 1., 0.4, false_fail_rate=1e-6))
+
+      # Test that the test assertion confirms that the mean of the
+      # standard uniform distribution is not 0.6.
+      with self.assertRaisesOpError("Mean confidence interval too low"):
+        sess.run(st.assert_true_mean_equal_by_dkwm(
+            samples, 0., 1., 0.6, false_fail_rate=1e-6))
+
+  def test_dkwm_mean_two_sample_assertion(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+
+    # 4000 samples is chosen to be enough to find discrepancies of
+    # size 0.2 or more with assurance 1e-6, as confirmed here:
+    with self.test_session() as sess:
+      d = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
+          num_samples, 0., 1., num_samples, 0., 1.,
+          false_fail_rate=1e-6, false_pass_rate=1e-6)
+      d = sess.run(d)
+      self.assertLess(d, 0.2)
+
+    # Test that the test assertion agrees that the standard
+    # uniform distribution has the same mean as itself.
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+    samples2 = rng.uniform(size=num_samples).astype(np.float32)
+    with self.test_session() as sess:
+      sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
+          samples1, 0., 1., samples2, 0., 1., false_fail_rate=1e-6))
+
+  def test_dkwm_mean_two_sample_assertion_beta_2_1_false(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+
+    # As established above, 4000 samples is enough to find discrepancies
+    # of size 0.2 or more with assurance 1e-6.
+
+    with self.test_session() as sess:
+      # Test that the test assertion confirms that the mean of the
+      # standard uniform distribution is different from the mean of beta(2, 1).
+      beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32)
+      with self.assertRaisesOpError("samples1 has a smaller mean"):
+        sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
+            samples1, 0., 1.,
+            beta_high_samples, 0., 1.,
+            false_fail_rate=1e-6))
+
+  def test_dkwm_mean_two_sample_assertion_beta_1_2_false(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+
+    # As established above, 4000 samples is enough to find discrepancies
+    # of size 0.2 or more with assurance 1e-6.
+
+    with self.test_session() as sess:
+      # Test that the test assertion confirms that the mean of the
+      # standard uniform distribution is different from the mean of beta(1, 2).
+      beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32)
+      with self.assertRaisesOpError("samples2 has a smaller mean"):
+        sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
+            samples1, 0., 1.,
+            beta_low_samples, 0., 1.,
+            false_fail_rate=1e-6))
+
+  def test_dkwm_argument_validity_checking(self):
+    rng = np.random.RandomState(seed=0)
+    samples = rng.uniform(
+        low=[0., 1.], high=[1., 2.], size=(2500, 1, 2)).astype(np.float32)
+
+    # Test that the test library complains if the given samples fall
+    # outside the purported bounds.
+    with self.test_session() as sess:
+      with self.assertRaisesOpError("maximum value exceeds expectations"):
+        sess.run(st.true_mean_confidence_interval_by_dkwm(
+            samples, [[0., 1.]], [[0.5, 1.5]], error_rate=0.5))
+      with self.assertRaisesOpError("minimum value falls below expectations"):
+        sess.run(st.true_mean_confidence_interval_by_dkwm(
+            samples, [[0.5, 1.5]], [[1., 2.]], error_rate=0.5))
+
+      # But doesn't complain if they don't.
+      op = st.true_mean_confidence_interval_by_dkwm(
+          samples, [[0., 1.]], [[1., 2.]], error_rate=0.5)
+      _ = sess.run(op)
+
+  def test_do_maximum_mean(self):
+    n = 117
+    envelope = 0.02  # > 2 / n, but < 3 / n
+    rng = np.random.RandomState(seed=8)
+    samples = rng.uniform(size=n).astype(np.float32)
+
+    # Compute the answer in TF using the code under test
+    with self.test_session() as sess:
+      envelope_t = ops.convert_to_tensor(envelope)
+      max_mean = st._do_maximum_mean(samples, envelope_t, 1)
+      max_mean = sess.run(max_mean)
+
+    # Compute the correct answer for this case in numpy.  In this
+    # example, `n` and `envelope` are such that `samples[2]` is the
+    # element that should be taken partially, regardless of the
+    # content of the `samples` array (see algorithm description in
+    # `../ops/statistical_testing.py`).
+    samples = sorted(samples)
+    weight = 1. / n - (envelope - 2. / n)
+    answer = samples[2] * weight + sum(samples[3:]) / n + envelope * 1.
+    self.assertAllClose(max_mean, answer, rtol=1e-9)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index cbaf74d3f66253ae5727e1ba579e2d49235b748e..5fe1331d2c34612e980c7b376367cd63b627533d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -36,6 +37,35 @@ ds = distributions
 la = linalg
 
 
+class DummyMatrixTransform(bs.Bijector):
+  """Tractable matrix transformation.
+
+  This is a non-sensical bijector that has forward/inverse_min_event_ndims=2.
+  The main use is to check that transformed distribution calculations are done
+  appropriately.
+  """
+
+  def __init__(self):
+    super(DummyMatrixTransform, self).__init__(
+        forward_min_event_ndims=2,
+        is_constant_jacobian=False,
+        validate_args=False,
+        name="dummy")
+
+  def _forward(self, x):
+    return x
+
+  def _inverse(self, y):
+    return y
+
+  # Note: These jacobians don't make sense.
+  def _forward_log_det_jacobian(self, x):
+    return -linalg_ops.matrix_determinant(x)
+
+  def _inverse_log_det_jacobian(self, x):
+    return linalg_ops.matrix_determinant(x)
+
+
 class TransformedDistributionTest(test.TestCase):
 
   def _cls(self):
@@ -55,7 +85,7 @@ class TransformedDistributionTest(test.TestCase):
       # you may or may not need a reduce_sum.
       log_normal = self._cls()(
           distribution=ds.Normal(loc=mu, scale=sigma),
-          bijector=bs.Exp(event_ndims=0))
+          bijector=bs.Exp())
       sp_dist = stats.lognorm(s=sigma, scale=np.exp(mu))
 
       # sample
@@ -87,7 +117,7 @@ class TransformedDistributionTest(test.TestCase):
       sigma = 2.0
       abs_normal = self._cls()(
           distribution=ds.Normal(loc=mu, scale=sigma),
-          bijector=bs.AbsoluteValue(event_ndims=0))
+          bijector=bs.AbsoluteValue())
       sp_normal = stats.norm(mu, sigma)
 
       # sample
@@ -129,7 +159,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(grid, cdf_, rtol=1e-6, atol=0.)
 
   def testCachedSamples(self):
-    exp_forward_only = bs.Exp(event_ndims=0)
+    exp_forward_only = bs.Exp()
     exp_forward_only._inverse = self._make_unimplemented(
         "inverse")
     exp_forward_only._inverse_event_shape_tensor = self._make_unimplemented(
@@ -153,7 +183,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(expected_log_pdf, log_pdf_val, rtol=1e-4, atol=0.)
 
   def testCachedSamplesInvert(self):
-    exp_inverse_only = bs.Exp(event_ndims=0)
+    exp_inverse_only = bs.Exp()
     exp_inverse_only._forward = self._make_unimplemented(
         "forward")
     exp_inverse_only._forward_event_shape_tensor = self._make_unimplemented(
@@ -186,12 +216,14 @@ class TransformedDistributionTest(test.TestCase):
       standard_normal = ds.Normal(loc=0., scale=1.)
       multi_logit_normal = self._cls()(
           distribution=standard_normal,
-          bijector=softmax)
-      x = [[-np.log(3.), 0.],
-           [np.log(3), np.log(5)]]
+          bijector=softmax,
+          event_shape=[1])
+      x = [[[-np.log(3.)], [0.]],
+           [[np.log(3)], [np.log(5)]]]
       y = softmax.forward(x).eval()
-      expected_log_pdf = (stats.norm(loc=0., scale=1.).logpdf(x) -
-                          np.sum(np.log(y), axis=-1))
+      expected_log_pdf = (
+          np.squeeze(stats.norm(loc=0., scale=1.).logpdf(x)) -
+          np.sum(np.log(y), axis=-1))
       self.assertAllClose(expected_log_pdf,
                           multi_logit_normal.log_prob(y).eval())
       self.assertAllClose(
@@ -208,8 +240,11 @@ class TransformedDistributionTest(test.TestCase):
       int_identity = bs.Inline(
           forward_fn=array_ops.identity,
           inverse_fn=array_ops.identity,
-          inverse_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
-          forward_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          inverse_log_det_jacobian_fn=(
+              lambda y: math_ops.cast(0, dtypes.int32)),
+          forward_log_det_jacobian_fn=(
+              lambda x: math_ops.cast(0, dtypes.int32)),
+          forward_min_event_ndims=0,
           is_constant_jacobian=True)
       normal = self._cls()(
           distribution=ds.Normal(loc=0., scale=1.),
@@ -245,9 +280,8 @@ class TransformedDistributionTest(test.TestCase):
     with self.test_session() as sess:
       exp2 = self._cls()(
           ds.Exponential(rate=0.25),
-          bijector=ds.bijectors.Affine(
-              scale_identity_multiplier=2.,
-              event_ndims=0))
+          bijector=ds.bijectors.AffineScalar(scale=2.)
+      )
       log_prob = exp2.log_prob(1.)
       log_prob_ = sess.run(log_prob)
       base_log_prob = -0.5 * 0.25 + np.log(0.25)
@@ -434,6 +468,82 @@ class ScalarToMultiTest(test.TestCase):
             event_shape=[3],
             validate_args=True)
 
+  def testMatrixEvent(self):
+    with self.test_session() as sess:
+      batch_shape = [2]
+      event_shape = [2, 3, 3]
+      batch_shape_pl = array_ops.placeholder(
+          dtypes.int32, name="dynamic_batch_shape")
+      event_shape_pl = array_ops.placeholder(
+          dtypes.int32, name="dynamic_event_shape")
+      feed_dict = {batch_shape_pl: np.array(batch_shape, dtype=np.int32),
+                   event_shape_pl: np.array(event_shape, dtype=np.int32)}
+
+      scale = 2.
+      loc = 0.
+      fake_mvn_dynamic = self._cls()(
+          distribution=ds.Normal(
+              loc=loc,
+              scale=scale),
+          bijector=DummyMatrixTransform(),
+          batch_shape=batch_shape_pl,
+          event_shape=event_shape_pl,
+          validate_args=True)
+
+      fake_mvn_static = self._cls()(
+          distribution=ds.Normal(
+              loc=loc,
+              scale=scale),
+          bijector=DummyMatrixTransform(),
+          batch_shape=batch_shape,
+          event_shape=event_shape,
+          validate_args=True)
+
+      def actual_mvn_log_prob(x):
+        # This distribution is the normal PDF, reduced over the
+        # last 3 dimensions + a jacobian term which corresponds
+        # to the determinant of x.
+        return (np.sum(
+            stats.norm(loc, scale).logpdf(x), axis=(-1, -2, -3)) +
+                np.sum(np.linalg.det(x), axis=-1))
+
+      self.assertAllEqual([2, 3, 3], fake_mvn_static.event_shape)
+      self.assertAllEqual([2], fake_mvn_static.batch_shape)
+
+      self.assertAllEqual(tensor_shape.TensorShape(None),
+                          fake_mvn_dynamic.event_shape)
+      self.assertAllEqual(tensor_shape.TensorShape(None),
+                          fake_mvn_dynamic.batch_shape)
+
+      num_samples = 5e3
+      for fake_mvn, feed_dict in ((fake_mvn_static, {}),
+                                  (fake_mvn_dynamic, feed_dict)):
+        # Ensure sample works by checking first, second moments.
+        y = fake_mvn.sample(int(num_samples), seed=0)
+        x = y[0:5, ...]
+        [
+            x_,
+            fake_event_shape_,
+            fake_batch_shape_,
+            fake_log_prob_,
+            fake_prob_,
+        ] = sess.run([
+            x,
+            fake_mvn.event_shape_tensor(),
+            fake_mvn.batch_shape_tensor(),
+            fake_mvn.log_prob(x),
+            fake_mvn.prob(x),
+        ], feed_dict=feed_dict)
+
+        # Ensure all other functions work as intended.
+        self.assertAllEqual([5, 2, 2, 3, 3], x_.shape)
+        self.assertAllEqual([2, 3, 3], fake_event_shape_)
+        self.assertAllEqual([2], fake_batch_shape_)
+        self.assertAllClose(actual_mvn_log_prob(x_), fake_log_prob_,
+                            atol=0., rtol=1e-6)
+        self.assertAllClose(np.exp(actual_mvn_log_prob(x_)), fake_prob_,
+                            atol=0., rtol=1e-5)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
index c355adeedbfff1072281a81de726ddb0ece07882..1226c66113ec4b43f57371abf4983aef1a529ec1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
@@ -61,7 +61,7 @@ class VectorLaplaceDiagTest(test.TestCase):
       dist = ds.TransformedDistribution(
           base_dist,
           validate_args=True,
-          bijector=bijectors.Softplus(event_ndims=1))
+          bijector=bijectors.Softplus())
       samps = dist.sample(5)  # Shape [5, 1, 3].
       self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index 9044aa2850ae35f29cd48b0c5f54aa948bea0408..dcecce981f16a2d9e772d4e40062ff250725c3ac 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -390,6 +390,26 @@ class WishartCholeskyTest(test.TestCase):
                 chol_scale, dtype=np.int32),
             validate_args=False)
 
+  def testSampleBroadcasts(self):
+    dims = 2
+    batch_shape = [2, 3]
+    sample_shape = [2, 1]
+    scale = np.float32([
+        [[1., 0.5],
+         [0.5, 1.]],
+        [[0.5, 0.25],
+         [0.25, 0.75]],
+    ])
+    scale = np.reshape(np.concatenate([scale, scale, scale], axis=0),
+                       batch_shape + [dims, dims])
+    wishart = distributions.WishartFull(df=5, scale=scale)
+    x = wishart.sample(sample_shape, seed=42)
+    with self.test_session() as sess:
+      x_ = sess.run(x)
+    expected_shape = sample_shape + batch_shape + [dims, dims]
+    self.assertAllEqual(expected_shape, x.shape)
+    self.assertAllEqual(expected_shape, x_.shape)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
index 852298bf334666db003353d5fc8e172ffb738668..88ed0127841093cc1a1168d988f14e7bb0277b12 100644
--- a/tensorflow/contrib/distributions/python/ops/autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -36,7 +36,8 @@ class Autoregressive(distribution_lib.Distribution):
     "Autoregressive models decompose the joint density as a product of
     conditionals, and model each conditional in turn. Normalizing flows
     transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
+    by an invertible transformation with tractable Jacobian." [(Papamakarios et
+    al., 2016)][1]
 
   In other words, the "autoregressive property" is equivalent to the
   decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
@@ -45,17 +46,18 @@ class Autoregressive(distribution_lib.Distribution):
 
   Practically speaking the autoregressive property means that there exists a
   permutation of the event coordinates such that each coordinate is a
-  diffeomorphic function of only preceding coordinates. [2]
+  diffeomorphic function of only preceding coordinates
+  [(van den Oord et al., 2016)][2].
 
   #### Mathematical Details
 
-  The probability function is,
+  The probability function is
 
   ```none
   prob(x; fn, n) = fn(x).prob(x)
   ```
 
-  And a sample is generated by,
+  And a sample is generated by
 
   ```none
   x = fn(...fn(fn(x0).sample()).sample()).sample()
@@ -93,13 +95,15 @@ class Autoregressive(distribution_lib.Distribution):
 
   ```
 
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  #### References
 
-  [2]: "Conditional Image Generation with PixelCNN Decoders."
-       Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt, Alex
-       Graves, Koray Kavukcuoglu. Arxiv, 2016.
+  [1]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
+
+  [2]: Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt,
+       Alex Graves, and Koray Kavukcuoglu. Conditional Image Generation with
+       PixelCNN Decoders. In _Neural Information Processing Systems_, 2016.
        https://arxiv.org/abs/1606.05328
   """
 
@@ -141,7 +145,7 @@ class Autoregressive(distribution_lib.Distribution):
       ValueError: if `num_steps < 1`.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._distribution_fn = distribution_fn
       self._sample0 = sample0
       self._distribution0 = (distribution_fn() if sample0 is None
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf5590cd552a915a3ecfc1912ee530baf79665a6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -0,0 +1,416 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The BatchReshape distribution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+
+
+__all__ = [
+    "BatchReshape",
+]
+
+
+class BatchReshape(distribution_lib.Distribution):
+  """The Batch-Reshaping distribution.
+
+  This "meta-distribution" reshapes the batch dimensions of another
+  distribution.
+
+  Note: Unlike `tf.reshape`, the `BatchReshape` distribution does not support
+  `-1` for flattening.
+
+  #### Examples
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  dtype = np.float32
+  dims = 2
+  new_batch_shape = [1, 2, 3]
+  old_batch_shape = [6]
+
+  scale = np.ones(old_batch_shape + [dims], dtype)
+  mvn = tfd.MultivariateNormalDiag(scale_diag=scale)
+  reshape_mvn = tfd.BatchReshape(
+      distribution=mvn,
+      batch_shape=new_batch_shape,
+      validate_args=True)
+
+  reshape_mvn.batch_shape
+  # ==> [1, 2, 3]
+
+  x = reshape_mvn.sample(sample_shape=[4, 5])
+  x.shape
+  # ==> [4, 5, 1, 2, 3, 2] == sample_shape + new_batch_shape + [dims]
+
+  reshape_mvn.log_prob(x).shape
+  # ==> [4, 5, 1, 2, 3] == sample_shape + new_batch_shape
+  ```
+
+  """
+
+  def __init__(self,
+               distribution,
+               batch_shape,
+               validate_args=False,
+               allow_nan_stats=True,
+               name=None):
+    """Construct BatchReshape distribution.
+
+    Args:
+      distribution: The base distribution instance to reshape. Typically an
+        instance of `Distribution`.
+      batch_shape: Positive `int`-like vector-shaped `Tensor` representing the
+        new shape of the batch dimensions.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: The name to give Ops created by the initializer.
+        Default value: `"BatchReshape" + distribution.name`.
+
+    Raises:
+      ValueError: if `batch_shape` is not a vector.
+      ValueError: if `batch_shape` has non-positive elements.
+      ValueError: if `batch_shape` size is not the same as a
+        `distribution.batch_shape` size.
+    """
+    parameters = locals()
+    name = name or "BatchReshape" + distribution.name
+    self._distribution = distribution
+    with ops.name_scope(name, values=[batch_shape]) as name:
+      self._batch_shape_ = ops.convert_to_tensor(
+          batch_shape,
+          dtype=dtypes.int32,
+          name="batch_shape")
+      self._batch_shape_static = tensor_util.constant_value(self._batch_shape_)
+      if self._batch_shape_static is not None:
+        self._batch_shape_static = np.int32(self._batch_shape_static)
+      self._runtime_assertions = validate_init_args(
+          self._distribution,
+          self._batch_shape_,
+          validate_args,
+          self._batch_shape_static)
+      super(BatchReshape, self).__init__(
+          dtype=self._distribution.dtype,
+          reparameterization_type=self._distribution.reparameterization_type,
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats,
+          parameters=parameters,
+          graph_parents=(
+              [self._batch_shape_] +
+              self._distribution._graph_parents),  # pylint: disable=protected-access
+          name=name)
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  def _batch_shape_tensor(self):
+    with ops.control_dependencies(self._runtime_assertions):
+      return array_ops.identity(self._batch_shape_)
+
+  def _batch_shape(self):
+    return tensor_shape.TensorShape(self._batch_shape_static)
+
+  def _event_shape_tensor(self):
+    with ops.control_dependencies(self._runtime_assertions):
+      return array_ops.identity(self.distribution.event_shape_tensor())
+
+  def _event_shape(self):
+    return self.distribution.event_shape
+
+  def _sample_n(self, n, seed=None):
+    with ops.control_dependencies(self._runtime_assertions):
+      x = self.distribution.sample(sample_shape=n, seed=seed)
+      new_shape = array_ops.concat([
+          [n],
+          self.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], axis=0)
+      return array_ops.reshape(x, new_shape)
+
+  def _log_prob(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_prob, x)
+
+  def _prob(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.prob, x)
+
+  def _log_cdf(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_cdf, x)
+
+  def _cdf(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.cdf, x)
+
+  def _log_survival_function(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_survival_function, x)
+
+  def _survival_function(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.survival_function, x)
+
+  def _entropy(self):
+    return self._call_and_reshape_output(
+        self.distribution.entropy,
+        [],
+        [tensor_shape.scalar()])
+
+  def _mean(self):
+    return self._call_and_reshape_output(self.distribution.mean)
+
+  def _mode(self):
+    return self._call_and_reshape_output(self.distribution.mode)
+
+  def _stddev(self):
+    return self._call_and_reshape_output(self.distribution.stddev)
+
+  def _variance(self):
+    return self._call_and_reshape_output(self.distribution.variance)
+
+  def _covariance(self):
+    return self._call_and_reshape_output(
+        self.distribution.covariance,
+        [self.event_shape_tensor()]*2,
+        [self.event_shape]*2)
+
+  def _sample_shape(self, x):
+    """Computes graph and static `sample_shape`."""
+    x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+    event_ndims = (array_ops.size(self.event_shape_tensor())
+                   if self.event_shape.ndims is None
+                   else self.event_shape.ndims)
+    batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                   if self.batch_shape.ndims is None
+                   else self.batch_shape.ndims)
+    sample_ndims = x_ndims - batch_ndims - event_ndims
+    if isinstance(sample_ndims, int):
+      static_sample_shape = x.shape[:sample_ndims]
+    else:
+      static_sample_shape = tensor_shape.TensorShape(None)
+    if static_sample_shape.is_fully_defined():
+      sample_shape = np.int32(static_sample_shape.as_list())
+    else:
+      sample_shape = array_ops.shape(x)[:sample_ndims]
+    return sample_shape, static_sample_shape
+
+  def _call_reshape_input_output(self, fn, x):
+    """Calls `fn`, appropriately reshaping its input `x` and output."""
+    with ops.control_dependencies(
+        self._runtime_assertions + self._validate_sample_arg(x)):
+      sample_shape, static_sample_shape = self._sample_shape(x)
+      old_shape = array_ops.concat([
+          sample_shape,
+          self.distribution.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], axis=0)
+      result = fn(array_ops.reshape(x, old_shape))
+      new_shape = array_ops.concat([
+          sample_shape,
+          self.batch_shape_tensor(),
+      ], axis=0)
+      result = array_ops.reshape(result, new_shape)
+      if (static_sample_shape.ndims is not None and
+          self.batch_shape.ndims is not None):
+        new_shape = static_sample_shape.concatenate(self.batch_shape)
+        result.set_shape(result.shape.merge_with(new_shape))
+      return result
+
+  def _call_and_reshape_output(
+      self,
+      fn,
+      event_shape_list=None,
+      static_event_shape_list=None):
+    """Calls `fn` and appropriately reshapes its output."""
+    with ops.control_dependencies(self._runtime_assertions):
+      if event_shape_list is None:
+        event_shape_list = [self._event_shape_tensor()]
+      if static_event_shape_list is None:
+        static_event_shape_list = [self.event_shape]
+      new_shape = array_ops.concat(
+          [self.batch_shape_tensor()] + event_shape_list,
+          axis=0)
+      result = array_ops.reshape(fn(), new_shape)
+      if (self.batch_shape.ndims is not None and
+          self.event_shape.ndims is not None):
+        event_shape = tensor_shape.TensorShape([])
+        for rss in static_event_shape_list:
+          event_shape = event_shape.concatenate(rss)
+        static_shape = result.shape.merge_with(
+            self.batch_shape.concatenate(event_shape))
+        result.set_shape(static_shape)
+      return result
+
+  def _validate_sample_arg(self, x):
+    """Helper which validates sample arg, e.g., input to `log_prob`."""
+    with ops.name_scope(name="validate_sample_arg", values=[x]):
+      x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+      event_ndims = (array_ops.size(self.event_shape_tensor())
+                     if self.event_shape.ndims is None
+                     else self.event_shape.ndims)
+      batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                     if self.batch_shape.ndims is None
+                     else self.batch_shape.ndims)
+      expected_batch_event_ndims = batch_ndims + event_ndims
+
+      if (isinstance(x_ndims, int) and
+          isinstance(expected_batch_event_ndims, int)):
+        if x_ndims < expected_batch_event_ndims:
+          raise NotImplementedError(
+              "Broadcasting is not supported; too few batch and event dims "
+              "(expected at least {}, saw {}).".format(
+                  expected_batch_event_ndims, x_ndims))
+        ndims_assertion = []
+      elif self.validate_args:
+        ndims_assertion = [
+            check_ops.assert_greater_equal(
+                x_ndims,
+                expected_batch_event_ndims,
+                message=("Broadcasting is not supported; too few "
+                         "batch and event dims."),
+                name="assert_batch_and_event_ndims_large_enough"),
+        ]
+
+      if (self.batch_shape.is_fully_defined() and
+          self.event_shape.is_fully_defined()):
+        expected_batch_event_shape = np.int32(self.batch_shape.concatenate(
+            self.event_shape).as_list())
+      else:
+        expected_batch_event_shape = array_ops.concat([
+            self.batch_shape_tensor(),
+            self.event_shape_tensor(),
+        ], axis=0)
+
+      sample_ndims = x_ndims - expected_batch_event_ndims
+      if isinstance(sample_ndims, int):
+        sample_ndims = max(sample_ndims, 0)
+      if (isinstance(sample_ndims, int) and
+          x.shape[sample_ndims:].is_fully_defined()):
+        actual_batch_event_shape = np.int32(x.shape[sample_ndims:].as_list())
+      else:
+        sample_ndims = math_ops.maximum(sample_ndims, 0)
+        actual_batch_event_shape = array_ops.shape(x)[sample_ndims:]
+
+      if (isinstance(expected_batch_event_shape, np.ndarray) and
+          isinstance(actual_batch_event_shape, np.ndarray)):
+        if any(expected_batch_event_shape != actual_batch_event_shape):
+          raise NotImplementedError("Broadcasting is not supported; "
+                                    "unexpected batch and event shape "
+                                    "(expected {}, saw {}).".format(
+                                        expected_batch_event_shape,
+                                        actual_batch_event_shape))
+        # We need to set the final runtime-assertions to `ndims_assertion` since
+        # its possible this assertion was created. We could add a condition to
+        # only do so if `self.validate_args == True`, however this is redundant
+        # as `ndims_assertion` already encodes this information.
+        runtime_assertions = ndims_assertion
+      elif self.validate_args:
+        # We need to make the `ndims_assertion` a control dep because otherwise
+        # TF itself might raise an exception owing to this assertion being
+        # ill-defined, ie, one cannot even compare different rank Tensors.
+        with ops.control_dependencies(ndims_assertion):
+          shape_assertion = check_ops.assert_equal(
+              expected_batch_event_shape,
+              actual_batch_event_shape,
+              message=("Broadcasting is not supported; "
+                       "unexpected batch and event shape."),
+              name="assert_batch_and_event_shape_same")
+        runtime_assertions = [shape_assertion]
+      else:
+        runtime_assertions = []
+
+      return runtime_assertions
+
+
+def validate_init_args(
+    distribution,
+    batch_shape,
+    validate_args,
+    batch_shape_static):
+  """Helper to __init__ which makes or raises assertions."""
+  with ops.name_scope(name="validate_init_args",
+                      values=[batch_shape] + distribution._graph_parents):  # pylint: disable=protected-access
+    runtime_assertions = []
+
+    if batch_shape.shape.ndims is not None:
+      if batch_shape.shape.ndims != 1:
+        raise ValueError("`batch_shape` must be a vector "
+                         "(saw rank: {}).".format(
+                             batch_shape.shape.ndims))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_rank(
+              batch_shape,
+              1,
+              message="`batch_shape` must be a vector.",
+              name="assert_batch_shape_is_vector"),
+      ]
+
+    batch_size_static = np.prod(batch_shape_static)
+    dist_batch_size_static = (
+        None if not distribution.batch_shape.is_fully_defined()
+        else np.prod(distribution.batch_shape).value)
+
+    if batch_size_static is not None and dist_batch_size_static is not None:
+      if batch_size_static != dist_batch_size_static:
+        raise ValueError("`batch_shape` size ({}) must match "
+                         "`distribution.batch_shape` size ({}).".format(
+                             batch_size_static,
+                             dist_batch_size_static))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_equal(
+              math_ops.reduce_prod(batch_shape),
+              math_ops.reduce_prod(distribution.batch_shape_tensor()),
+              message=("`batch_shape` size must match "
+                       "`distributions.batch_shape` size."),
+              name="assert_batch_size"),
+      ]
+
+    if batch_shape_static is not None:
+      if np.any(batch_shape_static < 1):
+        raise ValueError("`batch_shape` elements must be positive "
+                         "(i.e., larger than zero).")
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_positive(
+              batch_shape,
+              message=("`batch_shape` elements must be positive "
+                       "(i.e., larger than zero)."),
+              name="assert_batch_shape_positive")
+      ]
+
+    return runtime_assertions
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 93923c3f083c7f5136b55e9021cbd6323684b976..51478dbeffaabc58ce3662f25f06bc579e8a407e 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -17,7 +17,9 @@
 @@AbsoluteValue
 @@Affine
 @@AffineLinearOperator
+@@AffineScalar
 @@Bijector
+@@BatchNormalization
 @@Chain
 @@CholeskyOuterProduct
 @@ConditionalBijector
@@ -26,16 +28,19 @@
 @@Identity
 @@Inline
 @@Invert
+@@Kumaraswamy
 @@MaskedAutoregressiveFlow
+@@Ordered
 @@Permute
 @@PowerTransform
 @@RealNVP
 @@Reshape
 @@Sigmoid
-@@SigmoidCentered
 @@SinhArcsinh
 @@SoftmaxCentered
 @@Softplus
+@@Softsign
+@@Square
 @@Weibull
 
 @@masked_autoregressive_default_template
@@ -52,6 +57,8 @@ from __future__ import print_function
 from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value import *
 from tensorflow.contrib.distributions.python.ops.bijectors.affine import *
 from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import *
+from tensorflow.contrib.distributions.python.ops.bijectors.affine_scalar import *
+from tensorflow.contrib.distributions.python.ops.bijectors.batch_normalization import *
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import *
 from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product import *
 from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector import *
@@ -59,16 +66,19 @@ from tensorflow.contrib.distributions.python.ops.bijectors.exp import *
 from tensorflow.contrib.distributions.python.ops.bijectors.gumbel import *
 from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
+from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import *
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import *
+from tensorflow.contrib.distributions.python.ops.bijectors.ordered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
 from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
+from tensorflow.contrib.distributions.python.ops.bijectors.softsign import *
+from tensorflow.contrib.distributions.python.ops.bijectors.square import *
 from tensorflow.python.ops.distributions.bijector import *
 from tensorflow.python.ops.distributions.identity_bijector import Identity
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
index 0fe9f6aa78fbe845b99d0668f075b0162ec2a9f7..c9e31d7712f09f6c4b4cc6ae51a34c42a19c291d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
@@ -18,9 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -72,38 +70,22 @@ class AbsoluteValue(bijector.Bijector):
 
   """
 
-  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
+  def __init__(self, validate_args=False, name="absolute_value"):
     """Instantiates the `AbsoluteValue` bijector.
 
     Args:
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.  Currently only zero is
-        supported.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness, in particular whether inputs to `inverse` and
         `inverse_log_det_jacobian` are non-negative.
       name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
     """
     self._graph_parents = []
     self._name = name
 
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
     with self._name_scope("init"):
       super(AbsoluteValue, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
           validate_args=validate_args,
           name=name)
 
@@ -121,8 +103,7 @@ class AbsoluteValue(bijector.Bijector):
     # If event_ndims = 2,
     # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
     # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
-    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
-    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
+    zeros = constant_op.constant(0., dtype=y.dtype)
     if self.validate_args:
       zeros = control_flow_ops.with_dependencies(
           [check_ops.assert_non_negative(y, message="Argument y was negative")],
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index 05bb9c2f9bdf35e222c94db3491157893da64ebd..b4c2939eb914d50475ba6b1c1e979a804090f641 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -62,7 +62,7 @@ class Affine(bijector.Bijector):
   matrices, i.e., the matmul is [matrix-free](
   https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
 
-  Examples:
+  #### Examples
 
   ```python
   # Y = X
@@ -104,7 +104,6 @@ class Affine(bijector.Bijector):
                scale_tril=None,
                scale_perturb_factor=None,
                scale_perturb_diag=None,
-               event_ndims=1,
                validate_args=False,
                name="affine"):
     """Instantiates the `Affine` bijector.
@@ -157,8 +156,6 @@ class Affine(bijector.Bijector):
         matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
         represents an `r x r` diagonal matrix. When `None` low rank updates will
         take the form `scale_perturb_factor * scale_perturb_factor.T`.
-      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -187,22 +184,6 @@ class Affine(bijector.Bijector):
     with self._name_scope("init", values=[
         shift, scale_identity_multiplier, scale_diag, scale_tril,
         scale_perturb_diag, scale_perturb_factor]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims_const = tensor_util.constant_value(event_ndims)
-      if event_ndims_const is not None and event_ndims_const not in (0, 1):
-        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-
-      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
-        raise ValueError(
-            "If event_ndims == 0, the only scale argument you can pass is "
-            "scale_identity_multiplier.  All others operate on vectors.")
 
       # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
       dtype = dtypes.float32
@@ -251,12 +232,11 @@ class Affine(bijector.Bijector):
       self._scale = scale
       self._shaper = _DistributionShape(
           batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
+          event_ndims=1,
           validate_args=validate_args)
       super(Affine, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=1,
           graph_parents=(
-              [event_ndims] +
               [self._scale] if tensor_util.is_tensor(self._scale)
               else self._scale.graph_parents +
               [self._shift] if self._shift is not None else []),
@@ -381,18 +361,17 @@ class Affine(bijector.Bijector):
         x, sample_shape, expand_batch_dim=False)
     return x
 
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
   def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self._is_only_identity_multiplier:
       # We don't pad in this case and instead let the fldj be applied
       # via broadcast.
-      event_size = distribution_util.pick_vector(
-          math_ops.equal(self._shaper.event_ndims, 0),
-          [1], array_ops.shape(x))[-1]
+      event_size = array_ops.shape(x)[-1]
       event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
       return math_ops.log(math_ops.abs(self._scale)) * event_size
+
     return self.scale.log_abs_determinant()
 
   def _maybe_check_scale(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
index 89043b1410370074f11f2cfa59b6b6663fa62521..59f9742d576a7804f401d3a47ba31ae61d6c6e54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
@@ -22,9 +22,6 @@ from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.linalg import linear_operator
 
@@ -94,7 +91,6 @@ class AffineLinearOperator(bijector.Bijector):
   def __init__(self,
                shift=None,
                scale=None,
-               event_ndims=1,
                validate_args=False,
                name="affine_linear_operator"):
     """Instantiates the `AffineLinearOperator` bijector.
@@ -103,14 +99,11 @@ class AffineLinearOperator(bijector.Bijector):
       shift: Floating-point `Tensor`.
       scale:  Subclass of `LinearOperator`. Represents the (batch) positive
         definite matrix `M` in `R^{k x k}`.
-      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
 
     Raises:
-      ValueError: if `event_ndims` is not 0 or 1.
       TypeError: if `scale` is not a `LinearOperator`.
       TypeError: if `shift.dtype` does not match `scale.dtype`.
       ValueError: if not `scale.is_non_singular`.
@@ -120,20 +113,6 @@ class AffineLinearOperator(bijector.Bijector):
     self._validate_args = validate_args
     graph_parents = []
     with self._name_scope("init", values=[shift]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if tensor_util.constant_value(event_ndims) is not None:
-        event_ndims = tensor_util.constant_value(event_ndims)
-        if event_ndims not in (0, 1):
-          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-        graph_parents += [event_ndims]
-
       # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
       dtype = dtypes.float32
 
@@ -166,10 +145,10 @@ class AffineLinearOperator(bijector.Bijector):
       self._scale = scale
       self._shaper = _DistributionShape(
           batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
+          event_ndims=1,
           validate_args=validate_args)
       super(AffineLinearOperator, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=1,
           graph_parents=graph_parents,
           is_constant_jacobian=True,
           dtype=dtype,
@@ -213,12 +192,13 @@ class AffineLinearOperator(bijector.Bijector):
           x, sample_shape, expand_batch_dim=False)
     return x
 
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
+  def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self.scale is None:
-      return constant_op.constant(0, dtype=x.dtype.base_dtype)
+      return constant_op.constant(0., dtype=x.dtype.base_dtype)
+
     with ops.control_dependencies(self._maybe_collect_assertions() if
                                   self.validate_args else []):
       return self.scale.log_abs_determinant()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd792e2c8cf48602daf9fb5eb56b8c34bac050c7
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
@@ -0,0 +1,141 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Affine bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "AffineScalar",
+]
+
+
+class AffineScalar(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale * X + shift`.
+
+  Examples:
+
+  ```python
+  # Y = X
+  b = AffineScalar()
+
+  # Y = X + shift
+  b = AffineScalar(shift=[1., 2, 3])
+
+  # Y = 2 * X + shift
+  b = AffineScalar(
+    shift=[1., 2, 3],
+    scale=2.)
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale=None,
+               validate_args=False,
+               name="affine_scalar"):
+    """Instantiates the `AffineScalar` bijector.
+
+    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
+    giving the forward operation:
+
+    ```none
+    Y = g(X) = scale * X + shift
+    ```
+
+    if `scale` is not specified, then the bijector has the semantics of
+    `scale = 1.`. Similarly, if `shift` is not specified, then the bijector
+    has the semantics of `shift = 0.`.
+
+    Args:
+      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
+        applied.
+      scale: Floating-point `Tensor`. If this is set to `None`, no scale is
+        applied.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+
+    with self._name_scope("init", values=[scale, shift]):
+      self._shift = shift
+      self._scale = scale
+
+      if self._shift is not None:
+        self._shift = ops.convert_to_tensor(shift, name="shift")
+
+      if self._scale is not None:
+        self._scale = ops.convert_to_tensor(self._scale, name="scale")
+        if validate_args:
+          self._scale = control_flow_ops.with_dependencies(
+              [check_ops.assert_none_equal(
+                  self._scale,
+                  array_ops.zeros([], dtype=self._scale.dtype))],
+              self._scale)
+
+      super(AffineScalar, self).__init__(
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name)
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = array_ops.identity(x)
+    if self.scale is not None:
+      y *= self.scale
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = array_ops.identity(y)
+    if self.shift is not None:
+      x -= self.shift
+    if self.scale is not None:
+      x /= self.scale
+    return x
+
+  def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
+    if self.scale is None:
+      return constant_op.constant(0., dtype=x.dtype.base_dtype)
+
+    return math_ops.log(math_ops.abs(self.scale))
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..224cec8a63dba53a528490117efac890312fe8d5
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
@@ -0,0 +1,263 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Batch Norm bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import normalization
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "BatchNormalization",
+]
+
+
+def _undo_batch_normalization(x,
+                              mean,
+                              variance,
+                              offset,
+                              scale,
+                              variance_epsilon,
+                              name=None):
+  r"""Inverse of tf.nn.batch_normalization.
+
+  Args:
+    x: Input `Tensor` of arbitrary dimensionality.
+    mean: A mean `Tensor`.
+    variance: A variance `Tensor`.
+    offset: An offset `Tensor`, often denoted `beta` in equations, or
+      None. If present, will be added to the normalized tensor.
+    scale: A scale `Tensor`, often denoted `gamma` in equations, or
+      `None`. If present, the scale is applied to the normalized tensor.
+    variance_epsilon: A small `float` added to the minibatch `variance` to
+      prevent dividing by zero.
+    name: A name for this operation (optional).
+
+  Returns:
+    batch_unnormalized: The de-normalized, de-scaled, de-offset `Tensor`.
+  """
+  with ops.name_scope(
+      name, "undo_batchnorm", [x, mean, variance, scale, offset]):
+    # inv = math_ops.rsqrt(variance + variance_epsilon)
+    # if scale is not None:
+    #   inv *= scale
+    # return x * inv + (
+    #     offset - mean * inv if offset is not None else -mean * inv)
+    rescale = math_ops.sqrt(variance + variance_epsilon)
+    if scale is not None:
+      rescale /= scale
+    batch_unnormalized = x * rescale + (
+        mean - offset * rescale if offset is not None else mean)
+    return batch_unnormalized
+
+
+class BatchNormalization(bijector.Bijector):
+  """Compute `Y = g(X) s.t. X = g^-1(Y) = (Y - mean(Y)) / std(Y)`.
+
+  Applies Batch Normalization [(Ioffe and Szegedy, 2015)][1] to samples from a
+  data distribution. This can be used to stabilize training of normalizing
+  flows ([Papamakarios et al., 2016][3]; [Dinh et al., 2017][2])
+
+  When training Deep Neural Networks (DNNs), it is common practice to
+  normalize or whiten features by shifting them to have zero mean and
+  scaling them to have unit variance.
+
+  The `inverse()` method of the `BatchNormalization` bijector, which is used in
+  the log-likelihood computation of data samples, implements the normalization
+  procedure (shift-and-scale) using the mean and standard deviation of the
+  current minibatch.
+
+  Conversely, the `forward()` method of the bijector de-normalizes samples (e.g.
+  `X*std(Y) + mean(Y)` with the running-average mean and standard deviation
+  computed at training-time. De-normalization is useful for sampling.
+
+  ```python
+
+  dist = tfd.TransformedDistribution(
+      distribution=tfd.Normal()),
+      bijector=tfb.BatchNorm())
+
+  y = tfd.MultivariateNormalDiag(loc=1., scale=2.).sample(100)  # ~ N(1, 2)
+  x = dist.bijector.inverse(y)  # ~ N(0, 1)
+  y = dist.sample()  # ~ N(1, 2)
+  ```
+
+  During training time, `BatchNorm.inverse` and `BatchNorm.forward` are not
+  guaranteed to be inverses of each other because `inverse(y)` uses statistics
+  of the current minibatch, while `forward(x)` uses running-average statistics
+  accumulated from training. In other words,
+  `BatchNorm.inverse(BatchNorm.forward(...))` and
+  `BatchNorm.forward(BatchNorm.inverse(...))` will be identical when
+  `training=False` but may be different when `training=True`.
+
+  #### References
+
+  [1]: Sergey Ioffe and Christian Szegedy. Batch Normalization: Accelerating
+       Deep Network Training by Reducing Internal Covariate Shift. In
+       _International Conference on Machine Learning_, 2015.
+       https://arxiv.org/abs/1502.03167
+
+  [2]: Laurent Dinh, Jascha Sohl-Dickstein, and Samy Bengio. Density Estimation
+       using Real NVP. In _International Conference on Learning
+       Representations_, 2017. https://arxiv.org/abs/1605.08803
+
+  [3]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
+  """
+
+  def __init__(self,
+               batchnorm_layer=None,
+               training=True,
+               validate_args=False,
+               name="batch_normalization"):
+    """Instantiates the `BatchNorm` bijector.
+
+    Args:
+      batchnorm_layer: `tf.layers.BatchNormalization` layer object. If `None`,
+        defaults to
+        `tf.layers.BatchNormalization(gamma_constraint=nn_ops.relu(x) + 1e-6)`.
+        This ensures positivity of the scale variable.
+
+      training: If True, updates running-average statistics during call to
+        `inverse()`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    Raises:
+      ValueError: If bn_layer is not an instance of
+        `tf.layers.BatchNormalization`, or if it is specified with `renorm=True`
+        or a virtual batch size.
+    """
+    # Scale must be positive.
+    g_constraint = lambda x: nn.relu(x) + 1e-6
+    self.batchnorm = batchnorm_layer or normalization.BatchNormalization(
+        gamma_constraint=g_constraint)
+    self._validate_bn_layer(self.batchnorm)
+    self._training = training
+    if isinstance(self.batchnorm.axis, int):
+      forward_min_event_ndims = 1
+    else:
+      forward_min_event_ndims = len(self.batchnorm.axis)
+    super(BatchNormalization, self).__init__(
+        forward_min_event_ndims=forward_min_event_ndims,
+        validate_args=validate_args, name=name)
+
+  def _validate_bn_layer(self, layer):
+    """Check for valid BatchNormalization layer.
+
+    Args:
+      layer: Instance of `tf.layers.BatchNormalization`.
+    Raises:
+      ValueError: If batchnorm_layer argument is not an instance of
+      `tf.layers.BatchNormalization`, or if `batchnorm_layer.renorm=True` or
+      if `batchnorm_layer.virtual_batch_size` is specified.
+    """
+    if not isinstance(layer, normalization.BatchNormalization):
+      raise ValueError(
+          "batchnorm_layer must be an instance of BatchNormalization layer.")
+    if layer.renorm:
+      raise ValueError("BatchNorm Bijector does not support renormalization.")
+    if layer.virtual_batch_size:
+      raise ValueError(
+          "BatchNorm Bijector does not support virtual batch sizes.")
+
+  def _get_broadcast_fn(self, x):
+    # Compute shape to broadcast scale/shift parameters to.
+    if not x.shape.is_fully_defined():
+      raise ValueError("Input must have shape known at graph construction.")
+    input_shape = np.int32(x.shape.as_list())
+
+    ndims = len(input_shape)
+    reduction_axes = [i for i in range(ndims) if i not in self.batchnorm.axis]
+    # Broadcasting only necessary for single-axis batch norm where the axis is
+    # not the last dimension
+    broadcast_shape = [1] * ndims
+    broadcast_shape[self.batchnorm.axis[0]] = (
+        input_shape[self.batchnorm.axis[0]])
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          reduction_axes != list(range(ndims - 1))):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+    return _broadcast
+
+  def _normalize(self, y):
+    return self.batchnorm.apply(y, training=self._training)
+
+  def _de_normalize(self, x):
+    # Uses the saved statistics.
+    if not self.batchnorm.built:
+      input_shape = x.get_shape()
+      self.batchnorm.build(input_shape)
+    broadcast_fn = self._get_broadcast_fn(x)
+    mean = broadcast_fn(self.batchnorm.moving_mean)
+    variance = broadcast_fn(self.batchnorm.moving_variance)
+    beta = broadcast_fn(self.batchnorm.beta) if self.batchnorm.center else None
+    gamma = broadcast_fn(self.batchnorm.gamma) if self.batchnorm.scale else None
+    return _undo_batch_normalization(
+        x, mean, variance, beta, gamma, self.batchnorm.epsilon)
+
+  def _forward(self, x):
+    return self._de_normalize(x)
+
+  def _inverse(self, y):
+    return self._normalize(y)
+
+  def _forward_log_det_jacobian(self, x):
+    # Uses saved statistics to compute volume distortion.
+    return -self._inverse_log_det_jacobian(x, use_saved_statistics=True)
+
+  def _inverse_log_det_jacobian(self, y, use_saved_statistics=False):
+    if not y.shape.is_fully_defined():
+      raise ValueError("Input must have shape known at graph construction.")
+    input_shape = np.int32(y.shape.as_list())
+
+    if not self.batchnorm.built:
+      # Create variables.
+      self.batchnorm.build(input_shape)
+
+    event_dims = self.batchnorm.axis
+    reduction_axes = [i for i in range(len(input_shape)) if i not in event_dims]
+
+    if use_saved_statistics or not self._training:
+      log_variance = math_ops.log(
+          self.batchnorm.moving_variance + self.batchnorm.epsilon)
+    else:
+      # At training-time, ildj is computed from the mean and log-variance across
+      # the current minibatch.
+      _, v = nn.moments(y, axes=reduction_axes, keep_dims=True)
+      log_variance = math_ops.log(v + self.batchnorm.epsilon)
+
+    # `gamma` and `log Var(y)` reductions over event_dims.
+    # Log(total change in area from gamma term).
+    log_total_gamma = math_ops.reduce_sum(math_ops.log(self.batchnorm.gamma))
+
+    # Log(total change in area from log-variance term).
+    log_total_variance = math_ops.reduce_sum(log_variance)
+    # The ildj is scalar, as it does not depend on the values of x and are
+    # constant across minibatch elements.
+    return log_total_gamma - 0.5 * log_total_variance
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 3ce7c26213034c7345a20faa803c94a1bfa8d579..85ad23e4133ef09051cdc8b45e489caeea90fbb3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -21,6 +21,9 @@ from __future__ import print_function
 import itertools
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import bijector
 
 
@@ -29,6 +32,91 @@ __all__ = [
 ]
 
 
+def _use_static_shape(input_tensor, ndims):
+  return input_tensor.shape.is_fully_defined() and isinstance(ndims, int)
+
+
+def _maybe_get_event_ndims_statically(event_ndims):
+  static_event_ndims = (event_ndims if isinstance(event_ndims, int)
+                        else tensor_util.constant_value(event_ndims))
+  if static_event_ndims is not None:
+    return static_event_ndims
+
+  return event_ndims
+
+
+def _compute_min_event_ndims(bijector_list, compute_forward=True):
+  """Computes the min_event_ndims associated with the give list of bijectors.
+
+  Given a list `bijector_list` of bijectors, compute the min_event_ndims that is
+  associated with the composition of bijectors in that list.
+
+  min_event_ndims is the # of right most dimensions for which the bijector has
+  done necessary computation on (i.e. the non-broadcastable part of the
+  computation).
+
+  We can derive the min_event_ndims for a chain of bijectors as follows:
+
+  In the case where there are no rank changing bijectors, this will simply be
+  `max(b.forward_min_event_ndims for b in bijector_list)`. This is because the
+  bijector with the most forward_min_event_ndims requires the most dimensions,
+  and hence the chain also requires operating on those dimensions.
+
+  However in the case of rank changing, more care is needed in determining the
+  exact amount of dimensions. Padding dimensions causes subsequent bijectors to
+  operate on the padded dimensions, and Removing dimensions causes bijectors to
+  operate more left.
+
+  Args:
+    bijector_list: List of bijectors to be composed by chain.
+    compute_forward: Boolean. If True, computes the min_event_ndims associated
+      with a forward call to Chain, and otherwise computes the min_event_ndims
+      associated with an inverse call to Chain. The latter is the same as the
+      min_event_ndims associated with a forward call to Invert(Chain(....)).
+
+  Returns:
+    min_event_ndims
+  """
+  min_event_ndims = 0
+  # This is a mouthful, but what this encapsulates is that if not for rank
+  # changing bijectors, we'd only need to compute the largest of the min
+  # required ndims. Hence "max_min". Due to rank changing bijectors, we need to
+  # account for synthetic rank growth / synthetic rank decrease from a rank
+  # changing bijector.
+  rank_changed_adjusted_max_min_event_ndims = 0
+
+  if compute_forward:
+    bijector_list = reversed(bijector_list)
+
+  for b in bijector_list:
+    if compute_forward:
+      current_min_event_ndims = b.forward_min_event_ndims
+      current_inverse_min_event_ndims = b.inverse_min_event_ndims
+    else:
+      current_min_event_ndims = b.inverse_min_event_ndims
+      current_inverse_min_event_ndims = b.forward_min_event_ndims
+
+    # New dimensions were touched.
+    if rank_changed_adjusted_max_min_event_ndims < current_min_event_ndims:
+      min_event_ndims += (
+          current_min_event_ndims - rank_changed_adjusted_max_min_event_ndims)
+    rank_changed_adjusted_max_min_event_ndims = max(
+        current_min_event_ndims, rank_changed_adjusted_max_min_event_ndims)
+
+    # If the number of dimensions has increased via forward, then
+    # inverse_min_event_ndims > forward_min_event_ndims, and hence the
+    # dimensions we computed on, have moved left (so we have operated
+    # on additional dimensions).
+    # Conversely, if the number of dimensions has decreased via forward,
+    # then we have inverse_min_event_ndims < forward_min_event_ndims,
+    # and so we will have operated on fewer right most dimensions.
+
+    number_of_changed_dimensions = (
+        current_min_event_ndims - current_inverse_min_event_ndims)
+    rank_changed_adjusted_max_min_event_ndims -= number_of_changed_dimensions
+  return min_event_ndims
+
+
 class Chain(bijector.Bijector):
   """Bijector which applies a sequence of bijectors.
 
@@ -93,21 +181,24 @@ class Chain(bijector.Bijector):
       raise ValueError("incompatible dtypes: %s" % dtype)
     elif len(dtype) == 2:
       dtype = dtype[1] if dtype[0] is None else dtype[0]
-      event_ndims = bijectors[0].event_ndims
     elif len(dtype) == 1:
       dtype = dtype[0]
-      event_ndims = bijectors[0].event_ndims
     else:
       dtype = None
-      event_ndims = None
+
+    inverse_min_event_ndims = _compute_min_event_ndims(
+        bijectors, compute_forward=False)
+    forward_min_event_ndims = _compute_min_event_ndims(
+        bijectors, compute_forward=True)
 
     super(Chain, self).__init__(
         graph_parents=list(itertools.chain.from_iterable(
             b.graph_parents for b in bijectors)),
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
         is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
         validate_args=validate_args,
         dtype=dtype,
-        event_ndims=event_ndims,
         name=name or ("identity" if not bijectors else
                       "_of_".join(["chain"] + [b.name for b in bijectors])))
 
@@ -147,10 +238,31 @@ class Chain(bijector.Bijector):
     return y
 
   def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(0., dtype=y.dtype,
-                                name="inverse_log_det_jacobian")
+    ildj = constant_op.constant(
+        0., dtype=y.dtype.base_dtype, name="inverse_log_det_jacobian")
+
+    if not self.bijectors:
+      return ildj
+
+    event_ndims = _maybe_get_event_ndims_statically(
+        self.inverse_min_event_ndims)
+
+    if _use_static_shape(y, event_ndims):
+      event_shape = y.shape[y.shape.ndims - event_ndims:]
+    else:
+      event_shape = array_ops.shape(y)[array_ops.rank(y) - event_ndims:]
+
     for b in self.bijectors:
-      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
+      ildj += b.inverse_log_det_jacobian(
+          y, event_ndims=event_ndims, **kwargs.get(b.name, {}))
+
+      if _use_static_shape(y, event_ndims):
+        event_shape = b.inverse_event_shape(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+      else:
+        event_shape = b.inverse_event_shape_tensor(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(
+            array_ops.rank(event_shape))
       y = b.inverse(y, **kwargs.get(b.name, {}))
     return ildj
 
@@ -160,9 +272,34 @@ class Chain(bijector.Bijector):
     return x
 
   def _forward_log_det_jacobian(self, x, **kwargs):
-    fldj = constant_op.constant(0., dtype=x.dtype,
-                                name="forward_log_det_jacobian")
+    x = ops.convert_to_tensor(x, name="x")
+
+    fldj = constant_op.constant(
+        0., dtype=x.dtype, name="inverse_log_det_jacobian")
+
+    if not self.bijectors:
+      return fldj
+
+    event_ndims = _maybe_get_event_ndims_statically(
+        self.forward_min_event_ndims)
+
+    if _use_static_shape(x, event_ndims):
+      event_shape = x.shape[x.shape.ndims - event_ndims:]
+    else:
+      event_shape = array_ops.shape(x)[array_ops.rank(x) - event_ndims:]
+
     for b in reversed(self.bijectors):
-      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
+      fldj += b.forward_log_det_jacobian(
+          x, event_ndims=event_ndims, **kwargs.get(b.name, {}))
+      if _use_static_shape(x, event_ndims):
+        event_shape = b.forward_event_shape(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+      else:
+        event_shape = b.forward_event_shape_tensor(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(
+            array_ops.rank(event_shape))
+
       x = b.forward(x, **kwargs.get(b.name, {}))
+
     return fldj
+
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index cbd60f92a60612c6cf791b2c7708a3310c6e2b6b..ecdb8967f43e5960b2285de05125d0c3dbafe63c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -39,8 +37,6 @@ __all__ = [
 class CholeskyOuterProduct(bijector.Bijector):
   """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
 
-  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
-
   Note: the upper-triangular part of X is ignored (whether or not its zero).
 
   The surjectivity of g as a map from  the set of n x n positive-diagonal
@@ -61,49 +57,34 @@ class CholeskyOuterProduct(bijector.Bijector):
   that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
   diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
 
-  Examples:
+  #### Examples
 
   ```python
-  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
+  bijector.CholeskyOuterProduct().forward(x=[[1., 0], [2, 1]])
   # Result: [[1., 2], [2, 5]], i.e., x @ x.T
 
-  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
+  bijector.CholeskyOuterProduct().inverse(y=[[1., 2], [2, 5]])
   # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
   ```
 
   """
 
-  def __init__(self, event_ndims=2, validate_args=False,
-               name="cholesky_outer_product"):
+  def __init__(self, validate_args=False, name="cholesky_outer_product"):
     """Instantiates the `CholeskyOuterProduct` bijector.
 
     Args:
-      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
-        dimensions associated with a particular draw from the distribution. Must
-        be 0 or 2.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if event_ndims is neither 0 or 2.
     """
     self._graph_parents = []
     self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-    if event_ndims is None or event_ndims not in [0, 2]:
-      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
-    self._static_event_ndims = event_ndims
     super(CholeskyOuterProduct, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=2,
         validate_args=validate_args,
         name=name)
 
   def _forward(self, x):
-    if self._static_event_ndims == 0:
-      return math_ops.square(x)
     if self.validate_args:
       is_matrix = check_ops.assert_rank_at_least(x, 2)
       shape = array_ops.shape(x)
@@ -114,11 +95,7 @@ class CholeskyOuterProduct(bijector.Bijector):
     return math_ops.matmul(x, x, adjoint_b=True)
 
   def _inverse(self, y):
-    return (math_ops.sqrt(y) if self._static_event_ndims == 0
-            else linalg_ops.cholesky(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(x=self._inverse(y))
+    return linalg_ops.cholesky(y)
 
   def _forward_log_det_jacobian(self, x):
     # Let Y be a symmetric, positive definite matrix and write:
@@ -161,13 +138,6 @@ class CholeskyOuterProduct(bijector.Bijector):
     # Since there is a 2 X[j,j] term for every lower-triangular element of X we
     # conclude:
     #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
-    if self._static_event_ndims == 0:
-      if self.validate_args:
-        is_positive = check_ops.assert_positive(
-            x, message="All elements must be positive.")
-        x = control_flow_ops.with_dependencies([is_positive], x)
-      return np.log(2.) + math_ops.log(x)
-
     diag = array_ops.matrix_diag_part(x)
 
     # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
@@ -200,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector):
     sum_weighted_log_diag = array_ops.squeeze(
         math_ops.matmul(math_ops.log(diag),
                         exponents[..., array_ops.newaxis]),
-        squeeze_dims=-1)
+        axis=-1)
     fldj = p_float * np.log(2.) + sum_weighted_log_diag
 
     return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
index ccb1f029277bc07011df7be047a075274f2b3a27..e9e994f839ab2fe0a0f52f5f404fb2a0c8f9cd94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
@@ -44,12 +44,16 @@ class ConditionalBijector(bijector.Bijector):
       "**condition_kwargs":
       "Named arguments forwarded to subclass implementation."})
   def inverse_log_det_jacobian(
-      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
-    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
+      self, y, event_ndims, name="inverse_log_det_jacobian",
+      **condition_kwargs):
+    return self._call_inverse_log_det_jacobian(
+        y, event_ndims, name, **condition_kwargs)
 
   @distribution_util.AppendDocstring(kwargs_dict={
       "**condition_kwargs":
       "Named arguments forwarded to subclass implementation."})
   def forward_log_det_jacobian(
-      self, x, name="forward_log_det_jacobian", **condition_kwargs):
-    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
+      self, x, event_ndims, name="forward_log_det_jacobian",
+      **condition_kwargs):
+    return self._call_forward_log_det_jacobian(
+        x, event_ndims, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
index b1ff840d62a73c941a4d67dec73b5c9f4d5353f9..9fc1bbf052b419d07a9db149b990c2b80190d72b 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
@@ -33,8 +33,8 @@ class Exp(power_transform.PowerTransform):
 
     ```python
     # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    exp = Exp(event_ndims=2)
+    # batch ndim 2.
+    exp = Exp()
     x = [[[1., 2],
            [3, 4]],
           [[5, 6],
@@ -48,19 +48,17 @@ class Exp(power_transform.PowerTransform):
   """
 
   def __init__(self,
-               event_ndims=0,
                validate_args=False,
                name="exp"):
     """Instantiates the `Exp` bijector.
 
     Args:
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
     """
+    # forward_min_event_ndims = 0.
+    # No forward_min_event_ndims specified as this is done in PowerTransform.
     super(Exp, self).__init__(
-        event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
index 67f39785563255be0fe154aca3cbcf01c6a01e73..e656a258e56e71898ecb719dd2af876f158cf799 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
@@ -48,7 +48,6 @@ class Gumbel(bijector.Bijector):
   def __init__(self,
                loc=0.,
                scale=1.,
-               event_ndims=0,
                validate_args=False,
                name="gumbel"):
     """Instantiates the `Gumbel` bijector.
@@ -60,8 +59,6 @@ class Gumbel(bijector.Bijector):
       scale: Positive Float-like `Tensor` that is the same dtype and is
         broadcastable with `loc`.
         This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -80,7 +77,9 @@ class Gumbel(bijector.Bijector):
         ], self._scale)
 
     super(Gumbel, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
+        validate_args=validate_args,
+        forward_min_event_ndims=0,
+        name=name)
 
   @property
   def loc(self):
@@ -102,15 +101,11 @@ class Gumbel(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
+    return math_ops.log(self.scale / (-math_ops.log(y) * y))
 
   def _forward_log_det_jacobian(self, x):
-    event_dims = self._event_dims_tensor(x)
     z = (x - self.loc) / self.scale
-    return math_ops.reduce_sum(
-        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
+    return -z - math_ops.exp(-z) - math_ops.log(self.scale)
 
   def _maybe_assert_valid_y(self, y):
     if not self.validate_args:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
index fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94..2bde956d1345129285acae4684256c5ac828b9a1 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
@@ -40,7 +40,7 @@ class Inline(bijector.Bijector):
     name="exp")
   ```
 
-  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
+  The above example is equivalent to the `Bijector` `Exp()`.
   """
 
   def __init__(self,
@@ -54,6 +54,8 @@ class Inline(bijector.Bijector):
                inverse_event_shape_tensor_fn=None,
                is_constant_jacobian=False,
                validate_args=False,
+               forward_min_event_ndims=None,
+               inverse_min_event_ndims=None,
                name="inline"):
     """Creates a `Bijector` from callables.
 
@@ -76,10 +78,15 @@ class Inline(bijector.Bijector):
         constant for all input arguments.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
+      forward_min_event_ndims: Python `int` indicating the minimal
+        dimensionality this bijector acts on.
+      inverse_min_event_ndims: Python `int` indicating the minimal
+        dimensionality this bijector acts on.
       name: Python `str`, name given to ops managed by this object.
     """
     super(Inline, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -134,8 +141,8 @@ class Inline(bijector.Bijector):
           "inverse_log_det_jacobian_fn is not a callable function.")
     return self._inverse_log_det_jacobian_fn(y, **kwargs)
 
-  def _forward_log_det_jacobian(self, y, **kwargs):
+  def _forward_log_det_jacobian(self, x, **kwargs):
     if not callable(self._forward_log_det_jacobian_fn):
       raise NotImplementedError(
           "forward_log_det_jacobian_fn is not a callable function.")
-    return self._forward_log_det_jacobian_fn(y, **kwargs)
+    return self._forward_log_det_jacobian_fn(x, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index 2c603fe61f36dd27f4984fe6c13c11f2fb534321..84a3289ba2160ed22a2bc7030dd612ba9ca6f6df 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -18,14 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 __all__ = [
     "Invert",
 ]
 
 
-class Invert(bijector_lib.Bijector):
+class Invert(bijector.Bijector):
   """Bijector which inverts another Bijector.
 
   Example Use: [ExpGammaDistribution (see Background & Context)](
@@ -66,8 +66,9 @@ class Invert(bijector_lib.Bijector):
 
     self._bijector = bijector
     super(Invert, self).__init__(
-        event_ndims=bijector.event_ndims,
         graph_parents=bijector.graph_parents,
+        forward_min_event_ndims=bijector.inverse_min_event_ndims,
+        inverse_min_event_ndims=bijector.forward_min_event_ndims,
         is_constant_jacobian=bijector.is_constant_jacobian,
         validate_args=validate_args,
         dtype=bijector.dtype,
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
new file mode 100644
index 0000000000000000000000000000000000000000..97000c17262d3efdef10274711364c2bc2083bd4
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kumaraswamy bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+__all__ = [
+    "Kumaraswamy",
+]
+
+
+class Kumaraswamy(bijector.Bijector):
+  """Compute `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a), X in [0, 1]`.
+
+  This bijector maps inputs from `[0, 1]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the [Kumaraswamy distribution](
+  https://en.wikipedia.org/wiki/Kumaraswamy_distribution):
+
+  ```none
+  Y ~ Kumaraswamy(a, b)
+  pdf(y; a, b, 0 <= y <= 1) = a * b * y ** (a - 1) * (1 - y**a) ** (b - 1)
+  ```
+  """
+
+  def __init__(self,
+               concentration1=None,
+               concentration0=None,
+               validate_args=False,
+               name="kumaraswamy"):
+    """Instantiates the `Kumaraswamy` bijector.
+
+    Args:
+      concentration1: Python `float` scalar indicating the transform power,
+        i.e., `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a)` where `a` is
+        `concentration1`.
+      concentration0: Python `float` scalar indicating the transform power,
+        i.e., `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a)` where `b` is
+        `concentration0`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+
+    with self._name_scope("init", values=[concentration1, concentration0]):
+      concentration1 = self._maybe_assert_valid_concentration(
+          ops.convert_to_tensor(concentration1, name="concentration1"),
+          validate_args=validate_args)
+      concentration0 = self._maybe_assert_valid_concentration(
+          ops.convert_to_tensor(concentration0, name="concentration0"),
+          validate_args=validate_args)
+
+    self._concentration1 = concentration1
+    self._concentration0 = concentration0
+    super(Kumaraswamy, self).__init__(
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def concentration1(self):
+    """The `a` in: `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a)`."""
+    return self._concentration1
+
+  @property
+  def concentration0(self):
+    """The `b` in: `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a)`."""
+    return self._concentration0
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid(x)
+    return math_ops.exp(
+        math_ops.log1p(-math_ops.exp(math_ops.log1p(-x) / self.concentration0))
+        / self.concentration1)
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid(y)
+    return math_ops.exp(math_ops.log1p(
+        -(1 - y**self.concentration1)**self.concentration0))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid(y)
+    return (
+        math_ops.log(self.concentration1) + math_ops.log(self.concentration0) +
+        (self.concentration1 - 1) * math_ops.log(y) +
+        (self.concentration0 - 1) * math_ops.log1p(-y**self.concentration1))
+
+  def _maybe_assert_valid_concentration(self, concentration, validate_args):
+    """Checks the validity of a concentration parameter."""
+    if not validate_args:
+      return concentration
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_positive(
+            concentration,
+            message="Concentration parameter must be positive."),
+    ], concentration)
+
+  def _maybe_assert_valid(self, x):
+    if not self.validate_args:
+      return x
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_non_negative(
+            x,
+            message="sample must be non-negative"),
+        check_ops.assert_less_equal(
+            x, array_ops.ones([], self.concentration0.dtype),
+            message="sample must be no larger than `1`."),
+    ], x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 5251dbcb5748f75688aa43ce6e4e9dbd76be78bb..83667b0e80cfcc1c4f0617cdc739221f24439665 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -32,7 +32,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import template as template_ops
 from tensorflow.python.ops import variable_scope as variable_scope_lib
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -42,17 +42,18 @@ __all__ = [
 ]
 
 
-class MaskedAutoregressiveFlow(bijector_lib.Bijector):
+class MaskedAutoregressiveFlow(bijector.Bijector):
   """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
 
-  The affine autoregressive flow [1] provides a relatively simple framework for
-  user-specified (deep) architectures to learn a distribution over vector-valued
-  events. Regarding terminology,
+  The affine autoregressive flow [(Papamakarios et al., 2016)][3] provides a
+  relatively simple framework for user-specified (deep) architectures to learn
+  a distribution over vector-valued events. Regarding terminology,
 
     "Autoregressive models decompose the joint density as a product of
     conditionals, and model each conditional in turn. Normalizing flows
     transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
+    by an invertible transformation with tractable Jacobian."
+    [(Papamakarios et al., 2016)][3]
 
   In other words, the "autoregressive property" is equivalent to the
   decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
@@ -60,7 +61,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   this property by zeroing out weights in its `masked_dense` layers.
 
   In the `tf.distributions` framework, a "normalizing flow" is implemented as a
-  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
+  `tf.contrib.distributions.bijectors.Bijector`. The `forward` "autoregression"
   is implemented using a `tf.while_loop` and a deep neural network (DNN) with
   masked weights such that the autoregressive property is automatically met in
   the `inverse`.
@@ -75,26 +76,26 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
 
   Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
   (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
-  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
-  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
-  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
-  [below] are possible.
+  must compute each `shift` (aka `loc` or "mu" in [Germain et al. (2015)][1])
+  and `log(scale)` (aka "alpha" in [Germain et al. (2015)][1]) such that each
+  are broadcastable with the arguments to `forward` and `inverse`, i.e., such
+  that the calculations in `forward`, `inverse` [below] are possible.
 
   For convenience, `masked_autoregressive_default_template` is offered as a
   possible `shift_and_log_scale_fn` function. It implements the MADE
-  architecture [2]. MADE is a feed-forward network that computes a `shift` and
-  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
-  masked to ensure the autoregressive property. It is possible that this
-  architecture is suboptimal for your task. To build alternative networks,
-  either change the arguments to `masked_autoregressive_default_template`, use
-  the `masked_dense` function to roll-out your own, or use some other
-  architecture, e.g., using `tf.layers`.
+  architecture [(Germain et al., 2015)][1]. MADE is a feed-forward network that
+  computes a `shift` and `log(scale)` using `masked_dense` layers in a deep
+  neural network. Weights are masked to ensure the autoregressive property. It
+  is possible that this architecture is suboptimal for your task. To build
+  alternative networks, either change the arguments to
+  `masked_autoregressive_default_template`, use the `masked_dense` function to
+  roll-out your own, or use some other architecture, e.g., using `tf.layers`.
 
   Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
   enforces the "autoregressive property".
 
   Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
-  semantics, the forward transformation is,
+  semantics, the forward transformation is
 
   ```python
   def forward(x):
@@ -106,7 +107,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     return y
   ```
 
-  and the inverse transformation is,
+  and the inverse transformation is
 
   ```python
   def inverse(y):
@@ -121,7 +122,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
   also proves the transform is bijective.)
 
-  #### Example Use
+  #### Examples
 
   ```python
   tfd = tf.contrib.distributions
@@ -142,7 +143,8 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   maf.log_prob(x)   # Almost free; uses Bijector caching.
   maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
 
-  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
+  # [Papamakarios et al. (2016)][3] also describe an Inverse Autoregressive
+  # Flow [(Kingma et al., 2016)][2]:
   iaf = tfd.TransformedDistribution(
       distribution=tfd.Normal(loc=0., scale=1.),
       bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
@@ -168,14 +170,20 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
       event_shape=[dims])
   ```
 
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  #### References
 
-  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
+  [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle. MADE:
+       Masked Autoencoder for Distribution Estimation. In _International
+       Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
 
+  [2]: Diederik P. Kingma, Tim Salimans, Rafal Jozefowicz, Xi Chen, Ilya
+       Sutskever, and Max Welling. Improving Variational Inference with Inverse
+       Autoregressive Flow. In _Neural Information Processing Systems_, 2016.
+       https://arxiv.org/abs/1606.04934
+
+  [3]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   def __init__(self,
@@ -212,6 +220,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     self._shift_and_log_scale_fn = shift_and_log_scale_fn
     self._unroll_loop = unroll_loop
     super(MaskedAutoregressiveFlow, self).__init__(
+        forward_min_event_ndims=1,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -329,11 +338,7 @@ def masked_dense(inputs,
                  **kwargs):
   """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
 
-  See [1] for detailed explanation.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
+  See [Germain et al. (2015)][1] for detailed explanation.
 
   Arguments:
     inputs: Tensor input.
@@ -358,6 +363,12 @@ def masked_dense(inputs,
   Raises:
     NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
       graph execution.
+
+  #### References
+
+  [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle. MADE:
+       Masked Autoencoder for Distribution Estimation. In _International
+       Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
   """
   # TODO(b/67594795): Better support of dynamic shape.
   input_depth = inputs.shape.with_rank_at_least(1)[-1].value
@@ -398,23 +409,24 @@ def masked_autoregressive_default_template(
     name=None,
     *args,
     **kwargs):
-  """Build the MADE Model [1].
+  """Build the Masked Autoregressive Density Estimator (Germain et al., 2015).
 
   This will be wrapped in a make_template to ensure the variables are only
-  created once. It takes the input and returns the `loc` ("mu" [1]) and
-  `log_scale` ("alpha" [1]) from the MADE network.
+  created once. It takes the input and returns the `loc` ("mu" in [Germain et
+  al. (2015)][1]) and `log_scale` ("alpha" in [Germain et al. (2015)][1]) from
+  the MADE network.
 
   Warning: This function uses `masked_dense` to create randomly initialized
   `tf.Variables`. It is presumed that these will be fit, just as you would any
   other neural architecture which uses `tf.layers.dense`.
 
-  #### About Hidden Layers:
+  #### About Hidden Layers
 
   Each element of `hidden_layers` should be greater than the `input_depth`
   (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
   neural network). This is necessary to ensure the autoregressivity property.
 
-  #### About Clipping:
+  #### About Clipping
 
   This function also optionally clips the `log_scale` (but possibly not its
   gradient). This is useful because if `log_scale` is too small/large it might
@@ -427,11 +439,7 @@ def masked_autoregressive_default_template(
   `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
   `grad[clip(x)] exp(clip(x))`.
 
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
+  Args:
     hidden_layers: Python `list`-like of non-negative integer, scalars
       indicating the number of units in each hidden layer. Default: `[512, 512].
     shift_only: Python `bool` indicating if only the `shift` term shall be
@@ -450,12 +458,20 @@ def masked_autoregressive_default_template(
     **kwargs: `tf.layers.dense` keyword arguments.
 
   Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in
+      [Germain et al.  (2015)][1]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in
+      [Germain et al. (2015)][1]).
 
   Raises:
     NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
       graph execution.
+
+  #### References
+
+  [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle. MADE:
+       Masked Autoencoder for Distribution Estimation. In _International
+       Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
   """
 
   with ops.name_scope(name, "masked_autoregressive_default_template",
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f03592f314cc13e8a9ea7e2ae18c5bb1f14e74f
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -0,0 +1,125 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ordered bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "Ordered",
+]
+
+
+class Ordered(bijector.Bijector):
+  """Bijector which maps a tensor x_k that has increasing elements in the last
+  dimension to an unconstrained tensor y_k.
+
+  Both the domain and the codomain of the mapping is [-inf, inf], however,
+  the input of the forward mapping must be strictly increasing.
+  The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)`
+  gives back a sorted random vector with the same distribution `x ~ N(0, 1)`
+  where `x = sort(y)`
+
+  On the last dimension of the tensor, Ordered bijector performs:
+  `y[0] = x[0]`
+  `y[1:] = math_ops.log(x[1:] - x[:-1])`
+
+  #### Example Use:
+
+  ```python
+  bijector.Ordered().forward([2, 3, 4])
+  # Result: [2., 0., 0.]
+
+  bijector.Ordered().inverse([0.06428002, -1.07774478, -0.71530371])
+  # Result: [0.06428002, 0.40464228, 0.8936858]
+  ```
+  """
+
+  def __init__(self, validate_args=False, name="ordered"):
+    super(Ordered, self).__init__(
+        forward_min_event_ndims=1,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward_event_shape(self, input_shape):
+    if input_shape.ndims is None or input_shape[-1] is None:
+      return input_shape
+    return tensor_shape.TensorShape([input_shape[-1]])
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return (input_shape[-1])[..., array_ops.newaxis]
+
+  def _inverse_event_shape(self, output_shape):
+    if output_shape.ndims is None or output_shape[-1] is None:
+      return output_shape
+    if output_shape[-1] <= 1:
+      raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1])
+    return tensor_shape.TensorShape([output_shape[-1]])
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    if self.validate_args:
+      is_greater_one = check_ops.assert_greater(
+          output_shape[-1], 1, message="Need last dimension greater than 1.")
+      output_shape = control_flow_ops.with_dependencies(
+          [is_greater_one], output_shape)
+    return (output_shape[-1])[..., array_ops.newaxis]
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    y0 = x[..., 0, array_ops.newaxis]
+    yk = math_ops.log(x[..., 1:] - x[..., :-1])
+    y = array_ops.concat([y0, yk], axis=-1)
+    return y
+
+  def _inverse(self, y):
+    x0 = y[..., 0, array_ops.newaxis]
+    xk = math_ops.exp(y[..., 1:])
+    x = array_ops.concat([x0, xk], axis=-1)
+    return math_ops.cumsum(x, axis=-1)
+
+  def _inverse_log_det_jacobian(self, y):
+    # The Jacobian of the inverse mapping is lower
+    # triangular, with the diagonal elements being:
+    # J[i,i] = 1 if i=1, and
+    #          exp(y_i) if 1<i<=K
+    # which gives the absolute Jacobian determinant:
+    # |det(Jac)| = prod_{i=1}^{K} exp(y[i]).
+    # (1) - Stan Modeling Language User's Guide and Reference Manual
+    #       Version 2.17.0 session 35.2
+    return math_ops.reduce_sum(y[..., 1:], axis=-1)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    return -math_ops.reduce_sum(
+        math_ops.log(x[..., 1:] - x[..., :-1]),
+        axis=-1)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args:
+      return x
+    is_valid = check_ops.assert_positive(
+        x[..., 1:] - x[..., :-1],
+        message="Forward transformation input must be strictly increasing.")
+    return control_flow_ops.with_dependencies([is_valid], x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index 8654cc39d0c41ec4f1b85cd5fc4366ceaf4b224d..12a16a3f2ba3da53077307fd97d3f10d99b2c81f 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -36,7 +36,7 @@ __all__ = [
 ]
 
 
-class Permute(bijector_lib.Bijector):
+class Permute(bijector.Bijector):
   """Permutes the rightmost dimension of a `Tensor`.
 
   ```python
@@ -114,6 +114,7 @@ class Permute(bijector_lib.Bijector):
         ], permutation)
       self._permutation = permutation
       super(Permute, self).__init__(
+          forward_min_event_ndims=1,
           is_constant_jacobian=True,
           validate_args=validate_args,
           name=name or "permute")
@@ -132,7 +133,10 @@ class Permute(bijector_lib.Bijector):
         axis=-1)
 
   def _inverse_log_det_jacobian(self, y):
-    return constant_op.constant(0., dtype=y.dtype)
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
+    return constant_op.constant(0., dtype=y.dtype.base_dtype)
 
   def _forward_log_det_jacobian(self, x):
-    return constant_op.constant(0., dtype=x.dtype)
+    return constant_op.constant(0., dtype=x.dtype.base_dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
index c37db61720d10949f294ff7b2e9778ba6efa57f0..71f123f2a998458edaa9c8da07ea2932f62625ca 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
@@ -43,7 +43,6 @@ class PowerTransform(bijector.Bijector):
 
   def __init__(self,
                power=0.,
-               event_ndims=0,
                validate_args=False,
                name="power_transform"):
     """Instantiates the `PowerTransform` bijector.
@@ -51,8 +50,6 @@ class PowerTransform(bijector.Bijector):
     Args:
       power: Python `float` scalar indicating the transform power, i.e.,
         `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -70,7 +67,7 @@ class PowerTransform(bijector.Bijector):
       raise ValueError("`power` must be a non-negative TF constant.")
     self._power = power
     super(PowerTransform, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -97,18 +94,13 @@ class PowerTransform(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return (self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log(y), axis=event_dims)
+    return (self.power - 1.) * math_ops.log(y)
 
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
     if self.power == 0.:
-      return math_ops.reduce_sum(x, axis=event_dims)
-    return (1. / self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log1p(x * self.power),
-        axis=event_dims)
+      return x
+    return (1. / self.power - 1.) * math_ops.log1p(x * self.power)
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args or self.power == 0.:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
index 2840f52e742eac5e9e37a576bf7f6d6f05a07a35..66e8a5b9b356867424d1d47efaf848fc6903c371 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import template as template_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -34,11 +34,11 @@ __all__ = [
 ]
 
 
-class RealNVP(bijector_lib.Bijector):
+class RealNVP(bijector.Bijector):
   """RealNVP "affine coupling layer" for vector-valued events.
 
   Real NVP models a normalizing flow on a `D`-dimensional distribution via a
-  single `D-d`-dimensional conditional distribution [1]:
+  single `D-d`-dimensional conditional distribution [(Dinh et al., 2017)][1]:
 
   `y[d:D] = y[d:D] * math_ops.exp(log_scale_fn(y[d:D])) + shift_fn(y[d:D])`
   `y[0:d] = x[0:d]`
@@ -51,31 +51,34 @@ class RealNVP(bijector_lib.Bijector):
 
   Masking is currently only supported for base distributions with
   `event_ndims=1`. For more sophisticated masking schemes like checkerboard or
-  channel-wise masking [2], use the `tfb.Permute` bijector to re-order desired
-  masked units into the first `d` units. For base distributions with
-  `event_ndims > 1`, use the `tfb.Reshape` bijector to flatten the event shape.
-
-  Recall that the MAF bijector [2] implements a normalizing flow via an
-  autoregressive transformation. MAF and IAF have opposite computational
-  tradeoffs - MAF can train all units in parallel but must sample units
-  sequentially, while IAF must train units sequentially but can sample in
-  parallel. In contrast, Real NVP can compute both forward and inverse
-  computations in parallel. However, the lack of an autoregressive
+  channel-wise masking [(Papamakarios et al., 2016)[4], use the `tfb.Permute`
+  bijector to re-order desired masked units into the first `d` units. For base
+  distributions with `event_ndims > 1`, use the `tfb.Reshape` bijector to
+  flatten the event shape.
+
+  Recall that the MAF bijector [(Papamakarios et al., 2016)][4] implements a
+  normalizing flow via an autoregressive transformation. MAF and IAF have
+  opposite computational tradeoffs - MAF can train all units in parallel but
+  must sample units sequentially, while IAF must train units sequentially but
+  can sample in parallel. In contrast, Real NVP can compute both forward and
+  inverse computations in parallel. However, the lack of an autoregressive
   transformations makes it less expressive on a per-bijector basis.
 
   A "valid" `shift_and_log_scale_fn` must compute each `shift` (aka `loc` or
-  "mu" [2]) and `log(scale)` (aka "alpha" [2]) such that each are broadcastable
-  with the arguments to `forward` and `inverse`, i.e., such that the
-  calculations in `forward`, `inverse` [below] are possible. For convenience,
+  "mu" in [Papamakarios et al. (2016)][4]) and `log(scale)` (aka "alpha" in
+  [Papamakarios et al. (2016)][4]) such that each are broadcastable with the
+  arguments to `forward` and `inverse`, i.e., such that the calculations in
+  `forward`, `inverse` [below] are possible. For convenience,
   `real_nvp_default_nvp` is offered as a possible `shift_and_log_scale_fn`
   function.
 
-  NICE [3] is a special case of the Real NVP bijector which discards the scale
-  transformation, resulting in a constant-time inverse-log-determinant-Jacobian.
-  To use a NICE bijector instead of Real NVP, `shift_and_log_scale_fn` should
-  return `(shift, None)`, and `is_constant_jacobian` should be set to `True` in
-  the `RealNVP` constructor. Calling `real_nvp_default_template` with
-  `shift_only=True` returns one such NICE-compatible `shift_and_log_scale_fn`.
+  NICE [(Dinh et al., 2014)][2] is a special case of the Real NVP bijector
+  which discards the scale transformation, resulting in a constant-time
+  inverse-log-determinant-Jacobian. To use a NICE bijector instead of Real
+  NVP, `shift_and_log_scale_fn` should return `(shift, None)`, and
+  `is_constant_jacobian` should be set to `True` in the `RealNVP` constructor.
+  Calling `real_nvp_default_template` with `shift_only=True` returns one such
+  NICE-compatible `shift_and_log_scale_fn`.
 
   Caching: the scalar input depth `D` of the base distribution is not known at
   construction time. The first call to any of `forward(x)`, `inverse(x)`,
@@ -103,23 +106,24 @@ class RealNVP(bijector_lib.Bijector):
   nvp.log_prob(0.)
   ```
 
-  For more examples, see [4].
+  For more examples, see [Jang (2018)][3].
 
-  [1]: "Density Estimation using Real NVP."
-       Laurent Dinh, Jascha Sohl-Dickstein, Samy Bengio. ICLR. 2017.
-       https://arxiv.org/abs/1605.08803
+  #### References
 
-  [2]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  [1]: Laurent Dinh, Jascha Sohl-Dickstein, and Samy Bengio. Density Estimation
+       using Real NVP. In _International Conference on Learning
+       Representations_, 2017. https://arxiv.org/abs/1605.08803
 
-  [3]: "NICE: Non-linear Independent Components Estimation."
-       Laurent Dinh, David Krueger, Yoshua Bengio. ICLR. 2015.
-       https://arxiv.org/abs/1410.8516
+  [2]: Laurent Dinh, David Krueger, and Yoshua Bengio. NICE: Non-linear
+       Independent Components Estimation. _arXiv preprint arXiv:1410.8516_,
+       2014. https://arxiv.org/abs/1410.8516
 
-  [4]: "Normalizing Flows Tutorial, Part 2: Modern Normalizing Flows."
-       Eric Jang. Blog post. January 2018.
-       http://blog.evjang.com/2018/01/nf2.html
+  [3]: Eric Jang. Normalizing Flows Tutorial, Part 2: Modern Normalizing Flows.
+       _Technical Report_, 2018. http://blog.evjang.com/2018/01/nf2.html
+
+  [4]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   def __init__(self,
@@ -162,7 +166,7 @@ class RealNVP(bijector_lib.Bijector):
     self._input_depth = None
     self._shift_and_log_scale_fn = shift_and_log_scale_fn
     super(RealNVP, self).__init__(
-        event_ndims=1,
+        forward_min_event_ndims=1,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -220,7 +224,7 @@ class RealNVP(bijector_lib.Bijector):
     _, log_scale = self._shift_and_log_scale_fn(
         x0, self._input_depth - self._num_masked)
     if log_scale is None:
-      return constant_op.constant(0., dtype=x.dtype, name="ildj")
+      return constant_op.constant(0., dtype=x.dtype, name="fldj")
     return math_ops.reduce_sum(log_scale, axis=-1)
 
 
@@ -250,12 +254,20 @@ def real_nvp_default_template(
     **kwargs: `tf.layers.dense` keyword arguments.
 
   Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+    shift: `Float`-like `Tensor` of shift terms ("mu" in
+      [Papamakarios et al.  (2016)][1]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms ("alpha" in
+      [Papamakarios et al. (2016)][1]).
 
   Raises:
     NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
       graph execution.
+
+  #### References
+
+  [1]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   with ops.name_scope(name, "real_nvp_default_template"):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 55eca063126797d577653f0d6bcdfddf8192bdb5..5497c422e4d51e259435692dac722f801e8844ac 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -44,7 +44,7 @@ def _ndims_from_shape(shape):
   return array_ops.shape(shape)[0]
 
 
-class Reshape(bijector_lib.Bijector):
+class Reshape(bijector.Bijector):
   """Reshapes the `event_shape` of a `Tensor`.
 
   The semantics generally follow that of `tf.reshape()`, with
@@ -128,15 +128,17 @@ class Reshape(bijector_lib.Bijector):
       self._event_shape_in = event_shape_in
       self._event_shape_out = event_shape_out
 
-      super(Reshape, self).__init__(is_constant_jacobian=True,
-                                    validate_args=validate_args,
-                                    name=name or "reshape")
+      super(Reshape, self).__init__(
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name or "reshape")
 
   def _maybe_check_valid_shape(self, shape, validate_args):
     """Check that a shape Tensor is int-type and otherwise sane."""
     if not shape.dtype.is_integer:
       raise TypeError("{} dtype ({}) should be `int`-like.".format(
-          shape.op.name, shape.dtype.name))
+          shape, shape.dtype.name))
 
     assertions = []
 
@@ -144,10 +146,10 @@ class Reshape(bijector_lib.Bijector):
     ndims_ = tensor_util.constant_value(ndims)
     if ndims_ is not None and ndims_ > 1:
       raise ValueError("`{}` rank ({}) should be <= 1.".format(
-          shape.op.name, ndims_))
+          shape, ndims_))
     elif validate_args:
       assertions.append(check_ops.assert_less_equal(
-          ndims, 1, message="`{}` rank should be <= 1.".format(shape.op.name)))
+          ndims, 1, message="`{}` rank should be <= 1.".format(shape)))
 
     shape_ = tensor_util.constant_value_as_shape(shape)
     if shape_.is_fully_defined():
@@ -155,12 +157,12 @@ class Reshape(bijector_lib.Bijector):
       if sum(es == -1) > 1:
         raise ValueError(
             "`{}` must have at most one `-1` (given {})"
-            .format(shape.op.name, es))
+            .format(shape, es))
       if np.any(es < -1):
         raise ValueError(
             "`{}` elements must be either positive integers or `-1`"
             "(given {})."
-            .format(shape.op.name, es))
+            .format(shape, es))
     elif validate_args:
       assertions.extend([
           check_ops.assert_less_equal(
@@ -168,11 +170,11 @@ class Reshape(bijector_lib.Bijector):
                   math_ops.cast(math_ops.equal(shape, -1), dtypes.int32)),
               1,
               message="`{}` elements must have at most one `-1`."
-              .format(shape.op.name)),
+              .format(shape)),
           check_ops.assert_greater_equal(
               shape, -1,
               message="`{}` elements must be either positive integers or `-1`."
-              .format(shape.op.name)),
+              .format(shape)),
       ])
     return assertions
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
index a640dfe7dfbcce96261589c7fc49107deaefdd54..5df8c886315ff75cdc884e3b9b4665fb64bb109d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
@@ -33,7 +33,9 @@ class Sigmoid(bijector.Bijector):
 
   def __init__(self, validate_args=False, name="sigmoid"):
     super(Sigmoid, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
 
   def _forward(self, x):
     return math_ops.sigmoid(x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
deleted file mode 100644
index 223bc9d042c69be05b0e578835a31ed6e83c0c97..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SigmoidCentered bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
-
-
-__all__ = [
-    "SigmoidCentered",
-]
-
-
-class SigmoidCentered(softmax_centered.SoftmaxCentered):
-  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
-
-  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
-
-  See `bijector.SoftmaxCentered` for more details.
-  """
-
-  def __init__(self, validate_args=False, name="sigmoid_centered"):
-    super(SigmoidCentered, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
index 3a75e4ae9495793901b0da91a5aa3982aab35852..2a32e8abcde940b0056b0faf2955ec1b3bd71803 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -91,7 +91,6 @@ class SinhArcsinh(bijector.Bijector):
   def __init__(self,
                skewness=None,
                tailweight=None,
-               event_ndims=0,
                validate_args=False,
                name="SinhArcsinh"):
     """Instantiates the `SinhArcsinh` bijector.
@@ -101,8 +100,6 @@ class SinhArcsinh(bijector.Bijector):
         of type `float32`.
       tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
         `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -125,7 +122,9 @@ class SinhArcsinh(bijector.Bijector):
                 message="Argument tailweight was not positive")
         ], self._tailweight)
     super(SinhArcsinh, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
 
   @property
   def skewness(self):
@@ -149,31 +148,29 @@ class SinhArcsinh(bijector.Bijector):
     # dx/dy
     # = cosh(arcsinh(y) / tailweight - skewness)
     #     / (tailweight * sqrt(y**2 + 1))
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+
+    # This is computed inside the log to avoid catastrophic cancellations
+    # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+    return (
         math_ops.log(math_ops.cosh(
             math_ops.asinh(y) / self.tailweight - self.skewness)
                      # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
                      # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
                      / _sqrtx2p1(y))
-        - math_ops.log(self.tailweight),
-        axis=event_dims)
+        - math_ops.log(self.tailweight))
 
   def _forward_log_det_jacobian(self, x):
     # y = sinh((arcsinh(x) + skewness) * tailweight)
     # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
     # dy/dx
     # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+
+    # This is computed inside the log to avoid catastrophic cancellations
+    # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+    return (
         math_ops.log(math_ops.cosh(
             (math_ops.asinh(x) + self.skewness) * self.tailweight)
                      # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
                      # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
                      / _sqrtx2p1(x))
-        + math_ops.log(self.tailweight),
-        axis=event_dims)
+        + math_ops.log(self.tailweight))
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index a9dcce6c526600f3b26c6bceb730417000917ce7..f52b91550edff7390d8094a4508d862674e85d59 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -18,13 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -47,17 +42,14 @@ class SoftmaxCentered(bijector.Bijector):
   e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
   coordinate.
 
-  Because we append a coordinate, this bijector only supports `event_ndim in [0,
-  1]`, i.e., scalars and vectors.
-
   Example Use:
 
   ```python
-  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
+  bijector.SoftmaxCentered().forward(tf.log([2, 3, 4]))
   # Result: [0.2, 0.3, 0.4, 0.1]
   # Extra result: 0.1
 
-  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
+  bijector.SoftmaxCentered().inverse([0.2, 0.3, 0.4, 0.1])
   # Result: tf.log([2, 3, 4])
   # Extra coordinate removed.
   ```
@@ -69,87 +61,50 @@ class SoftmaxCentered(bijector.Bijector):
   """
 
   def __init__(self,
-               event_ndims=0,
                validate_args=False,
                name="softmax_centered"):
     self._graph_parents = []
     self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-      if event_ndims is None or event_ndims not in [0, 1]:
-        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
-    self._static_event_ndims = event_ndims
     super(SoftmaxCentered, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=1,
         validate_args=validate_args,
         name=name)
 
   def _forward_event_shape(self, input_shape):
-    if input_shape.ndims is None:
+    if input_shape.ndims is None or input_shape[-1] is None:
       return input_shape
-    if input_shape.ndims != self._static_event_ndims:
-      raise ValueError("input_shape.dims = %d != %d" %
-                       (input_shape.ndims, self._static_event_ndims))
-    if input_shape.ndims == 0:
-      return tensor_shape.TensorShape([2])
-    if input_shape.ndims == 1:
-      return tensor_shape.TensorShape(input_shape[0] + 1)
-    # Unreachable code:
-    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
+    return tensor_shape.TensorShape([input_shape[-1] + 1])
 
   def _forward_event_shape_tensor(self, input_shape):
-    ndims = array_ops.shape(input_shape)
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_zero_or_one = check_ops.assert_equal(
-          ndims, 0 if self._static_event_ndims == 0 else 1,
-          message="event_ndims must be 0 or 1")
-      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor(
-          [2], dtype=dtypes.int32, name="output_shape")
-    return input_shape + 1
+    return (input_shape[-1] + 1)[..., array_ops.newaxis]
 
   def _inverse_event_shape(self, output_shape):
-    if output_shape.ndims is None:
+    if output_shape.ndims is None or output_shape[-1] is None:
       return output_shape
-    if output_shape.ndims != 1:
-      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
-    if self._static_event_ndims == 0:
-      return tensor_shape.TensorShape([])
-    return tensor_shape.TensorShape(output_shape[0] - 1)
+    if output_shape[-1] <= 1:
+      raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1])
+    return tensor_shape.TensorShape([output_shape[-1] - 1])
 
   def _inverse_event_shape_tensor(self, output_shape):
-    ndims = array_ops.shape(output_shape)[0]
     if self.validate_args:
       # It is not possible for a negative shape so we need only check <= 1.
-      is_one = check_ops.assert_equal(
-          ndims, 1, message="event_ndims must be 1")
-      ndims = control_flow_ops.with_dependencies([is_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
-    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
+      is_greater_one = check_ops.assert_greater(
+          output_shape[-1], 1, message="Need last dimension greater than 1.")
+      output_shape = control_flow_ops.with_dependencies(
+          [is_greater_one], output_shape)
+    return (output_shape[-1] - 1)[..., array_ops.newaxis]
 
   def _forward(self, x):
     # Pad the last dim with a zeros vector. We need this because it lets us
     # infer the scale in the inverse function.
-    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
-    y = distribution_util.pad(y, axis=-1, back=True)
+    y = distribution_util.pad(x, axis=-1, back=True)
 
     # Set shape hints.
     if x.shape.ndims is not None:
-      shape = x.shape.as_list()
-      if self._static_event_ndims == 0:
-        shape += [2]
-      elif shape[-1] is not None:
-        shape[-1] += 1
-      shape = tensor_shape.TensorShape(shape)
+      shape = x.shape[:-1].concatenate(x.shape[-1] + 1)
       y.shape.assert_is_compatible_with(shape)
       y.set_shape(shape)
 
-    # Since we only support event_ndims in [0, 1] and we do padding, we always
-    # reduce over the last dimension, i.e., dim=-1 (which is the default).
     return nn_ops.softmax(y)
 
   def _inverse(self, y):
@@ -161,42 +116,17 @@ class SoftmaxCentered(bijector.Bijector):
     # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
     #      = log(exp(x[i])/normalization) - log(y[end])
     #      = log(y[i]) - log(y[end])
-    shape = (np.asarray(y.shape.as_list(), dtype=np.int32)
-             if y.shape.is_fully_defined()
-             else array_ops.shape(y, name="shape"))
-    ndims = distribution_util.prefer_static_rank(y)
 
     # Do this first to make sure CSE catches that it'll happen again in
     # _inverse_log_det_jacobian.
     x = math_ops.log(y)
 
-    # We now extract the last coordinate of the rightmost dimension.
-    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
-    begin = array_ops.one_hot(indices=ndims-1,
-                              depth=ndims,
-                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
-                              dtype=shape.dtype)
-    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
-    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
-
-    # Here we slice out all but the last coordinate; see above for idea.
-    begin = array_ops.zeros_like(shape)
-    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
-    x = array_ops.strided_slice(x, begin, begin + size)
-
-    x += log_normalization
-
-    if self._static_event_ndims == 0:
-      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
+    log_normalization = (-x[..., -1])[..., array_ops.newaxis]
+    x = x[..., :-1] + log_normalization
 
     # Set shape hints.
     if y.shape.ndims is not None:
-      shape = y.shape.as_list()
-      if self._static_event_ndims == 0:
-        shape = shape[:-1]
-      elif shape[-1] is not None:
-        shape[-1] -= 1
-      shape = tensor_shape.TensorShape(shape)
+      shape = y.shape[:-1].concatenate(y.shape[-1] - 1)
       x.shape.assert_is_compatible_with(shape)
       x.set_shape(shape)
 
@@ -222,19 +152,14 @@ class SoftmaxCentered(bijector.Bijector):
     return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
 
   def _forward_log_det_jacobian(self, x):
-    if self._static_event_ndims == 0:
-      return x - 2. * nn_ops.softplus(x)
-    else:
-      # This code is similar to nn_ops.log_softmax but different because we have
-      # an implicit zero column to handle. I.e., instead of:
-      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
-      # we must do:
-      #   log_normalization = 1 + reduce_sum(exp(logits))
-      #   -log_normalization + reduce_sum(logits - log_normalization)
-      log_normalization = nn_ops.softplus(
-          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-      fldj = (-log_normalization +
-              math_ops.reduce_sum(x - log_normalization,
-                                  axis=-1,
-                                  keep_dims=True))
-      return array_ops.squeeze(fldj, squeeze_dims=-1)
+    # This code is similar to nn_ops.log_softmax but different because we have
+    # an implicit zero column to handle. I.e., instead of:
+    #   reduce_sum(logits - reduce_sum(exp(logits), dim))
+    # we must do:
+    #   log_normalization = 1 + reduce_sum(exp(logits))
+    #   -log_normalization + reduce_sum(logits - log_normalization)
+    log_normalization = nn_ops.softplus(
+        math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+    return array_ops.squeeze(
+        (-log_normalization + math_ops.reduce_sum(
+            x - log_normalization, axis=-1, keepdims=True)), axis=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
index 81957fcf78922fa15fd20a25d144071f431161ae..96a938c803418ff818f9c531754b47ba1eb8667a 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
@@ -62,7 +62,7 @@ class Softplus(bijector.Bijector):
     ```python
     # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
     # batch ndim and 2 event ndims (i.e., vector of matrices).
-    softplus = Softplus(event_ndims=2)
+    softplus = Softplus()
     x = [[[1., 2],
           [3, 4]],
          [[5, 6],
@@ -81,7 +81,6 @@ class Softplus(bijector.Bijector):
               "Nonzero floating point `Tensor`.  Controls the softness of what "
               "would otherwise be a kink at the origin.  Default is 1.0")})
   def __init__(self,
-               event_ndims=0,
                hinge_softness=None,
                validate_args=False,
                name="softplus"):
@@ -101,7 +100,7 @@ class Softplus(bijector.Bijector):
             [nonzero_check], self.hinge_softness)
 
     super(Softplus, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -130,14 +129,12 @@ class Softplus(bijector.Bijector):
     # 1 - exp{-Y} approx Y.
     if self.hinge_softness is not None:
       y /= math_ops.cast(self.hinge_softness, y.dtype)
-    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
-                                axis=self._event_dims_tensor(y))
+    return -math_ops.log(-math_ops.expm1(-y))
 
   def _forward_log_det_jacobian(self, x):
     if self.hinge_softness is not None:
       x /= math_ops.cast(self.hinge_softness, x.dtype)
-    return -math_ops.reduce_sum(nn_ops.softplus(-x),
-                                axis=self._event_dims_tensor(x))
+    return -nn_ops.softplus(-x)
 
   @property
   def hinge_softness(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py b/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4a658c171b8313358754228aabbfa4bf93fd84d
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py
@@ -0,0 +1,86 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Softsign bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "Softsign",
+]
+
+
+class Softsign(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = X / (1 + |X|)`.
+
+  The softsign `Bijector` has the following two useful properties:
+
+  * The domain is all real numbers
+  * `softsign(x) approx sgn(x)`, for large `|x|`.
+
+  #### Examples
+
+  ```python
+  # Create the Y = softsign(X) transform.
+  softsign = Softsign()
+  x = [[[1., 2],
+        [3, 4]],
+       [[5, 6],
+        [7, 8]]]
+  x / (1 + abs(x)) == softsign.forward(x)
+  x / (1 - abs(x)) == softsign.inverse(x)
+  ```
+  """
+
+  def __init__(self, validate_args=False, name="softsign"):
+    super(Softsign, self).__init__(
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    return x / (1. + math_ops.abs(x))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return y / (1. - math_ops.abs(y))
+
+  def _forward_log_det_jacobian(self, x):
+    return -2. * math_ops.log1p(math_ops.abs(x))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return -2. * math_ops.log1p(-math_ops.abs(y))
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_valid = [
+        check_ops.assert_greater(
+            y, math_ops.cast(-1., dtype=y.dtype.base_dtype),
+            message="Inverse transformation input must be greater than -1."),
+        check_ops.assert_less(
+            y, math_ops.cast(1., dtype=y.dtype.base_dtype),
+            message="Inverse transformation input must be less than 1.")
+    ]
+
+    return control_flow_ops.with_dependencies(is_valid, y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/square.py b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ccfdc95970e387e708603e2614ad29fb6a18db3
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
@@ -0,0 +1,84 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Square bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "Square",
+]
+
+
+class Square(bijector.Bijector):
+  """Compute `g(X) = X^2`; X is a positive real number.
+
+  g is a bijection between the non-negative real numbers (R_+) and the
+  non-negative real numbers.
+
+  #### Examples
+
+  ```python
+  bijector.Square().forward(x=[[1., 0], [2, 1]])
+  # Result: [[1., 0], [4, 1]], i.e., x^2
+
+  bijector.Square().inverse(y=[[1., 4], [9, 1]])
+  # Result: [[1., 2], [3, 1]], i.e., sqrt(y).
+  ```
+
+  """
+
+  def __init__(self, validate_args=False, name="square"):
+    """Instantiates the `Square` bijector.
+
+    Args:
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._name = name
+    super(Square, self).__init__(
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid(x)
+    return math_ops.square(x)
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid(y)
+    return math_ops.sqrt(y)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid(x)
+    return np.log(2.) + math_ops.log(x)
+
+  def _maybe_assert_valid(self, t):
+    if not self.validate_args:
+      return t
+    is_valid = check_ops.assert_non_negative(
+        t, message="All elements must be non-negative.")
+    return control_flow_ops.with_dependencies([is_valid], t)
+
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index 00520bcda85e9527767e6342bf75f10667c264a8..a22560fe80298b762795e7b0e7aea2db55823065 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -50,7 +50,6 @@ class Weibull(bijector.Bijector):
   def __init__(self,
                scale=1.,
                concentration=1.,
-               event_ndims=0,
                validate_args=False,
                name="weibull"):
     """Instantiates the `Weibull` bijector.
@@ -62,8 +61,6 @@ class Weibull(bijector.Bijector):
       concentration: Positive Float-type `Tensor` that is the same dtype and is
         broadcastable with `scale`.
         This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -89,7 +86,7 @@ class Weibull(bijector.Bijector):
         ], self._concentration)
 
     super(Weibull, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -113,29 +110,25 @@ class Weibull(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
+    return (
         -math_ops.log1p(-y) +
         (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
-        math_ops.log(self.scale / self.concentration),
-        axis=event_dims)
+        math_ops.log(self.scale / self.concentration))
 
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
+    return (
         -(x / self.scale) ** self.concentration +
         (self.concentration - 1) * math_ops.log(x) +
         math_ops.log(self.concentration) +
-        -self.concentration * math_ops.log(self.scale),
-        axis=event_dims)
+        -self.concentration * math_ops.log(self.scale))
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args:
       return x
     is_valid = check_ops.assert_non_negative(
         x,
-        message="Forward transformation input must be at least {}.".format(0))
+        message="Forward transformation input must be at least 0.")
     return control_flow_ops.with_dependencies([is_valid], x)
 
   def _maybe_assert_valid_y(self, y):
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index 6a1bb39ab28218a411bdf4329965186bcf32bf30..12d16031783b78dc3ea6273af77c1eaeb77ca94e 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -164,7 +164,7 @@ class Binomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = self._maybe_assert_valid_total_count(
           ops.convert_to_tensor(total_count, name="total_count"),
           validate_args)
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
index 6f5d724a2a945ed8f9c159d8314327c6f994d1db..daacfe657fe154dce8d0db98894fe8b73546c476 100644
--- a/tensorflow/contrib/distributions/python/ops/cauchy.py
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -121,7 +121,7 @@ class Cauchy(distribution.Distribution):
       TypeError: if `loc` and `scale` have different `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)]
                                     if validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index bdd5571c966a74e58e4f9f8eed2628f131a1b92e..c77c5fd20895a6220604d76a95a152a22cd3d914 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import gamma
 
@@ -86,8 +88,12 @@ class Chi2(gamma.Gamma):
     # not true in the parent class "gamma."  therefore, passing
     # allow_nan_stats=True
     # through to the parent class results in unnecessary asserts.
-    with ops.name_scope(name, values=[df]):
-      self._df = ops.convert_to_tensor(df, name="df")
+    with ops.name_scope(name, values=[df]) as name:
+      with ops.control_dependencies([
+          check_ops.assert_positive(df),
+      ] if validate_args else []):
+        self._df = array_ops.identity(df, name="df")
+
       super(Chi2, self).__init__(
           concentration=0.5 * self._df,
           rate=constant_op.constant(0.5, dtype=self._df.dtype),
@@ -114,7 +120,7 @@ class Chi2WithAbsDf(Chi2):
                allow_nan_stats=True,
                name="Chi2WithAbsDf"):
     parameters = locals()
-    with ops.name_scope(name, values=[df]):
+    with ops.name_scope(name, values=[df]) as name:
       super(Chi2WithAbsDf, self).__init__(
           df=math_ops.floor(
               math_ops.abs(df, name="abs_df"),
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 1d4c5660d8d73b7b6a7e758fc834ccfddeb5c8ea..10b45361358b40a3c8fd725f27ad84ef9b8a37f5 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.contrib.distributions.python.ops import conditional_distribution
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import transformed_distribution
@@ -105,7 +106,9 @@ class ConditionalTransformedDistribution(
     bijector_kwargs = bijector_kwargs or {}
     distribution_kwargs = distribution_kwargs or {}
     x = self.bijector.inverse(y, **bijector_kwargs)
-    ildj = self.bijector.inverse_log_det_jacobian(y, **bijector_kwargs)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(
+        y, event_ndims=event_ndims, **bijector_kwargs)
     if self.bijector._is_injective:  # pylint: disable=protected-access
       return self._finish_log_prob_for_one_fiber(y, x, ildj,
                                                  distribution_kwargs)
@@ -128,7 +131,9 @@ class ConditionalTransformedDistribution(
     bijector_kwargs = bijector_kwargs or {}
     distribution_kwargs = distribution_kwargs or {}
     x = self.bijector.inverse(y, **bijector_kwargs)
-    ildj = self.bijector.inverse_log_det_jacobian(y, **bijector_kwargs)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(
+        y, event_ndims=event_ndims, **bijector_kwargs)
     if self.bijector._is_injective:  # pylint: disable=protected-access
       return self._finish_prob_for_one_fiber(y, x, ildj, distribution_kwargs)
 
@@ -214,3 +219,15 @@ class ConditionalTransformedDistribution(
     # implies the qth quantile of Y is g(x_q).
     inv_cdf = self.distribution.quantile(value, **distribution_kwargs)
     return self.bijector.forward(inv_cdf, **bijector_kwargs)
+
+  def _maybe_get_event_ndims_statically(self):
+    if self.event_shape.ndims is not None:
+      return self.event_shape.ndims
+
+    event_ndims = array_ops.size(self.event_shape_tensor())
+    static_event_ndims = tensor_util.constant_value(event_ndims)
+
+    if static_event_ndims is not None:
+      return static_event_ndims
+
+    return event_ndims
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 8049522e9f5dc26b244b7e710a9ae8b981efd6b6..a42350430e98515e521ce357bf5a87ff2daefedc 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -87,7 +87,7 @@ class _BaseDeterministic(distribution.Distribution):
       ValueError:  If `loc` is a scalar.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, atol, rtol]):
+    with ops.name_scope(name, values=[loc, atol, rtol]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       if is_vector and validate_args:
         msg = "Argument loc must be at least rank 1."
diff --git a/tensorflow/contrib/distributions/python/ops/estimator.py b/tensorflow/contrib/distributions/python/ops/estimator.py
index 6b53338c4542c75d3977c075b7750c780080ac48..98edd337fe02ffbf53c6ecd9ebda9424231ea2fe 100644
--- a/tensorflow/contrib/distributions/python/ops/estimator.py
+++ b/tensorflow/contrib/distributions/python/ops/estimator.py
@@ -75,7 +75,7 @@ def estimator_head_distribution_regression(make_distribution_fn,
 
 
 class _DistributionRegressionHead(_RegressionHead):
-  """Creates a _RegressionHead instance from an arbitray `Distribution`."""
+  """Creates a _RegressionHead instance from an arbitrary `Distribution`."""
 
   def __init__(self,
                make_distribution_fn,
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index 8f190e48a7148d84082d73771cba4660a1a0d221..53dd42f4c83fcea0ec5b1374c8e3109ebe1dd127 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -86,7 +86,7 @@ class Geometric(distribution.Distribution):
     """
 
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index d0efaefb8e78ddf4436e9e5a112d2c1cdddaf3b5..2c261073ee16462599740cb241108bfe08c773ec 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -125,7 +125,7 @@ class _Gumbel(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -190,9 +190,6 @@ class _Gumbel(distribution.Distribution):
   def _log_prob(self, x):
     return self._log_unnormalized_prob(x) - self._log_normalization()
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   def _log_cdf(self, x):
     return -math_ops.exp(-self._z(x))
 
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
index fc0751a6e0b78cb3d79bd3478e740bb05cd26428..d0df2befd6e46ca93e5a0b5d1cb5407d6719c7f2 100644
--- a/tensorflow/contrib/distributions/python/ops/half_normal.py
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -106,7 +106,7 @@ class HalfNormal(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._scale = array_ops.identity(scale, name="scale")
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index cbce005013281ff3c58c94d525d5ce7a865d725a..fbde55ef310de1d926b8ddd503499fbed4809373 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 
 
 class Independent(distribution_lib.Distribution):
@@ -35,7 +36,7 @@ class Independent(distribution_lib.Distribution):
 
   This distribution is useful for regarding a collection of independent,
   non-identical distributions as a single random variable. For example, the
-  `Indpendent` distribution composed of a collection of `Bernoulli`
+  `Independent` distribution composed of a collection of `Bernoulli`
   distributions might define a distribution over an image (where each
   `Bernoulli` is a distribution over each pixel).
 
@@ -118,7 +119,7 @@ class Independent(distribution_lib.Distribution):
     parameters = locals()
     name = name or "Independent" + distribution.name
     self._distribution = distribution
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       if reinterpreted_batch_ndims is None:
         reinterpreted_batch_ndims = self._get_default_reinterpreted_batch_ndims(
             distribution)
@@ -254,3 +255,58 @@ class Independent(distribution_lib.Distribution):
     else:
       which_maximum = np.maximum
     return which_maximum(0, ndims - 1)
+
+
+@kullback_leibler.RegisterKL(Independent, Independent)
+def _kl_independent(a, b, name="kl_independent"):
+  """Batched KL divergence `KL(a || b)` for Independent distributions.
+
+  We can leverage the fact that
+  ```
+  KL(Independent(a) || Independent(b)) = sum(KL(a || b))
+  ```
+  where the sum is over the `reinterpreted_batch_ndims`.
+
+  Args:
+    a: Instance of `Independent`.
+    b: Instance of `Independent`.
+    name: (optional) name to use for created ops. Default "kl_independent".
+
+  Returns:
+    Batchwise `KL(a || b)`.
+
+  Raises:
+    ValueError: If the event space for `a` and `b`, or their underlying
+      distributions don't match.
+  """
+  p = a.distribution
+  q = b.distribution
+
+  # The KL between any two (non)-batched distributions is a scalar.
+  # Given that the KL between two factored distributions is the sum, i.e.
+  # KL(p1(x)p2(y) || q1(x)q2(y)) = KL(p1 || q1) + KL(q1 || q2), we compute
+  # KL(p || q) and do a `reduce_sum` on the reinterpreted batch dimensions.
+  if a.event_shape.is_fully_defined() and b.event_shape.is_fully_defined():
+    if a.event_shape == b.event_shape:
+      if p.event_shape == q.event_shape:
+        num_reduce_dims = a.event_shape.ndims - p.event_shape.ndims
+        reduce_dims = [-i - 1 for i in range(0, num_reduce_dims)]
+
+        return math_ops.reduce_sum(
+            kullback_leibler.kl_divergence(p, q, name=name), axis=reduce_dims)
+      else:
+        raise NotImplementedError("KL between Independents with different "
+                                  "event shapes not supported.")
+    else:
+      raise ValueError("Event shapes do not match.")
+  else:
+    with ops.control_dependencies([
+        check_ops.assert_equal(a.event_shape_tensor(), b.event_shape_tensor()),
+        check_ops.assert_equal(p.event_shape_tensor(), q.event_shape_tensor())
+    ]):
+      num_reduce_dims = (
+          array_ops.shape(a.event_shape_tensor()[0]) -
+          array_ops.shape(p.event_shape_tensor()[0]))
+      reduce_dims = math_ops.range(-num_reduce_dims - 1, -1, 1)
+      return math_ops.reduce_sum(
+          kullback_leibler.kl_divergence(p, q, name=name), axis=reduce_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index ee4d86867d48b20e97757bcec57d452085814b80..502bd4f493337bab180129cd0ddfaf5a76a0ca4e 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -126,7 +126,7 @@ class InverseGamma(distribution.Distribution):
       TypeError: if `concentration` and `rate` are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
           check_ops.assert_positive(rate),
@@ -192,12 +192,6 @@ class InverseGamma(distribution.Distribution):
   def _log_prob(self, x):
     return self._log_unnormalized_prob(x) - self._log_normalization()
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
-  def _log_cdf(self, x):
-    return math_ops.log(self._cdf(x))
-
   def _cdf(self, x):
     x = self._maybe_assert_valid_sample(x)
     # Note that igammac returns the upper regularized incomplete gamma
@@ -287,7 +281,7 @@ class InverseGammaWithSoftplusConcentrationRate(InverseGamma):
                allow_nan_stats=True,
                name="InverseGammaWithSoftplusConcentrationRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       super(InverseGammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
                                     name="softplus_concentration"),
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
index 74d5d8773cf3e69a52554c87d656fea2835c8354..66682b2ff5493f8565410138e770b45ffc6b5d77 100644
--- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -20,15 +20,17 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
-from tensorflow.python.ops.distributions import beta
 from tensorflow.python.ops.distributions import distribution
-from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import uniform
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -42,25 +44,23 @@ _kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in
 def _harmonic_number(x):
   """Compute the harmonic number from its analytic continuation.
 
-  Derivation from [1] and Euler's constant [2].
-  [1] -
-  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers
-  [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant
-
+  Derivation from [here](
+  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers)
+  and [Euler's constant](
+  https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant).
 
   Args:
     x: input float.
 
   Returns:
     z: The analytic continuation of the harmonic number for the input.
-
   """
   one = array_ops.ones([], dtype=x.dtype)
   return math_ops.digamma(x + one) - math_ops.digamma(one)
 
 
 @tf_export("distributions.Kumaraswamy")
-class Kumaraswamy(beta.Beta):
+class Kumaraswamy(transformed_distribution.TransformedDistribution):
   """Kumaraswamy distribution.
 
   The Kumaraswamy distribution is defined over the `(0, 1)` interval using
@@ -151,59 +151,33 @@ class Kumaraswamy(beta.Beta):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
+    with ops.name_scope(name, values=[concentration1, concentration0]) as name:
+      concentration1 = ops.convert_to_tensor(
+          concentration1, name="concentration1")
+      concentration0 = ops.convert_to_tensor(
+          concentration0, name="concentration0")
     super(Kumaraswamy, self).__init__(
-        concentration1=concentration1,
-        concentration0=concentration0,
-        validate_args=validate_args,
-        allow_nan_stats=allow_nan_stats,
+        distribution=uniform.Uniform(
+            low=array_ops.zeros([], dtype=concentration1.dtype),
+            high=array_ops.ones([], dtype=concentration1.dtype),
+            allow_nan_stats=allow_nan_stats),
+        bijector=bijectors.Kumaraswamy(
+            concentration1=concentration1, concentration0=concentration0,
+            validate_args=validate_args),
+        batch_shape=distribution_util.get_broadcast_shape(
+            concentration1, concentration0),
         name=name)
     self._reparameterization_type = distribution.FULLY_REPARAMETERIZED
 
-  def _sample_n(self, n, seed=None):
-    expanded_concentration1 = array_ops.ones_like(
-        self.total_concentration, dtype=self.dtype) * self.concentration1
-    expanded_concentration0 = array_ops.ones_like(
-        self.total_concentration, dtype=self.dtype) * self.concentration0
-    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
-    uniform_sample = random_ops.random_uniform(
-        shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed)
-
-    kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**(
-        1. / expanded_concentration1)
-    return kumaraswamy_sample
-
-  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
-  def _log_cdf(self, x):
-    a = self.concentration1
-    b = self.concentration0
-    return math_ops.log1p(-(1 - x**a)**b)
+  @property
+  def concentration1(self):
+    """Concentration parameter associated with a `1` outcome."""
+    return self.bijector.concentration1
 
-  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
-  def _cdf(self, x):
-    a = self.concentration1
-    b = self.concentration0
-    return 1 - (1 - x**a)**b
-
-  def _survival_function(self, x):
-    a = self.concentration1
-    b = self.concentration0
-    return (1 - x**a)**b
-
-  def _log_survival_function(self, x):
-    a = self.concentration1
-    b = self.concentration0
-    return b * math_ops.log1p(-x**a)
-
-  def _log_unnormalized_prob(self, x):
-    x = self._maybe_assert_valid_sample(x)
-    a = self.concentration1
-    b = self.concentration0
-    return (a - 1) * math_ops.log(x) + (b - 1) * math_ops.log1p(-x**a)
-
-  def _log_normalization(self):
-    a = self.concentration1
-    b = self.concentration0
-    return -(math_ops.log(a) + math_ops.log(b))
+  @property
+  def concentration0(self):
+    """Concentration parameter associated with a `0` outcome."""
+    return self.bijector.concentration0
 
   def _entropy(self):
     a = self.concentration1
@@ -213,10 +187,11 @@ class Kumaraswamy(beta.Beta):
 
   def _moment(self, n):
     """Compute the n'th (uncentered) moment."""
+    total_concentration = self.concentration1 + self.concentration0
     expanded_concentration1 = array_ops.ones_like(
-        self.total_concentration, dtype=self.dtype) * self.concentration1
+        total_concentration, dtype=self.dtype) * self.concentration1
     expanded_concentration0 = array_ops.ones_like(
-        self.total_concentration, dtype=self.dtype) * self.concentration0
+        total_concentration, dtype=self.dtype) * self.concentration0
     beta_arg0 = 1 + n / expanded_concentration1
     beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1)
     log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta(
@@ -246,13 +221,14 @@ class Kumaraswamy(beta.Beta):
           name="nan")
       is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.)
       return array_ops.where(is_defined, mode, nan)
+
     return control_flow_ops.with_dependencies([
         check_ops.assert_less(
-            array_ops.ones([], dtype=self.dtype),
+            array_ops.ones([], dtype=self.concentration1.dtype),
             self.concentration1,
             message="Mode undefined for concentration1 <= 1."),
         check_ops.assert_less(
-            array_ops.ones([], dtype=self.dtype),
+            array_ops.ones([], dtype=self.concentration0.dtype),
             self.concentration0,
             message="Mode undefined for concentration0 <= 1.")
     ], mode)
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 473677f8d91b184e029f345bb05f5c5d63df7a40..c83b5bc2e3a8c56f5c52d063a7d0d399be1c1870 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -120,7 +120,7 @@ class Logistic(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -185,9 +185,6 @@ class Logistic(distribution.Distribution):
   def _log_prob(self, x):
     return self._log_unnormalized_prob(x) - self._log_normalization()
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   def _log_cdf(self, x):
     return -nn_ops.softplus(-self._z(x))
 
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index cef6a143fc615901315a3780bf4ed53b8c7cd177..2ef294af2e8bc9beff735ec2e0fd6b619ce96176 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -145,7 +145,7 @@ class Mixture(distribution.Distribution):
           "none of the components provide a static number of ndims")
 
     # Ensure that all batch and event ndims are consistent.
-    with ops.name_scope(name, values=[cat.logits]):
+    with ops.name_scope(name, values=[cat.logits]) as name:
       num_components = cat.event_size
       static_num_components = tensor_util.constant_value(num_components)
       if static_num_components is None:
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index b93bdc5ab4010663baddda1410b302644853648b..0b1301e551728f74bb0048d2dcf3c356ae110c75 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -131,7 +131,7 @@ class MixtureSameFamily(distribution.Distribution):
         `components_distribution` rightmost batch shape.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._mixture_distribution = mixture_distribution
       self._components_distribution = components_distribution
       self._runtime_assertions = []
diff --git a/tensorflow/contrib/distributions/python/ops/moving_stats.py b/tensorflow/contrib/distributions/python/ops/moving_stats.py
index 20f85643b9e7db61b4786dffe4115c7d3c00b046..87d40805a3c7a9c2871305af7f7182b7e2923530 100644
--- a/tensorflow/contrib/distributions/python/ops/moving_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/moving_stats.py
@@ -47,9 +47,7 @@ def assign_moving_mean_variance(
   Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
   the lag-1 mean.
 
-  For derivation justification, see equation 143 of:
-    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
-    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+  For derivation justification, see [Finch (2009; Eq. 143)][1].
 
   Args:
     mean_var: `float`-like `Variable` representing the exponentially weighted
@@ -72,6 +70,12 @@ def assign_moving_mean_variance(
     TypeError: if `mean_var` does not have float type `dtype`.
     TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
       `base_dtype`.
+
+  #### References
+
+  [1]: Tony Finch. Incremental calculation of weighted mean and variance.
+       _Technical Report_, 2009.
+       http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
   """
   with ops.name_scope(name, "assign_moving_mean_variance",
                       [variance_var, mean_var, value, decay]):
@@ -183,9 +187,7 @@ def moving_mean_variance(value, decay, collections=None, name=None):
   Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
   the lag-`1` mean.
 
-  For derivation justification, see equation 143 of:
-    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
-    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+  For derivation justification, see [Finch (2009; Eq. 143)][1].
 
   Unlike `assign_moving_mean_variance`, this function handles
   variable creation.
@@ -208,6 +210,12 @@ def moving_mean_variance(value, decay, collections=None, name=None):
   Raises:
     TypeError: if `value_var` does not have float type `dtype`.
     TypeError: if `value`, `decay` have different `base_dtype`.
+
+  #### References
+
+  [1]: Tony Finch. Incremental calculation of weighted mean and variance.
+       _Technical Report_, 2009.
+       http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
   """
   if collections is None:
     collections = [ops.GraphKeys.GLOBAL_VARIABLES]
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index e862552880f4073c8fa8e90134d0633e7484b0bf..e3236c2db93695a5e007bba9a1414773f3935f2e 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -194,7 +194,7 @@ class MultivariateNormalDiag(
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
         # No need to validate_args while making diag_scale.  The returned
@@ -225,7 +225,7 @@ class MultivariateNormalDiagWithSoftplusScale(MultivariateNormalDiag):
                allow_nan_stats=True,
                name="MultivariateNormalDiagWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[scale_diag]):
+    with ops.name_scope(name, values=[scale_diag]) as name:
       super(MultivariateNormalDiagWithSoftplusScale, self).__init__(
           loc=loc,
           scale_diag=nn.softplus(scale_diag),
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 413e88f03ae0286c294f3404549a73e1a47dcff7..2f6a6f198cbcfbdcbd0993d3074ddde1c389585f 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -218,7 +218,7 @@ class MultivariateNormalDiagPlusLowRank(
     parameters = locals()
     def _convert_to_tensor(x, name):
       return None if x is None else ops.convert_to_tensor(x, name=name)
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier, scale_perturb_factor,
           scale_perturb_diag]):
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index 4bea99fbb75349f97fde473cb5716fe6c426ce90..5d06a396fe7a3b87cabb9c3081da45246854089f 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -45,7 +45,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   The probability density function (pdf) is, with `@` as matrix multiplication,
 
   ```none
-  pdf(x; loc, covariance_matrix) = exp(-0.5 ||y||**2) / Z,
+  pdf(x; loc, covariance_matrix) = exp(-0.5 y) / Z,
   y = (x - loc)^T @ inv(covariance_matrix) @ (x - loc)
   Z = (2 pi)**(0.5 k) |det(covariance_matrix)|**(0.5).
   ```
@@ -54,8 +54,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
 
   * `loc` is a vector in `R^k`,
   * `covariance_matrix` is an `R^{k x k}` symmetric positive definite matrix,
-  * `Z` denotes the normalization constant, and,
-  * `||y||**2` denotes the squared Euclidean norm of `y`.
+  * `Z` denotes the normalization constant.
 
   Additional leading dimensions (if any) in `loc` and `covariance_matrix` allow
   for batch dimensions.
@@ -159,7 +158,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
     parameters = locals()
 
     # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[loc, covariance_matrix]):
         if covariance_matrix is None:
           scale_tril = None
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index a7399792892f4c179c05168184d76ec95c168b51..44c92312c7dc758500051f89923ec9fafe850c0e 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -176,7 +176,7 @@ class MultivariateNormalLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    with ops.name_scope(name, values=[loc] + scale.graph_parents) as name:
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index 6c7dc4ca7aaf5b3a20b072e9360d15528ad10556..d6f8b731cbeed5fed3b43365e7c668d0434a267e 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -184,7 +184,7 @@ class MultivariateNormalTriL(
       return None if x is None else ops.convert_to_tensor(x, name=name)
     if loc is None and scale_tril is None:
       raise ValueError("Must specify one or both of `loc`, `scale_tril`.")
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[loc, scale_tril]):
         loc = _convert_to_tensor(loc, name="loc")
         scale_tril = _convert_to_tensor(scale_tril, name="scale_tril")
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index 3a58df80da6c02b056f5e5a63bf41de5fc6d44a4..eeaf9c0a5ebc1323e137ff73f82588f6907031c7 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -91,7 +91,7 @@ class NegativeBinomial(distribution.Distribution):
     """
 
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
       with ops.control_dependencies(
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index b76cebf79fad09ebec68f2459c6fe80794ea81c0..305b138fdc2318523ee078195213caf865d96b4d 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -52,7 +52,7 @@ class OneHotCategorical(distribution.Distribution):
 
   #### Examples
 
-  Creates a 3-class distiribution, with the 2nd class, the most likely to be
+  Creates a 3-class distribution, with the 2nd class, the most likely to be
   drawn from.
 
   ```python
@@ -60,7 +60,7 @@ class OneHotCategorical(distribution.Distribution):
   dist = OneHotCategorical(probs=p)
   ```
 
-  Creates a 3-class distiribution, with the 2nd class the most likely to be
+  Creates a 3-class distribution, with the 2nd class the most likely to be
   drawn from, using logits.
 
   ```python
@@ -116,7 +116,7 @@ class OneHotCategorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
           multidimensional=True)
@@ -203,9 +203,6 @@ class OneHotCategorical(distribution.Distribution):
     ret = array_ops.reshape(ret, logits_shape)
     return ret
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   def _entropy(self):
     return -math_ops.reduce_sum(
         nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index e967dcc90d0712ffc346fb61ee67c44a6d9207cb..a84aad6fc9372395ac021fa3aa006ddf9272e6a9 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -35,9 +35,15 @@ __all__ = [
 
 
 _poisson_sample_note = """
-Note that the input value must be a non-negative floating point tensor with
-dtype `dtype` and whose shape can be broadcast with `self.rate`. `x` is only
-legal if it is non-negative and its components are equal to integer values.
+The Poisson distribution is technically only defined for non-negative integer
+values. When `validate_args=False`, non-integral inputs trigger an assertion.
+
+When `validate_args=False` calculations are otherwise unchanged despite
+integral or non-integral inputs.
+
+When `validate_args=False`, evaluating the pmf at non-integral values,
+corresponds to evaluations of an unnormalized distribution, that does not
+correspond to evaluations of the cdf.
 """
 
 
@@ -88,7 +94,7 @@ class Poisson(distribution.Distribution):
       TypeError: if `log_rate` is not a float-type.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       if (rate is None) == (log_rate is None):
         raise ValueError("Must specify exactly one of `rate` and `log_rate`.")
       elif log_rate is None:
@@ -150,10 +156,6 @@ class Poisson(distribution.Distribution):
   def _cdf(self, x):
     if self.validate_args:
       x = distribution_util.embed_check_nonnegative_integer_form(x)
-    else:
-      # Whether or not x is integer-form, the following is well-defined.
-      # However, scipy takes the floor, so we do too.
-      x = math_ops.floor(x)
     return math_ops.igammac(1. + x, self.rate)
 
   def _log_normalization(self):
@@ -162,9 +164,6 @@ class Poisson(distribution.Distribution):
   def _log_unnormalized_prob(self, x):
     if self.validate_args:
       x = distribution_util.embed_check_nonnegative_integer_form(x)
-    else:
-      # For consistency with cdf, we take the floor.
-      x = math_ops.floor(x)
     return x * self.log_rate - math_ops.lgamma(1. + x)
 
   def _mean(self):
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 92f2bba1828696248c9d9460566a08ba372c3358..19c99dcee92978e938a73af9be445cd098e5fe90 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -114,7 +114,7 @@ def quadrature_scheme_lognormal_quantiles(
     # Create a LogNormal distribution.
     dist = transformed_lib.TransformedDistribution(
         distribution=normal_lib.Normal(loc=loc, scale=scale),
-        bijector=Exp(event_ndims=0),
+        bijector=Exp(),
         validate_args=validate_args)
     batch_ndims = dist.batch_shape.ndims
     if batch_ndims is None:
@@ -256,7 +256,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
         `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       if loc is not None:
         loc = ops.convert_to_tensor(loc, name="loc")
       if scale is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index 8aebb79b9138cce1373e6472d17cf9072d2bc285..1ef7651d03a3388e72618b1d9bb8b819bde17e92 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -217,7 +217,7 @@ class QuantizedDistribution(distributions.Distribution):
     values = (
         list(distribution.parameters.values()) +
         [low, high])
-    with ops.name_scope(name, values=values):
+    with ops.name_scope(name, values=values) as name:
       self._dist = distribution
 
       if low is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index b525809015537ac8c7ee701c100fba6541fe2e92..84c8d29072c2f1f3888329638c4695bccf70eab7 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -35,10 +35,10 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
 
   The RelaxedBernoulli is a distribution over the unit interval (0,1), which
   continuously approximates a Bernoulli. The degree of approximation is
-  controlled by a temperature: as the temperaturegoes to 0 the RelaxedBernoulli
-  becomes discrete with a distribution described by the `logits` or `probs`
-  parameters, as the temperature goes to infinity the RelaxedBernoulli
-  becomes the constant distribution that is identically 0.5.
+  controlled by a temperature: as the temperature goes to 0 the
+  RelaxedBernoulli becomes discrete with a distribution described by the
+  `logits` or `probs` parameters, as the temperature goes to infinity the
+  RelaxedBernoulli becomes the constant distribution that is identically 0.5.
 
   The RelaxedBernoulli distribution is a reparameterized continuous
   distribution that is the binary special case of the RelaxedOneHotCategorical
@@ -166,7 +166,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
       ValueError: If both `probs` and `logits` are passed, or if neither.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs, temperature]):
+    with ops.name_scope(name, values=[logits, probs, temperature]) as name:
       with ops.control_dependencies([check_ops.assert_positive(temperature)]
                                     if validate_args else []):
         self._temperature = array_ops.identity(temperature, name="temperature")
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index 2aa771a71efe52c8d86d459f090ea8ee137c4487..325f41e37c928ba8e81e45e63a7f7f8126bc80f8 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -163,7 +163,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs, temperature]):
+    with ops.name_scope(name, values=[logits, probs, temperature]) as name:
 
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
@@ -285,9 +285,6 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
     ret = array_ops.reshape(log_prob, logits_shape)
     return ret
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   def _assert_valid_sample(self, x):
     if not self.validate_args:
       return x
@@ -306,7 +303,7 @@ class RelaxedOneHotCategorical(
   The RelaxedOneHotCategorical is a distribution over random probability
   vectors, vectors of positive real values that sum to one, which continuously
   approximates a OneHotCategorical. The degree of approximation is controlled by
-  a temperature: as the temperaturegoes to 0 the RelaxedOneHotCategorical
+  a temperature: as the temperature goes to 0 the RelaxedOneHotCategorical
   becomes discrete with a distribution described by the `logits` or `probs`
   parameters, as the temperature goes to infinity the RelaxedOneHotCategorical
   becomes the constant distribution that is identically the constant vector of
@@ -412,5 +409,5 @@ class RelaxedOneHotCategorical(
                                        validate_args=validate_args,
                                        allow_nan_stats=allow_nan_stats)
     super(RelaxedOneHotCategorical, self).__init__(dist,
-                                                   bijectors.Exp(event_ndims=1),
+                                                   bijectors.Exp(),
                                                    name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index dfc813361977c159d8d48f9d5b9ff03db5b4acdc..f5aaa5cf34abde3ea4d25de1ecf3adaef3f2a770 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -301,13 +302,16 @@ def percentile(x,
 
   with ops.name_scope(name, [x, q]):
     x = ops.convert_to_tensor(x, name="x")
-    q = math_ops.to_float(q, name="q")
+    # Double is needed here and below, else we get the wrong index if the array
+    # is huge along axis.
+    q = math_ops.to_double(q, name="q")
     _get_static_ndims(q, expect_ndims=0)
 
     if validate_args:
       q = control_flow_ops.with_dependencies([
-          check_ops.assert_rank(q, 0), check_ops.assert_greater_equal(q, 0.),
-          check_ops.assert_less_equal(q, 100.)
+          check_ops.assert_rank(q, 0),
+          check_ops.assert_greater_equal(q, math_ops.to_double(0.)),
+          check_ops.assert_less_equal(q, math_ops.to_double(100.))
       ], q)
 
     if axis is None:
@@ -332,7 +336,7 @@ def percentile(x,
       y = _move_dims_to_flat_end(x, axis, x_ndims)
 
     frac_at_q_or_above = 1. - q / 100.
-    d = math_ops.to_float(array_ops.shape(y)[-1])
+    d = math_ops.to_double(array_ops.shape(y)[-1])
 
     if interpolation == "lower":
       index = math_ops.ceil((d - 1) * frac_at_q_or_above)
@@ -341,12 +345,18 @@ def percentile(x,
     elif interpolation == "nearest":
       index = math_ops.round((d - 1) * frac_at_q_or_above)
 
+    # If d is gigantic, then we would have d == d - 1, even in double... So
+    # let's use max/min to avoid out of bounds errors.
+    d = array_ops.shape(y)[-1]
+    # d - 1 will be distinct from d in int32.
+    index = clip_ops.clip_by_value(math_ops.to_int32(index), 0, d - 1)
+
     # Sort everything, not just the top 'k' entries, which allows multiple calls
     # to sort only once (under the hood) and use CSE.
     sorted_y = _sort_tensor(y)
 
     # result.shape = B
-    result = sorted_y[..., math_ops.to_int32(index)]
+    result = sorted_y[..., index]
     result.set_shape(y.get_shape()[:-1])
 
     if keep_dims:
diff --git a/tensorflow/contrib/distributions/python/ops/seed_stream.py b/tensorflow/contrib/distributions/python/ops/seed_stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..056d349688511e19a4fa3d58a5b3c1c8355671a3
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/seed_stream.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Local PRNG for amplifying seed entropy into seeds for base operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import hashlib
+
+
+class SeedStream(object):
+  """Local PRNG for amplifying seed entropy into seeds for base operations.
+
+  Writing sampling code which correctly sets the pseudo-random number
+  generator (PRNG) seed is surprisingly difficult.  This class serves as
+  a helper for the TensorFlow Probability coding pattern designed to
+  avoid common mistakes.
+
+  # Motivating Example
+
+  A common first-cut implementation of a sampler for the beta
+  distribution is to compute the ratio of a gamma with itself plus
+  another gamma.  This code snippet tries to do that, but contains a
+  surprisingly common error:
+
+  ```python
+  def broken_beta(shape, alpha, beta, seed):
+    x = tf.random_gamma(shape, alpha, seed=seed)
+    y = tf.random_gamma(shape, beta, seed=seed)
+    return x / (x + y)
+  ```
+
+  The mistake is that the two gamma draws are seeded with the same
+  seed.  This causes them to always produce the same results, which,
+  in turn, leads this code snippet to always return `0.5`.  Because it
+  can happen across abstraction boundaries, this kind of error is
+  surprisingly easy to make when handling immutable seeds.
+
+  # Goals
+
+  TensorFlow Probability adopts a code style designed to eliminate the
+  above class of error, without exacerbating others.  The goals of
+  this code style are:
+
+  - Support reproducibility of results (by encouraging seeding of all
+    pseudo-random operations).
+
+  - Avoid shared-write global state (by not relying on a global PRNG).
+
+  - Prevent accidental seed reuse by TF Probability implementers.  This
+    goal is served with the local pseudo-random seed generator provided
+    in this module.
+
+  - Mitigate potential accidental seed reuse by TF Probability clients
+    (with a salting scheme).
+
+  - Prevent accidental resonances with downstream PRNGs (by hashing the
+    output).
+
+  ## Non-goals
+
+  - Implementing a high-performance PRNG for generating large amounts of
+    entropy.  That's the job of the underlying TensorFlow PRNG we are
+    seeding.
+
+  - Avoiding random seed collisions, aka "birthday attacks".
+
+  # Code pattern
+
+  ```python
+  def random_beta(shape, alpha, beta, seed):        # (a)
+    seed = SeedStream(seed, salt="random_beta")     # (b)
+    x = tf.random_gamma(shape, alpha, seed=seed())  # (c)
+    y = tf.random_gamma(shape, beta, seed=seed())   # (c)
+    return x / (x + y)
+  ```
+
+  The elements of this pattern are:
+
+  - Accept an explicit seed (line a) as an argument in all public
+    functions, and write the function to be deterministic (up to any
+    numerical issues) for fixed seed.
+
+    - Rationale: This provides the client with the ability to reproduce
+      results.  Accepting an immutable seed rather than a mutable PRNG
+      object reduces code coupling, permitting different sections to be
+      reproducible independently.
+
+  - Use that seed only to initialize a local `SeedStream` instance (line b).
+
+    - Rationale: Avoids accidental seed reuse.
+
+  - Supply the name of the function being implemented as a salt to the
+    `SeedStream` instance (line b).  This serves to keep the salts
+    unique; unique salts ensure that clients of TF Probability will see
+    different functions always produce independent results even if
+    called with the same seeds.
+
+  - Seed each callee operation with the output of a unique call to the
+    `SeedStream` instance (lines c).  This ensures reproducibility of
+    results while preventing seed reuse across callee invocations.
+
+  # Why salt?
+
+  Salting the `SeedStream` instances (with unique salts) is defensive
+  programming against a client accidentally committing a mistake
+  similar to our motivating example.  Consider the following situation
+  that might arise without salting:
+
+  ```python
+  def tfp_foo(seed):
+    seed = SeedStream(seed, salt="")
+    foo_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def tfp_bar(seed):
+    seed = SeedStream(seed, salt="")
+    bar_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def client_baz(seed):
+    foo = tfp_foo(seed=seed)
+    bar = tfp_bar(seed=seed)
+    ...
+  ```
+
+  The client should have used different seeds as inputs to `foo` and
+  `bar`.  However, because they didn't, *and because `foo` and `bar`
+  both sample a Gaussian internally as their first action*, the
+  internal `foo_stuff` and `bar_stuff` will be the same, and the
+  returned `foo` and `bar` will not be independent, leading to subtly
+  incorrect answers from the client's simulation.  This kind of bug is
+  particularly insidious for the client, because it depends on a
+  Distributions implementation detail, namely the order in which `foo`
+  and `bar` invoke the samplers they depend on.  In particular, a
+  Bayesflow team member can introduce such a bug in previously
+  (accidentally) correct client code by performing an internal
+  refactoring that causes this operation order alignment.
+
+  A salting discipline eliminates this problem by making sure that the
+  seeds seen by `foo`'s callees will differ from those seen by `bar`'s
+  callees, even if `foo` and `bar` are invoked with the same input
+  seed.
+  """
+
+  def __init__(self, seed, salt):
+    """Initializes a `SeedStream`.
+
+    Args:
+      seed: Any Python object convertible to string, supplying the
+        initial entropy.  If `None`, operations seeded with seeds
+        drawn from this `SeedStream` will follow TensorFlow semantics
+        for not being seeded.
+      salt: Any Python object convertible to string, supplying
+        auxiliary entropy.  Must be unique across the Distributions
+        and TensorFlow Probability code base.  See class docstring for
+        rationale.
+    """
+    self._seed = seed
+    self._salt = salt
+    self._counter = 0
+
+  def __call__(self):
+    """Returns a fresh integer usable as a seed in downstream operations.
+
+    If this `SeedStream` was initialized with `seed=None`, returns
+    `None`.  This has the effect that downstream operations (both
+    `SeedStream`s and primitive TensorFlow ops) will behave as though
+    they were unseeded.
+
+    The returned integer is non-negative, and uniformly distributed in
+    the half-open interval `[0, 2**512)`.  This is consistent with
+    TensorFlow, as TensorFlow operations internally use the residue of
+    the given seed modulo `2**31 - 1` (see
+    `tensorflow/python/framework/random_seed.py`).
+
+    Returns:
+      seed: A fresh integer usable as a seed in downstream operations,
+        or `None`.
+    """
+    self._counter += 1
+    if self._seed is None:
+      return None
+    composite = str((self._seed, self._counter, self._salt)).encode("utf-8")
+    return int(hashlib.sha512(composite).hexdigest(), 16)
+
+  @property
+  def original_seed(self):
+    return self._seed
+
+  @property
+  def salt(self):
+    return self._salt
+
+# Design rationales for the SeedStream class
+#
+# - Salts are accepted for the reason given above to supply them.
+#
+# - A `None` seed propagates to downstream seeds, so they exhibit
+#   their "unseeded" behavior.
+#
+# - The return value is a Python int so it can be passed directly to
+#   TensorFlow operations as a seed.  It is large to avoid losing seed
+#   space needlessly (TF will internally read only the last 31 bits).
+#
+# - The output is hashed with a crypto-grade hash function as a form
+#   of defensive programming: this reliably prevents all possible
+#   accidental resonances with all possible downstream PRNGs.  The
+#   specific function used is not important; SHA512 was ready to hand.
+#
+# - The internal state update is a simple counter because (a) given
+#   that the output is hashed anyway, this is enough, and (b) letting
+#   it be this predictable permits a future "generate many seeds in
+#   parallel" operation whose results would agree with running
+#   sequentially.
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index 5fb6f0c7eaa8c4734ea4c161b0eee6f24d4c9850..6a7f28713acefd2285b07a212e2e47a6db1ae5e1 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -32,45 +32,50 @@ from tensorflow.python.ops.distributions import util as distribution_util
 class _DistributionShape(object):
   """Manage and manipulate `Distribution` shape.
 
-  Terminology:
-    Recall that a `Tensor` has:
-      - `shape`: size of `Tensor` dimensions,
-      - `ndims`: size of `shape`; number of `Tensor` dimensions,
-      - `dims`: indexes into `shape`; useful for transpose, reduce.
-
-    `Tensor`s sampled from a `Distribution` can be partitioned by `sample_dims`,
-    `batch_dims`, and `event_dims`. To understand the semantics of these
-    dimensions, consider when two of the three are fixed and the remaining
-    is varied:
-      - `sample_dims`: indexes independent draws from identical
-                       parameterizations of the `Distribution`.
-      - `batch_dims`:  indexes independent draws from non-identical
-                       parameterizations of the `Distribution`.
-      - `event_dims`:  indexes event coordinates from one sample.
-
-    The `sample`, `batch`, and `event` dimensions constitute the entirety of a
-    `Distribution` `Tensor`'s shape.
-
-    The dimensions are always in `sample`, `batch`, `event` order.
-
-  Purpose:
-    This class partitions `Tensor` notions of `shape`, `ndims`, and `dims` into
-    `Distribution` notions of `sample,` `batch,` and `event` dimensions. That
-    is, it computes any of:
+  #### Terminology
 
-    ```
-    sample_shape     batch_shape     event_shape
-    sample_dims      batch_dims      event_dims
-    sample_ndims     batch_ndims     event_ndims
-    ```
+  Recall that a `Tensor` has:
+    - `shape`: size of `Tensor` dimensions,
+    - `ndims`: size of `shape`; number of `Tensor` dimensions,
+    - `dims`: indexes into `shape`; useful for transpose, reduce.
+
+  `Tensor`s sampled from a `Distribution` can be partitioned by `sample_dims`,
+  `batch_dims`, and `event_dims`. To understand the semantics of these
+  dimensions, consider when two of the three are fixed and the remaining
+  is varied:
+    - `sample_dims`: indexes independent draws from identical
+                     parameterizations of the `Distribution`.
+    - `batch_dims`:  indexes independent draws from non-identical
+                     parameterizations of the `Distribution`.
+    - `event_dims`:  indexes event coordinates from one sample.
+
+  The `sample`, `batch`, and `event` dimensions constitute the entirety of a
+  `Distribution` `Tensor`'s shape.
+
+  The dimensions are always in `sample`, `batch`, `event` order.
+
+  #### Purpose
+
+  This class partitions `Tensor` notions of `shape`, `ndims`, and `dims` into
+  `Distribution` notions of `sample,` `batch,` and `event` dimensions. That
+  is, it computes any of:
+
+  ```
+  sample_shape     batch_shape     event_shape
+  sample_dims      batch_dims      event_dims
+  sample_ndims     batch_ndims     event_ndims
+  ```
 
-    for a given `Tensor`, e.g., the result of
-    `Distribution.sample(sample_shape=...)`.
+  for a given `Tensor`, e.g., the result of
+  `Distribution.sample(sample_shape=...)`.
 
-    For a given `Tensor`, this class computes the above table using minimal
-    information: `batch_ndims` and `event_ndims`.
+  For a given `Tensor`, this class computes the above table using minimal
+  information: `batch_ndims` and `event_ndims`.
+
+  #### Examples
+
+  We show examples of distribution shape semantics.
 
-  Examples of `Distribution` `shape` semantics:
     - Sample dimensions:
       Computing summary statistics, i.e., the average is a reduction over sample
       dimensions.
@@ -111,52 +116,54 @@ class _DistributionShape(object):
       tf.div(1., tf.reduce_prod(x, event_dims))
       ```
 
-  Examples using this class:
-    Write `S, B, E` for `sample_shape`, `batch_shape`, and `event_shape`.
-
-    ```python
-    # 150 iid samples from one multivariate Normal with two degrees of freedom.
-    mu = [0., 0]
-    sigma = [[1., 0],
-             [0,  1]]
-    mvn = MultivariateNormal(mu, sigma)
-    rand_mvn = mvn.sample(sample_shape=[3, 50])
-    shaper = DistributionShape(batch_ndims=0, event_ndims=1)
-    S, B, E = shaper.get_shape(rand_mvn)
-    # S = [3, 50]
-    # B = []
-    # E = [2]
-
-    # 12 iid samples from one Wishart with 2x2 events.
-    sigma = [[1., 0],
-             [2,  1]]
-    wishart = Wishart(df=5, scale=sigma)
-    rand_wishart = wishart.sample(sample_shape=[3, 4])
-    shaper = DistributionShape(batch_ndims=0, event_ndims=2)
-    S, B, E = shaper.get_shape(rand_wishart)
-    # S = [3, 4]
-    # B = []
-    # E = [2, 2]
-
-    # 100 iid samples from two, non-identical trivariate Normal distributions.
-    mu    = ...  # shape(2, 3)
-    sigma = ...  # shape(2, 3, 3)
-    X = MultivariateNormal(mu, sigma).sample(shape=[4, 25])
-    # S = [4, 25]
-    # B = [2]
-    # E = [3]
-    ```
-
-  Argument Validation:
-    When `validate_args=False`, checks that cannot be done during
-    graph construction are performed at graph execution. This may result in a
-    performance degradation because data must be switched from GPU to CPU.
-
-    For example, when `validate_args=False` and `event_ndims` is a
-    non-constant `Tensor`, it is checked to be a non-negative integer at graph
-    execution. (Same for `batch_ndims`). Constant `Tensor`s and non-`Tensor`
-    arguments are always checked for correctness since this can be done for
-    "free," i.e., during graph construction.
+  We show examples using this class.
+
+  Write `S, B, E` for `sample_shape`, `batch_shape`, and `event_shape`.
+
+  ```python
+  # 150 iid samples from one multivariate Normal with two degrees of freedom.
+  mu = [0., 0]
+  sigma = [[1., 0],
+           [0,  1]]
+  mvn = MultivariateNormal(mu, sigma)
+  rand_mvn = mvn.sample(sample_shape=[3, 50])
+  shaper = DistributionShape(batch_ndims=0, event_ndims=1)
+  S, B, E = shaper.get_shape(rand_mvn)
+  # S = [3, 50]
+  # B = []
+  # E = [2]
+
+  # 12 iid samples from one Wishart with 2x2 events.
+  sigma = [[1., 0],
+           [2,  1]]
+  wishart = Wishart(df=5, scale=sigma)
+  rand_wishart = wishart.sample(sample_shape=[3, 4])
+  shaper = DistributionShape(batch_ndims=0, event_ndims=2)
+  S, B, E = shaper.get_shape(rand_wishart)
+  # S = [3, 4]
+  # B = []
+  # E = [2, 2]
+
+  # 100 iid samples from two, non-identical trivariate Normal distributions.
+  mu    = ...  # shape(2, 3)
+  sigma = ...  # shape(2, 3, 3)
+  X = MultivariateNormal(mu, sigma).sample(shape=[4, 25])
+  # S = [4, 25]
+  # B = [2]
+  # E = [3]
+  ```
+
+  #### Argument Validation
+
+  When `validate_args=False`, checks that cannot be done during
+  graph construction are performed at graph execution. This may result in a
+  performance degradation because data must be switched from GPU to CPU.
+
+  For example, when `validate_args=False` and `event_ndims` is a
+  non-constant `Tensor`, it is checked to be a non-negative integer at graph
+  execution. (Same for `batch_ndims`). Constant `Tensor`s and non-`Tensor`
+  arguments are always checked for correctness since this can be done for
+  "free," i.e., during graph construction.
   """
 
   def __init__(self,
@@ -432,7 +439,7 @@ class _DistributionShape(object):
           if self._batch_ndims_is_0 and expand_batch_dim:
             squeeze_dims += [1]
           if squeeze_dims:
-            x = array_ops.squeeze(x, squeeze_dims=squeeze_dims)
+            x = array_ops.squeeze(x, axis=squeeze_dims)
             # x.shape: [prod(S)]+B+E
         _, batch_shape, event_shape = self.get_shape(x)
       else:
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index c4b8f055b7fbc3f0835b503eddd7617610326d8c..03828fa61277eeaf7ce90de8023b4ed91f6cc4dc 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -134,7 +134,8 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
     """
     parameters = locals()
 
-    with ops.name_scope(name, values=[loc, scale, skewness, tailweight]):
+    with ops.name_scope(name,
+                        values=[loc, scale, skewness, tailweight]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       dtype = loc.dtype
       scale = ops.convert_to_tensor(scale, name="scale", dtype=dtype)
@@ -166,21 +167,20 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
 
       # Make the SAS bijector, 'F'.
       f = bijectors.SinhArcsinh(
-          skewness=skewness, tailweight=tailweight, event_ndims=0)
+          skewness=skewness, tailweight=tailweight)
       if has_default_skewness:
         f_noskew = f
       else:
         f_noskew = bijectors.SinhArcsinh(
             skewness=skewness.dtype.as_numpy_dtype(0.),
-            tailweight=tailweight, event_ndims=0)
+            tailweight=tailweight)
 
-      # Make the Affine bijector, Z --> loc + scale * Z (2 / F_0(2))
+      # Make the AffineScalar bijector, Z --> loc + scale * Z (2 / F_0(2))
       c = 2 * scale / f_noskew.forward(ops.convert_to_tensor(2, dtype=dtype))
-      affine = bijectors.Affine(
+      affine = bijectors.AffineScalar(
           shift=loc,
-          scale_identity_multiplier=c,
-          validate_args=validate_args,
-          event_ndims=0)
+          scale=c,
+          validate_args=validate_args)
 
       bijector = bijectors.Chain([affine, f])
 
diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c69435fac109914ff29b307dfad105f62849339
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -0,0 +1,838 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Statistical test assertions calibrated for their error rates.
+
+Statistical tests have an inescapable probability of error: a correct
+sampler can still fail a test by chance, and an incorrect sampler can
+still pass a test by chance.  This library is about bounding both of
+those error rates.  This requires admitting a task-specific notion of
+"discrepancy": Correct code will fail rarely, code that misbehaves by
+more than the discrepancy will pass rarely, and nothing reliable can
+be said about code that misbehaves, but misbehaves by less than the
+discrepancy.
+
+# Example
+
+Consider testing that the mean of a scalar probability distribution P
+is some expected constant.  Suppose the support of P is the interval
+`[0, 1]`.  Then you might do this:
+
+```python
+tfd = tf.contrib.distributions
+
+expected_mean = ...
+num_samples = 5000
+samples = ... draw 5000 samples from P
+
+# Check that the mean looks right
+check1 = tfd.assert_true_mean_equal_by_dkwm(
+    samples, low=0., high=1., expected=expected_mean,
+    false_fail_rate=1e-6)
+
+# Check that the difference in means detectable with 5000 samples is
+# small enough
+check2 = tf.assert_less(
+    tfd.min_discrepancy_of_true_means_detectable_by_dkwm(
+        num_samples, low=0., high=1.0,
+        false_fail_rate=1e-6, false_pass_rate=1e-6),
+    0.01)
+
+# Be sure to execute both assertion ops
+sess.run([check1, check2])
+```
+
+The second assertion is an instance of experiment design.  It's a
+deterministic computation (independent of the code under test) that
+checks that `5000` samples is enough to reliably resolve mean
+differences of `0.01` or more.  Here "reliably" means that if the code
+under test is correct, the probability of drawing an unlucky sample
+that causes this test to fail is at most 1e-6; and if the code under
+test is incorrect enough that its true mean is 0.01 more or less than
+expected, then the probability of drawing a "lucky" sample that causes
+the test to false-pass is also at most 1e-6.
+
+# Overview
+
+Every function in this library can be characterized in terms of:
+
+- The property being tested, such as the full density of the
+  distribution under test, or just its true mean, or a single
+  Bernoulli probability, etc.
+
+- The relation being asserted, e.g., whether the mean is less, more,
+  or equal to the given expected value.
+
+- The stochastic bound being relied upon, such as the
+  [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
+  or the CDF of the binomial distribution (for assertions about
+  Bernoulli probabilities).
+
+- The number of sample sets in the statistical test.  For example,
+  testing equality of means has a one-sample variant, where the
+  expected mean is given exactly, and a two-sample variant, where the
+  expected mean is itself given by a set of samples (e.g., from an
+  alternative algorithm).
+
+- What operation(s) of the test are to be performed.  Each test has
+  three of these:
+
+  1. `assert` executes the test.  Specifically, it creates a TF op that
+     produces an error if it has enough evidence to prove that the
+     property under test is violated.  These functions depend on the
+     desired false failure rate, because that determines the sizes of
+     appropriate confidence intervals, etc.
+
+  2. `min_discrepancy` computes the smallest difference reliably
+     detectable by that test, given the sample count and error rates.
+     What it's a difference of is test-specific.  For example, a test
+     for equality of means would make detection guarantees about the
+     difference the true means.
+
+  3. `min_num_samples` computes the minimum number of samples needed
+     to reliably detect a given discrepancy with given error rates.
+
+  The latter two are for experimental design, and are meant to be
+  usable either interactively or inline in the overall test method.
+
+This library follows a naming convention, to make room for every
+combination of the above.  A name mentions the operation first, then
+the property, then the relation, then the bound, then, if the test
+takes more than one set of samples, a token indicating this.  For
+example, `assert_true_mean_equal_by_dkwm` (which is implicitly
+one-sample).  Each name is a grammatically sound noun phrase (or verb
+phrase, for the asserts).
+
+# Asymptotic properties
+
+The number of samples needed tends to scale as `O(1/discrepancy**2)` and
+as `O(log(1/error_rate))`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+
+__all__ = [
+    "true_mean_confidence_interval_by_dkwm",
+    "assert_true_mean_equal_by_dkwm",
+    "min_discrepancy_of_true_means_detectable_by_dkwm",
+    "min_num_samples_for_dkwm_mean_test",
+    "assert_true_mean_equal_by_dkwm_two_sample",
+    "min_discrepancy_of_true_means_detectable_by_dkwm_two_sample",
+    "min_num_samples_for_dkwm_mean_two_sample_test",
+]
+
+
+def _batch_sort_vector(x, ascending=True, name=None):
+  with ops.name_scope(name, "_batch_sort_vector", [x]):
+    x = ops.convert_to_tensor(x, name="x")
+    n = array_ops.shape(x)[-1]
+    if ascending:
+      y, _ = nn_ops.top_k(-x, k=n, sorted=True)
+      y = -y
+    else:
+      y, _ = nn_ops.top_k(x, k=n, sorted=True)
+    y.set_shape(x.shape)
+    return y
+
+
+def _do_maximum_mean(samples, envelope, high, name=None):
+  """Common code between maximum_mean and minimum_mean."""
+  with ops.name_scope(name, "do_maximum_mean", [samples, envelope, high]):
+    n = array_ops.rank(samples)
+    # Move the batch dimension of `samples` to the rightmost position,
+    # where the _batch_sort_vector function wants it.
+    perm = array_ops.concat([math_ops.range(1, n), [0]], axis=0)
+    samples = array_ops.transpose(samples, perm)
+
+    samples = _batch_sort_vector(samples)
+
+    # The maximum mean is given by taking `envelope`-worth of
+    # probability from the smallest samples and moving it to the
+    # maximum value.  This amounts to:
+    # - ignoring the smallest k samples, where `k/n < envelope`
+    # - taking a `1/n - (envelope - k/n)` part of the index k sample
+    # - taking all the other samples
+    # - and adding `envelope * high` at the end.
+    # The following is a vectorized and batched way of computing this.
+    # `max_mean_contrib` is a mask implementing the previous.
+    batch_size = array_ops.shape(samples)[-1]
+    batch_size = math_ops.cast(batch_size, dtype=samples.dtype.base_dtype)
+    step = 1. / batch_size
+    cum_steps = step * math_ops.range(
+        1, batch_size + 1, dtype=samples.dtype.base_dtype)
+    max_mean_contrib = clip_ops.clip_by_value(
+        cum_steps - envelope[..., array_ops.newaxis],
+        clip_value_min=0.,
+        clip_value_max=step)
+    return math_ops.reduce_sum(
+        samples * max_mean_contrib, axis=-1) + envelope * high
+
+
+def _maximum_mean(samples, envelope, high, name=None):
+  """Returns a stochastic upper bound on the mean of a scalar distribution.
+
+  The idea is that if the true CDF is within an `eps`-envelope of the
+  empirical CDF of the samples, and the support is bounded above, then
+  the mean is bounded above as well.  In symbols,
+
+  ```none
+  sup_x(|F_n(x) - F(x)|) < eps
+  ```
+
+  The 0th dimension of `samples` is interpreted as independent and
+  identically distributed samples.  The remaining dimensions are
+  broadcast together with `envelope` and `high`, and operated on
+  separately.
+
+  Args:
+    samples: Floating-point tensor of samples from the distribution(s)
+      of interest.  Entries are assumed IID across the 0th dimension.
+      The other dimensions must broadcast with `envelope` and `high`.
+    envelope: Floating-point tensor of sizes of admissible CDF
+      envelopes (i.e., the `eps` above).
+    high: Floating-point tensor of upper bounds on the distributions'
+      supports.
+    name: A name for this operation (optional).
+
+  Returns:
+    bound: Floating-point tensor of upper bounds on the true means.
+
+  Raises:
+    InvalidArgumentError: If some `sample` is found to be larger than
+      the corresponding `high`.
+  """
+  with ops.name_scope(name, "maximum_mean", [samples, envelope, high]):
+    samples = ops.convert_to_tensor(samples, name="samples")
+    envelope = ops.convert_to_tensor(envelope, name="envelope")
+    high = ops.convert_to_tensor(high, name="high")
+
+    xmax = math_ops.reduce_max(samples, axis=[0])
+    msg = "Given sample maximum value exceeds expectations"
+    check_op = check_ops.assert_less_equal(xmax, high, message=msg)
+    with ops.control_dependencies([check_op]):
+      return array_ops.identity(_do_maximum_mean(samples, envelope, high))
+
+
+def _minimum_mean(samples, envelope, low, name=None):
+  """Returns a stochastic lower bound on the mean of a scalar distribution.
+
+  The idea is that if the true CDF is within an `eps`-envelope of the
+  empirical CDF of the samples, and the support is bounded below, then
+  the mean is bounded below as well.  In symbols,
+
+  ```none
+  sup_x(|F_n(x) - F(x)|) < eps
+  ```
+
+  The 0th dimension of `samples` is interpreted as independent and
+  identically distributed samples.  The remaining dimensions are
+  broadcast together with `envelope` and `low`, and operated on
+  separately.
+
+  Args:
+    samples: Floating-point tensor of samples from the distribution(s)
+      of interest.  Entries are assumed IID across the 0th dimension.
+      The other dimensions must broadcast with `envelope` and `low`.
+    envelope: Floating-point tensor of sizes of admissible CDF
+      envelopes (i.e., the `eps` above).
+    low: Floating-point tensor of lower bounds on the distributions'
+      supports.
+    name: A name for this operation (optional).
+
+  Returns:
+    bound: Floating-point tensor of lower bounds on the true means.
+
+  Raises:
+    InvalidArgumentError: If some `sample` is found to be smaller than
+      the corresponding `low`.
+  """
+  with ops.name_scope(name, "minimum_mean", [samples, envelope, low]):
+    samples = ops.convert_to_tensor(samples, name="samples")
+    envelope = ops.convert_to_tensor(envelope, name="envelope")
+    low = ops.convert_to_tensor(low, name="low")
+
+    xmin = math_ops.reduce_min(samples, axis=[0])
+    msg = "Given sample minimum value falls below expectations"
+    check_op = check_ops.assert_greater_equal(xmin, low, message=msg)
+    with ops.control_dependencies([check_op]):
+      return - _do_maximum_mean(-samples, envelope, -low)
+
+
+def _dkwm_cdf_envelope(n, error_rate, name=None):
+  """Computes the CDF envelope that the DKWM inequality licenses.
+
+  The [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
+  gives a stochastic bound on the distance between the true cumulative
+  distribution function (CDF) of any distribution and its empirical
+  CDF.  To wit, for `n` iid samples from any distribution with CDF F,
+
+  ```none
+  P(sup_x |F_n(x) - F(x)| > eps) < 2exp(-2n eps^2)
+  ```
+
+  This function computes the envelope size `eps` as a function of the
+  number of samples `n` and the desired limit on the left-hand
+  probability above.
+
+  Args:
+    n: Tensor of numbers of samples drawn.
+    error_rate: Floating-point tensor of admissible rates of mistakes.
+    name: A name for this operation (optional).
+
+  Returns:
+    eps: Tensor of maximum distances the true CDF can be from the
+      empirical CDF.  This scales as `O(sqrt(-log(error_rate)))` and
+      as `O(1 / sqrt(n))`.  The shape is the broadcast of `n` and
+      `error_rate`.
+  """
+  with ops.name_scope(name, "dkwm_cdf_envelope", [n, error_rate]):
+    n = math_ops.cast(n, dtype=error_rate.dtype)
+    return math_ops.sqrt(-gen_math_ops.log(error_rate / 2.) / (2. * n))
+
+
+def _check_shape_dominates(samples, parameters):
+  """Check that broadcasting `samples` against `parameters` does not expand it.
+
+  Why?  Because I want to be very sure that the samples tensor is not
+  accidentally enlarged by broadcasting against tensors that are
+  supposed to be describing the distribution(s) sampled from, lest the
+  sample counts end up inflated.
+
+  Args:
+    samples: A Tensor whose shape is to be protected against broadcasting.
+    parameters: A list of Tensors who are parameters for the statistical test.
+
+  Returns:
+    samples: Return original `samples` with control dependencies attached
+      to ensure no broadcasting.
+  """
+  def check(t):
+    samples_batch_shape = array_ops.shape(samples)[1:]
+    broadcasted_batch_shape = array_ops.broadcast_dynamic_shape(
+        samples_batch_shape, array_ops.shape(t))
+    # This rank check ensures that I don't get a wrong answer from the
+    # _shapes_ broadcasting against each other.
+    samples_batch_ndims = array_ops.size(samples_batch_shape)
+    ge = check_ops.assert_greater_equal(
+        samples_batch_ndims, array_ops.rank(t))
+    eq = check_ops.assert_equal(samples_batch_shape, broadcasted_batch_shape)
+    return ge, eq
+  checks = list(itertools.chain(*[check(t) for t in parameters]))
+  with ops.control_dependencies(checks):
+    return array_ops.identity(samples)
+
+
+def true_mean_confidence_interval_by_dkwm(
+    samples, low, high, error_rate=1e-6, name=None):
+  """Computes a confidence interval for the mean of a scalar distribution.
+
+  In batch mode, computes confidence intervals for all distributions
+  in the batch (which need not be identically distributed).
+
+  Relies on the [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval).
+
+  The probability (over the randomness of drawing the given samples)
+  that any true mean is outside the corresponding returned interval is
+  no more than the given `error_rate`.  The size of the intervals
+  scale as
+  `O(1 / sqrt(#samples))`, as `O(high - low)`, and as `O(-log(error_rate))`.
+
+  Note that `error_rate` is a total error rate for all the confidence
+  intervals in the batch.  As such, if the batch is nontrivial, the
+  error rate is not broadcast but divided (evenly) among the batch
+  members.
+
+  Args:
+    samples: Floating-point tensor of samples from the distribution(s)
+      of interest.  Entries are assumed IID across the 0th dimension.
+      The other dimensions must broadcast with `low` and `high`.
+    low: Floating-point tensor of lower bounds on the distributions'
+      supports.
+    high: Floating-point tensor of upper bounds on the distributions'
+      supports.
+    error_rate: *Scalar* admissible total rate of mistakes.
+    name: A name for this operation (optional).
+
+  Returns:
+    low: A floating-point tensor of stochastic lower bounds on the true means.
+    high: A floating-point tensor of stochastic upper bounds on the true means.
+  """
+  with ops.name_scope(
+      name, "true_mean_confidence_interval_by_dkwm",
+      [samples, low, high, error_rate]):
+    samples = ops.convert_to_tensor(samples, name="samples")
+    low = ops.convert_to_tensor(low, name="low")
+    high = ops.convert_to_tensor(high, name="high")
+    error_rate = ops.convert_to_tensor(error_rate, name="error_rate")
+    samples = _check_shape_dominates(samples, [low, high])
+    check_ops.assert_scalar(error_rate)  # Static shape
+    error_rate = _itemwise_error_rate(error_rate, [low, high], samples)
+    n = array_ops.shape(samples)[0]
+    envelope = _dkwm_cdf_envelope(n, error_rate)
+    min_mean = _minimum_mean(samples, envelope, low)
+    max_mean = _maximum_mean(samples, envelope, high)
+    return min_mean, max_mean
+
+
+def _itemwise_error_rate(
+    total_error_rate, param_tensors, sample_tensor=None, name=None):
+  with ops.name_scope(
+      name, "itemwise_error_rate",
+      [total_error_rate, param_tensors, sample_tensor]):
+    result_shape = [1]
+    for p_tensor in param_tensors:
+      result_shape = array_ops.broadcast_dynamic_shape(
+          array_ops.shape(p_tensor), result_shape)
+    if sample_tensor is not None:
+      result_shape = array_ops.broadcast_dynamic_shape(
+          array_ops.shape(sample_tensor)[1:], result_shape)
+    num_items = math_ops.reduce_prod(result_shape)
+    return total_error_rate / math_ops.cast(
+        num_items, dtype=total_error_rate.dtype)
+
+
+def assert_true_mean_equal_by_dkwm(
+    samples, low, high, expected, false_fail_rate=1e-6, name=None):
+  """Asserts the mean of the given distribution is as expected.
+
+  More precisely, fails if there is enough evidence (using the
+  [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval))
+  that the true mean of some distribution from which the given samples are
+  drawn is _not_ the given expected mean with statistical significance
+  `false_fail_rate` or stronger, otherwise passes.  If you also want to
+  check that you are gathering enough evidence that a pass is not
+  spurious, see `min_num_samples_for_dkwm_mean_test` and
+  `min_discrepancy_of_true_means_detectable_by_dkwm`.
+
+  Note that `false_fail_rate` is a total false failure rate for all
+  the assertions in the batch.  As such, if the batch is nontrivial,
+  the assertion will insist on stronger evidence to fail any one member.
+
+  Args:
+    samples: Floating-point tensor of samples from the distribution(s)
+      of interest.  Entries are assumed IID across the 0th dimension.
+      The other dimensions must broadcast with `low` and `high`.
+    low: Floating-point tensor of lower bounds on the distributions'
+      supports.
+    high: Floating-point tensor of upper bounds on the distributions'
+      supports.
+    expected: Floating-point tensor of expected true means.
+    false_fail_rate: *Scalar* admissible total rate of mistakes.
+    name: A name for this operation (optional).
+
+  Returns:
+    check: Op that raises `InvalidArgumentError` if any expected mean is
+      outside the corresponding confidence interval.
+  """
+  with ops.name_scope(
+      name, "assert_true_mean_equal_by_dkwm",
+      [samples, low, high, expected, false_fail_rate]):
+    samples = ops.convert_to_tensor(samples, name="samples")
+    low = ops.convert_to_tensor(low, name="low")
+    high = ops.convert_to_tensor(high, name="high")
+    expected = ops.convert_to_tensor(expected, name="expected")
+    false_fail_rate = ops.convert_to_tensor(
+        false_fail_rate, name="false_fail_rate")
+    samples = _check_shape_dominates(samples, [low, high, expected])
+    min_mean, max_mean = true_mean_confidence_interval_by_dkwm(
+        samples, low, high, error_rate=false_fail_rate)
+    less_op = check_ops.assert_less(
+        min_mean, expected, message="Mean confidence interval too high")
+    with ops.control_dependencies([less_op]):
+      return check_ops.assert_greater(
+          max_mean, expected, message="Mean confidence interval too low")
+
+
+def min_discrepancy_of_true_means_detectable_by_dkwm(
+    n, low, high, false_fail_rate, false_pass_rate, name=None):
+  """Returns the minimum mean discrepancy that a DKWM-based test can detect.
+
+  DKWM is the [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval).
+
+  Note that `false_fail_rate` is a total false failure rate for all
+  the tests in the batch.  As such, if the batch is nontrivial, each
+  member will demand more samples.  The `false_pass_rate` is also
+  interpreted as a total, but is treated asymmetrically: If each test
+  in the batch detects its corresponding discrepancy with probability
+  at least `1 - false_pass_rate`, then running all those tests and
+  failing if any one fails will jointly detect all those discrepancies
+  with the same `false_pass_rate`.
+
+  Args:
+    n: Tensor of numbers of samples to be drawn from the distributions
+      of interest.
+    low: Floating-point tensor of lower bounds on the distributions'
+      supports.
+    high: Floating-point tensor of upper bounds on the distributions'
+      supports.
+    false_fail_rate: *Scalar* admissible total rate of false failures.
+    false_pass_rate: *Scalar* admissible rate of false passes.
+    name: A name for this operation (optional).
+
+  Returns:
+    discr: Tensor of lower bounds on the distances between true
+       means detectable by a DKWM-based test.
+
+  For each batch member `i`, of `K` total, drawing `n[i]` samples from
+  some scalar distribution supported on `[low[i], high[i]]` is enough
+  to detect a difference in means of size `discr[i]` or more.
+  Specifically, we guarantee that (a) if the true mean is the expected
+  mean, `assert_true_mean_equal_by_dkwm` will fail with probability at
+  most `false_fail_rate / K` (which amounts to `false_fail_rate` if
+  applied to the whole batch at once), and (b) if the true mean
+  differs from the expected mean by at least `discr[i]`,
+  `assert_true_mean_equal_by_dkwm` will pass with probability at most
+  `false_pass_rate`.
+
+  The detectable discrepancy scales as
+
+  - `O(high[i] - low[i])`,
+  - `O(1 / sqrt(n[i]))`,
+  - `O(-log(false_fail_rate/K))`, and
+  - `O(-log(false_pass_rate))`.
+  """
+  with ops.name_scope(
+      name, "min_discrepancy_of_true_means_detectable_by_dkwm",
+      [n, low, high, false_fail_rate, false_pass_rate]):
+    n = ops.convert_to_tensor(n, name="n")
+    low = ops.convert_to_tensor(low, name="low")
+    high = ops.convert_to_tensor(high, name="high")
+    false_fail_rate = ops.convert_to_tensor(
+        false_fail_rate, name="false_fail_rate")
+    false_pass_rate = ops.convert_to_tensor(
+        false_pass_rate, name="false_pass_rate")
+    # Algorithm: Assume a true CDF F.  The DKWM inequality gives a
+    # stochastic bound on how far the observed empirical CDF F_n can be.
+    # Then, using the DKWM inequality again gives a stochastic bound on
+    # the farthest candidate true CDF F' that
+    # true_mean_confidence_interval_by_dkwm might consider.  At worst, these
+    # errors may go in the same direction, so the distance between F and
+    # F' is bounded by the sum.
+    # On batching: false fail rates sum, so I need to reduce
+    # the input to account for the batching.  False pass rates
+    # max, so I don't.
+    sampling_envelope = _dkwm_cdf_envelope(n, false_pass_rate)
+    false_fail_rate = _itemwise_error_rate(false_fail_rate, [n, low, high])
+    analysis_envelope = _dkwm_cdf_envelope(n, false_fail_rate)
+    return (high - low) * (sampling_envelope + analysis_envelope)
+
+
+def min_num_samples_for_dkwm_mean_test(
+    discrepancy, low, high,
+    false_fail_rate=1e-6, false_pass_rate=1e-6, name=None):
+  """Returns how many samples suffice for a one-sample DKWM mean test.
+
+  To wit, returns an upper bound on the number of samples necessary to
+  guarantee detecting a mean difference of at least the given
+  `discrepancy`, with the given `false_fail_rate` and `false_pass_rate`,
+  using the [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
+  on a scalar distribution supported on `[low, high]`.
+
+  Args:
+    discrepancy: Floating-point tensor of desired upper limits on mean
+      differences that may go undetected with probability higher than
+      `1 - false_pass_rate`.
+    low: Tensor of lower bounds on the distributions' support.
+    high: Tensor of upper bounds on the distributions' support.
+    false_fail_rate: *Scalar* admissible total rate of false failures.
+    false_pass_rate: *Scalar* admissible rate of false passes.
+    name: A name for this operation (optional).
+
+  Returns:
+    n: Tensor of numbers of samples to be drawn from the distributions
+      of interest.
+
+  The `discrepancy`, `low`, and `high` tensors must have
+  broadcast-compatible shapes.
+
+  For each batch member `i`, of `K` total, drawing `n[i]` samples from
+  some scalar distribution supported on `[low[i], high[i]]` is enough
+  to detect a difference in means of size `discrepancy[i]` or more.
+  Specifically, we guarantee that (a) if the true mean is the expected
+  mean, `assert_true_mean_equal_by_dkwm` will fail with probability at
+  most `false_fail_rate / K` (which amounts to `false_fail_rate` if
+  applied to the whole batch at once), and (b) if the true mean
+  differs from the expected mean by at least `discrepancy[i]`,
+  `assert_true_mean_equal_by_dkwm` will pass with probability at most
+  `false_pass_rate`.
+
+  The required number of samples scales
+  as `O((high[i] - low[i])**2)`, `O(-log(false_fail_rate/K))`,
+  `O(-log(false_pass_rate))`, and `O(1 / discrepancy[i]**2)`.
+  """
+  with ops.name_scope(
+      name, "min_num_samples_for_dkwm_mean_test",
+      [low, high, false_fail_rate, false_pass_rate, discrepancy]):
+    discrepancy = ops.convert_to_tensor(
+        discrepancy, name="discrepancy")
+    low = ops.convert_to_tensor(low, name="low")
+    high = ops.convert_to_tensor(high, name="high")
+    false_fail_rate = ops.convert_to_tensor(
+        false_fail_rate, name="false_fail_rate")
+    false_pass_rate = ops.convert_to_tensor(
+        false_pass_rate, name="false_pass_rate")
+    # Could choose to cleverly allocate envelopes, but this is sound.
+    envelope1 = discrepancy / (2. * (high - low))
+    envelope2 = envelope1
+    false_fail_rate = _itemwise_error_rate(
+        false_fail_rate, [low, high, discrepancy])
+    n1 = -math_ops.log(false_fail_rate / 2.) / (2. * envelope1**2)
+    n2 = -math_ops.log(false_pass_rate / 2.) / (2. * envelope2**2)
+    return math_ops.maximum(n1, n2)
+
+
+def assert_true_mean_equal_by_dkwm_two_sample(
+    samples1, low1, high1, samples2, low2, high2,
+    false_fail_rate=1e-6, name=None):
+  """Asserts the means of the given distributions are equal.
+
+  More precisely, fails if there is enough evidence (using the
+  [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval))
+  that the means of the distributions from which the given samples are
+  drawn are _not_ equal with statistical significance `false_fail_rate`
+  or stronger, otherwise passes.  If you also want to check that you
+  are gathering enough evidence that a pass is not spurious, see
+  `min_num_samples_for_dkwm_mean_two_sample_test` and
+  `min_discrepancy_of_true_means_detectable_by_dkwm_two_sample`.
+
+  Note that `false_fail_rate` is a total false failure rate for all
+  the assertions in the batch.  As such, if the batch is nontrivial,
+  the assertion will insist on stronger evidence to fail any one member.
+
+  Args:
+    samples1: Floating-point tensor of samples from the
+      distribution(s) A.  Entries are assumed IID across the 0th
+      dimension.  The other dimensions must broadcast with `low1`,
+      `high1`, `low2`, and `high2`.
+    low1: Floating-point tensor of lower bounds on the supports of the
+      distributions A.
+    high1: Floating-point tensor of upper bounds on the supports of
+      the distributions A.
+    samples2: Floating-point tensor of samples from the
+      distribution(s) B.  Entries are assumed IID across the 0th
+      dimension.  The other dimensions must broadcast with `low1`,
+      `high1`, `low2`, and `high2`.
+    low2: Floating-point tensor of lower bounds on the supports of the
+      distributions B.
+    high2: Floating-point tensor of upper bounds on the supports of
+      the distributions B.
+    false_fail_rate: *Scalar* admissible total rate of mistakes.
+    name: A name for this operation (optional).
+
+  Returns:
+    check: Op that raises `InvalidArgumentError` if any pair of confidence
+      intervals true for corresponding true means do not overlap.
+  """
+  with ops.name_scope(
+      name, "assert_true_mean_equal_by_dkwm_two_sample",
+      [samples1, low1, high1, samples2, low2, high2, false_fail_rate]):
+    samples1 = ops.convert_to_tensor(samples1, name="samples1")
+    low1 = ops.convert_to_tensor(low1, name="low1")
+    high1 = ops.convert_to_tensor(high1, name="high1")
+    samples2 = ops.convert_to_tensor(samples2, name="samples2")
+    low2 = ops.convert_to_tensor(low2, name="low2")
+    high2 = ops.convert_to_tensor(high2, name="high2")
+    false_fail_rate = ops.convert_to_tensor(
+        false_fail_rate, name="false_fail_rate")
+    samples1 = _check_shape_dominates(samples1, [low1, high1])
+    samples2 = _check_shape_dominates(samples2, [low2, high2])
+    compatible_samples = check_ops.assert_equal(
+        array_ops.shape(samples1)[1:], array_ops.shape(samples2)[1:])
+    with ops.control_dependencies([compatible_samples]):
+      # Could in principle play games with cleverly allocating
+      # significance instead of the even split below.  It may be possible
+      # to get tighter intervals, in order to obtain a higher power test.
+      # Any allocation strategy that depends only on the support bounds
+      # and sample counts should be valid; however, because the intervals
+      # scale as O(-log(false_fail_rate)), there doesn't seem to be much
+      # room to win.
+      min_mean_1, max_mean_1 = true_mean_confidence_interval_by_dkwm(
+          samples1, low1, high1, false_fail_rate / 2.)
+      min_mean_2, max_mean_2 = true_mean_confidence_interval_by_dkwm(
+          samples2, low2, high2, false_fail_rate / 2.)
+      # I want to assert
+      #   not (max_mean_1 < min_mean_2 or min_mean_1 > max_mean_2),
+      # but I think I only have and-combination of asserts, so use DeMorgan.
+      check_confidence_intervals_can_intersect = check_ops.assert_greater_equal(
+          max_mean_1, min_mean_2, message="Confidence intervals do not "
+          "intersect: samples1 has a smaller mean than samples2")
+      with ops.control_dependencies([check_confidence_intervals_can_intersect]):
+        return check_ops.assert_less_equal(
+            min_mean_1, max_mean_2, message="Confidence intervals do not "
+            "intersect: samples2 has a smaller mean than samples1")
+
+
+def min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
+    n1, low1, high1, n2, low2, high2,
+    false_fail_rate, false_pass_rate, name=None):
+  """Returns the minimum mean discrepancy for a two-sample DKWM-based test.
+
+  DKWM is the [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval).
+
+  Note that `false_fail_rate` is a total false failure rate for all
+  the tests in the batch.  As such, if the batch is nontrivial, each
+  member will demand more samples.  The `false_pass_rate` is also
+  interpreted as a total, but is treated asymmetrically: If each test
+  in the batch detects its corresponding discrepancy with probability
+  at least `1 - false_pass_rate`, then running all those tests and
+  failing if any one fails will jointly detect all those discrepancies
+  with the same `false_pass_rate`.
+
+  Args:
+    n1: Tensor of numbers of samples to be drawn from the distributions A.
+    low1: Floating-point tensor of lower bounds on the supports of the
+      distributions A.
+    high1: Floating-point tensor of upper bounds on the supports of
+      the distributions A.
+    n2: Tensor of numbers of samples to be drawn from the distributions B.
+    low2: Floating-point tensor of lower bounds on the supports of the
+      distributions B.
+    high2: Floating-point tensor of upper bounds on the supports of
+      the distributions B.
+    false_fail_rate: *Scalar* admissible total rate of false failures.
+    false_pass_rate: *Scalar* admissible rate of false passes.
+    name: A name for this operation (optional).
+
+  Returns:
+    discr: Tensor of lower bounds on the distances between true means
+       detectable by a two-sample DKWM-based test.
+
+  For each batch member `i`, of `K` total, drawing `n1[i]` samples
+  from scalar distribution A supported on `[low1[i], high1[i]]` and `n2[i]`
+  samples from scalar distribution B supported on `[low2[i], high2[i]]`
+  is enough to detect a difference in their true means of size
+  `discr[i]` or more.  Specifically, we guarantee that (a) if their
+  true means are equal, `assert_true_mean_equal_by_dkwm_two_sample`
+  will fail with probability at most `false_fail_rate/K` (which
+  amounts to `false_fail_rate` if applied to the whole batch at once),
+  and (b) if their true means differ by at least `discr[i]`,
+  `assert_true_mean_equal_by_dkwm_two_sample` will pass with
+  probability at most `false_pass_rate`.
+
+  The detectable distribution scales as
+
+  - `O(high1[i] - low1[i])`, `O(high2[i] - low2[i])`,
+  - `O(1 / sqrt(n1[i]))`, `O(1 / sqrt(n2[i]))`,
+  - `O(-log(false_fail_rate/K))`, and
+  - `O(-log(false_pass_rate))`.
+  """
+  with ops.name_scope(
+      name, "min_discrepancy_of_true_means_detectable_by_dkwm_two_sample",
+      [n1, low1, high1, n2, low2, high2, false_fail_rate, false_pass_rate]):
+    n1 = ops.convert_to_tensor(n1, name="n1")
+    low1 = ops.convert_to_tensor(low1, name="low1")
+    high1 = ops.convert_to_tensor(high1, name="high1")
+    n2 = ops.convert_to_tensor(n2, name="n2")
+    low2 = ops.convert_to_tensor(low2, name="low2")
+    high2 = ops.convert_to_tensor(high2, name="high2")
+    false_fail_rate = ops.convert_to_tensor(
+        false_fail_rate, name="false_fail_rate")
+    false_pass_rate = ops.convert_to_tensor(
+        false_pass_rate, name="false_pass_rate")
+    det_disc1 = min_discrepancy_of_true_means_detectable_by_dkwm(
+        n1, low1, high1, false_fail_rate / 2., false_pass_rate / 2.)
+    det_disc2 = min_discrepancy_of_true_means_detectable_by_dkwm(
+        n2, low2, high2, false_fail_rate / 2., false_pass_rate / 2.)
+    return det_disc1 + det_disc2
+
+
+def min_num_samples_for_dkwm_mean_two_sample_test(
+    discrepancy, low1, high1, low2, high2,
+    false_fail_rate=1e-6, false_pass_rate=1e-6, name=None):
+  """Returns how many samples suffice for a two-sample DKWM mean test.
+
+  DKWM is the [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval).
+
+  Args:
+    discrepancy: Floating-point tensor of desired upper limits on mean
+      differences that may go undetected with probability higher than
+      `1 - false_pass_rate`.
+    low1: Floating-point tensor of lower bounds on the supports of the
+      distributions A.
+    high1: Floating-point tensor of upper bounds on the supports of
+      the distributions A.
+    low2: Floating-point tensor of lower bounds on the supports of the
+      distributions B.
+    high2: Floating-point tensor of upper bounds on the supports of
+      the distributions B.
+    false_fail_rate: *Scalar* admissible total rate of false failures.
+    false_pass_rate: *Scalar* admissible rate of false passes.
+    name: A name for this operation (optional).
+
+  Returns:
+    n1: Tensor of numbers of samples to be drawn from the distributions A.
+    n2: Tensor of numbers of samples to be drawn from the distributions B.
+
+  For each batch member `i`, of `K` total, drawing `n1[i]` samples
+  from scalar distribution A supported on `[low1[i], high1[i]]` and `n2[i]`
+  samples from scalar distribution B supported on `[low2[i], high2[i]]`
+  is enough to detect a difference in their true means of size
+  `discr[i]` or more.  Specifically, we guarantee that (a) if their
+  true means are equal, `assert_true_mean_equal_by_dkwm_two_sample`
+  will fail with probability at most `false_fail_rate/K` (which
+  amounts to `false_fail_rate` if applied to the whole batch at once),
+  and (b) if their true means differ by at least `discr[i]`,
+  `assert_true_mean_equal_by_dkwm_two_sample` will pass with
+  probability at most `false_pass_rate`.
+
+  The required number of samples scales as
+
+  - `O((high1[i] - low1[i])**2)`, `O((high2[i] - low2[i])**2)`,
+  - `O(-log(false_fail_rate/K))`,
+  - `O(-log(false_pass_rate))`, and
+  - `O(1 / discrepancy[i]**2)`.
+  """
+  with ops.name_scope(
+      name, "min_num_samples_for_dkwm_mean_two_sample_test",
+      [low1, high1, low2, high2,
+       false_fail_rate, false_pass_rate, discrepancy]):
+    discrepancy = ops.convert_to_tensor(discrepancy, name="discrepancy")
+    low1 = ops.convert_to_tensor(low1, name="low1")
+    high1 = ops.convert_to_tensor(high1, name="high1")
+    low2 = ops.convert_to_tensor(low2, name="low2")
+    high2 = ops.convert_to_tensor(high2, name="high2")
+    false_fail_rate = ops.convert_to_tensor(
+        false_fail_rate, name="false_fail_rate")
+    false_pass_rate = ops.convert_to_tensor(
+        false_pass_rate, name="false_pass_rate")
+    # Could choose to cleverly allocate discrepancy tolerances and
+    # failure probabilities, but this is sound.
+    n1 = min_num_samples_for_dkwm_mean_test(
+        discrepancy / 2., low1, high1,
+        false_fail_rate / 2., false_pass_rate / 2.)
+    n2 = min_num_samples_for_dkwm_mean_test(
+        discrepancy / 2., low2, high2,
+        false_fail_rate / 2., false_pass_rate / 2.)
+    return n1, n2
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 0c747f8e68529484ae6f695b8500cde74857bb11..af6ff8162b173015dca2d568e13d63127af7853a 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -181,7 +181,7 @@ def quadrature_scheme_softmaxnormal_quantiles(
       edges = array_ops.reshape(edges, shape=array_ops.concat([
           [-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
       quantiles = dist.quantile(edges)
-      quantiles = SoftmaxCentered(event_ndims=1).forward(quantiles)
+      quantiles = SoftmaxCentered().forward(quantiles)
       # Cyclically permute left by one.
       perm = array_ops.concat([
           math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
@@ -248,11 +248,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   The default quadrature scheme chooses `z_{N, n}` as `N` midpoints of
   the quantiles of `p(z)` (generalized quantiles if `K > 2`).
 
-  See [1] for more details.
-
-  [1]. "Quadrature Compound: An approximating family of distributions"
-       Joshua Dillon, Ian Langmore, arXiv preprints
-       https://arxiv.org/abs/1801.03080
+  See [Dillon and Langmore (2018)][1] for more details.
 
   #### About `Vector` distributions in TensorFlow.
 
@@ -313,6 +309,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
             is_positive_definite=True),
       ],
       validate_args=True)
+  ```
+
+  #### References
+
+  [1]: Joshua Dillon and Ian Langmore. Quadrature Compound: An approximating
+       family of distributions. _arXiv preprint arXiv:1801.03080_, 2018.
+       https://arxiv.org/abs/1801.03080
   """
 
   def __init__(self,
@@ -393,7 +396,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       ValueError: if `not distribution.is_scalar_event`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[mix_loc, temperature]):
+    with ops.name_scope(name, values=[mix_loc, temperature]) as name:
       if not scale or len(scale) < 2:
         raise ValueError("Must specify list (or list-like object) of scale "
                          "LinearOperators, one for each component with "
@@ -424,7 +427,6 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       self._endpoint_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
-                               event_ndims=1,
                                validate_args=validate_args,
                                name="endpoint_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(loc, scale))]
@@ -464,7 +466,6 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       self._interpolated_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
-                               event_ndims=1,
                                validate_args=validate_args,
                                name="interpolated_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(
@@ -618,9 +619,11 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     log_prob = math_ops.reduce_sum(self.distribution.log_prob(y), axis=-2)
     # Because the affine transformation has a constant Jacobian, it is the case
     # that `affine.fldj(x) = -affine.ildj(x)`. This is not true in general.
-    fldj = array_ops.stack(
-        [aff.forward_log_det_jacobian(x) for aff in self.interpolated_affine],
-        axis=-1)
+    fldj = array_ops.stack([
+        aff.forward_log_det_jacobian(
+            x,
+            event_ndims=array_ops.rank(self.event_shape_tensor())
+        ) for aff in self.interpolated_affine], axis=-1)
     return math_ops.reduce_logsumexp(
         self.mixture_distribution.logits - fldj + log_prob, axis=-1)
 
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index 526fe2d39aef9aed833b889de80e849c469435e7..e265b5d0f7c10b2782a1a8924babdca9b986f622 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -176,7 +176,7 @@ class VectorExponentialDiag(
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
         # No need to validate_args while making diag_scale.  The returned
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index 9d5fd9ac4178a1ae29b1ce32f304b22fd3d234dc..89136d6760bb663b5ff86a77c5945ce900f072b9 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -181,7 +181,7 @@ class VectorExponentialLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    with ops.name_scope(name, values=[loc] + scale.graph_parents) as name:
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index e1ccf116457a97261b9ce3965552764771d3bdd2..1438ede26500bca4541fa9b2020ff22d4c071098 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -169,7 +169,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
         name,
         values=[
             loc, scale_diag, scale_identity_multiplier, skewness, tailweight
-        ]):
+        ]) as name:
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
       tailweight = 1. if tailweight is None else tailweight
       has_default_skewness = skewness is None
@@ -215,19 +215,19 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
       tailweight = ops.convert_to_tensor(
           tailweight, dtype=dtype, name="tailweight")
       f = bijectors.SinhArcsinh(
-          skewness=skewness, tailweight=tailweight, event_ndims=1)
+          skewness=skewness, tailweight=tailweight)
       if has_default_skewness:
         f_noskew = f
       else:
         f_noskew = bijectors.SinhArcsinh(
             skewness=skewness.dtype.as_numpy_dtype(0.),
-            tailweight=tailweight, event_ndims=0)
+            tailweight=tailweight)
 
       # Make the Affine bijector, Z --> loc + C * Z.
       c = 2 * scale_diag_part / f_noskew.forward(
           ops.convert_to_tensor(2, dtype=dtype))
       affine = bijectors.Affine(
-          shift=loc, scale_diag=c, validate_args=validate_args, event_ndims=1)
+          shift=loc, scale_diag=c, validate_args=validate_args)
 
       bijector = bijectors.Chain([affine, f])
 
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 8c67647a618d22a58428d78865c4ebf7d98bdf9e..7e78ded9df07564126b46b6beeeccf95bf1eef94 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -66,7 +66,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   This distribution is an Affine transformation of iid
   [Student's t-distributions](
   https://en.wikipedia.org/wiki/Student%27s_t-distribution)
-  and should not be confused with the [Multivate Student's t-distribution](
+  and should not be confused with the [Multivariate Student's t-distribution](
   https://en.wikipedia.org/wiki/Multivariate_t-distribution). The
   traditional Multivariate Student's t-distribution is type of
   [elliptical distribution](
@@ -178,7 +178,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
     parameters = locals()
     graph_parents = [df, loc, scale_identity_multiplier, scale_diag,
                      scale_tril, scale_perturb_factor, scale_perturb_diag]
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=graph_parents):
         # The shape of the _VectorStudentT distribution is governed by the
         # relationship between df.batch_shape and affine.batch_shape. In
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index e4ac65012b9c7e3ed5ada3ed75020f3905740156..91453fed5d279178a0e062b71dad3b0f957b11b4 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -109,7 +109,7 @@ class _WishartLinearOperator(distribution.Distribution):
     """
     parameters = locals()
     self._cholesky_input_output_matrices = cholesky_input_output_matrices
-    with ops.name_scope(name) as ns:
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[df, scale_operator]):
         if not scale_operator.dtype.is_floating:
           raise TypeError(
@@ -163,7 +163,7 @@ class _WishartLinearOperator(distribution.Distribution):
         parameters=parameters,
         graph_parents=([self._df, self._dimension] +
                        self._scale_operator.graph_parents),
-        name=ns)
+        name=name)
 
   @property
   def df(self):
@@ -228,9 +228,12 @@ class _WishartLinearOperator(distribution.Distribution):
     # Complexity: O(nbk)
     # This parametrization is equivalent to Chi2, i.e.,
     # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
+    expanded_df = self.df * array_ops.ones(
+        self.scale_operator.batch_shape_tensor(),
+        dtype=self.df.dtype.base_dtype)
     g = random_ops.random_gamma(shape=[n],
                                 alpha=self._multi_gamma_sequence(
-                                    0.5 * self.df, self.dimension),
+                                    0.5 * expanded_df, self.dimension),
                                 beta=0.5,
                                 dtype=self.dtype,
                                 seed=distribution_util.gen_new_seed(
@@ -528,7 +531,7 @@ class WishartCholesky(_WishartLinearOperator):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
@@ -644,7 +647,7 @@ class WishartFull(_WishartLinearOperator):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name) as ns:
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
@@ -663,5 +666,5 @@ class WishartFull(_WishartLinearOperator):
         cholesky_input_output_matrices=cholesky_input_output_matrices,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
-        name=ns)
+        name=name)
     self._parameters = parameters
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 9d2ca07c3a25fa7acb9b0f5806b763d9a57b51fa..9a3b780af888a597d2440b243ffb8dc98d764f18 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,12 +1,8 @@
 # Eager Execution
 
-> *WARNING*: This is a preview/pre-alpha version. The API and performance
-> characteristics are subject to change.
-
-Eager execution is an experimental interface to TensorFlow that provides an
-imperative programming style (à la [NumPy](http://www.numpy.org)). When you
-enable eager execution, TensorFlow operations execute immediately; you do not
-execute a pre-constructed graph with
+Eager execution provides an imperative interface to TensorFlow (similiar to
+[NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow
+operations execute immediately; you do not execute a pre-constructed graph with
 [`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session).
 
 For example, consider a simple computation in TensorFlow:
@@ -33,7 +29,7 @@ print(m)
 ## Caveats
 
 This feature is in early stages and work remains to be done in terms of smooth
-support for distributed and multi-GPU training and CPU performance.
+support for distributed and multi-GPU training and performance.
 
 - [Known issues](https://github.com/tensorflow/tensorflow/issues?q=is%3Aissue%20is%3Aopen%20label%3Acomp%3Aeager)
 - Feedback is welcome, please consider
@@ -41,21 +37,23 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Eager execution is included in TensorFlow versions 1.5 and above.
+Eager execution is included in TensorFlow versions 1.7 and above.
 Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
 For an introduction to eager execution in TensorFlow, see:
 
-- [User Guide](python/g3doc/guide.md)
+- [User Guide](https://www.tensorflow.org/programmers_guide/eager) ([source](../../docs_src/programmers_guide/eager.md))
 - Notebook: [Basic Usage](python/examples/notebooks/1_basics.ipynb)
 - Notebook: [Gradients](python/examples/notebooks/2_gradients.ipynb)
 - Notebook: [Importing Data](python/examples/notebooks/3_datasets.ipynb)
 
 ## Changelog
 
-- 2017/10/31: Initial preview release.
+- 2017/10/31: Initial preview release (in TensorFlow 1.5)
 - 2017/12/01: Example of dynamic neural network:
   [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
   See [README.md](python/examples/spinn/README.md) for details.
+- 2017/03: Core functionality moved out of the experimental tf.contrib namespace
+  in TensorFlow 1.7.
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
deleted file mode 100644
index aedfec8924e7314addd22349c0576a84a58d9aa3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/proto/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-tf_proto_library(
-    name = "checkpointable_object_graph_proto",
-    srcs = [
-        "checkpointable_object_graph.proto",
-    ],
-    visibility = ["//tensorflow/contrib/eager/python:__subpackages__"],
-)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index ad40e55cb48aac08eca7022846a0bd07b8accb3f..99abbae03fc14f241dae27f317902f7335819037 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -17,16 +17,15 @@ py_library(
         ":saver",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:numerics",
+        "//tensorflow/python:gradients",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python:template",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:core",
-        "//tensorflow/python/eager:custom_gradient",
         "//tensorflow/python/eager:execution_callbacks",
         "//tensorflow/python/eager:function",
     ],
@@ -69,14 +68,19 @@ cuda_py_test(
     srcs = ["datasets_test.py"],
     additional_deps = [
         ":datasets",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/contrib/data/python/ops:threadpool",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python/data",
         "//tensorflow/python/eager:test",
     ],
+    tags = ["noguitar"],
 )
 
 py_library(
@@ -115,13 +119,14 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
@@ -135,11 +140,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/contrib/summary:summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -156,10 +161,10 @@ py_library(
     deps = [
         ":datasets",
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
         "@six_archive//:six",
@@ -218,62 +223,3 @@ py_test(
         "//tensorflow/python/eager:test",
     ],
 )
-
-py_library(
-    name = "checkpointable_utils",
-    srcs = ["checkpointable_utils.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-py_test(
-    name = "checkpointable_utils_test",
-    srcs = ["checkpointable_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":checkpointable_utils",
-        ":network",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py
deleted file mode 100644
index 0506af391cf22ec89d5174f17208c8eb393ddc54..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ /dev/null
@@ -1,435 +0,0 @@
-"""Utilities for working with Checkpointable objects."""
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import checkpointable as core_checkpointable
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import saver as saver_lib
-
-
-_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
-
-# Keyword for identifying that the next bit of a checkpoint variable name is a
-# slot name. Checkpoint names for slot variables look like:
-#
-#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
-#
-# Where <path to variable> is a full path from the checkpoint root to the
-# variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
-# Keyword for separating the path to an object from the name of an
-# attribute in checkpoint names. Used like:
-#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
-_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-# Key where the object graph proto is saved in a TensorBundle
-_OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
-
-
-# TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
-# or consolidating the implementation with get_variable.
-def _default_getter(name, shape, dtype, initializer=None,
-                    partition_info=None, **kwargs):
-  """A pared-down version of get_variable which does not reuse variables."""
-  dtype = dtypes.as_dtype(dtype)
-  shape_object = tensor_shape.as_shape(shape)
-  with ops.init_scope():
-    if initializer is None:
-      initializer, initializing_from_value = (
-          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
-              name=name, shape=shape_object, dtype=dtype))
-    else:
-      initializing_from_value = not callable(initializer)
-    # Same logic as get_variable
-    variable_dtype = dtype.base_dtype
-    if initializing_from_value:
-      if shape is not None:
-        raise ValueError("If initializer is a constant, do not specify shape.")
-      initial_value = initializer
-    else:
-      # Instantiate initializer if provided initializer is a type object.
-      if isinstance(initializer, type(init_ops.Initializer)):
-        initializer = initializer(dtype=dtype)
-      def initial_value():
-        return initializer(
-            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
-    return resource_variable_ops.ResourceVariable(
-        initial_value=initial_value,
-        name=name,
-        dtype=variable_dtype,
-        **kwargs
-    )
-
-
-def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
-                 initializer=None):
-  """Add a variable to a Checkpointable with no scope influence."""
-  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
-      name=name, shape=shape, dtype=dtype,
-      initializer=initializer, getter=_default_getter)
-
-
-def _breadth_first_checkpointable_traversal(root_checkpointable):
-  """Find shortest paths to all variables owned by dependencies of root."""
-  bfs_sorted = []
-  to_visit = collections.deque([root_checkpointable])
-  path_to_root = {root_checkpointable: ()}
-  while to_visit:
-    current_checkpointable = to_visit.popleft()
-    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-    bfs_sorted.append(current_checkpointable)
-    for child_checkpointable in (
-        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
-      if child_checkpointable.ref not in path_to_root:
-        path_to_root[child_checkpointable.ref] = (
-            path_to_root[current_checkpointable] + (child_checkpointable,))
-        to_visit.append(child_checkpointable.ref)
-  return bfs_sorted, path_to_root
-
-
-def _escape_local_name(name):
-  # We need to support slashes in local names for compatibility, since this
-  # naming scheme is being patched in to things like Layer.add_variable where
-  # slashes were previously accepted. We also want to use slashes to indicate
-  # edges traversed to reach the variable, so we escape forward slashes in
-  # names.
-  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
-          .replace(r"/", _ESCAPE_CHAR + "S"))
-
-
-def _object_prefix_from_path(path_to_root):
-  return "/".join(
-      (_escape_local_name(checkpointable.name)
-       for checkpointable in path_to_root))
-
-
-def _slot_variable_naming_for_optimizer(optimizer_path):
-  """Make a function for naming slot variables in an optimizer."""
-  # Name slot variables:
-  #
-  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
-  #
-  # where <variable name> is exactly the checkpoint name used for the original
-  # variable, including the path from the checkpoint root and the local name in
-  # the object which owns it. Note that we only save slot variables if the
-  # variable it's slotting for is also being saved.
-
-  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
-
-  def _name_slot_variable(variable_path, slot_name):
-    """With an optimizer specified, name a slot variable."""
-    return (variable_path
-            + optimizer_identifier
-            + _escape_local_name(slot_name))
-
-  return _name_slot_variable
-
-
-def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
-  """Gather and name slot variables."""
-  non_slot_objects = list(checkpointable_objects)
-  slot_variables = {}
-  for checkpointable in non_slot_objects:
-    if isinstance(checkpointable, optimizer_lib.Optimizer):
-      naming_scheme = _slot_variable_naming_for_optimizer(
-          optimizer_path=object_names[checkpointable])
-      slot_names = checkpointable.get_slot_names()
-      for slot_name in slot_names:
-        for original_variable_node_id, original_variable in enumerate(
-            non_slot_objects):
-          try:
-            slot_variable = checkpointable.get_slot(
-                original_variable, slot_name)
-          except AttributeError:
-            slot_variable = None
-          if slot_variable is None:
-            continue
-          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
-            # TODO(allenl): Gather dependencies of slot variables.
-            raise NotImplementedError(
-                "Currently only variables with no dependencies can be saved as "
-                "slot variables. File a feature request if this limitation "
-                "bothers you.")
-          if slot_variable in node_ids:
-            raise NotImplementedError(
-                "A slot variable was re-used as a dependency of a "
-                "Checkpointable object. This is not currently allowed. File a "
-                "feature request if this limitation bothers you.")
-          checkpoint_name = naming_scheme(
-              variable_path=object_names[original_variable],
-              slot_name=slot_name)
-          object_names[slot_variable] = checkpoint_name
-          slot_variable_node_id = len(checkpointable_objects)
-          node_ids[slot_variable] = slot_variable_node_id
-          checkpointable_objects.append(slot_variable)
-          slot_variable_proto = (
-              checkpointable_object_graph_pb2.CheckpointableObjectGraph
-              .Object.SlotVariableReference(
-                  slot_name=slot_name,
-                  original_variable_node_id=original_variable_node_id,
-                  slot_variable_node_id=slot_variable_node_id))
-          slot_variables.setdefault(checkpointable, []).append(
-              slot_variable_proto)
-  return slot_variables
-
-
-def _serialize_checkpointables(
-    checkpointable_objects, node_ids, object_names, slot_variables):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  named_saveables = {}
-
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
-    object_name = object_names[checkpointable]
-    for name, saveable in (
-        checkpointable._gather_tensors_for_checkpoint().items()):  # pylint: disable=protected-access
-      attribute = object_proto.attributes.add()
-      attribute.name = name
-      attribute.checkpoint_key = "%s/%s/%s" % (
-          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      # Figure out the name-based Saver's name for this variable.
-      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
-          [saveable], convert_variable_to_tensor=False)
-      attribute.full_name, = saver_dict.keys()
-      named_saveables[attribute.checkpoint_key] = saveable
-
-    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      child_proto = object_proto.children.add()
-      child_proto.node_id = node_ids[child.ref]
-      child_proto.local_name = child.name
-
-  return named_saveables, object_graph_proto
-
-
-def _serialize_object_graph(root_checkpointable):
-  """Determine checkpoint keys for variables and build a serialized graph.
-
-  Non-slot variables are keyed based on a shortest path from the root saveable
-  to the object which owns the variable (i.e. the one which called
-  `Checkpointable._add_variable` to create it).
-
-  Slot variables are keyed based on a shortest path to the variable being
-  slotted for, a shortest path to their optimizer, and the slot name.
-
-  Args:
-    root_checkpointable: A `Checkpointable` object whose variables (including
-      the variables of dependencies, recursively) should be saved.
-
-  Returns:
-    A tuple of (named_variables, object_graph_proto):
-      named_variables: A dictionary mapping names to variable objects.
-      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
-        the serialized object graph and variable references.
-
-  Raises:
-    ValueError: If there are invalid characters in an optimizer's slot names.
-  """
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = {
-      obj: _object_prefix_from_path(path)
-      for obj, path in path_to_root.items()}
-  node_ids = {node: node_id for node_id, node
-              in enumerate(checkpointable_objects)}
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return _serialize_checkpointables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names,
-      slot_variables=slot_variables)
-
-
-class _NoRestoreSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
-
-  def __init__(self, tensor, name):
-    spec = saver_lib.BaseSaverBuilder.SaveSpec(tensor, "", name)
-    super(_NoRestoreSaveable, self).__init__(tensor, [spec], name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    return control_flow_ops.no_op()
-
-
-def save(file_prefix, root_checkpointable, checkpoint_number=None,
-         session=None):
-  """Save a training checkpoint.
-
-  Args:
-    file_prefix: A prefix to use for the checkpoint filenames
-      (/path/to/directory/and_a_prefix). Names are generated based on this
-      prefix and the global step, if provided.
-    root_checkpointable: A Checkpointable object to save. The checkpoint
-      includes variables created by this object and any Checkpointable objects
-      it depends on.
-    checkpoint_number: An integer variable or Tensor, used to number
-      checkpoints. Typically this value is saved along with other variables in
-      training checkpoints, which will happen automatically if it was created by
-      `root_checkpointable` or one of its dependencies (via
-      `Checkpointable._add_variable`).
-    session: The session to evaluate variables in. Ignored when executing
-      eagerly. If not provided when graph building, the default session is used.
-
-  Returns:
-    The full path to the checkpoint.
-  """
-  named_variables, serialized_graph = _serialize_object_graph(
-      root_checkpointable)
-  if context.in_graph_mode():
-    if session is None:
-      session = ops.get_default_session()
-  else:
-    session = None
-  assert _OBJECT_GRAPH_PROTO_KEY not in named_variables
-  # TODO(allenl): Feed rather than embedding a constant.
-  named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
-      tensor=constant_op.constant(
-          serialized_graph.SerializeToString(), dtype=dtypes.string),
-      name=_OBJECT_GRAPH_PROTO_KEY)
-  with ops.device("/device:CPU:0"):
-    save_path = saver_lib.Saver(var_list=named_variables).save(
-        sess=session,
-        save_path=file_prefix,
-        write_meta_graph=False,
-        global_step=checkpoint_number)
-  return save_path
-
-
-class CheckpointLoadStatus(object):
-  """Checks the status of checkpoint loading."""
-
-  def __init__(self, checkpoint):
-    self._checkpoint = checkpoint
-
-  def assert_consumed(self):
-    """Asserts that all objects in the checkpoint have been created/matched."""
-    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if checkpointable is None:
-        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
-      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
-        raise AssertionError(
-            "Object not assigned a value from checkpoint: %s" % (node,))
-    if self._checkpoint.slot_restorations:
-      # Sanity check; this collection should be clear if everything has been
-      # restored.
-      raise AssertionError("Unresolved slot restorations: %s" % (
-          self._checkpoint.slot_restorations,))
-    return self
-
-  @property
-  def restore_ops(self):
-    """Operations to restore objects in the dependency graph."""
-    return self._checkpoint.restore_ops
-
-
-def restore(save_path, root_checkpointable, session=None):
-  """Restore a training checkpoint.
-
-  Restores the values of variables created with `Checkpointable._add_variable`
-  in `root_checkpointable` and any objects that it tracks (transitive). Either
-  assigns values immediately if variables to restore have been created already,
-  or defers restoration until the variables are created. Dependencies added to
-  `root_checkpointable` after this call will be matched if they have a
-  corresponding object in the checkpoint.
-
-  When building a graph, restorations are added to the graph but not run. A
-  session is required to retrieve checkpoint metadata.
-
-  To disallow deferred loading, assert immediately that all checkpointed
-  variables have been matched to variable objects:
-
-  ```python
-  restore(path, root).assert_consumed()
-  ```
-
-  An exception will be raised unless every object was matched and its variables
-  already exist.
-
-  When graph building, `assert_consumed()` indicates that all of the restore ops
-  which will be created for this checkpoint have been created. They are
-  available in the `restore_ops` property of the status object:
-
-  ```python
-  session.run(restore(path, root).assert_consumed().restore_ops)
-  ```
-
-  If the checkpoint has not been consumed completely, then the list of
-  `restore_ops` will grow as more objects are added to the dependency graph.
-
-  Args:
-    save_path: The path to the checkpoint, as returned by `save` or
-      `tf.train.latest_checkpoint`. If None (as when there is no latest
-      checkpoint for `tf.train.latest_checkpoint` to return), does nothing.
-    root_checkpointable: The root of the object graph to restore. Variables to
-      restore need not have been created yet, but all dependencies on other
-      `Checkpointable` objects should already be declared. Objects in the
-      dependency graph are matched to objects in the checkpointed graph, and
-      matching objects have their variables restored (or the checkpointed values
-      saved for eventual restoration when the variable is created).
-    session: The session to retrieve metadata with. Ignored when executing
-      eagerly. If not provided when graph building, the default session is used.
-  Returns:
-    A `CheckpointLoadStatus` object, which can be used to make assertions about
-    the status of checkpoint restoration and fetch restore ops.
-  """
-  if save_path is None:
-    return
-  if context.in_graph_mode():
-    if session is None:
-      session = ops.get_default_session()
-  else:
-    session = None
-  object_graph_string, = io_ops.restore_v2(
-      prefix=save_path,
-      tensor_names=[_OBJECT_GRAPH_PROTO_KEY],
-      shape_and_slices=[""],
-      dtypes=[dtypes.string],
-      name="object_graph_proto_read")
-  if session is not None:
-    object_graph_string = session.run(object_graph_string)
-  else:
-    object_graph_string = object_graph_string.numpy()
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  object_graph_proto.ParseFromString(object_graph_string)
-  checkpoint = core_checkpointable._Checkpoint(  # pylint: disable=protected-access
-      object_graph_proto=object_graph_proto,
-      save_path=save_path)
-  core_checkpointable._CheckpointPosition(  # pylint: disable=protected-access
-      checkpoint=checkpoint, proto_id=0).restore(root_checkpointable)
-  load_status = CheckpointLoadStatus(checkpoint)
-  return load_status
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
deleted file mode 100644
index 21ba6adc6a26a11783264be2e217373453224e79..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ /dev/null
@@ -1,886 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-import unittest
-
-import six
-
-from tensorflow.contrib.eager.python import checkpointable_utils
-from tensorflow.contrib.eager.python import network as network_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.layers import base
-from tensorflow.python.layers import core
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.training import adam
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import saver as core_saver
-from tensorflow.python.training import training_util
-
-
-class CheckpointableDenseLayer(core.Dense, checkpointable.Checkpointable):
-
-  def __init__(self, *args, **kwargs):
-    checkpointable.Checkpointable.__init__(self)
-    core.Dense.__init__(self, *args, **kwargs)
-
-  def add_variable(self, name, shape, **kwargs):
-    # Calls both Checkpointable._add_variable and Layer.add_variable. Eventually
-    # Layer.add_variable should inherit from Checkpointable and simply call
-    # super and then do post-processing.
-    return checkpointable.Checkpointable._add_variable_with_custom_getter(
-        self,
-        name=name,
-        shape=shape,
-        getter=functools.partial(core.Dense.add_variable, self),
-        **kwargs)
-
-
-# pylint: disable=not-callable
-class CheckpointableNetwork(network_lib.Network, checkpointable.Checkpointable):
-
-  def __setattr__(self, name, value):
-    if isinstance(value, base.Layer):
-      self.track_layer(value, name=name)
-    # Checkpointable is next in the method resolution order, so this will catch
-    # Checkpointable objects which aren't Layers.
-    super(CheckpointableNetwork, self).__setattr__(name, value)
-
-  def track_layer(self, layer, name):
-    self._track_checkpointable(layer, name=name)
-    return super(CheckpointableNetwork, self).track_layer(layer)
-
-
-class CheckpointableAdam(adam.AdamOptimizer, checkpointable.Checkpointable):
-
-  # NOTE: Copied from Optimizer with modifications to use add_variable
-  # for non-slot variables. These contortions are necessary to maintain
-  # checkpoint compatibility with variable.name based saving.
-  # TODO(allenl): Make this cleaner.
-  def _create_non_slot_variable(self, initial_value, name, colocate_with):
-    """Add an extra variable, not associated with a slot."""
-    if context.in_graph_mode():
-      graph = colocate_with.graph
-    else:
-      graph = None
-
-    key = (name, graph)
-    v = self._non_slot_dict.get(key, None)
-    if v is None:
-      with ops.colocate_with(colocate_with):
-        def _variable_getter(name, shape, dtype, initializer):
-          del shape, dtype  # not used, but there for compatibility
-          return variable_scope.variable(
-              name=name, initial_value=initializer, trainable=False)
-
-        initial_value = ops.convert_to_tensor(initial_value)
-        v = self._add_variable_with_custom_getter(
-            name=name,
-            shape=initial_value.get_shape(),
-            initializer=initial_value,
-            getter=_variable_getter)
-
-      self._non_slot_dict[key] = v
-
-    return v
-
-
-class NonLayerCheckpointable(checkpointable.Checkpointable):
-
-  def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
-        self, name="a_variable", shape=[])
-
-
-class MyNetwork(CheckpointableNetwork):
-  """A concrete Network for testing."""
-
-  def __init__(self):
-    super(MyNetwork, self).__init__()
-    self._named_dense = CheckpointableDenseLayer(1, use_bias=True)
-    self._via_track_layer = self.track_layer(
-        CheckpointableDenseLayer(1, use_bias=False), name="via_track_layer")
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
-
-  def call(self, values):
-    return self._via_track_layer(self._named_dense(values))
-
-
-class Checkpoint(checkpointable.Checkpointable):
-  """A utility class which groups `Checkpointable` objects."""
-
-  def __init__(self, **kwargs):
-    super(Checkpoint, self).__init__()
-    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      setattr(self, k, v)
-    self._save_counter = None
-
-  @property
-  def save_counter(self):
-    """An integer variable which starts at zero and is incremented on save.
-
-    Used to number checkpoints.
-
-    Returns:
-      The save counter variable.
-    """
-    if self._save_counter is None:
-      # Initialized to 0 and incremented before saving.
-      self._save_counter = checkpointable_utils.add_variable(
-          self, name="save_counter", initializer=0, dtype=dtypes.int64)
-    return self._save_counter
-
-  def save(self, file_prefix, session=None):
-    assign_op = self.save_counter.assign_add(1)
-    if context.in_graph_mode():
-      if session is None:
-        session = ops.get_default_session()
-      session.run(assign_op)
-    return checkpointable_utils.save(
-        file_prefix=file_prefix,
-        root_checkpointable=self,
-        checkpoint_number=self.save_counter,
-        session=session)
-
-  def restore(self, save_path):
-    return checkpointable_utils.restore(
-        save_path=save_path,
-        root_checkpointable=self)
-
-
-class InterfaceTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testAddVariable(self):
-    obj = NonLayerCheckpointable()
-    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
-      checkpointable_utils.add_variable(
-          obj, name="shape_specified_twice", shape=[], initializer=1)
-    constant_initializer = checkpointable_utils.add_variable(
-        obj, name="constant_initializer", initializer=1)
-    with variable_scope.variable_scope("some_variable_scope"):
-      ones_initializer = checkpointable_utils.add_variable(
-          obj,
-          name="ones_initializer",
-          shape=[2],
-          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
-    bare_initializer = checkpointable_utils.add_variable(
-        obj,
-        name="bare_initializer",
-        shape=[2, 2],
-        dtype=dtypes.float64,
-        initializer=init_ops.zeros_initializer)
-
-    # Even in graph mode, there are no naming conflicts between objects, only
-    # naming conflicts within an object.
-    other_duplicate = resource_variable_ops.ResourceVariable(
-        name="duplicate", initial_value=1.)
-    duplicate = checkpointable_utils.add_variable(
-        obj, name="duplicate", shape=[])
-    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
-      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
-
-    if context.in_graph_mode():
-      self.evaluate(variables.global_variables_initializer())
-    self.assertEqual("constant_initializer:0", constant_initializer.name)
-    self.assertEqual(1, self.evaluate(constant_initializer))
-    self.assertEqual("some_variable_scope/ones_initializer:0",
-                     ones_initializer.name)
-    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
-    self.assertAllEqual([[0., 0.],
-                         [0., 0.]], self.evaluate(bare_initializer))
-    self.assertEqual("a_variable:0", obj.a_variable.name)
-    self.assertEqual("duplicate:0", other_duplicate.name)
-    if context.in_graph_mode():
-      # The .name attribute may be globally influenced, but the checkpoint name
-      # won't be (tested below).
-      self.assertEqual("duplicate_1:0", duplicate.name)
-    else:
-      # When executing eagerly, there's no uniquification of variable names. The
-      # checkpoint name will be the same.
-      self.assertEqual("duplicate:0", duplicate.name)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
-    expected_checkpoint_names = (
-        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
-        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
-        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-    )
-    six.assertCountEqual(
-        self, expected_checkpoint_names, named_variables.keys())
-
-  def testInitNotCalled(self):
-
-    class NoInit(checkpointable.Checkpointable):
-
-      def __init__(self):
-        pass
-
-    # __init__ for Checkpointable will be called implicitly.
-    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
-
-  def testShapeDtype(self):
-    root = checkpointable.Checkpointable()
-    v1 = checkpointable_utils.add_variable(
-        root, name="v1", initializer=3., dtype=dtypes.float64)
-    self.assertEqual(dtypes.float64, v1.dtype)
-    v2 = checkpointable_utils.add_variable(
-        root,
-        name="v2",
-        shape=[3],
-        initializer=init_ops.ones_initializer,
-        dtype=dtypes.float64)
-    self.assertEqual(dtypes.float64, v2.dtype)
-    self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
-
-
-class CheckpointingTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testNamingWithOptimizer(self):
-    input_value = constant_op.constant([[3.]])
-    network = MyNetwork()
-    # A nuisance Network using the same optimizer. Its slot variables should not
-    # go in the checkpoint, since it is never depended on.
-    other_network = MyNetwork()
-    optimizer = CheckpointableAdam(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = Checkpoint(
-        optimizer=optimizer, network=network, optimizer_step=optimizer_step)
-    if context.in_eager_mode():
-      optimizer.minimize(
-          lambda: network(input_value),
-          global_step=optimizer_step)
-      optimizer.minimize(
-          lambda: other_network(input_value),
-          global_step=optimizer_step)
-    else:
-      train_op = optimizer.minimize(
-          network(input_value), global_step=optimizer_step)
-      optimizer.minimize(
-          other_network(input_value),
-          global_step=optimizer_step)
-      self.evaluate(variables.global_variables_initializer())
-      self.evaluate(train_op)
-    named_variables, serialized_graph = (
-        checkpointable_utils._serialize_object_graph(root_checkpointable))
-    expected_checkpoint_names = (
-        # Created in the root node, so no prefix.
-        "optimizer_step",
-        # No name provided to track_checkpointable(), so the position is used
-        # instead (one-based).
-        "network/via_track_layer/kernel",
-        # track_checkpointable() with a name provided, so that's used
-        "network/_named_dense/kernel",
-        "network/_named_dense/bias",
-        # non-Layer dependency of the network
-        "network/_non_layer/a_variable",
-        # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
-        # Slot variables
-        "network/via_track_layer/kernel/.OPTIMIZER_SLOT/optimizer/m",
-        "network/via_track_layer/kernel/.OPTIMIZER_SLOT/optimizer/v",
-        "network/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
-        "network/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
-        "network/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
-        "network/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
-    )
-    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
-    expected_checkpoint_names = [
-        name + suffix for name in expected_checkpoint_names]
-    six.assertCountEqual(self, expected_checkpoint_names,
-                         named_variables.keys())
-    # Check that we've mapped to the right variable objects (not exhaustive)
-    self.assertEqual(
-        "global_step:0",
-        named_variables["optimizer_step" + suffix].name)
-    self.assertEqual(
-        "my_network/checkpointable_dense_layer_1/kernel:0",
-        named_variables["network/via_track_layer/kernel" + suffix].name)
-    self.assertEqual(
-        "my_network/checkpointable_dense_layer/kernel:0",
-        named_variables["network/_named_dense/kernel" + suffix].name)
-    self.assertEqual(
-        "beta1_power:0",
-        named_variables["optimizer/beta1_power" + suffix].name)
-    self.assertEqual(
-        "beta2_power:0",
-        named_variables["optimizer/beta2_power" + suffix].name)
-    # Spot check the generated protocol buffers.
-    self.assertEqual("optimizer",
-                     serialized_graph.nodes[0].children[1].local_name)
-    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
-        1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
-    self.assertEqual(
-        "my_network/checkpointable_dense_layer/kernel",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .original_variable_node_id]
-        .attributes[0].full_name)
-    # We strip off the :0 suffix, as variable.name-based saving does.
-    self.assertEqual(
-        "my_network/checkpointable_dense_layer/kernel/Adam",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .slot_variable_node_id]
-        .attributes[0].full_name)
-    self.assertEqual(
-        "my_network/checkpointable_dense_layer/kernel/Adam:0",
-        optimizer.get_slot(
-            var=named_variables["network/_named_dense/kernel" + suffix],
-            name="m").name)
-    self.assertEqual(
-        "network/_named_dense/kernel" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .original_variable_node_id].attributes[0].checkpoint_key)
-    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
-    self.assertEqual(
-        "network/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .slot_variable_node_id].attributes[0].checkpoint_key)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testSaveRestore(self):
-    network = MyNetwork()
-    optimizer = CheckpointableAdam(0.001)
-    root_checkpointable = Checkpoint(optimizer=optimizer, network=network)
-    input_value = constant_op.constant([[3.]])
-    if context.in_eager_mode():
-      optimizer.minimize(
-          lambda: network(input_value))
-    else:
-      train_op = optimizer.minimize(network(input_value))
-      # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(variables.global_variables_initializer())
-      self.evaluate(train_op)
-    prefix = os.path.join(self.get_temp_dir(), "ckpt")
-    self.evaluate(state_ops.assign(network._named_dense.variables[1], [42.]))
-    m_bias_slot = optimizer.get_slot(network._named_dense.variables[1], "m")
-    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
-    self.evaluate(state_ops.assign(network._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
-    optimizer_variables = self.evaluate(optimizer.variables())
-    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
-    # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
-    self.evaluate(status.restore_ops)
-    self.assertAllEqual([42.], self.evaluate(network._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
-    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
-    if context.in_graph_mode():
-      return  # Restore-on-create is only supported when executing eagerly
-    on_create_network = MyNetwork()
-    on_create_optimizer = CheckpointableAdam(0.001)
-    on_create_root = Checkpoint(
-        optimizer=on_create_optimizer, network=on_create_network)
-    # Deferred restoration
-    status = on_create_root.restore(save_path=save_path)
-    on_create_network(constant_op.constant([[3.]]))  # create variables
-    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
-    self.assertAllEqual([42.],
-                        self.evaluate(
-                            on_create_network._named_dense.variables[1]))
-    on_create_m_bias_slot = on_create_optimizer.get_slot(
-        on_create_network._named_dense.variables[1], "m")
-    # Optimizer slot variables are created when the original variable is
-    # restored.
-    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
-    self.assertAllEqual(optimizer_variables[2:],
-                        self.evaluate(on_create_optimizer.variables()))
-    on_create_optimizer._create_slots(
-        [resource_variable_ops.ResourceVariable([1.])])
-    status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
-
-  def testDeferredRestorationUsageEager(self):
-    """An idiomatic eager execution example."""
-    num_training_steps = 10
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    for training_continuation in range(3):
-      network = MyNetwork()
-      optimizer = CheckpointableAdam(0.001)
-      root = Checkpoint(
-          optimizer=optimizer, network=network,
-          optimizer_step=training_util.get_or_create_global_step())
-      root.restore(core_saver.latest_checkpoint(checkpoint_directory))
-      for _ in range(num_training_steps):
-        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
-        input_value = constant_op.constant([[3.]])
-        optimizer.minimize(
-            lambda: network(input_value),  # pylint: disable=cell-var-from-loop
-            global_step=root.optimizer_step)
-      root.save(file_prefix=checkpoint_prefix)
-      self.assertEqual((training_continuation + 1) * num_training_steps,
-                       root.optimizer_step.numpy())
-
-  def testUsageGraph(self):
-    """Expected usage when graph building."""
-    with context.graph_mode():
-      num_training_steps = 10
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      for training_continuation in range(3):
-        with ops.Graph().as_default():
-          network = MyNetwork()
-          optimizer = CheckpointableAdam(0.001)
-          root = Checkpoint(
-              optimizer=optimizer, network=network,
-              global_step=training_util.get_or_create_global_step())
-          input_value = constant_op.constant([[3.]])
-          train_op = optimizer.minimize(
-              network(input_value),
-              global_step=root.global_step)
-          root.save_counter  # pylint: disable=pointless-statement
-          init_op = variables.global_variables_initializer()
-          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
-          with self.test_session(graph=ops.get_default_graph()) as session:
-            if checkpoint_path is None:
-              self.assertEqual(0, training_continuation)
-              session.run(init_op)
-              # Another alternative would be to run initializers automatically
-              # if no checkpoint is being loaded. This would make deferred
-              # loading a bit more useful with graph execution.
-            else:
-              status = checkpointable_utils.restore(
-                  save_path=checkpoint_path,
-                  root_checkpointable=root,
-                  session=session).assert_consumed()
-              session.run(status.restore_ops)
-            for _ in range(num_training_steps):
-              session.run(train_op)
-            root.save(file_prefix=checkpoint_prefix,
-                      session=session)
-            self.assertEqual((training_continuation + 1) * num_training_steps,
-                             session.run(root.global_step))
-            self.assertEqual(training_continuation + 1,
-                             session.run(root.save_counter))
-
-  def _get_checkpoint_name(self, name):
-    root = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    checkpoint_name, = named_variables.keys()
-    with ops.name_scope("root/" + checkpoint_name):
-      pass  # Make sure we can use this as an op name if we prefix it.
-    return checkpoint_name
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testVariableNameEscaping(self):
-    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
-    self.assertEqual(r"a.Sb.Sc" + suffix, self._get_checkpoint_name(r"a/b/c"))
-    self.assertEqual(r"b" + suffix, self._get_checkpoint_name(r"b"))
-    self.assertEqual(r"c.S" + suffix, self._get_checkpoint_name(r"c/"))
-    self.assertEqual(r"d.S..S" + suffix, self._get_checkpoint_name(r"d/.S"))
-    self.assertEqual(r"d.S..ATTRIBUTES.Sf" + suffix,
-                     self._get_checkpoint_name(r"d/.ATTRIBUTES/f"))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testNumberedPath(self):
-    root = checkpointable.Checkpointable()
-    leaf = checkpointable.Checkpointable()
-    root.leaf = leaf
-    checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    variable_name, = named_variables.keys()
-    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLocalNameValidation(self):
-    root = checkpointable.Checkpointable()
-    leaf = checkpointable.Checkpointable()
-    # Dots are escaped, which avoids conflicts with reserved names.
-    root._track_checkpointable(leaf, name=".ATTRIBUTES")
-    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    name, = named_variables.keys()
-    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLateDependencyTracking(self):
-
-    class Dependency(checkpointable.Checkpointable):
-
-      def build(self):
-        self.var = checkpointable_utils.add_variable(
-            self, "var", initializer=0.)
-
-    class LateDependencies(checkpointable.Checkpointable):
-
-      def add_dep(self):
-        self.dep = Dependency()
-        self.dep.build()
-
-    original = LateDependencies()
-    original.add_dep()
-    self.evaluate(state_ops.assign(original.dep.var, 123.))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.save(checkpoint_prefix, original)
-    load_into = LateDependencies()
-    status = checkpointable_utils.restore(save_path, load_into)
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
-    load_into.add_dep()
-    status.assert_consumed()
-    self.evaluate(status.restore_ops)
-    self.assertEqual(123., self.evaluate(load_into.dep.var))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDepAfterVar(self):
-
-    class Dependency(checkpointable.Checkpointable):
-
-      def build(self):
-        self.var = checkpointable_utils.add_variable(
-            self, "var", initializer=0.)
-
-    class DepAfterVar(checkpointable.Checkpointable):
-
-      def add_dep(self):
-        dep = Dependency()
-        dep.build()
-        self.dep = dep
-
-    dep_after_var = DepAfterVar()
-    dep_after_var.add_dep()
-    self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.save(
-        checkpoint_prefix, dep_after_var)
-
-    loaded_dep_after_var = DepAfterVar()
-    status = checkpointable_utils.restore(
-        save_path, loaded_dep_after_var)
-    loaded_dep_after_var.add_dep()
-    status.assert_consumed()
-    self.evaluate(status.restore_ops)
-    self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testDeferredSlotRestoration(self):
-    checkpoint_directory = self.get_temp_dir()
-
-    root = checkpointable.Checkpointable()
-    root.var = checkpointable_utils.add_variable(
-        root, name="var", initializer=0.)
-    optimizer = CheckpointableAdam(0.1)
-    if context.in_graph_mode():
-      train_op = optimizer.minimize(root.var)
-      self.evaluate(variables.global_variables_initializer())
-      self.evaluate(train_op)
-    else:
-      optimizer.minimize(root.var.read_value)
-    self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "no_slots"), root)
-    root.optimizer = optimizer
-    self.evaluate(state_ops.assign(root.var, 13.))
-    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
-                                   14.))
-    slots_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "with_slots"), root)
-    new_root = checkpointable.Checkpointable()
-    # Load the slot-containing checkpoint (deferred), then immediately overwrite
-    # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.restore(
-        slots_path, new_root)
-    no_slot_status = checkpointable_utils.restore(
-        no_slots_path, new_root)
-    with self.assertRaises(AssertionError):
-      no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
-        new_root, name="var", shape=[])
-    no_slot_status.assert_consumed()
-    self.evaluate(no_slot_status.restore_ops)
-    self.assertEqual(12., self.evaluate(new_root.var))
-    new_root.optimizer = CheckpointableAdam(0.1)
-    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
-      slot_status.assert_consumed()
-    self.assertEqual(12., self.evaluate(new_root.var))
-    if context.in_eager_mode():
-      # Slot variables are only created with restoring initializers when
-      # executing eagerly.
-      self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-    else:
-      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
-                    None)
-    if context.in_graph_mode():
-      train_op = new_root.optimizer.minimize(new_root.var)
-      # The slot variable now exists; restore() didn't create it, but we should
-      # now have a restore op for it.
-      self.evaluate(slot_status.restore_ops)
-      self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-      self.evaluate(train_op)
-    else:
-      new_root.optimizer.minimize(new_root.var.read_value)
-    slot_status.assert_consumed()
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testOverlappingRestores(self):
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep = checkpointable.Checkpointable()
-    save_root.dep.var = checkpointable_utils.add_variable(
-        save_root.dep, name="var", initializer=0.)
-    self.evaluate(state_ops.assign(save_root.dep.var, 12.))
-    first_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "first"), save_root)
-    self.evaluate(state_ops.assign(save_root.dep.var, 13.))
-    second_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "second"), save_root)
-
-    first_root = checkpointable.Checkpointable()
-    second_root = checkpointable.Checkpointable()
-    first_status = checkpointable_utils.restore(
-        first_path, first_root)
-    second_status = checkpointable_utils.restore(
-        second_path, second_root)
-    load_dep = checkpointable.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
-        load_dep, name="var", shape=[])
-    first_root.dep = load_dep
-    first_status.assert_consumed()
-    self.evaluate(first_status.restore_ops)
-    self.assertEqual([], second_status.restore_ops)
-    self.assertEqual(12., self.evaluate(load_dep.var))
-    second_root.dep = load_dep
-    second_status.assert_consumed()
-    self.evaluate(second_status.restore_ops)
-    self.assertEqual(13., self.evaluate(load_dep.var))
-
-    # Try again with the order of the restore() reversed. The last restore
-    # determines the final value.
-    first_root = checkpointable.Checkpointable()
-    second_root = checkpointable.Checkpointable()
-    second_status = checkpointable_utils.restore(
-        second_path, second_root)
-    first_status = checkpointable_utils.restore(
-        first_path, first_root)
-    load_dep = checkpointable.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
-        load_dep, name="var", shape=[])
-    first_root.dep = load_dep
-    first_status.assert_consumed()
-    self.assertEqual([], second_status.restore_ops)
-    self.evaluate(first_status.restore_ops)
-    self.assertEqual(12., self.evaluate(load_dep.var))
-    second_root.dep = load_dep
-    second_status.assert_consumed()
-    self.evaluate(second_status.restore_ops)
-    self.assertEqual(12., self.evaluate(load_dep.var))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testAmbiguousLoad(self):
-    # Not OK to split one checkpoint object into two
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep_one = checkpointable.Checkpointable()
-    save_root.dep_two = checkpointable.Checkpointable()
-    dep_three = checkpointable.Checkpointable()
-    save_root.dep_one.dep_three = dep_three
-    save_root.dep_two.dep_three = dep_three
-    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
-    self.evaluate(variables.global_variables_initializer())
-    save_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "ckpt"), save_root)
-    load_root = checkpointable.Checkpointable()
-    checkpointable_utils.restore(save_path, load_root)
-    load_root.dep_one = checkpointable.Checkpointable()
-    load_root.dep_two = checkpointable.Checkpointable()
-    load_root.dep_one.dep_three = checkpointable.Checkpointable()
-    with self.assertRaisesRegexp(AssertionError,
-                                 "resolved to different objects"):
-      load_root.dep_two.dep_three = checkpointable.Checkpointable()
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testObjectsCombined(self):
-    # Currently fine to load two checkpoint objects into one Python object
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep_one = checkpointable.Checkpointable()
-    save_root.dep_two = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
-    checkpointable_utils.add_variable(
-        save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
-    self.evaluate(variables.global_variables_initializer())
-    save_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "ckpt"), save_root)
-    load_root = checkpointable.Checkpointable()
-    load_root.dep_one = checkpointable.Checkpointable()
-    load_root.dep_two = load_root.dep_one
-    v1 = checkpointable_utils.add_variable(
-        load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
-    v2 = checkpointable_utils.add_variable(
-        load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
-    status = checkpointable_utils.restore(
-        save_path, load_root).assert_consumed()
-    self.evaluate(status.restore_ops)
-    self.assertEqual(32., self.evaluate(v1))
-    self.assertEqual(64., self.evaluate(v2))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDependencyLoop(self):
-    # Note: this test creates garbage during eager execution because it
-    # purposefully creates a reference cycle.
-    first = checkpointable.Checkpointable()
-    second = checkpointable.Checkpointable()
-    first.second = second
-    second.first = first
-    first.v = checkpointable_utils.add_variable(
-        first, "v1", initializer=[3., 1., 4.])
-    second.v = checkpointable_utils.add_variable(
-        second, "v2", initializer=[1., 1., 2., 3.])
-    self.evaluate(variables.global_variables_initializer())
-    checkpoint_directory = self.get_temp_dir()
-    save_path = checkpointable_utils.save(
-        os.path.join(checkpoint_directory, "ckpt"), first)
-
-    # Test deferred loading
-    first_load = checkpointable.Checkpointable()
-    status = checkpointable_utils.restore(save_path, first_load)
-    second_load = checkpointable.Checkpointable()
-    first_load.second = second_load
-    second_load.first = first_load
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
-    first_load.v = checkpointable_utils.add_variable(
-        first_load, "v1", shape=[3])
-    second_load.v = checkpointable_utils.add_variable(
-        second_load, "v2", shape=[4])
-    status.assert_consumed()
-    self.evaluate(status.restore_ops)
-    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
-    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
-
-    # Test loading when variables have already been created
-    self.evaluate(first_load.v.assign([2., 7., 1.]))
-    self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
-    self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
-    self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
-    status = checkpointable_utils.restore(
-        save_path, first_load).assert_consumed()
-    self.evaluate(status.restore_ops)
-    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
-    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testRestoreOnAssign(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_graph = ops.Graph()
-    with save_graph.as_default(), self.test_session(save_graph):
-      first = checkpointable.Checkpointable()
-      first.var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      first.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      self.evaluate(first.var1.assign(4.))
-      self.evaluate(first.var2.assign(8.))
-      save_path = checkpointable_utils.save(
-          checkpoint_prefix, root_checkpointable=first)
-    restore_graph = ops.Graph()
-    with restore_graph.as_default(), self.test_session(restore_graph):
-      second = checkpointable.Checkpointable()
-      second.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      status = checkpointable_utils.restore(
-          save_path, root_checkpointable=second)
-      recreated_var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      self.evaluate(status.restore_ops)
-      self.assertEqual(8., self.evaluate(second.var2))
-      self.evaluate(recreated_var1.assign(-2.))
-      self.assertEqual(-2., self.evaluate(recreated_var1))
-      second.var1 = recreated_var1
-      self.evaluate(status.restore_ops)
-      self.assertEqual(4., self.evaluate(recreated_var1))
-
-  # TODO(allenl): Saver class that doesn't pollute the graph with constants.
-  @unittest.skip("todo")
-  def testManySavesGraph(self):
-    """Saves after the first should not modify the graph."""
-    with context.graph_mode():
-      graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
-        checkpoint_directory = self.get_temp_dir()
-        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = checkpointable.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = CheckpointableAdam(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(variables.global_variables_initializer())
-        checkpointable_utils.save(
-            checkpoint_prefix, root_checkpointable=obj)
-        before_ops = graph.get_operations()
-        checkpointable_utils.save(
-            checkpoint_prefix, root_checkpointable=obj)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  @unittest.skip("todo")
-  def testManyRestoresGraph(self):
-    """Restores after the first should not modify the graph."""
-    with context.graph_mode():
-      graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
-        checkpoint_directory = self.get_temp_dir()
-        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = checkpointable.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = CheckpointableAdam(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(variables.global_variables_initializer())
-        save_path = checkpointable_utils.save(
-            checkpoint_prefix, root_checkpointable=obj)
-        checkpointable_utils.restore(
-            save_path, root_checkpointable=obj)
-        before_ops = graph.get_operations()
-        checkpointable_utils.restore(
-            save_path, root_checkpointable=obj)
-        self.assertEqual(before_ops, graph.get_operations())
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index d177bfeab2d1fdc05d7ced54df8723fae2c77fdb..0783d1b5d70e502e6edd80b59f37fdd93b413e12 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -27,11 +27,12 @@ from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training.saver import BaseSaverBuilder
 
 _uid_counter = 0
 _uid_lock = threading.Lock()
@@ -45,8 +46,13 @@ def _generate_shared_name(prefix):
   return "{}{}".format(prefix, uid)
 
 
-class Iterator(object):
-  """An iterator producing tf.Tensor objects from a tf.data.Dataset."""
+class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
+  """An iterator producing tf.Tensor objects from a tf.data.Dataset.
+
+  NOTE: Unlike the iterator created by the
+  @{tf.data.Dataset.make_one_shot_iterator} method, this class enables
+  additional experimental functionality, such as prefetching to the GPU.
+  """
 
   def __init__(self, dataset):
     """Creates a new iterator over the given dataset.
@@ -65,39 +71,21 @@ class Iterator(object):
       dataset: A `tf.data.Dataset` object.
 
     Raises:
+      TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
+    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
+      raise TypeError(
+          "`tf.contrib.data.prefetch_to_device()` is not compatible with "
+          "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
+          "over the dataset instead.")
 
-    if not context.in_eager_mode():
-      raise RuntimeError(
-          "{} objects can only be used when eager execution is enabled, use "
-          "tf.data.Dataset.make_iterator or "
-          "tf.data.Dataset.make_one_shot_iterator for graph construction".
-          format(type(self)))
-    with ops.device("/device:CPU:0"):
-      ds_variant = dataset._as_variant_tensor()  # pylint: disable=protected-access
-      self._output_classes = dataset.output_classes
-      self._output_types = dataset.output_types
-      self._output_shapes = dataset.output_shapes
-      self._flat_output_types = nest.flatten(
-          sparse.as_dense_types(self._output_types, self._output_classes))
-      self._flat_output_shapes = nest.flatten(
-          sparse.as_dense_shapes(self._output_shapes, self._output_classes))
-      self._resource = gen_dataset_ops.iterator(
-          shared_name="",
-          container=_generate_shared_name("eageriterator"),
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-      gen_dataset_ops.make_iterator(ds_variant, self._resource)
-      # Delete the resource when this object is deleted
-      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._resource, handle_device="/device:CPU:0")
-    self._device = context.context().device_name
-    self._buffer_resource_handle = None
+    super(Iterator, self).__init__(dataset)
     if not context.context().device_spec.device_type:
       is_remote_device = False
     else:
       is_remote_device = context.context().device_spec.device_type != "CPU"
+    self._buffer_resource_handle = None
     if is_remote_device:
       with ops.device("/device:CPU:0"):
         iter_string_handle = gen_dataset_ops.iterator_to_string_handle(
@@ -106,7 +94,7 @@ class Iterator(object):
         @function.Defun(dtypes.string)
         def remote_fn(h):
           remote_iterator = iterator_ops.Iterator.from_string_handle(
-              h, self._output_types, self._output_shapes)
+              h, self.output_types, self.output_shapes, self.output_classes)
           return remote_iterator.get_next()
 
         remote_fn.add_to_graph(None)
@@ -117,96 +105,53 @@ class Iterator(object):
             f=remote_fn,
             target_device=target,
             buffer_size=10,
-            thread_pool_size=1,
             container="",
             shared_name=_generate_shared_name("function_buffer_resource"))
         self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
             handle=self._buffer_resource_handle,
             handle_device=self._device)
 
-  def __iter__(self):
-    return self
-
-  def __next__(self):  # For Python 3 compatibility
-    return self.next()
-
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    with ops.device(self._device):
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
       if self._buffer_resource_handle is not None:
-        ret = prefetching_ops.function_buffering_resource_get_next(
-            function_buffer_resource=self._buffer_resource_handle,
-            output_types=self._flat_output_types)
+        with ops.device(self._device):
+          ret = prefetching_ops.function_buffering_resource_get_next(
+              function_buffer_resource=self._buffer_resource_handle,
+              output_types=self._flat_output_types)
+        return sparse.deserialize_sparse_tensors(
+            nest.pack_sequence_as(self._output_types, ret), self._output_types,
+            self._output_shapes, self._output_classes)
       else:
-        # TODO(ashankar): Consider removing this ops.device() contextmanager
-        # and instead mimic ops placement in graphs: Operations on resource
-        # handles execute on the same device as where the resource is placed.
-        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
-        # because in eager mode this code will run synchronously on the calling
-        # thread. Therefore we do not need to make a defensive context switch
-        # to a background thread, and can achieve a small constant performance
-        # boost by invoking the iterator synchronously.
-        ret = gen_dataset_ops.iterator_get_next_sync(
-            self._resource,
-            output_types=self._flat_output_types,
-            output_shapes=self._flat_output_shapes)
-
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self._output_types, ret), self._output_types,
-        self._output_shapes, self._output_classes)
-
-  def next(self):
-    """Returns a nested structure of `tf.Tensor`s containing the next element.
-    """
-    try:
-      return self._next_internal()
-    except errors.OutOfRangeError:
-      raise StopIteration
-
-  @property
-  def output_classes(self):
-    """Returns the class of each component of an element of this iterator.
-
-    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+        return super(Iterator, self)._next_internal()
 
-    Returns:
-      A nested structure of Python `type` objects corresponding to each
-      component of an element of this dataset.
-    """
-    return self._output_classes
+  # TODO(shivaniagrawal): Expose checkpointable stateful objects from dataset
+  # attributes(potential).
 
-  @property
-  def output_shapes(self):
-    """Returns the shape of each component of an element of this iterator.
+  class _Saveable(BaseSaverBuilder.SaveableObject):
+    """SaveableObject for saving/restoring iterator state."""
 
-    Returns:
-      A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this dataset.
-    """
-    return self._output_shapes
+    def __init__(self, iterator_resource, name):
+      serialized_iterator = gen_dataset_ops.serialize_iterator(
+          iterator_resource)
+      specs = [
+          BaseSaverBuilder.SaveSpec(serialized_iterator, "", name + "_STATE")
+      ]
+      # pylint: disable=protected-access
+      super(Iterator._Saveable, self).__init__(iterator_resource, specs, name)
 
-  @property
-  def output_types(self):
-    """Returns the type of each component of an element of this iterator.
+    def restore(self, restored_tensors, restored_shapes):
+      with ops.colocate_with(self.op):
+        return gen_dataset_ops.deserialize_iterator(self.op,
+                                                    restored_tensors[0])
 
-    Returns:
-      A nested structure of `tf.DType` objects corresponding to each component
-      of an element of this dataset.
-    """
-    return self._output_types
+  def _gather_saveables_for_checkpoint(self):
 
-  def get_next(self, name=None):
-    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    def _saveable_factory(name):
+      return self._Saveable(self._resource, name)
 
-    Args:
-      name: (Optional.) A name for the created operation. Currently unused.
-
-    Returns:
-      A nested structure of `tf.Tensor` objects.
-
-    Raises:
-      `tf.errors.OutOfRangeError`: If the end of the dataset has been reached.
-    """
-    del name
-    return self._next_internal()
+    return {"ITERATOR": _saveable_factory}
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index a1611e92b113839c2dd2a3b2560b0ba90c0a7ef0..7b123707cc3a26073088cf2c57c6211e831c19fd 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -16,11 +16,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
+import threading
 import time
 
 import numpy as np
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.contrib.data.python.ops import threadpool
+from tensorflow.contrib.data.python.ops import unique
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
 from tensorflow.python.eager import test
@@ -31,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.training import checkpointable_utils
 
 
 class IteratorTest(test.TestCase):
@@ -41,6 +48,18 @@ class IteratorTest(test.TestCase):
       got.append(t.numpy())
     self.assertAllEqual([0, 1, 2, 3], got)
 
+  def testBasicOneShotIterator(self):
+    got = []
+    for t in Dataset.range(4).make_one_shot_iterator():
+      got.append(t.numpy())
+    self.assertAllEqual([0, 1, 2, 3], got)
+
+  def testBasicImplicitIterator(self):
+    got = []
+    for t in Dataset.range(4):
+      got.append(t.numpy())
+    self.assertAllEqual([0, 1, 2, 3], got)
+
   def testGetNext(self):
     iterator = datasets.Iterator(Dataset.range(4))
     self.assertEqual(0, iterator.get_next().numpy())
@@ -50,6 +69,15 @@ class IteratorTest(test.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       iterator.get_next()
 
+  def testGetNextOneShotIterator(self):
+    iterator = Dataset.range(4).make_one_shot_iterator()
+    self.assertEqual(0, iterator.get_next().numpy())
+    self.assertEqual(1, iterator.get_next().numpy())
+    self.assertEqual(2, iterator.get_next().numpy())
+    self.assertEqual(3, iterator.get_next().numpy())
+    with self.assertRaises(errors.OutOfRangeError):
+      iterator.get_next()
+
   def testMultipleIteratorsOnTheSameDataset(self):
     ds = Dataset.range(4)
     it1 = datasets.Iterator(ds)
@@ -165,6 +193,105 @@ class IteratorTest(test.TestCase):
       x = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], x.numpy())
 
+  def testTensorsExplicitPrefetchToDevice(self):
+    ds = Dataset.from_tensor_slices([0., 1.])
+    ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
+
+    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
+      datasets.Iterator(ds)
+
+    for i, x in enumerate(ds):
+      with ops.device(test.gpu_device_name()):
+        x = math_ops.add(x, x)
+        self.assertEqual(float(i) + float(i), x.numpy())
+
+  def testOverrideThreadPool(self):
+
+    def get_thread_id(_):
+      # Python creates a dummy thread object to represent the current
+      # thread when called from an "alien" thread (such as a
+      # `PrivateThreadPool` thread in this case). It does not include
+      # the TensorFlow-given display name, but it has a unique
+      # identifier that maps one-to-one with the underlying OS thread.
+      return np.array(threading.current_thread().ident).astype(np.int64)
+
+    for num_threads in [1, 2, 4, 8, 16]:
+
+      dataset = (
+          Dataset.range(1000).map(
+              lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
+              num_parallel_calls=32).apply(unique.unique()))
+
+      dataset = threadpool.override_threadpool(
+          dataset,
+          threadpool.PrivateThreadPool(
+              num_threads, display_name='private_thread_pool_%d' % num_threads))
+
+      thread_ids = []
+      for next_element in datasets.Iterator(dataset):
+        thread_ids.append(next_element)
+      self.assertEqual(len(thread_ids), len(set(thread_ids)))
+      self.assertGreater(len(thread_ids), 0)
+      # NOTE(mrry): We don't control the thread pool scheduling, and
+      # so cannot guarantee that all of the threads in the pool will
+      # perform work.
+      self.assertLessEqual(len(thread_ids), num_threads)
+
+  def testSaveRestore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    dataset = dataset.map(math_ops.square).batch(2)
+    iterator = datasets.Iterator(dataset)
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertAllEqual([1, 4], iterator.get_next().numpy())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([9, 16], iterator.get_next().numpy())
+    self.assertAllEqual([25, 36], iterator.get_next().numpy())
+    checkpoint.restore(save_path)
+    self.assertAllEqual([9, 16], iterator.get_next().numpy())
+    self.assertAllEqual([25, 36], iterator.get_next().numpy())
+
+  def testSaveRestoreMultipleIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    dataset = dataset.map(math_ops.square).batch(2)
+    iterator_1 = datasets.Iterator(dataset)
+    iterator_2 = datasets.Iterator(dataset)
+    dataset_2 = Dataset.range(10)
+    iterator_3 = datasets.Iterator(dataset_2)
+
+    checkpoint = checkpointable_utils.Checkpoint(
+        iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
+    self.assertAllEqual([1, 4], iterator_1.get_next().numpy())
+    self.assertEqual(0, iterator_3.get_next().numpy())
+    self.assertEqual(1, iterator_3.get_next().numpy())
+    self.assertEqual(2, iterator_3.get_next().numpy())
+
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([1, 4], iterator_2.get_next().numpy())
+    self.assertAllEqual([9, 16], iterator_2.get_next().numpy())
+    self.assertEqual(3, iterator_3.get_next().numpy())
+    checkpoint.restore(save_path)
+    self.assertAllEqual([9, 16], iterator_1.get_next().numpy())
+    self.assertAllEqual([1, 4], iterator_2.get_next().numpy())
+    self.assertEqual(3, iterator_3.get_next().numpy())
+
+  def testRestoreExhaustedIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    dataset = Dataset.range(3)
+    iterator = datasets.Iterator(dataset)
+
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertEqual(0, iterator.get_next().numpy())
+    self.assertEqual(1, iterator.get_next().numpy())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertEqual(2, iterator.get_next().numpy())
+    checkpoint.restore(save_path)
+    self.assertEqual(2, iterator.get_next().numpy())
+
 
 class DatasetConstructorBenchmark(test.Benchmark):
 
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 68e7b5421fec7f73f10e381ca45f9d900de299d7..7949a3f6da293abdd85512209242bae76ab4d816 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,12 +22,12 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 
 
 class Evaluator(object):
@@ -57,7 +57,7 @@ class Evaluator(object):
     self._model = model
     self._metrics = {}
     self._evaluators = {}
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.call = function.defun(self.call)
 
   # ---- API for users ----
@@ -90,7 +90,7 @@ class Evaluator(object):
     Only for graph execution.
     @end_compatibility
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("Evaluator.init_variables() not needed when "
                          "eager execution is enabled.")
     return control_flow_ops.group([m.init_variables() for _, m in self.metrics])
@@ -113,7 +113,8 @@ class Evaluator(object):
         with summary_ops.create_file_writer(
             summary_logdir).as_default(), summary_ops.always_record_summaries():
           return self._all_metric_results()
-      if context.in_eager_mode():
+
+      if context.executing_eagerly():
         return f()
       else:
         return function.defun(f)()
@@ -158,16 +159,16 @@ class Evaluator(object):
       @end_compatibility
     """
     summary_logdir = kwargs.pop("summary_logdir", None)
-    if context.in_graph_mode():
-      call_op = self.__call__(dataset.make_one_shot_iterator().get_next(),
-                              *args, **kwargs)
-      init_op = self.init_variables()
-      results_op = self.all_metric_results(summary_logdir)
-      return (init_op, call_op, results_op)
-    # Eager case
-    for example in datasets.Iterator(dataset):
-      self.__call__(example, *args, **kwargs)
-    return self.all_metric_results(summary_logdir)
+    if context.executing_eagerly():
+      for example in datasets.Iterator(dataset):
+        self.__call__(example, *args, **kwargs)
+      return self.all_metric_results(summary_logdir)
+    # Graph construction
+    call_op = self.__call__(dataset.make_one_shot_iterator().get_next(), *args,
+                            **kwargs)
+    init_op = self.init_variables()
+    results_op = self.all_metric_results(summary_logdir)
+    return (init_op, call_op, results_op)
 
   @staticmethod
   def run_evaluation(init_op, call_op, results_op, sess=None):
@@ -192,7 +193,7 @@ class Evaluator(object):
     Only for graph execution.
     @end_compatibility
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("Evaluator.run_evaluation() not supported when "
                          "eager execution is enabled.")
     sess = sess or ops.get_default_session()
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index 15a21885f66eface291a39fa0ee1ff28bc297548..c1fd9e0ed020beeb722204edf1adfe1dfcf8ff03 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -8,7 +8,6 @@ py_library(
     deps = [
         "//tensorflow/contrib/eager/python/examples/gan:mnist",
         "//tensorflow/contrib/eager/python/examples/linear_regression",
-        "//tensorflow/contrib/eager/python/examples/mnist",
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
         "//tensorflow/contrib/eager/python/examples/rnn_ptb",
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist.py b/tensorflow/contrib/eager/python/examples/gan/mnist.py
index b9ac79f46c83bb709918e3b72830b90ddcfd71b4..b80c90902353709b7f739585291ec3b5890c27c7 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist.py
@@ -32,10 +32,11 @@ import tensorflow as tf
 import tensorflow.contrib.eager as tfe
 from tensorflow.examples.tutorials.mnist import input_data
 
+layers = tf.keras.layers
 FLAGS = None
 
 
-class Discriminator(tfe.Network):
+class Discriminator(tf.keras.Model):
   """GAN Discriminator.
 
   A network to differentiate between generated and real handwritten digits.
@@ -56,19 +57,15 @@ class Discriminator(tfe.Network):
     else:
       assert data_format == 'channels_last'
       self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = self.track_layer(tf.layers.Conv2D(64, 5, padding='SAME',
-                                                   data_format=data_format,
-                                                   activation=tf.tanh))
-    self.pool1 = self.track_layer(
-        tf.layers.AveragePooling2D(2, 2, data_format=data_format))
-    self.conv2 = self.track_layer(tf.layers.Conv2D(128, 5,
-                                                   data_format=data_format,
-                                                   activation=tf.tanh))
-    self.pool2 = self.track_layer(
-        tf.layers.AveragePooling2D(2, 2, data_format=data_format))
-    self.flatten = self.track_layer(tf.layers.Flatten())
-    self.fc1 = self.track_layer(tf.layers.Dense(1024, activation=tf.tanh))
-    self.fc2 = self.track_layer(tf.layers.Dense(1, activation=None))
+    self.conv1 = layers.Conv2D(
+        64, 5, padding='SAME', data_format=data_format, activation=tf.tanh)
+    self.pool1 = layers.AveragePooling2D(2, 2, data_format=data_format)
+    self.conv2 = layers.Conv2D(
+        128, 5, data_format=data_format, activation=tf.tanh)
+    self.pool2 = layers.AveragePooling2D(2, 2, data_format=data_format)
+    self.flatten = layers.Flatten()
+    self.fc1 = layers.Dense(1024, activation=tf.tanh)
+    self.fc2 = layers.Dense(1, activation=None)
 
   def call(self, inputs):
     """Return two logits per image estimating input authenticity.
@@ -95,7 +92,7 @@ class Discriminator(tfe.Network):
     return x
 
 
-class Generator(tfe.Network):
+class Generator(tf.keras.Model):
   """Generator of handwritten digits similar to the ones in the MNIST dataset.
   """
 
@@ -116,18 +113,17 @@ class Generator(tfe.Network):
     else:
       assert data_format == 'channels_last'
       self._pre_conv_shape = [-1, 6, 6, 128]
-    self.fc1 = self.track_layer(tf.layers.Dense(6 * 6 * 128,
-                                                activation=tf.tanh))
+    self.fc1 = layers.Dense(6 * 6 * 128, activation=tf.tanh)
 
     # In call(), we reshape the output of fc1 to _pre_conv_shape
 
     # Deconvolution layer. Resulting image shape: (batch, 14, 14, 64)
-    self.conv1 = self.track_layer(tf.layers.Conv2DTranspose(
-        64, 4, strides=2, activation=None, data_format=data_format))
+    self.conv1 = layers.Conv2DTranspose(
+        64, 4, strides=2, activation=None, data_format=data_format)
 
     # Deconvolution layer. Resulting image shape: (batch, 28, 28, 1)
-    self.conv2 = self.track_layer(tf.layers.Conv2DTranspose(
-        1, 2, strides=2, activation=tf.nn.sigmoid, data_format=data_format))
+    self.conv2 = layers.Conv2DTranspose(
+        1, 2, strides=2, activation=tf.nn.sigmoid, data_format=data_format)
 
   def call(self, inputs):
     """Return a batch of generated images.
@@ -168,7 +164,8 @@ def discriminator_loss(discriminator_real_outputs, discriminator_gen_outputs):
   """
 
   loss_on_real = tf.losses.sigmoid_cross_entropy(
-      tf.ones_like(discriminator_real_outputs), discriminator_real_outputs,
+      tf.ones_like(discriminator_real_outputs),
+      discriminator_real_outputs,
       label_smoothing=0.25)
   loss_on_generated = tf.losses.sigmoid_cross_entropy(
       tf.zeros_like(discriminator_gen_outputs), discriminator_gen_outputs)
@@ -198,9 +195,9 @@ def generator_loss(discriminator_gen_outputs):
   return loss
 
 
-def train_one_epoch(generator, discriminator,
-                    generator_optimizer, discriminator_optimizer,
-                    dataset, log_interval, noise_dim):
+def train_one_epoch(generator, discriminator, generator_optimizer,
+                    discriminator_optimizer, dataset, step_counter,
+                    log_interval, noise_dim):
   """Trains `generator` and `discriminator` models on `dataset`.
 
   Args:
@@ -209,7 +206,8 @@ def train_one_epoch(generator, discriminator,
     generator_optimizer: Optimizer to use for generator.
     discriminator_optimizer: Optimizer to use for discriminator.
     dataset: Dataset of images to train on.
-    log_interval: How many global steps to wait between logging and collecting
+    step_counter: An integer variable, used to write summaries regularly.
+    log_interval: How many steps to wait between logging and collecting
       summaries.
     noise_dim: Dimension of noise vector to use.
   """
@@ -218,18 +216,23 @@ def train_one_epoch(generator, discriminator,
   total_discriminator_loss = 0.0
   for (batch_index, images) in enumerate(tfe.Iterator(dataset)):
     with tf.device('/cpu:0'):
-      tf.assign_add(tf.train.get_global_step(), 1)
+      tf.assign_add(step_counter, 1)
 
-    with tf.contrib.summary.record_summaries_every_n_global_steps(log_interval):
+    with tf.contrib.summary.record_summaries_every_n_global_steps(
+        log_interval, global_step=step_counter):
       current_batch_size = images.shape[0]
-      noise = tf.random_uniform(shape=[current_batch_size, noise_dim],
-                                minval=-1., maxval=1., seed=batch_index)
+      noise = tf.random_uniform(
+          shape=[current_batch_size, noise_dim],
+          minval=-1.,
+          maxval=1.,
+          seed=batch_index)
 
       with tfe.GradientTape(persistent=True) as g:
         generated_images = generator(noise)
-        tf.contrib.summary.image('generated_images',
-                                 tf.reshape(generated_images, [-1, 28, 28, 1]),
-                                 max_images=10)
+        tf.contrib.summary.image(
+            'generated_images',
+            tf.reshape(generated_images, [-1, 28, 28, 1]),
+            max_images=10)
 
         discriminator_gen_outputs = discriminator(generated_images)
         discriminator_real_outputs = discriminator(images)
@@ -244,18 +247,16 @@ def train_one_epoch(generator, discriminator,
       discriminator_grad = g.gradient(discriminator_loss_val,
                                       discriminator.variables)
 
-      with tf.variable_scope('generator'):
-        generator_optimizer.apply_gradients(zip(generator_grad,
-                                                generator.variables))
-      with tf.variable_scope('discriminator'):
-        discriminator_optimizer.apply_gradients(zip(discriminator_grad,
-                                                    discriminator.variables))
+      generator_optimizer.apply_gradients(
+          zip(generator_grad, generator.variables))
+      discriminator_optimizer.apply_gradients(
+          zip(discriminator_grad, discriminator.variables))
 
       if log_interval and batch_index > 0 and batch_index % log_interval == 0:
         print('Batch #%d\tAverage Generator Loss: %.6f\t'
-              'Average Discriminator Loss: %.6f' % (
-                  batch_index, total_generator_loss/batch_index,
-                  total_discriminator_loss/batch_index))
+              'Average Discriminator Loss: %.6f' %
+              (batch_index, total_generator_loss / batch_index,
+               total_discriminator_loss / batch_index))
 
 
 def main(_):
@@ -266,18 +267,18 @@ def main(_):
 
   # Load the datasets
   data = input_data.read_data_sets(FLAGS.data_dir)
-  dataset = (tf.data.Dataset
-             .from_tensor_slices(data.train.images)
-             .shuffle(60000)
-             .batch(FLAGS.batch_size))
-
-  # Create the models and optimizers
-  generator = Generator(data_format)
-  discriminator = Discriminator(data_format)
-  with tf.variable_scope('generator'):
-    generator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
-  with tf.variable_scope('discriminator'):
-    discriminator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
+  dataset = (
+      tf.data.Dataset.from_tensor_slices(data.train.images).shuffle(60000)
+      .batch(FLAGS.batch_size))
+
+  # Create the models and optimizers.
+  model_objects = {
+      'generator': Generator(data_format),
+      'discriminator': Discriminator(data_format),
+      'generator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
+      'discriminator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
+      'step_counter': tf.train.get_or_create_global_step(),
+  }
 
   # Prepare summary writer and checkpoint info
   summary_writer = tf.contrib.summary.create_summary_file_writer(
@@ -286,28 +287,22 @@ def main(_):
   latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
   if latest_cpkt:
     print('Using latest checkpoint at ' + latest_cpkt)
+  checkpoint = tfe.Checkpoint(**model_objects)
+  # Restore variables on creation if a checkpoint exists.
+  checkpoint.restore(latest_cpkt)
 
   with tf.device(device):
-    for epoch in range(1, 101):
-      with tfe.restore_variables_on_create(latest_cpkt):
-        global_step = tf.train.get_or_create_global_step()
-        start = time.time()
-        with summary_writer.as_default():
-          train_one_epoch(generator, discriminator, generator_optimizer,
-                          discriminator_optimizer,
-                          dataset, FLAGS.log_interval, FLAGS.noise)
-        end = time.time()
-        print('\nTrain time for epoch #%d (global step %d): %f' % (
-            epoch, global_step.numpy(), end - start))
-
-      all_variables = (
-          generator.variables
-          + discriminator.variables
-          + generator_optimizer.variables()
-          + discriminator_optimizer.variables()
-          + [global_step])
-      tfe.Saver(all_variables).save(
-          checkpoint_prefix, global_step=global_step)
+    for _ in range(100):
+      start = time.time()
+      with summary_writer.as_default():
+        train_one_epoch(dataset=dataset, log_interval=FLAGS.log_interval,
+                        noise_dim=FLAGS.noise, **model_objects)
+      end = time.time()
+      checkpoint.save(checkpoint_prefix)
+      print('\nTrain time for epoch #%d (step %d): %f' %
+            (checkpoint.save_counter.numpy(),
+             checkpoint.step_counter.numpy(),
+             end - start))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
index 4a3ca8d82bc2619b05a734f6d2e58431c1a45995..bd35e50c1f434d167c5a8c5aa7d224912523ce28 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
@@ -62,7 +62,7 @@ class MnistEagerGanBenchmark(tf.test.Benchmark):
                         for _ in range(measure_batches)]
       measure_dataset = tf.data.Dataset.from_tensor_slices(measure_images)
 
-      tf.train.get_or_create_global_step()
+      step_counter = tf.train.get_or_create_global_step()
       with tf.device(device()):
         # Create the models and optimizers
         generator = mnist.Generator(data_format())
@@ -78,13 +78,15 @@ class MnistEagerGanBenchmark(tf.test.Benchmark):
           # warm up
           mnist.train_one_epoch(generator, discriminator, generator_optimizer,
                                 discriminator_optimizer,
-                                burn_dataset, log_interval=SUMMARY_INTERVAL,
+                                burn_dataset, step_counter,
+                                log_interval=SUMMARY_INTERVAL,
                                 noise_dim=NOISE_DIM)
           # measure
           start = time.time()
           mnist.train_one_epoch(generator, discriminator, generator_optimizer,
                                 discriminator_optimizer,
-                                measure_dataset, log_interval=SUMMARY_INTERVAL,
+                                measure_dataset, step_counter,
+                                log_interval=SUMMARY_INTERVAL,
                                 noise_dim=NOISE_DIM)
           self._report('train', start, measure_batches, batch_size)
 
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index f86331af6f7928f0f86c888e22706c6e0a5978b2..2f6cfdf31e852d5d69a7a87980c9a441da504cf2 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -22,6 +22,7 @@ cuda_py_test(
         ":linear_regression",
         "//tensorflow:tensorflow_py",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 6ce4de6ee0bf50400eff339ac04e132252a2b53e..4e1380afb2e6e722de65c691d4fbf44621072e87 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -32,24 +32,16 @@ import tensorflow as tf
 
 import tensorflow.contrib.eager as tfe
 
+layers = tf.keras.layers
 
-class LinearModel(tfe.Network):
-  """A TensorFlow linear regression model.
 
-  Uses TensorFlow's eager execution.
-
-  For those familiar with TensorFlow graphs, notice the absence of
-  `tf.Session`. The `forward()` method here immediately executes and
-  returns output values. The `loss()` method immediately compares the
-  output of `forward()` with the target and returns the MSE loss value.
-  The `fit()` performs gradient-descent training on the model's weights
-  and bias.
-  """
+class LinearModel(tf.keras.Model):
+  """A TensorFlow linear regression model."""
 
   def __init__(self):
     """Constructs a LinearModel object."""
     super(LinearModel, self).__init__()
-    self._hidden_layer = self.track_layer(tf.layers.Dense(1))
+    self._hidden_layer = layers.Dense(1)
 
   def call(self, xs):
     """Invoke the linear model.
@@ -64,7 +56,7 @@ class LinearModel(tfe.Network):
 
 
 def mean_square_loss(model, xs, ys):
-  return tf.reduce_mean(tf.square(model(xs) - ys))
+  return tf.reduce_mean(tf.square(tf.subtract(model(xs), ys)))
 
 
 def fit(model, dataset, optimizer, verbose=False, logdir=None):
diff --git a/tensorflow/contrib/eager/python/examples/mnist/BUILD b/tensorflow/contrib/eager/python/examples/mnist/BUILD
deleted file mode 100644
index c61ec2dbae60a782c0e6589701554b045dcb92ae..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/examples/mnist/BUILD
+++ /dev/null
@@ -1,36 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-
-py_binary(
-    name = "mnist",
-    srcs = ["mnist.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/eager/python:tfe",
-        "//tensorflow/examples/tutorials/mnist:input_data",
-    ],
-)
-
-cuda_py_test(
-    name = "mnist_test",
-    srcs = ["mnist_test.py"],
-    additional_deps = [
-        ":mnist",
-        "//tensorflow/contrib/eager/python:tfe",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-cuda_py_test(
-    name = "mnist_graph_test",
-    srcs = ["mnist_graph_test.py"],
-    additional_deps = [
-        ":mnist",
-        "//third_party/py/numpy",
-        "//tensorflow:tensorflow_py",
-    ],
-)
diff --git a/tensorflow/contrib/eager/python/examples/mnist/README.md b/tensorflow/contrib/eager/python/examples/mnist/README.md
index e987996b88ccf54a322749aadec4f9840760a90f..d1c079ff6b5cb187bbcfe2742293982b1bedd2d4 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/README.md
+++ b/tensorflow/contrib/eager/python/examples/mnist/README.md
@@ -1,10 +1 @@
-Classification model for the MNIST dataset using eager execution.
-
-To run:
-
-```
-python mnist.py
-```
-
-`mnist_graph_test.py` demonstrates that the same code that is executed eagerly
-in `mnist.py` is used to construct a TensorFlow graph.
+See https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
deleted file mode 100644
index 58b1e89d15895cf38331e6f7bd5a311a2f5f6467..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""A deep MNIST classifier using convolutional layers.
-
-Sample usage:
-  python mnist.py --help
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-import time
-
-import tensorflow as tf
-
-import tensorflow.contrib.eager as tfe
-from tensorflow.examples.tutorials.mnist import input_data
-
-FLAGS = None
-
-
-class MNISTModel(tf.keras.Model):
-  """MNIST Network.
-
-  Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/mnist/mnist_deep.py
-  and
-  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
-
-  But written using the tf.layers API.
-  """
-
-  def __init__(self, data_format):
-    """Creates a model for classifying a hand-written digit.
-
-    Args:
-      data_format: Either 'channels_first' or 'channels_last'.
-        'channels_first' is typically faster on GPUs while 'channels_last' is
-        typically faster on CPUs. See
-        https://www.tensorflow.org/performance/performance_guide#data_formats
-    """
-    super(MNISTModel, self).__init__(name='')
-    if data_format == 'channels_first':
-      self._input_shape = [-1, 1, 28, 28]
-    else:
-      assert data_format == 'channels_last'
-      self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = tf.layers.Conv2D(
-        32, 5, data_format=data_format, activation=tf.nn.relu)
-    self.conv2 = tf.layers.Conv2D(
-        64, 5, data_format=data_format, activation=tf.nn.relu)
-    self.fc1 = tf.layers.Dense(1024, activation=tf.nn.relu)
-    self.fc2 = tf.layers.Dense(10)
-    self.dropout = tf.layers.Dropout(0.5)
-    self.max_pool2d = tf.layers.MaxPooling2D(
-        (2, 2), (2, 2), padding='SAME', data_format=data_format)
-
-  def call(self, inputs, training=False):
-    """Computes labels from inputs.
-
-    Users should invoke __call__ to run the network, which delegates to this
-    method (and not call this method directly).
-
-    Args:
-      inputs: A batch of images as a Tensor with shape [batch_size, 784].
-      training: True if invoked in the context of training (causing dropout to
-        be applied).  False otherwise.
-
-    Returns:
-      A Tensor with shape [batch_size, 10] containing the predicted logits
-      for each image in the batch, for each of the 10 classes.
-    """
-
-    x = tf.reshape(inputs, self._input_shape)
-    x = self.conv1(x)
-    x = self.max_pool2d(x)
-    x = self.conv2(x)
-    x = self.max_pool2d(x)
-    x = tf.layers.flatten(x)
-    x = self.fc1(x)
-    x = self.dropout(x, training=training)
-    x = self.fc2(x)
-    return x
-
-
-def loss(predictions, labels):
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(
-          logits=predictions, labels=labels))
-
-
-def compute_accuracy(predictions, labels):
-  return tf.reduce_sum(
-      tf.cast(
-          tf.equal(
-              tf.argmax(predictions, axis=1,
-                        output_type=tf.int64),
-              tf.argmax(labels, axis=1,
-                        output_type=tf.int64)),
-          dtype=tf.float32)) / float(predictions.shape[0].value)
-
-
-def train_one_epoch(model, optimizer, dataset, log_interval=None):
-  """Trains model on `dataset` using `optimizer`."""
-
-  tf.train.get_or_create_global_step()
-
-  for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
-    with tf.contrib.summary.record_summaries_every_n_global_steps(10):
-      with tfe.GradientTape() as tape:
-        prediction = model(images, training=True)
-        loss_value = loss(prediction, labels)
-        tf.contrib.summary.scalar('loss', loss_value)
-        tf.contrib.summary.scalar('accuracy',
-                                  compute_accuracy(prediction, labels))
-      grads = tape.gradient(loss_value, model.variables)
-      optimizer.apply_gradients(zip(grads, model.variables))
-      if log_interval and batch % log_interval == 0:
-        print('Batch #%d\tLoss: %.6f' % (batch, loss_value))
-
-
-def test(model, dataset):
-  """Perform an evaluation of `model` on the examples from `dataset`."""
-  avg_loss = tfe.metrics.Mean('loss')
-  accuracy = tfe.metrics.Accuracy('accuracy')
-
-  for (images, labels) in tfe.Iterator(dataset):
-    predictions = model(images, training=False)
-    avg_loss(loss(predictions, labels))
-    accuracy(tf.argmax(predictions, axis=1, output_type=tf.int64),
-             tf.argmax(labels, axis=1, output_type=tf.int64))
-  print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' %
-        (avg_loss.result(), 100 * accuracy.result()))
-  with tf.contrib.summary.always_record_summaries():
-    tf.contrib.summary.scalar('loss', avg_loss.result())
-    tf.contrib.summary.scalar('accuracy', accuracy.result())
-
-
-def load_data(data_dir):
-  """Returns training and test tf.data.Dataset objects."""
-  data = input_data.read_data_sets(data_dir, one_hot=True)
-  train_ds = tf.data.Dataset.from_tensor_slices((data.train.images,
-                                                 data.train.labels))
-  test_ds = tf.data.Dataset.from_tensors((data.test.images, data.test.labels))
-  return (train_ds, test_ds)
-
-
-def main(_):
-  tfe.enable_eager_execution()
-
-  (device, data_format) = ('/gpu:0', 'channels_first')
-  if FLAGS.no_gpu or tfe.num_gpus() <= 0:
-    (device, data_format) = ('/cpu:0', 'channels_last')
-  print('Using device %s, and data format %s.' % (device, data_format))
-
-  # Load the datasets
-  (train_ds, test_ds) = load_data(FLAGS.data_dir)
-  train_ds = train_ds.shuffle(60000).batch(FLAGS.batch_size)
-
-  # Create the model and optimizer
-  model = MNISTModel(data_format)
-  optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)
-
-  if FLAGS.output_dir:
-    train_dir = os.path.join(FLAGS.output_dir, 'train')
-    test_dir = os.path.join(FLAGS.output_dir, 'eval')
-    tf.gfile.MakeDirs(FLAGS.output_dir)
-  else:
-    train_dir = None
-    test_dir = None
-  summary_writer = tf.contrib.summary.create_file_writer(
-      train_dir, flush_millis=10000)
-  test_summary_writer = tf.contrib.summary.create_file_writer(
-      test_dir, flush_millis=10000, name='test')
-  checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
-
-  with tf.device(device):
-    for epoch in range(1, 11):
-      with tfe.restore_variables_on_create(
-          tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
-        global_step = tf.train.get_or_create_global_step()
-        start = time.time()
-        with summary_writer.as_default():
-          train_one_epoch(model, optimizer, train_ds, FLAGS.log_interval)
-        end = time.time()
-        print('\nTrain time for epoch #%d (global step %d): %f' % (
-            epoch, global_step.numpy(), end - start))
-      with test_summary_writer.as_default():
-        test(model, test_ds)
-      all_variables = (
-          model.variables
-          + optimizer.variables()
-          + [global_step])
-      tfe.Saver(all_variables).save(
-          checkpoint_prefix, global_step=global_step)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--data-dir',
-      type=str,
-      default='/tmp/tensorflow/mnist/input_data',
-      help='Directory for storing input data')
-  parser.add_argument(
-      '--batch-size',
-      type=int,
-      default=64,
-      metavar='N',
-      help='input batch size for training (default: 64)')
-  parser.add_argument(
-      '--log-interval',
-      type=int,
-      default=10,
-      metavar='N',
-      help='how many batches to wait before logging training status')
-  parser.add_argument(
-      '--output_dir',
-      type=str,
-      default=None,
-      metavar='N',
-      help='Directory to write TensorBoard summaries')
-  parser.add_argument(
-      '--checkpoint_dir',
-      type=str,
-      default='/tmp/tensorflow/mnist/checkpoints/',
-      metavar='N',
-      help='Directory to save checkpoints in (once per epoch)')
-  parser.add_argument(
-      '--lr',
-      type=float,
-      default=0.01,
-      metavar='LR',
-      help='learning rate (default: 0.01)')
-  parser.add_argument(
-      '--momentum',
-      type=float,
-      default=0.5,
-      metavar='M',
-      help='SGD momentum (default: 0.5)')
-  parser.add_argument(
-      '--no-gpu',
-      action='store_true',
-      default=False,
-      help='disables GPU usage even if a GPU is available')
-
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist_graph_test.py b/tensorflow/contrib/eager/python/examples/mnist/mnist_graph_test.py
deleted file mode 100644
index 1af26553120b34d4682b17b1c29c81dc65e421d4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist_graph_test.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.contrib.eager.python.examples.mnist import mnist
-
-
-def data_format():
-  return "channels_first" if tf.test.is_gpu_available() else "channels_last"
-
-
-class MNISTGraphTest(tf.test.TestCase):
-
-  def testTrainGraph(self):
-    # The MNISTModel class can be executed eagerly (as in mnist.py and
-    # mnist_test.py) and also be used to construct a TensorFlow graph, which is
-    # then trained in a session.
-    with tf.Graph().as_default():
-      # Generate some random data.
-      batch_size = 64
-      images = np.random.randn(batch_size, 784).astype(np.float32)
-      digits = np.random.randint(low=0, high=10, size=batch_size)
-      labels = np.zeros((batch_size, 10))
-      labels[np.arange(batch_size), digits] = 1.
-
-      # Create a model, optimizer, and dataset as would be done
-      # for eager execution as well.
-      model = mnist.MNISTModel(data_format())
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
-      dataset = tf.data.Dataset.from_tensors((images, labels))
-
-      # Define the loss tensor (as opposed to a loss function when
-      # using eager execution).
-      (images, labels) = dataset.make_one_shot_iterator().get_next()
-      predictions = model(images, training=True)
-      loss = mnist.loss(predictions, labels)
-
-      train_op = optimizer.minimize(loss)
-      init = tf.global_variables_initializer()
-      with tf.Session() as sess:
-        # Variables have to be initialized in the session.
-        sess.run(init)
-        # Train using the optimizer.
-        sess.run(train_op)
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py b/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py
deleted file mode 100644
index 136085eba21284a42282395e54f32c33bf63b5c3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-import tensorflow.contrib.eager as tfe
-from tensorflow.contrib.eager.python.examples.mnist import mnist
-
-
-def device():
-  return "/device:GPU:0" if tfe.num_gpus() else "/device:CPU:0"
-
-
-def data_format():
-  return "channels_first" if tfe.num_gpus() else "channels_last"
-
-
-def random_dataset():
-  batch_size = 64
-  images = tf.random_normal([batch_size, 784])
-  digits = tf.random_uniform([batch_size], minval=0, maxval=10, dtype=tf.int32)
-  labels = tf.one_hot(digits, 10)
-  return tf.data.Dataset.from_tensors((images, labels))
-
-
-def train_one_epoch(defun=False):
-  model = mnist.MNISTModel(data_format())
-  if defun:
-    model.call = tfe.defun(model.call)
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  dataset = random_dataset()
-  with tf.device(device()):
-    tf.train.get_or_create_global_step()
-    mnist.train_one_epoch(model, optimizer, dataset)
-
-
-def evaluate(defun=False):
-  model = mnist.MNISTModel(data_format())
-  dataset = random_dataset()
-  if defun:
-    model.call = tfe.defun(model.call)
-  with tf.device(device()):
-    tf.train.get_or_create_global_step()
-    mnist.test(model, dataset)
-
-
-class MNISTTest(tf.test.TestCase):
-
-  def testTrainOneEpoch(self):
-    train_one_epoch(defun=False)
-
-  def testTest(self):
-    evaluate(defun=False)
-
-  def testTrainOneEpochWithDefunCall(self):
-    train_one_epoch(defun=True)
-
-  def testTestWithDefunCall(self):
-    evaluate(defun=True)
-
-
-if __name__ == "__main__":
-  tfe.enable_eager_execution()
-  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index 536cad998d94e45187d30fce3be0d7a57178e0c1..0c0e28dd95c68dc300384a128eb5aa2208f63a0d 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -14,6 +14,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "resnet50_test_lib",
+    srcs = ["resnet50_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":resnet50",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/eager/python:tfe",
+    ],
+)
+
 cuda_py_test(
     name = "resnet50_test",
     size = "large",
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
index 9982fdb07eefa665379e7be095f4f8017d92cf97..a28bc8a43d7c90737c9baf9a634d736e9de52948 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
@@ -27,10 +27,11 @@ from __future__ import print_function
 import functools
 
 import tensorflow as tf
-import tensorflow.contrib.eager as tfe
 
+layers = tf.keras.layers
 
-class _IdentityBlock(tfe.Network):
+
+class _IdentityBlock(tf.keras.Model):
   """_IdentityBlock is the block that has no conv layer at shortcut.
 
   Args:
@@ -50,31 +51,24 @@ class _IdentityBlock(tfe.Network):
     bn_name_base = 'bn' + str(stage) + block + '_branch'
     bn_axis = 1 if data_format == 'channels_first' else 3
 
-    self.conv2a = self.track_layer(
-        tf.layers.Conv2D(
-            filters1, (1, 1),
-            name=conv_name_base + '2a',
-            data_format=data_format))
-    self.bn2a = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '2a'))
-
-    self.conv2b = self.track_layer(
-        tf.layers.Conv2D(
-            filters2,
-            kernel_size,
-            padding='same',
-            data_format=data_format,
-            name=conv_name_base + '2b'))
-    self.bn2b = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '2b'))
-
-    self.conv2c = self.track_layer(
-        tf.layers.Conv2D(
-            filters3, (1, 1),
-            name=conv_name_base + '2c',
-            data_format=data_format))
-    self.bn2c = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '2c'))
+    self.conv2a = layers.Conv2D(
+        filters1, (1, 1), name=conv_name_base + '2a', data_format=data_format)
+    self.bn2a = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '2a')
+
+    self.conv2b = layers.Conv2D(
+        filters2,
+        kernel_size,
+        padding='same',
+        data_format=data_format,
+        name=conv_name_base + '2b')
+    self.bn2b = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '2b')
+
+    self.conv2c = layers.Conv2D(
+        filters3, (1, 1), name=conv_name_base + '2c', data_format=data_format)
+    self.bn2c = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '2c')
 
   def call(self, input_tensor, training=False):
     x = self.conv2a(input_tensor)
@@ -92,7 +86,7 @@ class _IdentityBlock(tfe.Network):
     return tf.nn.relu(x)
 
 
-class _ConvBlock(tfe.Network):
+class _ConvBlock(tf.keras.Model):
   """_ConvBlock is the block that has a conv layer at shortcut.
 
   Args:
@@ -121,41 +115,35 @@ class _ConvBlock(tfe.Network):
     bn_name_base = 'bn' + str(stage) + block + '_branch'
     bn_axis = 1 if data_format == 'channels_first' else 3
 
-    self.conv2a = self.track_layer(
-        tf.layers.Conv2D(
-            filters1, (1, 1),
-            strides=strides,
-            name=conv_name_base + '2a',
-            data_format=data_format))
-    self.bn2a = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '2a'))
-
-    self.conv2b = self.track_layer(
-        tf.layers.Conv2D(
-            filters2,
-            kernel_size,
-            padding='same',
-            name=conv_name_base + '2b',
-            data_format=data_format))
-    self.bn2b = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '2b'))
-
-    self.conv2c = self.track_layer(
-        tf.layers.Conv2D(
-            filters3, (1, 1),
-            name=conv_name_base + '2c',
-            data_format=data_format))
-    self.bn2c = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '2c'))
-
-    self.conv_shortcut = self.track_layer(
-        tf.layers.Conv2D(
-            filters3, (1, 1),
-            strides=strides,
-            name=conv_name_base + '1',
-            data_format=data_format))
-    self.bn_shortcut = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name=bn_name_base + '1'))
+    self.conv2a = layers.Conv2D(
+        filters1, (1, 1),
+        strides=strides,
+        name=conv_name_base + '2a',
+        data_format=data_format)
+    self.bn2a = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '2a')
+
+    self.conv2b = layers.Conv2D(
+        filters2,
+        kernel_size,
+        padding='same',
+        name=conv_name_base + '2b',
+        data_format=data_format)
+    self.bn2b = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '2b')
+
+    self.conv2c = layers.Conv2D(
+        filters3, (1, 1), name=conv_name_base + '2c', data_format=data_format)
+    self.bn2c = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '2c')
+
+    self.conv_shortcut = layers.Conv2D(
+        filters3, (1, 1),
+        strides=strides,
+        name=conv_name_base + '1',
+        data_format=data_format)
+    self.bn_shortcut = layers.BatchNormalization(
+        axis=bn_axis, name=bn_name_base + '1')
 
   def call(self, input_tensor, training=False):
     x = self.conv2a(input_tensor)
@@ -176,7 +164,8 @@ class _ConvBlock(tfe.Network):
     return tf.nn.relu(x)
 
 
-class ResNet50(tfe.Network):
+# pylint: disable=not-callable
+class ResNet50(tf.keras.Model):
   """Instantiates the ResNet50 architecture.
 
   Args:
@@ -220,32 +209,28 @@ class ResNet50(tfe.Network):
     self.include_top = include_top
 
     def conv_block(filters, stage, block, strides=(2, 2)):
-      l = _ConvBlock(
+      return _ConvBlock(
           3,
           filters,
           stage=stage,
           block=block,
           data_format=data_format,
           strides=strides)
-      return self.track_layer(l)
 
     def id_block(filters, stage, block):
-      l = _IdentityBlock(
+      return _IdentityBlock(
           3, filters, stage=stage, block=block, data_format=data_format)
-      return self.track_layer(l)
-
-    self.conv1 = self.track_layer(
-        tf.layers.Conv2D(
-            64, (7, 7),
-            strides=(2, 2),
-            data_format=data_format,
-            padding='same',
-            name='conv1'))
+
+    self.conv1 = layers.Conv2D(
+        64, (7, 7),
+        strides=(2, 2),
+        data_format=data_format,
+        padding='same',
+        name='conv1')
     bn_axis = 1 if data_format == 'channels_first' else 3
-    self.bn_conv1 = self.track_layer(
-        tf.layers.BatchNormalization(axis=bn_axis, name='bn_conv1'))
-    self.max_pool = self.track_layer(
-        tf.layers.MaxPooling2D((3, 3), strides=(2, 2), data_format=data_format))
+    self.bn_conv1 = layers.BatchNormalization(axis=bn_axis, name='bn_conv1')
+    self.max_pool = layers.MaxPooling2D(
+        (3, 3), strides=(2, 2), data_format=data_format)
 
     self.l2a = conv_block([64, 64, 256], stage=2, block='a', strides=(1, 1))
     self.l2b = id_block([64, 64, 256], stage=2, block='b')
@@ -267,13 +252,12 @@ class ResNet50(tfe.Network):
     self.l5b = id_block([512, 512, 2048], stage=5, block='b')
     self.l5c = id_block([512, 512, 2048], stage=5, block='c')
 
-    self.avg_pool = self.track_layer(
-        tf.layers.AveragePooling2D(
-            (7, 7), strides=(7, 7), data_format=data_format))
+    self.avg_pool = layers.AveragePooling2D(
+        (7, 7), strides=(7, 7), data_format=data_format)
 
     if self.include_top:
-      self.fc1000 = self.track_layer(
-          tf.layers.Dense(classes, name='fc1000'))
+      self.flatten = layers.Flatten()
+      self.fc1000 = layers.Dense(classes, name='fc1000')
     else:
       reduction_indices = [1, 2] if data_format == 'channels_last' else [2, 3]
       reduction_indices = tf.constant(reduction_indices)
@@ -288,7 +272,7 @@ class ResNet50(tfe.Network):
       else:
         self.global_pooling = None
 
-  def call(self, input_tensor, training=False):
+  def call(self, input_tensor, training):
     x = self.conv1(input_tensor)
     x = self.bn_conv1(x, training=training)
     x = tf.nn.relu(x)
@@ -317,7 +301,7 @@ class ResNet50(tfe.Network):
     x = self.avg_pool(x)
 
     if self.include_top:
-      return self.fc1000(tf.layers.flatten(x))
+      return self.fc1000(self.flatten(x))
     elif self.global_pooling:
       return self.global_pooling(x)
     else:
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index 23317886e712323f4b520000e0fd372734fc53a1..551c76b0df71c88919df9cd6d81b4176b23b0ba3 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -55,7 +55,7 @@ class ResNet50GraphTest(tf.test.TestCase):
     with tf.Graph().as_default():
       images = tf.placeholder(tf.float32, image_shape(None))
       model = resnet50.ResNet50(data_format())
-      predictions = model(images)
+      predictions = model(images, training=False)
 
       init = tf.global_variables_initializer()
 
@@ -114,7 +114,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
     with tf.Graph().as_default():
       images = tf.placeholder(tf.float32, image_shape(None))
       model = resnet50.ResNet50(data_format())
-      predictions = model(images)
+      predictions = model(images, training=False)
 
       init = tf.global_variables_initializer()
 
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 0ff8746884c288f824f5f22ab4c550370d0e0302..8517a3bf7b6aebf4ecd2f148d2160cfea1b1b9c0 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -36,8 +36,8 @@ def device_and_data_format():
                                                               'channels_last')
 
 
-def random_batch(batch_size):
-  _, data_format = device_and_data_format()
+def random_batch(batch_size, device_and_format=None):
+  _, data_format = device_and_format or device_and_data_format()
 
   shape = (3, 224, 224) if data_format == 'channels_first' else (224, 224, 3)
   shape = (batch_size,) + shape
@@ -64,28 +64,35 @@ def train_one_step(model, images, labels, optimizer):
 
 class ResNet50Test(tf.test.TestCase):
 
-  def _apply(self, defun=False):
+  def _apply(self, defun=False, execution_mode=None):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
     if defun:
       model.call = tfe.defun(model.call)
-    with tf.device(device):
+    with tf.device(device), tfe.execution_mode(execution_mode):
       images, _ = random_batch(2)
-      output = model(images)
+      output = model(images, training=False)
+      tfe.async_wait()
     self.assertEqual((2, 1000), output.shape)
 
   def test_apply(self):
     self._apply(defun=False)
 
+  def test_apply_async(self):
+    self._apply(defun=False, execution_mode=tfe.ASYNC)
+
   def test_apply_with_defun(self):
     self._apply(defun=True)
 
+  def test_apply_with_defun_async(self):
+    self._apply(defun=True, execution_mode=tfe.ASYNC)
+
   def test_apply_no_top(self):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format, include_top=False)
     with tf.device(device):
       images, _ = random_batch(2)
-      output = model(images)
+      output = model(images, training=False)
     output_shape = ((2, 2048, 1, 1)
                     if data_format == 'channels_first' else (2, 1, 1, 2048))
     self.assertEqual(output_shape, output.shape)
@@ -95,10 +102,10 @@ class ResNet50Test(tf.test.TestCase):
     model = resnet50.ResNet50(data_format, include_top=False, pooling='avg')
     with tf.device(device):
       images, _ = random_batch(2)
-      output = model(images)
+      output = model(images, training=False)
     self.assertEqual((2, 2048), output.shape)
 
-  def test_train(self):
+  def _test_train(self, execution_mode=None):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
     tf.train.get_or_create_global_step()
@@ -106,15 +113,22 @@ class ResNet50Test(tf.test.TestCase):
     with tf.contrib.summary.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), tf.contrib.summary.always_record_summaries():
-      with tf.device(device):
+      with tf.device(device), tfe.execution_mode(execution_mode):
         optimizer = tf.train.GradientDescentOptimizer(0.1)
         images, labels = random_batch(2)
         train_one_step(model, images, labels, optimizer)
         self.assertEqual(320, len(model.variables))
+        tfe.async_wait()
     events = summary_test_util.events_from_logdir(logdir)
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].tag, 'loss')
 
+  def test_train(self):
+    self._test_train()
+
+  def test_train_async(self):
+    self._test_train(execution_mode=tfe.ASYNC)
+
   def test_no_garbage(self):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
@@ -155,7 +169,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
   def _train_batch_sizes(self):
     """Choose batch sizes based on GPU capability."""
     for device in device_lib.list_local_devices():
-      if 'GPU:0' in device.name:
+      if tf.DeviceSpec.from_string(device.name).device_type == 'GPU':
         # Avoid OOM errors with larger batch sizes, which seem to cause errors
         # later on even if caught.
         #
@@ -166,76 +180,108 @@ class ResNet50Benchmarks(tf.test.Benchmark):
           return (16,)
         if 'P100' in device.physical_device_desc:
           return (16, 32, 64)
+
+      if tf.DeviceSpec.from_string(device.name).device_type == 'TPU':
+        # TODO(iga): Training fails with batch size of 16, probably because of
+        # no layout optimizations with op-by-op mode. Investigate more.
+        return (8,)
     return (16, 32)
 
   def _report(self, label, start, num_iters, device, batch_size, data_format):
     avg_time = (time.time() - start) / num_iters
-    dev = 'cpu' if 'cpu' in device else 'gpu'
+    dev = tf.DeviceSpec.from_string(device).device_type.lower()
     name = '%s_%s_batch_%d_%s' % (label, dev, batch_size, data_format)
     extras = {'examples_per_sec': batch_size / avg_time}
     self.report_benchmark(
         iters=num_iters, wall_time=avg_time, name=name, extras=extras)
 
-  def _force_gpu_sync(self):
-    # If this function is called in the context of a GPU device
+  def _force_device_sync(self):
+    # If this function is called in the context of a non-CPU device
     # (e.g., inside a 'with tf.device("/gpu:0")' block)
-    # then this will force a copy from CPU->GPU->CPU, which forces
-    # a sync. This is a roundabout way, yes.
+    # then this will force a copy from CPU->NON_CPU_DEVICE->CPU,
+    # which forces a sync. This is a roundabout way, yes.
     tf.constant(1.).cpu()
 
-  def _benchmark_eager_apply(self, label, defun=False):
-    device, data_format = device_and_data_format()
-    model = resnet50.ResNet50(data_format)
-    if defun:
-      model.call = tfe.defun(model.call)
-    batch_size = 64
-    num_burn = 5
-    num_iters = 30
-    with tf.device(device):
-      images, _ = random_batch(batch_size)
-      for _ in xrange(num_burn):
-        model(images).cpu()
-      gc.collect()
-      start = time.time()
-      for _ in xrange(num_iters):
-        model(images).cpu()
-      self._report(label, start, num_iters, device, batch_size, data_format)
-
-  def benchmark_eager_apply(self):
-    self._benchmark_eager_apply('eager_apply', defun=False)
-
-  def benchmark_eager_apply_with_defun(self):
-    self._benchmark_eager_apply('eager_apply_with_defun', defun=True)
-
-  def _benchmark_eager_train(self, label, make_iterator, defun=False):
-    device, data_format = device_and_data_format()
-    for batch_size in self._train_batch_sizes():
-      (images, labels) = random_batch(batch_size)
-      num_burn = 3
-      num_iters = 10
+  def _benchmark_eager_apply(self, label, defun=False, execution_mode=None,
+                             device_and_format=None):
+    with tfe.execution_mode(execution_mode):
+      device, data_format = device_and_format or device_and_data_format()
       model = resnet50.ResNet50(data_format)
       if defun:
         model.call = tfe.defun(model.call)
-      optimizer = tf.train.GradientDescentOptimizer(0.1)
-
+      batch_size = 64
+      num_burn = 5
+      num_iters = 30
       with tf.device(device):
-        iterator = make_iterator((images, labels))
+        images, _ = random_batch(batch_size, device_and_format)
         for _ in xrange(num_burn):
-          (images, labels) = iterator.next()
-          train_one_step(model, images, labels, optimizer)
-        self._force_gpu_sync()
+          model(images, training=False).cpu()
+        if execution_mode:
+          tfe.async_wait()
         gc.collect()
-
         start = time.time()
         for _ in xrange(num_iters):
-          (images, labels) = iterator.next()
-          train_one_step(model, images, labels, optimizer)
-        self._force_gpu_sync()
+          model(images, training=False).cpu()
+        if execution_mode:
+          tfe.async_wait()
         self._report(label, start, num_iters, device, batch_size, data_format)
 
-  def benchmark_eager_train(self):
+  def benchmark_eager_apply_sync(self):
+    self._benchmark_eager_apply('eager_apply', defun=False)
+
+  def benchmark_eager_apply_async(self):
+    self._benchmark_eager_apply(
+        'eager_apply_async', defun=False, execution_mode=tfe.ASYNC)
+
+  def benchmark_eager_apply_with_defun(self):
+    self._benchmark_eager_apply('eager_apply_with_defun', defun=True)
+
+  def _benchmark_eager_train(self,
+                             label,
+                             make_iterator,
+                             defun=False,
+                             execution_mode=None,
+                             device_and_format=None):
+    with tfe.execution_mode(execution_mode):
+      device, data_format = device_and_format or device_and_data_format()
+      for batch_size in self._train_batch_sizes():
+        (images, labels) = random_batch(batch_size, device_and_format)
+        num_burn = 3
+        num_iters = 10
+        model = resnet50.ResNet50(data_format)
+        if defun:
+          model.call = tfe.defun(model.call)
+        optimizer = tf.train.GradientDescentOptimizer(0.1)
+
+        with tf.device(device):
+          iterator = make_iterator((images, labels))
+          for _ in xrange(num_burn):
+            (images, labels) = iterator.next()
+            train_one_step(model, images, labels, optimizer)
+          if execution_mode:
+            tfe.async_wait()
+          self._force_device_sync()
+          gc.collect()
+
+          start = time.time()
+          for _ in xrange(num_iters):
+            (images, labels) = iterator.next()
+            train_one_step(model, images, labels, optimizer)
+          if execution_mode:
+            tfe.async_wait()
+          self._force_device_sync()
+          self._report(label, start, num_iters, device, batch_size, data_format)
+
+  def benchmark_eager_train_sync(self):
     self._benchmark_eager_train('eager_train', MockIterator, defun=False)
 
+  def benchmark_eager_train_async(self):
+    self._benchmark_eager_train(
+        'eager_train_async',
+        MockIterator,
+        defun=False,
+        execution_mode=tfe.ASYNC)
+
   def benchmark_eager_train_with_defun(self):
     self._benchmark_eager_train(
         'eager_train_with_defun', MockIterator, defun=True)
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index aa87b94e7b0876e65405f6bcb2d6aabde36582bf..492adbe1d80941f9df96d6636e4933d11239408e 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -60,6 +60,7 @@ import functools
 import os
 import sys
 import time
+import urllib
 
 import six
 import tensorflow as tf
@@ -72,6 +73,8 @@ try:
 except ImportError:
   HAS_MATPLOTLIB = False
 
+layers = tf.keras.layers
+
 
 def parse(line):
   """Parse a line from the colors dataset."""
@@ -89,13 +92,35 @@ def parse(line):
   return rgb, chars, length
 
 
+def maybe_download(filename, work_directory, source_url):
+  """Download the data from source url, unless it's already here.
+
+  Args:
+    filename: string, name of the file in the directory.
+    work_directory: string, path to working directory.
+    source_url: url to download from if file doesn't exist.
+
+  Returns:
+    Path to resulting file.
+  """
+  if not tf.gfile.Exists(work_directory):
+    tf.gfile.MakeDirs(work_directory)
+  filepath = os.path.join(work_directory, filename)
+  if not tf.gfile.Exists(filepath):
+    temp_file_name, _ = urllib.request.urlretrieve(source_url)
+    tf.gfile.Copy(temp_file_name, filepath)
+    with tf.gfile.GFile(filepath) as f:
+      size = f.size()
+      print("Successfully downloaded", filename, size, "bytes.")
+  return filepath
+
+
 def load_dataset(data_dir, url, batch_size):
   """Loads the colors data at path into a PaddedDataset."""
 
   # Downloads data at url into data_dir/basename(url). The dataset has a header
   # row (color_name, r, g, b) followed by comma-separated lines.
-  path = tf.contrib.learn.datasets.base.maybe_download(
-      os.path.basename(url), data_dir, url)
+  path = maybe_download(os.path.basename(url), data_dir, url)
 
   # This chain of commands loads our data by:
   #   1. skipping the header; (.skip(1))
@@ -109,7 +134,7 @@ def load_dataset(data_dir, url, batch_size):
 
 
 # pylint: disable=not-callable
-class RNNColorbot(tfe.Network):
+class RNNColorbot(tf.keras.Model):
   """Multi-layer (LSTM) RNN that regresses on real-valued vector labels.
   """
 
@@ -127,23 +152,20 @@ class RNNColorbot(tfe.Network):
     self.label_dimension = label_dimension
     self.keep_prob = keep_prob
 
-    # Note the calls to `track_layer` below; these calls register the layers as
-    # network components that house trainable variables.
-    self.cells = [
-        self.track_layer(tf.nn.rnn_cell.BasicLSTMCell(size))
-        for size in rnn_cell_sizes
-    ]
-    self.relu = self.track_layer(
-        tf.layers.Dense(label_dimension, activation=tf.nn.relu, name="relu"))
+    self.cells = self._add_cells(
+        [tf.nn.rnn_cell.BasicLSTMCell(size) for size in rnn_cell_sizes])
+    self.relu = layers.Dense(
+        label_dimension, activation=tf.nn.relu, name="relu")
 
-  def call(self, chars, sequence_length, training=False):
+  def call(self, inputs, training=False):
     """Implements the RNN logic and prediction generation.
 
     Args:
-      chars: a Tensor of dimension [batch_size, time_steps, 256] holding a
-        batch of one-hot encoded color names
-      sequence_length: a Tensor of dimension [batch_size] holding the length
-        of each character sequence (i.e., color name)
+      inputs: A tuple (chars, sequence_length), where chars is a batch of
+        one-hot encoded color names represented as a Tensor with dimensions
+        [batch_size, time_steps, 256] and sequence_length holds the length
+        of each character sequence (color name) as a Tensor with dimension
+        [batch_size].
       training: whether the invocation is happening during training
 
     Returns:
@@ -151,6 +173,7 @@ class RNNColorbot(tfe.Network):
       passing chars through a multi-layer RNN and applying a ReLU to the final
       hidden state.
     """
+    (chars, sequence_length) = inputs
     # Transpose the first and second dimensions so that chars is of shape
     # [time_steps, batch_size, dimension].
     chars = tf.transpose(chars, [1, 0, 2])
@@ -181,6 +204,14 @@ class RNNColorbot(tfe.Network):
     hidden_states = tf.gather_nd(chars, indices)
     return self.relu(hidden_states)
 
+  def _add_cells(self, cells):
+    # "Magic" required for keras.Model classes to track all the variables in
+    # a list of layers.Layer objects.
+    # TODO(ashankar): Figure out API so user code doesn't have to do this.
+    for i, c in enumerate(cells):
+      setattr(self, "cell-%d" % i, c)
+    return cells
+
 
 def loss(labels, predictions):
   """Computes mean squared loss."""
@@ -191,7 +222,7 @@ def test(model, eval_data):
   """Computes the average loss on eval_data, which should be a Dataset."""
   avg_loss = tfe.metrics.Mean("loss")
   for (labels, chars, sequence_length) in tfe.Iterator(eval_data):
-    predictions = model(chars, sequence_length, training=False)
+    predictions = model((chars, sequence_length), training=False)
     avg_loss(loss(labels, predictions))
   print("eval/loss: %.6f\n" % avg_loss.result())
   with tf.contrib.summary.always_record_summaries():
@@ -204,7 +235,7 @@ def train_one_epoch(model, optimizer, train_data, log_interval=10):
   tf.train.get_or_create_global_step()
 
   def model_loss(labels, chars, sequence_length):
-    predictions = model(chars, sequence_length, training=True)
+    predictions = model((chars, sequence_length), training=True)
     loss_value = loss(labels, predictions)
     tf.contrib.summary.scalar("loss", loss_value)
     return loss_value
@@ -277,7 +308,7 @@ def main(_):
       (chars, length) = (tf.identity(chars), tf.identity(length))
       chars = tf.expand_dims(chars, 0)
       length = tf.expand_dims(length, 0)
-      preds = tf.unstack(model(chars, length, training=False)[0])
+      preds = tf.unstack(model((chars, length), training=False)[0])
 
     # Predictions cannot be negative, as they are generated by a ReLU layer;
     # they may, however, be greater than 1.
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 5c5c59c87744f4ffa6db90e5d8d3aa3bc8132756..be5d60449d7e08c99cc28e76befce56f468c77fd 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -38,22 +38,26 @@ import tensorflow as tf
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.eager.python import tfe
 
+layers = tf.keras.layers
 
-class RNN(tfe.Network):
+
+class RNN(tf.keras.Model):
   """A static RNN.
 
-  Similar to tf.nn.static_rnn, implemented as a tf.layer.Layer.
+  Similar to tf.nn.static_rnn, implemented as a class.
   """
 
   def __init__(self, hidden_dim, num_layers, keep_ratio):
     super(RNN, self).__init__()
     self.keep_ratio = keep_ratio
-    for _ in range(num_layers):
-      self.track_layer(tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim))
+    self.cells = self._add_cells([
+        tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim)
+        for _ in range(num_layers)
+    ])
 
   def call(self, input_seq, training):
     batch_size = int(input_seq.shape[1])
-    for c in self.layers:
+    for c in self.cells:
       state = c.zero_state(batch_size, tf.float32)
       outputs = []
       input_seq = tf.unstack(input_seq, num=int(input_seq.shape[0]), axis=0)
@@ -64,10 +68,22 @@ class RNN(tfe.Network):
       input_seq = tf.stack(outputs, axis=0)
       if training:
         input_seq = tf.nn.dropout(input_seq, self.keep_ratio)
-    return input_seq, None
-
-
-class Embedding(tf.layers.Layer):
+    # Returning a list instead of a single tensor so that the line:
+    # y = self.rnn(y, ...)[0]
+    # in PTBModel.call works for both this RNN and CudnnLSTM (which returns a
+    # tuple (output, output_states).
+    return [input_seq]
+
+  def _add_cells(self, cells):
+    # "Magic" required for keras.Model classes to track all the variables in
+    # a list of Layer objects.
+    # TODO(ashankar): Figure out API so user code doesn't have to do this.
+    for i, c in enumerate(cells):
+      setattr(self, "cell-%d" % i, c)
+    return cells
+
+
+class Embedding(layers.Layer):
   """An Embedding layer."""
 
   def __init__(self, vocab_size, embedding_dim, **kwargs):
@@ -87,7 +103,8 @@ class Embedding(tf.layers.Layer):
     return tf.nn.embedding_lookup(self.embedding, x)
 
 
-class PTBModel(tfe.Network):
+# pylint: disable=not-callable
+class PTBModel(tf.keras.Model):
   """LSTM for word language modeling.
 
   Model described in:
@@ -109,19 +126,16 @@ class PTBModel(tfe.Network):
 
     self.keep_ratio = 1 - dropout_ratio
     self.use_cudnn_rnn = use_cudnn_rnn
-    self.embedding = self.track_layer(Embedding(vocab_size, embedding_dim))
+    self.embedding = Embedding(vocab_size, embedding_dim)
 
     if self.use_cudnn_rnn:
       self.rnn = cudnn_rnn.CudnnLSTM(
           num_layers, hidden_dim, dropout=dropout_ratio)
     else:
       self.rnn = RNN(hidden_dim, num_layers, self.keep_ratio)
-    self.track_layer(self.rnn)
 
-    self.linear = self.track_layer(
-        tf.layers.Dense(
-            vocab_size,
-            kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1)))
+    self.linear = layers.Dense(
+        vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
     self._output_shape = [-1, embedding_dim]
 
   def call(self, input_seq, training):
@@ -136,7 +150,7 @@ class PTBModel(tfe.Network):
     y = self.embedding(input_seq)
     if training:
       y = tf.nn.dropout(y, self.keep_ratio)
-    y, _ = self.rnn(y, training=training)
+    y = self.rnn(y, training=training)[0]
     return self.linear(tf.reshape(y, self._output_shape))
 
 
@@ -148,7 +162,7 @@ def clip_gradients(grads_and_vars, clip_ratio):
 
 def loss_fn(model, inputs, targets, training):
   labels = tf.reshape(targets, [-1])
-  outputs = model(inputs, training)
+  outputs = model(inputs, training=training)
   return tf.reduce_mean(
       tf.nn.sparse_softmax_cross_entropy_with_logits(
           labels=labels, logits=outputs))
@@ -301,32 +315,37 @@ def main(_):
   have_gpu = tfe.num_gpus() > 0
   use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu
 
-  with tfe.restore_variables_on_create(
-      tf.train.latest_checkpoint(FLAGS.logdir)):
-    with tf.device("/device:GPU:0" if have_gpu else None):
-      # Make learning_rate a Variable so it can be included in the checkpoint
-      # and we can resume training with the last saved learning_rate.
-      learning_rate = tfe.Variable(20.0, name="learning_rate")
-      sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
-      model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
-                       FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
-                       use_cudnn_rnn)
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-
-      best_loss = None
-      for _ in range(FLAGS.epoch):
-        train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
-        eval_loss = evaluate(model, eval_data)
-        if not best_loss or eval_loss < best_loss:
-          if FLAGS.logdir:
-            tfe.Saver(model.trainable_weights + [learning_rate]).save(
-                os.path.join(FLAGS.logdir, "ckpt"))
-          best_loss = eval_loss
-        else:
-          learning_rate.assign(learning_rate / 4.0)
-          sys.stderr.write("eval_loss did not reduce in this epoch, "
-                           "changing learning rate to %f for the next epoch\n" %
-                           learning_rate.numpy())
+  with tf.device("/device:GPU:0" if have_gpu else None):
+    # Make learning_rate a Variable so it can be included in the checkpoint
+    # and we can resume training with the last saved learning_rate.
+    learning_rate = tfe.Variable(20.0, name="learning_rate")
+    model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
+                     FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
+                     use_cudnn_rnn)
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+    checkpoint = tfe.Checkpoint(
+        learning_rate=learning_rate, model=model,
+        # GradientDescentOptimizer has no state to checkpoint, but noting it
+        # here lets us swap in an optimizer that does.
+        optimizer=optimizer)
+    # Restore existing variables now (learning_rate), and restore new variables
+    # on creation if a checkpoint exists.
+    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir))
+    sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
+
+    best_loss = None
+    for _ in range(FLAGS.epoch):
+      train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
+      eval_loss = evaluate(model, eval_data)
+      if not best_loss or eval_loss < best_loss:
+        if FLAGS.logdir:
+          checkpoint.save(os.path.join(FLAGS.logdir, "ckpt"))
+        best_loss = eval_loss
+      else:
+        learning_rate.assign(learning_rate / 4.0)
+        sys.stderr.write("eval_loss did not reduce in this epoch, "
+                         "changing learning rate to %f for the next epoch\n" %
+                         learning_rate.numpy())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
index a1f8a759e2a556bc219f0aa13942f293c4f34cfa..5966f1d4873e8e77b3ad5914da7bfc7e69d4e341 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/BUILD
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -38,5 +38,9 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
-    tags = ["no_pip"],  # because spinn.py is under third_party/.
+    tags = [
+        "no-internal-py3",  # flaky
+        "no_cuda_on_cpu_tap",
+        "no_pip",  # because spinn.py is under third_party/.
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 081b0af14fcc983a3f85d2a50e2bb04d2f2493b3..f825a2a7363fbe144162eca96398920ead0c4e50 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -34,6 +34,7 @@ import tensorflow.contrib.eager as tfe
 from tensorflow.contrib.eager.python.examples.spinn import data
 from third_party.examples.eager.spinn import spinn
 from tensorflow.contrib.summary import summary_test_util
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.training import checkpoint_utils
@@ -417,12 +418,17 @@ class SpinnTest(test_util.TensorFlowTestCase):
                     if event.summary.value
                     and event.summary.value[0].tag == "train/loss"]
     self.assertEqual(config.epochs, len(train_losses))
-    self.assertLess(train_losses[-1], train_losses[0])
 
     # 5. Verify that checkpoints exist and contains all the expected variables.
     self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*")))
-    ckpt_variable_names = [
-        item[0] for item in checkpoint_utils.list_variables(config.logdir)]
+    object_graph_string = checkpoint_utils.load_variable(
+        config.logdir, name="_CHECKPOINTABLE_OBJECT_GRAPH")
+    object_graph = checkpointable_object_graph_pb2.CheckpointableObjectGraph()
+    object_graph.ParseFromString(object_graph_string)
+    ckpt_variable_names = set()
+    for node in object_graph.nodes:
+      for attribute in node.attributes:
+        ckpt_variable_names.add(attribute.full_name)
     self.assertIn("global_step", ckpt_variable_names)
     for v in trainer.variables:
       variable_name = v.name[:v.name.index(":")] if ":" in v.name else v.name
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index d97ff6b74cf033617154f7cbbd00cb6492a1d2f4..2d2aba6908b168e0bf63f4706b6344cbb4ca82bd 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -1,903 +1,18 @@
-# TensorFlow Eager Execution
-
-## What is this?
+# Eager execution
 
 Eager execution is a feature that makes TensorFlow execute operations
-immediately: concrete values are returned, instead of a computational graph to
-be executed later.
-
-As a result, enabling eager execution provides:
-
--   A [NumPy](http://www.numpy.org/)-like library for numerical computation with
-    support for GPU acceleration and automatic differentiation.
--   A flexible platform for machine learning research and experimentation.
-
-Eager execution is under active development. This guide walks through an
-alpha/preview release. In particular, not all TensorFlow APIs currently work
-with eager execution enabled, and some models may be slow to execute, compared
-to models defined without using eager execution.
-
-## Installation
-
-Eager execution is included in TensorFlow versions 1.5 and above.
-Installation instructions at https://www.tensorflow.org/install/
-
-The contents of this guide are compatible with TensorFlow 1.5.
-However, if you run into bugs that are fixed in source but not the
-release, you may want to either [build from
-source](https://www.tensorflow.org/install/install_sources)
-or try a nightly build. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-docker pull tensorflow/tensorflow:nightly-gpu
-docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment.
-
-## Getting Started
-
-With TensorFlow installed, eager execution is enabled via a single call:
-
-```python
-import tensorflow as tf
-
-import tensorflow.contrib.eager as tfe
-
-tfe.enable_eager_execution()
-```
-
-Enabling eager execution changes how TensorFlow functions behave (in particular,
-`Tensor` objects will reference concrete values instead of being symbolic
-handles to nodes in a computational graph). As a result, eager execution should
-be enabled at the beginning of a program and cannot be disabled afterwards in
-the same program.
-
-Code examples in the rest of this guide assume that eager execution has been
-enabled.
-
-## A library for numerical computation
-
-A significant fraction of the [TensorFlow
-API](https://www.tensorflow.org/api_docs/python/) consists of numerical
-operations:
-[arithmetic operations](https://www.tensorflow.org/api_guides/python/math_ops#Arithmetic_Operators),
-[matrix operations](https://www.tensorflow.org/api_guides/python/math_ops#Matrix_Math_Functions),
-[linear algebra operations](https://www.tensorflow.org/versions/master/api_docs/python/tf/linalg),
-etc.
-
-With eager execution enabled, these operations consume and return
-multi-dimensional arrays as `Tensor` objects, similar to NumPy
-[`ndarray`s](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.ndarray.html).
-For example:
-
-```python
-# Multiply two 2x2 matrices
-x = tf.matmul([[1, 2],
-               [3, 4]],
-              [[4, 5],
-               [6, 7]])
-# Add one to each element
-# (tf.add supports broadcasting)
-y = tf.add(x, 1)
-
-# Create a random random 5x3 matrix
-z = tf.random_uniform([5, 3])
-
-print(x)
-print(y)
-print(z)
-```
-
-Output:
-
-```
-tf.Tensor(
-[[16 19]
- [36 43]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[17 20]
- [37 44]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[ 0.25058532  0.0929395   0.54113817]
- [ 0.3108716   0.93350542  0.84909797]
- [ 0.53081679  0.12788558  0.01767385]
- [ 0.29725885  0.33540785  0.83588314]
- [ 0.38877153  0.39720535  0.78914213]], shape=(5, 3), dtype=float32)
-```
-
-For convenience, these operations can also be triggered via operator overloading
-of the `Tensor` object. For example, the `+` operator is equivalent to `tf.add`,
-`-` to `tf.subtract`, `*` to `tf.multiply`, etc.:
-
-```python
-x = (tf.ones([1], dtype=tf.float32) + 1) * 2 - 1
-print(x)
-```
-
-Output:
-
-```
-tf.Tensor([ 3.], shape=(1,), dtype=float32)
-```
-
-### Converting to and from NumPy
-
-The operations above automatically convert Python objects (like lists of
-numbers) and NumPy arrays to `Tensor` objects. `Tensor` objects can also be used
-as NumPy arrays by numpy operations.
-
-```python
-import numpy as np
-
-x = tf.add(1, 1)                     # tf.Tensor with a value of 2
-y = tf.add(np.array(1), np.array(1)) # tf.Tensor with a value of 2
-z = np.multiply(x, y)                # numpy.int64 with a value of 4
-```
-
-Alternatively, they can be explicitly converted using
-[`tf.constant`](https://www.tensorflow.org/api_docs/python/tf/constant), as
-shown in the next example.
-
-Conversely, you can call the `numpy()` method of a `Tensor` object' to obtain
-its NumPy `ndarray` value. For example:
-
-```python
-import numpy as np
-
-np_x = np.array(2., dtype=np.float32)
-x = tf.constant(np_x)
-
-py_y = 3.
-y = tf.constant(py_y)
-
-z = x + y + 1
-
-print(z)
-print(z.numpy())
-```
-
-Output:
-
-```
-tf.Tensor(6.0, shape=(), dtype=float32)
-6.0
-```
-
-### GPU acceleration
-
-Many TensorFlow operations support GPU acceleration. With eager execution
-enabled, [computation is *not* automatically
-offloaded](https://www.tensorflow.org/tutorials/using_gpu) to GPUs. Instead, you
-must explicitly specify when GPUs should be used.
-
-The simplest way to do this is to enclose your computation in a `with
-tf.device('/gpu:0')` block. Also of interest is the `tfe.num_gpus()` function,
-which returns the number of available GPUs.
-
-For example, consider this snippet to measure the time to multiply two 1000x1000
-matrices on CPU:
-
-```python
-import time
-
-def measure(x):
-  # The very first time a GPU is used by TensorFlow, it is initialized.
-  # So exclude the first run from timing.
-  tf.matmul(x, x)
-
-  start = time.time()
-  for i in range(10):
-    tf.matmul(x, x)
-  end = time.time()
-
-  return "Took %s seconds to multiply a %s matrix by itself 10 times" % (end - start, x.shape)
-
-# Run on CPU:
-with tf.device("/cpu:0"):
-  print("CPU: %s" % measure(tf.random_normal([1000, 1000])))
-
-# If a GPU is available, run on GPU:
-if tfe.num_gpus() > 0:
-  with tf.device("/gpu:0"):
-    print("GPU: %s" % measure(tf.random_normal([1000, 1000])))
-```
-
-Output (exact numbers will depend on the characteristics of the hardware):
-
-```python
-CPU: Took 0.145531892776 seconds to multiply a (1000, 1000) matrix by itself 10 times
-GPU: Took 0.000458955764771 seconds to multiply a (1000, 1000) matrix by itself 10 times
-```
-
-Alternatively, methods on the `Tensor` object can be used to explicitly copy the
-`Tensor` to a different device. Operations are typically executed on the device
-on which the inputs are placed. For example:
-
-```python
-x = tf.random_normal([10, 10])
-
-x_gpu0 = x.gpu()
-x_cpu = x.cpu()
-
-_ = tf.matmul(x_cpu, x_cpu)  # Runs on CPU
-_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
-
-if tfe.num_gpus() > 1:
-  x_gpu1 = x.gpu(1)
-  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
-```
-
-### Automatic Differentiation
-
-[Automatic
-differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) is
-very useful when implementing many machine learning algorithms (e.g.,
-[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
-neural networks). For this purpose, TensorFlow eager execution provides an
-[autograd](https://github.com/HIPS/autograd)-style API for automatic
-differentiation. Specifically, the functions:
-
--   `tfe.gradients_function(f)`: Returns a Python function that computes the
-    derivatives of the Python function `f` with respect to its arguments. `f`
-    must return a scalar value. When the returned function is invoked, it
-    returns a list of `Tensor` objects (one element for each argument of `f`).
--   `tfe.value_and_gradients_function(f)`: Similar to `tfe.gradients_function`,
-    except that when the returned function is invoked, it returns the value of
-    `f` in addition to the list of derivatives of `f` with respect to its
-    arguments.
-
-These functions naturally apply to higher order differentiation as well. For
-example:
-
-```python
-def f(x):
-  return tf.multiply(x, x)  # Or x * x
-assert 9 == f(3.).numpy()
-
-df = tfe.gradients_function(f)
-assert 6 == df(3.)[0].numpy()
-
-# Second order deriviative.
-d2f = tfe.gradients_function(lambda x: df(x)[0])
-assert 2 == d2f(3.)[0].numpy()
-
-# Third order derivative.
-d3f = tfe.gradients_function(lambda x : d2f(x)[0])
-assert 0 == d3f(3.)[0].numpy()
-```
-
-These functions can be used to train models. For example, consider the following
-simple linear regression model:
-
-```python
-def prediction(input, weight, bias):
-  return input * weight + bias
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# A loss function: Mean-squared error
-def loss(weight, bias):
-  error = prediction(training_inputs, weight, bias) - training_outputs
-  return tf.reduce_mean(tf.square(error))
-
-# Function that returns the derivative of loss with respect to
-# weight and bias
-grad = tfe.gradients_function(loss)
-
-# Train for 200 steps (starting from some random choice for W and B, on the same
-# batch of data).
-W = 5.
-B = 10.
-learning_rate = 0.01
-print("Initial loss: %f" % loss(W, B).numpy())
-for i in range(200):
-  (dW, dB) = grad(W, B)
-  W -= dW * learning_rate
-  B -= dB * learning_rate
-  if i % 20 == 0:
-    print("Loss at step %d: %f" % (i, loss(W, B).numpy()))
-print("Final loss: %f" % loss(W, B).numpy())
-print("W, B = %f, %f" % (W.numpy(), B.numpy()))
-```
-
-Output: (the exact numbers may vary depending on the randomness in noise)
-
-```
-Initial loss: 66.730003
-Loss at step 0: 64.200096
-Loss at step 20: 29.872814
-Loss at step 40: 14.233772
-Loss at step 60: 7.090570
-Loss at step 80: 3.819887
-Loss at step 100: 2.318821
-Loss at step 120: 1.628385
-Loss at step 140: 1.310142
-Loss at step 160: 1.163167
-Loss at step 180: 1.095162
-Final loss: 1.064711
-W, B = 3.094944, 2.161383
-```
-
-To utilize the GPU, place the code above within a `with tf.device("/gpu:0"):`
-block. (However, this particular model, with only two floating point parameters,
-is unlikely to benefit from GPU acceleration.)
-
-### Customizing gradients
-
-One may want to define custom gradients for an operation, or for a function.
-This may be useful for multiple reasons, including providing a more efficient
-or more [numerically stable](https://en.wikipedia.org/wiki/Numerical_stability)
-gradient for a sequence of operations.
-
-For example, consider the function `log(1 + e^x)`, which commonly occurs in the
-computation of cross entropy and log likelihoods.
-
-```python
-def log1pexp(x):
-  return tf.log(1 + tf.exp(x))
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works fine at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# Returns a `nan` at x = 100 due to numerical instability.
-import math
-assert math.isnan(float(grad_log1pexp(100.)[0]))
-```
-
-We can define a custom gradient for the above function that analytically
-simplifies the gradient expression.
-
-```python
-@tfe.custom_gradient
-def log1pexp(x):
-  e = tf.exp(x)
-  def grad(dy):
-    return dy * (1 - 1 / (1 + e))
-  return tf.log(1 + e), grad
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works as before at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# But now works at x = 100 as well.
-assert 1.0 == float(grad_log1pexp(100.)[0])
-```
-Also notice how the gradient function implementation reuses an expression
-(`tf.exp(x)`) computed during the forward pass, hence making the gradient
-computation more efficient by avoiding redundant computation.
-
-## Building and training models
-
-In practice, your computation may have many parameters to be optimized (by
-computing derivatives). Encapsulating them into re-usable classes/objects
-makes the code easier to follow than writing a single top-level function with
-many arguments.
-
-In fact, eager execution encourages use of the [Keras](https://keras.io)-style
-"Layer" classes in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module.
-
-Furthermore, you may want to apply more sophisticated techniques to compute
-parameter updates, such as those in
-[`tf.train.Optimizer`](https://www.tensorflow.org/api_guides/python/train#Optimizers)
-implementations.
-
-This next section walks through using the same `Optimizer` and `Layer` APIs used
-to build trainable TensorFlow graphs in an environment where eager execution is
-enabled.
-
-### Variables and Optimizers
-
-`tfe.Variable` objects store mutable `Tensor` values that can be accessed during
-training, making automatic differentiation easier. In particular, parameters of
-a model can be encapsulated in Python classes as variables.
-
-`tfe.gradients_function(f)` introduced earlier computes the derivatives of `f`
-with respect to its arguments. However, it requires all parameters of interest
-to be arguments of `f`, which becomes cumbersome when `f` depends on a large
-number of trainable parameters.
-
-`tfe.implicit_gradients` is an alternative function with some useful properties:
-
--   It computes the derivatives of `f` with respect to all the `tfe.Variable`s
-    used by `f`.
--   When the returned function is invoked, it returns a list of
-    (gradient value, Variable object) tuples.
-
-Representing model parameters as `Variable` objects, along with the use of
-`tfe.implicit_gradients`, typically results in better encapsulation. For
-example, the linear regression model described above can be written into a
-class:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.W = tfe.Variable(5., name='weight')
-    self.B = tfe.Variable(10., name='bias')
-
-  def predict(self, inputs):
-    return inputs * self.W + self.B
-
-
-# The loss function to be optimized
-def loss(model, inputs, targets):
-  error = model.predict(inputs) - targets
-  return tf.reduce_mean(tf.square(error))
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# Define:
-# 1. A model
-# 2. Derivatives of a loss function with respect to model parameters
-# 3. A strategy for updating the variables based on the derivatives
-model = Model()
-grad = tfe.implicit_gradients(loss)
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-
-# The training loop
-print("Initial loss: %f" %
-      loss(model, training_inputs, training_outputs).numpy())
-for i in range(201):
-  optimizer.apply_gradients(grad(model, training_inputs, training_outputs))
-  if i % 20 == 0:
-    print("Loss at step %d: %f" %
-          (i, loss(model, training_inputs, training_outputs).numpy()))
-print("Final loss: %f" % loss(model, training_inputs, training_outputs).numpy())
-print("W, B = %s, %s" % (model.W.numpy(), model.B.numpy()))
-```
-
-Output:
-
-```
-Initial loss: 69.693184
-Loss at step 0: 66.987854
-Loss at step 20: 30.553387
-Loss at step 40: 14.250237
-Loss at step 60: 6.955020
-Loss at step 80: 3.690550
-Loss at step 100: 2.229739
-Loss at step 120: 1.576032
-Loss at step 140: 1.283496
-Loss at step 160: 1.152584
-Loss at step 180: 1.093999
-Final loss: 1.067780
-W, B = 3.0114281, 2.0865183
-```
-
-Using `implicit_gradients` avoids the need to provide all the trainable
-parameters of the model as arguments to the `loss` function.
-
-### Using Keras and the Layers API
-
-[Keras](https://keras.io) is a popular API for defining model structures. The
-[`tf.keras.layers`](https://www.tensorflow.org/api_docs/python/tf/keras/layers)
-module provides a set of building blocks for models and is implemented using the
-`tf.layers.Layer` subclasses in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module. We encourage the use of these same building blocks when using
-TensorFlow's eager execution feature. For example, the very same linear
-regression model can be built using `tf.layers.Dense`:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.layer = tf.layers.Dense(1)
-
-  def predict(self, inputs):
-    return self.layer(inputs)
-```
-
-The `tf.layers` API makes it more convenient to define more sophisticated
-models. For example, the following will train an MNIST model:
-
-```python
-class MNISTModel(object):
-  def __init__(self, data_format):
-    # 'channels_first' is typically faster on GPUs
-    # while 'channels_last' is typically faster on CPUs.
-    # See: https://www.tensorflow.org/performance/performance_guide#data_formats
-    if data_format == 'channels_first':
-      self._input_shape = [-1, 1, 28, 28]
-    else:
-      self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = tf.layers.Conv2D(32, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.max_pool2d = tf.layers.MaxPooling2D(
-        (2, 2), (2, 2), padding='same', data_format=data_format)
-    self.conv2 = tf.layers.Conv2D(64, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.dense1 = tf.layers.Dense(1024, activation=tf.nn.relu)
-    self.dropout = tf.layers.Dropout(0.5)
-    self.dense2 = tf.layers.Dense(10)
-
-  def predict(self, inputs):
-    x = tf.reshape(inputs, self._input_shape)
-    x = self.max_pool2d(self.conv1(x))
-    x = self.max_pool2d(self.conv2(x))
-    x = tf.layers.flatten(x)
-    x = self.dropout(self.dense1(x))
-    return self.dense2(x)
-
-def loss(model, inputs, targets):
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(
-          logits=model.predict(inputs), labels=targets))
-
-
-# Load the training and validation data
-from tensorflow.examples.tutorials.mnist import input_data
-data = input_data.read_data_sets("./mnist_data", one_hot=True)
-
-# Train
-device = "gpu:0" if tfe.num_gpus() else "cpu:0"
-model = MNISTModel('channels_first' if tfe.num_gpus() else 'channels_last')
-optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
-grad = tfe.implicit_gradients(loss)
-for i in range(20001):
-  with tf.device(device):
-    (inputs, targets) = data.train.next_batch(50)
-    optimizer.apply_gradients(grad(model, inputs, targets))
-    if i % 100 == 0:
-      print("Step %d: Loss on training set : %f" %
-            (i, loss(model, inputs, targets).numpy()))
-print("Loss on test set: %f" % loss(model, data.test.images, data.test.labels).numpy())
-```
-
-For a more complete example, see
-[`tensorflow/contrib/eager/python/examples/mnist.py`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist/mnist.py)
-
-### Checkpointing trained variables
-
-TensorFlow Variables (`tfe.Variable`) provides a way to represent shared,
-persistent state of your model. The `tfe.Saver` class (which is a thin wrapper
-over the
-[`tf.train.Saver`](https://www.tensorflow.org/api_docs/python/tf/train/Saver)
-class) provides a means to save and restore variables to and from _checkpoints_.
-
-For example:
-
-```python
-# Create variables.
-x = tfe.Variable(10., name='x')
-y = tfe.Variable(5., name='y')
-
-# Create a Saver.
-saver = tfe.Saver([x, y])
-
-# Assign new values to the variables and save.
-x.assign(2.)
-saver.save('/tmp/ckpt')
-
-# Change the variable after saving.
-x.assign(11.)
-assert 16. == (x + y).numpy()  # 11 + 5
-
-# Restore the values in the checkpoint.
-saver.restore('/tmp/ckpt')
-
-assert 7. == (x + y).numpy()  # 2 + 5
-```
-
-### `tfe.Network`
-
-You may often want to organize your models using classes, like the `MNISTModel`
-class described above. We recommend inheriting from the `tfe.Network` class as
-it provides conveniences like keeping track of all model variables and methods
-to save and restore from checkpoints.
-
-Sub-classes of `tfe.Network` may register `Layer`s (like classes in
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers),
-or [Keras
-layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers))
-using a call to `self.track_layer()` and define the computation in an
-implementation of `call()`.
-
-Note that `tf.layers.Layer` objects (like `tf.layers.Dense`) create variables
-lazily, when the first input is encountered.
-
-For example, consider the following two-layer neural network:
-
-```python
-class TwoLayerNet(tfe.Network):
-  def __init__(self):
-    super(TwoLayerNet, self).__init__()
-    self.layer1 = self.track_layer(
-      tf.layers.Dense(2, activation=tf.nn.relu, use_bias=False))
-    self.layer2 = self.track_layer(tf.layers.Dense(3, use_bias=False))
-
-  def call(self, x):
-    return self.layer2(self.layer1(x))
-
-net = TwoLayerNet()
-
-# No variables created yet
-assert 0 == len(net.variables)
-
-# They are created on first input:
-inp = tf.constant([[1.]])
-
-# Since input is a 1x1 matrix, net.l1 has 2 units and net.l2 has 3 units,
-# the output is the product of a 1x1 matrix with a 1x2 matrix with a 2x3
-# matrix.
-assert [1, 3] == net(inp).shape.as_list()  # Invoke net; get output shape.
-assert 1 == len(net.layer1.variables)
-assert 1 == len(net.layer2.variables)
-assert 2 == len(net.variables)  # weights for each layer.
-assert [1, 2] == net.variables[0].shape.as_list()  # weights of layer1.
-assert [2, 3] == net.variables[1].shape.as_list()  # weights of layer2.
-```
-
-The `tfe.Network` class is itself a sub-class of `tf.layers.Layer`. This allows
-instances of `tfe.Network` to be embedded in other networks. For example:
-
-```python
-class ThreeLayerNet(tfe.Network):
-  def __init__(self):
-    super(ThreeLayerNet, self).__init__()
-    self.a = self.track_layer(TwoLayerNet())
-    self.b = self.track_layer(tf.layers.Dense(4, use_bias=False))
-
-  def call(self, x):
-    return self.b(self.a(x))
-
-net = ThreeLayerNet()
-
-assert [1, 4] == net(inp).shape.as_list()
-assert 3 == len(net.variables)
-assert [1, 2] == net.variables[0].shape.as_list()
-assert [2, 3] == net.variables[1].shape.as_list()
-assert [3, 4] == net.variables[2].shape.as_list()
-```
-
-See more examples in
-[`tensorflow/contrib/eager/python/examples`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples).
-
-`tfe.Saver` in combination with `tfe.restore_variables_on_create` provides a
-convenient way to save and load checkpoints without changing the program once
-the checkpoint has been created. For example, we can set an objective for the
-output of our network, choose an optimizer, and a location for the checkpoint:
-
-```python
-objective = tf.constant([[2., 3., 4., 5.]])
-optimizer = tf.train.AdamOptimizer(0.01)
-checkpoint_directory = '/tmp/tfe_example'
-checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
-net = ThreeLayerNet()
-```
-
-Note that variables have not been created yet. We want them to be restored from
-a checkpoint, if one exists, so we create them inside a
-`tfe.restore_variables_on_create` context manager. Then our training loop is the
-same whether starting training or resuming from a previous checkpoint:
-
-```python
-with tfe.restore_variables_on_create(
-    tf.train.latest_checkpoint(checkpoint_directory)):
-  global_step = tf.train.get_or_create_global_step()
-  for _ in range(100):
-    loss_fn = lambda: tf.norm(net(inp) - objective)
-    optimizer.minimize(loss_fn, global_step=global_step)
-    if tf.equal(global_step % 20, 0):
-      print("Step %d, output %s" % (global_step.numpy(),
-                                    net(inp).numpy()))
-      all_variables = (
-          net.variables
-          + optimizer.variables()
-          + [global_step])
-      # Save the checkpoint.
-      tfe.Saver(all_variables).save(checkpoint_prefix, global_step=global_step)
-```
-
-The first time it runs, `Network` variables are initialized randomly. Then the
-output is trained to match the objective we've set:
-
-```
-Step 20, output [[ 0.03575622  0.29863232  0.03474367  0.24735749]]
-Step 40, output [[ 0.40646029  0.9856872   0.46851286  0.95358551]]
-Step 60, output [[ 1.74541104  2.800704    1.79055595  2.74783421]]
-Step 80, output [[ 2.14977384  3.44340849  3.96120024  5.16242075]]
-Step 100, output [[ 1.99943113  3.02364397  3.93500996  4.9610076 ]]
-```
-
-In subsequent iterations, variables are initialized with the values read from
-the latest checkpoint. Running the same code again, we continue from where we
-left off:
-
-```
-Step 120, output [[ 1.99234128  3.0271616   3.98732996  4.96401167]]
-Step 140, output [[ 2.00133467  3.01270437  4.00616646  5.00406504]]
-Step 160, output [[ 1.99647415  2.9956708   3.99064088  4.99632359]]
-Step 180, output [[ 2.00699997  3.00904822  4.00706148  5.01193142]]
-Step 200, output [[ 1.98334622  2.98249531  3.97375059  4.97123432]]
-```
-
-
-### Summaries, metrics and TensorBoard
-
-[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
-is a popular tool for understanding, debugging and optimizing the model training
-process. To benefit from the visualizations offered by TensorBoard, summary
-events need to be written during the course of execution of your program. You
-might find many Tensorflow programs that include the
-[`tf.summary`](https://www.tensorflow.org/api_guides/python/summary) operations
-during graph construction.
-
-`tf.summary` operations are *not* compatible with eager execution, but an
-equivalent alternative exists in
-[`tf.contrib.summary`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/summary)
-that is compatible with both eager execution and graph construction.
-
-During model construction simply insert summary operations like
-`tf.contrib.summary.scalar`. These operations do nothing by default, unless a
-summary writer is currently active and a writing policy is set.
-
-For example, to record summaries once every 100 global steps, use:
-
-```python
-tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_file_writer(logdir)
-
-for _ in range(iterations):
-  with writer.as_default():
-    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
-      # your model code goes here
-      tf.contrib.summary.scalar('loss', loss)
-      # ...
-```
-
-See the full mnist example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
-for a full model using `tf.contrib.summary`.
-
-Similarly to summaries, the metrics in `tf.metrics` are currently not compatible
-with eager execution. We instead provide object-oriented metrics in the
-`tfe.metrics` package, which are compatible with graph construction as well.
-
-Metrics in the `tfe.metrics`, such as `tfe.metrics.Mean` and
-`tfe.Metrics.Accuracy`, all implement an intuitive object-oriented
-interface. Here's an example of how to use the `tfe.metrics.Mean` metric:
-
-```python
-# Metrics are objects, which can be created and destroyed.
-my_mean = tfe.metrics.Mean(name='my_mean')
-# While a metric is active, you can call it as a function to accumulate into its
-# internal state.
-my_mean(0.0)
-my_mean(10.0)
-# Once you've finished updating the metric, you can get its result. In this case
-# a simple average over all the calls to it. If a summary writer is active the
-# metric will write the appropriate summaries using the metric name.
-assert 5.0 == my_mean.result().numpy()
-```
-
-For a full example of a model using metrics for evaluation, see the mnist
-example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist).
-
-### Input Pipelines
-
-The discussion above has been centered around the computation executed by your
-model. The
-[`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data)
-module provides APIs to build complex input pipelines from simple, reusable
-pieces.
-
-If you're familiar with constructing `tf.data.Dataset` objects when building
-TensorFlow graphs, the same API calls are used when eager execution is enabled.
-However, the process of iterating over elements of the dataset differs between
-eager execution and graph construction. When eager execution is enabled, the
-discussion on iterator creation using `make_one_shot_iterator()` and
-`get_next()` in the
-[Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is
-*not* applicable. Instead, a more Pythonic `Iterator` class is available.
-
-For example:
-
-```python
-# Create a source Dataset from in-memory numpy arrays.
-# For reading from files on disk, you may want to use other Dataset classes
-# like the TextLineDataset or the TFRecordDataset.
-dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])
-
-# Apply transformations, shuffling, batching etc.
-dataset = dataset.map(tf.square).shuffle(2).batch(2)
-
-# Use tfe.Iterator to iterate over the dataset.
-for x in tfe.Iterator(dataset):
-  print(x)
-```
-
-Output:
-
-```
-tf.Tensor([4 9], shape=(2,), dtype=int32)
-tf.Tensor([16 25], shape=(2,), dtype=int32)
-tf.Tensor([36  1], shape=(2,), dtype=int32)
-```
-
-## Interoperating with Graphs
-
-Eager execution improves the process of model development in Python; however,
-because it is in its earliest stages, it does not yet support some features
-available to [TensorFlow
-graphs](https://www.tensorflow.org/get_started/get_started#the_computational_graph)
-that are desirable when deploying models in production. In particular, eager
-execution does not yet support distributed training, exporting models (to other
-[programming languages](https://www.tensorflow.org/api_docs/), [TensorFlow
-serving](https://www.tensorflow.org/serving/), and mobile applications), and
-various memory and computation optimizations that are applied to TensorFlow's
-dataflow graphs.
-
-That said, the APIs used to build modes are exactly the same whether executing
-eagerly or constructing graphs. This means that you can iteratively develop your
-model with eager execution enabled and later, if needed, use the same code to
-reap the benefits of representing models as computational graphs.
-
-For example,
-[`mnist.py`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist/mnist.py)
-defines a model that is eagerly executed. That same code is used to construct
-and execute a graph in
-[`mnist_graph_test.py`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist/mnist_graph_test.py).
-
-Other models in the [examples
-directory](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/)
-demonstrate this as well.
-
-Some differences worth noting:
-
--   There is no notion of a `tf.placeholder` or a `tf.Session` when eager
-    execution is enabled.
--   Many properties on the `tf.Tensor` object, like `tf.Tensor.name`,
-    `tf.Tensor.op`, `tf.Tensor.inputs` are not meaningful when eager execution
-    is enabled and their use will raise an `AttributeError`.
--   To use `tfe.implicit_gradients` in graph construction, variables must be
-    created with [`use_resource=True`] provided to
-    [`tf.get_variable()`](https://www.tensorflow.org/api_docs/python/tf/get_variable)
-    or
-    [`tf.variable_scope()`](https://www.tensorflow.org/api_docs/python/tf/variable_scope).
--   Some API calls (such as the functional-style `tf.layers.dense`,
-    `tf.layers.conv2d`) are not compatible with eager execution. Use of such
-    methods should raise an error indicating the alternative (e.g., the
-    `tf.layers.Dense` and `tf.layers.Conv2D` classes).
-
-## What next?
+immediately: concrete values are returned, instead of creating a computational
+graph that is executed later.
 
-Please give eager execution a spin. This feature is in early stages and is
-evolving, so we welcome your feedback via issues on GitHub (see [known
-issues](https://github.com/tensorflow/tensorflow/labels/comp:eager)).
+A user guide is available: https://www.tensorflow.org/programmers_guide/eager
+([source file](../../../../docs_src/programmers_guide/eager.md))
 
-You may want to browse through some sample code, including benchmarks for some:
+We welcome feedback through [GitHub issues](https://github.com/tensorflow/tensorflow/labels/comp:eager).
 
--   [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
--   [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
--   [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
--   [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
--   [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
+Sample code is available, including benchmarks for some:
 
+- [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
+- [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
+- [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
+- [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
+- [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index ea8dbf2b46ea4bd0e33645ae3c590c4dd13f7a52..907f9204c2d31a652ca2a0539a23db4722b4e154 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -29,13 +28,14 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
-
+from tensorflow.python.training import checkpointable
 
 _to_replace = re.compile("[^A-Za-z0-9.]")
 
 
-class Metric(object):
+class Metric(checkpointable.CheckpointableBase):
   """A metric holds state for aggregating statistics over an evaluation run.
 
   Example use with eager execution:
@@ -93,11 +93,12 @@ class Metric(object):
   `aggregate()`, it is for use by TensorFlow infrastructure.
   """
 
-  def __init__(self, name=None):
+  def __init__(self, name=None, use_global_variables=False):
     self._built = False
     self._vars = []
     self._initial_values = {}
     self._updates = []
+    self._use_global_variables = use_global_variables
     name = name or self.__class__.__name__
     # Replace things like spaces in name to create a valid scope name.
     scope_name = _to_replace.sub("_", name)
@@ -108,13 +109,25 @@ class Metric(object):
       pos = scope.name.rfind(scope_name)
       self._name = name + scope.name[pos + len(scope_name):]
       self._scope = scope
-    if context.in_graph_mode():
+
+    # Ensures that if the user calls build directly we still set self._built to
+    # True to prevent variables from being recreated.
+    self._build = self.build
+
+    def actual_build(*args, **kwargs):
+      self._build(*args, **kwargs)
+      self._built = True
+    self.build = actual_build
+    self.build.__doc__ = self._build.__doc__
+
+    # Captures construction scope for proper initialization.
+    if context.executing_eagerly():
+      self._construction_scope = context.eager_mode
+    else:
       # We make self.call() into a graph callable here, so that we can
       # return a single op that performs all of the variable updates.
       self._construction_scope = ops.get_default_graph().as_default
       self.call = function.defun(self.call)
-    else:
-      self._construction_scope = context.eager_mode
 
   # ---- API for users ----
   def __call__(self, *args, **kwargs):
@@ -155,10 +168,11 @@ class Metric(object):
       initialization. Under eager execution, the variables are reset to their
       initial values as a side effect and this function returns None.
     """
-    if context.in_graph_mode():
+    if context.executing_eagerly():
+      for v in self._vars:
+        v.assign(self._initial_values[v])
+    else:
       return control_flow_ops.group([v.initializer for v in self._vars])
-    for v in self._vars:
-      v.assign(self._initial_values[v])
 
   # ---- To be implemented by descendants ---
   def build(self, *args, **kwargs):
@@ -200,10 +214,10 @@ class Metric(object):
 
   def value(self):
     """In graph mode returns the result Tensor while in eager the callable."""
-    if context.in_graph_mode():
-      return self.result()
-    else:
+    if context.executing_eagerly():
       return self.result
+    else:
+      return self.result()
 
   # We can support two different strategies of for doing data-parallel
   # distributed metric computations:
@@ -245,19 +259,31 @@ class Metric(object):
     """***Only for use by descendants of Metric***."""
     if self._built:
       raise RuntimeError("Can't call add_variable() except in build().")
-    collections = None if context.in_eager_mode() else [
-        ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
-    ]
-    v = variable_scope.get_variable(
-        name,
-        shape,
-        dtype,
-        initializer,
+    if context.executing_eagerly():
+      collections = None
+    else:
+      if self._use_global_variables:
+        collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+      else:
+        collections = [ops.GraphKeys.LOCAL_VARIABLES]
+      collections += [ops.GraphKeys.METRIC_VARIABLES]
+    # Variables are Checkpointable dependencies of Metrics regardless of the
+    # global/local distinction. Users can avoid saving variables by not adding a
+    # dependency on the Metric.
+    v = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        initializer=initializer,
         trainable=False,
         collections=collections,
-        use_resource=True)
+        use_resource=True,
+        getter=variable_scope.get_variable,
+        # Raise duplicate variable exceptions from get_variable rather than
+        # Checkpointable.
+        overwrite=True)
     self._vars.append(v)
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       self._initial_values[v] = v.value()
     return v
 
@@ -267,8 +293,10 @@ class Mean(Metric):
   # TODO(josh11b): Maybe have a dtype argument that defaults to tf.float64?
   # Or defaults to type of the input if it is tf.float32, else tf.float64?
 
-  def __init__(self, name=None, dtype=dtypes.float64):
-    super(Mean, self).__init__(name=name)
+  def __init__(self, name=None, dtype=dtypes.float64,
+               use_global_variables=False):
+    super(Mean, self).__init__(name=name,
+                               use_global_variables=use_global_variables)
     self.dtype = dtype
 
   def build(self, *args, **kwargs):
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index a9ecaa3f8bced3043ea0eb0ac3aa8bfa65e9e1ff..f0fe4ce8c53bb80c03a3f0de37078bcdb975a0b4 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
 
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
@@ -29,6 +29,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import training_util
 
 
@@ -50,6 +52,19 @@ class MetricsTest(test.TestCase):
       self.assertEqual(
           set(m.variables),
           set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES)))
+      self.assertEqual(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES), [])
+      self.assertEqual(
+          set(m.variables),
+          set(ops.get_collection(ops.GraphKeys.METRIC_VARIABLES)))
+
+  def testUseGlobalVariablesCollections(self):
+    with context.graph_mode(), ops.Graph().as_default():
+      m = metrics.Mean(use_global_variables=True)
+      m(1000)
+      self.assertEqual(
+          set(m.variables),
+          set(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertEqual(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES), [])
       self.assertEqual(
           set(m.variables),
           set(ops.get_collection(ops.GraphKeys.METRIC_VARIABLES)))
@@ -180,6 +195,15 @@ class MetricsTest(test.TestCase):
         m2 = metrics.Mean()
         m2(2)
 
+  def testBuildMean(self):
+    # Verify that calling build() on Mean and then calling it won't recreate
+    # variables.
+    m = metrics.Mean()
+    m.build()
+    old_numer = m.numer
+    m(0.0)
+    self.assertTrue(old_numer is m.numer)
+
   def testMetricsChain(self):
     with context.graph_mode(), self.test_session():
       m1 = metrics.Mean()
@@ -193,6 +217,31 @@ class MetricsTest(test.TestCase):
       self.assertAllEqual(m2.result().eval(), 2.0)
       self.assertAllEqual(m1.result().eval(), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    mean = metrics.Mean()
+    checkpoint = checkpointable_utils.Checkpoint(mean=mean)
+    mean.build()
+    mean._built = True
+    self.evaluate(mean.init_variables())
+    self.evaluate(mean(100.))
+    self.evaluate(mean(200.))
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(mean(1000.))
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.evaluate(mean(300.))
+    self.assertAllEqual(200., self.evaluate(mean.value()))
+
+    restore_mean = metrics.Mean()
+    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    status = restore_checkpoint.restore(save_path)
+    restore_update = restore_mean(300.)
+    status.assert_consumed().run_restore_ops()
+    self.evaluate(restore_update)
+    self.assertAllEqual(200., self.evaluate(restore_mean.value()))
+    self.assertEqual(3, self.evaluate(restore_mean.denom))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index e3c13cbd2e8ccd2ab79da74e0e97905c6ed5c02d..44828bea50c660815e457f21a1990cd706c40876 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -25,11 +25,14 @@ import weakref
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl.keras.engine import base_layer as keras_base_layer
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
 
 # pylint: disable=protected-access
 # Explanation for protected-access disable: Network has lots of same-class and
@@ -51,9 +54,40 @@ def _network_name_scope_naming(current_variable_scope):
   return current_variable_scope.name + "/"
 
 
+_NETWORK_DEPRECATION_MESSAGE = (
+    "Please inherit from `tf.keras.Model`, and see its documentation for "
+    "details. `tf.keras.Model` should be a drop-in replacement for "
+    "`tfe.Network` in most cases, but note that `track_layer` is no longer "
+    "necessary or supported. Instead, `Layer` instances are tracked on "
+    "attribute assignment (see the section of `tf.keras.Model`'s documentation "
+    "on subclassing). Since the output of `track_layer` is often assigned to "
+    "an attribute anyway, most code can be ported by simply removing the "
+    "`track_layer` calls.\n\n`tf.keras.Model` works with all TensorFlow "
+    "`Layer` instances, including those from `tf.layers`, but switching to "
+    "the `tf.keras.layers` versions along with the migration to "
+    "`tf.keras.Model` is recommended, since it will preserve variable names. "
+    "Feel free to import it with an alias to avoid excess typing :)."
+)
+
+
 class Network(base.Layer):
   """Represents the composition of a set of Layers.
 
+  *Deprecated*. Please inherit from `tf.keras.Model`, and see its documentation
+  for details. `tf.keras.Model` should be a drop-in replacement for
+  `tfe.Network` in most cases, but note that `track_layer` is no longer
+  necessary or supported. Instead, `Layer` instances are tracked on attribute
+  assignment (see the section of `tf.keras.Model`'s documentation on
+  subclassing). Since the output of `track_layer` is often assigned to an
+  attribute anyway, most code can be ported by simply removing the `track_layer`
+  calls.
+
+  `tf.keras.Model` works with all TensorFlow `Layer` instances, including those
+  from `tf.layers`, but switching to the `tf.keras.layers` versions along with
+  the migration to `tf.keras.Model` is recommended, since it will preserve
+  variable names.  Feel free to import it with an alias to avoid excess typing
+  :).
+
   `Network` implements the `Layer` interface and adds convenience methods for
   managing sub-`Layer`s, such as listing variables.
 
@@ -111,6 +145,7 @@ class Network(base.Layer):
   # - Detect layers used in __call__ that weren't registered with track_layer.
   # - Convert inputs to __call__ to tensors.
 
+  @deprecation.deprecated(date=None, instructions=_NETWORK_DEPRECATION_MESSAGE)
   def __init__(self, name=None):
     """Configure the `Network`.
 
@@ -129,6 +164,10 @@ class Network(base.Layer):
       ValueError: If `name` is not valid. Note that some naming errors will
         instead be raised when the `Network` is called.
     """
+    if context.executing_eagerly():
+      logging.warning(
+          ("** tfe.Network is deprecated and will be removed in a future "
+           "version.\n\n%s") % _NETWORK_DEPRECATION_MESSAGE)
     if isinstance(name, variable_scope.VariableScope):
       raise ValueError("VariableScopes are not valid Network names.")
     if name is not None and "/" in name:
@@ -149,7 +188,12 @@ class Network(base.Layer):
     # check we might have name collisions if the parent scope on init gets
     # closed before build is called.
     self._variable_scope_counts_on_init = (
-        variable_scope._get_default_variable_store().variable_scopes_count)
+        variable_scope.get_variable_scope_store().variable_scopes_count)
+
+  def _gather_saveables_for_checkpoint(self):
+    raise NotImplementedError(
+        "tfe.Network does not support object-based checkpointing.\n\n%s"
+        % _NETWORK_DEPRECATION_MESSAGE)
 
   def _name_scope_name(self, current_variable_scope):
     """Overrides Layer op naming to match variable naming."""
@@ -176,7 +220,7 @@ class Network(base.Layer):
         avoid_names = parent_network._owned_layers
         name_uid_map = parent_network._sub_layer_name_uids
       else:
-        name_uid_map = base._get_default_graph_uid_map()
+        name_uid_map = keras_base_layer.get_default_graph_uid_map()
         # Figure out which names we have to avoid based on which variable scope
         # we're nested in.
         strip_name = self._default_parent_variable_scope.name
@@ -326,6 +370,8 @@ class Network(base.Layer):
       raise TypeError(
           "Network.track_layer() passed type %s, not a tf.layers.Layer" %
           (type(layer),))
+    # Always use `ResourceVariable` with legacy layers.
+    layer._use_resource_variables = True
     if isinstance(layer, Network):
       layer._finalize_name(parent_network=self)
     else:
@@ -639,7 +685,7 @@ def _make_custom_getter_for_deferred_restorations():
       # Mark as already restored from this checkpoint.
       delayed_restoration.checkpointed_variables_to_restore[
           checkpoint_name] = None
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         delayed_restoration.session.run(variable.initializer)
     if found_value:
       # Error checking should run even if we've already restored a value.
@@ -703,6 +749,9 @@ def _make_prefix_stripping_map_fn(scope_name):
   return _strip_variable_prefix
 
 
+@deprecation.deprecated(date=None, instructions=(
+    "Please inherit from tf.keras.Model instead of tfe.Network, and use "
+    "tf.keras.Model.save_weights."))
 def save_network_checkpoint(
     network, save_path, global_step=None, map_func=None):
   """Save variables from the Network to a checkpoint.
@@ -772,7 +821,7 @@ def save_network_checkpoint(
                  variable_map[mapped_name]._shared_name,
                  variable._shared_name,
                  network.scope_name))
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     sess = None
   else:
     sess = ops.get_default_session()
@@ -853,7 +902,7 @@ def _restore_existing_variables(network, save_path, map_func, user_map_func):
             network_name=network.name,
             network_scope_name=network.scope_name))
   if existing_variables_by_checkpoint_name:
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       sess = None
     else:
       sess = ops.get_default_session()
@@ -880,7 +929,7 @@ def _set_restore_on_create(network, save_path, map_func, user_map_func,
   # _DeferredRestoration objects once a Network has been built (so that
   # restoring in a loop does not take increasing amounts of memory).
   if checkpointed_variables_to_restore:
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       sess = None
     else:
       sess = ops.get_default_session()
@@ -902,6 +951,9 @@ def _set_restore_on_create(network, save_path, map_func, user_map_func,
     _add_deferred_restoration(network, deferred_restoration)
 
 
+@deprecation.deprecated(date=None, instructions=(
+    "Please inherit from tf.keras.Model instead of tfe.Network, and use "
+    "tf.keras.Model.load_weights."))
 def restore_network_checkpoint(network, save_path, map_func=None):
   """Restore the Network from a checkpoint.
 
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 3329fc6c513265deff41a368f5688dd605209c14..6a51d03de52914d2ad0ac3ad05d1ba01d856ad9a 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -20,18 +20,17 @@ import gc
 
 from tensorflow.contrib.eager.python import network
 from tensorflow.contrib.layers.python.layers import regularizers
-from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import training_util
 
 
@@ -64,6 +63,12 @@ class RegularizedNetwork(network.Network):
 
 class NetworkTest(test.TestCase):
 
+  def test_checkpointing_not_implemented(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint = checkpointable_utils.Checkpoint(net=MyNetwork())
+    with self.assertRaises(NotImplementedError):
+      checkpoint.save(checkpoint_directory)
+
   def _save_modify_load_network_built(self, net, global_step=None):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_path = network.save_network_checkpoint(
@@ -469,36 +474,6 @@ class NetworkTest(test.TestCase):
     self.assertIsInstance(net.trainable_weights[0],
                           resource_variable_ops.ResourceVariable)
 
-  def testGraphOpNames(self):
-    """Network operation names should match variable naming."""
-
-    def _check_op_prefixes(expected_prefix, checked_ops):
-      for operation in ops.get_default_graph().get_operations():
-        if operation.name == "ignore":
-          continue
-        if operation.name in checked_ops:
-          continue
-        checked_ops.add(operation.name)
-        self.assertStartsWith(expected_start=expected_prefix,
-                              actual=operation.name)
-        self.assertNotIn("my_network", operation.name[len(expected_prefix):])
-        self.assertNotIn("dense", operation.name[len(expected_prefix):])
-
-    with context.graph_mode():
-      net = MyNetwork()
-      zero = constant_op.constant([[0.]], name="ignore")
-      net(zero)
-      checked_ops = set()
-      _check_op_prefixes(expected_prefix="my_network/dense/",
-                         checked_ops=checked_ops)
-      net.net2 = net.track_layer(MyNetwork())
-      net.net2(zero)
-      _check_op_prefixes(expected_prefix="my_network/my_network/dense/",
-                         checked_ops=checked_ops)
-      MyNetwork()(zero)
-      _check_op_prefixes(expected_prefix="my_network_1/dense/",
-                         checked_ops=checked_ops)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableRegularizers(self):
     net = RegularizedNetwork()
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index 62421849c766a1124c726812428985c913c653a3..fdaca90fd13576e6ca8a3408aaf528dbc2384b0c 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -73,7 +73,7 @@ def restore_variables_on_create(save_path, map_func=None):
     NotFoundError: If the variable is not found in checkpoint.
     ValueError: If not used in eager mode or map_func is not callable.
   """
-  if context.in_graph_mode():
+  if not context.executing_eagerly():
     raise ValueError(
         "Currently, restore_variables_on_create can only be used with "
         "eager execution enabled.")
@@ -131,7 +131,7 @@ class Saver(object):
     Raises:
       RuntimeError: if invoked when eager execution has not been enabled.
     """
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       raise RuntimeError("tfe.Saver can only be used when eager "
                          "execution is enabled. Use tf.train.Saver when "
                          "building graphs.")
diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index 1a7f7b85e688e80e3cf482f2754462888187d311..4032e755f6e7dea9dcb42587f14e8386e5db2338 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -102,7 +102,6 @@ class SaverTest(test.TestCase):
       # Can still restore it.
       saver.restore(ckpt_prefix)
       self.assertEqual(v1.read_value().numpy(), 1.0)
-      self.assertEqual(v1.read_value().numpy(), 1.0)
       # However, cannot restore it with default name.
       with self.assertRaisesOpError('not found in checkpoint'):
         saver = _saver.Saver([v1, v2]).restore(ckpt_prefix)
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index d32bebf90c1e768d1efec26b3b78bf1a522a8f00..79dd117854e5fe9f066f671d8ce62e08579e0ed9 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -56,14 +56,24 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@save_network_checkpoint
 @@restore_network_checkpoint
 
+@@Checkpoint
+@@Checkpointable
+@@CheckpointableSaver
+
+@@executing_eagerly
 @@in_eager_mode
-@@in_graph_mode
+@@set_execution_mode
+@@execution_mode
+@@async_wait
+@@async_clear_error
 
 @@run_test_in_graph_and_eager_modes
 
 @@DEVICE_PLACEMENT_EXPLICIT
 @@DEVICE_PLACEMENT_WARN
 @@DEVICE_PLACEMENT_SILENT
+@@SYNC
+@@ASYNC
 """
 
 from __future__ import absolute_import
@@ -87,11 +97,15 @@ from tensorflow.python.eager import function
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_EXPLICIT
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_WARN
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_SILENT
-from tensorflow.python.eager.context import in_eager_mode
-from tensorflow.python.eager.context import in_graph_mode
+from tensorflow.python.eager.context import executing_eagerly
 from tensorflow.python.eager.context import list_devices
+from tensorflow.python.eager.context import set_execution_mode
+from tensorflow.python.eager.context import execution_mode
+from tensorflow.python.eager.context import async_wait
+from tensorflow.python.eager.context import async_clear_error
+from tensorflow.python.eager.context import SYNC
+from tensorflow.python.eager.context import ASYNC
 from tensorflow.python.eager.context import num_gpus
-from tensorflow.python.eager.custom_gradient import custom_gradient
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
 from tensorflow.python.eager.execution_callbacks import inf_callback
@@ -101,10 +115,14 @@ from tensorflow.python.eager.execution_callbacks import seterr
 from tensorflow.python.framework.ops import enable_eager_execution
 from tensorflow.python.framework.ops import eager_run as run
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
+from tensorflow.python.ops.custom_gradient import custom_gradient
 from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
+from tensorflow.python.training.checkpointable import Checkpointable
+from tensorflow.python.training.checkpointable_utils import CheckpointableSaver
+from tensorflow.python.training.checkpointable_utils import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
@@ -115,5 +133,6 @@ implicit_value_and_gradients = backprop.implicit_val_and_grad
 gradients_function = backprop.gradients_function
 value_and_gradients_function = backprop.val_and_grad_function
 GradientTape = backprop.GradientTape  # pylint: disable=invalid-name
+in_eager_mode = executing_eagerly
 
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index b6659c2a1797feab261d756e78b45231dbea5a02..e80ccbb74d8623e977a98cb7fa5eb41f3c9bf250 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -47,7 +47,8 @@ class TFETest(test_util.TensorFlowTestCase):
 
   def testVariableError(self):
     with self.assertRaisesRegexp(
-        RuntimeError, r'Variable not supported in Eager mode'):
+        RuntimeError,
+        r'Variable not supported when eager execution is enabled'):
       variables.Variable(initial_value=1.0)
 
   def testGradients(self):
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 6cdbed5b896577f5622b1bd0123c289c798bc0a5..b473de86ee8be92e8111ee5098b2536d4b957a8c 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -9,23 +9,12 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "estimator_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
         ":extenders",
@@ -34,10 +23,41 @@ py_library(
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
+        ":rnn",
         "//tensorflow/python:util",
     ],
 )
 
+py_library(
+    name = "boosted_trees",
+    srcs = ["python/estimator/boosted_trees.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:boosted_trees",
+    ],
+)
+
+py_test(
+    name = "boosted_trees_test",
+    size = "medium",
+    srcs = ["python/estimator/boosted_trees_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":boosted_trees",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["python/estimator/dnn.py"],
@@ -70,6 +90,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -110,6 +131,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -138,9 +160,11 @@ py_test(
     size = "medium",
     srcs = ["python/estimator/extenders_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],  # b/62863147
     deps = [
         ":extenders",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/predictor",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
@@ -169,9 +193,11 @@ py_library(
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
+        "//tensorflow/python:nn",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:metric_keys",
@@ -184,13 +210,14 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/head_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":head",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -223,7 +250,7 @@ py_library(
 
 py_test(
     name = "linear_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/linear_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -242,6 +269,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -288,6 +316,8 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
@@ -337,6 +367,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
@@ -351,6 +382,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["python/estimator/replicate_model_fn_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:dnn",
         "//tensorflow/python/estimator:export_export",
@@ -382,3 +414,58 @@ cuda_py_test(
         "notap",
     ],
 )
+
+py_library(
+    name = "rnn",
+    srcs = ["python/estimator/rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":extenders",
+        "//tensorflow/contrib/feature_column:feature_column_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:head",
+        "//tensorflow/python/estimator:optimizers",
+        "//tensorflow/python/feature_column",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "rnn_test",
+    size = "medium",
+    srcs = ["python/estimator/rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "noasan",  # times out
+        "notsan",
+    ],
+    deps = [
+        ":rnn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 0f75b77050b0ba4c752a6a74fdc7024170b6f318..be20d1b7770d3f3df21ac9c0f811d924bf4152ee 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.estimator.python.estimator.boosted_trees import *
 from tensorflow.contrib.estimator.python.estimator.dnn import *
 from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
 from tensorflow.contrib.estimator.python.estimator.extenders import *
@@ -27,6 +28,7 @@ from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
 from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
+from tensorflow.contrib.estimator.python.estimator.rnn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -39,15 +41,19 @@ _allowed_symbols = [
     'multi_class_head',
     'multi_head',
     'multi_label_head',
+    'poisson_regression_head',
     'regression_head',
     'DNNEstimator',
     'DNNLinearCombinedEstimator',
     'LinearEstimator',
+    'boosted_trees_classifier_train_in_memory',
+    'boosted_trees_regressor_train_in_memory',
     'call_logit_fn',
     'dnn_logit_fn_builder',
     'linear_logit_fn_builder',
     'replicate_model_fn',
     'TowerOptimizer',
+    'RNNClassifier',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd641014e9eec6623d66574bccd08ff03ebc28ac
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -0,0 +1,357 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Boosted Trees estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
+
+
+def _validate_input_fn_and_repeat_dataset(train_input_fn):
+  """Validates whether the input_fn is valid, and repeat() if tf.Dataset."""
+  def _input_fn():
+    result_input_fn = train_input_fn()
+    if isinstance(result_input_fn, dataset_ops.Dataset):
+      return result_input_fn.repeat()
+    return result_input_fn
+
+  return _input_fn
+
+
+class _BoostedTreesEstimator(estimator.Estimator):
+  """An Estimator for Tensorflow Boosted Trees models."""
+
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               head,
+               model_dir=None,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
+    """Initializes a `BoostedTreesEstimator` instance.
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      head: the `Head` instance defined for Estimator.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
+      config: `RunConfig` object to configure the runtime settings.
+    """
+    # pylint:disable=protected-access
+    # HParams for the model.
+    tree_hparams = canned_boosted_trees._TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity, min_node_weight)
+
+    def _model_fn(features, labels, mode, config):
+      return canned_boosted_trees._bt_model_fn(
+          features, labels, mode, head, feature_columns, tree_hparams,
+          n_batches_per_layer, config)
+
+    super(_BoostedTreesEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+    # pylint:enable=protected-access
+
+
+def boosted_trees_classifier_train_in_memory(
+    train_input_fn,
+    feature_columns,
+    model_dir=None,
+    n_classes=canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT,
+    weight_column=None,
+    label_vocabulary=None,
+    n_trees=100,
+    max_depth=6,
+    learning_rate=0.1,
+    l1_regularization=0.,
+    l2_regularization=0.,
+    tree_complexity=0.,
+    min_node_weight=0.,
+    config=None,
+    train_hooks=None):
+  """Trains a boosted tree classifier with in memory dataset.
+
+  Example:
+
+  ```python
+  bucketized_feature_1 = bucketized_column(
+    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+  bucketized_feature_2 = bucketized_column(
+    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+  def train_input_fn():
+    dataset = create-dataset-from-training-data
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
+    return dataset
+
+  classifier = boosted_trees_classifier_train_in_memory(
+      train_input_fn,
+      feature_columns=[bucketized_feature_1, bucketized_feature_2],
+      n_trees=100,
+      ... <some other params>
+  )
+
+  def input_fn_eval():
+    ...
+    return dataset
+
+  metrics = classifier.evaluate(input_fn=input_fn_eval, steps=10)
+  ```
+
+  Args:
+    train_input_fn: the input function returns a dataset containing a single
+      epoch of *unbatched* features and labels.
+    feature_columns: An iterable containing all the feature columns used by
+      the model. All items in the set should be instances of classes derived
+      from `FeatureColumn`.
+    model_dir: Directory to save model parameters, graph and etc. This can
+      also be used to load checkpoints from the directory into a estimator
+      to continue training a previously saved model.
+    n_classes: number of label classes. Default is binary classification.
+      Multiclass support is not yet implemented.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to downweight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    label_vocabulary: A list of strings represents possible label values. If
+      given, labels must be string type and have any value in
+      `label_vocabulary`. If it is not given, that means labels are
+      already encoded as integer or float within [0, 1] for `n_classes=2` and
+      encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+      Also there will be errors if vocabulary is not provided and labels are
+      string.
+    n_trees: number trees to be created.
+    max_depth: maximum depth of the tree to grow.
+    learning_rate: shrinkage parameter to be used when a tree added to the
+      model.
+    l1_regularization: regularization multiplier applied to the absolute
+      weights of the tree leafs.
+    l2_regularization: regularization multiplier applied to the square weights
+      of the tree leafs.
+    tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
+    config: `RunConfig` object to configure the runtime settings.
+    train_hooks: a list of Hook instances to be passed to estimator.train().
+
+  Returns:
+    a `BoostedTreesClassifier` instance created with the given arguments and
+      trained with the data loaded up on memory from the input_fn.
+
+  Raises:
+    ValueError: when wrong arguments are given or unsupported functionalities
+       are requested.
+  """
+  # pylint: disable=protected-access
+  # TODO(nponomareva): Support multi-class cases.
+  if n_classes == canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT:
+    n_classes = 2
+  head, closed_form = (
+      canned_boosted_trees._create_classification_head_and_closed_form(
+          n_classes, weight_column, label_vocabulary=label_vocabulary))
+
+  # HParams for the model.
+  tree_hparams = canned_boosted_trees._TreeHParams(
+      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+      tree_complexity, min_node_weight)
+
+  def _model_fn(features, labels, mode, config):
+    return canned_boosted_trees._bt_model_fn(
+        features,
+        labels,
+        mode,
+        head,
+        feature_columns,
+        tree_hparams,
+        n_batches_per_layer=1,
+        config=config,
+        closed_form_grad_and_hess_fn=closed_form,
+        train_in_memory=True)
+
+  in_memory_classifier = estimator.Estimator(
+      model_fn=_model_fn, model_dir=model_dir, config=config)
+
+  in_memory_classifier.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
+
+  return in_memory_classifier
+  # pylint: enable=protected-access
+
+
+def boosted_trees_regressor_train_in_memory(
+    train_input_fn,
+    feature_columns,
+    model_dir=None,
+    label_dimension=canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT,
+    weight_column=None,
+    n_trees=100,
+    max_depth=6,
+    learning_rate=0.1,
+    l1_regularization=0.,
+    l2_regularization=0.,
+    tree_complexity=0.,
+    min_node_weight=0.,
+    config=None,
+    train_hooks=None):
+  """Trains a boosted tree regressor with in memory dataset.
+
+  Example:
+
+  ```python
+  bucketized_feature_1 = bucketized_column(
+    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+  bucketized_feature_2 = bucketized_column(
+    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+  def train_input_fn():
+    dataset = create-dataset-from-training-data
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
+    return dataset
+
+  regressor = boosted_trees_regressor_train_in_memory(
+      train_input_fn,
+      feature_columns=[bucketized_feature_1, bucketized_feature_2],
+      n_trees=100,
+      ... <some other params>
+  )
+
+  def input_fn_eval():
+    ...
+    return dataset
+
+  metrics = regressor.evaluate(input_fn=input_fn_eval, steps=10)
+  ```
+
+  Args:
+    train_input_fn: the input function returns a dataset containing a single
+      epoch of *unbatched* features and labels.
+    feature_columns: An iterable containing all the feature columns used by
+      the model. All items in the set should be instances of classes derived
+      from `FeatureColumn`.
+    model_dir: Directory to save model parameters, graph and etc. This can
+      also be used to load checkpoints from the directory into a estimator
+      to continue training a previously saved model.
+    label_dimension: Number of regression targets per example.
+      Multi-dimensional support is not yet implemented.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to downweight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    n_trees: number trees to be created.
+    max_depth: maximum depth of the tree to grow.
+    learning_rate: shrinkage parameter to be used when a tree added to the
+      model.
+    l1_regularization: regularization multiplier applied to the absolute
+      weights of the tree leafs.
+    l2_regularization: regularization multiplier applied to the square weights
+      of the tree leafs.
+    tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
+    config: `RunConfig` object to configure the runtime settings.
+    train_hooks: a list of Hook instances to be passed to estimator.train().
+
+  Returns:
+    a `BoostedTreesClassifier` instance created with the given arguments and
+      trained with the data loaded up on memory from the input_fn.
+
+  Raises:
+    ValueError: when wrong arguments are given or unsupported functionalities
+       are requested.
+  """
+  # pylint: disable=protected-access
+  # TODO(nponomareva): Extend it to multi-dimension cases.
+  if label_dimension == canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT:
+    label_dimension = 1
+  head = canned_boosted_trees._create_regression_head(label_dimension,
+                                                      weight_column)
+
+  # HParams for the model.
+  tree_hparams = canned_boosted_trees._TreeHParams(
+      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+      tree_complexity, min_node_weight)
+
+  def _model_fn(features, labels, mode, config):
+    return canned_boosted_trees._bt_model_fn(
+        features,
+        labels,
+        mode,
+        head,
+        feature_columns,
+        tree_hparams,
+        n_batches_per_layer=1,
+        config=config,
+        train_in_memory=True)
+
+  in_memory_regressor = estimator.Estimator(
+      model_fn=_model_fn, model_dir=model_dir, config=config)
+
+  in_memory_regressor.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
+
+  return in_memory_regressor
+  # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..76cbefe5e94502188388df6fc2816d130ac896d5
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -0,0 +1,222 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_utils
+
+NUM_FEATURES = 3
+
+BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
+INPUT_FEATURES = np.array(
+    [
+        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
+        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
+        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
+    ],
+    dtype=np.float32)
+CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
+REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
+FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
+
+
+def _make_train_input_fn(is_classification):
+  """Makes train input_fn for classification/regression."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(features_dict),
+         dataset_ops.Dataset.from_tensors(labels)
+        ))
+    return ds
+
+  return _input_fn
+
+
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
+
+  def testTrainAndEvaluateEstimator(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        head=self._head,
+        max_depth=5)
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
+
+  def testInferEstimator(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5,
+        head=self._head)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testBinaryClassifierTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testRegressorTrainInMemoryAndEvalAndInfer(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testRegressorTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
index b5e4d34dc70ccaa4806ae8b8ed5001bd971ee7b4..dd009a6753f3231638f93e50fc8f19eae8820139 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -52,7 +53,9 @@ def _dnn_only_estimator_fn(
     config=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       dnn_feature_columns=feature_columns,
       dnn_optimizer=optimizer,
@@ -100,7 +103,9 @@ def _linear_only_estimator_fn(
     partitioner=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       linear_feature_columns=feature_columns,
       linear_optimizer=optimizer,
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
index 71f810acec856d42d389260e7b9fea32123348b4..75e3107670d658e55ce23d983e47311f1c180104 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -41,7 +42,9 @@ def _dnn_estimator_fn(weight_column=None, label_dimension=1, *args, **kwargs):
   """Returns a DNNEstimator that uses regression_head."""
   return dnn.DNNEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index c99bf8badb35e6fffb7cae8761db9d402b8b3a8f..201699ed775f701bc9f215fff11a688175d51645 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.estimator.export.export_output import PredictOutput
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.ops import clip_ops
@@ -33,7 +34,7 @@ _VALID_METRIC_FN_ARGS = set(['features', 'labels', 'predictions', 'config'])
 
 
 def add_metrics(estimator, metric_fn):
-  """Creates a new ${tf.estimator.Estimator} which has given metrics.
+  """Creates a new @{tf.estimator.Estimator} which has given metrics.
 
   Example:
 
@@ -60,7 +61,7 @@ def add_metrics(estimator, metric_fn):
   ```
 
   Args:
-    estimator: A ${tf.estimator.Estimator} object.
+    estimator: A @{tf.estimator.Estimator} object.
     metric_fn: A function which should obey the following signature:
       - Args: can only have following four arguments in any order:
         * predictions: Predictions `Tensor` or dict of `Tensor` created by given
@@ -78,7 +79,7 @@ def add_metrics(estimator, metric_fn):
          function, namely a `(metric_tensor, update_op)` tuple.
 
   Returns:
-      A new ${tf.estimator.Estimator} which has a union of original metrics with
+      A new @{tf.estimator.Estimator} which has a union of original metrics with
         given ones.
   """
   _verify_metric_fn_args(metric_fn)
@@ -96,7 +97,10 @@ def add_metrics(estimator, metric_fn):
   return estimator_lib.Estimator(
       model_fn=new_model_fn,
       model_dir=estimator.model_dir,
-      config=estimator.config)
+      config=estimator.config,
+      # pylint: disable=protected-access
+      warm_start_from=estimator._warm_start_settings)
+      # pylint: enable=protected-access
 
 
 def clip_gradients_by_norm(optimizer, clip_norm):
@@ -161,14 +165,14 @@ def forward_features(estimator, keys=None):
   ```
 
   Args:
-    estimator: A ${tf.estimator.Estimator} object.
+    estimator: A @{tf.estimator.Estimator} object.
     keys: a `string` or a `list` of `string`. If it is `None`, all of the
       `features` in `dict` is forwarded to the `predictions`. If it is a
       `string`, only given key is forwarded. If it is a `list` of strings, all
       the given `keys` are forwarded.
 
   Returns:
-      A new ${tf.estimator.Estimator} which forwards features to predictions.
+      A new @{tf.estimator.Estimator} which forwards features to predictions.
 
   Raises:
     ValueError:
@@ -233,7 +237,17 @@ def forward_features(estimator, keys=None):
             'argument of forward_features to filter unwanted features. Type of '
             'features[{}] is {}.'.format(key, key, type(feature)))
       predictions[key] = feature
-    return spec._replace(predictions=predictions)
+    spec = spec._replace(predictions=predictions)
+    if spec.export_outputs:
+      for ekey in ['predict', 'serving_default']:
+        if (ekey in spec.export_outputs and
+            isinstance(spec.export_outputs[ekey],
+                       PredictOutput)):
+          export_outputs = spec.export_outputs[ekey].outputs
+          for key in get_keys(features):
+            export_outputs[key] = predictions[key]
+
+    return spec
 
   return estimator_lib.Estimator(
       model_fn=new_model_fn,
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders_test.py b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
index ad1a8ef152b07ecbab33d9eb3184a2ae89def27d..407af2deaf0928361a4f0b0e44e842b7750118cb 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
@@ -18,20 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.predictor import from_saved_model
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import training
+from tensorflow.python.util import compat
 
 
 def get_input_fn(x, y):
@@ -177,6 +184,44 @@ class ForwardFeaturesTest(test.TestCase):
     self.assertIn('id', predictions)
     self.assertEqual(101, predictions['id'])
 
+  def test_forward_in_exported(self):
+
+    def serving_input_fn():
+      features_ph = {
+          'x': array_ops.placeholder(dtypes.float32, [None]),
+          'id': array_ops.placeholder(dtypes.int32, [None])
+      }
+      features = {
+          key: array_ops.expand_dims(tensor, -1)
+          for key, tensor in features_ph.items()
+      }
+      return estimator_lib.export.ServingInputReceiver(features, features_ph)
+    def input_fn():
+      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
+    # create estimator
+    feature_columns = [fc.numeric_column('x')]
+    estimator = linear.LinearRegressor(feature_columns)
+    estimator.train(input_fn=input_fn, steps=1)
+    estimator = extenders.forward_features(estimator, 'id')
+
+    # export saved model
+    tmpdir = tempfile.mkdtemp()
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = estimator.export_savedmodel(export_dir_base, serving_input_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+    # restore model
+    predict_fn = from_saved_model(export_dir, signature_def_key='predict')
+    predictions = predict_fn({'x': [3], 'id': [101]})
+
+    # verify that 'id' exists in predictions
+    self.assertIn('id', predictions)
+    self.assertEqual(101, predictions['id'])
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
   def test_forward_list(self):
 
     def input_fn():
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 238cf287b768eee28b20202084eb244c085c8b75..3dcf0374c8a12b5907fbaf20d1ad72211a45ab5c 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -31,10 +31,12 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
@@ -42,7 +44,7 @@ _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM,
+                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
@@ -83,7 +85,8 @@ def multi_class_head(n_classes,
       have any value in `label_vocabulary`. Note that errors will be raised if
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -108,7 +111,7 @@ def binary_classification_head(
     weight_column=None,
     thresholds=None,
     label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     loss_fn=None,
     name=None):
   """Creates a `_Head` for single label binary classification.
@@ -152,7 +155,8 @@ def binary_classification_head(
       `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -175,8 +179,9 @@ def binary_classification_head(
 
 def regression_head(weight_column=None,
                     label_dimension=1,
-                    loss_reduction=losses.Reduction.SUM,
+                    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                     loss_fn=None,
+                    inverse_link_fn=None,
                     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -195,10 +200,16 @@ def regression_head(weight_column=None,
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  Supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
   `(labels, logits, features)` as arguments and returns unreduced loss with
   shape `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `inverse_link_fn`, also known as 'mean function'.
+  `inverse_link_fn` takes `logits` as argument and returns predicted values.
+  This function is the inverse of the link function defined in
+  https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
+  Namely, for poisson regression, set `inverse_link_fn=tf.exp`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -208,8 +219,12 @@ def regression_head(weight_column=None,
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
-    loss_fn: Optional loss function.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
+    loss_fn: Optional loss function. Defaults to `mean_squared_error`.
+    inverse_link_fn: Optional inverse link function, also known as 'mean
+      function'. Defaults to identity.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -224,6 +239,69 @@ def regression_head(weight_column=None,
       label_dimension=label_dimension,
       loss_reduction=loss_reduction,
       loss_fn=loss_fn,
+      inverse_link_fn=inverse_link_fn,
+      name=name)
+
+
+def poisson_regression_head(
+    weight_column=None,
+    label_dimension=1,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
+    compute_full_loss=True,
+    name=None):
+  """Creates a `_Head` for poisson regression using `tf.nn.log_poisson_loss`.
+
+  The loss is the weighted sum over all input dimensions. Namely, if the input
+  labels have shape `[batch_size, label_dimension]`, the loss is the weighted
+  sum over both `batch_size` and `label_dimension`.
+
+  The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
+  In many applications, the shape is `[batch_size, label_dimension]`.
+
+  The `labels` shape must match `logits`, namely
+  `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
+  `[D0, D1, ... DN]` is also supported.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
+  `[D0, D1, ... DN, label_dimension]`.
+
+  This is implemented as a generalized linear model, see
+  https://en.wikipedia.org/wiki/Generalized_linear_model.
+
+  Args:
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example.
+    label_dimension: Number of regression labels per example. This is the size
+      of the last dimension of the labels `Tensor` (typically, this has shape
+      `[batch_size, label_dimension]`).
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
+    compute_full_loss: Whether to include the constant `log(z!)` term in
+      computing the poisson loss. See `tf.nn.log_poisson_loss` for the full
+      documentation.
+    name: name of the head. If provided, summary and metrics keys will be
+      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
+
+  Returns:
+    An instance of `_Head` for poisson regression.
+
+  Raises:
+    ValueError: If `label_dimension` or `loss_reduction` is invalid.
+  """
+  def _poisson_loss(labels, logits):
+    return nn.log_poisson_loss(
+        targets=labels, log_input=logits, compute_full_loss=compute_full_loss)
+  return head_lib._regression_head_with_mean_squared_error_loss(  # pylint:disable=protected-access
+      weight_column=weight_column,
+      label_dimension=label_dimension,
+      loss_reduction=loss_reduction,
+      loss_fn=_poisson_loss,
+      inverse_link_fn=math_ops.exp,
       name=name)
 
 
@@ -231,7 +309,7 @@ def multi_label_head(n_classes,
                      weight_column=None,
                      thresholds=None,
                      label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM,
+                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi-label classification.
@@ -282,7 +360,8 @@ def multi_label_head(n_classes,
       string type and have any value in `label_vocabulary`. Also there will be
       errors if vocabulary is not provided and labels are string.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -331,7 +410,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                weight_column=None,
                thresholds=None,
                label_vocabulary=None,
-               loss_reduction=losses.Reduction.SUM,
+               loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                loss_fn=None,
                name=None):
     self._n_classes = n_classes
@@ -406,7 +485,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
           reduction=losses.Reduction.NONE)
       # Averages loss over classes.
       unweighted_loss = math_ops.reduce_mean(
-          unweighted_loss, axis=-1, keep_dims=True)
+          unweighted_loss, axis=-1, keepdims=True)
     weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
@@ -418,8 +497,8 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         processed_labels=processed_labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -431,8 +510,11 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
         `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
         `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -442,7 +524,8 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     with ops.name_scope(self._name, 'head'):
       logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access
@@ -494,8 +577,16 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                 regularization_loss=regularization_loss))
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -521,7 +612,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
   def _eval_metric_ops(
       self, labels, probabilities, weights, unreduced_loss,
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 1411635228457218578c0297d4d901e9c86ca91a..98962ca4277a3e8fbbdb3fb2d26df9acc45168b5 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
@@ -271,9 +272,9 @@ class MultiLabelHead(test.TestCase):
 
     logits = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    expected_training_loss = np.sum(
+    # loss = (labels * -log(sigmoid(logits)) +
+    #         (1 - labels) * -log(1 - sigmoid(logits))) / 2
+    expected_training_loss = 0.5 * np.sum(
         _sigmoid_cross_entropy(labels=labels, logits=logits))
     actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
@@ -297,7 +298,7 @@ class MultiLabelHead(test.TestCase):
     # For large logits, this is approximated as:
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits
-    expected_training_loss = np.sum(
+    expected_training_loss = 0.5 * np.sum(
         np.array([[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32))
     actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
@@ -360,7 +361,7 @@ class MultiLabelHead(test.TestCase):
         labels=labels_input)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+      self.assertAllClose(np.sum(loss) / 2., actual_training_loss.eval())
 
   def test_eval_create_loss_loss_fn_wrong_shape(self):
     """Tests custom loss_fn that returns Tensor of unexpected shape."""
@@ -437,16 +438,17 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = np.sum(_sigmoid_cross_entropy(labels=labels, logits=logits))
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits))
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
-        keys.AUC_PR: 0.5972,
+        keys.AUC_PR: 0.7639,
     }
     self._test_eval(
         head=head,
@@ -467,18 +469,17 @@ class MultiLabelHead(test.TestCase):
     labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = (
-        np.sum(_sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    )
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
-        keys.AUC_PR: 0.5972,
+        keys.AUC_PR: 0.7639,
     }
     self._test_eval(
         head=head,
@@ -509,7 +510,7 @@ class MultiLabelHead(test.TestCase):
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
-        keys.AUC_PR: 0.5972,
+        keys.AUC_PR: 0.7639,
     }
     self._test_eval(
         head=head,
@@ -532,18 +533,17 @@ class MultiLabelHead(test.TestCase):
     labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = (
-        np.sum(_sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    )
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
-        keys.AUC_PR: 0.5972,
+        keys.AUC_PR: 0.7639,
     }
     self._test_eval(
         head=head,
@@ -561,19 +561,18 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = (
-        np.sum(_sigmoid_cross_entropy(labels=labels, logits=logits))
-    )
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits))
 
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
-        keys.AUC_PR: 0.5972,
+        keys.AUC_PR: 0.7639,
         keys.ACCURACY_AT_THRESHOLD % thresholds[0]: 2. / 4.,
         keys.PRECISION_AT_THRESHOLD % thresholds[0]: 2. / 3.,
         keys.RECALL_AT_THRESHOLD % thresholds[0]: 2. / 3.,
@@ -602,8 +601,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, weighted sum over examples.
-    expected_loss = 25.
+    # Average over classes, weighted sum over examples, divide by batch_size.
+    # loss = ( 1 * (10 + 10) / 2 + 2 * (15 + 0) / 2) / 2
+    expected_loss = 12.5
 
     spec = head.create_estimator_spec(
         features={
@@ -616,12 +616,12 @@ class MultiLabelHead(test.TestCase):
 
     keys = metric_keys.MetricKeys
     expected_metrics = {
-        # Average loss over weighted examples.
-        keys.LOSS_MEAN: expected_loss / 3,
+        # Average loss over weighted examples (denominator is sum(weights)).
+        keys.LOSS_MEAN: expected_loss * (2. / 3.),
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.2000,
-        keys.AUC_PR: 0.5833,
+        keys.AUC_PR: 0.7833,
     }
 
     # Assert spec contains expected tensors.
@@ -662,7 +662,7 @@ class MultiLabelHead(test.TestCase):
     #        (1 - labels) * (logits > 0) * logits
     expected_unreduced_loss = [[(10. + 10.) / 2.], [(15. + 0.) / 2.]]
     expected_weights = [[1.], [2.]]
-    expected_training_loss = 1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.
+    expected_training_loss = (1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.) / 2.
     training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={
             'x': np.array(((42,),), dtype=np.int32),
@@ -808,11 +808,8 @@ class MultiLabelHead(test.TestCase):
       self.assertEqual(
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # Average loss over examples.
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-      }, summary_str, tol)
+      _assert_simple_summaries(
+          self, {metric_keys.MetricKeys.LOSS: expected_loss}, summary_str, tol)
 
   def test_train(self):
     head = head_lib.multi_label_head(n_classes=2)
@@ -822,8 +819,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
@@ -839,8 +837,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
@@ -857,11 +856,49 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
+  def test_train_with_optimizer(self):
+    head = head_lib.multi_label_head(n_classes=2)
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    # For large logits, sigmoid cross entropy loss is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits =>
+    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
+    expected_train_result = 'my_train_op'
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        return string_ops.string_join(
+            [constant_op.constant(expected_train_result),
+             string_ops.as_string(loss, precision=3)])
+
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    tol = 1e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
   def test_train_with_regularization_losses(self):
     head = head_lib.multi_label_head(
         n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
@@ -915,8 +952,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, weighted sum over examples.
-    expected_loss = 25.
+    # Average over classes, weighted sum over examples, divide by batch_size.
+    # loss = ( 1 * (10 + 10) / 2 + 2 * (15 + 0) / 2 ) / 2
+    expected_loss = 12.5
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
       return string_ops.string_join(
@@ -950,11 +988,8 @@ class MultiLabelHead(test.TestCase):
       self.assertEqual(
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # Average loss over weighted examples.
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 3,
-      }, summary_str, tol)
+      _assert_simple_summaries(
+          self, {metric_keys.MetricKeys.LOSS: expected_loss,}, summary_str, tol)
 
   def test_multi_dim_weighted_train_create_loss(self):
     """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
@@ -971,8 +1006,8 @@ class MultiLabelHead(test.TestCase):
     expected_unreduced_loss = [[[20./3.], [10./3.]], [[4.], [8.]]]
     # weights are reshaped to [2, 2, 1] to match logits.
     expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
-    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_training_loss = 39.6667
+    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
+    expected_training_loss = 9.9167
     training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={'weights': weights},
         mode=model_fn.ModeKeys.TRAIN,
@@ -998,8 +1033,8 @@ class MultiLabelHead(test.TestCase):
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
     # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
     #      = [[20/3, 10/3], [4, 8]]
-    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_loss = 39.6667
+    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
+    expected_loss = 9.9167
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
       return string_ops.string_join(
@@ -1087,15 +1122,15 @@ class MultiLabelHead(test.TestCase):
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
     # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
     #      = [[20/3, 10/3], [4, 8]]
-    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_loss = 39.6667
+    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
+    expected_loss = 9.9167
     keys = metric_keys.MetricKeys
     expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / np.sum(weights),
+        keys.LOSS_MEAN: expected_loss * (4. / np.sum(weights)),
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.4977,
-        keys.AUC_PR: 0.4037,
+        keys.AUC_PR: 0.6645,
     }
     self._test_eval(
         head=head,
@@ -1106,5 +1141,75 @@ class MultiLabelHead(test.TestCase):
         expected_metrics=expected_metrics)
 
 
+class PoissonRegressionHead(test.TestCase):
+
+  def setUp(self):
+    ops.reset_default_graph()
+
+  def test_train(self):
+    head = head_lib.poisson_regression_head()
+
+    # Create estimator spec.
+    logits = np.array([[0], [-1], [1]], dtype=np.float32)
+    labels = np.array([[1], [2], [3]], dtype=np.int32)
+    # With x = exp(logits), z = labels.
+    # loss = -ln(exp(-x) * (x^z) / z!)
+    #      = x - z * ln(x) + ln(z!)
+    #      = exp(logits) - labels * logits - ln(labels!)
+    # But for ln(z!) and z > 1, the Stirling approximation is used
+    # ln(z!) = z*ln(z) - z + 0.5*ln(2*pi*z)
+    # loss = [exp(0) - 1 * 0 + ln(1!),
+    #         exp(-1) - 2 * (-1) + 2*ln(2) - 2 + 0.5*ln(2*pi*2),
+    #         exp(1) - 3 * 1 + 3*ln(3) - 3 + 0.5*ln(2*pi*3)]
+    #      = [1.0, 3.020, 1.482]
+    # training_loss = (1.0 + 3.020 + 1.482) / 3
+    expected_loss = 1.834
+    atol = 0.001
+    expected_train_result = b'my_train_op'
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_near(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          atol=atol, name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run([spec.loss, spec.train_op])
+      self.assertAlmostEqual(expected_loss, loss, delta=atol)
+      self.assertEqual(expected_train_result, train_result)
+
+  def test_predict(self):
+    head = head_lib.poisson_regression_head()
+
+    # Create estimator spec.
+    logits = np.array([[0], [-1], [1]], dtype=np.float32)
+    expected_predictions = np.exp(logits)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    # Assert spec contains expected tensors.
+    keys = prediction_keys.PredictionKeys
+    self.assertItemsEqual(
+        (keys.PREDICTIONS, keys.LOGITS), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[keys.PREDICTIONS].dtype)
+    self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
+
+    # Assert predictions.
+    with self.test_session():
+      _initialize_variables(self, spec.scaffold)
+      self.assertAllClose(
+          expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
+      self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval())
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
index c63514eb688af48577f0a3b7ce9e7478309f2c30..c41996b9c6871d294f157411662f2eb9d4c09e5c 100644
--- a/tensorflow/contrib/estimator/python/estimator/linear_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/linear_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -42,7 +43,9 @@ def _linear_estimator_fn(
   """Returns a LinearEstimator that uses regression_head."""
   return linear.LinearEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index 0346ddc24bffd61068177f4622bd03be4acd53d9..ce758992140d43529037b14cbbf958d5aa763fb4 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.export import export_output as export_output_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -30,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
@@ -226,8 +228,10 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         weights=example_weights_by_head,
         processed_labels=labels_by_head)
 
+  # TODO(b/65403806): Support regularization_losses arg.
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None):
     """See `_Head`."""
     if isinstance(logits, dict):
       logits_dict = logits
@@ -248,9 +252,10 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
               train_op_fn=_no_op_train_fn))
 
     if mode == model_fn.ModeKeys.TRAIN:
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None in TRAIN mode.')
-      spec = self._merge_train(all_estimator_spec, train_op_fn)
+      spec = self._merge_train(
+          all_estimator_spec=all_estimator_spec,
+          optimizer=optimizer,
+          train_op_fn=train_op_fn)
       with ops.name_scope(''):
         summary.scalar(metric_keys.MetricKeys.LOSS, spec.loss)
       return spec
@@ -279,16 +284,21 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         begin_idx += head.logits_dimension
     return logits_dict
 
-  def _merge_train(self, all_estimator_spec, train_op_fn):
+  def _merge_train(self, all_estimator_spec, optimizer, train_op_fn):
     """Merges list of `EstimatorSpec` for training.
 
     Args:
       all_estimator_spec: list of `EstimatorSpec` for the individual heads.
-      train_op_fn: Function to create train op. See `create_estimator_spec`
-        documentation for more details.
+      optimizer: `Optimizer` instance to create train op. See
+        `create_estimator_spec` documentation for more details.
+      train_op_fn: Function to create train op. Used if `optimizer` is `None`.
 
     Returns:
       `EstimatorSpec` that merges all heads for TRAIN.
+
+    Raises:
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode.
     """
     losses = []
     metrics = {}
@@ -297,11 +307,20 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
       # Metric keys already contain head.name.
       metrics.update(spec.eval_metric_ops or {})
     loss = _merge_losses(losses, self._head_weights)
+    if optimizer is not None:
+      if train_op_fn is not None:
+        raise ValueError('train_op_fn and optimizer cannot both be set.')
+      train_op = optimizer.minimize(
+          loss, global_step=training_util.get_global_step())
+    elif train_op_fn is not None:
+      train_op = train_op_fn(loss)
+    else:
+      raise ValueError('train_op_fn and optimizer cannot both be None.')
 
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         loss=loss,
-        train_op=train_op_fn(loss),
+        train_op=train_op,
         eval_metric_ops=metrics)
 
   def _merge_predict(self, all_estimator_spec):
@@ -319,16 +338,24 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
             all_estimator_spec[0].export_outputs,
             self._heads[0].name),
     }
+    merged_predict_outputs = {}
     for head, spec in zip(self._heads, all_estimator_spec):
       head_name = head.name
       for k, v in six.iteritems(spec.export_outputs):
         if k == _DEFAULT_SERVING_KEY:
           key = head_name
         else:
-          key = '%s/%s' % (k, head_name)
+          key = '%s/%s' % (head_name, k)
         export_outputs[key] = v
+        if (k == head_lib._PREDICT_SERVING_KEY and  # pylint:disable=protected-access
+            isinstance(v, export_output_lib.PredictOutput)):
+          for kp, vp in six.iteritems(v.outputs):
+            key = '%s/%s' % (head_name, kp)
+            merged_predict_outputs[key] = vp
       for k, v in six.iteritems(spec.predictions):
         predictions[(head_name, k)] = v
+    export_outputs[head_lib._PREDICT_SERVING_KEY] = (  # pylint:disable=protected-access
+        export_output_lib.PredictOutput(merged_predict_outputs))
 
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.PREDICT,
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index e47a6788f3b5440c4906b9f0430c802cf73237e3..3d6fccb1180c435f64552667306be004437f62ba 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -127,8 +127,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'head1', 'classification/head1', 'predict/head1',
-         'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -158,6 +158,22 @@ class MultiHeadTest(test.TestCase):
       self.assertAllClose(
           expected_probabilities['head2'],
           sess.run(spec.export_outputs['head2'].scores))
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          sess.run(
+              spec.export_outputs['predict'].outputs['head1/probabilities']))
+      self.assertAllClose(
+          expected_probabilities['head2'],
+          sess.run(
+              spec.export_outputs['predict'].outputs['head2/probabilities']))
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          sess.run(
+              spec.export_outputs['head1/predict'].outputs['probabilities']))
+      self.assertAllClose(
+          expected_probabilities['head2'],
+          sess.run(
+              spec.export_outputs['head2/predict'].outputs['probabilities']))
 
   def test_predict_two_heads_logits_tensor(self):
     """Tests predict with logits as Tensor."""
@@ -181,8 +197,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'head1', 'classification/head1', 'predict/head1',
-         'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -238,8 +254,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'head1', 'regression/head1', 'predict/head1',
-         'head2', 'regression/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/regression',
+         'head1/predict', 'head2', 'head2/regression', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -283,10 +299,11 @@ class MultiHeadTest(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
     # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
-    # Average over classes, weighted sum over batch and heads.
-    expected_loss_head1 = 17.5
-    expected_loss_head2 = 30.0
+    # loss = ( (20 + 20 + 20) / 3 + (30 + 0 + 0) / 3 ) / 2 = 15
+    expected_loss_head1 = 8.75
+    expected_loss_head2 = 15.
     expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
 
     spec = multi_head.create_estimator_spec(
@@ -300,14 +317,14 @@ class MultiHeadTest(test.TestCase):
         keys.LOSS + '/head1': expected_loss_head1,
         keys.LOSS + '/head2': expected_loss_head2,
         # Average loss over examples.
-        keys.LOSS_MEAN + '/head1': expected_loss_head1 / 2,
-        keys.LOSS_MEAN + '/head2': expected_loss_head2 / 2,
+        keys.LOSS_MEAN + '/head1': expected_loss_head1,
+        keys.LOSS_MEAN + '/head2': expected_loss_head2,
         # auc and auc_pr cannot be reliably calculated for only 4-6 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC + '/head1': 0.1667,
         keys.AUC + '/head2': 0.3333,
-        keys.AUC_PR + '/head1': 0.49999964,
-        keys.AUC_PR + '/head2': 0.33333313,
+        keys.AUC_PR + '/head1': 0.6667,
+        keys.AUC_PR + '/head2': 0.5000,
     }
 
     # Assert spec contains expected tensors.
@@ -347,8 +364,8 @@ class MultiHeadTest(test.TestCase):
     tol = 1e-3
     with self.test_session():
       # Unreduced loss of the head is [[(10 + 10) / 2], (15 + 0) / 2]
-      # (averaged over classes, sum-reduced over examples).
-      self.assertAllClose(17.5, loss.eval(), rtol=tol, atol=tol)
+      # (averaged over classes, averaged over examples).
+      self.assertAllClose(8.75, loss.eval(), rtol=tol, atol=tol)
 
   def test_train_create_loss_two_heads_with_weights(self):
     # Use different example weighting for each head weighting.
@@ -383,18 +400,18 @@ class MultiHeadTest(test.TestCase):
     with self.test_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
-      # training_loss = 1 * 10 + 2 * 7.5 = 25
+      # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
       # head-weighted unreduced_loss = 1 * [10, 7.5]
       self.assertAllClose(
           [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
       # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
       # = [20, 10]
-      # training_loss = 2 * 20 + 3 * 10 = 70
+      # training_loss = (2 * 20 + 3 * 10) / 2 = 35
       # head-weighted unreduced_loss = 2 * [20, 10]
       self.assertAllClose(
           [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
-      # head-weighted training_loss = 1 * 25 + 2 * 70 = 165
-      self.assertAllClose(165, training_loss.eval(), rtol=tol, atol=tol)
+      # head-weighted training_loss = 1 * 12.5 + 2 * 35 = 82.5
+      self.assertAllClose(82.5, training_loss.eval(), rtol=tol, atol=tol)
       # head-weighted example weights
       self.assertAllClose(
           [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
@@ -431,18 +448,18 @@ class MultiHeadTest(test.TestCase):
     with self.test_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
-      # training_loss = 1 * 10 + 2 * 7.5 = 25
+      # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
       # head-weighted unreduced_loss = 1 * [10, 7.5]
       self.assertAllClose(
           [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
       # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
       # = [20, 10]
-      # training_loss = 2 * 20 + 3 * 10 = 70
+      # training_loss = (2 * 20 + 3 * 10) / 2 = 35
       # head-weighted unreduced_loss = 2 * [20, 10]
       self.assertAllClose(
           [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
-      # head-weighted training_loss = 1 * 25 + 2 * 70 = 165
-      self.assertAllClose(165, training_loss.eval(), rtol=tol, atol=tol)
+      # head-weighted training_loss = 1 * 12.5 + 2 * 35 = 82.5
+      self.assertAllClose(82.5, training_loss.eval(), rtol=tol, atol=tol)
       # head-weighted example weights
       self.assertAllClose(
           [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
@@ -466,14 +483,14 @@ class MultiHeadTest(test.TestCase):
                            [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
     }
     # Loss for the first head:
-    # loss1 = (1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
-    #         (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2
-    #       = 28
+    # loss1 = ((1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
+    #          (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2) / 8
+    #       = 3.5
     # Loss for the second head:
-    # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
-    #         (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
-    #       = 74
-    expected_training_loss = 28. + 74.
+    # loss2 = ((0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
+    #          (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2) / 12
+    #       = 6.167
+    expected_training_loss = 3.5 + 6.167
 
     training_loss = multi_head.create_loss(
         features={},
@@ -495,8 +512,8 @@ class MultiHeadTest(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
+    expected_loss = 8.75
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
       return string_ops.string_join(
@@ -530,10 +547,46 @@ class MultiHeadTest(test.TestCase):
       _assert_simple_summaries(self, {
           metric_keys.MetricKeys.LOSS: expected_loss,
           metric_keys.MetricKeys.LOSS + '/head1': expected_loss,
-          # Average loss over examples.
-          metric_keys.MetricKeys.LOSS_MEAN + '/head1': expected_loss / 2,
       }, summary_str, tol)
 
+  def test_train_one_head_with_optimizer(self):
+    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
+    multi_head = multi_head_lib.multi_head([head1])
+
+    logits = {'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)}
+    labels = {'head1': np.array([[1, 0], [1, 1]], dtype=np.int64)}
+    # For large logits, sigmoid cross entropy loss is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits =>
+    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
+    expected_loss = 8.75
+    expected_train_result = 'my_train_op'
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        return string_ops.string_join(
+            [constant_op.constant(expected_train_result),
+             string_ops.as_string(loss, precision=3)])
+
+    spec = multi_head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    tol = 1e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
   def test_train_two_heads_with_weights(self):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     head2 = head_lib.multi_label_head(n_classes=3, name='head2')
@@ -553,10 +606,12 @@ class MultiHeadTest(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
     # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
+    # loss = ( (20 + 20 + 20) / 3 + (30 + 0 + 0) / 3 ) / 2 = 15
     # Average over classes, weighted sum over batch and heads.
-    expected_loss_head1 = 17.5
-    expected_loss_head2 = 30.0
+    expected_loss_head1 = 8.75
+    expected_loss_head2 = 15.0
     expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
@@ -592,9 +647,6 @@ class MultiHeadTest(test.TestCase):
           metric_keys.MetricKeys.LOSS: expected_loss,
           metric_keys.MetricKeys.LOSS + '/head1': expected_loss_head1,
           metric_keys.MetricKeys.LOSS + '/head2': expected_loss_head2,
-          # Average loss over examples.
-          metric_keys.MetricKeys.LOSS_MEAN + '/head1': expected_loss_head1 / 2,
-          metric_keys.MetricKeys.LOSS_MEAN + '/head2': expected_loss_head2 / 2,
       }, summary_str, tol)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index e0fae2c99292385c6dd32cc6002cee2076a2bb20..f8564446e5da3e785b85010998d18dca0424d16b 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -47,8 +47,12 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_setter as device_setter_lib
 from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(
+    '2018-05-31',
+    'Please use `tf.contrib.distribute.MirroredStrategy` instead.')
 def replicate_model_fn(model_fn,
                        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
                        devices=None):
@@ -136,7 +140,7 @@ def replicate_model_fn(model_fn,
       the train_op argument of `EstimatorSpec`.
     loss_reduction: controls whether losses are summed or averaged.
     devices: Optional list of devices to replicate the model across.  This
-      argument can be used to replice only on the subset of available GPUs.
+      argument can be used to replicate only on the subset of available GPUs.
       If `None`, then all available GPUs are going to be used for replication.
       If no GPUs are available, then the model is going to be placed on the CPU.
 
@@ -255,6 +259,9 @@ class TowerOptimizer(optimizer_lib.Optimizer):
 
   COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states'
 
+  @deprecation.deprecated(
+      '2018-05-31',
+      'Please use `tf.contrib.distribute.MirroredStrategy` instead.')
   def __init__(self, optimizer_or_optimizer_fn):
     """Wrap an existing optimizer for gathering gradients across towers.
 
@@ -456,7 +463,7 @@ def _get_local_devices(device_type):
 
 
 def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labes into batches."""
+  """Split input features and labels into batches."""
 
   def ensure_divisible_by_shards(sequence):
     batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
@@ -602,7 +609,7 @@ def _local_device_setter(worker_device, ps_devices, ps_strategy):
 
 
 def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with approproriately scaled loss."""
+  """Produce an EstimatorSpec with appropriately scaled loss."""
   if tower_spec.loss is None:
     return tower_spec
 
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index d46a18aacfcd911c56a9f22dc9581060c7b458a6..dd8a3a95f1b83bfd29e8a38ec1512f90e22968d9 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import re
 import shutil
 import tempfile
+from absl.testing import parameterized
 import numpy as np
 import six
 
@@ -57,26 +58,19 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import training
 
 
-# TODO(isaprykin):  Parametrize all the tests on
-#   replicate_model_fn._VariableDistributionMode when it's supported.
-class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
+class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase,
+                                   parameterized.TestCase):
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
 
-  def test_complete_flow_with_public_version(self):
-    return self._complete_flow_with_mode(mode=None)
-
-  def test_complete_flow_with_mode_local_ps_server(self):
-    return self._complete_flow_with_mode(
-        replicate_model_fn._VariableDistributionMode.
-        SHARED_LOCAL_PARAMETER_SERVER)
-
-  def test_complete_flow_with_mode_round_robin(self):
-    return self._complete_flow_with_mode(
-        replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN)
-
-  def _complete_flow_with_mode(self, mode):
+  @parameterized.named_parameters(
+      ('PublicInterface', None),
+      ('ParameterServerMode', replicate_model_fn._VariableDistributionMode.
+       SHARED_LOCAL_PARAMETER_SERVER),
+      ('RoundRobinMode',
+       replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN))
+  def test_complete_flow_with_mode(self, mode):
     n_classes = 3
     input_dimension = 2
     batch_size = 12
@@ -546,59 +540,6 @@ class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
         self.assertEqual(7.0, session.run(c))
 
 
-class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    features = features['features']
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = replicate_model_fn.TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(params['learning_rate']))
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'features': features}, y=labels, batch_size=2, shuffle=False)
-
-    with self.test_session():
-      estimator = estimator_lib.Estimator(
-          model_fn=self.model_fn,
-          model_dir=tempfile.mkdtemp(),
-          params=self.params)
-      estimator.train(train_input_fn, steps=1)
-
-      self.assertEqual(7.0, estimator.get_variable_value('c'))
-
-
 class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase):
 
   def model_fn(self, mode, features, labels, params):
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b475c12f5af3aedc766a0880a98c5c1e29bddbb7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -0,0 +1,481 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent Neural Network estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import training_util
+
+
+# The defaults are historical artifacts of the initial implementation, but seem
+# reasonable choices.
+_DEFAULT_LEARNING_RATE = 0.05
+_DEFAULT_CLIP_NORM = 5.0
+
+_CELL_TYPES = {'basic_rnn': rnn_cell.BasicRNNCell,
+               'lstm': rnn_cell.BasicLSTMCell,
+               'gru': rnn_cell.GRUCell}
+
+# Indicates no value was provided by the user to a kwarg.
+USE_DEFAULT = object()
+
+
+def _single_rnn_cell(num_units, cell_type):
+  cell_type = _CELL_TYPES.get(cell_type, cell_type)
+  if not cell_type or not issubclass(cell_type, rnn_cell.RNNCell):
+    raise ValueError('Supported cell types are {}; got {}'.format(
+        list(_CELL_TYPES.keys()), cell_type))
+  return cell_type(num_units=num_units)
+
+
+def _make_rnn_cell_fn(num_units, cell_type='basic_rnn'):
+  """Convenience function to create `rnn_cell_fn` for canned RNN Estimators.
+
+  Args:
+    num_units: Iterable of integer number of hidden units per RNN layer.
+    cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+      the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+      `'gru'`.
+
+  Returns:
+    A function that takes a single argument, an instance of
+    `tf.estimator.ModeKeys`, and returns an instance derived from
+    `tf.nn.rnn_cell.RNNCell`.
+
+  Raises:
+    ValueError: If cell_type is not supported.
+  """
+  def rnn_cell_fn(mode):
+    # Unused. Part of the rnn_cell_fn interface since user specified functions
+    # may need different behavior across modes (e.g. dropout).
+    del mode
+    cells = [_single_rnn_cell(n, cell_type) for n in num_units]
+    if len(cells) == 1:
+      return cells[0]
+    return rnn_cell.MultiRNNCell(cells)
+  return rnn_cell_fn
+
+
+def _concatenate_context_input(sequence_input, context_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
+def _select_last_activations(activations, sequence_lengths):
+  """Selects the nth set of activations for each n in `sequence_length`.
+
+  Returns a `Tensor` of shape `[batch_size, k]`. If `sequence_length` is not
+  `None`, then `output[i, :] = activations[i, sequence_length[i] - 1, :]`. If
+  `sequence_length` is `None`, then `output[i, :] = activations[i, -1, :]`.
+
+  Args:
+    activations: A `Tensor` with shape `[batch_size, padded_length, k]`.
+    sequence_lengths: A `Tensor` with shape `[batch_size]` or `None`.
+  Returns:
+    A `Tensor` of shape `[batch_size, k]`.
+  """
+  with ops.name_scope(
+      'select_last_activations', values=[activations, sequence_lengths]):
+    activations_shape = array_ops.shape(activations)
+    batch_size = activations_shape[0]
+    padded_length = activations_shape[1]
+    output_units = activations_shape[2]
+    if sequence_lengths is None:
+      sequence_lengths = padded_length
+    start_indices = math_ops.to_int64(
+        math_ops.range(batch_size) * padded_length)
+    last_indices = start_indices + sequence_lengths - 1
+    reshaped_activations = array_ops.reshape(
+        activations, [batch_size * padded_length, output_units])
+
+    last_activations = array_ops.gather(reshaped_activations, last_indices)
+    last_activations.set_shape([activations.shape[0], activations.shape[2]])
+    return last_activations
+
+
+def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
+                          context_feature_columns, input_layer_partitioner):
+  """Function builder for a rnn logit_fn.
+
+  Args:
+    output_units: An int indicating the dimension of the logit layer.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent sequential input.
+    context_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent contextual input.
+    input_layer_partitioner: Partitioner for input layer.
+
+  Returns:
+    A logit_fn (see below).
+
+  Raises:
+    ValueError: If output_units is not an int.
+  """
+  if not isinstance(output_units, int):
+    raise ValueError('output_units must be an int.  Given type: {}'.format(
+        type(output_units)))
+
+  def rnn_logit_fn(features, mode):
+    """Recurrent Neural Network logit_fn.
+
+    Args:
+      features: This is the first item returned from the `input_fn`
+                passed to `train`, `evaluate`, and `predict`. This should be a
+                single `Tensor` or `dict` of same.
+      mode: Optional. Specifies if this training, evaluation or prediction. See
+            `ModeKeys`.
+
+    Returns:
+      A `Tensor` representing the logits.
+    """
+    with variable_scope.variable_scope(
+        'sequence_input_layer',
+        values=tuple(six.itervalues(features)),
+        partitioner=input_layer_partitioner):
+      sequence_input, sequence_length = seq_fc.sequence_input_layer(
+          features=features, feature_columns=sequence_feature_columns)
+      summary.histogram('sequence_length', sequence_length)
+
+      if context_feature_columns:
+        context_input = feature_column_lib.input_layer(
+            features=features,
+            feature_columns=context_feature_columns)
+        sequence_input = _concatenate_context_input(sequence_input,
+                                                    context_input)
+
+    cell = rnn_cell_fn(mode)
+    # Ignore output state.
+    rnn_outputs, _ = rnn.dynamic_rnn(
+        cell=cell,
+        inputs=sequence_input,
+        dtype=dtypes.float32,
+        time_major=False)
+    last_activations = _select_last_activations(rnn_outputs, sequence_length)
+
+    with variable_scope.variable_scope('logits', values=(rnn_outputs,)):
+      logits = core_layers.dense(
+          last_activations,
+          units=output_units,
+          activation=None,
+          kernel_initializer=init_ops.glorot_uniform_initializer())
+    return logits
+
+  return rnn_logit_fn
+
+
+def _rnn_model_fn(features,
+                  labels,
+                  mode,
+                  head,
+                  rnn_cell_fn,
+                  sequence_feature_columns,
+                  context_feature_columns,
+                  optimizer='Adagrad',
+                  input_layer_partitioner=None,
+                  config=None):
+  """Recurrent Neural Net model_fn.
+
+  Args:
+    features: dict of `Tensor` and `SparseTensor` objects returned from
+      `input_fn`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] with labels.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: Iterable containing `FeatureColumn`s that
+      represent sequential model inputs.
+    context_feature_columns: Iterable containing `FeatureColumn`s that
+      represent model inputs not associated with a specific timestep.
+    optimizer: String, `tf.Optimizer` object, or callable that creates the
+      optimizer to use for training. If not specified, will use the Adagrad
+      optimizer with a default learning rate of 0.05 and gradient clip norm of
+      5.0.
+    input_layer_partitioner: Partitioner for input layer. Defaults
+      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Returns:
+    An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: If mode or optimizer is invalid, or features has the wrong type.
+  """
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
+
+  # If user does not provide an optimizer instance, use the optimizer specified
+  # by the string with default learning rate and gradient clipping.
+  if not isinstance(optimizer, optimizer_lib.Optimizer):
+    optimizer = optimizers.get_optimizer_instance(
+        optimizer, learning_rate=_DEFAULT_LEARNING_RATE)
+    optimizer = extenders.clip_gradients_by_norm(optimizer, _DEFAULT_CLIP_NORM)
+
+  num_ps_replicas = config.num_ps_replicas if config else 0
+  partitioner = partitioned_variables.min_max_variable_partitioner(
+      max_partitions=num_ps_replicas)
+  with variable_scope.variable_scope(
+      'rnn',
+      values=tuple(six.itervalues(features)),
+      partitioner=partitioner):
+    input_layer_partitioner = input_layer_partitioner or (
+        partitioned_variables.min_max_variable_partitioner(
+            max_partitions=num_ps_replicas,
+            min_slice_size=64 << 20))
+
+    logit_fn = _rnn_logit_fn_builder(
+        output_units=head.logits_dimension,
+        rnn_cell_fn=rnn_cell_fn,
+        sequence_feature_columns=sequence_feature_columns,
+        context_feature_columns=context_feature_columns,
+        input_layer_partitioner=input_layer_partitioner)
+    logits = logit_fn(features=features, mode=mode)
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizer.minimize(
+          loss,
+          global_step=training_util.get_global_step())
+
+    return head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+
+class RNNClassifier(estimator.Estimator):
+  """A classifier for TensorFlow RNN models.
+
+  Trains a recurrent neural network model to classify instances into one of
+  multiple classes.
+
+  Example:
+
+  ```python
+  token_sequence = sequence_categorical_column_with_hash_bucket(...)
+  token_emb = embedding_column(categorical_column=token_sequence, ...)
+
+  estimator = RNNClassifier(
+      num_units=[32, 16], cell_type='lstm',
+      sequence_feature_columns=[token_emb])
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train, steps=100)
+
+  def input_fn_eval: # returns x, y
+    pass
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    pass
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+    `key=weight_column` whose value is a `Tensor`.
+  * for each `column` in `sequence_feature_columns`:
+    - a feature with `key=column.name` whose `value` is a `SparseTensor`.
+  * for each `column` in `context_feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss is calculated by using softmax cross entropy.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               sequence_feature_columns,
+               context_feature_columns=None,
+               num_units=None,
+               cell_type=USE_DEFAULT,
+               rnn_cell_fn=None,
+               model_dir=None,
+               n_classes=2,
+               weight_column=None,
+               label_vocabulary=None,
+               optimizer='Adagrad',
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a `RNNClassifier` instance.
+
+    Args:
+      sequence_feature_columns: An iterable containing the `FeatureColumn`s
+        that represent sequential input. All items in the set should either be
+        sequence columns (e.g. `sequence_numeric_column`) or constructed from
+        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
+        input).
+      context_feature_columns: An iterable containing the `FeatureColumn`s
+        for contextual input. The data represented by these columns will be
+        replicated and given to the RNN at each timestep. These columns must be
+        instances of classes derived from `_DenseColumn` such as
+        `numeric_column`, not the sequential variants.
+      num_units: Iterable of integer number of hidden units per RNN layer. If
+        set, `cell_type` must also be specified and `rnn_cell_fn` must be
+        `None`.
+      cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+        the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+        `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn`
+        must be `None`.
+      rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+        returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to
+        construct the RNN. If set, `num_units` and `cell_type` cannot be set.
+        This is for advanced users who need additional customization beyond
+        `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is
+        needed for stacked RNNs.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      n_classes: Number of label classes. Defaults to 2, namely binary
+        classification. Must be > 1.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
+        to Adagrad optimizer.
+      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
+        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not
+        compatible.
+    """
+    if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT):
+      raise ValueError(
+          'num_units and cell_type must not be specified when using rnn_cell_fn'
+      )
+    if not rnn_cell_fn:
+      if cell_type == USE_DEFAULT:
+        cell_type = 'basic_rnn'
+      rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type)
+
+    if n_classes == 2:
+      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
+          weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    else:
+      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
+          n_classes, weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _rnn_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          rnn_cell_fn=rnn_cell_fn,
+          sequence_feature_columns=tuple(sequence_feature_columns or []),
+          context_feature_columns=tuple(context_feature_columns or []),
+          optimizer=optimizer,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+    super(RNNClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..393f94f5c7de02c56d93993bbeb8aaec4ea8234c
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
@@ -0,0 +1,1131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rnn.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import rnn
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+# Names of variables created by BasicRNNCell model.
+TOKEN_EMBEDDING_NAME = 'rnn/sequence_input_layer/input_layer/tokens_sequential_embedding/embedding_weights'
+CELL_WEIGHTS_NAME = 'rnn/rnn/basic_rnn_cell/kernel'
+CELL_BIAS_NAME = 'rnn/rnn/basic_rnn_cell/bias'
+MULTI_CELL_WEIGHTS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/kernel'
+MULTI_CELL_BIAS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/bias'
+LOGITS_WEIGHTS_NAME = 'rnn/logits/dense/kernel'
+LOGITS_BIAS_NAME = 'rnn/logits/dense/bias'
+
+
+def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=('Condition expected =~ actual did not hold element-wise:'
+              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
+              'rtol = ', rtol,),
+        name=scope)
+
+
+def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
+                      global_step, model_dir):
+  """Create checkpoint file with provided model weights.
+
+  Args:
+    rnn_weights: Iterable of values of weights for the RNN cell.
+    rnn_biases: Iterable of values of biases for the RNN cell.
+    logits_weights: Iterable of values for matrix connecting RNN output to
+      logits.
+    logits_biases: Iterable of values for logits bias term.
+    global_step: Initial global step to save in checkpoint.
+    model_dir: Directory into which checkpoint is saved.
+  """
+  model_weights = {}
+  model_weights[CELL_WEIGHTS_NAME] = rnn_weights
+  model_weights[CELL_BIAS_NAME] = rnn_biases
+  model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
+  model_weights[LOGITS_BIAS_NAME] = logits_biases
+
+  with ops.Graph().as_default():
+    # Create model variables.
+    for k, v in six.iteritems(model_weights):
+      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
+
+    # Create non-model variables.
+    global_step_var = training_util.create_global_step()
+    assign_op = global_step_var.assign(global_step)
+
+    # Initialize vars and save checkpoint.
+    with monitored_session.MonitoredTrainingSession(
+        checkpoint_dir=model_dir) as sess:
+      sess.run(assign_op)
+
+
+class RNNLogitFnTest(test.TestCase):
+  """Tests correctness of logits calculated from _rnn_logit_fn_builder."""
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
+                   sequence_feature_columns, context_feature_columns,
+                   expected_logits):
+    """Tests that the expected logits are calculated."""
+    with ops.Graph().as_default():
+      # Global step needed for MonitoredSession, which is in turn used to
+      # explicitly set variable weights through a checkpoint.
+      training_util.create_global_step()
+      # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
+      # the checkpoint naming is shared.
+      with variable_scope.variable_scope('rnn'):
+        input_layer_partitioner = (
+            partitioned_variables.min_max_variable_partitioner(
+                max_partitions=0, min_slice_size=64 << 20))
+        logit_fn = rnn._rnn_logit_fn_builder(
+            output_units=logits_dimension,
+            rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
+            sequence_feature_columns=sequence_feature_columns,
+            context_feature_columns=context_feature_columns,
+            input_layer_partitioner=input_layer_partitioner)
+        # Features are constructed within this function, otherwise the Tensors
+        # containing the features would be defined outside this graph.
+        logits = logit_fn(features=features_fn(), mode=mode)
+        with monitored_session.MonitoredTrainingSession(
+            checkpoint_dir=self._model_dir) as sess:
+          self.assertAllClose(expected_logits, sess.run(logits), atol=1e-4)
+
+  def testOneDimLogits(self):
+    """Tests one-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3]] = [[-0.6033]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033]])
+
+  def testMultiDimLogits(self):
+    """Tests multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [0.5*0.53 + 0.3*0.37 + 0.4],
+              [0.2*0.53 - 0.1*0.37 + 0.5]
+           = [[-0.6033, 0.7777, 0.5698]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698]])
+
+  def testMultiExampleMultiDim(self):
+    """Tests multiple examples and multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [7]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [tanh(.1*7 + .2*.38 + .3*.10 +.2),
+                              tanh(-.2*7 - .3*.38 - .4*.10 +.5)]]
+                          = [[0.53, -0.37], [0.76, -0.78]
+    logits = [[-1*0.53 - 1*0.37 + 0.3,
+               0.5*0.53 + 0.3*0.37 + 0.4,
+               0.2*0.53 - 0.1*0.37 + 0.5],
+              [-1*0.76 - 1*0.78 + 0.3,
+               0.5*0.76 +0.3*0.78 + 0.4,
+               0.2*0.76 -0.1*0.78 + 0.5]]
+           = [[-0.6033, 0.7777, 0.5698], [-1.2473, 1.0170, 0.5745]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))
+    ]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698],
+                           [-1.2473, 1.0170, 0.5745]])
+
+  def testMultiExamplesDifferentLength(self):
+    """Tests multiple examples with different lengths.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.53, -0.37], [<ignored-padding>]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [-1*0.38 + 1*0.10 + 0.3]]
+           = [[-0.6033], [0.0197]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033], [0.0197]])
+
+  def testMultiExamplesWithContext(self):
+    """Tests multiple examples with context features.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10, -0.5], [5, -0.5]], [[2, 0.8], [0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 - 1*.5 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - 0.9*.5 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + 1*.8 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 + .9*.8 - .3*0 - .4*0 +.5)]]
+                          = [[0.60, -0.96], [0.83, 0.68]]
+    rnn_output_timestep_2 = [[tanh(.1*5 - 1*.5 + .2*.60 - .3*.96 +.2),
+                              tanh(-.2*5 - .9*.5 - .3*.60 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.03, -0.63], [<ignored-padding>]]
+    logits = [[-1*0.03 - 1*0.63 + 0.3],
+              [-1*0.83 + 1*0.68 + 0.3]]
+           = [[-0.3662], [0.1414]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # Context features weights are inserted between input and state weights.
+        rnn_weights=[[.1, -.2], [1., 0.9], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'context': [[-0.5], [0.8]],
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = [fc.numeric_column('context', shape=(1,))]
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.3662], [0.1414]])
+
+  def testMultiExamplesMultiFeatures(self):
+    """Tests examples with multiple sequential feature columns.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[1, 0, 10], [0, 1, 5]], [[1, 0, 2], [0, 0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.5*1 + 1*0 + .1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.5*1 + 1*0 + .1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.94, -0.96], [0.72, -0.38]]
+    rnn_output_timestep_2 = [[tanh(.5*0 + 1*1 + .1*5 + .2*.94 - .3*.96 +.2),
+                              tanh(-.5*0 - 1*1 - .2*5 - .3*.94 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.92, -0.88], [<ignored-padding>]]
+    logits = [[-1*0.92 - 1*0.88 + 0.3],
+              [-1*0.72 - 1*0.38 + 0.3]]
+           = [[-1.5056], [-0.7962]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # FeatureColumns are sorted alphabetically, so on_sale weights are
+        # inserted before price.
+        rnn_weights=[[.5, -.5], [1., -1.], [.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'on_sale':
+              sparse_tensor.SparseTensor(
+                  values=[0, 1, 0],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    price_column = seq_fc.sequence_numeric_column('price', shape=(1,))
+    on_sale_column = fc.indicator_column(
+        seq_fc.sequence_categorical_column_with_identity(
+            'on_sale', num_buckets=2))
+    sequence_feature_columns = [price_column, on_sale_column]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-1.5056], [-0.7962]])
+
+
+class RNNClassifierTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _assert_checkpoint(
+      self, n_classes, input_units, cell_units, expected_global_step):
+
+    shapes = {
+        name: shape for (name, shape) in
+        checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        expected_global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    # RNN Cell variables.
+    if len(cell_units) > 1:
+      for i, cell_unit in enumerate(cell_units):
+        self.assertEqual([input_units + cell_unit, cell_unit],
+                         shapes[MULTI_CELL_WEIGHTS_NAME_PATTERN % i])
+        self.assertEqual([cell_unit],
+                         shapes[MULTI_CELL_BIAS_NAME_PATTERN % i])
+        input_units = cell_unit
+    elif len(cell_units) == 1:
+      self.assertEqual([input_units + cell_unit, cell_unit],
+                       shapes[CELL_WEIGHTS_NAME])
+      self.assertEqual([cell_unit], shapes[CELL_BIAS_NAME])
+
+    # Logits variables.
+    logits_dimension = n_classes if n_classes > 2 else 1
+    self.assertEqual([cell_units[-1], logits_dimension],
+                     shapes[LOGITS_WEIGHTS_NAME])
+    self.assertEqual([logits_dimension], shapes[LOGITS_BIAS_NAME])
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s/part_0:0' % CELL_BIAS_NAME,
+        '%s/part_0:0' % CELL_WEIGHTS_NAME,
+        '%s/part_0:0' % LOGITS_BIAS_NAME,
+        '%s/part_0:0' % LOGITS_WEIGHTS_NAME,
+    ]
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = _assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def testConflictingRNNCellFn(self):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    cell_units = [4, 2]
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          num_units=cell_units)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          cell_type='lstm')
+
+  def _testFromScratchWithDefaultOptimizer(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        num_units=cell_units,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=2)
+
+  def testMultiClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=4)
+
+  def testFromScratchWithCustomRNNCellFn(self):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+    cell_units = [4, 2]
+    n_classes = 2
+
+    def rnn_cell_fn(mode):
+      del mode  # unused
+      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
+      return rnn_cell.MultiRNNCell(cells)
+
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        rnn_cell_fn=rnn_cell_fn,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def _testExampleWeight(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat', 'dog', 'barked'],
+                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
+                  dense_shape=[2, 3]),
+          'w': [[1], [2]],
+      }, [[1], [0]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=[embed],
+        n_classes=n_classes,
+        weight_column='w',
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=2)
+
+  def testMultiClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=4)
+
+  def testBinaryClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testBinaryClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=1.119661)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+  def testMultiClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testMultiClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=2.662932)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+
+def sorted_key_dict(unsorted_dict):
+  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
+
+
+class RNNClassifierEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExamplesWithDifferentLength.
+    # See that test for logits calculation.
+    # logits = [[-0.603282], [0.019719]]
+    # probability = exp(logits) / (1 + exp(logits)) = [[0.353593], [0.504930]]
+    # loss = -label * ln(p) - (1 - label) * ln(1 - p)
+    #      = [[0.436326], [0.683335]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 1.119661,
+        metric_keys.MetricKeys.LOSS_MEAN: 0.559831,
+        metric_keys.MetricKeys.ACCURACY: 1.0,
+        metric_keys.MetricKeys.PREDICTION_MEAN: 0.429262,
+        metric_keys.MetricKeys.LABEL_MEAN: 0.5,
+        metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
+        # With default threshold of 0.5, the model is a perfect classifier.
+        metric_keys.MetricKeys.RECALL: 1.0,
+        metric_keys.MetricKeys.PRECISION: 1.0,
+        # Positive example is scored above negative, so AUC = 1.0.
+        metric_keys.MetricKeys.AUC: 1.0,
+        metric_keys.MetricKeys.AUC_PR: 1.0,
+    }
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+  def testMultiClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExampleMultiDim.
+    # See that test for logits calculation.
+    # logits = [[-0.603282, 0.777708, 0.569756],
+    #           [-1.247356, 1.017018, 0.574481]]
+    # logits_exp = exp(logits) / (1 + exp(logits))
+    #            = [[0.547013, 2.176468, 1.767836],
+    #               [0.287263, 2.764937, 1.776208]]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [[0.121793, 0.484596, 0.393611],
+    #                          [0.059494, 0.572639, 0.367866]]
+    # loss = -1. * log(softmax[label])
+    #      = [[2.105432], [0.557500]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 2.662932,
+        metric_keys.MetricKeys.LOSS_MEAN: 1.331466,
+        metric_keys.MetricKeys.ACCURACY: 0.5,
+    }
+
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+
+class RNNClassifierPredictionTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testOneDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282]
+    # logistic = exp(-0.6033) / (1 + exp(-0.6033)) = [0.353593]
+    # probabilities = [0.646407, 0.353593]
+    # class_ids = argmax(probabilities) = [0]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose([0.353593],
+                        predictions[prediction_keys.PredictionKeys.LOGISTIC])
+    self.assertAllClose(
+        [0.646407, 0.353593],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([0],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_0'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+  def testMultiClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1', 'class_2']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testMultiDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282, 0.777708, 0.569756]
+    # logits_exp = exp(logits) = [0.547013, 2.176468, 1.767836]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [0.121793, 0.484596, 0.393611]
+    # class_ids = argmax(probabilities) = [1]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282, 0.777708, 0.569756],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose(
+        [0.121793, 0.484596, 0.393611],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([1],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_1'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+
+class RNNClassifierIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, n_classes,
+      batch_size):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    feature_columns = [embed]
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=feature_columns,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUATE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predicted_proba = np.array([
+        x[prediction_keys.PredictionKeys.PROBABILITIES]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
+
+    # EXPORT
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def testNumpyInputFn(self):
+    """Tests complete flow with numpy_input_fn."""
+    n_classes = 3
+    batch_size = 10
+    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
+    # Numpy only supports dense input, so all examples will have same length.
+    # TODO(b/73160931): Update test when support for prepadded data exists.
+    sequence_length = 3
+
+    features = []
+    for _ in range(batch_size):
+      sentence = random.sample(words, sequence_length)
+      features.append(sentence)
+
+    x_data = np.array(features)
+    y_data = np.random.randint(n_classes, size=batch_size)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+  def testParseExampleInputFn(self):
+    """Tests complete flow with input_fn constructed from parse_example."""
+    n_classes = 3
+    batch_size = 10
+    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']
+
+    serialized_examples = []
+    for _ in range(batch_size):
+      sequence_length = random.randint(1, len(words))
+      sentence = random.sample(words, sequence_length)
+      label = random.randint(0, n_classes - 1)
+      example = example_pb2.Example(features=feature_pb2.Features(
+          feature={
+              'tokens':
+                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                      value=sentence)),
+              'label':
+                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                      value=[label])),
+          }))
+      serialized_examples.append(example.SerializeToString())
+
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    def _train_input_fn():
+      features = parsing_ops.parse_example(serialized_examples, feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _eval_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _predict_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features.pop('label')
+      return features, None
+
+    self._test_complete_flow(
+        train_input_fn=_train_input_fn,
+        eval_input_fn=_eval_input_fn,
+        predict_input_fn=_predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 180f1b68f3b56113dfbbfc100bd04efc3bb8b31f..effec42f028fe472593a8d06e15a0831346d6f50 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -66,6 +66,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -214,6 +215,7 @@ tf_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:sparse_tensor",
     ],
+    shard_count = 4,
 )
 
 # Estimators tests
@@ -223,7 +225,10 @@ py_test(
     srcs = ["python/ops/kmeans_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67512932
+    tags = [
+        "nomac",  # b/73741358
+        "notsan",  # b/67512932
+    ],
     deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
@@ -238,6 +243,7 @@ py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -342,16 +348,3 @@ cuda_py_test(
     ],
     main = "python/kernel_tests/masked_matmul_benchmark.py",
 )
-
-# All files
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/factorization/examples/BUILD b/tensorflow/contrib/factorization/examples/BUILD
index bbe842bd5ccc7357805adda1df42ba8799fcd8f2..363baa121ab3854a802ca3606e35597d31b35a57 100644
--- a/tensorflow/contrib/factorization/examples/BUILD
+++ b/tensorflow/contrib/factorization/examples/BUILD
@@ -21,14 +21,3 @@ tf_py_test(
     ],
     tags = ["notsan"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index 44eab56011dad2f6fbe843b3569b4acc5c5e542a..ea8b9a17a27093cb57564861815edd6ecb18a014 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -67,14 +67,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
index dd61f59585aee2e0245cfd6797b313b972c19bc5..025534d540bb82cdb87bb2977d08dfa4f02f1bc8 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
@@ -32,6 +32,7 @@
 #include "tensorflow/core/lib/gtl/top_n.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -353,7 +354,7 @@ class NearestNeighborsOp : public OpKernel {
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
     const int64 num_threads = worker_threads.num_threads;
     // This kernel might be configured to use fewer than the total number of
-    // available CPUs on the host machine. To avoid descructive interference
+    // available CPUs on the host machine. To avoid destructive interference
     // with other jobs running on the host machine, we must only use a fraction
     // of total available L3 cache. Unfortunately, we cannot query the host
     // machine to get the number of physical CPUs. So, we use a fixed per-CPU
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 23137e0a973c0bdd2cdbd97159f7fd310178bf54..84e80791f4991ad2b67d0a00ee1e00cf0d0daadc 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -41,11 +41,12 @@ from tensorflow.python.platform import resource_loader
 _clustering_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile('_clustering_ops.so'))
 
-# Euclidean distance between vectors U and V is defined as ||U - V||_F which is
-# the square root of the sum of the absolute squares of the elements difference.
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
 SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
 # Cosine distance between vectors U and V is defined as
-# 1 - (U \dot V) / (||U||_F ||V||_F)
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
 COSINE_DISTANCE = 'cosine'
 
 RANDOM_INIT = 'random'
@@ -472,8 +473,8 @@ class KMeans(object):
         # Locally compute the sum of inputs mapped to each id.
         # For a cluster with old cluster value x, old count n, and with data
         # d_1,...d_k newly assigned to it, we recompute the new value as
-        # x += (sum_i(d_i) - k * x) / (n + k).
-        # Compute sum_i(d_i), see comment above.
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
         cluster_center_updates = math_ops.unsorted_segment_sum(
             inp, unique_idx, num_unique_cluster_idx)
         # Shape to enable broadcasting count_updates and learning_rate to inp.
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 054888e734086c153f7af59f4548d4d20abab813..811fa89bc38c61b16710a441b99d9e5dfac67668 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -51,9 +51,9 @@ class WALSModel(object):
   r"""A model for Weighted Alternating Least Squares matrix factorization.
 
   It minimizes the following loss function over U, V:
-   \\(
-   \|\sqrt W \odot (A - U V^T) \|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
-   )\\
+  $$
+   \|\sqrt W \odot (A - U V^T)\|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
+  $$
     where,
     A: input matrix,
     W: weight matrix. Note that the (element-wise) square root of the weights
@@ -61,12 +61,12 @@ class WALSModel(object):
     U, V: row_factors and column_factors matrices,
     \\(\lambda)\\: regularization.
   Also we assume that W is of the following special form:
-  \\( W_{ij} = W_0 + R_i * C_j )\\  if \\(A_{ij} \ne 0)\\,
-  \\(W_{ij} = W_0)\\ otherwise.
+  \\( W_{ij} = W_0 + R_i * C_j \\)  if \\(A_{ij} \ne 0\\),
+  \\(W_{ij} = W_0\\) otherwise.
   where,
-  \\(W_0)\\: unobserved_weight,
-  \\(R_i)\\: row_weights,
-  \\(C_j)\\: col_weights.
+  \\(W_0\\): unobserved_weight,
+  \\(R_i\\): row_weights,
+  \\(C_j\\): col_weights.
 
   Note that the current implementation supports two operation modes: The default
   mode is for the condition where row_factors and col_factors can individually
@@ -82,14 +82,15 @@ class WALSModel(object):
   normalized as follows:
     _, _, unregularized_loss, regularization, sum_weights =
         update_row_factors(sp_input)
-  if sp_input contains the rows {A_i, i \in I}, and the input matrix A has n
-  total rows, then the minibatch loss = unregularized_loss + regularization is
-   \\(
+  if sp_input contains the rows \\({A_i, i \in I}\\), and the input matrix A
+  has n total rows, then the minibatch loss = unregularized_loss +
+  regularization is
+   $$
    (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) * n / |I| +
    \lambda \|V\|_F^2
-   )\\
+   $$
   The sum_weights tensor contains the normalized sum of weights
-  sum(W_I) * n / |I|.
+  \\(sum(W_I) * n / |I|\\).
 
   A typical usage example (pseudocode):
 
@@ -106,7 +107,7 @@ class WALSModel(object):
       # the prep_gramian_op for row(column) can be run.
       worker_init_op = model.worker_init
 
-      # To be run once per interation sweep before the row(column) update
+      # To be run once per integration sweep before the row(column) update
       # initialize ops can be run. Note that in the distributed training
       # situations, this should only be run by the chief trainer. All other
       # trainers need to block until this is done.
@@ -118,9 +119,9 @@ class WALSModel(object):
       init_row_update_op = model.initialize_row_update_op
       init_col_update_op = model.initialize_col_update_op
 
-      # Ops to upate row(column). This can either take the entire sparse tensor
-      # or slices of sparse tensor. For distributed trainer, each trainer
-      # handles just part of the matrix.
+      # Ops to update row(column). This can either take the entire sparse
+      # tensor or slices of sparse tensor. For distributed trainer, each
+      # trainer handles just part of the matrix.
       _, row_update_op, unreg_row_loss, row_reg, _ = model.update_row_factors(
            sp_input=matrix_slices_from_queue_for_worker_shard)
       row_loss = unreg_row_loss + row_reg
@@ -220,10 +221,10 @@ class WALSModel(object):
         in the form of [[w_0, w_1, ...], [w_k, ... ], [...]], with the number of
         inner lists matching the number of row factor shards and the elements in
         each inner list are the weights for the rows of the corresponding row
-        factor shard. In this case,  w_ij = unonbserved_weight +
+        factor shard. In this case,  w_ij = unobserved_weight +
                                             row_weights[i] * col_weights[j].
         - If this is a single non-negative real number, this value is used for
-        all row weights and w_ij = unobserved_weight + row_weights *
+        all row weights and \\(w_ij\\) = unobserved_weight + row_weights *
                                    col_weights[j].
         Note that it is allowed to have row_weights as a list while col_weights
         a single number or vice versa.
@@ -435,7 +436,7 @@ class WALSModel(object):
       gramian: Variable storing the gramian calculated from the factors.
 
     Returns:
-      A op that updates the gramian with the calcuated value from the factors.
+      A op that updates the gramian with the calculated value from the factors.
     """
     partial_gramians = []
     for f in factors:
@@ -564,7 +565,7 @@ class WALSModel(object):
 
     Note that specifically this initializes the cache of the row and column
     weights on workers when `use_factors_weights_cache` is True. In this case,
-    if these weights are being calcualted and reset after the object is created,
+    if these weights are being calculated and reset after the object is created,
     it is important to ensure this ops is run afterwards so the cache reflects
     the correct values.
     """
@@ -665,18 +666,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the rows {A_{i, :}, i \in I}, and the input
-        matrix A has n total rows, then the unregularized loss is:
-        (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|
+        term. If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the
+        input matrix A has n total rows, then the unregularized loss is:
+        \\(\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the rows {A_{i, :}, i \in I}, and the input matrix
-        A has n total rows, then the regularization term is:
-        \lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2.
+        If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the input
+        matrix A has n total rows, then the regularization term is:
+        \\(\lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2\\).
       sum_weights: The sum of the weights W_I corresponding to sp_input,
-        normalized by a factor of n / |I|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(n / |I|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         True, sp_input=sp_input, transpose_input=transpose_input)
@@ -698,18 +699,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the columns {A_{:, j}, j \in J}, and the
-        input matrix A has m total columns, then the unregularized loss is:
-        (\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|
+        term. If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and
+        the input matrix A has m total columns, then the unregularized loss is:
+        \\(\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the columns {A_{:, j}, j \in J}, and the input
-        matrix A has m total columns, then the regularization term is:
-        \lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2.
+        If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and the
+        input matrix A has m total columns, then the regularization term is:
+        \\(\lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2\\).
       sum_weights: The sum of the weights W_J corresponding to sp_input,
-        normalized by a factor of m / |J|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(m / |J|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         False, sp_input=sp_input, transpose_input=transpose_input)
@@ -720,8 +721,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the row factors.
 
-    This computes the row embedding u_i for an observed row a_i by solving
-    one iteration of the update equations.
+    This computes the row embedding \\(u_i\\) for an observed row \\(a_i\\) by
+    solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of rows. Please note that the
@@ -753,8 +754,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the column factors.
 
-    This computes the column embedding v_j for an observed column a_j by solving
-    one iteration of the update equations.
+    This computes the column embedding \\(v_j\\) for an observed column
+    \\(a_j\\) by solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of columns. Please note that
@@ -938,7 +939,7 @@ class WALSModel(object):
     loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                      if transpose_input else new_sp_input)
     # sp_approx is the low rank estimate of the input matrix, formed by
-    # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
+    # computing the product <\\(u_i, v_j\\)> for (i, j) in loss_sp_input.indices.
     sp_approx_vals = gen_factorization_ops.masked_matmul(
         new_left_values,
         right,
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
index c8137339155ef1da8ee53967eea84a550f12ecbc..bb5140aeb3bf0238ca7cb52067ea6328dd1736d5 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
@@ -210,7 +210,7 @@ class WalsModelTest(test.TestCase):
 
       # Test row projection.
       # Using the specified projection weights for the 2 row feature vectors.
-      # This is expected to reprodue the same row factors in the model as the
+      # This is expected to reproduce the same row factors in the model as the
       # weights and feature vectors are identical to that used in model
       # training.
       projected_rows = wals_model.project_row_factors(
@@ -283,8 +283,8 @@ class WalsModelTest(test.TestCase):
 
       # Test column projection.
       # Using the specified projection weights for the 3 column feature vectors.
-      # This is expected to reprodue the same column factors in the model as the
-      # weights and feature vectors are identical to that used in model
+      # This is expected to reproduce the same column factors in the model as
+      # the weights and feature vectors are identical to that used in model
       # training.
       projected_cols = wals_model.project_col_factors(
           sp_input=sp_feeder,
@@ -385,7 +385,7 @@ class WalsModelTest(test.TestCase):
 
       # Test row projection.
       # Using the specified projection weights for the 2 row feature vectors.
-      # This is expected to reprodue the same row factors in the model as the
+      # This is expected to reproduce the same row factors in the model as the
       # weights and feature vectors are identical to that used in model
       # training.
       projected_rows = wals_model.project_row_factors(
@@ -462,8 +462,8 @@ class WalsModelTest(test.TestCase):
 
       # Test column projection.
       # Using the specified projection weights for the 2 column feature vectors.
-      # This is expected to reprodue the same column factors in the model as the
-      # weights and feature vectors are identical to that used in model
+      # This is expected to reproduce the same column factors in the model as
+      # the weights and feature vectors are identical to that used in model
       # training.
       projected_cols = wals_model.project_col_factors(
           sp_input=sp_feeder,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 98d6434f4752b224201e38bed05ccd14428a758b..e076631bc16fd379a2ad31af9055a7388d98c7ca 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -54,10 +54,10 @@ def _covariance(x, diag):
   diagonal matrix just the diagonal is returned.
   """
   num_points = math_ops.to_float(array_ops.shape(x)[0])
-  x -= math_ops.reduce_mean(x, 0, keep_dims=True)
+  x -= math_ops.reduce_mean(x, 0, keepdims=True)
   if diag:
     cov = math_ops.reduce_sum(
-        math_ops.square(x), 0, keep_dims=True) / (num_points - 1)
+        math_ops.square(x), 0, keepdims=True) / (num_points - 1)
   else:
     cov = math_ops.matmul(x, x, transpose_a=True) / (num_points - 1)
   return cov
@@ -280,7 +280,7 @@ class GmmAlgorithm(object):
     self._define_score_samples()
 
   def _define_full_covariance_probs(self, shard_id, shard):
-    """Defines the full covariance probabilties per example in a class.
+    """Defines the full covariance probabilities per example in a class.
 
     Updates a matrix with dimension num_examples X num_classes.
 
@@ -313,7 +313,7 @@ class GmmAlgorithm(object):
     # TODO(xavigonzalvo): look into alternatives to log for
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
-        math_ops.log(self._covs + 1e-3), 1, keep_dims=True)
+        math_ops.log(self._covs + 1e-3), 1, keepdims=True)
     diff = shard - self._means
     x2 = math_ops.square(diff)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
@@ -344,21 +344,21 @@ class GmmAlgorithm(object):
   def _define_prior_log_prob_operation(self, shard_id):
     """Computes the prior probability of all samples.
 
-    Updates a vector where each item is the prior probabibility of an
+    Updates a vector where each item is the prior probability of an
     input example.
 
     Args:
       shard_id: id of current shard_id.
     """
     self._prior_probs[shard_id] = math_ops.reduce_logsumexp(
-        self._probs[shard_id], axis=1, keep_dims=True)
+        self._probs[shard_id], axis=1, keepdims=True)
 
   def _define_expectation_operation(self, shard_id):
     # Shape broadcasting.
     probs = array_ops.expand_dims(self._probs[shard_id], 0)
     # Membership weights are computed as:
-    # w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}
-    #               {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}
+    # $$w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}$$
+    # $$            {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}$$
     # where "i" is the i-th example, "k" is the k-th mixture, theta are
     # the model parameters and y_i the observations.
     # These are defined for each shard.
@@ -375,7 +375,7 @@ class GmmAlgorithm(object):
     """
     # Soft assignment of each data point to each of the two clusters.
     self._points_in_k[shard_id] = math_ops.reduce_sum(
-        self._w[shard_id], 0, keep_dims=True)
+        self._w[shard_id], 0, keepdims=True)
     # Partial means.
     w_mul_x = array_ops.expand_dims(
         math_ops.matmul(
@@ -397,7 +397,7 @@ class GmmAlgorithm(object):
     # Compute the effective number of data points assigned to component k.
     with ops.control_dependencies(self._w):
       points_in_k = array_ops.squeeze(
-          math_ops.add_n(self._points_in_k), squeeze_dims=[0])
+          math_ops.add_n(self._points_in_k), axis=[0])
       # Update alpha.
       if 'w' in self._params:
         final_points_in_k = points_in_k / num_batches
@@ -454,7 +454,7 @@ class GmmAlgorithm(object):
     for shard_id, prior_probs in enumerate(self._prior_probs):
       op.append(prior_probs + math_ops.log(self._w[shard_id]))
     self._scores = array_ops.squeeze(
-        math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
+        math_ops.reduce_logsumexp(op, axis=2, keepdims=True), axis=0)
 
 
 def gmm(inp,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py
index 00a4734eb6d89cd02484f1c5161366377cc71208..4fc9c96e9d0a317ef757d5e1bb6563ed7c8832af 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py
@@ -210,7 +210,7 @@ class GMMTestQueues(test.TestCase):
     return _fn
 
   # This test makes sure that there are no deadlocks when using a QueueRunner.
-  # Note that since cluster initialization is dependendent on inputs, if input
+  # Note that since cluster initialization is dependent on inputs, if input
   # is generated using a QueueRunner, one has to make sure that these runners
   # are started before the initialization.
   def test_queues(self):
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index c861cfff544a78617aa1ace730b50c094cf16330..9ffdd3ba5e8ac496533d0207f2b6846dbc92bc89 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export_output
+from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -61,8 +62,8 @@ class _LossRelativeChangeHook(session_run_hook.SessionRunHook):
     loss = run_values.results
     assert loss is not None
     if self._prev_loss:
-      relative_change = (abs(loss - self._prev_loss) /
-                         (1 + abs(self._prev_loss)))
+      relative_change = (
+          abs(loss - self._prev_loss) / (1 + abs(self._prev_loss)))
       if relative_change < self._tolerance:
         run_context.request_stop()
     self._prev_loss = loss
@@ -105,24 +106,32 @@ class _InitializeClustersHook(session_run_hook.SessionRunHook):
         logging.info(e)
 
 
-def _parse_tensor_or_dict(features):
+def _parse_features_if_necessary(features, feature_columns):
   """Helper function to convert the input points into a usable format.
 
   Args:
-    features: The input points.
+    features: The input features.
+    feature_columns: An optionable iterable containing all the feature columns
+      used by the model. All items in the set should be feature column instances
+      that can be passed to `tf.feature_column.input_layer`. If this is None,
+      all features will be used.
 
   Returns:
-    If `features` is a dict of `k` features, each of which is a vector of `n`
-    scalars, the return value is a Tensor of shape `(n, k)` representing `n`
-    input points, where the items in the `k` dimension are sorted
-    lexicographically by `features` key. If `features` is not a dict, it is
-    returned unmodified.
+    If `features` is a dict of `k` features (optionally filtered by
+    `feature_columns`), each of which is a vector of `n` scalars, the return
+    value is a Tensor of shape `(n, k)` representing `n` input points, where the
+    items in the `k` dimension are sorted lexicographically by `features` key.
+    If `features` is not a dict, it is returned unmodified.
   """
-  if isinstance(features, dict):
-    keys = sorted(features.keys())
-    with ops.colocate_with(features[keys[0]]):
-      features = array_ops.concat([features[k] for k in keys], axis=1)
-  return features
+  if not isinstance(features, dict):
+    return features
+
+  if feature_columns:
+    return fc.input_layer(features, feature_columns)
+
+  keys = sorted(features.keys())
+  with ops.colocate_with(features[keys[0]]):
+    return array_ops.concat([features[k] for k in keys], axis=1)
 
 
 class _ModelFn(object):
@@ -130,7 +139,8 @@ class _ModelFn(object):
 
   def __init__(self, num_clusters, initial_clusters, distance_metric,
                random_seed, use_mini_batch, mini_batch_steps_per_iteration,
-               kmeans_plus_plus_num_retries, relative_tolerance):
+               kmeans_plus_plus_num_retries, relative_tolerance,
+               feature_columns):
     self._num_clusters = num_clusters
     self._initial_clusters = initial_clusters
     self._distance_metric = distance_metric
@@ -139,6 +149,7 @@ class _ModelFn(object):
     self._mini_batch_steps_per_iteration = mini_batch_steps_per_iteration
     self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
     self._relative_tolerance = relative_tolerance
+    self._feature_columns = feature_columns
 
   def model_fn(self, features, mode, config):
     """Model function for the estimator.
@@ -166,7 +177,7 @@ class _ModelFn(object):
     # input_points is a single Tensor. Therefore, the sharding functionality
     # in clustering_ops is unused, and some of the values below are lists of a
     # single item.
-    input_points = _parse_tensor_or_dict(features)
+    input_points = _parse_features_if_necessary(features, self._feature_columns)
 
     # Let N = the number of input_points.
     # all_distances: A list of one matrix of shape (N, num_clusters). Each value
@@ -233,7 +244,57 @@ class _ModelFn(object):
 
 # TODO(agarwal,ands): support sharded input.
 class KMeansClustering(estimator.Estimator):
-  """An Estimator for K-Means clustering."""
+  """An Estimator for K-Means clustering.
+
+  Example:
+  ```
+  import numpy as np
+  import tensorflow as tf
+
+  num_points = 100
+  dimensions = 2
+  points = np.random.uniform(0, 1000, [num_points, dimensions])
+
+  def input_fn():
+    return tf.train.limit_epochs(
+        tf.convert_to_tensor(points, dtype=tf.float32), num_epochs=1)
+
+  num_clusters = 5
+  kmeans = tf.contrib.factorization.KMeansClustering(
+      num_clusters=num_clusters, use_mini_batch=False)
+
+  # train
+  num_iterations = 10
+  previous_centers = None
+  for _ in xrange(num_iterations):
+    kmeans.train(input_fn)
+    cluster_centers = kmeans.cluster_centers()
+    if previous_centers is not None:
+      print 'delta:', cluster_centers - previous_centers
+    previous_centers = cluster_centers
+    print 'score:', kmeans.score(input_fn)
+  print 'cluster centers:', cluster_centers
+
+  # map the input points to their clusters
+  cluster_indices = list(kmeans.predict_cluster_index(input_fn))
+  for i, point in enumerate(points):
+    cluster_index = cluster_indices[i]
+    center = cluster_centers[cluster_index]
+    print 'point:', point, 'is in cluster', cluster_index, 'centered at', center
+  ```
+
+  The `SavedModel` saved by the `export_savedmodel` method does not include the
+  cluster centers. However, the cluster centers may be retrieved by the
+  latest checkpoint saved during training. Specifically,
+  ```
+  kmeans.cluster_centers()
+  ```
+  is equivalent to
+  ```
+  tf.train.load_variable(
+      kmeans.model_dir, KMeansClustering.CLUSTER_CENTERS_VAR_NAME)
+  ```
+  """
 
   # Valid values for the distance_metric constructor argument.
   SQUARED_EUCLIDEAN_DISTANCE = clustering_ops.SQUARED_EUCLIDEAN_DISTANCE
@@ -253,6 +314,9 @@ class KMeansClustering(estimator.Estimator):
   CLUSTER_INDEX = 'cluster_index'
   ALL_DISTANCES = 'all_distances'
 
+  # Variable name used by cluster_centers().
+  CLUSTER_CENTERS_VAR_NAME = clustering_ops.CLUSTERS_VAR_NAME
+
   def __init__(self,
                num_clusters,
                model_dir=None,
@@ -263,7 +327,8 @@ class KMeansClustering(estimator.Estimator):
                mini_batch_steps_per_iteration=1,
                kmeans_plus_plus_num_retries=2,
                relative_tolerance=None,
-               config=None):
+               config=None,
+               feature_columns=None):
     """Creates an Estimator for running KMeans training and inference.
 
     This Estimator implements the following variants of the K-means algorithm:
@@ -309,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `||u - v||_2` which is
-             the square root of the sum of the absolute squares of the elements'
-             difference.
+             between vectors `u` and `v` is defined as \\(||u - v||_2\\)
+             which is the square root of the sum of the absolute squares of
+             the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `1 - (u . v) / (||u||_2 ||v||_2)`.
+             `u` and `v` is defined as \\(1 - (u . v) / (||u||_2 ||v||_2)\\).
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
@@ -330,6 +395,10 @@ class KMeansClustering(estimator.Estimator):
         iterations. Stops learning if the loss changes less than this amount.
         This may not work correctly if `use_mini_batch=True`.
       config: See @{tf.estimator.Estimator}.
+      feature_columns: An optionable iterable containing all the feature columns
+        used by the model. All items in the set should be feature column
+        instances that can be passed to `tf.feature_column.input_layer`. If this
+        is None, all features will be used.
 
     Raises:
       ValueError: An invalid argument was passed to `initial_clusters` or
@@ -349,7 +418,8 @@ class KMeansClustering(estimator.Estimator):
         model_fn=_ModelFn(
             num_clusters, initial_clusters, distance_metric, random_seed,
             use_mini_batch, mini_batch_steps_per_iteration,
-            kmeans_plus_plus_num_retries, relative_tolerance).model_fn,
+            kmeans_plus_plus_num_retries, relative_tolerance,
+            feature_columns).model_fn,
         model_dir=model_dir,
         config=config)
 
@@ -406,4 +476,4 @@ class KMeansClustering(estimator.Estimator):
 
   def cluster_centers(self):
     """Returns the cluster centers."""
-    return self.get_variable_value(clustering_ops.CLUSTERS_VAR_NAME)
+    return self.get_variable_value(KMeansClustering.CLUSTER_CENTERS_VAR_NAME)
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index f9598bfc08c05ea3bba88b3135da0cf2e6bb0c95..88eb9cf692992fe2e1fc4f060ac98dd721c22307 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -27,6 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans
 # pylint: disable=g-import-not-at-top
 from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib
 from tensorflow.python.estimator import run_config
+from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -226,6 +227,44 @@ class KMeansTest(KMeansTestBase):
     self._infer_helper(kmeans, clusters, 10)
     self._infer_helper(kmeans, clusters, 1)
 
+  def _parse_feature_dict_helper(self, features, parsed_feature_dict):
+    # Perform a sanity check.
+    self.assertEqual(features.shape, parsed_feature_dict.shape)
+    self.assertEqual(features.dtype, parsed_feature_dict.dtype)
+    # Then check that running the tensor yields the original list of points.
+    with self.test_session() as sess:
+      parsed_points = sess.run(parsed_feature_dict)
+      self.assertAllEqual(self.points, parsed_points)
+
+  def test_parse_features(self):
+    """Tests the various behaviours of kmeans._parse_features_if_necessary."""
+
+    # No-op if a tensor is passed in.
+    features = constant_op.constant(self.points)
+    parsed_features = kmeans_lib._parse_features_if_necessary(features, None)
+    self.assertAllEqual(features, parsed_features)
+
+    # All values from a feature dict are transformed into a tensor.
+    feature_dict = {
+        'x': [[point[0]] for point in self.points],
+        'y': [[point[1]] for point in self.points]
+    }
+    parsed_feature_dict = kmeans_lib._parse_features_if_necessary(
+        feature_dict, None)
+    self._parse_feature_dict_helper(features, parsed_feature_dict)
+
+    # Only the feature_columns of a feature dict are transformed into a tensor.
+    feature_dict_with_extras = {
+        'foo': 'bar',
+        'x': [[point[0]] for point in self.points],
+        'baz': {'fizz': 'buzz'},
+        'y': [[point[1]] for point in self.points]
+    }
+    feature_columns = [fc.numeric_column(key='x'), fc.numeric_column(key='y')]
+    parsed_feature_dict = kmeans_lib._parse_features_if_necessary(
+        feature_dict_with_extras, feature_columns)
+    self._parse_feature_dict_helper(features, parsed_feature_dict)
+
 
 class KMeansTestMultiStageInit(KMeansTestBase):
 
@@ -374,7 +413,7 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     self.assertAllClose(score, self.true_score, atol=1e-2)
 
   def test_predict_kmeans_plus_plus(self):
-    # Most points are concetrated near one center. KMeans++ is likely to find
+    # Most points are concentrated near one center. KMeans++ is likely to find
     # the less populated centers.
     points = np.array(
         [[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3], [-3.1, -3.2],
@@ -394,7 +433,6 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     true_assignments = [0] * 2 + [1] * 2 + [2] * 8
     true_score = len(points) - np.tensordot(
         normalize(points), true_centers[true_assignments])
-
     kmeans = kmeans_lib.KMeansClustering(
         3,
         initial_clusters=self.initial_clusters,
@@ -566,7 +604,7 @@ class KMeansTestQueues(test.TestCase):
     return _fn
 
   # This test makes sure that there are no deadlocks when using a QueueRunner.
-  # Note that since cluster initialization is dependendent on inputs, if input
+  # Note that since cluster initialization is dependent on inputs, if input
   # is generated using a QueueRunner, one has to make sure that these runners
   # are started before the initialization.
   def test_queues(self):
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 4fe22ea26ec5f5a43f1c99d1fee518b1d326c5c9..ca46c39baa16a7fddb96121e0402fc35d24ce1c2 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -216,7 +216,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
         name=WALSMatrixFactorization.LOSS,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     # The root weighted squared error =
-    #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
+    #   \\(\sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )\\)
     rwse_var = variable_scope.variable(
         0.,
         trainable=False,
@@ -235,7 +235,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
         num_items: An integer, the total number of items of this axis.
         update_fn: A function that takes one argument (`sp_input`), and that
         returns a tuple of
-          * new_factors: A flot Tensor of the factor values after update.
+          * new_factors: A float Tensor of the factor values after update.
           * update_op: a TensorFlow op which updates the factors.
           * loss: A float Tensor, the unregularized loss.
           * reg_loss: A float Tensor, the regularization loss.
@@ -490,11 +490,11 @@ class WALSMatrixFactorization(estimator.Estimator):
           and the problem simplifies to ALS. Note that, in this case,
           col_weights must also be set to "None".
         - List of lists of non-negative scalars, of the form
-          [[w_0, w_1, ...], [w_k, ... ], [...]],
+          \\([[w_0, w_1, ...], [w_k, ... ], [...]]\\),
           where the number of inner lists equal to the number of row factor
           shards and the elements in each inner list are the weights for the
           rows of that shard. In this case,
-          w_ij = unonbserved_weight + row_weights[i] * col_weights[j].
+          \\(w_ij = unonbserved_weight + row_weights[i] * col_weights[j]\\).
         - A non-negative scalar: This value is used for all row weights.
           Note that it is allowed to have row_weights as a list and col_weights
           as a scalar, or vice-versa.
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 6fc053759c58d30c24657dd22e7d12be46fc7a7e..aab7d0c9e8874269bfa5f33193b0dc0ba4bbc9cd 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -8,30 +8,47 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "feature_column_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":sequential_feature_column",
+        ":sequence_feature_column",
+        "//tensorflow/python:util",
     ],
 )
 
 py_library(
-    name = "sequential_feature_column",
-    srcs = ["python/feature_column/sequential_feature_column.py"],
+    name = "sequence_feature_column",
+    srcs = ["python/feature_column/sequence_feature_column.py"],
     srcs_version = "PY2AND3",
-    deps = [],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
+py_test(
+    name = "sequence_feature_column_test",
+    srcs = ["python/feature_column/sequence_feature_column_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sequence_feature_column",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+    ],
 )
diff --git a/tensorflow/contrib/feature_column/__init__.py b/tensorflow/contrib/feature_column/__init__.py
index 6da7b126931effae9cc97091a27070d7013450d4..baa8c1567a5aeb39976ab04c54ae2728ba050a7c 100644
--- a/tensorflow/contrib/feature_column/__init__.py
+++ b/tensorflow/contrib/feature_column/__init__.py
@@ -19,12 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.feature_column.python.feature_column.sequential_feature_column import *
+from tensorflow.contrib.feature_column.python.feature_column.sequence_feature_column import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
+    'sequence_categorical_column_with_hash_bucket',
+    'sequence_categorical_column_with_identity',
+    'sequence_categorical_column_with_vocabulary_list',
+    'sequence_categorical_column_with_vocabulary_file',
+    'sequence_input_layer',
+    'sequence_numeric_column',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..555beddeaab419bcb23d06f960d370b706d744c8
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -0,0 +1,447 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental methods for tf.feature_column sequence input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import collections
+
+
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
+
+# pylint: disable=protected-access
+# TODO(b/73827486): Support SequenceExample.
+
+
+def sequence_input_layer(
+    features,
+    feature_columns,
+    weight_collections=None,
+    trainable=True):
+  """"Builds input layer for sequence input.
+
+  All `feature_columns` must be sequence dense columns with the same
+  `sequence_length`. The output of this method can be fed into sequence
+  networks, such as RNN.
+
+  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+  `T` is the maximum sequence length for this batch, which could differ from
+  batch to batch.
+
+  If multiple `feature_columns` are given with `Di` `num_elements` each, their
+  outputs are concatenated. So, the final `Tensor` has shape
+  `[batch_size, T, D0 + D1 + ... + Dn]`.
+
+  Example:
+
+  ```python
+  rating = sequence_numeric_column('rating')
+  watches = sequence_categorical_column_with_identity(
+      'watches', num_buckets=1000)
+  watches_embedding = embedding_column(watches, dimension=10)
+  columns = [rating, watches]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    features: A dict mapping keys to tensors.
+    feature_columns: An iterable of dense sequence columns. Valid columns are
+      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
+      - `sequence_numeric_column`.
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES`.
+
+  Returns:
+    An `(input_layer, sequence_length)` tuple where:
+    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+        `T` is the maximum sequence length for this batch, which could differ
+        from batch to batch. `D` is the sum of `num_elements` for all
+        `feature_columns`.
+    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+        length for each example.
+
+  Raises:
+    ValueError: If any of the `feature_columns` is the wrong type.
+  """
+  feature_columns = fc._clean_feature_columns(feature_columns)
+  for c in feature_columns:
+    if not isinstance(c, fc._SequenceDenseColumn):
+      raise ValueError(
+          'All feature_columns must be of type _SequenceDenseColumn. '
+          'You can wrap a sequence_categorical_column with an embedding_column '
+          'or indicator_column. '
+          'Given (type {}): {}'.format(type(c), c))
+
+  with variable_scope.variable_scope(
+      None, default_name='sequence_input_layer', values=features.values()):
+    builder = fc._LazyBuilder(features)
+    output_tensors = []
+    sequence_lengths = []
+    ordered_columns = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):
+        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        # Flattens the final dimension to produce a 3D Tensor.
+        num_elements = column._variable_shape.num_elements()
+        shape = array_ops.shape(dense_tensor)
+        output_tensors.append(
+            array_ops.reshape(
+                dense_tensor,
+                shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
+        sequence_lengths.append(sequence_length)
+    fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
+    fc._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
+    sequence_length = _assert_all_equal_and_return(sequence_lengths)
+    return array_ops.concat(output_tensors, -1), sequence_length
+
+
+def sequence_categorical_column_with_identity(
+    key, num_buckets, default_value=None):
+  """Returns a feature column that represents sequences of integers.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  watches = sequence_categorical_column_with_identity(
+      'watches', num_buckets=1000)
+  watches_embedding = embedding_column(watches, dimension=10)
+  columns = [watches_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    num_buckets: Range of inputs. Namely, inputs are expected to be in the
+      range `[0, num_buckets)`.
+    default_value: If `None`, this column's graph operations will fail for
+      out-of-range inputs. Otherwise, this value must be in the range
+      `[0, num_buckets)`, and will replace out-of-range inputs.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `num_buckets` is less than one.
+    ValueError: if `default_value` is not in range `[0, num_buckets)`.
+  """
+  return fc._SequenceCategoricalColumn(
+      fc.categorical_column_with_identity(
+          key=key,
+          num_buckets=num_buckets,
+          default_value=default_value))
+
+
+def sequence_categorical_column_with_hash_bucket(
+    key, hash_bucket_size, dtype=dtypes.string):
+  """A sequence of categorical terms where ids are set by hashing.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  tokens = sequence_categorical_column_with_hash_bucket(
+      'tokens', hash_bucket_size=1000)
+  tokens_embedding = embedding_column(tokens, dimension=10)
+  columns = [tokens_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    hash_bucket_size: An int > 1. The number of buckets.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `hash_bucket_size` is not greater than 1.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return fc._SequenceCategoricalColumn(
+      fc.categorical_column_with_hash_bucket(
+          key=key,
+          hash_bucket_size=hash_bucket_size,
+          dtype=dtype))
+
+
+def sequence_categorical_column_with_vocabulary_file(
+    key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
+    default_value=None, dtype=dtypes.string):
+  """A sequence of categorical terms where ids use a vocabulary file.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  states = sequence_categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  states_embedding = embedding_column(states, dimension=10)
+  columns = [states_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return fc._SequenceCategoricalColumn(
+      fc.categorical_column_with_vocabulary_file(
+          key=key,
+          vocabulary_file=vocabulary_file,
+          vocabulary_size=vocabulary_size,
+          num_oov_buckets=num_oov_buckets,
+          default_value=default_value,
+          dtype=dtype))
+
+
+def sequence_categorical_column_with_vocabulary_list(
+    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+  """A sequence of categorical terms where ids use an in-memory list.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  colors = sequence_categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
+      num_oov_buckets=2)
+  colors_embedding = embedding_column(colors, dimension=3)
+  columns = [colors_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
+      is mapped to the index of its value (if present) in `vocabulary_list`.
+      Must be castable to `dtype`.
+    dtype: The type of features. Only string and integer types are supported.
+      If `None`, it will be inferred from `vocabulary_list`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
+      hash of the input value. A positive `num_oov_buckets` can not be specified
+      with `default_value`.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: if `dtype` is not integer or string.
+  """
+  return fc._SequenceCategoricalColumn(
+      fc.categorical_column_with_vocabulary_list(
+          key=key,
+          vocabulary_list=vocabulary_list,
+          dtype=dtype,
+          default_value=default_value,
+          num_oov_buckets=num_oov_buckets))
+
+
+def sequence_numeric_column(
+    key,
+    shape=(1,),
+    default_value=0.,
+    dtype=dtypes.float32):
+  """Returns a feature column that represents sequences of numeric data.
+
+  Example:
+
+  ```python
+  temperature = sequence_numeric_column('temperature')
+  columns = [temperature]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input features.
+    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
+      each example must contain `2 * sequence_length` values.
+    default_value: A single value compatible with `dtype` that is used for
+      padding the sparse data into a dense `Tensor`.
+    dtype: The type of values.
+
+  Returns:
+    A `_SequenceNumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int.
+    ValueError: if any dimension in shape is not a positive integer.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = fc._check_shape(shape=shape, key=key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+
+  return _SequenceNumericColumn(
+      key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype)
+
+
+def _assert_all_equal_and_return(tensors, name=None):
+  """Asserts that all tensors are equal and returns the first one."""
+  with ops.name_scope(name, 'assert_all_equal', values=tensors):
+    if len(tensors) == 1:
+      return tensors[0]
+    assert_equal_ops = []
+    for t in tensors[1:]:
+      assert_equal_ops.append(check_ops.assert_equal(tensors[0], t))
+    with ops.control_dependencies(assert_equal_ops):
+      return array_ops.identity(tensors[0])
+
+
+class _SequenceNumericColumn(
+    fc._SequenceDenseColumn,
+    collections.namedtuple(
+        '_SequenceNumericColumn',
+        ['key', 'shape', 'default_value', 'dtype'])):
+  """Represents sequences of numeric data."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.key)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(self.shape)
+
+  def _get_sequence_dense_tensor(
+      self, inputs, weight_collections=None, trainable=None):
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    sp_tensor = inputs.get(self)
+    dense_tensor = sparse_ops.sparse_tensor_to_dense(
+        sp_tensor, default_value=self.default_value)
+    # Reshape into [batch_size, T, variable_shape].
+    dense_shape = array_ops.concat(
+        [array_ops.shape(dense_tensor)[:1], [-1], self._variable_shape],
+        axis=0)
+    dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
+    sequence_length = fc._sequence_length_from_sparse_tensor(
+        sp_tensor, num_elements=self._variable_shape.num_elements())
+    return fc._SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
+
+# pylint: enable=protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..88f5d535162939e063eb1e7f43d495137c5adef4
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -0,0 +1,816 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sequential_feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+
+
+class SequenceInputLayerTest(test.TestCase):
+
+  def test_embedding_column(self):
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [2, 0]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 2, 0),
+        dense_shape=(2, 2))
+
+    embedding_dimension_a = 2
+    embedding_values_a = (
+        (1., 2.),  # id 0
+        (3., 4.),  # id 1
+        (5., 6.)  # id 2
+    )
+    embedding_dimension_b = 3
+    embedding_values_b = (
+        (11., 12., 13.),  # id 0
+        (14., 15., 16.),  # id 1
+        (17., 18., 19.)  # id 2
+    )
+    def _get_initializer(embedding_dimension, embedding_values):
+      def _initializer(shape, dtype, partition_info):
+        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+        self.assertEqual(dtypes.float32, dtype)
+        self.assertIsNone(partition_info)
+        return embedding_values
+      return _initializer
+
+    expected_input_layer = [
+        # example 0, ids_a [2], ids_b [1]
+        [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
+        # example 1, ids_a [0, 1], ids_b [2, 0]
+        [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],
+    ]
+    expected_sequence_length = [1, 2]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc.embedding_column(
+        categorical_column_a, dimension=embedding_dimension_a,
+        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_b = fc.embedding_column(
+        categorical_column_b, dimension=embedding_dimension_b,
+        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[embedding_column_b, embedding_column_a])
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
+         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
+        tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
+      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_embedding_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence categorical column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc.embedding_column(
+        categorical_column_a, dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_embedding\. categorical_column must be of '
+        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[embedding_column_a])
+
+  def test_indicator_column(self):
+    vocabulary_size_a = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    vocabulary_size_b = 2
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [1, 0]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 1, 0),
+        dense_shape=(2, 2))
+
+    expected_input_layer = [
+        # example 0, ids_a [2], ids_b [1]
+        [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
+        # example 1, ids_a [0, 1], ids_b [1, 0]
+        [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]],
+    ]
+    expected_sequence_length = [1, 2]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size_b)
+    indicator_column_b = fc.indicator_column(categorical_column_b)
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[indicator_column_b, indicator_column_a])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_indicator_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence categorical column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In indicator_column: aaa_indicator\. categorical_column must be of '
+        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[indicator_column_a])
+
+  def test_numeric_column(self):
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0.], [1]]
+        # example 1, [[10.]]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+    expected_input_layer = [
+        [[0.], [1.]],
+        [[10.], [0.]],
+    ]
+    expected_sequence_length = [2, 1]
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_numeric_column_multi_dim(self):
+    """Tests sequence_input_layer for multi-dimensional numeric_column."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+        # example 1, [[[10., 11.],  [12., 13.]]]
+        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
+                 (1, 0), (1, 1), (1, 2), (1, 3)),
+        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+        dense_shape=(2, 8))
+    # The output of numeric_column._get_dense_tensor should be flattened.
+    expected_input_layer = [
+        [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+        [[10., 11., 12., 13.], [0., 0., 0., 0.]],
+    ]
+    expected_sequence_length = [2, 1]
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_sequence_length_not_equal(self):
+    """Tests that an error is raised when sequence lengths are not equal."""
+    # Input a with sequence_length = [2, 1]
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+    # Input b with sequence_length = [1, 1]
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0)),
+        values=(1., 10.),
+        dense_shape=(2, 2))
+    numeric_column_a = sfc.sequence_numeric_column('aaa')
+    numeric_column_b = sfc.sequence_numeric_column('bbb')
+
+    _, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        feature_columns=[numeric_column_a, numeric_column_b])
+
+    with monitored_session.MonitoredSession() as sess:
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[Condition x == y did not hold element-wise:\] '
+          r'\[x \(sequence_input_layer/aaa/sequence_length:0\) = \] \[2 1\] '
+          r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
+        sess.run(sequence_length)
+
+
+class InputLayerTest(test.TestCase):
+  """Tests input_layer with sequence feature columns."""
+
+  def test_embedding_column(self):
+    """Tests that error is raised for sequence embedding column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc.embedding_column(
+        categorical_column_a, dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_embedding\. categorical_column must not be '
+        r'of type _SequenceCategoricalColumn\.'):
+      _ = fc.input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[embedding_column_a])
+
+  def test_indicator_column(self):
+    """Tests that error is raised for sequence indicator column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In indicator_column: aaa_indicator\. categorical_column must not be '
+        r'of type _SequenceCategoricalColumn\.'):
+      _ = fc.input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[indicator_column_a])
+
+
+def _assert_sparse_tensor_value(test_case, expected, actual):
+  _assert_sparse_tensor_indices_shape(test_case, expected, actual)
+
+  test_case.assertEqual(
+      np.array(expected.values).dtype, np.array(actual.values).dtype)
+  test_case.assertAllEqual(expected.values, actual.values)
+
+
+def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
+  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+  test_case.assertAllEqual(expected.indices, actual.indices)
+
+  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class SequenceCategoricalColumnWithIdentityTest(test.TestCase):
+
+  def test_get_sparse_tensors(self):
+    column = sfc.sequence_categorical_column_with_identity(
+        'aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 2, 0),
+        dense_shape=(2, 2))
+    expected_sparse_ids = sparse_tensor.SparseTensorValue(
+        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+        values=np.array((1, 2, 0), dtype=np.int64),
+        dense_shape=(2, 2, 1))
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self,
+          expected_sparse_ids,
+          id_weight_pair.id_tensor.eval(session=sess))
+
+  def test_get_sparse_tensors_inputs3d(self):
+    """Tests _get_sparse_tensors when the input is already 3D Tensor."""
+    column = sfc.sequence_categorical_column_with_identity(
+        'aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+        values=(1, 2, 0),
+        dense_shape=(2, 2, 1))
+
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r'Column aaa expected ID tensor of rank 2\.\s*'
+        r'id_tensor shape:\s*\[2 2 1\]'):
+      id_weight_pair = column._get_sparse_tensors(
+          _LazyBuilder({'aaa': inputs}))
+      with monitored_session.MonitoredSession() as sess:
+        id_weight_pair.id_tensor.eval(session=sess)
+
+
+class SequenceCategoricalColumnWithHashBucketTest(test.TestCase):
+
+  def test_get_sparse_tensors(self):
+    column = sfc.sequence_categorical_column_with_hash_bucket(
+        'aaa', hash_bucket_size=10)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+
+    expected_sparse_ids = sparse_tensor.SparseTensorValue(
+        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+        # Ignored to avoid hash dependence in test.
+        values=np.array((0, 0, 0), dtype=np.int64),
+        dense_shape=(2, 2, 1))
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_indices_shape(
+          self,
+          expected_sparse_ids,
+          id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase):
+
+  def _write_vocab(self, vocab_strings, file_name):
+    vocab_file = os.path.join(self.get_temp_dir(), file_name)
+    with open(vocab_file, 'w') as f:
+      f.write('\n'.join(vocab_strings))
+    return vocab_file
+
+  def setUp(self):
+    super(SequenceCategoricalColumnWithVocabularyFileTest, self).setUp()
+
+    vocab_strings = ['omar', 'stringer', 'marlo']
+    self._wire_vocabulary_file_name = self._write_vocab(vocab_strings,
+                                                        'wire_vocabulary.txt')
+    self._wire_vocabulary_size = 3
+
+  def test_get_sparse_tensors(self):
+    column = sfc.sequence_categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    expected_sparse_ids = sparse_tensor.SparseTensorValue(
+        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+        values=np.array((2, -1, 0), dtype=np.int64),
+        dense_shape=(2, 2, 1))
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self,
+          expected_sparse_ids,
+          id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithVocabularyListTest(test.TestCase):
+
+  def test_get_sparse_tensors(self):
+    column = sfc.sequence_categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    expected_sparse_ids = sparse_tensor.SparseTensorValue(
+        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+        values=np.array((2, -1, 0), dtype=np.int64),
+        dense_shape=(2, 2, 1))
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self,
+          expected_sparse_ids,
+          id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceEmbeddingColumnTest(test.TestCase):
+
+  def test_get_sequence_dense_tensor(self):
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 2))
+
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    expected_lookups = [
+        # example 0, ids [2]
+        [[7., 11.], [0., 0.]],
+        # example 1, ids [0, 1]
+        [[1., 2.], [3., 5.]],
+        # example 2, ids []
+        [[0., 0.], [0., 0.]],
+        # example 3, ids [1]
+        [[3., 5.], [0., 0.]],
+    ]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
+
+  def test_sequence_length(self):
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length = [1, 2]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2)
+
+    _, sequence_length = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2)
+
+    _, sequence_length = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+class SequenceIndicatorColumnTest(test.TestCase):
+
+  def test_get_sequence_dense_tensor(self):
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 2))
+
+    expected_lookups = [
+        # example 0, ids [2]
+        [[0., 0., 1.], [0., 0., 0.]],
+        # example 1, ids [0, 1]
+        [[1., 0., 0.], [0., 1., 0.]],
+        # example 2, ids []
+        [[0., 0., 0.], [0., 0., 0.]],
+        # example 3, ids [1]
+        [[0., 1., 0.], [0., 0., 0.]],
+    ]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc.indicator_column(categorical_column)
+
+    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_lookups, indicator_tensor.eval(session=sess))
+
+  def test_sequence_length(self):
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length = [1, 2]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc.indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc.indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+class SequenceNumericColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    a = sfc.sequence_numeric_column('aaa')
+    self.assertEqual('aaa', a.key)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual('aaa', a._var_scope_name)
+    self.assertEqual((1,), a.shape)
+    self.assertEqual(0., a.default_value)
+    self.assertEqual(dtypes.float32, a.dtype)
+
+  def test_shape_saved_as_tuple(self):
+    a = sfc.sequence_numeric_column('aaa', shape=[1, 2])
+    self.assertEqual((1, 2), a.shape)
+
+  def test_shape_must_be_positive_integer(self):
+    with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
+      sfc.sequence_numeric_column('aaa', shape=[1.0])
+
+    with self.assertRaisesRegexp(
+        ValueError, 'shape dimensions must be greater than 0'):
+      sfc.sequence_numeric_column('aaa', shape=[0])
+
+  def test_dtype_is_convertible_to_float(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'dtype must be convertible to float'):
+      sfc.sequence_numeric_column('aaa', dtype=dtypes.string)
+
+  def test_get_sequence_dense_tensor(self):
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0.], [1]]
+        # example 1, [[10.]]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+    expected_dense_tensor = [
+        [[0.], [1.]],
+        [[10.], [0.]],
+    ]
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  def test_get_sequence_dense_tensor_with_shape(self):
+    """Tests get_sequence_dense_tensor with shape !=(1,)."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
+        # example 1, [[10., 11., 12.]]
+        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
+                 (1, 0), (1, 1), (1, 2)),
+        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
+        dense_shape=(2, 6))
+    expected_dense_tensor = [
+        [[0., 1., 2.], [3., 4., 5.]],
+        [[10., 11., 12.], [0., 0., 0.]],
+    ]
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
+
+    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  def test_get_dense_tensor_multi_dim(self):
+    """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+        # example 1, [[[10., 11.],  [12., 13.]]]
+        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
+                 (1, 0), (1, 1), (1, 2), (1, 3)),
+        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+        dense_shape=(2, 8))
+    expected_dense_tensor = [
+        [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
+        [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
+    ]
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  def test_sequence_length(self):
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
+        # example 1, [[10., 11., 12.]]
+        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
+                 (1, 0), (1, 1), (1, 2)),
+        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
+        dense_shape=(2, 6))
+    expected_sequence_length = [2, 1]
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
+
+    _, sequence_length = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_shape(self):
+    """Tests _sequence_length with shape !=(1,)."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0.], [1]]
+        # example 1, [[10.]]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+    expected_sequence_length = [2, 1]
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    _, sequence_length = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values []
+        # example 1, values [[0.], [1.]]
+        # example 2, [[2.]]
+        # example 3, values []
+        # example 4, [[3.]]
+        # example 5, values []
+        indices=((1, 0), (1, 1), (2, 0), (4, 0)),
+        values=(0., 1., 2., 3.),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 2, 1, 0, 1, 0]
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    _, sequence_length = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index eccce99071dc1477cf4f3bb152f3304b3b0fc35a..f7b3273a4d35eadb9fad49399b7bf18d4bd33503 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -180,15 +180,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index 6b455567d766dbe6d380a498bd7f521db27e077b..59bad8982dd163f89f37e1a0a9d5017d0c495de3 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -74,15 +74,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index e61221a6b0d34373279a379f356c99c379488182..cca1a054193815793846a8753678f75bdfd72a6c 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -28,7 +28,7 @@
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 
 using tensorflow::strings::StrCat;
@@ -256,6 +256,9 @@ Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
         if (p != std::string::npos) {
           string rgb24 = line.substr(p + 9, line.find(" ", p + 9));
           rgb24 = rgb24.substr(0, rgb24.find(","));
+          // Strip anything after " ", in case the format is
+          // `640x360 [SAR 1:1 DAR 16:9]`
+          rgb24 = rgb24.substr(0, rgb24.find(" "));
           string rgb24_width = rgb24.substr(0, rgb24.find("x"));
           string rgb24_height = rgb24.substr(rgb24_width.length() + 1);
           if (strings::safe_strtou32(rgb24_width, &width_value) &&
@@ -270,8 +273,10 @@ Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
       // We only look for the first stream mapping to have the number of the
       // frames.
       // Once processed we will not further process stream mapping section.
-      if (line.find("frame=  ") == 0) {
-        string number = line.substr(8, line.find(" ", 8));
+      if (line.find("frame=") == 0) {
+        // The format might be `frame=  166 ` or `frame=12488 `
+        string number = line.substr(6);
+        number = number.substr(number.find_first_not_of(" "));
         number = number.substr(0, number.find(" "));
         if (strings::safe_strtou32(number, &frames_value)) {
           in_mapping = false;
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 9e5f54f0973eae899ca65e4098358107053cb7d4..249debbdf6dff412a5be6cb1032fc4a3567c7d0b 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -28,7 +28,6 @@ tf_custom_op_py_library(
         "python/framework/graph_util.py",
         "python/framework/tensor_util.py",
         "python/ops/__init__.py",
-        "python/ops/accumulate_n_v2.py",
         "python/ops/arg_scope.py",
         "python/ops/audio_ops.py",
         "python/ops/checkpoint_ops.py",
@@ -63,7 +62,9 @@ tf_custom_op_py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:smart_cond",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:state_ops_gen",
@@ -92,6 +93,7 @@ tf_kernel_library(
     ],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
@@ -161,23 +163,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "accumulate_n_v2_test",
-    size = "small",
-    srcs = ["python/ops/accumulate_n_v2_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-    ],
-)
-
 cuda_py_test(
     name = "critical_section_test",
     size = "medium",
@@ -185,31 +170,16 @@ cuda_py_test(
     additional_deps = [
         "//tensorflow/python:client_testlib",
         ":framework_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:resource_variable_ops",
-    ],
-)
-
-py_test(
-    name = "accumulate_n_v2_eager_test",
-    size = "small",
-    srcs = ["python/ops/accumulate_n_v2_eager_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:tape",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -354,15 +324,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 4746cfe0720cb20f530dc919fe062db17a1dfe84..10d1ecc738de6777784200ba934a521dff592e28 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -71,6 +71,10 @@ See the @{$python/contrib.framework} guide.
 @@model_variable
 @@variable
 @@VariableDeviceChooser
+@@convolutional_delta_orthogonal
+@@convolutional_orthogonal_1d
+@@convolutional_orthogonal_2d
+@@convolutional_orthogonal_3d
 @@zero_initializer
 
 @@load_checkpoint
@@ -82,11 +86,16 @@ See the @{$python/contrib.framework} guide.
 @@load_linear_multiclass_bias_initializer
 @@load_variable_slot_initializer
 
+@@argsort
 @@py_func
 @@sort
 
 @@get_placeholders
 
+@@smart_cond
+@@smart_constant_value
+@@smart_case
+
 @@CriticalSection
 
 @@BoundedTensorSpec
@@ -99,19 +108,39 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.framework.python.framework import *
+from tensorflow.contrib.framework.python.framework import nest
 from tensorflow.contrib.framework.python.ops import *
 # pylint: enable=unused-import,wildcard-import
 
 from tensorflow.python.framework.ops import prepend_name_scope
 from tensorflow.python.framework.ops import strip_name_scope
-from tensorflow.python.ops.control_flow_ops import smart_cond
-from tensorflow.python.ops.control_flow_ops import smart_constant_value
-
+from tensorflow.python.framework.smart_cond import smart_case
+from tensorflow.python.framework.smart_cond import smart_cond
+from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
-
+from tensorflow.python.ops.array_ops import broadcast_to
+from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest']
-
+_allowed_symbols = ['nest', 'broadcast_to']
+_nest_allowed_symbols = [
+    'assert_same_structure',
+    'is_sequence',
+    'flatten',
+    'flatten_dict_items',
+    'pack_sequence_as',
+    'map_structure',
+    'assert_shallow_structure',
+    'flatten_up_to',
+    'map_structure_up_to',
+    'get_traverse_shallow_structure',
+    'yield_flat_paths',
+    'flatten_with_joined_string_paths',
+]
+
+remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols)
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
index 5bf6b67529579e71a615c27e035111a58d5c02e0..6ab3f460b36d5dd632daee1af68d62529df9cb09 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_var.h"
 
 namespace tensorflow {
 
@@ -85,4 +86,74 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ZeroVarInitializer : public OpKernel {
+ public:
+  explicit ZeroVarInitializer(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    Var* variable = nullptr;
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
+                            ctx, HandleFromInput(ctx, 0), &variable,
+                            [this, ctx](Var** var_ptr) {
+                              *var_ptr = new Var(dtype_);
+                              PersistentTensor unused;
+                              Tensor* var_tensor = nullptr;
+                              AllocatorAttributes attr;
+                              attr.set_gpu_compatible(true);
+                              attr.set_nic_compatible(true);
+                              TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+                                  dtype_, shape_, &unused, &var_tensor, attr));
+
+                              functor::TensorSetZero<Device, T>()(
+                                  ctx->eigen_device<Device>(),
+                                  var_tensor->flat<T>());
+
+                              *(*var_ptr)->tensor() = *var_tensor;
+
+                              return Status::OK();
+                            }));
+
+    core::ScopedUnref scoped(variable);
+    mutex_lock ml(*variable->mu());
+
+    OP_REQUIRES(ctx, !variable->is_initialized,
+                errors::InvalidArgument("input is already initialized"));
+
+    variable->is_initialized = true;
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<ResourceHandle>()() = HandleFromInput(ctx, 0);
+  }
+
+ private:
+  DataType dtype_;
+  TensorShape shape_;
+};
+
+#define REGISTER_CPU_KERNELS(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer")          \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<type>("dtype"), \
+                          ZeroVarInitializer<Eigen::ThreadPoolDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+#define REGISTER_GPU_KERNELS(type)                           \
+  REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer")         \
+                              .Device(DEVICE_GPU)            \
+                              .TypeConstraint<type>("dtype") \
+                              .HostMemory("var"),            \
+                          ZeroVarInitializer<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc
index 706134ba9a51de6253ba7463b17ff662ea740ed0..f6ee6cdb5713c113aff2228db58244ac73536d9a 100644
--- a/tensorflow/contrib/framework/ops/variable_ops.cc
+++ b/tensorflow/contrib/framework/ops/variable_ops.cc
@@ -39,4 +39,33 @@ ref: Should be from a `Variable` node.
 output_ref:= Same as "ref".
 )doc");
 
+REGISTER_OP("ZeroVarInitializer")
+    .Input("var: resource")
+    .Output("output_var: resource")
+    .Attr("dtype: type")
+    .Attr("shape: shape")
+    .SetAllowsUninitializedInput()
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t));
+      PartialTensorShape p;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &p));
+      shape_inference::ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initialize 'var' with all zeros. This op requires that the resource var is not
+initialized. The var will first be allocated memory, then be filled with all
+zeros. This op is intended to save memory during initialization,
+if you use this op, you should not run initializer of the var.
+
+var: Should be a ResourceVariable.
+output_var:= Same as "var".
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/python/framework/experimental_test.py b/tensorflow/contrib/framework/python/framework/experimental_test.py
index 8e54e09e04ee3c0ddbd4fa84cc0912cb70c93e62..cfdc7df7d8fd4c1406bf447a79038ac33b11e047 100644
--- a/tensorflow/contrib/framework/python/framework/experimental_test.py
+++ b/tensorflow/contrib/framework/python/framework/experimental_test.py
@@ -49,7 +49,6 @@ class ExperimentalTest(test.TestCase):
                      "\nTHIS FUNCTION IS EXPERIMENTAL. It may change or "
                      "be removed at any time, and without warning."
                      "\n"
-                     "\n"
                      "\nArgs:"
                      "\n  arg0: Arg 0."
                      "\n  arg1: Arg 1."
diff --git a/tensorflow/contrib/framework/python/framework/graph_util.py b/tensorflow/contrib/framework/python/framework/graph_util.py
index 49eec3a3f1a0f357ea3adfade51e71cb0f89942d..2703224b1bf62831b6088558d4f93950fe938c10 100644
--- a/tensorflow/contrib/framework/python/framework/graph_util.py
+++ b/tensorflow/contrib/framework/python/framework/graph_util.py
@@ -85,14 +85,19 @@ def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
       if n not in reachable_by_input and n not in output_nodes_set:
         # n is between input and output, i.e., part of the fused op
         next_to_visit = [n]
+        visited = set()
         while next_to_visit:
           cur_node = next_to_visit[0]
+          visited.add(cur_node)
           del next_to_visit[0]
           if cur_node in reachable_by_input and cur_node not in input_nodes_set:
             raise TypeError("Node %s uses input %s not in input_nodes." %
                             (n, cur_node))
           if cur_node not in input_nodes_set:
-            next_to_visit += name_to_input_name[cur_node]
+            next_to_visit += [
+                input_node for input_node in name_to_input_name[cur_node]
+                if input_node not in visited
+            ]
     elif n not in reachable_by_input:
       nodes_post_output.append(n)
 
diff --git a/tensorflow/contrib/framework/python/framework/graph_util_test.py b/tensorflow/contrib/framework/python/framework/graph_util_test.py
index b8a6d109e19211d271c2b15bac66ddacd38fe395..812c5fbd8cb759aef6eb1aad532c03794b2ceaf4 100644
--- a/tensorflow/contrib/framework/python/framework/graph_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/graph_util_test.py
@@ -42,7 +42,8 @@ class GraphUtilTest(test.TestCase):
     graph_def = graph_pb2.GraphDef()
     node_a = GetNewNode('A', 'Placeholder', [])
     node_b = GetNewNode('B', 'Op1', ['A'])
-    node_c = GetNewNode('C', 'Op1', ['B'])
+    # A loop in the part that will be fused.
+    node_c = GetNewNode('C', 'Op1', ['B', 'C'])
     node_d = GetNewNode('D', 'Op1', ['C'])
     node_e = GetNewNode('E', 'Op1', ['D'])
     graph_def.node.extend([node_a, node_b, node_c, node_d, node_e])
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index 8cdb340f2ddd9b3a7f55c1937ef045f4627e99be..8fc4f60492b0bfb22ea78cb7b5906e452bb6da58 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -48,7 +48,7 @@ class LocalVariabletest(test.TestCase):
       variables = variables_lib.local_variables()
       self.assertEquals(2, len(variables))
       self.assertRaises(errors_impl.OpError, sess.run, variables)
-      variables_lib.initialize_variables(variables).run()
+      variables_lib.variables_initializer(variables).run()
       self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
 
 
@@ -209,6 +209,7 @@ class WithShapeTest(test.TestCase):
         self.assertRaisesRegexp(errors_impl.OpError, "Wrong shape",
                                 tensor_2x2.eval, {tensor_no_shape: [42.0]})
 
+  @test_util.enable_c_shapes
   def test_with_shape_partial(self):
     with self.test_session():
       tensor_partial_shape = array_ops.placeholder(dtypes.float32)
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
deleted file mode 100644
index 476528b0dd3df05239d5dc402b466e06dd789985..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Ops that will eventually be folded into tensorflow/python/ops/math_ops.py
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import math_ops
-
-
-
-def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
-  """Returns the element-wise sum of a list of tensors.
-
-  Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
-  otherwise, these are inferred.
-
-  `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-  wait for all of its inputs to be ready before beginning to sum. This can
-  save memory if inputs are ready at different times, since minimum temporary
-  storage is proportional to the output size rather than the inputs size.
-
-  Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-
-  For example:
-
-  ```python
-  a = tf.constant([[1, 2], [3, 4]])
-  b = tf.constant([[5, 0], [0, 6]])
-  tf.accumulate_n_v2([a, b, a])  # [[7, 4], [6, 14]]
-
-  # Explicitly pass shape and type
-  tf.accumulate_n_v2([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
-                                                                   # [[7,  4],
-                                                                   #  [6, 14]]
-  ```
-
-  Args:
-    inputs: A list of `Tensor` objects, each with same shape and type.
-    shape: Shape of elements of `inputs`.
-    tensor_dtype: The type of `inputs`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` of same shape and type as the elements of `inputs`.
-
-  Raises:
-    ValueError: If `inputs` don't all have same shape and dtype or the shape
-    cannot be inferred.
-  """
-  _INPUTS_ERR_MSG = ValueError("inputs must be a list of at least one Tensor"
-                               "with the same dtype and shape")
-  if not inputs or not isinstance(inputs, (list, tuple)):
-    raise _INPUTS_ERR_MSG
-  inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
-  if not all(isinstance(x, ops.Tensor) for x in inputs):
-    raise _INPUTS_ERR_MSG
-  if not all(x.dtype == inputs[0].dtype for x in inputs):
-    raise _INPUTS_ERR_MSG
-  if shape is not None:
-    shape = tensor_shape.as_shape(shape)
-  else:
-    shape = tensor_shape.unknown_shape()
-  for input_tensor in inputs:
-    if isinstance(input_tensor, ops.Tensor):
-      shape = shape.merge_with(input_tensor.get_shape())
-
-  # tensor_dtype is for safety only; operator's output type computed in C++
-  if tensor_dtype is not None and tensor_dtype != inputs[0].dtype:
-    raise TypeError("tensor_dtype is {}, but input is of type {}"
-                    .format(tensor_dtype, inputs[0].dtype))
-
-  if len(inputs) == 1 and name is None:
-    return inputs[0]
-  elif len(inputs) == 1 and name is not None:
-    return array_ops.identity(inputs[0], name=name)
-  elif context.in_eager_mode():
-    # TemporaryVariable not currently supported in eager mode; fall back
-    # onto AddN for now.
-    # TODO(frreiss) remove this once the lifetime of eager variables gets
-    # addressed
-    return math_ops.add_n(inputs, name=name)
-  else:
-    return gen_math_ops._accumulate_nv2(inputs, name=name, shape=shape)
-
-# The following code should eventually be merged into
-# tensorflow/python/ops/math_grad.py
-@ops.RegisterGradient("AccumulateNV2")
-def _AddNGrad(op, grad):
-  """Same as gradient for AddN. Copies the gradient to all inputs."""
-  # Not broadcasting.
-  return [grad] * len(op.inputs)
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 409657fe1da0e5540cd2ad6070d86737c039e91f..5b150339953f961c756c0909dd1795341159b9cd 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -68,7 +68,7 @@ from tensorflow.python.util import tf_decorator
 
 __all__ = [
     'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope',
-    'arg_scoped_arguments'
+    'arg_scoped_arguments', 'arg_scope_func_key'
 ]
 
 _ARGSTACK = [{}]
@@ -89,7 +89,7 @@ def current_arg_scope():
   return stack[-1]
 
 
-def _key_op(op):
+def arg_scope_func_key(op):
   return getattr(op, '_key_op', str(op))
 
 
@@ -103,9 +103,9 @@ def _kwarg_names(func):
 
 
 def _add_op(op):
-  key_op = _key_op(op)
-  if key_op not in _DECORATED_OPS:
-    _DECORATED_OPS[key_op] = _kwarg_names(op)
+  key = arg_scope_func_key(op)
+  if key not in _DECORATED_OPS:
+    _DECORATED_OPS[key] = _kwarg_names(op)
 
 
 @tf_contextlib.contextmanager
@@ -142,21 +142,21 @@ def arg_scope(list_ops_or_scope, **kwargs):
   else:
     # Assumes that list_ops_or_scope is a list/tuple of ops with kwargs.
     if not isinstance(list_ops_or_scope, (list, tuple)):
-      raise TypeError('list_ops_or_scope must either be a list/tuple or reused'
+      raise TypeError('list_ops_or_scope must either be a list/tuple or reused '
                       'scope (i.e. dict)')
     try:
       current_scope = current_arg_scope().copy()
       for op in list_ops_or_scope:
-        key_op = _key_op(op)
+        key = arg_scope_func_key(op)
         if not has_arg_scope(op):
           raise ValueError('%s is not decorated with @add_arg_scope',
                            _name_op(op))
-        if key_op in current_scope:
-          current_kwargs = current_scope[key_op].copy()
+        if key in current_scope:
+          current_kwargs = current_scope[key].copy()
           current_kwargs.update(kwargs)
-          current_scope[key_op] = current_kwargs
+          current_scope[key] = current_kwargs
         else:
-          current_scope[key_op] = kwargs.copy()
+          current_scope[key] = kwargs.copy()
       _get_arg_stack().append(current_scope)
       yield current_scope
     finally:
@@ -176,14 +176,14 @@ def add_arg_scope(func):
   def func_with_args(*args, **kwargs):
     current_scope = current_arg_scope()
     current_args = kwargs
-    key_func = _key_op(func)
+    key_func = arg_scope_func_key(func)
     if key_func in current_scope:
       current_args = current_scope[key_func].copy()
       current_args.update(kwargs)
     return func(*args, **current_args)
 
   _add_op(func)
-  setattr(func_with_args, '_key_op', _key_op(func))
+  setattr(func_with_args, '_key_op', arg_scope_func_key(func))
   return tf_decorator.make_decorator(func, func_with_args)
 
 
@@ -196,7 +196,7 @@ def has_arg_scope(func):
   Returns:
     a boolean.
   """
-  return _key_op(func) in _DECORATED_OPS
+  return arg_scope_func_key(func) in _DECORATED_OPS
 
 
 def arg_scoped_arguments(func):
@@ -209,4 +209,4 @@ def arg_scoped_arguments(func):
     a list of kwargs names.
   """
   assert has_arg_scope(func)
-  return _DECORATED_OPS[_key_op(func)]
+  return _DECORATED_OPS[arg_scope_func_key(func)]
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
index 7ba9d4ffa90f6860629b15a2ea91e0c573bf6368..4c3879d4fc08b53ea8be5f1256a830a64fb39af6 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
@@ -170,6 +170,30 @@ class ArgScopeTest(test.TestCase):
         self.assertTupleEqual(args, func1_args)
         self.assertDictEqual(kwargs, func1_kwargs)
 
+  def testNestedArgScopeObjectCreatedOutsideScopeOverridesArgScope(self):
+
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    scope_object = get_scope_object()
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(scope_object):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1]})
+
+  def testArgScopeObjectCreatedWithinScopeInheritsArgScope(self):
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(get_scope_object()):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1], 'd': 10})
+
   def testSharedArgScope(self):
     func1_args = (0,)
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
index 182fec924febb74a23b82b1664d137f033f3b1b4..bd764ed57a6da0a4d356235108e998a80ac34362 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
@@ -24,10 +24,12 @@ import collections
 # from tensorflow.core.protobuf import critical_section_pb2
 
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 
 
@@ -38,11 +40,32 @@ CRITICAL_SECTION_EXECUTIONS = "critical_section_executions"
 
 class _ExecutionSignature(
     collections.namedtuple("_ExecutionSignature",
-                           ("op", "exclusive_resource_access"))):
+                           ("op", "handle",
+                            "resources", "exclusive_resource_access"))):
   """A class storing an `ExecuteInCriticalResource` op and associated attrs."""
   pass
 
 
+def _identity(x):
+  """Identity op that recognizes `TensorArray`, `Operation`, and `Tensor`."""
+  if isinstance(x, tensor_array_ops.TensorArray):
+    return x.identity()
+  elif isinstance(x, ops.Operation):
+    return control_flow_ops.group(x)
+  elif context.executing_eagerly() and x is None:
+    return None
+  else:
+    return array_ops.identity(x)
+
+
+def _get_colocation(op):
+  """Get colocation symbol from op, if any."""
+  try:
+    return op.get_attr("_class")
+  except ValueError:
+    return None
+
+
 class CriticalSection(object):
   """Critical section.
 
@@ -112,16 +135,18 @@ class CriticalSection(object):
   ```
   """
 
-  def __init__(self, name=None, critical_section_def=None, import_scope=None):
+  def __init__(self, name=None, shared_name=None,
+               critical_section_def=None, import_scope=None):
     """Creates a critical section."""
     if critical_section_def and name is not None:
-      raise ValueError("critical_section_def and name are mutually exclusive.")
+      raise ValueError("critical_section_def and shared_name are "
+                       "mutually exclusive.")
     if critical_section_def:
       self._init_from_proto(critical_section_def, import_scope=import_scope)
     else:
-      self._init_from_args(name)
+      self._init_from_args(name, shared_name)
 
-  def _init_from_proto(self, critical_section_def, import_scope):
+  def _init_from_proto(self, critical_section_def, import_scope):  # pylint: disable=invalid-name
     raise NotImplementedError("Not yet implemented")
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # assert isinstance(
@@ -133,19 +158,21 @@ class CriticalSection(object):
     #         critical_section_def.critical_section_name,
     #         import_scope=import_scope))
 
-  def _init_from_args(self, name):
+  def _init_from_args(self, name, shared_name):  # pylint: disable=invalid-name
     """Initialize the CriticalSection from constructor arguments."""
     with ops.name_scope(name, "CriticalSection", []) as name:
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         # pylint: disable=protected-access
-        handle_name = ops._name_from_scope_name(name)
         container = ops.get_default_graph()._container
         # pylint: enable=protected-access
+        if shared_name is None:
+          shared_name = name
         if container is None:
           container = ""
-        self._handle = gen_resource_variable_ops.critical_section_op(
-            shared_name=handle_name, name=name)
-    if context.in_graph_mode():
+        self._handle = gen_resource_variable_ops.mutex_v2(
+            shared_name=shared_name, container=container, name=name)
+
+    if not context.executing_eagerly():
       ops.add_to_collections(CRITICAL_SECTIONS, self)
 
   @property
@@ -171,8 +198,8 @@ class CriticalSection(object):
       The tensors returned from `fn(*args, **kwargs)`.
 
     Raises:
-      ValueError: If `fn` attempts to use this `CriticalSection` in any nested
-        way.
+      ValueError: If `fn` attempts to lock this `CriticalSection` in any nested
+        or lazy way that may cause a deadlock.
       ValueError: If `exclusive_resource_access` is not provided (is `True`) and
         another `CriticalSection` has an execution requesting the same
         resources as in `*args`, `**kwargs`, and any additionaly captured
@@ -183,68 +210,163 @@ class CriticalSection(object):
     name = kwargs.pop("name", None)
     exclusive_resource_access = kwargs.pop("exclusive_resource_access", True)
 
-    args = nest.map_structure(ops.convert_to_tensor, args)
     with ops.name_scope(name, "critical_section_execute", []):
-      fn_op = function.make_defun_op(fn, *args, **kwargs)
-      flat_dtypes = nest.flatten(fn_op.output_dtypes)
-      flat_shapes = nest.flatten(fn_op.output_shapes)
-      all_inputs = nest.flatten(args) + fn_op.captured_inputs
-      if self._handle in all_inputs:
-        raise ValueError("The function fn attempts to access the "
-                         "CriticalSection in which it would be running.  This "
-                         "is illegal and would cause deadlocks.  "
-                         "CriticalSection: %s." % self._handle)
-
-      if context.in_graph_mode():
-        # Collections and op introspection does not work in eager
-        # mode.  This is generally ok; since eager mode (as of
-        # writing) executes sequentially anyway.
-        all_input_resources = [
-            x for x in all_inputs if x.dtype == dtypes.resource]
-        for sg in ops.get_collection(CRITICAL_SECTION_EXECUTIONS):
-          if sg.op.inputs[0].name == self._handle.name:
-            # Other executions in the same critical section are allowed.
-            continue
-          if not (exclusive_resource_access or sg.exclusive_resource_access):
-            # Neither execution requested exclusive access.
-            continue
-          sg_input_names = [y.name for y in sg.op.inputs[1:]]
-          for res in all_input_resources:
-            if res.name in sg_input_names:
-              raise ValueError(
-                  "This execution would access resource %s; but either this "
-                  "execution (CriticalSection: %s) or Execution '%s' "
-                  "(CriticalSection: %s) requested exclusive resource access "
-                  "of this resource for their critical section.  Did you mean "
-                  "to call execute with keyword argument "
-                  "exclusive_resource_access=False?"
-                  % (res.name,
-                     self.name,
-                     sg.op.name,
-                     sg.op.inputs[0].op.name))
-
-      flat_outputs = gen_resource_variable_ops.execute_in_critical_section(
-          critical_section=self._handle,
-          arguments=all_inputs,
-          f=fn_op,
-          output_types=flat_dtypes,
-          output_shapes=flat_shapes)
-
-      if context.in_graph_mode():
-        if isinstance(flat_outputs, ops.Operation):
-          flat_outputs = [flat_outputs]
-        op = (flat_outputs[0].op if isinstance(flat_outputs[0], ops.Tensor)
-              else flat_outputs[0])
+
+      # Ensure that mutex locking only happens *after* all args and
+      # kwargs have been executed.  This avoids certain types of deadlocks.
+      lock = gen_resource_variable_ops.mutex_lock(self._handle)
+
+      if not context.executing_eagerly():
+        # NOTE(ebrevdo): This is to ensure we don't pick up spurious
+        # Operations created by other threads.
+        with ops.get_default_graph()._lock:  # pylint: disable=protected-access
+          existing_ops = ops.get_default_graph().get_operations()
+          with ops.control_dependencies([lock]):
+            r = fn(*args, **kwargs)
+          # TODO(ebrevdo): If creating critical sections in a python loop, this
+          # makes graph creation time quadratic.  Revisit if this
+          # becomes a problem.
+          created_ops = (set(ops.get_default_graph().get_operations())
+                         .difference(existing_ops))
+      else:
+        with ops.control_dependencies([lock]):
+          r = fn(*args, **kwargs)
+
+      if not context.executing_eagerly():
+        self._add_control_dependencies_to_lock(created_ops, lock.op)
+
+        # captured_resources is a list of resources that are directly
+        # accessed only by ops created during fn(), not by any
+        # ancestors of those ops in the graph.
+        captured_resources = set([
+            input_ for op in created_ops
+            for input_ in op.inputs
+            if input_.dtype == dtypes.resource
+        ])
+
+        # NOTE(ebrevdo): The only time self._is_self_handle() is True
+        # in this call is if one of the recently created ops, within
+        # the execute(), themselves attempt to access the
+        # CriticalSection.  This will cause a deadlock.
+        if any(self._is_self_handle(x) for x in captured_resources):
+          raise ValueError("The function fn attempts to directly access the "
+                           "CriticalSection in which it would be running.  "
+                           "This is illegal and would cause deadlocks.")
+
+        self._check_multiple_access_to_resources(
+            captured_resources, exclusive_resource_access)
+
+      r_flat = [_identity(x) for x in nest.flatten(r)]
+
+      with ops.control_dependencies(r_flat):
+        # The identity must run on the same machine as self._handle
+        with ops.colocate_with(self._handle):
+          # Do not use array_ops.identity as there are special
+          # optimizations within TensorFlow which seem to elide it
+          # even when optimizations are disabled(!).
+          ensure_lock_exists = gen_resource_variable_ops.consume_mutex_lock(
+              lock)
+
+        # Make sure that if any element of r is accessed, all of
+        # them are executed together.
+        r = nest.pack_sequence_as(r, control_flow_ops.tuple(nest.flatten(r)))
+
+      with ops.control_dependencies([ensure_lock_exists]):
+        outputs = nest.map_structure(_identity, r)
+
+      if not context.executing_eagerly():
         signature = _ExecutionSignature(
-            op=op,
+            op=lock.op,
+            handle=self._handle,
+            resources=list(captured_resources),
             exclusive_resource_access=exclusive_resource_access)
         ops.add_to_collections(
             CRITICAL_SECTION_EXECUTIONS, signature)
 
-      return (flat_outputs[0]
-              if (len(flat_outputs) == 1
-                  and isinstance(flat_outputs[0], ops.Operation))
-              else nest.pack_sequence_as(fn_op.output_dtypes, flat_outputs))
+      return outputs
+
+  def _add_control_dependencies_to_lock(self, created_ops, lock_op):
+    """To avoid deadlocks, all args must be executed before lock_op."""
+    # Get all arguments (explicit and captured) of all ops created by fn().
+    all_args = set([input_.op for op in created_ops for input_ in op.inputs])
+    all_args.update(
+        input_op for op in created_ops for input_op in op.control_inputs)
+    # Unfortunately, we can't use sets throughout because TF seems to
+    # create new Operation objects for the same op sometimes; and we
+    # can't rely on id(op).
+
+    # pylint: disable=protected-access
+    all_args_dict = dict((op._id, op) for op in all_args)
+
+    # Remove ops created within fn, or that lock_op already has a
+    # control dependency on.  Also remove a possible self-loop.
+    for op in created_ops:
+      all_args_dict.pop(op._id, None)
+    for op in lock_op.control_inputs:
+      all_args_dict.pop(op._id, None)
+    for input_ in lock_op.inputs:
+      all_args_dict.pop(input_.op._id, None)
+    all_args_dict.pop(lock_op._id, None)
+
+    all_args = all_args_dict.values()
+
+    if not all_args:
+      # No control dependencies to add; return early.
+      return
+
+    # This group is important: it ensures that any ops in all_args
+    # outside the control context of the lock_op (and this fn, which
+    # runs in the same context) are added to this context before
+    # being added to the control dependencies of lock_op.
+    all_args = control_flow_ops.group(*all_args)
+
+    lock_op._add_control_input(all_args)
+    # pylint: enable=protected-access
+
+  def _is_self_handle(self, x):
+    """Check if the tensor `x` is the same Mutex as `self._handle`."""
+    return (x.op.type == "MutexV2"
+            # blank shared_name means the op will create a unique one.
+            and x.op.get_attr("shared_name")
+            and (x.op.get_attr("shared_name") ==
+                 self._handle.op.get_attr("shared_name"))
+            and (x.op.device == self._handle.op.device
+                 or _get_colocation(x.op) == _get_colocation(self._handle.op)))
+
+  def _check_multiple_access_to_resources(
+      self, captured_resources, exclusive_resource_access):
+    """Raise if captured_resources are accessed by another CriticalSection.
+
+    Args:
+      captured_resources: Set of tensors of type resource.
+      exclusive_resource_access: Whether this execution requires exclusive
+        resource access.
+
+    Raises:
+      ValueError: If any tensors in `captured_resources` are also accessed
+        by another `CriticalSection`, and at least one of them requires
+        exclusive resource access.
+    """
+    # Collections and op introspection does not work in eager
+    # mode.  This is generally ok; since eager mode (as of
+    # writing) executes sequentially anyway.
+    for sg in ops.get_collection(CRITICAL_SECTION_EXECUTIONS):
+      if self._is_self_handle(sg.handle):
+        # Other executions in the same critical section are allowed.
+        continue
+      if not (exclusive_resource_access or sg.exclusive_resource_access):
+        # Neither execution requested exclusive access.
+        continue
+      resource_intersection = captured_resources.intersection(sg.resources)
+      if resource_intersection:
+        raise ValueError(
+            "This execution would access resources: %s.  Either this "
+            "lock (CriticalSection: %s) or lock '%s' "
+            "(CriticalSection: %s) requested exclusive resource access "
+            "of this resource.  Did you mean to call execute with keyword "
+            "argument exclusive_resource_access=False?" %
+            (list(resource_intersection), self._handle.name,
+             sg.op.name, sg.handle.name))
 
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
 
@@ -276,6 +398,7 @@ class CriticalSection(object):
 
 # def _execution_to_proto_fn(execution_signature, export_scope=None):
 #   """Converts `_ExecutionSignature` to a `CriticalSectionExecutionDef`.
+#   # TODO(ebrevdo): Update for _ExecutionSignature storing resource list.
 
 #   Args:
 #     execution_signature: Instance of `_ExecutionSignature`.
@@ -298,6 +421,7 @@ class CriticalSection(object):
 
 # def _execution_from_proto_fn(op_def, import_scope=None):
 #   """Converts a `CriticalSectionExecutionDef` to a `_ExecutionSignature`."""
+#   # TODO(ebrevdo): Update for _ExecutionSignature storing resource list.
 #   assert isinstance(
 #       op_def, critical_section_pb2.CriticalSectionExecutionDef)
 
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/contrib/framework/python/ops/critical_section_test.py
index a416724d3ba1719471d70667e140f9cd2daf86c7..df7d7e9dae80722569efccbc9cc0d1b75e90cf03 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_test.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_test.py
@@ -19,14 +19,15 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework.python.ops import critical_section_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 # TODO(ebrevdo): Re-enable once CriticalSection is in core.
 # from tensorflow.python.training import saver as saver_lib
 
@@ -35,26 +36,82 @@ class CriticalSectionTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testCreateCriticalSection(self):
-    cs = critical_section_ops.CriticalSection(name="cs")
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
     v = resource_variable_ops.ResourceVariable(0.0, name="v")
 
     def fn(a, b):
-      c = v.read_value()
+      c = v.value()
       with ops.control_dependencies([c]):
         nv = v.assign_add(a * b)
         with ops.control_dependencies([nv]):
           return array_ops.identity(c)
 
-    num_concurrent = 1000
+    num_concurrent = 100
     r = [cs.execute(fn, 1.0, 2.0) for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     r_value = self.evaluate(r)
     self.assertAllClose([2.0 * i for i in range(num_concurrent)],
                         sorted(r_value))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testCriticalSectionWithControlFlow(self):
+    for outer_cond in [False, True]:
+      for inner_cond in [False, True]:
+        cs = critical_section_ops.CriticalSection(shared_name="cs")
+        v = resource_variable_ops.ResourceVariable(0.0, name="v")
+        num_concurrent = 100
+
+        # pylint: disable=cell-var-from-loop
+        def fn(a, b):
+          c = v.read_value()
+          def true_fn():
+            with ops.control_dependencies([c]):
+              nv = v.assign_add(a * b)
+              with ops.control_dependencies([nv]):
+                return array_ops.identity(c)
+          return control_flow_ops.cond(
+              array_ops.identity(inner_cond), true_fn, lambda: c)
+
+        def execute():
+          return cs.execute(fn, 1.0, 2.0)
+
+        r = [
+            control_flow_ops.cond(array_ops.identity(outer_cond),
+                                  execute,
+                                  v.read_value)
+            for _ in range(num_concurrent)
+        ]
+        # pylint: enable=cell-var-from-loop
+
+        self.evaluate(v.initializer)
+        r_value = self.evaluate(r)
+        if inner_cond and outer_cond:
+          self.assertAllClose([2.0 * i for i in range(num_concurrent)],
+                              sorted(r_value))
+        else:
+          self.assertAllClose([0] * num_concurrent, r_value)
+
+  def testCriticalSectionInParallelDoesntDeadlockOnError(self):
+    # No eager mode execution of this test because eager does not
+    # run fn() in parallel, which is where the deadlock could
+    # potentially occur (in graph mode).
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
+    v = resource_variable_ops.ResourceVariable(0.0, name="v")
+
+    def fn(i):
+      error = control_flow_ops.Assert((i % 2) == 1, ["Error"])
+      with ops.control_dependencies([error]):
+        return v.read_value()
+    num_concurrent = 2
+    r = [cs.execute(fn, i) for i in range(num_concurrent)]
+    self.evaluate(v.initializer)
+    for _ in range(100):
+      with self.assertRaisesOpError("Error"):
+        self.evaluate(r)
+
   @test_util.run_in_graph_and_eager_modes()
   def testCreateCriticalSectionFnReturnsOp(self):
-    cs = critical_section_ops.CriticalSection(name="cs")
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
     v = resource_variable_ops.ResourceVariable(0.0, name="v")
 
     def fn_return_op(a, b):
@@ -62,7 +119,7 @@ class CriticalSectionTest(test.TestCase):
       with ops.control_dependencies([c]):
         nv = v.assign_add(a * b)
         with ops.control_dependencies([nv]):
-          return ()
+          return control_flow_ops.no_op()
 
     num_concurrent = 100
     r = [cs.execute(fn_return_op, 1.0, 2.0) for _ in range(num_concurrent)]
@@ -71,52 +128,166 @@ class CriticalSectionTest(test.TestCase):
     final_v = self.evaluate(v)
     self.assertAllClose(2.0 * num_concurrent, final_v)
 
-  def testCreateCriticalSectionRaw(self):
-    cs = critical_section_ops.CriticalSection(name="cs")
-    v = resource_variable_ops.ResourceVariable(0.0, name="v")
-
-    @function.Defun(dtypes.float32, dtypes.float32)
-    def fn(a, b):
-      c = v.read_value()
-      with ops.control_dependencies([c]):
-        nv = v.assign_add(a * b)
-        with ops.control_dependencies([nv]):
-          return array_ops.identity(c)
-
-    def execute(fn, *args):
-      output_args = fn.definition.signature.output_arg
-      return resource_variable_ops.execute_in_critical_section(
-          critical_section=cs._handle,
-          arguments=list(args) + fn.captured_inputs,
-          f=fn,
-          output_types=[out.type for out in output_args],
-          output_shapes=[tensor_shape.TensorShape(None) for _ in output_args])
-
-    num_concurrent = 1000
-    r = [execute(fn, 1.0, 2.0)[0] for _ in range(num_concurrent)]
-    self.evaluate(v.initializer)
-    r_value = self.evaluate(r)
-    self.assertAllClose([2.0 * i for i in range(num_concurrent)],
-                        sorted(r_value))
-
   def testCollection(self):
-    cs = critical_section_ops.CriticalSection(name="cs")
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
     self.assertIn(
         cs, ops.get_collection(critical_section_ops.CRITICAL_SECTIONS))
-    execute_op = cs.execute(lambda x: x + 1, 1.0).op
+    execute = cs.execute(lambda x: x + 1, 1.0, name="my_execute")
+    execute_op = [
+        x for x in execute.graph.get_operations()
+        if "my_execute" in x.name and "MutexLock" in x.type
+    ][0]
     self.assertIn(
         execute_op,
         [signature.op for signature in
          ops.get_collection(critical_section_ops.CRITICAL_SECTION_EXECUTIONS)])
 
-  @test_util.run_in_graph_and_eager_modes()
   def testRecursiveCriticalSectionAccessIsIllegal(self):
-    cs = critical_section_ops.CriticalSection(name="cs")
+    # This does not work properly in eager mode.  Eager users will
+    # just hit a deadlock if they do this.  But at least it'll be easier
+    # to debug.
+    cs = critical_section_ops.CriticalSection()
     def fn(x):
-      return cs.execute(lambda x: x+1, x)
+      return cs.execute(lambda y: y + 1, x)
     with self.assertRaisesRegexp(
         ValueError,
-        r"attempts to access the CriticalSection in which it would be running"):
+        r"attempts to directly access the CriticalSection in which it "
+        r"would be running"):
+      cs.execute(fn, 1.0)
+
+  def testRecursiveCriticalSectionAccessViaCapturedTensorIsProtected(self):
+    # This one is subtle; and we're being overly cautious here.  The
+    # deadlock we are ensuring we catch is:
+    #
+    # to_capture = CS[lambda x: x + 1](1.0)
+    # deadlocked = CS[lambda x: x + to_capture](1.0)
+    #
+    # This would have caused a deadlock because executing `deadlocked` will
+    # lock the mutex on CS; but then due to dependencies, will attempt
+    # to compute `to_capture`.  This computation requires locking CS,
+    # but that is not possible now because CS is already locked by
+    # `deadlocked`.
+    #
+    # We check that CriticalSection.execute properly inserts new
+    # control dependencies to its lock to ensure all captured
+    # operations are finished before anything runs within the critical section.
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
+    fn = array_ops.identity
+    to_capture = cs.execute(fn, 1.0)
+    fn_captures = lambda x: x + to_capture
+    to_capture_too = array_ops.identity(to_capture)
+
+    ex_0 = cs.execute(fn_captures, 1.0)
+
+    with ops.control_dependencies([to_capture]):
+      # This is OK because to_capture will execute before this next call
+      ex_1 = cs.execute(fn_captures, 1.0)
+
+    dependency = array_ops.identity(to_capture)
+
+    fn_captures_dependency = lambda x: x + dependency
+
+    ex_2 = cs.execute(fn_captures_dependency, 1.0)
+
+    with ops.control_dependencies([to_capture_too]):
+      ex_3 = cs.execute(fn_captures_dependency, 1.0)
+
+    # Ensure there's no actual deadlock on to_execute.
+    self.assertEquals(2.0, self.evaluate(ex_0))
+    self.assertEquals(2.0, self.evaluate(ex_1))
+    self.assertEquals(2.0, self.evaluate(ex_2))
+    self.assertEquals(2.0, self.evaluate(ex_3))
+
+  def testRecursiveCriticalSectionAccessWithinLoopIsProtected(self):
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
+
+    def body_implicit_capture(i, j):
+      # This would have caused a deadlock if not for logic in execute
+      # that inserts additional control dependencies onto the lock op:
+      #   * Loop body argument j is captured by fn()
+      #   * i is running in parallel to move forward the execution
+      #   * j is not being checked by the predicate function
+      #   * output of cs.execute() is returned as next j.
+      fn = lambda: j + 1
+      return (i + 1, cs.execute(fn))
+
+    (i_n, j_n) = control_flow_ops.while_loop(
+        lambda i, _: i < 1000,
+        body_implicit_capture,
+        [0, 0],
+        parallel_iterations=25)
+    logging.warn(
+        "\n==============\nRunning "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture'\n"
+        "==============\n")
+    self.assertEquals((1000, 1000), self.evaluate((i_n, j_n)))
+    logging.warn(
+        "\n==============\nSuccessfully finished running "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture'\n"
+        "==============\n")
+
+    def body_implicit_capture_protected(i, j):
+      # This version is ok because we manually add a control
+      # dependency on j, which is an argument to the while_loop body
+      # and captured by fn.
+      fn = lambda: j + 1
+      with ops.control_dependencies([j]):
+        return (i + 1, cs.execute(fn))
+
+    (i_n, j_n) = control_flow_ops.while_loop(
+        lambda i, _: i < 1000,
+        body_implicit_capture_protected,
+        [0, 0],
+        parallel_iterations=25)
+    logging.warn(
+        "\n==============\nRunning "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture_protected'\n"
+        "==============\n")
+    self.assertEquals((1000, 1000), self.evaluate((i_n, j_n)))
+    logging.warn(
+        "\n==============\nSuccessfully finished running "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture_protected'\n"
+        "==============\n")
+
+    def body_args_capture(i, j):
+      # This version is ok because j is an argument to fn and we can
+      # ensure there's a control dependency on j.
+      fn = lambda x: x + 1
+      return (i + 1, cs.execute(fn, j))
+
+    (i_n, j_n) = control_flow_ops.while_loop(
+        lambda i, _: i < 1000,
+        body_args_capture,
+        [0, 0],
+        parallel_iterations=25)
+    logging.warn(
+        "\n==============\nRunning "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_args_capture'\n"
+        "==============\n")
+    self.assertEquals((1000, 1000), self.evaluate((i_n, j_n)))
+    logging.warn(
+        "\n==============\nSuccessfully finished running "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_args_capture'\n"
+        "==============\n")
+
+  def testRecursiveCriticalSectionAccessIsIllegalSameSharedName(self):
+    # This does not work properly in eager mode.  Eager users will
+    # just hit a deadlock if they do this.  But at least it'll be easier
+    # to debug.
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
+    cs_same = critical_section_ops.CriticalSection(shared_name="cs")
+    def fn(x):
+      return cs_same.execute(lambda x: x+1, x)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"attempts to directly access the CriticalSection in which it "
+        r"would be running"):
       cs.execute(fn, 1.0)
 
   def testMultipleCSExecutionsRequestSameResource(self):
@@ -147,6 +318,39 @@ class CriticalSectionTest(test.TestCase):
         ValueError, "requested exclusive resource access"):
       cs1.execute(lambda: v2 + 1)
 
+  def testControlDependencyFromOutsideWhileLoopMixedWithInsideLoop(self):
+    cs = critical_section_ops.CriticalSection()
+    v = resource_variable_ops.ResourceVariable(0, name="v")
+    # Make sure that the control dependencies on v do not cause issues
+    # in the lock_op's automatic control dependency adder.
+    #
+    # Note, here v must be a resource variable (or something similar),
+    # otherwise it gets hoisted into the while_loop by the time we add
+    # control dependencies to the lock_op.
+    out = control_flow_ops.while_loop(
+        lambda i: i < 10, lambda i: cs.execute(lambda j: v + j + 1, i), [0])
+    self.evaluate(v.initializer)
+    self.assertEqual(10, self.evaluate(out))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testInsideFunction(self):
+    cs = critical_section_ops.CriticalSection()
+    v = resource_variable_ops.ResourceVariable(1)
+    def fn():
+      return v.read_value()
+
+    # map() creates a TensorFlow function.
+    ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn))
+
+    def get_first():
+      if context.executing_eagerly():
+        return self.evaluate(ds.make_one_shot_iterator().get_next())
+      itr = ds.make_initializable_iterator()
+      self.evaluate([v.initializer, itr.initializer])
+      return self.evaluate(itr.get_next())
+
+    self.assertEqual(1, get_first())
+
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
   #
   # def testCriticalSectionAndExecuteOpSaverRoundTrip(self):
@@ -167,7 +371,7 @@ class CriticalSectionTest(test.TestCase):
   #     self.assertEqual(restored_exec[0].op.name, "imported/%s" % r.op.name)
 
   # def testToProto(self):
-  #   cs = critical_section_ops.CriticalSection(name="cs")
+  #   cs = critical_section_ops.CriticalSection(shared_name="cs")
   #   proto = cs.to_proto()
   #   self.assertEqual(proto.critical_section_name, cs._handle.name)
   #   cs_copy = critical_section_ops.CriticalSection.from_proto(proto)
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py
index 8f62f0ea7b9b561f235b9496ffda97a9f378d530..1921a77c1e96ee3531d1ed0f98e41c27c9d427ac 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Support for sorting tensors.
 
+@@argsort
 @@sort
 """
 
@@ -21,6 +22,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -47,64 +51,141 @@ def sort(values, axis=-1, direction='ASCENDING', name=None):
     ValueError: If axis is not a constant scalar, or the direction is invalid.
   """
   with framework_ops.name_scope(name, 'sort'):
-    if direction not in _SORT_IMPL:
-      raise ValueError('%s should be one of %s' %
-                       (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
-    # Axis must be an integer, not a Tensor.
-    axis = framework_ops.convert_to_tensor(axis, name='axis')
-    axis_static = tensor_util.constant_value(axis)
-    if axis.shape.ndims != 0 or axis_static is None:
-      raise ValueError('axis must be a constant scalar')
-    axis_static = int(axis_static)  # Avoids NumPy casting error
+    return _sort_or_argsort(values, axis, direction, return_argsort=False)
+
+
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices of a tensor that give its sorted order along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`, but along the given axis, values represent the index of the sorted
+  element in that slice of the tensor at the given position.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+        axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+        `'DESCENDING'`).
+    stable: If True, equal elements in the original tensor will not be
+        re-ordered in the returned order. Unstable sort is not yet implemented,
+        but will eventually be the default for performance reasons. If you
+        require a stable order, pass `stable=True` for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=True)
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Internal sort/argsort implementation.
+
+  Args:
+    values: The input values.
+    axis: The axis along which to sort.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: Whether to return the argsort result.
+
+  Returns:
+    Either the sorted values, or the indices of the sorted values in the
+        original tensor. See the `sort` and `argsort` docstrings.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    raise ValueError('%s should be one of %s' %
+                     (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
+  # Axis must be an integer, not a Tensor.
+  axis = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_static = tensor_util.constant_value(axis)
+  if axis.shape.ndims != 0 or axis_static is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_static = int(axis_static)  # Avoids NumPy casting error
 
-    values = framework_ops.convert_to_tensor(values, name='values')
+  values = framework_ops.convert_to_tensor(values, name='values')
 
-    return _SORT_IMPL[direction](values, axis_static)
+  return _SORT_IMPL[direction](values, axis_static, return_argsort)
 
 
-def _descending_sort(values, axis):
+def _descending_sort(values, axis, return_argsort=False):
   """Sorts values in reverse using `top_k`.
 
   Args:
     values: Tensor of numeric values.
     axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+        indices that would sort the values.
 
   Returns:
     The sorted values.
   """
   k = array_ops.shape(values)[axis]
   rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
   # Fast path: sorting the last axis.
   if axis == -1 or axis + 1 == values.get_shape().ndims:
-    return nn_ops.top_k(values, k)[0]
-
-  # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
-  if axis < 0:
-    # Make axis a Tensor with the real axis index if needed.
-    axis += rank
-  transposition = array_ops.concat(
-      [
-          # Axes up to axis are unchanged.
-          math_ops.range(axis),
-          # Swap axis and rank - 1.
-          [rank - 1],
-          # Axes in [axis + 1, rank - 1) are unchanged.
-          math_ops.range(axis + 1, rank - 1),
-          # Swap axis and rank - 1.
-          [axis]
-      ],
-      axis=0)
-  top_k_input = array_ops.transpose(values, transposition)
-  values, unused_indices = nn_ops.top_k(top_k_input, k)
-  # transposition contains a single cycle of length 2 (swapping 2 elements),
-  # so it is an involution (it is its own inverse).
-  return array_ops.transpose(values, transposition)
-
-
-def _ascending_sort(values, axis):
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
   # Negate the values to get the ascending order from descending sort.
-  values_or_indices = _descending_sort(-values, axis)
-  return -values_or_indices
+  values_or_indices = _descending_sort(-values, axis, return_argsort)
+  # If not argsort, negate the values again.
+  return values_or_indices if return_argsort else -values_or_indices
 
 
 _SORT_IMPL = {
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
index d08ae502f10d98ee14d8bea2f76b18bedb935cea..a8fb94b245dccc8c7cf0e94cef9b436f881fe408 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
@@ -24,6 +24,8 @@ from tensorflow.contrib.framework.python.ops import sort_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -90,6 +92,38 @@ class SortTest(test.TestCase):
               axis=0,
               direction='DESCENDING').eval())
 
+  def testSort_staticallyKnownRank_constantTransposition(self):
+    # The transposition array should be a constant if the rank of "values" is
+    # statically known.
+    tensor = random_ops.random_uniform(
+        # Rank is statically known to be 5, but the dimension lengths are not
+        # known.
+        random_ops.random_uniform(
+            shape=(5,), minval=0, maxval=10, dtype=dtypes.int32))
+    sort_ops.sort(tensor, axis=1)
+    transposition = (
+        ops.get_default_graph().get_tensor_by_name('sort/transposition:0'))
+    self.assertFalse(tensor_util.constant_value(transposition) is None)
+    self.assertAllEqual(
+        # Swaps "1" and "4" to put "1" at the end.
+        tensor_util.constant_value(transposition),
+        [0, 4, 2, 3, 1])
+
+  def testArgsort_1d(self):
+    arr = np.random.random(42)
+    with self.test_session():
+      self.assertAllEqual(
+          np.sort(arr),
+          array_ops.gather(arr, sort_ops.argsort(arr)).eval())
+
+  def testArgsort(self):
+    arr = np.random.random((5, 6, 7, 8))
+    for axis in range(4):
+      with self.test_session():
+        self.assertAllEqual(
+            np.argsort(arr, axis=axis),
+            sort_ops.argsort(arr, axis=axis).eval())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 0754c3e0e30a340910a43a3ce86f6ca10afe848e..40ae01bfcce1dde580e6a5f6d9c8ec1aa1abb83f 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import tf_logging as logging
@@ -82,7 +83,12 @@ def zero_initializer(ref, use_locking=True, name="zero_initializer"):
   """
   loader.load_op_library(
       resource_loader.get_path_to_datafile("_variable_ops.so"))
-  return gen_variable_ops.zero_initializer(ref, name=name)
+  if resource_variable_ops.is_resource_variable(ref):
+    return gen_variable_ops.zero_var_initializer(
+        ref.handle, shape=ref.shape, dtype=ref.dtype, name=name)
+  else:
+    return gen_variable_ops.zero_initializer(ref, name=name)
+
 
 @deprecated(None, "Please switch to tf.train.assert_global_step")
 def assert_global_step(global_step_tensor):
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index 2f06df93acb0a4c0b36c68839ff531e3c22c5ee3..37ea6eb12aba7d25656f19cbbc86475c1228d916 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -1284,6 +1284,32 @@ class ZeroInitializerOpTest(test.TestCase):
                 [10, 20], dtype=dtype), use_init)
 
 
+class ZeroVarInitializerOpTest(test.TestCase):
+
+  def _testZeroVarInitializer(self, shape, initializer, use_init):
+    var = resource_variable_ops.ResourceVariable(initializer)
+    var_zero = variables_lib2.zero_initializer(var)
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError('Error while reading resource variable'):
+        var.eval()
+      if use_init:
+        sess.run(var.initializer)
+        with self.assertRaisesOpError('input is already initialized'):
+          var_zero.eval()
+        self.assertAllClose(np.ones(shape), var.eval())
+      else:
+        var_zero.eval()
+        self.assertAllClose(np.zeros(shape), var.eval())
+
+  def testZeroVarInitializer(self):
+    for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64):
+      for use_init in (False, True):
+        self._testZeroVarInitializer([10, 20],
+                                     array_ops.ones([10, 20], dtype=dtype),
+                                     use_init)
+
+
 class FilterVariablesTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index ce37672895b37275770d2f5410f662e9acf1de9d..0eb6889db1fae1c74aeb4392441b308392b091a5 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -157,15 +157,3 @@ cuda_py_test(
         "requires_cudnn6",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 0e06575d96f9b9538f0245b12d48cfd7c0e8d981..2458f7554afdc12709571c551a8323cda7fa5c17 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -247,7 +247,7 @@ class FusedConv2DBiasActivationOp : public OpKernel {
 };
 
 #if GOOGLE_CUDA
-namespace dnn = ::perftools::gputools::dnn;
+namespace dnn = se::dnn;
 
 // A dummy type to group forward convolution autotune results together.
 struct ConvBiasActivationAutoTuneGroup {
@@ -543,7 +543,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
                                 fused_conv_parameters, &algorithm_config)) {
     std::vector<dnn::AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
+        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+            stream->parent()),
         &algorithms));
     dnn::ProfileResult best_result;
     dnn::ProfileResult best_result_no_scratch;
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index a97adf622e6e576f8b5ce2babe004cb3a46d80a5..983b6dc8e5a1512ba81ecbc8d5ca5adaea09afe4 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -65,7 +65,7 @@ def fused_conv2d_bias_activation(conv_input,
     side_input_scale: A scalar `float32` that will be multiplied by side_input.
         This is optional and defaults to 0.
     side_input: A `Tensor` of the format specified by `data_format`.
-        This is useful for imlementing ResNet blocks.
+        This is useful for implementing ResNet blocks.
     activation_mode: (optional) currently must be the default "Relu".
         Note that in qint8 mode, it also clips to 127, so acts like ReluX.
     data_format: Specifies the data format.
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index bb155aa2496cbafd9f0630d3dffb2ba69395186c..3d0ed899322c26bf4ae428930899d7a5885e9f21 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -566,7 +566,7 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
-def CalculateCovolvedOutputDim(input_dim, filter_dim, stride, padding_type):
+def CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
   """Calculates the size of an output dimension of a strided convolution.
 
   Given the sizes of the corresponding dimension of the input and filter shapes,
@@ -827,10 +827,10 @@ class FusedConvInt8Tests(test.TestCase):
             maxval=1.0,
             dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
 
-    output_height = CalculateCovolvedOutputDim(input_height, filter_height,
-                                               vertical_stride, padding_type)
-    output_width = CalculateCovolvedOutputDim(input_width, filter_width,
-                                              horizontal_stride, padding_type)
+    output_height = CalculateConvolvedOutputDim(input_height, filter_height,
+                                                vertical_stride, padding_type)
+    output_width = CalculateConvolvedOutputDim(input_width, filter_width,
+                                               horizontal_stride, padding_type)
     print("output_height=", output_height, ", output_width=", output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 0eb0e3cbe20f5804db5476c08167d4e1c9080cfa..b305f37791d71f5a6edeada2bb710a2e5f23087d 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -354,6 +354,7 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
@@ -363,6 +364,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -544,15 +546,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 082c42eba180917e732bb7890129dfa94bf00fec..e3fc6bf0f034051fc33ff5966e2f4ea85aa538db 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -88,8 +88,8 @@ class GANEstimator(estimator.Estimator):
           discriminator_fn=discriminator_fn,
           generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
           discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
-          generator_optimizer=tf.train.AdamOptimizier(0.1, 0.5),
-          discriminator_optimizer=tf.train.AdamOptimizier(0.1, 0.5))
+          generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5))
 
       # Train estimator.
       gan_estimator.train(train_input_fn, steps)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index fdfabd07c13f689d075ecbb8786d725fa8a62d01..d914f549457a1e893ed43a3b8bc1ae5be7bb4303 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -44,11 +44,11 @@ from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import resource_loader
 
-
 __all__ = [
     'get_graph_def_from_disk',
     'get_graph_def_from_resource',
@@ -62,10 +62,11 @@ __all__ = [
     'frechet_inception_distance',
     'frechet_classifier_distance',
     'frechet_classifier_distance_from_activations',
+    'mean_only_frechet_classifier_distance_from_activations',
+    'diagonal_only_frechet_classifier_distance_from_activations',
     'INCEPTION_DEFAULT_IMAGE_SIZE',
 ]
 
-
 INCEPTION_URL = 'http://download.tensorflow.org/models/frozen_inception_v1_2015_12_05.tar.gz'
 INCEPTION_FROZEN_GRAPH = 'inceptionv1_for_inception_score.pb'
 INCEPTION_INPUT = 'Mul:0'
@@ -77,8 +78,7 @@ INCEPTION_DEFAULT_IMAGE_SIZE = 299
 def _validate_images(images, image_size):
   images = ops.convert_to_tensor(images)
   images.shape.with_rank(4)
-  images.shape.assert_is_compatible_with(
-      [None, image_size, image_size, None])
+  images.shape.assert_is_compatible_with([None, image_size, image_size, None])
   return images
 
 
@@ -109,9 +109,10 @@ def _symmetric_matrix_square_root(mat, eps=1e-10):
       math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
 
 
-def preprocess_image(
-    images, height=INCEPTION_DEFAULT_IMAGE_SIZE,
-    width=INCEPTION_DEFAULT_IMAGE_SIZE, scope=None):
+def preprocess_image(images,
+                     height=INCEPTION_DEFAULT_IMAGE_SIZE,
+                     width=INCEPTION_DEFAULT_IMAGE_SIZE,
+                     scope=None):
   """Prepare a batch of images for evaluation.
 
   This is the preprocessing portion of the graph from
@@ -272,8 +273,11 @@ def run_inception(images,
   return activations
 
 
-def run_image_classifier(tensor, graph_def, input_tensor,
-                         output_tensor, scope='RunClassifier'):
+def run_image_classifier(tensor,
+                         graph_def,
+                         input_tensor,
+                         output_tensor,
+                         scope='RunClassifier'):
   """Runs a network from a frozen graph.
 
   Args:
@@ -317,7 +321,7 @@ def classifier_score(images, classifier_fn, num_batches=1):
 
   NOTE: This function consumes images, computes their logits, and then
   computes the classifier score. If you would like to precompute many logits for
-  large batches, use clasifier_score_from_logits(), which this method also
+  large batches, use classifier_score_from_logits(), which this method also
   uses.
 
   Args:
@@ -433,8 +437,8 @@ def trace_sqrt_product(sigma, sigma_v):
   sqrt_sigma = _symmetric_matrix_square_root(sigma)
 
   # This is sqrt(A sigma_v A) above
-  sqrt_a_sigmav_a = math_ops.matmul(
-      sqrt_sigma, math_ops.matmul(sigma_v, sqrt_sigma))
+  sqrt_a_sigmav_a = math_ops.matmul(sqrt_sigma,
+                                    math_ops.matmul(sigma_v, sqrt_sigma))
 
   return math_ops.trace(_symmetric_matrix_square_root(sqrt_a_sigmav_a))
 
@@ -450,9 +454,9 @@ def frechet_classifier_distance(real_images,
 
   This technique is described in detail in https://arxiv.org/abs/1706.08500.
   Given two Gaussian distribution with means m and m_w and covariance matrices
-  C and C_w, this function calcuates
+  C and C_w, this function calculates
 
-  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+              |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
 
   which captures how different the distributions of real images and generated
   images (or more accurately, their visual features) are. Note that unlike the
@@ -463,7 +467,7 @@ def frechet_classifier_distance(real_images,
   Frechet distance is biased. It is more biased for small sample sizes. (e.g.
   even if the two distributions are the same, for a small sample size, the
   expected Frechet distance is large). It is important to use the same
-  sample size to compute frechet classifier distance when comparing two
+  sample size to compute Frechet classifier distance when comparing two
   generative models.
 
   NOTE: This function consumes images, computes their activations, and then
@@ -484,25 +488,25 @@ def frechet_classifier_distance(real_images,
     The Frechet Inception distance. A floating-point scalar of the same type
     as the output of `classifier_fn`.
   """
-
   real_images_list = array_ops.split(
       real_images, num_or_size_splits=num_batches)
   generated_images_list = array_ops.split(
       generated_images, num_or_size_splits=num_batches)
 
-  imgs = array_ops.stack(real_images_list + generated_images_list)
+  real_imgs = array_ops.stack(real_images_list)
+  generated_imgs = array_ops.stack(generated_images_list)
 
   # Compute the activations using the memory-efficient `map_fn`.
-  activations = functional_ops.map_fn(
-      fn=classifier_fn,
-      elems=imgs,
-      parallel_iterations=1,
-      back_prop=False,
-      swap_memory=True,
-      name='RunClassifier')
+  def compute_activations(elems):
+    return functional_ops.map_fn(fn=classifier_fn,
+                                 elems=elems,
+                                 parallel_iterations=1,
+                                 back_prop=False,
+                                 swap_memory=True,
+                                 name='RunClassifier')
 
-  # Split the activations by the real and generated images.
-  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)
+  real_a = compute_activations(real_imgs)
+  gen_a = compute_activations(generated_imgs)
 
   # Ensure the activations have the right shapes.
   real_a = array_ops.concat(array_ops.unstack(real_a), 0)
@@ -511,10 +515,142 @@ def frechet_classifier_distance(real_images,
   return frechet_classifier_distance_from_activations(real_a, gen_a)
 
 
-def frechet_classifier_distance_from_activations(
+def mean_only_frechet_classifier_distance_from_activations(
     real_activations, generated_activations):
   """Classifier distance for evaluating a generative model from activations.
 
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+                                |m - m_w|^2
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
+  In this variant, we only compute the difference between the means of the
+  fitted Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet
+  still retains much of the same information as FID.
+
+  Args:
+    real_activations: 2D array of activations of real images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+    generated_activations: 2D array of activations of generated images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+
+  Returns:
+    The mean-only Frechet Inception distance. A floating-point scalar of the
+    same type as the output of the activations.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
+
+  # Compute means of activations.
+  m = math_ops.reduce_mean(real_activations, 0)
+  m_w = math_ops.reduce_mean(generated_activations, 0)
+
+  # Next the distance between means.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
+  mofid = mean
+  if activations_dtype != dtypes.float64:
+    mofid = math_ops.cast(mofid, activations_dtype)
+
+  return mofid
+
+
+def diagonal_only_frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model.
+
+  This is based on the Frechet Inception distance, but for an arbitrary
+  classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images. In this variant, we compute diagonal-only covariance matrices.
+  As a result, instead of computing an expensive matrix square root, we can do
+  something much simpler, and has O(n) vs O(n^2) space complexity.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
+  Args:
+    real_activations: Real images to use to compute Frechet Inception distance.
+    generated_activations: Generated images to use to compute Frechet Inception
+      distance.
+
+  Returns:
+    The diagonal-only Frechet Inception distance. A floating-point scalar of
+    the same type as the output of the activations.
+
+  Raises:
+    ValueError: If the shape of the variance and mean vectors are not equal.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
+
+  # Compute mean and covariance matrices of activations.
+  m, var = nn_impl.moments(real_activations, axes=[0])
+  m_w, var_w = nn_impl.moments(generated_activations, axes=[0])
+
+  actual_shape = var.get_shape()
+  expected_shape = m.get_shape()
+
+  if actual_shape != expected_shape:
+    raise ValueError('shape: {} must match expected shape: {}'.format(
+        actual_shape, expected_shape))
+
+  # Compute the two components of FID.
+
+  # First the covariance component.
+  # Here, note that trace(A + B) = trace(A) + trace(B)
+  trace = math_ops.reduce_sum(
+      (var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w)))
+
+  # Next the distance between means.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
+  dofid = trace + mean
+  if activations_dtype != dtypes.float64:
+    dofid = math_ops.cast(dofid, activations_dtype)
+
+  return dofid
+
+
+def frechet_classifier_distance_from_activations(real_activations,
+                                                 generated_activations):
+  """Classifier distance for evaluating a generative model.
+
   This methods computes the Frechet classifier distance from activations of
   real images and generated images. This can be used independently of the
   frechet_classifier_distance() method, especially in the case of using large
@@ -523,15 +659,22 @@ def frechet_classifier_distance_from_activations(
 
   This technique is described in detail in https://arxiv.org/abs/1706.08500.
   Given two Gaussian distribution with means m and m_w and covariance matrices
-  C and C_w, this function calcuates
+  C and C_w, this function calculates
 
-  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
 
   which captures how different the distributions of real images and generated
   images (or more accurately, their visual features) are. Note that unlike the
   Inception score, this is a true distance and utilizes information about real
   world images.
 
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
   Args:
     real_activations: 2D Tensor containing activations of real data. Shape is
       [batch_size, activation_size].
@@ -553,36 +696,40 @@ def frechet_classifier_distance_from_activations(
 
   # Compute mean and covariance matrices of activations.
   m = math_ops.reduce_mean(real_activations, 0)
-  m_v = math_ops.reduce_mean(generated_activations, 0)
-  num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])
+  m_w = math_ops.reduce_mean(generated_activations, 0)
+  num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0])
+  num_examples_generated = math_ops.to_double(
+      array_ops.shape(generated_activations)[0])
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
   real_centered = real_activations - m
   sigma = math_ops.matmul(
-      real_centered, real_centered, transpose_a=True) / (num_examples - 1)
+      real_centered, real_centered, transpose_a=True) / (
+          num_examples_real - 1)
 
-  gen_centered = generated_activations - m_v
-  sigma_v = math_ops.matmul(
-      gen_centered, gen_centered, transpose_a=True) / (num_examples - 1)
+  gen_centered = generated_activations - m_w
+  sigma_w = math_ops.matmul(
+      gen_centered, gen_centered, transpose_a=True) / (
+          num_examples_generated - 1)
 
-  # Find the Tr(sqrt(sigma sigma_v)) component of FID
-  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)
+  # Find the Tr(sqrt(sigma sigma_w)) component of FID
+  sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)
 
   # Compute the two components of FID.
 
   # First the covariance component.
   # Here, note that trace(A + B) = trace(A) + trace(B)
-  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component
+  trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component
 
   # Next the distance between means.
-  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
   fid = trace + mean
   if activations_dtype != dtypes.float64:
     fid = math_ops.cast(fid, activations_dtype)
 
   return fid
 
-
 frechet_inception_distance = functools.partial(
     frechet_classifier_distance,
     classifier_fn=functools.partial(
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 61dc8646ddc10605561ae6b19e90f4739c346608..4fb8d58bc9125664d42260de72b83b2362eff9ba 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -22,6 +22,7 @@ import os
 import tarfile
 import tempfile
 
+from absl.testing import parameterized
 import numpy as np
 from scipy import linalg as scp_linalg
 
@@ -50,6 +51,26 @@ def _expected_inception_score(logits):
   return np.exp(np.mean(per_example_logincscore))
 
 
+def _expected_mean_only_fid(real_imgs, gen_imgs):
+  m = np.mean(real_imgs, axis=0)
+  m_v = np.mean(gen_imgs, axis=0)
+  mean = np.square(m - m_v).sum()
+  mofid = mean
+  return mofid
+
+
+def _expected_diagonal_only_fid(real_imgs, gen_imgs):
+  m = np.mean(real_imgs, axis=0)
+  m_v = np.mean(gen_imgs, axis=0)
+  var = np.var(real_imgs, axis=0)
+  var_v = np.var(gen_imgs, axis=0)
+  sqcc = np.sqrt(var * var_v)
+  mean = (np.square(m - m_v)).sum()
+  trace = (var + var_v - 2 * sqcc).sum()
+  dofid = mean + trace
+  return dofid
+
+
 def _expected_fid(real_imgs, gen_imgs):
   m = np.mean(real_imgs, axis=0)
   m_v = np.mean(gen_imgs, axis=0)
@@ -162,13 +183,20 @@ def _run_with_mock(function, *args, **kwargs):
     return function(*args, **kwargs)
 
 
-class ClassifierMetricsTest(test.TestCase):
+class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
 
-  def test_run_inception_graph(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph(self, use_default_graph_def):
     """Test `run_inception` graph construction."""
     batch_size = 7
     img = array_ops.ones([batch_size, 299, 299, 3])
-    logits = _run_with_mock(classifier_metrics.run_inception, img)
+
+    if use_default_graph_def:
+      logits = _run_with_mock(classifier_metrics.run_inception, img)
+    else:
+      logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
     self.assertTrue(isinstance(logits, ops.Tensor))
     logits.shape.assert_is_compatible_with([batch_size, 1001])
@@ -176,14 +204,23 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
-  def test_run_inception_graph_pool_output(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph_pool_output(self, use_default_graph_def):
     """Test `run_inception` graph construction with pool output."""
     batch_size = 3
     img = array_ops.ones([batch_size, 299, 299, 3])
-    pool = _run_with_mock(
-        classifier_metrics.run_inception,
-        img,
-        output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+
+    if use_default_graph_def:
+      pool = _run_with_mock(
+          classifier_metrics.run_inception,
+          img,
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+    else:
+      pool = classifier_metrics.run_inception(
+          img, _get_dummy_graphdef(),
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
     self.assertTrue(isinstance(pool, ops.Tensor))
     pool.shape.assert_is_compatible_with([batch_size, 2048])
@@ -285,6 +322,46 @@ class ClassifierMetricsTest(test.TestCase):
 
     self.assertAllClose(_expected_inception_score(logits), incscore_np)
 
+  def test_mean_only_frechet_classifier_distance_value(self):
+    """Test that `frechet_classifier_distance` gives the correct value."""
+    np.random.seed(0)
+
+    pool_real_a = np.float32(np.random.randn(256, 2048))
+    pool_gen_a = np.float32(np.random.randn(256, 2048))
+
+    tf_pool_real_a = array_ops.constant(pool_real_a)
+    tf_pool_gen_a = array_ops.constant(pool_gen_a)
+
+    mofid_op = classifier_metrics.mean_only_frechet_classifier_distance_from_activations(  # pylint: disable=line-too-long
+        tf_pool_real_a, tf_pool_gen_a)
+
+    with self.test_session() as sess:
+      actual_mofid = sess.run(mofid_op)
+
+    expected_mofid = _expected_mean_only_fid(pool_real_a, pool_gen_a)
+
+    self.assertAllClose(expected_mofid, actual_mofid, 0.0001)
+
+  def test_diagonal_only_frechet_classifier_distance_value(self):
+    """Test that `frechet_classifier_distance` gives the correct value."""
+    np.random.seed(0)
+
+    pool_real_a = np.float32(np.random.randn(256, 2048))
+    pool_gen_a = np.float32(np.random.randn(256, 2048))
+
+    tf_pool_real_a = array_ops.constant(pool_real_a)
+    tf_pool_gen_a = array_ops.constant(pool_gen_a)
+
+    dofid_op = classifier_metrics.diagonal_only_frechet_classifier_distance_from_activations(  # pylint: disable=line-too-long
+        tf_pool_real_a, tf_pool_gen_a)
+
+    with self.test_session() as sess:
+      actual_dofid = sess.run(dofid_op)
+
+    expected_dofid = _expected_diagonal_only_fid(pool_real_a, pool_gen_a)
+
+    self.assertAllClose(expected_dofid, actual_dofid, 0.0001)
+
   def test_frechet_classifier_distance_value(self):
     """Test that `frechet_classifier_distance` gives the correct value."""
     np.random.seed(0)
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
index 9bebcacbe46d85fc4226c4275b71b3ecbde57a97..4b1105f6bd4f21a0da02338b0fc9db87a41b145f 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -161,7 +161,7 @@ def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
     proj = random_ops.random_normal(
         [array_ops.shape(a)[1], random_projection_dim])
     proj *= math_ops.rsqrt(
-        math_ops.reduce_sum(math_ops.square(proj), 0, keep_dims=True))
+        math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True))
     # Project both distributions and sort them.
     proj_a = math_ops.matmul(a, proj)
     proj_b = math_ops.matmul(b, proj)
@@ -212,7 +212,7 @@ def sliced_wasserstein_distance(real_images,
   Args:
       real_images: (tensor) Real images (batch, height, width, channels).
       fake_images: (tensor) Fake images (batch, height, width, channels).
-      resolution_min: (int) Minimum resolution for the Laplacion pyramid.
+      resolution_min: (int) Minimum resolution for the Laplacian pyramid.
       patches_per_image: (int) Number of patches to extract per image per
         Laplacian level.
       patch_size: (int) Width of a square patch.
@@ -221,7 +221,7 @@ def sliced_wasserstein_distance(real_images,
       use_svd: experimental method to compute a more accurate distance.
   Returns:
       List of tuples (distance_real, distance_fake) for each level of the
-      Laplacian pyramid from the highest resoluion to the lowest.
+      Laplacian pyramid from the highest resolution to the lowest.
         distance_real is the Wasserstein distance between real images
         distance_fake is the Wasserstein distance between real and fake images.
   Raises:
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 0d1afad72da8a8e087239868e25ddebe23490d1e..508f487722fba89cc8391a340f73673a526e86c4 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -31,6 +31,7 @@ __all__ = [
     'add_image_comparison_summaries',
     'add_gan_model_summaries',
     'add_regularization_loss_summaries',
+    'add_cyclegan_image_summaries',
 ]
 
 
@@ -51,14 +52,9 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     ValueError: If real and generated data aren't images.
   """
   if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_y2x, **saved_params)
-    return
-
+    raise ValueError(
+        '`add_gan_model_image_summaries` does not take CycleGANModels. Please '
+        'use `add_cyclegan_image_summaries` instead.')
   _assert_is_image(gan_model.real_data)
   _assert_is_image(gan_model.generated_data)
 
@@ -89,6 +85,49 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     add_gan_model_summaries(gan_model)
 
 
+def add_cyclegan_image_summaries(cyclegan_model):
+  """Adds image summaries for CycleGAN.
+
+  There are two summaries, one for each generator. The first image is the
+  generator input, the second is the generator output, and the third is G(F(x)).
+
+  Args:
+    cyclegan_model: A CycleGANModel tuple.
+
+  Raises:
+    ValueError: If `cyclegan_model` isn't a CycleGANModel.
+    ValueError: If generated data, generator inputs, and reconstructions aren't
+      images.
+    ValueError: If the generator input, generated data, and reconstructions
+      aren't all the same size.
+  """
+  if not isinstance(cyclegan_model, namedtuples.CycleGANModel):
+    raise ValueError('`cyclegan_model` was not a CycleGANModel. Instead, was '
+                     '%s' % type(cyclegan_model))
+
+  _assert_is_image(cyclegan_model.model_x2y.generator_inputs)
+  _assert_is_image(cyclegan_model.model_x2y.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_x)
+  _assert_is_image(cyclegan_model.model_y2x.generator_inputs)
+  _assert_is_image(cyclegan_model.model_y2x.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_y)
+
+  def _add_comparison_summary(gan_model, reconstructions):
+    image_list = (array_ops.unstack(gan_model.generator_inputs[:1]) +
+                  array_ops.unstack(gan_model.generated_data[:1]) +
+                  array_ops.unstack(reconstructions[:1]))
+    summary.image(
+        'image_comparison', eval_utils.image_reshaper(
+            image_list, num_cols=len(image_list)), max_outputs=1)
+
+  with ops.name_scope('x2y_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_x2y, cyclegan_model.reconstructed_x)
+  with ops.name_scope('y2x_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_y2x, cyclegan_model.reconstructed_y)
+
+
 def add_image_comparison_summaries(gan_model, num_comparisons=2,
                                    display_diffs=False):
   """Adds image summaries to compare triplets of images.
@@ -109,15 +148,6 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2,
     ValueError: If the generator input, real, and generated data aren't all the
       same size.
   """
-  if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_y2x, **saved_params)
-    return
-
   _assert_is_image(gan_model.generator_inputs)
   _assert_is_image(gan_model.generated_data)
   _assert_is_image(gan_model.real_data)
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 45eb108586bed07434ac29595164745eac6054c1..33d51bfc218ab93fb52439b1eefed98a4568c4a1 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -65,15 +65,14 @@ def get_cyclegan_model():
   return namedtuples.CycleGANModel(
       model_x2y=model_x2y,
       model_y2x=model_y2x,
-      reconstructed_x=array_ops.zeros([3, 30, 35, 6]),
-      reconstructed_y=array_ops.zeros([3, 30, 35, 6]))
+      reconstructed_x=array_ops.zeros([4, 32, 32, 3]),
+      reconstructed_y=array_ops.zeros([4, 32, 32, 3]))
 
 
 class SummariesTest(test.TestCase):
 
-  def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
-                                               expected_num_summary_ops,
-                                               model_summaries):
+  def _test_add_gan_model_image_summaries_impl(
+      self, get_model_fn, expected_num_summary_ops, model_summaries):
     summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
                                             model_summaries=model_summaries)
 
@@ -89,8 +88,9 @@ class SummariesTest(test.TestCase):
   def test_add_gan_model_image_summaries_no_model(self):
     self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False)
 
-  def test_add_gan_model_image_summaries_for_cyclegan(self):
-    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10, True)
+  def test_cyclegan_image_summaries_dont_work(self):
+    with self.assertRaises(ValueError):
+      summaries.add_gan_model_image_summaries(get_cyclegan_model())
 
   def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                          expected_num_summary_ops):
@@ -137,7 +137,11 @@ class SummariesTest(test.TestCase):
     self._test_add_image_comparison_summaries_impl(get_gan_model, 1)
 
   def test_add_image_comparison_summaries_for_cyclegan(self):
-    self._test_add_image_comparison_summaries_impl(get_cyclegan_model, 2)
+    summaries.add_cyclegan_image_summaries(get_cyclegan_model())
+
+    self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+    with self.test_session(use_gpu=True):
+      summary.merge_all().eval()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
index cd31c62667fc048b1003d334377405b284f32af5..e2594faf85bcf91cbe09f266e4d4211d20bdee17 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Miscellanous utilities for TFGAN code and examples.
+"""Miscellaneous utilities for TFGAN code and examples.
 
 Includes:
 1) Conditioning the value of a Tensor, based on techniques from
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
index 4cfae0de4451880cf8229903b0eb74b1c6e2e04d..9e4ec59e7098443efc53506a4ba159e84b5c1618 100644
--- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
@@ -17,7 +17,7 @@
 We use this to keep a history of values created by a generator, such that
 a discriminator can randomly be trained on some older samples, not just the
 current one. This can help to not let the discriminator get too far ahead of the
-generator and also to keep the system from oscilating, if the discriminator
+generator and also to keep the system from oscillating, if the discriminator
 forgets too fast what past samples from the generator looked like.
 
 See the following papers for more details.
@@ -97,7 +97,7 @@ def tensor_pool(input_values,
         dtypes=[v.dtype for v in input_values],
         shapes=None)
 
-    # In pseudeo code this code does the following:
+    # In pseudo code this code does the following:
     # if not pool_full:
     #   enqueue(input_values)
     #   return input_values
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
index f8b372546b60ec8fa5fd1d72b57adaf67596c059..650eab97a3952e9aec2b489fffcc83c3bc49f2dd 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
@@ -64,11 +64,11 @@ def _statistics(x, axes):
   y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
 
   # Compute true mean while keeping the dims for proper broadcasting.
-  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keep_dims=True))
+  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))
 
-  shifted_mean = math_ops.reduce_mean(y - shift, axes, keep_dims=True)
+  shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True)
   mean = shifted_mean + shift
-  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keep_dims=True)
+  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)
 
   mean = array_ops.squeeze(mean, axes)
   mean_squared = array_ops.squeeze(mean_squared, axes)
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
index 845f89827b6e60eda41a55a80671f43460247b05..2fe06a287284ff994326d5a977a2e4d4634268ae 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
@@ -148,7 +148,7 @@ class VirtualBatchnormTest(test.TestCase):
       self.assertAllClose(bn_np[i, ...], vb_np)
 
   def test_minibatch_independent(self):
-    """Test that virtual batch normalized exampels are independent.
+    """Test that virtual batch normalized examples are independent.
 
     Unlike batch normalization, virtual batch normalization has the property
     that the virtual batch normalized value of an example is independent of the
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 39588b7219ebac1cc4855532be3fcc38e6381134..1ba3a641671c7f2a411a0c5f99228ca16eee1080 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -306,6 +306,7 @@ def wasserstein_gradient_penalty(
     discriminator_scope,
     epsilon=1e-10,
     target=1.0,
+    one_sided=False,
     weights=1.0,
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -327,6 +328,8 @@ def wasserstein_gradient_penalty(
       computing the gradient norm.
     target: Optional Python number or `Tensor` indicating the target value of
       gradient norm. Defaults to 1.0.
+    one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894
+      is used. Defaults to `False`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `real_data` and `generated_data`, and must be broadcastable to
       them (i.e., all dimensions must be either `1`, or the same as the
@@ -377,10 +380,13 @@ def wasserstein_gradient_penalty(
     # For numerical stability, add epsilon to the sum before taking the square
     # root. Note tf.norm does not add epsilon.
     slopes = math_ops.sqrt(gradient_squares + epsilon)
-    penalties = math_ops.square(slopes / target - 1.0)
+    penalties = slopes / target - 1.0
+    if one_sided:
+      penalties = math_ops.maximum(0., penalties)
+    penalties_squared = math_ops.square(penalties)
     penalty = losses.compute_weighted_loss(
-        penalties, weights, scope=scope, loss_collection=loss_collection,
-        reduction=reduction)
+        penalties_squared, weights, scope=scope,
+        loss_collection=loss_collection, reduction=reduction)
 
     if add_summaries:
       summary.scalar('gradient_penalty_loss', penalty)
@@ -665,7 +671,7 @@ def least_squares_discriminator_loss(
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
     add_summaries=False):
-  """Least squares generator loss.
+  """Least squares discriminator loss.
 
   This loss comes from `Least Squares Generative Adversarial Networks`
   (https://arxiv.org/abs/1611.04076).
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index dbaa624ae9d6a5a5949db692e52c0c1deb18b8df..2889e937436d2faa66b5693c19046e122cbaf652 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -481,6 +481,28 @@ class GradientPenaltyTest(test.TestCase, _PenaltyTest):
                       })
       self.assertAlmostEqual(self._expected_loss, loss, 5)
 
+  def test_loss_using_one_sided_mode(self):
+    generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+    real_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+
+    loss = tfgan_losses.wasserstein_gradient_penalty(
+        generated_data,
+        real_data,
+        self._kwargs['generator_inputs'],
+        self._kwargs['discriminator_fn'],
+        self._kwargs['discriminator_scope'],
+        one_sided=True)
+    self.assertEqual(generated_data.dtype, loss.dtype)
+
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      loss = sess.run(loss,
+                      feed_dict={
+                          generated_data: self._generated_data_np,
+                          real_data: self._real_data_np,
+                      })
+      self.assertAlmostEqual(self._expected_loss, loss, 5)
+
   def test_loss_with_gradient_norm_target(self):
     """Test loss value with non default gradient norm target."""
     generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 776eb11ecb1624544d24611d8fe6ca19768b8313..6fa43059f3125daea080f780210223363d0a89f9 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -461,6 +461,7 @@ def gan_loss(
     gradient_penalty_weight=None,
     gradient_penalty_epsilon=1e-10,
     gradient_penalty_target=1.0,
+    gradient_penalty_one_sided=False,
     mutual_information_penalty_weight=None,
     aux_cond_generator_weight=None,
     aux_cond_discriminator_weight=None,
@@ -485,6 +486,8 @@ def gan_loss(
     gradient_penalty_target: If `gradient_penalty_weight` is not None, a Python
       number or `Tensor` indicating the target value of gradient norm. See the
       CIFAR10 section of https://arxiv.org/abs/1710.10196. Defaults to 1.0.
+    gradient_penalty_one_sided: If `True`, penalty proposed in
+      https://arxiv.org/abs/1709.08894 is used. Defaults to `False`.
     mutual_information_penalty_weight: If not `None`, must be a non-negative
       Python number or Tensor indicating how much to weight the mutual
       information penalty. See https://arxiv.org/abs/1606.03657 for more
@@ -546,6 +549,7 @@ def gan_loss(
         model,
         epsilon=gradient_penalty_epsilon,
         target=gradient_penalty_target,
+        one_sided=gradient_penalty_one_sided,
         add_summaries=add_summaries)
     dis_loss += gradient_penalty_weight * gp_loss
   if _use_aux_loss(mutual_information_penalty_weight):
@@ -706,7 +710,10 @@ def gan_train_ops(
     be used to train a generator/discriminator pair.
   """
   if isinstance(model, namedtuples.CycleGANModel):
-    saved_params = locals()
+    # Get and store all arguments other than model and loss from locals.
+    # Contents of locals should not be modified, may not affect values. So make
+    # a copy. https://docs.python.org/2/library/functions.html#locals.
+    saved_params = dict(locals())
     saved_params.pop('model', None)
     saved_params.pop('loss', None)
     kwargs = saved_params.pop('kwargs', {})
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index f9bdaa74c948ecee11d5cfd89f06087924f8dace..3ebbe55d059e5e72607bc4efdbf95a6c96d99f11 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -359,10 +359,12 @@ class GANLossTest(test.TestCase):
     self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
 
   # Test gradient penalty option.
-  def _test_grad_penalty_helper(self, create_gan_model_fn):
+  def _test_grad_penalty_helper(self, create_gan_model_fn, one_sided=False):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
-    loss_gp = train.gan_loss(model, gradient_penalty_weight=1.0)
+    loss_gp = train.gan_loss(model,
+                             gradient_penalty_weight=1.0,
+                             gradient_penalty_one_sided=one_sided)
     self.assertTrue(isinstance(loss_gp, namedtuples.GANLoss))
 
     # Check values.
@@ -394,6 +396,25 @@ class GANLossTest(test.TestCase):
   def test_grad_penalty_callable_acgan(self):
     self._test_grad_penalty_helper(create_callable_acgan_model)
 
+  def test_grad_penalty_one_sided_gan(self):
+    self._test_grad_penalty_helper(create_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_gan(self):
+    self._test_grad_penalty_helper(create_callable_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_infogan(self):
+    self._test_grad_penalty_helper(create_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_infogan(self):
+    self._test_grad_penalty_helper(
+        create_callable_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_acgan(self):
+    self._test_grad_penalty_helper(create_acgan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_acgan(self):
+    self._test_grad_penalty_helper(create_callable_acgan_model, one_sided=True)
+
   # Test mutual information penalty option.
   def _test_mutual_info_penalty_helper(self, create_gan_model_fn):
     train.gan_loss(create_gan_model_fn(),
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index 707ae25d485c64f15694ee0e357f32b619d3cd33..e534fdc17749974ebe713c2730682bea6d7a85e4 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -9,18 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 28f68cec8cce126f1b177a73e197ccd7ab749f4a..94f522c04e5a09ed2d9355fa675125c340407923 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -155,7 +155,7 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
     }
 
     Device* dst_device;
-    Status s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    Status s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
     if (!s.ok()) {
       sess->worker_cache->ReleaseWorker(src_worker, rwi);
       done(s, Args(), recv_args, Tensor{}, false);
diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD
index 967ad2fc090906e93f22c777816eede37f9a1b04..1711100e3a857dba0d15c5b4f6c96cddc568e800 100644
--- a/tensorflow/contrib/graph_editor/BUILD
+++ b/tensorflow/contrib/graph_editor/BUILD
@@ -39,18 +39,6 @@ py_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "match",
     srcs = ["tests/match.py"],
diff --git a/tensorflow/contrib/graph_editor/reroute.py b/tensorflow/contrib/graph_editor/reroute.py
index 7ffdbb7139281734917fdb715601b317eb58b82f..95c02a64d47c26e731ef2628fb551529e9bc3f4d 100644
--- a/tensorflow/contrib/graph_editor/reroute.py
+++ b/tensorflow/contrib/graph_editor/reroute.py
@@ -471,9 +471,10 @@ def remove_control_inputs(op, cops):
     if cop not in op.control_inputs:
       raise ValueError("{} is not a control_input of {}".format(op.name,
                                                                 cop.name))
+  control_inputs = [cop for cop in op.control_inputs if cop not in cops]
   # pylint: disable=protected-access
-  op._control_inputs = [cop for cop in op._control_inputs if cop not in cops]
-  op._recompute_node_def()
+  op._remove_all_control_inputs()
+  op._add_control_inputs(control_inputs)
   # pylint: enable=protected-access
 
 
@@ -496,9 +497,6 @@ def add_control_inputs(op, cops):
     if cop in op.control_inputs:
       raise ValueError("{} is already a control_input of {}".format(cop.name,
                                                                     op.name))
-  # pylint: disable=protected-access
-  op._control_inputs += cops
-  op._recompute_node_def()
-  # pylint: enable=protected-access
+  op._add_control_inputs(cops)  # pylint: disable=protected-access
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/graph_editor/select.py b/tensorflow/contrib/graph_editor/select.py
index 3ea6ff4d6163b107ca0daaf3b9ad1daf0ccc1f6f..d700e6e1a7523622f845acbbc353eb0f438c9bc2 100644
--- a/tensorflow/contrib/graph_editor/select.py
+++ b/tensorflow/contrib/graph_editor/select.py
@@ -383,6 +383,7 @@ def get_within_boundary_ops(ops,
 def get_forward_walk_ops(seed_ops,
                          inclusive=True,
                          within_ops=None,
+                         within_ops_fn=None,
                          stop_at_ts=(),
                          control_outputs=None):
   """Do a forward graph walk and return all the visited ops.
@@ -395,6 +396,9 @@ def get_forward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_outputs: a `util.ControlOutputs` instance or None.
       If not `None`, it will be used while walking the graph forward.
@@ -423,7 +427,8 @@ def get_forward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -450,6 +455,7 @@ def get_forward_walk_ops(seed_ops,
 def get_backward_walk_ops(seed_ops,
                           inclusive=True,
                           within_ops=None,
+                          within_ops_fn=None,
                           stop_at_ts=(),
                           control_inputs=False):
   """Do a backward graph walk and return all the visited ops.
@@ -462,6 +468,9 @@ def get_backward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_inputs: if True, control inputs will be used while moving backward.
   Returns:
@@ -488,7 +497,8 @@ def get_backward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -516,6 +526,7 @@ def get_walks_intersection_ops(forward_seed_ops,
                                forward_inclusive=True,
                                backward_inclusive=True,
                                within_ops=None,
+                               within_ops_fn=None,
                                control_inputs=False,
                                control_outputs=None,
                                control_ios=None):
@@ -535,6 +546,9 @@ def get_walks_intersection_ops(forward_seed_ops,
     within_ops: an iterable of tf.Operation within which the search is
       restricted. If within_ops is None, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -555,11 +569,13 @@ def get_walks_intersection_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return [op for op in forward_ops if op in backward_ops]
 
@@ -569,6 +585,7 @@ def get_walks_union_ops(forward_seed_ops,
                         forward_inclusive=True,
                         backward_inclusive=True,
                         within_ops=None,
+                        within_ops_fn=None,
                         control_inputs=False,
                         control_outputs=None,
                         control_ios=None):
@@ -587,6 +604,9 @@ def get_walks_union_ops(forward_seed_ops,
       resulting set.
     within_ops: restrict the search within those operations. If within_ops is
       None, the search is done within the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -607,11 +627,13 @@ def get_walks_union_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return util.concatenate_unique(forward_ops, backward_ops)
 
diff --git a/tensorflow/contrib/graph_editor/tests/select_test.py b/tensorflow/contrib/graph_editor/tests/select_test.py
index 82f999637d0c1866a5a329974f021fe2e30fd33f..d12c6d3cbd11dde2b609a59154297a8907b0cadc 100644
--- a/tensorflow/contrib/graph_editor/tests/select_test.py
+++ b/tensorflow/contrib/graph_editor/tests/select_test.py
@@ -77,12 +77,10 @@ class SelectTest(test.TestCase):
     """Test for ge.get_ops_ios."""
     control_outputs = ge.util.ControlOutputs(self.graph)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.h.op, control_ios=control_outputs)), 3)
+        len(ge.get_ops_ios(self.h.op, control_ios=control_outputs)), 3)
     self.assertEqual(len(ge.get_ops_ios(self.h.op)), 2)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.c.op, control_ios=control_outputs)), 6)
+        len(ge.get_ops_ios(self.c.op, control_ios=control_outputs)), 6)
     self.assertEqual(len(ge.get_ops_ios(self.c.op)), 5)
 
   def test_compute_boundary_ts_0(self):
@@ -135,16 +133,49 @@ class SelectTest(test.TestCase):
     ops = ge.get_walks_intersection_ops([self.c.op], [self.g.op])
     self.assertEqual(len(ops), 2)
 
+    ops = ge.get_walks_intersection_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.a.op in ops)
+    self.assertTrue(self.c.op in ops)
+    self.assertTrue(self.f.op in ops)
+
+    within_ops = [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops=within_ops)
+    self.assertEqual(len(ops), 0)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 0)
+
   def test_get_walks_union(self):
     """Test for ge.get_walks_union_ops."""
     ops = ge.get_walks_union_ops([self.f.op], [self.g.op])
     self.assertEqual(len(ops), 6)
 
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 8)
+
+    within_ops = [self.a.op, self.c.op, self.d.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops=within_ops)
+    self.assertEqual(len(ops), 4)
+    self.assertTrue(self.b.op not in ops)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.c.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.b.op not in ops)
+    self.assertTrue(self.d.op not in ops)
+
   def test_select_ops(self):
     parameters = (
         (("^foo/",), 7),
         (("^foo/bar/",), 4),
-        (("^foo/bar/", "a"), 5),)
+        (("^foo/bar/", "a"), 5),
+    )
     for param, length in parameters:
       ops = ge.select_ops(*param, graph=self.graph)
       self.assertEqual(len(ops), length)
@@ -152,7 +183,8 @@ class SelectTest(test.TestCase):
   def test_select_ts(self):
     parameters = (
         (".*:0", 8),
-        (r".*/bar/\w+:0", 4),)
+        (r".*/bar/\w+:0", 4),
+    )
     for regex, length in parameters:
       ts = ge.select_ts(regex, graph=self.graph)
       self.assertEqual(len(ts), length)
@@ -160,12 +192,121 @@ class SelectTest(test.TestCase):
   def test_select_ops_and_ts(self):
     parameters = (
         (("^foo/.*",), 7, 0),
-        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),)
+        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),
+    )
     for param, l0, l1 in parameters:
       ops, ts = ge.select_ops_and_ts(*param, graph=self.graph)
       self.assertEqual(len(ops), l0)
       self.assertEqual(len(ts), l1)
 
+  def test_forward_walk_ops(self):
+    seed_ops = [self.a.op, self.d.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.e.op.
+    within_ops_fn = lambda op: op not in (self.e.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # No b.op since it's an independent source node.
+      # No g.op from within_ops.
+      # No e.op from within_ops_fn.
+      # No h.op from stop_at_ts and within_ops.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(
+          set(ops), set([self.a.op, self.c.op, self.d.op, self.f.op]))
+
+      # Also no a.op and d.op when inclusive=False
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.f.op]))
+
+      # Not using within_ops_fn adds e.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.e.op, self.f.op]))
+
+      # Not using stop_at_ts adds back h.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops, inclusive=False, within_ops=within_ops)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.h.op]))
+
+      # Starting just form a (the tensor, not op) omits a, b, d.
+      ops = ge.select.get_forward_walk_ops([self.a], inclusive=True)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.g.op,
+                         self.h.op]))
+
+  def test_backward_walk_ops(self):
+    seed_ops = [self.h.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.c.op.
+    within_ops_fn = lambda op: op not in (self.c.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # Backward walk only includes h since we stop at f and g is not within.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.h.op]))
+
+      # If we do inclusive=False, the result is empty.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set())
+
+      # Removing stop_at_fs adds f.op, d.op.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn)
+      self.assertEqual(set(ops), set([self.d.op, self.f.op, self.h.op]))
+
+      # Not using within_ops_fn adds back ops for a, b, c.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops, inclusive=True, within_ops=within_ops)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.h.op
+          ]))
+
+      # Vanially backward search via self.h.op includes everything excpet e.op.
+      ops = ge.select.get_backward_walk_ops(seed_ops, inclusive=True)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.g.op,
+              self.h.op
+          ]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index ca00394388f67e2ed9508684a47b23c3ee9e79e8..97f38c923f4a19cedf3e16203ca1e66b7e5e45d2 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -18,11 +18,14 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import numpy as np
 from tensorflow.contrib import graph_editor as ge
 from tensorflow.contrib.graph_editor.tests import match
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -41,6 +44,7 @@ class TransformTest(test.TestCase):
     self.graph = ops.Graph()
     with self.graph.as_default():
       c0 = constant_op.constant(1.0, shape=[10], name="Const")
+      c0.op._set_attr("_foo", attr_value_pb2.AttrValue(s=b"foo"))
       c1 = constant_op.constant(1.0, shape=[10], name="Const")
       c2 = constant_op.constant(1.0, shape=[10], name="Const")
       i = constant_op.constant(1.0, shape=[10], name="Input")
@@ -84,9 +88,9 @@ class TransformTest(test.TestCase):
   def test_transform(self):
     transformer = ge.Transformer()
 
-    def my_transform_op_handler(info, op):
+    def my_transform_op_handler(info, op, new_inputs):
       add_noise = op.name.startswith("Add")
-      op_, op_outputs_ = ge.transform.copy_op_handler(info, op)
+      op_, op_outputs_ = ge.transform.copy_op_handler(info, op, new_inputs)
       if not add_noise:
         return op_, op_outputs_
       # add some noise to op
@@ -111,6 +115,32 @@ class TransformTest(test.TestCase):
     top = ge.select_ops("^AddNoise_2$", graph=graph)[0]
     self.assertTrue(matcher2(top))
 
+  def test_transform_nodedef_fn(self):
+    transformer = ge.Transformer()
+
+    def nodedef_fn(node_def):
+      if "_foo" in node_def.attr:
+        del node_def.attr["_foo"]
+      node_def.attr["_bar"].s = b"bar"
+      return node_def
+
+    my_copy_op_handler = functools.partial(
+        ge.transform.copy_op_handler, nodedef_fn=nodedef_fn)
+    transformer.transform_op_handler = my_copy_op_handler
+
+    graph = ops.Graph()
+    transformer(self.graph, graph, "", "")
+
+    c0_before = self.graph.get_operation_by_name("Const")
+    c0_after = graph.get_operation_by_name("Const")
+    self.assertEquals(c0_before.get_attr("_foo"), b"foo")
+    with self.assertRaises(ValueError):
+      c0_after.get_attr("_foo")
+
+    all_ops = graph.get_operations()
+    for op in all_ops:
+      self.assertEquals(op.get_attr("_bar"), b"bar")
+
   def test_copy_with_input_replacements(self):
     with self.graph.as_default():
       ten = constant_op.constant(10.0, shape=[10], name="Input")
@@ -201,15 +231,56 @@ class TransformTest(test.TestCase):
                         get_operation_by_name("res/grad/mul1_grad/Mul_1"))
 
     # Make sure _original_ops are as expected.
-    self.assertEquals(original_mul1_grad._original_op.name, u"mul1")
-    self.assertEquals(result_mul1_grad._original_op.name, u"res/mul1")
-    self.assertNotEquals(res.name, g.name)
+    self.assertEqual(original_mul1_grad._original_op.name, u"mul1")
+    self.assertEqual(result_mul1_grad._original_op.name, u"res/mul1")
+    self.assertNotEqual(res.name, g.name)
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       g_val, res_val = sess.run([g, res])
     self.assertNear(g_val, 0.0, ERROR_TOLERANCE)
     self.assertNear(res_val, 0.0, ERROR_TOLERANCE)
 
+  def test_graph_while_loop(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      max_index = array_ops.placeholder(dtype=dtypes.int32, shape=tuple())
+      index_start = constant_op.constant(1)
+      sum_start = constant_op.constant(0)
+      _, result = control_flow_ops.while_loop(
+          cond=lambda i, unused_s: i <= max_index,
+          body=lambda i, s: (i + 1, s + i),
+          loop_vars=[index_start, sum_start])
+    copied_graph = ops.Graph()
+    _, copy_info = ge.copy(
+        graph, dst_graph=copied_graph, dst_scope="imported")
+    copied_result = copy_info.transformed(result)
+    copied_max_index = copy_info.transformed(max_index)
+    with copied_graph.as_default():
+      with session.Session() as sess:
+        n = 10
+        sum_val = sess.run(copied_result, feed_dict={copied_max_index: n})
+        self.assertEqual(sum_val, 55)
+
+  def test_graph_cond(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      choice = array_ops.placeholder(shape=(), dtype=dtypes.bool)
+      result = control_flow_ops.cond(
+          choice,
+          lambda: constant_op.constant(1),
+          lambda: constant_op.constant(2))
+    copied_graph = ops.Graph()
+    _, copy_info = ge.copy(
+        graph, dst_graph=copied_graph, dst_scope="imported")
+    copied_result = copy_info.transformed(result)
+    copied_choice = copy_info.transformed(choice)
+    with copied_graph.as_default():
+      with session.Session() as sess:
+        res = sess.run(copied_result, feed_dict={copied_choice: True})
+        self.assertEqual(res, 1)
+        res = sess.run(copied_result, feed_dict={copied_choice: False})
+        self.assertEqual(res, 2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 14ac5296657d48c7f9e94d220c9e7e28af4d4353..a320a3f232fc1dc8c9ccfd1d0f2a9a40225db5cb 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -129,36 +129,51 @@ def transform_op_if_inside_handler(info, op, keep_if_possible=True):
       return None
 
 
-def copy_op_handler(info, op, copy_shape=True):
+def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None):
   """Copy a `tf.Operation`.
 
   Args:
     info: Transform._TmpInfo instance.
     op: the `tf.Operation` to be copied.
+    new_inputs: The new inputs for this op.
     copy_shape: also copy the shape of the tensor
+    nodedef_fn: If provided, a function that will be run on the NodeDef
+      and should return a mutated NodeDef before a new Operation is created.
+      This is useful as certain features cannot be set on the Operation and
+      must be modified in NodeDef.
+
   Returns:
     A `(op, op_outputs)` tuple containing the transformed op and its outputs.
   """
+  # The `new_inputs` was added to this function. For compatibility reason,
+  # let's raise an error if `new_inputs` is a boolean.
+  if isinstance(new_inputs, bool):
+    raise TypeError("the `new_inputs` argument must be an iterable.")
+
   # pylint: disable=protected-access
 
   # Clone the node def:
-  node_def_ = deepcopy(op._node_def)
+  node_def_ = deepcopy(op.node_def)
 
   # Transform name:
   name_ = info.new_name(op.name)
   name_ = info.graph_.unique_name(name_)
   node_def_.name = name_
 
+  # Mutate NodeDef if requested:
+  if nodedef_fn is not None:
+    node_def_ = nodedef_fn(node_def_)
+
   # Copy the other inputs needed for initialization
   output_types_ = op._output_types[:]
   input_types_ = op._input_types[:]
 
   # Make a copy of the op_def too.
   # Its unique to every _type_ of Operation.
-  op_def_ = deepcopy(op._op_def)
+  op_def_ = deepcopy(op.op_def)
 
   # Initialize a new Operation instance
-  op_ = tf_ops.Operation(node_def_, info.graph_, [], output_types_,
+  op_ = tf_ops.Operation(node_def_, info.graph_, new_inputs, output_types_,
                          [], input_types_, None, op_def_)
 
   # copy the shape over
@@ -170,6 +185,7 @@ def copy_op_handler(info, op, copy_shape=True):
   # attribute to exist, we will create a dummy original_op first and then
   # later finalise it with the actual original_op when all the ops have
   # been copied.
+  # TODO(fkp): Stop worrying about _original_op and remove this code?
   if op._original_op:
     op_._original_op = op._original_op
 
@@ -328,6 +344,14 @@ class _TmpInfo(object):
                             for key in self.graph.get_all_collection_keys())
     self.cyclic_ops = []
     self.transform_original_op_handler = transform_op_if_inside_handler
+    # The graph is transformed op by op, in the same order the original ops
+    # were created. However, this is sometimes not possible due to cycles
+    # (i.e. while loops). So when the transformer creates a new op whose
+    # inputs do not exist yet, temporary placeholders are created and stored
+    # in this `tmp_cyclic_ts` container. During a second pass,
+    # those temporary tensors are replaced by the proper transformed tensors
+    # (see the function `_finalize_cycles`).
+    self.tmp_cyclic_ts = []
 
   def new_name(self, name):
     """Compute a destination name from a source name.
@@ -428,10 +452,10 @@ class Transformer(object):
 
     # Create temporary info used during this transform call
     info = _TmpInfo(sgv, dst_graph, dst_scope, src_scope)
-    info.transform_original_op_handler = self.transform_original_op_handler
 
     self._copy_ops(info)
-    self._connect_ops(info)
+    self._finalize_cycles(info)
+    self._connect_control_inputs(info)
 
     # Compute information about the transformation
     res_info = TransformerInfo(info)
@@ -440,10 +464,10 @@ class Transformer(object):
 
   def _copy_ops(self, info):
     """Copy ops without connecting them."""
-    for op in info.sgv.ops:
-      logging.debug("Copying op: %s", op.name)
-      # TODO(fkp): return a subgraph?
-      op_, op_outputs_ = self.transform_op_handler(info, op)
+    sorted_ops = sorted(info.sgv.ops, key=lambda op: op._id)  # pylint: disable=protected-access
+    for op in sorted_ops:
+      new_inputs = [self._transformed_t(info, t, op) for t in op.inputs]
+      op_, op_outputs_ = self.transform_op_handler(info, op, new_inputs)
       if op is op_:
         raise ValueError("In-place transformation not allowed.")
 
@@ -456,27 +480,36 @@ class Transformer(object):
         info.transformed_ts[op_output] = op_output_
         self.assign_collections_handler(info, op_output, op_output_)
 
-  def _connect_ops(self, info):
+  def _finalize_cycles(self, info):
+    """Reconnects the cyclic tensors."""
+    for t, tmp_t_, consumer_op in info.tmp_cyclic_ts:
+      if t not in info.transformed_ts:
+        raise ValueError("The tensor {} should be transformed by now.".format(
+            t.name))
+      if consumer_op not in info.transformed_ops:
+        raise ValueError("The op {} should be transformed by now.".format(
+            consumer_op.name))
+      t_ = info.transformed_ts[t]
+      consumer_op_ = info.transformed_ops[consumer_op]
+      t_index_ = list(consumer_op_.inputs).index(tmp_t_)
+      consumer_op_._update_input(t_index_, t_, update_dtype=False)  # pylint: disable=protected-access
+
+  def _connect_control_inputs(self, info):
     """Connect the previously copied ops."""
     for op in info.sgv.ops:
-      logging.debug("Finalizing op: %s", op.name)
+      logging.debug("Connecting control inputs of op: %s", op.name)
       op_ = info.transformed_ops[op]
 
-      # pylint: disable=protected-access
-      if op_.inputs:
-        raise ValueError("The newly transformed op should not have "
-                         "any inputs yet: {}".format(op_.name))
-      inputs_ = [self._transformed_t(info, t) for t in op.inputs]
-      for t in inputs_:
-        op_._add_input(t)
-
       # Finalize original op.
+      # TODO(fkp): Stop worrying about _original_op and remove this code?
+      # pylint: disable=protected-access
       if op._original_op:
-        original_op = info.transform_original_op_handler(info, op._original_op)
+        original_op = self.transform_original_op_handler(info, op._original_op)
         if original_op is None:
           logging.debug("Could not find original op for: %s", op_.name)
         else:
           op_._original_op = original_op
+      # pylint: enable=protected-access
 
       # Finalize control inputs:
       control_inputs_ = [self.transform_control_input_handler(info, ci)
@@ -525,19 +558,38 @@ class Transformer(object):
 
     return sgv_.remap(input_map_, output_map_)
 
-  def _transformed_t(self, info, t):
+  def _transformed_t(self, info, t, consumer_op):
     """Return tre transformed tensor of `t`."""
-    if t not in info.transformed_ts:
-      # If op is not in the subgraph.
-      if t in info.sgv_inputs_set:
-        # t is an input of the subgraph.
-        return self.transform_external_input_handler(info, t)
+    if t in info.transformed_ts:
+      # If op is in the subgraph, just return its transformed counterpart.
+      return info.transformed_ts[t]
+
+    if t in info.sgv_inputs_set:
+      # `t` is an input of the subgraph.
+      return self.transform_external_input_handler(info, t)
+    elif t.op in info.ops:
+      # `t` is an internal tensor but is not transformed yet because it
+      # belongs to a graph cycle.
+      logging.debug("Cyclic tensor: t.name = %s", t.name)
+      # Try to find an existing tensor we can use for now,
+      # otherwise create one. We'll rewire this later.
+      if consumer_op.type == "Merge":
+        first_input = consumer_op.inputs[0]
+        tmp_t_ = self._transformed_t(info, first_input, consumer_op)
+      elif t.op.type == "Enter":
+        enter_input = t.op.inputs[0]
+        tmp_t_ = self._transformed_t(info, enter_input, consumer_op)
       else:
-        # t is a hidden input of the subgraph.
-        return self.transform_external_hidden_input_handler(info, t)
+        with info.graph_.as_default():
+          tmp_t_ = util.make_placeholder_from_tensor(t, scope=info.scope_,
+                                                     prefix="geph_tmp")
+        logging.debug("Created temporary placeholder: %s.", tmp_t_.name)
+      # Register as temporary and return.
+      info.tmp_cyclic_ts.append((t, tmp_t_, consumer_op))
+      return tmp_t_
     else:
-      # If op is in the subgraph, just return its transformed.
-      return info.transformed_ts[t]
+      # `t` is a hidden input of the subgraph.
+      return self.transform_external_hidden_input_handler(info, t)
 
 
 def copy(sgv, dst_graph=None, dst_scope="", src_scope="",
@@ -624,6 +676,40 @@ def copy_with_input_replacements(sgv, replacement_ts,
       sgv, dst_graph, dst_scope, src_scope, reuse_dst_scope=reuse_dst_scope)
 
 
+def _add_control_flow_ops(ops, control_ios):
+  """Complete `ops` so that the tranformed graph is valid.
+
+  Partially copying a graph can lead to a malformed graph. For instance,
+  copying half of a while construct is likely to result in an invalid graph.
+  This function attempts to add missing ops so that the transformation result
+  in a valid graph.
+
+  Args:
+    ops: list of ops (modifed in-place).
+    control_ios: object created by a call to `util.ControlOutputs`.
+  """
+  # Find while contexts.
+  control_flow_contexts = set()
+  for op in ops:
+    cfc = op._control_flow_context  # pylint: disable=protected-access
+    if cfc:
+      control_flow_contexts.add(cfc)
+  # Find new ops.
+  new_ops = []
+  for cfc in control_flow_contexts:
+    if cfc.IsWhileContext():
+      new_ops += select.get_walks_intersection_ops(
+          [enter_t.op for enter_t in cfc.loop_enters],
+          [exit_t.op for exit_t in cfc.loop_exits],
+          control_ios=control_ios)
+  # Add new ops.
+  new_ops_set = set(new_ops)
+  ops_set = frozenset(ops)
+  for op in new_ops_set:
+    if op not in ops_set:
+      ops.append(op)
+
+
 def graph_replace(target_ts, replacement_ts, dst_scope="",
                   src_scope="", reuse_dst_scope=False):
   """Create a new graph which compute the targets from the replaced Tensors.
@@ -657,8 +743,13 @@ def graph_replace(target_ts, replacement_ts, dst_scope="",
                                           control_ios=control_ios)
   if not ops:
     raise ValueError("Targets and replacements are not connected!")
+
+  # Complete ops to avoid malformed control flow.
+  # TODO(fkp): Consider moving this function deeper (in the transformer?).
+  _add_control_flow_ops(ops, control_ios)
+
   # Create a copy of the relevant subgraph
-  _, info = copy_with_input_replacements(
+  unused_sgv_, info = copy_with_input_replacements(
       ops, replacement_ts, None, dst_scope, src_scope, reuse_dst_scope)
   # Return the transformed targets but keep the original if the transformed
   # counterpart cannot be found
diff --git a/tensorflow/contrib/graph_editor/util.py b/tensorflow/contrib/graph_editor/util.py
index 30bc33b9ee42ba78bc7307c67c0fc0af9f3356ef..584f4509ccc0aab30edc2be3bad7a9cb938d6e6a 100644
--- a/tensorflow/contrib/graph_editor/util.py
+++ b/tensorflow/contrib/graph_editor/util.py
@@ -38,6 +38,11 @@ __all__ = [
 ]
 
 
+# The graph editor sometimes need to create placeholders, they are named
+# "geph_*". "geph" stands for Graph-Editor PlaceHolder.
+_DEFAULT_PLACEHOLDER_PREFIX = "geph"
+
+
 def concatenate_unique(la, lb):
   """Add all the elements of `lb` to `la` if they are not there already.
 
@@ -405,7 +410,7 @@ def scope_basename(scope):
   return scope[slash + 1:]
 
 
-def placeholder_name(t=None, scope=None):
+def placeholder_name(t=None, scope=None, prefix=_DEFAULT_PLACEHOLDER_PREFIX):
   """Create placeholder name for the graph editor.
 
   Args:
@@ -413,6 +418,7 @@ def placeholder_name(t=None, scope=None):
       on
     scope: absolute scope with which to prefix the placeholder's name. None
       means that the scope of t is preserved. "" means the root scope.
+    prefix: placeholder name prefix.
   Returns:
     A new placeholder name prefixed by "geph". Note that "geph" stands for
       Graph Editor PlaceHolder. This convention allows to quickly identify the
@@ -430,19 +436,20 @@ def placeholder_name(t=None, scope=None):
     if scope is None:
       scope = op_dirname
 
-    if op_basename.startswith("geph__"):
+    if op_basename.startswith("{}__".format(prefix)):
       ph_name = op_basename
     else:
-      ph_name = "geph__{}_{}".format(op_basename, t.value_index)
+      ph_name = "{}__{}_{}".format(prefix, op_basename, t.value_index)
 
     return scope + ph_name
   else:
     if scope is None:
       scope = ""
-    return scope + "geph"
+    return "{}{}".format(scope, prefix)
 
 
-def make_placeholder_from_tensor(t, scope=None):
+def make_placeholder_from_tensor(t, scope=None,
+                                 prefix=_DEFAULT_PLACEHOLDER_PREFIX):
   """Create a `tf.placeholder` for the Graph Editor.
 
   Note that the correct graph scope must be set by the calling function.
@@ -452,17 +459,19 @@ def make_placeholder_from_tensor(t, scope=None):
       (see function placeholder_name).
     scope: absolute scope within which to create the placeholder. None
       means that the scope of `t` is preserved. `""` means the root scope.
+    prefix: placeholder name prefix.
   Returns:
     A newly created `tf.placeholder`.
   Raises:
     TypeError: if `t` is not `None` or a `tf.Tensor`.
   """
   return tf_array_ops.placeholder(
-      dtype=t.dtype, shape=t.get_shape(), name=placeholder_name(
-          t, scope=scope))
+      dtype=t.dtype, shape=t.get_shape(),
+      name=placeholder_name(t, scope=scope, prefix=prefix))
 
 
-def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None):
+def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None,
+                                          prefix=_DEFAULT_PLACEHOLDER_PREFIX):
   """Create a tf.placeholder for the Graph Editor.
 
   Note that the correct graph scope must be set by the calling function.
@@ -474,11 +483,13 @@ def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None):
     shape: the tensor shape (optional).
     scope: absolute scope within which to create the placeholder. None
       means that the scope of t is preserved. "" means the root scope.
+    prefix: placeholder name prefix.
   Returns:
     A newly created tf.placeholder.
   """
   return tf_array_ops.placeholder(
-      dtype=dtype, shape=shape, name=placeholder_name(scope=scope))
+      dtype=dtype, shape=shape,
+      name=placeholder_name(scope=scope, prefix=prefix))
 
 
 _INTERNAL_VARIABLE_RE = re.compile(r"^__\w+__$")
diff --git a/tensorflow/contrib/grid_rnn/BUILD b/tensorflow/contrib/grid_rnn/BUILD
index d601a1ec6f7a219bcd461d819ab2dfc64135a3ae..d0b44640667010b58c017d933d50ae5f87e8b275 100644
--- a/tensorflow/contrib/grid_rnn/BUILD
+++ b/tensorflow/contrib/grid_rnn/BUILD
@@ -41,15 +41,3 @@ cuda_py_tests(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
index 252788140f8c1906718c150574b963385b6ecfa1..bcd2a34c4e791a2ab66a439109145d6b78c14e22 100644
--- a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
+++ b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
@@ -110,7 +110,7 @@ class GridRNNCell(rnn.RNNCell):
       logging.warning('%s: Using a concatenated state is slower and will '
                       'soon be deprecated.  Use state_is_tuple=True.', self)
     if not output_is_tuple:
-      logging.warning('%s: Using a concatenated output is slower and will'
+      logging.warning('%s: Using a concatenated output is slower and will '
                       'soon be deprecated.  Use output_is_tuple=True.', self)
 
     if num_dims < 1:
diff --git a/tensorflow/contrib/hooks/BUILD b/tensorflow/contrib/hooks/BUILD
index 1b528d7afc1112f5dc0667ae299ade02bc8fd04b..d65b2d6026dd89959aa62b57e07b073eef84572c 100644
--- a/tensorflow/contrib/hooks/BUILD
+++ b/tensorflow/contrib/hooks/BUILD
@@ -23,14 +23,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 163993a3f6bb1bedcdffb32944a98c7cc846878e..68e34f3b0938f795c8ad4c8c75226f6b0afe188d 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -42,11 +42,12 @@ If you've finished walking through the quick start guide, you may want to try bu
 
 ### Build libhexagon\_nn\_skel.so
 
-Download Hexagon NN library from codeaurora.org and build it.
+Download Hexagon NN library from codeaurora.org and build it. For Hexagon SDK 3.0, we need use the compatible version([721b2d58f](https://source.codeaurora.org/quic/hexagon_nn/nnlib/commit/?id=721b2d58f0f4e2d5b182f41e6b7c4db5356bf0fb)) of nnlib.
 
 ```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
+git reset 721b2d58f --hard
 ```
 
 Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
diff --git a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
index 324035100df366b80f57af9052c4bd935655b248..e39c60b252a1b49a68d51302fff47734869dddfe 100644
--- a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
+++ b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
@@ -13,18 +13,6 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_binary(
     name = "clock_cycle_profiling",
     testonly = 1,
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 909dc396a33b6fef1b2d51c3f52fab7782fc8ea5..0081fb61770075a2c36e92f65e01126f657edeb4 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -10,17 +10,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 tf_cc_binary(
     name = "hvx_ops_support_checker",
     testonly = 1,
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 3ff02e085ee63fabf42b3cc4389f4605455f3800..da450480b30b548484e69c61c85667d6dd390417 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -78,7 +78,10 @@ tf_custom_op_py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":dense_image_warp_py",
         ":image_ops",
+        ":interpolate_spline_py",
+        ":sparse_image_warp_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:common_shapes",
@@ -194,6 +197,117 @@ cuda_py_test(
     ],
 )
 
+py_library(
+    name = "dense_image_warp_py",
+    srcs = [
+        "python/ops/dense_image_warp.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "interpolate_spline_py",
+    srcs = [
+        "python/ops/interpolate_spline.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "sparse_image_warp_py",
+    srcs = [
+        "python/ops/sparse_image_warp.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dense_image_warp_py",
+        ":interpolate_spline_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+cuda_py_test(
+    name = "sparse_image_warp_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/sparse_image_warp_test.py"],
+    additional_deps = [
+        ":sparse_image_warp_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/core:protos_all_py",
+    ],
+    data = [":sparse_image_warp_test_data"],
+    tags = ["no_pip"],
+)
+
+filegroup(
+    name = "sparse_image_warp_test_data",
+    srcs = glob(["python/kernel_tests/test_data/*.png"]),
+)
+
+cuda_py_test(
+    name = "dense_image_warp_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/dense_image_warp_test.py"],
+    additional_deps = [
+        ":dense_image_warp_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
+cuda_py_test(
+    name = "interpolate_spline_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/interpolate_spline_test.py"],
+    additional_deps = [
+        ":interpolate_spline_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 tf_py_test(
     name = "segmentation_test",
     size = "medium",
@@ -270,15 +384,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index cc8ed117ba2edcc7a53e609381166f17a2fbb45e..8f406ace1d5dcc13a018e56cc98c621a511da29b 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -25,11 +25,16 @@ projective transforms (including rotation) are supported.
 @@angles_to_projective_transforms
 @@compose_transforms
 @@adjust_yiq_hsv
+@@flat_transforms_to_matrices
+@@matrices_to_flat_transforms
 @@random_yiq_hsv
 @@rotate
 @@transform
 @@translate
 @@translations_to_projective_transforms
+@@dense_image_warp
+@@interpolate_spline
+@@sparse_image_warp
 
 ## Image Segmentation `Ops`
 
@@ -47,17 +52,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.image.python.ops.dense_image_warp import dense_image_warp
+
 from tensorflow.contrib.image.python.ops.distort_image_ops import adjust_hsv_in_yiq
 from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_yiq
 
 from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
 from tensorflow.contrib.image.python.ops.image_ops import connected_components
+from tensorflow.contrib.image.python.ops.image_ops import flat_transforms_to_matrices
+from tensorflow.contrib.image.python.ops.image_ops import matrices_to_flat_transforms
 from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
 from tensorflow.contrib.image.python.ops.image_ops import translate
 from tensorflow.contrib.image.python.ops.image_ops import translations_to_projective_transforms
+from tensorflow.contrib.image.python.ops.interpolate_spline import interpolate_spline
 from tensorflow.contrib.image.python.ops.single_image_random_dot_stereograms import single_image_random_dot_stereograms
+from tensorflow.contrib.image.python.ops.sparse_image_warp import sparse_image_warp
 
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index b71ff9cd507faac66b3a33d3c02ec9b5901d814a..bbb3a3b18fd7bfdc68e8b8532568985245154794 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -53,13 +53,13 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
   OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                           DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
                           &tranformation_matrix));
-  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
   internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
       tranformation_matrix.flat<float>().size());
   // Call cuBlas C = A * B directly.
-  auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+  auto no_transpose = se::blas::Transpose::kNoTranspose;
   auto a_ptr =
       AsDeviceMemory(input->flat<float>().data(), input->flat<float>().size());
   auto b_ptr = AsDeviceMemory(tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/image/kernels/segmentation_ops.cc b/tensorflow/contrib/image/kernels/segmentation_ops.cc
index fe8bf6e21c7b7310527668324571774e8bc50893..93722896233f0278c6cbb44af7203345e58c3172 100644
--- a/tensorflow/contrib/image/kernels/segmentation_ops.cc
+++ b/tensorflow/contrib/image/kernels/segmentation_ops.cc
@@ -101,8 +101,8 @@ struct ImageConnectedComponentsFunctor<CPUDevice, T> {
       int cost = (union_find.block_height() + union_find.block_width()) * 20;
       Shard(worker_threads->num_threads, worker_threads->workers,
             num_images * num_blocks_vertically * num_blocks_horizontally, cost,
-            [&union_find, num_images, num_blocks_vertically,
-             num_blocks_horizontally](int64 start_block, int64 limit_block) {
+            [&union_find, num_blocks_vertically, num_blocks_horizontally](
+                int64 start_block, int64 limit_block) {
               for (int64 i = start_block; i < limit_block; i++) {
                 int64 block_x = i % num_blocks_horizontally;
                 int64 block_y =
diff --git a/tensorflow/contrib/image/ops/distort_image_ops.cc b/tensorflow/contrib/image/ops/distort_image_ops.cc
index b169b0b2b22ad6432baed2cc96711da5ca995875..ca49635d5d0bc7bb84b19508a74be74362d96ddf 100644
--- a/tensorflow/contrib/image/ops/distort_image_ops.cc
+++ b/tensorflow/contrib/image/ops/distort_image_ops.cc
@@ -36,9 +36,9 @@ REGISTER_OP("AdjustHsvInYiq")
 Adjust the YIQ hue of one or more images.
 
 `images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
+interpreted as channels, and must be three.
 
-We used linear transfomation described in:
+We used linear transformation described in:
  beesbuzz.biz/code/hsv_color_transforms.php
 The input image is considered in the RGB colorspace. Conceptually, the RGB
 colors are first mapped into YIQ space, rotated around the Y channel by
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d054a64ba94141c092e20df1ed6b2339b..ebdcaea7abae2a967786831b62b331897aa3f6a3 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -93,7 +93,7 @@ row_to_col_match_indices: A vector of length num_rows, which is the number of
   If `row_to_col_match_indices[i]` is not -1, row i is matched to column
   `row_to_col_match_indices[i]`.
 col_to_row_match_indices: A vector of length num_columns, which is the number
-  of columns of the input ditance matrix.
+  of columns of the input distance matrix.
   If `col_to_row_match_indices[j]` is not -1, column j is matched to row
   `col_to_row_match_indices[j]`.
 )doc");
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 8139d4272d6950815bd39a64e86e0f7422e6f799..bd784c6bda0344c092c1ae0af2c60be50fdff102 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -69,7 +69,7 @@ Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
-encode 3-D data witin the image.
+encode 3-D data within the image.
 
 This Op is based upon:
 'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
@@ -111,7 +111,7 @@ output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale,
 output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
   and use 'convergence_dots_size' for best fit to avoid overlap if possible
 
-image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+image:= A tensor of size 'output_image_shape' with the encoded 'depth_values'
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a58b6a247ed6ae252db25a12f1e47c08c9a5c147
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
@@ -0,0 +1,267 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dense_image_warp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import dense_image_warp
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+from tensorflow.python.training import adam
+
+
+class DenseImageWarpTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    np.random.seed(0)
+
+  def test_interpolate_small_grid_ij(self):
+    grid = constant_op.constant(
+        [[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]], shape=[1, 3, 3, 1])
+    query_points = constant_op.constant(
+        [[0., 0.], [1., 0.], [2., 0.5], [1.5, 1.5]], shape=[1, 4, 2])
+    expected_results = np.reshape(np.array([0., 3., 6.5, 6.]), [1, 4, 1])
+
+    interp = dense_image_warp._interpolate_bilinear(grid, query_points)
+
+    with self.test_session() as sess:
+      predicted = sess.run(interp)
+      self.assertAllClose(expected_results, predicted)
+
+  def test_interpolate_small_grid_xy(self):
+    grid = constant_op.constant(
+        [[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]], shape=[1, 3, 3, 1])
+    query_points = constant_op.constant(
+        [[0., 0.], [0., 1.], [0.5, 2.0], [1.5, 1.5]], shape=[1, 4, 2])
+    expected_results = np.reshape(np.array([0., 3., 6.5, 6.]), [1, 4, 1])
+
+    interp = dense_image_warp._interpolate_bilinear(
+        grid, query_points, indexing='xy')
+
+    with self.test_session() as sess:
+      predicted = sess.run(interp)
+      self.assertAllClose(expected_results, predicted)
+
+  def test_interpolate_small_grid_batched(self):
+    grid = constant_op.constant(
+        [[[0., 1.], [3., 4.]], [[5., 6.], [7., 8.]]], shape=[2, 2, 2, 1])
+    query_points = constant_op.constant([[[0., 0.], [1., 0.], [0.5, 0.5]],
+                                         [[0.5, 0.], [1., 0.], [1., 1.]]])
+    expected_results = np.reshape(
+        np.array([[0., 3., 2.], [6., 7., 8.]]), [2, 3, 1])
+
+    interp = dense_image_warp._interpolate_bilinear(grid, query_points)
+
+    with self.test_session() as sess:
+      predicted = sess.run(interp)
+      self.assertAllClose(expected_results, predicted)
+
+  def get_image_and_flow_placeholders(self, shape, image_type, flow_type):
+    batch_size, height, width, numchannels = shape
+    image_shape = [batch_size, height, width, numchannels]
+    flow_shape = [batch_size, height, width, 2]
+
+    tf_type = {
+        'float16': dtypes.half,
+        'float32': dtypes.float32,
+        'float64': dtypes.float64
+    }
+
+    image = array_ops.placeholder(dtype=tf_type[image_type], shape=image_shape)
+
+    flows = array_ops.placeholder(dtype=tf_type[flow_type], shape=flow_shape)
+    return image, flows
+
+  def get_random_image_and_flows(self, shape, image_type, flow_type):
+    batch_size, height, width, numchannels = shape
+    image_shape = [batch_size, height, width, numchannels]
+    image = np.random.normal(size=image_shape)
+    flow_shape = [batch_size, height, width, 2]
+    flows = np.random.normal(size=flow_shape) * 3
+    return image.astype(image_type), flows.astype(flow_type)
+
+  def assert_correct_interpolation_value(self,
+                                         image,
+                                         flows,
+                                         pred_interpolation,
+                                         batch_index,
+                                         y_index,
+                                         x_index,
+                                         low_precision=False):
+    """Assert that the tf interpolation matches hand-computed value."""
+
+    height = image.shape[1]
+    width = image.shape[2]
+    displacement = flows[batch_index, y_index, x_index, :]
+    float_y = y_index - displacement[0]
+    float_x = x_index - displacement[1]
+    floor_y = max(min(height - 2, math.floor(float_y)), 0)
+    floor_x = max(min(width - 2, math.floor(float_x)), 0)
+    ceil_y = floor_y + 1
+    ceil_x = floor_x + 1
+
+    alpha_y = min(max(0.0, float_y - floor_y), 1.0)
+    alpha_x = min(max(0.0, float_x - floor_x), 1.0)
+
+    floor_y = int(floor_y)
+    floor_x = int(floor_x)
+    ceil_y = int(ceil_y)
+    ceil_x = int(ceil_x)
+
+    top_left = image[batch_index, floor_y, floor_x, :]
+    top_right = image[batch_index, floor_y, ceil_x, :]
+    bottom_left = image[batch_index, ceil_y, floor_x, :]
+    bottom_right = image[batch_index, ceil_y, ceil_x, :]
+
+    interp_top = alpha_x * (top_right - top_left) + top_left
+    interp_bottom = alpha_x * (bottom_right - bottom_left) + bottom_left
+    interp = alpha_y * (interp_bottom - interp_top) + interp_top
+    atol = 1e-6
+    rtol = 1e-6
+    if low_precision:
+      atol = 1e-2
+      rtol = 1e-3
+    self.assertAllClose(
+        interp,
+        pred_interpolation[batch_index, y_index, x_index, :],
+        atol=atol,
+        rtol=rtol)
+
+  def check_zero_flow_correctness(self, shape, image_type, flow_type):
+    """Assert using zero flows doesn't change the input image."""
+
+    image, flows = self.get_image_and_flow_placeholders(shape, image_type,
+                                                        flow_type)
+    interp = dense_image_warp.dense_image_warp(image, flows)
+
+    with self.test_session() as sess:
+      rand_image, rand_flows = self.get_random_image_and_flows(
+          shape, image_type, flow_type)
+      rand_flows *= 0
+
+      predicted_interpolation = sess.run(
+          interp, feed_dict={
+              image: rand_image,
+              flows: rand_flows
+          })
+      self.assertAllClose(rand_image, predicted_interpolation)
+
+  def test_zero_flows(self):
+    """Apply check_zero_flow_correctness() for a few sizes and types."""
+
+    shapes_to_try = [[3, 4, 5, 6], [1, 2, 2, 1]]
+    for shape in shapes_to_try:
+      self.check_zero_flow_correctness(
+          shape, image_type='float32', flow_type='float32')
+
+  def check_interpolation_correctness(self,
+                                      shape,
+                                      image_type,
+                                      flow_type,
+                                      num_probes=5):
+    """Interpolate, and then assert correctness for a few query locations."""
+
+    image, flows = self.get_image_and_flow_placeholders(shape, image_type,
+                                                        flow_type)
+    interp = dense_image_warp.dense_image_warp(image, flows)
+    low_precision = image_type == 'float16' or flow_type == 'float16'
+    with self.test_session() as sess:
+      rand_image, rand_flows = self.get_random_image_and_flows(
+          shape, image_type, flow_type)
+
+      pred_interpolation = sess.run(
+          interp, feed_dict={
+              image: rand_image,
+              flows: rand_flows
+          })
+
+      for _ in range(num_probes):
+        batch_index = np.random.randint(0, shape[0])
+        y_index = np.random.randint(0, shape[1])
+        x_index = np.random.randint(0, shape[2])
+
+        self.assert_correct_interpolation_value(
+            rand_image,
+            rand_flows,
+            pred_interpolation,
+            batch_index,
+            y_index,
+            x_index,
+            low_precision=low_precision)
+
+  def test_interpolation(self):
+    """Apply check_interpolation_correctness() for a few sizes and types."""
+
+    shapes_to_try = [[3, 4, 5, 6], [1, 5, 5, 3], [1, 2, 2, 1]]
+    for im_type in ['float32', 'float64', 'float16']:
+      for flow_type in ['float32', 'float64', 'float16']:
+        for shape in shapes_to_try:
+          self.check_interpolation_correctness(shape, im_type, flow_type)
+
+  def test_gradients_exist(self):
+    """Check that backprop can run.
+
+    The correctness of the gradients is assumed, since the forward propagation
+    is tested to be correct and we only use built-in tf ops.
+    However, we perform a simple test to make sure that backprop can actually
+    run. We treat the flows as a tf.Variable and optimize them to minimize
+    the difference between the interpolated image and the input image.
+    """
+
+    batch_size, height, width, numchannels = [4, 5, 6, 7]
+    image_shape = [batch_size, height, width, numchannels]
+    image = random_ops.random_normal(image_shape)
+    flow_shape = [batch_size, height, width, 2]
+    init_flows = np.float32(np.random.normal(size=flow_shape) * 0.25)
+    flows = variables.Variable(init_flows)
+
+    interp = dense_image_warp.dense_image_warp(image, flows)
+    loss = math_ops.reduce_mean(math_ops.square(interp - image))
+
+    optimizer = adam.AdamOptimizer(1.0)
+    grad = gradients.gradients(loss, [flows])
+    opt_func = optimizer.apply_gradients(zip(grad, [flows]))
+    init_op = variables.global_variables_initializer()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(10):
+        sess.run(opt_func)
+
+  def test_size_exception(self):
+    """Make sure it throws an exception for images that are too small."""
+
+    shape = [1, 2, 1, 1]
+    msg = 'Should have raised an exception for invalid image size'
+    with self.assertRaises(ValueError, msg=msg):
+      self.check_interpolation_correctness(shape, 'float32', 'float32')
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1939caaa2d8586413cf9ecba6ce73cf64910d6fc
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
@@ -0,0 +1,264 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for interpolate_spline."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import interpolate as sc_interpolate
+
+from tensorflow.contrib.image.python.ops import interpolate_spline
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+from tensorflow.python.training import momentum
+
+
+class _InterpolationProblem(object):
+  """Abstract class for interpolation problem descriptions."""
+
+  def get_problem(self, optimizable=False, extrapolate=True, dtype='float32'):
+    """Make data for an interpolation problem where all x vectors are n-d.
+
+    Args:
+      optimizable: If True, then make train_points a tf.Variable.
+      extrapolate: If False, then clamp the query_points values to be within
+      the max and min of train_points.
+      dtype: The data type to use.
+
+    Returns:
+      query_points, query_values, train_points, train_values: training and
+      test tensors for interpolation problem
+    """
+
+    # The values generated here depend on a seed of 0.
+    np.random.seed(0)
+
+    batch_size = 1
+    num_training_points = 10
+    num_query_points = 4
+
+    init_points = np.random.uniform(
+        size=[batch_size, num_training_points, self.DATA_DIM])
+
+    init_points = init_points.astype(dtype)
+    train_points = (
+        variables.Variable(init_points)
+        if optimizable else constant_op.constant(init_points))
+    train_values = self.tf_function(train_points)
+
+    query_points_np = np.random.uniform(
+        size=[batch_size, num_query_points, self.DATA_DIM])
+    query_points_np = query_points_np.astype(dtype)
+    if not extrapolate:
+      query_points_np = np.clip(query_points_np, np.min(init_points),
+                                np.max(init_points))
+
+    query_points = constant_op.constant(query_points_np)
+    query_values = self.np_function(query_points_np)
+
+    return query_points, query_values, train_points, train_values
+
+
+class _QuadraticPlusSinProblem1D(_InterpolationProblem):
+  """1D interpolation problem used for regression testing."""
+  DATA_DIM = 1
+  HARDCODED_QUERY_VALUES = {
+      (1.0, 0.0): [6.2647187603, -7.84362604077, -5.63690142322, 1.42928896387],
+      (1.0,
+       0.01): [6.77688289946, -8.02163669853, -5.79491157027, 1.4063285693],
+      (2.0,
+       0.0): [8.67110264937, -8.41281390883, -5.80190044693, 1.50155606059],
+      (2.0,
+       0.01): [6.70797816797, -7.49709587663, -5.28965776238, 1.52284731741],
+      (3.0,
+       0.0): [9.37691802935, -8.50390141515, -5.80786417426, 1.63467762122],
+      (3.0,
+       0.01): [4.47106304758, -5.71266128361, -3.92529303296, 1.86755293857],
+      (4.0,
+       0.0): [9.58172461111, -8.51432104771, -5.80967675388, 1.63361164256],
+      (4.0, 0.01): [
+          -3.87902711352, -0.0253462273846, 1.79857618022, -0.769339675725
+      ]
+  }
+
+  def np_function(self, x):
+    """Takes np array, evaluates the test function, and returns np array."""
+    return np.sum(
+        np.power((x - 0.5), 3) - 0.25 * x + 10 * np.sin(x * 10),
+        axis=2,
+        keepdims=True)
+
+  def tf_function(self, x):
+    """Takes tf tensor, evaluates the test function,  and returns tf tensor."""
+    return math_ops.reduce_mean(
+        math_ops.pow((x - 0.5), 3) - 0.25 * x + 10 * math_ops.sin(x * 10),
+        2,
+        keepdims=True)
+
+
+class _QuadraticPlusSinProblemND(_InterpolationProblem):
+  """3D interpolation problem used for regression testing."""
+
+  DATA_DIM = 3
+  HARDCODED_QUERY_VALUES = {
+      (1.0, 0.0): [1.06609663962, 1.28894849357, 1.10882405595, 1.63966936885],
+      (1.0, 0.01): [1.03123780748, 1.2952930985, 1.10366822954, 1.65265118569],
+      (2.0, 0.0): [0.627787735064, 1.43802857251, 1.00194632358, 1.91667538215],
+      (2.0, 0.01): [0.730159985046, 1.41702471595, 1.0065827217, 1.85758519312],
+      (3.0, 0.0): [0.350460417862, 1.67223539464, 1.00475331246, 2.31580322491],
+      (3.0,
+       0.01): [0.624557250556, 1.63138876667, 0.976588193162, 2.12511237866],
+      (4.0,
+       0.0): [0.898129669986, 1.24434133638, -0.938056116931, 1.59910338833],
+      (4.0,
+       0.01): [0.0930360338179, -3.38791305538, -1.00969032567, 0.745535080382],
+  }
+
+  def np_function(self, x):
+    """Takes np array, evaluates the test function, and returns np array."""
+    return np.sum(
+        np.square(x - 0.5) + 0.25 * x + 1 * np.sin(x * 15),
+        axis=2,
+        keepdims=True)
+
+  def tf_function(self, x):
+    """Takes tf tensor, evaluates the test function,  and returns tf tensor."""
+    return math_ops.reduce_sum(
+        math_ops.square(x - 0.5) + 0.25 * x + 1 * math_ops.sin(x * 15),
+        2,
+        keepdims=True)
+
+
+class InterpolateSplineTest(test_util.TensorFlowTestCase):
+
+  def test_1d_linear_interpolation(self):
+    """For 1d linear interpolation, we can compare directly to scipy."""
+
+    tp = _QuadraticPlusSinProblem1D()
+    (query_points, _, train_points, train_values) = tp.get_problem(
+        extrapolate=False, dtype='float64')
+    interpolation_order = 1
+
+    with ops.name_scope('interpolator'):
+      interpolator = interpolate_spline.interpolate_spline(
+          train_points, train_values, query_points, interpolation_order)
+      with self.test_session() as sess:
+        fetches = [query_points, train_points, train_values, interpolator]
+        query_points_, train_points_, train_values_, interp_ = sess.run(fetches)
+
+        # Just look at the first element of the minibatch.
+        # Also, trim the final singleton dimension.
+        interp_ = interp_[0, :, 0]
+        query_points_ = query_points_[0, :, 0]
+        train_points_ = train_points_[0, :, 0]
+        train_values_ = train_values_[0, :, 0]
+
+        # Compute scipy interpolation.
+        scipy_interp_function = sc_interpolate.interp1d(
+            train_points_, train_values_, kind='linear')
+
+        scipy_interpolation = scipy_interp_function(query_points_)
+        scipy_interpolation_on_train = scipy_interp_function(train_points_)
+
+        # Even with float64 precision, the interpolants disagree with scipy a
+        # bit due to the fact that we add the EPSILON to prevent sqrt(0), etc.
+        tol = 1e-3
+
+        self.assertAllClose(
+            train_values_, scipy_interpolation_on_train, atol=tol, rtol=tol)
+        self.assertAllClose(interp_, scipy_interpolation, atol=tol, rtol=tol)
+
+  def test_1d_interpolation(self):
+    """Regression test for interpolation with 1-D points."""
+
+    tp = _QuadraticPlusSinProblem1D()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    for order in (1, 2, 3):
+      for reg_weight in (0, 0.01):
+        interpolator = interpolate_spline.interpolate_spline(
+            train_points, train_values, query_points, order, reg_weight)
+
+        target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
+        target_interpolation = np.array(target_interpolation)
+        with self.test_session() as sess:
+          interp_val = sess.run(interpolator)
+          self.assertAllClose(interp_val[0, :, 0], target_interpolation)
+
+  def test_nd_linear_interpolation(self):
+    """Regression test for interpolation with N-D points."""
+
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    for order in (1, 2, 3):
+      for reg_weight in (0, 0.01):
+        interpolator = interpolate_spline.interpolate_spline(
+            train_points, train_values, query_points, order, reg_weight)
+
+        target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
+        target_interpolation = np.array(target_interpolation)
+        with self.test_session() as sess:
+          interp_val = sess.run(interpolator)
+          self.assertAllClose(interp_val[0, :, 0], target_interpolation)
+
+  def test_interpolation_gradient(self):
+    """Make sure that backprop can run. Correctness of gradients is assumed.
+
+    Here, we create a use a small 'training' set and a more densely-sampled
+    set of query points, for which we know the true value in advance. The goal
+    is to choose x locations for the training data such that interpolating using
+    this training data yields the best reconstruction for the function
+    values at the query points. The training data locations are optimized
+    iteratively using gradient descent.
+    """
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, query_values, train_points,
+     train_values) = tp.get_problem(optimizable=True)
+
+    regularization = 0.001
+    for interpolation_order in (1, 2, 3, 4):
+      interpolator = interpolate_spline.interpolate_spline(
+          train_points, train_values, query_points, interpolation_order,
+          regularization)
+
+      loss = math_ops.reduce_mean(math_ops.square(query_values - interpolator))
+
+      optimizer = momentum.MomentumOptimizer(0.001, 0.9)
+      grad = gradients.gradients(loss, [train_points])
+      grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
+      opt_func = optimizer.apply_gradients(zip(grad, [train_points]))
+      init_op = variables.global_variables_initializer()
+
+      with self.test_session() as sess:
+        sess.run(init_op)
+        for _ in range(100):
+          sess.run([loss, opt_func])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0135c66e293693345c3da7fdb21e28ca6d160154
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
@@ -0,0 +1,254 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sparse_image_warp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import sparse_image_warp
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+
+from tensorflow.python.training import momentum
+
+
+class SparseImageWarpTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    np.random.seed(0)
+
+  def testGetBoundaryLocations(self):
+    image_height = 11
+    image_width = 11
+    num_points_per_edge = 4
+    locs = sparse_image_warp._get_boundary_locations(image_height, image_width,
+                                                     num_points_per_edge)
+    num_points = locs.shape[0]
+    self.assertEqual(num_points, 4 + 4 * num_points_per_edge)
+    locs = [(locs[i, 0], locs[i, 1]) for i in range(num_points)]
+    for i in (0, image_height - 1):
+      for j in (0, image_width - 1):
+        self.assertIn((i, j), locs, '{},{} not in the locations'.format(i, j))
+
+      for i in (2, 4, 6, 8):
+        for j in (0, image_width - 1):
+          self.assertIn((i, j), locs, '{},{} not in the locations'.format(i, j))
+
+      for i in (0, image_height - 1):
+        for j in (2, 4, 6, 8):
+          self.assertIn((i, j), locs, '{},{} not in the locations'.format(i, j))
+
+  def testGetGridLocations(self):
+    image_height = 5
+    image_width = 3
+    grid = sparse_image_warp._get_grid_locations(image_height, image_width)
+    for i in range(image_height):
+      for j in range(image_width):
+        self.assertEqual(grid[i, j, 0], i)
+        self.assertEqual(grid[i, j, 1], j)
+
+  def testZeroShift(self):
+    """Run assertZeroShift for various hyperparameters."""
+    for order in (1, 2):
+      for regularization in (0, 0.01):
+        for num_boundary_points in (0, 1):
+          self.assertZeroShift(order, regularization, num_boundary_points)
+
+  def assertZeroShift(self, order, regularization, num_boundary_points):
+    """Check that warping with zero displacements doesn't change the image."""
+    batch_size = 1
+    image_height = 4
+    image_width = 4
+    channels = 3
+
+    image = np.random.uniform(
+        size=[batch_size, image_height, image_width, channels])
+
+    input_image_op = constant_op.constant(np.float32(image))
+
+    control_point_locations = [[1., 1.], [2., 2.], [2., 1.]]
+    control_point_locations = constant_op.constant(
+        np.float32(np.expand_dims(control_point_locations, 0)))
+
+    control_point_displacements = np.zeros(
+        control_point_locations.shape.as_list())
+    control_point_displacements = constant_op.constant(
+        np.float32(control_point_displacements))
+
+    (warped_image_op, flow_field) = sparse_image_warp.sparse_image_warp(
+        input_image_op,
+        control_point_locations,
+        control_point_locations + control_point_displacements,
+        interpolation_order=order,
+        regularization_weight=regularization,
+        num_boundary_points=num_boundary_points)
+
+    with self.test_session() as sess:
+      warped_image, input_image, _ = sess.run(
+          [warped_image_op, input_image_op, flow_field])
+
+      self.assertAllClose(warped_image, input_image)
+
+  def testMoveSinglePixel(self):
+    """Run assertMoveSinglePixel for various hyperparameters and data types."""
+    for order in (1, 2):
+      for num_boundary_points in (1, 2):
+        for type_to_use in (dtypes.float32, dtypes.float64):
+          self.assertMoveSinglePixel(order, num_boundary_points, type_to_use)
+
+  def assertMoveSinglePixel(self, order, num_boundary_points, type_to_use):
+    """Move a single block in a small grid using warping."""
+    batch_size = 1
+    image_height = 7
+    image_width = 7
+    channels = 3
+
+    image = np.zeros([batch_size, image_height, image_width, channels])
+    image[:, 3, 3, :] = 1.0
+    input_image_op = constant_op.constant(image, dtype=type_to_use)
+
+    # Place a control point at the one white pixel.
+    control_point_locations = [[3., 3.]]
+    control_point_locations = constant_op.constant(
+        np.float32(np.expand_dims(control_point_locations, 0)),
+        dtype=type_to_use)
+    # Shift it one pixel to the right.
+    control_point_displacements = [[0., 1.0]]
+    control_point_displacements = constant_op.constant(
+        np.float32(np.expand_dims(control_point_displacements, 0)),
+        dtype=type_to_use)
+
+    (warped_image_op, flow_field) = sparse_image_warp.sparse_image_warp(
+        input_image_op,
+        control_point_locations,
+        control_point_locations + control_point_displacements,
+        interpolation_order=order,
+        num_boundary_points=num_boundary_points)
+
+    with self.test_session() as sess:
+      warped_image, input_image, flow = sess.run(
+          [warped_image_op, input_image_op, flow_field])
+      # Check that it moved the pixel correctly.
+      self.assertAllClose(
+          warped_image[0, 4, 5, :],
+          input_image[0, 4, 4, :],
+          atol=1e-5,
+          rtol=1e-5)
+
+      # Test that there is no flow at the corners.
+      for i in (0, image_height - 1):
+        for j in (0, image_width - 1):
+          self.assertAllClose(
+              flow[0, i, j, :], np.zeros([2]), atol=1e-5, rtol=1e-5)
+
+  def load_image(self, image_file, sess):
+    image_op = image_ops.decode_png(
+        io_ops.read_file(image_file), dtype=dtypes.uint8, channels=4)[:, :, 0:3]
+    return sess.run(image_op)
+
+  def testSmileyFace(self):
+    """Check warping accuracy by comparing to hardcoded warped images."""
+
+    test_data_dir = test.test_src_dir_path('contrib/image/python/'
+                                           'kernel_tests/test_data/')
+    input_file = test_data_dir + 'Yellow_Smiley_Face.png'
+    with self.test_session() as sess:
+      input_image = self.load_image(input_file, sess)
+    control_points = np.asarray([[64, 59], [180 - 64, 59], [39, 111],
+                                 [180 - 39, 111], [90, 143], [58, 134],
+                                 [180 - 58, 134]])  # pyformat: disable
+    control_point_displacements = np.asarray(
+        [[-10.5, 10.5], [10.5, 10.5], [0, 0], [0, 0], [0, -10], [-20, 10.25],
+         [10, 10.75]])
+    control_points_op = constant_op.constant(
+        np.expand_dims(np.float32(control_points[:, [1, 0]]), 0))
+    control_point_displacements_op = constant_op.constant(
+        np.expand_dims(np.float32(control_point_displacements[:, [1, 0]]), 0))
+    float_image = np.expand_dims(np.float32(input_image) / 255, 0)
+    input_image_op = constant_op.constant(float_image)
+
+    for interpolation_order in (1, 2, 3):
+      for num_boundary_points in (0, 1, 4):
+        warp_op, _ = sparse_image_warp.sparse_image_warp(
+            input_image_op,
+            control_points_op,
+            control_points_op + control_point_displacements_op,
+            interpolation_order=interpolation_order,
+            num_boundary_points=num_boundary_points)
+        with self.test_session() as sess:
+          warped_image = sess.run(warp_op)
+          out_image = np.uint8(warped_image[0, :, :, :] * 255)
+          target_file = (
+              test_data_dir +
+              'Yellow_Smiley_Face_Warp-interp' + '-{}-clamp-{}.png'.format(
+                  interpolation_order, num_boundary_points))
+
+          target_image = self.load_image(target_file, sess)
+
+          # Check that the target_image and out_image difference is no
+          # bigger than 2 (on a scale of 0-255). Due to differences in
+          # floating point computation on different devices, the float
+          # output in warped_image may get rounded to a different int
+          # than that in the saved png file loaded into target_image.
+          self.assertAllClose(target_image, out_image, atol=2, rtol=1e-3)
+
+  def testThatBackpropRuns(self):
+    """Run optimization to ensure that gradients can be computed."""
+
+    batch_size = 1
+    image_height = 9
+    image_width = 12
+    image = variables.Variable(
+        np.float32(
+            np.random.uniform(size=[batch_size, image_height, image_width, 3])))
+    control_point_locations = [[3., 3.]]
+    control_point_locations = constant_op.constant(
+        np.float32(np.expand_dims(control_point_locations, 0)))
+    control_point_displacements = [[0.25, -0.5]]
+    control_point_displacements = constant_op.constant(
+        np.float32(np.expand_dims(control_point_displacements, 0)))
+    warped_image, _ = sparse_image_warp.sparse_image_warp(
+        image,
+        control_point_locations,
+        control_point_locations + control_point_displacements,
+        num_boundary_points=3)
+
+    loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
+    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
+    grad = gradients.gradients(loss, [image])
+    grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
+    opt_func = optimizer.apply_gradients(zip(grad, [image]))
+    init_op = variables.global_variables_initializer()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run([loss, opt_func])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face.png
new file mode 100644
index 0000000000000000000000000000000000000000..7e303881e213a82e412d18de9d9d86f368726f06
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-0.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..7fd9e4e6d69f3120428d1d778846d495cea1a989
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-0.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-1.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..86d225e5d2158804f88dca881f69ed3ab287d866
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-1.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-4.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..37e8ffae114625d0cc6a07ab2b8dbbb7413a3829
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-4.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-0.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..e49b5816120d43a669264915f1b6747606e080e0
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-0.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-1.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..df3cf2004312ed0ed0ebf1f0340cbfec7fd9ac46
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-1.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-4.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1799a87c8542d7e515b6185d7e8f6f75fe73f3e
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-4.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-0.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c346e0ce5487e21d41aa4e6306fd83a7b4ffdb4
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-0.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-1.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6f8b65451cc08a463e4305ddc4be0dbe2879fae9
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-1.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-4.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e78146d955ae8f02230121e6314f3285e87611e
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-4.png differ
diff --git a/tensorflow/contrib/image/python/ops/dense_image_warp.py b/tensorflow/contrib/image/python/ops/dense_image_warp.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9b219ada492466919c615d8978e462e6c619d33
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/dense_image_warp.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Image warping using per-pixel flow vectors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _interpolate_bilinear(grid,
+                          query_points,
+                          name='interpolate_bilinear',
+                          indexing='ij'):
+  """Similar to Matlab's interp2 function.
+
+  Finds values for query points on a grid using bilinear interpolation.
+
+  Args:
+    grid: a 4-D float `Tensor` of shape `[batch, height, width, channels]`.
+    query_points: a 3-D float `Tensor` of N points with shape `[batch, N, 2]`.
+    name: a name for the operation (optional).
+    indexing: whether the query points are specified as row and column (ij),
+      or Cartesian coordinates (xy).
+
+  Returns:
+    values: a 3-D `Tensor` with shape `[batch, N, channels]`
+
+  Raises:
+    ValueError: if the indexing mode is invalid, or if the shape of the inputs
+      invalid.
+  """
+  if indexing != 'ij' and indexing != 'xy':
+    raise ValueError('Indexing mode must be \'ij\' or \'xy\'')
+
+  with ops.name_scope(name):
+    grid = ops.convert_to_tensor(grid)
+    query_points = ops.convert_to_tensor(query_points)
+    shape = grid.get_shape().as_list()
+    if len(shape) != 4:
+      msg = 'Grid must be 4 dimensional. Received size: '
+      raise ValueError(msg + str(grid.get_shape()))
+
+    batch_size, height, width, channels = shape
+    query_type = query_points.dtype
+    grid_type = grid.dtype
+
+    if (len(query_points.get_shape()) != 3 or
+        query_points.get_shape()[2].value != 2):
+      msg = ('Query points must be 3 dimensional and size 2 in dim 2. Received '
+             'size: ')
+      raise ValueError(msg + str(query_points.get_shape()))
+
+    _, num_queries, _ = query_points.get_shape().as_list()
+
+    if height < 2 or width < 2:
+      msg = 'Grid must be at least batch_size x 2 x 2 in size. Received size: '
+      raise ValueError(msg + str(grid.get_shape()))
+
+    alphas = []
+    floors = []
+    ceils = []
+
+    index_order = [0, 1] if indexing == 'ij' else [1, 0]
+    unstacked_query_points = array_ops.unstack(query_points, axis=2)
+
+    for dim in index_order:
+      with ops.name_scope('dim-' + str(dim)):
+        queries = unstacked_query_points[dim]
+
+        size_in_indexing_dimension = shape[dim + 1]
+
+        # max_floor is size_in_indexing_dimension - 2 so that max_floor + 1
+        # is still a valid index into the grid.
+        max_floor = math_ops.cast(size_in_indexing_dimension - 2, query_type)
+        min_floor = constant_op.constant(0.0, dtype=query_type)
+        floor = math_ops.minimum(
+            math_ops.maximum(min_floor, math_ops.floor(queries)), max_floor)
+        int_floor = math_ops.cast(floor, dtypes.int32)
+        floors.append(int_floor)
+        ceil = int_floor + 1
+        ceils.append(ceil)
+
+        # alpha has the same type as the grid, as we will directly use alpha
+        # when taking linear combinations of pixel values from the image.
+        alpha = math_ops.cast(queries - floor, grid_type)
+        min_alpha = constant_op.constant(0.0, dtype=grid_type)
+        max_alpha = constant_op.constant(1.0, dtype=grid_type)
+        alpha = math_ops.minimum(math_ops.maximum(min_alpha, alpha), max_alpha)
+
+        # Expand alpha to [b, n, 1] so we can use broadcasting
+        # (since the alpha values don't depend on the channel).
+        alpha = array_ops.expand_dims(alpha, 2)
+        alphas.append(alpha)
+
+    if batch_size * height * width > np.iinfo(np.int32).max / 8:
+      error_msg = """The image size or batch size is sufficiently large
+                     that the linearized addresses used by array_ops.gather
+                     may exceed the int32 limit."""
+      raise ValueError(error_msg)
+
+    flattened_grid = array_ops.reshape(grid,
+                                       [batch_size * height * width, channels])
+    batch_offsets = array_ops.reshape(
+        math_ops.range(batch_size) * height * width, [batch_size, 1])
+
+    # This wraps array_ops.gather. We reshape the image data such that the
+    # batch, y, and x coordinates are pulled into the first dimension.
+    # Then we gather. Finally, we reshape the output back. It's possible this
+    # code would be made simpler by using array_ops.gather_nd.
+    def gather(y_coords, x_coords, name):
+      with ops.name_scope('gather-' + name):
+        linear_coordinates = batch_offsets + y_coords * width + x_coords
+        gathered_values = array_ops.gather(flattened_grid, linear_coordinates)
+        return array_ops.reshape(gathered_values,
+                                 [batch_size, num_queries, channels])
+
+    # grab the pixel values in the 4 corners around each query point
+    top_left = gather(floors[0], floors[1], 'top_left')
+    top_right = gather(floors[0], ceils[1], 'top_right')
+    bottom_left = gather(ceils[0], floors[1], 'bottom_left')
+    bottom_right = gather(ceils[0], ceils[1], 'bottom_right')
+
+    # now, do the actual interpolation
+    with ops.name_scope('interpolate'):
+      interp_top = alphas[1] * (top_right - top_left) + top_left
+      interp_bottom = alphas[1] * (bottom_right - bottom_left) + bottom_left
+      interp = alphas[0] * (interp_bottom - interp_top) + interp_top
+
+    return interp
+
+
+def dense_image_warp(image, flow, name='dense_image_warp'):
+  """Image warping using per-pixel flow vectors.
+
+  Apply a non-linear warp to the image, where the warp is specified by a dense
+  flow field of offset vectors that define the correspondences of pixel values
+  in the output image back to locations in the  source image. Specifically, the
+  pixel value at output[b, j, i, c] is
+  images[b, j - flow[b, j, i, 0], i - flow[b, j, i, 1], c].
+
+  The locations specified by this formula do not necessarily map to an int
+  index. Therefore, the pixel value is obtained by bilinear
+  interpolation of the 4 nearest pixels around
+  (b, j - flow[b, j, i, 0], i - flow[b, j, i, 1]). For locations outside
+  of the image, we use the nearest pixel values at the image boundary.
+
+
+  Args:
+    image: 4-D float `Tensor` with shape `[batch, height, width, channels]`.
+    flow: A 4-D float `Tensor` with shape `[batch, height, width, 2]`.
+    name: A name for the operation (optional).
+
+    Note that image and flow can be of type tf.half, tf.float32, or tf.float64,
+    and do not necessarily have to be the same type.
+
+  Returns:
+    A 4-D float `Tensor` with shape`[batch, height, width, channels]`
+      and same type as input image.
+
+  Raises:
+    ValueError: if height < 2 or width < 2 or the inputs have the wrong number
+                of dimensions.
+  """
+  with ops.name_scope(name):
+    batch_size, height, width, channels = image.get_shape().as_list()
+    # The flow is defined on the image grid. Turn the flow into a list of query
+    # points in the grid space.
+    grid_x, grid_y = array_ops.meshgrid(
+        math_ops.range(width), math_ops.range(height))
+    stacked_grid = math_ops.cast(
+        array_ops.stack([grid_y, grid_x], axis=2), flow.dtype)
+    batched_grid = array_ops.expand_dims(stacked_grid, axis=0)
+    query_points_on_grid = batched_grid - flow
+    query_points_flattened = array_ops.reshape(query_points_on_grid,
+                                               [batch_size, height * width, 2])
+    # Compute values at the query points, then reshape the result back to the
+    # image grid.
+    interpolated = _interpolate_bilinear(image, query_points_flattened)
+    interpolated = array_ops.reshape(interpolated,
+                                     [batch_size, height, width, channels])
+    return interpolated
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8d682d6b87813c3a21703ffa762f28e..cd984c80543886be1f682933e2e003bd3374e425 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -433,7 +433,7 @@ def bipartite_match(distance_mat,
       of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
       is not -1, row i is matched to column `row_to_col_match_indices[i]`.
     col_to_row_match_indices: A vector of length num_columns, which is the
-      number of columns of the input ditance matrix.
+      number of columns of the input distance matrix.
       If `col_to_row_match_indices[j]` is not -1, column j is matched to row
       `col_to_row_match_indices[j]`.
   """
diff --git a/tensorflow/contrib/image/python/ops/interpolate_spline.py b/tensorflow/contrib/image/python/ops/interpolate_spline.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf8c56456327f102f1409296a91f9f7b68ec799
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/interpolate_spline.py
@@ -0,0 +1,291 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Polyharmonic spline interpolation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+
+EPSILON = 0.0000000001
+
+
+def _cross_squared_distance_matrix(x, y):
+  """Pairwise squared distance between two (batch) matrices' rows (2nd dim).
+
+  Computes the pairwise distances between rows of x and rows of y
+  Args:
+    x: [batch_size, n, d] float `Tensor`
+    y: [batch_size, m, d] float `Tensor`
+
+  Returns:
+    squared_dists: [batch_size, n, m] float `Tensor`, where
+    squared_dists[b,i,j] = ||x[b,i,:] - y[b,j,:]||^2
+  """
+  x_norm_squared = math_ops.reduce_sum(math_ops.square(x), 2)
+  y_norm_squared = math_ops.reduce_sum(math_ops.square(y), 2)
+
+  # Expand so that we can broadcast.
+  x_norm_squared_tile = array_ops.expand_dims(x_norm_squared, 2)
+  y_norm_squared_tile = array_ops.expand_dims(y_norm_squared, 1)
+
+  x_y_transpose = math_ops.matmul(x, y, adjoint_b=True)
+
+  # squared_dists[b,i,j] = ||x_bi - y_bj||^2 = x_bi'x_bi- 2x_bi'x_bj + x_bj'x_bj
+  squared_dists = x_norm_squared_tile - 2 * x_y_transpose + y_norm_squared_tile
+
+  return squared_dists
+
+
+def _pairwise_squared_distance_matrix(x):
+  """Pairwise squared distance among a (batch) matrix's rows (2nd dim).
+
+  This saves a bit of computation vs. using _cross_squared_distance_matrix(x,x)
+
+  Args:
+    x: `[batch_size, n, d]` float `Tensor`
+
+  Returns:
+    squared_dists: `[batch_size, n, n]` float `Tensor`, where
+    squared_dists[b,i,j] = ||x[b,i,:] - x[b,j,:]||^2
+  """
+
+  x_x_transpose = math_ops.matmul(x, x, adjoint_b=True)
+  x_norm_squared = array_ops.matrix_diag_part(x_x_transpose)
+  x_norm_squared_tile = array_ops.expand_dims(x_norm_squared, 2)
+
+  # squared_dists[b,i,j] = ||x_bi - x_bj||^2 = x_bi'x_bi- 2x_bi'x_bj + x_bj'x_bj
+  squared_dists = x_norm_squared_tile - 2 * x_x_transpose + array_ops.transpose(
+      x_norm_squared_tile, [0, 2, 1])
+
+  return squared_dists
+
+
+def _solve_interpolation(train_points, train_values, order,
+                         regularization_weight):
+  """Solve for interpolation coefficients.
+
+  Computes the coefficients of the polyharmonic interpolant for the 'training'
+  data defined by (train_points, train_values) using the kernel phi.
+
+  Args:
+    train_points: `[b, n, d]` interpolation centers
+    train_values: `[b, n, k]` function values
+    order: order of the interpolation
+    regularization_weight: weight to place on smoothness regularization term
+
+  Returns:
+    w: `[b, n, k]` weights on each interpolation center
+    v: `[b, d, k]` weights on each input dimension
+  """
+
+  b, n, d = train_points.get_shape().as_list()
+  _, _, k = train_values.get_shape().as_list()
+
+  # First, rename variables so that the notation (c, f, w, v, A, B, etc.)
+  # follows https://en.wikipedia.org/wiki/Polyharmonic_spline.
+  # To account for python style guidelines we use
+  # matrix_a for A and matrix_b for B.
+
+  c = train_points
+  f = train_values
+
+  # Next, construct the linear system.
+  with ops.name_scope('construct_linear_system'):
+
+    matrix_a = _phi(_pairwise_squared_distance_matrix(c), order)  # [b, n, n]
+    if regularization_weight > 0:
+      batch_identity_matrix = np.expand_dims(np.eye(n), 0)
+      batch_identity_matrix = constant_op.constant(
+          batch_identity_matrix, dtype=train_points.dtype)
+
+      matrix_a += regularization_weight * batch_identity_matrix
+
+    # Append ones to the feature values for the bias term in the linear model.
+    ones = array_ops.ones([b, n, 1], train_points.dtype)
+    matrix_b = array_ops.concat([c, ones], 2)  # [b, n, d + 1]
+
+    # [b, n + d + 1, n]
+    left_block = array_ops.concat(
+        [matrix_a, array_ops.transpose(matrix_b, [0, 2, 1])], 1)
+
+    num_b_cols = matrix_b.get_shape()[2]  # d + 1
+    lhs_zeros = array_ops.zeros([b, num_b_cols, num_b_cols], train_points.dtype)
+    right_block = array_ops.concat([matrix_b, lhs_zeros],
+                                   1)  # [b, n + d + 1, d + 1]
+    lhs = array_ops.concat([left_block, right_block],
+                           2)  # [b, n + d + 1, n + d + 1]
+
+    rhs_zeros = array_ops.zeros([b, d + 1, k], train_points.dtype)
+    rhs = array_ops.concat([f, rhs_zeros], 1)  # [b, n + d + 1, k]
+
+  # Then, solve the linear system and unpack the results.
+  with ops.name_scope('solve_linear_system'):
+    w_v = linalg_ops.matrix_solve(lhs, rhs)
+    w = w_v[:, :n, :]
+    v = w_v[:, n:, :]
+
+  return w, v
+
+
+def _apply_interpolation(query_points, train_points, w, v, order):
+  """Apply polyharmonic interpolation model to data.
+
+  Given coefficients w and v for the interpolation model, we evaluate
+  interpolated function values at query_points.
+
+  Args:
+    query_points: `[b, m, d]` x values to evaluate the interpolation at
+    train_points: `[b, n, d]` x values that act as the interpolation centers
+                    ( the c variables in the wikipedia article)
+    w: `[b, n, k]` weights on each interpolation center
+    v: `[b, d, k]` weights on each input dimension
+    order: order of the interpolation
+
+  Returns:
+    Polyharmonic interpolation evaluated at points defined in query_points.
+  """
+
+  batch_size = train_points.get_shape()[0].value
+  num_query_points = query_points.get_shape()[1].value
+
+  # First, compute the contribution from the rbf term.
+  pairwise_dists = _cross_squared_distance_matrix(query_points, train_points)
+  phi_pairwise_dists = _phi(pairwise_dists, order)
+
+  rbf_term = math_ops.matmul(phi_pairwise_dists, w)
+
+  # Then, compute the contribution from the linear term.
+  # Pad query_points with ones, for the bias term in the linear model.
+  query_points_pad = array_ops.concat([
+      query_points,
+      array_ops.ones([batch_size, num_query_points, 1], train_points.dtype)
+  ], 2)
+  linear_term = math_ops.matmul(query_points_pad, v)
+
+  return rbf_term + linear_term
+
+
+def _phi(r, order):
+  """Coordinate-wise nonlinearity used to define the order of the interpolation.
+
+  See https://en.wikipedia.org/wiki/Polyharmonic_spline for the definition.
+
+  Args:
+    r: input op
+    order: interpolation order
+
+  Returns:
+    phi_k evaluated coordinate-wise on r, for k = r
+  """
+
+  # using EPSILON prevents log(0), sqrt0), etc.
+  # sqrt(0) is well-defined, but its gradient is not
+  with ops.name_scope('phi'):
+    if order == 1:
+      r = math_ops.maximum(r, EPSILON)
+      r = math_ops.sqrt(r)
+      return r
+    elif order == 2:
+      return 0.5 * r * math_ops.log(math_ops.maximum(r, EPSILON))
+    elif order == 4:
+      return 0.5 * math_ops.square(r) * math_ops.log(
+          math_ops.maximum(r, EPSILON))
+    elif order % 2 == 0:
+      r = math_ops.maximum(r, EPSILON)
+      return 0.5 * math_ops.pow(r, 0.5 * order) * math_ops.log(r)
+    else:
+      r = math_ops.maximum(r, EPSILON)
+      return math_ops.pow(r, 0.5 * order)
+
+
+def interpolate_spline(train_points,
+                       train_values,
+                       query_points,
+                       order,
+                       regularization_weight=0.0,
+                       name='interpolate_spline'):
+  r"""Interpolate signal using polyharmonic interpolation.
+
+  The interpolant has the form
+  $$f(x) = \sum_{i = 1}^n w_i \phi(||x - c_i||) + v^T x + b.$$
+
+  This is a sum of two terms: (1) a weighted sum of radial basis function (RBF)
+  terms, with the centers \\(c_1, ... c_n\\), and (2) a linear term with a bias.
+  The \\(c_i\\) vectors are 'training' points. In the code, b is absorbed into v
+  by appending 1 as a final dimension to x. The coefficients w and v are
+  estimated such that the interpolant exactly fits the value of the function at
+  the \\(c_i\\) points, the vector w is orthogonal to each \\(c_i\\), and the
+  vector w sums to 0. With these constraints, the coefficients can be obtained
+  by solving a linear system.
+
+  \\(\phi\\) is an RBF, parametrized by an interpolation
+  order. Using order=2 produces the well-known thin-plate spline.
+
+  We also provide the option to perform regularized interpolation. Here, the
+  interpolant is selected to trade off between the squared loss on the training
+  data and a certain measure of its curvature
+  ([details](https://en.wikipedia.org/wiki/Polyharmonic_spline)).
+  Using a regularization weight greater than zero has the effect that the
+  interpolant will no longer exactly fit the training data. However, it may be
+  less vulnerable to overfitting, particularly for high-order interpolation.
+
+  Note the interpolation procedure is differentiable with respect to all inputs
+  besides the order parameter.
+
+  Args:
+    train_points: `[batch_size, n, d]` float `Tensor` of n d-dimensional
+      locations. These do not need to be regularly-spaced.
+    train_values: `[batch_size, n, k]` float `Tensor` of n c-dimensional values
+      evaluated at train_points.
+    query_points: `[batch_size, m, d]` `Tensor` of m d-dimensional locations
+      where we will output the interpolant's values.
+    order: order of the interpolation. Common values are 1 for
+      \\(\phi(r) = r\\), 2 for \\(\phi(r) = r^2 * log(r)\\) (thin-plate spline),
+       or 3 for \\(\phi(r) = r^3\\).
+    regularization_weight: weight placed on the regularization term.
+      This will depend substantially on the problem, and it should always be
+      tuned. For many problems, it is reasonable to use no regularization.
+      If using a non-zero value, we recommend a small value like 0.001.
+    name: name prefix for ops created by this function
+
+  Returns:
+    `[b, m, k]` float `Tensor` of query values. We use train_points and
+    train_values to perform polyharmonic interpolation. The query values are
+    the values of the interpolant evaluated at the locations specified in
+    query_points.
+  """
+  with ops.name_scope(name):
+    train_points = ops.convert_to_tensor(train_points)
+    train_values = ops.convert_to_tensor(train_values)
+    query_points = ops.convert_to_tensor(query_points)
+
+    # First, fit the spline to the observed data.
+    with ops.name_scope('solve'):
+      w, v = _solve_interpolation(train_points, train_values, order,
+                                  regularization_weight)
+
+    # Then, evaluate the spline at the query locations.
+    with ops.name_scope('predict'):
+      query_values = _apply_interpolation(query_points, train_points, w, v,
+                                          order)
+
+  return query_values
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index d4a6a5bcbb52511d4093587814100b2a0e8b2420..0ceb683ff4c6965a5ee4bcb04846a69d4d8ea0a5 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -45,7 +45,7 @@ def single_image_random_dot_stereograms(depth_values,
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
   will encode 3-D data into a 2-D image.  The output of this Op is suitable
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
-  corrupt the encode 3-D data witin the image.
+  corrupt the encode 3-D data within the image.
 
   Based upon [this
   paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
diff --git a/tensorflow/contrib/image/python/ops/sparse_image_warp.py b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
new file mode 100644
index 0000000000000000000000000000000000000000..54a215d6db6ded56a1a4a018a7e176f35fe6397e
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Image warping using sparse flow defined at control points."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import dense_image_warp
+from tensorflow.contrib.image.python.ops import interpolate_spline
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+
+
+def _get_grid_locations(image_height, image_width):
+  """Wrapper for np.meshgrid."""
+
+  y_range = np.linspace(0, image_height - 1, image_height)
+  x_range = np.linspace(0, image_width - 1, image_width)
+  y_grid, x_grid = np.meshgrid(y_range, x_range, indexing='ij')
+  return np.stack((y_grid, x_grid), -1)
+
+
+def _expand_to_minibatch(np_array, batch_size):
+  """Tile arbitrarily-sized np_array to include new batch dimension."""
+  tiles = [batch_size] + [1] * np_array.ndim
+  return np.tile(np.expand_dims(np_array, 0), tiles)
+
+
+def _get_boundary_locations(image_height, image_width, num_points_per_edge):
+  """Compute evenly-spaced indices along edge of image."""
+  y_range = np.linspace(0, image_height - 1, num_points_per_edge + 2)
+  x_range = np.linspace(0, image_width - 1, num_points_per_edge + 2)
+  ys, xs = np.meshgrid(y_range, x_range, indexing='ij')
+  is_boundary = np.logical_or(
+      np.logical_or(xs == 0, xs == image_width - 1),
+      np.logical_or(ys == 0, ys == image_height - 1))
+  return np.stack([ys[is_boundary], xs[is_boundary]], axis=-1)
+
+
+def _add_zero_flow_controls_at_boundary(control_point_locations,
+                                        control_point_flows, image_height,
+                                        image_width, boundary_points_per_edge):
+  """Add control points for zero-flow boundary conditions.
+
+   Augment the set of control points with extra points on the
+   boundary of the image that have zero flow.
+
+  Args:
+    control_point_locations: input control points
+    control_point_flows: their flows
+    image_height: image height
+    image_width: image width
+    boundary_points_per_edge: number of points to add in the middle of each
+                           edge (not including the corners).
+                           The total number of points added is
+                           4 + 4*(boundary_points_per_edge).
+
+  Returns:
+    merged_control_point_locations: augmented set of control point locations
+    merged_control_point_flows: augmented set of control point flows
+  """
+
+  batch_size = control_point_locations.get_shape()[0].value
+
+  boundary_point_locations = _get_boundary_locations(image_height, image_width,
+                                                     boundary_points_per_edge)
+
+  boundary_point_flows = np.zeros([boundary_point_locations.shape[0], 2])
+
+  type_to_use = control_point_locations.dtype
+  boundary_point_locations = constant_op.constant(
+      _expand_to_minibatch(boundary_point_locations, batch_size),
+      dtype=type_to_use)
+
+  boundary_point_flows = constant_op.constant(
+      _expand_to_minibatch(boundary_point_flows, batch_size), dtype=type_to_use)
+
+  merged_control_point_locations = array_ops.concat(
+      [control_point_locations, boundary_point_locations], 1)
+
+  merged_control_point_flows = array_ops.concat(
+      [control_point_flows, boundary_point_flows], 1)
+
+  return merged_control_point_locations, merged_control_point_flows
+
+
+def sparse_image_warp(image,
+                      source_control_point_locations,
+                      dest_control_point_locations,
+                      interpolation_order=2,
+                      regularization_weight=0.0,
+                      num_boundary_points=0,
+                      name='sparse_image_warp'):
+  """Image warping using correspondences between sparse control points.
+
+  Apply a non-linear warp to the image, where the warp is specified by
+  the source and destination locations of a (potentially small) number of
+  control points. First, we use a polyharmonic spline
+  (@{tf.contrib.image.interpolate_spline}) to interpolate the displacements
+  between the corresponding control points to a dense flow field.
+  Then, we warp the image using this dense flow field
+  (@{tf.contrib.image.dense_image_warp}).
+
+  Let t index our control points. For regularization_weight=0, we have:
+  warped_image[b, dest_control_point_locations[b, t, 0],
+                  dest_control_point_locations[b, t, 1], :] =
+  image[b, source_control_point_locations[b, t, 0],
+           source_control_point_locations[b, t, 1], :].
+
+  For regularization_weight > 0, this condition is met approximately, since
+  regularized interpolation trades off smoothness of the interpolant vs.
+  reconstruction of the interpolant at the control points.
+  See @{tf.contrib.image.interpolate_spline} for further documentation of the
+  interpolation_order and regularization_weight arguments.
+
+
+  Args:
+    image: `[batch, height, width, channels]` float `Tensor`
+    source_control_point_locations: `[batch, num_control_points, 2]` float
+      `Tensor`
+    dest_control_point_locations: `[batch, num_control_points, 2]` float
+      `Tensor`
+    interpolation_order: polynomial order used by the spline interpolation
+    regularization_weight: weight on smoothness regularizer in interpolation
+    num_boundary_points: How many zero-flow boundary points to include at
+      each image edge.Usage:
+        num_boundary_points=0: don't add zero-flow points
+        num_boundary_points=1: 4 corners of the image
+        num_boundary_points=2: 4 corners and one in the middle of each edge
+          (8 points total)
+        num_boundary_points=n: 4 corners and n-1 along each edge
+    name: A name for the operation (optional).
+
+    Note that image and offsets can be of type tf.half, tf.float32, or
+    tf.float64, and do not necessarily have to be the same type.
+
+  Returns:
+    warped_image: `[batch, height, width, channels]` float `Tensor` with same
+      type as input image.
+    flow_field: `[batch, height, width, 2]` float `Tensor` containing the dense
+      flow field produced by the interpolation.
+  """
+
+  image = ops.convert_to_tensor(image)
+  source_control_point_locations = ops.convert_to_tensor(
+      source_control_point_locations)
+  dest_control_point_locations = ops.convert_to_tensor(
+      dest_control_point_locations)
+
+  control_point_flows = (
+      dest_control_point_locations - source_control_point_locations)
+
+  clamp_boundaries = num_boundary_points > 0
+  boundary_points_per_edge = num_boundary_points - 1
+
+  with ops.name_scope(name):
+
+    batch_size, image_height, image_width, _ = image.get_shape().as_list()
+
+    # This generates the dense locations where the interpolant
+    # will be evaluated.
+    grid_locations = _get_grid_locations(image_height, image_width)
+
+    flattened_grid_locations = np.reshape(grid_locations,
+                                          [image_height * image_width, 2])
+
+    flattened_grid_locations = constant_op.constant(
+        _expand_to_minibatch(flattened_grid_locations, batch_size), image.dtype)
+
+    if clamp_boundaries:
+      (dest_control_point_locations,
+       control_point_flows) = _add_zero_flow_controls_at_boundary(
+           dest_control_point_locations, control_point_flows, image_height,
+           image_width, boundary_points_per_edge)
+
+    flattened_flows = interpolate_spline.interpolate_spline(
+        dest_control_point_locations, control_point_flows,
+        flattened_grid_locations, interpolation_order, regularization_weight)
+
+    dense_flows = array_ops.reshape(flattened_flows,
+                                    [batch_size, image_height, image_width, 2])
+
+    warped_image = dense_image_warp.dense_image_warp(image, dense_flows)
+
+    return warped_image, dense_flows
diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD
index 9d6b4d5d87e24d72b29ab33ee805fe0d068cc30a..0e34315db45d61282af1882631dc769a72965c3e 100644
--- a/tensorflow/contrib/input_pipeline/BUILD
+++ b/tensorflow/contrib/input_pipeline/BUILD
@@ -114,14 +114,3 @@ tf_cc_tests(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/input_pipeline/kernels/BUILD b/tensorflow/contrib/input_pipeline/kernels/BUILD
index f20a6e38d4e80f869e9274d6fc49338a95fc6788..797605b8fe66e8375edcc70668a07a8d2a6d73f3 100644
--- a/tensorflow/contrib/input_pipeline/kernels/BUILD
+++ b/tensorflow/contrib/input_pipeline/kernels/BUILD
@@ -17,14 +17,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/integrate/BUILD b/tensorflow/contrib/integrate/BUILD
index 66948c1ea1f3f239d3f43a57626f8c229fe24ad9..0b7d64f4edd7587000ca5b9ecae257fe8fedd4a1 100644
--- a/tensorflow/contrib/integrate/BUILD
+++ b/tensorflow/contrib/integrate/BUILD
@@ -42,14 +42,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/integrate/__init__.py b/tensorflow/contrib/integrate/__init__.py
index 68bf511099ab473d158108df6ff07827819297d9..694f0c14bd4e74535c70fab76c5f7ac58f452559 100644
--- a/tensorflow/contrib/integrate/__init__.py
+++ b/tensorflow/contrib/integrate/__init__.py
@@ -18,6 +18,7 @@
 See the @{$python/contrib.integrate} guide.
 
 @@odeint
+@@odeint_fixed
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
index efb403462a6e5df5b69ac0735ffc03f40d4a252c..3913c9dc7abfba2829bde5e86fe2927e8fc29a9d 100644
--- a/tensorflow/contrib/kafka/BUILD
+++ b/tensorflow/contrib/kafka/BUILD
@@ -1,66 +1,93 @@
-package(
-    default_visibility = ["//visibility:private"],
-)
+package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_libs",
+    "tf_py_test",
+)
 
-tf_kernel_library(
-    name = "kafka_kernels",
+py_library(
+    name = "kafka",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dataset_ops",
+    ],
+)
+
+tf_custom_op_library(
+    name = "_dataset_ops.so",
+    srcs = ["ops/dataset_ops.cc"],
+    deps = [":dataset_kernels"],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["dataset_ops"],
+)
+
+cc_library(
+    name = "dataset_kernels",
     srcs = ["kernels/kafka_dataset_ops.cc"],
-    visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:bounds_check_lib",
-        "//tensorflow/core/kernels:dataset",
+        "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
         "@kafka",
+        "@protobuf_archive//:protobuf_headers",
     ],
+    alwayslink = 1,
 )
 
-tf_gen_op_libs(
-    op_lib_names = ["kafka_ops"],
+py_library(
+    name = "dataset_ops",
+    srcs = [
+        "python/ops/kafka_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/core:lib",
+        ":kafka_op_loader",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
     ],
 )
 
 tf_gen_op_wrapper_py(
-    name = "gen_kafka_ops",
-    out = "python/ops/gen_kafka_ops.py",
-    require_shape_functions = True,
-    deps = [":kafka_ops_op_lib"],
+    name = "gen_dataset_ops",
+    out = "python/ops/gen_dataset_ops.py",
+    deps = ["//tensorflow/contrib/kafka:dataset_ops_op_lib"],
 )
 
-py_library(
-    name = "kafka",
-    srcs = [
-        "__init__.py",
-        "python/ops/kafka_dataset_ops.py",
+tf_kernel_library(
+    name = "dataset_ops_kernels",
+    deps = [
+        ":dataset_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_py_library(
+    name = "kafka_op_loader",
+    srcs = ["python/ops/kafka_op_loader.py"],
+    dso = ["//tensorflow/contrib/kafka:_dataset_ops.so"],
+    kernels = [
+        ":dataset_ops_kernels",
+        "//tensorflow/contrib/kafka:dataset_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
     deps = [
-        ":gen_kafka_ops",
+        ":gen_dataset_ops",
         "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
     ],
 )
 
@@ -88,18 +115,7 @@ tf_py_test(
     ],
     tags = [
         "manual",
+        "no_windows",
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
index 88ef5f357113372b0a2d0cb13382ac980a61252d..a4cd4a2cc4b99b5906185bd2b942ed15c1ddf5e4 100644
--- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -13,9 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/dataset.h"
-
-#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/dataset.h"
 
 #include "src-cpp/rdkafkacpp.h"
 
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/dataset_ops.cc
similarity index 100%
rename from tensorflow/contrib/kafka/ops/kafka_ops.cc
rename to tensorflow/contrib/kafka/ops/dataset_ops.cc
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index 8e51d27a342359881de072c3979a2b5a7fc034ea..a1624614d1ab1be31463c5cdc0b4cfb653165a0c 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
-from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.contrib.kafka.python.ops import kafka_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.kafka.python.ops import gen_dataset_ops
+from tensorflow.python.data.ops.dataset_ops import Dataset
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -58,8 +59,8 @@ class KafkaDataset(Dataset):
         timeout, dtype=dtypes.int64, name="timeout")
 
   def _as_variant_tensor(self):
-    return gen_kafka_ops.kafka_dataset(self._topics, self._servers, self._group,
-                                       self._eof, self._timeout)
+    return gen_dataset_ops.kafka_dataset(self._topics, self._servers,
+                                         self._group, self._eof, self._timeout)
 
   @property
   def output_classes(self):
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py b/tensorflow/contrib/kafka/python/ops/kafka_op_loader.py
similarity index 75%
rename from tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
rename to tensorflow/contrib/kafka/python/ops/kafka_op_loader.py
index 690a44ff4368663306733300a1ea70397fb93e1e..ec2fdea962ef946d3f8f32b9e630b92649d612fe 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_op_loader.py
@@ -12,8 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Experimental methods for tf.feature_column sequential input."""
-
+"""Python helper for loading kafka ops and kernels."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 7e0019ce4ad6c96e09ac9e222e2f4e2840273983..7a4cab20d1a3471af2a2a402a6d1443a90fa7f9b 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -52,15 +52,3 @@ py_library(
         "//tensorflow/python/keras",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index eff7dfeb4c1117e40f4faf43c5e92a52cffd6528..87c2dcd89b63fa9f92d93c87abce91fd3460d44e 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -90,15 +90,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index f182fef067b7f523bc5ca63227265be40528b171..4ef0a66a52429233c6e6f70667a451466493629c 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
 
   This is a generalization of standard (binary) hinge loss. For a given instance
   with correct label c*, the loss is given by:
-    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+    $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
   or equivalently
-    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
-  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+    $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
+  where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
 
   Args:
     labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 9dc01124ab195ae17b8795a11e4ebefe3f2c746b..9a721a9d440e66eb30bb94daf2b6878318f1e75f 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
   r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-  ```
-  exp(-||x-y||_2^2 / (2 * sigma^2))
-  ```
+  $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
-  where `d` is the input dimension (number of dense input features) and `D` is
-  the output dimension (i.e., dimension of the feature space the input is mapped
-  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
-  distribution and each entry of `b` is sampled independently and uniformly from
-  [0, 2 * pi].
-
-  For a single input feature vector x in R^d, its RFFM is defined as:
-  ```
-      sqrt(2/D) * cos(x * Omega + b)
-  ```
-  where `cos` is the element-wise cosine function and `x, b` are represented as
-  row vectors. The aforementioned paper shows that the linear kernel of
-  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
+  The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector
+  \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input
+  features) and \\(D\\) is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d.
+  from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled
+  independently and uniformly from [0, \\(2 * \pi\\)].
+
+  For a single input feature vector \\(x \in R^d\\), its RFFM is defined as:
+  $$\sqrt(2/D) * cos(x * \Omega + b)$$
+
+  where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are
+  represented as row vectors. The aforementioned paper shows that the linear
+  kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial
+  vectors.
 
   """
 
   def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None):
-    """Constructs a RandomFourierFeatureMapper instance.
+    r"""Constructs a RandomFourierFeatureMapper instance.
 
     Args:
       input_dim: The dimension (number of features) of the tensors to be mapped.
@@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
       stddev: The standard deviation of the Gaussian kernel to be approximated.
         The error of the classifier trained using this approximation is very
         sensitive to this parameter.
-      seed: An integer used to initialize the parameters (`Omega` and `b`) of
-        the mapper. For repeatable sequences across different invocations of the
-        mapper object (for instance, to ensure consistent mapping both at
-        training and eval/inference if these happen in different invocations),
-        set this to the same integer.
+      seed: An integer used to initialize the parameters (\\(\Omega\\) and
+        \\(b\\)) of the mapper. For repeatable sequences across different
+        invocations of the mapper object (for instance, to ensure consistent
+        mapping both at training and eval/inference if these happen in
+        different invocations), set this to the same integer.
       name: name for the mapper object.
     """
     # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
index 6f4a264485993ab737723171409042b4a9673669..91929184a2e6f3cccae92cb819501a7c6ef81673 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@@ -34,7 +34,7 @@ def _inner_product(x, y):
   """Inner product between tensors x and y.
 
   The input tensors are assumed to be in ROW representation, that is, the method
-  returns x * y^T.
+  returns \\(x * y^T\\).
 
   Args:
     x: input tensor in row format
diff --git a/tensorflow/contrib/kfac/BUILD b/tensorflow/contrib/kfac/BUILD
index 9a5759bf14f753bbc50d3ef8f54ceab7daf745ab..b719046b37ac761d56e8d5aa34772103be691cd6 100644
--- a/tensorflow/contrib/kfac/BUILD
+++ b/tensorflow/contrib/kfac/BUILD
@@ -24,15 +24,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/BUILD b/tensorflow/contrib/kfac/examples/BUILD
index 89965eda374b2b403f680fc77eb923d0e660d1e2..8186fa1c62cb952f86614a96c3965bcddae1686e 100644
--- a/tensorflow/contrib/kfac/examples/BUILD
+++ b/tensorflow/contrib/kfac/examples/BUILD
@@ -28,8 +28,28 @@ py_library(
 )
 
 py_binary(
-    name = "convnet_mnist_main",
-    srcs = ["convnet_mnist_main.py"],
+    name = "convnet_mnist_single_main",
+    srcs = ["convnet_mnist_single_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_multi_tower_main",
+    srcs = ["convnet_mnist_multi_tower_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_distributed_main",
+    srcs = ["convnet_mnist_distributed_main.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":convnet",
@@ -58,15 +78,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index 39d80addaac1fe855a37255b32bf4412b99df46a..b261f41bf97db188f38bc057d83dc78cc5aafcbf 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -37,6 +37,8 @@ import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import mlp
 from tensorflow.contrib.kfac.examples import mnist
+from tensorflow.contrib.kfac.python.ops import optimizer as opt
+
 
 lc = tf.contrib.kfac.layer_collection
 oq = tf.contrib.kfac.op_queue
@@ -48,12 +50,18 @@ __all__ = [
     "linear_layer",
     "build_model",
     "minimize_loss_single_machine",
-    "minimize_loss_distributed",
+    "distributed_grads_only_and_ops_chief_worker",
+    "distributed_grads_and_ops_dedicated_workers",
     "train_mnist_single_machine",
-    "train_mnist_distributed",
+    "train_mnist_distributed_sync_replicas",
+    "train_mnist_multitower"
 ]
 
 
+# Inverse update ops will be run every _INVERT_EVRY iterations.
+_INVERT_EVERY = 10
+
+
 def conv_layer(layer_id, inputs, kernel_size, out_channels):
   """Builds a convolutional layer with ReLU non-linearity.
 
@@ -161,8 +169,9 @@ def build_model(examples, labels, num_labels, layer_collection):
   accuracy = tf.reduce_mean(
       tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))
 
-  tf.summary.scalar("loss", loss)
-  tf.summary.scalar("accuracy", accuracy)
+  with tf.device("/cpu:0"):
+    tf.summary.scalar("loss", loss)
+    tf.summary.scalar("accuracy", accuracy)
 
   # Register parameters. K-FAC needs to know about the inputs, outputs, and
   # parameters of each conv/fully connected layer and the logits powering the
@@ -181,41 +190,59 @@ def build_model(examples, labels, num_labels, layer_collection):
 def minimize_loss_single_machine(loss,
                                  accuracy,
                                  layer_collection,
+                                 device="/gpu:0",
                                  session_config=None):
   """Minimize loss with K-FAC on a single machine.
 
-  A single Session is responsible for running all of K-FAC's ops.
+  A single Session is responsible for running all of K-FAC's ops. The covariance
+  and inverse update ops are placed on `device`. All model variables are on CPU.
 
   Args:
     loss: 0-D Tensor. Loss to be minimized.
     accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and invserse
+      update ops are run on this device.
     session_config: None or tf.ConfigProto. Configuration for tf.Session().
 
   Returns:
     final value for 'accuracy'.
   """
   # Train with K-FAC.
-  global_step = tf.train.get_or_create_global_step()
+  g_step = tf.train.get_or_create_global_step()
   optimizer = opt.KfacOptimizer(
       learning_rate=0.0001,
       cov_ema_decay=0.95,
       damping=0.001,
       layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=[device],
+      inv_devices=[device],
       momentum=0.9)
-  train_op = optimizer.minimize(loss, global_step=global_step)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+  def make_update_op(update_thunks):
+    update_ops = [thunk() for thunk in update_thunks]
+    return tf.group(*update_ops)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
+    with tf.control_dependencies([inverse_op]):
+      with tf.device(device):
+        train_op = optimizer.minimize(loss, global_step=g_step)
 
   tf.logging.info("Starting training.")
   with tf.train.MonitoredTrainingSession(config=session_config) as sess:
     while not sess.should_stop():
-      global_step_, loss_, accuracy_, _, _ = sess.run(
-          [global_step, loss, accuracy, train_op, optimizer.cov_update_op])
-
-      if global_step_ % 100 == 0:
-        sess.run(optimizer.inv_update_op)
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, train_op])
 
-      if global_step_ % 100 == 0:
+      if global_step_ % _INVERT_EVERY == 0:
         tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                         global_step_, loss_, accuracy_)
 
@@ -250,16 +277,62 @@ def _num_gradient_tasks(num_tasks):
   return int(np.ceil(0.6 * num_tasks))
 
 
-def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
-                              checkpoint_dir, loss, accuracy, layer_collection):
-  """Minimize loss with an synchronous implementation of K-FAC.
+def _make_distributed_train_op(
+    task_id,
+    num_worker_tasks,
+    num_ps_tasks,
+    layer_collection
+):
+  """Creates optimizer and distributed training op.
+
+  Constructs KFAC optimizer and wraps it in `sync_replicas` optimizer. Makes
+  the train op.
+
+  Args:
+   task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    sync_optimizer: `tf.train.SyncReplicasOptimizer` instance which wraps KFAC
+      optimizer.
+    optimizer: Instance of `opt.KfacOptimizer`.
+    global_step: `tensor`, Global step.
+  """
+  tf.logging.info("Task id : %d", task_id)
+  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
+    global_step = tf.train.get_or_create_global_step()
+    optimizer = opt.KfacOptimizer(
+        learning_rate=0.0001,
+        cov_ema_decay=0.95,
+        damping=0.001,
+        layer_collection=layer_collection,
+        momentum=0.9)
+    sync_optimizer = tf.train.SyncReplicasOptimizer(
+        opt=optimizer,
+        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks),
+        total_num_replicas=num_worker_tasks)
+    return sync_optimizer, optimizer, global_step
+
+
+def distributed_grads_only_and_ops_chief_worker(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection, invert_every=10):
+  """Minimize loss with a synchronous implementation of K-FAC.
 
-  Different tasks are responsible for different parts of K-FAC's Ops. The first
-  60% of tasks update weights; the next 20% accumulate covariance statistics;
-  the last 20% invert the matrices used to precondition gradients.
+  All workers perform gradient computation. Chief worker applies gradient after
+  averaging the gradients obtained from all the workers. All workers block
+  execution untill the update is applied. Chief worker runs covariance and
+  inverse update ops. Covariance and inverse matrices are placed on parameter
+  servers in a round robin manner. For further details on synchronous
+  distributed optimization check `tf.train.SyncReplicasOptimizer`.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables. If 0,
       parameter servers are not used.
@@ -271,6 +344,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       run with each step.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    invert_every: `int`, Number of steps between update the inverse.
 
   Returns:
     final value for 'accuracy'.
@@ -278,20 +352,82 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   Raises:
     ValueError: if task_id >= num_worker_tasks.
   """
-  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
-    global_step = tf.train.get_or_create_global_step()
-    optimizer = opt.KfacOptimizer(
-        learning_rate=0.0001,
-        cov_ema_decay=0.95,
-        damping=0.001,
-        layer_collection=layer_collection,
-        momentum=0.9)
-    inv_update_queue = oq.OpQueue(optimizer.inv_update_ops)
-    sync_optimizer = tf.train.SyncReplicasOptimizer(
-        opt=optimizer,
-        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks))
+
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+  tf.logging.info("Starting training.")
+  hooks = [sync_optimizer.make_session_run_hook(is_chief)]
+
+  def make_update_op(update_thunks):
+    update_ops = [thunk() for thunk in update_thunks]
+    return tf.group(*update_ops)
+
+  if is_chief:
+    cov_update_op = make_update_op(cov_update_thunks)
+    with tf.control_dependencies([cov_update_op]):
+      inverse_op = tf.cond(
+          tf.equal(tf.mod(global_step, invert_every), 0),
+          lambda: make_update_op(inv_update_thunks),
+          tf.no_op)
+      with tf.control_dependencies([inverse_op]):
+        train_op = sync_optimizer.minimize(loss, global_step=global_step)
+  else:
     train_op = sync_optimizer.minimize(loss, global_step=global_step)
 
+  with tf.train.MonitoredTrainingSession(
+      master=master,
+      is_chief=is_chief,
+      checkpoint_dir=checkpoint_dir,
+      hooks=hooks,
+      stop_grace_period_secs=0) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [global_step, loss, accuracy, train_op])
+      tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_,
+                      loss_, accuracy_)
+  return accuracy_
+
+
+def distributed_grads_and_ops_dedicated_workers(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  Different workers are responsible for different parts of K-FAC's Ops. The
+  first 60% of tasks compute gradients; the next 20% accumulate covariance
+  statistics; the last 20% invert the matrices used to precondition gradients.
+  The chief worker applies the gradient .
+
+  Args:
+    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    master: string. IP and port of TensorFlow runtime process. Set to empty
+      string to run locally.
+    checkpoint_dir: string or None. Path to store checkpoints under.
+    loss: 0-D Tensor. Loss to be minimized.
+    accuracy: dict mapping strings to 0-D Tensors. Additional accuracy to
+      run with each step.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    final value for 'accuracy'.
+
+  Raises:
+    ValueError: if task_id >= num_worker_tasks.
+  """
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  _, cov_update_op, inv_update_ops, _, _, _ = optimizer.make_ops_and_vars()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+  inv_update_queue = oq.OpQueue(inv_update_ops)
+
   tf.logging.info("Starting training.")
   is_chief = (task_id == 0)
   hooks = [sync_optimizer.make_session_run_hook(is_chief)]
@@ -306,7 +442,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       if _is_gradient_task(task_id, num_worker_tasks):
         learning_op = train_op
       elif _is_cov_update_task(task_id, num_worker_tasks):
-        learning_op = optimizer.cov_update_op
+        learning_op = cov_update_op
       elif _is_inv_update_task(task_id, num_worker_tasks):
         # TODO(duckworthd): Running this op before cov_update_op has been run a
         # few times can result in "InvalidArgumentError: Cholesky decomposition
@@ -324,13 +460,18 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   return accuracy_
 
 
-def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
+def train_mnist_single_machine(data_dir,
+                               num_epochs,
+                               use_fake_data=False,
+                               device="/gpu:0"):
   """Train a ConvNet on MNIST.
 
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
@@ -350,22 +491,38 @@ def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
       examples, labels, num_labels=10, layer_collection=layer_collection)
 
   # Fit model.
-  return minimize_loss_single_machine(loss, accuracy, layer_collection)
+  return minimize_loss_single_machine(
+      loss, accuracy, layer_collection, device=device)
 
 
 def train_mnist_multitower(data_dir, num_epochs, num_towers,
-                           use_fake_data=True):
+                           use_fake_data=True, devices=None):
   """Train a ConvNet on MNIST.
 
+  Training data is split equally among the towers. Each tower computes loss on
+  its own batch of data and the loss is aggregated on the CPU. The model
+  variables are placed on first tower. The covariance and inverse update ops
+  and variables are placed on GPUs in a round robin manner.
+
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     num_towers: int. Number of CPUs to split inference across.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    devices: string, Either list of CPU or GPU. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
   """
+  if devices:
+    device_count = {"GPU": num_towers}
+  else:
+    device_count = {"CPU": num_towers}
+
+  devices = devices or [
+      "/cpu:{}".format(tower_id) for tower_id in range(num_towers)
+  ]
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
   tower_batch_size = 128
@@ -388,7 +545,7 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   layer_collection = lc.LayerCollection()
   tower_results = []
   for tower_id in range(num_towers):
-    with tf.device("/cpu:%d" % tower_id):
+    with tf.device(devices[tower_id]):
       with tf.name_scope("tower%d" % tower_id):
         with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
           tf.logging.info("Building tower %d." % tower_id)
@@ -402,34 +559,79 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   accuracy = tf.reduce_mean(accuracies)
 
   # Fit model.
+
   session_config = tf.ConfigProto(
-      allow_soft_placement=False, device_count={
-          "CPU": num_towers
-      })
-  return minimize_loss_single_machine(
-      loss, accuracy, layer_collection, session_config=session_config)
+      allow_soft_placement=False,
+      device_count=device_count,
+  )
 
+  g_step = tf.train.get_or_create_global_step()
+  optimizer = opt.KfacOptimizer(
+      learning_rate=0.0001,
+      cov_ema_decay=0.95,
+      damping=0.001,
+      layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=devices,
+      inv_devices=devices,
+      momentum=0.9)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
 
-def train_mnist_distributed(task_id,
-                            num_worker_tasks,
-                            num_ps_tasks,
-                            master,
-                            data_dir,
-                            num_epochs,
-                            use_fake_data=False):
-  """Train a ConvNet on MNIST.
+  def make_update_op(update_thunks):
+    update_ops = [thunk() for thunk in update_thunks]
+    return tf.group(*update_ops)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
+    with tf.control_dependencies([inverse_op]):
+      train_op = optimizer.minimize(loss, global_step=g_step)
+
+  tf.logging.info("Starting training.")
+  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, train_op])
+
+      if global_step_ % _INVERT_EVERY == 0:
+        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
+                        global_step_, loss_, accuracy_)
+
+
+def train_mnist_distributed_sync_replicas(task_id,
+                                          is_chief,
+                                          num_worker_tasks,
+                                          num_ps_tasks,
+                                          master,
+                                          data_dir,
+                                          num_epochs,
+                                          op_strategy,
+                                          use_fake_data=False):
+  """Train a ConvNet on MNIST using Sync replicas optimizer.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables.
     master: string. IP and port of TensorFlow runtime process.
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
+    op_strategy: `string`, Strategy to run the covariance and inverse
+      ops. If op_strategy == `chief_worker` then covaraiance and inverse
+      update ops are run on chief worker otherwise they are run on dedicated
+      workers.
+
     use_fake_data: bool. If True, generate a synthetic dataset.
 
   Returns:
     accuracy of model on the final minibatch of training data.
+
+  Raises:
+    ValueError: If `op_strategy` not in ["chief_worker", "dedicated_workers"].
   """
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
@@ -448,9 +650,17 @@ def train_mnist_distributed(task_id,
 
   # Fit model.
   checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
-  return minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks,
-                                   master, checkpoint_dir, loss, accuracy,
-                                   layer_collection)
+  if op_strategy == "chief_worker":
+    return distributed_grads_only_and_ops_chief_worker(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  elif op_strategy == "dedicated_workers":
+    return distributed_grads_and_ops_dedicated_workers(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  else:
+    raise ValueError("Only supported op strategies are : {}, {}".format(
+        "chief_worker", "dedicated_workers"))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c2d4a9e9bfcc4bfb55a25d2f23e66afe5b1375
--- /dev/null
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Distributed training with sync replicas optimizer. See
+`convnet.train_mnist_distributed_sync_replicas` for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_integer("task", -1, "Task identifier")
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+flags.DEFINE_string(
+    "cov_inv_op_strategy", "chief_worker",
+    "In dist training mode run the cov, inv ops on chief or dedicated workers."
+)
+flags.DEFINE_string("master", "local", "Session master.")
+flags.DEFINE_integer("ps_tasks", 2,
+                     "Number of tasks in the parameter server job.")
+flags.DEFINE_integer("replicas_to_aggregate", 5,
+                     "Number of replicas to aggregate.")
+flags.DEFINE_integer("worker_replicas", 5, "Number of replicas in worker job.")
+flags.DEFINE_integer("num_epochs", None, "Number of epochs.")
+
+
+def _is_chief():
+  """Determines whether a job is the chief worker."""
+  if "chief_worker" in FLAGS.brain_jobs:
+    return FLAGS.brain_job_name == "chief_worker"
+  else:
+    return FLAGS.task == 0
+
+
+def main(unused_argv):
+  _ = unused_argv
+  convnet.train_mnist_distributed_sync_replicas(
+      FLAGS.task, _is_chief(), FLAGS.worker_replicas, FLAGS.ps_tasks,
+      FLAGS.master, FLAGS.data_dir, FLAGS.num_epochs, FLAGS.cov_inv_op_strategy)
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
similarity index 57%
rename from tensorflow/contrib/kfac/examples/convnet_mnist_main.py
rename to tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
index b0c6fbde198850c76af0bc1600dc23e926227229..4249bf8a8d9d3a5beb87d4140a55b0ee6eadbc64 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
@@ -14,44 +14,35 @@
 # ==============================================================================
 r"""Train a ConvNet on MNIST using K-FAC.
 
-See convnet.py for details.
+Multi tower training mode. See `convnet.train_mnist_multitower` for details.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-import sys
 
+from absl import flags
 import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import convnet
 
-FLAGS = None
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/multitower_1/mnist", "local mnist dir")
+flags.DEFINE_integer("num_towers", 2,
+                     "Number of towers for multi tower training.")
 
 
-def main(argv):
-  _ = argv
-
-  if FLAGS.num_towers > 1:
-    convnet.train_mnist_multitower(
-        FLAGS.data_dir, num_epochs=200, num_towers=FLAGS.num_towers)
-  else:
-    convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
+def main(unused_argv):
+  _ = unused_argv
+  assert FLAGS.num_towers > 1
+  devices = ["/gpu:{}".format(tower_id) for tower_id in range(FLAGS.num_towers)]
+  convnet.train_mnist_multitower(
+      FLAGS.data_dir,
+      num_epochs=200,
+      num_towers=FLAGS.num_towers,
+      devices=devices)
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--data_dir",
-      type=str,
-      default="/tmp/mnist",
-      help="Directory to store dataset in.")
-  parser.add_argument(
-      "--num_towers",
-      type=int,
-      default=1,
-      help="Number of CPUs to split minibatch across.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad.py b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
similarity index 63%
rename from tensorflow/contrib/bayesflow/python/ops/custom_grad.py
rename to tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
index ca1ecb9c40204c3c723fa3423cfe148e823adc28..2c1f09936073a34816da61d771f59e848b8787af 100644
--- a/tensorflow/contrib/bayesflow/python/ops/custom_grad.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
@@ -12,23 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functions for specifying custom gradients.
+r"""Train a ConvNet on MNIST using K-FAC.
 
-See ${python/contrib.bayesflow.custom_gradient}.
+Train on single machine. See `convnet.train_mnist_single_machine` for details.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.custom_grad_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
-    'custom_gradient',
-]
+from absl import flags
+import tensorflow as tf
 
-remove_undocumented(__name__, _allowed_symbols)
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+
+
+def main(unused_argv):
+  convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
+
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py
index 87eed03888c894a04c0521d1ce5ee8975b60776b..ea2b252a05702d5adcdc5f70d713277ba604f691 100644
--- a/tensorflow/contrib/kfac/examples/mlp.py
+++ b/tensorflow/contrib/kfac/examples/mlp.py
@@ -105,18 +105,21 @@ def build_model(examples, labels, num_labels, layer_collection):
   return loss, accuracy
 
 
-def minimize(loss, accuracy, layer_collection, session_config=None):
+def minimize(loss, accuracy, layer_collection, num_towers, session_config=None):
   """Minimize 'loss' with KfacOptimizer.
 
   Args:
     loss: 0-D Tensor. Loss to be minimized.
     accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
     layer_collection: LayerCollection instance. Describes layers in model.
+    num_towers: int. Number of CPUs to split minibatch across.
     session_config: tf.ConfigProto. Configuration for tf.Session().
 
   Returns:
     accuracy of classifier on final minibatch.
   """
+  devices = tuple("/cpu:%d" % tower_id for tower_id in range(num_towers))
+
   # Train with K-FAC. We'll use a decreasing learning rate that's cut in 1/2
   # every 10k iterations.
   tf.logging.info("Building KFAC Optimizer.")
@@ -125,27 +128,38 @@ def minimize(loss, accuracy, layer_collection, session_config=None):
       learning_rate=tf.train.exponential_decay(
           0.00002, global_step, 10000, 0.5, staircase=True),
       cov_ema_decay=0.95,
-      damping=0.0001,
+      damping=0.0005,
       layer_collection=layer_collection,
-      momentum=0.99)
-  train_op = optimizer.minimize(loss, global_step=global_step)
+      momentum=0.99,
+      placement_strategy="round_robin",
+      cov_devices=devices,
+      inv_devices=devices)
+
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+  def make_update_op(update_thunks):
+    update_ops = [thunk() for thunk in update_thunks]
+    return tf.group(*update_ops)
+
+  # TODO(b/78537047): change (some) examples to use PeriodicInvCovUpdateKfacOpt
+  # once that gets moved over?  Could still leave more advanced examples as they
+  # are (e.g. train_mnist_estimator in this file)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([cov_update_op]):
+    # We update the inverses only every 20 iterations.
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(global_step, 100), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
+    with tf.control_dependencies([inverse_op]):
+      train_op = optimizer.minimize(loss, global_step=global_step)
 
   tf.logging.info("Starting training.")
   with tf.train.MonitoredTrainingSession(config=session_config) as sess:
     while not sess.should_stop():
-      # K-FAC has 3 primary ops,
-      # - train_op: Update the weights with the minibatch's gradient.
-      # - cov_update_op: Update statistics used for building K-FAC's
-      #   preconditioner matrix.
-      # - inv_update_op: Update preconditioner matrix using statistics.
-      #
-      # The first 2 of these are cheap and should be done with each step. The
-      # latter is more expensive, and should be updated ~100 iterations.
-      global_step_, loss_, accuracy_, _, _ = sess.run(
-          [global_step, loss, accuracy, train_op, optimizer.cov_update_op])
-
-      if global_step_ % 100 == 0:
-        sess.run(optimizer.inv_update_op)
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [global_step, loss, accuracy, train_op])
 
       if global_step_ % 100 == 0:
         tf.logging.info("global_step: %d | loss: %f | accuracy: %f",
@@ -180,7 +194,7 @@ def train_mnist(data_dir, num_epochs, use_fake_data=False):
   loss, accuracy = build_model(examples, labels, 10, layer_collection)
 
   # Fit model.
-  minimize(loss, accuracy, layer_collection)
+  minimize(loss, accuracy, layer_collection, 1)
 
 
 def train_mnist_multitower(data_dir,
@@ -238,7 +252,8 @@ def train_mnist_multitower(data_dir,
           "CPU": num_towers
       })
   return minimize(
-      loss, accuracy, layer_collection, session_config=session_config)
+      loss, accuracy, layer_collection, num_towers,
+      session_config=session_config)
 
 
 def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False):
@@ -298,13 +313,26 @@ def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False):
         layer_collection=layer_collection,
         momentum=0.99)
 
+    (cov_update_thunks,
+     inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+    def make_update_op(update_thunks):
+      update_ops = [thunk() for thunk in update_thunks]
+      return tf.group(*update_ops)
+
+    def make_batch_executed_op(update_thunks, batch_size=1):
+      return tf.group(*tf.contrib.kfac.utils.batch_execute(
+          global_step, update_thunks, batch_size=batch_size))
+
     # Run cov_update_op every step. Run 1 inv_update_ops per step.
-    cov_update_op = optimizer.cov_update_op
-    inv_update_op = tf.group(
-        tf.contrib.kfac.utils.batch_execute(
-            global_step, optimizer.inv_update_thunks, batch_size=1))
-    with tf.control_dependencies([cov_update_op, inv_update_op]):
-      train_op = optimizer.minimize(loss, global_step=global_step)
+    cov_update_op = make_update_op(cov_update_thunks)
+    with tf.control_dependencies([cov_update_op]):
+      # But make sure to execute all the inverse ops on the first step
+      inverse_op = tf.cond(tf.equal(global_step, 0),
+                           lambda: make_update_op(inv_update_thunks),
+                           lambda: make_batch_executed_op(inv_update_thunks))
+      with tf.control_dependencies([inverse_op]):
+        train_op = optimizer.minimize(loss, global_step=global_step)
 
     # Print metrics every 5 sec.
     hooks = [
diff --git a/tensorflow/contrib/kfac/examples/tests/BUILD b/tensorflow/contrib/kfac/examples/tests/BUILD
index ce7da95c124beaed4773d68ce0d0c41f187f7c9d..ede7f183fe24f26bd86e232e831dea5f8ea1fdc4 100644
--- a/tensorflow/contrib/kfac/examples/tests/BUILD
+++ b/tensorflow/contrib/kfac/examples/tests/BUILD
@@ -50,15 +50,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
index 8d86c2bb5150cd4bc8a2b21ba050e904929e0fe9..adecda71666ee74bc577859589060fa65baf5166 100644
--- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py
+++ b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
@@ -112,15 +112,16 @@ class ConvNetTest(tf.test.TestCase):
   def testMinimizeLossSingleMachine(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_single_machine(loss, accuracy,
-                                                       layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      accuracy_ = convnet.minimize_loss_single_machine(
+          loss, accuracy, layer_collection, device="/cpu:0")
+      self.assertLess(accuracy_, 2.0)
 
   def testMinimizeLossDistributed(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_distributed(
+      accuracy_ = convnet.distributed_grads_only_and_ops_chief_worker(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
@@ -128,7 +129,7 @@ class ConvNetTest(tf.test.TestCase):
           loss=loss,
           accuracy=accuracy,
           layer_collection=layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      self.assertLess(accuracy_, 2.0)
 
   def testTrainMnistSingleMachine(self):
     with tf.Graph().as_default():
@@ -138,7 +139,7 @@ class ConvNetTest(tf.test.TestCase):
       # but there are too few parameters for the model to effectively memorize
       # the training set the way an MLP can.
       convnet.train_mnist_single_machine(
-          data_dir=None, num_epochs=1, use_fake_data=True)
+          data_dir=None, num_epochs=1, use_fake_data=True, device="/cpu:0")
 
   def testTrainMnistMultitower(self):
     with tf.Graph().as_default():
@@ -149,13 +150,15 @@ class ConvNetTest(tf.test.TestCase):
   def testTrainMnistDistributed(self):
     with tf.Graph().as_default():
       # Ensure model training doesn't crash.
-      convnet.train_mnist_distributed(
+      convnet.train_mnist_distributed_sync_replicas(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
           data_dir=None,
-          num_epochs=1,
+          num_epochs=2,
+          op_strategy="chief_worker",
           use_fake_data=True)
 
 
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index f4ed978174a9ddd8b54a88e60bfb48a67a2e76d2..6e4a8d71baa85d05d514e4683016c2f4d299ec8e 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -36,6 +36,7 @@ py_test(
     srcs = ["fisher_factors_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/kfac/python/ops:fisher_blocks",
         "//tensorflow/contrib/kfac/python/ops:fisher_factors",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -57,6 +58,7 @@ py_test(
     deps = [
         "//tensorflow/contrib/kfac/python/ops:fisher_blocks",
         "//tensorflow/contrib/kfac/python/ops:layer_collection",
+        "//tensorflow/contrib/kfac/python/ops:linear_operator",
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -95,6 +97,7 @@ py_test(
     srcs = ["optimizer_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/kfac/python/ops:fisher_factors",
         "//tensorflow/contrib/kfac/python/ops:kfac_optimizer",
         "//tensorflow/contrib/kfac/python/ops:layer_collection",
         "//tensorflow/python:array_ops",
@@ -113,6 +116,7 @@ py_test(
     name = "utils_test",
     srcs = ["utils_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/contrib/tpu",
@@ -154,15 +158,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
index bfdb69ad02caaa57827e0ae6b3c9fc0d0ed03754..0e65d419a31838a62d8ab37a5f30427c925382b4 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
@@ -23,7 +23,6 @@ import numpy as np
 from tensorflow.contrib.kfac.python.ops import estimator
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
 from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -40,30 +39,6 @@ from tensorflow.python.training import training_util
 _ALL_ESTIMATION_MODES = ["gradients", "empirical", "curvature_prop", "exact"]
 
 
-class DeviceContextGeneratorTest(test.TestCase):
-
-  def testNoDevice(self):
-    device_context_generator = estimator._DeviceContextGenerator(None)
-    with ops.device("/device:CPU:0"):  # This is what will be used
-      with device_context_generator():  # Does nothing
-        a = constant_op.constant([2.0], name="a")
-    self.assertEqual("/device:CPU:0", a.op.device)
-
-  def testTwoDevices(self):
-    device_context_generator = estimator._DeviceContextGenerator(
-        ["/device:GPU:0", "/device:GPU:1"])
-    with ops.device("/device:CPU:0"):  # Will be over-ridden by the inner scopes
-      with device_context_generator():
-        a = constant_op.constant([2.0], name="a")
-      with device_context_generator():
-        b = constant_op.constant([2.0], name="b")
-      with device_context_generator():
-        c = constant_op.constant([2.0], name="c")
-    self.assertEqual("/device:GPU:0", a.op.device)
-    self.assertEqual("/device:GPU:1", b.op.device)
-    self.assertEqual("/device:GPU:0", c.op.device)
-
-
 class EstimatorTest(test.TestCase):
 
   def setUp(self):
@@ -90,57 +65,113 @@ class EstimatorTest(test.TestCase):
   def testEstimatorInitManualRegistration(self):
     with self._graph.as_default():
       # We should be able to build an estimator for only the registered vars.
-      estimator.FisherEstimator([self.weights], 0.1, 0.2, self.layer_collection)
+      estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection
+      )
 
       # Check that we throw an error if we try to build an estimator for vars
       # that were not manually registered.
       with self.assertRaises(ValueError):
-        estimator.FisherEstimator([self.weights, self.bias], 0.1, 0.2,
-                                  self.layer_collection)
+        est = estimator.FisherEstimatorRoundRobin(
+            variables=[self.weights, self.bias],
+            cov_ema_decay=0.1,
+            damping=0.2,
+            layer_collection=self.layer_collection
+        )
+        est.make_vars_and_create_op_thunks()
 
       # Check that we throw an error if we don't include registered variables,
       # i.e. self.weights
       with self.assertRaises(ValueError):
-        estimator.FisherEstimator([], 0.1, 0.2, self.layer_collection)
+        est = estimator.FisherEstimatorRoundRobin(
+            variables=[],
+            cov_ema_decay=0.1,
+            damping=0.2,
+            layer_collection=self.layer_collection)
+        est.make_vars_and_create_op_thunks()
 
   @test.mock.patch.object(utils.SubGraph, "variable_uses", return_value=42)
   def testVariableWrongNumberOfUses(self, mock_uses):
     with self.assertRaises(ValueError):
-      estimator.FisherEstimator([self.weights], 0.1, 0.2, self.layer_collection)
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection)
+      est.make_vars_and_create_op_thunks()
 
   def testInvalidEstimationMode(self):
     with self.assertRaises(ValueError):
-      estimator.FisherEstimator([self.weights], 0.1, 0.2, self.layer_collection,
-                                "not_a_real_mode")
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="not_a_real_mode")
+      est.make_vars_and_create_op_thunks()
 
-  def testModeListCorrect(self):
+  def testGradientsModeBuild(self):
     with self._graph.as_default():
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection)
-    self.assertItemsEqual(_ALL_ESTIMATION_MODES, est._gradient_fns.keys())
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="gradients")
+      est.make_vars_and_create_op_thunks()
 
-  def testAllModesBuild(self):
-    for mode in _ALL_ESTIMATION_MODES:
-      with self._graph.as_default():
-        estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                  self.layer_collection, mode)
+  def testEmpiricalModeBuild(self):
+    with self._graph.as_default():
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="empirical")
+      est.make_vars_and_create_op_thunks()
+
+  def testCurvaturePropModeBuild(self):
+    with self._graph.as_default():
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="curvature_prop")
+      est.make_vars_and_create_op_thunks()
+
+  def testExactModeBuild(self):
+    with self._graph.as_default():
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="exact")
+      est.make_vars_and_create_op_thunks()
 
   def test_cov_update_thunks(self):
     """Ensures covariance update ops run once per global_step."""
     with self._graph.as_default(), self.test_session() as sess:
-      fisher_estimator = estimator.FisherEstimator(
+      fisher_estimator = estimator.FisherEstimatorRoundRobin(
           variables=[self.weights],
           layer_collection=self.layer_collection,
-          cov_ema_decay=0.0,
-          damping=0.0)
+          damping=0.2,
+          cov_ema_decay=0.0)
 
       # Construct an op that executes one covariance update per step.
       global_step = training_util.get_or_create_global_step()
+      (cov_variable_thunks, cov_update_op_thunks, _,
+       _) = fisher_estimator.create_ops_and_vars_thunks()
+      for thunk in cov_variable_thunks:
+        thunk()
       cov_matrices = [
           fisher_factor.get_cov()
           for fisher_factor in self.layer_collection.get_factors()
       ]
-      cov_update_op_thunks = fisher_estimator.cov_update_thunks
       cov_update_op = control_flow_ops.case(
           [(math_ops.equal(global_step, i), thunk)
            for i, thunk in enumerate(cov_update_op_thunks)])
@@ -172,23 +203,64 @@ class EstimatorTest(test.TestCase):
         sess.run(cov_update_op)
         sess.run(increment_global_step)
 
+  def test_round_robin_placement(self):
+    """Check if the ops and variables are placed on devices correctly."""
+    with self._graph.as_default():
+      fisher_estimator = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          layer_collection=self.layer_collection,
+          damping=0.2,
+          cov_ema_decay=0.0,
+          cov_devices=["/cpu:{}".format(i) for i in range(2)],
+          inv_devices=["/cpu:{}".format(i) for i in range(2)])
+
+      # Construct an op that executes one covariance update per step.
+      (cov_update_thunks,
+       inv_update_thunks) = fisher_estimator.make_vars_and_create_op_thunks(
+           scope="test")
+      cov_update_ops = tuple(thunk() for thunk in cov_update_thunks)
+      inv_update_ops = tuple(thunk() for thunk in inv_update_thunks)
+      self.assertEqual(cov_update_ops[0].device, "/device:CPU:0")
+      self.assertEqual(cov_update_ops[1].device, "/device:CPU:1")
+      self.assertEqual(inv_update_ops[0].device, "/device:CPU:0")
+      self.assertEqual(inv_update_ops[1].device, "/device:CPU:1")
+      cov_matrices = [
+          fisher_factor.get_cov()
+          for fisher_factor in self.layer_collection.get_factors()
+      ]
+      inv_matrices = [
+          matrix
+          for fisher_factor in self.layer_collection.get_factors()
+          for matrix in fisher_factor._matpower_by_exp_and_damping.values()
+      ]
+      self.assertEqual(cov_matrices[0].device, "/device:CPU:0")
+      self.assertEqual(cov_matrices[1].device, "/device:CPU:1")
+      # Inverse matrices need to be explicitly placed.
+      self.assertEqual(inv_matrices[0].device, "")
+      self.assertEqual(inv_matrices[1].device, "")
+
   def test_inv_update_thunks(self):
     """Ensures inverse update ops run once per global_step."""
     with self._graph.as_default(), self.test_session() as sess:
-      fisher_estimator = estimator.FisherEstimator(
+      fisher_estimator = estimator.FisherEstimatorRoundRobin(
           variables=[self.weights],
           layer_collection=self.layer_collection,
-          cov_ema_decay=0.0,
-          damping=0.0)
+          damping=0.2,
+          cov_ema_decay=0.0)
 
       # Construct op that updates one inverse per global step.
       global_step = training_util.get_or_create_global_step()
+      (cov_variable_thunks, _, inv_variable_thunks,
+       inv_update_op_thunks) = fisher_estimator.create_ops_and_vars_thunks()
+      for thunk in cov_variable_thunks:
+        thunk()
+      for thunk in inv_variable_thunks:
+        thunk()
       inv_matrices = [
           matrix
           for fisher_factor in self.layer_collection.get_factors()
-          for matrix in fisher_factor._inverses_by_damping.values()
+          for matrix in fisher_factor._matpower_by_exp_and_damping.values()
       ]
-      inv_update_op_thunks = fisher_estimator.inv_update_thunks
       inv_update_op = control_flow_ops.case(
           [(math_ops.equal(global_step, i), thunk)
            for i, thunk in enumerate(inv_update_op_thunks)])
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
index fb4b3a241c1e9fd82e7bf630fd57295917048fbd..86ec7a095afdf4ecf7892a7e4e5d47dcdc239ed1 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
@@ -21,7 +21,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
+from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
+from tensorflow.contrib.kfac.python.ops import linear_operator as lo
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -34,6 +36,19 @@ from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import test
 
 
+# We need to set these constants since the numerical values used in the tests
+# were chosen when these used to be the defaults.
+ff.set_global_constants(init_covariances_at_zero=False,
+                        zero_debias=False,
+                        init_inverses_at_zero=False)
+
+# TODO(b/78538100): As far as I can tell, all the tests that say "Make sure our
+# inverse is something other than the identity" are actually broken. They never
+# run the covariance update ops and so the inverse actually is the identity
+# (possible plus the damping term, which would still make it a multiple of the
+# identity).
+
+
 def _make_psd(dim):
   """Constructs a PSD matrix of the given dimension."""
   mat = np.ones((dim, dim), dtype=np.float32)
@@ -46,8 +61,9 @@ class UtilsTest(test.TestCase):
   def testComputePiTracenorm(self):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
-      left_factor = array_ops.diag([1., 2., 0., 1.])
-      right_factor = array_ops.ones([2., 2.])
+      diag = ops.convert_to_tensor([1., 2., 0., 1.])
+      left_factor = lo.LinearOperatorDiag(diag)
+      right_factor = lo.LinearOperatorFullMatrix(array_ops.ones([2, 2]))
 
       # pi is the sqrt of the left trace norm divided by the right trace norm
       pi = fb.compute_pi_tracenorm(left_factor, right_factor)
@@ -63,7 +79,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -72,7 +88,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -81,7 +97,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors(grads, 0.5)
@@ -91,9 +107,12 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors((grads,), 0.5)
+      block._factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -109,9 +128,12 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = array_ops.constant([[1.], [2.]])
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = params**2
       block.instantiate_factors((grads,), 0.5)
+      block._factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -127,10 +149,13 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (array_ops.constant([2., 3.]), array_ops.constant(4.))
       damping = 0.5
       block.instantiate_factors((grads,), damping)
+      block._factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(state_ops.assign(block._factor._cov, _make_psd(3)))
@@ -154,7 +179,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -163,7 +188,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -172,7 +197,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors(grads, 0.5)
@@ -182,9 +207,10 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors((grads,), 0.5)
+      block._factor.instantiate_cov_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -200,9 +226,10 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = array_ops.constant([[1.], [2.]])
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = params**2
       block.instantiate_factors((grads,), 0.5)
+      block._factor.instantiate_cov_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -217,10 +244,11 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       damping = 0.5
       block.instantiate_factors((grads,), damping)
+      block._factor.instantiate_cov_variables()
 
       cov = array_ops.reshape(array_ops.constant([2., 3., 4.]), [-1, 1])
       sess.run(state_ops.assign(block._factor._cov, cov))
@@ -233,7 +261,6 @@ class NaiveDiagonalFBTest(test.TestCase):
 
       full = sess.run(block.full_fisher_block())
       explicit = np.dot(np.linalg.inv(full + damping * np.eye(3)), v_flat)
-
       self.assertAllClose(output_flat, explicit)
 
 
@@ -312,8 +339,8 @@ class FullyConnectedDiagonalFBTest(test.TestCase):
 
     self.assertAllClose(expected_result, result)
 
-  def testRegisterAdditionalMinibatch(self):
-    """Ensure 1 big minibatch and 2 small minibatches are equivalent."""
+  def testRegisterAdditionalTower(self):
+    """Ensure 1 big tower and 2 small towers are equivalent."""
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
@@ -364,9 +391,10 @@ class FullyConnectedDiagonalFBTest(test.TestCase):
       block = fb.FullyConnectedDiagonalFB(
           lc.LayerCollection(), has_bias=isinstance(params, (tuple, list)))
       for (i, o) in zip(inputs, outputs):
-        block.register_additional_minibatch(i, o)
+        block.register_additional_tower(i, o)
 
       block.instantiate_factors((output_grads,), damping=0.0)
+      block._factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       sess.run(block._factor.make_covariance_update_op(0.0))
@@ -389,12 +417,12 @@ class EmbeddingKFACFBTest(test.TestCase):
       # Add some examples.
       inputs = array_ops.constant([[0, 1], [1, 2], [2, 3]])
       outputs = array_ops.constant([[0.], [1.], [2.]])
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       # Instantiate factor's variables. Ensure it doesn't fail.
       grads = outputs**2.
       damping = array_ops.constant(0.)
-      block.instantiate_factors(([grads],), damping)
+      block.instantiate_factors(((grads,),), damping)
 
   def testMultiplyInverse(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -407,12 +435,17 @@ class EmbeddingKFACFBTest(test.TestCase):
       # Add some examples.
       inputs = array_ops.constant([[0, 1], [1, 2], [2, 3]])
       outputs = array_ops.constant([[0.], [1.], [2.]])
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       # Instantiate factor's variables. Ensure it doesn't fail.
       grads = outputs**2.
       damping = array_ops.constant(0.)
-      block.instantiate_factors(([grads],), damping)
+      block.instantiate_factors(((grads,),), damping)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       # Create a sparse update.
       indices = array_ops.constant([1, 3, 4])
@@ -443,7 +476,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([1., 2.])
       outputs = array_ops.constant([3., 4.])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection())
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       self.assertAllEqual([outputs], block.tensors_to_compute_grads())
 
@@ -453,10 +486,10 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2.], [3., 4.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=True)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
 
   def testInstantiateFactorsNoBias(self):
     with ops.Graph().as_default():
@@ -464,10 +497,10 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2.], [3., 4.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
 
   def testMultiplyInverseTuple(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -475,9 +508,15 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
+
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -501,9 +540,14 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2.], [3., 4.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -524,13 +568,20 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       outputs = array_ops.zeros([32, output_dim])
       params = array_ops.zeros([input_dim, output_dim])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       damping = 0.  # This test is only valid without damping.
-      block.instantiate_factors(([grads],), damping)
+      block.instantiate_factors(((grads,),), damping)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
 
       sess.run(state_ops.assign(block._input_factor._cov, _make_psd(3)))
       sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2)))
+
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
+
       sess.run(block._input_factor.make_inverse_update_ops())
       sess.run(block._output_factor.make_inverse_update_ops())
 
@@ -653,8 +704,8 @@ class ConvDiagonalFBTest(test.TestCase):
 
     self.assertAllClose(expected_result, result, atol=1e-3)
 
-  def testRegisterAdditionalMinibatch(self):
-    """Ensure 1 big minibatch and 2 small minibatches are equivalent."""
+  def testRegisterAdditionalTower(self):
+    """Ensure 1 big tower and 2 small towers are equivalent."""
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
@@ -715,9 +766,10 @@ class ConvDiagonalFBTest(test.TestCase):
       block = fb.ConvDiagonalFB(
           lc.LayerCollection(), params, strides=[1, 1, 1, 1], padding='SAME')
       for (i, o) in zip(inputs, outputs):
-        block.register_additional_minibatch(i, o)
+        block.register_additional_tower(i, o)
 
       block.instantiate_factors((output_grads,), damping=0.0)
+      block._factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       sess.run(block._factor.make_covariance_update_op(0.0))
@@ -727,6 +779,54 @@ class ConvDiagonalFBTest(test.TestCase):
     return multiply_result, multiply_inverse_result
 
 
+class DepthwiseConvKFCBasicFBTest(test.TestCase):
+
+  def testInstantiateFactors(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      params = random_ops.random_normal((3, 3, 8, 2))
+      inputs = random_ops.random_normal((32, 5, 5, 8))
+      outputs = random_ops.random_normal((32, 5, 5, 16))
+      layer_collection = lc.LayerCollection()
+      block = fb.DepthwiseConvKFCBasicFB(
+          layer_collection, params=params, strides=[1, 1, 1, 1], padding='SAME')
+      block.register_additional_tower(inputs, outputs)
+      grads = outputs**2
+      block.instantiate_factors(([grads],), 0.5)
+
+  def testMultiplyInverse(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      params = random_ops.random_normal((3, 3, 8, 2))
+      inputs = random_ops.random_normal((32, 5, 5, 8))
+      outputs = random_ops.random_normal((32, 5, 5, 16))
+      layer_collection = lc.LayerCollection()
+      block = fb.DepthwiseConvKFCBasicFB(
+          layer_collection, params=params, strides=[1, 1, 1, 1], padding='SAME')
+      block.register_additional_tower(inputs, outputs)
+      grads = outputs**2
+      block.instantiate_factors(([grads],), 0.5)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
+
+      # Ensure inverse update op doesn't crash.
+      sess.run(tf_variables.global_variables_initializer())
+      sess.run([
+          factor.make_inverse_update_ops()
+          for factor in layer_collection.get_factors()
+      ])
+
+      # Ensure inverse-vector multiply doesn't crash.
+      output = block.multiply_inverse(params)
+      sess.run(output)
+
+      # Ensure same shape.
+      self.assertAllEqual(output.shape, params.shape)
+
+
 class ConvKFCBasicFBTest(test.TestCase):
 
   def _testConvKFCBasicFBInitParams(self, params):
@@ -738,16 +838,17 @@ class ConvKFCBasicFBTest(test.TestCase):
         params = array_ops.constant(params)
       inputs = random_ops.random_normal((2, 2, 2))
       outputs = random_ops.random_normal((2, 2, 2))
-      block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, [1, 1, 1], 'SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block = fb.ConvKFCBasicFB(
+          lc.LayerCollection(), params=params, padding='SAME')
+      block.register_additional_tower(inputs, outputs)
 
       self.assertAllEqual([outputs], block.tensors_to_compute_grads())
 
   def testConvKFCBasicFBInitParamsParamsTuple(self):
-    self._testConvKFCBasicFBInitParams([np.array([1., 2.]), np.array(3.)])
+    self._testConvKFCBasicFBInitParams([np.ones([1, 2, 2]), np.ones([2])])
 
   def testConvKFCBasicFBInitParamsParamsSingle(self):
-    self._testConvKFCBasicFBInitParams([np.array([1., 2.])])
+    self._testConvKFCBasicFBInitParams([np.ones([1, 2, 2])])
 
   def testMultiplyInverseTuple(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -755,11 +856,16 @@ class ConvKFCBasicFBTest(test.TestCase):
       params = random_ops.random_normal((2, 2, 2, 2))
       inputs = random_ops.random_normal((2, 2, 2, 2))
       outputs = random_ops.random_normal((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1),
-                                'SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block = fb.ConvKFCBasicFB(
+          lc.LayerCollection(), params=params, padding='SAME')
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -781,12 +887,17 @@ class ConvKFCBasicFBTest(test.TestCase):
       params = random_ops.random_normal((2, 2, 2, 2))
       inputs = random_ops.random_normal((2, 2, 2, 2))
       outputs = random_ops.random_normal((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1),
-                                'SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block = fb.ConvKFCBasicFB(
+          lc.LayerCollection(), params=params, padding='SAME')
+      block.register_additional_tower(inputs, outputs)
       self.assertFalse(block._has_bias)
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -804,12 +915,17 @@ class ConvKFCBasicFBTest(test.TestCase):
       params = [random_ops.random_normal((2, 2, 2, 2))]
       inputs = random_ops.random_normal((2, 2, 2, 2))
       outputs = random_ops.random_normal((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1),
-                                'SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block = fb.ConvKFCBasicFB(
+          lc.LayerCollection(), params=params, padding='SAME')
+      block.register_additional_tower(inputs, outputs)
       self.assertTrue(block._has_bias)
       grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
+      block.instantiate_factors(((grads,),), 0.5)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       # Make sure our inverse is something other than the identity.
       sess.run(tf_variables.global_variables_initializer())
@@ -827,12 +943,17 @@ class ConvKFCBasicFBTest(test.TestCase):
       params = array_ops.zeros((2, 2, 2, 2))
       inputs = array_ops.zeros((2, 2, 2, 2))
       outputs = array_ops.zeros((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(lc.LayerCollection(), params, (1, 1, 1, 1),
-                                'SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block = fb.ConvKFCBasicFB(
+          lc.LayerCollection(), params=params, padding='SAME')
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       damping = 0.  # This test is only valid without damping.
-      block.instantiate_factors(([grads],), damping)
+      block.instantiate_factors(((grads,),), damping)
+      block._input_factor.instantiate_cov_variables()
+      block._output_factor.instantiate_cov_variables()
+      block.register_inverse()
+      block._input_factor.instantiate_inv_variables()
+      block._output_factor.instantiate_inv_variables()
 
       sess.run(state_ops.assign(block._input_factor._cov, _make_psd(8)))
       sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2)))
@@ -857,9 +978,9 @@ class FullyConnectedSeriesFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       inputs = array_ops.constant([1., 2.])
       outputs = array_ops.constant([3., 4.])
-      block = fb.FullyConnectedSeriesFB(
-          lc.LayerCollection(), inputs=[inputs], outputs=[outputs])
-      self.assertAllEqual([outputs], block.tensors_to_compute_grads())
+      block = fb.FullyConnectedSeriesFB(lc.LayerCollection())
+      block.register_additional_tower([inputs], [outputs])
+      self.assertAllEqual([[outputs]], block.tensors_to_compute_grads())
 
   def testInstantiateFactorsHasBias(self):
     with ops.Graph().as_default():
@@ -868,11 +989,10 @@ class FullyConnectedSeriesFBTest(test.TestCase):
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedSeriesFB(
           lc.LayerCollection(),
-          inputs=[inputs],
-          outputs=[outputs],
           has_bias=True)
+      block.register_additional_tower([inputs], [outputs])
       grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
+      block.instantiate_factors((((grads,),),), 0.5)
 
   def testInstantiateFactorsNoBias(self):
     with ops.Graph().as_default():
@@ -881,11 +1001,10 @@ class FullyConnectedSeriesFBTest(test.TestCase):
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedSeriesFB(
           lc.LayerCollection(),
-          inputs=[inputs],
-          outputs=[outputs],
           has_bias=False)
+      block.register_additional_tower([inputs], [outputs])
       grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
+      block.instantiate_factors((((grads,),),), 0.5)
 
 
 def as_tensors(tensor_or_tuple):
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
index 66e18974abfadaad5d7a20b40d0b1352bfda67ee..fad47cd02f372e0b180645b5636965514bafe6b0 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 import numpy.random as npr
 
+from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
 from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -29,36 +30,20 @@ from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import test
 
 
-class MaybeColocateTest(test.TestCase):
+# We need to set these constants since the numerical values used in the tests
+# were chosen when these used to be the defaults.
+ff.set_global_constants(init_covariances_at_zero=False,
+                        zero_debias=False,
+                        init_inverses_at_zero=False)
 
-  def setUp(self):
-    self._colocate_cov_ops_with_inputs = ff.COLOCATE_COV_OPS_WITH_INPUTS
-
-  def tearDown(self):
-    ff.set_global_constants(
-        colocate_cov_ops_with_inputs=self._colocate_cov_ops_with_inputs)
 
-  def testFalse(self):
-    ff.set_global_constants(colocate_cov_ops_with_inputs=False)
-    with tf_ops.Graph().as_default():
-      a = constant_op.constant([2.0], name='a')
-      with ff.maybe_colocate_with(a):
-        b = constant_op.constant(3.0, name='b')
-      self.assertEqual([b'loc:@a'], a.op.colocation_groups())
-      self.assertEqual([b'loc:@b'], b.op.colocation_groups())
-
-  def testTrue(self):
-    ff.set_global_constants(colocate_cov_ops_with_inputs=True)
-    with tf_ops.Graph().as_default():
-      a = constant_op.constant([2.0], name='a')
-      with ff.maybe_colocate_with(a):
-        b = constant_op.constant(3.0, name='b')
-      self.assertEqual([b'loc:@a'], a.op.colocation_groups())
-      self.assertEqual([b'loc:@a'], b.op.colocation_groups())
+def make_damping_func(damping):
+  return fb._package_func(lambda: damping, damping)
 
 
 class FisherFactorTestingDummy(ff.FisherFactor):
@@ -92,26 +77,44 @@ class FisherFactorTestingDummy(ff.FisherFactor):
   def get_cov(self):
     return NotImplementedError
 
-  def left_multiply(self, x, damping):
+  def instantiate_inv_variables(self):
     return NotImplementedError
 
-  def right_multiply(self, x, damping):
-    return NotImplementedError
+  def _num_towers(self):
+    raise NotImplementedError
 
-  def left_multiply_inverse(self, x, damping):
-    return NotImplementedError
+  def _get_data_device(self):
+    raise NotImplementedError
 
-  def right_multiply_inverse(self, x, damping):
-    return NotImplementedError
+  def register_matpower(self, exp, damping_func):
+    raise NotImplementedError
+
+  def register_cholesky(self, damping_func):
+    raise NotImplementedError
+
+  def register_cholesky_inverse(self, damping_func):
+    raise NotImplementedError
+
+  def get_matpower(self, exp, damping_func):
+    raise NotImplementedError
+
+  def get_cholesky(self, damping_func):
+    raise NotImplementedError
+
+  def get_cholesky_inverse(self, damping_func):
+    raise NotImplementedError
+
+  def get_cov_as_linear_operator(self):
+    raise NotImplementedError
 
 
-class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
-  """Dummy class to test the non-abstract methods on ff.InverseProvidingFactor.
+class DenseSquareMatrixFactorTestingDummy(ff.DenseSquareMatrixFactor):
+  """Dummy class to test the non-abstract methods on ff.DenseSquareMatrixFactor.
   """
 
   def __init__(self, shape):
     self._shape = shape
-    super(InverseProvidingFactorTestingDummy, self).__init__()
+    super(DenseSquareMatrixFactorTestingDummy, self).__init__()
 
   @property
   def _var_scope(self):
@@ -135,6 +138,12 @@ class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   def instantiate_covariance(self):
     pass
 
+  def _num_towers(self):
+    raise NotImplementedError
+
+  def _get_data_device(self):
+    raise NotImplementedError
+
 
 class NumericalUtilsTest(test.TestCase):
 
@@ -237,50 +246,64 @@ class FisherFactorTest(test.TestCase):
       self.assertEqual(0, len(factor.make_inverse_update_ops()))
 
 
-class InverseProvidingFactorTest(test.TestCase):
+class DenseSquareMatrixFactorTest(test.TestCase):
 
   def testRegisterDampedInverse(self):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       shape = [2, 2]
-      factor = InverseProvidingFactorTestingDummy(shape)
+      factor = DenseSquareMatrixFactorTestingDummy(shape)
       factor_var_scope = 'dummy/a_b_c'
 
-      dampings = 0.1, 1e-1, 0.00001, 1e-5
+      damping_funcs = [make_damping_func(0.1),
+                       make_damping_func(0.1),
+                       make_damping_func(1e-5),
+                       make_damping_func(1e-5)]
+      for damping_func in damping_funcs:
+        factor.register_inverse(damping_func)
 
-      for damping in dampings:
-        factor.register_damped_inverse(damping)
+      factor.instantiate_inv_variables()
 
-      self.assertEqual(set(dampings), set(factor._inverses_by_damping.keys()))
-      inv = factor._inverses_by_damping[dampings[0]]
-      self.assertEqual(inv, factor._inverses_by_damping[dampings[1]])
-      self.assertNotEqual(inv, factor._inverses_by_damping[dampings[2]])
-      self.assertEqual(factor._inverses_by_damping[dampings[2]],
-                       factor._inverses_by_damping[dampings[3]])
+      inv = factor.get_inverse(damping_funcs[0]).to_dense()
+      self.assertEqual(inv, factor.get_inverse(damping_funcs[1]).to_dense())
+      self.assertNotEqual(inv, factor.get_inverse(damping_funcs[2]).to_dense())
+      self.assertEqual(factor.get_inverse(damping_funcs[2]).to_dense(),
+                       factor.get_inverse(damping_funcs[3]).to_dense())
       factor_vars = tf_ops.get_collection(tf_ops.GraphKeys.GLOBAL_VARIABLES,
                                           factor_var_scope)
-      self.assertListEqual([inv, factor._inverses_by_damping[dampings[2]]],
-                           factor_vars)
+      factor_tensors = (tf_ops.convert_to_tensor(var) for var in factor_vars)
+
+      self.assertEqual(set([inv,
+                            factor.get_inverse(damping_funcs[2]).to_dense()]),
+                       set(factor_tensors))
       self.assertEqual(shape, inv.get_shape())
 
   def testRegisterMatpower(self):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       shape = [3, 3]
-      factor = InverseProvidingFactorTestingDummy(shape)
+      factor = DenseSquareMatrixFactorTestingDummy(shape)
       factor_var_scope = 'dummy/a_b_c'
 
-      factor.register_matpower(1, 0.5)
-      factor.register_matpower(2, 0.5)
+      # TODO(b/74201126): Change to using the same func for both once
+      # Topohash is in place.
+      damping_func_1 = make_damping_func(0.5)
+      damping_func_2 = make_damping_func(0.5)
+
+      factor.register_matpower(-0.5, damping_func_1)
+      factor.register_matpower(2, damping_func_2)
+
+      factor.instantiate_inv_variables()
 
-      self.assertEqual(
-          set([(1, 0.5), (2, 0.5)]),
-          set(factor._matpower_by_exp_and_damping.keys()))
       factor_vars = tf_ops.get_collection(tf_ops.GraphKeys.GLOBAL_VARIABLES,
                                           factor_var_scope)
-      matpower1 = factor.get_matpower(1, 0.5)
-      matpower2 = factor.get_matpower(2, 0.5)
-      self.assertListEqual([matpower1, matpower2], factor_vars)
+
+      factor_tensors = (tf_ops.convert_to_tensor(var) for var in factor_vars)
+
+      matpower1 = factor.get_matpower(-0.5, damping_func_1).to_dense()
+      matpower2 = factor.get_matpower(2, damping_func_2).to_dense()
+
+      self.assertEqual(set([matpower1, matpower2]), set(factor_tensors))
 
       self.assertEqual(shape, matpower1.get_shape())
       self.assertEqual(shape, matpower2.get_shape())
@@ -296,20 +319,28 @@ class InverseProvidingFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       cov = np.array([[1., 2.], [3., 4.]])
-      factor = InverseProvidingFactorTestingDummy(cov.shape)
+      factor = DenseSquareMatrixFactorTestingDummy(cov.shape)
       factor._cov = array_ops.constant(cov, dtype=dtypes.float32)
 
+      damping_funcs = []
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
-        factor.register_damped_inverse(1. / i)
+        damping_funcs.append(make_damping_func(1./i))
+
+      for i in range(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD):
+        factor.register_inverse(damping_funcs[i])
+
+      factor.instantiate_inv_variables()
       ops = factor.make_inverse_update_ops()
       self.assertEqual(1, len(ops))
 
       sess.run(tf_variables.global_variables_initializer())
       new_invs = []
       sess.run(ops)
-      for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
+      for i in range(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD):
         # The inverse op will assign the damped inverse of cov to the inv var.
-        new_invs.append(sess.run(factor._inverses_by_damping[1. / i]))
+        new_invs.append(
+            sess.run(factor.get_inverse(damping_funcs[i]).to_dense()))
+
       # We want to see that the new invs are all different from each other.
       for i in range(len(new_invs)):
         for j in range(i + 1, len(new_invs)):
@@ -320,18 +351,20 @@ class InverseProvidingFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       cov = np.array([[6., 2.], [2., 4.]])
-      factor = InverseProvidingFactorTestingDummy(cov.shape)
+      factor = DenseSquareMatrixFactorTestingDummy(cov.shape)
       factor._cov = array_ops.constant(cov, dtype=dtypes.float32)
       exp = 2  # NOTE(mattjj): must be int to test with np.linalg.matrix_power
       damping = 0.5
+      damping_func = make_damping_func(damping)
 
-      factor.register_matpower(exp, damping)
+      factor.register_matpower(exp, damping_func)
+      factor.instantiate_inv_variables()
       ops = factor.make_inverse_update_ops()
       self.assertEqual(1, len(ops))
 
       sess.run(tf_variables.global_variables_initializer())
       sess.run(ops[0])
-      matpower = sess.run(factor._matpower_by_exp_and_damping[(exp, damping)])
+      matpower = sess.run(factor.get_matpower(exp, damping_func).to_dense())
       matpower_np = np.linalg.matrix_power(cov + np.eye(2) * damping, exp)
       self.assertAllClose(matpower, matpower_np)
 
@@ -339,21 +372,24 @@ class InverseProvidingFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       cov = np.array([[5., 2.], [2., 4.]])  # NOTE(mattjj): must be symmetric
-      factor = InverseProvidingFactorTestingDummy(cov.shape)
+      factor = DenseSquareMatrixFactorTestingDummy(cov.shape)
       factor._cov = array_ops.constant(cov, dtype=dtypes.float32)
 
-      factor.register_damped_inverse(0)
+      damping_func = make_damping_func(0)
+
+      factor.register_inverse(damping_func)
+      factor.instantiate_inv_variables()
       ops = factor.make_inverse_update_ops()
       self.assertEqual(1, len(ops))
 
       sess.run(tf_variables.global_variables_initializer())
       # The inverse op will assign the damped inverse of cov to the inv var.
-      old_inv = sess.run(factor._inverses_by_damping[0])
+      old_inv = sess.run(factor.get_inverse(damping_func).to_dense())
       self.assertAllClose(
           sess.run(ff.inverse_initializer(cov.shape, dtypes.float32)), old_inv)
 
       sess.run(ops)
-      new_inv = sess.run(factor._inverses_by_damping[0])
+      new_inv = sess.run(factor.get_inverse(damping_func).to_dense())
       self.assertAllClose(new_inv, np.linalg.inv(cov))
 
 
@@ -364,6 +400,7 @@ class FullFactorTest(test.TestCase):
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), name='a/b/c')
       factor = ff.FullFactor((tensor,), 32)
+      factor.instantiate_cov_variables()
       self.assertEqual([6, 6], factor.get_cov().get_shape().as_list())
 
   def testFullFactorInitFloat64(self):
@@ -372,6 +409,7 @@ class FullFactorTest(test.TestCase):
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
       factor = ff.FullFactor((tensor,), 32)
+      factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
       self.assertEqual([6, 6], cov.get_shape().as_list())
@@ -381,6 +419,7 @@ class FullFactorTest(test.TestCase):
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([1., 2.], name='a/b/c')
       factor = ff.FullFactor((tensor,), 2)
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
@@ -394,7 +433,8 @@ class NaiveDiagonalFactorTest(test.TestCase):
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), name='a/b/c')
       factor = ff.NaiveDiagonalFactor((tensor,), 32)
-      self.assertEqual([6, 1], factor.get_cov_var().get_shape().as_list())
+      factor.instantiate_cov_variables()
+      self.assertEqual([6, 1], factor.get_cov().get_shape().as_list())
 
   def testNaiveDiagonalFactorInitFloat64(self):
     with tf_ops.Graph().as_default():
@@ -402,7 +442,8 @@ class NaiveDiagonalFactorTest(test.TestCase):
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
       factor = ff.NaiveDiagonalFactor((tensor,), 32)
-      cov = factor.get_cov_var()
+      factor.instantiate_cov_variables()
+      cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
       self.assertEqual([6, 1], cov.get_shape().as_list())
 
@@ -411,6 +452,7 @@ class NaiveDiagonalFactorTest(test.TestCase):
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([1., 2.], name='a/b/c')
       factor = ff.NaiveDiagonalFactor((tensor,), 2)
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
@@ -424,7 +466,8 @@ class EmbeddingInputKroneckerFactorTest(test.TestCase):
       input_ids = array_ops.constant([[0], [1], [4]])
       vocab_size = 5
       factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size)
-      cov = factor.get_cov_var()
+      factor.instantiate_cov_variables()
+      cov = factor.get_cov()
       self.assertEqual(cov.shape.as_list(), [vocab_size])
 
   def testCovarianceUpdateOp(self):
@@ -432,6 +475,7 @@ class EmbeddingInputKroneckerFactorTest(test.TestCase):
       input_ids = array_ops.constant([[0], [1], [4]])
       vocab_size = 5
       factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size)
+      factor.instantiate_cov_variables()
       cov_update_op = factor.make_covariance_update_op(0.0)
 
       with self.test_session() as sess:
@@ -440,6 +484,118 @@ class EmbeddingInputKroneckerFactorTest(test.TestCase):
         self.assertAllClose(np.array([1., 1., 0., 0., 1.]) / 3., new_cov)
 
 
+class ConvDiagonalFactorTest(test.TestCase):
+
+  def setUp(self):
+    self.batch_size = 10
+    self.height = self.width = 32
+    self.in_channels = 3
+    self.out_channels = 1
+    self.kernel_height = self.kernel_width = 3
+    self.strides = [1, 2, 2, 1]
+    self.data_format = 'NHWC'
+    self.padding = 'SAME'
+    self.kernel_shape = [
+        self.kernel_height, self.kernel_width, self.in_channels,
+        self.out_channels
+    ]
+
+  def testInit(self):
+    with tf_ops.Graph().as_default():
+      inputs = random_ops.random_uniform(
+          [self.batch_size, self.height, self.width, self.in_channels])
+      outputs_grads = [
+          random_ops.random_uniform([
+              self.batch_size, self.height // self.strides[1],
+              self.width // self.strides[2], self.out_channels
+          ]) for _ in range(3)
+      ]
+
+      factor = ff.ConvDiagonalFactor(
+          (inputs,),
+          (outputs_grads,),
+          self.kernel_shape,
+          self.strides,
+          self.padding,
+          data_format=self.data_format)
+      factor.instantiate_cov_variables()
+
+      # Ensure covariance matrix's shape makes sense.
+      self.assertEqual([
+          self.kernel_height * self.kernel_width * self.in_channels,
+          self.out_channels
+      ],
+                       factor.get_cov().shape.as_list())
+
+  def testMakeCovarianceUpdateOp(self):
+    with tf_ops.Graph().as_default():
+      # Construct all arguments such that convolution kernel is applied in
+      # exactly one spatial location.
+      inputs = np.random.randn(
+          1,  # batch_size
+          self.kernel_height,
+          self.kernel_width,
+          self.in_channels)  # in_channels
+      outputs_grad = np.random.randn(
+          1,  # batch_size
+          1,  # output_height
+          1,  # output_width
+          self.out_channels)
+
+      factor = ff.ConvDiagonalFactor(
+          (constant_op.constant(inputs),),
+          ((constant_op.constant(outputs_grad),),),
+          self.kernel_shape,
+          strides=[1, 1, 1, 1],
+          padding='VALID')
+      factor.instantiate_cov_variables()
+
+      # Completely forget initial value on first update.
+      cov_update_op = factor.make_covariance_update_op(0.0)
+
+      # Ensure new covariance value is same as outer-product of inputs/outputs
+      # vectorized, squared.
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        cov = sess.run(cov_update_op)
+        expected_cov = np.outer(inputs.flatten(), outputs_grad.flatten())**2
+        self.assertAllClose(expected_cov, cov)
+
+  def testHasBias(self):
+    with tf_ops.Graph().as_default():
+      inputs = random_ops.random_uniform(
+          [self.batch_size, self.height, self.width, self.in_channels])
+      outputs_grads = [
+          random_ops.random_uniform([
+              self.batch_size, self.height // self.strides[1],
+              self.width // self.strides[2], self.out_channels
+          ]) for _ in range(3)
+      ]
+
+      factor = ff.ConvDiagonalFactor(
+          (inputs,),
+          (outputs_grads,),
+          self.kernel_shape,
+          self.strides,
+          self.padding,
+          data_format=self.data_format,
+          has_bias=True)
+      factor.instantiate_cov_variables()
+
+      # Ensure shape accounts for bias.
+      self.assertEqual([
+          self.kernel_height * self.kernel_width * self.in_channels + 1,
+          self.out_channels
+      ],
+                       factor.get_cov().shape.as_list())
+
+      # Ensure update op doesn't crash.
+      cov_update_op = factor.make_covariance_update_op(0.0)
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        sess.run(cov_update_op)
+
+
 class FullyConnectedKroneckerFactorTest(test.TestCase):
 
   def _testFullyConnectedKroneckerFactorInit(self,
@@ -449,7 +605,8 @@ class FullyConnectedKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=has_bias)
+      factor = ff.FullyConnectedKroneckerFactor(((tensor,),), has_bias=has_bias)
+      factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
       self.assertEqual(final_shape, cov.get_shape().as_list())
@@ -466,7 +623,8 @@ class FullyConnectedKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=True)
+      factor = ff.FullyConnectedKroneckerFactor(((tensor,),), has_bias=True)
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
@@ -476,40 +634,171 @@ class FullyConnectedKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor((tensor,))
+      factor = ff.FullyConnectedKroneckerFactor(((tensor,),))
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
       self.assertAllClose([[3, 3.5], [3.5, 5.5]], new_cov)
 
 
-class ConvInputKroneckerFactorTest(test.TestCase):
+class ConvFactorTestCase(test.TestCase):
+
+  def assertMatrixRank(self, rank, matrix, atol=1e-5):
+    assert rank <= matrix.shape[0], 'Rank cannot be larger than matrix size.'
+    eigvals = np.linalg.eigvals(matrix)
+    nnz_eigvals = np.sum(eigvals > atol)
+    self.assertEqual(
+        rank,
+        nnz_eigvals,
+        msg=('Found %d of %d expected non-zero eigenvalues: %s.' %
+             (nnz_eigvals, rank, eigvals)))
+
+
+class ConvInputKroneckerFactorTest(ConvFactorTestCase):
+
+  def test3DConvolution(self):
+    with tf_ops.Graph().as_default():
+      batch_size = 1
+      width = 3
+      in_channels = 3**3
+      out_channels = 4
+
+      factor = ff.ConvInputKroneckerFactor(
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, width, in_channels), seed=0),),
+          filter_shape=(width, width, width, in_channels, out_channels),
+          padding='SAME',
+          strides=(2, 2, 2),
+          extract_patches_fn='extract_convolution_patches',
+          has_bias=False)
+      factor.instantiate_cov_variables()
+
+      # Ensure shape of covariance matches input size of filter.
+      input_size = in_channels * (width**3)
+      self.assertEqual([input_size, input_size],
+                       factor.get_cov().shape.as_list())
+
+      # Ensure cov_update_op doesn't crash.
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        sess.run(factor.make_covariance_update_op(0.0))
+        cov = sess.run(factor.get_cov())
+
+      # Cov should be rank-8, as the filter will be applied at each corner of
+      # the 4-D cube.
+      self.assertMatrixRank(8, cov)
+
+  def testPointwiseConv2d(self):
+    with tf_ops.Graph().as_default():
+      batch_size = 1
+      width = 3
+      in_channels = 3**2
+      out_channels = 4
+
+      factor = ff.ConvInputKroneckerFactor(
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, in_channels), seed=0),),
+          filter_shape=(1, 1, in_channels, out_channels),
+          padding='SAME',
+          strides=(1, 1, 1, 1),
+          extract_patches_fn='extract_pointwise_conv2d_patches',
+          has_bias=False)
+      factor.instantiate_cov_variables()
+
+      # Ensure shape of covariance matches input size of filter.
+      self.assertEqual([in_channels, in_channels],
+                       factor.get_cov().shape.as_list())
+
+      # Ensure cov_update_op doesn't crash.
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        sess.run(factor.make_covariance_update_op(0.0))
+        cov = sess.run(factor.get_cov())
+
+      # Cov should be rank-9, as the filter will be applied at each location.
+      self.assertMatrixRank(9, cov)
+
+  def testStrides(self):
+    with tf_ops.Graph().as_default():
+      batch_size = 1
+      width = 3
+      in_channels = 3**2
+      out_channels = 4
+
+      factor = ff.ConvInputKroneckerFactor(
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, in_channels), seed=0),),
+          filter_shape=(1, 1, in_channels, out_channels),
+          padding='SAME',
+          strides=(1, 2, 1, 1),
+          extract_patches_fn='extract_image_patches',
+          has_bias=False)
+      factor.instantiate_cov_variables()
+
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        sess.run(factor.make_covariance_update_op(0.0))
+        cov = sess.run(factor.get_cov())
+
+      # Cov should be the sum of 3 * 2 = 6 outer products.
+      self.assertMatrixRank(6, cov)
+
+  def testDilationRate(self):
+    with tf_ops.Graph().as_default():
+      batch_size = 1
+      width = 3
+      in_channels = 2
+      out_channels = 4
+
+      factor = ff.ConvInputKroneckerFactor(
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, in_channels), seed=0),),
+          filter_shape=(3, 3, in_channels, out_channels),
+          padding='SAME',
+          extract_patches_fn='extract_image_patches',
+          strides=(1, 1, 1, 1),
+          dilation_rate=(1, width, width, 1),
+          has_bias=False)
+      factor.instantiate_cov_variables()
+
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        sess.run(factor.make_covariance_update_op(0.0))
+        cov = sess.run(factor.get_cov())
+
+      # Cov should be rank = in_channels, as only the center of the filter
+      # receives non-zero input for each input channel.
+      self.assertMatrixRank(in_channels, cov)
 
   def testConvInputKroneckerFactorInitNoBias(self):
     with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c')
       factor = ff.ConvInputKroneckerFactor(
-          tensor, (1, 2, 3, 4), 3, 2, has_bias=False)
+          inputs=(tensor,),
+          filter_shape=(1, 2, 3, 4),
+          padding='SAME',
+          has_bias=False)
+      factor.instantiate_cov_variables()
       self.assertEqual([1 * 2 * 3, 1 * 2 * 3],
                        factor.get_cov().get_shape().as_list())
 
   def testConvInputKroneckerFactorInit(self):
     with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c')
       factor = ff.ConvInputKroneckerFactor(
-          tensor, (1, 2, 3, 4), 3, 2, has_bias=True)
+          (tensor,), filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
+      factor.instantiate_cov_variables()
       self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
                        factor.get_cov().get_shape().as_list())
 
   def testConvInputKroneckerFactorInitFloat64(self):
     with tf_ops.Graph().as_default():
       dtype = dtypes.float64_ref
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c', dtype=dtypes.float64)
       factor = ff.ConvInputKroneckerFactor(
-          tensor, (1, 2, 3, 4), 3, 2, has_bias=True)
+          (tensor,), filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
+      factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
       self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
@@ -517,37 +806,82 @@ class ConvInputKroneckerFactorTest(test.TestCase):
 
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
+      input_shape = (2, 1, 1, 1)
       tensor = array_ops.constant(
-          np.arange(1., 17.).reshape(2, 2, 2, 2), dtype=dtypes.float32)
+          np.arange(1, 1 + np.prod(input_shape)).reshape(input_shape).astype(
+              np.float32))
       factor = ff.ConvInputKroneckerFactor(
-          tensor, (1, 2, 1, 1), [1, 1, 1, 1], 'SAME', has_bias=True)
+          (tensor,), filter_shape=(1, 1, 1, 1), padding='SAME', has_bias=True)
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[34.375, 37, 3.125], [37, 41, 3.5], [3.125, 3.5, 1]],
-                          new_cov)
+      new_cov = sess.run(factor.make_covariance_update_op(0.))
+      self.assertAllClose(
+          [
+              [(1. + 4.) / 2., (1. + 2.) / 2.],  #
+              [(1. + 2.) / 2., (1. + 1.) / 2.]
+          ],  #
+          new_cov)
 
   def testMakeCovarianceUpdateOpNoBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
+      input_shape = (2, 1, 1, 1)
       tensor = array_ops.constant(
-          np.arange(1., 17.).reshape(2, 2, 2, 2), dtype=dtypes.float32)
-      factor = ff.ConvInputKroneckerFactor(tensor, (1, 2, 1, 1), [1, 1, 1, 1],
-                                           'SAME')
+          np.arange(1, 1 + np.prod(input_shape)).reshape(input_shape).astype(
+              np.float32))
+      factor = ff.ConvInputKroneckerFactor(
+          (tensor,), filter_shape=(1, 1, 1, 1), padding='SAME')
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[34.375, 37], [37, 41]], new_cov)
+      new_cov = sess.run(factor.make_covariance_update_op(0.))
+      self.assertAllClose([[(1. + 4.) / 2.]], new_cov)
+
+  def testSubSample(self):
+    with tf_ops.Graph().as_default():
+      patches_1 = array_ops.constant(1, shape=(10, 2))
+      patches_2 = array_ops.constant(1, shape=(10, 8))
+      patches_3 = array_ops.constant(1, shape=(3, 3))
+      patches_1_sub = ff._subsample_for_cov_computation(patches_1)
+      patches_2_sub = ff._subsample_for_cov_computation(patches_2)
+      patches_3_sub = ff._subsample_for_cov_computation(patches_3)
+      patches_1_sub_batch_size = patches_1_sub.shape.as_list()[0]
+      patches_2_sub_batch_size = patches_2_sub.shape.as_list()[0]
+      patches_3_sub_batch_size = patches_3_sub.shape.as_list()[0]
+      self.assertEqual(2, patches_1_sub_batch_size)
+      self.assertEqual(8, patches_2_sub_batch_size)
+      self.assertEqual(3, patches_3_sub_batch_size)
+
+
+class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
+
+  def test3DConvolution(self):
+    with tf_ops.Graph().as_default():
+      batch_size = 1
+      width = 3
+      out_channels = width**3
 
+      factor = ff.ConvOutputKroneckerFactor(outputs_grads=([
+          random_ops.random_uniform(
+              (batch_size, width, width, width, out_channels), seed=0)
+      ],))
+      factor.instantiate_cov_variables()
 
-class ConvOutputKroneckerFactorTest(test.TestCase):
+      with self.test_session() as sess:
+        sess.run(tf_variables.global_variables_initializer())
+        sess.run(factor.make_covariance_update_op(0.0))
+        cov = sess.run(factor.get_cov())
+
+      # Cov should be rank 3^3, as each spatial position donates a rank-1
+      # update.
+      self.assertMatrixRank(width**3, cov)
 
   def testConvOutputKroneckerFactorInit(self):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3, 4, 5), name='a/b/c')
-      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      factor = ff.ConvOutputKroneckerFactor(((tensor,),))
+      factor.instantiate_cov_variables()
       self.assertEqual([5, 5], factor.get_cov().get_shape().as_list())
 
   def testConvOutputKroneckerFactorInitFloat64(self):
@@ -555,23 +889,18 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       dtype = dtypes.float64_ref
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3, 4, 5), dtype=dtype, name='a/b/c')
-      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      factor = ff.ConvOutputKroneckerFactor(((tensor,),))
+      factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
       self.assertEqual([5, 5], cov.get_shape().as_list())
 
-  def testConvOutputKroneckerFactorInitNotEnoughDims(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
-      with self.assertRaises(IndexError):
-        ff.ConvOutputKroneckerFactor(tensor)
-
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = np.arange(1, 17).reshape(2, 2, 2, 2).astype(np.float32)
-      factor = ff.ConvOutputKroneckerFactor((array_ops.constant(tensor),))
+      factor = ff.ConvOutputKroneckerFactor(((array_ops.constant(tensor),),))
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
@@ -584,8 +913,8 @@ class FullyConnectedMultiKFTest(test.TestCase):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=False)
+      factor.instantiate_cov_variables()
       self.assertEqual([3, 3], factor.get_cov().get_shape().as_list())
 
   def testFullyConnectedMultiKFInitFloat64(self):
@@ -593,8 +922,8 @@ class FullyConnectedMultiKFTest(test.TestCase):
       dtype = dtypes.float64_ref
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=False)
+      factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
       self.assertEqual([3, 3], cov.get_shape().as_list())
@@ -603,8 +932,8 @@ class FullyConnectedMultiKFTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=True)
+      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=True)
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
@@ -614,8 +943,8 @@ class FullyConnectedMultiKFTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,))
+      factor = ff.FullyConnectedMultiKF(((tensor,),))
+      factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
       new_cov = sess.run(factor.make_covariance_update_op(.5))
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
index b8ccbeadd0a9d69edb41fef50e3edb090457adf2..cb80fca3705308f92e308e2a840336fb72d0fa62 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.platform import test
 class MockFisherBlock(object):
   """A fake FisherBlock."""
 
-  num_registered_minibatches = 2
+  num_registered_towers = 2
 
   def __init__(self, name='MockFisherBlock'):
     self.name = name
@@ -104,22 +104,53 @@ class LayerCollectionTest(test.TestCase):
           array_ops.constant(3),
           approx=layer_collection.APPROX_DIAGONAL_NAME)
       lc.register_conv2d(
-          array_ops.constant(4), [1, 1, 1, 1], 'SAME',
-          array_ops.ones((1, 1, 1, 1)), array_ops.constant(3))
+          params=array_ops.ones((2, 3, 4, 5)),
+          strides=[1, 1, 1, 1],
+          padding='SAME',
+          inputs=array_ops.ones((1, 2, 3, 4)),
+          outputs=array_ops.ones((1, 1, 1, 5)))
       lc.register_conv2d(
-          array_ops.constant(4), [1, 1, 1, 1],
-          'SAME',
-          array_ops.ones((1, 1, 1, 1)),
-          array_ops.constant(3),
+          params=array_ops.ones((2, 3, 4, 5)),
+          strides=[1, 1, 1, 1],
+          padding='SAME',
+          inputs=array_ops.ones((1, 2, 3, 4)),
+          outputs=array_ops.ones((1, 1, 1, 5)),
           approx=layer_collection.APPROX_DIAGONAL_NAME)
+      lc.register_separable_conv2d(
+          depthwise_params=array_ops.ones((3, 3, 1, 2)),
+          pointwise_params=array_ops.ones((1, 1, 2, 4)),
+          inputs=array_ops.ones((32, 5, 5, 1)),
+          depthwise_outputs=array_ops.ones((32, 5, 5, 2)),
+          pointwise_outputs=array_ops.ones((32, 5, 5, 4)),
+          strides=[1, 1, 1, 1],
+          padding='SAME')
+      lc.register_convolution(
+          params=array_ops.ones((3, 3, 1, 8)),
+          inputs=array_ops.ones((32, 5, 5, 1)),
+          outputs=array_ops.ones((32, 5, 5, 8)),
+          padding='SAME')
       lc.register_generic(
           array_ops.constant(5), 16, approx=layer_collection.APPROX_FULL_NAME)
       lc.register_generic(
           array_ops.constant(6),
           16,
           approx=layer_collection.APPROX_DIAGONAL_NAME)
-
-      self.assertEqual(6, len(lc.get_blocks()))
+      lc.register_fully_connected_multi(
+          array_ops.constant(1),
+          (array_ops.constant(2), array_ops.constant(3)),
+          (array_ops.constant(4), array_ops.constant(5)))
+      lc.register_conv2d_multi(
+          params=array_ops.ones((2, 3, 4, 5)),
+          strides=[1, 1, 1, 1],
+          padding='SAME',
+          inputs=(array_ops.ones((1, 2, 3, 4)), array_ops.ones((5, 6, 7, 8))),
+          outputs=(array_ops.ones((1, 1, 1, 5)), array_ops.ones((2, 2, 2, 10))))
+      lc.register_embedding_multi(
+          array_ops.constant((1,)),
+          (array_ops.constant(2), array_ops.constant(3)),
+          (array_ops.constant(4), array_ops.constant(5)))
+
+      self.assertEqual(12, len(lc.get_blocks()))
 
   def testRegisterBlocksMultipleRegistrations(self):
     with ops.Graph().as_default():
@@ -237,16 +268,16 @@ class LayerCollectionTest(test.TestCase):
 
       # Create a new loss function by name.
       lc.register_categorical_predictive_distribution(logits, name='loss1')
-      self.assertEqual(1, len(lc.losses))
+      self.assertEqual(1, len(lc.towers_by_loss))
 
       # Add logits to same loss function.
       lc.register_categorical_predictive_distribution(
           logits, name='loss1', reuse=True)
-      self.assertEqual(1, len(lc.losses))
+      self.assertEqual(1, len(lc.towers_by_loss))
 
       # Add another new loss function.
       lc.register_categorical_predictive_distribution(logits, name='loss2')
-      self.assertEqual(2, len(lc.losses))
+      self.assertEqual(2, len(lc.towers_by_loss))
 
   def testLossFunctionWithoutName(self):
     """Ensure loss functions get unique names if 'name' not specified."""
@@ -298,13 +329,9 @@ class LayerCollectionTest(test.TestCase):
             name='loss1',
             reuse=layer_collection.VARIABLE_SCOPE)
 
-      self.assertEqual(len(lc.losses), 1)
-      loss = lc.losses[0]
-
+      self.assertEqual(len(lc.towers_by_loss), 1)
       # Three successful registrations.
-      self.assertEqual(loss.params.shape.as_list(),
-                       [3 * batch_size, output_size])
-      self.assertEqual(loss.targets.shape.as_list(), [3 * batch_size])
+      self.assertEqual(len(lc.towers_by_loss[0]), 3)
 
   def testRegisterCategoricalPredictiveDistributionBatchSize1(self):
     with ops.Graph().as_default():
@@ -441,13 +468,13 @@ class LayerCollectionTest(test.TestCase):
       b = variable_scope.get_variable('b', [3])
       lc = layer_collection.LayerCollection()
       lc.register_fully_connected(w, inputs, outputs)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 1)
       with self.assertRaises(KeyError):
         lc.register_fully_connected((w, b), inputs, outputs, reuse=True)
       self.assertNotIn((w, b), lc.fisher_blocks)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 1)
       lc.register_fully_connected(w, inputs, outputs, reuse=True)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 2)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 2)
 
   def testMakeOrGetFactor(self):
     with ops.Graph().as_default():
@@ -479,17 +506,6 @@ class LayerCollectionTest(test.TestCase):
       variables = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
       self.assertTrue(all([var.name.startswith(scope) for var in variables]))
 
-  def testGetUseCountMap(self):
-    """Ensure get_use_count_map() sums 'num_registered_minibatches'."""
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {
-        'a': MockFisherBlock(),
-        ('a', 'c'): MockFisherBlock(),
-        ('b', 'c'): MockFisherBlock()
-    }
-    use_count_map = lc.get_use_count_map()
-    self.assertDictEqual({'a': 4, 'b': 2, 'c': 4}, use_count_map)
-
   def testIdentifyLinkedParametersSomeRegisteredInOtherTuples(self):
     x = variable_scope.get_variable('x', shape=())
     y = variable_scope.get_variable('y', shape=())
@@ -550,6 +566,32 @@ class LayerCollectionTest(test.TestCase):
     self.assertIsInstance(lc.fisher_blocks[b_0], fisher_blocks.FullFB)
     self.assertIsInstance(lc.fisher_blocks[b_1], fisher_blocks.NaiveDiagonalFB)
 
+  def testDefaultLayerCollection(self):
+    with ops.Graph().as_default():
+      # Can't get default if there isn't one set.
+      with self.assertRaises(ValueError):
+        layer_collection.get_default_layer_collection()
+
+      # Can't set default twice.
+      lc = layer_collection.LayerCollection()
+      layer_collection.set_default_layer_collection(lc)
+      with self.assertRaises(ValueError):
+        layer_collection.set_default_layer_collection(lc)
+
+      # Same as one set.
+      self.assertTrue(lc is layer_collection.get_default_layer_collection())
+
+      # Can set to None.
+      layer_collection.set_default_layer_collection(None)
+      with self.assertRaises(ValueError):
+        layer_collection.get_default_layer_collection()
+
+      # as_default() is the same as setting/clearing.
+      with lc.as_default():
+        self.assertTrue(lc is layer_collection.get_default_layer_collection())
+      with self.assertRaises(ValueError):
+        layer_collection.get_default_layer_collection()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
index ae787b6f1ac90218f2ac73d37fb270df0b822de2..c00af5593f085e3b1f3e030a24f4b821115cc869 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
@@ -24,7 +24,6 @@ from tensorflow.contrib.kfac.python.ops import loss_functions
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
@@ -97,22 +96,6 @@ class CategoricalLogitsNegativeLogProbLossTest(test.TestCase):
       # difficult to say if the output is correct or not...
       neg_log_prob = sess.run(neg_log_prob)
 
-  def testMultiMinibatchRegistration(self):
-    """Ensure this loss function supports registering multiple minibatches."""
-    with ops.Graph().as_default():
-      tower_logits = []
-      loss = None
-      num_towers = 5
-      for _ in range(num_towers):
-        logits = random_ops.random_uniform(shape=[2, 3])
-        tower_logits.append(logits)
-        if loss is None:
-          loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
-        else:
-          loss.register_additional_minibatch(logits)
-      self.assertListEqual(loss.input_minibatches, tower_logits)
-      self.assertEqual(loss.num_registered_minibatches, num_towers)
-
   def testMultiplyFisherSingleVector(self):
     with ops.Graph().as_default(), self.test_session() as sess:
       logits = np.array([1., 2., 3.])
@@ -203,23 +186,5 @@ class OnehotCategoricalLogitsNegativeLogProbLossTest(test.TestCase):
       # difficult to say if the output is correct or not...
       neg_log_prob = sess.run(neg_log_prob)
 
-  def testMultiMinibatchRegistration(self):
-    """Ensure this loss function supports registering multiple minibatches."""
-    with ops.Graph().as_default():
-      tower_logits = []
-      loss = None
-      num_towers = 5
-      for _ in range(num_towers):
-        logits = random_ops.random_uniform(shape=[2, 3])
-        tower_logits.append(logits)
-        if loss is None:
-          loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
-              logits)
-        else:
-          loss.register_additional_minibatch(logits)
-      self.assertListEqual(loss.input_minibatches, tower_logits)
-      self.assertEqual(loss.num_registered_minibatches, num_towers)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py b/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py
index 9325aa1b7325fa9cf546d66e6505affa1af7db4d..560a9b0b426eccb262296a505df7f782a96d9c1d 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
 from tensorflow.contrib.kfac.python.ops import optimizer
 from tensorflow.python.framework import ops
@@ -32,6 +33,13 @@ from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import test
 
 
+# We need to set these constants since the numerical values used in the tests
+# were chosen when these used to be the defaults.
+ff.set_global_constants(init_covariances_at_zero=False,
+                        zero_debias=False,
+                        init_inverses_at_zero=False)
+
+
 def dummy_layer_collection():
   lcoll = lc.LayerCollection()
   dummy = array_ops.constant([1., 2.])
@@ -186,6 +194,11 @@ class OptimizerTest(test.TestCase):
           layer_collection,
           momentum=0.5,
           momentum_type='regular')
+      (cov_update_thunks,
+       inv_update_thunks) = opt.make_vars_and_create_op_thunks()
+      cov_update_ops = tuple(thunk() for thunk in cov_update_thunks)
+      inv_update_ops = tuple(thunk() for thunk in inv_update_thunks)
+
       grads_and_vars = opt.compute_gradients(output, [weights, bias])
       all_vars = [grad_and_var[1] for grad_and_var in grads_and_vars]
 
@@ -193,6 +206,8 @@ class OptimizerTest(test.TestCase):
 
       sess.run(tf_variables.global_variables_initializer())
       old_vars = sess.run(all_vars)
+      sess.run(cov_update_ops)
+      sess.run(inv_update_ops)
       sess.run(op)
       new_vars = sess.run(all_vars)
 
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
index 97a97adbf5577cd2694d3055acaa59258ad27964..2cee01212a11595669e9df0fc95a5657926c1038 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
@@ -29,6 +29,8 @@ from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -325,6 +327,84 @@ class UtilsTest(test.TestCase):
           ],
           values)
 
+  def testExtractConvolutionPatches(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      batch_size = 10
+      image_spatial_shape = [9, 10, 11]
+      in_channels = out_channels = 32
+      kernel_spatial_shape = [5, 3, 3]
+      spatial_strides = [1, 2, 1]
+      spatial_dilation = [1, 1, 1]
+      padding = 'SAME'
+
+      images = random_ops.random_uniform(
+          [batch_size] + image_spatial_shape + [in_channels], seed=0)
+      kernel_shape = kernel_spatial_shape + [in_channels, out_channels]
+      kernel = random_ops.random_uniform(kernel_shape, seed=1)
+
+      # Ensure shape matches expectation.
+      patches = utils.extract_convolution_patches(
+          images,
+          kernel_shape,
+          padding,
+          strides=spatial_strides,
+          dilation_rate=spatial_dilation)
+      result_spatial_shape = (
+          patches.shape.as_list()[1:1 + len(image_spatial_shape)])
+      self.assertEqual(patches.shape.as_list(),
+                       [batch_size] + result_spatial_shape +
+                       kernel_spatial_shape + [in_channels])
+
+      # Ensure extract...patches() + matmul() and convolution() implementation
+      # give the same answer.
+      outputs = nn_ops.convolution(
+          images,
+          kernel,
+          padding,
+          strides=spatial_strides,
+          dilation_rate=spatial_dilation)
+
+      patches_flat = array_ops.reshape(
+          patches, [-1, np.prod(kernel_spatial_shape) * in_channels])
+      kernel_flat = array_ops.reshape(kernel, [-1, out_channels])
+      outputs_flat = math_ops.matmul(patches_flat, kernel_flat)
+
+      outputs_, outputs_flat_ = sess.run([outputs, outputs_flat])
+      self.assertAllClose(outputs_.flatten(), outputs_flat_.flatten())
+
+  def testExtractPointwiseConv2dPatches(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      batch_size = 10
+      image_height = image_width = 8
+      in_channels = out_channels = 3
+      kernel_height = kernel_width = 1
+      strides = [1, 1, 1, 1]
+      padding = 'VALID'
+
+      images = random_ops.random_uniform(
+          [batch_size, image_height, image_width, in_channels], seed=0)
+      kernel_shape = [kernel_height, kernel_width, in_channels, out_channels]
+      kernel = random_ops.random_uniform(kernel_shape, seed=1)
+
+      # Ensure shape matches expectation.
+      patches = utils.extract_pointwise_conv2d_patches(images, kernel_shape)
+      self.assertEqual(patches.shape.as_list(), [
+          batch_size, image_height, image_width, kernel_height, kernel_width,
+          in_channels
+      ])
+
+      # Ensure extract...patches() + matmul() and conv2d() implementation
+      # give the same answer.
+      outputs = nn_ops.conv2d(images, kernel, strides, padding)
+
+      patches_flat = array_ops.reshape(
+          patches, [-1, kernel_height * kernel_width * in_channels])
+      kernel_flat = array_ops.reshape(kernel, [-1, out_channels])
+      outputs_flat = math_ops.matmul(patches_flat, kernel_flat)
+
+      outputs_, outputs_flat_ = sess.run([outputs, outputs_flat])
+      self.assertAllClose(outputs_.flatten(), outputs_flat_.flatten())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index ee6549b109399766579b6ea18a987ae2c8275983..3c01eb65e7a687d6c477b858b8d91ea7f309dc64 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -35,12 +35,16 @@ py_library(
     srcs = ["fisher_factors.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":linear_operator",
         ":utils",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:special_math_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -60,6 +64,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "linear_operator",
+    srcs = ["linear_operator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/ops/linalg",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "loss_functions",
     srcs = ["loss_functions.py"],
@@ -144,10 +161,13 @@ py_library(
         ":fisher_estimator",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
 )
@@ -168,6 +188,7 @@ py_library(
     name = "fisher_estimator",
     srcs = [
         "estimator.py",
+        "placement.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -177,6 +198,7 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:util",
         "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -239,15 +261,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index a7b1f9d35c931fc44408be804479e758f28f7110..854f885c26f2b4340555adb91bc3b9749962d869 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -18,68 +18,59 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import itertools
-
+import abc
 import numpy as np
+import six
 
+from tensorflow.contrib.kfac.python.ops import placement
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
 
-class _DeviceContextGenerator(object):
-  """Class for generating device contexts in a round-robin fashion."""
-
-  def __init__(self, devices):
-    """Creates a _DeviceContextGenerator object.
+# The linter is confused.
+# pylint: disable=abstract-class-instantiated
+def make_fisher_estimator(placement_strategy=None, **kwargs):
+  """Creates Fisher estimator instances based on the placement strategy.
 
-    Example usage:
+  For example if the `placement_strategy` is 'round_robin' then
+  `FisherEstimatorRoundRobin` instance is returned.
 
-    ```python
-    dcg = _DeviceContextGenerator(['/gpu:0', 'gpu:1'])
-    with dcg():
-      # All operations in this context will be placed on GPU 0
-      ...
-    with dcg():
-      # All operations in this context will be placed on GPU 1
-      ...
-    ```
+  Args:
+    placement_strategy: `string`, Strategy to be used for placing covariance
+      variables, covariance ops and inverse ops. Check
+      `placement.FisherEstimatorRoundRobin` for a concrete example.
+   **kwargs: Arguments to be passed into `FisherEstimator` class initializer.
 
-    Args:
-      devices: An iterable of device strings (or None). Successive calls to
-          __call__ will give contexts which place devices on these devices in
-          a round-robin fashion.
-    """
-    self._cycle = None if devices is None else itertools.cycle(devices)
+  Returns:
+    An instance of class which inherits from `FisherEstimator` and the mixin
+    which implements specific placement strategy. See,
+    `FisherEstimatorRoundRobin` which inherits from `FisherEstimator` and
+    `RoundRobinPlacementMixin`.
 
-  @contextlib.contextmanager
-  def __call__(self):
-    """Returns a context manager specifying the default device."""
-    if self._cycle is None:
-      yield
-    else:
-      with tf_ops.device(next(self._cycle)):
-        yield
+  Raises:
+    ValueError: If the `placement_strategy` is not equal to 'round_robin'.
+  """
+  if placement_strategy in [None, "round_robin"]:
+    return FisherEstimatorRoundRobin(**kwargs)
+  else:
+    raise ValueError("Unimplemented vars and ops "
+                     "placement strategy : {}".format(placement_strategy))
+# pylint: enable=abstract-class-instantiated
 
 
+@six.add_metaclass(abc.ABCMeta)
 class FisherEstimator(object):
   """Fisher estimator class supporting various approximations of the Fisher.
 
-  Attributes:
-    cov_update_thunks: list of no-arg functions. Executing a function adds
-      covariance update ops for a single FisherFactor to the graph.
-    cov_update_ops: List of Ops. Running an op updates covariance matrices for a
-      single FisherFactor.
-    cov_update_op: Op. Running updates covariance matrices for all
-      FisherFactors.
-    inv_update_thunks: list of no-arg functions.  Executing a function adds
-      inverse update ops for a single FisherFactor to the graph.
-    inv_update_ops: List of Ops. Running an op updates inverse matrices for a
-      single FisherFactor.
-    inv_update_op: Op. Running updates inverse matrices for all FisherFactors.
+  This is an abstract base class which does not implement a strategy for
+  placing covariance variables, covariance update ops and inverse update ops.
+  The placement strategies are implemented in `placement.py`. See
+  `FisherEstimatorRoundRobin` for example of a concrete subclass with
+  a round-robin placement strategy.
   """
 
   def __init__(self,
@@ -87,26 +78,33 @@ class FisherEstimator(object):
                cov_ema_decay,
                damping,
                layer_collection,
+               exps=(-1,),
                estimation_mode="gradients",
                colocate_gradients_with_ops=True,
-               cov_devices=None,
-               inv_devices=None):
+               name="FisherEstimator",
+               compute_cholesky=False,
+               compute_cholesky_inverse=False):
     """Create a FisherEstimator object.
 
     Args:
-      variables: A list of the variables for which to estimate the Fisher. This
-          must match the variables registered in layer_collection (if it is not
-          None).
+      variables: A `list` of variables or `callable` which returns the variables
+          for which to estimate the Fisher. This must match the variables
+          registered in layer_collection (if it is not None).
       cov_ema_decay: The decay factor used when calculating the covariance
           estimate moving averages.
-      damping: The damping factor used to stabilize training due to errors in
-          the local approximation with the Fisher information matrix, and to
-          regularize the update direction by making it closer to the gradient.
-          (Higher damping means the update looks more like a standard gradient
-          update - see Tikhonov regularization.)
+      damping: float. The damping factor used to stabilize training due to
+          errors in the local approximation with the Fisher information matrix,
+          and to regularize the update direction by making it closer to the
+          gradient. (Higher damping means the update looks more like a standard
+          gradient update - see Tikhonov regularization.)
       layer_collection: The layer collection object, which holds the fisher
           blocks, kronecker factors, and losses associated with the
           graph.
+      exps: List of floats or ints. These represent the different matrix
+          powers of the approximate Fisher that the FisherEstimator will be able
+          to multiply vectors by. If the user asks for a matrix power other
+          one of these (or 1, which is always supported), there will be a
+          failure. (Default: (-1,))
       estimation_mode: The type of estimator to use for the Fishers.  Can be
           'gradients', 'empirical', 'curvature_prop', or 'exact'.
           (Default: 'gradients').  'gradients' is the basic estimation approach
@@ -125,24 +123,23 @@ class FisherEstimator(object):
           equal to the output dimension, roughly speaking.
       colocate_gradients_with_ops: Whether we should request gradients be
           colocated with their respective ops. (Default: True)
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-          computations will be placed on these devices in a round-robin fashion.
-          Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-          computations will be placed on these devices in a round-robin fashion.
-          Can be None, which means that no devices are specified.
-
+      name: A string. A name given to this estimator, which is added to the
+          variable scope when constructing variables and ops.
+          (Default: "FisherEstimator")
+      compute_cholesky: Bool. Whether or not the FisherEstimator will be
+          able to multiply vectors by the Cholesky factor.
+          (Default: False)
+      compute_cholesky_inverse: Bool. Whether or not the FisherEstimator
+          will be able to multiply vectors by the Cholesky factor inverse.
+          (Default: False)
     Raises:
       ValueError: If no losses have been registered with layer_collection.
     """
-
-    self._cov_ema_decay = cov_ema_decay
     self._variables = variables
+    self._cov_ema_decay = cov_ema_decay
     self._damping = damping
     self._estimation_mode = estimation_mode
     self._layers = layer_collection
-    self._layers.create_subgraph()
-    self._layers.check_registration(variables)
     self._gradient_fns = {
         "gradients": self._get_grads_lists_gradients,
         "empirical": self._get_grads_lists_empirical,
@@ -151,39 +148,71 @@ class FisherEstimator(object):
     }
     self._colocate_gradients_with_ops = colocate_gradients_with_ops
 
-    # TODO(b/70674513): Factor device placement outside of this class.
-    self._cov_device_context_generator = _DeviceContextGenerator(cov_devices)
-    if inv_devices == cov_devices:
-      self._inv_device_context_generator = self._cov_device_context_generator
-    else:
-      self._inv_device_context_generator = _DeviceContextGenerator(inv_devices)
-
-    self._instantiate_factors()
-
-    self.cov_update_thunks = [
-        self._create_cov_update_thunk(factor)
-        for factor in self._layers.get_factors()
-    ]
-    self.cov_update_ops = [thunk() for thunk in self.cov_update_thunks]
-    self.cov_update_op = control_flow_ops.group(
-        self.cov_update_ops, name="cov_update_op")
+    self._made_vars = False
+    self._exps = exps
+    self._compute_cholesky = compute_cholesky
+    self._compute_cholesky_inverse = compute_cholesky_inverse
 
-    self.inv_update_thunks = [
-        self._create_inv_update_thunk(factor)
-        for factor in self._layers.get_factors()
-    ]
-    self.inv_update_ops = [thunk() for thunk in self.inv_update_thunks]
-    self.inv_update_op = control_flow_ops.group(
-        self.inv_update_ops, name="inv_update_op")
+    self._name = name
 
   @property
   def variables(self):
-    return self._variables
+    if callable(self._variables):
+      return self._variables()
+    else:
+      return self._variables
 
   @property
   def damping(self):
     return self._damping
 
+  @property
+  def blocks(self):
+    """All registered FisherBlocks."""
+    return self._layers.get_blocks()
+
+  @property
+  def factors(self):
+    """All registered FisherFactors."""
+    return self._layers.get_factors()
+
+  @property
+  def name(self):
+    return self._name
+
+  @abc.abstractmethod
+  def make_vars_and_create_op_thunks(self, scope=None):
+    """Make vars and create op thunks with a specific placement strategy.
+
+    For each factor, all of that factor's cov variables and their associated
+    update ops will be placed on a particular device.  A new device is chosen
+    for each factor by cycling through list of devices in the cov_devices
+    argument. If cov_devices is None then no explicit device placement occurs.
+
+    An analogous strategy is followed for inverse update ops, with the list of
+    devices being given by the inv_devices argument.
+
+    Inverse variables on the other hand are not placed on any specific device
+    (they will just use the current the device placement context, whatever
+    that happens to be).  The idea is that the inverse variable belong where
+    they will be accessed most often, which is the device that actually applies
+    the preconditioner to the gradient. The user will be responsible for setting
+    the device context for this.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All variables will be created,
+        and all thunks will execute, inside of a variable scope of the given
+        name. (Default: None)
+
+    Returns:
+      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+    """
+    pass
+
   def _apply_transformation(self, vecs_and_vars, transform):
     """Applies an block-wise transformation to the corresponding vectors.
 
@@ -217,9 +246,7 @@ class FisherEstimator(object):
       A list of (transformed vector, var) pairs in the same order as
       vecs_and_vars.
     """
-
-    return self._apply_transformation(vecs_and_vars,
-                                      lambda fb, vec: fb.multiply_inverse(vec))
+    return self.multiply_matpower(-1, vecs_and_vars)
 
   def multiply(self, vecs_and_vars):
     """Multiplies the vectors by the corresponding (damped) blocks.
@@ -231,9 +258,67 @@ class FisherEstimator(object):
       A list of (transformed vector, var) pairs in the same order as
       vecs_and_vars.
     """
+    return self.multiply_matpower(1, vecs_and_vars)
+
+  def multiply_matpower(self, exp, vecs_and_vars):
+    """Multiplies the vecs by the corresponding matrix powers of the blocks.
 
-    return self._apply_transformation(vecs_and_vars,
-                                      lambda fb, vec: fb.multiply(vec))
+    Args:
+      exp: A float representing the power to raise the blocks by before
+        multiplying it by the vector.
+      vecs_and_vars: List of (vector, variable) pairs.
+
+    Returns:
+      A list of (transformed vector, var) pairs in the same order as
+      vecs_and_vars.
+    """
+    assert exp in self._exps
+
+    fcn = lambda fb, vec: fb.multiply_matpower(vec, exp)
+    return self._apply_transformation(vecs_and_vars, fcn)
+
+  def multiply_cholesky(self, vecs_and_vars, transpose=False):
+    """Multiplies the vecs by the corresponding Cholesky factors.
+
+    Args:
+      vecs_and_vars: List of (vector, variable) pairs.
+      transpose: Bool. If true the Cholesky factors are transposed before
+        multiplying the vecs. (Default: False)
+
+    Returns:
+      A list of (transformed vector, var) pairs in the same order as
+      vecs_and_vars.
+    """
+    assert self._compute_cholesky
+
+    fcn = lambda fb, vec: fb.multiply_cholesky(vec, transpose=transpose)
+    return self._apply_transformation(vecs_and_vars, fcn)
+
+  def multiply_cholesky_inverse(self, vecs_and_vars, transpose=False):
+    """Mults the vecs by the inverses of the corresponding Cholesky factors.
+
+      Note: if you are using Cholesky inverse multiplication to sample from
+      a matrix-variate Gaussian you will want to multiply by the transpose.
+      Let L be the Cholesky factor of F and observe that
+
+        L^-T * L^-1 = (L * L^T)^-1 = F^-1 .
+
+      Thus we want to multiply by L^-T in order to sample from Gaussian with
+      covariance F^-1.
+
+    Args:
+      vecs_and_vars: List of (vector, variable) pairs.
+      transpose: Bool. If true the Cholesky factor inverses are transposed
+        before multiplying the vecs. (Default: False)
+
+    Returns:
+      A list of (transformed vector, var) pairs in the same order as
+      vecs_and_vars.
+    """
+    assert self._compute_cholesky_inverse
+
+    fcn = lambda fb, vec: fb.multiply_cholesky_inverse(vec, transpose=transpose)
+    return self._apply_transformation(vecs_and_vars, fcn)
 
   def _instantiate_factors(self):
     """Instantiates FisherFactors' variables.
@@ -241,9 +326,9 @@ class FisherEstimator(object):
     Raises:
       ValueError: If estimation_mode was improperly specified at construction.
     """
-    fisher_blocks_list = self._layers.get_blocks()
+    blocks = self.blocks
     tensors_to_compute_grads = [
-        fb.tensors_to_compute_grads() for fb in fisher_blocks_list
+        block.tensors_to_compute_grads() for block in blocks
     ]
 
     try:
@@ -253,45 +338,135 @@ class FisherEstimator(object):
       raise ValueError("Unrecognized value {} for estimation_mode.".format(
           self._estimation_mode))
 
-    # TODO(b/68033310): This loop round-robins the "concat" operations which
-    # gather the inputs for the cov_updates. In future, we might do these
-    # computations locally then communicate the results, which would require a
-    # modification to this code.
-    for grads_list, fb in zip(grads_lists, fisher_blocks_list):
-      with self._cov_device_context_generator():
-        fb.instantiate_factors(grads_list, self.damping)
+    for grads_list, block in zip(grads_lists, blocks):
+      block.instantiate_factors(grads_list, self.damping)
+
+  def _check_vars_unmade_and_set_made_flag(self):
+    if self._made_vars:
+      raise Exception("Already made variables.")
+    self._made_vars = True
+
+  def made_vars(self):
+    return self._made_vars
+
+  def _register_matrix_functions(self):
+    for block in self.blocks:
+      for exp in self._exps:
+        block.register_matpower(exp)
+      if self._compute_cholesky:
+        block.register_cholesky()
+      if self._compute_cholesky_inverse:
+        block.register_cholesky_inverse()
+
+  def _finalize_layer_collection(self):
+    self._layers.create_subgraph()
+    self._layers.check_registration(self.variables)
+    self._instantiate_factors()
+    self._register_matrix_functions()
+
+  def create_ops_and_vars_thunks(self, scope=None):
+    """Create thunks that make the ops and vars on demand.
+
+    This function returns 4 lists of thunks: cov_variable_thunks,
+    cov_update_thunks, inv_variable_thunks, and inv_update_thunks.
+
+    The length of each list is the number of factors and the i-th element of
+    each list corresponds to the i-th factor (given by the "factors" property).
+
+    Note that the execution of these thunks must happen in a certain
+    partial order.  The i-th element of cov_variable_thunks must execute
+    before the i-th element of cov_update_thunks (and also the i-th element
+    of inv_update_thunks).  Similarly, the i-th element of inv_variable_thunks
+    must execute before the i-th element of inv_update_thunks.
 
-  def _create_cov_update_thunk(self, factor):
+    TL;DR (oversimplified): Execute the thunks according to the order that
+    they are returned.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All thunks will execute inside
+        of a variable scope of the given name. (Default: None)
+    Returns:
+      cov_variable_thunks: A list of thunks that make the cov variables.
+      cov_update_thunks: A list of thunks that make the cov update ops.
+      inv_variable_thunks: A list of thunks that make the inv variables.
+      inv_update_thunks: A list of thunks that make the inv update ops.
+    """
+    self._check_vars_unmade_and_set_made_flag()
+
+    self._finalize_layer_collection()
+
+    scope = self.name if scope is None else scope
+
+    cov_variable_thunks = [
+        self._create_cov_variable_thunk(factor, scope)
+        for factor in self.factors
+    ]
+    cov_update_thunks = [
+        self._create_cov_update_thunk(factor, scope) for factor in self.factors
+    ]
+    inv_variable_thunks = [
+        self._create_inv_variable_thunk(factor, scope)
+        for factor in self.factors
+    ]
+    inv_update_thunks = [
+        self._create_inv_update_thunk(factor, scope) for factor in self.factors
+    ]
+
+    return (cov_variable_thunks, cov_update_thunks,
+            inv_variable_thunks, inv_update_thunks)
+
+  def _create_cov_variable_thunk(self, factor, scope):
+    """Constructs a covariance variable thunk for a single FisherFactor."""
+
+    def thunk():
+      with variable_scope.variable_scope(scope):
+        return factor.instantiate_cov_variables()
+
+    return thunk
+
+  def _create_cov_update_thunk(self, factor, scope):
     """Constructs a covariance update thunk for a single FisherFactor."""
 
     def thunk():
-      with tf_ops.name_scope(
-          "create_cov_update_thunk", values=[self._cov_ema_decay]):
+      with variable_scope.variable_scope(scope):
         return factor.make_covariance_update_op(self._cov_ema_decay)
 
     return thunk
 
-  def _create_inv_update_thunk(self, factor):
+  def _create_inv_variable_thunk(self, factor, scope):
+    """Constructs a inverse variable thunk for a single FisherFactor."""
+
+    def thunk():
+      with variable_scope.variable_scope(scope):
+        return factor.instantiate_inv_variables()
+
+    return thunk
+
+  def _create_inv_update_thunk(self, factor, scope):
     """Constructs an inverse update thunk for a single FisherFactor."""
 
     def thunk():
-      with tf_ops.name_scope("create_inv_update_thunk"):
-        with self._inv_device_context_generator():
-          return control_flow_ops.group(factor.make_inverse_update_ops())
+      with variable_scope.variable_scope(scope):
+        return control_flow_ops.group(factor.make_inverse_update_ops())
 
     return thunk
 
   def _get_grads_lists_gradients(self, tensors):
+    # Passing in a list of loss values is better than passing in the sum as
+    # the latter creates unnessesary ops on the default device
     grads_flat = gradients_impl.gradients(
-        self._layers.total_sampled_loss(),
+        self._layers.eval_losses_on_samples(),
         nest.flatten(tensors),
         colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
   def _get_grads_lists_empirical(self, tensors):
+    # Passing in a list of loss values is better than passing in the sum as
+    # the latter creates unnessesary ops on the default device
     grads_flat = gradients_impl.gradients(
-        self._layers.total_loss(),
+        self._layers.eval_losses(),
         nest.flatten(tensors),
         colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
@@ -300,9 +475,10 @@ class FisherEstimator(object):
   def _get_transformed_random_signs(self):
     transformed_random_signs = []
     for loss in self._layers.losses:
-      transformed_random_signs.append(
-          loss.multiply_fisher_factor(
-              utils.generate_random_signs(loss.fisher_factor_inner_shape)))
+      with tf_ops.colocate_with(self._layers.loss_colocation_ops[loss]):
+        transformed_random_signs.append(
+            loss.multiply_fisher_factor(
+                utils.generate_random_signs(loss.fisher_factor_inner_shape)))
     return transformed_random_signs
 
   def _get_grads_lists_curvature_prop(self, tensors):
@@ -321,13 +497,20 @@ class FisherEstimator(object):
     # Loop over all coordinates of all losses.
     grads_all = []
     for loss in self._layers.losses:
-      for index in np.ndindex(*loss.fisher_factor_inner_static_shape[1:]):
-        transformed_one_hot = loss.multiply_fisher_factor_replicated_one_hot(
-            index)
-        grads_flat = gradients_impl.gradients(
-            loss.inputs,
-            nest.flatten(tensors),
-            grad_ys=transformed_one_hot,
-            colocate_gradients_with_ops=self._colocate_gradients_with_ops)
-        grads_all.append(nest.pack_sequence_as(tensors, grads_flat))
+      with tf_ops.colocate_with(self._layers.loss_colocation_ops[loss]):
+        for index in np.ndindex(*loss.fisher_factor_inner_static_shape[1:]):
+          transformed_one_hot = loss.multiply_fisher_factor_replicated_one_hot(
+              index)
+          grads_flat = gradients_impl.gradients(
+              loss.inputs,
+              nest.flatten(tensors),
+              grad_ys=transformed_one_hot,
+              colocate_gradients_with_ops=self._colocate_gradients_with_ops)
+          grads_all.append(nest.pack_sequence_as(tensors, grads_flat))
     return zip(*grads_all)
+
+
+class FisherEstimatorRoundRobin(placement.RoundRobinPlacementMixin,
+                                FisherEstimator):
+  """Fisher estimator which provides round robin device placement strategy."""
+  pass
diff --git a/tensorflow/contrib/kfac/python/ops/estimator_lib.py b/tensorflow/contrib/kfac/python/ops/estimator_lib.py
index 33c969650615bf8e439c2f669b4a1efaf2f565ff..9c9fef471f8033bec53ceb1e4f073dd921cbe3c7 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator_lib.py
@@ -25,6 +25,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'FisherEstimator',
+    'make_fisher_estimator',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index cf38d28b43836dced8babe2ffa7853b1c4b1b369..32c776cb381f1b55e7e8eb979377f7fd0cb4c6f7 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior
 distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
 Fisher Information matrix is given by,
 
-  F(params) = E[ v(x, y, params) v(x, y, params)^T ]
+  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
 
 where,
 
-  v(x, y, params) = (d / d params) log p(y | x, params)
+  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
 
 and the expectation is taken with respect to the data's distribution for 'x' and
 the model's posterior distribution for 'y',
@@ -40,12 +40,15 @@ from __future__ import print_function
 import abc
 import enum  # pylint: disable=g-bad-import-order
 
+import numpy as np
 import six
 
 from tensorflow.contrib.kfac.python.ops import fisher_factors
 from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import nest
 
 # For blocks corresponding to convolutional layers, or any type of block where
 # the parameters can be thought of as being replicated in time or space,
@@ -80,34 +83,22 @@ def normalize_damping(damping, num_replications):
 
 
 def compute_pi_tracenorm(left_cov, right_cov):
-  """Computes the scalar constant pi for Tikhonov regularization/damping.
+  r"""Computes the scalar constant pi for Tikhonov regularization/damping.
 
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
   See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
 
   Args:
-    left_cov: The left Kronecker factor "covariance".
-    right_cov: The right Kronecker factor "covariance".
+    left_cov: A LinearOperator object. The left Kronecker factor "covariance".
+    right_cov: A LinearOperator object. The right Kronecker factor "covariance".
 
   Returns:
     The computed scalar constant pi for these Kronecker Factors (as a Tensor).
   """
-
-  def _trace(cov):
-    if len(cov.shape) == 1:
-      # Diagonal matrix.
-      return math_ops.reduce_sum(cov)
-    elif len(cov.shape) == 2:
-      # Full matrix.
-      return math_ops.trace(cov)
-    else:
-      raise ValueError(
-          "What's the trace of a Tensor of rank %d?" % len(cov.shape))
-
   # Instead of dividing by the dim of the norm, we multiply by the dim of the
   # other norm. This works out the same in the ratio.
-  left_norm = _trace(left_cov) * right_cov.shape.as_list()[0]
-  right_norm = _trace(right_cov) * left_cov.shape.as_list()[0]
+  left_norm = left_cov.trace() * int(right_cov.domain_dimension)
+  right_norm = right_cov.trace() * int(left_cov.domain_dimension)
   return math_ops.sqrt(left_norm / right_norm)
 
 
@@ -121,12 +112,44 @@ def compute_pi_adjusted_damping(left_cov, right_cov, damping):
     return (damping, damping)
 
 
+class PackagedFunc(object):
+  """A Python thunk with a stable ID.
+
+  Enables stable names for lambdas.
+  """
+
+  def __init__(self, func, func_id):
+    """Initializes PackagedFunc.
+
+    Args:
+      func: a zero-arg Python function.
+      func_id: a hashable, function that produces a hashable, or a list/tuple
+        thereof.
+    """
+    self._func = func
+    func_id = func_id if isinstance(func_id, (tuple, list)) else (func_id,)
+    self._func_id = func_id
+
+  def __call__(self):
+    return self._func()
+
+  @property
+  def func_id(self):
+    """A hashable identifier for this function."""
+    return tuple(elt() if callable(elt) else elt for elt in self._func_id)
+
+
+def _package_func(func, func_id):
+  return PackagedFunc(func, func_id)
+
+
 @six.add_metaclass(abc.ABCMeta)
 class FisherBlock(object):
   """Abstract base class for objects modeling approximate Fisher matrix blocks.
 
-  Subclasses must implement multiply_inverse(), instantiate_factors(), and
-  tensors_to_compute_grads() methods.
+  Subclasses must implement register_matpower, multiply_matpower,
+  instantiate_factors, tensors_to_compute_grads, and num_registered_towers
+  methods.
   """
 
   def __init__(self, layer_collection):
@@ -145,6 +168,42 @@ class FisherBlock(object):
     pass
 
   @abc.abstractmethod
+  def register_matpower(self, exp):
+    """Registers a matrix power to be computed by the block.
+
+    Args:
+      exp: A float representing the power to raise the block by.
+    """
+    pass
+
+  @abc.abstractmethod
+  def register_cholesky(self):
+    """Registers a Cholesky factor to be computed by the block."""
+    pass
+
+  @abc.abstractmethod
+  def register_cholesky_inverse(self):
+    """Registers an inverse Cholesky factor to be computed by the block."""
+    pass
+
+  def register_inverse(self):
+    """Registers a matrix inverse to be computed by the block."""
+    self.register_matpower(-1)
+
+  @abc.abstractmethod
+  def multiply_matpower(self, vector, exp):
+    """Multiplies the vector by the (damped) matrix-power of the block.
+
+    Args:
+      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
+      exp: A float representing the power to raise the block by before
+        multiplying it by the vector.
+
+    Returns:
+      The vector left-multiplied by the (damped) matrix-power of the block.
+    """
+    pass
+
   def multiply_inverse(self, vector):
     """Multiplies the vector by the (damped) inverse of the block.
 
@@ -154,9 +213,8 @@ class FisherBlock(object):
     Returns:
       The vector left-multiplied by the (damped) inverse of the block.
     """
-    pass
+    return self.multiply_matpower(vector, -1)
 
-  @abc.abstractmethod
   def multiply(self, vector):
     """Multiplies the vector by the (damped) block.
 
@@ -166,6 +224,33 @@ class FisherBlock(object):
     Returns:
       The vector left-multiplied by the (damped) block.
     """
+    return self.multiply_matpower(vector, 1)
+
+  @abc.abstractmethod
+  def multiply_cholesky(self, vector, transpose=False):
+    """Multiplies the vector by the (damped) Cholesky-factor of the block.
+
+    Args:
+      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
+      transpose: Bool. If true the Cholesky factor is transposed before
+        multiplying the vector. (Default: False)
+
+    Returns:
+      The vector left-multiplied by the (damped) Cholesky-factor of the block.
+    """
+    pass
+
+  @abc.abstractmethod
+  def multiply_cholesky_inverse(self, vector, transpose=False):
+    """Multiplies vector by the (damped) inverse Cholesky-factor of the block.
+
+    Args:
+      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
+      transpose: Bool. If true the Cholesky factor inverse is transposed
+        before multiplying the vector. (Default: False)
+    Returns:
+      Vector left-multiplied by (damped) inverse Cholesky-factor of the block.
+    """
     pass
 
   @abc.abstractmethod
@@ -175,8 +260,8 @@ class FisherBlock(object):
     pass
 
   @abc.abstractproperty
-  def num_registered_minibatches(self):
-    """Number of minibatches registered for this FisherBlock.
+  def num_registered_towers(self):
+    """Number of towers registered for this FisherBlock.
 
     Typically equal to the number of towers in a multi-tower setup.
     """
@@ -207,32 +292,46 @@ class FullFB(FisherBlock):
     super(FullFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    self._damping = damping
+    self._damping_func = _package_func(lambda: damping, (damping,))
+
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.FullFactor, (grads_list, self._batch_size))
-    self._factor.register_damped_inverse(damping)
 
-  def multiply_inverse(self, vector):
-    vector_flat = utils.tensors_to_column(vector)
-    out_flat = self._factor.left_multiply_inverse(
-        vector_flat, self._damping)
-    return utils.column_to_tensors(vector, out_flat)
+  def register_matpower(self, exp):
+    self._factor.register_matpower(exp, self._damping_func)
 
-  def multiply(self, vector):
+  def register_cholesky(self):
+    self._factor.register_cholesky(self._damping_func)
+
+  def register_cholesky_inverse(self):
+    self._factor.register_cholesky_inverse(self._damping_func)
+
+  def _multiply_matrix(self, matrix, vector, transpose=False):
     vector_flat = utils.tensors_to_column(vector)
-    out_flat = self._factor.left_multiply(
-        vector_flat, self._damping)
+    out_flat = matrix.matmul(vector_flat, adjoint=transpose)
     return utils.column_to_tensors(vector, out_flat)
 
+  def multiply_matpower(self, vector, exp):
+    matrix = self._factor.get_matpower(exp, self._damping_func)
+    return self._multiply_matrix(matrix, vector)
+
+  def multiply_cholesky(self, vector, transpose=False):
+    matrix = self._factor.get_cholesky(self._damping_func)
+    return self._multiply_matrix(matrix, vector, transpose=transpose)
+
+  def multiply_cholesky_inverse(self, vector, transpose=False):
+    matrix = self._factor.get_cholesky_inverse(self._damping_func)
+    return self._multiply_matrix(matrix, vector, transpose=transpose)
+
   def full_fisher_block(self):
     """Explicitly constructs the full Fisher block."""
-    return self._factor.get_cov()
+    return self._factor.get_cov_as_linear_operator().to_dense()
 
   def tensors_to_compute_grads(self):
     return self._params
 
-  def register_additional_minibatch(self, batch_size):
-    """Register an additional minibatch.
+  def register_additional_tower(self, batch_size):
+    """Register an additional tower.
 
     Args:
       batch_size: The batch size, used in the covariance estimator.
@@ -240,7 +339,7 @@ class FullFB(FisherBlock):
     self._batch_sizes.append(batch_size)
 
   @property
-  def num_registered_minibatches(self):
+  def num_registered_towers(self):
     return len(self._batch_sizes)
 
   @property
@@ -248,7 +347,47 @@ class FullFB(FisherBlock):
     return math_ops.reduce_sum(self._batch_sizes)
 
 
-class NaiveDiagonalFB(FisherBlock):
+@six.add_metaclass(abc.ABCMeta)
+class DiagonalFB(FisherBlock):
+  """A base class for FisherBlocks that use diagonal approximations."""
+
+  def register_matpower(self, exp):
+    # Not needed for this.  Matrix powers are computed on demand in the
+    # diagonal case
+    pass
+
+  def register_cholesky(self):
+    # Not needed for this.  Cholesky's are computed on demand in the
+    # diagonal case
+    pass
+
+  def register_cholesky_inverse(self):
+    # Not needed for this.  Cholesky inverses's are computed on demand in the
+    # diagonal case
+    pass
+
+  def _multiply_matrix(self, matrix, vector):
+    vector_flat = utils.tensors_to_column(vector)
+    out_flat = matrix.matmul(vector_flat)
+    return utils.column_to_tensors(vector, out_flat)
+
+  def multiply_matpower(self, vector, exp):
+    matrix = self._factor.get_matpower(exp, self._damping_func)
+    return self._multiply_matrix(matrix, vector)
+
+  def multiply_cholesky(self, vector, transpose=False):
+    matrix = self._factor.get_cholesky(self._damping_func)
+    return self._multiply_matrix(matrix, vector)
+
+  def multiply_cholesky_inverse(self, vector, transpose=False):
+    matrix = self._factor.get_cholesky_inverse(self._damping_func)
+    return self._multiply_matrix(matrix, vector)
+
+  def full_fisher_block(self):
+    return self._factor.get_cov_as_linear_operator().to_dense()
+
+
+class NaiveDiagonalFB(DiagonalFB):
   """FisherBlock using a diagonal matrix approximation.
 
   This type of approximation is generically applicable but quite primitive.
@@ -271,32 +410,16 @@ class NaiveDiagonalFB(FisherBlock):
     super(NaiveDiagonalFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    self._damping = damping
+    self._damping_func = _package_func(lambda: damping, (damping,))
+
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.NaiveDiagonalFactor, (grads_list, self._batch_size))
 
-  def multiply_inverse(self, vector):
-    vector_flat = utils.tensors_to_column(vector)
-    print("vector_flat: %s" % vector_flat)
-    out_flat = self._factor.left_multiply_inverse(
-        vector_flat, self._damping)
-    print("out_flat: %s" % out_flat)
-    return utils.column_to_tensors(vector, out_flat)
-
-  def multiply(self, vector):
-    vector_flat = utils.tensors_to_column(vector)
-    out_flat = self._factor.left_multiply(
-        vector_flat, self._damping)
-    return utils.column_to_tensors(vector, out_flat)
-
-  def full_fisher_block(self):
-    return self._factor.get_cov()
-
   def tensors_to_compute_grads(self):
     return self._params
 
-  def register_additional_minibatch(self, batch_size):
-    """Register an additional minibatch.
+  def register_additional_tower(self, batch_size):
+    """Register an additional tower.
 
     Args:
       batch_size: The batch size, used in the covariance estimator.
@@ -304,7 +427,7 @@ class NaiveDiagonalFB(FisherBlock):
     self._batch_sizes.append(batch_size)
 
   @property
-  def num_registered_minibatches(self):
+  def num_registered_towers(self):
     return len(self._batch_sizes)
 
   @property
@@ -312,7 +435,92 @@ class NaiveDiagonalFB(FisherBlock):
     return math_ops.reduce_sum(self._batch_sizes)
 
 
-class FullyConnectedDiagonalFB(FisherBlock):
+class InputOutputMultiTower(object):
+  """Mix-in class for blocks with inputs & outputs and multiple mini-batches."""
+
+  def __init__(self, *args, **kwargs):
+    self.__inputs = []
+    self.__outputs = []
+    super(InputOutputMultiTower, self).__init__(*args, **kwargs)
+
+  def _process_data(self, grads_list):
+    """Process data into the format used by the factors.
+
+    This function takes inputs and grads_lists data and processes it into
+    one of the formats expected by the FisherFactor classes (depending on
+    the value of the global configuration variable TOWER_STRATEGY).
+
+    The initial format of self._inputs is expected to be a list of Tensors
+    over towers. Similarly grads_lists is expected to be a list over sources
+    of such lists.
+
+    If TOWER_STRATEGY is "concat", 'inputs' becomes a tuple containing a single
+    tensor (represented as a PartitionedTensor object) equal to the
+    concatenation (across towers) of all of the elements of self._inputs. And
+    similarly grads_list is formatted into a tuple (over sources) of such
+    tensors (also represented as PartitionedTensors).
+
+    If TOWER_STRATEGY is "separate", formatting of inputs and grads_list
+    remains unchanged from the initial format (although possibly converting
+    from lists into tuples).
+
+    Args:
+      grads_list: grads_list in its initial format (see above).
+
+    Returns:
+      inputs: self._inputs transformed into the appropriate format (see
+        above).
+      grads_list: grads_list transformed into the appropriate format (see
+        above).
+
+    Raises:
+      ValueError: if TOWER_STRATEGY is not one of "separate" or "concat".
+    """
+    inputs = self._inputs
+    # inputs is a list over towers of Tensors
+    # grads_list is a list of list with the first index being sources and the
+    # second being towers.
+    if fisher_factors.TOWER_STRATEGY == "concat":
+      # Merge towers together into a PartitionedTensor. We package it in
+      # a singleton tuple since the factors will expect a list over towers
+      inputs = (utils.PartitionedTensor(inputs),)
+      # Do the same for grads_list but preserve leading sources dimension
+      grads_list = tuple((utils.PartitionedTensor(grads),)
+                         for grads in grads_list)
+    elif fisher_factors.TOWER_STRATEGY == "separate":
+      inputs = tuple(inputs)
+      grads_list = tuple(grads_list)
+
+    else:
+      raise ValueError("Global config variable TOWER_STRATEGY must be one of "
+                       "'concat' or 'separate'.")
+
+    return inputs, grads_list
+
+  def tensors_to_compute_grads(self):
+    """Tensors to compute derivative of loss with respect to."""
+    return tuple(self._outputs)
+
+  def register_additional_tower(self, inputs, outputs):
+    self._inputs.append(inputs)
+    self._outputs.append(outputs)
+
+  @property
+  def num_registered_towers(self):
+    result = len(self._inputs)
+    assert result == len(self._outputs)
+    return result
+
+  @property
+  def _inputs(self):
+    return self.__inputs
+
+  @property
+  def _outputs(self):
+    return self.__outputs
+
+
+class FullyConnectedDiagonalFB(InputOutputMultiTower, DiagonalFB):
   """FisherBlock for fully-connected (dense) layers using a diagonal approx.
 
   Estimates the Fisher Information matrix's diagonal entries for a fully
@@ -322,14 +530,14 @@ class FullyConnectedDiagonalFB(FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider fully connected layer in this model with (unshared) weight matrix
   'w'. For an example 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( a (d loss / d s)^T )
+    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
 
   This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
   to the layer's parameters 'w'.
@@ -344,80 +552,22 @@ class FullyConnectedDiagonalFB(FisherBlock):
       has_bias: Whether the component Kronecker factors have an additive bias.
           (Default: False)
     """
-    self._inputs = []
-    self._outputs = []
     self._has_bias = has_bias
 
     super(FullyConnectedDiagonalFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    inputs = _concat_along_batch_dim(self._inputs)
-    grads_list = tuple(_concat_along_batch_dim(grads) for grads in grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
-    self._damping = damping
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.FullyConnectedDiagonalFactor,
         (inputs, grads_list, self._has_bias))
 
-  def multiply_inverse(self, vector):
-    """Approximate damped inverse Fisher-vector product.
-
-    Args:
-      vector: Tensor or 2-tuple of Tensors. if self._has_bias, Tensor of shape
-        [input_size, output_size] corresponding to layer's weights. If not, a
-        2-tuple of the former and a Tensor of shape [output_size] corresponding
-        to the layer's bias.
-
-    Returns:
-      Tensor of the same shape, corresponding to the inverse Fisher-vector
-      product.
-    """
-    reshaped_vec = utils.layer_params_to_mat2d(vector)
-    reshaped_out = self._factor.left_multiply_inverse(
-        reshaped_vec, self._damping)
-    return utils.mat2d_to_layer_params(vector, reshaped_out)
-
-  def multiply(self, vector):
-    """Approximate damped Fisher-vector product.
-
-    Args:
-      vector: Tensor or 2-tuple of Tensors. if self._has_bias, Tensor of shape
-        [input_size, output_size] corresponding to layer's weights. If not, a
-        2-tuple of the former and a Tensor of shape [output_size] corresponding
-        to the layer's bias.
-
-    Returns:
-      Tensor of the same shape, corresponding to the Fisher-vector product.
-    """
-    reshaped_vec = utils.layer_params_to_mat2d(vector)
-    reshaped_out = self._factor.left_multiply(
-        reshaped_vec, self._damping)
-    return utils.mat2d_to_layer_params(vector, reshaped_out)
-
-  def tensors_to_compute_grads(self):
-    """Tensors to compute derivative of loss with respect to."""
-    return self._outputs
-
-  def register_additional_minibatch(self, inputs, outputs):
-    """Registers an additional minibatch to the FisherBlock.
-
-    Args:
-      inputs: Tensor of shape [batch_size, input_size]. Inputs to the
-        matrix-multiply.
-      outputs: Tensor of shape [batch_size, output_size]. Layer preactivations.
-    """
-    self._inputs.append(inputs)
-    self._outputs.append(outputs)
-
-  @property
-  def num_registered_minibatches(self):
-    result = len(self._inputs)
-    assert result == len(self._outputs)
-    return result
+    self._damping_func = _package_func(lambda: damping, (damping,))
 
 
-class ConvDiagonalFB(FisherBlock):
-  """FisherBlock for convolutional layers using a diagonal approx.
+class ConvDiagonalFB(InputOutputMultiTower, DiagonalFB):
+  """FisherBlock for 2-D convolutional layers using a diagonal approx.
 
   Estimates the Fisher Information matrix's diagonal entries for a convolutional
   layer. Unlike NaiveDiagonalFB this uses the low-variance "sum of squares"
@@ -426,14 +576,14 @@ class ConvDiagonalFB(FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider a convoluational layer in this model with (unshared) filter matrix
   'w'. For an example image 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )
+    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
 
   where 'loc' is a single (x, y) location in an image.
 
@@ -441,7 +591,13 @@ class ConvDiagonalFB(FisherBlock):
   to the layer's parameters 'w'.
   """
 
-  def __init__(self, layer_collection, params, strides, padding):
+  def __init__(self,
+               layer_collection,
+               params,
+               strides,
+               padding,
+               data_format=None,
+               dilations=None):
     """Creates a ConvDiagonalFB block.
 
     Args:
@@ -453,92 +609,113 @@ class ConvDiagonalFB(FisherBlock):
         containing the previous and a Tensor of shape [out_channels].
       strides: The stride size in this layer (1-D Tensor of length 4).
       padding: The padding in this layer (e.g. "SAME").
+      data_format: str or None. Format of input data.
+      dilations: List of 4 ints or None. Rate for dilation along all dimensions.
+
+    Raises:
+      ValueError: if strides is not length-4.
+      ValueError: if dilations is not length-4.
+      ValueError: if channel is not last dimension.
     """
-    self._inputs = []
-    self._outputs = []
-    self._strides = tuple(strides) if isinstance(strides, list) else strides
+    if len(strides) != 4:
+      raise ValueError("strides must contain 4 numbers.")
+
+    if dilations is None:
+      dilations = [1, 1, 1, 1]
+
+    if len(dilations) != 4:
+      raise ValueError("dilations must contain 4 numbers.")
+
+    if not utils.is_data_format_channel_last(data_format):
+      raise ValueError("data_format must be channels-last.")
+
+    self._strides = maybe_tuple(strides)
     self._padding = padding
+    self._data_format = data_format
+    self._dilations = maybe_tuple(dilations)
     self._has_bias = isinstance(params, (tuple, list))
 
     fltr = params[0] if self._has_bias else params
     self._filter_shape = tuple(fltr.shape.as_list())
 
+    if len(self._filter_shape) != 4:
+      raise ValueError(
+          "Convolution filter must be of shape"
+          " [filter_height, filter_width, in_channels, out_channels].")
+
     super(ConvDiagonalFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    # Concatenate inputs, grads_list into single Tensors.
-    inputs = _concat_along_batch_dim(self._inputs)
-    grads_list = tuple(_concat_along_batch_dim(grads) for grads in grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
     # Infer number of locations upon which convolution is applied.
-    inputs_shape = tuple(inputs.shape.as_list())
-    self._num_locations = (
-        inputs_shape[1] * inputs_shape[2] //
-        (self._strides[1] * self._strides[2]))
-
-    self._damping = (self._num_locations
-                     * normalize_damping(damping, self._num_locations))
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
+                                             self._strides)
 
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvDiagonalFactor,
         (inputs, grads_list, self._filter_shape, self._strides, self._padding,
-         self._has_bias))
+         self._data_format, self._dilations, self._has_bias))
 
-  def multiply_inverse(self, vector):
-    reshaped_vect = utils.layer_params_to_mat2d(vector)
-    reshaped_out = self._factor.left_multiply_inverse(
-        reshaped_vect, self._damping)
-    return utils.mat2d_to_layer_params(vector, reshaped_out)
+    def damping_func():
+      return self._num_locations * normalize_damping(damping,
+                                                     self._num_locations)
 
-  def multiply(self, vector):
-    reshaped_vect = utils.layer_params_to_mat2d(vector)
-    reshaped_out = self._factor.left_multiply(
-        reshaped_vect, self._damping)
-    return utils.mat2d_to_layer_params(vector, reshaped_out)
-
-  def tensors_to_compute_grads(self):
-    return self._outputs
-
-  def register_additional_minibatch(self, inputs, outputs):
-    """Registers an additional minibatch to the FisherBlock.
-
-    Args:
-      inputs: Tensor of shape [batch_size, height, width, input_size]. Inputs to
-        the convolution.
-      outputs: Tensor of shape [batch_size, height, width, output_size]. Layer
-        preactivations.
-    """
-    self._inputs.append(inputs)
-    self._outputs.append(outputs)
-
-  @property
-  def num_registered_minibatches(self):
-    return len(self._inputs)
+    damping_id = (self._num_locations, "mult", "normalize_damping", damping,
+                  self._num_locations)
+    self._damping_func = _package_func(damping_func, damping_id)
 
 
 class KroneckerProductFB(FisherBlock):
-  """A base class for FisherBlocks with separate input and output factors.
+  """A base class for blocks with separate input and output Kronecker factors.
 
   The Fisher block is approximated as a Kronecker product of the input and
   output factors.
   """
 
-  def _register_damped_input_and_output_inverses(self, damping):
-    """Registers damped inverses for both the input and output factors.
+  def __init__(self, layer_collection):
+    super(KroneckerProductFB, self).__init__(layer_collection)
+
+  def _setup_damping(self, damping, normalization=None):
+    """Makes functions that compute the damping values for both factors."""
+    def compute_damping():
+      if normalization is not None:
+        maybe_normalized_damping = normalize_damping(damping, normalization)
+      else:
+        maybe_normalized_damping = damping
+
+      return compute_pi_adjusted_damping(
+          self._input_factor.get_cov_as_linear_operator(),
+          self._output_factor.get_cov_as_linear_operator(),
+          maybe_normalized_damping**0.5)
+
+    if normalization is not None:
+      damping_id = ("compute_pi_adjusted_damping",
+                    "cov", self._input_factor.name,
+                    "cov", self._output_factor.name,
+                    "normalize_damping", damping, normalization, "power", 0.5)
+    else:
+      damping_id = ("compute_pi_adjusted_damping",
+                    "cov", self._input_factor.name,
+                    "cov", self._output_factor.name,
+                    damping, "power", 0.5)
 
-    Sets the instance members _input_damping and _output_damping. Requires the
-    instance members _input_factor and _output_factor.
+    self._input_damping_func = _package_func(lambda: compute_damping()[0],
+                                             damping_id + ("ref", 0))
+    self._output_damping_func = _package_func(lambda: compute_damping()[1],
+                                              damping_id + ("ref", 1))
 
-    Args:
-      damping: The base damping factor (float or Tensor) for the damped inverse.
-    """
-    self._input_damping, self._output_damping = compute_pi_adjusted_damping(
-        self._input_factor.get_cov(),
-        self._output_factor.get_cov(),
-        damping**0.5)
+  def register_matpower(self, exp):
+    self._input_factor.register_matpower(exp, self._input_damping_func)
+    self._output_factor.register_matpower(exp, self._output_damping_func)
+
+  def register_cholesky(self):
+    self._input_factor.register_cholesky(self._input_damping_func)
+    self._output_factor.register_cholesky(self._output_damping_func)
 
-    self._input_factor.register_damped_inverse(self._input_damping)
-    self._output_factor.register_damped_inverse(self._output_damping)
+  def register_cholesky_inverse(self):
+    self._input_factor.register_cholesky_inverse(self._input_damping_func)
+    self._output_factor.register_cholesky_inverse(self._output_damping_func)
 
   @property
   def _renorm_coeff(self):
@@ -552,29 +729,46 @@ class KroneckerProductFB(FisherBlock):
     """
     return 1.0
 
-  def multiply_inverse(self, vector):
+  def _multiply_factored_matrix(self, left_factor, right_factor, vector,
+                                extra_scale=1.0, transpose_left=False,
+                                transpose_right=False):
     reshaped_vector = utils.layer_params_to_mat2d(vector)
-    reshaped_out = self._output_factor.right_multiply_inverse(
-        reshaped_vector,
-        self._output_damping)
-    reshaped_out = self._input_factor.left_multiply_inverse(
-        reshaped_out, self._input_damping)
-    if self._renorm_coeff != 1.0:
-      reshaped_out /= math_ops.cast(
-          self._renorm_coeff, dtype=reshaped_out.dtype)
+    reshaped_out = right_factor.matmul_right(reshaped_vector,
+                                             adjoint=transpose_right)
+    reshaped_out = left_factor.matmul(reshaped_out,
+                                      adjoint=transpose_left)
+    if extra_scale != 1.0:
+      reshaped_out *= math_ops.cast(extra_scale, dtype=reshaped_out.dtype)
     return utils.mat2d_to_layer_params(vector, reshaped_out)
 
-  def multiply(self, vector):
-    reshaped_vector = utils.layer_params_to_mat2d(vector)
-    reshaped_out = self._output_factor.right_multiply(
-        reshaped_vector,
-        self._output_damping)
-    reshaped_out = self._input_factor.left_multiply(
-        reshaped_out, self._input_damping)
-    if self._renorm_coeff != 1.0:
-      reshaped_out *= math_ops.cast(
-          self._renorm_coeff, dtype=reshaped_out.dtype)
-    return utils.mat2d_to_layer_params(vector, reshaped_out)
+  def multiply_matpower(self, vector, exp):
+    left_factor = self._input_factor.get_matpower(
+        exp, self._input_damping_func)
+    right_factor = self._output_factor.get_matpower(
+        exp, self._output_damping_func)
+    extra_scale = float(self._renorm_coeff)**exp
+    return self._multiply_factored_matrix(left_factor, right_factor, vector,
+                                          extra_scale=extra_scale)
+
+  def multiply_cholesky(self, vector, transpose=False):
+    left_factor = self._input_factor.get_cholesky(self._input_damping_func)
+    right_factor = self._output_factor.get_cholesky(self._output_damping_func)
+    extra_scale = float(self._renorm_coeff)**0.5
+    return self._multiply_factored_matrix(left_factor, right_factor, vector,
+                                          extra_scale=extra_scale,
+                                          transpose_left=transpose,
+                                          transpose_right=not transpose)
+
+  def multiply_cholesky_inverse(self, vector, transpose=False):
+    left_factor = self._input_factor.get_cholesky_inverse(
+        self._input_damping_func)
+    right_factor = self._output_factor.get_cholesky_inverse(
+        self._output_damping_func)
+    extra_scale = float(self._renorm_coeff)**-0.5
+    return self._multiply_factored_matrix(left_factor, right_factor, vector,
+                                          extra_scale=extra_scale,
+                                          transpose_left=transpose,
+                                          transpose_right=not transpose)
 
   def full_fisher_block(self):
     """Explicitly constructs the full Fisher block.
@@ -584,16 +778,16 @@ class KroneckerProductFB(FisherBlock):
     Returns:
       The full Fisher block.
     """
-    left_factor = self._input_factor.get_cov()
-    right_factor = self._output_factor.get_cov()
+    left_factor = self._input_factor.get_cov_as_linear_operator().to_dense()
+    right_factor = self._output_factor.get_cov_as_linear_operator().to_dense()
     return self._renorm_coeff * utils.kronecker_product(left_factor,
                                                         right_factor)
 
 
-class EmbeddingKFACFB(KroneckerProductFB):
+class EmbeddingKFACFB(InputOutputMultiTower, KroneckerProductFB):
   """K-FAC FisherBlock for embedding layers.
 
-  This FisherBlock is similar to EmbeddingKFACFB, except that its
+  This FisherBlock is similar to FullyConnectedKFACBasicFB, except that its
   input factor is approximated by a diagonal matrix. In the case that each
   example references exactly one embedding, this approximation is exact.
 
@@ -608,8 +802,6 @@ class EmbeddingKFACFB(KroneckerProductFB):
           Fisher information matrix to which this FisherBlock belongs.
       vocab_size: int. Size of vocabulary for this embedding layer.
     """
-    self._inputs = []
-    self._outputs = []
     self._vocab_size = vocab_size
 
     super(EmbeddingKFACFB, self).__init__(layer_collection)
@@ -624,41 +816,17 @@ class EmbeddingKFACFB(KroneckerProductFB):
       damping: 0-D Tensor or float. 'damping' * identity is approximately added
         to this FisherBlock's Fisher approximation.
     """
-    # TODO(b/68033310): Validate which of,
-    #   (1) summing on a single device (as below), or
-    #   (2) on each device in isolation and aggregating
-    # is faster.
-    inputs = _concat_along_batch_dim(self._inputs)
-    grads_list = tuple(_concat_along_batch_dim(grads) for grads in grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.EmbeddingInputKroneckerFactor,  #
-        ((inputs,), self._vocab_size))
-    self._output_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.FullyConnectedKroneckerFactor,  #
-        (grads_list,))
-    self._register_damped_input_and_output_inverses(damping)
-
-  def tensors_to_compute_grads(self):
-    return self._outputs
-
-  def register_additional_minibatch(self, inputs, outputs):
-    """Registers an additional minibatch to the FisherBlock.
-
-    Args:
-      inputs: Tensor of shape [batch_size, input_size]. Inputs to the
-        matrix-multiply.
-      outputs: Tensor of shape [batch_size, output_size]. Layer preactivations.
-    """
-    self._inputs.append(inputs)
-    self._outputs.append(outputs)
+    inputs, grads_list = self._process_data(grads_list)
 
-  @property
-  def num_registered_minibatches(self):
-    return len(self._inputs)
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.EmbeddingInputKroneckerFactor,
+        (inputs, self._vocab_size))
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedKroneckerFactor, (grads_list,))
+    self._setup_damping(damping)
 
 
-class FullyConnectedKFACBasicFB(KroneckerProductFB):
+class FullyConnectedKFACBasicFB(InputOutputMultiTower, KroneckerProductFB):
   """K-FAC FisherBlock for fully-connected (dense) layers.
 
   This uses the Kronecker-factorized approximation from the original
@@ -674,8 +842,6 @@ class FullyConnectedKFACBasicFB(KroneckerProductFB):
       has_bias: Whether the component Kronecker factors have an additive bias.
           (Default: False)
     """
-    self._inputs = []
-    self._outputs = []
     self._has_bias = has_bias
 
     super(FullyConnectedKFACBasicFB, self).__init__(layer_collection)
@@ -690,42 +856,19 @@ class FullyConnectedKFACBasicFB(KroneckerProductFB):
       damping: 0-D Tensor or float. 'damping' * identity is approximately added
         to this FisherBlock's Fisher approximation.
     """
-    # TODO(b/68033310): Validate which of,
-    #   (1) summing on a single device (as below), or
-    #   (2) on each device in isolation and aggregating
-    # is faster.
-    inputs = _concat_along_batch_dim(self._inputs)
-    grads_list = tuple(_concat_along_batch_dim(grads) for grads in grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.FullyConnectedKroneckerFactor,  #
+    inputs, grads_list = self._process_data(grads_list)
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedKroneckerFactor,
         ((inputs,), self._has_bias))
-    self._output_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.FullyConnectedKroneckerFactor,  #
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedKroneckerFactor,
         (grads_list,))
-    self._register_damped_input_and_output_inverses(damping)
-
-  def tensors_to_compute_grads(self):
-    return self._outputs
-
-  def register_additional_minibatch(self, inputs, outputs):
-    """Registers an additional minibatch to the FisherBlock.
-
-    Args:
-      inputs: Tensor of shape [batch_size, input_size]. Inputs to the
-        matrix-multiply.
-      outputs: Tensor of shape [batch_size, output_size]. Layer preactivations.
-    """
-    self._inputs.append(inputs)
-    self._outputs.append(outputs)
-
-  @property
-  def num_registered_minibatches(self):
-    return len(self._inputs)
+    self._setup_damping(damping)
 
 
-class ConvKFCBasicFB(KroneckerProductFB):
-  """FisherBlock for 2D convolutional layers using the basic KFC approx.
+class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
+  r"""FisherBlock for convolutional layers using the basic KFC approx.
 
   Estimates the Fisher Information matrix's blog for a convolutional
   layer.
@@ -734,12 +877,12 @@ class ConvKFCBasicFB(KroneckerProductFB):
   'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
   this FisherBlock estimates,
 
-    F(w) = #locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])
+    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
+                                  E[flat(ds) flat(ds)^T])$$
 
   where
 
-    ds = (d / ds) log p(y | x, w)
+    $$ds = (d / ds) log p(y | x, w)$$
     #locations = number of (x, y) locations where 'w' is applied.
 
   where the expectation is taken over all examples and locations and flat()
@@ -748,23 +891,40 @@ class ConvKFCBasicFB(KroneckerProductFB):
   See equation 23 in https://arxiv.org/abs/1602.01407 for details.
   """
 
-  def __init__(self, layer_collection, params, strides, padding):
+  def __init__(self,
+               layer_collection,
+               params,
+               padding,
+               strides=None,
+               dilation_rate=None,
+               data_format=None,
+               extract_patches_fn=None):
     """Creates a ConvKFCBasicFB block.
 
     Args:
       layer_collection: The collection of all layers in the K-FAC approximate
           Fisher information matrix to which this FisherBlock belongs.
       params: The parameters (Tensor or tuple of Tensors) of this layer. If
-        kernel alone, a Tensor of shape [kernel_height, kernel_width,
+        kernel alone, a Tensor of shape [..spatial_filter_shape..,
         in_channels, out_channels]. If kernel and bias, a tuple of 2 elements
         containing the previous and a Tensor of shape [out_channels].
-      strides: The stride size in this layer (1-D Tensor of length 4).
-      padding: The padding in this layer (1-D of Tensor length 4).
+      padding: str. Padding method.
+      strides: List of ints or None. Contains [..spatial_filter_strides..] if
+        'extract_patches_fn' is compatible with tf.nn.convolution(), else
+        [1, ..spatial_filter_strides, 1].
+      dilation_rate: List of ints or None. Rate for dilation along each spatial
+        dimension if 'extract_patches_fn' is compatible with
+        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
+      data_format: str or None. Format of input data.
+      extract_patches_fn: str or None. Name of function that extracts image
+        patches. One of "extract_convolution_patches", "extract_image_patches",
+        "extract_pointwise_conv2d_patches".
     """
-    self._inputs = []
-    self._outputs = []
-    self._strides = tuple(strides) if isinstance(strides, list) else strides
     self._padding = padding
+    self._strides = maybe_tuple(strides)
+    self._dilation_rate = maybe_tuple(dilation_rate)
+    self._data_format = data_format
+    self._extract_patches_fn = extract_patches_fn
     self._has_bias = isinstance(params, (tuple, list))
 
     fltr = params[0] if self._has_bias else params
@@ -773,145 +933,610 @@ class ConvKFCBasicFB(KroneckerProductFB):
     super(ConvKFCBasicFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    # TODO(b/68033310): Validate which of,
-    #   (1) summing on a single device (as below), or
-    #   (2) on each device in isolation and aggregating
-    # is faster.
-    inputs = _concat_along_batch_dim(self._inputs)
-    grads_list = tuple(_concat_along_batch_dim(grads) for grads in grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
     # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(inputs.shape.as_list(),
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
                                              self._strides)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvInputKroneckerFactor,
-        (inputs, self._filter_shape, self._strides, self._padding,
+        (inputs, self._filter_shape, self._padding, self._strides,
+         self._dilation_rate, self._data_format, self._extract_patches_fn,
          self._has_bias))
     self._output_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvOutputKroneckerFactor, (grads_list,))
 
-    damping = normalize_damping(damping, self._num_locations)
-    self._register_damped_input_and_output_inverses(damping)
-    self._damping = damping
+    self._setup_damping(damping, normalization=self._num_locations)
 
   @property
   def _renorm_coeff(self):
     return self._num_locations
 
-  def tensors_to_compute_grads(self):
-    return self._outputs
 
-  def register_additional_minibatch(self, inputs, outputs):
-    """Registers an additional minibatch to the FisherBlock.
+class DepthwiseConvDiagonalFB(ConvDiagonalFB):
+  """FisherBlock for depthwise_conv2d().
+
+  Equivalent to ConvDiagonalFB applied to each input channel in isolation.
+  """
+
+  def __init__(self,
+               layer_collection,
+               params,
+               strides,
+               padding,
+               rate=None,
+               data_format=None):
+    """Creates a DepthwiseConvKFCBasicFB block.
 
     Args:
-      inputs: Tensor of shape [batch_size, height, width, input_size]. Inputs to
-        the convolution.
-      outputs: Tensor of shape [batch_size, height, width, output_size]. Layer
-        preactivations.
+      layer_collection: The collection of all layers in the K-FAC approximate
+          Fisher information matrix to which this FisherBlock belongs.
+      params: Tensor of shape [filter_height, filter_width, in_channels,
+        channel_multiplier].
+      strides: List of 4 ints. Strides along all dimensions.
+      padding: str. Padding method.
+      rate: List of 4 ints or None. Rate for dilation along all dimensions.
+      data_format: str or None. Format of input data.
+
+    Raises:
+      NotImplementedError: If parameters contains bias.
+      ValueError: If filter is not 4-D.
+      ValueError: If strides is not length-4.
+      ValueError: If rates is not length-2.
+      ValueError: If channels are not last dimension.
     """
-    self._inputs.append(inputs)
-    self._outputs.append(outputs)
+    if isinstance(params, (tuple, list)):
+      raise NotImplementedError("Bias not yet supported.")
 
-  @property
-  def num_registered_minibatches(self):
-    return len(self._inputs)
+    if params.shape.ndims != 4:
+      raise ValueError("Filter must be 4-D.")
+
+    if len(strides) != 4:
+      raise ValueError("strides must account for 4 dimensions.")
+
+    if rate is not None:
+      if len(rate) != 2:
+        raise ValueError("rate must only account for spatial dimensions.")
+      rate = [1, rate[0], rate[1], 1]  # conv2d expects 4-element rate.
+
+    if not utils.is_data_format_channel_last(data_format):
+      raise ValueError("data_format must be channels-last.")
+
+    super(DepthwiseConvDiagonalFB, self).__init__(
+        layer_collection=layer_collection,
+        params=params,
+        strides=strides,
+        padding=padding,
+        dilations=rate,
+        data_format=data_format)
+
+    # This is a hack to overwrite the same setting in ConvKFCBasicFB.__init__().
+    filter_height, filter_width, in_channels, channel_multiplier = (
+        params.shape.as_list())
+    self._filter_shape = (filter_height, filter_width, in_channels,
+                          in_channels * channel_multiplier)
+
+  def _multiply_matrix(self, matrix, vector):
+    conv2d_vector = depthwise_conv2d_filter_to_conv2d_filter(vector)
+    conv2d_result = super(
+        DepthwiseConvDiagonalFB, self)._multiply_matrix(matrix, conv2d_vector)
+    return conv2d_filter_to_depthwise_conv2d_filter(conv2d_result)
+
+
+class DepthwiseConvKFCBasicFB(ConvKFCBasicFB):
+  """FisherBlock for depthwise_conv2d().
+
+  Equivalent to ConvKFCBasicFB applied to each input channel in isolation.
+  """
+
+  def __init__(self,
+               layer_collection,
+               params,
+               strides,
+               padding,
+               rate=None,
+               data_format=None):
+    """Creates a DepthwiseConvKFCBasicFB block.
 
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+          Fisher information matrix to which this FisherBlock belongs.
+      params: Tensor of shape [filter_height, filter_width, in_channels,
+        channel_multiplier].
+      strides: List of 4 ints. Strides along all dimensions.
+      padding: str. Padding method.
+      rate: List of 4 ints or None. Rate for dilation along all dimensions.
+      data_format: str or None. Format of input data.
+
+    Raises:
+      NotImplementedError: If parameters contains bias.
+      ValueError: If filter is not 4-D.
+      ValueError: If strides is not length-4.
+      ValueError: If rates is not length-2.
+      ValueError: If channels are not last dimension.
+    """
+    if isinstance(params, (tuple, list)):
+      raise NotImplementedError("Bias not yet supported.")
+
+    if params.shape.ndims != 4:
+      raise ValueError("Filter must be 4-D.")
+
+    if len(strides) != 4:
+      raise ValueError("strides must account for 4 dimensions.")
+
+    if rate is not None:
+      if len(rate) != 2:
+        raise ValueError("rate must only account for spatial dimensions.")
+      rate = [1, rate[0], rate[1], 1]  # conv2d expects 4-element rate.
+
+    if not utils.is_data_format_channel_last(data_format):
+      raise ValueError("data_format must be channels-last.")
+
+    super(DepthwiseConvKFCBasicFB, self).__init__(
+        layer_collection=layer_collection,
+        params=params,
+        padding=padding,
+        strides=strides,
+        dilation_rate=rate,
+        data_format=data_format,
+        extract_patches_fn="extract_image_patches")
+
+    # This is a hack to overwrite the same setting in ConvKFCBasicFB.__init__().
+    filter_height, filter_width, in_channels, channel_multiplier = (
+        params.shape.as_list())
+    self._filter_shape = (filter_height, filter_width, in_channels,
+                          in_channels * channel_multiplier)
+
+  def _multiply_factored_matrix(self, left_factor, right_factor, vector,
+                                extra_scale=1.0, transpose_left=False,
+                                transpose_right=False):
+    conv2d_vector = depthwise_conv2d_filter_to_conv2d_filter(vector)
+    conv2d_result = super(
+        DepthwiseConvKFCBasicFB, self)._multiply_factored_matrix(
+            left_factor, right_factor, conv2d_vector, extra_scale=extra_scale,
+            transpose_left=transpose_left, transpose_right=transpose_right)
+    return conv2d_filter_to_depthwise_conv2d_filter(conv2d_result)
+
+
+def depthwise_conv2d_filter_to_conv2d_filter(filter, name=None):  # pylint: disable=redefined-builtin
+  """Converts a convolution filter for use with conv2d.
+
+  Transforms a filter for use with tf.nn.depthwise_conv2d() to one that's
+  compatible with tf.nn.conv2d().
 
-def _concat_along_batch_dim(tensor_list):
-  """Concatenate tensors along batch (first) dimension.
+  Args:
+    filter: Tensor of shape [height, width, in_channels, channel_multiplier].
+    name: None or str. Name of Op.
+
+  Returns:
+    Tensor of shape [height, width, in_channels, out_channels].
+
+  """
+  with ops.name_scope(name, "depthwise_conv2d_filter_to_conv2d_filter",
+                      [filter]):
+    filter = ops.convert_to_tensor(filter)
+    filter_height, filter_width, in_channels, channel_multiplier = (
+        filter.shape.as_list())
+
+    results = []
+    for i in range(in_channels):
+      # Slice out one in_channel's filter. Insert zeros around it to force it
+      # to affect that channel and that channel alone.
+      elements = []
+      if i > 0:
+        elements.append(
+            array_ops.zeros(
+                [filter_height, filter_width, i, channel_multiplier]))
+      elements.append(filter[:, :, i:(i + 1), :])
+      if i + 1 < in_channels:
+        elements.append(
+            array_ops.zeros([
+                filter_height, filter_width, in_channels - (i + 1),
+                channel_multiplier
+            ]))
+
+      # Concat along in_channel.
+      results.append(
+          array_ops.concat(elements, axis=-2, name="in_channel_%d" % i))
+
+    # Concat along out_channel.
+    return array_ops.concat(results, axis=-1, name="out_channel")
+
+
+def conv2d_filter_to_depthwise_conv2d_filter(filter, name=None):  # pylint: disable=redefined-builtin
+  """Converts a convolution filter for use with depthwise_conv2d.
+
+  Transforms a filter for use with tf.nn.conv2d() to one that's
+  compatible with tf.nn.depthwise_conv2d(). Ignores all filters but those along
+  the diagonal.
 
   Args:
-    tensor_list: list of Tensors or list of tuples of Tensors.
+    filter: Tensor of shape [height, width, in_channels, out_channels].
+    name: None or str. Name of Op.
 
   Returns:
-    Tensor or tuple of Tensors.
+    Tensor of shape,
+      [height, width, in_channels, channel_multiplier]
 
   Raises:
-    ValueError: If 'tensor_list' is empty.
-
+    ValueError: if out_channels is not evenly divisible by in_channels.
   """
-  if not tensor_list:
-    raise ValueError(
-        "Cannot concatenate Tensors if there are no Tensors to concatenate.")
-
-  if isinstance(tensor_list[0], (tuple, list)):
-    # [(tensor1a, tensor1b),
-    #  (tensor2a, tensor2b), ...] --> (tensor_a, tensor_b)
-    return tuple(
-        array_ops.concat(tensors, axis=0) for tensors in zip(*tensor_list))
-  else:
-    # [tensor1, tensor2] --> tensor
-    return array_ops.concat(tensor_list, axis=0)
+  with ops.name_scope(name, "conv2d_filter_to_depthwise_conv2d_filter",
+                      [filter]):
+    filter = ops.convert_to_tensor(filter)
+    filter_height, filter_width, in_channels, out_channels = (
+        filter.shape.as_list())
+
+    if out_channels % in_channels != 0:
+      raise ValueError("out_channels must be evenly divisible by in_channels.")
+    channel_multiplier = out_channels // in_channels
+
+    results = []
+    filter = array_ops.reshape(filter, [
+        filter_height, filter_width, in_channels, in_channels,
+        channel_multiplier
+    ])
+    for i in range(in_channels):
+      # Slice out output corresponding to the correct filter.
+      filter_slice = array_ops.reshape(
+          filter[:, :, i, i, :],
+          [filter_height, filter_width, 1, channel_multiplier])
+      results.append(filter_slice)
+
+    # Concat along out_channel.
+    return array_ops.concat(results, axis=-2, name="in_channels")
+
+
+def maybe_tuple(obj):
+  if not isinstance(obj, list):
+    return obj
+  return tuple(obj)
 
 
 def num_conv_locations(input_shape, strides):
   """Returns the number of spatial locations a 2D Conv kernel is applied to.
 
   Args:
-    input_shape: list representing shape of inputs to the Conv layer.
-    strides: list representing strides for the Conv kernel.
+    input_shape: List of ints representing shape of inputs to
+      tf.nn.convolution().
+    strides: List of ints representing strides along spatial dimensions as
+      passed in to tf.nn.convolution().
 
   Returns:
     A scalar |T| denoting the number of spatial locations for the Conv layer.
   """
-  return input_shape[1] * input_shape[2] // (strides[1] * strides[2])
+  spatial_input_locations = np.prod(input_shape[1:-1])
+
+  if strides is None:
+    spatial_strides_divisor = 1
+  else:
+    spatial_strides_divisor = np.prod(strides)
+
+  return spatial_input_locations // spatial_strides_divisor
+
+
+class InputOutputMultiTowerMultiUse(InputOutputMultiTower):
+  """Adds methods for multi-use/time-step case to InputOutputMultiTower."""
+
+  def __init__(self, num_uses=None, *args, **kwargs):
+    self._num_uses = num_uses
+    super(InputOutputMultiTowerMultiUse, self).__init__(*args, **kwargs)
+
+  def _process_data(self, grads_list):
+    """Process temporal/multi-use data into the format used by the factors.
+
+    This function takes inputs and grads_lists data and processes it into
+    one of the formats expected by the FisherFactor classes (depending on
+    the value of the global configuration variable TOWER_STRATEGY).
+
+    It accepts the data in one of two initial formats. The first possible
+    format is where self._inputs is a list of list of Tensors. The first index
+    is tower, the second is use/time-step. grads_list, meanwhile, is a list
+    over sources of such lists of lists.
+
+    The second possible data format is where self._inputs is a Tensor with
+    uses/times-steps folded into the batch dimension.  i.e. it is a Tensor
+    of shape [num_uses * size_batch, ...] which represents a reshape of a
+    Tensor of shape [num_uses, size_batch, ...].  And similarly grads_list is
+    a list over sources of such Tensors.
 
+    There are two possible formats which inputs and grads_list are transformed
+    into.
 
-class FullyConnectedMultiIndepFB(KroneckerProductFB):
+    If TOWER_STRATEGY is "concat", 'inputs' becomes a tuple containing
+    a single tensor (represented as a PartitionedTensor object) with all of
+    the data from the towers, as well as the uses/time-steps, concatenated
+    together. In this tensor the leading dimension is the batch and
+    use/time-step dimensions folded together (with 'use' being the major of
+    these two, so that the tensors can be thought of as reshapes of ones of
+    shape [num_uses, batch_size, ...]). grads_list is similarly formatted as a
+    tuple over sources of such tensors.
+
+    If TOWER_STRATEGY is "separate" the inputs are formatted into lists of
+    tensors over towers. Each of these tensors has a similar format to
+    the tensor produced by the "concat" option, except that each contains
+    only the data from a single tower.  grads_list is similarly formatted
+    into a tuple over sources of such tuples.
+
+    Args:
+      grads_list: grads_list in its initial format (see above).
+
+    Returns:
+      inputs: self._inputs transformed into the appropriate format (see
+        above).
+      grads_list: grads_list transformed into the appropriate format (see
+        above).
+
+    Raises:
+      ValueError: If TOWER_STRATEGY is not one of "separate" or "concat".
+      ValueError: If the given/initial format of self._inputs and grads_list
+        isn't recognized, or doesn't agree with self._num_uses.
+    """
+
+    inputs = self._inputs
+
+    if isinstance(inputs[0], (list, tuple)):
+      num_uses = len(inputs[0])
+      if self._num_uses is not None and self._num_uses != num_uses:
+        raise ValueError("num_uses argument doesn't match length of inputs.")
+      else:
+        self._num_uses = num_uses
+
+      # Check that all mini-batches/towers have the same number of uses
+      if not all(len(input_) == num_uses for input_ in inputs):
+        raise ValueError("Length of inputs argument is inconsistent across "
+                         "towers.")
+
+      if fisher_factors.TOWER_STRATEGY == "concat":
+        # Reverse the tower and use/time-step indices, so that use is now first,
+        # and towers is second
+        inputs = tuple(zip(*inputs))
+
+        # Flatten the two dimensions
+        inputs = nest.flatten(inputs)
+
+        # Merge everything together into a PartitionedTensor. We package it in
+        # a singleton tuple since the factors will expect a list over towers
+        inputs = (utils.PartitionedTensor(inputs),)
+
+      elif fisher_factors.TOWER_STRATEGY == "separate":
+        # Merge together the uses/time-step dimension into PartitionedTensors,
+        # but keep the leading dimension (towers) intact for the factors to
+        # process individually.
+        inputs = tuple(utils.PartitionedTensor(input_) for input_ in inputs)
+
+      else:
+        raise ValueError("Global config variable TOWER_STRATEGY must be one of "
+                         "'concat' or 'separate'.")
+
+    # Now we perform the analogous processing for grads_list
+    if isinstance(grads_list[0][0], (list, tuple)):
+      num_uses = len(grads_list[0][0])
+      if self._num_uses is not None and self._num_uses != num_uses:
+        raise ValueError("num_uses argument doesn't match length of outputs, "
+                         "or length of outputs is inconsistent with length of "
+                         "inputs.")
+      else:
+        self._num_uses = num_uses
+
+      if not all(len(grad) == num_uses for grads in grads_list
+                 for grad in grads):
+        raise ValueError("Length of outputs argument is inconsistent across "
+                         "towers.")
+
+      if fisher_factors.TOWER_STRATEGY == "concat":
+        # Reverse the tower and use/time-step indices, so that use is now first,
+        # and towers is second
+        grads_list = tuple(tuple(zip(*grads)) for grads in grads_list)
+
+        # Flatten the two dimensions, leaving the leading dimension (source)
+        # intact
+        grads_list = tuple(nest.flatten(grads) for grads in grads_list)
+
+        # Merge inner dimensions together into PartitionedTensors. We package
+        # them in a singleton tuple since the factors will expect a list over
+        # towers
+        grads_list = tuple((utils.PartitionedTensor(grads),)
+                           for grads in grads_list)
+
+      elif fisher_factors.TOWER_STRATEGY == "separate":
+        # Merge together the uses/time-step dimension into PartitionedTensors,
+        # but keep the leading dimension (towers) intact for the factors to
+        # process individually.
+        grads_list = tuple(tuple(utils.PartitionedTensor(grad)
+                                 for grad in grads)
+                           for grads in grads_list)
+
+      else:
+        raise ValueError("Global config variable TOWER_STRATEGY must be one of "
+                         "'concat' or 'separate'.")
+
+    if self._num_uses is None:
+      raise ValueError("You must supply a value for the num_uses argument if "
+                       "the number of uses cannot be inferred from inputs or "
+                       "outputs arguments (e.g. if they are both given in the "
+                       "single Tensor format, instead of as lists of Tensors.")
+
+    return inputs, grads_list
+
+
+class FullyConnectedMultiIndepFB(InputOutputMultiTowerMultiUse,
+                                 KroneckerProductFB):
   """FisherBlock for fully-connected layers that share parameters.
+
+  This class implements the "independence across time" approximation from the
+  following paper:
+    https://openreview.net/pdf?id=HyMTkQZAb
   """
 
-  def __init__(self, layer_collection, inputs, outputs, has_bias=False):
+  def __init__(self, layer_collection, has_bias=False, num_uses=None):
     """Creates a FullyConnectedMultiIndepFB block.
 
     Args:
       layer_collection: LayerCollection instance.
-      inputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
-        inputs_size].
-      outputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
-        outputs_size].
       has_bias: bool. If True, estimates Fisher with respect to a bias
         parameter as well as the layer's parameters.
+      num_uses: int or None. Number of uses of the layer in the model's graph.
+        Only required if the data is formatted with uses/time folded into the
+        batch dimension (instead of uses/time being a list dimension).
+        (Default: None)
     """
-
-    assert len(inputs) == len(outputs)
-    # We need to make sure inputs and outputs are tuples and not lists so that
-    # they get hashed by layer_collection.make_or_get_factor properly.
-    self._inputs = tuple(inputs)
-    self._outputs = tuple(outputs)
     self._has_bias = has_bias
-    self._num_uses = len(inputs)
-
-    super(FullyConnectedMultiIndepFB, self).__init__(layer_collection)
 
-  @property
-  def num_registered_minibatches(self):
-    # TODO(b/69411207): Add support for registering additional minibatches.
-    return 1
+    super(FullyConnectedMultiIndepFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
 
   def instantiate_factors(self, grads_list, damping):
+    inputs, grads_list = self._process_data(grads_list)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.FullyConnectedMultiKF,
-        ((self._inputs,), self._has_bias))
+        ((inputs,), self._num_uses, self._has_bias))
 
     self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
 
-    damping = normalize_damping(damping, self._num_uses)
-    self._register_damped_input_and_output_inverses(damping)
+    self._setup_damping(damping, normalization=self._num_uses)
 
   @property
   def _renorm_coeff(self):
-    return self._num_uses
+    return float(self._num_uses)
 
-  def tensors_to_compute_grads(self):
-    return self._outputs
 
-  def num_inputs(self):
-    return len(self._inputs)
+class ConvKFCBasicMultiIndepFB(InputOutputMultiTowerMultiUse,
+                               KroneckerProductFB):
+  """FisherBlock for 2D convolutional layers using the basic KFC approx.
+
+  Similar to ConvKFCBasicFB except that this version supports multiple
+  uses/time-steps via a standard independence approximation.  Similar to the
+  "independence across time" used in FullyConnectedMultiIndepFB but generalized
+  in the obvious way to conv layers.
+  """
+
+  def __init__(self,
+               layer_collection,
+               params,
+               padding,
+               strides=None,
+               dilation_rate=None,
+               data_format=None,
+               extract_patches_fn=None,
+               num_uses=None):
+    """Creates a ConvKFCBasicMultiIndepFB block.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+          Fisher information matrix to which this FisherBlock belongs.
+      params: The parameters (Tensor or tuple of Tensors) of this layer. If
+        kernel alone, a Tensor of shape [..spatial_filter_shape..,
+        in_channels, out_channels]. If kernel and bias, a tuple of 2 elements
+        containing the previous and a Tensor of shape [out_channels].
+      padding: str. Padding method.
+      strides: List of ints or None. Contains [..spatial_filter_strides..] if
+        'extract_patches_fn' is compatible with tf.nn.convolution(), else
+        [1, ..spatial_filter_strides, 1].
+      dilation_rate: List of ints or None. Rate for dilation along each spatial
+        dimension if 'extract_patches_fn' is compatible with
+        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
+      data_format: str or None. Format of input data.
+      extract_patches_fn: str or None. Name of function that extracts image
+        patches. One of "extract_convolution_patches", "extract_image_patches",
+        "extract_pointwise_conv2d_patches".
+      num_uses: int or None. Number of uses of the layer in the model's graph.
+        Only required if the data is formatted with uses/time folded into the
+        batch dimension (instead of uses/time being a list dimension).
+        (Default: None)
+    """
+    self._padding = padding
+    self._strides = maybe_tuple(strides)
+    self._dilation_rate = maybe_tuple(dilation_rate)
+    self._data_format = data_format
+    self._extract_patches_fn = extract_patches_fn
+    self._has_bias = isinstance(params, (tuple, list))
+
+    fltr = params[0] if self._has_bias else params
+    self._filter_shape = tuple(fltr.shape.as_list())
+
+    super(ConvKFCBasicMultiIndepFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
+
+  def instantiate_factors(self, grads_list, damping):
+    inputs, grads_list = self._process_data(grads_list)
+
+    # Infer number of locations upon which convolution is applied.
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
+                                             self._strides)
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.ConvInputKroneckerFactor,
+        (inputs, self._filter_shape, self._padding, self._strides,
+         self._dilation_rate, self._data_format, self._extract_patches_fn,
+         self._has_bias))
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.ConvOutputKroneckerFactor, (grads_list,))
+
+    self._setup_damping(damping, normalization=
+                        (self._num_locations * self._num_uses))
+
+  @property
+  def _renorm_coeff(self):
+    return self._num_locations * self._num_uses
+
+
+class EmbeddingKFACMultiIndepFB(InputOutputMultiTowerMultiUse,
+                                KroneckerProductFB):
+  """K-FAC FisherBlock for embedding layers used multiple times in the graph.
+
+  Similar to EmbeddingKFACFB except that this version supports multiple uses
+  of the parameter within a single model. These uses could correspond to time
+  steps in an RNN architecture, but they don't have to.
+
+  Does not support bias parameters.
+  """
+
+  def __init__(self, layer_collection, vocab_size, num_uses=None):
+    """Creates a EmbeddingKFACMultiIndepFB block.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+          Fisher information matrix to which this FisherBlock belongs.
+      vocab_size: int. Size of vocabulary for this embedding layer.
+      num_uses: int or None. Number of uses of the layer in the model's graph.
+        Only required if the data is formatted with time folded into the batch
+        dimension (instead of time being a list dimension). (Default: None)
+    """
+    self._vocab_size = vocab_size
+
+    super(EmbeddingKFACMultiIndepFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
+
+  def instantiate_factors(self, grads_list, damping):
+    """Instantiate Kronecker Factors for this FisherBlock.
+
+    Args:
+      grads_list: List of list of list of Tensors. grads_list[i][j][k] is the
+        gradient of the loss with respect to 'outputs' from source 'i',
+        tower/mini-batch 'j', and use/time-step 'k'. Each Tensor has shape
+        [tower_minibatch_size, output_size].
+      damping: 0-D Tensor or float. 'damping' * identity is approximately added
+        to this FisherBlock's Fisher approximation.
+    """
+    inputs, grads_list = self._process_data(grads_list)
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.EmbeddingInputKroneckerFactor,
+        (inputs, self._vocab_size))
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
+    self._setup_damping(damping, normalization=self._num_uses)
+
+  @property
+  def _renorm_coeff(self):
+    return float(self._num_uses)
 
 
 class SeriesFBApproximation(enum.IntEnum):
@@ -920,34 +1545,35 @@ class SeriesFBApproximation(enum.IntEnum):
   option2 = 2
 
 
-class FullyConnectedSeriesFB(FisherBlock):
+class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
+                             KroneckerProductFB):
   """FisherBlock for fully-connected layers that share parameters across time.
 
-  See the following preprint for details:
+  This class implements the "Option 1" and "Option 2" approximation from the
+  following paper:
     https://openreview.net/pdf?id=HyMTkQZAb
 
   See the end of the appendix of the paper for a pseudo-code of the
-  algorithm being implemented by multiply_inverse here.  Note that we are
+  algorithm being implemented by multiply_matpower here.  Note that we are
   using pre-computed versions of certain matrix-matrix products to speed
   things up.  This is explicitly explained wherever it is done.
   """
 
   def __init__(self,
                layer_collection,
-               inputs,
-               outputs,
                has_bias=False,
+               num_uses=None,
                option=SeriesFBApproximation.option2):
     """Constructs a new `FullyConnectedSeriesFB`.
 
     Args:
       layer_collection: The collection of all layers in the K-FAC approximate
         Fisher information matrix to which this FisherBlock belongs.
-      inputs: List of tensors of shape [batch_size, input_size].
-        Inputs to the layer.
-      outputs: List of tensors of shape [batch_size, input_size].
-        Outputs of the layer (before activations).
       has_bias: Whether the layer includes a bias parameter.
+      num_uses: int or None. Number of time-steps over which the layer
+        is used. Only required if the data is formatted with time folded into
+        the batch dimension (instead of time being a list dimension).
+        (Default: None)
       option: A `SeriesFBApproximation` specifying the simplifying assumption
         to be used in this block. `option1` approximates the cross-covariance
         over time as a symmetric matrix, while `option2` makes
@@ -955,48 +1581,58 @@ class FullyConnectedSeriesFB(FisherBlock):
         3.5 of the paper for more details.
     """
 
-    assert len(inputs) == len(outputs)
-    # We need to make sure inputs and outputs are tuples and not lists so that
-    # they get hashed by layer_collection.make_or_get_factor properly.
-    self._inputs = tuple(inputs)
-    self._outputs = tuple(outputs)
     self._has_bias = has_bias
-    self._num_timesteps = len(inputs)
     self._option = option
 
-    super(FullyConnectedSeriesFB, self).__init__(layer_collection)
+    super(FullyConnectedSeriesFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
+
+  @property
+  def _num_timesteps(self):
+    return self._num_uses
 
   @property
-  def num_registered_minibatches(self):
-    # TODO(b/69411207): Add support for registering additional minibatches.
-    return 1
+  def _renorm_coeff(self):
+    # This should no longer be used since the multiply_X functions from the base
+    # class have been overridden
+    assert False
 
   def instantiate_factors(self, grads_list, damping):
+    inputs, grads_list = self._process_data(grads_list)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, ((self._inputs,), self._has_bias))
+        fisher_factors.FullyConnectedMultiKF,
+        ((inputs,), self._num_uses, self._has_bias))
+    self._input_factor.register_cov_dt1()
 
     self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
+    self._output_factor.register_cov_dt1()
 
-    damping = normalize_damping(damping, self._num_timesteps)
-    self._damping_input, self._damping_output = compute_pi_adjusted_damping(
-        self._input_factor.get_cov(),
-        self._output_factor.get_cov(),
-        damping**0.5)
+    self._setup_damping(damping, normalization=self._num_uses)
+
+  def register_matpower(self, exp):
+    if exp != -1:
+      raise NotImplementedError("FullyConnectedSeriesFB only supports inverse"
+                                "multiplications.")
 
     if self._option == SeriesFBApproximation.option1:
-      self._input_factor.register_option1quants(self._damping_input)
-      self._output_factor.register_option1quants(self._damping_output)
+      self._input_factor.register_option1quants(self._input_damping_func)
+      self._output_factor.register_option1quants(self._output_damping_func)
     elif self._option == SeriesFBApproximation.option2:
-      self._input_factor.register_option2quants(self._damping_input)
-      self._output_factor.register_option2quants(self._damping_output)
+      self._input_factor.register_option2quants(self._input_damping_func)
+      self._output_factor.register_option2quants(self._output_damping_func)
     else:
       raise ValueError(
           "Unrecognized FullyConnectedSeriesFB approximation: {}".format(
               self._option))
 
-  def multiply_inverse(self, vector):
+  def multiply_matpower(self, vector, exp):
+    if exp != -1:
+      raise NotImplementedError("FullyConnectedSeriesFB only supports inverse"
+                                "multiplications.")
+
     # pylint: disable=invalid-name
 
     Z = utils.layer_params_to_mat2d(vector)
@@ -1007,9 +1643,11 @@ class FullyConnectedSeriesFB(FisherBlock):
 
     if self._option == SeriesFBApproximation.option1:
 
-      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
-      L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
-      L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)
+      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
+      L_A, psi_A = self._input_factor.get_option1quants(
+          self._input_damping_func)
+      L_G, psi_G = self._output_factor.get_option1quants(
+          self._output_damping_func)
 
       def gamma(x):
         # We are assuming that each case has the same number of time-steps.
@@ -1019,60 +1657,61 @@ class FullyConnectedSeriesFB(FisherBlock):
         T = self._num_timesteps
         return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
 
-      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
       # Even though Y is Z-independent we are recomputing it from the psi's
       # each since Y depends on both A and G quantities, and it is relatively
       # cheap to compute.
       Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
 
-      # Z = L_G^T * Z * L_A
+      # \\(Z = L_G^T * Z * L_A\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = U_G^T * Z * U_A
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = U_G^T * Z * U_A\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
 
-      # Z = Z .* Y
+      # \\(Z = Z .* Y\\)
       Z *= Y
 
-      # Z = L_G * Z * L_A^T
+      # \\(Z = L_G * Z * L_A^T\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = U_G * Z * U_A^T
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = U_G * Z * U_A^T\\)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
 
     elif self._option == SeriesFBApproximation.option2:
 
-      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
-      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
-      P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
+      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
+      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
+      P_A, K_A, mu_A = self._input_factor.get_option2quants(
+          self._input_damping_func)
       P_G, K_G, mu_G = self._output_factor.get_option2quants(
-          self._damping_output)
+          self._output_damping_func)
 
       # Our approach differs superficially from the pseudo-code in the paper
       # in order to reduce the total number of matrix-matrix multiplies.
       # In particular, the first three computations in the pseudo code are
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = Z - hPsi_G^T * Z * hPsi_A
-      # Z = E_G^T * Z * E_A
-      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
-      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
+      # \\(Z = E_G^T * Z * E_A\\)
+      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
+      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
       # the entire computation can be written as
-      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
-      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
-      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
-      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
-      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
+      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
+      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
+      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
+      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
       # This final expression is computed by the following two lines:
-      # Z = Z - P_G * Z * P_A^T
+      # \\(Z = Z - P_G * Z * P_A^T\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # Z = K_G^T * Z * K_A
+      # \\(Z = K_G^T * Z * K_A\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
 
-      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
       # Be careful with the outer product.  We don't want to accidentally
       # make it an inner-product instead.
       tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@@ -1083,13 +1722,13 @@ class FullyConnectedSeriesFB(FisherBlock):
       # We now perform the transpose/reverse version of the operations
       # derived above, whose derivation from the original pseudo-code is
       # analgous.
-      # Z = K_G * Z * K_A^T
+      # \\(Z = K_G * Z * K_A^T\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
 
-      # Z = Z - P_G^T * Z * P_A
+      # \\(Z = Z - P_G^T * Z * P_A\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
 
-      # Z = normalize (1/E[T]) * Z
+      # \\(Z = normalize (1/E[T]) * Z\\)
       # Note that this normalization is done because we compute the statistics
       # by averaging, not summing, over time. (And the gradient is presumably
       # summed over time, not averaged, and thus their scales are different.)
@@ -1102,11 +1741,11 @@ class FullyConnectedSeriesFB(FisherBlock):
 
     # pylint: enable=invalid-name
 
-  def multiply(self, vector):
-    raise NotImplementedError
+  def multiply_cholesky(self, vector):
+    raise NotImplementedError("FullyConnectedSeriesFB does not support "
+                              "Cholesky computations.")
 
-  def tensors_to_compute_grads(self):
-    return self._outputs
+  def multiply_cholesky_inverse(self, vector):
+    raise NotImplementedError("FullyConnectedSeriesFB does not support "
+                              "Cholesky computations.")
 
-  def num_inputs(self):
-    return len(self._inputs)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index 603d8b8b210279ee6d8f1de0ce10869fde23f4d9..b43232dfafaa6d90ca3feda65e5c412d3b755651 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -24,6 +24,7 @@ import contextlib
 import numpy as np
 import six
 
+from tensorflow.contrib.kfac.python.ops import linear_operator as lo
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as tf_ops
@@ -32,17 +33,24 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import moving_averages
+from tensorflow.python.util import nest
+
 
 # Whether to initialize covariance estimators at a zero matrix (or the identity
 # matrix).
-INIT_COVARIANCES_AT_ZERO = False
+INIT_COVARIANCES_AT_ZERO = True
 
 # Whether to zero-debias the moving averages.
-ZERO_DEBIAS = False
+ZERO_DEBIAS = True
+
+# Whether to initialize inverse (and other such matrices computed from the cov
+# matrices) to the zero matrix (or the identity matrix).
+INIT_INVERSES_AT_ZERO = True
 
 # When the number of inverses requested from a FisherFactor exceeds this value,
 # the inverses are computed using an eigenvalue decomposition.
@@ -53,63 +61,99 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
 # matrix powers. Must be nonnegative.
 EIGENVALUE_CLIPPING_THRESHOLD = 0.0
 
-# Colocate the covariance ops and variables with the input tensors for each
-# factor.
-COLOCATE_COV_OPS_WITH_INPUTS = True
+# Used to subsample the flattened extracted image patches. The number of
+# outer products per row of the covariance matrix should not exceed this
+# value. This parameter is used only if `_SUB_SAMPLE_OUTER_PRODUCTS` is True.
+_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = 1
 
+# Used to subsample the inputs passed to the extract image patches. The batch
+# size of number of inputs to extract image patches is multiplied by this
+# factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True.
+_INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5
 
-@contextlib.contextmanager
-def maybe_colocate_with(op):
-  """Context to colocate with `op` if `COLOCATE_COV_OPS_WITH_INPUTS`."""
-  if COLOCATE_COV_OPS_WITH_INPUTS:
-    if isinstance(op, (list, tuple)):
-      with tf_ops.colocate_with(op[0]):
-        yield
-    else:
-      with tf_ops.colocate_with(op):
-        yield
-  else:
-    yield
+# If True, then subsamples the tensor passed to compute the covaraince matrix.
+_SUB_SAMPLE_OUTER_PRODUCTS = False
+
+# If True, then subsamples the tensor passed to compute the covaraince matrix.
+_SUB_SAMPLE_INPUTS = False
+
+# TOWER_STRATEGY can be one of "concat" or "separate".  If "concat", the data
+# passed to the factors from the blocks will be concatenated across towers
+# (lazilly via PartitionedTensor objects).  Otherwise a tuple of tensors over
+# towers will be passed in, and the factors will iterate over this and do the
+# cov computations separately for each one, averaging the results together.
+TOWER_STRATEGY = "concat"
 
 
 def set_global_constants(init_covariances_at_zero=None,
                          zero_debias=None,
+                         init_inverses_at_zero=None,
                          eigenvalue_decomposition_threshold=None,
                          eigenvalue_clipping_threshold=None,
-                         colocate_cov_ops_with_inputs=None):
+                         max_num_outer_products_per_cov_row=None,
+                         sub_sample_outer_products=None,
+                         inputs_to_extract_patches_factor=None,
+                         sub_sample_inputs=None,
+                         tower_strategy=None):
   """Sets various global constants used by the classes in this module."""
   global INIT_COVARIANCES_AT_ZERO
   global ZERO_DEBIAS
+  global INIT_INVERSES_AT_ZERO
   global EIGENVALUE_DECOMPOSITION_THRESHOLD
   global EIGENVALUE_CLIPPING_THRESHOLD
-  global COLOCATE_COV_OPS_WITH_INPUTS
+  global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW
+  global _SUB_SAMPLE_OUTER_PRODUCTS
+  global _INPUTS_TO_EXTRACT_PATCHES_FACTOR
+  global _SUB_SAMPLE_INPUTS
+  global TOWER_STRATEGY
 
   if init_covariances_at_zero is not None:
     INIT_COVARIANCES_AT_ZERO = init_covariances_at_zero
   if zero_debias is not None:
     ZERO_DEBIAS = zero_debias
+  if init_inverses_at_zero is not None:
+    INIT_INVERSES_AT_ZERO = init_inverses_at_zero
   if eigenvalue_decomposition_threshold is not None:
     EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold
   if eigenvalue_clipping_threshold is not None:
     EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold
-  if colocate_cov_ops_with_inputs is not None:
-    COLOCATE_COV_OPS_WITH_INPUTS = colocate_cov_ops_with_inputs
+  if max_num_outer_products_per_cov_row is not None:
+    _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row
+  if sub_sample_outer_products is not None:
+    _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products
+  if inputs_to_extract_patches_factor is not None:
+    _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_patches_factor
+  if sub_sample_inputs is not None:
+    _SUB_SAMPLE_INPUTS = sub_sample_inputs
+  if tower_strategy is not None:
+    TOWER_STRATEGY = tower_strategy
 
 
 def inverse_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
-  return array_ops.diag(array_ops.ones(shape[0], dtype))
+  if INIT_INVERSES_AT_ZERO:
+    return array_ops.zeros(shape, dtype=dtype)
+  return linalg_ops.eye(num_rows=shape[0], dtype=dtype)
 
 
 def covariance_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
   if INIT_COVARIANCES_AT_ZERO:
-    return array_ops.diag(array_ops.zeros(shape[0], dtype))
-  return array_ops.diag(array_ops.ones(shape[0], dtype))
+    return array_ops.zeros(shape, dtype=dtype)
+  return linalg_ops.eye(num_rows=shape[0], dtype=dtype)
 
 
-def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: disable=unused-argument
+def diagonal_covariance_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
   if INIT_COVARIANCES_AT_ZERO:
-    return array_ops.zeros(shape, dtype)
-  return array_ops.ones(shape, dtype)
+    return array_ops.zeros(shape, dtype=dtype)
+  return array_ops.ones(shape, dtype=dtype)
+
+
+@contextlib.contextmanager
+def place_on_device(device):
+  if device is not None and len(device):
+    with tf_ops.device(device):
+      yield
+  else:
+    yield
 
 
 def compute_cov(tensor, tensor_right=None, normalizer=None):
@@ -181,7 +225,9 @@ def scope_string_from_params(params):
 
   name_parts = []
   for param in params:
-    if isinstance(param, (tuple, list)):
+    if param is None:
+      name_parts.append("None")
+    elif isinstance(param, (tuple, list)):
       if all([isinstance(p, int) for p in param]):
         name_parts.append("-".join([str(p) for p in param]))
       else:
@@ -190,6 +236,8 @@ def scope_string_from_params(params):
       name_parts.append(str(param))
     elif isinstance(param, (tf_ops.Tensor, variables.Variable)):
       name_parts.append(scope_string_from_name(param))
+    elif isinstance(param, utils.PartitionedTensor):
+      name_parts.append(scope_string_from_name(param.tensors))
     else:
       raise ValueError("Encountered an unsupported param type {}".format(
           type(param)))
@@ -207,6 +255,74 @@ def scalar_or_tensor_to_string(val):
   return repr(val) if np.isscalar(val) else scope_string_from_name(val)
 
 
+def list_to_string(lst):
+  return "_".join(val if isinstance(val, six.string_types)
+                  else scalar_or_tensor_to_string(val) for val in lst)
+
+
+def graph_func_to_id(func):
+  """Returns a hashable object that represents func's computation."""
+  # TODO(b/74201126): replace with Topohash of func's output
+  return func.func_id
+
+
+def graph_func_to_string(func):
+  # TODO(b/74201126): replace with Topohash of func's output
+  return list_to_string(func.func_id)
+
+
+def _subsample_for_cov_computation(array, name=None):
+  """Subsamples the first dimension of the array.
+
+  `array`(A) is a tensor of shape `[batch_size, dim_2]`. Then the covariance
+  matrix(A^TA) is of shape `dim_2 ** 2`. Subsample only if the number of outer
+  products per row of the covariance matrix is greater than
+  `_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW`.
+
+  Args:
+    array: Tensor, of shape `[batch_size, dim_2]`.
+    name: `string`, Default(None)
+
+  Returns:
+    A tensor of shape `[max_samples, dim_2]`.
+
+  Raises:
+    ValueError: If array's is not matrix-shaped.
+    ValueError: If array's batch_size cannot be inferred.
+
+  """
+  with tf_ops.name_scope(name, "subsample", [array]):
+    array = tf_ops.convert_to_tensor(array)
+    if len(array.shape) != 2:
+      raise ValueError("Input param array must be a matrix.")
+
+    batch_size = array.shape.as_list()[0]
+    if batch_size is None:
+      raise ValueError("Unable to get batch_size from input param array.")
+
+    num_cov_rows = array.shape.as_list()[-1]
+    max_batch_size = int(_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW * num_cov_rows)
+    if batch_size <= max_batch_size:
+      return array
+
+    return _random_tensor_gather(array, max_batch_size)
+
+
+def _random_tensor_gather(array, max_size):
+  """Generates a random set of indices and gathers the value at the indcices.
+
+  Args:
+    array: Tensor, of shape `[batch_size, dim_2]`.
+    max_size: int, Number of indices to sample.
+
+  Returns:
+    A tensor of shape `[max_size, ...]`.
+  """
+  batch_size = array.shape.as_list()[0]
+  indices = random_ops.random_shuffle(math_ops.range(0, batch_size))[:max_size]
+  return array_ops.gather(array, indices)
+
+
 @six.add_metaclass(abc.ABCMeta)
 class FisherFactor(object):
   """Base class for objects modeling factors of approximate Fisher blocks.
@@ -223,13 +339,10 @@ class FisherFactor(object):
   Note that for blocks that aren't based on approximations, a 'factor' can
   be the entire block itself, as is the case for the diagonal and full
   representations.
-
-  Subclasses must implement the _compute_new_cov() method, and the _var_scope
-  and _cov_shape properties.
   """
 
   def __init__(self):
-    self.instantiate_covariance()
+    self._cov = None
 
   @abc.abstractproperty
   def _var_scope(self):
@@ -240,6 +353,10 @@ class FisherFactor(object):
     """
     pass
 
+  @property
+  def name(self):
+    return self._var_scope
+
   @abc.abstractproperty
   def _cov_shape(self):
     """The shape of the variable backing this FisherFactor."""
@@ -257,6 +374,10 @@ class FisherFactor(object):
     """
     pass
 
+  @abc.abstractproperty
+  def _num_towers(self):
+    pass
+
   @abc.abstractproperty
   def _dtype(self):
     """dtype for variable backing this factor."""
@@ -267,8 +388,9 @@ class FisherFactor(object):
     """Function for initializing covariance variable."""
     return covariance_initializer
 
-  def instantiate_covariance(self):
-    """Instantiates the covariance Variable as the instance member _cov."""
+  def instantiate_cov_variables(self):
+    """Makes the internal cov variable(s)."""
+    assert self._cov is None
     with variable_scope.variable_scope(self._var_scope):
       self._cov = variable_scope.get_variable(
           "cov",
@@ -278,15 +400,17 @@ class FisherFactor(object):
           dtype=self._dtype)
 
   @abc.abstractmethod
-  def _compute_new_cov(self, idx=0):
+  def _compute_new_cov(self, source, tower):
     """Computes minibatch-estimated covariance for a single source.
 
     Args:
-      idx: int in [0, self._num_sources). Which source to use when estimating
-        covariance.
+      source: int in [0, self._num_sources). Which source to use when computing
+        the cov update.
+      tower: int in [0, self._num_towers). Which tower to use when computing
+        the cov update.
 
     Returns:
-      Tensor of same shape as self.get_cov_var().
+      Tensor of same shape as self.get_cov().
     """
     pass
 
@@ -298,123 +422,80 @@ class FisherFactor(object):
     Returns:
       An Op for updating the covariance Variable referenced by _cov.
     """
-    new_cov_contribs = tuple(self._compute_new_cov(idx)
-                             for idx in range(self._num_sources))
-    # This gets the job done but we might want a better solution in the future.
-    # In particular, we could have a separate way of specifying where the
-    # the cov variables finally end up, independent of where their various
-    # contributions are computed.  Right now these are the same thing, but in
-    # the future we might want to perform the cov computations on each tower,
-    # so that each tower will be considered a "source" (allowing us to reuse
-    # the existing "source" code for this).
-    with maybe_colocate_with(new_cov_contribs[0]):
-      new_cov = math_ops.add_n(new_cov_contribs)
-      # Synchronize value across all TPU cores.
-      if utils.on_tpu():
-        new_cov = utils.cross_replica_mean(new_cov)
-      return moving_averages.assign_moving_average(
-          self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
+    new_cov_contribs = []
+    for source in range(self._num_sources):
+      for tower in range(self._num_towers):
+        device = (self._get_data_device(tower)
+                  if TOWER_STRATEGY == "separate" else None)
+        with place_on_device(device):
+          new_cov_contribs.append(self._compute_new_cov(source, tower))
+
+    new_cov = math_ops.add_n(new_cov_contribs) / float(self._num_towers)
+
+    # Compute average of 'new_cov' across all TPU cores. On a TPU, each
+    # instance of 'new_cov' will be based on a different minibatch. This ensures
+    # that by the end of assign_moving_average(), all TPU cores see the same
+    # value for self._cov.
+    #
+    # Other implementations of make_covariance_update_op() that accumulate
+    # statistics in other variables should mimic this behavior.
+    if utils.on_tpu():
+      new_cov = utils.cross_replica_mean(new_cov)
+
+    return moving_averages.assign_moving_average(
+        self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
 
   @abc.abstractmethod
-  def make_inverse_update_ops(self):
-    """Create and return update ops corresponding to registered computations."""
+  def _get_data_device(self, tower):
     pass
 
   @abc.abstractmethod
-  def get_cov(self):
-    """Get full covariance matrix.
-
-    Returns:
-      Tensor of shape [n, n]. Represents all parameter-parameter correlations
-      captured by this FisherFactor.
-    """
+  def instantiate_inv_variables(self):
+    """Makes the internal "inverse" variable(s)."""
     pass
 
-  def get_cov_var(self):
-    """Get variable backing this FisherFactor.
-
-    May or may not be the same as self.get_cov()
+  @abc.abstractmethod
+  def make_inverse_update_ops(self):
+    """Create and return update ops corresponding to registered computations."""
+    pass
 
-    Returns:
-      Variable of shape self._cov_shape.
-    """
+  def get_cov(self):
     return self._cov
 
   @abc.abstractmethod
-  def left_multiply(self, x, damping):
-    """Multiplies 'x' by the damped covariance of this factor.
-
-    Let C be the covariance matrix this factor represents, and
-    D = C + damping * I be its damped variant. This method calculates
-    matmul(D, vec(x)).
-
-    Args:
-      x: Tensor. Represents a single vector. Shape depends on implementation.
-      damping: 0-D Tensor. Damping to add to C's diagonal.
-
-    Returns:
-      Tensor of same shape as 'x'.
-    """
+  def get_cov_as_linear_operator(self):
     pass
 
   @abc.abstractmethod
-  def right_multiply(self, x, damping):
-    """Multiplies 'x' by the damped covariance of this factor.
-
-    Let C be the covariance matrix this factor represents, and
-    D = C + damping * I be its damped variant. This method calculates
-    matmul(vec(x), D).
-
-    Args:
-      x: Tensor. Represents a single vector. Shape depends on implementation.
-      damping: 0-D Tensor. Damping to add to C's diagonal.
-
-    Returns:
-      Tensor of same shape as 'x'.
-    """
+  def register_matpower(self, exp, damping_func):
     pass
 
   @abc.abstractmethod
-  def left_multiply_inverse(self, x, damping):
-    """Multiplies 'x' by damped inverse of this factor.
-
-    Let C be the covariance matrix this factor represents and
-    E = inv(C + damping * I) be its damped inverse. This method calculates
-    matmul(E, vec(x)).
-
-    Args:
-      x: Tensor. Represents a single vector. Shape depends on implementation.
-      damping: 0-D Tensor. Damping to add to C's diagonal.
-
-    Returns:
-      Tensor of same shape as 'x'.
-    """
+  def register_cholesky(self, damping_func):
     pass
 
   @abc.abstractmethod
-  def right_multiply_inverse(self, x, damping):
-    """Multiplies 'x' by damped inverse of this factor.
+  def register_cholesky_inverse(self, damping_func):
+    pass
 
-    Let C be the covariance matrix this factor represents and
-    E = inv(C + damping * I) be its damped inverse. This method calculates
-    matmul(vec(x), E).
+  @abc.abstractmethod
+  def get_matpower(self, exp, damping_func):
+    pass
 
-    Args:
-      x: Tensor. Represents a single vector. Shape depends on implementation.
-      damping: 0-D Tensor. Damping to add to C's diagonal.
+  @abc.abstractmethod
+  def get_cholesky(self, damping_func):
+    pass
 
-    Returns:
-      Tensor of same shape as 'x'.
-    """
+  @abc.abstractmethod
+  def get_cholesky_inverse(self, damping_func):
     pass
 
 
-class InverseProvidingFactor(FisherFactor):
-  """Base class for FisherFactors that maintain inverses explicitly.
+class DenseSquareMatrixFactor(FisherFactor):
+  """Base class for FisherFactors that are stored as dense square matrices.
 
-  This class explicitly calculates and stores inverses of covariance matrices
-  provided by the underlying FisherFactor implementation. It is assumed that
-  vectors can be represented as 2-D matrices.
+  This class explicitly calculates and stores inverses of their `cov` matrices,
+  which must be square dense matrices.
 
   Subclasses must implement the _compute_new_cov method, and the _var_scope and
   _cov_shape properties.
@@ -428,47 +509,94 @@ class InverseProvidingFactor(FisherFactor):
   # the latter.
 
   def __init__(self):
-    self._inverses_by_damping = {}
-    self._matpower_by_exp_and_damping = {}
+    self._matpower_by_exp_and_damping = {}  # { (float, hashable): variable }
+    self._matpower_registrations = set()  # { (float, hashable) }
     self._eigendecomp = None
+    self._damping_funcs_by_id = {}  # {hashable: lambda}
+
+    self._cholesky_registrations = set()  # { hashable }
+    self._cholesky_inverse_registrations = set()  # { hashable }
+
+    self._cholesky_by_damping = {}  # { hashable: variable }
+    self._cholesky_inverse_by_damping = {}  # { hashable: variable }
 
-    super(InverseProvidingFactor, self).__init__()
+    super(DenseSquareMatrixFactor, self).__init__()
 
-  def register_damped_inverse(self, damping):
-    """Registers a damped inverse needed by a FisherBlock.
+  def get_cov_as_linear_operator(self):
+    assert self.get_cov().shape.ndims == 2
+    return lo.LinearOperatorFullMatrix(self.get_cov(),
+                                       is_self_adjoint=True,
+                                       is_square=True)
+
+  def _register_damping(self, damping_func):
+    damping_id = graph_func_to_id(damping_func)
+    if damping_id not in self._damping_funcs_by_id:
+      self._damping_funcs_by_id[damping_id] = damping_func
+    return damping_id
+
+  def register_inverse(self, damping_func):
+    # Just for backwards compatibility of some old code and tests
+    self.register_matpower(-1, damping_func)
+
+  def register_matpower(self, exp, damping_func):
+    """Registers a matrix power to be maintained and served on demand.
 
     This creates a variable and signals make_inverse_update_ops to make the
     corresponding update op.  The variable can be read via the method
-    get_inverse.
+    get_matpower.
 
     Args:
-      damping: The damping value (float or Tensor) for this factor.
+      exp: float.  The exponent to use in the matrix power.
+      damping_func: A function that computes a 0-D Tensor or a float which will
+        be the damping value used.  i.e. damping = damping_func().
     """
-    if damping not in self._inverses_by_damping:
-      damping_string = scalar_or_tensor_to_string(damping)
-      with variable_scope.variable_scope(self._var_scope):
-        inv = variable_scope.get_variable(
-            "inv_damp{}".format(damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-      self._inverses_by_damping[damping] = inv
+    if exp == 1.0:
+      return
+
+    damping_id = self._register_damping(damping_func)
 
-  def register_matpower(self, exp, damping):
-    """Registers a matrix power needed by a FisherBlock.
+    if (exp, damping_id) not in self._matpower_registrations:
+      self._matpower_registrations.add((exp, damping_id))
+
+  def register_cholesky(self, damping_func):
+    """Registers a Cholesky factor to be maintained and served on demand.
 
     This creates a variable and signals make_inverse_update_ops to make the
     corresponding update op.  The variable can be read via the method
-    get_matpower.
+    get_cholesky.
 
     Args:
-      exp: The exponent (float or Tensor) to raise the matrix to.
-      damping: The damping value (float or Tensor).
+      damping_func: A function that computes a 0-D Tensor or a float which will
+        be the damping value used.  i.e. damping = damping_func().
     """
-    if (exp, damping) not in self._matpower_by_exp_and_damping:
+    damping_id = self._register_damping(damping_func)
+
+    if damping_id not in self._cholesky_registrations:
+      self._cholesky_registrations.add(damping_id)
+
+  def register_cholesky_inverse(self, damping_func):
+    """Registers an inverse Cholesky factor to be maintained/served on demand.
+
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_cholesky_inverse.
+
+    Args:
+      damping_func: A function that computes a 0-D Tensor or a float which will
+        be the damping value used.  i.e. damping = damping_func().
+    """
+    damping_id = self._register_damping(damping_func)
+
+    if damping_id not in self._cholesky_inverse_registrations:
+      self._cholesky_inverse_registrations.add(damping_id)
+
+  def instantiate_inv_variables(self):
+    """Makes the internal "inverse" variable(s)."""
+
+    for (exp, damping_id) in self._matpower_registrations:
       exp_string = scalar_or_tensor_to_string(exp)
-      damping_string = scalar_or_tensor_to_string(damping)
+      damping_func = self._damping_funcs_by_id[damping_id]
+      damping_string = graph_func_to_string(damping_func)
       with variable_scope.variable_scope(self._var_scope):
         matpower = variable_scope.get_variable(
             "matpower_exp{}_damp{}".format(exp_string, damping_string),
@@ -476,34 +604,62 @@ class InverseProvidingFactor(FisherFactor):
             shape=self._cov_shape,
             trainable=False,
             dtype=self._dtype)
-      self._matpower_by_exp_and_damping[(exp, damping)] = matpower
+      assert (exp, damping_id) not in self._matpower_by_exp_and_damping
+      self._matpower_by_exp_and_damping[(exp, damping_id)] = matpower
+
+    for damping_id in self._cholesky_registrations:
+      damping_func = self._damping_funcs_by_id[damping_id]
+      damping_string = graph_func_to_string(damping_func)
+      with variable_scope.variable_scope(self._var_scope):
+        chol = variable_scope.get_variable(
+            "cholesky_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+      assert damping_id not in self._cholesky_by_damping
+      self._cholesky_by_damping[damping_id] = chol
+
+    for damping_id in self._cholesky_inverse_registrations:
+      damping_func = self._damping_funcs_by_id[damping_id]
+      damping_string = graph_func_to_string(damping_func)
+      with variable_scope.variable_scope(self._var_scope):
+        cholinv = variable_scope.get_variable(
+            "cholesky_inverse_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+      assert damping_id not in self._cholesky_inverse_by_damping
+      self._cholesky_inverse_by_damping[damping_id] = cholinv
 
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
     ops = []
 
-    # We do this to ensure that we don't reuse the eigendecomp from old calls
-    # to make_inverse_update_ops that may be placed on different devices.  This
-    # can happen is the user has both a permanent and lazily constructed
-    # version of the inverse ops (and only uses one of them).
-    self.reset_eigendecomp()
+    num_inverses = sum(1 for (exp, _) in self._matpower_by_exp_and_damping
+                       if exp == -1)
+
+    num_other_matpower = len(self._matpower_by_exp_and_damping) - num_inverses
+
+    other_matrix_power_registered = num_other_matpower >= 1
 
-    num_inverses = len(self._inverses_by_damping)
-    matrix_power_registered = bool(self._matpower_by_exp_and_damping)
     use_eig = (
-        self._eigendecomp or matrix_power_registered or
+        self._eigendecomp or other_matrix_power_registered or
         num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
 
+    # We precompute these so we don't need to evaluate them multiple times (for
+    # each matrix power that uses them)
+    damping_value_by_id = {damping_id: math_ops.cast(
+        self._damping_funcs_by_id[damping_id](), self._dtype)
+                           for damping_id in self._damping_funcs_by_id}
+
     if use_eig:
       eigenvalues, eigenvectors = self.get_eigendecomp()  # pylint: disable=unpacking-non-sequence
 
-      for damping, inv in self._inverses_by_damping.items():
-        ops.append(
-            inv.assign(
-                math_ops.matmul(eigenvectors / (eigenvalues + damping),
-                                array_ops.transpose(eigenvectors))))
-
-      for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
+      for (exp, damping_id), matpower in (
+          self._matpower_by_exp_and_damping.items()):
+        damping = damping_value_by_id[damping_id]
         ops.append(
             matpower.assign(
                 math_ops.matmul(eigenvectors *
@@ -512,30 +668,95 @@ class InverseProvidingFactor(FisherFactor):
       # These ops share computation and should be run on a single device.
       ops = [control_flow_ops.group(*ops)]
     else:
-      for damping, inv in self._inverses_by_damping.items():
-        ops.append(inv.assign(utils.posdef_inv(self._cov, damping)))
-
+      for (exp, damping_id), matpower in (
+          self._matpower_by_exp_and_damping.items()):
+        assert exp == -1
+        damping = damping_value_by_id[damping_id]
+        ops.append(matpower.assign(utils.posdef_inv(self.get_cov(), damping)))
+
+    # TODO(b/77902055): If inverses are being computed with Cholesky's
+    # we can share the work. Instead this code currently just computes the
+    # Cholesky a second time. It does at least share work between requests for
+    # Cholesky's and Cholesky inverses with the same damping id.
+    for damping_id, cholesky_inv in self._cholesky_inverse_by_damping.items():
+      cholesky_ops = []
+
+      damping = damping_value_by_id[damping_id]
+      cholesky_value = utils.cholesky(self.get_cov(), damping)
+
+      if damping_id in self._cholesky_by_damping:
+        cholesky = self._cholesky_by_damping[damping_id]
+        cholesky_ops.append(cholesky.assign(cholesky_value))
+
+      identity = linalg_ops.eye(cholesky_value.shape.as_list()[0],
+                                dtype=cholesky_value.dtype)
+      cholesky_inv_value = linalg_ops.matrix_triangular_solve(cholesky_value,
+                                                              identity)
+      cholesky_ops.append(cholesky_inv.assign(cholesky_inv_value))
+
+      ops.append(control_flow_ops.group(*cholesky_ops))
+
+    for damping_id, cholesky in self._cholesky_by_damping.items():
+      if damping_id not in self._cholesky_inverse_by_damping:
+        damping = damping_value_by_id[damping_id]
+        cholesky_value = utils.cholesky(self.get_cov(), damping)
+        ops.append(cholesky.assign(cholesky_value))
+
+    self._eigendecomp = False
     return ops
 
-  def get_damped_inverse(self, damping):
+  def get_inverse(self, damping_func):
+    # Just for backwards compatibility of some old code and tests
+    return self.get_matpower(-1, damping_func)
+
+  def get_matpower(self, exp, damping_func):
     # Note that this function returns a variable which gets updated by the
     # inverse ops.  It may be stale / inconsistent with the latest value of
     # get_cov().
-    return self._inverses_by_damping[damping]
-
-  def get_matpower(self, exp, damping):
+    if exp != 1:
+      damping_id = graph_func_to_id(damping_func)
+      matpower = self._matpower_by_exp_and_damping[(exp, damping_id)]
+    else:
+      matpower = self.get_cov()
+      identity = linalg_ops.eye(matpower.shape.as_list()[0],
+                                dtype=matpower.dtype)
+      matpower += math_ops.cast(damping_func(), dtype=matpower.dtype)*identity
+
+    assert matpower.shape.ndims == 2
+    return lo.LinearOperatorFullMatrix(matpower,
+                                       is_non_singular=True,
+                                       is_self_adjoint=True,
+                                       is_positive_definite=True,
+                                       is_square=True)
+
+  def get_cholesky(self, damping_func):
     # Note that this function returns a variable which gets updated by the
     # inverse ops.  It may be stale / inconsistent with the latest value of
     # get_cov().
-    return self._matpower_by_exp_and_damping[(exp, damping)]
+    damping_id = graph_func_to_id(damping_func)
+    cholesky = self._cholesky_by_damping[damping_id]
+    assert cholesky.shape.ndims == 2
+    return lo.LinearOperatorFullMatrix(cholesky,
+                                       is_non_singular=True,
+                                       is_square=True)
+
+  def get_cholesky_inverse(self, damping_func):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
+    damping_id = graph_func_to_id(damping_func)
+    cholesky_inv = self._cholesky_inverse_by_damping[damping_id]
+    assert cholesky_inv.shape.ndims == 2
+    return lo.LinearOperatorFullMatrix(cholesky_inv,
+                                       is_non_singular=True,
+                                       is_square=True)
 
   def get_eigendecomp(self):
     """Creates or retrieves eigendecomposition of self._cov."""
-    # Unlike get_inverse and get_matpower this doesn't retrieve a stored
-    # variable, but instead always computes a fresh version from the current
-    # value of get_cov().
+    # Unlike get_matpower this doesn't retrieve a stored variable, but instead
+    # always computes a fresh version from the current value of get_cov().
     if not self._eigendecomp:
-      eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self._cov)
+      eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self.get_cov())
 
       # The matrix self._cov is positive semidefinite by construction, but the
       # numerical eigenvalues could be negative due to numerical errors, so here
@@ -546,66 +767,8 @@ class InverseProvidingFactor(FisherFactor):
 
     return self._eigendecomp
 
-  def reset_eigendecomp(self):
-    self._eigendecomp = None
-
-  def get_cov(self):
-    # Variable contains full covariance matrix.
-    return self.get_cov_var()
-
-  def left_multiply(self, x, damping):
-    n = self.get_cov().shape[0]
-    damped_cov = self.get_cov() + damping * array_ops.eye(n)
-
-    if isinstance(x, tf_ops.IndexedSlices):
-      raise NotImplementedError(
-          "Left-multiply not yet supported for IndexedSlices.")
-
-    if len(x.shape) != 2:
-      raise ValueError(
-          "InverseProvidingFactors apply to matrix-shaped vectors. Found: %s."
-          % (x,))
-
-    return math_ops.matmul(damped_cov, x)
-
-  def right_multiply(self, x, damping):
-    n = self.get_cov().shape[0]
-    damped_cov = self.get_cov() + damping * array_ops.eye(n)
-
-    if isinstance(x, tf_ops.IndexedSlices):
-      return utils.matmul_sparse_dense(x, damped_cov)
-
-    if len(x.shape) != 2:
-      raise ValueError(
-          "InverseProvidingFactors apply to matrix-shaped vectors. Found: %s."
-          % (x,))
-
-    return math_ops.matmul(x, damped_cov)
-
-  def left_multiply_inverse(self, x, damping):
-    if isinstance(x, tf_ops.IndexedSlices):
-      raise ValueError("Left-multiply not yet supported for IndexedSlices.")
 
-    if x.shape.ndims != 2:
-      raise ValueError(
-          "InverseProvidingFactors apply to matrix-shaped vectors. Found: %s."
-          % (x,))
-
-    return math_ops.matmul(self.get_damped_inverse(damping), x)
-
-  def right_multiply_inverse(self, x, damping):
-    if isinstance(x, tf_ops.IndexedSlices):
-      return utils.matmul_sparse_dense(x, self.get_damped_inverse(damping))
-
-    if x.shape.ndims != 2:
-      raise ValueError(
-          "InverseProvidingFactors apply to matrix-shaped vectors. Found: %s."
-          % (x,))
-
-    return math_ops.matmul(x, self.get_damped_inverse(damping))
-
-
-class FullFactor(InverseProvidingFactor):
+class FullFactor(DenseSquareMatrixFactor):
   """FisherFactor for a full matrix representation of the Fisher of a parameter.
 
   Note that this uses the naive "square the sum estimator", and so is applicable
@@ -622,7 +785,7 @@ class FullFactor(InverseProvidingFactor):
 
   @property
   def _var_scope(self):
-    return "ff_full/" + scope_string_from_params(
+    return "ff_full_" + scope_string_from_params(
         [self._params_grads, self._batch_size])
 
   @property
@@ -635,17 +798,25 @@ class FullFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _num_towers(self):
+    return 1
+
   @property
   def _dtype(self):
     return self._params_grads[0][0].dtype
 
-  def _compute_new_cov(self, idx=0):
+  def _compute_new_cov(self, source, tower):
+    assert tower == 0
+
     # This will be a very basic rank 1 estimate
-    with maybe_colocate_with(self._params_grads[idx]):
-      params_grads_flat = utils.tensors_to_column(self._params_grads[idx])
-      return ((params_grads_flat * array_ops.transpose(
-          params_grads_flat)) / math_ops.cast(self._batch_size,
-                                              params_grads_flat.dtype))
+    params_grads_flat = utils.tensors_to_column(self._params_grads[source])
+    return ((params_grads_flat * array_ops.transpose(
+        params_grads_flat)) / math_ops.cast(self._batch_size,
+                                            params_grads_flat.dtype))
+
+  def _get_data_device(self, tower):
+    return None
 
 
 class DiagonalFactor(FisherFactor):
@@ -658,51 +829,49 @@ class DiagonalFactor(FisherFactor):
   def __init__(self):
     super(DiagonalFactor, self).__init__()
 
+  def get_cov_as_linear_operator(self):
+    assert self._matrix_diagonal.shape.ndims == 1
+    return lo.LinearOperatorDiag(self._matrix_diagonal,
+                                 is_self_adjoint=True,
+                                 is_square=True)
+
   @property
   def _cov_initializer(self):
     return diagonal_covariance_initializer
 
+  @property
+  def _matrix_diagonal(self):
+    return array_ops.reshape(self.get_cov(), [-1])
+
   def make_inverse_update_ops(self):
     return []
 
-  def get_cov(self):
-    # self.get_cov() could be any shape, but it must have one entry per
-    # parameter. Flatten it into a vector.
-    cov_diag_vec = array_ops.reshape(self.get_cov_var(), [-1])
-    return array_ops.diag(cov_diag_vec)
-
-  def left_multiply(self, x, damping):
-    damped_cov = self.get_cov_var() + damping
-    if isinstance(x, tf_ops.IndexedSlices):
-      return utils.matmul_diag_sparse(array_ops.reshape(damped_cov, [-1]), x)
-
-    if x.shape != damped_cov.shape:
-      raise ValueError("x (%s) and cov (%s) must have same shape." %
-                       (x, damped_cov))
-
-    return damped_cov * x
-
-  def right_multiply(self, x, damping):
-    raise NotImplementedError("Only left-multiply is currently supported.")
+  def instantiate_inv_variables(self):
+    pass
 
-  def left_multiply_inverse(self, x, damping):
-    inverse = 1. / (self.get_cov_var() + damping)
+  def register_matpower(self, exp, damping_func):
+    pass
 
-    if isinstance(x, tf_ops.IndexedSlices):
-      return utils.matmul_diag_sparse(array_ops.reshape(inverse, [-1]), x)
+  def register_cholesky(self, damping_func):
+    pass
 
-    if x.shape != inverse.shape:
-      raise ValueError("x (%s) and cov (%s) must have same shape." %
-                       (x, inverse))
+  def register_cholesky_inverse(self, damping_func):
+    pass
 
-    return inverse * x
+  def get_matpower(self, exp, damping_func):
+    matpower_diagonal = (self._matrix_diagonal
+                         + math_ops.cast(damping_func(), self._dtype))**exp
+    return lo.LinearOperatorDiag(matpower_diagonal,
+                                 is_non_singular=True,
+                                 is_self_adjoint=True,
+                                 is_positive_definite=True,
+                                 is_square=True)
 
-  def right_multiply_inverse(self, x, damping):
-    raise NotImplementedError("Only left-multiply is currently supported.")
+  def get_cholesky(self, damping_func):
+    return self.get_matpower(0.5, damping_func)
 
-  def register_damped_inverse(self, damping):
-    # DiagonalFactors don't keep explicit inverses.
-    pass
+  def get_cholesky_inverse(self, damping_func):
+    return self.get_matpower(-0.5, damping_func)
 
 
 class NaiveDiagonalFactor(DiagonalFactor):
@@ -730,7 +899,7 @@ class NaiveDiagonalFactor(DiagonalFactor):
 
   @property
   def _var_scope(self):
-    return "ff_naivediag/" + scope_string_from_params(
+    return "ff_naivediag_" + scope_string_from_params(
         [self._params_grads, self._batch_size])
 
   @property
@@ -743,15 +912,23 @@ class NaiveDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _num_towers(self):
+    return 1
+
   @property
   def _dtype(self):
     return self._params_grads[0][0].dtype
 
-  def _compute_new_cov(self, idx=0):
-    with maybe_colocate_with(self._params_grads[idx]):
-      params_grads_flat = utils.tensors_to_column(self._params_grads[idx])
-      return (math_ops.square(params_grads_flat) / math_ops.cast(
-          self._batch_size, params_grads_flat.dtype))
+  def _compute_new_cov(self, source, tower):
+    assert tower == 0
+
+    params_grads_flat = utils.tensors_to_column(self._params_grads[source])
+    return (math_ops.square(params_grads_flat) / math_ops.cast(
+        self._batch_size, params_grads_flat.dtype))
+
+  def _get_data_device(self, tower):
+    return None
 
 
 class EmbeddingInputKroneckerFactor(DiagonalFactor):
@@ -772,8 +949,8 @@ class EmbeddingInputKroneckerFactor(DiagonalFactor):
     """Instantiate EmbeddingInputKroneckerFactor.
 
     Args:
-      input_ids: Tuple of Tensors of shape [batch_size, input_size] and dtype
-        int32.  Indices into embedding matrix.
+      input_ids: List of Tensors of shape [batch_size, input_size] and dtype
+        int32. Indices into embedding matrix. List index is tower.
       vocab_size: int or 0-D Tensor. Maximum value for entries in 'input_ids'.
       dtype: dtype for covariance statistics. Must be a floating point type.
         Defaults to float32.
@@ -786,7 +963,7 @@ class EmbeddingInputKroneckerFactor(DiagonalFactor):
 
   @property
   def _var_scope(self):
-    return "ff_diag_embedding/" + scope_string_from_params(self._input_ids)
+    return "ff_diag_embedding_" + scope_string_from_params(self._input_ids)
 
   @property
   def _cov_shape(self):
@@ -794,42 +971,51 @@ class EmbeddingInputKroneckerFactor(DiagonalFactor):
 
   @property
   def _num_sources(self):
+    return 1
+
+  @property
+  def _num_towers(self):
     return len(self._input_ids)
 
   @property
   def _dtype(self):
     return self._cov_dtype
 
-  def _compute_new_cov(self, idx=0):
-    with maybe_colocate_with(self._input_ids):
-      input_ids = self._input_ids[idx]
-      if len(input_ids.shape) > 2:
-        raise ValueError(
-            "Input to embeddings must have rank <= 2. Found rank %d." % len(
-                input_ids.shape))
+  def _compute_new_cov(self, source, tower):
+    assert source == 0
+
+    input_ids = self._input_ids[tower]
+
+    if len(input_ids.shape) > 2:
+      raise ValueError(
+          "Input to embeddings must have rank <= 2. Found rank %d." % len(
+              input_ids.shape))
 
-      batch_size = array_ops.shape(input_ids)[0]
+    batch_size = array_ops.shape(input_ids)[0]
 
-      # Transform indices into one-hot vectors.
-      #
-      # TODO(b/72714822): There must be a faster way to construct the diagonal
-      # covariance matrix! This operation is O(batch_size * vocab_size), where
-      # it should be O(batch_size * input_size).
-      flat_input_ids = array_ops.reshape(input_ids, [-1])
-      one_hots = array_ops.one_hot(flat_input_ids,
-                                   self._vocab_size)  # [?, vocab_size]
+    # Transform indices into one-hot vectors.
+    #
+    # TODO(b/72714822): There must be a faster way to construct the diagonal
+    # covariance matrix! This operation is O(batch_size * vocab_size), where
+    # it should be O(batch_size * input_size).
+    flat_input_ids = array_ops.reshape(input_ids, [-1])
+    one_hots = array_ops.one_hot(flat_input_ids,
+                                 self._vocab_size)  # [?, vocab_size]
 
-      # Take average across examples. Note that, because all entries have
-      # magnitude zero or one, there's no need to square the entries.
-      #
-      # TODO(b/72714822): Support for SparseTensor, other kinds of aggregation
-      # within an example such as average.
-      #
-      # TODO(b/72714822): Support for partitioned embeddings.
-      new_cov = math_ops.reduce_sum(one_hots, axis=0)  # [vocab_size]
-      new_cov /= math_ops.cast(batch_size, new_cov.dtype)
+    # Take average across examples. Note that, because all entries have
+    # magnitude zero or one, there's no need to square the entries.
+    #
+    # TODO(b/72714822): Support for SparseTensor, other kinds of aggregation
+    # within an example such as average.
+    #
+    # TODO(b/72714822): Support for partitioned embeddings.
+    new_cov = math_ops.reduce_sum(one_hots, axis=0)  # [vocab_size]
+    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
 
-      return new_cov
+    return new_cov
+
+  def _get_data_device(self, tower):
+    return self._input_ids[tower].device
 
 
 class FullyConnectedDiagonalFactor(DiagonalFactor):
@@ -850,58 +1036,75 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
     """Instantiate FullyConnectedDiagonalFactor.
 
     Args:
-      inputs: Tensor of shape [batch_size, input_size]. Inputs to fully
-        connected layer.
-      outputs_grads: List of Tensors of shape [batch_size, output_size].
-        Gradient of loss with respect to layer's preactivations.
+      inputs: List of Tensors of shape [batch_size, input_size]. Inputs to this
+        layer.  List index is towers.
+      outputs_grads: List of Tensors, each of shape [batch_size, output_size],
+        which are the gradients of the loss with respect to the layer's
+        outputs. First index is source, second is tower.
+
       has_bias: bool. If True, append '1' to each input.
     """
     self._inputs = inputs
     self._has_bias = has_bias
     self._outputs_grads = outputs_grads
-    self._batch_size = array_ops.shape(inputs)[0]
     self._squared_inputs = None
 
     super(FullyConnectedDiagonalFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_diagfc/" + scope_string_from_params(
-        (self._inputs,) + tuple(self._outputs_grads))
+    return "ff_diagfc_" + scope_string_from_params(
+        tuple(self._inputs) + tuple(nest.flatten(self._outputs_grads)))
 
   @property
   def _cov_shape(self):
-    input_size = self._inputs.shape[1] + self._has_bias
-    output_size = self._outputs_grads[0].shape[1]
+    input_size = self._inputs[0].shape[1] + self._has_bias
+    output_size = self._outputs_grads[0][0].shape[1]
     return [input_size, output_size]
 
   @property
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _num_towers(self):
+    return len(self._inputs)
+
   @property
   def _dtype(self):
-    return self._outputs_grads[0].dtype
+    return self._outputs_grads[0][0].dtype
+
+  def make_covariance_update_op(self, ema_decay):
+
+    self._squared_inputs = []
+    for tower in range(self._num_towers):
+      inputs = self._inputs[tower]
+
+      with place_on_device(self._get_data_device(tower)):
+        if self._has_bias:
+          inputs = append_homog(inputs)
+        self._squared_inputs.append(math_ops.square(inputs))
+
+    return super(FullyConnectedDiagonalFactor, self).make_covariance_update_op(
+        ema_decay)
+
+  def _compute_new_cov(self, source, tower):
+    batch_size = array_ops.shape(self._squared_inputs[tower])[0]
+    outputs_grad = self._outputs_grads[source][tower]
 
-  def _compute_new_cov(self, idx=0):
     # The well-known special formula that uses the fact that the entry-wise
     # square of an outer product is the outer-product of the entry-wise squares.
     # The gradient is the outer product of the input and the output gradients,
     # so we just square both and then take their outer-product.
-    with maybe_colocate_with(self._outputs_grads[idx]):
-      # We only need to compute squared_inputs once
-      if self._squared_inputs is None:
-        inputs = self._inputs
-        if self._has_bias:
-          inputs = append_homog(self._inputs)
-        self._squared_inputs = math_ops.square(inputs)
+    new_cov = math_ops.matmul(
+        self._squared_inputs[tower],
+        math_ops.square(outputs_grad),
+        transpose_a=True)
+    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
+    return new_cov
 
-      new_cov = math_ops.matmul(
-          self._squared_inputs,
-          math_ops.square(self._outputs_grads[idx]),
-          transpose_a=True)
-      new_cov /= math_ops.cast(self._batch_size, new_cov.dtype)
-      return new_cov
+  def _get_data_device(self, tower):
+    return self._inputs[tower].device
 
 
 class ConvDiagonalFactor(DiagonalFactor):
@@ -913,36 +1116,67 @@ class ConvDiagonalFactor(DiagonalFactor):
                filter_shape,
                strides,
                padding,
+               data_format=None,
+               dilations=None,
                has_bias=False):
     """Creates a ConvDiagonalFactor object.
 
     Args:
-      inputs: Tensor of shape [batch_size, height, width, in_channels].
-        Input activations to this layer.
-      outputs_grads: Tensor of shape [batch_size, height, width, out_channels].
-        Per-example gradients to the loss with respect to the layer's output
-        preactivations.
+      inputs: List of Tensors of shape [batch_size, height, width, in_channels].
+        Input activations to this layer.  List index is towers.
+      outputs_grads: List of Tensors, each of shape [batch_size,
+        height, width, out_channels], which are the gradients of the loss
+        with respect to the layer's outputs.  First index is source, second
+        index is tower.
       filter_shape: Tuple of 4 ints: (kernel_height, kernel_width, in_channels,
         out_channels). Represents shape of kernel used in this layer.
       strides: The stride size in this layer (1-D Tensor of length 4).
       padding: The padding in this layer (1-D of Tensor length 4).
+      data_format: None or str. Format of conv2d inputs.
+      dilations: None or tuple of 4 ints.
       has_bias: Python bool. If True, the layer is assumed to have a bias
         parameter in addition to its filter parameter.
+
+    Raises:
+      ValueError: If inputs, output_grads, and filter_shape do not agree on
+        in_channels or out_channels.
+      ValueError: If strides, dilations are not length-4 lists of ints.
+      ValueError: If data_format does not put channel last.
     """
+    if not utils.is_data_format_channel_last(data_format):
+      raise ValueError("Channel must be last.")
+    if any(input_.shape.ndims != 4 for input_ in inputs):
+      raise ValueError("inputs must be a list of 4-D Tensors.")
+    if any(input_.shape.as_list()[-1] != filter_shape[-2] for input_ in inputs):
+      raise ValueError("inputs and filter_shape must agree on in_channels.")
+    for i, outputs_grad in enumerate(outputs_grads):
+      if any(output_grad.shape.ndims != 4 for output_grad in outputs_grad):
+        raise ValueError("outputs[%d] must be 4-D Tensor." % i)
+      if any(output_grad.shape.as_list()[-1] != filter_shape[-1]
+             for output_grad in outputs_grad):
+        raise ValueError(
+            "outputs[%d] and filter_shape must agree on out_channels." % i)
+    if len(strides) != 4:
+      raise ValueError("strides must be length-4 list of ints.")
+    if dilations is not None and len(dilations) != 4:
+      raise ValueError("dilations must be length-4 list of ints.")
+
     self._inputs = inputs
+    self._outputs_grads = outputs_grads
     self._filter_shape = filter_shape
     self._strides = strides
     self._padding = padding
+    self._data_format = data_format
+    self._dilations = dilations
     self._has_bias = has_bias
-    self._outputs_grads = outputs_grads
     self._patches = None
 
     super(ConvDiagonalFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_convdiag/" + scope_string_from_name(
-        (self._inputs,) + tuple(self._outputs_grads))
+    return "ff_convdiag_" + scope_string_from_params(
+        tuple(self._inputs) + tuple(nest.flatten(self._outputs_grads)))
 
   @property
   def _cov_shape(self):
@@ -956,43 +1190,50 @@ class ConvDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _num_towers(self):
+    return len(self._inputs)
+
   @property
   def _dtype(self):
-    return self._outputs_grads[0].dtype
+    return self._inputs[0].dtype
 
   def make_covariance_update_op(self, ema_decay):
-    with maybe_colocate_with(self._inputs):
-      filter_height, filter_width, _, _ = self._filter_shape
+    filter_height, filter_width, _, _ = self._filter_shape
 
-      # TODO(b/64144716): there is potential here for a big savings in terms
-      # of memory use.
-      patches = array_ops.extract_image_patches(
-          self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=self._strides,
-          rates=[1, 1, 1, 1],
-          padding=self._padding)
-
-      if self._has_bias:
-        patches = append_homog(patches)
-
-      self._patches = patches
+    # TODO(b/64144716): there is potential here for a big savings in terms
+    # of memory use.
+    if self._dilations is None:
+      rates = (1, 1, 1, 1)
+    else:
+      rates = tuple(self._dilations)
+
+    self._patches = []
+    for tower in range(self._num_towers):
+      with place_on_device(self._get_data_device(tower)):
+        patches = array_ops.extract_image_patches(
+            self._inputs[tower],
+            ksizes=[1, filter_height, filter_width, 1],
+            strides=self._strides,
+            rates=rates,
+            padding=self._padding)
 
-    op = super(ConvDiagonalFactor, self).make_covariance_update_op(ema_decay)
+        if self._has_bias:
+          patches = append_homog(patches)
 
-    self._patches = None
+        self._patches.append(patches)
 
-    return op
+    return super(ConvDiagonalFactor, self).make_covariance_update_op(ema_decay)
 
-  def _compute_new_cov(self, idx=0):
-    with maybe_colocate_with(self._outputs_grads[idx]):
-      outputs_grad = self._outputs_grads[idx]
-      batch_size = array_ops.shape(self._patches)[0]
+  def _compute_new_cov(self, source, tower):
+    patches = self._patches[tower]
+    batch_size = array_ops.shape(patches)[0]
+    outputs_grad = self._outputs_grads[source][tower]
 
-      new_cov = self._convdiag_sum_of_squares(self._patches, outputs_grad)
-      new_cov /= math_ops.cast(batch_size, new_cov.dtype)
+    new_cov = self._convdiag_sum_of_squares(patches, outputs_grad)
+    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
 
-      return new_cov
+    return new_cov
 
   def _convdiag_sum_of_squares(self, patches, outputs_grad):
     # This computes the sum of the squares of the per-training-case "gradients".
@@ -1002,8 +1243,11 @@ class ConvDiagonalFactor(DiagonalFactor):
                                                   outputs_grad)
     return math_ops.reduce_sum(math_ops.square(case_wise_gradients), axis=0)
 
+  def _get_data_device(self, tower):
+    return self._inputs[tower].device
+
 
-class FullyConnectedKroneckerFactor(InverseProvidingFactor):
+class FullyConnectedKroneckerFactor(DenseSquareMatrixFactor):
   """Kronecker factor for the input or output side of a fully-connected layer.
   """
 
@@ -1013,8 +1257,9 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
     """Instantiate FullyConnectedKroneckerFactor.
 
     Args:
-      tensors: List of Tensors of shape [batch_size, n]. Represents either a
-        layer's inputs or its output's gradients.
+      tensors: List of list of Tensors, each of shape [batch_size, n]. The
+        Tensors are typically either a layer's inputs or its output's gradients.
+        The first list index is source, the second is tower.
       has_bias: bool. If True, append '1' to each row.
     """
     # The tensor argument is either a tensor of input activations or a tensor of
@@ -1025,31 +1270,37 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
 
   @property
   def _var_scope(self):
-    return "ff_fckron/" + scope_string_from_params(
-        [self._tensors, self._has_bias])
+    return "ff_fckron_" + scope_string_from_params(
+        tuple(nest.flatten(self._tensors)) + (self._has_bias,))
 
   @property
   def _cov_shape(self):
-    size = self._tensors[0].shape[1] + self._has_bias
+    size = self._tensors[0][0].shape[1] + self._has_bias
     return [size, size]
 
   @property
   def _num_sources(self):
     return len(self._tensors)
 
+  @property
+  def _num_towers(self):
+    return len(self._tensors[0])
+
   @property
   def _dtype(self):
-    return self._tensors[0].dtype
+    return self._tensors[0][0].dtype
+
+  def _compute_new_cov(self, source, tower):
+    tensor = self._tensors[source][tower]
+    if self._has_bias:
+      tensor = append_homog(tensor)
+    return compute_cov(tensor)
 
-  def _compute_new_cov(self, idx=0):
-    with maybe_colocate_with(self._tensors[idx]):
-      tensor = self._tensors[idx]
-      if self._has_bias:
-        tensor = append_homog(tensor)
-      return compute_cov(tensor)
+  def _get_data_device(self, tower):
+    return self._tensors[0][tower].device
 
 
-class ConvInputKroneckerFactor(InverseProvidingFactor):
+class ConvInputKroneckerFactor(DenseSquareMatrixFactor):
   r"""Kronecker factor for the input side of a convolutional layer.
 
   Estimates E[ a a^T ] where a is the inputs to a convolutional layer given
@@ -1062,39 +1313,69 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   def __init__(self,
                inputs,
                filter_shape,
-               strides,
                padding,
-               has_bias=False):
+               strides=None,
+               dilation_rate=None,
+               data_format=None,
+               extract_patches_fn=None,
+               has_bias=False,
+               sub_sample_inputs=None,
+               sub_sample_patches=None):
     """Initializes ConvInputKroneckerFactor.
 
     Args:
-      inputs: Tensor of shape [batch_size, height, width, in_channels]. Inputs
-        to layer.
-      filter_shape: 1-D Tensor of length 4. Contains [kernel_height,
-        kernel_width, in_channels, out_channels].
-      strides: 1-D Tensor of length 4. Contains [batch_stride, height_stride,
-        width_stride, in_channel_stride].
+      inputs: List of Tensors of shape [batch_size, ..spatial_input_size..,
+        in_channels]. Inputs to layer. List index is tower.
+      filter_shape: List of ints. Contains [..spatial_filter_size..,
+        in_channels, out_channels]. Shape of convolution kernel.
       padding: str. Padding method for layer. "SAME" or "VALID".
+      strides: List of ints or None. Contains [..spatial_filter_strides..] if
+        'extract_patches_fn' is compatible with tf.nn.convolution(), else
+        [1, ..spatial_filter_strides, 1].
+      dilation_rate: List of ints or None. Rate for dilation along each spatial
+        dimension if 'extract_patches_fn' is compatible with
+        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
+      data_format: str or None. Format of input data.
+      extract_patches_fn: str or None. Name of function that extracts image
+        patches. One of "extract_convolution_patches", "extract_image_patches",
+        "extract_pointwise_conv2d_patches".
       has_bias: bool. If True, append 1 to in_channel.
+      sub_sample_inputs: `bool`. If True, then subsample the inputs from which
+        the image patches are extracted. (Default: None)
+      sub_sample_patches: `bool`, If `True` then subsample the extracted
+        patches.(Default: None)
     """
+    self._inputs = inputs
     self._filter_shape = filter_shape
     self._strides = strides
     self._padding = padding
+    self._dilation_rate = dilation_rate
+    self._data_format = data_format
+    self._extract_patches_fn = extract_patches_fn
     self._has_bias = has_bias
-    self._inputs = inputs
+    if sub_sample_inputs is None:
+      self._sub_sample_inputs = _SUB_SAMPLE_INPUTS
+    else:
+      self._sub_sample_inputs = sub_sample_inputs
+
+    if sub_sample_patches is None:
+      self._sub_sample_patches = _SUB_SAMPLE_OUTER_PRODUCTS
+    else:
+      self._sub_sample_patches = sub_sample_patches
     super(ConvInputKroneckerFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_convinkron/" + scope_string_from_params([
-        self._inputs, self._filter_shape, self._strides, self._padding,
-        self._has_bias
-    ])
+    return "ff_convinkron_" + scope_string_from_params(
+        tuple(self._inputs) +
+        tuple((self._filter_shape, self._strides, self._padding,
+               self._dilation_rate, self._data_format, self._has_bias)))
 
   @property
   def _cov_shape(self):
-    filter_height, filter_width, in_channels, _ = self._filter_shape
-    size = filter_height * filter_width * in_channels + self._has_bias
+    spatial_filter_shape = self._filter_shape[0:-2]
+    in_channels = self._filter_shape[-2]
+    size = np.prod(spatial_filter_shape) * in_channels + self._has_bias
     return [size, size]
 
   @property
@@ -1102,47 +1383,88 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
     return 1
 
   @property
-  def _dtype(self):
-    return self._inputs.dtype
-
-  def _compute_new_cov(self, idx=0):
-    if idx != 0:
-      raise ValueError("ConvInputKroneckerFactor only supports idx = 0")
-
-    with maybe_colocate_with(self._inputs):
-      filter_height, filter_width, in_channels, _ = self._filter_shape
+  def _num_towers(self):
+    return len(self._inputs)
 
-      # TODO(b/64144716): there is potential here for a big savings in terms of
-      # memory use.
+  @property
+  def _dtype(self):
+    return self._inputs[0].dtype
+
+  def _compute_new_cov(self, source, tower):
+    assert source == 0
+
+    inputs = self._inputs[tower]
+    if self._sub_sample_inputs:
+      batch_size = inputs.shape.as_list()[0]
+      max_size = int(batch_size * _INPUTS_TO_EXTRACT_PATCHES_FACTOR)
+      inputs = _random_tensor_gather(inputs, max_size)
+
+    # TODO(b/64144716): there is potential here for a big savings in terms of
+    # memory use.
+    if self._extract_patches_fn in [None, "extract_convolution_patches"]:
+      patches = utils.extract_convolution_patches(
+          inputs,
+          self._filter_shape,
+          padding=self._padding,
+          strides=self._strides,
+          dilation_rate=self._dilation_rate,
+          data_format=self._data_format)
+
+    elif self._extract_patches_fn == "extract_image_patches":
+      assert inputs.shape.ndims == 4
+      assert len(self._filter_shape) == 4
+      assert len(self._strides) == 4, self._strides
+      if self._dilation_rate is None:
+        rates = [1, 1, 1, 1]
+      else:
+        rates = self._dilation_rate
+        assert len(rates) == 4
+        assert rates[0] == rates[-1] == 1
       patches = array_ops.extract_image_patches(
-          self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
+          inputs,
+          ksizes=[1] + list(self._filter_shape[0:-2]) + [1],
           strides=self._strides,
-          rates=[1, 1, 1, 1],
+          rates=rates,
           padding=self._padding)
 
-      flatten_size = (filter_height * filter_width * in_channels)
-      # patches_flat below is the matrix [[A_l]] from the KFC paper (tilde
-      # omitted over A for clarity). It has shape M|T| x J|Delta| (eq. 14),
-      # where M = minibatch size, |T| = number of spatial locations,
-      # |Delta| = number of spatial offsets, and J = number of input maps
-      # for convolutional layer l.
-      patches_flat = array_ops.reshape(patches, [-1, flatten_size])
-      # We append a homogenous coordinate to patches_flat if the layer has
-      # bias parameters. This gives us [[A_l]]_H from the paper.
-      if self._has_bias:
-        patches_flat = append_homog(patches_flat)
-      # We call compute_cov without passing in a normalizer. compute_cov uses
-      # the first dimension of patches_flat i.e. M|T| as the normalizer by
-      # default. Hence we end up computing 1/M|T| * [[A_l]]^T [[A_l]], with
-      # shape J|Delta| x J|Delta|. This is related to hat{Omega}_l from
-      # the paper but has a different scale here for consistency with
-      # ConvOutputKroneckerFactor.
-      # (Tilde omitted over A for clarity.)
-      return compute_cov(patches_flat)
-
-
-class ConvOutputKroneckerFactor(InverseProvidingFactor):
+    elif self._extract_patches_fn == "extract_pointwise_conv2d_patches":
+      assert self._strides in [None, [1, 1, 1, 1], (1, 1, 1, 1)]
+      assert self._filter_shape[0] == self._filter_shape[1] == 1
+      patches = utils.extract_pointwise_conv2d_patches(
+          inputs, self._filter_shape, data_format=None)
+
+    else:
+      raise NotImplementedError(self._extract_patches_fn)
+
+    flatten_size = np.prod(self._filter_shape[0:-1])
+    # patches_flat below is the matrix [[A_l]] from the KFC paper (tilde
+    # omitted over A for clarity). It has shape M|T| x J|Delta| (eq. 14),
+    # where M = minibatch size, |T| = number of spatial locations,
+    # |Delta| = number of spatial offsets, and J = number of input maps
+    # for convolutional layer l.
+    patches_flat = array_ops.reshape(patches, [-1, flatten_size])
+
+    # We append a homogenous coordinate to patches_flat if the layer has
+    # bias parameters. This gives us [[A_l]]_H from the paper.
+    if self._sub_sample_patches:
+      patches_flat = _subsample_for_cov_computation(patches_flat)
+
+    if self._has_bias:
+      patches_flat = append_homog(patches_flat)
+    # We call compute_cov without passing in a normalizer. compute_cov uses
+    # the first dimension of patches_flat i.e. M|T| as the normalizer by
+    # default. Hence we end up computing 1/M|T| * [[A_l]]^T [[A_l]], with
+    # shape J|Delta| x J|Delta|. This is related to hat{Omega}_l from
+    # the paper but has a different scale here for consistency with
+    # ConvOutputKroneckerFactor.
+    # (Tilde omitted over A for clarity.)
+    return compute_cov(patches_flat)
+
+  def _get_data_device(self, tower):
+    return self._inputs[tower].device
+
+
+class ConvOutputKroneckerFactor(DenseSquareMatrixFactor):
   r"""Kronecker factor for the output side of a convolutional layer.
 
   Estimates E[ ds ds^T ] where s is the preactivations of a convolutional layer
@@ -1153,20 +1475,28 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   Section 3.1 Estimating the factors.
   """
 
-  def __init__(self, outputs_grads):
+  def __init__(self, outputs_grads, data_format=None):
     """Initializes ConvOutputKroneckerFactor.
 
     Args:
-      outputs_grads: list of Tensors. Each Tensor is of shape
-          [batch_size, height, width, out_channels].
+      outputs_grads: List of list of Tensors. Each Tensor is of shape
+          [batch_size, ..spatial_input_size.., out_channels].  First list index
+          is source, the second is tower.
+      data_format: None or str. Format of outputs_grads.
+
+    Raises:
+      ValueError: If channels are not final dimension.
     """
-    self._out_channels = outputs_grads[0].shape.as_list()[3]
+    if not utils.is_data_format_channel_last(data_format):
+      raise ValueError("Channel must be last.")
+    self._out_channels = outputs_grads[0][0].shape.as_list()[-1]
     self._outputs_grads = outputs_grads
     super(ConvOutputKroneckerFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_convoutkron/" + scope_string_from_params(self._outputs_grads)
+    return "ff_convoutkron_" + scope_string_from_params(
+        nest.flatten(self._outputs_grads))
 
   @property
   def _cov_shape(self):
@@ -1177,134 +1507,150 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _num_towers(self):
+    return len(self._outputs_grads[0])
+
   @property
   def _dtype(self):
-    return self._outputs_grads[0].dtype
-
-  def _compute_new_cov(self, idx=0):
-    with maybe_colocate_with(self._outputs_grads[idx]):
-      # reshaped_tensor below is the matrix DS_l defined in the KFC paper
-      # (tilde omitted over S for clarity). It has shape M|T| x I, where
-      # M = minibatch size, |T| = number of spatial locations, and
-      # I = number of output maps for convolutional layer l.
-      reshaped_tensor = array_ops.reshape(self._outputs_grads[idx],
-                                          [-1, self._out_channels])
-      # Following the reasoning in ConvInputKroneckerFactor._compute_new_cov,
-      # compute_cov here returns 1/M|T| * DS_l^T DS_l = hat{Gamma}_l
-      # as defined in the paper, with shape I x I.
-      # (Tilde omitted over S for clarity.)
-      return compute_cov(reshaped_tensor)
-
-
-class FullyConnectedMultiKF(InverseProvidingFactor):
-  """Kronecker factor for a fully connected recurrent layer."""
+    return self._outputs_grads[0][0].dtype
+
+  def _compute_new_cov(self, source, tower):
+    outputs_grad = self._outputs_grads[source][tower]
+
+    # reshaped_tensor below is the matrix DS_l defined in the KFC paper
+    # (tilde omitted over S for clarity). It has shape M|T| x I, where
+    # M = minibatch size, |T| = number of spatial locations, and
+    # I = number of output maps for convolutional layer l.
+    reshaped_tensor = array_ops.reshape(outputs_grad, [-1, self._out_channels])
+    # Following the reasoning in ConvInputKroneckerFactor._compute_new_cov,
+    # compute_cov here returns 1/M|T| * DS_l^T DS_l = hat{Gamma}_l
+    # as defined in the paper, with shape I x I.
+    # (Tilde omitted over S for clarity.)
+    return compute_cov(reshaped_tensor)
+
+  def _get_data_device(self, tower):
+    return self._outputs_grads[0][tower].device
+
+
+class FullyConnectedMultiKF(FullyConnectedKroneckerFactor):
+  """Kronecker factor for a fully connected layer used multiple times."""
 
   def __init__(self,
-               tensor_lists,
+               tensors,
+               num_uses=None,
                has_bias=False):
     """Constructs a new `FullyConnectedMultiKF`.
 
     Args:
-      tensor_lists: List of lists of Tensors of shape [batch_size, n].
+      tensors: List of list of Tensors of shape, each of shape
+        [num_uses * batch_size, n], and is a reshape version of a Tensor of
+        shape [num_uses, batch_size, n]. Each of these tensors is usually a
+        layer's inputs or its output's gradients. The first list index is
+        sources, the second is towers.
+      num_uses: int. The number of time-steps / uses.
       has_bias: bool. If True, '1' is appended to each row.
     """
 
-    self._tensor_lists = tensor_lists
-    self._has_bias = has_bias
-    self._batch_size = array_ops.shape(tensor_lists[0][0])[0]
-    self._num_timesteps = len(tensor_lists[0])
-    self._tensors = [None] * len(tensor_lists)
+    self._num_uses = num_uses
 
     self._cov_dt1 = None
+    self._make_cov_dt1 = False
     self._option1quants_by_damping = {}
     self._option2quants_by_damping = {}
+    self._option1quants_registrations = set()
+    self._option2quants_registrations = set()
 
-    super(FullyConnectedMultiKF, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_fc_multi/" + scope_string_from_params(self._tensor_lists)
+    super(FullyConnectedMultiKF, self).__init__(tensors=tensors,
+                                                has_bias=has_bias)
 
   @property
-  def _num_sources(self):
-    return len(self._tensor_lists)
+  def _num_timesteps(self):
+    return self._num_uses
 
   @property
-  def _dtype(self):
-    return self._tensor_lists[0][0].dtype
+  def _var_scope(self):
+    return "ff_fc_multi_" + scope_string_from_params(
+        tuple(nest.flatten(self._tensors))
+        + (self._num_timesteps, self._has_bias,))
 
   def make_covariance_update_op(self, ema_decay):
 
     op = super(FullyConnectedMultiKF, self).make_covariance_update_op(ema_decay)
 
     if self._cov_dt1 is not None:
-      new_cov_dt1_contribs = tuple(self._compute_new_cov_dt1(idx)
-                                   for idx in range(self._num_sources))
+      new_cov_dt1_contribs = []
+      for source in range(self._num_sources):
+        for tower in range(self._num_towers):
+          with place_on_device(self._get_data_device(tower)):
+            new_cov_dt1_contribs.append(self._compute_new_cov_dt1(source,
+                                                                  tower))
+
+      new_cov_dt1 = (math_ops.add_n(new_cov_dt1_contribs)
+                     / float(self._num_towers))
 
-      with maybe_colocate_with(new_cov_dt1_contribs[0]):
-        new_cov_dt1 = math_ops.add_n(new_cov_dt1_contribs)
+      # See comments in FisherFactor.make_covariance_update_op() for details.
+      if utils.on_tpu():
+        new_cov_dt1 = utils.cross_replica_mean(new_cov_dt1)
 
-        op2 = moving_averages.assign_moving_average(
-            self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
+      op2 = moving_averages.assign_moving_average(
+          self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
 
-        # TODO(b/69112164):
-        # It's important that _cov and _cov_dt1 remain consistent with each
-        # other while the inverse ops are happening. How can we ensure this?
-        # We will need to add explicit synchronization for this to
-        # work with asynchronous training.
-        op = control_flow_ops.group(op, op2)
+      # TODO(b/69112164):
+      # It's important that _cov and _cov_dt1 remain consistent with each
+      # other while the inverse ops are happening. How can we ensure this?
+      # We will need to add explicit synchronization for this to
+      # work with asynchronous training.
+      op = control_flow_ops.group(op, op2)
 
     return op
 
-  def _compute_new_cov(self, idx=0):
-    with maybe_colocate_with(self._tensor_lists[idx]):
-      tensor = array_ops.concat(self._tensor_lists[idx], 0)
-      if self._has_bias:
-        tensor = append_homog(tensor)
-      # We save these so they can be used by _compute_new_cov_dt1
-      self._tensors[idx] = tensor
-      return compute_cov(tensor)
-
-  def _compute_new_cov_dt1(self, idx=0):
-    tensor = self._tensors[idx]
-    with maybe_colocate_with(tensor):
-      # Is there a more elegant way to do this computation?
-      tensor_present = tensor[:-self._batch_size, :]
-      tensor_future = tensor[self._batch_size:, :]
-      # We specify a normalizer for this computation to ensure a PSD Fisher
-      # block estimate.  This is equivalent to padding with zeros, as was done
-      # in Section B.2 of the appendix.
-      normalizer = self._num_timesteps * self._batch_size
-      return compute_cov(
-          tensor_future, tensor_right=tensor_present, normalizer=normalizer)
+  def _compute_new_cov_dt1(self, source, tower):  # pylint: disable=missing-docstring
+    tensor = self._tensors[source][tower]
+    if self._has_bias:
+      # This appending is technically done twice (the other time is for
+      # _compute_new_cov())
+      tensor = append_homog(tensor)
 
-  @property
-  def _cov_shape(self):
-    size = self._tensor_lists[0][0].shape[1] + self._has_bias
-    return [size, size]
+    total_len = array_ops.shape(tensor)[0]
+    batch_size = total_len // self._num_timesteps
+
+    tensor_present = tensor[:-batch_size, :]
+    tensor_future = tensor[batch_size:, :]
+
+    # We specify a normalizer for this computation to ensure a PSD Fisher
+    # block estimate.  This is equivalent to padding with zeros, as was done
+    # in Section B.2 of the appendix.
+    return compute_cov(
+        tensor_future, tensor_right=tensor_present, normalizer=total_len)
+
+  def _get_data_device(self, tower):
+    return self._tensors[0][tower].device
 
   @property
   def _vec_shape(self):
-    size = self._tensor_lists[0][0].shape[1] + self._has_bias
+    size = self._tensors[0][0].shape[1] + self._has_bias
     return [size]
 
-  def get_option1quants(self, damping):
-    return self._option1quants_by_damping[damping]
+  def get_option1quants(self, damping_func):
+    damping_id = graph_func_to_id(damping_func)
+    return self._option1quants_by_damping[damping_id]
 
-  def get_option2quants(self, damping):
-    return self._option2quants_by_damping[damping]
+  def get_option2quants(self, damping_func):
+    damping_id = graph_func_to_id(damping_func)
+    return self._option2quants_by_damping[damping_id]
 
   def get_cov_dt1(self):
     assert self._cov_dt1 is not None
     return self._cov_dt1
 
   def register_cov_dt1(self):
-    """Create a variable representing temporal cross-covariance.
+    self._make_cov_dt1 = True
 
-    (This is technically the second moment, not covariance, since it's
-    not mean subtracted.)
-    """
-    if self._cov_dt1 is None:
+  def instantiate_cov_variables(self):
+    super(FullyConnectedMultiKF, self).instantiate_cov_variables()
+    assert self._cov_dt1 is None
+    if self._make_cov_dt1:
       with variable_scope.variable_scope(self._var_scope):
         self._cov_dt1 = variable_scope.get_variable(
             "cov_dt1",
@@ -1313,15 +1659,25 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
             trainable=False,
             dtype=self._dtype)
 
-  def register_option1quants(self, damping):
+  def register_option1quants(self, damping_func):
+    damping_id = self._register_damping(damping_func)
+    if damping_id not in self._option1quants_registrations:
+      self._option1quants_registrations.add(damping_id)
 
-    self.register_cov_dt1()
+  def register_option2quants(self, damping_func):
+    damping_id = self._register_damping(damping_func)
+    if damping_id not in self._option2quants_registrations:
+      self._option2quants_registrations.add(damping_id)
 
-    if damping not in self._option1quants_by_damping:
+  def instantiate_inv_variables(self):
+    super(FullyConnectedMultiKF, self).instantiate_inv_variables()
+
+    for damping_id in self._option1quants_registrations:
+      damping_func = self._damping_funcs_by_id[damping_id]
+      damping_string = graph_func_to_string(damping_func)
       # It's questionable as to whether we should initialize with stuff like
       # this at all.  Ideally these values should never be used until they are
       # updated at least once.
-      damping_string = scalar_or_tensor_to_string(damping)
       with variable_scope.variable_scope(self._var_scope):
         Lmat = variable_scope.get_variable(  # pylint: disable=invalid-name
             "Lmat_damp{}".format(damping_string),
@@ -1336,17 +1692,15 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
             trainable=False,
             dtype=self._dtype)
 
-      self._option1quants_by_damping[damping] = (Lmat, psi)
-
-  def register_option2quants(self, damping):
+      assert damping_id not in self._option1quants_by_damping
+      self._option1quants_by_damping[damping_id] = (Lmat, psi)
 
-    self.register_cov_dt1()
-
-    if damping not in self._option2quants_by_damping:
+    for damping_id in self._option2quants_registrations:
+      damping_func = self._damping_funcs_by_id[damping_id]
+      damping_string = graph_func_to_string(damping_func)
       # It's questionable as to whether we should initialize with stuff like
       # this at all.  Ideally these values should never be used until they are
       # updated at least once.
-      damping_string = scalar_or_tensor_to_string(damping)
       with variable_scope.variable_scope(self._var_scope):
         Pmat = variable_scope.get_variable(  # pylint: disable=invalid-name
             "Lmat_damp{}".format(damping_string),
@@ -1367,14 +1721,15 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
             trainable=False,
             dtype=self._dtype)
 
-      self._option2quants_by_damping[damping] = (Pmat, Kmat, mu)
+      assert damping_id not in self._option2quants_by_damping
+      self._option2quants_by_damping[damping_id] = (Pmat, Kmat, mu)
 
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
     # TODO(b/69918258): Add correctness tests for this method.
     # pylint: disable=invalid-name
 
-    ops = super(FullyConnectedMultiKF, self).make_inverse_update_ops()
+    ops = []
 
     if (len(self._option1quants_by_damping) +
         len(self._option2quants_by_damping)):
@@ -1395,8 +1750,11 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
       # consistently, or are somehow read between or during the cov updates.
       # Can this possibly happen?  Is there a way to prevent it?
 
-      for damping, (Lmat_var,
-                    psi_var) in self._option1quants_by_damping.items():
+      for damping_id, (Lmat_var,
+                       psi_var) in self._option1quants_by_damping.items():
+
+        damping = self._damping_funcs_by_id[damping_id]()
+        damping = math_ops.cast(damping, self._dtype)
 
         invsqrtC0 = math_ops.matmul(
             eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
@@ -1421,8 +1779,11 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
         ops.append(Lmat_var.assign(Lmat))
         ops.append(psi_var.assign(psi))
 
-      for damping, (Pmat_var, Kmat_var,
-                    mu_var) in self._option2quants_by_damping.items():
+      for damping_id, (Pmat_var, Kmat_var,
+                       mu_var) in self._option2quants_by_damping.items():
+
+        damping = self._damping_funcs_by_id[damping_id]()
+        damping = math_ops.cast(damping, self._dtype)
 
         # compute C0^(-1/2)
         invsqrtC0 = math_ops.matmul(
@@ -1463,6 +1824,7 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
         ops.append(Kmat_var.assign(Kmat))
         ops.append(mu_var.assign(mu))
 
+    ops += super(FullyConnectedMultiKF, self).make_inverse_update_ops()
     return [control_flow_ops.group(*ops)]
 
     # pylint: enable=invalid-name
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index ce9005b9ce99a4efa5f2821c56e199dd2086482e..cbbfe7212c9d946d4b5bf3690796cb248f72e8d3 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -26,7 +26,9 @@ from __future__ import print_function
 
 from collections import defaultdict
 from collections import OrderedDict
+from contextlib import contextmanager
 from functools import partial
+import warnings
 
 import math
 import six
@@ -59,6 +61,10 @@ _CONV2D_APPROX_TO_BLOCK_TYPES = {
     APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
 }
 
+_EMBEDDING_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_NAME: fb.EmbeddingKFACFB
+}
+
 APPROX_KRONECKER_INDEP_NAME = "kron_indep"
 APPROX_KRONECKER_SERIES_1_NAME = "kron_series_1"
 APPROX_KRONECKER_SERIES_2_NAME = "kron_series_2"
@@ -71,10 +77,39 @@ _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES = {
                                             option=2)
 }
 
-# Possible value for 'reuse' keyword argument. Sets 'reuse' to
+_CONV2D_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.ConvKFCBasicMultiIndepFB
+}
+
+_EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.EmbeddingKFACMultiIndepFB
+}
+
+# Possible value for `reuse` keyword argument. Sets `reuse` to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
+_DEFAULT_LAYER_COLLECTION = None
+
+
+def get_default_layer_collection():
+  """Get default LayerCollection."""
+  if _DEFAULT_LAYER_COLLECTION is None:
+    raise ValueError(
+        "Attempted to retrieve default LayerCollection when none is set. Use "
+        "LayerCollection.as_default().")
+
+  return _DEFAULT_LAYER_COLLECTION
+
+
+def set_default_layer_collection(layer_collection):
+  global _DEFAULT_LAYER_COLLECTION
+
+  if _DEFAULT_LAYER_COLLECTION is not None and layer_collection is not None:
+    raise ValueError("Default LayerCollection is already set.")
+
+  _DEFAULT_LAYER_COLLECTION = layer_collection
+
 
 class LayerParametersDict(OrderedDict):
   """An OrderedDict where keys are Tensors or tuples of Tensors.
@@ -130,11 +165,16 @@ class LayerCollection(object):
     fisher_factors: an OrderedDict mapping tuples to FisherFactor instances.
     losses: a list of LossFunction objects. The loss to be optimized is their
         sum.
+    loss_colocation_ops: ops to colocate loss function evaluations with.  These
+        will typically be the inputs to the losses.
   """
 
   def __init__(self,
                graph=None,
                name="LayerCollection"):
+    warnings.warn(
+        "tf.contrib.kfac is deprecated and will be removed by 2018-11-01. "
+        "Use https://pypi.python.org/pypi/kfac instead.")
     self.fisher_blocks = LayerParametersDict()
     self.fisher_factors = OrderedDict()
     self._linked_parameters = dict(
@@ -142,20 +182,30 @@ class LayerCollection(object):
     self._graph = graph or ops.get_default_graph()
     self._loss_dict = {}  # {str: LossFunction}
     self._subgraph = None
-    self._default_generic_approximation = APPROX_FULL_NAME
+    self._default_generic_approximation = APPROX_DIAGONAL_NAME
     self._default_embedding_approximation = APPROX_KRONECKER_NAME
     self._default_fully_connected_approximation = APPROX_KRONECKER_NAME
-    self._default_convolution_2d_approximation = APPROX_KRONECKER_NAME
+    self._default_conv2d_approximation = APPROX_KRONECKER_NAME
     self._default_fully_connected_multi_approximation = (
-        APPROX_KRONECKER_SERIES_2_NAME)
+        APPROX_KRONECKER_INDEP_NAME)
+    self._default_conv2d_multi_approximation = (
+        APPROX_KRONECKER_INDEP_NAME)
+    self._default_embedding_multi_approximation = APPROX_KRONECKER_INDEP_NAME
+    self.loss_colocation_ops = {}
+    self._vars_to_uses = defaultdict(lambda: 0)
 
     with variable_scope.variable_scope(None, default_name=name) as scope:
       self._var_scope = scope.name
 
   @property
   def losses(self):
-    """LossFunctions registered with this LayerCollection."""
-    return list(self._loss_dict.values())
+    """Tuple of LossFunction objects registered with this LayerCollection."""
+    return nest.flatten(self.towers_by_loss)
+
+  @property
+  def towers_by_loss(self):
+    """Tuple across losses of LossFunction objects registered to each tower."""
+    return tuple(tuple(lst) for lst in self._loss_dict.values())
 
   @property
   def registered_variables(self):
@@ -214,14 +264,14 @@ class LayerCollection(object):
 
   @property
   def default_conv2d_approximation(self):
-    return self._default_convolution_2d_approximation
+    return self._default_conv2d_approximation
 
   def set_default_conv2d_approximation(self, value):
     if value not in _CONV2D_APPROX_TO_BLOCK_TYPES:
       raise ValueError(
           "{} is not a valid approximation for 2d convolutional layers.".format(
               value))
-    self._default_convolution_2d_approximation = value
+    self._default_conv2d_approximation = value
 
   @property
   def default_fully_connected_multi_approximation(self):
@@ -233,6 +283,14 @@ class LayerCollection(object):
                        "multi layer.".format(value))
     self._default_fully_connected_multi_approximation = value
 
+  @property
+  def default_conv2d_multi_approximation(self):
+    return self._default_conv2d_multi_approximation
+
+  @property
+  def default_embedding_multi_approximation(self):
+    return self._default_embedding_multi_approximation
+
   def register_block(self, layer_key, fisher_block, reuse=VARIABLE_SCOPE):
     """Validates and registers the layer_key associated with the fisher_block.
 
@@ -240,8 +298,8 @@ class LayerCollection(object):
       layer_key: A variable or tuple of variables. The key to check for in
           existing registrations and to register if valid.
       fisher_block: The associated `FisherBlock`.
-      reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
-        or 'VARIABLE_SCOPE'.
+      reuse: Method to use for inserting new `FisherBlock's. One of True, False,
+        or `VARIABLE_SCOPE`.
 
     Raises:
       ValueError: If `layer_key` was already registered and reuse is `False`,
@@ -290,23 +348,73 @@ class LayerCollection(object):
     self.fisher_blocks[layer_key] = fisher_block
     return fisher_block
 
-  def get_use_count_map(self):
-    """Returns a dict of variables to their number of registrations."""
-    # TODO(b/70283403): Reimplement this in the old way, where each
-    # registration function would be responsible for incrementing the count.
-    # Also, this version has a bug: it won't do the right thing for generic
-    # registration for parameters that are shared.  i.e. it won't set the use
-    # count to infinity.
-    vars_to_uses = defaultdict(int)
-    for key, block in six.iteritems(self.fisher_blocks):
-      n = (
-          block.num_inputs()*block.num_registered_minibatches if isinstance(
-              block, (fb.FullyConnectedSeriesFB, fb.FullyConnectedMultiIndepFB))
-          else block.num_registered_minibatches)
-      key = utils.ensure_sequence(key)
-      for k in key:
-        vars_to_uses[k] += n
-    return vars_to_uses
+  def register_loss_function(self,
+                             loss,
+                             colocation_op,
+                             base_name,
+                             name=None,
+                             reuse=VARIABLE_SCOPE):
+    """Registers a LossFunction object.
+
+    Args:
+      loss: The LossFunction object.
+      colocation_op: The op to colocate the loss function's computations with.
+      base_name: The name to derive a new unique name from is the name argument
+        is None.
+      name: (OPTIONAL) str or None. Unique name for this loss function. If None,
+        a new name is generated. (Default: None)
+      reuse: (OPTIONAL) bool or str.  If True, adds `loss` as an additional
+        tower for the existing loss function.
+
+    Raises:
+      ValueError: If reuse == True and name == None.
+      ValueError: If reuse == True and seed != None.
+      KeyError: If reuse == True and no existing LossFunction with `name` found.
+      KeyError: If reuse == False and existing LossFunction with `name` found.
+    """
+
+    name = name or self._graph.unique_name(base_name)
+
+    if reuse == VARIABLE_SCOPE:
+      reuse = variable_scope.get_variable_scope().reuse
+
+    if reuse:
+      if name is None:
+        raise ValueError(
+            "If reuse is enabled, loss function's name must be set.")
+
+      loss_list = self._loss_dict.get(name, None)
+
+      if loss_list is None:
+        raise KeyError(
+            "Unable to find loss function named {}. Register a new loss "
+            "function with reuse=False.".format(name))
+    else:
+      if name in self._loss_dict:
+        raise KeyError(
+            "Loss function named {} already exists. Set reuse=True to append "
+            "another tower.".format(name))
+
+      loss_list = []
+      self._loss_dict[name] = loss_list
+
+    loss_list.append(loss)
+    self.loss_colocation_ops[loss] = colocation_op
+
+  def _get_use_count_map(self):
+    """Returns a dict mapping variables to their number of registrations."""
+    return self._vars_to_uses
+
+  def _add_uses(self, params, uses):
+    """Register additional uses by params in the graph.
+
+    Args:
+      params: Variable or tuple of Variables. Parameters for a layer.
+      uses: int or float. Number of additional uses for these parameters.
+    """
+    params = params if isinstance(params, (tuple, list)) else (params,)
+    for var in params:
+      self._vars_to_uses[var] += uses
 
   def check_registration(self, variables):
     """Checks that all variable uses have been registered properly.
@@ -324,7 +432,7 @@ class LayerCollection(object):
     # Note that overlapping parameters (i.e. those that share variables) will
     # be caught by layer_collection.LayerParametersDict during registration.
 
-    reg_use_map = self.get_use_count_map()
+    reg_use_map = self._get_use_count_map()
 
     error_messages = []
 
@@ -386,24 +494,24 @@ class LayerCollection(object):
     """
     params = frozenset(utils.ensure_sequence(params))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.fisher_blocks.keys()'.
+    # Check if any of the variables in `params` is already in
+    # 'self.fisher_blocks.keys()`.
     for registered_params, fisher_block in self.fisher_blocks.items():
       registered_params_set = set(utils.ensure_sequence(registered_params))
       for variable in params:
         if (variable in registered_params_set and
             params != registered_params_set):
           raise ValueError(
-              "Can't link parameters {}, variable {} was already registered in "
+              "Can`t link parameters {}, variable {} was already registered in "
               "group {} with layer {}".format(params, variable,
                                               registered_params, fisher_block))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.linked_parameters'.
+    # Check if any of the variables in `params` is already in
+    # 'self.linked_parameters`.
     for variable in params:
       for other_linked_params in self.linked_parameters:
         if variable in other_linked_params:
-          raise ValueError("Can't link parameters {}, variable {} was already "
+          raise ValueError("Can`t link parameters {}, variable {} was already "
                            "linked in group {}.".format(params, variable,
                                                         other_linked_params))
     self._linked_parameters[params] = approximation
@@ -414,12 +522,27 @@ class LayerCollection(object):
     inputs_to_losses = nest.flatten(tuple(loss.inputs for loss in self.losses))
     self._subgraph = utils.SubGraph(inputs_to_losses)
 
+  def eval_losses(self):
+    """Return evaluated losses (colocated with inputs to losses)."""
+    evals = []
+    for loss in self.losses:
+      with ops.colocate_with(self.loss_colocation_ops[loss]):
+        evals.append(loss.evaluate())
+    return evals
+
+  def eval_losses_on_samples(self):
+    """Return losses evaluated on samples (colocated with inputs to losses)."""
+    evals = []
+    for loss in self.losses:
+      with ops.colocate_with(self.loss_colocation_ops[loss]):
+        evals.append(loss.evaluate_on_sample())
+    return evals
+
   def total_loss(self):
-    return math_ops.add_n(tuple(loss.evaluate() for loss in self.losses))
+    return math_ops.add_n(self.eval_losses())
 
   def total_sampled_loss(self):
-    return math_ops.add_n(
-        tuple(loss.evaluate_on_sample() for loss in self.losses))
+    return math_ops.add_n(self.eval_losses_on_samples())
 
   def _get_linked_approx(self, params):
     """If params were linked, return their specified approximation."""
@@ -429,45 +552,56 @@ class LayerCollection(object):
     else:
       return None
 
+  def _get_block_type(self, params, approx, default, approx_to_type):
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = default
+
+    if approx not in approx_to_type:
+      raise ValueError("Bad value {} for approx.".format(approx))
+
+    return approx_to_type[approx], approx
+
   def register_embedding(self,
                          params,
                          inputs,
                          outputs,
                          approx=None,
                          reuse=VARIABLE_SCOPE):
-    """Registers a fully connnected layer.
+    """Registers an embedding layer.
 
     Args:
       params: Embedding matrix of shape [vocab_size, embedding_size].
       inputs: Tensor of shape [batch_size, input_size] and dtype int32. Indices
         into embedding matrix.
-      outputs: Tensor of shape [batch_size, output_size]. Outputs
+      outputs: Tensor of shape [batch_size, embedding_size]. Outputs
         produced by layer.
-      approx: str. Must be "kron".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be "kron".  The Fisher
+        approximation to use. If None the default value is used. (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_embedding_approximation
-
-    if approx != APPROX_KRONECKER_NAME:
-      raise ValueError("Bad value {} for approx.".format(approx))
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_embedding_approximation,
+        _EMBEDDING_APPROX_TO_BLOCK_TYPES)
 
     if isinstance(params, (tuple, list)):
       raise ValueError("Bias not supported.")
-
     vocab_size = int(params.shape[0])
     block = self.register_block(
-        params, fb.EmbeddingKFACFB(self, vocab_size), reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+        params, block_type(self, vocab_size), reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
+
+    self._add_uses(params, 1)
 
   def register_fully_connected(self,
                                params,
@@ -484,29 +618,31 @@ class LayerCollection(object):
       inputs: Tensor of shape [batch_size, input_size]. Inputs to layer.
       outputs: Tensor of shape [batch_size, output_size]. Outputs
         produced by layer.
-      approx: str. One of "kron" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_fully_connected_approximation
 
-    if approx not in _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_fully_connected_approximation,
+        _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES)
 
-    block_type = _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES[approx]
     has_bias = isinstance(params, (tuple, list))
+    block = self.register_block(params, block_type(self, has_bias=has_bias),
+                                reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
 
-    block = self.register_block(params, block_type(self, has_bias), reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+    self._add_uses(params, 1)
 
   def register_conv2d(self,
                       params,
@@ -514,44 +650,262 @@ class LayerCollection(object):
                       padding,
                       inputs,
                       outputs,
+                      data_format=None,
+                      dilations=None,
                       approx=None,
                       reuse=VARIABLE_SCOPE):
-    """Registers a convolutional layer.
+    """Registers a call to tf.nn.conv2d().
 
     Args:
       params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
         this layer. Weight matrix should have shape [kernel_height,
         kernel_width, in_channels, out_channels].  Bias should have shape
         [out_channels].
-      strides: 1-D Tensor of length 4. Strides for convolution kernel.
+      strides: List of 4 ints. Strides for convolution kernel.
       padding: string. see tf.nn.conv2d for valid values.
       inputs: Tensor of shape [batch_size, height, width, in_channels]. Inputs
         to layer.
       outputs: Tensor of shape [batch_size, height, width, out_channels].
         Output produced by layer.
-      approx: str. One of "kron" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      data_format: str or None. Format of data.
+      dilations: List of 4 ints. Dilations along each dimension.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_conv2d_approximation
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_conv2d_approximation,
+        _CONV2D_APPROX_TO_BLOCK_TYPES)
+
+    # It feels bad to pass in configuration that has to do with the internal
+    # implementation.  And then we can`t use the same constructor for both
+    # anymore and are thus forced to use this ugly if-statement.
+    # TODO(b/74793309): Clean this up?
+    if approx == APPROX_KRONECKER_NAME:
+      block = self.register_block(
+          params,
+          block_type(
+              layer_collection=self,
+              params=params,
+              padding=padding,
+              strides=strides,
+              data_format=data_format,
+              dilation_rate=dilations,
+              extract_patches_fn="extract_image_patches"),
+          reuse=reuse)
+    elif approx == APPROX_DIAGONAL_NAME:
+      assert strides[0] == strides[-1] == 1
+      block = self.register_block(
+          params,
+          block_type(
+              layer_collection=self,
+              params=params,
+              padding=padding,
+              strides=strides,
+              dilations=dilations,
+              data_format=data_format),
+          reuse=reuse)
+    else:
+      raise NotImplementedError(approx)
 
-    if approx not in _CONV2D_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
+    block.register_additional_tower(inputs, outputs)
+
+    self._add_uses(params, 1)
+
+  def register_convolution(self,
+                           params,
+                           inputs,
+                           outputs,
+                           padding,
+                           strides=None,
+                           dilation_rate=None,
+                           data_format=None,
+                           approx=None,
+                           reuse=VARIABLE_SCOPE):
+    """Register a call to tf.nn.convolution().
+
+    Args:
+      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
+        this layer. Weight matrix should have shape [..filter_spatial_size..,
+        in_channels, out_channels].  Bias should have shape [out_channels].
+      inputs: Tensor of shape [batch_size, ..input_spatial_size.., in_channels].
+        Inputs to layer.
+      outputs: Tensor of shape [batch_size, ..output_spatial_size..,
+        out_channels].  Output produced by layer.
+      padding: string. see tf.nn.conv2d for valid values.
+      strides: List of ints of length len(..input_spatial_size..). Strides for
+        convolution kernel in spatial dimensions.
+      dilation_rate: List of ints of length len(..input_spatial_size..).
+        Dilations along spatial dimension.
+      data_format: str or None. Format of data.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    # TODO(b/74793309): Have this use _get_block_type like the other
+    # registration functions?
+    assert approx is None or approx == APPROX_KRONECKER_NAME
 
-    block_type = _CONV2D_APPROX_TO_BLOCK_TYPES[approx]
     block = self.register_block(
-        params, block_type(self, params, strides, padding), reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+        params,
+        fb.ConvKFCBasicFB(
+            layer_collection=self,
+            params=params,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilation_rate,
+            data_format=data_format),
+        reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
+
+    self._add_uses(params, 1)
+
+  def register_depthwise_conv2d(self,
+                                params,
+                                inputs,
+                                outputs,
+                                strides,
+                                padding,
+                                rate=None,
+                                data_format=None,
+                                approx=None,
+                                reuse=VARIABLE_SCOPE):
+    """Register a call to tf.nn.depthwise_conv2d().
+
+    Args:
+      params: 4-D Tensor of shape [filter_height, filter_width,
+        in_channels, channel_multiplier].  Convolutional filter.
+      inputs: Tensor of shape [batch_size, input_height, input_width,
+        in_channels].  Inputs to layer.
+      outputs: Tensor of shape [batch_size, output_height, output_width,
+        in_channels * channel_multiplier].  Output produced by depthwise conv2d.
+      strides: List of ints of length 4. Strides along all dimensions.
+      padding: string. see tf.nn.conv2d for valid values.
+      rate: None or List of ints of length 2. Dilation rates in spatial
+        dimensions.
+      data_format: str or None. Format of data.
+      approx: str or None. If not None must "diagonal".  The Fisher
+        approximation to use. If None the default value is used. (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    # TODO(b/74793309): Have this use _get_block_type like the other
+    # registration functions?
+    assert approx is None or approx == APPROX_DIAGONAL_NAME
+    assert data_format in [None, "NHWC"]
+
+    block = self.register_block(
+        params,
+        fb.DepthwiseConvDiagonalFB(
+            layer_collection=self,
+            params=params,
+            strides=strides,
+            padding=padding,
+            rate=rate,
+            data_format=data_format),
+        reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
+
+    self._add_uses(params, 1)
+
+  def register_separable_conv2d(self,
+                                depthwise_params,
+                                pointwise_params,
+                                inputs,
+                                depthwise_outputs,
+                                pointwise_outputs,
+                                strides,
+                                padding,
+                                rate=None,
+                                data_format=None,
+                                approx=None,
+                                reuse=VARIABLE_SCOPE):
+    """Register a call to tf.nn.separable_conv2d().
+
+    Note: This requires access to intermediate outputs between depthwise and
+    pointwise convolutions.
+
+    Args:
+      depthwise_params: 4-D Tensor of shape [filter_height, filter_width,
+        in_channels, channel_multiplier].  Filter for depthwise conv2d.
+      pointwise_params: 4-D Tensor of shape [1, 1, in_channels *
+        channel_multiplier, out_channels].  Filter for pointwise conv2d.
+      inputs: Tensor of shape [batch_size, input_height, input_width,
+        in_channels].  Inputs to layer.
+      depthwise_outputs: Tensor of shape [batch_size, output_height,
+        output_width, in_channels * channel_multiplier].  Output produced by
+        depthwise conv2d.
+      pointwise_outputs: Tensor of shape [batch_size, output_height,
+        output_width, out_channels].  Output produced by pointwise conv2d.
+      strides: List of ints of length 4. Strides for depthwise conv2d kernel in
+        all dimensions.
+      padding: string. see tf.nn.conv2d for valid values.
+      rate: None or List of ints of length 2. Dilation rate of depthwise conv2d
+        kernel in spatial dimensions.
+      data_format: str or None. Format of data.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    self.register_depthwise_conv2d(
+        params=depthwise_params,
+        inputs=inputs,
+        outputs=depthwise_outputs,
+        strides=strides,
+        padding=padding,
+        rate=rate,
+        data_format=data_format,
+        approx=APPROX_DIAGONAL_NAME,
+        reuse=reuse)
+
+    self.register_conv2d(
+        params=pointwise_params,
+        inputs=depthwise_outputs,
+        outputs=pointwise_outputs,
+        strides=[1, 1, 1, 1],
+        padding="VALID",
+        data_format=data_format,
+        approx=approx,
+        reuse=reuse)
 
   def register_generic(self,
                        params,
@@ -562,32 +916,32 @@ class LayerCollection(object):
 
     Args:
       params: Tensor or tuple of Tensors corresponding to the parameters.
-      batch_size: 0-D Tensor. Size of the minibatch.
-      approx: str. One of "full" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      batch_size: 0-D Tensor. Size of the minibatch (for this tower).
+      approx: str or None. It not None, must be one of "full" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str. If True, this adds `batch_size` to the total
+        mini-batch size use when estimating the Fisher block for this layer
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_generic_approximation,
+        _GENERIC_APPROX_TO_BLOCK_TYPES)
 
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_generic_approximation
-
-    if approx not in _GENERIC_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
-
-    block_type = _GENERIC_APPROX_TO_BLOCK_TYPES[approx]
     block = self.register_block(params, block_type(self, params), reuse=reuse)
-    block.register_additional_minibatch(batch_size)
+    block.register_additional_tower(batch_size)
+
+    self._add_uses(params, float("inf"))
 
   def register_fully_connected_multi(self, params, inputs, outputs,
-                                     approx=None):
+                                     num_uses=None, approx=None,
+                                     reuse=VARIABLE_SCOPE):
     """Register fully connected layers with shared parameters.
 
     This can handle general fully-connected layers with shared parameters, but
@@ -598,34 +952,194 @@ class LayerCollection(object):
       params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
         this layer. Weight matrix should have shape [input_size, output_size].
         Bias should have shape [output_size].
-      inputs: A list of tensors, each of shape [batch_size, input_size]. Inputs
-        to layer. In the case of RNNs, one Tensor per time step.
-      outputs: A list of tensors, the same length as 'inputs', each of shape
-        [batch_size, output_size]. Outputs produced by layer. In the case of
-        RNNs, one Tensor per time step.
-      approx: str. One of "kron_indep", "kron_series_1", or "kron_series_2".
+      inputs: A list of Tensors, each of shape [batch_size, input_size]. Inputs
+        to layer. The list indexes each use in the graph (which might
+        correspond to a "time-step" in an RNN). OR, can be single Tensor, of
+        shape [num_uses * batch_size , input_size], which is a reshaped version
+        of a Tensor of shape [num_uses, batch_size, input_size].
+      outputs: A list of Tensors, the same length as `inputs`, each of shape
+        [batch_size, output_size]. Outputs produced by layer. The list indexes
+        each use in the graph (which might correspond to a "time-step" in an
+        RNN). Needs to correspond with the order used in `inputs`.  OR, can be
+        a single Tensor of shape [num_uses * batch_size, output_size], which is
+        a reshaped version of a Tensor of shape [num_uses, batch_size,
+        output_size].
+      num_uses: int or None. The number uses/time-steps in the graph where the
+        layer appears. Only needed if both inputs and outputs are given in the
+        single Tensor format. (Default: None)
+      approx: str or None. If not None, must be of "kron_indep", "kron_series_1"
+        or "kron_series_2". The Fisher approximation to use. If None the default
+        value is used. (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
+      ValueError: For improper value to `approx`.
     """
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_fully_connected_multi_approximation
-    has_bias = isinstance(params, (tuple, list))
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_fully_connected_multi_approximation,
+        _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES)
 
     # TODO(b/70283649): something along the lines of find_canonical_output
     # should be added back in here (and for the other block types, arguably).
 
-    if approx not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
-    block_type = _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES[approx]
+    has_bias = isinstance(params, (tuple, list))
+    block = self.register_block(params, block_type(self, has_bias=has_bias,
+                                                   num_uses=num_uses),
+                                reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
+
+  def register_conv2d_multi(self,
+                            params,
+                            strides,
+                            padding,
+                            inputs,
+                            outputs,
+                            num_uses=None,
+                            data_format=None,
+                            dilations=None,
+                            approx=None,
+                            reuse=VARIABLE_SCOPE):
+    """Registers convolutional layers with shared parameters.
+
+    Args:
+      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
+        this layer. Weight matrix should have shape [kernel_height,
+        kernel_width, in_channels, out_channels].  Bias should have shape
+        [out_channels].
+      strides: 1-D Tensor of length 4. Strides for convolution kernel.
+      padding: string. see tf.nn.conv2d for valid values.
+      inputs: A list of Tensors, each of shape [batch_size, height, width,
+        in_channels]. Inputs to layer. The list indexes each use in the graph
+        (which might correspond to a "time-step" in an RNN). OR, can be single
+        Tensor, of shape [num_uses * batch_size, height, width, in_channels],
+        which is a reshaped version of a Tensor of shape [num_uses, batch_size,
+        height, width, in_channels].
+      outputs: A list of Tensors, each of shape [batch_size, height, width,
+        out_channels]. Output produced by layer. The list indexes each use
+        in the graph (which might correspond to a "time-step" in an RNN).
+        Needs to correspond with the order used in `inputs`.  OR, can be a
+        single Tensor, of shape [num_uses * batch_size, height, width,
+        out_channels], which is a reshaped version of a Tensor of shape
+        [num_uses, batch_size, height, width, out_channels].
+      num_uses: int or None. The number uses/time-steps in the graph where the
+        layer appears. Only needed if both inputs and outputs are given in the
+        single Tensor format. (Default: None)
+      data_format: str or None. Format of data.
+      dilations: List of 4 ints. Dilations along each dimension.
+      approx: str or None. If not None must by "kron_indep". The Fisher
+        approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_conv2d_multi_approximation,
+        _CONV2D_MULTI_APPROX_TO_BLOCK_TYPES)
+
+    block = self.register_block(
+        params,
+        block_type(
+            layer_collection=self,
+            params=params,
+            padding=padding,
+            strides=strides,
+            data_format=data_format,
+            dilation_rate=dilations,
+            extract_patches_fn="extract_image_patches",
+            num_uses=num_uses),
+        reuse=reuse)
+
+    block.register_additional_tower(inputs, outputs)
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
+
+  # TODO(b/74108452): change the loss registration functions names to refer
+  # to "loss functions" instead of distributions.  Following naming convention
+  # of the loss function classes themselves.
+
+  def register_embedding_multi(self,
+                               params,
+                               inputs,
+                               outputs,
+                               num_uses=None,
+                               approx=None,
+                               reuse=VARIABLE_SCOPE):
+    """Registers embedding layers with shared parameters.
+
+    Args:
+      params: Embedding matrix of shape [vocab_size, embedding_size].
+      inputs: A list of Tensors, each of shape [batch_size, input_size] and
+        dtype int32. Indices into embedding matrix. The list indexes each use
+        in the graph (which might correspond to a "time-step" in an RNN).
+        OR, can be single Tensor, of shape [num_uses*batch_size, input_size],
+        which is a reshaped version of a Tensor of shape [num_uses, batch_size,
+        input_size].
+      outputs: A list of Tensors, each of shape [batch_size, embedding_size].
+        Outputs produced by layer. The list indexes each use in the graph
+        (which might correspond to a "time-step" in an RNN). Needs to
+        correspond with the order used in `inputs`. OR, can be a
+        single Tensor, of shape [num_uses * batch_size, embedding_size], which
+        is a reshaped version of a Tensor of shape [num_uses, batch_size,
+        embedding_size].
+      num_uses: int or None. The number uses/time-steps in the graph where the
+        layer appears. Only needed if both inputs and outputs are given in the
+        single Tensor format. (Default: None)
+      approx: str or None. If not None must by "kron_indep". The Fisher
+        approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_embedding_multi_approximation,
+        _EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES)
+
+    if isinstance(params, (tuple, list)):
+      raise ValueError("Bias not supported.")
+    vocab_size = int(params.shape[0])
+
+    block = self.register_block(
+        params, block_type(self, vocab_size, num_uses=num_uses), reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
 
-    # For now we don't support multiple minibatches for this type of layer, so
-    # we set reuse=False
-    self.register_block(params,
-                        block_type(self, inputs, outputs, has_bias=has_bias),
-                        reuse=False)
+    if isinstance(inputs, (tuple, list)):
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   def register_categorical_predictive_distribution(self,
                                                    logits,
@@ -645,53 +1159,24 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
-
-    Raises:
-      ValueError: If reuse == True and name == None.
-      ValueError: If reuse == True and seed != None.
-      KeyError: If reuse == True and no existing LossFunction with 'name' found.
-      KeyError: If reuse == False and existing LossFunction with 'name' found.
+      reuse: bool or str.  If True, this adds `logits` as an additional
+        mini-batch/tower of inputs to the loss-function/predictive distribution
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
     """
-    name = name or self._graph.unique_name(
-        "register_categorical_predictive_distribution")
-
-    if reuse == VARIABLE_SCOPE:
-      reuse = variable_scope.get_variable_scope().reuse
-
-    if reuse:
-      if name is None:
-        raise ValueError(
-            "If reuse is enabled, loss function's name must be set.")
-      if seed is not None:
-        raise ValueError(
-            "Seed can only be specified at LossFunction instantiation.")
-
-      loss = self._loss_dict.get(name, None)
-
-      if loss is None:
-        raise KeyError(
-            "Unable to find loss function named {}. Create a new LossFunction "
-            "with reuse=False.".format(name))
-
-      loss.register_additional_minibatch(logits, targets=targets)
-    else:
-      if name in self._loss_dict:
-        raise KeyError(
-            "Loss function named {} already exists. Set reuse=True to append "
-            "another minibatch.".format(name))
-      loss = lf.CategoricalLogitsNegativeLogProbLoss(
-          logits, targets=targets, seed=seed)
-      self._loss_dict[name] = loss
+    loss = lf.CategoricalLogitsNegativeLogProbLoss(logits, targets=targets,
+                                                   seed=seed)
+    self.register_loss_function(loss, logits,
+                                "categorical_predictive_distribution",
+                                name=name, reuse=reuse)
 
   def register_normal_predictive_distribution(self,
                                               mean,
                                               var=0.5,
                                               seed=None,
                                               targets=None,
-                                              name=None):
+                                              name=None,
+                                              reuse=VARIABLE_SCOPE):
     """Registers a normal predictive distribution.
 
     Args:
@@ -708,21 +1193,23 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
+      reuse: bool or str.  If True, this adds `mean` and `var` as an additional
+        mini-batch/tower of inputs to the loss-function/predictive distribution
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
     """
-    name = name or self._graph.unique_name(
-        "register_normal_predictive_distribution")
-    if name in self._loss_dict:
-      raise NotImplementedError(
-          "Adding logits to an existing LossFunction not yet supported.")
-    loss = lf.NormalMeanNegativeLogProbLoss(
-        mean, var, targets=targets, seed=seed)
-    self._loss_dict[name] = loss
+    loss = lf.NormalMeanNegativeLogProbLoss(mean, var, targets=targets,
+                                            seed=seed)
+    self.register_loss_function(loss, mean,
+                                "normal_predictive_distribution",
+                                name=name, reuse=reuse)
 
   def register_multi_bernoulli_predictive_distribution(self,
                                                        logits,
                                                        seed=None,
                                                        targets=None,
-                                                       name=None):
+                                                       name=None,
+                                                       reuse=VARIABLE_SCOPE):
     """Registers a multi-Bernoulli predictive distribution.
 
     Args:
@@ -735,29 +1222,30 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
+      reuse: bool or str.  If True, this adds `logits` as an additional
+        mini-batch/tower of inputs to the loss-function/predictive distribution
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
     """
-    name = name or self._graph.unique_name(
-        "register_multi_bernoulli_predictive_distribution")
-    if name in self._loss_dict:
-      raise NotImplementedError(
-          "Adding logits to an existing LossFunction not yet supported.")
-    loss = lf.MultiBernoulliNegativeLogProbLoss(
-        logits, targets=targets, seed=seed)
-    self._loss_dict[name] = loss
+    loss = lf.MultiBernoulliNegativeLogProbLoss(logits, targets=targets,
+                                                seed=seed)
+    self.register_loss_function(loss, logits,
+                                "multi_bernoulli_predictive_distribution",
+                                name=name, reuse=reuse)
 
   def make_or_get_factor(self, cls, args):
-    """Insert 'cls(args)' into 'self.fisher_factors' if not already present.
+    """Insert `cls(args)` into 'self.fisher_factors` if not already present.
 
-    Wraps constructor in 'tf.variable_scope()' to ensure variables constructed
-    in 'cls.__init__' are placed under this LayerCollection's scope.
+    Wraps constructor in `tf.variable_scope()` to ensure variables constructed
+    in `cls.__init__` are placed under this LayerCollection's scope.
 
     Args:
       cls: Class that implements FisherFactor.
-      args: Tuple of arguments to pass into 'cls's constructor. Must be
+      args: Tuple of arguments to pass into `cls's constructor. Must be
         hashable.
 
     Returns:
-      Instance of 'cls' found in self.fisher_factors.
+      Instance of `cls` found in self.fisher_factors.
     """
     try:
       hash(args)
@@ -772,3 +1260,10 @@ class LayerCollection(object):
       with variable_scope.variable_scope(self._var_scope):
         self.fisher_factors[key] = cls(*args)
     return self.fisher_factors[key]
+
+  @contextmanager
+  def as_default(self):
+    """Sets this LayerCollection as the default."""
+    set_default_layer_collection(self)
+    yield
+    set_default_layer_collection(None)
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py b/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
index f8aa230d9ca1f542950f56b1e6cf1ab7ccd3d05f..9f4685380705bd409dbcd7e85d0e3bb4189a6adc 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
@@ -30,6 +30,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
+    "get_default_layer_collection",
+    "set_default_layer_collection",
     "LayerParametersDict",
     "LayerCollection",
     "APPROX_KRONECKER_NAME",
diff --git a/tensorflow/contrib/kfac/python/ops/linear_operator.py b/tensorflow/contrib/kfac/python/ops/linear_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..61cb955ae85df9e56cbe165acba98ece750cba90
--- /dev/null
+++ b/tensorflow/contrib/kfac/python/ops/linear_operator.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SmartMatrices definitions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg
+from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.ops.linalg import linear_operator_util as lou
+
+
+class LinearOperatorExtras(object):  # pylint: disable=missing-docstring
+
+  def matmul(self, x, adjoint=False, adjoint_arg=False, name="matmul"):
+
+    with self._name_scope(name, values=[x]):
+      if isinstance(x, ops.IndexedSlices):
+        return self._matmul_sparse(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+      x = ops.convert_to_tensor(x, name="x")
+      self._check_input_dtype(x)
+
+      self_dim = -2 if adjoint else -1
+      arg_dim = -1 if adjoint_arg else -2
+      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
+
+      return self._matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def matmul_right(self, x, adjoint=False, adjoint_arg=False, name="matmul"):
+
+    with self._name_scope(name, values=[x]):
+
+      if isinstance(x, ops.IndexedSlices):
+        return self._matmul_right_sparse(
+            x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+      x = ops.convert_to_tensor(x, name="x")
+      self._check_input_dtype(x)
+
+      self_dim = -1 if adjoint else -2
+      arg_dim = -2 if adjoint_arg else -1
+      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
+
+      return self._matmul_right(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+
+class LinearOperatorFullMatrix(LinearOperatorExtras,
+                               linalg.LinearOperatorFullMatrix):
+
+  # TODO(b/78117889) Remove this definition once core LinearOperator
+  # has _matmul_right.
+  def _matmul_right(self, x, adjoint=False, adjoint_arg=False):
+    return lou.matmul_with_broadcast(
+        x, self._matrix, adjoint_a=adjoint_arg, adjoint_b=adjoint)
+
+  def _matmul_sparse(self, x, adjoint=False, adjoint_arg=False):
+    raise NotImplementedError
+
+  def _matmul_right_sparse(self, x, adjoint=False, adjoint_arg=False):
+    assert not adjoint and not adjoint_arg
+    return utils.matmul_sparse_dense(x, self._matrix)
+
+
+class LinearOperatorDiag(LinearOperatorExtras,  # pylint: disable=missing-docstring
+                         linalg.LinearOperatorDiag):
+
+  def _matmul_right(self, x, adjoint=False, adjoint_arg=False):
+    diag_mat = math_ops.conj(self._diag) if adjoint else self._diag
+    x = linalg_impl.adjoint(x) if adjoint_arg else x
+    return diag_mat * x
+
+  def _matmul_sparse(self, x, adjoint=False, adjoint_arg=False):
+    diag_mat = math_ops.conj(self._diag) if adjoint else self._diag
+    assert not adjoint_arg
+    return utils.matmul_diag_sparse(diag_mat, x)
+
+  def _matmul_right_sparse(self, x, adjoint=False, adjoint_arg=False):
+    raise NotImplementedError
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index cb3e698b9ceab920785adf735f88bd8e535a628f..42d525c2c21f5ba3457cba041261dc3b225dc11e 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -57,30 +57,6 @@ class LossFunction(object):
     """The inputs to the loss function (excluding the targets)."""
     pass
 
-  @property
-  def input_minibatches(self):
-    """A `list` of inputs to the loss function, separated by minibatch.
-
-    Typically there will be one minibatch per tower in a multi-tower setup.
-    Returns a list consisting of `self.inputs` by default; `LossFunction`s
-    supporting registering multiple minibatches should override this method.
-
-    Returns:
-      A `list` of `Tensor`s representing
-    """
-    return [self.inputs]
-
-  @property
-  def num_registered_minibatches(self):
-    """Number of minibatches registered for this LossFunction.
-
-    Typically equal to the number of towers in a multi-tower setup.
-
-    Returns:
-      An `int` representing the number of registered minibatches.
-    """
-    return len(self.input_minibatches)
-
   def evaluate(self):
     """Evaluate the loss function on the targets."""
     if self.targets is not None:
@@ -474,7 +450,6 @@ class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
     assert len(variance.shape) == 2, "Expect 2D variance tensor."
     self._mean = mean
     self._variance = variance
-    self._scale = math_ops.sqrt(variance)
     self._targets = targets
     super(NormalMeanVarianceNegativeLogProbLoss, self).__init__(seed=seed)
 
@@ -484,7 +459,7 @@ class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
 
   @property
   def dist(self):
-    return normal.Normal(loc=self._mean, scale=self._scale)
+    return normal.Normal(loc=self._mean, scale=math_ops.sqrt(self._variance))
 
   @property
   def params(self):
@@ -502,7 +477,7 @@ class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
 
   @property
   def _fisher_mean_factor(self):
-    return 1. / self._scale
+    return 1. / math_ops.sqrt(self._variance)
 
   @property
   def _fisher_var(self):
@@ -611,36 +586,13 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
         index in [0, output_size).
       seed: int or None. Default random seed when sampling.
     """
-    self._logits_components = []
-    self._targets_components = []
-    self.register_additional_minibatch(logits, targets=targets)
+    self._logits = logits
+    self._targets = targets
     super(CategoricalLogitsNegativeLogProbLoss, self).__init__(seed=seed)
 
-  def register_additional_minibatch(self, logits, targets=None):
-    """Register an additiona minibatch's worth of parameters.
-
-    Args:
-      logits: Tensor of shape [batch_size, output_size]. Parameters for
-        underlying distribution.
-      targets: None or Tensor of shape [batch_size, output_size].  Each row must
-        be a one-hot vector.
-    """
-    self._logits_components.append(logits)
-    self._targets_components.append(targets)
-
-  @property
-  def _logits(self):
-    return array_ops.concat(self._logits_components, axis=0)
-
-  @property
-  def input_minibatches(self):
-    return self._logits_components
-
   @property
   def targets(self):
-    if all(target is None for target in self._targets_components):
-      return None
-    return array_ops.concat(self._targets_components, axis=0)
+    return self._targets
 
   @property
   def dist(self):
@@ -661,19 +613,19 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
   def multiply_fisher(self, vector):
     probs = self._probs
     return vector * probs - probs * math_ops.reduce_sum(
-        vector * probs, axis=-1, keep_dims=True)
+        vector * probs, axis=-1, keepdims=True)
 
   def multiply_fisher_factor(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=-1, keep_dims=True)
+        sqrt_probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_transpose(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=-1, keep_dims=True)
+        probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_replicated_one_hot(self, index):
     assert len(index) == 1, "Length of index was {}".format(len(index))
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
index 705a871d482565897e7ac850327729a6186f1746..4279cb2792854249e3e076d200e2656bc615779d 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
@@ -33,7 +33,6 @@ _allowed_symbols = [
     "CategoricalLogitsNegativeLogProbLoss",
     "OnehotCategoricalLogitsNegativeLogProbLoss",
     "MultiBernoulliNegativeLogProbLoss",
-    "MultiBernoulliNegativeLogProbLoss",
     "insert_slice_in_zeros",
 ]
 
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 1974b07acfc879dc4bc844db9af88fd1043d6698..45a760c9f1013da828a3bff105c0205b6a24243d 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -23,11 +23,14 @@ from tensorflow.contrib.kfac.python.ops import curvature_matrix_vector_products
 from tensorflow.contrib.kfac.python.ops import estimator as est
 # pylint enable=long-line
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.training import gradient_descent
 
@@ -47,8 +50,9 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
                name="KFAC",
                estimation_mode="gradients",
                colocate_gradients_with_ops=True,
-               cov_devices=None,
-               inv_devices=None):
+               batch_size=None,
+               placement_strategy=None,
+               **kwargs):
     """Initializes the KFAC optimizer with the given settings.
 
     Args:
@@ -61,6 +65,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       damping: The damping factor used to stabilize training due to errors in
           the local approximation with the Fisher information matrix, and to
           regularize the update direction by making it closer to the gradient.
+          If damping is adapted during training then this value is used for
+          initializing damping varaible.
           (Higher damping means the update looks more like a standard gradient
           update - see Tikhonov regularization.)
       layer_collection: The layer collection object, which holds the fisher
@@ -86,12 +92,13 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       colocate_gradients_with_ops: Whether we should request gradients we
           compute in the estimator be colocated with their respective ops.
           (Default: True)
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-          computations will be placed on these devices in a round-robin fashion.
-          Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-          computations will be placed on these devices in a round-robin fashion.
-          Can be None, which means that no devices are specified.
+      batch_size: The size of the mini-batch. Only needed when momentum_type
+          == 'qmodel' or when automatic adjustment is used.  (Default: None)
+      placement_strategy: string, Device placement strategy used when creating
+        covariance variables, covariance ops, and inverse ops.
+        (Default: `None`)
+      **kwargs: Arguments to be passesd to specific placement
+        strategy mixin. Check `placement.RoundRobinPlacementMixin` for example.
 
     Raises:
       ValueError: If the momentum type is unsupported.
@@ -100,20 +107,32 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       ValueError: If momentum is non-zero and momentum_type is not 'regular'
           or 'adam'.
     """
-
-    variables = var_list
-    if variables is None:
-      variables = tf_variables.trainable_variables()
-
-    self._fisher_est = est.FisherEstimator(
-        variables,
-        cov_ema_decay,
-        damping,
-        layer_collection,
-        estimation_mode=estimation_mode,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        cov_devices=cov_devices,
-        inv_devices=inv_devices)
+    # Parameters to be passed to the Fisher estimator:
+    self._variables = var_list or tf_variables.trainable_variables
+    self._cov_ema_decay = cov_ema_decay
+    self._layers = layer_collection
+    self._estimation_mode = estimation_mode
+    self._colocate_gradients_with_ops = colocate_gradients_with_ops
+
+    # The below paramaters are required only if damping needs to be adapated.
+    # These parameters can be set by calling
+    # set_damping_adaptation_params() explicitly.
+    self._damping_adaptation_decay = 0.95
+    self._damping_adaptation_interval = 5
+    # Check section 6.5 KFAC paper. omega(1) = pow(damping decay, interval)
+    self._omega = (
+        self._damping_adaptation_decay**self._damping_adaptation_interval)
+    self._adapt_damping = False
+    self._min_damping = 1e-5
+    self._prev_train_batch = None
+    self._is_chief = False
+    self._loss_fn = None
+    self._damping_constant = damping
+    self._damping = None
+    self._rho = None
+    self._prev_loss = None
+    self._q_model_change = None
+    self._update_damping_op = None
 
     momentum_type = momentum_type.lower()
     legal_momentum_types = ["regular", "adam", "qmodel"]
@@ -122,46 +141,91 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       raise ValueError("Unsupported momentum type {}. Must be one of {}."
                        .format(momentum_type, legal_momentum_types))
     if momentum_type != "regular" and norm_constraint is not None:
-      raise ValueError("Update clipping is only supported with momentum"
+      raise ValueError("Update clipping is only supported with momentum "
                        "type 'regular'.")
     if momentum_type not in ["regular", "adam"] and momentum != 0:
       raise ValueError("Momentum must be unspecified if using a momentum_type "
                        "other than 'regular' or 'adam'.")
 
+    # Extra parameters of the optimizer
     self._momentum = momentum
     self._momentum_type = momentum_type
     self._norm_constraint = norm_constraint
-
-    # this is a bit of a hack
-    # TODO(duckworthd): Handle this in a better way (e.g. pass it in?)
-    self._batch_size = array_ops.shape(layer_collection.losses[0].inputs)[0]
-    self._losses = layer_collection.losses
+    self._batch_size = batch_size
+    self._placement_strategy = placement_strategy
+
+    with variable_scope.variable_scope(name):
+      self._fisher_est = est.make_fisher_estimator(
+          placement_strategy=placement_strategy,
+          variables=self._variables,
+          cov_ema_decay=self._cov_ema_decay,
+          damping=self.damping,
+          layer_collection=self._layers,
+          exps=(-1,),
+          estimation_mode=self._estimation_mode,
+          colocate_gradients_with_ops=self._colocate_gradients_with_ops,
+          **kwargs)
 
     super(KfacOptimizer, self).__init__(learning_rate, name=name)
 
-  @property
-  def cov_update_thunks(self):
-    return self._fisher_est.cov_update_thunks
+  def set_damping_adaptation_params(self,
+                                    is_chief,
+                                    prev_train_batch,
+                                    loss_fn,
+                                    min_damping=1e-5,
+                                    damping_adaptation_decay=0.99,
+                                    damping_adaptation_interval=5):
+    """Sets parameters required to adapt damping during training.
 
-  @property
-  def cov_update_ops(self):
-    return self._fisher_est.cov_update_ops
+    When called, enables damping adaptation according to the Levenberg-Marquardt
+    style rule described in Section 6.5 of "Optimizing Neural Networks with
+    Kronecker-factored Approximate Curvature".
 
-  @property
-  def cov_update_op(self):
-    return self._fisher_est.cov_update_op
-
-  @property
-  def inv_update_thunks(self):
-    return self._fisher_est.inv_update_thunks
+    Note that this function creates Tensorflow variables which store a few
+    scalars and are accessed by the ops which update the damping (as part
+    of the training op returned by the minimize() method).
 
-  @property
-  def inv_update_ops(self):
-    return self._fisher_est.inv_update_ops
+    Args:
+      is_chief: `Boolean`, `True` if the worker is chief.
+      prev_train_batch: Training data used to minimize loss in the previous
+        step. This will be used to evaluate loss by calling
+        `loss_fn(prev_train_batch)`.
+      loss_fn: `function` that takes as input training data tensor and returns
+        a scalar loss.
+      min_damping: `float`(Optional), Minimum value the damping parameter
+        can take. Default value 1e-5.
+      damping_adaptation_decay: `float`(Optional), The `damping` parameter is
+        multipled by the `damping_adaptation_decay` every
+        `damping_adaptation_interval` number of iterations. Default value 0.99.
+      damping_adaptation_interval: `int`(Optional), Number of steps in between
+        updating the `damping` parameter. Default value 5.
 
-  @property
-  def inv_update_op(self):
-    return self._fisher_est.inv_update_op
+    Raises:
+      ValueError: If `set_damping_adaptation_params` is already called and the
+        the `adapt_damping` is `True`.
+    """
+    if self._adapt_damping:
+      raise ValueError("Damping adaptation parameters already set.")
+
+    with variable_scope.variable_scope(self.get_name()):
+      self._adapt_damping = True
+      self._is_chief = is_chief
+      self._prev_train_batch = prev_train_batch
+      self._loss_fn = loss_fn
+      self._damping_adaptation_decay = damping_adaptation_decay
+      self._damping_adaptation_interval = damping_adaptation_interval
+      self._omega = (
+          self._damping_adaptation_decay**self._damping_adaptation_interval)
+      self._min_damping = min_damping
+
+      self._rho = variable_scope.get_variable(
+          "rho", shape=(), dtype=dtypes.float32, trainable=False)  # LM ratio.
+      self._prev_loss = variable_scope.get_variable(
+          "prev_loss", shape=(), dtype=dtypes.float32, trainable=False)
+      self._q_model_change = variable_scope.get_variable(
+          "q_model_change", shape=(), dtype=dtypes.float32, trainable=False)
+      self._damping = variable_scope.get_variable(
+          "damping", initializer=self._damping_constant, trainable=False)
 
   @property
   def variables(self):
@@ -169,14 +233,76 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
 
   @property
   def damping(self):
-    return self._fisher_est.damping
+    if self._damping:
+      return self._damping
+    else:
+      return self._damping_constant
+
+  @property
+  def damping_adaptation_interval(self):
+    return self._damping_adaptation_interval
+
+  def make_vars_and_create_op_thunks(self):
+    """Make vars and create op thunks.
+
+    Returns:
+      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+    """
+    scope = self.get_name() + "/" + self._fisher_est.name
+    return self._fisher_est.make_vars_and_create_op_thunks(scope=scope)
+
+  def create_ops_and_vars_thunks(self):
+    """Create thunks that make the ops and vars on demand.
+
+    This function returns 4 lists of thunks: cov_variable_thunks,
+    cov_update_thunks, inv_variable_thunks, and inv_update_thunks.
+
+    The length of each list is the number of factors and the i-th element of
+    each list corresponds to the i-th factor (given by the "factors" property).
+
+    Note that the execution of these thunks must happen in a certain
+    partial order.  The i-th element of cov_variable_thunks must execute
+    before the i-th element of cov_update_thunks (and also the i-th element
+    of inv_update_thunks).  Similarly, the i-th element of inv_variable_thunks
+    must execute before the i-th element of inv_update_thunks.
+
+    TL;DR (oversimplified): Execute the thunks according to the order that
+    they are returned.
+
+    Returns:
+      cov_variable_thunks: A list of thunks that make the cov variables.
+      cov_update_thunks: A list of thunks that make the cov update ops.
+      inv_variable_thunks: A list of thunks that make the inv variables.
+      inv_update_thunks: A list of thunks that make the inv update ops.
+    """
+    scope = self.get_name() + "/" + self._fisher_est.name
+    return self._fisher_est.create_ops_and_vars_thunks(scope=scope)
 
   def minimize(self, *args, **kwargs):
-    kwargs["var_list"] = kwargs.get("var_list") or self.variables
-    if set(kwargs["var_list"]) != set(self.variables):
-      raise ValueError("var_list doesn't match with set of Fisher-estimating "
-                       "variables.")
-    return super(KfacOptimizer, self).minimize(*args, **kwargs)
+    # Should this variable scope encompass everything below?  Or will the super-
+    # class make another copy of the same name scope?
+    with variable_scope.variable_scope(self.get_name()):
+      kwargs["var_list"] = kwargs.get("var_list") or self.variables
+      if set(kwargs["var_list"]) != set(self.variables):
+        raise ValueError("var_list doesn't match with set of Fisher-estimating "
+                         "variables.")
+      if self._adapt_damping and self._is_chief:
+        global_step = kwargs.get("global_step", None)
+        if not global_step:
+          raise KeyError("global_step needs to be passed to optimizer.minimize "
+                         "if damping parameter is adapted.")
+        update_damping_op = self._update_damping(self._prev_train_batch,
+                                                 global_step)
+        with ops.control_dependencies([update_damping_op]):
+          loss = args[0]
+          loss_assign_op = state_ops.assign(self._prev_loss, loss)
+          train_op = super(KfacOptimizer, self).minimize(*args, **kwargs)
+          return control_flow_ops.group(loss_assign_op, train_op)
+      else:
+        return super(KfacOptimizer, self).minimize(*args, **kwargs)
 
   def compute_gradients(self, *args, **kwargs):
     # args[1] could be our var_list
@@ -185,6 +311,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     else:
       kwargs["var_list"] = kwargs.get("var_list") or self.variables
       var_list = kwargs["var_list"]
+
     if set(var_list) != set(self.variables):
       raise ValueError("var_list doesn't match with set of Fisher-estimating "
                        "variables.")
@@ -296,6 +423,20 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     coeff = self._update_clip_coeff(grads_and_vars, precon_grads_and_vars)
     return [(pgrad * coeff, var) for pgrad, var in precon_grads_and_vars]
 
+  def _compute_prev_updates(self, variables):
+    """Computes previous updates as negative velocities scaled by learning rate.
+
+    Args:
+      variables: List of variables in the graph that the update will be
+          applied to.
+
+    Returns:
+      List of previous updates applied to the `variables`.
+    """
+    return list(
+        -1 * self._learning_rate * self._zeros_slot(var, "velocity", self._name)
+        for var in variables)
+
   def _compute_qmodel_hyperparams(self, precon_grads, prev_updates, grads,
                                   variables):
     """Compute optimal update hyperparameters from the quadratic model.
@@ -336,12 +477,12 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
                     = qmodel(alpha*precon_grad + mu*prev_update) - L(theta).
     """
 
-    cmvpc = cmvp.CurvatureMatrixVectorProductComputer(self._losses, variables)
+    cmvpc = cmvp.CurvatureMatrixVectorProductComputer(self._layers.losses,
+                                                      variables)
 
     # compute the matrix-vector products with the transposed Fisher factor
     fft_precon_grads = cmvpc.multiply_fisher_factor_transpose(precon_grads)
     fft_prev_updates = cmvpc.multiply_fisher_factor_transpose(prev_updates)
-
     batch_size = math_ops.cast(
         self._batch_size, dtype=fft_precon_grads[0].dtype)
 
@@ -374,9 +515,9 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       c = ops.convert_to_tensor([[_inner_product_list(grads, precon_grads)],
                                  [_inner_product_list(grads, prev_updates)]])
 
-      sol = _two_by_two_solve(m, c)
-      alpha = -sol[0]
-      mu = -sol[1]
+      sol = -1. * _two_by_two_solve(m, c)
+      alpha = sol[0]
+      mu = sol[1]
       qmodel_change = 0.5 * math_ops.reduce_sum(sol * c)
 
       return alpha, mu, qmodel_change
@@ -404,6 +545,52 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     return control_flow_ops.cond(
         math_ops.equal(m_22, 0.0), zero_prevupd_case, non_zero_prevupd_case)
 
+  def _assign_q_model_change(self, q_model_change):
+    """Assigns `q_model_change` to `self._q_model_change` if damping is adapted.
+
+    Note only the chief worker does the assignment.
+
+    Args:
+      q_model_change: Scalar tensor of type `float32`.
+
+    Returns:
+      If `adapt_damping` is `True` then returns an assign op, Otherwise returns
+      a no_op().
+    """
+    if self._adapt_damping and self._is_chief:
+      q_model_assign_op = state_ops.assign(self._q_model_change, q_model_change)
+    else:
+      q_model_assign_op = control_flow_ops.no_op()
+    return q_model_assign_op
+
+  def _compute_qmodel_hyperparams_wrapper(self, grads_and_vars,
+                                          precon_grads_and_vars):
+    """Wrapper function for `self._compute_qmodel_hyperparams`.
+
+    Constructs a list of preconditioned gradients and variables. Also creates a
+    op to asssign the computed q model change to `self._q_model_change`.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs.
+      precon_grads_and_vars: List of (preconditioned gradients, variable)
+        pairs.
+
+    Returns:
+      (alpha, mu, q_model_assign_op), where alpha and mu are chosen to optimize
+      the quadratic model, `q_model_assign_op` assigns the computed q model
+      change to `self._q_model_change`.
+    """
+    precon_grads = list(
+        precon_grad for (precon_grad, _) in precon_grads_and_vars)
+    grads = list(grad for (grad, _) in grads_and_vars)
+    variables = list(var for (_, var) in grads_and_vars)
+    prev_updates = self._compute_prev_updates(variables)
+    # Compute optimal velocity update parameters according to quadratic model
+    alpha, mu, q_model_change = self._compute_qmodel_hyperparams(
+        precon_grads, prev_updates, grads, variables)
+
+    return alpha, mu, self._assign_q_model_change(q_model_change)
+
   def _compute_update_steps(self, grads_and_vars):
     """Computes the update steps for the variables given the gradients.
 
@@ -411,8 +598,10 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       grads_and_vars: List of (gradient, variable) pairs.
 
     Returns:
-      An 'Operation that computes the update steps for the given variables.
+      A list of tuple (assign_op ,var) where `assign_op` assigns the update
+      steps to `var`.
     """
+
     if self._momentum_type == "regular":
       # Compute "preconditioned" gradient.
       precon_grads_and_vars = self._fisher_est.multiply_inverse(grads_and_vars)
@@ -423,8 +612,13 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
                                                    precon_grads_and_vars)
 
       # Update the velocity with this and return it as the step.
-      return self._update_velocities(precon_grads_and_vars, self._momentum)
-
+      if self._adapt_damping and self._is_chief:
+        _, _, q_model_assign_op = self._compute_qmodel_hyperparams_wrapper(
+            grads_and_vars, precon_grads_and_vars)
+        with ops.control_dependencies([q_model_assign_op]):
+          return self._update_velocities(precon_grads_and_vars, self._momentum)
+      else:
+        return self._update_velocities(precon_grads_and_vars, self._momentum)
     elif self._momentum_type == "adam":
       # Update velocity.
       velocities_and_vars = self._update_velocities(grads_and_vars,
@@ -436,23 +630,13 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       # Compute "preconditioned" gradient.
       precon_grads_and_vars = self._fisher_est.multiply_inverse(grads_and_vars)
 
-      # Extract out singleton lists from the tuple-lists
-      precon_grads = list(
-          precon_grad for (precon_grad, _) in precon_grads_and_vars)
-      grads = list(grad for (grad, _) in grads_and_vars)
-      variables = list(var for (_, var) in grads_and_vars)
-      # previous updates are the negative velocities (up to scaling by LR)
-      prev_updates = list(
-          -self._zeros_slot(var, "velocity", self._name) for var in variables)
-
       # Compute optimal velocity update parameters according to quadratic model
-      alpha, mu, _ = self._compute_qmodel_hyperparams(
-          precon_grads, prev_updates, grads, variables)
+      alpha, mu, q_model_assign_op = self._compute_qmodel_hyperparams_wrapper(
+          grads_and_vars, precon_grads_and_vars)
 
-      # Update the velocity with precon_grads according to these params
-      # and return it as the step.
-      return self._update_velocities(
-          precon_grads_and_vars, mu, vec_coeff=-alpha)
+      with ops.control_dependencies([q_model_assign_op]):
+        return self._update_velocities(
+            precon_grads_and_vars, mu, vec_coeff=-alpha)
 
   def _update_velocities(self, vecs_and_vars, decay, vec_coeff=1.0):
     """Updates the velocities of the variables with the given vectors.
@@ -482,6 +666,50 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     # Go through variable and update its associated part of the velocity vector.
     return [_update_velocity(vec, var) for vec, var in vecs_and_vars]
 
+  def _update_damping(self, prev_batch, global_step):
+    """Adapts damping parameter. Check KFAC (Section 6.5) for the details.
+
+    The damping parameter is updated according to the Levenberg-Marquardt rule
+    every `self._damping_adaptation_interval` iterations.
+
+    Args:
+      prev_batch: Tensor or tuple of tensors which can be passed to
+        `self._loss_fn` to evaluate loss.
+      global_step: `Variable` which keeps track of number of times the training
+        variables have been updated.
+    Returns:
+      A `tf.cond` op which updates the damping parameter.
+    """
+    def compute_damping():
+      """"Adapts damping parameter based on "reduction ratio".
+
+      Reduction ratio captures how closely the quadratic approximation to the
+      loss function approximates the actual loss within a trust region. The
+      damping update tries to make the damping as small as possible while
+      maintaining the property that the quadratic model remains a good local
+      approximation to the loss function.
+
+      Returns:
+        An Op to assign newly computed damping value to `self._damping`.
+      """
+      prev_batch_loss = self._loss_fn(prev_batch)
+      with ops.control_dependencies([prev_batch_loss]):
+        rho_assign = self._rho.assign(
+            (prev_batch_loss - self._prev_loss) / self._q_model_change)
+        with ops.control_dependencies([rho_assign]):
+          new_damping = control_flow_ops.case(
+              [(self._rho < 0.25, lambda: self.damping / self._omega),
+               (self._rho > 0.75, lambda: self.damping * self._omega)],
+              lambda: self.damping)
+          with ops.control_dependencies([new_damping]):
+            new_damping_min = math_ops.maximum(new_damping, self._min_damping)
+            return control_flow_ops.group(self._damping.assign(new_damping_min))
+
+    return control_flow_ops.cond(
+        math_ops.equal(
+            math_ops.mod(global_step + 1, self._damping_adaptation_interval),
+            0), compute_damping, control_flow_ops.no_op)
+
 
 def _inner_product_list(list1, list2):
   return math_ops.add_n(
diff --git a/tensorflow/contrib/kfac/python/ops/placement.py b/tensorflow/contrib/kfac/python/ops/placement.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a20ebe19844e62bf112dbafce1f816413ea7878
--- /dev/null
+++ b/tensorflow/contrib/kfac/python/ops/placement.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements placement strategies for cov and inv ops, cov variables."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.framework import ops as tf_ops
+
+
+def _make_thunk_on_device(func, device):
+  def thunk():
+    with tf_ops.device(device):
+      return func()
+  return thunk
+
+
+class RoundRobinPlacementMixin(object):
+  """Implements round robin placement strategy for ops and variables."""
+
+  def __init__(self, cov_devices=None, inv_devices=None, **kwargs):
+    """Initializes the RoundRobinPlacementMixin class.
+
+    Args:
+      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
+        computations will be placed on these devices in a round-robin fashion.
+        Can be None, which means that no devices are specified.
+      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
+        computations will be placed on these devices in a round-robin fashion.
+        Can be None, which means that no devices are specified.
+      **kwargs: Need something here?
+
+    """
+    super(RoundRobinPlacementMixin, self).__init__(**kwargs)
+    self._cov_devices = cov_devices
+    self._inv_devices = inv_devices
+
+  def make_vars_and_create_op_thunks(self, scope=None):
+    """Make vars and create op thunks w/ a round-robin device placement strat.
+
+    For each factor, all of that factor's cov variables and their associated
+    update ops will be placed on a particular device.  A new device is chosen
+    for each factor by cycling through list of devices in the
+    `self._cov_devices` attribute. If `self._cov_devices` is `Non`e then no
+    explicit device placement occurs.
+
+    An analogous strategy is followed for inverse update ops, with the list of
+    devices being given by the `self._inv_devices` attribute.
+
+    Inverse variables on the other hand are not placed on any specific device
+    (they will just use the current the device placement context, whatever
+    that happens to be).  The idea is that the inverse variable belong where
+    they will be accessed most often, which is the device that actually applies
+    the preconditioner to the gradient. The user will be responsible for setting
+    the device context for this.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All variables will be created,
+        and all thunks will execute, inside of a variable scope of the given
+        name. (Default: None)
+
+    Returns:
+      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+    """
+    # Note: `create_ops_and_vars_thunks` is implemented in `FisherEstimator`.
+    (cov_variable_thunks_raw, cov_update_thunks_raw, inv_variable_thunks_raw,
+     inv_update_thunks_raw) = self.create_ops_and_vars_thunks(scope=scope)
+
+    if self._cov_devices:
+      cov_update_thunks = []
+      for cov_variable_thunk, cov_update_thunk, device in zip(
+          cov_variable_thunks_raw, cov_update_thunks_raw,
+          itertools.cycle(self._cov_devices)):
+        with tf_ops.device(device):
+          cov_variable_thunk()
+        cov_update_thunks.append(_make_thunk_on_device(cov_update_thunk,
+                                                       device))
+    else:
+      for cov_variable_thunk in cov_variable_thunks_raw:
+        cov_variable_thunk()
+      cov_update_thunks = cov_update_thunks_raw
+
+    for inv_variable_thunk in inv_variable_thunks_raw:
+      inv_variable_thunk()
+
+    if self._inv_devices:
+      inv_update_thunks = []
+      for inv_update_thunk, device in zip(inv_update_thunks_raw,
+                                          itertools.cycle(self._inv_devices)):
+        inv_update_thunks.append(_make_thunk_on_device(inv_update_thunk,
+                                                       device))
+    else:
+      inv_update_thunks = inv_update_thunks_raw
+
+    return cov_update_thunks, inv_update_thunks
diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py
index f5bd97cb4e7d547394050e944f75b43a40887f34..144295f4c7e36f61b4bae4178a6f57f6657204c5 100644
--- a/tensorflow/contrib/kfac/python/ops/utils.py
+++ b/tensorflow/contrib/kfac/python/ops/utils.py
@@ -24,11 +24,13 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
@@ -233,6 +235,13 @@ posdef_eig_functions = {
 }
 
 
+def cholesky(tensor, damping):
+  """Computes the inverse of tensor + damping * identity."""
+  identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
+  damping = math_ops.cast(damping, dtype=tensor.dtype)
+  return linalg_ops.cholesky(tensor + damping * identity)
+
+
 class SubGraph(object):
   """Defines a subgraph given by all the dependencies of a given set of outputs.
   """
@@ -241,19 +250,22 @@ class SubGraph(object):
     # Set of all ancestor Tensors, Ops to 'outputs'.
     self._members = set()
 
-    self._recurse_add(outputs)
+    self._iter_add(outputs)
 
-  def _recurse_add(self, nodes):
-    """Recursively adds all of nodes' ancestors."""
-    for node in nodes:
-      if node in self._members:
-        continue
-      self._members.add(node)
+  def _iter_add(self, root):
+    """Iteratively adds all of nodes' ancestors using depth first search."""
+    stack = [root]
+    while stack:
+      nodes = stack.pop()
+      for node in nodes:
+        if node in self._members:
+          continue
+        self._members.add(node)
 
-      if isinstance(node, ops.Tensor):
-        self._recurse_add((node.op,))
-      elif isinstance(node, ops.Operation):
-        self._recurse_add(node.inputs)
+        if isinstance(node, ops.Tensor):
+          stack.append((node.op,))
+        elif isinstance(node, ops.Operation):
+          stack.append(node.inputs)
 
   def is_member(self, node):
     """Check if 'node' is in this subgraph."""
@@ -427,13 +439,138 @@ def batch_execute(global_step, thunks, batch_size, name=None):
     return result
 
 
-def matmul_sparse_dense(A, B, name=None):  # pylint: disable=invalid-name
+def extract_convolution_patches(inputs,
+                                filter_shape,
+                                padding,
+                                strides=None,
+                                dilation_rate=None,
+                                name=None,
+                                data_format=None):
+  """Extracts inputs to each output coordinate in tf.nn.convolution.
+
+  This is a generalization of tf.extract_image_patches() to tf.nn.convolution(),
+  where the number of spatial dimensions may be something other than 2.
+
+  Assumes,
+  - First dimension of inputs is batch_size
+  - Convolution filter is applied to all input channels.
+
+  Args:
+    inputs: Tensor of shape [batch_size, ..spatial_image_shape..,
+      ..spatial_filter_shape.., in_channels]. Inputs to tf.nn.convolution().
+    filter_shape: List of ints. Shape of filter passed to tf.nn.convolution().
+    padding: string. Padding method. One of "VALID", "SAME".
+    strides: None or list of ints. Strides along spatial dimensions.
+    dilation_rate: None or list of ints. Dilation along spatial dimensions.
+    name: None or str. Name of Op.
+    data_format: None or str. Format of data.
+
+  Returns:
+    Tensor of shape [batch_size, ..spatial_image_shape..,
+      ..spatial_filter_shape.., in_channels]
+
+  Raises:
+    ValueError: If data_format does not put channel last.
+    ValueError: If inputs and filter disagree on in_channels.
+  """
+  if not is_data_format_channel_last(data_format):
+    raise ValueError("Channel must be last dimension.")
+  with ops.name_scope(name, "extract_convolution_patches",
+                      [inputs, filter_shape, padding, strides, dilation_rate]):
+    batch_size = inputs.shape.as_list()[0]
+    in_channels = inputs.shape.as_list()[-1]
+
+    # filter_shape = spatial_filter_shape + [in_channels, out_channels]
+    spatial_filter_shape = filter_shape[:-2]
+    if in_channels != filter_shape[-2]:
+      raise ValueError("inputs and filter_shape must agree on in_channels.")
+
+    # Map each input feature to a location in the output.
+    out_channels = np.prod(spatial_filter_shape) * in_channels
+    filters = linalg_ops.eye(out_channels)
+    filters = array_ops.reshape(
+        filters,
+        list(spatial_filter_shape) + [in_channels, out_channels])
+
+    result = nn_ops.convolution(
+        inputs,
+        filters,
+        padding=padding,
+        strides=strides,
+        dilation_rate=dilation_rate)
+    spatial_output_shape = result.shape.as_list()[1:-1]
+    result = array_ops.reshape(result,
+                               [batch_size or -1] + spatial_output_shape +
+                               list(spatial_filter_shape) + [in_channels])
+
+    return result
+
+
+def extract_pointwise_conv2d_patches(inputs,
+                                     filter_shape,
+                                     name=None,
+                                     data_format=None):
+  """Extract patches for a 1x1 conv2d.
+
+  Args:
+    inputs: 4-D Tensor of shape [batch_size, height, width, in_channels].
+    filter_shape: List of 4 ints. Shape of filter to apply with conv2d()
+    name: None or str. Name for Op.
+    data_format: None or str. Format for data. See 'data_format' in
+      tf.nn.conv2d() for details.
+
+  Returns:
+    Tensor of shape [batch_size, ..spatial_input_shape..,
+    ..spatial_filter_shape.., in_channels]
+
+  Raises:
+    ValueError: if inputs is not 4-D.
+    ValueError: if filter_shape is not [1, 1, ?, ?]
+    ValueError: if data_format is not channels-last.
+  """
+  if inputs.shape.ndims != 4:
+    raise ValueError("inputs must have 4 dims.")
+  if len(filter_shape) != 4:
+    raise ValueError("filter_shape must have 4 dims.")
+  if filter_shape[0] != 1 or filter_shape[1] != 1:
+    raise ValueError("filter_shape must have shape 1 along spatial dimensions.")
+  if not is_data_format_channel_last(data_format):
+    raise ValueError("data_format must be channels last.")
+  with ops.name_scope(name, "extract_pointwise_conv2d_patches",
+                      [inputs, filter_shape]):
+    ksizes = [1, 1, 1, 1]  # Spatial shape is 1x1.
+    strides = [1, 1, 1, 1]  # Operate on all pixels.
+    rates = [1, 1, 1, 1]  # Dilation has no meaning with spatial shape = 1.
+    padding = "VALID"  # Doesn't matter.
+    result = array_ops.extract_image_patches(inputs, ksizes, strides, rates,
+                                             padding)
+
+    batch_size, input_height, input_width, in_channels = inputs.shape.as_list()
+    filter_height, filter_width, in_channels, _ = filter_shape
+    return array_ops.reshape(result, [
+        batch_size, input_height, input_width, filter_height, filter_width,
+        in_channels
+    ])
+
+
+def is_data_format_channel_last(data_format):
+  """True if data_format puts channel last."""
+  if data_format is None:
+    return True
+  return data_format.endswith("C")
+
+
+def matmul_sparse_dense(A, B, name=None, transpose_a=False, transpose_b=False):  # pylint: disable=invalid-name
   """Computes matmul(A, B) where A is sparse, B is dense.
 
   Args:
     A: tf.IndexedSlices with dense shape [m, n].
     B: tf.Tensor with shape [n, k].
     name: str. Name of op.
+    transpose_a: Bool. If true we transpose A before multiplying it by B.
+      (Default: False)
+    transpose_b: Bool. If true we transpose B before multiplying it by A.
+      (Default: False)
 
   Returns:
     tf.IndexedSlices resulting from matmul(A, B).
@@ -447,7 +584,8 @@ def matmul_sparse_dense(A, B, name=None):  # pylint: disable=invalid-name
       raise ValueError("A must represent a matrix. Found: %s." % A)
     if B.shape.ndims != 2:
       raise ValueError("B must be a matrix.")
-    new_values = math_ops.matmul(A.values, B)
+    new_values = math_ops.matmul(
+        A.values, B, transpose_a=transpose_a, transpose_b=transpose_b)
     return ops.IndexedSlices(
         new_values,
         A.indices,
@@ -479,5 +617,93 @@ def matmul_diag_sparse(A_diag, B, name=None):  # pylint: disable=invalid-name
     a = array_ops.reshape(a, list(a.shape) + [1] * (B.values.shape.ndims - 1))
     return ops.IndexedSlices(a * B.values, B.indices, dense_shape=B.dense_shape)
 
+
+class PartitionedTensor(object):
+  """A Tensor partitioned across its 0-th dimension."""
+
+  def __init__(self, tensors):
+    """Initializes PartitionedTensor.
+
+    Args:
+      tensors: List of Tensors. All Tensors must agree on shape (excepting
+        batch dimension) and dtype.
+
+    Raises:
+      ValueError: If 'tensors' has length zero.
+      ValueError: if contents of 'tensors' don't agree on shape or dtype.
+    """
+    if not tensors:
+      raise ValueError("tensors must be a list of 1+ Tensors.")
+
+    dtype = tensors[0].dtype
+    if not all(tensor.dtype == dtype for tensor in tensors):
+      raise ValueError("all tensors must have dtype = %s." % dtype)
+
+    shape = tensors[0].shape[1:]
+    if not all(tensor.shape[1:] == shape for tensor in tensors):
+      raise ValueError("All tensors must have shape = %s (excluding batch "
+                       "dimension)." % shape)
+
+    self.tensors = tensors
+    self._concats = {}  # {device: Tensor}
+
+  @property
+  def shape(self):
+    feature_shape = self.tensors[0].shape[1:]
+    batch_size = sum([tensor.shape[0] for tensor in self.tensors],
+                     tensor_shape.Dimension(0))
+    return tensor_shape.TensorShape([batch_size]).concatenate(feature_shape)
+
+  def get_shape(self):
+    return self.shape
+
+  @property
+  def dtype(self):
+    return self.tensors[0].dtype
+
+  def __str__(self):
+    return "PartitionedTensor([%s, ...], dtype=%s, shape=%s)" % (
+        self.tensors[0].name, self.dtype.name, tuple(self.shape.as_list()))
+
+  def __hash__(self):
+    return hash(tuple(self.tensors))
+
+  def __eq__(self, other):
+    if not isinstance(other, PartitionedTensor):
+      return False
+    return self.tensors == other.tensors
+
+  def __ne__(self, other):
+    return not self == other  # pylint: disable=g-comparison-negation
+
+  def __getitem__(self, key):
+    return self.as_tensor()[key]
+
+  def as_tensor(self, dtype=None, name=None, as_ref=False):
+    with ops.name_scope(name, "PartitionedTensor.as_tensor", self.tensors):
+      assert not as_ref
+      assert dtype in [None, self.dtype]
+      result = array_ops.concat(self.tensors, axis=0)
+
+      # Cache 'result' if we haven't already cached a value for this device.
+      if result.device not in self._concats:
+        self._concats[result.device] = result
+      return self._concats[result.device]
+
+  @property
+  def device(self):
+    # PartitionedTensors in general do not live on a single device.  If the
+    # device cannot be determined unambiguously this property will return None.
+    device = self.tensors[0].device
+    if all(tensor.device == device for tensor in self.tensors):
+      return device
+    return None
+
+
+ops.register_tensor_conversion_function(
+    PartitionedTensor,
+    lambda val, dtype, name, as_ref: val.as_tensor(dtype, name, as_ref))
+
+
 # TODO(b/69623235): Add a function for finding tensors that share gradients
 # to eliminate redundant fisher factor computations.
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
index 8e424a794691484fdea7d8481677aa641c433d4c..330d222dbf70fcfa02ffd47261c0513d9dd6e0e9 100644
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py
@@ -40,6 +40,9 @@ _allowed_symbols = [
     "fwd_gradients",
     "ensure_sequence",
     "batch_execute",
+    "extract_convolution_patches",
+    "extract_pointwise_conv2d_patches",
+    "is_data_format_channel_last",
     "matmul_sparse_dense",
     "matmul_diag_sparse",
 ]
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 894e6f6946bb59810a9da2d304cc0dd43d25201d..c8812d4b23f94102d093db878a709b090a3318d6 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -70,6 +70,7 @@ py_test(
         "python/ops/core_test.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":_typecheck",
         ":core",
@@ -213,14 +214,3 @@ py_test(
         "//tensorflow/python:math_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index 0727f4cf88728dc3d919e662d65c93a658ac730b..39e9d65407f3b1e79804317023ea03dd81484ff5 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -660,7 +660,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, {('channel', 'hihowareyou')})
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
@@ -668,7 +668,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, ('channel', 'hihowareyou'))
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 852d06e1e3cc8f8deecd15b7436cd4e4a393ad66..d5b3b279a1b7327602790c0260349cb0c758aa86 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -188,6 +188,7 @@ py_test(
     size = "small",
     srcs = ["python/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/contrib/framework:framework_py",
@@ -353,6 +354,7 @@ py_test(
     size = "small",
     srcs = ["python/ops/sparse_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/python:array_ops",
@@ -390,15 +392,3 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 337c9e06b870b2cca53fcdbf3d94225660e193c4..00f03a111ae8be7f49761ef5fb5a82810bcca182 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -104,6 +104,7 @@ See the @{$python/contrib.layers} guide.
 @@infer_real_valued_columns
 @@sequence_input_from_feature_columns
 
+@@group_norm
 @@instance_norm
 """
 
@@ -122,6 +123,7 @@ _allowed_symbols = ['bias_add',
                     'conv3d',
                     'elu',
                     'feature_column',
+                    'group_norm',
                     'instance_norm',
                     'legacy_fully_connected',
                     'legacy_linear',
diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD
index e407a9ce015603094c7bbab72856403e2f0eb1a1..7aae09ff3e9995b2d92b05211b3bf8a94a26ff43 100644
--- a/tensorflow/contrib/layers/kernels/BUILD
+++ b/tensorflow/contrib/layers/kernels/BUILD
@@ -18,14 +18,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
index f701647c2b297015f025eb53bd191a1a8c54ec62..28ddaa69a14776e0c157c2e68105ee9e17bc3cbb 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
+++ b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
@@ -200,7 +200,7 @@ class SparseCrossOpTest(test.TestCase):
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_large_batch(self):
-    """Tests with large batch size to force multithreding.
+    """Tests with large batch size to force multithreading.
     """
     batch_size = 5000
     col1 = []
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index b62e3050cd7003f1ba72061b133ff9b5d6b616da..49c3faf3b7f5eaa3b1542a1fdddcfaff99737a24 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -140,6 +140,9 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != "sum":
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -188,13 +191,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 def scattered_embedding_lookup(params,
                                values,
                                dimension,
@@ -470,7 +483,7 @@ def embedding_lookup_unique(params, ids, name=None):
     ids = ops.convert_to_tensor(ids)
     shape = array_ops.shape(ids)
     ids_flat = array_ops.reshape(
-        ids, math_ops.reduce_prod(shape, keep_dims=True))
+        ids, math_ops.reduce_prod(shape, keepdims=True))
     unique_ids, idx = array_ops.unique(ids_flat)
     unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids)
     embeds_flat = array_ops.gather(unique_embeddings, idx)
diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py
index 89c9d37bd09cb6c43eebb91f3a16600eae9cb490..f42112206d0db9d2e42bd4cff19f6a6533951d46 100644
--- a/tensorflow/contrib/layers/python/layers/encoders.py
+++ b/tensorflow/contrib/layers/python/layers/encoders.py
@@ -125,7 +125,7 @@ def embed_sequence(ids,
       `reuse` is `None` or `False`.
   """
   if not (reuse or (vocab_size and embed_dim)):
-    raise ValueError('Must specify vocab size and embedding dimension when not'
+    raise ValueError('Must specify vocab size and embedding dimension when not '
                      'reusing. Got vocab_size=%s and embed_dim=%s' % (
                          vocab_size, embed_dim))
   with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 9ccb589d698ad83c9654f5523ccdcb35b031b3da..3ae07cedab0be2da8ec633cfd84e07cfdfb11457 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -48,7 +48,7 @@ you should choose depends on (1) the feature type and (2) the model type.
    recommended.
 
      embedded_dept_column = embedding_column(
-       sparse_column_with_keys("department", ["math", "philosphy", ...]),
+       sparse_column_with_keys("department", ["math", "philosophy", ...]),
        dimension=10)
 
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 78affea44cbfb92523063968dbc1be98841854db..06060b99e7e58787994f20f037ffa451abbc7459 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -815,7 +815,7 @@ class _Transformer(object):
   """
 
   def __init__(self, columns_to_tensors):
-    """Initializes transfomer.
+    """Initializes transformer.
 
     Args:
       columns_to_tensors: A mapping from feature columns to tensors. 'string'
@@ -908,7 +908,7 @@ def _gather_feature_columns(feature_columns):
 
 
 def _check_forbidden_sequence_columns(feature_columns):
-  """Recursively cecks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
+  """Recursively checks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
   all_feature_columns = _gather_feature_columns(feature_columns)
   for feature_column in all_feature_columns:
     if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS):
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index e27b36908eba7cc1b992079b1abb5c5367340de1..2f3e57653c5d6d949c4dcc91635690322b7f90c4 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -51,7 +51,6 @@ from tensorflow.python.ops import standard_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.training import moving_averages
-from tensorflow.python.layers.maxout import maxout
 
 # TODO(b/28426988): Replace legacy_* fns migrated from slim.
 # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
@@ -933,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -994,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality, when set it would use the
+      corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
+      leaved to None it would select the convolution dimensionality based on
+      the input rank (i.e. Conv ND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1016,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1062,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
 
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -1405,13 +1536,14 @@ def convolution3d_transpose(
 @add_arg_scope
 def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   """Converts a dense tensor into a sparse tensor.
+
   An example use would be to convert dense labels to sparse ones
   so that they can be fed to the ctc_loss.
 
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signfies the end of a sentence.
+       It is part of the target label that signifies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1555,7 +1687,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
@@ -2192,11 +2324,16 @@ def images_to_sequence(inputs,
                        outputs_collections=None,
                        scope=None):
   """Convert a batch of images into a batch of sequences.
+
   Args:
     inputs: a (num_images, height, width, depth) tensor
     data_format: A string. `NHWC` (default) and `NCHW` are supported.
     outputs_collections: The collections to which the outputs are added.
     scope: Optional scope for name_scope.
+
+  Raises:
+     ValueError: If `data_format` is not either NCHW or NHWC.
+
   Returns:
     (width, num_images*height, depth) sequence tensor
   """
@@ -2702,6 +2839,7 @@ def sequence_to_images(inputs,
                        outputs_collections=None,
                        scope=None):
   """Convert a batch of sequences into a batch of images.
+
   Args:
     inputs: (num_steps, num_batches, depth) sequence tensor
     height: the height of the images
@@ -2709,6 +2847,7 @@ def sequence_to_images(inputs,
       Currently supports `'channels_first'` and `'channels_last'`.
     outputs_collections: The collections to which the outputs are added.
     scope: Optional scope for name_scope.
+
   Returns:
     A tensor representing the output of the operation.
   """
@@ -2718,7 +2857,7 @@ def sequence_to_images(inputs,
     if num_batches is None:
       num_batches = -1
     else:
-      num_batches = num_batches // height
+      num_batches //= height
     reshaped = array_ops.reshape(inputs,
                                  [width, num_batches, height, depth])
     if output_data_format == 'channels_first':
@@ -2748,7 +2887,7 @@ def softmax(logits, scope=None):
     logits_2d = array_ops.reshape(logits, [-1, num_logits])
     predictions = nn.softmax(logits_2d)
     predictions = array_ops.reshape(predictions, array_ops.shape(logits))
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       predictions.set_shape(logits.get_shape())
     return predictions
 
@@ -2941,6 +3080,53 @@ def unit_norm(inputs, dim, epsilon=1e-7, scope=None):
     return math_ops.div(inputs, array_ops.tile(lengths, multiples))
 
 
+@add_arg_scope
+def maxout(inputs, num_units, axis=-1, scope=None):
+  """Adds a maxout op from https://arxiv.org/abs/1302.4389
+
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville,
+   Yoshua Bengio
+
+  Usually the operation is performed in the filter/channel dimension. This can
+  also be
+  used after fully-connected layers to reduce number of features.
+
+  Arguments:
+    inputs: Tensor input
+    num_units: Specifies how many features will remain after maxout
+      in the `axis` dimension (usually channel).
+      This must be multiple of number of `axis`.
+    axis: The dimension where max pooling will be performed. Default is the
+    last dimension.
+    scope: Optional scope for variable_scope.
+
+  Returns:
+    A `Tensor` representing the results of the pooling operation.
+
+  Raises:
+    ValueError: if num_units is not multiple of number of features.
+  """
+  with variable_scope.variable_scope(scope, 'MaxOut', [inputs]):
+    inputs = ops.convert_to_tensor(inputs)
+    shape = inputs.get_shape().as_list()
+    num_channels = shape[axis]
+    if num_channels % num_units:
+      raise ValueError('number of features({}) is not '
+                       'a multiple of num_units({})'.format(
+                           num_channels, num_units))
+    shape[axis] = -1
+    shape += [num_channels // num_units]
+
+    # Dealing with batches with arbitrary sizes
+    for i in range(len(shape)):
+      if shape[i] is None:
+        shape[i] = array_ops.shape(inputs)[i]
+    outputs = math_ops.reduce_max(
+        array_ops.reshape(inputs, shape), -1, keepdims=False)
+    return outputs
+
+
 def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
   """Project into the Poincare ball with norm <= 1.0 - epsilon.
 
@@ -2999,16 +3185,16 @@ def legacy_fully_connected(x,
   `activation_fn` is `None`, the result of `y = w * x + b` is
   returned.
 
-  If `x` has shape [\\\(\\text{dim}_0, \\text{dim}_1, ..., \\text{dim}_n\\\)]
-  with more than 2 dimensions (\\\(n > 1\\\)), then we repeat the matrix
+  If `x` has shape [\\(\text{dim}_0, \text{dim}_1, ..., \text{dim}_n\\)]
+  with more than 2 dimensions (\\(n > 1\\)), then we repeat the matrix
   multiply along the first dimensions. The result r is a tensor of shape
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`],
-  where \\\( r_{i_0, ..., i_{n-1}, k} =
-  \\sum_{0 \\leq j < \\text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\\).
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`],
+  where \\( r_{i_0, ..., i_{n-1}, k} =
+  \sum_{0 \leq j < \text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\).
   This is accomplished by reshaping `x` to 2-D
-  [\\\(\\text{dim}_0 \\cdot ... \\cdot \\text{dim}_{n-1}, \\text{dim}_n\\\)]
+  [\\(\text{dim}_0 \cdot ... \cdot \text{dim}_{n-1}, \text{dim}_n\\)]
   before the matrix multiply and afterwards reshaping it to
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`].
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`].
 
   This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting
   `bias_init` to `None`.
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 0f062adbab3ca9acfb89543b69c7c957bbdf5dd8..b01fd5d5c95ac15c76f9dbe7c77f7e76f12149a9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
@@ -4135,5 +4146,31 @@ class LegacyFullyConnectedTest(test.TestCase):
         _layers.legacy_fully_connected(x, 2, activation_fn=nn_ops.softmax)
 
 
+class MaxOutTest(test.TestCase):
+
+  def test_simple(self):
+    inputs = random_ops.random_uniform((64, 10, 36), seed=1)
+    graph = _layers.maxout(inputs, num_units=3)
+    self.assertEqual(graph.get_shape().as_list(), [64, 10, 3])
+
+  def test_fully_connected(self):
+    inputs = random_ops.random_uniform((64, 50), seed=1)
+    graph = _layers.fully_connected(inputs, 50)
+    graph = _layers.maxout(graph, num_units=10)
+    self.assertEqual(graph.get_shape().as_list(), [64, 10])
+
+  def test_nchw(self):
+    inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1)
+    graph = _layers.conv2d(inputs, 10, 3, padding='SAME')
+    graph = _layers.maxout(graph, num_units=1)
+    self.assertEqual(graph.get_shape().as_list(), [10, 100, 100, 1])
+
+  def test_invalid_shape(self):
+    inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1)
+    graph = _layers.conv2d(inputs, 3, 10)
+    with self.assertRaisesRegexp(ValueError, 'number of features'):
+      graph = _layers.maxout(graph, num_units=2)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index e7d4080ff769327cc74b6629a7705ddfa552169b..c807ab0f2e5c8ac3ec2ae1d84a5b36b5f4ba76a4 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -24,11 +24,13 @@ from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variable_scope
 
 
 __all__ = [
+    'group_norm',
     'instance_norm',
 ]
 
@@ -158,3 +160,196 @@ def instance_norm(inputs,
     if activation_fn is not None:
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
+@add_arg_scope
+def group_norm(inputs,
+               groups=32,
+               channels_axis=-1,
+               reduction_axes=(-3, -2),
+               center=True,
+               scale=True,
+               epsilon=1e-6,
+               activation_fn=None,
+               param_initializers=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               scope=None):
+  """Functional interface for the group normalization layer.
+
+  Reference: https://arxiv.org/abs/1803.08494.
+
+    "Group Normalization", Yuxin Wu, Kaiming He
+
+  Args:
+    inputs: A Tensor with at least 2 dimensions one which is channels. All
+     shape dimensions must be fully defined.
+    groups: Integer. Divide the channels into this number of groups over which
+      normalization statistics are computed. This number must be commensurate
+      with the number of channels in `inputs`.
+    channels_axis: An integer. Specifies index of channels axis which will be
+      broken into `groups`, each of which whose statistics will be computed
+      across. Must be mutually exclusive with `reduction_axes`. Preferred usage
+      is to specify negative integers to be agnostic as to whether a batch
+      dimension is included.
+    reduction_axes: Tuple of integers. Specifies dimensions over which
+       statistics will be accumulated. Must be mutually exclusive with
+       `channels_axis`. Statistics will not be accumulated across axes not
+       specified in `reduction_axes` nor `channel_axis`. Preferred usage is to
+       specify negative integers to be agnostic to whether a batch dimension is
+       included.
+
+      Some sample usage cases:
+        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
+        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]
+
+    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
+      is ignored.
+    scale: If True, multiply by `gamma`. If False, `gamma` is
+      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
+      disabled since the scaling can be done by the next layer.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    activation_fn: Activation function, default set to None to skip it and
+      maintain a linear activation.
+    param_initializers: Optional initializers for beta, gamma, moving mean and
+      moving variance.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional collections for the variables.
+    outputs_collections: Collections to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    scope: Optional scope for `variable_scope`.
+
+  Returns:
+    A `Tensor` representing the output of the operation.
+
+  Raises:
+    ValueError: If the rank of `inputs` is undefined.
+    ValueError: If rank or channels dimension of `inputs` is undefined.
+    ValueError: If number of groups is not commensurate with number of channels.
+    ValueError: If reduction_axes or channels_axis are out of bounds.
+    ValueError: If reduction_axes are not mutually exclusive with channels_axis.
+  """
+  # TODO(shlens): Support partially defined shapes for the inputs.
+  inputs = ops.convert_to_tensor(inputs)
+  original_shape = inputs.shape
+
+  if inputs.shape.ndims is None:
+    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+  if channels_axis > (inputs.shape.ndims - 1):
+    raise ValueError('Axis is out of bounds.')
+
+  # Standardize the channels_axis to be positive and identify # of channels.
+  if channels_axis < 0:
+    channels_axis = inputs.shape.ndims + channels_axis
+  channels = inputs.shape[channels_axis].value
+
+  if channels is None:
+    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
+        inputs.name, channels_axis))
+
+  # Standardize the reduction_axes to be positive.
+  reduction_axes = list(reduction_axes)
+  for i in range(len(reduction_axes)):
+    if reduction_axes[i] < 0:
+      reduction_axes[i] += inputs.shape.ndims
+
+  for a in reduction_axes:
+    if a > inputs.shape.ndims:
+      raise ValueError('Axis is out of bounds.')
+    if inputs.shape[a].value is None:
+      raise ValueError('Inputs %s has undefined dimensions %d.' % (
+          inputs.name, a))
+    if channels_axis == a:
+      raise ValueError('reduction_axis must be mutually exclusive '
+                       'with channels_axis')
+  if groups > channels:
+    raise ValueError('Invalid groups %d for %d channels.' % (groups, channels))
+  if channels % groups != 0:
+    raise ValueError('%d channels is not commensurate with %d groups.' %
+                     (channels, groups))
+
+  # Determine axes before channels. Some examples of common image formats:
+  #  'NCHW': before = [N], after = [HW]
+  #  'NHWC': before = [NHW], after = []
+  axes_before_channels = inputs.shape.as_list()[:channels_axis]
+  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+
+  # Manually broadcast the parameters to conform to the number of groups.
+  params_shape_broadcast = ([1] * len(axes_before_channels) +
+                            [groups, channels // groups] +
+                            [1] * len(axes_after_channels))
+
+  # Reshape the input by the group within the channel dimension.
+  inputs_shape = (axes_before_channels + [groups, channels // groups] +
+                  axes_after_channels)
+  inputs = array_ops.reshape(inputs, inputs_shape)
+
+  # Determine the dimensions across which moments are calculated.
+  moments_axes = [channels_axis + 1]
+  for a in reduction_axes:
+    if a > channels_axis:
+      moments_axes.append(a + 1)
+    else:
+      moments_axes.append(a)
+
+  with variable_scope.variable_scope(
+      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
+    # Note that the params_shape is the number of channels always.
+    params_shape = [channels]
+
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    dtype = inputs.dtype.base_dtype
+    if param_initializers is None:
+      param_initializers = {}
+    if center:
+      beta_collections = utils.get_variable_collections(
+          variables_collections, 'beta')
+      beta_initializer = param_initializers.get(
+          'beta', init_ops.zeros_initializer())
+      beta = variables.model_variable('beta',
+                                      shape=params_shape,
+                                      dtype=dtype,
+                                      initializer=beta_initializer,
+                                      collections=beta_collections,
+                                      trainable=trainable)
+      beta = array_ops.reshape(beta, params_shape_broadcast)
+
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma_initializer = param_initializers.get(
+          'gamma', init_ops.ones_initializer())
+      gamma = variables.model_variable('gamma',
+                                       shape=params_shape,
+                                       dtype=dtype,
+                                       initializer=gamma_initializer,
+                                       collections=gamma_collections,
+                                       trainable=trainable)
+      gamma = array_ops.reshape(gamma, params_shape_broadcast)
+
+    # Calculate the moments.
+    mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
+
+    # Compute normalization.
+    # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
+    # appropriately so that this operation may be faster.
+    gain = math_ops.rsqrt(variance + epsilon)
+    offset = -mean * gain
+    if gamma is not None:
+      gain *= gamma
+      offset *= gamma
+    if beta is not None:
+      offset += beta
+    outputs = inputs * gain + offset
+
+    # Collapse the groups into the channel dimension.
+    outputs = array_ops.reshape(outputs, original_shape)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index 5cff1bf0ebb2fe8bc6933de882ecd47a9edf0f94..b6e96350db92baf4770683273be7e5dde73dbcec 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -166,5 +166,231 @@ class InstanceNormTest(test.TestCase):
   def testOutputBigInput5DNCHW(self):
     self.doOutputTest((1, 100, 100, 1, 1), 'NCHW', tol=1e-3)
 
+
+class GroupNormTest(test.TestCase):
+
+  def testInvalidGroupSize(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 2, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Invalid groups 10 for 2 channels.'):
+      normalization.group_norm(inputs, groups=10,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testBadCommensurateGroup(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 4, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 '4 channels is not commensurate with '
+                                 '3 groups.'):
+      normalization.group_norm(inputs, groups=3,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testAxisIsBad(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 2, 4, 5))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, channels_axis=5)
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, reduction_axes=[1, 5])
+
+  def testNotMutuallyExclusiveAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(10, 32, 32, 32))
+    # Specify axis with negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[-2])
+    # Specify axis with positive values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=1, reduction_axes=[1, 3])
+    # Specify axis with mixed positive and negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[2])
+
+  def testUnknownShape(self):
+    inputs = array_ops.placeholder(dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'undefined rank'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedReductionAxes(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 32, None, 4))
+    with self.assertRaisesRegexp(ValueError, 'undefined dimensions'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedChannelsAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 3, 4, None))
+    with self.assertRaisesRegexp(ValueError, 'undefined channel dimension'):
+      normalization.group_norm(inputs, channels_axis=-1,
+                               reduction_axes=[-3, -2])
+
+  def testCreateOp(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
+    output = normalization.group_norm(images, groups=groups, channels_axis=-1,
+                                      reduction_axes=[-3, -2])
+    print('name: ', output.op.name)
+    self.assertListEqual([5, height, width, 2*groups], output.shape.as_list())
+
+  def testCreateOpFloat64(self):
+    height, width, groups = 3, 3, 5
+    images = random_ops.random_uniform(
+        (5, height, width, 4*groups), dtype=dtypes.float64, seed=1)
+    output = normalization.group_norm(images, groups=groups)
+    self.assertEqual(dtypes.float64, output.dtype)
+    self.assertListEqual([5, height, width, 4*groups], output.shape.as_list())
+
+  def testCreateOpNoScaleCenter(self):
+    height, width, groups = 3, 3, 7
+    images = random_ops.random_uniform(
+        (5, height, width, 3*groups), dtype=dtypes.float32, seed=1)
+    output = normalization.group_norm(images, groups=groups, center=False,
+                                      scale=False)
+    self.assertListEqual([5, height, width, 3*groups], output.shape.as_list())
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('beta')))
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('gamma')))
+
+  def testCreateVariables_NHWC(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 8), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-1, reduction_axes=(-3, -2),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testCreateVariables_NCHW(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, 2*groups, height, width), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-3, reduction_axes=(-2, -1),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testReuseVariables(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 4), seed=1)
+    normalization.group_norm(images, groups=2, scale=True, scope='IN')
+    normalization.group_norm(images, groups=2, scale=True, scope='IN',
+                             reuse=True)
+    beta = contrib_variables.get_variables_by_name('beta')
+    gamma = contrib_variables.get_variables_by_name('gamma')
+    self.assertEqual(1, len(beta))
+    self.assertEqual(1, len(gamma))
+
+  def testValueCorrectWithReuseVars(self):
+    height, width = 3, 3
+    image_shape = (10, height, width, 4)
+    images = random_ops.random_uniform(image_shape, seed=1)
+    output_train = normalization.group_norm(images, groups=2, scope='IN')
+    output_eval = normalization.group_norm(images, groups=2, scope='IN',
+                                           reuse=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      # output_train and output_eval should be the same.
+      train_np, eval_np = sess.run([output_train, output_eval])
+      self.assertAllClose(train_np, eval_np)
+
+  def doOutputTest(self, input_shape, channels_axis=None, reduction_axes=None,
+                   groups=2, tol=1e-2):
+    # Select the axis for the channel and the dimensions along which statistics
+    # are accumulated.
+    if channels_axis < 0:
+      channels_axis += len(input_shape)
+    reduced_axes = [channels_axis + 1]
+    for a in reduction_axes:
+      if a < 0:
+        a += len(input_shape)
+      if a < channels_axis:
+        reduced_axes.append(a)
+      else:
+        reduced_axes.append(a+1)
+    reduced_axes = tuple(reduced_axes)
+
+    # Calculate the final shape for the output Tensor.
+    axes_before_channels = input_shape[:channels_axis]
+    axes_after_channels = input_shape[channels_axis+1:]
+    channels = input_shape[channels_axis]
+    outputs_shape = (axes_before_channels + [groups, channels // groups] +
+                     axes_after_channels)
+
+    # Calculate the final shape for the output statistics.
+    reduced_shape = []
+    for i, a in enumerate(outputs_shape):
+      if i not in reduced_axes:
+        reduced_shape.append(a)
+
+    for mu in (0.0, 1e2):
+      for sigma in (1.0, 0.1):
+        # Determine shape of Tensor after normalization.
+        expected_mean = np.zeros(reduced_shape)
+        expected_var = np.ones(reduced_shape)
+
+        inputs = random_ops.random_uniform(input_shape, seed=0) * sigma + mu
+        output_op = normalization.group_norm(
+            inputs, groups=groups, center=False, scale=False,
+            channels_axis=channels_axis,
+            reduction_axes=reduction_axes)
+        with self.test_session() as sess:
+          sess.run(variables.global_variables_initializer())
+          outputs = sess.run(output_op)
+          # Make sure that there are no NaNs
+          self.assertFalse(np.isnan(outputs).any())
+
+          outputs = np.reshape(outputs, outputs_shape)
+          mean = np.mean(outputs, axis=reduced_axes)
+          var = np.var(outputs, axis=reduced_axes)
+          # The mean and variance of each example should be close to 0 and 1
+          # respectively.
+          self.assertAllClose(expected_mean, mean, rtol=tol, atol=tol)
+          self.assertAllClose(expected_var, var, rtol=tol, atol=tol)
+
+  def testOutputSmallInput4D_NHWC(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=3, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput3D_NHWC(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=2, reduction_axes=[0, 1])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput4D_NCHW(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=1, reduction_axes=[2, 3])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputSmallInput3D_NCHW(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=0, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputBigInput4D_NHWC(self):
+    self.doOutputTest([5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2],
+                      groups=1)
+
+  def testOutputBigInput4D_NCHW(self):
+    self.doOutputTest([1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3],
+                      groups=4)
+
+  def testOutputSmallInput2D_NC(self):
+    self.doOutputTest([10, 7*100], channels_axis=1, reduction_axes=[], groups=7)
+
+  def testOutputSmallInput5D_NCXXX(self):
+    self.doOutputTest([10, 10, 20, 40, 5],
+                      channels_axis=1,
+                      reduction_axes=[2, 3, 4],
+                      groups=5)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py
index cdceea6fee5bdb5aeb6537ea55d25ccf107def4c..69d927e1b3001d14dd1af2f890b07c1a57ab2cfc 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers.py
@@ -41,7 +41,7 @@ OPTIMIZER_CLS_NAMES = {
     "Adagrad": train.AdagradOptimizer,
     "Adam": train.AdamOptimizer,
     "Ftrl": train.FtrlOptimizer,
-    "Momentum": lambda lr: train.MomentumOptimizer(lr, momentum=0.9),
+    "Momentum": lambda learning_rate: train.MomentumOptimizer(learning_rate, momentum=0.9),  # pylint: disable=line-too-long
     "RMSProp": train.RMSPropOptimizer,
     "SGD": train.GradientDescentOptimizer,
 }
diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py
index 1ea25bd1a5685eb6f840e621b5739029a660aa0f..a4461a20e54c289886f1a1beb255de12fc054afe 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers_test.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py
@@ -61,7 +61,8 @@ class OptimizersTest(test.TestCase):
     optimizers = [
         "SGD", gradient_descent.GradientDescentOptimizer,
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1),
-        lambda lr: gradient_descent.GradientDescentOptimizer(learning_rate=lr)
+        lambda lr: gradient_descent.GradientDescentOptimizer(learning_rate=lr),
+        "Momentum"
     ]
     for optimizer in optimizers:
       with ops.Graph().as_default() as g:
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 123275e1fde047cd3772528641b2e3b09742fbdc..8ed9f446bcd5f222f486e43125dafc595852e5ce 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -29,14 +29,18 @@ from __future__ import print_function
 import functools
 import re
 
+import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.framework.python import ops as contrib_framework_ops
-from tensorflow.python.framework import function
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
@@ -46,6 +50,14 @@ from tensorflow.python.util import nest
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
 LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*")
+_USE_DEFAULT = "__rev_block_lib_default"
+_WRONG_VARS_ERR = """\
+The variables used on recompute were different than the variables originally
+used. The function wrapped with @recompute_grad likley creates its own variable
+scope with a default name and has been called twice in the same enclosing scope.
+To fix, ensure each call to the function happens in its own unique variable
+scope.
+"""
 
 
 def _acc_grads(*lists_of_grads):
@@ -142,7 +154,7 @@ def _scope_wrap(fn, scope):
 
   @functools.wraps(fn)
   def wrap(*args, **kwargs):
-    with variable_scope.variable_scope(scope):
+    with variable_scope.variable_scope(scope, use_resource=True):
       return fn(*args, **kwargs)
 
   return wrap
@@ -217,91 +229,95 @@ class RevBlock(base.Layer):
                  "build.")
     self.built = True
 
-  def _efficient_grad_fn(self, inputs, variables, ys, grad_ys):
-    """Custom gradient fn for a block of reversible residual layers."""
-    side_inputs = inputs[2:]
-    f_side_idxs = [None] * len(self.f_side_input)
-    g_side_idxs = [None] * len(self.g_side_input)
-    assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input)
-
-    for i, t in enumerate(side_inputs):
-      if t in self.f_side_input:
-        f_side_idxs[self.f_side_input.index(t)] = i
-      elif t in self.g_side_input:
-        g_side_idxs[self.g_side_input.index(t)] = i
-      else:
-        assert False
-
-    f_vars = [[] for _ in range(self.num_layers)]
-    g_vars = [[] for _ in range(self.num_layers)]
-    f_vars_idxs = [[] for _ in range(self.num_layers)]
-    g_vars_idxs = [[] for _ in range(self.num_layers)]
-
-    for i, t in enumerate(variables):
-      ref = _underlying_variable_ref(t)
-
-      # Use the name to identify the layer number and function (f or g)
-      regex = LAYER_RE.match(ref.name)
-      layer_no = int(regex.group(1))
-      fn_name = regex.group(2)
-      if fn_name == "f":
-        f_vars[layer_no].append(ref)
-        f_vars_idxs[layer_no].append(i)
-      else:
-        assert fn_name == "g"
-        g_vars[layer_no].append(ref)
-        g_vars_idxs[layer_no].append(i)
-
-    f_var_grads = []
-    g_var_grads = []
-    f_side_grads = []
-    g_side_grads = []
-
-    # Reverse variable containers to go backward
-    f_vars.reverse()
-    g_vars.reverse()
-    f = list(self.f)
-    g = list(self.g)
-    f.reverse()
-    g.reverse()
-
-    with variable_scope.variable_scope(self.scope_name, reuse=True):
-      for i in xrange(self.num_layers):
-        ys, grad_ys, f_ret, g_ret = _rev_layer_backward(
-            ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i],
-            self.g_side_input)
-
-        grad_f_vars, grad_f_side = f_ret
-        grad_g_vars, grad_g_side = g_ret
-        f_var_grads.append(grad_f_vars)
-        g_var_grads.append(grad_g_vars)
-        f_side_grads.append(grad_f_side)
-        g_side_grads.append(grad_g_side)
-
-    # Accumulate layer gradients for f_side_input and g_side_input
-    acc_f_side_grads = _acc_grads(*f_side_grads)
-    acc_g_side_grads = _acc_grads(*g_side_grads)
-
-    # Use the stored idxs to put gradients in the passed-in order.
-    side_input_grads = [None] * len(side_inputs)
-    variable_grads = [None] * len(variables)
-
-    # Variable gradients were collected in reverse layer order. Reverse to match
-    # idxs.
-    f_var_grads.reverse()
-    g_var_grads.reverse()
-    for idxs, grads in list(zip(f_vars_idxs, f_var_grads)) + list(
-        zip(g_vars_idxs, g_var_grads)):
-      for i, grad in zip(idxs, grads):
-        variable_grads[i] = grad
-
-    for i, grad in zip(f_side_idxs, acc_f_side_grads):
-      side_input_grads[i] = grad
-    for i, grad in zip(g_side_idxs, acc_g_side_grads):
-      side_input_grads[i] = grad
-
-    grad_x1, grad_x2 = grad_ys
-    return [grad_x1, grad_x2] + side_input_grads, variable_grads
+  def _make_efficient_grad_fn(self, inputs_, ys_):
+    def _efficient_grad_fn(*grad_ys, **kwargs):
+      """Custom gradient fn for a block of reversible residual layers."""
+      inputs = inputs_
+      ys = ys_
+      variables = kwargs["variables"]
+      side_inputs = inputs[2:]
+
+      f_side_idxs = [None] * len(self.f_side_input)
+      g_side_idxs = [None] * len(self.g_side_input)
+      assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input)
+
+      for i, t in enumerate(side_inputs):
+        if t in self.f_side_input:
+          f_side_idxs[self.f_side_input.index(t)] = i
+        elif t in self.g_side_input:
+          g_side_idxs[self.g_side_input.index(t)] = i
+        else:
+          assert False
+
+      f_vars = [[] for _ in range(self.num_layers)]
+      g_vars = [[] for _ in range(self.num_layers)]
+      f_vars_idxs = [[] for _ in range(self.num_layers)]
+      g_vars_idxs = [[] for _ in range(self.num_layers)]
+
+      for i, ref in enumerate(variables):
+        # Use the name to identify the layer number and function (f or g)
+        regex = LAYER_RE.match(ref.name)
+        layer_no = int(regex.group(1))
+        fn_name = regex.group(2)
+        if fn_name == "f":
+          f_vars[layer_no].append(ref)
+          f_vars_idxs[layer_no].append(i)
+        else:
+          assert fn_name == "g"
+          g_vars[layer_no].append(ref)
+          g_vars_idxs[layer_no].append(i)
+
+      f_var_grads = []
+      g_var_grads = []
+      f_side_grads = []
+      g_side_grads = []
+
+      # Reverse variable containers to go backward
+      f_vars.reverse()
+      g_vars.reverse()
+      f = list(self.f)
+      g = list(self.g)
+      f.reverse()
+      g.reverse()
+
+      with variable_scope.variable_scope(self.scope_name, reuse=True):
+        for i in xrange(self.num_layers):
+          ys, grad_ys, f_ret, g_ret = _rev_layer_backward(
+              ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i],
+              self.g_side_input)
+
+          grad_f_vars, grad_f_side = f_ret
+          grad_g_vars, grad_g_side = g_ret
+          f_var_grads.append(grad_f_vars)
+          g_var_grads.append(grad_g_vars)
+          f_side_grads.append(grad_f_side)
+          g_side_grads.append(grad_g_side)
+
+      # Accumulate layer gradients for f_side_input and g_side_input
+      acc_f_side_grads = _acc_grads(*f_side_grads)
+      acc_g_side_grads = _acc_grads(*g_side_grads)
+
+      # Use the stored idxs to put gradients in the passed-in order.
+      side_input_grads = [None] * len(side_inputs)
+      variable_grads = [None] * len(variables)
+
+      # Variable gradients were collected in reverse layer order. Reverse to
+      # match idxs.
+      f_var_grads.reverse()
+      g_var_grads.reverse()
+      for idxs, grads in list(zip(f_vars_idxs, f_var_grads)) + list(
+          zip(g_vars_idxs, g_var_grads)):
+        for i, grad in zip(idxs, grads):
+          variable_grads[i] = grad
+
+      for i, grad in zip(f_side_idxs, acc_f_side_grads):
+        side_input_grads[i] = grad
+      for i, grad in zip(g_side_idxs, acc_g_side_grads):
+        side_input_grads[i] = grad
+
+      grad_x1, grad_x2 = grad_ys
+      return [grad_x1, grad_x2] + side_input_grads, variable_grads
+    return _efficient_grad_fn
 
   def _forward(self, x1, x2):
     """Run forward through the reversible layers."""
@@ -309,10 +325,6 @@ class RevBlock(base.Layer):
     side_inputs = [self.f_side_input, self.g_side_input]
     flat_side_inputs = nest.flatten(side_inputs)
 
-    custom_grad_fn = (
-        self._efficient_grad_fn if self._use_efficient_backprop else None)
-
-    @_fn_with_custom_grad(custom_grad_fn)
     def _forward_wrap(x1_, x2_, *flat_side_inputs):
       f_side, g_side = nest.pack_sequence_as(side_inputs, flat_side_inputs)
       return _rev_block_forward(
@@ -325,7 +337,16 @@ class RevBlock(base.Layer):
           g_side_input=g_side,
           gate_outputs=self._use_efficient_backprop)
 
-    return _forward_wrap(x1, x2, *flat_side_inputs)
+    @custom_gradient.custom_gradient
+    def _forward_with_custom_grad(*args):
+      out = _forward_wrap(*args)  # pylint: disable=no-value-for-parameter
+      grad_fn = self._make_efficient_grad_fn(args, out)
+      return out, grad_fn
+
+    if self._use_efficient_backprop:
+      return _forward_with_custom_grad(x1, x2, *flat_side_inputs)
+    else:
+      return _forward_wrap(x1, x2, *flat_side_inputs)
 
   def _backward(self, y1, y2):
     """Run backward through the reversible layers."""
@@ -405,12 +426,40 @@ def rev_block(x1,
   return block.forward(x1, x2)
 
 
-def recompute_grad(fn):
+def enable_with_args(dec):
+  """A decorator for decorators to enable their usage with or without args."""
+
+  @functools.wraps(dec)
+  def new_dec(*args, **kwargs):
+    if len(args) == 1 and not kwargs and callable(args[0]):
+      # Used as decorator without args
+      fn = args[0]
+      return dec(fn)
+    else:
+      return lambda fn: dec(fn, *args, **kwargs)
+
+  return new_dec
+
+
+@enable_with_args
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """Decorator that recomputes the function on the backwards pass.
 
+  To use this function, you must use `ResourceVariable`s (i.e.
+  `variable_scope(name, use_resource=True), which are the default in Eager mode
+  and when running on TPU.
+
   Args:
     fn: a function that takes Tensors (all as positional arguments) and returns
       a tuple of Tensors.
+    use_data_dep: `bool`, if `True` will use a dummy data dependency to force
+      the recompute to happen. If `False` will use a control dependency. By
+      default will be `True` if in an XLA context and `False` otherwise. XLA
+      ignores control dependencies and so this data dependency is necessary.
+    tupleize_grads: `bool`, if `True` will use control dependencies to ensure
+      that all gradients are produced before any are consumed by downstream ops.
+      If `use_data_dep` is also `True`, will use a data dependency instead of
+      a control dependency.
 
   Returns:
     A wrapped fn that is identical to fn when called, but its activations will
@@ -420,43 +469,75 @@ def recompute_grad(fn):
 
   @functools.wraps(fn)
   def wrapped(*args):
-    return _recompute_grad(fn, args)
+    return _recompute_grad(
+        fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
 
   return wrapped
 
 
-def _recompute_grad(fn, args):
-  """See recompute_grad."""
+def _is_on_tpu():
+  ctxt = framework_ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
+  return control_flow_util.GetContainingXLAContext(ctxt) is not None
+
 
-  cached_vs = []
-  cached_arg_scope = []
-
-  def grad_fn(inputs, variables, outputs, output_grads):
-    """Recompute outputs for gradient computation."""
-    del outputs
-    # Recompute outputs
-    with framework_ops.control_dependencies(output_grads):
-      with contrib_framework_ops.arg_scope(cached_arg_scope[0]):
-        with variable_scope.variable_scope(cached_vs[0], reuse=True):
-          outputs = fn(*inputs)
-
-    if not (isinstance(outputs, list) or isinstance(outputs, tuple)):
-      outputs = [outputs]
-    outputs = list(outputs)
-    grads = gradients_impl.gradients(outputs, inputs + variables, output_grads)
-    grad_inputs = grads[:len(inputs)]
-    grad_vars = grads[len(inputs):]
-    return grad_inputs, grad_vars
-
-  @_fn_with_custom_grad(grad_fn)
+def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
+  """See recompute_grad."""
+  for arg in args:
+    if not isinstance(arg, framework_ops.Tensor):
+      raise ValueError("All inputs to function must be Tensors")
+  use_data_dep_ = use_data_dep
+  if use_data_dep_ == _USE_DEFAULT:
+    use_data_dep_ = _is_on_tpu()
+
+  @custom_gradient.custom_gradient
   def fn_with_recompute(*args):
-    cached_vs.append(variable_scope.get_variable_scope())
-    # TODO(rsepassi): Rm conditional in TF 1.4
-    if hasattr(contrib_framework_ops, "current_arg_scope"):
-      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
-    else:
-      cached_arg_scope.append({})
-    return fn(*args)
+    """Wrapper for fn."""
+    # Forward pass
+    vs = variable_scope.get_variable_scope()
+    arg_scope = contrib_framework_ops.current_arg_scope()
+    with backprop.GradientTape() as tape:
+      outputs = fn(*args)
+    original_vars = set(tape.watched_variables())
+
+    # Backward pass
+    def grad_fn(*output_grads, **kwargs):
+      """Recompute outputs for gradient computation."""
+      variables = []
+      if original_vars:
+        variables = kwargs["variables"]
+      if set(variables) != original_vars:
+        raise ValueError(_WRONG_VARS_ERR)
+      del kwargs
+      inputs = list(args)
+      # Recompute outputs
+      with framework_ops.control_dependencies(output_grads):
+        if use_data_dep_:
+          inputs = _force_data_dependency(output_grads, inputs)
+        with contrib_framework_ops.arg_scope(arg_scope):
+          with variable_scope.variable_scope(vs, reuse=True):
+            with backprop.GradientTape() as tape:
+              outputs = fn(*inputs)
+            recompute_vars = set(tape.watched_variables())
+            if original_vars != recompute_vars:
+              raise ValueError(_WRONG_VARS_ERR)
+
+      if not (isinstance(outputs, list) or isinstance(outputs, tuple)):
+        outputs = [outputs]
+      outputs = list(outputs)
+      grads = gradients_impl.gradients(outputs, inputs + variables,
+                                       output_grads)
+
+      if tupleize_grads:
+        if use_data_dep_:
+          grads = _tuple_with_data_dep(grads)
+        else:
+          grads = control_flow_ops.tuple(grads)
+
+      grad_inputs = grads[:len(inputs)]
+      grad_vars = grads[len(inputs):]
+      return grad_inputs, grad_vars
+
+    return outputs, grad_fn
 
   return fn_with_recompute(*args)
 
@@ -483,101 +564,46 @@ def _underlying_variable_ref(t):
     return None
 
 
-def _fn_with_custom_grad(grad_fn, use_global_vars=False):
-  """Decorator to create a subgraph with a custom gradient function.
+def _force_data_dependency(first_compute, then_compute):
+  """Force all of `then_compute` to depend on all of `first_compute`.
 
-  The subgraph created by the decorated function is NOT put in a Defun and so
-  does not suffer from the limitations of the Defun (all subgraph ops on the
-  same device, no summaries).
+  Uses a dummy data dependency, which is useful when running on TPUs because
+  XLA ignores control dependencies. Only supports float arguments.
 
   Args:
-    grad_fn: function with signature
-      (inputs, variables, outputs, output_grads) -> (grad_inputs, grad_vars),
-      all of which are lists of Tensors.
-    use_global_vars: if True, variables will be the global variables created.
-      If False, will be the trainable variables.
+    first_compute: `list<Tensor>`. These will be made to run before the
+      `Tensor`s `then_compute`.
+    then_compute: `list<Tensor>`. These will run after all the `Tensor`s in
+      `first_compute`.
 
   Returns:
-    Decorator for function such that the gradient is defined by grad_fn.
-  """
-
-  def dec(fn):
-
-    @functools.wraps(fn)
-    def wrapped(*args):
-      return _fn_with_custom_grad_internal(
-          fn, args, grad_fn, use_global_vars=use_global_vars)
+    `list<Tensor>`, same length as `then_compute`.
 
-    return wrapped
-
-  return dec
-
-
-def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
-  """Create a subgraph with a custom gradient.
-
-  Args:
-    fn: function that takes inputs as arguments and produces 1 or more Tensors.
-    inputs: list<Tensor>, will be passed as fn(*inputs).
-    grad_fn: function with signature
-      (inputs, vars, outputs, output_grads) -> (grad_inputs, grad_vars),
-      all of which are lists of Tensors.
-    use_global_vars: if True, variables will be the global variables created.
-      If False, will be the trainable variables.
-
-  Returns:
-    fn(*inputs)
+  Raises:
+    ValueError: if ranks are unknown or types are not floating.
   """
-  vs = variable_scope.get_variable_scope()
-  get_vars_fn = (
-      vs.global_variables if use_global_vars else vs.trainable_variables)
-  len_before_vars = len(get_vars_fn())
-  inputs = list(inputs)
-  outputs = fn(*inputs)
-  train_vars = get_vars_fn()[len_before_vars:]
-
-  if grad_fn is None:
-    return outputs
-
-  if not (isinstance(outputs, tuple) or isinstance(outputs, list)):
-    outputs = [outputs]
-  outputs = list(outputs)
-
-  defun_inputs = [inputs, train_vars, outputs]
-
-  def custom_grad_fn(op, *dys):
-    """Custom grad fn applying grad_fn for identity Defun."""
-    fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as(
-        defun_inputs, list(op.inputs))
-    dys = list(dys)
-    assert len(fn_outputs) == len(outputs)
-    assert len(fn_outputs) == len(dys)
-
-    grad_inputs, grad_vars = grad_fn(fn_inputs, fn_vars, fn_outputs, dys)
-    grad_outputs = [None] * len(fn_outputs)
-    return tuple(grad_inputs + grad_vars + grad_outputs)
-
-  # The Defun takes as input the original inputs, the trainable variables
-  # created in fn, and the outputs. In the forward it passes through the
-  # outputs. In the backwards, it produces gradients for the original inputs
-  # and the trainable variables.
-  in_types = [t.dtype for t in inputs]
-  out_types = [t.dtype for t in outputs]
-  var_types = [t.dtype for t in train_vars]
-
-  # Get a unique name for the Defun
-  with framework_ops.name_scope("identity_custom_grad") as ns:
-    defun_name = ns
-
-  @function.Defun(
-      *(in_types + var_types + out_types),
-      func_name=defun_name,
-      python_grad_func=custom_grad_fn,
-      shape_func=lambda _: [t.get_shape() for t in outputs])
-  def identity(*args):
-    _, _, outs = nest.pack_sequence_as(defun_inputs, args)
-    return tuple([array_ops.identity(t) for t in outs])
-
-  flat_inputs = nest.flatten(defun_inputs)
-  id_out = identity(*flat_inputs)
-  return id_out
+
+  def _first_element(x):
+    if x.get_shape().ndims is None:
+      raise ValueError("Rank of Tensor %s must be known" % x)
+    ndims = x.get_shape().ndims
+    begin = framework_ops.convert_to_tensor([0] * ndims, dtype=dtypes.int32)
+    size = framework_ops.convert_to_tensor([1] * ndims, dtype=dtypes.int32)
+    return array_ops.reshape(array_ops.slice(x, begin, size), [])
+
+  first_compute_sum = math_ops.add_n(
+      [_first_element(x) for x in first_compute if x is not None])
+  dtype = first_compute_sum.dtype
+  if not dtype.is_floating:
+    raise ValueError("_force_data_dependency only supports floating dtypes.")
+  epsilon = np.finfo(dtype.as_numpy_dtype).tiny
+  zero = array_ops.stop_gradient(epsilon * first_compute_sum)
+
+  return [
+      array_ops.identity(x) + zero if x is not None else None
+      for x in then_compute
+  ]
+
+
+def _tuple_with_data_dep(tensors):
+  return _force_data_dependency(tensors, tensors)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index cbcbcd75114a522b95631e4e7e95c1641b0a9987..997f53b9e1bbf9ac151cadd4a9f8e79c2e0ebca2 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -60,8 +60,8 @@ class RevBlockTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
 
-      self.assertAllClose(x1, x1_inv)
-      self.assertAllClose(x2, x2_inv)
+      self.assertAllClose(x1, x1_inv, atol=1e-5)
+      self.assertAllClose(x2, x2_inv, atol=1e-5)
 
   def testBackwardForward(self):
 
@@ -83,8 +83,8 @@ class RevBlockTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       y1, y2, y1_inv, y2_inv = sess.run([y1, y2, y1_inv, y2_inv])
 
-      self.assertAllClose(y1, y1_inv)
-      self.assertAllClose(y2, y2_inv)
+      self.assertAllClose(y1, y1_inv, rtol=1e-5)
+      self.assertAllClose(y2, y2_inv, rtol=1e-5)
 
   def _testRevBlock(self,
                     x=None,
@@ -154,7 +154,7 @@ class RevBlockTest(test.TestCase):
       y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads])
       self.assertAllClose(y_val, yd_val)
       for g1, g2 in zip(gd_val, g_val):
-        self.assertAllClose(g1, g2)
+        self.assertAllClose(g1, g2, rtol=1e-5)
 
   def testRevBlock(self):
     self._testRevBlock()
@@ -179,18 +179,16 @@ class RevBlockTest(test.TestCase):
 
     self._testRevBlock(f=[f1, f2, f1, f2])
 
-  # TODO(rsepassi): Recent change to conv seems to have broken this test. Find
-  # out why.
-  def _testConvAndBatchNorm(self):
+  def testConvAndBatchNorm(self):
 
     x = random_ops.random_uniform(
         [self.BATCH_SIZE, 10, self.CHANNELS], dtype=dtypes.float32)
 
     def f(x):
       x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
-      x = layers.batch_norm(x, is_training=True)
+      x = layers.batch_norm(x, is_training=False)
       x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
-      x = layers.batch_norm(x, is_training=True)
+      x = layers.batch_norm(x, is_training=False)
       return x
 
     self._testRevBlock(x=x, f=f)
@@ -255,109 +253,94 @@ class RecomputeTest(test.TestCase):
     def fn_recompute(x):
       return fn(x)
 
+    @rev_block_lib.recompute_grad(use_data_dep=True)
+    def fn_use_data_dep(x):
+      return fn(x)
+
+    @rev_block_lib.recompute_grad(tupleize_grads=True)
+    def fn_tupleize(x):
+      return fn(x)
+
+    @rev_block_lib.recompute_grad(use_data_dep=True, tupleize_grads=True)
+    def fn_both(x):
+      return fn(x)
+
     x = random_ops.random_uniform((3, 1, 3))
-    recompute_vars = None
-    with variable_scope.variable_scope("recompute") as vs:
-      out1 = math_ops.reduce_sum(fn_recompute(x))
-      recompute_vars = vs.trainable_variables()
-    reg_vars = None
-    with variable_scope.variable_scope("regular") as vs:
-      out2 = math_ops.reduce_sum(fn(x))
-      reg_vars = vs.trainable_variables()
-
-    grad1 = gradients_impl.gradients(out1, recompute_vars)
-    grad2 = gradients_impl.gradients(out2, reg_vars)
+
+    names_and_fns = [
+        ("recompute", fn_recompute),
+        ("regular", fn),
+        ("use_data_dep", fn_use_data_dep),
+        ("tupleize", fn_tupleize),
+        ("tuple_and_data_dep", fn_both),
+    ]
+    outputs_and_vars = []
+    for name, wrapped_fn in names_and_fns:
+      with variable_scope.variable_scope(name, use_resource=True) as vs:
+        out = math_ops.reduce_sum(wrapped_fn(x))
+        outputs_and_vars.append((out, vs.trainable_variables()))
+
+    all_grads = []
+    for out, scope_vars in outputs_and_vars:
+      all_grads.append(gradients_impl.gradients(out, scope_vars))
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      outs = sess.run([out1, out2, grad1, grad2])
-      self.assertAllClose(outs[0], outs[1])
-      for g1, g2 in zip(outs[2], outs[3]):
-        self.assertAllClose(g1, g2)
+      outputs = list(zip(*outputs_and_vars))[0]
+      outs, all_grads_val = sess.run([outputs, all_grads])
 
+      # All outputs are the same
+      current = outs[0]
+      for out in outs[1:]:
+        self.assertAllClose(current, out)
+        current = out
 
-class FnWithCustomGradTest(test.TestCase):
+      # All gradients are the same
+      for grads in zip(all_grads_val):
+        current = grads[0]
+        for g in grads[1:]:
+          self.assertAllClose(current, g)
+          current = g
 
-  def testCorrectness(self):
+  def testDoubleCallInSameScopeFails(self):
 
-    w = random_ops.random_uniform([6, 10])
+    @rev_block_lib.recompute_grad
+    def layer_with_recompute(inputs):
+      return core_layers.dense(inputs, 2)
 
-    def fn(a, b, c):
-      return core_layers.dense(
-          a,
-          10,
-          use_bias=False,
-          kernel_initializer=lambda shape, dtype, partition_info: w
-      ) + math_ops.matmul(b, c)
-
-    def grad_fn(inputs, trainable_variables, outputs, grad_outputs):
-      outputs = outputs[0]
-      grad_outputs = grad_outputs[0]
-      grad_inputs = gradients_impl.gradients(
-          outputs, inputs, grad_ys=grad_outputs)
-      grad_vars = gradients_impl.gradients(
-          outputs, trainable_variables, grad_ys=grad_outputs)
-      return grad_inputs, grad_vars
-
-    custom_fn = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)
-
-    a = random_ops.random_uniform([11, 6])
-    b = random_ops.random_uniform([11, 7])
-    c = random_ops.random_uniform([7, 10])
-
-    out = fn(a, b, c)
-    custom_out = custom_fn(a, b, c)
-    self.assertEqual(out.get_shape().as_list(),
-                     custom_out.get_shape().as_list())
-
-    loss = math_ops.reduce_mean(out)
-    custom_loss = math_ops.reduce_mean(custom_out)
-
-    grads = gradients_impl.gradients(
-        loss, [a, b, c] + [variables.trainable_variables()[0]])
-    custom_grads = gradients_impl.gradients(
-        custom_loss, [a, b, c] + [variables.trainable_variables()[1]])
+    with variable_scope.variable_scope("layer", use_resource=True):
+      inputs = array_ops.ones((2, 4), dtypes.float32)
+      out1 = layer_with_recompute(inputs)
+      out2 = layer_with_recompute(inputs) + out1
+      out = math_ops.reduce_sum(out2)
 
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      out_val, custom_out_val, grads_val, custom_grads_val = sess.run(
-          [out, custom_out, grads, custom_grads])
-      self.assertAllClose(out_val, custom_out_val)
-      for g1, g2 in zip(grads_val, custom_grads_val):
-        self.assertAllClose(g1, g2)
-
-  def testCustomGrad(self):
-
-    def fn(a, b, c):
-      return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c)
-
-    def grad_fn(inputs, trainable_variables, unused_outputs,
-                unused_grad_outputs):
-      grad_inputs = [
-          array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)
-      ]
-      grad_vars = [
-          array_ops.ones_like(t) * (i + len(inputs) + 1.)
-          for i, t in enumerate(trainable_variables)
-      ]
-      return grad_inputs, grad_vars
-
-    a = random_ops.random_uniform([11, 6])
-    b = random_ops.random_uniform([11, 7])
-    c = random_ops.random_uniform([7, 10])
-    w = random_ops.random_uniform([6, 10])
-    out = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)(a, b, c)
-    loss = math_ops.reduce_mean(out)
-    grads = gradients_impl.gradients(
-        loss, [a, b, c, variables.trainable_variables()[0]])
-    expected_grads = [
-        array_ops.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
-    ]
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      g_val, eg_val = sess.run([grads, expected_grads])
-      for g1, g2 in zip(g_val, eg_val):
-        self.assertAllClose(g1, g2)
+    tvars = variables.trainable_variables()
+    assert len(tvars) == 4
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "called twice in the same enclosing scope"):
+      gradients_impl.gradients(out, [inputs] + tvars)
+
+  def testDoubleCallInUniqueScope(self):
+
+    @rev_block_lib.recompute_grad
+    def layer_with_recompute(inputs):
+      with variable_scope.variable_scope("inner", use_resource=True):
+        return core_layers.dense(inputs, 2)
+
+    with variable_scope.variable_scope("layer", use_resource=True):
+      inputs = array_ops.ones((2, 4), dtypes.float32)
+
+      with variable_scope.variable_scope("layer1", use_resource=True):
+        out1 = layer_with_recompute(inputs)
+      with variable_scope.variable_scope("layer2", use_resource=True):
+        out2 = layer_with_recompute(inputs) + out1
+      out = math_ops.reduce_sum(out2)
+
+    tvars = variables.trainable_variables()
+    assert len(tvars) == 4
+    grads = gradients_impl.gradients(out, [inputs] + tvars)
+    for grad in grads:
+      self.assertTrue(grad is not None)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 3e639a180ef11af5f7f498c647eb25417f918eb9..69bb6be81453f5f5487f25547f017dc5f87c2f2c 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn):
 
   def logits_to_predictions(self, logits, proba=False):
     if self.num_label_columns == 1:
-      return array_ops.squeeze(logits, squeeze_dims=[1])
+      return array_ops.squeeze(logits, axis=[1])
     return logits
 
   def get_eval_ops(self, features, logits, labels, metrics=None):
@@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target):
                      "Instead got %s." % target.dtype)
   # sparse_softmax_cross_entropy_with_logits requires [batch_size] target.
   if len(target.get_shape()) == 2:
-    target = array_ops.squeeze(target, squeeze_dims=[1])
+    target = array_ops.squeeze(target, axis=[1])
   loss_vec = nn.sparse_softmax_cross_entropy_with_logits(
       labels=target, logits=logits)
   return loss_vec
diff --git a/tensorflow/contrib/layers/python/layers/utils_test.py b/tensorflow/contrib/layers/python/layers/utils_test.py
index 3409860add8f8c393ffd342633e7023931867dd9..645dc1291eb6370a5e504306fc00a5454dde77ed 100644
--- a/tensorflow/contrib/layers/python/layers/utils_test.py
+++ b/tensorflow/contrib/layers/python/layers/utils_test.py
@@ -294,7 +294,6 @@ class NPositiveIntegersTest(test.TestCase):
     self.assertEqual(utils.n_positive_integers(2, 2), (2, 2))
     self.assertEqual(utils.n_positive_integers(2, (2, 3)), (2, 3))
     self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
-    self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
     self.assertEqual(
         utils.n_positive_integers(3, tensor_shape.TensorShape([2, 3, 1])),
         (2, 3, 1))
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index abf6e393bb0fbbce4e43f6d209e9b30517df36c3..3b053cd4c66952cf6c494186b16c17f38801bcaf 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -5,6 +5,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 package(default_visibility = [
     "//engedu/ml/tf_from_scratch:__pkg__",
     "//tensorflow:internal",
@@ -115,6 +117,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/data_feeder_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
@@ -170,6 +173,7 @@ tf_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -188,6 +192,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/graph_actions_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -224,6 +229,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/monitors_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip_gpu"],  # b/74437598
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -275,7 +281,10 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/estimator_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "noasan",  # times out
+    ],
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -426,6 +435,10 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/kmeans_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "noasan",  # b/73741358
+        "nomac",
+    ],
     deps = [
         ":learn",
         "//tensorflow/python:array_ops",
@@ -584,6 +597,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/io_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/learn/python/learn/datasets",
@@ -813,6 +827,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/utils/saved_model_export_utils_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -867,15 +882,3 @@ py_binary(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/learn/README.md b/tensorflow/contrib/learn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d516bffc5e0327a3400068b35de5503e5a925a54
--- /dev/null
+++ b/tensorflow/contrib/learn/README.md
@@ -0,0 +1,143 @@
+EVERYTHING IN THIS DIRECTORY IS DEPRECATED.
+
+Using functions or classes will result in warnings.
+
+Instructions for converting to current alternatives are included in the
+warnings. A high-level overview is below.
+
+## Canned Estimators
+
+Many canned estimators (subclasses of `Estimator`) have equivalents in core:
+`DNNClassifier`, `DNNRegressor`, `DNNEstimator`, `LinearClassifier`,
+`LinearRegressor`, `DNNLinearCombinedClassifier` and
+`DNNLinearCombinedRegressor`. They are exposed under `tf.estimator`.
+`DNNEstimator`, `LinearEstimator` and `DNNLinearCombinedEstimator`
+are exposed under `tf.contrib.estimator`.
+
+To migrate to the new api, users need to take the following steps:
+
+* Replace `tf.contrib.learn` with `tf.estimator`.
+* If you subclass any of the estimators, stop doing that. You should be able to
+  write a factory method that returns a canned estimator instead. If this is not
+  possible (if you override methods from the canned estimator), consider writing
+  a custom estimator instead. See `tf.estimator.Estimator`.
+* Set `loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE` to preserve loss
+  reduction as the average over batch.
+* Some optimizer-related arguments are no longer passed in the estimator
+  constructor. Instead, we provide methods that perform the same job by wrapping
+  an optimizer. Specifically:
+  *  `gradient_clip_norm`: Use `tf.contrib.estimator.clip_gradients_by_norm`
+  *  `embedding_lr_multipliers`: Not supported.
+  Other arguments:
+  * `input_layer_min_slice_size`: Replaced by `input_layer_partitioner`
+  * `enable_centered_bias`: Not supported. Dropping this argument is unlikely to
+    harm your model.
+  * `feature_engineering_fn`: Not supported. You can call your
+    `feature_engineering_fn` inside your input_fn:
+    ```python
+    def new_input_fn():
+      features, labels = old_input_fn()
+      return feature_engineering_fn(features, labels)
+    ```
+* Use `tf.reshape` to reshape labels in your `input_fn`. `tf.estimator`
+  classifiers and regressors expect labels as a 2D Tensor of shape
+  `[batch_size, 1]`, or `[batch_size, n_labels]`. In contrast,
+  `tf.contrib.learn` classifiers and regressors supported labels with shape
+  `[batch_size]`.
+* If you pass custom metrics from the `evaluate()` method call, use
+  `tf.contrib.estimator.add_metrics`.
+* Replace your `serving_input_fn` with a `serving_input_receiver_fn`.
+  Note this should be entirely distinct from your training `input_fn`, so if you
+  previously had one `input_fn` with different "modes", you should now factor
+  that apart.  Where the former returned either a simple `(features, labels)`
+  tuple or `InputFnOps`, you should now return a `ServingInputReceiver`.
+  If you were generating your `serving_input_fn` using the
+  `build_parsing_serving_input_fn` helper, you can simply drop in the
+  replacement `build_parsing_serving_input_receiver_fn`.
+
+Some remaining estimators/classes:
+
+* `DynamicRnnEstimator`:  Consider a custom `model_fn`.
+* `KMeansClustering`: Use `tf.contrib.factorization.KMeansClustering`.
+* `LogisticRegressor`: Not supported. Instead, use `binary_classification_head`
+  with a custom `model_fn`, or with `DNNEstimator`.
+* `StateSavingRnnEstimator`: Consider a custom `model_fn`.
+* SVM: Consider a custom `model_fn`.
+* `LinearComposableModel` and `DNNComposableModel`: Not supported. 
+  Consider `tf.contrib.estimator.DNNEstimator`, or write a custom model_fn.
+* `MetricSpec`: Deprecated. For adding custom metrics to canned Estimators, use
+  `tf.contrib.estimator.add_metrics`.
+
+## Estimator
+`tf.contrib.learn.Estimator` is migrated to `tf.estimator.Estimator`.
+
+To migrate, users need to take the following steps:
+
+* Replace `tf.contrib.learn.Estimator` with `tf.estimator.Estimator`.
+* If you pass a `config` argument to `Estimator`, this must be
+  `tf.estimator.RunConfig`. You may need to edit your code accordingly.
+* Edit your `model_fn` to return `tf.estimator.EstimatorSpec`. Refer to
+  `EstimatorSpec` for documentation of specific fields.
+* If your `model_fn` uses the `mode` argument, use `tf.estimator.ModeKeys`.
+
+Some related classes:
+* `Evaluable`, `Trainable`: Not supported, merged into `tf.estimator.Estimator`.
+* ExportStrategy: Replaced by `tf.estimator.Exporter`.
+
+## Head/MultiHead
+These classes are now supported under `tf.contrib.estimator`, e.g.
+`tf.contrib.estimator.multi_class_head` and `tf.contrib.estimator.multi_head`.
+
+Some differences:
+
+* `multi_class_head`: If you use `tf.contrib.learn.multi_class_head` with
+  `n_classes=2`, switch to `tf.contrib.estimator.binary_classification_head`.
+* `loss_only_head`: Not supported.
+* `poisson_regression_head`: Not supported (yet).
+* `binary_svm_head`: Not supported (yet).
+* `no_op_train_fn`: Replace it with `tf.no_op`.
+
+Some arguments are renamed, please refer to documentation. In addition:
+
+* `loss_fn`: Supported for `multi_label_head`. If you need it for other heads,
+  please open an issue.
+* `metric_class_ids`: Not supported (yet).
+* `enable_centered_bias`: Not supported. Dropping this argument is unlikely to
+  harm your model.
+* `label_name`: Not needed in `tf.estimator`. If you don’t use `multi_head`,
+  drop this argument. If you use `multi_head`, refer to
+  `tf.contrib.estimator.multi_head` documentation.
+
+## Experiment Class - Distributed Training Tooling
+
+Switch to `tf.estimator.train_and_evaluate`. Some differences:
+
+* Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
+  should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
+* Remove the `experiment_fn`. Instead, create the `Estimator`,
+  `train_spec` and `eval_spec`, then call `tf.estimator.train_and_evaluate`
+  directly.
+* Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement
+  for `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
+  replacement for `tf.contrib.learn.make_export_strategy`. If you want to export
+  only at the end of training  use `tf.estimator.FinalExporter`.
+* If the `TF_CONFIG` environment variable is constructed manually, please read
+  the `train_and_evaluate` documentation for the new requirementds (in
+  particular, the chief node and evaluator node).
+
+## Others Classes and Functions
+
+* `tf.contrib.learn.datasets` is deprecated. We are adding ready to use datasets
+  to tensorflow/models. Many smaller datasets are available from other sources,
+  such as scikits.learn. Some Python processing may have to be written, but this
+  is straightforward to implement using the standard modules.
+* `tf.contrib.learn.preprocessing`: Deprecated. The python-only preprocessing
+  functions are not a good fit for TensorFlow. Please use `tf.data`, and
+  consider tensorflow/transform for more complex use cases.
+* `tf.contrib.learn.models`: Not supported, use canned estimators instead.
+* `tf.contrib.learn.monitors`: Implement `SessionRunHook` instead. Hook
+  implementations are in `tf.train`.
+* `tf.contrib.learn.learn_io`: Use the methods in `tf.estimator.inputs`, such as
+  `tf.estimator.inputs.numpy_input_fn`. Some utility functions have no
+  equivalent, we encourage the use of `tf.data`.
+
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 3698af027e38f1063ad829c26eb179734968f813..79bd73faaf1301a2fc4999b64f88d30542577980 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -13,8 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-# TODO(ptucker,ipolosukhin): Improve descriptions.
-"""High level API for learning.
+"""High level API for learning (DEPRECATED).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 See the @{$python/contrib.learn} guide.
 
diff --git a/tensorflow/contrib/learn/python/__init__.py b/tensorflow/contrib/learn/python/__init__.py
index bbebd5ab9792cb937219cf937f08c4d4e6e44a92..df23aeb2c433c2b4392f706730f715246ce01cea 100644
--- a/tensorflow/contrib/learn/python/__init__.py
+++ b/tensorflow/contrib/learn/python/__init__.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""High level API for learning with TensorFlow."""
+"""High level API for learning with TensorFlow (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py
index cdc67c77d5fd1df61016835dc75ba44feb458cf9..76e0e8ac8f19026086959f3b197cfd1a81e65a3e 100644
--- a/tensorflow/contrib/learn/python/learn/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/__init__.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""High level API for learning with TensorFlow."""
+"""High level API for learning with TensorFlow (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/basic_session_run_hooks.py b/tensorflow/contrib/learn/python/learn/basic_session_run_hooks.py
index 2284ec46e971731af74f17678fc0d1d3888419e2..fed1c44d1970bf07c808ace817aa9972d7776d88 100644
--- a/tensorflow/contrib/learn/python/learn/basic_session_run_hooks.py
+++ b/tensorflow/contrib/learn/python/learn/basic_session_run_hooks.py
@@ -12,20 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Some common SessionRunHook classes."""
+"""Some common SessionRunHook classes (deprected).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.util.deprecation import deprecated_alias
 
 # pylint: disable=invalid-name
-LoggingTensorHook = basic_session_run_hooks.LoggingTensorHook
-StopAtStepHook = basic_session_run_hooks.StopAtStepHook
-CheckpointSaverHook = basic_session_run_hooks.CheckpointSaverHook
-StepCounterHook = basic_session_run_hooks.StepCounterHook
-NanLossDuringTrainingError = basic_session_run_hooks.NanLossDuringTrainingError
-NanTensorHook = basic_session_run_hooks.NanTensorHook
-SummarySaverHook = basic_session_run_hooks.SummarySaverHook
+LoggingTensorHook = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.LoggingTensorHook',
+    'tf.train.LoggingTensorHook',
+    basic_session_run_hooks.LoggingTensorHook)
+StopAtStepHook = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.StopAtStepHook',
+    'tf.train.StopAtStepHook',
+    basic_session_run_hooks.StopAtStepHook)
+CheckpointSaverHook = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.CheckpointSaverHook',
+    'tf.train.CheckpointSaverHook',
+    basic_session_run_hooks.CheckpointSaverHook)
+StepCounterHook = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.StepCounterHook',
+    'tf.train.StepCounterHook',
+    basic_session_run_hooks.StepCounterHook)
+NanLossDuringTrainingError = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.NanLossDuringTrainingError',
+    'tf.train.NanLossDuringTrainingError',
+    basic_session_run_hooks.NanLossDuringTrainingError)
+NanTensorHook = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.NanTensorHook',
+    'tf.train.NanTensorHook',
+    basic_session_run_hooks.NanTensorHook)
+SummarySaverHook = deprecated_alias(
+    'tf.contrib.learn.basic_session_run_hooks.SummarySaverHook',
+    'tf.train.SummarySaverHook',
+    basic_session_run_hooks.SummarySaverHook)
 # pylint: enable=invalid-name
diff --git a/tensorflow/contrib/learn/python/learn/datasets/BUILD b/tensorflow/contrib/learn/python/learn/datasets/BUILD
index 8bf372841d04dc9e1339925474801d5aa3af4ccd..2c7215bba3816ff3762e5b7927f650d1c9cbf617 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/BUILD
+++ b/tensorflow/contrib/learn/python/learn/datasets/BUILD
@@ -44,18 +44,6 @@ py_binary(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_test(
     name = "base_test",
     size = "small",
diff --git a/tensorflow/contrib/learn/python/learn/datasets/__init__.py b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
index 7240b0de149051afa045a8113f9e9b212840c311..3c34712ac859d32f549468345950a93d2ed2aa56 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Dataset utilities and synthetic/reference datasets."""
+"""Dataset utilities and synthetic/reference datasets (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -27,6 +32,7 @@ from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.datasets import mnist
 from tensorflow.contrib.learn.python.learn.datasets import synthetic
 from tensorflow.contrib.learn.python.learn.datasets import text_datasets
+from tensorflow.python.util.deprecation import deprecated
 
 # Export load_iris and load_boston.
 load_iris = base.load_iris
@@ -51,6 +57,7 @@ SYNTHETIC = {
 }
 
 
+@deprecated(None, 'Please use tf.data.')
 def load_dataset(name, size='small', test_with_fake_data=False):
   """Loads dataset by name.
 
@@ -73,8 +80,9 @@ def load_dataset(name, size='small', test_with_fake_data=False):
     return DATASETS[name]()
 
 
+@deprecated(None, 'Please use tf.data.')
 def make_dataset(name, n_samples=100, noise=None, seed=42, *args, **kwargs):
-  """Creates binary synthetic datasets
+  """Creates binary synthetic datasets.
 
   Args:
     name: str, name of the dataset to generate
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index ca720ae5ed26e74da12bd6c5a37231b41442f76f..4676eedb206147d178c6a652aa7c2cb48ef888c0 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -12,7 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Base utilities for loading datasets."""
+
+"""Base utilities for loading datasets (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -29,11 +35,14 @@ import numpy as np
 from six.moves import urllib
 
 from tensorflow.python.platform import gfile
+from tensorflow.python.util.deprecation import deprecated
+
 
 Dataset = collections.namedtuple('Dataset', ['data', 'target'])
 Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test'])
 
 
+@deprecated(None, 'Use tf.data instead.')
 def load_csv_with_header(filename,
                          target_dtype,
                          features_dtype,
@@ -53,6 +62,7 @@ def load_csv_with_header(filename,
   return Dataset(data=data, target=target)
 
 
+@deprecated(None, 'Use tf.data instead.')
 def load_csv_without_header(filename,
                             target_dtype,
                             features_dtype,
@@ -70,6 +80,7 @@ def load_csv_without_header(filename,
   return Dataset(data=data, target=target)
 
 
+@deprecated(None, 'Use tf.data instead.')
 def shrink_csv(filename, ratio):
   """Create a smaller dataset of only 1/ratio of original data."""
   filename_small = filename.replace('.', '_small.')
@@ -84,6 +95,7 @@ def shrink_csv(filename, ratio):
         i += 1
 
 
+@deprecated(None, 'Use scikits.learn.datasets.')
 def load_iris(data_path=None):
   """Load Iris dataset.
 
@@ -100,6 +112,7 @@ def load_iris(data_path=None):
       data_path, target_dtype=np.int, features_dtype=np.float)
 
 
+@deprecated(None, 'Use scikits.learn.datasets.')
 def load_boston(data_path=None):
   """Load Boston housing dataset.
 
@@ -116,20 +129,58 @@ def load_boston(data_path=None):
       data_path, target_dtype=np.float, features_dtype=np.float)
 
 
-def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None):
+@deprecated(None, 'Use the retry module or similar alternatives.')
+def retry(initial_delay,
+          max_delay,
+          factor=2.0,
+          jitter=0.25,
+          is_retriable=None):
   """Simple decorator for wrapping retriable functions.
 
   Args:
     initial_delay: the initial delay.
+    max_delay: the maximum delay allowed (actual max is
+        max_delay * (1 + jitter).
     factor: each subsequent retry, the delay is multiplied by this value.
         (must be >= 1).
     jitter: to avoid lockstep, the returned delay is multiplied by a random
         number between (1-jitter) and (1+jitter). To add a 20% jitter, set
         jitter = 0.2. Must be < 1.
+    is_retriable: (optional) a function that takes an Exception as an argument
+        and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
+  """
+  return _internal_retry(
+      initial_delay=initial_delay,
+      max_delay=max_delay,
+      factor=factor,
+      jitter=jitter,
+      is_retriable=is_retriable)
+
+
+def _internal_retry(initial_delay,
+                    max_delay,
+                    factor=2.0,
+                    jitter=0.25,
+                    is_retriable=None):
+  """Simple decorator for wrapping retriable functions, for internal use only.
+
+  Args:
+    initial_delay: the initial delay.
     max_delay: the maximum delay allowed (actual max is
         max_delay * (1 + jitter).
+    factor: each subsequent retry, the delay is multiplied by this value.
+        (must be >= 1).
+    jitter: to avoid lockstep, the returned delay is multiplied by a random
+        number between (1-jitter) and (1+jitter). To add a 20% jitter, set
+        jitter = 0.2. Must be < 1.
     is_retriable: (optional) a function that takes an Exception as an argument
         and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
   """
   if factor < 1:
     raise ValueError('factor must be >= 1; was %f' % (factor,))
@@ -152,7 +203,7 @@ def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None):
       for delay in delays():
         try:
           return fn(*args, **kwargs)
-        except Exception as e:  # pylint: disable=broad-except)
+        except Exception as e:  # pylint: disable=broad-except
           if is_retriable is None:
             continue
 
@@ -176,11 +227,13 @@ def _is_retriable(e):
   return isinstance(e, IOError) and e.errno in _RETRIABLE_ERRNOS
 
 
-@retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
+@deprecated(None, 'Please use urllib or similar directly.')
+@_internal_retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
 def urlretrieve_with_retry(url, filename=None):
   return urllib.request.urlretrieve(url, filename)
 
 
+@deprecated(None, 'Please write your own downloading logic.')
 def maybe_download(filename, work_directory, source_url):
   """Download the data from source url, unless it's already here.
 
diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
index 37f9175015a239f763c7721cf36ab8063c0a3e32..abbb44c2f5b701829ce16f64eadd8ebc04c84e2c 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functions for downloading and reading MNIST data."""
+"""Functions for downloading and reading MNIST data (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -27,6 +32,7 @@ from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.platform import gfile
+from tensorflow.python.util.deprecation import deprecated
 
 # CVDF mirror of http://yann.lecun.com/exdb/mnist/
 DEFAULT_SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
@@ -37,6 +43,7 @@ def _read32(bytestream):
   return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]
 
 
+@deprecated(None, 'Please use tf.data to implement this functionality.')
 def extract_images(f):
   """Extract the images into a 4D uint8 numpy array [index, y, x, depth].
 
@@ -65,6 +72,7 @@ def extract_images(f):
     return data
 
 
+@deprecated(None, 'Please use tf.one_hot on tensors.')
 def dense_to_one_hot(labels_dense, num_classes):
   """Convert class labels from scalars to one-hot vectors."""
   num_labels = labels_dense.shape[0]
@@ -74,6 +82,7 @@ def dense_to_one_hot(labels_dense, num_classes):
   return labels_one_hot
 
 
+@deprecated(None, 'Please use tf.data to implement this functionality.')
 def extract_labels(f, one_hot=False, num_classes=10):
   """Extract the labels into a 1D uint8 numpy array [index].
 
@@ -103,7 +112,15 @@ def extract_labels(f, one_hot=False, num_classes=10):
 
 
 class DataSet(object):
+  """Container class for a dataset (deprecated).
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'Please use alternatives such as official/mnist/dataset.py'
+              ' from tensorflow/models.')
   def __init__(self,
                images,
                labels,
@@ -210,6 +227,8 @@ class DataSet(object):
       return self._images[start:end], self._labels[start:end]
 
 
+@deprecated(None, 'Please use alternatives such as official/mnist/dataset.py'
+            ' from tensorflow/models.')
 def read_data_sets(train_dir,
                    fake_data=False,
                    one_hot=False,
@@ -275,5 +294,7 @@ def read_data_sets(train_dir,
   return base.Datasets(train=train, validation=validation, test=test)
 
 
+@deprecated(None, 'Please use alternatives such as official/mnist/dataset.py'
+            ' from tensorflow/models.')
 def load_mnist(train_dir='MNIST-data'):
   return read_data_sets(train_dir)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/produce_small_datasets.py b/tensorflow/contrib/learn/python/learn/datasets/produce_small_datasets.py
index 6e0ba38941ce4650ede9f7210e284bde2ed8e6a9..a4848fa64a72f031ef35c0c3256e97a7326acd60 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/produce_small_datasets.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/produce_small_datasets.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Produce DBpedia datasets of a smaller size."""
+"""Produce DBpedia datasets of a smaller size (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
index 9a843168c27d9cae3f55efe4fe4c688d86c745f3..6a0e3350b3d1052249160a2a997a76de7a5040c3 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Synthetic dataset generators."""
+"""Synthetic dataset generators (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +26,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.learn.python.learn.datasets.base import Dataset
+from tensorflow.python.util.deprecation import deprecated
 
 
+@deprecated(None, 'Consider using synthetic datasets from scikits.learn.')
 def circles(n_samples=100,
             noise=None,
             seed=None,
@@ -93,6 +100,7 @@ def circles(n_samples=100,
   return Dataset(data=X[indices], target=y[indices])
 
 
+@deprecated(None, 'Consider using synthetic datasets from scikits.learn.')
 def spirals(n_samples=100,
             noise=None,
             seed=None,
diff --git a/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py b/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py
index 2596a2ecaf1572506504831e8b08fab9b5dbc119..ce9466301728082f8e9d99c90989ba8fe623bcf0 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Text datasets."""
+"""Text datasets (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,10 +31,12 @@ import numpy as np
 
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.python.platform import gfile
+from tensorflow.python.util.deprecation import deprecated
 
 DBPEDIA_URL = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'
 
 
+@deprecated(None, 'See contrib/learn/README.md')
 def maybe_download_dbpedia(data_dir):
   """Download if DBpedia data is not present."""
   train_path = os.path.join(data_dir, 'dbpedia_csv/train.csv')
@@ -41,6 +48,7 @@ def maybe_download_dbpedia(data_dir):
     tfile.extractall(data_dir)
 
 
+@deprecated(None, 'See contrib/learn/README.md')
 def load_dbpedia(size='small', test_with_fake_data=False):
   """Get DBpedia datasets from CSV files."""
   if not test_with_fake_data:
diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
index 4981750c94c7ac31e23b7a3f71ca30e3c9573a20..3e64595f312bcc2a2e8dcba589fb993a249b684b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""An estimator is a rule for calculating an estimate of a given quantity.
+"""An estimator is a rule for calculating an estimate of a given quantity (deprecated).
+
+These classes are deprecated and replaced with `tf.estimator`.
+
+See [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 # Estimators
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
index 15277415a1ce83dc1d4a334e60fe1933ba244df0..1f0e4663d060a3850e2002b27f809fde1db47e48 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""sklearn cross-support."""
+"""sklearn cross-support (deprecated)."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -132,6 +132,8 @@ class _TransformerMixin():
 class NotFittedError(ValueError, AttributeError):
   """Exception class to raise if estimator is used before fitting.
 
+  USE OF THIS EXCEPTION IS DEPRECATED.
+
   This class inherits from both ValueError and AttributeError to help with
   exception handling and backward compatibility.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model.py
index a02c726c74946d93b8e1726473db746220b00795..1fa58271e2b886cd143683a759145fd750791473 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorFlow composable models used as building blocks for estimators."""
+"""TensorFlow composable models used as building blocks for estimators (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -34,6 +39,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
+from tensorflow.python.util.deprecation import deprecated
 
 
 class _ComposableModel(object):
@@ -46,6 +52,7 @@ class _ComposableModel(object):
   _ComposableModel and its subclasses are not part of the public tf.learn API.
   """
 
+  @deprecated(None, "Please use model_fns in tf.estimator.")
   def __init__(self,
                num_label_columns,
                optimizer,
@@ -141,6 +148,10 @@ class _ComposableModel(object):
 class LinearComposableModel(_ComposableModel):
   """A _ComposableModel that implements linear regression.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Instances of this class can be used to build estimators through the use
   of composition.
   """
@@ -252,6 +263,10 @@ class LinearComposableModel(_ComposableModel):
 class DNNComposableModel(_ComposableModel):
   """A _ComposableModel that implements a DNN.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Instances of this class can be used to build estimators through the use
   of composition.
   """
diff --git a/tensorflow/contrib/learn/python/learn/estimators/constants.py b/tensorflow/contrib/learn/python/learn/estimators/constants.py
index fc69e810244a182b864be856e6720f8584f7aa65..d2548946bc77dea7c452d61c7e2b6e12c3d6239a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/constants.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/constants.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Constants regarding Estimators.
+"""Constants regarding Estimators (deprecated).
 
-This file is obsoleted in the move of Estimator to core.
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,6 +27,8 @@ from __future__ import print_function
 class ProblemType(object):
   """Enum-like values for the type of problem that the model solves.
 
+  THIS CLASS IS DEPRECATED.
+
   These values are used when exporting the model to produce the appropriate
   signature function for serving.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug.py b/tensorflow/contrib/learn/python/learn/estimators/debug.py
index 9d5f6c2bf969d7c85d251bf1b06a0307a41b2297..24b067b7e38b12df3d1d0c49f626344217218571 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Debug estimators.
+"""Debug estimators (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 Debug estimators are bias-only estimators that can be used for debugging
 and as simple baselines.
@@ -118,6 +122,10 @@ def debug_model_fn(features, labels, mode, params, config=None):
 class DebugClassifier(estimator.Estimator):
   """A classifier for TensorFlow Debug models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Example:
 
   ```python
@@ -237,6 +245,10 @@ class DebugClassifier(estimator.Estimator):
 class DebugRegressor(estimator.Estimator):
   """A regressor for TensorFlow Debug models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Example:
 
   ```python
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index c17b41c0f767e19d9c3635a8f60347a49b297cfb..eabebb7e881558471c343c0573cc9a8f4a425312 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Deep Neural Network estimators."""
+"""Deep Neural Network estimators (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -212,6 +217,10 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
 class DNNClassifier(estimator.Estimator):
   """A classifier for TensorFlow DNN models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Example:
 
   ```python
@@ -521,6 +530,10 @@ class DNNClassifier(estimator.Estimator):
 class DNNRegressor(estimator.Estimator):
   """A regressor for TensorFlow DNN models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Example:
 
   ```python
@@ -796,6 +809,10 @@ class DNNRegressor(estimator.Estimator):
 class DNNEstimator(estimator.Estimator):
   """A Estimator for TensorFlow DNN models with user specified _Head.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Example:
 
   ```python
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 726612235050def6e7addb503cc6646a25de0e42..3d85533d92d17095bae9a69f229171e1bf61ba10 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""TensorFlow estimators for Linear and DNN joined training models."""
+"""TensorFlow estimators for Linear and DNN joined training models (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -372,6 +377,10 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
 class DNNLinearCombinedEstimator(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined training models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Note: New users must set `fix_global_step_increment_bug=True` when creating an
   estimator.
 
@@ -490,6 +499,10 @@ class DNNLinearCombinedEstimator(estimator.Estimator):
 class DNNLinearCombinedClassifier(estimator.Estimator):
   """A classifier for TensorFlow Linear and DNN joined training models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Note: New users must set `fix_global_step_increment_bug=True` when creating an
   estimator.
 
@@ -832,6 +845,10 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
 class DNNLinearCombinedRegressor(estimator.Estimator):
   """A regressor for TensorFlow Linear and DNN joined training models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Note: New users must set `fix_global_step_increment_bug=True` when creating an
   estimator.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
index 69440e823ef1ed2d739f28bc14587891f2de80bb..a703dc66e922d48ceb64edc2a979061b8e45db49 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Estimator for Dynamic RNNs."""
+"""Estimator for Dynamic RNNs (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -540,6 +545,12 @@ def _get_dynamic_rnn_model_fn(
 
 
 class DynamicRnnEstimator(estimator.Estimator):
+  """Dynamically unrolled RNN (deprecated).
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
   def __init__(self,
                problem_type,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 4b63e08ab3372849309ee5d28d754de82e9632f4..7a026a15e4aeea0dde4ed9f7de053a757a0abb58 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Base Estimator class."""
+"""Base Estimator class (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -138,6 +143,7 @@ def _get_input_fn(x, y, input_fn, feed_fn, batch_size, shuffle=False, epochs=1):
   return df.input_builder, df.get_feed_dict_fn()
 
 
+@deprecated(None, 'Please specify feature columns explicitly.')
 def infer_real_valued_columns_from_input_fn(input_fn):
   """Creates `FeatureColumn` objects for inputs defined by `input_fn`.
 
@@ -158,6 +164,7 @@ def infer_real_valued_columns_from_input_fn(input_fn):
     return layers.infer_real_valued_columns(features)
 
 
+@deprecated(None, 'Please specify feature columns explicitly.')
 def infer_real_valued_columns_from_input(x):
   """Creates `FeatureColumn` objects for inputs defined by input `x`.
 
@@ -389,6 +396,10 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
                     trainable.Trainable):
   """Abstract BaseEstimator class to train and evaluate TensorFlow models.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Users should not instantiate or subclass this class. Instead, use an
   `Estimator`.
   """
@@ -399,6 +410,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
   # TODO(wicke): Remove this once launcher takes over config functionality
   _Config = run_config.RunConfig  # pylint: disable=invalid-name
 
+  @deprecated(None, 'Please replace uses of any Estimator from tf.contrib.learn'
+              ' with an Estimator from tf.estimator.*')
   def __init__(self, model_dir=None, config=None):
     """Initializes a BaseEstimator instance.
 
@@ -457,6 +470,20 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
     # TODO(wicke): make RunConfig immutable, and then return it without a copy.
     return copy.deepcopy(self._config)
 
+  @property
+  def model_fn(self):
+    """Returns the model_fn which is bound to self.params.
+
+    Returns:
+      The model_fn with the following signature:
+        `def model_fn(features, labels, mode, metrics)`
+    """
+
+    def public_model_fn(features, labels, mode, config):
+      return self._call_model_fn(features, labels, mode, config=config)
+
+    return public_model_fn
+
   @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
                    ('x', None), ('y', None), ('batch_size', None))
   def fit(self,
@@ -890,8 +917,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
       if feed_fn:
         hooks.append(basic_session_run_hooks.FeedFnHook(feed_fn))
       if steps == 0:
-        logging.warning('evaluation steps are 0. If `input_fn` does not raise'
-                        'OutOfRangeError`, the evaluation will never stop.'
+        logging.warning('evaluation steps are 0. If `input_fn` does not raise '
+                        '`OutOfRangeError`, the evaluation will never stop. '
                         'Use steps=None if intended.')
       if steps:
         hooks.append(
@@ -1074,6 +1101,10 @@ def _identity_feature_engineering_fn(features, labels):
 
 class Estimator(BaseEstimator):
   """Estimator class is the basic TensorFlow model trainer/evaluator.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
   """
 
   def __init__(self,
@@ -1162,7 +1193,7 @@ class Estimator(BaseEstimator):
     self._feature_engineering_fn = (
         feature_engineering_fn or _identity_feature_engineering_fn)
 
-  def _call_model_fn(self, features, labels, mode, metrics=None):
+  def _call_model_fn(self, features, labels, mode, metrics=None, config=None):
     """Calls model function with support of 2, 3 or 4 arguments.
 
     Args:
@@ -1170,6 +1201,7 @@ class Estimator(BaseEstimator):
       labels: labels dict.
       mode: ModeKeys
       metrics: Dict of metrics.
+      config: RunConfig.
 
     Returns:
       A `ModelFnOps` object. If model_fn returns a tuple, wraps them up in a
@@ -1186,7 +1218,10 @@ class Estimator(BaseEstimator):
     if 'params' in model_fn_args:
       kwargs['params'] = self.params
     if 'config' in model_fn_args:
-      kwargs['config'] = self.config
+      if config:
+        kwargs['config'] = config
+      else:
+        kwargs['config'] = self.config
     if 'model_dir' in model_fn_args:
       kwargs['model_dir'] = self.model_dir
     model_fn_results = self._model_fn(features, labels, **kwargs)
@@ -1458,8 +1493,14 @@ class Estimator(BaseEstimator):
 # For time of deprecation x,y from Estimator allow direct access.
 # pylint: disable=protected-access
 class SKCompat(sklearn.BaseEstimator):
-  """Scikit learn wrapper for TensorFlow Learn Estimator."""
+  """Scikit learn wrapper for TensorFlow Learn Estimator.
+  
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'Please switch to the Estimator interface.')
   def __init__(self, estimator):
     self._estimator = estimator
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index d81a534b79bc90fe91ffd3cb97a7865a7cb4c2a9..9e5aaf3118dfed4ce64dd244a915860b5a2eef44 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -715,7 +715,9 @@ class EstimatorTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
-    self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'],
+    # TODO(b/78461127): Please modify tests to not directly rely on names of
+    # checkpoints.
+    self.assertAllEqual(['model.ckpt-0', 'model.ckpt-5'],
                         ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py
index fd47710e3015de9ae6a453f98978b0ef8f88968c..e4c31396baf8271c49395a2b87b454dbc77177e2 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utils for Estimator."""
+"""Utils for Estimator (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 9b124b2c19f16bbc9b2afeadb82a32006e1a0ae9..e28e6854a5097d66cb486be3e82f3726f5cc70fd 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -12,8 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Abstractions for the head(s) of a model.
+"""Abstractions for the head(s) of a model (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 """
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -47,11 +52,16 @@ from tensorflow.python.summary import summary
 from tensorflow.python.training import training
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.deprecation import deprecated
 
 
 class Head(object):
   """Interface for the head/top of a model.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Given logits (or output of a hidden layer), a Head knows how to compute
   predictions, loss, default metric and export signature. It is meant to,
 
@@ -177,6 +187,7 @@ class Head(object):
     raise NotImplementedError("Calling an abstract method.")
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def regression_head(label_name=None,
                     weight_column_name=None,
                     label_dimension=1,
@@ -216,6 +227,7 @@ def regression_head(label_name=None,
       link_fn=(link_fn if link_fn is not None else array_ops.identity))
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def poisson_regression_head(label_name=None,
                             weight_column_name=None,
                             label_dimension=1,
@@ -254,6 +266,7 @@ def poisson_regression_head(label_name=None,
 # TODO(zakaria): Consider adding a _RegressionHead for logistic_regression
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def multi_class_head(n_classes,
                      label_name=None,
                      weight_column_name=None,
@@ -335,6 +348,7 @@ def multi_class_head(n_classes,
       label_keys=label_keys)
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def binary_svm_head(
     label_name=None,
     weight_column_name=None,
@@ -370,6 +384,7 @@ def binary_svm_head(
       thresholds=thresholds)
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def multi_label_head(n_classes,
                      label_name=None,
                      weight_column_name=None,
@@ -430,6 +445,7 @@ def multi_label_head(n_classes,
       loss_fn=_wrap_custom_loss_fn(loss_fn) if loss_fn else None)
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def loss_only_head(loss_fn, head_name=None):
   """Creates a Head that contains only loss terms.
 
@@ -447,6 +463,7 @@ def loss_only_head(loss_fn, head_name=None):
   return _LossOnlyHead(loss_fn, head_name=head_name)
 
 
+@deprecated(None, "Please switch to tf.contrib.estimator.*_head.")
 def multi_head(heads, loss_weights=None):
   """Creates a MultiHead stemming from same logits/hidden layer.
 
@@ -479,6 +496,7 @@ def multi_head(heads, loss_weights=None):
   return _MultiHead(heads, loss_merger=_weighted_loss_merger)
 
 
+@deprecated(None, "Use 'lambda _: tf.no_op()'.")
 def no_op_train_fn(loss):
   del loss
   return control_flow_ops.no_op()
@@ -759,7 +777,7 @@ class _RegressionHead(_SingleHead):
     key = prediction_key.PredictionKey.SCORES
     with ops.name_scope(None, "predictions", (logits,)):
       if self.logits_dimension == 1:
-        logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key)
+        logits = array_ops.squeeze(logits, axis=(1,), name=key)
       return {key: self._link_fn(logits)}
 
   def _metrics(self, eval_loss, predictions, labels, weights):
@@ -956,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None):
     is_squeezed_labels = False
     # TODO(ptucker): This will break for dynamic shapes.
     if len(labels.get_shape()) == 2:
-      labels = array_ops.squeeze(labels, squeeze_dims=(1,))
+      labels = array_ops.squeeze(labels, axis=(1,))
       is_squeezed_labels = True
 
     loss = nn.sparse_softmax_cross_entropy_with_logits(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index 6d5da81b4c2087fb9c5307902e452a6220a17cd0..7c2d9bb0767cb979dae9c84b5342d129225677ed 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -362,7 +362,7 @@ class MultiLabelHeadTest(test.TestCase):
         "auc_precision_recall": 0.166667,
         "auc_precision_recall/class0": 0,
         "auc_precision_recall/class1": 0.,
-        "auc_precision_recall/class2": 0.49999,
+        "auc_precision_recall/class2": 1.,
         "labels/actual_label_mean/class0": self._labels[0][0],
         "labels/actual_label_mean/class1": self._labels[0][1],
         "labels/actual_label_mean/class2": self._labels[0][2],
@@ -748,7 +748,7 @@ class BinaryClassificationHeadTest(test.TestCase):
         "accuracy/baseline_label_mean": label_mean,
         "accuracy/threshold_0.500000_mean": 1. / 2,
         "auc": 1. / 2,
-        "auc_precision_recall": 0.25,
+        "auc_precision_recall": 0.749999,
         "labels/actual_label_mean": label_mean,
         "labels/prediction_mean": .731059,  # softmax
         "loss": expected_loss,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index 8f9d6fc318a357853bdb8e3264f6691b410006b1..66ebcfd1d81904b9afe5be6bd1a648fe325e1e0b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of k-means clustering on top of `Estimator` API.
+"""Implementation of k-means clustering on top of `Estimator` API (deprecated).
 
 This module is deprecated. Please use
 @{tf.contrib.factorization.KMeansClustering} instead of
@@ -153,7 +153,12 @@ def _kmeans_clustering_model_fn(features, labels, mode, params, config):
 
 # TODO(agarwal,ands): support sharded input.
 class KMeansClustering(estimator.Estimator):
-  """An Estimator for K-Means clustering."""
+  """An Estimator for K-Means clustering.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
   SQUARED_EUCLIDEAN_DISTANCE = clustering_ops.SQUARED_EUCLIDEAN_DISTANCE
   COSINE_DISTANCE = clustering_ops.COSINE_DISTANCE
   RANDOM_INIT = clustering_ops.RANDOM_INIT
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
index b28835a809736a099ad2f08d127dc68d7977a3c1..584556992a0db2345e182e92c4a7f7582d3cd8dc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index 37aa8b339622415d082933cdf66d2472a4119b48..70b70af98c51dcb991c19152607272673953ee2a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Linear Estimators."""
+"""Linear Estimators (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -238,8 +243,8 @@ def sdca_model_fn(features, labels, mode, params):
 
   parent_scope = "linear"
 
-  with variable_scope.variable_op_scope(
-      features.values(), parent_scope) as scope:
+  with variable_scope.variable_scope(
+      values=features.values(), name_or_scope=parent_scope) as scope:
     features = features.copy()
     features.update(layers.transform_features(features, feature_columns))
     logits, columns_to_variables, bias = (
@@ -305,6 +310,10 @@ class _SdcaUpdateWeightsHook(session_run_hook.SessionRunHook):
 class LinearClassifier(estimator.Estimator):
   """Linear classifier model.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Train a linear model to classify instances into one of multiple possible
   classes. When number of possible classes is 2, this is binary classification.
 
@@ -625,6 +634,10 @@ class LinearClassifier(estimator.Estimator):
 class LinearRegressor(estimator.Estimator):
   """Linear regressor model.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Train a linear regression model to predict label value given observation of
   feature values.
 
@@ -860,6 +873,10 @@ class LinearRegressor(estimator.Estimator):
 class LinearEstimator(estimator.Estimator):
   """Linear model with user specified head.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Train a generalized linear model to predict label value given observation of
   feature values.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
index fb339160d58e09d4ffd50090f2dbbcec08bebe47..3cbcc6e98de1c915c302617e4591c9baa33adeaf 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Logistic regression (aka binary classifier) class.
+"""Logistic regression (aka binary classifier) class (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 This defines some useful basic metrics for using logistic regression to classify
 a binary event (0 vs 1).
@@ -75,6 +79,10 @@ def LogisticRegressor(  # pylint: disable=invalid-name
     feature_engineering_fn=None):
   """Builds a logistic regression Estimator for binary classification.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   This method provides a basic Estimator with some additional metrics for custom
   binary classification models, including AUC, precision/recall and accuracy.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/metric_key.py b/tensorflow/contrib/learn/python/learn/estimators/metric_key.py
index 99388f116b345bd038f2985606c6203011597ea2..f264248e44d9aa48f26ee32e36746bd4c3145a8d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/metric_key.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/metric_key.py
@@ -12,14 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Enum for metric keys."""
+"""Enum for metric keys (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 
 class MetricKey(object):
-  """Metric key strings."""
+  """Metric key strings (deprecated)."""
+  
   LOSS = "loss"
   AUC = "auc"
   AUC_PR = "auc_precision_recall"
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
index 44e6c7c52dac524a22e9099e33e2aef82f8fe7ba..dcb161180c99ce71195c820217e8bdaf79d70901 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Classes and methods related to model_fn."""
+"""Classes and methods related to model_fn (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -37,10 +42,13 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.deprecation import deprecated
 
 
 class ModeKeys(object):
-  """Standard names for model modes.
+  """Standard names for model modes (deprecated).
+
+  THIS CLASS IS DEPRECATED.
 
   The following standard keys are defined:
 
@@ -65,8 +73,16 @@ class ModelFnOps(
         'output_alternatives', 'training_chief_hooks', 'training_hooks',
         'scaffold', 'mode'
     ])):
-  """Ops returned from a model_fn."""
+  """Ops returned from a model_fn.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'When switching to tf.estimator.Estimator, use '
+              'tf.estimator.EstimatorSpec. You can use the `estimator_spec`'
+              ' method to create an equivalent one.')
   def __new__(cls,
               mode,
               predictions=None,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/prediction_key.py b/tensorflow/contrib/learn/python/learn/estimators/prediction_key.py
index f8d87b8914307a86eb2f46123a28ff11eb925eda..6fd2fc9d592cef4e44a640e2f27cb28b367d44d5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/prediction_key.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/prediction_key.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Enum for model prediction keys.
+"""Enum for model prediction keys (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 This file is obsoleted in the move of Estimator to core.
 """
@@ -22,6 +26,8 @@ from __future__ import print_function
 
 
 class PredictionKey(object):
+  """THIS CLASS IS DEPRECATED."""
+
   CLASSES = "classes"
   PROBABILITIES = "probabilities"
   LOGITS = "logits"
diff --git a/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py b/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py
index 2752bc2d90ee0f51b2c40ccc4d24a4eb21cff38f..215022e5d9e5d3cd5d6a96583b325b19a1719568 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common operations for RNN Estimators."""
+"""Common operations for RNN Estimators (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index fd90fd1cc6277e7d80287aefdbab6134dac7c0d5..14ee2ba6094760d52180d6de7763ea88b8ee98c8 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Run Config."""
+"""Run Config (deprecated, use tf.estimator.RunConfig instead).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -29,11 +34,12 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as core_run_config
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.util.deprecation import deprecated
 
 
 # A list of the property names in RunConfig user allows to change. They will
 # not affect the execution framework, so when execution framework checks the
-# `uid` of the RunConfig, it should be ingored.
+# `uid` of the RunConfig, it should be ignored.
 _DEFAULT_UID_WHITE_LIST = [
     'tf_random_seed',
     'save_summary_steps',
@@ -47,6 +53,7 @@ _DEFAULT_UID_WHITE_LIST = [
 
 
 class Environment(object):
+  """DEPRECATED CLASS."""
   # For running general distributed training.
   CLOUD = 'cloud'
   # For running Google-internal distributed training.
@@ -56,6 +63,7 @@ class Environment(object):
 
 
 class TaskType(object):
+  """DEPRECATED CLASS."""
   MASTER = 'master'
   PS = 'ps'
   WORKER = 'worker'
@@ -64,6 +72,8 @@ class TaskType(object):
 class ClusterConfig(object):
   """This class specifies the configurations for a distributed run.
 
+  THIS CLASS IS DEPRECATED. Use tf.estimator.RunConfig instead.
+
   If you're using an `Estimator`, you should probably use the subclass
   RunConfig instead.
   """
@@ -211,10 +221,13 @@ class ClusterConfig(object):
 class RunConfig(ClusterConfig, core_run_config.RunConfig):
   """This class specifies the configurations for an `Estimator` run.
 
-  This class is the implementation of @{tf.estimator.RunConfig} interface.
+  This class is a deprecated implementation of @{tf.estimator.RunConfig}
+  interface.
   """
   _USE_DEFAULT = 0
 
+  @deprecated(None, 'When switching to tf.estimator.Estimator, use'
+              ' tf.estimator.RunConfig instead.')
   def __init__(self,
                master=None,
                num_cores=0,
@@ -277,8 +290,16 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
         Note - using this argument, it is easy to provide settings which break
         otherwise perfectly good models. Use with care.
     """
-    super(RunConfig, self).__init__(
-        master=master, evaluation_master=evaluation_master)
+    # Neither parent class calls super().__init__(), so here we have to
+    # manually call their __init__() methods.
+    ClusterConfig.__init__(
+        self, master=master, evaluation_master=evaluation_master)
+    # For too long this code didn't call:
+    #   core_run_config.RunConfig.__init__(self)
+    # so instead of breaking compatibility with that assumption, we
+    # just manually initialize this field:
+    self._train_distribute = None
+    self._device_fn = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
index 0cea35e219a4457417a161a3ac4ac4292fd24f53..de78c72c3ae3ef14f5f7c46b1d47f82e8266c7c6 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Estimator for State Saving RNNs."""
+"""Estimator for State Saving RNNs (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -528,6 +533,12 @@ def _get_rnn_model_fn(cell_type,
 
 
 class StateSavingRnnEstimator(estimator.Estimator):
+  """RNN with static unrolling and state saving (deprecated).
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
   def __init__(self,
                problem_type,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm.py b/tensorflow/contrib/learn/python/learn/estimators/svm.py
index 72920d73c0c92886e54f533ad7fe170fe27d9870..3459997baba16fc0d4045e50819ecdd0e7121657 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/svm.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Support Vector Machine (SVM) Estimator."""
+"""Support Vector Machine (SVM) Estimator (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -36,6 +41,10 @@ def _as_iterable(preds, output):
 class SVM(estimator.Estimator):
   """Support Vector Machine (SVM) model for binary classification.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Currently, only linear SVMs are supported. For the underlying optimization
   problem, the `SDCAOptimizer` is used. For performance and convergence tuning,
   the num_loss_partitions parameter passed to `SDCAOptimizer` (see `__init__()`
diff --git a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py
index a120bc6cc3975a3d4559d018c8aa74ff42a16d2d..71b5658dd174d2b47e33860844359f68e6768026 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""TensorSignature class and utilities."""
+"""TensorSignature class and utilities (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -33,6 +38,10 @@ class TensorSignature(collections.namedtuple(
     "TensorSignature", ["dtype", "shape", "is_sparse"])):
   """Signature of the `Tensor` object.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Useful to check compatibility of tensors.
 
   Example:
diff --git a/tensorflow/contrib/learn/python/learn/estimators/test_data.py b/tensorflow/contrib/learn/python/learn/estimators/test_data.py
index ed201bfc58f273e6587850032386c2686aea4148..e4b057b4f5a9e081c2d891bd9828ffc315e51e91 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/test_data.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/test_data.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Test data utilities."""
+"""Test data utilities (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
index 8f6cd39864b437f163dd7c1140dc88755ce98529..10881ca885599bc81386e15f814a2687d907f63b 100644
--- a/tensorflow/contrib/learn/python/learn/evaluable.py
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""`Evaluable` interface."""
+"""`Evaluable` interface (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,6 +28,10 @@ import abc
 
 class Evaluable(object):
   """Interface for objects that are evaluatable by, e.g., `Experiment`.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
   """
   __metaclass__ = abc.ABCMeta
 
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index bec976afd2719138117976381669ca3292360480..3744abd860e7f460133873eb534fd75887182f78 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Experiment class collecting information needed for a single training run."""
+"""Experiment class collecting information for a single training run (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,7 +30,6 @@ import os
 import time
 
 from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.framework import deprecated_args
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import export_strategy
@@ -118,6 +122,10 @@ class _EvalAndExportListener(basic_session_run_hooks.CheckpointSaverListener):
 class Experiment(object):
   """Experiment is a class containing all information needed to train a model.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   After an experiment is created (by passing an Estimator and inputs for
   training and evaluation), an Experiment instance knows how to invoke training
   and eval loops in a sensible fashion for distributed training.
@@ -125,16 +133,8 @@ class Experiment(object):
 
   # TODO(ispir): remove delay_workers_by_global_step and make global step based
   # waiting as only behavior.
-  @deprecated_args(
-      "2016-10-23",
-      "local_eval_frequency is deprecated as local_run will be renamed to "
-      "train_and_evaluate. Use min_eval_frequency and call train_and_evaluate "
-      "instead. Note, however, that the default for min_eval_frequency is 1, "
-      "meaning models will be evaluated every time a new checkpoint is "
-      "available. In contrast, the default for local_eval_frequency is None, "
-      "resulting in evaluation occurring only after training has completed. "
-      "min_eval_frequency is ignored when calling the deprecated local_run.",
-      "local_eval_frequency")
+  @deprecated(None, "Please switch to tf.estimator.train_and_evaluate. You will"
+              " also have to convert to a tf.estimator.Estimator.")
   def __init__(self,
                estimator,
                train_input_fn,
@@ -152,7 +152,8 @@ class Experiment(object):
                export_strategies=None,
                train_steps_per_iteration=None,
                checkpoint_and_export=False,
-               saving_listeners=None):
+               saving_listeners=None,
+               check_interval_secs=5):
     """Constructor for `Experiment`.
 
     Creates an Experiment instance. None of the functions passed to this
@@ -190,8 +191,9 @@ class Experiment(object):
         number of steps between evaluations. Of course, evaluation does not
         occur if no new snapshot is available, hence, this is the minimum.
         If 0, the evaluation will only happen after training.
-        If None, defaults to 1, unless model_dir is on GCS, in which case the
-        default is 1000.
+        If None, defaults to 1. To avoid checking for new checkpoints too
+        frequent, the interval is further limited to be at least
+        check_interval_secs between checks.
       delay_workers_by_global_step: if `True` delays training workers
         based on global step instead of time.
       export_strategies: Iterable of `ExportStrategy`s, or a single one, or
@@ -215,7 +217,10 @@ class Experiment(object):
       saving_listeners: list of `CheckpointSaverListener` objects. Used by
         tf.estimator.Estimator for callbacks that run immediately before or
         after checkpoint savings.
-
+      check_interval_secs:
+        Minimum time between subsequent checks for a new checkpoint. This
+        mostly applies if both min_eval_frequency and the time spent per
+        training step is low.
     Raises:
       ValueError: if `estimator` does not implement Estimator interface,
         or if export_strategies has the wrong type.
@@ -261,13 +266,9 @@ class Experiment(object):
     self._continuous_eval_throttle_secs = continuous_eval_throttle_secs
     self._checkpoint_and_export = checkpoint_and_export
     self._saving_listeners = saving_listeners
-    # Using 1 on a non-cached file system requires a lot of overhead to
-    # read the checkpoint state file. This is particular bad on GCS, so
-    # we use a different default. This is a temporary band-aid, to be
-    # fixed holistically later (b/36498507).
-    default_min_eval_frequency = 1000 if _is_gcs(estimator.model_dir) else 1
     self._min_eval_frequency = min_eval_frequency if (
-        min_eval_frequency is not None) else default_min_eval_frequency
+        min_eval_frequency is not None) else 1
+    self._check_interval_secs = check_interval_secs
     self._delay_workers_by_global_step = delay_workers_by_global_step
     self._train_monitors = train_monitors[:] if train_monitors else []
     self._eval_hooks = eval_hooks[:] if eval_hooks else []
@@ -357,7 +358,7 @@ class Experiment(object):
         self._start_server()
     elif config.cluster_spec and config.master:
       raise ValueError(
-          "For distributed runtime, Experiment class only works with"
+          "For distributed runtime, Experiment class only works with "
           "tf.contrib.learn.RunConfig for now, but provided {}".format(
               type(config)))
 
@@ -646,12 +647,19 @@ class Experiment(object):
         self._train_monitors += [saver_hook]
       else:
         if self._min_eval_frequency:
+          # Using low min_eval_frequency (default is 1) on a non-cached file
+          # system requires a lot of overhead to read the checkpoint state file.
+          # This is particular bad on GCS and CNS. See also b/36498507 for
+          # context. `check_interval_secs = 5` avoids polling a remote
+          # fileystem too often.
+
           self._train_monitors += [
               monitors.ValidationMonitor(
                   input_fn=self._eval_input_fn,
                   eval_steps=self._eval_steps,
                   metrics=self._eval_metrics,
                   every_n_steps=self._min_eval_frequency,
+                  check_interval_secs=self._check_interval_secs,
                   name=eval_dir_suffix,
                   hooks=self._eval_hooks)
           ]
@@ -928,7 +936,3 @@ def _new_attr_context(obj, attr):
     yield
   finally:
     setattr(obj, attr, saved)
-
-
-def _is_gcs(model_dir):
-  return model_dir and model_dir.startswith("gs://")
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index 545d7d8924c0c10544e6113e2968b7ae3d2090fc..d10927a0cdd5c67c8d2a8e569153235ee175ec4d 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -674,37 +674,11 @@ class ExperimentTest(test.TestCase):
   def test_min_eval_frequency_defaults(self):
     def dummy_model_fn(features, labels):  # pylint: disable=unused-argument
       pass
-
-    # The default value when model_dir is on GCS is 1000
-    estimator = core_estimator.Estimator(dummy_model_fn, 'gs://dummy_bucket')
-    ex = experiment.Experiment(
-        estimator, train_input_fn=None, eval_input_fn=None)
-    self.assertEquals(ex._min_eval_frequency, 1000)
-
-    # The default value when model_dir is not on GCS is 1
     estimator = core_estimator.Estimator(dummy_model_fn, '/tmp/dummy')
     ex = experiment.Experiment(
         estimator, train_input_fn=None, eval_input_fn=None)
     self.assertEquals(ex._min_eval_frequency, 1)
 
-    # Make sure default not used when explicitly set
-    estimator = core_estimator.Estimator(dummy_model_fn, 'gs://dummy_bucket')
-    ex = experiment.Experiment(
-        estimator,
-        min_eval_frequency=123,
-        train_input_fn=None,
-        eval_input_fn=None)
-    self.assertEquals(ex._min_eval_frequency, 123)
-
-    # Make sure default not used when explicitly set as 0
-    estimator = core_estimator.Estimator(dummy_model_fn, 'gs://dummy_bucket')
-    ex = experiment.Experiment(
-        estimator,
-        min_eval_frequency=0,
-        train_input_fn=None,
-        eval_input_fn=None)
-    self.assertEquals(ex._min_eval_frequency, 0)
-
   def test_continuous_train_and_eval(self):
     for est in self._estimators_for_tests(eval_dict={'global_step': 100}):
       if isinstance(est, core_estimator.Estimator):
diff --git a/tensorflow/contrib/learn/python/learn/export_strategy.py b/tensorflow/contrib/learn/python/learn/export_strategy.py
index 55a8b824312b89e0ac66513242191f4201ac212a..075cab536ecb5279e7e6f23abb0b70c75043a7ec 100644
--- a/tensorflow/contrib/learn/python/learn/export_strategy.py
+++ b/tensorflow/contrib/learn/python/learn/export_strategy.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""ExportStrategy class represents different flavors of model export."""
+"""ExportStrategy class represents different flavors of model export (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +26,7 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.deprecation import deprecated
 
 __all__ = ['ExportStrategy']
 
@@ -30,6 +36,10 @@ class ExportStrategy(
                            ['name', 'export_fn', 'strip_default_attrs'])):
   """A class representing a type of model export.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Typically constructed by a utility function specific to the exporter, such as
   `saved_model_export_utils.make_export_strategy()`.
 
@@ -56,6 +66,8 @@ class ExportStrategy(
         forward compatibility of the resulting `SavedModel`.
   """
 
+  @deprecated(None, 'Please switch to tf.estimator.train_and_evaluate, and use '
+              'tf.estimator.Exporter.')
   def __new__(cls, name, export_fn, strip_default_attrs=None):
     return super(ExportStrategy, cls).__new__(
         cls, name, export_fn, strip_default_attrs)
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py
index 98365c05f663e5d2a06703457fc5663d7135f7d9..a997fab723a16dddf150aa9397863605e4e77933 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""High level operations on graphs."""
+"""High level operations on graphs (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -68,6 +73,7 @@ def clear_summary_writers():
   return summary_io.SummaryWriterCache.clear()
 
 
+@deprecated(None, 'Use `SummaryWriterCache.get` directly.')
 def get_summary_writer(logdir):
   """Returns single SummaryWriter per logdir in current run.
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
index 06c3782a471537cf3879450e6bd20899a35d96ac..8b133a4440d8cbc19abca64f972791fc16ade6f8 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tools to allow different io formats."""
+"""Tools to allow different io formats (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/dask_io.py b/tensorflow/contrib/learn/python/learn/learn_io/dask_io.py
index 7d666391cea3c0a52a2cb7e324c00d5f480710d5..e0a1948d95a727675dac8ff3ce9f55c35d5f8d8d 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/dask_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/dask_io.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Methods to allow dask.DataFrame."""
+"""Methods to allow dask.DataFrame (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +26,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.util.deprecation import deprecated
+
 try:
   # pylint: disable=g-import-not-at-top
   import dask.dataframe as dd
@@ -60,6 +67,7 @@ def _construct_dask_df_with_divisions(df):
     return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
 
 
+@deprecated(None, 'Please feed input to tf.data to support dask.')
 def extract_dask_data(data):
   """Extract data from dask.Series or dask.DataFrame for predictors.
 
@@ -81,6 +89,7 @@ def extract_dask_data(data):
     return data
 
 
+@deprecated(None, 'Please feed input to tf.data to support dask.')
 def extract_dask_labels(labels):
   """Extract data from dask.Series or dask.DataFrame for labels.
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index 96be8b1bc402479d5611965f27abb197363cb939..c45b1d186471125776d6536112aebb66bb5ad558 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementations of different data feeders to provide data for TF trainer."""
+"""Implementations of different data feeders to provide data for TF trainer (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 # TODO(ipolosukhin): Replace this module with feed-dict queue runners & queues.
 
@@ -31,6 +36,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.deprecation import deprecated
 
 # pylint: disable=g-multiple-import,g-bad-import-order
 from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
@@ -101,6 +107,7 @@ def _is_iterable(x):
   return hasattr(x, 'next') or hasattr(x, '__next__')
 
 
+@deprecated(None, 'Please use tensorflow/transform or tf.data.')
 def setup_train_data_feeder(x,
                             y,
                             n_classes,
@@ -188,6 +195,7 @@ def _batch_data(x, batch_size=None):
     yield np.matrix(chunk)
 
 
+@deprecated(None, 'Please use tensorflow/transform or tf.data.')
 def setup_predict_data_feeder(x, batch_size=None):
   """Returns an iterable for feeding into predict step.
 
@@ -219,6 +227,7 @@ def setup_predict_data_feeder(x, batch_size=None):
   return [x]
 
 
+@deprecated(None, 'Please use tensorflow/transform or tf.data.')
 def setup_processor_data_feeder(x):
   """Sets up processor iterable.
 
@@ -233,6 +242,7 @@ def setup_processor_data_feeder(x):
   return x
 
 
+@deprecated(None, 'Please convert numpy dtypes explicitly.')
 def check_array(array, dtype):
   """Checks array on dtype and converts it if different.
 
@@ -275,8 +285,14 @@ def _check_dtype(dtype):
 
 
 class DataFeeder(object):
-  """Data feeder is an example class to sample data for TF trainer."""
+  """Data feeder is an example class to sample data for TF trainer.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'Please use tensorflow/transform or tf.data.')
   def __init__(self,
                x,
                y,
@@ -563,6 +579,10 @@ class DataFeeder(object):
 class StreamingDataFeeder(DataFeeder):
   """Data feeder for TF trainer that reads data from iterator.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Streaming data feeder allows to read data as it comes it from disk or
   somewhere else. It's custom to have this iterators rotate infinetly over
   the dataset, to allow control of how much to learn on the trainer side.
@@ -771,11 +791,16 @@ class StreamingDataFeeder(DataFeeder):
 class DaskDataFeeder(object):
   """Data feeder for that reads data from dask.Series and dask.DataFrame.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Numpy arrays can be serialized to disk and it's possible to do random seeks
   into them. DaskDataFeeder will remove requirement to have full dataset in the
   memory and still do random seeks for sampling of batches.
   """
 
+  @deprecated(None, 'Please feed input to tf.data to support dask.')
   def __init__(self,
                x,
                y,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
index 82848be7df653dd60219317d28f233767746f544..1f439965daf956665bbedc919281df0ee07b5d62 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os.path
 import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -26,6 +27,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.learn.python.learn.learn_io import *
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.platform import test
 
 # pylint: enable=wildcard-import
@@ -35,6 +37,13 @@ class DataFeederTest(test.TestCase):
   # pylint: disable=undefined-variable
   """Tests for `DataFeeder`."""
 
+  def setUp(self):
+    self._base_dir = os.path.join(self.get_temp_dir(), 'base_dir')
+    file_io.create_dir(self._base_dir)
+
+  def tearDown(self):
+    file_io.delete_recursively(self._base_dir)
+
   def _wrap_dict(self, data, prepend=''):
     return {prepend + '1': data, prepend + '2': data}
 
@@ -45,14 +54,14 @@ class DataFeederTest(test.TestCase):
   def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data):
     feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
     if isinstance(input_data, dict):
-      for k, v in list(feeder.input_dtype.items()):
+      for v in list(feeder.input_dtype.values()):
         self.assertEqual(expected_np_dtype, v)
     else:
       self.assertEqual(expected_np_dtype, feeder.input_dtype)
     with ops.Graph().as_default() as g, self.test_session(g):
       inp, _ = feeder.input_builder()
       if isinstance(inp, dict):
-        for k, v in list(inp.items()):
+        for v in list(inp.values()):
           self.assertEqual(expected_tf_dtype, v.dtype)
       else:
         self.assertEqual(expected_tf_dtype, inp.dtype)
@@ -301,7 +310,10 @@ class DataFeederTest(test.TestCase):
                                                 [0.60000002, 0.2]])
       self.assertAllClose(feed_dict[out.name], [[0., 0., 1.], [0., 1., 0.]])
 
-  def test_hdf5_data_feeder(self):
+  # TODO(rohanj): Fix this test by fixing data_feeder. Currently, h5py doesn't
+  # support permutation based indexing lookups (More documentation at
+  # http://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing)
+  def DISABLED_test_hdf5_data_feeder(self):
 
     def func(df):
       inp, out = df.input_builder()
@@ -314,11 +326,12 @@ class DataFeederTest(test.TestCase):
       import h5py  # pylint: disable=g-import-not-at-top
       x = np.matrix([[1, 2], [3, 4]])
       y = np.array([1, 2])
-      h5f = h5py.File('test_hdf5.h5', 'w')
+      file_path = os.path.join(self._base_dir, 'test_hdf5.h5')
+      h5f = h5py.File(file_path, 'w')
       h5f.create_dataset('x', data=x)
       h5f.create_dataset('y', data=y)
       h5f.close()
-      h5f = h5py.File('test_hdf5.h5', 'r')
+      h5f = h5py.File(file_path, 'r')
       x = h5f['x']
       y = h5f['y']
       func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
index 884faf8335e2a3ca1d27d2d93b4c817131648774..f8aaa0c9e3e5b589a6ad47678dba3dc38de7c471 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Methods to allow generator of dict with numpy arrays."""
+"""Methods to allow generator of dict with numpy arrays (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,8 +28,10 @@ from types import FunctionType
 from types import GeneratorType
 
 from tensorflow.python.estimator.inputs.queues.feeding_functions import _enqueue_data as enqueue_data
+from tensorflow.python.util.deprecation import deprecated
 
 
+@deprecated(None, 'Please use tf.data.')
 def generator_input_fn(x,
                        target_key=None,
                        batch_size=128,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 3a46c239688017f9204d2c6182a6f81cd325a417..9e816f54b6cf8dee84c6d62406ab3db700054d06 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Methods to read data in the graph."""
+"""Methods to read data in the graph (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -34,11 +39,13 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.summary import summary
 from tensorflow.python.training import input as input_ops
 from tensorflow.python.training import queue_runner
+from tensorflow.python.util.deprecation import deprecated
 
 # Default name for key in the feature dict.
 KEY_FEATURE_NAME = '__key__'
 
 
+@deprecated(None, 'Use tf.data.')
 def read_batch_examples(file_pattern,
                         batch_size,
                         reader,
@@ -106,6 +113,7 @@ def read_batch_examples(file_pattern,
   return examples
 
 
+@deprecated(None, 'Use tf.data.')
 def read_keyed_batch_examples(file_pattern,
                               batch_size,
                               reader,
@@ -175,6 +183,7 @@ def read_keyed_batch_examples(file_pattern,
       seed=seed)
 
 
+@deprecated(None, 'Use tf.data.')
 def read_keyed_batch_examples_shared_queue(file_pattern,
                                            batch_size,
                                            reader,
@@ -452,6 +461,7 @@ def _read_keyed_batch_examples_helper(file_pattern,
     return queued_examples_with_keys
 
 
+@deprecated(None, 'Use tf.data.')
 def read_keyed_batch_features(file_pattern,
                               batch_size,
                               features,
@@ -540,6 +550,7 @@ def read_keyed_batch_features(file_pattern,
         name=scope)
 
 
+@deprecated(None, 'Use tf.data.')
 def read_keyed_batch_features_shared_queue(file_pattern,
                                            batch_size,
                                            features,
@@ -620,6 +631,7 @@ def read_keyed_batch_features_shared_queue(file_pattern,
         name=scope)
 
 
+@deprecated(None, 'Use tf.data.')
 def queue_parsed_features(parsed_features,
                           keys=None,
                           feature_queue_capacity=100,
@@ -742,6 +754,7 @@ def queue_parsed_features(parsed_features,
     return dequeued_keys, dequeued_parsed_features
 
 
+@deprecated(None, 'Use tf.data.')
 def read_batch_features(file_pattern,
                         batch_size,
                         features,
@@ -821,6 +834,7 @@ def read_batch_features(file_pattern,
   return features
 
 
+@deprecated(None, 'Use tf.data.')
 def read_batch_record_features(file_pattern,
                                batch_size,
                                features,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
index 692438807fbd7febb156d4db73b5d3deba6c987d..29552d24f1eaa0d85a99c8b09f69d007e7e4fe9f 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Methods to allow dict of numpy arrays."""
+"""Methods to allow dict of numpy arrays (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn as core_numpy_input_fn
+from tensorflow.python.util.deprecation import deprecated
 
 
+@deprecated(None, 'Use tf.estimator.inputs.numpy_input_fn.')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
index ede7558eafa9237dc63aa95a62e599c5e9755822..b4ef055f5ae484ec704ad42efcf2c00c4a7a4f56 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
@@ -13,13 +13,19 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Methods to allow pandas.DataFrame."""
+"""Methods to allow pandas.DataFrame (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.estimator.inputs.pandas_io import pandas_input_fn as core_pandas_input_fn
+from tensorflow.python.util.deprecation import deprecated
 
 try:
   # pylint: disable=g-import-not-at-top
@@ -47,6 +53,7 @@ PANDAS_DTYPES = {
 }
 
 
+@deprecated(None, 'Please use tf.estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
@@ -66,6 +73,7 @@ def pandas_input_fn(x,
                               target_column=target_column)
 
 
+@deprecated(None, 'Please access pandas data directly.')
 def extract_pandas_data(data):
   """Extract data from pandas.DataFrame for predictors.
 
@@ -96,6 +104,7 @@ def extract_pandas_data(data):
                      'float, or bool. Found: ' + ', '.join(error_report))
 
 
+@deprecated(None, 'Please access pandas data directly.')
 def extract_pandas_matrix(data):
   """Extracts numpy matrix from pandas DataFrame.
 
@@ -111,6 +120,7 @@ def extract_pandas_matrix(data):
   return data.as_matrix()
 
 
+@deprecated(None, 'Please access pandas data directly.')
 def extract_pandas_labels(labels):
   """Extract data from pandas.DataFrame for labels.
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
index 2af723a0d64822e81fa0fbeb106ab812de6ab4e8..d719a3e488b9905ef7903e21d90dbaae0449735c 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Runs an Experiment."""
+"""Runs an Experiment (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +27,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config as run_c
 from tensorflow.contrib.learn.python.learn.experiment import Experiment
 from tensorflow.contrib.training.python.training import hparam as hparam_lib
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.deprecation import deprecated
 
 
 # TODO(xiejw): Refactor the learn_runner to make code reusable.
@@ -99,6 +105,7 @@ def _wrapped_experiment_fn_with_uid_check(experiment_fn, require_hparams=False):
   return wrapped_experiment_fn
 
 
+@deprecated(None, 'Use tf.estimator.train_and_evaluate.')
 def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
         hparams=None):
   """Make and run an experiment.
@@ -218,6 +225,7 @@ def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
   return _execute_schedule(experiment, schedule)
 
 
+@deprecated(None, 'Use tf.estimator.train_and_evaluate.')
 def tune(experiment_fn, tuner):
   """Tune an experiment with hyper-parameters.
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner_lib.py b/tensorflow/contrib/learn/python/learn/learn_runner_lib.py
index 7d9b1c7716f0ab1f2274ca53406175240b613027..ba2d067787c1dfd4e4820ecc916f1053e9f3cf60 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner_lib.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner_lib.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities to run and tune an Experiment.
+"""Utilities to run and tune an Experiment (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 @@run
 @@tune
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index 6440bc204b8e339ff51311dcc87b36f556b94092..97220365d5dddb82b602369f06bea021a86d584f 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""The metric spec class to flexibly connect models and metrics."""
+"""The metric spec class to flexibly connect models and metrics (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +27,7 @@ import six
 
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.deprecation import deprecated
 
 
 def _assert_named_args(sentinel):
@@ -223,6 +229,10 @@ def _adapt_metric_fn(
 class MetricSpec(object):
   """MetricSpec connects a model to metric functions.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   The MetricSpec class contains all information necessary to connect the
   output of a `model_fn` to the metrics (usually, streaming metrics) that are
   used in evaluation.
@@ -284,6 +294,7 @@ class MetricSpec(object):
 
   """
 
+  @deprecated(None, 'Use tf.estimator.EstimatorSpec.eval_metric_ops.')
   def __init__(self,
                metric_fn,
                prediction_key=None,
diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py
index 4283240d018c949bb35aeb12032d2ee8b75884a5..bd4bbf9f8c9ad7e8a0fc06d8c0dc24672536c158 100644
--- a/tensorflow/contrib/learn/python/learn/models.py
+++ b/tensorflow/contrib/learn/python/learn/models.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Various high level TF models."""
+"""Various high level TF models (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,8 +33,10 @@ from tensorflow.python.ops import array_ops as array_ops_
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.summary import summary
+from tensorflow.python.util.deprecation import deprecated
 
 
+@deprecated(None, 'Consider using a tf.estimator.LinearRegressor')
 def linear_regression_zero_init(x, y):
   """Linear regression subgraph with zero-value initial weights and bias.
 
@@ -43,6 +50,7 @@ def linear_regression_zero_init(x, y):
   return linear_regression(x, y, init_mean=0.0, init_stddev=0.0)
 
 
+@deprecated(None, 'Consider using a class from tf.estimator.LinearClassifier')
 def logistic_regression_zero_init(x, y):
   """Logistic regression subgraph with zero-value initial weights and bias.
 
@@ -56,6 +64,7 @@ def logistic_regression_zero_init(x, y):
   return logistic_regression(x, y, init_mean=0.0, init_stddev=0.0)
 
 
+@deprecated(None, 'Consider using a class from tf.estimator.')
 def linear_regression(x, y, init_mean=None, init_stddev=1.0):
   """Creates linear regression TensorFlow subgraph.
 
@@ -107,6 +116,7 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0):
     return losses_ops.mean_squared_error_regressor(x, y, weights, bias)
 
 
+@deprecated(None, 'Consider using a class from tf.estimator.')
 def logistic_regression(x,
                         y,
                         class_weight=None,
@@ -203,6 +213,7 @@ def _reverse_seq(input_seq, lengths):
   return result
 
 
+@deprecated(None, 'Please consider `tf.nn.bidirectional_dynamic_rnn`.')
 def bidirectional_rnn(cell_fw,
                       cell_bw,
                       inputs,
@@ -283,6 +294,7 @@ def bidirectional_rnn(cell_fw,
 # End of TensorFlow 0.7
 
 
+@deprecated(None, 'Please consider tensorflow/tensor2tensor.')
 def get_rnn_model(rnn_size, cell_type, num_layers, input_op_fn, bidirectional,
                   target_predictor_fn, sequence_length, initial_state,
                   attn_length, attn_size, attn_vec_size):
diff --git a/tensorflow/contrib/learn/python/learn/monitored_session.py b/tensorflow/contrib/learn/python/learn/monitored_session.py
index 22602e9f69d972505d83a66a6f9183b5e4d15c44..ac0433f1775feeed2ec3cf49291da01500bef01b 100644
--- a/tensorflow/contrib/learn/python/learn/monitored_session.py
+++ b/tensorflow/contrib/learn/python/learn/monitored_session.py
@@ -13,7 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A wrapper of Session API which runs hooks."""
+"""A wrapper of Session API which runs hooks (deprecated).
+
+These are deprecated aliases for classes and functions in `tf.train`. Please use
+those directly.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 51381a7427c919592b8e818c4b46dba974992610..77f7c73d5412d40b338eaff4cf04d99fd0892723 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Monitors instrument the training process.
+"""Monitors instrument the training process (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 @@get_default_monitors
 @@BaseMonitor
@@ -59,6 +63,10 @@ from tensorflow.python.util import tf_inspect
 class BaseMonitor(object):
   """Base class for Monitors.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Defines basic interfaces of Monitors.
   Monitors can either be run on all workers or, more commonly, restricted
   to run exclusively on the elected chief worker.
@@ -229,6 +237,10 @@ def _extract_output(outputs, request):
 class EveryN(BaseMonitor):
   """Base class for monitors that execute callbacks every N steps.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   This class adds three new callbacks:
     - every_n_step_begin
     - every_n_step_end
@@ -418,6 +430,10 @@ class StopAtStep(BaseMonitor):
 class PrintTensor(EveryN):
   """Prints given tensors every N steps.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   This is an `EveryN` monitor and has consistent semantic for `every_n`
   and `first_n`.
 
@@ -455,9 +471,12 @@ class PrintTensor(EveryN):
 class LoggingTrainable(EveryN):
   """Writes trainable variable values into log every N steps.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Write the tensors in trainable variables `every_n` steps,
   starting with the `first_n`th step.
-
   """
 
   def __init__(self, scope=None, every_n=100, first_n=1):
@@ -493,7 +512,12 @@ class LoggingTrainable(EveryN):
 
 
 class SummarySaver(EveryN):
-  """Saves summaries every N steps."""
+  """Saves summaries every N steps.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
   def __init__(self,
                summary_op,
@@ -554,6 +578,10 @@ class SummarySaver(EveryN):
 class ValidationMonitor(EveryN):
   """Runs evaluation of a given estimator, at most every N steps.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Note that the evaluation is done based on the saved checkpoint, which will
   usually be older than the current step.
 
@@ -573,7 +601,8 @@ class ValidationMonitor(EveryN):
                early_stopping_rounds=None,
                early_stopping_metric="loss",
                early_stopping_metric_minimize=True,
-               name=None):
+               name=None,
+               check_interval_secs=5):
     """Initializes a ValidationMonitor.
 
     Args:
@@ -600,6 +629,9 @@ class ValidationMonitor(EveryN):
           loss metrics like mean squared error, and False for performance
           metrics like accuracy.
       name: See `BaseEstimator.evaluate`.
+      check_interval_secs: Only check for new checkpoint if at least
+          `check_interval_secs` have passed. Ignore if None. Default is 5 secs.
+
 
     Raises:
       ValueError: If both x and input_fn are provided.
@@ -626,6 +658,8 @@ class ValidationMonitor(EveryN):
     self._early_stopped = False
     self._latest_path = None
     self._latest_path_step = None
+    self._last_checkpoint_check_time = None
+    self._check_interval_secs = check_interval_secs
 
   @property
   def early_stopped(self):
@@ -690,6 +724,16 @@ class ValidationMonitor(EveryN):
     # that's what is being evaluated.
     if self._estimator is None:
       raise ValueError("Missing call to set_estimator.")
+    current_time = time.time()
+    if (self._check_interval_secs is not None and
+        self._last_checkpoint_check_time is not None and
+        current_time - self._last_checkpoint_check_time <=
+        self._check_interval_secs):
+      logging.debug(
+          "Skipping evaluation since less than %d seconds have passed since "
+          "last check for a new checkpoint.", self._check_interval_secs)
+      return False
+    self._last_checkpoint_check_time = current_time
     # Check that we are not running evaluation on the same checkpoint.
     latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir)
     if latest_path is None:
@@ -740,6 +784,10 @@ class ValidationMonitor(EveryN):
 class CaptureVariable(EveryN):
   """Captures a variable's values into a collection.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   This monitor is useful for unit testing. You should exercise caution when
   using this monitor in production, since it never discards values.
 
@@ -778,6 +826,7 @@ class CaptureVariable(EveryN):
     self._var_values[step] = _extract_output(outputs, self._var_name)
 
 
+@deprecation.deprecated(None, "Use tf.train.MonitoredTrainingSession.")
 def get_default_monitors(loss_op=None,
                          summary_op=None,
                          save_summary_steps=100,
@@ -812,6 +861,10 @@ def get_default_monitors(loss_op=None,
 class GraphDump(BaseMonitor):
   """Dumps almost all tensors in the graph at every step.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Note, this is very expensive, prefer `PrintTensor` in production.
   """
 
@@ -901,7 +954,12 @@ class GraphDump(BaseMonitor):
 
 
 class ExportMonitor(EveryN):
-  """Monitor that exports Estimator every N steps."""
+  """Monitor that exports Estimator every N steps.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
   @deprecation.deprecated("2017-03-25",
                           "ExportMonitor is deprecated. Please pass an "
@@ -1024,7 +1082,12 @@ class ExportMonitor(EveryN):
 
 
 class CheckpointSaver(BaseMonitor):
-  """Saves checkpoints every N steps or N seconds."""
+  """Saves checkpoints every N steps or N seconds.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
   def __init__(self,
                checkpoint_dir,
@@ -1109,7 +1172,12 @@ class CheckpointSaver(BaseMonitor):
 
 
 class StepCounter(EveryN):
-  """Steps per second monitor."""
+  """Steps per second monitor.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
   def __init__(self, every_n_steps=100, output_dir=None, summary_writer=None):
     super(StepCounter, self).__init__(every_n_steps=every_n_steps)
@@ -1149,6 +1217,10 @@ class NanLossDuringTrainingError(RuntimeError):
 class NanLoss(EveryN):
   """NaN Loss monitor.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Monitors loss and stops training if loss is NaN.
   Can either fail with exception or just stop training.
   """
diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py
index b2b24776c60183113a5f936dd276ff312d6d0079..5c34d0ddb01f3bcdc407e6926e7c5b73be1863b4 100644
--- a/tensorflow/contrib/learn/python/learn/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/monitors_test.py
@@ -385,7 +385,11 @@ class MonitorsTest(test.TestCase):
     estimator.evaluate.return_value = validation_outputs
 
     monitor = learn.monitors.ValidationMonitor(
-        x=constant_op.constant(2.0), every_n_steps=0, early_stopping_rounds=2)
+        x=constant_op.constant(2.0),
+        every_n_steps=0,
+        early_stopping_rounds=2,
+        check_interval_secs=None)
+
     self._assert_validation_monitor(monitor)
     monitor.set_estimator(estimator)
     with ops.Graph().as_default() as g, self.test_session(g):
diff --git a/tensorflow/contrib/learn/python/learn/ops/__init__.py b/tensorflow/contrib/learn/python/learn/ops/__init__.py
index 33962e34cc685ce2c830a7bbfd1b5c626bcd8b31..efb1f47cf5bb2dcd0fb37b7b85cd8f170d56e4d1 100644
--- a/tensorflow/contrib/learn/python/learn/ops/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/ops/__init__.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Various TensorFlow Ops."""
+"""Various TensorFlow Ops (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py b/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py
index fa3b7323e343371e986b763d30a8a44620894549..8f9811cf251ae0af1e0055a56e1358c2771b1367 100644
--- a/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py
@@ -13,7 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-"""TensorFlow Ops to work with embeddings.
+"""TensorFlow Ops to work with embeddings (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 Note: categorical variables are handled via embeddings in many cases.
 For example, in case of words.
@@ -57,7 +61,7 @@ def embedding_lookup(params, ids, name='embedding_lookup'):
     ids = ops.convert_to_tensor(ids)
     shape = array_ops_.shape(ids)
     ids_flat = array_ops_.reshape(
-        ids, math_ops.reduce_prod(shape, keep_dims=True))
+        ids, math_ops.reduce_prod(shape, keepdims=True))
     embeds_flat = nn.embedding_lookup(params, ids_flat, name)
     embed_shape = array_ops_.concat([shape, [-1]], 0)
     embeds = array_ops_.reshape(embeds_flat, embed_shape)
diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
index b040ab3bb6c516158589a8e30d56fff1f7728951..9f2cadb01747c5a8e4ee75ac38f423f85e11bbba 100644
--- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""TensorFlow Ops for loss computation."""
+"""TensorFlow Ops for loss computation (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -35,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None):
                       [tensor_in, labels]):
     predictions = nn.xw_plus_b(tensor_in, weights, biases)
     if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2:
-      predictions = array_ops_.squeeze(predictions, squeeze_dims=[1])
+      predictions = array_ops_.squeeze(predictions, axis=[1])
     return predictions, losses.mean_squared_error(labels, predictions)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py b/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py
index 45727faab4362abeab18f77861353eb53976023a..aa37cb4a76e2a6157bf077d327248353bd516472 100644
--- a/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""TensorFlow Ops for Sequence to Sequence models."""
+"""TensorFlow Ops for Sequence to Sequence models (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,8 +31,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.util.deprecation import deprecated
 
 
+@deprecated(None, 'Please use tf.nn/tf.layers directly.')
 def sequence_classifier(decoding, labels, sampling_decoding=None, name=None):
   """Returns predictions and loss for sequence of predictions.
 
@@ -57,6 +64,7 @@ def sequence_classifier(decoding, labels, sampling_decoding=None, name=None):
     return array_ops.stack(predictions, axis=1), loss
 
 
+@deprecated(None, 'Please use tf.nn/tf.layers directly.')
 def seq2seq_inputs(x, y, input_length, output_length, sentinel=None, name=None):
   """Processes inputs for Sequence to Sequence models.
 
@@ -87,6 +95,7 @@ def seq2seq_inputs(x, y, input_length, output_length, sentinel=None, name=None):
     return in_x, in_y, out_y
 
 
+@deprecated(None, 'Please use tf.nn/tf.layers directly.')
 def rnn_decoder(decoder_inputs, initial_state, cell, scope=None):
   """RNN Decoder that creates training and sampling sub-graphs.
 
@@ -123,6 +132,7 @@ def rnn_decoder(decoder_inputs, initial_state, cell, scope=None):
   return outputs, states, sampling_outputs, sampling_states
 
 
+@deprecated(None, 'Please use tf.nn/tf.layers directly.')
 def rnn_seq2seq(encoder_inputs,
                 decoder_inputs,
                 encoder_cell,
diff --git a/tensorflow/contrib/learn/python/learn/preprocessing/__init__.py b/tensorflow/contrib/learn/python/learn/preprocessing/__init__.py
index 7bcc177d4ea0ab57f092d68888a72de2b2fd5edc..e8c6e1acf80f0791421bee59aff30e67bccb44b2 100644
--- a/tensorflow/contrib/learn/python/learn/preprocessing/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/preprocessing/__init__.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Preprocessing tools useful for building models."""
+"""Preprocessing tools useful for building models (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/preprocessing/categorical.py b/tensorflow/contrib/learn/python/learn/preprocessing/categorical.py
index 154739d497ec1029026eaca1e93b37cd225f1050..faba3b2025e8abb51d1989c3fafbd5e711d6559b 100644
--- a/tensorflow/contrib/learn/python/learn/preprocessing/categorical.py
+++ b/tensorflow/contrib/learn/python/learn/preprocessing/categorical.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Implements preprocessing transformers for categorical variables."""
+"""Implements preprocessing transformers for categorical variables (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +27,8 @@ from __future__ import print_function
 import math
 import numpy as np
 
+from tensorflow.python.util.deprecation import deprecated
+
 # pylint: disable=g-bad-import-order
 from . import categorical_vocabulary
 from ..learn_io.data_feeder import setup_processor_data_feeder
@@ -31,10 +38,16 @@ from ..learn_io.data_feeder import setup_processor_data_feeder
 class CategoricalProcessor(object):
   """Maps documents to sequences of word ids.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   As a common convention, Nan values are handled as unknown tokens.
   Both float('nan') and np.nan are accepted.
   """
 
+  @deprecated(None, 'Please use tensorflow/transform or tf.data for sequence '
+              'processing.')
   def __init__(self, min_frequency=0, share=False, vocabularies=None):
     """Initializes a CategoricalProcessor instance.
 
diff --git a/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py b/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py
index 5709955c49fba50ca4a299a443a2902bbd9c6b23..3ac370a6ab4423846e810900514445ad5269b680 100644
--- a/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py
+++ b/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py
@@ -13,7 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Categorical vocabulary classes to map categories to indexes.
+"""Categorical vocabulary classes to map categories to indexes (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 Can be used for categorical variables, sparse variables and words.
 """
@@ -25,14 +29,21 @@ from __future__ import print_function
 import collections
 import six
 
+from tensorflow.python.util.deprecation import deprecated
+
 
 class CategoricalVocabulary(object):
   """Categorical variables vocabulary class.
 
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+
   Accumulates and provides mapping from classes to indexes.
   Can be easily used for words.
   """
 
+  @deprecated(None, 'Please use tensorflow/transform or tf.data.')
   def __init__(self, unknown_token="<UNK>", support_reverse=True):
     self._unknown_token = unknown_token
     self._mapping = {unknown_token: 0}
diff --git a/tensorflow/contrib/learn/python/learn/preprocessing/text.py b/tensorflow/contrib/learn/python/learn/preprocessing/text.py
index 3af2074c2a46f0258c04111fff0235ba8309625e..f2b6776be7789a9433bfe41eb9354b74347059ec 100644
--- a/tensorflow/contrib/learn/python/learn/preprocessing/text.py
+++ b/tensorflow/contrib/learn/python/learn/preprocessing/text.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Implements a number of text preprocessing utilities."""
+"""Implements a number of text preprocessing utilities (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,6 +29,7 @@ import numpy as np
 import six
 
 from tensorflow.python.platform import gfile
+from tensorflow.python.util.deprecation import deprecated
 
 from .categorical_vocabulary import CategoricalVocabulary  # pylint: disable=g-bad-import-order
 
@@ -38,6 +44,7 @@ TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+",
                           re.UNICODE)
 
 
+@deprecated(None, 'Please use tensorflow/transform or tf.data.')
 def tokenizer(iterator):
   """Tokenizer generator.
 
@@ -51,9 +58,16 @@ def tokenizer(iterator):
     yield TOKENIZER_RE.findall(value)
 
 
+@deprecated(None, 'Please use tensorflow/transform or tf.data.')
 class ByteProcessor(object):
-  """Maps documents into sequence of ids for bytes."""
+  """Maps documents into sequence of ids for bytes.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'Please use tensorflow/transform or tf.data.')
   def __init__(self, max_document_length):
     self.max_document_length = max_document_length
 
@@ -108,8 +122,14 @@ class ByteProcessor(object):
 
 
 class VocabularyProcessor(object):
-  """Maps documents to sequences of word ids."""
+  """Maps documents to sequences of word ids.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'Please use tensorflow/transform or tf.data.')
   def __init__(self,
                max_document_length,
                min_frequency=0,
diff --git a/tensorflow/contrib/learn/python/learn/session_run_hook.py b/tensorflow/contrib/learn/python/learn/session_run_hook.py
index a8ba2be97206f2b974d256ad2c62c21a4e3e55d8..87edc9b720bdb3edcd5f2dcd1662d14da53c51cf 100644
--- a/tensorflow/contrib/learn/python/learn/session_run_hook.py
+++ b/tensorflow/contrib/learn/python/learn/session_run_hook.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""This file is deprecated. Use tensorflow.python.training.session_run_hook."""
+"""This file is deprecated. Use `tensorflow.python.training.session_run_hook`.
+
+See [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/summary_writer_cache.py b/tensorflow/contrib/learn/python/learn/summary_writer_cache.py
index 919d415c302b8ec17202aad34ff0bee69bfee2c7..d663cf5fb79c428b0e70d66b0f1305f0559a05c9 100644
--- a/tensorflow/contrib/learn/python/learn/summary_writer_cache.py
+++ b/tensorflow/contrib/learn/python/learn/summary_writer_cache.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Wrapper for a Session-like object that handles threads and recovery.
+"""Wrapper for a Session-like object that handles threads and recovery (deprecated).
+
+These are deprecated aliases for classes and functions in `tf.train`. Please use
+those directly.
 
 Based on an original design of Illia Polosukhin.
 """
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index 429b6040be21d8cbe1f2bba58090366552fdfbe7..a1a3f20dcd8cb5ff7baa559ac41d5e5c40780511 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""`Trainable` interface."""
+"""`Trainable` interface (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,6 +28,8 @@ import abc
 
 class Trainable(object):
   """Interface for objects that are trainable by, e.g., `Experiment`.
+
+  THIS CLASS IS DEPRECATED.
   """
   __metaclass__ = abc.ABCMeta
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/__init__.py b/tensorflow/contrib/learn/python/learn/utils/__init__.py
index 48978d0ac34cec2b18e6794dcf3b260bc3b683c4..66d8dc6fd43b383919a16515bc96be492a253bf6 100644
--- a/tensorflow/contrib/learn/python/learn/utils/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/utils/__init__.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""TensorFlow Learn Utils."""
+"""TensorFlow Learn Utils (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index cb34cb1d26b6812c7f3f39e9f965615de5a8ef07..3eacac7a3d3dcff4d39025fdee88e16e385b1b84 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -13,14 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Export utilities."""
+"""Export utilities (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework import deprecated
-from tensorflow.python.training import training_util
 from tensorflow.contrib.session_bundle import exporter
 from tensorflow.contrib.session_bundle import gc
 from tensorflow.python.client import session as tf_session
@@ -32,6 +36,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as tf_saver
+from tensorflow.python.training import training_util
 
 
 @deprecated('2017-03-25', 'Please use Estimator.export_savedmodel() instead.')
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc.py b/tensorflow/contrib/learn/python/learn/utils/gc.py
index 226915987a4934626066b12810f579ae675107b2..916aecbea88b10bbef316ffb89d4c4d89667cb29 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc.py
@@ -13,7 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-r"""System for specifying garbage collection (GC) of path based data.
+r"""System for specifying garbage collection (GC) of path based data (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 This framework allows for GC of data specified by path names, for example files
 on disk.  gc.Path objects each represent a single item stored at a path and may
@@ -73,10 +77,12 @@ import os
 
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated
 
 Path = collections.namedtuple('Path', 'path export_version')
 
 
+@deprecated(None, 'Please implement your own file management or use Saver.')
 def largest_export_versions(n):
   """Creates a filter that keeps the largest n export versions.
 
@@ -97,6 +103,7 @@ def largest_export_versions(n):
   return keep
 
 
+@deprecated(None, 'Please implement your own file management or use Saver.')
 def one_of_every_n_export_versions(n):
   """Creates a filter that keeps one of every n export versions.
 
@@ -128,6 +135,7 @@ def one_of_every_n_export_versions(n):
   return keep
 
 
+@deprecated(None, 'Please implement your own file management or use Saver.')
 def mod_export_version(n):
   """Creates a filter that keeps every export that is a multiple of n.
 
@@ -146,6 +154,7 @@ def mod_export_version(n):
   return keep
 
 
+@deprecated(None, 'Please implement your own file management or use Saver.')
 def union(lf, rf):
   """Creates a filter that keeps the union of two filters.
 
@@ -163,6 +172,7 @@ def union(lf, rf):
   return keep
 
 
+@deprecated(None, 'Please implement your own file management or use Saver.')
 def negation(f):
   """Negate a filter.
 
@@ -179,6 +189,7 @@ def negation(f):
   return keep
 
 
+@deprecated(None, 'Please implement your own file name management.')
 def get_paths(base_dir, parser):
   """Gets a list of Paths in a given directory.
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py b/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py
index b2521933e524e7ec24d73d4b5171f33e507dd88c..b92eb9fea8b7ccea56c781df74dcfa1cc5508e48 100644
--- a/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for creating input_fns.
+"""Utilities for creating input_fns (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 Contents of this file are moved to tensorflow/python/estimator/export.py.
 InputFnOps is renamed to ServingInputReceiver.
@@ -32,13 +36,17 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.util.deprecation import deprecated
 
 
 class InputFnOps(collections.namedtuple('InputFnOps',
                                         ['features',
                                          'labels',
                                          'default_inputs'])):
-  """A return type for an input_fn.
+  """A return type for an input_fn (deprecated).
+
+  THIS CLASS IS DEPRECATED. Please use tf.estimator.export.ServingInputReceiver
+  instead.
 
   This return type is currently only supported for serving input_fn.
   Training and eval input_fn should return a `(features, labels)` tuple.
@@ -56,6 +64,8 @@ class InputFnOps(collections.namedtuple('InputFnOps',
   """
 
 
+@deprecated(None, 'Please use '
+            'tf.estimator.export.build_parsing_serving_input_receiver_fn.')
 def build_parsing_serving_input_fn(feature_spec, default_batch_size=None):
   """Build an input_fn appropriate for serving, expecting fed tf.Examples.
 
@@ -84,6 +94,8 @@ def build_parsing_serving_input_fn(feature_spec, default_batch_size=None):
   return input_fn
 
 
+@deprecated(None, 'Please use '
+            'tf.estimator.export.build_raw_serving_input_receiver_fn.')
 def build_default_serving_input_fn(features, default_batch_size=None):
   """Build an input_fn appropriate for serving, expecting feature Tensors.
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/inspect_checkpoint.py b/tensorflow/contrib/learn/python/learn/utils/inspect_checkpoint.py
index 6a63fb545a56e6040b0b0c3bbb6a17cd96925895..6dbaa15f8391b0044be8e30ca191753beb88db93 100644
--- a/tensorflow/contrib/learn/python/learn/utils/inspect_checkpoint.py
+++ b/tensorflow/contrib/learn/python/learn/utils/inspect_checkpoint.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A simple script for inspect checkpoint files."""
+"""A simple script for inspect checkpoint files (deprecated)."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 1593380007b2799fb1d17e92408ab19a7b47fe1e..c7cdb4131215c388412407a008113de13bdd0934 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities supporting export to SavedModel.
+"""Utilities supporting export to SavedModel (deprecated).
+
+This module and all its submodules are deprecated. See
+[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+for migration instructions.
 
 Some contents of this file are moved to tensorflow/python/estimator/export.py:
 
@@ -52,8 +56,9 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.training import saver
-
 from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated
+
 
 # A key for use in the input_alternatives dict indicating the default input.
 # This is the input that will be expected when a serving request does not
@@ -77,6 +82,7 @@ FEATURES_INPUT_ALTERNATIVE_KEY = 'features_input_alternative'
 _FALLBACK_DEFAULT_OUTPUT_ALTERNATIVE_KEY = 'default_output_alternative'
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def build_standardized_signature_def(input_tensors, output_tensors,
                                      problem_type):
   """Build a SignatureDef using problem type and input and output Tensors.
@@ -156,6 +162,7 @@ def _is_regression_problem(problem_type, input_tensors, output_tensors):
           len(input_tensors) == 1 and len(output_tensors) == 1)
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def get_input_alternatives(input_ops):
   """Obtain all input alternatives using the input_fn output and heuristics."""
   input_alternatives = {}
@@ -181,6 +188,7 @@ def get_input_alternatives(input_ops):
   return input_alternatives, features
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def get_output_alternatives(model_fn_ops, default_output_alternative_key=None):
   """Obtain all output alternatives using the model_fn output and heuristics.
 
@@ -246,6 +254,7 @@ def get_output_alternatives(model_fn_ops, default_output_alternative_key=None):
                        sorted(output_alternatives.keys())))
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def build_all_signature_defs(input_alternatives, output_alternatives,
                              actual_default_output_alternative_key):
   """Build `SignatureDef`s from all pairs of input and output alternatives."""
@@ -279,6 +288,7 @@ def build_all_signature_defs(input_alternatives, output_alternatives,
 MAX_DIRECTORY_CREATION_ATTEMPTS = 10
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def get_timestamped_export_dir(export_dir_base):
   """Builds a path to a new subdirectory within the base directory.
 
@@ -317,6 +327,7 @@ def get_timestamped_export_dir(export_dir_base):
                      '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def get_temp_export_dir(timestamped_export_dir):
   """Builds a directory name based on the argument but starting with 'temp-'.
 
@@ -344,6 +355,7 @@ def _export_version_parser(path):
   return path._replace(export_version=int(filename))
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def get_most_recent_export(export_dir_base):
   """Locate the most recent SavedModel export in a directory of many exports.
 
@@ -363,6 +375,7 @@ def get_most_recent_export(export_dir_base):
   return next(iter(results or []), None)
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def garbage_collect_exports(export_dir_base, exports_to_keep):
   """Deletes older exports, retaining only a given number of the most recent.
 
@@ -387,6 +400,7 @@ def garbage_collect_exports(export_dir_base, exports_to_keep):
       logging.warn('Can not delete %s recursively: %s', p.path, e)
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def make_export_strategy(serving_input_fn,
                          default_output_alternative_key=None,
                          assets_extra=None,
@@ -400,7 +414,7 @@ def make_export_strategy(serving_input_fn,
       `InputFnOps`.
     default_output_alternative_key: the name of the head to serve when an
       incoming serving request does not explicitly request a specific head.
-      Must be `None` if the estimator inherits from ${tf.estimator.Estimator}
+      Must be `None` if the estimator inherits from @{tf.estimator.Estimator}
       or for single-headed models.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel.  Each key should give the destination
@@ -438,7 +452,7 @@ def make_export_strategy(serving_input_fn,
       The string path to the exported directory.
 
     Raises:
-      ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
+      ValueError: If `estimator` is a @{tf.estimator.Estimator} instance
         and `default_output_alternative_key` was specified.
     """
     if isinstance(estimator, core_estimator.Estimator):
@@ -469,6 +483,8 @@ def make_export_strategy(serving_input_fn,
   return export_strategy.ExportStrategy('Servo', export_fn, strip_default_attrs)
 
 
+@deprecated(None,
+            'Use tf.estimator.export.build_parsing_serving_input_receiver_fn')
 def make_parsing_export_strategy(feature_columns,
                                  default_output_alternative_key=None,
                                  assets_extra=None,
@@ -487,7 +503,7 @@ def make_parsing_export_strategy(feature_columns,
       that must be provided at serving time (excluding labels!).
     default_output_alternative_key: the name of the head to serve when an
       incoming serving request does not explicitly request a specific head.
-      Must be `None` if the estimator inherits from ${tf.estimator.Estimator}
+      Must be `None` if the estimator inherits from @{tf.estimator.Estimator}
       or for single-headed models.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel.  Each key should give the destination
@@ -555,8 +571,14 @@ def _default_compare_fn(curr_best_eval_result, cand_eval_result):
 
 
 class BestModelSelector(object):
-  """A helper that keeps track of export selection candidates."""
+  """A helper that keeps track of export selection candidates.
+
+  THIS CLASS IS DEPRECATED. See
+  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
+  for general migration instructions.
+  """
 
+  @deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
   def __init__(self, event_file_pattern=None, compare_fn=None):
     """Constructor of this class.
 
@@ -622,6 +644,7 @@ class BestModelSelector(object):
     return best_eval_result
 
 
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def make_best_model_export_strategy(
     serving_input_fn,
     exports_to_keep=1,
@@ -707,6 +730,7 @@ def make_best_model_export_strategy(
 
 # TODO(b/67013778): Revisit this approach when corresponding changes to
 # TF Core are finalized.
+@deprecated(None, 'Switch to tf.estimator.Exporter and associated utilities.')
 def extend_export_strategy(base_export_strategy,
                            post_export_fn,
                            post_export_name=None):
@@ -741,7 +765,7 @@ def extend_export_strategy(base_export_strategy,
       The string path to the SavedModel indicated by post_export_fn.
 
     Raises:
-      ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
+      ValueError: If `estimator` is a @{tf.estimator.Estimator} instance
         and `default_output_alternative_key` was specified or if post_export_fn
         does not return a valid directory.
       RuntimeError: If unable to create temporary or final export directory.
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index 1fa55132b1fc0cd3367ca2eb331b6870edc30c3b..4ce91a140f816ddc8bdc60287e4cbc807172ec6d 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -58,17 +58,8 @@ cuda_py_tests(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = ["noasan"],  # times out b/63678675
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = [
+        "noasan",  # times out b/63678675
+        "optonly",  # times out (flaky)
+    ],
 )
diff --git a/tensorflow/contrib/libsvm/BUILD b/tensorflow/contrib/libsvm/BUILD
index df96402a4ffd51840f77d58d8066487030362340..4dccb9be7cd2e603edcf10c020cc0ee1675f518a 100644
--- a/tensorflow/contrib/libsvm/BUILD
+++ b/tensorflow/contrib/libsvm/BUILD
@@ -88,15 +88,3 @@ tf_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 208e7bc69be76680868c766bc99429eea5870c80..2e92ad6eb39d8aa8876a34572f50d5b6aff0511a 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -43,14 +43,46 @@ cuda_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+cuda_py_test(
+    name = "linear_operator_block_diag_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/linear_operator_block_diag_test.py"],
+    additional_deps = [
+        ":linalg_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",
+        "optonly",
+    ],
+)
+
+cuda_py_test(
+    name = "linear_operator_kronecker_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/linear_operator_kronecker_test.py"],
+    additional_deps = [
+        ":linalg_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 8,
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py
index 4720692c3384ba1bede1f486c1b1e0e69d10a63a..554854da84715ee8c8d00ec7f8e3156642b43d80 100644
--- a/tensorflow/contrib/linalg/__init__.py
+++ b/tensorflow/contrib/linalg/__init__.py
@@ -17,10 +17,15 @@
 See the @{$python/contrib.linalg} guide.
 
 @@LinearOperator
+@@LinearOperatorBlockDiag
+@@LinearOperatorCirculant
+@@LinearOperatorCirculant2D
+@@LinearOperatorCirculant3D
 @@LinearOperatorDiag
 @@LinearOperatorIdentity
 @@LinearOperatorScaledIdentity
 @@LinearOperatorFullMatrix
+@@LinearOperatorKronecker
 @@LinearOperatorLowerTriangular
 @@LinearOperatorLowRankUpdate
 @@LinearOperatorComposition
@@ -34,7 +39,10 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member
 
 from tensorflow.contrib.linalg.python.ops.linear_operator_addition import *
+from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import *
+from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import *
 from tensorflow.python.ops.linalg.linear_operator import *
+from tensorflow.python.ops.linalg.linear_operator_circulant import *
 from tensorflow.python.ops.linalg.linear_operator_composition import *
 from tensorflow.python.ops.linalg.linear_operator_diag import *
 from tensorflow.python.ops.linalg.linear_operator_full_matrix import *
@@ -45,4 +53,5 @@ from tensorflow.python.ops.linalg.linear_operator_lower_triangular import *
 # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member
 
 from tensorflow.python.util.all_util import remove_undocumented
+
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7407ede11409a47f4d9db96ad5b5d801ef1625d
--- /dev/null
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
@@ -0,0 +1,190 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.linalg.python.ops import linear_operator_block_diag as block_diag
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+random_seed.set_random_seed(23)
+rng = np.random.RandomState(0)
+
+
+def _block_diag_dense(expected_shape, blocks):
+  """Convert a list of blocks, into a dense block diagonal matrix."""
+  rows = []
+  num_cols = 0
+  for block in blocks:
+    # Get the batch shape for the block.
+    batch_row_shape = array_ops.shape(block)[:-1]
+
+    zeros_to_pad_before_shape = array_ops.concat(
+        [batch_row_shape, [num_cols]], axis=-1)
+    zeros_to_pad_before = array_ops.zeros(
+        shape=zeros_to_pad_before_shape, dtype=block.dtype)
+    num_cols += array_ops.shape(block)[-1]
+    zeros_to_pad_after_shape = array_ops.concat(
+        [batch_row_shape, [expected_shape[-2] - num_cols]], axis=-1)
+    zeros_to_pad_after = array_ops.zeros(
+        zeros_to_pad_after_shape, dtype=block.dtype)
+
+    rows.append(array_ops.concat(
+        [zeros_to_pad_before, block, zeros_to_pad_after], axis=-1))
+
+  return array_ops.concat(rows, axis=-2)
+
+
+class SquareLinearOperatorBlockDiagTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    # Increase from 1e-6 to 1e-4
+    self._atol[dtypes.float32] = 1e-4
+    self._atol[dtypes.complex64] = 1e-4
+    self._rtol[dtypes.float32] = 1e-4
+    self._rtol[dtypes.complex64] = 1e-4
+
+  @property
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
+    return [
+        build_info((0, 0)),
+        build_info((1, 1)),
+        build_info((1, 3, 3)),
+        build_info((5, 5), blocks=[(2, 2), (3, 3)]),
+        build_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
+        build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
+    ]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
+    expected_blocks = (
+        build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
+        else [shape])
+    matrices = [
+        linear_operator_test_util.random_positive_definite_matrix(
+            block_shape, dtype, force_well_conditioned=True)
+        for block_shape in expected_blocks
+    ]
+
+    if use_placeholder:
+      matrices_ph = [
+          array_ops.placeholder(dtype=dtype) for _ in expected_blocks
+      ]
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # values are random and we want the same value used for both mat and
+      # feed_dict.
+      matrices = self.evaluate(matrices)
+      operator = block_diag.LinearOperatorBlockDiag(
+          [linalg.LinearOperatorFullMatrix(
+              m_ph, is_square=True) for m_ph in matrices_ph],
+          is_square=True)
+      feed_dict = {m_ph: m for (m_ph, m) in zip(matrices_ph, matrices)}
+    else:
+      operator = block_diag.LinearOperatorBlockDiag(
+          [linalg.LinearOperatorFullMatrix(
+              m, is_square=True) for m in matrices])
+      feed_dict = None
+      # Should be auto-set.
+      self.assertTrue(operator.is_square)
+
+    # Broadcast the shapes.
+    expected_shape = list(build_info.shape)
+
+    matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
+
+    block_diag_dense = _block_diag_dense(expected_shape, matrices)
+
+    if not use_placeholder:
+      block_diag_dense.set_shape(
+          expected_shape[:-2] + [expected_shape[-1], expected_shape[-1]])
+
+    return operator, block_diag_dense, feed_dict
+
+  def test_is_x_flags(self):
+    # Matrix with two positive eigenvalues, 1, and 1.
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [linalg.LinearOperatorFullMatrix(matrix)],
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator.is_positive_definite)
+    self.assertTrue(operator.is_non_singular)
+    self.assertFalse(operator.is_self_adjoint)
+
+  def test_is_non_singular_auto_set(self):
+    # Matrix with two positive eigenvalues, 11 and 8.
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[11., 0.], [1., 8.]]
+    operator_1 = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=True)
+    operator_2 = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=True)
+
+    operator = block_diag.LinearOperatorBlockDiag(
+        [operator_1, operator_2],
+        is_positive_definite=False,  # No reason it HAS to be False...
+        is_non_singular=None)
+    self.assertFalse(operator.is_positive_definite)
+    self.assertTrue(operator.is_non_singular)
+
+    with self.assertRaisesRegexp(ValueError, "always non-singular"):
+      block_diag.LinearOperatorBlockDiag(
+          [operator_1, operator_2], is_non_singular=False)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator_1 = linalg.LinearOperatorFullMatrix(matrix, name="left")
+    operator_2 = linalg.LinearOperatorFullMatrix(matrix, name="right")
+
+    operator = block_diag.LinearOperatorBlockDiag([operator_1, operator_2])
+
+    self.assertEqual("left_ds_right", operator.name)
+
+  def test_different_dtypes_raises(self):
+    operators = [
+        linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 3)),
+        linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 3).astype(np.float32))
+    ]
+    with self.assertRaisesRegexp(TypeError, "same dtype"):
+      block_diag.LinearOperatorBlockDiag(operators)
+
+  def test_non_square_operator_raises(self):
+    operators = [
+        linalg.LinearOperatorFullMatrix(rng.rand(3, 4), is_square=False),
+        linalg.LinearOperatorFullMatrix(rng.rand(3, 3))
+    ]
+    with self.assertRaisesRegexp(ValueError, "square matrices"):
+      block_diag.LinearOperatorBlockDiag(operators)
+
+  def test_empty_operators_raises(self):
+    with self.assertRaisesRegexp(ValueError, "non-empty"):
+      block_diag.LinearOperatorBlockDiag([])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6574da22a188c7aa25ad8426522d0b446af8f5f3
--- /dev/null
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py
@@ -0,0 +1,194 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.linalg.python.ops import linear_operator_kronecker as kronecker
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+random_seed.set_random_seed(23)
+rng = np.random.RandomState(0)
+
+
+def _kronecker_dense(factors):
+  """Convert a list of factors, into a dense Kronecker product."""
+  product = factors[0]
+  for factor in factors[1:]:
+    product = product[..., array_ops.newaxis, :, array_ops.newaxis]
+    factor_to_mul = factor[..., array_ops.newaxis, :, array_ops.newaxis, :]
+    product *= factor_to_mul
+    product = array_ops.reshape(
+        product,
+        shape=array_ops.concat(
+            [array_ops.shape(product)[:-4],
+             [array_ops.shape(product)[-4] * array_ops.shape(product)[-3],
+              array_ops.shape(product)[-2] * array_ops.shape(product)[-1]]
+            ], axis=0))
+
+  return product
+
+
+class KroneckerDenseTest(test.TestCase):
+
+  def testKroneckerDenseMatrix(self):
+    x = ops.convert_to_tensor([[2., 3.], [1., 2.]], dtype=dtypes.float32)
+    y = ops.convert_to_tensor([[1., 2.], [5., -1.]], dtype=dtypes.float32)
+    # From explicitly writing out the kronecker product of x and y.
+    z = ops.convert_to_tensor([
+        [2., 4., 3., 6.],
+        [10., -2., 15., -3.],
+        [1., 2., 2., 4.],
+        [5., -1., 10., -2.]], dtype=dtypes.float32)
+    # From explicitly writing out the kronecker product of y and x.
+    w = ops.convert_to_tensor([
+        [2., 3., 4., 6.],
+        [1., 2., 2., 4.],
+        [10., 15., -2., -3.],
+        [5., 10., -1., -2.]], dtype=dtypes.float32)
+
+    with self.test_session():
+      self.assertAllClose(_kronecker_dense([x, y]).eval(), z.eval())
+      self.assertAllClose(_kronecker_dense([y, x]).eval(), w.eval())
+
+
+class SquareLinearOperatorKroneckerTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    # Increase from 1e-6 to 1e-4
+    self._atol[dtypes.float32] = 1e-4
+    self._atol[dtypes.complex64] = 1e-4
+    self._rtol[dtypes.float32] = 1e-4
+    self._rtol[dtypes.complex64] = 1e-4
+
+  @property
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
+    return [
+        build_info((1, 1), factors=[(1, 1), (1, 1)]),
+        build_info((8, 8), factors=[(2, 2), (2, 2), (2, 2)]),
+        build_info((12, 12), factors=[(2, 2), (3, 3), (2, 2)]),
+        build_info((1, 3, 3), factors=[(1, 1), (1, 3, 3)]),
+        build_info((3, 6, 6), factors=[(3, 1, 1), (1, 2, 2), (1, 3, 3)]),
+    ]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
+    expected_factors = build_info.__dict__["factors"]
+    matrices = [
+        linear_operator_test_util.random_positive_definite_matrix(
+            block_shape, dtype, force_well_conditioned=True)
+        for block_shape in expected_factors
+    ]
+
+    if use_placeholder:
+      matrices_ph = [
+          array_ops.placeholder(dtype=dtype) for _ in expected_factors
+      ]
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # values are random and we want the same value used for both mat and
+      # feed_dict.
+      matrices = self.evaluate(matrices)
+      operator = kronecker.LinearOperatorKronecker(
+          [linalg.LinearOperatorFullMatrix(
+              m_ph, is_square=True) for m_ph in matrices_ph],
+          is_square=True)
+      feed_dict = {m_ph: m for (m_ph, m) in zip(matrices_ph, matrices)}
+    else:
+      operator = kronecker.LinearOperatorKronecker(
+          [linalg.LinearOperatorFullMatrix(
+              m, is_square=True) for m in matrices])
+      feed_dict = None
+      # Should be auto-set.
+      self.assertTrue(operator.is_square)
+
+    matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
+
+    kronecker_dense = _kronecker_dense(matrices)
+
+    if not use_placeholder:
+      kronecker_dense.set_shape(shape)
+
+    return operator, kronecker_dense, feed_dict
+
+  def test_is_x_flags(self):
+    # Matrix with two positive eigenvalues, 1, and 1.
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [linalg.LinearOperatorFullMatrix(matrix),
+         linalg.LinearOperatorFullMatrix(matrix)],
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator.is_positive_definite)
+    self.assertTrue(operator.is_non_singular)
+    self.assertFalse(operator.is_self_adjoint)
+
+  def test_is_non_singular_auto_set(self):
+    # Matrix with two positive eigenvalues, 11 and 8.
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[11., 0.], [1., 8.]]
+    operator_1 = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=True)
+    operator_2 = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=True)
+
+    operator = kronecker.LinearOperatorKronecker(
+        [operator_1, operator_2],
+        is_positive_definite=False,  # No reason it HAS to be False...
+        is_non_singular=None)
+    self.assertFalse(operator.is_positive_definite)
+    self.assertTrue(operator.is_non_singular)
+
+    with self.assertRaisesRegexp(ValueError, "always non-singular"):
+      kronecker.LinearOperatorKronecker(
+          [operator_1, operator_2], is_non_singular=False)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator_1 = linalg.LinearOperatorFullMatrix(matrix, name="left")
+    operator_2 = linalg.LinearOperatorFullMatrix(matrix, name="right")
+
+    operator = kronecker.LinearOperatorKronecker([operator_1, operator_2])
+
+    self.assertEqual("left_x_right", operator.name)
+
+  def test_different_dtypes_raises(self):
+    operators = [
+        linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 3)),
+        linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 3).astype(np.float32))
+    ]
+    with self.assertRaisesRegexp(TypeError, "same dtype"):
+      kronecker.LinearOperatorKronecker(operators)
+
+  def test_empty_or_one_operators_raises(self):
+    with self.assertRaisesRegexp(ValueError, ">=1 operators"):
+      kronecker.LinearOperatorKronecker([])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d3af66c92b59dd030d4b2a829ab733eec6cf0c1
--- /dev/null
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
@@ -0,0 +1,370 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Create a Block Diagonal operator from one or more `LinearOperators`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
+
+
+class LinearOperatorBlockDiag(linear_operator.LinearOperator):
+  """Combines one or more `LinearOperators` in to a Block Diagonal matrix.
+
+  This operator combines one or more linear operators `[op1,...,opJ]`,
+  building a new `LinearOperator`, whose underlying matrix representation is
+  square and has each operator `opi` on the main diagonal, and zero's elsewhere.
+
+  #### Shape compatibility
+
+  If `opj` acts like a [batch] square matrix `Aj`, then `op_combined` acts like
+  the [batch] square matrix formed by having each matrix `Aj` on the main
+  diagonal.
+
+
+  Each `opj` is required to represent a square matrix, and hence will have
+  shape `batch_shape_j + [M_j, M_j]`.
+
+  If `opj` has shape `batch_shape_j + [M_j, M_j]`, then the combined operator
+  has shape `broadcast_batch_shape + [sum M_j, sum M_j]`, where
+  `broadcast_batch_shape` is the mutual broadcast of `batch_shape_j`,
+  `j = 1,...,J`, assuming the intermediate batch shapes broadcast.
+  Even if the combined shape is well defined, the combined operator's
+  methods may fail due to lack of broadcasting ability in the defining
+  operators' methods.
+
+  ```python
+  # Create a 4 x 4 linear operator combined of two 2 x 2 operators.
+  operator_1 = LinearOperatorFullMatrix([[1., 2.], [3., 4.]])
+  operator_2 = LinearOperatorFullMatrix([[1., 0.], [0., 1.]])
+  operator = LinearOperatorBlockDiag([operator_1, operator_2])
+
+  operator.to_dense()
+  ==> [[1., 2., 0., 0.],
+       [3., 4., 0., 0.],
+       [0., 0., 1., 0.],
+       [0., 0., 0., 1.]]
+
+  operator.shape
+  ==> [4, 4]
+
+  operator.log_abs_determinant()
+  ==> scalar Tensor
+
+  x1 = ... # Shape [2, 2] Tensor
+  x2 = ... # Shape [2, 2] Tensor
+  x = tf.concat([x1, x2], 0)  # Shape [2, 4] Tensor
+  operator.matmul(x)
+  ==> tf.concat([operator_1.matmul(x1), operator_2.matmul(x2)])
+
+  # Create a [2, 3] batch of 4 x 4 linear operators.
+  matrix_44 = tf.random_normal(shape=[2, 3, 4, 4])
+  operator_44 = LinearOperatorFullMatrix(matrix)
+
+  # Create a [1, 3] batch of 5 x 5 linear operators.
+  matrix_55 = tf.random_normal(shape=[1, 3, 5, 5])
+  operator_55 = LinearOperatorFullMatrix(matrix_55)
+
+  # Combine to create a [2, 3] batch of 9 x 9 operators.
+  operator_99 = LinearOperatorBlockDiag([operator_44, operator_55])
+
+  # Create a shape [2, 3, 9] vector.
+  x = tf.random_normal(shape=[2, 3, 9])
+  operator_99.matmul(x)
+  ==> Shape [2, 3, 9] Tensor
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorBlockDiag` on any operation is equal to
+  the sum of the individual operators' operations.
+
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operators,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name=None):
+    r"""Initialize a `LinearOperatorBlockDiag`.
+
+    `LinearOperatorBlockDiag` is initialized with a list of operators
+    `[op_1,...,op_J]`.
+
+    Args:
+      operators:  Iterable of `LinearOperator` objects, each with
+        the same `dtype` and composable shape.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+        This is true by default, and will raise a `ValueError` otherwise.
+      name: A name for this `LinearOperator`.  Default is the individual
+        operators names joined with `_o_`.
+
+    Raises:
+      TypeError:  If all operators do not have the same `dtype`.
+      ValueError:  If `operators` is empty or are non-square.
+    """
+    # Validate operators.
+    check_ops.assert_proper_iterable(operators)
+    operators = list(operators)
+    if not operators:
+      raise ValueError(
+          "Expected a non-empty list of operators. Found: %s" % operators)
+    self._operators = operators
+
+    # Validate dtype.
+    dtype = operators[0].dtype
+    for operator in operators:
+      if operator.dtype != dtype:
+        name_type = (str((o.name, o.dtype)) for o in operators)
+        raise TypeError(
+            "Expected all operators to have the same dtype.  Found %s"
+            % "   ".join(name_type))
+
+    # Auto-set and check hints.
+    if all(operator.is_non_singular for operator in operators):
+      if is_non_singular is False:
+        raise ValueError(
+            "The direct sum of non-singular operators is always non-singular.")
+      is_non_singular = True
+
+    if all(operator.is_self_adjoint for operator in operators):
+      if is_self_adjoint is False:
+        raise ValueError(
+            "The direct sum of self-adjoint operators is always self-adjoint.")
+      is_self_adjoint = True
+
+    if all(operator.is_positive_definite for operator in operators):
+      if is_positive_definite is False:
+        raise ValueError(
+            "The direct sum of positive definite operators is always "
+            "positive definite.")
+      is_positive_definite = True
+
+    if not (is_square and all(operator.is_square for operator in operators)):
+      raise ValueError(
+          "Can only represent a block diagonal of square matrices.")
+
+    # Initialization.
+    graph_parents = []
+    for operator in operators:
+      graph_parents.extend(operator.graph_parents)
+
+    if name is None:
+      # Using ds to mean direct sum.
+      name = "_ds_".join(operator.name for operator in operators)
+    with ops.name_scope(name, values=graph_parents):
+      super(LinearOperatorBlockDiag, self).__init__(
+          dtype=dtype,
+          graph_parents=graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=True,
+          name=name)
+
+  @property
+  def operators(self):
+    return self._operators
+
+  def _shape(self):
+    # Get final matrix shape.
+    domain_dimension = self.operators[0].domain_dimension
+    range_dimension = self.operators[0].range_dimension
+    for operator in self.operators[1:]:
+      domain_dimension += operator.domain_dimension
+      range_dimension += operator.range_dimension
+
+    matrix_shape = tensor_shape.TensorShape([domain_dimension, range_dimension])
+
+    # Get broadcast batch shape.
+    # broadcast_shape checks for compatibility.
+    batch_shape = self.operators[0].batch_shape
+    for operator in self.operators[1:]:
+      batch_shape = common_shapes.broadcast_shape(
+          batch_shape, operator.batch_shape)
+
+    return batch_shape.concatenate(matrix_shape)
+
+  def _shape_tensor(self):
+    # Avoid messy broadcasting if possible.
+    if self.shape.is_fully_defined():
+      return ops.convert_to_tensor(
+          self.shape.as_list(), dtype=dtypes.int32, name="shape")
+
+    domain_dimension = self.operators[0].domain_dimension_tensor()
+    range_dimension = self.operators[0].range_dimension_tensor()
+    for operator in self.operators[1:]:
+      domain_dimension += operator.domain_dimension_tensor()
+      range_dimension += operator.range_dimension_tensor()
+
+    matrix_shape = array_ops.stack([domain_dimension, range_dimension])
+
+    # Dummy Tensor of zeros.  Will never be materialized.
+    zeros = array_ops.zeros(shape=self.operators[0].batch_shape_tensor())
+    for operator in self.operators[1:]:
+      zeros += array_ops.zeros(shape=operator.batch_shape_tensor())
+    batch_shape = array_ops.shape(zeros)
+
+    return array_ops.concat((batch_shape, matrix_shape), 0)
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    split_dim = -1 if adjoint_arg else -2
+    # Split input by rows normally, and otherwise columns.
+    split_x = self._split_input_into_blocks(x, axis=split_dim)
+
+    result_list = []
+    for index, operator in enumerate(self.operators):
+      result_list += [operator.matmul(
+          split_x[index], adjoint=adjoint, adjoint_arg=adjoint_arg)]
+    result_list = linear_operator_util.broadcast_matrix_batch_dims(
+        result_list)
+    return array_ops.concat(result_list, axis=-2)
+
+  def _determinant(self):
+    result = self.operators[0].determinant()
+    for operator in self.operators[1:]:
+      result *= operator.determinant()
+    return result
+
+  def _log_abs_determinant(self):
+    result = self.operators[0].log_abs_determinant()
+    for operator in self.operators[1:]:
+      result += operator.log_abs_determinant()
+    return result
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    split_dim = -1 if adjoint_arg else -2
+    # Split input by rows normally, and otherwise columns.
+    split_rhs = self._split_input_into_blocks(rhs, axis=split_dim)
+
+    solution_list = []
+    for index, operator in enumerate(self.operators):
+      solution_list += [operator.solve(
+          split_rhs[index], adjoint=adjoint, adjoint_arg=adjoint_arg)]
+
+    solution_list = linear_operator_util.broadcast_matrix_batch_dims(
+        solution_list)
+    return array_ops.concat(solution_list, axis=-2)
+
+  def _diag_part(self):
+    diag_list = []
+    for operator in self.operators:
+      # Extend the axis for broadcasting.
+      diag_list += [operator.diag_part()[..., array_ops.newaxis]]
+    diag_list = linear_operator_util.broadcast_matrix_batch_dims(diag_list)
+    diagonal = array_ops.concat(diag_list, axis=-2)
+    return array_ops.squeeze(diagonal, axis=-1)
+
+  def _trace(self):
+    result = self.operators[0].trace()
+    for operator in self.operators[1:]:
+      result += operator.trace()
+    return result
+
+  def _to_dense(self):
+    num_cols = 0
+    rows = []
+    broadcasted_blocks = [operator.to_dense() for operator in self.operators]
+    broadcasted_blocks = linear_operator_util.broadcast_matrix_batch_dims(
+        broadcasted_blocks)
+    for block in broadcasted_blocks:
+      batch_row_shape = array_ops.shape(block)[:-1]
+
+      zeros_to_pad_before_shape = array_ops.concat(
+          [batch_row_shape, [num_cols]], axis=-1)
+      zeros_to_pad_before = array_ops.zeros(
+          shape=zeros_to_pad_before_shape, dtype=block.dtype)
+      num_cols += array_ops.shape(block)[-1]
+      zeros_to_pad_after_shape = array_ops.concat(
+          [batch_row_shape,
+           [self.domain_dimension_tensor() - num_cols]], axis=-1)
+      zeros_to_pad_after = array_ops.zeros(
+          shape=zeros_to_pad_after_shape, dtype=block.dtype)
+
+      rows.append(array_ops.concat(
+          [zeros_to_pad_before, block, zeros_to_pad_after], axis=-1))
+
+    mat = array_ops.concat(rows, axis=-2)
+    mat.set_shape(self.shape)
+    return mat
+
+  def _assert_non_singular(self):
+    return control_flow_ops.group([
+        operator.assert_non_singular() for operator in self.operators])
+
+  def _assert_self_adjoint(self):
+    return control_flow_ops.group([
+        operator.assert_self_adjoint() for operator in self.operators])
+
+  def _assert_positive_definite(self):
+    return control_flow_ops.group([
+        operator.assert_positive_definite() for operator in self.operators])
+
+  def _split_input_into_blocks(self, x, axis=-1):
+    """Split `x` into blocks matching `operators`'s `domain_dimension`.
+
+    Specifically, if we have a block diagonal matrix, with block sizes
+    `[M_j, M_j] j = 1..J`,  this method splits `x` on `axis` into `J`
+    tensors, whose shape at `axis` is `M_j`.
+
+    Args:
+      x: `Tensor`. `x` is split into `J` tensors.
+      axis: Python `Integer` representing the axis to split `x` on.
+
+    Returns:
+      A list of `Tensor`s.
+    """
+    block_sizes = []
+    if self.shape.is_fully_defined():
+      for operator in self.operators:
+        block_sizes += [operator.domain_dimension.value]
+    else:
+      for operator in self.operators:
+        block_sizes += [operator.domain_dimension_tensor()]
+
+    return array_ops.split(x, block_sizes, axis=axis)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py b/tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py
new file mode 100644
index 0000000000000000000000000000000000000000..79080d194f59b7ebce045ab3e3d262ca948d9391
--- /dev/null
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py
@@ -0,0 +1,560 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Construct the Kronecker product of one or more `LinearOperators`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+
+
+def _vec(x):
+  """Stacks column of matrix to form a single column."""
+  return array_ops.reshape(
+      array_ops.matrix_transpose(x),
+      array_ops.concat(
+          [array_ops.shape(x)[:-2], [-1]], axis=0))
+
+
+def _unvec_by(y, num_col):
+  """Unstack vector to form a matrix, with a specified amount of columns."""
+  return array_ops.matrix_transpose(
+      array_ops.reshape(
+          y,
+          array_ops.concat(
+              [array_ops.shape(y)[:-1], [num_col, -1]], axis=0)))
+
+
+def _rotate_last_dim(x, rotate_right=False):
+  """Rotate the last dimension either left or right."""
+  ndims = array_ops.rank(x)
+  if rotate_right:
+    transpose_perm = array_ops.concat(
+        [[ndims - 1], math_ops.range(0, ndims - 1)], axis=0)
+  else:
+    transpose_perm = array_ops.concat(
+        [math_ops.range(1, ndims), [0]], axis=0)
+  return array_ops.transpose(x, transpose_perm)
+
+
+class LinearOperatorKronecker(linear_operator.LinearOperator):
+  """Kronecker product between two `LinearOperators`.
+
+  This operator composes one or more linear operators `[op1,...,opJ]`,
+  building a new `LinearOperator` representing the Kronecker product:
+  `op1 x op2 x .. opJ` (we omit parentheses as the Kronecker product is
+  associative).
+
+  If `opj` has shape `batch_shape_j` + [M_j, N_j`, then the composed operator
+  will have shape equal to `broadcast_batch_shape + [prod M_j, prod N_j]`,
+  where the product is over all operators.
+
+  ```python
+  # Create a 4 x 4 linear operator composed of two 2 x 2 operators.
+  operator_1 = LinearOperatorFullMatrix([[1., 2.], [3., 4.]])
+  operator_2 = LinearOperatorFullMatrix([[1., 0.], [2., 1.]])
+  operator = LinearOperatorKronecker([operator_1, operator_2])
+
+  operator.to_dense()
+  ==> [[1., 2., 0., 0.],
+       [3., 4., 0., 0.],
+       [2., 4., 1., 2.],
+       [6., 8., 3., 4.]]
+
+  operator.shape
+  ==> [4, 4]
+
+  operator.log_abs_determinant()
+  ==> scalar Tensor
+
+  x = ... Shape [4, 2] Tensor
+  operator.matmul(x)
+  ==> Shape [4, 2] Tensor
+
+  # Create a [2, 3] batch of 4 x 5 linear operators.
+  matrix_45 = tf.random_normal(shape=[2, 3, 4, 5])
+  operator_45 = LinearOperatorFullMatrix(matrix)
+
+  # Create a [2, 3] batch of 5 x 6 linear operators.
+  matrix_56 = tf.random_normal(shape=[2, 3, 5, 6])
+  operator_56 = LinearOperatorFullMatrix(matrix_56)
+
+  # Compose to create a [2, 3] batch of 20 x 30 operators.
+  operator_large = LinearOperatorKronecker([operator_45, operator_56])
+
+  # Create a shape [2, 3, 20, 2] vector.
+  x = tf.random_normal(shape=[2, 3, 6, 2])
+  operator_large.matmul(x)
+  ==> Shape [2, 3, 30, 2] Tensor
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorKronecker` on any operation is equal to
+  the sum of the individual operators' operations.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operators,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorKronecker`.
+
+    `LinearOperatorKronecker` is initialized with a list of operators
+    `[op_1,...,op_J]`.
+
+    Args:
+      operators:  Iterable of `LinearOperator` objects, each with
+        the same `dtype` and composable shape, representing the Kronecker
+        factors.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`.  Default is the individual
+        operators names joined with `_x_`.
+
+    Raises:
+      TypeError:  If all operators do not have the same `dtype`.
+      ValueError:  If `operators` is empty.
+    """
+    # Validate operators.
+    check_ops.assert_proper_iterable(operators)
+    operators = list(operators)
+    if not operators:
+      raise ValueError(
+          "Expected a list of >=1 operators. Found: %s" % operators)
+    self._operators = operators
+
+    # Validate dtype.
+    dtype = operators[0].dtype
+    for operator in operators:
+      if operator.dtype != dtype:
+        name_type = (str((o.name, o.dtype)) for o in operators)
+        raise TypeError(
+            "Expected all operators to have the same dtype.  Found %s"
+            % "   ".join(name_type))
+
+    # Auto-set and check hints.
+    # A Kronecker product is invertible, if and only if all factors are
+    # invertible.
+    if all(operator.is_non_singular for operator in operators):
+      if is_non_singular is False:
+        raise ValueError(
+            "The Kronecker product of non-singular operators is always "
+            "non-singular.")
+      is_non_singular = True
+
+    if all(operator.is_self_adjoint for operator in operators):
+      if is_self_adjoint is False:
+        raise ValueError(
+            "The Kronecker product of self-adjoint operators is always "
+            "self-adjoint.")
+      is_self_adjoint = True
+
+    # The eigenvalues of a Kronecker product are equal to the products of eigen
+    # values of the corresponding factors.
+    if all(operator.is_positive_definite for operator in operators):
+      if is_positive_definite is False:
+        raise ValueError("The Kronecker product of positive-definite operators "
+                         "is always positive-definite.")
+      is_positive_definite = True
+
+    # Initialization.
+    graph_parents = []
+    for operator in operators:
+      graph_parents.extend(operator.graph_parents)
+
+    if name is None:
+      name = operators[0].name
+      for operator in operators[1:]:
+        name += "_x_" + operator.name
+    with ops.name_scope(name, values=graph_parents):
+      super(LinearOperatorKronecker, self).__init__(
+          dtype=dtype,
+          graph_parents=graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operators(self):
+    return self._operators
+
+  def _shape(self):
+    # Get final matrix shape.
+    domain_dimension = self.operators[0].domain_dimension
+    for operator in self.operators[1:]:
+      domain_dimension *= operator.domain_dimension
+
+    range_dimension = self.operators[0].range_dimension
+    for operator in self.operators[1:]:
+      range_dimension *= operator.range_dimension
+
+    matrix_shape = tensor_shape.TensorShape([
+        range_dimension, domain_dimension])
+
+    # Get broadcast batch shape.
+    # broadcast_shape checks for compatibility.
+    batch_shape = self.operators[0].batch_shape
+    for operator in self.operators[1:]:
+      batch_shape = common_shapes.broadcast_shape(
+          batch_shape, operator.batch_shape)
+
+    return batch_shape.concatenate(matrix_shape)
+
+  def _shape_tensor(self):
+    domain_dimension = self.operators[0].domain_dimension_tensor()
+    for operator in self.operators[1:]:
+      domain_dimension *= operator.domain_dimension_tensor()
+
+    range_dimension = self.operators[0].range_dimension_tensor()
+    for operator in self.operators[1:]:
+      range_dimension *= operator.range_dimension_tensor()
+
+    matrix_shape = [range_dimension, domain_dimension]
+
+    # Get broadcast batch shape.
+    # broadcast_shape checks for compatibility.
+    batch_shape = self.operators[0].batch_shape_tensor()
+    for operator in self.operators[1:]:
+      batch_shape = array_ops.broadcast_dynamic_shape(
+          batch_shape, operator.batch_shape_tensor())
+
+    return array_ops.concat((batch_shape, matrix_shape), 0)
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    # Here we heavily rely on Roth's column Lemma [1]:
+    # (A x B) * vec X = vec BXA^T,
+    # where vec stacks all the columns of the matrix under each other. In our
+    # case, x represents a batch of vec X (i.e. we think of x as a batch of
+    # column vectors, rather than a matrix). Each member of the batch can be
+    # reshaped to a matrix (hence we get a batch of matrices).
+    # We can iteratively apply this lemma by noting that if B is a Kronecker
+    # product, then we can apply the lemma again.
+
+    # [1] W. E. Roth, "On direct product matrices,"
+    # Bulletin of the American Mathematical Society, vol. 40, pp. 461-468,
+    # 1934
+
+    # Efficiency
+
+    # Naively doing the Kronecker product, by calculating the dense matrix and
+    # applying it will can take cubic time in  the size of domain_dimension
+    # (assuming a square matrix). The other issue is that calculating the dense
+    # matrix can be prohibitively expensive, in that it can take a large amount
+    # of memory.
+    #
+    # This implementation avoids this memory blow up by only computing matmuls
+    # with the factors. In this way, we don't have to realize the dense matrix.
+    # In terms of complexity, if we have Kronecker Factors of size:
+    # (n1, n1), (n2, n2), (n3, n3), ... (nJ, nJ), with N = \prod n_i, and we
+    # have as input a [N, M] matrix, the naive approach would take O(N^2 M).
+    # With this approach (ignoring reshaping of tensors and transposes for now),
+    # the time complexity can be O(M * (\sum n_i) * N). There is also the
+    # benefit of batched multiplication (In this example, the batch size is
+    # roughly M * N) so this can be much faster. However, not factored in are
+    # the costs of the several transposing of tensors, which can affect cache
+    # behavior.
+
+    # Below we document the shape manipulation for adjoint=False,
+    # adjoint_arg=False, but the general case of different adjoints is still
+    # handled.
+
+    if adjoint_arg:
+      x = linalg.adjoint(x)
+
+    # Always add a batch dimension to enable broadcasting to work.
+    batch_shape = array_ops.concat(
+        [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
+    x += array_ops.zeros(batch_shape, dtype=x.dtype.base_dtype)
+
+    # x has shape [B, R, C], where B represent some number of batch dimensions,
+    # R represents the number of rows, and C represents the number of columns.
+    # In order to apply Roth's column lemma, we need to operate on a batch of
+    # column vectors, so we reshape into a batch of column vectors. We put it
+    # at the front to ensure that broadcasting between operators to the batch
+    # dimensions B still works.
+    output = _rotate_last_dim(x, rotate_right=True)
+
+    # Also expand the shape to be [A, C, B, R]. The first dimension will be
+    # used to accumulate dimensions from each operator matmul.
+    output = output[array_ops.newaxis, ...]
+
+    # In this loop, A is going to refer to the value of the accumulated
+    # dimension. A = 1 at the start, and will end up being self.range_dimension.
+    # V will refer to the last dimension. V = R at the start, and will end up
+    # being 1 in the end.
+    for operator in self.operators[:-1]:
+      # Reshape output from [A, C, B, V] to be
+      # [A, C, B, V / op.domain_dimension, op.domain_dimension]
+      if adjoint:
+        operator_dimension = operator.range_dimension_tensor()
+      else:
+        operator_dimension = operator.domain_dimension_tensor()
+
+      output = _unvec_by(output, operator_dimension)
+
+      # We are computing (XA^T) = (AX^T)^T.
+      # output has [A, C, B, V / op.domain_dimension, op.domain_dimension],
+      # which is being converted to:
+      # [A, C, B, V / op.domain_dimension, op.range_dimension]
+      output = array_ops.matrix_transpose(output)
+      output = operator.matmul(output, adjoint=adjoint, adjoint_arg=False)
+      output = array_ops.matrix_transpose(output)
+      # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension]
+      output = _rotate_last_dim(output, rotate_right=False)
+      output = _vec(output)
+      output = _rotate_last_dim(output, rotate_right=True)
+
+    # After the loop, we will have
+    # A = self.range_dimension / op[-1].range_dimension
+    # V = op[-1].domain_dimension
+
+    # We convert that using matvec to get:
+    # [A, C, B, op[-1].range_dimension]
+    output = self.operators[-1].matvec(output, adjoint=adjoint)
+    # Rearrange shape to be [B1, ... Bn, self.range_dimension, C]
+    output = _rotate_last_dim(output, rotate_right=False)
+    output = _vec(output)
+    output = _rotate_last_dim(output, rotate_right=False)
+
+    if x.shape.is_fully_defined():
+      column_dim = x.shape[-1]
+      broadcast_batch_shape = common_shapes.broadcast_shape(
+          x.shape[:-2], self.batch_shape)
+      if adjoint:
+        matrix_dimensions = [self.domain_dimension, column_dim]
+      else:
+        matrix_dimensions = [self.range_dimension, column_dim]
+
+      print("x: ", x)
+      print("bathc_shape:", self.batch_shape)
+      print("self.shape:", self.shape)
+      print("output: ", output)
+      output.set_shape(broadcast_batch_shape.concatenate(
+          matrix_dimensions))
+
+    return output
+
+  def _determinant(self):
+    # Note that we have |X1 x X2| = |X1| ** n * |X2| ** m, where X1 is an m x m
+    # matrix, and X2 is an n x n matrix. We can iteratively apply this property
+    # to get the determinant of |X1 x X2 x X3 ...|. If T is the product of the
+    # domain dimension of all operators, then we have:
+    # |X1 x X2 x X3 ...| =
+    #    |X1| ** (T / m) * |X2 x X3 ... | ** m =
+    #    |X1| ** (T / m) * |X2| ** (m * (T / m) / n) *  ... =
+    #    |X1| ** (T / m) * |X2| ** (T / n) * | X3 x X4... | ** (m * n)
+    #    And by doing induction we have product(|X_i| ** (T / dim(X_i))).
+    total = self.domain_dimension_tensor()
+    determinant = 1.
+    for operator in self.operators:
+      determinant *= operator.determinant() ** math_ops.cast(
+          total / operator.domain_dimension_tensor(),
+          dtype=operator.dtype)
+    return determinant
+
+  def _log_abs_determinant(self):
+    # This will be sum((total / dim(x_i)) * log |X_i|)
+    total = self.domain_dimension_tensor()
+    log_abs_det = 0.
+    for operator in self.operators:
+      log_abs_det += operator.log_abs_determinant() * math_ops.cast(
+          total / operator.domain_dimension_tensor(),
+          dtype=operator.dtype)
+    return log_abs_det
+
+  def _trace(self):
+    # tr(A x B) = tr(A) * tr(B)
+    trace = 1.
+    for operator in self.operators:
+      trace *= operator.trace()
+    return trace
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    # Here we follow the same use of Roth's column lemma as in `matmul`, with
+    # the key difference that we replace all `matmul` instances with `solve`.
+    # This follows from the property that inv(A x B) = inv(A) x inv(B).
+
+    # Below we document the shape manipulation for adjoint=False,
+    # adjoint_arg=False, but the general case of different adjoints is still
+    # handled.
+
+    if adjoint_arg:
+      rhs = linalg.adjoint(rhs)
+
+    # Always add a batch dimension to enable broadcasting to work.
+    batch_shape = array_ops.concat(
+        [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
+    rhs += array_ops.zeros(batch_shape, dtype=rhs.dtype.base_dtype)
+
+    # rhs has shape [B, R, C], where B represent some number of batch
+    # dimensions,
+    # R represents the number of rows, and C represents the number of columns.
+    # In order to apply Roth's column lemma, we need to operate on a batch of
+    # column vectors, so we reshape into a batch of column vectors. We put it
+    # at the front to ensure that broadcasting between operators to the batch
+    # dimensions B still works.
+    output = _rotate_last_dim(rhs, rotate_right=True)
+
+    # Also expand the shape to be [A, C, B, R]. The first dimension will be
+    # used to accumulate dimensions from each operator matmul.
+    output = output[array_ops.newaxis, ...]
+
+    # In this loop, A is going to refer to the value of the accumulated
+    # dimension. A = 1 at the start, and will end up being self.range_dimension.
+    # V will refer to the last dimension. V = R at the start, and will end up
+    # being 1 in the end.
+    for operator in self.operators[:-1]:
+      # Reshape output from [A, C, B, V] to be
+      # [A, C, B, V / op.domain_dimension, op.domain_dimension]
+      if adjoint:
+        operator_dimension = operator.range_dimension_tensor()
+      else:
+        operator_dimension = operator.domain_dimension_tensor()
+
+      output = _unvec_by(output, operator_dimension)
+
+      # We are computing (XA^-1^T) = (A^-1 X^T)^T.
+      # output has [A, C, B, V / op.domain_dimension, op.domain_dimension],
+      # which is being converted to:
+      # [A, C, B, V / op.domain_dimension, op.range_dimension]
+      output = array_ops.matrix_transpose(output)
+      output = operator.solve(output, adjoint=adjoint, adjoint_arg=False)
+      output = array_ops.matrix_transpose(output)
+      # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension]
+      output = _rotate_last_dim(output, rotate_right=False)
+      output = _vec(output)
+      output = _rotate_last_dim(output, rotate_right=True)
+
+    # After the loop, we will have
+    # A = self.range_dimension / op[-1].range_dimension
+    # V = op[-1].domain_dimension
+
+    # We convert that using matvec to get:
+    # [A, C, B, op[-1].range_dimension]
+    output = self.operators[-1].solvevec(output, adjoint=adjoint)
+    # Rearrange shape to be [B1, ... Bn, self.range_dimension, C]
+    output = _rotate_last_dim(output, rotate_right=False)
+    output = _vec(output)
+    output = _rotate_last_dim(output, rotate_right=False)
+
+    if rhs.shape.is_fully_defined():
+      column_dim = rhs.shape[-1]
+      broadcast_batch_shape = common_shapes.broadcast_shape(
+          rhs.shape[:-2], self.batch_shape)
+      if adjoint:
+        matrix_dimensions = [self.domain_dimension, column_dim]
+      else:
+        matrix_dimensions = [self.range_dimension, column_dim]
+
+      output.set_shape(broadcast_batch_shape.concatenate(
+          matrix_dimensions))
+
+    return output
+
+  def _diag_part(self):
+    diag_part = self.operators[0].diag_part()
+    for operator in self.operators[1:]:
+      diag_part = diag_part[..., :, array_ops.newaxis]
+      op_diag_part = operator.diag_part()[..., array_ops.newaxis, :]
+      diag_part *= op_diag_part
+      diag_part = array_ops.reshape(
+          diag_part,
+          shape=array_ops.concat(
+              [array_ops.shape(diag_part)[:-2], [-1]], axis=0))
+    if self.range_dimension > self.domain_dimension:
+      diag_dimension = self.domain_dimension
+    else:
+      diag_dimension = self.range_dimension
+    diag_part.set_shape(
+        self.batch_shape.concatenate(diag_dimension))
+    return diag_part
+
+  def _to_dense(self):
+    product = self.operators[0].to_dense()
+    for operator in self.operators[1:]:
+      # Product has shape [B, R1, 1, C1].
+      product = product[
+          ..., :, array_ops.newaxis, :, array_ops.newaxis]
+      # Operator has shape [B, 1, R2, 1, C2].
+      op_to_mul = operator.to_dense()[
+          ..., array_ops.newaxis, :, array_ops.newaxis, :]
+      # This is now [B, R1, R2, C1, C2].
+      product *= op_to_mul
+      # Now merge together dimensions to get [B, R1 * R2, C1 * C2].
+      product = array_ops.reshape(
+          product,
+          shape=array_ops.concat(
+              [array_ops.shape(product)[:-4],
+               [array_ops.shape(product)[-4] * array_ops.shape(product)[-3],
+                array_ops.shape(product)[-2] * array_ops.shape(product)[-1]]
+              ], axis=0))
+    product.set_shape(self.shape)
+    return product
+
+  def _assert_non_singular(self):
+    if all(operator.is_square for operator in self.operators):
+      asserts = [operator.assert_non_singular() for operator in self.operators]
+      return control_flow_ops.group(asserts)
+    else:
+      raise errors.InvalidArgumentError(
+          node_def=None, op=None, message="All Kronecker factors must be "
+          "square for the product to be invertible.")
+
+  def _assert_self_adjoint(self):
+    if all(operator.is_square for operator in self.operators):
+      asserts = [operator.assert_self_adjoint() for operator in self.operators]
+      return control_flow_ops.group(asserts)
+    else:
+      raise errors.InvalidArgumentError(
+          node_def=None, op=None, message="All Kronecker factors must be "
+          "square for the product to be self adjoint.")
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index cea3627ed565f0de86d8d9bb6b45c4b19c5b5558..5b89c6cef9fa9fdef7c26ddee1efa03f3056d881 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -138,14 +138,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index 70f777f08bd5b8157e601f19019075d3e7543811..b5741967ab52568725d7c9f03a0cc0b0f63f7459 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import random
 import threading
 
 from tensorflow.contrib.linear_optimizer.python.ops.sdca_ops import SdcaModel
@@ -38,8 +39,8 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import googletest
 
 _MAX_ITERATIONS = 100
-_SHARD_NUMBERS = [None, 1, 3, 10]
-_NUM_LOSS_PARTITIONS = [2, 4]
+_SHARD_NUMBERS = [None, 1, 3]
+_NUM_LOSS_PARTITIONS = [4]
 
 
 def make_example_proto(feature_dict, target, value=1.0):
@@ -102,6 +103,35 @@ def make_example_dict(example_protos, example_weights):
       example_ids=['%d' % i for i in range(0, len(example_protos))])
 
 
+def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero):
+  random.seed(1)
+
+  sparse_features = [
+      SparseFeatureColumn(
+          [i for i in range(num_examples) for _ in range(num_non_zero)], [
+              i for _ in range(num_examples)
+              for i in random.sample(range(dim), num_non_zero)
+          ],
+          [num_non_zero**(-0.5) for _ in range(num_examples * num_non_zero)])
+  ]
+  examples_dict = dict(
+      sparse_features=sparse_features,
+      dense_features=[],
+      example_weights=[random.random() for _ in range(num_examples)],
+      example_labels=[
+          1. if random.random() > 0.5 else 0. for _ in range(num_examples)
+      ],
+      example_ids=[str(i) for i in range(num_examples)])
+
+  weights = variables_lib.Variable(
+      array_ops.zeros([dim], dtype=dtypes.float32))
+  variables_dict = dict(
+      sparse_features_weights=[weights],
+      dense_features_weights=[])
+
+  return examples_dict, variables_dict
+
+
 def make_variable_dict(max_age, max_gender):
   # TODO(sibyl-toe9oF2e):  Figure out how to derive max_age & max_gender from
   # examples_dict.
@@ -235,6 +265,60 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         self.assertAllClose(
             0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2)
 
+  def testSparseRandom(self):
+    dim = 20
+    num_examples = 1000
+    # Number of non-zero features per example.
+    non_zeros = 10
+    # Setup test data.
+    with self._single_threaded_test_session():
+      examples, variables = make_random_examples_and_variables_dicts(
+          num_examples, dim, non_zeros)
+      options = dict(
+          symmetric_l2_regularization=.1,
+          symmetric_l1_regularization=0,
+          num_table_shards=1,
+          adaptive=False,
+          loss_type='logistic_loss')
+
+      lr = SdcaModel(examples, variables, options)
+      variables_lib.global_variables_initializer().run()
+      train_op = lr.minimize()
+      for _ in range(4):
+        train_op.run()
+      lr.update_weights(train_op).run()
+      # Duality gap is 1.4e-5.
+      # It would be 0.01 without shuffling and 0.02 with adaptive sampling.
+      self.assertNear(0.0, lr.approximate_duality_gap().eval(), err=1e-3)
+
+  def testSparseDuplicate(self):
+    # Setup test data
+    example_protos = [
+        make_example_proto({
+            'age': [0] * 5,
+            'gender': [0] * 5
+        }, 0),
+        make_example_proto({
+            'age': [1] * 5,
+            'gender': [1] * 5
+        }, 1),
+    ]
+    example_weights = [1.0, 1.0]
+    with self._single_threaded_test_session():
+      examples = make_example_dict(example_protos, example_weights)
+      variables = make_variable_dict(1, 1)
+      options = dict(
+          symmetric_l2_regularization=1,
+          symmetric_l1_regularization=0,
+          loss_type='logistic_loss')
+
+      lr = SdcaModel(examples, variables, options)
+      variables_lib.global_variables_initializer().run()
+      train_op = lr.minimize()
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   'Duplicate'):
+        train_op.run()
+
   def testDistributedSimple(self):
     # Setup test data
     example_protos = [
@@ -270,14 +354,14 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
 
           train_op = lr.minimize()
 
-          def Minimize():
+          def minimize():
             with self._single_threaded_test_session():
               for _ in range(_MAX_ITERATIONS):
-                train_op.run()
+                train_op.run()  # pylint: disable=cell-var-from-loop
 
           threads = []
           for _ in range(num_loss_partitions):
-            threads.append(threading.Thread(target=Minimize))
+            threads.append(threading.Thread(target=minimize))
             threads[-1].start()
 
           for t in threads:
@@ -395,7 +479,7 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         predicted_labels = get_binary_predictions_for_logistic(predictions)
         self.assertAllClose([0, 1, 1, 1], predicted_labels.eval())
         self.assertAllClose(
-            0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2)
+            0.0, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2)
 
   def testFractionalExampleLabel(self):
     # Setup test data with 1 positive, and 1 mostly-negative example.
@@ -407,7 +491,7 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         make_example_proto({
             'age': [1],
             'gender': [1]
-        }, 1),
+        }, 0.9),
     ]
     example_weights = [1.0, 1.0]
     for num_shards in _SHARD_NUMBERS:
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 3f5fdc18bb8f47cceee8f81dd5ded02059344b8b..f980746a19fb8e0a02b9d023c127da7ab33e457f 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -168,6 +168,10 @@ class SdcaModel(object):
     # of workers
     return self._options.get('num_loss_partitions', 1)
 
+  def _adaptive(self):
+    # Perform adaptive sampling.
+    return self._options.get('adaptive', True)
+
   def _num_table_shards(self):
     # Number of hash table shards.
     # Return 1 if not specified or if the value is 'None'
@@ -344,7 +348,8 @@ class SdcaModel(object):
           l1=self._options['symmetric_l1_regularization'],
           l2=self._symmetric_l2_regularization(),
           num_loss_partitions=self._num_loss_partitions(),
-          num_inner_iterations=1)
+          num_inner_iterations=1,
+          adaptative=self._adaptive())
       # pylint: enable=protected-access
 
       with ops.control_dependencies([esu]):
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index ec726bbed41a86eb314e3591ecaedaa6bf0e5e9b..5015fb0848107950dd27eb81431dd308f22858bc 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -49,6 +49,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                default_value,
                empty_key,
                num_shards=1,
+               checkpoint=True,
                name='ShardedMutableHashTable'):
     with ops.name_scope(name, 'sharded_mutable_hash_table') as scope:
       super(ShardedMutableDenseHashTable, self).__init__(key_dtype,
@@ -61,6 +62,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                 value_dtype=value_dtype,
                 default_value=default_value,
                 empty_key=empty_key,
+                checkpoint=checkpoint,
                 name='%s-%d-of-%d' % (name, i + 1, num_shards)))
       self._table_shards = table_shards
       # TODO(andreasst): add a value_shape() method to LookupInterface
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 05794a42c5f2d0eece6adab36fb5610078cece31..d4e54c82f988e0adcd16aad29702ee9f8b16aea3 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -140,8 +140,8 @@ def sdca_model_fn(features, labels, mode, params, config=None):
 
   parent_scope = "linear"
 
-  with variable_scope.variable_op_scope(features.values(),
-                                        parent_scope) as scope:
+  with variable_scope.variable_scope(
+      values=features.values(), name_or_scope=parent_scope) as scope:
     features = features.copy()
     features.update(layers.transform_features(features, feature_columns))
     logits, columns_to_variables, bias = (
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 79a5928a21cb9a2633b2aac178f185ba333790d6..bed3d5139fcbf9d9e8b85605c752736f26af6793 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -30,6 +30,13 @@ from tensorflow.python.platform import test
 
 class SDCALogisticClassifierTest(test.TestCase):
 
+  def _single_threaded_test_session(self):
+    # TODO(andreasst): figure out why SDCALinearRegressor needs a single
+    # threaded session to pass in tsan mode but SDCALogisticClassifier does not.
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
+    return self.test_session(config=config)
+
   def testRealValuedFeatures(self):
     """Tests SDCALogisticClassifier works with real valued features."""
 
@@ -41,7 +48,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       maintenance_cost = feature_column_lib.real_valued_column(
           'maintenance_cost')
       sq_footage = feature_column_lib.real_valued_column('sq_footage')
@@ -66,7 +73,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[500.0, 800.0], [200.0, 600.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       dense_feature = feature_column_lib.real_valued_column(
           'dense_feature', dimension=2)
       classifier = sdca_estimator.SDCALogisticClassifier(
@@ -86,7 +93,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('price'),
           boundaries=[500.0, 700.0])
@@ -120,7 +127,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
@@ -151,7 +158,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 5])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
       country_weighted_by_price = feature_column_lib.weighted_sparse_column(
@@ -163,6 +170,38 @@ class SDCALogisticClassifierTest(test.TestCase):
       metrics = classifier.evaluate(input_fn=input_fn, steps=1)
       self.assertGreater(metrics['accuracy'], 0.9)
 
+  def testSparseFeaturesWithDuplicates(self):
+    """Tests SDCALogisticClassifier with duplicated sparse features."""
+
+    def input_fn():
+      return {
+          'example_id':
+              constant_op.constant(['1', '2']),
+          'age':
+              sparse_tensor.SparseTensor(
+                  values=['20-29'] * 5 + ['31-40'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+          'gender':
+              sparse_tensor.SparseTensor(
+                  values=['m'] * 5 + ['f'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+      }, constant_op.constant([[1], [0]])
+
+    with self._single_threaded_test_session():
+      age = feature_column_lib.sparse_column_with_hash_bucket(
+          'age', hash_bucket_size=10)
+      gender = feature_column_lib.sparse_column_with_hash_bucket(
+          'gender', hash_bucket_size=10)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id', feature_columns=[age, gender])
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertLess(metrics['loss'], 0.060)
+
   def testCrossedFeatures(self):
     """Tests SDCALogisticClassifier with crossed features."""
 
@@ -182,7 +221,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 1])
       }, constant_op.constant([[0], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       language = feature_column_lib.sparse_column_with_hash_bucket(
           'language', hash_bucket_size=5)
       country = feature_column_lib.sparse_column_with_hash_bucket(
@@ -215,7 +254,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[3.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       sq_footage_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('sq_footage'),
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 92d022f2a30ffeb77e81d3bd01365afcd14826b5..213c2eced5c7f99bad4a8ad41c71837c8186c18b 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
 from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -36,18 +37,18 @@ class SDCAOptimizer(object):
   Example usage:
 
   ```python
-    real_feature_column = real_valued_column(...)
-    sparse_feature_column = sparse_column_with_hash_bucket(...)
-    sdca_optimizer = linear.SDCAOptimizer(example_id_column='example_id',
-                                          num_loss_partitions=1,
-                                          num_table_shards=1,
-                                          symmetric_l2_regularization=2.0)
-    classifier = tf.contrib.learn.LinearClassifier(
-        feature_columns=[real_feature_column, sparse_feature_column],
-        weight_column_name=...,
-        optimizer=sdca_optimizer)
-    classifier.fit(input_fn_train, steps=50)
-    classifier.evaluate(input_fn=input_fn_eval)
+  real_feature_column = real_valued_column(...)
+  sparse_feature_column = sparse_column_with_hash_bucket(...)
+  sdca_optimizer = linear.SDCAOptimizer(example_id_column='example_id',
+                                        num_loss_partitions=1,
+                                        num_table_shards=1,
+                                        symmetric_l2_regularization=2.0)
+  classifier = tf.contrib.learn.LinearClassifier(
+      feature_columns=[real_feature_column, sparse_feature_column],
+      weight_column_name=...,
+      optimizer=sdca_optimizer)
+  classifier.fit(input_fn_train, steps=50)
+  classifier.evaluate(input_fn=input_fn_eval)
   ```
 
   Here the expectation is that the `input_fn_*` functions passed to train and
@@ -71,12 +72,14 @@ class SDCAOptimizer(object):
                num_loss_partitions=1,
                num_table_shards=None,
                symmetric_l1_regularization=0.0,
-               symmetric_l2_regularization=1.0):
+               symmetric_l2_regularization=1.0,
+               adaptive=True):
     self._example_id_column = example_id_column
     self._num_loss_partitions = num_loss_partitions
     self._num_table_shards = num_table_shards
     self._symmetric_l1_regularization = symmetric_l1_regularization
     self._symmetric_l2_regularization = symmetric_l2_regularization
+    self._adaptive = adaptive
 
   def get_name(self):
     return 'SDCAOptimizer'
@@ -101,6 +104,10 @@ class SDCAOptimizer(object):
   def symmetric_l2_regularization(self):
     return self._symmetric_l2_regularization
 
+  @property
+  def adaptive(self):
+    return self._adaptive
+
   def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
                      features, targets, global_step):
     """Returns the training operation of an SdcaModel optimizer."""
@@ -175,28 +182,42 @@ class SDCAOptimizer(object):
         elif isinstance(
             column,
             (
+                layers.feature_column._WeightedSparseColumn,  # pylint: disable=protected-access
                 layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                 layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-          sparse_features.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=transformed_tensor.indices,
-                          num_or_size_splits=2,
-                          axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]), None))
-          sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-          id_tensor = column.id_tensor(transformed_tensor)
-          weight_tensor = column.weight_tensor(transformed_tensor)
+
+          if isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
+            id_tensor = column.id_tensor(transformed_tensor)
+            weight_tensor = array_ops.reshape(
+                column.weight_tensor(transformed_tensor).values, [-1])
+          else:
+            id_tensor = transformed_tensor
+            weight_tensor = array_ops.ones(
+                [array_ops.shape(id_tensor.indices)[0]], dtypes.float32)
+
+          example_ids = array_ops.reshape(id_tensor.indices[:, 0], [-1])
+
+          flat_ids = array_ops.reshape(id_tensor.values, [-1])
+          projection_length = math_ops.reduce_max(flat_ids) + 1
+          # project ids based on example ids so that we can dedup ids that
+          # occur multiple times for a single example.
+          projected_ids = projection_length * example_ids + flat_ids
+
+          # Remove any redudant ids.
+          ids, idx = array_ops.unique(projected_ids)
+          # Keep only one example id per duplicated ids.
+          example_ids_filtered = math_ops.unsorted_segment_min(
+              example_ids, idx,
+              array_ops.shape(ids)[0])
+
+          # reproject ids back feature id space.
+          reproject_ids = (ids - projection_length * example_ids_filtered)
+
+          weights = array_ops.reshape(
+              math_ops.unsorted_segment_sum(weight_tensor, idx,
+                                            array_ops.shape(ids)[0]), [-1])
           sparse_feature_with_values.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=id_tensor.indices, num_or_size_splits=2, axis=1)
-                      [0], [-1]),
-                  array_ops.reshape(id_tensor.values, [-1]),
-                  array_ops.reshape(weight_tensor.values, [-1])))
+              SparseFeatureColumn(example_ids_filtered, reproject_ids, weights))
           sparse_feature_with_values_weights.append(
               columns_to_variables[column][0])
         else:
@@ -228,6 +249,7 @@ class SDCAOptimizer(object):
         options=dict(
             symmetric_l1_regularization=self._symmetric_l1_regularization,
             symmetric_l2_regularization=self._symmetric_l2_regularization,
+            adaptive=self._adaptive,
             num_loss_partitions=self._num_loss_partitions,
             num_table_shards=self._num_table_shards,
             loss_type=loss_type))
diff --git a/tensorflow/contrib/lite/Android.bp b/tensorflow/contrib/lite/Android.bp
index bb1ea46b61bb7330c688dc42cd8a32f1dfeeb0b0..8301f9263693eb8254ae8351d3177f9d6165bb0b 100644
--- a/tensorflow/contrib/lite/Android.bp
+++ b/tensorflow/contrib/lite/Android.bp
@@ -42,16 +42,19 @@ cc_library_static {
         "allocation.cc",
         "arena_planner.cc",
         "error_reporter.cc",
-	"graph_info.cc",
+        "graph_info.cc",
         "interpreter.cc",
         "model.cc",
         "nnapi_delegate.cc",
         "optional_debug_tools.cc",
         "simple_memory_arena.cc",
         "string_util.cc",
+        "util.cc",
+        "kernels/eigen_support.cc",
         "kernels/gemm_support.cc",
     ],
     header_libs: [
+        "libeigen",
         "flatbuffer_headers",
         "gemmlowp_headers",
     ],
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 44c4a7e2ca8d019ca602c7f2b492cd1e70b17561..1534f97d7600151e78c7fa7e8509d9e871240421 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -89,6 +89,7 @@ cc_library(
     hdrs = [
         "builtin_op_data.h",
     ],
+    deps = [":context"],
 )
 
 cc_library(
@@ -132,10 +133,12 @@ cc_library(
         ":memory_planner",
         ":schema_fbs_version",
         ":simple_memory_arena",
+        ":util",
+        "//tensorflow/contrib/lite/kernels:eigen_support",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/nnapi:nnapi_lib",
+        "//tensorflow/contrib/lite/profiling:profiler",
         "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/core:lib_platform",
     ],
 )
 
@@ -169,6 +172,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/kernels:kernel_util",
         "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
@@ -232,6 +236,27 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "util",
+    srcs = ["util.cc"],
+    hdrs = ["util.h"],
+    deps = [
+        ":context",
+    ],
+)
+
+cc_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.cc"],
+    deps = [
+        ":context",
+        ":util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 # Test the serialization of a model with optional tensors.
 
 # Model tests
@@ -248,18 +273,3 @@ cc_test(
 #        ],
 #    }),
 #)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "downloads",
-            "examples",
-            "gen",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index 7f316292724ea0baaf034d4e914773ad97a957d4..65fba52d461461f4594e2222ef6df3849b741f99 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -27,10 +27,10 @@ LIBDIR := $(MAKEFILE_DIR)/gen/lib/
 GENDIR := $(MAKEFILE_DIR)/gen/obj/
 
 # Settings for the host compiler.
-CXX := $(CC_PREFIX) gcc
+CXX := $(CC_PREFIX)gcc
 CXXFLAGS := --std=c++11 -O3 -DNDEBUG
-CC := $(CC_PREFIX) gcc
-CFLAGS :=
+CC := $(CC_PREFIX)gcc
+CFLAGS := -O3 -DNDEBUG
 LDOPTS :=
 LDOPTS += -L/usr/local/lib
 ARFLAGS := -r
@@ -57,10 +57,11 @@ LIBS := \
 
 # If we're on Linux, also link in the dl library.
 ifeq ($(HOST_OS),LINUX)
-	LIBS += -ldl -lpthread
+	LIBS += -ldl
 endif
 
 include $(MAKEFILE_DIR)/ios_makefile.inc
+include $(MAKEFILE_DIR)/rpi_makefile.inc
 
 # This library is the main target for this makefile. It will contain a minimal
 # runtime that can be linked in to other programs.
@@ -89,7 +90,8 @@ $(wildcard tensorflow/contrib/lite/kernels/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc)
+$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c)
 # Remove any duplicates.
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
 CORE_CC_EXCLUDE_SRCS := \
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index 00e93d2c4f3ab27057b855fba6fccf2ec8d7a1c1..a676b705f143b393c7e5bfa9e40d23f9adb68dcc 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -1,235 +1,8 @@
 # TensorFlow Lite
-TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
 
-TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded
+devices. It enables low-latency inference of on-device machine learning models
+with a small binary size and fast performance supporting hardware acceleration.
 
-![image](g3doc/TFLite-Architecture.jpg)
-# Getting Started with an Android Demo App
-
-This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using either a quantized Mobilenet model or a floating point Inception-v3 model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
-
-There are 3 ways to get the demo app to your device
- - Download the prebuilt binary or
- - Use Android Studio to build the application or
- - Download the source code for TensorFlow Lite and the demo and build it using bazel
-
-## Description
-In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
-
-## Downloading the pre-built binary
-The fastest path to trying the demo, is to download the pre-built binary
-[TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
-
-Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
-
-## Building in Android Studio using TensorFlow Lite AAR from JCenter
-The simplest way to compile the demo app, and try out changes to the project code is to use AndroidStudio.
-
- - Install the latest version of Android Studio 3 as specified [here](https://developer.android.com/studio/index.html).
- - Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- - Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
- - Click through installing all the Gradle extensions it requests.
- - Either
-     - Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
-         - unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
-           `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
-     - Or download the floating point Inception-v3 model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
-         - unzip and copy inceptionv3_non_slim_2015.tflite to the assets directory
-         - change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java) from
-         `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`
-         to
-         `classifier = new ImageClassifierFloatInception(getActivity());`
- - Build and run the demo app
-
-## Building TensorFlow Lite and the demo app from source
-
-### Clone the TensorFlow repo
-- git clone
-  [https://github.com/tensorflow/tensorflow](https://github.com/tensorflow/tensorflow)
-
-### Install Bazel
-If bazel is not installed on your system, install it now by following [these directions](https://bazel.build/versions/master/docs/install.html)
-
-NOTE: Bazel does not fully support building Android on Windows yet. Full support for Gradle/CMake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
-
-### Install Android NDK and SDK
-Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and SDK must be installed on your system.
- - Install the latest version of Bazel as per the instructions on the [Bazel website](https://bazel.build/versions/master/docs/install.html)
- - The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The current recommended version is 14b, which can be found [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
- - The Android SDK and build tools may be obtained [here](https://developer.android.com/tools/revisions/build-tools.html), or alternatively as part of [Android Studio](https://developer.android.com/studio/index.html). Build tools API >= 23 is required to build the TF Android demo (though it will run on API >= 21 devices).
- - In the root of the TensorFlow repository update the `WORKSPACE` file with the `api_level` and location of the SDK and NDK. If you installed it with AndroidStudio the SDK path can be found in the SDK manager, and the default NDK path is:`{SDK path}/ndk-bundle.`
-
-```
-android_sdk_repository (
-    name = "androidsdk",
-    api_level = 23,
-    build_tools_version = "23.0.2",
-    path = "/home/xxxx/android-sdk-linux/",
-)
-
-android_ndk_repository(
-    name = "androidndk",
-    path = "/home/xxxx/android-ndk-r10e/",
-    api_level = 19,
-)
-```
-
-Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
-
-### Build the source code
-Run bazel with the following command to build the demo.
-
-Build the demo app:
-
-```
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
-```
-
-### Note
-
-Currently, we only support building the Android demo app within a Python 2
-environment (due to a Bazel bug).
-
-### More about the demo
-The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used (229 * 229 for Inception-v3). The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch. 224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. Both models have 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The model file must be downloaded and bundled within the assets directory of the app.
-
-# iOS Demo App
-
-Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
-
-This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
-
-1.   Run `third_party/tensorflow/contrib/lite/examples/ios/download_models.sh` to download the model files used by the demo app.
-1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
-1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
-1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
-
-# TensorFlow Lite Quick Start
-
-## Step 1. Decide which GraphDef to use
- Depending on the use case, the developer may choose to use one of the popular
- open-sourced models such as InceptionV3 or MobileNets, re-train these models
- with their own custom data set or even build their own custom model.
-
-### Using a pre-trained model
-
-[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) is a family of mobile-first computer vision models for [TensorFlow](https://www.tensorflow.org/) designed to effectively maximize accuracy while being mindful of the restricted resources for an on-device or embedded application. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as [Inception](https://arxiv.org/pdf/1602.07261.pdf), are used. Google provides 16 pre-trained [ImageNet](http://www.image-net.org/challenges/LSVRC/)  classification checkpoints for MobileNets for use in mobile projects of all sizes.
-
-[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model which achieves fairly high accuracy in recognizing general objects with 1000 classes, like "Zebra", "Dalmatian", and "Dishwasher". The model extracts general features from input images using a convolutional neural network and classifies them based on those features with fully-connected and softmax layers.
-
-[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)  is an on-device model which provides one-touch replies for an incoming text message by suggesting contextually relevant messages. The model is built specifically for memory constrained devices such as watches & phones and it has been successfully used to surface [Smart Replies on Android Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html). Note that this model only works on Android as of now.
-
-These pre-trained models can be downloaded from [here](g3doc/models.md).
-
-### Retrain Inception-V3 or MobileNet for a custom data set
-The above pre-trained models have been trained on the ImageNet data set, which consists of 1000 predefined classes. A model will need to be re-trained if these classes are not relevant or useful for a given use case. This technique is called transfer learning, which starts with a model that has been already trained on a problem and will then be retrained on a similar problem. Deep learning from scratch can take days, but transfer learning can be done fairly quickly. In order to do this, a developer will need to generate their custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) codelab walks through this process step-by-step. The retraining code supports retraining for both floating point and quantized inference.
-
-
-### Train a custom model
-A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow's Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
-
-TensorFlow Lite currently supports a subset of TensorFlow operators. Please refer to [this document](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for details of supported operators and their usage. This
-set will continue to expand in future releases of Tensorflow Lite.
-
-
-## Step 2. Model format conversion
-
-The model generated in Step 1 is a standard Tensorflow model. After the completion of Step 1 a user should have a standard .pb or .pbtxt GraphDef file. If the application developer is using a pre-trained model (as defined in Step 1 above), they can download a ready to use, already converted model for use from [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/models.md). Models generated using retraining (aka transfer learning) or custom models will need to be converted using the steps mentioned below.
-
-A prerequisite to converting the model to the Tensorflow Lite format is to freeze the graph.
-
-Since we employ several formats, the following definitions may be useful:
- - GraphDef (.pb) - a protobuf that represents the TensorFlow training and or computation graph. This contains operators, tensors, and variables definitions.
-
- - CheckPoint (.ckpt) - Serialized variables from a TensorFlow graph. Note, this does not contain the graph structure, so alone it cannot typically be interpreted.
-
- - FrozenGraphDef - a subclass of GraphDef that contains no variables. A GraphDef can be converted to a frozen graphdef by taking a checkpoint and a graphdef and converting every variable into a constant with the value looked up in the checkpoint.
-
- - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
-
- - TensorFlow lite model (.tflite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
-
-### Freeze Graph
-To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
-
-The developer should know where the checkpoints folder is present or checkpoints can also be downloaded for a pre-trained model (Example: Here is a link to the [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
-
-Graph freezing can be done using the command below (and modifying the arguments appropriately)
-
-```
-bazel build tensorflow/python/tools:freeze_graph
-
-bazel-bin/tensorflow/python/tools/freeze_graph\
-    --input_graph=/tmp/mobilenet_v1_224.pb \
-    --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
-    --input_binary=true --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
-    --output_node_names=MobileNet/Predictions/Reshape_1
-```
-
-The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
-graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
-
-This frozen Graphdef is now ready to be converted to flatbuffer format (.tflite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
-
-Here is a sample command line to convert the frozen Graphdef to '.tflite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
-(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
-
-```
-bazel build tensorflow/contrib/lite/toco:toco
-
-bazel-bin/tensorflow/contrib/lite/toco/toco \
-  --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
-  --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.tflite --inference_type=FLOAT \
-  --input_type=FLOAT --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
-```
-
-- The input_file argument should point to the frozen GraphDef file that holds the model architecture.
-- The output_file argument should point to where the TensorFlow Lite model file should be generated.
-- The input_type and inference_type arguments should be set to FLOAT, unless converted a [quantized](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/) model.
-- Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
-
-Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
-documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py). A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-out = tf.identity(val, name="out")
-with tf.Session() as sess:
-  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("converteds_model.tflite", "wb").write(tflite_model)
-
-```
-For detailed instructions on how to use the Tensorflow Optimizing Converter, please see [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
-
-You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
-
-If you would like to see a visual description of your TensorFlow Lite model after conversion, you can use tensorflow/contrib/lite/tools/visualize.py by running
-```sh
-bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
-```
-and then visualize the resulting HTML file in a browser.
-
-## Step 3. Use the TensorFlow Lite model for inference in a mobile app
-
-After completion of Step 2 the developer should have a .tflite model.
-
-### For Android
-Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
-
-The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it's a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-
-Note that you'd need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
-
-### For iOS
-Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
-
-## Core ML support
-
-Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+See the documentation: https://www.tensorflow.org/mobile/tflite/
+Documentation edits can be made here: [tensorflow/docs_src/mobile/tflite](../../docs_src/mobile/tflite)
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
index 4b322e027d48f4bf9f90d5b873c449d1ec31cc49..a4772731ecda92431c412672610a39c188dabf27 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include <utility>
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index 87b17c338e7afc33d32dd9688cc0825ac319dd19..4f836d367747e06de682b5764206d33f6e2fb983 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/arena_planner.h"
+#include <utility>
 
 namespace tflite {
 
@@ -128,6 +129,11 @@ TfLiteStatus ArenaPlanner::PlanAllocations() {
 }
 
 TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) {
+  // Grow the size of `allocs_` if necessary. This allows allocating temporary
+  // tensors in op's `prepare` function.
+  TF_LITE_ENSURE(context_, graph_info_->num_tensors() >= allocs_.size());
+  allocs_.resize(graph_info_->num_tensors());
+
   TF_LITE_ENSURE_STATUS(CalculateAllocations(first_node, last_node));
   TF_LITE_ENSURE_STATUS(Commit());
 
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index 58bc164619c2c053b9492e9a0e5de2da30e199af..e9d0fbc5a9b5aec06e28da8757466b25f40da2f5 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -25,7 +25,7 @@ limitations under the License.
 
 namespace tflite {
 
-class AllocationInfo;
+struct AllocationInfo;
 
 // A memory planner that makes all the allocations using arenas.
 //
@@ -33,7 +33,7 @@ class AllocationInfo;
 // each tensor needs to be allocated and deallocated, and preallocates all the
 // necessary memory (the PlanAllocations phase). It then assigns portions of
 // this memory buffer to each tensor (the ExecuteAllocations phase). Tensors may
-// share some of the bufer if a tensor B is to be allocated after another tensor
+// share some of the buffer if a tensor B is to be allocated after another tensor
 // A has been deallocated.
 //
 // If dynamic tensors are used the planning steps can be repeated during model
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 19829e4991651111e13fc1805f97daef8bc016a7..85216776823eab2ab3ac2a3bc666f21e312acc6c 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -104,7 +104,7 @@ def tflite_jni_binary(name,
   """Builds a jni binary for TFLite."""
   linkopts = linkopts + [
       "-Wl,--version-script",  # Export only jni functions & classes.
-      linkscript,
+      "$(location {})".format(linkscript),
   ]
   native.cc_binary(
       name=name,
@@ -124,19 +124,19 @@ def tf_to_tflite(name, src, options, out):
     out: name of the output flatbuffer file.
   """
 
-  toco = "//tensorflow/contrib/lite/toco:toco"
+  toco_cmdline = " ".join([
+      "//tensorflow/contrib/lite/toco:toco",
+      "--input_format=TENSORFLOW_GRAPHDEF",
+      "--output_format=TFLITE",
+      ("--input_file=$(location %s)" % src),
+      ("--output_file=$(location %s)" % out),
+  ] + options )
   native.genrule(
       name = name,
-      srcs=[src, options],
+      srcs=[src],
       outs=[out],
-      cmd = ("$(location %s) " +
-             "   --input_file=$(location %s) " +
-             "   --output_file=$(location %s) " +
-             "   --input_format=TENSORFLOW_GRAPHDEF" +
-             "   --output_format=TFLITE" +
-             "   `cat $(location %s)`")
-            % (toco, src, out, options),
-      tools= [toco],
+      cmd = toco_cmdline,
+      tools= ["//tensorflow/contrib/lite/toco:toco"],
   )
 
 def tflite_to_json(name, src, out):
@@ -200,8 +200,7 @@ def gen_zipped_test_files(name, files):
     native.genrule(
         name = name + "_" + f + ".files",
         cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
-               + " --zip_to_output " + f +
-               " $(@D) zipped"),
+               + " --zip_to_output " + f + " $(@D)"),
         outs = [out_file],
         tools = [
             ":generate_examples",
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index 4a9023ff33de15dd384531d51e39de4ffeecdb8b..9f398f4a9f3dcafd7bd49fd5d95e9991b8b36b75 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -19,11 +19,16 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../.."
 
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a
 
 lipo \
 tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
diff --git a/tensorflow/contrib/lite/build_rpi_lib.sh b/tensorflow/contrib/lite/build_rpi_lib.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3824b16412ed26a6cab79df3242da6017c3322b0
--- /dev/null
+++ b/tensorflow/contrib/lite/build_rpi_lib.sh
@@ -0,0 +1,22 @@
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../.."
+
+CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/Makefile TARGET=RPI TARGET_ARCH=armv7
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 5fc8954743e5b3b458e5c2004f4378cbad6056c0..4910c89eaebabb7bd9a4e003b75fa6de4d5af69d 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <stdint.h>
 
+#include "tensorflow/contrib/lite/context.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
@@ -51,6 +53,8 @@ typedef struct {
   TfLitePadding padding;
   int stride_width;
   int stride_height;
+  int dilation_width_factor;
+  int dilation_height_factor;
   TfLiteFusedActivation activation;
 } TfLiteConvParams;
 
@@ -174,6 +178,11 @@ typedef struct {
   int block_size;
 } TfLiteSpaceToDepthParams;
 
+typedef struct {
+  TfLiteType in_data_type;
+  TfLiteType out_data_type;
+} TfLiteCastParams;
+
 typedef enum {
   kTfLiteCombinerTypeSum = 0,
   kTfLiteCombinerTypeMean = 1,
@@ -214,6 +223,10 @@ typedef struct {
   int shrink_axis_mask;
 } TfLiteStridedSliceParams;
 
+typedef struct {
+  TfLiteType output_type;
+} TfLiteArgMaxParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 4ebd1586de791eecf0304637bde76232d9f0a11d..21e0e04ef6bc5b1e467ef5e27035e866f21049a0 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -23,13 +23,17 @@ limitations under the License.
 extern "C" {
 #endif  // __cplusplus
 
+// The enum for builtin operators.
+// Note: CUSTOM and DELEGATE are 2 special ops which are not real built-in ops.
 typedef enum {
   kTfLiteBuiltinAdd = 0,
   kTfLiteBuiltinAveragePool2d = 1,
   kTfLiteBuiltinConcatenation = 2,
   kTfLiteBuiltinConv2d = 3,
   kTfLiteBuiltinDepthwiseConv2d = 4,
+  kTfLiteBuiltinDequantize = 6,
   kTfLiteBuiltinEmbeddingLookup = 7,
+  kTfLiteBuiltinFloor = 8,
   kTfLiteBuiltinFullyConnected = 9,
   kTfLiteBuiltinHashtableLookup = 10,
   kTfLiteBuiltinL2Normalization = 11,
@@ -71,6 +75,15 @@ typedef enum {
   kTfLiteBuiltinExp = 47,
   kTfLiteBuiltinTopkV2 = 48,
   kTfLiteBuiltinSplit = 49,
+  kTfLiteBuiltinLogSoftmax = 50,
+  kTfLiteBuiltinDelegate = 51,
+  kTfLiteBuiltinBidirectionalSequenceLstm = 52,
+  kTfLiteBuiltinCast = 53,
+  kTfLiteBuiltinPrelu = 54,
+  kTfLiteBuiltinMaximum = 55,
+  kTfLiteBuiltinArgMax = 56,
+  kTfLiteBuiltinMinimum = 57,
+  kTfLiteBuiltinLess = 58,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/context.c b/tensorflow/contrib/lite/context.c
index c09e838c5c2e50e0f4a38eaf66e55246fd9a6f7f..5c6f5e72a47180cd98be46f60cfa8eaf28197806 100644
--- a/tensorflow/contrib/lite/context.c
+++ b/tensorflow/contrib/lite/context.c
@@ -17,9 +17,14 @@ limitations under the License.
 #include <stdio.h>
 #include <string.h>
 
+int TfLiteIntArrayGetSizeInBytes(int size) {
+  static TfLiteIntArray dummy;
+  return sizeof(dummy) + sizeof(dummy.data[0]) * size;
+}
+
 TfLiteIntArray* TfLiteIntArrayCreate(int size) {
   TfLiteIntArray* ret =
-      (TfLiteIntArray*)malloc(sizeof(*ret) + sizeof(ret->data[0]) * size);
+      (TfLiteIntArray*)malloc(TfLiteIntArrayGetSizeInBytes(size));
   ret->size = size;
   return ret;
 }
@@ -55,12 +60,16 @@ TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src) {
 
 void TfLiteIntArrayFree(TfLiteIntArray* a) { free(a); }
 
-void TfLiteTensorFree(TfLiteTensor* t) {
+void TfLiteTensorDataFree(TfLiteTensor* t) {
   if (t->allocation_type == kTfLiteDynamic && t->data.raw) {
     free(t->data.raw);
   }
-  if (t->dims) TfLiteIntArrayFree(t->dims);
   t->data.raw = NULL;
+}
+
+void TfLiteTensorFree(TfLiteTensor* t) {
+  TfLiteTensorDataFree(t);
+  if (t->dims) TfLiteIntArrayFree(t->dims);
   t->dims = NULL;
 }
 
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index b0c4d3431f9a67bc87d51ada91ed73f1661023a2..12841d233cc1d3c5e1219fc505b1975d2a7fa3e3 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -29,6 +29,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 #define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
 
@@ -40,6 +41,7 @@ typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
 
 // Forward declare so GetNode can use this is in Context.
 typedef struct _TfLiteRegistration TfLiteRegistration;
+typedef struct _TfLiteDelegate TfLiteDelegate;
 
 #define kOptionalTensor (-1)
 
@@ -57,6 +59,10 @@ typedef struct {
 #endif
 } TfLiteIntArray;
 
+// Given the size (number of elements) in a TfLiteIntArray, calculate its size
+// in bytes.
+int TfLiteIntArrayGetSizeInBytes(int size);
+
 // Create a array of a given `size` (uninitialized entries).
 // This returns a pointer, that you must free using TfLiteIntArrayFree().
 TfLiteIntArray* TfLiteIntArrayCreate(int size);
@@ -131,6 +137,7 @@ typedef enum {
   kTfLiteUInt8 = 3,
   kTfLiteInt64 = 4,
   kTfLiteString = 5,
+  kTfLiteBool = 6,
 } TfLiteType;
 
 // Parameters for asymmetric quantization. Quantized values can be converted
@@ -149,6 +156,7 @@ typedef union {
   char* raw;
   const char* raw_const;
   uint8_t* uint8;
+  bool* b;
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
@@ -162,6 +170,11 @@ typedef enum {
   kTfLiteDynamic,
 } TfLiteAllocationType;
 
+// The delegates should use zero or positive integers to represent handles.
+// -1 is reserved from unallocated status.
+typedef int TfLiteBufferHandle;
+const TfLiteBufferHandle kTfLiteNullBufferHandle = -1;
+
 // An tensor in the interpreter system which is a wrapper around a buffer of
 // data including a dimensionality (or NULL if not currently defined).
 typedef struct {
@@ -194,8 +207,27 @@ typedef struct {
 
   // Null-terminated name of this tensor.
   const char* name;
+
+  // The delegate which knows how to handle `buffer_handle`.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteDelegate* delegate;
+
+  // An integer buffer handle that can be handled by `delegate`.
+  // The value is valid only when delegate is not null.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteBufferHandle buffer_handle;
+
+  // If the delegate uses its own buffer (e.g. GPU memory), the delegate is
+  // responsible to set data_is_stale to true.
+  // `delegate->CopyFromBufferHandle` can be called to copy the data from
+  // delegate buffer.
+  // WARNING: This is an // experimental interface that is subject to change.
+  bool data_is_stale;
 } TfLiteTensor;
 
+// Free data memory of tensor `t`;
+void TfLiteTensorDataFree(TfLiteTensor* t);
+
 // Free memory of tensor `t`;
 void TfLiteTensorFree(TfLiteTensor* t);
 
@@ -234,11 +266,16 @@ typedef struct {
   // WARNING: This is an experimental interface that is subject to change.
   const void* custom_initial_data;
   int custom_initial_data_size;
+
+  // The pointer to the delegate. This is non-null only when the node is
+  // created by calling `interpreter.ModifyGraphWithDelegate`.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteDelegate* delegate;
 } TfLiteNode;
 
 typedef struct TfLiteContext {
   // Number of tensors in the context.
-  int tensors_size;
+  size_t tensors_size;
 
   // The execution plan contains a list of the node indices in execution
   // order. execution_plan->size is the current number of nodes. And,
@@ -258,7 +295,7 @@ typedef struct TfLiteContext {
   TfLiteStatus (*GetExecutionPlan)(struct TfLiteContext* context,
                                    TfLiteIntArray** execution_plan);
 
-  // An tensor of tensors in the interpreter context (of length `tensors_size`)
+  // An array of tensors in the interpreter context (of length `tensors_size`)
   TfLiteTensor* tensors;
 
   // opaque full context ptr (an opaque c++ data structure)
@@ -283,14 +320,20 @@ typedef struct TfLiteContext {
                                          TfLiteNode** node,
                                          TfLiteRegistration** registration);
 
-  // Replace ops with delegate.
+  // Replace ops with one or more stub delegate operations. This function
+  // does not take ownership of `nodes_to_replace`.
   TfLiteStatus (*ReplaceSubgraphsWithDelegateKernels)(
       struct TfLiteContext*, TfLiteRegistration registration,
-      const TfLiteIntArray* nodes_to_replace);
+      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
+
+  // Number of threads that are recommended to subsystems like gemmlowp and
+  // eigen.
+  int recommended_num_threads;
 
   // TODO(ahentz): we should create a more general mechanism for this sort of
   // library-global objects.
   void* gemm_context;
+  void* eigen_context;
 } TfLiteContext;
 
 typedef struct _TfLiteRegistration {
@@ -337,19 +380,47 @@ typedef struct _TfLiteRegistration {
 } TfLiteRegistration;
 
 // WARNING: This is an experimental interface that is subject to change.
-typedef struct {
+typedef struct _TfLiteDelegate {
   // Data that delegate needs to identify itself. This data is owned by the
   // delegate. The delegate is owned in the user code, so the delegate is
   // responsible for doing this when it is destroyed.
   void* data_;
+
   // Invoked by ModifyGraphWithDelegate. This prepare is called, giving the
   // delegate a view of the current graph through TfLiteContext*. It typically
   // will look at the nodes and call ReplaceSubgraphsWithDelegateKernels()
   // to ask the TensorFlow lite runtime to create macro-nodes to represent
   // delegated subgraphs of the original graph.
-  TfLiteStatus (*Prepare)(TfLiteContext* context, void* data);
+  TfLiteStatus (*Prepare)(TfLiteContext* context, TfLiteDelegate* delegate);
+
+  // Copy the data from delegate buffer handle to raw memory.
+  // This can be null if the delegate doesn't use its own buffer.
+  TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate,
+                                       TfLiteBufferHandle buffer_handle,
+                                       void* data, size_t size);
+
+  // Copy the data from raw memory to delegate buffer handle.
+  // This can be null if the delegate doesn't use its own buffer.
+  TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate,
+                                     TfLiteBufferHandle buffer_handle,
+                                     void* data, size_t size);
+
+  // Free the Delegate Buffer Handle. Note: This only frees the handle, but
+  // this doesn't release the underlying resource (e.g. textures). The
+  // resources are either owned by application layer or the delegate.
+  // This can be null if the delegate doesn't use its own buffer.
+  void (*FreeBufferHandle)(TfLiteDelegate* delegate,
+                           TfLiteBufferHandle* handle);
 } TfLiteDelegate;
 
+// WARNING: This is an experimental interface that is subject to change.
+typedef struct {
+  TfLiteDelegate* delegate;
+  TfLiteIntArray* nodes_to_replace;
+  TfLiteIntArray* input_tensors;
+  TfLiteIntArray* output_tensors;
+} TfLiteDelegateParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index a93ed201d647ddf2359a57254a959871c13fb94f..436c3e1d4cad5e6ee355d7e9cf8ee7da1a8385ce 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,12 +30,15 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
+FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -91,6 +94,7 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
index da193d2586e9123341b9a41be049ee2a4382017a..3c5f805f12f6a1fb7185c140604f692ac282a143 100644
--- a/tensorflow/contrib/lite/error_reporter.h
+++ b/tensorflow/contrib/lite/error_reporter.h
@@ -30,7 +30,7 @@ namespace tflite {
 //  va_list args;
 //  foo.Report("test %d", args); // where args is va_list
 //
-// Sublclass ErrorReporter to provide another reporting destination.
+// Subclass ErrorReporter to provide another reporting destination.
 // For example, if you have a GUI program, you might redirect to a buffer
 // that drives a GUI error log box.
 class ErrorReporter {
diff --git a/tensorflow/contrib/lite/examples/android/AndroidManifest.xml b/tensorflow/contrib/lite/examples/android/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..bc9574d646b7661de8ac9b745bd53cbba1eb9f31
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/AndroidManifest.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.tensorflow.lite.demo">
+
+    <uses-permission android:name="android.permission.CAMERA" />
+    <uses-feature android:name="android.hardware.camera" />
+    <uses-feature android:name="android.hardware.camera.autofocus" />
+    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
+    <uses-permission android:name="android.permission.RECORD_AUDIO" />
+
+    <uses-sdk
+        android:minSdkVersion="21"
+        android:targetSdkVersion="23" />
+
+    <application android:allowBackup="true"
+        android:debuggable="true"
+        android:label="@string/app_name"
+        android:icon="@drawable/ic_launcher"
+        android:theme="@style/MaterialTheme">
+
+        <activity android:name="org.tensorflow.demo.ClassifierActivity"
+                  android:screenOrientation="portrait"
+                  android:label="@string/activity_name_classification">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+
+        <activity android:name="org.tensorflow.demo.DetectorActivity"
+                  android:screenOrientation="portrait"
+                  android:label="@string/activity_name_detection">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+
+        <activity android:name="org.tensorflow.demo.SpeechActivity"
+            android:screenOrientation="portrait"
+            android:label="@string/activity_name_speech">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/examples/android/BUILD b/tensorflow/contrib/lite/examples/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..49280129971e38247c2216d9422bc5de9176e13d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/BUILD
@@ -0,0 +1,86 @@
+# Description:
+#   TensorFlow camera demo app for Android.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Build the demo native demo lib from the original directory to reduce code
+# reuse. Note that the Java counterparts (ObjectTracker.java and
+# ImageUtils.java) are still duplicated.
+cc_library(
+    name = "tensorflow_native_libs",
+    srcs = [
+        "//tensorflow/examples/android:libtensorflow_demo.so",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+android_binary(
+    name = "tflite_demo",
+    srcs = glob([
+        "src/**/*.java",
+    ]),
+    # Package assets from assets dir as well as all model targets.
+    # Remove undesired models (and corresponding Activities in source)
+    # to reduce APK size.
+    assets = [
+        "//tensorflow/contrib/lite/examples/android/assets:labels_mobilenet_quant_v1_224.txt",
+        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
+        "//tensorflow/contrib/lite/examples/android/assets:conv_actions_labels.txt",
+        "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
+        "//tensorflow/contrib/lite/examples/android/assets:box_priors.txt",
+        "//tensorflow/contrib/lite/examples/android/assets:coco_labels_list.txt",
+    ],
+    assets_dir = "",
+    custom_package = "org.tensorflow.lite.demo",
+    inline_constants = 1,
+    manifest = "AndroidManifest.xml",
+    manifest_merger = "android",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":tensorflow_native_libs",
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "bin/**",
+            "gen/**",
+            "gradleBuild/**",
+            "libs/**",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "java_files",
+    srcs = glob(["src/**/*.java"]),
+)
+
+filegroup(
+    name = "resource_files",
+    srcs = glob(["res/**"]),
+)
+
+exports_files(["AndroidManifest.xml"])
diff --git a/tensorflow/contrib/lite/examples/android/assets/BUILD b/tensorflow/contrib/lite/examples/android/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dd0cd6c98ff878e9c41875cab74c12191cadb173
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/examples/android/assets/box_priors.txt b/tensorflow/contrib/lite/examples/android/assets/box_priors.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7246b073fe7fd8b1d1340536457c8aeac24cd5a3
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/box_priors.txt
@@ -0,0 +1,5 @@
+        0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.5 0.5 0.49999997 0.5 0.5 0.5 0.5 0.5 0.49999997 0.5 0.5 0.5 0.5 0.5 0.49999997 0.5 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.25 0.25 0.25 0.24999999 0.25 0.25 0.25 0.25 0.25 0.24999999 0.25 0.25 0.75 0.75 0.75 0.75 0.74999994 0.75 0.75 0.75 0.75 0.75 0.74999994 0.75 0.5 0.5 0.5 0.5 0.5 0.5 
+        0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.25 0.25 0.25 0.25 0.25 0.25 0.75 0.75 0.75 0.75 0.75 0.75 0.25 0.25 0.25 0.25 0.25 0.25 0.75 0.75 0.75 0.75 0.75 0.75 0.5 0.5 0.5 0.5 0.5 0.5 
+        0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.8000001 0.5656855 1.131371 0.4618802 1.3857099 0.8717798 0.8000001 0.5656855 1.131371 0.4618802 1.3857099 0.8717798 0.80000013 0.5656855 1.131371 0.4618802 1.3857098 0.87177986 0.80000013 0.5656855 1.131371 0.4618802 1.3857098 0.87177986 0.95000005 0.6717515 1.343503 0.5484828 1.6455305 0.97467947 
+        0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.79999995 1.1313708 0.5656854 1.3856406 0.46185714 0.8717798 0.79999995 1.1313708 0.56568533 1.3856406 0.46185708 0.87177986 0.79999995 1.1313708 0.5656854 1.3856406 0.46185714 0.8717798 0.79999995 1.1313708 0.56568533 1.3856406 0.46185708 0.87177986 0.9499999 1.3435028 0.6717514 1.6454482 0.54845536 0.97467947 
+
diff --git a/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt b/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a70ff82aa7b0fa7315ca591820e4cf7d2f5ad18
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt
@@ -0,0 +1,91 @@
+???
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+???
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+???
+backpack
+umbrella
+???
+???
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+???
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+???
+dining table
+???
+???
+toilet
+???
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+???
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
diff --git a/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt b/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba416458b011a7f4b96739eb6fcb6275a6ab3bec
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt b/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe811239d8e2989de19fecabb1ebb0c9dddac514
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt
@@ -0,0 +1,1001 @@
+background
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/contrib/lite/examples/android/build.gradle b/tensorflow/contrib/lite/examples/android/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..0d4de358156a5d139e35cc542b8d36ab24e763b9
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/build.gradle
@@ -0,0 +1,52 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "org.tensorflow.lite.demo"
+        minSdkVersion 15
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+
+        // Remove this block.
+        jackOptions {
+            enabled true
+        }
+    }
+    lintOptions {
+        abortOnError false
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    aaptOptions {
+        noCompress "tflite"
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+}
+
+repositories {
+    maven {
+        url 'https://google.bintray.com/tensorflow'
+    }
+}
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'org.tensorflow:tensorflow-lite:+'
+
+    testCompile 'junit:junit:4.12'
+}
diff --git a/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml b/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml
new file mode 100644
index 0000000000000000000000000000000000000000..891d8cc1d4f3e59d0371030fd763c5ad468e7887
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<set xmlns:android="http://schemas.android.com/apk/res/android"
+  android:ordering="sequentially">
+  <objectAnimator
+    android:propertyName="backgroundColor"
+    android:duration="375"
+    android:valueFrom="0x00b3ccff"
+    android:valueTo="0xffb3ccff"
+    android:valueType="colorType"/>
+  <objectAnimator
+    android:propertyName="backgroundColor"
+    android:duration="375"
+    android:valueFrom="0xffb3ccff"
+    android:valueTo="0x00b3ccff"
+    android:valueType="colorType"/>
+</set>
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..32bd1aabcabb85ded957230533c00e735183a323
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3113cd15c3255405ee34c622a1e83674e6e5487
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png
new file mode 100644
index 0000000000000000000000000000000000000000..135862883e26eddce2b19db021adf62e10357ad0
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..8efbbf8b3c44418551699db9388cd77a88362112
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f87ee6507cebec6bff32b1a03b36ffc711689d
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba143ea7a80f03b0e850775ad672ccb2d6195e4c
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..6361d792dacd8ce09a14258878b5ce6db5e0debb
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..394eb7e534905e36fd24c3defac92c09b403ee39
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e27bec9785d4d51fe597bced7f04508994aa10c
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable/border.xml b/tensorflow/contrib/lite/examples/android/res/drawable/border.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1d64d1d61f359422c79533f726991c78e47d99
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/drawable/border.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<shape xmlns:android="http://schemas.android.com/apk/res/android" android:shape="rectangle" >
+  <solid android:color="#00000000" />
+  <stroke android:width="1dip" android:color="#cccccc" />
+</shape>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml b/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1a22d4b33ebbd755104272863c5cc6c93793b86b
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:id="@+id/container"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#000"
+    tools:context="org.tensorflow.demo.CameraActivity" />
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml b/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml
new file mode 100644
index 0000000000000000000000000000000000000000..2fe1338da57122c7e26c64c653076b6746a25497
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    tools:context="org.tensorflow.demo.SpeechActivity">
+
+    <TextView
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="Say one of the words below!"
+        android:id="@+id/textView"
+        android:textAlignment="center"
+        android:layout_gravity="top"
+        android:textSize="24dp"
+        android:layout_marginTop="10dp"
+        android:layout_marginLeft="10dp"
+        />
+
+    <ListView
+        android:id="@+id/list_view"
+        android:layout_width="240dp"
+        android:layout_height="wrap_content"
+        android:background="@drawable/border"
+        android:layout_gravity="top|center_horizontal"
+        android:textAlignment="center"
+        android:layout_marginTop="100dp"
+        />
+
+    <Button
+        android:id="@+id/quit"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="Quit"
+        android:layout_gravity="bottom|center_horizontal"
+        android:layout_marginBottom="10dp"
+        />
+
+</FrameLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a1bbdf1702cea79088715d30b8746f7fc8fdac56
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+    <org.tensorflow.demo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true" />
+
+    <org.tensorflow.demo.RecognitionScoreView
+        android:id="@+id/results"
+        android:layout_width="match_parent"
+        android:layout_height="112dp"
+        android:layout_alignParentTop="true" />
+
+    <org.tensorflow.demo.OverlayView
+        android:id="@+id/debug_overlay"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_alignParentBottom="true" />
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1cdb24cab03222934ca2aa326a765150d58aa6a8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+      android:orientation="vertical"
+      android:layout_width="match_parent"
+      android:layout_height="match_parent">
+  <org.tensorflow.demo.AutoFitTextureView
+    android:id="@+id/texture"
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    android:layout_alignParentTop="true" />
+
+  <RelativeLayout
+    android:id="@+id/black"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#FF000000" />
+
+  <GridView
+    android:id="@+id/grid_layout"
+    android:numColumns="7"
+    android:stretchMode="columnWidth"
+    android:layout_alignParentBottom="true"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content" />
+
+  <org.tensorflow.demo.OverlayView
+      android:id="@+id/overlay"
+      android:layout_width="match_parent"
+      android:layout_height="match_parent"
+      android:layout_alignParentTop="true" />
+
+  <org.tensorflow.demo.OverlayView
+    android:id="@+id/debug_overlay"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:layout_alignParentTop="true" />
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ca18ea075dbb65d93bc895bc33211a171e52d62e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+      <org.tensorflow.demo.AutoFitTextureView
+          android:id="@+id/texture"
+          android:layout_width="wrap_content"
+          android:layout_height="wrap_content"/>
+
+      <org.tensorflow.demo.OverlayView
+          android:id="@+id/tracking_overlay"
+          android:layout_width="match_parent"
+          android:layout_height="match_parent"/>
+
+      <org.tensorflow.demo.OverlayView
+          android:id="@+id/debug_overlay"
+          android:layout_width="match_parent"
+          android:layout_height="match_parent"/>
+</FrameLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml b/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml
new file mode 100644
index 0000000000000000000000000000000000000000..526017fbb24ecfa6765a21378f1ae0890a97a004
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<TextView
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    android:id="@+id/list_text_item"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    android:text="TextView"
+    android:textSize="24dp"
+    android:textAlignment="center"
+    android:gravity="center_horizontal"
+    />
diff --git a/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..820eda0e5585284c4b3f2bbaebdfee9d074d4c19
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml
@@ -0,0 +1,24 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_huge</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..09303314e91eed623b5aca91189372aaea767c9e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml
@@ -0,0 +1,25 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceLarge</item>
+        <item name="android:lineSpacingMultiplier">1.2</item>
+        <item name="android:shadowDy">-6.5</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c2d1babc121ec5680f85e9d8b6a8f65f8fefbb6e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+  <!--
+        Base application theme for API 11+. This theme completely replaces
+        AppBaseTheme from res/values/styles.xml on API 11+ devices.
+  -->
+  <style name="AppBaseTheme" parent="android:Theme.Holo.Light">
+    <!-- API 11 theme customizations can go here. -->
+  </style>
+
+  <style name="FullscreenTheme" parent="android:Theme.Holo">
+    <item name="android:actionBarStyle">@style/FullscreenActionBarStyle</item>
+    <item name="android:windowActionBarOverlay">true</item>
+    <item name="android:windowBackground">@null</item>
+    <item name="metaButtonBarStyle">?android:attr/buttonBarStyle</item>
+    <item name="metaButtonBarButtonStyle">?android:attr/buttonBarButtonStyle</item>
+  </style>
+
+  <style name="FullscreenActionBarStyle" parent="android:Widget.Holo.ActionBar">
+    <!--  <item name="android:background">@color/black_overlay</item>  -->
+  </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1ad048439cf1c5207a609d4664674e9a4278ee6c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml
@@ -0,0 +1,22 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Holo.Light" />
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..cc370849c0f90627283345bcfa03d0bb0b40e1b2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml
@@ -0,0 +1,12 @@
+<resources>
+
+  <!--
+        Base application theme for API 14+. This theme completely replaces
+        AppBaseTheme from BOTH res/values/styles.xml and
+        res/values-v11/styles.xml on API 14+ devices.
+  -->
+  <style name="AppBaseTheme" parent="android:Theme.Holo.Light.DarkActionBar">
+    <!-- API 14 theme customizations can go here. -->
+  </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml b/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c16da7c51ceeb3e634c349ff098f86eccb53b8f8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8890d2f4a507e30c28457ea9692f03af5834c82f
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Material.Light">
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/attrs.xml b/tensorflow/contrib/lite/examples/android/res/values/attrs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..56e5beae76f2a148c147d599fe0e02bd78a5f729
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/attrs.xml
@@ -0,0 +1,14 @@
+<resources>
+
+  <!--
+         Declare custom theme attributes that allow changing which styles are
+         used for button bars depending on the API level.
+         ?android:attr/buttonBarStyle is new as of API 11 so this is
+         necessary to support previous API levels.
+  -->
+  <declare-styleable name="ButtonBarContainerTheme">
+    <attr name="metaButtonBarStyle" format="reference" />
+    <attr name="metaButtonBarButtonStyle" format="reference" />
+  </declare-styleable>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml b/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ebc5dc8423ca6a481bbfcfabcbcd66e4367428eb
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+    <string name="app_name">TFLite Demo</string>
+    <string name="activity_name_classification">TFL Classify</string>
+    <string name="activity_name_detection">TFL Detect</string>
+    <string name="activity_name_speech">TFL Speech</string>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/colors.xml b/tensorflow/contrib/lite/examples/android/res/values/colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..584ed6052d4746bce5b60fef0b25633777262a11
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/colors.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <color name="control_background">#cc4285f4</color>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/strings.xml b/tensorflow/contrib/lite/examples/android/res/values/strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ea20ee78e0b99c0ad7f1c315269a7fd5435cff98
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/strings.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <string name="description_info">Info</string>
+    <string name="request_permission">This sample needs camera permission.</string>
+    <string name="camera_error">This device doesn\'t support Camera2 API.</string>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/styles.xml b/tensorflow/contrib/lite/examples/android/res/values/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1d973e9be8c82b68e39f755650efec71d95005
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/styles.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml b/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..069977b6a6f4c9d14ed859d4e8dd95d42f7ce74f
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml
@@ -0,0 +1,32 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Define standard dimensions to comply with Holo-style grids and rhythm. -->
+
+    <dimen name="margin_tiny">4dp</dimen>
+    <dimen name="margin_small">8dp</dimen>
+    <dimen name="margin_medium">16dp</dimen>
+    <dimen name="margin_large">32dp</dimen>
+    <dimen name="margin_huge">64dp</dimen>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_medium</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1b87714a49409a40d3a4649d83f0d0ff0fd57b9d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml
@@ -0,0 +1,42 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+
+    <style name="Theme.Base" parent="android:Theme.Light" />
+
+    <style name="Theme.Sample" parent="Theme.Base" />
+
+    <style name="AppTheme" parent="Theme.Sample" />
+    <!-- Widget styling -->
+
+    <style name="Widget" />
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceMedium</item>
+        <item name="android:lineSpacingMultiplier">1.1</item>
+    </style>
+
+    <style name="Widget.SampleMessageTile">
+        <item name="android:background">@drawable/tile</item>
+        <item name="android:shadowColor">#7F000000</item>
+        <item name="android:shadowDy">-3.5</item>
+        <item name="android:shadowRadius">2</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java
new file mode 100644
index 0000000000000000000000000000000000000000..eff24afdba44e5d56c760c1692df5fc40f5c2f42
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.util.AttributeSet;
+import android.view.TextureView;
+
+/**
+ * A {@link TextureView} that can be adjusted to a specified aspect ratio.
+ */
+public class AutoFitTextureView extends TextureView {
+  private int ratioWidth = 0;
+  private int ratioHeight = 0;
+
+  public AutoFitTextureView(final Context context) {
+    this(context, null);
+  }
+
+  public AutoFitTextureView(final Context context, final AttributeSet attrs) {
+    this(context, attrs, 0);
+  }
+
+  public AutoFitTextureView(final Context context, final AttributeSet attrs, final int defStyle) {
+    super(context, attrs, defStyle);
+  }
+
+  /**
+   * Sets the aspect ratio for this view. The size of the view will be measured based on the ratio
+   * calculated from the parameters. Note that the actual sizes of parameters don't matter, that
+   * is, calling setAspectRatio(2, 3) and setAspectRatio(4, 6) make the same result.
+   *
+   * @param width  Relative horizontal size
+   * @param height Relative vertical size
+   */
+  public void setAspectRatio(final int width, final int height) {
+    if (width < 0 || height < 0) {
+      throw new IllegalArgumentException("Size cannot be negative.");
+    }
+    ratioWidth = width;
+    ratioHeight = height;
+    requestLayout();
+  }
+
+  @Override
+  protected void onMeasure(final int widthMeasureSpec, final int heightMeasureSpec) {
+    super.onMeasure(widthMeasureSpec, heightMeasureSpec);
+    final int width = MeasureSpec.getSize(widthMeasureSpec);
+    final int height = MeasureSpec.getSize(heightMeasureSpec);
+    if (0 == ratioWidth || 0 == ratioHeight) {
+      setMeasuredDimension(width, height);
+    } else {
+      if (width < height * ratioWidth / ratioHeight) {
+        setMeasuredDimension(width, width * ratioHeight / ratioWidth);
+      } else {
+        setMeasuredDimension(height * ratioWidth / ratioHeight, height);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..15d5456f027b27ae3cbb93f736dbb104af0218de
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.Manifest;
+import android.app.Activity;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.pm.PackageManager;
+import android.hardware.Camera;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.Image;
+import android.media.Image.Plane;
+import android.media.ImageReader;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.Build;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Trace;
+import android.util.Size;
+import android.view.KeyEvent;
+import android.view.Surface;
+import android.view.WindowManager;
+import android.widget.Toast;
+import java.nio.ByteBuffer;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public abstract class CameraActivity extends Activity
+    implements OnImageAvailableListener, Camera.PreviewCallback {
+  private static final Logger LOGGER = new Logger();
+
+  private static final int PERMISSIONS_REQUEST = 1;
+
+  private static final String PERMISSION_CAMERA = Manifest.permission.CAMERA;
+  private static final String PERMISSION_STORAGE = Manifest.permission.WRITE_EXTERNAL_STORAGE;
+
+  private boolean debug = false;
+
+  private Handler handler;
+  private HandlerThread handlerThread;
+  private boolean useCamera2API;
+  private boolean isProcessingFrame = false;
+  private byte[][] yuvBytes = new byte[3][];
+  private int[] rgbBytes = null;
+  private int yRowStride;
+
+  protected int previewWidth = 0;
+  protected int previewHeight = 0;
+
+  private Runnable postInferenceCallback;
+  private Runnable imageConverter;
+
+  @Override
+  protected void onCreate(final Bundle savedInstanceState) {
+    LOGGER.d("onCreate " + this);
+    super.onCreate(null);
+    getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+    setContentView(R.layout.activity_camera);
+
+    if (hasPermission()) {
+      setFragment();
+    } else {
+      requestPermission();
+    }
+  }
+
+
+  protected int[] getRgbBytes() {
+    imageConverter.run();
+    return rgbBytes;
+  }
+
+  protected int getLuminanceStride() {
+    return yRowStride;
+  }
+
+  protected byte[] getLuminance() {
+    return yuvBytes[0];
+  }
+
+  /**
+   * Callback for android.hardware.Camera API
+   */
+  @Override
+  public void onPreviewFrame(final byte[] bytes, final Camera camera) {
+    if (isProcessingFrame) {
+      LOGGER.w("Dropping frame!");
+      return;
+    }
+
+    try {
+      // Initialize the storage bitmaps once when the resolution is known.
+      if (rgbBytes == null) {
+        Camera.Size previewSize = camera.getParameters().getPreviewSize();
+        previewHeight = previewSize.height;
+        previewWidth = previewSize.width;
+        rgbBytes = new int[previewWidth * previewHeight];
+        onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90);
+      }
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      return;
+    }
+
+    isProcessingFrame = true;
+    yuvBytes[0] = bytes;
+    yRowStride = previewWidth;
+
+    imageConverter =
+        new Runnable() {
+          @Override
+          public void run() {
+            ImageUtils.convertYUV420SPToARGB8888(bytes, previewWidth, previewHeight, rgbBytes);
+          }
+        };
+
+    postInferenceCallback =
+        new Runnable() {
+          @Override
+          public void run() {
+            camera.addCallbackBuffer(bytes);
+            isProcessingFrame = false;
+          }
+        };
+    processImage();
+  }
+
+  /**
+   * Callback for Camera2 API
+   */
+  @Override
+  public void onImageAvailable(final ImageReader reader) {
+    //We need wait until we have some size from onPreviewSizeChosen
+    if (previewWidth == 0 || previewHeight == 0) {
+      return;
+    }
+    if (rgbBytes == null) {
+      rgbBytes = new int[previewWidth * previewHeight];
+    }
+    try {
+      final Image image = reader.acquireLatestImage();
+
+      if (image == null) {
+        return;
+      }
+
+      if (isProcessingFrame) {
+        image.close();
+        return;
+      }
+      isProcessingFrame = true;
+      Trace.beginSection("imageAvailable");
+      final Plane[] planes = image.getPlanes();
+      fillBytes(planes, yuvBytes);
+      yRowStride = planes[0].getRowStride();
+      final int uvRowStride = planes[1].getRowStride();
+      final int uvPixelStride = planes[1].getPixelStride();
+
+      imageConverter =
+          new Runnable() {
+            @Override
+            public void run() {
+              ImageUtils.convertYUV420ToARGB8888(
+                  yuvBytes[0],
+                  yuvBytes[1],
+                  yuvBytes[2],
+                  previewWidth,
+                  previewHeight,
+                  yRowStride,
+                  uvRowStride,
+                  uvPixelStride,
+                  rgbBytes);
+            }
+          };
+
+      postInferenceCallback =
+          new Runnable() {
+            @Override
+            public void run() {
+              image.close();
+              isProcessingFrame = false;
+            }
+          };
+
+      processImage();
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      Trace.endSection();
+      return;
+    }
+    Trace.endSection();
+  }
+
+  @Override
+  public synchronized void onStart() {
+    LOGGER.d("onStart " + this);
+    super.onStart();
+  }
+
+  @Override
+  public synchronized void onResume() {
+    LOGGER.d("onResume " + this);
+    super.onResume();
+
+    handlerThread = new HandlerThread("inference");
+    handlerThread.start();
+    handler = new Handler(handlerThread.getLooper());
+  }
+
+  @Override
+  public synchronized void onPause() {
+    LOGGER.d("onPause " + this);
+
+    if (!isFinishing()) {
+      LOGGER.d("Requesting finish");
+      finish();
+    }
+
+    handlerThread.quitSafely();
+    try {
+      handlerThread.join();
+      handlerThread = null;
+      handler = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+
+    super.onPause();
+  }
+
+  @Override
+  public synchronized void onStop() {
+    LOGGER.d("onStop " + this);
+    super.onStop();
+  }
+
+  @Override
+  public synchronized void onDestroy() {
+    LOGGER.d("onDestroy " + this);
+    super.onDestroy();
+  }
+
+  protected synchronized void runInBackground(final Runnable r) {
+    if (handler != null) {
+      handler.post(r);
+    }
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      final int requestCode, final String[] permissions, final int[] grantResults) {
+    if (requestCode == PERMISSIONS_REQUEST) {
+      if (grantResults.length > 0
+          && grantResults[0] == PackageManager.PERMISSION_GRANTED
+          && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
+        setFragment();
+      } else {
+        requestPermission();
+      }
+    }
+  }
+
+  private boolean hasPermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      return checkSelfPermission(PERMISSION_CAMERA) == PackageManager.PERMISSION_GRANTED &&
+          checkSelfPermission(PERMISSION_STORAGE) == PackageManager.PERMISSION_GRANTED;
+    } else {
+      return true;
+    }
+  }
+
+  private void requestPermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      if (shouldShowRequestPermissionRationale(PERMISSION_CAMERA) ||
+          shouldShowRequestPermissionRationale(PERMISSION_STORAGE)) {
+        Toast.makeText(CameraActivity.this,
+            "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
+      }
+      requestPermissions(new String[] {PERMISSION_CAMERA, PERMISSION_STORAGE}, PERMISSIONS_REQUEST);
+    }
+  }
+
+  // Returns true if the device supports the required hardware level, or better.
+  private boolean isHardwareLevelSupported(
+      CameraCharacteristics characteristics, int requiredLevel) {
+    int deviceLevel = characteristics.get(CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL);
+    if (deviceLevel == CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_LEGACY) {
+      return requiredLevel == deviceLevel;
+    }
+    // deviceLevel is not LEGACY, can use numerical sort
+    return requiredLevel <= deviceLevel;
+  }
+
+  private String chooseCamera() {
+    final CameraManager manager = (CameraManager) getSystemService(Context.CAMERA_SERVICE);
+    try {
+      for (final String cameraId : manager.getCameraIdList()) {
+        final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+        // We don't use a front facing camera in this sample.
+        final Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
+        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
+          continue;
+        }
+
+        final StreamConfigurationMap map =
+            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+        if (map == null) {
+          continue;
+        }
+
+        // Fallback to camera1 API for internal cameras that don't have full support.
+        // This should help with legacy situations where using the camera2 API causes
+        // distorted or otherwise broken previews.
+        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+            || isHardwareLevelSupported(characteristics, 
+                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        LOGGER.i("Camera API lv2?: %s", useCamera2API);
+        return cameraId;
+      }
+    } catch (CameraAccessException e) {
+      LOGGER.e(e, "Not allowed to access camera");
+    }
+
+    return null;
+  }
+
+  protected void setFragment() {
+    String cameraId = chooseCamera();
+
+    Fragment fragment;
+    if (useCamera2API) {
+      CameraConnectionFragment camera2Fragment =
+          CameraConnectionFragment.newInstance(
+              new CameraConnectionFragment.ConnectionCallback() {
+                @Override
+                public void onPreviewSizeChosen(final Size size, final int rotation) {
+                  previewHeight = size.getHeight();
+                  previewWidth = size.getWidth();
+                  CameraActivity.this.onPreviewSizeChosen(size, rotation);
+                }
+              },
+              this,
+              getLayoutId(),
+              getDesiredPreviewFrameSize());
+
+      camera2Fragment.setCamera(cameraId);
+      fragment = camera2Fragment;
+    } else {
+      fragment =
+          new LegacyCameraConnectionFragment(this, getLayoutId(), getDesiredPreviewFrameSize());
+    }
+
+    getFragmentManager()
+        .beginTransaction()
+        .replace(R.id.container, fragment)
+        .commit();
+  }
+
+  protected void fillBytes(final Plane[] planes, final byte[][] yuvBytes) {
+    // Because of the variable row stride it's not possible to know in
+    // advance the actual necessary dimensions of the yuv planes.
+    for (int i = 0; i < planes.length; ++i) {
+      final ByteBuffer buffer = planes[i].getBuffer();
+      if (yuvBytes[i] == null) {
+        LOGGER.d("Initializing buffer %d at size %d", i, buffer.capacity());
+        yuvBytes[i] = new byte[buffer.capacity()];
+      }
+      buffer.get(yuvBytes[i]);
+    }
+  }
+
+  public boolean isDebug() {
+    return debug;
+  }
+
+  public void requestRender() {
+    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
+    if (overlay != null) {
+      overlay.postInvalidate();
+    }
+  }
+
+  public void addCallback(final OverlayView.DrawCallback callback) {
+    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
+    if (overlay != null) {
+      overlay.addCallback(callback);
+    }
+  }
+
+  public void onSetDebug(final boolean debug) {}
+
+  @Override
+  public boolean onKeyDown(final int keyCode, final KeyEvent event) {
+    if (keyCode == KeyEvent.KEYCODE_VOLUME_DOWN || keyCode == KeyEvent.KEYCODE_VOLUME_UP) {
+      debug = !debug;
+      requestRender();
+      onSetDebug(debug);
+      return true;
+    }
+    return super.onKeyDown(keyCode, event);
+  }
+
+  protected void readyForNextImage() {
+    if (postInferenceCallback != null) {
+      postInferenceCallback.run();
+    }
+  }
+
+  protected int getScreenOrientation() {
+    switch (getWindowManager().getDefaultDisplay().getRotation()) {
+      case Surface.ROTATION_270:
+        return 270;
+      case Surface.ROTATION_180:
+        return 180;
+      case Surface.ROTATION_90:
+        return 90;
+      default:
+        return 0;
+    }
+  }
+
+  protected abstract void processImage();
+
+  protected abstract void onPreviewSizeChosen(final Size size, final int rotation);
+  protected abstract int getLayoutId();
+  protected abstract Size getDesiredPreviewFrameSize();
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..51a1adb538e48cad4807d35f3efc6feefe81309b
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
@@ -0,0 +1,634 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.app.Activity;
+import android.app.AlertDialog;
+import android.app.Dialog;
+import android.app.DialogFragment;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.DialogInterface;
+import android.content.res.Configuration;
+import android.graphics.ImageFormat;
+import android.graphics.Matrix;
+import android.graphics.RectF;
+import android.graphics.SurfaceTexture;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCaptureSession;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraDevice;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.CaptureRequest;
+import android.hardware.camera2.CaptureResult;
+import android.hardware.camera2.TotalCaptureResult;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.ImageReader;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.text.TextUtils;
+import android.util.Size;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import android.widget.Toast;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class CameraConnectionFragment extends Fragment {
+  private static final Logger LOGGER = new Logger();
+
+  /**
+   * The camera preview size will be chosen to be the smallest frame by pixel size capable of
+   * containing a DESIRED_SIZE x DESIRED_SIZE square.
+   */
+  private static final int MINIMUM_PREVIEW_SIZE = 320;
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+  private static final String FRAGMENT_DIALOG = "dialog";
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+          openCamera(width, height);
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {
+          configureTransform(width, height);
+        }
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
+      };
+
+  /**
+   * Callback for Activities to use to initialize their data once the
+   * selected preview size is known.
+   */
+  public interface ConnectionCallback {
+    void onPreviewSizeChosen(Size size, int cameraRotation);
+  }
+
+  /**
+   * ID of the current {@link CameraDevice}.
+   */
+  private String cameraId;
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * A {@link CameraCaptureSession } for camera preview.
+   */
+  private CameraCaptureSession captureSession;
+
+  /**
+   * A reference to the opened {@link CameraDevice}.
+   */
+  private CameraDevice cameraDevice;
+
+  /**
+   * The rotation in degrees of the camera sensor from the display.
+   */
+  private Integer sensorOrientation;
+
+  /**
+   * The {@link android.util.Size} of camera preview.
+   */
+  private Size previewSize;
+
+  /**
+   * {@link android.hardware.camera2.CameraDevice.StateCallback}
+   * is called when {@link CameraDevice} changes its state.
+   */
+  private final CameraDevice.StateCallback stateCallback =
+      new CameraDevice.StateCallback() {
+        @Override
+        public void onOpened(final CameraDevice cd) {
+          // This method is called when the camera is opened.  We start camera preview here.
+          cameraOpenCloseLock.release();
+          cameraDevice = cd;
+          createCameraPreviewSession();
+        }
+
+        @Override
+        public void onDisconnected(final CameraDevice cd) {
+          cameraOpenCloseLock.release();
+          cd.close();
+          cameraDevice = null;
+        }
+
+        @Override
+        public void onError(final CameraDevice cd, final int error) {
+          cameraOpenCloseLock.release();
+          cd.close();
+          cameraDevice = null;
+          final Activity activity = getActivity();
+          if (null != activity) {
+            activity.finish();
+          }
+        }
+      };
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  /**
+   * A {@link Handler} for running tasks in the background.
+   */
+  private Handler backgroundHandler;
+
+  /**
+   * An {@link ImageReader} that handles preview frame capture.
+   */
+  private ImageReader previewReader;
+
+  /**
+   * {@link android.hardware.camera2.CaptureRequest.Builder} for the camera preview
+   */
+  private CaptureRequest.Builder previewRequestBuilder;
+
+  /**
+   * {@link CaptureRequest} generated by {@link #previewRequestBuilder}
+   */
+  private CaptureRequest previewRequest;
+
+  /**
+   * A {@link Semaphore} to prevent the app from exiting before closing the camera.
+   */
+  private final Semaphore cameraOpenCloseLock = new Semaphore(1);
+
+  /**
+   * A {@link OnImageAvailableListener} to receive frames as they are available.
+   */
+  private final OnImageAvailableListener imageListener;
+
+  /** The input size in pixels desired by TensorFlow (width and height of a square bitmap). */
+  private final Size inputSize;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private final int layout;
+
+
+  private final ConnectionCallback cameraConnectionCallback;
+
+  private CameraConnectionFragment(
+      final ConnectionCallback connectionCallback,
+      final OnImageAvailableListener imageListener,
+      final int layout,
+      final Size inputSize) {
+    this.cameraConnectionCallback = connectionCallback;
+    this.imageListener = imageListener;
+    this.layout = layout;
+    this.inputSize = inputSize;
+  }
+
+  /**
+   * Shows a {@link Toast} on the UI thread.
+   *
+   * @param text The message to show
+   */
+  private void showToast(final String text) {
+    final Activity activity = getActivity();
+    if (activity != null) {
+      activity.runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              Toast.makeText(activity, text, Toast.LENGTH_SHORT).show();
+            }
+          });
+    }
+  }
+
+  /**
+   * Given {@code choices} of {@code Size}s supported by a camera, chooses the smallest one whose
+   * width and height are at least as large as the minimum of both, or an exact match if possible.
+   *
+   * @param choices The list of sizes that the camera supports for the intended output class
+   * @param width The minimum desired width
+   * @param height The minimum desired height
+   * @return The optimal {@code Size}, or an arbitrary one if none were big enough
+   */
+  protected static Size chooseOptimalSize(final Size[] choices, final int width, final int height) {
+    final int minSize = Math.max(Math.min(width, height), MINIMUM_PREVIEW_SIZE);
+    final Size desiredSize = new Size(width, height);
+
+    // Collect the supported resolutions that are at least as big as the preview Surface
+    boolean exactSizeFound = false;
+    final List<Size> bigEnough = new ArrayList<Size>();
+    final List<Size> tooSmall = new ArrayList<Size>();
+    for (final Size option : choices) {
+      if (option.equals(desiredSize)) {
+        // Set the size but don't return yet so that remaining sizes will still be logged.
+        exactSizeFound = true;
+      }
+
+      if (option.getHeight() >= minSize && option.getWidth() >= minSize) {
+        bigEnough.add(option);
+      } else {
+        tooSmall.add(option);
+      }
+    }
+
+    LOGGER.i("Desired size: " + desiredSize + ", min size: " + minSize + "x" + minSize);
+    LOGGER.i("Valid preview sizes: [" + TextUtils.join(", ", bigEnough) + "]");
+    LOGGER.i("Rejected preview sizes: [" + TextUtils.join(", ", tooSmall) + "]");
+
+    if (exactSizeFound) {
+      LOGGER.i("Exact size match found.");
+      return desiredSize;
+    }
+
+    // Pick the smallest of those, assuming we found any
+    if (bigEnough.size() > 0) {
+      final Size chosenSize = Collections.min(bigEnough, new CompareSizesByArea());
+      LOGGER.i("Chosen size: " + chosenSize.getWidth() + "x" + chosenSize.getHeight());
+      return chosenSize;
+    } else {
+      LOGGER.e("Couldn't find any suitable preview size");
+      return choices[0];
+    }
+  }
+
+  public static CameraConnectionFragment newInstance(
+      final ConnectionCallback callback,
+      final OnImageAvailableListener imageListener,
+      final int layout,
+      final Size inputSize) {
+    return new CameraConnectionFragment(callback, imageListener, layout, inputSize);
+  }
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+    if (textureView.isAvailable()) {
+      openCamera(textureView.getWidth(), textureView.getHeight());
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    closeCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  public void setCamera(String cameraId) {
+    this.cameraId = cameraId;
+  }
+
+  /**
+   * Sets up member variables related to camera.
+   */
+  private void setUpCameraOutputs() {
+    final Activity activity = getActivity();
+    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+      final StreamConfigurationMap map =
+          characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+      sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
+
+      // Danger, W.R.! Attempting to use too large a preview size could  exceed the camera
+      // bus' bandwidth limitation, resulting in gorgeous previews but the storage of
+      // garbage capture data.
+      previewSize =
+          chooseOptimalSize(map.getOutputSizes(SurfaceTexture.class),
+              inputSize.getWidth(),
+              inputSize.getHeight());
+
+      // We fit the aspect ratio of TextureView to the size of preview we picked.
+      final int orientation = getResources().getConfiguration().orientation;
+      if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
+        textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
+      } else {
+        textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
+      }
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    } catch (final NullPointerException e) {
+      // Currently an NPE is thrown when the Camera2API is used but not supported on the
+      // device this code runs.
+      // TODO(andrewharp): abstract ErrorDialog/RuntimeException handling out into new method and
+      // reuse throughout app.
+      ErrorDialog.newInstance(getString(R.string.camera_error))
+          .show(getChildFragmentManager(), FRAGMENT_DIALOG);
+      throw new RuntimeException(getString(R.string.camera_error));
+    }
+
+    cameraConnectionCallback.onPreviewSizeChosen(previewSize, sensorOrientation);
+  }
+
+  /**
+   * Opens the camera specified by {@link CameraConnectionFragment#cameraId}.
+   */
+  private void openCamera(final int width, final int height) {
+    setUpCameraOutputs();
+    configureTransform(width, height);
+    final Activity activity = getActivity();
+    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      if (!cameraOpenCloseLock.tryAcquire(2500, TimeUnit.MILLISECONDS)) {
+        throw new RuntimeException("Time out waiting to lock camera opening.");
+      }
+      manager.openCamera(cameraId, stateCallback, backgroundHandler);
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    } catch (final InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
+    }
+  }
+
+  /**
+   * Closes the current {@link CameraDevice}.
+   */
+  private void closeCamera() {
+    try {
+      cameraOpenCloseLock.acquire();
+      if (null != captureSession) {
+        captureSession.close();
+        captureSession = null;
+      }
+      if (null != cameraDevice) {
+        cameraDevice.close();
+        cameraDevice = null;
+      }
+      if (null != previewReader) {
+        previewReader.close();
+        previewReader = null;
+      }
+    } catch (final InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera closing.", e);
+    } finally {
+      cameraOpenCloseLock.release();
+    }
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("ImageListener");
+    backgroundThread.start();
+    backgroundHandler = new Handler(backgroundThread.getLooper());
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+      backgroundHandler = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  private final CameraCaptureSession.CaptureCallback captureCallback =
+      new CameraCaptureSession.CaptureCallback() {
+        @Override
+        public void onCaptureProgressed(
+            final CameraCaptureSession session,
+            final CaptureRequest request,
+            final CaptureResult partialResult) {}
+
+        @Override
+        public void onCaptureCompleted(
+            final CameraCaptureSession session,
+            final CaptureRequest request,
+            final TotalCaptureResult result) {}
+      };
+
+  /**
+   * Creates a new {@link CameraCaptureSession} for camera preview.
+   */
+  private void createCameraPreviewSession() {
+    try {
+      final SurfaceTexture texture = textureView.getSurfaceTexture();
+      assert texture != null;
+
+      // We configure the size of default buffer to be the size of camera preview we want.
+      texture.setDefaultBufferSize(previewSize.getWidth(), previewSize.getHeight());
+
+      // This is the output Surface we need to start preview.
+      final Surface surface = new Surface(texture);
+
+      // We set up a CaptureRequest.Builder with the output Surface.
+      previewRequestBuilder = cameraDevice.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+      previewRequestBuilder.addTarget(surface);
+
+      LOGGER.i("Opening camera preview: " + previewSize.getWidth() + "x" + previewSize.getHeight());
+
+      // Create the reader for the preview frames.
+      previewReader =
+          ImageReader.newInstance(
+              previewSize.getWidth(), previewSize.getHeight(), ImageFormat.YUV_420_888, 2);
+
+      previewReader.setOnImageAvailableListener(imageListener, backgroundHandler);
+      previewRequestBuilder.addTarget(previewReader.getSurface());
+
+      // Here, we create a CameraCaptureSession for camera preview.
+      cameraDevice.createCaptureSession(
+          Arrays.asList(surface, previewReader.getSurface()),
+          new CameraCaptureSession.StateCallback() {
+
+            @Override
+            public void onConfigured(final CameraCaptureSession cameraCaptureSession) {
+              // The camera is already closed
+              if (null == cameraDevice) {
+                return;
+              }
+
+              // When the session is ready, we start displaying the preview.
+              captureSession = cameraCaptureSession;
+              try {
+                // Auto focus should be continuous for camera preview.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AF_MODE,
+                    CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+                // Flash is automatically enabled when necessary.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AE_MODE, CaptureRequest.CONTROL_AE_MODE_ON_AUTO_FLASH);
+
+                // Finally, we start displaying the camera preview.
+                previewRequest = previewRequestBuilder.build();
+                captureSession.setRepeatingRequest(
+                    previewRequest, captureCallback, backgroundHandler);
+              } catch (final CameraAccessException e) {
+                LOGGER.e(e, "Exception!");
+              }
+            }
+
+            @Override
+            public void onConfigureFailed(final CameraCaptureSession cameraCaptureSession) {
+              showToast("Failed");
+            }
+          },
+          null);
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  /**
+   * Configures the necessary {@link android.graphics.Matrix} transformation to `mTextureView`.
+   * This method should be called after the camera preview size is determined in
+   * setUpCameraOutputs and also the size of `mTextureView` is fixed.
+   *
+   * @param viewWidth  The width of `mTextureView`
+   * @param viewHeight The height of `mTextureView`
+   */
+  private void configureTransform(final int viewWidth, final int viewHeight) {
+    final Activity activity = getActivity();
+    if (null == textureView || null == previewSize || null == activity) {
+      return;
+    }
+    final int rotation = activity.getWindowManager().getDefaultDisplay().getRotation();
+    final Matrix matrix = new Matrix();
+    final RectF viewRect = new RectF(0, 0, viewWidth, viewHeight);
+    final RectF bufferRect = new RectF(0, 0, previewSize.getHeight(), previewSize.getWidth());
+    final float centerX = viewRect.centerX();
+    final float centerY = viewRect.centerY();
+    if (Surface.ROTATION_90 == rotation || Surface.ROTATION_270 == rotation) {
+      bufferRect.offset(centerX - bufferRect.centerX(), centerY - bufferRect.centerY());
+      matrix.setRectToRect(viewRect, bufferRect, Matrix.ScaleToFit.FILL);
+      final float scale =
+          Math.max(
+              (float) viewHeight / previewSize.getHeight(),
+              (float) viewWidth / previewSize.getWidth());
+      matrix.postScale(scale, scale, centerX, centerY);
+      matrix.postRotate(90 * (rotation - 2), centerX, centerY);
+    } else if (Surface.ROTATION_180 == rotation) {
+      matrix.postRotate(180, centerX, centerY);
+    }
+    textureView.setTransform(matrix);
+  }
+
+  /**
+   * Compares two {@code Size}s based on their areas.
+   */
+  static class CompareSizesByArea implements Comparator<Size> {
+    @Override
+    public int compare(final Size lhs, final Size rhs) {
+      // We cast here to ensure the multiplications won't overflow
+      return Long.signum(
+          (long) lhs.getWidth() * lhs.getHeight() - (long) rhs.getWidth() * rhs.getHeight());
+    }
+  }
+
+  /**
+   * Shows an error message dialog.
+   */
+  public static class ErrorDialog extends DialogFragment {
+    private static final String ARG_MESSAGE = "message";
+
+    public static ErrorDialog newInstance(final String message) {
+      final ErrorDialog dialog = new ErrorDialog();
+      final Bundle args = new Bundle();
+      args.putString(ARG_MESSAGE, message);
+      dialog.setArguments(args);
+      return dialog;
+    }
+
+    @Override
+    public Dialog onCreateDialog(final Bundle savedInstanceState) {
+      final Activity activity = getActivity();
+      return new AlertDialog.Builder(activity)
+          .setMessage(getArguments().getString(ARG_MESSAGE))
+          .setPositiveButton(
+              android.R.string.ok,
+              new DialogInterface.OnClickListener() {
+                @Override
+                public void onClick(final DialogInterface dialogInterface, final int i) {
+                  activity.finish();
+                }
+              })
+          .create();
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..07995febaf5caab65dd4dfcc262ccf3750cfa303
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java
@@ -0,0 +1,107 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.RectF;
+import java.util.List;
+
+/**
+ * Generic interface for interacting with different recognition engines.
+ */
+public interface Classifier {
+  /**
+   * An immutable result returned by a Classifier describing what was recognized.
+   */
+  public class Recognition {
+    /**
+     * A unique identifier for what has been recognized. Specific to the class, not the instance of
+     * the object.
+     */
+    private final String id;
+
+    /**
+     * Display name for the recognition.
+     */
+    private final String title;
+
+    /**
+     * A sortable score for how good the recognition is relative to others. Higher should be better.
+     */
+    private final Float confidence;
+
+    /** Optional location within the source image for the location of the recognized object. */
+    private RectF location;
+
+    public Recognition(
+        final String id, final String title, final Float confidence, final RectF location) {
+      this.id = id;
+      this.title = title;
+      this.confidence = confidence;
+      this.location = location;
+    }
+
+    public String getId() {
+      return id;
+    }
+
+    public String getTitle() {
+      return title;
+    }
+
+    public Float getConfidence() {
+      return confidence;
+    }
+
+    public RectF getLocation() {
+      return new RectF(location);
+    }
+
+    public void setLocation(RectF location) {
+      this.location = location;
+    }
+
+    @Override
+    public String toString() {
+      String resultString = "";
+      if (id != null) {
+        resultString += "[" + id + "] ";
+      }
+
+      if (title != null) {
+        resultString += title + " ";
+      }
+
+      if (confidence != null) {
+        resultString += String.format("(%.1f%%) ", confidence * 100.0f);
+      }
+
+      if (location != null) {
+        resultString += location + " ";
+      }
+
+      return resultString.trim();
+    }
+  }
+
+  List<Recognition> recognizeImage(Bitmap bitmap);
+
+  void enableStatLogging(final boolean debug);
+
+  String getStatString();
+
+  void close();
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..dcbbefbeab6627b37579902cd25841c0ae257dda
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.Bitmap.Config;
+import android.graphics.Canvas;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Typeface;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.SystemClock;
+import android.util.Size;
+import android.util.TypedValue;
+import java.util.List;
+import java.util.Vector;
+import org.tensorflow.demo.OverlayView.DrawCallback;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class ClassifierActivity extends CameraActivity implements OnImageAvailableListener {
+  private static final Logger LOGGER = new Logger();
+
+  protected static final boolean SAVE_PREVIEW_BITMAP = false;
+
+  private ResultsView resultsView;
+
+  private Bitmap rgbFrameBitmap = null;
+  private Bitmap croppedBitmap = null;
+  private Bitmap cropCopyBitmap = null;
+
+  private long lastProcessingTimeMs;
+
+  // These are the settings for the original v1 Inception model. If you want to
+  // use a model that's been produced from the TensorFlow for Poets codelab,
+  // you'll need to set IMAGE_SIZE = 299, IMAGE_MEAN = 128, IMAGE_STD = 128,
+  // INPUT_NAME = "Mul", and OUTPUT_NAME = "final_result".
+  // You'll also need to update the MODEL_FILE and LABEL_FILE paths to point to
+  // the ones you produced.
+  //
+  // To use v3 Inception model, strip the DecodeJpeg Op from your retrained
+  // model first:
+  //
+  // python strip_unused.py \
+  // --input_graph=<retrained-pb-file> \
+  // --output_graph=<your-stripped-pb-file> \
+  // --input_node_names="Mul" \
+  // --output_node_names="final_result" \
+  // --input_binary=true
+  private static final int INPUT_SIZE = 224;
+
+  private static final String MODEL_FILE = "mobilenet_quant_v1_224.tflite";
+  private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
+
+  private static final boolean MAINTAIN_ASPECT = true;
+
+  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
+
+
+  private Integer sensorOrientation;
+  private Classifier classifier;
+  private Matrix frameToCropTransform;
+  private Matrix cropToFrameTransform;
+
+  private BorderedText borderedText;
+
+  @Override
+  protected int getLayoutId() {
+    return R.layout.camera_connection_fragment;
+  }
+
+  @Override
+  protected Size getDesiredPreviewFrameSize() {
+    return DESIRED_PREVIEW_SIZE;
+  }
+
+  private static final float TEXT_SIZE_DIP = 10;
+
+  @Override
+  public void onPreviewSizeChosen(final Size size, final int rotation) {
+    final float textSizePx = TypedValue.applyDimension(
+        TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+    borderedText.setTypeface(Typeface.MONOSPACE);
+
+    classifier = TFLiteImageClassifier.create(getAssets(), MODEL_FILE, LABEL_FILE, INPUT_SIZE);
+
+    previewWidth = size.getWidth();
+    previewHeight = size.getHeight();
+
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
+
+    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
+    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+    croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888);
+
+    frameToCropTransform = ImageUtils.getTransformationMatrix(
+        previewWidth, previewHeight,
+        INPUT_SIZE, INPUT_SIZE,
+        sensorOrientation, MAINTAIN_ASPECT);
+
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+
+    addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            renderDebug(canvas);
+          }
+        });
+  }
+
+  @Override
+  protected void processImage() {
+    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+    final Canvas canvas = new Canvas(croppedBitmap);
+    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+
+    // For examining the actual TF input.
+    if (SAVE_PREVIEW_BITMAP) {
+      ImageUtils.saveBitmap(croppedBitmap);
+    }
+    runInBackground(
+        new Runnable() {
+          @Override
+          public void run() {
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = classifier.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+            LOGGER.i("Detect: %s", results);
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            if (resultsView == null) {
+              resultsView = (ResultsView) findViewById(R.id.results);
+            }
+            resultsView.setResults(results);
+            requestRender();
+            readyForNextImage();
+          }
+        });
+  }
+
+  @Override
+  public void onSetDebug(boolean debug) {
+    classifier.enableStatLogging(debug);
+  }
+
+  private void renderDebug(final Canvas canvas) {
+    if (!isDebug()) {
+      return;
+    }
+    final Bitmap copy = cropCopyBitmap;
+    if (copy != null) {
+      final Matrix matrix = new Matrix();
+      final float scaleFactor = 2;
+      matrix.postScale(scaleFactor, scaleFactor);
+      matrix.postTranslate(
+          canvas.getWidth() - copy.getWidth() * scaleFactor,
+          canvas.getHeight() - copy.getHeight() * scaleFactor);
+      canvas.drawBitmap(copy, matrix, new Paint());
+
+      final Vector<String> lines = new Vector<String>();
+      if (classifier != null) {
+        String statString = classifier.getStatString();
+        String[] statLines = statString.split("\n");
+        for (String line : statLines) {
+          lines.add(line);
+        }
+      }
+
+      lines.add("Frame: " + previewWidth + "x" + previewHeight);
+      lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
+      lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
+      lines.add("Rotation: " + sensorOrientation);
+      lines.add("Inference time: " + lastProcessingTimeMs + "ms");
+
+      borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..de997e454a1e33254cb7c2c932ca79d0072539fa
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.Bitmap.Config;
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Paint.Style;
+import android.graphics.RectF;
+import android.graphics.Typeface;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.SystemClock;
+import android.util.Size;
+import android.util.TypedValue;
+import android.widget.Toast;
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Vector;
+import org.tensorflow.demo.OverlayView.DrawCallback;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.demo.tracking.MultiBoxTracker;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+/**
+ * An activity that uses a TensorFlowMultiBoxDetector and ObjectTracker to detect and then track
+ * objects.
+ */
+public class DetectorActivity extends CameraActivity implements OnImageAvailableListener {
+  private static final Logger LOGGER = new Logger();
+
+  // Configuration values for the prepackaged SSD model.
+  private static final int TF_OD_API_INPUT_SIZE = 300;
+  private static final String TF_OD_API_MODEL_FILE = "mobilenet_ssd.tflite";
+  private static final String TF_OD_API_LABELS_FILE = "file:///android_asset/coco_labels_list.txt";
+
+  // Which detection model to use: by default uses Tensorflow Object Detection API frozen
+  // checkpoints.
+  private enum DetectorMode {
+    TF_OD_API;
+  }
+
+  private static final DetectorMode MODE = DetectorMode.TF_OD_API;
+
+  // Minimum detection confidence to track a detection.
+  private static final float MINIMUM_CONFIDENCE_TF_OD_API = 0.6f;
+
+  private static final boolean MAINTAIN_ASPECT = false;
+
+  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
+
+  private static final boolean SAVE_PREVIEW_BITMAP = false;
+  private static final float TEXT_SIZE_DIP = 10;
+
+  private Integer sensorOrientation;
+
+  private Classifier detector;
+
+  private long lastProcessingTimeMs;
+  private Bitmap rgbFrameBitmap = null;
+  private Bitmap croppedBitmap = null;
+  private Bitmap cropCopyBitmap = null;
+
+  private boolean computingDetection = false;
+
+  private long timestamp = 0;
+
+  private Matrix frameToCropTransform;
+  private Matrix cropToFrameTransform;
+
+  private MultiBoxTracker tracker;
+
+  private byte[] luminanceCopy;
+
+  private BorderedText borderedText;
+  @Override
+  public void onPreviewSizeChosen(final Size size, final int rotation) {
+    final float textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+    borderedText.setTypeface(Typeface.MONOSPACE);
+
+    tracker = new MultiBoxTracker(this);
+
+    int cropSize = TF_OD_API_INPUT_SIZE;
+
+    try {
+      detector =
+          TFLiteObjectDetectionAPIModel.create(
+              getAssets(), TF_OD_API_MODEL_FILE, TF_OD_API_LABELS_FILE, TF_OD_API_INPUT_SIZE);
+      cropSize = TF_OD_API_INPUT_SIZE;
+    } catch (final IOException e) {
+      LOGGER.e("Exception initializing classifier!", e);
+      Toast toast =
+          Toast.makeText(
+              getApplicationContext(), "Classifier could not be initialized", Toast.LENGTH_SHORT);
+      toast.show();
+      finish();
+    }
+
+
+    previewWidth = size.getWidth();
+    previewHeight = size.getHeight();
+
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
+
+    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
+    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+    croppedBitmap = Bitmap.createBitmap(cropSize, cropSize, Config.ARGB_8888);
+
+    frameToCropTransform =
+        ImageUtils.getTransformationMatrix(
+            previewWidth, previewHeight,
+            cropSize, cropSize,
+            sensorOrientation, MAINTAIN_ASPECT);
+
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+
+    trackingOverlay = (OverlayView) findViewById(R.id.tracking_overlay);
+    trackingOverlay.addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            tracker.draw(canvas);
+            if (isDebug()) {
+              tracker.drawDebug(canvas);
+            }
+          }
+        });
+
+    addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            if (!isDebug()) {
+              return;
+            }
+            final Bitmap copy = cropCopyBitmap;
+            if (copy == null) {
+              return;
+            }
+
+            final int backgroundColor = Color.argb(100, 0, 0, 0);
+            canvas.drawColor(backgroundColor);
+
+            final Matrix matrix = new Matrix();
+            final float scaleFactor = 2;
+            matrix.postScale(scaleFactor, scaleFactor);
+            matrix.postTranslate(
+                canvas.getWidth() - copy.getWidth() * scaleFactor,
+                canvas.getHeight() - copy.getHeight() * scaleFactor);
+            canvas.drawBitmap(copy, matrix, new Paint());
+
+            final Vector<String> lines = new Vector<String>();
+            if (detector != null) {
+              final String statString = detector.getStatString();
+              final String[] statLines = statString.split("\n");
+              for (final String line : statLines) {
+                lines.add(line);
+              }
+            }
+            lines.add("");
+
+            lines.add("Frame: " + previewWidth + "x" + previewHeight);
+            lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
+            lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
+            lines.add("Rotation: " + sensorOrientation);
+            lines.add("Inference time: " + lastProcessingTimeMs + "ms");
+
+            borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
+          }
+        });
+  }
+
+  OverlayView trackingOverlay;
+
+  @Override
+  protected void processImage() {
+    ++timestamp;
+    final long currTimestamp = timestamp;
+    byte[] originalLuminance = getLuminance();
+    tracker.onFrame(
+        previewWidth,
+        previewHeight,
+        getLuminanceStride(),
+        sensorOrientation,
+        originalLuminance,
+        timestamp);
+    trackingOverlay.postInvalidate();
+
+    // No mutex needed as this method is not reentrant.
+    if (computingDetection) {
+      readyForNextImage();
+      return;
+    }
+    computingDetection = true;
+    LOGGER.i("Preparing image " + currTimestamp + " for detection in bg thread.");
+
+    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+
+    if (luminanceCopy == null) {
+      luminanceCopy = new byte[originalLuminance.length];
+    }
+    System.arraycopy(originalLuminance, 0, luminanceCopy, 0, originalLuminance.length);
+    readyForNextImage();
+
+    final Canvas canvas = new Canvas(croppedBitmap);
+    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+    // For examining the actual TF input.
+    if (SAVE_PREVIEW_BITMAP) {
+      ImageUtils.saveBitmap(croppedBitmap);
+    }
+
+    runInBackground(
+        new Runnable() {
+          @Override
+          public void run() {
+            LOGGER.i("Running detection on image " + currTimestamp);
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = detector.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            final Canvas canvas = new Canvas(cropCopyBitmap);
+            final Paint paint = new Paint();
+            paint.setColor(Color.RED);
+            paint.setStyle(Style.STROKE);
+            paint.setStrokeWidth(2.0f);
+
+            float minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
+            switch (MODE) {
+              case TF_OD_API:
+                minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
+                break;
+            }
+
+            final List<Classifier.Recognition> mappedRecognitions =
+                new LinkedList<Classifier.Recognition>();
+
+            for (final Classifier.Recognition result : results) {
+              final RectF location = result.getLocation();
+              if (location != null && result.getConfidence() >= minimumConfidence) {
+                canvas.drawRect(location, paint);
+
+                cropToFrameTransform.mapRect(location);
+                result.setLocation(location);
+                mappedRecognitions.add(result);
+              }
+            }
+
+            tracker.trackResults(mappedRecognitions, luminanceCopy, currTimestamp);
+            trackingOverlay.postInvalidate();
+
+            requestRender();
+            computingDetection = false;
+          }
+        });
+  }
+
+  @Override
+  protected int getLayoutId() {
+    return R.layout.camera_connection_fragment_tracking;
+  }
+
+  @Override
+  protected Size getDesiredPreviewFrameSize() {
+    return DESIRED_PREVIEW_SIZE;
+  }
+
+  @Override
+  public void onSetDebug(final boolean debug) {
+    detector.enableStatLogging(debug);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..fd830297533bb8366e008a44a32255788d5e1ea6
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -0,0 +1,216 @@
+package org.tensorflow.demo;
+
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import android.app.Fragment;
+import android.graphics.SurfaceTexture;
+import android.hardware.Camera;
+import android.hardware.Camera.CameraInfo;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.util.Size;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import java.io.IOException;
+import java.util.List;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class LegacyCameraConnectionFragment extends Fragment {
+  private Camera camera;
+  private static final Logger LOGGER = new Logger();
+  private Camera.PreviewCallback imageListener;
+  private Size desiredSize;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private int layout;
+
+  public LegacyCameraConnectionFragment(
+      final Camera.PreviewCallback imageListener, final int layout, final Size desiredSize) {
+    this.imageListener = imageListener;
+    this.layout = layout;
+    this.desiredSize = desiredSize;
+  }
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+
+          int index = getCameraId();
+          camera = Camera.open(index);
+
+          try {
+            Camera.Parameters parameters = camera.getParameters();
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
+            List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
+            Size[] sizes = new Size[cameraSizes.size()];
+            int i = 0;
+            for (Camera.Size size : cameraSizes) {
+              sizes[i++] = new Size(size.width, size.height);
+            }
+            Size previewSize =
+                CameraConnectionFragment.chooseOptimalSize(
+                    sizes, desiredSize.getWidth(), desiredSize.getHeight());
+            parameters.setPreviewSize(previewSize.getWidth(), previewSize.getHeight());
+            camera.setDisplayOrientation(90);
+            camera.setParameters(parameters);
+            camera.setPreviewTexture(texture);
+          } catch (IOException exception) {
+            camera.release();
+          }
+
+          camera.setPreviewCallbackWithBuffer(imageListener);
+          Camera.Size s = camera.getParameters().getPreviewSize();
+          camera.addCallbackBuffer(new byte[ImageUtils.getYUVByteSize(s.height, s.width)]);
+
+          textureView.setAspectRatio(s.height, s.width);
+
+          camera.startPreview();
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {}
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
+      };
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+
+    if (textureView.isAvailable()) {
+      camera.startPreview();
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    stopCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("CameraBackground");
+    backgroundThread.start();
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  protected void stopCamera() {
+    if (camera != null) {
+      camera.stopPreview();
+      camera.setPreviewCallback(null);
+      camera.release();
+      camera = null;
+    }
+  }
+
+  private int getCameraId() {
+    CameraInfo ci = new CameraInfo();
+    for (int i = 0; i < Camera.getNumberOfCameras(); i++) {
+      Camera.getCameraInfo(i, ci);
+      if (ci.facing == CameraInfo.CAMERA_FACING_BACK)
+        return i;
+    }
+    return -1; // No camera found
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java
new file mode 100644
index 0000000000000000000000000000000000000000..0f8d109fb46d769d0ada9c9daa6292a80470be8a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java
@@ -0,0 +1,52 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.util.AttributeSet;
+import android.view.View;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A simple View providing a render callback to other classes.
+ */
+public class OverlayView extends View {
+  private final List<DrawCallback> callbacks = new LinkedList<DrawCallback>();
+
+  public OverlayView(final Context context, final AttributeSet attrs) {
+    super(context, attrs);
+  }
+
+  /**
+   * Interface defining the callback for client classes.
+   */
+  public interface DrawCallback {
+    public void drawCallback(final Canvas canvas);
+  }
+
+  public void addCallback(final DrawCallback callback) {
+    callbacks.add(callback);
+  }
+
+  @Override
+  public synchronized void draw(final Canvas canvas) {
+    for (final DrawCallback callback : callbacks) {
+      callback.drawCallback(canvas);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java
new file mode 100644
index 0000000000000000000000000000000000000000..31a4b07c8387bf0b1da9e967f37628d0ce642dc4
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java
@@ -0,0 +1,67 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.graphics.Paint;
+import android.util.AttributeSet;
+import android.util.TypedValue;
+import android.view.View;
+import java.util.List;
+import org.tensorflow.demo.Classifier.Recognition;
+
+public class RecognitionScoreView extends View implements ResultsView {
+  private static final float TEXT_SIZE_DIP = 24;
+  private List<Recognition> results;
+  private final float textSizePx;
+  private final Paint fgPaint;
+  private final Paint bgPaint;
+
+  public RecognitionScoreView(final Context context, final AttributeSet set) {
+    super(context, set);
+
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    fgPaint = new Paint();
+    fgPaint.setTextSize(textSizePx);
+
+    bgPaint = new Paint();
+    bgPaint.setColor(0xcc4285f4);
+  }
+
+  @Override
+  public void setResults(final List<Recognition> results) {
+    this.results = results;
+    postInvalidate();
+  }
+
+  @Override
+  public void onDraw(final Canvas canvas) {
+    final int x = 10;
+    int y = (int) (fgPaint.getTextSize() * 1.5f);
+
+    canvas.drawPaint(bgPaint);
+
+    if (results != null) {
+      for (final Recognition recog : results) {
+        canvas.drawText(recog.getTitle() + ": " + recog.getConfidence(), x, y, fgPaint);
+        y += (int) (fgPaint.getTextSize() * 1.5f);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java
new file mode 100644
index 0000000000000000000000000000000000000000..9e91aea7efc8c1aea00913ba863eb57f0692343a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.util.Log;
+import android.util.Pair;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.List;
+
+/** Reads in results from an instantaneous audio recognition model and smoothes them over time. */
+public class RecognizeCommands {
+  // Configuration settings.
+  private List<String> labels = new ArrayList<String>();
+  private long averageWindowDurationMs;
+  private float detectionThreshold;
+  private int suppressionMs;
+  private int minimumCount;
+  private long minimumTimeBetweenSamplesMs;
+
+  // Working variables.
+  private Deque<Pair<Long, float[]>> previousResults = new ArrayDeque<Pair<Long, float[]>>();
+  private String previousTopLabel;
+  private int labelsCount;
+  private long previousTopLabelTime;
+  private float previousTopLabelScore;
+
+  private static final String SILENCE_LABEL = "_silence_";
+  private static final long MINIMUM_TIME_FRACTION = 4;
+
+  public RecognizeCommands(
+      List<String> inLabels,
+      long inAverageWindowDurationMs,
+      float inDetectionThreshold,
+      int inSuppressionMS,
+      int inMinimumCount,
+      long inMinimumTimeBetweenSamplesMS) {
+    labels = inLabels;
+    averageWindowDurationMs = inAverageWindowDurationMs;
+    detectionThreshold = inDetectionThreshold;
+    suppressionMs = inSuppressionMS;
+    minimumCount = inMinimumCount;
+    labelsCount = inLabels.size();
+    previousTopLabel = SILENCE_LABEL;
+    previousTopLabelTime = Long.MIN_VALUE;
+    previousTopLabelScore = 0.0f;
+    minimumTimeBetweenSamplesMs = inMinimumTimeBetweenSamplesMS;
+  }
+
+  /** Holds information about what's been recognized. */
+  public static class RecognitionResult {
+    public final String foundCommand;
+    public final float score;
+    public final boolean isNewCommand;
+
+    public RecognitionResult(String inFoundCommand, float inScore, boolean inIsNewCommand) {
+      foundCommand = inFoundCommand;
+      score = inScore;
+      isNewCommand = inIsNewCommand;
+    }
+  }
+
+  private static class ScoreForSorting implements Comparable<ScoreForSorting> {
+    public final float score;
+    public final int index;
+
+    public ScoreForSorting(float inScore, int inIndex) {
+      score = inScore;
+      index = inIndex;
+    }
+
+    @Override
+    public int compareTo(ScoreForSorting other) {
+      if (this.score > other.score) {
+        return -1;
+      } else if (this.score < other.score) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  public RecognitionResult processLatestResults(float[] currentResults, long currentTimeMS) {
+    if (currentResults.length != labelsCount) {
+      throw new RuntimeException(
+          "The results for recognition should contain "
+              + labelsCount
+              + " elements, but there are "
+              + currentResults.length);
+    }
+
+    if ((!previousResults.isEmpty()) && (currentTimeMS < previousResults.getFirst().first)) {
+      throw new RuntimeException(
+          "You must feed results in increasing time order, but received a timestamp of "
+              + currentTimeMS
+              + " that was earlier than the previous one of "
+              + previousResults.getFirst().first);
+    }
+
+    final int howManyResults = previousResults.size();
+    // Ignore any results that are coming in too frequently.
+    if (howManyResults > 1) {
+      final long timeSinceMostRecent = currentTimeMS - previousResults.getLast().first;
+      if (timeSinceMostRecent < minimumTimeBetweenSamplesMs) {
+        return new RecognitionResult(previousTopLabel, previousTopLabelScore, false);
+      }
+    }
+
+    // Add the latest results to the head of the queue.
+    previousResults.addLast(new Pair<Long, float[]>(currentTimeMS, currentResults));
+
+    // Prune any earlier results that are too old for the averaging window.
+    final long timeLimit = currentTimeMS - averageWindowDurationMs;
+    while (previousResults.getFirst().first < timeLimit) {
+      previousResults.removeFirst();
+    }
+
+    // If there are too few results, assume the result will be unreliable and
+    // bail.
+    final long earliestTime = previousResults.getFirst().first;
+    final long samplesDuration = currentTimeMS - earliestTime;
+    if ((howManyResults < minimumCount)
+        || (samplesDuration < (averageWindowDurationMs / MINIMUM_TIME_FRACTION))) {
+      Log.v("RecognizeResult", "Too few results");
+      return new RecognitionResult(previousTopLabel, 0.0f, false);
+    }
+
+    // Calculate the average score across all the results in the window.
+    float[] averageScores = new float[labelsCount];
+    for (Pair<Long, float[]> previousResult : previousResults) {
+      final float[] scoresTensor = previousResult.second;
+      int i = 0;
+      while (i < scoresTensor.length) {
+        averageScores[i] += scoresTensor[i] / howManyResults;
+        ++i;
+      }
+    }
+
+    // Sort the averaged results in descending score order.
+    ScoreForSorting[] sortedAverageScores = new ScoreForSorting[labelsCount];
+    for (int i = 0; i < labelsCount; ++i) {
+      sortedAverageScores[i] = new ScoreForSorting(averageScores[i], i);
+    }
+    Arrays.sort(sortedAverageScores);
+
+    // See if the latest top score is enough to trigger a detection.
+    final int currentTopIndex = sortedAverageScores[0].index;
+    final String currentTopLabel = labels.get(currentTopIndex);
+    final float currentTopScore = sortedAverageScores[0].score;
+    // If we've recently had another label trigger, assume one that occurs too
+    // soon afterwards is a bad result.
+    long timeSinceLastTop;
+    if (previousTopLabel.equals(SILENCE_LABEL) || (previousTopLabelTime == Long.MIN_VALUE)) {
+      timeSinceLastTop = Long.MAX_VALUE;
+    } else {
+      timeSinceLastTop = currentTimeMS - previousTopLabelTime;
+    }
+    boolean isNewCommand;
+    if ((currentTopScore > detectionThreshold) && (timeSinceLastTop > suppressionMs)) {
+      previousTopLabel = currentTopLabel;
+      previousTopLabelTime = currentTimeMS;
+      previousTopLabelScore = currentTopScore;
+      isNewCommand = true;
+    } else {
+      isNewCommand = false;
+    }
+    return new RecognitionResult(currentTopLabel, currentTopScore, isNewCommand);
+  }
+}
diff --git a/tensorflow/core/platform/posix/tracing.cc b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
similarity index 51%
rename from tensorflow/core/platform/posix/tracing.cc
rename to tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
index 1d1aa53f2ca328cf31d067ec9c56a544927b822a..211d7e66fb20ce00e4e91ecc9134fbf2852e9f3d 100644
--- a/tensorflow/core/platform/posix/tracing.cc
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
@@ -13,28 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/tracing.h"
+package org.tensorflow.demo;
 
-#include <stdlib.h>
-#include <unistd.h>
+import java.util.List;
+import org.tensorflow.demo.Classifier.Recognition;
 
-namespace tensorflow {
-namespace port {
-
-static bool TryGetEnv(const char* name, const char** value) {
-  *value = getenv(name);
-  return *value != nullptr && (*value)[0] != '\0';
-}
-
-const char* Tracing::LogDir() {
-  const char* dir;
-  if (TryGetEnv("TEST_TMPDIR", &dir)) return dir;
-  if (TryGetEnv("TMP", &dir)) return dir;
-  if (TryGetEnv("TMPDIR", &dir)) return dir;
-  dir = "/tmp";
-  if (access(dir, R_OK | W_OK | X_OK) == 0) return dir;
-  return ".";  // Default to current directory.
+public interface ResultsView {
+  public void setResults(final List<Recognition> results);
 }
-
-}  // namespace port
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..9c9c30bc0985e529b46c322fd0ff02590967afa2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java
@@ -0,0 +1,381 @@
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Demonstrates how to run an audio recognition model in Android.
+
+This example loads a simple speech recognition model trained by the tutorial at
+https://www.tensorflow.org/tutorials/audio_training
+
+The model files should be downloaded automatically from the TensorFlow website,
+but if you have a custom model you can update the LABEL_FILENAME and
+MODEL_FILENAME constants to point to your own files.
+
+The example application displays a list view with all of the known audio labels,
+and highlights each one when it thinks it has detected one through the
+microphone. The averaging of results to give a more reliable signal happens in
+the RecognizeCommands helper class.
+*/
+
+package org.tensorflow.demo;
+
+import android.animation.ValueAnimator;
+import android.app.Activity;
+import android.content.pm.PackageManager;
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+import android.os.Build;
+import android.os.Bundle;
+import android.util.Log;
+import android.view.View;
+import android.widget.ArrayAdapter;
+import android.widget.Button;
+import android.widget.ListView;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.locks.ReentrantLock;
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+/**
+ * An activity that listens for audio and then uses a TensorFlow model to detect particular classes,
+ * by default a small set of action words.
+ */
+public class SpeechActivity extends Activity {
+
+  // Constants that control the behavior of the recognition code and model
+  // settings. See the audio recognition tutorial for a detailed explanation of
+  // all these, but you should customize them to match your training settings if
+  // you are running your own model.
+  private static final int SAMPLE_RATE = 16000;
+  private static final int SAMPLE_DURATION_MS = 1000;
+  private static final int RECORDING_LENGTH = (int) (SAMPLE_RATE * SAMPLE_DURATION_MS / 1000);
+  private static final long AVERAGE_WINDOW_DURATION_MS = 500;
+  private static final float DETECTION_THRESHOLD = 0.70f;
+  private static final int SUPPRESSION_MS = 1500;
+  private static final int MINIMUM_COUNT = 3;
+  private static final long MINIMUM_TIME_BETWEEN_SAMPLES_MS = 30;
+  private static final String LABEL_FILENAME = "file:///android_asset/conv_actions_labels.txt";
+  private static final String MODEL_FILENAME = "file:///android_asset/conv_actions_frozen.tflite";
+
+  // UI elements.
+  private static final int REQUEST_RECORD_AUDIO = 13;
+  private Button quitButton;
+  private ListView labelsListView;
+  private static final String LOG_TAG = SpeechActivity.class.getSimpleName();
+
+  // Working variables.
+  short[] recordingBuffer = new short[RECORDING_LENGTH];
+  int recordingOffset = 0;
+  boolean shouldContinue = true;
+  private Thread recordingThread;
+  boolean shouldContinueRecognition = true;
+  private Thread recognitionThread;
+  private final ReentrantLock recordingBufferLock = new ReentrantLock();
+
+  private List<String> labels = new ArrayList<String>();
+  private List<String> displayedLabels = new ArrayList<>();
+  private RecognizeCommands recognizeCommands = null;
+
+  private Interpreter tfLite;
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    // Set up the UI.
+    super.onCreate(savedInstanceState);
+    setContentView(R.layout.activity_speech);
+    quitButton = (Button) findViewById(R.id.quit);
+    quitButton.setOnClickListener(
+        new View.OnClickListener() {
+          @Override
+          public void onClick(View view) {
+            moveTaskToBack(true);
+            android.os.Process.killProcess(android.os.Process.myPid());
+            System.exit(1);
+          }
+        });
+    labelsListView = (ListView) findViewById(R.id.list_view);
+
+    // Load the labels for the model, but only display those that don't start
+    // with an underscore.
+    String actualLabelFilename = LABEL_FILENAME.split("file:///android_asset/", -1)[1];
+    Log.i(LOG_TAG, "Reading labels from: " + actualLabelFilename);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(getAssets().open(actualLabelFilename)));
+      String line;
+      while ((line = br.readLine()) != null) {
+        labels.add(line);
+        if (line.charAt(0) != '_') {
+          displayedLabels.add(line.substring(0, 1).toUpperCase() + line.substring(1));
+        }
+      }
+      br.close();
+    } catch (IOException e) {
+      throw new RuntimeException("Problem reading label file!", e);
+    }
+
+    // Build a list view based on these labels.
+    ArrayAdapter<String> arrayAdapter =
+        new ArrayAdapter<String>(this, R.layout.list_text_item, displayedLabels);
+    labelsListView.setAdapter(arrayAdapter);
+
+    // Set up an object to smooth recognition results to increase accuracy.
+    recognizeCommands =
+        new RecognizeCommands(
+            labels,
+            AVERAGE_WINDOW_DURATION_MS,
+            DETECTION_THRESHOLD,
+            SUPPRESSION_MS,
+            MINIMUM_COUNT,
+            MINIMUM_TIME_BETWEEN_SAMPLES_MS);
+
+    String actualModelFilename = MODEL_FILENAME.split("file:///android_asset/", -1)[1];
+    try {
+      tfLite = new Interpreter(loadModelFile(getAssets(), actualModelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    tfLite.resizeInput(0, new int[] {RECORDING_LENGTH, 1});
+    tfLite.resizeInput(1, new int[] {1});
+
+    // Start the recording and recognition threads.
+    requestMicrophonePermission();
+    startRecording();
+    startRecognition();
+  }
+
+  private void requestMicrophonePermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      requestPermissions(
+          new String[]{android.Manifest.permission.RECORD_AUDIO}, REQUEST_RECORD_AUDIO);
+    }
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      int requestCode, String[] permissions, int[] grantResults) {
+    if (requestCode == REQUEST_RECORD_AUDIO
+        && grantResults.length > 0
+        && grantResults[0] == PackageManager.PERMISSION_GRANTED) {
+      startRecording();
+      startRecognition();
+    }
+  }
+
+  public synchronized void startRecording() {
+    if (recordingThread != null) {
+      return;
+    }
+    shouldContinue = true;
+    recordingThread =
+        new Thread(
+            new Runnable() {
+              @Override
+              public void run() {
+                record();
+              }
+            });
+    recordingThread.start();
+  }
+
+  public synchronized void stopRecording() {
+    if (recordingThread == null) {
+      return;
+    }
+    shouldContinue = false;
+    recordingThread = null;
+  }
+
+  private void record() {
+    android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_AUDIO);
+
+    // Estimate the buffer size we'll need for this device.
+    int bufferSize =
+        AudioRecord.getMinBufferSize(
+            SAMPLE_RATE, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
+    if (bufferSize == AudioRecord.ERROR || bufferSize == AudioRecord.ERROR_BAD_VALUE) {
+      bufferSize = SAMPLE_RATE * 2;
+    }
+    short[] audioBuffer = new short[bufferSize / 2];
+
+    AudioRecord record =
+        new AudioRecord(
+            MediaRecorder.AudioSource.DEFAULT,
+            SAMPLE_RATE,
+            AudioFormat.CHANNEL_IN_MONO,
+            AudioFormat.ENCODING_PCM_16BIT,
+            bufferSize);
+
+    if (record.getState() != AudioRecord.STATE_INITIALIZED) {
+      Log.e(LOG_TAG, "Audio Record can't initialize!");
+      return;
+    }
+
+    record.startRecording();
+
+    Log.v(LOG_TAG, "Start recording");
+
+    // Loop, gathering audio data and copying it to a round-robin buffer.
+    while (shouldContinue) {
+      int numberRead = record.read(audioBuffer, 0, audioBuffer.length);
+      int maxLength = recordingBuffer.length;
+      int newRecordingOffset = recordingOffset + numberRead;
+      int secondCopyLength = Math.max(0, newRecordingOffset - maxLength);
+      int firstCopyLength = numberRead - secondCopyLength;
+      // We store off all the data for the recognition thread to access. The ML
+      // thread will copy out of this buffer into its own, while holding the
+      // lock, so this should be thread safe.
+      recordingBufferLock.lock();
+      try {
+        System.arraycopy(audioBuffer, 0, recordingBuffer, recordingOffset, firstCopyLength);
+        System.arraycopy(audioBuffer, firstCopyLength, recordingBuffer, 0, secondCopyLength);
+        recordingOffset = newRecordingOffset % maxLength;
+      } finally {
+        recordingBufferLock.unlock();
+      }
+    }
+
+    record.stop();
+    record.release();
+  }
+
+  public synchronized void startRecognition() {
+    if (recognitionThread != null) {
+      return;
+    }
+    shouldContinueRecognition = true;
+    recognitionThread =
+        new Thread(
+            new Runnable() {
+              @Override
+              public void run() {
+                recognize();
+              }
+            });
+    recognitionThread.start();
+  }
+
+  public synchronized void stopRecognition() {
+    if (recognitionThread == null) {
+      return;
+    }
+    shouldContinueRecognition = false;
+    recognitionThread = null;
+  }
+
+  private void recognize() {
+    Log.v(LOG_TAG, "Start recognition");
+
+    short[] inputBuffer = new short[RECORDING_LENGTH];
+    float[][] floatInputBuffer = new float[RECORDING_LENGTH][1];
+    float[][] outputScores = new float[1][labels.size()];
+    int[] sampleRateList = new int[] {SAMPLE_RATE};
+
+    // Loop, grabbing recorded data and running the recognition model on it.
+    while (shouldContinueRecognition) {
+      // The recording thread places data in this round-robin buffer, so lock to
+      // make sure there's no writing happening and then copy it to our own
+      // local version.
+      recordingBufferLock.lock();
+      try {
+        int maxLength = recordingBuffer.length;
+        int firstCopyLength = maxLength - recordingOffset;
+        int secondCopyLength = recordingOffset;
+        System.arraycopy(recordingBuffer, recordingOffset, inputBuffer, 0, firstCopyLength);
+        System.arraycopy(recordingBuffer, 0, inputBuffer, firstCopyLength, secondCopyLength);
+      } finally {
+        recordingBufferLock.unlock();
+      }
+
+      // We need to feed in float values between -1.0f and 1.0f, so divide the
+      // signed 16-bit inputs.
+      for (int i = 0; i < RECORDING_LENGTH; ++i) {
+        floatInputBuffer[i][0] = inputBuffer[i] / 32767.0f;
+      }
+
+      Object[] inputArray = {floatInputBuffer, sampleRateList};
+      Map<Integer, Object> outputMap = new HashMap<>();
+      outputMap.put(0, outputScores);
+
+      // Run the model.
+      tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+
+      // Use the smoother to figure out if we've had a real recognition event.
+      long currentTime = System.currentTimeMillis();
+      final RecognizeCommands.RecognitionResult result =
+          recognizeCommands.processLatestResults(outputScores[0], currentTime);
+
+      runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              // If we do have a new command, highlight the right list entry.
+              if (!result.foundCommand.startsWith("_") && result.isNewCommand) {
+                int labelIndex = -1;
+                for (int i = 0; i < labels.size(); ++i) {
+                  if (labels.get(i).equals(result.foundCommand)) {
+                    labelIndex = i;
+                  }
+                }
+                final View labelView = (View) labelsListView.getChildAt(labelIndex - 2);
+                ValueAnimator colorAnimation =
+                    ValueAnimator.ofArgb(0x00b3ccff, 0xffb3ccff, 0x00b3ccff);
+                colorAnimation.setDuration(750);
+                colorAnimation.addUpdateListener(
+                    new ValueAnimator.AnimatorUpdateListener() {
+                      @Override
+                      public void onAnimationUpdate(ValueAnimator animator) {
+                        labelView.setBackgroundColor((int) animator.getAnimatedValue());
+                      }
+                    });
+                colorAnimation.start();
+              }
+            }
+          });
+      try {
+        // We don't need to run too frequently, so snooze for a bit.
+        Thread.sleep(MINIMUM_TIME_BETWEEN_SAMPLES_MS);
+      } catch (InterruptedException e) {
+        // Ignore
+      }
+    }
+
+    Log.v(LOG_TAG, "End recognition");
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..d75c3ceadabd2dad73b1e5feda3ae88181769e74
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java
@@ -0,0 +1,209 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.os.SystemClock;
+import android.os.Trace;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Vector;
+import org.tensorflow.lite.Interpreter;
+
+/** A classifier specialized to label images using TensorFlow. */
+public class TFLiteImageClassifier implements Classifier {
+  private static final String TAG = "TFLiteImageClassifier";
+
+  // Only return this many results with at least this confidence.
+  private static final int MAX_RESULTS = 3;
+
+  private Interpreter tfLite;
+
+  /** Dimensions of inputs. */
+  private static final int DIM_BATCH_SIZE = 1;
+
+  private static final int DIM_PIXEL_SIZE = 3;
+
+  private static final int DIM_IMG_SIZE_X = 224;
+  private static final int DIM_IMG_SIZE_Y = 224;
+
+  byte[][] labelProb;
+
+  // Pre-allocated buffers.
+  private Vector<String> labels = new Vector<String>();
+  private int[] intValues;
+  private ByteBuffer imgData = null;
+
+  private TFLiteImageClassifier() {}
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  /**
+   * Initializes a native TensorFlow session for classifying images.
+   *
+   * @param assetManager The asset manager to be used to load assets.
+   * @param modelFilename The filepath of the model GraphDef protocol buffer.
+   * @param labelFilename The filepath of label file for classes.
+   * @param inputSize The input size. A square image of inputSize x inputSize is assumed.
+   * @throws IOException
+   */
+  public static Classifier create(
+      AssetManager assetManager, String modelFilename, String labelFilename, int inputSize) {
+    TFLiteImageClassifier c = new TFLiteImageClassifier();
+
+    // Read the label names into memory.
+    // TODO(andrewharp): make this handle non-assets.
+    Log.i(TAG, "Reading labels from: " + labelFilename);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(assetManager.open(labelFilename)));
+      String line;
+      while ((line = br.readLine()) != null) {
+        c.labels.add(line);
+      }
+      br.close();
+    } catch (IOException e) {
+      throw new RuntimeException("Problem reading label file!" , e);
+    }
+
+    c.imgData =
+        ByteBuffer.allocateDirect(
+            DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
+
+    c.imgData.order(ByteOrder.nativeOrder());
+    try {
+      c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    // The shape of the output is [N, NUM_CLASSES], where N is the batch size.
+    Log.i(TAG, "Read " + c.labels.size() + " labels");
+
+    // Pre-allocate buffers.
+    c.intValues = new int[inputSize * inputSize];
+
+    c.labelProb = new byte[1][c.labels.size()];
+
+    return c;
+  }
+
+  /** Writes Image data into a {@code ByteBuffer}. */
+  private void convertBitmapToByteBuffer(Bitmap bitmap) {
+    if (imgData == null) {
+      return;
+    }
+    imgData.rewind();
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+    // Convert the image to floating point.
+    int pixel = 0;
+    long startTime = SystemClock.uptimeMillis();
+    for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
+      for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
+        final int val = intValues[pixel++];
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
+  }
+
+  @Override
+  public List<Recognition> recognizeImage(final Bitmap bitmap) {
+    // Log this method so that it can be analyzed with systrace.
+    Trace.beginSection("recognizeImage");
+
+    Trace.beginSection("preprocessBitmap");
+
+    long startTime;
+    long endTime;
+    startTime = SystemClock.uptimeMillis();
+
+    convertBitmapToByteBuffer(bitmap);
+
+    // Run the inference call.
+    Trace.beginSection("run");
+    startTime = SystemClock.uptimeMillis();
+    tfLite.run(imgData, labelProb);
+    endTime = SystemClock.uptimeMillis();
+    Log.i(TAG, "Inf time: " + (endTime - startTime));
+    Trace.endSection();
+
+    // Find the best classifications.
+    PriorityQueue<Recognition> pq =
+        new PriorityQueue<Recognition>(
+            3,
+            new Comparator<Recognition>() {
+              @Override
+              public int compare(Recognition lhs, Recognition rhs) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+              }
+            });
+    for (int i = 0; i < labels.size(); ++i) {
+      pq.add(
+          new Recognition(
+              "" + i,
+              labels.size() > i ? labels.get(i) : "unknown",
+              (float) labelProb[0][i],
+              null));
+    }
+    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
+    int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
+    for (int i = 0; i < recognitionsSize; ++i) {
+      recognitions.add(pq.poll());
+    }
+    Trace.endSection(); // "recognizeImage"
+    return recognitions;
+  }
+
+  @Override
+  public void enableStatLogging(boolean logStats) {
+  }
+
+  @Override
+  public String getStatString() {
+    return "";
+  }
+
+  @Override
+  public void close() {
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
new file mode 100644
index 0000000000000000000000000000000000000000..bfb4a0a04bc90566736864bf62340d1032961858
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
@@ -0,0 +1,292 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.graphics.RectF;
+import android.os.Trace;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.StringTokenizer;
+import java.util.Vector;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.Interpreter;
+
+/**
+ * Wrapper for frozen detection models trained using the Tensorflow Object Detection API:
+ * github.com/tensorflow/models/tree/master/research/object_detection
+ */
+public class TFLiteObjectDetectionAPIModel implements Classifier {
+  private static final Logger LOGGER = new Logger();
+
+  // Only return this many results.
+  private static final int NUM_RESULTS = 1917;
+  private static final int NUM_CLASSES = 91;
+
+  private static final float Y_SCALE = 10.0f;
+  private static final float X_SCALE = 10.0f;
+  private static final float H_SCALE = 5.0f;
+  private static final float W_SCALE = 5.0f;
+
+  // Config values.
+  private int inputSize;
+
+  private final float[][] boxPriors = new float[4][NUM_RESULTS];
+
+  // Pre-allocated buffers.
+  private Vector<String> labels = new Vector<String>();
+  private int[] intValues;
+  private float[][][] outputLocations;
+  private float[][][] outputClasses;
+
+  float[][][][] img;
+
+  private Interpreter tfLite;
+
+  private float expit(final float x) {
+    return (float) (1. / (1. + Math.exp(-x)));
+  }
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  private void loadCoderOptions(
+      final AssetManager assetManager, final String locationFilename, final float[][] boxPriors)
+      throws IOException {
+    // Try to be intelligent about opening from assets or sdcard depending on prefix.
+    final String assetPrefix = "file:///android_asset/";
+    InputStream is;
+    if (locationFilename.startsWith(assetPrefix)) {
+      is = assetManager.open(locationFilename.split(assetPrefix, -1)[1]);
+    } else {
+      is = new FileInputStream(locationFilename);
+    }
+
+    final BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+
+    for (int lineNum = 0; lineNum < 4; ++lineNum) {
+      String line = reader.readLine();
+      final StringTokenizer st = new StringTokenizer(line, ", ");
+      int priorIndex = 0;
+      while (st.hasMoreTokens()) {
+        final String token = st.nextToken();
+        try {
+          final float number = Float.parseFloat(token);
+          boxPriors[lineNum][priorIndex++] = number;
+        } catch (final NumberFormatException e) {
+          // Silently ignore.
+        }
+      }
+      if (priorIndex != NUM_RESULTS) {
+        throw new RuntimeException(
+            "BoxPrior length mismatch: " + priorIndex + " vs " + NUM_RESULTS);
+      }
+    }
+
+    LOGGER.i("Loaded box priors!");
+  }
+
+  void decodeCenterSizeBoxes(float[][][] predictions) {
+    for (int i = 0; i < NUM_RESULTS; ++i) {
+      float ycenter = predictions[0][i][0] / Y_SCALE * boxPriors[2][i] + boxPriors[0][i];
+      float xcenter = predictions[0][i][1] / X_SCALE * boxPriors[3][i] + boxPriors[1][i];
+      float h = (float) Math.exp(predictions[0][i][2] / H_SCALE) * boxPriors[2][i];
+      float w = (float) Math.exp(predictions[0][i][3] / W_SCALE) * boxPriors[3][i];
+
+      float ymin = ycenter - h / 2.f;
+      float xmin = xcenter - w / 2.f;
+      float ymax = ycenter + h / 2.f;
+      float xmax = xcenter + w / 2.f;
+
+      predictions[0][i][0] = ymin;
+      predictions[0][i][1] = xmin;
+      predictions[0][i][2] = ymax;
+      predictions[0][i][3] = xmax;
+    }
+  }
+
+  /**
+   * Initializes a native TensorFlow session for classifying images.
+   *
+   * @param assetManager The asset manager to be used to load assets.
+   * @param modelFilename The filepath of the model GraphDef protocol buffer.
+   * @param labelFilename The filepath of label file for classes.
+   */
+  public static Classifier create(
+      final AssetManager assetManager,
+      final String modelFilename,
+      final String labelFilename,
+      final int inputSize) throws IOException {
+    final TFLiteObjectDetectionAPIModel d = new TFLiteObjectDetectionAPIModel();
+
+    d.loadCoderOptions(assetManager, "file:///android_asset/box_priors.txt", d.boxPriors);
+
+    InputStream labelsInput = null;
+    String actualFilename = labelFilename.split("file:///android_asset/")[1];
+    labelsInput = assetManager.open(actualFilename);
+    BufferedReader br = null;
+    br = new BufferedReader(new InputStreamReader(labelsInput));
+    String line;
+    while ((line = br.readLine()) != null) {
+      LOGGER.w(line);
+      d.labels.add(line);
+    }
+    br.close();
+
+    d.inputSize = inputSize;
+
+    try {
+      d.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    // Pre-allocate buffers.
+    d.img = new float[1][inputSize][inputSize][3];
+
+    d.intValues = new int[d.inputSize * d.inputSize];
+    d.outputLocations = new float[1][NUM_RESULTS][4];
+    d.outputClasses = new float[1][NUM_RESULTS][NUM_CLASSES];
+    return d;
+  }
+
+  private TFLiteObjectDetectionAPIModel() {}
+
+  @Override
+  public List<Recognition> recognizeImage(final Bitmap bitmap) {
+    // Log this method so that it can be analyzed with systrace.
+    Trace.beginSection("recognizeImage");
+
+    Trace.beginSection("preprocessBitmap");
+    // Preprocess the image data from 0-255 int to normalized float based
+    // on the provided parameters.
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+
+    for (int i = 0; i < inputSize; ++i) {
+      for (int j = 0; j < inputSize; ++j) {
+        int pixel = intValues[j * inputSize + i];
+        img[0][j][i][2] = (float) (pixel & 0xFF) / 128.0f - 1.0f;
+        img[0][j][i][1] = (float) ((pixel >> 8) & 0xFF) / 128.0f - 1.0f;
+        img[0][j][i][0] = (float) ((pixel >> 16) & 0xFF) / 128.0f - 1.0f;
+      }
+    }
+    Trace.endSection(); // preprocessBitmap
+
+    // Copy the input data into TensorFlow.
+    Trace.beginSection("feed");
+    outputLocations = new float[1][NUM_RESULTS][4];
+    outputClasses = new float[1][NUM_RESULTS][NUM_CLASSES];
+
+    Object[] inputArray = {img};
+    Map<Integer, Object> outputMap = new HashMap<>();
+    outputMap.put(0, outputLocations);
+    outputMap.put(1, outputClasses);
+    Trace.endSection();
+
+    // Run the inference call.
+    Trace.beginSection("run");
+    tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+    Trace.endSection();
+
+    decodeCenterSizeBoxes(outputLocations);
+
+    // Find the best detections.
+    final PriorityQueue<Recognition> pq =
+        new PriorityQueue<Recognition>(
+            1,
+            new Comparator<Recognition>() {
+              @Override
+              public int compare(final Recognition lhs, final Recognition rhs) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+              }
+            });
+
+    // Scale them back to the input size.
+    for (int i = 0; i < NUM_RESULTS; ++i) {
+      float topClassScore = -1000f;
+      int topClassScoreIndex = -1;
+
+      // Skip the first catch-all class.
+      for (int j = 1; j < NUM_CLASSES; ++j) {
+        float score = expit(outputClasses[0][i][j]);
+
+        if (score > topClassScore) {
+          topClassScoreIndex = j;
+          topClassScore = score;
+        }
+      }
+
+      if (topClassScore > 0.001f) {
+        final RectF detection =
+            new RectF(
+                outputLocations[0][i][1] * inputSize,
+                outputLocations[0][i][0] * inputSize,
+                outputLocations[0][i][3] * inputSize,
+                outputLocations[0][i][2] * inputSize);
+
+        pq.add(
+            new Recognition(
+                "" + i,
+                labels.get(topClassScoreIndex),
+                outputClasses[0][i][topClassScoreIndex],
+                detection));
+      }
+    }
+
+    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
+    for (int i = 0; i < Math.min(pq.size(), 10); ++i) {
+      Recognition recog = pq.poll();
+      recognitions.add(recog);
+    }
+    Trace.endSection(); // "recognizeImage"
+    return recognitions;
+  }
+
+  @Override
+  public void enableStatLogging(final boolean logStats) {
+  }
+
+  @Override
+  public String getStatString() {
+    return "";
+  }
+
+  @Override
+  public void close() {
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..c50efdf889145ad717445015fb94a37568939b73
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.content.Context;
+import android.content.res.AssetManager;
+import android.util.Log;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/** Utilities for dealing with assets. */
+public class AssetUtils {
+
+  private static final String TAG = AssetUtils.class.getSimpleName();
+
+  private static final int BYTE_BUF_SIZE = 2048;
+
+  /**
+   * Copies a file from assets.
+   *
+   * @param context application context used to discover assets.
+   * @param assetName the relative file name within assets.
+   * @param targetName the target file name, always over write the existing file.
+   * @throws IOException if operation fails.
+   */
+  public static void copy(Context context, String assetName, String targetName) throws IOException {
+
+    Log.d(TAG, "creating file " + targetName + " from " + assetName);
+
+    File targetFile = null;
+    InputStream inputStream = null;
+    FileOutputStream outputStream = null;
+
+    try {
+      AssetManager assets = context.getAssets();
+      targetFile = new File(targetName);
+      inputStream = assets.open(assetName);
+      // TODO(kanlig): refactor log messages to make them more useful.
+      Log.d(TAG, "Creating outputstream");
+      outputStream = new FileOutputStream(targetFile, false /* append */);
+      copy(inputStream, outputStream);
+    } finally {
+      if (outputStream != null) {
+        outputStream.close();
+      }
+      if (inputStream != null) {
+        inputStream.close();
+      }
+    }
+  }
+
+  private static void copy(InputStream from, OutputStream to) throws IOException {
+    byte[] buf = new byte[BYTE_BUF_SIZE];
+    while (true) {
+      int r = from.read(buf);
+      if (r == -1) {
+        break;
+      }
+      to.write(buf, 0, r);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java
new file mode 100644
index 0000000000000000000000000000000000000000..decfc3d8793d127800feb5d58cdaf3f84512d840
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java
@@ -0,0 +1,117 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Paint;
+import android.graphics.Paint.Align;
+import android.graphics.Paint.Style;
+import android.graphics.Rect;
+import android.graphics.Typeface;
+import java.util.Vector;
+
+/**
+ * A class that encapsulates the tedious bits of rendering legible, bordered text onto a canvas.
+ */
+public class BorderedText {
+  private final Paint interiorPaint;
+  private final Paint exteriorPaint;
+
+  private final float textSize;
+
+  /**
+   * Creates a left-aligned bordered text object with a white interior, and a black exterior with
+   * the specified text size.
+   *
+   * @param textSize text size in pixels
+   */
+  public BorderedText(final float textSize) {
+    this(Color.WHITE, Color.BLACK, textSize);
+  }
+
+  /**
+   * Create a bordered text object with the specified interior and exterior colors, text size and
+   * alignment.
+   *
+   * @param interiorColor the interior text color
+   * @param exteriorColor the exterior text color
+   * @param textSize text size in pixels
+   */
+  public BorderedText(final int interiorColor, final int exteriorColor, final float textSize) {
+    interiorPaint = new Paint();
+    interiorPaint.setTextSize(textSize);
+    interiorPaint.setColor(interiorColor);
+    interiorPaint.setStyle(Style.FILL);
+    interiorPaint.setAntiAlias(false);
+    interiorPaint.setAlpha(255);
+
+    exteriorPaint = new Paint();
+    exteriorPaint.setTextSize(textSize);
+    exteriorPaint.setColor(exteriorColor);
+    exteriorPaint.setStyle(Style.FILL_AND_STROKE);
+    exteriorPaint.setStrokeWidth(textSize / 8);
+    exteriorPaint.setAntiAlias(false);
+    exteriorPaint.setAlpha(255);
+
+    this.textSize = textSize;
+  }
+
+  public void setTypeface(Typeface typeface) {
+    interiorPaint.setTypeface(typeface);
+    exteriorPaint.setTypeface(typeface);
+  }
+
+  public void drawText(final Canvas canvas, final float posX, final float posY, final String text) {
+    canvas.drawText(text, posX, posY, exteriorPaint);
+    canvas.drawText(text, posX, posY, interiorPaint);
+  }
+
+  public void drawLines(Canvas canvas, final float posX, final float posY, Vector<String> lines) {
+    int lineNum = 0;
+    for (final String line : lines) {
+      drawText(canvas, posX, posY - getTextSize() * (lines.size() - lineNum - 1), line);
+      ++lineNum;
+    }
+  }
+
+  public void setInteriorColor(final int color) {
+    interiorPaint.setColor(color);
+  }
+
+  public void setExteriorColor(final int color) {
+    exteriorPaint.setColor(color);
+  }
+
+  public float getTextSize() {
+    return textSize;
+  }
+
+  public void setAlpha(final int alpha) {
+    interiorPaint.setAlpha(alpha);
+    exteriorPaint.setAlpha(alpha);
+  }
+
+  public void getTextBounds(
+      final String line, final int index, final int count, final Rect lineBounds) {
+    interiorPaint.getTextBounds(line, index, count, lineBounds);
+  }
+
+  public void setTextAlign(final Align align) {
+    interiorPaint.setTextAlign(align);
+    exteriorPaint.setTextAlign(align);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..e02c6559176d40d3df42bccc0c374e60f70371b2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -0,0 +1,344 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Bitmap;
+import android.graphics.Matrix;
+import android.os.Environment;
+import java.io.File;
+import java.io.FileOutputStream;
+
+/**
+ * Utility class for manipulating images.
+ **/
+public class ImageUtils {
+  @SuppressWarnings("unused")
+  private static final Logger LOGGER = new Logger();
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.w("Native library not found, native RGB -> YUV conversion may be unavailable.");
+    }
+  }
+
+  /**
+   * Utility method to compute the allocated size in bytes of a YUV420SP image
+   * of the given dimensions.
+   */
+  public static int getYUVByteSize(final int width, final int height) {
+    // The luminance plane requires 1 byte per pixel.
+    final int ySize = width * height;
+
+    // The UV plane works on 2x2 blocks, so dimensions with odd size must be rounded up.
+    // Each 2x2 block takes 2 bytes to encode, one each for U and V.
+    final int uvSize = ((width + 1) / 2) * ((height + 1) / 2) * 2;
+
+    return ySize + uvSize;
+  }
+
+  /**
+   * Saves a Bitmap object to disk for analysis.
+   *
+   * @param bitmap The bitmap to save.
+   */
+  public static void saveBitmap(final Bitmap bitmap) {
+    saveBitmap(bitmap, "preview.png");
+  }
+
+  /**
+   * Saves a Bitmap object to disk for analysis.
+   *
+   * @param bitmap The bitmap to save.
+   * @param filename The location to save the bitmap to.
+   */
+  public static void saveBitmap(final Bitmap bitmap, final String filename) {
+    final String root =
+        Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "tensorflow";
+    LOGGER.i("Saving %dx%d bitmap to %s.", bitmap.getWidth(), bitmap.getHeight(), root);
+    final File myDir = new File(root);
+
+    if (!myDir.mkdirs()) {
+      LOGGER.i("Make dir failed");
+    }
+
+    final String fname = filename;
+    final File file = new File(myDir, fname);
+    if (file.exists()) {
+      file.delete();
+    }
+    try {
+      final FileOutputStream out = new FileOutputStream(file);
+      bitmap.compress(Bitmap.CompressFormat.PNG, 99, out);
+      out.flush();
+      out.close();
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  // This value is 2 ^ 18 - 1, and is used to clamp the RGB values before their ranges
+  // are normalized to eight bits.
+  static final int kMaxChannelValue = 262143;
+
+  // Always prefer the native implementation if available.
+  private static boolean useNativeConversion = false;
+
+  public static void convertYUV420SPToARGB8888(
+      byte[] input,
+      int width,
+      int height,
+      int[] output) {
+    if (useNativeConversion) {
+      try {
+        ImageUtils.convertYUV420SPToARGB8888(input, output, width, height, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w(
+            "Native YUV420SP -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    // Java implementation of YUV420SP to ARGB8888 converting
+    final int frameSize = width * height;
+    for (int j = 0, yp = 0; j < height; j++) {
+      int uvp = frameSize + (j >> 1) * width;
+      int u = 0;
+      int v = 0;
+
+      for (int i = 0; i < width; i++, yp++) {
+        int y = 0xff & input[yp];
+        if ((i & 1) == 0) {
+          v = 0xff & input[uvp++];
+          u = 0xff & input[uvp++];
+        }
+
+        output[yp] = YUV2RGB(y, u, v);
+      }
+    }
+  }
+
+  private static int YUV2RGB(int y, int u, int v) {
+    // Adjust and check YUV values
+    y = (y - 16) < 0 ? 0 : (y - 16);
+    u -= 128;
+    v -= 128;
+
+    // This is the floating point equivalent. We do the conversion in integer
+    // because some Android devices do not have floating point in hardware.
+    // nR = (int)(1.164 * nY + 2.018 * nU);
+    // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
+    // nB = (int)(1.164 * nY + 1.596 * nV);
+    int y1192 = 1192 * y;
+    int r = (y1192 + 1634 * v);
+    int g = (y1192 - 833 * v - 400 * u);
+    int b = (y1192 + 2066 * u);
+
+    // Clipping RGB values to be inside boundaries [ 0 , kMaxChannelValue ]
+    r = r > kMaxChannelValue ? kMaxChannelValue : (r < 0 ? 0 : r);
+    g = g > kMaxChannelValue ? kMaxChannelValue : (g < 0 ? 0 : g);
+    b = b > kMaxChannelValue ? kMaxChannelValue : (b < 0 ? 0 : b);
+
+    return 0xff000000 | ((r << 6) & 0xff0000) | ((g >> 2) & 0xff00) | ((b >> 10) & 0xff);
+  }
+
+
+  public static void convertYUV420ToARGB8888(
+      byte[] yData,
+      byte[] uData,
+      byte[] vData,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      int[] out) {
+    if (useNativeConversion) {
+      try {
+        convertYUV420ToARGB8888(
+            yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w(
+            "Native YUV420 -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    int yp = 0;
+    for (int j = 0; j < height; j++) {
+      int pY = yRowStride * j;
+      int pUV = uvRowStride * (j >> 1);
+
+      for (int i = 0; i < width; i++) {
+        int uv_offset = pUV + (i >> 1) * uvPixelStride;
+
+        out[yp++] = YUV2RGB(
+            0xff & yData[pY + i],
+            0xff & uData[uv_offset],
+            0xff & vData[uv_offset]);
+      }
+    }
+  }
+
+
+  /**
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
+   * input and output must already be allocated and non-null. For efficiency, no error checking is
+   * performed.
+   *
+   * @param input The array of YUV 4:2:0 input data.
+   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
+   */
+  private static native void convertYUV420SPToARGB8888(
+      byte[] input, int[] output, int width, int height, boolean halfSize);
+
+  /**
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width
+   * and height. The input and output must already be allocated and non-null.
+   * For efficiency, no error checking is performed.
+   *
+   * @param y
+   * @param u
+   * @param v
+   * @param uvPixelStride
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
+   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
+   */
+  private static native void convertYUV420ToARGB8888(
+      byte[] y,
+      byte[] u,
+      byte[] v,
+      int[] output,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      boolean halfSize);
+
+  /**
+   * Converts YUV420 semi-planar data to RGB 565 data using the supplied width
+   * and height. The input and output must already be allocated and non-null.
+   * For efficiency, no error checking is performed.
+   *
+   * @param input The array of YUV 4:2:0 input data.
+   * @param output A pre-allocated array for the RGB 5:6:5 output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertYUV420SPToRGB565(
+      byte[] input, byte[] output, int width, int height);
+
+  /**
+   * Converts 32-bit ARGB8888 image data to YUV420SP data.  This is useful, for
+   * instance, in creating data to feed the classes that rely on raw camera
+   * preview frames.
+   *
+   * @param input An array of input pixels in ARGB8888 format.
+   * @param output A pre-allocated array for the YUV420SP output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertARGB8888ToYUV420SP(
+      int[] input, byte[] output, int width, int height);
+
+  /**
+   * Converts 16-bit RGB565 image data to YUV420SP data.  This is useful, for
+   * instance, in creating data to feed the classes that rely on raw camera
+   * preview frames.
+   *
+   * @param input An array of input pixels in RGB565 format.
+   * @param output A pre-allocated array for the YUV420SP output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertRGB565ToYUV420SP(
+      byte[] input, byte[] output, int width, int height);
+
+  /**
+   * Returns a transformation matrix from one reference frame into another.
+   * Handles cropping (if maintaining aspect ratio is desired) and rotation.
+   *
+   * @param srcWidth Width of source frame.
+   * @param srcHeight Height of source frame.
+   * @param dstWidth Width of destination frame.
+   * @param dstHeight Height of destination frame.
+   * @param applyRotation Amount of rotation to apply from one frame to another.
+   *  Must be a multiple of 90.
+   * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant,
+   * cropping the image if necessary.
+   * @return The transformation fulfilling the desired requirements.
+   */
+  public static Matrix getTransformationMatrix(
+      final int srcWidth,
+      final int srcHeight,
+      final int dstWidth,
+      final int dstHeight,
+      final int applyRotation,
+      final boolean maintainAspectRatio) {
+    final Matrix matrix = new Matrix();
+
+    if (applyRotation != 0) {
+      if (applyRotation % 90 != 0) {
+        LOGGER.w("Rotation of %d % 90 != 0", applyRotation);
+      }
+
+      // Translate so center of image is at origin.
+      matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
+
+      // Rotate around origin.
+      matrix.postRotate(applyRotation);
+    }
+
+    // Account for the already applied rotation, if any, and then determine how
+    // much scaling is needed for each axis.
+    final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0;
+
+    final int inWidth = transpose ? srcHeight : srcWidth;
+    final int inHeight = transpose ? srcWidth : srcHeight;
+
+    // Apply scaling if necessary.
+    if (inWidth != dstWidth || inHeight != dstHeight) {
+      final float scaleFactorX = dstWidth / (float) inWidth;
+      final float scaleFactorY = dstHeight / (float) inHeight;
+
+      if (maintainAspectRatio) {
+        // Scale by minimum factor so that dst is filled completely while
+        // maintaining the aspect ratio. Some image may fall off the edge.
+        final float scaleFactor = Math.max(scaleFactorX, scaleFactorY);
+        matrix.postScale(scaleFactor, scaleFactor);
+      } else {
+        // Scale exactly to fill dst from src.
+        matrix.postScale(scaleFactorX, scaleFactorY);
+      }
+    }
+
+    if (applyRotation != 0) {
+      // Translate back from origin centered reference to destination frame.
+      matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f);
+    }
+
+    return matrix;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java
new file mode 100644
index 0000000000000000000000000000000000000000..0d984096a08cff5640a9dad3a33069fd9c77bbd0
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java
@@ -0,0 +1,190 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.util.Log;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Wrapper for the platform log function, allows convenient message prefixing and log disabling.
+ */
+public final class Logger {
+  private static final String DEFAULT_TAG = "tensorflow";
+  private static final int DEFAULT_MIN_LOG_LEVEL = Log.DEBUG;
+
+  // Classes to be ignored when examining the stack trace
+  private static final Set<String> IGNORED_CLASS_NAMES;
+
+  static {
+    IGNORED_CLASS_NAMES = new HashSet<String>(3);
+    IGNORED_CLASS_NAMES.add("dalvik.system.VMStack");
+    IGNORED_CLASS_NAMES.add("java.lang.Thread");
+    IGNORED_CLASS_NAMES.add(Logger.class.getCanonicalName());
+  }
+
+  private final String tag;
+  private final String messagePrefix;
+  private int minLogLevel = DEFAULT_MIN_LOG_LEVEL;
+
+  /**
+   * Creates a Logger using the class name as the message prefix.
+   *
+   * @param clazz the simple name of this class is used as the message prefix.
+   */
+  public Logger(final Class<?> clazz) {
+    this(clazz.getSimpleName());
+  }
+
+  /**
+   * Creates a Logger using the specified message prefix.
+   *
+   * @param messagePrefix is prepended to the text of every message.
+   */
+  public Logger(final String messagePrefix) {
+    this(DEFAULT_TAG, messagePrefix);
+  }
+
+  /**
+   * Creates a Logger with a custom tag and a custom message prefix. If the message prefix
+   * is set to <pre>null</pre>, the caller's class name is used as the prefix.
+   *
+   * @param tag identifies the source of a log message.
+   * @param messagePrefix prepended to every message if non-null. If null, the name of the caller is
+   *                      being used
+   */
+  public Logger(final String tag, final String messagePrefix) {
+    this.tag = tag;
+    final String prefix = messagePrefix == null ? getCallerSimpleName() : messagePrefix;
+    this.messagePrefix = (prefix.length() > 0) ? prefix + ": " : prefix;
+  }
+
+  /**
+   * Creates a Logger using the caller's class name as the message prefix.
+   */
+  public Logger() {
+    this(DEFAULT_TAG, null);
+  }
+
+  /**
+   * Creates a Logger using the caller's class name as the message prefix.
+   */
+  public Logger(final int minLogLevel) {
+    this(DEFAULT_TAG, null);
+    this.minLogLevel = minLogLevel;
+  }
+
+  public void setMinLogLevel(final int minLogLevel) {
+    this.minLogLevel = minLogLevel;
+  }
+
+  public boolean isLoggable(final int logLevel) {
+    return logLevel >= minLogLevel || Log.isLoggable(tag, logLevel);
+  }
+
+  /**
+   * Return caller's simple name.
+   *
+   * Android getStackTrace() returns an array that looks like this:
+   *     stackTrace[0]: dalvik.system.VMStack
+   *     stackTrace[1]: java.lang.Thread
+   *     stackTrace[2]: com.google.android.apps.unveil.env.UnveilLogger
+   *     stackTrace[3]: com.google.android.apps.unveil.BaseApplication
+   *
+   * This function returns the simple version of the first non-filtered name.
+   *
+   * @return caller's simple name
+   */
+  private static String getCallerSimpleName() {
+    // Get the current callstack so we can pull the class of the caller off of it.
+    final StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
+
+    for (final StackTraceElement elem : stackTrace) {
+      final String className = elem.getClassName();
+      if (!IGNORED_CLASS_NAMES.contains(className)) {
+        // We're only interested in the simple name of the class, not the complete package.
+        final String[] classParts = className.split("\\.");
+        return classParts[classParts.length - 1];
+      }
+    }
+
+    return Logger.class.getSimpleName();
+  }
+
+  private String toMessage(final String format, final Object... args) {
+    return messagePrefix + (args.length > 0 ? String.format(format, args) : format);
+  }
+
+  public void v(final String format, final Object... args) {
+    if (isLoggable(Log.VERBOSE)) {
+      Log.v(tag, toMessage(format, args));
+    }
+  }
+
+  public void v(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.VERBOSE)) {
+      Log.v(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void d(final String format, final Object... args) {
+    if (isLoggable(Log.DEBUG)) {
+      Log.d(tag, toMessage(format, args));
+    }
+  }
+
+  public void d(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.DEBUG)) {
+      Log.d(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void i(final String format, final Object... args) {
+    if (isLoggable(Log.INFO)) {
+      Log.i(tag, toMessage(format, args));
+    }
+  }
+
+  public void i(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.INFO)) {
+      Log.i(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void w(final String format, final Object... args) {
+    if (isLoggable(Log.WARN)) {
+      Log.w(tag, toMessage(format, args));
+    }
+  }
+
+  public void w(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.WARN)) {
+      Log.w(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void e(final String format, final Object... args) {
+    if (isLoggable(Log.ERROR)) {
+      Log.e(tag, toMessage(format, args));
+    }
+  }
+
+  public void e(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.ERROR)) {
+      Log.e(tag, toMessage(format, args), t);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java
new file mode 100644
index 0000000000000000000000000000000000000000..ef15d14daa841bf185d1839393c68c211d1e04d7
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java
@@ -0,0 +1,143 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Bitmap;
+import android.text.TextUtils;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Size class independent of a Camera object.
+ */
+public class Size implements Comparable<Size>, Serializable {
+
+  // 1.4 went out with this UID so we'll need to maintain it to preserve pending queries when
+  // upgrading.
+  public static final long serialVersionUID = 7689808733290872361L;
+
+  public final int width;
+  public final int height;
+
+  public Size(final int width, final int height) {
+    this.width = width;
+    this.height = height;
+  }
+
+  public Size(final Bitmap bmp) {
+    this.width = bmp.getWidth();
+    this.height = bmp.getHeight();
+  }
+
+  /**
+   * Rotate a size by the given number of degrees.
+   * @param size Size to rotate.
+   * @param rotation Degrees {0, 90, 180, 270} to rotate the size.
+   * @return Rotated size.
+   */
+  public static Size getRotatedSize(final Size size, final int rotation) {
+    if (rotation % 180 != 0) {
+      // The phone is portrait, therefore the camera is sideways and frame should be rotated.
+      return new Size(size.height, size.width);
+    }
+    return size;
+  }
+
+  public static Size parseFromString(String sizeString) {
+    if (TextUtils.isEmpty(sizeString)) {
+      return null;
+    }
+
+    sizeString = sizeString.trim();
+
+    // The expected format is "<width>x<height>".
+    final String[] components = sizeString.split("x");
+    if (components.length == 2) {
+      try {
+        final int width = Integer.parseInt(components[0]);
+        final int height = Integer.parseInt(components[1]);
+        return new Size(width, height);
+      } catch (final NumberFormatException e) {
+        return null;
+      }
+    } else {
+      return null;
+    }
+  }
+
+  public static List<Size> sizeStringToList(final String sizes) {
+    final List<Size> sizeList = new ArrayList<Size>();
+    if (sizes != null) {
+      final String[] pairs = sizes.split(",");
+      for (final String pair : pairs) {
+        final Size size = Size.parseFromString(pair);
+        if (size != null) {
+          sizeList.add(size);
+        }
+      }
+    }
+    return sizeList;
+  }
+
+  public static String sizeListToString(final List<Size> sizes) {
+    String sizesString = "";
+    if (sizes != null && sizes.size() > 0) {
+      sizesString = sizes.get(0).toString();
+      for (int i = 1; i < sizes.size(); i++) {
+        sizesString += "," + sizes.get(i).toString();
+      }
+    }
+    return sizesString;
+  }
+
+  public final float aspectRatio() {
+    return (float) width / (float) height;
+  }
+
+  @Override
+  public int compareTo(final Size other) {
+    return width * height - other.width * other.height;
+  }
+
+  @Override
+  public boolean equals(final Object other) {
+    if (other == null) {
+      return false;
+    }
+
+    if (!(other instanceof Size)) {
+      return false;
+    }
+
+    final Size otherSize = (Size) other;
+    return (width == otherSize.width && height == otherSize.height);
+  }
+
+  @Override
+  public int hashCode() {
+    return width * 32713 + height;
+  }
+
+  @Override
+  public String toString() {
+    return dimensionsAsString(width, height);
+  }
+
+  public static final String dimensionsAsString(final int width, final int height) {
+    return width + "x" + height;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java
new file mode 100644
index 0000000000000000000000000000000000000000..459b0a0d4dbae0a9929f1a57d0b1f48b5d96b7ef
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.os.SystemClock;
+
+/**
+ * A simple utility timer for measuring CPU time and wall-clock splits.
+ */
+public class SplitTimer {
+  private final Logger logger;
+
+  private long lastWallTime;
+  private long lastCpuTime;
+
+  public SplitTimer(final String name) {
+    logger = new Logger(name);
+    newSplit();
+  }
+
+  public void newSplit() {
+    lastWallTime = SystemClock.uptimeMillis();
+    lastCpuTime = SystemClock.currentThreadTimeMillis();
+  }
+
+  public void endSplit(final String splitName) {
+    final long currWallTime = SystemClock.uptimeMillis();
+    final long currCpuTime = SystemClock.currentThreadTimeMillis();
+
+    logger.i(
+        "%s: cpu=%dms wall=%dms",
+        splitName, currCpuTime - lastCpuTime, currWallTime - lastWallTime);
+
+    lastWallTime = currWallTime;
+    lastCpuTime = currCpuTime;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
new file mode 100644
index 0000000000000000000000000000000000000000..af6af2bc8f508a70aa7e44a7236f0e7ea5e3d71c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -0,0 +1,421 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.tracking;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Paint.Cap;
+import android.graphics.Paint.Join;
+import android.graphics.Paint.Style;
+import android.graphics.RectF;
+import android.text.TextUtils;
+import android.util.Pair;
+import android.util.TypedValue;
+import android.widget.Toast;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import org.tensorflow.demo.Classifier.Recognition;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+
+/**
+ * A tracker wrapping ObjectTracker that also handles non-max suppression and matching existing
+ * objects to new detections.
+ */
+public class MultiBoxTracker {
+  private final Logger logger = new Logger();
+
+  private static final float TEXT_SIZE_DIP = 18;
+
+  // Maximum percentage of a box that can be overlapped by another box at detection time. Otherwise
+  // the lower scored box (new or old) will be removed.
+  private static final float MAX_OVERLAP = 0.2f;
+
+  private static final float MIN_SIZE = 16.0f;
+
+  // Allow replacement of the tracked box with new results if
+  // correlation has dropped below this level.
+  private static final float MARGINAL_CORRELATION = 0.75f;
+
+  // Consider object to be lost if correlation falls below this threshold.
+  private static final float MIN_CORRELATION = 0.3f;
+
+  private static final int[] COLORS = {
+    Color.BLUE, Color.RED, Color.GREEN, Color.YELLOW, Color.CYAN, Color.MAGENTA, Color.WHITE,
+    Color.parseColor("#55FF55"), Color.parseColor("#FFA500"), Color.parseColor("#FF8888"),
+    Color.parseColor("#AAAAFF"), Color.parseColor("#FFFFAA"), Color.parseColor("#55AAAA"),
+    Color.parseColor("#AA33AA"), Color.parseColor("#0D0068")
+  };
+
+  private final Queue<Integer> availableColors = new LinkedList<Integer>();
+
+  public ObjectTracker objectTracker;
+
+  final List<Pair<Float, RectF>> screenRects = new LinkedList<Pair<Float, RectF>>();
+
+  private static class TrackedRecognition {
+    ObjectTracker.TrackedObject trackedObject;
+    RectF location;
+    float detectionConfidence;
+    int color;
+    String title;
+  }
+
+  private final List<TrackedRecognition> trackedObjects = new LinkedList<TrackedRecognition>();
+
+  private final Paint boxPaint = new Paint();
+
+  private final float textSizePx;
+  private final BorderedText borderedText;
+
+  private Matrix frameToCanvasMatrix;
+
+  private int frameWidth;
+  private int frameHeight;
+
+  private int sensorOrientation;
+  private Context context;
+
+  public MultiBoxTracker(final Context context) {
+    this.context = context;
+    for (final int color : COLORS) {
+      availableColors.add(color);
+    }
+
+    boxPaint.setColor(Color.RED);
+    boxPaint.setStyle(Style.STROKE);
+    boxPaint.setStrokeWidth(12.0f);
+    boxPaint.setStrokeCap(Cap.ROUND);
+    boxPaint.setStrokeJoin(Join.ROUND);
+    boxPaint.setStrokeMiter(100);
+
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, context.getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+  }
+
+  private Matrix getFrameToCanvasMatrix() {
+    return frameToCanvasMatrix;
+  }
+
+  public synchronized void drawDebug(final Canvas canvas) {
+    final Paint textPaint = new Paint();
+    textPaint.setColor(Color.WHITE);
+    textPaint.setTextSize(60.0f);
+
+    final Paint boxPaint = new Paint();
+    boxPaint.setColor(Color.RED);
+    boxPaint.setAlpha(200);
+    boxPaint.setStyle(Style.STROKE);
+
+    for (final Pair<Float, RectF> detection : screenRects) {
+      final RectF rect = detection.second;
+      canvas.drawRect(rect, boxPaint);
+      canvas.drawText("" + detection.first, rect.left, rect.top, textPaint);
+      borderedText.drawText(canvas, rect.centerX(), rect.centerY(), "" + detection.first);
+    }
+
+    if (objectTracker == null) {
+      return;
+    }
+
+    // Draw correlations.
+    for (final TrackedRecognition recognition : trackedObjects) {
+      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+
+      final RectF trackedPos = trackedObject.getTrackedPositionInPreviewFrame();
+
+      if (getFrameToCanvasMatrix().mapRect(trackedPos)) {
+        final String labelString = String.format("%.2f", trackedObject.getCurrentCorrelation());
+        borderedText.drawText(canvas, trackedPos.right, trackedPos.bottom, labelString);
+      }
+    }
+
+    final Matrix matrix = getFrameToCanvasMatrix();
+    objectTracker.drawDebug(canvas, matrix);
+  }
+
+  public synchronized void trackResults(
+      final List<Recognition> results, final byte[] frame, final long timestamp) {
+    logger.i("Processing %d results from %d", results.size(), timestamp);
+    processResults(timestamp, results, frame);
+  }
+
+  public synchronized void draw(final Canvas canvas) {
+    final boolean rotated = sensorOrientation % 180 == 90;
+    final float multiplier =
+        Math.min(canvas.getHeight() / (float) (rotated ? frameWidth : frameHeight),
+                 canvas.getWidth() / (float) (rotated ? frameHeight : frameWidth));
+    frameToCanvasMatrix =
+        ImageUtils.getTransformationMatrix(
+            frameWidth,
+            frameHeight,
+            (int) (multiplier * (rotated ? frameHeight : frameWidth)),
+            (int) (multiplier * (rotated ? frameWidth : frameHeight)),
+            sensorOrientation,
+            false);
+    for (final TrackedRecognition recognition : trackedObjects) {
+      final RectF trackedPos =
+          (objectTracker != null)
+              ? recognition.trackedObject.getTrackedPositionInPreviewFrame()
+              : new RectF(recognition.location);
+
+      getFrameToCanvasMatrix().mapRect(trackedPos);
+      boxPaint.setColor(recognition.color);
+
+      final float cornerSize = Math.min(trackedPos.width(), trackedPos.height()) / 8.0f;
+      canvas.drawRoundRect(trackedPos, cornerSize, cornerSize, boxPaint);
+
+      final String labelString =
+          !TextUtils.isEmpty(recognition.title)
+              ? String.format("%s %.2f", recognition.title, recognition.detectionConfidence)
+              : String.format("%.2f", recognition.detectionConfidence);
+      borderedText.drawText(canvas, trackedPos.left + cornerSize, trackedPos.bottom, labelString);
+    }
+  }
+
+  private boolean initialized = false;
+
+  public synchronized void onFrame(
+      final int w,
+      final int h,
+      final int rowStride,
+      final int sensorOrientation,
+      final byte[] frame,
+      final long timestamp) {
+    if (objectTracker == null && !initialized) {
+      ObjectTracker.clearInstance();
+
+      logger.i("Initializing ObjectTracker: %dx%d", w, h);
+      objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
+      frameWidth = w;
+      frameHeight = h;
+      this.sensorOrientation = sensorOrientation;
+      initialized = true;
+
+      if (objectTracker == null) {
+        String message =
+            "Object tracking support not found. "
+                + "See tensorflow/examples/android/README.md for details.";
+        Toast.makeText(context, message, Toast.LENGTH_LONG).show();
+        logger.e(message);
+      }
+    }
+
+    if (objectTracker == null) {
+      return;
+    }
+
+    objectTracker.nextFrame(frame, null, timestamp, null, true);
+
+    // Clean up any objects not worth tracking any more.
+    final LinkedList<TrackedRecognition> copyList =
+        new LinkedList<TrackedRecognition>(trackedObjects);
+    for (final TrackedRecognition recognition : copyList) {
+      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+      final float correlation = trackedObject.getCurrentCorrelation();
+      if (correlation < MIN_CORRELATION) {
+        logger.v("Removing tracked object %s because NCC is %.2f", trackedObject, correlation);
+        trackedObject.stopTracking();
+        trackedObjects.remove(recognition);
+
+        availableColors.add(recognition.color);
+      }
+    }
+  }
+
+  private void processResults(
+      final long timestamp, final List<Recognition> results, final byte[] originalFrame) {
+    final List<Pair<Float, Recognition>> rectsToTrack = new LinkedList<Pair<Float, Recognition>>();
+
+    screenRects.clear();
+    final Matrix rgbFrameToScreen = new Matrix(getFrameToCanvasMatrix());
+
+    for (final Recognition result : results) {
+      if (result.getLocation() == null) {
+        continue;
+      }
+      final RectF detectionFrameRect = new RectF(result.getLocation());
+
+      final RectF detectionScreenRect = new RectF();
+      rgbFrameToScreen.mapRect(detectionScreenRect, detectionFrameRect);
+
+      logger.v(
+          "Result! Frame: " + result.getLocation() + " mapped to screen:" + detectionScreenRect);
+
+      screenRects.add(new Pair<Float, RectF>(result.getConfidence(), detectionScreenRect));
+
+      if (detectionFrameRect.width() < MIN_SIZE || detectionFrameRect.height() < MIN_SIZE) {
+        logger.w("Degenerate rectangle! " + detectionFrameRect);
+        continue;
+      }
+
+      rectsToTrack.add(new Pair<Float, Recognition>(result.getConfidence(), result));
+    }
+
+    if (rectsToTrack.isEmpty()) {
+      logger.v("Nothing to track, aborting.");
+      return;
+    }
+
+    if (objectTracker == null) {
+      trackedObjects.clear();
+      for (final Pair<Float, Recognition> potential : rectsToTrack) {
+        final TrackedRecognition trackedRecognition = new TrackedRecognition();
+        trackedRecognition.detectionConfidence = potential.first;
+        trackedRecognition.location = new RectF(potential.second.getLocation());
+        trackedRecognition.trackedObject = null;
+        trackedRecognition.title = potential.second.getTitle();
+        trackedRecognition.color = COLORS[trackedObjects.size()];
+        trackedObjects.add(trackedRecognition);
+
+        if (trackedObjects.size() >= COLORS.length) {
+          break;
+        }
+      }
+      return;
+    }
+
+    logger.i("%d rects to track", rectsToTrack.size());
+    for (final Pair<Float, Recognition> potential : rectsToTrack) {
+      handleDetection(originalFrame, timestamp, potential);
+    }
+  }
+
+  private void handleDetection(
+      final byte[] frameCopy, final long timestamp, final Pair<Float, Recognition> potential) {
+    final ObjectTracker.TrackedObject potentialObject =
+        objectTracker.trackObject(potential.second.getLocation(), timestamp, frameCopy);
+
+    final float potentialCorrelation = potentialObject.getCurrentCorrelation();
+    logger.v(
+        "Tracked object went from %s to %s with correlation %.2f",
+        potential.second, potentialObject.getTrackedPositionInPreviewFrame(), potentialCorrelation);
+
+    if (potentialCorrelation < MARGINAL_CORRELATION) {
+      logger.v("Correlation too low to begin tracking %s.", potentialObject);
+      potentialObject.stopTracking();
+      return;
+    }
+
+    final List<TrackedRecognition> removeList = new LinkedList<TrackedRecognition>();
+
+    float maxIntersect = 0.0f;
+
+    // This is the current tracked object whose color we will take. If left null we'll take the
+    // first one from the color queue.
+    TrackedRecognition recogToReplace = null;
+
+    // Look for intersections that will be overridden by this object or an intersection that would
+    // prevent this one from being placed.
+    for (final TrackedRecognition trackedRecognition : trackedObjects) {
+      final RectF a = trackedRecognition.trackedObject.getTrackedPositionInPreviewFrame();
+      final RectF b = potentialObject.getTrackedPositionInPreviewFrame();
+      final RectF intersection = new RectF();
+      final boolean intersects = intersection.setIntersect(a, b);
+
+      final float intersectArea = intersection.width() * intersection.height();
+      final float totalArea = a.width() * a.height() + b.width() * b.height() - intersectArea;
+      final float intersectOverUnion = intersectArea / totalArea;
+
+      // If there is an intersection with this currently tracked box above the maximum overlap
+      // percentage allowed, either the new recognition needs to be dismissed or the old
+      // recognition needs to be removed and possibly replaced with the new one.
+      if (intersects && intersectOverUnion > MAX_OVERLAP) {
+        if (potential.first < trackedRecognition.detectionConfidence
+            && trackedRecognition.trackedObject.getCurrentCorrelation() > MARGINAL_CORRELATION) {
+          // If track for the existing object is still going strong and the detection score was
+          // good, reject this new object.
+          potentialObject.stopTracking();
+          return;
+        } else {
+          removeList.add(trackedRecognition);
+
+          // Let the previously tracked object with max intersection amount donate its color to
+          // the new object.
+          if (intersectOverUnion > maxIntersect) {
+            maxIntersect = intersectOverUnion;
+            recogToReplace = trackedRecognition;
+          }
+        }
+      }
+    }
+
+    // If we're already tracking the max object and no intersections were found to bump off,
+    // pick the worst current tracked object to remove, if it's also worse than this candidate
+    // object.
+    if (availableColors.isEmpty() && removeList.isEmpty()) {
+      for (final TrackedRecognition candidate : trackedObjects) {
+        if (candidate.detectionConfidence < potential.first) {
+          if (recogToReplace == null
+              || candidate.detectionConfidence < recogToReplace.detectionConfidence) {
+            // Save it so that we use this color for the new object.
+            recogToReplace = candidate;
+          }
+        }
+      }
+      if (recogToReplace != null) {
+        logger.v("Found non-intersecting object to remove.");
+        removeList.add(recogToReplace);
+      } else {
+        logger.v("No non-intersecting object found to remove");
+      }
+    }
+
+    // Remove everything that got intersected.
+    for (final TrackedRecognition trackedRecognition : removeList) {
+      logger.v(
+          "Removing tracked object %s with detection confidence %.2f, correlation %.2f",
+          trackedRecognition.trackedObject,
+          trackedRecognition.detectionConfidence,
+          trackedRecognition.trackedObject.getCurrentCorrelation());
+      trackedRecognition.trackedObject.stopTracking();
+      trackedObjects.remove(trackedRecognition);
+      if (trackedRecognition != recogToReplace) {
+        availableColors.add(trackedRecognition.color);
+      }
+    }
+
+    if (recogToReplace == null && availableColors.isEmpty()) {
+      logger.e("No room to track this object, aborting.");
+      potentialObject.stopTracking();
+      return;
+    }
+
+    // Finally safe to say we can track this object.
+    logger.v(
+        "Tracking object %s (%s) with detection confidence %.2f at position %s",
+        potentialObject,
+        potential.second.getTitle(),
+        potential.first,
+        potential.second.getLocation());
+    final TrackedRecognition trackedRecognition = new TrackedRecognition();
+    trackedRecognition.detectionConfidence = potential.first;
+    trackedRecognition.trackedObject = potentialObject;
+    trackedRecognition.title = potential.second.getTitle();
+
+    // Use the color from a replaced object before taking one from the color queue.
+    trackedRecognition.color =
+        recogToReplace != null ? recogToReplace.color : availableColors.poll();
+    trackedObjects.add(trackedRecognition);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
new file mode 100644
index 0000000000000000000000000000000000000000..8b4248d8fbcfa2d58621fb429edbc9498956d273
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
@@ -0,0 +1,661 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.tracking;
+
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.PointF;
+import android.graphics.RectF;
+import android.graphics.Typeface;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Vector;
+import javax.microedition.khronos.opengles.GL10;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.demo.env.Size;
+
+/**
+ * True object detector/tracker class that tracks objects across consecutive preview frames.
+ * It provides a simplified Java interface to the analogous native object defined by
+ * jni/client_vision/tracking/object_tracker.*.
+ *
+ * Currently, the ObjectTracker is a singleton due to native code restrictions, and so must
+ * be allocated by ObjectTracker.getInstance(). In addition, release() should be called
+ * as soon as the ObjectTracker is no longer needed, and before a new one is created.
+ *
+ * nextFrame() should be called as new frames become available, preferably as often as possible.
+ *
+ * After allocation, new TrackedObjects may be instantiated via trackObject(). TrackedObjects
+ * are associated with the ObjectTracker that created them, and are only valid while that
+ * ObjectTracker still exists.
+ */
+public class ObjectTracker {
+  private static final Logger LOGGER = new Logger();
+
+  private static boolean libraryFound = false;
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+      libraryFound = true;
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.e("libtensorflow_demo.so not found, tracking unavailable");
+    }
+  }
+
+  private static final boolean DRAW_TEXT = false;
+
+  /**
+   * How many history points to keep track of and draw in the red history line.
+   */
+  private static final int MAX_DEBUG_HISTORY_SIZE = 30;
+
+  /**
+   * How many frames of optical flow deltas to record.
+   * TODO(andrewharp): Push this down to the native level so it can be polled
+   * efficiently into a an array for upload, instead of keeping a duplicate
+   * copy in Java.
+   */
+  private static final int MAX_FRAME_HISTORY_SIZE = 200;
+
+  private static final int DOWNSAMPLE_FACTOR = 2;
+
+  private final byte[] downsampledFrame;
+
+  protected static ObjectTracker instance;
+
+  private final Map<String, TrackedObject> trackedObjects;
+
+  private long lastTimestamp;
+
+  private FrameChange lastKeypoints;
+
+  private final Vector<PointF> debugHistory;
+
+  private final LinkedList<TimestampedDeltas> timestampedDeltas;
+
+  protected final int frameWidth;
+  protected final int frameHeight;
+  private final int rowStride;
+  protected final boolean alwaysTrack;
+
+  private static class TimestampedDeltas {
+    final long timestamp;
+    final byte[] deltas;
+
+    public TimestampedDeltas(final long timestamp, final byte[] deltas) {
+      this.timestamp = timestamp;
+      this.deltas = deltas;
+    }
+  }
+
+  /**
+   * A simple class that records keypoint information, which includes
+   * local location, score and type. This will be used in calculating
+   * FrameChange.
+   */
+  public static class Keypoint {
+    public final float x;
+    public final float y;
+    public final float score;
+    public final int type;
+
+    public Keypoint(final float x, final float y) {
+      this.x = x;
+      this.y = y;
+      this.score = 0;
+      this.type = -1;
+    }
+
+    public Keypoint(final float x, final float y, final float score, final int type) {
+      this.x = x;
+      this.y = y;
+      this.score = score;
+      this.type = type;
+    }
+
+    Keypoint delta(final Keypoint other) {
+      return new Keypoint(this.x - other.x, this.y - other.y);
+    }
+  }
+
+  /**
+   * A simple class that could calculate Keypoint delta.
+   * This class will be used in calculating frame translation delta
+   * for optical flow.
+   */
+  public static class PointChange {
+    public final Keypoint keypointA;
+    public final Keypoint keypointB;
+    Keypoint pointDelta;
+    private final boolean wasFound;
+
+    public PointChange(final float x1, final float y1,
+                       final float x2, final float y2,
+                       final float score, final int type,
+                       final boolean wasFound) {
+      this.wasFound = wasFound;
+
+      keypointA = new Keypoint(x1, y1, score, type);
+      keypointB = new Keypoint(x2, y2);
+    }
+
+    public Keypoint getDelta() {
+      if (pointDelta == null) {
+        pointDelta = keypointB.delta(keypointA);
+      }
+      return pointDelta;
+    }
+  }
+
+  /** A class that records a timestamped frame translation delta for optical flow. */
+  public static class FrameChange {
+    public static final int KEYPOINT_STEP = 7;
+
+    public final Vector<PointChange> pointDeltas;
+
+    private final float minScore;
+    private final float maxScore;
+
+    public FrameChange(final float[] framePoints) {
+      float minScore = 100.0f;
+      float maxScore = -100.0f;
+
+      pointDeltas = new Vector<PointChange>(framePoints.length / KEYPOINT_STEP);
+
+      for (int i = 0; i < framePoints.length; i += KEYPOINT_STEP) {
+        final float x1 = framePoints[i + 0] * DOWNSAMPLE_FACTOR;
+        final float y1 = framePoints[i + 1] * DOWNSAMPLE_FACTOR;
+
+        final boolean wasFound = framePoints[i + 2] > 0.0f;
+
+        final float x2 = framePoints[i + 3] * DOWNSAMPLE_FACTOR;
+        final float y2 = framePoints[i + 4] * DOWNSAMPLE_FACTOR;
+        final float score = framePoints[i + 5];
+        final int type = (int) framePoints[i + 6];
+
+        minScore = Math.min(minScore, score);
+        maxScore = Math.max(maxScore, score);
+
+        pointDeltas.add(new PointChange(x1, y1, x2, y2, score, type, wasFound));
+      }
+
+      this.minScore = minScore;
+      this.maxScore = maxScore;
+    }
+  }
+
+  public static synchronized ObjectTracker getInstance(
+      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    if (!libraryFound) {
+      LOGGER.e(
+          "Native object tracking support not found. "
+              + "See tensorflow/examples/android/README.md for details.");
+      return null;
+    }
+
+    if (instance == null) {
+      instance = new ObjectTracker(frameWidth, frameHeight, rowStride, alwaysTrack);
+      instance.init();
+    } else {
+      throw new RuntimeException(
+          "Tried to create a new objectracker before releasing the old one!");
+    }
+    return instance;
+  }
+
+  public static synchronized void clearInstance() {
+    if (instance != null) {
+      instance.release();
+    }
+  }
+
+  protected ObjectTracker(
+      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    this.frameWidth = frameWidth;
+    this.frameHeight = frameHeight;
+    this.rowStride = rowStride;
+    this.alwaysTrack = alwaysTrack;
+    this.timestampedDeltas = new LinkedList<TimestampedDeltas>();
+
+    trackedObjects = new HashMap<String, TrackedObject>();
+
+    debugHistory = new Vector<PointF>(MAX_DEBUG_HISTORY_SIZE);
+
+    downsampledFrame =
+        new byte
+            [(frameWidth + DOWNSAMPLE_FACTOR - 1)
+                / DOWNSAMPLE_FACTOR
+                * (frameWidth + DOWNSAMPLE_FACTOR - 1)
+                / DOWNSAMPLE_FACTOR];
+  }
+
+  protected void init() {
+    // The native tracker never sees the full frame, so pre-scale dimensions
+    // by the downsample factor.
+    initNative(frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, alwaysTrack);
+  }
+
+  private final float[] matrixValues = new float[9];
+
+  private long downsampledTimestamp;
+
+  @SuppressWarnings("unused")
+  public synchronized void drawOverlay(final GL10 gl,
+      final Size cameraViewSize, final Matrix matrix) {
+    final Matrix tempMatrix = new Matrix(matrix);
+    tempMatrix.preScale(DOWNSAMPLE_FACTOR, DOWNSAMPLE_FACTOR);
+    tempMatrix.getValues(matrixValues);
+    drawNative(cameraViewSize.width, cameraViewSize.height, matrixValues);
+  }
+
+  public synchronized void nextFrame(
+      final byte[] frameData, final byte[] uvData,
+      final long timestamp, final float[] transformationMatrix,
+      final boolean updateDebugInfo) {
+    if (downsampledTimestamp != timestamp) {
+      ObjectTracker.downsampleImageNative(
+          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
+      downsampledTimestamp = timestamp;
+    }
+
+    // Do Lucas Kanade using the fullframe initializer.
+    nextFrameNative(downsampledFrame, uvData, timestamp, transformationMatrix);
+
+    timestampedDeltas.add(new TimestampedDeltas(timestamp, getKeypointsPacked(DOWNSAMPLE_FACTOR)));
+    while (timestampedDeltas.size() > MAX_FRAME_HISTORY_SIZE) {
+      timestampedDeltas.removeFirst();
+    }
+
+    for (final TrackedObject trackedObject : trackedObjects.values()) {
+      trackedObject.updateTrackedPosition();
+    }
+
+    if (updateDebugInfo) {
+      updateDebugHistory();
+    }
+
+    lastTimestamp = timestamp;
+  }
+
+  public synchronized void release() {
+    releaseMemoryNative();
+    synchronized (ObjectTracker.class) {
+      instance = null;
+    }
+  }
+
+  private void drawHistoryDebug(final Canvas canvas) {
+    drawHistoryPoint(
+        canvas, frameWidth * DOWNSAMPLE_FACTOR / 2, frameHeight * DOWNSAMPLE_FACTOR / 2);
+  }
+
+  private void drawHistoryPoint(final Canvas canvas, final float startX, final float startY) {
+    final Paint p = new Paint();
+    p.setAntiAlias(false);
+    p.setTypeface(Typeface.SERIF);
+
+    p.setColor(Color.RED);
+    p.setStrokeWidth(2.0f);
+
+    // Draw the center circle.
+    p.setColor(Color.GREEN);
+    canvas.drawCircle(startX, startY, 3.0f, p);
+
+    p.setColor(Color.RED);
+
+    // Iterate through in backwards order.
+    synchronized (debugHistory) {
+      final int numPoints = debugHistory.size();
+      float lastX = startX;
+      float lastY = startY;
+      for (int keypointNum = 0; keypointNum < numPoints; ++keypointNum) {
+        final PointF delta = debugHistory.get(numPoints - keypointNum - 1);
+        final float newX = lastX + delta.x;
+        final float newY = lastY + delta.y;
+        canvas.drawLine(lastX, lastY, newX, newY, p);
+        lastX = newX;
+        lastY = newY;
+      }
+    }
+  }
+
+  private static int floatToChar(final float value) {
+    return Math.max(0, Math.min((int) (value * 255.999f), 255));
+  }
+
+  private void drawKeypointsDebug(final Canvas canvas) {
+    final Paint p = new Paint();
+    if (lastKeypoints == null) {
+      return;
+    }
+    final int keypointSize = 3;
+
+    final float minScore = lastKeypoints.minScore;
+    final float maxScore = lastKeypoints.maxScore;
+
+    for (final PointChange keypoint : lastKeypoints.pointDeltas) {
+      if (keypoint.wasFound) {
+        final int r =
+            floatToChar((keypoint.keypointA.score - minScore) / (maxScore - minScore));
+        final int b =
+            floatToChar(1.0f - (keypoint.keypointA.score - minScore) / (maxScore - minScore));
+
+        final int color = 0xFF000000 | (r << 16) | b;
+        p.setColor(color);
+
+        final float[] screenPoints = {keypoint.keypointA.x, keypoint.keypointA.y,
+                                      keypoint.keypointB.x, keypoint.keypointB.y};
+        canvas.drawRect(screenPoints[2] - keypointSize,
+                        screenPoints[3] - keypointSize,
+                        screenPoints[2] + keypointSize,
+                        screenPoints[3] + keypointSize, p);
+        p.setColor(Color.CYAN);
+        canvas.drawLine(screenPoints[2], screenPoints[3],
+                        screenPoints[0], screenPoints[1], p);
+
+        if (DRAW_TEXT) {
+          p.setColor(Color.WHITE);
+          canvas.drawText(keypoint.keypointA.type + ": " + keypoint.keypointA.score,
+              keypoint.keypointA.x, keypoint.keypointA.y, p);
+        }
+      } else {
+        p.setColor(Color.YELLOW);
+        final float[] screenPoint = {keypoint.keypointA.x, keypoint.keypointA.y};
+        canvas.drawCircle(screenPoint[0], screenPoint[1], 5.0f, p);
+      }
+    }
+  }
+
+  private synchronized PointF getAccumulatedDelta(final long timestamp, final float positionX,
+      final float positionY, final float radius) {
+    final RectF currPosition = getCurrentPosition(timestamp,
+        new RectF(positionX - radius, positionY - radius, positionX + radius, positionY + radius));
+    return new PointF(currPosition.centerX() - positionX, currPosition.centerY() - positionY);
+  }
+
+  private synchronized RectF getCurrentPosition(final long timestamp, final RectF
+      oldPosition) {
+    final RectF downscaledFrameRect = downscaleRect(oldPosition);
+
+    final float[] delta = new float[4];
+    getCurrentPositionNative(timestamp, downscaledFrameRect.left, downscaledFrameRect.top,
+        downscaledFrameRect.right, downscaledFrameRect.bottom, delta);
+
+    final RectF newPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
+
+    return upscaleRect(newPosition);
+  }
+
+  private void updateDebugHistory() {
+    lastKeypoints = new FrameChange(getKeypointsNative(false));
+
+    if (lastTimestamp == 0) {
+      return;
+    }
+
+    final PointF delta =
+        getAccumulatedDelta(
+            lastTimestamp, frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, 100);
+
+    synchronized (debugHistory) {
+      debugHistory.add(delta);
+
+      while (debugHistory.size() > MAX_DEBUG_HISTORY_SIZE) {
+        debugHistory.remove(0);
+      }
+    }
+  }
+
+  public synchronized void drawDebug(final Canvas canvas, final Matrix frameToCanvas) {
+    canvas.save();
+    canvas.setMatrix(frameToCanvas);
+
+    drawHistoryDebug(canvas);
+    drawKeypointsDebug(canvas);
+
+    canvas.restore();
+  }
+
+  public Vector<String> getDebugText() {
+    final Vector<String> lines = new Vector<String>();
+
+    if (lastKeypoints != null) {
+      lines.add("Num keypoints " + lastKeypoints.pointDeltas.size());
+      lines.add("Min score: " + lastKeypoints.minScore);
+      lines.add("Max score: " + lastKeypoints.maxScore);
+    }
+
+    return lines;
+  }
+
+  public synchronized List<byte[]> pollAccumulatedFlowData(final long endFrameTime) {
+    final List<byte[]> frameDeltas = new ArrayList<byte[]>();
+    while (timestampedDeltas.size() > 0) {
+      final TimestampedDeltas currentDeltas = timestampedDeltas.peek();
+      if (currentDeltas.timestamp <= endFrameTime) {
+        frameDeltas.add(currentDeltas.deltas);
+        timestampedDeltas.removeFirst();
+      } else {
+        break;
+      }
+    }
+
+    return frameDeltas;
+  }
+
+  private RectF downscaleRect(final RectF fullFrameRect) {
+    return new RectF(
+        fullFrameRect.left / DOWNSAMPLE_FACTOR,
+        fullFrameRect.top / DOWNSAMPLE_FACTOR,
+        fullFrameRect.right / DOWNSAMPLE_FACTOR,
+        fullFrameRect.bottom / DOWNSAMPLE_FACTOR);
+  }
+
+  private RectF upscaleRect(final RectF downsampledFrameRect) {
+    return new RectF(
+        downsampledFrameRect.left * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.top * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.right * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.bottom * DOWNSAMPLE_FACTOR);
+  }
+
+  /**
+   * A TrackedObject represents a native TrackedObject, and provides access to the
+   * relevant native tracking information available after every frame update. They may
+   * be safely passed around and accessed externally, but will become invalid after
+   * stopTracking() is called or the related creating ObjectTracker is deactivated.
+   *
+   * @author andrewharp@google.com (Andrew Harp)
+   */
+  public class TrackedObject {
+    private final String id;
+
+    private long lastExternalPositionTime;
+
+    private RectF lastTrackedPosition;
+    private boolean visibleInLastFrame;
+
+    private boolean isDead;
+
+    TrackedObject(final RectF position, final long timestamp, final byte[] data) {
+      isDead = false;
+
+      id = Integer.toString(this.hashCode());
+
+      lastExternalPositionTime = timestamp;
+
+      synchronized (ObjectTracker.this) {
+        registerInitialAppearance(position, data);
+        setPreviousPosition(position, timestamp);
+        trackedObjects.put(id, this);
+      }
+    }
+
+    public void stopTracking() {
+      checkValidObject();
+
+      synchronized (ObjectTracker.this) {
+        isDead = true;
+        forgetNative(id);
+        trackedObjects.remove(id);
+      }
+    }
+
+    public float getCurrentCorrelation() {
+      checkValidObject();
+      return ObjectTracker.this.getCurrentCorrelation(id);
+    }
+
+    void registerInitialAppearance(final RectF position, final byte[] data) {
+      final RectF externalPosition = downscaleRect(position);
+      registerNewObjectWithAppearanceNative(id,
+            externalPosition.left, externalPosition.top,
+            externalPosition.right, externalPosition.bottom,
+            data);
+    }
+
+    synchronized void setPreviousPosition(final RectF position, final long timestamp) {
+      checkValidObject();
+      synchronized (ObjectTracker.this) {
+        if (lastExternalPositionTime > timestamp) {
+          LOGGER.w("Tried to use older position time!");
+          return;
+        }
+        final RectF externalPosition = downscaleRect(position);
+        lastExternalPositionTime = timestamp;
+
+        setPreviousPositionNative(id,
+            externalPosition.left, externalPosition.top,
+            externalPosition.right, externalPosition.bottom,
+            lastExternalPositionTime);
+
+        updateTrackedPosition();
+      }
+    }
+
+    void setCurrentPosition(final RectF position) {
+      checkValidObject();
+      final RectF downsampledPosition = downscaleRect(position);
+      synchronized (ObjectTracker.this) {
+        setCurrentPositionNative(id,
+            downsampledPosition.left, downsampledPosition.top,
+            downsampledPosition.right, downsampledPosition.bottom);
+      }
+    }
+
+    private synchronized void updateTrackedPosition() {
+      checkValidObject();
+
+      final float[] delta = new float[4];
+      getTrackedPositionNative(id, delta);
+      lastTrackedPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
+
+      visibleInLastFrame = isObjectVisible(id);
+    }
+
+    public synchronized RectF getTrackedPositionInPreviewFrame() {
+      checkValidObject();
+
+      if (lastTrackedPosition == null) {
+        return null;
+      }
+      return upscaleRect(lastTrackedPosition);
+    }
+
+    synchronized long getLastExternalPositionTime() {
+      return lastExternalPositionTime;
+    }
+
+    public synchronized boolean visibleInLastPreviewFrame() {
+      return visibleInLastFrame;
+    }
+
+    private void checkValidObject() {
+      if (isDead) {
+        throw new RuntimeException("TrackedObject already removed from tracking!");
+      } else if (ObjectTracker.this != instance) {
+        throw new RuntimeException("TrackedObject created with another ObjectTracker!");
+      }
+    }
+  }
+
+  public synchronized TrackedObject trackObject(
+      final RectF position, final long timestamp, final byte[] frameData) {
+    if (downsampledTimestamp != timestamp) {
+      ObjectTracker.downsampleImageNative(
+          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
+      downsampledTimestamp = timestamp;
+    }
+    return new TrackedObject(position, timestamp, downsampledFrame);
+  }
+
+  public synchronized TrackedObject trackObject(final RectF position, final byte[] frameData) {
+    return new TrackedObject(position, lastTimestamp, frameData);
+  }
+
+  /** ********************* NATIVE CODE ************************************ */
+
+  /** This will contain an opaque pointer to the native ObjectTracker */
+  private long nativeObjectTracker;
+
+  private native void initNative(int imageWidth, int imageHeight, boolean alwaysTrack);
+
+  protected native void registerNewObjectWithAppearanceNative(
+      String objectId, float x1, float y1, float x2, float y2, byte[] data);
+
+  protected native void setPreviousPositionNative(
+      String objectId, float x1, float y1, float x2, float y2, long timestamp);
+
+  protected native void setCurrentPositionNative(
+      String objectId, float x1, float y1, float x2, float y2);
+
+  protected native void forgetNative(String key);
+
+  protected native String getModelIdNative(String key);
+
+  protected native boolean haveObject(String key);
+  protected native boolean isObjectVisible(String key);
+  protected native float getCurrentCorrelation(String key);
+
+  protected native float getMatchScore(String key);
+
+  protected native void getTrackedPositionNative(String key, float[] points);
+
+  protected native void nextFrameNative(
+      byte[] frameData, byte[] uvData, long timestamp, float[] frameAlignMatrix);
+
+  protected native void releaseMemoryNative();
+
+  protected native void getCurrentPositionNative(long timestamp,
+      final float positionX1, final float positionY1,
+      final float positionX2, final float positionY2,
+      final float[] delta);
+
+  protected native byte[] getKeypointsPacked(float scaleFactor);
+
+  protected native float[] getKeypointsNative(boolean onlyReturnCorrespondingKeypoints);
+
+  protected native void drawNative(int viewWidth, int viewHeight, float[] frameToCanvas);
+
+  protected static native void downsampleImageNative(
+      int width, int height, int rowStride, byte[] input, int factor, byte[] output);
+}
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index b0236e9c608ec35437bcfe79c51149a76f9f416e..98d3b5bb8ad45bf34f6996b3361291896a451a6f 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -326,10 +326,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
@@ -373,10 +369,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 959347b5491514ddc13af57ea6f7385a0d39e418..9322e186a280e932a2441ab16ac8579d9ab67ee2 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -69,15 +69,3 @@ cc_library(
 #         "//testing/base/public:gunit",
 #     ],
 # )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index a91467d345fdce1268635a69a96939921dc170e8..456c5c6dc782f4e21a5062e353635117a39cacb9 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <cstdio>
 #include <cstdlib>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <memory>
 #include <sstream>
@@ -70,6 +71,23 @@ TfLiteStatus ReadLabelsFile(const string& file_name,
   return kTfLiteOk;
 }
 
+void PrintProfilingInfo(const profiling::ProfileEvent* e, uint32_t op_index,
+                        TfLiteRegistration registration) {
+  // output something like
+  // time (ms) , Node xxx, OpCode xxx, symblic name
+  //      5.352, Node   5, OpCode   4, DEPTHWISE_CONV_2D
+
+
+  LOG(INFO) << std::fixed << std::setw(10) << std::setprecision(3)
+            << (e->end_timestamp_us - e->begin_timestamp_us) / 1000.0
+            << ", Node " << std::setw(3) << std::setprecision(3) << op_index
+            << ", OpCode " << std::setw(3) << std::setprecision(3)
+            << registration.builtin_code << ", "
+            << EnumNameBuiltinOperator(
+                   (BuiltinOperator)registration.builtin_code)
+            << "\n";
+}
+
 void RunInference(Settings* s) {
   if (!s->model_name.c_str()) {
     LOG(ERROR) << "no model file name\n";
@@ -166,6 +184,11 @@ void RunInference(Settings* s) {
       exit(-1);
   }
 
+  profiling::Profiler* profiler = new profiling::Profiler();
+  interpreter->SetProfiler(profiler);
+
+  if (s->profiling) profiler->StartProfiling();
+
   struct timeval start_time, stop_time;
   gettimeofday(&start_time, NULL);
   for (int i = 0; i < s->loop_count; i++) {
@@ -179,6 +202,18 @@ void RunInference(Settings* s) {
             << (get_us(stop_time) - get_us(start_time)) / (s->loop_count * 1000)
             << " ms \n";
 
+  if (s->profiling) {
+    profiler->StopProfiling();
+    auto profile_events = profiler->GetProfileEvents();
+    for (int i = 0; i < profile_events.size(); i++) {
+      auto op_index = profile_events[i]->event_metadata;
+      const auto node_and_registration =
+          interpreter->node_and_registration(op_index);
+      const TfLiteRegistration registration = node_and_registration->second;
+      PrintProfilingInfo(profile_events[i], op_index, registration);
+    }
+  }
+
   const int output_size = 1000;
   const size_t num_results = 5;
   const float threshold = 0.001f;
@@ -217,13 +252,14 @@ void RunInference(Settings* s) {
 
 void display_usage() {
   LOG(INFO) << "label_image\n"
-            << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
+            << "--accelerated, -a: [0|1], use Android NNAPI or not\n"
             << "--count, -c: loop interpreter->Invoke() for certain times\n"
             << "--input_mean, -b: input mean\n"
             << "--input_std, -s: input standard deviation\n"
             << "--image, -i: image_name.bmp\n"
             << "--labels, -l: labels for the model\n"
             << "--tflite_model, -m: model_name.tflite\n"
+            << "--profiling, -p: [0|1], profiling or not\n"
             << "--threads, -t: number of threads\n"
             << "--verbose, -v: [0|1] print more information\n"
             << "\n";
@@ -241,6 +277,7 @@ int Main(int argc, char** argv) {
         {"image", required_argument, 0, 'i'},
         {"labels", required_argument, 0, 'l'},
         {"tflite_model", required_argument, 0, 'm'},
+        {"profiling", required_argument, 0, 'p'},
         {"threads", required_argument, 0, 't'},
         {"input_mean", required_argument, 0, 'b'},
         {"input_std", required_argument, 0, 's'},
@@ -249,7 +286,7 @@ int Main(int argc, char** argv) {
     /* getopt_long stores the option index here. */
     int option_index = 0;
 
-    c = getopt_long(argc, argv, "a:b:c:f:i:l:m:s:t:v:", long_options,
+    c = getopt_long(argc, argv, "a:b:c:f:i:l:m:p:s:t:v:", long_options,
                     &option_index);
 
     /* Detect the end of the options. */
@@ -276,6 +313,10 @@ int Main(int argc, char** argv) {
       case 'm':
         s.model_name = optarg;
         break;
+      case 'p':
+        s.profiling = strtol(  // NOLINT(runtime/deprecated_fn)
+            optarg, (char**)NULL, 10);
+        break;
       case 's':
         s.input_std = strtod(optarg, NULL);
         break;
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h
index 4de32e33fb4ef2ab5d0e111886cdc737398147e9..4b48014e1c77eca1eca081f0fe906441a5dcce22 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.h
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.h
@@ -25,6 +25,7 @@ struct Settings {
   bool verbose = false;
   bool accel = false;
   bool input_floating = false;
+  bool profiling = false;
   int loop_count = 1;
   float input_mean = 127.5f;
   float input_std = 127.5f;
diff --git a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg b/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg
deleted file mode 100644
index bc83946647c6a923a8a0bd3a041b42e4febe6a31..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg and /dev/null differ
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index fe208e47d1ac10995881e55c8596ae14ff4242df..50cc146a87ee9ab94aea6a92fb2fb5c531f83369 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -29,7 +29,7 @@ interpreter->AllocateTensors();
 float* input = interpreter->typed_input_tensor<float>(0);
 // Fill `input`.
 interpreter->Invoke();
-float* output = interpreter->type_output_tensor<float>(0);
+float* output = interpreter->typed_output_tensor<float>(0);
 ```
 ### Data Alignment
 
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
index a359b8d4b481dbc15cc86db14eabda5433722b8b..e0358a444d6dffc377bf13ee72ba5477359d6e07 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -22,6 +22,15 @@ Then install
 brew install automake
 brew install libtool
 ```
+If you get an error where either automake or libtool install but do not link correctly, you'll first need to:
+```bash
+sudo chown -R $(whoami) /usr/local/*
+```
+Then follow the instructions to perform the linking:
+```bash
+brew link automake
+brew link libtool
+```
 
 Then you need to run a shell script to download the dependencies you need:
 
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
index 5b393140d61544e6d6e40d4b6ee1872b22cc84b2..d8134d5a00097b3eef24d5583d7f114c34e3bef2 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -1,7 +1,13 @@
-#List of Hosted Models
+# List of Hosted Models
 
-*   [Inception V3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
-*   [Inception V3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+*   [NASNet large](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_large_2018_03_27.zip)
+*   [NASNet mobile](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_mobile_2018_03_27.zip)
+*   [ResNet v2 101](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_101_2018_03_27.zip)
+*   [ResNet v2 50](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_50_2018_03_27.zip)
+*   [Inception ResNet v2](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_resnet_v2_2018_03_27.zip)
+*   [Inception v4](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v4_2018_03_27.zip)
+*   [Inception v3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
+*   [Inception v3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
 *   [Mobilenet 0.25 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_128_float_2017_11_08.zip)
 *   [Mobilenet 0.25 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_160_float_2017_11_08.zip)
 *   [Mobilenet 0.25 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_192_float_2017_11_08.zip)
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
new file mode 100644
index 0000000000000000000000000000000000000000..7a3a231626d0e1c71e474ff4ff16789ebe2901db
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/rpi.md
@@ -0,0 +1,50 @@
+# TensorFlow Lite for Raspberry Pi
+
+## Cross compiling
+### Installing toolchian
+This has been tested on Ubuntu 16.04.3 64bit and Tensorflow devel docker image [tensorflow/tensorflow:nightly-devel](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
+
+To cross compiling TensorFlow Lite. First you should install the toolchain and libs.
+```bash
+sudo apt-get update
+sudo apt-get install crossbuild-essential-armhf
+```
+> If you are using docker, you may not use `sudo`
+
+### Building
+Clone this Tensorflow repository, Run this script at the root of the repository to download all the dependencies:
+> The Tensorflow repository is in `/tensorflow` if you are using `tensorflow/tensorflow:nightly-devel` docker image, just try it.
+```bash
+./tensorflow/contrib/lite/download_dependencies.sh
+```
+Note than you only need to to this once.
+
+You should then be able to compile:
+```bash
+./tensorflow/contrib/lite/build_rpi_lib.sh
+```
+
+This should compile a static library in:
+`tensorflow/contrib/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
+
+## Native compiling
+This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1).
+
+Log in to you RPI, install the toolchain.
+```bash
+sudo apt-get instal build-essential
+```
+
+First, clone this TensorFlow repository. Run this at the root of the repository:
+```bash
+./tensorflow/contrib/lite/download_dependencies.sh
+```
+Note than you only need to to this once.
+
+You should then be able to compile:
+```bash
+./tensorflow/contrib/lite/build_rpi_lib.sh
+```
+
+This should compile a static library in:
+`tensorflow/contrib/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index b1bbb7c67013acfb575cc1e9f9390ba191cbd08e..aa28f8d050944e3b4ad8be91871388b32f593e2d 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -30,13 +30,18 @@ quantized training is necessary before conversion.
 ## Data Format and Broadcasting
 
 At the moment TensorFlow Lite supports only TensorFlow's "NHWC" format, and
-broadcasting in operations like tf.add and tf.mul is generally not supported.
+broadcasting is only support in a limited number of ops (tf.add, tf.mul, tf.sub,
+and tf.div).
 
 ## Compatible Operations
 
 The following TensorFlow operations are usually mapped to their TensorFlow Lite
 counterparts:
 
+*   [tf.batch_to_space_nd](https://www.tensorflow.org/api_docs/python/tf/batch_to_space_nd) -
+    *as long as the input tensor is 4D (1 batch + 2 spatial + 1 other) and the
+    crops attribute is not used*
+*   [tf.exp](https://www.tensorflow.org/api_docs/python/tf/exp)
 *   [tf.matmul](https://www.tensorflow.org/api_docs/python/tf/matmul) - *as long
     as the second argument is constant and transposition is not used*
 *   [tf.nn.avg_pool](https://www.tensorflow.org/api_docs/python/tf/nn/avg_pool)
@@ -47,12 +52,30 @@ counterparts:
 *   [tf.nn.l2_normalize](https://www.tensorflow.org/api_docs/python/tf/nn/l2_normalize) -
     *as long as normalization is done along the last dimension*
 *   [tf.nn.local_response_normalization](https://www.tensorflow.org/api_docs/python/tf/nn/local_response_normalization)
+*   [tf.nn.log_softmax](https://www.tensorflow.org/api_docs/python/tf/nn/log_softmax) -
+    *as long as axis is not provided*
 *   [tf.nn.max_pool](https://www.tensorflow.org/api_docs/python/tf/nn/max_pool)
 *   [tf.nn.softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) -
     *as long as tensors are 2D and axis is the last dimension*
+*   [tf.nn.top_k](https://www.tensorflow.org/api_docs/python/tf/nn/top_k)
+*   [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad) - *as long as
+    mode and constant_values are not used*
+*   [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean) -
+    *as long as the reduction_indices attribute is not used*
 *   [tf.reshape](https://www.tensorflow.org/api_docs/python/tf/reshape)
 *   [tf.sigmoid](https://www.tensorflow.org/api_docs/python/tf/sigmoid)
+*   [tf.space_to_batch_nd](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd) -
+    *as long as the input tensor is 4D (1 batch + 2 spatial + 1 other)*
 *   [tf.space_to_depth](https://www.tensorflow.org/api_docs/python/tf/space_to_depth)
+*   [tf.split](https://www.tensorflow.org/api_docs/python/tf/split) - *as long
+    as num is not provided and num_or_size_split contains number of splits as a
+    0D tensor*
+*   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze) - *as
+    long as axis is not provided*
+*   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice) -
+    *as long as ellipsis_mask and new_axis_mask are not used*
+*   [tf.transpose](https://www.tensorflow.org/versions/master/api_docs/python/tf/transpose) -
+    *as long as conjugate is not used*
 
 ## Straightforward Conversions, Constant-Folding and Fusing
 
@@ -91,7 +114,6 @@ Here is a list of TensorFlow operations that are usually removed from the graph:
 *   [tf.shape](https://www.tensorflow.org/api_docs/python/tf/shape)
 *   [tf.sqrt](https://www.tensorflow.org/api_docs/python/tf/sqrt)
 *   [tf.square](https://www.tensorflow.org/api_docs/python/tf/square)
-*   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze)
 *   [tf.subtract](https://www.tensorflow.org/api_docs/python/tf/subtract)
 *   [tf.tile](https://www.tensorflow.org/api_docs/python/tf/tile)
 *   [tf.nn.batch_norm_with_global_normalization](https://www.tensorflow.org/api_docs/python/tf/nn/batch_norm_with_global_normalization)
@@ -109,17 +131,10 @@ fused.
 TensorFlow operation not listed above are likely unsupported. Notably, the
 following common ops are not supported at the moment:
 
-*   [tf.batch_to_space_nd](https://www.tensorflow.org/api_docs/python/tf/batch_to_space_nd)
 *   [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space)
-*   [tf.floor](https://www.tensorflow.org/api_docs/python/tf/floor)
 *   [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather)
 *   [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear)
-*   [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad)
-*   [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)
 *   [tf.slice](https://www.tensorflow.org/api_docs/python/tf/slice)
-*   [tf.space_to_batch_nd](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd)
-*   [tf.split](https://www.tensorflow.org/api_docs/python/tf/split)
-*   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice)
 *   [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh)
 
 ## TensorFlow Lite Operations
@@ -160,6 +175,20 @@ Options {
 }
 ```
 
+**BATCH_TO_SPACE_ND**
+
+```
+Inputs {
+  0: 4D tensor
+  1: 1D tensor
+  2: 2D tensor
+}
+Outputs {
+  0: tensor rearranged using block_shape. See tf.batch_to_space_nd for
+     details.
+}
+```
+
 **CONCATENATION**
 
 ```
@@ -213,6 +242,28 @@ Options {
 }
 ```
 
+**EXP**
+
+```
+Inputs {
+  0: tensor
+}
+Outputs {
+  0: result of computing element-wise exponential of the input tensor
+}
+```
+
+**FLOOR**
+
+```
+inputs {
+  0: tensor
+}
+outputs: {
+  0: result of computing element-wise floor of the input tensor
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -261,6 +312,19 @@ Options {
 }
 ```
 
+**LESS**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is less
+  than the corresponding element of the second tensor.
+}
+```
+
 **LOCAL_RESPONSE_NORMALIZATION**
 
 ```
@@ -289,6 +353,17 @@ Outputs {
 }
 ```
 
+**LOG_SOFTMAX**
+
+```
+Inputs {
+  0: tensor
+}
+Outputs {
+  0: tensor equivalent to logits - log(reduce_sum(exp(logits), -1))
+}
+```
+
 **MAX_POOL_2D**
 
 ```
@@ -322,6 +397,34 @@ Options {
 }
 ```
 
+**PAD**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+}
+Outputs {
+  0: tensor where additional values are added before and after the contents of
+     each dimension
+}
+```
+
+**MEAN (tf.reduce_mean)**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+}
+Outputs {
+  0: tensor containing the mean of the elements
+}
+Options {
+  keep_dims: whether to retain reduced dimensions
+}
+```
+
 **RELU**
 
 ```
@@ -399,6 +502,93 @@ Options {
 }
 ```
 
+**SPACE_TO_BATCH_ND**
+
+```
+Inputs {
+  0: 4D tensor
+  1: 1D tensor
+  2: 2D tensor
+}
+Outputs {
+  0: a tensor rearranged using block_shape. See tf.space_to_batch_nd for
+     details.
+}
+```
+
+**SPLIT**
+
+```
+Inputs {
+  0: 0D tensor (axis)
+  1: tensor (input)
+}
+Outputs {
+  0-N: subtensors built from the input tensors
+}
+Options {
+  num_splits: Specifies number of outputs
+}
+```
+
+**SQUEEZE**
+
+```
+Inputs {
+  0: tensor
+}
+Outputs {
+  0: tensor without any dimensions of size 1
+}
+Options {
+  squeeze_dims
+}
+```
+
+**STRIDED_SLICE**
+
+```
+Inputs {
+  0: tensor
+  1: 1D tensor
+  2: 1D tensor
+  3: 1D tensor
+}
+Outputs {
+  0: slice of the input tensor of the given size
+}
+Options {
+  begin_mask: mask for begin indicies
+  end_mask: mask for end indices
+  shrink_axis_mask: mask that indicates which dimensions to remove
+}
+```
+
+**TOP_K**
+
+```
+Inputs {
+  0: tensor
+  1: OD tensor
+}
+Outputs {
+  0: k largest element along each last dimensional slice
+  1: indicies of values within the last dimension of the input ensor
+}
+```
+
+**TRANSPOSE**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+}
+Outputs {
+  0: tensor permuted according to perm
+}
+```
+
 And these are TensorFlow Lite operations that are present but not ready for
 custom models yet:
 
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 028449211b8108d004df4d1cd8a58b4a08df6604..9d8ea55fd1edc0dacc821536cc2b564c59f65b71 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -14,27 +14,47 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/interpreter.h"
+
 #include <cassert>
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+
 #include "tensorflow/contrib/lite/arena_planner.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/graph_info.h"
+#include "tensorflow/contrib/lite/kernels/eigen_support.h"
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
 #include "tensorflow/contrib/lite/memory_planner.h"
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
+#include "tensorflow/contrib/lite/profiling/profiler.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/util.h"
+
+namespace tflite {
 
 namespace {
 
-// std::vector preallocation tuning.
-constexpr const int kSlotsToReserve = 128;
+// Stub method which returns kTfLiteError when the function is forbidden.
+// We're registrating this function to several different function to save
+// compiled binary size. Please note the restrictions:
+// * The type of first parameter have to be `TfLiteContext*`.
+// * All paramteters must be trivailly destructible. (E.g. No C++ class)
+TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
+  context->ReportError(context,
+                       "The function is forbidden if not calling in delegate.");
+  return kTfLiteError;
+}
+
+// Set the ForbiddenContextFunction to a compatible function pointer.
+template <typename FunctionType>
+void SetForbiddenContextFunction(FunctionType* func) {
+  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
+}
 
 }  // namespace
 
-namespace tflite {
-
 // A trivial implementation of GraphInfo around the Interpreter.
 // NOTE: this interpreter info represents the subset of the
 // graph that is executed according to execution plan. Thus,
@@ -76,16 +96,18 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   context_.AddTensors = AddTensors;
   context_.tensors = nullptr;
   context_.tensors_size = 0;
+  context_.eigen_context = nullptr;
   context_.gemm_context = nullptr;
+  context_.recommended_num_threads = -1;
 
   // Invalid to call these these except from TfLiteDelegate
-  context_.GetNodeAndRegistration = nullptr;
-  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
-  context_.GetExecutionPlan = nullptr;
+  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
+  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
+  SetForbiddenContextFunction(&context_.GetExecutionPlan);
 
   // Reserve some space for the tensors to avoid excessive resizing.
-  tensors_.reserve(kSlotsToReserve);
-  nodes_and_registration_.reserve(kSlotsToReserve);
+  tensors_.reserve(kTensorsReservedCapacity);
+  nodes_and_registration_.reserve(kTensorsReservedCapacity);
   next_execution_plan_index_to_prepare_ = 0;
   UseNNAPI(false);
 }
@@ -102,19 +124,102 @@ Interpreter::~Interpreter() {
   }
 
   for (int i = 0; i < context_.tensors_size; i++) {
-    TfLiteTensorFree(&context_.tensors[i]);
+    TfLiteTensor* tensor = &context_.tensors[i];
+    if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
+      tensor->delegate->FreeBufferHandle(tensor->delegate,
+                                         &tensor->buffer_handle);
+    }
+    TfLiteTensorFree(tensor);
   }
 }
 
 TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
     TfLiteContext* context, TfLiteRegistration registration,
-    const TfLiteIntArray* nodes_to_replace) {
+    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
   return static_cast<Interpreter*>(context->impl_)
-      ->ReplaceSubgraphsWithDelegateKernels(registration, nodes_to_replace);
+      ->ReplaceSubgraphsWithDelegateKernels(registration, nodes_to_replace,
+                                            delegate);
 }
 
+namespace {
+
+// Copy a std::vector<int> to an existing TfLiteIntArray.
+// This is a low-level data manipulation function, and it's caller's
+// responsibility to ensure TfLiteIntArray has enough size.
+void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
+                                TfLiteIntArray* arr) {
+  arr->size = vec.size();
+  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
+}
+
+// This function allocates a continuous memory space that contains a
+// TfLiteDelegateParams followed by a several TfLiteIntArray.
+// When calling `free` at TfLiteDelegateParams*, all the allocated space
+// will be freed together.
+//
+// +-----------------------------------+
+// | TfLiteDelegateParams              |
+// | TfLiteDelegate* delegate;         |
+// | TfLiteIntArray* nodes_to_replace; |--\
+// | TfLiteIntArray* input_tensors;    |--+--\
+// | TfLiteIntArray* output_tensors;   |--+--+--\
+// +-----------------------------------+  |  |  |
+// | TfLiteIntArray (variable size)    |<-/  |  |
+// +-----------------------------------+     |  |
+// | TfLiteIntArray (variable size)    |<----/  |
+// +-----------------------------------+        |
+// | TfLiteIntArray (variable size)    |<-------/
+// +-----------------------------------+
+TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
+                                           const Subgraph& subgraph) {
+  // Step 1: Calculate the allocation size.
+  int allocation_size = sizeof(TfLiteDelegateParams);
+
+  int nodes_to_replace_size =
+      TfLiteIntArrayGetSizeInBytes(subgraph.nodes.size());
+  allocation_size += nodes_to_replace_size;
+
+  int input_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(subgraph.input_tensors.size());
+  allocation_size += input_tensors_size;
+
+  int output_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(subgraph.output_tensors.size());
+  allocation_size += output_tensors_size;
+
+  // Step 2: Allocate the memory.
+  // Use `char*` for conveniently step through the allocated space by bytes.
+  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
+
+  // Step 3: Fill all data structures structures.
+  TfLiteDelegateParams* params =
+      reinterpret_cast<TfLiteDelegateParams*>(allocation);
+  params->delegate = delegate;
+  allocation += sizeof(TfLiteDelegateParams);
+
+  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(subgraph.nodes, params->nodes_to_replace);
+  allocation += nodes_to_replace_size;
+
+  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(subgraph.input_tensors, params->input_tensors);
+  allocation += input_tensors_size;
+
+  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(subgraph.output_tensors, params->output_tensors);
+  allocation += output_tensors_size;
+
+  return params;
+}
+
+}  // namespace
+
 TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
-    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace) {
+    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+    TfLiteDelegate* delegate) {
+  // Annotate the registration as DELEGATE op.
+  registration.builtin_code = BuiltinOperator_DELEGATE;
+
   // Analyze the graph to find all independent subgraphs that are either
   // fully not-this-delegate or this-delegate computation.
   InterpreterInfo info(this);
@@ -123,30 +228,34 @@ TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
 
   execution_plan_.clear();
   for (auto& subgraph : subgraphs) {
-    // Turn subgraph.nodes into a TfLiteIntArray compatible data structure.
-    // TODO(aselle): Avoid this copy by constructing subgraph.nodes that way
-    // in the first place
-    subgraph.nodes.insert(subgraph.nodes.begin(),
-                          static_cast<int>(subgraph.nodes.size()));
     // Subgraphs calimed by the delegate should have a "macro" op created, the
     // other subgraphs (kTfNonPartition) just have their nodes added back to
     // the execution plan.
     switch (subgraph.type) {
       case Subgraph::kTfNonPartition:
-        for (auto it = subgraph.nodes.begin() + 1; it != subgraph.nodes.end();
+        for (auto it = subgraph.nodes.begin(); it != subgraph.nodes.end();
              ++it) {
           execution_plan_.push_back(*it);
         }
         break;
       case Subgraph::kTfPartition: {
-        void* builtin_data = nullptr;
         int node_index;
-        // Create a node that represents computation of this subgraph.
-        AddNodeWithParameters(
-            subgraph.input_tensors, subgraph.output_tensors,
-            reinterpret_cast<const char*>(subgraph.nodes.data()),
-            subgraph.nodes.size() * sizeof(subgraph.nodes[0]), builtin_data,
-            &registration, &node_index);
+
+        TfLiteDelegateParams* params = CreateDelegateParams(delegate, subgraph);
+        AddNodeWithParameters(subgraph.input_tensors, subgraph.output_tensors,
+                              nullptr, 0, params, &registration, &node_index);
+
+        // Initialize the output tensors's delegate-related fields.
+        for (int tensor_index : subgraph.output_tensors) {
+          TfLiteTensor* tensor = &tensors_[tensor_index];
+          TF_LITE_ENSURE(&context_, tensor->delegate == nullptr ||
+                                        tensor->delegate == delegate);
+          tensor->delegate = delegate;
+        }
+
+        // Associate the node with the delegate.
+        TfLiteNode* node = &nodes_and_registration_[node_index].first;
+        node->delegate = delegate;
       } break;
       case Subgraph::kTfUnexplored:
         return kTfLiteError;
@@ -165,8 +274,8 @@ TfLiteStatus Interpreter::GetExecutionPlan(TfLiteIntArray** execution_plan) {
   *execution_plan = plan_cache_.get();
   static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
                 "TfLiteIntArray and execution_plan do not contain same type.");
-  memcpy(plan_cache_->data, execution_plan_.data(),
-         sizeof(plan_cache_->data[0]) * execution_plan_.size());
+  std::memcpy(plan_cache_->data, execution_plan_.data(),
+              sizeof(plan_cache_->data[0]) * execution_plan_.size());
   return kTfLiteOk;
 }
 
@@ -199,7 +308,12 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
 
   for (int i = 0; i < length; i++) {
     int index = indices[i];
-    if (index < kOptionalTensor || index >= context_.tensors_size) {
+    // Continue if index == kOptionalTensor before additional comparisons below,
+    // size_t(-1) is always >= context_tensors_size.
+    if (index == kOptionalTensor) {
+      continue;
+    }
+    if (index < 0 || static_cast<size_t>(index) >= context_.tensors_size) {
       ReportError(&context_, "Invalid tensor index %d in %s\n", index, label);
       consistent_ = false;
       return kTfLiteError;
@@ -209,7 +323,7 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
 }
 
 TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
-                                        int dims_size, size_t* bytes) {
+                                        size_t dims_size, size_t* bytes) {
   // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
   // MultiplyWithoutOverflow.
   TF_LITE_ENSURE(&context_, bytes != nullptr);
@@ -228,22 +342,18 @@ TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
     case kTfLiteInt64:
       *bytes = sizeof(int64_t) * count;
       break;
+    case kTfLiteBool:
+      *bytes = sizeof(bool) * count;
+      break;
     default:
-      ReportError(&context_,
-                  "Only float32, int32, int64, uint8 supported currently.");
+      ReportError(
+          &context_,
+          "Only float32, int32, int64, uint8, bool supported currently.");
       return kTfLiteError;
   }
   return kTfLiteOk;
 }
 
-namespace {
-TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) {
-  TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size());
-  for (size_t i = 0; i < x.size(); i++) lite->data[i] = x[i];
-  return lite;
-}
-}  // namespace
-
 TfLiteStatus Interpreter::AllocateTensors() {
   next_execution_plan_index_to_prepare_ = 0;
   if (memory_planner_) {
@@ -256,7 +366,11 @@ TfLiteStatus Interpreter::AllocateTensors() {
   }
 
   TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-  invokable_ = true;
+  if (state_ == kStateUninvokable) {
+    state_ = kStateInvokable;
+  }
+  TF_LITE_ENSURE(&context_, state_ == kStateInvokable ||
+                                state_ == kStateInvokableAndImmutable);
   return kTfLiteOk;
 }
 
@@ -264,7 +378,12 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
     const std::vector<int>& inputs, const std::vector<int>& outputs,
     const char* init_data, size_t init_data_size, void* builtin_data,
     const TfLiteRegistration* registration, int* node_index) {
-  invokable_ = false;
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(&context_,
+                "AddNodeWithParameters is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
 
   std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
                                                               free);
@@ -278,7 +397,6 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
   int new_node_index = nodes_and_registration_.size();
   if (node_index) *node_index = new_node_index;
   nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
-
   auto& node_and_reg = nodes_and_registration_.back();
   TfLiteNode& node = node_and_reg.first;
   if (node.inputs) TfLiteIntArrayFree(node.inputs);
@@ -288,8 +406,8 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
   // NOTE, here we are not using move semantics yet, since our internal
   // representation isn't std::vector, but in the future we would like to avoid
   // copies, so we want the interface to take r-value references now.
-  node.inputs = convertVectorToTfLiteIntArray(inputs);
-  node.outputs = convertVectorToTfLiteIntArray(outputs);
+  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
+  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
   node.temporaries = TfLiteIntArrayCreate(0);
   if (init_data) {
     node.user_data = OpInit(*registration, init_data, init_data_size);
@@ -298,7 +416,22 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
         OpInit(*registration,
                reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
   }
+
   node.builtin_data = builtin_data_deleter.release();
+  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
+  // properly for nodes generated by ReplaceSubgraphsWithDelegateKernels.
+
+  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
+    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
+    // `Operator` table is passed in.
+    node.custom_initial_data = init_data;
+    node.custom_initial_data_size = init_data_size;
+  } else {
+    node.custom_initial_data = nullptr;
+    node.custom_initial_data_size = 0;
+  }
+
+  node.delegate = nullptr;
   node_and_reg.second = *registration;
   execution_plan_.push_back(new_node_index);
   return kTfLiteOk;
@@ -306,13 +439,18 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
 
 TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
                                             const std::vector<int>& dims) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(&context_,
+                "ResizeInputTensor is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
+
   // TODO(aselle): All bounds checks can be implemented as one-sided bounds
   // checks by casting to unsigned for efficiency. Profile before doing this.
-
   TF_LITE_ENSURE(&context_,
                  tensor_index < context_.tensors_size && tensor_index >= 0);
-  invokable_ = false;
-  TfLiteIntArray* dims_lite = convertVectorToTfLiteIntArray(dims);
+  TfLiteIntArray* dims_lite = ConvertVectorToTfLiteIntArray(dims);
   return ResizeTensorImpl(&context_.tensors[tensor_index], dims_lite);
 }
 
@@ -336,6 +474,7 @@ TfLiteStatus Interpreter::PrepareOpsStartingAt(
     TfLiteNode& node = nodes_and_registration_[node_index].first;
     const TfLiteRegistration& registration =
         nodes_and_registration_[node_index].second;
+    EnsureTensorsVectorCapacity();
     if (OpPrepare(registration, &node) == kTfLiteError) {
       return kTfLiteError;
     }
@@ -375,7 +514,7 @@ TfLiteStatus Interpreter::Invoke() {
     ReportError(&context_, "Invoke called on model that is not consistent.");
     return kTfLiteError;
   }
-  if (!invokable_) {
+  if (state_ == kStateUninvokable) {
     ReportError(&context_, "Invoke called on model that is not ready.");
     return kTfLiteError;
   }
@@ -413,10 +552,36 @@ TfLiteStatus Interpreter::Invoke() {
     TfLiteNode& node = nodes_and_registration_[node_index].first;
     const TfLiteRegistration& registration =
         nodes_and_registration_[node_index].second;
+    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
+
+    // TODO(ycling): This is an extra loop through inputs to check if the data
+    // need to be copied from Delegate buffer to raw memory, which is often not
+    // needed. We may want to cache this in prepare to know if this needs to be
+    // done for a node or not.
+    for (int i = 0; i < node.inputs->size; ++i) {
+      int tensor_index = node.inputs->data[i];
+      if (tensor_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor* tensor = &tensors_[tensor_index];
+      if (tensor->delegate && tensor->delegate != node.delegate &&
+          tensor->data_is_stale) {
+        EnsureTensorDataIsReadable(tensor_index);
+      }
+    }
+
+    EnsureTensorsVectorCapacity();
     if (OpInvoke(registration, &node) == kTfLiteError) {
       status = kTfLiteError;
     }
   }
+
+  if (!allow_buffer_handle_output_) {
+    for (int tensor_index : outputs_) {
+      EnsureTensorDataIsReadable(tensor_index);
+    }
+  }
+
   return status;
 }
 
@@ -452,6 +617,7 @@ TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
   tensors_.resize(tensors_.size() + tensors_to_add);
   for (int i = base_index; i < tensors_.size(); i++) {
     memset(&tensors_[i], 0, sizeof(tensors_[i]));
+    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
   }
   context_.tensors = tensors_.data();
   context_.tensors_size = tensors_.size();
@@ -484,9 +650,16 @@ TfLiteStatus Interpreter::GetNodeAndRegistration(
 }
 
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
-    int tensor_index, TfLiteType type, const char* name,
-    const std::vector<int>& dims, TfLiteQuantizationParams quantization,
-    const char* buffer, size_t bytes, const Allocation* allocation) {
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    size_t bytes, const Allocation* allocation) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        &context_,
+        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
   TF_LITE_ENSURE(&context_,
                  tensor_index < context_.tensors_size && tensor_index >= 0);
   // For most tensors we know exactly how much memory is necessary so we can
@@ -494,14 +667,27 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
   // because their sizes change with the contents of the individual strings.
   if (type != kTfLiteString) {
     size_t required_bytes;
-    TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims.data(), dims.size(),
-                                               &required_bytes));
+    TF_LITE_ENSURE_OK(&context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
     TF_LITE_ENSURE_EQ(&context_, required_bytes, bytes);
   }
-  invokable_ = false;
-  TfLiteTensorReset(type, name, convertVectorToTfLiteIntArray(dims),
-                    quantization, const_cast<char*>(buffer), bytes,
-                    kTfLiteMmapRo, allocation, &context_.tensors[tensor_index]);
+
+  TfLiteTensor& tensor = context_.tensors[tensor_index];
+  if (type == tensor.type &&
+      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
+    // Fast path which does not invalidate the invokable property.
+    TfLiteTensorDataFree(&tensor);
+    tensor.data.raw = const_cast<char*>(buffer);
+    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
+    tensor.params = quantization;
+    tensor.allocation_type = kTfLiteMmapRo;
+    tensor.allocation = allocation;
+  } else {
+    state_ = kStateUninvokable;
+    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                      quantization, const_cast<char*>(buffer), bytes,
+                      kTfLiteMmapRo, allocation, &tensor);
+  }
   return kTfLiteOk;
 }
 
@@ -510,9 +696,14 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
 // bytes. The lifetime of buffer must be ensured to be greater or equal
 // to Interpreter.
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
-    int tensor_index, TfLiteType type, const char* name,
-    const std::vector<int>& dims, TfLiteQuantizationParams quantization) {
-  invokable_ = false;
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        &context_,
+        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
   TF_LITE_ENSURE(&context_,
                  tensor_index < context_.tensors_size && tensor_index >= 0);
   size_t required_bytes = 0;
@@ -521,10 +712,10 @@ TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     // many bytes we will need based on the dimensions. String tensors are
     // allocated dynamically and we can't know ahead of time how much space
     // they will require.
-    TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims.data(), dims.size(),
-                                               &required_bytes));
+    TF_LITE_ENSURE_OK(&context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
   }
-  TfLiteTensorReset(type, name, convertVectorToTfLiteIntArray(dims),
+  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
                     quantization,
                     /*buffer=*/nullptr, required_bytes,
                     type == kTfLiteString ? kTfLiteDynamic : kTfLiteArenaRw,
@@ -587,26 +778,95 @@ void Interpreter::UseNNAPI(bool enable) {
 }
 
 void Interpreter::SetNumThreads(int num_threads) {
-  // TODO(ahentz): this forces us to link against gemmlowp even when the ops
-  // don't use it. We should implement some dynamic mechanism for this sort of
-  // library-specific initialization.
-  tflite::gemm_support::SetMaxNumThreads(&context_, num_threads);
-}
+  context_.recommended_num_threads = num_threads;
+
+  // TODO(ahentz): find a way to avoid this. It causes gemmlowp and eigen to
+  // be required in order to compile the framework.
+  gemm_support::SetNumThreads(&context_, num_threads);
+  eigen_support::SetNumThreads(&context_, num_threads);
+}
+
+TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
+                                                  bool allow_dynamic_tensors) {
+  if (!allow_dynamic_tensors) {
+    int last_execution_plan_index_prepared;
+    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
+                                     0, &last_execution_plan_index_prepared));
+
+    bool has_dynamic_tensors = true;
+    // Dynamic tensors exist if not all nodes can be prepared.
+    if (last_execution_plan_index_prepared + 1 == execution_plan_.size()) {
+      // If all the nodes can be prepared, check if the last node has dynamic
+      // tensors.
+      int node_index = execution_plan_[last_execution_plan_index_prepared];
+      TfLiteNode& node = nodes_and_registration_[node_index].first;
+      if (!HasDynamicTensor(context_, node.outputs)) {
+        has_dynamic_tensors = false;
+      }
+    }
+    if (has_dynamic_tensors) {
+      ReportError(&context_, "Attempting to resize a fixed-size tensor.");
+      return kTfLiteError;
+    }
+  }
 
-TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
   // TODO(aselle): Consider if it is worth storing pointers to delegates.
-  // Setup additional context interface
+  // Setup additional context interface.
   context_.GetNodeAndRegistration = GetNodeAndRegistration;
   context_.ReplaceSubgraphsWithDelegateKernels =
       ReplaceSubgraphsWithDelegateKernels;
   context_.GetExecutionPlan = GetExecutionPlan;
 
-  TfLiteStatus status = delegate->Prepare(&context_, delegate->data_);
+  TfLiteStatus status = delegate->Prepare(&context_, delegate);
+
   // Remove additional context info.
-  context_.GetNodeAndRegistration = nullptr;
-  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
-  context_.GetExecutionPlan = nullptr;
+  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
+  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
+  SetForbiddenContextFunction(&context_.GetExecutionPlan);
+
+  TF_LITE_ENSURE_OK(&context_, status);
+
+  if (!allow_dynamic_tensors) {
+    TF_LITE_ENSURE_OK(&context_, AllocateTensors());
+    TF_LITE_ENSURE(&context_, state_ == kStateInvokable ||
+                                  state_ == kStateInvokableAndImmutable);
+    // After using a delegate which doesn't support dynamic tensors, make the
+    // entire graph immutable.
+    state_ = kStateInvokableAndImmutable;
+  }
+
   return status;
 }
 
+TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
+                                          TfLiteBufferHandle buffer_handle,
+                                          TfLiteDelegate* delegate) {
+  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
+  TfLiteTensor* tensor = &tensors_[tensor_index];
+
+  TF_LITE_ENSURE(&context_,
+                 tensor->delegate == nullptr || tensor->delegate == delegate);
+  tensor->delegate = delegate;
+  if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
+    TF_LITE_ENSURE(&context_, tensor->delegate->FreeBufferHandle != nullptr);
+    tensor->delegate->FreeBufferHandle(tensor->delegate,
+                                       &tensor->buffer_handle);
+  }
+  tensor->buffer_handle = buffer_handle;
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
+                                          TfLiteBufferHandle* buffer_handle,
+                                          TfLiteDelegate** delegate) {
+  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
+  TfLiteTensor* tensor = &tensors_[tensor_index];
+
+  *delegate = tensor->delegate;
+  *buffer_handle = tensor->buffer_handle;
+
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index bab56a9d72f8992a9d8af23f92133c7c918fd46d..6f3433abcf71b6090b434d47e925775a2e517064 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -20,10 +20,12 @@ limitations under the License.
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
+
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/memory_planner.h"
+#include "tensorflow/contrib/lite/profiling/profiler.h"
 
 namespace tflite {
 
@@ -48,6 +50,10 @@ template <>
 constexpr TfLiteType typeToTfLiteType<unsigned char>() {
   return kTfLiteUInt8;
 }
+template <>
+constexpr TfLiteType typeToTfLiteType<bool>() {
+  return kTfLiteBool;
+}
 
 // Forward declare since NNAPIDelegate uses Interpreter.
 class NNAPIDelegate;
@@ -133,18 +139,34 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
-  TfLiteStatus SetTensorParametersReadOnly(
+  inline TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes,
+      const Allocation* allocation = nullptr) {
+    return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(),
+                                       dims.data(), quantization, buffer, bytes,
+                                       allocation);
+  };
+
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization,
       const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
 
   // Set description of inputs/outputs/data/fptrs for node `node_index`.
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
-  TfLiteStatus SetTensorParametersReadWrite(
+  inline TfLiteStatus SetTensorParametersReadWrite(
       int tensor_index, TfLiteType type, const char* name,
-      const std::vector<int>& dims, TfLiteQuantizationParams quantization);
+      const std::vector<int>& dims, TfLiteQuantizationParams quantization) {
+    return SetTensorParametersReadWrite(tensor_index, type, name, dims.size(),
+                                        dims.data(), quantization);
+  }
+  TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization);
 
   // Functions to access tensor data
 
@@ -167,10 +189,10 @@ class Interpreter {
   }
 
   // Return the number of tensors in the model.
-  int tensors_size() const { return context_.tensors_size; }
+  size_t tensors_size() const { return context_.tensors_size; }
 
   // Return the number of ops in the model.
-  int nodes_size() const { return nodes_and_registration_.size(); }
+  size_t nodes_size() const { return nodes_and_registration_.size(); }
 
   // WARNING: Experimental interface, subject to change
   const std::vector<int>& execution_plan() const { return execution_plan_; }
@@ -192,7 +214,7 @@ class Interpreter {
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
-      int node_index) {
+      int node_index) const {
     if (node_index >= nodes_and_registration_.size() || node_index < 0)
       return nullptr;
     return &nodes_and_registration_[node_index];
@@ -256,7 +278,76 @@ class Interpreter {
   // Allow a delegate to look at the graph and modify the graph to handle
   // parts of the graph themselves. After this is called, the graph may
   // contain new nodes that replace 1 more nodes.
-  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate,
+                                       bool allow_dynamic_tensors = false);
+
+  // Ensure the data in `tensor.data` is readable. In case delegate is used,
+  // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
+    TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
+    TfLiteTensor* tensor = &tensors_[tensor_index];
+    if (tensor->data_is_stale) {
+      TF_LITE_ENSURE(&context_, tensor->delegate != nullptr);
+      TF_LITE_ENSURE(&context_,
+                     tensor->buffer_handle != kTfLiteNullBufferHandle);
+      // This can be null if the delegate doesn't use its own buffer.
+      TF_LITE_ENSURE(&context_,
+                     tensor->delegate->CopyFromBufferHandle != nullptr);
+      tensor->delegate->CopyFromBufferHandle(tensor->delegate,
+                                             tensor->buffer_handle,
+                                             tensor->data.raw, tensor->bytes);
+      tensor->data_is_stale = false;
+    }
+    return kTfLiteOk;
+  }
+
+  // Set the delegate buffer handle to a tensor. It can be called in the
+  // following cases:
+  // 1. Set the buffer handle to a tensor that's not being written by a
+  //    delegate. For example, feeding an OpenGL texture as the input of the
+  //    inference graph.
+  // 2. Set the buffer handle to a tensor that uses the same delegate.
+  //    For example, set an OpenGL texture as the output of inference, while
+  //    the node which produces output is an OpenGL delegate node.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus SetBufferHandle(int tensor_index,
+                               TfLiteBufferHandle buffer_handle,
+                               TfLiteDelegate* delegate);
+
+  // Get the delegate buffer handle, and the delegate which can process the
+  // buffer handle.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus GetBufferHandle(int tensor_index,
+                               TfLiteBufferHandle* buffer_handle,
+                               TfLiteDelegate** delegate);
+
+  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+
+  profiling::Profiler* GetProfiler(profiling::Profiler* profiler) {
+    return profiler_;
+  }
+
+  // The default capacity of `tensors_` vector.
+  static constexpr int kTensorsReservedCapacity = 128;
+  // The capacity headroom of `tensors_` vector before calling ops'
+  // `prepare` and `invoke` function. In these functions, it's guaranteed
+  // allocating up to `kTensorsCapacityHeadroom` more tensors won't invalidate
+  // pointers to existing tensors.
+  static constexpr int kTensorsCapacityHeadroom = 16;
+
+  // Set if buffer handle output is allowed.
+  //
+  // When using hardware delegation, Interpreter will make the data of output
+  // tensors available in `tensor->data` by default. If the application can
+  // consume the buffer handle directly (e.g. reading output from OpenGL
+  // texture), it can set this flag to false, so Interpreter won't copy the data
+  // from buffer handle to CPU memory.
+  // WARNING: This is an experimental API and subject to change.
+  void SetAllowBufferHandleOutput(bool allow_buffer_handle_output) {
+    allow_buffer_handle_output_ = allow_buffer_handle_output;
+  }
 
  private:
   // Give 'op_reg' a chance to initialize itself using the contents of
@@ -315,7 +406,7 @@ class Interpreter {
   // Compute the number of bytes required to represent a tensor with dimensions
   // specified by the array dims (of length dims_size). Returns the status code
   // and bytes.
-  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size,
+  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
                              size_t* bytes);
 
   // Request an tensor be resized implementation. If the given tensor is of
@@ -340,14 +431,15 @@ class Interpreter {
   // Entry point for C API ReplaceSubgraphsWithDelegateKernels
   static TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
       TfLiteContext* context, TfLiteRegistration registration,
-      const TfLiteIntArray* nodes_to_replace);
+      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
 
   // Update the execution graph to replace some of the nodes with stub
   // nodes. Specifically any node index that has `nodes[index]==1` will be
   // slated for replacement with a delegate kernel specified by registration.
   // WARNING: This is an experimental interface that is subject to change.
   TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
-      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace);
+      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+      TfLiteDelegate* delegate);
 
   // WARNING: This is an experimental interface that is subject to change.
   // Gets the internal pointer to a TensorFlow lite node by node_index.
@@ -370,6 +462,32 @@ class Interpreter {
   static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
                                        TfLiteIntArray** execution_plan);
 
+  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
+  // capacity. Calling this function may invalidate existing pointers to
+  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
+  // more tensors won't invalidate the pointer to existing tensors.
+  void EnsureTensorsVectorCapacity() {
+    const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom;
+    if (required_capacity > tensors_.capacity()) {
+      tensors_.reserve(required_capacity);
+      context_.tensors = tensors_.data();
+    }
+  }
+
+  // The state of the Interpreter.
+  enum State {
+    // The interpreter isn't ready to be invoked.
+    // `AllocateTensor` need to be called to enter an invokable state.
+    kStateUninvokable = 0,
+    // The interpreter is ready to be invoked.
+    kStateInvokable,
+    // The interpreter is ready to be invoked, and graph can't be further
+    // modified. The interpreter will enter this state when calling
+    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
+    kStateInvokableAndImmutable,
+  };
+  State state_ = kStateUninvokable;
+
   // A pure C data structure used to communicate with the pure C plugin
   // interface. To avoid copying tensor metadata, this is also the definitive
   // structure to store tensors.
@@ -385,10 +503,6 @@ class Interpreter {
   // the tensor array.
   bool consistent_ = true;
 
-  // Whether the model is safe to invoke (if any errors occurred this
-  // will be false).
-  bool invokable_ = false;
-
   // Array of indices representing the tensors that are inputs to the
   // interpreter.
   std::vector<int> inputs_;
@@ -404,7 +518,7 @@ class Interpreter {
   // During Invoke(), Interpreter will allocate input tensors first, which are
   // known to be fixed size. Then it will allocate outputs from nodes as many
   // as possible. When there is a node that produces dynamic sized tensor.
-  // Intepreter will stop allocating tensors, set the value of next allocate
+  // Interpreter will stop allocating tensors, set the value of next allocate
   // node id, and execute the node to generate the output tensor before continue
   // to allocate successors. This process repeats until all nodes are executed.
   // NOTE: this relies on the order of nodes that is in topological order.
@@ -425,6 +539,11 @@ class Interpreter {
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
 
   std::unique_ptr<MemoryPlanner> memory_planner_;
+
+  bool allow_buffer_handle_output_ = false;
+
+  // Profiler for this interpreter instance.
+  profiling::Profiler* profiler_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 28c96e5dde6ffa62bb073db9716a00f91c6e0bdf..453c1ada1cf6263be14a3b170f209e3a30580cc3 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -17,9 +17,11 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/testing/util.h"
+
 namespace tflite {
 namespace {
 
@@ -40,7 +42,7 @@ TEST(BasicInterpreter, InvokeInvalidModel) {
   ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
 }
 
-// Test size accesser functions.
+// Test size accessor functions.
 TEST(BasicInterpreter, TestSizeFunctions) {
   Interpreter interpreter;
   int base_index;
@@ -439,12 +441,12 @@ TEST(BasicInterpreter, ThreeStepAllocate) {
   // String-in String-out node.
   TfLiteRegistration reg_copy = {nullptr, nullptr, nullptr, nullptr};
   reg_copy.invoke = [](TfLiteContext* context, TfLiteNode* node) {
-    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
-    TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+    TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
     DynamicBuffer buf;
-    StringRef str_ref = GetString(a0, 0);
+    StringRef str_ref = GetString(input, 0);
     buf.AddString(str_ref);
-    buf.WriteToTensor(a1);
+    buf.WriteToTensor(output);
     return kTfLiteOk;
   };
 
@@ -561,6 +563,86 @@ TEST(BasicInterpreter, TestCustomErrorReporter) {
   ASSERT_EQ(reporter.calls, 1);
 }
 
+TEST(BasicInterpreter, TestUnsupportedDelegateFunctions) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+  TfLiteRegistration registration = {
+      .init = nullptr, .free = nullptr, .prepare = nullptr, .invoke = nullptr};
+  // These functions are only supported inside Delegate's Prepare function.
+  // The test verifies that these functions returns `kTfLiteError`, but not
+  // `kTfLiteOk` or just crashes.
+  registration.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    {
+      TfLiteIntArray* execution_plan;
+      EXPECT_EQ(context->GetExecutionPlan(context, &execution_plan),
+                kTfLiteError);
+    }
+    {
+      TfLiteNode* node;
+      TfLiteRegistration* registration;
+      EXPECT_EQ(
+          context->GetNodeAndRegistration(context, 0, &node, &registration),
+          kTfLiteError);
+    }
+    {
+      TfLiteRegistration delegate_registration = {nullptr, nullptr, nullptr,
+                                                  nullptr};
+      TfLiteIntArray nodes_to_replace;
+      nodes_to_replace.size = 0;
+      EXPECT_EQ(context->ReplaceSubgraphsWithDelegateKernels(
+                    context, delegate_registration, &nodes_to_replace, nullptr),
+                kTfLiteError);
+    }
+    return kTfLiteError;
+  };
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                              &registration),
+            kTfLiteOk);
+  EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteError);
+}
+
+TEST(InterpreterTensorsCapacityTest, TestWithinHeadroom) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(Interpreter::kTensorsReservedCapacity),
+            kTfLiteOk);
+  TfLiteRegistration registration = {nullptr, nullptr, nullptr, nullptr};
+  registration.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* first_tensor = context->tensors;
+
+    int new_tensor_index;
+    context->AddTensors(context, Interpreter::kTensorsCapacityHeadroom,
+                        &new_tensor_index);
+    EXPECT_EQ(first_tensor, context->tensors);
+    return kTfLiteOk;
+  };
+  ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                              &registration),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+}
+
+TEST(InterpreterTensorsCapacityTest, TestExceedHeadroom) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(Interpreter::kTensorsReservedCapacity),
+            kTfLiteOk);
+  TfLiteRegistration registration = {nullptr, nullptr, nullptr, nullptr};
+  registration.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* first_tensor = context->tensors;
+
+    int new_tensor_index;
+    context->AddTensors(context, Interpreter::kTensorsCapacityHeadroom + 1,
+                        &new_tensor_index);
+    EXPECT_NE(first_tensor, context->tensors);
+    return kTfLiteOk;
+  };
+  ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                              &registration),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+}
+
 // Test fixture that allows playing with execution plans. It creates a two
 // node graph that can be executed in either [0,1] order or [1,0] order.
 // The CopyOp records when it is invoked in the class member run_order_
@@ -698,13 +780,17 @@ TfLiteRegistration AddOpRegistration() {
 
   reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
     // Set output size to input size
-    TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
-    TfLiteTensor* tensor1 = &context->tensors[node->inputs->data[1]];
-    TfLiteTensor* tensor2 = &context->tensors[node->outputs->data[0]];
-    TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
-    TfLiteIntArray* newSizeOther = TfLiteIntArrayCopy(tensor1->dims);
-    TF_LITE_ENSURE_EQ(context, newSize->size, newSizeOther->size);
-    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, tensor2, newSize));
+    TfLiteTensor* input1 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* input2 = &context->tensors[node->inputs->data[1]];
+    TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+
+    TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size);
+    for (int i = 0; i < input1->dims->size; ++i) {
+      TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]);
+    }
+
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(
+        context, output, TfLiteIntArrayCopy(input1->dims)));
     return kTfLiteOk;
   };
 
@@ -723,26 +809,40 @@ TfLiteRegistration AddOpRegistration() {
 }
 
 class TestDelegate : public ::testing::Test {
- public:
-  TestDelegate() {
-    interpreter_.AddTensors(5);
-    interpreter_.SetInputs({0, 1});
-    interpreter_.SetOutputs({3, 4});
+ protected:
+  void SetUp() override {
+    interpreter_.reset(new Interpreter);
+    interpreter_->AddTensors(5);
+    interpreter_->SetInputs({0, 1});
+    interpreter_->SetOutputs({3, 4});
     TfLiteQuantizationParams quant;
-    interpreter_.SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
-                                              quant);
-    interpreter_.SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
-                                              quant);
-    interpreter_.SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3},
-                                              quant);
-    interpreter_.SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3},
-                                              quant);
+    interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
+                                               quant);
+    interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
+                                               quant);
+    interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3},
+                                               quant);
+    interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3},
+                                               quant);
+    interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3},
+                                               quant);
     TfLiteRegistration reg = AddOpRegistration();
-    interpreter_.AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, &reg);
-    interpreter_.AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, &reg);
-    interpreter_.AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, &reg);
+    interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, &reg);
+    interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, &reg);
+    interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, &reg);
+  }
+
+  void TearDown() override {
+    // Interpreter relies on delegate_ to free the resources properly. Thus
+    // the life cycle of delegate must be longer than interpreter.
+    interpreter_.reset();
+    delegate_.reset();
   }
 
+  TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle;
+
+  TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; }
+
  protected:
   class SimpleDelegate {
    public:
@@ -751,8 +851,8 @@ class TestDelegate : public ::testing::Test {
     // value-copyable and compatible with TfLite.
     explicit SimpleDelegate(const std::vector<int>& nodes) : nodes_(nodes) {
       delegate_.Prepare = [](TfLiteContext* context,
-                             void* data) -> TfLiteStatus {
-        auto* simple = reinterpret_cast<SimpleDelegate*>(data);
+                             TfLiteDelegate* delegate) -> TfLiteStatus {
+        auto* simple = reinterpret_cast<SimpleDelegate*>(delegate->data_);
         TfLiteIntArray* nodes_to_separate =
             TfLiteIntArrayCreate(simple->nodes_.size());
         // Mark nodes that we want in TfLiteIntArray* structure.
@@ -783,10 +883,26 @@ class TestDelegate : public ::testing::Test {
         }
 
         context->ReplaceSubgraphsWithDelegateKernels(
-            context, FakeFusedRegistration(), nodes_to_separate);
+            context, FakeFusedRegistration(), nodes_to_separate, delegate);
         TfLiteIntArrayFree(nodes_to_separate);
         return kTfLiteOk;
       };
+      delegate_.CopyToBufferHandle =
+          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
+             void* data, size_t size) -> TfLiteStatus {
+        // TODO(ycling): Implement tests to test buffer copying logic.
+        return kTfLiteOk;
+      };
+      delegate_.CopyFromBufferHandle =
+          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
+             void* data, size_t size) -> TfLiteStatus {
+        // TODO(ycling): Implement tests to test buffer copying logic.
+        return kTfLiteOk;
+      };
+      delegate_.FreeBufferHandle = [](TfLiteDelegate* delegate,
+                                      TfLiteBufferHandle* handle) {
+        *handle = kTfLiteNullBufferHandle;
+      };
       // Store type-punned data SimpleDelegate structure.
       delegate_.data_ = reinterpret_cast<void*>(this);
     }
@@ -803,36 +919,196 @@ class TestDelegate : public ::testing::Test {
     std::vector<int> nodes_;
     TfLiteDelegate delegate_;
   };
-  Interpreter interpreter_;
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SimpleDelegate> delegate_;
 };
 
 TEST_F(TestDelegate, BasicDelegate) {
-  interpreter_.Invoke();
-  SimpleDelegate simple({0, 1, 2});
-  interpreter_.ModifyGraphWithDelegate(simple.get_tf_lite_delegate());
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate());
 
-  ASSERT_EQ(interpreter_.execution_plan().size(), 1);
-  int node = interpreter_.execution_plan()[0];
-  const auto* node_and_reg = interpreter_.node_and_registration(node);
-  ASSERT_EQ(node_and_reg->second.custom_name,
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+  int node = interpreter_->execution_plan()[0];
+  const auto* node_and_reg = interpreter_->node_and_registration(node);
+  EXPECT_EQ(node_and_reg->second.custom_name,
             SimpleDelegate::FakeFusedRegistration().custom_name);
+
+  const TfLiteDelegateParams* params =
+      reinterpret_cast<const TfLiteDelegateParams*>(
+          node_and_reg->first.builtin_data);
+  ASSERT_EQ(params->nodes_to_replace->size, 3);
+  EXPECT_EQ(params->nodes_to_replace->data[0], 0);
+  EXPECT_EQ(params->nodes_to_replace->data[1], 1);
+  EXPECT_EQ(params->nodes_to_replace->data[2], 2);
+
+  ASSERT_EQ(params->input_tensors->size, 2);
+  EXPECT_EQ(params->input_tensors->data[0], 0);
+  EXPECT_EQ(params->input_tensors->data[1], 1);
+
+  ASSERT_EQ(params->output_tensors->size, 2);
+  EXPECT_EQ(params->output_tensors->data[0], 3);
+  EXPECT_EQ(params->output_tensors->data[1], 4);
 }
 
 TEST_F(TestDelegate, ComplexDeligate) {
-  interpreter_.Invoke();
-  SimpleDelegate simple({1, 2});
-  interpreter_.ModifyGraphWithDelegate(simple.get_tf_lite_delegate());
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({1, 2}));
+  interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate());
 
-  ASSERT_EQ(interpreter_.execution_plan().size(), 2);
+  ASSERT_EQ(interpreter_->execution_plan().size(), 2);
   // 0th should be a non-delegated original op
-  ASSERT_EQ(interpreter_.execution_plan()[0], 0);
+  ASSERT_EQ(interpreter_->execution_plan()[0], 0);
   // 1st should be a new macro op (3) which didn't exist)
-  ASSERT_EQ(interpreter_.execution_plan()[1], 3);
-  const auto* node_and_reg = interpreter_.node_and_registration(3);
+  ASSERT_EQ(interpreter_->execution_plan()[1], 3);
+  const auto* node_and_reg = interpreter_->node_and_registration(3);
   ASSERT_EQ(node_and_reg->second.custom_name,
             SimpleDelegate::FakeFusedRegistration().custom_name);
 }
 
+TEST_F(TestDelegate, SetBufferHandleToInput) {
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
+  interpreter_->ModifyGraphWithDelegate(delegate);
+
+  constexpr int kOutputTensorIndex = 0;
+  TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex);
+  ASSERT_EQ(tensor->delegate, nullptr);
+  ASSERT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle);
+
+  TfLiteBufferHandle handle = AllocateBufferHandle();
+  TfLiteStatus status =
+      interpreter_->SetBufferHandle(kOutputTensorIndex, handle, delegate);
+  ASSERT_EQ(status, kTfLiteOk);
+  EXPECT_EQ(tensor->delegate, delegate);
+  EXPECT_EQ(tensor->buffer_handle, handle);
+}
+
+TEST_F(TestDelegate, SetBufferHandleToOutput) {
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
+  interpreter_->ModifyGraphWithDelegate(delegate);
+
+  constexpr int kOutputTensorIndex = 3;
+  TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex);
+  // Before setting the buffer handle, the tensor's `delegate` is already set
+  // because it will be written by the delegate.
+  ASSERT_EQ(tensor->delegate, delegate);
+  ASSERT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle);
+
+  TfLiteBufferHandle handle = AllocateBufferHandle();
+  TfLiteStatus status =
+      interpreter_->SetBufferHandle(kOutputTensorIndex, handle, delegate);
+  ASSERT_EQ(status, kTfLiteOk);
+  EXPECT_EQ(tensor->delegate, delegate);
+  EXPECT_EQ(tensor->buffer_handle, handle);
+}
+
+TEST_F(TestDelegate, SetInvalidHandleToTensor) {
+  interpreter_->Invoke();
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
+  interpreter_->ModifyGraphWithDelegate(delegate, true);
+
+  SimpleDelegate another_simple_delegate({0, 1, 2});
+
+  constexpr int kOutputTensorIndex = 3;
+  TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex);
+  // Before setting the buffer handle, the tensor's `delegate` is already set
+  // because it will be written by the delegate.
+  ASSERT_EQ(tensor->delegate, delegate);
+  ASSERT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle);
+
+  TfLiteBufferHandle handle = AllocateBufferHandle();
+  TfLiteStatus status = interpreter_->SetBufferHandle(
+      kOutputTensorIndex, handle,
+      another_simple_delegate.get_tf_lite_delegate());
+  // Setting a buffer handle to a tensor with another delegate will fail.
+  ASSERT_EQ(status, kTfLiteError);
+  EXPECT_EQ(tensor->delegate, delegate);
+  EXPECT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle);
+}
+
+TEST_F(TestDelegate, ResizeInputWithNonDynamicDelegateShouldFail) {
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  ASSERT_EQ(interpreter_->ResizeInputTensor(0, {1, 2}), kTfLiteOk);
+  ASSERT_EQ(interpreter_->ResizeInputTensor(1, {1, 2}), kTfLiteOk);
+  ASSERT_EQ(
+      interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter_->ResizeInputTensor(0, {1, 2}), kTfLiteError);
+}
+
+class TestDelegateWithDynamicTensors : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    interpreter_.reset(new Interpreter);
+
+    interpreter_->AddTensors(2);
+    interpreter_->SetInputs({0});
+    interpreter_->SetOutputs({1});
+    TfLiteQuantizationParams quant;
+    interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
+                                               quant);
+    interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
+                                               quant);
+    TfLiteRegistration reg = DynamicCopyOpRegistration();
+    interpreter_->AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr, &reg);
+
+    delegate_.Prepare = [](TfLiteContext* context,
+                           TfLiteDelegate* delegate) -> TfLiteStatus {
+      // In this test, the delegate replaces all the nodes if this function is
+      // called.
+      TfLiteIntArray* execution_plan;
+      TF_LITE_ENSURE_STATUS(
+          context->GetExecutionPlan(context, &execution_plan));
+      context->ReplaceSubgraphsWithDelegateKernels(
+          context, DelegateRegistration(), execution_plan, delegate);
+      return kTfLiteOk;
+    };
+  }
+
+  static TfLiteRegistration DynamicCopyOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+      SetTensorToDynamic(output);
+      return kTfLiteOk;
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      // Not implemented since this isn't required in testing.
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  static TfLiteRegistration DelegateRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+    return reg;
+  }
+
+  std::unique_ptr<Interpreter> interpreter_;
+  TfLiteDelegate delegate_;
+};
+
+TEST_F(TestDelegateWithDynamicTensors, DisallowDynamicTensors) {
+  interpreter_->ModifyGraphWithDelegate(&delegate_, false);
+
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+  // The interpreter should not call delegate's `Prepare` when dynamic tensors
+  // exist. So the node ID isn't changed.
+  ASSERT_EQ(interpreter_->execution_plan()[0], 0);
+}
+
+TEST_F(TestDelegateWithDynamicTensors, AllowDynamicTensors) {
+  interpreter_->ModifyGraphWithDelegate(&delegate_, true);
+
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+  // The node should be replaced because dynamic tensors are allowed. Therefore
+  // only node ID in the execution plan is changed from 0 to 1.
+  ASSERT_EQ(interpreter_->execution_plan()[0], 1);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/ios_makefile.inc
index fc6594c3a04ba6aabba99bb631f85737baf389f1..079320586ffd01fc77818a81e0c5962f1d28c1f1 100644
--- a/tensorflow/contrib/lite/ios_makefile.inc
+++ b/tensorflow/contrib/lite/ios_makefile.inc
@@ -31,9 +31,6 @@ ifeq ($(TARGET), IOS)
 		${IPHONEOS_SYSROOT} \
 		-arch $(IOS_ARCH) \
 		-O3
-	ifeq ($(IOS_ARCH), x86_64)
-		CXXFLAGS += -msse4.1
-	endif
 	CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-fembed-bitcode \
 		-mno-thumb \
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 35aacb70002d1d454f675484e4398bcdffc4acf1..1dda55b8edf8f85293c473b51b8a19066bac5f73 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -29,7 +29,7 @@ android_library(
     visibility = ["//visibility:public"],
     deps = [
         ":tflite_runtime",
-        "@javax_validation",
+        "@org_checkerframework_qual",
     ],
 )
 
@@ -42,7 +42,24 @@ android_library(
     ),
     visibility = ["//visibility:public"],
     deps = [
-        "@javax_validation",
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_library(
+    name = "ovicbenchmarkerlib",
+    srcs = [
+        "ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+    ],
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlite_java",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
     ],
 )
 
@@ -58,7 +75,7 @@ java_library(
     deps = [
         ":libtensorflowlite_jni.so",
         "//tensorflow/contrib/lite/java/src/main/native",
-        "@javax_validation",
+        "@org_checkerframework_qual",
     ],
 )
 
@@ -100,6 +117,7 @@ java_test(
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
         "src/testdata/uint8.bin",
+        "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
@@ -147,6 +165,28 @@ java_test(
     ],
 )
 
+java_test(
+    name = "OvicClassifierTest",
+    size = "medium",
+    srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+    data = [
+        "ovic/src/testdata/float_model.lite",
+        "ovic/src/testdata/labels.txt",
+        "ovic/src/testdata/low_res_model.lite",
+        "ovic/src/testdata/quantized_model.lite",
+        "ovic/src/testdata/test_image_128.jpg",
+        "ovic/src/testdata/test_image_224.jpg",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.ovic.OvicClassifierTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":ovicbenchmarkerlib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 filegroup(
     name = "libtensorflowlite_jni",
     srcs = select({
@@ -167,15 +207,3 @@ tflite_jni_binary(
         "//tensorflow/contrib/lite/java/src/main/native",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
index 654fa9d6d2799fc3cafa3e0e042cb2a5746bf2c5..d6fbef9cc938993b283103984307ab51e609dd6e 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
@@ -6,7 +6,7 @@ android_binary(
     name = "TfLiteCameraDemo",
     srcs = glob(["java/**/*.java"]),
     assets = [
-        "@tflite_mobilenet//:labels.txt",
+        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
         "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
     ],
     assets_dir = "",
@@ -27,15 +27,3 @@ android_binary(
         "@androidsdk//com.android.support:support-v4-25.2.0",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
index dd0cd6c98ff878e9c41875cab74c12191cadb173..ce68160b68efd446c1dfa4c70c37aaa4048e4f2f 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
@@ -10,15 +10,3 @@ exports_files(
         ],
     ),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index d2048b41b1e76fe42c919c9b889df5be8a94957f..4f5662bc2d15f1bf6bfec0b9ec79b09f9e124186 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -47,6 +47,8 @@ import android.os.HandlerThread;
 import android.support.annotation.NonNull;
 import android.support.v13.app.FragmentCompat;
 import android.support.v4.content.ContextCompat;
+import android.text.SpannableString;
+import android.text.SpannableStringBuilder;
 import android.util.Log;
 import android.util.Size;
 import android.view.LayoutInflater;
@@ -54,6 +56,9 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
+import android.widget.CompoundButton;
+import android.widget.NumberPicker;
+import android.widget.ToggleButton;
 import android.widget.TextView;
 import android.widget.Toast;
 import java.io.IOException;
@@ -82,6 +87,8 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
+  private ToggleButton toggle;
+  private NumberPicker np;
   private ImageClassifier classifier;
 
   /** Max preview width that is guaranteed by Camera2 API */
@@ -202,14 +209,21 @@ public class Camera2BasicFragment extends Fragment
    *
    * @param text The message to show
    */
-  private void showToast(final String text) {
+  private void showToast(String s) {
+    SpannableStringBuilder builder = new SpannableStringBuilder();
+    SpannableString str1 = new SpannableString(s);
+    builder.append(str1);
+    showToast(builder);
+  }
+
+  private void showToast(SpannableStringBuilder builder) {
     final Activity activity = getActivity();
     if (activity != null) {
       activity.runOnUiThread(
           new Runnable() {
             @Override
             public void run() {
-              textView.setText(text);
+              textView.setText(builder, TextView.BufferType.SPANNABLE);
             }
           });
     }
@@ -289,6 +303,24 @@ public class Camera2BasicFragment extends Fragment
   public void onViewCreated(final View view, Bundle savedInstanceState) {
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
+    toggle = (ToggleButton) view.findViewById(R.id.button);
+
+    toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+      public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
+        classifier.setUseNNAPI(isChecked);
+      }
+    });
+
+    np = (NumberPicker) view.findViewById(R.id.np);
+    np.setMinValue(1);
+    np.setMaxValue(10);
+    np.setWrapSelectorWheel(true);
+    np.setOnValueChangedListener(new NumberPicker.OnValueChangeListener() {
+      @Override
+      public void onValueChange(NumberPicker picker, int oldVal, int newVal){
+        classifier.setNumThreads(newVal);
+      }
+    });
   }
 
   /** Load the model and labels. */
@@ -299,7 +331,7 @@ public class Camera2BasicFragment extends Fragment
       // create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
       classifier = new ImageClassifierQuantizedMobileNet(getActivity());
     } catch (IOException e) {
-      Log.e(TAG, "Failed to initialize an image classifier.");
+      Log.e(TAG, "Failed to initialize an image classifier.", e);
     }
     startBackgroundThread();
   }
@@ -433,7 +465,7 @@ public class Camera2BasicFragment extends Fragment
         return;
       }
     } catch (CameraAccessException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Failed to access Camera", e);
     } catch (NullPointerException e) {
       // Currently an NPE is thrown when the Camera2API is used but not supported on the
       // device this code runs.
@@ -478,7 +510,7 @@ public class Camera2BasicFragment extends Fragment
       }
       manager.openCamera(cameraId, stateCallback, backgroundHandler);
     } catch (CameraAccessException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Failed to open Camera", e);
     } catch (InterruptedException e) {
       throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
     }
@@ -545,7 +577,7 @@ public class Camera2BasicFragment extends Fragment
         runClassifier = false;
       }
     } catch (InterruptedException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Interrupted when stopping background thread", e);
     }
   }
 
@@ -604,7 +636,7 @@ public class Camera2BasicFragment extends Fragment
                 captureSession.setRepeatingRequest(
                     previewRequest, captureCallback, backgroundHandler);
               } catch (CameraAccessException e) {
-                e.printStackTrace();
+                Log.e(TAG, "Failed to set up config to capture Camera", e);
               }
             }
 
@@ -615,7 +647,7 @@ public class Camera2BasicFragment extends Fragment
           },
           null);
     } catch (CameraAccessException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Failed to preview Camera", e);
     }
   }
 
@@ -659,9 +691,9 @@ public class Camera2BasicFragment extends Fragment
       showToast("Uninitialized Classifier or invalid context.");
       return;
     }
-    Bitmap bitmap =
-        textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY());
-    String textToShow = classifier.classifyFrame(bitmap);
+    SpannableStringBuilder textToShow = new SpannableStringBuilder();
+    Bitmap bitmap = textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY());
+    classifier.classifyFrame(bitmap, textToShow);
     bitmap.recycle();
     showToast(textToShow);
   }
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index c319bff9f11546ac4d49c6b34c5ecdbc41547d58..7bb6afd9d8b77159bb180fad6bbe43ca454f9d14 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -19,10 +19,11 @@ import android.app.Activity;
 import android.content.res.AssetFileDescriptor;
 import android.graphics.Bitmap;
 import android.os.SystemClock;
+import android.text.SpannableString;
+import android.text.SpannableStringBuilder;
+import android.text.style.ForegroundColorSpan;
+import android.text.style.RelativeSizeSpan;
 import android.util.Log;
-
-import org.tensorflow.lite.Interpreter;
-
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -37,11 +38,15 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;
+import org.tensorflow.lite.Interpreter;
 
 /**
  * Classifies images with Tensorflow Lite.
  */
 public abstract class ImageClassifier {
+  // Display preferences
+  private static final float GOOD_PROB_THRESHOLD = 0.3f;
+  private static final int SMALL_COLOR = 0xffddaa88;
 
   /** Tag for the {@link Log}. */
   private static final String TAG = "TfLiteCameraDemo";
@@ -88,18 +93,23 @@ public abstract class ImageClassifier {
     labelList = loadLabelList(activity);
     imgData =
         ByteBuffer.allocateDirect(
-                DIM_BATCH_SIZE * getImageSizeX() * getImageSizeY() * DIM_PIXEL_SIZE *
-                        getNumBytesPerChannel());
+            DIM_BATCH_SIZE
+                * getImageSizeX()
+                * getImageSizeY()
+                * DIM_PIXEL_SIZE
+                * getNumBytesPerChannel());
     imgData.order(ByteOrder.nativeOrder());
     filterLabelProbArray = new float[FILTER_STAGES][getNumLabels()];
     Log.d(TAG, "Created a Tensorflow Lite Image Classifier.");
   }
 
   /** Classifies a frame from the preview stream. */
-  String classifyFrame(Bitmap bitmap) {
+  void classifyFrame(Bitmap bitmap, SpannableStringBuilder builder) {
+    printTopKLabels(builder);
+
     if (tflite == null) {
       Log.e(TAG, "Image classifier has not been initialized; Skipped.");
-      return "Uninitialized Classifier.";
+      builder.append(new SpannableString("Uninitialized Classifier."));
     }
     convertBitmapToByteBuffer(bitmap);
     // Here's where the magic happens!!!
@@ -112,9 +122,10 @@ public abstract class ImageClassifier {
     applyFilter();
 
     // Print the results.
-    String textToShow = printTopKLabels();
-    textToShow = Long.toString(endTime - startTime) + "ms" + textToShow;
-    return textToShow;
+    long duration = endTime - startTime;
+    SpannableString span = new SpannableString(duration + " ms");
+    span.setSpan(new ForegroundColorSpan(android.graphics.Color.LTGRAY), 0, span.length(), 0);
+    builder.append(span);
   }
 
   void applyFilter() {
@@ -139,6 +150,16 @@ public abstract class ImageClassifier {
     }
   }
 
+  public void setUseNNAPI(Boolean nnapi) {
+    if (tflite != null)
+        tflite.setUseNNAPI(nnapi);
+  }
+
+  public void setNumThreads(int num_threads) {
+    if (tflite != null)
+        tflite.setNumThreads(num_threads);
+  }
+
   /** Closes tflite to release resources. */
   public void close() {
     tflite.close();
@@ -189,7 +210,7 @@ public abstract class ImageClassifier {
   }
 
   /** Prints top-K labels, to be shown in UI as the results. */
-  private String printTopKLabels() {
+  private void printTopKLabels(SpannableStringBuilder builder) {
     for (int i = 0; i < getNumLabels(); ++i) {
       sortedLabels.add(
           new AbstractMap.SimpleEntry<>(labelList.get(i), getNormalizedProbability(i)));
@@ -197,55 +218,75 @@ public abstract class ImageClassifier {
         sortedLabels.poll();
       }
     }
-    String textToShow = "";
+
     final int size = sortedLabels.size();
-    for (int i = 0; i < size; ++i) {
+    for (int i = 0; i < size; i++) {
       Map.Entry<String, Float> label = sortedLabels.poll();
-      textToShow = String.format("\n%s: %4.2f", label.getKey(), label.getValue()) + textToShow;
+      SpannableString span =
+          new SpannableString(String.format("%s: %4.2f\n", label.getKey(), label.getValue()));
+      int color;
+      // Make it white when probability larger than threshold.
+      if (label.getValue() > GOOD_PROB_THRESHOLD) {
+        color = android.graphics.Color.WHITE;
+      } else {
+        color = SMALL_COLOR;
+      }
+      // Make first item bigger.
+      if (i == size - 1) {
+        float sizeScale = (i == size - 1) ? 1.25f : 0.8f;
+        span.setSpan(new RelativeSizeSpan(sizeScale), 0, span.length(), 0);
+      }
+      span.setSpan(new ForegroundColorSpan(color), 0, span.length(), 0);
+      builder.insert(0, span);
     }
-    return textToShow;
   }
 
   /**
    * Get the name of the model file stored in Assets.
+   *
    * @return
    */
   protected abstract String getModelPath();
 
   /**
    * Get the name of the label file stored in Assets.
+   *
    * @return
    */
   protected abstract String getLabelPath();
 
   /**
    * Get the image size along the x axis.
+   *
    * @return
    */
   protected abstract int getImageSizeX();
 
   /**
    * Get the image size along the y axis.
+   *
    * @return
    */
   protected abstract int getImageSizeY();
 
   /**
    * Get the number of bytes that is used to store a single color channel value.
+   *
    * @return
    */
   protected abstract int getNumBytesPerChannel();
 
   /**
    * Add pixelValue to byteBuffer.
+   *
    * @param pixelValue
    */
   protected abstract void addPixelValue(int pixelValue);
 
   /**
-   * Read the probability value for the specified label
-   * This is either the original value as it was read from the net's output or the updated value
-   * after the filter was applied.
+   * Read the probability value for the specified label This is either the original value as it was
+   * read from the net's output or the updated value after the filter was applied.
+   *
    * @param labelIndex
    * @return
    */
@@ -253,29 +294,32 @@ public abstract class ImageClassifier {
 
   /**
    * Set the probability value for the specified label.
+   *
    * @param labelIndex
    * @param value
    */
   protected abstract void setProbability(int labelIndex, Number value);
 
   /**
-   * Get the normalized probability value for the specified label.
-   * This is the final value as it will be shown to the user.
+   * Get the normalized probability value for the specified label. This is the final value as it
+   * will be shown to the user.
+   *
    * @return
    */
   protected abstract float getNormalizedProbability(int labelIndex);
 
   /**
-   * Run inference using the prepared input in {@link #imgData}.
-   * Afterwards, the result will be provided by getProbability().
+   * Run inference using the prepared input in {@link #imgData}. Afterwards, the result will be
+   * provided by getProbability().
    *
-   * This additional method is necessary, because we don't have a common base for different
+   * <p>This additional method is necessary, because we don't have a common base for different
    * primitive data types.
    */
   protected abstract void runInference();
 
   /**
    * Get the total number of labels.
+   *
    * @return
    */
   protected int getNumLabels() {
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index 156c895146940adfe71f111be6e354e02b75ea48..e164ac75543ebab12e6b1c057c4ed487eb9accdf 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -16,7 +16,6 @@ limitations under the License.
 package com.example.android.tflitecamerademo;
 
 import android.app.Activity;
-
 import java.io.IOException;
 
 /**
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png
index c22509d8dfccae14d9470e3042a9ed5b469ca2c9..52cf2ab95296d675dd42533bb9136707adebd98c 100644
Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png
index d68af39186ca9cd2bc755cad8397467a11844a1d..b75f892c462a12cae4f09851d019db23b286f843 100644
Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png
index 15e419b7ccd88651bd21dac36853a827fc4075b8..36e14c48d14a8d3e5bf37d3caaee661061cec3be 100644
Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png
index 342ce34e1663960d8d7050a9be57face3571d336..06dd2a740ec2abaec4919c991dd17ee007ffcf28 100644
Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..b94bcfc081e0b036fbba271d7cbfb986575d4abf
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index a84f1bbfa0cb48a3fc335c9bc4aa7d8e93d20e75..20f520814d7154764932638c5e9dddc32639b677 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -14,37 +14,50 @@
  limitations under the License.
 -->
 <RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
     android:layout_width="match_parent"
     android:layout_height="match_parent">
 
-    <com.example.android.tflitecamerademo.AutoFitTextureView
-        android:id="@+id/texture"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:layout_alignParentBottom="true"
-        android:layout_alignParentStart="true"
-        android:layout_alignParentTop="true" />
-
-    <FrameLayout
-        android:id="@+id/control"
+    <LinearLayout
         android:layout_width="match_parent"
-        android:layout_height="wrap_content"
-        android:layout_alignParentBottom="true"
-        android:layout_alignParentEnd="true"
-        android:layout_alignParentTop="true"
-        android:layout_toRightOf="@id/texture"
-        android:background="@color/control_background"
-        android:orientation="horizontal">
-
-        <TextView android:id="@+id/text"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
+        android:layout_height="match_parent"
+        android:background="#bb7700"
+        android:orientation="horizontal"
+        android:weightSum="100">
+
+        <LinearLayout
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_weight="30"
+            android:orientation="vertical">
+
+            <com.example.android.tflitecamerademo.AutoFitTextureView
+                android:id="@+id/texture"
+                android:layout_width="match_parent"
+                android:layout_height="match_parent"
+                android:layout_weight="100" />
+
+            <ImageView
+                android:id="@+id/logoview"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"
+                android:layout_weight="100"
+                android:scaleType="centerCrop"
+                android:src="@drawable/logo" />
+
+        </LinearLayout>
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_weight="70"
+            android:paddingLeft="5dp"
             android:paddingTop="20dp"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
 
-
-    </FrameLayout>
+    </LinearLayout>
 
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
new file mode 100644
index 0000000000000000000000000000000000000000..72a229ecdb19f5309994e994d82e0b5b5ed617a2
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
@@ -0,0 +1,88 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#bb7700">
+
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_weight="1" />
+
+    <LinearLayout
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true"
+        android:layout_alignParentEnd="false"
+        android:layout_alignParentStart="true"
+        android:layout_alignParentTop="false"
+        android:background="#bb7700"
+        android:orientation="vertical"
+        android:weightSum="100">
+
+        <ImageView
+            android:id="@+id/logoview2"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_weight="30"
+            android:scaleType="fitStart"
+            android:src="@drawable/logo" />
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="wrap_content"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentEnd="true"
+            android:layout_alignParentRight="true"
+            android:layout_weight="30"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+
+    </LinearLayout>
+
+    <RelativeLayout
+        android:id="@+id/control2"
+        android:layout_width="match_parent"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
+        android:background="#bb7700">
+
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index 15305c436e0d997af15a326ab4027ea713ed8098..d12435d5abda45917b8a4f12c4b3179997eae689 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -14,32 +14,102 @@
  limitations under the License.
 -->
 <RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    xmlns:tools="http://schemas.android.com/tools"
     android:layout_width="match_parent"
     android:layout_height="match_parent">
 
+    <LinearLayout
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:orientation="vertical"
+        android:weightSum="60">
+
+        <FrameLayout
+            android:id="@+id/control"
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentStart="true"
+            android:layout_weight="60"
+            android:background="#cc7700"
+            android:paddingLeft="20dp"
+            android:paddingStart="20dp">
+
+        </FrameLayout>
+
     <com.example.android.tflitecamerademo.AutoFitTextureView
         android:id="@+id/texture"
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
         android:layout_alignParentTop="true" />
 
-    <FrameLayout
-        android:id="@+id/control"
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_weight="20"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+    </LinearLayout>
+
+    <RelativeLayout
+        android:id="@+id/control2"
+        android:layout_width="match_parent"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
+        android:background="#bb7700">
+
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
+
+    <RelativeLayout
+        android:id="@+id/control2"
         android:layout_width="match_parent"
-        android:layout_height="112dp"
-        android:layout_alignParentBottom="true"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
         android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
         android:background="@color/control_background">
 
-        <TextView android:id="@+id/text"
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:paddingLeft="80dp"
-            android:textColor="#FFF"
-            android:textSize="20sp"
-            android:textStyle="bold" />
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
 
-    </FrameLayout>
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
 
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
index a08ec3eb629250a727cec49a822375fe5569f455..29a033bcd437c951ef6e8ba78f4fc3a0fcafac96 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
@@ -21,4 +21,6 @@
     <string name="toggle_turn_on">NN:On</string>
     <string name="toggle_turn_off">NN:Off</string>
     <string name="toggle">Use NNAPI</string>
+    <string name="tflite">tflite</string>
+    <string name="nnapi">NNAPI</string>
 </resources>
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..76c33838bfe5b8596d78cae7d022c51d2a379e76
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -0,0 +1,83 @@
+# Benchmarker for LPIRC Workshop at CVPR 2018
+
+This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
+
+## Pre-requesits
+
+Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
+
+## To test the benchmarker:
+
+The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
+
+Note: for now the tests only provides correctness checks, i.e. classifier predicts the correct category on the test image, but no on-device latency measurements. To test the latency measurement functionality, the tests will print the latency running on a desktop computer, which is not indicative of the on-device run-time.
+We are releasing an benchmarker Apk that would allow developers to measure latency on their own devices.
+
+### Obtain the sample models
+
+The test data (models and images) should be downloaded automatically for you by Bazel. In case they are not, you can manually install them as below.
+
+Note: all commands should be called from your tensorflow installation folder (under this folder you should find `tensorflow/contrib/lite`).
+
+
+* Download the [testdata package](https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip):
+
+```sh
+curl -L https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip -o /tmp/ovic.zip
+```
+
+* Unzip the package into the testdata folder:
+
+```sh
+unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+### Run tests
+
+You can run test with Bazel as below. This helps to ensure that the installation is correct.
+
+```sh
+bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java:OvicClassifierTest --test_output=all
+```
+
+### Test your submissions
+
+Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it as below.
+
+* Move your submission to the testdata folder:
+
+Let say the submission file is located at `/tmp/my_model.lite`, then
+
+```sh
+cp /tmp/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+* Resize the test image to the resolutions that are expected by your submission:
+
+The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
+
+* Add your model and test image to the BUILD rule:
+
+```JSON
+java_test(
+  name = "OvicClassifierTest",
+  size = "medium",
+  srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+  data = [
+      "ovic/src/testdata/float_model.lite",
+      "ovic/src/testdata/labels.txt",
+      "ovic/src/testdata/low_res_model.lite",
+      "ovic/src/testdata/quantized_model.lite",
+      "ovic/src/testdata/test_image_128.jpg",
+      "ovic/src/testdata/test_image_224.jpg",
+      "ovic/src/testdata/my_model.lite",        # <--- Your submission.
+      "ovic/src/testdata/my_test_image.jpg",    # <--- Your test image.
+  ],
+      ...
+```
+
+* Modify `OvicClassifierTest.java` to test your model.
+
+Change `TEST_IMAGE_PATH` to `testdata/my_test_image.jpg`. If your model runs inference in floating point, change `FLOAT_MODEL_PATH` to `testdata/my_model.lite`. If your model runs [quantized inference](https://www.tensorflow.org/performance/quantization), change `QUANTIZED_MODEL_PATH` to `testdata/my_model.lite`.
+
+Now you can run the bazel tests to catch any runtime issues with the submission.
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
new file mode 100644
index 0000000000000000000000000000000000000000..d0102883e6b41f5c33a0061c5fd53b5f69b8ab54
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
@@ -0,0 +1,197 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import android.graphics.Bitmap;
+import android.os.SystemClock;
+import android.util.Log;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+
+/**
+ * Class that benchmarks image classifier models.
+ *
+ * <p>===================== General workflow =======================
+ *
+ * <pre>{@code
+ * benchmarker = new OvicBenchmarker();
+ * benchmarker.getReadyToTest(labelInputStream, model);
+ * while (!benchmarker.shouldStop()) {
+ *   Bitmap bitmap = ...
+ *   benchmarker.doTestIteration(bitmap);
+ * }
+ * }</pre>
+ */
+public class OvicBenchmarker {
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicBenchmarker";
+
+  /** Evaluation transformation parameters. */
+  private static final float CENTRAL_FRACTION = 0.875f;
+
+  /** Dimensions of inputs. */
+  private static final int DIM_BATCH_SIZE = 1;
+  private static final int DIM_PIXEL_SIZE = 3;
+  private int imgHeight = 224;
+  private int imgWidth = 224;
+
+  /* Preallocated buffers for storing image data in. */
+  private int[] intValues = null;
+
+  /** A ByteBuffer to hold image data, to be feed into classifier as inputs. */
+  private ByteBuffer imgData = null;
+
+  private OvicClassifier classifier;
+
+  /** Total runtime in ms. */
+  private double totalRuntime = 0.0;
+  /** Total allowed runtime in ms. */
+  private double wallTime = 20000 * 30.0;
+
+  private Boolean benchmarkStarted = null;
+
+  /**
+   * Initializes an {@link OvicBenchmarker}
+   *
+   * @param wallTime: a double number specifying the total amount of time to benchmark.
+   */
+  public OvicBenchmarker(double wallTime) {
+    benchmarkStarted = false;
+    totalRuntime = 0.0;
+    this.wallTime = wallTime;
+  }
+
+  /** Check whether the benchmarker should stop. */
+  public Boolean shouldStop() {
+    if (totalRuntime >= wallTime) {
+      Log.e(
+          TAG,
+          "Total runtime "
+              + Double.toString(totalRuntime)
+              + " exceeded walltime "
+              + Double.toString(wallTime));
+      return true;
+    }
+    return false;
+  }
+
+  /** Check whether the benchmarker is ready to start classifying images. */
+  public Boolean readyToTest() {
+    return (classifier != null);
+  }
+
+  /**
+   * Getting the benchmarker ready for classifying images.
+   *
+   * @param labelInputStream: an {@link InputStream} specifying where the list of labels should be
+   *     read from.
+   * @param model: a {@link MappedByteBuffer} model to benchmark.
+   */
+  public void getReadyToTest(InputStream labelInputStream, MappedByteBuffer model) {
+    try {
+      Log.i(TAG, "Creating classifier.");
+      classifier = new OvicClassifier(labelInputStream, model);
+      int [] inputDims = classifier.getInputDims();
+      imgHeight = inputDims[1];
+      imgWidth = inputDims[2];
+      // Only accept QUANTIZED_UINT8 input.
+      imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
+      imgData.order(ByteOrder.nativeOrder());
+      intValues = new int[imgHeight * imgWidth];
+    } catch (Exception e) {
+        Log.e(TAG, e.getMessage());
+        Log.e(TAG, "Failed to initialize ImageNet classifier for the benchmarker.");
+    }
+  }
+
+  /** Return how many classes are predicted per image. */
+  public int getNumPredictions() {
+    return classifier.getNumPredictions();
+  }
+
+  /**
+   * Perform test on a single bitmap image.
+   *
+   * @param bitmap: a {@link Bitmap} image to classify.
+   */
+  public OvicSingleImageResult doTestIteration(Bitmap bitmap)
+      throws IOException, InterruptedException {
+    if (shouldStop() || !readyToTest()) {
+      return null;
+    }
+    OvicSingleImageResult iterResult = null;
+    try {
+      Log.i(TAG, "Converting bitmap.");
+      convertBitmapToInput(bitmap);
+      Log.i(TAG, "Classifying image.");
+      iterResult = classifier.classifyByteBuffer(imgData);
+    } catch (RuntimeException e) {
+      Log.e(TAG, e.getMessage());
+      Log.e(TAG, "Failed to classify image.");
+    }
+    if (iterResult == null || iterResult.latency == null) {
+      throw new RuntimeException("Classification result or timing is invalid.");
+    }
+    Log.d(TAG, "Native inference latency: " + iterResult.latency);
+    Log.i(TAG, iterResult.toString());
+
+    if (!benchmarkStarted) {  // Skip the first image to discount warming-up time.
+      benchmarkStarted = true;
+    } else {
+      totalRuntime += (double) iterResult.latency;
+    }
+    return iterResult;
+  }
+
+  /**
+   * Writes Image data into a {@link ByteBuffer}.
+   *
+   * @param bitmap: a {@link Bitmap} source image.
+   */
+  private void convertBitmapToInput(Bitmap bitmap) throws RuntimeException {
+    if (imgData == null) {
+      throw new RuntimeException("Benchmarker is not yet ready to test.");
+    }
+    imgData.rewind();
+    // Perform transformations corresponding to evaluation mode.
+    float width = (float) bitmap.getWidth();
+    float height = (float) bitmap.getHeight();
+    int stWidth = Math.round((width - width * CENTRAL_FRACTION) / 2);
+    int stHeight = Math.round((height - height * CENTRAL_FRACTION) / 2);
+    int newWidth = Math.round(width - stWidth * 2);
+    int newHeight = Math.round(height - stHeight * 2);
+    bitmap = Bitmap.createBitmap(bitmap, stWidth, stHeight, newWidth, newHeight);
+    bitmap = Bitmap.createScaledBitmap(bitmap, imgWidth, imgHeight, true);
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+
+    // Convert the image to ByteBuffer.
+    int pixel = 0;
+    long startTime = SystemClock.uptimeMillis();
+
+    for (int i = 0; i < imgHeight; ++i) {
+      for (int j = 0; j < imgWidth; ++j) {
+        final int val = intValues[pixel++];
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..b2dfd8f2e710324f6c11a3098b858ffee8b28b3c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
@@ -0,0 +1,209 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.TestHelper;
+
+/** Benchmark ImageNet Classifier with Tensorflow Lite. */
+public class OvicClassifier {
+
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicClassifier";
+
+  /** Number of results to show (i.e. the "K" in top-K predictions). */
+  private static final int RESULTS_TO_SHOW = 5;
+
+  /** An instance of the driver class to run model inference with Tensorflow Lite. */
+  private Interpreter tflite;
+
+  /** Labels corresponding to the output of the vision model. */
+  private List<String> labelList;
+
+  /** An array to hold inference results, to be feed into Tensorflow Lite as outputs. */
+  private byte[][] inferenceOutputArray = null;
+  /** An array to hold final prediction probabilities. */
+  private float[][] labelProbArray = null;
+
+  /** Input resultion. */
+  private int[] inputDims = null;
+  /** Whether the model runs as float or quantized. */
+  private Boolean outputIsFloat = null;
+
+  private PriorityQueue<Map.Entry<Integer, Float>> sortedLabels =
+      new PriorityQueue<>(
+          RESULTS_TO_SHOW,
+          new Comparator<Map.Entry<Integer, Float>>() {
+            @Override
+            public int compare(Map.Entry<Integer, Float> o1, Map.Entry<Integer, Float> o2) {
+              return (o1.getValue()).compareTo(o2.getValue());
+            }
+          });
+
+  /** Initializes an {@code OvicClassifier}. */
+  OvicClassifier(InputStream labelInputStream, MappedByteBuffer model)
+      throws IOException, RuntimeException {
+    if (model == null) {
+      throw new RuntimeException("Input model is empty.");
+    }
+    labelList = loadLabelList(labelInputStream);
+    // OVIC uses one thread for CPU inference.
+    tflite = new Interpreter(model, 1);
+    inputDims = TestHelper.getInputDims(tflite, 0);
+    if (inputDims.length != 4) {
+      throw new RuntimeException("The model's input dimensions must be 4 (BWHC).");
+    }
+    if (inputDims[0] != 1) {
+      throw new RuntimeException("The model must have a batch size of 1, got "
+          + inputDims[0] + " instead.");
+    }
+    if (inputDims[3] != 3) {
+      throw new RuntimeException("The model must have three color channels, got "
+          + inputDims[3] + " instead.");
+    }
+    int minSide = Math.min(inputDims[1], inputDims[2]);
+    int maxSide = Math.max(inputDims[1], inputDims[2]);
+    if (minSide <= 0 || maxSide > 1000) {
+      throw new RuntimeException("The model's resolution must be between (0, 1000].");
+    }
+    String outputDataType = TestHelper.getOutputDataType(tflite, 0);
+    if (outputDataType.equals("float")) {
+      outputIsFloat = true;
+    } else if (outputDataType.equals("byte")) {
+      outputIsFloat = false;
+    } else {
+      throw new RuntimeException("Cannot process output type: " + outputDataType);
+    }
+    inferenceOutputArray = new byte[1][labelList.size()];
+    labelProbArray = new float[1][labelList.size()];
+  }
+
+  /** Classifies a {@link ByteBuffer} image. */
+  // @throws RuntimeException if model is uninitialized.
+  OvicSingleImageResult classifyByteBuffer(ByteBuffer imgData) throws RuntimeException {
+    if (tflite == null) {
+      throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
+    }
+    if (outputIsFloat == null) {
+      throw new RuntimeException(TAG + ": Classifier output type has not been resolved.");
+    }
+    if (outputIsFloat) {
+      tflite.run(imgData, labelProbArray);
+    } else {
+      tflite.run(imgData, inferenceOutputArray);
+      /** Convert results to float */
+      for (int i = 0; i < inferenceOutputArray[0].length; i++) {
+        labelProbArray[0][i] = (inferenceOutputArray[0][i] & 0xff) / 255.0f;
+      }
+    }
+    OvicSingleImageResult iterResult = computeTopKLabels();
+    iterResult.latency = getLastNativeInferenceLatencyMilliseconds();
+    return iterResult;
+  }
+
+  /** Return the probability array of all classes. */
+  public float[][] getlabelProbArray() {
+    return labelProbArray;
+  }
+
+  /** Return the number of top labels predicted by the classifier. */
+  public int getNumPredictions() {
+    return RESULTS_TO_SHOW;
+  }
+
+  /** Return the four dimensions of the input image. */
+  public int[] getInputDims() {
+    return inputDims;
+  }
+
+  /*
+   * Get native inference latency of last image classification run.
+   *  @throws RuntimeException if model is uninitialized.
+   */
+  public Long getLastNativeInferenceLatencyMilliseconds() {
+    if (tflite == null) {
+      throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
+    }
+    Long latency = tflite.getLastNativeInferenceDurationNanoseconds();
+    return (latency == null) ? null : (Long) (latency / 1000000);
+  }
+
+  /** Closes tflite to release resources. */
+  public void close() {
+    tflite.close();
+    tflite = null;
+  }
+
+  /** Reads label list from Assets. */
+  private static List<String> loadLabelList(InputStream labelInputStream) throws IOException {
+    List<String> labelList = new ArrayList<String>();
+    try (BufferedReader reader =
+        new BufferedReader(new InputStreamReader(labelInputStream, StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        labelList.add(line);
+      }
+    }
+    return labelList;
+  }
+
+  /** Computes top-K labels. */
+  private OvicSingleImageResult computeTopKLabels() {
+    if (labelList == null) {
+      throw new RuntimeException("Label file has not been loaded.");
+    }
+    for (int i = 0; i < labelList.size(); ++i) {
+      sortedLabels.add(new AbstractMap.SimpleEntry<>(i, labelProbArray[0][i]));
+      if (sortedLabels.size() > RESULTS_TO_SHOW) {
+        sortedLabels.poll();
+      }
+    }
+    OvicSingleImageResult singleImageResult = new OvicSingleImageResult();
+    if (sortedLabels.size() != RESULTS_TO_SHOW) {
+      throw new RuntimeException(
+          "Number of returned labels does not match requirement: "
+              + sortedLabels.size()
+              + " returned, but "
+              + RESULTS_TO_SHOW
+              + " required.");
+    }
+    for (int i = 0; i < RESULTS_TO_SHOW; ++i) {
+      Map.Entry<Integer, Float> label = sortedLabels.poll();
+      // ImageNet model prediction indices are 0-based.
+      singleImageResult.topKIndices.add(label.getKey());
+      singleImageResult.topKClasses.add(labelList.get(label.getKey()));
+      singleImageResult.topKProbs.add(label.getValue());
+    }
+    // Labels with lowest probability are returned first, hence need to reverse them.
+    Collections.reverse(singleImageResult.topKIndices);
+    Collections.reverse(singleImageResult.topKClasses);
+    Collections.reverse(singleImageResult.topKProbs);
+    return singleImageResult;
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java
new file mode 100644
index 0000000000000000000000000000000000000000..4af9a65c2f45c57b979bf9629e34f52bb0853a44
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java
@@ -0,0 +1,54 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.util.ArrayList;
+
+/** Result class for inference run on a single image. */
+public class OvicSingleImageResult {
+
+  /** Top K classes and probabilities. */
+  public ArrayList<String> topKClasses;
+  public ArrayList<Float> topKProbs;
+  public ArrayList<Integer> topKIndices;
+
+  /** Latency (ms). */
+  public Long latency;
+
+  OvicSingleImageResult() {
+    topKClasses = new ArrayList<>();
+    topKProbs = new ArrayList<>();
+    topKIndices = new ArrayList<>();
+    latency = -1L;
+  }
+
+  @Override
+  public String toString() {
+    String textToShow = latency + "ms";
+    for (int k = 0; k < topKProbs.size(); ++k) {
+      textToShow +=
+          "\nPrediction ["
+              + k
+              + "] = Class "
+              + Integer.toString(topKIndices.get(k))
+              + " ("
+              + topKClasses.get(k)
+              + ") : "
+              + Float.toString(topKProbs.get(k));
+    }
+    return textToShow;
+  }
+
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..098ed8ceba52b6d44868353b80cab0862e334d4d
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -0,0 +1,177 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.fail;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import javax.imageio.ImageIO;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.ovic.OvicClassifier}. */
+@RunWith(JUnit4.class)
+public final class OvicClassifierTest {
+
+  private OvicClassifier classifier;
+  private InputStream labelsInputStream = null;
+  private MappedByteBuffer quantizedModel = null;
+  private MappedByteBuffer floatModel = null;
+  private MappedByteBuffer lowResModel = null;
+  private ByteBuffer testImage = null;
+  private ByteBuffer lowResTestImage = null;
+  private OvicSingleImageResult testResult = null;
+  private static final String LABELS_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+  private static final String QUANTIZED_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/quantized_model.lite";
+  private static final String LOW_RES_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/low_res_model.lite";
+  private static final String FLOAT_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/float_model.lite";
+  private static final String TEST_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_224.jpg";
+  private static final String TEST_LOW_RES_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_128.jpg";
+  private static final int TEST_IMAGE_GROUNDTRUTH = 653; // "military uniform"
+
+  @Before
+  public void setUp() {
+    try {
+      File labelsfile = new File(LABELS_PATH);
+      labelsInputStream = new FileInputStream(labelsfile);
+      quantizedModel = loadModelFile(QUANTIZED_MODEL_PATH);
+      floatModel = loadModelFile(FLOAT_MODEL_PATH);
+      lowResModel = loadModelFile(LOW_RES_MODEL_PATH);
+      File imageFile = new File(TEST_IMAGE_PATH);
+      BufferedImage img = ImageIO.read(imageFile);
+      testImage = toByteBuffer(img);
+      // Low res image and models.
+      imageFile = new File(TEST_LOW_RES_IMAGE_PATH);
+      img = ImageIO.read(imageFile);
+      lowResTestImage = toByteBuffer(img);
+    } catch (IOException e) {
+      System.out.print(e.getMessage());
+    }
+    System.out.println("Successful setup");
+  }
+
+  @Test
+  public void ovicClassifier_quantizedModelCreateSuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, quantizedModel);
+    assertThat(classifier != null).isTrue();
+  }
+
+  @Test
+  public void ovicClassifier_floatModelCreateSuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, floatModel);
+    assertThat(classifier != null).isTrue();
+  }
+
+  @Test
+  public void ovicClassifier_quantizedModelClassifySuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, quantizedModel);
+    testResult = classifier.classifyByteBuffer(testImage);
+    assertCorrectTopK(testResult);
+  }
+
+  @Test
+  public void ovicClassifier_floatModelClassifySuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, floatModel);
+    testResult = classifier.classifyByteBuffer(testImage);
+    assertCorrectTopK(testResult);
+  }
+
+  @Test
+  public void ovicClassifier_lowResModelClassifySuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, lowResModel);
+    testResult = classifier.classifyByteBuffer(lowResTestImage);
+    assertCorrectTopK(testResult);
+  }
+
+  @Test
+  public void ovicClassifier_latencyNotNull() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, floatModel);
+    testResult = classifier.classifyByteBuffer(testImage);
+    assertThat(testResult.latency != null).isTrue();
+  }
+
+  @Test
+  public void ovicClassifier_mismatchedInputResolutionFails() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, lowResModel);
+    int[] inputDims = classifier.getInputDims();
+    assertThat((inputDims[1] == 128) && (inputDims[2] == 128)).isTrue();
+    try {
+      testResult = classifier.classifyByteBuffer(testImage);
+      fail();
+    } catch (RuntimeException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Failed to get input dimensions. 0-th input should have 49152 bytes, "
+                  + "but found 150528 bytes.");
+    }
+  }
+
+  private static ByteBuffer toByteBuffer(BufferedImage image) {
+    ByteBuffer imgData = ByteBuffer.allocateDirect(
+        image.getHeight() * image.getWidth() * 3);
+    imgData.order(ByteOrder.nativeOrder());
+    for (int y = 0; y < image.getHeight(); y++) {
+      for (int x = 0; x < image.getWidth(); x++) {
+        int val = image.getRGB(x, y);
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    return imgData;
+  }
+
+  private static void assertCorrectTopK(OvicSingleImageResult testResult) {
+    assertThat(testResult.topKClasses.size() > 0).isTrue();
+    Boolean topKAccurate = false;
+    // Assert that the correct class is in the top K.
+    for (int i = 0; i < testResult.topKIndices.size(); i++) {
+      if (testResult.topKIndices.get(i) == TEST_IMAGE_GROUNDTRUTH) {
+        topKAccurate = true;
+        break;
+      }
+    }
+    System.out.println(testResult.toString());
+    System.out.flush();
+    assertThat(topKAccurate).isTrue();
+  }
+
+  private static MappedByteBuffer loadModelFile(String modelFilePath) throws IOException {
+    File modelfile = new File(modelFilePath);
+    FileInputStream inputStream = new FileInputStream(modelfile);
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = 0L;
+    long declaredLength = fileChannel.size();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt b/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe811239d8e2989de19fecabb1ebb0c9dddac514
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt
@@ -0,0 +1,1001 @@
+background
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/contrib/lite/java/proguard.flags b/tensorflow/contrib/lite/java/proguard.flags
new file mode 100644
index 0000000000000000000000000000000000000000..8ee3d7e7ae728b27789336ac56208acdf13ee424
--- /dev/null
+++ b/tensorflow/contrib/lite/java/proguard.flags
@@ -0,0 +1,3 @@
+-keepclassmembers class org.tensorflow.lite.NativeInterpreterWrapper {
+    private long inferenceDurationNanoseconds;
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
index d63c299589d2e8ce1051a52d29b533ed126bbcf7..75334cd96e8daadc356dadea063eee30ef6d5245 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -51,7 +51,11 @@ enum DataType {
       }
     }
     throw new IllegalArgumentException(
-        "DataType " + c + " is not recognized in Java (version " + TensorFlowLite.version() + ")");
+        "DataType error: DataType "
+            + c
+            + " is not recognized in Java (version "
+            + TensorFlowLite.version()
+            + ")");
   }
 
   /** Returns byte size of the type. */
@@ -68,7 +72,26 @@ enum DataType {
       case BYTEBUFFER:
         return 1;
     }
-    throw new IllegalArgumentException("DataType " + this + " is not supported yet");
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
+  }
+
+  /** Gets string names of the data type. */
+  String toStringName() {
+    switch (this) {
+      case FLOAT32:
+        return "float";
+      case INT32:
+        return "int";
+      case UINT8:
+        return "byte";
+      case INT64:
+        return "long";
+      case BYTEBUFFER:
+        return "ByteBuffer";
+    }
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
   }
 
   // Cached to avoid copying it
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index dd883d69d2065236ee29012b9bde99972aefbcf7..e84ee7112983ec584308b7cbcd919f119eccbcc9 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -19,7 +19,7 @@ import java.io.File;
 import java.nio.MappedByteBuffer;
 import java.util.HashMap;
 import java.util.Map;
-import javax.validation.constraints.NotNull;
+import org.checkerframework.checker.nullness.qual.NonNull;
 
 /**
  * Driver class to drive model inference with TensorFlow Lite.
@@ -60,33 +60,60 @@ public final class Interpreter implements AutoCloseable {
    *
    * @param modelFile: a File of a pre-trained TF Lite model.
    */
-  public Interpreter(@NotNull File modelFile) {
+  public Interpreter(@NonNull File modelFile) {
     if (modelFile == null) {
       return;
     }
     wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath());
   }
 
+  /**
+   * Initializes a {@code Interpreter} and specifies the number of threads used for inference.
+   *
+   * @param modelFile: a file of a pre-trained TF Lite model
+   * @param numThreads: number of threads to use for inference
+   */
+  public Interpreter(@NonNull File modelFile, int numThreads) {
+    if (modelFile == null) {
+      return;
+    }
+    wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath(), numThreads);
+  }
+
   /**
    * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
    *
    * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
    * Interpreter}.
    */
-  public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer) {
+  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer) {
     wrapper = new NativeInterpreterWrapper(mappedByteBuffer);
   }
 
+  /**
+   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file and
+   * specifies the number of threads used for inference.
+   *
+   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
+   * Interpreter}.
+   */
+  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer, int numThreads) {
+    wrapper = new NativeInterpreterWrapper(mappedByteBuffer, numThreads);
+  }
+
   /**
    * Runs model inference if the model takes only one input, and provides only one output.
    *
+   * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
+   * consider using {@link ByteBuffer} to feed input data for better performance.
+   *
    * @param input an array or multidimensional array, or a {@link ByteBuffer} of primitive types
    *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
    *     input data. When {@link ByteBuffer} is used, its content should remain unchanged until
    *     model inference is done.
    * @param output a multidimensional array of output data.
    */
-  public void run(@NotNull Object input, @NotNull Object output) {
+  public void run(@NonNull Object input, @NonNull Object output) {
     Object[] inputs = {input};
     Map<Integer, Object> outputs = new HashMap<>();
     outputs.put(0, output);
@@ -96,6 +123,9 @@ public final class Interpreter implements AutoCloseable {
   /**
    * Runs model inference if the model takes multiple inputs, or returns multiple outputs.
    *
+   * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
+   * consider using {@link ByteBuffer} to feed input data for better performance.
+   *
    * @param inputs an array of input data. The inputs should be in the same order as inputs of the
    *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
    *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
@@ -105,19 +135,21 @@ public final class Interpreter implements AutoCloseable {
    *     needs to keep entries for the outputs to be used.
    */
   public void runForMultipleInputsOutputs(
-      @NotNull Object[] inputs, @NotNull Map<Integer, Object> outputs) {
+      @NonNull Object[] inputs, @NonNull Map<Integer, Object> outputs) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     Tensor[] tensors = wrapper.run(inputs);
     if (outputs == null || tensors == null || outputs.size() > tensors.length) {
-      throw new IllegalArgumentException("Outputs do not match with model outputs.");
+      throw new IllegalArgumentException("Output error: Outputs do not match with model outputs.");
     }
     final int size = tensors.length;
     for (Integer idx : outputs.keySet()) {
       if (idx == null || idx < 0 || idx >= size) {
         throw new IllegalArgumentException(
-            String.format("Invalid index of output %d (should be in range [0, %d))", idx, size));
+            String.format(
+                "Output error: Invalid index of output %d (should be in range [0, %d))",
+                idx, size));
       }
       tensors[idx].copyTo(outputs.get(idx));
     }
@@ -128,9 +160,9 @@ public final class Interpreter implements AutoCloseable {
    *
    * <p>IllegalArgumentException will be thrown if it fails to resize.
    */
-  public void resizeInput(int idx, @NotNull int[] dims) {
+  public void resizeInput(int idx, @NonNull int[] dims) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     wrapper.resizeInput(idx, dims);
   }
@@ -143,7 +175,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public int getInputIndex(String opName) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     return wrapper.getInputIndex(opName);
   }
@@ -156,11 +188,40 @@ public final class Interpreter implements AutoCloseable {
    */
   public int getOutputIndex(String opName) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     return wrapper.getOutputIndex(opName);
   }
 
+  /**
+   * Returns native inference timing.
+   * <p>IllegalArgumentException will be thrown if the model is not initialized by the
+   * {@link Interpreter}.
+   */
+  public Long getLastNativeInferenceDurationNanoseconds() {
+    if (wrapper == null) {
+      throw new IllegalStateException("Internal error: The interpreter has already been closed.");
+    }
+    return wrapper.getLastNativeInferenceDurationNanoseconds();
+  }
+
+  /** Turns on/off Android NNAPI for hardware acceleration when it is available. */
+  public void setUseNNAPI(boolean useNNAPI) {
+    if (wrapper != null) {
+      wrapper.setUseNNAPI(useNNAPI);
+    } else {
+      throw new IllegalStateException(
+          "Internal error: NativeInterpreterWrapper has already been closed.");
+    }
+  }
+
+  public void setNumThreads(int num_threads) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The interpreter has already been closed.");
+    }
+    wrapper.setNumThreads(num_threads);
+  }
+
   /** Release resources associated with the {@code Interpreter}. */
   @Override
   public void close() {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 5ee594dec492ad2fee22e603a6de311b3fed4cac..2fc803715be5e5afbc5d548d46c1665785f6055d 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -32,9 +32,14 @@ import java.util.Map;
 final class NativeInterpreterWrapper implements AutoCloseable {
 
   NativeInterpreterWrapper(String modelPath) {
+    this(modelPath, /* numThreads= */ -1);
+  }
+
+  NativeInterpreterWrapper(String modelPath, int numThreads) {
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModel(modelPath, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
+    isMemoryAllocated = true;
   }
 
   /**
@@ -43,10 +48,20 @@ final class NativeInterpreterWrapper implements AutoCloseable {
    * NativeInterpreterWrapper}.
    */
   NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer) {
+    this(mappedByteBuffer, /* numThreads= */ -1);
+  }
+
+  /**
+   * Initializes a {@code NativeInterpreterWrapper} with a {@code MappedByteBuffer} and specifies
+   * the number of inference threads. The MappedByteBuffer should not be modified after the
+   * construction of a {@code NativeInterpreterWrapper}.
+   */
+  NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer, int numThreads) {
     modelByteBuffer = mappedByteBuffer;
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
+    isMemoryAllocated = true;
   }
 
   /** Releases resources associated with this {@code NativeInterpreterWrapper}. */
@@ -59,12 +74,13 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     modelByteBuffer = null;
     inputsIndexes = null;
     outputsIndexes = null;
+    isMemoryAllocated = false;
   }
 
   /** Sets inputs, runs model inference and returns outputs. */
   Tensor[] run(Object[] inputs) {
     if (inputs == null || inputs.length == 0) {
-      throw new IllegalArgumentException("Invalid inputs. Inputs should not be null or empty.");
+      throw new IllegalArgumentException("Input error: Inputs should not be null or empty.");
     }
     int[] dataTypes = new int[inputs.length];
     Object[] sizes = new Object[inputs.length];
@@ -76,7 +92,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
         ByteBuffer buffer = (ByteBuffer) inputs[i];
         if (buffer.order() != ByteOrder.nativeOrder()) {
           throw new IllegalArgumentException(
-              "Invalid ByteBuffer. It shoud use ByteOrder.nativeOrder().");
+              "Input error: ByteBuffer shoud use ByteOrder.nativeOrder().");
         }
         numsOfBytes[i] = buffer.limit();
         sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]);
@@ -87,15 +103,25 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       } else {
         throw new IllegalArgumentException(
             String.format(
-                "%d-th element of the %d inputs is not an array or a ByteBuffer.",
+                "Input error: %d-th element of the %d inputs is not an array or a ByteBuffer.",
                 i, inputs.length));
       }
     }
+    inferenceDurationNanoseconds = -1;
     long[] outputsHandles =
-        run(interpreterHandle, errorHandle, sizes, dataTypes, numsOfBytes, inputs);
+        run(
+            interpreterHandle,
+            errorHandle,
+            sizes,
+            dataTypes,
+            numsOfBytes,
+            inputs,
+            this,
+            isMemoryAllocated);
     if (outputsHandles == null || outputsHandles.length == 0) {
-      throw new IllegalStateException("Interpreter has no outputs.");
+      throw new IllegalStateException("Internal error: Interpreter has no outputs.");
     }
+    isMemoryAllocated = true;
     Tensor[] outputs = new Tensor[outputsHandles.length];
     for (int i = 0; i < outputsHandles.length; ++i) {
       outputs[i] = Tensor.fromHandle(outputsHandles[i]);
@@ -109,20 +135,28 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       Object[] sizes,
       int[] dtypes,
       int[] numsOfBytes,
-      Object[] values);
+      Object[] values,
+      NativeInterpreterWrapper wrapper,
+      boolean memoryAllocated);
 
   /** Resizes dimensions of a specific input. */
   void resizeInput(int idx, int[] dims) {
-    resizeInput(interpreterHandle, errorHandle, idx, dims);
+    if (resizeInput(interpreterHandle, errorHandle, idx, dims)) {
+      isMemoryAllocated = false;
+    }
   }
 
-  private static native void resizeInput(
+  private static native boolean resizeInput(
       long interpreterHandle, long errorHandle, int inputIdx, int[] dims);
 
   void setUseNNAPI(boolean useNNAPI) {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
+  void setNumThreads(int num_threads) {
+    numThreads(interpreterHandle, num_threads);
+  }
+
   /** Gets index of an input given its name. */
   int getInputIndex(String name) {
     if (inputsIndexes == null) {
@@ -139,7 +173,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     } else {
       throw new IllegalArgumentException(
           String.format(
-              "%s is not a valid name for any input. The indexes of the inputs are %s",
+              "Input error: %s is not a valid name for any input. "
+                  + "The indexes of the inputs are %s",
               name, inputsIndexes.toString()));
     }
   }
@@ -160,7 +195,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     } else {
       throw new IllegalArgumentException(
           String.format(
-              "%s is not a valid name for any output. The indexes of the outputs are %s",
+              "Input error: %s is not a valid name for any output. "
+                  + "The indexes of the outputs are %s",
               name, outputsIndexes.toString()));
     }
   }
@@ -199,7 +235,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
         return DataType.BYTEBUFFER;
       }
     }
-    throw new IllegalArgumentException("cannot resolve DataType of " + o.getClass().getName());
+    throw new IllegalArgumentException(
+        "DataType error: cannot resolve DataType of " + o.getClass().getName());
   }
 
   /** Returns the shape of an object as an int array. */
@@ -215,7 +252,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       return 0;
     }
     if (Array.getLength(o) == 0) {
-      throw new IllegalArgumentException("array lengths cannot be 0.");
+      throw new IllegalArgumentException("Array lengths cannot be 0.");
     }
     return 1 + numDimensions(Array.get(o, 0));
   }
@@ -229,13 +266,42 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       shape[dim] = len;
     } else if (shape[dim] != len) {
       throw new IllegalArgumentException(
-          String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
+          String.format("Mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
     }
     for (int i = 0; i < len; ++i) {
       fillShape(Array.get(o, i), dim + 1, shape);
     }
   }
 
+  /**
+   * Gets the last inference duration in nanoseconds. It returns null if there is no previous
+   * inference run or the last inference run failed.
+   */
+  Long getLastNativeInferenceDurationNanoseconds() {
+    return (inferenceDurationNanoseconds < 0) ? null : inferenceDurationNanoseconds;
+  }
+
+  /**
+   * Gets the dimensions of an input. It throws IllegalArgumentException if input index is invalid.
+   */
+  int[] getInputDims(int index) {
+    return getInputDims(interpreterHandle, index, -1);
+  }
+
+  /**
+   * Gets the dimensions of an input. If numBytes >= 0, it will check whether num of bytes match the
+   * input.
+   */
+  private static native int[] getInputDims(long interpreterHandle, int inputIdx, int numBytes);
+
+  /** Gets the type of an output. It throws IllegalArgumentException if output index is invalid. */
+  String getOutputDataType(int index) {
+    int type = getOutputDataType(interpreterHandle, index);
+    return DataType.fromNumber(type).toStringName();
+  }
+
+  private static native int getOutputDataType(long interpreterHandle, int outputIdx);
+
   private static final int ERROR_BUFFER_SIZE = 512;
 
   private long errorHandle;
@@ -246,30 +312,34 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private int inputSize;
 
+  private long inferenceDurationNanoseconds = -1;
+
   private MappedByteBuffer modelByteBuffer;
 
   private Map<String, Integer> inputsIndexes;
 
   private Map<String, Integer> outputsIndexes;
 
+  private boolean isMemoryAllocated = false;
+
   private static native String[] getInputNames(long interpreterHandle);
 
   private static native String[] getOutputNames(long interpreterHandle);
 
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
+  private static native void numThreads(long interpreterHandle, int num_threads);
+
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
 
   private static native long createModelWithBuffer(MappedByteBuffer modelBuffer, long errorHandle);
 
-  private static native long createInterpreter(long modelHandle, long errorHandle);
+  private static native long createInterpreter(long modelHandle, long errorHandle, int numThreads);
 
   private static native void delete(long errorHandle, long modelHandle, long interpreterHandle);
 
-  private static native int[] getInputDims(long interpreterHandle, int inputIdx, int numBytes);
-
   static {
     TensorFlowLite.init();
   }
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 54ace6c63ce5bd1b38be744176d0378e3cc8a1d3..09e887aae3339e9f114c07d689c0d7b5e2fc384b 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -34,15 +34,16 @@ final class Tensor {
     if (NativeInterpreterWrapper.dataTypeOf(dst) != dtype) {
       throw new IllegalArgumentException(
           String.format(
-              "Cannot convert an TensorFlowLite tensor with type %s to a Java object of "
-                  + "type %s (which is compatible with the TensorFlowLite type %s)",
+              "Output error: Cannot convert an TensorFlowLite tensor with type %s to a Java "
+                  + "object of type %s (which is compatible with the TensorFlowLite type %s)",
               dtype, dst.getClass().getName(), NativeInterpreterWrapper.dataTypeOf(dst)));
     }
     int[] dstShape = NativeInterpreterWrapper.shapeOf(dst);
     if (!Arrays.equals(dstShape, shapeCopy)) {
       throw new IllegalArgumentException(
           String.format(
-              "Shape of output target %s does not match with the shape of the Tensor %s.",
+              "Output error: Shape of output target %s does not match with the shape of the "
+                  + "Tensor %s.",
               Arrays.toString(dstShape), Arrays.toString(shapeCopy)));
     }
     readMultiDimensionalArray(nativeHandle, dst);
diff --git a/tensorflow/contrib/lite/java/src/main/native/BUILD b/tensorflow/contrib/lite/java/src/main/native/BUILD
index 15806d57c8ed7a45d2db9b80e2aab8e22349ee3e..4399ed202597082fba36c04a744bf6378e4539a2 100644
--- a/tensorflow/contrib/lite/java/src/main/native/BUILD
+++ b/tensorflow/contrib/lite/java/src/main/native/BUILD
@@ -11,6 +11,7 @@ licenses(["notice"])  # Apache 2.0
 cc_library(
     name = "native_framework_only",
     srcs = [
+        "duration_utils_jni.cc",
         "exception_jni.cc",
         "nativeinterpreterwrapper_jni.cc",
         "tensor_jni.cc",
@@ -94,15 +95,3 @@ exports_files(
         "version_script.lds",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/src/main/native/duration_utils_jni.cc b/tensorflow/contrib/lite/java/src/main/native/duration_utils_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e08a04370592f6e3c92b5811fa7e163f808e03c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/duration_utils_jni.cc
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+#include <time.h>
+
+namespace tflite {
+
+// Gets the elapsed wall-clock timespec.
+timespec getCurrentTime() {
+  timespec time;
+  clock_gettime(CLOCK_MONOTONIC, &time);
+  return time;
+}
+
+// Computes the time diff from two timespecs. Returns '-1' if 'stop' is earlier
+// than 'start'.
+jlong timespec_diff_nanoseconds(struct timespec* start, struct timespec* stop) {
+  jlong result = stop->tv_sec - start->tv_sec;
+  if (result < 0) return -1;
+  result = 1000000000 * result + (stop->tv_nsec - start->tv_nsec);
+  if (result < 0) return -1;
+  return result;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
index 1578c9e3ddd034ad9ce17c8c3ae6c942258e2a55..34d91be04cd6c855a2068510ca810c0b93637584 100644
--- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
@@ -44,7 +44,8 @@ BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) {
   buffer_ = new char[limit];
   if (!buffer_) {
     throwException(env, kNullPointerException,
-                   "Malloc of BufferErrorReporter to hold %d char failed.",
+                   "Internal error: Malloc of BufferErrorReporter to hold %d "
+                   "char failed.",
                    limit);
     return;
   }
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index c346f9f92e360c0722ebac440d790da6441ceecf..45f510da1d940a288e2794cb3e08f66451956b64 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h"
-
 namespace {
 
 const int kByteBufferValue = 999;
@@ -23,7 +22,7 @@ const int kBufferSize = 256;
 tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to Interpreter.");
+                   "Internal error: Invalid handle to Interpreter.");
     return nullptr;
   }
   return reinterpret_cast<tflite::Interpreter*>(handle);
@@ -31,7 +30,8 @@ tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
 
 tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
   if (handle == 0) {
-    throwException(env, kIllegalArgumentException, "Invalid handle to model.");
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Invalid handle to model.");
     return nullptr;
   }
   return reinterpret_cast<tflite::FlatBufferModel*>(handle);
@@ -40,7 +40,7 @@ tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
 BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to ErrorReporter.");
+                   "Internal error: Invalid handle to ErrorReporter.");
     return nullptr;
   }
   return reinterpret_cast<BufferErrorReporter*>(handle);
@@ -52,7 +52,7 @@ std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) {
   jint* ptr = env->GetIntArrayElements(inputs, nullptr);
   if (ptr == nullptr) {
     throwException(env, kIllegalArgumentException,
-                   "Empty dimensions of input array.");
+                   "Array has empty dimensions.");
     return {};
   }
   for (int i = 0; i < size; ++i) {
@@ -79,6 +79,21 @@ TfLiteType resolveDataType(jint data_type) {
   }
 }
 
+int getDataType(TfLiteType data_type) {
+  switch (data_type) {
+    case kTfLiteFloat32:
+      return 1;
+    case kTfLiteInt32:
+      return 2;
+    case kTfLiteUInt8:
+      return 3;
+    case kTfLiteInt64:
+      return 4;
+    default:
+      return -1;
+  }
+}
+
 void printDims(char* buffer, int max_size, int* dims, int num_dims) {
   if (max_size <= 0) return;
   buffer[0] = '?';
@@ -99,7 +114,7 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                          jobjectArray sizes) {
   if (input_size != interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Expected num of inputs is %d but got %d",
+                   "Input error: Expected num of inputs is %d but got %d",
                    interpreter->inputs().size(), input_size);
     return kTfLiteError;
   }
@@ -107,8 +122,9 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
       input_size != env->GetArrayLength(nums_of_bytes) ||
       input_size != env->GetArrayLength(values)) {
     throwException(env, kIllegalArgumentException,
-                   "Arrays in arguments should be of the same length, but got "
-                   "%d sizes, %d data_types, %d nums_of_bytes, and %d values",
+                   "Internal error: Arrays in arguments should be of the same "
+                   "length, but got %d sizes, %d data_types, %d nums_of_bytes, "
+                   "and %d values",
                    input_size, env->GetArrayLength(data_types),
                    env->GetArrayLength(nums_of_bytes),
                    env->GetArrayLength(values));
@@ -122,8 +138,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
     int num_dims = static_cast<int>(env->GetArrayLength(dims));
     if (target->dims->size != num_dims) {
       throwException(env, kIllegalArgumentException,
-                     "%d-th input should have %d dimensions, but found %d "
-                     "dimensions",
+                     "Input error: %d-th input should have %d dimensions, but "
+                     "found %d dimensions",
                      i, target->dims->size, num_dims);
       return kTfLiteError;
     }
@@ -136,7 +152,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                   num_dims);
         printDims(obtained_dims.get(), kBufferSize, ptr, num_dims);
         throwException(env, kIllegalArgumentException,
-                       "%d-th input dimension should be [%s], but found [%s]",
+                       "Input error: %d-th input dimension should be [%s], but "
+                       "found [%s]",
                        i, expected_dims.get(), obtained_dims.get());
         env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT);
         return kTfLiteError;
@@ -149,6 +166,45 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
   return kTfLiteOk;
 }
 
+// Checks whether there is any difference between dimensions of a tensor and a
+// given dimensions. Returns true if there is difference, else false.
+bool areDimsDifferent(JNIEnv* env, TfLiteTensor* tensor, jintArray dims) {
+  int num_dims = static_cast<int>(env->GetArrayLength(dims));
+  jint* ptr = env->GetIntArrayElements(dims, nullptr);
+  if (ptr == nullptr) {
+    throwException(env, kIllegalArgumentException,
+                   "Empty dimensions of input array.");
+    return true;
+  }
+  if (tensor->dims->size != num_dims) {
+    return true;
+  }
+  for (int i = 0; i < num_dims; ++i) {
+    if (ptr[i] != tensor->dims->data[i]) {
+      return true;
+    }
+  }
+  env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT);
+  return false;
+}
+
+bool areInputDimensionsTheSame(JNIEnv* env, tflite::Interpreter* interpreter,
+                               int input_size, jobjectArray sizes) {
+  if (interpreter->inputs().size() != input_size) {
+    return false;
+  }
+  for (int i = 0; i < input_size; ++i) {
+    int input_idx = interpreter->inputs()[i];
+    jintArray dims =
+        static_cast<jintArray>(env->GetObjectArrayElement(sizes, i));
+    TfLiteTensor* target = interpreter->tensor(input_idx);
+    if (areDimsDifferent(env, target, dims)) return false;
+    env->DeleteLocalRef(dims);
+    if (env->ExceptionCheck()) return false;
+  }
+  return true;
+}
+
 TfLiteStatus resizeInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                           int input_size, jobjectArray sizes) {
   for (int i = 0; i < input_size; ++i) {
@@ -183,8 +239,8 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter,
       TfLiteType type = resolveDataType(data_type[i]);
       if (type != target->type) {
         throwException(env, kIllegalArgumentException,
-                       "DataType (%d) of input data does not match with the "
-                       "DataType (%d) of model inputs.",
+                       "Input error: DataType (%d) of input data does not "
+                       "match with the DataType (%d) of model inputs.",
                        type, target->type);
         return kTfLiteError;
       }
@@ -217,7 +273,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env,
   jclass string_class = env->FindClass("java/lang/String");
   if (string_class == nullptr) {
     throwException(env, kUnsupportedOperationException,
-                   "Can not find java/lang/String class to get input names.");
+                   "Internal error: Can not find java/lang/String class to get "
+                   "input names.");
     return nullptr;
   }
   size_t size = interpreter->inputs().size();
@@ -239,7 +296,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
   jclass string_class = env->FindClass("java/lang/String");
   if (string_class == nullptr) {
     throwException(env, kUnsupportedOperationException,
-                   "Can not find java/lang/String class to get output names.");
+                   "Internal error: Can not find java/lang/String class to get "
+                   "output names.");
     return nullptr;
   }
   size_t size = interpreter->outputs().size();
@@ -262,6 +320,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
   interpreter->UseNNAPI(static_cast<bool>(state));
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->SetNumThreads(static_cast<int>(num_threads));
+}
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
     JNIEnv* env, jclass clazz, jint size) {
@@ -270,6 +338,19 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
   return reinterpret_cast<jlong>(error_reporter);
 }
 
+// Verifies whether the model is a flatbuffer file.
+class JNIFlatBufferVerifier : public tflite::TfLiteVerifier {
+ public:
+  bool Verify(const char* data, int length,
+              tflite::ErrorReporter* reporter) override {
+    if (!VerifyModel(data, length)) {
+      reporter->Report("The model is not a valid Flatbuffer file");
+      return false;
+    }
+    return true;
+  }
+};
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
     JNIEnv* env, jclass clazz, jstring model_file, jlong error_handle) {
@@ -278,21 +359,15 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
   if (error_reporter == nullptr) return 0;
   const char* path = env->GetStringUTFChars(model_file, nullptr);
 
-  {
-    tflite::FileCopyAllocation allocation(path, nullptr);
-    if (!VerifyModel(allocation.base(), allocation.bytes())) {
-      throwException(env, kIllegalArgumentException,
-                     "Contents of %s is not a valid flatbuffer model", path);
-      env->ReleaseStringUTFChars(model_file, path);
-      return 0;
-    }
-  }
+  std::unique_ptr<tflite::TfLiteVerifier> verifier;
+  verifier.reset(new JNIFlatBufferVerifier());
 
-  auto model = tflite::FlatBufferModel::BuildFromFile(path, error_reporter);
+  auto model = tflite::FlatBufferModel::VerifyAndBuildFromFile(
+      path, verifier.get(), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "Contents of %s does not encode a valid TensorFlowLite "
-                   "model: %s",
+                   "Contents of %s does not encode a valid "
+                   "TensorFlowLite model: %s",
                    path, error_reporter->CachedErrorMessage());
     env->ReleaseStringUTFChars(model_file, path);
     return 0;
@@ -320,8 +395,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
       buf, static_cast<size_t>(capacity), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "MappedByteBuffer does not encode a valid TensorFlowLite "
-                   "model: %s",
+                   "MappedByteBuffer does not encode a valid "
+                   "TensorFlowLite model: %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -330,7 +405,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
 
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
-    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle) {
+    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle,
+    jint num_threads) {
   tflite::FlatBufferModel* model = convertLongToModel(env, model_handle);
   if (model == nullptr) return 0;
   BufferErrorReporter* error_reporter =
@@ -338,12 +414,21 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
   if (error_reporter == nullptr) return 0;
   auto resolver = ::tflite::CreateOpResolver();
   std::unique_ptr<tflite::Interpreter> interpreter;
-  TfLiteStatus status =
-      tflite::InterpreterBuilder(*model, *(resolver.get()))(&interpreter);
+  TfLiteStatus status = tflite::InterpreterBuilder(*model, *(resolver.get()))(
+      &interpreter, static_cast<int>(num_threads));
   if (status != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
-                   "Cannot create interpreter: %s",
+                   "Internal error: Cannot create interpreter: %s",
                    error_reporter->CachedErrorMessage());
+    return 0;
+  }
+  // allocates memory
+  status = interpreter->AllocateTensors();
+  if (status != kTfLiteOk) {
+    throwException(env, kNullPointerException,
+                   "Internal error: Cannot allocate memory for the interpreter",
+                   error_reporter->CachedErrorMessage());
+    return 0;
   }
   return reinterpret_cast<jlong>(interpreter.release());
 }
@@ -353,7 +438,7 @@ JNIEXPORT jlongArray JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
     JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
     jobjectArray sizes, jintArray data_types, jintArray nums_of_bytes,
-    jobjectArray values) {
+    jobjectArray values, jobject wrapper, jboolean memory_allocated) {
   tflite::Interpreter* interpreter =
       convertLongToInterpreter(env, interpreter_handle);
   if (interpreter == nullptr) return nullptr;
@@ -365,37 +450,55 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
   TfLiteStatus status = checkInputs(env, interpreter, input_size, data_types,
                                     nums_of_bytes, values, sizes);
   if (status != kTfLiteOk) return nullptr;
-  // resizes inputs
-  status = resizeInputs(env, interpreter, input_size, sizes);
-  if (status != kTfLiteOk) {
-    throwException(env, kNullPointerException, "Can not resize the input: %s",
-                   error_reporter->CachedErrorMessage());
-    return nullptr;
-  }
-  // allocates memory
-  status = interpreter->AllocateTensors();
-  if (status != kTfLiteOk) {
-    throwException(env, kNullPointerException,
-                   "Can not allocate memory for the given inputs: %s",
-                   error_reporter->CachedErrorMessage());
-    return nullptr;
+  if (!memory_allocated ||
+      !areInputDimensionsTheSame(env, interpreter, input_size, sizes)) {
+    // resizes inputs
+    status = resizeInputs(env, interpreter, input_size, sizes);
+    if (status != kTfLiteOk) {
+      throwException(env, kNullPointerException,
+                     "Internal error: Can not resize the input: %s",
+                     error_reporter->CachedErrorMessage());
+      return nullptr;
+    }
+    // allocates memory
+    status = interpreter->AllocateTensors();
+    if (status != kTfLiteOk) {
+      throwException(env, kNullPointerException,
+                     "Internal error: Can not allocate memory for the given "
+                     "inputs: %s",
+                     error_reporter->CachedErrorMessage());
+      return nullptr;
+    }
   }
   // sets inputs
   status = setInputs(env, interpreter, input_size, data_types, nums_of_bytes,
                      values);
   if (status != kTfLiteOk) return nullptr;
+  timespec beforeInference = ::tflite::getCurrentTime();
   // runs inference
   if (interpreter->Invoke() != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
-                   "Failed to run on the given Interpreter: %s",
+                   "Internal error: Failed to run on the given Interpreter: %s",
                    error_reporter->CachedErrorMessage());
     return nullptr;
   }
+  timespec afterInference = ::tflite::getCurrentTime();
+  jclass wrapper_clazz = env->GetObjectClass(wrapper);
+  jfieldID fid =
+      env->GetFieldID(wrapper_clazz, "inferenceDurationNanoseconds", "J");
+  if (env->ExceptionCheck()) {
+    env->ExceptionClear();
+  } else if (fid != nullptr) {
+    env->SetLongField(
+        wrapper, fid,
+        ::tflite::timespec_diff_nanoseconds(&beforeInference, &afterInference));
+  }
   // returns outputs
   const std::vector<int>& results = interpreter->outputs();
   if (results.empty()) {
-    throwException(env, kIllegalArgumentException,
-                   "The Interpreter does not have any outputs.");
+    throwException(
+        env, kIllegalArgumentException,
+        "Internal error: The Interpreter does not have any outputs.");
     return nullptr;
   }
   jlongArray outputs = env->NewLongArray(results.size());
@@ -414,53 +517,82 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
   tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
   if (interpreter == nullptr) return nullptr;
   const int idx = static_cast<int>(input_idx);
-  if (input_idx >= interpreter->inputs().size()) {
+  if (input_idx < 0 || input_idx >= interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Out of range: Failed to get %d-th input out of %d inputs",
+                   "Input error: Out of range: Failed to get %d-th input out of"
+                   " %d inputs",
                    input_idx, interpreter->inputs().size());
     return nullptr;
   }
   TfLiteTensor* target = interpreter->tensor(interpreter->inputs()[idx]);
   int size = target->dims->size;
-  int expected_num_bytes = elementByteSize(target->type);
-  for (int i = 0; i < size; ++i) {
-    expected_num_bytes *= target->dims->data[i];
-  }
-  if (num_bytes != expected_num_bytes) {
-    throwException(env, kIllegalArgumentException,
-                   "Failed to get input dimensions. %d-th input should have"
-                   " %d bytes, but found %d bytes.",
-                   idx, expected_num_bytes, num_bytes);
-    return nullptr;
+  if (num_bytes >= 0) {  // verifies num of bytes matches if num_bytes if valid.
+    int expected_num_bytes = elementByteSize(target->type);
+    for (int i = 0; i < size; ++i) {
+      expected_num_bytes *= target->dims->data[i];
+    }
+    if (num_bytes != expected_num_bytes) {
+      throwException(env, kIllegalArgumentException,
+                     "Input error: Failed to get input dimensions. %d-th input "
+                     "should have %d bytes, but found %d bytes.",
+                     idx, expected_num_bytes, num_bytes);
+      return nullptr;
+    }
   }
   jintArray outputs = env->NewIntArray(size);
   env->SetIntArrayRegion(outputs, 0, size, &(target->dims->data[0]));
   return outputs;
 }
 
-JNIEXPORT void JNICALL
+JNIEXPORT jint JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType(
+    JNIEnv* env, jclass clazz, jlong handle, jint output_idx) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return -1;
+  const int idx = static_cast<int>(output_idx);
+  if (output_idx < 0 || output_idx >= interpreter->outputs().size()) {
+    throwException(env, kIllegalArgumentException,
+                   "Failed to get %d-th output out of %d outputs", output_idx,
+                   interpreter->outputs().size());
+    return -1;
+  }
+  TfLiteTensor* target = interpreter->tensor(interpreter->outputs()[idx]);
+  int type = getDataType(target->type);
+  return static_cast<jint>(type);
+}
+
+JNIEXPORT jboolean JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
     JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
     jint input_idx, jintArray dims) {
   BufferErrorReporter* error_reporter =
       convertLongToErrorReporter(env, error_handle);
-  if (error_reporter == nullptr) return;
+  if (error_reporter == nullptr) return JNI_FALSE;
   tflite::Interpreter* interpreter =
       convertLongToInterpreter(env, interpreter_handle);
-  if (interpreter == nullptr) return;
+  if (interpreter == nullptr) return JNI_FALSE;
   const int idx = static_cast<int>(input_idx);
   if (idx < 0 || idx >= interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Can not resize %d-th input for a model having %d inputs.",
+                   "Input error: Can not resize %d-th input for a model having "
+                   "%d inputs.",
                    idx, interpreter->inputs().size());
+    return JNI_FALSE;
   }
-  TfLiteStatus status = interpreter->ResizeInputTensor(
-      interpreter->inputs()[idx], convertJIntArrayToVector(env, dims));
-  if (status != kTfLiteOk) {
-    throwException(env, kIllegalArgumentException,
-                   "Failed to resize %d-th input: %s", idx,
-                   error_reporter->CachedErrorMessage());
+  // check whether it is resizing with the same dimensions.
+  TfLiteTensor* target = interpreter->tensor(input_idx);
+  bool is_changed = areDimsDifferent(env, target, dims);
+  if (is_changed) {
+    TfLiteStatus status = interpreter->ResizeInputTensor(
+        interpreter->inputs()[idx], convertJIntArrayToVector(env, dims));
+    if (status != kTfLiteOk) {
+      throwException(env, kIllegalArgumentException,
+                     "Internal error: Failed to resize %d-th input: %s", idx,
+                     error_reporter->CachedErrorMessage());
+      return JNI_FALSE;
+    }
   }
+  return is_changed ? JNI_TRUE : JNI_FALSE;
 }
 
 JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete(
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index c52a7e4e439936344be26d5761fb5747db64794a..eaa765cb343e9764bd0ef018d636a76f4b8a13e4 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <jni.h>
 #include <stdio.h>
+#include <time.h>
 #include <vector>
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/interpreter.h"
@@ -28,6 +29,9 @@ limitations under the License.
 namespace tflite {
 // This is to be provided at link-time by a library.
 extern std::unique_ptr<OpResolver> CreateOpResolver();
+extern timespec getCurrentTime();
+extern jlong timespec_diff_nanoseconds(struct timespec* start,
+                                       struct timespec* stop);
 }  // namespace tflite
 
 #ifdef __cplusplus
@@ -57,7 +61,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JZ)
+ *  Signature: (JZ)V
  */
 JNIEXPORT void JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
@@ -65,6 +69,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
                                                            jlong handle,
                                                            jboolean state);
 
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads);
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
@@ -95,30 +109,33 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JJ)J
+ *  Signature: (JJI)J
  */
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
-    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle);
+    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle,
+    jint num_threads);
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JJ[Ljava/lang/Object;[I[I[Ljava/lang/Object;)[J
+ *  Signature:
+ * (JJ[Ljava/lang/Object;[I[I[Ljava/lang/Object;Ljava/lang/Object;Z)[J
  */
 JNIEXPORT jlongArray JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
     JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
     jobjectArray sizes, jintArray data_types, jintArray nums_of_bytes,
-    jobjectArray values);
+    jobjectArray values, jobject wrapper, jboolean memory_allocated);
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
  *  Signature: (JII)[I
  *
- * It gets input dimensions if num_bytes matches number of bytes required by
- * the input, else returns null and throws IllegalArgumentException.
+ * Gets input dimensions. If num_bytes is non-negative, it will check whether
+ * num_bytes matches num of bytes required by the input, and return null and
+ * throw IllegalArgumentException if not.
  */
 JNIEXPORT jintArray JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
@@ -127,11 +144,23 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JJI[I)
+ *  Signature: (JI)I
  *
- * It resizes dimensions of a input.
+ * Gets output dimensions.
  */
-JNIEXPORT void JNICALL
+JNIEXPORT jint JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType(
+    JNIEnv* env, jclass clazz, jlong handle, jint output_idx);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JJI[I)Z
+ *
+ * It returns true if resizing input tensor to different dimensions, else return
+ * false.
+ */
+JNIEXPORT jboolean JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
     JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
     jint input_idx, jintArray dims);
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
index 65126e78a3003f8a69c69326124d613e878c0f9d..17f4be09c63a9e80feda9d0324d49cc0418fe66a 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
@@ -23,7 +23,7 @@ namespace {
 TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to TfLiteTensor.");
+                   "Internal error: Invalid handle to TfLiteTensor.");
     return nullptr;
   }
   return reinterpret_cast<TfLiteTensor*>(handle);
@@ -36,7 +36,8 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
   size_t to_copy = num_elements * elementByteSize(type);
   if (to_copy > dst_size) {
     throwException(env, kIllegalStateException,
-                   "cannot write Java array of %d bytes to Tensor of %d bytes",
+                   "Internal error: cannot write Java array of %d bytes to "
+                   "Tensor of %d bytes",
                    to_copy, dst_size);
     return 0;
   }
@@ -71,10 +72,10 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
     }
     default: {
       throwException(env, kUnsupportedOperationException,
-                     "TensorFlowLite currently supports float (32 bits), "
-                     "int (32 bits), byte (8 bits), and long (64 bits), "
-                     "support for other types (DataType %d in this case) will "
-                     "be added in the future",
+                     "DataType error: TensorFlowLite currently supports float "
+                     "(32 bits), int (32 bits), byte (8 bits), and long "
+                     "(64 bits), support for other types (DataType %d in this "
+                     "case) will be added in the future",
                      kTfLiteFloat32, type);
       return 0;
     }
@@ -88,8 +89,9 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
   if (size > src_size) {
     throwException(
         env, kIllegalStateException,
-        "cannot fill a Java array of %d bytes with a Tensor of %d bytes", size,
-        src_size);
+        "Internal error: cannot fill a Java array of %d bytes with a Tensor of "
+        "%d bytes",
+        size, src_size);
     return 0;
   }
   switch (data_type) {
@@ -117,8 +119,8 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
       return size;
     }
     default: {
-      throwException(env, kIllegalStateException, "invalid DataType(%d)",
-                     data_type);
+      throwException(env, kIllegalStateException,
+                     "DataType error: invalid DataType(%d)", data_type);
     }
   }
   return 0;
@@ -152,19 +154,22 @@ size_t elementByteSize(TfLiteType data_type) {
   switch (data_type) {
     case kTfLiteFloat32:
       static_assert(sizeof(jfloat) == 4,
-                    "Java float not compatible with kTfLiteFloat");
+                    "Interal error: Java float not compatible with "
+                    "kTfLiteFloat");
       return 4;
     case kTfLiteInt32:
       static_assert(sizeof(jint) == 4,
-                    "Java int not compatible with kTfLiteInt");
+                    "Interal error: Java int not compatible with kTfLiteInt");
       return 4;
     case kTfLiteUInt8:
       static_assert(sizeof(jbyte) == 1,
-                    "Java byte not compatible with kTfLiteUInt8");
+                    "Interal error: Java byte not compatible with "
+                    "kTfLiteUInt8");
       return 1;
     case kTfLiteInt64:
       static_assert(sizeof(jlong) == 8,
-                    "Java long not compatible with kTfLiteInt64");
+                    "Interal error: Java long not compatible with "
+                    "kTfLiteInt64");
       return 8;
     default:
       return 0;
@@ -212,7 +217,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
   int num_dims = tensor->dims->size;
   if (num_dims == 0) {
     throwException(env, kIllegalArgumentException,
-                   "copyTo() is not meant for scalar Tensors.");
+                   "Internal error: Cannot copy empty/scalar Tensors.");
     return;
   }
   readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes,
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index 424b3de6c97672e310c54230a7ac1204f46d9ac8..61d6c35ec86beebf78dd81e17e145863516802fa 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -218,4 +218,52 @@ public final class InterpreterTest {
     int index = interpreter.getOutputIndex("MobilenetV1/Predictions/Softmax");
     assertThat(index).isEqualTo(0);
   }
+
+  @Test
+  public void testTurnOffNNAPI() throws Exception {
+    Path path = MODEL_FILE.toPath();
+    FileChannel fileChannel =
+        (FileChannel) Files.newByteChannel(path, EnumSet.of(StandardOpenOption.READ));
+    MappedByteBuffer mappedByteBuffer =
+        fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size());
+    Interpreter interpreter = new Interpreter(mappedByteBuffer);
+    interpreter.setUseNNAPI(true);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.setUseNNAPI(false);
+    interpreter.run(fourD, parsedOutputs);
+    outputOneD = parsedOutputs[0][0][0];
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.close();
+    fileChannel.close();
+  }
+
+  @Test
+  public void testTurnOnNNAPI() throws Exception {
+    Path path = MODEL_FILE.toPath();
+    FileChannel fileChannel =
+        (FileChannel) Files.newByteChannel(path, EnumSet.of(StandardOpenOption.READ));
+    MappedByteBuffer mappedByteBuffer =
+        fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size());
+    Interpreter interpreter = new Interpreter(mappedByteBuffer);
+    interpreter.setUseNNAPI(true);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.close();
+    fileChannel.close();
+  }
 }
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index 90323555d88419d837a76bca7de6d9998e388fca..7c00d3196fd001a288d77d4e01f0b30978d72afe 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -47,6 +47,9 @@ public final class NativeInterpreterWrapperTest {
   private static final String MODEL_WITH_CUSTOM_OP_PATH =
       "tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite";
 
+  private static final String NONEXISTING_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/nonexisting_model.bin";
+
   @Test
   public void testConstructor() {
     NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
@@ -60,7 +63,18 @@ public final class NativeInterpreterWrapperTest {
       NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INVALID_MODEL_PATH);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e).hasMessageThat().contains("is not a valid flatbuffer model");
+      assertThat(e).hasMessageThat().contains("The model is not a valid Flatbuffer file");
+    }
+  }
+
+  @Test
+  public void testConstructorWithNonexistingModel() {
+    try {
+      NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(NONEXISTING_MODEL_PATH);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("The model is not a valid Flatbuffer file");
+      assertThat(e).hasMessageThat().contains("Could not open");
     }
   }
 
@@ -94,6 +108,30 @@ public final class NativeInterpreterWrapperTest {
     wrapper.close();
   }
 
+  @Test
+  public void testRunWithInputsOfSameDims() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, -6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    outputs[0].copyTo(parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, -19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    parsedOutputs = new float[2][8][8][3];
+    outputs[0].copyTo(parsedOutputs);
+    outputOneD = parsedOutputs[0][0][0];
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    wrapper.close();
+  }
+
   @Test
   public void testRunWithInt() {
     NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INT_MODEL_PATH);
@@ -283,9 +321,7 @@ public final class NativeInterpreterWrapperTest {
       wrapper.run(inputs);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e)
-          .hasMessageThat()
-          .contains("Invalid inputs. Inputs should not be null or empty.");
+      assertThat(e).hasMessageThat().contains("Inputs should not be null or empty.");
     }
     wrapper.close();
   }
@@ -402,7 +438,7 @@ public final class NativeInterpreterWrapperTest {
       NativeInterpreterWrapper.numDimensions(emptyArray);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e).hasMessageThat().contains("array lengths cannot be 0.");
+      assertThat(e).hasMessageThat().contains("Array lengths cannot be 0.");
     }
   }
 
@@ -417,4 +453,87 @@ public final class NativeInterpreterWrapperTest {
     assertThat(shape[1]).isEqualTo(3);
     assertThat(shape[2]).isEqualTo(1);
   }
+
+  @Test
+  public void testGetInferenceLatency() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    assertThat(wrapper.getLastNativeInferenceDurationNanoseconds()).isGreaterThan(0L);
+    wrapper.close();
+  }
+
+  @Test
+  public void testGetInferenceLatencyWithNewWrapper() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    assertThat(wrapper.getLastNativeInferenceDurationNanoseconds()).isNull();
+    wrapper.close();
+  }
+
+  @Test
+  public void testGetLatencyAfterFailedInference() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains("0-th input dimension should be [?,8,8,3], but found [?,8,7,3]");
+    }
+    assertThat(wrapper.getLastNativeInferenceDurationNanoseconds()).isNull();
+    wrapper.close();
+  }
+
+  @Test
+  public void testGetInputDims() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    int[] expectedDims = {1, 8, 8, 3};
+    assertThat(wrapper.getInputDims(0)).isEqualTo(expectedDims);
+    wrapper.close();
+  }
+
+  @Test
+  public void testGetInputDimsOutOfRange() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    try {
+      wrapper.getInputDims(-1);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("Out of range");
+    }
+    try {
+      wrapper.getInputDims(1);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("Out of range");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testGetOutputDataType() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    assertThat(wrapper.getOutputDataType(0)).contains("float");
+    wrapper.close();
+    wrapper = new NativeInterpreterWrapper(LONG_MODEL_PATH);
+    assertThat(wrapper.getOutputDataType(0)).contains("long");
+    wrapper.close();
+    wrapper = new NativeInterpreterWrapper(INT_MODEL_PATH);
+    assertThat(wrapper.getOutputDataType(0)).contains("int");
+    wrapper.close();
+    wrapper = new NativeInterpreterWrapper(BYTE_MODEL_PATH);
+    assertThat(wrapper.getOutputDataType(0)).contains("byte");
+    wrapper.close();
+  }
 }
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
index 2b4f37bc6cfe1dbc0c178a56b892f545e8ad4f3b..b524246d436858bbf506809a38cead2897f78d93 100644
--- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
@@ -16,15 +16,3 @@ android_library(
         "//tensorflow/contrib/lite/java:tensorflowlite_java",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
index 8660cabf709e6531a5667a16e5cf43a93c7135bd..3aef0c3bb6cc4748de0e55d31f0215a77320ae69 100644
--- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
@@ -32,4 +32,55 @@ public class TestHelper {
       throw new IllegalArgumentException("Interpreter has not initialized; Failed to setUseNNAPI.");
     }
   }
+
+  /**
+   * Gets the last inference duration in nanoseconds. It returns null if there is no previous
+   * inference run or the last inference run failed.
+   *
+   * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code
+   *     IllegalArgumentException} will be thrown.
+   */
+  public static Long getLastNativeInferenceDurationNanoseconds(Interpreter interpreter) {
+    if (interpreter != null && interpreter.wrapper != null) {
+      return interpreter.wrapper.getLastNativeInferenceDurationNanoseconds();
+    } else {
+      throw new IllegalArgumentException("Interpreter has not initialized; Failed to get latency.");
+    }
+  }
+
+  /**
+   * Gets the dimensions of an input.
+   *
+   * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code
+   *     IllegalArgumentException} will be thrown.
+   * @param index an integer index of the input. If it is invalid, an {@code
+   *     IllegalArgumentException} will be thrown.
+   */
+  public static int[] getInputDims(Interpreter interpreter, int index) {
+    if (interpreter != null && interpreter.wrapper != null) {
+      return interpreter.wrapper.getInputDims(index);
+    } else {
+      throw new IllegalArgumentException(
+          "Interpreter has not initialized;" + " Failed to get input dimensions.");
+    }
+  }
+
+  /**
+   * Gets the string name of the data type of an output.
+   *
+   * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code
+   *     IllegalArgumentException} will be thrown.
+   * @param index an integer index of the output. If it is invalid, an {@code
+   *     IllegalArgumentException} will be thrown.
+   * @return string name of the data type. Possible values include "float", "int", "byte", and
+   *     "long".
+   */
+  public static String getOutputDataType(Interpreter interpreter, int index) {
+    if (interpreter != null && interpreter.wrapper != null) {
+      return interpreter.wrapper.getOutputDataType(index);
+    } else {
+      throw new IllegalArgumentException(
+          "Interpreter has not initialized;" + " Failed to get output data type.");
+    }
+  }
 }
diff --git a/tensorflow/contrib/lite/kernels/Android.bp b/tensorflow/contrib/lite/kernels/Android.bp
index f2a51c0270862833c1f66894370e311e9c82f094..b923840830919f2b6c422fa2be66005d7d118100 100644
--- a/tensorflow/contrib/lite/kernels/Android.bp
+++ b/tensorflow/contrib/lite/kernels/Android.bp
@@ -37,16 +37,22 @@ cc_library_static {
     srcs: [
         "activations.cc",
         "add.cc",
+        "arg_max.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
 	"bidirectional_sequence_rnn.cc",
+        "bidirectional_sequence_lstm.cc",
+        "cast.cc",
+        "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
+        "dequantize.cc",
         "div.cc",
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
-	"exp.cc",
+        "exp.cc",
+        "floor.cc",
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
@@ -55,6 +61,7 @@ cc_library_static {
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
+        "maximum_minimum.cc",
         "mean.cc",
         "mul.cc",
         "pad.cc",
@@ -93,6 +100,7 @@ cc_library_static {
         "-Wno-invalid-partial-specialization",
         "-Wno-missing-field-initializers",
         "-Wno-sign-compare",
+        "-Wno-unused-local-typedef",
         "-Wno-unused-variable",
     ],
 }
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index b59dc5ffb339caade28626d1954d41bc821fae41..689f9bfa7151eb5a8e3d92719bfe897fd9fb9023 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -5,15 +5,14 @@ package(default_visibility = [
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
+load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 tf_cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -33,11 +32,27 @@ cc_library(
         "//tensorflow/contrib/lite:schema_fbs_version",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:tflite_portable_logging",
         "@com_google_googletest//:gtest",
     ],
 )
 
+cc_library(
+    name = "eigen_support",
+    srcs = [
+        "eigen_support.cc",
+    ],
+    hdrs = [
+        "eigen_support.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":op_macros",
+        "//tensorflow/contrib/lite:context",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "gemm_support",
     srcs = [
@@ -90,6 +105,7 @@ tf_cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":kernel_util",
         "//tensorflow/contrib/lite/testing:util",
@@ -97,21 +113,39 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "test_util_test",
+    size = "small",
+    srcs = ["test_util_test.cc"],
+    deps = [
+        ":test_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
         "activations.cc",
         "add.cc",
+        "arg_max.cc",
+        "audio_spectrogram.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
+        "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
+        "cast.cc",
+        "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
+        "dequantize.cc",
         "div.cc",
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
         "exp.cc",
+        "floor.cc",
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
@@ -119,7 +153,9 @@ cc_library(
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
+        "maximum_minimum.cc",
         "mean.cc",
+        "mfcc.cc",
         "mul.cc",
         "pad.cc",
         "pooling.cc",
@@ -153,21 +189,51 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":eigen_support",
         ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:gemm_support",
+        "//tensorflow/contrib/lite/kernels/internal:audio_utils",
         "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
         "//tensorflow/contrib/lite/kernels/internal:optimized",
         "//tensorflow/contrib/lite/kernels/internal:optimized_base",
         "//tensorflow/contrib/lite/kernels/internal:quantization_util",
         "//tensorflow/contrib/lite/kernels/internal:reference",
         "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "//tensorflow/contrib/lite/kernels/internal:round",
         "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
         "@farmhash_archive//:farmhash",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "audio_spectrogram_test",
+    size = "small",
+    srcs = ["audio_spectrogram_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "mfcc_test",
+    size = "small",
+    srcs = ["mfcc_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
     ],
 )
 
@@ -175,6 +241,7 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -187,6 +254,48 @@ tf_cc_test(
     name = "add_test",
     size = "small",
     srcs = ["add_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "arg_max_test",
+    size = "small",
+    srcs = ["arg_max_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "div_test",
+    size = "small",
+    srcs = ["div_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "sub_test",
+    size = "small",
+    srcs = ["sub_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -199,6 +308,7 @@ tf_cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -213,6 +323,7 @@ tf_cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -225,6 +336,20 @@ tf_cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "cast_test",
+    size = "small",
+    srcs = ["cast_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -237,6 +362,7 @@ tf_cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -249,6 +375,7 @@ tf_cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -262,6 +389,7 @@ tf_cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -270,10 +398,51 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "dequantize_test",
+    size = "small",
+    srcs = ["dequantize_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "bidirectional_sequence_lstm_test",
+    size = "small",
+    srcs = ["bidirectional_sequence_lstm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "floor_test",
+    size = "small",
+    srcs = ["floor_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -286,6 +455,7 @@ tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -298,6 +468,9 @@ tf_cc_test(
     name = "bidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -310,6 +483,7 @@ tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -322,6 +496,7 @@ tf_cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -334,6 +509,20 @@ tf_cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "maximum_minimum_test",
+    size = "small",
+    srcs = ["maximum_minimum_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -346,6 +535,7 @@ tf_cc_test(
     name = "mean_test",
     size = "small",
     srcs = ["mean_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -358,6 +548,7 @@ tf_cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -370,6 +561,7 @@ tf_cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -382,6 +574,7 @@ tf_cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -394,6 +587,7 @@ tf_cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -407,6 +601,7 @@ tf_cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -420,6 +615,7 @@ tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -432,6 +628,7 @@ tf_cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -444,6 +641,7 @@ tf_cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -456,6 +654,7 @@ tf_cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -468,10 +667,12 @@ tf_cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -480,6 +681,7 @@ tf_cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -492,6 +694,7 @@ tf_cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -504,6 +707,21 @@ tf_cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "//tensorflow/contrib/lite/kernels/internal:reference_base",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "log_softmax_test",
+    size = "small",
+    srcs = ["log_softmax_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -517,6 +735,7 @@ tf_cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -529,6 +748,7 @@ tf_cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -542,6 +762,7 @@ tf_cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -554,6 +775,7 @@ tf_cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -567,6 +789,7 @@ tf_cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -579,6 +802,7 @@ tf_cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -591,6 +815,7 @@ tf_cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -603,6 +828,24 @@ tf_cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "comparisons_test",
+    size = "small",
+    srcs = [
+        "comparisons_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -622,3 +865,5 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 3c5c77815d0f2592ab549152b4d77f45b967a660..39a54c93962b33f3a787b3387d9a133119d0e80a 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -63,6 +63,33 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
                                TfLiteIntArrayCopy(input->dims));
 }
 
+TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  if (input->type == kTfLiteUInt8) {
+    static constexpr int kInputIntegerBits = 4;
+
+    const double input_real_multiplier =
+        input->params.scale *
+        static_cast<double>(1 << (31 - kInputIntegerBits));
+
+    QuantizeMultiplierGreaterThanOne(input_real_multiplier,
+                                     &data->input_multiplier,
+                                     &data->input_left_shift);
+    data->input_range_radius =
+        CalculateInputRadius(kInputIntegerBits, data->input_left_shift);
+  }
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
@@ -123,6 +150,34 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
                                TfLiteIntArrayCopy(input->dims));
 }
 
+TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* alpha = GetInput(context, node, 1);
+
+  output->type = input->type;
+
+  // Currently only Float32 is supported
+  // TODO(ycling): Support other data types.
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32);
+
+  // Currently, only support 4D `input` and 3D `alpha` with shape
+  // (1, 1, channels).
+  // TODO(impjdi): Support other cases where `alpha` is broadcastable
+  // to `input`.
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], input->dims->data[3]);
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -180,6 +235,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
   TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
@@ -191,6 +247,14 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::tanh(*in);
       return kTfLiteOk;
     } break;
+    case kTfLiteUInt8: {
+      optimized_ops::Tanh(GetTensorData<uint8_t>(input), GetTensorDims(input),
+                          input->params.zero_point, data->input_range_radius,
+                          data->input_multiplier, data->input_left_shift,
+                          GetTensorData<uint8_t>(output),
+                          GetTensorDims(output));
+      return kTfLiteOk;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -337,6 +401,50 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32:
+      optimized_ops::LogSoftmax(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(output), GetTensorDims(output));
+      return kTfLiteOk;
+    default:
+      context->ReportError(context, "Only float32 supported currently.");
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* alpha = GetInput(context, node, 1);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+
+  if (input->type != kTfLiteFloat32) {
+    context->ReportError(context, "Only float32 supported currently.");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+  const int batches = input->dims->data[0];
+  const int height = input->dims->data[1];
+  const int width = input->dims->data[2];
+  const int channels = input->dims->data[3];
+
+  TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], channels);
+
+  const int n = batches * height * width * channels;
+  for (int i = 0; i < n; ++i) {
+    const float x = input->data.f[i];
+    output->data.f[i] = x >= 0.0f ? x : alpha->data.f[i % channels] * x;
+  }
+
+  return kTfLiteOk;
+}
+
 }  // namespace activations
 
 TfLiteRegistration* Register_RELU() {
@@ -361,8 +469,8 @@ TfLiteRegistration* Register_RELU6() {
 }
 
 TfLiteRegistration* Register_TANH() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 activations::GenericPrepare,
+  static TfLiteRegistration r = {activations::Init, activations::Free,
+                                 activations::TanhPrepare,
                                  activations::TanhEval};
   return &r;
 }
@@ -381,6 +489,20 @@ TfLiteRegistration* Register_SOFTMAX() {
   return &r;
 }
 
+TfLiteRegistration* Register_LOG_SOFTMAX() {
+  static TfLiteRegistration r = {activations::Init, activations::Free,
+                                 activations::GenericPrepare,
+                                 activations::LogSoftmaxEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_PRELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::PreluPrepare,
+                                 activations::PreluEval};
+  return &r;
+}
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index 68d49944e51b043b6b82aa1589d22f6ebed37574..50a84edd475c8051a563cf8ed9fc03099829b786 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -52,6 +52,14 @@ class BaseActivationsOpModel : public SingleOpModel {
     BuildInterpreter({GetShape(input_)});
   }
 
+  BaseActivationsOpModel(BuiltinOperator type, const TensorData &input,
+                         const TensorData &output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_)});
+  }
+
  protected:
   int input_;
   int output_;
@@ -143,6 +151,27 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
+TEST(QuantizedActivationsOpTest, Tanh) {
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_TANH,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -8, 8},
+      /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, -1, 1});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      -4, -2, 8, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0, -0.999987, 0.964027, 0.999329,     //
+                      -0.996078, -0.96402, 0.99999, 0.76159,  //
+                  },
+                  4 * (1. / 256))));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({128, 0, 251, 255, 0, 5, 255, 226}));
+}
+
 TEST(FloatActivationsOpTest, Sigmoid) {
   FloatActivationsOpModel m(BuiltinOperator_LOGISTIC,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
@@ -313,6 +342,90 @@ TEST(QuantizedActivationsOpTest, Softmax2D) {
                                              kQuantizedTolerance)));
 }
 
+// This contains the same test values as the Softmax test, but reference answer
+// generated via the following snippet of python:
+//   logits1 = tf.constant([[0, -6, 2, 4],[3, -2, 10, 1]], dtype=tf.float32)
+//   logits2 = tf.constant([[0,-6],[2,4],[3,-2],[10,1]], dtype=tf.float32)
+//   lsm1 = tf.nn.log_softmax(logits1)
+//   lsm2 = tf.nn.log_softmax(logits2)
+//   with tf.Session() as sess:
+//     print('lsm1', sess.run(lsm1))
+//     print('lsm2', sess.run(lsm2))
+
+TEST(FloatActivationsOpTest, LogSoftmax) {
+  FloatActivationsOpModel m(BuiltinOperator_LOG_SOFTMAX,
+                            /*input=*/{TensorType_FLOAT32, {2, 4}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 -4.14297, -10.14297, -2.14297, -.142971,    //
+                                 -7.00104, -12.00104, -.00104087, -9.00104,  //
+                             })));
+
+  // Same input, but a different shape.
+  FloatActivationsOpModel m2(BuiltinOperator_LOG_SOFTMAX,
+                             /*input=*/{TensorType_FLOAT32, {4, 2}});
+  m2.SetInput({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                  -.00247565, -6.00247,   //
+                                  -2.12692, -.126928,     //
+                                  -.00671534, -5.00671,   //
+                                  -.000123374, -9.00012,  //
+                              })));
+}
+
+class PReluOpModel : public SingleOpModel {
+ public:
+  PReluOpModel(const TensorData& input, const TensorData& alpha) {
+    input_ = AddInput(input);
+    alpha_ = AddInput(alpha);
+    output_ = AddOutput(input);
+    SetBuiltinOp(BuiltinOperator_PRELU, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_), GetShape(alpha_)});
+  }
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetAlpha(std::initializer_list<float> data) {
+    PopulateTensor(alpha_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int alpha_;
+  int output_;
+};
+
+TEST(FloatActivationsOpTest, PRelu) {
+  PReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
+                 {TensorType_FLOAT32, {1, 1, 3}});
+
+  m.SetInput({
+      0.0f, 0.0f, 0.0f,     // Row 1, Column 1
+      1.0f, 1.0f, 1.0f,     // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,  // Row 2, Column 1
+      -2.0f, -2.0f, -2.0f,  // Row 1, Column 2
+  });
+  m.SetAlpha({0.0f, 1.0f, 2.0f});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0f, 0.0f, 0.0f,    // Row 1, Column 1
+                                 1.0f, 1.0f, 1.0f,    // Row 1, Column 2
+                                 0.0f, -1.0f, -2.0f,  // Row 2, Column 1
+                                 0.0f, -2.0f, -4.0f,  // Row 1, Column 2
+                             }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 63ea89df56bafa995950afec3a58267681af304f..e0aa070e2d02cecb9c6ff500ab32b8ad2159db6e 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -176,7 +176,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   output);
   } else {
     context->ReportError(context,
-                         "Inputs and outputs not all float|unit8 types.");
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/arg_max.cc b/tensorflow/contrib/lite/kernels/arg_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2c5e4ceadbc905d22eb02b450c88745a351f58f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max.cc
@@ -0,0 +1,178 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace arg_max {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxis = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  // Make sure the axis is only 1 dimension.
+  TF_LITE_ENSURE_EQ(context, NumElements(axis), 1);
+
+  // Make sure the axis is only either int32 or int64.
+  TF_LITE_ENSURE(context,
+                 axis->type == kTfLiteInt32 || axis->type == kTfLiteInt64);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  auto* params = reinterpret_cast<TfLiteArgMaxParams*>(node->builtin_data);
+  switch (params->output_type) {
+    case kTfLiteInt32:
+      output->type = kTfLiteInt32;
+      break;
+    case kTfLiteInt64:
+      output->type = kTfLiteInt64;
+      break;
+    default:
+      context->ReportError(context, "Unknown index output data type");
+      return kTfLiteError;
+  }
+
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt32:
+      break;
+
+    default:
+      context->ReportError(context, "Only float32 and int types are supported");
+      return kTfLiteError;
+  }
+
+  // Copy the input dimensions to output except make the last dimension 1.
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  output_size->data[NumDimensions(input) - 1] = 1;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// The current impl actually ignores the axis argument.
+// Only determine the index of the maximum value in the last dimension.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                     \
+  TF_LITE_ENSURE_EQ(context, GetTensorData<axis_type>(axis)[0], 3);            \
+  optimized_ops::ArgMax(GetTensorData<axis_type>(axis),                        \
+                        GetTensorData<data_type>(input), GetTensorDims(input), \
+                        GetTensorData<output_type>(output),                    \
+                        GetTensorDims(output))
+  if (axis->type == kTfLiteInt32) {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  } else {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_ARG_MAX
+
+  return kTfLiteOk;
+}
+
+}  // namespace arg_max
+
+TfLiteRegistration* Register_ARG_MAX() {
+  static TfLiteRegistration r = {nullptr, nullptr, arg_max::Prepare,
+                                 arg_max::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/arg_max_test.cc b/tensorflow/contrib/lite/kernels/arg_max_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b15fe19ab87027c28bde9eaff7d88d03b2c213
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max_test.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ArgMaxOpModel : public SingleOpModel {
+ public:
+  ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+                TensorType output_type, TensorType index_output_type) {
+    input_ = AddInput(input_type);
+    axis_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions,
+                 CreateArgMaxOptions(builder_, index_output_type).Union());
+    BuildInterpreter({input_shape, {1, 1, 1, 1}});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, GetMaxArgFloat) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32,
+                               TensorType_INT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
+  ArgMaxOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgOutput64) {
+  ArgMaxOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64,
+                               TensorType_INT64);
+  model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..602f3888c10b3790dc0328c817bdd83276544b25
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
@@ -0,0 +1,165 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/spectrogram.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+#include "flatbuffers/flexbuffers.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace audio_spectrogram {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+enum KernelType {
+  kReference,
+};
+
+typedef struct {
+  int window_size;
+  int stride;
+  bool magnitude_squared;
+  int output_height;
+  internal::Spectrogram* spectrogram;
+} TfLiteAudioSpectrogramParams;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new TfLiteAudioSpectrogramParams;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  data->window_size = m["window_size"].AsInt64();
+  data->stride = m["stride"].AsInt64();
+  data->magnitude_squared = m["magnitude_squared"].AsBool();
+
+  data->spectrogram = new internal::Spectrogram;
+
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  auto* params = reinterpret_cast<TfLiteAudioSpectrogramParams*>(buffer);
+  delete params->spectrogram;
+  delete params;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteAudioSpectrogramParams*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
+
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  TF_LITE_ENSURE(context, params->spectrogram->Initialize(params->window_size,
+                                                          params->stride));
+  const int64_t sample_count = input->dims->data[0];
+  const int64_t length_minus_window = (sample_count - params->window_size);
+  if (length_minus_window < 0) {
+    params->output_height = 0;
+  } else {
+    params->output_height = 1 + (length_minus_window / params->stride);
+  }
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = input->dims->data[1];
+  output_size->data[1] = params->output_height;
+  output_size->data[2] = params->spectrogram->output_frequency_channels();
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteAudioSpectrogramParams*>(node->user_data);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE(context, params->spectrogram->Initialize(params->window_size,
+                                                          params->stride));
+
+  const float* input_data = GetTensorData<float>(input);
+
+  const int64_t sample_count = input->dims->data[0];
+  const int64_t channel_count = input->dims->data[1];
+
+  const int64_t output_width = params->spectrogram->output_frequency_channels();
+
+  float* output_flat = GetTensorData<float>(output);
+
+  std::vector<float> input_for_channel(sample_count);
+  for (int64_t channel = 0; channel < channel_count; ++channel) {
+    float* output_slice =
+        output_flat + (channel * params->output_height * output_width);
+    for (int i = 0; i < sample_count; ++i) {
+      input_for_channel[i] = input_data[i * channel_count + channel];
+    }
+    std::vector<std::vector<float>> spectrogram_output;
+    TF_LITE_ENSURE(context,
+                   params->spectrogram->ComputeSquaredMagnitudeSpectrogram(
+                       input_for_channel, &spectrogram_output));
+    TF_LITE_ENSURE_EQ(context, spectrogram_output.size(),
+                      params->output_height);
+    TF_LITE_ENSURE(context, spectrogram_output.empty() ||
+                                (spectrogram_output[0].size() == output_width));
+    for (int row_index = 0; row_index < params->output_height; ++row_index) {
+      const std::vector<float>& spectrogram_row = spectrogram_output[row_index];
+      TF_LITE_ENSURE_EQ(context, spectrogram_row.size(), output_width);
+      float* output_row = output_slice + (row_index * output_width);
+      if (params->magnitude_squared) {
+        for (int i = 0; i < output_width; ++i) {
+          output_row[i] = spectrogram_row[i];
+        }
+      } else {
+        for (int i = 0; i < output_width; ++i) {
+          output_row[i] = sqrtf(spectrogram_row[i]);
+        }
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace audio_spectrogram
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM() {
+  static TfLiteRegistration r = {
+      audio_spectrogram::Init, audio_spectrogram::Free,
+      audio_spectrogram::Prepare,
+      audio_spectrogram::Eval<audio_spectrogram::kReference>};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d460fdfc610ef9a867acd492ca0558fb6eab8c3
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class BaseAudioSpectrogramOpModel : public SingleOpModel {
+ public:
+  BaseAudioSpectrogramOpModel(const TensorData& input1,
+                              const TensorData& output, int window_size,
+                              int stride, bool magnitude_squared) {
+    input1_ = AddInput(input1);
+    output_ = AddOutput(output);
+
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("window_size", window_size);
+      fbb.Int("stride", stride);
+      fbb.Bool("magnitude_squared", magnitude_squared);
+    });
+    fbb.Finish();
+    SetCustomOp("AudioSpectrogram", fbb.GetBuffer(),
+                Register_AUDIO_SPECTROGRAM);
+    BuildInterpreter({GetShape(input1_)});
+  }
+
+  int input1() { return input1_; }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int output_;
+};
+
+TEST(BaseAudioSpectrogramOpModel, NonSquaredTest) {
+  BaseAudioSpectrogramOpModel m({TensorType_FLOAT32, {8, 1}},
+                                {TensorType_FLOAT32, {}}, 8, 1, false);
+  m.PopulateTensor<float>(m.input1(),
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_EQ(3, output_shape.size());
+  EXPECT_THAT(output_shape, ElementsAre(1, 1, 5));
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {0.0f, 1.0f, 2.0f, 1.0f, 0.0f}, 1e-3)));
+}
+
+TEST(SpectrogramOpTest, SquaredTest) {
+  BaseAudioSpectrogramOpModel m({TensorType_FLOAT32, {8, 1}},
+                                {TensorType_FLOAT32, {}}, 8, 1, true);
+  m.PopulateTensor<float>(m.input1(),
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_EQ(3, output_shape.size());
+  EXPECT_THAT(output_shape, ElementsAre(1, 1, 5));
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {0.f, 1.f, 4.f, 1.f, 0.f}, 1e-3)));
+}
+
+TEST(SpectrogramOpTest, StrideTest) {
+  BaseAudioSpectrogramOpModel m({TensorType_FLOAT32, {10, 1}},
+                                {TensorType_FLOAT32, {}}, 8, 2, true);
+  m.PopulateTensor<float>(m.input1(), {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f,
+                                       1.0f, 0.0f, 1.0f, 0.0f});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_THAT(output_shape, ElementsAre(1, 2, 5));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {0, 1, 4, 1, 0, 1, 2, 1, 2, 1}, 1e-3)));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index bc438f99c6a72fdbc2794dee03524db6a7523834..90edf4f9e3683f2987dd8299a1cd5233caa24479 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -123,6 +123,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                        GetTensorDims(op_context.input),                \
                        GetTensorData<int32_t>(op_context.block_shape), \
                        GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<int32_t>(op_context.crops),       \
+                       GetTensorDims(op_context.crops),                \
                        GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a64ac42bc43336db928d2682e290f5263f3db0f4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
@@ -0,0 +1,702 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace bidirectional_sequence_lstm {
+
+// Input Tensors of size {max_time, n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Forward LSTM cell tensors.
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kFwInputToInputWeightsTensor = 1;  // Optional
+constexpr int kFwInputToForgetWeightsTensor = 2;
+constexpr int kFwInputToCellWeightsTensor = 3;
+constexpr int kFwInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kFwRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kFwRecurrentToForgetWeightsTensor = 6;
+constexpr int kFwRecurrentToCellWeightsTensor = 7;
+constexpr int kFwRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kFwCellToInputWeightsTensor = 9;    // Optional
+constexpr int kFwCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kFwCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kFwInputGateBiasTensor = 12;  // Optional
+constexpr int kFwForgetGateBiasTensor = 13;
+constexpr int kFwCellGateBiasTensor = 14;
+constexpr int kFwOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kFwProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kFwProjectionBiasTensor = 17;  // Optional
+
+// Backward LSTM cell tensors.
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kBwInputToInputWeightsTensor = 18;  // Optional
+constexpr int kBwInputToForgetWeightsTensor = 19;
+constexpr int kBwInputToCellWeightsTensor = 20;
+constexpr int kBwInputToOutputWeightsTensor = 21;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kBwRecurrentToInputWeightsTensor = 22;  // Optional
+constexpr int kBwRecurrentToForgetWeightsTensor = 23;
+constexpr int kBwRecurrentToCellWeightsTensor = 24;
+constexpr int kBwRecurrentToOutputWeightsTensor = 25;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kBwCellToInputWeightsTensor = 26;   // Optional
+constexpr int kBwCellToForgetWeightsTensor = 27;  // Optional
+constexpr int kBwCellToOutputWeightsTensor = 28;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kBwInputGateBiasTensor = 29;  // Optional
+constexpr int kBwForgetGateBiasTensor = 30;
+constexpr int kBwCellGateBiasTensor = 31;
+constexpr int kBwOutputGateBiasTensor = 32;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kBwProjectionWeightsTensor = 33;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kBwProjectionBiasTensor = 34;  // Optional
+
+// Output tensors.
+constexpr int kFwScratchBufferTensor = 0;
+constexpr int kFwOutputStateTensor = 1;
+constexpr int kFwCellStateTensor = 2;
+constexpr int kFwOutputTensor = 3;
+
+constexpr int kBwScratchBufferTensor = 4;
+constexpr int kBwOutputStateTensor = 5;
+constexpr int kBwCellStateTensor = 6;
+constexpr int kBwOutputTensor = 7;
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckLstmTensorDimensions(
+    TfLiteContext* context, TfLiteNode* node, int n_input, int n_output,
+    int n_cell, int input_to_input_weights_tensor,
+    int input_to_forget_weights_tensor, int input_to_cell_weights_tensor,
+    int input_to_output_weights_tensor, int recurrent_to_input_weights_tensor,
+    int recurrent_to_forget_weights_tensor,
+    int recurrent_to_cell_weights_tensor,
+    int recurrent_to_output_weights_tensor, int cell_to_input_weights_tensor,
+    int cell_to_forget_weights_tensor, int cell_to_output_weights_tensor,
+    int input_gate_bias_tensor, int forget_gate_bias_tensor,
+    int cell_gate_bias_tensor, int output_gate_bias_tensor,
+    int projection_weights_tensor, int projection_bias_tensor) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, input_to_input_weights_tensor);
+  if (input_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+  }
+
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, input_to_forget_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, input_to_cell_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, recurrent_to_input_weights_tensor);
+  if (recurrent_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+  }
+
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, recurrent_to_forget_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, recurrent_to_cell_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, cell_to_input_weights_tensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, cell_to_forget_weights_tensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, cell_to_output_weights_tensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, input_gate_bias_tensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, forget_gate_bias_tensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* cell_bias = GetInput(context, node, cell_gate_bias_tensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, output_gate_bias_tensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, projection_weights_tensor);
+  if (projection_weights) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+  }
+
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, projection_bias_tensor);
+  if (projection_bias) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projecton_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  CheckLstmTensorDimensions(
+      context, node, n_input, n_output, n_cell, kFwInputToInputWeightsTensor,
+      kFwInputToForgetWeightsTensor, kFwInputToCellWeightsTensor,
+      kFwInputToOutputWeightsTensor, kFwRecurrentToInputWeightsTensor,
+      kFwRecurrentToForgetWeightsTensor, kFwRecurrentToCellWeightsTensor,
+      kFwRecurrentToOutputWeightsTensor, kFwCellToInputWeightsTensor,
+      kFwCellToForgetWeightsTensor, kFwCellToOutputWeightsTensor,
+      kFwInputGateBiasTensor, kFwForgetGateBiasTensor, kFwCellGateBiasTensor,
+      kFwOutputGateBiasTensor, kFwProjectionWeightsTensor,
+      kFwProjectionBiasTensor);
+
+  CheckLstmTensorDimensions(
+      context, node, n_input, n_output, n_cell, kBwInputToInputWeightsTensor,
+      kBwInputToForgetWeightsTensor, kBwInputToCellWeightsTensor,
+      kBwInputToOutputWeightsTensor, kBwRecurrentToInputWeightsTensor,
+      kBwRecurrentToForgetWeightsTensor, kBwRecurrentToCellWeightsTensor,
+      kBwRecurrentToOutputWeightsTensor, kBwCellToInputWeightsTensor,
+      kBwCellToForgetWeightsTensor, kBwCellToOutputWeightsTensor,
+      kBwInputGateBiasTensor, kBwForgetGateBiasTensor, kBwCellGateBiasTensor,
+      kBwOutputGateBiasTensor, kBwProjectionWeightsTensor,
+      kBwProjectionBiasTensor);
+
+  // Check if Forward and Backward tensors match along required dimensions.
+  return kTfLiteOk;
+}
+
+// Resize the output, state and scratch tensors based on the sizes of the input
+// tensors. Also check that the size of the input tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 35);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 8);
+
+  // Inferring batch size, number of outputs and sequence length and
+  // number of cells from the input tensors.
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input->dims->size > 1);
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+
+  TfLiteTensor* fw_input_to_output_weights =
+      GetInput(context, node, kFwInputToOutputWeightsTensor);
+  const int n_fw_cell = fw_input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->data[1],
+                    n_input);
+
+  TfLiteTensor* fw_recurrent_to_output_weights =
+      GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->data[0],
+                    n_fw_cell);
+  const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  CheckInputTensorDimensions(context, node, n_input, n_fw_output, n_fw_cell);
+
+  // Get the pointer to output, state and scratch buffer tensors.
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
+  TfLiteTensor* fw_output_state =
+      GetOutput(context, node, kFwOutputStateTensor);
+  TfLiteTensor* fw_cell_state = GetOutput(context, node, kFwCellStateTensor);
+  // TODO(ghodrat): Modify this as soon as we have a finalized method for
+  // scratch buffers.
+  TfLiteTensor* fw_scratch_buffer =
+      GetOutput(context, node, kFwScratchBufferTensor);
+
+  // Resize the output and output_state tensors.
+  TfLiteIntArray* fw_output_size = TfLiteIntArrayCreate(3);
+  fw_output_size->data[0] = max_time;
+  fw_output_size->data[1] = n_batch;
+  fw_output_size->data[2] = n_fw_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, fw_output, fw_output_size));
+
+  TfLiteIntArray* fw_output_state_size = TfLiteIntArrayCreate(2);
+  fw_output_state_size->data[0] = n_batch;
+  fw_output_state_size->data[1] = n_fw_output;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_output_state,
+                                                   fw_output_state_size));
+
+  // Resize the scratch buffer tensor.
+  TfLiteIntArray* fw_cell_size = TfLiteIntArrayCreate(2);
+  fw_cell_size->data[0] = n_batch;
+  fw_cell_size->data[1] = n_fw_cell;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, fw_cell_state, fw_cell_size));
+
+  // Mark state tensors as persistent tensors.
+  fw_output_state->allocation_type = kTfLiteArenaRwPersistent;
+  fw_cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  TfLiteTensor* fw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
+  const bool fw_use_cifg = (fw_input_to_input_weights == nullptr);
+  TfLiteIntArray* fw_scratch_buffer_size = TfLiteIntArrayCreate(2);
+  fw_scratch_buffer_size->data[0] = n_batch;
+  if (fw_use_cifg) {
+    // Reserving space for Cell, Forget, Output gates
+    fw_scratch_buffer_size->data[1] = n_fw_cell * 3;
+  } else {
+    // Reserving space for Input, Cell, Forget, Output gates
+    fw_scratch_buffer_size->data[1] = n_fw_cell * 4;
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_scratch_buffer,
+                                                   fw_scratch_buffer_size));
+  // Same for the backward cell.
+  TfLiteTensor* bw_input_to_output_weights =
+      GetInput(context, node, kBwInputToOutputWeightsTensor);
+  const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1],
+                    n_input);
+
+  TfLiteTensor* bw_recurrent_to_output_weights =
+      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0],
+                    n_bw_cell);
+  const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  CheckInputTensorDimensions(context, node, n_input, n_bw_output, n_bw_cell);
+
+  // Get the pointer to output, state and scratch buffer tensors.
+  TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+  TfLiteTensor* bw_output_state =
+      GetOutput(context, node, kBwOutputStateTensor);
+  TfLiteTensor* bw_cell_state = GetOutput(context, node, kBwCellStateTensor);
+  // TODO(ghodrat): Modify this as soon as we have a finalized method for
+  // scratch buffers.
+  TfLiteTensor* bw_scratch_buffer =
+      GetOutput(context, node, kBwScratchBufferTensor);
+
+  // Resize the output and output_state tensors.
+  TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3);
+  bw_output_size->data[0] = max_time;
+  bw_output_size->data[1] = n_batch;
+  bw_output_size->data[2] = n_bw_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, bw_output, bw_output_size));
+
+  TfLiteIntArray* bw_output_state_size = TfLiteIntArrayCreate(2);
+  bw_output_state_size->data[0] = n_batch;
+  bw_output_state_size->data[1] = n_bw_output;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_output_state,
+                                                   bw_output_state_size));
+
+  // Resize the scratch buffer tensor.
+  TfLiteIntArray* bw_cell_size = TfLiteIntArrayCreate(2);
+  bw_cell_size->data[0] = n_batch;
+  bw_cell_size->data[1] = n_bw_cell;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, bw_cell_state, bw_cell_size));
+
+  // Mark state tensors as persistent tensors.
+  bw_output_state->allocation_type = kTfLiteArenaRwPersistent;
+  bw_cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  TfLiteTensor* bw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
+  const bool bw_use_cifg = (bw_input_to_input_weights == nullptr);
+  TfLiteIntArray* bw_scratch_buffer_size = TfLiteIntArrayCreate(2);
+  bw_scratch_buffer_size->data[0] = n_batch;
+  if (bw_use_cifg) {
+    // Reserving space for Cell, Forget, Output gates
+    bw_scratch_buffer_size->data[1] = n_bw_cell * 3;
+  } else {
+    // Reserving space for Input, Cell, Forget, Output gates
+    bw_scratch_buffer_size->data[1] = n_bw_cell * 4;
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_scratch_buffer,
+                                                   bw_scratch_buffer_size));
+  return kTfLiteOk;
+}
+
+// The LSTM Op engine.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Input tensor.
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+
+  // Tensors for the forward cell.
+  TfLiteTensor* fw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
+  TfLiteTensor* fw_input_to_forget_weights =
+      GetInput(context, node, kFwInputToForgetWeightsTensor);
+  TfLiteTensor* fw_input_to_cell_weights =
+      GetInput(context, node, kFwInputToCellWeightsTensor);
+  TfLiteTensor* fw_input_to_output_weights =
+      GetInput(context, node, kFwInputToOutputWeightsTensor);
+
+  TfLiteTensor* fw_recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwRecurrentToInputWeightsTensor);
+  TfLiteTensor* fw_recurrent_to_forget_weights =
+      GetInput(context, node, kFwRecurrentToForgetWeightsTensor);
+  TfLiteTensor* fw_recurrent_to_cell_weights =
+      GetInput(context, node, kFwRecurrentToCellWeightsTensor);
+  TfLiteTensor* fw_recurrent_to_output_weights =
+      GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
+
+  TfLiteTensor* fw_cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwCellToInputWeightsTensor);
+  TfLiteTensor* fw_cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kFwCellToForgetWeightsTensor);
+  TfLiteTensor* fw_cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kFwCellToOutputWeightsTensor);
+
+  TfLiteTensor* fw_input_gate_bias =
+      GetOptionalInputTensor(context, node, kFwInputGateBiasTensor);
+  TfLiteTensor* fw_forget_gate_bias =
+      GetInput(context, node, kFwForgetGateBiasTensor);
+  TfLiteTensor* fw_cell_bias = GetInput(context, node, kFwCellGateBiasTensor);
+  TfLiteTensor* fw_output_gate_bias =
+      GetInput(context, node, kFwOutputGateBiasTensor);
+
+  TfLiteTensor* fw_projection_weights =
+      GetOptionalInputTensor(context, node, kFwProjectionWeightsTensor);
+  TfLiteTensor* fw_projection_bias =
+      GetOptionalInputTensor(context, node, kFwProjectionBiasTensor);
+
+  TfLiteTensor* fw_output_state =
+      GetOutput(context, node, kFwOutputStateTensor);
+  TfLiteTensor* fw_cell_state = GetOutput(context, node, kFwCellStateTensor);
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
+
+  // Tensors for the backward cell.
+  TfLiteTensor* bw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
+  TfLiteTensor* bw_input_to_forget_weights =
+      GetInput(context, node, kBwInputToForgetWeightsTensor);
+  TfLiteTensor* bw_input_to_cell_weights =
+      GetInput(context, node, kBwInputToCellWeightsTensor);
+  TfLiteTensor* bw_input_to_output_weights =
+      GetInput(context, node, kBwInputToOutputWeightsTensor);
+
+  TfLiteTensor* bw_recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwRecurrentToInputWeightsTensor);
+  TfLiteTensor* bw_recurrent_to_forget_weights =
+      GetInput(context, node, kBwRecurrentToForgetWeightsTensor);
+  TfLiteTensor* bw_recurrent_to_cell_weights =
+      GetInput(context, node, kBwRecurrentToCellWeightsTensor);
+  TfLiteTensor* bw_recurrent_to_output_weights =
+      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
+
+  TfLiteTensor* bw_cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwCellToInputWeightsTensor);
+  TfLiteTensor* bw_cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kBwCellToForgetWeightsTensor);
+  TfLiteTensor* bw_cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kBwCellToOutputWeightsTensor);
+
+  TfLiteTensor* bw_input_gate_bias =
+      GetOptionalInputTensor(context, node, kBwInputGateBiasTensor);
+  TfLiteTensor* bw_forget_gate_bias =
+      GetInput(context, node, kBwForgetGateBiasTensor);
+  TfLiteTensor* bw_cell_bias = GetInput(context, node, kBwCellGateBiasTensor);
+  TfLiteTensor* bw_output_gate_bias =
+      GetInput(context, node, kBwOutputGateBiasTensor);
+
+  TfLiteTensor* bw_projection_weights =
+      GetOptionalInputTensor(context, node, kBwProjectionWeightsTensor);
+  TfLiteTensor* bw_projection_bias =
+      GetOptionalInputTensor(context, node, kBwProjectionBiasTensor);
+
+  TfLiteTensor* bw_output_state =
+      GetOutput(context, node, kBwOutputStateTensor);
+  TfLiteTensor* bw_cell_state = GetOutput(context, node, kBwCellStateTensor);
+  TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_fw_cell = fw_input_to_output_weights->dims->data[0];
+  const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool fw_use_cifg = (fw_input_to_input_weights == nullptr);
+  const bool fw_use_peephole = (fw_cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* fw_scratch_buffer =
+      GetOutput(context, node, kFwScratchBufferTensor);
+  float* fw_input_gate_scratch = nullptr;
+  float* fw_cell_scratch = nullptr;
+  float* fw_forget_gate_scratch = nullptr;
+  float* fw_output_gate_scratch = nullptr;
+  if (fw_use_cifg) {
+    fw_cell_scratch = fw_scratch_buffer->data.f;
+    fw_forget_gate_scratch = fw_scratch_buffer->data.f + n_fw_cell * n_batch;
+    fw_output_gate_scratch =
+        fw_scratch_buffer->data.f + 2 * n_fw_cell * n_batch;
+  } else {
+    fw_input_gate_scratch = fw_scratch_buffer->data.f;
+    fw_cell_scratch = fw_scratch_buffer->data.f + n_fw_cell * n_batch;
+    fw_forget_gate_scratch =
+        fw_scratch_buffer->data.f + 2 * n_fw_cell * n_batch;
+    fw_output_gate_scratch =
+        fw_scratch_buffer->data.f + 3 * n_fw_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  const float* fw_input_to_input_weights_ptr =
+      (fw_use_cifg) ? nullptr : fw_input_to_input_weights->data.f;
+  const float* fw_recurrent_to_input_weights_ptr =
+      (fw_use_cifg) ? nullptr : fw_recurrent_to_input_weights->data.f;
+  const float* fw_input_gate_bias_ptr =
+      (fw_use_cifg) ? nullptr : fw_input_gate_bias->data.f;
+  const float* fw_cell_to_input_weights_ptr =
+      (fw_use_peephole && !fw_use_cifg) ? fw_cell_to_input_weights->data.f
+                                        : nullptr;
+  const float* fw_cell_to_forget_weights_ptr =
+      (fw_use_peephole) ? fw_cell_to_forget_weights->data.f : nullptr;
+  const float* fw_cell_to_output_weights_ptr =
+      (fw_use_peephole) ? fw_cell_to_output_weights->data.f : nullptr;
+  const float* fw_projection_weights_ptr = (fw_projection_weights == nullptr)
+                                               ? nullptr
+                                               : fw_projection_weights->data.f;
+  const float* fw_projection_bias_ptr =
+      (fw_projection_bias == nullptr) ? nullptr : fw_projection_bias->data.f;
+
+  // Loop through the sequence.
+  for (int t = 0; t < max_time; t++) {
+    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
+    float* output_ptr_time = fw_output->data.f + t * n_batch * n_fw_output;
+
+    kernel_utils::LstmStep(
+        input_ptr_batch, fw_input_to_input_weights_ptr,
+        fw_input_to_forget_weights->data.f, fw_input_to_cell_weights->data.f,
+        fw_input_to_output_weights->data.f, fw_recurrent_to_input_weights_ptr,
+        fw_recurrent_to_forget_weights->data.f,
+        fw_recurrent_to_cell_weights->data.f,
+        fw_recurrent_to_output_weights->data.f, fw_cell_to_input_weights_ptr,
+        fw_cell_to_forget_weights_ptr, fw_cell_to_output_weights_ptr,
+        fw_input_gate_bias_ptr, fw_forget_gate_bias->data.f,
+        fw_cell_bias->data.f, fw_output_gate_bias->data.f,
+        fw_projection_weights_ptr, fw_projection_bias_ptr, params, n_batch,
+        n_fw_cell, n_input, n_fw_output, fw_output_state->data.f,
+        fw_cell_state->data.f, fw_input_gate_scratch, fw_forget_gate_scratch,
+        fw_cell_scratch, fw_output_gate_scratch, output_ptr_time);
+  }
+
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
+  const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool bw_use_cifg = (bw_input_to_input_weights == nullptr);
+  const bool bw_use_peephole = (bw_cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* bw_scratch_buffer =
+      GetOutput(context, node, kBwScratchBufferTensor);
+  float* bw_input_gate_scratch = nullptr;
+  float* bw_cell_scratch = nullptr;
+  float* bw_forget_gate_scratch = nullptr;
+  float* bw_output_gate_scratch = nullptr;
+  if (bw_use_cifg) {
+    bw_cell_scratch = bw_scratch_buffer->data.f;
+    bw_forget_gate_scratch = bw_scratch_buffer->data.f + n_bw_cell * n_batch;
+    bw_output_gate_scratch =
+        bw_scratch_buffer->data.f + 2 * n_bw_cell * n_batch;
+  } else {
+    bw_input_gate_scratch = bw_scratch_buffer->data.f;
+    bw_cell_scratch = bw_scratch_buffer->data.f + n_bw_cell * n_batch;
+    bw_forget_gate_scratch =
+        bw_scratch_buffer->data.f + 2 * n_bw_cell * n_batch;
+    bw_output_gate_scratch =
+        bw_scratch_buffer->data.f + 3 * n_bw_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  const float* bw_input_to_input_weights_ptr =
+      (bw_use_cifg) ? nullptr : bw_input_to_input_weights->data.f;
+  const float* bw_recurrent_to_input_weights_ptr =
+      (bw_use_cifg) ? nullptr : bw_recurrent_to_input_weights->data.f;
+  const float* bw_input_gate_bias_ptr =
+      (bw_use_cifg) ? nullptr : bw_input_gate_bias->data.f;
+  const float* bw_cell_to_input_weights_ptr =
+      (bw_use_peephole && !bw_use_cifg) ? bw_cell_to_input_weights->data.f
+                                        : nullptr;
+  const float* bw_cell_to_forget_weights_ptr =
+      (bw_use_peephole) ? bw_cell_to_forget_weights->data.f : nullptr;
+  const float* bw_cell_to_output_weights_ptr =
+      (bw_use_peephole) ? bw_cell_to_output_weights->data.f : nullptr;
+  const float* bw_projection_weights_ptr = (bw_projection_weights == nullptr)
+                                               ? nullptr
+                                               : bw_projection_weights->data.f;
+  const float* bw_projection_bias_ptr =
+      (bw_projection_bias == nullptr) ? nullptr : bw_projection_bias->data.f;
+
+  // Loop through the sequence backwards.
+  for (int t = max_time - 1; t >= 0; t--) {
+    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
+    float* output_ptr_time = bw_output->data.f + t * n_batch * n_bw_output;
+
+    kernel_utils::LstmStep(
+        input_ptr_batch, bw_input_to_input_weights_ptr,
+        bw_input_to_forget_weights->data.f, bw_input_to_cell_weights->data.f,
+        bw_input_to_output_weights->data.f, bw_recurrent_to_input_weights_ptr,
+        bw_recurrent_to_forget_weights->data.f,
+        bw_recurrent_to_cell_weights->data.f,
+        bw_recurrent_to_output_weights->data.f, bw_cell_to_input_weights_ptr,
+        bw_cell_to_forget_weights_ptr, bw_cell_to_output_weights_ptr,
+        bw_input_gate_bias_ptr, bw_forget_gate_bias->data.f,
+        bw_cell_bias->data.f, bw_output_gate_bias->data.f,
+        bw_projection_weights_ptr, bw_projection_bias_ptr, params, n_batch,
+        n_bw_cell, n_input, n_bw_output, bw_output_state->data.f,
+        bw_cell_state->data.f, bw_input_gate_scratch, bw_forget_gate_scratch,
+        bw_cell_scratch, bw_output_gate_scratch, output_ptr_time);
+  }
+
+  // Backward step.
+  return kTfLiteOk;
+}
+
+}  // namespace bidirectional_sequence_lstm
+
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 bidirectional_sequence_lstm::Prepare,
+                                 bidirectional_sequence_lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cca857bac0633ded01d40273d2e9e8dde488d61e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -0,0 +1,1411 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Bidirectional LSTM op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BidirectionalLSTMOpModel : public SingleOpModel {
+ public:
+  BidirectionalLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                           int sequence_length, bool use_cifg,
+                           bool use_peephole, bool use_projection_weights,
+                           bool use_projection_bias, float cell_clip,
+                           float proj_clip,
+                           const std::vector<std::vector<int>>& input_shapes)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_fw_cell_(n_cell),
+        n_bw_cell_(n_cell),
+        n_fw_output_(n_output),
+        n_bw_output_(n_output),
+        sequence_length_(sequence_length) {
+    input_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      fw_input_to_input_weights_ = AddNullInput();
+    } else {
+      fw_input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    fw_input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    fw_input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    fw_input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      fw_recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      fw_recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    fw_recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    fw_recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    fw_recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        fw_cell_to_input_weights_ = AddNullInput();
+      } else {
+        fw_cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      }
+      fw_cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+      fw_cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    } else {
+      fw_cell_to_input_weights_ = AddNullInput();
+      fw_cell_to_forget_weights_ = AddNullInput();
+      fw_cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      fw_input_gate_bias_ = AddNullInput();
+    } else {
+      fw_input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    fw_forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    fw_cell_bias_ = AddInput(TensorType_FLOAT32);
+    fw_output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      fw_projection_weights_ = AddInput(TensorType_FLOAT32);
+      if (use_projection_bias) {
+        fw_projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        fw_projection_bias_ = AddNullInput();
+      }
+    } else {
+      fw_projection_weights_ = AddNullInput();
+      fw_projection_bias_ = AddNullInput();
+    }
+
+    fw_scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+    // TODO(ghodrat): Modify these states when we have a permanent solution for
+    // persistent buffer.
+    fw_output_state_ = AddOutput(TensorType_FLOAT32);
+    fw_cell_state_ = AddOutput(TensorType_FLOAT32);
+    fw_output_ = AddOutput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      bw_input_to_input_weights_ = AddNullInput();
+    } else {
+      bw_input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    bw_input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    bw_input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    bw_input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      bw_recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      bw_recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    bw_recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    bw_recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    bw_recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        bw_cell_to_input_weights_ = AddNullInput();
+      } else {
+        bw_cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      }
+      bw_cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+      bw_cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    } else {
+      bw_cell_to_input_weights_ = AddNullInput();
+      bw_cell_to_forget_weights_ = AddNullInput();
+      bw_cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      bw_input_gate_bias_ = AddNullInput();
+    } else {
+      bw_input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    bw_forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    bw_cell_bias_ = AddInput(TensorType_FLOAT32);
+    bw_output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      bw_projection_weights_ = AddInput(TensorType_FLOAT32);
+      if (use_projection_bias) {
+        bw_projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        bw_projection_bias_ = AddNullInput();
+      }
+    } else {
+      bw_projection_weights_ = AddNullInput();
+      bw_projection_bias_ = AddNullInput();
+    }
+
+    bw_scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+    // TODO(ghodrat): Modify these states when we have a permanent solution for
+    // persistent buffer.
+    bw_output_state_ = AddOutput(TensorType_FLOAT32);
+    bw_cell_state_ = AddOutput(TensorType_FLOAT32);
+    bw_output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+                 BuiltinOptions_LSTMOptions,
+                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+                                   cell_clip, proj_clip)
+                     .Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  // Set weights in forward and backward cells to be the same.
+  void SetInputToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_input_to_input_weights_, f);
+    PopulateTensor(bw_input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_input_to_forget_weights_, f);
+    PopulateTensor(bw_input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_input_to_cell_weights_, f);
+    PopulateTensor(bw_input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_input_to_output_weights_, f);
+    PopulateTensor(bw_input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_to_input_weights_, f);
+    PopulateTensor(bw_recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_to_forget_weights_, f);
+    PopulateTensor(bw_recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_to_cell_weights_, f);
+    PopulateTensor(bw_recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_to_output_weights_, f);
+    PopulateTensor(bw_recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_cell_to_input_weights_, f);
+    PopulateTensor(bw_cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_cell_to_forget_weights_, f);
+    PopulateTensor(bw_cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_cell_to_output_weights_, f);
+    PopulateTensor(bw_cell_to_output_weights_, f);
+  }
+
+  void SetInputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_input_gate_bias_, f);
+    PopulateTensor(bw_input_gate_bias_, f);
+  }
+
+  void SetForgetGateBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_forget_gate_bias_, f);
+    PopulateTensor(bw_forget_gate_bias_, f);
+  }
+
+  void SetCellBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_cell_bias_, f);
+    PopulateTensor(bw_cell_bias_, f);
+  }
+
+  void SetOutputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_output_gate_bias_, f);
+    PopulateTensor(bw_output_gate_bias_, f);
+  }
+
+  void SetProjectionWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_projection_weights_, f);
+    PopulateTensor(bw_projection_weights_, f);
+  }
+
+  void SetProjectionBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_projection_bias_, f);
+    PopulateTensor(bw_projection_bias_, f);
+  }
+
+  void ResetFwOutputAndCellStates() {
+    const int zero_buffer_size = n_fw_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(fw_output_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+    PopulateTensor(fw_cell_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void ResetBwOutputAndCellStates() {
+    const int zero_buffer_size = n_bw_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(bw_output_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+    PopulateTensor(bw_cell_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
+  std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
+
+  int num_inputs() { return n_input_; }
+  int num_fw_outputs() { return n_fw_output_; }
+  int num_bw_outputs() { return n_bw_output_; }
+  int num_fw_cells() { return n_fw_cell_; }
+  int num_bw_cells() { return n_bw_cell_; }
+  int num_batches() { return n_batch_; }
+  int sequence_length() { return sequence_length_; }
+
+ private:
+  int input_;
+  int fw_input_to_input_weights_;
+  int fw_input_to_forget_weights_;
+  int fw_input_to_cell_weights_;
+  int fw_input_to_output_weights_;
+
+  int fw_recurrent_to_input_weights_;
+  int fw_recurrent_to_forget_weights_;
+  int fw_recurrent_to_cell_weights_;
+  int fw_recurrent_to_output_weights_;
+
+  int fw_cell_to_input_weights_;
+  int fw_cell_to_forget_weights_;
+  int fw_cell_to_output_weights_;
+
+  int fw_input_gate_bias_;
+  int fw_forget_gate_bias_;
+  int fw_cell_bias_;
+  int fw_output_gate_bias_;
+
+  int fw_projection_weights_;
+  int fw_projection_bias_;
+
+  int bw_input_to_input_weights_;
+  int bw_input_to_forget_weights_;
+  int bw_input_to_cell_weights_;
+  int bw_input_to_output_weights_;
+
+  int bw_recurrent_to_input_weights_;
+  int bw_recurrent_to_forget_weights_;
+  int bw_recurrent_to_cell_weights_;
+  int bw_recurrent_to_output_weights_;
+
+  int bw_cell_to_input_weights_;
+  int bw_cell_to_forget_weights_;
+  int bw_cell_to_output_weights_;
+
+  int bw_input_gate_bias_;
+  int bw_forget_gate_bias_;
+  int bw_cell_bias_;
+  int bw_output_gate_bias_;
+
+  int bw_projection_weights_;
+  int bw_projection_bias_;
+
+  int fw_output_;
+  int fw_output_state_;
+  int fw_cell_state_;
+  int fw_scratch_buffer_;
+
+  int bw_output_;
+  int bw_output_state_;
+  int bw_cell_state_;
+  int bw_scratch_buffer_;
+
+  int n_batch_;
+  int n_input_;
+  int n_fw_cell_;
+  int n_bw_cell_;
+  int n_fw_output_;
+  int n_bw_output_;
+  int sequence_length_;
+};
+
+TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          // Forward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          // Backward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
+      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+  static float lstm_bw_golden_output[] = {
+      -0.0806187, 0.139077, 0.400476, -0.197842,
+      -0.0332076, 0.123838, 0.309777, -0.17621,
+      -0.0490733, 0.0739237, 0.067706, -0.0208124};
+
+  // Resetting cell_state and output_state
+  lstm.ResetFwOutputAndCellStates();
+  lstm.ResetBwOutputAndCellStates();
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* fw_golden_start = lstm_fw_golden_output;
+  float* fw_golden_end =
+      fw_golden_start + lstm.num_fw_outputs() * lstm.sequence_length();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  float* bw_golden_start = lstm_bw_golden_output;
+  float* bw_golden_end =
+      bw_golden_start + lstm.num_bw_outputs() * lstm.sequence_length();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(ArrayFloatNear(bw_expected)));
+
+  // Check reversed inputs.
+  static float lstm_input_reversed[] = {1., 1., 3., 4., 2., 3.};
+
+  // Resetting cell_state and output_state
+  lstm.ResetFwOutputAndCellStates();
+  lstm.ResetBwOutputAndCellStates();
+
+  batch0_start = lstm_input_reversed;
+  batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  fw_expected.clear();
+  for (int s = 0; s < lstm.sequence_length(); s++) {
+    fw_golden_start = lstm_fw_golden_output + s * lstm.num_fw_outputs();
+    fw_golden_end = fw_golden_start + lstm.num_fw_outputs();
+    fw_expected.insert(fw_expected.begin(), fw_golden_start, fw_golden_end);
+  }
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  bw_expected.clear();
+  for (int s = 0; s < lstm.sequence_length(); s++) {
+    bw_golden_start = lstm_bw_golden_output + s * lstm.num_bw_outputs();
+    bw_golden_end = bw_golden_start + lstm.num_bw_outputs();
+    bw_expected.insert(bw_expected.begin(), bw_golden_start, bw_golden_end);
+  }
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
+      /*use_peephole=*/true, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+                              0.04717243, 0.48944736, -0.38535351,
+                              -0.17212132});
+
+  lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+                                -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+                                0.33826375});
+
+  lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+                                -0.09426838, -0.44257352, 0.54939759,
+                                0.01533556, 0.42751634});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToCellWeights(
+      {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+       0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+       0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+       0.21193194});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+       0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+       -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+       -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+       0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+  lstm.SetCellToForgetWeights(
+      {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+  lstm.SetCellToOutputWeights(
+      {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.36444446, -0.00352185, 0.12886585, -0.05163646,
+      -0.42312205, -0.01218222, 0.24201041, -0.08124574,
+      -0.358325,   -0.04621704, 0.21641694, -0.06471302};
+  static float lstm_bw_golden_output[] = {
+      -0.401685, -0.0232794, 0.288642,  -0.123074,   -0.42915,  -0.00871577,
+      0.20912,   -0.103567,  -0.166398, -0.00486649, 0.0697471, -0.0537578};
+
+  // Resetting cell_state and output_state
+  lstm.ResetFwOutputAndCellStates();
+  lstm.ResetBwOutputAndCellStates();
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* fw_golden_start = lstm_fw_golden_output;
+  float* fw_golden_end =
+      fw_golden_start + lstm.num_fw_outputs() * lstm.sequence_length();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  float* bw_golden_start = lstm_bw_golden_output;
+  float* bw_golden_end =
+      bw_golden_start + lstm.num_bw_outputs() * lstm.sequence_length();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(ArrayFloatNear(bw_expected)));
+
+  // Check reversed inputs.
+  static float lstm_input_reversed[] = {1., 1., 3., 4., 2., 3.};
+
+  // Resetting cell_state and output_state
+  lstm.ResetFwOutputAndCellStates();
+  lstm.ResetBwOutputAndCellStates();
+
+  batch0_start = lstm_input_reversed;
+  batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  fw_expected.clear();
+  for (int s = 0; s < lstm.sequence_length(); s++) {
+    fw_golden_start = lstm_fw_golden_output + s * lstm.num_fw_outputs();
+    fw_golden_end = fw_golden_start + lstm.num_fw_outputs();
+    fw_expected.insert(fw_expected.begin(), fw_golden_start, fw_golden_end);
+  }
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  bw_expected.clear();
+  for (int s = 0; s < lstm.sequence_length(); s++) {
+    bw_golden_start = lstm_bw_golden_output + s * lstm.num_bw_outputs();
+    bw_golden_end = bw_golden_start + lstm.num_bw_outputs();
+    bw_expected.insert(bw_expected.begin(), bw_golden_start, bw_golden_end);
+  }
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+  const int sequence_length = 4;
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/true, /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights(
+      {0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+       0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+       -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+       -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+       -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+       -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+       -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+       0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+       0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+       0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+       -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+       0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+       -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+       -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+       -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+       0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+       -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+       -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+       -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+       -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677});
+
+  lstm.SetInputToForgetWeights(
+      {-0.0018401089, -0.004852237,  0.03698424,   0.014181704,   0.028273236,
+       -0.016726194,  -0.05249759,   -0.10204261,  0.00861066,    -0.040979505,
+       -0.009899187,  0.01923892,    -0.028177269, -0.08535103,   -0.14585495,
+       0.10662567,    -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+       0.0030784295,  0.076784775,   0.07463696,   0.094531395,   0.0814421,
+       -0.12257899,   -0.033945758,  -0.031303465, 0.045630626,   0.06843887,
+       -0.13492945,   -0.012480007,  -0.0811829,   -0.07224499,   -0.09628791,
+       0.045100946,   0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+       0.06958324,    0.034257296,   0.0482646,    0.06267997,    0.052625068,
+       0.12784666,    0.07077897,    0.025725935,  0.04165009,    0.07241905,
+       0.018668644,   -0.037377294,  -0.06277783,  -0.08833636,   -0.040120605,
+       -0.011405586,  -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+       0.05483423,    0.11449111,    0.11289652,   0.10939839,    0.13396506,
+       -0.08402166,   -0.01901462,   -0.044678304, -0.07720565,   0.014350063,
+       -0.11757958,   -0.0652038,    -0.08185733,  -0.076754324,  -0.092614375,
+       0.10405491,    0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+       0.036881298,   0.02913376,    0.03420159,   0.05448447,    -0.054523353,
+       0.02582715,    0.02327355,    -0.011857179, -0.0011980024, -0.034641717,
+       -0.026125094,  -0.17582615,   -0.15923657,  -0.27486774,   -0.0006143371,
+       0.0001771948,  -8.470171e-05, 0.02651807,   0.045790765,   0.06956496});
+
+  lstm.SetInputToCellWeights(
+      {-0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+       -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+       -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+       -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+       -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+       0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+       -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+       0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+       -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+       -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+       -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+       0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+       0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+       0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+       -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+       -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+       -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+       -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+       -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+       -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+       0.05453865,    0.091149814,   0.06387331,    0.007518393,
+       0.055960953,   0.069779344,   0.046411168,   0.10509911,
+       0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+       0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+       0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042});
+
+  lstm.SetInputToOutputWeights(
+      {-0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+       -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+       0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+       -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+       -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+       0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+       -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+       -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+       -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+       -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+       0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+       0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+       0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+       -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+       0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+       0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+       -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+       0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+       -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+       -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956});
+
+  lstm.SetInputGateBias(
+      {0.02234832,  0.14757581,   0.18176508,  0.10380666,  0.053110216,
+       -0.06928846, -0.13942584,  -0.11816189, 0.19483899,  0.03652339,
+       -0.10250295, 0.036714908,  -0.18426876, 0.036065217, 0.21810818,
+       0.02383196,  -0.043370757, 0.08690144,  -0.04444982, 0.00030581196});
+
+  lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                          0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                          0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                          -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                          0.40694186,  0.06030037,   0.012413437, -0.06108739});
+
+  lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                    -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                    -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                    -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                    0.016178843,  0.1749513,    0.13975595,   0.92058027});
+
+  lstm.SetOutputGateBias(
+      {0.046159424,  -0.0012809046, 0.03563469,   0.12648113, 0.027195795,
+       0.35373217,   -0.018957434,  0.008907322,  -0.0762701, 0.12018895,
+       0.04216877,   0.0022856654,  0.040952638,  0.3147856,  0.08225149,
+       -0.057416286, -0.14995944,   -0.008040261, 0.13208859, 0.029760877});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+       -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+       -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+       -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+       0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+       0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+       -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+       0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+       -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+       0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+       -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+       0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+       -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+       0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+       -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+       -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+       -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+       -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+       -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+       0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+       0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+       0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+       0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+       0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+       -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+       -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+       0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+       -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+       -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+       -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+       -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+       -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+       -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+       0.0365468,      0.07590991,     0.08838724,    0.021681072,
+       -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+       0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+       -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+       -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+       0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+       -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+       -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+       0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+       -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+       0.015963363,    0.00871737,     0.060130805,   0.028611384,
+       0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+       0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+       0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+       0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+       0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+       -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+       -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+       -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+       -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+       -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+       0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+       0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+       -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+       0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+       0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+       0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+       -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+       -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+       0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+       -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+       -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+       0.06358255,     0.18531723,     0.07759293,    0.12006465,
+       0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+       -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+       -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+       0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+       0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+       0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+       0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+       -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+       -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+       -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+       -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+       0.026351685,    0.012641483,    0.07466548,    0.044301085,
+       -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+       -0.04106223,    -0.028126027,   0.028473156,   0.10467447});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+       0.14811787,    0.10826372,    0.09471067,     0.03987225,
+       -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+       0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+       0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+       -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+       -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+       0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+       -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+       -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+       0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+       -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+       -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+       -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+       0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+       0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+       -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+       0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+       0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+       -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+       -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+       0.060212336,   0.055259194,   0.06974018,     0.049454916,
+       -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+       0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+       -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+       0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+       -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+       0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+       0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+       0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+       -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+       -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+       -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+       0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+       0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+       0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+       0.052958444,   0.07558703,    0.04817258,     0.044462286,
+       -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+       0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+       0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+       -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+       -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+       -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+       0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+       0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+       0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+       0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+       -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+       -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+       0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+       -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+       -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+       -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+       -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+       0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+       -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+       -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+       0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+       -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+       0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+       0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+       0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+       0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+       0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+       0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+       -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+       0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+       -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+       -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+       0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+       -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+       -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+       0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+       0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+       0.014410365,   0.020995233,   0.17040324,     0.11511526,
+       0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+       -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+       -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+       0.007076659,   0.10964551,    0.0409152,      0.008275321,
+       -0.07283536,   0.07937492,    0.04192024,     -0.1075027});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+       0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+       0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+       -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+       0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+       0.08089997,     0.05143358,    0.038261272,   0.03339287,
+       -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+       -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+       -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+       -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+       0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+       -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+       -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+       0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+       0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+       0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+       -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+       0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+       0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+       -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+       0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+       0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+       0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+       -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+       0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+       -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+       0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+       -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+       0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+       -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+       0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+       0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+       -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+       0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+       -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+       0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+       -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+       -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+       -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+       -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+       0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+       0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+       -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+       0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+       0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+       0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+       -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+       0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+       0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+       0.02295182,     0.030739572,   0.056506045,   0.004612461,
+       0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+       -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+       0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+       -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+       0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+       -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+       -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+       -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+       -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+       0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+       0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+       -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+       -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+       -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+       -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+       -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+       0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+       0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+       -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+       0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+       0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+       -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+       -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+       0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+       -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+       -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+       0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+       -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+       -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+       -0.008799762,   0.056595087,   0.0022273948,  0.055752404});
+
+  lstm.SetRecurrentToOutputWeights({
+      0.025825322,   -0.05813119,  0.09495884,   -0.045984812,   -0.01255415,
+      -0.0026479573, -0.08196161,  -0.054914974, -0.0046604523,  -0.029587349,
+      -0.044576716,  -0.07480124,  -0.082868785, 0.023254942,    0.027502948,
+      -0.0039728214, -0.08683098,  -0.08116779,  -0.014675607,   -0.037924774,
+      -0.023314456,  -0.007401714, -0.09255757,  0.029460307,    -0.08829125,
+      -0.005139627,  -0.08989442,  -0.0555066,   0.13596267,     -0.025062224,
+      -0.048351806,  -0.03850004,  0.07266485,   -0.022414139,   0.05940088,
+      0.075114764,   0.09597592,   -0.010211725, -0.0049794707,  -0.011523867,
+      -0.025980417,  0.072999895,  0.11091378,   -0.081685916,   0.014416728,
+      0.043229222,   0.034178585,  -0.07530371,  0.035837382,    -0.085607,
+      -0.007721233,  -0.03287832,  -0.043848954, -0.06404588,    -0.06632928,
+      -0.073643476,  0.008214239,  -0.045984086, 0.039764922,    0.03474462,
+      0.060612556,   -0.080590084, 0.049127717,  0.04151091,     -0.030063879,
+      0.008801774,   -0.023021035, -0.019558564, 0.05158114,     -0.010947698,
+      -0.011825728,  0.0075720972, 0.0699727,    -0.0039981045,  0.069350146,
+      0.08799282,    0.016156472,  0.035502106,  0.11695009,     0.006217345,
+      0.13392477,    -0.037875112, 0.025745004,  0.08940699,     -0.00924166,
+      0.0046702605,  -0.036598757, -0.08811812,  0.10522024,     -0.032441203,
+      0.008176899,   -0.04454919,  0.07058152,   0.0067963637,   0.039206743,
+      0.03259838,    0.03725492,   -0.09515802,  0.013326398,    -0.052055415,
+      -0.025676316,  0.03198509,   -0.015951829, -0.058556724,   0.036879618,
+      0.043357447,   0.028362012,  -0.05908629,  0.0059240665,   -0.04995891,
+      -0.019187413,  0.0276265,    -0.01628143,  0.0025863599,   0.08800015,
+      0.035250366,   -0.022165963, -0.07328642,  -0.009415526,   -0.07455109,
+      0.11690406,    0.0363299,    0.07411125,   0.042103454,    -0.009660886,
+      0.019076364,   0.018299393,  -0.046004917, 0.08891175,     0.0431396,
+      -0.026327137,  -0.051502608, 0.08979574,   -0.051670972,   0.04940282,
+      -0.07491107,   -0.021240504, 0.022596184,  -0.034280192,   0.060163025,
+      -0.058211457,  -0.051837247, -0.01349775,  -0.04639988,    -0.035936575,
+      -0.011681591,  0.064818054,  0.0073146066, -0.021745546,   -0.043124277,
+      -0.06471268,   -0.07053354,  -0.029321948, -0.05330136,    0.016933719,
+      -0.053782392,  0.13747959,   -0.1361751,   -0.11569455,    0.0033329215,
+      0.05693899,    -0.053219706, 0.063698,     0.07977434,     -0.07924483,
+      0.06936997,    0.0034815092, -0.007305279, -0.037325785,   -0.07251102,
+      -0.033633437,  -0.08677009,  0.091591336,  -0.14165086,    0.021752775,
+      0.019683983,   0.0011612234, -0.058154266, 0.049996935,    0.0288841,
+      -0.0024567875, -0.14345716,  0.010955264,  -0.10234828,    0.1183656,
+      -0.0010731248, -0.023590032, -0.072285876, -0.0724771,     -0.026382286,
+      -0.0014920527, 0.042667855,  0.0018776858, 0.02986552,     0.009814309,
+      0.0733756,     0.12289186,   0.018043943,  -0.0458958,     0.049412545,
+      0.033632483,   0.05495232,   0.036686596,  -0.013781798,   -0.010036754,
+      0.02576849,    -0.08307328,  0.010112348,  0.042521734,    -0.05869831,
+      -0.071689695,  0.03876447,   -0.13275425,  -0.0352966,     -0.023077697,
+      0.10285965,    0.084736146,  0.15568255,   -0.00040734606, 0.027835453,
+      -0.10292561,   -0.032401145, 0.10053256,   -0.026142767,   -0.08271222,
+      -0.0030240538, -0.016368777, 0.1070414,    0.042672627,    0.013456989,
+      -0.0437609,    -0.022309763, 0.11576483,   0.04108048,     0.061026827,
+      -0.0190714,    -0.0869359,   0.037901703,  0.0610107,      0.07202949,
+      0.01675338,    0.086139716,  -0.08795751,  -0.014898893,   -0.023771819,
+      -0.01965048,   0.007955471,  -0.043740474, 0.03346837,     -0.10549954,
+      0.090567775,   0.042013682,  -0.03176985,  0.12569028,     -0.02421228,
+      -0.029526481,  0.023851605,  0.031539805,  0.05292009,     -0.02344001,
+      -0.07811758,   -0.08834428,  0.10094801,   0.16594367,     -0.06861939,
+      -0.021256343,  -0.041093912, -0.06669611,  0.035498552,    0.021757556,
+      -0.09302526,   -0.015403468, -0.06614931,  -0.051798206,   -0.013874718,
+      0.03630673,    0.010412845,  -0.08077351,  0.046185967,    0.0035662893,
+      0.03541868,    -0.094149634, -0.034814864, 0.003128424,    -0.020674974,
+      -0.03944324,   -0.008110165, -0.11113267,  0.08484226,     0.043586485,
+      0.040582247,   0.0968012,    -0.065249965, -0.028036479,   0.0050708856,
+      0.0017462453,  0.0326779,    0.041296225,  0.09164146,     -0.047743853,
+      -0.015952192,  -0.034451712, 0.084197424,  -0.05347844,    -0.11768019,
+      0.085926116,   -0.08251791,  -0.045081906, 0.0948852,      0.068401024,
+      0.024856757,   0.06978981,   -0.057309967, -0.012775832,   -0.0032452994,
+      0.01977615,    -0.041040014, -0.024264973, 0.063464895,    0.05431621,
+  });
+
+  lstm.SetCellToInputWeights(
+      {0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+       -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+       -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+       0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175});
+
+  lstm.SetCellToForgetWeights(
+      {-0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+       -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+       -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+       0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355});
+
+  lstm.SetCellToOutputWeights(
+      {0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+       -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+       -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+       0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733});
+
+  lstm.SetProjectionWeights(
+      {-0.009802181,  0.09401916,    0.0717386,     -0.13895074,  0.09641832,
+       0.060420845,   0.08539281,    0.054285463,   0.061395317,  0.034448683,
+       -0.042991187,  0.019801661,   -0.16840284,   -0.015726732, -0.23041931,
+       -0.024478018,  -0.10959692,   -0.013875541,  0.18600968,   -0.061274476,
+       0.0138165,     -0.08160894,   -0.07661644,   0.032372914,  0.16169067,
+       0.22465782,    -0.03993472,   -0.004017731,  0.08633481,   -0.28869787,
+       0.08682067,    0.17240396,    0.014975425,   0.056431185,  0.031037588,
+       0.16702051,    0.0077946745,  0.15140012,    0.29405436,   0.120285,
+       -0.188994,     -0.027265169,  0.043389652,   -0.022061434, 0.014777949,
+       -0.20203483,   0.094781205,   0.19100232,    0.13987629,   -0.036132768,
+       -0.06426278,   -0.05108664,   0.13221376,    0.009441198,  -0.16715929,
+       0.15859416,    -0.040437475,  0.050779544,   -0.022187516, 0.012166504,
+       0.027685808,   -0.07675938,   -0.0055694645, -0.09444123,  0.0046453946,
+       0.050794356,   0.10770313,    -0.20790008,   -0.07149004,  -0.11425117,
+       0.008225835,   -0.035802525,  0.14374903,    0.15262283,   0.048710253,
+       0.1847461,     -0.007487823,  0.11000021,    -0.09542012,  0.22619456,
+       -0.029149994,  0.08527916,    0.009043713,   0.0042746216, 0.016261552,
+       0.022461696,   0.12689082,    -0.043589946,  -0.12035478,  -0.08361797,
+       -0.050666027,  -0.1248618,    -0.1275799,    -0.071875185, 0.07377272,
+       0.09944291,    -0.18897448,   -0.1593054,    -0.06526116,  -0.040107165,
+       -0.004618631,  -0.067624845,  -0.007576253,  0.10727444,   0.041546922,
+       -0.20424393,   0.06907816,    0.050412357,   0.00724631,   0.039827548,
+       0.12449835,    0.10747581,    0.13708383,    0.09134148,   -0.12617786,
+       -0.06428341,   0.09956831,    0.1208086,     -0.14676677,  -0.0727722,
+       0.1126304,     0.010139365,   0.015571211,   -0.038128063, 0.022913318,
+       -0.042050496,  0.16842307,    -0.060597885,  0.10531834,   -0.06411776,
+       -0.07451711,   -0.03410368,   -0.13393489,   0.06534304,   0.003620307,
+       0.04490757,    0.05970546,    0.05197996,    0.02839995,   0.10434969,
+       -0.013699693,  -0.028353551,  -0.07260381,   0.047201227,  -0.024575593,
+       -0.036445823,  0.07155557,    0.009672501,   -0.02328883,  0.009533515,
+       -0.03606021,   -0.07421458,   -0.028082801,  -0.2678904,   -0.13221288,
+       0.18419984,    -0.13012612,   -0.014588381,  -0.035059117, -0.04824723,
+       0.07830115,    -0.056184657,  0.03277091,    0.025466874,  0.14494097,
+       -0.12522776,   -0.098633975,  -0.10766018,   -0.08317623,  0.08594209,
+       0.07749552,    0.039474737,   0.1776665,     -0.07409566,  -0.0477268,
+       0.29323658,    0.10801441,    0.1154011,     0.013952499,  0.10739139,
+       0.10708251,    -0.051456142,  0.0074137426,  -0.10430189,  0.10034707,
+       0.045594677,   0.0635285,     -0.0715442,    -0.089667566, -0.10811871,
+       0.00026344223, 0.08298446,    -0.009525053,  0.006585689,  -0.24567553,
+       -0.09450807,   0.09648481,    0.026996298,   -0.06419476,  -0.04752702,
+       -0.11063944,   -0.23441927,   -0.17608605,   -0.052156363, 0.067035615,
+       0.19271925,    -0.0032889997, -0.043264326,  0.09663576,   -0.057112187,
+       -0.10100678,   0.0628376,     0.04447668,    0.017961001,  -0.10094388,
+       -0.10190601,   0.18335468,    0.10494553,    -0.052095775, -0.0026118709,
+       0.10539724,    -0.04383912,   -0.042349473,  0.08438151,   -0.1947263,
+       0.02251204,    0.11216432,    -0.10307853,   0.17351969,   -0.039091777,
+       0.08066188,    -0.00561982,   0.12633002,    0.11335965,   -0.0088127935,
+       -0.019777594,  0.06864014,    -0.059751723,  0.016233567,  -0.06894641,
+       -0.28651384,   -0.004228674,  0.019708522,   -0.16305895,  -0.07468996,
+       -0.0855457,    0.099339016,   -0.07580735,   -0.13775392,  0.08434318,
+       0.08330512,    -0.12131499,   0.031935584,   0.09180414,   -0.08876437,
+       -0.08049874,   0.008753825,   0.03498998,    0.030215185,  0.03907079,
+       0.089751154,   0.029194152,   -0.03337423,   -0.019092513, 0.04331237,
+       0.04299654,    -0.036394123,  -0.12915532,   0.09793732,   0.07512415,
+       -0.11319543,   -0.032502122,  0.15661901,    0.07671967,   -0.005491124,
+       -0.19379048,   -0.218606,     0.21448623,    0.017840758,  0.1416943,
+       -0.07051762,   0.19488361,    0.02664691,    -0.18104725,  -0.09334311,
+       0.15026465,    -0.15493552,   -0.057762887,  -0.11604192,  -0.262013,
+       -0.01391798,   0.012185008,   0.11156489,    -0.07483202,  0.06693364,
+       -0.26151478,   0.046425626,   0.036540434,   -0.16435726,  0.17338543,
+       -0.21401681,   -0.11385144,   -0.08283257,   -0.069031075, 0.030635102,
+       0.010969227,   0.11109743,    0.010919218,   0.027526086,  0.13519906,
+       0.01891392,    -0.046839405,  -0.040167913,  0.017953383,  -0.09700955,
+       0.0061885654,  -0.07000971,   0.026893595,   -0.038844477, 0.14543656});
+
+  static float lstm_input[][20] = {
+      {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+       0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
+       0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
+       0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
+
+      {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+       0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
+       0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
+       0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
+
+  static float lstm_fw_golden_output[][64] = {
+      {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+       -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
+       -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
+       -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
+       0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
+       -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
+       -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
+       0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
+       0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
+       0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
+       0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
+       -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
+       -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
+       0.0286833,   0.00824207,   0.0264887,   0.0305169},
+      {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+       -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
+       -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
+       0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
+       0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
+       -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
+       -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
+       0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
+       0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
+       0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
+       0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
+       -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
+       -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
+       0.0412031,    0.0118723,   0.0239643,   0.0394009}};
+
+  static float lstm_combined_golden_output[][64] = {
+    {
+      -0.022014,  0.073544, -0.002235,  0.040068, -0.037136, -0.052788,
+      0.075325, -0.029378,  0.024298, -0.07733 , -0.030674, -0.060229,
+      0.040599,  0.011608,  0.042005,  0.045977, -0.039225,  0.076294,
+      0.000735,  0.032852, -0.069869, -0.053312,  0.073527, -0.028136,
+      0.021585, -0.102679, -0.004327, -0.043304,  0.072861,  0.027077,
+      0.034558,  0.068292, -0.036292,  0.069832, -0.003032,  0.053829,
+      -0.043821, -0.072713,  0.085029, -0.040374,  0.020014, -0.104521,
+      -0.034504, -0.059759,  0.062569,  0.025652,  0.049306,  0.061189,
+      -0.025146,  0.079643, -0.005188,  0.033080, -0.048079, -0.048082,
+      0.069369, -0.028900,  0.024572, -0.077547, -0.022517, -0.054477,
+      0.038857,  0.013336,  0.043234,  0.044788},
+    {
+      -0.039186,  0.070792, -0.005913,  0.02642,  -0.068274, -0.05022,
+      0.061444, -0.031241,  0.014996, -0.094544, -0.004146, -0.03464,
+      0.058981,  0.026097,  0.039781,  0.058408, -0.031887,  0.069252,
+      0.00576,   0.054062, -0.042801, -0.059974,  0.085272, -0.034453,
+      0.026097, -0.0959,   -0.031164, -0.058699,  0.06839,   0.020512,
+      0.044727,  0.063609, -0.039863,  0.084819, -0.003909,  0.028666,
+      -0.075677, -0.045125,  0.070379, -0.033895,  0.022111, -0.097184,
+      -0.004921, -0.040851,  0.062316,  0.017435,  0.041437,  0.064568,
+      -0.039656,  0.060726, -0.003402,  0.036854, -0.056503, -0.058554,
+      0.068588, -0.034879,  0.01352,  -0.09962,  -0.01434,  -0.039505,
+      0.065133,  0.024321,  0.038473,  0.062438
+    }};
+
+  // Resetting cell_state and output_state
+  lstm.ResetFwOutputAndCellStates();
+  lstm.ResetBwOutputAndCellStates();
+
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
+    float* batch0_end = batch0_start + lstm.num_inputs();
+
+    lstm.SetInput(2 * i * lstm.num_inputs(), batch0_start, batch0_end);
+
+    float* batch1_start = lstm_input[1] + i * lstm.num_inputs();
+    float* batch1_end = batch1_start + lstm.num_inputs();
+    lstm.SetInput((2 * i + 1) * lstm.num_inputs(), batch1_start, batch1_end);
+  }
+
+  lstm.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* golden_start_batch0 =
+        lstm_fw_golden_output[0] + i * lstm.num_fw_outputs();
+    float* golden_end_batch0 = golden_start_batch0 + lstm.num_fw_outputs();
+    float* golden_start_batch1 =
+        lstm_fw_golden_output[1] + i * lstm.num_fw_outputs();
+    float* golden_end_batch1 = golden_start_batch1 + lstm.num_fw_outputs();
+    expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+    expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+  }
+  EXPECT_THAT(lstm.GetFwOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Check if the sum of forward backward matches the golden.
+  expected.clear();
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* golden_start_batch0 =
+        lstm_combined_golden_output[0] + i * lstm.num_fw_outputs();
+    float* golden_end_batch0 = golden_start_batch0 + lstm.num_fw_outputs();
+    float* golden_start_batch1 =
+        lstm_combined_golden_output[1] + i * lstm.num_fw_outputs();
+    float* golden_end_batch1 = golden_start_batch1 + lstm.num_fw_outputs();
+    expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+    expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+  }
+
+  std::vector<float> combined;
+  for (int i = 0; i < lstm.GetFwOutput().size(); ++i) {
+    combined.push_back(lstm.GetFwOutput()[i] + lstm.GetBwOutput()[i]);
+  }
+  EXPECT_THAT(combined, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
index 12f4ff97cfd90e3a6894a24d15fcbc356f96cde2..911b108eaad605a8a58a2e3b35586c9851d4e719 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -161,7 +161,7 @@ static float rnn_golden_bw_output[] = {
     0,        0,          1.86126,   0,         0.728256,  0.750013,  0.011861,
     0.576383, 3.38891,    1.29273,   0};
 
-constexpr std::initializer_list<float> weights = {
+const std::initializer_list<float> weights = {
     0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
     0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
     0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
@@ -628,12 +628,12 @@ static float golden_endtoend_output[] = {
     -2.080307, 0.896140,  -3.104050, 0.983158,  -0.424898, -1.154270, -3.805728,
     1.978917,  -1.314387, 1.235096,  -3.148906, 1.113173,  0.111713,  2.055213,
     -7.565283, 2.100342};
-constexpr std::initializer_list<float> biases = {
+const std::initializer_list<float> biases = {
     0.065691948, -0.69055247, 0.1107955,  -0.97084129, -0.23957068, -0.23566568,
     -0.389184,   0.47481549,  -0.4791103, 0.29931796,  0.10463274,  0.83918178,
     0.37197268,  0.61957061,  0.3956964,  -0.37609905};
 
-constexpr std::initializer_list<float> recurrent_weights = {
+const std::initializer_list<float> recurrent_weights = {
     0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/contrib/lite/kernels/cast.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17ef2c572ebbfa54ba6856f7eebbcd6fd9e63868
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/cast.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <algorithm>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace cast {
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): these two checks would make the new implementation
+  // incompatible with some existing models, where params is not specified. It
+  // is OK not to have them because toco would have set input and output types
+  // to match the parameters.
+  // auto* params = reinterpret_cast<TfLiteCastParams*>(node->builtin_data);
+  // TF_LITE_ENSURE_EQ(context, input->type, params->in_data_type);
+  // TF_LITE_ENSURE_EQ(context, output->type, params->out_data_type);
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
+template <typename FromT, typename ToT>
+void copyCast(const FromT* in, ToT* out, int num_elements) {
+  std::transform(in, in + num_elements, out,
+                 [](FromT a) { return static_cast<ToT>(a); });
+}
+
+template <typename FromT>
+TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out,
+                          int num_elements) {
+  switch (out->type) {
+    case kTfLiteInt64:
+      copyCast(in, out->data.i64, num_elements);
+      break;
+    case kTfLiteInt32:
+      copyCast(in, out->data.i32, num_elements);
+      break;
+    case kTfLiteUInt8:
+      copyCast(in, out->data.uint8, num_elements);
+      break;
+    case kTfLiteFloat32:
+      copyCast(in, out->data.f, num_elements);
+      break;
+    default:
+      // Unsupported type.
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const int num_elements = NumElements(input);
+  TF_LITE_ENSURE_EQ(context, num_elements, NumElements(output));
+  switch (input->type) {
+    case kTfLiteInt64:
+      return copyToTensor(input->data.i64, output, num_elements);
+    case kTfLiteInt32:
+      return copyToTensor(input->data.i32, output, num_elements);
+    case kTfLiteUInt8:
+      return copyToTensor(input->data.uint8, output, num_elements);
+    case kTfLiteFloat32:
+      return copyToTensor(input->data.f, output, num_elements);
+    default:
+      // Unsupported type.
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+}  // namespace cast
+
+TfLiteRegistration* Register_CAST() {
+  static TfLiteRegistration r = {nullptr, nullptr, cast::Prepare, cast::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/cast_test.cc b/tensorflow/contrib/lite/kernels/cast_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e56482a371550b6275a6380e2beebe3cef958ff
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/cast_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class CastOpModel : public SingleOpModel {
+ public:
+  CastOpModel(const TensorData& input, const TensorData& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_CAST, BuiltinOptions_CastOptions,
+                 CreateCastOptions(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() const { return input_; }
+  int output() const { return output_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+TEST(CastOpModel, CastIntToFloat) {
+  CastOpModel m({TensorType_INT64, {2, 3}}, {TensorType_FLOAT32, {2, 3}});
+  m.PopulateTensor<int64_t>(m.input(), {100, 200, 300, 400, 500, 600});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray({100.f, 200.f, 300.f, 400.f, 500.f, 600.f}));
+}
+
+TEST(CastOpModel, CastFloatToInt) {
+  CastOpModel m({TensorType_FLOAT32, {3, 2}}, {TensorType_INT32, {3, 2}});
+  m.PopulateTensor<float>(m.input(), {100.f, 20.f, 3.f, 0.4f, 0.999f, 1.1f});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int>(m.output()),
+              ElementsAreArray({100, 20, 3, 0, 0, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc
new file mode 100644
index 0000000000000000000000000000000000000000..87c413cb982dafd239818040d067738e786d43ff
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/comparisons.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace comparisons {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus LessPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Don't support string and bool.
+  TF_LITE_ENSURE(context,
+                 input1->type != kTfLiteString || input1->type != kTfLiteBool);
+  // Currently only support tensors have the same type.
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = kTfLiteBool;
+
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+
+#define TF_LITE_LESS(type, opname)                                          \
+  reference_ops::opname(GetTensorData<type>(input1), GetTensorDims(input1), \
+                        GetTensorData<type>(input2), GetTensorDims(input2), \
+                        GetTensorData<bool>(output), GetTensorDims(output));
+
+  // TODO(renjieliu): Support quantized data.
+  if (requires_broadcast) {
+    switch (input1->type) {
+      case kTfLiteFloat32:
+        TF_LITE_LESS(float, BroadcastLess);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_LESS(int32_t, BroadcastLess);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_LESS(int64_t, BroadcastLess);
+        break;
+      default:
+        context->ReportError(context,
+                             "Does not support type other than float|int");
+        return kTfLiteError;
+    }
+  } else {
+    switch (input1->type) {
+      case kTfLiteFloat32:
+        TF_LITE_LESS(float, Less);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_LESS(int32_t, Less);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_LESS(int64_t, Less);
+        break;
+      default:
+        context->ReportError(context,
+                             "Does not support type other than float|int");
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_LESS
+  return kTfLiteOk;
+}
+
+}  // namespace comparisons
+
+TfLiteRegistration* Register_LESS() {
+  static TfLiteRegistration r = {nullptr, nullptr, comparisons::LessPrepare,
+                                 comparisons::LessEval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da2d7f858984a4d3bb09ca8e485fe1599bea7ded
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LessOpModel : public SingleOpModel {
+ public:
+  LessOpModel(std::initializer_list<int> input1_shape,
+              std::initializer_list<int> input2_shape, TensorType input_type) {
+    input1_ = AddInput(input_type);
+    input2_ = AddInput(input_type);
+    output_ = AddOutput(TensorType_BOOL);
+    SetBuiltinOp(BuiltinOperator_LESS, BuiltinOptions_LessOptions,
+                 CreateLessOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, LessFloat) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessInt) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 6, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessBroadcast) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessBroadcastTwoD) {
+  LessOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 6, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true,
+                                                   true, false, false, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index a619ada86af64c299f8e518a7493db20f1011a50..45ea8d00498455be98467f2f1addc8ad7dcf35fa 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -67,10 +67,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
     TF_LITE_ENSURE_EQ(context, t->dims->size, t0->dims->size);
     TF_LITE_ENSURE_EQ(context, t->type, input_type);
-    if (input_type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, t->params.zero_point, t0->params.zero_point);
-      TF_LITE_ENSURE_EQ(context, t->params.scale, t0->params.scale);
-    }
     for (int d = 0; d < t0->dims->size; ++d) {
       if (d == axis) {
         sum_axis += t->dims->data[axis];
@@ -87,11 +83,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
-  if (input_type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point,
-                      t0->params.zero_point);
-    TF_LITE_ENSURE_EQ(context, output->params.scale, t0->params.scale);
-  }
 
   return context->ResizeTensor(context, output, output_size);
 }
@@ -115,6 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
       GetTensorDims(output))
 
+#define TF_LITE_CONCATENATION_QUANTIZED(type)                                  \
+  VectorOfQuantizedTensors all_inputs(*context, *node->inputs);                \
+  type::Concatenation(                                                         \
+      RemapDim(NumDimensions(output), axis), all_inputs.data(),                \
+      all_inputs.dims(), all_inputs.zero_point(), all_inputs.scale(),          \
+      node->inputs->size, GetTensorData<uint8>(output), GetTensorDims(output), \
+      output->params.zero_point, output->params.scale)
+
   switch (output->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
@@ -125,9 +124,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(reference_ops);
       } else {
-        TF_LITE_CONCATENATION(optimized_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
       }
       break;
     default:
@@ -136,6 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 
+#undef TF_LITE_CONCATENATION_QUANTIZED
 #undef TF_LITE_CONCATENATION
 
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index ba1ffc5f8423b9626c9c8e2a1086ea0dcca43f50..467ff6f7e149e35ae1fd11031c10d7087c4b398c 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -28,6 +28,7 @@ class BaseConcatenationOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): Also test different activation types, axis, input
   // dimensions.
+  BaseConcatenationOpModel() {}
   BaseConcatenationOpModel(const TensorData& input_template, int axis,
                            int num_inputs) {
     std::vector<std::vector<int>> all_input_shapes;
@@ -60,6 +61,23 @@ class ConcatenationOpModel : public BaseConcatenationOpModel {
 class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
  public:
   using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  QuantizedConcatenationOpModel(const std::vector<TensorData>& input_template,
+                                int axis, int num_inputs,
+                                const TensorData& output_template) {
+    std::vector<std::vector<int>> all_input_shapes;
+    CHECK_EQ(input_template.size(), num_inputs);
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template[i].shape);
+      AddInput(input_template[i]);
+    }
+    output_ = AddOutput({output_template.type, /*shape=*/{},
+                         output_template.min, output_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
   void SetInput(int index, std::initializer_list<float> data) {
     QuantizeAndPopulate<uint8_t>(index, data);
   }
@@ -168,6 +186,56 @@ TEST(ConcatenationOpTest, FourInputsQuantized) {
                               }));
 }
 
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRangeClampingLogic) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -1., 1.});
+
+  m0.SetInput(0, {1.0f, -3.0f, -4.0f, -7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, -3.2f, -4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f,   //
+                      -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,  //
+                  },
+                  4e-3)));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  255, 0, 255, 255, 255, 0, 255, 255,  //
+                                  0, 0, 255, 255, 0, 255, 255, 255,    //
+                              }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 66d2c04bba4a164bbcdcf4b1a097d9aac0b3aeeb..3b467b3aa284586ab8e67ede55583adffbe06cc7 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/eigen_support.h"
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
@@ -43,6 +44,8 @@ namespace conv {
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
+  // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
+  // are available and kGenericOptimized when we must use only one thread.
   kMultithreadOptimized,
   // The kernel uses use CBLAS interface for matrix multiplication.
   // It's fast when an optimized CBLAS implementation is available (e.g. Apple
@@ -51,15 +54,17 @@ enum KernelType {
   kCblasOptimized,
 };
 
+const int kTensorNotAllocated = -1;
+
 struct OpData {
   // IDs are the arbitrary identifiers used by TF Lite to identify and access
   // memory buffers.
-  int im2col_id;
-  int hwcn_weights_id;
+  int im2col_id = kTensorNotAllocated;
+  int hwcn_weights_id = kTensorNotAllocated;
 
   TfLitePaddingValues padding;
   // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multipler plus a left shift.
+  // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
   // The range of the fused activation layer. For example for kNone and
@@ -73,6 +78,8 @@ struct OpData {
   bool need_hwcn_weights;
   bool have_weights_been_transposed;
   bool need_im2col;
+
+  bool run_multithreaded_kernel;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -80,13 +87,13 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // Instead, we allocate a new object to use as scratch space for im2col, and
   // to carry information from Prepare() to Eval().
   auto* data = new OpData;
-  context->AddTensors(context, 1, &data->im2col_id);
-  context->AddTensors(context, 1, &data->hwcn_weights_id);
   gemm_support::IncrementUsageCounter(context);
+  eigen_support::IncrementUsageCounter(context);
   return data;
 }
 
 void Free(TfLiteContext* context, void* buffer) {
+  eigen_support::DecrementUsageCounter(context);
   gemm_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
@@ -107,10 +114,69 @@ void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
   }
 }
 
+// Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
+// Note: `context->AddTensors` might invalidate pointers to existing tensors.
+// Therefore the logic to add tensors are isolated into this function.
+static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context,
+                                                       TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE(context, node->inputs->size >= 2);
+  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+
+  int filter_width = filter->dims->data[2];
+  int filter_height = filter->dims->data[1];
+
+  // We don't always need to allocate im2col. It is only used in some versions
+  // of the optimized Conv. This test just mimics something that happens inside
+  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
+  data->need_im2col =
+      (params->stride_width != 1 || params->stride_height != 1 ||
+       filter_width != 1 || filter_height != 1);
+  // If we're using the optimized multithreaded EigenTensor implementation of
+  // convolution, it expects the filter weights to be transposed compared to
+  // the normal TF Lite buffer format. Typical TF Lite weights are
+  // [filter_count, filter_height, filter_width, input_depth], but for the float
+  // implementation we need them as [filter_height, filter_width, input_depth,
+  // filter_count]. We get to that format by transposing, and create a temporary
+  // buffer to store the results.
+  // This path is only used for float processing, so only create the buffer if
+  // we're running with that data type.
+  data->need_hwcn_weights =
+      (input->type == kTfLiteFloat32 && data->run_multithreaded_kernel);
+
+  int temporaries_count = 0;
+  if (data->need_im2col) {
+    data->im2col_index = temporaries_count;
+    if (data->im2col_id == kTensorNotAllocated) {
+      context->AddTensors(context, 1, &data->im2col_id);
+    }
+    ++temporaries_count;
+  }
+  if (data->need_hwcn_weights) {
+    data->hwcn_weights_index = temporaries_count;
+    if (data->hwcn_weights_id == kTensorNotAllocated) {
+      context->AddTensors(context, 1, &data->hwcn_weights_id);
+    }
+    ++temporaries_count;
+  }
+
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(temporaries_count);
+
+  return kTfLiteOk;
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
+  data->run_multithreaded_kernel = context->recommended_num_threads != 1;
+
+  TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node));
+
   bool hasBias = node->inputs->size == 3;
   // Check number of inputs/outputs
   TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2);
@@ -118,6 +184,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
   TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
   TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+
   // Check dimensionality of input, filter
   TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
   TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
@@ -158,22 +225,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Matching GetWindowedOutputSize in TensorFlow.
   auto padding = params->padding;
-  auto computeOutSize = [padding](int imageSize, int filterSize,
-                                  int stride) -> int {
+  auto computeOutSize = [padding](int imageSize, int filterSize, int stride,
+                                  int dilationRate) -> int {
+    int effectiveFilterSize = (filterSize - 1) * dilationRate + 1;
     return padding == kTfLitePaddingSame
                ? (imageSize + stride - 1) / stride
                : padding == kTfLitePaddingValid
-                     ? (imageSize - filterSize + stride) / stride
+                     ? (imageSize - effectiveFilterSize + stride) / stride
                      : 0;
   };
 
-  int outWidth = computeOutSize(width, filter_width, params->stride_width);
-  int outHeight = computeOutSize(height, filter_height, params->stride_height);
+  int outWidth = computeOutSize(width, filter_width, params->stride_width,
+                                params->dilation_width_factor);
+  int outHeight = computeOutSize(height, filter_height, params->stride_height,
+                                 params->dilation_height_factor);
 
   data->padding.height =
-      ComputePadding(params->stride_height, height, filter_height, outHeight);
+      ComputePadding(params->stride_height, params->dilation_height_factor,
+                     height, filter_height, outHeight);
   data->padding.width =
-      ComputePadding(params->stride_width, width, filter_width, outWidth);
+      ComputePadding(params->stride_width, params->dilation_width_factor, width,
+                     filter_width, outWidth);
 
   TF_LITE_ENSURE(context, hasBias);
 
@@ -199,36 +271,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   if (output_status != kTfLiteOk) return output_status;
 
-  // We don't always need to allocate im2col. It is only used in some versions
-  // of the optimized Conv. This test just mimics something that happens inside
-  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
-  data->need_im2col =
-      (params->stride_width != 1 || params->stride_height != 1 ||
-       filter_width != 1 || filter_height != 1);
-  // If we're using the optimized multithreaded EigenTensor implementation of
-  // convolution, it expects the filter weights to be transposed compared to
-  // the normal TF Lite buffer format. Typical TF Lite weights are
-  // [filter_count, filter_height, filter_width, input_depth], but for the float
-  // implementation we need them as [filter_height, filter_width, input_depth,
-  // filter_count]. We get to that format by transposing, and create a temporary
-  // buffer to store the results.
-  // This path is only used for float processing, so only create the buffer if
-  // we're running with that data type.
-  data->need_hwcn_weights = (data_type == kTfLiteFloat32);
-
-  int temporaries_count = 0;
-  if (data->need_im2col) {
-    data->im2col_index = temporaries_count;
-    ++temporaries_count;
-  }
-  if (data->need_hwcn_weights) {
-    data->hwcn_weights_index = temporaries_count;
-    ++temporaries_count;
-  }
-
-  TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(temporaries_count);
-
   if (data->need_im2col) {
     node->temporaries->data[data->im2col_index] = data->im2col_id;
 
@@ -338,28 +380,40 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-
-  switch (kernel_type) {
+  KernelType effective_kernel_type;
+  if (((kernel_type == kMultithreadOptimized) ||
+       (kernel_type == kCblasOptimized)) &&
+      ((params->dilation_width_factor != 1) ||
+       (params->dilation_height_factor != 1))) {
+    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // Therefore, fallback to optimized.
+    effective_kernel_type = kGenericOptimized;
+  } else {
+    effective_kernel_type = kernel_type;
+  }
+  switch (effective_kernel_type) {
     case kReference: {
-      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                          GetTensorData<float>(filter), GetTensorDims(filter),
-                          GetTensorData<float>(bias), GetTensorDims(bias),
-                          params->stride_width, params->stride_height,
-                          data->padding.width, data->padding.height,
-                          output_activation_min, output_activation_max,
-                          GetTensorData<float>(output), GetTensorDims(output),
-                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      reference_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(filter), GetTensorDims(filter),
+          GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+          params->stride_height, params->dilation_width_factor,
+          params->dilation_height_factor, data->padding.width,
+          data->padding.height, output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
     case kGenericOptimized: {
-      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                          GetTensorData<float>(filter), GetTensorDims(filter),
-                          GetTensorData<float>(bias), GetTensorDims(bias),
-                          params->stride_width, params->stride_height,
-                          data->padding.width, data->padding.height,
-                          output_activation_min, output_activation_max,
-                          GetTensorData<float>(output), GetTensorDims(output),
-                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      optimized_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(filter), GetTensorDims(filter),
+          GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+          params->stride_height, params->dilation_width_factor,
+          params->dilation_height_factor, data->padding.width,
+          data->padding.height, output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
     case kMultithreadOptimized: {
@@ -422,8 +476,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // separate ops to avoid dispatch overhead here.
   switch (input->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
-      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
-                             im2col, hwcn_weights, output);
+      if (data->run_multithreaded_kernel) {
+        EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+                               im2col, hwcn_weights, output);
+      } else {
+        EvalFloat<kGenericOptimized>(context, node, params, data, input, filter,
+                                     bias, im2col, hwcn_weights, output);
+      }
       break;
     case kTfLiteUInt8:
       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index d2393c3c97bb9516e2b8a6c8ae037dc0dfdfe64b..0dcfc826fd218d2d2dfbf89201d2c13fbfe6f0e1 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -46,7 +46,8 @@ class BaseConvolutionOpModel : public SingleOpModel {
       TfLiteRegistration* registration, const TensorData& input,
       const TensorData& filter, const TensorData& output, int stride_width = 2,
       int stride_height = 2, enum Padding padding = Padding_VALID,
-      enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE,
+      int dilation_width_factor = 1, int dilation_height_factor = 1) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -71,8 +72,9 @@ class BaseConvolutionOpModel : public SingleOpModel {
     }
 
     SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions,
-                 CreateConv2DOptions(builder_, padding, stride_width,
-                                     stride_height, activation)
+                 CreateConv2DOptions(
+                     builder_, padding, stride_width, stride_height, activation,
+                     dilation_width_factor, dilation_height_factor)
                      .Union());
 
     resolver_ = absl::make_unique<SingleOpResolver>(BuiltinOperator_CONV_2D,
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index 15dbfe08c82befcf001b9ed9a053528b5606053e..eeda1bc3c5ba2da5b6546ce36925a6f20fc9cbae 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -52,7 +52,7 @@ enum KernelType {
 struct OpData {
   TfLitePaddingValues padding;
   // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multipler plus a left shift.
+  // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
   // The range of the fused activation layer. For example for kNone and
@@ -140,10 +140,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int out_height =
       compute_out_size(height, filter_height, params->stride_height);
 
-  data->padding.height =
-      ComputePadding(params->stride_height, height, filter_height, out_height);
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
   data->padding.width =
-      ComputePadding(params->stride_width, width, filter_width, out_width);
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
 
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
diff --git a/tensorflow/contrib/lite/kernels/dequantize.cc b/tensorflow/contrib/lite/kernels/dequantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e685f2465f627cf30e02564e6f16e1ec69e208e2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/dequantize.cc
@@ -0,0 +1,77 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace dequantize {
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteTensor* input;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  OpContext op_context(context, node);
+
+  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8);
+
+  op_context.output->type = kTfLiteFloat32;
+  return context->ResizeTensor(context, op_context.output,
+                               TfLiteIntArrayCopy(op_context.input->dims));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+
+  auto zero_point = op_context.input->params.zero_point;
+  auto scale = op_context.input->params.scale;
+
+  optimized_ops::Dequantize(GetTensorData<uint8_t>(op_context.input),
+                            GetTensorDims(op_context.input), zero_point, scale,
+                            GetTensorData<float>(op_context.output),
+                            GetTensorDims(op_context.output));
+  return kTfLiteOk;
+}
+
+}  // namespace dequantize
+
+TfLiteRegistration* Register_DEQUANTIZE_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, dequantize::Prepare,
+                                 dequantize::Eval};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEQUANTIZE() { return Register_DEQUANTIZE_OPT(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/dequantize_test.cc b/tensorflow/contrib/lite/kernels/dequantize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fcd74206177a0a97db168338e3619d4b95c052a9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/dequantize_test.cc
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class DequantizeOpModel : public SingleOpModel {
+ public:
+  DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
+    input_ = AddInput({TensorType_UINT8, shape, min, max});
+    output_ = AddOutput({TensorType_FLOAT32, shape});
+    SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
+                 CreateDequantizeOptions(builder_).Union());
+
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  void SetInput(std::initializer_list<uint8_t> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(SplitOpTest, FourDimensional) {
+  DequantizeOpModel m({2, 5}, -63.5, 64);
+
+  m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index 44bd0dc85d50c98ec6b6888e05064a8f2e2731c0..ec380c8e4956e5bcd0d7559bfd8f89a52d9d233c 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,51 +61,67 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
-void EvalDivFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteDivParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDivParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_DIV(type)                                        \
-  type::Div(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_DIV(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
   if (kernel_type == kReference) {
-    TF_LITE_DIV(reference_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_DIV(reference_ops, BroadcastDiv);
+    } else {
+      TF_LITE_DIV(reference_ops, Div);
+    }
   } else {
-    TF_LITE_DIV(optimized_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_DIV(optimized_ops, BroadcastDiv);
+    } else {
+      TF_LITE_DIV(optimized_ops, Div);
+    }
   }
 #undef TF_LITE_DIV
 }
 
+
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalDivFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Div only supports FLOAT32 and quantized UINT8 now.");
     return kTfLiteError;
   }
 
@@ -99,19 +131,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace div
 
 TfLiteRegistration* Register_DIV_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_DIV_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_DIV_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/div_test.cc b/tensorflow/contrib/lite/kernels/div_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..276b8289fbc1b4dcbf4624b76b854300d0fd4912
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/div_test.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseDivOpModel : public SingleOpModel {
+ public:
+  BaseDivOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_DIV, BuiltinOptions_DivOptions,
+                 CreateDivOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatDivOpModel : public BaseDivOpModel {
+ public:
+  using BaseDivOpModel::BaseDivOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(FloatDivOpTest, NoActivation) {
+  FloatDivOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.5, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.4, 1.0, 0.8, 1.6})));
+}
+
+TEST(FloatDivOpTest, ActivationRELU_N1_TO_1) {
+  FloatDivOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-1.0, 1.0, 0.8, 1.0})));
+}
+
+TEST(FloatDivOpTest, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.3, 0.8, 1.1, -2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.6, 0.5, -1.1, -0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-20.0, 1.0, 0.5, 1.6, -1.0, 20.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatDivOpTest, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, 0.07, 0.08, 0.11, -0.123});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.0, 2.0, 0.7, 0.8, 1.1, -1.23})))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.cc b/tensorflow/contrib/lite/kernels/eigen_support.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97f5e908240d76a88b3963c4d14aa3a6f1ea313c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/eigen_support.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/eigen_support.h"
+
+#include "Eigen/Core"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace eigen_support {
+
+struct RefCountedEigenContext {
+  int num_references = 0;
+};
+
+void IncrementUsageCounter(TfLiteContext* context) {
+  auto* ptr = reinterpret_cast<RefCountedEigenContext*>(context->eigen_context);
+  if (ptr == nullptr) {
+    if (context->recommended_num_threads != -1) {
+      Eigen::setNbThreads(context->recommended_num_threads);
+    }
+    ptr = new RefCountedEigenContext;
+    ptr->num_references = 0;
+    context->eigen_context = ptr;
+  }
+  ptr->num_references++;
+}
+
+void DecrementUsageCounter(TfLiteContext* context) {
+  auto* ptr = reinterpret_cast<RefCountedEigenContext*>(context->eigen_context);
+  if (ptr == nullptr) {
+    TF_LITE_FATAL(
+        "Call to DecrementUsageCounter() not preceded by "
+        "IncrementUsageCounter()");
+  }
+  if (--ptr->num_references == 0) {
+    delete ptr;
+    context->eigen_context = nullptr;
+  }
+}
+
+void SetNumThreads(TfLiteContext* context, int num_threads) {
+  IncrementUsageCounter(context);
+  Eigen::setNbThreads(num_threads);
+  DecrementUsageCounter(context);
+}
+
+}  // namespace eigen_support
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.h b/tensorflow/contrib/lite/kernels/eigen_support.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa8c351fd8e8dae45f7d4807ce24d80bb393c41c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/eigen_support.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+namespace eigen_support {
+
+// Let the framework know that the op will be using Eigen. If necessary a set of
+// temporary Eigen objects might be created and placed in 'context'.
+void IncrementUsageCounter(TfLiteContext* context);
+
+// Let the framework know that the op stopped using Eigen. If there are no more
+// usages all temporary Eigen objects will be deleted.
+void DecrementUsageCounter(TfLiteContext* context);
+
+// Set the number of threads that can be used by Eigen.
+void SetNumThreads(TfLiteContext* context, int num_threads);
+
+}  // namespace eigen_support
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/floor.cc b/tensorflow/contrib/lite/kernels/floor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b4395f711614a3b7047dc8f144ca3fa36b8a89b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/floor.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace floor {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  output->type = input->type;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  optimized_ops::Floor(GetTensorData<float>(input), GetTensorDims(input),
+                       GetTensorData<float>(output), GetTensorDims(output));
+  return kTfLiteOk;
+}
+}  // namespace floor
+
+TfLiteRegistration* Register_FLOOR() {
+  static TfLiteRegistration r = {/*init=*/nullptr,
+                                 /*free=*/nullptr, floor::Prepare, floor::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/floor_test.cc b/tensorflow/contrib/lite/kernels/floor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b71e0400b6dc92899721342fc4ebbd51a8876455
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/floor_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class FloorOpModel : public SingleOpModel {
+ public:
+  FloorOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_FLOOR, BuiltinOptions_NONE, 0);
+    BuildInterpreter({
+        input_shape,
+    });
+  }
+
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(FloorOpTest, SingleDim) {
+  FloorOpModel model({2}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {8.5, 0.0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({8, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(FloorOpTest, MultiDims) {
+  FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {
+                                                 0.0001,
+                                                 8.0001,
+                                                 0.9999,
+                                                 9.9999,
+                                                 0.5,
+                                                 -0.0001,
+                                                 -8.0001,
+                                                 -0.9999,
+                                                 -9.9999,
+                                                 -0.5,
+                                             });
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index a77fe94e499078bc2f0660e8e49fd557ed0f625d..888e67966c0a408257e763a405bf6e928310f4d9 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -48,7 +48,7 @@ enum KernelType {
 
 struct OpData {
   // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multipler plus a left shift.
+  // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
   // The range of the fused activation layer. For example for kNone and
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index a0f766c4f4580d7679275c0b63aa200410fcb5ad..87413000a93a0a361d81b1f0eb46550b5b90f9ac 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -19,12 +19,25 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/model.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF();
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
 using ::testing::ElementsAre;
@@ -119,7 +132,8 @@ static float fully_connected_golden_output[] = {
 class BaseFullyConnectedOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): test different activation types too.
-  BaseFullyConnectedOpModel(int units, int batches, const TensorData& input,
+  BaseFullyConnectedOpModel(TfLiteRegistration* registration, int units,
+                            int batches, const TensorData& input,
                             const TensorData& output = {TensorType_FLOAT32})
       : batches_(batches), units_(units) {
     int total_input_size = 1;
@@ -149,6 +163,8 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
         BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
         CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
             .Union());
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_FULLY_CONNECTED, registration);
     BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
   }
 
@@ -208,10 +224,25 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
   }
 };
 
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
+    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
+    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
+    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
+});
+
+class FullyConnectedOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
 // TODO(ahentz): add more small tests like this one, focused on making sure the
 // calculations are correct.
-TEST(FullyConnectedOpTest, SimpleTest) {
-  FloatFullyConnectedOpModel m(3, 2, {TensorType_FLOAT32, {2, 10}});
+TEST_P(FullyConnectedOpTest, SimpleTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), 3, 2,
+                               {TensorType_FLOAT32, {2, 10}});
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
@@ -229,9 +260,9 @@ TEST(FullyConnectedOpTest, SimpleTest) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
 }
 
-TEST(FullyConnectedOpTest, SimpleTestQuantized) {
+TEST_P(FullyConnectedOpTest, SimpleTestQuantized) {
   QuantizedFullyConnectedOpModel m(
-      3, 2,
+      GetRegistration(), 3, 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
@@ -261,7 +292,8 @@ TEST(FullyConnectedOpTest, SimpleTest4DInput) {
   // Note that it is not required that the first dimension be the number of
   // batches. All we care is that the input can be evenly distributed in
   // batches. In this case, we need the input to have multiples of '2'.
-  FloatFullyConnectedOpModel m(/*units=*/3,
+  FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(),
+                               /*units=*/3,
                                /*batches=*/2,
                                /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
   m.SetWeights({
@@ -284,9 +316,9 @@ TEST(FullyConnectedOpTest, SimpleTest4DInput) {
                              }));
 }
 
-TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
+TEST_P(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
   QuantizedFullyConnectedOpModel m(
-      3, 2,
+      GetRegistration(), 3, 2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
@@ -312,10 +344,15 @@ TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+INSTANTIATE_TEST_CASE_P(
+    FullyConnectedOpTest, FullyConnectedOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 // TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
 // to debug errors and doesn't necessarily test all the important details.
-TEST(FullyConnectedOpTest, BlackBoxTest) {
-  FloatFullyConnectedOpModel m(16, 2, {TensorType_FLOAT32, {2, 8}});
+TEST_P(FullyConnectedOpTest, BlackBoxTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), 16, 2,
+                               {TensorType_FLOAT32, {2, 8}});
   m.SetWeights(
       {0.091327,  0.103366,  -0.316505, -0.083120, 0.149366,  -0.196636,
        -0.123672, 0.062800,  0.063031,  0.191670,  -0.062001, -0.061504,
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.cc b/tensorflow/contrib/lite/kernels/gemm_support.cc
index eb2b0aacf7ecc3ed5dbde5ccce7a46dcda0a93b3..95f45ea768be7f9bae9570563f161792afbff436 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.cc
+++ b/tensorflow/contrib/lite/kernels/gemm_support.cc
@@ -29,6 +29,9 @@ void IncrementUsageCounter(TfLiteContext* context) {
   if (ptr == nullptr) {
     ptr = new RefCountedGemmContext;
     ptr->gemm_context_ = new gemmlowp::GemmContext();
+    if (context->recommended_num_threads != -1) {
+      ptr->gemm_context_->set_max_num_threads(context->recommended_num_threads);
+    }
     ptr->num_references_ = 0;
     context->gemm_context = ptr;
   }
@@ -58,7 +61,7 @@ gemmlowp::GemmContext* GetFromContext(TfLiteContext* context) {
   return ptr->gemm_context_;
 }
 
-void SetMaxNumThreads(TfLiteContext* context, int num_threads) {
+void SetNumThreads(TfLiteContext* context, int num_threads) {
   IncrementUsageCounter(context);
   GetFromContext(context)->set_max_num_threads(num_threads);
   DecrementUsageCounter(context);
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h
index 466781cbcecc7fb851d9078c450cc6c12364d2bb..f033501cb6e341aa014fa4d956b531bd79aa555b 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.h
+++ b/tensorflow/contrib/lite/kernels/gemm_support.h
@@ -45,8 +45,8 @@ void IncrementUsageCounter(TfLiteContext* context);
 // 'context'. If there are no more usages the GemmContext will be deleted.
 void DecrementUsageCounter(TfLiteContext* context);
 
-// Set the maximum number threads available for gemmlowp operations.
-void SetMaxNumThreads(TfLiteContext* context, int num_threads);
+// Set the number of threads that can be used by gemmlowp.
+void SetNumThreads(TfLiteContext* context, int num_threads);
 
 }  // namespace gemm_support
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index f47fb04cbaa688b75e763ff9d3cb7df44ac3f166..c5539afb9c84d030c6b7835266396185e73592c7 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -10,21 +10,25 @@ tflite_deps_intel = [
     "@arm_neon_2_x86_sse",
 ]
 
+HARD_FP_FLAGS_IF_APPLICABLE = select({
+    "//tensorflow:android_arm": ["-mfloat-abi=softfp"],
+    "//tensorflow:android_arm64": ["-mfloat-abi=softfp"],
+    "//tensorflow:android_armeabi": ["-mfloat-abi=softfp"],
+    "//conditions:default": [],
+})
+
 NEON_FLAGS_IF_APPLICABLE = select({
     ":arm": [
         "-O3",
         "-mfpu=neon",
-        "-mfloat-abi=softfp",
     ],
     ":armeabi-v7a": [
         "-O3",
         "-mfpu=neon",
-        "-mfloat-abi=softfp",
     ],
     ":armv7a": [
         "-O3",
         "-mfpu=neon",
-        "-mfloat-abi=softfp",
     ],
     "//conditions:default": [
         "-O3",
@@ -145,10 +149,13 @@ cc_library(
         "common.h",
         "optimized/depthwiseconv_float.h",
         "optimized/depthwiseconv_uint8.h",
+        "optimized/depthwiseconv_uint8_3x3_filter.h",
         "optimized/optimized_ops.h",
     ],
     copts = tflite_copts(),
     deps = [
+        ":quantization_util",
+        ":strided_slice_logic",
         ":types",
         ":round",
         "//third_party/eigen3",
@@ -208,7 +215,10 @@ cc_library(
         "compatibility.h",
         "quantization_util.h",
     ],
-    deps = [":round"],
+    deps = [
+        ":round",
+        ":types",
+    ],
 )
 
 cc_test(
@@ -220,6 +230,17 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "strided_slice_logic",
+    srcs = [],
+    hdrs = [
+        "strided_slice_logic.h",
+    ],
+    deps = [
+        ":types",
+    ],
+)
+
 cc_library(
     name = "reference_base",
     srcs = [],
@@ -230,7 +251,9 @@ cc_library(
         "reference/reference_ops.h",
     ],
     deps = [
+        ":quantization_util",
         ":round",
+        ":strided_slice_logic",
         ":types",
         "//third_party/eigen3",
         "@gemmlowp",
@@ -266,6 +289,7 @@ cc_library(
         "reference/portable_tensor_utils.h",
     ],
     deps = [
+        ":round",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
         "//tensorflow/contrib/lite/kernels:op_macros",
@@ -283,10 +307,11 @@ cc_library(
         "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
     ],
-    copts = NEON_FLAGS_IF_APPLICABLE,
+    copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
     deps = [
         ":cpu_check",
         ":portable_tensor_utils",
+        ":round",
         ":types",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
@@ -305,6 +330,27 @@ cc_library(
     ],
 )
 
+# Audio support classes imported directly from TensorFlow.
+cc_library(
+    name = "audio_utils",
+    srcs = [
+        "mfcc.cc",
+        "mfcc_dct.cc",
+        "mfcc_mel_filterbank.cc",
+        "spectrogram.cc",
+    ],
+    hdrs = [
+        "mfcc.h",
+        "mfcc_dct.h",
+        "mfcc_mel_filterbank.h",
+        "spectrogram.h",
+    ],
+    deps = [
+        "//third_party/fft2d:fft2d_headers",
+        "@fft2d",
+    ],
+)
+
 cc_library(
     name = "tensor_utils",
     srcs = [
@@ -401,16 +447,13 @@ cc_library(
     ),
 )
 
-exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+cc_test(
+    name = "batch_to_space_nd_test",
+    srcs = ["batch_to_space_nd_test.cc"],
+    deps = [
+        ":optimized_base",
+        "@com_google_googletest//:gtest_main",
+    ],
 )
+
+exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
diff --git a/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a2901ac8c297265e542cc30d3127fe774c19e78
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+// A light wrapper of GetIndexRange which returns a pair of start / end
+// indices.
+std::pair<int, int> GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                                  int input_dim, int output_dim) {
+  int index_start = 0;
+  int index_end = 0;
+  optimized_ops::GetIndexRange(spatial_index_dim, block_shape_dim, input_dim,
+                               output_dim, &index_start, &index_end);
+  return {index_start, index_end};
+}
+
+TEST(BatchToSpaceNDTest, TestIndexRange) {
+  // Simple test case, no cropping.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/6,
+                          /*input_dim=*/1, /*output_dim=*/6),
+            std::make_pair(0, 1));
+
+  // No cropping and input_dim > 1.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/2, /*block_shape_dim=*/6,
+                          /*input_dim=*/5, /*output_dim=*/30),
+            std::make_pair(0, 5));
+
+  // With small cropping values (can be either at the beginning or at the end).
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(0, 2));
+
+  // With positive cropping values at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-2, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(1, 3));
+
+  // Large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-26, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  // Large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/4, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Rounding up incorrectly will fail this test.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Extreme cropping with output of a single spatial location.
+  // Valid position 1, when large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 1));
+
+  // Valid position 2, when large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 7));
+
+  // Invalid positions.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/1, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 0));
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-29, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 6));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 51426bb1c584b82af7b1a2ffaf5a675a1dd9a6fd..93fc6b6a76f67e2e75ba3a766e5ea6fb6bada77a 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -77,6 +77,7 @@ limitations under the License.
 #endif
 
 // TODO(ahentz): Clean up.
+using int8 = std::int8_t;
 using uint8 = std::uint8_t;
 using int16 = std::int16_t;
 using uint16 = std::uint16_t;
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 510395126ce3785b1d44fec1e0eb994c29ff0db7..f142374269606bdd3d4184af013749102666ab89 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -40,5 +40,152 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                                         hidden_state_ptr_batch);
 }
 
+void LstmStep(
+    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
+    const float* input_to_forget_weights_ptr,
+    const float* input_to_cell_weights_ptr,
+    const float* input_to_output_weights_ptr,
+    const float* recurrent_to_input_weights_ptr,
+    const float* recurrent_to_forget_weights_ptr,
+    const float* recurrent_to_cell_weights_ptr,
+    const float* recurrent_to_output_weights_ptr,
+    const float* cell_to_input_weights_ptr,
+    const float* cell_to_forget_weights_ptr,
+    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
+    float* cell_state_ptr, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* output_ptr_batch) {
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
+                                          input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                        forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                        output_gate_scratch);
+
+  // For each batch and cell: compute input_weight * input.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+        input_gate_scratch, /*result_stride=*/1);
+  }
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      forget_gate_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      output_gate_scratch, /*result_stride=*/1);
+
+  // For each batch and cell: compute recurrent_weight * output_state.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
+        n_batch, input_gate_scratch,
+        /*result_stride=*/1);
+  }
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, forget_gate_scratch,
+      /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, output_gate_scratch,
+      /*result_stride=*/1);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
+                                         n_batch * n_cell, cell_state_ptr);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                             params->cell_clip, cell_state_ptr);
+  }
+
+  // For each batch and cell: update the output gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  // For each batch: update the projection and output_state.
+  const bool use_projection_weight = (projection_weights_ptr != nullptr);
+  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  if (use_projection_weight) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                            n_batch, output_ptr_batch);
+    } else {
+      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
+        output_ptr_batch, /*result_stride=*/1);
+    if (params->proj_clip > 0.0) {
+      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                               params->proj_clip, output_ptr_batch);
+    }
+  } else {
+    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                             output_ptr_batch);
+  }
+  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                           output_state_ptr);
+}
+
 }  // namespace kernel_utils
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
index 9872d4500b862388ed4b96c97e3755f548e35d35..3ec60ee57a87833959a34ba95d32df15bea188a4 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -35,6 +35,42 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                   TfLiteFusedActivation activation,
                   float* hidden_state_ptr_batch, float* output_ptr_batch);
 
+// Performs an LSTM batch inference step for input specified by input_ptr_batch.
+// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
+// biases (*_bias_ptr), and buffers (*_scratch), along with additional
+// parameters:
+//  - params: various LSTM params including activation, clipping, etc.,
+//  - n_batch: size of batch,
+//  - n_cell: number of cells (or units),
+//  - n_input: the input size,
+//  - n_output: the output size.
+//
+// The pointers to the cell and output state and the output are updated. Unless
+// projection is specified output and output state contain the same data.
+//
+// The pointers with the suffix "_batch" point to data aligned in batch_major
+// order, and each step processes batch_size many inputs from input_ptr_batch,
+// and updates batch_size many cell and output states.
+void LstmStep(
+    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
+    const float* input_to_forget_weights_ptr,
+    const float* input_to_cell_weights_ptr,
+    const float* input_to_output_weights_ptr,
+    const float* recurrent_to_input_weights_ptr,
+    const float* recurrent_to_forget_weights_ptr,
+    const float* recurrent_to_cell_weights_ptr,
+    const float* recurrent_to_output_weights_ptr,
+    const float* cell_to_input_weights_ptr,
+    const float* cell_to_forget_weights_ptr,
+    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
+    float* cell_state_ptr, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* output_ptr_batch);
+
 }  // namespace kernel_utils
 }  // namespace tflite
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc.cc b/tensorflow/contrib/lite/kernels/internal/mfcc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eafe0c7afee6fabd5a4a258aa5176e23f5e8d62a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/mfcc.cc
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <math.h>
+
+#include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
+
+namespace tflite {
+namespace internal {
+
+const double kDefaultUpperFrequencyLimit = 4000;
+const double kDefaultLowerFrequencyLimit = 20;
+const double kFilterbankFloor = 1e-12;
+const int kDefaultFilterbankChannelCount = 40;
+const int kDefaultDCTCoefficientCount = 13;
+
+Mfcc::Mfcc()
+    : initialized_(false),
+      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+      filterbank_channel_count_(kDefaultFilterbankChannelCount),
+      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
+
+bool Mfcc::Initialize(int input_length, double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(
+      input_length, input_sample_rate, filterbank_channel_count_,
+      lower_frequency_limit_, upper_frequency_limit_);
+  initialized &=
+      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
+  initialized_ = initialized;
+  return initialized;
+}
+
+void Mfcc::Compute(const std::vector<double>& spectrogram_frame,
+                   std::vector<double>* output) const {
+  if (!initialized_) {
+    // LOG(ERROR) << "Mfcc not initialized.";
+    return;
+  }
+  std::vector<double> working;
+  mel_filterbank_.Compute(spectrogram_frame, &working);
+  for (int i = 0; i < working.size(); ++i) {
+    double val = working[i];
+    if (val < kFilterbankFloor) {
+      val = kFilterbankFloor;
+    }
+    working[i] = log(val);
+  }
+  dct_.Compute(working, output);
+}
+
+}  // namespace internal
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc.h b/tensorflow/contrib/lite/kernels/internal/mfcc.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8500ecdcf38e5dcfe9eb89915501678455b3dd9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/mfcc.h
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic class for computing MFCCs from spectrogram slices.
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
+
+namespace tflite {
+namespace internal {
+
+class Mfcc {
+ public:
+  Mfcc();
+  bool Initialize(int input_length, double input_sample_rate);
+
+  // Input is a single squared-magnitude spectrogram frame. The input spectrum
+  // is converted to linear magnitude and weighted into bands using a
+  // triangular mel filterbank, and a discrete cosine transform (DCT) of the
+  // values is taken. Output is populated with the lowest dct_coefficient_count
+  // of these values.
+  void Compute(const std::vector<double>& spectrogram_frame,
+               std::vector<double>* output) const;
+
+  void set_upper_frequency_limit(double upper_frequency_limit) {
+    // CHECK(!initialized_) << "Set frequency limits before calling
+    // Initialize.";
+    upper_frequency_limit_ = upper_frequency_limit;
+  }
+
+  void set_lower_frequency_limit(double lower_frequency_limit) {
+    // CHECK(!initialized_) << "Set frequency limits before calling
+    // Initialize.";
+    lower_frequency_limit_ = lower_frequency_limit;
+  }
+
+  void set_filterbank_channel_count(int filterbank_channel_count) {
+    /// CHECK(!initialized_) << "Set channel count before calling Initialize.";
+    filterbank_channel_count_ = filterbank_channel_count;
+  }
+
+  void set_dct_coefficient_count(int dct_coefficient_count) {
+    // CHECK(!initialized_) << "Set coefficient count before calling
+    // Initialize.";
+    dct_coefficient_count_ = dct_coefficient_count;
+  }
+
+ private:
+  MfccMelFilterbank mel_filterbank_;
+  MfccDct dct_;
+  bool initialized_;
+  double lower_frequency_limit_;
+  double upper_frequency_limit_;
+  int filterbank_channel_count_;
+  int dct_coefficient_count_;
+};
+
+}  // namespace internal
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_dct.cc b/tensorflow/contrib/lite/kernels/internal/mfcc_dct.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0b7d181bdcf01688a387f33a3e64fc904324b50
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/mfcc_dct.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
+
+#include <math.h>
+
+namespace tflite {
+namespace internal {
+
+MfccDct::MfccDct() : initialized_(false) {}
+
+bool MfccDct::Initialize(int input_length, int coefficient_count) {
+  coefficient_count_ = coefficient_count;
+  input_length_ = input_length;
+
+  if (coefficient_count_ < 1) {
+    return false;
+  }
+
+  if (input_length < 1) {
+    return false;
+  }
+
+  if (coefficient_count_ > input_length_) {
+    return false;
+  }
+
+  cosines_.resize(coefficient_count_);
+  double fnorm = sqrt(2.0 / input_length_);
+  // Some platforms don't have M_PI, so define a local constant here.
+  const double pi = atan(1) * 4;
+  double arg = pi / input_length_;
+  for (int i = 0; i < coefficient_count_; ++i) {
+    cosines_[i].resize(input_length_);
+    for (int j = 0; j < input_length_; ++j) {
+      cosines_[i][j] = fnorm * cos(i * arg * (j + 0.5));
+    }
+  }
+  initialized_ = true;
+  return true;
+}
+
+void MfccDct::Compute(const std::vector<double> &input,
+                      std::vector<double> *output) const {
+  if (!initialized_) {
+    return;
+  }
+
+  output->resize(coefficient_count_);
+  int length = input.size();
+  if (length > input_length_) {
+    length = input_length_;
+  }
+
+  for (int i = 0; i < coefficient_count_; ++i) {
+    double sum = 0.0;
+    for (int j = 0; j < length; ++j) {
+      sum += cosines_[i][j] * input[j];
+    }
+    (*output)[i] = sum;
+  }
+}
+
+}  // namespace internal
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_dct.h b/tensorflow/contrib/lite/kernels/internal/mfcc_dct.h
new file mode 100644
index 0000000000000000000000000000000000000000..a53f5cbd9bb70c7c9dd49672681140bb9cbd2f4f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/mfcc_dct.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic minimal DCT class for MFCC speech processing.
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
+
+#include <vector>
+
+namespace tflite {
+namespace internal {
+
+class MfccDct {
+ public:
+  MfccDct();
+  bool Initialize(int input_length, int coefficient_count);
+  void Compute(const std::vector<double>& input,
+               std::vector<double>* output) const;
+
+ private:
+  bool initialized_;
+  int coefficient_count_;
+  int input_length_;
+  std::vector<std::vector<double> > cosines_;
+};
+
+}  // namespace internal
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.cc b/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3deb33d91a47bfe54b7c84d2a615df2422f90cc
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.cc
@@ -0,0 +1,204 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This code resamples the FFT bins, and smooths then with triangle-shaped
+// weights to create a mel-frequency filter bank. For filter i centered at f_i,
+// there is a triangular weighting of the FFT bins that extends from
+// filter f_i-1 (with a value of zero at the left edge of the triangle) to f_i
+// (where the filter value is 1) to f_i+1 (where the filter values returns to
+// zero).
+
+// Note: this code fails if you ask for too many channels.  The algorithm used
+// here assumes that each FFT bin contributes to at most two channels: the
+// right side of a triangle for channel i, and the left side of the triangle
+// for channel i+1.  If you ask for so many channels that some of the
+// resulting mel triangle filters are smaller than a single FFT bin, these
+// channels may end up with no contributing FFT bins.  The resulting mel
+// spectrum output will have some channels that are always zero.
+
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
+
+#include <math.h>
+
+namespace tflite {
+namespace internal {
+
+MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
+
+bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
+                                   int output_channel_count,
+                                   double lower_frequency_limit,
+                                   double upper_frequency_limit) {
+  num_channels_ = output_channel_count;
+  sample_rate_ = input_sample_rate;
+  input_length_ = input_length;
+
+  if (num_channels_ < 1) {
+    // LOG(ERROR) << "Number of filterbank channels must be positive.";
+    return false;
+  }
+
+  if (sample_rate_ <= 0) {
+    // LOG(ERROR) << "Sample rate must be positive.";
+    return false;
+  }
+
+  if (input_length < 2) {
+    // LOG(ERROR) << "Input length must greater than 1.";
+    return false;
+  }
+
+  if (lower_frequency_limit < 0) {
+    // LOG(ERROR) << "Lower frequency limit must be nonnegative.";
+    return false;
+  }
+
+  if (upper_frequency_limit <= lower_frequency_limit) {
+    /// LOG(ERROR) << "Upper frequency limit must be greater than "
+    //           << "lower frequency limit.";
+    return false;
+  }
+
+  // An extra center frequency is computed at the top to get the upper
+  // limit on the high side of the final triangular filter.
+  center_frequencies_.resize(num_channels_ + 1);
+  const double mel_low = FreqToMel(lower_frequency_limit);
+  const double mel_hi = FreqToMel(upper_frequency_limit);
+  const double mel_span = mel_hi - mel_low;
+  const double mel_spacing = mel_span / static_cast<double>(num_channels_ + 1);
+  for (int i = 0; i < num_channels_ + 1; ++i) {
+    center_frequencies_[i] = mel_low + (mel_spacing * (i + 1));
+  }
+
+  // Always exclude DC; emulate HTK.
+  const double hz_per_sbin =
+      0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
+  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
+  end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
+
+  // Maps the input spectrum bin indices to filter bank channels/indices. For
+  // each FFT bin, band_mapper tells us which channel this bin contributes to
+  // on the right side of the triangle.  Thus this bin also contributes to the
+  // left side of the next channel's triangle response.
+  band_mapper_.resize(input_length_);
+  int channel = 0;
+  for (int i = 0; i < input_length_; ++i) {
+    double melf = FreqToMel(i * hz_per_sbin);
+    if ((i < start_index_) || (i > end_index_)) {
+      band_mapper_[i] = -2;  // Indicate an unused Fourier coefficient.
+    } else {
+      while ((center_frequencies_[channel] < melf) &&
+             (channel < num_channels_)) {
+        ++channel;
+      }
+      band_mapper_[i] = channel - 1;  // Can be == -1
+    }
+  }
+
+  // Create the weighting functions to taper the band edges.  The contribution
+  // of any one FFT bin is based on its distance along the continuum between two
+  // mel-channel center frequencies.  This bin contributes weights_[i] to the
+  // current channel and 1-weights_[i] to the next channel.
+  weights_.resize(input_length_);
+  for (int i = 0; i < input_length_; ++i) {
+    channel = band_mapper_[i];
+    if ((i < start_index_) || (i > end_index_)) {
+      weights_[i] = 0.0;
+    } else {
+      if (channel >= 0) {
+        weights_[i] =
+            (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
+            (center_frequencies_[channel + 1] - center_frequencies_[channel]);
+      } else {
+        weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
+                      (center_frequencies_[0] - mel_low);
+      }
+    }
+  }
+  // Check the sum of FFT bin weights for every mel band to identify
+  // situations where the mel bands are so narrow that they don't get
+  // significant weight on enough (or any) FFT bins -- i.e., too many
+  // mel bands have been requested for the given FFT size.
+  std::vector<int> bad_channels;
+  for (int c = 0; c < num_channels_; ++c) {
+    float band_weights_sum = 0.0;
+    for (int i = 0; i < input_length_; ++i) {
+      if (band_mapper_[i] == c - 1) {
+        band_weights_sum += (1.0 - weights_[i]);
+      } else if (band_mapper_[i] == c) {
+        band_weights_sum += weights_[i];
+      }
+    }
+    // The lowest mel channels have the fewest FFT bins and the lowest
+    // weights sum.  But given that the target gain at the center frequency
+    // is 1.0, if the total sum of weights is 0.5, we're in bad shape.
+    if (band_weights_sum < 0.5) {
+      bad_channels.push_back(c);
+    }
+  }
+  if (!bad_channels.empty()) {
+    /*
+    LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
+               << " starting at " << bad_channels[0]
+               << " in mel-frequency design. "
+               << "Perhaps too many channels or "
+               << "not enough frequency resolution in spectrum. ("
+               << "input_length: " << input_length
+               << " input_sample_rate: " << input_sample_rate
+               << " output_channel_count: " << output_channel_count
+               << " lower_frequency_limit: " << lower_frequency_limit
+               << " upper_frequency_limit: " << upper_frequency_limit;
+               */
+  }
+  initialized_ = true;
+  return true;
+}
+
+// Compute the mel spectrum from the squared-magnitude FFT input by taking the
+// square root, then summing FFT magnitudes under triangular integration windows
+// whose widths increase with frequency.
+void MfccMelFilterbank::Compute(const std::vector<double> &input,
+                                std::vector<double> *output) const {
+  if (!initialized_) {
+    // LOG(ERROR) << "Mel Filterbank not initialized.";
+    return;
+  }
+
+  if (input.size() <= end_index_) {
+    // LOG(ERROR) << "Input too short to compute filterbank";
+    return;
+  }
+
+  // Ensure output is right length and reset all values.
+  output->assign(num_channels_, 0.0);
+
+  for (int i = start_index_; i <= end_index_; i++) {  // For each FFT bin
+    double spec_val = sqrt(input[i]);
+    double weighted = spec_val * weights_[i];
+    int channel = band_mapper_[i];
+    if (channel >= 0)
+      (*output)[channel] += weighted;  // Right side of triangle, downward slope
+    channel++;
+    if (channel < num_channels_)
+      (*output)[channel] += spec_val - weighted;  // Left side of triangle
+  }
+}
+
+double MfccMelFilterbank::FreqToMel(double freq) const {
+  return 1127.0 * log(1.0 + (freq / 700.0));
+}
+
+}  // namespace internal
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h b/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1db28243eea39a694b7613ac7144dce9b294897
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic class for applying a mel-scale mapping to a power spectrum.
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
+
+#include <vector>
+
+namespace tflite {
+namespace internal {
+
+class MfccMelFilterbank {
+ public:
+  MfccMelFilterbank();
+  bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
+                  double input_sample_rate, int output_channel_count,
+                  double lower_frequency_limit, double upper_frequency_limit);
+
+  // Takes a squared-magnitude spectrogram slice as input, computes a
+  // triangular-mel-weighted linear-magnitude filterbank, and places the result
+  // in output.
+  void Compute(const std::vector<double>& input,
+               std::vector<double>* output) const;
+
+ private:
+  double FreqToMel(double freq) const;
+  bool initialized_;
+  int num_channels_;
+  double sample_rate_;
+  int input_length_;
+  std::vector<double> center_frequencies_;  // In mel, for each mel channel.
+
+  // Each FFT bin b contributes to two triangular mel channels, with
+  // proportion weights_[b] going into mel channel band_mapper_[b], and
+  // proportion (1 - weights_[b]) going into channel band_mapper_[b] + 1.
+  // Thus, weights_ contains the weighting applied to each FFT bin for the
+  // upper-half of the triangular band.
+  std::vector<double> weights_;  // Right-side weight for this fft  bin.
+
+  // FFT bin i contributes to the upper side of mel channel band_mapper_[i]
+  std::vector<int> band_mapper_;
+  int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
+  int end_index_;    // Highest FFT bin used to calculate mel spectrum.
+};
+
+}  // namespace internal
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index dbc4f0d6fdca8279072d6ea225334722d6a89eb2..dd6932ffe7b7a6f1101f146ce6472b0df4cbda3b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -1692,6 +1693,22 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   const int output_width = ArraySize(output_dims, 1);
   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
 
+#ifdef __aarch64__
+  // Call kernel optimized for depthwise convolutions using 3x3 filters if
+  // parameters are supported.
+  if (Fast3x3FilterKernelSupported(input_dims, filter_dims, stride_width,
+                                   stride_height, pad_width, pad_height,
+                                   depth_multiplier, output_dims)) {
+    DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data,
+                           filter_dims, filter_offset, bias_data, bias_dims,
+                           stride_width, stride_height, pad_width, pad_height,
+                           depth_multiplier, output_offset, output_multiplier,
+                           output_shift, output_activation_min,
+                           output_activation_max, output_data, output_dims);
+    return;
+  }
+#endif
+
   static const int kAccBufferMaxSize = 2048;
   int32 acc_buffer[kAccBufferMaxSize];
   TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
new file mode 100644
index 0000000000000000000000000000000000000000..55e0d5c3aa9ebb8b46403550e190b00a54cb53e5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -0,0 +1,4584 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+#ifdef __aarch64__
+
+inline void preload_l1_keep(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
+}
+
+// Implementation of quantized DepthwiseConv for 3x3 filters.
+
+// Below are helper structs to remove the use of arrays.
+// There is an llvm bug that causes significant slowdown when using arrays for
+// NEON intrinsics vector data types.
+// See: https://bugs.llvm.org/show_bug.cgi?id=34945
+
+struct Int32x8 {
+  int32x4_t low, high;
+};
+
+struct Filter3x3x8 {
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7, f8;
+};
+
+// Loads 3x3 filter of depth 8 and adds filter offsets.
+inline Filter3x3x8 Load3x3Filter(const uint8* filter_ptr, int32 filter_offset,
+                                 int output_depth) {
+  Filter3x3x8 filter;
+
+  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
+      temp_u8_6, temp_u8_7, temp_u8_8;
+  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+
+  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
+  temp_u8_1 = vld1_u8(filter_ptr + 1 * output_depth);
+  temp_u8_2 = vld1_u8(filter_ptr + 2 * output_depth);
+  temp_u8_3 = vld1_u8(filter_ptr + 3 * output_depth);
+  temp_u8_4 = vld1_u8(filter_ptr + 4 * output_depth);
+  temp_u8_5 = vld1_u8(filter_ptr + 5 * output_depth);
+  temp_u8_6 = vld1_u8(filter_ptr + 6 * output_depth);
+  temp_u8_7 = vld1_u8(filter_ptr + 7 * output_depth);
+  temp_u8_8 = vld1_u8(filter_ptr + 8 * output_depth);
+
+  filter.f0 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
+  filter.f1 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
+  filter.f2 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
+  filter.f3 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
+  filter.f4 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
+  filter.f5 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
+  filter.f6 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
+  filter.f7 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
+  filter.f8 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
+
+  filter.f0 = vaddq_s16(filter.f0, filter_offset_vec);
+  filter.f1 = vaddq_s16(filter.f1, filter_offset_vec);
+  filter.f2 = vaddq_s16(filter.f2, filter_offset_vec);
+  filter.f3 = vaddq_s16(filter.f3, filter_offset_vec);
+  filter.f4 = vaddq_s16(filter.f4, filter_offset_vec);
+  filter.f5 = vaddq_s16(filter.f5, filter_offset_vec);
+  filter.f6 = vaddq_s16(filter.f6, filter_offset_vec);
+  filter.f7 = vaddq_s16(filter.f7, filter_offset_vec);
+  filter.f8 = vaddq_s16(filter.f8, filter_offset_vec);
+
+  return filter;
+}
+
+// Applies activation, offset and downquantize on a set of accumulator
+// registers that correspond to a 2x2 output of depth 8.
+// Stores results to output.
+inline void DownquantizeAndStore2x2Output(
+    Int32x8 acc_0, Int32x8 acc_1, Int32x8 acc_2, Int32x8 acc_3,
+    int32 output_offset, int32 output_multiplier, int output_shift,
+    int32 output_activation_min, int32 output_activation_max, uint8* output_ptr,
+    int output_depth, int output_width) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  // Fixed-point multiplication.
+  acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+  acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+  acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+  acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+  acc_2.low = vqrdmulhq_n_s32(acc_2.low, output_multiplier);
+  acc_2.high = vqrdmulhq_n_s32(acc_2.high, output_multiplier);
+  acc_3.low = vqrdmulhq_n_s32(acc_3.low, output_multiplier);
+  acc_3.high = vqrdmulhq_n_s32(acc_3.high, output_multiplier);
+
+  acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+  acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+  acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+  acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+  acc_2.low = RoundingDivideByPOT(acc_2.low, output_shift);
+  acc_2.high = RoundingDivideByPOT(acc_2.high, output_shift);
+  acc_3.low = RoundingDivideByPOT(acc_3.low, output_shift);
+  acc_3.high = RoundingDivideByPOT(acc_3.high, output_shift);
+
+  // Add the output offset.
+  acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+  acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+  acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+  acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+  acc_2.low = vaddq_s32(acc_2.low, output_offset_vec);
+  acc_2.high = vaddq_s32(acc_2.high, output_offset_vec);
+  acc_3.low = vaddq_s32(acc_3.low, output_offset_vec);
+  acc_3.high = vaddq_s32(acc_3.high, output_offset_vec);
+
+  // Apply the activation function.
+  acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+  acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+  acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+  acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+  acc_2.low = vmaxq_s32(acc_2.low, output_activation_min_vec);
+  acc_2.high = vmaxq_s32(acc_2.high, output_activation_min_vec);
+  acc_3.low = vmaxq_s32(acc_3.low, output_activation_min_vec);
+  acc_3.high = vmaxq_s32(acc_3.high, output_activation_min_vec);
+
+  acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+  acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+  acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+  acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  acc_2.low = vminq_s32(acc_2.low, output_activation_max_vec);
+  acc_2.high = vminq_s32(acc_2.high, output_activation_max_vec);
+  acc_3.low = vminq_s32(acc_3.low, output_activation_max_vec);
+  acc_3.high = vminq_s32(acc_3.high, output_activation_max_vec);
+
+  // Saturating cast to uint8 and store to destination.
+  int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+  int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+  int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+  int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+  int16x4_t acc_2_low_s16 = vqmovn_s32(acc_2.low);
+  int16x4_t acc_2_high_s16 = vqmovn_s32(acc_2.high);
+  int16x4_t acc_3_low_s16 = vqmovn_s32(acc_3.low);
+  int16x4_t acc_3_high_s16 = vqmovn_s32(acc_3.high);
+
+  int16x8_t res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  int16x8_t res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  int16x8_t res_2_s16 = vcombine_s16(acc_2_low_s16, acc_2_high_s16);
+  int16x8_t res_3_s16 = vcombine_s16(acc_3_low_s16, acc_3_high_s16);
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  uint8x8_t res_2_u8 = vqmovun_s16(res_2_s16);
+  uint8x8_t res_3_u8 = vqmovun_s16(res_3_s16);
+
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_depth, res_1_u8);
+  vst1_u8(output_ptr + output_depth * output_width, res_2_u8);
+  vst1_u8(output_ptr + output_depth * output_width + output_depth, res_3_u8);
+}
+
+inline void DownquantizeAndStore(Int32x8 acc, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 uint8* output_ptr) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  acc.low = vqrdmulhq_n_s32(acc.low, output_multiplier);
+  acc.high = vqrdmulhq_n_s32(acc.high, output_multiplier);
+
+  acc.low = RoundingDivideByPOT(acc.low, output_shift);
+  acc.high = RoundingDivideByPOT(acc.high, output_shift);
+
+  acc.low = vaddq_s32(acc.low, output_offset_vec);
+  acc.high = vaddq_s32(acc.high, output_offset_vec);
+
+  acc.low = vmaxq_s32(acc.low, output_activation_min_vec);
+  acc.high = vmaxq_s32(acc.high, output_activation_min_vec);
+
+  acc.low = vminq_s32(acc.low, output_activation_max_vec);
+  acc.high = vminq_s32(acc.high, output_activation_max_vec);
+
+  int16x4_t acc_low_s16 = vqmovn_s32(acc.low);
+  int16x4_t acc_high_s16 = vqmovn_s32(acc.high);
+
+  int16x8_t res_s16 = vcombine_s16(acc_low_s16, acc_high_s16);
+  uint8x8_t res_u8 = vqmovun_s16(res_s16);
+  vst1_u8(output_ptr, res_u8);
+}
+
+inline void DownquantizeAndStore2Output(
+    Int32x8 acc_0, Int32x8 acc_1, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  {
+    using gemmlowp::RoundingDivideByPOT;
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    const int32x4_t output_activation_min_vec =
+        vdupq_n_s32(output_activation_min);
+    const int32x4_t output_activation_max_vec =
+        vdupq_n_s32(output_activation_max);
+
+    // Fixed-point multiplication.
+    acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+    acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+    acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+    acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+
+    acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+    acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+    acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+    acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+
+    // Add the output offset.
+    acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+    acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+    acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+    acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+
+    // Apply the activation function.
+    acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+    acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+    acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+    acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+
+    acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+    acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+    acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+    acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  }
+
+  // Saturating cast to uint8 and store to destination.
+  int16x8_t res_0_s16;
+  {
+    int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+    int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+    res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  }
+
+  int16x8_t res_1_s16;
+  {
+    int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+    int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+    res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  }
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_ptr_offset, res_1_u8);
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulateRow(Int32x8 accum, int16x8_t f0, int16x8_t f1,
+                                     int16x8_t f2, int16x8_t i0, int16x8_t i1,
+                                     int16x8_t i2) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f2), vget_high_s16(i2));
+  return accum;
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulate3x3Filter(const Filter3x3x8& f, int16x8_t i0,
+                                           int16x8_t i1, int16x8_t i2,
+                                           int16x8_t i3, int16x8_t i4,
+                                           int16x8_t i5, int16x8_t i6,
+                                           int16x8_t i7, int16x8_t i8,
+                                           Int32x8 accum) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f2), vget_high_s16(i2));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f3), vget_low_s16(i3));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f3), vget_high_s16(i3));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f4), vget_low_s16(i4));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f4), vget_high_s16(i4));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f5), vget_low_s16(i5));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f5), vget_high_s16(i5));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f6), vget_low_s16(i6));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f6), vget_high_s16(i6));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f7), vget_low_s16(i7));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f7), vget_high_s16(i7));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f8), vget_low_s16(i8));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f8), vget_high_s16(i8));
+  return accum;
+}
+
+inline void DotProductAndStore(const Filter3x3x8& filter, int16x8_t i0,
+                               int16x8_t i1, int16x8_t i2, int16x8_t i3,
+                               int16x8_t i4, int16x8_t i5, int16x8_t i6,
+                               int16x8_t i7, int16x8_t i8,
+                               const int32* bias_ptr, int32 output_offset,
+                               int32 output_multiplier, int output_shift,
+                               int32 output_activation_min,
+                               int32 output_activation_max, uint8* output_ptr) {
+  Int32x8 acc;
+  acc.low = vld1q_s32(bias_ptr);
+  acc.high = vld1q_s32(bias_ptr + 4);
+
+  acc = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, i8,
+                                    acc);
+
+  DownquantizeAndStore(acc, output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_ptr);
+}
+
+// Performs multiply-accumulate on a 3x4 input for 2 horizontal outputs.
+inline void DotProductAndStore2xStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i4, i5, i6, i8, i9,
+                                      i10, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i1, i2, i3, i5, i6, i7, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// Performs multiply-accumulate on a 4x3 input for 2 vertical outputs.
+inline void DotProductAndStore2yStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7,
+                                      i8, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i3, i4, i5, i6, i7, i8, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// A kernel that is optimized on the number of output cells in the x and y
+// direction, and the stride. Assumes 3x3 filters of 8 depth.
+template <int kFixedOutputY, int kFixedOutputX, int kFixedStrideWidth,
+          int kFixedStrideHeight>
+struct ConvKernel3x3FilterDepth8 {};
+
+template <>
+struct ConvKernel3x3FilterDepth8<8, 8, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 8x8 outputs using a 3x3 filter, we require 10x10 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    //
+    //        INPUT                          OUTPUT
+    //   |\----------------\               |\------------\
+    //   | \                \              | \            \
+    //   |  \----------------\             |  \------------\
+    //   |  | 0    ...     9 |             |  | 0  ...   7 |
+    //   |  | 10   ...    19 |     --->    |  | 8  ...  15 |
+    //   |  | 20   ...    29 |              \ | .. ...  .. |
+    //    \ | ..   ...    .. |               \| 56 ...  63 |
+    //     \| 90   ...   109 |                |------------|
+    //      |----------------|
+    //
+    // The first set of loads corresponds to:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-----------------                |\-----------
+    //   | \                                | \
+    //   |  \-----------------              |  \----------
+    //   |  | 0  1   2  3 ...               |  | 0  1 ...
+    //   |  | 10 11 12 13 ...     --->      |  | ..   ...
+    //   |  | 20 21 22 23 ...                  | ..   ...
+    //   |  | ..   ...    ...
+    //
+    // The next set of loads correspond to a sliding window to the right.
+    // It loads inputs 4, 5, 14, 15, 23, 24 and keeps 2, 3, 12, 13, and 22:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-------------------                |\-------------
+    //   | \                                  | \
+    //   |  \-------------------              |  \------------
+    //   |  | .. 2  3   4  5 ...              |  | .. 2  3 ...
+    //   |  | .. 12 13 14 15 ...     --->     |  | ..      ...
+    //   |  | .. 21 22 23 24 ...                 | ..      ...
+    //   |  | ..    ...      ...
+    //
+    // And so on...
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (0) and (1).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Slide to the right for outputs x = [2, 3], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (2) and (3).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Slide to the right again for outputs x = [4, 5], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (4) and (5).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth, output_depth);
+
+    // Slide to the right one last time for outputs x = [6, 7], y = 0.
+    // Referring to the indexes in the diagram above, this corresponds to
+    // outputs (6) and (7).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth, output_depth);
+
+    // Slide to down for outputs x = [6, 7], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (14) and (15).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (12) and (13).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left again for outputs x = [2, 3], y = 1. Referring to the indexes
+    // in the diagram above, this corresponds to outputs (10) and (11).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 1. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (8) and (9).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (16) and (17).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (18) and (19).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (20) and (21).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (22) and (23).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (30) and (31).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (28) and (29).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (26) and (27).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 3. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (24) and (25).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (32) and (33).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (34) and (35).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (36) and (37).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 4. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (38) and (39).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (46) and (47).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (44) and (45).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (42) and (43).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 5. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (40) and (41).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 5 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (48) and (49).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 8 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (50) and (51).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (52) and (53).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 6. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (54) and (55).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (62) and (63).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 9 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (60) and (61).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (58) and (59).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 7. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (56) and (57).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 7 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 4x4 outputs using a 3x3 filter, we require 6x6 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next inputs one row down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load last row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 2x1 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_row_size);
+
+    // Load inputs for bottom 2 rows.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_6, input_7, input_8, input_9, input_10, input_11, input_0,
+        input_1, input_2, input_3, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size,
+        output_row_size);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter, we require 4x4 inputs.
+      // Load inputs for the top two filters first.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9, input_10, input_11;
+
+      const uint8* ptr = input_ptr;
+
+      // Load top 3 rows.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+        input_10 = vaddq_s16(input_10, input_offset_vec);
+        input_11 = vaddq_s16(input_11, input_offset_vec);
+      }
+
+      // Multiply-accum for top-left output.
+      acc_0 = MultiplyAccumulate3x3Filter(filter, input_0, input_1, input_2,
+                                          input_4, input_5, input_6, input_8,
+                                          input_9, input_10, acc_0);
+
+      // Multiply-accum for top-right output.
+      acc_1 = MultiplyAccumulate3x3Filter(filter, input_1, input_2, input_3,
+                                          input_5, input_6, input_7, input_9,
+                                          input_10, input_11, acc_1);
+
+      // Now load the bottom row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+      }
+
+      // Multiply-accum for bottom-left output.
+      acc_2 = MultiplyAccumulate3x3Filter(filter, input_4, input_5, input_6,
+                                          input_8, input_9, input_10, input_0,
+                                          input_1, input_2, acc_2);
+
+      // Multiply-accum for bottom-right output.
+      acc_3 = MultiplyAccumulate3x3Filter(filter, input_5, input_6, input_7,
+                                          input_9, input_10, input_11, input_1,
+                                          input_2, input_3, acc_3);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_depth * 4;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    // To process 2x1 outputs using a 3x3 filter, we require 4x3 inputs.
+    // Load all inputs at the beginning.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth * output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9;
+
+    const uint8* ptr = input_ptr;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+    // Load first 2 rows.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load last row.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 4x2 kernel twice.
+    ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_3, input_4, input_5, input_6, input_7, input_8, input_0,
+        input_1, input_2, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter at stride 2, we require
+      // 5x5 inputs. We load the first 5x2 inputs at a time.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9;
+
+      const uint8* ptr = input_ptr;
+
+      // Load inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load next inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+
+      // Moving onto the two bottom outputs.
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load last input row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+      }
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 2x2 kernel twice.
+    ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 5 * input_depth;
+    temp_2 = vld1_u8(ptr);
+    temp_0 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_5 = vld1_u8(ptr);
+    temp_3 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_8 = vld1_u8(ptr);
+    temp_6 = vld1_u8(ptr + input_depth);
+
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_1, input_2, input_0, input_4, input_5, input_3, input_7,
+        input_8, input_6, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 7 * input_depth;
+    temp_1 = vld1_u8(ptr);
+    temp_2 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_4 = vld1_u8(ptr);
+    temp_5 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_7 = vld1_u8(ptr);
+    temp_8 = vld1_u8(ptr + input_depth);
+
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+
+    uint8x8_t temp_0 = vld1_u8(input_ptr);
+    uint8x8_t temp_1 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_2 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_3 = vld1_u8(input_ptr);
+    uint8x8_t temp_4 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_5 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_6 = vld1_u8(input_ptr);
+    uint8x8_t temp_7 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_8 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+inline void ShuffleInput(const uint8* input_ptr, int input_depth,
+                         int input_width, int input_height, int output_depth,
+                         int output_width, int output_height,
+                         uint8* output_ptr) {
+  const int input_row_size = input_depth * input_width;
+
+  for (int y = 0; y < output_height; y++) {
+    const uint8* ptr = input_ptr;
+    for (int x = 0; x < output_width; x++) {
+      memcpy(output_ptr, ptr, output_depth);
+      output_ptr += output_depth;
+      ptr += input_depth;
+    }
+    input_ptr += input_row_size;
+  }
+}
+
+template <int kFixedHeight, int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8 {};
+
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth, kFixedStrideHeight> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 1x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 1x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth, kFixedStrideHeight> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 2x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 2x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * kFixedStrideWidth * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 2x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<4, 1, 1> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<4, 2, 2> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Branch and cache misses increase substantially with stride 2 kernels.
+    // Adding prefetching reduces latency by as much as 2x.
+    const int i0 = 0;
+    const int i1 = input_depth;
+    const int i2 = 2 * input_depth;
+    const int i3 = 3 * input_depth;
+    const int i4 = 4 * input_depth;
+    const int i5 = 5 * input_depth;
+    const int i6 = 6 * input_depth;
+    const int i7 = 7 * input_depth;
+    const int i8 = 8 * input_depth;
+
+#define DEPTHWISECONV_PRELOAD_ROW(input_ptr, i)         \
+  preload_l1_keep(input_ptr + i * input_row_size + i0); \
+  preload_l1_keep(input_ptr + i * input_row_size + i1); \
+  preload_l1_keep(input_ptr + i * input_row_size + i2); \
+  preload_l1_keep(input_ptr + i * input_row_size + i3); \
+  preload_l1_keep(input_ptr + i * input_row_size + i4); \
+  preload_l1_keep(input_ptr + i * input_row_size + i5); \
+  preload_l1_keep(input_ptr + i * input_row_size + i6); \
+  preload_l1_keep(input_ptr + i * input_row_size + i7); \
+  preload_l1_keep(input_ptr + i * input_row_size + i8);
+
+    int out_x = start_x;
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // Preload 9x9 input.
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+        // For a large input window (64x9x9) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If this size is ever changed,
+        // update the ShuffleWorkspaceSize() function to return the new size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 9,
+                     9, shuffle_workspace);
+        const uint8* shuffled_ptr = &shuffle_workspace[0];
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset,
+              bias_ptr, output_offset, output_multiplier, output_shift,
+              output_activation_min, output_activation_max, output_ptr,
+              output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      // Preload 9x9 input one more time for the rest of the depth.
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * 2 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+#undef DEPTHWISECONV_PRELOAD_ROW
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<8, 2, 2> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Reuse 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 2, 2>::Run(
+        input_data, start_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 2, 2>::Run(
+        input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<8, 1, 1> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+    // 8x8 at a time.
+    for (; out_x <= output_width - 8; out_x += 8) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // For a large input window (64x10x10) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If the size of the input window
+        // changes, update the function ShuffleWorkspaceSize() with the new
+        // size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 10,
+                     10, shuffle_workspace);
+        const uint8* shuffled_ptr = shuffle_workspace;
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr,
+              filter_offset, bias_ptr, output_offset, output_multiplier,
+              output_shift, output_activation_min, output_activation_max,
+              output_ptr, output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 8 * input_depth;
+      output_data += 8 * output_depth;
+    }
+
+    // Handle the rest of the right side by re-using 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 1, 1>::Run(
+        input_data, out_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 1, 1>::Run(
+        input_data + 4 * input_row_size, out_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
+  }
+};
+
+inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
+                                         const Dims<4>& filter_dims,
+                                         int stride_width, int stride_height,
+                                         int pad_width, int pad_height,
+                                         int depth_multiplier,
+                                         const Dims<4>& output_dims) {
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+
+  bool supported = filter_width == 3 && filter_height == 3 &&
+                   depth_multiplier == 1 &&
+                   (stride_width == 1 || stride_width == 2) &&
+                   (stride_height == 1 || stride_height == 2) &&
+                   (stride_width == stride_height) && pad_width == 0 &&
+                   pad_height == 0 && (input_depth % 8) == 0;
+
+  if (!supported) {
+    return false;
+  }
+
+  // Handle case where padding is zero but padding type is not kValid.
+  // This would require special boundary case handling that is not supported.
+
+  const int out_x = output_width - 1;
+  const int out_y = output_height - 1;
+
+  const int in_x_origin = (out_x * stride_width) - pad_width;
+  const int in_y_origin = (out_y * stride_height) - pad_height;
+
+  const int in_x_end = in_x_origin + filter_width;
+  const int in_y_end = in_y_origin + filter_height;
+
+  // Supported only if filter on the right and bottom boundary lies completely
+  // within the input.
+  return in_x_end <= input_width && in_y_end <= input_height;
+}
+
+inline void DepthwiseConv3x3Filter(
+    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+    const int32* bias_data, const Dims<4>& bias_dims, int stride_width,
+    int stride_height, int pad_width, int pad_height, int depth_multiplier,
+    int32 output_offset, int32 output_multiplier, int output_shift,
+    int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+
+  // Algorithm assumes below constraints. It is optimized for depth multiplier
+  // of 1, 3x3 filter, no padding and strides 1 and 2.
+  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+  TFLITE_DCHECK(depth_multiplier == 1);
+  TFLITE_DCHECK(filter_height == 3);
+  TFLITE_DCHECK(filter_width == 3);
+  TFLITE_DCHECK(pad_height == 0);
+  TFLITE_DCHECK(pad_width == 0);
+  TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
+  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
+  TFLITE_DCHECK(stride_width == stride_height);
+
+  const int input_row_size = input_depth * (input_width + 2 * pad_width);
+  const int output_row_size = output_depth * output_width;
+  const int input_batch_size = input_row_size * (input_height + 2 * pad_height);
+  const int output_batch_size = output_depth * output_width * output_height;
+
+  using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1, 1>::Run);
+  conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1, 1>::Run;
+  conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1, 1>::Run;
+  conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1, 1>::Run;
+  conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1, 1>::Run;
+
+  if (stride_width == 2) {
+    conv_1_output_row = ConvRow3x3FilterDepth8<1, 2, 2>::Run;
+    conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2, 2>::Run;
+    conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2, 2>::Run;
+    conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2, 2>::Run;
+  }
+
+  // Allocate maximum memory needed for shuffled input.
+  // TODO(mariewhite): The size of this workspace is small enough to be
+  // allocated on the stack. Eventually we will want to move it to the heap
+  // and have it allocated outside of this function, like the im2col_array used
+  // in gemmlowp.
+#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
+  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
+
+  // Make sure the kernels using this buffer will not run out of bounds.
+  static_assert(ConvRow3x3FilterDepth8<8, 1, 1>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
+  static_assert(ConvRow3x3FilterDepth8<4, 2, 2>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
+
+#undef DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE
+
+  for (int b = 0; b < batches; ++b) {
+    const uint8* input_ptr = input_data + b * input_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size;
+
+    int out_y = 0;
+
+    // Handle 8 rows at a time.
+    for (; out_y <= output_height - 8; out_y += 8) {
+      conv_8_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
+
+      input_ptr += 8 * stride_height * input_row_size;
+      output_ptr += 8 * output_row_size;
+    }
+
+    // Handle 4 rows at a time.
+    for (; out_y <= output_height - 4; out_y += 4) {
+      conv_4_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
+
+      input_ptr += 4 * stride_height * input_row_size;
+      output_ptr += 4 * output_row_size;
+    }
+
+    // Handle 2 rows at a time.
+    for (; out_y <= output_height - 2; out_y += 2) {
+      conv_2_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
+
+      input_ptr += 2 * stride_height * input_row_size;
+      output_ptr += 2 * output_row_size;
+    }
+
+    // Handle one row at a time.
+    for (; out_y < output_height; out_y++) {
+      conv_1_output_row(input_ptr, 0, out_y, input_depth, input_width,
+                        input_height, input_row_size, input_offset, filter_data,
+                        filter_offset, bias_data, output_offset,
+                        output_multiplier, output_shift, output_activation_min,
+                        output_activation_max, output_ptr, output_depth,
+                        output_width, shuffle_workspace);
+
+      input_ptr += stride_height * input_row_size;
+      output_ptr += output_row_size;
+    }
+  }
+}
+
+#endif  // __aarch64__
+
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 780401e052733cccae0cc34f495df090c1530624..47dfcbeb01a046be4c506b1a303eee4fe1e65588 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <stdlib.h>
 #include <string.h>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
 
 #ifdef USE_NEON
 
@@ -248,6 +249,83 @@ void NeonClipVector(const float* vector, int v_size, float abs_limit,
   }
 }
 
+void NeonSymmetricQuantizeFloats(const float* values, const int size,
+                                 int8_t* quantized_values, float* min,
+                                 float* max, float* scaling_factor) {
+  // TODO(raziel): vectorize min/max calculation.
+  auto minmax = std::minmax_element(values, values + size);
+  *min = *minmax.first;
+  *max = *minmax.second;
+  const int kScale = 127;
+  const float range = std::max(std::abs(*min), std::abs(*max));
+  if (range == 0) {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    return;
+  }
+  *scaling_factor = kScale / range;
+
+  const int postamble_start =
+      size - (size & (2 * kFloatWeightsPerNeonLane - 1));
+
+  // Vectorized constants.
+  const float32x4_t q_factor_f32x4 = vmovq_n_f32(*scaling_factor);
+  const float32x4_t point5_f32x4 = vmovq_n_f32(0.5);
+  const float32x4_t zero_f32x4 = vmovq_n_f32(0.0);
+  const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
+  const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
+
+  for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane) {
+    // Implements the vectorized version of the following:
+    // const int32 quantized_value = static_cast<int32>(
+    //    std::round(*scaling_factor * values[i]));
+    // Since the vectorized round intrinsics (vrndqa_f32) is not supported
+    // on all Neon flavors, we use the following method for rounding: if (x
+    // < 0) (int)(x - 0.5) if (x >= 0) (int)(x + 0.5)
+    float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
+    float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
+    float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
+    float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
+
+    int32x4_t cmp_with_zero0_ui32x4 =
+        (int32x4_t)vcltq_f32(mul0_f32x4, zero_f32x4);  // NOLINT
+    int32x4_t cmp_with_zero1_ui32x4 =
+        (int32x4_t)vcltq_f32(mul1_f32x4, zero_f32x4);  // NOLINT
+
+    float32x4_t cmp_with_zero0_f32x4 = vcvtq_f32_s32(cmp_with_zero0_ui32x4);
+    float32x4_t cmp_with_zero1_f32x4 = vcvtq_f32_s32(cmp_with_zero1_ui32x4);
+    cmp_with_zero0_f32x4 = vaddq_f32(cmp_with_zero0_f32x4, point5_f32x4);
+    cmp_with_zero1_f32x4 = vaddq_f32(cmp_with_zero1_f32x4, point5_f32x4);
+
+    mul0_f32x4 = vaddq_f32(mul0_f32x4, cmp_with_zero0_f32x4);
+    mul1_f32x4 = vaddq_f32(mul1_f32x4, cmp_with_zero1_f32x4);
+
+    int32x4_t f2i0_i32x4 = vcvtq_s32_f32(mul0_f32x4);
+    int32x4_t f2i1_i32x4 = vcvtq_s32_f32(mul1_f32x4);
+
+    // Implements the vectorized version of the folowing block:
+    //  quantized_values[i] = std::min(kScale, std::max(-kScale,
+    //  quantized_value));
+    int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
+    int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
+    int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
+    int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
+
+    int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
+    int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
+
+    int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
+    int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
+    vst1_s8(&quantized_values[i], min_s8x8);
+  }
+
+  for (int i = postamble_start; i < size; ++i) {
+    const int32 quantized_value =
+        static_cast<int32>(TfLiteRound(*scaling_factor * values[i]));
+    quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+  }
+}
+
 float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                  int v_size) {
   // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
index b7e317dc60e2c68e9e993ff45c9090a01bd13b94..3b6f4bd583a85d11fa8d12c6fdd6e8b77113123d 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -97,6 +97,13 @@ void ClipVector(const float* vector, int v_size, float abs_limit,
   NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result);
 }
 
+void SymmetricQuantizeFloats(const float* values, const int size,
+                             int8_t* quantized_values, float* min, float* max,
+                             float* scaling_factor) {
+  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min,
+                   max, scaling_factor);
+}
+
 void VectorShiftLeft(float* vector, int v_size, float shift_value) {
   NEON_OR_PORTABLE(VectorShiftLeft, vector, v_size, shift_value);
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 132c9e1e5f07657da1f0d168867b18e74c7a69ee..b14f18dacc19a6c838056ad0e32024cb2b4f4f69 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -30,7 +30,9 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -324,578 +326,774 @@ void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
   }
 }
 
-inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                           const float* weights_data,
-                           const Dims<4>& weights_dims, const float* bias_data,
-                           const Dims<4>& bias_dims,
-                           float output_activation_min,
-                           float output_activation_max, float* output_data,
-                           const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("FullyConnected");
-  // TODO(b/62193649): this convoluted shape computation (determining
-  // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
-  // is because the current --variable_batch hack consists in overwriting the
-  // 3rd dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  // When that is fixed, this should become:
-  // const auto input_matrix_map =
-  //     MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
-  const int input_rows = ArraySize(weights_dims, 0);
-  const auto input_matrix_map =
-      MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows);
-  const auto filter_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims);
-  auto output_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
-  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
-  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
-                                   output_dims, output_activation_min,
-                                   output_activation_max);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                    const float* weights_data, const Dims<4>& weights_dims,
-                    const float* bias_data, const Dims<4>& bias_dims,
-                    float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
-                 bias_dims, output_activation_min, output_activation_max,
-                 output_data, output_dims);
+inline void optimized_ops_preload_l1_stream(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
 }
 
-inline void preload_l1_stream(const uint8* ptr) {
+inline void optimized_ops_preload_l1_keep(const uint8* ptr) {
 #ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
 #else
   gemmlowp::Prefetch(ptr);
 #endif
 }
 
-#ifdef USE_NEON
-inline void FullyConnectedAsGEMV(
-    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
-    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
-    const int32* bias_data, const Dims<4>& bias_dims, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, uint8* output_data,
-    const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
+#ifdef GEMMLOWP_NEON
+// In the common case of batch size 1, a fully-connected node degenerates
+// to a matrix*vector product. LSTM cells contain a fully-connected node;
+// when quantized, this becomes a special type of GEMV operation where
+// the output is 16bit-quantized, thus needs its own special path.
+inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
+                            const uint8* weights_data,
+                            const Dims<4>& weights_dims,
+                            uint8 weights_zero_point, const int32* bias_data,
+                            const Dims<4>& bias_dims, int32 accum_multiplier,
+                            int accum_shift, int16* output_data,
+                            const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell");
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
   TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                        ArraySize(output_dims, 3),
                    1);
   const int input_size = input_dims.strides[3];
-  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
-  static constexpr int kPeel = 4;
-  for (int k = 0; k < input_size; k += 64) {
-    preload_l1_stream(input_data + k);
-  }
-  for (int k = 0; k < kPeel * input_size; k += 64) {
-    preload_l1_stream(filter_data + k);
-  }
-  TFLITE_DCHECK(!(output_size % kPeel));
+  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  // This special fast path for quantized LSTM cells does not try to support
+  // odd sizes that we haven't encountered in any LSTM cell, that would
+  // require special code (that would go untested until any LSTM cell
+  // exercises it). We just guard our assumptions about size evenness with
+  // the following assertions.
+  TFLITE_DCHECK(!(output_size % 4));
+  TFLITE_DCHECK(!(input_size % 8));
   const int32* bias_ptr = bias_data;
-  uint8* output_ptr = output_data;
-  for (int out = 0; out < output_size; out += kPeel) {
-    int32x4_t acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      acc[k] = vdupq_n_s32(0);
-    }
-    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
-    const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+  int16* output_ptr = output_data;
+  for (int out = 0; out < output_size; out += 4) {
+    int32x4_t acc_0 = vdupq_n_s32(0);
+    int32x4_t acc_1 = vdupq_n_s32(0);
+    int32x4_t acc_2 = vdupq_n_s32(0);
+    int32x4_t acc_3 = vdupq_n_s32(0);
+    const int16x8_t input_offset_vec = vdupq_n_s16(-128);
+    const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point);
     int in = 0;
+    // Handle 16 levels of depth at a time.
     for (; in <= input_size - 16; in += 16) {
       const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      uint8x16_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1q_u8(filter_ptr);
-        preload_l1_stream(filter_ptr + 64);
-      }
-      int16x8_t input_val[2];
+      const uint8* weights_ptr = weights_data + in + out * input_size;
+      uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size);
+      uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size);
+      uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size);
+      uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size);
+      int16x8_t input_val_0, input_val_1;
       const uint8x8_t low = vget_low_u8(input_val_u8);
       const uint8x8_t high = vget_high_u8(input_val_u8);
-      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
-      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
-      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
-      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
-      int16x8_t filter_val[kPeel][2];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
-        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
-        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
-        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
-        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
-        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
-      }
-      for (int p = 0; p < 2; p++) {
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
-                             vget_low_s16(input_val[p]));
-        }
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
-                             vget_high_s16(input_val[p]));
-        }
-      }
+      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
+      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
+      int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0,
+          weights_val_3_0;
+      int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1,
+          weights_val_3_1;
+      weights_val_0_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))),
+          weights_offset_vec);
+      weights_val_0_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))),
+          weights_offset_vec);
+      weights_val_1_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))),
+          weights_offset_vec);
+      weights_val_1_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))),
+          weights_offset_vec);
+      weights_val_2_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))),
+          weights_offset_vec);
+      weights_val_2_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))),
+          weights_offset_vec);
+      weights_val_3_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))),
+          weights_offset_vec);
+      weights_val_3_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))),
+          weights_offset_vec);
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0),
+                        vget_low_s16(input_val_0));
+      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0),
+                        vget_low_s16(input_val_0));
+      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0),
+                        vget_low_s16(input_val_0));
+      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0),
+                        vget_low_s16(input_val_0));
+      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0),
+                        vget_high_s16(input_val_0));
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0),
+                        vget_high_s16(input_val_0));
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0),
+                        vget_high_s16(input_val_0));
+      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0),
+                        vget_high_s16(input_val_0));
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1),
+                        vget_low_s16(input_val_1));
+      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1),
+                        vget_low_s16(input_val_1));
+      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1),
+                        vget_low_s16(input_val_1));
+      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1),
+                        vget_low_s16(input_val_1));
+      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1),
+                        vget_high_s16(input_val_1));
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1),
+                        vget_high_s16(input_val_1));
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1),
+                        vget_high_s16(input_val_1));
+      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1),
+                        vget_high_s16(input_val_1));
     }
-    for (; in <= input_size - 8; in += 8) {
+    // Handle 8 levels of depth at a time.
+    for (; in < input_size; in += 8) {
       const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      uint8x8_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1_u8(filter_ptr);
-      }
+      const uint8* weights_ptr = weights_data + in + out * input_size;
+      uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size);
+      uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size);
+      uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size);
+      uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size);
       int16x8_t input_val;
       input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
       input_val = vaddq_s16(input_val, input_offset_vec);
-      int16x8_t filter_val[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
-        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
-                           vget_low_s16(input_val));
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
-                           vget_high_s16(input_val));
-      }
-    }
-    if (in < input_size) {
-      int32 buf[4 * kPeel];
-      for (int k = 0; k < 4; k++) {
-        vst1q_s32(buf + 4 * k, acc[k]);
-      }
-      for (; in < input_size; in++) {
-        int lane = (in + 8 - input_size) % 4;
-        const int32 input_val = input_data[in] + input_offset;
-        for (int k = 0; k < kPeel; k++) {
-          int32 filter_val =
-              filter_data[in + (out + k) * input_size] + filter_offset;
-          buf[lane + 4 * k] += filter_val * input_val;
-        }
-      }
-      for (int k = 0; k < 4; k++) {
-        acc[k] = vld1q_s32(buf + 4 * k);
-      }
-    }
-
-    // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      pairwise_reduced_acc[k] =
-          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
-    }
-    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
-    const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+      int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3;
+      weights_val_0 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)),
+                    weights_offset_vec);
+      weights_val_1 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)),
+                    weights_offset_vec);
+      weights_val_2 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)),
+                    weights_offset_vec);
+      weights_val_3 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)),
+                    weights_offset_vec);
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0),
+                        vget_low_s16(input_val));
+      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1),
+                        vget_low_s16(input_val));
+      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2),
+                        vget_low_s16(input_val));
+      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3),
+                        vget_low_s16(input_val));
+      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0),
+                        vget_high_s16(input_val));
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1),
+                        vget_high_s16(input_val));
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2),
+                        vget_high_s16(input_val));
+      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3),
+                        vget_high_s16(input_val));
+    }
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
     const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
     int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
     // Add bias values.
     int32x4_t bias_vec = vld1q_s32(bias_ptr);
     bias_ptr += 4;
     reduced = vaddq_s32(reduced, bias_vec);
+    int left_shift = accum_shift > 0 ? accum_shift : 0;
+    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
     // Multiply by the fixed-point multiplier.
-    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
     // Rounding-shift-right.
     using gemmlowp::RoundingDivideByPOT;
-    reduced = RoundingDivideByPOT(reduced, output_shift);
-    // Add the output offset.
-    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    reduced = vaddq_s32(reduced, output_offset_vec);
+    reduced = RoundingDivideByPOT(reduced, right_shift);
     // Narrow values down to 16 bit signed.
     const int16x4_t res16 = vqmovn_s32(reduced);
-    // Narrow values down to 8 bit unsigned, saturating.
-    uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
-    // Apply the clamping from the activation function
-    res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
-    res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
-    // Store results to destination. Assumes 32bit alignment.
-    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
-                  vreinterpret_u32_u8(res8), 0);
-    output_ptr += kPeel;
+    vst1_s16(output_ptr, res16);
+    output_ptr += 4;
   }
 }
-#endif  // USE_NEON
+#endif
 
-struct GemmlowpOutputPipeline {
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  typedef std::tuple<
-      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
-      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
-      gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
-      Pipeline;
-  static Pipeline Make(const int32* bias_data, int output_rows,
-                       int32 output_offset, int32 output_multiplier,
-                       int output_shift, int32 output_activation_min,
-                       int32 output_activation_max) {
-    ColVectorMap bias_vector(bias_data, output_rows);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
-        quantize_down_stage;
-    quantize_down_stage.result_offset_after_shift = output_offset;
-    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
-    quantize_down_stage.result_shift = output_shift;
-    gemmlowp::OutputStageClamp clamp_stage;
-    clamp_stage.min = output_activation_min;
-    clamp_stage.max = output_activation_max;
-    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
-    return std::make_tuple(bias_addition_stage, quantize_down_stage,
-                           clamp_stage, saturating_cast_stage);
+#ifdef GEMMLOWP_NEON
+inline void GEMVForLstmCellWithSymmetricRange(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 accum_multiplier,
+    int accum_shift, int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCellWithSymmetricRange");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                       ArraySize(output_dims, 3),
+                   1);
+  const int input_size = input_dims.strides[3];
+  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  // This special fast path for quantized LSTM cells does not try to support
+  // odd sizes that we haven't encountered in any LSTM cell, that would
+  // require special code (that would go untested until any LSTM cell
+  // exercises it). We just guard our assumptions about size evenness with
+  // the following assertions.
+  TFLITE_DCHECK(!(output_size % 4));
+  TFLITE_DCHECK(!(input_size % 64));
+  const int32* bias_ptr = bias_data;
+  int16* output_ptr = output_data;
+  const uint8x16_t signbit = vdupq_n_u8(0x80);
+  for (int in = 0; in < input_size; in += 32) {
+    optimized_ops_preload_l1_keep(input_data + in);
   }
-};
+  const int left_shift = accum_shift > 0 ? accum_shift : 0;
+  const int right_shift = accum_shift > 0 ? 0 : -accum_shift;
+  for (int out = 0; out < output_size; out += 4) {
+    // Load the bias values
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
 
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data,
-                           const Dims<4>& output_dims,
-                           gemmlowp::GemmContext* gemm_context) {
-  gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
-  // TODO(benoitjacob): This really should be:
-  //     const int batches = ArraySize(output_dims, 1);
-  // but the current --variable_batch hack consists in overwriting the 3rd
-  // dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                      ArraySize(output_dims, 3);
-#ifdef USE_NEON
-  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
-  if (batches == 1 && !(output_size % 4)) {
-    return FullyConnectedAsGEMV(
-        input_data, input_dims, input_offset, filter_data, filter_dims,
-        filter_offset, bias_data, bias_dims, output_offset, output_multiplier,
-        output_shift, output_activation_min, output_activation_max, output_data,
-        output_dims);
+    // Clear accumulators. We use 2 accumulator registers per row,
+    // for 4 rows. row_accumRN is the N-th accumulator for row R.
+    int32x4_t row_accum00 = vdupq_n_s32(0);
+    int32x4_t row_accum01 = vdupq_n_s32(0);
+    int32x4_t row_accum10 = vdupq_n_s32(0);
+    int32x4_t row_accum11 = vdupq_n_s32(0);
+    int32x4_t row_accum20 = vdupq_n_s32(0);
+    int32x4_t row_accum21 = vdupq_n_s32(0);
+    int32x4_t row_accum30 = vdupq_n_s32(0);
+    int32x4_t row_accum31 = vdupq_n_s32(0);
+
+    // kReadAhead parametrizes how far ahead we prefetch weights into L1 cache.
+    const int kReadAhead = 512;
+    // Prefetch the first weights values.
+    for (int k = 0; k < kReadAhead; k += 64) {
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      k);
+    }
+    // Loop along the rows, handling 64 bytes per iteration because that's
+    // cache line size on most current ARM-architecture CPUs.
+    for (int in = 0; in < input_size; in += 64) {
+      // Prefetch some future weights values.
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      in + kReadAhead);
+
+      // We will use 2 local 16-bit accumulators per row, for 2 rows.
+      // See below (*) for the rationale of processing only 2 rows at a time.
+      // local_accumRN is the N-th local accumulator for row R.
+      int16x8_t local_accum00;
+      int16x8_t local_accum01;
+      int16x8_t local_accum10;
+      int16x8_t local_accum11;
+
+      // Load 64 bytes of input activations values. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t input0 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 0)));
+      int8x16_t input1 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 1)));
+      int8x16_t input2 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 2)));
+      int8x16_t input3 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 3)));
+
+      // Beginning of the core accumulation. Notice how while we have 4
+      // rows to process, this code is taking care of only 2 rows at a time,
+      // thus being divided into two parts looking similar ("Rows 0 and 1" and
+      // "Rows 2 and 3").
+      //
+      // (*) The rationale for handling only 2 rows at a time is to avoid
+      // cache aliasing issues on 4-way set-associative L1-cache CPUs, such
+      // as Cortex-A53. With sufficiently large, power-of-two matrix dimensions,
+      // we may find ourselves in a situation where rows alias each other in
+      // the L1 cache, and moreover may also mutually alias with the input
+      // activations. If we try to load 4 rows at a time, together with the
+      // input activations, that may be 5 mutually-aliasing vectors, resulting
+      // in constant mutual eviction from L1 cache. Handling 2 rows at a time
+      // here largely mitigates these issues, and seems at least to be very
+      // effective on Cortex-A53:
+      //                          Before       After
+      // big (Cortex-A73)         2.85 ms      2.85 ms
+      // little (Cortex-A53)      11.0 ms      5.16 ms
+
+      // Rows 0 and 1:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 0)));
+      int8x16_t weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 1)));
+      int8x16_t weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 2)));
+      int8x16_t weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 3)));
+      int8x16_t weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 0)));
+      int8x16_t weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 1)));
+      int8x16_t weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 2)));
+      int8x16_t weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+
+      // Rows 2 and 3:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 0)));
+      weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 1)));
+      weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 2)));
+      weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 3)));
+      weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 0)));
+      weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 1)));
+      weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 2)));
+      weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
+    }
+
+    row_accum00 = vaddq_s32(row_accum00, row_accum01);
+    row_accum10 = vaddq_s32(row_accum10, row_accum11);
+    row_accum20 = vaddq_s32(row_accum20, row_accum21);
+    row_accum30 = vaddq_s32(row_accum30, row_accum31);
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(row_accum00), vget_high_s32(row_accum00));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(row_accum10), vget_high_s32(row_accum10));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(row_accum20), vget_high_s32(row_accum20));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(row_accum30), vget_high_s32(row_accum30));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    reduced = vaddq_s32(reduced, bias_vec);
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, right_shift);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    vst1_s16(output_ptr, res16);
+    output_ptr += 4;
   }
-#endif  // USE_NEON
-  const int filter_rows = filter_dims.sizes[1];
-  const int filter_cols = filter_dims.sizes[0];
-  TFLITE_DCHECK_EQ(filter_dims.sizes[2], 1);
-  TFLITE_DCHECK_EQ(filter_dims.sizes[3], 1);
-  const int output_rows = output_dims.sizes[0];
-  TFLITE_DCHECK_EQ(output_rows, filter_rows);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+}
+#endif
 
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, output_rows, filter_cols, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      input_data, filter_cols, batches, filter_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, batches, output_rows);
-  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
-      bias_data, output_rows, output_offset, output_multiplier, output_shift,
-      output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
-      input_offset, output_pipeline);
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                           const float* weights_data,
+                           const Dims<4>& weights_dims, const float* bias_data,
+                           const Dims<4>& bias_dims,
+                           float output_activation_min,
+                           float output_activation_max, float* output_data,
+                           const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected");
+  // TODO(b/62193649): this convoluted shape computation (determining
+  // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
+  // is because the current --variable_batch hack consists in overwriting the
+  // 3rd dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  // When that is fixed, this should become:
+  // const auto input_matrix_map =
+  //     MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  const int input_rows = ArraySize(weights_dims, 0);
+  const auto input_matrix_map =
+      MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows);
+  const auto filter_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
+                                   output_dims, output_activation_min,
+                                   output_activation_max);
 }
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                    int32 input_offset, const uint8* filter_data,
-                    const Dims<4>& filter_dims, int32 filter_offset,
-                    const int32* bias_data, const Dims<4>& bias_dims,
-                    int32 output_offset, int32 output_multiplier,
-                    int output_shift, int32 output_activation_min,
-                    int32 output_activation_max, uint8* output_data,
-                    const Dims<4>& output_dims,
-                    gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
-                 filter_offset, bias_data, bias_dims, output_offset,
-                 output_multiplier, output_shift, output_activation_min,
-                 output_activation_max, output_data, output_dims, gemm_context);
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                    const float* weights_data, const Dims<4>& weights_dims,
+                    const float* bias_data, const Dims<4>& bias_dims,
+                    float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+                 bias_dims, output_activation_min, output_activation_max,
+                 output_data, output_dims);
 }
 
-template <typename T>
-inline void ExtractPatchIntoBufferColumn(
-    const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
-    int stride_width, int stride_height, int pad_width, int pad_height,
-    int in_width, int in_height, int in_depth, int single_buffer_length,
-    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) {
-  gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
-  // This chunk of code reshapes all the inputs corresponding to
-  // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
-  const int kwidth_times_indepth = kwidth * in_depth;
-  const int inwidth_times_indepth = in_width * in_depth;
-  const int ih_ungated_start = h * stride_height - pad_height;
-  const int ih_ungated_end = (ih_ungated_start + kheight);
-  const int ih_end = std::min(ih_ungated_end, in_height);
-  const int iw_ungated_start = w * stride_width - pad_width;
-  const int iw_ungated_end = (iw_ungated_start + kwidth);
-  const int iw_end = std::min(iw_ungated_end, in_width);
-  // If the patch is off the edge of the input image, skip writing those rows
-  // and columns from the patch into the output array.
-  const int h_offset = std::max(0, -ih_ungated_start);
-  const int w_offset = std::max(0, -iw_ungated_start);
-  const int ih_start = std::max(0, ih_ungated_start);
-  const int iw_start = std::max(0, iw_ungated_start);
-  const int single_row_num =
-      std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
-  const int output_row_offset = (buffer_id * single_buffer_length);
-  int out_offset =
-      output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
-  int in_offset = Offset(input_dims, 0, iw_start, ih_start, b);
-
-  // Express all of the calculations as padding around the input patch.
-  const int top_padding = h_offset;
-  const int bottom_padding = (ih_ungated_end - ih_end);
-  const int left_padding = w_offset;
-  const int right_padding = (iw_ungated_end - iw_end);
-  assert(single_row_num ==
-         ((kwidth - (left_padding + right_padding)) * in_depth));
-
-  // Write out zeroes to the elements representing the top rows of the input
-  // patch that are off the edge of the input image.
-  if (top_padding > 0) {
-    const int top_row_elements = (top_padding * kwidth * in_depth);
-    memset(conv_buffer_data + output_row_offset, byte_zero,
-           (top_row_elements * sizeof(T)));
+#ifdef USE_NEON
+inline void FullyConnectedAsGEMV(
+    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, uint8* output_data,
+    const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                       ArraySize(output_dims, 3),
+                   1);
+  const int input_size = input_dims.strides[3];
+  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  static constexpr int kPeel = 4;
+  for (int k = 0; k < input_size; k += 64) {
+    optimized_ops_preload_l1_stream(input_data + k);
   }
-
-  // If the patch is on the interior of the input image horizontally, just copy
-  // over the rows sequentially, otherwise add zero padding at the start or end.
-  if ((left_padding == 0) && (right_padding == 0)) {
-    for (int ih = ih_start; ih < ih_end; ++ih) {
-      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
-             single_row_num * sizeof(T));
-      out_offset += kwidth_times_indepth;
-      in_offset += inwidth_times_indepth;
+  for (int k = 0; k < kPeel * input_size; k += 64) {
+    optimized_ops_preload_l1_stream(filter_data + k);
+  }
+  TFLITE_DCHECK(!(output_size % kPeel));
+  const int32* bias_ptr = bias_data;
+  uint8* output_ptr = output_data;
+  for (int out = 0; out < output_size; out += kPeel) {
+    int32x4_t acc[kPeel];
+    for (int k = 0; k < kPeel; k++) {
+      acc[k] = vdupq_n_s32(0);
     }
-  } else {
-    for (int ih = ih_start; ih < ih_end; ++ih) {
-      if (left_padding > 0) {
-        const int left_start = (out_offset - (left_padding * in_depth));
-        memset(conv_buffer_data + left_start, byte_zero,
-               (left_padding * in_depth * sizeof(T)));
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+    int in = 0;
+    for (; in <= input_size - 16; in += 16) {
+      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
+      uint8x16_t filter_val_u8[kPeel];
+      for (int k = 0; k < kPeel; k++) {
+        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
+        filter_val_u8[k] = vld1q_u8(filter_ptr);
+        optimized_ops_preload_l1_stream(filter_ptr + 64);
       }
-      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
-             single_row_num * sizeof(T));
-      if (right_padding > 0) {
-        const int right_start = (out_offset + single_row_num);
-        memset(conv_buffer_data + right_start, byte_zero,
-               (right_padding * in_depth * sizeof(T)));
+      int16x8_t input_val[2];
+      const uint8x8_t low = vget_low_u8(input_val_u8);
+      const uint8x8_t high = vget_high_u8(input_val_u8);
+      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
+      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
+      int16x8_t filter_val[kPeel][2];
+      for (int k = 0; k < kPeel; k++) {
+        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
+        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
+        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
+        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
+        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
+        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
+      }
+      for (int p = 0; p < 2; p++) {
+        for (int k = 0; k < kPeel; k++) {
+          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
+                             vget_low_s16(input_val[p]));
+        }
+        for (int k = 0; k < kPeel; k++) {
+          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
+                             vget_high_s16(input_val[p]));
+        }
       }
-      out_offset += kwidth_times_indepth;
-      in_offset += inwidth_times_indepth;
     }
-  }
-
-  // If the bottom of the patch falls off the input image, pad the values
-  // representing those input rows with zeroes.
-  if (bottom_padding > 0) {
-    const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
-    const int bottom_start =
-        output_row_offset +
-        ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
-    memset(conv_buffer_data + bottom_start, byte_zero,
-           (bottom_row_elements * sizeof(T)));
-  }
-}
-
-template <typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
-            int stride_height, int pad_width, int pad_height, int kheight,
-            int kwidth, uint8 byte_zero, T* output_data,
-            const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Im2col");
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int input_depth = ArraySize(input_dims, 0);
-  const int input_width = ArraySize(input_dims, 1);
-  const int input_height = ArraySize(input_dims, 2);
-  const int output_depth = ArraySize(output_dims, 0);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-
-  int buffer_id = 0;
-  // Loop over the output nodes.
-  for (int b = 0; b < batches; ++b) {
-    for (int h = 0; h < output_height; ++h) {
-      for (int w = 0; w < output_width; ++w) {
-        ExtractPatchIntoBufferColumn(
-            input_dims, w, h, b, kheight, kwidth, stride_width, stride_height,
-            pad_width, pad_height, input_width, input_height, input_depth,
-            output_depth, buffer_id, input_data, output_data, byte_zero);
-        ++buffer_id;
+    for (; in <= input_size - 8; in += 8) {
+      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
+      uint8x8_t filter_val_u8[kPeel];
+      for (int k = 0; k < kPeel; k++) {
+        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
+        filter_val_u8[k] = vld1_u8(filter_ptr);
+      }
+      int16x8_t input_val;
+      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+      input_val = vaddq_s16(input_val, input_offset_vec);
+      int16x8_t filter_val[kPeel];
+      for (int k = 0; k < kPeel; k++) {
+        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
+        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
+      }
+      for (int k = 0; k < kPeel; k++) {
+        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
+                           vget_low_s16(input_val));
+      }
+      for (int k = 0; k < kPeel; k++) {
+        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
+                           vget_high_s16(input_val));
+      }
+    }
+    if (in < input_size) {
+      int32 buf[4 * kPeel];
+      for (int k = 0; k < 4; k++) {
+        vst1q_s32(buf + 4 * k, acc[k]);
+      }
+      for (; in < input_size; in++) {
+        int lane = (in + 8 - input_size) % 4;
+        const int32 input_val = input_data[in] + input_offset;
+        for (int k = 0; k < kPeel; k++) {
+          int32 filter_val =
+              filter_data[in + (out + k) * input_size] + filter_offset;
+          buf[lane + 4 * k] += filter_val * input_val;
+        }
+      }
+      for (int k = 0; k < 4; k++) {
+        acc[k] = vld1q_s32(buf + 4 * k);
       }
     }
-  }
-}
 
-// legacy, for compatibility with old checked-in code
-template <typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
-            int pad_width, int pad_height, int kheight, int kwidth,
-            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
-  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
-         kwidth, byte_zero, output_data, output_dims);
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc[kPeel];
+    for (int k = 0; k < kPeel; k++) {
+      pairwise_reduced_acc[k] =
+          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
+    }
+    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
+    reduced = vaddq_s32(reduced, bias_vec);
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, output_shift);
+    // Add the output offset.
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    reduced = vaddq_s32(reduced, output_offset_vec);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    // Narrow values down to 8 bit unsigned, saturating.
+    uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
+    // Apply the clamping from the activation function
+    res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
+    res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
+    // Store results to destination. Assumes 32bit alignment.
+    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
+                  vreinterpret_u32_u8(res8), 0);
+    output_ptr += kPeel;
+  }
 }
+#endif  // USE_NEON
 
-inline void Conv(const float* input_data, const Dims<4>& input_dims,
-                 const float* filter_data, const Dims<4>& filter_dims,
-                 const float* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, float output_activation_min,
-                 float output_activation_max, float* output_data,
-                 const Dims<4>& output_dims, float* im2col_data,
-                 const Dims<4>& im2col_dims) {
-  (void)im2col_data;
-  (void)im2col_dims;
-  gemmlowp::ScopedProfilingLabel label("Conv");
-
-  const float* gemm_input_data = nullptr;
-  const Dims<4>* gemm_input_dims = nullptr;
-  const int filter_width = ArraySize(filter_dims, 1);
-  const int filter_height = ArraySize(filter_dims, 2);
-  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
-                           filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
-    TFLITE_DCHECK(im2col_data);
-    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
-           pad_height, filter_height, filter_width, 0, im2col_data,
-           im2col_dims);
-    gemm_input_data = im2col_data;
-    gemm_input_dims = &im2col_dims;
-  } else {
-    // TODO(aselle): We need to make sure to not send im2col if it is not
-    // needed.
-    TFLITE_DCHECK(!im2col_data);
-    gemm_input_data = input_data;
-    gemm_input_dims = &input_dims;
+struct GemmlowpOutputPipeline {
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<
+      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+      gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
+      Pipeline;
+  static Pipeline Make(const int32* bias_data, int output_rows,
+                       int32 output_offset, int32 output_multiplier,
+                       int output_shift, int32 output_activation_min,
+                       int32 output_activation_max) {
+    ColVectorMap bias_vector(bias_data, output_rows);
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+        quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_offset;
+    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
+    quantize_down_stage.result_shift = output_shift;
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = output_activation_min;
+    clamp_stage.max = output_activation_max;
+    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           clamp_stage, saturating_cast_stage);
   }
+};
 
-  const auto im2col_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims);
-  const auto filter_matrix_map =
-      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
-  auto output_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
-  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
-
-  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
-                                   output_dims, output_activation_min,
-                                   output_activation_max);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
-          int stride_height, int pad_width, int pad_height, float* output_data,
-          const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
-       stride_width, stride_height, pad_width, pad_height,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
-          const float* filter_data, const Dims<4>& filter_dims,
-          const float* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, float* output_data,
-          const Dims<4>& output_dims, float* im2col_data,
-          const Dims<4>& im2col_dims) {
-  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
-           bias_dims, stride, stride, pad_width, pad_height, output_data,
-           output_dims, im2col_data, im2col_dims);
-}
-
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  gemmlowp::ScopedProfilingLabel label("Conv/8bit");
-
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
-  const uint8* gemm_input_data = nullptr;
-  const Dims<4>* gemm_input_dims = nullptr;
-  const int filter_width = ArraySize(filter_dims, 1);
-  const int filter_height = ArraySize(filter_dims, 2);
-  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
-                           filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
-    TFLITE_DCHECK(im2col_data);
-    const int input_zero_point = -input_offset;
-    TFLITE_DCHECK_GE(input_zero_point, 0);
-    TFLITE_DCHECK_LE(input_zero_point, 255);
-    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
-           pad_height, filter_height, filter_width, input_zero_point,
-           im2col_data, im2col_dims);
-    gemm_input_data = im2col_data;
-    gemm_input_dims = &im2col_dims;
-  } else {
-    TFLITE_DCHECK(!im2col_data);
-    gemm_input_data = input_data;
-    gemm_input_dims = &input_dims;
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+#ifdef USE_NEON
+  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  if (batches == 1 && !(output_size % 4)) {
+    return FullyConnectedAsGEMV(
+        input_data, input_dims, input_offset, filter_data, filter_dims,
+        filter_offset, bias_data, bias_dims, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_data,
+        output_dims);
   }
-
-  const int gemm_input_rows = gemm_input_dims->sizes[0];
-  const int gemm_input_cols = gemm_input_dims->sizes[1] *
-                              gemm_input_dims->sizes[2] *
-                              gemm_input_dims->sizes[3];
-  const int filter_rows = filter_dims.sizes[3];
-  const int filter_cols =
-      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+#endif  // USE_NEON
+  const int filter_rows = filter_dims.sizes[1];
+  const int filter_cols = filter_dims.sizes[0];
+  TFLITE_DCHECK_EQ(filter_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(filter_dims.sizes[3], 1);
   const int output_rows = output_dims.sizes[0];
-  const int output_cols =
-      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
   TFLITE_DCHECK_EQ(output_rows, filter_rows);
-  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
-  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
   TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
   TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
   TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
   TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, filter_rows, filter_cols);
+      filter_data, output_rows, filter_cols, filter_cols);
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      gemm_input_data, gemm_input_rows, gemm_input_cols);
+      input_data, filter_cols, batches, filter_cols);
   gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, output_cols);
+      output_data, output_rows, batches, output_rows);
   const auto& output_pipeline = GemmlowpOutputPipeline::Make(
       bias_data, output_rows, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max);
@@ -905,671 +1103,988 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
       input_offset, output_pipeline);
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
-                 gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
+inline void FullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+    const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected/Uint8Int16");
+  // This is a copy of the reference implementation. We do not currently have a
+  // properly optimized version.
+  (void)gemm_context;  // only used in properly optimized code.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(filter_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+
+  // Implementation of the fully connected node suited to the inside of an LSTM
+  // cell. The operands are 8-bit integers, the accumulators are internally
+  // 32bit integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+#ifdef GEMMLOWP_NEON
+  if (batches == 1 && input_offset == -128 && output_activation_min == -32768 &&
+      output_activation_max == 32767) {
+    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) {
+      GEMVForLstmCellWithSymmetricRange(input_data, input_dims, filter_data,
+                                        filter_dims, bias_data_int32, bias_dims,
+                                        output_multiplier, -output_shift,
+                                        output_data, output_dims);
+      return;
+    }
+    if (!(output_depth % 4) && !(accum_depth % 8)) {
+      GEMVForLstmCell(input_data, input_dims, filter_data, filter_dims,
+                      filter_offset, bias_data_int32, bias_dims,
+                      output_multiplier, -output_shift, output_data,
+                      output_dims);
+      return;
+    }
   }
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride_width, stride_height,
-       pad_width, pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
+#endif
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> weights_matrix(
+      filter_data, output_depth, accum_depth);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, accum_depth, batches);
+  gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_depth, batches);
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  ColVectorMap bias_vector(bias_data_int32, output_depth);
+  gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+  bias_addition_stage.bias_vector = bias_vector;
+  gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
+  scale_stage.result_offset_after_shift = 0;
+  scale_stage.result_fixedpoint_multiplier = output_multiplier;
+  // Note that this shift is negated wrt ordinary FC.
+  scale_stage.result_exponent = -output_shift;
+  gemmlowp::OutputStageClamp clamp_stage;
+  clamp_stage.min = output_activation_min;
+  clamp_stage.max = output_activation_max;
+  gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
+  auto output_pipeline =
+      std::make_tuple(bias_addition_stage, scale_stage, clamp_stage,
+                      saturating_cast_int16_stage);
+  gemmlowp::GemmWithOutputPipeline<uint8, int16,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, weights_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
 }
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void Conv(const uint8* input_data, const Dims<4>& input_dims,
-          int32 input_offset, const uint8* filter_data,
-          const Dims<4>& filter_dims, int32 filter_offset,
-          const int32* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, int32 output_offset,
-          int32 output_multiplier, int output_shift,
-          int32 output_activation_min, int32 output_activation_max,
-          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
-          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_offset, const uint8* filter_data,
+                    const Dims<4>& filter_dims, int32 filter_offset,
+                    const int32* bias_data, const Dims<4>& bias_dims,
+                    int32 output_offset, int32 output_multiplier,
+                    int output_shift, int32 output_activation_min,
+                    int32 output_activation_max, uint8* output_data,
+                    const Dims<4>& output_dims,
+                    gemmlowp::GemmContext* gemm_context) {
   static_assert(Ac == FusedActivationFunctionType::kNone ||
                     Ac == FusedActivationFunctionType::kRelu ||
                     Ac == FusedActivationFunctionType::kRelu6 ||
                     Ac == FusedActivationFunctionType::kRelu1,
                 "");
-  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
-       filter_offset, bias_data, bias_dims, stride, stride, pad_width,
-       pad_height, output_offset, output_multiplier, output_shift,
-       output_activation_min, output_activation_max, output_data, output_dims,
-       im2col_data, im2col_dims, gemm_context);
+  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+                 filter_offset, bias_data, bias_dims, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data, output_dims, gemm_context);
 }
 
-template <typename T>
-inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
-                         int block_size, T* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("DepthToSpace");
-
-  const int input_depth = ArraySize(input_dims, 0);
-  const int input_width = ArraySize(input_dims, 1);
-  const int input_height = ArraySize(input_dims, 2);
-
-  const int output_depth = ArraySize(output_dims, 0);
-  const int batch_size = ArraySize(output_dims, 3);
-
-  // Number of continuous values that we can copy in one interation.
-  const int stride = block_size * output_depth;
-
-  for (int batch = 0; batch < batch_size; ++batch) {
-    for (int in_h = 0; in_h < input_height; ++in_h) {
-      const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
-      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
-        const T* src = input_ptr;
-        for (int in_w = 0; in_w < input_width; ++in_w) {
-          memcpy(output_data, src, stride * sizeof(T));
-          output_data += stride;
-          src += input_depth;
-        }
-        input_ptr += stride;
+// Internal function doing the actual arithmetic work for
+// ExperimentalShuffledFullyConnected.
+// May be called either directly by it (single-threaded case) or may be used
+// as the 'task' for worker threads to run (multi-threaded case, see
+// ExperimentalShuffledFullyConnectedWorkerTask below).
+inline void ExperimentalShuffledFullyConnectedWorkerImpl(
+    const uint8* shuffled_input_workspace_data,
+    const int8* shuffled_weights_data, int batches, int output_depth,
+    int output_stride, int accum_depth, const int32* bias_data,
+    int32 output_multiplier, int output_shift, int16* output_data) {
+#if defined USE_NEON
+  const int8* shuffled_weights_ptr = shuffled_weights_data;
+  if (batches == 1) {
+    const int right_shift = output_shift > 0 ? output_shift : 0;
+    const int left_shift = output_shift > 0 ? 0 : -output_shift;
+    for (int c = 0; c < output_depth; c += 4) {
+      // Accumulation loop.
+      int32x4_t row_accum0 = vdupq_n_s32(0);
+      int32x4_t row_accum1 = vdupq_n_s32(0);
+      int32x4_t row_accum2 = vdupq_n_s32(0);
+      int32x4_t row_accum3 = vdupq_n_s32(0);
+      for (int d = 0; d < accum_depth; d += 16) {
+        int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
+        int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
+        int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
+        int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
+        shuffled_weights_ptr += 64;
+        int8x16_t input =
+            vreinterpretq_s8_u8(vld1q_u8(shuffled_input_workspace_data + d));
+        int16x8_t local_accum0 =
+            vmull_s8(vget_low_s8(weights0), vget_low_s8(input));
+        int16x8_t local_accum1 =
+            vmull_s8(vget_low_s8(weights1), vget_low_s8(input));
+        int16x8_t local_accum2 =
+            vmull_s8(vget_low_s8(weights2), vget_low_s8(input));
+        int16x8_t local_accum3 =
+            vmull_s8(vget_low_s8(weights3), vget_low_s8(input));
+        local_accum0 =
+            vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input));
+        local_accum1 =
+            vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input));
+        local_accum2 =
+            vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input));
+        local_accum3 =
+            vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input));
+        row_accum0 = vpadalq_s16(row_accum0, local_accum0);
+        row_accum1 = vpadalq_s16(row_accum1, local_accum1);
+        row_accum2 = vpadalq_s16(row_accum2, local_accum2);
+        row_accum3 = vpadalq_s16(row_accum3, local_accum3);
       }
+      // Horizontally reduce accumulators
+      int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+          pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+      pairwise_reduced_acc_0 =
+          vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0));
+      pairwise_reduced_acc_1 =
+          vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1));
+      pairwise_reduced_acc_2 =
+          vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2));
+      pairwise_reduced_acc_3 =
+          vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3));
+      const int32x2_t reduced_lo =
+          vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+      const int32x2_t reduced_hi =
+          vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+      int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+      // Add bias values.
+      int32x4_t bias_vec = vld1q_s32(bias_data + c);
+      reduced = vaddq_s32(reduced, bias_vec);
+      reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+      // Multiply by the fixed-point multiplier.
+      reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+      // Rounding-shift-right.
+      using gemmlowp::RoundingDivideByPOT;
+      reduced = RoundingDivideByPOT(reduced, right_shift);
+      // Narrow values down to 16 bit signed.
+      const int16x4_t res16 = vqmovn_s32(reduced);
+      vst1_s16(output_data + c, res16);
     }
+  } else if (batches == 4) {
+    const int right_shift = output_shift > 0 ? output_shift : 0;
+    const int left_shift = output_shift > 0 ? 0 : -output_shift;
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr =
+          reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+      // Accumulation loop.
+      int32x4_t row_accum00 = vdupq_n_s32(0);
+      int32x4_t row_accum10 = vdupq_n_s32(0);
+      int32x4_t row_accum20 = vdupq_n_s32(0);
+      int32x4_t row_accum30 = vdupq_n_s32(0);
+      int32x4_t row_accum01 = vdupq_n_s32(0);
+      int32x4_t row_accum11 = vdupq_n_s32(0);
+      int32x4_t row_accum21 = vdupq_n_s32(0);
+      int32x4_t row_accum31 = vdupq_n_s32(0);
+      int32x4_t row_accum02 = vdupq_n_s32(0);
+      int32x4_t row_accum12 = vdupq_n_s32(0);
+      int32x4_t row_accum22 = vdupq_n_s32(0);
+      int32x4_t row_accum32 = vdupq_n_s32(0);
+      int32x4_t row_accum03 = vdupq_n_s32(0);
+      int32x4_t row_accum13 = vdupq_n_s32(0);
+      int32x4_t row_accum23 = vdupq_n_s32(0);
+      int32x4_t row_accum33 = vdupq_n_s32(0);
+      for (int d = 0; d < accum_depth; d += 16) {
+        int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
+        int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
+        int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
+        int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
+        shuffled_weights_ptr += 64;
+        int8x16_t input0 = vld1q_s8(shuffled_input_ptr + 0);
+        int8x16_t input1 = vld1q_s8(shuffled_input_ptr + 16);
+        int8x16_t input2 = vld1q_s8(shuffled_input_ptr + 32);
+        int8x16_t input3 = vld1q_s8(shuffled_input_ptr + 48);
+        shuffled_input_ptr += 64;
+        int16x8_t local_accum0, local_accum1, local_accum2, local_accum3;
+#define TFLITE_SHUFFLED_FC_ACCUM(B)                                           \
+  local_accum0 = vmull_s8(vget_low_s8(weights0), vget_low_s8(input##B));      \
+  local_accum1 = vmull_s8(vget_low_s8(weights1), vget_low_s8(input##B));      \
+  local_accum2 = vmull_s8(vget_low_s8(weights2), vget_low_s8(input##B));      \
+  local_accum3 = vmull_s8(vget_low_s8(weights3), vget_low_s8(input##B));      \
+  local_accum0 =                                                              \
+      vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input##B)); \
+  local_accum1 =                                                              \
+      vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input##B)); \
+  local_accum2 =                                                              \
+      vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input##B)); \
+  local_accum3 =                                                              \
+      vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input##B)); \
+  row_accum0##B = vpadalq_s16(row_accum0##B, local_accum0);                   \
+  row_accum1##B = vpadalq_s16(row_accum1##B, local_accum1);                   \
+  row_accum2##B = vpadalq_s16(row_accum2##B, local_accum2);                   \
+  row_accum3##B = vpadalq_s16(row_accum3##B, local_accum3);
+
+        TFLITE_SHUFFLED_FC_ACCUM(0)
+        TFLITE_SHUFFLED_FC_ACCUM(1)
+        TFLITE_SHUFFLED_FC_ACCUM(2)
+        TFLITE_SHUFFLED_FC_ACCUM(3)
+
+#undef TFLITE_SHUFFLED_FC_ACCUM
+      }
+      // Horizontally reduce accumulators
+
+#define TFLITE_SHUFFLED_FC_STORE(B)                                           \
+  {                                                                           \
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,                 \
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;                       \
+    pairwise_reduced_acc_0 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum0##B), vget_high_s32(row_accum0##B)); \
+    pairwise_reduced_acc_1 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum1##B), vget_high_s32(row_accum1##B)); \
+    pairwise_reduced_acc_2 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum2##B), vget_high_s32(row_accum2##B)); \
+    pairwise_reduced_acc_3 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum3##B), vget_high_s32(row_accum3##B)); \
+    const int32x2_t reduced_lo =                                              \
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);            \
+    const int32x2_t reduced_hi =                                              \
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);            \
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);                 \
+    int32x4_t bias_vec = vld1q_s32(bias_data + c);                            \
+    reduced = vaddq_s32(reduced, bias_vec);                                   \
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));                    \
+    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);                    \
+    using gemmlowp::RoundingDivideByPOT;                                      \
+    reduced = RoundingDivideByPOT(reduced, right_shift);                      \
+    const int16x4_t res16 = vqmovn_s32(reduced);                              \
+    vst1_s16(output_data + c + B * output_stride, res16);                     \
   }
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
-            int pad_width, int pad_height, int kheight, int kwidth,
-            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
-  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
-         kwidth, byte_zero, output_data, output_dims);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
-                const float* filter_data, const Dims<4>& filter_dims,
-                const float* bias_data, const Dims<4>& bias_dims,
-                float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
-
-  const auto input_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
-  const auto filter_matrix_map =
-      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
-  auto output_matrix_map =
-      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
-  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
-
-  AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
-                                       output_dims);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
-                int32 input_offset, const uint8* filter_data,
-                const Dims<4>& filter_dims, int32 filter_offset,
-                const int32* bias_data, const Dims<4>& bias_dims,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims,
-                gemmlowp::GemmContext* gemm_context) {
-  gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  const int input_rows = input_dims.sizes[0];
-  const int input_cols =
-      input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
-  const int filter_rows = filter_dims.sizes[3];
-  const int filter_cols =
-      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
-  const int output_rows = output_dims.sizes[0];
-  const int output_cols =
-      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
-  TFLITE_DCHECK_EQ(output_rows, filter_rows);
-  TFLITE_DCHECK_EQ(output_cols, input_cols);
-  TFLITE_DCHECK_EQ(filter_cols, input_rows);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
-  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, output_rows, filter_cols, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-      input_data, filter_cols, output_cols, filter_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
-      output_data, output_rows, output_cols, output_rows);
-  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
-      bias_data, output_rows, output_offset, output_multiplier, output_shift,
-      output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
-      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
-      input_offset, output_pipeline);
-}
-
-template <typename T>
-inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
-                         int block_size, T* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("SpaceToDepth");
 
-  const int output_depth = ArraySize(output_dims, 0);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-
-  const int input_depth = ArraySize(input_dims, 0);
-  const int batch_size = ArraySize(input_dims, 3);
-
-  // Number of continuous values that we can copy in one interation.
-  const int stride = block_size * input_depth;
+      TFLITE_SHUFFLED_FC_STORE(0);
+      TFLITE_SHUFFLED_FC_STORE(1);
+      TFLITE_SHUFFLED_FC_STORE(2);
+      TFLITE_SHUFFLED_FC_STORE(3);
 
-  for (int batch = 0; batch < batch_size; ++batch) {
-    for (int out_h = 0; out_h < output_height; ++out_h) {
-      T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
-      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
-        T* dst = output_ptr;
-        for (int out_w = 0; out_w < output_width; ++out_w) {
-          memcpy(dst, input_data, stride * sizeof(T));
-          input_data += stride;
-          dst += output_depth;
-        }
-        output_ptr += stride;
-      }
+#undef TFLITE_SHUFFLED_FC_STORE
     }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
   }
-}
-
-template <FusedActivationFunctionType Ac>
-void NonGlobalBatchNormalization(
-    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
-    const Dims<4>& mean_dims, const float* multiplier_data,
-    const Dims<4>& multiplier_dims, const float* offset_data,
-    const Dims<4>& offset_dims, float* output_data,
-    const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
-                        offset_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
-                        offset_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, x, y, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
-              offset_data[Offset(offset_dims, c, x, y, 0)]);
+#else
+  if (batches == 1) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8 input_val = shuffled_input_data[d + j];
+            int8 weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
         }
       }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32 accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                            -output_shift);
+        // Saturate, cast to int16, and store to output array.
+        acc = std::max(acc, -32768);
+        acc = std::min(acc, 32767);
+        output_ptr[c + i] = acc;
+      }
     }
-  }
-}
-
-template <FusedActivationFunctionType Ac>
-void GlobalBatchNormalization(const float* input_data,
-                              const Dims<4>& input_dims, const float* mean_data,
-                              const Dims<4>& mean_dims,
-                              const float* multiplier_data,
-                              const Dims<4>& multiplier_dims,
-                              const float* offset_data,
-                              const Dims<4>& offset_dims, float* output_data,
-                              const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
-              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
+  } else if (batches == 4) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr = shuffled_input_data;
+      // Accumulation loop.
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8 input_val = shuffled_input_ptr[16 * b + j];
+              int8 weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32 accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              -output_shift);
+          // Saturate, cast to int16, and store to output array.
+          acc = std::max(acc, -32768);
+          acc = std::min(acc, 32767);
+          output_ptr[b * output_stride + c + i] = acc;
         }
       }
     }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
   }
+#endif
 }
 
-inline void Relu(const float* input_data, const Dims<4>& input_dims,
-                 float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Relu (not fused)");
+// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class
+// to allow using gemmlowp's threadpool.
+struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
+  ExperimentalShuffledFullyConnectedWorkerTask(
+      const uint8* input_data, const int8* shuffled_weights_data, int batches,
+      int output_depth, int output_stride, int accum_depth,
+      const int32* bias_data, int32 output_multiplier, int output_shift,
+      int16* output_data)
+      : input_data_(input_data),
+        shuffled_weights_data_(shuffled_weights_data),
+        batches_(batches),
+        output_depth_(output_depth),
+        output_stride_(output_stride),
+        accum_depth_(accum_depth),
+        bias_data_(bias_data),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_data_(output_data) {}
+
+  void Run() override {
+    ExperimentalShuffledFullyConnectedWorkerImpl(
+        input_data_, shuffled_weights_data_, batches_, output_depth_,
+        output_stride_, accum_depth_, bias_data_, output_multiplier_,
+        output_shift_, output_data_);
+  }
 
-  const auto input = MapAsVector(input_data, input_dims);
-  auto output = MapAsVector(output_data, output_dims);
-  output = input.cwiseMax(0.0f);
-}
+  const uint8* input_data_;
+  const int8* shuffled_weights_data_;
+  int batches_;
+  int output_depth_;
+  int output_stride_;
+  int accum_depth_;
+  const int32* bias_data_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int16* output_data_;
+};
 
-inline void Relu1(const float* input_data, const Dims<4>& input_dims,
-                  float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 1;
-          const float lower = -1;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label(
+      "ExperimentalShuffledFullyConnected/8bit");
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_EQ(output_activation_min, -32768);
+  TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* int8_shuffled_weights_data =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+
+  // Shuffling and xoring of input activations into the workspace buffer
+  if (batches == 1) {
+#ifdef USE_NEON
+    const uint8x16_t signbit = vdupq_n_u8(0x80);
+    for (int i = 0; i < accum_depth; i += 16) {
+      uint8x16_t val = vld1q_u8(input_data + i);
+      val = veorq_u8(val, signbit);
+      vst1q_u8(shuffled_input_workspace_data + i, val);
+    }
+#else
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+#endif
+  } else if (batches == 4) {
+    uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+    int c = 0;
+#ifdef USE_NEON
+    const uint8x16_t signbit = vdupq_n_u8(0x80);
+    for (c = 0; c < accum_depth; c += 16) {
+      const uint8* src_data_ptr = input_data + c;
+      uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth);
+      uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth);
+      uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth);
+      uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth);
+      val0 = veorq_u8(val0, signbit);
+      val1 = veorq_u8(val1, signbit);
+      val2 = veorq_u8(val2, signbit);
+      val3 = veorq_u8(val3, signbit);
+      vst1q_u8(shuffled_input_workspace_ptr + 0, val0);
+      vst1q_u8(shuffled_input_workspace_ptr + 16, val1);
+      vst1q_u8(shuffled_input_workspace_ptr + 32, val2);
+      vst1q_u8(shuffled_input_workspace_ptr + 48, val3);
+      shuffled_input_workspace_ptr += 64;
+    }
+#else
+    for (c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8* src_data_ptr = input_data + b * accum_depth + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
         }
       }
     }
+#endif
+  } else {
+    TFLITE_DCHECK(false);
+    return;
   }
-}
 
-inline void Relu6(const float* input_data, const Dims<4>& input_dims,
-                  float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 6;
-          const float lower = 0;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
+  static constexpr int kKernelRows = 4;
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemm_context->max_num_threads(), output_depth, batches, accum_depth);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    ExperimentalShuffledFullyConnectedWorkerImpl(
+        shuffled_input_workspace_data, int8_shuffled_weights_data, batches,
+        output_depth, output_depth, accum_depth, bias_data, output_multiplier,
+        output_shift, output_data);
+    return;
+  }
+
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker =
+      gemmlowp::RoundUp<kKernelRows>(output_depth / thread_count);
+  int row_start = 0;
+  for (int i = 0; i < thread_count; i++) {
+    int row_end = std::min(output_depth, row_start + kRowsPerWorker);
+    tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask(
+        shuffled_input_workspace_data,
+        int8_shuffled_weights_data + row_start * accum_depth, batches,
+        row_end - row_start, output_depth, accum_depth, bias_data + row_start,
+        output_multiplier, output_shift, output_data + row_start);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_depth);
+  gemm_context->workers_pool()->Execute(tasks);
+}
+
+template <typename T>
+inline void ExtractPatchIntoBufferColumn(
+    const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
+    int stride_width, int stride_height, int pad_width, int pad_height,
+    int in_width, int in_height, int in_depth, int single_buffer_length,
+    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) {
+  gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
+  // This chunk of code reshapes all the inputs corresponding to
+  // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
+  const int kwidth_times_indepth = kwidth * in_depth;
+  const int inwidth_times_indepth = in_width * in_depth;
+  const int ih_ungated_start = h * stride_height - pad_height;
+  const int ih_ungated_end = (ih_ungated_start + kheight);
+  const int ih_end = std::min(ih_ungated_end, in_height);
+  const int iw_ungated_start = w * stride_width - pad_width;
+  const int iw_ungated_end = (iw_ungated_start + kwidth);
+  const int iw_end = std::min(iw_ungated_end, in_width);
+  // If the patch is off the edge of the input image, skip writing those rows
+  // and columns from the patch into the output array.
+  const int h_offset = std::max(0, -ih_ungated_start);
+  const int w_offset = std::max(0, -iw_ungated_start);
+  const int ih_start = std::max(0, ih_ungated_start);
+  const int iw_start = std::max(0, iw_ungated_start);
+  const int single_row_num =
+      std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
+  const int output_row_offset = (buffer_id * single_buffer_length);
+  int out_offset =
+      output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
+  int in_offset = Offset(input_dims, 0, iw_start, ih_start, b);
+
+  // Express all of the calculations as padding around the input patch.
+  const int top_padding = h_offset;
+  const int bottom_padding = (ih_ungated_end - ih_end);
+  const int left_padding = w_offset;
+  const int right_padding = (iw_ungated_end - iw_end);
+  assert(single_row_num ==
+         ((kwidth - (left_padding + right_padding)) * in_depth));
+
+  // Write out zeroes to the elements representing the top rows of the input
+  // patch that are off the edge of the input image.
+  if (top_padding > 0) {
+    const int top_row_elements = (top_padding * kwidth * in_depth);
+    memset(conv_buffer_data + output_row_offset, byte_zero,
+           (top_row_elements * sizeof(T)));
+  }
+
+  // If the patch is on the interior of the input image horizontally, just copy
+  // over the rows sequentially, otherwise add zero padding at the start or end.
+  if ((left_padding == 0) && (right_padding == 0)) {
+    for (int ih = ih_start; ih < ih_end; ++ih) {
+      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+             single_row_num * sizeof(T));
+      out_offset += kwidth_times_indepth;
+      in_offset += inwidth_times_indepth;
+    }
+  } else {
+    for (int ih = ih_start; ih < ih_end; ++ih) {
+      if (left_padding > 0) {
+        const int left_start = (out_offset - (left_padding * in_depth));
+        memset(conv_buffer_data + left_start, byte_zero,
+               (left_padding * in_depth * sizeof(T)));
+      }
+      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+             single_row_num * sizeof(T));
+      if (right_padding > 0) {
+        const int right_start = (out_offset + single_row_num);
+        memset(conv_buffer_data + right_start, byte_zero,
+               (right_padding * in_depth * sizeof(T)));
       }
+      out_offset += kwidth_times_indepth;
+      in_offset += inwidth_times_indepth;
     }
   }
+
+  // If the bottom of the patch falls off the input image, pad the values
+  // representing those input rows with zeroes.
+  if (bottom_padding > 0) {
+    const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
+    const int bottom_start =
+        output_row_offset +
+        ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+    memset(conv_buffer_data + bottom_start, byte_zero,
+           (bottom_row_elements * sizeof(T)));
+  }
 }
 
-template <FusedActivationFunctionType Ac>
-void L2Normalization(const float* input_data, const Dims<4>& input_dims,
-                     float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("L2Normalization");
-  static_assert(Ac == FusedActivationFunctionType::kNone, "");
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
+            int stride_height, int pad_width, int pad_height, int kheight,
+            int kwidth, uint8 byte_zero, T* output_data,
+            const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Im2col");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_height = ArraySize(input_dims, 2);
+  const int output_depth = ArraySize(output_dims, 0);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+
+  int buffer_id = 0;
+  // Loop over the output nodes.
   for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        float squared_l2_norm = 0;
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          squared_l2_norm += val * val;
-        }
-        float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
-        }
+    for (int h = 0; h < output_height; ++h) {
+      for (int w = 0; w < output_width; ++w) {
+        ExtractPatchIntoBufferColumn(
+            input_dims, w, h, b, kheight, kwidth, stride_width, stride_height,
+            pad_width, pad_height, input_width, input_height, input_depth,
+            output_depth, buffer_id, input_data, output_data, byte_zero);
+        ++buffer_id;
       }
     }
   }
 }
 
-inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
-                                          int* output_shift) {
-  *output_shift = 11;
-  while (input >= (1 << 29)) {
-    input /= 4;
-    ++*output_shift;
-  }
-  TFLITE_DCHECK_GT(input, 0);
-  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
-  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
-  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
-  *output_shift -= left_shift_bit_pairs;
-  input <<= 2 * left_shift_bit_pairs;
-  TFLITE_DCHECK_GE(input, (1 << 27));
-  TFLITE_DCHECK_LT(input, (1 << 29));
-  using gemmlowp::FixedPoint;
-  using gemmlowp::Rescale;
-  using gemmlowp::SaturatingRoundingMultiplyByPOT;
-  // Using 3 integer bits gives us enough room for the internal arithmetic in
-  // this Newton-Raphson iteration.
-  using F3 = FixedPoint<int32, 3>;
-  using F0 = FixedPoint<int32, 0>;
-  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
-  const F3 fixedpoint_half_input =
-      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
-  const F3 fixedpoint_half_three =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
-  // Newton-Raphson iteration
-  // Naive unoptimized starting guess: x = 1
-  F3 x = F3::One();
-  // Naive unoptimized number of iterations: 5
-  for (int i = 0; i < 5; i++) {
-    const F3 x3 = Rescale<3>(x * x * x);
-    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
-  }
-  const F0 fixedpoint_half_sqrt_2 =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
-  x = x * fixedpoint_half_sqrt_2;
-  *output_inv_sqrt = x.raw();
-  if (*output_shift < 0) {
-    *output_inv_sqrt <<= -*output_shift;
-    *output_shift = 0;
-  }
+// legacy, for compatibility with old checked-in code
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int kheight, int kwidth,
+            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
+  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+         kwidth, byte_zero, output_data, output_dims);
 }
 
-inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
-                            int32 input_zero_point, uint8* output_data,
-                            const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
+inline void DilatedConv(const float* input_data, const Dims<4>& input_dims,
+                        const float* filter_data, const Dims<4>& filter_dims,
+                        const float* bias_data, const Dims<4>& bias_dims,
+                        int stride_width, int stride_height,
+                        int dilation_width_factor, int dilation_height_factor,
+                        int pad_width, int pad_height,
+                        float output_activation_min,
+                        float output_activation_max, float* output_data,
+                        const Dims<4>& output_dims, float* im2col_data,
+                        const Dims<4>& im2col_dims) {
+  gemmlowp::ScopedProfilingLabel label("DilatedConv");
+  // This is a copy of the reference Conv implementation. We do not currently
+  // have an optimized path for dilation.
+  (void)im2col_data;  // only used in optimized code.
+  (void)im2col_dims;  // only used in optimized code.
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(batches, 1);
-  TFLITE_DCHECK_EQ(height, 1);
-  TFLITE_DCHECK_EQ(width, 1);
-  int32 square_l2_norm = 0;
-  for (int i = 0; i < depth; i++) {
-    int32 diff = input_data[i] - input_zero_point;
-    square_l2_norm += diff * diff;
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
   }
-  int32 inv_l2norm_multiplier;
-  int inv_l2norm_shift;
-  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
-                                &inv_l2norm_shift);
-
-  for (int i = 0; i < depth; i++) {
-    int32 diff = input_data[i] - input_zero_point;
-    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
-        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
-    int32 unclamped_output_val = 128 + rescaled_diff;
-    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
-    output_data[i] = static_cast<uint8>(output_val);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          float total = 0.f;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  float input_value = input_data[Offset(input_dims, in_channel,
+                                                        in_x, in_y, batch)];
+                  float filter_value =
+                      filter_data[Offset(filter_dims, in_channel, filter_x,
+                                         filter_y, out_channel)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+          }
+          float bias_value = 0.0f;
+          if (bias_data) {
+            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
+          }
+          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
+              ActivationFunctionWithMinMax(total + bias_value,
+                                           output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
   }
 }
 
-inline void Add(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Add");
-  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
-                                              output_dims, 3);
-  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
-                                             output_dims, 2);
-  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
-                                            output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
-                                            output_dims, 0);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
-  int i = 0;
-  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
-#ifdef USE_NEON
-  const auto activation_min = vdupq_n_f32(output_activation_min);
-  const auto activation_max = vdupq_n_f32(output_activation_max);
-  for (; i <= size - 16; i += 16) {
-    auto a10 = vld1q_f32(input1_data + i);
-    auto a11 = vld1q_f32(input1_data + i + 4);
-    auto a12 = vld1q_f32(input1_data + i + 8);
-    auto a13 = vld1q_f32(input1_data + i + 12);
-    auto a20 = vld1q_f32(input2_data + i);
-    auto a21 = vld1q_f32(input2_data + i + 4);
-    auto a22 = vld1q_f32(input2_data + i + 8);
-    auto a23 = vld1q_f32(input2_data + i + 12);
-    auto x0 = vaddq_f32(a10, a20);
-    auto x1 = vaddq_f32(a11, a21);
-    auto x2 = vaddq_f32(a12, a22);
-    auto x3 = vaddq_f32(a13, a23);
-    x0 = vmaxq_f32(activation_min, x0);
-    x1 = vmaxq_f32(activation_min, x1);
-    x2 = vmaxq_f32(activation_min, x2);
-    x3 = vmaxq_f32(activation_min, x3);
-    x0 = vminq_f32(activation_max, x0);
-    x1 = vminq_f32(activation_max, x1);
-    x2 = vminq_f32(activation_max, x2);
-    x3 = vminq_f32(activation_max, x3);
-    vst1q_f32(output_data + i, x0);
-    vst1q_f32(output_data + i + 4, x1);
-    vst1q_f32(output_data + i + 8, x2);
-    vst1q_f32(output_data + i + 12, x3);
-  }
-  for (; i <= size - 4; i += 4) {
-    auto a1 = vld1q_f32(input1_data + i);
-    auto a2 = vld1q_f32(input2_data + i);
-    auto x = vaddq_f32(a1, a2);
-    x = vmaxq_f32(activation_min, x);
-    x = vminq_f32(activation_max, x);
-    vst1q_f32(output_data + i, x);
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 float output_activation_min, float output_activation_max,
+                 float* output_data, const Dims<4>& output_dims,
+                 float* im2col_data, const Dims<4>& im2col_dims) {
+  if ((dilation_width_factor != 1) || (dilation_height_factor != 1)) {
+    return DilatedConv(input_data, input_dims, filter_data, filter_dims,
+                       bias_data, bias_dims, stride_width, stride_height,
+                       dilation_width_factor, dilation_height_factor, pad_width,
+                       pad_height, output_activation_min, output_activation_max,
+                       output_data, output_dims, im2col_data, im2col_dims);
   }
-#endif  // NEON
 
-  for (; i < size; i++) {
-    auto x = input1_data[i] + input2_data[i];
-    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
-                                                  output_activation_max);
+  (void)im2col_data;
+  (void)im2col_dims;
+  gemmlowp::ScopedProfilingLabel label("Conv");
+
+  const float* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
+           pad_height, filter_height, filter_width, 0, im2col_data,
+           im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    // TODO(aselle): We need to make sure to not send im2col if it is not
+    // needed.
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
   }
+
+  const auto im2col_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims);
+  const auto filter_matrix_map =
+      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
+
+  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
+                                   output_dims, output_activation_min,
+                                   output_activation_max);
+}
+
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int dilation_width_factor,
+          int dilation_height_factor, int pad_width, int pad_height,
+          float* output_data, const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, dilation_width_factor,
+       dilation_height_factor, pad_width, pad_height, output_activation_min,
+       output_activation_max, output_data, output_dims, im2col_data,
+       im2col_dims);
 }
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
-         const float* input2_data, const Dims<4>& input2_dims,
-         float* output_data, const Dims<4>& output_dims) {
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
   float output_activation_min, output_activation_max;
   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, 1, 1, pad_width, pad_height,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims);
+}
 
-  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
-      output_activation_max, output_data, output_dims);
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
+           output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("Conv/8bit");
+
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  const uint8* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    const int input_zero_point = -input_offset;
+    TFLITE_DCHECK_GE(input_zero_point, 0);
+    TFLITE_DCHECK_LE(input_zero_point, 255);
+    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
+           pad_height, filter_height, filter_width, input_zero_point,
+           im2col_data, im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  const int gemm_input_rows = gemm_input_dims->sizes[0];
+  const int gemm_input_cols = gemm_input_dims->sizes[1] *
+                              gemm_input_dims->sizes[2] *
+                              gemm_input_dims->sizes[3];
+  const int filter_rows = filter_dims.sizes[3];
+  const int filter_cols =
+      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+  const int output_rows = output_dims.sizes[0];
+  const int output_cols =
+      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, filter_rows, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      gemm_input_data, gemm_input_rows, gemm_input_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols);
+  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
 }
 
+// legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
-                const Dims<4>& input1_dims, int32 input1_offset,
-                int32 input1_multiplier, int input1_shift,
-                const uint8* input2_data, const Dims<4>& input2_dims,
-                int32 input2_offset, int32 input2_multiplier, int input2_shift,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims) {
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
   static_assert(Ac == FusedActivationFunctionType::kNone ||
                     Ac == FusedActivationFunctionType::kRelu ||
                     Ac == FusedActivationFunctionType::kRelu6 ||
                     Ac == FusedActivationFunctionType::kRelu1,
                 "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   if (Ac == FusedActivationFunctionType::kNone) {
     TFLITE_DCHECK_EQ(output_activation_min, 0);
     TFLITE_DCHECK_EQ(output_activation_max, 255);
   }
-  gemmlowp::ScopedProfilingLabel label("Add/8bit");
-  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
-                                              output_dims, 3);
-  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
-                                             output_dims, 2);
-  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
-                                            output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
-                                            output_dims, 0);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
-  int i = 0;
-  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
-  TFLITE_DCHECK_GT(input1_offset, -256);
-  TFLITE_DCHECK_GT(input2_offset, -256);
-  TFLITE_DCHECK_LT(input1_offset, 256);
-  TFLITE_DCHECK_LT(input2_offset, 256);
-#ifdef USE_NEON
-  for (; i <= size - 8; i += 8) {
-    const auto input1_val_original = vld1_u8(input1_data + i);
-    const auto input2_val_original = vld1_u8(input2_data + i);
-    const auto input1_val_s16 =
-        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
-    const auto input2_val_s16 =
-        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
-    const auto input1_val =
-        vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
-    const auto input2_val =
-        vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
-    const auto input1_val_high = vget_high_s16(input1_val);
-    const auto input1_val_low = vget_low_s16(input1_val);
-    const auto input2_val_high = vget_high_s16(input2_val);
-    const auto input2_val_low = vget_low_s16(input2_val);
-    auto x11 = vmovl_s16(input1_val_low);
-    auto x12 = vmovl_s16(input1_val_high);
-    auto x21 = vmovl_s16(input2_val_low);
-    auto x22 = vmovl_s16(input2_val_high);
-    const auto left_shift_dup = vdupq_n_s32(left_shift);
-    x11 = vshlq_s32(x11, left_shift_dup);
-    x12 = vshlq_s32(x12, left_shift_dup);
-    x21 = vshlq_s32(x21, left_shift_dup);
-    x22 = vshlq_s32(x22, left_shift_dup);
-    x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
-    x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
-    x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
-    x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
-    const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
-    const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
-    x11 = vshlq_s32(x11, input1_shift_dup);
-    x12 = vshlq_s32(x12, input1_shift_dup);
-    x21 = vshlq_s32(x21, input2_shift_dup);
-    x22 = vshlq_s32(x22, input2_shift_dup);
-    auto s1 = vaddq_s32(x11, x21);
-    auto s2 = vaddq_s32(x12, x22);
-    s1 = vqrdmulhq_n_s32(s1, output_multiplier);
-    s2 = vqrdmulhq_n_s32(s2, output_multiplier);
-    using gemmlowp::RoundingDivideByPOT;
-    s1 = RoundingDivideByPOT(s1, output_shift);
-    s2 = RoundingDivideByPOT(s2, output_shift);
-    const auto s1_narrowed = vmovn_s32(s1);
-    const auto s2_narrowed = vmovn_s32(s2);
-    const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
-                             vdupq_n_s16(output_offset));
-    vst1_u8(output_data + i, vqmovun_s16(s));
-  }
-#endif  // NEON
-
-  for (; i < size; i++) {
-    const int32 input1_val = input1_offset + input1_data[i];
-    const int32 input2_val = input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << left_shift);
-    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
-        shifted_input1_val, input1_multiplier, input1_shift);
-    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
-        shifted_input2_val, input2_multiplier, input2_shift);
-    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-    const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
-                                 raw_sum, output_multiplier, output_shift) +
-                             output_offset;
-    const int32 clamped_output = std::min(
-        output_activation_max, std::max(output_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
-  }
-}
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
 
+// legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void Add(const int32* input1_data, const Dims<4>& input1_dims,
-         const int32* input2_data, const Dims<4>& input2_dims,
-         int32* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Add/int32");
-  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
-
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto input2_map = MapAsVector(input2_data, input2_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  if (AreSameDims(input1_dims, input2_dims)) {
-    output_map.array() = input1_map.array() + input2_map.array();
-  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
-    auto scalar = input2_data[0];
-    output_map.array() = input1_map.array() + scalar;
-  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
-    auto scalar = input1_data[0];
-    output_map.array() = scalar + input2_map.array();
-  } else {
-    // Should not come here.
-    TFLITE_DCHECK(false);
-  }
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+          int32 input_offset, const uint8* filter_data,
+          const Dims<4>& filter_dims, int32 filter_offset,
+          const int32* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, int32 output_offset,
+          int32 output_multiplier, int output_shift,
+          int32 output_activation_min, int32 output_activation_max,
+          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+       pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
 }
 
-// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
 template <typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("DepthToSpace");
 
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_height = ArraySize(input_dims, 2);
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+  const int output_depth = ArraySize(output_dims, 0);
+  const int batch_size = ArraySize(output_dims, 3);
+
+  // Number of continuous values that we can copy in one interation.
+  const int stride = block_size * output_depth;
+
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
+      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+        const T* src = input_ptr;
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          memcpy(output_data, src, stride * sizeof(T));
+          output_data += stride;
+          src += input_depth;
         }
+        input_ptr += stride;
       }
     }
   }
@@ -1577,792 +2092,1403 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac, typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int kheight, int kwidth,
+            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
+  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+         kwidth, byte_zero, output_data, output_dims);
 }
 
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
+                const float* filter_data, const Dims<4>& filter_dims,
+                const float* bias_data, const Dims<4>& bias_dims,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
 
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  const auto input_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  const auto filter_matrix_map =
+      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
-          const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
-          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-          const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  raw_sum, output_multiplier, output_shift) +
-              output_offset;
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
-  }
+  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+
+  AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
+                                       output_dims);
 }
 
+// legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
+void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
+                int32 input_offset, const uint8* filter_data,
+                const Dims<4>& filter_dims, int32 filter_offset,
+                const int32* bias_data, const Dims<4>& bias_dims,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims,
+                gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
   static_assert(Ac == FusedActivationFunctionType::kNone ||
                     Ac == FusedActivationFunctionType::kRelu ||
                     Ac == FusedActivationFunctionType::kRelu6 ||
                     Ac == FusedActivationFunctionType::kRelu1,
                 "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
-               input1_multiplier, input1_shift, input2_data, input2_dims,
-               input2_offset, input2_multiplier, input2_shift, output_offset,
-               output_multiplier, output_shift, output_activation_min,
-               output_activation_max, output_data, output_dims);
+  const int input_rows = input_dims.sizes[0];
+  const int input_cols =
+      input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
+  const int filter_rows = filter_dims.sizes[3];
+  const int filter_cols =
+      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+  const int output_rows = output_dims.sizes[0];
+  const int output_cols =
+      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, input_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, output_rows, filter_cols, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, filter_cols, output_cols, filter_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols, output_rows);
+  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
 }
 
-inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Mul");
-  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
-                                              output_dims, 3);
-  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
-                                             output_dims, 2);
-  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
-                                            output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
-                                            output_dims, 0);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
-  int i = 0;
-  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
-#ifdef USE_NEON
-  const auto activation_min = vdupq_n_f32(output_activation_min);
-  const auto activation_max = vdupq_n_f32(output_activation_max);
-  for (; i <= size - 16; i += 16) {
-    auto a10 = vld1q_f32(input1_data + i);
-    auto a11 = vld1q_f32(input1_data + i + 4);
-    auto a12 = vld1q_f32(input1_data + i + 8);
-    auto a13 = vld1q_f32(input1_data + i + 12);
-    auto a20 = vld1q_f32(input2_data + i);
-    auto a21 = vld1q_f32(input2_data + i + 4);
-    auto a22 = vld1q_f32(input2_data + i + 8);
-    auto a23 = vld1q_f32(input2_data + i + 12);
-    auto x0 = vmulq_f32(a10, a20);
-    auto x1 = vmulq_f32(a11, a21);
-    auto x2 = vmulq_f32(a12, a22);
-    auto x3 = vmulq_f32(a13, a23);
-
-    x0 = vmaxq_f32(activation_min, x0);
-    x1 = vmaxq_f32(activation_min, x1);
-    x2 = vmaxq_f32(activation_min, x2);
-    x3 = vmaxq_f32(activation_min, x3);
-    x0 = vminq_f32(activation_max, x0);
-    x1 = vminq_f32(activation_max, x1);
-    x2 = vminq_f32(activation_max, x2);
-    x3 = vminq_f32(activation_max, x3);
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("SpaceToDepth");
 
-    vst1q_f32(output_data + i, x0);
-    vst1q_f32(output_data + i + 4, x1);
-    vst1q_f32(output_data + i + 8, x2);
-    vst1q_f32(output_data + i + 12, x3);
-  }
-  for (; i <= size - 4; i += 4) {
-    auto a1 = vld1q_f32(input1_data + i);
-    auto a2 = vld1q_f32(input2_data + i);
-    auto x = vmulq_f32(a1, a2);
+  const int output_depth = ArraySize(output_dims, 0);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
 
-    x = vmaxq_f32(activation_min, x);
-    x = vminq_f32(activation_max, x);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int batch_size = ArraySize(input_dims, 3);
 
-    vst1q_f32(output_data + i, x);
-  }
-#endif  // NEON
+  // Number of continuous values that we can copy in one interation.
+  const int stride = block_size * input_depth;
 
-  for (; i < size; i++) {
-    auto x = input1_data[i] * input2_data[i];
-    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
-                                                  output_activation_max);
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
+      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+        T* dst = output_ptr;
+        for (int out_w = 0; out_w < output_width; ++out_w) {
+          memcpy(dst, input_data, stride * sizeof(T));
+          input_data += stride;
+          dst += output_depth;
+        }
+        output_ptr += stride;
+      }
+    }
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Mul(const float* input1_data, const Dims<4>& input1_dims,
-         const float* input2_data, const Dims<4>& input2_dims,
-         float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
-      output_activation_max, output_data, output_dims);
-}
-
 template <FusedActivationFunctionType Ac>
-void Mul(const int32* input1_data, const Dims<4>& input1_dims,
-         const int32* input2_data, const Dims<4>& input2_dims,
-         int32* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Mul/int32");
-  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+void NonGlobalBatchNormalization(
+    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
+    const Dims<4>& mean_dims, const float* multiplier_data,
+    const Dims<4>& multiplier_dims, const float* offset_data,
+    const Dims<4>& offset_dims, float* output_data,
+    const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
+                        offset_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
+                        offset_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+                        offset_dims, 0, output_dims, 0);
 
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto input2_map = MapAsVector(input2_data, input2_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  if (AreSameDims(input1_dims, input2_dims)) {
-    output_map.array() = input1_map.array() * input2_map.array();
-  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
-    auto scalar = input2_data[0];
-    output_map.array() = input1_map.array() * scalar;
-  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
-    auto scalar = input1_data[0];
-    output_map.array() = scalar * input2_map.array();
-  } else {
-    // Should not come here.
-    TFLITE_DCHECK(false);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              (input_data[Offset(input_dims, c, x, y, b)] -
+               mean_data[Offset(mean_dims, c, x, y, 0)]) *
+                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
+              offset_data[Offset(offset_dims, c, x, y, 0)]);
+        }
+      }
+    }
   }
 }
 
-// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastMul is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastMul");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+template <FusedActivationFunctionType Ac>
+void GlobalBatchNormalization(const float* input_data,
+                              const Dims<4>& input_dims, const float* mean_data,
+                              const Dims<4>& mean_dims,
+                              const float* multiplier_data,
+                              const Dims<4>& multiplier_dims,
+                              const float* offset_data,
+                              const Dims<4>& offset_dims, float* output_data,
+                              const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+                        offset_dims, 0, output_dims, 0);
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              (input_data[Offset(input_dims, c, x, y, b)] -
+               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
+                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
+              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
         }
       }
     }
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+inline void Relu(const float* input_data, const Dims<4>& input_dims,
+                 float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Relu (not fused)");
 
-  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 unclamped_result =
-              output_offset +
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  input1_val * input2_val, output_multiplier, output_shift);
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, unclamped_result));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
-  }
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
+  const auto input = MapAsVector(input_data, input_dims);
+  auto output = MapAsVector(output_data, output_dims);
+  output = input.cwiseMax(0.0f);
 }
 
-// TODO(aselle): This is not actually optimized yet.
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+inline void Relu1(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < height; ++y) {
       for (int x = 0; x < width; ++x) {
         for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] /
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float upper = 1;
+          const float lower = -1;
+          float clamped = val > upper ? upper : val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
         }
       }
     }
   }
 }
 
-// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+inline void Relu6(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float upper = 6;
+          const float lower = 0;
+          float clamped = val > upper ? upper : val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
         }
       }
     }
   }
 }
 
-// TODO(aselle): This is not actually optimized yet.
-inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+template <FusedActivationFunctionType Ac>
+void L2Normalization(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("L2Normalization");
+  static_assert(Ac == FusedActivationFunctionType::kNone, "");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < height; ++y) {
       for (int x = 0; x < width; ++x) {
+        float squared_l2_norm = 0;
         for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          squared_l2_norm += val * val;
         }
-      }
-    }
-  }
-}
-
-// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+        float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
+        for (int c = 0; c < depth; ++c) {
           output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+              input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
         }
       }
     }
   }
 }
 
-template <FusedActivationFunctionType Ac, typename Scalar>
-void Concatenation(int concat_dim, const Scalar* const* input_data,
-                   const Dims<4>* const* input_dims, int inputs_count,
-                   Scalar* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Concatenation");
-  int concat_size = 0;
-  for (int i = 0; i < inputs_count; i++) {
-    for (int j = 0; j < 4; j++) {
-      if (j != concat_dim) {
-        MatchingArraySize(*input_dims[i], j, output_dims, j);
-      }
-    }
-    concat_size += ArraySize(*input_dims[i], concat_dim);
+inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
+                                          int* output_shift) {
+  *output_shift = 11;
+  while (input >= (1 << 29)) {
+    input /= 4;
+    ++*output_shift;
   }
-  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  // for now we dont have a model with a Concatenation
-  // with fused activation function.
-  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
-  int outer_size = 1;
-  for (int i = concat_dim + 1; i < 4; i++) {
-    outer_size *= output_dims.sizes[i];
+  TFLITE_DCHECK_GT(input, 0);
+  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
+  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+  *output_shift -= left_shift_bit_pairs;
+  input <<= 2 * left_shift_bit_pairs;
+  TFLITE_DCHECK_GE(input, (1 << 27));
+  TFLITE_DCHECK_LT(input, (1 << 29));
+  using gemmlowp::FixedPoint;
+  using gemmlowp::Rescale;
+  using gemmlowp::SaturatingRoundingMultiplyByPOT;
+  // Using 3 integer bits gives us enough room for the internal arithmetic in
+  // this Newton-Raphson iteration.
+  using F3 = FixedPoint<int32, 3>;
+  using F0 = FixedPoint<int32, 0>;
+  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+  const F3 fixedpoint_half_input =
+      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+  const F3 fixedpoint_half_three =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+  // Newton-Raphson iteration
+  // Naive unoptimized starting guess: x = 1
+  F3 x = F3::One();
+  // Naive unoptimized number of iterations: 5
+  for (int i = 0; i < 5; i++) {
+    const F3 x3 = Rescale<3>(x * x * x);
+    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
   }
-  Scalar* output_ptr = output_data;
-  for (int k = 0; k < outer_size; k++) {
-    for (int i = 0; i < inputs_count; ++i) {
-      const int copy_size =
-          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
-      memcpy(output_ptr, input_data[i] + k * copy_size,
-             copy_size * sizeof(Scalar));
-      output_ptr += copy_size;
-    }
+  const F0 fixedpoint_half_sqrt_2 =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+  x = x * fixedpoint_half_sqrt_2;
+  *output_inv_sqrt = x.raw();
+  if (*output_shift < 0) {
+    *output_inv_sqrt <<= -*output_shift;
+    *output_shift = 0;
   }
 }
 
-template <FusedActivationFunctionType Ac, typename Scalar>
-void DepthConcatenation(const Scalar* const* input_data,
-                        const Dims<4>* const* input_dims, int inputs_count,
-                        Scalar* output_data, const Dims<4>& output_dims) {
-  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
-                            output_data, output_dims);
+inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
+                            int32 input_zero_point, uint8* output_data,
+                            const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(batches, 1);
+  TFLITE_DCHECK_EQ(height, 1);
+  TFLITE_DCHECK_EQ(width, 1);
+  int32 square_l2_norm = 0;
+  for (int i = 0; i < depth; i++) {
+    int32 diff = input_data[i] - input_zero_point;
+    square_l2_norm += diff * diff;
+  }
+  int32 inv_l2norm_multiplier;
+  int inv_l2norm_shift;
+  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
+                                &inv_l2norm_shift);
+
+  for (int i = 0; i < depth; i++) {
+    int32 diff = input_data[i] - input_zero_point;
+    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
+        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+    int32 unclamped_output_val = 128 + rescaled_diff;
+    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
+    output_data[i] = static_cast<uint8>(output_val);
+  }
 }
 
-inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
-                     const float* prev_activ_data,
-                     const Dims<4>& prev_activ_dims, const float* weights_data,
-                     const Dims<4>& weights_dims, const float* bias_data,
-                     const Dims<4>& bias_dims, const float* prev_state_data,
-                     const Dims<4>& prev_state_dims, float* output_state_data,
-                     const Dims<4>& output_state_dims, float* output_activ_data,
-                     const Dims<4>& output_activ_dims, float* concat_temp_data,
-                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
-                     const Dims<4>& activ_temp_dims) {
-  gemmlowp::ScopedProfilingLabel label("LstmCell");
-  MatchingArraySize(  // batches
-      input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims,
-      3, output_activ_dims, 3);
-  MatchingArraySize(  // height
-      input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims,
-      2, output_activ_dims, 2);
-  MatchingArraySize(  // width
-      input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims,
-      1, output_activ_dims, 1);
-  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
-  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
-  const int input_depth = ArraySize(input_dims, 0);
-  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
-  const int total_input_depth = prev_activ_depth + input_depth;
-  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
-  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
-                  1);
-  const int intern_activ_depth =
-      MatchingArraySize(weights_dims, 1, bias_dims, 0);
-  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
-  const int output_depth =
-      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
-                        output_state_dims, 0, output_activ_dims, 0);
-  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+inline void Add(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add");
+  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+                                              output_dims, 3);
+  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+                                             output_dims, 2);
+  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+                                            output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+                                            output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
 
-  // Concatenate prev_activ and input data together
-  std::vector<float const*> concat_input_arrays_data;
-  std::vector<Dims<4> const*> concat_input_arrays_dims;
-  concat_input_arrays_data.push_back(input_data);
-  concat_input_arrays_data.push_back(prev_activ_data);
-  concat_input_arrays_dims.push_back(&input_dims);
-  concat_input_arrays_dims.push_back(&prev_activ_dims);
-  Concatenation<FusedActivationFunctionType::kNone, float>(
-      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
-      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
+  int i = 0;
+  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+#ifdef USE_NEON
+  const auto activation_min = vdupq_n_f32(output_activation_min);
+  const auto activation_max = vdupq_n_f32(output_activation_max);
+  for (; i <= size - 16; i += 16) {
+    auto a10 = vld1q_f32(input1_data + i);
+    auto a11 = vld1q_f32(input1_data + i + 4);
+    auto a12 = vld1q_f32(input1_data + i + 8);
+    auto a13 = vld1q_f32(input1_data + i + 12);
+    auto a20 = vld1q_f32(input2_data + i);
+    auto a21 = vld1q_f32(input2_data + i + 4);
+    auto a22 = vld1q_f32(input2_data + i + 8);
+    auto a23 = vld1q_f32(input2_data + i + 12);
+    auto x0 = vaddq_f32(a10, a20);
+    auto x1 = vaddq_f32(a11, a21);
+    auto x2 = vaddq_f32(a12, a22);
+    auto x3 = vaddq_f32(a13, a23);
+    x0 = vmaxq_f32(activation_min, x0);
+    x1 = vmaxq_f32(activation_min, x1);
+    x2 = vmaxq_f32(activation_min, x2);
+    x3 = vmaxq_f32(activation_min, x3);
+    x0 = vminq_f32(activation_max, x0);
+    x1 = vminq_f32(activation_max, x1);
+    x2 = vminq_f32(activation_max, x2);
+    x3 = vminq_f32(activation_max, x3);
+    vst1q_f32(output_data + i, x0);
+    vst1q_f32(output_data + i + 4, x1);
+    vst1q_f32(output_data + i + 8, x2);
+    vst1q_f32(output_data + i + 12, x3);
+  }
+  for (; i <= size - 4; i += 4) {
+    auto a1 = vld1q_f32(input1_data + i);
+    auto a2 = vld1q_f32(input2_data + i);
+    auto x = vaddq_f32(a1, a2);
+    x = vmaxq_f32(activation_min, x);
+    x = vminq_f32(activation_max, x);
+    vst1q_f32(output_data + i, x);
+  }
+#endif  // NEON
 
-  // Fully connected
-  FullyConnected<FusedActivationFunctionType::kNone>(
-      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
-      bias_dims, activ_temp_data, activ_temp_dims);
+  for (; i < size; i++) {
+    auto x = input1_data[i] + input2_data[i];
+    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
+                                                  output_activation_max);
+  }
+}
 
-  // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
-  // operations.
-  ArrayMap<float> activ_temp_map =
-      MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims);
-  auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
-                                            activ_temp_map.cols());
-  auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
-                                           activ_temp_map.cols());
-  auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
-                                             activ_temp_map.cols());
-  auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
-                                             activ_temp_map.cols());
-  ArrayMap<const float> prev_state_map =
-      MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims);
-  ArrayMap<float> output_state_map =
-      MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims);
-  ArrayMap<float> output_activ_map =
-      MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims);
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
 
-  // Combined memory state and final output calculation
-  gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
-  output_state_map =
-      input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
-          new_input_sm.tanh() +
-      forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
-          prev_state_map;
-  output_activ_map =
-      output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
-      output_state_map.tanh();
+  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
 }
 
-#ifdef GEMMLOWP_NEON
-// In the common case of batch size 1, a fully-connected node degenerates
-// to a matrix*vector product. LSTM cells contain a fully-connected node;
-// when quantized, this becomes a special type of GEMV operation where
-// the output is 16bit-quantized, thus needs its own special path.
-inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
-                            const uint8* weights_data,
-                            const Dims<4>& weights_dims,
-                            uint8 weights_zero_point, const int32* bias_data,
-                            const Dims<4>& bias_dims, int32 accum_multiplier,
-                            int accum_shift, int16* output_data,
-                            const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell");
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                       ArraySize(output_dims, 3),
-                   1);
-  const int input_size = input_dims.strides[3];
-  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
-  // This special fast path for quantized LSTM cells does not try to support
-  // odd sizes that we haven't encountered in any LSTM cell, that would
-  // require special code (that would go untested until any LSTM cell
-  // exercises it). We just guard our assumptions about size evenness with
-  // the following assertions.
-  TFLITE_DCHECK(!(output_size % 4));
-  TFLITE_DCHECK(!(input_size % 8));
-  const int32* bias_ptr = bias_data;
-  int16* output_ptr = output_data;
-  for (int out = 0; out < output_size; out += 4) {
-    int32x4_t acc_0 = vdupq_n_s32(0);
-    int32x4_t acc_1 = vdupq_n_s32(0);
-    int32x4_t acc_2 = vdupq_n_s32(0);
-    int32x4_t acc_3 = vdupq_n_s32(0);
-    const int16x8_t input_offset_vec = vdupq_n_s16(-128);
-    const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point);
-    int in = 0;
-    // Handle 16 levels of depth at a time.
-    for (; in <= input_size - 16; in += 16) {
-      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      const uint8* weights_ptr = weights_data + in + out * input_size;
-      uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size);
-      uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size);
-      uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size);
-      uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size);
-      int16x8_t input_val_0, input_val_1;
-      const uint8x8_t low = vget_low_u8(input_val_u8);
-      const uint8x8_t high = vget_high_u8(input_val_u8);
-      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
-      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
-      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
-      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
-      int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0,
-          weights_val_3_0;
-      int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1,
-          weights_val_3_1;
-      weights_val_0_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))),
-          weights_offset_vec);
-      weights_val_0_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))),
-          weights_offset_vec);
-      weights_val_1_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))),
-          weights_offset_vec);
-      weights_val_1_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))),
-          weights_offset_vec);
-      weights_val_2_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))),
-          weights_offset_vec);
-      weights_val_2_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))),
-          weights_offset_vec);
-      weights_val_3_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))),
-          weights_offset_vec);
-      weights_val_3_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))),
-          weights_offset_vec);
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0),
-                        vget_low_s16(input_val_0));
-      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0),
-                        vget_low_s16(input_val_0));
-      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0),
-                        vget_low_s16(input_val_0));
-      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0),
-                        vget_low_s16(input_val_0));
-      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0),
-                        vget_high_s16(input_val_0));
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0),
-                        vget_high_s16(input_val_0));
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0),
-                        vget_high_s16(input_val_0));
-      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0),
-                        vget_high_s16(input_val_0));
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1),
-                        vget_low_s16(input_val_1));
-      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1),
-                        vget_low_s16(input_val_1));
-      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1),
-                        vget_low_s16(input_val_1));
-      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1),
-                        vget_low_s16(input_val_1));
-      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1),
-                        vget_high_s16(input_val_1));
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1),
-                        vget_high_s16(input_val_1));
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1),
-                        vget_high_s16(input_val_1));
-      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1),
-                        vget_high_s16(input_val_1));
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+                const Dims<4>& input1_dims, int32 input1_offset,
+                int32 input1_multiplier, int input1_shift,
+                const uint8* input2_data, const Dims<4>& input2_dims,
+                int32 input2_offset, int32 input2_multiplier, int input2_shift,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  gemmlowp::ScopedProfilingLabel label("Add/8bit");
+  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+                                              output_dims, 3);
+  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+                                             output_dims, 2);
+  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+                                            output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+                                            output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  int i = 0;
+  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+  TFLITE_DCHECK_GT(input1_offset, -256);
+  TFLITE_DCHECK_GT(input2_offset, -256);
+  TFLITE_DCHECK_LT(input1_offset, 256);
+  TFLITE_DCHECK_LT(input2_offset, 256);
+#ifdef USE_NEON
+  const auto output_activation_min_vector = vdup_n_u8(output_activation_min);
+  const auto output_activation_max_vector = vdup_n_u8(output_activation_max);
+  for (; i <= size - 8; i += 8) {
+    const auto input1_val_original = vld1_u8(input1_data + i);
+    const auto input2_val_original = vld1_u8(input2_data + i);
+    const auto input1_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+    const auto input2_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+    const auto input1_val =
+        vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
+    const auto input2_val =
+        vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
+    const auto input1_val_high = vget_high_s16(input1_val);
+    const auto input1_val_low = vget_low_s16(input1_val);
+    const auto input2_val_high = vget_high_s16(input2_val);
+    const auto input2_val_low = vget_low_s16(input2_val);
+    auto x11 = vmovl_s16(input1_val_low);
+    auto x12 = vmovl_s16(input1_val_high);
+    auto x21 = vmovl_s16(input2_val_low);
+    auto x22 = vmovl_s16(input2_val_high);
+    const auto left_shift_dup = vdupq_n_s32(left_shift);
+    x11 = vshlq_s32(x11, left_shift_dup);
+    x12 = vshlq_s32(x12, left_shift_dup);
+    x21 = vshlq_s32(x21, left_shift_dup);
+    x22 = vshlq_s32(x22, left_shift_dup);
+    x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
+    x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
+    x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
+    x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
+    const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
+    const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
+    x11 = vshlq_s32(x11, input1_shift_dup);
+    x12 = vshlq_s32(x12, input1_shift_dup);
+    x21 = vshlq_s32(x21, input2_shift_dup);
+    x22 = vshlq_s32(x22, input2_shift_dup);
+    auto s1 = vaddq_s32(x11, x21);
+    auto s2 = vaddq_s32(x12, x22);
+    s1 = vqrdmulhq_n_s32(s1, output_multiplier);
+    s2 = vqrdmulhq_n_s32(s2, output_multiplier);
+    using gemmlowp::RoundingDivideByPOT;
+    s1 = RoundingDivideByPOT(s1, output_shift);
+    s2 = RoundingDivideByPOT(s2, output_shift);
+    const auto s1_narrowed = vmovn_s32(s1);
+    const auto s2_narrowed = vmovn_s32(s2);
+    const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+                             vdupq_n_s16(output_offset));
+    const auto clamped =
+        vmax_u8(output_activation_min_vector,
+                vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
+    vst1_u8(output_data + i, clamped);
+  }
+#endif  // NEON
+
+  for (; i < size; i++) {
+    const int32 input1_val = input1_offset + input1_data[i];
+    const int32 input2_val = input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << left_shift);
+    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+        shifted_input1_val, input1_multiplier, input1_shift);
+    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+        shifted_input2_val, input2_multiplier, input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
+                                 raw_sum, output_multiplier, output_shift) +
+                             output_offset;
+    const int32 clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
+                int input1_shift, const int16* input2_data,
+                const Dims<4>& input2_dims, int input2_shift,
+                int16 output_activation_min, int16 output_activation_max,
+                int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add/Int16");
+  // This is a copy of the reference implementation. We do not currently have a
+  // properly optimized version.
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, -32768);
+    TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  }
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+
+  TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
+  TFLITE_DCHECK_GE(input1_shift, 0);
+  TFLITE_DCHECK_GE(input2_shift, 0);
+  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
+  const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+    F0 scaled_input =
+        F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
+    const int16 raw_output = result.raw();
+    const int16 clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = clamped_output;
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void Add(const int32* input1_data, const Dims<4>& input1_dims,
+         const int32* input2_data, const Dims<4>& input2_dims,
+         int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add/int32");
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto input2_map = MapAsVector(input2_data, input2_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  if (AreSameDims(input1_dims, input2_dims)) {
+    output_map.array() = input1_map.array() + input2_map.array();
+  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() + scalar;
+  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar + input2_map.array();
+  } else {
+    // Should not come here.
+    TFLITE_DCHECK(false);
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sum, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
+               input1_multiplier, input1_shift, input2_data, input2_dims,
+               input2_offset, input2_multiplier, input2_shift, output_offset,
+               output_multiplier, output_shift, output_activation_min,
+               output_activation_max, output_data, output_dims);
+}
+
+inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul");
+  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+                                              output_dims, 3);
+  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+                                             output_dims, 2);
+  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+                                            output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+                                            output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  int i = 0;
+  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+#ifdef USE_NEON
+  const auto activation_min = vdupq_n_f32(output_activation_min);
+  const auto activation_max = vdupq_n_f32(output_activation_max);
+  for (; i <= size - 16; i += 16) {
+    auto a10 = vld1q_f32(input1_data + i);
+    auto a11 = vld1q_f32(input1_data + i + 4);
+    auto a12 = vld1q_f32(input1_data + i + 8);
+    auto a13 = vld1q_f32(input1_data + i + 12);
+    auto a20 = vld1q_f32(input2_data + i);
+    auto a21 = vld1q_f32(input2_data + i + 4);
+    auto a22 = vld1q_f32(input2_data + i + 8);
+    auto a23 = vld1q_f32(input2_data + i + 12);
+    auto x0 = vmulq_f32(a10, a20);
+    auto x1 = vmulq_f32(a11, a21);
+    auto x2 = vmulq_f32(a12, a22);
+    auto x3 = vmulq_f32(a13, a23);
+
+    x0 = vmaxq_f32(activation_min, x0);
+    x1 = vmaxq_f32(activation_min, x1);
+    x2 = vmaxq_f32(activation_min, x2);
+    x3 = vmaxq_f32(activation_min, x3);
+    x0 = vminq_f32(activation_max, x0);
+    x1 = vminq_f32(activation_max, x1);
+    x2 = vminq_f32(activation_max, x2);
+    x3 = vminq_f32(activation_max, x3);
+
+    vst1q_f32(output_data + i, x0);
+    vst1q_f32(output_data + i + 4, x1);
+    vst1q_f32(output_data + i + 8, x2);
+    vst1q_f32(output_data + i + 12, x3);
+  }
+  for (; i <= size - 4; i += 4) {
+    auto a1 = vld1q_f32(input1_data + i);
+    auto a2 = vld1q_f32(input2_data + i);
+    auto x = vmulq_f32(a1, a2);
+
+    x = vmaxq_f32(activation_min, x);
+    x = vminq_f32(activation_max, x);
+
+    vst1q_f32(output_data + i, x);
+  }
+#endif  // NEON
+
+  for (; i < size; i++) {
+    auto x = input1_data[i] * input2_data[i];
+    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
+                                                  output_activation_max);
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Mul(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+void Mul(const int32* input1_data, const Dims<4>& input1_dims,
+         const int32* input2_data, const Dims<4>& input2_dims,
+         int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/int32");
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto input2_map = MapAsVector(input2_data, input2_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  if (AreSameDims(input1_dims, input2_dims)) {
+    output_map.array() = input1_map.array() * input2_map.array();
+  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() * scalar;
+  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar * input2_map.array();
+  } else {
+    // Should not come here.
+    TFLITE_DCHECK(false);
+  }
+}
+
+inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
+                const int16* input2_data, const Dims<4>& input2_dims,
+                int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16");
+  // This is a copy of the reference implementation. We do not currently have a
+  // properly optimized version.
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    output_data[i] = unclamped_result.raw();
+  }
+}
+
+inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
+                const int16* input2_data, const Dims<4>& input2_dims,
+                int32 output_offset, int32 output_activation_min,
+                int32 output_activation_max, uint8* output_data,
+                const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8");
+  // This is a copy of the reference implementation. We do not currently have a
+  // properly optimized version.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16 rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16 clamped_result =
+        std::min<int16>(output_activation_max - output_offset, rescaled_result);
+    clamped_result =
+        std::max<int16>(output_activation_min - output_offset, clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastMul is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 unclamped_result =
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  input1_val * input2_val, output_multiplier, output_shift);
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, unclamped_result));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
+// TODO(aselle): This is not actually optimized yet.
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] /
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
     }
-    // Handle 8 levels of depth at a time.
-    for (; in < input_size; in += 8) {
-      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      const uint8* weights_ptr = weights_data + in + out * input_size;
-      uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size);
-      uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size);
-      uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size);
-      uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size);
-      int16x8_t input_val;
-      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
-      input_val = vaddq_s16(input_val, input_offset_vec);
-      int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3;
-      weights_val_0 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)),
-                    weights_offset_vec);
-      weights_val_1 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)),
-                    weights_offset_vec);
-      weights_val_2 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)),
-                    weights_offset_vec);
-      weights_val_3 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)),
-                    weights_offset_vec);
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0),
-                        vget_low_s16(input_val));
-      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1),
-                        vget_low_s16(input_val));
-      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2),
-                        vget_low_s16(input_val));
-      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3),
-                        vget_low_s16(input_val));
-      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0),
-                        vget_high_s16(input_val));
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1),
-                        vget_high_s16(input_val));
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2),
-                        vget_high_s16(input_val));
-      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3),
-                        vget_high_s16(input_val));
+  }
+}
+
+// TODO(aselle): This is not actually optimized yet.
+inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] -
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sub, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void Concatenation(int concat_dim, const Scalar* const* input_data,
+                   const Dims<4>* const* input_dims, int inputs_count,
+                   Scalar* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  // for now we dont have a model with a Concatenation
+  // with fused activation function.
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  Scalar* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      memcpy(output_ptr, input_data[i] + k * copy_size,
+             copy_size * sizeof(Scalar));
+      output_ptr += copy_size;
+    }
+  }
+}
+
+// TODO(prabhumk): This is the same as the reference implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
     }
-    // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
-        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
-    pairwise_reduced_acc_0 =
-        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
-    pairwise_reduced_acc_1 =
-        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
-    pairwise_reduced_acc_2 =
-        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
-    pairwise_reduced_acc_3 =
-        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
-    const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
-    const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
-    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
-    // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
-    reduced = vaddq_s32(reduced, bias_vec);
-    int left_shift = accum_shift > 0 ? accum_shift : 0;
-    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
-    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
-    // Multiply by the fixed-point multiplier.
-    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
-    // Rounding-shift-right.
-    using gemmlowp::RoundingDivideByPOT;
-    reduced = RoundingDivideByPOT(reduced, right_shift);
-    // Narrow values down to 16 bit signed.
-    const int16x4_t res16 = vqmovn_s32(reduced);
-    vst1_s16(output_ptr, res16);
-    output_ptr += 4;
   }
 }
-#endif
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void DepthConcatenation(const Scalar* const* input_data,
+                        const Dims<4>* const* input_dims, int inputs_count,
+                        Scalar* output_data, const Dims<4>& output_dims) {
+  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
+                            output_data, output_dims);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+                     const float* prev_activ_data,
+                     const Dims<4>& prev_activ_dims, const float* weights_data,
+                     const Dims<4>& weights_dims, const float* bias_data,
+                     const Dims<4>& bias_dims, const float* prev_state_data,
+                     const Dims<4>& prev_state_dims, float* output_state_data,
+                     const Dims<4>& output_state_dims, float* output_activ_data,
+                     const Dims<4>& output_activ_dims, float* concat_temp_data,
+                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
+                     const Dims<4>& activ_temp_dims) {
+  gemmlowp::ScopedProfilingLabel label("LstmCell");
+  MatchingArraySize(  // batches
+      input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims,
+      3, output_activ_dims, 3);
+  MatchingArraySize(  // height
+      input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims,
+      2, output_activ_dims, 2);
+  MatchingArraySize(  // width
+      input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims,
+      1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+
+  // Concatenate prev_activ and input data together
+  std::vector<float const*> concat_input_arrays_data;
+  std::vector<Dims<4> const*> concat_input_arrays_dims;
+  concat_input_arrays_data.push_back(input_data);
+  concat_input_arrays_data.push_back(prev_activ_data);
+  concat_input_arrays_dims.push_back(&input_dims);
+  concat_input_arrays_dims.push_back(&prev_activ_dims);
+  Concatenation<FusedActivationFunctionType::kNone, float>(
+      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
+      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
+
+  // Fully connected
+  FullyConnected<FusedActivationFunctionType::kNone>(
+      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
+      bias_dims, activ_temp_data, activ_temp_dims);
+
+  // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
+  // operations.
+  ArrayMap<float> activ_temp_map =
+      MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims);
+  auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
+                                            activ_temp_map.cols());
+  auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
+                                           activ_temp_map.cols());
+  auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
+                                             activ_temp_map.cols());
+  auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
+                                             activ_temp_map.cols());
+  ArrayMap<const float> prev_state_map =
+      MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims);
+  ArrayMap<float> output_state_map =
+      MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims);
+  ArrayMap<float> output_activ_map =
+      MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims);
+
+  // Combined memory state and final output calculation
+  gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
+  output_state_map =
+      input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+          new_input_sm.tanh() +
+      forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+          prev_state_map;
+  output_activ_map =
+      output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      output_state_map.tanh();
+}
 
 // Quantized LSTM cell. Currently just a copy of the reference impl in
 // reference_ops.h. See the big function comment there, not replicating it
@@ -2526,9 +3652,19 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       FS new_state = gemmlowp::SaturatingAdd(
           gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
           prev_state_times_forget_state);
-      // Implementation of last internal tanh node, still in fixed-point.
-      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number or integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
       // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
       vst1q_s16(output_state_data_ptr, new_state.raw());
       output_state_data_ptr += 8;
       // Down-scale the output activations to 8-bit integers, saturating,
@@ -2585,9 +3721,19 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       FS new_state = gemmlowp::SaturatingAdd(
           gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
           prev_state_times_forget_state);
-      // Implementation of last internal tanh node, still in fixed-point.
-      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number or integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
       // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
       *output_state_data_ptr++ = new_state.raw();
       // Down-scale the output activations to 8-bit integers, saturating,
       // and store back to memory.
@@ -3229,7 +4375,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -3423,6 +4569,132 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   }
 }
 
+// TODO(myenik): This is the same as the reference implementation, not actually
+// optimized yet.
+inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
+                       float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        // Find max element value which we'll use to ensure numerical stability
+        // taking advantage of the following equality:
+        // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+        float max = std::numeric_limits<float>::lowest();
+        for (int c = 0; c < depth; ++c) {
+          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
+        }
+
+        // Compute sum.
+        float sum = 0.f;
+        for (int c = 0; c < depth; ++c) {
+          sum += std::exp(input_data[Offset(input_dims, c, x, y, b)] - max);
+        }
+
+        // Compute result.
+        const float log_sum = std::log(sum);
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input_data[Offset(input_dims, c, x, y, b)] - max - log_sum;
+        }
+      }
+    }
+  }
+}
+
+// Currently just a copy of the reference code.
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax/Uint8");
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic");
@@ -3436,7 +4708,7 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Logistic");
+  gemmlowp::ScopedProfilingLabel label("Logistic/Uint8");
   /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
   /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
   /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -3573,6 +4845,67 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
+                     int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Logistic/Int16");
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+
+  for (int i = 0; i < flat_size; i++) {
+  }
+
+  int c = 0;
+  const int16* input_data_ptr = input_data;
+  int16* output_data_ptr = output_data;
+#ifdef GEMMLOWP_NEON
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+
+    for (; c <= flat_size - 16; c += 16) {
+      F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
+      F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
+      F0 output0 = gemmlowp::logistic(input0);
+      F0 output1 = gemmlowp::logistic(input1);
+      vst1q_s16(output_data_ptr, output0.raw());
+      vst1q_s16(output_data_ptr + 8, output1.raw());
+
+      input_data_ptr += 16;
+      output_data_ptr += 16;
+    }
+    for (; c <= flat_size - 8; c += 8) {
+      F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
+      F0 output = gemmlowp::logistic(input);
+      vst1q_s16(output_data_ptr, output.raw());
+
+      input_data_ptr += 8;
+      output_data_ptr += 8;
+    }
+  }
+#endif
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+    for (; c < flat_size; ++c) {
+      F3 input = F3::FromRaw(*input_data_ptr);
+      F0 output = gemmlowp::logistic(input);
+      *output_data_ptr = output.raw();
+
+      ++input_data_ptr;
+      ++output_data_ptr;
+    }
+  }
+}
+
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Tanh");
@@ -3731,6 +5064,108 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
     output_data[c] = output_val;
   }
 }
+
+inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
+                 int input_left_shift, int16* output_data,
+                 const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Tanh/Int16");
+  // Support for shifts is limited until we have a parameterized version of
+  // SaturatingRoundingMultiplyByPOT().
+  TFLITE_DCHECK_GE(input_left_shift, 0);
+  TFLITE_DCHECK_LE(input_left_shift, 1);
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+
+  int c = 0;
+  const int16* input_data_ptr = input_data;
+  int16* output_data_ptr = output_data;
+#ifdef GEMMLOWP_NEON
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+
+    if (input_left_shift == 0) {
+      for (; c <= flat_size - 16; c += 16) {
+        F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
+        F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
+        F0 output0 = gemmlowp::tanh(input0);
+        F0 output1 = gemmlowp::tanh(input1);
+        vst1q_s16(output_data_ptr, output0.raw());
+        vst1q_s16(output_data_ptr + 8, output1.raw());
+
+        input_data_ptr += 16;
+        output_data_ptr += 16;
+      }
+      for (; c <= flat_size - 8; c += 8) {
+        F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
+        F0 output = gemmlowp::tanh(input);
+        vst1q_s16(output_data_ptr, output.raw());
+
+        input_data_ptr += 8;
+        output_data_ptr += 8;
+      }
+    } else {
+      for (; c <= flat_size - 16; c += 16) {
+        F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr)));
+        F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr + 8)));
+        F0 output0 = gemmlowp::tanh(input0);
+        F0 output1 = gemmlowp::tanh(input1);
+        vst1q_s16(output_data_ptr, output0.raw());
+        vst1q_s16(output_data_ptr + 8, output1.raw());
+
+        input_data_ptr += 16;
+        output_data_ptr += 16;
+      }
+      for (; c <= flat_size - 8; c += 8) {
+        F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr)));
+        F0 output = gemmlowp::tanh(input);
+        vst1q_s16(output_data_ptr, output.raw());
+
+        input_data_ptr += 8;
+        output_data_ptr += 8;
+      }
+    }
+  }
+#endif
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+    if (input_left_shift == 0) {
+      for (; c < flat_size; ++c) {
+        F3 input = F3::FromRaw(*input_data_ptr);
+        F0 output = gemmlowp::tanh(input);
+        *output_data_ptr = output.raw();
+
+        ++input_data_ptr;
+        ++output_data_ptr;
+      }
+    } else {
+      for (; c < flat_size; ++c) {
+        F3 input = F3::FromRaw(
+            gemmlowp::SaturatingRoundingMultiplyByPOT<1>(*input_data_ptr));
+        F0 output = gemmlowp::tanh(input);
+        *output_data_ptr = output.raw();
+
+        ++input_data_ptr;
+        ++output_data_ptr;
+      }
+    }
+  }
+}
+
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
@@ -3753,66 +5188,23 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("FakeQuant");
 
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
 
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
@@ -3823,11 +5215,12 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
       for (int x = 0; x < width; ++x) {
         for (int c = 0; c < depth; ++c) {
           const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
+          const float clamped =
+              std::min(nudged_max, std::max(nudged_min, src_val));
+          const float clamped_shifted = clamped - nudged_min;
+          const float dst_val =
+              TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+              nudged_min;
           output_data[Offset(output_dims, c, x, y, b)] = dst_val;
         }
       }
@@ -4256,6 +5649,7 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* paddings_data,
                            const Dims<4>& paddings_dims, T* output_data,
                            const Dims<4>& output_dims) {
+  // Unoptimized - Straight copy from reference ops.
   gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
 
   const int output_batch_size = ArraySize(output_dims, 3);
@@ -4297,29 +5691,76 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Helper methods for BatchToSpaceND.
+// `spatial_index_dim` specifies post-crop offset index in this spatial
+// dimension, i.e. spatial offset introduced by flattening batch to spatial
+// dimension minus the crop size at beginning. `block_shape_dim` is the block
+// size in current dimension. `input_dim` and `output_dim` are input and output
+// size of BatchToSpaceND operation in current dimension.
+// Output start index is inclusive and end index is exclusive.
+inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                          int input_dim, int output_dim, int* start_index,
+                          int* end_index) {
+  // (*start_index) * block_shape_dim is effectively rounded up to the next
+  // multiple of block_shape_dim by the integer division.
+  *start_index =
+      std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+  // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
+  // end_index is exclusive).
+  *end_index = std::min(
+      input_dim,
+      (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+}
+
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
 
   const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
   const int input_batch_size = ArraySize(input_dims, 3);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int depth = ArraySize(input_dims, 0);
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
+  const int crops_top = crops_data[0];
+  const int crops_left = crops_data[2];
 
   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
-    for (int in_h = 0; in_h < input_height; ++in_h) {
-      for (int in_w = 0; in_w < input_width; ++in_w) {
-        int out_batch = in_batch % output_batch_size;
-        int out_w = in_w * block_shape_width +
-                    (in_batch / output_batch_size) % block_shape_width;
-        int out_h = in_h * block_shape_height +
-                    (in_batch / output_batch_size) / block_shape_width;
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
+
+    int in_h_start = 0;
+    int in_h_end = 0;
+    // GetIndexRange ensures start and end indices are in [0, output_height).
+    GetIndexRange(spatial_offset / block_shape_width - crops_top,
+                  block_shape_height, input_height, output_height, &in_h_start,
+                  &in_h_end);
+
+    for (int in_h = in_h_start; in_h < in_h_end; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      TFLITE_DCHECK_GE(out_h, 0);
+      TFLITE_DCHECK_LT(out_h, output_height);
+
+      int in_w_start = 0;
+      int in_w_end = 0;
+      // GetIndexRange ensures start and end indices are in [0, output_width).
+      GetIndexRange(spatial_offset % block_shape_width - crops_left,
+                    block_shape_width, input_width, output_width, &in_w_start,
+                    &in_w_end);
+
+      for (int in_w = in_w_start; in_w < in_w_end; ++in_w) {
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+        TFLITE_DCHECK_GE(out_w, 0);
+        TFLITE_DCHECK_LT(out_w, output_width);
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
         memcpy(out, in, depth * sizeof(T));
@@ -4332,8 +5773,11 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -4352,27 +5796,27 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, 0,
+    memset(output_data, pad_value,
            left_b_padding * output_height * output_width * output_depth *
                sizeof(T));
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
              left_h_padding * output_width * output_depth * sizeof(T));
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
                left_w_padding * output_depth * sizeof(T));
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
-                 left_d_padding * sizeof(T));
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+                 pad_value, left_d_padding * sizeof(T));
         }
 
         T* out = output_data +
@@ -4386,20 +5830,21 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
           memset(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              0, right_d_padding * sizeof(T));
+              pad_value, right_d_padding * sizeof(T));
         }
       }
       if (right_w_padding != 0) {
         memset(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            0, right_w_padding * output_depth * sizeof(T));
+            pad_value, right_w_padding * output_depth * sizeof(T));
       }
     }
     if (right_h_padding != 0) {
       memset(output_data + Offset(output_dims, 0, 0,
                                   output_height - right_h_padding, out_b),
-             0, right_h_padding * output_width * output_depth * sizeof(T));
+             pad_value,
+             right_h_padding * output_width * output_depth * sizeof(T));
     }
   }
   if (right_b_padding != 0) {
@@ -4411,43 +5856,57 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
+// UNOPTIMIZED COPY of StridedSlice from reference_ops.h.
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("StridedSlice");
-  const int start_b = (begin_mask & 8) ? 0 : starts[3];
-  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
-  const int start_h = (begin_mask & 4) ? 0 : starts[2];
-  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
-  const int start_w = (begin_mask & 2) ? 0 : starts[1];
-  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
-  const int start_d = (begin_mask & 1) ? 0 : starts[0];
-  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  TFLITE_DCHECK_EQ(stop_indices.size(), 4);
+  TFLITE_DCHECK_EQ(strides.size(), 4);
+  const int start_b = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 3);
+  const int stop_b = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 3);
+  const int start_h = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 2);
+  const int stop_h = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 2);
+  const int start_w = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 1);
+  const int stop_w = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 1);
+  const int start_d = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 0);
+  const int stop_d = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 0);
 
   T* out_ptr = output_data;
-  if (strides[0] == 0) {
-    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-          const int len = stop_d - start_d;
-          memcpy(out_ptr,
-                 input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
-                 len * sizeof(T));
-          out_ptr += len;
-        }
-      }
-    }
-  } else {
-    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-          for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
-            *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
-          }
+  for (int in_b = start_b;
+       !strided_slice::LoopCondition(in_b, stop_b, strides[3]);
+       in_b += strides[3]) {
+    for (int in_h = start_h;
+         !strided_slice::LoopCondition(in_h, stop_h, strides[2]);
+         in_h += strides[2]) {
+      for (int in_w = start_w;
+           !strided_slice::LoopCondition(in_w, stop_w, strides[1]);
+           in_w += strides[1]) {
+        for (int in_d = start_d;
+             !strided_slice::LoopCondition(in_d, stop_d, strides[0]);
+             in_d += strides[0]) {
+          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
       }
     }
@@ -4638,6 +6097,107 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+void Transpose(const T* input, const Dims<4>& input_dims, T* output,
+               const Dims<4>& output_dims, const int* permuted_axes) {
+  int out_sizes[4];
+  // Compute the inverse permutation array so we can do an output centered
+  // transpose. Also, check to make sure output_dims is matching input_dims.
+  for (int k = 0; k < 4; k++) {
+    out_sizes[k] =
+        MatchingArraySize(input_dims, permuted_axes[k], output_dims, k);
+  }
+
+  // Naive transpose loop (iterate on output index and compute input index).
+  int o[4];  // loop index (on output).
+  int i[4];
+  for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
+    i[permuted_axes[3]] = o[3];
+    for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
+      i[permuted_axes[2]] = o[2];
+      for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
+        i[permuted_axes[1]] = o[1];
+        for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
+          i[permuted_axes[0]] = o[0];
+          output[Offset(output_dims, o)] = input[Offset(input_dims, i)];
+        }
+      }
+    }
+  }
+}
+
+inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, float* output_data,
+                          const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("TransposeConv");
+  // THIS FUNCTION IS A COPY FROM reference_ops.h.
+  // To optimize, start by using the conv code with transposed weights for the
+  // case of stride_height = stride_width = 1.
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+
+  // Although transpose convolution simplifies to convolution with transposed
+  // weights for strides of 1, non-unitary striding complicates matters. To
+  // keep this reference implementation as clear as possible, we use a "scatter"
+  // access pattern, where we loop through all the input elements, computing
+  // their influence on the output, rather than looping through the output
+  // elements in the typical "gather" access pattern of a conv. We therefore
+  // must initialize the output array to zero.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
+              0.0f;
+        }
+      }
+    }
+  }
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < input_depth;
+                   ++out_channel) {
+                // Compute output element location
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  float input_value = input_data[Offset(input_dims, in_channel,
+                                                        in_x, in_y, batch)];
+                  float filter_value =
+                      filter_data[Offset(filter_dims, out_channel, filter_x,
+                                         filter_y, in_channel)];
+                  output_data[Offset(output_dims, out_channel, out_x, out_y,
+                                     batch)] += input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
index 4e324a5e107cf5a90c0042331899edab831c8e51..19220470f4ef73a6b1ee7a09a2e1acc3fed2f888 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-#define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -117,6 +117,14 @@ void PortableZeroVector(float* vector, int v_size);
 // Limit a float input f between +abs_limit and -abs_limit.
 float PortableClip(float f, float abs_limit);
 
+// Symmetric quantizer.
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min,
+                                     float* max, float* scaling_factor);
+void NeonSymmetricQuantizeFloats(const float* values, const int size,
+                                 int8_t* quantized_values, float* min,
+                                 float* max, float* scaling_factor);
+
 // Shift left a vector in place with v_size size.
 void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
 void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
@@ -135,4 +143,4 @@ void NeonReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index 18be6777a5caeb45a4ffabd8b7f1793de7b053f8..b0951aac8cbb98a181d9dcaef88770fadfc74f62 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -78,6 +78,22 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
                                    quantized_multiplier, left_shift);
 }
 
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift) {
+  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
+                           quantized_multiplier, left_shift);
+
+  // Also calculate what amounts to the inverse scaling factor for the input.
+  const double real_reverse_scaling_divisor =
+      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
+  tflite::QuantizeMultiplierSmallerThanOne(real_reverse_scaling_divisor,
+                                           reverse_scaling_divisor,
+                                           reverse_scaling_right_shift);
+}
+
 int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
                                     (1ll << (31 - input_integer_bits)) /
@@ -88,4 +104,25 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   return static_cast<int>(std::floor(max_input_rescaled));
 }
 
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* scale) {
+  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  *scale = (max - min) / (quant_max_float - quant_min_float);
+  const float zero_point_from_min = quant_min_float - min / *scale;
+  uint16 nudged_zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    nudged_zero_point = static_cast<uint16>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    nudged_zero_point = static_cast<uint16>(quant_max);
+  } else {
+    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
+  }
+  *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
+  *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index ba06bc0975b6847b24592daa60efe99983d03707..4a217515f142b2451ebd61e423871b95cdc09748 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -12,13 +12,156 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
-#define PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
 
+#include <cmath>
 #include <cstdint>
+#include <limits>
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
 
 namespace tflite {
 
+// Given the min and max values of a float array, return
+// reasonable quantization parameters to use for this array.
+template <typename T>
+QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
+  const T qmin = std::numeric_limits<T>::min();
+  const T qmax = std::numeric_limits<T>::max();
+  const double qmin_double = qmin;
+  const double qmax_double = qmax;
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  TFLITE_CHECK_LE(rmin, 0.);
+  TFLITE_CHECK_GE(rmax, 0.);
+  if (rmin == rmax) {
+    // Special case where the min,max range is a point. Should be {0}.
+    TFLITE_CHECK_EQ(rmin, 0.);
+    TFLITE_CHECK_EQ(rmax, 0.);
+    QuantizationParams quantization_params;
+    quantization_params.zero_point = 0;
+    quantization_params.scale = 0.;
+    return quantization_params;
+  }
+
+  // General case.
+  //
+  // First determine the scale.
+  const double scale = (rmax - rmin) / (qmax_double - qmin_double);
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  const double zero_point_from_min = qmin_double - rmin / scale;
+  const double zero_point_from_max = qmax_double - rmax / scale;
+  const double zero_point_from_min_error =
+      std::abs(qmin_double) + std::abs(rmin / scale);
+  const double zero_point_from_max_error =
+      std::abs(qmax_double) + std::abs(rmax / scale);
+
+  const double zero_point_double =
+      zero_point_from_min_error < zero_point_from_max_error
+          ? zero_point_from_min
+          : zero_point_from_max;
+
+  // Now we need to nudge the zero point to be an integer
+  // (our zero points are integer, and this is motivated by the requirement
+  // to be able to represent the real value "0" exactly as a quantized value,
+  // which is required in multiple places, for example in Im2col with SAME
+  // padding).
+  T nudged_zero_point = 0;
+  if (zero_point_double < qmin_double) {
+    nudged_zero_point = qmin;
+  } else if (zero_point_double > qmax_double) {
+    nudged_zero_point = qmax;
+  } else {
+    nudged_zero_point = static_cast<T>(round(zero_point_double));
+  }
+  // The zero point should always be in the range of quantized value,
+  // [qmin, qmax].
+  TFLITE_CHECK_GE(nudged_zero_point, qmin);
+  TFLITE_CHECK_LE(nudged_zero_point, qmax);
+
+  // Finally, store the result nudged quantization params.
+  QuantizationParams quantization_params;
+  quantization_params.zero_point = nudged_zero_point;
+  quantization_params.scale = scale;
+  return quantization_params;
+}
+
+// Converts a floating-point number to an integer. For all inputs x where
+// static_cast<IntOut>(x) is legal according to the C++ standard, the result
+// is identical to that cast (i.e. the result is x with its fractional part
+// truncated whenever that is representable as IntOut).
+//
+// static_cast would cause undefined behavior for the following cases, which
+// have well-defined behavior for this function:
+//
+//  1. If x is NaN, the result is zero.
+//
+//  2. If the truncated form of x is above the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::max().
+//
+//  3. If the truncated form of x is below the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::min().
+//
+// Note that cases #2 and #3 cover infinities as well as finite numbers.
+//
+// The range of FloatIn must include the range of IntOut, otherwise
+// the results are undefined.
+// TODO(sfeuz): Replace by absl::SafeCast once available.
+template <class IntOut, class FloatIn>
+IntOut SafeCast(FloatIn x) {
+  static_assert(!std::numeric_limits<FloatIn>::is_integer,
+                "FloatIn is integer");
+  static_assert(std::numeric_limits<IntOut>::is_integer,
+                "IntOut is not integer");
+  static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
+
+  // Special case NaN, for which the logic below doesn't work.
+  if (std::isnan(x)) {
+    return 0;
+  }
+
+  // Negative values all clip to zero for unsigned results.
+  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
+    return 0;
+  }
+
+  // Handle infinities.
+  if (std::isinf(x)) {
+    return x < 0 ? std::numeric_limits<IntOut>::min()
+                 : std::numeric_limits<IntOut>::max();
+  }
+
+  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
+  // unless x is zero in which case exp == 0. Note that this implies that the
+  // magnitude of x is strictly less than 2^exp.
+  int exp = 0;
+  std::frexp(x, &exp);
+
+  // Let N be the number of non-sign bits in the representation of IntOut. If
+  // the magnitude of x is strictly less than 2^N, the truncated version of x
+  // is representable as IntOut. The only representable integer for which this
+  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
+  // by the fall-through below.
+  if (exp <= std::numeric_limits<IntOut>::digits) {
+    return x;
+  }
+
+  // Handle numbers with magnitude >= 2^N.
+  return x < 0 ? std::numeric_limits<IntOut>::min()
+               : std::numeric_limits<IntOut>::max();
+}
+
 // Decompose a double multiplier into a Q0.31 int32 representation of its
 // significand, and shift representation of NEGATIVE its exponent ---
 // this is intended as a RIGHT-shift.
@@ -53,14 +196,27 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
 void PreprocessSoftmaxScaling(double beta, double input_scale,
                               int input_integer_bits,
                               int32_t* quantized_multiplier, int* left_shift);
-
+// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift);
 // Calculate the largest input that will result in a within-bounds intermediate
 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
 // it must not overflow before we reduce the value by multiplication by the
-// input multiplier.  The negative radius is used as the minimum difference
-// in Softmax.
+// input multiplier.  The negative radius is used as the minimum difference in
+// Softmax.
 int CalculateInputRadius(int input_integer_bits, int input_left_shift);
 
+// Nudges a min/max quantization range to ensure zero is zero.
+// Gymnastics with nudged zero point is to ensure that real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+// Outputs nudged_min, nudged_max, nudged_scale.
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max, float* scale);
+
 }  // namespace tflite
 
-#endif  // PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index 19b1b408ec74b0939065b0ad10b91ecfc2cd4765..3e9a3c29ee26e96612bb05eb9cd1e1badad10c7a 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -22,6 +22,177 @@ namespace {
 
 using ::testing::Pair;
 
+template <class FloatIn, class IntOut>
+void RunSafeCastTests() {
+  const IntOut imax = std::numeric_limits<IntOut>::max();
+  EXPECT_GT(imax, 0);
+  const IntOut imin = std::numeric_limits<IntOut>::min();
+  const bool s = std::numeric_limits<IntOut>::is_signed;
+  if (s) {
+    EXPECT_LT(imin, 0);
+  } else {
+    EXPECT_EQ(0, imin);
+  }
+
+  // Some basic tests.
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(0.0)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-0.0)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(0.99)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(1.0)), 1);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(1.01)), 1);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(1.99)), 1);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(2.0)), 2);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(2.01)), 2);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-0.99)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-1.0)), s ? -1 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-1.01)), s ? -1 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-1.99)), s ? -1 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-2.0)), s ? -2 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-2.01)), s ? -2 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(117.9)), 117);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(118.0)), 118);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(118.1)), 118);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-117.9)), s ? -117 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-118.0)), s ? -118 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-118.1)), s ? -118 : 0);
+
+  // Some edge cases.
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::max()), imax);
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::lowest()), imin);
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::infinity()), imax);
+  EXPECT_EQ(SafeCast<IntOut>(-std::numeric_limits<FloatIn>::infinity()), imin);
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::quiet_NaN()), 0);
+
+  // Some larger numbers.
+  if (sizeof(IntOut) >= 4 && sizeof(FloatIn) > 4) {
+    EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(0x76543210)), 0x76543210);
+  }
+
+  if (sizeof(FloatIn) > sizeof(IntOut)) {
+    // Check values near imax.
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 0.1)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 0.99)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 1.0)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 1.99)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 2.0)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 0.1)),
+              imax - 1);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 0.99)),
+              imax - 1);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 1.0)),
+              imax - 1);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 1.01)),
+              imax - 2);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 1.99)),
+              imax - 2);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 2.0)),
+              imax - 2);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 2.01)),
+              imax - 3);
+  }
+
+  // Check values considerably larger in magnitude than imin and imax
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imax) * 2)),
+      imax);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imax) * 20)),
+      imax);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imax) * 100)),
+      imax);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imin) * 2)),
+      imin);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imin) * 20)),
+      imin);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imin) * 100)),
+      imin);
+}
+
+TEST(QuantizationUtilTest, SafeCast) {
+  RunSafeCastTests<float, int8>();
+  RunSafeCastTests<double, int8>();
+  RunSafeCastTests<float, int16>();
+  RunSafeCastTests<double, int16>();
+  RunSafeCastTests<float, int32>();
+  RunSafeCastTests<double, int32>();
+  RunSafeCastTests<float, int64>();
+  RunSafeCastTests<double, int64>();
+  RunSafeCastTests<float, uint8>();
+  RunSafeCastTests<double, uint8>();
+  RunSafeCastTests<float, uint16>();
+  RunSafeCastTests<double, uint16>();
+  RunSafeCastTests<float, uint32>();
+  RunSafeCastTests<double, uint32>();
+  RunSafeCastTests<float, uint64>();
+  RunSafeCastTests<double, uint64>();
+}
+
+// Example taken from http://www.tensorflow.org/performance/quantization
+//
+//  Quantized | Float
+//  --------- | -----
+//  0         | -10.0
+//  255       | 30.0
+//  128       | 10.0
+TEST(QuantizationUtilTest, ChooseQuantizationParams) {
+  QuantizationParams qp = ChooseQuantizationParams<uint8>(-10.0, 30.0);
+  EXPECT_NEAR(qp.scale, 0.156863, 1e-5);
+  EXPECT_EQ(qp.zero_point, 64);
+}
+
+TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroPointOnMinBoundary) {
+  QuantizationParams qp = ChooseQuantizationParams<uint8>(0.0, 30.0);
+  EXPECT_NEAR(qp.scale, 0.117647, 1e-5);
+  EXPECT_EQ(qp.zero_point, 0);
+}
+
+TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroNotInRange) {
+  // Assumption is that zero is within the range.
+  EXPECT_DEATH(ChooseQuantizationParams<uint8>(10.0, 30.0), "");
+}
+
+TEST(QuantizationUtilTest, ChooseQuantizationParamsEmptyRangePositive) {
+  // Assumption is that zero is within the range.
+  EXPECT_DEATH(ChooseQuantizationParams<uint8>(30.0, 30.0), "");
+}
+
+TEST(QuantizationUtilTest, ChooseQuantizationParamsEmptyRangeZero) {
+  QuantizationParams qp = ChooseQuantizationParams<uint8>(0.0, 0.0);
+  EXPECT_NEAR(qp.scale, 0.0, 1e-5);
+  EXPECT_EQ(qp.zero_point, 0);
+}
+
+TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroPointOnMaxBoundary) {
+  QuantizationParams qp = ChooseQuantizationParams<uint8>(-10.0, 0.0);
+  EXPECT_NEAR(qp.scale, 0.039216, 1e-5);
+  EXPECT_EQ(qp.zero_point, 255);
+}
+
+TEST(QuantizationUtilTest, ChooseQuantizationParamsInvalidRange) {
+  EXPECT_DEATH(ChooseQuantizationParams<uint8>(10.0, -30.0), "");
+}
+
 TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOne) {
   auto quantize = [](double d) {
     int32_t q;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
index c5b0bccc9da5fa2ff9c3a9d430725b613435abf1..5e7586eeda7f2174d8ee0fc2f4bf2363cb75ecd6 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <stdlib.h>
 #include <string.h>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -27,6 +29,28 @@ float PortableClip(float f, float abs_limit) {
   return result;
 }
 
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min,
+                                     float* max, float* scaling_factor) {
+  auto minmax = std::minmax_element(values, values + size);
+  *min = *minmax.first;
+  *max = *minmax.second;
+  const int kScale = 127;
+  const float range = std::max(std::abs(*min), std::abs(*max));
+  if (range == 0) {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    return;
+  }
+  *scaling_factor = kScale / range;
+  for (int i = 0; i < size; ++i) {
+    const int32_t quantized_value =
+        static_cast<int32_t>(TfLiteRound(*scaling_factor * values[i]));
+    // Clamp: just in case some odd numeric offset.
+    quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+  }
+}
+
 void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                  int m_rows, int m_cols,
                                                  const float* vector,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index c05c21b472b05f2cbe133adf94d91ab0c6d9ef40..478cda8e1939718f68a0cd6d547af30f4c81d950 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -25,6 +25,10 @@ namespace tensor_utils {
 // Limit a float input f between +abs_limit and -abs_limit.
 float PortableClip(float f, float abs_limit);
 
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min,
+                                     float* max, float* scaling_factor);
+
 // Multiply a matrix by a batch vector, and store results in a batch-size
 // vector.
 void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
@@ -103,6 +107,13 @@ void PortableReductionSumVector(const float* input_vector, float* output_vector,
 
 float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
 
+void SymmetricQuantizeFloats(const float* values, const int size,
+                             int8_t* quantized_values, float* min, float* max,
+                             float* scaling_factor) {
+  return PortableSymmetricQuantizeFloats(values, size, quantized_values, min,
+                                         max, scaling_factor);
+}
+
 void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                          int m_cols, const float* vector,
                                          int n_batch, float* result,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 0c6f198170384556a4ab0287512524eab17f064e..d5abf904fda4543a1ecc4ba6a7fb558d720fe8db 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -27,7 +27,9 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
 namespace tflite {
@@ -157,11 +159,11 @@ inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
 inline void Conv(const float* input_data, const Dims<4>& input_dims,
                  const float* filter_data, const Dims<4>& filter_dims,
                  const float* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, float output_activation_min,
-                 float output_activation_max, float* output_data,
-                 const Dims<4>& output_dims, float* im2col_data,
-                 const Dims<4>& im2col_dims) {
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 float output_activation_min, float output_activation_max,
+                 float* output_data, const Dims<4>& output_dims,
+                 float* im2col_data, const Dims<4>& im2col_dims) {
   (void)im2col_data;  // only used in optimized code.
   (void)im2col_dims;  // only used in optimized code.
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
@@ -186,8 +188,9 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + filter_x;
-                const int in_y = in_y_origin + filter_y;
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
                 // If the location is outside the bounds of the input image,
                 // use zero as a default value.
                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
@@ -216,6 +219,23 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int dilation_width_factor,
+          int dilation_height_factor, int pad_width, int pad_height,
+          float* output_data, const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, dilation_width_factor,
+       dilation_height_factor, pad_width, pad_height, output_activation_min,
+       output_activation_max, output_data, output_dims, im2col_data,
+       im2col_dims);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void Conv(const float* input_data, const Dims<4>& input_dims,
@@ -227,7 +247,7 @@ void Conv(const float* input_data, const Dims<4>& input_dims,
   float output_activation_min, output_activation_max;
   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
   Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
-       stride_width, stride_height, pad_width, pad_height,
+       stride_width, stride_height, 1, 1, pad_width, pad_height,
        output_activation_min, output_activation_max, output_data, output_dims,
        im2col_data, im2col_dims);
 }
@@ -241,7 +261,7 @@ void Conv(const float* input_data, const Dims<4>& input_dims,
           const Dims<4>& output_dims, float* im2col_data,
           const Dims<4>& im2col_dims) {
   Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
-           bias_dims, stride, stride, pad_width, pad_height, output_data,
+           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
            output_dims, im2col_data, im2col_dims);
 }
 
@@ -386,6 +406,7 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
           const int in_d =
               out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                           output_depth;
+
           const int in_w = out_w / block_size;
           const int in_h = out_h / block_size;
           const int in_b = out_b;
@@ -533,6 +554,203 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, int16* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(filter_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; ++d) {
+        int16 input_val = input_data[b * accum_depth + d] + input_offset;
+        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
+        accum += filter_val * input_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum = MultiplyByQuantizedMultiplier(accum, output_multiplier,
+                                            -output_shift);
+      // Saturate, cast to int16, and store to output array.
+      accum = std::max(accum, output_activation_min - output_offset);
+      accum = std::min(accum, output_activation_max - output_offset);
+      accum += output_offset;
+      output_data[out_c + output_depth * b] = accum;
+    }
+  }
+}
+
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
+
+  // Shuffling and xoring of input activations into the workspace buffer
+  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+  if (batches == 1) {
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+  } else if (batches == 4) {
+    for (int c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8* src_data_ptr = input_data + b * accum_depth + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
+        }
+      }
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+
+  // Actual computation
+  if (batches == 1) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8 input_val = shuffled_input_data[d + j];
+            int8 weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
+        }
+      }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32 accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                            -output_shift);
+        // Saturate, cast to int16, and store to output array.
+        acc = std::max(acc, output_activation_min);
+        acc = std::min(acc, output_activation_max);
+        output_ptr[c + i] = acc;
+      }
+    }
+  } else if (batches == 4) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr = shuffled_input_data;
+      // Accumulation loop.
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8 input_val = shuffled_input_ptr[16 * b + j];
+              int8 weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32 accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              -output_shift);
+          // Saturate, cast to int16, and store to output array.
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_ptr[b * output_depth + c + i] = acc;
+        }
+      }
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
@@ -567,27 +785,14 @@ void NonGlobalBatchNormalization(
     const Dims<4>& offset_dims, float* output_data,
     const Dims<4>& output_dims) {
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
-                        offset_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
-                        offset_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
+  const int inner_size = MatchingFlatSizeSkipDim(
+      input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims);
 
   for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, x, y, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
-              offset_data[Offset(offset_dims, c, x, y, 0)]);
-        }
-      }
+    for (int i = 0; i < inner_size; ++i) {
+      output_data[b * inner_size + i] = ActivationFunction<Ac>(
+          (input_data[b * inner_size + i] - mean_data[i]) * multiplier_data[i] +
+          offset_data[i]);
     }
   }
 }
@@ -601,87 +806,52 @@ void GlobalBatchNormalization(const float* input_data,
                               const float* offset_data,
                               const Dims<4>& offset_dims, float* output_data,
                               const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth =
       MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                         offset_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
-              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = ActivationFunction<Ac>(
+          (input_data[depth * i + c] - mean_data[c]) * multiplier_data[c] +
+          offset_data[c]);
     }
   }
 }
 
 inline void Relu(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float lower = 0;
-          float clamped = val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float lower = 0;
+    const float clamped = val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 1;
-          const float lower = -1;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 1;
+    const float lower = -1;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 6;
-          const float lower = 0;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 6;
+    const float lower = 0;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
@@ -689,24 +859,17 @@ template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   static_assert(Ac == FusedActivationFunctionType::kNone, "");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        float squared_l2_norm = 0;
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          squared_l2_norm += val * val;
-        }
-        float l2_norm = std::sqrt(squared_l2_norm);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[depth * i + c];
+      squared_l2_norm += val * val;
+    }
+    const float l2_norm = std::sqrt(squared_l2_norm);
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
     }
   }
 }
@@ -791,26 +954,11 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] +
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] + input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -885,6 +1033,49 @@ inline void Add(int left_shift, const uint8* input1_data,
   }
 }
 
+template <FusedActivationFunctionType Ac>
+inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
+                int input1_shift, const int16* input2_data,
+                const Dims<4>& input2_dims, int input2_shift,
+                int16 output_activation_min, int16 output_activation_max,
+                int16* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, -32768);
+    TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  }
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+
+  TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
+  TFLITE_DCHECK_GE(input1_shift, 0);
+  TFLITE_DCHECK_GE(input2_shift, 0);
+  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
+  const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+    F0 scaled_input =
+        F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
+    const int16 raw_output = result.raw();
+    const int16 clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = clamped_output;
+  }
+}
+
 // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -1030,26 +1221,11 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] *
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] * input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1166,6 +1342,53 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
+                const int16* input2_data, const Dims<4>& input2_dims,
+                int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16");
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    output_data[i] = unclamped_result.raw();
+  }
+}
+
+inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
+                const int16* input2_data, const Dims<4>& input2_dims,
+                int32 output_offset, int32 output_activation_min,
+                int32 output_activation_max, uint8* output_data,
+                const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size = RequiredBufferSizeForDims(output_dims);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
+  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16 rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16 clamped_result =
+        std::min<int16>(output_activation_max - output_offset, rescaled_result);
+    clamped_result =
+        std::max<int16>(output_activation_min - output_offset, clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
@@ -1208,38 +1431,164 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
-}
-
-template <FusedActivationFunctionType Ac, typename Scalar>
-void Concatenation(int concat_dim, const Scalar* const* input_data,
-                   const Dims<4>* const* input_dims, int inputs_count,
-                   Scalar* output_data, const Dims<4>& output_dims) {
-  TFLITE_DCHECK_GT(inputs_count, 1);
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], output_activation_min,
+        output_activation_max);
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sub, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void Concatenation(int concat_dim, const Scalar* const* input_data,
+                   const Dims<4>* const* input_dims, int inputs_count,
+                   Scalar* output_data, const Dims<4>& output_dims) {
+  TFLITE_DCHECK_GT(inputs_count, 1);
   int concat_size = 0;
   for (int i = 0; i < inputs_count; i++) {
     for (int j = 0; j < 4; j++) {
@@ -1267,6 +1616,61 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the optimized implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int64_t outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
@@ -1458,15 +1862,9 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
   (void)gemm_context;  // only used in optimized code.
 
   // Gather dimensions information, and perform consistency checks.
-  const int batches =
-      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
-                        output_state_dims, 3, output_activ_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
-                        output_state_dims, 2, output_activ_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
-                        output_state_dims, 1, output_activ_dims, 1);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims,
+                              output_state_dims, output_activ_dims);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -1482,9 +1880,7 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                         output_state_dims, 0, output_activ_dims, 0);
   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
-  const int fc_batches = ArraySize(activ_temp_dims, 1) *
-                         ArraySize(activ_temp_dims, 2) *
-                         ArraySize(activ_temp_dims, 3);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0);
   const int fc_output_depth =
       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
   const int fc_accum_depth = ArraySize(weights_dims, 0);
@@ -1529,7 +1925,6 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   // and muls, all done in 16-bit fixed-point.
-  const int outer_size = batches * width * height;
   for (int b = 0; b < outer_size; ++b) {
     for (int c = 0; c < output_depth; ++c) {
       // Define the fixed-point data types that we will use here. All use
@@ -2064,28 +2459,20 @@ inline void LocalResponseNormalization(const float* input_data,
                                        float bias, float alpha, float beta,
                                        float* output_data,
                                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const int begin_input_c = std::max(0, c - range);
-          const int end_input_c = std::min(depth, c + range);
-          float accum = 0.f;
-          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
-            const float input_val =
-                input_data[Offset(input_dims, input_c, x, y, b)];
-            accum += input_val * input_val;
-          }
-          const float multiplier = std::pow(bias + alpha * accum, -beta);
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      const int begin_input_c = std::max(0, c - range);
+      const int end_input_c = std::min(depth, c + range);
+      float accum = 0.f;
+      for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
+        const float input_val = input_data[i * depth + input_c];
+        accum += input_val * input_val;
       }
+      const float multiplier = std::pow(bias + alpha * accum, -beta);
+      output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
     }
   }
 }
@@ -2093,37 +2480,28 @@ inline void LocalResponseNormalization(const float* input_data,
 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                     float beta, float* output_data,
                     const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                          beta);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp((input_data[i * depth + c] - max) * beta);
+    }
 
-        // Compute result.
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                       beta) /
-              sum;
-        }
-      }
+    // Compute result.
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] =
+          std::exp((input_data[i * depth + c] - max) * beta) / sum;
     }
   }
 }
@@ -2144,158 +2522,257 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int x = 0; x < width; ++x) {
-      for (int y = 0; y < height; ++y) {
-        uint8 max_in_row = 0;
-        for (int c = 0; c < depth; ++c) {
-          max_in_row =
-              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
 
-        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-            sum_of_exps =
-                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
-                                  exp_on_negative_values(scaled_diff_f8));
-          }
-        }
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
 
-        int32 fixed_sum_of_exps = sum_of_exps.raw();
-        int headroom_plus_one =
-            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-        // This is the number of bits to the left of the binary point above 1.0.
-        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
-        // no later adjustment will be needed.
-        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-        int32 shifted_sum_minus_one = static_cast<int32>(
-            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-            (static_cast<uint32>(1) << 31));
+    int32 fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one =
+        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
+    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+    int32 shifted_sum_minus_one = static_cast<int32>(
+        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+        (static_cast<uint32>(1) << 31));
+
+    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+        FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
 
-        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-            FixedPoint0::FromRaw(shifted_sum_minus_one));
+      } else {
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
 
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
-            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
-            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
-                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
-            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
-                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
+                       float* output_data, const Dims<4>& output_dims) {
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-          } else {
-            output_data[Offset(output_dims, c, x, y, b)] = 0;
-          }
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
+
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp(input_data[i * depth + c] - max);
+    }
+
+    // Compute result.
+    const float log_sum = std::log(sum);
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
     }
   }
 }
 
-inline void Logistic(const float* input_data, const Dims<4>& input_dims,
-                     float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = 1.f / (1.f + std::exp(-val));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
       }
     }
   }
 }
 
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = 1.f / (1.f + std::exp(-val));
+    output_data[i] = result;
+  }
+}
+
 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+      // Convert from Q0.31 to Q23.8.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as U0.8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
+  }
+}
+
+inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
+                     int16* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+    const F3 input = F3::FromRaw(input_data[i]);
+    F0 output = gemmlowp::logistic(input);
+    output_data[i] = output.raw();
   }
 }
 
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = std::tanh(val);
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = std::tanh(val);
+    output_data[i] = result;
   }
 }
 
@@ -2304,45 +2781,70 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_multiplier, int input_left_shift,
                  uint8* output_data, const Dims<4>& output_dims) {
   const int32 output_zero_point = 128;
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
-
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
-            output_val_s32 += output_zero_point;
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      // Convert from Q0.31 to Q24.7.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as Q0.7, encoded in uint8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
+    }
+    output_data[i] = output_val;
+  }
+}
+
+inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
+                 int input_left_shift, int16* output_data,
+                 const Dims<4>& output_dims) {
+  // Support for shifts is limited until we have a parameterized version of
+  // SaturatingRoundingMultiplyByPOT().
+  TFLITE_DCHECK_GE(input_left_shift, 0);
+  TFLITE_DCHECK_LE(input_left_shift, 1);
+
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  // F0 uses 0 integer bits, range [-1, 1].
+  // This is the return type of math functions such as tanh, logistic,
+  // whose range is in [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+  using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+  if (input_left_shift == 0) {
+    for (int i = 0; i < flat_size; i++) {
+      F3 input = F3::FromRaw(input_data[i]);
+      F0 output = gemmlowp::tanh(input);
+      output_data[i] = output.raw();
+    }
+  } else {
+    for (int i = 0; i < flat_size; i++) {
+      F3 input = F3::FromRaw(
+          gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
+      F0 output = gemmlowp::tanh(input);
+      output_data[i] = output.raw();
     }
   }
 }
@@ -2350,138 +2852,62 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int32 val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = static_cast<float>(scale * (val - zero_point));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int32 val = input_data[i];
+    float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
   }
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
-
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
-          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
-        }
-      }
-    }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
+
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+  for (int i = 0; i < flat_size; i++) {
+    const float src_val = input_data[i];
+    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
+    const float clamped_shifted = clamped - nudged_min;
+    const float dst_val =
+        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+        nudged_min;
+    output_data[i] = dst_val;
   }
 }
 
 template <typename SrcT, typename DstT>
 inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                  DstT* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = static_cast<DstT>(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = static_cast<DstT>(input_data[offset]);
   }
 }
 
 inline void Floor(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = std::floor(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::floor(input_data[offset]);
   }
 }
 
@@ -2611,24 +3037,37 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
   const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
   const int input_batch_size = ArraySize(input_dims, 3);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int depth = ArraySize(input_dims, 0);
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
+  const int crops_top = crops_data[0];
+  const int crops_left = crops_data[2];
 
   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
     for (int in_h = 0; in_h < input_height; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      if (out_h < 0 || out_h >= output_height) {
+        continue;
+      }
       for (int in_w = 0; in_w < input_width; ++in_w) {
-        int out_batch = in_batch % output_batch_size;
-        int out_w = in_w * block_shape_width +
-                    (in_batch / output_batch_size) % block_shape_width;
-        int out_h = in_h * block_shape_height +
-                    (in_batch / output_batch_size) / block_shape_width;
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+
+        if (out_w < 0 || out_w >= output_width) {
+          continue;
+        }
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
         memcpy(out, in, depth * sizeof(T));
@@ -2641,7 +3080,10 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -2671,7 +3113,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = 0;
+            *out_ptr++ = static_cast<T>(pad_value);
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -2681,59 +3123,56 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-inline bool LoopCondition(int index, int stop, int stride) {
-  return stride > 0 ? index < stop : index > stop;
-}
-
-inline int StartIndex(int start, int stride, int dim, bool masked) {
-  return masked ? (stride > 0 ? 0 : dim - 1) : start;
-}
-
-inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
-                     bool shrink_axis_masked) {
-  return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
-                            : masked ? (stride > 0 ? dim : -1) : stop;
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
 }
 
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask, int shrink_axis_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  TFLITE_DCHECK_EQ(starts.size(), 4);
-  TFLITE_DCHECK_EQ(stops.size(), 4);
+  // Note that the axis orders are reversed for runtime ops, so the indices,
+  // strides and masks must be as well too.
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  TFLITE_DCHECK_EQ(stop_indices.size(), 4);
   TFLITE_DCHECK_EQ(strides.size(), 4);
-  const int start_b =
-      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
-  const int stop_b =
-      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
-                end_mask & 8, shrink_axis_mask & 8);
-  const int start_h =
-      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
-  const int stop_h =
-      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
-                end_mask & 4, shrink_axis_mask & 4);
-  const int start_w =
-      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
-  const int stop_w =
-      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
-                end_mask & 2, shrink_axis_mask & 2);
-  const int start_d =
-      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
-  const int stop_d =
-      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
-                end_mask & 1, shrink_axis_mask & 1);
+  const int start_b = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 3);
+  const int stop_b = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 3);
+  const int start_h = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 2);
+  const int stop_h = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 2);
+  const int start_w = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 1);
+  const int stop_w = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 1);
+  const int start_d = strided_slice::StartForAxis(begin_mask, start_indices,
+                                                  strides, input_dims.sizes, 0);
+  const int stop_d = strided_slice::StopForAxis(end_mask, stop_indices, strides,
+                                                input_dims.sizes, 0);
 
   T* out_ptr = output_data;
-  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
+  for (int in_b = start_b;
+       !strided_slice::LoopCondition(in_b, stop_b, strides[3]);
        in_b += strides[3]) {
-    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
+    for (int in_h = start_h;
+         !strided_slice::LoopCondition(in_h, stop_h, strides[2]);
          in_h += strides[2]) {
-      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
+      for (int in_w = start_w;
+           !strided_slice::LoopCondition(in_w, stop_w, strides[1]);
            in_w += strides[1]) {
-        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
+        for (int in_d = start_d;
+             !strided_slice::LoopCondition(in_d, stop_d, strides[0]);
              in_d += strides[0]) {
           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
@@ -2742,18 +3181,6 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
-                         const std::vector<int>& strides, T* output_data,
-                         const Dims<4>& output_dims) {
-  StridedSlice(input_data, input_dims, begin_mask, end_mask,
-               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
-               output_dims);
-}
-
 template <typename T>
 inline void Slice(const T* input_data, const Dims<4>& input_dims,
                   const std::vector<int>& begin, const std::vector<int>& size,
@@ -2794,19 +3221,20 @@ inline void Exp(const T* input_data, const size_t num_elements,
   }
 }
 
-template <typename T>
-inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
+template <typename T, typename U>
+inline bool Mean(T* input_data, const int* input_dims, const int input_num_dims,
                  T* output_data, const int* output_dims,
                  const int output_num_dims, const int* axis,
                  const int num_axis_dimensions, bool keep_dims, int* temp_index,
-                 int* resolved_axis) {
+                 int* resolved_axis, U* temp_sum) {
   // resets output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
     num_outputs *= static_cast<size_t>(output_dims[idx]);
   }
   for (size_t idx = 0; idx < num_outputs; ++idx) {
-    output_data[idx] = 0;
+    output_data[idx] = T();
+    temp_sum[idx] = U();
   }
   // resets temp index.
   for (int idx = 0; idx < input_num_dims; ++idx) {
@@ -2839,17 +3267,24 @@ inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
     size_t output_offset =
         ReducedOutputOffset(input_num_dims, input_dims, temp_index,
                             num_resolved_axis, resolved_axis);
-    output_data[output_offset] += input_data[input_offset];
+    temp_sum[output_offset] += static_cast<U>(input_data[input_offset]);
   }
   // takes average by num of elements added to get mean.
   size_t num_elements_in_axis = 1;
   for (int idx = 0; idx < num_resolved_axis; ++idx) {
-    num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+      return false;
+    }
+    num_elements_in_axis *= current;
   }
-  for (size_t idx = 0; idx < num_outputs; ++idx) {
-    output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
-                                      num_elements_in_axis);
+  if (num_elements_in_axis > 0) {
+    for (size_t idx = 0; idx < num_outputs; ++idx) {
+      output_data[idx] =
+          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
+    }
   }
+  return true;
 }
 
 template <typename T>
@@ -2922,23 +3357,11 @@ template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto min_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] > min_value ? min_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
   }
 }
 
@@ -2946,20 +3369,33 @@ template <typename T>
 void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto max_value = input2_data[0];
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
+  }
+}
+
+template <typename T, typename Op>
+void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                              const T* input2_data, const Dims<4>& input2_dims,
+                              T* output_data, const Dims<4>& output_dims,
+                              Op op) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
 
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] < max_value ? max_value : input1_data[offset];
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          auto out_idx = Offset(output_dims, c, x, y, b);
+          auto in1_idx = SubscriptToIndex(desc1, c, x, y, b);
+          auto in2_idx = SubscriptToIndex(desc2, c, x, y, b);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = op(in1_val, in2_val);
         }
       }
     }
@@ -2978,25 +3414,20 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   // input dimensions here. We enforce the constraint that the last dimension
   // must always be 1.
   TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = ArraySize(input_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
-        int max_index = 0;
-        for (int d = 1; d < depth; ++d) {
-          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
-          if (curr_value > max_value) {
-            max_value = curr_value;
-            max_index = d;
-          }
-        }
-        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+
+  for (int i = 0; i < outer_size; ++i) {
+    auto max_value = input_data[i * depth];
+    int max_index = 0;
+    for (int d = 1; d < depth; ++d) {
+      const auto& curr_value = input_data[i * depth + d];
+      if (curr_value > max_value) {
+        max_value = curr_value;
+        max_index = d;
       }
     }
+    output_data[i] = max_index;
   }
 }
 
@@ -3029,6 +3460,112 @@ void Transpose(const T* input, const Dims<4>& input_dims, T* output,
   }
 }
 
+inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, float* output_data,
+                          const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+
+  // Although transpose convolution simplifies to convolution with transposed
+  // weights for strides of 1, non-unitary striding complicates matters. To
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
+  for (int i = 0; i < RequiredBufferSizeForDims(output_dims); i++) {
+    output_data[i] = 0.0f;
+  }
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  float input_value = input_data[Offset(input_dims, in_channel,
+                                                        in_x, in_y, batch)];
+                  float filter_value =
+                      filter_data[Offset(filter_dims, out_channel, filter_x,
+                                         filter_y, in_channel)];
+                  output_data[Offset(output_dims, out_channel, out_x, out_y,
+                                     batch)] += input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Less(int64_t num_elements, const T* input1, const T* input2,
+                 bool* output) {
+  for (int64_t i = 0; i < num_elements; ++i) {
+    output[i] = input1[i] < input2[i];
+  }
+}
+
+template <typename T>
+inline void Less(const T* input1_data, const Dims<4>& input1_dims,
+                 const T* input2_data, const Dims<4>& input2_dims,
+                 bool* output_data, const Dims<4>& output_dims) {
+  const int64_t batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int64_t height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int64_t width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int64_t depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  Less(batches * height * width * depth, input1_data, input2_data, output_data);
+}
+
+template <typename T1, typename T2>
+inline void BroadcastLess(T1* input1_data, const Dims<4>& input1_dims,
+                          T2* input2_data, const Dims<4>& input2_dims,
+                          bool* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastLess");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] <
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4eddf7bf0a2cbca695dae20ba8ba56a9cd72e4ba
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
@@ -0,0 +1,244 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/internal/spectrogram.h"
+
+#include <assert.h>
+#include <math.h>
+
+#include "third_party/fft2d/fft.h"
+
+namespace tflite {
+namespace internal {
+
+using std::complex;
+
+namespace {
+// Returns the default Hann window function for the spectrogram.
+void GetPeriodicHann(int window_length, std::vector<double>* window) {
+  // Some platforms don't have M_PI, so define a local constant here.
+  const double pi = std::atan(1) * 4;
+  window->resize(window_length);
+  for (int i = 0; i < window_length; ++i) {
+    (*window)[i] = 0.5 - 0.5 * cos((2 * pi * i) / window_length);
+  }
+}
+}  // namespace
+
+bool Spectrogram::Initialize(int window_length, int step_length) {
+  std::vector<double> window;
+  GetPeriodicHann(window_length, &window);
+  return Initialize(window, step_length);
+}
+
+inline int Log2Floor(uint n) {
+  if (n == 0) return -1;
+  int log = 0;
+  uint value = n;
+  for (int i = 4; i >= 0; --i) {
+    int shift = (1 << i);
+    uint x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+
+inline int Log2Ceiling(uint n) {
+  int floor = Log2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two
+    return floor;
+  else
+    return floor + 1;
+}
+
+inline uint NextPowerOfTwo(uint value) {
+  int exponent = Log2Ceiling(value);
+  // DCHECK_LT(exponent, std::numeric_limits<uint32>::digits);
+  return 1 << exponent;
+}
+
+bool Spectrogram::Initialize(const std::vector<double>& window,
+                             int step_length) {
+  window_length_ = window.size();
+  window_ = window;  // Copy window.
+  if (window_length_ < 2) {
+    // LOG(ERROR) << "Window length too short.";
+    initialized_ = false;
+    return false;
+  }
+
+  step_length_ = step_length;
+  if (step_length_ < 1) {
+    // LOG(ERROR) << "Step length must be positive.";
+    initialized_ = false;
+    return false;
+  }
+
+  fft_length_ = NextPowerOfTwo(window_length_);
+  // CHECK(fft_length_ >= window_length_);
+  output_frequency_channels_ = 1 + fft_length_ / 2;
+
+  // Allocate 2 more than what rdft needs, so we can rationalize the layout.
+  fft_input_output_.assign(fft_length_ + 2, 0.0);
+
+  int half_fft_length = fft_length_ / 2;
+  fft_double_working_area_.assign(half_fft_length, 0.0);
+  fft_integer_working_area_.assign(2 + static_cast<int>(sqrt(half_fft_length)),
+                                   0);
+  // Set flag element to ensure that the working areas are initialized
+  // on the first call to cdft.  It's redundant given the assign above,
+  // but keep it as a reminder.
+  fft_integer_working_area_[0] = 0;
+  input_queue_.clear();
+  samples_to_next_step_ = window_length_;
+  initialized_ = true;
+  return true;
+}
+
+template <class InputSample, class OutputSample>
+bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<InputSample>& input,
+    std::vector<std::vector<complex<OutputSample>>>* output) {
+  if (!initialized_) {
+    // LOG(ERROR) << "ComputeComplexSpectrogram() called before successful call
+    // "
+    //           << "to Initialize().";
+    return false;
+  }
+  // CHECK(output);
+  output->clear();
+  int input_start = 0;
+  while (GetNextWindowOfSamples(input, &input_start)) {
+    // DCHECK_EQ(input_queue_.size(), window_length_);
+    ProcessCoreFFT();  // Processes input_queue_ to fft_input_output_.
+    // Add a new slice vector onto the output, to save new result to.
+    output->resize(output->size() + 1);
+    // Get a reference to the newly added slice to fill in.
+    auto& spectrogram_slice = output->back();
+    spectrogram_slice.resize(output_frequency_channels_);
+    for (int i = 0; i < output_frequency_channels_; ++i) {
+      // This will convert double to float if it needs to.
+      spectrogram_slice[i] = complex<OutputSample>(
+          fft_input_output_[2 * i], fft_input_output_[2 * i + 1]);
+    }
+  }
+  return true;
+}
+// Instantiate it four ways:
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<float>& input, std::vector<std::vector<complex<float>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<double>& input,
+    std::vector<std::vector<complex<float>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<float>& input,
+    std::vector<std::vector<complex<double>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<double>& input,
+    std::vector<std::vector<complex<double>>>*);
+
+template <class InputSample, class OutputSample>
+bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<InputSample>& input,
+    std::vector<std::vector<OutputSample>>* output) {
+  if (!initialized_) {
+    // LOG(ERROR) << "ComputeSquaredMagnitudeSpectrogram() called before "
+    //           << "successful call to Initialize().";
+    return false;
+  }
+  // CHECK(output);
+  output->clear();
+  int input_start = 0;
+  while (GetNextWindowOfSamples(input, &input_start)) {
+    // DCHECK_EQ(input_queue_.size(), window_length_);
+    ProcessCoreFFT();  // Processes input_queue_ to fft_input_output_.
+    // Add a new slice vector onto the output, to save new result to.
+    output->resize(output->size() + 1);
+    // Get a reference to the newly added slice to fill in.
+    auto& spectrogram_slice = output->back();
+    spectrogram_slice.resize(output_frequency_channels_);
+    for (int i = 0; i < output_frequency_channels_; ++i) {
+      // Similar to the Complex case, except storing the norm.
+      // But the norm function is known to be a performance killer,
+      // so do it this way with explicit real and imagninary temps.
+      const double re = fft_input_output_[2 * i];
+      const double im = fft_input_output_[2 * i + 1];
+      // Which finally converts double to float if it needs to.
+      spectrogram_slice[i] = re * re + im * im;
+    }
+  }
+  return true;
+}
+// Instantiate it four ways:
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<float>& input, std::vector<std::vector<float>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<double>& input, std::vector<std::vector<float>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<float>& input, std::vector<std::vector<double>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<double>& input, std::vector<std::vector<double>>*);
+
+// Return true if a full window of samples is prepared; manage the queue.
+template <class InputSample>
+bool Spectrogram::GetNextWindowOfSamples(const std::vector<InputSample>& input,
+                                         int* input_start) {
+  auto input_it = input.begin() + *input_start;
+  int input_remaining = input.end() - input_it;
+  if (samples_to_next_step_ > input_remaining) {
+    // Copy in as many samples are left and return false, no full window.
+    input_queue_.insert(input_queue_.end(), input_it, input.end());
+    *input_start += input_remaining;  // Increases it to input.size().
+    samples_to_next_step_ -= input_remaining;
+    return false;  // Not enough for a full window.
+  } else {
+    // Copy just enough into queue to make a new window, then trim the
+    // front off the queue to make it window-sized.
+    input_queue_.insert(input_queue_.end(), input_it,
+                        input_it + samples_to_next_step_);
+    *input_start += samples_to_next_step_;
+    input_queue_.erase(
+        input_queue_.begin(),
+        input_queue_.begin() + input_queue_.size() - window_length_);
+    // DCHECK_EQ(window_length_, input_queue_.size());
+    samples_to_next_step_ = step_length_;  // Be ready for next time.
+    return true;  // Yes, input_queue_ now contains exactly a window-full.
+  }
+}
+
+void Spectrogram::ProcessCoreFFT() {
+  for (int j = 0; j < window_length_; ++j) {
+    fft_input_output_[j] = input_queue_[j] * window_[j];
+  }
+  // Zero-pad the rest of the input buffer.
+  for (int j = window_length_; j < fft_length_; ++j) {
+    fft_input_output_[j] = 0.0;
+  }
+  const int kForwardFFT = 1;  // 1 means forward; -1 reverse.
+  // This real FFT is a fair amount faster than using cdft here.
+  rdft(fft_length_, kForwardFFT, &fft_input_output_[0],
+       &fft_integer_working_area_[0], &fft_double_working_area_[0]);
+  // Make rdft result look like cdft result;
+  // unpack the last real value from the first position's imag slot.
+  fft_input_output_[fft_length_] = fft_input_output_[1];
+  fft_input_output_[fft_length_ + 1] = 0;
+  fft_input_output_[1] = 0;
+}
+
+}  // namespace internal
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.h b/tensorflow/contrib/lite/kernels/internal/spectrogram.h
new file mode 100644
index 0000000000000000000000000000000000000000..b77a68f7dfe6edb07ec4e5db540c673b2d6f6d6e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/spectrogram.h
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Class for generating spectrogram slices from a waveform.
+// Initialize() should be called before calls to other functions.  Once
+// Initialize() has been called and returned true, The Compute*() functions can
+// be called repeatedly with sequential input data (ie. the first element of the
+// next input vector directly follows the last element of the previous input
+// vector). Whenever enough audio samples are buffered to produce a
+// new frame, it will be placed in output. Output is cleared on each
+// call to Compute*(). This class is thread-unsafe, and should only be
+// called from one thread at a time.
+// With the default parameters, the output of this class should be very
+// close to the results of the following MATLAB code:
+// overlap_samples = window_length_samples - step_samples;
+// window = hann(window_length_samples, 'periodic');
+// S = abs(spectrogram(audio, window, overlap_samples)).^2;
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
+
+#include <complex>
+#include <deque>
+#include <vector>
+
+#include "third_party/fft2d/fft.h"
+
+namespace tflite {
+namespace internal {
+
+class Spectrogram {
+ public:
+  Spectrogram() : initialized_(false) {}
+  ~Spectrogram() {}
+
+  // Initializes the class with a given window length and step length
+  // (both in samples). Internally a Hann window is used as the window
+  // function. Returns true on success, after which calls to Process()
+  // are possible. window_length must be greater than 1 and step
+  // length must be greater than 0.
+  bool Initialize(int window_length, int step_length);
+
+  // Initialize with an explicit window instead of a length.
+  bool Initialize(const std::vector<double>& window, int step_length);
+
+  // Processes an arbitrary amount of audio data (contained in input)
+  // to yield complex spectrogram frames. After a successful call to
+  // Initialize(), Process() may be called repeatedly with new input data
+  // each time.  The audio input is buffered internally, and the output
+  // vector is populated with as many temporally-ordered spectral slices
+  // as it is possible to generate from the input.  The output is cleared
+  // on each call before the new frames (if any) are added.
+  //
+  // The template parameters can be float or double.
+  template <class InputSample, class OutputSample>
+  bool ComputeComplexSpectrogram(
+      const std::vector<InputSample>& input,
+      std::vector<std::vector<std::complex<OutputSample>>>* output);
+
+  // This function works as the one above, but returns the power
+  // (the L2 norm, or the squared magnitude) of each complex value.
+  template <class InputSample, class OutputSample>
+  bool ComputeSquaredMagnitudeSpectrogram(
+      const std::vector<InputSample>& input,
+      std::vector<std::vector<OutputSample>>* output);
+
+  // Return reference to the window function used internally.
+  const std::vector<double>& GetWindow() const { return window_; }
+
+  // Return the number of frequency channels in the spectrogram.
+  int output_frequency_channels() const { return output_frequency_channels_; }
+
+ private:
+  template <class InputSample>
+  bool GetNextWindowOfSamples(const std::vector<InputSample>& input,
+                              int* input_start);
+  void ProcessCoreFFT();
+
+  int fft_length_;
+  int output_frequency_channels_;
+  int window_length_;
+  int step_length_;
+  bool initialized_;
+  int samples_to_next_step_;
+
+  std::vector<double> window_;
+  std::vector<double> fft_input_output_;
+  std::deque<double> input_queue_;
+
+  // Working data areas for the FFT routines.
+  std::vector<int> fft_integer_working_area_;
+  std::vector<double> fft_double_working_area_;
+};
+
+}  // namespace internal
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h b/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef77371bf65cc975dfa35275c8daa32de112a249
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+
+#include <limits>
+#include <vector>
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+namespace tflite {
+
+namespace strided_slice {
+
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi) {
+  TFLITE_DCHECK(!(hi < lo));
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+// Return the index for the first element along that axis. This index will be a
+// positive integer between [0, axis_size - 1] that can be used to index
+// directly into the data.
+template <typename IntType>
+inline int StartForAxis(int begin_mask,
+                        std::vector<IntType> const& start_indices,
+                        std::vector<IntType> const& strides,
+                        int const* input_shape, int axis) {
+  // Begin with the specified index
+  int start = start_indices[axis];
+
+  // begin_mask override
+  if (begin_mask & 1 << axis) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape[axis];
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element. ie. So if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+template <typename IntType>
+inline int StopForAxis(int end_mask, std::vector<IntType> const& stop_indices,
+                       std::vector<IntType> const& strides,
+                       int const* input_shape, int axis) {
+  // Begin with the specified index
+  int stop = stop_indices[axis];
+
+  // end_mask override
+  if (end_mask & (1 << axis)) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape[axis];
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
+}
+
+inline bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
+}
+
+}  // namespace strided_slice
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 62e38e0d4c3e023d0ed2242fc9438b096b86dc59..62cea143e6afc0631493012be26808a89eb03138 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -44,6 +44,11 @@ inline int64_t* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.i64 : nullptr;
 }
 
+template <>
+inline bool* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.b : nullptr;
+}
+
 inline int RemapDim(int max_dimensions, int d) {
   return max_dimensions - d - 1;
 }
@@ -126,6 +131,29 @@ class VectorOfTensors {
   std::vector<Dims<4>*> all_dims_ptr_;
 };
 
+// A list of quantized tensors in a format that can be used by kernels like
+// split and concatenation.
+class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfQuantizedTensors(const TfLiteContext& context,
+                           const TfLiteIntArray& tensor_list)
+      : VectorOfTensors<uint8>(context, tensor_list) {
+    for (int i = 0; i < tensor_list.size; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      zero_point_.push_back(t->params.zero_point);
+      scale_.push_back(t->params.scale);
+    }
+  }
+
+  const float* scale() const { return scale_.data(); }
+  const int32* zero_point() const { return zero_point_.data(); }
+
+ private:
+  std::vector<int32> zero_point_;
+  std::vector<float> scale_;
+};
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
index 40d144979b2f965725db86ff311e90f39438802f..997dc4425d31e8ed71aefe4417562345af6b508e 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -23,6 +23,14 @@ namespace tensor_utils {
 // Limit a float input f between +abs_limit and -abs_limit.
 float Clip(float f, float abs_limit);
 
+// Quantizes a buffer of floating point values using a symmetric quantization
+// (i.e. linear quantization without an offset) to 8-bit signed integers.
+// It also outputs the range (min, max) of the floating point buffer, and the
+// scaling factor used to quantize the values.
+void SymmetricQuantizeFloats(const float* values, const int size,
+                             int8_t* quantized_values, float* min, float* max,
+                             float* scaling_factor);
+
 // Multiply a matrix by a batch vector, and store results in a batch-size
 // vector using a stride value provided in result_stride. 'result_stride' shows
 // how the number of elements between consecutive result values. For example
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
index 588f1a428b8c84367d659c2c5bb59a411cd8bb34..22b016746fe0fb36c61d2f157eb7f8170b2002a7 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
@@ -32,6 +32,55 @@ TEST(uKernels, ClipTest) {
                   {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0})));
 }
 
+TEST(uKernels, SymmetricQuantizeFloatsTest) {
+  constexpr int kVectorSize = 9;
+  static float input[kVectorSize] = {-640, -635.0, -630, 10.0,  2.0,
+                                     -5.0, -10.0,  0.0,  1000.0};
+
+  int8 output[kVectorSize];
+  float min, max, scaling_factor;
+  SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
+                          &scaling_factor);
+
+  EXPECT_EQ(min, -640);
+  EXPECT_EQ(max, 1000);
+  EXPECT_NEAR(scaling_factor, 0.127, 1e-6);  // EQ won't work due to fpoint.
+  EXPECT_THAT(output,
+              testing::ElementsAreArray({-81, -81, -80, 1, 0, -1, -1, 0, 127}));
+}
+
+TEST(uKernels, SymmetricQuantizeFloatsAllZerosTest) {
+  constexpr int kVectorSize = 9;
+  static float input[kVectorSize] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  int8 output[kVectorSize];
+  float min, max, scaling_factor;
+  SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
+                          &scaling_factor);
+
+  EXPECT_EQ(min, 0);
+  EXPECT_EQ(max, 0);
+  EXPECT_EQ(scaling_factor, 1);
+  EXPECT_THAT(output, testing::ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0, 0}));
+}
+
+TEST(uKernels, SymmetricQuantizeFloatsAllAlmostZeroTest) {
+  constexpr int kVectorSize = 9;
+  static float input[kVectorSize] = {-1e-5, 3e-5, -7e-6, -9e-5, 1e-6,
+                                     4e-5,  9e-6, 2e-4,  0};
+
+  int8 output[kVectorSize];
+  float min, max, scaling_factor;
+  SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
+                          &scaling_factor);
+
+  EXPECT_NEAR(min, -9e-05, 1e-6);
+  EXPECT_NEAR(max, 0.0002, 1e-6);
+  EXPECT_EQ(scaling_factor, 635000);
+  EXPECT_THAT(output,
+              testing::ElementsAreArray({-6, 19, -4, -57, 1, 25, 6, 127, 0}));
+}
+
 TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) {
   constexpr int kRow = 3;
   constexpr int kCol = 4;
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index afe131b06ec41201395e80aa5415fd7db990f8d4..3290c364c18224edb733c177ad72bf86b6892434 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -21,6 +21,22 @@ namespace tflite {
 
 enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
 
+// Quantization parameters, determining the mapping of quantized values
+// to real values (i.e. determining how quantized values are mathematically
+// interpreted).
+//
+// The correspondence is as follows:
+//
+//   real_value = scale * (quantized_value - zero_point);
+//
+// In other words, zero_point designates which quantized value corresponds to
+// the real 0 value, and scale designates the difference between the real values
+// corresponding to consecutive quantized values differing by 1.
+struct QuantizationParams {
+  int32 zero_point = 0;
+  double scale = 0.0;
+};
+
 template <int N>
 struct Dims {
   int sizes[N];
@@ -114,14 +130,125 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
   return MatchingArraySize(array1, index1, args...);
 }
 
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+template <int N>
+inline int FlatSize(const Dims<N>& dims) {
   int max_offset = 0;
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < N; i++) {
     max_offset += (dims.sizes[i] - 1) * dims.strides[i];
   }
   return max_offset + 1;
 }
 
+// Deprecated. Prefer FlatSize.
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  return FlatSize(dims);
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2,
+                            const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
+}
+
+// Data is required to be contiguous, and so many operators can use either the
+// full array flat size or the flat size with one dimension skipped (commonly
+// the depth).
+template <int N>
+inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
+  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
+  int flat_size = 1;
+  for (int i = 0; i < N; i++) {
+    flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
+  }
+  return flat_size;
+}
+
+// A combination of MatchingFlatSize() and FlatSizeSkipDim().
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return FlatSizeSkipDim(dims, skip_dim);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2,
+                                   const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2,
+                                 check_dims_3);
+}
+
 template <int N>
 bool IsPackedWithoutStrides(const Dims<N>& dims) {
   int expected_stride = 1;
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 28f53b9fbbc5620f2fab5c73e40bed8af4af5f1e..2f407b5da31594335dba31b3057737e67a974057 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -53,13 +53,13 @@ inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
 }
 
 // Determines whether tensor is constant.
-inline bool IsConstantTensor(TfLiteTensor* tensor) {
+inline bool IsConstantTensor(const TfLiteTensor* tensor) {
   return tensor->allocation_type == kTfLiteMmapRo;
 }
 
 // Determines whether tensor is dynamic. Note that a tensor can be non-const and
-// not dynamic. This function specificially checks for a dynamic tensor.
-inline bool IsDynamicTensor(TfLiteTensor* tensor) {
+// not dynamic. This function specifically checks for a dynamic tensor.
+inline bool IsDynamicTensor(const TfLiteTensor* tensor) {
   return tensor->allocation_type == kTfLiteDynamic;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
index ee8bfe56d95e9f383ef49b40b8f58b63d61da3e1..e67f4e06f3680f8c9447a9e831b63415994ea176 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -45,10 +45,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
-  // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(
+      context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
+  if (output->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
+  }
+
   // TODO(ahentz): For some reason our implementations don't support
   // activations.
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
@@ -75,6 +80,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_L2NORM(optimized_ops);
     }
 #undef TF_LITE_L2NORM
+  } else if (output->type == kTfLiteUInt8) {
+#define TF_LITE_L2NORM(type)                                               \
+  type::L2Normalization(GetTensorData<uint8>(input), GetTensorDims(input), \
+                        input->params.zero_point,                          \
+                        GetTensorData<uint8>(output), GetTensorDims(output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_L2NORM(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_L2NORM(optimized_ops);
+    }
+#undef TF_LITE_L2NORM
   } else {
     context->ReportError(context, "Inputs and outputs not all float types.");
     return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index 30e103f3303484c339ef98e6a68e0438291c102f..042314ccf55cb6de12c743448fbe040f35e7baab 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -25,10 +25,22 @@ using ::testing::ElementsAreArray;
 
 class L2NormOpModel : public SingleOpModel {
  public:
-  L2NormOpModel(std::initializer_list<int> input_shape,
-                ActivationFunctionType activation_type) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+  L2NormOpModel(const std::initializer_list<int> input_shape,
+                const TensorType tensor_type,
+                const ActivationFunctionType activation_type) {
+    TensorData data = TensorData{tensor_type};
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -2.0;
+      data.max = 2.0;
+      data.scale = 2.0;
+      data.zero_point = 128;
+    }
+    input_ = AddInput(data);
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -1.0;
+      data.max = 127.0 / 128.0;
+    }
+    output_ = AddOutput(data);
     SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
                  CreateL2NormOptions(builder_, activation_type).Union());
     BuildInterpreter({input_shape});
@@ -38,7 +50,17 @@ class L2NormOpModel : public SingleOpModel {
     PopulateTensor(input_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
+  int input() const { return input_; }
 
  private:
   int input_;
@@ -46,13 +68,26 @@ class L2NormOpModel : public SingleOpModel {
 };
 
 TEST(L2NormOpTest, SimpleTest) {
-  L2NormOpModel m({1, 1, 1, 6}, ActivationFunctionType_NONE);
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_FLOAT32,
+                  ActivationFunctionType_NONE);
   m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
+TEST(L2NormOpTest, SimpleUint8Test) {
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_UINT8, ActivationFunctionType_NONE);
+
+  m.QuantizeAndPopulate<uint8_t>(m.input(), {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({58, 166, 173, 205, 83, 134}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/log_softmax_test.cc b/tensorflow/contrib/lite/kernels/log_softmax_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62820a2f5113cb6ae252386aaf3842135383b79f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/log_softmax_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite LOG_SOFTMAX op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+class LogSoftmaxOpModel : public SingleOpModel {
+ public:
+  LogSoftmaxOpModel(int batches, int size)
+      : batches_(batches), input_size_(size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_LOG_SOFTMAX, BuiltinOptions_LogSoftmaxOptions,
+                 CreateLogSoftmaxOptions(builder_).Union());
+    BuildInterpreter({{batches_, input_size_}});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+
+  int batches_;
+  int input_size_;
+};
+
+TEST(LogSoftmaxOpTest, SimpleTest) {
+  LogSoftmaxOpModel m(/*batches=*/2, /*size=*/5);
+  m.SetInput({
+      1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {-4.45191431, -3.45191431, -2.45191431, -1.45191443, -0.4519144,
+           -0.4519144, -1.45191443, -2.45191431, -3.45191431, -4.45191431},
+          1e-6)));
+}
+
+TEST(LogSoftmaxOpTest, CompareWithTFmini) {
+  const int batch_size = 2;
+  const int input_size = 5;
+  static float input_buffer[] = {
+      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
+  };
+
+  LogSoftmaxOpModel m(batch_size, input_size);
+
+  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+  m.Invoke();
+
+  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+  static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size},
+                                       {1, 0, 0, input_size}};
+  tflite::reference_ops::LogSoftmax(input_buffer, input_dims,
+                                    output_buffer.get(), input_dims);
+
+  std::vector<float> expected;
+  expected.insert(expected.end(), output_buffer.get(),
+                  output_buffer.get() + input_size * batch_size);
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
index 585b44f87018ad5dbce82adbd8775a9bcee25e55..af5d971f21152e592f6229bb52d7eed96c27f710 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// LSH Projection projects an input to a bit vector via locality senstive
+// LSH Projection projects an input to a bit vector via locality sensitive
 // hashing.
 //
 // Options:
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 6c06264d845c24e71647b6fd2374734be32383ef..8cf1165135bdb0d4669bb97fd2d98e3dc044b4d9 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
@@ -212,9 +213,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   // present.
   // 2) If projection weight is present, then projection bias is optional.
   // TODO(ghodrat): make sure this is correct.
-  const bool projecton_tensors_consistent =
+  const bool projection_tensors_consistent =
       ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+  TF_LITE_ENSURE(context, projection_tensors_consistent == true);
 
   return kTfLiteOk;
 }
@@ -356,7 +357,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
   // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
+  // check the existence of only one to get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
 
@@ -377,127 +378,54 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
   }
 
-  // Initialize scratch buffers with bias.
-  if (!use_cifg) {
-    tensor_utils::VectorBatchVectorAssign(input_gate_bias->data.f, n_cell,
-                                          n_batch, input_gate_scratch);
-  }
-  tensor_utils::VectorBatchVectorAssign(forget_gate_bias->data.f, n_cell,
-                                        n_batch, forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAssign(cell_bias->data.f, n_cell, n_batch,
-                                        cell_scratch);
-  tensor_utils::VectorBatchVectorAssign(output_gate_bias->data.f, n_cell,
-                                        n_batch, output_gate_scratch);
-
-  // For each batch and cell: compute input_weight * input.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_input_weights->data.f, n_cell, n_input, input->data.f, n_batch,
-        input_gate_scratch, /*result_stride=*/1);
-  }
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_forget_weights->data.f, n_cell, n_input, input->data.f, n_batch,
-      forget_gate_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_cell_weights->data.f, n_cell, n_input, input->data.f, n_batch,
-      cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_output_weights->data.f, n_cell, n_input, input->data.f, n_batch,
-      output_gate_scratch, /*result_stride=*/1);
-
-  // For each batch and cell: compute recurrent_weight * output_state.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_input_weights->data.f, n_cell, n_output,
-        output_state->data.f, n_batch, input_gate_scratch, /*result_stride=*/1);
-  }
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_forget_weights->data.f, n_cell, n_output,
-      output_state->data.f, n_batch, forget_gate_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_cell_weights->data.f, n_cell, n_output, output_state->data.f,
-      n_batch, cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_output_weights->data.f, n_cell, n_output,
-      output_state->data.f, n_batch, output_gate_scratch, /*result_stride=*/1);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_input_weights->data.f, n_cell, cell_state->data.f, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_forget_weights->data.f, n_cell, cell_state->data.f, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
-                                         cell_state->data.f, n_batch * n_cell,
-                                         cell_state->data.f);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell,
-        cell_state->data.f);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state->data.f);
-  }
-  if (params->cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state->data.f, n_batch * n_cell,
-                             params->cell_clip, cell_state->data.f);
-  }
-
-  // For each batch and cell: update the output gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_output_weights->data.f, n_cell, cell_state->data.f, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state->data.f, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights != nullptr);
-  const bool use_projection_bias = (projection_bias != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias->data.f, n_output,
-                                            n_batch, output->data.f);
-    } else {
-      tensor_utils::ZeroVector(output->data.f, n_batch * n_output);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights->data.f, n_output, n_cell, output_gate_scratch,
-        n_batch, output->data.f, /*result_stride=*/1);
-    if (params->proj_clip > 0.0) {
-      tensor_utils::ClipVector(output->data.f, n_batch * n_output,
-                               params->proj_clip, output->data.f);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output->data.f);
-  }
-  tensor_utils::CopyVector(output->data.f, n_batch * n_output,
-                           output_state->data.f);
+  // Check optional tensors, the respective pointers can be null.
+  const float* input_to_input_weights_ptr =
+      (use_cifg) ? nullptr : input_to_input_weights->data.f;
+  const float* recurrent_to_input_weights_ptr =
+      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
+  const float* input_gate_bias_ptr =
+      (use_cifg) ? nullptr : input_gate_bias->data.f;
+  const float* cell_to_input_weights_ptr =
+      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
+  const float* cell_to_forget_weights_ptr =
+      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
+  const float* cell_to_output_weights_ptr =
+      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
+  const float* projection_weights_ptr =
+      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  // Required tensors, pointers are non-null.
+  const float* input_ptr_batch = input->data.f;
+  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
+  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
+  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
+  const float* recurrent_to_forget_weights_ptr =
+      recurrent_to_forget_weights->data.f;
+  const float* recurrent_to_cell_weights_ptr =
+      recurrent_to_cell_weights->data.f;
+  const float* recurrent_to_output_weights_ptr =
+      recurrent_to_output_weights->data.f;
+  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
+  const float* cell_bias_ptr = cell_bias->data.f;
+  const float* output_gate_bias_ptr = output_gate_bias->data.f;
+
+  float* output_state_ptr = output_state->data.f;
+  float* cell_state_ptr = cell_state->data.f;
+  float* output_ptr_batch = output->data.f;
+
+  kernel_utils::LstmStep(
+      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
+      input_to_cell_weights_ptr, input_to_output_weights_ptr,
+      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
+      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
+      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
+      cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
+      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
+      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
+      output_state_ptr, cell_state_ptr, input_gate_scratch, forget_gate_scratch,
+      cell_scratch, output_gate_scratch, output_ptr_batch);
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum.cc b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a28d663c9e756040746f0a98b356afba76cceab
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace maximum_minimum {
+
+// This file has a reference implemenation of TFMaximum/TFMinimum.
+enum KernelType {
+  kReference,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    input1 = GetInput(context, node, kInputTensor1);
+    input2 = GetInput(context, node, kInputTensor2);
+    output = GetOutput(context, node, kOutputTensor);
+  }
+  TfLiteTensor* input1;
+  TfLiteTensor* input2;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  OpContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type);
+  op_context.output->type = op_context.input1->type;
+
+  bool requires_broadcast =
+      !HaveSameShapes(op_context.input1, op_context.input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(
+        context, CalculateShapeForBroadcast(context, op_context.input1,
+                                            op_context.input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(op_context.input1->dims);
+  }
+
+  return context->ResizeTensor(context, op_context.output, output_size);
+}
+
+struct MaximumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 > el2 ? el1 : el2;
+  }
+};
+
+struct MinimumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 < el2 ? el1 : el2;
+  }
+};
+
+template <typename data_type, typename op_type>
+void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
+                      const OpContext& op_context) {
+  reference_ops::TensorFlowMaximumMinimum<data_type>(
+      GetTensorData<data_type>(op_context.input1),
+      GetTensorDims(op_context.input1),
+      GetTensorData<data_type>(op_context.input2),
+      GetTensorDims(op_context.input2),
+      GetTensorData<data_type>(op_context.output),
+      GetTensorDims(op_context.output), op_type::template op<data_type>);
+}
+
+template <KernelType kernel_type, typename OpType>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+
+  if (kernel_type == kReference) {
+    switch (op_context.output->type) {
+      case kTfLiteFloat32:
+        TFLiteOperation<float, OpType>(context, node, op_context);
+        break;
+      case kTfLiteUInt8:
+        TFLiteOperation<uint8_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt32:
+       TFLiteOperation<int32_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt64:
+        TFLiteOperation<int64_t, OpType>(context, node, op_context);
+        break;
+      default:
+        context->ReportError(context,
+                             "Type %d is currently not supported by Maximum.",
+                             op_context.output->type);
+        return kTfLiteError;
+    }
+  } else {
+    context->ReportError(context,
+                         "Type %d is currently not supported by Maximum.",
+                         op_context.output->type);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace maximum_minimum
+
+TfLiteRegistration* Register_MAXIMUM_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MaximumOp>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MINIMUM_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MinimumOp>};
+  return &r;
+}
+TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_REF(); }
+TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0752aa1804722accb1f88910fe013ffd632a4503
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class MaxMinOpModel : public SingleOpModel {
+ public:
+  MaxMinOpModel(tflite::BuiltinOperator op, const TensorData& input1,
+                const TensorData& input2, const TensorType& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions,
+                 CreateMaximumMinimumOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor(input2_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+template <typename data_type>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<data_type> input1_values,
+               std::initializer_list<data_type> input2_values,
+               std::initializer_list<data_type> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<data_type>(input1_values);
+  m.SetInput2<data_type>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<data_type>(), ElementsAreArray(output_values));
+}
+
+template <>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<float> input1_values,
+               std::initializer_list<float> input2_values,
+               std::initializer_list<float> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<float>(input1_values);
+  m.SetInput2<float>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear(output_values)));
+}
+
+TEST(MaximumOpTest, FloatTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {1.0, 0.0, 1.0, 12.0, -2.0, -1.43});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {-1.0, 0.0, -1.0, 11.0, -3.0, -1.44});
+}
+
+TEST(MaxMinOpTest, Uint8Test) {
+  std::initializer_list<uint8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<uint8_t> data2 = {0, 0, 1, 12, 255, 1};
+  TestModel<uint8_t>(BuiltinOperator_MAXIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {1, 0, 2, 12, 255, 23});
+  TestModel<uint8_t>(BuiltinOperator_MINIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {0, 0, 1, 11, 2, 1});
+}
+
+TEST(MaximumOpTest, FloatWithBroadcastTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
+  std::initializer_list<float> data2 = {0.5, 2.0};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {1.0, 2.0, 0.5, 2.0, 0.5, 11.0});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {0.5, 0.0, -1.0, -2.0, -1.44, 2.0});
+}
+
+TEST(MaximumOpTest, Int32WithBroadcastTest) {
+  std::initializer_list<int32_t> data1 = {1, 0, -1, -2, 3, 11};
+  std::initializer_list<int32_t> data2 = {2};
+  TestModel<int32>(BuiltinOperator_MAXIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {2, 2, 2, 2, 3, 11});
+  TestModel<int32>(BuiltinOperator_MINIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {1, 0, -1, -2, 2, 2});
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index aff19581ea56f94c08638b7b388ae181f566cf4f..047bdd1039b993783ce8f2d69d83864e609c13fd 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
@@ -48,7 +49,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // Creates two temp tensors to store index and axis for internal
   // implementation only.
   auto* scratch_tensor_index = new int;
-  context->AddTensors(context, 2, scratch_tensor_index);
+  context->AddTensors(context, 3, scratch_tensor_index);
   return scratch_tensor_index;
 }
 
@@ -64,6 +65,14 @@ TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context,
   return context->ResizeTensor(context, resolved_axis, axis_size);
 }
 
+// Resizes the temp tensor that stores temp sum of reduced elements.
+TfLiteStatus ResizeTempSum(TfLiteContext* context, MeanContext* op_context,
+                           TfLiteTensor* temp_sum) {
+  TfLiteIntArray* size = TfLiteIntArrayCreate(1);
+  size->data[0] = static_cast<int>(NumElements(op_context->output));
+  return context->ResizeTensor(context, temp_sum, size);
+}
+
 // Resizes output array based on the input size and resolved axis.
 TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
                                 MeanContext* op_context) {
@@ -135,7 +144,7 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
   // Creates a temp index to iterate through input data.
   int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
   TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(2);
+  node->temporaries = TfLiteIntArrayCreate(3);
   node->temporaries->data[0] = *scratch_tensor_index;
   TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
   scratch_tensor->type = kTfLiteInt32;
@@ -149,6 +158,25 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
   node->temporaries->data[1] = *scratch_tensor_index + 1;
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
   resolved_axis->type = kTfLiteInt32;
+  // Creates a temp tensor to store temp sums when calculating mean.
+  node->temporaries->data[2] = *scratch_tensor_index + 2;
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
+  switch (op_context->input->type) {
+    case kTfLiteFloat32:
+      temp_sum->type = kTfLiteFloat32;
+      break;
+    case kTfLiteInt32:
+      temp_sum->type = kTfLiteInt64;
+      break;
+    case kTfLiteInt64:
+      temp_sum->type = kTfLiteInt64;
+      break;
+    case kTfLiteUInt8:
+      temp_sum->type = kTfLiteInt32;
+      break;
+    default:
+      return kTfLiteError;
+  }
   return kTfLiteOk;
 }
 
@@ -160,16 +188,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
 
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
   // Leaves work to Eval if axis is not constant; else resizes output.
   if (!IsConstantTensor(op_context.axis)) {
     SetTensorToDynamic(op_context.output);
     SetTensorToDynamic(resolved_axis);
+    SetTensorToDynamic(temp_sum);
     return kTfLiteOk;
   }
   resolved_axis->allocation_type = kTfLiteArenaRw;
   TF_LITE_ENSURE_OK(context,
                     ResizeTempAxis(context, &op_context, resolved_axis));
-  return ResizeOutputTensor(context, &op_context);
+  TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  temp_sum->allocation_type = kTfLiteArenaRw;
+  return ResizeTempSum(context, &op_context, temp_sum);
 }
 
 template <KernelType kernel_type>
@@ -178,14 +210,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]];
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context,
                       ResizeTempAxis(context, &op_context, resolved_axis));
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum));
   }
 
-#define TF_LITE_MEAN(kernel_type, data_type)                        \
+#define TF_LITE_MEAN(kernel_type, data_type, temp_data_type)        \
   kernel_type::Mean<>(                                              \
       GetTensorData<data_type>(op_context.input),                   \
       op_context.input->dims->data, op_context.input->dims->size,   \
@@ -193,21 +227,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       op_context.output->dims->data, op_context.output->dims->size, \
       GetTensorData<int>(op_context.axis), num_axis,                \
       op_context.params->keep_dims, GetTensorData<int>(temp_index), \
-      GetTensorData<int>(resolved_axis))
+      GetTensorData<int>(resolved_axis),                            \
+      GetTensorData<temp_data_type>(temp_sum))
 
   if (kernel_type == kReference) {
     switch (op_context.input->type) {
       case kTfLiteFloat32:
-        TF_LITE_MEAN(reference_ops, float);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
         break;
       case kTfLiteInt32:
-        TF_LITE_MEAN(reference_ops, int);
-        break;
-      case kTfLiteUInt8:
-        TF_LITE_MEAN(reference_ops, uint8_t);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t));
         break;
       case kTfLiteInt64:
-        TF_LITE_MEAN(reference_ops, int64_t);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t));
+        break;
+      case kTfLiteUInt8:
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.scale,
+                          op_context.output->params.scale);
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point,
+                          op_context.output->params.zero_point);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int));
         break;
       default:
         return kTfLiteError;
@@ -216,7 +255,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 #undef TF_LITE_MEAN
   return kTfLiteOk;
 }
-
 }  // namespace mean
 
 TfLiteRegistration* Register_MEAN_REF() {
diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc
index c4c53c2ded351849e7c458fc754c36395a25ebd0..79c9957f76fdb994be0a71f2e90b883435de4815 100644
--- a/tensorflow/contrib/lite/kernels/mean_test.cc
+++ b/tensorflow/contrib/lite/kernels/mean_test.cc
@@ -37,8 +37,15 @@ class BaseMeanOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  int Input() { return input_; }
+
  protected:
   int input_;
   int axis_;
@@ -74,7 +81,7 @@ class MeanOpDynamicModel : public BaseMeanOpModel {
   }
 };
 
-TEST(ConstMeanOpTest, NotKeepDims) {
+TEST(ConstFloatMeanOpTest, NotKeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
@@ -86,7 +93,7 @@ TEST(ConstMeanOpTest, NotKeepDims) {
   EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
 }
 
-TEST(ConstMeanOpTest, KeepDims) {
+TEST(ConstFloatMeanOpTest, KeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
@@ -99,7 +106,7 @@ TEST(ConstMeanOpTest, KeepDims) {
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
-TEST(DynamicMeanOpTest, NotKeepDims) {
+TEST(DynamicFloatMeanOpTest, NotKeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
@@ -114,7 +121,7 @@ TEST(DynamicMeanOpTest, NotKeepDims) {
   EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
 }
 
-TEST(DynamicMeanOpTest, KeepDims) {
+TEST(DynamicFloatMeanOpTest, KeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
@@ -130,6 +137,78 @@ TEST(DynamicMeanOpTest, KeepDims) {
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
+TEST(DynamicFloatMeanOpTest, Scale) {
+  std::initializer_list<float> data = {9.527};
+  MeanOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}},
+                       {TensorType_INT32, {1}}, true);
+  std::initializer_list<int> axis = {0};
+  m.SetAxis(axis);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527})));
+}
+
+// for quantized Add, the error shouldn't exceed step
+float GetTolerance(int min, int max) { return (max - min) / 255.0; }
+
+TEST(ConstUint8MeanOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                            {0.4, 0.4}, kQuantizedTolerance)));
+}
+
+TEST(ConstUint8MeanOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0},
+                     {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
+}
+
+TEST(DynamicUint8MeanOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::initializer_list<float> data = {1.3, -4.8, -3.6, 0.24};
+  MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0},
+                       {TensorType_UINT8, {2}, -5.0, 2.0},
+                       {TensorType_INT32, {1}}, false);
+  std::initializer_list<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance)));
+}
+
+TEST(DynamicUint8MeanOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::initializer_list<float> data = {11.14, -0.14, 7.423, 0.879};
+  MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0},
+                       {TensorType_UINT8, {2}, -10.0, 12.0},
+                       {TensorType_INT32, {1}}, true);
+  std::initializer_list<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..018db0dc54c5d281bf3fb3ff8a1f111b427fe76b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mfcc.cc
@@ -0,0 +1,154 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace mfcc {
+
+enum KernelType {
+  kReference,
+};
+
+typedef struct {
+  float upper_frequency_limit;
+  float lower_frequency_limit;
+  int filterbank_channel_count;
+  int dct_coefficient_count;
+} TfLiteMfccParams;
+
+constexpr int kInputTensorWav = 0;
+constexpr int kInputTensorRate = 1;
+constexpr int kOutputTensor = 0;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new TfLiteMfccParams;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  data->upper_frequency_limit = m["upper_frequency_limit"].AsInt64();
+  data->lower_frequency_limit = m["lower_frequency_limit"].AsInt64();
+  data->filterbank_channel_count = m["filterbank_channel_count"].AsInt64();
+  data->dct_coefficient_count = m["dct_coefficient_count"].AsInt64();
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<TfLiteMfccParams*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(inputWav), 3);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(inputRate), 1);
+
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, inputWav->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = inputWav->dims->data[0];
+  output_size->data[1] = inputWav->dims->data[1];
+  output_size->data[2] = params->dct_coefficient_count;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// Input is a single squared-magnitude spectrogram frame. The input spectrum
+// is converted to linear magnitude and weighted into bands using a
+// triangular mel filterbank, and a discrete cosine transform (DCT) of the
+// values is taken. Output is populated with the lowest dct_coefficient_count
+// of these values.
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
+
+  TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const int32 sample_rate = *GetTensorData<int>(inputRate);
+
+  const int spectrogram_channels = inputWav->dims->data[2];
+  const int spectrogram_samples = inputWav->dims->data[1];
+  const int audio_channels = inputWav->dims->data[0];
+
+  internal::Mfcc mfcc;
+  mfcc.set_upper_frequency_limit(params->upper_frequency_limit);
+  mfcc.set_lower_frequency_limit(params->lower_frequency_limit);
+  mfcc.set_filterbank_channel_count(params->filterbank_channel_count);
+  mfcc.set_dct_coefficient_count(params->dct_coefficient_count);
+
+  mfcc.Initialize(spectrogram_channels, sample_rate);
+
+  const float* spectrogram_flat = GetTensorData<float>(inputWav);
+  float* output_flat = GetTensorData<float>(output);
+
+  for (int audio_channel = 0; audio_channel < audio_channels; ++audio_channel) {
+    for (int spectrogram_sample = 0; spectrogram_sample < spectrogram_samples;
+         ++spectrogram_sample) {
+      const float* sample_data =
+          spectrogram_flat +
+          (audio_channel * spectrogram_samples * spectrogram_channels) +
+          (spectrogram_sample * spectrogram_channels);
+      std::vector<double> mfcc_input(sample_data,
+                                     sample_data + spectrogram_channels);
+      std::vector<double> mfcc_output;
+      mfcc.Compute(mfcc_input, &mfcc_output);
+      TF_LITE_ENSURE_EQ(context, params->dct_coefficient_count,
+                        mfcc_output.size());
+      float* output_data = output_flat +
+                           (audio_channel * spectrogram_samples *
+                            params->dct_coefficient_count) +
+                           (spectrogram_sample * params->dct_coefficient_count);
+      for (int i = 0; i < params->dct_coefficient_count; ++i) {
+        output_data[i] = mfcc_output[i];
+      }
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace mfcc
+
+TfLiteRegistration* Register_MFCC() {
+  static TfLiteRegistration r = {mfcc::Init, mfcc::Free, mfcc::Prepare,
+                                 mfcc::Eval<mfcc::kReference>};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mfcc_test.cc b/tensorflow/contrib/lite/kernels/mfcc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0291ca8c1c58ea6ab3bb7c22bc436ed3404cba74
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mfcc_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_MFCC();
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class BaseMfccOpModel : public SingleOpModel {
+ public:
+  BaseMfccOpModel(const TensorData& input1, const TensorData& input2,
+                  const TensorData& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("upper_frequency_limit", 4000);
+      fbb.Int("lower_frequency_limit", 20);
+      fbb.Int("filterbank_channel_count", 40);
+      fbb.Int("dct_coefficient_count", 13);
+    });
+    fbb.Finish();
+    SetCustomOp("Mfcc", fbb.GetBuffer(), Register_MFCC);
+
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(MfccOpTest, SimpleTest) {
+  BaseMfccOpModel m({TensorType_FLOAT32, {1, 1, 513}}, {TensorType_INT32, {1}},
+                    {TensorType_FLOAT32, {}});
+
+  std::vector<float> data(513);
+  for (int i = 0; i < data.size(); ++i) {
+    data[i] = i + 1;
+  }
+  m.PopulateTensor<float>(m.input1(), 0, data.data(),
+                          data.data() + data.size());
+  m.PopulateTensor<int>(m.input2(), {22050});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_THAT(output_shape, ElementsAre(1, 1, 13));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+           -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+           -0.0769791, -0.10806114, -0.06047613},
+          1e-3)));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index c29da3862e84d6756bf5ef34b2ca06307b0a065d..4f9449a225c66a0fb2a9285e6aff3a1f7147f5dd 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -119,39 +119,46 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar)                                           \
+#define TF_LITE_PAD(type, scalar, pad_value)                                \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
             GetTensorDims(op_context.input), before_padding, after_padding, \
             GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output))
+            GetTensorDims(op_context.output), pad_value)
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float);
+        TF_LITE_PAD(reference_ops, float, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float);
+        TF_LITE_PAD(optimized_ops, float, 0);
       }
       break;
     case kTfLiteUInt8:
+      // Quantized Pad requires that 0 is represented in the quantized range.
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                  std::numeric_limits<uint8_t>::min());
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t);
+        TF_LITE_PAD(reference_ops, uint8_t,
+                    op_context.output->params.zero_point);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t);
+        TF_LITE_PAD(optimized_ops, uint8_t,
+                    op_context.output->params.zero_point);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t);
+        TF_LITE_PAD(reference_ops, int32_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t);
+        TF_LITE_PAD(optimized_ops, int32_t, 0);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t);
+        TF_LITE_PAD(reference_ops, int64_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t);
+        TF_LITE_PAD(optimized_ops, int64_t, 0);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index 28834ad0719291b2e868bca2d86a6685e6eb9962..c06237e5720874e66c5953edab2d3749cc88af28 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::Matcher;
 
 class PadOpModel : public SingleOpModel {
  public:
@@ -29,6 +30,10 @@ class PadOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
@@ -36,6 +41,11 @@ class PadOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
  protected:
   int input_;
   int output_;
@@ -50,16 +60,17 @@ class PadOpModel : public SingleOpModel {
 //    m.Invoke();
 class PadOpConstModel : public PadOpModel {
  public:
-  PadOpConstModel(std::initializer_list<int> input_shape,
+  PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
-                  std::initializer_list<int> paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
+                  std::initializer_list<int> paddings,
+                  const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape});
+    BuildInterpreter({input.shape});
   }
 };
 
@@ -72,40 +83,45 @@ class PadOpConstModel : public PadOpModel {
 //    m.Invoke();
 class PadOpDynamicModel : public PadOpModel {
  public:
-  PadOpDynamicModel(std::initializer_list<int> input_shape,
-                    std::initializer_list<int> paddings_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  PadOpDynamicModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape, paddings_shape});
+    BuildInterpreter({input.shape, paddings_shape});
   }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
-                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9},
+                      {TensorType_FLOAT32}),
       "dims != 4");
 }
 
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+  EXPECT_DEATH(PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
+                               {1, 1, 2, 2, 3, 3}, {TensorType_FLOAT32}),
                "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                      {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -114,7 +130,8 @@ TEST(PadOpTest, SimpleConstTest) {
 }
 
 TEST(PadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -124,7 +141,8 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -134,7 +152,8 @@ TEST(PadOpTest, AdvancedConstTest) {
 }
 
 TEST(PadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -144,6 +163,80 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+class QuantizedPadOpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
+                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                                 {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
index 40b8476b3779c66e31a04856bce8aebd378f1e5f..e81b970e0fb149e8c5d95ed12622917fdc336f7a 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -17,9 +17,10 @@ limitations under the License.
 
 namespace tflite {
 
-inline int ComputePadding(int stride, int in_size, int filter_size,
-                          int out_size) {
-  int padding = ((out_size - 1) * stride + filter_size - in_size) / 2;
+inline int ComputePadding(int stride, int dilation_rate, int in_size,
+                          int filter_size, int out_size) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  int padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2;
   return padding > 0 ? padding : 0;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
index b79880110897a1438a589d97363fd861c61667e7..0bf27c34c1337b4ae4b8b73ee2dafcc931c7ce3c 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -94,9 +94,9 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   int outHeight =
       computeOutSize(height, params->filter_height, params->stride_height);
 
-  data->padding.height = ComputePadding(params->stride_height, height,
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
                                         params->filter_height, outHeight);
-  data->padding.width = ComputePadding(params->stride_width, width,
+  data->padding.width = ComputePadding(params->stride_width, 1, width,
                                        params->filter_width, outWidth);
 
   if (input->type == kTfLiteUInt8) {
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index edc4e26edbd44784f8604e7da32156a8e695d2e2..98c5e18a0e40314b32e00983367bda1d070f3db0 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -17,6 +17,14 @@ limitations under the License.
 
 namespace tflite {
 namespace ops {
+
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
+TfLiteRegistration* Register_MFCC();
+
+}  // namespace custom
+
 namespace builtin {
 
 TfLiteRegistration* Register_RELU();
@@ -49,6 +57,7 @@ TfLiteRegistration* Register_MUL();
 TfLiteRegistration* Register_L2_NORMALIZATION();
 TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION();
 TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM();
 TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
 TfLiteRegistration* Register_PAD();
 TfLiteRegistration* Register_RESHAPE();
@@ -63,6 +72,15 @@ TfLiteRegistration* Register_SQUEEZE();
 TfLiteRegistration* Register_STRIDED_SLICE();
 TfLiteRegistration* Register_EXP();
 TfLiteRegistration* Register_TOPK_V2();
+TfLiteRegistration* Register_LOG_SOFTMAX();
+TfLiteRegistration* Register_CAST();
+TfLiteRegistration* Register_DEQUANTIZE();
+TfLiteRegistration* Register_PRELU();
+TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_MINIMUM();
+TfLiteRegistration* Register_ARG_MAX();
+TfLiteRegistration* Register_LESS();
+TfLiteRegistration* Register_FLOOR();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -97,6 +115,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_BIDIRECTIONAL_SEQUENCE_LSTM());
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
              Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
   AddBuiltin(BuiltinOperator_PAD, Register_PAD());
@@ -114,6 +134,23 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
   AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
+  AddBuiltin(BuiltinOperator_CAST, Register_CAST());
+  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
+  AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
+  AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
+
+#if 0
+  // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
+  // custom ops aren't always included by default.
+  AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
+  AddCustom("AudioSpectrogram",
+            tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
+#endif
 }
 
 TfLiteRegistration* BuiltinOpResolver::FindOp(
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
index f3e6ddc9f480e3863cac52157ae28b7329ee2088..438f70d3115130efe477a3ceeccd2e77108c979a 100644
--- a/tensorflow/contrib/lite/kernels/reshape.cc
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -49,20 +49,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
   int num_output_elements = 1;
-  int strech_dim = -1;
+  int stretch_dim = -1;
   for (int i = 0; i < params->num_dimensions; ++i) {
     int value = params->shape[i];
     if (value == -1) {
-      TF_LITE_ENSURE_EQ(context, strech_dim, -1);
-      strech_dim = i;
+      TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
+      stretch_dim = i;
     } else {
       num_output_elements *= value;
       output_size->data[i] = value;
     }
   }
-  if (strech_dim != -1) {
-    output_size->data[strech_dim] = num_input_elements / num_output_elements;
-    num_output_elements *= output_size->data[strech_dim];
+  if (stretch_dim != -1) {
+    output_size->data[stretch_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_size->data[stretch_dim];
   }
 
   TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
index 0fbcf6e6aa311d2cac491336ee54ccf58bbda8fd..aecbd0399f7454045e8189072f45b695b0525204 100644
--- a/tensorflow/contrib/lite/kernels/reshape_test.cc
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -60,7 +60,7 @@ TEST(ReshapeOpTest, TooManyDimensions) {
 
 TEST(ReshapeOpTest, TooManySpecialDimensions) {
   EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {-1, -1, 2, 4}),
-               "strech_dim != -1");
+               "stretch_dim != -1");
 }
 
 TEST(ReshapeOpTest, SimpleTest) {
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index fb1e11e0ca00abb36d7f29d562711a7bbcbeca1c..40ac436b7dcabe7a12166e5381f0381941a204d3 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -48,7 +48,7 @@ struct StridedSliceContext {
     output = GetOutput(context, node, kOutputTensor);
     dims = NumDimensions(input);
   }
-  TfLiteStridedSliceParams* params;
+  const TfLiteStridedSliceParams* params;
   TfLiteTensor* input;
   TfLiteTensor* begin;
   TfLiteTensor* end;
@@ -87,6 +87,8 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
                           std::min(std::max(index, -dim), dim - 1), dim));
 }
 
+// TODO(b/77971377) this logic should be removed, as it's a duplication of
+// StartForAxis() & StopForAxis() in kernels/internal/reference/reference_ops.h
 inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) {
   const int dim = op_context->input->dims->data[idx];
   const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
@@ -188,8 +190,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   std::vector<int32_t> strides;
 
   for (int idx = op_context.dims - 1; idx >= 0; --idx) {
-    starts.emplace_back(GetBeginValueAtIndex(&op_context, idx));
-    stops.emplace_back(GetEndValueAtIndex(&op_context, idx));
+    starts.emplace_back(GetTensorData<int32_t>(op_context.begin)[idx]);
+    stops.emplace_back(GetTensorData<int32_t>(op_context.end)[idx]);
     strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]);
   }
 
@@ -199,20 +201,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     strides.emplace_back(1);
   }
 
-  op_context.params->begin_mask =
+  int begin_mask =
       ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
-  op_context.params->end_mask =
-      ReverseMaskBits(op_context.params->end_mask, op_context.dims);
-  op_context.params->shrink_axis_mask =
-      ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims);
-
-#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                      \
-  kernel_type::StridedSlice(                                               \
-      GetTensorData<data_type>(op_context.input),                          \
-      GetTensorDims(op_context.input), op_context.params->begin_mask,      \
-      op_context.params->end_mask, op_context.params->shrink_axis_mask,    \
-      starts, stops, strides, GetTensorData<data_type>(op_context.output), \
-      GetTensorDims(op_context.output))
+  int end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims);
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                    \
+  kernel_type::StridedSlice(GetTensorData<data_type>(op_context.input),  \
+                            GetTensorDims(op_context.input), begin_mask, \
+                            end_mask, starts, stops, strides,            \
+                            GetTensorData<data_type>(op_context.output), \
+                            GetTensorDims(op_context.output))
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
@@ -230,6 +228,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, int64_t);
       }
       break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type is currently not supported "
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5cac04b38364958c5b0794c21742e8b592372ae9..cc39179bc705aa1083e74b06f8f7f3fb45e9f616 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -24,6 +24,8 @@ namespace {
 using ::int32;
 using ::testing::ElementsAreArray;
 
+template <typename input_type = float,
+          TensorType tensor_input_type = TensorType_FLOAT32>
 class StridedSliceOpModel : public SingleOpModel {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
@@ -32,11 +34,11 @@ class StridedSliceOpModel : public SingleOpModel {
                       std::initializer_list<int> strides_shape, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
-    input_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(tensor_input_type);
     begin_ = AddInput(TensorType_INT32);
     end_ = AddInput(TensorType_INT32);
     strides_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
         CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
@@ -45,8 +47,8 @@ class StridedSliceOpModel : public SingleOpModel {
     BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  void SetInput(std::initializer_list<input_type> data) {
+    PopulateTensor<input_type>(input_, data);
   }
   void SetBegin(std::initializer_list<int32> data) {
     PopulateTensor<int32>(begin_, data);
@@ -58,7 +60,9 @@ class StridedSliceOpModel : public SingleOpModel {
     PopulateTensor<int32>(strides_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<input_type> GetOutput() {
+    return ExtractVector<input_type>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  private:
@@ -71,19 +75,19 @@ class StridedSliceOpModel : public SingleOpModel {
 
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
-      StridedSliceOpModel({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
+      StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
       "StridedSlice op only supports 1D-4D input arrays.");
 }
 
 TEST(StridedSliceOpTest, UnssupportedArgs) {
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
                "ellipsis_mask is not implemented yet.");
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
 
 TEST(StridedSliceOpTest, In1D) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -94,7 +98,7 @@ TEST(StridedSliceOpTest, In1D) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutput) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({10});
   m.SetEnd({3});
@@ -104,7 +108,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutput) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({3});
@@ -115,7 +119,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-5});
   m.SetEnd({3});
@@ -126,7 +130,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({-2});
@@ -137,7 +141,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({5});
@@ -148,7 +152,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -159,7 +163,7 @@ TEST(StridedSliceOpTest, In1D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -170,7 +174,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({5});
   m.SetEnd({2});
@@ -181,7 +185,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({-4});
@@ -192,7 +196,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({-5});
@@ -203,7 +207,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EndMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -214,7 +218,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegStride) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({-1});
   m.SetEnd({-4});
@@ -225,7 +229,7 @@ TEST(StridedSliceOpTest, In1D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
-  StridedSliceOpModel m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2});
   m.SetBegin({0});
   m.SetEnd({2});
@@ -236,7 +240,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In1D_OddLenStride2) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({0});
   m.SetEnd({3});
@@ -247,7 +251,7 @@ TEST(StridedSliceOpTest, In1D_OddLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_Identity) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -258,7 +262,7 @@ TEST(StridedSliceOpTest, In2D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In2D) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -269,7 +273,7 @@ TEST(StridedSliceOpTest, In2D) {
 }
 
 TEST(StridedSliceOpTest, In2D_Stride2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -280,7 +284,7 @@ TEST(StridedSliceOpTest, In2D_Stride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStride) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -1});
   m.SetEnd({2, -4});
@@ -291,7 +295,7 @@ TEST(StridedSliceOpTest, In2D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In2D_BeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -302,7 +306,7 @@ TEST(StridedSliceOpTest, In2D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_EndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -313,7 +317,7 @@ TEST(StridedSliceOpTest, In2D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -4});
@@ -324,7 +328,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -3});
@@ -335,7 +339,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
 }
 
 TEST(StridedSliceOpTest, In3D_Identity) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -347,7 +351,7 @@ TEST(StridedSliceOpTest, In3D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In3D_NegStride) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({-1, -1, -1});
   m.SetEnd({-3, -4, -3});
@@ -359,7 +363,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In3D_Strided2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -370,32 +374,21 @@ TEST(StridedSliceOpTest, In3D_Strided2) {
 }
 
 TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
-  m.SetEnd({3});
+  m.SetEnd({2});
   m.SetStrides({1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
 }
 
-TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
-  m.SetInput({1, 2, 3, 4});
-  m.SetBegin({2});
-  m.SetEnd({1});
-  m.SetStrides({1});
-  m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
-}
-
 TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
-  m.SetEnd({3});
+  m.SetEnd({1});
   m.SetStrides({1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -403,7 +396,7 @@ TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -414,10 +407,10 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({1, 3});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
@@ -425,10 +418,10 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({2, 1});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -436,10 +429,10 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({1, 1});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -447,10 +440,10 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
@@ -458,10 +451,10 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 1, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
@@ -469,10 +462,10 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 1, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -480,10 +473,10 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 3, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
@@ -491,10 +484,10 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
@@ -502,10 +495,10 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 1, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -513,15 +506,48 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 1, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
 }
+
+// This tests catches a very subtle bug that was fixed by cl/188403234.
+TEST(StridedSliceOpTest, RunTwice) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+
+  auto setup_inputs = [&m]() {
+    m.SetInput({1, 2, 3, 4, 5, 6});
+    m.SetBegin({1, 0});
+    m.SetEnd({2, 2});
+    m.SetStrides({1, 1});
+  };
+
+  setup_inputs();
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
+
+  setup_inputs();
+  m.Invoke();
+  // Prior to cl/188403234 this was {4, 5}.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
+  StridedSliceOpModel<uint8, TensorType_UINT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({1, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index ddaf498d5bac0109429224e7cf66cb3debcabc22..7c60a4fdbffdc96b8967f52f8dbab3e18ecbcc0a 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -26,7 +26,7 @@ namespace ops {
 namespace builtin {
 namespace sub {
 
-// This file has three implementation of Div.
+// This file has three implementation of Sub.
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,51 +61,121 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
-void EvalSubFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteSubParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteSubParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_Sub(type)                                        \
-  type::Sub(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_SUB(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    if (data->requires_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastSub);
+    } else {
+      TF_LITE_SUB(reference_ops, Sub);
+    }
+  } else {
+    if (data->requires_broadcast) {
+      TF_LITE_SUB(optimized_ops, BroadcastSub);
+    } else {
+      TF_LITE_SUB(optimized_ops, Sub);
+    }
+  }
+#undef TF_LITE_SUB
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteSubParams* params, const OpData* data,
+                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   TfLiteTensor* output) {
+  auto input1_offset = -input1->params.zero_point;
+  auto input2_offset = -input2->params.zero_point;
+  auto output_offset = output->params.zero_point;
+  const int left_shift = 20;
+  const double twice_max_input_scale =
+      2 * std::max(input1->params.scale, input2->params.scale);
+  const double real_input1_multiplier =
+      input1->params.scale / twice_max_input_scale;
+  const double real_input2_multiplier =
+      input2->params.scale / twice_max_input_scale;
+  const double real_output_multiplier =
+      twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+
+  int32 input1_multiplier;
+  int input1_shift;
+  QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier,
+                                   &input1_shift);
+  int32 input2_multiplier;
+  int input2_shift;
+  QuantizeMultiplierSmallerThanOne(real_input2_multiplier, &input2_multiplier,
+                                   &input2_shift);
+  int32 output_multiplier;
+  int output_shift;
+  QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier,
+                                   &output_shift);
+
+  int32 output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(params->activation, output,
+                                &output_activation_min, &output_activation_max);
+
+#define TF_LITE_SUB(type, opname)                                            \
+  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
+               GetTensorDims(input1), input1_offset, input1_multiplier,      \
+               input1_shift, GetTensorData<uint8_t>(input2),                 \
+               GetTensorDims(input2), input2_offset, input2_multiplier,      \
+               input2_shift, output_offset, output_multiplier, output_shift, \
+               output_activation_min, output_activation_max,                 \
+               GetTensorData<uint8_t>(output), GetTensorDims(output));
+  // The quantized version of Sub doesn't support activations, so we
+  // always use BroadcastSub.
   if (kernel_type == kReference) {
-    TF_LITE_Sub(reference_ops);
+    TF_LITE_SUB(reference_ops, BroadcastSub);
   } else {
-    TF_LITE_Sub(optimized_ops);
+    TF_LITE_SUB(optimized_ops, BroadcastSub);
   }
-#undef TF_LITE_Sub
+#undef TF_LITE_SUB
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalSubFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
+  } else if (output->type == kTfLiteUInt8) {
+    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
+                               output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
@@ -99,19 +185,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace sub
 
 TfLiteRegistration* Register_SUB_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_SUB_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_SUB_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/sub_test.cc b/tensorflow/contrib/lite/kernels/sub_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff07aeec49dbfcc0e1f65df3d674d5ec30f1b54c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sub_test.cc
@@ -0,0 +1,218 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSubOpModel : public SingleOpModel {
+ public:
+  BaseSubOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_SUB, BuiltinOptions_SubOptions,
+                 CreateSubOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatSubOpModel : public BaseSubOpModel {
+ public:
+  using BaseSubOpModel::BaseSubOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedSubOpModel : public BaseSubOpModel {
+ public:
+  using BaseSubOpModel::BaseSubOpModel;
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// for quantized Sub, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(FloatSubOpModel, NoActivation) {
+  FloatSubOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-2.1, 0.0, 1.4, -0.3})));
+}
+
+TEST(FloatSubOpModel, ActivationRELU_N1_TO_1) {
+  FloatSubOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-1.0, 0.0, 1.0, -0.3})));
+}
+
+TEST(FloatSubOpModel, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5, -1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8, -1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.1, 0.0, 1.4, -0.3, 0.0, 1.9})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatSubOpModel, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5, -1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.5});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.5, -0.3, 1.2, 0.0, -1.6, 1.5})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {
+      {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
+  std::vector<std::initializer_list<float>> inputs2 = {
+      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.2}, {0.6, 0.4, -0.18, 0.5}};
+  std::vector<std::initializer_list<float>> results = {
+      {-0.5, -0.2, 0.0, 0.3},
+      {-0.8, -0.2, -0.1, 0.9},
+      {-0.61, -0.2, 0.88, -0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                                       {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                                       {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::initializer_list<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                                       {-1.0, -0.2, 1.0, 0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-2.1, -0.1, 0.4, 0.3, 0.0, 1.9}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-2.7, -0.5, 0.0, 0.1, 0.4, 1.3}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 373310bd87370a670a847cf5328633956028a850..0bb28b50b2a5e5a9fd803ecf1b0928026f63881e 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -141,8 +141,8 @@ void SingleOpModel::SetBuiltinOp(BuiltinOperator type,
 
 void SingleOpModel::SetCustomOp(
     const string& name, const std::vector<uint8_t>& custom_option,
-    const std::function<TfLiteRegistration*()>& registeration) {
-  custom_registrations_[name] = registeration;
+    const std::function<TfLiteRegistration*()>& registration) {
+  custom_registrations_[name] = registration;
   opcodes_.push_back(
       CreateOperatorCodeDirect(builder_, BuiltinOperator_CUSTOM, name.data()));
   operators_.push_back(CreateOperator(
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index 7d476ba1eaffbb24fb77390c0e71c32d60b6411e..a9064d54e7704d52eefa34f6bf446ec1cfe68fe1 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -39,10 +39,10 @@ inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
                                int32_t zero_point) {
   std::vector<T> q;
   for (float f : data) {
-    q.push_back(std::max(
+    q.push_back(static_cast<T>(std::max<float>(
         std::numeric_limits<T>::min(),
-        std::min(std::numeric_limits<T>::max(),
-                 static_cast<T>(std::round(zero_point + (f / scale))))));
+        std::min<float>(std::numeric_limits<T>::max(),
+                        std::round(zero_point + (f / scale))))));
   }
   return q;
 }
diff --git a/tensorflow/contrib/lite/kernels/test_util_test.cc b/tensorflow/contrib/lite/kernels/test_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e10e89061213b6fcabd404310893dd97a51d83f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/test_util_test.cc
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+TEST(TestUtilTest, QuantizeVector) {
+  std::vector<float> data = {-1.0, -0.5, 0.0, 0.5, 1.0, 1000.0};
+  auto q_data = Quantize<uint8>(data, /*scale=*/1.0, /*zero_point=*/0);
+  std::vector<uint8> expected = {0, 0, 0, 1, 1, 255};
+  EXPECT_THAT(q_data, ElementsAreArray(expected));
+}
+
+TEST(TestUtilTest, QuantizeVectorScalingDown) {
+  std::vector<float> data = {-1.0, -0.5, 0.0, 0.5, 1.0, 1000.0};
+  auto q_data = Quantize<uint8>(data, /*scale=*/10.0, /*zero_point=*/0);
+  std::vector<uint8> expected = {0, 0, 0, 0, 0, 100};
+  EXPECT_THAT(q_data, ElementsAreArray(expected));
+}
+
+TEST(TestUtilTest, QuantizeVectorScalingUp) {
+  std::vector<float> data = {-1.0, -0.5, 0.0, 0.5, 1.0, 1000.0};
+  auto q_data = Quantize<uint8>(data, /*scale=*/0.1, /*zero_point=*/0);
+  std::vector<uint8> expected = {0, 0, 0, 5, 10, 255};
+  EXPECT_THAT(q_data, ElementsAreArray(expected));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/topk_v2.cc b/tensorflow/contrib/lite/kernels/topk_v2.cc
index 807e84609f8b23d25324d99d26086331d78a0684..ad9b744f1af2715a37cc60ef61b0b9540fe2532b 100644
--- a/tensorflow/contrib/lite/kernels/topk_v2.cc
+++ b/tensorflow/contrib/lite/kernels/topk_v2.cc
@@ -25,8 +25,8 @@ namespace builtin {
 namespace topk_v2 {
 constexpr int kInputTensor = 0;
 constexpr int kInputTopK = 1;
-constexpr int kOutputIndexes = 0;
-constexpr int kOutputValues = 1;
+constexpr int kOutputValues = 0;
+constexpr int kOutputIndexes = 1;
 
 namespace {
 TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) {
diff --git a/tensorflow/contrib/lite/kernels/topk_v2_test.cc b/tensorflow/contrib/lite/kernels/topk_v2_test.cc
index 29f2a057cd45e1cded3ff1aa0f0fdcad666ce2fa..212f8acc76d4afba56933029175f69b34ea87a3e 100644
--- a/tensorflow/contrib/lite/kernels/topk_v2_test.cc
+++ b/tensorflow/contrib/lite/kernels/topk_v2_test.cc
@@ -31,8 +31,8 @@ class TopKV2OpModel : public SingleOpModel {
                 int top_k) {
     input_ = AddInput(input_type);
     top_k_ = AddInput(TensorType_INT32);
-    output_indexes_ = AddOutput(TensorType_INT32);
     output_values_ = AddOutput(input_type);
+    output_indexes_ = AddOutput(TensorType_INT32);
     SetBuiltinOp(BuiltinOperator_TOPK_V2, BuiltinOptions_TopKV2Options, 0);
     BuildInterpreter({input_shape, {1}});
     PopulateTensor<int32_t>(top_k_, {top_k});
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index 9cdb58714edb5fee771fc45f3c53a570f8fb28d1..42941a97db70adb37c20500c8f9438adfea25389 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
@@ -359,7 +360,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
   // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
+  // check the existence of only one to get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
 
@@ -380,135 +381,57 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
   }
 
+  // Check optional tensors, the respective pointers can be null.
+  const float* input_to_input_weights_ptr =
+      (use_cifg) ? nullptr : input_to_input_weights->data.f;
+  const float* recurrent_to_input_weights_ptr =
+      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
+  const float* input_gate_bias_ptr =
+      (use_cifg) ? nullptr : input_gate_bias->data.f;
+  const float* cell_to_input_weights_ptr =
+      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
+  const float* cell_to_forget_weights_ptr =
+      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
+  const float* cell_to_output_weights_ptr =
+      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
+  const float* projection_weights_ptr =
+      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  // Required tensors, pointers are non-null.
+  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
+  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
+  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
+  const float* recurrent_to_forget_weights_ptr =
+      recurrent_to_forget_weights->data.f;
+  const float* recurrent_to_cell_weights_ptr =
+      recurrent_to_cell_weights->data.f;
+  const float* recurrent_to_output_weights_ptr =
+      recurrent_to_output_weights->data.f;
+  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
+  const float* cell_bias_ptr = cell_bias->data.f;
+  const float* output_gate_bias_ptr = output_gate_bias->data.f;
+
+  float* output_state_ptr = output_state->data.f;
+  float* cell_state_ptr = cell_state->data.f;
+
   for (int t = 0; t < max_time; t++) {
-    const float* input_ptr_time = input->data.f + t * n_batch * n_input;
-    // Initialize scratch buffers with bias.
-    if (!use_cifg) {
-      tensor_utils::VectorBatchVectorAssign(input_gate_bias->data.f, n_cell,
-                                            n_batch, input_gate_scratch);
-    }
-    tensor_utils::VectorBatchVectorAssign(forget_gate_bias->data.f, n_cell,
-                                          n_batch, forget_gate_scratch);
-    tensor_utils::VectorBatchVectorAssign(cell_bias->data.f, n_cell, n_batch,
-                                          cell_scratch);
-    tensor_utils::VectorBatchVectorAssign(output_gate_bias->data.f, n_cell,
-                                          n_batch, output_gate_scratch);
-
-    // For each batch and cell: compute input_weight * input.
-    if (!use_cifg) {
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          input_to_input_weights->data.f, n_cell, n_input, input_ptr_time,
-          n_batch, input_gate_scratch, /*result_stride=*/1);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_forget_weights->data.f, n_cell, n_input, input_ptr_time,
-        n_batch, forget_gate_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_cell_weights->data.f, n_cell, n_input, input_ptr_time, n_batch,
-        cell_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_output_weights->data.f, n_cell, n_input, input_ptr_time,
-        n_batch, output_gate_scratch, /*result_stride=*/1);
-
-    // For each batch and cell: compute recurrent_weight * output_state.
-    if (!use_cifg) {
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          recurrent_to_input_weights->data.f, n_cell, n_output,
-          output_state->data.f, n_batch, input_gate_scratch,
-          /*result_stride=*/1);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_forget_weights->data.f, n_cell, n_output,
-        output_state->data.f, n_batch, forget_gate_scratch,
-        /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_cell_weights->data.f, n_cell, n_output,
-        output_state->data.f, n_batch, cell_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_output_weights->data.f, n_cell, n_output,
-        output_state->data.f, n_batch, output_gate_scratch,
-        /*result_stride=*/1);
-
-    // For each batch and cell: update input gate.
-    if (!use_cifg) {
-      if (use_peephole) {
-        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-            cell_to_input_weights->data.f, n_cell, cell_state->data.f, n_batch,
-            input_gate_scratch);
-      }
-      tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                         input_gate_scratch);
-    }
-
-    // For each batch and cell: update forget gate.
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_forget_weights->data.f, n_cell, cell_state->data.f, n_batch,
-          forget_gate_scratch);
-    }
-    tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                       forget_gate_scratch);
-
-    // For each batch and cell: update the cell.
-    tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
-                                           cell_state->data.f, n_batch * n_cell,
-                                           cell_state->data.f);
-    tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                          params->activation, cell_scratch);
-    if (use_cifg) {
-      tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                               forget_gate_scratch);
-      tensor_utils::VectorVectorCwiseProductAccumulate(
-          cell_scratch, forget_gate_scratch, n_batch * n_cell,
-          cell_state->data.f);
-    } else {
-      tensor_utils::VectorVectorCwiseProductAccumulate(
-          cell_scratch, input_gate_scratch, n_batch * n_cell,
-          cell_state->data.f);
-    }
-    if (params->cell_clip > 0.0) {
-      tensor_utils::ClipVector(cell_state->data.f, n_batch * n_cell,
-                               params->cell_clip, cell_state->data.f);
-    }
-
-    // For each batch and cell: update the output gate.
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_output_weights->data.f, n_cell, cell_state->data.f, n_batch,
-          output_gate_scratch);
-    }
-    tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                       output_gate_scratch);
-    tensor_utils::ApplyActivationToVector(cell_state->data.f, n_batch * n_cell,
-                                          params->activation, cell_scratch);
-    tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                           n_batch * n_cell,
-                                           output_gate_scratch);
-
-    // For each batch: update the projection and output_state.
-    const bool use_projection_weight = (projection_weights != nullptr);
-    const bool use_projection_bias = (projection_bias != nullptr);
-    float* output_ptr_time = output->data.f + t * n_batch * n_output;
-    if (use_projection_weight) {
-      if (use_projection_bias) {
-        tensor_utils::VectorBatchVectorAssign(projection_bias->data.f, n_output,
-                                              n_batch, output_ptr_time);
-      } else {
-        tensor_utils::ZeroVector(output_ptr_time, n_batch * n_output);
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          projection_weights->data.f, n_output, n_cell, output_gate_scratch,
-          n_batch, output_ptr_time, /*result_stride=*/1);
-      if (params->proj_clip > 0.0) {
-        tensor_utils::ClipVector(output_ptr_time, n_batch * n_output,
-                                 params->proj_clip, output_ptr_time);
-      }
-    } else {
-      tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                               output_ptr_time);
-    }
-    tensor_utils::CopyVector(output_ptr_time, n_batch * n_output,
-                             output_state->data.f);
+    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
+    float* output_ptr_batch = output->data.f + t * n_batch * n_output;
+
+    kernel_utils::LstmStep(
+        input_ptr_batch, input_to_input_weights_ptr,
+        input_to_forget_weights_ptr, input_to_cell_weights_ptr,
+        input_to_output_weights_ptr, recurrent_to_input_weights_ptr,
+        recurrent_to_forget_weights_ptr, recurrent_to_cell_weights_ptr,
+        recurrent_to_output_weights_ptr, cell_to_input_weights_ptr,
+        cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
+        input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
+        output_gate_bias_ptr, projection_weights_ptr, projection_bias_ptr,
+        params, n_batch, n_cell, n_input, n_output, output_state_ptr,
+        cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_scratch,
+        output_gate_scratch, output_ptr_batch);
   }
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/memory_planner.h b/tensorflow/contrib/lite/memory_planner.h
index 5cd6c208500f3ea84ab8146f7f136e8b7851ff03..0294ec815c4820d41361b8cd4a814b74c3c1d770 100644
--- a/tensorflow/contrib/lite/memory_planner.h
+++ b/tensorflow/contrib/lite/memory_planner.h
@@ -34,8 +34,8 @@ class MemoryPlanner {
   // [first_node, last_node].
   virtual TfLiteStatus ExecuteAllocations(int first_node, int last_node) = 0;
 
-  // Invalidates allocations made earliers. This is called when tensors sizes
-  // have change. All planned allocations remain, but can't be used until
+  // Invalidates allocations made earlier. This is called when tensors sizes
+  // have changed. All planned allocations remain, but can't be used until
   // ExecuteAllocations() is called.
   virtual TfLiteStatus ResetAllocations() = 0;
 };
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index d6522fc077d03bb49fe54b7a04fa6341ccf4cf3a..e15f1be7d3880200abf129fb4fca71146994e87f 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,50 +30,114 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+// Ensure that ErrorReporter is non-null.
+ErrorReporter* ValidateErrorReporter(ErrorReporter* e) {
+  return e ? e : DefaultErrorReporter();
+}
+}  // namespace
+
 const char* kEmptyTensorName = "";
 
+TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
+                               ErrorReporter* error_reporter) {
+  switch (tensor_type) {
+    case TensorType_FLOAT32:
+      *type = kTfLiteFloat32;
+      break;
+    case TensorType_INT32:
+      *type = kTfLiteInt32;
+      break;
+    case TensorType_UINT8:
+      *type = kTfLiteUInt8;
+      break;
+    case TensorType_INT64:
+      *type = kTfLiteInt64;
+      break;
+    case TensorType_STRING:
+      *type = kTfLiteString;
+      break;
+    case TensorType_BOOL:
+      *type = kTfLiteBool;
+      break;
+    default:
+      error_reporter->Report("Unimplemented data type %s (%d) in tensor\n",
+                             EnumNameTensorType(tensor_type), tensor_type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Loads a model from `filename`. If `mmap_file` is true then use mmap,
+// otherwise make a copy of the model in a buffer.
+std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
+                                                  bool mmap_file,
+                                                  ErrorReporter* error_reporter,
+                                                  bool use_nnapi) {
+  std::unique_ptr<Allocation> allocation;
+  if (mmap_file) {
+    if (use_nnapi && NNAPIExists())
+      allocation.reset(new NNAPIAllocation(filename, error_reporter));
+    else
+      allocation.reset(new MMAPAllocation(filename, error_reporter));
+  } else {
+    allocation.reset(new FileCopyAllocation(filename, error_reporter));
+  }
+  return allocation;
+}
+
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
     const char* filename, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
+  std::unique_ptr<FlatBufferModel> model;
+  auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
+                                          error_reporter, /*use_nnapi=*/true);
+  model.reset(new FlatBufferModel(allocation.release(), error_reporter));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
+std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
+    const char* filename, TfLiteVerifier* verifier,
+    ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
-  model.reset(new FlatBufferModel(filename, /*mmap_file=*/true, error_reporter,
-                                  /*use_nnapi=*/true));
+  auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
+                                          error_reporter, /*use_nnapi=*/true);
+  if (verifier &&
+      !verifier->Verify(static_cast<const char*>(allocation->base()),
+                        allocation->bytes(), error_reporter)) {
+    return model;
+  }
+  model.reset(new FlatBufferModel(allocation.release(), error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
     const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
-  model.reset(new FlatBufferModel(buffer, buffer_size, error_reporter));
+  Allocation* allocation =
+      new MemoryAllocation(buffer, buffer_size, error_reporter);
+  model.reset(new FlatBufferModel(allocation, error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
     const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   model.reset(new FlatBufferModel(model_spec, error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
 
-FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
-                                 ErrorReporter* error_reporter, bool use_nnapi)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
-  if (mmap_file) {
-    if (use_nnapi && NNAPIExists())
-      allocation_ = new NNAPIAllocation(filename, error_reporter);
-    else
-      allocation_ = new MMAPAllocation(filename, error_reporter);
-  } else {
-    allocation_ = new FileCopyAllocation(filename, error_reporter);
-  }
-  if (!allocation_->valid() || !CheckModelIdentifier()) return;
-
-  model_ = ::tflite::GetModel(allocation_->base());
-}
-
 bool FlatBufferModel::CheckModelIdentifier() const {
   if (!tflite::ModelBufferHasIdentifier(allocation_->base())) {
     const char* ident = flatbuffers::GetBufferIdentifier(allocation_->base());
@@ -85,21 +149,19 @@ bool FlatBufferModel::CheckModelIdentifier() const {
   return true;
 }
 
-FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
+FlatBufferModel::FlatBufferModel(const Model* model,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
-  allocation_ = new MemoryAllocation(ptr, num_bytes, error_reporter);
-  if (!allocation_->valid()) return;
-
-  model_ = ::tflite::GetModel(allocation_->base());
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
+  model_ = model;
 }
 
-FlatBufferModel::FlatBufferModel(const Model* model,
+FlatBufferModel::FlatBufferModel(Allocation* allocation,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
-  model_ = model;
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
+  allocation_ = allocation;
+  if (!allocation_->valid() || !CheckModelIdentifier()) return;
+
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 FlatBufferModel::~FlatBufferModel() { delete allocation_; }
@@ -108,7 +170,7 @@ InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
                                        const OpResolver& op_resolver)
     : model_(model.GetModel()),
       op_resolver_(op_resolver),
-      error_reporter_(model.error_reporter()),
+      error_reporter_(ValidateErrorReporter(model.error_reporter())),
       allocation_(model.allocation()) {}
 
 InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
@@ -116,22 +178,27 @@ InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
                                        ErrorReporter* error_reporter)
     : model_(model),
       op_resolver_(op_resolver),
-      error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {}
+      error_reporter_(ValidateErrorReporter(error_reporter)) {}
 
 TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
   TfLiteStatus status = kTfLiteOk;
   auto opcodes = model_->operator_codes();
   for (const OperatorCode* opcode : *opcodes) {
     TfLiteRegistration* registration = nullptr;
-
-    if (opcode->builtin_code() != BuiltinOperator_CUSTOM) {
-      auto x = opcode->builtin_code();
-      flatbuffer_op_index_to_registration_types_.push_back(x);
-      registration = op_resolver_.FindOp(x);
+    auto builtin_code = opcode->builtin_code();
+    if (builtin_code > BuiltinOperator_MAX ||
+        builtin_code < BuiltinOperator_MIN) {
+      error_reporter_->Report(
+          "Op builtin_code out or range: %d. Are you using old TFLite binary "
+          "with newer model?",
+          builtin_code);
+      status = kTfLiteError;
+    } else if (builtin_code != BuiltinOperator_CUSTOM) {
+      flatbuffer_op_index_to_registration_types_.push_back(builtin_code);
+      registration = op_resolver_.FindOp(builtin_code);
       if (registration == nullptr) {
         error_reporter_->Report("Didn't find op for builtin opcode '%s'\n",
-                                EnumNameBuiltinOperator(x));
+                                EnumNameBuiltinOperator(builtin_code));
         status = kTfLiteError;
       }
     } else if (!opcode->custom_code()) {
@@ -197,13 +264,11 @@ T* MallocPOD() {
 // Parse the appropriate data out of the op.
 //
 // This handles builtin data explicitly as there are flatbuffer schemas.
-//
-// Returns memory that must be feed.
-//
-// TODO(nupurgarg): Pass in void ** and return TfLiteStatus to ensure program
-// crashes if error reporter is called.
-void* ParseOpData(const Operator* op, BuiltinOperator op_type,
-                  ErrorReporter* error_reporter) {
+// If it returns kTfLiteOk, it passes the data out with `builtin_data`, which
+// need to be released by calling `free`.`
+// If it returns kTfLiteError, `builtin_data` will be `nullptr`.
+TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
+                         ErrorReporter* error_reporter, void** builtin_data) {
   auto parse_padding = [](Padding padding) {
     switch (padding) {
       case Padding_SAME:
@@ -252,7 +317,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     }
   };
 
-  void* builtin_data = nullptr;
+  *builtin_data = nullptr;
   switch (op_type) {
     case BuiltinOperator_CALL:
       // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
@@ -268,8 +333,10 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->stride_height = conv_params->stride_h();
         params->activation =
             parse_activation(conv_params->fused_activation_function());
+        params->dilation_width_factor = conv_params->dilation_w_factor();
+        params->dilation_height_factor = conv_params->dilation_h_factor();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_TANH:
@@ -280,14 +347,35 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_CONCAT_EMBEDDINGS:
     case BuiltinOperator_EXP:
     case BuiltinOperator_TOPK_V2:
+    case BuiltinOperator_LOG_SOFTMAX:
+    case BuiltinOperator_DEQUANTIZE:
+    case BuiltinOperator_PRELU:
+    case BuiltinOperator_FLOOR:
+      break;
+    case BuiltinOperator_CAST: {
+      TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
+      if (auto* schema_params = op->builtin_options_as_CastOptions()) {
+        auto in_status =
+            ConvertTensorType(schema_params->in_data_type(),
+                              &params->in_data_type, error_reporter);
+        auto out_status =
+            ConvertTensorType(schema_params->out_data_type(),
+                              &params->out_data_type, error_reporter);
+        if (in_status != kTfLiteOk || out_status != kTfLiteOk) {
+          free(params);
+          return kTfLiteError;
+        }
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
+    }
     case BuiltinOperator_LSH_PROJECTION: {
       TfLiteLSHProjectionParams* params =
           MallocPOD<TfLiteLSHProjectionParams>();
       if (auto* lshParams = op->builtin_options_as_LSHProjectionOptions()) {
         params->type = parseLSHProjectionType(lshParams->type());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_AVERAGE_POOL_2D:
@@ -303,7 +391,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(pool_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DEPTHWISE_CONV_2D: {
@@ -317,7 +405,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(conv_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SVDF: {
@@ -327,7 +415,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(svdf_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
@@ -339,7 +427,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(sequence_rnn_params->fused_activation_function());
         params->time_major = sequence_rnn_params->time_major();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RNN: {
@@ -348,7 +436,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(rnn_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_EMBEDDING_LOOKUP:
@@ -361,7 +449,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_EmbeddingLookupSparseOptions()) {
         params->combiner = parseCombinerType(embedding_params->combiner());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_FULLY_CONNECTED: {
@@ -372,7 +460,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation = parse_activation(
             fully_connected_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_HASHTABLE_LOOKUP:
@@ -383,7 +471,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* softmax_params = op->builtin_options_as_SoftmaxOptions()) {
         params->beta = softmax_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_CONCATENATION: {
@@ -395,7 +483,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(concatenation_params->fused_activation_function());
         params->axis = concatenation_params->axis();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MUL: {
@@ -404,7 +492,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_ADD: {
@@ -413,7 +501,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DIV: {
@@ -422,7 +510,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SUB: {
@@ -431,7 +519,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_L2_NORMALIZATION: {
@@ -440,7 +528,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: {
@@ -452,9 +540,10 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->alpha = schema_params->alpha();
         params->beta = schema_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_LSTM: {
       TfLiteLSTMParams* params = MallocPOD<TfLiteLSTMParams>();
@@ -464,7 +553,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->cell_clip = lstm_params->cell_clip();
         params->proj_clip = lstm_params->proj_clip();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RESIZE_BILINEAR: {
@@ -473,7 +562,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_ResizeBilinearOptions()) {
         params->align_corners = schema_params->align_corners();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_PAD: {
@@ -487,7 +576,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->shape, error_reporter);
         params->num_dimensions = new_shape->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SKIP_GRAM: {
@@ -497,7 +586,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->max_skip_size = skip_gram_params->max_skip_size();
         params->include_all_ngrams = skip_gram_params->include_all_ngrams();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_DEPTH: {
@@ -505,7 +594,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SpaceToDepthOptions()) {
         params->block_size = schema_params->block_size();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_GATHER: {
@@ -515,7 +604,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->axis = gather_params->axis();
       }
 
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_BATCH_ND: {
@@ -532,7 +621,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
         params->keep_dims = schema_params->keep_dims();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPLIT: {
@@ -540,7 +629,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SplitOptions()) {
         params->num_splits = schema_params->num_splits();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SQUEEZE: {
@@ -551,7 +640,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->squeeze_dims, error_reporter);
         params->num_squeeze_dims = squeeze_dims->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_STRIDED_SLICE: {
@@ -563,11 +652,32 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->new_axis_mask = schema_params->new_axis_mask();
         params->shrink_axis_mask = schema_params->shrink_axis_mask();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MAXIMUM:
+    case BuiltinOperator_MINIMUM: {
+      break;
+    }
+    case BuiltinOperator_ARG_MAX: {
+      auto* params = MallocPOD<TfLiteArgMaxParams>();
+      if (auto* schema_params = op->builtin_options_as_ArgMaxOptions()) {
+        ConvertTensorType(schema_params->output_type(), &params->output_type,
+                          error_reporter);
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_LESS: {
+      break;
+    }
+    case BuiltinOperator_DELEGATE: {
+      // TODO(ycling): Revisit when supporting saving delegated models.
+      error_reporter->Report("DELEGATE op shouldn't exist in model.");
+      return kTfLiteError;
+    }
   }
-  return builtin_data;
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -607,10 +717,13 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
           reinterpret_cast<const char*>(op->custom_options()->data()),
           op->custom_options()->size(), nullptr, reg);
     } else {
+      void* builtin_data = nullptr;
+      TF_LITE_ENSURE_STATUS(
+          ParseOpData(op, op_type, error_reporter_, &builtin_data));
       interpreter->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
-          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0,
-          ParseOpData(op, op_type, error_reporter_), reg);
+          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
+          reg);
     }
   }
 
@@ -644,35 +757,34 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
       // but we really only support one value for the whole tensor.
       // TODO(aselle): This breaks as well if these are nullptr's.
       // TODO(aselle): This assumes non per-channel quantization.
-      if (q_params->scale()) quantization.scale = q_params->scale()->Get(0);
-      if (q_params->zero_point())
+
+      if (q_params->scale()) {
+        if (q_params->scale()->size() != 1) {
+          error_reporter_->Report(
+              "QuantizationParam has %d scale values (only 1 is supported).",
+              q_params->scale()->size());
+          return kTfLiteError;
+        }
+        quantization.scale = q_params->scale()->Get(0);
+      }
+
+      if (q_params->zero_point()) {
+        if (q_params->zero_point()->size() != 1) {
+          error_reporter_->Report(
+              "QuantizationParam has %d zero_point values"
+              " (only 1 is supported).",
+              q_params->zero_point()->size());
+          return kTfLiteError;
+        }
         quantization.zero_point = q_params->zero_point()->Get(0);
+      }
     }
 
     TfLiteType type;
-    switch (tensor->type()) {
-      case TensorType_FLOAT32:
-        type = kTfLiteFloat32;
-        break;
-      case TensorType_INT32:
-        type = kTfLiteInt32;
-        break;
-      case TensorType_UINT8:
-        type = kTfLiteUInt8;
-        break;
-      case TensorType_INT64:
-        type = kTfLiteInt64;
-        break;
-      case TensorType_STRING:
-        type = kTfLiteString;
-        break;
-      default:
-        // tensorType = ArrayType::NONE;
-        error_reporter_->Report("Unimplemented data type %s (%d) in tensor\n",
-                                EnumNameTensorType(tensor->type()),
-                                tensor->type());
-        status = kTfLiteError;
-        continue;
+    if (ConvertTensorType(tensor->type(), &type, error_reporter_) !=
+        kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
     auto get_readonly_data = [&](const char** buffer_data,
                                  size_t* buffer_size) {
@@ -724,6 +836,11 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
 
 TfLiteStatus InterpreterBuilder::operator()(
     std::unique_ptr<Interpreter>* interpreter) {
+  return operator()(interpreter, /*num_threads=*/-1);
+}
+
+TfLiteStatus InterpreterBuilder::operator()(
+    std::unique_ptr<Interpreter>* interpreter, int num_threads) {
   if (!interpreter) {
     error_reporter_->Report(
         "Null output pointer passed to InterpreterBuilder.");
@@ -778,7 +895,8 @@ TfLiteStatus InterpreterBuilder::operator()(
   if ((**interpreter).AddTensors(tensors->Length()) != kTfLiteOk) {
     return cleanup_and_error();
   }
-
+  // Set num threads
+  (**interpreter).SetNumThreads(num_threads);
   // Parse inputs/outputs
   (**interpreter).SetInputs(FlatBufferIntArrayToVector(subgraph->inputs()));
   (**interpreter).SetOutputs(FlatBufferIntArrayToVector(subgraph->outputs()));
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index a467df5bb4eee3f6ce814512cb8b74bf09a6a4e7..5a55b031a8c28085e02782608eb820a3cfe78dde 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -41,30 +41,57 @@ limitations under the License.
 
 namespace tflite {
 
+// Abstract interface that verifies whether a given model is legit.
+// It facilitates the use-case to verify and build a model without loading it
+// twice.
+class TfLiteVerifier {
+ public:
+  // Returns true if the model is legit.
+  virtual bool Verify(const char* data, int length,
+                      ErrorReporter* reporter) = 0;
+  virtual ~TfLiteVerifier() {}
+};
+
 // An RAII object that represents a read-only tflite model, copied from disk,
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
  public:
-  // Builds a model based on a file. Returns a nullptr in case of failure.
+  // Builds a model based on a file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromFile(
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Verifies whether the content of the file is legit, then builds a model
+  // based on the file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
+      const char* filename, TfLiteVerifier* verifier = nullptr,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
   // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model directly from a flatbuffer pointer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromModel(
       const tflite::Model* model_spec,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
-  // Releases memory or unmaps mmaped meory.
+  // Releases memory or unmaps mmaped memory.
   ~FlatBufferModel();
 
   // Copying or assignment is disallowed to simplify ownership semantics.
@@ -82,23 +109,12 @@ class FlatBufferModel {
   bool CheckModelIdentifier() const;
 
  private:
-  // Loads a model from `filename`. If `mmap_file` is true then use mmap,
-  // otherwise make a copy of the model in a buffer.
-  //
-  // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
-  // used.
-  explicit FlatBufferModel(
-      const char* filename, bool mmap_file = true,
-      ErrorReporter* error_reporter = DefaultErrorReporter(),
-      bool use_nnapi = false);
-
-  // Loads a model from `ptr` and `num_bytes` of the model file. The `ptr` has
-  // to remain alive and unchanged until the end of this flatbuffermodel's
-  // lifetime.
-  //
-  // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
-  // used.
-  FlatBufferModel(const char* ptr, size_t num_bytes,
+  // Loads a model from a given allocation. FlatBufferModel will take over the
+  // ownership of `allocation`, and delete it in destructor. The ownership of
+  // `error_reporter`remains with the caller and must have lifetime at least
+  // as much as FlatBufferModel. This is to allow multiple models to use the
+  // same ErrorReporter instance.
+  FlatBufferModel(Allocation* allocation,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Loads a model from Model flatbuffer. The `model` has to remain alive and
@@ -108,7 +124,10 @@ class FlatBufferModel {
   // Flatbuffer traverser pointer. (Model* is a pointer that is within the
   // allocated memory of the data allocated by allocation's internals.
   const tflite::Model* model_ = nullptr;
+  // The error reporter to use for model errors and subsequent errors when
+  // the interpreter is created
   ErrorReporter* error_reporter_;
+  // The allocator used for holding memory of the model.
   Allocation* allocation_ = nullptr;
 };
 
@@ -151,6 +170,8 @@ class InterpreterBuilder {
   InterpreterBuilder(const InterpreterBuilder&) = delete;
   InterpreterBuilder& operator=(const InterpreterBuilder&) = delete;
   TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter);
+  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter,
+                          int num_threads);
 
  private:
   TfLiteStatus BuildLocalIndexToRegistrationMapping();
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 66f22fd66a9ae0d35553a1f780ef73a5c5994c99..ae6c1ece18963f11f48a6f07bea4065ce39687e0 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -209,6 +209,38 @@ TEST(BasicFlatBufferModel, TestNullModel) {
   ASSERT_EQ(interpreter.get(), nullptr);
 }
 
+// Mocks the verifier by setting the result in ctor.
+class FakeVerifier : public tflite::TfLiteVerifier {
+ public:
+  explicit FakeVerifier(bool result) : result_(result) {}
+  bool Verify(const char* data, int length,
+              tflite::ErrorReporter* reporter) override {
+    return result_;
+  }
+
+ private:
+  bool result_;
+};
+
+TEST(BasicFlatBufferModel, TestWithTrueVerifier) {
+  FakeVerifier verifier(true);
+  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin",
+      &verifier));
+}
+
+TEST(BasicFlatBufferModel, TestWithFalseVerifier) {
+  FakeVerifier verifier(false);
+  ASSERT_FALSE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin",
+      &verifier));
+}
+
+TEST(BasicFlatBufferModel, TestWithNullVerifier) {
+  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin", nullptr));
+}
+
 struct TestErrorReporter : public ErrorReporter {
   int Report(const char* format, va_list args) override {
     calls++;
diff --git a/tensorflow/contrib/lite/models/BUILD b/tensorflow/contrib/lite/models/BUILD
index 6a1255b586ef04b80159156a78f0c4569a4661c5..efa47b06fa7f06cc6312535713ec582af4705d85 100644
--- a/tensorflow/contrib/lite/models/BUILD
+++ b/tensorflow/contrib/lite/models/BUILD
@@ -12,15 +12,3 @@ load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
 exports_files(glob([
     "testdata/*",
 ]))
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
index 733c3f4c7fa0605f24a1e6b4c458e34310c079c4..a82d1f2eb673b9b7211581f5a9f9febc140d4d1e 100644
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -86,15 +86,3 @@ cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/models/speech_test.cc b/tensorflow/contrib/lite/models/speech_test.cc
index daa8c3100b64e9290256aa14a6ab641f19174a0a..a354179a9480c136d65f83836d81f69c2089fdbe 100644
--- a/tensorflow/contrib/lite/models/speech_test.cc
+++ b/tensorflow/contrib/lite/models/speech_test.cc
@@ -97,7 +97,12 @@ bool ConvertCsvData(const string& model_name, const string& in_name,
   return true;
 }
 
-TEST(SpeechTest, HotwordOkGoogleRank1Test) {
+class SpeechTest : public ::testing::TestWithParam<int> {
+ protected:
+  int GetMaxInvocations() { return GetParam(); }
+};
+
+TEST_P(SpeechTest, HotwordOkGoogleRank1Test) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
@@ -105,11 +110,11 @@ TEST(SpeechTest, HotwordOkGoogleRank1Test) {
       /*output_tensor=*/"18", /*persistent_tensors=*/"4",
       /*sequence_size=*/40, &os));
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
-  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
-TEST(SpeechTest, HotwordOkGoogleRank2Test) {
+TEST_P(SpeechTest, HotwordOkGoogleRank2Test) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
@@ -117,11 +122,11 @@ TEST(SpeechTest, HotwordOkGoogleRank2Test) {
       /*output_tensor=*/"18", /*persistent_tensors=*/"1",
       /*sequence_size=*/40, &os));
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
-  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
-TEST(SpeechTest, SpeakerIdOkGoogleTest) {
+TEST_P(SpeechTest, SpeakerIdOkGoogleTest) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_speakerid_model.tflite", "speech_speakerid_model_in.csv",
@@ -130,11 +135,11 @@ TEST(SpeechTest, SpeakerIdOkGoogleTest) {
       /*persistent_tensors=*/"19,20,40,41,61,62",
       /*sequence_size=*/80, &os));
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
-  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
-TEST(SpeechTest, AsrAmTest) {
+TEST_P(SpeechTest, AsrAmTest) {
   std::stringstream os;
   ASSERT_TRUE(
       ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
@@ -143,7 +148,7 @@ TEST(SpeechTest, AsrAmTest) {
                      /*persistent_tensors=*/"19,20,40,41,61,62,82,83,103,104",
                      /*sequence_size=*/320, &os));
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
-  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
@@ -151,15 +156,16 @@ TEST(SpeechTest, AsrAmTest) {
 // through the interpreter and stored the sum of all the output, which was them
 // compared for correctness. In this test we are comparing all the intermediate
 // results.
-TEST(SpeechTest, AsrLmTest) {
+TEST_P(SpeechTest, AsrLmTest) {
   std::ifstream in_file;
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
   ASSERT_TRUE(Init("speech_asr_lm_model.test_spec", &test_driver, &in_file));
-  ASSERT_TRUE(testing::ParseAndRunTests(&in_file, &test_driver))
+  ASSERT_TRUE(
+      testing::ParseAndRunTests(&in_file, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
-TEST(SpeechTest, EndpointerTest) {
+TEST_P(SpeechTest, EndpointerTest) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_endpointer_model.tflite", "speech_endpointer_model_in.csv",
@@ -168,11 +174,11 @@ TEST(SpeechTest, EndpointerTest) {
       /*persistent_tensors=*/"28,29,49,50",
       /*sequence_size=*/320, &os));
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
-  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
-TEST(SpeechTest, TtsTest) {
+TEST_P(SpeechTest, TtsTest) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData("speech_tts_model.tflite",
                              "speech_tts_model_in.csv",
@@ -181,9 +187,19 @@ TEST(SpeechTest, TtsTest) {
                              /*persistent_tensors=*/"25,26,46,47,67,68,73",
                              /*sequence_size=*/334, &os));
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
-  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
       << test_driver.GetErrorMessage();
 }
 
+// Define two instantiations. The "ShortTests" instantiations is used when
+// running the tests on Android, in order to prevent timeouts (It takes about
+// 200s just to bring up the Android emulator.)
+static const int kAllInvocations = -1;
+static const int kFirstFewInvocations = 10;
+INSTANTIATE_TEST_CASE_P(LongTests, SpeechTest,
+                        ::testing::Values(kAllInvocations));
+INSTANTIATE_TEST_CASE_P(ShortTests, SpeechTest,
+                        ::testing::Values(kFirstFewInvocations));
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/nnapi/BUILD b/tensorflow/contrib/lite/nnapi/BUILD
index 402f1e949b7bb576de4970a8ebb41541fcee1cb2..467a2b7a7bc9a40135428240585cd2c2a133cf9f 100644
--- a/tensorflow/contrib/lite/nnapi/BUILD
+++ b/tensorflow/contrib/lite/nnapi/BUILD
@@ -11,15 +11,3 @@ cc_library(
     ],
     linkopts = ["-ldl"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 76032771af2c8e099aed498b2071816646f3b606..4a648e42837fbf6b7326c315be202ae0a80a47ca 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 // helpers
 
-#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__);
+#define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
 #define LOAD_FUNCTION(name) \
   static name##_fn fn = reinterpret_cast<name##_fn>(loadFunction(#name));
 #define EXECUTE_FUNCTION(...) \
@@ -34,10 +34,13 @@ limitations under the License.
 inline void* loadLibrary(const char* name) {
   // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn
   // api RT
-  void* handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+  void* handle = nullptr;
+#ifdef __ANDROID__
+  handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
   if (handle == nullptr) {
     NNAPI_LOG("nnapi error: unable to open library %s", name);
   }
+#endif
   return handle;
 }
 
@@ -569,7 +572,7 @@ enum {
   ANEURALNETWORKS_LOGISTIC = 14,
 
   /**
-   * Projects an input to a bit vector via locality senstive hashing.
+   * Projects an input to a bit vector via locality sensitive hashing.
    *
    * Inputs:
    * * 0: Hash functions. Dim.size == 2, DataType: Float.
@@ -606,7 +609,7 @@ enum {
    * Long short-term memory unit (LSTM) recurrent network layer.
    *
    * The default non-peephole implementation is based on:
-   * http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+   * http://www.bioinf.jku.at/publications/older/2604.pdf
    * S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural
    * Computation, 9(8):1735-1780, 1997.
    *
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index c1dcf0e4aee4f840d3ef4d532795f1d6245b06df..175101aa66021a0cbb1766c57e4b318c600bf887 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -166,7 +166,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     };
 
     auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &augmented_inputs, &next_id](int tensor_id) {
+        [interpreter, &nn_model, &augmented_inputs](int tensor_id) {
           const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
           CHECK_NN(ANeuralNetworksModel_setOperandValue(
               nn_model, tensor_id, tensor->data.raw, tensor->bytes));
@@ -282,6 +282,9 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_TANH:
         nn_op_type = ANEURALNETWORKS_TANH;
         break;
+      case tflite::BuiltinOperator_FLOOR:
+        nn_op_type = ANEURALNETWORKS_FLOOR;
+        break;
       case tflite::BuiltinOperator_LOGISTIC:
         nn_op_type = ANEURALNETWORKS_LOGISTIC;
         break;
@@ -327,6 +330,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
+      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_L2_NORMALIZATION:
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
@@ -348,6 +352,15 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_SQUEEZE:
       case tflite::BuiltinOperator_STRIDED_SLICE:
       case tflite::BuiltinOperator_EXP:
+      case tflite::BuiltinOperator_LOG_SOFTMAX:
+      case tflite::BuiltinOperator_DEQUANTIZE:
+      case tflite::BuiltinOperator_DELEGATE:
+      case tflite::BuiltinOperator_CAST:
+      case tflite::BuiltinOperator_PRELU:
+      case tflite::BuiltinOperator_MAXIMUM:
+      case tflite::BuiltinOperator_MINIMUM:
+      case tflite::BuiltinOperator_ARG_MAX:
+      case tflite::BuiltinOperator_LESS:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
index 1f762e6688d0cc2a91417b9d82201446e3060a6f..dfdd80ea8a42af683632be1d7e8ab0062847077d 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -48,6 +48,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteInt64";
     case kTfLiteString:
       return "kTfLiteString";
+    case kTfLiteBool:
+      return "kTfLiteBool";
   }
   return "(invalid)";
 }
@@ -70,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) {
 
 // Prints a dump of what tensors and what nodes are in the interpreter.
 void PrintInterpreterState(Interpreter* interpreter) {
-  printf("Interpreter has %d tensors and %d nodes\n",
+  printf("Interpreter has %zu tensors and %zu nodes\n",
          interpreter->tensors_size(), interpreter->nodes_size());
   printf("Inputs:");
   PrintIntVector(interpreter->inputs());
diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..15999e5d4188db1e191936ae6d84faf8cce5ca6e
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+common_copts = [
+    "-Wall",
+]
+
+cc_library(
+    name = "profiler",
+    hdrs = ["profiler.h"],
+    copts = common_copts,
+    deps = [":profile_buffer"],
+)
+
+cc_test(
+    name = "profiler_test",
+    srcs = ["profiler_test.cc"],
+    copts = ["-DTFLITE_PROFILING_ENABLED"],
+    defines = ["TFLITE_PROFILING_ENABLED"],
+    deps = [
+        ":profiler",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "profile_buffer",
+    hdrs = ["profile_buffer.h"],
+    copts = common_copts,
+)
+
+cc_test(
+    name = "profile_buffer_test",
+    srcs = ["profile_buffer_test.cc"],
+    copts = ["-DTFLITE_PROFILING_ENABLED"],
+    defines = ["TFLITE_PROFILING_ENABLED"],
+    deps = [
+        ":profile_buffer",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..299b2a9cad161ce05ba68f39cf612f9866a0b656
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profile_buffer.h
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace tflite {
+namespace profiling {
+
+// A profiling event.
+struct ProfileEvent {
+  // Describes the type of event.
+  // The event_metadata field may contain additional data for interpreting
+  // the event.
+  enum class EventType {
+    // Default event type, the metadata field has no special significance.
+    DEFAULT = 0,
+    // The event is an operator invocation and the event_metadata field is the
+    // index of operator node.
+    OPERATOR_INVOKE_EVENT = 1
+  };
+
+  // Label of the event. This usually describes the event.
+  const char* tag;
+  // Timestamp in microseconds when the event began.
+  uint64_t begin_timestamp_us;
+  // Timestamp in microseconds when the event ended.
+  uint64_t end_timestamp_us;
+  // The field containing the type of event. This must be one of the event types
+  // in EventType.
+  EventType event_type;
+  // Extra data describing the details of the event.
+  uint32_t event_metadata;
+};
+}  // namespace profiling
+}  // namespace tflite
+
+#ifdef TFLITE_PROFILING_ENABLED
+
+#include <sys/time.h>
+#include <vector>
+
+namespace tflite {
+namespace profiling {
+constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1;
+
+// A ring buffer of profile events.
+// This class is not thread safe.
+class ProfileBuffer {
+ public:
+  ProfileBuffer(uint32_t max_num_entries, bool enabled)
+      : enabled_(enabled), current_index_(0), event_buffer_(max_num_entries) {}
+
+  // Adds an event to the buffer with begin timestamp set to the current
+  // timestamp. Returns a handle to event that can be used to call EndEvent. If
+  // buffer is disabled this has no affect.
+  // The tag of the event should remain valid till the buffer is valid.
+  uint32_t BeginEvent(const char* tag, ProfileEvent::EventType event_type,
+                      uint32_t event_metadata) {
+    if (!enabled_) {
+      return kInvalidEventHandle;
+    }
+    uint64_t timestamp = NowMicros();
+    int index = current_index_ % event_buffer_.size();
+    event_buffer_[index].tag = tag;
+    event_buffer_[index].event_type = event_type;
+    event_buffer_[index].event_metadata = event_metadata;
+    event_buffer_[index].begin_timestamp_us = timestamp;
+    event_buffer_[index].end_timestamp_us = 0;
+    current_index_++;
+    return index;
+  }
+
+  // Sets the enabled state of buffer to |enabled|
+  void SetEnabled(bool enabled) { enabled_ = enabled; }
+
+  // Sets the end timestamp for event for the handle to current time.
+  // If the buffer is disabled or previous event has been overwritten this
+  // operation has not effect.
+  void EndEvent(uint32_t event_handle) {
+    if (!enabled_ || event_handle == kInvalidEventHandle ||
+        event_handle > current_index_) {
+      return;
+    }
+    const uint32_t max_size = event_buffer_.size();
+    if (current_index_ > (max_size + event_handle)) {
+      // Ignore, buffer has already overflowed.
+      return;
+    }
+
+    int event_index = event_handle % max_size;
+    event_buffer_[event_index].end_timestamp_us = NowMicros();
+  }
+
+  // Returns the size of the buffer.
+  size_t Size() const {
+    return (current_index_ >= event_buffer_.size()) ? event_buffer_.size()
+                                                    : current_index_;
+  }
+
+  // Resets the buffer.
+  void Reset() {
+    enabled_ = false;
+    current_index_ = 0;
+  }
+
+  // Returns the profile event at the given index. If the index is invalid a
+  // nullptr is returned. The return event may get overwritten if more events
+  // are added to buffer.
+  const struct ProfileEvent* const At(int index) const {
+    size_t size = Size();
+    if (index >= size) {
+      return nullptr;
+    }
+    const uint32_t max_size = event_buffer_.size();
+    uint32_t start =
+        (current_index_ > max_size) ? current_index_ % max_size : max_size;
+    index = (index + start) % max_size;
+    return &event_buffer_[index];
+  }
+
+ private:
+  static uint64_t NowMicros() {
+    // TODO(shashishekhar): Refactor this to a separate file.
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+  bool enabled_;
+  uint32_t current_index_;
+  std::vector<ProfileEvent> event_buffer_;
+};
+}  // namespace profiling
+}  // namespace tflite
+#endif  // TFLITE_PROFILING_ENABLED
+#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b8784cca455cfc301f2cc30c9c6d031b7174f829
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/profiling/profile_buffer.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace profiling {
+
+namespace {
+
+std::vector<const ProfileEvent*> GetProfileEvents(const ProfileBuffer& buffer) {
+  std::vector<const ProfileEvent*> events;
+  for (auto i = 0; i < buffer.Size(); i++) {
+    events.push_back(buffer.At(i));
+  }
+  return events;
+}
+
+TEST(ProfileBufferTest, Empty) {
+  ProfileBuffer buffer(/*max_size*/ 0, /*enabled*/ true);
+  EXPECT_EQ(0, buffer.Size());
+}
+
+TEST(ProfileBufferTest, AddEvent) {
+  ProfileBuffer buffer(/*max_size*/ 10, /*enabled*/ true);
+  EXPECT_EQ(0, buffer.Size());
+  auto event_handle = buffer.BeginEvent(
+      "hello", ProfileEvent::EventType::DEFAULT, /* event_metadata */ 42);
+
+  EXPECT_GE(event_handle, 0);
+  EXPECT_EQ(1, buffer.Size());
+
+  auto event = GetProfileEvents(buffer)[0];
+  EXPECT_EQ(event->tag, "hello");
+  EXPECT_GT(event->begin_timestamp_us, 0);
+  EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT);
+  EXPECT_EQ(event->event_metadata, 42);
+
+  buffer.EndEvent(event_handle);
+  EXPECT_EQ(1, buffer.Size());
+  EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us);
+}
+
+TEST(ProfileBufferTest, OverFlow) {
+  const int max_size = 4;
+  ProfileBuffer buffer{max_size, true};
+  std::vector<std::string> eventNames = {"first", "second", "third", "fourth"};
+  for (int i = 0; i < 2 * max_size; i++) {
+    buffer.BeginEvent(eventNames[i % 4].c_str(),
+                      ProfileEvent::EventType::DEFAULT, i);
+    size_t expected_size = std::min(i + 1, max_size);
+    EXPECT_EQ(expected_size, buffer.Size());
+  }
+  EXPECT_EQ(max_size, buffer.Size());
+  for (int j = 0; j < buffer.Size(); ++j) {
+    auto event = buffer.At(j);
+    EXPECT_EQ(eventNames[j % 4], event->tag);
+    EXPECT_EQ(ProfileEvent::EventType::DEFAULT, event->event_type);
+    EXPECT_EQ(4 + j, event->event_metadata);
+  }
+}
+
+TEST(ProfileBufferTest, Enable) {
+  ProfileBuffer buffer(/*max_size*/ 10, /*enabled*/ false);
+  EXPECT_EQ(0, buffer.Size());
+  auto event_handle = buffer.BeginEvent(
+      "hello", ProfileEvent::EventType::DEFAULT, /* event_metadata */ 42);
+  EXPECT_EQ(kInvalidEventHandle, event_handle);
+  EXPECT_EQ(0, buffer.Size());
+  buffer.SetEnabled(true);
+  event_handle = buffer.BeginEvent("hello", ProfileEvent::EventType::DEFAULT,
+                                   /* event_metadata */ 42);
+  EXPECT_GE(event_handle, 0);
+  EXPECT_EQ(1, buffer.Size());
+}
+
+}  // namespace
+}  // namespace profiling
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/profiling/profiler.h b/tensorflow/contrib/lite/profiling/profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfa98a6708edc874708537dc324d8c340664dc63
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profiler.h
@@ -0,0 +1,174 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/profiling/profile_buffer.h"
+
+#ifdef TFLITE_PROFILING_ENABLED
+
+namespace tflite {
+namespace profiling {
+class ScopedProfile;
+class ScopedOperatorProfile;
+
+// Controls whether profiling is enabled or disabled and collects profiles.
+// TFLite is used on platforms that don't have posix threads, so the profiler is
+// kept as simple as possible. It is designed to be used only on a single
+// thread.
+//
+// Profiles are collected using Scoped*Profile objects that begin and end a
+// profile event.
+// An example usage is shown in the example below:
+//
+// Say Worker class has a DoWork method and we are interested in profiling
+// the overall execution time for DoWork and time spent in Task1 and Task2
+// functions.
+//
+// class Worker {
+//  public:
+//   void DoWork() {
+//    ScopedProfile(&controller, "DoWork");
+//    Task1();
+//    Task2();
+//    .....
+//   }
+//
+//   void Task1() {
+//    ScopedProfile(&controller, "Task1");
+//    ....
+//   }
+//
+//   void Task2() {
+//    ScopedProfile(&controller, "Task2");
+//   }
+//
+//    Profiler profiler;
+// }
+//
+// We instrument the functions that need to be profiled.
+//
+// Profile can be collected by enable profiling and then getting profile
+// events.
+//
+//  void ProfileWorker() {
+//    Worker worker;
+//    worker.profiler.EnableProfiling();
+//    worker.DoWork();
+//    worker.profiler.DisableProfiling();
+//    // Profiling is complete, extract profiles.
+//    auto profile_events = worker.profiler.GetProfiles();
+//  }
+//
+//
+class Profiler {
+ public:
+  Profiler() : buffer_(1024, false) {}
+
+  void StartProfiling() { buffer_.SetEnabled(true); }
+  void StopProfiling() { buffer_.SetEnabled(false); }
+  void Reset() { buffer_.Reset(); }
+  std::vector<const ProfileEvent*> GetProfileEvents() {
+    std::vector<const ProfileEvent*> profile_events;
+    profile_events.reserve(buffer_.Size());
+    for (int i = 0; i < buffer_.Size(); i++) {
+      profile_events.push_back(buffer_.At(i));
+    }
+    return profile_events;
+  }
+
+ private:
+  friend class ScopedProfile;
+  friend class ScopedOperatorProfile;
+  ProfileBuffer* GetProfileBuffer() { return &buffer_; }
+  ProfileBuffer buffer_;
+};
+
+class ScopedProfile {
+ public:
+  // Adds a profile event to profile that begins with the construction
+  // of object and ends when the object goes out of scope.
+  // The lifetime of tag should be at least the lifetime of profiler.
+  ScopedProfile(Profiler* profiler, const char* tag) {
+    if (profiler) {
+      buffer_ = profiler->GetProfileBuffer();
+      event_handle_ =
+          buffer_->BeginEvent(tag, ProfileEvent::EventType::DEFAULT, 0);
+    }
+  }
+  ~ScopedProfile() {
+    if (buffer_) {
+      buffer_->EndEvent(event_handle_);
+    }
+  }
+
+ private:
+  ProfileBuffer* buffer_;
+  int32_t event_handle_;
+};
+
+class ScopedOperatorProfile {
+ public:
+  // Adds a profile event to profile that begins with the construction
+  // of object and ends when the object goes out of scope.
+  // The lifetime of tag should be at least the lifetime of profiler.
+  ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index) {
+    if (profiler) {
+      buffer_ = profiler->GetProfileBuffer();
+      event_handle_ = buffer_->BeginEvent(
+          tag, ProfileEvent::EventType::OPERATOR_INVOKE_EVENT, node_index);
+    }
+  }
+
+  ~ScopedOperatorProfile() {
+    if (buffer_) {
+      buffer_->EndEvent(event_handle_);
+    }
+  }
+
+ private:
+  ProfileBuffer* buffer_;
+  int32_t event_handle_;
+};
+
+}  // namespace profiling
+}  // namespace tflite
+
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index)                       \
+  tflite::profiling::ScopedOperatorProfile _profile((profiler), "OpInvoke", \
+                                                    (node_index))
+#else
+
+namespace tflite {
+namespace profiling {
+// A noop version of profiler when profiling is disabled.
+class Profiler {
+ public:
+  Profiler() {}
+  void StartProfiling() {}
+  void StopProfiling() {}
+  void Reset() {}
+  std::vector<const ProfileEvent*> GetProfileEvents() { return {}; }
+};
+}  // namespace profiling
+}  // namespace tflite
+
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index)
+
+#endif  // TFLITE_PROFILING_ENABLED
+
+#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ea1d8f7d341b6bd57f8dfe2e404f41515f6a8e1
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profiler_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+
+#include <chrono>  // NOLINT(build/c++11)
+#include <cmath>
+#include <thread>  // NOLINT(build/c++11)
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/profiling/profiler.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace profiling {
+namespace {
+
+void AssertDurationOfEventAroundMs(const ProfileEvent* event,
+                                   double expected_ms, double eps_ms) {
+  double duration_ms =
+      (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
+  EXPECT_NEAR(expected_ms, duration_ms, eps_ms);
+}
+
+void SleepForQuarterSecond(Profiler* profiler) {
+  ScopedProfile profile(profiler, "SleepForQuarter");
+  std::this_thread::sleep_for(std::chrono::milliseconds(250));
+}
+
+void ChildFunction(Profiler* profiler) {
+  ScopedProfile profile(profiler, "Child");
+  SleepForQuarterSecond(profiler);
+}
+
+void ParentFunction(Profiler* profiler) {
+  ScopedProfile profile(profiler, "Parent");
+  for (int i = 0; i < 2; i++) {
+    ChildFunction(profiler);
+  }
+}
+
+TEST(ProfilerTest, NoProfilesAreCollectedWhenDisabled) {
+  Profiler profiler;
+  ParentFunction(&profiler);
+  auto profile_events = profiler.GetProfileEvents();
+  EXPECT_EQ(0, profile_events.size());
+}
+
+TEST(ProfilingTest, ProfilesAreCollected) {
+  Profiler profiler;
+  profiler.StartProfiling();
+  ParentFunction(&profiler);
+  profiler.StopProfiling();
+  auto profile_events = profiler.GetProfileEvents();
+  // ParentFunction calls the ChildFunction 2 times.
+  // Each ChildFunction calls SleepForQuarterSecond once.
+  // We expect 1 entry for ParentFunction, 2 for ChildFunction and 2 for
+  // SleepForQuarterSecond: Total: 1+ 2 + 2 = 5
+  //  Profiles should look like:
+  //  Parent ~ 500 ms (due to 2 Child calls)
+  //   - Child ~ 250 ms (due to SleepForQuarter calls)
+  //       - SleepForQuarter ~ 250ms
+  //   - Child ~ 250 ms (due to SleepForQuarter calls)
+  //      - SleepForQuarter ~ 250ms
+  //
+  ASSERT_EQ(5, profile_events.size());
+  EXPECT_EQ("Parent", profile_events[0]->tag);
+  EXPECT_EQ("Child", profile_events[1]->tag);
+  EXPECT_EQ("SleepForQuarter", profile_events[2]->tag);
+  EXPECT_EQ("Child", profile_events[3]->tag);
+  EXPECT_EQ("SleepForQuarter", profile_events[4]->tag);
+
+#ifndef ADDRESS_SANITIZER
+  // ASAN build is sometimes very slow.
+  const int eps_ms = 10;
+  AssertDurationOfEventAroundMs(profile_events[0], /*expected_ms*/ 500, eps_ms);
+  AssertDurationOfEventAroundMs(profile_events[1], /*expected_ms*/ 250, eps_ms);
+  AssertDurationOfEventAroundMs(profile_events[2], /*expected_ms*/ 250, eps_ms);
+  AssertDurationOfEventAroundMs(profile_events[3], /*expected_ms*/ 250, eps_ms);
+  AssertDurationOfEventAroundMs(profile_events[4], /*expected_ms*/ 250, eps_ms);
+#endif
+}
+
+}  // namespace
+}  // namespace profiling
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 82feae0f0041997949212613c654a5695f468d56..e6dcc7aa099ccde848ef6cd9322639ea2d0f1c01 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -4,19 +4,70 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+filegroup(
+    name = "interpreter_test_data",
+    srcs = glob(["**/testdata/*"]),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "interpreter",
+    srcs = [
+        "interpreter.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
+    ],
+)
+
+py_test(
+    name = "interpreter_test",
+    srcs = ["interpreter_test.py"],
+    data = [":interpreter_test_data"],
+    srcs_version = "PY2AND3",
+    tags = ["no_oss"],
+    deps = [
+        ":interpreter",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 py_library(
     name = "lite",
     srcs = ["lite.py"],
-    # data = [
-    #     "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-    # ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":convert",
+        ":convert_saved_model",
         ":op_hint",
+    ],
+)
+
+py_library(
+    name = "lite_constants",
+    srcs = ["lite_constants.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
+    ],
+)
+
+py_library(
+    name = "convert",
+    srcs = ["convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite_constants",
         "//tensorflow/contrib/lite/toco:model_flags_proto_py",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
         "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
+        "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         "//tensorflow/python:platform",
     ],
 )
@@ -34,12 +85,15 @@ py_library(
 )
 
 py_test(
-    name = "lite_test",
-    srcs = ["lite_test.py"],
+    name = "convert_test",
+    srcs = ["convert_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_oss"],
+    tags = [
+        "no-internal-py3",
+        "no_oss",
+    ],
     deps = [
-        ":lite",
+        ":convert",
         ":op_hint",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -49,14 +103,66 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+py_library(
+    name = "convert_saved_model",
+    srcs = ["convert_saved_model.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":convert",
+        ":lite_constants",
+        "//tensorflow/contrib/saved_model:saved_model_py",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python/tools:freeze_graph_lib",
+    ],
+)
+
+py_binary(
+    name = "create_custom_op",
+    srcs = ["create_custom_op.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "@absl_py//absl/flags",
+    ],
+)
+
+py_test(
+    name = "convert_saved_model_test",
+    srcs = ["convert_saved_model_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":convert_saved_model",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_binary(
+    name = "convert_saved_model_to_frozen_graph",
+    srcs = ["convert_saved_model_to_frozen_graph.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convert_saved_model",
+    ],
+)
+
+# Transitive dependencies of this target will be included in the pip package.
+py_library(
+    name = "tf_lite_py_pip",
+    deps = [
+        ":convert_saved_model",
+    ],
 )
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4200c879ba0e17b3bd183f4004eb75ebdd2f5ee
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts a frozen graph into a TFLite FlatBuffer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os as _os
+import subprocess as _subprocess
+import tempfile as _tempfile
+
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.contrib.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
+
+# Find the toco_from_protos binary using the resource loader if using from
+# bazel, otherwise we are in a pip where console_scripts already has
+# the toco_from_protos tool.
+if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
+  _toco_from_proto_bin = ""
+else:
+  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
+      "../toco/python/toco_from_protos")
+
+if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
+  _toco_from_proto_bin = "toco_from_protos"
+
+
+def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
+  """Convert `input_data_str` according to model and toco parameters.
+
+  Unless you know what you are doing consider using
+  the more friendly @{tf.contrib.lite.toco_convert}}.
+
+  Args:
+    model_flags_str: Serialized proto describing model properties, see
+      `toco/model_flags.proto`.
+    toco_flags_str: Serialized proto describing conversion properties, see
+      `toco/toco_flags.proto`.
+    input_data_str: Input data in serialized form (e.g. a graphdef is common)
+  Returns:
+    Converted model in serialized form (e.g. a TFLITE model is common).
+  Raises:
+    RuntimeError: When conversion fails, an exception is raised with the error
+      message embedded.
+  """
+  # TODO(aselle): When toco does not use fatal errors for failure, we can
+  # switch this on.
+  if not _toco_from_proto_bin:
+    return _toco_python.TocoConvert(
+        model_flags_str, toco_flags_str, input_data_str)
+
+  with _tempfile.NamedTemporaryFile() as fp_toco, \
+           _tempfile.NamedTemporaryFile() as fp_model, \
+           _tempfile.NamedTemporaryFile() as fp_input, \
+           _tempfile.NamedTemporaryFile() as fp_output:
+    fp_model.write(model_flags_str)
+    fp_toco.write(toco_flags_str)
+    fp_input.write(input_data_str)
+    fp_model.flush()
+    fp_toco.flush()
+    fp_input.flush()
+
+    cmd = [
+        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
+        fp_output.name
+    ]
+    cmdline = " ".join(cmd)
+    proc = _subprocess.Popen(
+        cmdline,
+        shell=True,
+        stdout=_subprocess.PIPE,
+        stderr=_subprocess.STDOUT,
+        close_fds=True)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode == 0:
+      stuff = fp_output.read()
+      return stuff
+    else:
+      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
+                         (stdout, stderr))
+
+
+def tensor_name(x):
+  return x.name.split(":")[0]
+
+
+def toco_convert(input_data,
+                 input_tensors,
+                 output_tensors,
+                 inference_type=lite_constants.FLOAT,
+                 input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+                 output_format=lite_constants.TFLITE,
+                 quantized_input_stats=None,
+                 drop_control_dependency=True):
+  """Convert a model using TOCO from `input_format` to `output_format`.
+
+  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
+  case the default `input_format` and `output_format` are sufficient.
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`).
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT)
+    quantized_input_stats: For each member of input_tensors the mean and
+      std deviation of training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently. This is due
+      to tf lite not supporting control dependencies.
+
+  Returns:
+    The converted data. For example if tflite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: If the input tensor type is unknown
+    RuntimeError: If TOCO fails to convert (in which case the runtime error's
+      error text will contain the TOCO error log)
+  """
+  toco = _toco_flags_pb2.TocoFlags()
+  toco.input_format = input_format
+  toco.output_format = output_format
+  toco.drop_control_dependency = drop_control_dependency
+  model = _model_flags_pb2.ModelFlags()
+  toco.inference_type = inference_type
+  for idx, input_tensor in enumerate(input_tensors):
+    if input_tensor.dtype == _dtypes.float32:
+      tflite_input_type = lite_constants.FLOAT
+    elif input_tensor.dtype == _dtypes.int32:
+      tflite_input_type = lite_constants.INT32
+    elif input_tensor.dtype == _dtypes.int64:
+      tflite_input_type = lite_constants.INT64
+    # TODO(aselle): Insert strings when they are available
+    else:
+      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
+                                                         input_tensor.dtype))
+
+    input_array = model.input_arrays.add()
+
+    if inference_type == lite_constants.QUANTIZED_UINT8:
+      if tflite_input_type == lite_constants.FLOAT:
+        tflite_input_type = lite_constants.QUANTIZED_UINT8
+      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
+
+    input_array.name = tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+
+  for output_tensor in output_tensors:
+    model.output_arrays.append(tensor_name(output_tensor))
+
+  # TODO(aselle): Consider handling the case of allowing quantized
+  # inputs to be converted to float (via the toco.inference_input_type field).
+  data = toco_convert_protos(model.SerializeToString(),
+                             toco.SerializeToString(),
+                             input_data.SerializeToString())
+  return data
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7eddf3408f54dff5fa49ff6fa7b61cd0b8a22e4
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -0,0 +1,421 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to convert SavedModel to frozen GraphDefs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.python import convert
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.toco import model_flags_pb2
+from tensorflow.contrib.saved_model.python.saved_model import reader
+from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import graph_util as tf_graph_util
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+
+
+def _write_and_flush_file(file_path, data_str):
+  """Writes data to file path.
+
+  Args:
+    file_path: Full path of the file to store data in.
+    data_str: Data represented as a string.
+
+  Returns: None.
+  """
+  with gfile.Open(file_path, "wb") as data_file:
+    data_file.write(data_str)
+    data_file.flush()
+
+
+def _log_tensor_details(tensor_info):
+  """Log tensor details: name, shape, and type."""
+  for key in tensor_info:
+    val = tensor_info[key]
+    dtype = types_pb2.DataType.Name(val.dtype)
+    if val.tensor_shape.unknown_rank:
+      shape = "unknown_rank"
+    else:
+      dims = [str(dim.size) for dim in val.tensor_shape.dim]
+      shape = "({})".format(", ".join(dims))
+
+    logging.info("Tensor's key in saved_model's tensor_map: %s", key)
+    logging.info(" tensor name: %s, shape: %s, type: %s", val.name, shape,
+                 dtype)
+
+
+def _get_meta_graph_def(saved_model_dir, tag_set):
+  """Validate saved_model and extract MetaGraphDef.
+
+  Args:
+    saved_model_dir: saved_model path to convert.
+    tag_set: Set of tag(s) of the MetaGraphDef to load.
+
+  Returns:
+    The meta_graph_def used for tflite conversion.
+
+  Raises:
+    ValueError: No valid MetaGraphDef for given tag_set.
+  """
+  saved_model = reader.read_saved_model(saved_model_dir)
+  tag_sets = []
+  result_meta_graph_def = None
+  for meta_graph_def in saved_model.meta_graphs:
+    meta_graph_tag_set = set(meta_graph_def.meta_info_def.tags)
+    tag_sets.append(meta_graph_tag_set)
+    if meta_graph_tag_set == tag_set:
+      result_meta_graph_def = meta_graph_def
+  logging.info("The given saved_model contains the following tags: %s",
+               tag_sets)
+  if result_meta_graph_def is not None:
+    return result_meta_graph_def
+  else:
+    raise ValueError("No valid MetaGraphDef for this tag_set '{}'. Possible "
+                     "values are '{}'. ".format(tag_set, tag_sets))
+
+
+def _get_signature_def(meta_graph, signature_key):
+  """Get the signature def from meta_graph with given signature_key.
+
+  Args:
+    meta_graph: meta_graph_def.
+    signature_key: signature_def in the meta_graph_def.
+
+  Returns:
+    The signature_def used for tflite conversion.
+
+  Raises:
+    ValueError: Given signature_key is not valid for this meta_graph.
+  """
+  signature_def_map = meta_graph.signature_def
+  signature_def_keys = set(signature_def_map.keys())
+  logging.info(
+      "The given saved_model MetaGraphDef contains SignatureDefs with the "
+      "following keys: %s", signature_def_keys)
+  if signature_key not in signature_def_keys:
+    raise ValueError("No '{}' in the saved_model\'s SignatureDefs. Possible "
+                     "values are '{}'. ".format(signature_key,
+                                                signature_def_keys))
+  signature_def = signature_def_utils.get_signature_def_by_key(
+      meta_graph, signature_key)
+  return signature_def
+
+
+def _get_inputs_outputs(signature_def):
+  """Get inputs and outputs from SignatureDef.
+
+  Args:
+    signature_def: SignatureDef in the meta_graph_def for conversion.
+
+  Returns:
+    The inputs and outputs in the graph for conversion.
+  """
+  inputs_tensor_info = signature_def.inputs
+  outputs_tensor_info = signature_def.outputs
+  logging.info("input tensors info: ")
+  _log_tensor_details(inputs_tensor_info)
+  logging.info("output tensors info: ")
+  _log_tensor_details(outputs_tensor_info)
+
+  def gather_names(tensor_info):
+    return [tensor_info[key].name for key in tensor_info]
+
+  inputs = gather_names(inputs_tensor_info)
+  outputs = gather_names(outputs_tensor_info)
+  return inputs, outputs
+
+
+def _get_tensors(graph, signature_def_tensor_names=None,
+                 user_tensor_names=None):
+  """Gets the tensors associated with the tensor names.
+
+  Either signature_def_tensor_names or user_tensor_names should be provided. If
+  the user provides tensors, the tensors associated with the user provided
+  tensor names are provided. Otherwise, the tensors associated with the names in
+  the SignatureDef are provided.
+
+  Args:
+    graph: GraphDef representing graph.
+    signature_def_tensor_names: Tensor names stored in either the inputs or
+      outputs of a SignatureDef. (default None)
+    user_tensor_names: Tensor names provided by the user. (default None)
+
+  Returns:
+    List of tensors.
+
+  Raises:
+    ValueError:
+      signature_def_tensors and user_tensor_names are undefined or empty.
+      user_tensor_names are not valid.
+  """
+  tensors = []
+  if user_tensor_names:
+    # Get the list of all of the tensors with and without the tensor index.
+    all_tensor_names = [
+        tensor.name for op in graph.get_operations() for tensor in op.outputs
+    ]
+    all_tensor_names_only = [name.split(":")[0] for name in all_tensor_names]
+
+    # Sort the tensor names.
+    user_tensor_names = sorted(user_tensor_names)
+
+    # Get the tensors associated with the tensor names.
+    tensors = []
+    invalid_tensors = []
+    for name in user_tensor_names:
+      if name not in all_tensor_names_only:
+        invalid_tensors.append(name)
+      else:
+        idx = all_tensor_names_only.index(name)
+        tensors.append(graph.get_tensor_by_name(all_tensor_names[idx]))
+
+    # Throw ValueError if any user input names are not valid tensors.
+    if invalid_tensors:
+      raise ValueError("Invalid tensors '{}' were found.".format(
+          ",".join(invalid_tensors)))
+  elif signature_def_tensor_names:
+    tensors = [
+        graph.get_tensor_by_name(name)
+        for name in sorted(signature_def_tensor_names)
+    ]
+  else:
+    # Throw ValueError if signature_def_tensors and user_tensor_names are both
+    # either undefined or empty.
+    raise ValueError(
+        "Specify either signature_def_tensor_names or user_tensor_names")
+
+  return tensors
+
+
+def _freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
+                        output_arrays, tag_set, signature_key, batch_size):
+  """Converts a SavedModel to a frozen graph.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+      (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input size array if undefined. (default 1)
+
+  Returns:
+    frozen_graph_def: Frozen GraphDef.
+    in_tensors: List of input tensors for the graph.
+    out_tensors: List of output tensors for the graph.
+
+  Raises:
+    ValueError:
+      SavedModel doesn't contain a MetaGraphDef identified by tag_set.
+      signature_key is not in the MetaGraphDef.
+      input_shapes does not match the length of input_arrays.
+      input_shapes has a None value after the 1st dimension.
+      input_arrays or output_arrays are not valid.
+      Unable to load Session.
+  """
+  # Set default values for inputs if they are set to None.
+  if signature_key is None:
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  if tag_set is None:
+    tag_set = set([tag_constants.SERVING])
+  if batch_size is None:
+    batch_size = 1
+
+  # Read SignatureDef.
+  meta_graph = _get_meta_graph_def(saved_model_dir, tag_set)
+  signature_def = _get_signature_def(meta_graph, signature_key)
+  inputs, outputs = _get_inputs_outputs(signature_def)
+
+  graph = ops.Graph()
+  with session.Session(graph=graph) as sess:
+    # TODO(nupurgarg): Throw ValueError if SavedModel has assets/ directory.
+    loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)
+
+    # Gets input and output tensors.
+    # TODO(zhixianyan): Use TFLite supported Op list to filter outputs.
+    in_tensors = _get_tensors(graph, inputs, input_arrays)
+    out_tensors = _get_tensors(graph, outputs, output_arrays)
+
+    # Gets fully defined tensor shape. An input tensor with None in the first
+    # dimension, e.g. (None, 224, 224, 3), is replaced with the batch_size.
+    # Shapes with None after the first dimension result in a ValueError.
+    # TODO(zhixianyan): Add supports for input tensor with more None in shape.
+    for tensor in in_tensors:
+      if (input_shapes and tensor.name in input_shapes and
+          input_shapes[tensor.name] is not None):
+        shape = input_shapes[tensor.name]
+      else:
+        shape = tensor.get_shape().as_list()
+
+      if None in shape[1:]:
+        raise ValueError(
+            "None is only supported in the 1st dimension. Tensor '{0}' has "
+            "invalid shape '{1}'.".format(tensor.name, shape))
+      elif shape[0] is None:
+        shape[0] = batch_size
+      tensor.set_shape(shape)
+
+    output_names = [node.split(":")[0] for node in outputs]
+    frozen_graph_def = tf_graph_util.convert_variables_to_constants(
+        sess, graph.as_graph_def(), output_names)
+
+    return frozen_graph_def, in_tensors, out_tensors
+  raise ValueError("Unable to load Session.")
+
+
+def saved_model_to_frozen_graphdef(
+    saved_model_dir,
+    output_file_model,
+    output_file_flags,
+    input_arrays=None,
+    input_shapes=None,
+    output_arrays=None,
+    tag_set=None,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    batch_size=1):
+  """Converts a SavedModel to a frozen graph. Writes graph to tmp directory.
+
+  Stores frozen graph and command line flags in the tmp directory.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    output_file_model: Full file path to save frozen graph.
+    output_file_flags: Full file path to save ModelFlags.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+      (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input size array if undefined. (default 1)
+
+  Returns: None.
+
+  Raises:
+    ValueError: Unable to convert to frozen graph.
+  """
+  frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model(
+      saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set,
+      signature_key, batch_size)
+
+  # Initialize model flags.
+  model = model_flags_pb2.ModelFlags()
+
+  for input_tensor in in_tensors:
+    input_array = model.input_arrays.add()
+    input_array.name = convert.tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+
+  for output_tensor in out_tensors:
+    model.output_arrays.append(convert.tensor_name(output_tensor))
+
+  # Write model and ModelFlags to file. ModelFlags contain input array and
+  # output array information that is parsed from the SignatureDef and used for
+  # analysis by TOCO.
+  _write_and_flush_file(output_file_model, frozen_graph_def.SerializeToString())
+  _write_and_flush_file(output_file_flags, model.SerializeToString())
+
+
+def tflite_from_saved_model(
+    saved_model_dir,
+    output_file=None,
+    input_arrays=None,
+    input_shapes=None,
+    output_arrays=None,
+    tag_set=None,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    batch_size=1,
+    inference_type=lite_constants.FLOAT,
+    input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+    output_format=lite_constants.TFLITE,
+    quantized_input_stats=None,
+    drop_control_dependency=True):
+  """Converts a SavedModel to TFLite FlatBuffer.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    output_file: File path to write result TFLite FlatBuffer.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+      (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input size array if undefined. (default 1)
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT)
+    quantized_input_stats: For each member of input_tensors the mean and
+      std deviation of training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently. This is due
+      to tf lite not supporting control dependencies.
+
+  Returns:
+    The converted data. For example if tflite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: Unable to convert to frozen graph.
+  """
+  frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model(
+      saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set,
+      signature_key, batch_size)
+
+  result = convert.toco_convert(
+      input_data=frozen_graph_def,
+      input_tensors=in_tensors,
+      output_tensors=out_tensors,
+      inference_type=inference_type,
+      input_format=input_format,
+      output_format=output_format,
+      quantized_input_stats=quantized_input_stats,
+      drop_control_dependency=drop_control_dependency)
+
+  if output_file is not None:
+    with gfile.Open(output_file, "wb") as f:
+      f.write(result)
+    logging.info("Successfully converted to: %s", output_file)
+
+  return result
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..db95fc8ad7f94b52d33c72f6ec5819bdfe8cf05f
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -0,0 +1,394 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFLite SavedModel conversion test cases.
+
+  - Tests converting simple SavedModel graph to TFLite FlatBuffer.
+  - Tests converting simple SavedModel graph to frozen graph.
+  - Tests converting MNIST SavedModel to TFLite FlatBuffer.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from tensorflow.contrib.lite.python import convert_saved_model
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
+from tensorflow.python import keras
+from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib as estimator
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import saved_model
+from tensorflow.python.training import training as train
+
+
+class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
+
+  def _createSimpleSavedModel(self, shape):
+    """Create a simple SavedModel on the fly."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32)
+      out_tensor = in_tensor + in_tensor
+      inputs = {"x": in_tensor}
+      outputs = {"y": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def testSimpleSavedModel(self):
+    """Test a simple SavedModel created on the fly."""
+    # Create a simple SavedModel
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Convert to tflite
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir)
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithNoneBatchSizeInShape(self):
+    """Test a simple SavedModel, with None in input tensor's shape."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir)
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithMoreNoneInShape(self):
+    """Test a simple SavedModel, fail as more None in input shape."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, None, 3])
+    # Convert to tflite: this should raise ValueError, as 3rd dim is None.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir)
+
+  def testSimpleSavedModelWithWrongSignatureKey(self):
+    """Test a simple SavedModel, fail as given signature is invalid."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Convert to tflite: this should raise ValueError, as
+    # signature_key does not exit in the saved_model.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, signature_key="wrong-key")
+
+  def testSimpleSavedModelWithWrongOutputArray(self):
+    """Test a simple SavedModel, fail as given output_arrays is invalid."""
+    # Create a simple SavedModel
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Convert to tflite: this should raise ValueError, as
+    # output_arrays is not valid for the saved_model.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, output_arrays=["wrong-output"])
+
+  def testSimpleSavedModelWithWrongInputArrays(self):
+    """Test a simple SavedModel, fail as given input_arrays is invalid."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Checks invalid input_arrays.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, input_arrays=["wrong-input"])
+    # Checks valid and invalid input_arrays.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir,
+          input_arrays=["Placeholder", "wrong-input"])
+
+  def testSimpleSavedModelWithCorrectArrays(self):
+    """Test a simple SavedModel, with correct input_arrays and output_arrays."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        output_arrays=["add"])
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithCorrectInputArrays(self):
+    """Test a simple SavedModel, with correct input_arrays and input_shapes."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        input_shapes={"Placeholder": [1, 16, 16, 3]})
+    self.assertTrue(result)
+
+  def testMultipleMetaGraphDef(self):
+    """Test saved model with multiple MetaGraphDef."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "savedmodel_two_mgd")
+    builder = saved_model.builder.SavedModelBuilder(saved_model_dir)
+    with session.Session(graph=ops.Graph()) as sess:
+      # MetaGraphDef 1
+      in_tensor = array_ops.placeholder(shape=[1, 28, 28], dtype=dtypes.float32)
+      out_tensor = in_tensor + in_tensor
+      sig_input_tensor = saved_model.utils.build_tensor_info(in_tensor)
+      sig_input_tensor_signature = {"x": sig_input_tensor}
+      sig_output_tensor = saved_model.utils.build_tensor_info(out_tensor)
+      sig_output_tensor_signature = {"y": sig_output_tensor}
+      predict_signature_def = (
+          saved_model.signature_def_utils.build_signature_def(
+              sig_input_tensor_signature, sig_output_tensor_signature,
+              saved_model.signature_constants.PREDICT_METHOD_NAME))
+      signature_def_map = {
+          saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              predict_signature_def
+      }
+      builder.add_meta_graph_and_variables(
+          sess,
+          tags=[saved_model.tag_constants.SERVING, "additional_test_tag"],
+          signature_def_map=signature_def_map)
+
+      # MetaGraphDef 2
+      builder.add_meta_graph(tags=["tflite"])
+      builder.save(True)
+
+    # Convert to tflite
+    convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"]))
+
+
+class ConvertSavedModelTestBasicGraphToText(test_util.TensorFlowTestCase):
+
+  def _createSimpleSavedModel(self, shape):
+    """Create a simple SavedModel."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor_1 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputB")
+      in_tensor_2 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputA")
+      out_tensor = in_tensor_1 + in_tensor_2
+      inputs = {"x": in_tensor_1, "y": in_tensor_2}
+      outputs = {"z": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def _getInputArrayNames(self, model_proto):
+    return [data.name for data in model_proto.input_arrays]
+
+  def _getInputArrayShapes(self, model_proto):
+    return [
+        [dim for dim in data.shape.dims] for data in model_proto.input_arrays
+    ]
+
+  def _get_model_flags_proto_from_file(self, filename):
+    proto = _model_flags_pb2.ModelFlags()
+    with gfile.Open(filename, "rb") as output_file:
+      proto.ParseFromString(output_file.read())
+      output_file.close()
+    return proto
+
+  def testSimpleSavedModel(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    output_file_model = os.path.join(self.get_temp_dir(), "model.pb")
+    output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt")
+
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputB", "inputA"])
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA", "inputB"])
+    self.assertEqual(
+        self._getInputArrayShapes(proto), [[1, 16, 16, 3], [1, 16, 16, 3]])
+
+  def testSimpleSavedModelWithDifferentInputNames(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    output_file_model = os.path.join(self.get_temp_dir(), "model.pb")
+    output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt")
+
+    # Check case where input shape is given.
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": [1, 16, 16, 3]})
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA"])
+    self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]])
+
+    # Check case where input shape is None.
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": None})
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA"])
+    self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]])
+
+
+class Model(keras.Model):
+  """Model to recognize digits in the MNIST dataset.
+
+  Train and export SavedModel, used for testOnflyTrainMnistSavedModel
+
+  Network structure is equivalent to:
+  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  and
+  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
+
+  But written as a ops.keras.Model using the layers API.
+  """
+
+  def __init__(self, data_format):
+    """Creates a model for classifying a hand-written digit.
+
+    Args:
+      data_format: Either "channels_first" or "channels_last".
+        "channels_first" is typically faster on GPUs while "channels_last" is
+        typically faster on CPUs. See
+        https://www.tensorflow.org/performance/performance_guide#data_formats
+    """
+    super(Model, self).__init__()
+    self._input_shape = [-1, 28, 28, 1]
+
+    self.conv1 = layers.Conv2D(
+        32, 5, padding="same", data_format=data_format, activation=nn.relu)
+    self.conv2 = layers.Conv2D(
+        64, 5, padding="same", data_format=data_format, activation=nn.relu)
+    self.fc1 = layers.Dense(1024, activation=nn.relu)
+    self.fc2 = layers.Dense(10)
+    self.dropout = layers.Dropout(0.4)
+    self.max_pool2d = layers.MaxPooling2D(
+        (2, 2), (2, 2), padding="same", data_format=data_format)
+
+  def __call__(self, inputs, training):
+    """Add operations to classify a batch of input images.
+
+    Args:
+      inputs: A Tensor representing a batch of input images.
+      training: A boolean. Set to True to add operations required only when
+        training the classifier.
+
+    Returns:
+      A logits Tensor with shape [<batch_size>, 10].
+    """
+    y = array_ops.reshape(inputs, self._input_shape)
+    y = self.conv1(y)
+    y = self.max_pool2d(y)
+    y = self.conv2(y)
+    y = self.max_pool2d(y)
+    y = layers.flatten(y)
+    y = self.fc1(y)
+    y = self.dropout(y, training=training)
+    return self.fc2(y)
+
+
+def model_fn(features, labels, mode, params):
+  """The model_fn argument for creating an Estimator."""
+  model = Model(params["data_format"])
+  image = features
+  if isinstance(image, dict):
+    image = features["image"]
+
+  if mode == estimator.ModeKeys.PREDICT:
+    logits = model(image, training=False)
+    predictions = {
+        "classes": math_ops.argmax(logits, axis=1),
+        "probabilities": nn.softmax(logits),
+    }
+    return estimator.EstimatorSpec(
+        mode=estimator.ModeKeys.PREDICT,
+        predictions=predictions,
+        export_outputs={
+            "classify": estimator.export.PredictOutput(predictions)
+        })
+
+  elif mode == estimator.ModeKeys.TRAIN:
+    optimizer = train.AdamOptimizer(learning_rate=1e-4)
+
+    logits = model(image, training=True)
+    loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+    return estimator.EstimatorSpec(
+        mode=estimator.ModeKeys.TRAIN,
+        loss=loss,
+        train_op=optimizer.minimize(loss, train.get_or_create_global_step()))
+
+  elif mode == estimator.ModeKeys.EVAL:
+    logits = model(image, training=False)
+    loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+    return estimator.EstimatorSpec(
+        mode=estimator.ModeKeys.EVAL,
+        loss=loss,
+        eval_metric_ops={
+            "accuracy":
+                ops.metrics.accuracy(
+                    labels=labels, predictions=math_ops.argmax(logits, axis=1)),
+        })
+
+
+def dummy_input_fn():
+  image = random_ops.random_uniform([100, 784])
+  labels = random_ops.random_uniform([100, 1], maxval=9, dtype=dtypes.int32)
+  return image, labels
+
+
+class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
+
+  def testTrainedMnistSavedModel(self):
+    """Test mnist SavedModel, trained with dummy data and small steps."""
+    # Build classifier
+    classifier = estimator.Estimator(
+        model_fn=model_fn,
+        params={
+            "data_format": "channels_last"  # tflite format
+        })
+
+    # Train and pred for serving
+    classifier.train(input_fn=dummy_input_fn, steps=2)
+    image = array_ops.placeholder(dtypes.float32, [None, 28, 28])
+    pred_input_fn = estimator.export.build_raw_serving_input_receiver_fn({
+        "image": image,
+    })
+
+    # Export SavedModel
+    saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel")
+    classifier.export_savedmodel(saved_model_dir, pred_input_fn)
+
+    # Convert to tflite and test output
+    saved_model_name = os.listdir(saved_model_dir)[0]
+    saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name)
+    output_file = os.path.join(saved_model_dir, saved_model_final_dir + ".lite")
+    # TODO(zhixianyan): no need to limit output_arrays to `Softmax'
+    # once b/74205001 fixed and argmax implemented in tflite.
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_final_dir,
+        output_arrays=["Softmax"],
+        output_file=output_file)
+
+    self.assertTrue(result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d9782f4a6a9e853c3afdbd97d4264a818937e63
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python console command for generating frozen models from SavedModels.
+
+This exists to add SavedModel compatibility to TOCO.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+from tensorflow.contrib.lite.python.convert_saved_model import saved_model_to_frozen_graphdef
+from tensorflow.python.platform import app
+
+FLAGS = None
+
+
+def execute(unused_args):
+  """Calls function to convert the SavedModel to a frozen graph."""
+  # Error handling.
+  if FLAGS.input_shapes and not FLAGS.input_arrays:
+    raise ValueError("Input shapes requires input arrays to be specified.")
+
+  # Calls saved_model_to_frozen_graphdef function to generate frozen graph.
+  input_arrays = (FLAGS.input_arrays.split(",") if FLAGS.input_arrays else None)
+  input_shapes = None
+  if FLAGS.input_shapes:
+    input_shapes = {
+        input_arrays[idx]: shape.split(",")
+        for idx, shape in enumerate(FLAGS.input_shapes.split(":"))
+    }
+  output_arrays = (
+      FLAGS.output_arrays.split(",") if FLAGS.output_arrays else None)
+  tag_set = set(FLAGS.tag_set.split(",")) if FLAGS.tag_set else None
+
+  saved_model_to_frozen_graphdef(
+      saved_model_dir=FLAGS.saved_model_directory,
+      output_file_model=FLAGS.output_file_model,
+      output_file_flags=FLAGS.output_file_flags,
+      input_arrays=input_arrays,
+      input_shapes=input_shapes,
+      output_arrays=output_arrays,
+      tag_set=tag_set,
+      signature_key=FLAGS.signature_key,
+      batch_size=FLAGS.batch_size)
+
+
+def main():
+  global FLAGS
+  # Parses flags.
+  parser = argparse.ArgumentParser(
+      description="Invoke SavedModel to frozen model converter.")
+  parser.add_argument(
+      "saved_model_directory",
+      type=str,
+      help="Full path to directory containing the SavedModel.")
+  parser.add_argument(
+      "output_file_model",
+      type=str,
+      help="Full file path to save frozen graph.")
+  parser.add_argument(
+      "output_file_flags", type=str, help="Full file path to save ModelFlags.")
+  parser.add_argument(
+      "--input_arrays",
+      type=str,
+      help="Name of the input arrays, comma-separated.")
+  parser.add_argument(
+      "--input_shapes",
+      type=str,
+      help="Shapes corresponding to --input_arrays, colon-separated.")
+  parser.add_argument(
+      "--output_arrays",
+      type=str,
+      help="Name of the output arrays, comma-separated.")
+  parser.add_argument(
+      "--tag_set", type=str, help="Name of output arrays, comma-separated.")
+  parser.add_argument(
+      "--signature_key",
+      type=str,
+      help="Key identifying SignatureDef containing inputs and outputs.")
+  parser.add_argument(
+      "--batch_size",
+      type=int,
+      help="Batch size for the model. Replaces the first dimension of an "
+      "input size array if undefined.")
+
+  FLAGS, unparsed = parser.parse_known_args()
+
+  app.run(main=execute, argv=[sys.argv[0]] + unparsed)
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/convert_test.py
similarity index 82%
rename from tensorflow/contrib/lite/python/lite_test.py
rename to tensorflow/contrib/lite/python/convert_test.py
index b8b4510188bee867b32ffde714b27f41a1df778a..dc21a9b66933f595a5f31b0b91ff247a5458dad6 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/convert_test.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base
+from tensorflow.contrib.lite.python import convert
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.python import op_hint
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -29,7 +30,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class LiteTest(test_util.TensorFlowTestCase):
+class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
@@ -37,13 +38,13 @@ class LiteTest(test_util.TensorFlowTestCase):
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
     # Try running on valid graph
-    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
+    result = convert.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
     self.assertTrue(result)
     # TODO(aselle): remove tests that fail (we must get TOCO to not fatal
     # all the time).
     # Try running on identity graph (known fail)
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
-    #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
+    #   result = convert.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
 
   def testQuantization(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
@@ -51,13 +52,14 @@ class LiteTest(test_util.TensorFlowTestCase):
     out_tensor = array_ops.fake_quant_with_min_max_args(in_tensor + in_tensor,
                                                         min=0., max=1.)
     sess = session.Session()
-    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor],
-                               inference_type=lite.QUANTIZED_UINT8,
-                               quantized_input_stats=[(0., 1.)])
+    result = convert.toco_convert(
+        sess.graph_def, [in_tensor], [out_tensor],
+        inference_type=lite_constants.QUANTIZED_UINT8,
+        quantized_input_stats=[(0., 1.)])
     self.assertTrue(result)
 
 
-class LiteTestOpHint(test_util.TensorFlowTestCase):
+class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
   def _getGraphOpTypes(self, graphdef, output_nodes):
@@ -99,7 +101,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     swish_scale = array_ops.constant(1.0)
 
     def _swish(input_tensor, scale):
-      custom = lite.OpHint("cool_activation")
+      custom = op_hint.OpHint("cool_activation")
       input_tensor, scale = custom.add_inputs(input_tensor, scale)
       output = math_ops.sigmoid(input_tensor) * input_tensor * scale
       output, = custom.add_outputs(output)
@@ -111,11 +113,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # and 1 final output).
       self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
 
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["cool_activation", "Const", "Identity"])
 
   def testScaleAndBiasAndIdentity(self):
@@ -125,7 +128,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     b = array_ops.constant([4., 5.])
 
     def _scaled_and_bias_and_identity(a, x, b):
-      custom = lite.OpHint("scale_and_bias_and_identity")
+      custom = op_hint.OpHint("scale_and_bias_and_identity")
       a, x, b = custom.add_inputs(a, x, b)
       return custom.add_outputs(a * x + b, x)
     output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b),
@@ -136,11 +139,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
 
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
 
   def testTwoFunctions(self):
@@ -148,7 +152,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     a = array_ops.constant([1.])
     b = array_ops.constant([1.])
     def _double_values(x):
-      custom = lite.OpHint("add_test")
+      custom = op_hint.OpHint("add_test")
       x = custom.add_inputs(x)
       output = math_ops.multiply(x, x)
       output, = custom.add_outputs(output)
@@ -160,10 +164,11 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # make sure one identity for each input (2) and output (2) => 2 + 2
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["add_test", "Const", "Identity", "Add"])
 
 
diff --git a/tensorflow/contrib/lite/python/create_custom_op.py b/tensorflow/contrib/lite/python/create_custom_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..830f95358c455047db2cbad15cfed8c221e95dca
--- /dev/null
+++ b/tensorflow/contrib/lite/python/create_custom_op.py
@@ -0,0 +1,111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Replaces a subgraph of a TensorFlow GraphDef with a single node.
+
+In conjunction with TOCO's --allow_custom_op this script allows selected
+portions of a TensorFlow GraphDef to be executed by custom code.
+
+Example:
+
+bazel run tensorflow/contrib/lite/python:create_custom_op  -- \
+  --input_graph=/tmp/input.pb \
+  --output_graph=/tmp/output.pb \
+  --inputs=concat,concat_1 \
+  --outputs=detection_classes \
+  --op_definition='op:"PostProcessing" attr{key:"num" value:{i:10}}'
+
+The above will identify a subgraph starting at nodes 'concat' and 'concat_1',
+and ending at 'detection_classes'. All nodes in between will be removed and
+replaced by a new op called 'PostProcessing'.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import uuid as _uuid
+from absl import app
+from absl import flags
+from google.protobuf import text_format
+from tensorflow.contrib.framework.python.framework.graph_util import fuse_op
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.platform import gfile
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_graph", "", "Binary graphdef to load.")
+flags.DEFINE_string("output_graph", "", "Resulting binary graphdef.")
+
+flags.DEFINE_string("inputs", "",
+                    "Comma-separated list of inputs to the subgraph.")
+flags.DEFINE_string("outputs", "",
+                    "Comma-separated list of outputs of the subgraph.")
+flags.DEFINE_string("op_definition", "",
+                    "A text NodeDef defining the contents of the custom op.")
+
+
+def _read_graph_def(filename):
+  if not gfile.Exists(filename):
+    raise ValueError("Input graph file '" + filename + "' does not exist!")
+
+  graph_def = graph_pb2.GraphDef()
+  with gfile.FastGFile(filename, "rb") as f:
+    graph_def.ParseFromString(f.read())
+  return graph_def
+
+
+def _write_graph_def(graph_def, filename):
+  if not filename:
+    raise ValueError("Output graph file not specified")
+
+  with gfile.Open(filename, "wb") as f:
+    f.write(graph_def.SerializeToString())
+
+
+def _collapse_subgraph(graph_def, inputs, outputs, op_definition):
+  """Substitute a custom op for the subgraph delimited by inputs and outputs."""
+  name = _uuid.uuid1().hex
+  # We need a default type, but it can be changed using 'op_definition'.
+  default_type = types_pb2.DT_FLOAT
+  new_graph = fuse_op(
+      graph_def=graph_def,
+      input_nodes=inputs,
+      output_nodes=outputs,
+      output_dtypes=[default_type for _ in outputs],
+      output_quantized=False,
+      op_name=name,
+      op_type="CustomTfLiteOp")
+  node_def = node_def_pb2.NodeDef()
+  text_format.Parse(op_definition, node_def)
+  for node in new_graph.node:
+    if node.name == name:
+      node.MergeFrom(node_def)
+  return new_graph
+
+
+def main(argv):
+  del argv  # unused
+  graph = _read_graph_def(filename=flags.FLAGS.input_graph)
+  graph = _collapse_subgraph(
+      graph_def=graph,
+      inputs=flags.FLAGS.inputs.split(","),
+      outputs=flags.FLAGS.outputs.split(","),
+      op_definition=flags.FLAGS.op_definition)
+  _write_graph_def(graph_def=graph, filename=flags.FLAGS.output_graph)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb9c0d3121895595ffce91e254bea3f527714809
--- /dev/null
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -0,0 +1,151 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python TF-Lite interpreter."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.python.interpreter_wrapper import tensorflow_wrap_interpreter_wrapper as interpreter_wrapper
+
+
+class Interpreter(object):
+  """Interpreter inferace for TF-Lite Models."""
+
+  def __init__(self, model_path=None, model_content=None):
+    """Constructor.
+
+    Args:
+      model_path: Path to TF-Lite Flatbuffer file.
+      model_content: Content of model.
+
+    Raises:
+      ValueError: If the interpreter was unable to create.
+    """
+    if model_path and not model_content:
+      self._interpreter = (
+          interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromFile(
+              model_path))
+      if not self._interpreter:
+        raise ValueError('Failed to open {}'.format(model_path))
+    elif model_content and not model_path:
+      self._interpreter = (
+          interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer(
+              model_content, len(model_content)))
+      if not self._interpreter:
+        raise ValueError(
+            'Failed to create model from {} bytes'.format(len(model_content)))
+    elif not model_path and not model_path:
+      raise ValueError('`model_path` or `model_content` must be specified.')
+    else:
+      raise ValueError('Can\'t both provide `model_path` and `model_content`')
+
+  def allocate_tensors(self):
+    if not self._interpreter.AllocateTensors():
+      raise ValueError('Failed to allocate tensors')
+
+  def _get_tensor_details(self, tensor_index):
+    """Gets tensor details.
+
+    Args:
+      tensor_index: Tensor index of tensor to query.
+
+    Returns:
+      a dictionary containing the name, index, shape and type of the tensor.
+
+    Raises:
+      ValueError: If tensor_index is invalid.
+    """
+    tensor_index = int(tensor_index)
+    tensor_name = self._interpreter.TensorName(tensor_index)
+    tensor_size = self._interpreter.TensorSize(tensor_index)
+    tensor_type = self._interpreter.TensorType(tensor_index)
+    tensor_quantization = self._interpreter.TensorQuantization(tensor_index)
+
+    if not tensor_name or not tensor_type:
+      raise ValueError('Could not get tensor details')
+
+    details = {
+        'name': tensor_name,
+        'index': tensor_index,
+        'shape': tensor_size,
+        'dtype': tensor_type,
+        'quantization': tensor_quantization,
+    }
+
+    return details
+
+  def get_input_details(self):
+    """Gets model input details.
+
+    Returns:
+      A list of input details.
+    """
+    return [
+        self._get_tensor_details(i) for i in self._interpreter.InputIndices()
+    ]
+
+  def set_tensor(self, tensor_index, value):
+    """Sets the value of the input.
+
+    Args:
+      tensor_index: Tensor index of tensor to set. This value can be gotten from
+                    the 'index' field in get_input_details.
+      value: Value of tensor to set.
+
+    Raises:
+      ValueError: If the interpreter could not set the tensor.
+    """
+    if not self._interpreter.SetTensor(tensor_index, value):
+      raise ValueError('Failed to set tensor')
+
+  def resize_tensor_input(self, input_index, tensor_size):
+    """Resizes an input tensor.
+
+    Args:
+      input_index: Tensor index of input to set. This value can be gotten from
+                   the 'index' field in get_input_details.
+      tensor_size: The tensor_shape to resize the input to.
+
+    Raises:
+      ValueError: If the interpreter could not resize the input tensor.
+    """
+    if not self._interpreter.ResizeInputTensor(input_index, tensor_size):
+      raise ValueError('Failed to resize input')
+
+  def get_output_details(self):
+    """Gets model output details.
+
+    Returns:
+      A list of output details.
+    """
+    return [
+        self._get_tensor_details(i) for i in self._interpreter.OutputIndices()
+    ]
+
+  def get_tensor(self, tensor_index):
+    """Sets the value of the input.
+
+    Args:
+      tensor_index: Tensor index of tensor to get. This value can be gotten from
+                    the 'index' field in get_output_details.
+
+    Returns:
+      a numpy array.
+    """
+    return self._interpreter.GetTensor(tensor_index)
+
+  def invoke(self):
+    if not self._interpreter.Invoke():
+      raise ValueError('Failed to invoke TFLite model')
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f802edf020db8a9d4e7bb890aadaae7e34e983a8
--- /dev/null
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Lite Python Interface: Sanity check."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import numpy as np
+
+from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
+
+
+class InterpreterTest(test_util.TensorFlowTestCase):
+
+  def testFloat(self):
+    interpreter = interpreter_wrapper.Interpreter(
+        model_path=resource_loader.get_path_to_datafile(
+            'testdata/permute_float.tflite'))
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('input', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 4] == input_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('output', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 4] == output_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), output_details[0]['quantization'])
+
+    test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
+    expected_output = np.array([[4.0, 3.0, 2.0, 1.0]], dtype=np.float32)
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
+
+  def testUint8(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'testdata/permute_uint8.tflite')
+    with io.open(model_path, 'rb') as model_file:
+      data = model_file.read()
+
+    interpreter = interpreter_wrapper.Interpreter(model_content=data)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('input', input_details[0]['name'])
+    self.assertEqual(np.uint8, input_details[0]['dtype'])
+    self.assertTrue(([1, 4] == input_details[0]['shape']).all())
+    self.assertEqual((1.0, 0), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('output', output_details[0]['name'])
+    self.assertEqual(np.uint8, output_details[0]['dtype'])
+    self.assertTrue(([1, 4] == output_details[0]['shape']).all())
+    self.assertEqual((1.0, 0), output_details[0]['quantization'])
+
+    test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
+    expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
+    interpreter.resize_tensor_input(input_details[0]['index'],
+                                    np.array(test_input.shape, dtype=np.int32))
+    interpreter.allocate_tensors()
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/BUILD b/tensorflow/contrib/lite/python/interpreter_wrapper/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..453eda6e7345762666917fd501b69c7181c349e8
--- /dev/null
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/BUILD
@@ -0,0 +1,32 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+
+cc_library(
+    name = "interpreter_wrapper_lib",
+    srcs = ["interpreter_wrapper.cc"],
+    hdrs = ["interpreter_wrapper.h"],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/core:lib",
+        "//tensorflow/python:numpy_lib",
+        "//util/python:python_headers",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "tensorflow_wrap_interpreter_wrapper",
+    srcs = [
+        "interpreter_wrapper.i",
+    ],
+    deps = [
+        ":interpreter_wrapper_lib",
+        "//util/python:python_headers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04fc098129854e168d68de3b308eabbcaa968ea8
--- /dev/null
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -0,0 +1,341 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+
+#include <string>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/numpy.h"
+
+#if PY_MAJOR_VERSION >= 3
+#define PY_TO_CPPSTRING PyBytes_AsStringAndSize
+#define CPP_TO_PYSTRING PyBytes_FromStringAndSize
+#else
+#define PY_TO_CPPSTRING PyString_AsStringAndSize
+#define CPP_TO_PYSTRING PyString_FromStringAndSize
+#endif
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+namespace {
+std::unique_ptr<tflite::Interpreter> CreateInterpreter(
+    const tflite::FlatBufferModel* model,
+    const tflite::ops::builtin::BuiltinOpResolver& resolver) {
+  if (!model) {
+    return nullptr;
+  }
+
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (interpreter) {
+    for (const int input_index : interpreter->inputs()) {
+      const TfLiteTensor* tensor = interpreter->tensor(input_index);
+      CHECK(tensor);
+      const TfLiteIntArray* dims = tensor->dims;
+      if (!dims) {
+        continue;
+      }
+
+      std::vector<int> input_dims(dims->data, dims->data + dims->size);
+      interpreter->ResizeInputTensor(input_index, input_dims);
+    }
+  }
+  return interpreter;
+}
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
+  switch (tf_lite_type) {
+    case kTfLiteFloat32:
+      return NPY_FLOAT32;
+    case kTfLiteInt32:
+      return NPY_INT32;
+    case kTfLiteUInt8:
+      return NPY_UINT8;
+    case kTfLiteInt64:
+      return NPY_INT64;
+    case kTfLiteString:
+      return NPY_OBJECT;
+    case kTfLiteBool:
+      return NPY_BOOL;
+    case kTfLiteNoType:
+      return -1;
+  }
+  LOG(ERROR) << "Unknown TfLiteType " << tf_lite_type;
+  return -1;
+}
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
+  int pyarray_type = PyArray_TYPE(array);
+  switch (pyarray_type) {
+    case NPY_FLOAT32:
+      return kTfLiteFloat32;
+    case NPY_INT32:
+      return kTfLiteInt32;
+    case NPY_UINT8:
+      return kTfLiteUInt8;
+    case NPY_INT64:
+      return kTfLiteInt64;
+    case NPY_BOOL:
+      return kTfLiteBool;
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE:
+      return kTfLiteString;
+  }
+  LOG(ERROR) << "Unknown PyArray dtype " << pyarray_type;
+  return kTfLiteNoType;
+}
+
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+PyObject* PyArrayFromIntVector(const int* data, npy_intp size) {
+  void* pydata = malloc(size * sizeof(int));
+  memcpy(pydata, data, size * sizeof(int));
+  return PyArray_SimpleNewFromData(1, &size, NPY_INT32, pydata);
+}
+
+PyObject* PyTupleFromQuantizationParam(const TfLiteQuantizationParams& param) {
+  PyObject* result = PyTuple_New(2);
+  PyTuple_SET_ITEM(result, 0, PyFloat_FromDouble(param.scale));
+  PyTuple_SET_ITEM(result, 1, PyInt_FromLong(param.zero_point));
+  return result;
+}
+
+}  // namespace
+
+InterpreterWrapper::InterpreterWrapper(
+    std::unique_ptr<tflite::FlatBufferModel> model)
+    : model_(std::move(model)),
+      resolver_(absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>()),
+      interpreter_(CreateInterpreter(model_.get(), *resolver_)) {}
+
+InterpreterWrapper::~InterpreterWrapper() {}
+
+bool InterpreterWrapper::AllocateTensors() {
+  if (!interpreter_) {
+    LOG(ERROR) << "Cannot allocate tensors: invalid interpreter.";
+    return false;
+  }
+
+  if (interpreter_->AllocateTensors() != kTfLiteOk) {
+    LOG(ERROR) << "Unable to allocate tensors.";
+    return false;
+  }
+
+  return true;
+}
+
+bool InterpreterWrapper::Invoke() {
+  return interpreter_ ? (interpreter_->Invoke() == kTfLiteOk) : false;
+}
+
+PyObject* InterpreterWrapper::InputIndices() const {
+  PyObject* np_array = PyArrayFromIntVector(interpreter_->inputs().data(),
+                                            interpreter_->inputs().size());
+
+  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+}
+
+PyObject* InterpreterWrapper::OutputIndices() const {
+  PyObject* np_array = PyArrayFromIntVector(interpreter_->outputs().data(),
+                                            interpreter_->outputs().size());
+
+  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+}
+
+bool InterpreterWrapper::ResizeInputTensor(int i, PyObject* value) {
+  if (!interpreter_) {
+    LOG(ERROR) << "Invalid interpreter.";
+    return false;
+  }
+
+  std::unique_ptr<PyObject, PyDecrefDeleter> array_safe(
+      PyArray_FromAny(value, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr));
+  if (!array_safe) {
+    LOG(ERROR) << "Failed to convert value into readable tensor.";
+    return false;
+  }
+
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
+
+  if (PyArray_NDIM(array) != 1) {
+    LOG(ERROR) << "Expected 1-D defining input shape.";
+    return false;
+  }
+
+  if (PyArray_TYPE(array) != NPY_INT32) {
+    LOG(ERROR) << "Shape must be an int32 array";
+    return false;
+  }
+
+  std::vector<int> dims(PyArray_SHAPE(array)[0]);
+  memcpy(dims.data(), PyArray_BYTES(array), dims.size() * sizeof(int));
+
+  return (interpreter_->ResizeInputTensor(i, dims) == kTfLiteOk);
+}
+
+std::string InterpreterWrapper::TensorName(int i) const {
+  if (!interpreter_ || i >= interpreter_->tensors_size() || i < 0) {
+    return "";
+  }
+
+  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  return tensor->name;
+}
+
+PyObject* InterpreterWrapper::TensorType(int i) const {
+  if (!interpreter_ || i >= interpreter_->tensors_size() || i < 0) {
+    return nullptr;
+  }
+
+  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  int typenum = TfLiteTypeToPyArrayType(tensor->type);
+  return PyArray_TypeObjectFromType(typenum);
+}
+
+PyObject* InterpreterWrapper::TensorSize(int i) const {
+  if (!interpreter_ || i >= interpreter_->tensors_size() || i < 0) {
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  PyObject* np_array =
+      PyArrayFromIntVector(tensor->dims->data, tensor->dims->size);
+
+  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+}
+
+PyObject* InterpreterWrapper::TensorQuantization(int i) const {
+  if (!interpreter_ || i >= interpreter_->tensors_size() || i < 0) {
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  return PyTupleFromQuantizationParam(tensor->params);
+}
+
+bool InterpreterWrapper::SetTensor(int i, PyObject* value) {
+  if (!interpreter_) {
+    LOG(ERROR) << "Invalid interpreter.";
+    return false;
+  }
+
+  if (i >= interpreter_->tensors_size()) {
+    LOG(ERROR) << "Invalid tensor index: " << i << " exceeds max tensor index "
+               << interpreter_->tensors_size();
+    return false;
+  }
+
+  std::unique_ptr<PyObject, PyDecrefDeleter> array_safe(
+      PyArray_FromAny(value, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr));
+  if (!array_safe) {
+    LOG(ERROR) << "Failed to convert value into readable tensor.";
+    return false;
+  }
+
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
+  const TfLiteTensor* tensor = interpreter_->tensor(i);
+
+  if (TfLiteTypeFromPyArray(array) != tensor->type) {
+    LOG(ERROR) << "Cannot set tensor:"
+               << " Got tensor of type " << TfLiteTypeFromPyArray(array)
+               << " but expected type " << tensor->type << " for input " << i;
+    return false;
+  }
+
+  if (PyArray_NDIM(array) != tensor->dims->size) {
+    LOG(ERROR) << "Cannot set tensor: Dimension mismatch";
+    return false;
+  }
+
+  for (int j = 0; j < PyArray_NDIM(array); j++) {
+    if (tensor->dims->data[j] != PyArray_SHAPE(array)[j]) {
+      LOG(ERROR) << "Cannot set tensor: Dimension mismatch";
+      return false;
+    }
+  }
+
+  size_t size = PyArray_NBYTES(array);
+  DCHECK_EQ(size, tensor->bytes);
+  memcpy(tensor->data.raw, PyArray_DATA(array), size);
+  return true;
+}
+
+PyObject* InterpreterWrapper::GetTensor(int i) const {
+  if (!interpreter_) {
+    LOG(ERROR) << "Invalid interpreter.";
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  if (i >= interpreter_->tensors_size()) {
+    LOG(ERROR) << "Invalid tensor index: " << i << " exceeds max tensor index "
+               << interpreter_->inputs().size();
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  const TfLiteTensor* output_tensor = interpreter_->tensor(i);
+  const int tensor_size = output_tensor->bytes;
+  if (tensor_size <= 0) {
+    LOG(ERROR) << "Invalid tensor size";
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  int type_num = TfLiteTypeToPyArrayType(output_tensor->type);
+  if (type_num == -1) {
+    LOG(ERROR) << "Unknown tensor type " << output_tensor->type;
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  void* data = malloc(tensor_size);
+  memcpy(data, output_tensor->data.raw, tensor_size);
+
+  const TfLiteIntArray* output_dims = output_tensor->dims;
+  std::vector<npy_intp> dims(output_dims->data,
+                             output_dims->data + output_dims->size);
+  PyObject* np_array =
+      PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
+
+  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+}
+
+InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
+    const char* model_path) {
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::BuildFromFile(model_path);
+  return model ? new InterpreterWrapper(std::move(model)) : nullptr;
+}
+
+InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
+    const char* data, size_t len) {
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::BuildFromBuffer(data, len);
+  return model ? new InterpreterWrapper(std::move(model)) : nullptr;
+}
+
+}  // namespace interpreter_wrapper
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..0972c572595f5044a305a81afaccbea5f131247c
--- /dev/null
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -0,0 +1,77 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
+#define TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <Python.h>
+
+// We forward declare TFLite classes here to avoid exposing them to SWIG.
+namespace tflite {
+namespace ops {
+namespace builtin {
+class BuiltinOpResolver;
+}  // namespace builtin
+}  // namespace ops
+
+class FlatBufferModel;
+class Interpreter;
+
+namespace interpreter_wrapper {
+
+class InterpreterWrapper {
+ public:
+  // SWIG caller takes ownership of pointer.
+  static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path);
+
+  // SWIG caller takes ownership of pointer.
+  static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data,
+                                                        size_t len);
+
+  ~InterpreterWrapper();
+  bool AllocateTensors();
+  bool Invoke();
+
+  PyObject* InputIndices() const;
+  PyObject* OutputIndices() const;
+  bool ResizeInputTensor(int i, PyObject* value);
+
+  std::string TensorName(int i) const;
+  PyObject* TensorType(int i) const;
+  PyObject* TensorSize(int i) const;
+  PyObject* TensorQuantization(int i) const;
+  bool SetTensor(int i, PyObject* value);
+  PyObject* GetTensor(int i) const;
+
+ private:
+  InterpreterWrapper(std::unique_ptr<tflite::FlatBufferModel> model);
+
+  // InterpreterWrapper is not copyable or assignable. We avoid the use of
+  // InterpreterWrapper() = delete here for SWIG compatibility.
+  InterpreterWrapper();
+  InterpreterWrapper(const InterpreterWrapper& rhs);
+
+  const std::unique_ptr<tflite::FlatBufferModel> model_;
+  const std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver_;
+  const std::unique_ptr<tflite::Interpreter> interpreter_;
+};
+
+}  // namespace interpreter_wrapper
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.i b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.i
new file mode 100644
index 0000000000000000000000000000000000000000..7f51f9f00d1b2fe057052f7b7bd52bcb65231164
--- /dev/null
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.i
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "std_string.i"
+
+
+%{
+#define SWIG_FILE_WITH_INIT
+#include "tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+%}
+
+
+%include "tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h"
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 5d2f21653762a405a57288a7ba38323e5e42b3e1..4ea40201f73bb0aabee60866dde2337862149dc7 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -18,6 +18,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@toco_convert
 @@toco_convert_protos
+@@tflite_from_saved_model
 @@OpHint
 @@convert_op_hints_to_stubs
 
@@ -25,203 +26,11 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import os
-import subprocess
-import tempfile
 
 # pylint: disable=unused-import
+from tensorflow.contrib.lite.python.convert import toco_convert
+from tensorflow.contrib.lite.python.convert import toco_convert_protos
+from tensorflow.contrib.lite.python.convert_saved_model import tflite_from_saved_model
 from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.contrib.lite.python.op_hint import OpHint
 # pylint: enable=unused-import
-from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
-from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
-from tensorflow.python.framework import dtypes as _dtypes
-from tensorflow.python.platform import resource_loader as _resource_loader
-from tensorflow.python.util.all_util import remove_undocumented
-from tensorflow.python.util.lazy_loader import LazyLoader
-
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies.
-_toco_python = LazyLoader(
-    "tensorflow_wrap_toco", globals(),
-    "tensorflow.contrib.lite.toco.python."
-    "tensorflow_wrap_toco")
-del LazyLoader
-
-# Enum types from the protobuf promoted to the API
-FLOAT = _types_pb2.FLOAT
-INT32 = _types_pb2.INT32
-INT64 = _types_pb2.INT64
-STRING = _types_pb2.STRING
-QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
-TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
-TFLITE = _toco_flags_pb2.TFLITE
-GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
-
-# Currently the default mode of operation is to shell to another python process
-# to protect against crashes. However, it breaks some dependent targets because
-# it forces us to depend on an external py_binary. The experimental API doesn't
-# have that drawback.
-EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
-
-# Find the toco_from_protos binary using the resource loader if using from
-# bazel, otherwise we are in a pip where console_scripts already has
-# the toco_from_protos tool.
-if EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
-  _toco_from_proto_bin = ""
-else:
-  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
-      "../toco/python/toco_from_protos")
-
-if _toco_from_proto_bin and not os.path.exists(_toco_from_proto_bin):
-  _toco_from_proto_bin = "toco_from_protos"
-
-
-def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
-  """Convert `input_data_str` according to model and toco parameters.
-
-  Unless you know what you are doing consider using
-  the more friendly @{tf.contrib.lite.toco_convert}}.
-
-  Args:
-    model_flags_str: Serialized proto describing model properties, see
-      `toco/model_flags.proto`.
-    toco_flags_str: Serialized proto describing conversion properties, see
-      `toco/toco_flags.proto`.
-    input_data_str: Input data in serialized form (e.g. a graphdef is common)
-  Returns:
-    Converted model in serialized form (e.g. a TFLITE model is common).
-  Raises:
-    RuntimeError: When conversion fails, an exception is raised with the error
-      message embedded.
-  """
-  # TODO(aselle): When toco does not use fatal errors for failure, we can
-  # switch this on.
-  if not _toco_from_proto_bin:
-    return _toco_python.TocoConvert(
-        model_flags_str, toco_flags_str, input_data_str)
-
-  with tempfile.NamedTemporaryFile() as fp_toco, \
-           tempfile.NamedTemporaryFile() as fp_model, \
-           tempfile.NamedTemporaryFile() as fp_input, \
-           tempfile.NamedTemporaryFile() as fp_output:
-    fp_model.write(model_flags_str)
-    fp_toco.write(toco_flags_str)
-    fp_input.write(input_data_str)
-    fp_model.flush()
-    fp_toco.flush()
-    fp_input.flush()
-
-    cmd = [
-        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
-        fp_output.name
-    ]
-    cmdline = " ".join(cmd)
-    proc = subprocess.Popen(
-        cmdline,
-        shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        close_fds=True)
-    stdout, stderr = proc.communicate()
-    exitcode = proc.returncode
-    if exitcode == 0:
-      stuff = fp_output.read()
-      return stuff
-    else:
-      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
-                         (stdout, stderr))
-
-
-def _tensor_name(x):
-  return x.name.split(":")[0]
-
-
-def toco_convert(input_data,
-                 input_tensors,
-                 output_tensors,
-                 inference_type=FLOAT,
-                 input_format=TENSORFLOW_GRAPHDEF,
-                 output_format=TFLITE,
-                 quantized_input_stats=None,
-                 drop_control_dependency=True):
-  """Convert a model using TOCO from `input_format` to `output_format`.
-
-  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
-  case the default `input_format` and `output_format` are sufficient.
-
-  Args:
-    input_data: Input data (i.e. often `sess.graph_def`).
-    input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
-    output_tensors: List of output tensors (only .name is used from this).
-    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
-    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
-    output_format: Type of data to write (currently must be TFLITE or
-      GRAPHVIZ_DOT)
-    quantized_input_stats: For each member of input_tensors the mean and
-      std deviation of training data. Only needed if `inference_type` is
-      `QUANTIZED_UINT8`.
-    drop_control_dependency: Drops control dependencies silently. This is due
-      to tf lite not supporting control dependencies.
-
-  Returns:
-    The converted data. For example if tflite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
-
-  Raises:
-    ValueError: If the input tensor type is unknown
-    RuntimeError: If TOCO fails to convert (in which case the runtime error's
-      error text will contain the TOCO error log)
-  """
-  toco = _toco_flags_pb2.TocoFlags()
-  toco.input_format = input_format
-  toco.output_format = output_format
-  toco.drop_control_dependency = drop_control_dependency
-  model = _model_flags_pb2.ModelFlags()
-  toco.inference_type = inference_type
-  for idx, input_tensor in enumerate(input_tensors):
-    if input_tensor.dtype == _dtypes.float32:
-      tflite_input_type = FLOAT
-    elif input_tensor.dtype == _dtypes.int32:
-      tflite_input_type = INT32
-    elif input_tensor.dtype == _dtypes.int64:
-      tflite_input_type = INT64
-    # TODO(aselle): Insert strings when they are available
-    else:
-      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
-                                                         input_tensor.dtype))
-
-    input_array = model.input_arrays.add()
-
-    if inference_type == QUANTIZED_UINT8:
-      if tflite_input_type == FLOAT:
-        tflite_input_type = QUANTIZED_UINT8
-      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
-
-    input_array.name = _tensor_name(input_tensor)
-    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
-    toco.inference_input_type = tflite_input_type
-
-  for output_tensor in output_tensors:
-    model.output_arrays.append(_tensor_name(output_tensor))
-
-  data = toco_convert_protos(model.SerializeToString(),
-                             toco.SerializeToString(),
-                             input_data.SerializeToString())
-  return data
-
-
-_allowed_symbols = [
-    "FLOAT",
-    "INT32",
-    "INT64",
-    "STRING",
-    "QUANTIZED_UINT8",
-    "TENSORFLOW_GRAPHDEF",
-    "TFLITE",
-    "GRAPHVIZ_DOT",
-    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/lite_constants.py b/tensorflow/contrib/lite/python/lite_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..195d7a732f337676937c7af5137d4dea84989c03
--- /dev/null
+++ b/tensorflow/contrib/lite/python/lite_constants.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constants for TFLite."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.util.all_util import remove_undocumented
+
+# Enum types from the protobuf promoted to the API
+FLOAT = _types_pb2.FLOAT
+INT32 = _types_pb2.INT32
+INT64 = _types_pb2.INT64
+STRING = _types_pb2.STRING
+QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
+TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
+TFLITE = _toco_flags_pb2.TFLITE
+GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
+
+# Currently the default mode of operation is to shell to another python process
+# to protect against crashes. However, it breaks some dependent targets because
+# it forces us to depend on an external py_binary. The experimental API doesn't
+# have that drawback.
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
+
+
+_allowed_symbols = [
+    "FLOAT",
+    "INT32",
+    "INT64",
+    "STRING",
+    "QUANTIZED_UINT8",
+    "TENSORFLOW_GRAPHDEF",
+    "TFLITE",
+    "GRAPHVIZ_DOT",
+    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
+]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py
index 9a3971228a683211e84b4c55d3a3e8d574b5ed94..7908689ce4a719ab15bd49a368a87f9cad7c6d61 100644
--- a/tensorflow/contrib/lite/python/op_hint.py
+++ b/tensorflow/contrib/lite/python/op_hint.py
@@ -119,8 +119,10 @@ class OpHint(object):
 
   def _setattr(self, dest_op, name, value):
     tensor_value = _ops.convert_to_tensor(value)
-    dest_op.op.node_def.attr[name].tensor.CopyFrom(
-        tensor_value.op.node_def.attr["value"].tensor)
+    # pylint: disable=protected-access
+    dest_op.op._set_attr(name, _attr_value_pb2.AttrValue(
+        tensor=tensor_value.op.node_def.attr["value"].tensor))
+    # pylint: enable=protected-access
 
   def add_inputs(self, *args):
     """Add a sequence of inputs to the function invocation.
diff --git a/tensorflow/contrib/lite/rpi_makefile.inc b/tensorflow/contrib/lite/rpi_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..832ef5824bea86a368184bd7e3d17915739e9d46
--- /dev/null
+++ b/tensorflow/contrib/lite/rpi_makefile.inc
@@ -0,0 +1,33 @@
+# Settings for Raspberry Pi.
+ifeq ($(TARGET), RPI)
+	ifeq ($(TARGET_ARCH), armv7)
+		CXXFLAGS += \
+			-march=armv7-a \
+			-mfpu=neon-vfpv4 \
+			-funsafe-math-optimizations \
+			-ftree-vectorize
+
+		CCFLAGS += \
+			-march=armv7-a \
+			-mfpu=neon-vfpv4 \
+			-funsafe-math-optimizations \
+			-ftree-vectorize
+
+		LDFLAGS := \
+			-Wl,--no-export-dynamic \
+			-Wl,--exclude-libs,ALL \
+			-Wl,--gc-sections \
+			-Wl,--as-needed
+	endif
+
+	LIBS := \
+	-lstdc++ \
+	-lpthread \
+	-lm \
+	-ldl
+
+	OBJDIR := $(OBJDIR)rpi_$(TARGET_ARCH)/
+	LIBDIR := $(LIBDIR)rpi_$(TARGET_ARCH)/
+	BINDIR := $(BINDIR)rpi_$(TARGET_ARCH)/
+	DEPDIR := $(DEPDIR)rpi_$(TARGET_ARCH)/
+endif
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index 54167ddd9a5a003d0ff21e6627a1dbe94afa3e87..9717a4a1a496b888348514584888e62c4e3703b4 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -5,6 +5,7 @@ package(default_visibility = [
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 py_binary(
     name = "upgrade_schema",
@@ -62,6 +63,9 @@ cc_test(
         "schema.fbs",
         "schema_v3.fbs",
     ],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         "//tensorflow/core:lib_platform",
         "@com_google_googletest//:gtest",
@@ -69,14 +73,4 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
index b983d59d85955b241a22012b4e9adbeea346f80d..ac408d2f94b98d505afe4c951d7cc2ff960606fb 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
+++ b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
@@ -45,6 +45,8 @@ limitations under the License.
 extern "C" {
 #endif  // __cplusplus
 
+// The enum for builtin operators.
+// Note: CUSTOM and DELEGATE are 2 special ops which are not real built-in ops.
 typedef enum {
 )";
 
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 75970b41267613058199c22a0fcb0c80a1c8f04f..b16baf02dcfa124e224f1d850f4b1727b326f98e 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -33,6 +33,7 @@ enum TensorType : byte {
   UINT8 = 3,
   INT64 = 4,
   STRING = 5,
+  BOOL = 6,
 }
 
 // Parameters for converting a quantized tensor back to float. Given a
@@ -75,9 +76,9 @@ enum BuiltinOperator : byte {
   CONV_2D = 3,
   DEPTHWISE_CONV_2D = 4,
   // DEPTH_TO_SPACE = 5,
-  // DEQUANTIZE = 6,
+  DEQUANTIZE = 6,
   EMBEDDING_LOOKUP = 7,
-  // FLOOR = 8,
+  FLOOR = 8,
   FULLY_CONNECTED = 9,
   HASHTABLE_LOOKUP = 10,
   L2_NORMALIZATION = 11,
@@ -123,6 +124,18 @@ enum BuiltinOperator : byte {
   EXP = 47,
   TOPK_V2 = 48,
   SPLIT = 49,
+  LOG_SOFTMAX = 50,
+  // DELEGATE is a special op type for the operations which are delegated to
+  // other backends.
+  // WARNING: Experimental interface, subject to change
+  DELEGATE = 51,
+  BIDIRECTIONAL_SEQUENCE_LSTM = 52,
+  CAST = 53,
+  PRELU = 54,
+  MAXIMUM = 55,
+  ARG_MAX = 56,
+  MINIMUM = 57,
+  LESS = 58,
 }
 
 // Options for the builtin operators.
@@ -162,6 +175,12 @@ union BuiltinOptions {
   ExpOptions,
   TopKV2Options,
   SplitOptions,
+  LogSoftmaxOptions,
+  CastOptions,
+  DequantizeOptions,
+  MaximumMinimumOptions,
+  ArgMaxOptions,
+  LessOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -180,6 +199,8 @@ table Conv2DOptions {
   stride_w:int;
   stride_h:int;
   fused_activation_function:ActivationFunctionType;
+  dilation_w_factor:int = 1;
+  dilation_h_factor:int = 1;
 }
 
 table Pool2DOptions {
@@ -364,6 +385,27 @@ table StridedSliceOptions {
   shrink_axis_mask: int;
 }
 
+table LogSoftmaxOptions {
+}
+
+table CastOptions {
+  in_data_type: TensorType;
+  out_data_type: TensorType;
+}
+
+table DequantizeOptions {
+}
+
+table MaximumMinimumOptions {
+}
+
+table ArgMaxOptions {
+  output_type : TensorType;
+}
+
+table LessOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
@@ -393,21 +435,25 @@ table Operator {
   custom_options_format:CustomOptionsFormat;
 }
 
-// The root type, defining a model.
+// The root type, defining a subgraph, which typically represents an entire
+// model.
 table SubGraph {
-  // A list of all tensors used in this model.
+  // A list of all tensors used in this subgraph.
   tensors:[Tensor];
 
-  // Indices of the input tensors.
+  // Indices of the tensors that are inputs into this subgraph. Note this is
+  // the list of non-static tensors that feed into the subgraph for inference.
   inputs:[int];
 
-  // Indices of the output tensors.
+  // Indices of the tensors that are outputs out of this subgraph. Note this is
+  // the list of output tensors that are considered the product of the
+  // subgraph's inference.
   outputs:[int];
 
   // All operators, in execution order.
   operators:[Operator];
 
-  // Name of subgraph (used for debugging).
+  // Name of this subgraph (used for debugging).
   name:string;
 }
 
@@ -433,9 +479,14 @@ table Model {
   // A description of the model.
   description:string;
 
-  // Buffers of the model
+  // Buffers of the model.
+  // Note the 0th entry of this array must be an empty buffer (sentinel).
+  // This is a convention so that tensors without a buffer can provide 0 as
+  // their buffer.
   buffers:[Buffer];
 
+  // Metadata about the model.  Indirects into the existings buffers list.
+  metadata_buffer:[int];
 }
 
 root_type Model;
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 06989c7b61dc9904bff380e7f1cdc11097cb340d..25ed9abd9f8dedde219d52777f9f12b972f95fe2 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -136,6 +136,24 @@ struct SplitOptionsT;
 struct StridedSliceOptions;
 struct StridedSliceOptionsT;
 
+struct LogSoftmaxOptions;
+struct LogSoftmaxOptionsT;
+
+struct CastOptions;
+struct CastOptionsT;
+
+struct DequantizeOptions;
+struct DequantizeOptionsT;
+
+struct MaximumMinimumOptions;
+struct MaximumMinimumOptionsT;
+
+struct ArgMaxOptions;
+struct ArgMaxOptionsT;
+
+struct LessOptions;
+struct LessOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -158,18 +176,20 @@ enum TensorType {
   TensorType_UINT8 = 3,
   TensorType_INT64 = 4,
   TensorType_STRING = 5,
+  TensorType_BOOL = 6,
   TensorType_MIN = TensorType_FLOAT32,
-  TensorType_MAX = TensorType_STRING
+  TensorType_MAX = TensorType_BOOL
 };
 
-inline TensorType (&EnumValuesTensorType())[6] {
+inline TensorType (&EnumValuesTensorType())[7] {
   static TensorType values[] = {
     TensorType_FLOAT32,
     TensorType_FLOAT16,
     TensorType_INT32,
     TensorType_UINT8,
     TensorType_INT64,
-    TensorType_STRING
+    TensorType_STRING,
+    TensorType_BOOL
   };
   return values;
 }
@@ -182,6 +202,7 @@ inline const char **EnumNamesTensorType() {
     "UINT8",
     "INT64",
     "STRING",
+    "BOOL",
     nullptr
   };
   return names;
@@ -198,7 +219,9 @@ enum BuiltinOperator {
   BuiltinOperator_CONCATENATION = 2,
   BuiltinOperator_CONV_2D = 3,
   BuiltinOperator_DEPTHWISE_CONV_2D = 4,
+  BuiltinOperator_DEQUANTIZE = 6,
   BuiltinOperator_EMBEDDING_LOOKUP = 7,
+  BuiltinOperator_FLOOR = 8,
   BuiltinOperator_FULLY_CONNECTED = 9,
   BuiltinOperator_HASHTABLE_LOOKUP = 10,
   BuiltinOperator_L2_NORMALIZATION = 11,
@@ -240,18 +263,29 @@ enum BuiltinOperator {
   BuiltinOperator_EXP = 47,
   BuiltinOperator_TOPK_V2 = 48,
   BuiltinOperator_SPLIT = 49,
+  BuiltinOperator_LOG_SOFTMAX = 50,
+  BuiltinOperator_DELEGATE = 51,
+  BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM = 52,
+  BuiltinOperator_CAST = 53,
+  BuiltinOperator_PRELU = 54,
+  BuiltinOperator_MAXIMUM = 55,
+  BuiltinOperator_ARG_MAX = 56,
+  BuiltinOperator_MINIMUM = 57,
+  BuiltinOperator_LESS = 58,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_SPLIT
+  BuiltinOperator_MAX = BuiltinOperator_LESS
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[47] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[58] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
     BuiltinOperator_CONCATENATION,
     BuiltinOperator_CONV_2D,
     BuiltinOperator_DEPTHWISE_CONV_2D,
+    BuiltinOperator_DEQUANTIZE,
     BuiltinOperator_EMBEDDING_LOOKUP,
+    BuiltinOperator_FLOOR,
     BuiltinOperator_FULLY_CONNECTED,
     BuiltinOperator_HASHTABLE_LOOKUP,
     BuiltinOperator_L2_NORMALIZATION,
@@ -292,7 +326,16 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[47] {
     BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
     BuiltinOperator_EXP,
     BuiltinOperator_TOPK_V2,
-    BuiltinOperator_SPLIT
+    BuiltinOperator_SPLIT,
+    BuiltinOperator_LOG_SOFTMAX,
+    BuiltinOperator_DELEGATE,
+    BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+    BuiltinOperator_CAST,
+    BuiltinOperator_PRELU,
+    BuiltinOperator_MAXIMUM,
+    BuiltinOperator_ARG_MAX,
+    BuiltinOperator_MINIMUM,
+    BuiltinOperator_LESS
   };
   return values;
 }
@@ -305,9 +348,9 @@ inline const char **EnumNamesBuiltinOperator() {
     "CONV_2D",
     "DEPTHWISE_CONV_2D",
     "",
-    "",
+    "DEQUANTIZE",
     "EMBEDDING_LOOKUP",
-    "",
+    "FLOOR",
     "FULLY_CONNECTED",
     "HASHTABLE_LOOKUP",
     "L2_NORMALIZATION",
@@ -349,6 +392,15 @@ inline const char **EnumNamesBuiltinOperator() {
     "EXP",
     "TOPK_V2",
     "SPLIT",
+    "LOG_SOFTMAX",
+    "DELEGATE",
+    "BIDIRECTIONAL_SEQUENCE_LSTM",
+    "CAST",
+    "PRELU",
+    "MAXIMUM",
+    "ARG_MAX",
+    "MINIMUM",
+    "LESS",
     nullptr
   };
   return names;
@@ -396,11 +448,17 @@ enum BuiltinOptions {
   BuiltinOptions_ExpOptions = 33,
   BuiltinOptions_TopKV2Options = 34,
   BuiltinOptions_SplitOptions = 35,
+  BuiltinOptions_LogSoftmaxOptions = 36,
+  BuiltinOptions_CastOptions = 37,
+  BuiltinOptions_DequantizeOptions = 38,
+  BuiltinOptions_MaximumMinimumOptions = 39,
+  BuiltinOptions_ArgMaxOptions = 40,
+  BuiltinOptions_LessOptions = 41,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_SplitOptions
+  BuiltinOptions_MAX = BuiltinOptions_LessOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[36] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[42] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -437,7 +495,13 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[36] {
     BuiltinOptions_StridedSliceOptions,
     BuiltinOptions_ExpOptions,
     BuiltinOptions_TopKV2Options,
-    BuiltinOptions_SplitOptions
+    BuiltinOptions_SplitOptions,
+    BuiltinOptions_LogSoftmaxOptions,
+    BuiltinOptions_CastOptions,
+    BuiltinOptions_DequantizeOptions,
+    BuiltinOptions_MaximumMinimumOptions,
+    BuiltinOptions_ArgMaxOptions,
+    BuiltinOptions_LessOptions
   };
   return values;
 }
@@ -480,6 +544,12 @@ inline const char **EnumNamesBuiltinOptions() {
     "ExpOptions",
     "TopKV2Options",
     "SplitOptions",
+    "LogSoftmaxOptions",
+    "CastOptions",
+    "DequantizeOptions",
+    "MaximumMinimumOptions",
+    "ArgMaxOptions",
+    "LessOptions",
     nullptr
   };
   return names;
@@ -634,6 +704,30 @@ template<> struct BuiltinOptionsTraits<SplitOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SplitOptions;
 };
 
+template<> struct BuiltinOptionsTraits<LogSoftmaxOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LogSoftmaxOptions;
+};
+
+template<> struct BuiltinOptionsTraits<CastOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_CastOptions;
+};
+
+template<> struct BuiltinOptionsTraits<DequantizeOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions;
+};
+
+template<> struct BuiltinOptionsTraits<MaximumMinimumOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MaximumMinimumOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ArgMaxOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions;
+};
+
+template<> struct BuiltinOptionsTraits<LessOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LessOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -945,6 +1039,54 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_SplitOptions ?
       reinterpret_cast<const SplitOptionsT *>(value) : nullptr;
   }
+  LogSoftmaxOptionsT *AsLogSoftmaxOptions() {
+    return type == BuiltinOptions_LogSoftmaxOptions ?
+      reinterpret_cast<LogSoftmaxOptionsT *>(value) : nullptr;
+  }
+  const LogSoftmaxOptionsT *AsLogSoftmaxOptions() const {
+    return type == BuiltinOptions_LogSoftmaxOptions ?
+      reinterpret_cast<const LogSoftmaxOptionsT *>(value) : nullptr;
+  }
+  CastOptionsT *AsCastOptions() {
+    return type == BuiltinOptions_CastOptions ?
+      reinterpret_cast<CastOptionsT *>(value) : nullptr;
+  }
+  const CastOptionsT *AsCastOptions() const {
+    return type == BuiltinOptions_CastOptions ?
+      reinterpret_cast<const CastOptionsT *>(value) : nullptr;
+  }
+  DequantizeOptionsT *AsDequantizeOptions() {
+    return type == BuiltinOptions_DequantizeOptions ?
+      reinterpret_cast<DequantizeOptionsT *>(value) : nullptr;
+  }
+  const DequantizeOptionsT *AsDequantizeOptions() const {
+    return type == BuiltinOptions_DequantizeOptions ?
+      reinterpret_cast<const DequantizeOptionsT *>(value) : nullptr;
+  }
+  MaximumMinimumOptionsT *AsMaximumMinimumOptions() {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<MaximumMinimumOptionsT *>(value) : nullptr;
+  }
+  const MaximumMinimumOptionsT *AsMaximumMinimumOptions() const {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<const MaximumMinimumOptionsT *>(value) : nullptr;
+  }
+  ArgMaxOptionsT *AsArgMaxOptions() {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<ArgMaxOptionsT *>(value) : nullptr;
+  }
+  const ArgMaxOptionsT *AsArgMaxOptions() const {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<const ArgMaxOptionsT *>(value) : nullptr;
+  }
+  LessOptionsT *AsLessOptions() {
+    return type == BuiltinOptions_LessOptions ?
+      reinterpret_cast<LessOptionsT *>(value) : nullptr;
+  }
+  const LessOptionsT *AsLessOptions() const {
+    return type == BuiltinOptions_LessOptions ?
+      reinterpret_cast<const LessOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -1338,11 +1480,15 @@ struct Conv2DOptionsT : public flatbuffers::NativeTable {
   int32_t stride_w;
   int32_t stride_h;
   ActivationFunctionType fused_activation_function;
+  int32_t dilation_w_factor;
+  int32_t dilation_h_factor;
   Conv2DOptionsT()
       : padding(Padding_SAME),
         stride_w(0),
         stride_h(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
+        fused_activation_function(ActivationFunctionType_NONE),
+        dilation_w_factor(1),
+        dilation_h_factor(1) {
   }
 };
 
@@ -1352,7 +1498,9 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_PADDING = 4,
     VT_STRIDE_W = 6,
     VT_STRIDE_H = 8,
-    VT_FUSED_ACTIVATION_FUNCTION = 10
+    VT_FUSED_ACTIVATION_FUNCTION = 10,
+    VT_DILATION_W_FACTOR = 12,
+    VT_DILATION_H_FACTOR = 14
   };
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
@@ -1366,12 +1514,20 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   ActivationFunctionType fused_activation_function() const {
     return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
+  int32_t dilation_w_factor() const {
+    return GetField<int32_t>(VT_DILATION_W_FACTOR, 1);
+  }
+  int32_t dilation_h_factor() const {
+    return GetField<int32_t>(VT_DILATION_H_FACTOR, 1);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_PADDING) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<int32_t>(verifier, VT_DILATION_W_FACTOR) &&
+           VerifyField<int32_t>(verifier, VT_DILATION_H_FACTOR) &&
            verifier.EndTable();
   }
   Conv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -1394,6 +1550,12 @@ struct Conv2DOptionsBuilder {
   void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
     fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
   }
+  void add_dilation_w_factor(int32_t dilation_w_factor) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_W_FACTOR, dilation_w_factor, 1);
+  }
+  void add_dilation_h_factor(int32_t dilation_h_factor) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 1);
+  }
   explicit Conv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -1411,8 +1573,12 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
     Padding padding = Padding_SAME,
     int32_t stride_w = 0,
     int32_t stride_h = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    int32_t dilation_w_factor = 1,
+    int32_t dilation_h_factor = 1) {
   Conv2DOptionsBuilder builder_(_fbb);
+  builder_.add_dilation_h_factor(dilation_h_factor);
+  builder_.add_dilation_w_factor(dilation_w_factor);
   builder_.add_stride_h(stride_h);
   builder_.add_stride_w(stride_w);
   builder_.add_fused_activation_function(fused_activation_function);
@@ -3568,6 +3734,286 @@ inline flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
 
 flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct LogSoftmaxOptionsT : public flatbuffers::NativeTable {
+  typedef LogSoftmaxOptions TableType;
+  LogSoftmaxOptionsT() {
+  }
+};
+
+struct LogSoftmaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LogSoftmaxOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  LogSoftmaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LogSoftmaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LogSoftmaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LogSoftmaxOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit LogSoftmaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LogSoftmaxOptionsBuilder &operator=(const LogSoftmaxOptionsBuilder &);
+  flatbuffers::Offset<LogSoftmaxOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LogSoftmaxOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LogSoftmaxOptions> CreateLogSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  LogSoftmaxOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LogSoftmaxOptions> CreateLogSoftmaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct CastOptionsT : public flatbuffers::NativeTable {
+  typedef CastOptions TableType;
+  TensorType in_data_type;
+  TensorType out_data_type;
+  CastOptionsT()
+      : in_data_type(TensorType_FLOAT32),
+        out_data_type(TensorType_FLOAT32) {
+  }
+};
+
+struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CastOptionsT NativeTableType;
+  enum {
+    VT_IN_DATA_TYPE = 4,
+    VT_OUT_DATA_TYPE = 6
+  };
+  TensorType in_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IN_DATA_TYPE, 0));
+  }
+  TensorType out_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUT_DATA_TYPE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IN_DATA_TYPE) &&
+           VerifyField<int8_t>(verifier, VT_OUT_DATA_TYPE) &&
+           verifier.EndTable();
+  }
+  CastOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CastOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CastOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_in_data_type(TensorType in_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_IN_DATA_TYPE, static_cast<int8_t>(in_data_type), 0);
+  }
+  void add_out_data_type(TensorType out_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_OUT_DATA_TYPE, static_cast<int8_t>(out_data_type), 0);
+  }
+  explicit CastOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CastOptionsBuilder &operator=(const CastOptionsBuilder &);
+  flatbuffers::Offset<CastOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CastOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CastOptions> CreateCastOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType in_data_type = TensorType_FLOAT32,
+    TensorType out_data_type = TensorType_FLOAT32) {
+  CastOptionsBuilder builder_(_fbb);
+  builder_.add_out_data_type(out_data_type);
+  builder_.add_in_data_type(in_data_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<CastOptions> CreateCastOptions(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct DequantizeOptionsT : public flatbuffers::NativeTable {
+  typedef DequantizeOptions TableType;
+  DequantizeOptionsT() {
+  }
+};
+
+struct DequantizeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef DequantizeOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  DequantizeOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(DequantizeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<DequantizeOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct DequantizeOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit DequantizeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  DequantizeOptionsBuilder &operator=(const DequantizeOptionsBuilder &);
+  flatbuffers::Offset<DequantizeOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<DequantizeOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  DequantizeOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct MaximumMinimumOptionsT : public flatbuffers::NativeTable {
+  typedef MaximumMinimumOptions TableType;
+  MaximumMinimumOptionsT() {
+  }
+};
+
+struct MaximumMinimumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MaximumMinimumOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  MaximumMinimumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MaximumMinimumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MaximumMinimumOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit MaximumMinimumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MaximumMinimumOptionsBuilder &operator=(const MaximumMinimumOptionsBuilder &);
+  flatbuffers::Offset<MaximumMinimumOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MaximumMinimumOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  MaximumMinimumOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ArgMaxOptionsT : public flatbuffers::NativeTable {
+  typedef ArgMaxOptions TableType;
+  TensorType output_type;
+  ArgMaxOptionsT()
+      : output_type(TensorType_FLOAT32) {
+  }
+};
+
+struct ArgMaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ArgMaxOptionsT NativeTableType;
+  enum {
+    VT_OUTPUT_TYPE = 4
+  };
+  TensorType output_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUTPUT_TYPE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_OUTPUT_TYPE) &&
+           verifier.EndTable();
+  }
+  ArgMaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ArgMaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ArgMaxOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_output_type(TensorType output_type) {
+    fbb_.AddElement<int8_t>(ArgMaxOptions::VT_OUTPUT_TYPE, static_cast<int8_t>(output_type), 0);
+  }
+  explicit ArgMaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ArgMaxOptionsBuilder &operator=(const ArgMaxOptionsBuilder &);
+  flatbuffers::Offset<ArgMaxOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ArgMaxOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType output_type = TensorType_FLOAT32) {
+  ArgMaxOptionsBuilder builder_(_fbb);
+  builder_.add_output_type(output_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct LessOptionsT : public flatbuffers::NativeTable {
+  typedef LessOptions TableType;
+  LessOptionsT() {
+  }
+};
+
+struct LessOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LessOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  LessOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LessOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LessOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LessOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit LessOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LessOptionsBuilder &operator=(const LessOptionsBuilder &);
+  flatbuffers::Offset<LessOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LessOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LessOptions> CreateLessOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  LessOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -3790,6 +4236,24 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const SplitOptions *builtin_options_as_SplitOptions() const {
     return builtin_options_type() == BuiltinOptions_SplitOptions ? static_cast<const SplitOptions *>(builtin_options()) : nullptr;
   }
+  const LogSoftmaxOptions *builtin_options_as_LogSoftmaxOptions() const {
+    return builtin_options_type() == BuiltinOptions_LogSoftmaxOptions ? static_cast<const LogSoftmaxOptions *>(builtin_options()) : nullptr;
+  }
+  const CastOptions *builtin_options_as_CastOptions() const {
+    return builtin_options_type() == BuiltinOptions_CastOptions ? static_cast<const CastOptions *>(builtin_options()) : nullptr;
+  }
+  const DequantizeOptions *builtin_options_as_DequantizeOptions() const {
+    return builtin_options_type() == BuiltinOptions_DequantizeOptions ? static_cast<const DequantizeOptions *>(builtin_options()) : nullptr;
+  }
+  const MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const {
+    return builtin_options_type() == BuiltinOptions_MaximumMinimumOptions ? static_cast<const MaximumMinimumOptions *>(builtin_options()) : nullptr;
+  }
+  const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const {
+    return builtin_options_type() == BuiltinOptions_ArgMaxOptions ? static_cast<const ArgMaxOptions *>(builtin_options()) : nullptr;
+  }
+  const LessOptions *builtin_options_as_LessOptions() const {
+    return builtin_options_type() == BuiltinOptions_LessOptions ? static_cast<const LessOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -3956,6 +4420,30 @@ template<> inline const SplitOptions *Operator::builtin_options_as<SplitOptions>
   return builtin_options_as_SplitOptions();
 }
 
+template<> inline const LogSoftmaxOptions *Operator::builtin_options_as<LogSoftmaxOptions>() const {
+  return builtin_options_as_LogSoftmaxOptions();
+}
+
+template<> inline const CastOptions *Operator::builtin_options_as<CastOptions>() const {
+  return builtin_options_as_CastOptions();
+}
+
+template<> inline const DequantizeOptions *Operator::builtin_options_as<DequantizeOptions>() const {
+  return builtin_options_as_DequantizeOptions();
+}
+
+template<> inline const MaximumMinimumOptions *Operator::builtin_options_as<MaximumMinimumOptions>() const {
+  return builtin_options_as_MaximumMinimumOptions();
+}
+
+template<> inline const ArgMaxOptions *Operator::builtin_options_as<ArgMaxOptions>() const {
+  return builtin_options_as_ArgMaxOptions();
+}
+
+template<> inline const LessOptions *Operator::builtin_options_as<LessOptions>() const {
+  return builtin_options_as_LessOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -4423,6 +4911,8 @@ inline void Conv2DOptions::UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resol
   { auto _e = stride_w(); _o->stride_w = _e; };
   { auto _e = stride_h(); _o->stride_h = _e; };
   { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = dilation_w_factor(); _o->dilation_w_factor = _e; };
+  { auto _e = dilation_h_factor(); _o->dilation_h_factor = _e; };
 }
 
 inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -4437,12 +4927,16 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatB
   auto _stride_w = _o->stride_w;
   auto _stride_h = _o->stride_h;
   auto _fused_activation_function = _o->fused_activation_function;
+  auto _dilation_w_factor = _o->dilation_w_factor;
+  auto _dilation_h_factor = _o->dilation_h_factor;
   return tflite::CreateConv2DOptions(
       _fbb,
       _padding,
       _stride_w,
       _stride_h,
-      _fused_activation_function);
+      _fused_activation_function,
+      _dilation_w_factor,
+      _dilation_h_factor);
 }
 
 inline Pool2DOptionsT *Pool2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -5415,6 +5909,153 @@ inline flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(flatbu
       _shrink_axis_mask);
 }
 
+inline LogSoftmaxOptionsT *LogSoftmaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LogSoftmaxOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LogSoftmaxOptions::UnPackTo(LogSoftmaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<LogSoftmaxOptions> LogSoftmaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLogSoftmaxOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LogSoftmaxOptions> CreateLogSoftmaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogSoftmaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LogSoftmaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateLogSoftmaxOptions(
+      _fbb);
+}
+
+inline CastOptionsT *CastOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CastOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CastOptions::UnPackTo(CastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = in_data_type(); _o->in_data_type = _e; };
+  { auto _e = out_data_type(); _o->out_data_type = _e; };
+}
+
+inline flatbuffers::Offset<CastOptions> CastOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCastOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CastOptions> CreateCastOptions(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CastOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _in_data_type = _o->in_data_type;
+  auto _out_data_type = _o->out_data_type;
+  return tflite::CreateCastOptions(
+      _fbb,
+      _in_data_type,
+      _out_data_type);
+}
+
+inline DequantizeOptionsT *DequantizeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new DequantizeOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void DequantizeOptions::UnPackTo(DequantizeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<DequantizeOptions> DequantizeOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateDequantizeOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DequantizeOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateDequantizeOptions(
+      _fbb);
+}
+
+inline MaximumMinimumOptionsT *MaximumMinimumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MaximumMinimumOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void MaximumMinimumOptions::UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<MaximumMinimumOptions> MaximumMinimumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMaximumMinimumOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumMinimumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateMaximumMinimumOptions(
+      _fbb);
+}
+
+inline ArgMaxOptionsT *ArgMaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ArgMaxOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ArgMaxOptions::UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = output_type(); _o->output_type = _e; };
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> ArgMaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateArgMaxOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ArgMaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _output_type = _o->output_type;
+  return tflite::CreateArgMaxOptions(
+      _fbb,
+      _output_type);
+}
+
+inline LessOptionsT *LessOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LessOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LessOptions::UnPackTo(LessOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<LessOptions> LessOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLessOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LessOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateLessOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -5735,6 +6376,30 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const SplitOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_LogSoftmaxOptions: {
+      auto ptr = reinterpret_cast<const LogSoftmaxOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_CastOptions: {
+      auto ptr = reinterpret_cast<const CastOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_DequantizeOptions: {
+      auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -5893,6 +6558,30 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const SplitOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_LogSoftmaxOptions: {
+      auto ptr = reinterpret_cast<const LogSoftmaxOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_CastOptions: {
+      auto ptr = reinterpret_cast<const CastOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_DequantizeOptions: {
+      auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -6039,6 +6728,30 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const SplitOptionsT *>(value);
       return CreateSplitOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_LogSoftmaxOptions: {
+      auto ptr = reinterpret_cast<const LogSoftmaxOptionsT *>(value);
+      return CreateLogSoftmaxOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_CastOptions: {
+      auto ptr = reinterpret_cast<const CastOptionsT *>(value);
+      return CreateCastOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_DequantizeOptions: {
+      auto ptr = reinterpret_cast<const DequantizeOptionsT *>(value);
+      return CreateDequantizeOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptionsT *>(value);
+      return CreateMaximumMinimumOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptionsT *>(value);
+      return CreateArgMaxOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptionsT *>(value);
+      return CreateLessOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -6185,6 +6898,30 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new SplitOptionsT(*reinterpret_cast<SplitOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_LogSoftmaxOptions: {
+      value = new LogSoftmaxOptionsT(*reinterpret_cast<LogSoftmaxOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_CastOptions: {
+      value = new CastOptionsT(*reinterpret_cast<CastOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_DequantizeOptions: {
+      value = new DequantizeOptionsT(*reinterpret_cast<DequantizeOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_MaximumMinimumOptions: {
+      value = new MaximumMinimumOptionsT(*reinterpret_cast<MaximumMinimumOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      value = new ArgMaxOptionsT(*reinterpret_cast<ArgMaxOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LessOptions: {
+      value = new LessOptionsT(*reinterpret_cast<LessOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -6367,6 +7104,36 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_LogSoftmaxOptions: {
+      auto ptr = reinterpret_cast<LogSoftmaxOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_CastOptions: {
+      auto ptr = reinterpret_cast<CastOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_DequantizeOptions: {
+      auto ptr = reinterpret_cast<DequantizeOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<MaximumMinimumOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<ArgMaxOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<LessOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema.py b/tensorflow/contrib/lite/schema/upgrade_schema.py
index 94f5730be5d991ae13fb019e4d035e23f76fe441..e0b36d3d3ee94b00cccd3968d14c63fe19c3c27c 100644
--- a/tensorflow/contrib/lite/schema/upgrade_schema.py
+++ b/tensorflow/contrib/lite/schema/upgrade_schema.py
@@ -39,8 +39,8 @@ import tensorflow as tf
 from tensorflow.python.platform import resource_loader
 
 parser = argparse.ArgumentParser(
-    description="Script to move TFLite models from pre-release schema to"
-    " new schema.")
+    description="Script to move TFLite models from pre-release schema to "
+    "new schema.")
 parser.add_argument(
     "input",
     type=str,
@@ -48,7 +48,7 @@ parser.add_argument(
 parser.add_argument(
     "output",
     type=str,
-    help="Output json or bin TensorFlow lite model compliant with"
+    help="Output json or bin TensorFlow lite model compliant with "
     "the new schema. Extension must be `.json`, `.bin` or `.tflite`.")
 
 
@@ -258,7 +258,7 @@ class Converter(object):
       # Check if builtin_code is the appropriate string type
       # use type("") instead of str or unicode. for py2and3
       if not isinstance(operator_code["builtin_code"], type(u"")):
-        raise ValueError("builtin_code %r is non-string. this usually means"
+        raise ValueError("builtin_code %r is non-string. this usually means "
                          "your model has consistency problems." %
                          (operator_code["builtin_code"]))
       operator_code["builtin_code"] = (RemapOperator(
diff --git a/tensorflow/contrib/lite/simple_memory_arena.cc b/tensorflow/contrib/lite/simple_memory_arena.cc
index 4aab244989ca5300fbe74162e03deaac89af60ad..2f2004f56bcad5b56f9dd6d4bc824ec14d79e795 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.cc
+++ b/tensorflow/contrib/lite/simple_memory_arena.cc
@@ -113,21 +113,21 @@ TfLiteStatus SimpleMemoryArena::Commit(TfLiteContext* context) {
     underlying_buffer_size_ = required_size;
     underlying_buffer_aligned_ptr_ = new_underlying_buffer_aligned_ptr;
   }
-  commited_ = true;
+  committed_ = true;
   return underlying_buffer_ != nullptr ? kTfLiteOk : kTfLiteError;
 }
 
 TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context,
                                              const ArenaAlloc& alloc,
                                              char** output_ptr) {
-  TF_LITE_ENSURE(context, commited_);
+  TF_LITE_ENSURE(context, committed_);
   TF_LITE_ENSURE(context, output_ptr != nullptr);
   *output_ptr = underlying_buffer_aligned_ptr_ + alloc.offset;
   return kTfLiteOk;
 }
 
 TfLiteStatus SimpleMemoryArena::Clear() {
-  commited_ = false;
+  committed_ = false;
   high_water_mark_ = 0;
   allocs_.clear();
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h
index 0535522374c63459d029c252ebe94628cf3122d5..5faf78b59e3755d22e4e866d433e622baa6c66c1 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.h
+++ b/tensorflow/contrib/lite/simple_memory_arena.h
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tflite {
 
 // This little structure holds the offset and the size for a dynamic memory
-// allocation in the memory arena. When the arena is commited and the
+// allocation in the memory arena. When the arena is committed and the
 // underlying buffer is set, the alloc can be resolved into an actual memory
 // pointer.
 struct ArenaAlloc {
@@ -43,7 +43,7 @@ struct ArenaAlloc {
 class SimpleMemoryArena {
  public:
   explicit SimpleMemoryArena(size_t arena_alignment)
-      : commited_(false),
+      : committed_(false),
         arena_alignment_(arena_alignment),
         high_water_mark_(0),
         underlying_buffer_size_(0),
@@ -73,7 +73,7 @@ class SimpleMemoryArena {
   }
 
  private:
-  bool commited_;
+  bool committed_;
   size_t arena_alignment_;
   size_t high_water_mark_;
   std::unique_ptr<char[]> underlying_buffer_;
diff --git a/tensorflow/contrib/lite/special_rules.bzl b/tensorflow/contrib/lite/special_rules.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..54083c49182c707620cbd231b957405cfe24be92
--- /dev/null
+++ b/tensorflow/contrib/lite/special_rules.bzl
@@ -0,0 +1,6 @@
+"""External versions of build rules that differ outside of Google."""
+
+def tflite_portable_test_suite(**kwargs):
+  """This is a no-op outside of Google."""
+  _ignore = [kwargs]
+  pass
diff --git a/tensorflow/contrib/lite/string_util.cc b/tensorflow/contrib/lite/string_util.cc
index cd41299d38361321503d421272426a9d1082c937..a89776b29f895fe82ee71efe00c0949c58c109df 100644
--- a/tensorflow/contrib/lite/string_util.cc
+++ b/tensorflow/contrib/lite/string_util.cc
@@ -24,7 +24,10 @@ namespace tflite {
 namespace {
 
 // Convenient method to get pointer to int32_t.
-int32_t* GetIntPtr(char* ptr) { return reinterpret_cast<int32_t*>(ptr); }
+const int32_t* GetIntPtr(const char* ptr) {
+  return reinterpret_cast<const int32_t*>(ptr);
+}
+
 }  // namespace
 
 void DynamicBuffer::AddString(const char* str, size_t len) {
@@ -64,7 +67,7 @@ void DynamicBuffer::AddJoinedString(const std::vector<StringRef>& strings,
   offset_.push_back(offset_.back() + total_len);
 }
 
-void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
+int DynamicBuffer::WriteToBuffer(char** buffer) {
   // Allocate sufficient memory to tensor buffer.
   int32_t num_strings = offset_.size() - 1;
   // Total bytes include:
@@ -75,43 +78,57 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
   int32_t bytes = data_.size()                            // size of content
                   + sizeof(int32_t) * (num_strings + 2);  // size of header
 
-  // Output tensor will take over the ownership of tensor_buffer, and free it
-  // during Interpreter destruction.
-  char* tensor_buffer = static_cast<char*>(malloc(bytes));
+  // Caller will take ownership of buffer.
+  *buffer = reinterpret_cast<char*>(malloc(bytes));
 
   // Set num of string
-  memcpy(tensor_buffer, &num_strings, sizeof(int32_t));
+  memcpy(*buffer, &num_strings, sizeof(int32_t));
 
   // Set offset of strings.
   int32_t start = sizeof(int32_t) * (num_strings + 2);
   for (int i = 0; i < offset_.size(); i++) {
     int32_t offset = start + offset_[i];
-    memcpy(tensor_buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t));
+    memcpy(*buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t));
   }
 
   // Copy data of strings.
-  memcpy(tensor_buffer + start, data_.data(), data_.size());
+  memcpy(*buffer + start, data_.data(), data_.size());
+  return bytes;
+}
+
+void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
+  char* tensor_buffer;
+  int bytes = WriteToBuffer(&tensor_buffer);
 
   // Set tensor content pointer to tensor_buffer, and release original data.
   auto dims = TfLiteIntArrayCreate(1);
-  dims->data[0] = num_strings;
+  dims->data[0] = offset_.size() - 1;  // Store number of strings.
   TfLiteTensorReset(tensor->type, tensor->name, dims, tensor->params,
                     tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation,
                     tensor);
 }
 
+int GetStringCount(const char* raw_buffer) {
+  // The first integers in the raw buffer is the number of strings.
+  return *GetIntPtr(raw_buffer);
+}
+
 int GetStringCount(const TfLiteTensor* tensor) {
   // The first integers in the raw buffer is the number of strings.
-  return *GetIntPtr(tensor->data.raw);
+  return GetStringCount(tensor->data.raw);
 }
 
-StringRef GetString(const TfLiteTensor* tensor, int string_index) {
-  int32_t* offset =
-      GetIntPtr(tensor->data.raw + sizeof(int32_t) * (string_index + 1));
+StringRef GetString(const char* raw_buffer, int string_index) {
+  const int32_t* offset =
+      GetIntPtr(raw_buffer + sizeof(int32_t) * (string_index + 1));
   return {
-      tensor->data.raw + (*offset),
+      raw_buffer + (*offset),
       (*(offset + 1)) - (*offset),
   };
 }
 
+StringRef GetString(const TfLiteTensor* tensor, int string_index) {
+  return GetString(tensor->data.raw, string_index);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h
index 4c5d8578ac1cd59d757df895fc67394b837b4fae..08a20ce2160e946e9199abc2e1bcb9d4b78bd09e 100644
--- a/tensorflow/contrib/lite/string_util.h
+++ b/tensorflow/contrib/lite/string_util.h
@@ -49,7 +49,7 @@ namespace tflite {
 
 // Convenient structure to store string pointer and length.
 typedef struct {
-  char* str;
+  const char* str;
   int len;
 } StringRef;
 
@@ -70,6 +70,10 @@ class DynamicBuffer {
   // buffer.
   void AddJoinedString(const std::vector<StringRef>& strings, char separator);
 
+  // Fill content into a buffer and returns the number of bytes stored.
+  // The function allocates space for the buffer but does NOT take ownership.
+  int WriteToBuffer(char** buffer);
+
   // Fill content into a string tensor.
   void WriteToTensor(TfLiteTensor* tensor);
 
@@ -81,10 +85,12 @@ class DynamicBuffer {
 };
 
 // Return num of strings in a String tensor.
+int GetStringCount(const char* raw_buffer);
 int GetStringCount(const TfLiteTensor* tensor);
 
 // Get String pointer and length of index-th string in tensor.
 // NOTE: This will not create a copy of string data.
+StringRef GetString(const char* raw_buffer, int string_index);
 StringRef GetString(const TfLiteTensor* tensor, int string_index);
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 06570ae9aa3d10c3cb73ab362e30244ec0b78a35..a1162cef38693e3f347413a7ce6b647571175bd6 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -8,6 +8,7 @@ load(
     "//tensorflow/contrib/lite:build_def.bzl",
     "gen_zipped_test_files",
 )
+load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
@@ -17,6 +18,7 @@ gen_zipped_test_files(
     name = "optest",
     files = [
         "add.zip",
+        "arg_max.zip",
         "avg_pool.zip",
         "batch_to_space_nd.zip",
         "concat.zip",
@@ -26,15 +28,20 @@ gen_zipped_test_files(
         "depthwiseconv.zip",
         "div.zip",
         "exp.zip",
+        "floor.zip",
         "fully_connected.zip",
         "fused_batch_norm.zip",
         "gather.zip",
         "global_batch_norm.zip",
         "l2_pool.zip",
         "l2norm.zip",
+        "less.zip",
         "local_response_norm.zip",
+        "log_softmax.zip",
         "max_pool.zip",
+        "maximum.zip",
         "mean.zip",
+        "minimum.zip",
         "mul.zip",
         "pad.zip",
         "relu.zip",
@@ -156,6 +163,9 @@ cc_test(
     size = "small",
     srcs = ["tflite_driver_test.cc"],
     data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         ":tflite_driver",
         "@com_google_googletest//:gtest_main",
@@ -190,7 +200,6 @@ cc_library(
 
 cc_library(
     name = "util",
-    testonly = 1,
     hdrs = ["util.h"],
 )
 
@@ -234,12 +243,110 @@ cc_test(
     size = "small",
     srcs = ["tf_driver_test.cc"],
     data = ["//tensorflow/contrib/lite:testdata/multi_add.pb"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":tf_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "generate_testspec",
+    srcs = ["generate_testspec.cc"],
+    hdrs = ["generate_testspec.h"],
     deps = [
+        ":join",
+        ":split",
         ":tf_driver",
+        "//tensorflow/core:framework",
+    ],
+)
+
+cc_test(
+    name = "generate_testspec_test",
+    size = "small",
+    srcs = ["generate_testspec_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":generate_testspec",
         "@com_google_googletest//:gtest_main",
     ],
 )
 
+cc_library(
+    name = "tflite_diff_util",
+    srcs = ["tflite_diff_util.cc"],
+    hdrs = ["tflite_diff_util.h"],
+    deps = [
+        ":generate_testspec",
+        ":parse_testdata_lib",
+        ":split",
+        ":tflite_driver",
+        ":util",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_library(
+    name = "tflite_diff_flags",
+    hdrs = ["tflite_diff_flags.h"],
+    deps = [
+        ":split",
+        ":tflite_diff_util",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "tflite_diff_example_test",
+    size = "medium",
+    srcs = ["tflite_diff_example_test.cc"],
+    args = [
+        "--tensorflow_model=third_party/tensorflow/contrib/lite/testdata/multi_add.pb",
+        "--tflite_model=third_party/tensorflow/contrib/lite/testdata/multi_add.bin",
+        "--input_layer=a,b,c,d",
+        "--input_layer_type=float,float,float,float",
+        "--input_layer_shape=1,3,4,3:1,3,4,3:1,3,4,3:1,3,4,3",
+        "--output_layer=x,y",
+    ],
+    data = [
+        "//tensorflow/contrib/lite:testdata/multi_add.bin",
+        "//tensorflow/contrib/lite:testdata/multi_add.pb",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_oss",
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
+cc_binary(
+    name = "tflite_diff",
+    srcs = ["tflite_diff_example_test.cc"],
+    deps = [
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
 tf_cc_test(
     name = "generated_examples_zip_test",
     size = "large",
@@ -252,7 +359,10 @@ tf_cc_test(
     ],
     data = [":optest"],
     shard_count = 20,
-    tags = ["no_oss"],
+    tags = [
+        "no_oss",
+        "tflite_not_portable",
+    ],
     deps = [
         ":parse_testdata_lib",
         ":tflite_driver",
@@ -275,14 +385,4 @@ tf_cc_test(
     }),
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index b6c09306d6adb8e54d5108dac850f0249ffcb838..2f8f7a1a7956296b964bce3ff5f75a7299065b0e 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -17,10 +17,9 @@
 
 Usage:
 
-generate_examples <output directory> zipped
+generate_examples <output directory>
 
 bazel run //tensorflow/contrib/lite/testing:generate_examples
-    third_party/tensorflow/contrib/lite/testing/generated_examples zipped
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -36,6 +35,7 @@ import traceback
 import zipfile
 import numpy as np
 from six import StringIO
+from six.moves import xrange
 
 # TODO(aselle): Disable GPU for now
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -46,12 +46,11 @@ from google.protobuf import text_format
 # TODO(aselle): switch to TensorFlow's resource_loader
 from tensorflow.contrib.lite.testing import generate_examples_report as report_lib
 from tensorflow.python.framework import graph_util as tf_graph_util
+from tensorflow.python.ops import rnn
 
 parser = argparse.ArgumentParser(description="Script to generate TFLite tests.")
 parser.add_argument("output_path",
                     help="Directory where the outputs will be go.")
-# TODO(ahentz): remove this flag
-parser.add_argument("type", help="zipped")
 parser.add_argument("--zip_to_output",
                     type=str,
                     help="Particular zip to output.",
@@ -94,9 +93,6 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
-    # BatchToSpaceND doesn't support cropping. This catches test cases with
-    # const tensors as crops.
-    r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
@@ -105,14 +101,30 @@ KNOWN_BUGS = {
     r"strided_slice.*begin=\[0\].*end=\[1\].*": "73170889",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
+    # Needs support for dimensions other than the last one in argmax.
+    r"arg_max.*axis=0.*": "77546240",
+    r"arg_max.*axis=1.*": "77546240",
+    r"arg_max.*axis=2.*": "77546240",
 }
 
 
+class ExtraTocoOptions(object):
+  """Additonal toco options besides input, output, shape."""
+
+  def __init__(self):
+    # Whether to ignore control dependency nodes.
+    self.drop_control_dependency = False
+    # Allow custom ops in the toco conversion.
+    self.allow_custom_ops = False
+    # Rnn states that are used to support rnn / lstm cells.
+    self.rnn_states = None
+
+
 def toco_options(data_types,
                  input_arrays,
                  output_arrays,
                  shapes,
-                 drop_control_dependency):
+                 extra_toco_options=ExtraTocoOptions()):
   """Create TOCO options to process a model.
 
   Args:
@@ -120,8 +132,7 @@ def toco_options(data_types,
     input_arrays: names of the input tensors
     output_arrays: name of the output tensors
     shapes: shapes of the input tensors
-    drop_control_dependency: whether to ignore control dependency nodes.
-
+    extra_toco_options: additional toco options
   Returns:
     the options in a string.
   """
@@ -137,37 +148,15 @@ def toco_options(data_types,
        " --input_arrays=%s" % ",".join(input_arrays) +
        " --input_shapes=%s" % shape_str +
        " --output_arrays=%s" % ",".join(output_arrays))
-  if drop_control_dependency:
+  if extra_toco_options.drop_control_dependency:
     s += " --drop_control_dependency"
+  if extra_toco_options.allow_custom_ops:
+    s += " --allow_custom_ops"
+  if extra_toco_options.rnn_states:
+    s += (" --rnn_states='" + extra_toco_options.rnn_states + "'")
   return s
 
 
-def write_toco_options(filename,
-                       data_types,
-                       input_arrays,
-                       output_arrays,
-                       shapes,
-                       drop_control_dependency=False):
-  """Create TOCO options to process a model.
-
-  Args:
-    filename: Filename to write the options to.
-    data_types: input and inference types used by TOCO.
-    input_arrays: names of the input tensors
-    output_arrays: names of the output tensors
-    shapes: shapes of the input tensors
-    drop_control_dependency: whether to ignore control dependency nodes.
-  """
-  with open(filename, "w") as fp:
-    fp.write(
-        toco_options(
-            data_types=data_types,
-            input_arrays=input_arrays,
-            output_arrays=output_arrays,
-            shapes=shapes,
-            drop_control_dependency=drop_control_dependency))
-
-
 def write_examples(fp, examples):
   """Given a list `examples`, write a text format representation.
 
@@ -285,12 +274,14 @@ def make_control_dep_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
+  extra_toco_options = ExtraTocoOptions()
+  extra_toco_options.drop_control_dependency = True
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    drop_control_dependency=True)
+                    extra_toco_options)
 
 
 def toco_convert(graph_def_str, input_tensors, output_tensors,
-                 drop_control_dependency=False):
+                 extra_toco_options):
   """Convert a model's graph def into a tflite model.
 
   NOTE: this currently shells out to the toco binary, but we would like
@@ -298,9 +289,9 @@ def toco_convert(graph_def_str, input_tensors, output_tensors,
 
   Args:
     graph_def_str: Graph def proto in serialized string format.
-    input_tensors: List of input tensor tuples `(name, shape, type)`
-    output_tensors: List of output tensors (names)
-    drop_control_dependency: whether to ignore control dependency nodes.
+    input_tensors: List of input tensor tuples `(name, shape, type)`.
+    output_tensors: List of output tensors (names).
+    extra_toco_options: Additional toco options.
 
   Returns:
     output tflite model, log_txt from conversion
@@ -312,7 +303,7 @@ def toco_convert(graph_def_str, input_tensors, output_tensors,
       input_arrays=[x[0] for x in input_tensors],
       shapes=[x[1] for x in input_tensors],
       output_arrays=output_tensors,
-      drop_control_dependency=drop_control_dependency)
+      extra_toco_options=extra_toco_options)
 
   with tempfile.NamedTemporaryFile() as graphdef_file, \
        tempfile.NamedTemporaryFile() as output_file, \
@@ -341,7 +332,8 @@ def make_zip_of_tests(zip_path,
                       test_parameters,
                       make_graph,
                       make_test_inputs,
-                      drop_control_dependency=False):
+                      extra_toco_options=ExtraTocoOptions(),
+                      use_frozen_graph=False):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -359,7 +351,9 @@ def make_zip_of_tests(zip_path,
       `[input1, input2, ...], [output1, output2, ...]`
     make_test_inputs: function taking `curr_params`, `session`, `input_tensors`,
       `output_tensors` and returns tuple `(input_values, output_values)`.
-    drop_control_dependency: whether to ignore control dependency nodes.
+    extra_toco_options: Additional toco options.
+    use_frozen_graph: Whether or not freeze graph before toco converter.
+
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
   """
@@ -419,21 +413,25 @@ def make_zip_of_tests(zip_path,
           return None, report
         report["toco"] = report_lib.FAILED
         report["tf"] = report_lib.SUCCESS
-
         # Convert graph to toco
+        input_tensors = [(input_tensor.name.split(":")[0],
+                          input_tensor.get_shape(), input_tensor.dtype)
+                         for input_tensor in inputs]
+        output_tensors = [normalize_output_name(out.name) for out in outputs]
+        graph_def = freeze_graph(
+            sess,
+            tf.global_variables() + inputs +
+            outputs) if use_frozen_graph else sess.graph_def
         tflite_model_binary, toco_log = toco_convert(
-            sess.graph_def.SerializeToString(),
-            [(input_tensor.name.split(":")[0], input_tensor.get_shape(),
-              input_tensor.dtype) for input_tensor in inputs],
-            [normalize_output_name(out.name) for out in outputs],
-            drop_control_dependency)
+            graph_def.SerializeToString(), input_tensors, output_tensors,
+            extra_toco_options)
         report["toco"] = (report_lib.SUCCESS if tflite_model_binary is not None
                           else report_lib.FAILED)
         report["toco_log"] = toco_log
 
         if FLAGS.save_graphdefs:
           archive.writestr(label + ".pb",
-                           text_format.MessageToString(sess.graph_def),
+                           text_format.MessageToString(graph_def),
                            zipfile.ZIP_DEFLATED)
 
         if tflite_model_binary:
@@ -543,6 +541,18 @@ def make_pool_tests(pool_op_in):
   return f
 
 
+def make_l2_pool_tests(zip_path):
+  make_pool_tests(make_l2_pool)(zip_path)
+
+
+def make_avg_pool_tests(zip_path):
+  make_pool_tests(tf.nn.avg_pool)(zip_path)
+
+
+def make_max_pool_tests(zip_path):
+  make_pool_tests(tf.nn.max_pool)(zip_path)
+
+
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
 
@@ -706,7 +716,7 @@ def make_mean_tests(zip_path):
           [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
       ],
       "const_axis": [True, False],
-      "keep_dims": [True, False],
+      "keepdims": [True, False],
   }, {
       "input_dtype": [tf.float32, tf.int32, tf.int64],
       "input_shape": [[1, 224, 224, 3]],
@@ -717,7 +727,7 @@ def make_mean_tests(zip_path):
           [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
       ],
       "const_axis": [True, False],
-      "keep_dims": [True, False],
+      "keepdims": [True, False],
   }]
 
   def build_graph(parameters):
@@ -740,7 +750,7 @@ def make_mean_tests(zip_path):
       input_tensors = [input_tensor, axis]
 
     out = tf.reduce_mean(
-        input_tensor, axis=axis, keep_dims=parameters["keep_dims"])
+        input_tensor, axis=axis, keepdims=parameters["keepdims"])
     return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -783,11 +793,128 @@ def make_exp_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_log_softmax_tests(zip_path):
+  """Make a set of tests to do log_softmax."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[1, 100], [4, 2], [5, 224]],
+  }]
+
+  def build_graph(parameters):
+    """Build the log_softmax op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    out = tf.nn.log_softmax(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(
+            parameters["input_dtype"],
+            parameters["input_shape"],
+            min_value=-100,
+            max_value=9)
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_maximum_tests(zip_path):
+  """Make a set of tests to do maximum."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the maximum op testing graph."""
+    input_tensor_1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_1",
+        shape=parameters["input_shape_1"])
+    input_tensor_2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_2",
+        shape=parameters["input_shape_2"])
+
+    out = tf.maximum(input_tensor_1, input_tensor_2)
+    return [input_tensor_1, input_tensor_2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_1"]),
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_2"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_minimum_tests(zip_path):
+  """Make a set of tests to do minimum."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the minimum op testing graph."""
+    input_tensor_1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_1",
+        shape=parameters["input_shape_1"])
+    input_tensor_2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_2",
+        shape=parameters["input_shape_2"])
+
+    out = tf.minimum(input_tensor_1, input_tensor_2)
+    return [input_tensor_1, input_tensor_2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_1"]),
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_2"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_binary_op_tests_func(binary_operator):
   """Return a function that does a test on a binary operator."""
   return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
 
 
+def make_add_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.add)
+
+
+def make_div_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.div)
+
+
+def make_sub_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.subtract)
+
+
+def make_mul_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.multiply)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -795,12 +922,11 @@ def make_gather_tests(zip_path):
       # TODO(mgubin): add string tests when they are supported by Toco.
       # TODO(mgubin): add tests for Nd indices when they are supported by
       # TfLite.
-      # TODO(mgubin): add tests for axis != 0 when it is supported by TfLite.
       "params_dtype": [tf.float32, tf.int32],
       "params_shape": [[10], [1, 2, 20]],
       "indices_dtype": [tf.int32],
       "indices_shape": [[3], [5]],
-      "axis": [0],  # axis!=0 is GatherV2
+      "axis": [0, 1],
   }]
 
   def build_graph(parameters):
@@ -913,6 +1039,7 @@ def make_conv_tests(zip_path):
           "input_shape": [[1, 3, 4, 3]],
           "filter_shape": [[1, 1, 3, 2]],
           "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "dilations": [[1, 1, 1, 1], [1, 3, 2, 1], [1, 2, 2, 1]],
           "padding": ["SAME", "VALID"],
           "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
           "constant_filter": [True, False],
@@ -921,6 +1048,7 @@ def make_conv_tests(zip_path):
           "input_shape": [[2, 14, 14, 2]],
           "filter_shape": [[6, 6, 2, 2]],
           "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "dilations": [[1, 1, 1, 1], [1, 2, 2, 1]],
           "padding": ["SAME", "VALID"],
           "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
           "constant_filter": [True, False],
@@ -946,6 +1074,7 @@ def make_conv_tests(zip_path):
         input_tensor,
         filter_input,
         strides=parameters["strides"],
+        dilations=parameters["dilations"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
     return input_tensors, [out]
@@ -1056,7 +1185,7 @@ def make_split_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_concatenation_tests(zip_path):
+def make_concat_tests(zip_path):
   """Make a set of tests to do concatenation."""
 
   test_parameters = [{
@@ -1466,7 +1595,7 @@ def make_batch_to_space_nd_tests(zip_path):
   test_parameters = [
       {
           "dtype": [tf.float32, tf.int64, tf.int32],
-          "input_shape": [[12, 2, 2, 1]],
+          "input_shape": [[12, 3, 3, 1]],
           "block_shape": [[1, 4], [2, 2], [3, 4]],
           "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
           "constant_block_shape": [True, False],
@@ -1540,7 +1669,7 @@ def make_transpose_tests(zip_path):
   }, {
       "dtype": [tf.float32],
       "input_shape": [[1, 2, 3, 4, 5]],
-      "perm": [[0, 1, 2, 3, 4]],
+      "perm": [[4, 3, 2, 1, 0]],
       "constant_perm": [True, False],
   }]
 
@@ -1629,19 +1758,7 @@ def make_strided_slice_tests(zip_path):
           "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
           "constant_indices": [False, True],
       },
-      #
-      {
-          "dtype": [tf.float32],
-          "index_type": [tf.int32],
-          "input_shape": [[12, 2, 2, 5]],
-          "begin": [[0]],
-          "end": [[1]],
-          "strides": [[1]],
-          "begin_mask": [0],
-          "end_mask": [0],
-          "shrink_axis_mask": [1],
-          "constant_indices": [True],
-      },
+      # TODO(b/73170889) Restore test paramaters removed in cl/191608113.
       # 2-D
       {
           "dtype": [tf.float32, tf.int32, tf.int64],
@@ -1655,6 +1772,19 @@ def make_strided_slice_tests(zip_path):
           "shrink_axis_mask": [None, 1, 2, 3, -1],
           "constant_indices": [False, True],
       },
+      # 1-D Exhaustive
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[4]],
+          "begin": [[-100], [-3], [-2], [-1], [0], [1], [2], [3], [100]],
+          "end": [[-100], [-3], [-2], [-1], [0], [1], [2], [3], [100]],
+          "strides": [-2, -1, 1, 2],
+          "begin_mask": [0, 1],
+          "end_mask": [0, 1],
+          "shrink_axis_mask": [0],
+          "constant_indices": [False],
+      },
       # Negative strides
       {
           "dtype": [tf.float32],
@@ -1730,6 +1860,84 @@ def make_strided_slice_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_lstm_tests(zip_path):
+  """Make a set of tests to do basic Lstm cell."""
+
+  test_parameters = [
+      {
+          "dtype": [tf.float32],
+          "num_batchs": [1],
+          "time_step_size": [1],
+          "input_vec_size": [3],
+          "num_cells": [4],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build a simple graph with BasicLSTMCell."""
+
+    num_batchs = parameters["num_batchs"]
+    time_step_size = parameters["time_step_size"]
+    input_vec_size = parameters["input_vec_size"]
+    num_cells = parameters["num_cells"]
+    inputs_after_split = []
+    for i in xrange(time_step_size):
+      one_timestamp_input = tf.placeholder(
+          dtype=parameters["dtype"],
+          name="split_{}".format(i),
+          shape=[num_batchs, input_vec_size])
+      inputs_after_split.append(one_timestamp_input)
+    # Currently lstm identifier has a few limitations: only supports
+    # forget_bias == 0, inner state activiation == tanh.
+    # TODO(zhixianyan): Add another test with forget_bias == 1.
+    # TODO(zhixianyan): Add another test with relu as activation.
+    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
+        num_cells, forget_bias=0.0, state_is_tuple=True)
+    cell_outputs, _ = rnn.static_rnn(
+        lstm_cell, inputs_after_split, dtype=tf.float32)
+    out = cell_outputs[-1]
+    return inputs_after_split, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Feed inputs, assign vairables, and freeze graph."""
+
+    with tf.variable_scope("", reuse=True):
+      kernel = tf.get_variable("rnn/basic_lstm_cell/kernel")
+      bias = tf.get_variable("rnn/basic_lstm_cell/bias")
+      kernel_values = create_tensor_data(
+          parameters["dtype"], [kernel.shape[0], kernel.shape[1]], -1, 1)
+      bias_values = create_tensor_data(parameters["dtype"], [bias.shape[0]], 0,
+                                       1)
+      sess.run(tf.group(kernel.assign(kernel_values), bias.assign(bias_values)))
+
+    num_batchs = parameters["num_batchs"]
+    time_step_size = parameters["time_step_size"]
+    input_vec_size = parameters["input_vec_size"]
+    input_values = []
+    for _ in xrange(time_step_size):
+      tensor_data = create_tensor_data(parameters["dtype"],
+                                       [num_batchs, input_vec_size], 0, 1)
+      input_values.append(tensor_data)
+    out = sess.run(outputs, feed_dict=dict(zip(inputs, input_values)))
+    return input_values, out
+
+  # TODO(zhixianyan): Automatically generate rnn_states for lstm cell.
+  extra_toco_options = ExtraTocoOptions()
+  extra_toco_options.rnn_states = (
+      "{state_array:rnn/BasicLSTMCellZeroState/zeros,"
+      "back_edge_source_array:rnn/basic_lstm_cell/Add_1,size:4},"
+      "{state_array:rnn/BasicLSTMCellZeroState/zeros_1,"
+      "back_edge_source_array:rnn/basic_lstm_cell/Mul_2,size:4}")
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      extra_toco_options,
+      use_frozen_graph=True)
+
+
 def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
   """Given an input perform a sequence of TensorFlow ops to produce l2pool."""
   return tf.sqrt(tf.nn.avg_pool(
@@ -1738,7 +1946,7 @@ def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
 
 
 def make_topk_tests(zip_path):
-  """Make a set of tests to do gather."""
+  """Make a set of tests to do topk."""
 
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
@@ -1746,7 +1954,7 @@ def make_topk_tests(zip_path):
   }]
 
   def build_graph(parameters):
-    """Build the gather op testing graph."""
+    """Build the topk op testing graph."""
     input_value = tf.placeholder(
         dtype=parameters["input_dtype"],
         name="input",
@@ -1763,6 +1971,96 @@ def make_topk_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+
+def make_arg_max_tests(zip_path):
+  """Make a set of tests to do arg_max."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
+      "axis": [0, 1, 2, 3],
+      "output_type": [tf.int32, tf.int64],
+  }]
+
+  def build_graph(parameters):
+    """Build the topk op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    axis = tf.constant(parameters["axis"], name="axis")
+    out = tf.arg_max(input_value, axis, output_type=parameters["output_type"])
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_less_tests(zip_path):
+  """Make a set of tests to do less."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the less op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.less(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_floor_tests(zip_path):
+  """Make a set of tests to do floor."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    """Build the floor op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.floor(input_value)
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict={inputs[0]: input_value})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
@@ -1775,65 +2073,26 @@ def main(unused_args):
       if not os.path.isdir(x):
         raise RuntimeError("Failed to create dir %r" % x)
 
-  if FLAGS.type == "zipped":
-    opstest_path = os.path.join(FLAGS.output_path)
-    mkdir_if_not_exist(opstest_path)
-    def _path(filename):
-      return os.path.join(opstest_path, filename)
-
-    dispatch = {
-        "control_dep.zip": make_control_dep_tests,
-        "add.zip": make_binary_op_tests_func(tf.add),
-        "space_to_batch_nd.zip": make_space_to_batch_nd_tests,
-        "div.zip": make_binary_op_tests_func(tf.div),
-        "sub.zip": make_binary_op_tests_func(tf.subtract),
-        "batch_to_space_nd.zip": make_batch_to_space_nd_tests,
-        "conv.zip": make_conv_tests,
-        "constant.zip": make_constant_tests,
-        "depthwiseconv.zip": make_depthwiseconv_tests,
-        "concat.zip": make_concatenation_tests,
-        "fully_connected.zip": make_fully_connected_tests,
-        "global_batch_norm.zip": make_global_batch_norm_tests,
-        "gather.zip": make_gather_tests,
-        "fused_batch_norm.zip": make_fused_batch_norm_tests,
-        "l2norm.zip": make_l2norm_tests,
-        "local_response_norm.zip": make_local_response_norm_tests,
-        "mul.zip": make_binary_op_tests_func(tf.multiply),
-        "relu.zip": make_relu_tests,
-        "relu1.zip": make_relu1_tests,
-        "relu6.zip": make_relu6_tests,
-        "l2_pool.zip": make_pool_tests(make_l2_pool),
-        "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
-        "max_pool.zip": make_pool_tests(tf.nn.max_pool),
-        "pad.zip": make_pad_tests,
-        "reshape.zip": make_reshape_tests,
-        "resize_bilinear.zip": make_resize_bilinear_tests,
-        "sigmoid.zip": make_sigmoid_tests,
-        "softmax.zip": make_softmax_tests,
-        "space_to_depth.zip": make_space_to_depth_tests,
-        "topk.zip": make_topk_tests,
-        "split.zip": make_split_tests,
-        "transpose.zip": make_transpose_tests,
-        "mean.zip": make_mean_tests,
-        "squeeze.zip": make_squeeze_tests,
-        "strided_slice.zip": make_strided_slice_tests,
-        "exp.zip": make_exp_tests,
-    }
-    out = FLAGS.zip_to_output
-    bin_path = FLAGS.toco
-    if out in dispatch:
-      dispatch[out](_path(out))
-    else:
-      raise RuntimeError("Invalid zip to output %r" % out)
+  opstest_path = os.path.join(FLAGS.output_path)
+  mkdir_if_not_exist(opstest_path)
 
-  else:
-    raise RuntimeError("Invalid argument for type of generation.")
+  out = FLAGS.zip_to_output
+  bin_path = FLAGS.toco
+  test_function = ("make_%s_tests" % out.replace(".zip", ""))
+  if test_function not in globals():
+    raise RuntimeError("Can't find a test function to create %r. Tried %r" %
+                       (out, test_function))
+
+  # TODO(ahentz): accessing globals() is not very elegant. We should either
+  # break this file into multiple tests or use decorator-based registration to
+  # avoid using globals().
+  globals()[test_function](os.path.join(opstest_path, out))
 
 
 if __name__ == "__main__":
   FLAGS, unparsed = parser.parse_known_args()
 
   if unparsed:
-    print("Usage: %s <path out> zipped <zip file to generate>")
+    print("Usage: %s <path out> <zip file to generate>")
   else:
     tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6580845af42b3cdded19b578b41c682089aaf9ef
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -0,0 +1,109 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/testing/generate_testspec.h"
+#include "tensorflow/contrib/lite/testing/join.h"
+#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/contrib/lite/testing/tf_driver.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tflite {
+namespace testing {
+
+template <typename T>
+void GenerateCsv(const std::vector<int>& shape, float min, float max,
+                 string* out) {
+  auto random_float = [](int min, int max) {
+    static unsigned int seed;
+    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(shape, random_t);
+  *out = Join(data.data(), data.size(), ",");
+}
+
+bool GenerateTestSpecFromTensorflowModel(
+    std::iostream& stream, const string& tensorflow_model_path,
+    const string& tflite_model_path, const std::vector<string>& input_layer,
+    const std::vector<string>& input_layer_type,
+    const std::vector<string>& input_layer_shape,
+    const std::vector<string>& output_layer) {
+  CHECK_EQ(input_layer.size(), input_layer_type.size());
+  CHECK_EQ(input_layer.size(), input_layer_shape.size());
+
+  // Generate inputs.
+  std::vector<string> input_values;
+  input_values.resize(input_layer.size());
+  for (int i = 0; i < input_layer.size(); i++) {
+    tensorflow::DataType type;
+    CHECK(DataTypeFromString(input_layer_type[i], &type));
+    auto shape = Split<int>(input_layer_shape[i], ",");
+
+    switch (type) {
+      case tensorflow::DT_FLOAT:
+        GenerateCsv<float>(shape, -0.5, 0.5, &input_values[i]);
+        break;
+      case tensorflow::DT_UINT8:
+        GenerateCsv<uint8_t>(shape, 0, 255, &input_values[i]);
+        break;
+      case tensorflow::DT_INT32:
+        GenerateCsv<int32_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_INT64:
+        GenerateCsv<int64_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_BOOL:
+        GenerateCsv<int>(shape, 0.01, 1.99, &input_values[i]);
+        break;
+      default:
+        fprintf(stderr, "Unsupported type %d (%s) when generating testspec.\n",
+                type, input_layer_type[i].c_str());
+        return false;
+    }
+  }
+
+  // Invoke tensorflow model.
+  TfDriver runner(input_layer, input_layer_type, input_layer_shape,
+                  output_layer);
+  runner.LoadModel(tensorflow_model_path);
+  for (int i = 0; i < input_values.size(); i++) {
+    runner.SetInput(i, input_values[i]);
+  }
+  runner.Invoke();
+
+  // Write test spec.
+  stream << "load_model: " << tflite_model_path << "\n";
+  stream << "reshape {\n";
+  for (const auto& shape : input_layer_shape) {
+    stream << "  input: \"" << shape << "\"\n";
+  }
+  stream << "}\n";
+  stream << "invoke {\n";
+  for (const auto& value : input_values) {
+    stream << "  input: \"" << value << "\"\n";
+  }
+  for (int i = 0; i < output_layer.size(); i++) {
+    stream << "  output: \"" << runner.ReadOutput(i) << "\"\n";
+  }
+  stream << "}\n";
+
+  return true;
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/contrib/lite/testing/generate_testspec.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e31a853c3f7f82a89126ff83af784ffd418741a
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/generate_testspec.h
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_GENERATE_TESTSPEC_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_GENERATE_TESTSPEC_H_
+
+#include <functional>
+#include <iostream>
+#include <vector>
+
+namespace tflite {
+namespace testing {
+
+// Generate test spec by executing TensorFlow model on random inputs.
+// The test spec can be consumed by ParseAndRunTests.
+// See test spec format in parse_testdata.h
+//
+// Inputs:
+//   stream: mutable iostream that contains the contents of test spec.
+//   tensorflow_model_path: path to TensorFlow model.
+//   tflite_model_path: path to tflite_model_path that the test spec runs
+//   against. input_layer: names of input tensors. Example: input1
+//   input_layer_type: datatypes of input tensors. Example: float
+//   input_layer_shape: shapes of input tensors, separated by comma. example:
+//   1,3,4 output_layer: names of output tensors. Example: output
+bool GenerateTestSpecFromTensorflowModel(
+    std::iostream& stream, const string& tensorflow_model_path,
+    const string& tflite_model_path, const std::vector<string>& input_layer,
+    const std::vector<string>& input_layer_type,
+    const std::vector<string>& input_layer_shape,
+    const std::vector<string>& output_layer);
+
+// Generates random values that are filled into the tensor.
+// random_func returns the generated random element at given index.
+template <typename T>
+std::vector<T> GenerateRandomTensor(const std::vector<int>& shape,
+                                    const std::function<T(int)>& random_func) {
+  int64_t num_elements = 1;
+  for (const int dim : shape) {
+    num_elements *= dim;
+  }
+
+  std::vector<T> result(num_elements);
+  for (int i = 0; i < num_elements; i++) {
+    result[i] = random_func(i);
+  }
+  return result;
+}
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_GENERATE_TESTSPEC_H_
diff --git a/tensorflow/contrib/lite/testing/generate_testspec_test.cc b/tensorflow/contrib/lite/testing/generate_testspec_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a97b757a413246c9ad9b5f453741b13e381c903
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/generate_testspec_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/generate_testspec.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+TEST(GenerateRandomTensor, FloatValue) {
+  static unsigned int seed = 0;
+  std::function<float(int)> float_rand = [](int idx) {
+    return static_cast<float>(rand_r(&seed)) / RAND_MAX - 0.5f;
+  };
+
+  std::set<float> values;
+  float sum_x_square = 0.0f;
+  float sum_x = 0.0f;
+  for (int i = 0; i < 100; i++) {
+    const auto& data = GenerateRandomTensor<float>({1, 3, 4}, float_rand);
+    for (float value : data) {
+      values.insert(value);
+      sum_x_square += value * value;
+      sum_x += value;
+    }
+  }
+
+  // Eech round, generated tensor has different values.
+  EXPECT_GT(values.size(), 200);
+  int num = 1 * 3 * 4 * 100;
+  float stddev = sum_x_square / num - (sum_x / num) * (sum_x / num);
+
+  // Stddev is greater than 1/2 stddev of uniform distribution: (B-A)^2 / 12
+  float minstddev = 1.0f / 12 / 2;
+  EXPECT_GT(stddev, minstddev);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 49766cedac8d1acd96f9b38665119e99f8bb9ac0..34abb213c937cc2783ed071f68a34d9a4ad67e18 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -47,10 +47,6 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // Key is a substring of the test name and value is a bug number.
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
-    // Sub and Div don't support broadcasting.
-    {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-
     // Add only supports float32. (and "constant" tests use Add)
     {R"(^\/adda.*int32)", "68808744"},
     {R"(^\/constant.*int32)", "68808744"},
@@ -92,6 +88,17 @@ std::map<string, string> kBrokenTests = {
 
     // Transpose only supports 1D-4D input tensors.
     {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
+
+    // PRelu only supports 4D input with (1, 1, channels) 3D alpha now.
+    {R"(^\/prelu.*shared_axes=\[1\])", "75975192"},
+
+    // No support for axis!=0 in GatherV2.
+    {R"(^\/gather.*axis=1)", "76910444"},
+
+    // No support for arbitrary dimensions in ArgMax.
+    {R"(^\/arg_max.*axis=0)", "77546240"},
+    {R"(^\/arg_max.*axis=1)", "77546240"},
+    {R"(^\/arg_max.*axis=2)", "77546240"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -234,25 +241,33 @@ TEST_P(OpsTest, RunStuff) {
       ::testing::ValuesIn(UnarchiveZipAndFindTestNames(#zip_base ".zip")));
 
 INSTANTIATE_TESTS(add)
+INSTANTIATE_TESTS(arg_max)
 INSTANTIATE_TESTS(avg_pool)
-INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
 INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
 INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
+INSTANTIATE_TESTS(div)
 INSTANTIATE_TESTS(exp)
+INSTANTIATE_TESTS(floor)
 INSTANTIATE_TESTS(fully_connected)
 INSTANTIATE_TESTS(fused_batch_norm)
 INSTANTIATE_TESTS(gather)
 INSTANTIATE_TESTS(global_batch_norm)
-INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(l2_pool)
+INSTANTIATE_TESTS(l2norm)
+INSTANTIATE_TESTS(less)
 INSTANTIATE_TESTS(local_response_norm)
+INSTANTIATE_TESTS(log_softmax)
 INSTANTIATE_TESTS(max_pool)
+INSTANTIATE_TESTS(maximum)
+INSTANTIATE_TESTS(mean)
+INSTANTIATE_TESTS(minimum)
 INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
+// INSTANTIATE_TESTS(prelu)
 INSTANTIATE_TESTS(relu)
 INSTANTIATE_TESTS(relu1)
 INSTANTIATE_TESTS(relu6)
@@ -260,14 +275,13 @@ INSTANTIATE_TESTS(reshape)
 INSTANTIATE_TESTS(resize_bilinear)
 INSTANTIATE_TESTS(sigmoid)
 INSTANTIATE_TESTS(softmax)
+INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(space_to_depth)
-INSTANTIATE_TESTS(sub)
 INSTANTIATE_TESTS(split)
-INSTANTIATE_TESTS(div)
-INSTANTIATE_TESTS(transpose)
-INSTANTIATE_TESTS(mean)
 INSTANTIATE_TESTS(squeeze)
 INSTANTIATE_TESTS(strided_slice)
+INSTANTIATE_TESTS(sub)
+INSTANTIATE_TESTS(transpose)
 
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.cc b/tensorflow/contrib/lite/testing/parse_testdata.cc
index 0caef0fe2201a668b2235a98304eb353072a3c2f..389688d552051ea735ce71533943af33df5059ef 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.cc
+++ b/tensorflow/contrib/lite/testing/parse_testdata.cc
@@ -192,27 +192,25 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
   int model_outputs = interpreter->outputs().size();
   TF_LITE_ENSURE_EQ(context, model_outputs, example.outputs.size());
   for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+    bool tensors_differ = false;
     int output_index = interpreter->outputs()[i];
     if (const float* data = interpreter->typed_tensor<float>(output_index)) {
       for (size_t idx = 0; idx < example.outputs[i].flat_data.size(); idx++) {
         float computed = data[idx];
         float reference = example.outputs[0].flat_data[idx];
         float diff = std::abs(computed - reference);
-        bool error_is_large = false;
         // For very small numbers, try absolute error, otherwise go with
         // relative.
-        if (std::abs(reference) < kRelativeThreshold) {
-          error_is_large = (diff > kAbsoluteThreshold);
-        } else {
-          error_is_large = (diff > kRelativeThreshold * std::abs(reference));
-        }
-        if (error_is_large) {
+        bool local_tensors_differ =
+            std::abs(reference) < kRelativeThreshold
+                ? diff > kAbsoluteThreshold
+                : diff > kRelativeThreshold * std::abs(reference);
+        if (local_tensors_differ) {
           fprintf(stdout, "output[%zu][%zu] did not match %f vs reference %f\n",
                   i, idx, data[idx], reference);
-          return kTfLiteError;
+          tensors_differ = local_tensors_differ;
         }
       }
-      fprintf(stderr, "\n");
     } else if (const int32_t* data =
                    interpreter->typed_tensor<int32_t>(output_index)) {
       for (size_t idx = 0; idx < example.outputs[i].flat_data.size(); idx++) {
@@ -221,10 +219,9 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
         if (std::abs(computed - reference) > 0) {
           fprintf(stderr, "output[%zu][%zu] did not match %d vs reference %d\n",
                   i, idx, computed, reference);
-          return kTfLiteError;
+          tensors_differ = true;
         }
       }
-      fprintf(stderr, "\n");
     } else if (const int64_t* data =
                    interpreter->typed_tensor<int64_t>(output_index)) {
       for (size_t idx = 0; idx < example.outputs[i].flat_data.size(); idx++) {
@@ -235,14 +232,15 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
                   "output[%zu][%zu] did not match %" PRId64
                   " vs reference %" PRId64 "\n",
                   i, idx, computed, reference);
-          return kTfLiteError;
+          tensors_differ = true;
         }
       }
-      fprintf(stderr, "\n");
     } else {
       fprintf(stderr, "output[%zu] was not float or int data\n", i);
       return kTfLiteError;
     }
+    fprintf(stderr, "\n");
+    if (tensors_differ) return kTfLiteError;
   }
   return kTfLiteOk;
 }
@@ -319,8 +317,9 @@ class Reshape : public Message {
 // This is the top-level message in a test file.
 class TestData : public Message {
  public:
-  explicit TestData(TestRunner* test_runner) : test_runner_(test_runner) {}
-
+  explicit TestData(TestRunner* test_runner)
+      : test_runner_(test_runner), num_invocations_(0), max_invocations_(-1) {}
+  void SetMaxInvocations(int max) { max_invocations_ = max; }
   void SetField(const std::string& name, const std::string& value) override {
     if (name == "load_model") {
       test_runner_->LoadModel(value);
@@ -334,7 +333,12 @@ class TestData : public Message {
   Message* AddChild(const std::string& s) override {
     if (s == "invoke") {
       test_runner_->AllocateTensors();
-      return Store(new Invoke(test_runner_));
+      if (max_invocations_ == -1 || num_invocations_ < max_invocations_) {
+        ++num_invocations_;
+        return Store(new Invoke(test_runner_));
+      } else {
+        return nullptr;
+      }
     } else if (s == "reshape") {
       return Store(new Reshape(test_runner_));
     }
@@ -343,10 +347,14 @@ class TestData : public Message {
 
  private:
   TestRunner* test_runner_;
+  int num_invocations_;
+  int max_invocations_;
 };
 
-bool ParseAndRunTests(std::istream* input, TestRunner* test_runner) {
+bool ParseAndRunTests(std::istream* input, TestRunner* test_runner,
+                      int max_invocations) {
   TestData test_data(test_runner);
+  test_data.SetMaxInvocations(max_invocations);
   Message::Read(input, &test_data);
   return test_runner->IsValid() && test_runner->GetOverallSuccess();
 }
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.h b/tensorflow/contrib/lite/testing/parse_testdata.h
index 7ebf362eb99c5f4cf6ea3654cf71e13ff1de99b3..d94361d735e2be8dc130dc8d6bf0bb5c822ebb7c 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.h
+++ b/tensorflow/contrib/lite/testing/parse_testdata.h
@@ -66,7 +66,8 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter, const Example&);
 //     output: "12,3,4,545,3"
 //     output: "0.01,0.02"
 //   }
-bool ParseAndRunTests(std::istream* input, TestRunner* test_runner);
+bool ParseAndRunTests(std::istream* input, TestRunner* test_runner,
+                      int max_invocations = -1);
 
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/contrib/lite/testing/split.h
index 428cfda4f216f0ee6409a32c43a4cf91ecc11922..896f2949efa6aeda76940bae18a11dccf3c2f01b 100644
--- a/tensorflow/contrib/lite/testing/split.h
+++ b/tensorflow/contrib/lite/testing/split.h
@@ -80,6 +80,16 @@ inline std::vector<uint8_t> Split(const string& s, const string& delimiter) {
   return fields;
 }
 
+template <>
+inline std::vector<bool> Split(const string& s, const string& delimiter) {
+  std::vector<bool> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(
+        static_cast<bool>(strtol(s.data() + p.first, nullptr, 10)));
+  }
+  return fields;
+}
+
 }  // namespace testing
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/split_test.cc b/tensorflow/contrib/lite/testing/split_test.cc
index 3d1e25d9c7dab50984928adfe0d7392675578662..76b918cbcd83ef43c52057b84bcc2a8f4ff6b8f7 100644
--- a/tensorflow/contrib/lite/testing/split_test.cc
+++ b/tensorflow/contrib/lite/testing/split_test.cc
@@ -52,6 +52,11 @@ TEST(SplitTest, SplitUint8) {
   EXPECT_THAT(Split<uint8_t>("1,-1,258", ","), ElementsAre(1, 255, 2));
 }
 
+TEST(SplitTest, SplitBool) {
+  EXPECT_THAT(Split<bool>("1, 0, 0, 1", ","),
+              ElementsAre(true, false, false, true));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 2c253bb1983e5ddc5bc12858c929585d1bcee710..7b295875aab12bf48da2341ce05dd53442464cf0 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -87,10 +87,9 @@ TfDriver::TfDriver(const std::vector<string>& input_layer,
 
 void TfDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
   std::ifstream model(bin_file_path);
   if (model.fail()) {
-    Invalidate("Failed to find the model");
+    Invalidate("Failed to find the model " + bin_file_path);
     return;
   }
 
@@ -121,6 +120,10 @@ void TfDriver::SetInput(int id, const string& csv_values) {
       FillTensorWithData<int32_t>(&tensor, csv_values);
       break;
     }
+    case tensorflow::DT_UINT8: {
+      FillTensorWithData<uint8_t>(&tensor, csv_values);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
@@ -162,6 +165,8 @@ string TfDriver::ReadOutput(int id) {
       return TensorDataToCsvString<float>(output_tensors_[id]);
     case tensorflow::DT_INT32:
       return TensorDataToCsvString<int32_t>(output_tensors_[id]);
+    case tensorflow::DT_UINT8:
+      return TensorDataToCsvString<uint8_t>(output_tensors_[id]);
     default:
       fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/compiler/xla/array2d.cc b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
similarity index 51%
rename from tensorflow/compiler/xla/array2d.cc
rename to tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
index 418587c1f75c7249f92e925455d40685d870c57a..5afa0f800cdaa8bf70a11cb6e2ac64ace8138e79 100644
--- a/tensorflow/compiler/xla/array2d.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
@@ -13,24 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/contrib/lite/testing/tflite_diff_flags.h"
+#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
 
-namespace xla {
-
-std::unique_ptr<Array2D<float>> MakeLinspaceArray2D(float from, float to,
-                                                    int64 n1, int64 n2) {
-  auto array = MakeUnique<Array2D<float>>(n1, n2);
-  int64 count = n1 * n2;
-  float step = (count > 1) ? (to - from) / (count - 1) : 0.0f;
-  auto set = [&array, n1, n2](int64 index, float value) {
-    (*array)(index / n2, index % n2) = value;
-  };
-  for (int64 i = 0; i < count - 1; ++i) {
-    set(i, from + i * step);
+int main(int argc, char** argv) {
+  ::tflite::testing::DiffOptions options =
+      ::tflite::testing::ParseTfliteDiffFlags(&argc, argv);
+  if (options.tensorflow_model.empty()) return 1;
+  int failure_count = 0;
+  for (int i = 0; i < 100; i++) {
+    if (!tflite::testing::RunDiffTest(options)) {
+      ++failure_count;
+    }
   }
-  set(count - 1, to);
-  return array;
+  fprintf(stderr, "Num errors: %d\n", failure_count);
+  return failure_count != 0 ? 1 : 0;
 }
-
-}  // namespace xla
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..706108ed73bb3fd9bd784cffffe322d6981433e6
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
+
+#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tflite {
+namespace testing {
+
+DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
+  struct {
+    string tensorflow_model;
+    string tflite_model;
+    string input_layer;
+    string input_layer_type;
+    string input_layer_shape;
+    string output_layer;
+  } values;
+
+  std::vector<tensorflow::Flag> flags = {
+      tensorflow::Flag("tensorflow_model", &values.tensorflow_model,
+                       "Path of tensorflow model."),
+      tensorflow::Flag("tflite_model", &values.tflite_model,
+                       "Path of tensorflow lite model."),
+      tensorflow::Flag("input_layer", &values.input_layer,
+                       "Names of input tensors, separated by comma. Example: "
+                       "input_1,input_2"),
+      tensorflow::Flag("input_layer_type", &values.input_layer_type,
+                       "Data types of input tensors, separated by comma. "
+                       "Example: float,int"),
+      tensorflow::Flag(
+          "input_layer_shape", &values.input_layer_shape,
+          "Shapes of input tensors, separated by colon. Example: 1,3,4,1:2"),
+      tensorflow::Flag("output_layer", &values.output_layer,
+                       "Names of output tensors, separated by comma. Example "
+                       "output_1,output_2"),
+  };
+
+  bool no_inputs = *argc == 1;
+  bool success = tensorflow::Flags::Parse(argc, argv, flags);
+  if (!success || no_inputs || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
+    fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
+  }
+
+  return {values.tensorflow_model,
+          values.tflite_model,
+          Split<string>(values.input_layer, ","),
+          Split<string>(values.input_layer_type, ","),
+          Split<string>(values.input_layer_shape, ":"),
+          Split<string>(values.output_layer, ",")};
+}
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f601d3752ddb5df9f2b5ac73d9bc303efaade4a5
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+
+#include "tensorflow/contrib/lite/testing/generate_testspec.h"
+#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+
+bool RunDiffTest(const DiffOptions& options) {
+  std::stringstream tflite_stream;
+  if (!GenerateTestSpecFromTensorflowModel(
+          tflite_stream, options.tensorflow_model, options.tflite_model,
+          options.input_layer, options.input_layer_type,
+          options.input_layer_shape, options.output_layer))
+    return false;
+  TfLiteDriver tflite_driver(/*use_nnapi=*/true);
+  tflite_driver.LoadModel(options.tflite_model);
+  return tflite::testing::ParseAndRunTests(&tflite_stream, &tflite_driver);
+}
+}  // namespace testing
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.h b/tensorflow/contrib/lite/testing/tflite_diff_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..326fa6c3e28000dee9b6eb9cc5b3a6c5c87e28d0
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.h
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_UTIL_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Configurations to run Tflite diff test.
+struct DiffOptions {
+  // Path of tensorflow model.
+  string tensorflow_model;
+  // Path of tensorflow lite model.
+  string tflite_model;
+  // Names of input tensors.
+  // Example: input_1,input_2
+  std::vector<string> input_layer;
+  // Data types of input tensors.
+  // Example: float,int
+  std::vector<string> input_layer_type;
+  // Shapes of input tensors, separated by comma.
+  // Example: 1,3,4,1
+  std::vector<string> input_layer_shape;
+  // Names of output tensors.
+  // Example output_1,output_2
+  std::vector<string> output_layer;
+};
+
+// Run a single TensorFLow Lite diff test with a given options.
+bool RunDiffTest(const DiffOptions& options);
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_UTIL_H_
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 613223f3d4ff212cb8672494243b2d7a1d06b3db..58fe5bd6e40b3d5979d64fae659eb39bfe87c265 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -42,6 +42,10 @@ template <>
 uint8_t Value(const TfLitePtrUnion& data, int index) {
   return data.uint8[index];
 }
+template <>
+bool Value(const TfLitePtrUnion& data, int index) {
+  return data.b[index];
+}
 
 template <typename T>
 void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
@@ -56,12 +60,16 @@ void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
 
 class TfLiteDriver::Expectation {
  public:
-  Expectation() { data_.raw = nullptr; }
+  Expectation() {
+    data_.raw = nullptr;
+    num_elements_ = 0;
+  }
   ~Expectation() { delete[] data_.raw; }
   template <typename T>
   void SetData(const string& csv_values) {
     const auto& values = testing::Split<T>(csv_values, ",");
-    data_.raw = new char[values.size() * sizeof(T)];
+    num_elements_ = values.size();
+    data_.raw = new char[num_elements_ * sizeof(T)];
     SetTensorData(values, &data_);
   }
 
@@ -75,6 +83,8 @@ class TfLiteDriver::Expectation {
         return TypedCheck<int64_t>(verbose, tensor);
       case kTfLiteUInt8:
         return TypedCheck<uint8_t>(verbose, tensor);
+      case kTfLiteBool:
+        return TypedCheck<bool>(verbose, tensor);
       default:
         fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
         return false;
@@ -88,7 +98,13 @@ class TfLiteDriver::Expectation {
     constexpr double kRelativeThreshold = 1e-2f;
     constexpr double kAbsoluteThreshold = 1e-4f;
 
-    int tensor_size = tensor.bytes / sizeof(T);
+    size_t tensor_size = tensor.bytes / sizeof(T);
+
+    if (tensor_size != num_elements_) {
+      std::cerr << "Expected a tensor with " << num_elements_
+                << " elements, got " << tensor_size << std::endl;
+      return false;
+    }
 
     bool good_output = true;
     for (int i = 0; i < tensor_size; ++i) {
@@ -115,6 +131,7 @@ class TfLiteDriver::Expectation {
   }
 
   TfLitePtrUnion data_;
+  size_t num_elements_;
 };
 
 TfLiteDriver::TfLiteDriver(bool use_nnapi) : use_nnapi_(use_nnapi) {}
@@ -132,7 +149,6 @@ void TfLiteDriver::AllocateTensors() {
 
 void TfLiteDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
 
   model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
   if (!model_) {
@@ -193,6 +209,12 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       SetTensorData(values, &tensor->data);
       break;
     }
+    case kTfLiteBool: {
+      const auto& values = testing::Split<bool>(csv_values, ",");
+      if (!CheckSizes<bool>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
       Invalidate("Unsupported tensor data type");
@@ -221,6 +243,9 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
     case kTfLiteUInt8:
       expected_output_[id]->SetData<uint8_t>(csv_values);
       break;
+    case kTfLiteBool:
+      expected_output_[id]->SetData<bool>(csv_values);
+      break;
     default:
       fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/contrib/lite/tflite_static.bp b/tensorflow/contrib/lite/tflite_static.bp
index 9b7d0adbbfa691e326abd73542c1c422cbb30ec1..910a0d33f5e94bad2386864a736875396b815db8 100644
--- a/tensorflow/contrib/lite/tflite_static.bp
+++ b/tensorflow/contrib/lite/tflite_static.bp
@@ -29,18 +29,26 @@ cc_library_static {
         "optional_debug_tools.cc",
         "simple_memory_arena.cc",
         "string_util.cc",
+        "util.cc",
         "kernels/activations.cc",
         "kernels/add.cc",
+        "kernels/arg_max.cc",
         "kernels/basic_rnn.cc",
         "kernels/batch_to_space_nd.cc",
-	"kernels/bidirectional_sequence_rnn.cc",
+        "kernels/bidirectional_sequence_rnn.cc",
+        "kernels/bidirectional_sequence_lstm.cc",
+        "kernels/cast.cc",
+        "kernels/comparisons.cc",
         "kernels/concatenation.cc",
         "kernels/conv.cc",
         "kernels/depthwise_conv.cc",
+        "kernels/dequantize.cc",
         "kernels/div.cc",
-	"kernels/exp.cc",
+        "kernels/eigen_support.cc",
+        "kernels/exp.cc",
         "kernels/embedding_lookup.cc",
         "kernels/embedding_lookup_sparse.cc",
+        "kernels/floor.cc",
         "kernels/fully_connected.cc",
         "kernels/gather.cc",
         "kernels/gemm_support.cc",
@@ -50,6 +58,7 @@ cc_library_static {
         "kernels/local_response_norm.cc",
         "kernels/lsh_projection.cc",
         "kernels/lstm.cc",
+        "kernels/maximum_minimum.cc",
         "kernels/mean.cc",
         "kernels/mul.cc",
         "kernels/pad.cc",
@@ -96,8 +105,9 @@ cc_library_static {
         "-Wno-mismatched-tags",
         "-Wno-missing-field-initializers",
         "-Wno-sign-compare",
-	"-Wno-typedef-redefinition",
+        "-Wno-typedef-redefinition",
         "-Wno-unused-lambda-capture",
+        "-Wno-unused-local-typedef",
         "-Wno-unused-parameter",
         "-Wno-unused-variable",
     ],
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index e2879fad327d965799d84da5c9092a12a36aa65b..f92e546ab8aa3ce8a70829dbec32c96b5d4a9d5d 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -124,6 +124,7 @@ cc_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -142,6 +143,7 @@ cc_library(
         ":toco_graphviz_dump_options",
         ":toco_port",
         ":types_proto_cc",
+        "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
@@ -167,12 +169,48 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "toco_saved_model",
+    srcs = [
+        "toco_saved_model.cc",
+    ],
+    hdrs = [
+        "toco_saved_model.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_flags_proto_cc",
+        ":types_proto_cc",
+        "//tensorflow/cc/tools:freeze_saved_model",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "toco_saved_model_test",
+    srcs = ["toco_saved_model_test.cc"],
+    deps = [
+        ":model_cmdline_flags",
+        ":toco_cmdline_flags",
+        ":toco_saved_model",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "graph_transformations",
     srcs = [
         "graph_transformations/convert_expanddims_to_reshape.cc",
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
         "graph_transformations/convert_reorder_axes.cc",
+        "graph_transformations/convert_squeeze_to_reshape.cc",
         "graph_transformations/convert_trivial_addn_to_add.cc",
         "graph_transformations/convert_trivial_stack_to_reshape.cc",
         "graph_transformations/convert_trivial_transpose_to_reshape.cc",
@@ -181,21 +219,31 @@ cc_library(
         "graph_transformations/drop_fake_quant.cc",
         "graph_transformations/drop_im2col_arrays.cc",
         "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc",
+        "graph_transformations/experimental_shuffle_fc_weights.cc",
         "graph_transformations/fuse_activation_functions.cc",
         "graph_transformations/fuse_binary_into_following_affine.cc",
         "graph_transformations/fuse_binary_into_preceding_affine.cc",
         "graph_transformations/graph_transformations.cc",
         "graph_transformations/hardcode_min_max.cc",
+        "graph_transformations/identify_dilated_conv.cc",
         "graph_transformations/identify_l2_normalization.cc",
         "graph_transformations/identify_l2_pool.cc",
         "graph_transformations/identify_lstm.cc",
         "graph_transformations/identify_lstm_merge_inputs.cc",
         "graph_transformations/identify_lstm_split_inputs.cc",
+        "graph_transformations/identify_prelu.cc",
         "graph_transformations/identify_relu1.cc",
         "graph_transformations/lstm_utils.cc",
         "graph_transformations/make_initial_dequantize_operator.cc",
+        "graph_transformations/merge_reshape_into_preceding_transpose.cc",
+        "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
+        "graph_transformations/propagate_default_min_max.cc",
+        "graph_transformations/propagate_fake_quant_num_bits.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
+        "graph_transformations/quantization_util.cc",
+        "graph_transformations/quantization_util.h",
         "graph_transformations/quantize.cc",
         "graph_transformations/read_fake_quant_min_max.cc",
         "graph_transformations/remove_final_dequantize_op.cc",
@@ -204,20 +252,26 @@ cc_library(
         "graph_transformations/remove_trivial_binary.cc",
         "graph_transformations/remove_trivial_concatenation.cc",
         "graph_transformations/remove_trivial_concatenation_input.cc",
+        "graph_transformations/remove_trivial_fake_quant.cc",
         "graph_transformations/remove_trivial_passthrough.cc",
         "graph_transformations/remove_trivial_passthrough.h",
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
+        "graph_transformations/remove_trivial_quantized_min_max.cc",
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
-        "graph_transformations/reorder_activation_functions.cc",
+        "graph_transformations/reorder_elementwise_unary.cc",
+        "graph_transformations/reorder_reshape_transpose.cc",
         "graph_transformations/resolve_batch_normalization.cc",
         "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
         "graph_transformations/resolve_constant_concatenation.cc",
         "graph_transformations/resolve_constant_fake_quant.cc",
         "graph_transformations/resolve_constant_fill.cc",
+        "graph_transformations/resolve_constant_gather.cc",
+        "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
+        "graph_transformations/resolve_constant_reshape.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
         "graph_transformations/resolve_constant_strided_slice.cc",
@@ -239,6 +293,7 @@ cc_library(
         "graph_transformations/resolve_tensorflow_tile.cc",
         "graph_transformations/resolve_transpose_attributes.cc",
         "graph_transformations/unfuse_activation_functions.cc",
+        "graph_transformations/unpartition_embedding_lookup.cc",
         "graph_transformations/unroll_batch_matmul.cc",
     ],
     hdrs = [
@@ -252,7 +307,8 @@ cc_library(
         ":runtime",
         ":toco_port",
         ":tooling_util",
-        ":types_proto_cc",
+        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
+        "//tensorflow/contrib/lite/kernels/internal:strided_slice_logic",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -356,6 +412,7 @@ tf_cc_binary(
         ":toco_cmdline_flags",
         ":toco_flags_proto_cc",
         ":toco_port",
+        ":toco_saved_model",
         ":toco_tooling",
         ":types_proto_cc",
         "//tensorflow/core:lib",
@@ -374,15 +431,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/README.md b/tensorflow/contrib/lite/toco/README.md
index 281b2ea5e4c5553ff7aa240cdef3cb9819f19b49..522e260ad2a14c5f8e080c0a0f538f4192b7ed2d 100644
--- a/tensorflow/contrib/lite/toco/README.md
+++ b/tensorflow/contrib/lite/toco/README.md
@@ -1,26 +1,27 @@
-# The TensorFlow Lite Optimizing Converter
+# TOCO: TensorFlow Lite Optimizing Converter
 
-The TensorFlow Lite Optimizing Converter's most typical use is converting from the TensorFlow GraphDef to the TensorFlow Lite
-format, but it supports much more than that.
+The TensorFlow Lite Optimizing Converter converts TensorFlow graphs into
+TensorFlow Lite graphs. There are additional usages that are also detailed in
+the usage documentation.
 
 ## Usage documentation
 
 Usage information is given in these documents:
 
+*   [Command-line glossary](g3doc/cmdline_reference.md)
 *   [Command-line examples](g3doc/cmdline_examples.md)
-*   [Command-line reference](g3doc/cmdline_reference.md)
-*   [Python API](g3doc/python_api.md)
-
-## Design documentation
-
-Coming soon!
+*   [Python API examples](g3doc/python_api.md)
 
 ## Where the converter fits in the TensorFlow landscape
 
-In the typical case, an application developer is using TensorFlow to design and
-train models, then uses TensorFlow's freeze_graph.py to generate a frozen
-inference graph, then uses the converter to convert that into a TensorFlow Lite flatbuffer file,
-then ships that file to client devices where the TensorFlow Lite interpreter handles them
-on-device. This is represented in the following diagram:
-
-![drawing](https://storage.googleapis.com/download.tensorflow.org/example_images/tensorflow_landscape.svg)
+Once an application developer has a trained TensorFlow model, TOCO will accept
+that model and generate a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file. TOCO currently supports
+[SavedModels](https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators)
+and frozen graphs (models generated via
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)).
+The TensorFlow Lite FlatBuffer file can be shipped to client devices, generally
+mobile devices, where the TensorFlow Lite interpreter handles them on-device.
+This flow is represented in the diagram below.
+
+![drawing](g3doc/toco_landscape.svg)
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 49cc1fc2aa365925cde86ceb658ff2b354d06911..1f3ea2e1c71e7de7e9ede2224796b489d7518d18 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -200,6 +200,12 @@ void DeallocateTransientArray(const Model& model, const string& array_name,
   allocator->Deallocate(*array->alloc);
 }
 
+void PushBackIfNotFound(const string& s, std::vector<string>* v) {
+  if (std::find(v->begin(), v->end(), s) == v->end()) {
+    v->push_back(s);
+  }
+}
+
 }  // namespace
 
 void AllocateTransientArrays(Model* model,
@@ -248,29 +254,37 @@ void AllocateTransientArrays(Model* model,
        op_index++) {
     const auto& op = model->operators[op_index];
     // Allocate those arrays whose lifespan starts exactly here.
+    std::vector<string> arrays_to_allocate;
     for (const auto& input : op->inputs) {
       if (StartsAt(array_lifespans[input], op_index)) {
-        AllocateTransientArray(*model, input, &allocator,
-                               transient_data_alignment);
+        PushBackIfNotFound(input, &arrays_to_allocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (StartsAt(array_lifespans[output], op_index)) {
-        AllocateTransientArray(*model, output, &allocator,
-                               transient_data_alignment);
+        PushBackIfNotFound(output, &arrays_to_allocate);
       }
     }
+    for (const string& array : arrays_to_allocate) {
+      AllocateTransientArray(*model, array, &allocator,
+                             transient_data_alignment);
+    }
+
     // Deallocate those arrays whose lifespan ends exactly here.
+    std::vector<string> arrays_to_deallocate;
     for (const auto& input : op->inputs) {
       if (EndsAt(array_lifespans[input], op_index)) {
-        DeallocateTransientArray(*model, input, &allocator);
+        PushBackIfNotFound(input, &arrays_to_deallocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (EndsAt(array_lifespans[output], op_index)) {
-        DeallocateTransientArray(*model, output, &allocator);
+        PushBackIfNotFound(output, &arrays_to_deallocate);
       }
     }
+    for (const string& array : arrays_to_deallocate) {
+      DeallocateTransientArray(*model, array, &allocator);
+    }
   }
 
   // Just out of curiosity (not used in the actual allocation process)
@@ -290,17 +304,21 @@ void AllocateTransientArrays(Model* model,
     // for each operator, compute the sum of the sizes of the array that must
     // be live during the execution of this operator, plus the size of
     // persistent arrays that must be live at all times.
-    std::size_t size = persistent_alloc_size;
+    std::vector<string> non_persistent_edges;
     for (const auto& input : op->inputs) {
       if (!array_lifespans[input].persistent) {
-        size += TransientArraySize(*model, input, transient_data_alignment);
+        PushBackIfNotFound(input, &non_persistent_edges);
       }
     }
     for (const auto& output : op->outputs) {
       if (!array_lifespans[output].persistent) {
-        size += TransientArraySize(*model, output, transient_data_alignment);
+        PushBackIfNotFound(output, &non_persistent_edges);
       }
     }
+    std::size_t size = persistent_alloc_size;
+    for (const string& edge : non_persistent_edges) {
+      size += TransientArraySize(*model, edge, transient_data_alignment);
+    }
     // The optimal total size is the maximum of all operator-specific sizes.
     optimal_transient_alloc_size = std::max(optimal_transient_alloc_size, size);
   }
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index b97a4720a7c4e69f8b69574475d19e0522cfe86d..fe30b88344c5340dc8647cd89e244987c86e47fe 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -26,6 +26,7 @@ limitations under the License.
 #endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
+#include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 
@@ -190,6 +191,7 @@ struct ParsedModelFlags {
   Arg<string> output_array;
   Arg<string> output_arrays;
   Arg<string> input_shapes;
+  Arg<int> batch_size = Arg<int>(1);
   Arg<float> mean_value = Arg<float>(0.f);
   Arg<string> mean_values;
   Arg<float> std_value = Arg<float>(1.f);
@@ -200,6 +202,7 @@ struct ParsedModelFlags {
   Arg<toco::IntList> input_shape;
   Arg<toco::StringMapList> rnn_states;
   Arg<toco::StringMapList> model_checks;
+  Arg<bool> change_concat_input_ranges = Arg<bool>(true);
   // Debugging output options.
   // TODO(benoitjacob): these shouldn't be ModelFlags.
   Arg<string> graphviz_first_array;
@@ -209,18 +212,23 @@ struct ParsedModelFlags {
   Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
   Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
   Arg<string> arrays_extra_info_file;
+  Arg<string> model_flags_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
 // you want). See toco_cmdline_flags.cc for details.
 struct ParsedTocoFlags {
   Arg<string> input_file;
+  Arg<string> savedmodel_directory;
   Arg<string> output_file;
-  Arg<string> input_format;
-  Arg<string> output_format;
+  Arg<string> input_format = Arg<string>("TENSORFLOW_GRAPHDEF");
+  Arg<string> output_format = Arg<string>("TFLITE");
+  Arg<string> savedmodel_tagset = Arg<string>(tensorflow::kSavedModelTagServe);
   // TODO(aselle): command_line_flags  doesn't support doubles
   Arg<float> default_ranges_min = Arg<float>(0.);
   Arg<float> default_ranges_max = Arg<float>(0.);
+  Arg<float> default_int16_ranges_min = Arg<float>(0.);
+  Arg<float> default_int16_ranges_max = Arg<float>(0.);
   Arg<string> inference_type;
   Arg<string> inference_input_type;
   Arg<bool> drop_fake_quant = Arg<bool>(false);
@@ -229,7 +237,10 @@ struct ParsedTocoFlags {
   // Deprecated flags
   Arg<string> input_type;
   Arg<string> input_types;
+  Arg<bool> debug_disable_recurrent_cell_fusion = Arg<bool>(false);
   Arg<bool> drop_control_dependency = Arg<bool>(false);
+  Arg<bool> propagate_fake_quant_num_bits = Arg<bool>(false);
+  Arg<bool> allow_nudging_weights_to_use_fast_gemm_kernel = Arg<bool>(false);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index c726eb6d8678e2703f5acba8b3d8d740186939f5..5bb0e3ba4d289ccfcb54198230f53c62222963a2 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -95,10 +95,8 @@ Color GetColorForArray(const Model& model, const string& array_name) {
       array_name == dump_options.graphviz_last_array) {
     return Color(0x9E, 0x9E, 0x9E);
   }
-  for (const string& output_array : model.flags.output_arrays()) {
-    if (array_name == output_array) {
-      return Color(0x9E, 0x9E, 0x9E);
-    }
+  if (IsOutputArray(model, array_name)) {
+    return Color(0x9E, 0x9E, 0x9E);
   }
   // Remaining arrays are intermediate activation arrays.
   // Lighter tone of the same grey as for input/output arrays:
@@ -119,6 +117,12 @@ void AppendArrayVal(string* string, Array const& array, int index) {
       return;
     }
     AppendF(string, "%d", data[index]);
+  } else if (array.buffer->type == ArrayDataType::kInt16) {
+    const auto& data = array.GetBuffer<ArrayDataType::kInt16>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%d", data[index]);
   } else if (array.buffer->type == ArrayDataType::kInt32) {
     const auto& data = array.GetBuffer<ArrayDataType::kInt32>().data;
     if (index >= data.size()) {
@@ -142,14 +146,8 @@ NodeProperties GetPropertiesForArray(const Model& model,
 
   // Append array shape to the label.
   auto& array = model.GetArray(array_name);
-
-  if (array.data_type == ArrayDataType::kFloat) {
-    AppendF(&node_properties.label, "\\nType: float");
-  } else if (array.data_type == ArrayDataType::kInt32) {
-    AppendF(&node_properties.label, "\\nType: int32");
-  } else if (array.data_type == ArrayDataType::kUint8) {
-    AppendF(&node_properties.label, "\\nType: uint8");
-  }
+  AppendF(&node_properties.label, "\\nType: %s",
+          ArrayDataTypeName(array.data_type));
 
   if (array.has_shape()) {
     auto& array_shape = array.shape();
@@ -199,12 +197,12 @@ NodeProperties GetPropertiesForArray(const Model& model,
   }
 
   if (array.minmax) {
-    AppendF(&node_properties.label, "\\nMinMax: [%.3g, %.3g]",
+    AppendF(&node_properties.label, "\\nMinMax: [%.7g, %.7g]",
             array.minmax->min, array.minmax->max);
   }
 
   if (array.quantization_params) {
-    AppendF(&node_properties.label, "\\nQuantization: %.3g * (x - %d)",
+    AppendF(&node_properties.label, "\\nQuantization: %7g * (x - %d)",
             array.quantization_params->scale,
             array.quantization_params->zero_point);
   }
@@ -261,6 +259,19 @@ NodeProperties GetPropertiesForOperator(const Operator& op) {
       node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
       break;
     }
+    case OperatorType::kFakeQuant: {
+      const auto& fakequant_op = static_cast<const FakeQuantOperator&>(op);
+      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      if (fakequant_op.minmax) {
+        AppendF(&node_properties.label, "\\n%dbit [%g,%g]",
+                fakequant_op.num_bits, fakequant_op.minmax->min,
+                fakequant_op.minmax->max);
+      } else {
+        AppendF(&node_properties.label, "\\n%dbit [?,?]",
+                fakequant_op.num_bits);
+      }
+      break;
+    }
     default:
       node_properties.color = Color(0xDB, 0x44, 0x37);
       break;
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 570cc7943b2926136f6fdb21d20b8aa6acf8cd26..99ccfaea648077b7b72af30b32dd53b42b85d3a2 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -37,6 +37,7 @@ limitations under the License.
 
 using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT16;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
 using tensorflow::DT_UINT8;
@@ -239,6 +240,7 @@ void ConvertIntTensorConst(const Model& model, const string& name,
 }
 
 void CreateIntTensorConst(const string& name, const std::vector<int32>& data,
+                          const std::vector<int32>& shape,
                           GraphDef* tensorflow_graph) {
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
     return;
@@ -252,8 +254,13 @@ void CreateIntTensorConst(const string& name, const std::vector<int32>& data,
   for (auto index : data) {
     tensor->add_int_val(index);
   }
-  auto* shape = tensor->mutable_tensor_shape();
-  shape->add_dim()->set_size(data.size());
+  auto* tensor_shape = tensor->mutable_tensor_shape();
+  int num_elements = 1;
+  for (int size : shape) {
+    tensor_shape->add_dim()->set_size(size);
+    num_elements *= size;
+  }
+  CHECK_EQ(num_elements, data.size());
 }
 
 void CreateMatrixShapeTensorConst(const string& name, int rows, int cols,
@@ -351,6 +358,14 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   strides.mutable_list()->add_i(src_op.stride_height);
   strides.mutable_list()->add_i(src_op.stride_width);
   strides.mutable_list()->add_i(1);
+  if ((src_op.dilation_width_factor != 1) ||
+      (src_op.dilation_height_factor != 1)) {
+    auto& dilations = (*conv2d_op->mutable_attr())["dilations"];
+    dilations.mutable_list()->add_i(1);
+    dilations.mutable_list()->add_i(src_op.dilation_height_factor);
+    dilations.mutable_list()->add_i(src_op.dilation_width_factor);
+    dilations.mutable_list()->add_i(1);
+  }
   string padding;
   if (src_op.padding.type == PaddingType::kSame) {
     padding = "SAME";
@@ -464,6 +479,38 @@ void ConvertDepthwiseConvOperator(const Model& model,
   }
 }
 
+void ConvertTransposeConvOperator(const Model& model,
+                                  const TransposeConvOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  auto* conv2d_op = tensorflow_graph->add_node();
+  conv2d_op->set_op("Conv2DBackpropInput");
+  conv2d_op->set_name(src_op.outputs[0]);
+  *conv2d_op->add_input() = src_op.inputs[0];
+  *conv2d_op->add_input() = src_op.inputs[1];
+  *conv2d_op->add_input() = src_op.inputs[2];
+  (*conv2d_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  const string& weights_array_name = WalkUpToConstantArray(
+      model, src_op.inputs[TransposeConvOperator::WEIGHTS]);
+  const auto& weights_array = model.GetArray(weights_array_name);
+  CHECK(weights_array.buffer->type == ArrayDataType::kFloat);
+  ConvertFloatTensorConst(model, weights_array_name, AxesOrder::kOHWI,
+                          AxesOrder::kHWIO, tensorflow_graph);
+  auto& strides = (*conv2d_op->mutable_attr())["strides"];
+  strides.mutable_list()->add_i(1);
+  strides.mutable_list()->add_i(src_op.stride_height);
+  strides.mutable_list()->add_i(src_op.stride_width);
+  strides.mutable_list()->add_i(1);
+  string padding;
+  if (src_op.padding.type == PaddingType::kSame) {
+    padding = "SAME";
+  } else if (src_op.padding.type == PaddingType::kValid) {
+    padding = "VALID";
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  (*conv2d_op->mutable_attr())["padding"].set_s(padding);
+}
+
 void ConvertDepthToSpaceOperator(const Model& model,
                                  const DepthToSpaceOperator& src_op,
                                  GraphDef* tensorflow_graph) {
@@ -520,7 +567,7 @@ void ConvertFullyConnectedOperator(const Model& model,
       AvailableArrayName(model, matmul_output + "/transpose_weights");
   const string transpose_perm =
       AvailableArrayName(model, transpose_output + "/perm");
-  CreateIntTensorConst(transpose_perm, {1, 0}, tensorflow_graph);
+  CreateIntTensorConst(transpose_perm, {1, 0}, {2}, tensorflow_graph);
   auto transpose_op = tensorflow_graph->add_node();
   transpose_op->set_op("Transpose");
   transpose_op->set_name(transpose_output);
@@ -657,6 +704,15 @@ void ConvertRelu6Operator(const Relu6Operator& src_op,
   (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertLogOperator(const LogOperator& src_op, GraphDef* tensorflow_graph) {
+  auto* op = tensorflow_graph->add_node();
+  op->set_op("Log");
+  op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *op->add_input() = src_op.inputs[0];
+  (*op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertLogisticOperator(const LogisticOperator& src_op,
                              GraphDef* tensorflow_graph) {
   auto* relu_op = tensorflow_graph->add_node();
@@ -720,7 +776,8 @@ void ConvertLogSoftmaxOperator(const Model& model,
                                GraphDef* tensorflow_graph) {
   string softmax_input;
   Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
-  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+  if (providing_op != nullptr &&
+      providing_op->type == OperatorType::kTensorFlowReshape) {
     softmax_input = src_op.inputs[0];
   } else {
     // Insert a reshape operator that reduces the dimensions down to the 2 that
@@ -826,6 +883,9 @@ void ConvertFakeQuantOperator(const FakeQuantOperator& src_op,
   CHECK(src_op.minmax);
   (*fakequant_op->mutable_attr())["min"].set_f(src_op.minmax->min);
   (*fakequant_op->mutable_attr())["max"].set_f(src_op.minmax->max);
+  if (src_op.num_bits) {
+    (*fakequant_op->mutable_attr())["num_bits"].set_i(src_op.num_bits);
+  }
 }
 
 void ConvertMaxPoolOperator(const MaxPoolOperator& src_op,
@@ -1537,9 +1597,11 @@ void ConvertSqueezeOperator(const Model& model, const SqueezeOperator& src_op,
   const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
   (*new_op->mutable_attr())["T"].set_type(params_type);
 
-  auto& squeeze_dims = (*new_op->mutable_attr())["squeeze_dims"];
-  for (int i : src_op.squeeze_dims) {
-    squeeze_dims.mutable_list()->add_i(i);
+  if (!src_op.squeeze_dims.empty()) {
+    auto& squeeze_dims = (*new_op->mutable_attr())["squeeze_dims"];
+    for (int i : src_op.squeeze_dims) {
+      squeeze_dims.mutable_list()->add_i(i);
+    }
   }
 }
 
@@ -1592,6 +1654,23 @@ void ConvertTopKV2Operator(const Model& model, const TopKV2Operator& src_op,
   (*topk_op->mutable_attr())["sorted"].set_b(true);
 }
 
+void ConvertRandomUniformOperator(const Model& model,
+                                  const RandomUniformOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  CHECK(tensorflow_graph != nullptr);
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("RandomUniform");
+  CHECK_EQ(src_op.inputs.size(), 1);
+  new_op->set_name(src_op.outputs[0]);
+  *new_op->add_input() = src_op.inputs[0];
+  const auto shape_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(shape_type);
+  (*new_op->mutable_attr())["dtype"].set_type(
+      GetTensorFlowDataType(src_op.dtype));
+  (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
+  (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -1636,6 +1715,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kRelu6) {
     ConvertRelu6Operator(static_cast<const Relu6Operator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLog) {
+    ConvertLogOperator(static_cast<const LogOperator&>(src_op),
+                       tensorflow_graph);
   } else if (src_op.type == OperatorType::kLogistic) {
     ConvertLogisticOperator(static_cast<const LogisticOperator&>(src_op),
                             tensorflow_graph);
@@ -1769,6 +1851,14 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertExpandDimsOperator(model,
                               static_cast<const ExpandDimsOperator&>(src_op),
                               tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTransposeConv) {
+    ConvertTransposeConvOperator(
+        model, static_cast<const TransposeConvOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRandomUniform) {
+    ConvertRandomUniformOperator(
+        model, static_cast<const RandomUniformOperator&>(src_op),
+        tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
@@ -1794,6 +1884,9 @@ void AddPlaceholder(const string& name, ArrayDataType type,
     case ArrayDataType::kInt64:
       (*placeholder->mutable_attr())["dtype"].set_type(DT_INT64);
       break;
+    case ArrayDataType::kInt16:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT16);
+      break;
     default:
       LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
   }
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 372c52558973f4aacc180ac44b9e95a5e9b199ef..495014c6fc67ab0ad7c975d0570034545e90f9bc 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -1,73 +1,72 @@
 # TensorFlow Lite Optimizing Converter command-line examples
 
-This page is a guide to using the TensorFlow Lite Optimizing Converter by
-looking at some example command lines. It is complemented by the following other
-documents:
+This page provides examples on how to use TOCO via command line. It is
+complemented by the following documents:
 
 *   [README](../README.md)
-*   [Command-line reference](cmdline_reference.md)
+*   [Command-line glossary](cmdline_reference.md)
+*   [Python API examples](python_api.md)
 
 Table of contents:
 
-[TOC]
-
-## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference
-
-In this example, we look at the most common task: we have an ordinary TensorFlow
-GraphDef and want to convert it to a TensorFlow Lite flatbuffer to perform
-floating-point inference.
+*   [Convert a TensorFlow SavedModel to TensorFlow Lite](#savedmodel)
+*   [Convert a TensorFlow GraphDef to TensorFlow Lite for float
+    inference](#graphdef-float)
+*   [Quantization](#quantization)
+    *   [Convert a TensorFlow GraphDef to TensorFlow Lite for quantized
+        inference](#graphdef-quant)
+    *   [Use "dummy-quantization" to try out quantized inference on a float
+        graph](#dummy-quant)
+*   [Specifying input and output arrays](#specifying-input-and-output-arrays)
+    *   [Multiple output arrays](#multiple-output-arrays)
+    *   [Multiple input arrays](#multiple-input-arrays)
+    *   [Specifying subgraphs](#specifying-subgraphs)
+*   [Other conversions supported by TOCO](#other-conversions)
+    *   [Optimize a TensorFlow GraphDef](#optimize-graphdef)
+    *   [Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef
+        format](#to-graphdef)
+*   [Logging](#logging)
+    *   [Standard logging](#standard-logging)
+    *   [Verbose logging](#verbose-logging)
+    *   [Graph "video" logging](#graph-video-logging)
+*   [Graph visualizations](#graph-visualizations)
+    *   [Using --output_format=GRAPHVIZ_DOT](#using-output-formatgraphviz-dot)
+    *   [Using --dump_graphviz](#using-dump-graphviz)
+    *   [Legend for the graph visualizations](#graphviz-legend)
+
+## Convert a TensorFlow SavedModel to TensorFlow Lite <a name="savedmodel"></a>
+
+The follow example converts a basic TensorFlow SavedModel into a Tensorflow Lite
+FlatBuffer to perform floating-point inference.
 
 ```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
 bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TFLITE \
-  --inference_type=FLOAT \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1
+  third_party/tensorflow/contrib/lite/toco:toco -- \
+  --savedmodel_directory=/tmp/saved_model \
+  --output_file=/tmp/foo.tflite
 ```
 
-To explain each of these flags:
-
-*   `--input_format` and `--output_format` determine the formats of the input
-    and output files: here we are converting from `TENSORFLOW_GRAPHDEF` to
-    `TFLITE`.
-*   `--input_file` specifies the path of the input file, to be converted. When
-    `--input_format=TENSORFLOW_GRAPHDEF`, this file should be a
-    *[frozen](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)*
-    *inference* graph. Being frozen means in particular that the input file is
-    self-contained, and does not reference any external "checkpoint" file. An
-    *inference* graph is a version of a graph meant to be used for inference,
-    typically not the same graph file as was used for training a given model.
-*   `--output_file` specifies the destination to write the converted file to.
-*   `--input_array` specifies the input activations, that is, the input "tensor"
-    in the input TensorFlow GraphDef file. The array designated by
-    `--input_array` is the one that the user will have to provide the contents
-    of as input to the runtime inference code.
-*   `--output_array` specifies the output activations, that is, the output
-    "tensor" in the input TensorFlow GraphDef file. The runtime inference code
-    will store its results in the array designated by `--output_array`.
-*   `--input_shape` specifies the shape of the input array. It is currently
-    required, but the plan is for a future version to no longer require it,
-    allowing to defer the specification of the input shape until runtime. The
-    format of `input_shape` is always a comma-separated list of dimensions,
-    always in TensorFlow convention.
-*   `--inference_type` specifies what type of arithmetic the output file should
-    be relying on. It implies in particular the choice of type of the output
-    arrays in the output file.
-
-## Just optimize a TensorFlow GraphDef
+[SavedModel](https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators)
+has fewer required flags than frozen graphs (described [below](#graphdef-float))
+due to access to additional data contained within the SavedModel. The values for
+`--input_arrays` and `--output_arrays` are an aggregated, alphabetized list of
+the inputs and outputs in the
+[SignatureDefs](https://www.tensorflow.org/serving/signature_defs) within the
+[MetaGraphDef](https://www.tensorflow.org/programmers_guide/saved_model#apis_to_build_and_load_a_savedmodel)
+specified by `--savedmodel_tagset`. The value for `input_shapes` is
+automatically determined from the MetaGraphDef whenever possible. The default
+value for `--inference_type` for SavedModels is `FLOAT`.
 
-The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
-`--input_format` and `--output_format`. This means that conversion from and to
-any supported format is possible, and in particular, same-format "conversions"
-are possible, and effectively ask the converter to optimize and simplify a
-graph. Example:
+There is currently no support for MetaGraphDefs without a SignatureDef or for
+MetaGraphDefs that use the [`assets/`
+directory](https://www.tensorflow.org/programmers_guide/saved_model#structure_of_a_savedmodel_directory).
+
+## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference <a name="graphdef-float"></a>
+
+The follow example converts a basic TensorFlow GraphDef (frozen by
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py))
+into a TensorFlow Lite FlatBuffer to perform floating-point inference. Frozen
+graphs contain the variables stored in Checkpoint files as Const ops.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -75,56 +74,27 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.pb \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TENSORFLOW_GRAPHDEF \
+  --output_file=/tmp/foo.tflite \
+  --inference_type=FLOAT \
   --input_shape=1,128,128,3 \
   --input_array=input \
   --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
-Here we did not pass `--inference_type` because it is not considered applicable
-to the TensorFlow GraphDef format (as far as we are concerned, TensorFlow
-GraphDefs are technically always float, and the only flavor of "quantized"
-GraphDef that the converter deals with is "FakeQuantized" graphs that are still
-technically float graphs).
+## Quantization
 
-Below in the section about passing arbitrary input/output arrays we give another
-example, using the converter to extract just a sub-graph from a TensorFlow
-GraphDef.
+### Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference <a name="graphdef-quant"></a>
 
-## Convert a TensorFlow Lite flatbuffer back into TensorFlow GraphDef format
+TOCO is compatible with fixed point quantization models described
+[here](https://www.tensorflow.org/performance/quantization). These are float
+models with
+[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
+ops inserted at the boundaries of fused layers to record min-max range
+information. This generates a quantized inference workload that reproduces the
+quantization behavior that was used during training.
 
-As we mentioned that the converter supports file format conversions in any
-direction, let us just give an example of that:
-
-```
-bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/foo.tflite \
-  --output_file=/tmp/foo.pb \
-  --input_format=TFLITE \
-  --output_format=TENSORFLOW_GRAPHDEF \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1
-```
-
-## Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference
-
-Let us now look at a quantized model. As mentioned above, the only flavor of
-quantized TensorFlow GraphDefs that the converter is concerned with, is
-"FakeQuantized" models. These are technically float models, but with special
-`FakeQuant*` ops inserted at the boundaries of fused layers to record min-max
-range information allowing to generate a quantized inference workload that is
-able to reproduce exactly the specific quantization behavior that was used
-during training. Indeed, the whole point of quantized training is to allow for
-both training and inference to perform exactly the same arithmetic, so that the
-way that the training process about around quantization inaccuracy is
-effectively helping the quantized inference process to be more accurate.
-
-Given a quantized TensorFlow GraphDef, generating a quantized TensorFlow Lite
-flatbuffer is done like this:
+The following command generates a quantized TensorFlow Lite FlatBuffer from a
+"quantized" TensorFlow GraphDef.
 
 ```
 bazel run --config=opt \
@@ -141,36 +111,17 @@ bazel run --config=opt \
   --std_value=127
 ```
 
-Here, besides changing `--input_file` to point to a (fake-)quantized GraphDef,
-the only other changes are:
-
-*   To change `--inference_type` to `QUANTIZED_UINT8`. This effectively tells
-    the converter to generate an output file that performs quantized inference
-    on a quantized input.
-*   To pass `--mean_value` and `--std_value` flags to describe how the quantized
-    uint8 input array values are to be interpreted as the mathematical real
-    numbers that the graph is concerned with (keep in mind that even a
-    "fake-quantized" TensorFlow GraphDef is still technically a float graph).
-    The meaning of `--mean_value` and `--std_value` is explained in the
-    command-line reference; it suffices for now to say that they are a property
-    of each model.
+### Use \"dummy-quantization\" to try out quantized inference on a float graph <a name="dummy-quant"></a>
 
-## Use dummy-quantization to try out quantized inference on a float graph
+In order to evaluate the possible benefit of generating a quantized graph, TOCO
+allows "dummy-quantization" on float graphs. The flags `--default_ranges_min`
+and `--default_ranges_max` accept plausable values for the min-max ranges of the
+values in all arrays that do not have min-max information. "Dummy-quantization"
+will produce lower accuracy but will emulate the performance of a correctly
+quantized model.
 
-Sometimes, one only has a plain float graph, and one is curious as to how much
-faster inference might run if one could perform quantized inference instead of
-float inference. Rather than requiring users to first invest in quantizing their
-graphs before they can evaluate a possible benefit, the converter allows to
-simply experiment with what we call "dummy quantization": provide some vaguely
-plausible values for the min-max ranges of values in all arrays that do not have
-min-max information, so that quantization can carry on, certainly producing
-inaccurate results (do not use that in production!) but with performance
-characteristics that should be identical to those of an actually quantized
-flavor of the model.
-
-In the present example, we have a model using Relu6 activation functions almost
-everywhere, so a reasonable guess is that most activation ranges should be
-contained in [0, 6] and roughly comparable to it.
+The example below contains a model using Relu6 activation functions. Therefore,
+a reasonable guess is that most activation ranges should be contained in [0, 6].
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -191,15 +142,13 @@ bazel run --config=opt \
   --std_value=127.5
 ```
 
-## Multiple output arrays
+## Specifying input and output arrays
 
-Some models have multiple outputs. Even in a model with only one output, you may
-want for the inference code to return the contents of other arrays as well, or
-to perform inference on a subgraph with multiple outputs (see the section below
-on specifying arbitrary arrays as input/output arrays).
+### Multiple output arrays
 
-Either way, using `--output_arrays` instead of `--output_array` allows to
-specify a comma-separated list of output arrays.
+The flag `output_arrays` takes in a comma-separated list of output arrays as
+seen in the example below. This is useful for models or subgraphs with multiple
+outputs.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
@@ -216,18 +165,11 @@ bazel run --config=opt \
   --output_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu
 ```
 
-## Multiple input arrays
-
-Some models have multiple inputs; even in a model with a single input, you may
-want for the inference code to implement only a subgraph with multiple inputs
-(see the section below on specifying arbitrary arrays as input/output arrays).
+### Multiple input arrays
 
-Either way, multiple input arrays are specified by using `--input_arrays`
-instead of `--input_array` to specify a comma-separated list of input arrays. In
-that case, one also needs to use `--input_shapes` instead of `--input_shape`.
-The syntax for `--input_shapes` is a bit trickier, since already the singular
-`--input_shape` was a comma-separated list of integers! Multiple input shapes
-are delimited by a colon (`:`) in `--input_shapes`.
+The flag `input_arrays` takes in a comma-separated list of input arrays as seen
+in the example below. This is useful for models or subgraphs with multiple
+inputs.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
@@ -244,54 +186,93 @@ bazel run --config=opt \
   --output_array=InceptionV1/Logits/Predictions/Reshape_1
 ```
 
-## Specifying arbitrary arrays in a graph as input or output arrays
+Note that `input_shapes` is provided as a colon-separated list. Each input shape
+corresponds to the input array at the same position in the respective list.
 
-Any array in the input file can be specified as an input or output array. This
-allows to use the converter to extract a sub-graph out of the input graph file.
-The converter then automatically discards any part of the graph that is not
-needed for the subgraph identified by the specified input and output arrays.
-Another use case for specifying multiple output arrays is to get inference code
-to return the contents of some specified intermediate activations array, not
-just the output activations.
+### Specifying subgraphs
 
-In order to know which array you want to pass as `--input_arrays` /
-`--output_arrays`, it helps to have a visualization of the graph. See the
-section below on graph visualization. When using graph visualization for that
-purpose, make sure to use `--dump_graphviz=` to visualize exactly the graph as
-it is in the actual final form being exported to the output file.
+Any array in the input file can be specified as an input or output array in
+order to extract subgraphs out of an input graph file. TOCO discards the parts
+of the graph outside of the specific subgraph. Use [graph
+visualizations](#graph-visualizations) to identify the input and output arrays
+that make up the desired subgraph.
+
+The follow command shows how to extract a single fused layer out of a TensorFlow
+GraphDef.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+```
 
 Note that the final representation of an on-device inference workload (say, in
-TensorFlow Lite flatbuffers format) tends to have coarser granularity than the
+TensorFlow Lite FlatBuffers format) tends to have coarser granularity than the
 very fine granularity of the TensorFlow GraphDef representation. For example,
 while a fully-connected layer is typically represented as at least four separate
 ops in TensorFlow GraphDef (Reshape, MatMul, BiasAdd, Relu...), it is typically
 represented as a single "fused" op (FullyConnected) in the converter's optimized
 representation and in the final on-device representation (e.g. in TensorFlow
-Lite flatbuffer format). As the level of granularity gets coarser, some
+Lite FlatBuffer format). As the level of granularity gets coarser, some
 intermediate arrays (say, the array between the MatMul and the BiasAdd in the
 TensorFlow GraphDef) are dropped. When specifying intermediate arrays as
-`--input_arrays` / `--output_arrays`, it is generally at least desirable (and
-often required) to specify arrays that are meant to survive in the final form of
-the graph, after fusing. These are typically the outputs of activation functions
-(since everything in each layer until the activation function tends to get
-fused).
+`--input_arrays` / `--output_arrays`, it is desirable (and often required) to
+specify arrays that are meant to survive in the final form of the graph, after
+fusing. These are typically the outputs of activation functions (since
+everything in each layer until the activation function tends to get fused).
+
+## Other conversions supported by TOCO <a name="other-conversions"></a>
+
+The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
+`--input_format` and `--output_format`. This means that conversion to and from
+any supported format is possible.
 
-Here is an example of extracting just a sub-graph, namely just a single fused
-layer, out of a TensorFlow GraphDef, and exporting a TensorFlow GraphDef
-containing just that subgraph:
+### Optimize a TensorFlow GraphDef <a name="optimize-graphdef"></a>
+
+Same-format "conversions" can be used to optimize and simplify a graph or be
+used to [get a subgraph](#specifying-subgraphs) of a graph. The flag
+`--inference_type` is not required because TensorFlow graphs, including those
+containing the
+[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
+ops are always float graphs.
 
 ```
-curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
   | tar xzv -C /tmp
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
   --output_file=/tmp/foo.pb \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TENSORFLOW_GRAPHDEF \
-  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
-  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
-  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+### Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef format <a name="to-graphdef"></a>
+
+The converter supports file format conversions from TensorFlow Lite, back into
+TensorFlow GraphDef format.
+
+```
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/foo.tflite \
+  --output_file=/tmp/foo.pb \
+  --input_format=TFLITE \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
 ## Logging
@@ -299,8 +280,8 @@ bazel run --config=opt \
 ### Standard logging
 
 The converter generates some informative log messages during processing. The
-easiest way to view them is to add `--logtostderr` to command lines. For the
-previous example, that gives:
+easiest way to view them is to add `--logtostderr` to command lines as seen in
+the following example.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -333,42 +314,34 @@ I1101 21:51:33.309484    5339 toco_tooling.cc:249] Estimated count of arithmetic
 For debugging purposes, the converter supports two levels of verbose logging,
 which can be set by passing a `--v=` flag:
 
-*   At `--v=1`, the converter generates text dumps of the graph at various
-    points during processing, as well as log messages about every graph
-    transformation that did take place, typically answering questions of the
-    form "why was my graph transformed in this way"?
-*   At `--v=2`, the converter additionally generates log messages about graph
-    transformations that were considered but not actually performed, typically
-    answering questions of the form "why was my graph NOT transformed when I
-    expected it would be?".
+*   For `--v=1`, the converter generates text dumps of the graph at various
+    points during processing as well as log messages about every graph
+    transformation that took place.
+*   For `--v=2`, the converter additionally generates log messages about graph
+    transformations that were considered but not performed.
 
 ### Graph "video" logging
 
-When `--dump_graphviz=` is used (see the section on Graph visualizations), one
-may additionally pass `--dump_graphviz_video`, which causes a graph
-visualization to be dumped after each individual graph transformations, often
-resulting in thousands of files. Typically, one would then bisect into these
-files to understand when a given change was introduced in the graph.
+When `--dump_graphviz=` is used (see the section on [graph
+visualizations](#graph-visualizations)), one may additionally pass
+`--dump_graphviz_video`, which causes a graph visualization to be dumped after
+each individual graph transformation. This results in thousands of files.
+Typically, one would then bisect into these files to understand when a given
+change was introduced in the graph.
 
 ## Graph visualizations
 
-The converter is able to export a graph to the GraphViz Dot format, for easy
-visualization. Combined with the converter's ability to transform the graph into
-a simpler, coarser-granularity representation, that makes it a very powerful
-visualization tool.
-
-There are two ways to get the converter to export a GraphViz Dot file,
-corresponding to two separate use cases. Understanding the difference between
-them is key to getting useful graph visualizations.
+TOCO can export a graph to the GraphViz Dot format for easy visualization via
+either the `--output_format` flag or the `--dump_graphviz` flag. The subsections
+below outline the use cases for each.
 
 ### Using `--output_format=GRAPHVIZ_DOT`
 
-The first way to get a graphviz rendering is to pass
-`--output_format=GRAPHVIZ_DOT`, instead of the `--output_format` that you would
-otherwise use. This says: "I just want to get a plausible visualization of that
-graph". The upside is that it makes for very simple command lines, and makes the
-converter very lax about aspects of the graph or the command line that it would
-otherwise complain about. Example:
+The first way to get a graphviz rendering is to pass `GRAPHVIZ_DOT` into
+`--output_format`. This results in a plausable visualization of the graph. This
+reduces the requirements that normally exist during conversion between other
+input and output formats. For example, this may be useful if conversion from
+TENSORFLOW_GRAPHDEF to TFLITE is failing.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -391,7 +364,7 @@ dot -Tpdf -O /tmp/foo.dot
 ```
 
 And the resulting `.dot.pdf` can be viewed in any PDF viewer, but we suggest one
-with a good ability to pan and zoom across a very large page; Google Chrome does
+with a good ability to pan and zoom across a very large page. Google Chrome does
 well in that respect.
 
 ```
@@ -400,14 +373,14 @@ google-chrome /tmp/foo.dot.pdf
 
 Example PDF files are viewable online in the next section.
 
-### Using `--dump_graphviz=`
+### Using `--dump_graphviz`
 
-The second way to get a graphviz rendering is to pass a `--dump_graphviz=` flag
-specifying a destination directory to dump GraphViz rendering to. Unlike the
-previous approach, this one allows you to keep your real command-line (with your
-real `--output_format` and other flags) unchanged, just appending a
-`--dump_graphviz=` flag to it. This says: "I want visualizations of the actual
-graph during this specific conversion process". Example:
+The second way to get a graphviz rendering is to pass the `--dump_graphviz=`
+flag, specifying a destination directory to dump GraphViz rendering to. Unlike
+the previous approach, this one allows you to keep your real command-line (with
+your real `--output_format` and other flags) unchanged, just appending a
+`--dump_graphviz=` flag to it. This provides a visualization of the actual graph
+during a specific conversion process.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -425,8 +398,8 @@ bazel run --config=opt \
   --dump_graphviz=/tmp
 ```
 
-This generates a few files in the destination directory, here `/tmp`. Most
-important are these two files:
+This generates a few files in the destination directory, here `/tmp`. The two
+most important files are:
 
 ```
 /tmp/toco_AT_IMPORT.dot
@@ -442,8 +415,7 @@ conversion subsequently fails).
 
 `toco_AFTER_TRANSFORMATIONS.dot` represents the graph after all transformations
 were applied to it, just before it was exported to the `--output_file`.
-Typically, this is a much smaller graph, and it conveys much more information
-about each node.
+Typically, this is a much smaller graph with more information about each node.
 
 Again, these can be rendered to PDFs:
 
@@ -451,12 +423,12 @@ Again, these can be rendered to PDFs:
 dot -Tpdf -O /tmp/toco_*.dot
 ```
 
-The resulting files can be seen here:
+Sample output files can be seen here:
 
 *   [toco_AT_IMPORT.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AT_IMPORT.dot.pdf)
 *   [toco_AFTER_TRANSFORMATIONS.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AFTER_TRANSFORMATIONS.dot.pdf).
 
-### Legend for the graph visualizations
+### Legend for the graph visualizations <a name="graphviz-legend"></a>
 
 *   Operators are red square boxes with the following hues of red:
     *   Most operators are
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index 5e077952235fa1aac1e12403d3d83633a617ccb7..9e99287f828c22aa81eb216c087f3261e378fc14 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -1,84 +1,47 @@
-# TensorFlow Lite Optimizing Converter command-line reference
+# TensorFlow Lite Optimizing Converter command-line glossary
 
 This page is complete reference of command-line flags. It is complemented by the
 following other documents:
 
 *   [README](../README.md)
 *   [Command-line examples](cmdline_examples.md)
+*   [Python API examples](python_api.md)
 
 Table of contents:
 
-[TOC]
-
-## High-level overview
-
-A full list and detailed specification of all flags is given in the next
-section. For now we focus on a higher-level description of command lines:
-
-```
-toco \
-  --input_format=... \
-  --output_format=... \
-  --input_file=... \
-  --output_file=... \
-  [model flags...] \
-  [transformation flags...] \
-  [logging flags...]
-```
-
-In other words, the converter requires at least the following mandatory flags:
-`--input_format`, `--output_format`, `--input_file`, `--output_file`. Depending
-on the input and output formats, additional flags may be allowed or mandatory:
-
-*   *Model flags* provide additional information about the model stored in the
-    input file.
-    *   `--output_array` or `--output_arrays` specify which arrays in the input
-        file are to be considered the output activations.
-    *   `--input_array` or `--input_arrays` specify which arrays in the input
-        file are to be considered the input activations.
-    *   `--input_shape` or `--input_shapes` specify the shapes of the input
-        arrays.
-    *   `--input_data_type` or `--input_data_types` specify the data types of
-        input arrays, which can be used if the input file does not already
-        specify them.
-    *   `--mean_value` or `--mean_values`, and `--std_value` or `--std_values`,
-        give the dequantization parameters of the input arrays, for the case
-        when the output file will accept quantized input arrays.
-*   *Transformation flags* specify options of the transformations to be applied
-    to the graph, i.e. they specify requested properties that the output file
-    should have.
-    *   `--inference_type` specifies the type of real-numbers arrays in the
-        output file. This only affects arrays of real numbers and allows to
-        control their quantization or dequantization, effectively switching
-        between floating-point and quantized arithmetic for the inference
-        workload, as far as real numbers are concerned. Other data types are
-        unaffected (e.g. plain integers, and strings).
-    *   `--inference_input_type` is like `--inference_type` but specifically
-        controlling input arrays, separately from other arrays. If not
-        specified, then `--inference_type` is used. The use case for specifying
-        `--inference_input_type` is when one wants to perform floating-point
-        inference on a quantized input, as is common in image models operating
-        on bitmap image inputs.
-    *   Some transformation flags allow to carry on with quantization when the
-        input graph is not properly quantized: `--default_ranges_min`,
-        `--default_ranges_max`, `--drop_fake_quant`,
-        `--reorder_across_fake_quant`.
-*   *Logging flags* described below.
-
-## Command-line flags complete reference
-
-### Mandatory flags
-
-*   `--input_format`. Type: string. Specifies the format of the input file.
-    Allowed values:
+*   [High-level flags](#high-level-flags)
+*   [Model flags](#model-flags)
+*   [Transformation flags](#transformation-flags)
+*   [Logging flags](#logging-flags)
+
+## High-level flags
+
+The following high level flags specify the location of the input and output
+files. The flag `--output_file` is always required. Additionally, either
+`--input_file` or `--savedmodel_directory` is required.
+
+*   `--savedmodel_directory`. Type: string. Specifies the full path to the
+    directory containing the SavedModel.
+*   `--savedmodel_tagset`. Type: string. Default:
+    [kSavedModelTagServe](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/tag_constants.h).
+    Specifies a comma-separated set of tags identifying the MetaGraphDef within
+    the SavedModel to analyze. All tags in the tag set must be specified.
+*   `--input_file`. Type: string. Specifies the path of the input file. This may
+    be either an absolute or a relative path.
+*   `--output_file`. Type: string. Specifies the path of the output file.
+
+The following high level flags specify the types of the input and output files:
+
+*   `--input_format`. Type: string. Default: `TENSORFLOW_GRAPHDEF`. Specifies
+    the format of the input file. Allowed values:
     *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Both
         binary and text proto formats are allowed.
-    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
-*   `--output_format`. Type: string. Specifies the format of the output file.
-    Allowed values:
+    *   `TFLITE` &mdash; The TensorFlow Lite FlatBuffers format.
+*   `--output_format`. Type: string. Default: `TFLITE`. Specifies the format of
+    the output file. Allowed values:
     *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Always
         produces a file in binary (not text) proto format.
-    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
+    *   `TFLITE` &mdash; The TensorFlow Lite FlatBuffers format.
         *   Whether a float or quantized TensorFlow Lite file will be produced
             depends on the `--inference_type` flag.
     *   `GRAPHVIZ_DOT` &mdash; The GraphViz `.dot` format. This asks the
@@ -95,11 +58,11 @@ on the input and output formats, additional flags may be allowed or mandatory:
             you get in your actual output format as opposed to just a merely
             plausible visualization of a model, consider using `--dump_graphviz`
             instead and keeping your true `--output_format`.
-*   `--input_file`. Type: string. Specifies the path of the input file. This may
-    be either an absolute or a relative path.
-*   `--output_file`. Type: string. Specifies the path of the output file.
 
-### Model flags
+## Model flags
+
+*Model flags* provide additional information about the model stored in the input
+file.
 
 *   `--output_array`. Type: string. Specifies a single array as the output
     activations. Incompatible with `--output_arrays`.
@@ -111,6 +74,10 @@ on the input and output formats, additional flags may be allowed or mandatory:
 *   `--input_arrays`. Type: comma-separated list of strings. Specifies a list of
     arrays as the input activations, for models with multiple inputs.
     Incompatible with `--input_array`.
+*   `--batch_size`. Type: integer. Default: 1. Specifies the batch size for the
+    model. Replaces the first dimension of an input size array if undefined. Use
+    only with SavedModels when neither `--input_shape` nor `input_shapes` flags
+    are specified. Incompatible with GraphDefs.
 
 When `--input_array` is used, the following flags are available to provide
 additional information about the single input array:
@@ -160,7 +127,11 @@ additional information about the multiple input arrays:
     the input arrays specified in `--input_arrays`, in the same order. See
     `--mean_value`, `--std_value` for details.
 
-### Transformation flags
+## Transformation flags
+
+*Transformation flags* specify options of the transformations to be applied to
+the graph, i.e. they specify requested properties that the output file should
+have.
 
 *   `--inference_type`. Type: string. Sets the type of real-number arrays in the
     output file, that is, controls the representation (quantization) of real
@@ -232,7 +203,7 @@ additional information about the multiple input arrays:
     graph transformations on them, at the cost of no longer faithfully matching
     inference and training arithmetic.
 
-### Logging flags
+## Logging flags
 
 The following are standard Google logging flags:
 
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
index 440f9c367c25726e20aa8828e3050cd1dc1b230d..f0fd638a618c75c75d336a746f9b1d8dccaea470 100644
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -1,5 +1,12 @@
 # TensorFlow Lite Optimizing Converter (TOCO) Python API reference
 
+This page provides examples on how to use TOCO via the Python API. It is
+complemented by the following documents:
+
+*   [README](../README.md)
+*   [Command-line examples](cmdline_examples.md)
+*   [Command-line glossary](cmdline_reference.md)
+
 ## High-level overview
 
 While the TensorFlow Lite Optimizing Converter can be used from the command
@@ -28,7 +35,7 @@ val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
 out = tf.identity(val, name="out")
 with tf.Session() as sess:
   tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("test.tflite", "wb").write(tflite_modeL)
+  open("test.tflite", "wb").write(tflite_model)
 ```
 
 **NOTE** Currently, the TOCO command will cause a fatal error to the Python
diff --git a/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
new file mode 100644
index 0000000000000000000000000000000000000000..a47c088991299159be39bc490149720dae43eb53
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"/><path fill="#f3f3f3" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path fill="#434343" d="m338.49512 374.66016q-0.609375 0 -1.171875 -0.140625q-0.546875 -0.15625 -0.96875 -0.421875q-0.25 -0.15625 -0.359375 -0.296875q-0.09375 -0.140625 -0.09375 -0.34375q0 -0.171875 0.09375 -0.28125q0.109375 -0.109375 0.265625 -0.109375q0.171875 0 0.46875 0.1875q0.40625 0.25 0.796875 0.390625q0.390625 0.140625 0.984375 0.140625q0.71875 0 1.109375 -0.25q0.40625 -0.265625 0.40625 -0.734375q0 -0.296875 -0.15625 -0.46875q-0.140625 -0.1875 -0.5 -0.328125q-0.359375 -0.140625 -1.046875 -0.296875q-1.171875 -0.25 -1.6875 -0.671875q-0.5 -0.421875 -0.5 -1.15625q0 -0.578125 0.3125 -1.015625q0.328125 -0.4375 0.890625 -0.6875q0.5625 -0.265625 1.28125 -0.265625q0.53125 0 1.015625 0.140625q0.484375 0.140625 0.859375 0.390625q0.453125 0.328125 0.453125 0.671875q0 0.171875 -0.109375 0.296875q-0.109375 0.125 -0.25 0.125q-0.15625 0 -0.484375 -0.234375q-0.375 -0.234375 -0.703125 -0.359375q-0.328125 -0.140625 -0.828125 -0.140625q-0.625 0 -1.015625 0.28125q-0.375 0.265625 -0.375 0.734375q0 0.296875 0.140625 0.484375q0.140625 0.171875 0.46875 0.3125q0.328125 0.140625 0.9375 0.28125q0.90625 0.1875 1.40625 0.4375q0.5 0.234375 0.703125 0.578125q0.21875 0.34375 0.21875 0.890625q0 0.828125 -0.703125 1.34375q-0.703125 0.515625 -1.859375 0.515625zm9.241241 -1.59375q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551147 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625zm6.157959 0.328125q0.15625 -0.3125 0.46875 -0.3125q0.203125 0 0.359375 0.140625q0.15625 0.125 0.15625 0.328125q0 0.109375 -0.046875 0.203125l-2.59375 5.609375q-0.078125 0.171875 -0.25 0.28125q-0.15625 0.09375 -0.34375 0.09375q-0.171875 0 -0.328125 -0.09375q-0.15625 -0.109375 -0.25 -0.28125l-2.59375 -5.609375q-0.046875 -0.09375 -0.046875 -0.1875q0 -0.203125 0.171875 -0.34375q0.1875 -0.15625 0.390625 -0.15625q0.140625 0 0.265625 0.078125q0.125 0.078125 0.1875 0.234375l2.234375 5.0l2.21875 -4.984375zm7.2099915 4.796875q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551453 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625z" fill-rule="nonzero"/><path fill="#d9d9d9" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path fill="#434343" d="m134.36497 56.831844q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm9.004181 -1.421875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.839676 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm5.84729 6.0625q-0.56248474 0 -1.0624847 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.87498474 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0624847 -0.234375 -1.5156097 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.1562347 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.56248474 0 -0.90623474 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84373474 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.2131653 0q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1288147 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm1.970398 6.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.721527 0.015625q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm12.222534 -4.9375q0.125 -0.28125 0.390625 -0.28125q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.078125 -0.03125 0.171875l-1.984375 5.046875q-0.078125 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.296875 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.65625 -4.21875l-1.640625 4.21875q-0.0625 0.15625 -0.203125 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.984375 -5.03125q-0.046875 -0.09375 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.171875 -0.140625 0.359375 -0.140625q0.296875 0 0.40625 0.296875l1.65625 4.421875l1.6875 -4.390625q0.078125 -0.15625 0.203125 -0.234375q0.125 -0.09375 0.265625 -0.09375q0.15625 0 0.28125 0.09375q0.125 0.078125 0.1875 0.234375l1.6875 4.375l1.65625 -4.40625zm12.637604 5.09375q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm4.4157715 0.015625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f3f3f3" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path fill="#434343" d="m409.42255 374.66803q-0.90625 0 -1.609375 -0.40625q-0.6875 -0.421875 -1.078125 -1.171875q-0.375 -0.765625 -0.375 -1.765625q0 -1.0 0.390625 -1.765625q0.40625 -0.78125 1.109375 -1.203125q0.703125 -0.4375 1.625 -0.4375q0.5 0 1.0 0.140625q0.5 0.140625 0.875 0.40625q0.234375 0.171875 0.328125 0.328125q0.109375 0.140625 0.109375 0.328125q0 0.1875 -0.109375 0.3125q-0.09375 0.109375 -0.25 0.109375q-0.09375 0 -0.203125 -0.046875q-0.09375 -0.046875 -0.171875 -0.09375q-0.078125 -0.0625 -0.09375 -0.078125q-0.359375 -0.234375 -0.671875 -0.359375q-0.3125 -0.140625 -0.765625 -0.140625q-0.96875 0 -1.515625 0.671875q-0.53125 0.65625 -0.53125 1.828125q0 1.171875 0.53125 1.8125q0.546875 0.640625 1.515625 0.640625q0.453125 0 0.78125 -0.125q0.328125 -0.140625 0.65625 -0.375q0.15625 -0.09375 0.28125 -0.15625q0.140625 -0.0625 0.234375 -0.0625q0.140625 0 0.234375 0.125q0.109375 0.109375 0.109375 0.296875q0 0.171875 -0.09375 0.3125q-0.09375 0.140625 -0.34375 0.3125q-0.375 0.25 -0.90625 0.40625q-0.515625 0.15625 -1.0625 0.15625zm4.2591553 -0.03125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -8.46875q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 8.46875q0 0.25 -0.15625 0.390625q-0.15625 0.140625 -0.375 0.140625zm3.092102 0q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 5.625q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125zm0 -8.09375q-0.3125 0 -0.515625 -0.171875q-0.203125 -0.1875 -0.203125 -0.5q0 -0.296875 0.203125 -0.484375q0.203125 -0.1875 0.515625 -0.1875q0.328125 0 0.515625 0.1875q0.203125 0.1875 0.203125 0.484375q0 0.3125 -0.203125 0.5q-0.1875 0.171875 -0.515625 0.171875zm7.5765076 6.53125q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.6020203 -0.84375q2.328125 0 2.328125 2.578125l0 3.609375q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -3.546875q0 -0.90625 -0.359375 -1.3125q-0.34375 -0.421875 -1.125 -0.421875q-0.890625 0 -1.421875 0.546875q-0.53125 0.546875 -0.53125 1.484375l0 3.25q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -5.625q0 -0.234375 0.140625 -0.375q0.15625 -0.15625 0.40625 -0.15625q0.234375 0 0.375 0.15625q0.140625 0.140625 0.140625 0.359375l0 0.6875q0.328125 -0.609375 0.890625 -0.921875q0.578125 -0.3125 1.3125 -0.3125zm7.304718 5.875q0.46875 0.03125 0.46875 0.421875q0 0.21875 -0.171875 0.34375q-0.171875 0.109375 -0.5 0.078125l-0.359375 -0.015625q-1.0625 -0.09375 -1.578125 -0.640625q-0.5 -0.5625 -0.5 -1.703125l0 -3.34375l-0.890625 0q-0.234375 0 -0.359375 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.203125 0.125 -0.3125q0.125 -0.125 0.359375 -0.125l0.890625 0l0 -1.515625q0 -0.25 0.140625 -0.390625q0.15625 -0.140625 0.40625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 1.515625l1.484375 0q0.203125 0 0.328125 0.125q0.140625 0.109375 0.140625 0.3125q0 0.1875 -0.140625 0.296875q-0.125 0.109375 -0.328125 0.109375l-1.484375 0l0 3.40625q0 0.734375 0.296875 1.0625q0.296875 0.3125 0.90625 0.359375l0.359375 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path fill="#000000" d="m237.0857 213.5031q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.417801 3.875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.199051 4.46875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm3.3865662 5.875q-0.171875 0 -0.28125 -0.09375q-0.109375 -0.09375 -0.109375 -0.21875q0 -0.140625 0.109375 -0.234375q0.109375 -0.09375 0.28125 -0.09375l5.21875 0q0.171875 0 0.28125 0.09375q0.109375 0.09375 0.109375 0.234375q0 0.125 -0.109375 0.21875q-0.109375 0.09375 -0.28125 0.09375l-5.21875 0zm11.2500305 -6.609375q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 5.09375q0 1.296875 -0.671875 1.96875q-0.671875 0.671875 -1.984375 0.671875q-1.28125 0 -2.140625 -0.515625q-0.421875 -0.234375 -0.421875 -0.546875q0 -0.171875 0.078125 -0.28125q0.09375 -0.109375 0.234375 -0.109375q0.125 0 0.4375 0.171875q0.421875 0.21875 0.828125 0.34375q0.40625 0.140625 0.96875 0.140625q0.859375 0 1.28125 -0.453125q0.4375 -0.453125 0.4375 -1.3125l0 -1.03125q-0.25 0.5625 -0.78125 0.859375q-0.515625 0.296875 -1.21875 0.296875q-0.765625 0 -1.359375 -0.359375q-0.59375 -0.359375 -0.9375 -1.015625q-0.328125 -0.65625 -0.328125 -1.515625q0 -0.875 0.328125 -1.53125q0.34375 -0.65625 0.9375 -1.015625q0.59375 -0.359375 1.359375 -0.359375q0.6875 0 1.203125 0.296875q0.515625 0.296875 0.78125 0.84375l0 -0.640625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625zm-2.28125 4.984375q0.84375 0 1.3125 -0.546875q0.484375 -0.5625 0.484375 -1.546875q0 -0.984375 -0.46875 -1.53125q-0.46875 -0.5625 -1.328125 -0.5625q-0.84375 0 -1.34375 0.5625q-0.484375 0.546875 -0.484375 1.53125q0 0.984375 0.484375 1.546875q0.5 0.546875 1.34375 0.546875zm7.4695435 -4.984375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.20282 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.331665 6.046875q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm5.2167664 -6.046875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.45282 -4.9375q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875z" fill-rule="nonzero"/><path fill="#f4cccc" d="m154.36745 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m154.36745 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m184.89111 339.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.160431 0.03125q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625zm9.214935 0.84375q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm8.077179 0q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m314.7006 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m303.37402 346.47687q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.674652 -6.046875q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.3300476 -5.28125q0.765625 0 1.34375 0.375q0.59375 0.359375 0.921875 1.046875q0.328125 0.6875 0.328125 1.59375q0 0.90625 -0.328125 1.59375q-0.328125 0.6875 -0.921875 1.078125q-0.578125 0.375 -1.34375 0.375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 0.640625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.203125q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.59375q0.46875 -0.59375 0.46875 -1.65625q0 -1.046875 -0.46875 -1.625q-0.46875 -0.578125 -1.328125 -0.578125q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.687164 -5.25q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.8726807 -1.71875q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm3.9360352 0q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm5.873535 6.328125q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m443.6039 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.908142 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m429.9527 346.47687q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.56604 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm4.282898 -0.015625q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.14032 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.5896606 4.53125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m371.61902 334.89435l41.417297 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m371.61902 334.89435l37.990234 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m409.60925 334.89435l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m587.0588 293.13934q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm2.8911743 4.46875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m584.63763 339.50812q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm5.0302734 -0.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m241.86351 334.89435l42.267715 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m241.86351 334.89435l38.840652 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m280.70413 334.89435l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#d9ead3" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m437.52493 135.68242l73.763794 0l0 31.748032l-73.763794 0z" fill-rule="evenodd"/><path fill="#000000" d="m448.0718 156.20241q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm8.3211975 -5.140625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.767517 -5.28125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm10.15921 0.75q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.691681 -5.71875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm4.902405 -0.328125q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.76532 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m544.40155 135.68242l100.0 0l0 31.748032l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m554.9328 156.26491q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm5.3845215 -6.046875q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.456726 -1.703125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.47876 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.283142 -5.265625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.782898 0q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.7008057 6.046875q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm6.029297 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.830017 -5.265625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm5.1851807 0q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m78.872284 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m78.872284 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m93.328064 272.6459q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm6.9353027 -6.078125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm8.578796 -4.96875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-2.34375 5.046875q-0.0625 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-2.328125 -5.046875q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm6.480545 4.296875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.589676 -3.28125q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm12.202805 -7.796875q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.359375l0 7.59375q0 0.21875 -0.125 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.328125 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -6.125l-2.59375 4.984375q-0.171875 0.34375 -0.5 0.34375q-0.3125 0 -0.484375 -0.34375l-2.625 -4.921875l0 6.0625q0 0.21875 -0.109375 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.34375 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.59375q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.3125 0 0.484375 0.34375l3.046875 5.84375l3.015625 -5.84375q0.09375 -0.1875 0.203125 -0.265625q0.125 -0.078125 0.28125 -0.078125zm4.8576965 8.59375q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.925674 -7.796875q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm9.06218 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm4.386551 5.296875q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m215.10997 150.37688q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm5.1568146 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2028046 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035553 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461807 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480301 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m260.00964 265.61465q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm8.9496765 -6.03125q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.767273 6.046875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.535065 -0.046875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.8396606 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125z" fill-rule="nonzero"/><path fill="#000000" d="m258.07846 275.1459q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.3749847 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84373474 0 1.5624847 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.1562347 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.1093597 0 2.0312347 -0.328125l0 -2.578125l-1.7499847 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.2343597 0zm5.15683 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2027893 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035706 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461792 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480316 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 232.16667l0 20.377945" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 232.16667l0 16.950867" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.85565 249.11754l-1.1246033 -1.124588l1.1246033 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#f4cccc" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m67.63894 87.62236q0.171875 0.15625 0.171875 0.359375q0 0.15625 -0.140625 0.296875q-0.140625 0.140625 -0.3125 0.140625q-0.15625 0 -0.328125 -0.140625l-4.484375 -3.921875l0 3.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 3.4375l4.28125 -3.796875q0.125 -0.140625 0.3125 -0.140625q0.171875 0 0.296875 0.140625q0.140625 0.140625 0.140625 0.3125q0 0.171875 -0.15625 0.328125l-3.875 3.421875l4.09375 3.5625zm5.8329315 -0.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.792801 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m127.74803 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.74803 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m147.45874 88.37367q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.484375 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.015625 0l0 2.9375l3.78125 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.78125 0l0 3.078125l4.015625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.484375 0zm8.31218 0.078125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.4787903 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm1.8769073 0.765625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125zm6.0990753 0q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.8144073 0.78125q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1287994 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m269.00754 88.46742q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm5.0446777 -0.03125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm2.784027 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m297.8283 154.87688q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm7.358429 -6.078125q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm8.37854 4.625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.308441 5.3125q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm7.998047 -0.84375q0.203125 0.171875 0.203125 0.375q0 0.1875 -0.125 0.328125q-0.125 0.125 -0.3125 0.125q-0.15625 0 -0.328125 -0.140625l-3.125 -2.703125l0 2.359375q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 4.875l2.859375 -2.625q0.15625 -0.140625 0.328125 -0.140625q0.1875 0 0.3125 0.140625q0.140625 0.125 0.140625 0.296875q0 0.203125 -0.171875 0.359375l-2.375 2.109375l2.59375 2.265625zm4.2812805 -5.21875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm6.67157 0.796875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm4.722534 0.78125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.5660706 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.361267 0.78125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m233.1085 268.03217l-66.74016 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m233.10852 268.03217l-63.313095 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m169.79543 268.03217l1.124588 -1.1246033l-3.0897675 1.1246033l3.0897675 1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m75.62205 99.34843l0 19.652092l46.992126 0l0 133.54475" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62205 99.34843l0 19.652084l46.992126 0l0 130.11768" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.614174 249.1182l-1.124588 -1.124588l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m171.49606 99.34974l0 19.650558l-48.88189 0l0 133.5463" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m171.49606 99.34974l0 19.650558l-48.88189 0l0 130.1192" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.614174 249.1195l-1.124588 -1.124588l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m233.89502 131.35573l-1.124588 -1.124588l1.124588 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m326.24146 131.35573l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m579.47955 247.1612q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm8.868103 0q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm12.917175 7.953125q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m589.5417 213.87056q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7480469 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7479858 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m122.620316 283.52823l0 14.9730835l75.49606 0l0 20.90091" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m122.620316 283.52823l0 14.9730835l75.49608 0l0 17.473846" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m198.1164 315.97516l-1.124588 -1.1246033l1.124588 3.0897827l1.1245728 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85654 283.52823l0 14.9730835l-78.74016 0l0 20.90091" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85654 283.52823l0 14.9730835l-78.74014 0l0 17.473846" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m198.1164 315.97516l-1.124588 -1.1246033l1.124588 3.0897827l1.1245728 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 0.06298828l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 334.95734l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -41.858246l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -41.858246l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 293.0361l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1246033z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -83.74802l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -83.74802l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 251.14633l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -125.60629l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 209.28806l-1.1245728 1.124588l3.0897827 -1.124588l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m233.88803 165.75919l0 17.70752l42.960632 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.88805 165.75919l0 17.70752l42.960617 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.84866 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.86572 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index d38db85280d7bd935a47cda70227d383a513fbac..0fffab574ddd8ad75ec07ae4442f363a36ed289e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -33,6 +33,11 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   if (conv_op->stride_width != conv_op->stride_height) {
     return false;
   }
+  if ((conv_op->dilation_width_factor != 1) ||
+      (conv_op->dilation_height_factor != 1)) {
+    // Depthwise conv does not support dilation
+    return false;
+  }
   auto& weights_array = model->GetArray(conv_op->inputs[1]);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81cedb5dad751aacbbb32326db73de386aba282d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Replaces a tf.squeeze operator with a reshape.
+// Squeeze removes dimensions == 1 (if in the list of squeeze_dims). This
+// means that the data layout will never change with this op, just the shape.
+// By converting these to reshapes once we have run shape propagation we allow
+// standard reshape optimization transforms to do their magic.
+bool ConvertSqueezeToReshape::Run(Model* model, std::size_t op_index) {
+  auto squeeze_it = model->operators.begin() + op_index;
+  if (squeeze_it->get()->type != OperatorType::kSqueeze) {
+    return false;
+  }
+  auto squeeze_op = static_cast<SqueezeOperator*>(squeeze_it->get());
+  CHECK_EQ(squeeze_op->inputs.size(), 1);
+  CHECK_EQ(squeeze_op->outputs.size(), 1);
+
+  const auto& input_array = model->GetArray(squeeze_op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return false;
+  }
+  if (input_array.shape().dimensions_count() == 0) {
+    // Input array cannot be 0-D.
+    return false;
+  }
+  if (!model->HasArray(squeeze_op->outputs[0]) ||
+      !model->GetArray(squeeze_op->outputs[0]).has_shape()) {
+    // Yield until shape propagation has set the output shape for us.
+    return false;
+  }
+
+  // We use the output shape that has been calculated by shape propagation.
+  const auto& output_shape = model->GetArray(squeeze_op->outputs[0]).shape();
+
+  // Empty shapes will not work as empty data arrays.
+  if (output_shape.dimensions_count() == 0) {
+    return false;
+  }
+
+  auto* reshape_op = new TensorFlowReshapeOperator;
+  reshape_op->inputs = {
+      squeeze_op->inputs[0],
+      CreateInt32Array(model, squeeze_op->outputs[0] + "_shape",
+                       output_shape.dims()),
+  };
+  reshape_op->outputs = squeeze_op->outputs;
+
+  AddMessageF("Replacing %s with %s", LogName(*squeeze_op),
+              LogName(*reshape_op));
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(squeeze_it, reshape_op);
+  squeeze_it = reshape_it + 1;
+  CHECK_EQ(squeeze_it->get(), squeeze_op);
+  model->operators.erase(squeeze_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
index c2b166033c33b777bad88cb712adf8517be1762a..5a36a90b3841504d6f018832777e50bac95218d7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
@@ -21,6 +21,33 @@ limitations under the License.
 
 namespace toco {
 
+namespace {
+
+bool TransposeAffectsMemoryOrder(std::vector<int> perm,
+                                 std::vector<int> in_shape) {
+  CHECK_EQ(perm.size(), in_shape.size());
+  // See what the ordering of the non-unary columns are before and after
+  // transpose permutation. If the major indices stay in the same order (not
+  // just the shape) then the flat buffer representation shouldn't change.
+  std::vector<int> old_major_index_ordering;
+  std::vector<int> new_major_index_ordering;
+  for (int i = 0; i < in_shape.size(); i++) {
+    if (in_shape[i] != 1) {
+      old_major_index_ordering.push_back(i);
+    }
+
+    if (in_shape[perm[i]] != 1) {
+      new_major_index_ordering.push_back(perm[i]);
+    }
+  }
+
+  CHECK_EQ(new_major_index_ordering.size(), old_major_index_ordering.size());
+
+  return old_major_index_ordering != new_major_index_ordering;
+}
+
+}  // namespace
+
 bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   auto transpose_it = model->operators.begin() + op_index;
   if (transpose_it->get()->type != OperatorType::kTranspose) {
@@ -29,23 +56,26 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   TransposeOperator* transpose_op =
       static_cast<TransposeOperator*>(transpose_it->get());
 
+  const auto& input_array = model->GetArray(transpose_op->inputs[0]);
   const auto& output_array = model->GetArray(transpose_op->outputs[0]);
-  if (!output_array.has_shape()) {
+  if (!input_array.has_shape() || !output_array.has_shape()) {
     // Yield until PropagateFixedSizes has been run on this op.
     return false;
   }
   // Note: We can assume we have error checked inputs in PropagateFixedSizes.
 
-  // This transpose is trivial if we only have one non-unitary dimension.
-  std::vector<int> const& dims = output_array.shape().dims();
-  unsigned non_unitary_axis_count = 0;
-  for (int i = 0; i < dims.size(); i++) {
-    if (dims[i] != 1) {
-      non_unitary_axis_count++;
-    }
+  // Check that the permutation has propogated.
+  std::vector<int> const& perm = transpose_op->perm;
+  if (perm.empty()) {
+    return false;
   }
-  if (non_unitary_axis_count > 1) {
-    // Transpose is not trivial
+
+  // This transpose is trivial if non-unitary dimensions remain in the same
+  // order.
+  std::vector<int> const& input_dims = input_array.shape().dims();
+  std::vector<int> const& output_dims = output_array.shape().dims();
+
+  if (TransposeAffectsMemoryOrder(perm, input_dims)) {
     return false;
   }
 
@@ -61,11 +91,11 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   string shape_array_name = toco::AvailableArrayName(*model, perm_array_name);
   Array& shape_array = model->GetOrCreateArray(shape_array_name);
   *(shape_array.mutable_shape()->mutable_dims()) = {
-      1, static_cast<int>(dims.size())};
+      1, static_cast<int>(output_dims.size())};
   reshape_op->inputs.push_back(shape_array_name);
   shape_array.data_type = ArrayDataType::kInt32;
   auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
-  shape_buffer.data = dims;
+  shape_buffer.data = output_dims;
 
   // Delete perm array if unused
   if (IsDiscardableArray(*model, perm_array_name) &&
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
index badefeca883b1e1d67f7de5276389c5e6e7f7cd3..708ecf6e0a96811ab274fbb25f748f562cd3afad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
@@ -47,7 +47,7 @@ bool EnsureBiasVectors::Run(Model* model, std::size_t op_index) {
       op->type == OperatorType::kDepthwiseConv ||
       op->type == OperatorType::kFullyConnected) {
     if (ProcessLinearOperator(model, op)) {
-      AddMessageF("Added bias vector to %s", LogName(*op));
+      AddMessageF("Added bias vector to %s as %s", LogName(*op), op->inputs[2]);
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..394fa349e2663e2806344f27a96a5132a2d4a810
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// === Summary ===
+//
+// TLDR: Some of our 8-bit arithmetic operations require uint8 weight values
+// to avoid the value 0, thus ranging only in [1, 255]. This enables faster
+// runtime arithmetic kernels on ARM NEON. This is not relevant on most
+// other hardware architectures, and will cease to be relevant on ARM NEON
+// in the future. These topics are elaborated below ("Context").
+//
+// Having just one isolated uint8 value equal to 0 is fine. The bad case is when
+// two uint8 values are both zero and are less than 16 bytes apart.
+//
+// By default, toco generates a fatal error when that happens. The user may opt
+// in to more lax behavior by passing
+//   --allow_nudging_weights_to_use_fast_gemm_kernel.
+// This causes toco to nudge such bad 0 values into the value 1, thus avoiding
+// the problem in exchange for compromising on accuracy.
+//
+// The present graph transformation implements both the default fatal-erroring
+// behavior, and, when allow_nudging_weights is set, also the lax nudging
+// behavior.
+//
+//
+// === Context ===
+//
+// Since March 2017, we have been using a trick to perform faster
+// 8bit matrix multiplications, to our knowledge first implemented in gemmlowp
+// here:
+//   https://github.com/google/gemmlowp/commit/25b2989415b99e797e1ab977837111b2e231f81f
+//
+// This trick is explained in Appendix B of our paper,
+//   https://arxiv.org/abs/1712.05877
+//
+// Here is the relevant paragraph:
+//
+//      For efficient NEON implementation of the matrix multiplication’s
+//      core accumulation, we use the following trick.
+//      In the multiply-add operation in (10), we first change the
+//      operands’ type from uint8 to int8 (which can be done by
+//      subtracting 128 from the quantized values and zero-points).
+//      Thus the core multiply-add becomes
+//
+//            int32 += int8 * int8. (B.1)
+//
+//      As mentioned in section 3, with a minor tweak of the quantized
+//      training process, we can ensure that the weights, once
+//      quantized as int8 values, never take the value −128. Hence,
+//      the product in (B.1) is never −128 ∗ −128, and is therefore
+//      always less than 2^14 in absolute value. Hence, (B.1)
+//      can accumulate two products on a local int16 accumulator
+//      before that needs to be accumulated into the true int32 accumulator.
+//      This allows the use of an 8-way SIMD multiplication
+//      (SMULL on int8 operands), followed by an 8-way
+//      SIMD multiply-add (SMLAL on int8 operands), followed
+//      by a pairwise-add-and-accumulate into the int32 accumulators
+//      (SADALP).
+//
+// As that paragraph notes, quantized training should be suitably modified to
+// ensure that quantized uint8 weights value only range in [1, 255]. So the
+// problem that we are dealing with is only about the existing 8-bit quantized
+// models that haven't been trained specifically to get 8-bit weights only in
+// [1, 255].
+//
+// This spreadsheet shows the speed benefit of this trick across many existing
+// ARM-architecture CPUs:
+//
+//    https://docs.google.com/spreadsheets/d/1-0LjdMvW0XtH1bYknC0bQINoFaxjTuL9eplZZcitykI/edit?usp=sharing
+//
+// Compare Row 18 (fast int8 trick) to Row 20 (regular uint8 kernel).
+//
+// The introduction of the 'dotprod' extension to ARM NEON, specifically the
+// SDOT instruction, renders this eventually moot. See the experimental
+// kernels contributed by ARM here,
+//
+//     https://github.com/google/gemmlowp/pull/116
+//
+// However, as of April 2018, there don't seem to be any commercially available
+// CPU supporting these instructions (yet); we are waiting for
+// Cortex-A{75,55}-r1 to become available; the "-r1" is key here. Even if such
+// CPUs become available soon, it will presumably take years for them to
+// overtake the large volume of existing CPUs not supporting these new
+// instructions, especially in current and future low-end devices. All in all,
+// we can foresee these 'fast int8 kernels' to remain important to have into
+// the 2020s.
+//
+bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
+                                                   std::size_t op_index) {
+  const auto& op = *model->operators[op_index];
+  int weights_index = 0;
+  switch (op.type) {
+    case OperatorType::kConv:
+      weights_index = 1;
+      break;
+    case OperatorType::kLstmCell:
+      weights_index = 2;
+      break;
+    case OperatorType::kFullyConnected: {
+      weights_index = 1;
+      const auto& fc_op = static_cast<const toco::FullyConnectedOperator&>(op);
+      CHECK(!fc_op.experimental_shuffled_weights)
+          << "This graph transformation expects to run before FC weights get "
+             "shuffled.";
+      break;
+    }
+    default:
+      // Other operator types are unaffected by this graph transformation,
+      // because their runtime implementations don't use the fast int8 trick.
+      // In particular that's the case of DepthwiseConv at the moment.
+      // We have to update this logic when that changes, e.g. if in the future
+      // some DepthwiseConv kernel wants to use the trick.
+      //
+      // The reason why that's not so likely, hence why it's fairly safe to
+      // stay conservative in the list of operators that we handle here, is that
+      // the fast int8 kernel trick is only applicable to ops that either are
+      // implemented as a GEMM, or use symmetric ranges for both weights and
+      // activations. The reason why GEMM is special (can use the trick even
+      // without symmetric ranges) is that it is so arithmetic-intense that
+      // it can use techniques reducing its implementation to the symmetric
+      // ranges case, with limited relative overhead (O(N^2) overhead vs
+      // O(N^3) GEMM cost). See https://arxiv.org/pdf/1712.05877, section
+      // 2.3 Efficient handling of zero-points.
+      //
+      // That's why at the moment we only handle operators that use a GEMM
+      // (Conv, fully-connected --- note that LSTM merely wraps a
+      // fully-connected operator).
+      return false;
+  }
+
+  const string& name = op.inputs[weights_index];
+  auto& array = model->GetArray(name);
+  if (!array.buffer) {
+    return false;
+  }
+  if (array.data_type != ArrayDataType::kUint8) {
+    return false;
+  }
+  auto& buffer_data = array.GetMutableBuffer<ArrayDataType::kUint8>().data;
+
+  int count_bad = 0;
+  int index_of_previous_bad_value = 0;
+  bool changed = false;
+
+  for (int i = 0; i < buffer_data.size(); i++) {
+    if (buffer_data[i] == 0) {
+      count_bad++;
+      if (count_bad > 1) {
+        const int distance = i - index_of_previous_bad_value;
+        // Semi-arbitrary threshold. The idea is that trouble only occurs
+        // when two bad values are very close to each other so that they
+        // are jointly used within registers inside some GEMM kernel.
+        // The details of that depend on the kernel. Our current fast ARM64
+        // kernel, for instance, only has an issue when the distance between
+        // consecutive bad values is exactly 8. We do not want to track such
+        // kernel details too closely here, so we pick a threshold that's
+        // a bit larger than that, to give us room to change kernels in the
+        // future without worrying.
+        static constexpr int kMinDistanceBetweenBadValues = 16;
+        if (distance < kMinDistanceBetweenBadValues) {
+          if (allow_nudging_weights()) {
+            buffer_data[i] = 1;
+            changed = true;
+            continue;
+          }
+          LOG(FATAL) << "Bad value for " << name << " at index " << i
+                     << ", previous bad value at index "
+                     << index_of_previous_bad_value << ", distance=" << distance
+                     << ", kMinDistanceBetweenBadValues="
+                     << kMinDistanceBetweenBadValues << ". Consider passing "
+                     << "--allow_nudging_weights_to_use_fast_gemm_kernel "
+                     << "if you don't care about accuracy.";
+        }
+      }
+      index_of_previous_bad_value = i;
+    }
+  }
+
+  if (changed) {
+    AddMessageF("Tweaked weights values for %s", LogName(op));
+  }
+
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c00cdcb944b085dda41033b95c96537cc2e047c3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
@@ -0,0 +1,158 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
+  Operator* op = model->operators[op_index].get();
+  if (op->type != OperatorType::kFullyConnected) {
+    return false;
+  }
+  FullyConnectedOperator* fc_op = static_cast<FullyConnectedOperator*>(op);
+  // Exit if this FC op already has shuffled weights
+  if (fc_op->experimental_shuffled_weights) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(fc_op->inputs[0]);
+  const string& weights_name = fc_op->inputs[1];
+  Array& weights_array = model->GetArray(weights_name);
+  const Array& output_array = model->GetArray(fc_op->outputs[0]);
+  // Exit if this FC op isn't quantized with uint8 inputs and int16 outputs,
+  // the only case where we are currently interested in providing a fast path
+  // with shuffled weights.
+  if (input_array.data_type != ArrayDataType::kUint8 ||
+      weights_array.data_type != ArrayDataType::kUint8 ||
+      output_array.data_type != ArrayDataType::kInt16 ||
+      !input_array.quantization_params || !weights_array.quantization_params ||
+      !output_array.quantization_params) {
+    return false;
+  }
+  // Exit if the shapes aren't known
+  if (!input_array.has_shape() || !weights_array.has_shape()) {
+    return false;
+  }
+  // Exit if, based on the known shapes, this FC op is not a GEMV.
+  // The shuffling of FC weights is only useful to enable fast GEMV paths.
+  const Shape& input_shape = input_array.shape();
+  for (int i = 1; i < input_shape.dimensions_count() - 1; i++) {
+    if (input_shape.dims(i) != 1) {
+      // The input activations, shaped as a matrix, have multiple columns.
+      // This FC op isn't a matrix*vector multiplication.
+      AddMessageF(
+          "Not applying experimental shuffling to the weights of %s because "
+          "the input shape is not 1D or 2D (possibly with additional inner "
+          "dimensions of size 1)",
+          LogName(*op));
+      return false;
+    }
+  }
+  if (input_shape.dims(0) != 1 && input_shape.dims(0) != 4) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because "
+        "the input shape's leading dimension, i.e. the 'batch size', is not "
+        "equal to 1 or 4",
+        LogName(*op));
+    return false;
+  }
+  // Exit if the weights shape isn't an integral multiple of the shuffled
+  // block shape, 4x16. We don't want to have to write code dealing with
+  // odd sizes, that would go un-exercised at the moment as the models
+  // for which we need this shuffling have shapes that are multiples of that
+  // 4x16 block size. In fact, much of the rationale for this shuffling is
+  // to avoid cache aliasin issue with large power-of-two depths, with our
+  // models motivating this shuffling having FC weights shapes like
+  // 4096x2048. Thus, if some model doesn't get the shuffling because of that
+  // size requirement, that might be just fine --- that model might just not
+  // suffer from that cache aliasing issue that we have with large powers of
+  // two.
+  const Shape& weights_shape = weights_array.shape();
+  if (weights_shape.dimensions_count() != 2) {
+    return false;
+  }
+  const int rows = weights_shape.dims(0);
+  const int cols = weights_shape.dims(1);
+  if (rows % 4 || cols % 16) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because its "
+        "shape isn't a multiple of the shuffling block shape, 4x16",
+        LogName(*op));
+    return false;
+  }
+  // Exit if the weights aren't already a constant array.
+  if (!weights_array.buffer) {
+    return false;
+  }
+  // Exit if the weights are used by more than one op.
+  if (CountOpsWithInput(*model, weights_name) != 1) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because that "
+        "array is consumed by other operators",
+        LogName(*op));
+    return false;
+  }
+  // Compute the shuffled weights
+  auto& weights_data =
+      weights_array.GetMutableBuffer<ArrayDataType::kUint8>().data;
+  CHECK_EQ(rows * cols, weights_data.size());
+  std::vector<uint8> shuffled_data(weights_data.size());
+  uint8* shuffled_data_ptr = shuffled_data.data();
+  for (int r = 0; r < rows; r += 4) {
+    for (int c = 0; c < cols; c += 16) {
+      for (int i = 0; i < 4; i++) {
+        const uint8* src_data_ptr = weights_data.data() + (r + i) * cols + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the runtime will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_data_ptr++ = dst_val;
+        }
+      }
+    }
+  }
+  CHECK_EQ(shuffled_data_ptr, shuffled_data.data() + rows * cols);
+  // Switch this FC op to using the shuffled weights.
+  weights_data = std::move(shuffled_data);
+  fc_op->experimental_shuffled_weights = true;
+  AddMessageF("Applied experimental shuffling to the weights of %s",
+              LogName(*op));
+  // Add a second output array to this FC op, serving as a workspace to perform
+  // runtime shuffling/xoring of its input activations.
+  CHECK_EQ(fc_op->outputs.size(), 1);
+  const string& shuffled_input_workspace_array_name =
+      AvailableArrayName(*model, fc_op->inputs[0] + "_shuffled");
+  fc_op->outputs.push_back(shuffled_input_workspace_array_name);
+  auto& shuffled_input_workspace_array =
+      model->GetOrCreateArray(shuffled_input_workspace_array_name);
+  shuffled_input_workspace_array.data_type = input_array.data_type;
+  *shuffled_input_workspace_array.mutable_shape() = input_array.shape();
+  shuffled_input_workspace_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  shuffled_input_workspace_array.GetOrCreateQuantizationParams() =
+      input_array.GetQuantizationParams();
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index ab943f72d1dd87ae9ff4bd53a807cd4923a88c38..c5ce3fcd95eb0aaf63dcc7f43b96d8a13ed93929 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -42,9 +42,9 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
 
   if (CountTrueOutputs(*model, *op) > 1) {
     AddMessageF(
-        "Not fusing activation function into %s because it has more than one "
-        " consumed output",
-        LogName(*op));
+        "Not fusing activation function %s into %s because it has more than "
+        "one  consumed output",
+        LogName(*ac_op), LogName(*op));
     return false;
   }
 
@@ -56,22 +56,31 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
     AddMessageF(
         "Not fusing activation function into %s because it is consumed by more "
         "than 1 other operator",
-        LogName(*op));
+        LogName(*ac_op), LogName(*op));
+    return false;
+  }
+
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    AddMessageF(
+        "Not fusing activation function %s into %s because output %s it is not "
+        "discardable",
+        LogName(*ac_op), LogName(*op), op->outputs[0]);
     return false;
   }
 
   if (op->fused_activation_function != FusedActivationFunctionType::kNone) {
     AddMessageF(
-        "Not fusing activation function into %s because it already has a fused "
-        "activation function",
-        LogName(*op));
+        "Not fusing activation function %s into %s because it already has a "
+        "fused activation function",
+        LogName(*ac_op), LogName(*op));
     return false;
   }
 
   if (!OperatorSupportsFusedActivation(op->type)) {
     AddMessageF(
-        "Not fusing activation function because the %s op doesn't support it",
-        LogName(*op));
+        "Not fusing activation function %s because the %s op doesn't support "
+        "it",
+        LogName(*ac_op), LogName(*op));
     return false;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index 5b57178b18d2d60e1f301a1a8b257d8057618550..76c6be00d407ca30b898d088c9fa34cd7f76f656 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -50,7 +50,17 @@ void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
 
   // TODO(b/62904716): Bias array should become 1-D when padding removed.
   const int depth = bias_shape.dims(bias_shape.dimensions_count() - 1);
-  CHECK_EQ(depth, operand_shape.dims(operand_shape.dimensions_count() - 1));
+  int operand_channel_increment = 0;
+  if (operand_shape.dimensions_count() >= 1 &&
+      operand_shape.dims(operand_shape.dimensions_count() - 1) ==
+          bias_shape.dims(bias_shape.dimensions_count() - 1)) {
+    operand_channel_increment = 1;
+  } else if (operand_shape.dimensions_count() == 0 ||
+             operand_shape.dims(operand_shape.dimensions_count() - 1) == 1) {
+    operand_channel_increment = 0;
+  } else {
+    LOG(FATAL) << "Operand shape mismatch.";
+  }
 
   enum class OpType { BiasPlusOperand, BiasMinusOperand, OperandMinusBias };
 
@@ -60,9 +70,10 @@ void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
                                   ? OpType::BiasMinusOperand
                                   : OpType::OperandMinusBias;
 
+  int operand_channel = 0;
   for (int i = 0; i < depth; i++) {
     float& bias_val = bias_data[i];
-    const float operand_val = operand_data[i];
+    const float operand_val = operand_data[operand_channel];
     if (optype == OpType::BiasPlusOperand) {
       bias_val += operand_val;
     } else if (optype == OpType::BiasMinusOperand) {
@@ -72,6 +83,7 @@ void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
     } else {
       LOG(FATAL) << "Should not get here.";
     }
+    operand_channel += operand_channel_increment;
   }
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 616bdac268c41d29135368d685729c961f44132b..72ffd51db45d0808feab3d07436c13db2420a680 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -114,6 +114,7 @@ void RunGraphTransformations(Model* model, const string& message,
 // List of all graph transformations
 DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
+DECLARE_GRAPH_TRANSFORMATION(ConvertSqueezeToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialAddNToAdd)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialStackToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialTransposeToReshape)
@@ -127,9 +128,14 @@ DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
 DECLARE_GRAPH_TRANSFORMATION(SplitLstmCellInputs)
 DECLARE_GRAPH_TRANSFORMATION(MergeLstmCellInputs)
+DECLARE_GRAPH_TRANSFORMATION(MergeReshapeIntoPrecedingTranspose)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyPRelu)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyDilatedConv)
 DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
+DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
+DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits);
 DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
 DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
 DECLARE_GRAPH_TRANSFORMATION(Quantize)
@@ -139,8 +145,10 @@ DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialBinaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialSlice)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedMinMax)
 DECLARE_GRAPH_TRANSFORMATION(RemoveUnusedOp)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchNormalization)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantBinaryOperator)
@@ -148,7 +156,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
-DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(ReorderElementwiseUnary)
+DECLARE_GRAPH_TRANSFORMATION(ReorderReshapeTranspose)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
@@ -156,8 +165,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMerge)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSqueezeAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
-DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantReshape)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTranspose)
 DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
@@ -169,13 +178,35 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTransposeAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRandomUniform)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRange)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantShapeOrRank)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStack)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStridedSlice)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFill)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero)
 DECLARE_GRAPH_TRANSFORMATION(Dequantize)
+DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
+DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights)
+
+class PropagateDefaultMinMax : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "PropagateDefaultMinMax"; }
+
+  bool has_any_ranges_defined() const { return !type_ranges_.empty(); }
+  void DefineTypeRange(ArrayDataType data_type, double min, double max) {
+    MinMax minmax;
+    minmax.min = min;
+    minmax.max = max;
+    type_ranges_.emplace_back(data_type, minmax);
+  }
+
+ private:
+  bool SetArrayMinMax(const string& array_name, Array* array);
+  std::vector<std::pair<ArrayDataType, MinMax>> type_ranges_;
+};
 
 class ResolveReshapeAttributes : public GraphTransformation {
  public:
@@ -198,6 +229,36 @@ class RemoveTrivialReshape : public GraphTransformation {
   bool treat_expand_dims_as_trivial_ = false;
 };
 
+class ResolveConstantFakeQuant : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "ResolveConstantFakeQuant"; }
+
+  // True if the num_bits should adjust the final data type.
+  bool propagate_fake_quant_num_bits() const {
+    return propagate_fake_quant_num_bits_;
+  }
+  void set_propagate_fake_quant_num_bits(bool val) {
+    propagate_fake_quant_num_bits_ = val;
+  }
+
+ private:
+  bool propagate_fake_quant_num_bits_ = false;
+};
+
+class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override {
+    return "EnsureUint8WeightsSafeForFastInt8Kernels";
+  }
+  bool allow_nudging_weights() const { return allow_nudging_weights_; }
+  void set_allow_nudging_weights(bool val) { allow_nudging_weights_ = val; }
+
+ private:
+  bool allow_nudging_weights_ = false;
+};
+
 #undef DECLARE_GRAPH_TRANSFORMATION
 
 }  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 1b0be858107b54f5a6ecd2a1cb87c9dbde1c06bb..437e30a91803bfc847bf246875fa2924b7c0d3fe 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -95,36 +95,64 @@ bool HardcodeMinMaxForConcatenation(Model* model, Operator* op) {
   overall_minmax.min = overall_min;
   overall_minmax.max = overall_max;
   bool changed = false;
-  for (const auto& input : op->inputs) {
-    auto& array = model->GetArray(input);
-    if (!array.minmax) {
-      changed = true;
-    } else if (!(overall_minmax == array.GetMinMax())) {
-      changed = true;
-      LOG(WARNING)
-          << "Tweaking the MinMax of array " << input << ", which is "
-          << "an input to " << LogName(*op) << ", because we want all inputs "
-          << "and outputs of a Concatenation operator to have the same MinMax "
-          << "so that it can be implemented as a pure byte-copy, no "
-             "arithmetic.";
+  if (model->flags.change_concat_input_ranges()) {
+    for (const auto& input : op->inputs) {
+      auto& array = model->GetArray(input);
+      if (!array.minmax) {
+        changed = true;
+      } else if (!(overall_minmax == array.GetMinMax())) {
+        changed = true;
+        LOG(WARNING)
+            << "Tweaking the MinMax of array " << input << ", which is "
+            << "an input to " << LogName(*op) << ", because we want all inputs "
+            << "and outputs of a Concatenation operator to have the same "
+            << "MinMax so that it can be implemented as a pure byte-copy, no "
+               "arithmetic.";
+      }
+      array.GetOrCreateMinMax() = overall_minmax;
     }
-    array.GetOrCreateMinMax() = overall_minmax;
   }
   if (!output.minmax) {
     changed = true;
   } else if (!(overall_minmax == output.GetMinMax())) {
-    changed = true;
-    LOG(WARNING)
-        << "Tweaking the MinMax of the output array of " << LogName(*op)
-        << ", because we want all inputs "
-        << "and outputs of a Concatenation operator to have the same MinMax "
-        << "so that it can be implemented as a pure byte-copy, no arithmetic.";
+    if (model->flags.change_concat_input_ranges()) {
+      changed = true;
+      LOG(WARNING)
+          << "Tweaking the MinMax of the output array of " << LogName(*op)
+          << ", because we want all inputs "
+          << "and outputs of a Concatenation operator to have the same MinMax "
+          << "so that it can be implemented as a pure byte-copy, no "
+          << "arithmetic.";
+    } else {
+      return false;
+    }
   }
   output.GetOrCreateMinMax() = overall_minmax;
 
   return changed;
 }
 
+bool HardcodeMinMaxForSplit(Model* model, Operator* op) {
+  for (const auto& output : op->outputs) {
+    if (model->GetArray(output).minmax) {
+      LOG(WARNING) << "Skipping min-max setting for " << LogName(*op)
+                   << " because output " << output << " already has min-max.";
+      return false;
+    }
+  }
+  // Data is in second input.
+  auto& input_array = model->GetArray(op->inputs[1]);
+  if (!input_array.minmax) {
+    return false;
+  } else {
+    for (const auto& output : op->outputs) {
+      auto& array = model->GetArray(output);
+      array.GetOrCreateMinMax() = *input_array.minmax;
+    }
+    return true;
+  }
+}
+
 // The output of average or max pooling is within the same range as its input.
 bool HardcodeMinMaxForAverageOrMaxPool(Model* model, Operator* op) {
   auto& output_array = model->GetArray(op->outputs[0]);
@@ -202,8 +230,11 @@ bool PropagateMinMaxAmongArrays(Model* model,
     if (array.minmax) {
       CHECK(*array.minmax == *reference_minmax)
           << "Both the following arrays have minmax, and they disagree: "
-          << reference_array_name << " and " << array_name
-          << ". Expected that either only one of them would have minmax, or at "
+          << reference_array_name << " (" << reference_minmax->min << ","
+          << reference_minmax->max << ") and " << array_name << " ("
+          << array.minmax->min << "," << array.minmax->max
+          << "). Expected that either only one of them would have minmax, or "
+             "at "
              "least that they would agree.";
     } else {
       array.GetOrCreateMinMax() = *reference_minmax;
@@ -296,14 +327,22 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       changed = HardcodeMinMaxForConcatenation(model, op);
       break;
 
+    case OperatorType::kTensorFlowSplit:
+      changed = HardcodeMinMaxForSplit(model, op);
+      break;
+
     case OperatorType::kAveragePool:
     case OperatorType::kMaxPool:
       changed = HardcodeMinMaxForAverageOrMaxPool(model, op);
       break;
 
+    case OperatorType::kStridedSlice:
     case OperatorType::kSqueeze:
     case OperatorType::kTensorFlowReshape:
     case OperatorType::kPad:
+    case OperatorType::kGather:
+    case OperatorType::kTranspose:
+    case OperatorType::kMean:
       changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_dilated_conv.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_dilated_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae3301f467de5714230e731b4bab87ddc1637201
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_dilated_conv.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// A dilated convolution can be emulated with a regular convolution by chaining
+// SpaceToBatch and BatchToSpace ops before and after it:
+//
+//     SpaceToBatchND -> Conv2D -> BatchToSpaceND
+//
+// This method was common before Conv2D fully supported dilated convolution in
+// TensorFlow. This transformation detects this "emulation", and replaces it
+// with a true dilated convolution, eliminating the SpaceToBatch and
+// BatchtoSpace ops.
+//
+// Detecting this alone would be relatively easy. However, in practice some
+// extra ops are used, so we detect the following patterns:
+//
+//
+//   SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> BatchToSpaceND -> BiasAdd
+//
+//   SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> Pad -> BatchToSpaceND ->
+//   BiasAdd
+//
+//   SpaceToBatchND -> Expand -> Conv2D -> Squeeze -> BiasAdd -> BatchToSpaceND
+//
+//   SpaceToBatchND -> Conv2D -> Pad -> BatchToSpaceND -> BiasAdd
+//
+//   SpaceToBatchND -> Conv2D -> BatchToSpaceND -> BiasAdd
+//
+//
+// The Expand/Squeeze combination is used to adapt a 3D array (such as in
+// WaveNet) to the 4D arrays that Conv2D requires. Padding and BiasAdd are
+// thrown in just for the extra headache. Padding adapts non-conforming input
+// sizes, and can be discarded. The bias is necessary, so is kept.
+
+bool IdentifyDilatedConv::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* stb_op = it->get();
+
+  // 1. IDENTIFY OPERATORS
+  // ***************************************************************************
+  // SpaceToBatch Op.
+  if (stb_op->type != OperatorType::kSpaceToBatchND) {
+    return false;
+  }
+  if (stb_op->inputs.size() != 3) {
+    return false;
+  }
+  CHECK_EQ(stb_op->outputs.size(), 1);
+  // Extract the dilation factor from Input[1] of SpaceToBatch
+  // TODO(mjmatthews): Support 2D dilation factors.
+  const auto& block_shape_array = model->GetArray(stb_op->inputs[1]);
+  if (!block_shape_array.buffer) {
+    return false;
+  }
+  CHECK_EQ(block_shape_array.shape().dimensions_count(), 1);
+  int dilation_factor =
+      block_shape_array.Array::GetBuffer<ArrayDataType::kInt32>().data[0];
+
+  // Expand Op
+  auto* post_stb_op = GetOpWithInput(*model, stb_op->outputs[0]);
+  if (!post_stb_op) {
+    return false;
+  }
+  bool has_expand_op = false;
+  if (post_stb_op->type == OperatorType::kExpandDims) {
+    has_expand_op = true;
+    CHECK_EQ(post_stb_op->inputs.size(), 2);
+    CHECK_EQ(post_stb_op->outputs.size(), 1);
+  }
+
+  // Conv Op
+  ConvOperator* conv_op = dynamic_cast<ConvOperator*>(
+      has_expand_op ? GetOpWithInput(*model, post_stb_op->outputs[0])
+                    : GetOpWithInput(*model, stb_op->outputs[0]));
+  if (!conv_op || conv_op->type != OperatorType::kConv) {
+    return false;
+  }
+  if (conv_op->inputs.size() != 2) {
+    // The conv op must only have weights, no bias.
+    return false;
+  }
+  CHECK_EQ(conv_op->outputs.size(), 1);
+
+  // Squeeze Op
+  auto* post_conv_op = GetOpWithInput(*model, conv_op->outputs[0]);
+  if (!post_conv_op) {
+    return false;
+  }
+  if (has_expand_op) {
+    if (post_conv_op->type != OperatorType::kSqueeze) {
+      // If an expand op was used, the post-conv op must be a squeeze op
+      return false;
+    }
+    CHECK_EQ(post_conv_op->inputs.size(), 1);
+    CHECK_EQ(post_conv_op->outputs.size(), 1);
+  }
+
+  // Pad Op
+  const auto* pad_op = has_expand_op
+                           ? GetOpWithInput(*model, post_conv_op->outputs[0])
+                           : GetOpWithInput(*model, conv_op->outputs[0]);
+  bool has_pad_op = false;
+  if (pad_op->type == OperatorType::kPad) {
+    has_pad_op = true;
+    CHECK_EQ(pad_op->inputs.size(), 2);
+    CHECK_EQ(pad_op->outputs.size(), 1);
+  }
+  // TODO(mjmatthews): Perform validity checking on padding dimensions.
+
+  // Pre-BatchToSpace Bias Op
+  auto* next_op = has_pad_op
+                      ? GetOpWithInput(*model, pad_op->outputs[0])
+                      : has_expand_op
+                            ? GetOpWithInput(*model, post_conv_op->outputs[0])
+                            : GetOpWithInput(*model, conv_op->outputs[0]);
+  bool has_bias_before_bts = false;
+  if (next_op->type == OperatorType::kAdd) {
+    has_bias_before_bts = true;
+  }
+  auto final_op = GetOpWithInput(*model, next_op->outputs[0]);
+
+  // BatchToSpace Op
+  const auto* bts_op = has_bias_before_bts ? final_op : next_op;
+  if (bts_op->type != OperatorType::kBatchToSpaceND) {
+    return false;
+  }
+  CHECK_EQ(bts_op->inputs.size(), 3);
+  CHECK_EQ(bts_op->outputs.size(), 1);
+
+  // Post-BatchToSpace Bias Op
+  Operator* bias_add_op = !has_bias_before_bts ? final_op : next_op;
+  if (bias_add_op->type != OperatorType::kAdd) {
+    // Bias op is required before or after BatchToSpace
+    return false;
+  }
+  CHECK_EQ(bias_add_op->inputs.size(), 2);
+  CHECK_EQ(bias_add_op->outputs.size(), 1);
+
+  LOG(INFO) << "Identified sub-network emulating dilated convolution.";
+
+  // 2. RE-WIRE OPERATORS
+  // ***************************************************************************
+  // Re-use the existing Conv2D op.
+  conv_op->dilation_width_factor = dilation_factor;
+  conv_op->dilation_height_factor = dilation_factor;
+  conv_op->padding.type = PaddingType::kSame;
+
+  // Rewire the ops to bypass SpaceToBatch, BatchToSpace, and Pad.
+  bias_add_op->outputs[0] = final_op->outputs[0];
+  if (has_expand_op) {
+    bias_add_op->inputs[0] = post_conv_op->outputs[0];
+    post_conv_op->inputs[0] = conv_op->outputs[0];
+    conv_op->inputs[0] = post_stb_op->outputs[0];
+    post_stb_op->inputs[0] = stb_op->inputs[0];
+  } else {
+    bias_add_op->inputs[0] = conv_op->outputs[0];
+    conv_op->inputs[0] = stb_op->inputs[0];
+  }
+  // TODO(mjmatthews): Connect bias directly into the Conv2D?
+
+  // 3. DELETE LEFTOVER OPERATORS
+  // ***************************************************************************
+  // Order is important. Delete the output array first, then the op, then it's
+  // redundant inputs.
+  // BatchToSpace Op
+  DeleteArrayIfUnused(bts_op->outputs[0], model);
+  std::vector<string> bts_op_inputs = bts_op->inputs;
+  model->operators.erase(FindOp(*model, bts_op));
+  DeleteArrayIfUnused(bts_op_inputs[1], model);
+  DeleteArrayIfUnused(bts_op_inputs[2], model);
+
+  // Pad Op if present
+  if (has_pad_op) {
+    DeleteArrayIfUnused(pad_op->outputs[0], model);
+    std::vector<string> pad_op_inputs = pad_op->inputs;
+    model->operators.erase(FindOp(*model, pad_op));
+    DeleteArrayIfUnused(pad_op_inputs[1], model);
+  }
+
+  // SpaceToBatch Op
+  DeleteArrayIfUnused(stb_op->outputs[0], model);
+  std::vector<string> stb_op_inputs = stb_op->inputs;
+  model->operators.erase(FindOp(*model, stb_op));
+  DeleteArrayIfUnused(stb_op_inputs[1], model);
+  DeleteArrayIfUnused(stb_op_inputs[2], model);
+
+  LOG(INFO) << "Replaced with Dilated Conv2D op outputting \""
+            << conv_op->outputs[0] << "\".";
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index c363b93394f0af7bcfc37c1e8be5f98aca6667ae..e9842524c829b839b97b3453a36c41efe186efbb 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -306,6 +306,12 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
+  if (static_cast<FullyConnectedOperator*>(fully_connected)
+          ->experimental_shuffled_weights) {
+    // Not yet implemented: experimental shuffled weights in fused LSTM cell.
+    return false;
+  }
+
   // Emplace a new LSTM cell operator
   auto* lstm_cell_op = new LstmCellOperator;
   lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30be4ac0aa5e9f639bbf0630e142c2806faa3260
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+// This transformation rule tries to identify the PRelu structure generated by
+// Keras, and convert it to a single op.
+//
+// The formula of PReLU is:
+// f(x) = alpha * x for x < 0, f(x) = x for x >= 0.
+//
+// `x` is the input, and `alpha` is a trainable tensor which can be broadcasted
+// to the shape of `x`.
+//
+// There's no native PRelu op in TensorFlow, so Keras generates the following
+// structure which does the equivalent calculation:
+// f(x) = Relu(x) + (-alpha * Relu(-x))
+//
+// Practically, alpha is always a constant in the inference graph, and Toco have
+// other graph transformations which fold the activation functions to other ops.
+// Therefore, we're looking for the structure:
+//
+// f(x) = Relu(x) + (negative_alpha * Neg(x, activation=Relu))
+
+namespace toco {
+
+bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
+  const auto add_op_it = model->operators.begin() + op_index;
+  const auto* add_op = add_op_it->get();
+  if (add_op == nullptr || add_op->type != OperatorType::kAdd ||
+      add_op->inputs.size() != 2 ||
+      add_op->fused_activation_function != FusedActivationFunctionType::kNone) {
+    return false;
+  }
+
+  const auto* relu_input_op = GetOpWithOutput(*model, add_op->inputs[0]);
+  if (relu_input_op == nullptr || relu_input_op->type != OperatorType::kRelu ||
+      relu_input_op->inputs.size() != 1 ||
+      relu_input_op->fused_activation_function !=
+          FusedActivationFunctionType::kNone) {
+    return false;
+  }
+
+  // TODO(ycling): Both Add and Mul are commutative. Support the case where
+  // the position of operands are exchanged.
+  const auto* mul_op = GetOpWithOutput(*model, add_op->inputs[1]);
+  if (mul_op == nullptr || mul_op->type != OperatorType::kMul ||
+      mul_op->inputs.size() != 2 ||
+      mul_op->fused_activation_function != FusedActivationFunctionType::kNone) {
+    return false;
+  }
+
+  const auto neg_alpha_tensor_name = mul_op->inputs[0];
+
+  const auto* relu_neg_input_op = GetOpWithOutput(*model, mul_op->inputs[1]);
+
+  if (relu_neg_input_op == nullptr ||
+      relu_neg_input_op->type != OperatorType::kNeg ||
+      relu_neg_input_op->fused_activation_function !=
+          FusedActivationFunctionType::kRelu ||
+      relu_neg_input_op->inputs.size() != 1) {
+    return false;
+  }
+
+  if (relu_input_op->inputs[0] != relu_neg_input_op->inputs[0]) {
+    return false;
+  }
+
+  const auto input_tensor_name = relu_input_op->inputs[0];
+  const auto output_tensor_name = add_op->outputs[0];
+
+  // Construct a tensor for positive alpha (double negative).
+  const auto alpha_tensor_name =
+      AvailableArrayName(*model, neg_alpha_tensor_name + "_neg");
+  model->GetOrCreateArray(alpha_tensor_name);
+
+  auto* neg_neg_alpha_op = new NegOperator;
+  neg_neg_alpha_op->inputs = {neg_alpha_tensor_name};
+  neg_neg_alpha_op->outputs = {alpha_tensor_name};
+  model->operators.emplace(add_op_it, neg_neg_alpha_op);
+
+  auto* prelu_op = new PReluOperator;
+  prelu_op->inputs = {input_tensor_name, alpha_tensor_name};
+  prelu_op->outputs = {output_tensor_name};
+  model->operators.emplace(add_op_it, prelu_op);
+  AddMessageF("Creating %s replacing equivalent subgraph", LogName(*prelu_op));
+
+  DeleteArrayIfUsedOnce(neg_alpha_tensor_name, model);
+  DeleteArrayIfUsedOnce(add_op->inputs[0], model);
+  DeleteArrayIfUsedOnce(add_op->inputs[1], model);
+  DeleteArrayIfUsedOnce(mul_op->inputs[1], model);
+  // Remove the existing Add op that outputs the final result. If the other
+  // intermediate tensors aren't used by other ops, those will be removed by
+  // other graph transformation rules.
+  model->operators.erase(FindOp(*model, add_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
index 881c2d4dc892625d4640cac867a2f49c24b638f5..4a9974ed4e0ebec4381b86798156f4f51bb154a0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -100,3 +103,5 @@ bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
                          string* rnn_array);
 
 }  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
index d83603e9a2c59ae74a5e5fda5b11178740336bfb..45d9f73a1e6416b8f3fe3936c740da637961b7fc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -78,15 +79,21 @@ bool AddDequantizeOperatorToInput(const string& input_name, const Operator* op,
   image_input_op->outputs = {dequantized_input_name};
   model->operators.emplace(model->operators.begin(), image_input_op);
 
-  CHECK(input_array.final_data_type == ArrayDataType::kUint8);
-  input_array.data_type = ArrayDataType::kUint8;
   dequantized_input_array.data_type = ArrayDataType::kFloat;
   const auto& input_minmax = input_array.GetMinMax();
   auto& dequantized_input_minmax = dequantized_input_array.GetOrCreateMinMax();
   dequantized_input_minmax = input_minmax;
   auto& input_qparams = input_array.GetOrCreateQuantizationParams();
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
-      model->flags, input_minmax, &input_qparams);
+  input_array.data_type = input_array.final_data_type;
+  if (input_array.data_type == ArrayDataType::kUint8) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
+                                                           &input_qparams);
+  } else if (input_array.data_type == ArrayDataType::kInt16) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(input_minmax,
+                                                           &input_qparams);
+  } else {
+    LOG(FATAL) << "unhandled data type";
+  }
 
   transformation->AddMessageF(
       "Created %s"
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5065004093434475172a39efdcfd26c10c49148b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
@@ -0,0 +1,190 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    // Arrays are missing.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Returns whether the reshape could be a transpose.
+std::vector<int32> ReshapeToTranspose(const Model& model,
+                                      const TensorFlowReshapeOperator* op) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  std::vector<int> one_indices;
+  std::vector<int> not_one_indices;
+
+  // Separate into one indices and not one indices.
+  for (int i = 0; i < in_shape.size(); i++) {
+    if (in_shape[i] == 1) {
+      one_indices.push_back(i);
+    } else {
+      not_one_indices.push_back(i);
+    }
+  }
+
+  // Reorder the vertices.
+  std::vector<int> perm;
+  perm.reserve(in_shape.size());
+  int one_index = 0;
+  int not_one_index = 0;
+  for (const auto val : out_shape) {
+    if (val == 1) {
+      perm.push_back(one_indices[one_index]);
+      one_index++;
+    } else {
+      perm.push_back(not_one_indices[not_one_index]);
+      not_one_index++;
+    }
+  }
+
+  return perm;
+}
+
+}  // namespace
+
+// When a transpose is fed into a reshape, it is possible for the two operators
+// to be merged if the reshape does not affect memory ordering and does not
+// affects the number of dimensions. This only occurs when only unary dimensions
+// are shifting position.
+bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
+                                             std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* reshape_op = ConvertOperator<TensorFlowReshapeOperator*>(
+      it->get(), OperatorType::kTensorFlowReshape);
+
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  const string intermediate_name = reshape_op->inputs[0];
+  const string output_name = reshape_op->outputs[0];
+
+  // Guarantee the input is only consume by the reshape.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    return false;
+  }
+
+  // Check for the parent operator.
+  const auto& transpose_it = FindOpWithOutput(*model, intermediate_name);
+  if (transpose_it == model->operators.end()) {
+    return false;
+  }
+
+  // Find the parent operator and guarantee it is a transpose.
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    return false;
+  }
+
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      false /*allow_extra_unary_dimensions*/)) {
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot fuse %s and %s as it would invalidate the transpose "
+        "output array.",
+        LogName(*transpose_op), LogName(*reshape_op));
+    return false;
+  }
+
+  AddMessageF("Merging operations %s and %s", LogName(*transpose_op),
+              LogName(*reshape_op));
+
+  // const auto& intermediate_array = model->GetArray(intermediate_name);
+  // const auto& output_array = model->GetArray(output_name);
+
+  auto merged_perm = ReshapeToTranspose(*model, reshape_op);
+
+  // Combine the permutations.
+  const auto& transpose_perm = transpose_op->perm;
+  for (int i = 0; i < merged_perm.size(); i++) {
+    merged_perm[i] = transpose_perm[merged_perm[i]];
+  }
+
+  // Remove the reshape as passthrough operation.
+  if (!RemoveTrivialPassthroughOp(this, model, op_index)) {
+    return false;
+  }
+
+  // Update transpose_op's constant buffer to contain the new permutation.
+  model->GetArray(transpose_op->inputs[1])
+      .GetMutableBuffer<ArrayDataType::kInt32>()
+      .data = merged_perm;
+  transpose_op->perm = merged_perm;
+
+  // transpose_ops's shape will likely has changed.
+  model->GetArray(transpose_op->outputs[0]).clear_shape();
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf17c49b1098d02468935aa72d1d1e73b4addbe1
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool PropagateActivationFunctionIntoConstants::Run(Model* model,
+                                                   std::size_t op_index) {
+  const auto ac_it = model->operators.begin() + op_index;
+  const auto* ac_op = ac_it->get();
+  if (ac_op->type != OperatorType::kRelu6 &&
+      ac_op->type != OperatorType::kRelu1 &&
+      ac_op->type != OperatorType::kRelu) {
+    return false;
+  }
+
+  // Find the op producing the array passed to this activation function.
+  auto* src_op = GetOpWithOutput(*model, ac_op->inputs[0]);
+  if (!src_op) {
+    return false;
+  }
+
+  // Ensure the src_op is not used without the activation function applied.
+  if (CountTrueOutputs(*model, *src_op) > 1) {
+    AddMessageF(
+        "Not propagating activation function %s into %s because it has more "
+        "than one consumed output",
+        LogName(*ac_op), LogName(*src_op));
+  }
+
+  // Filter to the list of supported ops.
+  string src_op_input;
+  switch (src_op->type) {
+    case OperatorType::kGather:
+      src_op_input = src_op->inputs[0];
+      break;
+    default:
+      return false;
+  }
+  CHECK_EQ(src_op->outputs[0], ac_op->inputs[0]);
+
+  // Ensure the input is constant as otherwise this needs to happen at runtime.
+  // If we bail here, it's still possible that FuseActivationFunctions will fuse
+  // the activation if it's supported by the op.
+  if (!IsConstantParameterArray(*model, src_op_input)) {
+    AddMessageF(
+        "Not propagating activation function %s into %s:%s because it is not "
+        "constant",
+        LogName(*ac_op), LogName(*src_op), src_op_input);
+    return false;
+  }
+
+  // Get the array we'll be working with and ensure it's a compatible type.
+  auto& const_array = model->GetArray(src_op_input);
+  if (const_array.data_type != ArrayDataType::kFloat) {
+    AddMessageF(
+        "Not propagating activation function %s into %s:%s because it is "
+        "non-float data",
+        LogName(*ac_op), LogName(*src_op), src_op_input);
+    return false;
+  }
+  auto& const_array_data =
+      const_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+
+  // Perform the activation function directly into the constant data array.
+  for (size_t i = 0; i < const_array_data.size(); ++i) {
+    const float value = const_array_data[i];
+    float new_value = value;
+    switch (ac_op->type) {
+      case OperatorType::kRelu: {
+        static constexpr float kLower = 0;
+        new_value = value < kLower ? kLower : value;
+        break;
+      }
+      case OperatorType::kRelu1: {
+        static constexpr float kUpper = 1;
+        static constexpr float kLower = -1;
+        new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
+        break;
+      }
+      case OperatorType::kRelu6: {
+        static constexpr float kUpper = 6;
+        static constexpr float kLower = 0;
+        new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unsupported activation function " << LogName(*ac_op);
+        return false;
+    }
+    const_array_data[i] = new_value;
+  }
+
+  AddMessageF("Propagated activation function %s into %s:%s", LogName(*ac_op),
+              LogName(*src_op), src_op_input);
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index f0d107232b4517115aa3f64b39b825dbaffb83ce..c1cf79f62614c44606daaf9294d0822c50019f92 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -50,70 +50,117 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     old_output_data_types[output] = model->GetArray(output).data_type;
   }
   // Do the actual output data types propagation.
-  if (op->type == OperatorType::kDequantize ||
-      op->type == OperatorType::kResizeBilinear) {
-    // These operators unconditionally produce float outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
-  } else if (op->type == OperatorType::kTensorFlowLess ||
-             op->type == OperatorType::kTensorFlowLessEqual ||
-             op->type == OperatorType::kTensorFlowGreater ||
-             op->type == OperatorType::kTensorFlowGreaterEqual) {
-    // These operators unconditionally produce bool outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
-  } else if (op->type == OperatorType::kRank ||
-             op->type == OperatorType::kTensorFlowShape) {
-    // These operators only produce int32 outputs.
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
-  } else if (op->type == OperatorType::kTensorFlowSplit ||
-             op->type == OperatorType::kTensorFlowConcat ||
-             op->type == OperatorType::kFill) {
-    // These operators produce an output with the same type as their 2nd input
-    CHECK_GE(op->inputs.size(), 2);
-    const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kCast) {
-    // Data type of the Cast op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* cast_op = static_cast<CastOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
-  } else if (op->type == OperatorType::kArgMax) {
-    // Data type of the ArgMax op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* argmax_op = static_cast<ArgMaxOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
-  } else if (op->type == OperatorType::kRange) {
-    auto* range_op = static_cast<RangeOperator*>(op);
-    // Output type of the Range op can be set via an attribute
-    ArrayDataType data_type;
-    if (range_op->dtype != ArrayDataType::kNone) {
-      // Use the type if specified
-      data_type = range_op->dtype;
-    } else {
-      // Otherwise use the first input
-      CHECK_GE(op->inputs.size(), 1);
-      data_type = model->GetArray(op->inputs[0]).data_type;
+  switch (op->type) {
+    case OperatorType::kDequantize:
+    case OperatorType::kResizeBilinear:
+      // These operators unconditionally produce float outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
+      break;
+    case OperatorType::kTensorFlowLess:
+    case OperatorType::kTensorFlowLessEqual:
+    case OperatorType::kTensorFlowGreater:
+    case OperatorType::kTensorFlowGreaterEqual:
+      // These operators unconditionally produce bool outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
+      break;
+    case OperatorType::kRank:
+    case OperatorType::kTensorFlowShape:
+      // These operators only produce int32 outputs.
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
+      break;
+    case OperatorType::kTensorFlowSplit:
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kFill: {
+      // These operators produce an output with the same type as their 2nd input
+      CHECK_GE(op->inputs.size(), 2);
+      const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-    CHECK_EQ(op->outputs.size(), 1);
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTensorFlowUnsupported) {
-    auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
-    if (unsupported_op->output_data_types.size() != op->outputs.size()) {
+    case OperatorType::kTransposeConv: {
+      // These operators produce an output with the same type as their 3rd input
+      CHECK_GE(op->inputs.size(), 3);
+      const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kCast: {
+      // Data type of the Cast op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* cast_op = static_cast<CastOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
+      break;
+    }
+    case OperatorType::kArgMax: {
+      // Data type of the ArgMax op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* argmax_op = static_cast<ArgMaxOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
+      break;
+    }
+    case OperatorType::kRange: {
+      auto* range_op = static_cast<RangeOperator*>(op);
+      // Output type of the Range op can be set via an attribute
+      ArrayDataType data_type;
+      if (range_op->dtype != ArrayDataType::kNone) {
+        // Use the type if specified
+        data_type = range_op->dtype;
+      } else {
+        // Otherwise use the first input
+        CHECK_GE(op->inputs.size(), 1);
+        data_type = model->GetArray(op->inputs[0]).data_type;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kRandomUniform: {
+      auto* rand_op = static_cast<RandomUniformOperator*>(op);
+      // The output type of RandomUniform is specified with an attribute
+      if (rand_op->dtype == ArrayDataType::kNone) {
+        return false;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, rand_op->dtype);
+      break;
+    }
+    case OperatorType::kTopK_V2: {
+      // topk(values: T, k: int32) -> values: T, indices: int32
+      CHECK_EQ(op->inputs.size(), 2);
+      CHECK_EQ(op->outputs.size(), 2);
+      CHECK(model->GetArray(op->inputs[1]).data_type == ArrayDataType::kInt32);
+      model->GetArray(op->outputs[0]).data_type = model->GetArray(op->inputs[0]).data_type;
+      model->GetArray(op->outputs[1]).data_type = ArrayDataType ::kInt32;
+      break;
+    }
+    case OperatorType::kTensorFlowUnsupported: {
+      auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
+      // Some output tensors from the op could be eliminated by optimization.
+      // This can make unsupported_op->output_data_types have more elements than
+      // op->outputs.
+      if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+        return false;
+      }
+      for (int i = 0; i < op->outputs.size(); ++i) {
+        auto output = op->outputs[i];
+        auto data_type = unsupported_op->output_data_types[i];
+        model->GetArray(output).data_type = data_type;
+      }
+      break;
+    }
+    case OperatorType::kExpandDims: {
+      // Yield on ExpandDim until it is converted to Reshape
       return false;
     }
-    for (int i = 0; i < unsupported_op->output_data_types.size(); ++i) {
-      auto output = op->outputs[i];
-      auto data_type = unsupported_op->output_data_types[i];
-      model->GetArray(output).data_type = data_type;
+    default: {
+      // These operators produce outputs with the same type as their 1st input
+      CHECK_GT(op->inputs.size(), 0);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-  } else if (op->type == OperatorType::kExpandDims) {
-    // Yield on ExpandDim until it is converted to Reshape
-    return false;
-  } else {
-    // These operators produce outputs with the same type as their 1st input
-    CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
   }
+
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
     if (old_output_data_types[output] != model->GetArray(output).data_type) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50b90e7c2bfddb0382a4d44ad6c90fc7f7701273
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Propagates default min/max values to any operator input/output array that
+// is missing them.
+//
+// When provided a set of min/max values for uint8 arrays this will rescale
+// the values for other data types as required and preserving the floating point
+// range within the new type.
+bool PropagateDefaultMinMax::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* op = it->get();
+
+  bool did_change = false;
+
+  for (const auto& input : op->inputs) {
+    auto& input_array = model->GetArray(input);
+    if (!input_array.minmax && !input_array.buffer) {
+      did_change |= SetArrayMinMax(input, &input_array);
+    }
+  }
+
+  for (const auto& output : op->outputs) {
+    auto& output_array = model->GetArray(output);
+    if (!output_array.minmax && !output_array.buffer) {
+      did_change |= SetArrayMinMax(output, &output_array);
+    }
+  }
+
+  return did_change;
+}
+
+// Sets the min/max on the given array, adjusting the reference_minmax for the
+// final data type of the array if it is already specified.
+bool PropagateDefaultMinMax::SetArrayMinMax(const string& array_name,
+                                            Array* array) {
+  CHECK(!array->minmax);
+
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(*array, ArrayDataType::kUint8);
+  for (const auto& type_range : type_ranges_) {
+    if (type_range.first == quantized_data_type) {
+      array->GetOrCreateMinMax() = type_range.second;
+      break;
+    }
+  }
+  if (!array->minmax) {
+    AddMessageF(
+        "No defaults specified for quantized data type %s of array %s, "
+        "skipping",
+        ArrayDataTypeName(quantized_data_type), array_name);
+    return false;
+  }
+
+  AddMessageF("Adding default minmax %g,%g to array %s when quantized as %s",
+              array->GetMinMax().min, array->GetMinMax().max, array_name,
+              ArrayDataTypeName(quantized_data_type));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0bce183c1897dfba6f2c393ffc0306c054366725
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
@@ -0,0 +1,307 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void ChangeArrayDataType(GraphTransformation* transformation, Array* array,
+                         ArrayDataType new_data_type,
+                         const MinMax* new_minmax) {
+  // Ensure the array ends up in the new type (if it hasn't yet been quantized).
+  array->final_data_type = new_data_type;
+
+  if (array->minmax && array->quantization_params) {
+    // The array is already quantized and has min/max info.
+    // As we are changing the data type we need to fix up the existing min/max
+    // to the new data type range.
+
+    double old_quantized_min, old_quantized_max;
+    CHECK(GetQuantizedDataTypeNumericalRange(
+        array->data_type, &old_quantized_min, &old_quantized_max))
+        << "Existing data type is not quantized: "
+        << ArrayDataTypeName(array->data_type);
+    double new_quantized_min, new_quantized_max;
+    CHECK(GetQuantizedDataTypeNumericalRange(new_data_type, &new_quantized_min,
+                                             &new_quantized_max))
+        << "New data type is not quantized: "
+        << ArrayDataTypeName(new_data_type);
+
+    // Compute new minmax values.
+    double min = (old_quantized_min - array->quantization_params->zero_point) *
+                 array->quantization_params->scale;
+    double max =
+        (old_quantized_max + 1 - array->quantization_params->zero_point) *
+        array->quantization_params->scale;
+    max = max - 1.0 / (new_quantized_max + 1);
+
+    auto& array_minmax = array->GetOrCreateMinMax();
+    transformation->AddMessageF(
+        "Rescaling min/max from %g,%g (%s) to %g,%g (%s)", array_minmax.min,
+        array_minmax.max, ArrayDataTypeName(array->data_type), min, max,
+        ArrayDataTypeName(new_data_type));
+
+    array_minmax.min = min;
+    array_minmax.max = max;
+    GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+        array_minmax, array->quantization_params.get());
+
+    // Directly change the type as the array was already quantized.
+    array->data_type = new_data_type;
+  } else {
+    // Array has not yet been quantized so we can just set the final data type
+    // and assign the new min/max value (if provided).
+    CHECK(!array->quantization_params);
+
+    if (!array->minmax && new_minmax) {
+      transformation->AddMessageF("Forcing new minmax to %g,%g (%s)",
+                                  new_minmax->min, new_minmax->max,
+                                  ArrayDataTypeName(new_data_type));
+      auto& array_minmax = array->GetOrCreateMinMax();
+      array_minmax.min = new_minmax->min;
+      array_minmax.max = new_minmax->max;
+    }
+  }
+}
+
+// Returns true if the op blocks our backward recursive data type propagation.
+bool DoesOpBlockBackwardPropagation(const Operator& op) {
+  switch (op.type) {
+    case OperatorType::kConcatenation:
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kTensorFlowConcatV2:
+      // Concat shouldn't block propagation, but we do expect that all inputs
+      // have the same range.
+      return false;
+    case OperatorType::kDequantize:
+      // Dequantize ops are inserted between the value we care about and the
+      // FakeQuant so make sure we move across them.
+    case OperatorType::kGather:
+      // Gathers need their parameters changed to the appropriate data type.
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      // Reshapes and transposes don't change values.
+      return false;
+    default:
+      return true;
+  }
+}
+
+// Returns true if the input of an op blocks our backward recursive data type
+// propagation.
+bool DoesOpInputBlockBackwardPropagation(const Operator& op, int input_index) {
+  switch (op.type) {
+    case OperatorType::kGather:
+      // Ignore gather indices.
+      return input_index != 0;
+      break;
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      // Ignore reshape/transpose shapes/dimensions.
+      return input_index != 0;
+    default:
+      return false;
+  }
+}
+
+// Propagates the data type up into the input arrays if they are model inputs
+// that may need their type changed. May act recursively if the inputs are
+// produced by ops that we can move over (such as Dequantize).
+bool RecursivelyBackwardPropagateDataType(GraphTransformation* transformation,
+                                          Model* model, Operator* op,
+                                          ArrayDataType new_data_type,
+                                          const MinMax& new_minmax) {
+  bool did_change = false;
+  for (int input_index = 0; input_index < op->inputs.size(); ++input_index) {
+    const auto& input = op->inputs[input_index];
+    auto& input_array = model->GetArray(input);
+    if (input_array.final_data_type == new_data_type) {
+      // Final data type is already - skip.
+      continue;
+    }
+
+    // Prevent moving into constant param args that we don't want to modify.
+    if (DoesOpInputBlockBackwardPropagation(*op, input_index)) {
+      continue;
+    }
+
+    if (input_array.final_data_type != new_data_type) {
+      transformation->AddMessageF(
+          "Adjusting input final data type of array %s from %s to %s", input,
+          ArrayDataTypeName(input_array.final_data_type),
+          ArrayDataTypeName(new_data_type));
+      did_change = true;
+      ChangeArrayDataType(transformation, &input_array, new_data_type,
+                          &new_minmax);
+
+      // Walk up into all ops producing the inputs to this op.
+      for (auto& producing_op : model->operators) {
+        if (!DoesOpBlockBackwardPropagation(*producing_op)) {
+          for (const auto& output : producing_op->outputs) {
+            if (input == output) {
+              did_change |= RecursivelyBackwardPropagateDataType(
+                  transformation, model, producing_op.get(), new_data_type,
+                  new_minmax);
+            }
+          }
+        }
+      }
+    }
+  }
+  return did_change;
+}
+
+// Returns true if the op blocks our forward recursive data type propagation.
+bool DoesOpBlockForwardPropagation(const Operator& op) {
+  switch (op.type) {
+    case OperatorType::kFakeQuant:
+      // Always stop at another FakeQuant, as it will likely have different
+      // parameters.
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Recurses down the graph setting the data type of all arrays until an operator
+// that blocks propagation (like another FakeQuant) or a final_data_type is
+// already specified.
+bool RecursivelyForwardPropagateDataType(GraphTransformation* transformation,
+                                         Model* model, Operator* op,
+                                         ArrayDataType new_data_type) {
+  bool did_change = false;
+  for (const auto& output : op->outputs) {
+    auto& output_array = model->GetArray(output);
+    if (output_array.final_data_type == new_data_type) {
+      // Final data type is already - skip.
+      continue;
+    }
+
+    if (output_array.final_data_type == ArrayDataType::kNone ||
+        output_array.final_data_type != new_data_type) {
+      transformation->AddMessageF(
+          "Adjusting output final data type of array %s from %s to %s", output,
+          ArrayDataTypeName(output_array.final_data_type),
+          ArrayDataTypeName(new_data_type));
+      did_change = true;
+      ChangeArrayDataType(transformation, &output_array, new_data_type,
+                          nullptr);
+
+      // Walk down into all ops consuming the output of this op.
+      for (auto& consuming_op : model->operators) {
+        if (!DoesOpBlockForwardPropagation(*consuming_op)) {
+          for (const auto& input : consuming_op->inputs) {
+            if (input == output) {
+              did_change |= RecursivelyForwardPropagateDataType(
+                  transformation, model, consuming_op.get(), new_data_type);
+            }
+          }
+        }
+      }
+    }
+  }
+  return did_change;
+}
+
+}  // namespace
+
+// Propagates the num_bits on a FakeQuant operator into the final data types
+// of inputs and outputs. For example, if FakeQuant.num_bits==16 then we know
+// the output must be int16 and assume all inputs up until the preceding op are
+// also 16.
+//
+// This can be thought of as a bidirectional flood-fill of the num_bits implied
+// final_data_type that terminates at other FakeQuant ops (and a few others as
+// determined by DoesOpBlockBackwardPropagation/DoesOpBlockForwardPropagation).
+// Once all FakeQuant ops have been visted the arrays should all have
+// appropriate final_data_types if the source graph was annotated with the
+// proper FakeQuant ops.
+//
+// Annotating a graph requires following a few hard rules:
+// - every input MUST have a FakeQuant immediately following it
+// - every output MUST have a FakeQuant immediately preceding it
+// - important arithmetic ops (such as FullyConnected) SHOULD have a FakeQuant
+//   immediately following it
+// - all trained weights (RHS of FullyConnected ops, params on Gather ops, etc)
+//   MUST have FakeQuants between them and the consuming op
+// Additional FakeQuants may be used if desired, especially in areas that may
+// suffer from large precision changes - such as between a Softmax and a
+// FullyConnected. Only by validating accuracy differences between float
+// inference with the FakeQuant ops simulating quantization and the actually
+// quantized graph can you be sure the appropriate FakeQuant ops are present.
+//
+// You can tell if you're missing some FakeQuants by looking for warnings from
+// quantize.cc about minmax ranges being determined by the contents of constant
+// arrays. This will almost never produce functional models during inference.
+//
+// As this op may change the data types and ranges of input and output arrays
+// downstream tools must also be sure to parse the output model flags to get the
+// post-Transform values that may have changed due to this transformation.
+//
+// This isn't a GraphTransformation in the traditional respect as it affects ops
+// outside of the one under transformation. This is primarily so that we can
+// utilize the graph traversal and repeated pass system underlying the
+// transformation system to exhaustively find all FakeQuant ops. It also gets us
+// nice logging and integration with the graphviz video dumping mode.
+// In general you should not copy this style of transformation and stick to
+// local-only changes as seen in the other transformations.
+bool PropagateFakeQuantNumBits::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if (op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(op);
+
+  ArrayDataType quantized_data_type = ArrayDataType::kNone;
+  if (!InferQuantizedDataTypeFromFakeQuant(*fakequant_op,
+                                           &quantized_data_type)) {
+    AddMessageF("FakeQuant op %s num_bits=%d is out of range, ignoring",
+                LogName(*op), fakequant_op->num_bits);
+    return false;
+  }
+  const auto& final_minmax = *fakequant_op->minmax;
+
+  AddMessageF(
+      "Beginning propagation of fake quant %s num_bits=%d min=%g max=%g to %s",
+      LogName(*op), fakequant_op->num_bits, final_minmax.min, final_minmax.max,
+      ArrayDataTypeName(quantized_data_type));
+
+  bool did_change = false;
+
+  // Propagate the FakeQuant information backward up the graph.
+  // This will possibly adjust input arrays or constant types (like Gather).
+  did_change |= RecursivelyBackwardPropagateDataType(
+      this, model, op, quantized_data_type, final_minmax);
+
+  // Propagate the FakeQuant information forward down the graph.
+  // This will possibly adjust output arrays.
+  did_change |=
+      RecursivelyForwardPropagateDataType(this, model, op, quantized_data_type);
+
+  return did_change;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 0cf0994b43bb048616bab1abe79db1aae2223d37..4923f83d91defb77655728e97eddccd9595225f6 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_join.h"
+#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -31,17 +32,32 @@ namespace {
 
 void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
                       int kheight, int stride_width, int stride_height,
+                      int dilation_width_factor, int dilation_height_factor,
                       PaddingType padding_type, Shape* output_shape,
                       FixedPadding* fixed_padding) {
   const int input_width = input_shape.dims(2);
   const int input_height = input_shape.dims(1);
   const int batch = input_shape.dims(0);
 
+  CHECK_GE(input_width, 1);
+  CHECK_GE(input_height, 1);
+  CHECK_GE(batch, 1);
+  CHECK_GE(kwidth, 1);
+  CHECK_GE(kheight, 1);
+  CHECK_GE(stride_width, 1);
+  CHECK_GE(stride_height, 1);
+  CHECK_GE(dilation_width_factor, 1);
+  CHECK_GE(dilation_height_factor, 1);
+
+  int dilated_kwidth = dilation_width_factor * (kwidth - 1) + 1;
+  int dilated_kheight = dilation_height_factor * (kheight - 1) + 1;
+
   int output_height = 0;
   int output_width = 0;
   if (padding_type == PaddingType::kValid) {
-    output_height = (input_height + stride_height - kheight) / stride_height;
-    output_width = (input_width + stride_width - kwidth) / stride_width;
+    output_height =
+        (input_height + stride_height - dilated_kheight) / stride_height;
+    output_width = (input_width + stride_width - dilated_kwidth) / stride_width;
   } else if (padding_type == PaddingType::kSame) {
     output_height = (input_height + stride_height - 1) / stride_height;
     output_width = (input_width + stride_width - 1) / stride_width;
@@ -49,10 +65,12 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
     LOG(FATAL) << "Only supporting SAME or VALID padding";
   }
 
-  fixed_padding->height = std::max(
-      0, ((output_height - 1) * stride_height + kheight - input_height) / 2);
+  fixed_padding->height = std::max(0, ((output_height - 1) * stride_height +
+                                       dilated_kheight - input_height) /
+                                          2);
   fixed_padding->width = std::max(
-      0, ((output_width - 1) * stride_width + kwidth - input_width) / 2);
+      0,
+      ((output_width - 1) * stride_width + dilated_kwidth - input_width) / 2);
 
   // Actually had to debug a situation where those were negative due to bad
   // propagation of placeholder -1 sizes in TensorFlowReshape.
@@ -151,7 +169,9 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
     return;
   }
   const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
+  CHECK(input_shape.dimensions_count() == 4)
+      << "Conv ops require 4D inputs. Input array \"" << op->inputs[0]
+      << "\" is " << input_shape.dimensions_count() << "D.";
 
   const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
@@ -166,7 +186,8 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
   const int kheight = weights_shape.dims(1);
   const int kwidth = weights_shape.dims(2);
   ComputeConvSizes(input_shape, output_depth, kwidth, kheight, op->stride_width,
-                   op->stride_height, op->padding.type,
+                   op->stride_height, op->dilation_width_factor,
+                   op->dilation_height_factor, op->padding.type,
                    output_array.mutable_shape(),
                    &op->padding.GetOrCreateFixedPadding());
   CHECK_EQ(output_array.shape().dimensions_count(), 4);
@@ -182,6 +203,90 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
   }
 }
 
+void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
+  // TransposeConv is unique in that it is specifically given the output shape
+  // as a 1D array on it's 1st input. Theoretically then, resolving the output
+  // shape is as easy as waiting for this input to be resolved. However, we also
+  // have to calculate the padding which requires the weights shape. So, we
+  // might as well calculate the output shape and ensure it matches the
+  // specified one
+
+  // Check if we have already run.
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  // SPECIFIED OUTPUT SHAPE
+  // The below is the specified, or prescribed output shape, _given_ to the
+  // operator as an input.
+  auto& specified_output_shape_array =
+      model->GetArray(op->inputs[TransposeConvOperator::OUTPUT_SHAPE]);
+  if (!specified_output_shape_array.has_shape() ||
+      !specified_output_shape_array.buffer) {
+    // Yield until the specified output shape is resolved as a constant
+    return;
+  }
+
+  CHECK(specified_output_shape_array.data_type == ArrayDataType::kInt32)
+      << "TransposeConv input_dims must be int32";
+
+  CHECK(specified_output_shape_array.shape().dimensions_count() == 1 &&
+        specified_output_shape_array.shape().dims(0) == 4)
+      << "TransposeConv requires a 1D, 4 element array on it's 0th input "
+         "specifying the output shape. \""
+      << op->inputs[TransposeConvOperator::OUTPUT_SHAPE] << "\" had shape "
+      << toco::ShapeToString(specified_output_shape_array.shape());
+
+  // COMPUTE PADDING
+  // We require the weights shape to calculate padding.
+  const auto& weights_array =
+      model->GetArray(op->inputs[TransposeConvOperator::WEIGHTS]);
+  if (!weights_array.has_shape()) {
+    // Yield until weights dims have been resolved.
+    return;
+  }
+  const auto& weights_shape = weights_array.shape();
+  CHECK_EQ(weights_shape.dimensions_count(), 4)
+      << "TransposeConv weights must have 4 input dimensions. Input weights \""
+      << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
+      << toco::ShapeToString(weights_shape) << ".";
+
+  // Compute padding
+  const int kheight = weights_shape.dims(1);
+  const int kwidth = weights_shape.dims(2);
+  op->padding.GetOrCreateFixedPadding();
+  if (op->padding.type == PaddingType::kValid) {
+    op->padding.fixed->height = 0;
+    op->padding.fixed->width = 0;
+  } else if (op->padding.type == PaddingType::kSame) {
+    op->padding.fixed->height = (kheight - 1) / 2;
+    op->padding.fixed->width = (kwidth - 1) / 2;
+  } else {
+    LOG(FATAL) << "TransposeConv only supports SAME or VALID padding";
+  }
+
+  // VALIDATE some dimensions and set the output shape.
+  const auto& input_array =
+      model->GetArray(op->inputs[TransposeConvOperator::DATA_INPUT]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4)
+      << "TransposeConv input shape must have 4 dimensions. Input \""
+      << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
+      << toco::ShapeToString(weights_shape) << ".";
+  CHECK_EQ(input_shape.dims(3), weights_shape.dims(0))
+      << "Input shape depth and weight depth do not agree";
+
+  // Set the output shape according to the specified output shape.
+  std::vector<int32> const& specified_output_shape =
+      specified_output_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  *(output_array.mutable_shape()->mutable_dims()) = specified_output_shape;
+}
+
 void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
   if (!EnsureBiasVectorShape(model, op)) {
     return;
@@ -222,7 +327,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
   const int kheight = weights_shape.dims(1);
   const int kwidth = weights_shape.dims(2);
   ComputeConvSizes(input_shape, output_depth, kwidth, kheight, op->stride_width,
-                   op->stride_height, op->padding.type,
+                   op->stride_height, 1, 1, op->padding.type,
                    model->GetArray(output_name).mutable_shape(),
                    &op->padding.GetOrCreateFixedPadding());
 }
@@ -274,8 +379,7 @@ void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
                          depth * block_size * block_size}));
 }
 
-void ProcessFillOperator(Model* model, FillOperator* op) {
-  CHECK_EQ(op->inputs.size(), 2);
+void ProcessOpWithShapeInput(Model* model, Operator* op) {
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
@@ -697,7 +801,7 @@ void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
   const string& output_name = op->outputs[0];
   const int output_depth = input_shape.dims(3);
   ComputeConvSizes(input_shape, output_depth, op->kwidth, op->kheight,
-                   op->stride_width, op->stride_height, op->padding.type,
+                   op->stride_width, op->stride_height, 1, 1, op->padding.type,
                    model->GetArray(output_name).mutable_shape(),
                    &op->padding.GetOrCreateFixedPadding());
 }
@@ -714,7 +818,7 @@ void ProcessMaxPoolOperator(Model* model, MaxPoolOperator* op) {
   const string& output_name = op->outputs[0];
   const int output_depth = input_shape.dims(3);
   ComputeConvSizes(input_shape, output_depth, op->kwidth, op->kheight,
-                   op->stride_width, op->stride_height, op->padding.type,
+                   op->stride_width, op->stride_height, 1, 1, op->padding.type,
                    model->GetArray(output_name).mutable_shape(),
                    &op->padding.GetOrCreateFixedPadding());
 }
@@ -733,7 +837,7 @@ void ProcessL2PoolOperator(Model* model, L2PoolOperator* op) {
   const string& output_name = op->outputs[0];
   const int output_depth = input_shape.dims(3);
   ComputeConvSizes(input_shape, output_depth, op->kwidth, op->kheight,
-                   op->stride_width, op->stride_height, op->padding.type,
+                   op->stride_width, op->stride_height, 1, 1, op->padding.type,
                    model->GetArray(output_name).mutable_shape(),
                    &op->padding.GetOrCreateFixedPadding());
 }
@@ -933,17 +1037,15 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   }
   QCHECK(crops_array.data_type == ArrayDataType::kInt32);
   const auto& crops_data = crops_array.GetBuffer<ArrayDataType::kInt32>().data;
-  // We don't support crops now.
-  QCHECK_EQ(crops_data[0], 0);
-  QCHECK_EQ(crops_data[1], 0);
-  QCHECK_EQ(crops_data[2], 0);
-  QCHECK_EQ(crops_data[3], 0);
-
+  const int crops_top = crops_data[0];
+  const int crops_bottom = crops_data[1];
+  const int crops_left = crops_data[2];
+  const int crops_right = crops_data[3];
+  const int output_height =
+      input_height * block_height - crops_top - crops_bottom;
+  const int output_width = input_width * block_width - crops_left - crops_right;
   QCHECK_EQ(input_shape.dims(0) % (block_height * block_width), 0);
 
-  int output_height = input_height * block_height;
-  int output_width = input_width * block_width;
-
   model->GetArray(op->outputs[0])
       .copy_shape(Shape({input_shape.dims(0) / (block_height * block_width),
                          output_height, output_width, input_shape.dims(3)}));
@@ -985,8 +1087,8 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
 void ProcessTopkV2Operator(Model* model, TopKV2Operator* op) {
   const auto& input_values = model->GetArray(op->inputs[0]);
   const auto& input_k = model->GetArray(op->inputs[1]);
-  auto& output_indexes = model->GetArray(op->outputs[0]);
-  auto& output_values = model->GetArray(op->outputs[1]);
+  auto& output_values = model->GetArray(op->outputs[0]);
+  auto& output_indexes = model->GetArray(op->outputs[1]);
 
   // Bail if we already know the output shape.
   if (output_indexes.has_shape()) {
@@ -1054,6 +1156,11 @@ void ProcessRankOperator(Model* model, RankOperator* op) {
     return;
   }
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
@@ -1075,6 +1182,11 @@ void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) {
     return;
   }
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
@@ -1105,10 +1217,6 @@ void ProcessStackOperator(Model* model, StackOperator* op) {
     }
 
     Shape shape = input_array.shape();
-    if (shape.dimensions_count() == 0) {
-      // Convert 0D scalars to 1D scalars of shape {1}.
-      shape.mutable_dims()->push_back(1);
-    }
     if (!stacked_shape) {
       stacked_shape.reset(new Shape(shape));
     } else {
@@ -1165,43 +1273,45 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
     return;
   }
 
-  int dim_count = input_array.shape().dimensions_count();
-  CHECK(op->start_indices.size() == dim_count)
-      << ": Incorrect number of start indices supplied to StridedSlice op with "
-         "output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " start indices";
-  CHECK(op->stop_indices.size() == dim_count)
-      << ": Incorrect number of stop indices supplied to StridedSlice op with "
-         "output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " stop indices";
-  CHECK(op->strides.size() == dim_count)
-      << ": Incorrect number of strides supplied to StridedSlice op with "
-         " output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " strides";
+  int num_input_axes = input_array.shape().dimensions_count();
+  CHECK_LE(op->start_indices.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " start indices";
+  CHECK_LE(op->stop_indices.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " stop indices";
+  CHECK_LE(op->strides.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " strides";
+  for (int i = 0; i < op->strides.size(); i++) {
+    CHECK_NE(op->strides[i], 0) << "Strides must be non-zero. Axis " << i
+                                << " has stride=" << op->strides[i] << ".";
+  }
 
   // Create output shape
   std::vector<int>* dims = output_array.mutable_shape()->mutable_dims();
 
   // Compute output shape
-  for (int i = 0; i < dim_count; ++i) {
-    const int mask = 1 << i;
-    int start = (op->begin_mask & mask) ? 0 : op->start_indices[i];
-    if (start < 0) {
-      // handle negative indices
-      start += input_array.shape().dims(i);
-    }
-    int stop = (op->end_mask & mask) ? input_array.shape().dims(i)
-                                     : op->stop_indices[i];
-    if (stop < 0) {
-      // handle negative indices
-      stop += input_array.shape().dims(i);
-    }
-
-    int dim_size = ceil((stop - start) / static_cast<float>(op->strides[i]));
-    dim_size = dim_size < 0 ? 0 : dim_size;
-    if (op->shrink_axis_mask & mask) {
-      CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when "
-                               "shrinking that axis";
+  for (int axis = 0; axis < num_input_axes; ++axis) {
+    int start_index = tflite::strided_slice::StartForAxis(
+        op->begin_mask, op->start_indices, op->strides,
+        input_array.shape().dims().data(), axis);
+    int stop_index = tflite::strided_slice::StopForAxis(
+        op->end_mask, op->stop_indices, op->strides,
+        input_array.shape().dims().data(), axis);
+    int dim_size =
+        ceil(static_cast<float>(stop_index - start_index) / op->strides[axis]);
+
+    CHECK_GT(dim_size, 0)
+        << "Output size for an axis must be greater than 0. Axis " << axis
+        << " computes to size " << dim_size
+        << " for StridedSlice op with output \"" << op->outputs[0] << "\".";
+    if (op->shrink_axis_mask & (1 << axis)) {
+      CHECK_EQ(dim_size, 1)
+          << "Output size for an axis must compute to 1 when shrinking an "
+             "axis. Axis "
+          << axis << " computes to size " << dim_size
+          << " for StridedSlice op with output \"" << op->outputs[0] << "\".";
     } else {
       dims->push_back(dim_size);
     }
@@ -1292,7 +1402,7 @@ void ProcessTransposeOperator(Model* model, TransposeOperator* op) {
   std::vector<int32> const& perm =
       perm_array.GetBuffer<ArrayDataType::kInt32>().data;
   CHECK_EQ(perm.size(), input_shape.dimensions_count())
-      << "Transpose permutation input " << op->inputs[0]
+      << "Transpose permutation input " << op->inputs[1]
       << " must be same length as input dimensions";
   std::vector<int>* output_dims = output_array.mutable_shape()->mutable_dims();
   for (int i = 0; i < perm.size(); i++) {
@@ -1311,13 +1421,10 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
     return;
   }
 
-  // The current ArgMax implementation only supports 4-dimensional inputs with
-  // the last dimension as the axis to perform ArgMax for.
   const std::vector<int>& input_dims = input_array.shape().dims();
-  CHECK_EQ(input_dims.size(), 4);
   std::vector<int> output_dims;
 
-  output_dims.reserve(input_dims.size() - 1);
+  output_dims.reserve(input_dims.size());
   for (int i = 0; i < input_dims.size() - 1; ++i) {
     output_dims.push_back(input_dims[i]);
   }
@@ -1349,8 +1456,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kRelu:
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
+    case OperatorType::kPRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
+    case OperatorType::kLog:
     case OperatorType::kLogistic:
     case OperatorType::kTanh:
     case OperatorType::kLocalResponseNormalization:
@@ -1394,8 +1503,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       ProcessConvOperator(model, static_cast<ConvOperator*>(op));
       break;
     case OperatorType::kTransposeConv:
-      // Unimplemented, hopefully another graph transformation will drop it or
-      // rewrite it.
+      ProcessTransposeConvOperator(model,
+                                   static_cast<TransposeConvOperator*>(op));
       break;
     case OperatorType::kDepthwiseConv:
       ProcessDepthwiseConvOperator(model,
@@ -1410,7 +1519,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
                                   static_cast<SpaceToDepthOperator*>(op));
       break;
     case OperatorType::kFill:
-      ProcessFillOperator(model, static_cast<FillOperator*>(op));
+      CHECK_EQ(op->inputs.size(), 2);
+      ProcessOpWithShapeInput(model, op);
       break;
     case OperatorType::kFullyConnected:
       ProcessFullyConnectedOperator(model,
@@ -1534,6 +1644,16 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTranspose:
       ProcessTransposeOperator(model, static_cast<TransposeOperator*>(op));
       break;
+    case OperatorType::kDynamicPartition:
+    case OperatorType::kDynamicStitch:
+      // DynamicPartition/DynamicStitch are currently only supported for
+      // transforms that remove them, so we avoid propagating shapes through
+      // them and let things settle once they've been removed.
+      break;
+    case OperatorType::kRandomUniform:
+      CHECK_EQ(op->inputs.size(), 1);
+      ProcessOpWithShapeInput(model, op);
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d74cad9a626b3a472e2740d6bdaaaf7aab5bd484
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
@@ -0,0 +1,261 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool InferQuantizedDataTypeFromFakeQuant(
+    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type) {
+  if (op.num_bits <= 8) {
+    *out_quantized_data_type = ArrayDataType::kUint8;
+    return true;
+  } else if (op.num_bits <= 16) {
+    *out_quantized_data_type = ArrayDataType::kInt16;
+    return true;
+  } else {
+    *out_quantized_data_type = ArrayDataType::kNone;
+    return false;
+  }
+}
+
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value) {
+  switch (data_type) {
+    case ArrayDataType::kUint8:
+      *out_min_value = 0;
+      *out_max_value = 255;
+      return true;
+    case ArrayDataType::kInt16:
+      *out_min_value = -32768;
+      *out_max_value = 32767;
+      return true;
+    default:
+      return false;
+  }
+}
+
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type) {
+  switch (array.final_data_type) {
+    case ArrayDataType::kInt8:
+    case ArrayDataType::kUint8:
+    case ArrayDataType::kInt16:
+    case ArrayDataType::kUint16:
+    case ArrayDataType::kInt32:
+    case ArrayDataType::kUint32:
+    case ArrayDataType::kInt64:
+    case ArrayDataType::kUint64:
+      return array.final_data_type;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+      return default_type;
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(array.final_data_type);
+  }
+}
+
+void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
+                           QuantizationParams* quantization_params) {
+  switch (data_type) {
+    case ArrayDataType::kInt8:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt8>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint8:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt16:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint16:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint16>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt32:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt32>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint32:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint32>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt64:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt64>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint64:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint64>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(data_type);
+  }
+}
+
+namespace {
+
+template <ArrayDataType A>
+std::unique_ptr<GenericBuffer> QuantizeBuffer(
+    const GenericBuffer& buffer,
+    const QuantizationParams& quantization_params) {
+  const auto inverse_scale = 1. / quantization_params.scale;
+  CHECK(buffer.type == ArrayDataType::kFloat);
+  const auto& float_buffer =
+      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
+  auto* quantized_buffer = new Buffer<A>;
+  quantized_buffer->data.resize(float_buffer.data.size());
+  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
+    const float src_val = float_buffer.data[i];
+    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
+                        // enough to make a few tests fail!
+    if (quantization_params.scale == 0) {
+      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
+                           << "so all its values should be 0.";
+      scaled_val = quantization_params.zero_point;
+    } else {
+      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
+    }
+    quantized_buffer->data[i] =
+        tflite::SafeCast<DataType<A>>(std::round(scaled_val));
+  }
+  return std::unique_ptr<GenericBuffer>(quantized_buffer);
+}
+
+template <ArrayDataType A>
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name,
+                   const QuantizationParams& quantization_params) {
+  auto& array = model->GetArray(name);
+  CHECK(array.data_type == ArrayDataType::kFloat);
+  CHECK(!array.quantization_params);
+  array.GetOrCreateQuantizationParams() = quantization_params;
+  if (array.buffer) {
+    array.buffer = QuantizeBuffer<A>(*array.buffer, quantization_params);
+  }
+  array.data_type = A;
+  array.final_data_type = A;
+  transformation->AddMessageF(
+      "Quantized array %s to %s zero_point=%g, scale=%g", name,
+      ArrayDataTypeName(array.data_type), quantization_params.zero_point,
+      quantization_params.scale);
+}
+
+}  // namespace
+
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params) {
+  ArrayDataType adjusted_data_type = quantized_data_type;
+  auto& array = model->GetArray(name);
+  if (array.final_data_type == ArrayDataType::kInt16) {
+    adjusted_data_type = array.final_data_type;
+  }
+
+  switch (adjusted_data_type) {
+    case ArrayDataType::kUint8:
+      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt16:
+      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt32:
+      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
+                                                  quantization_params);
+    default:
+      LOG(FATAL) << "Unhandled case.";
+  }
+}
+
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max) {
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(array, array.data_type);
+  if (quantized_data_type == ArrayDataType::kNone ||
+      quantized_data_type == ArrayDataType::kFloat) {
+    // The array is not (or never will be) quantized.
+    return false;
+  }
+
+  QuantizationParams quantization_params;
+  if (!array.quantization_params) {
+    if (!array.minmax) {
+      transformation->AddMessageF("No quantization params and no minmax");
+      return false;
+    } else {
+      // Work around cases where we are asking for this prior to the Quantize
+      // transformation having added the quantization_params.
+      GetQuantizationParams(quantized_data_type, *array.minmax,
+                            &quantization_params);
+      transformation->AddMessageF(
+          "No quantization params - infering from data type %s with minmax "
+          "%g,%g as zero_point=%g, scale=%g",
+          ArrayDataTypeName(quantized_data_type), array.minmax->min,
+          array.minmax->max, quantization_params.zero_point,
+          quantization_params.scale);
+    }
+  } else {
+    quantization_params = array.GetQuantizationParams();
+  }
+
+  double quantized_min, quantized_max;
+  CHECK(GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
+                                           &quantized_max))
+      << "Type is not quantized";
+
+  bool has_nontrivial_min_bound = false;
+  bool has_nontrivial_max_bound = false;
+
+  double lowest_representable_output =
+      (quantized_min - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (lowest_representable_output < clamp_min) {
+    has_nontrivial_min_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the lowest representable output value %g"
+        " less than the clamp min bound %g.",
+        lowest_representable_output, clamp_min);
+  }
+
+  double highest_representable_output =
+      (quantized_max - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (highest_representable_output > clamp_max) {
+    has_nontrivial_max_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the highest representable output value %g"
+        " is greater than the clamp max bound %g.",
+        highest_representable_output, clamp_max);
+  }
+
+  return !has_nontrivial_min_bound && !has_nontrivial_max_bound;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..79a2ce7e50887b4608b278471da0e5e63b5673e3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+// Gets the target quantized data type of an array based on the fake quant op.
+// For example, if the num_bits is 8 the data type will be kUint8.
+bool InferQuantizedDataTypeFromFakeQuant(
+    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type);
+
+// Gets the min/max numerical range for the given quantized data type.
+// For example, kUint8 will return [0,255].
+// Returns true if the ranges were set and false if the type is not quantized.
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value);
+
+// Returns the quantized data type of an array, falling back to the provided
+// default data type.
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type);
+
+// Returns the quantization params for the array with the given data type and
+// minmax.
+void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
+                           QuantizationParams* quantization_params);
+
+// Returns the quantization params for the data type and minmax values.
+template <ArrayDataType A>
+void GetQuantizationParamsFromMinMax(const MinMax& minmax,
+                                     QuantizationParams* quantization_params) {
+  using Integer = DataType<A>;
+  const double rmin = minmax.min;
+  const double rmax = minmax.max;
+  *quantization_params =
+      ::tflite::ChooseQuantizationParams<Integer>(rmin, rmax);
+}
+
+// Quantizes an array by setting its data type and (if constant) quantizing
+// all values in the array.
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params);
+
+// Returns true if the given array, when quantized, contains only values between
+// the provided clamp min/max.
+// Either clamp_min or clamp_max may be +/-infinity to indicate that the value
+// is unbounded on that side.
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max);
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index d7f804ee432598cafe6b6c05d03219aa7d2783fa..fa46e6bc3805d3f3a7c9223ff4b111a4ed8e8559 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -44,75 +45,16 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowMinimum ||
          type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
+         type == OperatorType::kLogSoftmax ||
          type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
          type == OperatorType::kTanh || type == OperatorType::kMul ||
          type == OperatorType::kSpaceToDepth ||
-         type == OperatorType::kDepthToSpace || type == OperatorType::kLstmCell;
-}
-
-template <ArrayDataType A>
-std::unique_ptr<GenericBuffer> QuantizeBuffer(
-    const GenericBuffer& buffer,
-    const QuantizationParams& quantization_params) {
-  const auto inverse_scale = 1. / quantization_params.scale;
-  CHECK(buffer.type == ArrayDataType::kFloat);
-  const auto& float_buffer =
-      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
-  auto* quantized_buffer = new Buffer<A>;
-  quantized_buffer->data.resize(float_buffer.data.size());
-  const auto qmin = static_cast<int32>(std::numeric_limits<DataType<A>>::min());
-  const auto qmax = static_cast<int32>(std::numeric_limits<DataType<A>>::max());
-  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
-    const float src_val = float_buffer.data[i];
-    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
-                        // enough to make a few tests fail!
-    if (quantization_params.scale == 0) {
-      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
-                           << "so all its values should be 0.";
-      scaled_val = quantization_params.zero_point;
-    } else {
-      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
-    }
-    const auto rounded_val = static_cast<int32>(std::round(scaled_val));
-    const auto clamped_val = std::min(qmax, std::max(qmin, rounded_val));
-    quantized_buffer->data[i] = static_cast<DataType<A>>(clamped_val);
-  }
-  return std::unique_ptr<GenericBuffer>(quantized_buffer);
-}
-
-template <ArrayDataType A>
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name,
-                   const QuantizationParams& quantization_params) {
-  auto& array = model->GetArray(name);
-  CHECK(array.data_type == ArrayDataType::kFloat);
-  CHECK(!array.quantization_params);
-  array.GetOrCreateQuantizationParams() = quantization_params;
-  if (array.buffer) {
-    array.buffer = QuantizeBuffer<A>(*array.buffer, quantization_params);
-  }
-  array.data_type = A;
-  transformation->AddMessageF("Quantized array %s", name);
-}
-
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name, ArrayDataType quantized_data_type,
-                   const QuantizationParams& quantization_params) {
-  switch (quantized_data_type) {
-    case ArrayDataType::kUint8:
-      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
-                                                  quantization_params);
-    case ArrayDataType::kInt16:
-      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
-                                                  quantization_params);
-    case ArrayDataType::kInt32:
-      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
-                                                  quantization_params);
-    default:
-      LOG(FATAL) << "Unhandled case.";
-  }
+         type == OperatorType::kStridedSlice ||
+         type == OperatorType::kDepthToSpace ||
+         type == OperatorType::kLstmCell || type == OperatorType::kGather ||
+         type == OperatorType::kTranspose || type == OperatorType::kMean;
 }
 
 const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
@@ -166,6 +108,38 @@ const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
                 "proceed with quantization.";
 }
 
+struct QuantizationPoints {
+  int64 min_value;
+  int64 max_value;
+  int64 central_value;
+};
+
+template <ArrayDataType A>
+QuantizationPoints GetQuantizationPoints() {
+  QuantizationPoints qp;
+  using Integer = DataType<A>;
+  qp.min_value = std::numeric_limits<Integer>::min();
+  qp.max_value = std::numeric_limits<Integer>::max();
+  // eg [-128,127]...
+  qp.central_value = (qp.min_value / 2 +        // -128 -> -64.
+                      (qp.max_value - 1) / 2 +  // 127 -> 63.
+                      1);
+  return qp;
+}
+
+QuantizationPoints GetQuantizationPoints(ArrayDataType data_type) {
+  switch (data_type) {
+    case ArrayDataType::kUint8:
+      return GetQuantizationPoints<ArrayDataType::kUint8>();
+    case ArrayDataType::kInt16:
+      return GetQuantizationPoints<ArrayDataType::kInt16>();
+    case ArrayDataType::kInt32:
+      return GetQuantizationPoints<ArrayDataType::kInt32>();
+    default:
+      LOG(FATAL) << "Unhandled case.";
+  }
+}
+
 bool ChooseQuantizationForOperatorInput(
     GraphTransformation* transformation, Model* model, const Operator& op,
     std::size_t input_index, ArrayDataType* quantized_data_type,
@@ -205,6 +179,8 @@ bool ChooseQuantizationForOperatorInput(
     const auto& input_weights = model->GetArray(op.inputs[weights_input_index]);
     if (!input_activations.quantization_params ||
         !input_weights.quantization_params) {
+      transformation->AddMessageF(
+          "Input array %s is a bias vector but has no qparams", input);
       return false;
     }
     const auto input_activations_scale =
@@ -212,7 +188,7 @@ bool ChooseQuantizationForOperatorInput(
     const auto input_weights_scale = input_weights.quantization_params->scale;
     quantization_params->scale = input_activations_scale * input_weights_scale;
     quantization_params->zero_point = 0;
-    *quantized_data_type = ArrayDataType::kInt32;
+    *quantized_data_type = GetQuantizedDataType(array, ArrayDataType::kInt32);
     transformation->AddMessageF(
         "Input array %s is a bias vector. Choosing quantization params "
         "accordingly.",
@@ -224,23 +200,20 @@ bool ChooseQuantizationForOperatorInput(
 
   if (op.type == OperatorType::kLstmCell) {
     if (input_index == LstmCellOperator::PREV_STATE_INPUT) {
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
-          model->flags, minmax, quantization_params);
       *quantized_data_type = ArrayDataType::kInt16;
+      GetQuantizationParams(*quantized_data_type, minmax, quantization_params);
       return true;
     }
   }
 
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
-                                                         quantization_params);
+  *quantized_data_type = GetQuantizedDataType(array, ArrayDataType::kUint8);
+  GetQuantizationParams(*quantized_data_type, minmax, quantization_params);
   transformation->AddMessageF(
-      "For input array %s with min=%g"
-      ", max=%g"
-      ", chose to quantize as uint8 with zero_point=%d"
-      ", scale=%g",
-      input, minmax.min, minmax.max, quantization_params->zero_point,
+      "For input array %s with min=%g, max=%g, chose to quantize as %s (f=%s) "
+      "with zero_point=%d, scale=%g",
+      input, minmax.min, minmax.max, ArrayDataTypeName(*quantized_data_type),
+      ArrayDataTypeName(array.final_data_type), quantization_params->zero_point,
       quantization_params->scale);
-  *quantized_data_type = ArrayDataType::kUint8;
   return true;
 }
 
@@ -262,16 +235,18 @@ bool IsExactlyRepresentable(double real_value, ArrayDataType data_type,
   return true;
 }
 
+// Quantized data type is preset to the type of the input before this function.
 bool ChooseHardcodedQuantizationForOperatorOutput(
-    const Operator& op, ArrayDataType* quantized_data_type,
+    const Operator& op, const Array& array, ArrayDataType* quantized_data_type,
     QuantizationParams* quantization_params) {
   if (op.type == OperatorType::kL2Normalization) {
     // L2Normalization has range: [-1, 1].
     // 0 should be exactly representable, as values will typically be centered
     // around 0, with many values near 0.
-    *quantized_data_type = ArrayDataType::kUint8;
-    quantization_params->zero_point = 128;
-    quantization_params->scale = 1. / 128.;
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
+    quantization_params->zero_point = qp.central_value;
+    quantization_params->scale = 1. / (qp.central_value - qp.min_value);
     CHECK(
         IsExactlyRepresentable(0., *quantized_data_type, *quantization_params));
     return true;
@@ -284,18 +259,33 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
     // will typically exploit the symmetry logistic(-x) = 1 - logistic(x), and
     // the glueing of the two halves of the graph will only be seamless if we
     // are accurately representing logistic(0) == 0.5.
-    *quantized_data_type = ArrayDataType::kUint8;
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
     quantization_params->zero_point = 0;
-    quantization_params->scale = 1. / 256.;
+    quantization_params->scale = 1. / (qp.max_value + 1);
     CHECK(IsExactlyRepresentable(0.5, *quantized_data_type,
                                  *quantization_params));
     return true;
   }
+  if (op.type == OperatorType::kLogSoftmax) {
+    // LogSoftmax has range: [LogSoftmaxOperator::kOutputRangeMin, 0].
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
+    quantization_params->zero_point = qp.max_value;
+    quantization_params->scale =
+        -LogSoftmaxOperator::kOutputRangeMin / (qp.max_value + 1);
+    // While not strictly necessary, it is easier to interpret output data and
+    // quantization if the scale is similar to others (such as power of 2).
+    CHECK(IsExactlyRepresentable(LogSoftmaxOperator::kOutputRangeMin / 2,
+                                 *quantized_data_type, *quantization_params));
+    return true;
+  }
   if (op.type == OperatorType::kTanh) {
     // Tanh has the range: [-1, 1].
-    *quantized_data_type = ArrayDataType::kUint8;
-    quantization_params->zero_point = 128;
-    quantization_params->scale = 1. / 128.;
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
+    quantization_params->zero_point = qp.central_value;
+    quantization_params->scale = 1. / (qp.central_value - qp.min_value);
     // 0 should be exactly representable, as values will typically be centered
     // around 0, with many values near 0.
     CHECK(
@@ -312,10 +302,14 @@ bool ChooseQuantizationForOperatorOutput(
   const auto& output = op.outputs[output_index];
   auto& array = model->GetArray(output);
   if (array.data_type != ArrayDataType::kFloat) {
+    transformation->AddMessageF("Array data type already set to %s, final=%s",
+                                ArrayDataTypeName(array.data_type),
+                                ArrayDataTypeName(array.final_data_type));
     return false;
   }
-  if (ChooseHardcodedQuantizationForOperatorOutput(op, quantized_data_type,
-                                                   quantization_params)) {
+  *quantized_data_type = model->GetArray(op.inputs[0]).data_type;
+  if (ChooseHardcodedQuantizationForOperatorOutput(
+          op, array, quantized_data_type, quantization_params)) {
     transformation->AddMessageF(
         "Output array %s is produced by a %s operator. Choosing fixed "
         "quantization params accordingly.",
@@ -323,12 +317,22 @@ bool ChooseQuantizationForOperatorOutput(
     return true;
   }
   if ((op.type == OperatorType::kDepthToSpace) ||
-      (op.type == OperatorType::kSpaceToDepth)) {
-    // DepthToSpace and SpaceToDepth should preserve the quantization parameters
-    // of the input array, as these are simple reshape operations.
-    const auto& input_quantization_params =
-        model->GetArray(op.inputs[0]).GetQuantizationParams();
-    *quantized_data_type = ArrayDataType::kUint8;
+      (op.type == OperatorType::kSpaceToDepth) ||
+      (op.type == OperatorType::kTensorFlowReshape) ||
+      (op.type == OperatorType::kTensorFlowSplit) ||
+      (op.type == OperatorType::kConcatenation &&
+       model->flags.change_concat_input_ranges())) {
+    int data_input_index = 0;
+    if (op.type == OperatorType::kTensorFlowSplit) {
+      data_input_index = 1;
+    }
+    // Copying and rearrangement ops should preserve the quantization parameters
+    // of the input array.
+    const auto& input_array = model->GetArray(op.inputs[data_input_index]);
+    const auto& input_quantization_params = input_array.GetQuantizationParams();
+    *quantized_data_type =
+        GetQuantizedDataType(input_array, ArrayDataType::kUint8);
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
     quantization_params->zero_point = input_quantization_params.zero_point;
     quantization_params->scale = input_quantization_params.scale;
 
@@ -342,24 +346,57 @@ bool ChooseQuantizationForOperatorOutput(
   if (op.type == OperatorType::kLstmCell) {
     if (output_index == LstmCellOperator::STATE_OUTPUT ||
         output_index == LstmCellOperator::ACTIV_TEMP) {
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
-          model->flags, minmax, quantization_params);
       *quantized_data_type = ArrayDataType::kInt16;
+      GetQuantizationParams(*quantized_data_type, minmax, quantization_params);
       return true;
     }
   }
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
-                                                         quantization_params);
-  *quantized_data_type = ArrayDataType::kUint8;
+  *quantized_data_type = GetQuantizedDataType(array, ArrayDataType::kUint8);
+  GetQuantizationParams(*quantized_data_type, minmax, quantization_params);
   transformation->AddMessageF(
       "For output array %s with min=%g, max=%g"
-      ", chose to quantize as uint8 with zero_point=%d"
+      ", chose to quantize as %s with zero_point=%d"
       ", scale=%g",
-      output, minmax.min, minmax.max, quantization_params->zero_point,
-      quantization_params->scale);
+      output, minmax.min, minmax.max, ArrayDataTypeName(*quantized_data_type),
+      quantization_params->zero_point, quantization_params->scale);
 
   return true;
 }
+
+// Fixes array minmax info to match the quantization parameters.
+// This is required for when quantization parameters change for an array during
+// quantization (such as ChooseQuantizationForOperatorOutput).
+void FixMinMaxPostQuantization(GraphTransformation* transformation,
+                               ArrayDataType quantized_data_type,
+                               const QuantizationParams& quantization_params,
+                               MinMax* minmax) {
+  double quantized_min, quantized_max;
+  if (!GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
+                                          &quantized_max)) {
+    // Not quantized - no update required.
+    return;
+  }
+
+  // Compute new minmax values.
+  double min = (quantized_min - quantization_params.zero_point) *
+               quantization_params.scale;
+  double max = (quantized_max - quantization_params.zero_point) *
+               quantization_params.scale;
+
+  // If we are close to the existing minmax values don't bother changing them.
+  // This prevents propagating small floating point precision errors.
+  constexpr double kMinMaxThreshold = 1e-5;
+  const double width = max - min;
+  if (std::abs(min - minmax->min) > kMinMaxThreshold * width ||
+      std::abs(max - minmax->max) > kMinMaxThreshold * width) {
+    transformation->AddMessageF(
+        "Adjusting min/max from %g,%g to %g,%g to match quantization params",
+        minmax->min, minmax->max, min, max);
+    minmax->min = min;
+    minmax->max = max;
+  }
+}
+
 }  // namespace
 
 bool Quantize::Run(Model* model, std::size_t op_index) {
@@ -398,9 +435,11 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
   //
   // Let us just guard this assumption by the following assertion:
   for (const auto& input : op.inputs) {
-    if (IsInputArray(*model, input)) {
-      const auto& input_array = model->GetArray(input);
-      CHECK(input_array.quantization_params);
+    const auto& input_array = model->GetArray(input);
+    if (IsInputArray(*model, input) &&
+        input_array.data_type == ArrayDataType::kFloat) {
+      CHECK(input_array.quantization_params)
+          << "Input array " << input << " is missing quantization_params";
     }
   }
   if (!SupportsQuantization(op)) {
@@ -462,10 +501,33 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
             // input instead.
             for (int i = 0; i < model->flags.output_arrays_size(); i++) {
               if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
-                model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+                // TODO(b/78013785): never rename output arrays.
+                if (IsInputArray(*model, dequantize_op->inputs[0])) {
+                  // The op input is an input array and the output is an output
+                  // array and we can't have an array be both. Insert a copy
+                  // op to ensure the two arrays stay separate.
+                  AddMessageF(
+                      "Tried to rename output array %d while removing dequant "
+                      "op %s but array is also an input; inserting copy %s "
+                      "-> %s",
+                      i, LogName(*dequantize_op), model->flags.output_arrays(i),
+                      dequantize_op->inputs[0]);
+                  InsertCopyOperator(model, dequantize_op->inputs[0],
+                                     dequantize_op->outputs[0]);
+                } else {
+                  // Op output is strictly used as an output array, so we can
+                  // just rename the array and directly bypass the op.
+                  AddMessageF(
+                      "Renaming output array %d after removing dequant op %s: "
+                      "%s -> %s",
+                      i, LogName(*dequantize_op), model->flags.output_arrays(i),
+                      dequantize_op->inputs[0]);
+                  model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+                  model->EraseArray(dequantize_op->outputs[0]);
+                }
+                break;
               }
             }
-            model->EraseArray(dequantize_op->outputs[0]);
             model->operators.erase(dequantize_it);
           }
           changed = true;
@@ -504,15 +566,25 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
                                             &quantization_params)) {
       changed = true;
       const auto& output = op.outputs[output_index];
+      auto& output_array = model->GetArray(output);
+
+      // Fix up the min/max information on the output array to match the chosen
+      // quantization parameters.
+      CHECK(output_array.minmax)
+          << "Output array named " << output << " lacks minmax";
+      auto& output_minmax = output_array.GetMinMax();
+      FixMinMaxPostQuantization(this, quantized_data_type, quantization_params,
+                                &output_minmax);
+
       QuantizeArray(this, model, output, quantized_data_type,
                     quantization_params);
+
       const auto& dequantized_output =
           AvailableArrayName(*model, output + "_dequantized");
-      const auto& output_array = model->GetArray(output);
-      const auto& output_minmax = output_array.GetMinMax();
       auto& dequantized_output_array =
           model->GetOrCreateArray(dequantized_output);
       dequantized_output_array.data_type = ArrayDataType::kFloat;
+      dequantized_output_array.final_data_type = output_array.data_type;
       auto& dequantized_output_minmax =
           dequantized_output_array.GetOrCreateMinMax();
       dequantized_output_minmax.min = output_minmax.min;
@@ -529,6 +601,12 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
       dequantize_op->outputs = {dequantized_output};
       for (int i = 0; i < model->flags.output_arrays_size(); i++) {
         if (model->flags.output_arrays(i) == output) {
+          // TODO(b/78013785): never rename output arrays.
+          AddMessageF(
+              "Renaming output array %d after inserting dequant op %s: %s -> "
+              "%s",
+              i, LogName(*dequantize_op), model->flags.output_arrays(i),
+              dequantized_output);
           model->flags.set_output_arrays(i, dequantized_output);
         }
       }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
index 11f8d4b6eea836c5fe4efcbd5136e6183a59dc62..bdcca5b7caf61a62203debaa32c4d7a9b2eb43fa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -72,6 +72,13 @@ bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
     minmax.min = min_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     // We always want [min, max] to contain 0.
+    if (minmax.min > 0 || minmax.max < 0) {
+      LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
+    }
     minmax.min = std::min(minmax.min, 0.);
     minmax.max = std::max(minmax.max, 0.);
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c8d04440f251f792d2a09155dd26fc01a732109
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsFakeQuantTrivial(GraphTransformation* transformation, const Model& model,
+                        const FakeQuantOperator& fakequant_op) {
+  CHECK(fakequant_op.type == OperatorType::kFakeQuant);
+
+  if (!fakequant_op.minmax) {
+    // Require ReadFakeQuantMinMax to have run.
+    return false;
+  }
+
+  // FakeQuants are trivial if they are taking input from another identical
+  // FakeQuant op.
+  auto* producing_op = GetOpWithOutput(model, fakequant_op.inputs[0]);
+  if (!producing_op || producing_op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  const auto& producing_fakequant_op =
+      *static_cast<FakeQuantOperator*>(producing_op);
+  if (!producing_fakequant_op.minmax) {
+    // Require ReadFakeQuantMinMax to have run.
+    return false;
+  }
+
+  if (*fakequant_op.minmax == *producing_fakequant_op.minmax &&
+      fakequant_op.num_bits == producing_fakequant_op.num_bits) {
+    transformation->AddMessageF(
+        "%s is trivial because it is preceded by an identical FakeQuant %s",
+        LogName(fakequant_op), LogName(producing_fakequant_op));
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace
+
+// Removes FakeQuant ops that are trivial (have no effect, are redundant, etc).
+bool RemoveTrivialFakeQuant::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  auto* op = op_it->get();
+  if (op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(op);
+
+  if (!IsFakeQuantTrivial(this, *model, *fakequant_op)) {
+    AddMessageF("%s is not trivial", LogName(*fakequant_op));
+    return false;
+  }
+
+  AddMessageF("Removing trivial %s", LogName(*fakequant_op));
+
+  CHECK_EQ(fakequant_op->inputs.size(), 1);
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index 587f171bbf823408a45083c36d52f1d38c300123..3e021b819fc82d66fb70596a62fd7cee4911d4e8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -60,7 +60,9 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
   for (int i = 0; i < passthru_op->inputs.size(); i++) {
     if (!model->GetArray(passthru_op->inputs[i]).buffer) {
       count_nonconstant_input_arrays++;
-      main_input_array_index = i;
+      if (count_nonconstant_input_arrays == 1) {
+        main_input_array_index = i;
+      }
     }
   }
 
@@ -80,22 +82,13 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
 
   if (IsDiscardableArray(*model, output_name)) {
     transformation->AddMessageF(
-        "Removing %s, keeping its non-constant input array",
-        LogName(*passthru_op));
-    for (const string& input : passthru_op->inputs) {
-      if (IsDiscardableArray(*model, input) && input != main_input_name &&
-          CountOpsWithInput(*model, input) == 1) {
-      }
-    }
+        "Removing %s, keeping its non-constant input array %s and removing %s",
+        LogName(*passthru_op), main_input_name, output_name);
     RerouteEdges(output_name, main_input_name, model);
   } else if (IsDiscardableArray(*model, main_input_name)) {
-    transformation->AddMessageF("Removing %s, keeping its output array",
-                                LogName(*passthru_op));
-    for (const string& input : passthru_op->inputs) {
-      if (IsDiscardableArray(*model, input) &&
-          (input == main_input_name || CountOpsWithInput(*model, input) == 1)) {
-      }
-    }
+    transformation->AddMessageF(
+        "Removing %s, keeping its output array %s and removing input %s",
+        LogName(*passthru_op), output_name, main_input_name);
     RerouteEdges(main_input_name, output_name, model);
   } else {
     transformation->AddMessageF(
@@ -111,6 +104,16 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
   // Remove any array that is no longer used.
   for (const string& removal_candidate : removal_candidates) {
     bool is_referenced = false;
+    for (const auto& array : model->flags.input_arrays()) {
+      if (array.name() == removal_candidate) {
+        is_referenced = true;
+      }
+    }
+    for (const auto& array_name : model->flags.output_arrays()) {
+      if (array_name == removal_candidate) {
+        is_referenced = true;
+      }
+    }
     for (const auto& op : model->operators) {
       for (const string& input : op->inputs) {
         if (input == removal_candidate) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
index 28f76c9d36d6f68c8997fa0cf620c8aec4273619..752560e075a087bcc2b0a3cb19dad484fb582d42 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
@@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <limits>
 #include <memory>
 #include <string>
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
@@ -25,63 +28,98 @@ limitations under the License.
 
 namespace toco {
 
+namespace {
+
+bool IsTrivialUnfusedActivationFunc(GraphTransformation* transformation,
+                                    const Model& model, OperatorType op_type,
+                                    const string& input_array_name) {
+  double clamp_min;
+  double clamp_max;
+  switch (op_type) {
+    case OperatorType::kRelu:
+      clamp_min = 0.0;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    case OperatorType::kRelu1:
+      clamp_min = -1.0;
+      clamp_max = 1.0;
+      break;
+    case OperatorType::kRelu6:
+      clamp_min = 0.0;
+      clamp_max = 6.0;
+      break;
+    default:
+      return false;
+  }
+
+  const auto& input_array = model.GetArray(input_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, input_array, clamp_min,
+                                     clamp_max);
+}
+
+bool IsTrivialFusedActivationFunc(
+    GraphTransformation* transformation, const Model& model,
+    FusedActivationFunctionType activation_function,
+    const string& output_array_name) {
+  double clamp_min;
+  double clamp_max;
+  switch (activation_function) {
+    case FusedActivationFunctionType::kNone:
+      return false;
+    case FusedActivationFunctionType::kRelu:
+      clamp_min = 0.0;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      clamp_min = -1.0;
+      clamp_max = 1.0;
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      clamp_min = 0.0;
+      clamp_max = 6.0;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported fused activation type: "
+                 << static_cast<int>(activation_function);
+      return false;
+  }
+
+  const auto& output_array = model.GetArray(output_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, output_array, clamp_min,
+                                     clamp_max);
+}
+
+}  // namespace
+
+// Attempts to remove both fused and unfused activation functions if the
+// quantization params indicate that the representable values fall inside the
+// activation range.
 bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
                                                std::size_t op_index) {
   const auto it = model->operators.begin() + op_index;
   auto* op = it->get();
-  if (op->fused_activation_function != FusedActivationFunctionType::kRelu &&
-      op->fused_activation_function != FusedActivationFunctionType::kRelu6) {
+  if (op->inputs.empty()) {
     return false;
   }
-  const auto& output_array = model->GetArray(op->outputs[0]);
-  if (!output_array.quantization_params) {
-    return false;
-  }
-  if (output_array.data_type != ArrayDataType::kUint8) {
-    return false;
-  }
-  const auto& quantization_params = output_array.GetQuantizationParams();
-
-  bool has_nontrivial_min_bound = false;
-  bool has_nontrivial_max_bound = false;
 
-  if (op->fused_activation_function == FusedActivationFunctionType::kRelu ||
-      op->fused_activation_function == FusedActivationFunctionType::kRelu6) {
-    double lowest_representable_output =
-        (0. - quantization_params.zero_point) * quantization_params.scale;
-    if (lowest_representable_output < 0.) {
-      has_nontrivial_min_bound = true;
-      AddMessageF(
-          "Quantized activation function is not trivial: "
-          "the lowest representable output value %g"
-          " less than the clamp min bound.",
-          lowest_representable_output);
-    }
+  if (IsTrivialUnfusedActivationFunc(this, *model, op->type, op->inputs[0])) {
+    AddMessageF(
+        "Removing trivial unfused activation function %s because the input "
+        "minmax imply at least as tight a clamp anyway.",
+        LogName(*op));
+    return RemoveTrivialPassthroughOp(this, model, op_index);
   }
-  if (op->fused_activation_function == FusedActivationFunctionType::kRelu6) {
-    double highest_representable_output =
-        (255. - quantization_params.zero_point) * quantization_params.scale;
-    if (highest_representable_output > 6.) {
-      has_nontrivial_max_bound = true;
-      AddMessageF(
-          "Quantized activation function is not trivial: "
-          "the highest representable output value %g"
-          " is greater than the clamp max bound.",
-          highest_representable_output);
-    }
+  if (IsTrivialFusedActivationFunc(this, *model, op->fused_activation_function,
+                                   op->outputs[0])) {
+    op->fused_activation_function = FusedActivationFunctionType::kNone;
+    AddMessageF(
+        "Removing trivial quantized activation function on %s "
+        "because the output quantization parameters imply at least as tight "
+        "a clamp anyway.",
+        LogName(*op));
+    return true;
   }
-
-  if (has_nontrivial_min_bound || has_nontrivial_max_bound) {
-    return false;
-  }
-
-  op->fused_activation_function = FusedActivationFunctionType::kNone;
-  AddMessageF(
-      "Removing trivial quantized activation function on %s"
-      " because the output quantization parameters imply at least as tight"
-      " a clamp anyway.",
-      LogName(*op));
-  return true;
+  return false;
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eaee1c662b7cedb2baec7be47e12e348c3e7b25c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsTrivialMinMax(GraphTransformation* transformation, const Model& model,
+                     OperatorType op_type, const string& input_array_name,
+                     const string& clamp_value_array_name) {
+  const auto& clamp_value_array = model.GetArray(clamp_value_array_name);
+  if (!IsConstantParameterArray(model, clamp_value_array_name)) {
+    transformation->AddMessageF("Clip value array %s is non-constant",
+                                clamp_value_array_name);
+    return false;
+  }
+  const auto& clamp_value_buffer =
+      clamp_value_array.GetBuffer<ArrayDataType::kFloat>();
+  CHECK_EQ(clamp_value_buffer.Length(), 1);
+  float clamp_value = clamp_value_buffer.data[0];
+
+  double clamp_min;
+  double clamp_max;
+  switch (op_type) {
+    case OperatorType::kTensorFlowMinimum:
+      clamp_min = -std::numeric_limits<double>::infinity();
+      clamp_max = clamp_value;
+      break;
+    case OperatorType::kTensorFlowMaximum:
+      clamp_min = clamp_value;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    default:
+      CHECK(false);
+      return false;
+  }
+
+  const auto& input_array = model.GetArray(input_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, input_array, clamp_min,
+                                     clamp_max);
+}
+
+}  // namespace
+
+// Attempts to remove min/max functions if the quantization params indicate that
+// the representable values fall inside the clip range.
+bool RemoveTrivialQuantizedMinMax::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if ((op->type != OperatorType::kTensorFlowMinimum &&
+       op->type != OperatorType::kTensorFlowMaximum) ||
+      op->inputs.size() != 2) {
+    return false;
+  }
+  if (IsTrivialMinMax(this, *model, op->type, op->inputs[0], op->inputs[1])) {
+    AddMessageF(
+        "Removing trivial min/max %s because the quantization parameters imply "
+        "at least as tight a clamp anyway.",
+        LogName(*op));
+    return RemoveTrivialPassthroughOp(this, model, op_index);
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
index 90f9381ec154f145cda826ff9730ff332cd96701..e28d8cf01eafee64e08ac2cc4b43ea7c227456c2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
@@ -41,8 +41,8 @@ bool IsReshapeTrivial(const Model& model, const Operator& op,
         ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
       transformation->AddMessageF(
           "%s is trivial because its input and output shapes are equal up to "
-          "extending "
-          "by 1's, and we are told to aggressively discard such Reshape ops.",
+          "extending by 1's, and we are told to aggressively discard such "
+          "Reshape ops.",
           LogName(op));
       return true;
     }
@@ -61,8 +61,8 @@ bool IsReshapeTrivial(const Model& model, const Operator& op,
     if (next_op->type == OperatorType::kTensorFlowReshape) {
       transformation->AddMessageF(
           "%s is trivial because its output is only consumed by another "
-          "Reshape op",
-          LogName(op));
+          "Reshape op %s",
+          LogName(op), LogName(*next_op));
       return true;
     }
   }
@@ -80,6 +80,7 @@ bool RemoveTrivialReshape::Run(Model* model, std::size_t op_index) {
   }
 
   if (!IsReshapeTrivial(*model, *reshape_op, this)) {
+    AddMessageF("%s is not trivial", LogName(*reshape_op));
     return false;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
index aa2c293382a98b476bee783ed8e177b19d35b858..8e6aaf544aa5310b4233d93e7bc8f484f6164b8a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -47,7 +47,8 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     bool found_output_as_rnn_state_array = false;
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.state_array()) {
-        CHECK(op->type == OperatorType::kFill);
+        CHECK(op->type == OperatorType::kFill ||
+              op->type == OperatorType::kTensorFlowIdentity);
         found_output_as_rnn_state_array = true;
         break;
       }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
deleted file mode 100644
index 30a005c789bb12e880e8e4534088d99ebacba84a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
-  const auto ac_it = model->operators.begin() + op_index;
-  std::unique_ptr<Operator>& ac_op = *ac_it;
-  DCHECK(ac_op);
-
-  if (ac_op->type != OperatorType::kRelu6 &&
-      ac_op->type != OperatorType::kRelu1 &&
-      ac_op->type != OperatorType::kRelu) {
-    return false;
-  }
-
-  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
-  if (exchange_it == model->operators.end()) return false;
-  // Find the op producing the array passed to this activation function
-  std::unique_ptr<Operator>& exchange_op = *exchange_it;
-  DCHECK(exchange_op);
-
-  if (exchange_op->type != OperatorType::kTensorFlowReshape) {
-    return false;
-  }
-
-  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
-  const auto& exchange_op_input = exchange_op->inputs[0];
-  const auto& intermediate_array = exchange_op->outputs[0];
-  const auto& ac_op_output = ac_op->outputs[0];
-
-  int count_ops_consuming_output =
-      CountOpsWithInput(*model, intermediate_array);
-  DCHECK_GE(count_ops_consuming_output, 1);
-  if (count_ops_consuming_output > 1) {
-    AddMessageF(
-        "Not exchanging activation function with %s because it is consumed by "
-        "more than 1 other operator",
-        LogName(*exchange_op));
-    return false;
-  }
-
-  // If the ac_op was originally producing an output_array we can't reorder as
-  // otherwise the output array would change. It'd be nice to still be able to
-  // reorder but if code is relying on the fetch names instead of array indices
-  // this won't work.
-  for (int i = 0; i < model->flags.output_arrays_size(); ++i) {
-    if (model->flags.output_arrays(i) == ac_op->outputs[0]) {
-      AddMessageF(
-          "Not exchanging activation function with %s to preserve output array "
-          "name %s",
-          LogName(*exchange_op), ac_op->outputs[0]);
-      return false;
-    }
-  }
-
-  // Rewire by changing inputs, including all consumers.
-  Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
-  while (consumer) {
-    for (int i = 0; i < consumer->inputs.size(); ++i) {
-      if (consumer->inputs[i] == ac_op_output) {
-        consumer->inputs[i] = intermediate_array;
-      }
-    }
-    consumer = GetFirstOpWithInput(*model, ac_op_output);
-  }
-  ac_op->inputs[0] = exchange_op_input;
-  exchange_op->inputs[0] = ac_op_output;
-
-  // Clear shapes; this will allow shape propagation to fix the sizes for us.
-  model->GetOrCreateArray(ac_op->outputs[0]).clear_shape();
-  model->GetOrCreateArray(exchange_op->outputs[0]).clear_shape();
-
-  // Finally, reorder operators.  Note that this only works when there are no
-  // other direct descendents of the exchange_op.
-  ac_op.swap(exchange_op);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f5b7920cb937b021eb23fc1d5fdc3c1ff18a72d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsElementwiseOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kCast:
+    case OperatorType::kExp:
+    case OperatorType::kFloor:
+    case OperatorType::kNeg:
+    case OperatorType::kRelu:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu6:
+    case OperatorType::kTanh:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsMoveOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kDepthToSpace:
+    case OperatorType::kExpandDims:
+    case OperatorType::kSpaceToDepth:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
+// Swap elementwise operators such that all value operators occur before all
+// element move operators, e.g. negation then transpose.
+bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
+  const auto element_op_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& element_op = *element_op_it;
+  if (!IsElementwiseOperator(element_op->type)) {
+    return false;
+  }
+
+  const string intermediate_name = element_op->inputs[0];
+  auto it = FindOpWithOutput(*model, intermediate_name);
+  if (it == model->operators.end()) {
+    AddMessageF("No preceding operator");
+    return false;
+  }
+
+  std::unique_ptr<Operator>& move_op = *it;
+  if (!IsMoveOperator(move_op->type)) {
+    AddMessageF("Preceding operator is not a move operator");
+    return false;
+  }
+
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is discardable.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot swap elementwise as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // op->inputs may change so we need to keep a value by copy.
+  const string input_name = move_op->inputs[0];
+  const string output_name = element_op->outputs[0];
+
+  AddMessageF("Swapping around operators with %s and %s", LogName(*element_op),
+              LogName(*move_op));
+
+  // If the output array is an exit node for the graph then we need to retain
+  // the name as an output node. This makes the naming scheme a little confusing
+  // but is required in this rare case.
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, element_op->outputs[0] + "_reorder");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, output_name);
+
+    element_op->inputs[0] = input_name;
+    element_op->outputs[0] = new_intermediate_name;
+    model->EraseArray(intermediate_name);
+    move_op->inputs[0] = new_intermediate_name;
+    move_op->outputs[0] = output_name;
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    element_op->inputs[0] = input_name;
+    move_op->inputs[0] = output_name;
+  }
+
+  // Reset both arrays as shape, type, min/max, etc can all change because of
+  // the position swap.
+  model->EraseArray(element_op->outputs[0]);
+  model->EraseArray(move_op->outputs[0]);
+
+  // Reconstruct.
+  model->GetOrCreateArray(element_op->outputs[0]);
+  model->GetOrCreateArray(move_op->outputs[0]);
+
+  // Swap the order of the operators.
+  element_op.swap(move_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e7fe1b1ccd851dd998e59e75ff798f52f7c6e5a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Utility function to filter out a value.
+void Filter(std::vector<int>* vec, int value) {
+  vec->erase(std::remove(vec->begin(), vec->end(), value), vec->end());
+}
+
+// Computes a new permutation used to swap a reshape-transpose to a
+// transpose-reshape. In this case the permutation operates on the intermediate
+// shape.
+std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
+                                std::vector<int> intermediate_dims,
+                                std::vector<int> perm) {
+  // These are the major axis of the input.
+  std::vector<int> input_indices;
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (input_dims[i] != 1) {
+      input_indices.push_back(i);
+    }
+  }
+
+  // This maps which indices of the input produced the intermediate indices for
+  // non-unary dimensions.
+  std::unordered_map<int, int> intermediate_to_input_indices_map;
+  for (int i = 0; i < intermediate_dims.size(); i++) {
+    if (intermediate_dims[i] != 1) {
+      intermediate_to_input_indices_map[i] =
+          input_indices[intermediate_to_input_indices_map.size()];
+    }
+  }
+
+  // Translate the transpose permutation to a new permutation starting with the
+  // major indices.
+  std::vector<int> new_perm;
+  new_perm.reserve(input_dims.size());
+  for (int i = 0; i < perm.size(); i++) {
+    if (intermediate_dims[perm[i]] == 1) continue;
+
+    new_perm.push_back(intermediate_to_input_indices_map[perm[i]]);
+  }
+
+  // Fill the rest of the transpose in with the ones.
+  for (int index = 0; index < input_dims.size(); index++) {
+    if (input_dims[index] == 1) {
+      new_perm.push_back(index);
+    }
+  }
+
+  CHECK_EQ(new_perm.size(), input_dims.size());
+  return new_perm;
+}
+
+}  // namespace
+
+// Swaps reshape-transpose to transpose-reshape whenever possible. This is
+// possible when the reshape does not affect memory ordering.
+bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
+  auto transpose_it = model->operators.begin() + op_index;
+
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    // Wait for values to propagate.
+    return false;
+  }
+
+  // Find the operator that produces the transpose op.
+  auto reshape_it = FindOpWithOutput(*model, transpose_op->inputs[0]);
+  if (reshape_it == model->operators.end()) {
+    return false;
+  }
+
+  TensorFlowReshapeOperator* reshape_op =
+      ConvertOperator<TensorFlowReshapeOperator*>(
+          reshape_it->get(), OperatorType::kTensorFlowReshape);
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  // Ignore if the reshape is uninitialized.
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  // Need to copy to keep static if permutated.
+  const string input_name = reshape_op->inputs[0];
+  const string intermediate_name = reshape_op->outputs[0];
+  const string output_name = transpose_op->outputs[0];
+
+  // Intermediate should not be consumed by any other operators.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot reorder reshape-transpose as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // Get the arrays.
+  const auto& input_array = model->GetArray(input_name);
+  const auto& intermediate_array = model->GetArray(intermediate_name);
+  const auto& output_array = model->GetArray(output_name);
+
+  // Get the shapes of each array.
+  Shape input_shape = input_array.shape();
+  Shape intermediate_shape = intermediate_array.shape();
+  Shape output_shape = output_array.shape();
+
+  // Assign ids to non-unary indices.
+  std::vector<int> input_dims = input_shape.dims();
+  std::vector<int> intermediate_dims = intermediate_shape.dims();
+  std::vector<int> output_dims = output_shape.dims();
+
+  // If the reshape is equivalent to a transpose with fewer/more unary
+  // dimensions then it can be moved between the transpose.
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      true /*allow_extra_unary_dims*/)) {
+    return false;
+  }
+
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, transpose_op->outputs[0] + "_exchange");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, transpose_op->outputs[0]);
+    transpose_op->inputs[0] = input_name;
+    transpose_op->outputs[0] = new_intermediate_name;
+    reshape_op->inputs[0] = new_intermediate_name;
+    reshape_op->outputs[0] = output_name;
+    model->EraseArray(intermediate_name);
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    transpose_op->inputs[0] = input_name;
+    reshape_op->inputs[0] = output_name;
+  }
+
+  // If transposes constant buffer is used elsewhere, make a new copy.
+  if (CountOpsWithInput(*model, transpose_op->inputs[1]) != 1) {
+    transpose_op->inputs[1] =
+        AvailableArrayName(*model, transpose_op->inputs[1] + "_copy");
+  }
+
+  // Make the new transpose permutation.
+  const std::vector<int> new_perm =
+      ComputeNewPerm(input_dims, intermediate_dims, transpose_op->perm);
+  CHECK_EQ(input_dims.size(), new_perm.size());
+
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
+  *(transpose_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(new_perm.size())};
+  transpose_op->perm = new_perm;
+
+  // If the reshape's constant buffer is reused, create a new one.
+  if (CountOpsWithInput(*model, reshape_op->inputs[1]) != 1) {
+    reshape_op->inputs[1] =
+        AvailableArrayName(*model, reshape_op->inputs[1] + "_copy");
+  }
+
+  // We need to modify the reshape input array to target the new output size.
+  auto& reshape_array = model->GetOrCreateArray(reshape_op->inputs[1]);
+  reshape_array.GetMutableBuffer<ArrayDataType::kInt32>().data = output_dims;
+  *(reshape_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(output_shape.dimensions_count())};
+  reshape_op->shape.clear();
+
+  AddMessageF("Swapping around operators between %s and %s", input_name,
+              output_name);
+
+  model->GetOrCreateArray(transpose_op->outputs[0]).clear_shape();
+  model->GetOrCreateArray(reshape_op->outputs[0]).clear_shape();
+
+  // Swap the order of the operators.
+  transpose_it->swap(*reshape_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
index fb109eb91b16e3a73005230f821c18b9ef82d2fb..2b3ee36ad10e24ab7367ca44c03a234688a63a9b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -33,7 +33,7 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   const auto* bn_op =
       static_cast<const BatchNormalizationOperator*>(bn_it->get());
 
-  const auto& mean_array = model->GetArray(bn_op->inputs[1]);
+  auto& mean_array = model->GetArray(bn_op->inputs[1]);
   const auto& multiplier_array = model->GetArray(bn_op->inputs[2]);
   const auto& offset_array = model->GetArray(bn_op->inputs[3]);
 
@@ -49,6 +49,13 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   CHECK(multiplier_array.data_type == ArrayDataType::kFloat);
   CHECK(offset_array.data_type == ArrayDataType::kFloat);
 
+  // This graph transformations will need to address constant buffers below,
+  // so we need to exit early if these buffers don't exist (i.e. if the params
+  // haven't yet been resolved as constants).
+  if (!mean_array.buffer || !multiplier_array.buffer || !offset_array.buffer) {
+    return false;
+  }
+
   // Create the new Mul, Add operators
   auto* mul_op = new MulOperator;
   auto* add_op = new AddOperator;
@@ -80,9 +87,15 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   DCHECK_EQ(bn_it->get(), bn_op);
 
   // Create the new param arrays
-  const auto& mean_shape = mean_array.shape();
+  auto& mean_shape = *mean_array.mutable_shape();
   const auto& multiplier_shape = multiplier_array.shape();
   const auto& offset_shape = offset_array.shape();
+  if (mean_shape.dims().empty()) {
+    *mean_shape.mutable_dims() = multiplier_shape.dims();
+    auto& data = mean_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+    CHECK_EQ(data.size(), 1);
+    data.resize(RequiredBufferSizeForShape(mean_shape), data[0]);
+  }
   CHECK(mean_shape.dims() == multiplier_shape.dims());
   CHECK(mean_shape.dims() == offset_shape.dims());
   const auto& param_shape = mean_shape;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index 5e779f6765262326bd59db886c2feed603e0102e..6e78653fad238085da5ba66166884093ea9b0214 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -233,7 +233,12 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   }
 
   // Check that input data types agree.
-  CHECK(input0_array.data_type == input1_array.data_type);
+  CHECK(input0_array.data_type == input1_array.data_type)
+      << "Dissimilar data types given to op outputting \""
+      << binary_op->outputs[0] << "\". 0:\"" << binary_op->inputs[0] << "\"("
+      << static_cast<int>(input0_array.data_type) << ")   1:\""
+      << binary_op->inputs[1] << "\"("
+      << static_cast<int>(input1_array.data_type) << ").";
 
   // Do the actual constants propagation
   EvaluateBinaryOperatorOnConstantInputs(model, binary_op);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 064810b53e7c3bee4601204c9dbd976c374a6a60..d916ae0ddf017fe6a2fb2709db6e9de8c258adfc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -105,7 +106,8 @@ void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
 // already set (e.g. because of previous pass in TOCO), it doesn't change it and
 // returns. Otherwise it uses the input arrays min and max values to compute the
 // concatenated array min and max.
-void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
+void SetMinMaxForConcatenedArray(GraphTransformation* transformation,
+                                 const std::vector<Array*>& input_arrays,
                                  Array* concatenated_array) {
   CHECK(concatenated_array->data_type == ArrayDataType::kFloat);
   // If the minmax is already set, use it
@@ -125,6 +127,9 @@ void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
   MinMax& minmax = concatenated_array->GetOrCreateMinMax();
   minmax.min = concat_min;
   minmax.max = concat_max;
+
+  transformation->AddMessageF("Setting concatenated array min/max to %g,%g",
+                              concat_min, concat_max);
 }
 
 }  // namespace
@@ -161,11 +166,14 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     input_arrays.push_back(&model->GetArray(input_name));
   }
 
+  AddMessageF("Performing constant concat of %s into %s",
+              absl::StrJoin(concat_op->inputs, ", "), concatenated_array_name);
+
   switch (concatenated_array.data_type) {
     case ArrayDataType::kFloat:
       ConcatenateTensorBuffers<ArrayDataType::kFloat>(
           input_arrays, concatenation_axis, &concatenated_array);
-      SetMinMaxForConcatenedArray(input_arrays, &concatenated_array);
+      SetMinMaxForConcatenedArray(this, input_arrays, &concatenated_array);
       break;
     case ArrayDataType::kUint8:
       ConcatenateTensorBuffers<ArrayDataType::kUint8>(
@@ -189,13 +197,13 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    // Check to prevent removal of shared tensors
+    // Check to prevent removal of shared tensors.
     if (CountOpsWithInput(*model, input_name) == 1) {
       model->EraseArray(input_name);
     }
   }
 
-  // Remove concatenate operator
+  // Remove concatenate operator.
   model->operators.erase(concat_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index 944901ece77430708013ea4ca340a30511ba0174..efb7bb218421dd045e3e8e2a38b9c70989f222e1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -45,9 +46,29 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   }
 
   const auto& input_array = model->GetArray(fakequant_op->inputs[0]);
+  CHECK(input_array.data_type == ArrayDataType::kFloat);
+
+  // Determine the final data type in the same way as PropagateFakeQuantNumBits.
+  ArrayDataType quantized_data_type = input_array.final_data_type;
+  if (!InferQuantizedDataTypeFromFakeQuant(*fakequant_op,
+                                           &quantized_data_type)) {
+    AddMessageF("Unsupported FakeQuant num_bits=%d", fakequant_op->num_bits);
+    return false;
+  }
+
+  AddMessageF("Resolving constant %s", LogName(*fakequant_op));
+
   auto& output_array = model->GetArray(fakequant_op->outputs[0]);
   CHECK(input_array.data_type == ArrayDataType::kFloat);
   output_array.data_type = ArrayDataType::kFloat;
+
+  // We'll set the final data type to what the fake quant indicates we should
+  // have (and would have been set if this stayed around until
+  // PropagateFakeQuantNumBits).
+  if (propagate_fake_quant_num_bits()) {
+    output_array.final_data_type = quantized_data_type;
+  }
+
   CHECK(!output_array.buffer);
   const auto& input_buffer = input_array.GetBuffer<ArrayDataType::kFloat>();
   output_array.GetOrCreateMinMax() = *fakequant_op->minmax;
@@ -55,8 +76,8 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   const int size = input_buffer.data.size();
   output_buffer.data.resize(size);
   QuantizationParams qparams;
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
-      model->flags, *fakequant_op->minmax, &qparams);
+  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(*fakequant_op->minmax,
+                                                         &qparams);
   for (int i = 0; i < size; i++) {
     const double src_val = input_buffer.data[i];
     const double unclamped_quantized_val =
@@ -66,7 +87,9 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
     const double dst_val = qparams.scale * (quantized_val - qparams.zero_point);
     output_buffer.data[i] = dst_val;
   }
-  if (CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
+
+  if (IsDiscardableArray(*model, fakequant_op->inputs[0]) &&
+      CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
     model->EraseArray(fakequant_op->inputs[0]);
   }
   model->operators.erase(fakequant_it);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
new file mode 100644
index 0000000000000000000000000000000000000000..debe298a5a93034bcb928d7384b5ec1fc7439e47
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+// Gathers data from axis 0.
+template <ArrayDataType Type>
+inline void Gather(const Array& input_array, int input_rank,
+                   const Array& coords_array, Array* output_array) {
+  const Shape& input_shape = input_array.shape();
+  const std::vector<DataType<Type>>& input_data =
+      input_array.GetBuffer<Type>().data;
+  const Shape& coords_shape = coords_array.shape();
+  const std::vector<int32>& coords_data =
+      coords_array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  const Shape& output_shape = output_array->shape();
+  std::vector<DataType<Type>>& output_data =
+      output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_shape));
+
+  int rev_input_rank = input_shape.dimensions_count() - 1 - (input_rank - 1);
+  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(rev_input_rank));
+
+  int stride = 1;
+  for (int i = input_shape.dimensions_count() - 1; i >= input_rank - 1; --i) {
+    stride *= input_shape.dims(i);
+  }
+
+  for (int i = 0; i < coords_shape.dims(0); ++i) {
+    DCHECK_GE(coords_data[i], 0);
+    DCHECK_LT(coords_data[i], input_shape.dims(rev_input_rank));
+    DataType<Type>* out = output_data.data() + i * stride;
+    const DataType<Type>* in = input_data.data() + coords_data[i] * stride;
+    memcpy(out, in, sizeof(DataType<Type>) * stride);
+  }
+}
+
+}  // namespace
+
+// Resolves a constant Gather operation.
+// This simply performs the gather and produces the output array with the
+// appropriate values.
+bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kGather) {
+    return false;
+  }
+  const auto* op = static_cast<const GatherOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  // Only handling axis=0 for now.
+  if (op->axis != 0) {
+    AddMessageF("%s has axis %d; only axis=0 is supported", LogName(*op),
+                op->axis);
+    return false;
+  }
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(op->inputs[0]);
+  const Array& coords_array = model->GetArray(op->inputs[1]);
+  CHECK(coords_array.data_type == ArrayDataType::kInt32)
+      << "Only int32 indices are supported";
+
+  // Copy min/max info if present. The ranges of the selected values may be
+  // a subset of the original range but we want to ensure the quantization
+  // params stay the same.
+  if (input_array.minmax) {
+    const auto& input_minmax = input_array.GetMinMax();
+    auto& output_minmax = output_array.GetOrCreateMinMax();
+    output_minmax.min = input_minmax.min;
+    output_minmax.max = input_minmax.max;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      Gather<ArrayDataType::kFloat>(input_array, op->input_rank, coords_array,
+                                    &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      Gather<ArrayDataType::kUint8>(input_array, op->input_rank, coords_array,
+                                    &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      Gather<ArrayDataType::kInt32>(input_array, op->input_rank, coords_array,
+                                    &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      Gather<ArrayDataType::kInt64>(input_array, op->input_rank, coords_array,
+                                    &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Gather op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used after we remove the op.
+  DeleteArrayIfUsedOnce(op->inputs[0], model);
+  DeleteArrayIfUsedOnce(op->inputs[1], model);
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88d06d7dc75005c89a69b881aa0064d1162227d5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace toco {
+
+template <ArrayDataType Type>
+bool ComputeRandomUniformArray(Model* model, RandomUniformOperator* op) {
+  typedef tensorflow::random::UniformDistribution<
+      tensorflow::random::PhiloxRandom, DataType<Type>>
+      Distribution;
+
+  // Allocate output
+  auto& output_array = model->GetArray(op->outputs[0]);
+  CHECK(output_array.data_type == Type);
+  std::vector<DataType<Type>>& data =
+      output_array.GetMutableBuffer<Type>().data;
+  data.resize(RequiredBufferSizeForShape(output_array.shape()));
+
+  // We use the same random number generator and distribution as TensorFlow to
+  // produce the exact same values given the same seeds. See
+  // tensorflow::functor::FillPhiloxRandomTask<Distribution, false> in
+  // //third_party/tensorflow/core/kernels/random_op.cc for the implementation.
+  tensorflow::random::PhiloxRandom generator(op->seed, op->seed2);
+  Distribution dist;
+
+  // The generator creates Distribution::kResultElementCount samples at a time.
+  size_t offset = 0;
+  size_t num_samples = Distribution::kResultElementCount;
+  while (offset < data.size()) {
+    const typename Distribution::ResultType samples = dist(&generator);
+    std::copy(&samples[0],
+              &samples[0] + std::min(num_samples, data.size() - offset),
+              &data[0] + offset);
+    offset += num_samples;
+  }
+
+  return true;
+}
+
+bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* base_op = it->get();
+  if (base_op->type != OperatorType::kRandomUniform) {
+    return false;
+  }
+  auto* op = static_cast<RandomUniformOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  if ((op->seed == 0) && (op->seed2 == 0)) {
+    LOG(WARNING) << "RandomUniform op outputting \"" << op->outputs[0]
+                 << "\" is truly random (using /dev/random system entropy). "
+                    "Therefore, cannot resolve as constant. Set \"seed\" or "
+                    "\"seed2\" attr non-zero to fix this";
+    return false;
+  }
+
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      if (!ComputeRandomUniformArray<ArrayDataType::kFloat>(model, op)) {
+        return false;
+      }
+      break;
+    // For future support of double or half.
+    // case ArrayDataType::kDouble...
+    default:
+      LOG(FATAL)
+          << "Unsupported data type given to RandomUniform op with output \""
+          << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used
+  toco::DeleteArrayIfUsedOnce(op->inputs[0], model);
+
+  // Erase the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7ad383e7789891f5396845241e70143dc8b76f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Resolves a constant reshape operation by copying the buffer.
+bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+  const auto* op = static_cast<const TensorFlowReshapeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  const Array& input_array = model->GetArray(op->inputs[0]);
+  if (!ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
+    AddMessageF("Constant reshape is non-trivial (%s -> %s)",
+                ShapeToString(input_array.shape()),
+                ShapeToString(output_array.shape()));
+    return false;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (input_array.data_type) {
+    case ArrayDataType::kBool:
+      CopyArrayBuffer<ArrayDataType::kBool>(input_array, &output_array);
+      break;
+    case ArrayDataType::kFloat:
+      CopyArrayBuffer<ArrayDataType::kFloat>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt8:
+      CopyArrayBuffer<ArrayDataType::kInt8>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      CopyArrayBuffer<ArrayDataType::kUint8>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt16:
+      CopyArrayBuffer<ArrayDataType::kInt16>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint16:
+      CopyArrayBuffer<ArrayDataType::kUint16>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      CopyArrayBuffer<ArrayDataType::kInt32>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint32:
+      CopyArrayBuffer<ArrayDataType::kUint32>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      CopyArrayBuffer<ArrayDataType::kInt64>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint64:
+      CopyArrayBuffer<ArrayDataType::kUint64>(input_array, &output_array);
+      break;
+    case ArrayDataType::kString:
+      CopyArrayBuffer<ArrayDataType::kString>(input_array, &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type: "
+                 << ArrayDataTypeName(input_array.data_type);
+      return false;
+  }
+
+  AddMessageF("Resolving constant reshape of %s", LogName(*op));
+
+  if (input_array.minmax) {
+    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  }
+  if (input_array.quantization_params) {
+    output_array.GetOrCreateQuantizationParams() =
+        input_array.GetQuantizationParams();
+  }
+
+  // Erase input arrays if no longer used.
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index a0cfc3d59763dc1211ed4d1ac114d371a4a7ee0b..1dd52e906900e997f282740404a81b9fcd21e867 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
+#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -23,40 +24,6 @@ namespace toco {
 
 namespace {
 
-int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape,
-                 int axis) {
-  int start;
-  if (op.begin_mask & 1 << axis) {
-    // If begin mask bit is set, use the first element
-    start = 0;
-  } else {
-    // Otherwise, use the specified element
-    start = op.start_indices[axis];
-    if (start < 0) {
-      // Handle negative indices
-      start += input_shape.dims(axis);
-    }
-  }
-  return start;
-}
-
-int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape,
-                int axis) {
-  int stop;
-  if (op.end_mask & (1 << axis)) {
-    // If end mask bit set, use the last element
-    stop = input_shape.dims(axis);
-  } else {
-    // Otherwise, use the specified element
-    stop = op.stop_indices[axis];
-    if (stop < 0) {
-      // Handle negative indices
-      stop += input_shape.dims(axis);
-    }
-  }
-  return stop;
-}
-
 template <ArrayDataType Type>
 void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
                   Array* output_array) {
@@ -73,9 +40,6 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   int num_input_axes = op.start_indices.size();
   CHECK_EQ(num_input_axes, op.stop_indices.size());
   CHECK_EQ(num_input_axes, op.strides.size());
-  for (int i = 0; i < op.strides.size(); i++) {
-    CHECK_GE(op.strides[i], 0) << "Negative strides usupported";
-  }
 
   // Create a buffer for the output array
   std::vector<DataType<Type>>& output_data =
@@ -87,7 +51,9 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   Buffer<Type> const& input_buffer = input_array.GetBuffer<Type>();
   std::vector<int> src_coord(op.start_indices.size());
   for (int axis = 0; axis < num_input_axes; axis++) {
-    src_coord[axis] = StartForAxis(op, input_shape, axis);
+    src_coord[axis] = tflite::strided_slice::StartForAxis(
+        op.begin_mask, op.start_indices, op.strides, input_shape.dims().data(),
+        axis);
   }
 
   // In order to handle any number (N) of dimensions, we copy elements one by
@@ -103,15 +69,21 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
     // Compute next source input coordinates.
     bool carry = true;
     for (int axis = 0; axis < num_input_axes; axis++) {
+      int stride = op.strides[axis];
       // Increment this axis if we carried from the previous one
       if (carry) {
-        src_coord[axis] += op.strides[axis];
+        src_coord[axis] += stride;
       }
 
       // Check if we've overflowed.
-      if (src_coord[axis] >= StopForAxis(op, input_shape, axis)) {
+      int stop = tflite::strided_slice::StopForAxis(
+          op.end_mask, op.stop_indices, op.strides, input_shape.dims().data(),
+          axis);
+      if (tflite::strided_slice::LoopCondition(src_coord[axis], stop, stride)) {
         // Reset axis and set carry
-        src_coord[axis] = StartForAxis(op, input_shape, axis);
+        src_coord[axis] = tflite::strided_slice::StartForAxis(
+            op.begin_mask, op.start_indices, op.strides,
+            input_shape.dims().data(), axis);
         carry = true;
       } else {
         carry = false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index 4f984bfde55b3457694bb411bbfdf30723c7066e..1fd20314b14d98bd82e2b20a4e70f5d9c2c3b298 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -131,6 +131,10 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
   if (input_array.minmax) {
     output_array.GetOrCreateMinMax() = input_array.GetMinMax();
   }
+  if (input_array.quantization_params) {
+    output_array.GetOrCreateQuantizationParams() =
+        input_array.GetQuantizationParams();
+  }
 
   if (op->perm.empty()) {
     // Yield until perm has been populated by ResolveTransposeAttributes.
@@ -164,6 +168,8 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
       break;
   }
 
+  AddMessageF("Resolving constant transpose of %s", LogName(*op));
+
   // Erase input arrays if no longer used.
   for (const auto& input : op->inputs) {
     if (IsDiscardableArray(*model, input) &&
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index f227554bc505efe6a758fdd9894fee43f2500641..f6c8f79d8d3311dc2294e3ec406a184b2a16a6b5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -28,21 +28,46 @@ limitations under the License.
 
 namespace toco {
 
+bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
+  auto& output_array = model->GetArray(op.outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op.inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = input_minmax.min;
+  output_minmax.max = input_minmax.max;
+  return true;
+}
+
 bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   const auto unary_it = model->operators.begin() + op_index;
   const auto* unary_op = unary_it->get();
-  // Test for unary ops of types that we know how to resolve
-  if (unary_op->type != OperatorType::kCast &&
-      unary_op->type != OperatorType::kNeg &&
-      unary_op->type != OperatorType::kTensorFlowRsqrt &&
-      unary_op->type != OperatorType::kTensorFlowSqrt &&
-      unary_op->type != OperatorType::kTensorFlowSquare &&
-      unary_op->type != OperatorType::kTensorFlowSum &&
-      unary_op->type != OperatorType::kTensorFlowMin &&
-      unary_op->type != OperatorType::kTensorFlowMax &&
-      unary_op->type != OperatorType::kTensorFlowReshape) {
-    return false;
+  // Test for unary ops of types that we know how to resolve.
+  switch (unary_op->type) {
+    case OperatorType::kCast:
+    case OperatorType::kLog:
+    case OperatorType::kNeg:
+    case OperatorType::kTensorFlowRsqrt:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+    case OperatorType::kTensorFlowSum:
+    case OperatorType::kTensorFlowMin:
+    case OperatorType::kTensorFlowMax:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kRelu6:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu:
+      break;
+    default:
+      return false;
   }
+
   // Check if the input is a constant parameter.
   if (!IsConstantParameterArray(*model, unary_op->inputs[0])) {
     return false;
@@ -76,6 +101,12 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
+  // The min-max is only copied for ops that copy data without arithmetic.
+  // In future trivial transpose, etc, can be handled here.
+  if (unary_op->type == OperatorType::kTensorFlowReshape) {
+    CopyMinMaxFromFirstInput(*unary_op, model);
+  }
+
   const auto& input_array = model->GetArray(unary_op->inputs[0]);
   // We have already tested above for existence of buffers (synonymous to being
   // a constant param).
@@ -135,15 +166,34 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
   } else if (unary_op->type == OperatorType::kTensorFlowReshape) {
     CHECK(input_buffer_size == output_buffer_size);
-    memcpy(output_float_data.data(), (*input_float_data).data(),
-           output_buffer_size * sizeof(output_float_data[0]));
+    output_float_data = *input_float_data;
   } else if (unary_op->type == OperatorType::kTensorFlowSum) {
-    // At the moment only full reduction across all dimensions is supported.
-    float sum = 0.f;
-    for (int i = 0; i < input_buffer_size; i++) {
-      sum += (*input_float_data)[i];
+    CHECK_EQ(unary_op->inputs.size(), 2) << "Sum needs 2 inputs";
+    if (!IsConstantParameterArray(*model, unary_op->inputs[1])) {
+      AddMessageF("Axis input is non-constant");
+      return false;
     }
-    for (int i = 0; i < output_buffer_size; ++i) {
+    auto& axis_array = model->GetArray(unary_op->inputs[1]);
+    CHECK(axis_array.data_type == ArrayDataType::kInt32);
+    int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+    CHECK_LT(axis, input_shape.dimensions_count()) << "Axis out of bounds";
+
+    // We currently only handle reduction on axis 0.
+    CHECK_EQ(axis, 0) << "Only reduction along axis 0 is supported";
+    // We currently only handle 1-D and 2-D input tensors.
+    CHECK_LE(input_shape.dimensions_count(), 2) << "Rank >2 not yet supported";
+    // We only support keep_dims=true; shape prop will need to change otherwise.
+    auto sum_op = static_cast<const TensorFlowSumOperator*>(unary_op);
+    CHECK(sum_op->keep_dims) << "Only keep_dims=true is supported";
+
+    std::vector<int> indices(input_shape.dimensions_count());
+    for (int i = 0; i < input_shape.dims(1); ++i) {
+      indices[1] = i;
+      float sum = 0.f;
+      for (int j = 0; j < input_shape.dims(0); ++j) {
+        indices[0] = j;
+        sum += (*input_float_data)[Offset(input_shape, indices)];
+      }
       output_float_data[i] = sum;
     }
   } else if (unary_op->type == OperatorType::kTensorFlowMin) {
@@ -169,6 +219,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
     output_float_data[0] = max;
   } else if (unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kLog ||
              unary_op->type == OperatorType::kTensorFlowRsqrt ||
              unary_op->type == OperatorType::kTensorFlowSqrt ||
              unary_op->type == OperatorType::kTensorFlowSquare) {
@@ -182,6 +233,8 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
       float outval = 0.f;
       if (unary_op->type == OperatorType::kNeg) {
         outval = -val;
+      } else if (unary_op->type == OperatorType::kLog) {
+        outval = std::log(val);
       } else if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
         outval = 1.0f / std::sqrt(val);
       } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
@@ -193,6 +246,37 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
       }
       output_float_data[i] = outval;
     }
+  } else if (unary_op->type == OperatorType::kRelu6 &&
+             unary_op->type == OperatorType::kRelu1 &&
+             unary_op->type == OperatorType::kRelu) {
+    for (size_t i = 0; i < output_buffer_size; ++i) {
+      const float value = (*input_float_data)[i];
+      float new_value = 0.0f;
+      switch (unary_op->type) {
+        case OperatorType::kRelu: {
+          static constexpr float kLower = 0;
+          new_value = value < kLower ? kLower : value;
+          break;
+        }
+        case OperatorType::kRelu1: {
+          static constexpr float kUpper = 1;
+          static constexpr float kLower = -1;
+          new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
+          break;
+        }
+        case OperatorType::kRelu6: {
+          static constexpr float kUpper = 6;
+          static constexpr float kLower = 0;
+          new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported activation function "
+                     << LogName(*unary_op);
+          return false;
+      }
+      output_float_data[i] = new_value;
+    }
   } else {
     LOG(FATAL) << "should not get here.";
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
index 37beb41dfc5904fc6ace79ebea2420d2ab92fbfb..4bb1217828a9c480241a3b503dffe26462df4063 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
@@ -60,6 +60,11 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   const auto& output_array_name = mul_op->outputs[0];
   auto& output_array = model->GetArray(output_array_name);
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
   // Yield if the output shape is not known yet.
   if (!output_array.has_shape()) {
     return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index 7e8b249b07ecca551cbb75afd8007efad0b52eaf..021e9918f2cf22d3854491762c31061832359a46 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -31,6 +31,12 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   }
 
   CHECK_EQ(op->inputs.size(), 4);
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // We require the dimensionality of the input to pad the indices
+    return false;
+  }
+
   const auto& start_array = model->GetArray(op->inputs[1]);
   if (!start_array.has_shape()) return false;
   if (toco::RequiredBufferSizeForShape(start_array.shape()) > 4) {
@@ -57,6 +63,21 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->stop_indices.size(), op->start_indices.size());
   CHECK_EQ(op->strides.size(), op->stop_indices.size());
 
+  // The TensorFlow documentation is not explicit on how it handles fewer
+  // supplied indices than dimensions, but they are accepted. We emulate TF's
+  // behavior by fully iterating over each omitted dimension.
+  int num_input_axes = input_array.shape().dimensions_count();
+  CHECK_LE(op->start_indices.size(), num_input_axes)
+      << "StridedSlice op requires no more than " << num_input_axes
+      << " start indices";
+  CHECK_LE(op->stop_indices.size(), num_input_axes)
+      << "StridedSlice op requires no more than " << num_input_axes
+      << " stop indices";
+  CHECK_LE(op->strides.size(), num_input_axes)
+      << "StridedSlice op requires no more than " << num_input_axes
+      << " strides";
+  op->PadIndices(num_input_axes);
+
   // Ideally, we would remove the input arrays after they have been resolved.
   // However, we must then reconstitute these input arrays for all supported
   // export formats. For now, leave the arrays so we don't have to modify our
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index f38203c80fcb7ab8bc1639129fd98e4e342e5cb7..2a236d3f98784e8244942f94d5a250b5bc00a8ad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -60,6 +60,13 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   string input_lhs = matmul_op->inputs[0];
   string input_rhs = transpose_op->outputs[0];
 
+  // Construct the new FullyConnectedOperator.
+  auto* fc_op = new FullyConnectedOperator;
+  fc_op->outputs = matmul_op->outputs;
+
+  // Insert the newly constructed FullyConnectedOperator.
+  model->operators.emplace(matmul_it, fc_op) + 1;
+
   // Find the op producing the array passed to this MatMul
   auto previous_op_it = model->operators.begin();
   bool found = false;
@@ -76,13 +83,6 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   }
   Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
 
-  // Construct the new FullyConnectedOperator.
-  auto* fc_op = new FullyConnectedOperator;
-  fc_op->outputs = matmul_op->outputs;
-
-  // Insert the newly constructed FullyConnectedOperator.
-  model->operators.emplace(matmul_it, fc_op) + 1;
-
   // Refresh iterator.
   matmul_it = model->operators.begin();
   for (; matmul_it != model->operators.end(); ++matmul_it) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index 477e7f13da3d88a68547d494011cd4984936b909..38e0005890ac10410df4ddb5290be8fcc948c349 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -32,7 +32,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
   }
 
   // We need to yield until this Merge node has only 1 input, which will mean
-  // that that is the selected input. Other graph transformations on other nodes
+  // that is the selected input. Other graph transformations on other nodes
   // such as ResolveTensorFlowSwitch, will take care of trimming the
   // non-selected inputs, so that at some point there will be only 1 input left.
   if (merge_op->inputs.size() > 1) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index 2f94f9cd8a9ab24809fb3d137b5d05ab12f43003..8dcd4adc90b188c745cadb9815c3c46383705833 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -28,15 +28,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbea39bcc09ea6787c055d5aaca7f291c2b47a7f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
@@ -0,0 +1,240 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
+  // Collapses a partitioned tf.nn.embedding_lookup back into a single Gather.
+  // https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup
+  // This transform attempts to identify the len(params) > 1 case and collapse
+  // it to the len(params) = 1 case by concatenating the original params and
+  // reversing the partitioning.
+  //
+  // If len(params) to the tf.nn.embedding_lookup == 1, the whole op becomes
+  // simply a gather:
+  // https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/python/ops/embedding_ops.py#L150
+  //
+  // Notes on this implementation:
+  // - only supports partition_strategy='mod'
+  //
+  // A rough graph of a partitioned embedding_lookup looks like:
+  //   (ids)--+-->FloorDiv--+-->DynamicPartition-->[[Gather]]--\
+  //          \-->FloorMod--/                                  |
+  //                 V                                         |
+  //   Range-->DynamicPartition-------->DynamicStitch<---------/
+  //  (const)                                V
+  //                                     (embeddings)
+
+  // First look for the final DynamicStitch.
+  auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kDynamicStitch) {
+    return false;
+  }
+  auto* stitch_op = static_cast<DynamicStitchOperator*>(op_it->get());
+
+  // Split up the DynamicStitch inputs into the indices and data.
+  std::vector<string> stitch_indices_inputs;
+  std::vector<string> stitch_data_inputs;
+  for (size_t i = 0; i < stitch_op->num_partitions; ++i) {
+    stitch_indices_inputs.push_back(stitch_op->inputs[i]);
+  }
+  for (size_t i = stitch_op->num_partitions; i < stitch_op->num_partitions * 2;
+       ++i) {
+    stitch_data_inputs.push_back(stitch_op->inputs[i]);
+  }
+
+  // Validate all indices come from the same DynamicPartition.
+  DynamicPartitionOperator* indices_partition_op = nullptr;
+  for (const string& indices_partition_output_name : stitch_indices_inputs) {
+    auto* op = GetOpWithOutput(*model, indices_partition_output_name);
+    CHECK(op) << "Source of " << indices_partition_output_name << " not found";
+    if (op->type != OperatorType::kDynamicPartition) {
+      AddMessageF(
+          "Skipping because indices input %s into "
+          "%s is unexpected",
+          LogName(*op), LogName(*stitch_op));
+      return false;
+    }
+    if (!indices_partition_op) {
+      indices_partition_op = static_cast<DynamicPartitionOperator*>(op);
+    } else {
+      // Ensure this is the same op as previous ones.
+      if (op != indices_partition_op) {
+        AddMessageF(
+            "Skipping because indices input %s into "
+            "%s is from a different source op than others",
+            LogName(*op), LogName(*stitch_op));
+        return false;
+      }
+    }
+  }
+  CHECK(indices_partition_op) << "No indices inputs";
+
+  // The data for the indices must be a constant range of the array shape.
+  if (!IsConstantParameterArray(*model, indices_partition_op->inputs[0])) {
+    AddMessageF("Skipping because indices partition data is non-constant");
+    return false;
+  }
+  auto& indices_data_array = model->GetArray(indices_partition_op->inputs[0]);
+  if (indices_data_array.data_type == ArrayDataType::kNone) {
+    // Yield until data types are propagated.
+    return false;
+  }
+  CHECK(indices_data_array.data_type == ArrayDataType::kInt32)
+      << "Indices partition inputs must be int32";
+  const auto& indices_data_buffer =
+      indices_data_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (size_t i = 0; i < indices_data_buffer.size(); ++i) {
+    CHECK_EQ(indices_data_buffer[i], i) << "Indices range must be identity";
+  }
+
+  // Find all of the gathers used for the data inputs.
+  std::vector<GatherOperator*> gather_ops;
+  for (const string& gather_output_name : stitch_data_inputs) {
+    auto* op = GetOpWithOutput(*model, gather_output_name);
+    CHECK(op) << "Source of " << gather_output_name << " not found";
+    if (op->type != OperatorType::kGather) {
+      AddMessageF(
+          "Skipping because data input %s into %s "
+          "is unexpected",
+          LogName(*op), LogName(*stitch_op));
+      return false;
+    }
+    gather_ops.push_back(static_cast<GatherOperator*>(op));
+  }
+
+  // Validate all gathers come from the same DynamicPartition.
+  DynamicPartitionOperator* data_partition_op = nullptr;
+  for (auto* gather_op : gather_ops) {
+    auto* op = GetOpWithOutput(*model, gather_op->inputs[1]);
+    CHECK(op) << "Source of " << gather_op->inputs[1] << " not found";
+    if (op->type != OperatorType::kDynamicPartition) {
+      AddMessageF(
+          "Skipping because data input %s into "
+          "%s is unexpected",
+          LogName(*op), LogName(*gather_op));
+      return false;
+    }
+    if (!data_partition_op) {
+      data_partition_op = static_cast<DynamicPartitionOperator*>(op);
+    } else {
+      // Ensure this is the same op as previous ones.
+      if (op != data_partition_op) {
+        AddMessageF(
+            "Skipping because data input %s into "
+            "%s is from a different source op than others",
+            LogName(*op), LogName(*gather_op));
+        return false;
+      }
+    }
+  }
+  CHECK(data_partition_op) << "No data inputs";
+
+  // Validate the partition ops have the same sizes.
+  CHECK_EQ(indices_partition_op->num_partitions,
+           data_partition_op->num_partitions)
+      << "Indices and data partition ops have differing dimensions";
+  int num_partitions = indices_partition_op->num_partitions;
+
+  // Partition strategy of 'mod' gives us a FloorMod and FloorDiv.
+  // The gather partition uses the FloorDiv as the data and FloorMod as the
+  // partitions and the indices use the FloorMod as their partitions.
+  Operator* div_op = GetOpWithOutput(*model, data_partition_op->inputs[0]);
+  Operator* mod_op = GetOpWithOutput(*model, data_partition_op->inputs[1]);
+  CHECK(div_op && div_op->type == OperatorType::kFloorDiv)
+      << "Unsupported partition strategy";
+  CHECK(mod_op && mod_op->type == OperatorType::kFloorMod)
+      << "Unsupported partition strategy";
+  CHECK_EQ(mod_op, GetOpWithOutput(*model, indices_partition_op->inputs[1]))
+      << "Indices and data partition ops require the same partition strategy "
+         "and inputs";
+
+  // Glob together all of the gather data. This is not yet in the correct order.
+  auto* gather_params_concat_op = new ConcatenationOperator;
+  for (const auto& gather_op : gather_ops) {
+    gather_params_concat_op->inputs.push_back(gather_op->inputs[0]);
+  }
+  gather_params_concat_op->outputs.push_back(
+      AvailableArrayName(*model, gather_ops[0]->inputs[0] + "_unpartitioned"));
+  op_it = model->operators.emplace(op_it, gather_params_concat_op) + 1;
+  model->GetOrCreateArray(gather_params_concat_op->outputs[0]);
+
+  // Permute the gather params to undo the partitioning that was originally
+  // done.
+  auto* gather_params_permute_op = new GatherOperator;
+  gather_params_permute_op->inputs.push_back(
+      gather_params_concat_op->outputs[0]);
+  gather_params_permute_op->inputs.push_back(
+      AvailableArrayName(*model, gather_ops[0]->inputs[0] + "_permuted/perm"));
+  gather_params_permute_op->outputs.push_back(
+      AvailableArrayName(*model, gather_ops[0]->inputs[0] + "_permuted"));
+  op_it = model->operators.emplace(op_it, gather_params_permute_op) + 1;
+  model->GetOrCreateArray(gather_params_permute_op->outputs[0]);
+  const auto& partition_array = model->GetArray(gather_ops[0]->inputs[0]);
+  const auto& partition_array_dims = partition_array.shape().dims();
+  gather_params_permute_op->input_rank =
+      partition_array.shape().dimensions_count();
+  auto& perm_array =
+      model->GetOrCreateArray(gather_params_permute_op->inputs[1]);
+  perm_array.data_type = ArrayDataType::kInt32;
+  perm_array.mutable_shape()->ReplaceDims(
+      {num_partitions * partition_array_dims[0]});
+  auto& perm_data = perm_array.GetMutableBuffer<ArrayDataType::kInt32>().data;
+  perm_data.resize(RequiredBufferSizeForShape(perm_array.shape()));
+  // NOTE: this is what relies on the partition_strategy.
+  for (int i = 0; i < num_partitions * partition_array_dims[0]; ++i) {
+    int p = i % num_partitions;
+    perm_data[i] = p * partition_array_dims[0] + i / num_partitions;
+  }
+
+  // Insert the new unpartitioned gather op.
+  auto* merged_gather_op = new GatherOperator;
+  merged_gather_op->inputs = {gather_params_permute_op->outputs[0],
+                              mod_op->inputs[0]};
+  merged_gather_op->outputs = {stitch_op->outputs[0]};
+  merged_gather_op->input_rank = partition_array.shape().dimensions_count();
+  model->operators.emplace(op_it, merged_gather_op);
+
+  AddMessageF(
+      "Replacing suspected partitioned tf.nn.embedding_lookup (starting at %s "
+      "+ %s and ending at %s) with a single unpartitioned gather %s",
+      LogName(*div_op), LogName(*mod_op), LogName(*stitch_op),
+      LogName(*merged_gather_op));
+
+  // Ensure the stitch output array is dead, as we don't want whatever was in it
+  // previously now that we've redefined it. It'll be recreated when needed.
+  model->EraseArray(stitch_op->outputs[0]);
+  model->GetOrCreateArray(merged_gather_op->outputs[0]);
+
+  // Erase all the original ops.
+  DeleteOpAndArraysIfUnused(model, div_op);
+  DeleteOpAndArraysIfUnused(model, mod_op);
+  for (auto* gather_op : gather_ops) {
+    DeleteOpAndArraysIfUnused(model, gather_op);
+  }
+  DeleteOpAndArraysIfUnused(model, indices_partition_op);
+  DeleteOpAndArraysIfUnused(model, data_partition_op);
+  DeleteOpAndArraysIfUnused(model, stitch_op);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 9c01b67420603a0d3c0e095dafe6a3359f2514b5..2b413c0290f6912f8e6f739302a9ee658d3d3798 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -74,7 +74,7 @@ const string& GetStringAttr(const NodeDef& node, const string& attr_name) {
   return attr.s();
 }
 
-int GetIntAttr(const NodeDef& node, const string& attr_name) {
+int64 GetIntAttr(const NodeDef& node, const string& attr_name) {
   CHECK(HasAttr(node, attr_name)) << attr_name << " not found in:\n"
                                   << node.DebugString();
   const auto& attr = node.attr().at(attr_name);
@@ -272,6 +272,39 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   }
 }
 
+void ImportBoolArray(const TensorProto& input_tensor, Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_BOOL);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  ImportShape(input_shape.dim(), output_array->mutable_shape());
+  int input_flat_size = 1;
+  for (int k = 0; k < input_shape.dim_size(); k++) {
+    input_flat_size *= input_shape.dim(k).size();
+  }
+  auto& output_bool_data =
+      output_array->GetMutableBuffer<ArrayDataType::kBool>().data;
+  output_bool_data.resize(RequiredBufferSizeForShape(output_array->shape()),
+                          false);
+  if (input_tensor.bool_val_size()) {
+    for (int i = 0; i < input_tensor.bool_val_size(); i++) {
+      output_bool_data[i] = input_tensor.bool_val(i);
+    }
+  } else if (input_tensor.tensor_content().size() == input_flat_size) {
+    std::vector<char> buf(input_tensor.tensor_content().size());
+    toco::port::CopyToBuffer(input_tensor.tensor_content(), buf.data());
+    for (int i = 0; i < input_tensor.tensor_content().size(); i++) {
+      output_bool_data[i] = static_cast<bool>(buf[i]);
+    }
+  } else {
+    // Some graphs have bool const nodes without actual value...
+    // assuming that 'false' is implied.
+    // So far only encountered that in an array with 1 entry, let's
+    // require that until we encounter a graph where that's not the case.
+    CHECK_EQ(output_bool_data.size(), 1);
+    output_bool_data[0] = false;
+  }
+}
+
 void ImportStringArray(const TensorProto& input_tensor, Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_STRING);
   const auto& input_shape = input_tensor.tensor_shape();
@@ -318,6 +351,18 @@ void CheckInputsCount(const NodeDef& node,
       << " input(s) other than control dependencies: " << node.DebugString();
 }
 
+template <ArrayDataType T>
+string CreateConstArray(Model* model, string const& name,
+                        std::vector<typename toco::DataType<T> > const& data) {
+  // Utility function to create a const 1D array, useful for input parameters.
+  string array_name = toco::AvailableArrayName(*model, name);
+  auto& array = model->GetOrCreateArray(array_name);
+  array.data_type = T;
+  array.mutable_shape()->mutable_dims()->emplace_back(data.size());
+  array.GetMutableBuffer<T>().data = data;
+  return array_name;
+}
+
 void ConvertConstOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
@@ -347,6 +392,10 @@ void ConvertConstOperator(const NodeDef& node,
       array.data_type = ArrayDataType::kString;
       ImportStringArray(tensor, &array);
       break;
+    case DT_BOOL:
+      array.data_type = ArrayDataType::kBool;
+      ImportBoolArray(tensor, &array);
+      break;
     default:
       array.data_type = ArrayDataType::kNone;
       // do nothing, silently ignore the Const data.
@@ -365,7 +414,7 @@ void ConvertConvOperator(const NodeDef& node,
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
-  if (node.attr().count("data_format")) {
+  if (HasAttr(node, "data_format")) {
     CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
   }
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
@@ -399,6 +448,17 @@ void ConvertConvOperator(const NodeDef& node,
   CHECK_EQ(strides.i(3), 1);
   conv->stride_height = strides.i(1);
   conv->stride_width = strides.i(2);
+  if (HasAttr(node, "dilations")) {
+    const auto& dilations = GetListAttr(node, "dilations");
+    CHECK_EQ(dilations.i_size(), 4);
+    CHECK_EQ(dilations.i(0), 1);
+    CHECK_EQ(dilations.i(3), 1);
+    conv->dilation_height_factor = dilations.i(1);
+    conv->dilation_width_factor = dilations.i(2);
+  } else {
+    conv->dilation_height_factor = 1;
+    conv->dilation_width_factor = 1;
+  }
   const auto& padding = GetStringAttr(node, "padding");
   if (padding == "SAME") {
     conv->padding.type = PaddingType::kSame;
@@ -418,7 +478,7 @@ void ConvertDepthwiseConvOperator(const NodeDef& node,
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
-  if (node.attr().count("data_format")) {
+  if (HasAttr(node, "data_format")) {
     CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
   }
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
@@ -509,6 +569,23 @@ void ConvertBiasAddOperator(const NodeDef& node,
   model->operators.emplace_back(biasadd);
 }
 
+void ConvertRandomUniform(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "RandomUniform");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_INT32);
+  auto op = absl::make_unique<RandomUniformOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  op->dtype = ConvertDataType(GetDataTypeAttr(node, "dtype"));
+  op->seed = GetIntAttr(node, "seed");
+  op->seed2 = GetIntAttr(node, "seed2");
+  CHECK(model != nullptr);
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertReluOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
@@ -534,6 +611,18 @@ void ConvertRelu6Operator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertLogOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Log");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  auto op = absl::make_unique<LogOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertLogisticOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
@@ -605,6 +694,8 @@ void ConvertFakeQuantWithMinMaxArgs(
   minmax.min = GetFloatAttr(node, "min");
   minmax.max = GetFloatAttr(node, "max");
   op->outputs.push_back(node.name());
+  // tf.fake_quant_with_min_max_args num_bits defaults to 8.
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -622,6 +713,7 @@ void ConvertFakeQuantWithMinMaxVars(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -667,9 +759,12 @@ void ConvertSqueezeOperator(const NodeDef& node,
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
 
-  const auto& squeeze_dims = GetListAttr(node, "squeeze_dims");
-  for (int i = 0; i < squeeze_dims.i_size(); ++i) {
-    op->squeeze_dims.push_back(squeeze_dims.i(i));
+  // When omitted we are to squeeze all dimensions == 1.
+  if (HasAttr(node, "squeeze_dims")) {
+    const auto& squeeze_dims = GetListAttr(node, "squeeze_dims");
+    for (int i = 0; i < squeeze_dims.i_size(); ++i) {
+      op->squeeze_dims.push_back(squeeze_dims.i(i));
+    }
   }
 
   model->operators.emplace_back(op);
@@ -998,8 +1093,10 @@ void ConvertMatMulOperator(const NodeDef& node,
 
   // Transpose flags should be easy to support, but we don't have a
   // GraphDef with them to test on at the moment.
-  CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
-  CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+  CHECK_EQ(HasAttr(node, "transpose_a") && GetBoolAttr(node, "transpose_a"),
+           false);
+  CHECK_EQ(HasAttr(node, "transpose_b") && GetBoolAttr(node, "transpose_b"),
+           false);
   CHECK(!HasAttr(node, "adjoint_a") ||
         (GetBoolAttr(node, "adjoint_a") == false));
   CHECK(!HasAttr(node, "adjoint_b") ||
@@ -1205,11 +1302,17 @@ void ConvertStridedSliceOperator(const NodeDef& node,
   }
   op->outputs.push_back(node.name());
 
-  op->begin_mask = GetIntAttr(node, "begin_mask");
-  op->ellipsis_mask = GetIntAttr(node, "ellipsis_mask");
-  op->end_mask = GetIntAttr(node, "end_mask");
-  op->new_axis_mask = GetIntAttr(node, "new_axis_mask");
-  op->shrink_axis_mask = GetIntAttr(node, "shrink_axis_mask");
+  op->begin_mask =
+      HasAttr(node, "begin_mask") ? GetIntAttr(node, "begin_mask") : 0;
+  op->ellipsis_mask =
+      HasAttr(node, "ellipsis_mask") ? GetIntAttr(node, "ellipsis_mask") : 0;
+  op->end_mask = HasAttr(node, "end_mask") ? GetIntAttr(node, "end_mask") : 0;
+  op->new_axis_mask =
+      HasAttr(node, "new_axis_mask") ? GetIntAttr(node, "new_axis_mask") : 0;
+  op->shrink_axis_mask = HasAttr(node, "shrink_axis_mask")
+                             ? GetIntAttr(node, "shrink_axis_mask")
+                             : 0;
+
   model->operators.emplace_back(op);
 }
 
@@ -1280,13 +1383,16 @@ void ConvertFloorOperator(const NodeDef& node,
 void ConvertGatherOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
-  CHECK_EQ(node.op(), "Gather");
-  CheckInputsCount(node, tf_import_flags, 2);
+  CHECK(node.op() == "Gather" || node.op() == "GatherV2");
+  if (node.op() == "Gather") CheckInputsCount(node, tf_import_flags, 2);
+  if (node.op() == "GatherV2") CheckInputsCount(node, tf_import_flags, 3);
   const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
   CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
   auto* op = new GatherOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
+  // TODO(ahentz): we currently ignore the third tensor in GatherV2 but we
+  // should read it an pass it on to the TF Lite Interpreter.
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
@@ -1296,8 +1402,11 @@ void ConvertArgMaxOperator(const NodeDef& node,
                            Model* model) {
   CHECK_EQ(node.op(), "ArgMax");
   CheckInputsCount(node, tf_import_flags, 2);
-  const auto axis_data_type = GetDataTypeAttr(node, "Tidx");
-  const auto output_type = GetDataTypeAttr(node, "output_type");
+  const auto axis_data_type =
+      HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32;
+  const auto output_type = HasAttr(node, "output_type")
+                               ? GetDataTypeAttr(node, "output_type")
+                               : DT_INT64;
   CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32);
   CHECK(output_type == DT_INT64 || output_type == DT_INT32);
   auto* op = new ArgMaxOperator;
@@ -1388,12 +1497,8 @@ void ConvertFusedBatchNormOperator(const NodeDef& node,
   const string& moving_variance_input = node.input(4);
 
   // Create an array holding the epsilon value (typically, 0.001).
-  const string epsilon_array_name = node.name() + "_epsilon_array";
-  auto& epsilon_array = model->GetOrCreateArray(epsilon_array_name);
-  epsilon_array.data_type = ArrayDataType::kFloat;
-  *epsilon_array.mutable_shape()->mutable_dims() = {1};
-  epsilon_array.GetMutableBuffer<ArrayDataType::kFloat>().data.push_back(
-      GetFloatAttr(node, "epsilon"));
+  const string epsilon_array_name = CreateConstArray<ArrayDataType::kFloat>(
+      model, node.name() + "_epsilon_array", {GetFloatAttr(node, "epsilon")});
 
   // Add epsilon to the moving variance.
   const string epsilon_add_op_name = node.name() + "_epsilon";
@@ -1482,7 +1587,9 @@ void ConvertMeanOperator(const NodeDef& node,
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
-  if (HasAttr(node, "keep_dims")) {
+  if (HasAttr(node, "keepdims")) {
+    op->keep_dims = GetBoolAttr(node, "keepdims");
+  } else if (HasAttr(node, "keep_dims")) {
     op->keep_dims = GetBoolAttr(node, "keep_dims");
   }
 }
@@ -1521,16 +1628,56 @@ void ConvertTransposeConvOperator(const NodeDef& node,
   CHECK_EQ(node.op(), "Conv2DBackpropInput");
   CheckInputsCount(node, tf_import_flags, 3);
   auto* op = new TransposeConvOperator;
-  op->inputs.push_back(node.input(2));
-  op->inputs.push_back(node.input(1));
   op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
   op->outputs.push_back(node.name());
   const auto& strides = GetListAttr(node, "strides");
-  CHECK_EQ(strides.i_size(), 4);
-  CHECK_EQ(strides.i(0), 1);
   op->stride_height = strides.i(1);
   op->stride_width = strides.i(2);
-  CHECK_EQ(strides.i(3), 1);
+  CHECK_EQ(strides.i_size(), 4)
+      << "Can only import TransposeConv ops with 4D strides. TensorFlow op \""
+      << node.name() << "\" has " << strides.i_size() << "D strides.";
+  CHECK((strides.i(0) == 1) && (strides.i(3) == 1))
+      << "Can only import TransposeConv ops with striding along the height "
+         "(1st) or width (2nd) axis. TensorFlow op \""
+      << node.name() << "\" had strides:[ " << strides.i(0) << ", "
+      << strides.i(1) << ", " << strides.i(2) << ", " << strides.i(3) << "].";
+  op->stride_height = strides.i(1);
+  op->stride_width = strides.i(2);
+  if (HasAttr(node, "dilations")) {
+    const auto& dilations = GetListAttr(node, "dilations");
+    CHECK_EQ(dilations.i_size(), 4)
+        << "Dilation unsupported in TransposeConv. TensorFlow op \""
+        << node.name() << "\" had dilations";
+    CHECK((dilations.i(0) == 1) && (dilations.i(1) == 1) &&
+          (dilations.i(1) == 1) && (dilations.i(3) == 1))
+        << "Dilation unsupported in TransposeConv. TensorFlow op \""
+        << node.name() << "\" had dilations:[ " << dilations.i(0) << ", "
+        << dilations.i(1) << ", " << dilations.i(2) << ", " << dilations.i(3)
+        << "].";
+  }
+
+  const string& weights_name = node.input(TransposeConvOperator::WEIGHTS);
+  const string& transposed_weights_name = weights_name + "_transposed";
+  // Check if a TransposeOperator was already created for these weights
+  // (can happen when multiple layers share the same weights).
+  const Operator* existing_transpose =
+      GetOpWithOutput(*model, transposed_weights_name);
+  if (existing_transpose) {
+    CHECK(existing_transpose->type == OperatorType::kTranspose);
+  } else {
+    // Transpose weights from HWIO order to OHWI order, which is more efficient
+    // for computation
+    TransposeOperator* transpose = new TransposeOperator;
+    string perm_array = CreateConstArray<ArrayDataType::kInt32>(
+        model, node.name() + "_transpose_perm", {3, 0, 1, 2});
+    transpose->inputs = {weights_name, perm_array};
+    transpose->outputs = {transposed_weights_name};
+    model->operators.emplace_back(transpose);
+  }
+  op->inputs[1] = transposed_weights_name;
+
   auto const& padding = GetStringAttr(node, "padding");
   if (padding == "SAME") {
     op->padding.type = PaddingType::kSame;
@@ -1636,7 +1783,7 @@ void ConvertStackOperator(const NodeDef& node,
     op->inputs.push_back(node.input(i));
   }
   // Both "Stack" and "Pack" have the "axis" attribute.
-  op->axis = GetIntAttr(node, "axis");
+  op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0;
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
@@ -1826,28 +1973,54 @@ void ConvertTopKV2Operator(const NodeDef& node,
   op->inputs.push_back(node.input(0));
   // K can be encoded as attr (TopK) convert it to a const.
   if (HasAttr(node, "k")) {
-    // Convert attribute into const tensor.
-    const string array_name = node.name() + "k";
-    auto& array = model->GetOrCreateArray(array_name);
-    array.data_type = ArrayDataType::kInt32;
-    // Size of array is always 1.
-    array.mutable_shape()->mutable_dims()->emplace_back(1);
-
-    auto& output_int_data =
-        array.GetMutableBuffer<ArrayDataType::kInt32>().data;
-    output_int_data.resize(1);
-    output_int_data[0] = GetIntAttr(node, "k");
-    op->inputs.push_back(array_name);
-
+    string k_array = CreateConstArray<ArrayDataType::kInt32>(
+        model, node.name() + "k", {static_cast<int32>(GetIntAttr(node, "k"))});
+    op->inputs.push_back(k_array);
   } else {
     CheckInputsCount(node, tf_import_flags, 2);
     op->inputs.push_back(node.input(1));
   }
   // The op has two outputs.
-  op->outputs.push_back(node.name() + ":0");
+  op->outputs.push_back(node.name());
   op->outputs.push_back(node.name() + ":1");
   model->operators.emplace_back(op.release());
 }
+
+void ConvertDynamicPartitionOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  auto op = absl::make_unique<DynamicPartitionOperator>();
+  CHECK(HasAttr(node, "num_partitions"));
+  op->num_partitions = GetIntAttr(node, "num_partitions");
+  CheckInputsCount(node, tf_import_flags, 2);
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  CHECK_GT(op->num_partitions, 1);
+  op->outputs.push_back(node.name());  // Implicit :0.
+  for (int i = 1; i < op->num_partitions; ++i) {
+    op->outputs.push_back(node.name() + ":" + std::to_string(i));
+  }
+  model->operators.emplace_back(op.release());
+}
+
+void ConvertDynamicStitchOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
+  // The parallel and non-parallel variants are the same besides whether they
+  // have a parallel loop; there are no behavioral differences.
+  CHECK(node.op() == "DynamicStitch" || node.op() == "ParallelDynamicStitch");
+  auto op = absl::make_unique<DynamicStitchOperator>();
+  CHECK(HasAttr(node, "N"));
+  op->num_partitions = GetIntAttr(node, "N");
+  // Expect all ID partitions + all value partitions.
+  CheckInputsCount(node, tf_import_flags, op->num_partitions * 2);
+  for (int i = 0; i < op->num_partitions * 2; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op.release());
+}
+
 }  // namespace
 
 std::unique_ptr<Model> ImportTensorFlowGraphDef(
@@ -1944,6 +2117,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
       ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Log") {
+      ConvertLogOperator(node, tf_import_flags, model);
     } else if (node.op() == "LogSoftmax") {
       ConvertLogSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
@@ -1992,7 +2167,7 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertCastOperator(node, tf_import_flags, model);
     } else if (node.op() == "Floor") {
       ConvertFloorOperator(node, tf_import_flags, model);
-    } else if (node.op() == "Gather") {
+    } else if (node.op() == "Gather" || node.op() == "GatherV2") {
       ConvertGatherOperator(node, tf_import_flags, model);
     } else if (node.op() == "ResizeBilinear") {
       ConvertResizeBilinearOperator(node, tf_import_flags, model);
@@ -2033,6 +2208,13 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertExpOperator(node, tf_import_flags, model);
     } else if (node.op() == "TopK" || node.op() == "TopKV2") {
       ConvertTopKV2Operator(node, tf_import_flags, model);
+    } else if (node.op() == "DynamicPartition") {
+      ConvertDynamicPartitionOperator(node, tf_import_flags, model);
+    } else if (node.op() == "DynamicStitch" ||
+               node.op() == "ParallelDynamicStitch") {
+      ConvertDynamicStitchOperator(node, tf_import_flags, model);
+    } else if (node.op() == "RandomUniform") {
+      ConvertRandomUniform(node, tf_import_flags, model);
     } else {
       ConvertUnsupportedOperator(node, tf_import_flags, model);
     }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index c55bf664f8b65c6eb53ff9ae926bed11adc7b183..482cc71d8b34705041130c7518b58f7fe183bc3c 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -24,11 +24,14 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
+using tflite::QuantizationParams;
+
 enum class OperatorType {
   kNone,
   // General-purpose neural network operators.
@@ -54,15 +57,18 @@ enum class OperatorType {
   kL2Pool,
   kLstmCell,
   kLocalResponseNormalization,
+  kLog,
   kLogistic,
   kMaxPool,
   kFakeQuant,
   kMul,
+  kRandomUniform,
   kRange,
   kRank,
   kRelu,
   kRelu1,
   kRelu6,
+  kPRelu,
   kSoftmax,
   kLogSoftmax,
   kSub,
@@ -115,6 +121,8 @@ enum class OperatorType {
   kTensorFlowTile,
   kTranspose,
   kTopK_V2,
+  kDynamicPartition,
+  kDynamicStitch,
   // An unsupported TF operation. It's only needed to be able to represent TF
   // graph internally and is expected to be dropped by graph transformations.
   kTensorFlowUnsupported,
@@ -144,9 +152,9 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that that does not by itself tell whether the values in the array are
-// real (are literally interpreted as real numbers) or quantized (only acquire
-// a meaning as real numbers in conjunction with QuantizationParams).
+// Note that the type does not by itself tell whether the values in the array
+// are real (are literally interpreted as real numbers) or quantized (only
+// acquire a meaning as real numbers in conjunction with QuantizationParams).
 //
 // In practice though:
 //   float values are always real
@@ -244,6 +252,8 @@ struct GenericBuffer {
   // in containers and have the containers call the right subclass destructor.
   virtual ~GenericBuffer() {}
 
+  virtual int Length() const = 0;
+
   const ArrayDataType type;
 
  protected:
@@ -256,6 +266,8 @@ template <ArrayDataType A>
 struct Buffer : GenericBuffer {
   Buffer() : GenericBuffer(A) {}
 
+  int Length() const override { return data.size(); }
+
   std::vector<DataType<A>> data;
 };
 
@@ -359,7 +371,8 @@ struct ConvOperator : Operator {
   // A dilation_rate of 0 is invalid and this field is an optional attribute.
   // Thus initializing it to 1 to allow default conv behavior when the
   // attribute is not present.
-  int dilation_rate = 1;
+  int dilation_width_factor = 1;
+  int dilation_height_factor = 1;
 };
 
 // Depthwise-separable convolution operator.
@@ -413,6 +426,7 @@ struct SpaceToDepthOperator : Operator {
 // input activations as a matrix, followed by a MatMul node.
 struct FullyConnectedOperator : Operator {
   FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
+  bool experimental_shuffled_weights = false;
 };
 
 // Dequantization operator, converting a quantized array of integers with
@@ -557,6 +571,18 @@ struct Relu6Operator : Operator {
   Relu6Operator() : Operator(OperatorType::kRelu6) {}
 };
 
+// PRelu
+//   f(x) = alpha * x for x < 0, f(x) = x for x >= 0.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the alpha array
+//
+// Equivalent to keras.layers.PReLU.
+struct PReluOperator : Operator {
+  PReluOperator() : Operator(OperatorType::kPRelu) {}
+};
+
 // Element-wise Logistic operator:
 //   x -> Logistic(x) = 1 / (1 + exp(-x))
 //
@@ -568,6 +594,17 @@ struct LogisticOperator : Operator {
   LogisticOperator() : Operator(OperatorType::kLogistic) {}
 };
 
+// Element-wise natural log operator:
+//   x -> ln(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Log
+struct LogOperator : Operator {
+  LogOperator() : Operator(OperatorType::kLog) {}
+};
+
 // Element-wise Tanh operator:
 //   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 //
@@ -689,8 +726,7 @@ struct L2PoolOperator : Operator {
 // The expected [min, max] range of values in a given array.
 // Used for quantization only.
 // This information typically comes from special nodes found in quantized
-// models,
-// see FakeQuantOperator, and is used during quantization to resolve
+// models, see FakeQuantOperator, and is used during quantization to resolve
 // actual quantization parameters (see QuantizationParams).
 struct MinMax {
   double min = 0.;
@@ -718,6 +754,7 @@ inline bool operator==(const MinMax& m1, const MinMax& m2) {
 struct FakeQuantOperator : Operator {
   FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
   std::unique_ptr<MinMax> minmax;
+  int num_bits = 8;
 };
 
 // Element-wise division operator.
@@ -809,6 +846,60 @@ struct StridedSliceOperator : Operator {
   int end_mask;
   int new_axis_mask;
   int shrink_axis_mask;
+
+  StridedSliceOperator(const StridedSliceOperator& other)
+      : Operator(OperatorType::kStridedSlice) {
+    inputs = other.inputs;
+    outputs = other.outputs;
+
+    start_indices = other.start_indices;
+    stop_indices = other.stop_indices;
+    strides = other.strides;
+
+    begin_mask = other.begin_mask;
+    ellipsis_mask = other.ellipsis_mask;
+    end_mask = other.end_mask;
+    new_axis_mask = other.new_axis_mask;
+    shrink_axis_mask = other.shrink_axis_mask;
+  }
+
+  void PadIndices(int dim_count) {
+    // Add indices and mask bits to fully include extra dimensions
+    CHECK_GE(dim_count, start_indices.size());
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    for (int i = start_indices.size(); i < dim_count; i++) {
+      start_indices.push_back(0);
+      stop_indices.push_back(0);
+      strides.push_back(1);
+      begin_mask |= 1 << i;
+      end_mask |= 1 << i;
+    }
+  }
+
+  void ReverseIndices() {
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    std::reverse(start_indices.begin(), start_indices.end());
+    std::reverse(stop_indices.begin(), stop_indices.end());
+    std::reverse(strides.begin(), strides.end());
+
+    begin_mask = toco::port::ReverseBits32(static_cast<uint32>(begin_mask)) >>
+                 (32 - start_indices.size());
+    ellipsis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(ellipsis_mask)) >>
+        (32 - start_indices.size());
+    end_mask = toco::port::ReverseBits32(static_cast<uint32>(end_mask)) >>
+               (32 - start_indices.size());
+    new_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(new_axis_mask)) >>
+        (32 - start_indices.size());
+    shrink_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(shrink_axis_mask)) >>
+        (32 - start_indices.size());
+  }
 };
 
 // Reshaping operator, reshaping its input array to a two-dimensional shape
@@ -839,19 +930,29 @@ struct SqueezeOperator : Operator {
 };
 
 // Inputs:
-//   inputs[0]: required: the input activations array
-//   inputs[1]: required: the Conv weights
-//   channel.
+//   inputs[0]: required: the output shape
+//   inputs[1]: required: the weights
+//   inputs[2]: required: the input activations array
+//   NOTE: The input activations is NOT the first input.
+//
 //
 // Outputs:
 //   outputs[0]: required: the output activations array
 //
 // TensorFlow equivalent: Conv2DBackpropInput
 struct TransposeConvOperator : Operator {
+  enum Inputs {
+    OUTPUT_SHAPE = 0,
+    WEIGHTS = 1,
+    DATA_INPUT = 2,
+  };
+
   TransposeConvOperator() : Operator(OperatorType::kTransposeConv) {}
   Padding padding;
   int stride_width = 0;
   int stride_height = 0;
+  // Dilation is possible with transpose convolution, but Tensorflow does not
+  // currently support it, so we omit it.
 };
 
 // Given a tensor input, this operation calculates element-wise exponential
@@ -914,6 +1015,13 @@ struct FloorModOperator : Operator {
   FloorModOperator() : Operator(OperatorType::kFloorMod) {}
 };
 
+struct RandomUniformOperator : Operator {
+  RandomUniformOperator() : Operator(OperatorType::kRandomUniform) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+  int64 seed;
+  int64 seed2;
+};
+
 // Creates a sequence of numbers that begins at start and extends by increments
 // of delta up to but not including limit.
 //
@@ -1277,6 +1385,15 @@ struct SoftmaxOperator : Operator {
 // TensorFlow equivalent: LogSoftmax
 struct LogSoftmaxOperator : Operator {
   LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+
+  // LogSoftmax can in principal have very large negative output, depending on
+  // the input size.  However, input x_i that is less than x_max-10 is
+  // accumulated as exp(x_i-x_max), which is truncated to zero.
+  //
+  // Since we effectively disregard smallish inputs in the normalizing factor,
+  // we also drop them in the output (set to minimum output), and in doing so
+  // make better use of the quantization range / resolution.
+  static constexpr float kOutputRangeMin = -16.0;
 };
 
 // Cast operator.
@@ -1359,8 +1476,7 @@ struct SpaceToBatchNDOperator : Operator {
 };
 
 // BatchToSpaceND operator. Rearranges data from batch into blocks of
-// spatial data. Currently, only 2-d blocks are supported. Cropping is not
-// supported, either, and the crops array should be all zero.
+// spatial data. Currently, only 2-d blocks are supported.
 //
 // Inputs:
 //   inputs[0]: required: the input array
@@ -1409,6 +1525,30 @@ struct TopKV2Operator : Operator {
   TopKV2Operator() : Operator(OperatorType::kTopK_V2) {}
 };
 
+// DynamicPartition operator:
+//
+// Inputs:
+//  inputs[0]: required: data.
+//  inputs[1]: required: partitions.
+//
+// TensorFlow equivalent: DynamicPartition
+struct DynamicPartitionOperator : Operator {
+  DynamicPartitionOperator() : Operator(OperatorType::kDynamicPartition) {}
+  int num_partitions;
+};
+
+// DynamicStitch operator:
+//
+// Inputs:
+//  inputs[0,N): required: indices.
+//  inputs[N,2N): required: data.
+//
+// TensorFlow equivalent: DynamicStitch/ParallelDynamicStitch
+struct DynamicStitchOperator : Operator {
+  DynamicStitchOperator() : Operator(OperatorType::kDynamicStitch) {}
+  int num_partitions;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
@@ -1422,22 +1562,6 @@ inline bool operator<(const Alloc& a, const Alloc& b) {
   return a.start < b.start;
 }
 
-// Quantization parameters, determining the mapping of quantized values
-// to real values (i.e. determining how quantized values are mathematically
-// interpreted).
-//
-// The correspondence is as follows:
-//
-//   real_value = scale * (quantized_value - zero_point);
-//
-// In other words, zero_point designates which quantized value corresponds to
-// the real 0 value, and scale designates the difference between the real values
-// corresponding to consecutive quantized values differing by 1.
-struct QuantizationParams {
-  int32 zero_point = 0;
-  double scale = 0.;
-};
-
 class Shape {
  public:
   // For Shape, we stick to half-way encapsulation for now:
@@ -1459,7 +1583,14 @@ class Shape {
 
   // We still have that one convenience accessor to avoid
   // the awkward double bracket issue:  shape.dims()[i].
-  int dims(int i) const { return dims_[i]; }
+  int dims(int i) const {
+    // Always check for out-of-bounds accesses, even in optimized builds where
+    // standard assertions are disabled. Out-of-bounds access here is a common
+    // occurrence.
+    CHECK_GE(i, 0);
+    CHECK_GT(dims_.size(), i);
+    return dims_[i];
+  }
 
   bool operator==(const Shape& comp) const {
     return (this->dims_ == comp.dims());
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 4e2dec15a534607ef9207149a2e6061069eabcb1..7bbeab7c9d1e42d28f221f1a1134d9d05fe6ab51 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -72,6 +72,12 @@ bool ParseModelFlagsFromCommandLineFlags(
            "Shapes corresponding to --input_arrays, colon-separated. For "
            "many models each shape takes the form batch size, input array "
            "height, input array width, input array depth."),
+      Flag("batch_size", parsed_flags.batch_size.bind(),
+           parsed_flags.batch_size.default_value(),
+           "Batch size for the model. Replaces the first dimension of an "
+           "input size array if undefined. Use only with SavedModels when "
+           "--input_shapes flag is not specified. Always use --input_shapes "
+           "flag with frozen graphs."),
       Flag("input_data_type", parsed_flags.input_data_type.bind(),
            parsed_flags.input_data_type.default_value(),
            "Deprecated: use --input_data_types instead. Input array type, if "
@@ -154,6 +160,16 @@ bool ParseModelFlagsFromCommandLineFlags(
           "Path to an optional file containing a serialized ArraysExtraInfo "
           "proto allowing to pass extra information about arrays not specified "
           "in the input model file, such as extra MinMax information."),
+      Flag("model_flags_file", parsed_flags.model_flags_file.bind(),
+           parsed_flags.model_flags_file.default_value(),
+           "Path to an optional file containing a serialized ModelFlags proto. "
+           "Options specified on the command line will override the values in "
+           "the proto."),
+      Flag("change_concat_input_ranges",
+           parsed_flags.change_concat_input_ranges.bind(),
+           parsed_flags.change_concat_input_ranges.default_value(),
+           "Boolean to change the behavior of min/max ranges for inputs and"
+           " output of the concat operators."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -176,7 +192,24 @@ void ReadModelFlagsFromCommandLineFlags(
     const ParsedModelFlags& parsed_model_flags, ModelFlags* model_flags) {
   toco::port::CheckInitGoogleIsDone("InitGoogle is not done yet");
 
-// "batch" flag only exists internally
+  // Load proto containing the initial model flags.
+  // Additional flags specified on the command line will overwrite the values.
+  if (parsed_model_flags.model_flags_file.specified()) {
+    string model_flags_file_contents;
+    QCHECK(port::file::GetContents(parsed_model_flags.model_flags_file.value(),
+                                   &model_flags_file_contents,
+                                   port::file::Defaults())
+               .ok())
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " was not found or could not be read";
+    QCHECK(ParseFromStringEitherTextOrBinary(model_flags_file_contents,
+                                             model_flags))
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " could not be parsed";
+  }
+
 #ifdef PLATFORM_GOOGLE
   CHECK(!((base::SpecifiedOnCommandLine("batch") &&
            parsed_model_flags.variable_batch.specified())))
@@ -371,12 +404,15 @@ void ReadModelFlagsFromCommandLineFlags(
       parsed_model_flags.allow_nonascii_arrays.value());
   model_flags->set_allow_nonexistent_arrays(
       parsed_model_flags.allow_nonexistent_arrays.value());
+  model_flags->set_change_concat_input_ranges(
+      parsed_model_flags.change_concat_input_ranges.value());
 
   if (parsed_model_flags.arrays_extra_info_file.specified()) {
     string arrays_extra_info_file_contents;
-    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
-                            &arrays_extra_info_file_contents,
-                            port::file::Defaults());
+    CHECK(port::file::GetContents(
+              parsed_model_flags.arrays_extra_info_file.value(),
+              &arrays_extra_info_file_contents, port::file::Defaults())
+              .ok());
     ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
                                       model_flags->mutable_arrays_extra_info());
   }
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index e4b39b34e85e4d703c1b41cb68f8139abd1f6279..d23e80c464c9fe9d717d4af8093fa5dee04dca6d 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -96,9 +96,13 @@ message RnnState {
 // model that does not already contain such MinMax information.
 message ArraysExtraInfo {
   message Entry {
+    // Next ID to use: 7.
     optional string name = 1;
-    optional float min = 2;
-    optional float max = 3;
+    optional double min = 2;
+    optional double max = 3;
+    optional IODataType data_type = 4;
+    optional InputArrayShape shape = 5;
+    optional float constant_float_value = 6;
   }
   repeated Entry entries = 1;
 }
@@ -124,7 +128,7 @@ message ArraysExtraInfo {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 19.
+// Next ID to USE: 20.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -171,4 +175,8 @@ message ModelFlags {
   // If set, this ArraysExtraInfo allows to pass extra information about arrays
   // not specified in the input model file, such as extra MinMax information.
   optional ArraysExtraInfo arrays_extra_info = 18;
+
+  // When set to false, toco will not change the input ranges and the output
+  // ranges of concat operator to the overlap of all input ranges.
+  optional bool change_concat_input_ranges = 19 [default = true];
 }
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
index 17115047d2ef93cce7004926c2b1a4bfa58f6243..6c4f8e12cdd5b3222997c4a2b0ac243cc74324e0 100644
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -45,9 +45,6 @@ py_binary(
     name = "toco_wrapper",
     srcs = ["toco_wrapper.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
 )
 
 tf_py_test(
@@ -63,15 +60,3 @@ tf_py_test(
     ],
     tags = ["no_pip"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
index c35b6f99259b762aa83d92d21512169a7ab50b70..3761e0095ebb06b9e26eca55a36718b92058e47b 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -50,6 +50,7 @@ class TocoFromProtosTest(googletest.TestCase):
     toco_flags.output_format = toco_flags_pb2.TFLITE
     toco_flags.inference_input_type = types_pb2.FLOAT
     toco_flags.inference_type = types_pb2.FLOAT
+    toco_flags.allow_custom_ops = True;
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
index 8a5e483f3f1676ebed3244bd6f7eb610fad21557..153c117d17e4564d7cb0aaea64d792f63a587d91 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -75,7 +75,8 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
       toco::Import(toco_flags, model_flags, input_contents_txt);
   toco::Transform(toco_flags, model.get());
   string output_file_contents_txt;
-  Export(toco_flags, *model, &output_file_contents_txt);
+  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+         &output_file_contents_txt);
 
   // Convert arguments back to byte (py3) or str (py2)
   return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
diff --git a/tensorflow/contrib/lite/toco/python/toco_wrapper.py b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
index e39b5f22c7c8ffafaf72129be6f54090e6761dc3..6d6b500d7eccd353f566a4bad76df35e0e849d95 100644
--- a/tensorflow/contrib/lite/toco/python/toco_wrapper.py
+++ b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
@@ -22,14 +22,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import sys
-import tensorflow as tf
 
 
 def main():
   # Pip installs the binary in aux-bin off of main site-package install.
   # Just find it and exec, passing all arguments in the process.
   # TODO(aselle): it is unfortunate to use all of tensorflow to lookup binary.
-  binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
-  os.execvp(binary, sys.argv)
+  print("""TOCO from pip install is currently not working on command line.
+Please use the python TOCO API or use
+bazel run tensorflow/contrib/lite:toco -- <args> from a TensorFlow source dir.
+""")
+  sys.exit(1)
+  # TODO(aselle): Replace this when we find a way to run toco without
+  # blowing up executable size.
+  # binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
+  # os.execvp(binary, sys.argv)
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
index 0c1a1141fca91e7d27fe48ffae4f834ae92a1e08..336e94de1ed3238d64f521cf1347acc8f0737de7 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
@@ -88,15 +88,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
index fddf6cc83686632033f31496ec42b33e2ea15f20..5e421ba944cccd9746c66bc33e986b4406dd3bf5 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
@@ -144,7 +144,9 @@ std::unique_ptr<GraphDef> MaybeReplaceCompositeSubgraph(
       MaybeResolveClusters(tf_graph, cluster_factories);
 
   // Copy function definitions
-  *(pruned_graph->mutable_library()) = tf_graph.library();
+  if (pruned_graph) {
+    *(pruned_graph->mutable_library()) = tf_graph.library();
+  }
   return pruned_graph;
 }
 
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index a2b8145a67278c3ac0065f9551da6ffd1de60772..e1025c66642d2860c5916bf7625f1c0403c9901c 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -54,6 +54,7 @@ cc_library(
         "types.h",
     ],
     deps = [
+        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
     ],
@@ -115,9 +116,11 @@ cc_library(
     deps = [
         ":operator",
         ":types",
+        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
         "//tensorflow/contrib/lite/toco:tooling_util",
+        "//tensorflow/contrib/lite/tools:verifier",
         "@flatbuffers",
     ],
 )
@@ -135,15 +138,3 @@ tf_cc_test(
         "@flatbuffers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 27719599708a7eb14f72a82f8e5d76b3b8af9dc4..335b496dccdbdb7e342515868e1d7195c98f0351 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -300,6 +300,17 @@ void Export(const Model& model, bool allow_custom_ops,
   std::set<string> error_summary;
   auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
                                       &builder, &error_summary);
+  const string fake_quant_operation_name = "FAKE_QUANT";
+  if (error_summary.count(fake_quant_operation_name) != 0) {
+    LOG(ERROR)
+        << fake_quant_operation_name
+        << " operation was not converted. If running quantized make sure you "
+           "are passing --inference_type=QUANTIZED_UINT8 and values for "
+           "--std_values and --mean_values.";
+    // Remove the fake quant operation from the errors, since it shouldn't
+    // be provided a custom implementation.
+    error_summary.erase(fake_quant_operation_name);
+  }
   if (!allow_custom_ops && !error_summary.empty()) {
     LOG(QFATAL) << "Some of the operators in the model are not supported by "
                    "the standard TensorFlow Lite runtime. If you have a custom "
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
index 5b1ab514b23248cd98e66847185d0e8b9fe2d6aa..c0e7ab2ef57ed8edf1b7cda08c64f6ae66172af3 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -15,10 +15,12 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/tflite/import.h"
 
 #include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/contrib/lite/tools/verifier.h"
 
 namespace toco {
 
@@ -64,6 +66,9 @@ void ImportTensors(const ::tflite::Model& input_model, Model* model) {
 
     auto shape = input_tensor->shape();
     if (shape) {
+      // If the shape is 0-dimensional, make sure to record it as such,
+      // as oppose to leaving the array without a shape.
+      array.mutable_shape()->mutable_dims()->clear();
       for (int i = 0; i < shape->Length(); ++i) {
         auto d = shape->Get(i);
         array.mutable_shape()->mutable_dims()->push_back(d);
@@ -159,16 +164,28 @@ void ImportIOTensors(const ::tflite::Model& input_model,
   }
 }
 
+namespace {
+bool Verify(const void* buf, size_t len) {
+  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  return ::tflite::VerifyModelBuffer(verifier);
+}
+}  // namespace
+
 std::unique_ptr<Model> Import(const ModelFlags& model_flags,
                               const string& input_file_contents) {
+  ::tflite::AlwaysTrueResolver r;
+  if (!::tflite::Verify(input_file_contents.data(), input_file_contents.size(),
+                        r, ::tflite::DefaultErrorReporter())) {
+    LOG(FATAL) << "Invalid flatbuffer.";
+  }
   const ::tflite::Model* input_model =
       ::tflite::GetModel(input_file_contents.data());
 
   // Full list of all known operators.
   const auto ops_by_name = BuildOperatorByNameMap();
 
-  if (input_model->subgraphs()->size() != 1) {
-    LOG(FATAL) << "# of subgraphs in tflite should be exactly 1 for now.";
+  if (!input_model->subgraphs() || input_model->subgraphs()->size() != 1) {
+    LOG(FATAL) << "Number of subgraphs in tflite should be exactly 1.";
   }
   std::unique_ptr<Model> model;
   model.reset(new Model);
diff --git a/tensorflow/contrib/lite/toco/tflite/import_test.cc b/tensorflow/contrib/lite/toco/tflite/import_test.cc
index aad6e780d5eb5c3dbc880906df5053ad231ffd54..edd22f783f03b1fbd34039cd7b00f08d34ca9fc6 100644
--- a/tensorflow/contrib/lite/toco/tflite/import_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import_test.cc
@@ -27,60 +27,110 @@ namespace {
 
 using ::testing::ElementsAre;
 
+using flatbuffers::Offset;
+using flatbuffers::Vector;
 class ImportTest : public ::testing::Test {
  protected:
   template <typename T>
-  flatbuffers::Offset<flatbuffers::Vector<unsigned char>> CreateDataVector(
-      const std::vector<T>& data) {
+  Offset<Vector<unsigned char>> CreateDataVector(const std::vector<T>& data) {
     return builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.data()),
                                  sizeof(T) * data.size());
   }
-  // This is a very simplistic model. We are not interested in testing all the
-  // details here, since tf.mini's testing framework will be exercising all the
-  // conversions multiple times, and the conversion of operators is tested by
-  // separate unittests.
-  void BuildTestModel() {
-    // The tensors
+
+  Offset<Vector<Offset<::tflite::Buffer>>> BuildBuffers() {
+    auto buf0 = ::tflite::CreateBuffer(builder_, CreateDataVector<float>({}));
+    auto buf1 = ::tflite::CreateBuffer(
+        builder_, CreateDataVector<float>({1.0f, 2.0f, 3.0f, 4.0f}));
+    auto buf2 =
+        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({3.0f, 4.0f}));
+    return builder_.CreateVector(
+        std::vector<Offset<::tflite::Buffer>>({buf0, buf1, buf2}));
+  }
+
+  Offset<Vector<Offset<::tflite::Tensor>>> BuildTensors() {
     auto q = ::tflite::CreateQuantizationParameters(
         builder_,
         /*min=*/builder_.CreateVector<float>({0.1f}),
         /*max=*/builder_.CreateVector<float>({0.2f}),
         /*scale=*/builder_.CreateVector<float>({0.3f}),
         /*zero_point=*/builder_.CreateVector<int64_t>({100ll}));
-    auto buf0 = ::tflite::CreateBuffer(builder_, CreateDataVector<float>({}));
-    auto buf1 =
-        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({1.0f, 2.0f}));
-    auto buf2 =
-        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({3.0f}));
-    auto buffers = builder_.CreateVector(
-        std::vector<flatbuffers::Offset<::tflite::Buffer>>({buf0, buf1, buf2}));
-    auto t1 = ::tflite::CreateTensor(builder_,
-                                     builder_.CreateVector<int>({1, 2, 3, 4}),
-                                     ::tflite::TensorType_FLOAT32, 1,
-                                     builder_.CreateString("tensor_one"), q);
+    auto t1 =
+        ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({1, 2, 2}),
+                               ::tflite::TensorType_FLOAT32, 1,
+                               builder_.CreateString("tensor_one"), q);
     auto t2 =
         ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({2, 1}),
                                ::tflite::TensorType_FLOAT32, 2,
                                builder_.CreateString("tensor_two"), q);
-    auto tensors = builder_.CreateVector(
-        std::vector<flatbuffers::Offset<::tflite::Tensor>>({t1, t2}));
-
-    // The operator codes.
-    auto c1 =
-        ::tflite::CreateOperatorCode(builder_, ::tflite::BuiltinOperator_CUSTOM,
-                                     builder_.CreateString("custom_op_one"));
-    auto c2 = ::tflite::CreateOperatorCode(
-        builder_, ::tflite::BuiltinOperator_CONV_2D, 0);
-    auto opcodes = builder_.CreateVector(
-        std::vector<flatbuffers::Offset<::tflite::OperatorCode>>({c1, c2}));
-
-    auto subgraph = ::tflite::CreateSubGraph(builder_, tensors, 0, 0, 0);
-    std::vector<flatbuffers::Offset<::tflite::SubGraph>> subgraph_vector(
-        {subgraph});
-    auto subgraphs = builder_.CreateVector(subgraph_vector);
+    return builder_.CreateVector(
+        std::vector<Offset<::tflite::Tensor>>({t1, t2}));
+  }
+
+  Offset<Vector<Offset<::tflite::OperatorCode>>> BuildOpCodes(
+      std::initializer_list<::tflite::BuiltinOperator> op_codes) {
+    std::vector<Offset<::tflite::OperatorCode>> op_codes_vector;
+    for (auto op : op_codes) {
+      op_codes_vector.push_back(::tflite::CreateOperatorCode(builder_, op, 0));
+    }
+    return builder_.CreateVector(op_codes_vector);
+  }
+
+  Offset<Vector<Offset<::tflite::OperatorCode>>> BuildOpCodes() {
+    return BuildOpCodes({::tflite::BuiltinOperator_MAX_POOL_2D,
+                         ::tflite::BuiltinOperator_CONV_2D});
+  }
+
+  Offset<Vector<Offset<::tflite::Operator>>> BuildOperators(
+      std::initializer_list<int> inputs, std::initializer_list<int> outputs) {
+    auto is = builder_.CreateVector<int>(inputs);
+    if (inputs.size() == 0) is = 0;
+    auto os = builder_.CreateVector<int>(outputs);
+    if (outputs.size() == 0) os = 0;
+    auto op = ::tflite::CreateOperator(
+        builder_, 0, is, os, ::tflite::BuiltinOptions_Conv2DOptions,
+        ::tflite::CreateConv2DOptions(builder_, ::tflite::Padding_VALID, 1, 1,
+                                      ::tflite::ActivationFunctionType_NONE)
+            .Union(),
+        /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS);
+
+    return builder_.CreateVector(std::vector<Offset<::tflite::Operator>>({op}));
+  }
+
+  Offset<Vector<Offset<::tflite::Operator>>> BuildOperators() {
+    return BuildOperators({0}, {1});
+  }
+
+  Offset<Vector<Offset<::tflite::SubGraph>>> BuildSubGraphs(
+      Offset<Vector<Offset<::tflite::Tensor>>> tensors,
+      Offset<Vector<Offset<::tflite::Operator>>> operators,
+      int num_sub_graphs = 1) {
+    std::vector<int32_t> inputs = {0};
+    std::vector<int32_t> outputs = {1};
+    std::vector<Offset<::tflite::SubGraph>> v;
+    for (int i = 0; i < num_sub_graphs; ++i) {
+      v.push_back(::tflite::CreateSubGraph(
+          builder_, tensors, builder_.CreateVector(inputs),
+          builder_.CreateVector(outputs), operators,
+          builder_.CreateString("subgraph")));
+    }
+    return builder_.CreateVector(v);
+  }
+
+  // This is a very simplistic model. We are not interested in testing all the
+  // details here, since tf.mini's testing framework will be exercising all the
+  // conversions multiple times, and the conversion of operators is tested by
+  // separate unittests.
+  void BuildTestModel() {
+    auto buffers = BuildBuffers();
+    auto tensors = BuildTensors();
+    auto opcodes = BuildOpCodes();
+    auto operators = BuildOperators();
+    auto subgraphs = BuildSubGraphs(tensors, operators);
     auto s = builder_.CreateString("");
-    builder_.Finish(::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
-                                          opcodes, subgraphs, s, buffers));
+
+    ::tflite::FinishModelBuffer(
+        builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
+                                        opcodes, subgraphs, s, buffers));
 
     input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
   }
@@ -89,7 +139,6 @@ class ImportTest : public ::testing::Test {
                   builder_.GetSize());
   }
   flatbuffers::FlatBufferBuilder builder_;
-  // const uint8_t* buffer_ = nullptr;
   const ::tflite::Model* input_model_ = nullptr;
 };
 
@@ -106,7 +155,7 @@ TEST_F(ImportTest, LoadOperatorsTable) {
 
   details::OperatorsTable operators;
   details::LoadOperatorsTable(*input_model_, &operators);
-  EXPECT_THAT(operators, ElementsAre("custom_op_one", "CONV_2D"));
+  EXPECT_THAT(operators, ElementsAre("MAX_POOL_2D", "CONV_2D"));
 }
 
 TEST_F(ImportTest, Tensors) {
@@ -118,9 +167,9 @@ TEST_F(ImportTest, Tensors) {
   Array& a1 = model->GetArray("tensor_one");
   EXPECT_EQ(ArrayDataType::kFloat, a1.data_type);
   EXPECT_THAT(a1.GetBuffer<ArrayDataType::kFloat>().data,
-              ElementsAre(1.0f, 2.0f));
+              ElementsAre(1.0f, 2.0f, 3.0f, 4.0f));
   ASSERT_TRUE(a1.has_shape());
-  EXPECT_THAT(a1.shape().dims(), ElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(a1.shape().dims(), ElementsAre(1, 2, 2));
 
   const auto& mm = a1.minmax;
   ASSERT_TRUE(mm.get());
@@ -133,6 +182,80 @@ TEST_F(ImportTest, Tensors) {
   EXPECT_EQ(100, q->zero_point);
 }
 
+TEST_F(ImportTest, NoBuffers) {
+  auto buffers = 0;
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators();
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Missing 'buffers' section.");
+}
+
+TEST_F(ImportTest, NoInputs) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators({}, {1});
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Missing 'inputs' for operator.");
+}
+
+TEST_F(ImportTest, NoOutputs) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators({0}, {});
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Missing 'outputs' for operator.");
+}
+
+TEST_F(ImportTest, InvalidOpCode) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes({static_cast<::tflite::BuiltinOperator>(-1),
+                               ::tflite::BuiltinOperator_CONV_2D});
+  auto operators = BuildOperators();
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Operator id '-1' is out of range.");
+}
+
+TEST_F(ImportTest, MultipleSubGraphs) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators();
+  auto subgraphs = BuildSubGraphs(tensors, operators, 2);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+
+  input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
+
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Number of subgraphs in tflite should be exactly 1.");
+}
+
 // TODO(ahentz): still need tests for Operators and IOTensors.
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index aabc7c5109ddc205a7862c3ee2253390dae25095..fce3bad3266e855e7b056bbad942578809abe813 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -68,7 +68,9 @@ class Convolution
     auto activation_function =
         ActivationFunction::Serialize(op.fused_activation_function);
     return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
-                                         op.stride_height, activation_function);
+                                         op.stride_height, activation_function,
+                                         op.dilation_width_factor,
+                                         op.dilation_height_factor);
   }
 
   void ReadOptions(const TfLiteOptions& options,
@@ -76,6 +78,8 @@ class Convolution
     op->padding.type = Padding::Deserialize(options.padding());
     op->stride_width = options.stride_w();
     op->stride_height = options.stride_h();
+    op->dilation_width_factor = options.dilation_w_factor();
+    op->dilation_height_factor = options.dilation_h_factor();
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
@@ -204,17 +208,22 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 };
 
-class Cast : public CustomOperator<CastOperator> {
+class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
+                                    ::tflite::BuiltinOptions_CastOptions> {
  public:
-  using CustomOperator::CustomOperator;
-  void WriteOptions(const TocoOperator& op,
-                    flexbuffers::Builder* fbb) const override {
-    fbb->Int("src_data_type", DataType::Serialize(op.src_data_type));
-    fbb->Int("dst_data_type", DataType::Serialize(op.dst_data_type));
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateCastOptions(*builder,
+                                       DataType::Serialize(op.src_data_type),
+                                       DataType::Serialize(op.dst_data_type));
   }
-  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
-    op->src_data_type = DataType::Deserialize(m["src_data_type"].AsInt64());
-    op->dst_data_type = DataType::Deserialize(m["dst_data_type"].AsInt64());
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->src_data_type = DataType::Deserialize(options.in_data_type());
+    op->dst_data_type = DataType::Deserialize(options.out_data_type());
   }
 };
 
@@ -255,12 +264,15 @@ class FakeQuant : public CustomOperator<FakeQuantOperator> {
                     flexbuffers::Builder* fbb) const override {
     fbb->Float("min", op.minmax->min);
     fbb->Float("max", op.minmax->max);
+    fbb->Int("num_bits", op.num_bits);
   }
   void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
     auto* minmax = new MinMax;
     minmax->min = m["min"].AsFloat();
     minmax->max = m["max"].AsFloat();
     op->minmax.reset(minmax);
+    const auto& num_bits = m["num_bits"];
+    op->num_bits = num_bits.IsInt() ? num_bits.AsInt32() : 8;
   }
 };
 
@@ -657,6 +669,23 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
                    TocoOperator* op) const override {}
 };
 
+class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
+                                      ::tflite::BuiltinOptions_ArgMaxOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateArgMaxOptions(
+        *builder, DataType::Serialize(op.output_data_type));
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->output_data_type = DataType::Deserialize(options.output_type());
+  }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -827,9 +856,12 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new TopK_V2(::tflite::BuiltinOperator_TOPK_V2, OperatorType::kTopK_V2));
   ops.emplace_back(
       new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
+  ops.emplace_back(
+      new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
+  ops.emplace_back(
+      new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
 
   // Custom Operators.
-  ops.emplace_back(new Cast("CAST", OperatorType::kCast));
   ops.emplace_back(
       new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
   ops.emplace_back(new FakeQuant("FAKE_QUANT", OperatorType::kFakeQuant));
@@ -854,11 +886,23 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new SimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1));
   ops.emplace_back(
       new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
+  ops.emplace_back(
+      new SimpleOperator<PReluOperator>("PRELU", OperatorType::kPRelu));
   ops.emplace_back(new SimpleOperator<LogisticOperator>(
       "LOGISTIC", OperatorType::kLogistic));
   ops.emplace_back(
       new SimpleOperator<TanhOperator>("TANH", OperatorType::kTanh));
   ops.emplace_back(new SimpleOperator<ExpOperator>("EXP", OperatorType::kExp));
+  ops.emplace_back(new SimpleOperator<LogSoftmaxOperator>(
+      "LOG_SOFTMAX", OperatorType::kLogSoftmax));
+  ops.emplace_back(new SimpleOperator<TensorFlowMaximumOperator>(
+      "MAXIMUM", OperatorType::kTensorFlowMaximum));
+  ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum));
+  ops.emplace_back(new SimpleOperator<TensorFlowLessOperator>(
+      "LESS", OperatorType::kTensorFlowLess));
+  ops.emplace_back(
+      new SimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 5c486f72ade9ec5f366f075fcc39274bb7b12679..36ed741541eadbc9435a67bec15d389ba48350c1 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -107,6 +107,14 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
   CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
   CheckSimpleOperator<ExpOperator>("EXP", OperatorType::kExp);
+  CheckSimpleOperator<LogSoftmaxOperator>("LOG_SOFTMAX",
+                                          OperatorType::kLogSoftmax);
+  CheckSimpleOperator<TensorFlowMaximumOperator>(
+      "MAXIMUM", OperatorType::kTensorFlowMaximum);
+  CheckSimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum);
+  CheckSimpleOperator<TensorFlowLessOperator>("LESS",
+                                              OperatorType::kTensorFlowLess);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -127,7 +135,7 @@ TEST_F(OperatorTest, BuiltinMean) {
   EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
 }
 
-TEST_F(OperatorTest, CustomCast) {
+TEST_F(OperatorTest, BuiltinCast) {
   CastOperator op;
   op.src_data_type = ArrayDataType::kFloat;
   op.dst_data_type = ArrayDataType::kUint8;
@@ -159,10 +167,12 @@ TEST_F(OperatorTest, CustomFakeQuant) {
   minmax->min = -10;
   minmax->max = 200;
   op.minmax.reset(minmax);
+  op.num_bits = 16;
   auto output_toco_op = SerializeAndDeserialize(
       GetOperator("FAKE_QUANT", OperatorType::kFakeQuant), op);
   EXPECT_EQ(op.minmax->min, output_toco_op->minmax->min);
   EXPECT_EQ(op.minmax->max, output_toco_op->minmax->max);
+  EXPECT_EQ(op.num_bits, output_toco_op->num_bits);
 }
 
 TEST_F(OperatorTest, CustomFullyConnected) {
@@ -387,6 +397,13 @@ TEST_F(OperatorTest, BuiltinTopKV2) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinArgMax) {
+  ArgMaxOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("ARG_MAX", OperatorType::kArgMax), op);
+  EXPECT_EQ(op.output_data_type, output_toco_op->output_data_type);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc
index b4c2851502a40a1ca36965d4ddd2c8a15b8fe60f..c9c2e9ba0184ef3f531f325091afaf6976e07f4f 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types.cc
@@ -13,12 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/contrib/lite/string_util.h"
 
 namespace toco {
 
 namespace tflite {
 
 namespace {
+
+DataBuffer::FlatBufferOffset CopyStringToBuffer(
+    const Array& array, flatbuffers::FlatBufferBuilder* builder) {
+  const auto& src_data = array.GetBuffer<ArrayDataType::kString>().data;
+  ::tflite::DynamicBuffer dyn_buffer;
+  for (const string& str : src_data) {
+    dyn_buffer.AddString(str.c_str(), str.length());
+  }
+  char* tensor_buffer;
+  int bytes = dyn_buffer.WriteToBuffer(&tensor_buffer);
+  std::vector<uint8_t> dst_data(bytes);
+  memcpy(dst_data.data(), tensor_buffer, bytes);
+  free(tensor_buffer);
+  return builder->CreateVector(dst_data.data(), bytes);
+}
+
 template <ArrayDataType T>
 DataBuffer::FlatBufferOffset CopyBuffer(
     const Array& array, flatbuffers::FlatBufferBuilder* builder) {
@@ -29,6 +46,18 @@ DataBuffer::FlatBufferOffset CopyBuffer(
   return builder->CreateVector(dst_data, size);
 }
 
+void CopyStringFromBuffer(const ::tflite::Buffer& buffer, Array* array) {
+  auto* src_data = reinterpret_cast<const char*>(buffer.data()->data());
+  std::vector<string>* dst_data =
+      &array->GetMutableBuffer<ArrayDataType::kString>().data;
+  int32_t num_strings = ::tflite::GetStringCount(src_data);
+  for (int i = 0; i < num_strings; i++) {
+    ::tflite::StringRef str_ref = ::tflite::GetString(src_data, i);
+    string this_str(str_ref.str, str_ref.len);
+    dst_data->push_back(this_str);
+  }
+}
+
 template <ArrayDataType T>
 void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) {
   using NativeT = ::toco::DataType<T>;
@@ -90,8 +119,10 @@ flatbuffers::Offset<flatbuffers::Vector<uint8_t>> DataBuffer::Serialize(
       return CopyBuffer<ArrayDataType::kFloat>(array, builder);
     case ArrayDataType::kInt32:
       return CopyBuffer<ArrayDataType::kInt32>(array, builder);
+    case ArrayDataType::kInt64:
+      return CopyBuffer<ArrayDataType::kInt64>(array, builder);
     case ArrayDataType::kString:
-      return CopyBuffer<ArrayDataType::kString>(array, builder);
+      return CopyStringToBuffer(array, builder);
     case ArrayDataType::kUint8:
       return CopyBuffer<ArrayDataType::kUint8>(array, builder);
     default:
@@ -112,7 +143,7 @@ void DataBuffer::Deserialize(const ::tflite::Tensor& tensor,
     case ::tflite::TensorType_INT64:
       return CopyBuffer<ArrayDataType::kInt64>(buffer, array);
     case ::tflite::TensorType_STRING:
-      return CopyBuffer<ArrayDataType::kString>(buffer, array);
+      return CopyStringFromBuffer(buffer, array);
     case ::tflite::TensorType_UINT8:
       return CopyBuffer<ArrayDataType::kUint8>(buffer, array);
     default:
diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc
index a040fe135841b92a6e668f32cc5e36cf812ab15b..29fb0b2af22ef13e735f7c8128a2b9f6dbc42d19 100644
--- a/tensorflow/contrib/lite/toco/tflite/types_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc
@@ -151,6 +151,13 @@ TEST(DataBuffer, Int32) {
               ::testing::ElementsAre(1, 1 << 30));
 }
 
+TEST(DataBuffer, String) {
+  Array recovered = ToFlatBufferAndBack<ArrayDataType::kString>(
+      {"AA", "BBB", "Best. String. Ever."});
+  EXPECT_THAT(recovered.GetBuffer<ArrayDataType::kString>().data,
+              ::testing::ElementsAre("AA", "BBB", "Best. String. Ever."));
+}
+
 TEST(Padding, All) {
   EXPECT_EQ(::tflite::Padding_SAME, Padding::Serialize(PaddingType::kSame));
   EXPECT_EQ(PaddingType::kSame, Padding::Deserialize(::tflite::Padding_SAME));
diff --git a/tensorflow/contrib/lite/toco/toco.cc b/tensorflow/contrib/lite/toco/toco.cc
index f01ec0ec6102494f36cca0265b79e90355661271..8041aa9e7fbfdaf44134395fee4b2bb01633893a 100644
--- a/tensorflow/contrib/lite/toco/toco.cc
+++ b/tensorflow/contrib/lite/toco/toco.cc
@@ -23,40 +23,70 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
 #include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_saved_model.h"
 #include "tensorflow/contrib/lite/toco/toco_tooling.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifndef CHECK_OK
-#define CHECK_OK(val) CHECK_EQ((val).ok(), true)
-#define QCHECK_OK(val) QCHECK_EQ((val).ok(), true)
-#endif
-
 namespace toco {
 namespace {
 
-#define QCHECK_REQUIRE_TOCO_FLAG(arg) \
-  QCHECK(parsed_toco_flags.arg.specified()) << "Missing required flag: " #arg;
-
-void CheckFilePermissions(const ParsedTocoFlags& parsed_toco_flags,
-                          const ParsedModelFlags& parsed_model_flags,
-                          const TocoFlags& toco_flags) {
-  port::CheckInitGoogleIsDone("InitGoogle is not done yet");
-
-  QCHECK_REQUIRE_TOCO_FLAG(input_file)
-  QCHECK_OK(port::file::Exists(parsed_toco_flags.input_file.value(),
-                               port::file::Defaults()))
-      << "Specified input_file does not exist: "
-      << parsed_toco_flags.input_file.value();
-  QCHECK_OK(port::file::Readable(parsed_toco_flags.input_file.value(),
-                                 port::file::Defaults()))
+// Checks the permissions of the output file to ensure it is writeable.
+void CheckOutputFilePermissions(const Arg<string>& output_file) {
+  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
+  QCHECK(port::file::Writable(output_file.value()).ok())
+      << "Specified output_file is not writable: " << output_file.value()
+      << ".\n";
+}
+
+// Checks the permissions of the frozen model file.
+void CheckFrozenModelPermissions(const Arg<string>& input_file) {
+  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
+  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file does not exist: " << input_file.value() << ".\n";
+  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
       << "Specified input_file exists, but is not readable: "
-      << parsed_toco_flags.input_file.value();
+      << input_file.value() << ".\n";
+}
 
-  QCHECK_REQUIRE_TOCO_FLAG(output_file);
-  QCHECK_OK(port::file::Writable(parsed_toco_flags.output_file.value()))
-      << "parsed_toco_flags.input_file.value() output_file is not writable: "
-      << parsed_toco_flags.output_file.value();
+// Checks the permissions of the SavedModel directory.
+void CheckSavedModelPermissions(const Arg<string>& savedmodel_directory) {
+  QCHECK(savedmodel_directory.specified())
+      << "Missing required flag --savedmodel_directory.\n";
+  QCHECK(
+      port::file::Exists(savedmodel_directory.value(), port::file::Defaults())
+          .ok())
+      << "Specified savedmodel_directory does not exist: "
+      << savedmodel_directory.value() << ".\n";
+}
+
+// Reads the contents of the GraphDef from either the frozen graph file or the
+// SavedModel directory. If it reads the SavedModel directory, it updates the
+// ModelFlags and TocoFlags accordingly.
+void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags,
+                   string* graph_def_contents) {
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
+
+  bool has_input_file = parsed_toco_flags.input_file.specified();
+  bool has_savedmodel_dir = parsed_toco_flags.savedmodel_directory.specified();
+
+  // Ensure either input_file or savedmodel_directory flag has been set.
+  QCHECK_NE(has_input_file, has_savedmodel_dir)
+      << "Specify either input_file or savedmodel_directory flag.\n";
+
+  // Checks the input file permissions and reads the contents.
+  if (has_input_file) {
+    CheckFrozenModelPermissions(parsed_toco_flags.input_file);
+    CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
+                                  graph_def_contents, port::file::Defaults())
+              .ok());
+  } else {
+    CheckSavedModelPermissions(parsed_toco_flags.savedmodel_directory);
+    GetSavedModelContents(parsed_toco_flags, parsed_model_flags, toco_flags,
+                          model_flags, graph_def_contents);
+  }
 }
 
 void ToolMain(const ParsedTocoFlags& parsed_toco_flags,
@@ -67,21 +97,20 @@ void ToolMain(const ParsedTocoFlags& parsed_toco_flags,
   TocoFlags toco_flags;
   ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
 
-  CheckFilePermissions(parsed_toco_flags, parsed_model_flags, toco_flags);
+  string graph_def_contents;
+  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
+                &model_flags, &graph_def_contents);
+  CheckOutputFilePermissions(parsed_toco_flags.output_file);
 
-  string input_file_contents;
-  CHECK_OK(port::file::GetContents(parsed_toco_flags.input_file.value(),
-                                   &input_file_contents,
-                                   port::file::Defaults()));
   std::unique_ptr<Model> model =
-      Import(toco_flags, model_flags, input_file_contents);
+      Import(toco_flags, model_flags, graph_def_contents);
   Transform(toco_flags, model.get());
   string output_file_contents;
   Export(toco_flags, *model, toco_flags.allow_custom_ops(),
          &output_file_contents);
-  CHECK_OK(port::file::SetContents(parsed_toco_flags.output_file.value(),
-                                   output_file_contents,
-                                   port::file::Defaults()));
+  CHECK(port::file::SetContents(parsed_toco_flags.output_file.value(),
+                                output_file_contents, port::file::Defaults())
+            .ok());
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index c5a62fdb620ee7d6b7195f6e8e2bc3cb208feb10..1611c4d0c0b148e00dbf0b21b9cd65d4c8163c2c 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
+#include "absl/types/optional.h"
 #include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
@@ -38,6 +39,9 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "Input file (model of any supported format). For Protobuf "
            "formats, both text and binary are supported regardless of file "
            "extension."),
+      Flag("savedmodel_directory", parsed_flags.savedmodel_directory.bind(),
+           parsed_flags.savedmodel_directory.default_value(),
+           "Full path to the directory containing the SavedModel."),
       Flag("output_file", parsed_flags.output_file.bind(),
            parsed_flags.output_file.default_value(),
            "Output file. "
@@ -49,14 +53,29 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.output_format.default_value(),
            "Output file format. "
            "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
+      Flag("savedmodel_tagset", parsed_flags.savedmodel_tagset.bind(),
+           parsed_flags.savedmodel_tagset.default_value(),
+           "Comma-separated set of tags identifying the MetaGraphDef within "
+           "the SavedModel to analyze. All tags in the tag set must be "
+           "specified."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
-           "of min/max ranges used for quantization."),
+           "of min/max ranges used for quantization of uint8 arrays."),
       Flag("default_ranges_max", parsed_flags.default_ranges_max.bind(),
            parsed_flags.default_ranges_max.default_value(),
            "If defined, will be used as the default value for the max bound "
-           "of min/max ranges used for quantization."),
+           "of min/max ranges used for quantization of uint8 arrays."),
+      Flag("default_int16_ranges_min",
+           parsed_flags.default_int16_ranges_min.bind(),
+           parsed_flags.default_int16_ranges_min.default_value(),
+           "If defined, will be used as the default value for the min bound "
+           "of min/max ranges used for quantization of int16 arrays."),
+      Flag("default_int16_ranges_max",
+           parsed_flags.default_int16_ranges_max.bind(),
+           parsed_flags.default_int16_ranges_max.default_value(),
+           "If defined, will be used as the default value for the max bound "
+           "of min/max ranges used for quantization of int16 arrays."),
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
@@ -112,6 +131,23 @@ bool ParseTocoFlagsFromCommandLineFlags(
           "If true, ignore control dependency requirements in input TensorFlow "
           "GraphDef. Otherwise an error will be raised upon control dependency "
           "inputs."),
+      Flag("debug_disable_recurrent_cell_fusion",
+           parsed_flags.debug_disable_recurrent_cell_fusion.bind(),
+           parsed_flags.debug_disable_recurrent_cell_fusion.default_value(),
+           "If true, disable fusion of known identifiable cell subgraphs into "
+           "cells. This includes, for example, specific forms of LSTM cell."),
+      Flag("propagate_fake_quant_num_bits",
+           parsed_flags.propagate_fake_quant_num_bits.bind(),
+           parsed_flags.propagate_fake_quant_num_bits.default_value(),
+           "If true, use FakeQuant* operator num_bits attributes to adjust "
+           "array data_types."),
+      Flag("allow_nudging_weights_to_use_fast_gemm_kernel",
+           parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel.bind(),
+           parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel
+               .default_value(),
+           "Some fast uint8 GEMM kernels require uint8 weights to avoid the "
+           "value 0. This flag allows nudging them to 1 to allow proceeding, "
+           "with moderate inaccuracy."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -123,55 +159,86 @@ bool ParseTocoFlagsFromCommandLineFlags(
   }
 }
 
+namespace {
+
+// Defines the requirements for a given flag. kUseDefault means the default
+// should be used in cases where the value isn't specified by the user.
+enum class FlagRequirement {
+  kNone,
+  kMustBeSpecified,
+  kMustNotBeSpecified,
+  kUseDefault,
+};
+
+// Enforces the FlagRequirements are met for a given flag.
+template <typename T>
+void EnforceFlagRequirement(const T& flag, const string& flag_name,
+                            FlagRequirement requirement) {
+  if (requirement == FlagRequirement::kMustBeSpecified) {
+    QCHECK(flag.specified()) << "Missing required flag " << flag_name;
+  }
+  if (requirement == FlagRequirement::kMustNotBeSpecified) {
+    QCHECK(!flag.specified())
+        << "Given other flags, this flag should not have been specified: "
+        << flag_name;
+  }
+}
+
+// Gets the value from the flag if specified. Returns default if the
+// FlagRequirement is kUseDefault.
+template <typename T>
+absl::optional<T> GetFlagValue(const Arg<T>& flag,
+                               FlagRequirement requirement) {
+  if (flag.specified()) return flag.value();
+  if (requirement == FlagRequirement::kUseDefault) return flag.default_value();
+  return absl::optional<T>();
+}
+
+}  // namespace
+
 void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
                                        TocoFlags* toco_flags) {
   namespace port = toco::port;
   port::CheckInitGoogleIsDone("InitGoogle is not done yet");
 
-  enum class FlagRequirement { kNone, kMustBeSpecified, kMustNotBeSpecified };
-
-#define ENFORCE_FLAG_REQUIREMENT(name, requirement)                          \
-  do {                                                                       \
-    if (requirement == FlagRequirement::kMustBeSpecified) {                  \
-      QCHECK(parsed_toco_flags.name.specified())                             \
-          << "Missing required flag: " << #name;                             \
-    }                                                                        \
-    if (requirement == FlagRequirement::kMustNotBeSpecified) {               \
-      QCHECK(!parsed_toco_flags.name.specified())                            \
-          << "Given other flags, this flag should not have been specified: " \
-          << #name;                                                          \
-    }                                                                        \
-  } while (false)
-#define READ_TOCO_FLAG(name, requirement)                     \
-  ENFORCE_FLAG_REQUIREMENT(name, requirement);                \
-  do {                                                        \
-    if (parsed_toco_flags.name.specified()) {                 \
-      toco_flags->set_##name(parsed_toco_flags.name.value()); \
-    }                                                         \
+#define READ_TOCO_FLAG(name, requirement)                                \
+  do {                                                                   \
+    EnforceFlagRequirement(parsed_toco_flags.name, #name, requirement);  \
+    auto flag_value = GetFlagValue(parsed_toco_flags.name, requirement); \
+    if (flag_value.has_value()) {                                        \
+      toco_flags->set_##name(flag_value.value());                        \
+    }                                                                    \
   } while (false)
 
-#define PARSE_TOCO_FLAG(Type, name, requirement)               \
-  ENFORCE_FLAG_REQUIREMENT(name, requirement);                 \
-  do {                                                         \
-    if (parsed_toco_flags.name.specified()) {                  \
-      Type x;                                                  \
-      QCHECK(Type##_Parse(parsed_toco_flags.name.value(), &x)) \
-          << "Unrecognized " << #Type << " value "             \
-          << parsed_toco_flags.name.value();                   \
-      toco_flags->set_##name(x);                               \
-    }                                                          \
+#define PARSE_TOCO_FLAG(Type, name, requirement)                         \
+  do {                                                                   \
+    EnforceFlagRequirement(parsed_toco_flags.name, #name, requirement);  \
+    auto flag_value = GetFlagValue(parsed_toco_flags.name, requirement); \
+    if (flag_value.has_value()) {                                        \
+      Type x;                                                            \
+      QCHECK(Type##_Parse(flag_value.value(), &x))                       \
+          << "Unrecognized " << #Type << " value "                       \
+          << parsed_toco_flags.name.value();                             \
+      toco_flags->set_##name(x);                                         \
+    }                                                                    \
   } while (false)
 
-  PARSE_TOCO_FLAG(FileFormat, input_format, FlagRequirement::kMustBeSpecified);
-  PARSE_TOCO_FLAG(FileFormat, output_format, FlagRequirement::kMustBeSpecified);
+  PARSE_TOCO_FLAG(FileFormat, input_format, FlagRequirement::kUseDefault);
+  PARSE_TOCO_FLAG(FileFormat, output_format, FlagRequirement::kUseDefault);
   PARSE_TOCO_FLAG(IODataType, inference_type, FlagRequirement::kNone);
   PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_max, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_int16_ranges_min, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_int16_ranges_max, FlagRequirement::kNone);
   READ_TOCO_FLAG(drop_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone);
   READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone);
+  READ_TOCO_FLAG(debug_disable_recurrent_cell_fusion, FlagRequirement::kNone);
+  READ_TOCO_FLAG(propagate_fake_quant_num_bits, FlagRequirement::kNone);
+  READ_TOCO_FLAG(allow_nudging_weights_to_use_fast_gemm_kernel,
+                 FlagRequirement::kNone);
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 3b9d7e22570b66aef2c9fc819e5ab4ec38e179f5..a04017a6bf05fb5a1ae324051375340a0adaccc6 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -36,7 +36,8 @@ enum FileFormat {
 // are not normally encoded in model files and in general may not be thought
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
-// Next Id: 13
+//
+// Next ID to use: 18.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -102,8 +103,14 @@ message TocoFlags {
   // for experimentation purposes only and should not be used in production:
   // they make it easy to quantize models, but the resulting quantized model
   // will be inaccurate.
+  //
+  // These values only apply to arrays quantized with the kUint8 data type.
   optional float default_ranges_min = 5;
   optional float default_ranges_max = 6;
+  // Equivalent versions of default_ranges_min/_max for arrays quantized with
+  // the kInt16 data type.
+  optional float default_int16_ranges_min = 15;
+  optional float default_int16_ranges_max = 16;
 
   // Ignore and discard FakeQuant nodes. For instance, that can be used to
   // generate plain float code without fake-quantization from a quantized
@@ -136,4 +143,22 @@ message TocoFlags {
   //    - Default to false if the output format is TENSORFLOW_GRAPHDEF.
   //    - Default to true in all other cases.
   optional bool drop_control_dependency = 12;
+
+  // Disables transformations that fuse subgraphs such as known LSTMs (not all
+  // LSTMs are identified).
+  optional bool debug_disable_recurrent_cell_fusion = 13;
+
+  // Uses the FakeQuantWithMinMaxArgs.num_bits attribute to adjust quantized
+  // array data types throughout the graph. The graph must be properly annotated
+  // with FakeQuant* ops on at least the edges and may contain additional ops on
+  // the interior of the graph to widen/narrow as desired.
+  //
+  // Input and output array data types may change because of this propagation
+  // and users must be sure to query the final data_type values.
+  optional bool propagate_fake_quant_num_bits = 14;
+
+  // Some fast uint8 GEMM kernels require uint8 weights to avoid the value 0.
+  // This flag allows nudging them to 1 to allow proceeding, with moderate
+  // inaccuracy.
+  optional bool allow_nudging_weights_to_use_fast_gemm_kernel = 17;
 }
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 4be3b5a0bf00ed204a1218545d9e66f7685a50d7..2d5c231bef350884cd2b0bb62cfdbddebfed7f58 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -75,6 +75,14 @@ Status Exists(const string& filename, const Options& options);
 void CopyToBuffer(const ::Cord& src, char* dest);
 #endif  // PLATFORM_GOOGLE
 void CopyToBuffer(const string& src, char* dest);
+
+inline uint32 ReverseBits32(uint32 n) {
+  n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
+  n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
+  n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
+  return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) |
+          ((n & 0xFF000000) >> 24));
+}
 }  // namespace port
 
 inline bool ParseFromStringOverload(const std::string& in,
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model.cc b/tensorflow/contrib/lite/toco/toco_saved_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26f55a66c729894a990258080e397bb42ea98a13
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_saved_model.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/toco_saved_model.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+
+namespace toco {
+namespace {
+
+// Loads a SavedModel from the directory specified in parsed_toco_flags.
+// Returns a SavedModelBundle with the requested MetaGraphDef.
+const tensorflow::SavedModelBundle* LoadSavedModel(
+    const ParsedTocoFlags& parsed_toco_flags) {
+  const string model_path = parsed_toco_flags.savedmodel_directory.value();
+  QCHECK(tensorflow::MaybeSavedModelDirectory(model_path))
+      << "Model is not saved in the supported SavedModel format.\n";
+
+  // Gets the tags identifying the MetaGraphDef from the command line arguments.
+  string tags_str;
+  if (parsed_toco_flags.savedmodel_tagset.specified()) {
+    tags_str = parsed_toco_flags.savedmodel_tagset.value();
+  } else {
+    tags_str = parsed_toco_flags.savedmodel_tagset.default_value();
+  }
+  auto tags = absl::StrSplit(tags_str, ',');
+
+  // Loads MetaGraphDef.
+  auto* bundle = new tensorflow::SavedModelBundle;
+  TF_CHECK_OK(tensorflow::LoadSavedModel(tensorflow::SessionOptions(),
+                                         tensorflow::RunOptions(), model_path,
+                                         tags, bundle))
+      << "Failed to load exported model from " << model_path
+      << ". Ensure the model contains the required tags '" << tags_str
+      << "'.\n";
+  return bundle;
+}
+
+// Returns the array name without the postfix.
+//
+// e.g. reduces "input:0" to "input".
+string GetArrayName(const string& name) {
+  const std::vector<string>& names = absl::StrSplit(name, ':');
+  return names[0];
+}
+
+// Returns the list of array names without the postfix sorted alphabetically.
+std::set<string> GetSortedNames(const std::unordered_set<string>& names) {
+  std::vector<string> final_names;
+  final_names.reserve(names.size());
+  for (const auto& name : names) {
+    final_names.push_back(GetArrayName(name));
+  }
+  return std::set<string>(final_names.begin(), final_names.end());
+}
+
+// Gets the final shape after replacing the first dimension with batch size, if
+// it is undefined (containing the value -1). Returns whether the shape is
+// valid.
+bool ReplaceShapeBatchSize(const tensorflow::TensorShapeProto& shape,
+                           int batch_size,
+                           tensorflow::TensorShapeProto* final_shape) {
+  for (int idx = 0; idx < shape.dim().size(); ++idx) {
+    int64 final_dim = shape.dim()[idx].size();
+    if (final_dim == -1) {
+      if (idx > 0) return false;
+      final_dim = batch_size;
+    }
+    final_shape->add_dim()->set_size(final_dim);
+  }
+  return true;
+}
+
+// Updates the input arrays in ModelFlags to contain the shape of the array.
+void ProcessInputShapes(const tensorflow::GraphDef& graph_def, int batch_size,
+                        ModelFlags* model_flags) {
+  // Build map of input array names to input arrays.
+  std::unordered_map<string, InputArray*> input_data_map;
+  for (auto& input : *model_flags->mutable_input_arrays()) {
+    input_data_map[input.name()] = &input;
+  }
+
+  // Adds shapes to the input arrays if the shape is valid.
+  for (const tensorflow::NodeDef& node_def : graph_def.node()) {
+    if (input_data_map.find(node_def.name()) != input_data_map.end()) {
+      const auto shape_it = node_def.attr().find("shape");
+      if (shape_it != node_def.attr().end()) {
+        tensorflow::TensorShapeProto final_shape;
+        bool is_valid = ReplaceShapeBatchSize(shape_it->second.shape(),
+                                              batch_size, &final_shape);
+
+        if (is_valid) {
+          auto* shape = input_data_map.at(node_def.name())->mutable_shape();
+          QCHECK_EQ(shape->dims_size(), 0)
+              << "The shape for the input '" << node_def.name()
+              << "' was previously defined. For clarity please define inputs "
+              << "via --input_arrays and input_shapes flags.\n";
+          for (const auto& dim : final_shape.dim()) {
+            shape->add_dims(dim.size());
+          }
+        }
+      }
+    }
+  }
+
+  // Checks all input arrays have a shape.
+  for (auto const& input : model_flags->input_arrays()) {
+    QCHECK(input.shape().dims_size() > 0)
+        << "A valid input shape was not found for input '" << input.name()
+        << "'. Please define via --input_arrays and --input_shapes flags.\n";
+  }
+}
+
+}  // namespace
+
+void ParseMetaData(const tensorflow::GraphDef& graph_def,
+                   const std::unordered_set<string>& inputs,
+                   const std::unordered_set<string>& outputs,
+                   const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags) {
+  if (!parsed_model_flags.input_arrays.specified()) {
+    const std::set<string> sorted_inputs = GetSortedNames(inputs);
+    for (const auto& input_name : sorted_inputs) {
+      model_flags->add_input_arrays()->set_name(input_name);
+    }
+  }
+
+  if (!parsed_model_flags.output_arrays.specified()) {
+    const std::set<string> sorted_outputs = GetSortedNames(outputs);
+    for (const auto& output_name : sorted_outputs) {
+      model_flags->add_output_arrays(GetArrayName(output_name));
+    }
+  }
+
+  if (!parsed_model_flags.input_shapes.specified()) {
+    int batch_size = parsed_model_flags.batch_size.value();
+    ProcessInputShapes(graph_def, batch_size, model_flags);
+  }
+
+  if (!parsed_toco_flags.inference_type.specified()) {
+    toco_flags->set_inference_type(IODataType::FLOAT);
+  }
+}
+
+// TODO(nupurgarg): Add top level tests.
+void GetSavedModelContents(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags,
+                           TocoFlags* toco_flags, ModelFlags* model_flags,
+                           string* graph_def_contents) {
+  // Loads the MetaGraphDef within a SavedModelBundle.
+  auto bundle = LoadSavedModel(parsed_toco_flags);
+
+  // Converts the MetaGraphDef to frozen GraphDef.
+  tensorflow::GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_CHECK_OK(tensorflow::FreezeSavedModel(*bundle, &frozen_graph_def, &inputs,
+                                           &outputs));
+
+  // Reads the frozen GraphDef into a string.
+  QCHECK(frozen_graph_def.SerializeToString(graph_def_contents))
+      << "Unable to generate serialized GraphDef.\n";
+
+  // Process inputs and outputs and metadata within GraphDef.
+  const tensorflow::GraphDef graph_def = bundle->meta_graph_def.graph_def();
+  ParseMetaData(graph_def, inputs, outputs, parsed_toco_flags,
+                parsed_model_flags, toco_flags, model_flags);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model.h b/tensorflow/contrib/lite/toco/toco_saved_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a0fabd82d90131a3b2d28c757c08dcb0f9e3988
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_saved_model.h
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_SAVED_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_SAVED_MODEL_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/cc/tools/freeze_saved_model.h"
+#include "tensorflow/contrib/lite/toco/args.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/types.pb.h"
+
+namespace toco {
+
+// Parses metadata into `toco_flags` and `model_flags`.
+//
+// Stores `inputs` as input_arrays and `outputs` as output_arrays in
+// `model_flags`. Infers input_shapes from the GraphDef and stores it in
+// `model_flags` as part of the input_arrays. Assumes inference_type is FLOAT
+// and stores it in `toco_flags`.
+void ParseMetaData(const tensorflow::GraphDef& graph_def,
+                   const std::unordered_set<string>& inputs,
+                   const std::unordered_set<string>& outputs,
+                   const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags);
+
+// Generates a frozen graph from the SavedModel in the directory specified in
+// `toco_flags`. Reads frozen graph contents into `graph_def_contents`. Parses
+// metadata relating to the GraphDef into `toco_flags` and `model_flags`.
+void GetSavedModelContents(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags,
+                           TocoFlags* toco_flags, ModelFlags* model_flags,
+                           string* graph_def_contents);
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_SAVED_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model_test.cc b/tensorflow/contrib/lite/toco/toco_saved_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e122afe65dc29abc85f142f4019aae5058ace51
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_saved_model_test.cc
@@ -0,0 +1,274 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/toco/toco_saved_model.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+namespace {
+
+using tensorflow::ops::Add;
+using tensorflow::ops::Const;
+using tensorflow::ops::FakeQuantWithMinMaxArgs;
+using tensorflow::ops::Placeholder;
+
+class TocoSavedModelTest : public ::testing::Test {
+ protected:
+  // Calls functions to process cmdline arguments and calls ParseMetaData.
+  // ParseMetaData parses input_arrays, output_arrays, and gets metadata from
+  // SavedModel it is not defined in the cmdline arguments.
+  void ProcessGraphDefMetadata(const std::unordered_set<string>& inputs,
+                               const std::unordered_set<string>& outputs,
+                               const tensorflow::GraphDef& graph_def) {
+    ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags_, &toco_flags_);
+    ReadModelFlagsFromCommandLineFlags(parsed_model_flags_, &model_flags_);
+    ParseMetaData(graph_def, inputs, outputs, parsed_toco_flags_,
+                  parsed_model_flags_, &toco_flags_, &model_flags_);
+  }
+
+  // Gets the GraphDef from the SavedModelBundle and processes metadata.
+  void ProcessSavedModelMetadata(const std::unordered_set<string>& inputs,
+                                 const std::unordered_set<string>& outputs) {
+    const tensorflow::GraphDef graph_def = bundle_.meta_graph_def.graph_def();
+    ProcessGraphDefMetadata(inputs, outputs, graph_def);
+  }
+
+  // Returns a GraphDef representing a simple float model with a single input.
+  tensorflow::GraphDef GetFloatGraphDef(const std::vector<int64>& shape) {
+    tensorflow::GraphDef graph_def;
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    tensorflow::Output input =
+        Placeholder(scope.WithOpName("input"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::PartialTensorShape(shape)));
+    tensorflow::Output zero = Const(scope.WithOpName("zero"), 0.0f, {});
+    tensorflow::Output add = Add(scope.WithOpName("add"), input, zero);
+
+    TF_EXPECT_OK(scope.ToGraphDef(&graph_def));
+    return graph_def;
+  }
+
+  // Returns a GraphDef representing a simple float model with two inputs.
+  tensorflow::GraphDef GetComplexFloatGraphDef() {
+    tensorflow::GraphDef graph_def;
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    tensorflow::Output inputA =
+        Placeholder(scope.WithOpName("inputA"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::TensorShape({1, 3, 3, 1})));
+    tensorflow::Output inputB =
+        Placeholder(scope.WithOpName("inputB"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::TensorShape({1, 3, 3, 1})));
+    tensorflow::Output add = Add(scope.WithOpName("add"), inputB, inputA);
+
+    TF_EXPECT_OK(scope.ToGraphDef(&graph_def));
+    return graph_def;
+  }
+
+  // Returns a GraphDef representing a simple quantized model.
+  tensorflow::GraphDef GetQuantizedGraphDef() {
+    tensorflow::GraphDef graph_def;
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    tensorflow::Output input =
+        Placeholder(scope.WithOpName("input"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::TensorShape({1, 3, 3, 1})));
+    tensorflow::Output zero = Const(scope.WithOpName("zero"), 0.0f, {});
+    tensorflow::Output fake_quant =
+        FakeQuantWithMinMaxArgs(scope.WithOpName("quant"), zero);
+    tensorflow::Output add = Add(scope.WithOpName("add"), input, fake_quant);
+
+    TF_EXPECT_OK(scope.ToGraphDef(&graph_def));
+    return graph_def;
+  }
+
+  // Gets the values in the input_arrays flag.
+  std::vector<string> GetInputArrays() {
+    std::vector<string> actual;
+    for (const auto& input : model_flags_.input_arrays()) {
+      actual.push_back(input.name());
+    }
+    return actual;
+  }
+
+  // Gets the values in the output_arrays flag.
+  std::vector<string> GetOutputArrays() {
+    std::vector<string> actual(model_flags_.output_arrays().begin(),
+                               model_flags_.output_arrays().end());
+    return actual;
+  }
+
+  // Gets the shape of the given input array.
+  string GetInputShape(const string& input_array) {
+    for (const auto& input : model_flags_.input_arrays()) {
+      if (input.name() == input_array) {
+        std::vector<string> dims;
+        for (int idx = 0; idx < input.shape().dims_size(); ++idx) {
+          dims.push_back(std::to_string(input.shape().dims(idx)));
+        }
+        return absl::StrJoin(dims, ",");
+      }
+    }
+    return "";
+  }
+
+  tensorflow::SavedModelBundle bundle_;
+  ParsedTocoFlags parsed_toco_flags_;
+  ParsedModelFlags parsed_model_flags_;
+  TocoFlags toco_flags_;
+  ModelFlags model_flags_;
+};
+
+// Tests if input_arrays, output_arrays, inference_type, and output_arrays are
+// added to ModelFlags if they are not specified in cmdline arguments.
+// Tests if the default batch size replaces a -1 in the first dimension.
+TEST_F(TocoSavedModelTest, NoCmdLine) {
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({-1, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "1,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if the order of input_arrays and output_arrays is deterministic when
+// they are taken from the SavedModel.
+TEST_F(TocoSavedModelTest, NoCmdLineMultipleArrays) {
+  tensorflow::GraphDef graph_def = GetComplexFloatGraphDef();
+
+  // Note: The model does not have two outputs. However, the function does not
+  // need an accurate output_array list. This is only meant to test order.
+  ProcessGraphDefMetadata({"inputB", "inputA"}, {"add", "invalid"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA", "inputB"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add", "invalid"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,3,3,1");
+  EXPECT_EQ(GetInputShape("inputB"), "1,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if input_shapes is inferred when input_arrays is passed in via cmdline
+// arguments.
+TEST_F(TocoSavedModelTest, InputNameWithoutInputShape) {
+  parsed_model_flags_.input_arrays.bind()("input");
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({2, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"not_used_input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "2,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Ensures a failure occurs when input_shapes is defined without input_arrays.
+TEST_F(TocoSavedModelTest, InputShapeWithoutInputName) {
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1:9,12");
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({1, 3, 3, 1});
+
+  EXPECT_DEATH(ProcessGraphDefMetadata({"input"}, {"add"}, graph_def),
+               "failed: input_shapes.size\\(\\) == "
+               "model_flags->input_arrays_size\\(\\)");
+}
+
+// Tests if the cmdline values of input_arrays, input_shapes are used when
+// specified with an empty GraphDef.
+TEST_F(TocoSavedModelTest, InputArraysCmdLine) {
+  parsed_model_flags_.input_arrays.bind()("inputA,inputB");
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1:9,12");
+
+  ProcessSavedModelMetadata({"input0", "input1"}, {"output0", "output1"});
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA", "inputB"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"output0", "output1"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,224,224,1");
+  EXPECT_EQ(GetInputShape("inputB"), "9,12");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if the cmdline values of input_arrays, input_shapes are used when
+// specified even if values exist within the GraphDef.
+TEST_F(TocoSavedModelTest, InputArraysCmdLineWithGraphDef) {
+  parsed_model_flags_.input_arrays.bind()("inputA");
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1");
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({1, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"inputA"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,224,224,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if the cmdline values of input_arrays, input_shapes, inference_type,
+// and output_arrays are used when specified with an empty GraphDef.
+TEST_F(TocoSavedModelTest, AllParamsCmdLine) {
+  parsed_model_flags_.input_arrays.bind()("inputA,inputB");
+  parsed_model_flags_.output_arrays.bind()("outputA,outputB");
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1:9,12");
+  parsed_toco_flags_.inference_type.bind()("FLOAT");
+
+  ProcessSavedModelMetadata({"input0", "input1"}, {"output0", "output1"});
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA", "inputB"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"outputA", "outputB"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,224,224,1");
+  EXPECT_EQ(GetInputShape("inputB"), "9,12");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if a quantized graph gives the correct values assuming type is passed
+// in via command line.
+TEST_F(TocoSavedModelTest, QuantizedNoCmdLine) {
+  parsed_toco_flags_.inference_type.bind()("QUANTIZED_UINT8");
+  tensorflow::GraphDef graph_def = GetQuantizedGraphDef();
+
+  ProcessGraphDefMetadata({"input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "1,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::QUANTIZED_UINT8);
+}
+
+// Tests if the provided batch size replaces a -1 in the first dimension of
+// input shape.
+TEST_F(TocoSavedModelTest, MissingShapeParameterValid) {
+  parsed_model_flags_.batch_size.bind()(3);
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({-1, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "3,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Ensures a failure occurs if there is a -1 in a dimension aside from the first
+// position of input shape.
+TEST_F(TocoSavedModelTest, MissingShapeParameterInvalid) {
+  parsed_model_flags_.batch_size.bind()(3);
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({1, -1, 3, 1});
+
+  EXPECT_DEATH(ProcessGraphDefMetadata({"input"}, {"add"}, graph_def),
+               "A valid input shape was not found for input 'input'.");
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 1b836fbc151db2141ad64d5370f15a43246fdd8b..7252ec2ea4d8864334fc989c91fc8c26efe2efea 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <set>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
 #include "tensorflow/contrib/lite/toco/dump_graphviz.h"
@@ -52,18 +53,21 @@ void MakeGeneralGraphTransformationsSet(
     GraphTransformationsSet* transformations) {
   CHECK(transformations->empty());
   transformations->Add(new ConvertExpandDimsToReshape);
+  transformations->Add(new ConvertSqueezeToReshape);
   transformations->Add(new ConvertTrivialAddNToAdd);
   transformations->Add(new ConvertTrivialStackToReshape);
   transformations->Add(new ConvertTrivialTransposeToReshape);
   transformations->Add(new ConvertReorderAxes);
   transformations->Add(new ResolveReshapeAttributes);
   transformations->Add(new ResolveTransposeAttributes);
+  transformations->Add(new PropagateActivationFunctionIntoConstants);
   transformations->Add(new PropagateArrayDataTypes);
   transformations->Add(new PropagateFixedSizes);
   transformations->Add(new RemoveTensorFlowAssert);
   transformations->Add(new RemoveTensorFlowIdentity);
   transformations->Add(new RemoveTrivialConcatenation);
   transformations->Add(new RemoveTrivialConcatenationInput);
+  transformations->Add(new RemoveTrivialFakeQuant);
   transformations->Add(new RemoveTrivialSlice);
   transformations->Add(new RemoveUnusedOp);
   transformations->Add(new EnsureBiasVectors);
@@ -72,11 +76,16 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
-  transformations->Add(new ReorderActivationFunctions);
+  transformations->Add(new MergeReshapeIntoPrecedingTranspose);
+  transformations->Add(new ReorderElementwiseUnary);
+  transformations->Add(new ReorderReshapeTranspose);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
+  transformations->Add(new ResolveConstantGather);
+  transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
+  transformations->Add(new ResolveConstantReshape);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
   transformations->Add(new ResolveConstantTranspose);
@@ -87,9 +96,11 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowTile);
   transformations->Add(new ResolveTensorFlowConcat);
   transformations->Add(new ResolveMultiplyByZero);
+  transformations->Add(new IdentifyDilatedConv);
   transformations->Add(new IdentifyL2Normalization);
   transformations->Add(new IdentifyL2Pool);
   transformations->Add(new IdentifyRelu1);
+  transformations->Add(new IdentifyPRelu);
   transformations->Add(new RemoveTrivialBinaryOperator);
   transformations->Add(new ReadFakeQuantMinMax);
   transformations->Add(new ResolveSpaceToBatchNDAttributes);
@@ -100,7 +111,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveMeanAttributes);
   transformations->Add(new ResolveConstantShapeOrRank);
   transformations->Add(new MakeInitialDequantizeOperator);
-  transformations->Add(new ResolveConstantFakeQuant);
+  transformations->Add(new UnpartitionEmbeddingLookup);
 }
 
 bool SupportsQuantization(FileFormat format) {
@@ -121,20 +132,26 @@ bool SupportsPreallocatedWorkspace(FileFormat format) {
 }
 
 bool IsRealValued(toco::ArrayDataType type) {
+  // TODO(benoitjacob) - this is hardcoding that uint8 and int16 are only used
+  // for quantized real-number values, and no other integer type is ever used
+  // for that. This is dirty, should be resolved as part of a more general push
+  // to more explicitly distinguish between true-integers and
+  // integers used as quantized values representing real numbers.
   return static_cast<bool>(type == toco::ArrayDataType::kFloat ||
-                           type == toco::ArrayDataType::kUint8);
+                           type == toco::ArrayDataType::kUint8 ||
+                           type == toco::ArrayDataType::kInt16);
 }
 
 void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   ArrayDataType type;
-  if (toco_flags.has_inference_input_type()) {
+  if (!SupportsQuantization(output_format)) {
+    // Data type is implicitly float for non-quantized formats
+    type = ArrayDataType::kFloat;
+  } else if (toco_flags.has_inference_input_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_input_type());
   } else if (toco_flags.has_inference_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_type());
-  } else if (!SupportsQuantization(output_format)) {
-    // Data type is implicitly float for non-quantized formats
-    type = ArrayDataType::kFloat;
   } else {
     // Nothing to do. Data types stay as-is.
     return;
@@ -189,22 +206,23 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 }
 
 void Transform(const TocoFlags& toco_flags, Model* model) {
-  // Clean up after import.
-  SetFinalDataTypeOnInputs(toco_flags, model);
-  UseArraysExtraInfo(model);
-  FinishBuildingRNNStates(model);
-
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
   const bool quantize_output =
-      SupportsQuantization(output_format) && inference_type == QUANTIZED_UINT8;
+      SupportsQuantization(output_format) &&
+      (inference_type == QUANTIZED_UINT8 || inference_type == QUANTIZED_INT16);
 
   if (quantize_output) {
     QCHECK_NE(toco_flags.inference_input_type(), FLOAT)
         << "Quantized inference is not allowed with float inputs.";
   }
 
+  // Clean up after import.
+  SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model, quantize_output);
+  FinishBuildingRNNStates(model);
+
   // Remove unused ops before performing any other optimizations. This is to
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
@@ -216,6 +234,12 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   MakeGeneralGraphTransformationsSet(&transformations);
   auto* remove_trivial_reshape = new RemoveTrivialReshape;
   transformations.Add(remove_trivial_reshape);
+  auto* resolve_constant_fake_quant = new ResolveConstantFakeQuant;
+  if (quantize_output) {
+    resolve_constant_fake_quant->set_propagate_fake_quant_num_bits(
+        toco_flags.propagate_fake_quant_num_bits());
+  }
+  transformations.Add(resolve_constant_fake_quant);
   if (SupportsFusedActivationFunction(output_format)) {
     transformations.Add(new FuseActivationFunctions);
   } else {
@@ -234,7 +258,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   }
   transformations.Add(new ConvertPureConvToDepthwise);
   if (SupportsLstmCell(output_format)) {
-    transformations.Add(new IdentifyLstmCell);
+    if (!toco_flags.debug_disable_recurrent_cell_fusion()) {
+      transformations.Add(new IdentifyLstmCell);
+    }
     if (output_format == TFLITE) {
       transformations.Add(new toco::SplitLstmCellInputs);
     } else {
@@ -246,25 +272,63 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
                           transformations);
 
   if (quantize_output) {
+    if (toco_flags.propagate_fake_quant_num_bits()) {
+      RunGraphTransformations(model,
+                              "fake quant propagation graph transformations",
+                              {new PropagateFakeQuantNumBits});
+    }
     RunGraphTransformations(model, "pre-quantization graph transformations",
-                            {new HardcodeMinMax, new DropFakeQuant});
+                            {
+                                new HardcodeMinMax,
+                                new DropFakeQuant,
+                            });
   }
 
+  // Fix any issues with IO edges. This must happen after any transform that
+  // may modify the structure of the edges.
+  FixEdgeArrays(model);
+
   if (quantize_output) {
+    // If the user specified default min/max ranges we need to set all arrays
+    // that didn't either have a min/max specified or get one set via
+    // HardcodeMinMax or PropagateFakeQuantNumBits. This may require running
+    // HardcodeMinMax to move changes through the graph as we make changes.
+    auto propagate_default_min_max =
+        absl::make_unique<PropagateDefaultMinMax>();
     if (toco_flags.has_default_ranges_min() &&
         toco_flags.has_default_ranges_max()) {
-      UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(),
-                                  toco_flags.default_ranges_max());
-      // The new MinMax info may need to be propagated a bit.
+      propagate_default_min_max->DefineTypeRange(
+          ArrayDataType::kUint8, toco_flags.default_ranges_min(),
+          toco_flags.default_ranges_max());
+    }
+    if (toco_flags.has_default_int16_ranges_min() &&
+        toco_flags.has_default_int16_ranges_max()) {
+      propagate_default_min_max->DefineTypeRange(
+          ArrayDataType::kInt16, toco_flags.default_int16_ranges_min(),
+          toco_flags.default_int16_ranges_max());
+    }
+    if (propagate_default_min_max->has_any_ranges_defined()) {
       RunGraphTransformations(
           model, "default min-max range propagation graph transformations",
-          {new HardcodeMinMax});
+          {
+              propagate_default_min_max.release(),
+              new HardcodeMinMax,
+          });
     }
+
     CheckIsReadyForQuantization(*model);
-    RunGraphTransformations(
-        model, "quantization graph transformations",
-        {new Quantize, new RemoveTrivialQuantizedActivationFunc,
-         new RemoveFinalDequantizeOp});
+    auto* ensure_safe_for_int8_kernels =
+        new EnsureUint8WeightsSafeForFastInt8Kernels;
+    ensure_safe_for_int8_kernels->set_allow_nudging_weights(
+        toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
+    RunGraphTransformations(model, "quantization graph transformations",
+                            {
+                                new RemoveTrivialQuantizedActivationFunc,
+                                new RemoveTrivialQuantizedMinMax,
+                                new Quantize,
+                                new RemoveFinalDequantizeOp,
+                                ensure_safe_for_int8_kernels,
+                            });
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
     // Dequantize creates FakeQuant nodes. We may want to discard
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index dcb409c84d8d80790f3c5e41a6eb7bce1b1efd2e..f334c51bbb35b8e9c4456dfd40a47eeec317019e 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -62,9 +62,49 @@ string LogName(const Operator& op) {
   }
 }
 
-bool IsInputArray(const Model& model, const string& name) {
+string ArrayDataTypeName(ArrayDataType data_type) {
+  switch (data_type) {
+    case ArrayDataType::kFloat:
+      return "Float";
+    case ArrayDataType::kInt8:
+      return "Int8";
+    case ArrayDataType::kUint8:
+      return "Uint8";
+    case ArrayDataType::kInt16:
+      return "Int16";
+    case ArrayDataType::kUint16:
+      return "Uint16";
+    case ArrayDataType::kInt32:
+      return "Int32";
+    case ArrayDataType::kUint32:
+      return "Uint32";
+    case ArrayDataType::kInt64:
+      return "Int64";
+    case ArrayDataType::kUint64:
+      return "Uint64";
+    case ArrayDataType::kString:
+      return "String";
+    case ArrayDataType::kBool:
+      return "Bool";
+    case ArrayDataType::kNone:
+      return "None";
+    default:
+      LOG(FATAL) << "Unhandled array data type " << static_cast<int>(data_type);
+  }
+}
+
+bool IsInputArray(const Model& model, const string& array_name) {
   for (const auto& input_array : model.flags.input_arrays()) {
-    if (input_array.name() == name) {
+    if (array_name == input_array.name()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsOutputArray(const Model& model, const string& array_name) {
+  for (const auto& output_array : model.flags.output_arrays()) {
+    if (array_name == output_array) {
       return true;
     }
   }
@@ -75,10 +115,8 @@ bool IsArrayConsumed(const Model& model, const string& name) {
   if (GetOpWithInput(model, name)) {
     return true;
   }
-  for (const string& model_output : model.flags.output_arrays()) {
-    if (model_output == name) {
-      return true;
-    }
+  if (IsOutputArray(model, name)) {
+    return true;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (rnn_state.back_edge_source_array() == name) {
@@ -128,6 +166,15 @@ bool DeleteArrayIfUsedOnce(const string& array_name, Model* model) {
   return false;
 }
 
+void DeleteOpAndArraysIfUnused(Model* model, Operator* op) {
+  for (const string& array_name : op->inputs) {
+    DeleteArrayIfUsedOnce(array_name, model);
+  }
+  auto op_it = FindOp(*model, op);
+  CHECK(op_it != model->operators.end());
+  model->operators.erase(op_it);
+}
+
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
     const Model& model, const string& array_name) {
   for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
@@ -251,15 +298,18 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Dequantize)
     HANDLE_OPERATORTYPENAME_CASE(L2Normalization)
     HANDLE_OPERATORTYPENAME_CASE(LocalResponseNormalization)
+    HANDLE_OPERATORTYPENAME_CASE(Log)
     HANDLE_OPERATORTYPENAME_CASE(Logistic)
     HANDLE_OPERATORTYPENAME_CASE(LstmCell)
     HANDLE_OPERATORTYPENAME_CASE(MaxPool)
     HANDLE_OPERATORTYPENAME_CASE(L2Pool)
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
+    HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
+    HANDLE_OPERATORTYPENAME_CASE(PRelu)
     HANDLE_OPERATORTYPENAME_CASE(ReorderAxes)
     HANDLE_OPERATORTYPENAME_CASE(Softmax)
     HANDLE_OPERATORTYPENAME_CASE(LogSoftmax)
@@ -316,6 +366,8 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(TopK_V2)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowUnsupported)
     HANDLE_OPERATORTYPENAME_CASE(Exp)
+    HANDLE_OPERATORTYPENAME_CASE(DynamicPartition)
+    HANDLE_OPERATORTYPENAME_CASE(DynamicStitch)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -334,6 +386,7 @@ string HelpfulOperatorTypeName(const Operator& op) {
 bool OperatorSupportsFusedActivation(OperatorType type) {
   switch (type) {
     case OperatorType::kConcatenation:
+    case OperatorType::kFakeQuant:
     case OperatorType::kGather:
     case OperatorType::kSlice:
     case OperatorType::kSqueeze:
@@ -363,48 +416,9 @@ void LogSummary(int log_level, const Model& model) {
 void LogArray(int log_level, const Model& model, const string& name) {
   const auto& array = model.GetArray(name);
   VLOG(log_level) << "Array: " << name;
-  switch (array.data_type) {
-    case ArrayDataType::kNone:
-      VLOG(log_level) << "  Data type:";
-      break;
-    case ArrayDataType::kFloat:
-      VLOG(log_level) << "  Data type: kFloat";
-      break;
-    case ArrayDataType::kInt32:
-      VLOG(log_level) << "  Data type: kInt32";
-      break;
-    case ArrayDataType::kUint8:
-      VLOG(log_level) << "  Data type: kUint8";
-      break;
-    case ArrayDataType::kString:
-      VLOG(log_level) << "  Data type: kString";
-      break;
-    default:
-      VLOG(log_level) << "  Data type: other (numerical value: "
-                      << static_cast<int>(array.data_type) << ")";
-      break;
-  }
-  switch (array.final_data_type) {
-    case ArrayDataType::kNone:
-      VLOG(log_level) << "  Final type:";
-      break;
-    case ArrayDataType::kFloat:
-      VLOG(log_level) << "  Final type: kFloat";
-      break;
-    case ArrayDataType::kInt32:
-      VLOG(log_level) << "  Final type: kInt32";
-      break;
-    case ArrayDataType::kUint8:
-      VLOG(log_level) << "  Final type: kUint8";
-      break;
-    case ArrayDataType::kString:
-      VLOG(log_level) << "  Final type: kString";
-      break;
-    default:
-      VLOG(log_level) << "  Final type: other (numerical value: "
-                      << static_cast<int>(array.data_type) << ")";
-      break;
-  }
+  VLOG(log_level) << "  Data type: " << ArrayDataTypeName(array.data_type);
+  VLOG(log_level) << "  Final type: "
+                  << ArrayDataTypeName(array.final_data_type);
   if (array.buffer) {
     VLOG(log_level) << "  Constant Buffer";
   }
@@ -811,17 +825,22 @@ void FixNoOrphanedArray(Model* model) {
 void CheckEachArray(const Model& model) {
   for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = array_entry.second;
-    if (array->has_shape()) {
-      for (int d : array->shape().dims()) {
-        CHECK_GE(d, 1);
-      }
-    }
     // It's OK to have a buffer or an alloc, but not both.
     // (Since allocs are for transient arrays without a buffer).
     CHECK(!array->buffer || !array->alloc);
-    // If there is a buffer, its type should be consistent with data_type.
     if (array->buffer) {
+      // If there is a buffer, its type should be consistent with data_type.
       CHECK(array->buffer->type == array->data_type);
+      // The presence of a fixed buffer should imply the presence of a fixed
+      // shape.
+      CHECK(array->has_shape());
+      // Constant buffer should has a valid shape.
+      for (int d : array->shape().dims()) {
+        CHECK_GE(d, 1);
+      }
+      // The shape flat-size should agree with the buffer length.
+      CHECK_EQ(array->buffer->Length(),
+               RequiredBufferSizeForShape(array->shape()));
     }
 
     // Check name.  Either "name_with_suffix_8", "name_with_port:3", but not
@@ -1038,6 +1057,128 @@ void CheckModelCounts(const Model& model) {
   }
 }
 
+void FixEdgeArrays(Model* model) {
+  for (const string& output_array_name : model->flags.output_arrays()) {
+    if (!GetOpWithOutput(*model, output_array_name)) {
+      // Output has no operator producing it. Change that by inserting a copy.
+      LOG(WARNING) << "Fixing constant output array " << output_array_name
+                   << " by inserting a copy. This is not optimal.";
+      string intermediate_array_name =
+          AvailableArrayName(*model, output_array_name + "_copy");
+      CloneArray(model, output_array_name, intermediate_array_name);
+      InsertCopyOperator(model, intermediate_array_name, output_array_name);
+    }
+  }
+}
+
+namespace {
+void CopyArrayAttribs(const Array& source_array, Array* target_array) {
+  target_array->data_type = source_array.data_type;
+  target_array->final_data_type = source_array.final_data_type;
+  target_array->copy_shape(source_array.shape());
+
+  if (source_array.minmax) {
+    target_array->GetOrCreateMinMax() = source_array.GetMinMax();
+  } else {
+    target_array->minmax.reset();
+  }
+
+  if (source_array.quantization_params) {
+    target_array->GetOrCreateQuantizationParams() =
+        source_array.GetQuantizationParams();
+  } else {
+    target_array->quantization_params.reset();
+  }
+}
+}  // namespace
+
+void InsertCopyOperator(Model* model, const string& source_array_name,
+                        const string& target_array_name) {
+  // Reshape to the same size. This should be a no-op.
+  const Array& source_array = model->GetArray(source_array_name);
+  std::vector<int> shape = source_array.shape().dims();
+
+  // Drop constant data from the target array as the copy will be done at
+  // runtime.
+  Array& target_array = model->GetOrCreateArray(target_array_name);
+  target_array.buffer.reset();
+  CopyArrayAttribs(source_array, &target_array);
+
+  // Insert copy operator.
+  auto* copy_op = new TensorFlowReshapeOperator;
+  copy_op->inputs = {
+      source_array_name,
+      CreateInt32Array(model, target_array_name + "_copy_shape", shape)};
+  copy_op->outputs = {target_array_name};
+  model->operators.emplace_back(copy_op);
+}
+
+void CloneArray(Model* model, const string& source_array_name,
+                const string& target_array_name) {
+  CHECK(!model->HasArray(target_array_name));
+  const Array& source_array = model->GetArray(source_array_name);
+  Array& target_array = model->GetOrCreateArray(target_array_name);
+  CopyArrayAttribs(source_array, &target_array);
+
+  if (source_array.minmax) {
+    const auto& smm = source_array.GetMinMax();
+    auto& tmm = target_array.GetOrCreateMinMax();
+    tmm.min = smm.min;
+    tmm.max = smm.max;
+  }
+
+  if (source_array.quantization_params) {
+    const auto& sqp = source_array.GetQuantizationParams();
+    auto& tqp = target_array.GetOrCreateQuantizationParams();
+    tqp.zero_point = sqp.zero_point;
+    tqp.scale = sqp.scale;
+  }
+
+  target_array.data_type = source_array.data_type;
+  target_array.final_data_type = source_array.final_data_type;
+  target_array.copy_shape(source_array.shape());
+
+  switch (source_array.data_type) {
+    case ArrayDataType::kBool:
+      CopyArrayBuffer<ArrayDataType::kBool>(source_array, &target_array);
+      break;
+    case ArrayDataType::kFloat:
+      CopyArrayBuffer<ArrayDataType::kFloat>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt8:
+      CopyArrayBuffer<ArrayDataType::kInt8>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint8:
+      CopyArrayBuffer<ArrayDataType::kUint8>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt16:
+      CopyArrayBuffer<ArrayDataType::kInt16>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint16:
+      CopyArrayBuffer<ArrayDataType::kUint16>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt32:
+      CopyArrayBuffer<ArrayDataType::kInt32>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint32:
+      CopyArrayBuffer<ArrayDataType::kUint32>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt64:
+      CopyArrayBuffer<ArrayDataType::kInt64>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint64:
+      CopyArrayBuffer<ArrayDataType::kUint64>(source_array, &target_array);
+      break;
+    case ArrayDataType::kString:
+      CopyArrayBuffer<ArrayDataType::kString>(source_array, &target_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type: "
+                 << ArrayDataTypeName(source_array.data_type);
+      return;
+  }
+}
+
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
@@ -1201,7 +1342,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       << "This model does not define output arrays, so a "
          "--output_arrays flag must be given on the command-line.";
 
-  for (const auto& input_array_proto : model->flags.input_arrays()) {
+  for (auto& input_array_proto : *model->flags.mutable_input_arrays()) {
     auto& input_array = model->GetOrCreateArray(input_array_proto.name());
     if (input_array_proto.has_data_type()) {
       const ArrayDataType specified_type =
@@ -1245,23 +1386,25 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
         for (int i = 0; i < input_array_dims.size(); i++) {
           CHECK_EQ(input_array_dims[i], input_array_proto.shape().dims(i));
         }
+      } else {
+        for (int i = 0; i < input_array.shape().dimensions_count(); i++) {
+          input_array_proto.mutable_shape()->add_dims(
+              input_array.shape().dims(i));
+        }
       }
     }
 
     const float mean_value = input_array_proto.mean_value();
     const float std_value = input_array_proto.std_value();
     MinMax input_minmax;
-    input_minmax.min = (0.f - mean_value) / std_value;
-    input_minmax.max = (255.f - mean_value) / std_value;
-    if (input_array.minmax) {
-      if (input_array_proto.has_mean_value() ||
-          input_array_proto.has_std_value()) {
-        CHECK(input_minmax == *input_array.minmax)
-            << input_minmax.min << ", " << input_minmax.max
-            << " != " << input_array.minmax->min << ", "
-            << input_array.minmax->max;
-      }
-    } else {
+    float qmin = 0, qmax = 255;
+    if (input_array.data_type == ArrayDataType::kInt16) {
+      qmin = -32768;
+      qmax = 32767;
+    }
+    input_minmax.min = (qmin - mean_value) / std_value;
+    input_minmax.max = (qmax - mean_value) / std_value;
+    if (!input_array.minmax) {
       input_array.GetOrCreateMinMax() = input_minmax;
     }
   }
@@ -1276,7 +1419,8 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       CHECK(input_array.shape().dims_size());
     }
   }
-
+  model->flags.set_change_concat_input_ranges(
+      model_flags.change_concat_input_ranges());
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
   model->flags.set_allow_nonexistent_arrays(
       model_flags.allow_nonexistent_arrays());
@@ -1316,30 +1460,10 @@ void CheckIsReadyForQuantization(const Model& model) {
   }
 }
 
-void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
-                                 double default_ranges_max) {
-  for (const auto& op : model->operators) {
-    for (const auto& input : op->inputs) {
-      auto& input_array = model->GetArray(input);
-      if (!input_array.minmax && !input_array.buffer) {
-        auto& minmax = input_array.GetOrCreateMinMax();
-        minmax.min = default_ranges_min;
-        minmax.max = default_ranges_max;
-      }
-    }
-    for (const auto& output : op->outputs) {
-      auto& output_array = model->GetArray(output);
-      if (!output_array.minmax && !output_array.buffer) {
-        auto& minmax = output_array.GetOrCreateMinMax();
-        minmax.min = default_ranges_min;
-        minmax.max = default_ranges_max;
-      }
-    }
-  }
-}
-
 int ElementSize(ArrayDataType data_type) {
   switch (data_type) {
+    case ArrayDataType::kBool:
+      return sizeof(bool);
     case ArrayDataType::kFloat:
       return 4;
     case ArrayDataType::kInt8:
@@ -1365,7 +1489,7 @@ int ElementSize(ArrayDataType data_type) {
       LOG(FATAL) << "Transient arrays with strings are not supported yet";
       return 0;
     default:
-      LOG(FATAL) << "Should not get here.";
+      LOG(FATAL) << "Unknown data_type = " << static_cast<int>(data_type);
       return 0;
   }
 }
@@ -1384,14 +1508,9 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
   if (model.IsOptionalArray(array_name)) return false;
   // The model's input and output arrays are externally allocated.
   // They are not transient arrays.
-  if (IsInputArray(model, array_name)) {
+  if (IsInputArray(model, array_name) || IsOutputArray(model, array_name)) {
     return false;
   }
-  for (const string& output_array : model.flags.output_arrays()) {
-    if (array_name == output_array) {
-      return false;
-    }
-  }
   const auto& array = &model.GetArray(array_name);
   // An array with a constant buffer isn't a transient array.
   if (!!array->buffer) {
@@ -1769,15 +1888,8 @@ int AxesCount(AxesOrder axes_order) {
 }
 
 bool IsDiscardableArray(const Model& model, const string& array_name) {
-  for (const auto& input_array : model.flags.input_arrays()) {
-    if (array_name == input_array.name()) {
-      return false;
-    }
-  }
-  for (const string& output_array : model.flags.output_arrays()) {
-    if (array_name == output_array) {
-      return false;
-    }
+  if (IsInputArray(model, array_name) || IsOutputArray(model, array_name)) {
+    return false;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (!rnn_state.discardable()) {
@@ -1792,15 +1904,47 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
   return true;
 }
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  // If the reshape changes the number of dimensions so it cannot be interpreted
+  // as a transpose.
+  if (!allow_extra_unary_dims && in_shape.size() != out_shape.size()) {
+    return false;
+  }
+
+  in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1),
+                 in_shape.end());
+  out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1),
+                  out_shape.end());
+  return in_shape == out_shape;
+}
+
 void CheckFinalDataTypesSatisfied(const Model& model) {
   for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
-    if (array.final_data_type != ArrayDataType::kNone) {
+    // If the final data type is int16, the data type may be float, for example
+    // after dequantization.
+    if (array.final_data_type != ArrayDataType::kNone &&
+        array.final_data_type != ArrayDataType::kInt16) {
       CHECK(array.final_data_type == array.data_type)
           << "Array \"" << array_entry.first
           << "\" has mis-matching actual and final data types ("
-          << static_cast<int>(array.data_type) << ","
-          << static_cast<int>(array.final_data_type) << ").";
+          << ArrayDataTypeName(array.data_type) << ","
+          << ArrayDataTypeName(array.final_data_type) << ").";
     }
   }
 }
@@ -1811,6 +1955,8 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
       return ArrayDataType::kFloat;
     case QUANTIZED_UINT8:
       return ArrayDataType::kUint8;
+    case QUANTIZED_INT16:
+      return ArrayDataType::kInt16;
     case INT32:
       return ArrayDataType::kInt32;
     case INT64:
@@ -1837,14 +1983,41 @@ void FinishBuildingRNNStates(Model* model) {
   }
 }
 
-void UseArraysExtraInfo(Model* model) {
+void UseArraysExtraInfo(Model* model, bool quantize_output) {
   for (const auto& entry : model->flags.arrays_extra_info().entries()) {
-    QCHECK(model->HasArray(entry.name()))
-        << "ArraysExtraInfo refers to non-existent array name: "
-        << entry.name();
-    auto& minmax = model->GetArray(entry.name()).GetOrCreateMinMax();
-    minmax.min = entry.min();
-    minmax.max = entry.max();
+    if (!model->HasArray(entry.name())) {
+      continue;
+    }
+    auto& array = model->GetArray(entry.name());
+    if (entry.has_min() || entry.has_max()) {
+      CHECK_EQ(entry.has_min(), entry.has_max());
+      auto& minmax = array.GetOrCreateMinMax();
+      minmax.min = entry.min();
+      minmax.max = entry.max();
+    }
+    if (entry.has_data_type() && quantize_output) {
+      array.final_data_type =
+          ConvertIODataTypeToArrayDataType(entry.data_type());
+    }
+    if (entry.has_shape()) {
+      array.clear_shape();
+      // Make sure to create the shape even if there are no dims, to
+      // correctly record 0-D shapes.
+      array.mutable_shape();
+      for (int dim : entry.shape().dims()) {
+        array.mutable_shape()->mutable_dims()->push_back(dim);
+      }
+    }
+    if (entry.has_constant_float_value()) {
+      CHECK(array.has_shape());
+      if (array.data_type == ArrayDataType::kFloat) {
+        auto& data = array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+        data.resize(RequiredBufferSizeForShape(array.shape()));
+        for (float& f : data) {
+          f = entry.constant_float_value();
+        }
+      }
+    }
   }
 }
 
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 0aaa0f6a215288430dfdde7d5042012730c3be4c..5cc15fa57b3ea4ecb74c3f81af14e4fea95f6232 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -54,7 +54,13 @@ absl::string_view FindLongestCommonPrefix(absl::string_view a,
                                           absl::string_view b);
 string LogName(const Operator& op);
 
-bool IsInputArray(const Model& model, const string& name);
+string ArrayDataTypeName(ArrayDataType data_type);
+
+// Returns true if the given array is specified as a model input array.
+bool IsInputArray(const Model& model, const string& array_name);
+// Returns true if the given array is specified as a model output array.
+bool IsOutputArray(const Model& model, const string& array_name);
+
 bool IsArrayConsumed(const Model& model, const string& name);
 int CountTrueOutputs(const Model& model, const Operator& op);
 
@@ -62,6 +68,10 @@ int CountOpsWithInput(const Model& model, const string& array_name);
 bool DeleteArrayIfUnused(const string& array_name, Model* model);
 bool DeleteArrayIfUsedOnce(const string& array_name, Model* model);
 
+// Deletes the op and any of its input and output arrays if they are unused
+// after the op has been deleted.
+void DeleteOpAndArraysIfUnused(Model* model, Operator* op);
+
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
     const Model& model, const string& array_name);
 Operator* GetOpWithOutput(const Model& model, const string& array_name);
@@ -69,8 +79,6 @@ Operator* GetOpWithOutput(const Model& model, const string& array_name);
 std::vector<std::unique_ptr<Operator>>::iterator FindOpWithOutput(
     Model& model, const string& array_name);
 
-Operator* GetOpWithOutput(const Model& model, const string& array_name);
-
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
     const Model& model, const string& array_name);
 
@@ -139,83 +147,51 @@ void FixOperatorOrdering(Model* model);
 void FixNoMissingArray(Model* model);
 void FixNoOrphanedArray(Model* model);
 
-void ResolveModelFlags(const ModelFlags& model_flags, Model* model);
+// Fixes input/output arrays that may have issues during export or inference.
+void FixEdgeArrays(Model* model);
 
+// Copies the contents of an array into another.
+// Expects that the shape and data type match.
 template <ArrayDataType A>
-void GetQuantizationParamsFromMinMax(const ModelFlags& model_flags,
-                                     const MinMax& minmax,
-                                     QuantizationParams* quantization_params) {
-  using Integer = DataType<A>;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const double qmin_double = qmin;
-  const double qmax_double = qmax;
-  const double rmin = minmax.min;
-  const double rmax = minmax.max;
-  // 0 should always be a representable value. Let's assume that the initial
-  // min,max range contains 0.
-  CHECK_LE(rmin, 0.);
-  CHECK_GE(rmax, 0.);
-  if (rmin == rmax) {
-    // Special case where the min,max range is a point. Should be {0}.
-    CHECK_EQ(rmin, 0.);
-    CHECK_EQ(rmax, 0.);
-    quantization_params->zero_point = 0;
-    quantization_params->scale = 0.;
-    return;
+void CopyArrayBuffer(const Array& source_array, Array* target_array) {
+  int source_buffer_size = RequiredBufferSizeForShape(source_array.shape());
+  int target_buffer_size = RequiredBufferSizeForShape(target_array->shape());
+  CHECK_EQ(source_buffer_size, target_buffer_size)
+      << "Buffer sizes must match in element count";
+  CHECK(source_array.data_type == target_array->data_type)
+      << "Data types must match";
+  if (source_array.buffer) {
+    const auto& source_buffer = source_array.GetBuffer<A>();
+    auto& target_buffer = target_array->GetMutableBuffer<A>();
+    target_buffer.data = source_buffer.data;
   }
+}
 
-  // General case.
-  //
-  // First determine the scale.
-  const double scale = (rmax - rmin) / (qmax_double - qmin_double);
-
-  // Zero-point computation.
-  // First the initial floating-point computation. The zero-point can be
-  // determined from solving an affine equation for any known pair
-  // (real value, corresponding quantized value).
-  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-  // The arithmetic error on the zero point computed from either pair
-  // will be roughly machine_epsilon * (sum of absolute values of terms)
-  // so we want to use the variant that adds the smaller terms.
-  const double zero_point_from_min = qmin_double - rmin / scale;
-  const double zero_point_from_max = qmax_double - rmax / scale;
-  const double zero_point_from_min_error =
-      std::abs(qmin_double) + std::abs(rmin / scale);
-  const double zero_point_from_max_error =
-      std::abs(qmax_double) + std::abs(rmax / scale);
-
-  const double zero_point_double =
-      zero_point_from_min_error < zero_point_from_max_error
-          ? zero_point_from_min
-          : zero_point_from_max;
-
-  // Now we need to nudge the zero point to be an integer
-  // (our zero points are integer, and this is motivated by the requirement
-  // to be able to represent the real value "0" exactly as a quantized value,
-  // which is required in multiple places, for example in Im2col with SAME
-  // padding).
-  Integer nudged_zero_point = 0;
-  if (zero_point_double < qmin_double) {
-    nudged_zero_point = qmin;
-  } else if (zero_point_double > qmax_double) {
-    nudged_zero_point = qmax;
-  } else {
-    nudged_zero_point = static_cast<Integer>(std::round(zero_point_double));
+// Inserts a no-op reshape operator between the source array and the target
+// array. This effectively just copies the data.
+void InsertCopyOperator(Model* model, const string& source_array_name,
+                        const string& target_array_name);
+
+// Clones an array with all data and parameters.
+void CloneArray(Model* model, const string& source_array_name,
+                const string& target_array_name);
+
+void ResolveModelFlags(const ModelFlags& model_flags, Model* model);
+
+template <typename T>
+T ConvertOperator(Operator* o, OperatorType type) {
+  if (o != nullptr && o->type == type) {
+    return static_cast<T>(o);
   }
-  // The zero point should always be in the range of quantized value,
-  // [qmin, qmax].
-  CHECK_GE(nudged_zero_point, qmin);
-  CHECK_LE(nudged_zero_point, qmax);
-
-  // Finally, store the result nudged quantization params.
-  quantization_params->zero_point = nudged_zero_point;
-  quantization_params->scale = scale;
+
+  return nullptr;
 }
 
 void CheckIsReadyForQuantization(const Model& model);
-void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
-                                 double default_ranges_max);
+
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims);
 
 inline int Offset(const Shape& shape, const std::vector<int>& indices) {
   DCHECK_EQ(shape.dimensions_count(), indices.size());
@@ -316,7 +292,7 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 // already quantized, then case (a) should hold.
 void FinishBuildingRNNStates(Model* model);
 
-void UseArraysExtraInfo(Model* model);
+void UseArraysExtraInfo(Model* model, bool quantize_output);
 
 }  // namespace toco
 
diff --git a/tensorflow/contrib/lite/toco/types.proto b/tensorflow/contrib/lite/toco/types.proto
index 318fd4b7b2c2df093562e73c3fe707675ee98876..03bd6150bc86bb27221814cd191b17f1a09585fa 100644
--- a/tensorflow/contrib/lite/toco/types.proto
+++ b/tensorflow/contrib/lite/toco/types.proto
@@ -34,4 +34,7 @@ enum IODataType {
 
   // String, not quantized
   STRING = 5;
+
+  // Int16, quantized
+  QUANTIZED_INT16 = 6;
 }
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 999ccf2ebc009b6b7c50a9a2d1667d69a3f690e7..7b3569ea9c8b15959b15e8ba46cf44d159d5528c 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -4,6 +4,7 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 py_binary(
@@ -45,7 +46,15 @@ tf_cc_binary(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+        ],
+    }),
 )
 
 cc_library(
@@ -69,6 +78,9 @@ cc_test(
         "//tensorflow/contrib/lite:testdata/test_model.bin",
         "//tensorflow/contrib/lite:testdata/test_model_broken.bin",
     ],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         ":gen_op_registration",
         "@com_google_googletest//:gtest",
@@ -82,18 +94,6 @@ cc_library(
     deps = ["//tensorflow/contrib/lite:framework"],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "verifier",
     srcs = ["verifier.cc"],
@@ -111,6 +111,9 @@ cc_test(
     name = "verifier_test",
     size = "small",
     srcs = ["verifier_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
     deps = [
         ":mutable_op_resolver",
         ":verifier",
@@ -124,3 +127,5 @@ cc_test(
         "@flatbuffers",
     ],
 )
+
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark_model.cc
index 6ae3ab57294a92162b15f326630ac202a9ba2a82..93c80e0f5e021f76bff6858b0ea3370724393d6d 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark_model.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,36 +25,89 @@ limitations under the License.
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 #endif
 
-#define LOG(x) std::cerr
+namespace tflite {
 
-#define CHECK(x)                  \
-  if (!(x)) {                     \
-    LOG(ERROR) << #x << "failed"; \
-    exit(1);                      \
+using ::tensorflow::Env;
+using ::tensorflow::str_util::Split;
+using ::tensorflow::str_util::SplitAndParseAsFloats;
+using ::tensorflow::str_util::SplitAndParseAsInts;
+
+struct InputLayerInfo {
+  string name;
+  TfLiteType data_type;
+  std::vector<int> shape;
+  // Note that initialization_values is currently unused.
+  std::vector<float> initialization_values;
+};
+
+template <typename T>
+void FillRandomValue(T* ptr, const std::vector<int>& sizes,
+                     const std::function<T()>& random_func) {
+  int num_elements = 1;
+  for (int dim : sizes) {
+    num_elements *= dim;
+  }
+  for (int i = 0; i < num_elements; ++i) {
+    *ptr++ = random_func();
   }
+}
 
-namespace tensorflow {
-namespace benchmark_tflite_model {
+void FillRandomString(tflite::DynamicBuffer* buffer,
+                      const std::vector<int>& sizes,
+                      const std::function<string()>& random_func) {
+  int num_elements = 1;
+  for (int dim : sizes) {
+    num_elements *= dim;
+  }
+  for (int i = 0; i < num_elements; ++i) {
+    auto str = random_func();
+    buffer->AddString(str.data(), str.length());
+  }
+}
 
-std::unique_ptr<tflite::FlatBufferModel> model;
-std::unique_ptr<tflite::Interpreter> interpreter;
+TfLiteType TfLiteTypeFromString(const string& input_layer_type) {
+  if (input_layer_type == "string")
+    return kTfLiteString;
+  else if (input_layer_type == "float")
+    return kTfLiteFloat32;
+  else if (input_layer_type == "uint8")
+    return kTfLiteUInt8;
+  else if (input_layer_type == "int32")
+    return kTfLiteInt32;
+  else if (input_layer_type == "int64")
+    return kTfLiteInt64;
+  else
+    return kTfLiteNoType;
+}
 
-void InitImpl(const std::string& graph, const std::vector<int>& sizes,
-              const std::string& input_layer_type, int num_threads) {
-  CHECK(graph.c_str());
+std::vector<int> ShapeFromTfLiteTensor(TfLiteTensor* t) {
+  std::vector<int> result;
+  result.reserve(t->dims->size);
+  for (int i = 0; i < t->dims->size; ++i) {
+    result.push_back(t->dims->data[i]);
+  }
+  CHECK(!result.empty()) << "Found no shapes in model";
+  return result;
+}
 
-  model = tflite::FlatBufferModel::BuildFromFile(graph.c_str());
+bool CreateInterpreter(const string& graph,
+                       std::unique_ptr<FlatBufferModel>* model,
+                       std::unique_ptr<Interpreter>* interpreter) {
+  *model = tflite::FlatBufferModel::BuildFromFile(graph.c_str());
   if (!model) {
-    LOG(FATAL) << "Failed to mmap model " << graph;
+    std::cerr << "Failed to load model " << graph << std::endl;
+    return false;
   }
-  LOG(INFO) << "Loaded model " << graph;
-  model->error_reporter();
-  LOG(INFO) << "resolved reporter";
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
   tflite::MutableOpResolver resolver;
@@ -63,34 +116,360 @@ void InitImpl(const std::string& graph, const std::vector<int>& sizes,
   tflite::ops::builtin::BuiltinOpResolver resolver;
 #endif
 
-  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
-  if (!interpreter) {
-    LOG(FATAL) << "Failed to construct interpreter";
+  tflite::InterpreterBuilder(*(model->get()), resolver)(interpreter);
+  if (!(*interpreter)) {
+    std::cerr << "Failed to construct interpreter" << std::endl;
+    return false;
   }
 
+  return true;
+}
+
+bool PrepareInterpreter(const std::vector<InputLayerInfo> inputs,
+                        int num_threads, bool use_nnapi,
+                        Interpreter* interpreter) {
   if (num_threads != -1) {
     interpreter->SetNumThreads(num_threads);
   }
 
-  int input = interpreter->inputs()[0];
+  interpreter->UseNNAPI(use_nnapi);
 
-  if (input_layer_type != "string") {
-    interpreter->ResizeInputTensor(input, sizes);
+  // Check that all names and types match
+  for (const InputLayerInfo& input : inputs) {
+    for (int i : interpreter->inputs()) {
+      TfLiteTensor* t = interpreter->tensor(i);
+      CHECK_EQ(t->name, input.name)
+          << "Tensor # " << i << " is named " << t->name
+          << " but flags call it " << input.name;
+      CHECK_EQ(t->type, input.data_type)
+          << "Could not match the type of input tensor " << t->name;
+    }
+  }
+
+  // Resize all non-string tensors.
+  for (const InputLayerInfo& input : inputs) {
+    for (int i : interpreter->inputs()) {
+      TfLiteTensor* t = interpreter->tensor(i);
+      if (t->type != kTfLiteString) {
+        interpreter->ResizeInputTensor(i, input.shape);
+      }
+    }
   }
 
   if (interpreter->AllocateTensors() != kTfLiteOk) {
-    LOG(FATAL) << "Failed to allocate tensors!";
+    std::cerr << "Failed to allocate tensors!" << std::endl;
+    return false;
+  }
+
+  // Set the values of the input tensors.
+  for (int i : interpreter->inputs()) {
+    TfLiteTensor* t = interpreter->tensor(i);
+    std::vector<int> sizes = ShapeFromTfLiteTensor(t);
+
+    // TODO(ahentz): below we ignore the O-th dimension (number of batches).
+    if (t->type == kTfLiteFloat32) {
+      FillRandomValue<float>(
+          interpreter->typed_tensor<float>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<float>(rand()) / RAND_MAX - 0.5f; });
+    } else if (t->type == kTfLiteUInt8) {
+      FillRandomValue<uint8_t>(
+          interpreter->typed_tensor<uint8_t>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<uint8_t>(rand()) % 255; });
+    } else if (t->type == kTfLiteString) {
+      tflite::DynamicBuffer buffer;
+      FillRandomString(&buffer, sizes, []() {
+        return "we're have some friends over saturday to hang out in the yard";
+      });
+      buffer.WriteToTensor(interpreter->tensor(i));
+    } else {
+      std::cerr << "Don't know how to populate tensor " << t->name
+                << " of type " << t->type << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+bool PopulateInputLayerInfo(const string& names_string,
+                            const string& shapes_string,
+                            const string& types_string,
+                            const string& values_string,
+                            std::vector<InputLayerInfo>* info) {
+  std::vector<string> names = Split(names_string, ',');
+  std::vector<string> shapes = Split(shapes_string, ':');
+  std::vector<string> types = Split(types_string, ',');
+  std::vector<string> values = Split(values_string, ':');
+
+  if (names.size() != shapes.size()) {
+    LOG(ERROR) << "The number of items in"
+               << " --input_layer_shape (" << shapes_string << ", with "
+               << shapes.size() << " items)"
+               << " must match the number of items in"
+               << " --input_layer (" << names_string << ", with "
+               << names.size() << " items)."
+               << " For example --input_layer=input1,input2"
+               << " --input_layer_shape=1,224,224,4:1,20";
+    return false;
+  }
+  if (names.size() != types.size()) {
+    LOG(ERROR) << "The number of items in"
+               << " --input_layer_type (" << types_string << ", with "
+               << types.size() << " items)"
+               << " must match the number of items in"
+               << " --input_layer (" << names_string << ", with "
+               << names.size() << " items)."
+               << " For example --input_layer=input1,input2"
+               << " --input_layer_type=float,int";
+    return false;
+  }
+
+  for (int i = 0; i < names.size(); ++i) {
+    info->push_back(InputLayerInfo());
+    InputLayerInfo& input = info->back();
+
+    input.name = names[i];
+
+    input.data_type = TfLiteTypeFromString(types[i]);
+    CHECK(input.data_type != kTfLiteNoType)
+        << types[i] << " was an invalid type";
+
+    CHECK(SplitAndParseAsInts(shapes[i], ',', &input.shape))
+        << "Incorrect size string specified: " << shapes[i];
+    for (int dim : input.shape) {
+      if (dim == -1) {
+        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
+                   << " with the size you want to benchmark with.";
+        return false;
+      }
+    }
+
+    if (i < values.size()) {
+      CHECK(SplitAndParseAsFloats(values[i], ',', &input.initialization_values))
+          << "Incorrect initialization values string specified: " << values[i];
+    }
+  }
+
+  return true;
+}
+
+bool RunBenchmark(Interpreter* interpreter, int64_t* inference_time_us) {
+  const int64_t start_time = Env::Default()->NowMicros();
+
+  if (interpreter->Invoke() != kTfLiteOk) {
+    std::cerr << "Failed to invoke!";
+    return false;
   }
+
+  const int64_t end_time = Env::Default()->NowMicros();
+  *inference_time_us = end_time - start_time;
+  return true;
+}
+
+class Latencies {
+ public:
+  void AddMeasurement(int64_t time_us) {
+    max_ = std::max(time_us, max_);
+    min_ = std::min(time_us, min_);
+    ++count_;
+    sum_ += time_us;
+    squared_sum_ += static_cast<double>(time_us) * time_us;
+  }
+
+  double avg() const {
+    if (count_ == 0) return std::numeric_limits<int64_t>::quiet_NaN();
+    return static_cast<double>(sum_) / count_;
+  }
+
+  int64_t std_deviation() const {
+    if (count_ == 0 || min_ == max_) return 0;
+    return sqrt(squared_sum_ / count_ - avg() * avg());
+  }
+
+  void OutputToStream(std::ostream* stream) const {
+    *stream << "count=" << count_;
+    if (count_ == 0) return;
+    *stream << " min=" << min_ << " max=" << max_;
+    *stream << " avg=" << avg() << " std=" << std_deviation();
+  }
+
+ private:
+  int64_t count_ = 0;
+  int64_t min_ = std::numeric_limits<int64_t>::max();
+  int64_t max_ = std::numeric_limits<int64_t>::min();
+  int64_t sum_ = 0;
+  double squared_sum_ = 0;
+};
+
+bool TimeMultipleRuns(Interpreter* interpreter, double sleep_seconds,
+                      int num_runs, int64* total_time_us) {
+  // Convert the run_delay string into a timespec.
+  timespec req;
+  req.tv_sec = static_cast<time_t>(sleep_seconds);
+  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
+
+  *total_time_us = 0;
+
+  std::cout << "Running benchmark for " << num_runs
+            << " iterations: " << std::endl;
+
+  Latencies latencies;
+  for (int i = 0; i < num_runs; ++i) {
+    int64_t time_us;
+    bool run_status = RunBenchmark(interpreter, &time_us);
+    latencies.AddMeasurement(time_us);
+    *total_time_us += time_us;
+    if (!run_status) {
+      std::cout << "Failed on run " << i << std::endl;
+      return false;
+    }
+
+    // If requested, sleep between runs for an arbitrary amount of time.
+    // This can be helpful to determine the effect of mobile processor
+    // scaling and thermal throttling.
+    if (sleep_seconds > 0.0) {
+#ifdef PLATFORM_WINDOWS
+      Sleep(sleep_seconds * 1000);
+#else
+      nanosleep(&req, nullptr);
+#endif
+    }
+  }
+  latencies.OutputToStream(&std::cout);
+  std::cout << std::endl;
+
+  return true;
 }
 
 int Main(int argc, char** argv) {
-  InitImpl("", {}, "", 1);
+  using tensorflow::Flag;
+  using tensorflow::Flags;
+
+  string graph;               // e.g.: /data/local/tmp/tfl_inception-v1_model.fb
+  string input_layer_string;  // e.g.: input
+  string input_layer_shape_string;  // e.g.: 1,224,224,3
+  string input_layer_type_string;   // e.g.: float
+  string input_layer_values_string;
+  string output_layer_string;  // e.g.: output
+  int num_runs = 50;
+  string run_delay = "-1.0";
+  int num_threads = -1;
+  string benchmark_name = "";
+  string output_prefix = "";
+  int warmup_runs = 1;
+  bool use_nnapi = false;
+
+  std::vector<Flag> flag_list = {
+      Flag("graph", &graph, "graph file name"),
+      // All the following flags are optional, but can be used in order
+      // to benchmark different input shapes.
+      Flag("input_layer", &input_layer_string, "input layer names"),
+      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
+      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
+      Flag("input_layer_values", &input_layer_values_string,
+           "values to initialize the inputs with"),
+      Flag("output_layer", &output_layer_string, "output layer name"),
+      Flag("num_runs", &num_runs, "number of runs"),
+      Flag("run_delay", &run_delay, "delay between runs in seconds"),
+      Flag("num_threads", &num_threads, "number of threads"),
+      Flag("benchmark_name", &benchmark_name, "benchmark name"),
+      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
+      Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
+      Flag("use_nnapi", &use_nnapi, "use nnapi api"),
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  if (!parse_result) {
+    std::cerr << usage << std::endl;
+    return -1;
+  }
+
+  std::cout << "Graph: [" << graph << "]" << std::endl;
+  if (!input_layer_string.empty()) {
+    std::cout << "Input layers: [" << input_layer_string << "]" << std::endl;
+    std::cout << "Input shapes: [" << input_layer_shape_string << "]"
+              << std::endl;
+    std::cout << "Input types: [" << input_layer_type_string << "]"
+              << std::endl;
+  }
+  if (!output_layer_string.empty()) {
+    std::cout << "Output layers: [" << output_layer_string << "]" << std::endl;
+  }
+  std::cout << "Num runs: [" << num_runs << "]" << std::endl;
+  std::cout << "Inter-run delay (seconds): [" << run_delay << "]" << std::endl;
+  std::cout << "Num threads: [" << num_threads << "]" << std::endl;
+  if (!benchmark_name.empty()) {
+    std::cout << "Benchmark name: [" << benchmark_name << "]" << std::endl;
+    std::cout << "Output prefix: [" << output_prefix << "]" << std::endl;
+  }
+  std::cout << "Warmup runs: [" << warmup_runs << "]" << std::endl;
+  std::cout << "Use nnapi : [" << use_nnapi << "]" << std::endl;
+
+  if (graph.empty()) {
+    std::cout
+        << "Please specify the name of your TF Lite input file with --graph"
+        << std::endl;
+    return -1;
+  }
+
+  std::vector<InputLayerInfo> inputs;
+  if (!PopulateInputLayerInfo(input_layer_string, input_layer_shape_string,
+                              input_layer_type_string,
+                              input_layer_values_string, &inputs)) {
+    return -1;
+  }
+
+  int64 initialization_start_us = Env::Default()->NowMicros();
+
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  if (!CreateInterpreter(graph, &model, &interpreter)) {
+    return -1;
+  }
+  if (!PrepareInterpreter(inputs, num_threads, use_nnapi, interpreter.get())) {
+    return -1;
+  }
+
+  int64 initialization_end_us = Env::Default()->NowMicros();
+
+  const double initialization_time_s =
+      (initialization_end_us - initialization_start_us) / 1000000.0f;
+  std::cout << "Initialized session in " << initialization_time_s << "s"
+            << std::endl;
+
+  const double sleep_seconds = std::strtod(run_delay.c_str(), nullptr);
+
+  // If requested, run through the graph first to preinitialize everything
+  // before the benchmarking runs.
+  int64 warmup_time_us = 0;
+  if (warmup_runs > 0) {
+    if (!TimeMultipleRuns(interpreter.get(), sleep_seconds, warmup_runs,
+                          &warmup_time_us)) {
+      std::cerr << "Warmup failed" << std::endl;
+      return -1;
+    }
+  }
+
+  // Capture overall inference time without stat logging overhead. This is the
+  // timing data that can be compared to other libaries.
+  int64 no_stat_time_us = 0;
+  if (!TimeMultipleRuns(interpreter.get(), sleep_seconds, num_runs,
+                        &no_stat_time_us)) {
+    std::cerr << "Timing failed." << std::endl;
+    return -1;
+  }
+
+  std::cout << "Average inference timings in us: " << no_stat_time_us / num_runs
+            << " , Warmup: "
+            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
+            << std::endl;
+
   return 0;
 }
 
-}  // namespace benchmark_tflite_model
-}  // namespace tensorflow
+}  // namespace tflite
 
-int main(int argc, char** argv) {
-  return tensorflow::benchmark_tflite_model::Main(argc, argv);
-}
+int main(int argc, char** argv) { return ::tflite::Main(argc, argv); }
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
index 59c74205f0a311ec12ff87f46622041605fb493b..8818a7dc85d9ffdc1da450fb389d5ed11139bc31 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -148,11 +148,52 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
   // TODO(yichengfan): verify quantized tensors.
 }
 
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+bool VerifyOperators(const Vector<Offset<Operator>>& operators,
+                     ErrorReporter* error_reporter) {
+  for (const auto& op : operators) {
+    if (!op->inputs()) {
+      ReportError(error_reporter, "Missing 'inputs' for operator.");
+      return false;
+    }
+    if (!op->outputs()) {
+      ReportError(error_reporter, "Missing 'outputs' for operator.");
+      return false;
+    }
+  }
+  return true;
+}
+
+bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
+  if (!model.subgraphs()) {
+    ReportError(error_reporter, "Missing 'subgraphs' section.");
+    return false;
+  }
+  for (const auto& subgraph : *model.subgraphs()) {
+    if (!subgraph->operators()) {
+      ReportError(error_reporter, "Missing 'operators' section in subgraph.");
+      return false;
+    }
+
+    if (!VerifyOperators(*subgraph->operators(), error_reporter)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Verifies tensors have valid properties and legit buffer if set.
 bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
   if (!model.subgraphs()) {
     return true;
   }
+  if (!model.buffers()) {
+    ReportError(error_reporter, "Missing 'buffers' section.");
+    return false;
+  }
+
   for (const auto& subgraph : *model.subgraphs()) {
     if (!subgraph->tensors()) {
       continue;
@@ -167,19 +208,23 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
         return false;
       }
       auto* buffer = model.buffers()->Get(tensor->buffer());
-      if (!buffer || !buffer->data()) {
+      if (!buffer) {
         ReportError(error_reporter, "Tensor buffer %d not set",
                     tensor->buffer());
         return false;
       }
 
-      if (tensor->type() == TensorType_STRING) {
-        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
-          return false;
-        }
-      } else {
-        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
-          return false;
+      // Many transient tensors don't have data in the flatbuffer. Their
+      // buffers will be allocated by the interpreter at run-time.
+      if (buffer->data()) {
+        if (tensor->type() == TensorType_STRING) {
+          if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+            return false;
+          }
+        } else {
+          if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
+            return false;
+          }
         }
       }
     }
@@ -193,6 +238,13 @@ bool VerifyOps(const Model& model, const OpResolver& resolver,
     return true;
   }
   for (const auto& opcode : *model.operator_codes()) {
+    if (opcode->builtin_code() < BuiltinOperator_MIN ||
+        opcode->builtin_code() > BuiltinOperator_MAX) {
+      ReportError(error_reporter, "Operator id '%d' is out of range.",
+                  opcode->builtin_code());
+      return false;
+    }
+
     if (opcode->builtin_code() == BuiltinOperator_CUSTOM) {
       if (!resolver.FindOp(opcode->custom_code()->c_str())) {
         ReportError(error_reporter, "Unsupported custom op: %s",
@@ -223,6 +275,9 @@ bool Verify(const void* buf, size_t len, const OpResolver& resolver,
     ReportError(error_reporter, "Invalid model version %d", model->version());
     return false;
   }
+  if (!VerifySubGraphs(*model, error_reporter)) {
+    return false;
+  }
   if (!VerifyTensors(*model, error_reporter)) {
     return false;
   }
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
index c2ee11215c861ed7b27696a8d786bb6e2a48e930..b7ce4e830576af14002d6bd9080af1da5764b1c9 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -23,6 +23,21 @@ limitations under the License.
 
 namespace tflite {
 
+class AlwaysTrueResolver : public OpResolver {
+ public:
+  AlwaysTrueResolver() {}
+  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override {
+    static TfLiteRegistration null_registration = {nullptr, nullptr, nullptr,
+                                                   nullptr};
+    return &null_registration;
+  }
+  TfLiteRegistration* FindOp(const char* op) const override {
+    static TfLiteRegistration null_registration = {nullptr, nullptr, nullptr,
+                                                   nullptr};
+    return &null_registration;
+  }
+};
+
 // Verifies the integrity of a Tensorflow Lite flatbuffer model file.
 // Currently, it verifies:
 // * The file is following a legit flatbuffer schema.
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index b3e611f999b2837efbf8876bd989db44c408b8c7..03b93afe3ed04b4bff13bc01d7c7c8e9fae9bdf3 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -113,8 +113,8 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                     MutableOpResolver{}, DefaultErrorReporter()));
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      MutableOpResolver{}, DefaultErrorReporter()));
 }
 
 TEST(VerifyModel, TestSimpleModel) {
diff --git a/tensorflow/contrib/lite/util.cc b/tensorflow/contrib/lite/util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb4af07d060cac3a6a4e01c7d625b6db5241f10d
--- /dev/null
+++ b/tensorflow/contrib/lite/util.cc
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/util.h"
+
+namespace tflite {
+
+TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input) {
+  return ConvertArrayToTfLiteIntArray(input.size(), input.data());
+}
+
+TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims) {
+  TfLiteIntArray* output = TfLiteIntArrayCreate(rank);
+  for (size_t i = 0; i < rank; i++) {
+    output->data[i] = dims[i];
+  }
+  return output;
+}
+
+bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
+                                 const int* b) {
+  if (!a) return false;
+  if (a->size != b_size) return false;
+  for (int i = 0; i < a->size; ++i) {
+    if (a->data[i] != b[i]) return false;
+  }
+  return true;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/util.h b/tensorflow/contrib/lite/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..a34db35823104414cce028b9119397da085d05b1
--- /dev/null
+++ b/tensorflow/contrib/lite/util.h
@@ -0,0 +1,40 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file provides general C++ utility functions in TFLite.
+// For example: Converting between `TfLiteIntArray`, `std::vector` and
+// Flatbuffer vectors. These functions can't live in `context.h` since it's pure
+// C.
+
+#ifndef TENSORFLOW_CONTRIB_LITE_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_UTIL_H_
+
+#include <vector>
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// Converts a `std::vector` to a `TfLiteIntArray`.
+TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input);
+
+TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims);
+
+// Checks whether a `TfLiteIntArray` and an int array have matching elements.
+bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
+                                 const int* b);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_UTIL_H_
diff --git a/tensorflow/contrib/lite/util_test.cc b/tensorflow/contrib/lite/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04579c53aa4835c47d812c89a1554a0d2f2f30b8
--- /dev/null
+++ b/tensorflow/contrib/lite/util_test.cc
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/util.h"
+
+namespace tflite {
+namespace {
+
+TEST(ConvertVectorToTfLiteIntArray, TestWithVector) {
+  std::vector<int> input = {1, 2};
+  TfLiteIntArray* output = ConvertVectorToTfLiteIntArray(input);
+  ASSERT_NE(output, nullptr);
+  EXPECT_EQ(output->size, 2);
+  EXPECT_EQ(output->data[0], 1);
+  EXPECT_EQ(output->data[1], 2);
+  TfLiteIntArrayFree(output);
+}
+
+TEST(ConvertVectorToTfLiteIntArray, TestWithEmptyVector) {
+  std::vector<int> input;
+  TfLiteIntArray* output = ConvertVectorToTfLiteIntArray(input);
+  ASSERT_NE(output, nullptr);
+  EXPECT_EQ(output->size, 0);
+  TfLiteIntArrayFree(output);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index 8ca03f4193f260ce32f942ccaf76a8260b282156..e3928a82a2d453fdd36cb861ce178a776574269c 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -28,7 +28,7 @@ py_library(
 
 tf_py_test(
     name = "lookup_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["lookup_ops_test.py"],
     additional_deps = [
         ":lookup_py",
@@ -46,16 +46,5 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     grpc_enabled = True,
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index a430dac4ec43ce31f0b5aaae5e7b0b51d25c9632..4942d941765951ed2ee5555138e91a202b96bf7c 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -105,7 +105,7 @@ def index_table_from_tensor(mapping,
   ...
   tf.tables_initializer().run()
 
-  ids.eval()  ==> [0, 1, 4, 2]
+  ids.eval()  ==> [0, 1, 3, 2]
   ```
 
   Args:
@@ -298,7 +298,7 @@ class MutableHashTable(LookupInterface):
   table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.string,
                                              value_dtype=tf.int64,
                                              default_value=-1)
-  table.insert(keys, values)
+  sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
   ```
@@ -341,23 +341,21 @@ class MutableHashTable(LookupInterface):
     # training to work correctly. Use the node name if no shared_name has been
     # explicitly specified.
     use_node_name_sharing = checkpoint and shared_name is None
-    # pylint: disable=protected-access
     if self._default_value.get_shape().ndims == 0:
-      self._table_ref = gen_lookup_ops._mutable_hash_table_v2(
+      self._table_ref = gen_lookup_ops.mutable_hash_table_v2(
           shared_name=shared_name,
           use_node_name_sharing=use_node_name_sharing,
           key_dtype=key_dtype,
           value_dtype=value_dtype,
           name=name)
     else:
-      self._table_ref = gen_lookup_ops._mutable_hash_table_of_tensors_v2(
+      self._table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
           shared_name=shared_name,
           use_node_name_sharing=use_node_name_sharing,
           key_dtype=key_dtype,
           value_dtype=value_dtype,
           value_shape=self._default_value.get_shape(),
           name=name)
-    # pylint: enable=protected-access
     super(MutableHashTable, self).__init__(key_dtype, value_dtype,
                                            self._table_ref.op.name.split(
                                                "/")[-1])
@@ -378,9 +376,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as name:
       with ops.colocate_with(self._table_ref):
-
-        # pylint: disable=protected-access
-        return gen_lookup_ops._lookup_table_size_v2(self._table_ref, name=name)
+        return gen_lookup_ops.lookup_table_size_v2(self._table_ref, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -406,8 +402,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         (self._table_ref, keys, self._default_value)) as name:
       with ops.colocate_with(self._table_ref):
-        # pylint: disable=protected-access
-        values = gen_lookup_ops._lookup_table_find_v2(
+        values = gen_lookup_ops.lookup_table_find_v2(
             self._table_ref, keys, self._default_value, name=name)
 
         values.set_shape(keys.get_shape().concatenate(self._value_shape))
@@ -437,7 +432,7 @@ class MutableHashTable(LookupInterface):
                         [self._table_ref, keys, values]) as name:
       with ops.colocate_with(self._table_ref):
         # pylint: disable=protected-access
-        op = gen_lookup_ops._lookup_table_insert_v2(
+        op = gen_lookup_ops.lookup_table_insert_v2(
             self._table_ref, keys, values, name=name)
     return op
 
@@ -454,8 +449,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
                         [self._table_ref]) as name:
       with ops.colocate_with(self._table_ref):
-        # pylint: disable=protected-access
-        exported_keys, exported_values = gen_lookup_ops._lookup_table_export_v2(
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
             self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
@@ -477,7 +471,7 @@ class MutableHashTable(LookupInterface):
     def restore(self, restored_tensors, unused_restored_shapes):
       # pylint: disable=protected-access
       with ops.colocate_with(self.op._table_ref):
-        return gen_lookup_ops._lookup_table_import_v2(
+        return gen_lookup_ops.lookup_table_import_v2(
             self.op._table_ref, restored_tensors[0], restored_tensors[1])
 
 
@@ -500,7 +494,7 @@ class MutableDenseHashTable(LookupInterface):
                                                   value_dtype=tf.int64,
                                                   default_value=-1,
                                                   empty_key=0)
-  table.insert(keys, values)
+  sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
   ```
@@ -551,8 +545,7 @@ class MutableDenseHashTable(LookupInterface):
     # explicitly specified.
     use_node_name_sharing = checkpoint and shared_name is None
     empty_key = ops.convert_to_tensor(empty_key, dtype=key_dtype)
-    # pylint: disable=protected-access
-    self._table_ref = gen_lookup_ops._mutable_dense_hash_table_v2(
+    self._table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
         empty_key=empty_key,
         shared_name=shared_name,
         use_node_name_sharing=use_node_name_sharing,
@@ -560,7 +553,6 @@ class MutableDenseHashTable(LookupInterface):
         value_shape=self._value_shape,
         initial_num_buckets=initial_num_buckets,
         name=name)
-    # pylint: enable=protected-access
     super(MutableDenseHashTable, self).__init__(
         key_dtype, value_dtype, self._table_ref.op.name.split("/")[-1])
 
@@ -580,8 +572,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as name:
       with ops.colocate_with(self._table_ref):
-        # pylint: disable=protected-access
-        return gen_lookup_ops._lookup_table_size_v2(self._table_ref, name=name)
+        return gen_lookup_ops.lookup_table_size_v2(self._table_ref, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -607,8 +598,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         [self._table_ref, keys]) as name:
       with ops.colocate_with(self._table_ref):
-        # pylint: disable=protected-access
-        values = gen_lookup_ops._lookup_table_find_v2(
+        values = gen_lookup_ops.lookup_table_find_v2(
             self._table_ref, keys, self._default_value, name=name)
 
     if keys.get_shape().ndims is not None and keys.get_shape().ndims > 0:
@@ -640,8 +630,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
       with ops.colocate_with(self._table_ref):
-        # pylint: disable=protected-access
-        op = gen_lookup_ops._lookup_table_insert_v2(
+        op = gen_lookup_ops.lookup_table_insert_v2(
             self._table_ref, keys, values, name=name)
       return op
 
@@ -658,8 +647,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
                         [self._table_ref]) as name:
       with ops.colocate_with(self._table_ref):
-        # pylint: disable=protected-access
-        exported_keys, exported_values = gen_lookup_ops._lookup_table_export_v2(
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
             self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
@@ -681,5 +669,5 @@ class MutableDenseHashTable(LookupInterface):
     def restore(self, restored_tensors, unused_restored_shapes):
       # pylint: disable=protected-access
       with ops.colocate_with(self.op._table_ref):
-        return gen_lookup_ops._lookup_table_import_v2(
+        return gen_lookup_ops.lookup_table_import_v2(
             self.op._table_ref, restored_tensors[0], restored_tensors[1])
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index f681b7b132750ef80aa56f25143418fbc4eaa1bb..5d4682ec9f4b8c5864383bd1d2f4c0b41a11baad 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -58,6 +58,12 @@ class HashTableOpTest(test.TestCase):
       result = output.eval()
       self.assertAllEqual([0, 1, -1], result)
 
+      exported_keys_tensor, exported_values_tensor = table.export()
+
+      self.assertItemsEqual([b"brain", b"salad", b"surgery"],
+                            exported_keys_tensor.eval())
+      self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval())
+
   def testHashTableFindHighRank(self):
     with self.test_session():
       default_val = -1
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 56942115213a762e532971a81da768b53b8537d8..728f75f8ef1eb3b107dbd0ab4ffbecd63787bf3e 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -97,15 +97,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 8c3a8afe7a0f6f5ad9ceae566288ba60be73d339..bdad34a665e47a4e060fcaddfffecfdc876a8fb0 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 
 __all__ = [
     "absolute_difference", "add_loss", "cosine_distance",
@@ -651,11 +652,9 @@ def cosine_distance(predictions,
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup(
+      "axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   with ops.name_scope(scope, "cosine_distance_loss",
                       [predictions, labels, weights]) as scope:
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index 6842bc38eb108b46cc3eff715c9cbc74f991308b..de76acb51ffe985162a66c617b266f47c5216b19 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -50,16 +50,12 @@ def pairwise_distance(feature, squared=False):
     pairwise_distances: 2-D Tensor of size [number of data, number of data].
   """
   pairwise_distances_squared = math_ops.add(
+      math_ops.reduce_sum(math_ops.square(feature), axis=[1], keepdims=True),
       math_ops.reduce_sum(
-          math_ops.square(feature),
-          axis=[1],
-          keepdims=True),
-      math_ops.reduce_sum(
-          math_ops.square(
-              array_ops.transpose(feature)),
+          math_ops.square(array_ops.transpose(feature)),
           axis=[0],
-          keepdims=True)) - 2.0 * math_ops.matmul(
-              feature, array_ops.transpose(feature))
+          keepdims=True)) - 2.0 * math_ops.matmul(feature,
+                                                  array_ops.transpose(feature))
 
   # Deal with numerical inaccuracies. Set small negatives to zero.
   pairwise_distances_squared = math_ops.maximum(pairwise_distances_squared, 0.0)
@@ -134,8 +130,8 @@ def masked_maximum(data, mask, dim=1):
   """
   axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
   masked_maximums = math_ops.reduce_max(
-      math_ops.multiply(
-          data - axis_minimums, mask), dim, keepdims=True) + axis_minimums
+      math_ops.multiply(data - axis_minimums, mask), dim,
+      keepdims=True) + axis_minimums
   return masked_maximums
 
 
@@ -153,8 +149,8 @@ def masked_minimum(data, mask, dim=1):
   """
   axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
   masked_minimums = math_ops.reduce_min(
-      math_ops.multiply(
-          data - axis_maximums, mask), dim, keepdims=True) + axis_maximums
+      math_ops.multiply(data - axis_maximums, mask), dim,
+      keepdims=True) + axis_maximums
   return masked_minimums
 
 
@@ -202,8 +198,7 @@ def triplet_semihard_loss(labels, embeddings, margin=1.0):
   mask_final = array_ops.reshape(
       math_ops.greater(
           math_ops.reduce_sum(
-              math_ops.cast(
-                  mask, dtype=dtypes.float32), 1, keepdims=True),
+              math_ops.cast(mask, dtype=dtypes.float32), 1, keepdims=True),
           0.0), [batch_size, batch_size])
   mask_final = array_ops.transpose(mask_final)
 
@@ -450,8 +445,8 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
   #     this is to take the max only among negatives.
   row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
   row_negative_maximums = math_ops.reduce_max(
-      math_ops.multiply(
-          diff - row_minimums, mask), 1, keepdims=True) + row_minimums
+      math_ops.multiply(diff - row_minimums, mask), 1,
+      keepdims=True) + row_minimums
 
   # Compute the loss.
   # Keep track of matrix of maximums where M_ij = max(m_i, m_j)
@@ -467,10 +462,11 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
       array_ops.transpose(max_elements), [-1, 1])
 
   loss_exp_left = array_ops.reshape(
-      math_ops.reduce_sum(math_ops.multiply(
-          math_ops.exp(
-              diff_tiled - max_elements_vect),
-          mask_tiled), 1, keepdims=True), [batch_size, batch_size])
+      math_ops.reduce_sum(
+          math_ops.multiply(
+              math_ops.exp(diff_tiled - max_elements_vect), mask_tiled),
+          1,
+          keepdims=True), [batch_size, batch_size])
 
   loss_mat = max_elements + math_ops.log(
       loss_exp_left + array_ops.transpose(loss_exp_left))
@@ -715,7 +711,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
       candidate_scores, margin_multiplier * nmi_scores)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   return candidate_ids[argmax_index]
 
@@ -815,7 +811,7 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
   candidate_scores = math_ops.add(scores_fac, margin_multiplier * scores_margin)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
   chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
diff --git a/tensorflow/contrib/makefile/BUILD b/tensorflow/contrib/makefile/BUILD
index 701eeb44fe3f814cb3fb1cedd8618753946cc3e5..1abb46f4d41d2a9cc60d0cd9de865070689ddbfc 100644
--- a/tensorflow/contrib/makefile/BUILD
+++ b/tensorflow/contrib/makefile/BUILD
@@ -3,12 +3,3 @@
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//visibility:private"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 81327407d44b4317b7aecb964a689a35aa35c163..1a1ab54a53dd5866ca8357067846c002c5d5e9c1 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -89,6 +89,7 @@ HOST_INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -125,7 +126,9 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text
 # The list of dependencies is derived from the Bazel build file by running
 # the gen_file_lists.sh script on a system with a working Bazel setup.
 PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt)
-PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt)
+PROTO_TEXT_PB_CC_LIST := \
+	$(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \
+	$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc)
 PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt)
 
 # Locations of the intermediate files proto_text generates.
@@ -171,6 +174,7 @@ INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -326,6 +330,7 @@ $(MARCH_OPTION) \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -603,6 +608,7 @@ $(wildcard tensorflow/core/platform/*/*.cc) \
 $(wildcard tensorflow/core/platform/*/*/*.cc) \
 $(wildcard tensorflow/core/util/*.cc) \
 $(wildcard tensorflow/core/util/*/*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
@@ -677,6 +683,7 @@ endif  # TEGRA
 TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 # Add in any extra files that don't fit the patterns easily
 TF_CC_SRCS += tensorflow/contrib/makefile/downloads/fft2d/fftsg.c
+TF_CC_SRCS += tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
 # Also include the op and kernel definitions.
 TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
 PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt)
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 995230dfa848532dc2a50b85f58d19ba264f293e..6c3b02e12b3082be8bfcc316c4c6122931eb5f76 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -194,6 +194,8 @@ with:
 srcs = glob(["libs/arm64-v8a/*.so"]),
 ```
 
+If you are building for Android TV (Shield TV devices), replace "portrait" with "landscape" for android:screenOrientation in all four activities in tensorflow/examples/android/AndroidManifest.xml
+
 Then run:
 ```bash
 # Create dir for native libs
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index 2d9979183975e6a17527b40ef5ee1795ced44a7b..0a458a27b3ac9b1a24b0f42de2f0166d515e8cd9 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -80,10 +80,9 @@ if [[ ! -z "${OPTIMIZE_FOR_GRAPH}" ]]; then
         fi
     else
         echo "${PRNT_SLCTV_BIN} found. Using it"
-        ${PRNT_SLCTV_BIN} --graphs=${OPTIMIZE_FOR_GRAPH} > ${TOP_SRCDIR}/tensorflow/core/framework/ops_to_register.h
-
     fi
 
+    ${PRNT_SLCTV_BIN} --graphs=${OPTIMIZE_FOR_GRAPH} > ${TOP_SRCDIR}/tensorflow/core/framework/ops_to_register.h
 fi
 
 if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
@@ -111,7 +110,7 @@ if [[ -z "${BUILD_ARCH}" ]]; then
     TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
 else
     # arch specified so build just that
-    TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios -a ${BUILD_ARCH}`
+    TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios -a "${BUILD_ARCH}"`
 fi
 export HOST_NSYNC_LIB TARGET_NSYNC_LIB
 
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index 7927997678f077a716d81749561068f259d9744f..e8c6edd7ba9aa6a45d956d1d5655b2809d8d2309 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -109,17 +109,18 @@ for arch in $archs; do
         linux)  makefile='
                         CC=${CC_PREFIX} g++
                         PLATFORM_CPPFLAGS=-DNSYNC_USE_CPP11_TIMEPOINT -DNSYNC_ATOMIC_CPP11 \
+                                          -I../../platform/c++11.futex \
                                           -I../../platform/c++11 -I../../platform/gcc \
                                           -I../../platform/posix -pthread
                         PLATFORM_CFLAGS=-std=c++11 -Werror -Wall -Wextra -pedantic
                         PLATFORM_LDFLAGS=-pthread
                         MKDEP=${CC} -M -std=c++11
-                        PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
+                        PLATFORM_C=../../platform/linux/src/nsync_semaphore_futex.c \
                                    ../../platform/c++11/src/per_thread_waiter.cc \
                                    ../../platform/c++11/src/yield.cc \
                                    ../../platform/c++11/src/time_rep_timespec.cc \
                                    ../../platform/c++11/src/nsync_panic.cc
-                        PLATFORM_OBJS=nsync_semaphore_mutex.o per_thread_waiter.o yield.o \
+                        PLATFORM_OBJS=nsync_semaphore_futex.o per_thread_waiter.o yield.o \
                                       time_rep_timespec.o nsync_panic.o
                         TEST_PLATFORM_C=../../platform/c++11/src/start_thread.cc
                         TEST_PLATFORM_OBJS=start_thread.o
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 4ae18b2cef28335a90bbc967529c0cf76b0a5da2..eff9081e35c285027c764c5bdbaf14f78bc5f512 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,14 +27,17 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
+FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
-CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -87,6 +90,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
 download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
+download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
 download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index d56e388477db6239cfb577f7e2754321ff33bd82..76428bc1d4e682e000998a6e28fc290e218c2341 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -12,11 +12,13 @@ tensorflow/core/platform/posix/env.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/env_time.cc
 tensorflow/core/platform/file_system.cc
+tensorflow/core/platform/file_system_helper.cc
 tensorflow/core/platform/env.cc
 tensorflow/core/platform/env_time.cc
 tensorflow/core/platform/setround.cc
 tensorflow/core/platform/denormal.cc
 tensorflow/core/platform/default/tracing.cc
+tensorflow/core/platform/default/mutex.cc
 tensorflow/core/platform/default/logging.cc
 tensorflow/core/platform/cpu_info.cc
 tensorflow/core/lib/wav/wav_io.cc
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 5a812af4e95fe7a05b9c2634b0cc1d860fb7f619..d4c3f2eda8be0c70e961afe582983b9f73769c77 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -151,6 +151,7 @@ tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
 tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
+tensorflow/core/kernels/dense_update_functor.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
 tensorflow/core/kernels/decode_wav_op.cc
@@ -228,6 +229,11 @@ tensorflow/core/kernels/cast_op_impl_int64.cc
 tensorflow/core/kernels/cast_op_impl_int8.cc
 tensorflow/core/kernels/cast_op_impl_uint16.cc
 tensorflow/core/kernels/cast_op_impl_uint8.cc
+tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+tensorflow/core/kernels/boosted_trees/resource_ops.cc
+tensorflow/core/kernels/boosted_trees/resources.cc
+tensorflow/core/kernels/boosted_trees/stats_ops.cc
+tensorflow/core/kernels/boosted_trees/training_ops.cc
 tensorflow/core/kernels/bias_op.cc
 tensorflow/core/kernels/bcast_ops.cc
 tensorflow/core/kernels/batch_norm_op.cc
@@ -258,6 +264,7 @@ tensorflow/core/kernels/requantize.cc
 tensorflow/core/kernels/remote_fused_graph_execute_op.cc
 tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
 tensorflow/core/kernels/batch_matmul_op_real.cc
+tensorflow/core/kernels/random_op.cc
 tensorflow/core/ops/training_ops.cc
 tensorflow/core/ops/string_ops.cc
 tensorflow/core/ops/state_ops.cc
@@ -285,6 +292,7 @@ tensorflow/core/ops/data_flow_ops.cc
 tensorflow/core/ops/ctc_ops.cc
 tensorflow/core/ops/control_flow_ops.cc
 tensorflow/core/ops/candidate_sampling_ops.cc
+tensorflow/core/ops/boosted_trees_ops.cc
 tensorflow/core/ops/array_ops.cc
 tensorflow/core/ops/array_grad.cc
 tensorflow/core/kernels/spacetobatch_functor.cc
@@ -294,3 +302,5 @@ tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
 tensorflow/core/ops/audio_ops.cc
+tensorflow/core/kernels/decode_proto_op.cc
+tensorflow/core/kernels/encode_proto_op.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index d569bde637b20e0ca55c48c616855332abd9fb13..1f254692d7a8fb7af3ce795428464c48f5997a54 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -18,6 +18,7 @@ tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
+tensorflow/core/kernels/boosted_trees/boosted_trees.proto
 tensorflow/core/framework/versions.proto
 tensorflow/core/framework/variable.proto
 tensorflow/core/framework/types.proto
diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD
index 72424c32e7b756e6c50965f38135869e03ba730f..63843b993c16363a80b64622af665aaa64e05830 100644
--- a/tensorflow/contrib/memory_stats/BUILD
+++ b/tensorflow/contrib/memory_stats/BUILD
@@ -79,15 +79,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/meta_graph_transform/BUILD b/tensorflow/contrib/meta_graph_transform/BUILD
index 4b5b1c3e15d36b7602791856416ece54d24798b2..24400789f8a937c88b86141704f7977494c1495e 100644
--- a/tensorflow/contrib/meta_graph_transform/BUILD
+++ b/tensorflow/contrib/meta_graph_transform/BUILD
@@ -59,15 +59,3 @@ filegroup(
         "**/*.py",
     ]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index ff88b4fa841673fc52b9f6fdc5ca43d30c44bbfd..c35e60a5547c23e5f9c7b7fc2a0702d8a7decf30 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Apply graph_transforms tool to MetaGraphDefs."""
+"""Apply graph_transforms tool to MetaGraphDefs.
+
+@@meta_graph_transform
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -30,7 +33,7 @@ from tensorflow.python.framework import importer as _importer
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.saved_model import constants as _saved_model_constants
 from tensorflow.python.training import saver as _saver_lib
-from tensorflow.python.util import compat
+from tensorflow.python.util import compat as _compat
 from tensorflow.tools import graph_transforms as _graph_transforms
 
 
@@ -161,7 +164,7 @@ def _clean_save_and_restore(graph_def, op, removed_op_names):
   shapes = []
   dtypes = []
   for index, value in enumerate(name_op_value_tensor.string_val):
-    if not _is_removed(compat.as_str(value), removed_op_names):
+    if not _is_removed(_compat.as_str(value), removed_op_names):
       names.append(value)
       shapes.append(shape_op_value_tensor.string_val[index])
       dtypes.append(op.attr['dtypes'].list.type[index])
@@ -348,7 +351,7 @@ def _freeze_graph_with_def_protos(input_graph_def, output_node_names,
                                   input_saver_def, input_checkpoint):
   """Converts all variables in a graph and checkpoint into constants.
 
-  During this process, we need to retain certain initialzer nodes (e.g. table
+  During this process, we need to retain certain initializer nodes (e.g. table
   initializer nodes). Instead of determining which dependencies
   of the shared initializer node (e.g. group_deps) to keep, we
   reconstruct the connections between the individual initializer nodes and
@@ -651,7 +654,7 @@ def _is_removed_mentioned(s, removed_op_names):
   # /foo/bar. This regex ensures that we handle these two nodes
   # as separate entities.  It matches on nodes having names in the form of
   # '/foo/bar_x' as well as nodes having names in the form of 'foo.'
-  s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', compat.as_str_any(s))
+  s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', _compat.as_str_any(s))
   for removed_op_name in removed_op_names:
     for s_name in s_names:
       if s_name.endswith(removed_op_name):
@@ -737,9 +740,9 @@ def meta_graph_transform(
   for tag in tags:
     meta_graph_def.meta_info_def.tags.append(tag)
 
-  base_op_names = [compat.as_str(node.name)
+  base_op_names = [_compat.as_str(node.name)
                    for node in base_meta_graph_def.graph_def.node]
-  retained_op_names = [compat.as_str(node.name)
+  retained_op_names = [_compat.as_str(node.name)
                        for node in meta_graph_def.graph_def.node]
   removed_op_names = set(base_op_names) - set(retained_op_names)
 
diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index e90c525113348532a3ebdadde7e712bf2d98cee9..e050f3c8d4fc61adfdd3d15869e533ed2b51c4a8 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -77,7 +77,7 @@ py_test(
 py_test(
     name = "metric_ops_test",
     srcs = ["python/ops/metric_ops_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["noasan"],  # times out b/63678675
     deps = [
@@ -97,14 +97,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index de02dc8f457364450929776035829d86035d706b..5effea3596bb83a08e0a8627e411684262aef5f7 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -71,6 +71,7 @@ See the @{$python/contrib.metrics} guide.
 @@count
 @@precision_recall_at_equal_thresholds
 @@recall_at_precision
+@@precision_at_recall
 
 """
 from __future__ import absolute_import
@@ -87,6 +88,7 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics
 from tensorflow.contrib.metrics.python.ops.metric_ops import auc_with_confidence_intervals
 from tensorflow.contrib.metrics.python.ops.metric_ops import cohen_kappa
 from tensorflow.contrib.metrics.python.ops.metric_ops import count
+from tensorflow.contrib.metrics.python.ops.metric_ops import precision_at_recall
 from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision
 from tensorflow.contrib.metrics.python.ops.metric_ops import sparse_recall_at_top_k
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 31e274c5fd7c670458b1b40a4f58c668a23776c7..00a933e5e0c537033573b225d43581f74557b240 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -63,6 +63,8 @@ def _safe_div(numerator, denominator, name):
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
                              labels,
                              weights=None,
@@ -107,6 +109,8 @@ def streaming_true_positives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_negatives(predictions,
                              labels,
                              weights=None,
@@ -151,6 +155,8 @@ def streaming_true_negatives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_positives(predictions,
                               labels,
                               weights=None,
@@ -195,6 +201,8 @@ def streaming_false_positives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_negatives(predictions,
                               labels,
                               weights=None,
@@ -238,6 +246,7 @@ def streaming_false_negatives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.mean')
 def streaming_mean(values,
                    weights=None,
                    metrics_collections=None,
@@ -287,6 +296,7 @@ def streaming_mean(values,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.mean_tensor')
 def streaming_mean_tensor(values,
                           weights=None,
                           metrics_collections=None,
@@ -340,9 +350,8 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(None,
-            'Please switch to tf.metrics.accuracy. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order '
+            'of the labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -400,6 +409,8 @@ def streaming_accuracy(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.precision. Note that the order '
+            'of the labels and predictions arguments has been switched.')
 def streaming_precision(predictions,
                         labels,
                         weights=None,
@@ -456,6 +467,8 @@ def streaming_precision(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
+            'of the labels and predictions arguments has been switched.')
 def streaming_recall(predictions,
                      labels,
                      weights=None,
@@ -975,8 +988,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
+            'the labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1263,7 +1276,7 @@ def _compute_placement_auc(labels, predictions, weights, alpha,
   weights_for_true = ordered_weights * float_labels_for_true
   weights_for_false = ordered_weights * float_labels_for_false
 
- # For each set of weights with the same segmented indices, we add up the
+  # For each set of weights with the same segmented indices, we add up the
   # weight values. Note that for each label, we deliberately rely on weights
   # for the opposite label.
   weight_totals_for_true = math_ops.segment_sum(weights_for_false,
@@ -1797,9 +1810,9 @@ def streaming_sensitivity_at_specificity(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.precision_at_thresholds. Note that '
+            'the order of the labels and predictions arguments are switched.')
 def streaming_precision_at_thresholds(predictions,
                                       labels,
                                       thresholds,
@@ -2575,6 +2588,121 @@ def recall_at_precision(labels,
     return recall, update_op
 
 
+def precision_at_recall(labels,
+                        predictions,
+                        target_recall,
+                        weights=None,
+                        num_thresholds=200,
+                        metrics_collections=None,
+                        updates_collections=None,
+                        name=None):
+  """Computes the precision at a given recall.
+
+  This function creates variables to track the true positives, false positives,
+  true negatives, and false negatives at a set of thresholds. Among those
+  thresholds where recall is at least `target_recall`, precision is computed
+  at the threshold where recall is closest to `target_recall`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  precision at `target_recall`. `update_op` increments the counts of true
+  positives, false positives, true negatives, and false negatives with the
+  weight of each case found in the `predictions` and `labels`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  For additional information about precision and recall, see
+  http://en.wikipedia.org/wiki/Precision_and_recall
+
+  Args:
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: A floating point `Tensor` of arbitrary shape and whose values
+      are in the range `[0, 1]`.
+    target_recall: A scalar value in range `[0, 1]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
+    num_thresholds: The number of thresholds to use for matching the given
+      recall.
+    metrics_collections: An optional list of collections to which `precision`
+      should be added.
+    updates_collections: An optional list of collections to which `update_op`
+      should be added.
+    name: An optional variable_scope name.
+
+  Returns:
+    precision: A scalar `Tensor` representing the precision at the given
+      `target_recall` value.
+    update_op: An operation that increments the variables for tracking the
+      true positives, false positives, true negatives, and false negatives and
+      whose value matches `precision`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      `target_recall` is not between 0 and 1, or if either `metrics_collections`
+      or `updates_collections` are not a list or tuple.
+    RuntimeError: If eager execution is enabled.
+  """
+  if context.executing_eagerly():
+    raise RuntimeError('tf.metrics.precision_at_recall is not '
+                       'supported when eager execution is enabled.')
+
+  if target_recall < 0 or target_recall > 1:
+    raise ValueError('`target_recall` must be in the range [0, 1].')
+
+  with variable_scope.variable_scope(name, 'precision_at_recall',
+                                     (predictions, labels, weights)):
+    kepsilon = 1e-7  # Used to avoid division by zero.
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
+    thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+    values, update_ops = _streaming_confusion_matrix_at_thresholds(
+        predictions, labels, thresholds, weights)
+
+    def compute_precision_at_recall(tp, fp, fn, name):
+      """Computes the precision at a given recall.
+
+      Args:
+        tp: True positives.
+        fp: False positives.
+        fn: False negatives.
+        name: A name for the operation.
+
+      Returns:
+        The precision at the desired recall.
+      """
+      recalls = math_ops.div(tp, tp + fn + kepsilon)
+
+      # Because recall is monotone decreasing as a function of the threshold,
+      # the smallest recall exceeding target_recall occurs at the largest
+      # threshold where recall >= target_recall.
+      admissible_recalls = math_ops.cast(
+          math_ops.greater_equal(recalls, target_recall), dtypes.int64)
+      tf_index = math_ops.reduce_sum(admissible_recalls) - 1
+
+      # Now we have the threshold at which to compute precision:
+      return math_ops.div(tp[tf_index] + kepsilon,
+                          tp[tf_index] + fp[tf_index] + kepsilon,
+                          name)
+
+    precision_value = compute_precision_at_recall(
+        values['tp'], values['fp'], values['fn'], 'value')
+    update_op = compute_precision_at_recall(
+        update_ops['tp'], update_ops['fp'], update_ops['fn'], 'update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, precision_value)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return precision_value, update_op
+
+
 def streaming_sparse_average_precision_at_k(predictions,
                                             labels,
                                             k,
@@ -2706,7 +2834,9 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       name=name)
 
 
-@deprecated(None, 'Please switch to tf.metrics.mean.')
+@deprecated(None,
+            'Please switch to tf.metrics.mean_absolute_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2825,7 +2955,9 @@ def streaming_mean_relative_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None,
+            'Please switch to tf.metrics.mean_squared_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_squared_error(predictions,
                                  labels,
                                  weights=None,
@@ -2883,7 +3015,10 @@ def streaming_mean_squared_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(
+    None,
+    'Please switch to tf.metrics.root_mean_squared_error. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_root_mean_squared_error(predictions,
                                       labels,
                                       weights=None,
@@ -3223,7 +3358,7 @@ def streaming_mean_cosine_distance(predictions,
   radial_diffs = math_ops.reduce_sum(
       radial_diffs, reduction_indices=[
           dim,
-      ], keep_dims=True)
+      ], keepdims=True)
   mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None,
                                             name or 'mean_cosine_distance')
   mean_distance = math_ops.subtract(1.0, mean_distance)
@@ -3646,8 +3781,8 @@ def cohen_kappa(labels,
       `updates_collections` are not a list or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
-    raise RuntimeError('tf.contrib.metrics.cohen_kappa is not supported'
+  if context.executing_eagerly():
+    raise RuntimeError('tf.contrib.metrics.cohen_kappa is not supported '
                        'when eager execution is enabled.')
   if num_classes < 2:
     raise ValueError('`num_classes` must be >= 2.'
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index b387f26c0195432fb972dac450d2919bdaa702a1..76420db8bda39435bcc2be2fd3d8c3467d6753e2 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -1802,9 +1802,9 @@ class StreamingAUCTest(test.TestCase):
       auc, update_op = metrics.streaming_auc(predictions, labels, curve='PR')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.54166603, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
 
-      self.assertAlmostEqual(0.54166603, auc.eval(), delta=1e-3)
+      self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
   def testAnotherAUCPRSpecialCase(self):
     with self.test_session() as sess:
@@ -1816,9 +1816,9 @@ class StreamingAUCTest(test.TestCase):
       auc, update_op = metrics.streaming_auc(predictions, labels, curve='PR')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.44365042, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
 
-      self.assertAlmostEqual(0.44365042, auc.eval(), delta=1e-3)
+      self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
   def testThirdAUCPRSpecialCase(self):
     with self.test_session() as sess:
@@ -1830,9 +1830,9 @@ class StreamingAUCTest(test.TestCase):
       auc, update_op = metrics.streaming_auc(predictions, labels, curve='PR')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.73611039, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
 
-      self.assertAlmostEqual(0.73611039, auc.eval(), delta=1e-3)
+      self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
 
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
@@ -1865,9 +1865,9 @@ class StreamingAUCTest(test.TestCase):
       auc, update_op = metrics.streaming_auc(predictions, labels, curve='PR')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.49999976, sess.run(update_op), 6)
+      self.assertAlmostEqual(1, sess.run(update_op), 6)
 
-      self.assertAlmostEqual(0.49999976, auc.eval(), 6)
+      self.assertAlmostEqual(1, auc.eval(), 6)
 
   def testWithMultipleUpdates(self):
     num_samples = 1000
@@ -3380,6 +3380,138 @@ class RecallAtPrecisionTest(test.TestCase):
       self.assertAlmostEqual(target_recall, recall.eval())
 
 
+class PrecisionAtRecallTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7)
+    _assert_metric_variables(self,
+                             ('precision_at_recall/true_positives:0',
+                              'precision_at_recall/false_negatives:0',
+                              'precision_at_recall/false_positives:0',
+                              'precision_at_recall/true_negatives:0'))
+
+  def testMetricsCollection(self):
+    my_collection_name = '__metrics__'
+    mean, _ = metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7,
+        metrics_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [mean])
+
+  def testUpdatesCollection(self):
+    my_collection_name = '__updates__'
+    _, update_op = metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7,
+        updates_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
+
+  def testValueTensorIsIdempotent(self):
+    predictions = random_ops.random_uniform(
+        (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
+    labels = random_ops.random_uniform(
+        (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=1)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+
+      # Run several updates.
+      for _ in range(10):
+        sess.run(update_op)
+
+      # Then verify idempotency.
+      initial_precision = precision.eval()
+      for _ in range(10):
+        self.assertAlmostEqual(initial_precision, precision.eval(), places=5)
+
+  def testAllCorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(inputs)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(1, sess.run(update_op))
+      self.assertEqual(1, precision.eval())
+
+  def testAllIncorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = 1.0 - predictions
+    label_prior = math_ops.reduce_mean(labels)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.2)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(sess.run(label_prior), sess.run(update_op))
+      self.assertEqual(sess.run(label_prior), precision.eval())
+
+  def testSomeCorrectHighRecall(self):
+    predictions_values = [0.1, 0.2, 0.5, 0.3, 0.0, 0.1, 0.45, 0.5, 0.8, 0.9]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.8)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.assertAlmostEqual(0.8, precision.eval())
+
+  def testSomeCorrectLowRecall(self):
+    predictions_values = [0.1, 0.2, 0.7, 0.3, 0.0, 0.1, 0.45, 0.5, 0.6, 0.9]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.4)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(2.0/3, sess.run(update_op))
+      self.assertAlmostEqual(2.0/3, precision.eval())
+
+  def testWeighted_multipleLabelDtypes(self):
+    for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions_values = [
+          0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.22, 0.25, 0.31, 0.35]
+      labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+      weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+      predictions = constant_op.constant(
+          predictions_values, dtype=dtypes_lib.float32)
+      labels = math_ops.cast(labels_values, dtype=label_dtype)
+      weights = constant_op.constant(weights_values)
+      precision, update_op = metrics.precision_at_recall(
+          labels, predictions, target_recall=0.8, weights=weights)
+
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(34.0/43, sess.run(update_op))
+        self.assertAlmostEqual(34.0/43, precision.eval())
+
+
 class StreamingFNRThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -6888,8 +7020,7 @@ class CohenKappaTest(test.TestCase):
     # [[0, 25, 0],
     #  [0, 0, 25],
     #  [25, 0, 0]]
-    # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(
-    #                          labels, predictions)
+    # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(labels, predictions)
     expect = -0.333333333333
 
     with self.test_session() as sess:
@@ -6948,8 +7079,7 @@ class CohenKappaTest(test.TestCase):
                 weights_t: weights[batch_start:batch_end]
             })
       # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(
-      #                          labels_np, predictions_np,
-      #                          sample_weight=weights_np)
+      #                          labels_np, predictions_np, sample_weight=weights_np)
       expect = 0.289965397924
       self.assertAlmostEqual(expect, kappa.eval(), 5)
 
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index ca3f13479ed32e9ab3d43dfe9a392ef8466ce5f2..54bd39afacbec07f054f61b72eda0a3654858aa7 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -71,6 +71,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "pruning_utils",
+    srcs = ["python/pruning_utils.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "pruning",
     srcs = ["python/pruning.py"],
@@ -78,9 +89,20 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":core_layers",
+        ":pruning_utils",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/python:platform",
-        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "pruning_utils_test",
+    size = "small",
+    srcs = ["python/pruning_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pruning_utils",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
@@ -125,15 +147,3 @@ py_library(
         ":rnn_cells",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index d286750c257e9a78a82c95c1fc872b3ca6972203..86f4fd6adf60d8fa54c13989bf4087e28f1e006f 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -45,7 +45,7 @@ The pruning library allows for specification of the following hyper parameters:
 | do_not_prune | list of strings | [""] | list of layers names that are not pruned |
 | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
-| nbins | integer | 255 | Number of bins to use for histogram computation |
+| nbins | integer | 256 | Number of bins to use for histogram computation |
 | block_height|integer | 1 | Number of rows in a block for block sparse matrices|
 | block_width |integer | 1 | Number of cols in a block for block sparse matrices|
 | block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)|
@@ -134,7 +134,7 @@ $ bazel-bin/$examples_dir/cifar10/cifar10_eval --run_once
 
 ### Block Sparsity
 
-For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is supported for weight tensors with rank 2 only. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter).
+For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is only supported for weight tensors which can be squeezed to rank 2. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter).
 The convolution layer tensors are always pruned used block dimensions of [1,1].
 
 ## References
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
index e7848adcc5ac126a2b85ef6dcb0ffa355b8b0628..30ea9122229c72950bee280f7a6c5eda4ac2fdbf 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
@@ -68,15 +68,3 @@ py_binary(
         "//tensorflow/contrib/model_pruning:pruning",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py
index 988748ad75bdf72f1da3f4e1c6e85aabb04a5954..466daf204a1ae86a7f37107342046305ea7249fc 100644
--- a/tensorflow/contrib/model_pruning/python/layers/layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/layers.py
@@ -214,7 +214,7 @@ def masked_convolution(inputs,
     elif data_format == 'NCHW':
       df = 'channels_first'
     else:
-      raise ValueError('Unsupported data fromat', data_format)
+      raise ValueError('Unsupported data format', data_format)
 
     layer = layer_class(
         filters=num_outputs,
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index d16af9da19816211ee22f6ea48a347f0b9a4e612..ea6032e588cf398deaf497fb99087436ce1cb2e8 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -33,12 +33,14 @@
   # Returns a list of all the weight tensors that have been masked
   get_weights()
 
-  The Pruning class uses a proto (defined in pruning.proto) to set up the
-  parameters for a pruning specification. Here's a typical usage:
+  The Pruning class uses a tf.hparams object to set up the
+  parameters for a model pruning. Here's a typical usage:
 
-  # Initialize a pruning spec from a proto
-  pruning_spec = '/tmp/pruning.pb'
-  p = Pruning(pruning_spec)
+  # Parse pruning hyperparameters
+  pruning_hparams = pruning.get_pruning_hparams().parse(FLAGS.pruning_hparams)
+
+  # Create a pruning object using the pruning_hparams
+  p = pruning.Pruning(pruning_hparams)
 
   # Add mask update ops to the graph
   mask_update_op = p.conditional_mask_update_op()
@@ -51,24 +53,20 @@
 
   # An object of the pruning also accepts externally defined sparsity:
   sparsity = tf.Variable(0.5, name = "ConstantSparsity")
-  pruning_spec = '/tmp/pruning.pb'
-  p = Pruning(pruning_spec, sparsity=sparsity)
-
+  p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
 """
 # pylint: disable=missing-docstring
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
+from tensorflow.contrib.model_pruning.python import pruning_utils
 from tensorflow.contrib.model_pruning.python.layers import core_layers as core
 from tensorflow.contrib.training.python.training import hparam
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
@@ -87,172 +85,18 @@ _WEIGHT_COLLECTION = core.WEIGHT_COLLECTION
 _MASKED_WEIGHT_NAME = core.MASKED_WEIGHT_NAME
 
 
-def _weight_mask_variable(var, scope):
-  """Create a mask for the weights.
-
-  This function adds a variable 'mask' to the graph.
-
-  Args:
-    var: the weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    the mask variable of the same size and shape as var, initialized to all 1s.
-  """
-  with variable_scope.variable_scope(scope):
-    mask = variable_scope.get_variable(
-        'mask',
-        var.get_shape(),
-        initializer=init_ops.ones_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-  return mask
-
-
-def _weight_threshold_variable(var, scope):
-  """Create a scalar threshold for the weights.
-
-  This function adds a variable
-  'threshold' to the graph.
-
-  Args:
-    var: The weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    a scalar threshold variable initialized to 0.
-  """
-  with variable_scope.variable_scope(scope):
-    threshold = variable_scope.get_variable(
-        'threshold', [],
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-    return threshold
-
-
-def _kronecker_product(mat1, mat2):
-  """Computes the Kronecker product of two matrices mat1 and mat2.
-
-  Args:
-    mat1: A matrix of size m x n
-    mat2: A matrix of size p x q
-  Returns:
-    Kronecker product of matrices mat1 and mat2 of size mp x nq
-  """
-
-  m1, n1 = mat1.get_shape().as_list()
-  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
-  m2, n2 = mat2.get_shape().as_list()
-  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
-  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
-
-
-def _histogram(values, value_range, nbins=100, dtype=np.int32, name=None):
-  """Return histogram of values.
-
-  Given the tensor `values`, this operation returns a rank 1 histogram counting
-  the number of entries in `values` that fell into every bin.  The bins are
-  equal width and determined by the arguments `value_range` and `nbins`.
-
-  Args:
-    values:  Numeric `Tensor`.
-    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
-      values <= value_range[0] will be mapped to hist[0],
-      values >= value_range[1] will be mapped to hist[-1].
-    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
-    dtype:  dtype for returned histogram.
-    name:  A name for this operation (defaults to 'histogram').
-
-  Returns:
-    A 1-D `Tensor` holding histogram of values.
-
-  """
-  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
-    values = ops.convert_to_tensor(values, name='values')
-    values = gen_array_ops.reshape(values, [-1])
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
-    nbins = ops.convert_to_tensor(nbins, dtype=np.int32, name='nbins')
-    nbins_float = math_ops.cast(nbins, values.dtype)
-
-    # Map tensor values that fall within value_range to [0, 1].
-    scaled_values = math_ops.truediv(
-        values - value_range[0],
-        value_range[1] - value_range[0],
-        name='scaled_values')
-
-    # map tensor values within the open interval value_range to {0,.., nbins-1},
-    # values outside the open interval will be zero or less, or nbins or more.
-    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
-
-    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
-    indices = math_ops.cast(
-        clip_ops.clip_by_value(indices, 0, nbins_float - 1), np.int32)
-
-    return math_ops.unsorted_segment_sum(
-        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
-
-
-def _determine_partitioned_axis(partitioned_variable):
-  partitioned_axis = 0
-  concatenated_variable_shape = partitioned_variable.get_shape()
-  for partition in partitioned_variable:
-    partition_shape = partition.get_shape()
-    maybe_partitioned_axis = np.less(partition_shape,
-                                     concatenated_variable_shape)
-    # Sanity check: make sure number of partitioned axis == 1
-    if np.count_nonzero(maybe_partitioned_axis) != 1:
-      raise ValueError('Number of partitioned axes %s not equal to 1' %
-                       np.count_nonzero(maybe_partitioned_axis))
-    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
-  return partitioned_axis
-
-
-def _variable_assign(var, new_value):
-  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
-
-
-def _partitioned_variable_assign(partitioned_var, new_value):
-  """Assign op for partitioned variables.
-
-  Args:
-    partitioned_var: A partitioned tensotflow variable
-    new_value: Value to be assigned to the variable var
-
-  Returns:
-    A tensorflow op that groups the assign ops for each of the variable slices
-  """
-  # Determine which axis was used to partition the variable. Currently
-  # tensorflow allows partitioning variable only along 1 axis.
-  axis = 0 if len(partitioned_var) == 1 else _determine_partitioned_axis(
-      partitioned_var)
-
-  partition_sizes = np.array(
-      [partition.get_shape()[axis] for partition in partitioned_var])
-  new_partitioned_values = array_ops.split(
-      new_value,
-      ops.convert_to_tensor(partition_sizes, dtype=np.int32),
-      axis=axis)
-  op_list = []
-  for partition in partitioned_var:
-    op_list.append(
-        _variable_assign(partition, new_partitioned_values[len(op_list)]))
-  return control_flow_ops.group(
-      *op_list, name=partitioned_var.name + '_group_assign')
-
-
 def apply_mask(x, scope=''):
   """Apply mask to a given weight tensor.
 
   Args:
     x: Input weight tensor
-    scope: The current variable scope. Defaults to ""
+    scope: The current variable scope. Defaults to "".
   Returns:
     Tensor representing masked_weights
   """
 
-  mask = _weight_mask_variable(x, scope)
-  threshold = _weight_threshold_variable(x, scope)
+  mask = pruning_utils.weight_mask_variable(x, scope)
+  threshold = pruning_utils.weight_threshold_variable(x, scope)
   # Add masked_weights in the weights namescope so as to make it easier
   # for the quantization library to add quant ops.
   masked_weights = math_ops.multiply(mask, x, _MASKED_WEIGHT_NAME)
@@ -335,6 +179,8 @@ def get_pruning_hparams():
     sparsity_function_exponent: float
       exponent = 1 is linearly varying sparsity between initial and final.
       exponent > 1 varies more slowly towards the end than the beginning
+    use_tpu: False
+      Indicates whether to use TPU
 
     We use the following sparsity function:
 
@@ -357,7 +203,7 @@ def get_pruning_hparams():
       do_not_prune=[''],
       threshold_decay=0.9,
       pruning_frequency=10,
-      nbins=255,
+      nbins=256,
       block_height=1,
       block_width=1,
       block_pooling_function='AVG',
@@ -365,7 +211,8 @@ def get_pruning_hparams():
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
       sparsity_function_end_step=100,
-      sparsity_function_exponent=3)
+      sparsity_function_exponent=3,
+      use_tpu=False)
 
 
 class Pruning(object):
@@ -414,7 +261,7 @@ class Pruning(object):
     if graph_global_step is None:
       graph_global_step = training_util.get_global_step()
 
-    return math_ops.cast(graph_global_step, np.int32)
+    return math_ops.cast(graph_global_step, dtypes.int32)
 
   def _setup_sparsity(self):
     begin_step = self._spec.sparsity_function_begin_step
@@ -429,13 +276,13 @@ class Pruning(object):
           (begin_step, end_step))
 
     with ops.name_scope(self._spec.name):
-      p = math_ops.minimum(1.0,
-                           math_ops.maximum(
-                               0.0,
-                               math_ops.div(
-                                   math_ops.cast(self._global_step - begin_step,
-                                                 np.float32),
-                                   end_step - begin_step)))
+      p = math_ops.minimum(
+          1.0,
+          math_ops.maximum(
+              0.0,
+              math_ops.div(
+                  math_ops.cast(self._global_step - begin_step, dtypes.float32),
+                  end_step - begin_step)))
       sparsity = math_ops.add(
           math_ops.multiply(initial_sparsity - target_sparsity,
                             math_ops.pow(1 - p, exponent)),
@@ -445,17 +292,18 @@ class Pruning(object):
     return sparsity
 
   def _setup_last_update_step(self):
-    with variable_scope.variable_scope(self._spec.name) as scope:
+    with variable_scope.variable_scope(
+        self._spec.name, use_resource=self._spec.use_tpu) as scope:
       try:
         last_update_step = variable_scope.get_variable(
             'last_mask_update_step', [],
             initializer=init_ops.zeros_initializer(),
             trainable=False,
-            dtype=np.int32)
+            dtype=dtypes.int32)
       except ValueError:
         scope.reuse_variables()
         last_update_step = variable_scope.get_variable(
-            'last_mask_update_step', dtype=np.int32)
+            'last_mask_update_step', dtype=dtypes.int32)
     return last_update_step
 
   def _exists_in_do_not_prune_list(self, tensor_name):
@@ -497,18 +345,16 @@ class Pruning(object):
     with ops.name_scope(weights.op.name + '_pruning_ops'):
       abs_weights = math_ops.abs(weights)
       max_value = math_ops.reduce_max(abs_weights)
-      histogram = _histogram(
-          abs_weights, [0.0, max_value],
-          nbins=self._spec.nbins,
-          dtype=np.float32)
+      cdf_fn = pruning_utils.compute_cdf_from_histogram
+      if self._spec.use_tpu:
+        cdf_fn = pruning_utils.compute_cdf
 
-      cdf = math_ops.cumsum(histogram)
-      norm_cdf = math_ops.div(cdf, math_ops.reduce_sum(histogram))
+      norm_cdf = cdf_fn(abs_weights, [0.0, max_value], nbins=self._spec.nbins)
       current_threshold = math_ops.multiply(
           math_ops.div(
               math_ops.reduce_sum(
                   math_ops.cast(
-                      math_ops.less(norm_cdf, self._sparsity), np.float32)),
+                      math_ops.less(norm_cdf, self._sparsity), dtypes.float32)),
               float(self._spec.nbins)), max_value)
 
       smoothed_threshold = math_ops.add_n([
@@ -516,14 +362,15 @@ class Pruning(object):
           math_ops.multiply(threshold, self._spec.threshold_decay)
       ])
       new_mask = math_ops.cast(
-          math_ops.greater(abs_weights, smoothed_threshold), np.float32)
+          math_ops.greater(abs_weights, smoothed_threshold), dtypes.float32)
     return smoothed_threshold, new_mask
 
   def _maybe_update_block_mask(self, weights, threshold):
     """Performs block-granular masking of the weights.
 
     Block pruning occurs only if the block_height or block_width is > 1 and
-    if the weight tensor has ndims = 2. Otherwise, elementwise pruning occurs.
+    if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
+    pruning occurs.
     Args:
       weights: The weight tensor that needs to be masked.
       threshold: The current threshold value. The function will compute a new
@@ -540,7 +387,8 @@ class Pruning(object):
     Raises:
       ValueError: if block pooling function is not AVG or MAX
     """
-    if weights.get_shape().ndims != 2 or self._block_dim == [1, 1]:
+    squeezed_weights = array_ops.squeeze(weights)
+    if squeezed_weights.get_shape().ndims != 2 or self._block_dim == [1, 1]:
       return self._update_mask(weights, threshold)
 
     if self._block_pooling_function not in ['AVG', 'MAX']:
@@ -549,9 +397,11 @@ class Pruning(object):
 
     with ops.name_scope(weights.op.name + '_pruning_ops'):
       abs_weights = math_ops.abs(
-          array_ops.reshape(
-              weights, [1, weights.get_shape()[0],
-                        weights.get_shape()[1], 1]))
+          array_ops.reshape(weights, [
+              1,
+              squeezed_weights.get_shape()[0],
+              squeezed_weights.get_shape()[1], 1
+          ]))
       pool_window = [self._block_dim[0], self._block_dim[1]]
       pooled_weights = nn_ops.pool(
           abs_weights,
@@ -568,13 +418,14 @@ class Pruning(object):
           new_mask,
           [pooled_weights.get_shape()[1],
            pooled_weights.get_shape()[2]])
-      updated_mask = _kronecker_product(reshaped_mask,
-                                        array_ops.ones(self._block_dim))
+      updated_mask = pruning_utils.kronecker_product(
+          reshaped_mask, array_ops.ones(self._block_dim))
       sliced_mask = array_ops.slice(
           updated_mask, [0, 0],
-          [weights.get_shape()[0],
-           weights.get_shape()[1]])
-    return smoothed_threshold, sliced_mask
+          [squeezed_weights.get_shape()[0],
+           squeezed_weights.get_shape()[1]])
+    return smoothed_threshold, array_ops.reshape(sliced_mask,
+                                                 array_ops.shape(weights))
 
   def _get_mask_assign_ops(self):
     # Make sure the assignment ops have not already been added to the list
@@ -603,11 +454,12 @@ class Pruning(object):
           continue
 
       new_threshold, new_mask = self._maybe_update_block_mask(weight, threshold)
-      self._assign_ops.append(_variable_assign(threshold, new_threshold))
+      self._assign_ops.append(
+          pruning_utils.variable_assign(threshold, new_threshold))
 
       self._assign_ops.append(
-          _partitioned_variable_assign(mask, new_mask)
-          if is_partitioned else _variable_assign(mask, new_mask))
+          pruning_utils.partitioned_variable_assign(mask, new_mask)
+          if is_partitioned else pruning_utils.variable_assign(mask, new_mask))
 
   def mask_update_op(self):
     with ops.name_scope(self._spec.name):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index 1767b4bb94a9bb56bc6a4933423ad27d8cf3ed35..f80b7c52c000f13b5ce98dd442ff21abfac37761 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -110,12 +110,12 @@ class PruningTest(test.TestCase):
       self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
 
   def _blockMasking(self, hparams, weights, expected_mask):
 
     threshold = variables.Variable(0.0, name="threshold")
-    sparsity = variables.Variable(0.51, name="sparsity")
+    sparsity = variables.Variable(0.5, name="sparsity")
     test_spec = ",".join(hparams)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
 
@@ -138,7 +138,26 @@ class PruningTest(test.TestCase):
     weights_max = constant_op.constant(
         [[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
          [0.0, -0.3, 0.0, -0.4]])
-    expected_mask = [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]
+    expected_mask = [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
+                     [1., 1., 1., 1.], [1., 1., 1., 1.]]
+
+    self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
+                       expected_mask)
+    self._blockMasking(param_list + ["block_pooling_function=AVG"], weights_avg,
+                       expected_mask)
+
+  def testBlockMaskingWithHigherDimensions(self):
+    param_list = ["block_height=2", "block_width=2", "threshold_decay=0"]
+
+    # Weights as in testBlockMasking, but with one extra dimension.
+    weights_avg = constant_op.constant(
+        [[[0.1, 0.1, 0.2, 0.2], [0.1, 0.1, 0.2, 0.2], [0.3, 0.3, 0.4, 0.4],
+          [0.3, 0.3, 0.4, 0.4]]])
+    weights_max = constant_op.constant(
+        [[[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
+          [0.0, -0.3, 0.0, -0.4]]])
+    expected_mask = [[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
+                      [1., 1., 1., 1.], [1., 1., 1., 1.]]]
 
     self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
                        expected_mask)
@@ -161,11 +180,12 @@ class PruningTest(test.TestCase):
       masked_weights_val = masked_weights.eval()
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
 
   def testConditionalMaskUpdate(self):
     param_list = [
-        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
+        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6",
+        "nbins=100"
     ]
     test_spec = ",".join(param_list)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..56d3dcef20d1b1c34d6b04535e2b4dc7be7f7320
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -0,0 +1,269 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions for adding pruning related ops to the graph.
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
+_NBINS = 256
+
+
+def weight_mask_variable(var, scope):
+  """Create a mask for the weights.
+
+  This function adds a variable 'mask' to the graph.
+
+  Args:
+    var: the weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    the mask variable of the same size and shape as var, initialized to all 1s.
+  """
+  with variable_scope.variable_scope(scope):
+    mask = variable_scope.get_variable(
+        'mask',
+        var.get_shape(),
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+  return mask
+
+
+def weight_threshold_variable(var, scope):
+  """Create a scalar threshold for the weights.
+
+  This function adds a variable
+  'threshold' to the graph.
+
+  Args:
+    var: The weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    a scalar threshold variable initialized to 0.
+  """
+  with variable_scope.variable_scope(scope):
+    threshold = variable_scope.get_variable(
+        'threshold', [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+    return threshold
+
+
+def kronecker_product(mat1, mat2):
+  """Computes the Kronecker product of two matrices mat1 and mat2.
+
+  Args:
+    mat1: A matrix of size m x n
+    mat2: A matrix of size p x q
+  Returns:
+    Kronecker product of matrices mat1 and mat2 of size mp x nq
+  """
+
+  m1, n1 = mat1.get_shape().as_list()
+  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
+  m2, n2 = mat2.get_shape().as_list()
+  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
+  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
+
+
+def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
+  """Return histogram of values.
+
+  Given the tensor `values`, this operation returns a rank 1 histogram counting
+  the number of entries in `values` that fell into every bin.  The bins are
+  equal width and determined by the arguments `value_range` and `nbins`.
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+      values <= value_range[0] will be mapped to hist[0],
+      values >= value_range[1] will be mapped to hist[-1].
+    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
+    dtype:  dtype for returned histogram.
+    name:  A name for this operation (defaults to 'histogram').
+
+  Returns:
+    A 1-D `Tensor` holding histogram of values.
+
+  """
+  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
+    values = ops.convert_to_tensor(values, name='values')
+    values = array_ops.reshape(values, [-1])
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins_float = np.float32(nbins)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+
+    return math_ops.unsorted_segment_sum(
+        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
+
+
+def compute_cdf_from_histogram(values, value_range, **kwargs):
+  """Returns the normalized cumulative distribution of the given values tensor.
+
+  Computes the histogram and uses tf.cumsum to arrive at cdf
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+    **kwargs: keyword arguments: nbins, name
+
+  Returns:
+    A 1-D `Tensor` holding normalized cdf of values.
+
+  """
+  nbins = kwargs.get('nbins', _NBINS)
+  name = kwargs.get('name', None)
+  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
+    histogram = _histogram(
+        values, value_range, dtype=dtypes.float32, nbins=nbins)
+    cdf = math_ops.cumsum(histogram)
+    return math_ops.div(cdf, math_ops.reduce_max(cdf))
+
+
+def compute_cdf(values, value_range, **kwargs):
+  """Returns the normalized cumulative distribution of the given values tensor.
+
+  Uses tf.while_loop to directly compute the cdf of the values. Number of bins
+  for histogram is fixed at _NBINS=255
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`
+    **kwargs: keyword arguments: name
+
+  Returns:
+    A 1-D `Tensor` holding normalized cdf of values.
+
+  """
+  nbins = _NBINS
+  name = kwargs.get('name', None)
+  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
+    values = ops.convert_to_tensor(values, name='values')
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins_float = np.float32(nbins)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+
+    cdf = array_ops.zeros(nbins)
+    i = constant_op.constant(0)
+
+    def loop_cond(loop_count, _):
+      return math_ops.less(loop_count, nbins)
+
+    def loop_body(loop_count, cdf):
+      temp = math_ops.reduce_sum(
+          math_ops.cast(
+              math_ops.less_equal(indices, loop_count), dtypes.float32))
+      cdf = math_ops.add(
+          cdf,
+          array_ops.one_hot(
+              loop_count, depth=_NBINS, on_value=temp, off_value=0.0))
+      return [loop_count + 1, cdf]
+
+    _, cdf = control_flow_ops.while_loop(
+        loop_cond, loop_body, [i, cdf], maximum_iterations=nbins)
+
+    return math_ops.div(cdf, math_ops.reduce_max(cdf))
+
+
+def determine_partitioned_axis(partitioned_variable):
+  partitioned_axis = 0
+  concatenated_variable_shape = partitioned_variable.get_shape()
+  for partition in partitioned_variable:
+    partition_shape = partition.get_shape()
+    maybe_partitioned_axis = np.less(partition_shape,
+                                     concatenated_variable_shape)
+    # Sanity check: make sure number of partitioned axis == 1
+    if np.count_nonzero(maybe_partitioned_axis) != 1:
+      raise ValueError('Number of partitioned axes %s not equal to 1' %
+                       np.count_nonzero(maybe_partitioned_axis))
+    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
+  return partitioned_axis
+
+
+def variable_assign(var, new_value):
+  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
+
+
+def partitioned_variable_assign(partitioned_var, new_value):
+  """Assign op for partitioned variables.
+
+  Args:
+    partitioned_var: A partitioned tensorflow variable
+    new_value: Value to be assigned to the variable var
+
+  Returns:
+    A tensorflow op that groups the assign ops for each of the variable slices
+  """
+  # Determine which axis was used to partition the variable. Currently
+  # tensorflow allows partitioning variable only along 1 axis.
+  axis = 0 if len(partitioned_var) == 1 else determine_partitioned_axis(
+      partitioned_var)
+
+  partition_sizes = np.array(
+      [partition.get_shape()[axis] for partition in partitioned_var])
+  new_partitioned_values = array_ops.split(
+      new_value,
+      ops.convert_to_tensor(partition_sizes, dtype=dtypes.int32),
+      axis=axis)
+  op_list = []
+  for partition in partitioned_var:
+    op_list.append(
+        variable_assign(partition, new_partitioned_values[len(op_list)]))
+  return control_flow_ops.group(
+      *op_list, name=partitioned_var.name + '_group_assign')
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..10e1dd0a8eee88f357fbe60bf00f180c05f2c4d2
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -0,0 +1,86 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utility functions in pruning_utils.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.model_pruning.python import pruning_utils
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class PruningUtilsTest(test.TestCase):
+
+  def testHistogram(self):
+    width = 10
+    height = 10
+    nbins = 100
+    expected_histogram = np.full(nbins, 1.0)
+    init = init_ops.constant_initializer(np.linspace(0.0, 1.0, width * height))
+    weights = variable_scope.get_variable(
+        "weights", [width, height], initializer=init)
+    histogram = pruning_utils._histogram(
+        weights, [0, 1.0], nbins, dtype=np.float32)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      computed_histogram = histogram.eval()
+    self.assertAllEqual(expected_histogram, computed_histogram)
+
+  def testCDF(self):
+    nbins = 5
+    weights = constant_op.constant([-1, 0, 1, 1.5, 2, 3, 4, 5, 10, 100])
+    abs_weights = math_ops.abs(weights)
+    norm_cdf = pruning_utils.compute_cdf_from_histogram(
+        abs_weights, [0.0, 5.0], nbins=nbins)
+    expected_cdf = np.array([0.1, 0.4, 0.5, 0.6, 1.0], dtype=np.float32)
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      norm_cdf_val = sess.run(norm_cdf)
+      self.assertAllEqual(len(norm_cdf_val), nbins)
+      self.assertAllEqual(expected_cdf, norm_cdf_val)
+
+  def _compare_cdf(self, values):
+    abs_values = math_ops.abs(values)
+    max_value = math_ops.reduce_max(abs_values)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
+          abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
+      cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
+      return cdf.eval(), cdf_from_histogram.eval()
+
+  def testCDFEquivalence2D(self):
+    width = 100
+    height = 100
+    weights = variable_scope.get_variable("weights", shape=[width, height])
+    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
+    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+
+  def testCDFEquivalence4D(self):
+    weights = variable_scope.get_variable("weights", shape=[5, 5, 128, 128])
+    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
+    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h
index fa297c28cb47d43ba927ab941854bd472d90b465..4091925fc0d7ab49954bc2e0e91cfc6da2a685a9 100644
--- a/tensorflow/contrib/mpi/mpi_utils.h
+++ b/tensorflow/contrib/mpi/mpi_utils.h
@@ -22,8 +22,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
+// Skip MPI C++ bindings support, this matches the usage in other places
+#define OMPI_SKIP_MPICXX
 #include "third_party/mpi/mpi.h"
 #define MPI_CHECK(cmd)                                                \
   do {                                                                \
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index 9f9802b8fe12356c0da82ebb2b48b565cf3f7319..a7be92a35e0d62a61f7923ac61bb2c1267d039c6 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -126,15 +126,3 @@ tf_py_test(
     ],
     tags = ["manual"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
index 8dca90a1e34d6a234c2b1479ca5594e88afcc194..ed22ee667f1d73b3f86f77e09bad9bfec7e46391 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
@@ -73,7 +73,7 @@ limitations under the License.
  */
 
 template <class T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
+using StatusOr = se::port::StatusOr<T>;
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..475297ca92111c6ead01b41d402556094dab1ee0
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
@@ -0,0 +1,1236 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_MPI
+
+#include <queue>
+#include <thread>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#include <cuda_runtime.h>
+#include "tensorflow/stream_executor/stream.h"
+#endif
+
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+#define OMPI_SKIP_MPICXX
+#include "third_party/mpi/mpi.h"
+#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
+#include "tensorflow/contrib/mpi_collectives/ring.h"
+
+/*
+ * MPI Allreduce and Allgather Ops for TensorFlow.
+ *
+ * TensorFlow natively provides inter-device communication through send and
+ * receive ops and inter-node communication through Distributed TensorFlow,
+ * based on the same send and receive abstractions. These end up being
+ * insufficient for synchronous data-parallel training on HPC clusters where
+ * Infiniband or other high-speed interconnects are available.  This module
+ * implements MPI ops for allgather and allreduce, which do bandwidth-optimal
+ * gathers and reductions and can take advantage of hardware-optimized
+ * communication libraries through the MPI implementation.
+ *
+ * The primary logic of the allreduce and allgather are in RingAllgather() and
+ * RingAllreduce(). The background thread which facilitates MPI operations is
+ * run in BackgroundThreadLoop(). The provided MPI ops are:
+ *      – MPIInit:
+ *          Initialize MPI on a given device (CPU or GPU).
+ *          Should only be run on a single device in every process.
+ *      – MPISize:
+ *          Get the number of MPI processes in the global communicator.
+ *      – MPIRank:
+ *          Get the rank of the current MPI process in the global communicator.
+ *      – MPILocalRank:
+ *          Get the local rank of the current MPI process within its node.
+ *      – MPIAllreduce:
+ *          Perform an allreduce on a Tensor, returning the sum
+ *          across all MPI processes in the global communicator.
+ *      – MPIAllgather:
+ *          Perform an allgather on a Tensor, returning the concatenation of
+ *          the tensor on the first dimension across all MPI processes in the
+ *          global communicator.
+ *
+ */
+
+template <class T>
+using StatusOr = se::port::StatusOr<T>;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+namespace tensorflow {
+namespace contrib {
+namespace mpi {
+
+// Make sure template specializations are generated in the ring.cu.cc and the
+// ring.cc file, not in this file.
+extern template Status RingAllreduce<GPUDevice, int>(OpKernelContext*,
+                                                     const Tensor*, Tensor*,
+                                                     Tensor*);
+extern template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
+                                                           const Tensor*,
+                                                           Tensor*, Tensor*);
+extern template Status RingAllreduce<GPUDevice, float>(OpKernelContext*,
+                                                       const Tensor*, Tensor*,
+                                                       Tensor*);
+extern template Status RingAllgather<GPUDevice, int>(OpKernelContext*,
+                                                     const Tensor*,
+                                                     const std::vector<size_t>&,
+                                                     Tensor*);
+extern template Status RingAllgather<GPUDevice, long long>(
+    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+extern template Status RingAllgather<GPUDevice, float>(
+    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+extern template Status RingAllreduce<CPUDevice, int>(OpKernelContext*,
+                                                     const Tensor*, Tensor*,
+                                                     Tensor*);
+extern template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
+                                                           const Tensor*,
+                                                           Tensor*, Tensor*);
+extern template Status RingAllreduce<CPUDevice, float>(OpKernelContext*,
+                                                       const Tensor*, Tensor*,
+                                                       Tensor*);
+extern template Status RingAllgather<CPUDevice, int>(OpKernelContext*,
+                                                     const Tensor*,
+                                                     const std::vector<size_t>&,
+                                                     Tensor*);
+extern template Status RingAllgather<CPUDevice, long long>(
+    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+extern template Status RingAllgather<CPUDevice, float>(
+    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+
+namespace {
+
+// Return true if the templated type is GPUDevice, otherwise false.
+template <typename T>
+bool IsGPUDevice();
+template <>
+bool IsGPUDevice<GPUDevice>() {
+  return true;
+};
+template <>
+bool IsGPUDevice<CPUDevice>() {
+  return false;
+};
+
+// A callback to call after the MPI communication completes. Since the
+// allreduce and allgather ops are asynchronous, this callback is what resumes
+// computation after the reduction is completed.
+typedef std::function<void(StatusOr<Tensor>)> CommunicationDoneCallback;
+
+struct CollectiveOpRecord {
+  // The rank performing this piece of the op
+  int rank;
+
+  // The name of the op/tensor to be reduced
+  std::string name;
+
+  // The op's kernel context
+  OpKernelContext* context;
+
+  // Data type of the op
+  DataType dtype;
+
+  // The input tensor
+  const Tensor* in_t;
+
+  // Allgather: Vector of per-rank first-dimension sizes
+  std::vector<size_t> sizes_vec;
+
+  // The temp tensor for intermediate results
+  Tensor temp_t;
+
+  // The output tensor
+  Tensor* out_t;
+
+  // Whether to run this op on the gpu
+  bool on_gpu;
+
+  // The callback to call after the op has completed
+  CommunicationDoneCallback callback;
+};
+
+// Table storing Tensors to be reduced, keyed by unique name.
+// This table contains everything necessary to do the reduction
+typedef std::unordered_map<std::string, CollectiveOpRecord> TensorTable;
+
+// Table for storing Tensor metadata on rank zero. This is used for error
+// checking and size calculations, as well as determining when a reduction is
+// ready to be done (when all nodes are ready to do it).
+typedef std::unordered_map<std::string, std::vector<MPIRequest> > MessageTable;
+
+// The global state required for the MPI ops.
+//
+// MPI is a library that stores a lot of global per-program state and often
+// requires running on a single thread. As a result, we have to have a single
+// background thread responsible for all MPI operations, and communicate with
+// that background thread through global state.
+struct MPIGlobalState {
+  // An atomic boolean which is set to true when MPI is initialized.
+  // This ensures that MPI_Init is never called twice.
+  std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT;
+
+  // Condition variable to wait for initialization
+  condition_variable cv;
+
+  // Whether MPI_Init has been completed on the background thread.
+  bool initialization_done = false;
+
+  // Whether MPI_Init succeeded on the background thread.
+  Status init_status;
+
+  // A mutex that needs to be used whenever MPI operations touch
+  // shared structures.
+  mutex mu;
+
+  // Tensors waiting to be allreduced or allgathered.
+  TensorTable tensor_table;
+
+  // Queue of MPI requests waiting to be sent to the coordinator node.
+  std::queue<MPIRequest> message_queue;
+
+  // Background thread running MPI communication.
+  std::thread background_thread;
+
+  // Whether the background thread should shutdown.
+  bool shut_down = false;
+
+  // Only exists on the coordinator node (rank zero). Maintains a count of
+  // how many nodes are ready to allreduce every tensor (keyed by tensor
+  // name).
+  std::unique_ptr<MessageTable> message_table;
+
+  // The MPI rank, local rank, and size.
+  int rank = 0;
+  int local_rank = 0;
+  int size = 1;
+
+  // The device that MPI was initialized on. (-1 for no GPU)
+  int device = -1;
+
+  // The CUDA stream used for data transfers and within-allreduce operations.
+  // A naive implementation would use the TensorFlow StreamExecutor CUDA
+  // stream. However, the allreduce and allgather require doing memory copies
+  // and kernel executions (for accumulation of values on the GPU). However,
+  // the subsequent operations must wait for those operations to complete,
+  // otherwise MPI (which uses its own stream internally) will begin the data
+  // transfers before the CUDA calls are complete. In order to wait for those
+  // CUDA operations, if we were using the TensorFlow stream, we would have
+  // to synchronize that stream; however, other TensorFlow threads may be
+  // submitting more work to that stream, so synchronizing on it can cause
+  // the allreduce to be delayed, waiting for compute totally unrelated to it
+  // in other parts of the graph. Overlaying memory transfers and compute
+  // during backpropagation is crucial for good performance, so we cannot use
+  // the TensorFlow stream, and must use our own stream.
+#if GOOGLE_CUDA
+  cudaStream_t stream;
+  std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT;
+#endif
+
+  ~MPIGlobalState() {
+    // Make sure that the destructor of the background thread is safe to
+    // call. If a thread is still joinable (not detached or complete) its
+    // destructor cannot be called.
+    if (background_thread.joinable()) {
+      shut_down = true;
+      background_thread.join();
+    }
+  }
+};
+
+// All the MPI state that must be stored globally per-process.
+static MPIGlobalState mpi_global;
+
+// For clarify in argument lists.
+#define RANK_ZERO 0
+
+// A tag used for all coordinator messaging.
+#define TAG_NOTIFY 1
+
+// Store the MPIRequest for a name, and return whether the total count of
+// MPIRequests for that tensor is now equal to the MPI size (and thus we are
+// ready to reduce the tensor).
+bool IncrementTensorCount(std::unique_ptr<MessageTable>& message_table,
+                          MPIRequest msg, int mpi_size) {
+  auto name = msg.tensor_name();
+  auto table_iter = message_table->find(name);
+  if (table_iter == message_table->end()) {
+    message_table->emplace(name, std::vector<MPIRequest>({msg}));
+    table_iter = message_table->find(name);
+  } else {
+    table_iter->second.push_back(msg);
+  }
+
+  int count = table_iter->second.size();
+  return count == mpi_size;
+}
+
+// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse
+// instructing all ranks to start the reduction to all ranks. The MPIResponse
+// also contains error messages in case the submitted MPIRequests were not
+// valid (for example, contained mismatched shapes or types).
+//
+// Constructing the MPIResponse, thus, requires a whole lot of error checking.
+MPIResponse ConstructMPIResponse(std::unique_ptr<MessageTable>& message_table,
+                                 std::string name) {
+  bool error = false;
+  auto it = message_table->find(name);
+  assert(it != message_table->end());
+
+  std::vector<MPIRequest> requests = it->second;
+  assert(requests.size() > 0);
+
+  std::ostringstream error_message_stream;
+
+  // Check that all data types being reduced or gathered are identical
+  auto data_type = requests[0].tensor_type();
+  for (unsigned int i = 1; i < requests.size(); i++) {
+    auto request_type = requests[i].tensor_type();
+    if (data_type != request_type) {
+      error = true;
+      error_message_stream << "Mismatched data types: One rank had type "
+                           << DataType_Name(data_type)
+                           << ", but another rank had type "
+                           << DataType_Name(request_type) << ".";
+      break;
+    }
+  }
+
+  // Check that all requested operations are the same
+  auto message_type = requests[0].request_type();
+  for (unsigned int i = 1; i < requests.size(); i++) {
+    if (error) {
+      break;
+    }
+
+    auto request_type = requests[i].request_type();
+    if (message_type != request_type) {
+      error = true;
+      error_message_stream << "Mismatched MPI operations: One rank did an "
+                           << message_type << ", but another rank did an "
+                           << request_type << ".";
+      break;
+    }
+  }
+
+  // If we are doing an allreduce, check that all tensor shapes
+  // are identical
+  if (message_type == MPIRequest::ALLREDUCE) {
+    TensorShape tensor_shape = requests[0].tensor_shape();
+    for (unsigned int i = 1; i < requests.size(); i++) {
+      if (error) {
+        break;
+      }
+
+      TensorShape request_shape = requests[i].tensor_shape();
+      if (tensor_shape != request_shape) {
+        error = true;
+        error_message_stream << "Mismatched allreduce tensor shapes: "
+                             << "One rank reduced a tensor of shape "
+                             << tensor_shape.DebugString()
+                             << ", but another rank sent a tensor of shape "
+                             << request_shape.DebugString() << ".";
+        break;
+      }
+    }
+  }
+
+  // If we are doing an allgather, make sure all but the first dimension are
+  // the same. The first dimension may be different and the output tensor is
+  // the sum of the first dimension. Collect the sizes by rank.
+  if (message_type == MPIRequest::ALLGATHER) {
+    TensorShape tensor_shape = requests[0].tensor_shape();
+
+    if (tensor_shape.dims() == 0) {
+      error = true;
+      error_message_stream << "Rank zero tried to gather a rank-zero tensor.";
+    }
+
+    for (unsigned int i = 1; i < requests.size(); i++) {
+      if (error) {
+        break;
+      }
+
+      TensorShape request_shape = requests[i].tensor_shape();
+      if (tensor_shape.dims() != request_shape.dims()) {
+        error = true;
+        error_message_stream << "Mismatched allgather tensor shapes: "
+                             << "One rank gathered a tensor of rank "
+                             << tensor_shape.dims()
+                             << ", but another rank sent a tensor of rank "
+                             << request_shape.dims() << ".";
+        break;
+      }
+
+      for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) {
+        if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) {
+          error = true;
+          error_message_stream
+              << "Mismatched allgather tensor shapes: "
+              << "One rank gathered a tensor with dimension " << dim
+              << " equal to " << tensor_shape.dim_size(dim)
+              << ", but another rank sent a tensor with dimension " << dim
+              << " equal to " << request_shape.dim_size(dim) << ".";
+          break;
+        }
+      }
+    }
+  }
+
+  MPIResponse response;
+  response.set_tensor_name(name);
+  if (error) {
+    std::string error_message = error_message_stream.str();
+    response.set_response_type(MPIResponse::ERROR);
+    response.set_error_message(error_message);
+  } else {
+    auto response_type = MPIResponse::ERROR;
+    if (message_type == MPIRequest::ALLREDUCE) {
+      response_type = MPIResponse::ALLREDUCE;
+    } else {
+      response_type = MPIResponse::ALLGATHER;
+    }
+    response.set_response_type(response_type);
+  }
+
+  // Clear all queued up requests for this name. They are now taken care of
+  // by the constructed MPI response.
+  message_table->erase(it);
+
+  return response;
+}
+
+// Process an MPIResponse by doing a reduction, a gather, or raising an error.
+void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) {
+  OpKernelContext* context;
+  const Tensor* input_tensor;
+  std::vector<size_t> sizes_vec;
+  Tensor temp_tensor;
+  Tensor* output_tensor;
+  CommunicationDoneCallback callback;
+  bool on_gpu;
+  {
+    // Lock on the tensor table.
+    mutex_lock guard(mpi_global.mu);
+
+    // We should never fail at finding this key in the tensor table.
+    auto name = response.tensor_name();
+    auto iter = tensor_table.find(name);
+    assert(iter != tensor_table.end());
+
+    assert(response.response_type() == MPIResponse::ALLREDUCE ||
+           response.response_type() == MPIResponse::ALLGATHER ||
+           response.response_type() == MPIResponse::ERROR);
+
+    CollectiveOpRecord record = iter->second;
+    context = record.context;
+    input_tensor = record.in_t;
+    sizes_vec = record.sizes_vec;
+    temp_tensor = record.temp_t;
+    output_tensor = record.out_t;
+    on_gpu = record.on_gpu;
+    callback = record.callback;
+
+    // Clear the tensor table of this tensor and its callbacks; the rest of
+    // this function takes care of it.
+    tensor_table.erase(iter);
+  }
+
+  // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't
+  // link to non-existent symbols.
+#if GOOGLE_CUDA
+#define GPU_DEVICE_IF_CUDA GPUDevice
+#else
+#define GPU_DEVICE_IF_CUDA CPUDevice
+#endif
+
+  Status status;
+  auto dtype = input_tensor->dtype();
+  if (response.response_type() == MPIResponse::ALLGATHER) {
+    if (dtype == DT_FLOAT) {
+      status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, float>(
+                            context, input_tensor, sizes_vec, output_tensor)
+                      : RingAllgather<CPUDevice, float>(
+                            context, input_tensor, sizes_vec, output_tensor);
+    } else if (dtype == DT_INT32) {
+      status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, int>(
+                            context, input_tensor, sizes_vec, output_tensor)
+                      : RingAllgather<CPUDevice, int>(context, input_tensor,
+                                                      sizes_vec, output_tensor);
+    } else if (dtype == DT_INT64) {
+      status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, long long>(
+                            context, input_tensor, sizes_vec, output_tensor)
+                      : RingAllgather<CPUDevice, long long>(
+                            context, input_tensor, sizes_vec, output_tensor);
+    } else {
+      status = errors::Unknown("Invalid tensor type for MPI allgather.");
+    }
+  } else if (response.response_type() == MPIResponse::ALLREDUCE) {
+    if (dtype == DT_FLOAT) {
+      status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, float>(
+                            context, input_tensor, &temp_tensor, output_tensor)
+                      : RingAllreduce<CPUDevice, float>(
+                            context, input_tensor, &temp_tensor, output_tensor);
+    } else if (dtype == DT_INT32) {
+      status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, int>(
+                            context, input_tensor, &temp_tensor, output_tensor)
+                      : RingAllreduce<CPUDevice, int>(
+                            context, input_tensor, &temp_tensor, output_tensor);
+    } else if (dtype == DT_INT64) {
+      status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, long long>(
+                            context, input_tensor, &temp_tensor, output_tensor)
+                      : RingAllreduce<CPUDevice, long long>(
+                            context, input_tensor, &temp_tensor, output_tensor);
+    } else {
+      status = errors::Unknown("Invalid tensor type for MPI allreduce.");
+    }
+  } else if (response.response_type() == MPIResponse::ERROR) {
+    status = errors::FailedPrecondition(response.error_message());
+  }
+
+  if (status.ok()) {
+    callback(StatusOr<Tensor>(*output_tensor));
+  } else {
+    callback(StatusOr<Tensor>(status));
+  }
+}
+
+// The MPI background thread loop coordinates all the MPI processes and the
+// tensor reductions. The design of the communicator mechanism is limited by a
+// few considerations:
+//
+//      1. Some MPI implementations require all MPI calls to happen from a
+//      single thread. Since TensorFlow may use several threads for graph
+//      processing, this means we must have our own dedicated thread for
+//      dealing with MPI.
+//      2. We want to gracefully handle errors, when MPI processes do not
+//      properly agree upon what should happen (such as mismatched types or
+//      shapes). To do so requires the MPI processes to know about the shapes
+//      and types of the relevant tensors on the other processes.
+//      3. The MPI reductions and gathers should be able to happen in parallel
+//      with other ongoing operations. Since MPI uses an internal
+//      (inaccessible) GPU stream separate from the TF GPUDevice streams, we
+//      cannot explicitly synchronize memcpys or kernels with it. As a result,
+//      MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper
+//      ordering of memcpys and kernels with respect to TF streams.
+//      4. NOTE: We cannot guarantee that all the MPI processes reduce their
+//      tensors in the same order. Thus, there must be a way to ensure the
+//      reduction memcpys and kernels occur for correct tensors across all
+//      ranks at the same time. We choose to use a coordinator (rank ID 0) to
+//      gather and trigger the reduction operations that are ready to execute.
+//
+// The coordinator currently follows a master-worker paradigm. Rank zero acts
+// as the master (the "coordinator"), whereas all other ranks are simply
+// workers. Each rank runs its own background thread which progresses in ticks.
+// In each tick, the following actions happen:
+//
+//      a) The workers send any available MPIRequests to the coordinator. These
+//      MPIRequests indicate what the worker would like to do (i.e. which
+//      tensor they would like to gather or reduce, as well as their shape and
+//      type). They repeat this for every tensor that they would like to
+//      operate on after that tensor's collective op has executed ComputeAsync.
+//
+//      b) The workers send an empty "DONE" message to the coordinator to
+//      indicate that there are no more tensors they wish to operate on.
+//
+//      c) The coordinator receives the MPIRequests from the workers, as well
+//      as from its own TensorFlow ops, and stores them in a request table. The
+//      coordinator continues to receive MPIRequest messages until it has
+//      received MPI_SIZE number of empty "DONE" messages.
+//
+//      d) The coordinator finds all tensors that are ready to be reduced,
+//      gathered, or all operations that result in an error. For each of those,
+//      it sends an MPIResponse to all the workers. When no more MPIResponses
+//      are available, it sends a "DONE" response to the workers. If the
+//      process is being shutdown, it instead sends a "SHUTDOWN" response.
+//
+//      e) The workers listen for MPIResponse messages, processing each one by
+//      doing the required reduce or gather, until they receive a "DONE"
+//      response from the coordinator. At that point, the tick ends.
+//      If instead of "DONE" they receive "SHUTDOWN", they exit their
+//      background loop.
+// TODO: Use the global mpi_global state variable instead of a local one
+void BackgroundThreadLoop() {
+#if GOOGLE_CUDA
+  // Set the device, so that this thread uses the same GPU context as the
+  // calling thread.
+  // TODO: Ensure that this is operating correctly. The background thread
+  // needs to be able to control all GPUs that the rank has access to, and
+  // might be more than 1 GPU. Tensors could be resident in any of the
+  // GPUs, so the background thread's accumulate and copy kernels might need
+  // to correctly set the device and it might be necessary for the background
+  // thread to manage multiple streams.
+  cudaSetDevice(mpi_global.device);
+  cudaStreamCreate(&mpi_global.stream);
+#endif
+
+  // Initialize MPI. This must happen on the background thread, since not all
+  // MPI implementations support being called from multiple threads.
+  auto init_result = MPI_Init(NULL, NULL);
+  if (init_result != MPI_SUCCESS) {
+    mpi_global.init_status =
+        errors::Unknown("Could not initialize MPI; MPI_Init() failed.");
+    mpi_global.initialization_done = true;
+    mpi_global.cv.notify_all();
+    return;
+  } else {
+    mpi_global.init_status = Status::OK();
+  }
+
+  // Get MPI rank to determine if we are rank zero.
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  bool is_coordinator = rank == 0;
+
+  // Get MPI size to determine how many tensors to wait for before reducing.
+  int size;
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+  // Determine local rank by querying the local communicator.
+  MPI_Comm local_comm;
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
+                      &local_comm);
+  int local_rank;
+  MPI_Comm_rank(local_comm, &local_rank);
+
+  mpi_global.rank = rank;
+  mpi_global.local_rank = local_rank;
+  mpi_global.size = size;
+  mpi_global.initialization_done = true;
+
+  // Notify calling thread that initialization is complete
+  mpi_global.cv.notify_all();
+
+  // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD!
+  // Initialize the tensor count table. No tensors are available yet.
+  if (is_coordinator) {
+    mpi_global.message_table =
+        std::unique_ptr<MessageTable>(new MessageTable());
+  }
+
+  // The coordinator sends a SHUTDOWN message to trigger shutdown.
+  bool should_shut_down = false;
+  do {
+    // TODO: Eliminate the need for thread sleep by making all activity
+    // depend on other activity (e.g. condition or MPI waits).
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+
+    // Copy the data structures from global state under this lock.
+    // However, don't keep the lock for the rest of the loop, so that
+    // enqueued stream callbacks can continue.
+    std::queue<MPIRequest> message_queue;
+    {
+      mutex_lock guard(mpi_global.mu);
+      while (!mpi_global.message_queue.empty()) {
+        MPIRequest message = mpi_global.message_queue.front();
+        mpi_global.message_queue.pop();
+        message_queue.push(message);
+      }
+    }
+
+    // Collect all tensors that are ready to be reduced. Record them in the
+    // tensor count table (rank zero) or send them to rank zero to be
+    // recorded (everyone else).
+    std::vector<std::string> ready_to_reduce;
+    while (!message_queue.empty()) {
+      // Pop the first available message message
+      MPIRequest message = message_queue.front();
+      message_queue.pop();
+
+      if (is_coordinator) {
+        bool reduce =
+            IncrementTensorCount(mpi_global.message_table, message, size);
+        if (reduce) {
+          ready_to_reduce.push_back(message.tensor_name());
+        }
+      } else {
+        std::string encoded_message;
+        message.SerializeToString(&encoded_message);
+        MPI_Send(encoded_message.c_str(), encoded_message.length() + 1,
+                 MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
+      }
+    }
+
+    // Rank zero has put all its own tensors in the tensor count table.
+    // Now, it should count all the tensors that are coming from other
+    // ranks at this tick. It should keep getting tensors until it gets a
+    // DONE message from all the other ranks.
+    if (is_coordinator) {
+      // Count of DONE messages. Keep receiving messages until the number
+      // of messages is equal to the number of processes. Initialize to
+      // one since the coordinator is effectively done.
+      int completed_ranks = 1;
+      while (completed_ranks != size) {
+        MPI_Status status;
+        MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status);
+
+        // Find number of characters in message (including zero byte).
+        int source_rank = status.MPI_SOURCE;
+        int msg_length;
+        MPI_Get_count(&status, MPI_BYTE, &msg_length);
+
+        // If the length is zero, this is a DONE message.
+        if (msg_length == 0) {
+          completed_ranks++;
+          MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD,
+                   &status);
+          continue;
+        }
+
+        // Get tensor name from MPI into an std::string.
+        char* buffer = new char[msg_length];
+        MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY,
+                 MPI_COMM_WORLD, &status);
+        std::string received_data(buffer);
+        delete[] buffer;
+
+        MPIRequest received_message;
+        received_message.ParseFromString(received_data);
+        auto received_name = received_message.tensor_name();
+
+        bool reduce = IncrementTensorCount(mpi_global.message_table,
+                                           received_message, size);
+        if (reduce) {
+          ready_to_reduce.push_back(received_name);
+        }
+      }
+
+      // At this point, rank zero should have a fully updated tensor
+      // count table and should know all the tensors that need to be
+      // reduced or gathered, and everyone else should have sent all
+      // their information to rank zero. We can now do reductions and
+      // gathers; rank zero will choose which ones and in what order,
+      // and will notify the other ranks before doing each reduction.
+      for (int i = 0; i < ready_to_reduce.size(); i++) {
+        // Notify all nodes which tensor we'd like to reduce now
+        auto name = ready_to_reduce[i];
+        MPIResponse response =
+            ConstructMPIResponse(mpi_global.message_table, name);
+
+        std::string encoded_response;
+        response.SerializeToString(&encoded_response);
+        for (int r = 1; r < size; r++) {
+          MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
+                   MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
+        }
+
+        // Perform the reduction. All nodes should end up performing
+        // the same reduction.
+        PerformCollectiveOp(mpi_global.tensor_table, response);
+      }
+
+      // Notify all nodes that we are done with the reductions for this
+      // tick.
+      MPIResponse done_response;
+      should_shut_down = mpi_global.shut_down;
+      done_response.set_response_type(
+          mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE);
+      std::string encoded_response;
+      done_response.SerializeToString(&encoded_response);
+      for (int r = 1; r < size; r++) {
+        MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
+                 MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
+      }
+    } else {
+      // Notify the coordinator that this node is done sending messages.
+      // A DONE message is encoded as a zero-length message.
+      MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
+
+      // Receive names for tensors to reduce from rank zero. Once we
+      // receive a empty DONE message, stop waiting for more names.
+      while (true) {
+        MPI_Status status;
+        MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status);
+
+        // Find number of characters in message (including zero byte).
+        int msg_length;
+        MPI_Get_count(&status, MPI_BYTE, &msg_length);
+
+        // Get tensor name from MPI into an std::string.
+        char* buffer = new char[msg_length];
+        MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD,
+                 &status);
+        std::string received_message(buffer);
+        delete[] buffer;
+
+        MPIResponse response;
+        response.ParseFromString(received_message);
+        if (response.response_type() == MPIResponse::DONE) {
+          // No more messages this tick
+          break;
+        } else if (response.response_type() == MPIResponse::SHUTDOWN) {
+          // No more messages this tick, and the background thread
+          // should shut down
+          should_shut_down = true;
+          break;
+        } else {
+          // Process the current message
+          PerformCollectiveOp(mpi_global.tensor_table, response);
+        }
+      }
+    }
+  } while (!should_shut_down);
+
+  MPI_Finalize();
+}
+
+// Initialize MPI and start the MPI background thread. Ensure that this is
+// only done once no matter how many times this function is called.
+Status InitializeMPIOnce(bool gpu) {
+  // Ensure MPI is only initialized once.
+  if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status;
+
+  mpi_global.device = -1;
+#if GOOGLE_CUDA
+  if (gpu) {
+    cudaGetDevice(&mpi_global.device);
+  }
+#endif
+
+  // Start the MPI background thread, which assumes MPI is initialized
+  // TODO: Change this to a Tensorflow thread
+  mpi_global.background_thread = std::thread(BackgroundThreadLoop);
+
+  // Wait to ensure that the background thread has finished initializing MPI
+  mutex_lock guard(mpi_global.mu);
+  mpi_global.cv.wait(guard);
+  if (!mpi_global.initialization_done) {
+    mpi_global.init_status =
+        errors::Unknown("Failed to wait for MPI initialization.");
+  }
+
+  return mpi_global.init_status;
+}
+
+// Check that MPI is initialized.
+Status IsMPIInitialized() {
+  if (!mpi_global.initialization_done) {
+    return errors::FailedPrecondition(
+        "MPI has not been initialized; use tf.contrib.mpi.Session.");
+  }
+  return Status::OK();
+}
+
+// This function (called from the callback set up in MPIAll*Op::ComputeAsync)
+// only adds the op's record into the local op queue (to track the op's
+// progress), and sends a message to the coordinator indicating that this rank
+// is ready to begin. The MPI background thread will handle the MPI message.
+void EnqueueTensorCollective(CollectiveOpRecord record,
+                             MPIRequest::RequestType rtype) {
+  const Tensor* input_tensor = record.in_t;
+  MPIRequest message;
+  message.set_request_rank(record.rank);
+  message.set_tensor_name(record.name);
+  message.set_tensor_type(record.dtype);
+  message.set_request_type(rtype);
+  input_tensor->shape().AsProto(message.mutable_tensor_shape());
+
+  mutex_lock guard(mpi_global.mu);
+  mpi_global.tensor_table.emplace(record.name, record);
+  mpi_global.message_queue.push(message);
+}
+
+}  // namespace
+
+#if GOOGLE_CUDA
+cudaStream_t CudaStreamForMPI() { return mpi_global.stream; }
+#endif
+
+// Op to initialize MPI in the current process. The settings used in the
+// configuration are the same that must be used for all future MPI ops.
+template <typename Device>
+class MPIInitOp : public OpKernel {
+ public:
+  explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    bool on_gpu = IsGPUDevice<Device>();
+    OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU),
+                        MPIInitOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU),
+                        MPIInitOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPIInit").Doc(R"doc(
+Initialize MPI for the current process.
+
+If this is run on a GPU, then that GPU must be used for all future MPI
+operations. If it is run on CPU, then all future MPI operations must also
+run on CPU.
+)doc");
+
+// Op to get the current MPI Size.
+template <typename Device>
+class MPISizeOp : public OpKernel {
+ public:
+  explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES_OK(context, IsMPIInitialized());
+
+    // Write integer to output tensor
+    Tensor* output;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+
+    auto flat = output->flat<int>();
+    flat(0) = mpi_global.size;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU),
+                        MPISizeOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"),
+                        MPISizeOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPISize")
+    .Output("size: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the number of running MPI processes.
+
+More precisely, returns the number of MPI processes in the group associated
+with the MPI_COMM_WORLD communicator.
+
+size:   Size of the MPI group.
+)doc");
+
+// Op to get the current MPI Rank.
+template <typename Device>
+class MPIRankOp : public OpKernel {
+ public:
+  explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES_OK(context, IsMPIInitialized());
+
+    // Write integer to output tensor
+    Tensor* output;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+
+    auto flat = output->flat<int>();
+    flat(0) = mpi_global.rank;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU),
+                        MPIRankOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"),
+                        MPIRankOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPIRank")
+    .Output("rank: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the index of the current process in the MPI group.
+
+More precisely, returns the rank of the calling process in the MPI_COMM_WORLD
+communicator.
+
+rank:   Rank of the calling process.
+)doc");
+
+// Op to get the current local MPI Rank.
+template <typename Device>
+class MPILocalRankOp : public OpKernel {
+ public:
+  explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES_OK(context, IsMPIInitialized());
+
+    // Write integer to output tensor
+    Tensor* output;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+
+    auto flat = output->flat<int>();
+    flat(0) = mpi_global.local_rank;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU),
+                        MPILocalRankOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(
+    Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"),
+    MPILocalRankOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPILocalRank")
+    .Output("rank: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the index of the current process in the node it is on.
+
+More precisely, returns the rank of the calling process in communicator that
+only spans the MPI processes running on that node.
+
+rank:   Rank of the calling process on the node it is on.
+)doc");
+
+template <typename Device>
+class MPIAllreduceOp : public AsyncOpKernel {
+ public:
+  explicit MPIAllreduceOp(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {}
+
+  // Although this op is handled asynchronously, the ComputeAsync call is
+  // very inexpensive. It only sets up a CollectiveOpRecord and places it
+  // in the table for the background thread to handle. Thus, we do not need
+  // a TF pool thread to perform the op.
+  bool IsExpensive() override { return false; }
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+    OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
+    const Tensor* input_tensor = &context->input(0);
+    Tensor* output_tensor;
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_output(0, input_tensor->shape(), &output_tensor),
+        done);
+
+    // Record allocated on stack so op can fail without memory leak
+    CollectiveOpRecord record;
+    record.name = name();
+    record.context = context;
+    record.in_t = input_tensor;
+    record.out_t = output_tensor;
+    record.on_gpu = IsGPUDevice<Device>();
+    record.dtype = input_tensor->dtype();
+
+    const size_t temp_size =
+        (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size;
+    TensorShape temp_shape;
+    temp_shape.AddDim(temp_size);
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_temp(input_tensor->dtype(),
+                                                temp_shape, &record.temp_t),
+                         done);
+
+    auto allreduce_done_callback = [done, context](StatusOr<Tensor> status) {
+      context->SetStatus(status.status());
+      done();
+    };
+    record.callback = allreduce_done_callback;
+
+    auto allreduce_launch_callback = [record] {
+      EnqueueTensorCollective(record, MPIRequest::ALLREDUCE);
+    };
+
+    // If we are on a CPU, our device context will be null and we can't
+    // get a stream to enqueue this on. On a CPU this op is called when the
+    // data is already available, so we can just immediately do the
+    // allreduce; we don't have to wait for the data to get populated.
+#if GOOGLE_CUDA
+    auto device_context = context->op_device_context();
+    if (device_context == nullptr) {
+      allreduce_launch_callback();
+    } else {
+      auto stream = device_context->stream();
+      stream->ThenDoHostCallback(allreduce_launch_callback);
+    }
+#else
+    allreduce_launch_callback();
+#endif
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU),
+                        MPIAllreduceOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU),
+                        MPIAllreduceOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPIAllreduce")
+    .Attr("T: {int32, int64, float32}")
+    .Input("tensor: T")
+    .Output("sum: T")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Perform an MPI Allreduce on a tensor. All other processes that do a reduction
+on a tensor with the same name must have the same dimension for that tensor.
+Tensors are reduced with other tensors that have the same node name for the
+allreduce.
+
+Arguments
+    tensor:     A tensor to reduce.
+
+Output
+    sum:        A tensor with the same shape as `tensor`, summed across all
+                MPI processes.
+)doc");
+
+template <typename Device>
+class MPIAllgatherOp : public AsyncOpKernel {
+ public:
+  explicit MPIAllgatherOp(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {}
+
+  // Although this op is handled asynchronously, the ComputeAsync call is
+  // very inexpensive. It only sets up a CollectiveOpRecord and places it
+  // in the table for the background thread to handle. Thus, we do not need
+  // a TF pool thread to perform the op.
+  bool IsExpensive() override { return false; }
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+    OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
+    const Tensor* input_tensor = &context->input(0);
+    const Tensor* sizing_tensor = &context->input(1);
+
+    // Record allocated on stack so op can fail without memory leak
+    CollectiveOpRecord record;
+    record.name = name();
+    record.context = context;
+    record.in_t = input_tensor;
+    record.on_gpu = IsGPUDevice<Device>();
+
+    // Construct the output size from the sizing tensor
+    size_t output_first_dim = 0;
+    if (sizing_tensor->shape().dims() == 0) {
+      // 0-dim sizing_tensor implies that the op is just gathering
+      // a single element from each rank
+      output_first_dim = mpi_global.size;
+      for (int i = 0; i < mpi_global.size; i++) {
+        record.sizes_vec.push_back(1);
+      }
+    } else {
+      // Collect the total output tensor sizing from the sizing tensor
+      // NOTE: The sizing tensor is forced to be placed on the CPU by
+      // declaring the input as HostMemory, so it is valid to read it here.
+      const int64* sizing_array =
+          (const int64*)sizing_tensor->tensor_data().data();
+      for (int i = 0; i < mpi_global.size; i++) {
+        record.sizes_vec.push_back(sizing_array[i]);
+        output_first_dim += sizing_array[i];
+      }
+    }
+
+    TensorShape output_shape;
+    output_shape.AddDim(output_first_dim);
+    for (int i = 1; i < input_tensor->shape().dims(); i++) {
+      output_shape.AddDim(input_tensor->shape().dim_size(i));
+    }
+
+    Tensor* output_tensor;
+    OP_REQUIRES_OK_ASYNC(
+        context, context->allocate_output(0, output_shape, &output_tensor),
+        done);
+
+    record.out_t = output_tensor;
+    record.dtype = input_tensor->dtype();
+
+    auto allgather_done_callback = [done, context](StatusOr<Tensor> status) {
+      context->SetStatus(status.status());
+      done();
+    };
+    record.callback = allgather_done_callback;
+
+    auto allgather_launch_callback = [record] {
+      EnqueueTensorCollective(record, MPIRequest::ALLGATHER);
+    };
+
+    // If we are on a CPU, our device context will be null and we can't
+    // get a stream to enqueue this on. On a CPU this op is called when the
+    // data is already available, so we can just immediately do the
+    // allgather; we don't have to wait for the data to get populated.
+#if GOOGLE_CUDA
+    auto device_context = context->op_device_context();
+    if (device_context == nullptr) {
+      allgather_launch_callback();
+    } else {
+      auto stream = device_context->stream();
+      stream->ThenDoHostCallback(allgather_launch_callback);
+    }
+#else
+    allgather_launch_callback();
+#endif
+  }
+};
+
+REGISTER_OP("MPIAllgather")
+    .Attr("T: {int32, int64, float32}")
+    .Attr("S: {int64}")
+    .Input("tensor: T")
+    .Input("sizes: S")
+    .Output("gathered: T")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle output;
+      TF_RETURN_IF_ERROR(
+          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
+      c->set_output(0, output);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Perform an MPI Allgather on a tensor. All other processes that do a gather on a
+tensor with the same name must have the same rank for that tensor, and have the
+same dimension on all but the first dimension.
+
+Arguments
+    tensor:     A tensor to gather.
+    sizes:      A tensor containing the first-dimension sizes of tensors to be
+                gathered from other ranks
+
+Output
+    gathered:   A tensor with the same shape as `tensor` except for the first
+                dimension, which is the sum of dimensions in `sizes`.
+)doc");
+
+REGISTER_KERNEL_BUILDER(
+    Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"),
+    MPIAllgatherOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(
+    Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"),
+    MPIAllgatherOp<GPUDevice>);
+#endif
+
+}  // namespace mpi
+}  // namespace contrib
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index 5ac96007df7ee08b1e32aacd28f83768859810a9..334e70318dd88185cecd93ebeb2587861b7999b9 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -31,7 +31,7 @@ tf_custom_op_library(
         "kernels/nccl_ops.cc",
     ],
     deps = if_cuda([
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
         "//tensorflow/core:gpu_headers_lib",
     ]),
 )
@@ -52,6 +52,7 @@ tf_cuda_cc_test(
         "manual",
         "multi_gpu",
         "no_oss",
+        "noguitar",
         "notap",
     ],
     deps =
@@ -60,7 +61,7 @@ tf_cuda_cc_test(
             "//tensorflow/core:test",
             "//tensorflow/core:test_main",
             "//tensorflow/core:testlib",
-            "@nccl_archive//:nccl",
+            "@local_config_nccl//:nccl",
         ],
 )
 
@@ -79,7 +80,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:stream_executor",
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
     ],
     alwayslink = 1,
 )
@@ -136,18 +137,7 @@ cuda_py_test(
         "manual",
         "multi_gpu",
         "no_oss",
+        "noguitar",
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 913935b38246f1c5c0f7da4c1ea1f986bc00891b..b1cb89391ceaa70813be47cc1bba0c16f4f70e77 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -24,7 +24,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using se::cuda::ScopedActivateExecutorContext;
 
 // Contains data for a single stream used for nccl communication; this includes
 // a background thread that calls NcclManager::LoopKernelLaunches.
@@ -37,11 +37,11 @@ struct NcclManager::NcclStream {
     cv.notify_all();
   }
 
-  perftools::gputools::StreamExecutor* executor = nullptr;
+  se::StreamExecutor* executor = nullptr;
 
   // The stream on which to run the nccl collective.
   // This is a different stream than the tensorflow compute stream.
-  std::unique_ptr<perftools::gputools::Stream> stream;
+  std::unique_ptr<se::Stream> stream;
 
   // See NcclManager::LoopKernelLaunches for information on these.
   std::unique_ptr<Thread> thread;
@@ -76,6 +76,8 @@ struct NcclManager::Communicator {
 namespace {
 ncclDataType_t ToNcclType(DataType t) {
   switch (t) {
+    case DT_HALF:
+      return ncclHalf;
     case DT_FLOAT:
       return ncclFloat;
     case DT_DOUBLE:
@@ -93,9 +95,8 @@ ncclDataType_t ToNcclType(DataType t) {
 // A participant in a Collective.  See <Collective> below.
 struct NcclManager::Participant {
   Participant(const Tensor* in_t, Tensor* out_t, EventMgr* event_mgr,
-              perftools::gputools::Stream* tensor_stream,
-              perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-              NcclManager::DoneCallback done_callback)
+              se::Stream* tensor_stream, se::StreamExecutor* executor,
+              int gpu_device_id, NcclManager::DoneCallback done_callback)
       : in_t(in_t),
         out_t(out_t),
         event_mgr(event_mgr),
@@ -119,11 +120,11 @@ struct NcclManager::Participant {
   EventMgr* const event_mgr;
 
   // Owned by the caller, who must keep it live until <done_callback> is called.
-  perftools::gputools::Stream* const tensor_stream;
+  se::Stream* const tensor_stream;
 
   // Matches the executor in CommunicatorMember::stream. Expected to be live for
   // process lifetime.
-  perftools::gputools::StreamExecutor* const executor = nullptr;
+  se::StreamExecutor* const executor = nullptr;
 
   const int gpu_device_id;
 
@@ -243,7 +244,7 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
     if (nccl_stream == nullptr) {
       nccl_stream = new NcclStream();
       nccl_stream->executor = executor;
-      nccl_stream->stream.reset(new perftools::gputools::Stream(executor));
+      nccl_stream->stream.reset(new se::Stream(executor));
       nccl_stream->stream->Init();
 
       streams.emplace_back(nccl_stream);
@@ -298,10 +299,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
 
 void NcclManager::AddToAllReduce(int num_devices, const string& key,
                                  ncclRedOp_t reduction_op,
-                                 perftools::gputools::StreamExecutor* executor,
+                                 se::StreamExecutor* executor,
                                  int gpu_device_id, EventMgr* event_mgr,
-                                 perftools::gputools::Stream* tensor_stream,
-                                 const Tensor* in_t, Tensor* out_t,
+                                 se::Stream* tensor_stream, const Tensor* in_t,
+                                 Tensor* out_t,
                                  const DoneCallback& done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, out_t, event_mgr, tensor_stream, executor,
@@ -310,11 +311,12 @@ void NcclManager::AddToAllReduce(int num_devices, const string& key,
                  kAllReduce, reduction_op);
 }
 
-void NcclManager::AddBroadcastSend(
-    int num_devices, const string& key,
-    perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-    EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream,
-    const Tensor* in_t, DoneCallback done_callback) {
+void NcclManager::AddBroadcastSend(int num_devices, const string& key,
+                                   se::StreamExecutor* executor,
+                                   int gpu_device_id, EventMgr* event_mgr,
+                                   se::Stream* tensor_stream,
+                                   const Tensor* in_t,
+                                   DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
                       executor, gpu_device_id, std::move(done_callback)));
@@ -323,11 +325,11 @@ void NcclManager::AddBroadcastSend(
                  kBroadcast, ncclSum /* unused */);
 }
 
-void NcclManager::AddBroadcastRecv(
-    int num_devices, const string& key,
-    perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-    EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream,
-    Tensor* out_t, DoneCallback done_callback) {
+void NcclManager::AddBroadcastRecv(int num_devices, const string& key,
+                                   se::StreamExecutor* executor,
+                                   int gpu_device_id, EventMgr* event_mgr,
+                                   se::Stream* tensor_stream, Tensor* out_t,
+                                   DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream,
                       executor, gpu_device_id, std::move(done_callback)));
@@ -337,9 +339,8 @@ void NcclManager::AddBroadcastRecv(
 
 void NcclManager::AddReduceSend(int num_devices, const string& key,
                                 ncclRedOp_t reduction_op,
-                                perftools::gputools::StreamExecutor* executor,
-                                int gpu_device_id, EventMgr* event_mgr,
-                                perftools::gputools::Stream* tensor_stream,
+                                se::StreamExecutor* executor, int gpu_device_id,
+                                EventMgr* event_mgr, se::Stream* tensor_stream,
                                 const Tensor* in_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
@@ -351,9 +352,8 @@ void NcclManager::AddReduceSend(int num_devices, const string& key,
 
 void NcclManager::AddReduceRecv(int num_devices, const string& key,
                                 ncclRedOp_t reduction_op,
-                                perftools::gputools::StreamExecutor* executor,
-                                int gpu_device_id, EventMgr* event_mgr,
-                                perftools::gputools::Stream* tensor_stream,
+                                se::StreamExecutor* executor, int gpu_device_id,
+                                EventMgr* event_mgr, se::Stream* tensor_stream,
                                 const Tensor* in_t, Tensor* out_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
@@ -442,7 +442,7 @@ void NcclManager::RunCollective(const string& key, Collective* collective) {
 }
 
 void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
-  perftools::gputools::Stream* comm_stream = nccl_stream->stream.get();
+  se::Stream* comm_stream = nccl_stream->stream.get();
   ScopedActivateExecutorContext scoped_context(nccl_stream->executor);
   const cudaStream_t* cu_stream = reinterpret_cast<const cudaStream_t*>(
       comm_stream->implementation()->CudaStreamMemberHack());
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index bb219e0edc8a2c4ba0ce0583cbe4018a4fa3a1d1..57a96c5d3342f6e934e88367881388fb160dc5e3 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -55,41 +55,34 @@ class NcclManager {
   // is also the stream that will use the produced data; <done_callback> is
   // not called until the next kernel launched on <stream> would see the data.
   void AddToAllReduce(int num_devices, const string& key,
-                      ncclRedOp_t reduction_op,
-                      perftools::gputools::StreamExecutor* executor,
+                      ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                       int gpu_device_id, EventMgr* event_mgr,
-                      perftools::gputools::Stream* tensor_stream,
-                      const Tensor* in_t, Tensor* out_t,
-                      const DoneCallback& done_callback);
+                      se::Stream* tensor_stream, const Tensor* in_t,
+                      Tensor* out_t, const DoneCallback& done_callback);
 
   // AddBroadcastSend and AddBroadcastRecv combine to sent data from one sender
   // to all receivers.
   void AddBroadcastSend(int num_devices, const string& key,
-                        perftools::gputools::StreamExecutor* executor,
-                        int gpu_device_id, EventMgr* event_mgr,
-                        perftools::gputools::Stream* tensor_stream,
+                        se::StreamExecutor* executor, int gpu_device_id,
+                        EventMgr* event_mgr, se::Stream* tensor_stream,
                         const Tensor* in_t, DoneCallback done_callback);
   void AddBroadcastRecv(int num_devices, const string& key,
-                        perftools::gputools::StreamExecutor* executor,
-                        int gpu_device_id, EventMgr* event_mgr,
-                        perftools::gputools::Stream* tensor_stream,
+                        se::StreamExecutor* executor, int gpu_device_id,
+                        EventMgr* event_mgr, se::Stream* tensor_stream,
                         Tensor* out_t, DoneCallback done_callback);
 
   // AddReduceSend and AddReduceRecv combine to sent data from all senders
   // to one receiver.
   void AddReduceSend(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op,
-                     perftools::gputools::StreamExecutor* executor,
+                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                      int gpu_device_id, EventMgr* event_mgr,
-                     perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, DoneCallback done_callback);
+                     se::Stream* tensor_stream, const Tensor* in_t,
+                     DoneCallback done_callback);
   void AddReduceRecv(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op,
-                     perftools::gputools::StreamExecutor* executor,
+                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                      int gpu_device_id, EventMgr* event_mgr,
-                     perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, Tensor* out_t,
-                     DoneCallback done_callback);
+                     se::Stream* tensor_stream, const Tensor* in_t,
+                     Tensor* out_t, DoneCallback done_callback);
 
  private:
   enum CollectiveType {
@@ -123,8 +116,7 @@ class NcclManager {
   // Maps a device to the communication streams that make up its collective.
   // This is used to share the stream across different communicators that
   // include the same device.
-  std::map<perftools::gputools::StreamExecutor*,
-           std::vector<std::unique_ptr<NcclStream>>>
+  std::map<se::StreamExecutor*, std::vector<std::unique_ptr<NcclStream>>>
       device_to_comm_streams_ GUARDED_BY(mu_);
 
   std::vector<std::unique_ptr<Communicator>> communicators_;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 985b2bae2566c38dfb2c71a899e4b03bbb8fa55d..4d8d922cb42d2974dab32cf4562bee3993bef098 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -48,35 +48,9 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   return gpus;
 }
 
+template <typename Scalar>
 class NcclManagerTest : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices->empty());
-    LOG(ERROR) << "Running test with " << devices->size() << " gpus";
-  }
-  static void TearDownTestCase() {
-    for (auto device : *devices) delete device;
-    delete devices;
-  }
-
-  static Allocator* gpu_allocator(BaseGPUDevice* device) {
-    return device->GetStepAllocator(AllocatorAttributes(),
-                                    nullptr /* step_resource_manager */);
-  }
-
-  static std::vector<BaseGPUDevice*>* devices;
-
-  template <typename Scalar>
-  perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-      const Scalar* cuda_memory) {
-    perftools::gputools::DeviceMemoryBase wrapped(
-        const_cast<Scalar*>(cuda_memory));
-    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
-    return typed;
-  }
-
+ public:
   // A single all-reduce to apply.
   struct TestCase {
     string key;
@@ -89,42 +63,52 @@ class NcclManagerTest : public ::testing::Test {
     int num_completed = 0;
   };
 
+  static void SetUpTestCase() {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
+    CHECK(!devices_->empty());
+    LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
+  }
+
+  static void TearDownTestCase() {
+    for (auto device : *devices_) delete device;
+    delete devices_;
+  }
+
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
     TestCase* test_case = new TestCase();
-    test_case->expected = Tensor(DT_FLOAT, shape);
+    test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 1; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(1); });
     } else if (reduction_op == ncclSum) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 0; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(0); });
     } else if (reduction_op == ncclMax) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return -1 * std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return -max_; });
     } else if (reduction_op == ncclMin) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return max_; });
     } else {
       LOG(FATAL) << "Invalid reduction_op " << reduction_op;
     }
 
-    int mult = 1;
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
 
-      Tensor in_cpu(DT_FLOAT, shape);
-      test::FillFn<float>(&in_cpu, [mult, value_offset](int index) {
-        return value_offset + (index + 1) * mult;
+      Tensor in_cpu(data_type_, shape);
+      test::FillFn<Scalar>(&in_cpu, [&](int index) {
+        return static_cast<Scalar>((index + 1) * value_scale + value_offset);
       });
       for (int j = 0; j < shape.num_elements(); ++j) {
-        auto in_val = in_cpu.flat<float>()(j);
-        auto out_expr = test_case->expected.flat<float>();
+        auto in_val = in_cpu.flat<Scalar>()(j);
+        auto out_expr = test_case->expected.template flat<Scalar>();
         if (reduction_op == ncclProd) {
-          out_expr(j) *= in_val;
+          out_expr(j) = out_expr(j) * in_val;
         } else if (reduction_op == ncclSum) {
-          out_expr(j) += in_val;
+          out_expr(j) = out_expr(j) + in_val;
         } else if (reduction_op == ncclMax) {
           if (in_val > out_expr(j)) {
             out_expr(j) = in_val;
@@ -136,26 +120,18 @@ class NcclManagerTest : public ::testing::Test {
         }
       }
 
-      mult *= 10;
-      test_case->ins.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
-      test_case->outs.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
+      value_scale *= 10;
+      test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape);
+      test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape);
 
       const Tensor& in_gpu = test_case->ins.back();
-      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<float>().data());
-      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<float>().data(),
+      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
                          in_cpu.TotalBytes());
     }
     return test_case;
   }
 
-  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
-    return [this, test_case](Status s) {
-      mutex_lock l(test_case->mu);
-      ++test_case->num_completed;
-      test_case->final_status.Update(s);
-    };
-  }
-
   void VerifyResults(const string& case_label, TestCase* test_case) {
     // Wait for the done callback to be called.
     {
@@ -168,41 +144,82 @@ class NcclManagerTest : public ::testing::Test {
       test_case->mu.unlock();
     }
     // Copy memory to host and verify.
-    for (int i = 0; i < test_case->outs.size(); ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < test_case->outs.size(); ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
-      const Tensor& out_gpu = test_case->outs[i];
-      Tensor out_cpu(DT_FLOAT, out_gpu.shape());
-      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<float>().data());
-      stream->ThenMemcpy(out_cpu.flat<float>().data(), out_gpu_mem,
+      const Tensor& out_gpu = test_case->outs[rank];
+      Tensor out_cpu(data_type_, out_gpu.shape());
+      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorEqual<float>(test_case->expected, out_cpu);
+      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
     }
   }
+
+  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
+    return [this, test_case](Status s) {
+      mutex_lock l(test_case->mu);
+      ++test_case->num_completed;
+      test_case->final_status.Update(s);
+    };
+  }
+
+  static BaseGPUDevice* GetDevice(size_t rank) {
+    return devices_->at(rank % devices_->size());
+  }
+
+ private:
+  static Allocator* GpuAllocator(BaseGPUDevice* device) {
+    return device->GetStepAllocator(AllocatorAttributes(),
+                                    nullptr /* step_resource_manager */);
+  }
+
+  static se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+    se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
+    se::DeviceMemory<Scalar> typed(wrapped);
+    return typed;
+  }
+
+ private:
+  static std::vector<BaseGPUDevice*>* devices_;
+  static const DataType data_type_;
+  static const Scalar max_;
 };
-std::vector<BaseGPUDevice*>* NcclManagerTest::devices = nullptr;
+
+template <typename Scalar>
+std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+template <typename Scalar>
+const DataType NcclManagerTest<Scalar>::data_type_ =
+    DataTypeToEnum<Scalar>::value;
+template <typename Scalar>
+const Scalar NcclManagerTest<Scalar>::max_ =
+    Eigen::NumTraits<Scalar>::highest();
+
+// Instantiate tests for float and half.
+using TypeList = ::testing::Types<float, Eigen::half>;
+TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
-TEST_F(NcclManagerTest, BasicSumReduction) {
+TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   const int num_ranks = 3;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
-    std::unique_ptr<TestCase> test_case(
-        MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0));
-    for (int device_num = 0; device_num < num_ranks; ++device_num) {
-      auto* device = devices->at(device_num % devices->size());
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
           num_ranks, "allreduce", reduction_op, device->executor(),
-          device->gpu_id(), event_mgr, stream, &test_case->ins[device_num],
-          &test_case->outs[device_num], CreateDoneCallback(test_case.get()));
+          device->gpu_id(), event_mgr, stream, &test_case->ins[rank],
+          &test_case->outs[rank], this->CreateDoneCallback(test_case.get()));
     }
 
     LOG(ERROR) << "Verifying results";
-    VerifyResults("test_case", test_case.get());
+    this->VerifyResults("test_case", test_case.get());
   }
 }
 
@@ -213,7 +230,7 @@ TEST_F(NcclManagerTest, BasicSumReduction) {
 // with num_ranks > devices->size(), for some GPUs (e.g. K20m).
 // To test the higher settings, increase num_ranks,
 // num_collectives_per_iteration and time_limit_micros.
-TEST_F(NcclManagerTest, MultipleCallers) {
+TYPED_TEST(NcclManagerTest, MultipleCallers) {
   const int num_ranks = 1;                      // 2;
   const int num_collectives_per_iteration = 1;  // 1000;
   const int num_threads = 3;
@@ -223,49 +240,49 @@ TEST_F(NcclManagerTest, MultipleCallers) {
   srand(Env::Default()->NowMicros());
 
   for (;;) {
-    std::vector<std::pair<int, int>> case_and_device_num;
-    std::vector<std::unique_ptr<TestCase>> test_cases;
+    std::vector<std::pair<int, int>> case_and_rank;
+    std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(
-          MakeTestCase(num_ranks, ncclSum,
-                       TensorShape({100, i % 5 + 1, i % 3 + 1}), i + 0.1 * i));
+      test_cases.emplace_back(this->MakeTestCase(
+          num_ranks, ncclSum, TensorShape({100, i % 5 + 1, i % 3 + 1}),
+          1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
-        case_and_device_num.emplace_back(i, j);
+        case_and_rank.emplace_back(i, j);
       }
     }
 
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       SE_ASSERT_OK(stream->BlockHostUntilDone());
     }
 
-    std::shuffle(case_and_device_num.begin(), case_and_device_num.end(),
+    std::shuffle(case_and_rank.begin(), case_and_rank.end(),
                  std::mt19937(std::random_device()()));
 
-    mutex mu;  // guards case_and_device_num.
+    mutex mu;  // guards case_and_rank.
     std::unique_ptr<thread::ThreadPool> pool(
         new thread::ThreadPool(Env::Default(), "test", num_threads));
-    const int to_schedule = case_and_device_num.size();
+    const int to_schedule = case_and_rank.size();
     for (int i = 0; i < to_schedule; ++i) {
       auto fn = [&]() {
-        int device_num;
+        int rank;
         int test_num;
         {
           mutex_lock l(mu);
-          test_num = case_and_device_num.back().first;
-          device_num = case_and_device_num.back().second;
-          case_and_device_num.pop_back();
+          test_num = case_and_rank.back().first;
+          rank = case_and_rank.back().second;
+          case_and_rank.pop_back();
         }
-        auto* device = devices->at(device_num % devices->size());
+        auto* device = this->GetDevice(rank);
         auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
         auto* stream = device->tensorflow_gpu_device_info()->stream;
-        TestCase* test_case = test_cases[test_num].get();
+        typename TestFixture::TestCase* test_case = test_cases[test_num].get();
         NcclManager::instance()->AddToAllReduce(
             num_ranks, strings::StrCat("allreduce", test_num), ncclSum,
             device->executor(), device->gpu_id(), event_mgr, stream,
-            &test_case->ins[device_num], &test_case->outs[device_num],
-            CreateDoneCallback(test_case));
+            &test_case->ins[rank], &test_case->outs[rank],
+            this->CreateDoneCallback(test_case));
       };
       pool->Schedule(fn);
     }
@@ -274,7 +291,8 @@ TEST_F(NcclManagerTest, MultipleCallers) {
     LOG(ERROR) << "Verifying results for " << num_collectives_per_iteration
                << " collectives";
     for (int i = 0; i < test_cases.size(); ++i) {
-      VerifyResults(strings::StrCat("collective", i), test_cases[i].get());
+      this->VerifyResults(strings::StrCat("collective", i),
+                          test_cases[i].get());
     }
 
     int64 delta = Env::Default()->NowMicros() - start;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 266d4f6f0de0274dca2bfc9022503f09b0ca7d42..c2b76caef38a4af248387b65701b8f8936e8431f 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
index a4de46a93fab1dfe93b47f2789cc533bc447e43a..4676e937e56e35cdec5d2ac57fa07b7bda5fe291 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #include <forward_list>
@@ -254,7 +255,7 @@ class NcclReplacePass : public GraphOptimizationPass {
     // Find reduction and broadcast ops and replace them with Send/Recv ops.
     for (Node* node : graph->op_nodes()) {
       StringPiece type = node->type_string();
-      if (!type.starts_with("Nccl")) {
+      if (!str_util::StartsWith(type, "Nccl")) {
         continue;
       }
       if (type == "NcclReduce") {
diff --git a/tensorflow/contrib/nccl/ops/nccl_ops.cc b/tensorflow/contrib/nccl/ops/nccl_ops.cc
index 8eb804c2e988f313ba1b340217cae20f1f5502c7..a353a34b80add119fcdc8bc4230eddf5a77b30e8 100644
--- a/tensorflow/contrib/nccl/ops/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/ops/nccl_ops.cc
@@ -25,7 +25,7 @@ REGISTER_OP("NcclAllReduce")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -51,7 +51,7 @@ REGISTER_OP("NcclReduce")
     .Input("input: num_devices * T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -69,7 +69,7 @@ reduction: the reduction operation to perform.
 REGISTER_OP("_NcclReduceSend")
     .Input("input: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -92,7 +92,7 @@ REGISTER_OP("_NcclReduceRecv")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -118,7 +118,7 @@ shared_name: Identifier that is shared between ops of the same reduce.
 REGISTER_OP("NcclBroadcast")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("shape: shape")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -135,7 +135,7 @@ shape: The shape of the input tensor.
 
 REGISTER_OP("_NcclBroadcastSend")
     .Input("input: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -157,7 +157,7 @@ shared_name: Identifier that is shared between ops of the same broadcast.
 REGISTER_OP("_NcclBroadcastRecv")
     .Input("shape: int32")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops.py b/tensorflow/contrib/nccl/python/ops/nccl_ops.py
index 8dc038b9ac992de7db8b762e3697c6693099e192..794372a1f4b0dcc41bcf0da611f5bc2ec9301973 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops.py
@@ -267,5 +267,5 @@ def _check_device(tensor, expected=None):
 
 
 def _check_graph_mode():
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError('Nccl ops are not supported in eager mode')
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index 98fe394c5b38294700617591992d3207b0a4706b..423a8689aeee062fb58eaf9d6d9b980b0998754e 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -72,7 +72,7 @@ class NcclTestCase(test.TestCase):
           two.
       device_sets: Tuple of virtual devices to run test on.
     """
-    for dtype in [np.float32, np.int32, np.int64, np.float64]:
+    for dtype in [np.float16, np.float32, np.int32, np.int64, np.float64]:
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/contrib/nearest_neighbor/BUILD b/tensorflow/contrib/nearest_neighbor/BUILD
index 9500c18b1df9d772dfb827bc2b3d33e0a65974f6..6fa762446705310a60cbdd9302c1a5083b69f065 100644
--- a/tensorflow/contrib/nearest_neighbor/BUILD
+++ b/tensorflow/contrib/nearest_neighbor/BUILD
@@ -111,15 +111,3 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index 5543eb6c6e3785978e9c878f309b9bd0863b0b0a..ef7ab2264655ca0148a9c045bba04018d9599dfc 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -98,14 +98,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 63fc487dca69a4777821595a0366d0ae0b393ce2..e65925610c5f5125c2d2e92edc1cf708c54255d4 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -88,7 +88,7 @@ def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
     return math_ops.reduce_logsumexp(
         math_ops.matmul(embeddings, reweighted_inputs, transpose_b=True),
         axis=1,
-        keep_dims=False)
+        keepdims=False)
 
   # Calling this protected form of embedding_lookup allows co-locating
   # the logsumexp computation with the partitioned weights, which yields
diff --git a/tensorflow/contrib/nn/python/ops/scaled_softplus.py b/tensorflow/contrib/nn/python/ops/scaled_softplus.py
index fcbfbc239ca5b8a1d4b17b403f99b7eb05db47b0..7184ef2b66ec4662af3a37def070ab151d6e7c15 100644
--- a/tensorflow/contrib/nn/python/ops/scaled_softplus.py
+++ b/tensorflow/contrib/nn/python/ops/scaled_softplus.py
@@ -30,9 +30,7 @@ def _reduce_and_reshape_grad(g, t):
   """Returns the gradient, sum-reduced and reshaped to `t`'s shape."""
   shape = array_ops.shape(t)
   g_shape = array_ops.shape(g)
-  # pylint: disable=protected-access
-  bcast_dims, _ = gen_array_ops._broadcast_gradient_args(shape, g_shape)
-  # pylint: enable=protected-access
+  bcast_dims, _ = gen_array_ops.broadcast_gradient_args(shape, g_shape)
   return array_ops.reshape(math_ops.reduce_sum(g, bcast_dims), shape)
 
 
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 86ceda71b703a021fd3a71371b5d56ab82c42ee2..13aa1d7e7a11877373a848c1ba865aa418790cd0 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/adamax.py",
         "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
         "python/training/elastic_average_optimizer.py",
@@ -24,6 +25,7 @@ py_library(
         "python/training/multitask_optimizer_wrapper.py",
         "python/training/nadam_optimizer.py",
         "python/training/powersign.py",
+        "python/training/reg_adagrad_optimizer.py",
         "python/training/sign_decay.py",
         "python/training/variable_clipping_optimizer.py",
     ],
@@ -43,15 +45,34 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
+py_test(
+    name = "adamax_test",
+    srcs = ["python/training/adamax_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
     deps = [
         ":opt_py",
         "//tensorflow/python:array_ops",
@@ -70,7 +91,6 @@ py_test(
     srcs = ["python/training/moving_average_optimizer_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_oss",  # b/73507407
         "notsan",  # b/31055119
     ],
     deps = [
@@ -136,6 +156,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "reg_adagrad_optimizer_test",
+    srcs = ["python/training/reg_adagrad_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "nadam_optimizer_test",
     srcs = ["python/training/nadam_optimizer_test.py"],
@@ -263,14 +302,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 6c1bb1adc096f5b8e6945ea1492727d16cf29e65..4c13c8e247185213b798eb733ddcf65a07a8f64d 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.adamax import *
 from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
@@ -36,6 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
+    'AdaMaxOptimizer',
     'PowerSignOptimizer',
     'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..686bac0d8408cdb0d2937bbd1de8535e216f8bf0
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""AdaMax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import training_ops
+
+
+class AdaMaxOptimizer(adam.AdamOptimizer):
+  """Optimizer that implements the AdaMax algorithm.
+
+  Adamax is sometimes superior to adam, specially in models with embeddings,
+  see [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="AdaMax"):
+    """Construct a new AdaMax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    Contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior is contrast to the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdaMax".
+    """
+    super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2,
+                                          epsilon, use_locking, name)
+
+  def _get_beta_accumulators(self):
+    if context.executing_eagerly():
+      graph = None
+    else:
+      graph = ops.get_default_graph()
+    return self._get_non_slot_variable("beta1_power", graph=graph)
+
+  def _create_slots(self, var_list):
+    # Create the beta1 accumulators on the same device as the first
+    # variable. Sort the var_list to make sure this device is consistent across
+    # workers (these need to go on the same PS, otherwise some updates are
+    # silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power = self._get_beta_accumulators()
+    return training_ops.apply_ada_max(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power = self._get_beta_accumulators()
+    return training_ops.resource_apply_ada_max(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices,
+                           scatter_add, scatter_update):
+    beta1_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta1_t + grad * (1 - beta1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = scatter_update(m, indices, m_t_slice)
+    # u_t = max(beta2 * u, abs(g_t))
+    v = self.get_slot(var, "v")
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
+    var_slice = -lr_t / (1 - beta1_power) * (m_t_slice /
+                                             (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        lambda x, i, v: state_ops.scatter_update(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking))
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(
+        grad, var, indices,
+        self._resource_scatter_add, self._resource_scatter_update)
+
+  def _finish(self, update_ops, name_scope):
+    # Update the power accumulators.
+    with ops.control_dependencies(update_ops):
+      beta1_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        update_beta1 = beta1_power.assign(
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
+    return control_flow_ops.group(*update_ops + [update_beta1],
+                                  name=name_scope)
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc92a7006f1a0a56adafc486a75afa94e965cb2c
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -0,0 +1,348 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AdaMax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import adamax
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - ((alpha / (1 - beta1**t)) *
+                                    (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t
+
+
+class AdaMaxOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.AdaMaxOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/AdaMax:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined AdaMax1 and AdaMax2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adamax.AdaMaxOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.AdaMaxOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(5, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/addsign_test.py b/tensorflow/contrib/opt/python/training/addsign_test.py
index bd19ee3e7ac514448c6d79272abb86a154f55e9a..08d45ed73f3ae4b580d7078272e79fef22ef67c5 100644
--- a/tensorflow/contrib/opt/python/training/addsign_test.py
+++ b/tensorflow/contrib/opt/python/training/addsign_test.py
@@ -97,7 +97,7 @@ class AddSignTest(test.TestCase):
                                      global_step=global_step)
         neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
                                          global_step=global_step)
-        if context.in_graph_mode():
+        if not context.executing_eagerly():
           self.evaluate(variables.global_variables_initializer())
           # Fetch params to validate initial values
           self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -108,13 +108,13 @@ class AddSignTest(test.TestCase):
         # last 3 steps with negative gradient (sign(gm) should be -1)
         for t in range(1, 8):
           if t < 5:
-            if context.in_graph_mode():
+            if not context.executing_eagerly():
               self.evaluate(update)
             elif t > 1:
               opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
                                   global_step=global_step)
           else:
-            if context.in_graph_mode():
+            if not context.executing_eagerly():
               self.evaluate(neg_update)
             elif t > 1:
               opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
index aeca900bc8ff4c4cc26da490ce43dfec70fd9f11..72117c1e81a164b0517fabeaddec3ea5132af5a9 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
@@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
     epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
     lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
 
-    # m := beta1 * m + (1 - beta1) * g_t
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
     m = self.get_slot(var, "m")
     m_t = state_ops.scatter_update(m, grad.indices,
                                    beta1_t * array_ops.gather(m, grad.indices) +
                                    (1 - beta1_t) * grad.values,
                                    use_locking=self._use_locking)
 
-    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
     v = self.get_slot(var, "v")
     v_t = state_ops.scatter_update(v, grad.indices,
                                    beta2_t * array_ops.gather(v, grad.indices) +
                                    (1 - beta2_t) * math_ops.square(grad.values),
                                    use_locking=self._use_locking)
 
-    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
     m_t_slice = array_ops.gather(m_t, grad.indices)
     v_t_slice = array_ops.gather(v_t, grad.indices)
     denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index 85e3e8d3791f2331ed249c0b7f67a3dbde4fca08..ac04ad99110b016b62e091aa10c7f565e5093bc1 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -85,7 +85,7 @@ class MovingAverageOptimizerTest(test.TestCase):
               state_ops.assign_add(ema_var1, [4.0, 4.0])
           ])
 
-          # Test taht saver with missing ema variables will fail.
+          # Test that saver with missing ema variables will fail.
           with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
             opt.swapping_saver(var_list=[var0])
 
@@ -123,7 +123,7 @@ class MovingAverageOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
             self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
             self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previou state.
+            # Restore back to previous state.
             train_saver.restore(sess, save_path)
 
           # If updates are parallel, this is not always true after the 1st step.
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
index cb6c77a86feedde3285d75092511c8eb1e63b2a5..9076cc9d128552e37c09852ab2f24aa0c9977892 100644
--- a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
@@ -22,6 +22,7 @@ import types
 import six
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -40,8 +41,10 @@ def _get_wrapper(fn, opt):
 
   def wrapper(self, grad, *args, **kwargs):  # pylint: disable=unused-argument
     all_zeros = _is_all_zeros(grad)
-    return control_flow_ops.cond(all_zeros, control_flow_ops.no_op,
-                                 lambda: fn(grad, *args, **kwargs))
+    def call_fn():
+      with ops.control_dependencies([fn(grad, *args, **kwargs)]):
+        return control_flow_ops.no_op()
+    return control_flow_ops.cond(all_zeros, control_flow_ops.no_op, call_fn)
 
   wrapper = types.MethodType(wrapper, opt)
   return wrapper
diff --git a/tensorflow/contrib/opt/python/training/powersign_test.py b/tensorflow/contrib/opt/python/training/powersign_test.py
index ff7b1a72d47d8ef54980905323bcaf358c988a82..5214082dd66f00eadadad71d50f7e00b178b8c10 100644
--- a/tensorflow/contrib/opt/python/training/powersign_test.py
+++ b/tensorflow/contrib/opt/python/training/powersign_test.py
@@ -99,7 +99,7 @@ class PowerSignTest(test.TestCase):
         neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
                                          global_step=global_step)
 
-        if context.in_graph_mode():
+        if not context.executing_eagerly():
           self.evaluate(variables.global_variables_initializer())
           # Fetch params to validate initial values
           self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -110,13 +110,13 @@ class PowerSignTest(test.TestCase):
         # last 3 steps with negative gradient (sign(gm) should be -1)
         for t in range(1, 8):
           if t < 5:
-            if context.in_graph_mode():
+            if not context.executing_eagerly():
               self.evaluate(update)
             elif t > 1:
               opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
                                   global_step=global_step)
           else:
-            if context.in_graph_mode():
+            if not context.executing_eagerly():
               self.evaluate(neg_update)
             elif t > 1:
               opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0e0405a2c3e5ec05cf487a2ca48207b7a9d4663
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py
@@ -0,0 +1,107 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RegAdagrad for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import training_ops
+from tensorflow.python.util import tf_contextlib
+
+
+class RegAdagradOptimizer(adagrad.AdagradOptimizer):
+  """RegAdagrad: Adagrad with updates that optionally skip updating the slots.
+
+  This is meant to address the problem of additional regularization terms in the
+  loss function affecting learning rate decay and causing hyper-param
+  entanglement. Example usage:
+
+    loss = tf.nn.cross_entropy(x, labels)
+    reg_loss = reg_strength * tf.reduce_sum(x * x)
+    opt = tf.contrib.opt.RegAdagradOptimizer(learning_rate)
+    loss_update = opt.minimize(loss)
+    with opt.avoid_updating_slots():
+      reg_update = opt.minimize(reg_loss)
+    total_update = tf.group([loss_update, reg_update])
+
+    # ...
+
+    sess.run(total_update, ...)
+  """
+
+  def __init__(self,
+               learning_rate,
+               initial_accumulator_value=0.1,
+               use_locking=False,
+               name="RegAdagrad"):
+    super(RegAdagradOptimizer, self).__init__(
+        learning_rate,
+        initial_accumulator_value=initial_accumulator_value,
+        use_locking=use_locking,
+        name=name)
+    self._should_update_slots = True
+
+  @tf_contextlib.contextmanager
+  def avoid_updating_slots(self):
+    old = self._should_update_slots
+    self._should_update_slots = False
+    try:
+      yield
+    finally:
+      self._should_update_slots = old
+
+  def _apply_dense(self, grad, var):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.apply_adagrad(
+        var,
+        acc,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
+
+  def _resource_apply_dense(self, grad, var, update_slots=True):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.resource_apply_adagrad(
+        var.handle,
+        acc.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
+
+  def _apply_sparse(self, grad, var, update_slots=True):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_adagrad(
+        var,
+        acc,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
+
+  def _resource_apply_sparse(self, grad, var, indices, update_slots=True):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.resource_sparse_apply_adagrad(
+        var.handle,
+        acc.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea56e1646a0811ab065105cd260a760b5b718354
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
@@ -0,0 +1,343 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for Regreg_adagrad_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import reg_adagrad_optimizer
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class RegAdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_locking=False, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=0.1, use_locking=use_locking)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_locking=False)
+
+  def testBasicResource(self):
+    self.doTestBasic(use_locking=False, use_resource=True)
+
+  def testBasicLocked(self):
+    self.doTestBasic(use_locking=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = reg_adagrad_optimizer.RegAdagradOptimizer(1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
+                                           var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            constant_op.constant(3.0), initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[3.0], [3.715679168701172]]), var1.eval())
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]), constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant([0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        repeated_update = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0).apply_gradients([(grad_repeated_index,
+                                   repeated_index_update_var)])
+        aggregated_update = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0).apply_gradients([(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = reg_adagrad_optimizer.RegAdagradOptimizer(
+            2.0).minimize(loss_repeated)
+        update_op_aggregated = reg_adagrad_optimizer.RegAdagradOptimizer(
+            2.0).minimize(loss_aggregated)
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(var_repeated.eval(),
+                                           var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(var_repeated.eval(),
+                                             var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        shape = [1, 6]
+        var0 = variables.Variable(
+            [[
+                0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
+                -0.0105945
+            ]],
+            dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[
+                    -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
+                    -8.4877e-05, -9.48906e-05
+                ]],
+                shape=shape,
+                dtype=dtype), constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            1.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(3.0)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+        # Validate updated params (the same as with only 1 RegAdagrad).
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testDynamicShapeVariable_Ok(self):
+    with self.test_session():
+      v = variable_scope.get_variable(
+          "v", initializer=constant_op.constant(1.), validate_shape=False)
+      self.assertFalse(v.shape.is_fully_defined())
+      # Creating optimizer should cause no exception.
+      reg_adagrad_optimizer.RegAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1)
+
+  def testSkipUpdatingSlots(self):
+    iav = 0.130005  # A value that works with float16
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=iav)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        with ada_opt.avoid_updating_slots():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        for _ in range(3):
+          ada_update.run()
+        # Validate that ada_opt's slots are not updated.
+        self.assertAllCloseAccordingToType(np.array([iav, iav]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([iav, iav]), slot1.eval())
+
+  def testSparseSkipUpdatingSlots(self):
+    iav = 0.130005  # A value that works with float16
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=iav)
+        with ada_opt.avoid_updating_slots():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+        # Validate that ada_opt's slots are not updated.
+        self.assertAllCloseAccordingToType(
+            np.array([[iav], [iav]]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[iav], [iav]]), slot1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/variable_clipping_optimizer.py b/tensorflow/contrib/opt/python/training/variable_clipping_optimizer.py
index 74036082f0ca2bae23b30deb1b1986befd6601d8..3c0b8394be51e8744b5461a00a99ead5e45d90b2 100644
--- a/tensorflow/contrib/opt/python/training/variable_clipping_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/variable_clipping_optimizer.py
@@ -109,7 +109,7 @@ class VariableClippingOptimizer(optimizer.Optimizer):
 
   def _clip_dense(self, var):
     with self._maybe_colocate_with(var):
-      updated_var_value = var._ref()  # pylint: disable=protected-access
+      updated_var_value = var.read_value()
       normalized_var = clip_ops.clip_by_norm(
           updated_var_value, self._max_norm, self._vars_to_clip_dims[var])
       delta = updated_var_value - normalized_var
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5225ecc14fef3cec9506eceb776805b74a87714e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -0,0 +1,206 @@
+# Prototype of OptimizerV2.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "optimizer_v2_py",
+    srcs = ["optimizer_v2_symbols.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":training",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "training",
+    srcs = [
+        "adadelta.py",
+        "adagrad.py",
+        "adam.py",
+        "gradient_descent.py",
+        "momentum.py",
+        "optimizer_v2.py",
+        "rmsprop.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["adadelta_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adagrad_test",
+    size = "small",
+    srcs = ["adagrad_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["adam_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["checkpointable_utils_test.py"],
+    additional_deps = [
+        ":training",
+        "@six_archive//:six",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras",
+    ],
+    tags = ["notsan"],
+)
+
+cuda_py_test(
+    name = "gradient_descent_test",
+    size = "medium",
+    srcs = ["gradient_descent_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "momentum_test",
+    size = "medium",
+    srcs = ["momentum_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    size = "medium",
+    srcs = ["optimizer_v2_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "small",
+    srcs = ["rmsprop_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+    tags = ["optonly"],
+)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta.py b/tensorflow/contrib/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..b206f9f61bd56581e5105b2bc635c69abbc9af4c
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adadelta.py
@@ -0,0 +1,113 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class AdadeltaOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adadelta algorithm.
+
+  See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+  ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+  """
+
+  def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-8,
+               use_locking=False, name="Adadelta"):
+    """Construct a new Adadelta optimizer.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+        To match the exact form in the original paper use 1.0.
+      rho: A float hyperparameter. The decay rate.
+      epsilon: A float hyperparameter. A constant epsilon used to better
+        condition the grad update.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adadelta".
+    """
+    super(AdadeltaOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("rho", rho)
+    self._set_hyper("epsilon", epsilon)
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "accum")
+      state.zeros_slot(v, "accum_update")
+
+  def _apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.sparse_apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta_test.py b/tensorflow/contrib/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31cfec0d50d691cb9e618400fa4b37708a8a3ba2
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adadelta_test.py
@@ -0,0 +1,167 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adadelta
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          with self.test_session():
+            var0_init = [1.0, 2.0]
+            var1_init = [3.0, 4.0]
+            if use_resource:
+              var0 = resource_variable_ops.ResourceVariable(
+                  var0_init, dtype=dtype)
+              var1 = resource_variable_ops.ResourceVariable(
+                  var1_init, dtype=dtype)
+            else:
+              var0 = variables.Variable(var0_init, dtype=dtype)
+              var1 = variables.Variable(var1_init, dtype=dtype)
+
+            grads = constant_op.constant([grad, grad], dtype=dtype)
+
+            accum = 0.0
+            accum_update = 0.0
+
+            # ADADELTA gradient optimizer
+            rho = 0.95
+            epsilon = 1e-8
+            adadelta_opt = adadelta.AdadeltaOptimizer(lr, rho, epsilon)
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+
+            opt_vars = adadelta_opt.variables()
+            self.assertStartsWith(opt_vars[0].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[1].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[2].name, var1._shared_name)
+            self.assertStartsWith(opt_vars[3].name, var1._shared_name)
+            self.assertEqual(4, len(opt_vars))
+
+            variables.global_variables_initializer().run()
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            self.assertEqual(["accum", "accum_update"],
+                             adadelta_opt.get_slot_names())
+            slot[0] = adadelta_opt.get_slot(var0, "accum")
+            self.assertEquals(slot[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot[0] in variables.trainable_variables())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_update")
+            self.assertEquals(slot_update[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot_update[0] in variables.trainable_variables())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum")
+            self.assertEquals(slot[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot[1] in variables.trainable_variables())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_update")
+            self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot_update[1] in variables.trainable_variables())
+
+            # Fetch params to validate initial values
+            self.assertAllClose(var0_init, var0.eval())
+            self.assertAllClose(var1_init, var1.eval())
+
+            update = [None] * num_updates
+            tot_update = 0
+            for step in range(num_updates):
+              # Run adadelta update for comparison
+              adadelta_update.run()
+
+              # Perform initial update without previous accum values
+              accum = accum * rho + (grad**2) * (1 - rho)
+              update[step] = (np.sqrt(accum_update + epsilon) *
+                              (1. / np.sqrt(accum + epsilon)) * grad)
+              accum_update = (accum_update * rho + (update[step]**2) *
+                              (1.0 - rho))
+              tot_update += update[step] * lr
+
+              # Check that the accumulators have been updated
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    slot[slot_idx].eval(),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    slot_update[slot_idx].eval(),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var0.eval(),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var1.eval(),
+                  rtol=1e-5)
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.AdadeltaOptimizer(
+            1.0, 1.0, 1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adagrad.py b/tensorflow/contrib/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..c333d1e089047e707c3f13acddae268d935b2b3e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adagrad.py
@@ -0,0 +1,118 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class AdagradOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adagrad algorithm.
+
+  See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  or this
+  [intro](http://cs.stanford.edu/~ppasupat/a9online/uploads/proximal_notes.pdf).
+  """
+
+  def __init__(self, learning_rate, initial_accumulator_value=0.1,
+               use_locking=False, name="Adagrad"):
+    """Construct a new Adagrad optimizer.
+
+    The learning_rate arg below is a hyperparameter, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adagrad".
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` is invalid.
+    """
+    if initial_accumulator_value <= 0.0:
+      raise ValueError("initial_accumulator_value must be positive: %s" %
+                       initial_accumulator_value)
+    super(AdagradOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+
+    self._initial_accumulator_value = initial_accumulator_value
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      # TODO(isaprykin): Delete colocate_with(v) from other optimizers and
+      # confirm that colocation will happen anyway.
+      dtype = v.dtype.base_dtype
+      if v.get_shape().is_fully_defined():
+        init = init_ops.constant_initializer(self._initial_accumulator_value,
+                                             dtype=dtype)
+      else:
+        # Use a Tensor instead of initializer if variable does not have static
+        # shape.
+        init_constant = gen_array_ops.fill(
+            array_ops.shape(v), self._initial_accumulator_value)
+        init = math_ops.cast(init_constant, dtype)
+      state.create_slot_with_initializer(v, init, v.get_shape(), dtype,
+                                         "accumulator")
+
+  def _apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_sparse_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/adagrad_test.py b/tensorflow/contrib/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18191c3ef2cb78f63b6558c289b36b6107b6c171
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adagrad_test.py
@@ -0,0 +1,282 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_locking=False, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(
+            3.0, initial_accumulator_value=0.1, use_locking=use_locking)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_locking=False)
+
+  def testBasicResource(self):
+    self.doTestBasic(use_locking=False, use_resource=True)
+
+  def testBasicLocked(self):
+    self.doTestBasic(use_locking=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(
+            constant_op.constant(3.0), initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        ada_opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[3.0], [3.715679168701172]]), var1.eval())
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.AdagradOptimizer(
+            2.0).minimize(loss_repeated)
+        update_op_aggregated = adagrad.AdagradOptimizer(
+            2.0).minimize(loss_aggregated)
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        shape = [1, 6]
+        var0 = variables.Variable(
+            [[
+                0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
+                -0.0105945
+            ]],
+            dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[
+                    -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
+                    -8.4877e-05, -9.48906e-05
+                ]],
+                shape=shape,
+                dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.AdagradOptimizer(1.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(3.0)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+        # Validate updated params (the same as with only 1 Adagrad).
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testDynamicShapeVariable_Ok(self):
+    with self.test_session():
+      v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
+                                      validate_shape=False)
+      self.assertFalse(v.shape.is_fully_defined())
+      # Creating optimizer should cause no exception.
+      adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..d538ad0fb02699ed8514f512208914f629a47436
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -0,0 +1,198 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adam optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class AdamOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adam algorithm.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="Adam"):
+    """Construct a new Adam optimizer.
+
+    Initialization:
+
+    $$m_0 := 0 (Initialize initial 1st moment vector)$$
+    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
+    $$t := 0 (Initialize timestep)$$
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section2 of the paper:
+
+    $$t := t + 1$$
+    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Some of the args below are hyperparameters where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      beta1: A float hyperparameter. The exponential decay rate for the 1st
+        moment estimates.
+      beta2: A float hyperparameter. The exponential decay rate for the 2nd
+        moment estimates.
+      epsilon: A float hyperparameter. This epsilon is "epsilon hat" in the
+        Kingma and Ba paper (in the formula just before Section 2.1), not the
+        epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".
+    """
+    super(AdamOptimizer, self).__init__(use_locking, name)
+
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("beta1", beta1)
+    self._set_hyper("beta2", beta2)
+    self._set_hyper("epsilon", epsilon)
+
+  def _get_beta_accumulators(self, state=None):
+    if state is None:
+      state = self._get_per_graph_state()
+    return (state.get_non_slot("beta1_power"),
+            state.get_non_slot("beta2_power"))
+
+  def _create_vars(self, var_list, state):
+    # Non-slot variables end up on the same device(s).
+    state.create_non_slot(initial_value=state.get_hyper("beta1"),
+                          name="beta1_power")
+    state.create_non_slot(initial_value=state.get_hyper("beta2"),
+                          name="beta2_power")
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      state.zeros_slot(v, "m")
+      state.zeros_slot(v, "v")
+
+  def _apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    return training_ops.apply_adam(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("beta1", var.dtype.base_dtype),
+        state.get_hyper("beta2", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    return training_ops.resource_apply_adam(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        state.get_hyper("learning_rate", grad.dtype.base_dtype),
+        state.get_hyper("beta1", grad.dtype.base_dtype),
+        state.get_hyper("beta2", grad.dtype.base_dtype),
+        state.get_hyper("epsilon", grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
+    beta1_t = state.get_hyper("beta1", var.dtype.base_dtype)
+    beta2_t = state.get_hyper("beta2", var.dtype.base_dtype)
+    epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = state.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t,
+                           use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = state.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(var,
+                                      lr * m_t / (v_sqrt + epsilon_t),
+                                      use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var, state):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        state)
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    return self._apply_sparse_shared(
+        grad, var, indices, self._resource_scatter_add, state)
+
+  def _finish(self, state):
+    # Update the power accumulators.
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    update_beta1 = beta1_power.assign(
+        beta1_power * state.get_hyper("beta1"),
+        use_locking=self._use_locking)
+    update_beta2 = beta2_power.assign(
+        beta2_power * state.get_hyper("beta2"),
+        use_locking=self._use_locking)
+    return control_flow_ops.group(update_beta1, update_beta2)
diff --git a/tensorflow/contrib/optimizer_v2/adam_test.py b/tensorflow/contrib/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9ad58b0a607ecef1df097c8858b074361e7892b
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adam_test.py
@@ -0,0 +1,333 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adam
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.AdamOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.AdamOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.AdamOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.AdamOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.AdamOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adam.AdamOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.AdamOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(6, len(set(opt.variables())))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e2858d00ff192e56680b288651975410c63c539
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -0,0 +1,729 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(josh11b): Forked from contrib/eager/python to test OptimizerV2 the same way
+# OptimizerV1 is tested. This file should be removed once the fork is resolved.
+
+import functools
+import os
+
+import six
+
+from tensorflow.contrib.optimizer_v2 import adam
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training import saver as core_saver
+from tensorflow.python.training import training_util
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class _MirroringSaveable(
+    core_saver.BaseSaverBuilder.ResourceVariableSaveable):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    super(_MirroringSaveable, self).__init__(
+        self._primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph = (
+        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step:0",
+        named_variables["optimizer_step" + suffix].name)
+    self.assertEqual(
+        "my_model/dense_1/kernel:0",
+        named_variables["model/_second/kernel" + suffix].name)
+    self.assertEqual(
+        "my_model/dense/kernel:0",
+        named_variables["model/_named_dense/kernel" + suffix].name)
+    self.assertEqual(
+        "beta1_power:0",
+        named_variables["optimizer/beta1_power" + suffix].name)
+    self.assertEqual(
+        "beta2_power:0",
+        named_variables["optimizer/beta2_power" + suffix].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=named_variables["model/_named_dense/kernel" + suffix],
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(core_saver.latest_checkpoint(checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = checkpointable.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = checkpointable.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = core_saver.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  # TODO(allenl): Test for the core name-based saver loading object-based
+  # checkpoints once object-based checkpointing is in core.
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        save_path = root.save(
+            session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      save_path = root.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent.py b/tensorflow/contrib/optimizer_v2/gradient_descent.py
new file mode 100644
index 0000000000000000000000000000000000000000..945c8de5595394341077ae13cae3161c71ad4f98
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent.py
@@ -0,0 +1,69 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""GradientDescent optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class GradientDescentOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the gradient descent algorithm."""
+
+  def __init__(self, learning_rate, use_locking=False, name="GradientDescent"):
+    """Construct a new gradient descent optimizer.
+
+    The learning rate arg below is a hyperparameter where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate to use.
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "GradientDescent".
+    """
+    super(GradientDescentOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+
+  def _apply_dense(self, grad, var, state):
+    return training_ops.apply_gradient_descent(
+        var,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, handle, state):
+    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+    return training_ops.resource_apply_gradient_descent(
+        handle.handle, lr, grad, use_locking=self._use_locking)
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+    return resource_variable_ops.resource_scatter_add(
+        handle.handle, indices, -grad * lr)
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    delta = ops.IndexedSlices(
+        grad.values * state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.indices, grad.dense_shape)
+    return var.scatter_sub(delta, use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent_test.py b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad9aef804fb250395d0c42fcd145f8a1707237d0
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for GradientDescent optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import gradient_descent
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class GradientDescentOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        optimizer = gradient_descent.GradientDescentOptimizer(3.0)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertEqual(0, len(optimizer.variables()))
+
+  def testBasicResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testMinimizeResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(var0, x) + var1
+        loss = pred * pred
+        sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        pred += var1
+        loss = pred * pred
+        sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        sgd_op = gradient_descent.GradientDescentOptimizer(
+            lrate).apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        opt = gradient_descent.GradientDescentOptimizer(3.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        variables.global_variables_initializer().run()
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params and global_step
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/momentum.py b/tensorflow/contrib/optimizer_v2/momentum.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a5aadc2d13074cec440a7b508be56bd195d7517
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/momentum.py
@@ -0,0 +1,124 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Momentum for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class MomentumOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Momentum algorithm.
+
+  Computes (if `use_nesterov = False`):
+
+  ```
+  accumulation = momentum * accumulation + gradient
+  variable -= learning_rate * accumulation
+  ```
+
+  Note that in the dense version of this algorithm, `accumulation` is updated
+  and applied regardless of a gradient's value, whereas the sparse version (when
+  the gradient is an `IndexedSlices`, typically because of `tf.gather` or an
+  embedding) only updates variable slices and corresponding `accumulation` terms
+  when that part of the variable was used in the forward pass.
+  """
+
+  def __init__(self, learning_rate, momentum,
+               use_locking=False, name="Momentum", use_nesterov=False):
+    """Construct a new Momentum optimizer.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      momentum: A float hyperparameter. The momentum.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Momentum".
+      use_nesterov: If `True` use Nesterov Momentum.
+        See [Sutskever et al., 2013](
+        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+        This implementation always computes gradients at the value of the
+        variable(s) passed to the optimizer. Using Nesterov Momentum makes the
+        variable(s) track the values called `theta_t + mu*v_t` in the paper.
+
+    @compatibility(eager)
+    When eager execution is enabled, learning_rate and momentum can each be a
+    callable that takes no arguments and returns the actual value to use. This
+    can be useful for changing these values across different invocations of
+    optimizer functions.
+    @end_compatibility
+    """
+    super(MomentumOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("momentum", momentum)
+    self._use_nesterov = use_nesterov
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.apply_momentum(
+        var,
+        mom,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.resource_apply_momentum(
+        var.handle,
+        mom.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
+
+  def _apply_sparse(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.sparse_apply_momentum(
+        var,
+        mom,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_momentum(
+        var.handle,
+        mom.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
diff --git a/tensorflow/contrib/optimizer_v2/momentum_test.py b/tensorflow/contrib/optimizer_v2/momentum_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..26724f66c2a1db1d01577b31b739af18f51d3976
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/momentum_test.py
@@ -0,0 +1,582 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Momentum."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.optimizer_v2 import momentum as momentum_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class MomentumOptimizerTest(test.TestCase):
+
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    var = var + accum * lr * momentum
+    accum = accum * momentum + g
+    var = var - lr * accum
+    var = var - accum * lr * momentum
+    return var, accum
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, name="var1_%d" % i)
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      learning_rate = lambda: 2.0
+      momentum = lambda: 0.9
+      if not use_callable_params:
+        learning_rate = learning_rate()
+        momentum = momentum()
+      mom_opt = momentum_lib.MomentumOptimizer(
+          learning_rate=learning_rate, momentum=momentum)
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Check we have slots
+      self.assertEqual(["momentum"], mom_opt.get_slot_names())
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEquals(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEquals(slot1.get_shape(), var1.get_shape())
+      if not context.executing_eagerly():
+        self.assertFalse(slot0 in variables.trainable_variables())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      if not context.executing_eagerly():
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+                                         self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(np.array([0.01, 0.01]),
+                                         self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the momentum accumulators contain the previous update.
+      if context.executing_eagerly():
+        mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      else:
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+          self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                  (0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testVariablesAcrossGraphs(self):
+    optimizer = momentum_lib.MomentumOptimizer(0.01, 0.5)
+    with ops.Graph().as_default():
+      var0 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var0")
+      var1 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var1")
+      if context.executing_eagerly():
+        loss = lambda: math_ops.reduce_sum(var0 + var1)
+      else:
+        loss = math_ops.reduce_sum(var0 + var1)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var0")
+      self.assertStartsWith(optimizer_variables[1].name, "var1")
+      self.assertEquals(2, len(optimizer_variables))
+
+    with ops.Graph().as_default():
+      var2 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var2")
+      var3 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var3")
+      if context.executing_eagerly():
+        loss = lambda: math_ops.reduce_sum(var2 + var3)
+      else:
+        loss = math_ops.reduce_sum(var2 + var3)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var2")
+      self.assertStartsWith(optimizer_variables[1].name, "var3")
+      self.assertEquals(2, len(optimizer_variables))
+
+  def testNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        cost = 5 * var0 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name="global_step")
+        mom_op = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+        opt_op = mom_op.minimize(cost, global_step, [var0, var1])
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  def testSparseNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+        x_feed = array_ops.placeholder(dtype)
+        y_feed = ops.IndexedSlices(
+            x_feed, constant_op.constant([0, 1]), constant_op.constant([2]))
+        grads_and_vars = [(y_feed, var0), (constant_op.constant(
+            [3.0, 3.0], dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss)
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
+
+  def testTensorLearningRateAndMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=constant_op.constant(2.0),
+            momentum=constant_op.constant(0.9))
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertFalse(slot0 in variables.trainable_variables())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+  def _dbParamsMom01(self):
+    """Return dist-belief momentum values.
+
+    Return values been generated from the dist-belief momentum unittest,
+    running with a learning rate of 0.1 and a momentum of 0.1.
+
+    These values record how a parameter vector of size 10, initialized with 0.0,
+    gets updated with 10 consecutive momentum steps.  It uses random gradients.
+
+    Returns:
+      db_grad: The gradients to apply
+      db_out: The parameters after the momentum update.
+    """
+    db_grad = [[]] * 10
+    db_out = [[]] * 10
+    # pylint: disable=line-too-long
+    db_grad[0] = [
+        0.00096264342, 0.17914793, 0.93945462, 0.41396621, 0.53037018,
+        0.93197989, 0.78648776, 0.50036013, 0.55345792, 0.96722615
+    ]
+    db_out[0] = [
+        -9.6264346e-05, -0.017914793, -0.093945466, -0.041396622, -0.053037018,
+        -0.093197994, -0.078648776, -0.050036013, -0.055345792, -0.096722618
+    ]
+    db_grad[1] = [
+        0.17075552, 0.88821375, 0.20873757, 0.25236958, 0.57578111, 0.15312378,
+        0.5513742, 0.94687688, 0.16012503, 0.22159521
+    ]
+    db_out[1] = [
+        -0.017181443, -0.10852765, -0.12421377, -0.070773244, -0.11591884,
+        -0.11783017, -0.14165108, -0.14972731, -0.076892875, -0.1285544
+    ]
+    db_grad[2] = [
+        0.35077485, 0.47304362, 0.44412705, 0.44368884, 0.078527533, 0.81223965,
+        0.31168157, 0.43203235, 0.16792089, 0.24644311
+    ]
+    db_out[2] = [
+        -0.053967446, -0.1648933, -0.1716533, -0.1180798, -0.13005978,
+        -0.20151734, -0.17911947, -0.20289968, -0.095839672, -0.15638189
+    ]
+    db_grad[3] = [
+        0.9694621, 0.75035888, 0.28171822, 0.83813518, 0.53807181, 0.3728098,
+        0.81454384, 0.03848977, 0.89759839, 0.93665648
+    ]
+    db_out[3] = [
+        -0.15459226, -0.24556576, -0.20456907, -0.20662397, -0.18528105,
+        -0.24716705, -0.2643207, -0.21206589, -0.18749419, -0.2528303
+    ]
+    db_grad[4] = [
+        0.38578293, 0.8536852, 0.88722926, 0.66276771, 0.13678469, 0.94036359,
+        0.69107032, 0.81897682, 0.5433259, 0.67860287
+    ]
+    db_out[4] = [
+        -0.20323303, -0.33900154, -0.29658359, -0.28175515, -0.20448165,
+        -0.34576839, -0.34194785, -0.29488021, -0.25099224, -0.33033544
+    ]
+    db_grad[5] = [
+        0.27885768, 0.76100707, 0.24625534, 0.81354135, 0.18959245, 0.48038563,
+        0.84163809, 0.41172323, 0.83259648, 0.44941229
+    ]
+    db_out[5] = [
+        -0.23598288, -0.42444581, -0.33041057, -0.3706224, -0.22536094,
+        -0.40366709, -0.43387437, -0.34433398, -0.34060168, -0.38302717
+    ]
+    db_grad[6] = [
+        0.27233034, 0.056316052, 0.5039115, 0.24105175, 0.35697976, 0.75913221,
+        0.73577434, 0.16014607, 0.57500273, 0.071136251
+    ]
+    db_out[6] = [
+        -0.26649091, -0.43862185, -0.38418442, -0.40361428, -0.26314685,
+        -0.48537019, -0.51664448, -0.36529395, -0.40706289, -0.39540997
+    ]
+    db_grad[7] = [
+        0.58697265, 0.2494842, 0.08106143, 0.39954534, 0.15892942, 0.12683646,
+        0.74053431, 0.16033, 0.66625422, 0.73515922
+    ]
+    db_out[7] = [
+        -0.32823896, -0.46498787, -0.39766794, -0.446868, -0.28281838,
+        -0.50622416, -0.59897494, -0.38342294, -0.48033443, -0.47016418
+    ]
+    db_grad[8] = [
+        0.8215279, 0.41994119, 0.95172721, 0.68000203, 0.79439718, 0.43384039,
+        0.55561525, 0.22567581, 0.93331909, 0.29438227
+    ]
+    db_out[8] = [
+        -0.41656655, -0.50961858, -0.49418902, -0.51919359, -0.36422527,
+        -0.55169362, -0.6627695, -0.40780342, -0.58099347, -0.50707781
+    ]
+    db_grad[9] = [
+        0.68297005, 0.67758518, 0.1748755, 0.13266537, 0.70697063, 0.055731893,
+        0.68593478, 0.50580865, 0.12602448, 0.093537711
+    ]
+    db_out[9] = [
+        -0.49369633, -0.58184016, -0.52132869, -0.5396927, -0.44306302,
+        -0.56181377, -0.73774242, -0.46082234, -0.60366184, -0.52012295
+    ]
+    # pylint: enable=line-too-long
+    return db_grad, db_out
+
+  def testLikeDistBeliefMom01(self):
+    with self.test_session():
+      db_grad, db_out = self._dbParamsMom01()
+      num_samples = len(db_grad)
+      var0 = variables.Variable([0.0] * num_samples)
+      grads0 = constant_op.constant([0.0] * num_samples)
+      mom_opt = momentum_lib.MomentumOptimizer(learning_rate=0.1, momentum=0.1)
+      mom_update = mom_opt.apply_gradients(zip([grads0], [var0]))
+      variables.global_variables_initializer().run()
+      for i in xrange(num_samples):
+        mom_update.run(feed_dict={grads0: db_grad[i]})
+        self.assertAllClose(np.array(db_out[i]), var0.eval())
+
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.1, .1]], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([4, 2]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.01, .01], [.01, .01]], dtype=dtype),
+            constant_op.constant([2, 3]),
+            constant_op.constant([4, 2]))
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([0, 0], var0.eval()[0])
+        self.assertAllClose([0, 0], var0.eval()[1])
+        self.assertAllClose([1, 1], var1.eval()[2])
+
+        # Step 1: the momentum accumulators are 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]), slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
+                    (0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval()[2])
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9)
+        mom_update1 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        mom_update2 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update1.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the second momentum accumulators contain the previous update.
+        mom_update2.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..46bfbb729fa9cdfc98f4228f516a7c5c42f23059
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -0,0 +1,1337 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Version 2 of class Optimizer."""
+# pylint: disable=g-bad-name
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training import slot_creator
+from tensorflow.python.util import nest
+
+
+class _OptimizableVariable(object):
+  """Interface for abstracting over variables in the optimizers."""
+
+  @abc.abstractmethod
+  def target(self):
+    """Returns the optimization target for this variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+  @abc.abstractmethod
+  def update_op(self, optimizer, g, *args):
+    """Returns the update ops for updating the variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+
+class _RefVariableProcessor(_OptimizableVariable):
+  """Processor for Variable."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v._ref()  # pylint: disable=protected-access
+
+  def update_op(self, optimizer, g, *args):
+    if isinstance(g, ops.Tensor):
+      update_op = optimizer._apply_dense(g, self._v, *args)  # pylint: disable=protected-access
+      if self._v.constraint is not None:
+        with ops.control_dependencies([update_op]):
+          return self._v.assign(self._v.constraint(self._v))
+      else:
+        return update_op
+    else:
+      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
+                                                "tensor nor IndexedSlices.")
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v, *args)
+
+
+class _DenseReadResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    update_op = optimizer._resource_apply_dense(g, self._v.op.inputs[0], *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _DenseResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    if isinstance(g, ops.IndexedSlices):
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      return optimizer._resource_apply_sparse_duplicate_indices(
+          g.values, self._v, g.indices, *args)
+    update_op = optimizer._resource_apply_dense(g, self._v, *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _TensorProcessor(_OptimizableVariable):
+  """Processor for ordinary Tensors.
+
+  Even though a Tensor can't really be updated, sometimes it is useful to
+  compute the gradients with respect to a Tensor using the optimizer. Updating
+  the Tensor is, of course, unsupported.
+  """
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    raise NotImplementedError("Trying to update a Tensor ", self._v)
+
+
+def _get_processor(v):
+  """The processor of v."""
+  if context.executing_eagerly():
+    if isinstance(v, ops.Tensor):
+      return _TensorProcessor(v)
+    else:
+      return _DenseResourceVariableProcessor(v)
+  if v.op.type == "VarHandleOp":
+    return _DenseResourceVariableProcessor(v)
+  if isinstance(v, variables.Variable):
+    return _RefVariableProcessor(v)
+  if isinstance(v, ops.Tensor):
+    return _TensorProcessor(v)
+  raise NotImplementedError("Trying to optimize unsupported type ", v)
+
+
+def _var_key_v2(var):
+  """Key for representing a primary variable, for looking up slots."""
+  # pylint: disable=protected-access
+  if hasattr(var, "_mirrored_container"):
+    mirrored_container = var._mirrored_container()
+    assert mirrored_container is not None
+    if context.executing_eagerly():
+      return mirrored_container._unique_id
+    return mirrored_container._shared_name
+  if context.executing_eagerly():
+    return var._unique_id
+  return var.op.name
+
+
+def _resolve(value, name):
+  if callable(value):
+    value = value()
+  return ops.convert_to_tensor(value, name=name)
+
+
+def _is_dynamic(value):
+  """Returns true if __init__ arg `value` should be re-evaluated each step."""
+  if callable(value): return True
+  # Don't need to do anything special in graph mode, since dynamic values
+  # will propagate correctly automatically.
+  # TODO(josh11b): Add per-device caching across steps using variables for
+  # truly static values once we add distributed support.
+  if context.executing_eagerly() and isinstance(
+      value, resource_variable_ops.ResourceVariable):
+    return True
+  return False
+
+
+class _OptimizerV2State(object):
+  """Holds per-graph and per-step optimizer state.
+
+  Use _init_with_static_hyper() to create the state for a graph, and then
+  _copy_with_dynamic_hyper() to convert that to state for a particular step.
+  The difference between the two is that the former only has hyper
+  parameter values that are static and the latter also has values that
+  can change every step (according to _is_dynamic()).
+  """
+
+  def __init__(self, op_name):
+    self._op_name = op_name
+
+  def _init_with_static_hyper(self, hyper):
+    """Initialize a fresh state object from hyper dict."""
+    # self._hyper contains a dict from name to a dict with the Tensor values.
+    # This dict starts with a single item with key "None" with the hyper
+    # parameter value converted to a Tensor. Other items have dtype keys
+    # with that Tensor cast to that dtype.
+    self._hyper = {name: {None: ops.convert_to_tensor(value, name=name)}
+                   for name, (dynamic, value) in hyper.items() if not dynamic}
+    self._slots = {}
+    self._non_slot_dict = {}
+    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # about variables which will be restored as soon as they're created.
+    self._deferred_dependencies = {}  # Non-slot variables
+    self._deferred_slot_restorations = {}  # Slot variables
+
+  def _copy_with_dynamic_hyper(self, hyper, distribution, non_slot_devices):
+    """Create a new state object for a particular step."""
+    ret = _OptimizerV2State(self._op_name)
+    # pylint: disable=protected-access
+    ret._slots = self._slots
+    ret._non_slot_dict = self._non_slot_dict
+    ret._deferred_dependencies = self._deferred_dependencies
+    ret._deferred_slot_restorations = self._deferred_slot_restorations
+    ret._hyper = {name: {None: _resolve(value, name)}
+                  for name, (dynamic, value) in hyper.items() if dynamic}
+    ret._hyper.update(self._hyper)
+    ret._non_slot_devices = non_slot_devices
+    ret._distribution = distribution
+    return ret
+
+  def _variables(self):
+    """Returns a list of all variables held by self."""
+    optimizer_variables = list(self._non_slot_dict.values())
+    for variable_dict in self._slots.values():
+      for slot_for_variable in variable_dict.values():
+        optimizer_variables.append(slot_for_variable)
+    # Sort variables by name so that the return is deterministic.
+    return sorted(optimizer_variables, key=lambda v: v.name)
+
+  def _slot_dict(self, slot_name):
+    """Returns a dict for caching slots created under the given name.
+
+    Args:
+      slot_name: Name for the slot.
+
+    Returns:
+      A dict that maps primary `Variable` objects to the slot created
+      for that variable, under the given slot name.
+    """
+    named_slots = self._slots.get(slot_name, None)
+    if named_slots is None:
+      named_slots = {}
+      self._slots[slot_name] = named_slots
+    return named_slots
+
+  def create_slot(self, var, val, slot_name, optional_op_name=None):
+    """Find or create a slot for a variable.
+
+    Args:
+      var: A `Variable` object.
+      val: A `Tensor`.  The initial value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot(
+          var, val, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def create_slot_with_initializer(self, var, initializer, shape, dtype,
+                                   slot_name, optional_op_name=None):
+    """Find or create a slot for a variable, using an Initializer.
+
+    Args:
+      var: A `Variable` object.
+      initializer: An `Initializer`.  The initial value of the slot.
+      shape: Shape of the initial value of the slot.
+      dtype: Type of the value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot_with_initializer(
+          var, initializer, shape, dtype, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def zeros_slot(self, var, slot_name, optional_op_name=None):
+    """Find or create a slot initialized with 0.0.
+
+    Args:
+      var: A `Variable` object.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_zeros_slot(
+          var, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable,
+      optional_op_name=None):
+    """Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored. When executing eagerly, we create the slot variable with a
+    restoring initializer.
+
+    No new variables are created when graph building. Instead,
+    _restore_slot_variable catches these after normal creation and adds restore
+    ops to the graph. This method is nonetheless important when graph building
+    for the case when a slot variable has already been created but `variable`
+    has just been added to a dependency graph (causing us to realize that the
+    slot variable needs to be restored).
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+    """
+    slot_variable = self.get_slot(var=variable, name=slot_name)
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()):
+      initializer = checkpointable.CheckpointInitialValue(
+          checkpoint_position=slot_variable_position)
+      slot_variable = self.create_slot(
+          var=variable,
+          val=initializer,
+          slot_name=slot_name,
+          optional_op_name=optional_op_name)
+      # Optimizers do not have unconditional dependencies on their slot
+      # variables (nor do any other objects). They are only saved if the
+      # variables they were created for are also saved.
+    if slot_variable is not None:
+      # If we've either made this slot variable, or if we've pulled out an
+      # existing slot variable, we should restore it.
+      slot_variable_position.restore(slot_variable)
+    else:
+      # We didn't make the slot variable. Defer restoring until it gets created
+      # normally. We keep a list rather than the one with the highest restore
+      # UID in case slot variables have their own dependencies, in which case
+      # those could differ between restores.
+      variable_key = _var_key_v2(variable)
+      self._deferred_slot_restorations.setdefault(
+          slot_name, {}).setdefault(variable_key, []).append(
+              slot_variable_position)
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    named_slots = self._slots.get(name, None)
+    if not named_slots:
+      return None
+    return named_slots.get(_var_key_v2(var), None)
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    return sorted(self._slots.keys())
+
+  def create_non_slot(self, initial_value, name, colocate_with=None):
+    """Add an extra variable, not associated with a slot."""
+    v = self._non_slot_dict.get(name, None)
+    if v is None:
+      if colocate_with is None: colocate_with = self._non_slot_devices
+      with self._distribution.colocate_vars_with(colocate_with):
+        # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
+        v = variable_scope.variable(initial_value, name=name, trainable=False)
+      self._non_slot_dict[name] = v
+      deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
+      for checkpoint_position in sorted(
+          deferred_dependencies_list,
+          key=lambda restore: restore.checkpoint.restore_uid,
+          reverse=True):
+        checkpoint_position.restore(v)
+    return v
+
+  def _restore_slot_variable(self, slot_name, variable, slot_variable):
+    """Restore a newly created slot variable's value."""
+    variable_key = _var_key_v2(variable)
+    deferred_restorations = self._deferred_slot_restorations.get(
+        slot_name, {}).pop(variable_key, [])
+    # Iterate over restores, highest restore UID first to minimize the number
+    # of assignments.
+    deferred_restorations.sort(key=lambda position: position.restore_uid,
+                               reverse=True)
+    for checkpoint_position in deferred_restorations:
+      checkpoint_position.restore(slot_variable)
+
+  def get_non_slot(self, name):
+    """Returns the non-slot variable identified by `name`."""
+    return self._non_slot_dict.get(name, None)
+
+  def get_hyper(self, name, dtype=None):
+    """Returns the `name` hyper parameter, optionally cast to `dtype`."""
+    dtype_dict = self._hyper[name]
+    # Do we have the value cast to dtype already cached? This should always
+    # succeed when dtype is None.
+    if dtype in dtype_dict:
+      return dtype_dict[dtype]
+    # Not cached, cast to dtype and save the result in the cache.
+    result = math_ops.cast(dtype_dict[None], dtype)
+    dtype_dict[dtype] = result
+    return result
+
+
+class OptimizerV2(optimizer_v1.Optimizer):
+  """Updated base class for optimizers.
+
+  This class defines the API to add Ops to train a model.  You never use this
+  class directly, but instead instantiate one of its subclasses such as
+  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+
+  ### Usage
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+  # Add Ops to the graph to minimize a cost by updating a list of variables.
+  # "cost" is a Tensor, and the list of variables contains tf.Variable
+  # objects.
+  opt_op = opt.minimize(cost, var_list=<list of variables>)
+  ```
+
+  In the training program you will just have to run the returned Op.
+
+  ```python
+  # Execute opt_op to do one step of training:
+  opt_op.run()
+  ```
+
+  ### Processing gradients before applying them.
+
+  Calling `minimize()` takes care of both computing the gradients and
+  applying them to the variables.  If you want to process the gradients
+  before applying them you can instead use the optimizer in three steps:
+
+  1.  Compute the gradients with `compute_gradients()`.
+  2.  Process the gradients as you wish.
+  3.  Apply the processed gradients with `apply_gradients()`.
+
+  Example:
+
+  ```python
+  # Create an optimizer.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+
+  # Compute the gradients for a list of variables.
+  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+
+  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
+  # need to the 'gradient' part, for example cap them, etc.
+  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
+
+  # Ask the optimizer to apply the capped gradients.
+  opt.apply_gradients(capped_grads_and_vars)
+  ```
+
+  ### Gating Gradients
+
+  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
+  argument that controls the degree of parallelism during the application of
+  the gradients.
+
+  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
+
+  <b>`GATE_NONE`</b>: Compute and apply gradients in parallel.  This provides
+  the maximum parallelism in execution, at the cost of some non-reproducibility
+  in the results.  For example the two gradients of `matmul` depend on the input
+  values: With `GATE_NONE` one of the gradients could be applied to one of the
+  inputs _before_ the other gradient is computed resulting in non-reproducible
+  results.
+
+  <b>`GATE_OP`</b>: For each Op, make sure all gradients are computed before
+  they are used.  This prevents race conditions for Ops that generate gradients
+  for multiple inputs where the gradients depend on the inputs.
+
+  <b>`GATE_GRAPH`</b>: Make sure all gradients for all variables are computed
+  before any one of them is used.  This provides the least parallelism but can
+  be useful if you want to process all gradients before applying any of them.
+
+  ### Slots
+
+  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
+  allocate and manage additional variables associated with the variables to
+  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
+  optimizer for the names of the slots that it uses.  Once you have a slot name
+  you can ask the optimizer for the variable it created to hold the slot value.
+
+  This can be useful if you want to log debug a training algorithm, report stats
+  about the slots, etc.
+
+  ### Non-slot variables
+
+  Some optimizer subclasses, such as `AdamOptimizer` have variables that
+  are not associated with the variables to train, just the step itself.
+
+  ### Hyper parameters
+
+  These are arguments passed to the optimizer subclass constructor
+  (the `__init__` method), and then passed to `self._set_hyper()`.
+  They can be either regular Python values (like 1.0), tensors, or
+  callables. If they are callable, the callable will be called during
+  `apply_gradients()` to get the value for the hyper parameter.
+
+  ### State
+
+  Internal methods are passed a `state` argument with the correct
+  values to use for the slot and non-slot variables, and the hyper
+  parameters.
+  """
+
+  # Values for gate_gradients.
+  GATE_NONE = 0
+  GATE_OP = 1
+  GATE_GRAPH = 2
+
+  def __init__(self, use_locking, name):
+    """Create a new Optimizer.
+
+    This must be called by the constructors of subclasses.
+    Note that Optimizer instances should not bind to a single graph,
+    and so shouldn't keep Tensors as member variables. Generally
+    you should be able to use the _set_hyper()/state.get_hyper()
+    facility instead.
+
+    Args:
+      use_locking: Bool. If True apply use locks to prevent concurrent updates
+        to variables.
+      name: A non-empty string.  The name to use for accumulators created
+        for the optimizer.
+
+    Raises:
+      ValueError: If name is malformed.
+      RuntimeError: If _create_slots has been overridden instead of
+          _create_vars.
+    """
+    # Note: We intentionally don't call parent __init__.
+
+    # Optimizer._create_slots was replaced by _create_vars in OptimizerV2.
+    if (self.__class__._create_slots.__code__ is not  # pylint: disable=protected-access
+        OptimizerV2._create_slots.__code__):
+      raise RuntimeError("Override _create_vars instead of _create_slots when "
+                         "descending from OptimizerV2 (class %s)" %
+                         self.__class__.__name__)
+    if not name:
+      raise ValueError("Must specify the optimizer name")
+
+    self._use_locking = use_locking
+    self._name = name
+    # Map from graph_key to state for that graph. We use the graph_key
+    # since it works in both eager and graph mode, and gives the outer
+    # graph inside functions.
+    tower_context = distribute_lib.get_tower_context()
+    if tower_context is None:
+      # In a cross-tower context for a DistributionStrategy, which means
+      # only one Optimizer will be created, not one per tower.
+      self._per_graph_state = {}
+    else:
+      # We use get_tower_context().merge_call() to get a single dict
+      # shared across all model replicas when running with a
+      # DistributionStrategy.
+      self._per_graph_state = tower_context.merge_call(lambda _: {})
+
+    # Hyper parameters, and whether they should be re-evaluated every step.
+    self._hyper = {}
+
+  def _set_hyper(self, name, value):
+    self._hyper[name] = (_is_dynamic(value), value)
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=GATE_OP, aggregation_method=None,
+               colocate_gradients_with_ops=False, name=None,
+               grad_loss=None, stop_gradients=None,
+               scale_loss_by_num_towers=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      var_list: Optional list or tuple of `Variable` objects to update to
+        minimize `loss`.  Defaults to the list of variables collected in
+        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+
+    @compatibility(eager)
+    When eager execution is enabled, `loss` should be a Python function that
+    takes elements of `var_list` as arguments and computes the value to be
+    minimized. If `var_list` is None, `loss` should take no arguments.
+    Minimization (and gradient computation) is done with respect to the
+    elements of `var_list` if not None, else with respect to any trainable
+    variables created during the execution of the `loss` function.
+    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
+    `grad_loss` are ignored when eager execution is enabled.
+    @end_compatibility
+    """
+    grads_and_vars = self.compute_gradients(
+        loss, var_list=var_list, gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss, stop_gradients=stop_gradients,
+        scale_loss_by_num_towers=scale_loss_by_num_towers)
+
+    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
+    if not vars_with_grad:
+      raise ValueError(
+          "No gradients provided for any variable, check your graph for ops"
+          " that do not support gradients, between variables %s and loss %s." %
+          ([str(v) for _, v in grads_and_vars], loss))
+
+    return self.apply_gradients(grads_and_vars, global_step=global_step,
+                                name=name)
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None, stop_gradients=None,
+                        scale_loss_by_num_towers=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking
+        no arguments which returns the value to minimize. When eager execution
+        is enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
+
+    @compatibility(eager)
+    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
+    and `colocate_gradients_with_ops` are ignored.
+    @end_compatibility
+    """
+    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        if var_list is not None:
+          tape.watch(var_list)
+        loss_value = loss()
+
+        # Scale loss for number of towers (callable-loss case). In this case,
+        # we have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        if scale_loss_by_num_towers is None:
+          scale_loss_by_num_towers = (
+              distribute_lib.get_loss_reduction() == "mean")
+        if scale_loss_by_num_towers:
+          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          if num_towers > 1:
+            loss_value *= 1. / num_towers
+
+      if var_list is None:
+        var_list = tape.watched_variables()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+      return list(zip(grads, var_list))
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "`loss` passed to Optimizer.compute_gradients should "
+          "be a function when eager execution is enabled.")
+
+    # Scale loss for number of towers (non-callable-loss case).
+    if scale_loss_by_num_towers is None:
+      scale_loss_by_num_towers = (
+          distribute_lib.get_loss_reduction() == "mean")
+    if scale_loss_by_num_towers:
+      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      if num_towers > 1:
+        loss *= 1. / num_towers
+
+    if gate_gradients not in [optimizer_v1.Optimizer.GATE_NONE,
+                              optimizer_v1.Optimizer.GATE_OP,
+                              optimizer_v1.Optimizer.GATE_GRAPH]:
+      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
+                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
+                       gate_gradients)
+    self._assert_valid_dtypes([loss])
+    if grad_loss is not None:
+      self._assert_valid_dtypes([grad_loss])
+    if var_list is None:
+      var_list = (
+          variables.trainable_variables() +
+          ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+    else:
+      var_list = nest.flatten(var_list)
+    # pylint: disable=protected-access
+    var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
+    # pylint: enable=protected-access
+    processors = [_get_processor(v) for v in var_list]
+    if not var_list:
+      raise ValueError("No variables to optimize.")
+    var_refs = [p.target() for p in processors]
+    grads = gradients.gradients(
+        loss, var_refs, grad_ys=grad_loss,
+        gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        stop_gradients=stop_gradients)
+    if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
+      grads = control_flow_ops.tuple(grads)
+    grads_and_vars = list(zip(grads, var_list))
+    self._assert_valid_dtypes(
+        [v for g, v in grads_and_vars
+         if g is not None and v.dtype != dtypes.resource])
+    return grads_and_vars
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    # This is a default implementation of apply_gradients() that can be shared
+    # by most optimizers.  It relies on the subclass implementing the following
+    # methods: _create_vars(), _prepare(), _apply_dense(), and _apply_sparse().
+
+    # Filter out variables with gradients of `None`.
+    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
+    if not grads_and_vars:
+      raise ValueError("No variables provided.")
+    filtered = tuple((g, v) for (g, v) in grads_and_vars if g is not None)
+    if not filtered:
+      raise ValueError("No gradients provided for any variable: %s." %
+                       ([str(v) for _, v in grads_and_vars],))
+    return distribute_lib.get_tower_context().merge_call(
+        self._distributed_apply, filtered, global_step=global_step, name=name)
+
+  def _get_or_create_state(self, var_list=None):
+    """Either looks up or creates `_OptimizerV2State`.
+
+    If any variables are available, they should be passed via the `var_list`
+    argument, and these will be used to determine the graph to create/retrieve
+    state for. Otherwise the returned state is for the current default graph.
+
+    Args:
+      var_list: A list of variables to extract a graph from.
+
+    Returns:
+      An `_OptimizerV2State` object.
+    """
+    # Determine the graph_key from the current graph.
+    eager_execution = context.executing_eagerly()
+    if eager_execution or var_list is None:
+      graph = ops.get_default_graph()
+    else:
+      graph = ops._get_graph_from_inputs(var_list)  # pylint: disable=protected-access
+    assert graph is not None
+    graph_key = graph._graph_key  # pylint: disable=protected-access
+
+    # Get the per graph state by looking up the graph_key.
+    if graph_key in self._per_graph_state:
+      per_graph_state = self._per_graph_state[graph_key]
+    else:
+      per_graph_state = _OptimizerV2State(self._name)
+      per_graph_state._init_with_static_hyper(self._hyper)  # pylint: disable=protected-access
+      self._per_graph_state[graph_key] = per_graph_state
+    return per_graph_state
+
+  def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
+    """`apply_gradients` for use with a `DistributionStrategy`."""
+    reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+
+    unwrapped_var_list = [x for v in var_list for x in distribution.unwrap(v)]
+    eager_execution = context.executing_eagerly()
+    if eager_execution:
+      # Give a clear error in this case instead of "name not supported
+      # for Eager Tensors" when we compute non_slot_devices.
+      for v in unwrapped_var_list:
+        if isinstance(v, ops.Tensor):
+          raise NotImplementedError("Trying to update a Tensor ", v)
+
+    with ops.name_scope(name, self._name) as name:
+      per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
+      # Include the current value of any dynamic hyper parameters in `state`.
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
+          self._hyper, distribution, non_slot_devices)
+
+    # Create any slot and non-slot variables we need in `state`.
+    with ops.init_scope():
+      self._create_vars(var_list, state)
+
+    with ops.name_scope(name):  # Re-enter name_scope created above
+      # Give the child class a chance to do something before we start
+      # applying gradients.
+      self._prepare(state)
+
+      def update(v, g):
+        """Update variable `v` using gradient `g`."""
+        assert v is not None
+
+        # Convert the grad to Tensor or IndexedSlices if necessary, and
+        # look up a processor for each variable's type.
+        try:
+          g = ops.convert_to_tensor_or_indexed_slices(g)
+        except TypeError:
+          raise TypeError(
+              "Gradient must be convertible to a Tensor"
+              " or IndexedSlices, or None: %s" % g)
+        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+          raise TypeError(
+              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+        processor = _get_processor(v)
+
+        # We colocate all ops created in _apply_dense or _apply_sparse
+        # on the same device as the variable.
+        # TODO(apassos): figure out how to get the variable name here.
+        scope_name = "" if eager_execution else v.op.name
+        # device_policy is set because non-mirrored tensors will be read in
+        # `update_op`.
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with ops.name_scope("update_" + scope_name), \
+            context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return processor.update_op(self, g, state)
+
+      # Use the processors to update the variables.
+      update_ops = []
+      for grad, var in grads_and_vars:
+        update_ops.extend(distribution.unwrap(distribution.update(
+            var, update, grad)))
+
+      # Give the child class a chance to do something after applying
+      # gradients
+      def finish():
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return self._finish(state)
+
+      update_ops = control_flow_ops.group(update_ops)
+      with ops.control_dependencies([update_ops]):
+        finish_updates = distribution.update_non_slot(non_slot_devices, finish)
+      if finish_updates is None:
+        finish_updates = update_ops
+
+      # Update `global_step` (if any).
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(distribution.unwrap(finish_updates)):
+
+          def update_global_step(global_step):
+            if isinstance(global_step, resource_variable_ops.ResourceVariable):
+              return global_step.assign_add(
+                  ops.convert_to_tensor(1, dtype=global_step.dtype),
+                  read_value=False)
+            else:
+              return state_ops.assign_add(global_step, 1)
+
+          apply_updates = distribution.group(
+              distribution.update(global_step, update_global_step), name=name)
+
+      # Add the training op to the TRAIN_OP graph collection in graph mode.
+      if not eager_execution:
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    state = self._get_state_for_var(var)
+    return state.get_slot(var, name) if state is not None else None
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    state = self._get_per_graph_state()
+    return state.get_slot_names() if state is not None else []
+
+  def variables(self):
+    """A list of variables which encode the current state of `Optimizer`.
+
+    Includes slot variables and additional global variables created by the
+    optimizer in the current default graph.
+
+    Returns:
+      A list of variables.
+    """
+    state = self._get_per_graph_state()
+    return state._variables() if state is not None else []  # pylint: disable=protected-access
+
+  # --------------
+  # Methods to be implemented by subclasses if they want to use the
+  # inherited implementation of apply_gradients() or compute_gradients().
+  # --------------
+  def _create_vars(self, var_list, state):
+    """Create all slots needed by the variables and any non-slot variables.
+
+    Args:
+      var_list: A list of `Variable` objects.
+      state: An object with these methods:
+        `create_slot(var, val, slot_name, optional_op_name)`,
+        `create_slot_with_initializer(`
+            `var, initializer, shape, dtype, slot_name, optional_op_name)`,
+        `zeros_slot(var, slot_name, optional_op_name)`,
+        `create_non_slot_variable(initial_value, name, colocate_with)`,
+        `get_hyper(name)`
+    """
+    # No slots needed by default
+    pass
+
+  def _prepare(self, state):
+    """Code to execute before applying gradients.
+
+    Note that most uses of _prepare() in Optimizer have been subsumed
+    by explicit support for hyper parameters in OptimizerV2
+
+    Args:
+      state: An object with a `get_hyper(name)` method.
+
+    Returns:
+      Return value will be ignored.
+    """
+    pass
+
+  def _apply_dense(self, grad, var, state):
+    """Add ops to apply dense gradients to `var`.
+
+    Args:
+      grad: A `Tensor`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_dense(self, grad, handle, state):
+    """Add ops to apply dense gradients to the variable `handle`.
+
+    Args:
+      grad: a `Tensor` representing the gradient.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to `handle`, with repeated indices.
+
+    Optimizers which override this method must deal with repeated indices. See
+    the docstring of `_apply_sparse_duplicate_indices` for details. By default
+    the correct behavior, to sum non-unique indices and their associated
+    gradients, is enforced by first pre-processing `grad` and `indices` and
+    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
+    with duplicate indices may instead override this method to avoid the
+    overhead of summing.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices may be repeated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    # pylint: disable=protected-access
+    summed_grad, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad, indices=indices)
+    # pylint: enable=protected-access
+    return self._resource_apply_sparse(
+        summed_grad, handle, unique_indices, state)
+
+  def _resource_apply_sparse(self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to the variable `handle`.
+
+    Similar to `_apply_sparse`, the `indices` argument to this method has been
+    de-duplicated. Optimizers which deal correctly with non-unique indices may
+    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
+    overhead.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices are unique.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.
+
+    Optimizers which override this method must deal with IndexedSlices objects
+    such as the following:
+
+      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])
+
+    The correct interpretation is:
+
+      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])
+
+    Many optimizers deal incorrectly with repeated indices when updating based
+    on sparse gradients (e.g. summing squares rather than squaring the sum, or
+    applying momentum terms multiple times). Adding first is always the correct
+    behavior, so this is enforced here by reconstructing the IndexedSlices to
+    have only unique indices, then calling _apply_sparse.
+
+    Optimizers which deal correctly with repeated indices may instead override
+    this method to avoid the overhead of summing indices.
+
+    Args:
+      grad: `IndexedSlices`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    # pylint: disable=protected-access
+    summed_values, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad.values, indices=grad.indices)
+    # pylint: enable=protected-access
+    gradient_no_duplicate_indices = ops.IndexedSlices(
+        indices=unique_indices,
+        values=summed_values,
+        dense_shape=grad.dense_shape)
+    return self._apply_sparse(gradient_no_duplicate_indices, var, state)
+
+  def _apply_sparse(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`.
+
+    The IndexedSlices object passed to `grad` in this function is by default
+    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
+    indices (see its docstring for details). Optimizers which can tolerate or
+    have correct special cases for duplicate sparse indices may override
+    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
+    overhead.
+
+    Args:
+      grad: `IndexedSlices`, with no repeated indices.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _finish(self, state):
+    """Do what is needed to finish the update.
+
+    This is called inside a scope colocated with any non-slot variables.
+
+    Args:
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      The operation to apply updates, or None if no updates.
+    """
+    return None
+
+  # --------------
+  # Utility methods for subclasses.
+  # --------------
+  def _get_per_graph_state(self):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(ops.get_default_graph()._graph_key, None)
+
+  def _get_state_for_var(self, var):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(var._graph_key, None)
+
+  # --------------
+  # Overridden methods from Checkpointable.
+  # --------------
+
+  def _track_checkpointable(self, *args, **kwargs):
+    """Optimizers may not track dependencies. Raises an error."""
+    raise NotImplementedError(
+        "Optimizers may not have dependencies. File a feature request if this "
+        "limitation bothers you.")
+
+  @property
+  def _checkpoint_dependencies(self):
+    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    current_graph_non_slot_variables = []
+    state = self._get_per_graph_state()
+    if state is not None:
+      for name, variable_object in sorted(
+          state._non_slot_dict.items(),  # pylint: disable=protected-access
+          # Avoid comparing variables
+          key=lambda item: item[0]):
+        current_graph_non_slot_variables.append(
+            checkpointable.CheckpointableReference(
+                name=name, ref=variable_object))
+    # Note: ignores super(); Optimizers may not have any dependencies outside of
+    # state objects.
+    return current_graph_non_slot_variables
+
+  def _lookup_dependency(self, name):
+    """From Checkpointable. Find a non-slot variable in the current graph."""
+    state = self._get_per_graph_state()
+    if state is None:
+      return None
+    else:
+      return state.get_non_slot(name)
+
+  @property
+  def _deferred_dependencies(self):
+    """Lets Checkpointable know where non-slot variables are created.
+
+    If necessary, creates a new state object for the current default graph.
+    Checkpointable will then add entries to that state's deferred dependency
+    dictionary. The state object will check that dictionary when creating
+    non-slot variables, restoring their value if an entry is found.
+
+    Returns:
+      A dictionary which holds deferred dependencies for the current default
+      graph.
+    """
+    state = self._get_or_create_state()
+    return state._deferred_dependencies  # pylint: disable=protected-access
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable):
+    """Checkpointable: Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored.
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+    """
+    state = self._get_or_create_state(var_list=[variable])
+    state._create_or_restore_slot_variable(  # pylint: disable=protected-access
+        slot_variable_position=slot_variable_position,
+        slot_name=slot_name,
+        variable=variable,
+        optional_op_name=self._name)
+
+  # --------------
+  # Unsupported parent methods
+  # --------------
+  def _slot_dict(self, slot_name):
+    raise NotImplementedError(
+        "_slot_dict() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot(self, var, val, slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot_with_initializer(self, var, initializer, shape, dtype,
+                                         slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot_with_initializer() method unsupported in "
+        "OptimizerV2")
+
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    raise NotImplementedError(
+        "_create_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _get_non_slot_variable(self, name, graph=None):
+    raise NotImplementedError(
+        "_get_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _non_slot_variables(self):
+    raise NotImplementedError(
+        "_non_slot_variables() method unsupported in OptimizerV2")
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eada06ccdd68090f44c62646040fcd7d659727
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py
@@ -0,0 +1,42 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution-aware version of Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.optimizer_v2.adadelta import AdadeltaOptimizer
+from tensorflow.contrib.optimizer_v2.adagrad import AdagradOptimizer
+from tensorflow.contrib.optimizer_v2.adam import AdamOptimizer
+from tensorflow.contrib.optimizer_v2.gradient_descent import GradientDescentOptimizer
+from tensorflow.contrib.optimizer_v2.momentum import MomentumOptimizer
+from tensorflow.contrib.optimizer_v2.optimizer_v2 import OptimizerV2
+from tensorflow.contrib.optimizer_v2.rmsprop import RMSPropOptimizer
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'AdadeltaOptimizer',
+    'AdagradOptimizer',
+    'AdamOptimizer',
+    'GradientDescentOptimizer',
+    'MomentumOptimizer',
+    'OptimizerV2',
+    'RMSPropOptimizer',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8599af32f6f4cc5529cd812e83c02ef3812cb71e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -0,0 +1,294 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for OptimizerV2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import gradient_descent
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class OptimizerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      # Note that for eager execution, minimize expects a function instead of a
+      # Tensor.
+      global_step = resource_variable_ops.ResourceVariable(
+          array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Run 1 step of sgd through optimizer
+      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
+      self.evaluate(opt_op)
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  def testAggregationMethod(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+        opt_op = sgd_op.minimize(
+            cost,
+            global_step, [var0, var1],
+            aggregation_method=gradients_impl.AggregationMethod.
+            EXPERIMENTAL_ACCUMULATE_N)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([-14., -13.], var0.eval())
+        self.assertAllClose([-6., -5.], var1.eval())
+
+  def testPrecomputedGradient(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        grad_loss = constant_op.constant([42, -42], dtype=dtype)
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+        opt_op = sgd_op.minimize(
+            cost, global_step, [var0, var1], grad_loss=grad_loss)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
+                            var0.eval())
+        self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
+                            var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoVariables(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, trainable=False, name='a')
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, trainable=False, name='b')
+        return 5 * var0 + var1
+      # pylint: enable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No.*variables'):
+        sgd_op.minimize(loss)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b%d' % i)
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        return 5 * var0
+      # pylint: enable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No gradients'):
+        # var1 has no gradient
+        sgd_op.minimize(loss, var_list=[var1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradientsForAnyVariables_Minimize(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      def loss():
+        return constant_op.constant(5.0)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.minimize(loss, var_list=[var0, var1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradientsForAnyVariables_ApplyGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.apply_gradients([(None, var0), (None, var1)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientsAsVariables(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b%d' % i)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
+      # Convert gradients to tf.Variables
+      converted_grads = [
+          resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
+                                                 name='c_%d_%d' % (i, j))
+          for j, gv in enumerate(grads_and_vars)
+      ]
+      convert_ops = [
+          state_ops.assign(converted_grads[j], gv[0])
+          for j, gv in enumerate(grads_and_vars)
+      ]
+
+      self.evaluate(variables.global_variables_initializer())
+      # Run convert_ops to achieve the gradietns converting
+      self.evaluate(convert_ops)
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Run 1 step of sgd through optimizer
+      converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
+      opt_op = sgd_op.apply_gradients(converted_grads_and_vars)
+      self.evaluate(opt_op)
+
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testComputeGradientsWithTensors(self):
+    x = ops.convert_to_tensor(1.0)
+    def f():
+      return x * x
+    sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+    grads_and_vars = sgd_op.compute_gradients(f, [x])
+    self.assertEqual(1, len(grads_and_vars))
+    grad, x_as_var = grads_and_vars[0]
+    self.assertIs(x, x_as_var)
+    self.assertEqual(2.0, self.evaluate(grad))
+
+    with self.assertRaises(NotImplementedError):
+      sgd_op.apply_gradients(grads_and_vars)
+
+  def testTrainOp(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0])
+      var1 = variables.Variable([3.0, 4.0])
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
+
+  def testConstraint(self):
+    constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
+    constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0],
+                                constraint=constraint_01)
+      var1 = variables.Variable([3.0, 4.0],
+                                constraint=constraint_0)
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Run 1 step of sgd through optimizer
+      opt_op.run()
+      # Validate updated params
+      self.assertAllClose([-0.1, -0.1], var0.eval())
+      self.assertAllClose([0., 0.], var1.eval())
+
+  def testStopGradients(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0], name='var0')
+      var1 = variables.Variable([3.0, 4.0], name='var1')
+      var0_id = array_ops.identity(var0)
+      cost = 5 * var0_id + 3 * var1
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      grads_and_vars = sgd_op.compute_gradients(cost, [var0, var1],
+                                                stop_gradients=[var0_id])
+      grad_dict = {var.op.name: grad for grad, var in grads_and_vars}
+      self.assertIsNone(grad_dict['var0'])
+      self.assertIsNotNone(grad_dict['var1'])
+
+  def testDoNotOverrideCreateSlots(self):
+    class ShouldNotOverrideCreateSlots(optimizer_v2.OptimizerV2):
+
+      def _create_slots(self, var_list):
+        """In OptimizerV2 _create_slots was renamed _create_vars."""
+        return var_list
+
+    with self.assertRaises(RuntimeError):
+      ShouldNotOverrideCreateSlots(True, 'name')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..164ff0ea0670bd07d19fa642e2e3cde1ab84612a
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -0,0 +1,233 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop optimizer for Tensorflow.
+
+rmsprop algorithm [tieleman2012rmsprop]
+
+A detailed description of rmsprop.
+
+- maintain a moving (discounted) average of the square of gradients
+- divide gradient by the root of this average
+
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+delta = - mom
+
+This implementation of RMSProp uses plain momentum, not Nesterov momentum.
+
+The centered version additionally maintains a moving (discounted) average of the
+gradients, and uses that average to estimate the variance:
+
+mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t /
+    sqrt(mean_square - mean_grad**2 + epsilon)
+delta = - mom
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+
+from tensorflow.python.training import training_ops
+
+
+class RMSPropOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the RMSProp algorithm.
+
+  See the
+  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  """
+
+  def __init__(self,
+               learning_rate,
+               decay=0.9,
+               momentum=0.0,
+               epsilon=1e-10,
+               use_locking=False,
+               centered=False,
+               name="RMSProp"):
+    """Construct a new RMSProp optimizer.
+
+    Note that in the dense implementation of this algorithm, variables and their
+    corresponding accumulators (momentum, gradient moving average, square
+    gradient moving average) will be updated even if the gradient is zero
+    (i.e. accumulators will decay, momentum will be applied). The sparse
+    implementation (used when the gradient is an `IndexedSlices` object,
+    typically because of `tf.gather` or an embedding lookup in the forward pass)
+    will not update variable slices or their accumulators unless those slices
+    were used in the forward pass (nor is there an "eventual" correction to
+    account for these omitted updates). This leads to more efficient updates for
+    large embedding lookup tables (where most of the slices are not accessed in
+    a particular graph execution), but differs from the published algorithm.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      decay: A float hyperparameter. Discounting factor for the history/coming
+        gradient.
+      momentum: A float hyperparameter.
+      epsilon: A float hyperparameter. Small value to avoid zero denominator.
+      use_locking: If True use locks for update operation.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSProp".
+    """
+    super(RMSPropOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", decay)
+    self._set_hyper("momentum", momentum)
+    self._set_hyper("epsilon", epsilon)
+
+    self._centered = centered
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      if v.get_shape().is_fully_defined():
+        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
+      else:
+        init_rms = array_ops.ones_like(v)
+      state.create_slot_with_initializer(v, init_rms, v.get_shape(),
+                                         v.dtype.base_dtype, "rms")
+      if self._centered:
+        state.zeros_slot(v, "mg")
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+    else:
+      return training_ops.apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.sparse_apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.sparse_apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          indices,
+          use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed68f6afbf8bf9678649c1ce6fc59c3b91026dc0
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,449 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import rmsprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, decay, momentum, epsilon, centered, use_resource
+    [0.5, 0.9, 0.0, 1e-3, True, False],
+    [0.5, 0.9, 0.0, 1e-3, False, False],
+    [0.5, 0.9, 0.0, 1e-3, True, True],
+    [0.5, 0.9, 0.0, 1e-3, False, True],
+    [0.1, 0.9, 0.0, 1e-3, True, False],
+    [0.5, 0.95, 0.0, 1e-3, False, False],
+    [0.5, 0.95, 0.0, 1e-5, True, False],
+    [0.5, 0.95, 0.9, 1e-5, True, False],
+]
+
+_TESTPARAMS = [
+    [data_type] + values
+    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
+]
+
+
+class RMSPropOptimizerTest(test.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
+                            epsilon, centered):
+    rms_t = rms * decay + (1 - decay) * g * g
+    denom_t = rms_t + epsilon
+    if centered:
+      mg_t = mg * decay + (1 - decay) * g
+      denom_t -= mg_t * mg_t
+    else:
+      mg_t = mg
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, decay, momentum, epsilon, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
+      denom_t = rms_t[gindex] + epsilon
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  def testDense(self):
+    # TODO(yori): Use ParameterizedTest when available
+    for (dtype, learning_rate, decay, momentum,
+         epsilon, centered, use_resource) in _TESTPARAMS:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=learning_rate,
+            decay=decay,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        mg0 = opt.get_slot(var0, "mg")
+        self.assertEqual(mg0 is not None, centered)
+        mg1 = opt.get_slot(var1, "mg")
+        self.assertEqual(mg1 is not None, centered)
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 4 steps of RMSProp
+        for _ in range(1, 5):
+          update.run()
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
+              decay, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
+              decay, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSPropOptimizer(
+            learning_rate=1.0,
+            decay=0.0,
+            momentum=0.0,
+            epsilon=0.0,
+            centered=False).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0., 1.]], var0.eval(), atol=0.01)
+
+  def testMinimizeSparseResourceVariableCentered(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSPropOptimizer(
+            learning_rate=1.0,
+            decay=0.0,
+            momentum=0.0,
+            epsilon=1.0,
+            centered=True).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval(), atol=0.01)
+
+  def testSparse(self):
+    # TODO(yori): Use ParameterizedTest when available
+    for (dtype, learning_rate, decay,
+         momentum, epsilon, centered, _) in _TESTPARAMS:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+        grads1_np_indices = np.array([1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=learning_rate,
+            decay=decay,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        mg0 = opt.get_slot(var0, "mg")
+        self.assertEqual(mg0 is not None, centered)
+        mg1 = opt.get_slot(var1, "mg")
+        self.assertEqual(mg1 is not None, centered)
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 4 steps of RMSProp
+        for _ in range(1, 5):
+          update.run()
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+              learning_rate, decay, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+              learning_rate, decay, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testWithoutMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session(use_gpu=True):
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the rms accumulators where 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        update.run()
+        # Check the root mean square accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901, 0.901]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001, 0.90001]), rms1.eval())
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+            ]), var1.eval())
+        # Step 2: the root mean square accumulators contain the previous update.
+        update.run()
+        # Check the rms accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+            ]), var1.eval())
+
+  def testWithMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session(use_gpu=True):
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: rms = 1, mom = 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        update.run()
+        # Check the root mean square accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901, 0.901]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001, 0.90001]), rms1.eval())
+        # Check the momentum accumulators
+        self.assertAllCloseAccordingToType(
+            np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+
+        # Check that the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+            ]), var1.eval())
+
+        # Step 2: the root mean square accumulators contain the previous update.
+        update.run()
+        # Check the rms accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
+                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+            ]), mom0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
+                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+            ]), mom1.eval())
+
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+            ]), var0.eval())
+
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index bd9078ae76ee27ec26c09d1aa2012f871cbdf5e9..6ca7fe8b6e59b0dc24be76262d4f54f387e53e48 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -94,18 +94,6 @@ py_test(
 #     srcs_version = "PY2AND3",
 # )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index a80f060b91df3b6d5e2ca9ff63c721382f0cbb0a..36e21af618f5af744ce793509813eaf36e1b8479 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -8,18 +8,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "predictor",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
index 04b5d5bdf158dc6a478d7a24b538c75d1dca8d45..6e77e934fe19851eea9ed0b74eb7aecc76f6237a 100644
--- a/tensorflow/contrib/predictor/predictor_factories.py
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -53,7 +53,7 @@ def from_contrib_estimator(estimator,
       `Estimator`.
   """
   if isinstance(estimator, core_estimator.Estimator):
-    raise TypeError('Espected estimator to be of type '
+    raise TypeError('Expected estimator to be of type '
                     'tf.contrib.learn.Estimator, but got type '
                     'tf.python.estimator.Estimator. You likely want to call '
                     'from_estimator.')
@@ -88,7 +88,7 @@ def from_estimator(estimator,
       `Estimator`.
   """
   if isinstance(estimator, contrib_estimator.Estimator):
-    raise TypeError('Espected estimator to be of type '
+    raise TypeError('Expected estimator to be of type '
                     'tf.python.estimator.Estimator, but got type '
                     'tf.contrib.learn.Estimator. You likely want to call '
                     'from_contrib_estimator.')
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3e9b1a0b8d8ec7c3c5fe5d1f2cf896dbb6c3de72
--- /dev/null
+++ b/tensorflow/contrib/proto/BUILD
@@ -0,0 +1,32 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+py_library(
+    name = "proto",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = [
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+    ],
+)
+
+py_library(
+    name = "proto_pip",
+    data = [
+        "//tensorflow/contrib/proto/python/kernel_tests:test_messages",
+    ] + if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":proto",
+        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
+    ],
+)
diff --git a/tensorflow/contrib/bayesflow/python/ops/halton_sequence.py b/tensorflow/contrib/proto/__init__.py
similarity index 68%
rename from tensorflow/contrib/bayesflow/python/ops/halton_sequence.py
rename to tensorflow/contrib/proto/__init__.py
index 49d747d538f5a4aa3134d28ba00a651cb509fa41..bc5a49de78e251cb4a854fc11a7b13b39820127d 100644
--- a/tensorflow/contrib/bayesflow/python/ops/halton_sequence.py
+++ b/tensorflow/contrib/proto/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,22 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Support for low discrepancy Halton sequences.
+"""Ops and modules related to proto.
 
+@@decode_proto
+@@encode_proto
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.halton_sequence_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'sample',
-]
+from tensorflow.contrib.proto.python.ops.decode_proto_op import decode_proto
+from tensorflow.contrib.proto.python.ops.encode_proto_op import encode_proto
 
-remove_undocumented(__name__, _allowed_symbols)
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a380a131f86abc8dd921a123afdb964bf6c2466c
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -0,0 +1,86 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Much of the work in this BUILD file actually happens in the corresponding
+# build_defs.bzl, which creates an individual testcase for each example .pbtxt
+# file in this directory.
+#
+load(":build_defs.bzl", "decode_proto_test_suite")
+load(":build_defs.bzl", "encode_proto_test_suite")
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :decode_proto_op_tests.
+decode_proto_test_suite(
+    name = "decode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :encode_proto_op_tests.
+encode_proto_test_suite(
+    name = "encode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# Below here are tests that are not tied to an example text proto.
+filegroup(
+    name = "test_messages",
+    srcs = glob(["*.pbtxt"]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_py_test(
+    name = "decode_proto_fail_test",
+    size = "small",
+    srcs = ["decode_proto_fail_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/proto:proto",
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
+
+py_library(
+    name = "test_case",
+    srcs = ["test_case.py"],
+    deps = ["//tensorflow/python:client_testlib"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [
+        ":test_case",
+        ":test_example_proto_py",
+    ],
+)
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f425601691e21b36914f340d53ccadf9b4e3641f
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
@@ -0,0 +1,89 @@
+"""BUILD rules for generating file-driven proto test cases.
+
+The decode_proto_test_suite() and encode_proto_test_suite() rules take a list
+of text protos and generates a tf_py_test() for each one.
+"""
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "register_extension_info")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+def _test_name(test, path):
+  return "%s_%s_test" % (test, path.split("/")[-1].split(".")[0])
+
+def decode_proto_test_suite(name, examples):
+  """Build the decode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("decode_proto", test_filename),
+        srcs = ["decode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "decode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("decode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+def encode_proto_test_suite(name, examples):
+  """Build the encode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("encode_proto", test_filename),
+        srcs = ["encode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "encode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+            "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("encode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+register_extension_info(
+    extension_name = "decode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:decode_example_.*",
+    })
+
+register_extension_info(
+    extension_name = "encode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:encode_example_.*",
+    })
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5298342ee79b08a50b13ce8715e891a332efb3bc
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -0,0 +1,68 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class DecodeProtoFailTest(test_case.ProtoOpTestCase):
+  """Test failure cases for DecodeToProto."""
+
+  def _TestCorruptProtobuf(self, sanitize):
+    """Test failure cases for DecodeToProto."""
+
+    # The goal here is to check the error reporting.
+    # Testing against a variety of corrupt protobufs is
+    # done by fuzzing.
+    corrupt_proto = 'This is not a binary protobuf'
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(corrupt_proto, dtype=object)
+    msg_type = 'tensorflow.contrib.proto.TestCase'
+    field_names = ['sizes']
+    field_types = [dtypes.int32]
+
+    with self.test_session() as sess:
+      ctensor, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=msg_type,
+          field_names=field_names,
+          output_types=field_types,
+          sanitize=sanitize)
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   'Unable to parse binary protobuf'
+                                   '|Failed to consume entire buffer'):
+        _ = sess.run([ctensor] + vtensor)
+
+  def testCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=False)
+
+  def testSanitizerCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1c13c82bc264bc8bcc721eb68ee3916f32ef7a8
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -0,0 +1,300 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for decode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+"""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class DecodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def _compareValues(self, fd, vs, evs):
+    """Compare lists/arrays of field values."""
+
+    if len(vs) != len(evs):
+      self.fail('Field %s decoded %d outputs, expected %d' %
+                (fd.name, len(vs), len(evs)))
+    for i, ev in enumerate(evs):
+      # Special case fuzzy match for float32. TensorFlow seems to mess with
+      # MAX_FLT slightly and the test doesn't work otherwise.
+      # TODO(nix): ask on TF list about why MAX_FLT doesn't pass through.
+      if fd.cpp_type == fd.CPPTYPE_FLOAT:
+        # Numpy isclose() is better than assertIsClose() which uses an absolute
+        # value comparison.
+        self.assertTrue(
+            np.isclose(vs[i], ev), 'expected %r, actual %r' % (ev, vs[i]))
+      elif fd.cpp_type == fd.CPPTYPE_STRING:
+        # In Python3 string tensor values will be represented as bytes, so we
+        # reencode the proto values to match that.
+        self.assertEqual(vs[i], ev.encode('ascii'))
+      else:
+        # Doubles and other types pass through unscathed.
+        self.assertEqual(vs[i], ev)
+
+  def _compareRepeatedPrimitiveValue(self, batch_shape, sizes, fields,
+                                     field_dict):
+    """Compare protos of type RepeatedPrimitiveValue.
+
+    Args:
+      batch_shape: the shape of the input tensor of serialized messages.
+      sizes: int matrix of repeat counts returned by decode_proto
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      field_dict: map from field names to decoded numpy tensors of values
+    """
+
+    # Check that expected values match.
+    for field in fields:
+      values = field_dict[field.name]
+      self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
+
+      fd = field.expected.DESCRIPTOR.fields_by_name[field.name]
+
+      # Values has the same shape as the input plus an extra
+      # dimension for repeats.
+      self.assertEqual(list(values.shape)[:-1], batch_shape)
+
+      # Nested messages are represented as TF strings, requiring
+      # some special handling.
+      if field.name == 'message_value':
+        vs = []
+        for buf in values.flat:
+          msg = test_example_pb2.PrimitiveValue()
+          msg.ParseFromString(buf)
+          vs.append(msg)
+        evs = getattr(field.expected, field.name)
+        if len(vs) != len(evs):
+          self.fail('Field %s decoded %d outputs, expected %d' %
+                    (fd.name, len(vs), len(evs)))
+        for v, ev in zip(vs, evs):
+          self.assertEqual(v, ev)
+        continue
+
+      # This can be a little confusing. For testing we are using
+      # RepeatedPrimitiveValue in two ways: it's the proto that we
+      # decode for testing, and it's used in the expected value as a
+      # union type. The two cases are slightly different: this is the
+      # second case.
+      # We may be fetching the uint64_value from the test proto, but
+      # in the expected proto we store it in the int64_value field
+      # because TensorFlow doesn't support unsigned int64.
+      tf_type_to_primitive_value_field = {
+          dtypes.float32:
+              'float_value',
+          dtypes.float64:
+              'double_value',
+          dtypes.int32:
+              'int32_value',
+          dtypes.uint8:
+              'uint8_value',
+          dtypes.int8:
+              'int8_value',
+          dtypes.string:
+              'string_value',
+          dtypes.int64:
+              'int64_value',
+          dtypes.bool:
+              'bool_value',
+          # Unhandled TensorFlow types:
+          # DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+          # DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+      }
+      tf_field_name = tf_type_to_primitive_value_field.get(field.dtype)
+      if tf_field_name is None:
+        self.fail('Unhandled tensorflow type %d' % field.dtype)
+
+      self._compareValues(fd, values.flat,
+                          getattr(field.expected, tf_field_name))
+
+  def _runDecodeProtoTests(self, fields, case_sizes, batch_shape, batch,
+                           message_type, message_format, sanitize,
+                           force_disordered=False):
+    """Run decode tests on a batch of messages.
+
+    Args:
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      case_sizes: expected sizes array
+      batch_shape: the shape of the input tensor of serialized messages
+      batch: list of serialized messages
+      message_type: descriptor name for messages
+      message_format: format of messages, 'text' or 'binary'
+      sanitize: whether to sanitize binary protobuf inputs
+      force_disordered: whether to force fields encoded out of order.
+    """
+
+    if force_disordered:
+      # Exercise code path that handles out-of-order fields by prepending extra
+      # fields with tag numbers higher than any real field. Note that this won't
+      # work with sanitization because that forces reserialization using a
+      # trusted decoder and encoder.
+      assert not sanitize
+      extra_fields = test_example_pb2.ExtraFields()
+      extra_fields.string_value = 'IGNORE ME'
+      extra_fields.bool_value = False
+      extra_msg = extra_fields.SerializeToString()
+      batch = [extra_msg + msg for msg in batch]
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(batch, dtype=object)
+    batch = np.reshape(batch, batch_shape)
+
+    field_names = [f.name for f in fields]
+    output_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=output_types,
+          message_format=message_format,
+          sanitize=sanitize)
+
+      vlist = sess.run([sizes] + vtensor)
+      sizes = vlist[0]
+      # Values is a list of tensors, one for each field.
+      value_tensors = vlist[1:]
+
+      # Check that the repeat sizes are correct.
+      self.assertTrue(
+          np.all(np.array(sizes.shape) == batch_shape + [len(field_names)]))
+
+      # Check that the decoded sizes match the expected sizes.
+      self.assertEqual(len(sizes.flat), len(case_sizes))
+      self.assertTrue(
+          np.all(sizes.flat == np.array(
+              case_sizes, dtype=np.int32)))
+
+      field_dict = dict(zip(field_names, value_tensors))
+
+      self._compareRepeatedPrimitiveValue(batch_shape, sizes, fields,
+                                          field_dict)
+
+  def testBinary(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testBinaryDisordered(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False,
+        force_disordered=True)
+
+  def testPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    packed_batch = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        packed_batch,
+        'tensorflow.contrib.proto.PackedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testText(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Note: float_format='.17g' is necessary to ensure preservation of
+    # doubles and floats in text format.
+    text_batch = [
+        text_format.MessageToString(
+            primitive, float_format='.17g') for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        text_batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'text',
+        sanitize=False)
+
+  def testSanitizerGood(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e316819077c7dbb28beefd4dc260568f26da680
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt
@@ -0,0 +1,94 @@
+primitive {
+  # No fields specified, so we get all defaults
+}
+shape: 1
+sizes: 0
+field {
+  name: "double_default"
+  dtype: DT_DOUBLE
+  expected { double_value: 1.0 }
+}
+sizes: 0
+field {
+  name: "float_default"
+  dtype: DT_DOUBLE  # Try casting the float field to double.
+  expected { double_value: 2.0 }
+}
+sizes: 0
+field {
+  name: "int64_default"
+  dtype: DT_INT64
+  expected { int64_value: 3 }
+}
+sizes: 0
+field {
+  name: "uint64_default"
+  dtype: DT_INT64
+  expected { int64_value: 4 }
+}
+sizes: 0
+field {
+  name: "int32_default"
+  dtype: DT_INT32
+  expected { int32_value: 5 }
+}
+sizes: 0
+field {
+  name: "fixed64_default"
+  dtype: DT_INT64
+  expected { int64_value: 6 }
+}
+sizes: 0
+field {
+  name: "fixed32_default"
+  dtype: DT_INT32
+  expected { int32_value: 7 }
+}
+sizes: 0
+field {
+  name: "bool_default"
+  dtype: DT_BOOL
+  expected { bool_value: true }
+}
+sizes: 0
+field {
+  name: "string_default"
+  dtype: DT_STRING
+  expected { string_value: "a" }
+}
+sizes: 0
+field {
+  name: "bytes_default"
+  dtype: DT_STRING
+  expected { string_value: "a longer default string" }
+}
+sizes: 0
+field {
+  name: "uint32_default"
+  dtype: DT_INT32
+  expected { int32_value: -1 }
+}
+sizes: 0
+field {
+  name: "sfixed32_default"
+  dtype: DT_INT32
+  expected { int32_value: 10 }
+}
+sizes: 0
+field {
+  name: "sfixed64_default"
+  dtype: DT_INT64
+  expected { int64_value: 11 }
+}
+sizes: 0
+field {
+  name: "sint32_default"
+  dtype: DT_INT32
+  expected { int32_value: 12 }
+}
+sizes: 0
+field {
+  name: "sint64_default"
+  dtype: DT_INT64
+  expected { int64_value: 13 }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..30e58e6336dc66830418c7cd2b3111a851d691b6
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -0,0 +1,180 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for encode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+
+It tests that encode_proto is a lossless inverse of decode_proto
+(for the specified fields).
+"""
+# Python3 readiness boilerplate
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class EncodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def testBadInputs(self):
+    # Invalid field name
+    with self.test_session():
+      with self.assertRaisesOpError('Unknown field: non_existent_field'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['non_existent_field']).eval()
+
+    # Incorrect types.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Incompatible type for field double_value.'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval()
+
+    # Incorrect shapes of sizes.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          r'sizes should be batch_size \+ \[len\(field_names\)\]'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values = array_ops.placeholder(dtypes.float64)
+        encode_proto_op.encode_proto(
+            sizes=sizes,
+            values=[values],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval(feed_dict={
+                sizes: [[[0, 0]]],
+                values: [[0.0]]
+            })
+
+    # Inconsistent shapes of values.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Values must match up to the last dimension'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values1 = array_ops.placeholder(dtypes.float64)
+        values2 = array_ops.placeholder(dtypes.int32)
+        (encode_proto_op.encode_proto(
+            sizes=[[1, 1]],
+            values=[values1, values2],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value', 'int32_value']).eval(feed_dict={
+                values1: [[0.0]],
+                values2: [[0], [0]]
+            }))
+
+  def _testRoundtrip(self, in_bufs, message_type, fields):
+
+    field_names = [f.name for f in fields]
+    out_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, field_tensors = decode_proto_op.decode_proto(
+          in_bufs,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=out_types)
+
+      out_tensors = encode_proto_op.encode_proto(
+          sizes,
+          field_tensors,
+          message_type=message_type,
+          field_names=field_names)
+
+      out_bufs, = sess.run([out_tensors])
+
+      # Check that the re-encoded tensor has the same shape.
+      self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+      # Compare the input and output.
+      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+        in_obj = test_example_pb2.RepeatedPrimitiveValue()
+        in_obj.ParseFromString(in_buf)
+
+        out_obj = test_example_pb2.RepeatedPrimitiveValue()
+        out_obj.ParseFromString(out_buf)
+
+        # Check that the deserialized objects are identical.
+        self.assertEqual(in_obj, out_obj)
+
+        # Check that the input and output serialized messages are identical.
+        # If we fail here, there is a difference in the serialized
+        # representation but the new serialization still parses. This could
+        # be harmless (a change in map ordering?) or it could be bad (e.g.
+        # loss of packing in the encoding).
+        self.assertEqual(in_buf, out_buf)
+
+  def testRoundtrip(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    in_bufs = [primitive.SerializeToString() for primitive in case.primitive]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.RepeatedPrimitiveValue', case.field)
+
+  def testRoundtripPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    in_bufs = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.PackedPrimitiveValue', case.field)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b170f89c0f00dd9dffd5785197bb3bfd1ca2cfee
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
@@ -0,0 +1,161 @@
+primitive {
+  double_value: -1.7976931348623158e+308
+  double_value: 2.2250738585072014e-308
+  double_value: 1.7976931348623158e+308
+  float_value: -3.402823466e+38
+  float_value: 1.175494351e-38
+  float_value: 3.402823466e+38
+  int64_value: -9223372036854775808
+  int64_value: 9223372036854775807
+  uint64_value: 0
+  uint64_value: 18446744073709551615
+  int32_value: -2147483648
+  int32_value: 2147483647
+  fixed64_value: 0
+  fixed64_value: 18446744073709551615
+  fixed32_value: 0
+  fixed32_value: 4294967295
+  bool_value: false
+  bool_value: true
+  string_value: ""
+  string_value: "I refer to the infinite."
+  uint32_value: 0
+  uint32_value: 4294967295
+  sfixed32_value: -2147483648
+  sfixed32_value: 2147483647
+  sfixed64_value: -9223372036854775808
+  sfixed64_value: 9223372036854775807
+  sint32_value: -2147483648
+  sint32_value: 2147483647
+  sint64_value: -9223372036854775808
+  sint64_value: 9223372036854775807
+}
+shape: 1
+sizes: 3
+sizes: 3
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: -1.7976931348623158e+308
+    double_value: 2.2250738585072014e-308
+    double_value: 1.7976931348623158e+308
+  }
+}
+field {
+  name: "float_value"
+  dtype: DT_FLOAT
+  expected {
+    float_value: -3.402823466e+38
+    float_value: 1.175494351e-38
+    float_value: 3.402823466e+38
+  }
+}
+field {
+  name: "int64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "uint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1
+  }
+}
+field {
+  name: "int32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "fixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1  # unsigned is 18446744073709551615
+  }
+}
+field {
+  name: "fixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: false
+    bool_value: true
+  }
+}
+field {
+  name: "string_value"
+  dtype: DT_STRING
+  expected {
+    string_value: ""
+    string_value: "I refer to the infinite."
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "sfixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sfixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "sint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c664e52851b5bb3c439544537ce6402fc7cf3362
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
@@ -0,0 +1,16 @@
+primitive {
+  message_value {
+    double_value: 23.5
+  }
+}
+shape: 1
+sizes: 1
+field {
+  name: "message_value"
+  dtype: DT_STRING
+  expected {
+    message_value {
+      double_value: 23.5
+    }
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..125651d7eaa1901e4804712bb807322b02ed5bc6
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
@@ -0,0 +1,20 @@
+primitive {
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 0
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 0.0
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc07efc8f3038c6c540855c97b2254575e517ef3
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -0,0 +1,29 @@
+primitive {
+  fixed32_value: 4294967295
+  uint32_value: 4294967295
+}
+shape: 1
+sizes: 1
+field {
+  name: "fixed32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
+sizes: 1
+field {
+  name: "uint32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
+sizes: 0
+field {
+  name: "uint32_default"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295  # Comes from an explicitly-specified default
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c7ac53f72b0764a0d57241cbdcdd93fcbd9279
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
@@ -0,0 +1,32 @@
+primitive {
+  double_value: 23.5
+  double_value: 123.0
+  bool_value: true
+}
+primitive {
+  double_value: 3.1
+  bool_value: false
+}
+shape: 2
+sizes: 2
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 123.0
+    double_value: 3.1
+    double_value: 0.0
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4828076d52dc5d03a887c4a445dbcf52414c361
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
@@ -0,0 +1,62 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+primitive {
+  double_value: 44.0
+  bool_value: false
+}
+primitive {
+  double_value: 3.14159
+  bool_value: true
+}
+primitive {
+  double_value: 1.414
+  bool_value: true
+}
+primitive {
+  double_value: -32.2
+  bool_value: false
+}
+primitive {
+  double_value: 0.0001
+  bool_value: true
+}
+shape: 3
+shape: 2
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 44.0
+    double_value: 3.14159
+    double_value: 1.414
+    double_value: -32.2
+    double_value: 0.0001
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+    bool_value: true
+    bool_value: true
+    bool_value: false
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc20ac147b0e772f05b4fc614f9f56513aceb1d5
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_case.py b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
new file mode 100644
index 0000000000000000000000000000000000000000..b95202c5df654cfc02339477b242b2c58575a4d5
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
@@ -0,0 +1,35 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Test case base for testing proto operations."""
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+from tensorflow.python.platform import test
+
+
+class ProtoOpTestCase(test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(ProtoOpTestCase, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000000000000000000000000000000000..a2c88e372bf7c6b7f14c5bb55776b66c4c06bcd4
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -0,0 +1,182 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.proto;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+
+  // Optional fields with explicitly-specified defaults.
+  optional double double_default = 20 [default = 1.0];
+  optional float float_default = 21 [default = 2.0];
+  optional int64 int64_default = 22 [default = 3];
+  optional uint64 uint64_default = 23 [default = 4];
+  optional int32 int32_default = 24 [default = 5];
+  optional fixed64 fixed64_default = 25 [default = 6];
+  optional fixed32 fixed32_default = 26 [default = 7];
+  optional bool bool_default = 27 [default = true];
+  optional string string_default = 28 [default = "a"];
+  optional bytes bytes_default = 29 [default = "a longer default string"];
+  optional uint32 uint32_default = 30 [default = 4294967295];
+  optional sfixed32 sfixed32_default = 31 [default = 10];
+  optional sfixed64 sfixed64_default = 32 [default = 11];
+  optional sint32 sint32_default = 33 [default = 12];
+  optional sint64 sint64_default = 34 [default = 13];
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+
+  optional double double_default = 20 [default = 1.0];
+  optional float float_default = 21 [default = 2.0];
+  optional int64 int64_default = 22 [default = 3];
+  optional uint64 uint64_default = 23 [default = 4];
+  optional int32 int32_default = 24 [default = 5];
+  optional fixed64 fixed64_default = 25 [default = 6];
+  optional fixed32 fixed32_default = 26 [default = 7];
+  optional bool bool_default = 27 [default = true];
+  optional string string_default = 28 [default = "a"];
+  optional bytes bytes_default = 29 [default = "a longer default string"];
+  optional uint32 uint32_default = 30 [default = 4294967295];
+  optional sfixed32 sfixed32_default = 31 [default = 10];
+  optional sfixed64 sfixed64_default = 32 [default = 11];
+  optional sint32 sint32_default = 33 [default = 12];
+  optional sint64 sint64_default = 34 [default = 13];
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/contrib/proto/python/ops/BUILD b/tensorflow/contrib/proto/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f17065477e1e14d24a16338b1a11d98da44639fe
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+)
+
+py_library(
+    name = "decode_proto_op_py",
+    srcs = ["decode_proto_op.py"],
+    deps = [
+        ":gen_decode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_decode_proto_op_py",
+    out = "gen_decode_proto_op.py",
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "encode_proto_op_py",
+    srcs = ["encode_proto_op.py"],
+    deps = [
+        ":gen_encode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_encode_proto_op_py",
+    out = "gen_encode_proto_op.py",
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/ops/decode_proto_op.py b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc000ebe49724e7571ade500eb29de25be89485
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer decoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_decode_proto_op import decode_proto_v2 as decode_proto
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("DecodeProtoV2")
diff --git a/tensorflow/contrib/proto/python/ops/encode_proto_op.py b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac12198b2e462b73238778e91c1e9b6be156182c
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer encoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_encode_proto_op import encode_proto
+from tensorflow.python.framework import ops
+
+ops.NotDifferentiable("EncodeProto")
diff --git a/tensorflow/contrib/py2tf/README.md b/tensorflow/contrib/py2tf/README.md
deleted file mode 100644
index cd50675ad57316b9c749c137e6acd30b91c10073..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Py2TF
-
-A compiler for generating TensorFlow numeric and control flow ops from Python
-code.
diff --git a/tensorflow/contrib/py2tf/converters/break_statements.py b/tensorflow/contrib/py2tf/converters/break_statements.py
deleted file mode 100644
index bfb709c5e32c6f19dc0fd109df61ece925d701a3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/converters/break_statements.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Canonicalizes break statements by de-sugaring into a control boolean."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
-
-
-class BreakCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes continue statements into additional conditionals."""
-
-  def __init__(self, context):
-    super(BreakCanonicalizationTransformer, self).__init__(context)
-    # This is a stack structure, to correctly process nested loops.
-    self.break_uses = []
-
-  def _create_break_check(self):
-    template = """
-      (not var_name)
-    """
-    expr, = templates.replace(template, var_name=self.break_uses[-1][1])
-    return expr.value
-
-  def _create_break_trigger(self):
-    template = """
-      var_name = True
-    """
-    block = templates.replace(template, var_name=self.break_uses[-1][1])
-    block.append(gast.Continue())
-    return block
-
-  def _create_break_init(self):
-    template = """
-      var_name = False
-    """
-    assign, = templates.replace(template, var_name=self.break_uses[-1][1])
-    return assign
-
-  # TODO(mdan): Surely the transformer supports this better?
-  def _manual_visit_list(self, block):
-    new_block = []
-    for n in block:
-      new_n = self.visit(n)
-      if isinstance(new_n, list):
-        new_block.extend(new_n)
-      else:
-        new_block.append(new_n)
-    return new_block
-
-  def visit_While(self, node):
-    self.generic_visit(node.test)
-    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-
-    break_var = self.context.namer.new_symbol('break_requested',
-                                              scope.referenced)
-    self.break_uses.append([False, break_var])
-    node.body = self._manual_visit_list(node.body)
-    if self.break_uses[-1][0]:
-      node.test = gast.BoolOp(gast.And(), [
-          node.test,
-          gast.UnaryOp(gast.Not(), gast.Name(break_var, gast.Load(), None))
-      ])
-      final_nodes = [self._create_break_init(), node]
-    else:
-      final_nodes = node
-    self.break_uses.pop()
-
-    for n in node.orelse:
-      self.generic_visit(n)
-    return final_nodes
-
-  def visit_For(self, node):
-    self.generic_visit(node.target)
-    self.generic_visit(node.iter)
-    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-
-    break_var = self.context.namer.new_symbol('break_requested',
-                                              scope.referenced)
-    self.break_uses.append([False, break_var])
-    node.body = self._manual_visit_list(node.body)
-    if self.break_uses[-1][0]:
-      anno.setanno(node, 'extra_cond',
-                   gast.UnaryOp(gast.Not(),
-                                gast.Name(break_var, gast.Load(), None)))
-      final_nodes = [self._create_break_init(), node]
-    else:
-      final_nodes = node
-    self.break_uses.pop()
-
-    for n in node.orelse:
-      self.generic_visit(n)
-    return final_nodes
-
-  def visit_Break(self, node):
-    self.break_uses[-1][0] = True
-    return self._create_break_trigger()
-
-
-def transform(node, context):
-  return BreakCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/break_statements_test.py b/tensorflow/contrib/py2tf/converters/break_statements_test.py
deleted file mode 100644
index 095fcdff07d44ecc6b9bb7f8d3e2c7c43df72a02..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/converters/break_statements_test.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for break_statements module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.py2tf.converters import break_statements
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.python.platform import test
-
-
-class BreakCanonicalizationTest(converter_test_base.TestCase):
-
-  def test_basic_break(self):
-
-    def test_fn(x):
-      v = []
-      while x > 0:
-        x -= 1
-        if x % 2 == 0:
-          break
-        v.append(x)
-      return v
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = break_statements.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      self.assertEqual(test_fn(0), result.test_fn(0))
-      self.assertEqual(test_fn(1), result.test_fn(1))
-      self.assertEqual(test_fn(2), result.test_fn(2))
-      self.assertEqual(test_fn(3), result.test_fn(3))
-      self.assertEqual(test_fn(4), result.test_fn(4))
-
-  def test_basic_break_for_loop(self):
-
-    def test_fn(a):
-      v = []
-      for x in a:
-        x -= 1
-        if x % 2 == 0:
-          break
-        v.append(x)
-      return v
-
-    # The break is incompletely canonicalized for for loops. Everything is
-    # in place except for the condition verification.
-    def test_equiv_fn(a):
-      v = []
-      for x in a:
-        x -= 1
-        if x % 2 == 0:
-          continue
-        v.append(x)
-      return v
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = break_statements.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      # The break is incompletely canonicalized. Everything is in place, but
-      # the loop does not break.
-      self.assertEqual(test_equiv_fn([]), result.test_fn([]))
-      self.assertEqual(test_equiv_fn([1]), result.test_fn([1]))
-      self.assertEqual(test_equiv_fn([2]), result.test_fn([2]))
-      self.assertEqual(
-          test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4]))
-
-  def test_continue_deeply_nested(self):
-
-    def test_fn(x):
-      v = []
-      u = []
-      w = []
-      while x > 0:
-        x -= 1
-        if x % 2 == 0:
-          if x % 3 != 0:
-            u.append(x)
-          else:
-            w.append(x)
-            continue
-        v.append(x)
-      return v, u, w
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = break_statements.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      self.assertEqual(test_fn(0), result.test_fn(0))
-      self.assertEqual(test_fn(1), result.test_fn(1))
-      self.assertEqual(test_fn(2), result.test_fn(2))
-      self.assertEqual(test_fn(3), result.test_fn(3))
-      self.assertEqual(test_fn(4), result.test_fn(4))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py
deleted file mode 100644
index 402fa0dda28e696f70d0354ca4abf3a6c83506d9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/converters/decorators_test.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for decorators module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import textwrap
-
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import decorators
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.python.platform import test
-from tensorflow.python.util import tf_inspect
-
-
-class DecoratorsTest(converter_test_base.TestCase):
-
-  def test_function_decorator(self):
-
-    def function_decorator():
-
-      def decorator(f):
-        return lambda a: f(a) + 1
-
-      return decorator
-
-    # The Python parser does capture decorators into the AST.
-    # However, the interpreter desugars them on load, and refering to the
-    # decorated function at runtime usually loses any trace of the decorator.
-    # Below is an example when that doesn't happen.
-    def static_wrapper():
-
-      @function_decorator()
-      def test_fn(a):  # pylint:disable=unused-variable
-        return a
-
-    node = self.parse_and_analyze(static_wrapper,
-                                  {'function_decorator': function_decorator})
-    node = node.body[0].body[0]
-
-    node = decorators.transform(node, remove_decorators=())
-    # Since the decorator is not removed, we need to include its source
-    # code. We cannot do it after the fact because decorators are executed
-    # on load.
-    result, _ = compiler.ast_to_object(
-        node,
-        source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator)))
-    self.assertEqual(2, result.test_fn(1))
-
-    node = decorators.transform(node, remove_decorators=(function_decorator,))
-    with self.compiled(node) as result:
-      self.assertEqual(1, result.test_fn(1))
-
-  def test_simple_decorator(self):
-
-    def simple_decorator(f):
-      return lambda a: f(a) + 1
-
-    # The Python parser does capture decorators into the AST.
-    # However, the interpreter desugars them upon load, and refering to the
-    # decorated function at runtime usually loses any trace of the decorator.
-    # Below is an example when that doesn't happen.
-    def static_wrapper():
-
-      @simple_decorator
-      def test_fn(a):  # pylint:disable=unused-variable
-        return a
-
-    node = self.parse_and_analyze(static_wrapper,
-                                  {'simple_decorator': simple_decorator})
-    node = node.body[0].body[0]
-
-    node = decorators.transform(node, remove_decorators=())
-    # Since the decorator is not removed, we need to include its source
-    # code. We cannot do it after the fact because decorators are executed
-    # on load.
-    result, _ = compiler.ast_to_object(
-        node,
-        source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator)))
-    self.assertEqual(2, result.test_fn(1))
-
-    node = decorators.transform(node, remove_decorators=(simple_decorator,))
-    with self.compiled(node) as result:
-      self.assertEqual(1, result.test_fn(1))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/for_loops.py b/tensorflow/contrib/py2tf/converters/for_loops.py
deleted file mode 100644
index 935dade0ed30975dd29c8ffe5be875993936d241..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/converters/for_loops.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Canonicalizes for loops into while loops.
-
-This canonicalizer uses the len function on its argument. That should be
-converted to a tf.shape separately.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
-
-
-class ForLoopCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes for loops (e.g. into while loops)."""
-
-  def __init__(self, context):
-    super(ForLoopCanonicalizationTransformer, self).__init__(context)
-
-  def visit_For(self, node):
-    self.generic_visit(node)
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-
-    if anno.hasanno(node, 'extra_cond'):
-      template = """
-        i = 0
-        n = len(loop_iter)
-        while i < n and extra_cond:
-          # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
-          target = loop_iter[i]
-          body
-          i += 1
-      """
-      return templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=self.context.namer.new_symbol('i', body_scope.referenced),
-          n=self.context.namer.new_symbol('n', body_scope.referenced),
-          extra_cond=anno.getanno(node, 'extra_cond'))
-    else:
-      template = """
-        i = 0
-        n = len(loop_iter)
-        while i < n:
-          # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
-          target = loop_iter[i]
-          body  # pylint:disable=pointless-statement
-          i += 1
-      """
-      repl = templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=self.context.namer.new_symbol('i', body_scope.referenced),
-          n=self.context.namer.new_symbol('n', body_scope.referenced))
-      return repl
-
-  def visit_Continue(self, node):
-    assert False, 'continue statement should be desugared at this point'
-
-  def visit_Break(self, node):
-    assert False, 'break statement should be desugared at this point'
-
-
-def transform(node, context):
-  return ForLoopCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions.py b/tensorflow/contrib/py2tf/converters/logical_expressions.py
deleted file mode 100644
index df980d41c9c57e325bee9a1fa870d9c95f46ea41..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/converters/logical_expressions.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Converter for logical expressions.
-
-e.g. `a and b -> tf.logical_and(a, b)`. This is not done automatically in TF.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.pyct import parser
-
-
-class LogicalExpressionTransformer(gast.NodeTransformer):
-  """Converts logical expressions to corresponding TF calls."""
-
-  def __init__(self):
-    # TODO(mdan): Look into replacing with bitwise operators instead.
-    self.op_mapping = {
-        gast.And: 'tf.logical_and',
-        gast.Or: 'tf.logical_or',
-        gast.Not: 'tf.logical_not',
-        gast.Eq: 'tf.equal',
-    }
-
-  def visit_Compare(self, node):
-    node = self.generic_visit(node)
-    if len(node.ops) > 1:
-      raise NotImplementedError()
-    cmp_type = type(node.ops[0])
-    if cmp_type in self.op_mapping:
-      tf_function = parser.parse_str(self.op_mapping[cmp_type]).body[0].value
-      return gast.Call(
-          func=tf_function, args=[node.left, node.comparators[0]], keywords=[])
-    return node
-
-  def visit_UnaryOp(self, node):
-    node = self.generic_visit(node)
-    if isinstance(node.op, gast.Not):
-      tf_function = parser.parse_str(self.op_mapping[type(
-          node.op)]).body[0].value
-      node = gast.Call(func=tf_function, args=[node.operand], keywords=[])
-    return node
-
-  def visit_BoolOp(self, node):
-    # TODO(mdan): A normalizer may be useful here. Use ANF?
-    node = self.generic_visit(node)
-    tf_function = parser.parse_str(self.op_mapping[type(node.op)]).body[0].value
-    left = node.values[0]
-    for i in range(1, len(node.values)):
-      left = gast.Call(
-          func=tf_function, args=[left, node.values[i]], keywords=[])
-    return left
-
-
-def transform(node):
-  transformer = LogicalExpressionTransformer()
-  node = transformer.visit(node)
-  return node
diff --git a/tensorflow/contrib/py2tf/impl/api.py b/tensorflow/contrib/py2tf/impl/api.py
deleted file mode 100644
index 29d2e038a73c8cac89c121ec65c32f0d4f68aff6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/impl/api.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Public API."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import wraps
-
-import gast
-import six
-
-from tensorflow.contrib.py2tf.impl import config
-from tensorflow.contrib.py2tf.impl import conversion
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import tf_inspect
-
-# TODO(mdan): Properly document the type hints.
-# TODO(mdan): Reduce the type hint information to (module, type).
-# (currently we require (module + class name, type))
-
-
-def graph_ready(f):
-  """No-op decorator that explicitly marks a function as graph-ready.
-
-  Graph-ready functions are assumed to not need any conversion.
-
-  Args:
-    f: Any callable.
-  Returns:
-    f itself.
-  """
-  setattr(f, '__pyct_is_compile_decorator', True)
-  return f
-
-
-def convert_inline(f, *args, **kwargs):
-  """Shorthand to convert and call a function.
-
-  For example, the following two statements are equivalent:
-
-      @convert()
-      def foo():
-        ...
-      foo(bar)
-
-      def foo():
-        ...
-      convert_inline(foo, bar)
-
-  Args:
-    f: Function to convert. Only this call will be converted.
-    *args: Passed through to f.
-    **kwargs: Passed through to f, with the following exceptions:
-        * arg_value_hints: A dict mapping parameter names to objects that can
-            hint at the type of those parameters.
-
-  Returns:
-    The result of the converted f applied to args and kwargs.
-  """
-  if 'arg_value_hints' in kwargs:
-    arg_value_hints = kwargs['arg_value_hints']
-    del kwargs['arg_value_hints']
-  else:
-    arg_value_hints = None
-  if tf_inspect.ismethod(f):
-    # When converting methods, the result is still an unbound function.
-    args = (f.__self__,) + args
-  return convert(arg_value_hints)(f)(*args, **kwargs)
-
-
-def convert(recursive=False, verbose=False, arg_types=None):
-  """Decorator that compiles a function to graph mode.
-
-  The decorator is dynamic - invoking compilation whenever the decorated
-  function is called. This means the parameter values are known at compilation.
-
-  Args:
-    recursive: Whether to recusrively convert any functions that the decorator
-        function may call.
-    verbose: Whether to output the compiled code in the logs.
-    arg_types: See to_graph.
-
-  Returns:
-    A decorator that compiles the given function to graph mode.
-
-  Raises:
-    ValueError: If any of the arguments are illegal.
-  """
-  if arg_types is None:
-    arg_types = {}
-
-  def decorator(f):
-    """Decorator implementation."""
-
-    @wraps(f)
-    def wrapper(*args, **kwargs):
-      """Wrapper that calls the compiled version of the wrapped function."""
-      partial_types = ()
-      arg_values = {}
-      arg_names = tf_inspect.getargspec(f)[0]
-      for name, arg in zip(arg_names, args):
-        arg_values[name] = arg
-        arg_class = arg.__class__
-        # If arg_value_hints specifies any name, use that instead.
-        if name not in arg_types:
-          arg_types[name] = (arg_class.__name__, arg_class)
-        if name == 'self' and tf_inspect.isclass(arg_class):
-          # Annotated methods need to specify that their owner type is partial,
-          # otherwise other members they call will not be converted.
-          partial_types = (arg_class,)
-      wrapped = to_graph(
-          f,
-          recursive=recursive,
-          verbose=verbose,
-          arg_values=arg_values,
-          arg_types=arg_types,
-          partial_types=partial_types)
-      return wrapped(*args, **kwargs)
-
-    # Sometimes the decorator is just desugared, making it impossible to detect.
-    # This attribute makes detection easier.
-    setattr(wrapper, '__pyct_is_compile_decorator', True)
-    return wrapper
-
-  return decorator
-
-
-def to_graph(e,
-             recursive=True,
-             verbose=False,
-             arg_values=None,
-             arg_types=None,
-             partial_types=None):
-  """Compile a Python entity into equivalent TensorFlow code.
-
-  Currently supported entities:
-    * functions
-    * classes
-
-  Classes are handled by converting all their methods into a new class.
-
-  Args:
-    e: A Python entity.
-    recursive: Whether to recusrively convert any functions that the decorator
-        function may call.
-    verbose: Whether to output the compiled code in the logs.
-    arg_values: A dict containing value hints for symbols like function
-        parameters.
-    arg_types: A dict containing type hints for symbols like function
-        parameters.
-    partial_types: A set of types (e.g. classes) that will not be converted
-        entirely. Calls to member functions for these types will be renamed
-        independently.
-
-  Returns:
-    A function with a signature identical to `o`, but which when executed it
-  creates TF a graph that has the same functionality as the original entity.
-  """
-  conversion_map = conversion.ConversionMap(
-      recursive=recursive,
-      nocompile_decorators=(convert, graph_ready, convert_inline),
-      partial_types=partial_types,
-      api_module=tf_inspect.getmodule(to_graph))
-  _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
-
-  module = gast.Module([])
-  for import_line in config.COMPILED_IMPORT_STATEMENTS:
-    module.body.append(parser.parse_str(import_line))
-  for dep in conversion_map.dependency_cache.values():
-    module.body.append(dep)
-  compiled_node, compiled_src = compiler.ast_to_object(module)
-
-  # The compiled code should see everything the entry function saw.
-  # TODO(mdan): This might not work well if the call tree spans modules?
-  if tf_inspect.isfunction(e):
-    compiled_node.__dict__.update(six.get_function_globals(e))
-  compiled_fn = getattr(compiled_node, name)
-
-  if verbose:
-    logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
-
-  return compiled_fn
-
-
-def to_code(e,
-            recursive=True,
-            arg_values=None,
-            arg_types=None,
-            partial_types=None,
-            indentation='  '):
-  """Return the equivalent of an entity in TensorFlow code.
-
-  See `to_graph` for more details.
-
-  Args:
-    e: A Python entity.
-    recursive: See to_graph.
-    arg_values: See to_graph.
-    arg_types: See to_graph.
-    partial_types: See to_graph.
-    indentation: String, when to use for each level of indentation.
-
-  Returns:
-    String.
-  """
-  conversion_map = conversion.ConversionMap(
-      recursive=recursive,
-      nocompile_decorators=(convert, graph_ready, convert_inline),
-      partial_types=partial_types,
-      api_module=tf_inspect.getmodule(to_graph))
-  conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
-
-  imports = '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
-  code = '\n'.join(
-      compiler.ast_to_source(dep, indentation)
-      for dep in reversed(tuple(
-          six.itervalues(conversion_map.dependency_cache))))
-
-  return imports + '\n\n' + code
diff --git a/tensorflow/contrib/py2tf/impl/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py
deleted file mode 100644
index 7610f0427be45832dcc12e8f32b65292254eadfd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/impl/conversion.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""High level conversion support."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-import six
-
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.converters import asserts
-from tensorflow.contrib.py2tf.converters import break_statements
-from tensorflow.contrib.py2tf.converters import builtin_functions
-from tensorflow.contrib.py2tf.converters import call_trees
-from tensorflow.contrib.py2tf.converters import continue_statements
-from tensorflow.contrib.py2tf.converters import control_flow
-from tensorflow.contrib.py2tf.converters import decorators
-from tensorflow.contrib.py2tf.converters import for_loops
-from tensorflow.contrib.py2tf.converters import logical_expressions
-from tensorflow.contrib.py2tf.converters import side_effect_guards
-from tensorflow.contrib.py2tf.impl import config
-from tensorflow.contrib.py2tf.impl import naming
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
-from tensorflow.python.util import tf_inspect
-
-
-# TODO(mdan): Might we not need any renaming at all?
-
-
-class ConversionMap(object):
-  """ConversionMaps keep track of converting function hierarchies.
-
-  Attributes:
-    recursive: Whether to recusrively convert any functions that the decorator
-        function may call.
-    nocompile_decorators: tuple of decorator functions that toggle compilation
-        off.
-    dependency_cache: dict[object]: ast; maps original entities to their
-        converted AST
-    name_map: dict[string]: string; maps original entities to the name of
-        their converted counterparts
-    api_module: A reference to the api module. The reference needs to be passed
-        to avoid circular dependencies.
-  """
-
-  # TODO(mdan): Rename to ConversionContext, and pull in additional flags.
-
-  def __init__(self, recursive, nocompile_decorators, partial_types,
-               api_module):
-    self.recursive = recursive
-    self.nocompile_decorators = nocompile_decorators
-    self.partial_types = partial_types if partial_types else ()
-    self.dependency_cache = {}
-    self.name_map = {}
-    self.api_module = api_module
-
-  def new_namer(self, namespace):
-    return naming.Namer(namespace, self.recursive, self.name_map,
-                        self.partial_types)
-
-  def update_name_map(self, namer):
-    for o, name in namer.renamed_calls.items():
-      if o in self.name_map:
-        if self.name_map[o] != name:
-          raise ValueError(
-              'Calls to %s were converted using multiple names (%s). This is '
-              'possible when an entity with one of these names already '
-              'existed. To fix, avoid using any of these names.')
-      else:
-        self.name_map[o] = name
-
-  def add_to_cache(self, original_entity, converted_ast):
-    self.dependency_cache[original_entity] = converted_ast
-
-
-def entity_to_graph(o, conversion_map, arg_values, arg_types):
-  """Compile a Python entity into equivalent TensorFlow.
-
-  The function will also recursively compile all the entities that `o`
-  references, updating `dependency_cache`.
-
-  This function is reentrant, and relies on dependency_cache to avoid
-  generating duplicate code.
-
-  Args:
-    o: A Python entity.
-    conversion_map: A ConversionMap object.
-    arg_values: A dict containing value hints for symbols like function
-        parameters.
-    arg_types: A dict containing type hints for symbols like function
-        parameters.
-
-  Returns:
-    A tuple (ast, new_name):
-        * ast: An AST representing an entity with interface equivalent to `o`,
-            but which when executed it creates TF a graph.
-        * new_name: The symbol name under which the new entity can be found.
-
-  Raises:
-    ValueError: if the entity type is not supported.
-  """
-  if tf_inspect.isclass(o):
-    node, new_name = class_to_graph(o, conversion_map)
-  elif tf_inspect.isfunction(o):
-    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
-  elif tf_inspect.ismethod(o):
-    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
-  else:
-    raise ValueError(
-        'Entity "%s" has unsupported type "%s". Only functions and classes are '
-        'supported for now.' % (o, type(o)))
-
-  conversion_map.add_to_cache(o, node)
-  if conversion_map.recursive:
-    for obj in conversion_map.name_map.keys():
-      if obj not in conversion_map.dependency_cache:
-        if (hasattr(obj, 'im_class') and
-            getattr(obj, 'im_class') not in conversion_map.partial_types):
-          # Class members are converted with their objects, unless they're
-          # only converted partially.
-          continue
-        entity_to_graph(obj, conversion_map, {}, {})
-
-  return node, new_name
-
-
-def class_to_graph(c, conversion_map):
-  """Specialization of `entity_to_graph` for classes."""
-  converted_members = {}
-  members = tf_inspect.getmembers(c, predicate=tf_inspect.ismethod)
-  if not members:
-    raise ValueError('Cannot convert %s: it has no member methods.')
-
-  class_globals = None
-  for _, m in members:
-    node, _ = function_to_graph(
-        m,
-        conversion_map=conversion_map,
-        arg_values={},
-        arg_types={'self': (c.__name__, c)},
-        owner_type=c)
-    # TODO(mdan): Do not assume all members have the same view of globals.
-    if class_globals is None:
-      class_globals = six.get_function_globals(m)
-    converted_members[m] = node
-  namer = conversion_map.new_namer(class_globals)
-  class_name = namer.compiled_class_name(c.__name__, c)
-  node = gast.ClassDef(
-      class_name,
-      bases=[],
-      keywords=[],
-      body=converted_members.values(),
-      decorator_list=[])
-
-  return node, class_name
-
-
-def _add_self_references(namespace, api_module):
-  """Self refs are only required for analysis and are not used directly."""
-  # Manually add the utils namespace which may be used from generated code.
-  if 'py2tf_util' not in namespace:
-    namespace['py2tf_utils'] = utils
-  elif namespace['py2tf_utils'] != utils:
-    raise ValueError(
-        'The module name "py2tf_utils" is reserved and may not be used.')
-
-  # We also make reference to the api module for dynamic conversion, but
-  # to avoid circular references we don't import it here.
-  if 'py2tf_api' not in namespace:
-    namespace['py2tf_api'] = api_module
-  elif namespace['py2tf_api'] != api_module:
-    raise ValueError(
-        'The module name "py2tf_api" is reserved and may not be used.')
-
-
-def function_to_graph(f, conversion_map, arg_values, arg_types,
-                      owner_type=None):
-  """Specialization of `entity_to_graph` for callable functions."""
-  node, source = parser.parse_entity(f)
-  node = node.body[0]
-  namespace = six.get_function_globals(f)
-
-  # This is needed for non-global functions.
-  closure = six.get_function_closure(f)
-  if closure:
-    for e in closure:
-      if callable(e.cell_contents):
-        fn = e.cell_contents
-        namespace[fn.__name__] = fn
-
-  _add_self_references(namespace, conversion_map.api_module)
-
-  namer = conversion_map.new_namer(namespace)
-  ctx = context.EntityContext(
-      namer=namer,
-      source_code=source,
-      source_file='<fragment>',
-      namespace=namespace,
-      arg_values=arg_values,
-      arg_types=arg_types,
-      recursive=conversion_map.recursive)
-  node = node_to_graph(node, ctx, conversion_map.nocompile_decorators)
-
-  # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py
-  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
-  if not did_rename:
-    new_name = f.__name__
-    if node.name != f.__name__:
-      raise NotImplementedError('Strange corner case. Send us offending code!')
-
-  node.name = new_name
-  conversion_map.update_name_map(namer)
-  return node, new_name
-
-
-def _static_analysis_pass(node, ctx):
-  node = qual_names.resolve(node)
-  node = activity.resolve(node, ctx, None)
-  node = live_values.resolve(node, ctx, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, ctx)
-  return node
-
-
-def node_to_graph(node, ctx, nocompile_decorators):
-  """Convert Python code to equivalent TF graph mode code.
-
-  Args:
-    node: A Python AST node representing the code to convert.
-    ctx: An EntityContext object.
-    nocompile_decorators: A tuple containing decorators to be stripped from
-        functions during conversion.
-
-  Returns:
-    A tuple (node, deps):
-        * node: A Python ast node, representing the converted code.
-        * deps: A set of strings, the fully qualified names of entity
-            dependencies that this node has.
-  """
-  # TODO(mdan): Verify arguments for correctness.
-
-  # TODO(mdan): Factor out common elements.
-  # These include:
-  #   * code move between blocks
-  #   * visiting blocks in transformers
-
-  # Certain steps, especially canonicalization, insert new symbols into the
-  # tree, which must be accounted. Although less efficient, it is most robust
-  # to re-run the analysis.
-
-  node = _static_analysis_pass(node, ctx)
-  # Past this point, line numbers are no longer accurate so we ignore the
-  # source.
-  # TODO(mdan): Is it feasible to reconstruct intermediate source code?
-  ctx.source_code = None
-  node = decorators.transform(node, nocompile_decorators)
-  node = break_statements.transform(node, ctx)
-  node = asserts.transform(node, ctx)
-
-  # Note: sequencing continue canonicalization before for loop one avoids
-  # dealing with the extra loop increment operation that the for
-  # canonicalization creates.
-  node = continue_statements.transform(node, ctx)
-  ctx.namespace['len'] = len
-
-  node = _static_analysis_pass(node, ctx)
-  node = for_loops.transform(node, ctx)
-  # for_loops may insert new global references.
-  node = builtin_functions.transform(node, ctx)
-
-  node = _static_analysis_pass(node, ctx)
-  node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES,
-                              nocompile_decorators)
-  node = control_flow.transform(node, ctx)
-
-  # control_flow may create new symbols and change scopes.
-  node = _static_analysis_pass(node, ctx)
-  node = logical_expressions.transform(node)
-  node = side_effect_guards.transform(node, ctx)
-
-  return node
diff --git a/tensorflow/contrib/py2tf/impl/conversion_test.py b/tensorflow/contrib/py2tf/impl/conversion_test.py
deleted file mode 100644
index 75e95ed88883ac1d94ef8bc4edc232f97cd0b75b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/impl/conversion_test.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for conversion module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.impl import conversion
-from tensorflow.python.platform import test
-
-
-class ConversionTest(test.TestCase):
-
-  def test_entity_to_graph_unsupported_types(self):
-    with self.assertRaises(ValueError):
-      conversion_map = conversion.ConversionMap(True, (), (), None)
-      conversion.entity_to_graph('dummy', conversion_map, None, None)
-
-  def test_entity_to_graph_callable(self):
-
-    def f(a):
-      return a
-
-    conversion_map = conversion.ConversionMap(True, (), (), None)
-    ast, new_name = conversion.entity_to_graph(f, conversion_map, None, None)
-    self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
-    self.assertEqual('tf__f', new_name)
-
-  def test_entity_to_graph_call_tree(self):
-
-    def g(a):
-      return a
-
-    def f(a):
-      return g(a)
-
-    conversion_map = conversion.ConversionMap(True, (), (), None)
-    conversion.entity_to_graph(f, conversion_map, None, None)
-
-    self.assertTrue(f in conversion_map.dependency_cache)
-    self.assertTrue(g in conversion_map.dependency_cache)
-    self.assertEqual('tf__f', conversion_map.dependency_cache[f].name)
-    self.assertEqual(
-        'tf__g', conversion_map.dependency_cache[f].body[0].value.func.id)
-    self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names.py b/tensorflow/contrib/py2tf/pyct/qual_names.py
deleted file mode 100644
index 8717ee6cff198ff31f6cbdb7213e5a8dd3df1149..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/pyct/qual_names.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for manipulating qualified names.
-
-A qualified name is a uniform way to refer to simple (e.g. 'foo') and composite
-(e.g. 'foo.bar') syntactic symbols.
-
-This is *not* related to the __qualname__ attribute used by inspect, which
-refers to scopes.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.pyct import anno
-
-
-class QN(object):
-  """Represents a qualified name."""
-
-  def __init__(self, base, attr=None):
-    if attr:
-      if not isinstance(base, QN):
-        raise ValueError('For attribute QNs, base must be a QN.')
-      self._parent = base
-      self.qn = base.qn + (attr,)
-    else:
-      if isinstance(base, QN):
-        if base.is_composite():
-          self._parent = base.parent
-        else:
-          self._parent = None
-        self.qn = base.qn
-      else:
-        self._parent = None
-        self.qn = tuple(base.split('.'))
-
-  def is_composite(self):
-    return len(self.qn) > 1
-
-  @property
-  def parent(self):
-    if self._parent is None:
-      raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
-    return self._parent
-
-  def __hash__(self):
-    return hash(self.qn)
-
-  def __eq__(self, other):
-    return self.qn == other.qn
-
-  def __str__(self):
-    return '.'.join(self.qn)
-
-  def __repr__(self):
-    return str(self)
-
-  def ssf(self):
-    """Simple symbol form."""
-    return '_'.join(self.qn)
-
-  def ast(self):
-    # The caller must adjust the context appropriately.
-    if self.is_composite():
-      return gast.Attribute(self.parent.ast(), self.qn[-1], None)
-    return gast.Name(self.qn[0], None, None)
-
-
-class QnResolver(gast.NodeTransformer):
-  """Annotates nodes with QN information.
-
-  Note: Not using NodeAnnos to avoid circular dependencies.
-  """
-
-  def visit_Name(self, node):
-    self.generic_visit(node)
-    anno.setanno(node, anno.Basic.QN, QN(node.id))
-    return node
-
-  def visit_Attribute(self, node):
-    self.generic_visit(node)
-    anno.setanno(node, anno.Basic.QN,
-                 QN(anno.getanno(node.value, anno.Basic.QN), node.attr))
-    return node
-
-
-def resolve(node):
-  return QnResolver().visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names_test.py b/tensorflow/contrib/py2tf/pyct/qual_names_test.py
deleted file mode 100644
index 1b1eee2deca18bb0540c17d6ee85d421602aa2b7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/pyct/qual_names_test.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for qual_names module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import textwrap
-
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.python.platform import test
-
-
-class QNTest(test.TestCase):
-
-  def test_basic(self):
-    a = qual_names.QN('a')
-    self.assertEqual(a.qn, ('a',))
-    self.assertEqual(str(a), 'a')
-    self.assertEqual(a.ssf(), 'a')
-    self.assertEqual(a.ast().id, 'a')
-    self.assertFalse(a.is_composite())
-    with self.assertRaises(ValueError):
-      _ = a.parent
-
-    a_b = qual_names.QN(a, 'b')
-    self.assertEqual(a_b.qn, ('a', 'b'))
-    self.assertEqual(str(a_b), 'a.b')
-    self.assertEqual(a_b.ssf(), 'a_b')
-    self.assertEqual(a_b.ast().value.id, 'a')
-    self.assertEqual(a_b.ast().attr, 'b')
-    self.assertTrue(a_b.is_composite())
-    self.assertEqual(a_b.parent.qn, ('a',))
-
-    a2 = qual_names.QN(a)
-    self.assertEqual(a2.qn, ('a',))
-    with self.assertRaises(ValueError):
-      _ = a.parent
-
-    a_b2 = qual_names.QN(a_b)
-    self.assertEqual(a_b2.qn, ('a', 'b'))
-    self.assertEqual(a_b2.parent.qn, ('a',))
-
-    self.assertTrue(a2 == a)
-    self.assertFalse(a2 is a)
-
-    self.assertTrue(a_b.parent == a)
-    self.assertTrue(a_b2.parent == a)
-
-    self.assertTrue(a_b2 == a_b)
-    self.assertFalse(a_b2 is a_b)
-    self.assertFalse(a_b2 == a)
-
-    with self.assertRaises(ValueError):
-      qual_names.QN('a', 'b')
-
-  def test_hashable(self):
-    d = {qual_names.QN('a'): 'a', qual_names.QN('b'): 'b'}
-
-    self.assertEqual(d[qual_names.QN('a')], 'a')
-    self.assertEqual(d[qual_names.QN('b')], 'b')
-    self.assertTrue(qual_names.QN('c') not in d)
-
-
-class QNResolverTest(test.TestCase):
-
-  def assertQNStringIs(self, node, qn_str):
-    self.assertEqual(str(anno.getanno(node, anno.Basic.QN)), qn_str)
-
-  def test_resolve(self):
-    samples = """
-      a
-      a.b
-      (c, d.e)
-      [f, (g.h.i)]
-      j(k, l)
-    """
-    nodes = qual_names.resolve(parser.parse_str(textwrap.dedent(samples)))
-    nodes = tuple(n.value for n in nodes.body)
-
-    self.assertQNStringIs(nodes[0], 'a')
-    self.assertQNStringIs(nodes[1], 'a.b')
-    self.assertQNStringIs(nodes[2].elts[0], 'c')
-    self.assertQNStringIs(nodes[2].elts[1], 'd.e')
-    self.assertQNStringIs(nodes[3].elts[0], 'f')
-    self.assertQNStringIs(nodes[3].elts[1], 'g.h.i')
-    self.assertQNStringIs(nodes[4].func, 'j')
-    self.assertQNStringIs(nodes[4].args[0], 'k')
-    self.assertQNStringIs(nodes[4].args[1], 'l')
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/py2tf/pyct/templates_test.py
deleted file mode 100644
index 8ccfde8573724741b0bbe4eacb3c54beb381ee7e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/pyct/templates_test.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for templates module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.python.platform import test
-
-
-class TemplatesTest(test.TestCase):
-
-  def test_replace_tuple(self):
-    template = """
-      def test_fn(a, c):
-        return b,
-    """
-
-    node = templates.replace(template, b=('a', 'c'))[0]
-    result, _ = compiler.ast_to_object(node)
-
-    self.assertEquals((2, 3), result.test_fn(2, 3))
-
-  def test_replace_variable(self):
-    template = """
-      def test_fn(a):
-        a += 1
-        a = 2 * a + 1
-        return b
-    """
-
-    node = templates.replace(template, a='b')[0]
-    result, _ = compiler.ast_to_object(node)
-    self.assertEquals(7, result.test_fn(2))
-
-  def test_replace_function_name(self):
-    template = """
-      def fname(a):
-        a += 1
-        a = 2 * a + 1
-        return a
-    """
-
-    node = templates.replace(template, fname='test_fn')[0]
-    result, _ = compiler.ast_to_object(node)
-    self.assertEquals(7, result.test_fn(2))
-
-  def test_code_block(self):
-    template = """
-      def test_fn(a):
-        block
-        return a
-    """
-
-    node = templates.replace(
-        template,
-        block=[
-            gast.Assign([
-                gast.Name('a', None, None)
-            ], gast.BinOp(gast.Name('a', None, None), gast.Add(), gast.Num(1))),
-        ] * 2)[0]
-    result, _ = compiler.ast_to_object(node)
-    self.assertEquals(3, result.test_fn(1))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
deleted file mode 100644
index 877d52af016af720424c8a56257fec9ab64611cb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/pyct/transformer.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""A node transformer that includes utilities for SCT."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import sys
-
-import gast
-import six
-
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import pretty_printer
-
-
-class PyFlowParseError(SyntaxError):
-  pass
-
-
-class Base(gast.NodeTransformer):
-  """Base class for specialized transformers."""
-
-  def __init__(self, context):
-    """Initialize the transformer. Subclasses should call this.
-
-    Args:
-      context: An EntityContext.
-    """
-    self._lineno = 0
-    self._col_offset = 0
-    self.context = context
-
-  def visit(self, node):
-    source_code = self.context.source_code
-    source_file = self.context.source_file
-    try:
-      if source_code and hasattr(node, 'lineno'):
-        self._lineno = node.lineno
-        self._col_offset = node.col_offset
-      if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
-        return node
-      return super(Base, self).visit(node)
-    except (ValueError, AttributeError, KeyError, NotImplementedError,
-            AssertionError) as e:
-      msg = '%s: %s\nOccurred at node:\n%s' % (
-          e.__class__.__name__, str(e), pretty_printer.fmt(node, color=False))
-      if source_code:
-        line = source_code.splitlines()[self._lineno - 1]
-      else:
-        line = '<no source available>'
-      six.reraise(PyFlowParseError,
-                  PyFlowParseError(
-                      msg,
-                      (source_file, self._lineno, self._col_offset + 1, line)),
-                  sys.exc_info()[2])
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
deleted file mode 100644
index 0a1b993fd366e1317e5f7e01fe849d86c93b8fc2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility module that contains APIs usable in the generated code."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
-from tensorflow.contrib.py2tf.utils.misc import alias_tensors
-from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_cond
-from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_while
-from tensorflow.contrib.py2tf.utils.printing import call_print
-from tensorflow.contrib.py2tf.utils.py_func import wrap_py_func
-from tensorflow.contrib.py2tf.utils.type_check import is_tensor
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
deleted file mode 100644
index a855fdc075941915035d1e3380846ff912803494..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for type-dependent behavior used in py2tf-generated code."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-
-from tensorflow.contrib.py2tf.utils.type_check import is_tensor
-from tensorflow.python.ops import control_flow_ops
-
-
-def run_cond(condition, true_fn, false_fn):
-  """Type-dependent functional conditional.
-
-  Args:
-    condition: A Tensor or Python bool.
-    true_fn: A Python callable implementing the true branch of the conditional.
-    false_fn: A Python callable implementing the false branch of the
-      conditional.
-
-  Returns:
-    result: The result of calling the appropriate branch. If condition is a
-    Tensor, tf.cond will be used. Otherwise, a standard Python if statement will
-    be ran.
-  """
-  if is_tensor(condition):
-    return control_flow_ops.cond(condition, true_fn, false_fn)
-  else:
-    return py_cond(condition, true_fn, false_fn)
-
-
-def py_cond(condition, true_fn, false_fn):
-  if condition:
-    return true_fn()
-  else:
-    return false_fn()
-
-
-def run_while(cond_fn, body_fn, init_args):
-  """Type-dependent functional while loop.
-
-  Args:
-    cond_fn: A Python callable implementing the stop conditions of the loop.
-    body_fn: A Python callable implementing the body of the loop.
-    init_args: The initial values of the arguments that will be passed to both
-      cond_fn and body_fn.
-
-  Returns:
-    result: A list of values with the same shape and type as init_args. If any
-    of the init_args, or any variables closed-over in cond_fn are Tensors,
-    tf.while_loop will be used, otherwise a Python while loop will be ran.
-
-  Raises:
-    ValueError: if init_args is not a tuple or list with one or more elements.
-  """
-  if not isinstance(init_args, (tuple, list)) or not init_args:
-    raise ValueError(
-        'init_args must be a non-empty list or tuple, found %s' % init_args)
-
-  # TODO(alexbw): statically determine all active variables in cond_fn,
-  # and pass them directly
-  closure_vars = tuple(
-      [c.cell_contents for c in six.get_function_closure(cond_fn) or []])
-  possibly_tensors = tuple(init_args) + closure_vars
-  if is_tensor(*possibly_tensors):
-    return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
-  else:
-    return py_while_loop(cond_fn, body_fn, init_args)
-
-
-def py_while_loop(cond_fn, body_fn, init_args):
-  state = init_args
-  while cond_fn(*state):
-    state = body_fn(*state)
-  return state
diff --git a/tensorflow/contrib/py2tf/utils/printing_test.py b/tensorflow/contrib/py2tf/utils/printing_test.py
deleted file mode 100644
index 2070deb304d8df2433fb9a95ae36d48415578482..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/printing_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for printing module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import sys
-
-import six
-
-from tensorflow.contrib.py2tf.utils import printing
-from tensorflow.python.platform import test
-
-
-class ContextManagersTest(test.TestCase):
-
-  def test_call_print_tf(self):
-    try:
-      out_capturer = six.StringIO()
-      sys.stdout = out_capturer
-      with self.test_session() as sess:
-        sess.run(printing.call_print('test message', 1))
-        self.assertEqual(out_capturer.getvalue(), 'test message 1\n')
-    finally:
-      sys.stdout = sys.__stdout__
-
-  def test_call_print_py_func(self):
-    try:
-      out_capturer = six.StringIO()
-      sys.stdout = out_capturer
-      with self.test_session() as sess:
-        sess.run(printing.call_print('test message', [1, 2]))
-        self.assertEqual(out_capturer.getvalue(), 'test message [1, 2]\n')
-    finally:
-      sys.stdout = sys.__stdout__
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/py_func.py b/tensorflow/contrib/py2tf/utils/py_func.py
deleted file mode 100644
index 838872d092a3ab07e965180eff4fec7ff6c4ccf9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/py_func.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Pyfunc creation utilities."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import script_ops
-
-
-def wrap_py_func(f, return_dtypes, arguments, use_dummy_return=False):
-  """Helper that wraps a callable to py_func.
-
-  The helper passes tensor arguments through the py_func interface. Non-tensor
-  arguments are allowed, and will be passed to f directly. Note that non-tensor
-  arguments are captured by f will not update every time the wrapper is
-  called (this is consistent with its argument list, which only includes
-  the tensor arguments). In general, it's safest not to reuse this wrapper.
-
-  Args:
-    f: Callable
-    return_dtypes: DType, tuple, list or None, the data type for each of f's
-        return value. None if f has no return values or use_dummy_return is
-        True.
-    arguments: Arguments for f
-    use_dummy_return: If True, the function will return a dummy value of 1
-        and discard its actual return value.
-  Returns:
-    The return values of f converted to tensor.
-  Raises:
-    ValueError: if the arguments are incorrect.
-  """
-
-  if return_dtypes and use_dummy_return:
-    raise ValueError('if use_dummy_return is True, return_dtypes must be empty')
-
-  n = len(arguments)
-  arg_is_tensor = tuple(map(tensor_util.is_tensor, arguments))
-  index_in_tensor_list = [0] * n
-  i = 0
-  for j in range(n):
-    index_in_tensor_list[j] = i
-    if arg_is_tensor[j]:
-      i += 1
-
-  def f_wrapper(*tensor_args):
-    f_args = tuple(tensor_args[index_in_tensor_list[i]]
-                   if arg_is_tensor[i] else arguments[i] for i in range(n))
-    retval = f(*f_args)
-    return 1 if use_dummy_return else retval
-
-  return script_ops.py_func(
-      f_wrapper, tuple(arguments[i] for i in range(n) if arg_is_tensor[i]),
-      dtypes.int64 if use_dummy_return else return_dtypes)
diff --git a/tensorflow/contrib/py2tf/utils/py_func_test.py b/tensorflow/contrib/py2tf/utils/py_func_test.py
deleted file mode 100644
index 776b5309c6f027bb2008aa83d48e4155e817ed97..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/py_func_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for wrap_py_func module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.py2tf.utils import py_func
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.platform import test
-
-
-class PyFuncTest(test.TestCase):
-
-  def test_wrap_py_func_simple(self):
-
-    def test_fn(a, b, c):
-      return a + b + c
-
-    with self.test_session() as sess:
-      tensor_1 = constant_op.constant(1)
-      self.assertEqual(3,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (1, tensor_1, 1))))
-      self.assertEqual(3,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (1, 1, 1))))
-      self.assertEqual(3,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (tensor_1, 1, tensor_1))))
-
-  def test_wrap_py_func_complex_args(self):
-
-    class TestClass(object):
-
-      def __init__(self):
-        self.foo = 5
-
-    def test_fn(a, b):
-      return a * b.foo
-
-    with self.test_session() as sess:
-      self.assertEqual(35,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (7, TestClass()))))
-      self.assertEqual(
-          35,
-          sess.run(
-              py_func.wrap_py_func(test_fn, dtypes.int64,
-                                   (constant_op.constant(7), TestClass()))))
-
-  def test_wrap_py_func_dummy_return(self):
-
-    side_counter = [0]
-
-    def test_fn(_):
-      side_counter[0] += 1
-
-    with self.test_session() as sess:
-      self.assertEqual(1,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, None, (5,), True)))
-      self.assertEqual([1], side_counter)
-      self.assertEqual(1,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, None,
-                                                (constant_op.constant(5),),
-                                                True)))
-      self.assertEqual([2], side_counter)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD
index c19a31afb2a1a86159eae5c94bbd83daa28caaeb..2de10e8faefa80d609e490f26ef97f6bf513debd 100644
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@@ -49,15 +49,3 @@ filegroup(
         "**/*.py",
     ]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/quantization/README.md b/tensorflow/contrib/quantization/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..359950aaf3d89c1f3e8fda21cbd27fb89217d918
--- /dev/null
+++ b/tensorflow/contrib/quantization/README.md
@@ -0,0 +1,7 @@
+The contrib/quantization package exposes a few TensorFlow quantization operations.
+
+If you are looking for quantized training rewrites that allow for training
+quantized models that work with
+[TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/), you should look at
+the [contrib/quantize](https://www.tensorflow.org/api_docs/python/tf/contrib/quantize)
+package.
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index aec9f47ccb20349c08bbe2fd813ee24a807f9fe3..b9918fdee1ece2bae1ab1459985066a35b6431be 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -24,6 +24,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python:variable_scope",
     ],
 )
 
@@ -245,15 +246,3 @@ py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 40541729da5fd9d0ae75579e11f20999337de124..c83623ec947c1550991352a9dd9a5c6ee9282290 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -1,73 +1,74 @@
+# Quantized Training Rewrites
+
 tf.contrib.quantize provides tools for transforming graphs to include ops to
 model quantization of weights, biases and activations during both training and
-inference. This is done using the
-[fake quantization op]
-(https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization),
-which is described below:
-
-Recent literature has shown that fixed point networks provide comparable
-performance to floating point networks [1]. This is achieved by modeling the
-quantization operation during training in both the forward and backward passes.
+inference. The details of the transformation implemented in this package is
+described here [1].
+
+This is done using the
+[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization).
+
+Literature has shown that fixed point networks provide comparable performance to
+floating point networks [2]. This is achieved by modeling the quantization
+operation during training in both the forward and backward passes.
 The fake quantization operator achieves this by modeling the quantizer as a pass
-through estimator [2]. Note that during back propagation, the parameters are
+through estimator [3]. Note that during back propagation, the parameters are
 updated at high precision as this is needed to ensure sufficient precision in
 accumulating tiny adjustments to the parameters. However, for the forward pass,
 the parameters and activations are quantized to the desired lower precision.
 
-![drawing](g3doc/drawings/Fake_Quantization.jpg)
-
-###Forward pass
-
-
-
-
-\begin{equation*}
-f_Q(x) = \Delta\text{ }round\left(\frac{sat\left(x\right)-x_{min}}{\Delta}\right)
-\end{equation*}
-
-
-where
-
-$$
-\begin{equation*}
-sat(x) =
-\left\{
-	\begin{array}{ll}
-		x_{min}  & \mbox{if } x \le x_{min} \\
-		x & \mbox{if } x_{min} \leq x \leq x_{max} \\
-    x_{max} & \mbox{if } x_{max} \le x
-	\end{array}
-\right.
-\end{equation*}
-$$
-
-
-where $$\Delta$$ is the Quantizer Step size, given by
-$$\Delta =\frac{x_{max} - x_{min} }{255} $$ and $$x_{min} $$ and $$x_{max}$$ are
-the minimum and maximum values of the variable under consideration. Note that
-the rounding performed is deterministic and corresponds to asymmetric rounding,
-which is supported in almost all hardware platforms.
-
-###Backward pass
-For the backward pass, we model the quantizer as a piecewise linear block, with
-derivatives that are non-zero only in the linear region.
-
-
-
-\begin{equation*}
-\frac{df_Q(x)}{dx}=1, x_{min} \leq x \leq x_{max},\text{ 0  elsewhere }
-\end{equation*}
-
-Therefore, the backward pass through the quantizer reduces to passing through
-the gradients as long as the inputs to the quantizer are in the linear region.
-Otherwise, the gradients are set to zero.
-
-Note that the quantizer is fully specified by the min and max values of the
-variables being quantized.
-
-
-[1] P.Gysel, "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
+## How to use the Rewrites
+
+tf.contrib.quantize provides two rewrites, one to train for quantization and
+one to create a [TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/)
+compatible eval graph.
+
+```
+# Build forward pass of model.
+…
+loss = tf.losses.get_total_loss()
+
+# Call the training rewrite which rewrites the graph in-place with FakeQuantization nodes
+# and folds batchnorm for training.
+# It is often needed to finetune a floating point model for quantization with this training tool.
+# When training from scratch, quant_delay can be used to activate quantization after
+# training to convergence with the float graph, effectively finetuning the model.
+tf.contrib.quantize.create_training_graph(quant_delay=2000000)
+
+# Call backward pass optimizer as usual.
+optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+optimizer.minimize(loss)
+```
+
+Additionally, the rewritten eval graph is non-trivially different from the
+training graph due the effects of quantization on batch normalization. Thus,
+we offer a separate rewrite for the eval_graph.
+
+```
+# Build eval model
+…
+logits = tf.nn.softmax_cross_entropy_with_logits(...)
+
+# Call the eval rewrite which rewrites the graph in-place with FakeQuantization nodes
+# and fold batchnorm for eval.
+tf.contrib.quantize.create_eval_graph()
+
+# Save the checkpoint and eval graph proto to disk for freezing and providing to TFLite.
+with open(eval_graph_file, ‘w’) as f:
+  f.write(str(g.as_graph_def()))
+saver = tf.train.Saver()
+saver.save(sess, checkpoint_name)
+```
+
+These rewrites are an active area of research and experimentation, so the
+rewrites and quantized training will likely not work across all models, though
+we hope to work towards generalizing these techniques.
+
+[1] B.Jacob et al., "Quantization and Training of Neural Networks for Efficient
+Integer-Arithmetic-Only Inference", https://arxiv.org/abs/1712.05877
+
+[2] P.Gysel et al., "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
 NEURAL NETWORKS", https://arxiv.org/pdf/1604.03168.pdf
 
-[2] Y.Bengio, "Estimating or Propagating Gradients Through Stochastic Neurons
-for Conditional Computation", https://arxiv.org/abs/1308.3432
+[3] Y.Bengio et al., "Estimating or Propagating Gradients Through Stochastic
+Neurons for Conditional Computation", https://arxiv.org/abs/1308.3432
diff --git a/tensorflow/contrib/quantize/g3doc/drawings/Fake_Quantization.jpg b/tensorflow/contrib/quantize/g3doc/drawings/Fake_Quantization.jpg
deleted file mode 100644
index fdc7ae40cec757cc0a93d50eca6c8698a4697d07..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/quantize/g3doc/drawings/Fake_Quantization.jpg and /dev/null differ
diff --git a/tensorflow/contrib/quantize/python/common.py b/tensorflow/contrib/quantize/python/common.py
index 3a1fa61e43986af1a1315d5a9e6f010e802ea157..bf648e158ec15e1bfa962ba7dbe0567263c89c9b 100644
--- a/tensorflow/contrib/quantize/python/common.py
+++ b/tensorflow/contrib/quantize/python/common.py
@@ -23,6 +23,7 @@ import re
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -101,7 +102,7 @@ def CreateOrGetQuantizationStep():
     Quantization step Tensor.
   """
   quantization_step_name = 'fake_quantization_step'
-  quantization_step_tensor_name = quantization_step_name + '/AssignAdd:0'
+  quantization_step_tensor_name = quantization_step_name + '/Identity:0'
   g = ops.get_default_graph()
   try:
     return g.get_tensor_by_name(quantization_step_tensor_name)
@@ -118,5 +119,15 @@ def CreateOrGetQuantizationStep():
       with g.name_scope(quantization_step_tensor.op.name + '/'):
         # We return the incremented variable tensor. Since this is used in conds
         # for quant_delay and freeze_bn_delay, it will run once per graph
-        # execution.
-        return state_ops.assign_add(quantization_step_tensor, 1)
+        # execution. We return an identity to force resource variables and
+        # normal variables to return a tensor of the same name.
+        return array_ops.identity(
+            state_ops.assign_add(quantization_step_tensor, 1))
+
+
+def DropStringPrefix(s, prefix):
+  """If the string starts with this prefix, drops it."""
+  if s.startswith(prefix):
+    return s[len(prefix):]
+  else:
+    return s
diff --git a/tensorflow/contrib/quantize/python/common_test.py b/tensorflow/contrib/quantize/python/common_test.py
index d6237fe5e38d905bf262d7be3746b9ee6046da47..06c62f2d265503bf42d46fb682a398ce1f4d15fb 100644
--- a/tensorflow/contrib/quantize/python/common_test.py
+++ b/tensorflow/contrib/quantize/python/common_test.py
@@ -22,6 +22,7 @@ from tensorflow.contrib.quantize.python import common
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -29,8 +30,15 @@ from tensorflow.python.platform import googletest
 class CommonTest(test_util.TensorFlowTestCase):
 
   def testCreateOrGetQuantizationStep(self):
+    self._TestCreateOrGetQuantizationStep(False)
+
+  def testCreateOrGetQuantizationStepResourceVar(self):
+    self._TestCreateOrGetQuantizationStep(True)
+
+  def _TestCreateOrGetQuantizationStep(self, use_resource):
     g = ops.Graph()
     with session.Session(graph=g) as sess:
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
       quantization_step_tensor = common.CreateOrGetQuantizationStep()
 
       # Check that operations are added to the graph.
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 75d9eb0e58d96e4bb2946684febd250e2e1a6b4a..1f286bc39a21d4ec30ad8274b7d40bba28dca3a9 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import compat
 
 
@@ -133,9 +134,9 @@ def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
 
       nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                      match.output_tensor)
-      if nodes_modified_count != 1:
-        raise ValueError(
-            'Unexpected inputs to op: %s' % match.output_tensor.name)
+      if nodes_modified_count == 0:
+        raise ValueError('Folding batch norms failed, %s had no outputs.' %
+                         match.output_tensor.name)
 
 
 def _FindFusedBatchNorms(graph):
@@ -194,7 +195,7 @@ def _FindFusedBatchNorms(graph):
     layer_op = match_result.get_op(layer_pattern)
     layer_tensor = match_result.get_tensor(layer_pattern)
     bn_op = match_result.get_op(batch_norm_pattern)
-    batch_epsilon_tensor = bn_op.get_attr('epsilon')
+    batch_epsilon = bn_op.get_attr('epsilon')
 
     # In the MatMul case, the output of batch norm is reshaped back into a
     # 2D tensor, so the output_tensor is the output of the Reshape op.
@@ -207,6 +208,11 @@ def _FindFusedBatchNorms(graph):
         continue
       output_tensor = output_reshape_op.outputs[0]
 
+    # Ensure that the output tensor has consumers, otherwise this is a dangling
+    # node and not a match.
+    if not output_tensor.consumers():
+      continue
+
     input_tensor = match_result.get_tensor(input_pattern)
     weight_tensor = match_result.get_tensor(weight_pattern)
     gamma_tensor = match_result.get_tensor(gamma_pattern)
@@ -231,7 +237,7 @@ def _FindFusedBatchNorms(graph):
       # The batch variance used during forward and backward prop is biased,
       # i.e it is calculated as: V=sum(x(k)-mu)^2/N. For the moving average
       # calculation, the variance is corrected by the term N/N-1 (Bessel's
-      # correction). The variance tensor read from FuseBatchNorm has bessel's
+      # correction). The variance tensor read from FuseBatchNorm has Bessel's
       # correction applied, so we undo it here.
       scope, sep, _ = bn_op.name.rpartition('/')
       g = ops.get_default_graph()
@@ -270,7 +276,7 @@ def _FindFusedBatchNorms(graph):
         moving_variance_tensor=moving_variance_tensor,
         bn_decay_mean_tensor=bn_decay_mean_tensor,
         bn_decay_var_tensor=bn_decay_var_tensor,
-        batch_epsilon_tensor=batch_epsilon_tensor)
+        batch_epsilon=batch_epsilon)
 
 
 def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
@@ -300,7 +306,7 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
 
   Args:
     context: The scope under which we look for batch norm params
-    match: Object containg required batch norm tensors for correction
+    match: Object containing required batch norm tensors for correction
       computation.
     freeze_batch_norm_delay: Delay in steps at which computation switches
       from regular batch norm to frozen mean and variance.
@@ -311,11 +317,11 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
   """
 
   g = ops.get_default_graph()
-  with g.name_scope(context + '/batch_norm_correction'):
+  prefix = '' if not context else context + '/'
+  with g.name_scope(prefix + 'batch_norm_correction'):
     recip_sigma_mv = math_ops.rsqrt(
-        match.moving_variance_tensor + match.batch_epsilon_tensor)
-    recip_sigma = math_ops.rsqrt(
-        match.variance_tensor + match.batch_epsilon_tensor)
+        match.moving_variance_tensor + match.batch_epsilon)
+    recip_sigma = math_ops.rsqrt(match.variance_tensor + match.batch_epsilon)
     correction_scale = math_ops.divide(
         recip_sigma_mv, recip_sigma, name='scale_compute')
     correction_scale = array_ops.identity(
@@ -434,6 +440,9 @@ def _FoldUnfusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
   for bn in common.BatchNormGroups(graph):
     has_scaling = _HasScaling(graph, input_to_ops_map, bn)
 
+    if not _IsValidUnfusedBatchNorm(graph, bn):
+      continue
+
     # The mangling code intimately depends on BatchNorm node's internals.
     original_op, folded_op = _CreateFoldedOp(
         graph,
@@ -462,6 +471,52 @@ def _FoldUnfusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
       raise ValueError('Unexpected inputs to op: %s' % add_bypass.name)
 
 
+def _IsValidUnfusedBatchNorm(graph, context):
+  """Checks that the output of the unfused batch norm has consumers."""
+  add_shift = graph.get_operation_by_name(
+      context + '/BatchNorm/batchnorm/add_1')
+  # Ensure that the output tensor of batch norm has consumers, otherwise this
+  # is a dangling node and not a match.
+  return bool(add_shift.outputs[0].consumers())
+
+
+def _FindMatchingTensor(graph, match_pattern, scope):
+  """Finds best match of ops matching match_pattern with scope.
+
+     Example: _FindMatchingTensor(graph,'/BatchNorm/moments/Squeeze',
+     'MobilenetV1/MobilenetV1/Conv2d_0/') returns:
+      Tensor('MobilenetV1/Conv2d_0/BatchNorm/moments/Squeeze')
+
+  Args:
+    graph: Graph to inspect.
+    match_pattern: Part of the name of the op that we need to match, should
+    be present in the op's name
+    scope: The scope of the op. All the elements of the scope need not be
+    present in the op's name.
+
+  Returns:
+    Tensor from graph that provides the best match to the match_pattern and
+    scope
+  """
+
+  oplist = graph.get_operations()
+  split_context = set(scope.split('/'))
+  match_dict = {}
+  for op in oplist:
+    if op.name.endswith(match_pattern):
+      split_name = op.name.split('/')
+      num_matches = len(set(split_name) & split_context)
+      if num_matches > 0:
+        match_dict[op.name] = num_matches
+  # match_dict contains matching op names from graph with values being
+  # number of matches to scope. We pick the key with the most matches
+  if match_dict:
+    max_key = max(match_dict, key=match_dict.get)
+    return graph.get_tensor_by_name(max_key + ':0')
+  else:
+    return None
+
+
 def _GetBatchNormParams(graph, context, has_scaling):
   """Extracts relevant tensors for folding batch norms.
 
@@ -478,48 +533,74 @@ def _GetBatchNormParams(graph, context, has_scaling):
   batch_variance_tensor = None
   moving_mean_tensor = None
   moving_variance_tensor = None
-  batch_epsilon_tensor = None
+  batch_epsilon = None
   bn_decay_mean_tensor = None
   bn_decay_var_tensor = None
 
-  split_context = context.split('/')
-  base_context = split_context[-1]
-
-  oplist = graph.get_operations()
-  op_suffix_gamma = base_context + '/BatchNorm/gamma'
-  op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze'
-  op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1'
-  op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read'
-  op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read'
-  op_suffix_epsilon = base_context + '/BatchNorm/batchnorm/add/y'
-  op_suffix_bn_decay_mean = base_context + '/BatchNorm/AssignMovingAvg/decay'
-  op_suffix_bn_decay_var = base_context + '/BatchNorm/AssignMovingAvg_1/decay'
-
+  # TODO(raghuramank) This code relies on string matching and needs to be
+  # updated if unfused batch norm continues to be widely used
+  # Matching variable names is brittle and relies on scoping
+  # conventions. Fused batch norm folding is more robust. Support for unfused
+  # batch norms will be deprecated as we move forward. Fused batch norms allow
+  # for faster training and should be used whenever possible.
+  # context contains part of the names of the tensors we are interested in:
+  # For MobilenetV1, the context has repetitions:
+  # MobilenetV1/MobilenetV1/Conv2d_3_depthwise
+  # when the moving_mean tensor has the name:
+  # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read
+  # To pick the correct variable name, it is necessary to ignore the repeating
+  # header.
+
+  # For MobilenetV2, this problem does not exist:
+  # The context is: MobilenetV2/expanded_conv_3/depthwise
+  # and the names of the tensors start with a single MobilenetV2
+  # The moving mean for example, has the name:
+  # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
+  # We identify the best match for an op by checking for
+  # 1. The suffix of the op is exactly matched
+  # 2. Maximum number of matches with the context.The matching
+  # score is given by the number of parts of context (split by /) that
+  # are present in the parts of the tensor name (again split by /).
+  # For example: scope= MobilenetV2/MobilenetV2/expanded_conv_3 and
+  # op.name =  MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
+  # will have 2 matches,scope with a different conv layer will have one match.
+
+  op_suffix_mean = '/BatchNorm/moments/Squeeze'
+  op_suffix_variance = '/BatchNorm/moments/Squeeze_1'
+  op_suffix_epsilon = '/BatchNorm/batchnorm/add/y'
+  op_suffix_bn_decay_mean = '/BatchNorm/AssignMovingAvg/decay'
+  op_suffix_bn_decay_var = '/BatchNorm/AssignMovingAvg_1/decay'
+
+  if variable_scope.get_variable_scope().use_resource:
+    op_suffix_gamma = '/BatchNorm/gamma/Read/ReadVariableOp'
+    op_suffix_moving_variance = (
+        '/BatchNorm/moving_variance/Read/ReadVariableOp')
+    op_suffix_moving_mean = ('/BatchNorm/moving_mean/Read/ReadVariableOp')
+  else:
+    op_suffix_gamma = '/BatchNorm/gamma'
+    op_suffix_moving_variance = '/BatchNorm/moving_variance/read'
+    op_suffix_moving_mean = '/BatchNorm/moving_mean/read'
   # Parse through list of ops to find relevant ops
-  for op in oplist:
-    if op.name.endswith(op_suffix_mean):
-      # This is an efficient way to check for two things:
-      # Is batch norm present and is it training mode?
-      # Batch statistics are computed only during batch norm in training
-      batch_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if op.name.endswith(op_suffix_variance):
-      batch_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if op.name.endswith(op_suffix_moving_mean):
-      moving_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if op.name.endswith(op_suffix_moving_variance):
-      moving_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if op.name.endswith(op_suffix_epsilon):
-      batch_epsilon_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if op.name.endswith(op_suffix_bn_decay_mean):
-      bn_decay_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if op.name.endswith(op_suffix_bn_decay_var):
-      bn_decay_var_tensor = graph.get_tensor_by_name(op.name + ':0')
-    if has_scaling:
-      if op.name.endswith(op_suffix_gamma):
-        gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
+
+  batch_mean_tensor = _FindMatchingTensor(graph, op_suffix_mean, context)
+  batch_variance_tensor = _FindMatchingTensor(graph, op_suffix_variance,
+                                              context)
+  moving_mean_tensor = _FindMatchingTensor(graph, op_suffix_moving_mean,
+                                           context)
+  moving_variance_tensor = _FindMatchingTensor(graph, op_suffix_moving_variance,
+                                               context)
+  batch_epsilon = _FindMatchingTensor(graph, op_suffix_epsilon, context)
+  bn_decay_mean_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_mean,
+                                             context)
+  bn_decay_var_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_var,
+                                            context)
+  if batch_mean_tensor is None and moving_mean_tensor is None:
+    ValueError('Error folding unfused batch norms')
+  if has_scaling:
+    gamma_tensor = _FindMatchingTensor(graph, op_suffix_gamma, context)
 
   if not has_scaling:
-    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)
 
   return _BatchNormMatch(
       layer_op=None,
@@ -535,7 +616,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
       moving_variance_tensor=moving_variance_tensor,
       bn_decay_mean_tensor=bn_decay_mean_tensor,
       bn_decay_var_tensor=bn_decay_var_tensor,
-      batch_epsilon_tensor=batch_epsilon_tensor)
+      batch_epsilon=batch_epsilon)
 
 
 def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
@@ -816,7 +897,7 @@ class _BatchNormMatch(object):
   def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
                weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
                variance_tensor, moving_mean_tensor, moving_variance_tensor,
-               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor):
+               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon):
     self._layer_op = layer_op
     self._bn_op = bn_op
     self._output_tensor = output_tensor
@@ -830,7 +911,7 @@ class _BatchNormMatch(object):
     self._moving_variance_tensor = moving_variance_tensor
     self._bn_decay_mean_tensor = bn_decay_mean_tensor
     self._bn_decay_var_tensor = bn_decay_var_tensor
-    self._batch_epsilon_tensor = batch_epsilon_tensor
+    self._batch_epsilon = batch_epsilon
 
   @property
   def layer_op(self):
@@ -877,8 +958,8 @@ class _BatchNormMatch(object):
     return self._moving_variance_tensor
 
   @property
-  def batch_epsilon_tensor(self):
-    return self._batch_epsilon_tensor
+  def batch_epsilon(self):
+    return self._batch_epsilon
 
   @property
   def bn_decay_mean_tensor(self):
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index c90a18ab0357f1bcbc5d8ccd48edf894d7baf5f9..fa5e11b4708402a4fe76a494ed59e30835ed1363 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import saver as saver_lib
@@ -128,9 +129,97 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldConv2d(self):
     self._RunTestOverParameters(self._TestFoldConv2d)
 
+  def testMultipleLayerConv2d(self,
+                              relu=nn_ops.relu,
+                              relu_op_name='Relu',
+                              has_scaling=True,
+                              fused_batch_norm=False,
+                              freeze_batch_norm_delay=None):
+    """Tests folding cases for a network with multiple layers.
+
+    Args:
+      relu: Callable that returns an Operation, a factory method for the Relu*.
+      relu_op_name: String, name of the Relu* operation.
+      has_scaling: Bool, when true the batch norm has scaling.
+      fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
+    """
+    g = ops.Graph()
+    with g.as_default():
+      batch_size, height, width = 5, 128, 128
+      inputs = array_ops.zeros((batch_size, height, width, 3))
+      out_depth = 3
+      stride = 1
+      activation_fn = relu
+      scope = 'topnet/testnet'
+      with variable_scope.variable_scope(scope, [inputs]):
+        layer1 = conv2d(
+            inputs,
+            out_depth, [5, 5],
+            stride=stride,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            normalizer_fn=None,
+            scope='testnet/layer1')
+        # Add bn and relu with different scope
+        layer1 = batch_norm(
+            layer1, scale=has_scaling, fused=fused_batch_norm, scope='layer1')
+        layer1 = activation_fn(layer1)
+        layer2 = conv2d(
+            layer1,
+            2 * out_depth, [5, 5],
+            stride=stride,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=activation_fn,
+            normalizer_fn=batch_norm,
+            normalizer_params=self._BatchNormParams(
+                scale=has_scaling, fused=fused_batch_norm),
+            scope='testnet/layer2')
+        # Add bn and relu with different scope
+        layer2 = batch_norm(
+            layer2, scale=has_scaling, fused=fused_batch_norm, scope='layer2')
+        _ = activation_fn(layer2)
+
+      scope = 'topnet/testnet/testnet/layer2'
+
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
+    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
+    self.assertEqual(folded_mul.type, 'Mul')
+    self._AssertInputOpsAre(folded_mul, [
+        scope + '/correction_mult',
+        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
+    ])
+    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
+
+    folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold')
+    self.assertEqual(folded_conv.type, 'Conv2D')
+    # Remove :0 at end of name for tensor prior to comparison
+    self._AssertInputOpsAre(folded_conv,
+                            [scope + '/mul_fold', layer1.name[:-2]])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
+
+    folded_add = g.get_operation_by_name(scope + '/add_fold')
+    self.assertEqual(folded_add.type, 'Add')
+    self._AssertInputOpsAre(folded_add, [
+        scope + '/correction_add',
+        self._BathNormBiasName(scope, fused_batch_norm)
+    ])
+    output_op_names = [scope + '/' + relu_op_name]
+    self._AssertOutputGoesToOps(folded_add, g, output_op_names)
+
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass,
                                   has_scaling, fused_batch_norm,
                                   freeze_batch_norm_delay):
@@ -196,6 +285,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldConv2dUnknownShape(self):
     self._RunTestOverParameters(self._TestFoldConv2dUnknownShape)
 
@@ -260,6 +352,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldFullyConnectedLayer(self):
     self._RunTestOverParameters(self._TestFoldFullyConnectedLayer)
 
@@ -337,6 +432,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldDepthwiseConv2d(self):
     self._RunTestOverParameters(self._TestFoldDepthwiseConv2d)
 
diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index b458f039df0523b5b8b07cff7d14643154124b95..bacc707a3abb5539b3b119c1ebc17bd7b30efc5b 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -103,7 +103,7 @@ class OneofPattern(Pattern):
 class MatchResult(object):
   r"""Encapsulates the result of a match done by GraphMatcher.
 
-  MatchResult contains a map from OpTypePattern to the matching op and tensor.
+  MatchResult contains a map from Pattern to the matching op and tensor.
   When the matching op has multiple output tensors, the matching tensor is the
   output tensor used by the matching op of the parent pattern. E.g., when we
   match graph
@@ -138,7 +138,7 @@ class MatchResult(object):
       self._name_to_pattern[pattern.name] = pattern
 
   def _to_pattern(self, pattern_or_name):
-    if isinstance(pattern_or_name, OpTypePattern):
+    if isinstance(pattern_or_name, Pattern):
       return pattern_or_name
 
     if isinstance(pattern_or_name, str):
@@ -146,8 +146,8 @@ class MatchResult(object):
         return None
       return self._name_to_pattern[pattern_or_name]
 
-    raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.'
-                     % type(pattern_or_name))
+    raise ValueError('pattern_or_name has type %s. Expect Pattern or str.' %
+                     type(pattern_or_name))
 
   def _get_op_tensor(self, pattern_or_name):
     pattern = self._to_pattern(pattern_or_name)
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 0a8e35080cb08f71dc28e33c6138a12656e5a5ea..5c0e17dc8646ce7850e26ffaa80c0201cea456af 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -51,7 +51,6 @@ def LastValueQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
                       init_max=6.0,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='LastValueQuant',
                       reuse=None,
@@ -69,8 +68,6 @@ def LastValueQuantize(inputs,
       quantization ranges per output channel.
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -133,7 +130,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -146,7 +142,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
@@ -163,7 +158,6 @@ def MovingAvgQuantize(inputs,
                       init_min=-6.0,
                       init_max=6.0,
                       ema_decay=0.999,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='MovingAvgQuantize',
                       reuse=None,
@@ -182,8 +176,6 @@ def MovingAvgQuantize(inputs,
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
     ema_decay: EMA decay parameter.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -246,7 +238,6 @@ def MovingAvgQuantize(inputs,
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = moving_averages.assign_moving_average(
         min_var, batch_min, ema_decay, name='AssignMinEma')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -260,7 +251,6 @@ def MovingAvgQuantize(inputs,
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = moving_averages.assign_moving_average(
         max_var, batch_max, ema_decay, name='AssignMaxEma')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
@@ -282,8 +272,8 @@ def _FakeQuantWithMinMaxVars(inputs, min_var, max_var, per_channel, num_bits,
   Args:
     inputs: a tensor containing values to be quantized.
     min_var: a variable containing quantization range lower end(s).
-    max_var: a variable containing quantization range lupper end(s).
-    per_channel: a boolean specifying whether to use per-channel quantizatioh.
+    max_var: a variable containing quantization range upper end(s).
+    per_channel: a boolean specifying whether to use per-channel quantization.
     num_bits: Number of bits to use for quantization, must be between 2 and 8.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 7a3f92f503a5d6f2b0fab2a499f8e8758809d0ed..efc1a94b3c6e34cfd9f45e57af0be3faf4490866 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
 
 # Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
@@ -34,10 +35,6 @@ _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
 # Activations that are supported by the quantization rewrite.
 _ACTIVATION_TYPES = {'Relu', 'Relu6', 'Identity'}
 
-# Weight types that are supported by the quantization rewrite.
-# TODO(suharshs): Add support for ResourceVariable.
-_WEIGHT_TYPES = {'Variable', 'VariableV2'}
-
 
 def Quantize(graph,
              is_training,
@@ -45,9 +42,16 @@ def Quantize(graph,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
-             vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES):
+             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
+             scope=None):
   """Updates graph with quantization operations.
 
+  Currently we quantize the following tensors:
+  * Conv/MatMul: Quantize the weights if it matches.
+  * Activation: Quantize the output if it matches.
+  * Bypass/Post-activation Bypass: Quantize both input and output
+    if it matches.
+
   Args:
     graph: Graph to modify.
     is_training: Whether quantizing training graph or eval graph.
@@ -61,13 +65,21 @@ def Quantize(graph,
       training.
     vars_collection: (Optional) Collection where to store the variables for
       quantization interval ends.
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
   Raises:
     ValueError: When quantization fails.
   """
+  if scope and not scope.endswith('/'):
+    scope += '/'
+
   input_to_ops_map = input_to_ops.InputToOps(graph)
   for layer_match in _FindLayersToQuantize(graph):
     # Quantize the weights.
     context = _GetContextFromOp(layer_match.layer_op)
+
+    # If `scope` is given, only quantize it if the consumer of weights
+    # (the layer op) is in the right scope.
     _InsertQuantOp(
         context,
         'weights_quant',
@@ -78,7 +90,8 @@ def Quantize(graph,
         quant_delay=quant_delay,
         narrow_range=True,
         vars_collection=vars_collection,
-        bits=weight_bits)
+        bits=weight_bits,
+        consumer_scope=scope)
 
     # Quantize the activations.
     consumer_ops = input_to_ops_map.ConsumerOperations(
@@ -86,6 +99,9 @@ def Quantize(graph,
     add_context = context
     if layer_match.bypass_op:
       add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
+
+    # If `scope` is given, only quantize it if the producer of weights
+    # (usually it's the layer op) is in the right scope.
     _InsertQuantOp(
         add_context,
         'act_quant',
@@ -97,11 +113,14 @@ def Quantize(graph,
         quant_delay=quant_delay,
         vars_collection=vars_collection,
         bits=activation_bits,
-        init_min=0.0)
+        init_min=0.0,
+        producer_scope=scope)
 
     # Quantize the inputs and output to the bypass (if it exists). The input to
     # the bypass is the bias add, and the output is the activation.
     if layer_match.bypass_op is not None:
+      # If `scope` is given, only quantize it if the both the producer and the
+      # consumer are in the right scope.
       _InsertQuantOp(
           context,
           'conv_quant',
@@ -111,34 +130,97 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
-      _InsertQuantOp(
-          add_context,
-          'add_quant',
-          layer_match.bypass_op,
-          input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
-          is_training,
-          moving_avg=True,
-          ema_decay=ema_decay,
-          quant_delay=quant_delay,
-          vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
+      # Make sure the op following this isn't an activation. In which case, we
+      # shouldn't quantize it, since the activation will be Fused into the
+      # Add at inference time.
+      consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op)
+      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+        logging.info('Skipping %s, because its followed by an activation.',
+                     layer_match.bypass_op.name)
+      else:
+        _InsertQuantOp(
+            add_context,
+            'add_quant',
+            layer_match.bypass_op,
+            input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
+            is_training,
+            moving_avg=True,
+            ema_decay=ema_decay,
+            quant_delay=quant_delay,
+            vars_collection=vars_collection,
+            bits=activation_bits,
+            producer_scope=scope,
+            consumer_scope=scope)
+
+    # Quantize bypass ops that occur after the activation.
+    if layer_match.post_activation_bypass_op is not None:
+      post_activation_bypass_context = re.search(
+          r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
+      # If `scope` is given, only quantize it if the producer is in the right
+      # scope.
+      # Make sure the op following this isn't an activation. In which case, we
+      # shouldn't quantize it, since the activation will be Fused into the
+      # Add at inference time.
+      consumers = input_to_ops_map.ConsumerOperations(
+          layer_match.post_activation_bypass_op)
+      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+        logging.info('Skipping %s, because its followed by an activation.',
+                     layer_match.post_activation_bypass_op.name)
+      else:
+        _InsertQuantOp(
+            post_activation_bypass_context,
+            'post_activation_bypass_quant',
+            layer_match.post_activation_bypass_op,
+            consumers,
+            is_training,
+            moving_avg=True,
+            ema_decay=ema_decay,
+            quant_delay=quant_delay,
+            vars_collection=vars_collection,
+            bits=activation_bits,
+            producer_scope=scope)
 
 
 def _FindLayersToQuantize(graph):
   """Matches layers in graph to quantize.
 
+  The following patterns get matched. Nodes surrounded by [] will be
+  optionally matched:
+
+          weight|folded_weight
+                /
+         conv|fc
+            |
+    [post_conv_correction]
+            |
+     biasadd|folded_bias
+            |
+         [bypass]
+            |
+        activation
+            |
+   [post_activation_bypass]
+
+  Match replacements:
+    If weight|folded_weight is found, FakeQuant is added afterwards.
+    If bypass is found, FakeQuant is added before and after.
+    If activation is found, FakeQuant is added afterwards.
+    If post_activation_bypass is found, FakeQuant is added afterwards.
+
   Args:
     graph: Graph to perform match on.
 
-  Yields:
-    _LayerMatches.
+  Returns:
+    list of _LayerMatches.
   """
   input_pattern = graph_matcher.OpTypePattern('*')
-  weight_var_pattern = graph_matcher.OpTypePattern('|'.join(_WEIGHT_TYPES))
-  weight_pattern = graph_matcher.OpTypePattern(
+  weight_var_pattern = graph_matcher.OpTypePattern('Variable|VariableV2')
+  weight_identity_pattern = graph_matcher.OpTypePattern(
       'Identity', inputs=[weight_var_pattern])
-
+  weight_resource_var_pattern = graph_matcher.OpTypePattern('ReadVariableOp')
   folded_weight_pattern = graph_matcher.OpTypePattern('Mul')
 
   # The weights inputs to the layer operation can either be from the Variable or
@@ -147,7 +229,10 @@ def _FindLayersToQuantize(graph):
       '|'.join(_QUANTIZABLE_TYPES),
       inputs=[
           input_pattern,
-          graph_matcher.OneofPattern([weight_pattern, folded_weight_pattern])
+          graph_matcher.OneofPattern([
+              weight_identity_pattern, weight_resource_var_pattern,
+              folded_weight_pattern
+          ])
       ])
 
   folded_bias_mul_pattern = graph_matcher.OpTypePattern(
@@ -180,7 +265,7 @@ def _FindLayersToQuantize(graph):
               [bias_add_pattern, folded_bias_add_pattern])
       ])
 
-  # The input to the activation can come from bias add, fold bias add or the
+  # The input to the activation can come from bias add, fold bias add, the
   # bypasses.
   activation_pattern = graph_matcher.OpTypePattern(
       '|'.join(_ACTIVATION_TYPES),
@@ -191,10 +276,62 @@ def _FindLayersToQuantize(graph):
           ])
       ])
 
+  post_activation_bypass_pattern_a = graph_matcher.OpTypePattern(
+      'Add', inputs=['*', activation_pattern])
+  post_activation_bypass_pattern_b = graph_matcher.OpTypePattern(
+      'Add', inputs=[activation_pattern, '*'])
+
+  # The order of the following matching blocks is very important. Since matches
+  # aren't guaranteed to be disjoint, we structure matches from largest to
+  # smallest to guarantee that the largest match always wins. Additionally, we
+  # ensure that we don't match layers multiple times.
+
+  layer_matches = []
+  # We use matched_layer_set to ensure that layers aren't matched multiple
+  # times.
+  matched_layer_set = set()
+
+  # First, we match layers that have a post activation bypass. We do this first
+  # to ensure we don't match only the first part of this layer, missing the
+  # post activation bypass node.
+  post_activation_bypass_layer_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern([
+          post_activation_bypass_pattern_a,
+          post_activation_bypass_pattern_b,
+      ]))
+  for match_result in post_activation_bypass_layer_matcher.match_graph(graph):
+    layer_op = match_result.get_op(layer_pattern)
+    weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(folded_weight_pattern)
+    activation_op = match_result.get_op(activation_pattern)
+    bias_add_op = match_result.get_op(bias_add_pattern)
+    if bias_add_op is None:
+      bias_add_op = match_result.get_op(folded_bias_add_pattern)
+    bypass_op = match_result.get_op(bypass_pattern_a)
+    if bypass_op is None:
+      bypass_op = match_result.get_op(bypass_pattern_b)
+    post_activation_bypass_op = match_result.get_op(
+        post_activation_bypass_pattern_a)
+    if post_activation_bypass_op is None:
+      post_activation_bypass_op = match_result.get_op(
+          post_activation_bypass_pattern_b)
+    if layer_op not in matched_layer_set:
+      matched_layer_set.add(layer_op)
+      layer_matches.append(
+          _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op,
+                      post_activation_bypass_op, bias_add_op))
+
+  # Now, we match the basic layer ending at an activation. We may get duplicate
+  # matches from above, but we don't add them to layer_matches.
   layer_matcher = graph_matcher.GraphMatcher(activation_pattern)
   for match_result in layer_matcher.match_graph(graph):
     layer_op = match_result.get_op(layer_pattern)
-    weight_tensor = match_result.get_tensor(weight_pattern)
+    weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
     if weight_tensor is None:
       weight_tensor = match_result.get_tensor(folded_weight_pattern)
     activation_op = match_result.get_op(activation_pattern)
@@ -204,19 +341,54 @@ def _FindLayersToQuantize(graph):
     bypass_op = match_result.get_op(bypass_pattern_a)
     if bypass_op is None:
       bypass_op = match_result.get_op(bypass_pattern_b)
-    yield _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op,
-                      bias_add_op)
+    if layer_op not in matched_layer_set:
+      matched_layer_set.add(layer_op)
+      layer_matches.append(
+          _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op, None,
+                      bias_add_op))
+
+  # Match the final layer, where there may not be an activation and instead
+  # the output of the final BiasAdd must be quantized. So we treat the BiasAdd
+  # as the 'activation_op' in the _LayerMatch, to ensure that it's output is
+  # quantized.
+  final_layer_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern([bias_add_pattern, folded_bias_add_pattern]))
+  for match_result in final_layer_matcher.match_graph(graph):
+    layer_op = match_result.get_op(layer_pattern)
+    weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(folded_weight_pattern)
+    activation_op = match_result.get_op(bias_add_pattern)
+    if activation_op is None:
+      activation_op = match_result.get_op(folded_bias_add_pattern)
+    if layer_op not in matched_layer_set:
+      matched_layer_set.add(layer_op)
+      layer_matches.append(
+          _LayerMatch(layer_op, weight_tensor, activation_op, None, None, None))
+
+  return layer_matches
+
+
+def _HasPostActivationBypass(activation_op):
+  for activation_tensor in activation_op.outputs:
+    for output_op in activation_tensor.consumers():
+      if output_op.type == 'Add':
+        return True
+  return False
 
 
 class _LayerMatch(object):
   """Contains all information related to a matched Layer."""
 
   def __init__(self, layer_op, weight_tensor, activation_op, bypass_op,
-               bias_add_op):
+               post_activation_bypass_op, bias_add_op):
     self._layer_op = layer_op
     self._weight_tensor = weight_tensor
     self._activation_op = activation_op
     self._bypass_op = bypass_op
+    self._post_activation_bypass_op = post_activation_bypass_op
     self._bias_add_op = bias_add_op
 
   @property
@@ -235,6 +407,10 @@ class _LayerMatch(object):
   def bypass_op(self):
     return self._bypass_op
 
+  @property
+  def post_activation_bypass_op(self):
+    return self._post_activation_bypass_op
+
   @property
   def bias_add_op(self):
     return self._bias_add_op
@@ -251,12 +427,14 @@ def _InsertQuantOp(context,
                    bits=8,
                    ema_decay=0.999,
                    quant_delay=None,
-                   vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-                   narrow_range=False):
+                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
+                   narrow_range=False,
+                   producer_scope=None,
+                   consumer_scope=None):
   """Inserts a quant op between a producer op and (multiple) consumer ops.
 
   Args:
-    context: Context w,here producer and consumer operations are nested.
+    context: Context where producer and consumer operations are nested.
     name: Name for the new quantization op within the context.
     producer: Producer operation of the pairs where quantization will be
       inserted.
@@ -277,12 +455,54 @@ def _InsertQuantOp(context,
       quantization interval ends.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^bits - 1] or wide range [0; 2^bits - 1].
+    producer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when the producer is in this scope.
+    consumer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when all the consumers are in this scope.
   Raises:
     ValueError: When producer operation is not directly connected to the
       consumer operation.
   """
+  if producer_scope and not producer.name.startswith(producer_scope):
+    logging.info(
+        '_InsertQuantOp ignores context="%s" name="%s" '
+        'because producer "%s" is not in scope "%s"',
+        context, name, producer.name, producer_scope)
+    return
+
+  if consumer_scope:
+    consumers_in_scope = []
+    for consumer in consumers:
+      if consumer.name.startswith(consumer_scope):
+        consumers_in_scope.append(consumer)
+      else:
+        logging.info(
+            '_InsertQuantOp context="%s" name="%s" ignores '
+            'consumer "%s" because it is not in scope "%s"',
+            context, name, consumer.name, consumer_scope)
+        return
+    consumers = consumers_in_scope
+
   name_prefix = _AddContextToName(context, name)
+  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
+  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
+  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
+  # breaks things later.
+  name_scope = ops.get_name_scope()
+  if name_scope:
+    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')
+
   inputs = producer.outputs[0]
+  # Prevent ops from being quantized multiple times. Bypass ops can sometimes
+  # overlap between multiple matches, so we need to ensure that we don't
+  # add duplicate FakeQuant operations.
+  fake_quant_ops = set([
+      'FakeQuantWithMinMaxVars',
+      'FakeQuantWithMinMaxArgs'
+  ])
+  if fake_quant_ops.intersection(set([c.type for c in inputs.consumers()])):
+    return
+
   if moving_avg:
     quant = (
         quant_ops.MovingAvgQuantize(
@@ -318,11 +538,16 @@ def _InsertQuantOp(context,
         lambda: inputs,
         name=name_prefix + '/delayed_quant')
 
-  nodes_modified_count = graph_editor.reroute_ts(
-      [quant], [inputs], can_modify=consumers)
-  if nodes_modified_count != len(consumers):
-    raise ValueError('Some inputs not quantized for ops: [%s]' % ', '.join(
-        [consumer.name for consumer in consumers]))
+  if consumers:
+    tensors_modified_count = graph_editor.reroute_ts(
+        [quant], [inputs], can_modify=consumers)
+    # Some operations can have multiple output tensors going to the same
+    # consumer. Since consumers is a set, we need to ensure that
+    # tensors_modified_count is greater than or equal to the length of the set
+    # of consumers.
+    if tensors_modified_count < len(consumers):
+      raise ValueError('No inputs quantized for ops: [%s]' % ', '.join(
+          [consumer.name for consumer in consumers]))
 
 
 def _GetContextFromOp(op):
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 5a3a74cec4864ad3808d485849334c81f569d300..11d052d7f491dc029d1bda9b47364d6e9c880a67 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -28,7 +28,8 @@ def _create_graph(input_graph=None,
                   weight_bits=8,
                   activation_bits=8,
                   quant_delay=None,
-                  freeze_bn_delay=None):
+                  freeze_bn_delay=None,
+                  scope=None):
   """Rewrites an input_graph in place for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
@@ -48,6 +49,8 @@ def _create_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to the number of steps when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -66,12 +69,15 @@ def _create_graph(input_graph=None,
         is_training,
         quant_delay=quant_delay,
         weight_bits=weight_bits,
-        activation_bits=activation_bits)
+        activation_bits=activation_bits,
+        scope=scope)
 
 
 def create_training_graph(input_graph=None, quant_delay=0):
   """Rewrites a training input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
   the expected behavior of previously held references to nodes and tensors may
@@ -97,16 +103,7 @@ def create_training_graph(input_graph=None, quant_delay=0):
   # TODO(raghuramank) Need to have freeze_bn_delay be a function of batch size
   # Currently the values below are hardcoded for mobilenetV1 on imagenet
   # Please use the experimental API if you need to tune these values.
-  if quant_delay == 0:
-    # Corresponds to case of restoring from a floating point checkpoint
-    # In this case, we can freeze the moving mean and variance early on and
-    # switch to using them during training. Therefore, freeze_bn_delay is set to
-    # 2e5.
-    freeze_bn_delay = int(2e5)
-  else:
-    # If training from scratch, set freeze_bn_delay to 100 epochs after quant
-    # delay. With a batch size of 64, this corresponds to 20000*100=2M steps.
-    freeze_bn_delay = quant_delay + int(2e6)
+  freeze_bn_delay = None
 
   _create_graph(
       input_graph=input_graph,
@@ -118,6 +115,8 @@ def create_training_graph(input_graph=None, quant_delay=0):
 def create_eval_graph(input_graph=None):
   """Rewrites an eval input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
   the expected behavior of previously held references to nodes and tensors may
@@ -138,9 +137,12 @@ def experimental_create_training_graph(input_graph=None,
                                        weight_bits=8,
                                        activation_bits=8,
                                        quant_delay=0,
-                                       freeze_bn_delay=int(2e5)):
+                                       freeze_bn_delay=None,
+                                       scope=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   This function has additional experimental options not (yet) available to
   create_training_graph. The resulting behavior may be undefined.
 
@@ -158,7 +160,7 @@ def experimental_create_training_graph(input_graph=None,
   often fail.
 
   Args:
-    input_graph: The tf.Graph to be transformed,if None then defaults to the
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
@@ -168,6 +170,8 @@ def experimental_create_training_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -180,14 +184,18 @@ def experimental_create_training_graph(input_graph=None,
       weight_bits=weight_bits,
       activation_bits=activation_bits,
       quant_delay=quant_delay,
-      freeze_bn_delay=freeze_bn_delay)
+      freeze_bn_delay=freeze_bn_delay,
+      scope=scope)
 
 
 def experimental_create_eval_graph(input_graph=None,
                                    weight_bits=8,
-                                   activation_bits=8):
+                                   activation_bits=8,
+                                   scope=None):
   """Rewrites an eval input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   This function has additional experimental options not (yet) available to
   create_eval_graph. The resulting behavior may be undefined.
 
@@ -201,8 +209,8 @@ def experimental_create_eval_graph(input_graph=None,
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
-
-
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -212,4 +220,5 @@ def experimental_create_eval_graph(input_graph=None,
       input_graph=input_graph,
       is_training=False,
       weight_bits=weight_bits,
-      activation_bits=activation_bits)
+      activation_bits=activation_bits,
+      scope=scope)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 6b9289ef5f4b847172e1f093a1e4b5b2d3bdab57..54faf582f15a26c12813f3fdffe2dda6aa5cc91f 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -66,6 +66,20 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     for fn in rewrite_fns:
       test_fn(fn)
 
+  def _RunTestOverExperimentalRewritesWithScope(self, test_fn, scope):
+    def with_absent_scope(fn):
+      def fn_with_absent_scope(*args):
+        fn(*args, scope=scope)
+      return fn_with_absent_scope
+    rewrite_fns = [
+        with_absent_scope(
+            quantize_graph.experimental_create_training_graph),
+        with_absent_scope(
+            quantize_graph.experimental_create_eval_graph),
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
   def testRewrite(self):
     self._RunTestOverAllRewrites(self._TestRewrite)
 
@@ -99,6 +113,20 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       # Ensure that variables were added.
       self.assertTrue(len(orig_variable_names) < len(q_variables))
 
+  def testWithPostActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPostActivationBypass)
+
+  def _TestWithPostActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(post_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(any(
+          'scope1/post_activation_bypass_quant/' in name for name in op_names))
+
   def testQuantDelay(self):
     self._RunTestOverTrainingRewrites(self._TestQuantDelay)
 
@@ -211,20 +239,79 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       self.assertFalse(any(s in op.name for s in update_names))
     self.assertTrue(quant_found)
 
-  def _ConvLayer(self):
+  def testIdempotent(self):
+    self._RunTestOverAllRewrites(self._TestIdempotent)
+
+  def _TestIdempotent(self, rewrite_fn):
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      rewrite_fn()
+      graph_def_before = str(g.as_graph_def())
+      # Ensuring that calling the rewrite again doesn't add more nodes.
+      rewrite_fn()
+      graph_def_after = str(g.as_graph_def())
+      self.assertEqual(graph_def_before, graph_def_after)
+
+  def testRewriteWithScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithScope, 'scope1')
+
+  def _TestRewriteWithScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      scope1_output = self._ConvLayer(scope='scope1')
+      self._ConvLayer(input_tensor=scope1_output, scope='scope2')
+
+    rewrite_fn(graph)
+
+    op_names = [op.name for op in graph.get_operations()]
+    # The weights and activation of scope1 is quantized, but not scope2.
+    self.assertTrue(
+        any('scope1/Conv/act_quant' in name for name in op_names))
+    self.assertTrue(
+        any('scope1/Conv/weights_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/act_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/weights_quant' in name for name in op_names))
+
+  def testRewriteWithNonMatchingScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithNonMatchingScope, 'NonExistingScope')
+
+  def _TestRewriteWithNonMatchingScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      self._ConvLayer()
+
+    op_names_before_rewrite = set([op.name for op in graph.get_operations()])
+    rewrite_fn(graph)
+    op_names_after_rewrite = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_rewrite, op_names_after_rewrite)
+
+  def _ConvLayer(
+      self, input_tensor=None, scope='test', pre_activation_bypass=False,
+      post_activation_bypass=False):
     """Add a basic convolution layer to the default graph."""
     batch_size, height, width, depth = 5, 128, 128, 3
-    inputs = array_ops.zeros((batch_size, height, width, depth))
+    if input_tensor is None:
+      input_tensor = array_ops.zeros((batch_size, height, width, depth))
     weight_init = init_ops.truncated_normal_initializer
-    conv = layers.conv2d(
-        inputs,
-        32, [5, 5],
-        stride=2,
-        padding='SAME',
-        weights_initializer=weight_init(0.09),
-        activation_fn=None,
-        scope='test')
-    _ = nn_ops.relu6(conv)
+    with ops.name_scope(scope):
+      output = layers.conv2d(
+          input_tensor,
+          depth, [5, 5],
+          padding='SAME',
+          weights_initializer=weight_init(0.09),
+          activation_fn=None)
+      if pre_activation_bypass:
+        output += input_tensor
+      output = nn_ops.relu6(output)
+      if post_activation_bypass:
+        output += input_tensor
+    return output
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index 639a7454a92aebd7289c59498cebff82cc003f75..db745aa56212af6a9c20e06ee9e4e5d6e27cf3c3 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import googletest
 
 batch_norm = layers.batch_norm
@@ -56,52 +57,46 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         (array_ops.identity, 'Identity', True, 5000),
     ]
     for params in parameters_list:
-      test_fn(params[0], params[1], params[2], params[3])
+      # Test everything with resource variables and normal variables.
+      test_fn(params[0], params[1], params[2], params[3], False)
+      test_fn(params[0], params[1], params[2], params[3], True)
 
-  def _TestQuantize_Conv2dWithoutBatchNorm(self, activation, activation_op_name,
-                                           with_bypass, delay):
-    """Tests quantization: inputs -> Conv2d no batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-    """
-    graph = ops.Graph()
-    with graph.as_default():
-      batch_size, height, width, depth = 5, 128, 128, 3
-      inputs = array_ops.zeros((batch_size, height, width, depth))
-      stride = 1 if with_bypass else 2
-      out_depth = 3 if with_bypass else 32
-      activation_fn = None if with_bypass else activation
-      scope = 'test/test2' if with_bypass else 'test'
-      node = conv2d(inputs, out_depth, [5, 5], stride=stride, padding='SAME',
-                    weights_initializer=self._WeightInit(0.09),
-                    activation_fn=activation_fn, scope=scope)
-      if with_bypass:
-        node = math_ops.add(inputs, node, name='test/Add')
-        node = activation(node, name='test/' + activation_op_name)
-      update_barrier = control_flow_ops.no_op(name='update_barrier')
-      with ops.control_dependencies([update_barrier]):
-        array_ops.identity(node, name='control_dependency')
-
-      quantize.Quantize(graph, True, quant_delay=delay)
+  def _AssertCorrectQuantizedGraphWithoutBatchNorm(
+      self, graph, scope, layer, activation_op_name, with_bypass, delay,
+      use_resource):
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
-    expected_inputs = [
-        scope + '/weights_quant/AssignMinLast',
-        scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
-    ]
+
+    # Assemble the expected inputs.
+    if use_resource:
+      expected_inputs = [
+          scope + '/weights_quant/FakeQuantWithMinMaxVars/ReadVariableOp',
+          scope + '/weights_quant/FakeQuantWithMinMaxVars/ReadVariableOp_1',
+      ]
+      if layer == 'DepthwiseConv2dNative':
+        expected_inputs.append(scope + '/depthwise/ReadVariableOp')
+      else:
+        expected_inputs.append(scope + '/' + layer + '/ReadVariableOp')
+    else:
+      expected_inputs = [
+          scope + '/weights_quant/AssignMinLast',
+          scope + '/weights_quant/AssignMaxLast',
+      ]
+      if layer == 'DepthwiseConv2dNative':
+        expected_inputs.append(scope + '/depthwise_weights/read')
+      else:
+        expected_inputs.append(scope + '/weights/read')
+
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     if delay and delay > 0:
       output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
     else:
-      output_op_name = scope + '/Conv2D'
+      if layer == 'DepthwiseConv2dNative':
+        output_op_name = scope + '/depthwise'
+      else:
+        output_op_name = scope + '/' + layer
 
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
@@ -109,10 +104,17 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       conv_quant = graph.get_operation_by_name(scope + '/conv_quant/' +
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
-      expected_inputs = [
-          scope + '/conv_quant/AssignMinEma',
-          scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
-      ]
+      if use_resource:
+        expected_inputs = [
+            scope + '/conv_quant/FakeQuantWithMinMaxVars/ReadVariableOp',
+            scope + '/conv_quant/FakeQuantWithMinMaxVars/ReadVariableOp_1',
+            scope + '/BiasAdd',
+        ]
+      else:
+        expected_inputs = [
+            scope + '/conv_quant/AssignMinEma',
+            scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
+        ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
                         if delay else 'test/Add')
@@ -121,22 +123,76 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     act_quant = graph.get_operation_by_name('test/act_quant/' +
                                             quantization_node_name)
     self.assertEqual(act_quant.type, quantization_node_name)
-
-    expected_inputs = [
-        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
-        'test/' + activation_op_name
-    ]
+    if use_resource:
+      expected_inputs = [
+          'test/act_quant/FakeQuantWithMinMaxVars/ReadVariableOp',
+          'test/act_quant/FakeQuantWithMinMaxVars/ReadVariableOp_1',
+          'test/' + activation_op_name,
+      ]
+    else:
+      expected_inputs = [
+          'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+          'test/' + activation_op_name
+      ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
     output_op_name = ('test/act_quant/delayed_quant/Switch_1'
                       if delay else 'control_dependency')
     self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
+    self._AssertIdempotent(graph)
 
   def testQuantize_Conv2dWithoutBatchNorm(self):
     self._RunWithoutBatchNormTestOverParameters(
         self._TestQuantize_Conv2dWithoutBatchNorm)
 
+  def _TestQuantize_Conv2dWithoutBatchNorm(self, activation, activation_op_name,
+                                           with_bypass, delay, use_resource):
+    """Tests quantization: inputs -> Conv2d no batch norm -> Activation.
+
+    Args:
+      activation: Callable that returns an Operation, a factory method for the
+        Activation.
+      activation_op_name: String, name of the Activation operation.
+      with_bypass: Bool, when true there is an extra connection added from
+        inputs to just before Activation.
+      delay: Int (optional), delay in number of steps until quantization starts.
+      use_resource: Bool, when true uses resource variables.
+    """
+    graph = ops.Graph()
+    with graph.as_default():
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
+      batch_size, height, width, depth = 5, 128, 128, 3
+      inputs = array_ops.zeros((batch_size, height, width, depth))
+      stride = 1 if with_bypass else 2
+      out_depth = 3 if with_bypass else 32
+      activation_fn = None if with_bypass else activation
+      scope = 'test/test2' if with_bypass else 'test'
+      node = conv2d(
+          inputs,
+          out_depth, [5, 5],
+          stride=stride,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=activation_fn,
+          scope=scope)
+      if with_bypass:
+        node = math_ops.add(inputs, node, name='test/Add')
+        node = activation(node, name='test/' + activation_op_name)
+      update_barrier = control_flow_ops.no_op(name='update_barrier')
+      with ops.control_dependencies([update_barrier]):
+        array_ops.identity(node, name='control_dependency')
+
+      quantize.Quantize(graph, True, quant_delay=delay)
+
+    self._AssertCorrectQuantizedGraphWithoutBatchNorm(
+        graph, scope, 'Conv2D', activation_op_name, with_bypass, delay,
+        use_resource)
+
+  def testQuantize_FCWithoutBatchNorm(self):
+    self._RunWithoutBatchNormTestOverParameters(
+        self._TestQuantize_FCWithoutBatchNorm)
+
   def _TestQuantize_FCWithoutBatchNorm(self, activation, activation_op_name,
-                                       with_bypass, delay):
+                                       with_bypass, delay, use_resource):
     """Tests quantization: inputs -> FC no batch norm -> Activation.
 
     Args:
@@ -146,72 +202,40 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with_bypass: Bool, when true there is an extra connection added from
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
+      use_resource: Bool, when true uses resource variables.
     """
     graph = ops.Graph()
     with graph.as_default():
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
       batch_size, depth = 5, 256
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
       activation_fn = None if with_bypass else activation
       scope = 'test/test2' if with_bypass else 'test'
-      node = fully_connected(inputs, out_depth,
-                             weights_initializer=self._WeightInit(0.03),
-                             activation_fn=activation_fn, scope=scope)
+      node = fully_connected(
+          inputs,
+          out_depth,
+          weights_initializer=self._WeightInit(0.03),
+          activation_fn=activation_fn,
+          scope=scope)
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
         node = activation(node, name='test/' + activation_op_name)
       update_barrier = control_flow_ops.no_op(name='update_barrier')
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
-
       quantize.Quantize(graph, True, quant_delay=delay)
 
-    quantization_node_name = 'FakeQuantWithMinMaxVars'
-    weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
-                                                quantization_node_name)
-    self.assertEqual(weights_quant.type, quantization_node_name)
-    expected_inputs = [
-        scope + '/weights_quant/AssignMinLast',
-        scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
-    ]
-    self._AssertInputOpsAre(weights_quant, expected_inputs)
-    if delay and delay > 0:
-      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
-    else:
-      output_op_name = scope + '/MatMul'
-    self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
-
-    if with_bypass:
-      conv_quant = graph.get_operation_by_name(scope + '/conv_quant/' +
-                                               quantization_node_name)
-      self.assertEqual(conv_quant.type, quantization_node_name)
-      expected_inputs = [
-          scope + '/conv_quant/AssignMinEma',
-          scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
-      ]
-      self._AssertInputOpsAre(conv_quant, expected_inputs)
-      output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
-                        if delay else 'test/Add')
-      self._AssertOutputGoesToOps(conv_quant, graph, [output_op_name])
-
-    act_quant = graph.get_operation_by_name('test/act_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(act_quant.type, quantization_node_name)
-    expected_inputs = [
-        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
-        'test/' + activation_op_name
-    ]
-    self._AssertInputOpsAre(act_quant, expected_inputs)
-    output_op_name = ('test/act_quant/delayed_quant/Switch_1'
-                      if delay else 'control_dependency')
-    self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
+    self._AssertCorrectQuantizedGraphWithoutBatchNorm(
+        graph, scope, 'MatMul', activation_op_name, with_bypass, delay,
+        use_resource)
 
-  def testQuantize_FCWithoutBatchNorm(self):
+  def testQuantize_DepthwiseConv2dWithoutBatchNorm(self):
     self._RunWithoutBatchNormTestOverParameters(
-        self._TestQuantize_FCWithoutBatchNorm)
+        self._TestQuantize_DepthwiseConv2dWithoutBatchNorm)
 
   def _TestQuantize_DepthwiseConv2dWithoutBatchNorm(
-      self, activation, activation_op_name, with_bypass, delay):
+      self, activation, activation_op_name, with_bypass, delay, use_resource):
     """Tests quantization: inputs -> DWConv2d no batch norm -> Activation.
 
     Args:
@@ -221,71 +245,36 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with_bypass: Bool, when true there is an extra connection added from
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
+      use_resource: Bool, when true uses resource variables.
     """
     graph = ops.Graph()
     with graph.as_default():
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
       activation_fn = None if with_bypass else activation
       scope = 'test/test2' if with_bypass else 'test'
-      node = separable_conv2d(inputs, None, [5, 5], stride=stride,
-                              depth_multiplier=1.0, padding='SAME',
-                              weights_initializer=self._WeightInit(0.09),
-                              activation_fn=activation_fn, scope=scope)
+      node = separable_conv2d(
+          inputs,
+          None, [5, 5],
+          stride=stride,
+          depth_multiplier=1.0,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=activation_fn,
+          scope=scope)
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
         node = activation(node, name='test/' + activation_op_name)
       update_barrier = control_flow_ops.no_op(name='update_barrier')
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
-
       quantize.Quantize(graph, True, quant_delay=delay)
 
-    quantization_node_name = 'FakeQuantWithMinMaxVars'
-    weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
-                                                quantization_node_name)
-    self.assertEqual(weights_quant.type, quantization_node_name)
-    expected_inputs = [
-        scope + '/weights_quant/AssignMinLast',
-        scope + '/weights_quant/AssignMaxLast',
-        scope + '/depthwise_weights/read'
-    ]
-    self._AssertInputOpsAre(weights_quant, expected_inputs)
-    if delay and delay > 0:
-      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
-    else:
-      output_op_name = scope + '/depthwise'
-    self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
-
-    if with_bypass:
-      conv_quant = graph.get_operation_by_name(scope + '/conv_quant/' +
-                                               quantization_node_name)
-      self.assertEqual(conv_quant.type, quantization_node_name)
-      expected_inputs = [
-          scope + '/conv_quant/AssignMinEma',
-          scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
-      ]
-      self._AssertInputOpsAre(conv_quant, expected_inputs)
-      output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
-                        if delay else 'test/Add')
-      self._AssertOutputGoesToOps(conv_quant, graph, [output_op_name])
-
-    act_quant = graph.get_operation_by_name('test/act_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(act_quant.type, quantization_node_name)
-    expected_inputs = [
-        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
-        'test/' + activation_op_name
-    ]
-    self._AssertInputOpsAre(act_quant, expected_inputs)
-    output_op_name = ('test/act_quant/delayed_quant/Switch_1'
-                      if delay else 'control_dependency')
-    self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
-
-  def testQuantize_DepthwiseConv2dWithoutBatchNorm(self):
-    self._RunWithoutBatchNormTestOverParameters(
-        self._TestQuantize_DepthwiseConv2dWithoutBatchNorm)
+    self._AssertCorrectQuantizedGraphWithoutBatchNorm(
+        graph, scope, 'DepthwiseConv2dNative', activation_op_name, with_bypass,
+        delay, use_resource)
 
   def _RunBatchNormTestOverParameters(self, test_fn):
     # TODO(suharshs): Use parameterized test once OSS TF supports it.
@@ -317,13 +306,88 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         (array_ops.identity, 'Identity', True, 5000, True)
     ]
     for params in parameters_list:
-      test_fn(params[0], params[1], params[2], params[3], params[4])
+      # Test everything with resource variables and normal variables.
+      test_fn(params[0], params[1], params[2], params[3], params[4], False)
+      test_fn(params[0], params[1], params[2], params[3], params[4], True)
+
+  def _AssertCorrectQuantizedGraphWithBatchNorm(self, graph, scope, layer,
+                                                activation_op_name, with_bypass,
+                                                delay, use_resource):
+    quantization_node_name = 'FakeQuantWithMinMaxVars'
+    weights_quant = graph.get_operation_by_name(
+        scope + '/weights_quant/' + quantization_node_name)
+    self.assertEqual(weights_quant.type, quantization_node_name)
+    if use_resource:
+      expected_inputs = [
+          scope + '/weights_quant/FakeQuantWithMinMaxVars/ReadVariableOp',
+          scope + '/weights_quant/FakeQuantWithMinMaxVars/ReadVariableOp_1',
+      ]
+    else:
+      expected_inputs = [
+          scope + '/weights_quant/' + 'AssignMinLast',
+          scope + '/weights_quant/' + 'AssignMaxLast'
+      ]
+    expected_inputs.append(scope + '/mul_fold')
+
+    self._AssertInputOpsAre(weights_quant, expected_inputs)
+    if layer == 'DepthwiseConv2dNative':
+      output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
+                                if delay else '/depthwise_Fold')
+    else:
+      output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
+                                if delay else '/' + layer + '_Fold')
+    self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
+
+    if with_bypass:
+      conv_quant = graph.get_operation_by_name(
+          scope + '/conv_quant/' + quantization_node_name)
+      self.assertEqual(conv_quant.type, quantization_node_name)
+
+      if use_resource:
+        expected_inputs = [
+            scope + '/conv_quant/FakeQuantWithMinMaxVars/ReadVariableOp',
+            scope + '/conv_quant/FakeQuantWithMinMaxVars/ReadVariableOp_1',
+        ]
+      else:
+        expected_inputs = [
+            scope + '/conv_quant/AssignMinEma',
+            scope + '/conv_quant/AssignMaxEma',
+        ]
+      expected_inputs.append(scope + '/add_fold')
+
+      self._AssertInputOpsAre(conv_quant, expected_inputs)
+      output_op_name = (
+          scope + '/conv_quant/delayed_quant/Switch_1' if delay else 'test/Add')
+      self._AssertOutputGoesToOps(conv_quant, graph, [output_op_name])
+
+    act_quant = graph.get_operation_by_name(
+        'test/act_quant/' + quantization_node_name)
+    self.assertEqual(act_quant.type, quantization_node_name)
+
+    if use_resource:
+      expected_inputs = [
+          'test/act_quant/FakeQuantWithMinMaxVars/ReadVariableOp',
+          'test/act_quant/FakeQuantWithMinMaxVars/ReadVariableOp_1',
+      ]
+    else:
+      expected_inputs = [
+          'test/act_quant/AssignMinEma',
+          'test/act_quant/AssignMaxEma',
+      ]
+    expected_inputs.append('test/' + activation_op_name)
+
+    self._AssertInputOpsAre(act_quant, expected_inputs)
+    output_op_name = ('test/act_quant/delayed_quant/Switch_1'
+                      if delay else 'control_dependency')
+    self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
+    self._AssertIdempotent(graph)
 
   def testQuantize_Conv2dWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(self._TestQuantize_Conv2dWithBatchNorm)
 
   def _TestQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
-                                        with_bypass, delay, fused_batch_norm):
+                                        with_bypass, delay, fused_batch_norm,
+                                        use_resource):
     """Tests quantization: inputs -> Conv2d with batch norm -> Activation.
 
     Args:
@@ -334,9 +398,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
+      use_resource: Bool, when true uses resource variables.
     """
     graph = ops.Graph()
     with graph.as_default():
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -353,7 +419,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           normalizer_params=self._BatchNormParams(fused_batch_norm),
           scope=scope)
 
-      # Manually add a bypass (optionaly) and an activation.
+      # Manually add a bypass (optional) and an activation.
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
 
@@ -364,52 +430,18 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         array_ops.identity(node, name='control_dependency')
 
       fold_batch_norms.FoldBatchNorms(graph, is_training=True)
-
       quantize.Quantize(graph, True, quant_delay=delay)
 
-    quantization_node_name = 'FakeQuantWithMinMaxVars'
-    weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
-                                                quantization_node_name)
-    self.assertEqual(weights_quant.type, quantization_node_name)
-    expected_inputs = [
-        scope + '/weights_quant/' + 'AssignMinLast',
-        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
-    ]
-    self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay else '/Conv2D_Fold')
-    self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
-
-    if with_bypass:
-      conv_quant = graph.get_operation_by_name(scope + '/conv_quant/' +
-                                               quantization_node_name)
-      self.assertEqual(conv_quant.type, quantization_node_name)
-      expected_inputs = [
-          scope + '/conv_quant/AssignMinEma',
-          scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
-      ]
-      self._AssertInputOpsAre(conv_quant, expected_inputs)
-      output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
-                        if delay else 'test/Add')
-      self._AssertOutputGoesToOps(conv_quant, graph, [output_op_name])
-
-    act_quant = graph.get_operation_by_name('test/act_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(act_quant.type, quantization_node_name)
-    expected_inputs = [
-        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
-        'test/' + activation_op_name
-    ]
-    self._AssertInputOpsAre(act_quant, expected_inputs)
-    output_op_name = ('test/act_quant/delayed_quant/Switch_1'
-                      if delay else 'control_dependency')
-    self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
+      self._AssertCorrectQuantizedGraphWithBatchNorm(
+          graph, scope, 'Conv2D', activation_op_name, with_bypass, delay,
+          use_resource)
 
   def testQuantize_FCWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(self._TestQuantize_FCWithBatchNorm)
 
   def _TestQuantize_FCWithBatchNorm(self, activation, activation_op_name,
-                                    with_bypass, delay, fused_batch_norm):
+                                    with_bypass, delay, fused_batch_norm,
+                                    use_resource):
     """Tests quantization: inputs -> FC with batch norm -> Activation.
 
     Args:
@@ -420,9 +452,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
+      use_resource: Bool, when true uses resource variables.
     """
     graph = ops.Graph()
     with graph.as_default():
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
       batch_size, depth = 5, 256
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
@@ -436,7 +470,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           normalizer_params=self._BatchNormParams(fused_batch_norm),
           scope=scope)
 
-      # Manually add a bypass (optionaly) and an activation.
+      # Manually add a bypass (optional) and an activation.
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
 
@@ -450,43 +484,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
       quantize.Quantize(graph, True, quant_delay=delay)
 
-    quantization_node_name = 'FakeQuantWithMinMaxVars'
-    weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
-                                                quantization_node_name)
-    self.assertEqual(weights_quant.type, quantization_node_name)
-    expected_inputs = [
-        scope + '/weights_quant/' + 'AssignMinLast',
-        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
-    ]
-    self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay else '/MatMul_Fold')
-    self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
-
-    if with_bypass:
-      conv_quant = graph.get_operation_by_name(scope + '/conv_quant/' +
-                                               quantization_node_name)
-      self.assertEqual(conv_quant.type, quantization_node_name)
-      expected_inputs = [
-          scope + '/conv_quant/AssignMinEma',
-          scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
-      ]
-      self._AssertInputOpsAre(conv_quant, expected_inputs)
-      output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
-                        if delay else 'test/Add')
-      self._AssertOutputGoesToOps(conv_quant, graph, [output_op_name])
-
-    act_quant = graph.get_operation_by_name('test/act_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(act_quant.type, quantization_node_name)
-    expected_inputs = [
-        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
-        'test/' + activation_op_name
-    ]
-    self._AssertInputOpsAre(act_quant, expected_inputs)
-    output_op_name = ('test/act_quant/delayed_quant/Switch_1'
-                      if delay else 'control_dependency')
-    self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
+    self._AssertCorrectQuantizedGraphWithBatchNorm(
+        graph, scope, 'MatMul', activation_op_name, with_bypass, delay,
+        use_resource)
 
   def testQuantize_DepthwiseConv2dWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(
@@ -494,7 +494,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
   def _TestQuantize_DepthwiseConv2dWithBatchNorm(
       self, activation, activation_op_name, with_bypass, delay,
-      fused_batch_norm):
+      fused_batch_norm, use_resource):
     """Tests quantization: inputs -> DWConv2d with batch norm -> Activation.
 
     Args:
@@ -505,9 +505,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
+      use_resource: Bool, when true uses resource variables.
     """
     graph = ops.Graph()
     with graph.as_default():
+      variable_scope.get_variable_scope().set_use_resource(use_resource)
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -524,7 +526,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           normalizer_params=self._BatchNormParams(fused_batch_norm),
           scope=scope)
 
-      # Manually add a bypass (optionaly) and an activation.
+      # Manually add a bypass (optional) and an activation.
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
 
@@ -535,45 +537,21 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         array_ops.identity(node, name='control_dependency')
 
       fold_batch_norms.FoldBatchNorms(graph, is_training=True)
-
       quantize.Quantize(graph, True, quant_delay=delay)
-    quantization_node_name = 'FakeQuantWithMinMaxVars'
-    weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
-                                                quantization_node_name)
-    self.assertEqual(weights_quant.type, quantization_node_name)
-    expected_inputs = [
-        scope + '/weights_quant/' + 'AssignMinLast',
-        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
-    ]
-    self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay else '/depthwise_Fold')
-    self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
-    if with_bypass:
-      conv_quant = graph.get_operation_by_name(scope + '/conv_quant/' +
-                                               quantization_node_name)
-      self.assertEqual(conv_quant.type, quantization_node_name)
-      expected_inputs = [
-          scope + '/conv_quant/AssignMinEma',
-          scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
-      ]
-      self._AssertInputOpsAre(conv_quant, expected_inputs)
-      output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
-                        if delay else 'test/Add')
-      self._AssertOutputGoesToOps(conv_quant, graph, [output_op_name])
+      self._AssertCorrectQuantizedGraphWithBatchNorm(
+          graph, scope, 'DepthwiseConv2dNative', activation_op_name,
+          with_bypass, delay, use_resource)
 
-    act_quant = graph.get_operation_by_name('test/act_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(act_quant.type, quantization_node_name)
-    expected_inputs = [
-        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
-        'test/' + activation_op_name
-    ]
-    self._AssertInputOpsAre(act_quant, expected_inputs)
-    output_op_name = ('test/act_quant/delayed_quant/Switch_1'
-                      if delay else 'control_dependency')
-    self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
+  def _AssertIdempotent(self, graph):
+    # Ensure that calling the rewrite again doesn't change the graph.
+    graph_def_before = str(graph.as_graph_def())
+    with graph.as_default():
+      # Ensuring that calling the rewrite again doesn't add more nodes.
+      fold_batch_norms.FoldBatchNorms(graph, is_training=True)
+      quantize.Quantize(graph, True)
+    graph_def_after = str(graph.as_graph_def())
+    self.assertEqual(graph_def_before, graph_def_after)
 
   def _BatchNormParams(self, fused=False):
     return {'center': True, 'scale': True, 'decay': 1.0 - 0.003, 'fused': fused}
@@ -587,7 +565,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       stddev: Standard deviation of normal variable.
 
     Returns:
-      An initialized that initialzes with a truncated normal variable.
+      An initialized that initializes with a truncated normal variable.
     """
     return init_ops.truncated_normal_initializer(stddev=stddev)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index bb7be0809421b64a019e73f00aac6c58524222e8..5e479f394680420e510550a510d46d8fff2c03b0 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -82,9 +82,22 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
-    add_quant = graph.get_operation_by_name('test/add_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(add_quant.type, quantization_node_name)
+    conv_quant = graph.get_operation_by_name('test/test/conv_quant/' +
+                                             quantization_node_name)
+    self.assertEqual(conv_quant.type, quantization_node_name)
+
+    # Scan through all FakeQuant operations, ensuring that the activation
+    # isn't in the consumers of the operation. Since activations are folded
+    # the preceding operation during inference, the FakeQuant operation after
+    # the activation is all that is needed.
+    for op in graph.get_operations():
+      if op.type == quantization_node_name:
+        quant_op = graph.get_operation_by_name(op.name)
+        consumers = []
+        for output in quant_op.outputs:
+          consumers.extend(output.consumers())
+
+        self.assertNotIn('test/identity', [c.name for c in consumers])
 
   def testInsertQuantOpForAddAfterSeparableConv2d(self):
     self._RunTestOverParameters(
@@ -109,9 +122,210 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
-    add_quant = graph.get_operation_by_name('test/add_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(add_quant.type, quantization_node_name)
+    conv_quant = graph.get_operation_by_name('test/test/conv_quant/' +
+                                             quantization_node_name)
+    self.assertEqual(conv_quant.type, quantization_node_name)
+
+    for op in graph.get_operations():
+      if op.type == quantization_node_name:
+        quant_op = graph.get_operation_by_name(op.name)
+        # Scan through all FakeQuant operations, ensuring that the activation
+        # identity op isn't in the consumers of the operation.
+        consumers = []
+        for output in quant_op.outputs:
+          consumers.extend(output.consumers())
+
+        self.assertNotIn('test/identity', [c.name for c in consumers])
+
+  def testFinalLayerQuantized(self):
+    self._RunTestOverParameters(self._TestFinalLayerQuantized)
+
+  def _TestFinalLayerQuantized(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      _ = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=None,
+          scope='test')
+      # Ensure that the a FakeQuant operation is in the outputs of the BiasAdd.
+      bias_add_op = graph.get_operation_by_name('test/BiasAdd')
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [op.type for op in bias_add_op.outputs[0].consumers()])
+
+  def testPostActivationBypassQuantized(self):
+    self._RunTestOverParameters(self._TestPostActivationBypassQuantized)
+
+  def _TestPostActivationBypassQuantized(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      input2 = array_ops.zeros((batch_size, height / 2, width / 2, 32))
+      conv = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/test')
+      bypass_tensor = math_ops.add(conv, input2, name='test/add')
+      # The output of the post_activation bypass will be another layer.
+      _ = conv2d(
+          bypass_tensor,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/unused')
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that the bypass node is preceded by and followed by a
+      # FakeQuantWithMinMaxVar operation, since the output of the Add isn't an
+      # activation.
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [c.type for c in bypass_tensor.consumers()])
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [i.op.type for i in bypass_tensor.op.inputs])
+
+  def testOverlappingPostActivationBypassQuantized(self):
+    self._RunTestOverParameters(
+        self._TestOverlappingPostActivationBypassQuantized)
+
+  def _TestOverlappingPostActivationBypassQuantized(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      conv_input = array_ops.zeros((batch_size, height, width, depth))
+      conv1 = conv2d(
+          conv_input,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/test1')
+
+      # The bypass of this conv is the post activation bypass of the previous
+      # conv.
+      conv2 = conv2d(
+          conv_input,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=None,
+          scope='test/test2')
+
+      bypass_tensor = math_ops.add(conv1, conv2, name='test/add')
+      _ = array_ops.identity(bypass_tensor, name='test/output')
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that the bypass node is preceded by a FakeQuantWithMinMaxVar
+      # operation, and NOT followed by one.
+      self.assertTrue('FakeQuantWithMinMaxVars' not in
+                      [c.type for c in bypass_tensor.consumers()])
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [i.op.type for i in bypass_tensor.op.inputs])
+
+      # Ensure that all the convs and activations are quantized.
+      op_names = [op.name for op in graph.get_operations()]
+      self.assertTrue(
+          'test/test1/weights_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertTrue(
+          'test/test2/weights_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertTrue(
+          'test/test1/act_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertTrue('test/act_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertEqual(
+          'Identity',
+          graph.get_operation_by_name(
+              'test/test1/act_quant/FakeQuantWithMinMaxVars').inputs[0].op.type)
+      self.assertEqual(
+          'Identity',
+          graph.get_operation_by_name(
+              'test/act_quant/FakeQuantWithMinMaxVars').inputs[0].op.type)
+
+  def testWithNameScope(self):
+    self._RunTestOverParameters(self._TestWithNameScope)
+
+  def _TestWithNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope('name_scope'):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+        quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+    for op in graph.get_operations():
+      self.assertTrue(not op.name.startswith('name_scope/name_scope/'),
+                      'Broken op: %s' % op.name)
+
+  def testWithNullNameScope(self):
+    self._RunTestOverParameters(self._TestWithNullNameScope)
+
+  def _TestWithNullNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope(None):
+        batch_size, height, width, depth = 5, 128, 128, 32
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+        quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+        # Passes if Quantize() does not crash.
+
+  def testWithNonMatchingNameScope(self):
+    self._RunTestOverParameters(self._testWithNonMatchingNameScope)
+
+  def _testWithNonMatchingNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope('name_scope'):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+    op_names_before_quantize = set([op.name for op in graph.get_operations()])
+    quantize.Quantize(
+        graph, is_training, weight_bits=8, activation_bits=8,
+        scope='NonExisting/')
+    op_names_after_quantize = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_quantize, op_names_after_quantize)
 
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
@@ -122,7 +336,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       stddev: Standard deviation of normal variable.
 
     Returns:
-      An initialized that initialzes with a truncated normal variable.
+      An initialized that initializes with a truncated normal variable.
     """
     return init_ops.truncated_normal_initializer(stddev=stddev)
 
diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD
index e975aeaea7ee78f8e912be8ab1be61b9acc7b418..9325a14745c1db2f8c311602143175e736fc3c5f 100644
--- a/tensorflow/contrib/receptive_field/BUILD
+++ b/tensorflow/contrib/receptive_field/BUILD
@@ -106,15 +106,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/recurrent/BUILD b/tensorflow/contrib/recurrent/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b3cb04ce26d96333f516f1298c8d5c331964f05b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/BUILD
@@ -0,0 +1,106 @@
+# Recurrent library.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "recurrent_py",
+    srcs = ["python/recurrent_api.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":functional_rnn_ops_py",
+        ":recurrent_ops_py",
+    ],
+)
+
+py_library(
+    name = "recurrent_ops_py",
+    srcs = ["python/ops/recurrent.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "functional_rnn_ops_py",
+    srcs = ["python/ops/functional_rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":recurrent_ops_py",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+cuda_py_tests(
+    name = "recurrent_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/recurrent_test.py"],
+    additional_deps = [
+        ":recurrent_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
+
+cuda_py_tests(
+    name = "functional_rnn_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/functional_rnn_test.py"],
+    additional_deps = [
+        ":functional_rnn_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/tpu:tpu",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
diff --git a/tensorflow/contrib/recurrent/README.md b/tensorflow/contrib/recurrent/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..86e10eee517f69d1316b76af85cb13a2bfea2984
--- /dev/null
+++ b/tensorflow/contrib/recurrent/README.md
@@ -0,0 +1,13 @@
+# Recurrent computation library
+
+The recurrent computation library contains code to perform recurrent
+computations.
+
+Its chief application is to implement recurrent neural networks (RNNs, LSTMs,
+etc), which is implemented in `functional_rnn.py`. Similar techniques may be
+used to implement deep networks.
+
+The computation saves the activations in the forward pass, and computes the
+gradients in the backward pass using a single accumulator.
+
+The `functional_rnn` interface is compatible with the `dynamic_rnn` API.
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f19ac7dbe0cee2eb6c780ec5ea6266bc847abd7
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
@@ -0,0 +1,163 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Functional RNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import rnn as rnn_lib
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variables
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+def _CreateStackedLstmCell(*cell_sizes):
+  subcells = [rnn_cell_impl.LSTMCell(cell_size) for cell_size in cell_sizes]
+  return rnn_cell_impl.MultiRNNCell(subcells)
+
+
+class FunctionalRnnTest(test_util.TensorFlowTestCase):
+
+  _BATCH_SIZE = 3
+  _TOTAL_TIME = 5
+  _INPUT_SIZE = 11
+  _NUM_UNITS = 7
+
+  # Set this to some output if you want to use it.
+  _LSTM_GRAPH_DEF_FILEPATH = None
+
+  _CELLDEFS = {
+      'gru': (rnn_cell_impl.GRUCell, [_NUM_UNITS]),
+      'lstm': (rnn_cell_impl.LSTMCell, [_NUM_UNITS]),
+      'stacked_lstm': (_CreateStackedLstmCell, [_NUM_UNITS] * 3)
+  }
+
+  def _CreateCell(self, celldef_name):
+    func, args = self._CELLDEFS[celldef_name]
+    return func(*args)
+
+  def _CreateInputs(self):
+    inputs = np.random.random([FunctionalRnnTest._BATCH_SIZE,
+                               FunctionalRnnTest._TOTAL_TIME,
+                               FunctionalRnnTest._INPUT_SIZE])
+    # Always leave one time slot empty, to check max_length behavior.
+    sequence_length = np.random.randint(
+        0, high=FunctionalRnnTest._TOTAL_TIME - 1,
+        size=FunctionalRnnTest._BATCH_SIZE,
+        dtype=np.int)
+    return (inputs, sequence_length)
+
+  def _CreateRnnGraph(self, create_rnn_computation_func, cell, tf_inputs,
+                      tf_sequence_length, initial_state=None,
+                      time_major=None, scope=None):
+    tf_result = create_rnn_computation_func(cell=cell, inputs=tf_inputs,
+                                            sequence_length=tf_sequence_length,
+                                            initial_state=initial_state,
+                                            dtype=dtypes.float32,
+                                            time_major=time_major,
+                                            scope=scope)
+    grad = gradients_impl.gradients(tf_result, variables.trainable_variables())
+    return {'inference': tf_result, 'grad': grad}
+
+  def _MaybeResetVariables(self, variable_cache, sess, var_list):
+    """Possibly resets the variables to a previously seen value."""
+    reset_ops = []
+    fetches = []
+    for var in var_list:
+      if var.name in variable_cache:
+        reset_ops += [var.assign(variable_cache[var.name])]
+      else:
+        fetches += [(var.name, var)]
+    if reset_ops:
+      sess.run(reset_ops)
+    if fetches:
+      val = sess.run(dict(fetches))
+      for n, v in val.items():
+        assert n not in variable_cache
+        variable_cache[n] = v
+
+  def _RunRnn(self, numpy_inputs, numpy_slen, cell_name, variable_cache,
+              is_dynamic):
+    with ops.Graph().as_default() as graph:
+      tf_inputs = array_ops.placeholder(
+          dtypes.float32, shape=numpy_inputs.shape)
+      tf_slen = array_ops.placeholder(dtypes.int32)
+      feeds = {tf_inputs: numpy_inputs, tf_slen: numpy_slen}
+      cell = self._CreateCell(cell_name)
+      fn = rnn_lib.dynamic_rnn if is_dynamic else functional_rnn.functional_rnn
+      fetches = self._CreateRnnGraph(fn, cell, tf_inputs, tf_slen)
+      with self.test_session(graph=graph) as sess:
+        sess.run(variables.global_variables_initializer())
+        # Note that cell.trainable_variables it not always set.
+        self._MaybeResetVariables(variable_cache, sess,
+                                  variables.trainable_variables())
+        val = sess.run(fetches, feed_dict=feeds)
+      graph_def = graph.as_graph_def()
+      return graph_def, val
+
+  def testRunLstm(self):
+    """Runs a simple LSTM. Does not check output."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    graphdef, _ = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    logging.info('graphdef: %s', graphdef)
+    if self._LSTM_GRAPH_DEF_FILEPATH:
+      with open(self._LSTM_GRAPH_DEF_FILEPATH, 'w') as f:
+        f.write(str(graphdef))
+
+  def testLstm(self):
+    """Checks an LSTM against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testGru(self):
+    """Checks a GRU cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testStackedLstm(self):
+    """Checks a stacked LSTM cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    args = [np_inputs, np_slen, 'stacked_lstm', var_cache]
+    _, func_rnn = self._RunRnn(*(args + [False]))
+    _, dyn_rnn = self._RunRnn(*(args + [True]))
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..00fbd4fbb8205ceb649616050314e400be1785a5
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Recurrent ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+_ElmanState = collections.namedtuple('ElmanState', ('h'))
+_ElmanTheta = collections.namedtuple('ElmanTheta', ('w', 'b'))
+_ElmanInputs = collections.namedtuple('ElmanInputs', ('x'))
+
+
+# TODO(drpng): add test for max length computation.
+class RecurrentTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    # pylint:disable=invalid-name
+    _PolyState = collections.namedtuple('PolyState', ('value', 'x_power'))
+    _PolyTheta = collections.namedtuple('PolyTheta', ('x'))
+    _PolyInputs = collections.namedtuple('PolyInputs', ('coeff'))
+    # pylint:enable=invalid-name
+
+    def Poly(theta, state, inputs):
+      next_state = _PolyState(
+          value=state.value + inputs.coeff * state.x_power,
+          x_power=state.x_power * theta.x)
+      return next_state, []
+
+    with self.test_session() as sess:
+      theta = _PolyTheta(x=array_ops.constant(2.0))
+      state = _PolyState(
+          value=array_ops.constant(0.0),
+          x_power=array_ops.constant(1.0))
+      inputs = _PolyInputs(coeff=array_ops.constant([1., 2., 3.]))
+
+      # x = 2
+      # 1 + 2*x + 3*x^2
+      ret = recurrent.Recurrent(theta, state, inputs, Poly)
+
+      acc, state = sess.run(ret)
+      self.assertAllClose(acc.value, [1., 5., 17.])
+      self.assertAllClose(acc.x_power, [2., 4., 8.])
+      self.assertAllClose(state.value, 17.)
+      self.assertAllClose(state.x_power, 8.)
+
+      y = ret[1].value
+      dx, d_coeff = gradients_impl.gradients(ys=[y], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+
+      # 2 + 6*x
+      self.assertAllClose(dx_val, 14.)
+      self.assertAllClose(d_coeff_val, [1., 2., 4.])
+
+      # acc = [1, 1+2x, 1+2x+3x^2]
+      # sum(acc) = 3 + 4x + 3x^2
+      acc = ret[0].value
+      dx, d_coeff = gradients_impl.gradients(
+          ys=[math_ops.reduce_sum(acc)], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+      # 4 + 6*x
+      self.assertAllClose(dx_val, 16.)
+      self.assertAllClose(d_coeff_val, [3., 4., 4.])
+
+  @staticmethod
+  def Rand(shape):
+    return random_ops.random_uniform(
+        shape, minval=-0.2, maxval=0.2, dtype=dtypes.float64)
+
+  @staticmethod
+  def Elman(theta, state0, inputs):
+    h0, w, b, x = state0.h, theta.w, theta.b, inputs.x
+    xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+    h1 = math_ops.sigmoid(xw + b)
+    state1 = _ElmanState(h=h1)
+    return (state1, state1)
+
+  @staticmethod
+  def ElmanGrad(theta, state0, inputs, extras, dstate1):
+
+    @function.Defun()
+    def Grad(h0, w, b, x, h1, dh1):
+      del b
+      # We hand-roll the gradient for the 2nd half of the cell as a demo.
+      dxwb = (dh1 * (1 - h1) * h1)
+      dxw, db = dxwb, math_ops.reduce_sum(dxwb, axis=0)
+
+      # Uses tf.gradient for the 1nd half of the cell as a demo.
+      xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+      dh0, dx, dw = gradients_impl.gradients(
+          ys=[xw], xs=[h0, x, w], grad_ys=[dxw])
+
+      return dh0, dx, dw, db
+
+    dh0, dx, dw, db = Grad(state0.h, theta.w, theta.b, inputs.x,
+                           extras.h, dstate1.h)
+    dstate0 = _ElmanState(h=dh0)
+    dinputs = _ElmanInputs(x=dx)
+    return (_ElmanTheta(w=dw, b=db), dstate0, dinputs)
+
+  @staticmethod
+  def ElmanOut(state1):
+    return _ElmanState(x=state1.h)
+
+  @staticmethod
+  def ElmanOutGrad(dout):
+    return _ElmanState(h=dout.x)
+
+  def testElman(self):
+    for seqlen, use_grad in [(1, False), (1, True), (7, False), (7, True)]:
+      logging.info('== Elman: seqlen=%s, use_grad=%s', seqlen, use_grad)
+      self._ParameterizedTestElman(seqlen, use_grad)
+
+  def _ParameterizedTestElman(self, seqlen, use_grad):
+
+    with self.test_session() as sess:
+      random_seed.set_random_seed(342462)
+
+      batch = 3
+      dims = 4
+      theta = _ElmanTheta(w=RecurrentTest.Rand([2 * dims, dims]),
+                          b=RecurrentTest.Rand([dims]))
+      state0 = _ElmanState(h=RecurrentTest.Rand([batch, dims]))
+      inputs = _ElmanInputs(x=RecurrentTest.Rand([seqlen, batch, dims]))
+
+      # Statically unrolled.
+      s = state0
+      out = []
+      for i in xrange(seqlen):
+        inp = _ElmanInputs(x=inputs.x[i, :])
+        s, _ = RecurrentTest.Elman(theta, s, inp)
+        out += [s.h]
+      acc0, final0 = array_ops.stack(out), s.h
+      loss0 = math_ops.reduce_sum(acc0) + math_ops.reduce_sum(final0)
+      (dw0, db0, dh0, di0) = gradients_impl.gradients(
+          loss0, [theta.w, theta.b, state0.h, inputs.x])
+
+      acc1, final1 = recurrent.Recurrent(
+          theta=theta,
+          state0=state0,
+          inputs=inputs,
+          cell_fn=RecurrentTest.Elman,
+          cell_grad=RecurrentTest.ElmanGrad if use_grad else None)
+      assert isinstance(acc1, _ElmanState)
+      assert isinstance(final1, _ElmanState)
+      acc1, final1 = acc1.h, final1.h
+      loss1 = math_ops.reduce_sum(acc1) + math_ops.reduce_sum(final1)
+      (dw1, db1, dh1, di1) = gradients_impl.gradients(
+          loss1, [theta.w, theta.b, state0.h, inputs.x])
+
+      # Fetches a few values and compare them.
+      (acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0,
+       di1) = sess.run(
+           [acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0, di1])
+      self.assertAllClose(acc0, acc1)
+      self.assertAllClose(final0, final1)
+      self.assertAllClose(dw0, dw1)
+      self.assertAllClose(db0, db1)
+      self.assertAllClose(dh0, dh1)
+      self.assertAllClose(di0, di1)
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a085474c1bf6117ba5663139c78d8f08f71392d3
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
@@ -0,0 +1,396 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tf.nn.dynamic_rnn variant, built on the Recurrent class.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def _GetDTypesFromStructure(struct):
+  dtypes_list = []
+  for x in nest.flatten(struct):
+    x = ops.convert_to_tensor(x)
+    dtypes_list.append(x.dtype)
+  return dtypes_list
+
+
+def _SetShapeFromTemplate(struct, struct_template):
+  as_list = nest.flatten(struct)
+  template_as_list = nest.flatten(struct_template)
+  for element, template in zip(as_list, template_as_list):
+    element.set_shape(template.shape)
+
+
+class _FunctionalRnnCell(object):
+  """Wrapper around RNNCell which separates state from computation.
+
+  This class accomplishes the following:
+  * Turn the cell's `__call__` function into a pure function. The global
+    side effects are separated as `theta`. They are the variables created
+    for the weights of the computation.
+  * Unless the output is aliased as part of the state, extend the state to
+    contain the output so that we store the history in `Recurrent`.
+  * Set static shapes as required.
+  """
+
+  def __init__(self, rnn_cell, seq_inputs, initial_state):
+    assert initial_state is not None
+
+    # TODO(drpng): Dtype needs to be configurable.
+    input_dtypes = [dtypes.float32] + _GetDTypesFromStructure(initial_state)
+    # See _index.
+    like_inputs_t = nest.map_structure(
+        lambda x: array_ops.stop_gradient(array_ops.gather(x, 0)), seq_inputs)
+    input_structure = (like_inputs_t, initial_state)
+
+    @function.Defun(*input_dtypes)
+    def FlatCellStep(*flat_inputs):
+      """The flattened version of `rnn_cell`."""
+      inputs_t, state0 = nest.pack_sequence_as(input_structure, flat_inputs)
+      _SetShapeFromTemplate(state0, initial_state)
+      _SetShapeFromTemplate(inputs_t, like_inputs_t)
+      outputs_t, state1 = rnn_cell(inputs_t, state0)
+      state_list = nest.flatten(state1)
+      self._output_shape = outputs_t.shape
+
+      if outputs_t in state_list:
+        output_index_in_state = state_list.index(outputs_t)
+      else:
+        output_index_in_state = None
+
+      if output_index_in_state is None:
+        self._prepend_output = True
+        self._output_state_idx = 0
+        return [outputs_t] + state_list
+      else:
+        self._output_state_idx = output_index_in_state
+        self._prepend_output = False
+        # To save memory, we don't store return the output separately
+        # from the state list, since we know it's the same.
+        return state_list
+
+    def _ToPureFunction(func):
+      # NOTE: This forces the creating of the function.
+      if func.captured_inputs:
+        pure_func = copy.copy(func)
+        # pylint: disable=protected-access
+        pure_func._extra_inputs = []
+        return pure_func
+      return func
+
+    pure_flat_cell_step = _ToPureFunction(FlatCellStep)
+
+    def CellStep(theta, extended_state0, inputs_t):
+      """Performs one time steps on structured inputs.
+
+      The purpose of this function is to turn the parameters into flattened
+      versions, and to resolve the parameter order difference between
+      `Recurrent` and `RNNCell`.
+
+      In the event the cell returns a transformed output that is not aliased
+      within its state, the `extended_state0` also contains the output as its
+      first element.
+
+      Args:
+        theta: Weights required for the computation. A structure of tensors.
+        extended_state0: the state0, and possibly the output at the previous
+          time step. A structure of tensors.
+        inputs_t: the inputs at time t.
+
+      Returns:
+        A pair of the next state (inclusive of the output), and an empty list
+        (unused `extras`).
+        The next state is congruent to state0.
+      """
+      extended_state0_flat = nest.flatten(extended_state0)
+      state0_flat = self.MaybeRemoveOutputFromState(extended_state0_flat)
+      full_inputs = [inputs_t] + state0_flat + theta
+      # Note that the thetas are additional inputs appeneded as extra
+      # parameters.
+      cell_out = pure_flat_cell_step(*full_inputs)
+      return cell_out, []
+
+    self._cell_step = CellStep
+    self._theta = FlatCellStep.captured_inputs
+    self._zero_state = rnn_cell.zero_state
+    self._state_template = initial_state
+    self._output_size = rnn_cell.output_size
+
+  @property
+  def extended_initial_state(self):
+    if self._prepend_output:
+      return [array_ops.zeros(self._output_shape), self._state_template]
+    else:
+      # The base case, where the output is just the hidden state.
+      return self._state_template
+
+  @property
+  def cell_step(self):
+    return self._cell_step
+
+  @property
+  def theta(self):
+    return self._theta
+
+  @property
+  def state_template(self):
+    return self._state_template
+
+  @property
+  def output_shape(self):
+    return self._output_shape
+
+  def GetOutputFromState(self, state):
+    return nest.flatten(state)[self._output_state_idx]
+
+  def MaybeRemoveOutputFromState(self, flat_state):
+    if self._prepend_output:
+      return flat_state[1:]
+    return flat_state
+
+
+def _ApplyLengthsToBatch(sequence_lengths, tf_output):
+  # TODO(drpng): just use Update so that we don't carry over the gradients?
+  """Sets the output to be zero at the end of the sequence."""
+  # output is batch major.
+  batch_size, max_time, vector_size = tf_output.shape
+  output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+  output_time = array_ops.reshape(output_time, [batch_size, max_time])
+  lengths = array_ops.tile(
+      array_ops.reshape(sequence_lengths, [-1, 1]), [1, max_time])
+  is_less = math_ops.cast(
+      math_ops.less(output_time, lengths), dtype=dtypes.float32)
+  keep_mask = array_ops.tile(
+      array_ops.expand_dims(is_less, -1),
+      [1, 1, vector_size])
+  final_output = keep_mask * tf_output
+  return final_output
+
+
+def _PickFinalStateFromHistory(acc_state, sequence_length):
+  """Implements acc_state[sequence_length - 1]."""
+  # This will work on all platforms, unlike the regular slice.
+  last_value = []
+  for state_var in nest.flatten(acc_state):
+    # We compute the following with matrix operations:
+    # last_var = state_var[sequence_length - 1]
+    shape = array_ops.shape(state_var)
+    max_time, batch_size = shape[0], shape[1]
+    output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+    output_time = array_ops.reshape(output_time, [batch_size, max_time])
+    lengths = array_ops.tile(array_ops.reshape(sequence_length,
+                                               [-1, 1]), [1, max_time])
+    last_idx = math_ops.cast(math_ops.equal(output_time, lengths - 1),
+                             dtype=dtypes.float32)
+    last_idx = array_ops.transpose(last_idx)
+    last_idx_for_bcast = array_ops.expand_dims(last_idx, -1)
+    sliced = math_ops.multiply(last_idx_for_bcast, state_var)
+    last_var = math_ops.reduce_sum(sliced, 0)
+    last_value += [last_var]
+  return nest.pack_sequence_as(acc_state, last_value)
+
+
+def _PostProcessOutput(extended_acc_state, extended_final_state, func_cell,
+                       total_time, inputs_lengths):
+  """Post-process output of recurrent.
+
+  This function takes the accumulated extended state and extracts the requested
+  state and output.
+
+  When `inputs_lengths` has been set, it extracts the output from the
+  accumulated state. It also sets outputs past.
+
+  It also sets the static shape information.
+
+  Args:
+    extended_acc_state: A structure containing the accumulated state at each
+      time. It may contain the output at each time as well.
+    extended_final_state: A structure containing the final state. It may
+      contain the output at the final time.
+    func_cell: The functional wrapper around the cell.
+    total_time: A scalar integer tensor.
+    inputs_lengths: An integer tensor with one entry per input.
+
+  Returns:
+    A tuple with the outputs at each time, and the final state.
+  """
+  if inputs_lengths is None:
+    flat_final_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_final_state))
+    tf_state = nest.pack_sequence_as(func_cell.state_template, flat_final_state)
+  else:
+    # The accumulated state is over the entire sequence, so we pick it
+    # out from the acc_state sequence.
+    flat_acc_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_acc_state))
+    acc_state = nest.pack_sequence_as(
+        func_cell.state_template, flat_acc_state)
+    tf_state = _PickFinalStateFromHistory(acc_state, inputs_lengths)
+
+  output_from_state = func_cell.GetOutputFromState(extended_acc_state)
+  tf_output = array_ops.transpose(output_from_state, [1, 0, 2])
+  tf_output.set_shape(
+      [func_cell.output_shape[0], total_time, func_cell.output_shape[1]])
+  if inputs_lengths is not None:
+    # Need set the outputs to zero.
+    tf_output = _ApplyLengthsToBatch(inputs_lengths, tf_output)
+    # tf_output = array_ops.zeros([4, 3, 5])
+  _SetShapeFromTemplate(tf_state, func_cell.state_template)
+  return tf_output, tf_state
+
+
+# pylint: disable=invalid-name
+def functional_rnn(cell, inputs, sequence_length=None,
+                   initial_state=None, dtype=None, time_major=False,
+                   scope=None, use_tpu=False):
+  """Same interface as `tf.nn.dynamic_rnn`."""
+  with variable_scope.variable_scope(scope or 'rnn'):
+    if not time_major:
+      inputs = nest.map_structure(
+          lambda t: array_ops.transpose(t, [1, 0, 2]), inputs)
+    inputs_flat = nest.flatten(inputs)
+    batch_size = array_ops.shape(inputs_flat[0])[1]
+    if initial_state is None:
+      initial_state = cell.zero_state(batch_size, dtype)
+    func_cell = _FunctionalRnnCell(cell, inputs, initial_state)
+  extended_acc_state, extended_final_state = recurrent.Recurrent(
+      theta=func_cell.theta,
+      state0=func_cell.extended_initial_state,
+      inputs=inputs,
+      cell_fn=func_cell.cell_step,
+      use_tpu=use_tpu)
+  return _PostProcessOutput(extended_acc_state, extended_final_state,
+                            func_cell, inputs_flat[0].shape[0], sequence_length)
+
+
+def bidirectional_functional_rnn(
+    cell_fw,
+    cell_bw,
+    inputs,
+    initial_state_fw=None,
+    initial_state_bw=None,
+    dtype=None,
+    sequence_length=None,
+    time_major=False,
+    use_tpu=False,
+    scope=None):
+  """Creates a bidirectional recurrent neural network.
+
+  Performs fully dynamic unrolling of inputs in both directions. Built to be API
+  compatible with `tf.nn.bidirectional_dynamic_rnn`, but implemented with
+  functional control flow for TPU compatibility.
+
+  Args:
+    cell_fw: An instance of `tf.contrib.rnn.RNNCell`.
+    cell_bw: An instance of `tf.contrib.rnn.RNNCell`.
+    inputs: The RNN inputs. If time_major == False (default), this must be a
+      Tensor (or hierarchical structure of Tensors) of shape
+      [batch_size, max_time, ...]. If time_major == True, this must be a Tensor
+      (or hierarchical structure of Tensors) of shape:
+      [max_time, batch_size, ...]. The first two dimensions must match across
+      all the inputs, but otherwise the ranks and other shape components may
+      differ.
+    initial_state_fw: An optional initial state for `cell_fw`. Should match
+      `cell_fw.zero_state` in structure and type.
+    initial_state_bw: An optional initial state for `cell_bw`. Should match
+      `cell_bw.zero_state` in structure and type.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_states are not provided or RNN state has a
+      heterogeneous dtype.
+    sequence_length: An optional int32/int64 vector sized [batch_size]. Used to
+      copy-through state and zero-out outputs when past a batch element's
+      sequence length. So it's more for correctness than performance.
+    time_major: Whether the `inputs` tensor is in "time major" format.
+    use_tpu: Whether to enable TPU-compatible operation. If True, does not truly
+      reverse `inputs` in the backwards RNN. Once b/69305369 is fixed, we can
+      remove this flag.
+    scope: An optional scope name for the dynamic RNN.
+
+  Returns:
+    outputs: A tuple of `(output_fw, output_bw)`. The output of the forward and
+      backward RNN. If time_major == False (default), these will
+      be Tensors shaped: [batch_size, max_time, cell.output_size]. If
+      time_major == True, these will be Tensors shaped:
+      [max_time, batch_size, cell.output_size]. Note, if cell.output_size is a
+      (possibly nested) tuple of integers or TensorShape objects, then the
+      output for that direction will be a tuple having the same structure as
+      cell.output_size, containing Tensors having shapes corresponding to the
+      shape data in cell.output_size.
+    final_states: A tuple of `(final_state_fw, final_state_bw)`. A Tensor or
+      hierarchical structure of Tensors indicating the final cell state in each
+      direction. Must have the same structure and shape as cell.zero_state.
+
+  Raises:
+    ValueError: If `initial_state_fw` is None or `initial_state_bw` is None and
+      `dtype` is not provided.
+  """
+  # Keep this code in sync with tf.nn.dynamic_rnn for compatibility.
+  with variable_scope.variable_scope(scope or 'bidirectional_rnn'):
+    # Forward direction
+    with variable_scope.variable_scope('fw') as fw_scope:
+      output_fw, output_state_fw = functional_rnn(
+          cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
+          initial_state=initial_state_fw, dtype=dtype,
+          time_major=time_major, scope=fw_scope, use_tpu=use_tpu)
+    # Backward direction
+    if not time_major:
+      time_dim = 1
+      batch_dim = 0
+    else:
+      time_dim = 0
+      batch_dim = 1
+
+    def _reverse(input_, seq_lengths, seq_dim, batch_dim):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_, seq_lengths=seq_lengths,
+            seq_dim=seq_dim, batch_dim=batch_dim)
+      else:
+        # See b/69305369.
+        assert not use_tpu, (
+            'Bidirectional with variable sequence lengths unsupported on TPU')
+        return array_ops.reverse(input_, axis=[seq_dim])
+
+    with variable_scope.variable_scope('bw') as bw_scope:
+      inputs_reverse = _reverse(
+          inputs, seq_lengths=sequence_length,
+          seq_dim=time_dim, batch_dim=batch_dim)
+      tmp, output_state_bw = functional_rnn(
+          cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
+          initial_state=initial_state_bw, dtype=dtype,
+          time_major=time_major, scope=bw_scope, use_tpu=use_tpu)
+
+  output_bw = _reverse(
+      tmp, seq_lengths=sequence_length,
+      seq_dim=time_dim, batch_dim=batch_dim)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
+# pylint: enable=invalid-name
diff --git a/tensorflow/contrib/recurrent/python/ops/recurrent.py b/tensorflow/contrib/recurrent/python/ops/recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa16b82ab62f27d034c3ca7584e7e1ca14be6f9b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/recurrent.py
@@ -0,0 +1,720 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent computation.
+
+The main interface of this module is Recurrent().
+A recurrent computation describes an auto-regressive process, where outputs
+of one time step are fed to the output of the next time step.
+
+This module uses:
+  theta: the "weights" each RNN uses.
+  state0: the initial state of each RNN.
+  cell_fn: A python function describing RNN cell. It must has the following
+    signature:
+         cell_fn: (theta, state0, inputs) -> (state1, extras)
+    state1 is the next RNN state, extras are computed by cell_fn
+    and the library forwards extras to cell_fn's gradient function.
+  cell_grad: A python function describing the backprop gradient function
+    for the RNN cell. It must has the following signature:
+         cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+    dstate1 is what the backprop algorithm provides representing
+    gradients of state1 w.r.t. the final loss.
+
+In this module, we handle structures of tensors for theta, state0, inputs,
+and extras. The structure is an arbitrarily nested python structure, such
+as a dictionary of named tuples.
+
+Because the computation is a left-to-right chain, a single in-place accumulator
+can be used rather than a stack. Thus a special gradient was written to reduce
+unnecessary memory usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.inplace_ops import alias_inplace_update
+from tensorflow.python.util import nest
+
+
+def _AssertIsCompatible(a, b):
+  """Checks that `a` and `b` are nested structures of the same type."""
+  # TODO(drpng): implement.
+  del a
+  del b
+
+
+def _Index(struct, index):
+  """Returns a structure with `x[index]` for each tensor `x` in the structure.
+
+  Args:
+    struct: A structure of tensors.
+    index: A scalar integer tensor. Performance is better if `index` is
+      on the host memory.
+
+  Returns:
+    A structure of tensors congruent to `struct`.
+    For each key in `ret`, `rets[key] = struct[key][index]`.
+  """
+  index = ops.convert_to_tensor(index)
+  index.get_shape().assert_has_rank(0)
+  return nest.map_structure(lambda x: x[index], struct)
+
+
+def _Update(struct_acc, struct_x, t):
+  """Updates t-th row in accumulators.
+
+  Args:
+    struct_acc: The accumulators. A structure of tensors.
+    struct_x: The new values. A structure of tensors congruent to `struct_acc`.
+    t: A scalar integer. Performance is better if `t` is on the device
+      memory.
+
+  Returns:
+    A structure of tensors. Say, ret is a returned dictionary. Then, for
+    each key, we have:
+      ret[key] = struct_acc[key];
+      ret[key][t, :] = struct_x[key]
+  """
+  to_skip_update = set()
+  acc_lst = nest.flatten(struct_acc)
+  x_lst = nest.flatten(struct_x)
+  t = math_ops.to_int32([t])  # tf.to_int32 casts on-device tensors.
+  lst = []
+  for acc, x in zip(acc_lst, x_lst):
+    if acc in to_skip_update:
+      # Until b/62105730 is fixed, we need to avoid inplace update for tensors
+      # of rank 1.  could reshape to handle it, but we don't really need the
+      # values applied to these, so just skip their modification.
+      lst += [acc]
+    else:
+      lst += [alias_inplace_update(acc, t, array_ops.expand_dims(x, 0))]
+  return nest.pack_sequence_as(struct_acc, lst)
+
+
+def _SeqLenDim(struct):
+  """Returns the 0-th dim size of tensors in a structure of tensors.
+
+  This is the max sequence length according to the shape of the inputs.
+
+  Args:
+    struct: A structure of tensors. Every tensor's 0-th dim has the same size.
+
+  Returns:
+    A scalar tensor which is the size of 0-th dim of every tensors in struct.
+  """
+  xs = nest.flatten(struct)
+  assert xs
+  dim0 = array_ops.shape(xs[0])[0]
+  return dim0
+
+
+def _Flatten(struct):
+  """Flattens a structure."""
+  return nest.flatten(struct)
+
+
+def _Pack(elements, struct_template):
+  """Packs the list of tensors according to the structure.
+
+  In the event that `elements` should be a scalar, `struct_template` must
+  contain exactly one non-trivial element (for instance, `[[], {'x':elt}]`).
+
+  Args:
+    elements: Elements to be packed. A list of tensor, or a single tensor.
+    struct_template: The container structure in which to pack them.
+  Returns:
+    A python structure of the same type as `struct_template`, containing
+    `elements` as its contained elements.
+  """
+  if not nest.is_sequence(elements):
+    return nest.pack_sequence_as(struct_template, [elements])
+  return nest.pack_sequence_as(struct_template, elements)
+
+
+def _EmptyAcc(slen, struct_template):
+  """Creates a set of accumulators for tensors in structure.
+
+  Args:
+    slen: The sequence length. A scalar tensor.
+    struct_template: A structure of tensors.
+
+  Returns:
+    A structure congruent to `struct_template`. Say ret is a returned
+    dictionary. Then, `ret.key`, a tensor, has the same dtype as
+    `struct_template.key`. The tensor's shape has 1 more dimension
+    than the tensor `struct_template.key`. The extra 0-th dimension is of size
+    `slen`. E.g., if `slen=10` and `struct_template.key`'s shape is `[3, 5]`,
+    then, `ret.key`'s shape is `[10, 3, 5]`.
+  """
+
+  def _EmptyAccForTensor(tensor):
+    return inplace_ops.empty(
+        array_ops.concat([[slen], array_ops.shape(tensor)], axis=0),
+        tensor.dtype,
+        init=True)
+
+  return nest.map_structure(_EmptyAccForTensor, struct_template)
+
+
+def _EmptyLike(struct):
+  """Creates a set of empty initialized tensors.
+
+  Args:
+    struct: A structure of tensors.
+
+  Returns:
+    A struct of tensors. Each tensor has the same shape and dtype as
+    its corresponding tensor in `struct`. And each tensor is initialized.
+  """
+  return nest.map_structure(
+      lambda x: inplace_ops.empty_like(x, init=True), struct)
+
+
+def _Add(struct_x, struct_y):
+  """Adds tensors in `struct_x` with respective tensors in `struct_y`.
+
+  Args:
+    struct_x: A struct of tensors.
+    struct_y: A struct of tensors congruent to `struct_x`.
+
+  Returns:
+    A struct of tensors. Each element of the returned value
+  equals `x + y`, with corresponding values in `struct_x` and `struct_y`.
+  """
+  list_x = nest.flatten(struct_x)
+  list_y = nest.flatten(struct_y)
+  z = []
+  for x, y in zip(list_x, list_y):
+    z += [math_ops.add(x, y)]
+  return nest.pack_sequence_as(struct_x, z)
+
+
+def _Dtypes(struct):
+  """Returns all tensors' data types in a list."""
+  return [x.dtype for x in nest.flatten(struct)]
+
+
+def _ConvertNoneGradientToZeros(xs, dxs):
+  """Sanitize dxs so that None becomes zeros appropriately.
+
+  Args:
+    xs: A list of tensors.
+    dxs: A list of tensors. dxs[i] corresponds to xs[i]'s gradient.
+
+  Returns:
+    A structure same as `dxs` with `None` replaced by a zero tensor.
+  """
+  list_xs = nest.flatten(xs)
+  list_dxs = nest.flatten(dxs)
+
+  # If x does not get any backprop-ed gradient, propagate zeros.
+  rets = []
+  for (x, dx) in zip(list_xs, list_dxs):
+    if dx is None:
+      rets.append(array_ops.zeros_like(x))
+    else:
+      rets.append(dx)
+
+  return nest.pack_sequence_as(dxs, rets)
+
+
+# All structures are flattened for use internally. This is for simplicity
+# and also to use the Defun construct.
+# In the forward pass (inference), the computation is structured as follows.
+# Forward: [gradient = _Recurrent.Grad]
+#   Flatten structures, create accumulators.
+#   for t = 0..max_input_length:
+#     Defun ForwardLoopBody:
+#       Defun Fwd: flatten/pack around cell_fn
+#       state1 = Fwd(inputs[t], state0)
+#       acc_state += [state1]
+#   Pack structures.
+# During the backward pass (backpropping the gradient from the last time
+# step to the first, through the structure), the computation is structured
+# as follows.
+# Grad:
+#   Flatten structures.
+#   Defun Backward:
+#     Create create accumulated derivatives: d_theta, d_inputs, d_acc_state.
+#     Regarding the note at the top of the file, there is only one accumulator
+#     for d_theta accumulated over the whole sequence.
+#     for t = max_input_length -1..0:
+#       Defun BackwardLoopBody:
+#         Retrieve acc_state[t] computed in the forward pass.
+#         Defun Bak: flatten/back around cell_fn_grad.
+#         d_state1 is d_state0 from previous step (ie next time).
+#         d_acc_state[dev_t] += d_state1
+#         d_theta_t, d_state0, d_inputs_t, = Bak()
+#         d_inputs[dev_t] += d_inputs
+#         d_theta += d_theta_t
+#         d_acc_state[t] += d_state1
+#   Pack structures and return.
+class _Recurrent(object):
+  """A helper class to construct a recurrent neural net."""
+
+  def __init__(self, cell_fn, cell_grad, theta, state0, inputs,
+               max_input_length, extras, use_tpu):
+    """RNN helper class.
+
+    Args:
+      cell_fn: A python function, which computes:
+         state1, extras = cell_fn(theta, state0, inputs[t, :])
+      cell_grad: A python function which computes:
+         dtheta, dstate0, dinputs[t, :] = cell_grad(
+           theta, state0, inputs[t, :], extras, dstate1)
+      theta: weights. A structure of tensors.
+      state0: initial state. A structure of tensors.
+      inputs: inputs. A structure of tensors.
+      max_input_length: None, or the maximum effective length of the input over
+        all batches. A scalar tensor.
+      extras: A structure of tensors. The 2nd return value of every
+        invocation of cell_fn is a structure of tensors with matching keys
+        and shapes of this `extras`.
+      use_tpu: A boolean indicating whether the computation is mean to
+        run on a TPU.
+    """
+    self._theta = theta
+    self._state = state0
+    self._inputs = inputs
+    self._max_input_length = self._MaybeComputeMaxInputLength(
+        inputs, max_input_length)
+    self._cell_fn = cell_fn
+    self._cell_grad = cell_grad
+    self._extras = extras
+
+    # pylint: disable=unbalanced-tuple-unpacking
+
+    # NOTE: TF Function (Fwd, Bak, ForwardLoopBody, BackwardLoopBody,
+    # Forward and Backward defined below) simply takes a list of
+    # Tensors and returns a list of Tensors. When we pass in a
+    # structure (a list of structures of Tensors), we use _Flatten to
+    # convert the structure into a list of tensor. Conversely, the
+    # following code often uses _Pack to formulate a structure from a
+    # list of tensors based on a "template".
+
+    # Wraps cell_fn in a TF Function:
+    #    state1 = cell_fn(theta, state0, inputs)
+    fwd_sig = [self._theta, self._state, self._inputs]
+
+    compiled = use_tpu
+    noinline = not compiled
+    dev_t_type = dtypes.int32 if use_tpu else dtypes.int64
+
+    @function.Defun(*_Dtypes(fwd_sig))
+    def Fwd(*args):
+      (theta, state0, inputs) = _Pack(args, fwd_sig)
+      state1, extras = self._cell_fn(theta, state0, inputs)
+      assert not function.get_extra_args(), (
+          'cell_fn is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(state1, self._state)
+      _AssertIsCompatible(extras, self._extras)
+      return _Flatten([state1, extras])
+
+    # Wraps cell_fn in a TF Function as a for-loop's body.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  state0: the previous recurrent state.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state is also stashed into
+    #    acc_state.
+    #  acc_extras: Each timestep's computed extras is stashed into acc_extras
+    fwdloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(fwdloop_sig))
+    def ForwardLoopBody(*args):
+      """The body of forward loop."""
+      t, dev_t = args[0], args[1]
+      (theta, state0, inputs, acc_state, acc_extras) = _Pack(
+          args[2:], fwdloop_sig)
+      inputs_t = _Index(inputs, t)  # external input at time step t.
+      fwd = Fwd(*_Flatten([theta, state0, inputs_t]))
+      state1, extras = _Pack(fwd, [self._state, self._extras])
+      # Saves state1 and extras in their accumulators.
+      acc_state = _Update(acc_state, state1, dev_t)
+      acc_extras = _Update(acc_extras, extras, dev_t)
+
+      return [math_ops.add(dev_t, 1)] + _Flatten(
+          [theta, state1, inputs, acc_state, acc_extras])
+
+    def Grad(op, *args):
+      """The python grad function for the Forward function."""
+
+      # NOTE: tf.gradient backprops None for int32/int64 while zeros
+      # for float32/float64. For consistency, we always backprop
+      # zeros.
+      args = list(args)
+      for i, dy in enumerate(args):
+        if dy is None:
+          args[i] = array_ops.zeros_like(op.outputs[i])
+      # TODO(drpng): getting the extra state here?
+      op_inputs = [x for x in op.inputs]
+      op_struct = [
+          self._theta, self._state, self._inputs, self._max_input_length,
+          self._extras
+      ]
+      (theta, state0, inputs, max_input_length, _) = _Pack(op_inputs, op_struct)
+      # acc_state and acc_extras are computed by the Forward pass and
+      # needed by the Backward pass.
+      acc_state, _, acc_extras = _Pack([x for x in op.outputs],
+                                       [self._state, self._state, self._extras])
+
+      # Forward computes acc_state, the final state and
+      # acc_extras. tf.gradients gives us their gradients w.r.t. the
+      # final loss. Because acc_extras are not exposed by Compute(),
+      # it has no gradients w.r.t. the final loss (i.e., by
+      # construction, it must be zeros).
+      d_acc_state, d_state1, _ = _Pack(args,
+                                       [self._state, self._state, self._extras])
+      return Backward(*_Flatten([
+          theta, state0, inputs, max_input_length, acc_state, acc_extras,
+          d_acc_state, d_state1
+      ]))
+
+    # Forward calls ForwardLoopBody n times. Each time computes one
+    # time step of the recurrent net.
+    forward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._extras
+    ]
+
+    @function.Defun(
+        *_Dtypes(forward_sig), python_grad_func=Grad, noinline=noinline)
+    def Forward(*args):
+      """Forward pass of the recurrent net."""
+      theta, state0, inputs, max_input_length, extras = _Pack(args, forward_sig)
+
+      slen_dim = _SeqLenDim(inputs)
+
+      # Creates accumulators for state0 and extras.
+      acc_state = _EmptyAcc(slen_dim, state0)
+      acc_extras = _EmptyAcc(slen_dim, extras)
+
+      dev_t = array_ops.constant(0, dtype=dev_t_type)
+      run = functional_ops.For(
+          start=0,
+          limit=max_input_length,
+          delta=1,
+          inputs=[dev_t] + _Flatten(
+              [theta, state0, inputs, acc_state, acc_extras]),
+          body=ForwardLoopBody,
+          rewrite_with_while=compiled)
+      _, state1, _, acc_state, acc_extras = _Pack(
+          run[1:],
+          [self._theta, self._state, self._inputs, self._state, self._extras])
+
+      return _Flatten([acc_state, state1, acc_extras])
+
+    # The per-step backward computes:
+    #    d_theta, d_state0, d_inputs = cell_grad(
+    #        theta, state0, inputs, extras, d_state1)
+    # where d_state1 is the backprop-ed gradient for state1, and
+    # extras is the computed by the forward step to facilitate the
+    # backward step.
+    bak_sig = [
+        self._theta, self._state, self._inputs, self._extras, self._state
+    ]
+
+    @function.Defun(*_Dtypes(bak_sig))
+    def Bak(*args):
+      """Backward step."""
+      (theta, state0, inputs, extras, d_state1) = _Pack(args, bak_sig)
+      (dtheta, dstate0, dinputs) = self._cell_grad(theta, state0, inputs,
+                                                   extras, d_state1)
+      assert not function.get_extra_args(), (
+          'cell_grad is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(dtheta, self._theta)
+      _AssertIsCompatible(dstate0, self._state)
+      _AssertIsCompatible(dinputs, self._inputs)
+      return _Flatten(
+          _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                      [dtheta, dstate0, dinputs]))
+
+    # Define defuns used by a functional_ops.If in BackwardLoopBody.
+    state_if_sig = [self._state, self._state]
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnOrigState0(*args):
+      """Returns original state0 from inputs."""
+      (_, orig_state0) = _Pack(args, state_if_sig)
+      return nest.flatten(orig_state0)
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnAccState(*args):
+      """Returns acc_state[t-1] from inputs."""
+      (acc_state, _) = _Pack(args, state_if_sig)
+      return nest.flatten(acc_state)
+
+    # Wraps cell_grad gradient function in a TF Function as a
+    # for-loop's body for the Backward pass.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  state0: the initial state for the entire backward loop.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state was stashed into
+    #    acc_state by the Forward pass.
+    #  acc_extras: Each timestep's computed extras was stashed into
+    #    acc_extras by the Forward pass.
+    #  d_theta: All timestep's gradient for theta is accumulated (added) into
+    #      d_theta.
+    #  d_state1: The backprop-ed gradient for the new stated computed by
+    #      timestep t.
+    #  d_inputs: d_inputs[t, :] is populated by the backward time step t.
+    #  d_acc_state: The backprop-ed gradient for acc_state.
+    bakloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras,
+        self._theta, self._state, self._inputs, self._state
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(bakloop_sig))
+    def BackwardLoopBody(*args):
+      """Backward loop body function."""
+      t, dev_t = args[0], args[1]
+      (theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+       d_inputs, d_acc_state) = _Pack(args[2:], bakloop_sig)
+
+      # The input recurrent state for time step t is previous time step's
+      # output, or the original state0 when on time step 0.
+      state_from_acc = _Index(acc_state, math_ops.maximum(0, t - 1))
+      state0 = functional_ops.If(
+          math_ops.equal(t, array_ops.constant(0, dtypes.int32)),
+          _Flatten([state_from_acc, orig_state0]), ReturnOrigState0,
+          ReturnAccState)
+      state0 = nest.pack_sequence_as(orig_state0, state0)
+
+      # The external inputs for time step t.
+      inputs_t = _Index(inputs, t)
+      # The extras for time step t.
+      extras_t = _Index(acc_extras, t)
+
+      d_state1 = _Add(_Index(d_acc_state, t), d_state1)
+      (d_theta_t, d_state0, d_inputs_t) = _Pack(
+          Bak(*_Flatten([theta, state0, inputs_t, extras_t, d_state1])),
+          [self._theta, self._state, self._inputs])
+      d_theta = _Add(d_theta, d_theta_t)
+      d_inputs = _Update(d_inputs, d_inputs_t, dev_t)
+      return [math_ops.subtract(dev_t, 1)] + _Flatten([
+          theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+          d_inputs, d_acc_state
+      ])
+
+    # Backward calls BackwardLoopBody n times.  Each time computes the backprop
+    # for one time step of the recurrent net.
+    backward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._state, self._extras, self._state, self._state
+    ]
+
+    @function.Defun(*_Dtypes(backward_sig), noinline=noinline)
+    def Backward(*args):
+      """Backward pass for the recurrent net."""
+      # theta, state0, inputs are Forward's inputs.
+      # acc_state is the accumulated 1st output of Forward.
+      # acc_extras is the accumulated 2nd output of Forward.
+      # d_acc_state is the gradient for acc_state.
+      # d_state1 is the gradient for the final state computed by Forward.
+      (theta, state0, inputs, max_input_length, acc_state, acc_extras,
+       d_acc_state, d_state1) = _Pack(args, backward_sig)
+
+      # Accumulators for gradients.
+      d_theta = _EmptyLike(theta)
+      d_inputs = _EmptyLike(inputs)
+
+      # Loop backwards. Note the loop's limit is open-ended, so goes through
+      # t=0.
+      t = max_input_length - 1
+      dev_t = math_ops.to_int32(t) if use_tpu else math_ops.to_int64(t)
+      run = functional_ops.For(
+          start=t,
+          limit=-1,
+          delta=-1,
+          inputs=[dev_t] + _Flatten([
+              theta, state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+              d_inputs, d_acc_state
+          ]),
+          body=BackwardLoopBody,
+          rewrite_with_while=compiled)
+
+      (theta, state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+       d_inputs, d_acc_state) = _Pack(run[1:], bakloop_sig)
+
+      d_max_input_length = array_ops.constant(0, dtype=max_input_length.dtype)
+      return _Flatten(
+          [d_theta, d_state0, d_inputs, d_max_input_length, acc_extras])
+
+    self._forward = Forward
+
+  def _MaybeComputeMaxInputLength(self, inputs, max_input_length):
+    if max_input_length is not None:
+      return max_input_length
+    return math_ops.reduce_max(array_ops.shape(nest.flatten(inputs)[0])[0])
+
+  def Compute(self):
+    return _Pack(
+        self._forward(*_Flatten([
+            self._theta, self._state, self._inputs, self._max_input_length,
+            self._extras
+        ])), [self._state, self._state, self._extras])[:2]
+
+
+def _GetCellGrad(cell_fn, cell_grad):
+  """Returns the gradient function for cell_fn.
+
+  Args:
+    cell_fn: The recurrent neural net's cell function.
+    cell_grad: If not None, cell_fn's gradient function.
+
+  Returns:
+    Returns cell_grad if not None. Otherwise, assume cell_fn is a python
+    function representing the recurrent neural net's cell function, i.e.,
+      cell_fn: (theta, state0, inputs) -> (state1, extra)
+    returns its default gradient python function, i.e.,
+      cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+  """
+
+  if cell_grad:
+    return cell_grad
+
+  def CellGrad(theta, state0, inputs, extras, dstate1):
+    """Default gradient function for cell_fn."""
+    # NOTE: The default grad function recomputes the forward
+    # function and does not take advantage of 'extras' returned by
+    # the forward function.
+    del extras
+    state1, extras = cell_fn(theta, state0, inputs)
+    ys = _Flatten([state1])
+    xs = _Flatten([theta, state0, inputs])
+    grad_ys = _Flatten([dstate1])
+    grads = gradients_impl.gradients(ys=ys, xs=xs, grad_ys=grad_ys)
+    return _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                       _Pack(grads, [theta, state0, inputs]))
+
+  return CellGrad
+
+
+def _IsSingleTimeStep(inputs, max_input_length):
+  """Returns True only if the time dimension of inputs is 1."""
+  if not isinstance(max_input_length, ops.Tensor):
+    return max_input_length == 1
+  for x in nest.flatten(inputs):
+    if x.shape.dims is None or x.shape[0].value != 1:
+      return False
+  return True
+
+
+def Recurrent(theta,
+              state0,
+              inputs,
+              cell_fn,
+              cell_grad=None,
+              extras=None,
+              max_input_length=None,
+              use_tpu=False):
+  """Compute a recurrent neural net.
+
+  Roughly, Recurrent() computes the following:
+    state = state0
+    for t in inputs' sequence length:
+      state = cell_fn(theta, state, inputs[t, :])
+      accumulate_state[t, :] = state
+    return accumulate_state, state
+
+  theta, state, inputs are all structures of tensors.
+
+  inputs[t, :] means taking a slice out from every tensor in the inputs.
+
+  accumulate_state[t, :] = state means that we stash every tensor in
+  'state' into a slice of the corresponding tensor in
+  accumulate_state.
+
+  cell_fn is a python callable computing (building up a TensorFlow
+  graph) the recurrent neural network's one forward step. Two calls of
+  cell_fn must describe two identical computations.
+
+  By construction, Recurrent()'s backward computation does not access
+  any intermediate values computed by cell_fn during forward
+  computation. We may extend Recurrent() to support that by taking a
+  customized backward function of cell_fn.
+
+  Args:
+    theta: weights. A structure of tensors.
+    state0: initial state. A structure of tensors.
+    inputs: inputs. A structure of tensors.
+    cell_fn: A python function, which computes:
+      state1, extras = cell_fn(theta, state0, inputs[t, :])
+    cell_grad: A python function which computes:
+      dtheta, dstate0, dinputs[t, :] = cell_grad(
+        theta, state0, inputs[t, :], extras, dstate1)
+    extras: A structure of tensors. The 2nd return value of every
+      invocation of cell_fn is a structure of tensors with matching keys
+      and shapes of  this `extras`.
+    max_input_length: maximum length of effective input. This is used to
+      truncate the computation if the inputs have been allocated to a
+      larger size. A scalar tensor.
+    use_tpu: whether or not we are on TPU.
+
+  Returns:
+    accumulate_state and the final state.
+  """
+  if cell_grad is None and _IsSingleTimeStep(inputs, max_input_length):
+    # The seqlen length is staticly known as 1. Hence, we just need to
+    # call cell_fn once without putting it into a loop.
+    inputs = nest.map_structure(lambda x: array_ops.squeeze(x, axis=0), inputs)
+    state1, _ = cell_fn(theta, state0, inputs)
+    acc_state = nest.map_structure(lambda x: array_ops.expand_dims(x, axis=0),
+                                   state1)
+    return acc_state, state1
+
+  # If cell_grad is not given, derives the gradient function from
+  # cell_fn.
+  cell_grad = _GetCellGrad(cell_fn, cell_grad)
+
+  if extras is None:
+    # Derives 'extras' so that we can allocate extras' accumulator.
+    _, extras = cell_fn(theta, state0, _Index(inputs, 0))
+    extras = nest.map_structure(array_ops.zeros_like, extras)
+  else:
+    _, actual = cell_fn(theta, state0, _Index(inputs, 0))
+    _AssertIsCompatible(extras, actual)
+
+  return _Recurrent(
+      cell_fn=cell_fn,
+      cell_grad=cell_grad,
+      theta=theta,
+      state0=state0,
+      inputs=inputs,
+      max_input_length=max_input_length,
+      extras=extras,
+      use_tpu=use_tpu).Compute()
diff --git a/tensorflow/contrib/bayesflow/python/ops/variable_utils.py b/tensorflow/contrib/recurrent/python/recurrent_api.py
similarity index 66%
rename from tensorflow/contrib/bayesflow/python/ops/variable_utils.py
rename to tensorflow/contrib/recurrent/python/recurrent_api.py
index eadf6f4d5fa1c776e2c71c66c4b64b8f5ac98359..f1c97927dfe4c212df00581d6da0988ff76bae42 100644
--- a/tensorflow/contrib/bayesflow/python/ops/variable_utils.py
+++ b/tensorflow/contrib/recurrent/python/recurrent_api.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility functions related to managing `tf.Variable`s."""
+"""Recurrent computations library."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.variable_utils_impl import *  # pylint: disable=wildcard-import,unused-wildcard-import,g-importing-member
-from tensorflow.python.util import all_util
+# pylint: disable=unused-import
+from tensorflow.contrib.recurrent.python.ops.functional_rnn import bidirectional_functional_rnn
+from tensorflow.contrib.recurrent.python.ops.functional_rnn import functional_rnn
+from tensorflow.contrib.recurrent.python.ops.recurrent import Recurrent
+# pylint: enable=unused-import
 
-_allowed_symbols = [
-    "externalize_variables_as_args",
-]
-
-all_util.remove_undocumented(__name__, _allowed_symbols)
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/reduce_slice_ops/BUILD b/tensorflow/contrib/reduce_slice_ops/BUILD
index b31f4488f5882a0bc4e419668dba5da72d69b7fe..02b3d66e4612d0f7eb29959d6c9f8472379fe16c 100644
--- a/tensorflow/contrib/reduce_slice_ops/BUILD
+++ b/tensorflow/contrib/reduce_slice_ops/BUILD
@@ -101,15 +101,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 27f0a7f58f78135f1d73ae04bd1e76ef496fa549..3aa8a14f44f38de51ed61f0b894cfd77ea9329f8 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -38,7 +38,6 @@ py_test(
     size = "small",
     srcs = ["python/ops/remote_fused_graph_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":remote_fused_graph_ops_py",
         "//tensorflow/core:protos_all_py",
@@ -48,15 +47,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
index f0ecc8b85a5db93075d3cf0b55e7df95732bcf94..48345d7030bea431152bbed934af9f500f2c15c5 100644
--- a/tensorflow/contrib/resampler/BUILD
+++ b/tensorflow/contrib/resampler/BUILD
@@ -85,14 +85,3 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 7e5e35d0b55c97946c022e55180765d982eaa87a..43c0f7595590802aa80e1012967d377a6ab83d29 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -321,19 +321,6 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "tools/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_gen_op_libs(
     op_lib_names = [
         "lstm_ops",
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index 03006dab323a7c6dc83d9a17c035ef705f7b0366..45d22b739b8c597c7ebda85968aa44cd599a798c 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -26,9 +26,9 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -41,9 +41,8 @@ void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
                                      T alpha, const T* a, int lda, const T* b,
                                      int ldb, T beta, T* c, int ldc) {
 #if GOOGLE_CUDA
-  perftools::gputools::blas::Transpose trans[] = {
-      perftools::gputools::blas::Transpose::kNoTranspose,
-      perftools::gputools::blas::Transpose::kTranspose};
+  se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
+                                 se::blas::Transpose::kTranspose};
 
   auto a_ptr = AsDeviceMemory(a);
   auto b_ptr = AsDeviceMemory(b);
diff --git a/tensorflow/contrib/rnn/ops/gru_ops.cc b/tensorflow/contrib/rnn/ops/gru_ops.cc
index e91d1e8a80ed252e5f89e116fb0a325be67e3941..9c8e40851a0cc5bd7f37f94a62ecdef7248660c1 100644
--- a/tensorflow/contrib/rnn/ops/gru_ops.cc
+++ b/tensorflow/contrib/rnn/ops/gru_ops.cc
@@ -69,7 +69,7 @@ Element-wise dot product of a and b is represented by ab
 Element-wise dot product is represented by \circ
 Matrix multiplication is represented by *
 
-Baises are initialized with :
+Biases are initialized with :
 `b_ru` - constant_initializer(1.0)
 `b_c` - constant_initializer(0.0)
 
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 0e62b315b61cb3ceeb5cfd33bf5102a71abef83b..d41fc0b3ac1cee4eacc88cb0f41df1f9ee59e7c3 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -187,6 +187,8 @@ class RNNCellTest(test.TestCase):
               ],
               state_is_tuple=False)
           self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
           g, out_m = cell(x, m)
           # Layer infers the input type.
           self.assertEqual(cell.dtype, dtype.name)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 57521c6a9ba0b2d66639017b09c541e270276323..ba4933ddf793c58d00ae28a54eb21410f41e2e16 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -307,6 +307,21 @@ class LSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  def testDType(self):
+    # Test case for GitHub issue 16228
+    # Not passing dtype in constructor results in default float32
+    lstm = rnn_cell.LSTMCell(10)
+    input_tensor = array_ops.ones([10, 50])
+    lstm.build(input_tensor.get_shape())
+    self.assertEqual(lstm._bias.dtype, dtypes.float32_ref)
+
+    # Explicitly pass dtype in constructor
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
+      lstm = rnn_cell.LSTMCell(10, dtype=dtype)
+      input_tensor = array_ops.ones([10, 50])
+      lstm.build(input_tensor.get_shape())
+      self.assertEqual(lstm._bias.dtype, dtype._as_ref)
+
   def testNoProjNoSharding(self):
     num_units = 3
     input_size = 5
@@ -869,7 +884,7 @@ class LSTMTest(test.TestCase):
     num_proj = 4
     max_length = 8
     sequence_length = [4, 6]
-    in_graph_mode = context.in_graph_mode()
+    in_graph_mode = not context.executing_eagerly()
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
@@ -934,8 +949,7 @@ class LSTMTest(test.TestCase):
       if in_graph_mode:
         self.assertAllEqual(outputs_static, outputs_dynamic)
       else:
-        self.assertAllEqual(
-            array_ops.stack(outputs_static).numpy(), outputs_dynamic.numpy())
+        self.assertAllEqual(array_ops.stack(outputs_static), outputs_dynamic)
       self.assertAllEqual(np.hstack(state_static), np.hstack(state_dynamic))
 
   @test_util.run_in_graph_and_eager_modes()
@@ -946,7 +960,7 @@ class LSTMTest(test.TestCase):
     num_proj = 4
     max_length = 8
     sequence_length = [4, 6]
-    in_graph_mode = context.in_graph_mode()
+    in_graph_mode = not context.executing_eagerly()
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
@@ -1022,10 +1036,9 @@ class LSTMTest(test.TestCase):
       if in_graph_mode:
         self.assertAllEqual(outputs_static, outputs_dynamic)
       else:
-        self.assertAllEqual(
-            array_ops.stack(outputs_static).numpy(), outputs_dynamic.numpy())
-        state_static = [s.numpy() for s in nest.flatten(state_static)]
-        state_dynamic = [s.numpy() for s in nest.flatten(state_dynamic)]
+        self.assertAllEqual(array_ops.stack(outputs_static), outputs_dynamic)
+        state_static = nest.flatten(state_static)
+        state_dynamic = nest.flatten(state_dynamic)
       self.assertAllEqual(np.hstack(state_static), np.hstack(state_dynamic))
 
   def _testDynamicEquivalentToStaticRNN(self, use_sequence_length):
@@ -1043,7 +1056,7 @@ class LSTMTest(test.TestCase):
     else:
       sequence_length = None
 
-    in_graph_mode = context.in_graph_mode()
+    in_graph_mode = not context.executing_eagerly()
 
     # TODO(b/68017812): Eager ignores operation seeds, so we need to create a
     # single cell and reuse it across the static and dynamic RNNs. Remove this
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index 7957edf68cc8a1461fccfc2de93ad5250dc9fdb5..ffd24218944e150a32b1b915288ab1df90afb45c 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -54,7 +54,7 @@ def blocks_match(sess, use_peephole):
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)
 
   with variable_scope.variable_scope("test", initializer=initializer):
-    # magic naming so that the cells pick up these variables and resuse them
+    # magic naming so that the cells pick up these variables and reuse them
     if use_peephole:
       wci = variable_scope.get_variable(
           "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtypes.float32)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 7b883ebc5d7756f1bdf445f900500a4b89e6cffd..c7d85862f65674f60c9f63fd5c649afa75b95cc0 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -455,8 +455,8 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(np.concatenate(res[1], axis=1), expected_state)
 
   def testAttentionCellWrapperFailures(self):
-    with self.assertRaisesRegexp(TypeError,
-                                 "The parameter cell is not RNNCell."):
+    with self.assertRaisesRegexp(
+        TypeError, rnn_cell_impl.ASSERT_LIKE_RNNCELL_ERROR_REGEXP):
       contrib_rnn_cell.AttentionCellWrapper(None, 0)
 
     num_units = 8
@@ -842,12 +842,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[6.450831e-04, 4.697885e-04], [9.862894e-05, 7.212213e-04],
-           [4.401947e-04, 9.143004e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[4.621217e-04, 3.365449e-04], [7.438179e-05, 5.439147e-04],
-           [3.347936e-04, 6.953785e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -878,7 +878,6 @@ class RNNCellTest(test.TestCase):
       shape = [2, 1]
       filter_size = [3]
       num_features = 1
-      batch_size = 2
       expected_state_c = np.array(
           [[[1.4375670191], [1.4375670191]], [[2.7542609292], [2.7542609292]]],
           dtype=np.float32)
@@ -912,7 +911,6 @@ class RNNCellTest(test.TestCase):
       shape = [2, 2, 1]
       filter_size = [3, 3]
       num_features = 1
-      batch_size = 2
       expected_state_c = np.array(
           [[[[1.4375670191], [1.4375670191]], [[1.4375670191], [1.4375670191]]],
            [[[2.7542609292], [2.7542609292]], [[2.7542609292], [2.7542609292]]
@@ -954,7 +952,6 @@ class RNNCellTest(test.TestCase):
       shape = [2, 2, 2, 1]
       filter_size = [3, 3, 3]
       num_features = 1
-      batch_size = 2
       expected_state_c = np.array(
           [[[[[1.4375670191], [1.4375670191]], [[1.4375670191], [1.4375670191]]
             ], [[[1.4375670191], [1.4375670191]], [[1.4375670191],
@@ -1031,57 +1028,92 @@ class RNNCellTest(test.TestCase):
     num_units = 4
     number_of_groups = 1
 
-    with self.test_session() as sess:
-      with variable_scope.variable_scope(
-          "root1", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.ones([batch_size, num_units])
-        # When number_of_groups = 1, G-LSTM is equivalent to regular LSTM
-        gcell = contrib_rnn_cell.GLSTMCell(
-            num_units=num_units, number_of_groups=number_of_groups)
-        cell = rnn_cell.LSTMCell(num_units=num_units)
-        self.assertTrue(isinstance(gcell.state_size, tuple))
-        zero_state = gcell.zero_state(
-            batch_size=batch_size, dtype=dtypes.float32)
-        gh, gs = gcell(x, zero_state)
-        h, g = cell(x, zero_state)
+    # Try with input dimension equal to num_units or not.
+    for num_inputs in [num_units, num_units + number_of_groups]:
+      with self.test_session() as sess:
+        with variable_scope.variable_scope(
+            "root1_%d" % num_inputs,
+            initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.ones([batch_size, num_inputs])
+          # When number_of_groups = 1, G-LSTM is equivalent to regular LSTM
+          gcell = contrib_rnn_cell.GLSTMCell(
+              num_units=num_units, number_of_groups=number_of_groups)
+          cell = rnn_cell.LSTMCell(num_units=num_units)
+          self.assertTrue(isinstance(gcell.state_size, tuple))
+          zero_state = gcell.zero_state(
+              batch_size=batch_size, dtype=dtypes.float32)
+          gh, gs = gcell(x, zero_state)
+          h, g = cell(x, zero_state)
 
-        sess.run([variables.global_variables_initializer()])
-        glstm_result = sess.run([gh, gs])
-        lstm_result = sess.run([h, g])
+          sess.run([variables.global_variables_initializer()])
+          glstm_result = sess.run([gh, gs])
+          lstm_result = sess.run([h, g])
 
-        self.assertAllClose(glstm_result[0], lstm_result[0], 1e-5)
-        self.assertAllClose(glstm_result[1], lstm_result[1], 1e-5)
+          self.assertAllClose(glstm_result[0], lstm_result[0], 1e-5)
+          self.assertAllClose(glstm_result[1], lstm_result[1], 1e-5)
 
     # Test that G-LSTM subgroup act like corresponding sub-LSTMs
     batch_size = 2
     num_units = 4
     number_of_groups = 2
 
-    with self.test_session() as sess:
+    # Try with num_inputs equal to or not equal to num_units.
+    for num_inputs in [num_units, num_units + number_of_groups]:
+      with self.test_session() as sess:
+        with variable_scope.variable_scope(
+            "root2_%d" % num_inputs,
+            initializer=init_ops.constant_initializer(0.5)):
+          # input for G-LSTM with 2 groups
+          glstm_input = array_ops.ones([batch_size, num_inputs])
+          gcell = contrib_rnn_cell.GLSTMCell(
+              num_units=num_units, number_of_groups=number_of_groups)
+          gcell_zero_state = gcell.zero_state(
+              batch_size=batch_size, dtype=dtypes.float32)
+          gh, gs = gcell(glstm_input, gcell_zero_state)
+
+          # input for LSTM cell simulating single G-LSTM group
+          lstm_input = array_ops.ones(
+              [batch_size, num_inputs / number_of_groups])
+          # note division by number_of_groups. This cell one simulates G-LSTM
+          # group
+          cell = rnn_cell.LSTMCell(num_units=int(num_units / number_of_groups))
+          cell_zero_state = cell.zero_state(
+              batch_size=batch_size, dtype=dtypes.float32)
+          h, g = cell(lstm_input, cell_zero_state)
+
+          sess.run([variables.global_variables_initializer()])
+          [gh_res, h_res] = sess.run([gh, h])
+          self.assertAllClose(gh_res[:, 0:int(num_units / number_of_groups)],
+                              h_res, 1e-5)
+          self.assertAllClose(gh_res[:, int(num_units / number_of_groups):],
+                              h_res, 1e-5)
+
+  def testGLSTMCellFailure(self):
+    batch_size = 2
+    num_units = 4
+    number_of_groups = 2
+    with self.test_session():
       with variable_scope.variable_scope(
-          "root2", initializer=init_ops.constant_initializer(0.5)):
-        # input for G-LSTM with 2 groups
-        glstm_input = array_ops.ones([batch_size, num_units])
+          "glstm_failure", initializer=init_ops.constant_initializer(0.5)):
         gcell = contrib_rnn_cell.GLSTMCell(
             num_units=num_units, number_of_groups=number_of_groups)
         gcell_zero_state = gcell.zero_state(
             batch_size=batch_size, dtype=dtypes.float32)
-        gh, gs = gcell(glstm_input, gcell_zero_state)
 
-        # input for LSTM cell simulating single G-LSTM group
-        lstm_input = array_ops.ones([batch_size, num_units / number_of_groups])
-        # note division by number_of_groups. This cell one simulates G-LSTM group
-        cell = rnn_cell.LSTMCell(num_units=int(num_units / number_of_groups))
-        cell_zero_state = cell.zero_state(
-            batch_size=batch_size, dtype=dtypes.float32)
-        h, g = cell(lstm_input, cell_zero_state)
+        # Try an input with statically-unknown innermost dimension.
+        glstm_input = array_ops.placeholder(
+            dtypes.float32, shape=[batch_size, None])
+        with self.assertRaisesRegexp(ValueError,
+                                     "input size must be statically known"):
+          gcell(glstm_input, gcell_zero_state)
 
-        sess.run([variables.global_variables_initializer()])
-        [gh_res, h_res] = sess.run([gh, h])
-        self.assertAllClose(gh_res[:, 0:int(num_units / number_of_groups)],
-                            h_res, 1e-5)
-        self.assertAllClose(gh_res[:, int(num_units / number_of_groups):],
-                            h_res, 1e-5)
+        # Try an input whose innermost dimension isn't divisible into groups.
+        glstm_input = array_ops.placeholder(
+            dtypes.float32, shape=[batch_size, 3])
+        with self.assertRaisesRegexp(
+            ValueError,
+            r"input size \(3\) must be divisible by number_of_groups \(2\)"):
+          gcell(glstm_input, gcell_zero_state)
 
 
 class LayerNormBasicLSTMCellTest(test.TestCase):
@@ -1168,7 +1200,7 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         h1 = array_ops.zeros([1, 2])
         state1 = rnn_cell.LSTMStateTuple(c1, h1)
         state = (state0, state1)
-        single_cell = lambda: contrib_rnn_cell.LayerNormBasicLSTMCell(2, layer_norm=False)
+        single_cell = lambda: contrib_rnn_cell.LayerNormBasicLSTMCell(2, layer_norm=False)  # pylint: disable=line-too-long
         cell = rnn_cell.MultiRNNCell([single_cell() for _ in range(2)])
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
@@ -1200,7 +1232,7 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertAllClose(expected_state1_h, actual_state1_h, 1e-5)
 
       with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)) as vs:
+          "other", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros(
             [1, 3])  # Test BasicLSTMCell with input_size != num_units.
         c = array_ops.zeros([1, 2])
@@ -1549,7 +1581,7 @@ class WeightNormLSTMCellTest(test.TestCase):
   """Compared cell output with pre-calculated values."""
 
   def _cell_output(self, cell):
-    """Calculate cell output"""
+    """Calculates cell output."""
 
     with self.test_session() as sess:
       init = init_ops.constant_initializer(0.5)
@@ -1576,7 +1608,7 @@ class WeightNormLSTMCellTest(test.TestCase):
     return actual_state_c, actual_state_h
 
   def testBasicCell(self):
-    """Tests cell w/o peepholes and w/o normalisation"""
+    """Tests cell w/o peepholes and w/o normalisation."""
 
     def cell():
       return contrib_rnn_cell.WeightNormLSTMCell(2,
@@ -1592,7 +1624,7 @@ class WeightNormLSTMCellTest(test.TestCase):
     self.assertAllClose(expected_h, actual_h, 1e-5)
 
   def testNonbasicCell(self):
-    """Tests cell with peepholes and w/o normalisation"""
+    """Tests cell with peepholes and w/o normalisation."""
 
     def cell():
       return contrib_rnn_cell.WeightNormLSTMCell(2,
@@ -1607,9 +1639,8 @@ class WeightNormLSTMCellTest(test.TestCase):
     self.assertAllClose(expected_c, actual_c, 1e-5)
     self.assertAllClose(expected_h, actual_h, 1e-5)
 
-
   def testBasicCellWithNorm(self):
-    """Tests cell w/o peepholes and with normalisation"""
+    """Tests cell w/o peepholes and with normalisation."""
 
     def cell():
       return contrib_rnn_cell.WeightNormLSTMCell(2,
@@ -1625,7 +1656,7 @@ class WeightNormLSTMCellTest(test.TestCase):
     self.assertAllClose(expected_h, actual_h, 1e-5)
 
   def testNonBasicCellWithNorm(self):
-    """Tests cell with peepholes and with normalisation"""
+    """Tests cell with peepholes and with normalisation."""
 
     def cell():
       return contrib_rnn_cell.WeightNormLSTMCell(2,
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
index 8109ebc718353300f94536c5d7ae3332da584a1d..645f82624bf67b96ffc8520289b293b45f0e69e2 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
@@ -40,7 +40,6 @@ from tensorflow.python.util import nest
 
 # pylint: disable=protected-access,invalid-name
 RNNCell = rnn_cell_impl.RNNCell
-_like_rnncell = rnn_cell_impl._like_rnncell
 _WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME
 _BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME
 # pylint: enable=protected-access,invalid-name
@@ -221,8 +220,7 @@ class EmbeddingWrapper(RNNCell):
       ValueError: if embedding_classes is not positive.
     """
     super(EmbeddingWrapper, self).__init__(_reuse=reuse)
-    if not _like_rnncell(cell):
-      raise TypeError("The parameter cell is not RNNCell.")
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
     if embedding_classes <= 0 or embedding_size <= 0:
       raise ValueError("Both embedding_classes and embedding_size must be > 0: "
                        "%d, %d." % (embedding_classes, embedding_size))
@@ -301,8 +299,7 @@ class InputProjectionWrapper(RNNCell):
     super(InputProjectionWrapper, self).__init__(_reuse=reuse)
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
-    if not _like_rnncell(cell):
-      raise TypeError("The parameter cell is not RNNCell.")
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
     self._cell = cell
     self._num_proj = num_proj
     self._activation = activation
@@ -356,8 +353,7 @@ class OutputProjectionWrapper(RNNCell):
       ValueError: if output_size is not positive.
     """
     super(OutputProjectionWrapper, self).__init__(_reuse=reuse)
-    if not _like_rnncell(cell):
-      raise TypeError("The parameter cell is not RNNCell.")
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
     if output_size < 1:
       raise ValueError("Parameter output_size must be > 0: %d." % output_size)
     self._cell = cell
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 4eb4fbcd92f0d7cb3bee712862c8950a1971b632..9e61fc54d10c1b75786450060e428c73974760a7 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -480,8 +480,7 @@ class LSTMBlockWrapper(base_layer.Layer):
     """Run this LSTM on inputs, starting from the given state.
 
     Args:
-      inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`
-        or a list of `time_len` tensors of shape `[batch_size, input_size]`.
+      inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`.
       initial_state: a tuple `(initial_cell_state, initial_output)` with tensors
         of shape `[batch_size, self._num_units]`. If this is not provided, the
         cell is expected to create a zero initial state of type `dtype`.
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index a6c2d9cdbb2b6f61d59960f708000e945c6115e9..b12e2cd5eddc3f8abdba62781692673a40e41d9b 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -534,7 +534,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       initializer: (optional) The initializer to use for the weight and
         projection matrices, default None.
       num_unit_shards: (optional) int, default 1, How to split the weight
-        matrix. If > 1,the weight matrix is stored across num_unit_shards.
+        matrix. If > 1, the weight matrix is stored across num_unit_shards.
       forget_bias: (optional) float, default 1.0, The initial bias of the
         forget gates, used to reduce the scale of forgetting at the beginning
         of the training.
@@ -993,7 +993,7 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       initializer: (optional) The initializer to use for the weight and
         projection matrices, default None.
       num_unit_shards: (optional) int, default 1, How to split the weight
-        matrix. If > 1,the weight matrix is stored across num_unit_shards.
+        matrix. If > 1, the weight matrix is stored across num_unit_shards.
       forget_bias: (optional) float, default 1.0, The initial bias of the
         forget gates, used to reduce the scale of forgetting at the beginning
         of the training.
@@ -1143,8 +1143,7 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
           `state_is_tuple` is `False` or if attn_length is zero or less.
     """
     super(AttentionCellWrapper, self).__init__(_reuse=reuse)
-    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
-      raise TypeError("The parameter cell is not RNNCell.")
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
     if nest.is_sequence(cell.state_size) and not state_is_tuple:
       raise ValueError(
           "Cell returns tuple of states, but the flag "
@@ -2059,16 +2058,19 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
                initializers=None,
                name="conv_lstm_cell"):
     """Construct ConvLSTMCell.
+
     Args:
       conv_ndims: Convolution dimensionality (1, 2 or 3).
       input_shape: Shape of the input as int tuple, excluding the batch size.
       output_channels: int, number of output channels of the conv LSTM.
       kernel_shape: Shape of kernel as in tuple (of size 1,2 or 3).
-      use_bias: Use bias in convolutions.
+      use_bias: (bool) Use bias in convolutions.
       skip_connection: If set to `True`, concatenate the input to the
-      output of the conv LSTM. Default: `False`.
+        output of the conv LSTM. Default: `False`.
       forget_bias: Forget bias.
+      initializers: Unused.
       name: Name of the module.
+
     Raises:
       ValueError: If `skip_connection` is `True` and stride is different from 1
         or if `input_shape` is incompatible with `conv_ndims`.
@@ -2131,7 +2133,7 @@ class Conv1DLSTMCell(ConvLSTMCell):
 
   def __init__(self, name="conv_1d_lstm_cell", **kwargs):
     """Construct Conv1DLSTM. See `ConvLSTMCell` for more details."""
-    super(Conv1DLSTMCell, self).__init__(conv_ndims=1, **kwargs)
+    super(Conv1DLSTMCell, self).__init__(conv_ndims=1, name=name, **kwargs)
 
 
 class Conv2DLSTMCell(ConvLSTMCell):
@@ -2142,7 +2144,7 @@ class Conv2DLSTMCell(ConvLSTMCell):
 
   def __init__(self, name="conv_2d_lstm_cell", **kwargs):
     """Construct Conv2DLSTM. See `ConvLSTMCell` for more details."""
-    super(Conv2DLSTMCell, self).__init__(conv_ndims=2, **kwargs)
+    super(Conv2DLSTMCell, self).__init__(conv_ndims=2, name=name, **kwargs)
 
 
 class Conv3DLSTMCell(ConvLSTMCell):
@@ -2153,19 +2155,23 @@ class Conv3DLSTMCell(ConvLSTMCell):
 
   def __init__(self, name="conv_3d_lstm_cell", **kwargs):
     """Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
-    super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
+    super(Conv3DLSTMCell, self).__init__(conv_ndims=3, name=name, **kwargs)
 
 
 def _conv(args, filter_size, num_features, bias, bias_start=0.0):
-  """convolution:
+  """Convolution.
+
   Args:
     args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
     batch x n, Tensors.
     filter_size: int tuple of filter height and width.
     num_features: int, number of features.
+    bias: Whether to use biases in the convolution layer.
     bias_start: starting value to initialize the bias; 0 by default.
+
   Returns:
     A 3D, 4D, or 5D Tensor with shape [batch ... num_features]
+
   Raises:
     ValueError: if some of the arguments has unspecified or wrong shape.
   """
@@ -2225,6 +2231,13 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
 
   O. Kuchaiev and B. Ginsburg
   "Factorization Tricks for LSTM Networks", ICLR 2017 workshop.
+
+  In brief, a G-LSTM cell consists of one LSTM sub-cell per group, where each
+  sub-cell operates on an evenly-sized sub-vector of the input and produces an
+  evenly-sized sub-vector of the output.  For example, a G-LSTM cell with 128
+  units and 4 groups consists of 4 LSTMs sub-cells with 32 units each.  If that
+  G-LSTM cell is fed a 200-dim input, then each sub-cell receives a 50-dim part
+  of the input and produces a 32-dim part of the output.
   """
 
   def __init__(self,
@@ -2298,7 +2311,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     return self._output_size
 
   def _get_input_for_group(self, inputs, group_id, group_size):
-    """Slices inputs into groups to prepare for processing by cell's groups
+    """Slices inputs into groups to prepare for processing by cell's groups.
 
     Args:
       inputs: cell input or it's previous state,
@@ -2320,9 +2333,12 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     """Run one step of G-LSTM.
 
     Args:
-      inputs: input Tensor, 2D, [batch x num_units].
-      state: this must be a tuple of state Tensors, both `2-D`,
-      with column sizes `c_state` and `m_state`.
+      inputs: input Tensor, 2D, [batch x num_inputs].  num_inputs must be
+        statically-known and evenly divisible into groups.  The innermost
+        vectors of the inputs are split into evenly-sized sub-vectors and fed
+        into the per-group LSTM sub-cells.
+      state: this must be a tuple of state Tensors, both `2-D`, with column
+        sizes `c_state` and `m_state`.
 
     Returns:
       A tuple containing:
@@ -2337,11 +2353,24 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
 
     Raises:
       ValueError: If input size cannot be inferred from inputs via
-        static shape inference.
+        static shape inference, or if the input shape is incompatible
+        with the number of groups.
     """
     (c_prev, m_prev) = state
 
     self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+
+    # If the input size is statically-known, calculate and validate its group
+    # size.  Otherwise, use the output group size.
+    input_size = inputs.shape[1].value
+    if input_size is None:
+      raise ValueError("input size must be statically known")
+    if input_size % self._number_of_groups != 0:
+      raise ValueError(
+          "input size (%d) must be divisible by number_of_groups (%d)" %
+          (input_size, self._number_of_groups))
+    input_group_size = int(input_size / self._number_of_groups)
+
     dtype = inputs.dtype
     scope = vs.get_variable_scope()
     with vs.variable_scope(scope, initializer=self._initializer):
@@ -2354,8 +2383,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
         with vs.variable_scope("group%d" % group_id):
           x_g_id = array_ops.concat(
               [
-                  self._get_input_for_group(inputs, group_id,
-                                            self._group_shape[0]),
+                  self._get_input_for_group(inputs, group_id, input_group_size),
                   self._get_input_for_group(m_prev, group_id,
                                             self._group_shape[0])
               ],
@@ -2684,7 +2712,7 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
 
 
 class SRUCell(rnn_cell_impl.LayerRNNCell):
-  """SRU, Simple Recurrent Unit
+  """SRU, Simple Recurrent Unit.
 
      Implementation based on
      Training RNNs as Fast as CNNs (cf. https://arxiv.org/abs/1709.02755).
@@ -2732,12 +2760,13 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
 
     input_depth = inputs_shape[1].value
 
+    # pylint: disable=protected-access
     self._kernel = self.add_variable(
         rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
         shape=[input_depth, 4 * self._num_units])
-
+    # pylint: enable=protected-access
     self._bias = self.add_variable(
-        rnn_cell_impl._BIAS_VARIABLE_NAME,
+        rnn_cell_impl._BIAS_VARIABLE_NAME,  # pylint: disable=protected-access
         shape=[2 * self._num_units],
         initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
 
@@ -2746,7 +2775,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
   def call(self, inputs, state):
     """Simple recurrent unit (SRU) with num_units cells."""
 
-    U = math_ops.matmul(inputs, self._kernel)
+    U = math_ops.matmul(inputs, self._kernel)  # pylint: disable=invalid-name
     x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split(
         value=U, num_or_size_splits=4, axis=1)
 
@@ -2862,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
     output_size = weight.get_shape().as_list()[1]
     g = vs.get_variable(name, [output_size], dtype=weight.dtype)
-    return nn_impl.l2_normalize(weight, dim=0) * g
+    return nn_impl.l2_normalize(weight, axis=0) * g
 
   def _linear(self,
               args,
@@ -2876,6 +2905,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
     Args:
       args: a 2D Tensor or a list of 2D, batch x n, Tensors.
       output_size: int, second dimension of W[i].
+      norm: bool, whether to normalize the weights.
       bias: boolean, whether to add a bias term or not.
       bias_initializer: starting value to initialize the bias
         (default is all zeros).
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dbd311a276bed2db9422cd8f1841d642616cac93
--- /dev/null
+++ b/tensorflow/contrib/rpc/BUILD
@@ -0,0 +1,29 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+py_library(
+    name = "rpc",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
+)
+
+py_library(
+    name = "rpc_pip",
+    data = if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/rpc/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":rpc",
+        "//tensorflow/contrib/rpc/python/kernel_tests:py_test_deps",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_base",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_servicer",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/__init__.py b/tensorflow/contrib/rpc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c65c1a05def92b91ddd75e7aecdb4e4d9b8abe8a
--- /dev/null
+++ b/tensorflow/contrib/rpc/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to RPC.
+
+@@rpc
+@@try_rpc
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.rpc_op import try_rpc
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2311c15a68c46090cec0f97bd950296506b0817e
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -0,0 +1,80 @@
+# TODO(b/76425722): Port everything in here to OS (currently excluded).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+# Placeholder for loading internal BUILD rule.
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [":test_example_proto_py"],
+)
+
+py_library(
+    name = "rpc_op_test_base",
+    srcs = ["rpc_op_test_base.py"],
+    deps = [
+        ":test_example_proto_py",
+        "//tensorflow/contrib/proto",
+        "//tensorflow/contrib/rpc",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "rpc_op_test_servicer",
+    srcs = ["rpc_op_test_servicer.py"],
+    deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
+
+tf_py_test(
+    name = "rpc_op_test",
+    size = "small",
+    srcs = ["rpc_op_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        ":rpc_op_test_servicer",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fc6bfbb4d03a39906d4441e48b2788423caa234
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for RpcOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+import grpc
+from grpc.framework.foundation import logging_pool
+import portpicker
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_servicer
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+from tensorflow.python.platform import test
+
+
+class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
+  _protocol = 'grpc'
+
+  invalid_method_string = 'Method not found'
+  connect_failed_string = 'Connect Failed'
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(RpcOpTest, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
+
+  def get_method_name(self, suffix):
+    return '/tensorflow.contrib.rpc.TestCaseService/%s' % suffix
+
+  def setUp(self):
+    super(RpcOpTest, self).setUp()
+
+    service_port = portpicker.pick_unused_port()
+
+    server = grpc.server(logging_pool.pool(max_workers=25))
+    servicer = rpc_op_test_servicer.RpcOpTestServicer()
+    test_example_pb2_grpc.add_TestCaseServiceServicer_to_server(
+        servicer, server)
+    self._address = 'localhost:%d' % service_port
+    server.add_insecure_port(self._address)
+    server.start()
+    self._server = server
+
+  def tearDown(self):
+    # TODO(ebrevdo): Figure out why this sometimes times out.
+    #    self._service.ExitLoop()
+    #    self._service_thread.join()
+    # self._server.stop()
+    super(RpcOpTest, self).tearDown()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..27273d16b1c09eba60e124e632b353b09ea2d063
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -0,0 +1,346 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Base class for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.rpc.python.ops import rpc_op
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+
+__all__ = ['I_WARNED_YOU', 'RpcOpTestBase']
+
+I_WARNED_YOU = 'I warned you!'
+
+
+class RpcOpTestBase(object):
+  # pylint: disable=missing-docstring,invalid-name
+  """Base class for RpcOp tests."""
+
+  def get_method_name(self, suffix):
+    raise NotImplementedError
+
+  def rpc(self, *args, **kwargs):
+    return rpc_op.rpc(*args, protocol=self._protocol, **kwargs)
+
+  def try_rpc(self, *args, **kwargs):
+    return rpc_op.try_rpc(*args, protocol=self._protocol, **kwargs)
+
+  def testScalarHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, ())
+      response_values = sess.run(response_tensors)
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+
+  def testScalarHostPortTryRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      self.assertEqual(response_tensors.shape, ())
+      response_values, status_code_values, status_message_values = (
+          sess.run((response_tensors, status_code, status_message)))
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+    # For the base Rpc op, don't expect to get error status back.
+    self.assertEqual(errors.OK, status_code_values)
+    self.assertEqual(b'', status_message_values)
+
+  def testEmptyHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = []
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertAllEqual(response_tensors.shape, [0])
+      response_values = sess.run(response_tensors)
+    self.assertAllEqual(response_values.shape, [0])
+
+  def testInvalidMethod(self):
+    for method in [
+        '/InvalidService.IncrementTestShapes',
+        self.get_method_name('InvalidMethodName')
+    ]:
+      with self.test_session() as sess:
+        with self.assertRaisesOpError(self.invalid_method_string):
+          sess.run(self.rpc(method=method, address=self._address, request=''))
+
+        _, status_code_value, status_message_value = sess.run(
+            self.try_rpc(method=method, address=self._address, request=''))
+        self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+        self.assertTrue(
+            self.invalid_method_string in status_message_value.decode('ascii'))
+
+  def testInvalidAddress(self):
+    # This covers the case of address='' and address='localhost:293874293874'
+    address = 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+    with self.test_session() as sess:
+      with self.assertRaises(errors.UnavailableError):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('IncrementTestShapes'),
+                address=address,
+                request=''))
+      _, status_code_value, status_message_value = sess.run(
+          self.try_rpc(
+              method=self.get_method_name('IncrementTestShapes'),
+              address=address,
+              request=''))
+      self.assertEqual(errors.UNAVAILABLE, status_code_value)
+      self.assertTrue(
+          self.connect_failed_string in status_message_value.decode('ascii'))
+
+  def testAlwaysFailingMethod(self):
+    with self.test_session() as sess:
+      response_tensors = self.rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      status_code_value, status_message_value = sess.run((status_code,
+                                                          status_message))
+      self.assertEqual(errors.INVALID_ARGUMENT, status_code_value)
+      self.assertTrue(I_WARNED_YOU in status_message_value.decode('ascii'))
+
+  def testSometimesFailingMethodWithManyRequests(self):
+    with self.test_session() as sess:
+      # Fail hard by default.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+      # Don't fail hard, use TryRpc - return the failing status instead.
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values, status_message_values = sess.run((status_code,
+                                                            status_message))
+      self.assertTrue([
+          x in (errors.OK, errors.INVALID_ARGUMENT) for x in status_code_values
+      ])
+      expected_message_values = np.where(
+          status_code_values == errors.INVALID_ARGUMENT,
+          I_WARNED_YOU.encode('ascii'), b'')
+      self.assertAllEqual(expected_message_values, status_message_values)
+
+  def testVecHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, (20,))
+      response_values = sess.run(response_tensors)
+    self.assertEqual(response_values.shape, (20,))
+    for i in range(20):
+      response_message = test_example_pb2.TestCase()
+      self.assertTrue(response_message.ParseFromString(response_values[i]))
+      self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortManyParallelRpcs(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      many_response_tensors = [
+          self.rpc(
+              method=self.get_method_name('IncrementTestShapes'),
+              address=self._address,
+              request=request_tensors) for _ in range(10)
+      ]
+      # Launch parallel 10 calls to the RpcOp, each containing 20 rpc requests.
+      many_response_values = sess.run(many_response_tensors)
+    self.assertEqual(10, len(many_response_values))
+    for response_values in many_response_values:
+      self.assertEqual(response_values.shape, (20,))
+      for i in range(20):
+        response_message = test_example_pb2.TestCase()
+        self.assertTrue(response_message.ParseFromString(response_values[i]))
+        self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortRpcUsingEncodeAndDecodeProto(self):
+    with self.test_session() as sess:
+      request_tensors = encode_proto_op.encode_proto(
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          sizes=[[3]] * 20,
+          values=[
+              [[i, i + 1, i + 2] for i in range(20)],
+          ])
+      response_tensor_strings = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      _, (response_shape,) = decode_proto_op.decode_proto(
+          bytes=response_tensor_strings,
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          output_types=[dtypes.int32])
+      response_shape_values = sess.run(response_shape)
+    self.assertAllEqual([[i + 1, i + 2, i + 3]
+                         for i in range(20)], response_shape_values)
+
+  def testVecHostPortRpcCancelsUponSessionTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          request=request_tensors)
+      for timeout_ms in [1, 500, 1000]:
+        options = config_pb2.RunOptions(timeout_in_ms=timeout_ms)
+        with self.assertRaises((errors.UnavailableError,
+                                errors.DeadlineExceededError)):
+          sess.run(response_tensors, options=options)
+
+  def testVecHostPortRpcCancelsUponConfiguredTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          timeout_in_ms=1000,
+          request=request_tensors)
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(response_tensors)
+
+  def testTryRpcPropagatesDeadlineErrorWithSometimesTimingOutRequests(self):
+    with self.test_session() as sess:
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesSleepForever'),
+          timeout_in_ms=1000,
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values = sess.run(status_code)
+      self.assertTrue([
+          x in (errors.OK, errors.DEADLINE_EXCEEDED) for x in status_code_values
+      ])
+
+  def testTryRpcWithMultipleAddressesSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleMethodsSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      methods = flatten(
+          [[self.get_method_name('IncrementTestShapes'), 'InvalidMethodName']
+           for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=methods, address=self._address, request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNIMPLEMENTED] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleAddressesAndRequests(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      requests = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=requests)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(20):
+        if i % 2 == 1:
+          self.assertFalse(response_tensors_values[i])
+        else:
+          response_message = test_example_pb2.TestCase()
+          self.assertTrue(
+              response_message.ParseFromString(response_tensors_values[i]))
+          self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cbd636cb16e3befc9ae27cb231696634e859a22
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Test servicer for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import time
+
+import grpc
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+
+
+class RpcOpTestServicer(test_example_pb2_grpc.TestCaseServiceServicer):
+  """Test servicer for RpcOp tests."""
+
+  def IncrementTestShapes(self, request, context):
+    """Increment the entries in the shape attribute of request.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    for i in range(len(request.shape)):
+      request.shape[i] += 1
+    return request
+
+  def AlwaysFailWithInvalidArgument(self, request, context):
+    """Always fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    del request
+    context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+    context.set_details(rpc_op_test_base.I_WARNED_YOU)
+
+  def SometimesFailWithInvalidArgument(self, request, context):
+    """Sometimes fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+      context.set_details(rpc_op_test_base.I_WARNED_YOU)
+    return request
+
+  def SleepForever(self, request, context):
+    """Sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    # TODO(ebrevdo): Make this async wait like the stubby version.
+    time.sleep(5)
+
+  def SometimesSleepForever(self, request, context):
+    """Sometimes sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      time.sleep(5)
+    return request
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000000000000000000000000000000000..96f4550f62bc17e713abe1f3843ec0964f57b046
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
@@ -0,0 +1,171 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.rpc;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+service TestCaseService {
+  // Copy input, and increment each entry in 'shape' by 1.
+  rpc IncrementTestShapes(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever.
+  rpc SleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever 50% of the time, return immediately the other 50%.
+  rpc SometimesSleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Always fails with InvalidArgument.
+  rpc AlwaysFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+
+  // Fails with InvalidArgument 50% of the time.
+  rpc SometimesFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/contrib/rpc/python/ops/BUILD b/tensorflow/contrib/rpc/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..84d2a1832f14b61ec313e7a1a00b0672bc410cfb
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+py_library(
+    name = "rpc_op_py",
+    srcs = ["rpc_op.py"],
+    deps = [
+        ":gen_rpc_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_rpc_op_py",
+    out = "gen_rpc_op.py",
+    deps = [
+        "//tensorflow/core:rpc_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/ops/rpc_op.py b/tensorflow/contrib/rpc/python/ops/rpc_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b6c41137828950e73757579dca1eba4adf2ae4
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/rpc_op.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""RPC communication."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import try_rpc
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("Rpc")
+ops.NotDifferentiable("TryRpc")
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 245fe07f2bcdaddb2bc47c0e1234dc1f19bd85e3..26fd4e2023806765ea4088f4c13a780ca7338bff 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -48,15 +48,14 @@ py_library(
     ],
 )
 
-py_test(
-    name = "reader_test",
-    size = "small",
-    srcs = ["python/saved_model/reader_test.py"],
+py_library(
+    name = "reader",
+    srcs = ["python/saved_model/reader.py"],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:public"],
     deps = [
         ":saved_model_py",
-        "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:variables",
@@ -65,6 +64,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "reader_test",
+    size = "small",
+    srcs = ["python/saved_model/reader_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":reader",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "signature_def_utils_test",
     size = "small",
@@ -81,15 +93,3 @@ py_test(
         "//tensorflow/python/saved_model:utils",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/BUILD b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
index ea4da80ba38389fb63a5dfe3cf608b959939c7ca..3c616c555b88cf5ec948bef3df5c2fef5caed0d4 100644
--- a/tensorflow/contrib/saved_model/cc/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
@@ -49,9 +49,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index ab80c68b1a8e4ff151494e393b68c460846fa8fe..1a1591d798f6f904e23987d9d7a60193c124c20e 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -3,9 +3,12 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
+package(default_visibility = [
+    "//learning/brain/google/xla/tests:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
 
-package(default_visibility = ["//tensorflow:__subpackages__"])
+exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
@@ -38,6 +41,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:check_ops",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:functional_ops",
@@ -211,15 +215,3 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
index dfa12e873a6aca806031c48d6f92e0432d0ea6e0..a9a32b7b25d6767cc1f944640722e128a9d728b5 100644
--- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
@@ -74,7 +74,7 @@ class GatherTreeOp : public OpKernel {
         ctx,
         step_ids_shape.dim_size(1) == max_sequence_lengths.shape().dim_size(0),
         errors::InvalidArgument("batch size dimensions step_ids.shape[1] and "
-                                "max_seqeuence_lengths.shape[0] must match.  "
+                                "max_sequence_lengths.shape[0] must match.  "
                                 "but shapes are: ",
                                 step_ids_shape.DebugString(), " and ",
                                 max_sequence_lengths.shape().DebugString()));
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index b427dff88b2d586ccf8c512bb498cdaf879ac781..cd162bae25aa1c1b6718b8e5b0b8687e5b80eab3 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -110,7 +111,12 @@ class AttentionWrapperTest(test.TestCase):
                          alignment_history=False,
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
+                         attention_layer=None,
                          name=''):
+    attention_layer_sizes = (
+        [attention_layer_size] if attention_layer_size is not None else None)
+    attention_layers = (
+        [attention_layer] if attention_layer is not None else None)
     self._testWithMaybeMultiAttention(
         is_multi=False,
         create_attention_mechanisms=[create_attention_mechanism],
@@ -119,7 +125,8 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depths=[attention_mechanism_depth],
         alignment_history=alignment_history,
         expected_final_alignment_history=expected_final_alignment_history,
-        attention_layer_sizes=[attention_layer_size],
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
         name=name)
 
   def _testWithMaybeMultiAttention(self,
@@ -131,6 +138,7 @@ class AttentionWrapperTest(test.TestCase):
                                    alignment_history=False,
                                    expected_final_alignment_history=None,
                                    attention_layer_sizes=None,
+                                   attention_layers=None,
                                    name=''):
     # Allow is_multi to be True with a single mechanism to enable test for
     # passing in a single mechanism in a list.
@@ -144,12 +152,18 @@ class AttentionWrapperTest(test.TestCase):
     encoder_output_depth = 10
     cell_depth = 9
 
-    if attention_layer_sizes is None:
-      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
-    else:
+    if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
       attention_depth = sum([attention_layer_size or encoder_output_depth
                              for attention_layer_size in attention_layer_sizes])
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth])[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
 
     decoder_inputs = array_ops.placeholder_with_default(
         np.random.randn(batch_size, decoder_max_time,
@@ -171,13 +185,20 @@ class AttentionWrapperTest(test.TestCase):
       with vs.variable_scope(
           'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
+        attention_layer_size = attention_layer_sizes
+        attention_layer = attention_layers
+        if not is_multi:
+          if attention_layer_size is not None:
+            attention_layer_size = attention_layer_size[0]
+          if attention_layer is not None:
+            attention_layer = attention_layer[0]
         cell = rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
             cell,
             attention_mechanisms if is_multi else attention_mechanisms[0],
-            attention_layer_size=(attention_layer_sizes if is_multi
-                                  else attention_layer_sizes[0]),
-            alignment_history=alignment_history)
+            attention_layer_size=attention_layer_size,
+            alignment_history=alignment_history,
+            attention_layer=attention_layer)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
         my_decoder = basic_decoder.BasicDecoder(
@@ -222,6 +243,9 @@ class AttentionWrapperTest(test.TestCase):
           self.assertEqual(
               (None, batch_size, None),
               tuple(state_alignment_history.get_shape().as_list()))
+        nest.assert_same_structure(
+            cell.state_size,
+            cell.zero_state(batch_size, dtypes.float32))
         # Remove the history from final_state for purposes of the
         # remainder of the tests.
         final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
@@ -257,6 +281,41 @@ class AttentionWrapperTest(test.TestCase):
             expected_final_alignment_history,
             final_alignment_history_info)
 
+  def testBahdanauNormalizedDType(self):
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.BahdanauAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          normalize=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
@@ -350,6 +409,42 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depth=9,
         name='testLuongNotNormalized')
 
+  def testLuongScaledDType(self):
+    # Test case for GitHub issue 18099
+    for dt in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dt, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dt, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.LuongAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          scale=True,
+          dtype=dt,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dt, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dt)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testLuongScaled(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongAttention, scale=True)
@@ -758,6 +853,48 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testMultiAttentionWithLayerInstances(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 7), dtype=dtype('float32'), mean=0.0011709079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=3.2000000000000002))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0038725811),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019329828)),
+        attention=ResultSummary(
+            shape=(5, 7), dtype=dtype('float32'), mean=0.001174294),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        True,
+        create_attention_mechanisms,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        attention_layers=[layers_core.Dense(3, use_bias=False),
+                          layers_core.Dense(4, use_bias=False)],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
   def testLuongMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
@@ -782,26 +919,31 @@ class AttentionWrapperTest(test.TestCase):
         wrapper.BahdanauAttention, wrapper.LuongAttention)
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=ResultSummary(
-            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11798714846372604),
-        sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=7.933333333333334))
+        rnn_output=ResultSummary(shape=(5, 3, 20),
+                                 dtype=dtype('float32'),
+                                 mean=0.11723966),
+        sample_id=ResultSummary(shape=(5, 3),
+                                dtype=dtype('int32'),
+                                mean=9.2666666666666675))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0036486709),
-            h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0018835809)),
-        attention=ResultSummary(
-            shape=(5, 20), dtype=dtype('float32'), mean=0.11798714846372604),
+            c=ResultSummary(shape=(5, 9),
+                            dtype=dtype('float32'),
+                            mean=-0.003545674),
+            h=ResultSummary(shape=(5, 9),
+                            dtype=dtype('float32'),
+                            mean=-0.0018327223)),
+        attention=ResultSummary(shape=(5, 20),
+                                dtype=dtype('float32'),
+                                mean=0.11728073),
         time=3,
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=(),
         attention_state=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
-            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
-        alignment_history=())
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)))
     expected_final_alignment_history = (
         ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
         ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 926554031775202d7f7d9018cf6ae4efb34fe96b..178328619f087789df040489cd150ba018cc8d14 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
@@ -70,6 +71,98 @@ class TestGatherTree(test.TestCase):
 
     self.assertAllEqual(expected_result, res_)
 
+  def _test_gather_tree_from_array(self,
+                                   depth_ndims=0,
+                                   merged_batch_beam=False):
+    array = np.array(
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 0, 0]],
+         [[2, 3, 4], [5, 6, 7], [8, 9, 10], [11, 12, 0]]]).transpose([1, 0, 2])
+    parent_ids = np.array(
+        [[[0, 0, 0], [0, 1, 1], [2, 1, 2], [-1, -1, -1]],
+         [[0, 0, 0], [1, 1, 0], [2, 0, 1], [0, 1, 0]]]).transpose([1, 0, 2])
+    expected_array = np.array(
+        [[[2, 2, 2], [6, 5, 6], [7, 8, 9], [0, 0, 0]],
+         [[2, 3, 2], [7, 5, 7], [8, 9, 8], [11, 12, 0]]]).transpose([1, 0, 2])
+    sequence_length = [[3, 3, 3], [4, 4, 3]]
+
+    array = ops.convert_to_tensor(
+        array, dtype=dtypes.float32)
+    parent_ids = ops.convert_to_tensor(
+        parent_ids, dtype=dtypes.int32)
+    expected_array = ops.convert_to_tensor(
+        expected_array, dtype=dtypes.float32)
+
+    max_time = array_ops.shape(array)[0]
+    batch_size = array_ops.shape(array)[1]
+    beam_width = array_ops.shape(array)[2]
+
+    def _tile_in_depth(tensor):
+      # Generate higher rank tensors by concatenating tensor and tensor + 1.
+      for _ in range(depth_ndims):
+        tensor = array_ops.stack([tensor, tensor + 1], -1)
+      return tensor
+
+    if merged_batch_beam:
+      array = array_ops.reshape(
+          array, [max_time, batch_size * beam_width])
+      expected_array = array_ops.reshape(
+          expected_array, [max_time, batch_size * beam_width])
+
+    if depth_ndims > 0:
+      array = _tile_in_depth(array)
+      expected_array = _tile_in_depth(expected_array)
+
+    sorted_array = beam_search_decoder.gather_tree_from_array(
+        array, parent_ids, sequence_length)
+
+    with self.test_session() as sess:
+      sorted_array = sess.run(sorted_array)
+      expected_array = sess.run(expected_array)
+      self.assertAllEqual(expected_array, sorted_array)
+
+  def test_gather_tree_from_array_scalar(self):
+    self._test_gather_tree_from_array()
+
+  def test_gather_tree_from_array_1d(self):
+    self._test_gather_tree_from_array(depth_ndims=1)
+
+  def test_gather_tree_from_array_1d_with_merged_batch_beam(self):
+    self._test_gather_tree_from_array(depth_ndims=1, merged_batch_beam=True)
+
+  def test_gather_tree_from_array_2d(self):
+    self._test_gather_tree_from_array(depth_ndims=2)
+
+
+class TestArrayShapeChecks(test.TestCase):
+
+  def _test_array_shape_dynamic_checks(self, static_shape, dynamic_shape,
+                                       batch_size, beam_width, is_valid=True):
+    t = array_ops.placeholder_with_default(
+        np.random.randn(*static_shape).astype(np.float32),
+        shape=dynamic_shape)
+
+    batch_size = array_ops.constant(batch_size)
+    check_op = beam_search_decoder._check_batch_beam(t, batch_size, beam_width)  # pylint: disable=protected-access
+
+    with self.test_session() as sess:
+      if is_valid:
+        sess.run(check_op)
+      else:
+        with self.assertRaises(errors.InvalidArgumentError):
+          sess.run(check_op)
+
+  def test_array_shape_dynamic_checks(self):
+    self._test_array_shape_dynamic_checks(
+        (8, 4, 5, 10), (None, None, 5, 10), 4, 5, is_valid=True)
+    self._test_array_shape_dynamic_checks(
+        (8, 20, 10), (None, None, 10), 4, 5, is_valid=True)
+    self._test_array_shape_dynamic_checks(
+        (8, 21, 10), (None, None, 10), 4, 5, is_valid=False)
+    self._test_array_shape_dynamic_checks(
+        (8, 4, 6, 10), (None, None, None, 10), 4, 5, is_valid=False)
+    self._test_array_shape_dynamic_checks(
+        (8, 4), (None, None), 4, 5, is_valid=False)
+
 
 class TestEosMasking(test.TestCase):
   """Tests EOS masking used in beam search."""
@@ -319,7 +412,8 @@ class TestLargeBeamStep(test.TestCase):
 
 class BeamSearchDecoderTest(test.TestCase):
 
-  def _testDynamicDecodeRNN(self, time_major, has_attention):
+  def _testDynamicDecodeRNN(self, time_major, has_attention,
+                            with_alignment_history=False):
     encoder_sequence_length = np.array([3, 2, 3, 1, 1])
     decoder_sequence_length = np.array([2, 0, 1, 2, 3])
     batch_size = 5
@@ -359,7 +453,7 @@ class BeamSearchDecoderTest(test.TestCase):
             cell=cell,
             attention_mechanism=attention_mechanism,
             attention_layer_size=attention_depth,
-            alignment_history=False)
+            alignment_history=with_alignment_history)
       cell_state = cell.zero_state(
           dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
       if has_attention:
@@ -420,6 +514,12 @@ class BeamSearchDecoderTest(test.TestCase):
   def testDynamicDecodeRNNBatchMajorYesAttention(self):
     self._testDynamicDecodeRNN(time_major=False, has_attention=True)
 
+  def testDynamicDecodeRNNBatchMajorYesAttentionWithAlignmentHistory(self):
+    self._testDynamicDecodeRNN(
+        time_major=False,
+        has_attention=True,
+        with_alignment_history=True)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
index ac830ae98e5f9878f0972fb851fd5edd1c1a8789..b549cbf568f254cbf18456145af751a8245dd379 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
@@ -92,14 +92,18 @@ class DynamicDecodeRNNTest(test.TestCase):
 
       # Mostly a smoke test
       time_steps = max_out
+      expected_length = sequence_length
       if maximum_iterations is not None:
         time_steps = min(max_out, maximum_iterations)
+        expected_length = [min(x, maximum_iterations) for x in expected_length]
       self.assertEqual(
           _t((batch_size, time_steps, cell_depth)),
           sess_results["final_outputs"].rnn_output.shape)
       self.assertEqual(
           _t((batch_size, time_steps)),
           sess_results["final_outputs"].sample_id.shape)
+      self.assertItemsEqual(expected_length,
+                            sess_results["final_sequence_length"])
 
   def testDynamicDecodeRNNBatchMajor(self):
     self._testDynamicDecodeRNN(time_major=False)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 0a53fd66dbe4d28ea102773b9c5bae50b9d18e9c..1c9d179e3c55ad07fcf709f66028c91c20e8eea0 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -339,7 +339,8 @@ def _luong_score(query, keys, scale):
   if scale:
     # Scalar used in weight scaling
     g = variable_scope.get_variable(
-        "attention_g", dtype=dtype, initializer=1.)
+        "attention_g", dtype=dtype,
+        initializer=init_ops.ones_initializer, shape=())
     score = g * score
   return score
 
@@ -471,7 +472,8 @@ def _bahdanau_score(processed_query, keys, normalize):
     # Scalar used in weight normalization
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
-        initializer=math.sqrt((1. / num_units)))
+        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
+        shape=())
     # Bias added prior to the nonlinearity
     b = variable_scope.get_variable(
         "attention_b", [num_units], dtype=dtype,
@@ -609,8 +611,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
   addition, once an input sequence element is attended to at a given output
   timestep, elements occurring before it cannot be attended to at subsequent
   output timesteps.  This function generates attention distributions according
-  to these assumptions.  For more information, see ``Online and Linear-Time
-  Attention by Enforcing Monotonic Alignments''.
+  to these assumptions.  For more information, see `Online and Linear-Time
+  Attention by Enforcing Monotonic Alignments`.
 
   Args:
     p_choose_i: Probability of choosing input sequence/memory element i.  Should
@@ -653,7 +655,7 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
     shifted_1mp_choose_i = array_ops.concat(
         [array_ops.ones((batch_size, 1)), 1 - p_choose_i[:, :-1]], 1)
     # Compute attention distribution recursively as
-    # q[i] = (1 - p_choose_i[i])*q[i - 1] + previous_attention[i]
+    # q[i] = (1 - p_choose_i[i - 1])*q[i - 1] + previous_attention[i]
     # attention[i] = p_choose_i[i]*q[i]
     attention = p_choose_i*array_ops.transpose(functional_ops.scan(
         # Need to use reshape to remind TF of the shape between loop iterations
@@ -736,7 +738,7 @@ class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
   """Base attention mechanism for monotonic attention.
 
   Simply overrides the initial_alignments function to provide a dirac
-  distribution,which is needed in order for the monotonic attention
+  distribution, which is needed in order for the monotonic attention
   distributions to have the correct behavior.
   """
 
@@ -763,7 +765,7 @@ class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
 class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Bahadanau-style energy function.
 
-  This type of attention encorces a monotonic constraint on the attention
+  This type of attention enforces a monotonic constraint on the attention
   distributions; that is once the model attends to a given point in the memory
   it can't attend to any prior points at subsequence output timesteps.  It
   achieves this by using the _monotonic_probability_fn instead of softmax to
@@ -867,7 +869,7 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
 class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Luong-style energy function.
 
-  This type of attention encorces a monotonic constraint on the attention
+  This type of attention enforces a monotonic constraint on the attention
   distributions; that is once the model attends to a given point in the memory
   it can't attend to any prior points at subsequence output timesteps.  It
   achieves this by using the _monotonic_probability_fn instead of softmax to
@@ -1081,7 +1083,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                cell_input_fn=None,
                output_attention=True,
                initial_cell_state=None,
-               name=None):
+               name=None,
+               attention_layer=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1124,7 +1127,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         (default), use the context as attention at each time step. Otherwise,
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
-        attention_layer_size must be a list of the same length.
+        attention_layer_size must be a list of the same length. If
+        attention_layer is set, this must be None.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1133,7 +1137,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
       output_attention: Python bool.  If `True` (default), the output at each
         time step is the attention value.  This is the behavior of Luong-style
         attention mechanisms.  If `False`, the output at each time step is
-        the output of `cell`.  This is the beahvior of Bhadanau-style
+        the output of `cell`.  This is the behavior of Bhadanau-style
         attention mechanisms.  In both cases, the `attention` tensor is
         propagated to the next time step via the state and is used there.
         This flag only controls whether the attention mechanism is propagated
@@ -1144,17 +1148,22 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         does not match the batch size of `initial_cell_state`, proper
         behavior is not guaranteed.
       name: Name to use when creating ops.
+      attention_layer: A list of `tf.layers.Layer` instances or a
+        single `tf.layers.Layer` instance taking the context and cell output as
+        inputs to generate attention at each time step. If None (default), use
+        the context as attention at each time step. If attention_mechanism is a
+        list, attention_layer must be a list of the same length. If
+        attention_layers_size is set, this must be None.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
         is a list but `attention_layer_size` is not; or vice versa).
       ValueError: if `attention_layer_size` is not None, `attention_mechanism`
-        is a list, and its length does not match that of `attention_layer_size`.
+        is a list, and its length does not match that of `attention_layer_size`;
+        if `attention_layer_size` and `attention_layer` are set simultaneously.
     """
     super(AttentionWrapper, self).__init__(name=name)
-    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
-      raise TypeError(
-          "cell must be an RNNCell, saw type: %s" % type(cell).__name__)
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
     if isinstance(attention_mechanism, (list, tuple)):
       self._is_multi = True
       attention_mechanisms = attention_mechanism
@@ -1182,6 +1191,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
 
+    if attention_layer_size is not None and attention_layer is not None:
+      raise ValueError("Only one of attention_layer_size and attention_layer "
+                       "should be set")
+
     if attention_layer_size is not None:
       attention_layer_sizes = tuple(
           attention_layer_size
@@ -1200,6 +1213,22 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
               dtype=attention_mechanisms[i].dtype)
           for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
+    elif attention_layer is not None:
+      self._attention_layers = tuple(
+          attention_layer
+          if isinstance(attention_layer, (list, tuple))
+          else (attention_layer,))
+      if len(self._attention_layers) != len(attention_mechanisms):
+        raise ValueError(
+            "If provided, attention_layer must contain exactly one "
+            "layer per attention_mechanism, saw: %d vs %d"
+            % (len(self._attention_layers), len(attention_mechanisms)))
+      self._attention_layer_size = sum(
+          layer.compute_output_shape(
+              [None,
+               cell.output_size + mechanism.values.shape[-1].value])[-1].value
+          for layer, mechanism in zip(
+              self._attention_layers, attention_mechanisms))
     else:
       self._attention_layers = None
       self._attention_layer_size = sum(
@@ -1280,7 +1309,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         attention_state=self._item_or_tuple(
             a.state_size for a in self._attention_mechanisms),
         alignment_history=self._item_or_tuple(
-            () for _ in self._attention_mechanisms))  # sometimes a TensorArray
+            a.alignments_size if self._alignment_history else ()
+            for a in self._attention_mechanisms))  # sometimes a TensorArray
 
   def zero_state(self, batch_size, dtype):
     """Return an initial (zero) state tuple for this `AttentionWrapper`.
@@ -1320,22 +1350,26 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         cell_state = nest.map_structure(
             lambda s: array_ops.identity(s, name="checked_cell_state"),
             cell_state)
+      initial_alignments = [
+          attention_mechanism.initial_alignments(batch_size, dtype)
+          for attention_mechanism in self._attention_mechanisms]
       return AttentionWrapperState(
           cell_state=cell_state,
           time=array_ops.zeros([], dtype=dtypes.int32),
           attention=_zero_state_tensors(self._attention_layer_size, batch_size,
                                         dtype),
-          alignments=self._item_or_tuple(
-              attention_mechanism.initial_alignments(batch_size, dtype)
-              for attention_mechanism in self._attention_mechanisms),
+          alignments=self._item_or_tuple(initial_alignments),
           attention_state=self._item_or_tuple(
               attention_mechanism.initial_state(batch_size, dtype)
               for attention_mechanism in self._attention_mechanisms),
           alignment_history=self._item_or_tuple(
-              tensor_array_ops.TensorArray(dtype=dtype, size=0,
-                                           dynamic_size=True)
+              tensor_array_ops.TensorArray(
+                  dtype,
+                  size=0,
+                  dynamic_size=True,
+                  element_shape=alignment.shape)
               if self._alignment_history else ()
-              for _ in self._attention_mechanisms))
+              for alignment in initial_alignments))
 
   def call(self, inputs, state):
     """Perform a step of attention-wrapped RNN.
diff --git a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
index ed226239b860e2250072a28a5538b816642ec54b..7eb95e5a70de985dca0d4b565ba03bdf454b6161 100644
--- a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
@@ -59,8 +59,7 @@ class BasicDecoder(decoder.Decoder):
     Raises:
       TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
     """
-    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
-      raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
     if not isinstance(helper, helper_py.Helper):
       raise TypeError("helper must be a Helper, received: %s" % type(helper))
     if (output_layer is not None
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index d6184d61095f727f9dcab56fe59e2601868c1624..184144f64a56358206014a0f75473b4a9b16617a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -121,14 +122,114 @@ def tile_batch(t, multiplier, name=None):
     return nest.map_structure(lambda t_: _tile_batch(t_, multiplier), t)
 
 
+def gather_tree_from_array(t, parent_ids, sequence_length):
+  """Calculates the full beams for `TensorArray`s.
+
+  Args:
+    t: A stacked `TensorArray` of size `max_time` that contains `Tensor`s of
+      shape `[batch_size, beam_width, s]` or `[batch_size * beam_width, s]`
+      where `s` is the depth shape.
+    parent_ids: The parent ids of shape `[max_time, batch_size, beam_width]`.
+    sequence_length: The sequence length of shape `[batch_size, beam_width]`.
+
+  Returns:
+    A `Tensor` which is a stacked `TensorArray` of the same size and type as
+    `t` and where beams are sorted in each `Tensor` according to `parent_ids`.
+  """
+  max_time = parent_ids.shape[0].value or array_ops.shape(parent_ids)[0]
+  batch_size = parent_ids.shape[1].value or array_ops.shape(parent_ids)[1]
+  beam_width = parent_ids.shape[2].value or array_ops.shape(parent_ids)[2]
+
+  # Generate beam ids that will be reordered by gather_tree.
+  beam_ids = array_ops.expand_dims(
+      array_ops.expand_dims(math_ops.range(beam_width), 0), 0)
+  beam_ids = array_ops.tile(beam_ids, [max_time, batch_size, 1])
+
+  mask = array_ops.sequence_mask(
+      sequence_length, maxlen=max_time, dtype=dtypes.int32)
+  mask = array_ops.transpose(mask, perm=[2, 0, 1])
+
+  # Use beam_width + 1 to mark the end of beam.
+  masked_beam_ids = (beam_ids * mask) + (1 - mask) * (beam_width + 1)
+
+  max_sequence_lengths = math_ops.to_int32(
+      math_ops.reduce_max(sequence_length, axis=1))
+  sorted_beam_ids = beam_search_ops.gather_tree(
+      step_ids=masked_beam_ids,
+      parent_ids=parent_ids,
+      max_sequence_lengths=max_sequence_lengths,
+      end_token=beam_width + 1)
+
+  # For out of range steps, simply copy the same beam.
+  sorted_beam_ids = array_ops.where(
+      math_ops.cast(mask, dtypes.bool), x=sorted_beam_ids, y=beam_ids)
+
+  # Generate indices for gather_nd.
+  time_ind = array_ops.tile(array_ops.reshape(
+      math_ops.range(max_time), [-1, 1, 1]), [1, batch_size, beam_width])
+  batch_ind = array_ops.tile(array_ops.reshape(
+      math_ops.range(batch_size), [-1, 1, 1]), [1, max_time, beam_width])
+  batch_ind = array_ops.transpose(batch_ind, perm=[1, 0, 2])
+  indices = array_ops.stack([time_ind, batch_ind, sorted_beam_ids], -1)
+
+  # Gather from a tensor with collapsed additional dimensions.
+  gather_from = t
+  final_shape = array_ops.shape(gather_from)
+  gather_from = array_ops.reshape(
+      gather_from, [max_time, batch_size, beam_width, -1])
+  ordered = array_ops.gather_nd(gather_from, indices)
+  ordered = array_ops.reshape(ordered, final_shape)
+
+  return ordered
+
+
 def _check_maybe(t):
-  if isinstance(t, tensor_array_ops.TensorArray):
-    raise TypeError(
-        "TensorArray state is not supported by BeamSearchDecoder: %s" % t.name)
   if t.shape.ndims is None:
     raise ValueError(
         "Expected tensor (%s) to have known rank, but ndims == None." % t)
 
+def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
+  """Raises an exception if dimensions are known statically and can not be
+  reshaped to [batch_size, beam_size, -1].
+  """
+  reshaped_shape = tensor_shape.TensorShape([batch_size, beam_width, None])
+  if (batch_size is not None and shape[0].value is not None
+      and (shape[0] != batch_size * beam_width
+           or (shape.ndims >= 2 and shape[1].value is not None
+               and (shape[0] != batch_size or shape[1] != beam_width)))):
+    tf_logging.warn("TensorArray reordering expects elements to be "
+                    "reshapable to %s which is incompatible with the "
+                    "current shape %s. Consider setting "
+                    "reorder_tensor_arrays to False to disable TensorArray "
+                    "reordering during the beam search."
+                    % (reshaped_shape, shape))
+    return False
+  return True
+
+def _check_batch_beam(t, batch_size, beam_width):
+  """Returns an Assert operation checking that the elements of the stacked
+  TensorArray can be reshaped to [batch_size, beam_size, -1]. At this point,
+  the TensorArray elements have a known rank of at least 1.
+  """
+  error_message = ("TensorArray reordering expects elements to be "
+                   "reshapable to [batch_size, beam_size, -1] which is "
+                   "incompatible with the dynamic shape of %s elements. "
+                   "Consider setting reorder_tensor_arrays to False to disable "
+                   "TensorArray reordering during the beam search."
+                   % (t.name))
+  rank = t.shape.ndims
+  shape = array_ops.shape(t)
+  if rank == 2:
+    condition = math_ops.equal(shape[1], batch_size * beam_width)
+  else:
+    condition = math_ops.logical_or(
+        math_ops.equal(shape[1], batch_size * beam_width),
+        math_ops.logical_and(
+            math_ops.equal(shape[1], batch_size),
+            math_ops.equal(shape[2], beam_width)))
+  return control_flow_ops.Assert(condition, [error_message])
+
+
 
 class BeamSearchDecoder(decoder.Decoder):
   """BeamSearch sampling decoder.
@@ -173,7 +274,8 @@ class BeamSearchDecoder(decoder.Decoder):
                initial_state,
                beam_width,
                output_layer=None,
-               length_penalty_weight=0.0):
+               length_penalty_weight=0.0,
+               reorder_tensor_arrays=True):
     """Initialize the BeamSearchDecoder.
 
     Args:
@@ -188,6 +290,12 @@ class BeamSearchDecoder(decoder.Decoder):
         `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
         to storing the result or sampling.
       length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
 
     Raises:
       TypeError: if `cell` is not an instance of `RNNCell`,
@@ -195,14 +303,14 @@ class BeamSearchDecoder(decoder.Decoder):
       ValueError: If `start_tokens` is not a vector or
         `end_token` is not a scalar.
     """
-    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
-      raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
+    rnn_cell_impl.assert_like_rnncell("cell", cell)  # pylint: disable=protected-access
     if (output_layer is not None and
         not isinstance(output_layer, layers_base.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
     self._output_layer = output_layer
+    self._reorder_tensor_arrays = reorder_tensor_arrays
 
     if callable(embedding):
       self._embedding_fn = embedding
@@ -300,12 +408,13 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     finished, start_inputs = self._finished, self._start_inputs
 
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
     log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
         array_ops.zeros([self._batch_size], dtype=dtypes.int32),
         depth=self._beam_width,
-        on_value=0.0,
-        off_value=-np.Inf,
-        dtype=nest.flatten(self._initial_cell_state)[0].dtype)
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
 
     initial_state = BeamSearchDecoderState(
         cell_state=self._initial_cell_state,
@@ -342,6 +451,11 @@ class BeamSearchDecoder(decoder.Decoder):
         outputs.parent_ids,
         max_sequence_lengths=max_sequence_lengths,
         end_token=self._end_token)
+    if self._reorder_tensor_arrays:
+      final_state = final_state._replace(cell_state=nest.map_structure(
+          lambda t: self._maybe_sort_array_beams(
+              t, outputs.parent_ids, final_state.lengths),
+          final_state.cell_state))
     outputs = FinalBeamSearchDecoderOutput(
         beam_search_decoder_output=outputs, predicted_ids=predicted_ids)
     return outputs, final_state
@@ -432,9 +546,10 @@ class BeamSearchDecoder(decoder.Decoder):
       returned unchanged.
 
     Raises:
-      TypeError: If `t` is an instance of `TensorArray`.
       ValueError: If the rank of `t` is not statically known.
     """
+    if isinstance(t, tensor_array_ops.TensorArray):
+      return t
     _check_maybe(t)
     if t.shape.ndims >= 1:
       return self._split_batch_beams(t, s)
@@ -455,15 +570,55 @@ class BeamSearchDecoder(decoder.Decoder):
       A reshaped version of t with shape `[batch_size, beam_width] + s`.
 
     Raises:
-      TypeError: If `t` is an instance of `TensorArray`.
       ValueError:  If the rank of `t` is not statically known.
     """
+    if isinstance(t, tensor_array_ops.TensorArray):
+      return t
     _check_maybe(t)
     if t.shape.ndims >= 2:
       return self._merge_batch_beams(t, s)
     else:
       return t
 
+  def _maybe_sort_array_beams(self, t, parent_ids, sequence_length):
+    """Maybe sorts beams within a `TensorArray`.
+
+    Args:
+      t: A `TensorArray` of size `max_time` that contains `Tensor`s of shape
+        `[batch_size, beam_width, s]` or `[batch_size * beam_width, s]` where
+        `s` is the depth shape.
+      parent_ids: The parent ids of shape `[max_time, batch_size, beam_width]`.
+      sequence_length: The sequence length of shape `[batch_size, beam_width]`.
+
+    Returns:
+      A `TensorArray` where beams are sorted in each `Tensor` or `t` itself if
+      it is not a `TensorArray` or does not meet shape requirements.
+    """
+    if not isinstance(t, tensor_array_ops.TensorArray):
+      return t
+    # pylint: disable=protected-access
+    if (not t._infer_shape or not t._element_shape
+        or t._element_shape[0].ndims is None
+        or t._element_shape[0].ndims < 1):
+      shape = (
+          t._element_shape[0] if t._infer_shape and t._element_shape
+          else tensor_shape.TensorShape(None))
+      tf_logging.warn("The TensorArray %s in the cell state is not amenable to "
+                      "sorting based on the beam search result. For a "
+                      "TensorArray to be sorted, its elements shape must be "
+                      "defined and have at least a rank of 1, but saw shape: %s"
+                      % (t.handle.name, shape))
+      return t
+    shape = t._element_shape[0]
+    # pylint: enable=protected-access
+    if not _check_static_batch_beam_maybe(
+        shape, tensor_util.constant_value(self._batch_size), self._beam_width):
+      return t
+    t = t.stack()
+    with ops.control_dependencies(
+        [_check_batch_beam(t, self._batch_size, self._beam_width)]):
+      return gather_tree_from_array(t, parent_ids, sequence_length)
+
   def step(self, time, inputs, state, name=None):
     """Perform a decoding step.
 
@@ -570,7 +725,6 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
 
   time = ops.convert_to_tensor(time, name="time")
   # During the first time step we only consider the initial beam
-  scores_shape = array_ops.shape(scores)
   scores_flat = array_ops.reshape(scores, [batch_size, -1])
 
   # Pick the next beams according to the specified successors function
@@ -667,9 +821,9 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight):
   Returns:
     The scores normalized by the length_penalty.
   """
-  length_penality_ = _length_penalty(
+  length_penalty_ = _length_penalty(
       sequence_lengths=sequence_lengths, penalty_factor=length_penalty_weight)
-  return log_probs / length_penality_
+  return log_probs / length_penalty_
 
 
 def _length_penalty(sequence_lengths, penalty_factor):
@@ -706,7 +860,7 @@ def _mask_probs(probs, eos_token, finished):
   unfinished beams remain unchanged.
 
   Args:
-    probs: Log probabiltiies of shape `[batch_size, beam_width, vocab_size]`
+    probs: Log probabilities of shape `[batch_size, beam_width, vocab_size]`
     eos_token: An int32 id corresponding to the EOS token to allocate
       probability to.
     finished: A boolean tensor of shape `[batch_size, beam_width]` that
@@ -724,7 +878,7 @@ def _mask_probs(probs, eos_token, finished):
       eos_token,
       vocab_size,
       dtype=probs.dtype,
-      on_value=0.,
+      on_value=ops.convert_to_tensor(0., dtype=probs.dtype),
       off_value=probs.dtype.min)
   finished_probs = array_ops.tile(
       array_ops.reshape(finished_row, [1, 1, -1]),
@@ -759,6 +913,8 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
     output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)]
       or the original tensor if its dimensions are too small.
   """
+  if isinstance(gather_from, tensor_array_ops.TensorArray):
+    return gather_from
   _check_maybe(gather_from)
   if gather_from.shape.ndims >= len(gather_shape):
     return _tensor_gather_helper(
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index f14974b9d5ca8cbcfd9f91086ca0a90ceff48f43..e69725ff8ab1ba4de880c914a6f5fdad5e54566d 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -28,8 +28,10 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
@@ -39,6 +41,7 @@ __all__ = ["Decoder", "dynamic_decode"]
 
 
 _transpose_batch_time = rnn._transpose_batch_time  # pylint: disable=protected-access
+_zero_state_tensors = rnn_cell_impl._zero_state_tensors  # pylint: disable=protected-access
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -133,16 +136,8 @@ class Decoder(object):
 
 def _create_zero_outputs(size, dtype, batch_size):
   """Create a zero outputs Tensor structure."""
-  def _t(s):
-    return (s if isinstance(s, ops.Tensor) else constant_op.constant(
-        tensor_shape.TensorShape(s).as_list(),
-        dtype=dtypes.int32,
-        name="zero_suffix_shape"))
-
   def _create(s, d):
-    return array_ops.zeros(
-        array_ops.concat(
-            ([batch_size], _t(s)), axis=0), dtype=d)
+    return _zero_state_tensors(s, batch_size, d)
 
   return nest.map_structure(_create, size, dtype)
 
@@ -187,6 +182,15 @@ def dynamic_decode(decoder,
     raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                     type(decoder))
 
+  def _is_xla_tensor(tensor):
+    try:
+      op = tensor.op
+    except AttributeError:
+      return False
+    if control_flow_util.IsInXLAContext(op):
+      return True
+    return False
+
   with variable_scope.variable_scope(scope, "decoder") as varscope:
     # Properly cache variable values inside the while_loop
     if varscope.caching_device is None:
@@ -204,6 +208,11 @@ def dynamic_decode(decoder,
                                         decoder.output_dtype,
                                         decoder.batch_size)
 
+    is_xla = False
+    if any([_is_xla_tensor(i) for i in nest.flatten(initial_inputs)]):
+      is_xla = True
+    if is_xla and maximum_iterations is None:
+      raise ValueError("maximum_iterations is required for XLA compilation.")
     if maximum_iterations is not None:
       initial_finished = math_ops.logical_or(
           initial_finished, 0 >= maximum_iterations)
@@ -212,7 +221,8 @@ def dynamic_decode(decoder,
     initial_time = constant_op.constant(0, dtype=dtypes.int32)
 
     def _shape(batch_size, from_shape):
-      if not isinstance(from_shape, tensor_shape.TensorShape):
+      if (not isinstance(from_shape, tensor_shape.TensorShape) or
+          from_shape.ndims == 0):
         return tensor_shape.TensorShape(None)
       else:
         batch_size = tensor_util.constant_value(
@@ -220,11 +230,13 @@ def dynamic_decode(decoder,
                 batch_size, name="batch_size"))
         return tensor_shape.TensorShape([batch_size]).concatenate(from_shape)
 
+    dynamic_size = maximum_iterations is None or not is_xla
+
     def _create_ta(s, d):
       return tensor_array_ops.TensorArray(
           dtype=d,
-          size=0,
-          dynamic_size=True,
+          size=0 if dynamic_size else maximum_iterations,
+          dynamic_size=dynamic_size,
           element_shape=_shape(decoder.batch_size, s))
 
     initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size,
@@ -256,11 +268,8 @@ def dynamic_decode(decoder,
         next_finished = decoder_finished
       else:
         next_finished = math_ops.logical_or(decoder_finished, finished)
-      if maximum_iterations is not None:
-        next_finished = math_ops.logical_or(
-            next_finished, time + 1 >= maximum_iterations)
       next_sequence_lengths = array_ops.where(
-          math_ops.logical_and(math_ops.logical_not(finished), next_finished),
+          math_ops.logical_not(finished),
           array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
           sequence_lengths)
 
@@ -301,11 +310,16 @@ def dynamic_decode(decoder,
     res = control_flow_ops.while_loop(
         condition,
         body,
-        loop_vars=[
-            initial_time, initial_outputs_ta, initial_state, initial_inputs,
-            initial_finished, initial_sequence_lengths,
-        ],
+        loop_vars=(
+            initial_time,
+            initial_outputs_ta,
+            initial_state,
+            initial_inputs,
+            initial_finished,
+            initial_sequence_lengths,
+        ),
         parallel_iterations=parallel_iterations,
+        maximum_iterations=maximum_iterations,
         swap_memory=swap_memory)
 
     final_outputs_ta = res[1]
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index 67011c8fef6c4f54db2626ffe7ae1299bddbb352..9c0885918071c25ab65cb4044bc19ea22c55442a 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -1,9 +1,7 @@
 # Description:
 #   TensorFlow Serving session bundle.
 
-package(
-    default_visibility = ["//visibility:public"],
-)
+package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -19,18 +17,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-)
-
 # TODO(b/32673259): add a test to continuously validate these files.
 filegroup(
     name = "session_bundle_half_plus_two",
@@ -165,6 +151,7 @@ py_test(
     name = "gc_test",
     srcs = ["gc_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
     deps = [
         ":gc",
diff --git a/tensorflow/contrib/session_bundle/example/BUILD b/tensorflow/contrib/session_bundle/example/BUILD
index dbbae01f3661b81f35350470c08ec65b3488b7fc..9a56eab431d66c53c1c51341d48bf47eb8926829 100644
--- a/tensorflow/contrib/session_bundle/example/BUILD
+++ b/tensorflow/contrib/session_bundle/example/BUILD
@@ -10,19 +10,6 @@ exports_files(["LICENSE"])
 
 # vardef("PYTHON_BIN_PATH", "/usr/bin/python")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-)
-
 py_binary(
     name = "export_half_plus_two",
     srcs = [
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc
index 6d997bac9ee8e0fe242455686cc00a016d9bd768..612623ae309f6393beb258138b7b795c2a25d4e1 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
@@ -239,8 +240,8 @@ TEST(LoadSessionBundleFromPath, BasicTestRunOptionsThreadPoolInvalid) {
 
   // Expect failed session run calls with invalid run-options.
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Invalid inter_op_thread_pool: 2"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Invalid inter_op_thread_pool: 2"))
       << status.error_message();
 }
 
@@ -314,8 +315,8 @@ TEST_F(SessionBundleTest, ServingGraphEmpty) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected exactly one serving GraphDef"))
+  EXPECT_TRUE(str_util::StrContains(status_.error_message(),
+                                    "Expected exactly one serving GraphDef"))
       << status_.error_message();
 }
 
@@ -330,8 +331,9 @@ TEST_F(SessionBundleTest, ServingGraphAnyIncorrectType) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected Any type_url for: tensorflow.GraphDef"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(),
+                            "Expected Any type_url for: tensorflow.GraphDef"))
       << status_.error_message();
 }
 
@@ -347,7 +349,8 @@ TEST_F(SessionBundleTest, ServingGraphAnyValueCorrupted) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(), "Failed to unpack"))
       << status_.error_message();
 }
 
@@ -362,9 +365,9 @@ TEST_F(SessionBundleTest, AssetFileAnyIncorrectType) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(
-      StringPiece(status_.error_message())
-          .contains("Expected Any type_url for: tensorflow.serving.AssetFile"))
+  EXPECT_TRUE(str_util::StrContains(
+      status_.error_message(),
+      "Expected Any type_url for: tensorflow.serving.AssetFile"))
       << status_.error_message();
 }
 
@@ -380,7 +383,8 @@ TEST_F(SessionBundleTest, AssetFileAnyValueCorrupted) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(), "Failed to unpack"))
       << status_.error_message();
 }
 
@@ -395,8 +399,8 @@ TEST_F(SessionBundleTest, InitOpTooManyValues) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected exactly one serving init op"))
+  EXPECT_TRUE(str_util::StrContains(status_.error_message(),
+                                    "Expected exactly one serving init op"))
       << status_.error_message();
 }
 
diff --git a/tensorflow/contrib/session_bundle/signature_test.cc b/tensorflow/contrib/session_bundle/signature_test.cc
index 741b7fde9bdb40e8d0d7e4396676dfff036970d6..b1ff55552e0932ddc100adc4a257016fa3923120 100644
--- a/tensorflow/contrib/session_bundle/signature_test.cc
+++ b/tensorflow/contrib/session_bundle/signature_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -33,8 +34,8 @@ namespace tensorflow {
 namespace serving {
 namespace {
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
@@ -69,8 +70,8 @@ TEST(GetClassificationSignature, MissingSignature) {
   ClassificationSignature signature;
   const Status status = GetClassificationSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a classification signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a classification signature"))
       << status.error_message();
 }
 
@@ -86,8 +87,8 @@ TEST(GetClassificationSignature, WrongSignatureType) {
   ClassificationSignature signature;
   const Status status = GetClassificationSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a classification signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a classification signature"))
       << status.error_message();
 }
 
@@ -122,8 +123,8 @@ TEST(GetNamedClassificationSignature, MissingSignature) {
   const Status status =
       GetNamedClassificationSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Missing signature named \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Missing signature named \"foo\""))
       << status.error_message();
 }
 
@@ -141,9 +142,9 @@ TEST(GetNamedClassificationSignature, WrongSignatureType) {
   const Status status =
       GetNamedClassificationSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains("Expected a classification signature for name \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "Expected a classification signature for name \"foo\""))
       << status.error_message();
 }
 
@@ -176,8 +177,8 @@ TEST(GetRegressionSignature, MissingSignature) {
   RegressionSignature signature;
   const Status status = GetRegressionSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a regression signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a regression signature"))
       << status.error_message();
 }
 
@@ -193,8 +194,8 @@ TEST(GetRegressionSignature, WrongSignatureType) {
   RegressionSignature signature;
   const Status status = GetRegressionSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a regression signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a regression signature"))
       << status.error_message();
 }
 
@@ -227,8 +228,8 @@ TEST(GetNamedSignature, MissingSignature) {
   Signature signature;
   const Status status = GetNamedSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Missing signature named \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Missing signature named \"foo\""))
       << status.error_message();
 }
 
@@ -370,7 +371,7 @@ TEST(RunClassification, RunNotOk) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Data is gone"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone"))
       << status.error_message();
 }
 
@@ -386,7 +387,8 @@ TEST(RunClassification, TooManyOutputs) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Expected 1 output"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "Expected 1 output"))
       << status.error_message();
 }
 
@@ -402,8 +404,9 @@ TEST(RunClassification, WrongBatchOutputs) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Input batch size did not match output batch size"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "Input batch size did not match output batch size"))
       << status.error_message();
 }
 
@@ -449,7 +452,7 @@ TEST_F(RunRegressionTest, RunNotOk) {
   const Status status =
       RunRegression(signature_, input_tensor_, &session_, &output_tensor_);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Data is gone"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone"))
       << status.error_message();
 }
 
@@ -460,8 +463,9 @@ TEST_F(RunRegressionTest, MismatchedSizeForBatchInputAndOutput) {
   const Status status =
       RunRegression(signature_, input_tensor_, &session_, &output_tensor_);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Input batch size did not match output batch size"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "Input batch size did not match output batch size"))
       << status.error_message();
 }
 
@@ -488,7 +492,7 @@ TEST(GetSignatures, MissingSignature) {
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Expected exactly one"))
+      str_util::StrContains(status.error_message(), "Expected exactly one"))
       << status.error_message();
 }
 
@@ -502,9 +506,9 @@ TEST(GetSignatures, WrongProtoInAny) {
   Signatures read_signatures;
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected Any type_url for: "
-                            "tensorflow.serving.Signatures"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected Any type_url for: "
+                                    "tensorflow.serving.Signatures"))
       << status.error_message();
 }
 
@@ -519,7 +523,7 @@ TEST(GetSignatures, JunkInAny) {
   Signatures read_signatures;
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Failed to unpack"))
       << status.error_message();
 }
 
@@ -567,7 +571,7 @@ TEST(GetSignatures, MultipleSignaturesNotOK) {
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Expected exactly one"))
+      str_util::StrContains(status.error_message(), "Expected exactly one"))
       << status.error_message();
 }
 
@@ -641,8 +645,8 @@ TEST(GetGenericSignature, WrongSignatureType) {
   const Status status =
       GetGenericSignature("generic_bindings", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a generic signature:"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a generic signature:"))
       << status.error_message();
 }
 
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index a83fc20596c8ad7e1cf94ede8b10d82e25f47b17..fdecceff526a860a274354e53e824b98d11418a6 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -130,15 +130,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 35c4b5bec172858b39dd4628a37e164efe87bdbf..345eb6cfaa67fd4cda6e7e3f01a1243bbf3c9fa1 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.signal.python.kernel_tests import test_util
 from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -173,6 +174,18 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  def test_num_spectrogram_bins_dynamic(self):
+    with self.test_session(use_gpu=True):
+      num_spectrogram_bins = array_ops.placeholder(shape=(),
+                                                   dtype=dtypes.int32)
+      mel_matrix_np = spectrogram_to_mel_matrix(
+          20, 129, 8000.0, 125.0, 3800.0)
+      mel_matrix = mel_ops.linear_to_mel_weight_matrix(
+          20, num_spectrogram_bins, 8000.0, 125.0, 3800.0)
+      self.assertAllClose(
+          mel_matrix_np,
+          mel_matrix.eval(feed_dict={num_spectrogram_bins: 129}), atol=3e-6)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
index 1c052354b8afcc5fd8a53b783cc5c676588cf48c..64cc8c7ea54673ac748be73e677575331d8e1cc9 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
@@ -338,7 +338,7 @@ class FrameTest(test.TestCase):
 
   def test_constant_folding(self):
     """frame should be constant foldable for constant inputs."""
-    for pad_end in [False, True]:
+    for pad_end in [True, False]:
       g = ops.Graph()
       with g.as_default():
         frame_length, frame_step = 32, 16
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index d1a36548d95cf44d2bf7e6108141aeb00853db04..1e84006116daa3f28c760037cb9eeafd53eaafb8 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -64,14 +64,11 @@ def _hertz_to_mel(frequencies_hertz, name=None):
         1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
 
 
-def _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+def _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype):
   """Checks the inputs to linear_to_mel_weight_matrix."""
   if num_mel_bins <= 0:
     raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
-  if num_spectrogram_bins <= 0:
-    raise ValueError('num_spectrogram_bins must be positive. Got: %s' %
-                     num_spectrogram_bins)
   if sample_rate <= 0.0:
     raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
   if lower_edge_hertz < 0.0:
@@ -122,9 +119,9 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
   Args:
     num_mel_bins: Python int. How many bands in the resulting mel spectrum.
-    num_spectrogram_bins: Python int. How many bins there are in the source
-      spectrogram data, which is understood to be `fft_size // 2 + 1`, i.e. the
-      spectrogram only contains the nonredundant FFT bins.
+    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
+      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
+      i.e. the spectrogram only contains the nonredundant FFT bins.
     sample_rate: Python float. Samples per second of the input signal used to
       create the spectrogram. We need this to figure out the actual frequencies
       for each spectrogram bin, which dictates how they are mapped into the mel
@@ -148,7 +145,10 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
   [mel]: https://en.wikipedia.org/wiki/Mel_scale
   """
   with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
-    _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
+    # and the validation is already done in linspace (both in shape function
+    # and in kernel), there is no need to validate num_spectrogram_bins here.
+    _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype)
 
     # To preserve accuracy, we compute the matrix at float64 precision and then
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index c2f106c2b28029f05648716bb08cd2531729fb36..516e3ea073268e9b113a1e13577551ccacbf4206 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -178,15 +178,3 @@ py_test(
         "//tensorflow/python:summary",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 2d9df8f27ee98431f51fd39c168325b8f625dce9..746b95564237617359afe1791484809369c4a894 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -94,7 +94,7 @@ of thin wrapper functions in
 [variables.py](https://www.tensorflow.org/code/tensorflow/contrib/framework/python/ops/variables.py)
 which allow callers to easily define variables.
 
-For example, to create a `weight` variable, initialize it using a truncated
+For example, to create a `weights` variable, initialize it using a truncated
 normal distribution, regularize it with an `l2_loss` and place it on the `CPU`,
 one need only declare the following:
 
@@ -290,9 +290,9 @@ slim.stack(x, slim.conv2d, [(32, [3, 3]), (32, [1, 1]), (64, [3, 3]), (64, [1, 1
 
 In addition to the types of scope mechanisms in TensorFlow
 ([name_scope](https://www.tensorflow.org/api_docs/python/tf/name_scope),
-[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope),
+[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope)),
 TF-Slim adds a new scoping mechanism called
-[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope),
+[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope).
 This new scope allows a user to specify one or more operations and a set of
 arguments which will be passed to each of the operations defined in the
 `arg_scope`. This functionality is best illustrated by example. Consider the
@@ -761,8 +761,8 @@ parts:
 3. Finalization: (optionally) perform any final operation to compute metric
 values. For example, computing means, mins, maxes, etc.
 
-For example, to compute `mean_absolute_error`, two variables, a `count` and
-`total` variable are *initialized* to zero. During *aggregation*, we observed
+For example, to compute `mean_absolute_error`, two variables (`count` and
+`total`) are *initialized* to zero. During *aggregation*, we observed
 some set of predictions and labels, compute their absolute differences and add
 the total to `total`. Each time we observe another value,
 `count` is incremented. Finally, during *finalization*, `total` is divided
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index 5daabbd62e7e63608a7a86a8b7fb0bc0d570b28b..eef043e83276dcdffe491ee9b981c8de0894f592 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -61,6 +61,7 @@ py_test(
     name = "dataset_data_provider_test",
     srcs = ["dataset_data_provider_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":dataset",
         ":dataset_data_provider",
@@ -193,15 +194,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index ad5e985487190e72b9eb2809da964f3d7b34ef94..99ad48763031cc2f98009449cea050fd90d01eb5 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -115,8 +115,8 @@ class ParallelReader(io_ops.ReaderBase):
     reader needs to start reading from a new file since it has finished with
     the previous file).
 
-    A queue runner for enqueing in the `common_queue` is automatically added to
-    the TF QueueRunners collection.
+    A queue runner for enqueuing in the `common_queue` is automatically added
+    to the TF QueueRunners collection.
 
     Args:
       queue: A Queue or a mutable string Tensor representing a handle
@@ -221,7 +221,7 @@ def parallel_read(data_sources,
         the data will be cycled through indefinitely.
     num_readers: a integer, number of Readers to create.
     reader_kwargs: an optional dict, of kwargs for the reader.
-    shuffle: boolean, wether should shuffle the files and the records by using
+    shuffle: boolean, whether should shuffle the files and the records by using
       RandomShuffleQueue as common_queue.
     dtypes:  A list of types.  The length of dtypes must equal the number
         of elements in each record. If it is None it will default to
diff --git a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
index 37e9c4754ca62fc02f9146632943a50c33f9423d..62bd20036126b41040ca4329c7f13ea7671a8045 100644
--- a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
+++ b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
@@ -36,9 +36,9 @@ def prefetch_queue(tensors,
                    dynamic_pad=False,
                    shared_name=None,
                    name=None):
-  """Creates a queue to prefetech tensors from `tensors`.
+  """Creates a queue to prefetch tensors from `tensors`.
 
-  A queue runner for enqueing tensors into the prefetch_queue is automatically
+  A queue runner for enqueuing tensors into the prefetch_queue is automatically
   added to the TF QueueRunners collection.
 
   Example:
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index b3b61e1dfe5671a7fbbee20b0c577ee5fad0fb9b..f2d31dc8db5688dc9a3308267109214277436040 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -124,7 +124,7 @@ class BoundingBox(ItemHandler):
     super(BoundingBox, self).__init__(self._full_keys)
 
   def tensors_to_item(self, keys_to_tensors):
-    """Maps the given dictionary of tensors to a contatenated list of bboxes.
+    """Maps the given dictionary of tensors to a concatenated list of bboxes.
 
     Args:
       keys_to_tensors: a mapping of TF-Example keys to parsed tensors.
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 3caf4e02da3aa2d7e586c4e76807a11f84585ea6..5cfd5ee82e2a0fce33311a8783d2d4ceb031544d 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -230,6 +230,7 @@ def evaluation_loop(master,
                     max_number_of_evaluations=None,
                     session_config=None,
                     timeout=None,
+                    timeout_fn=None,
                     hooks=None):
   """Runs TF-Slim's Evaluation Loop.
 
@@ -261,6 +262,9 @@ def evaluation_loop(master,
       configure the `Session`. If left as `None`, the default will be used.
     timeout: The maximum amount of time to wait between checkpoints. If left as
       `None`, then the process will wait indefinitely.
+    timeout_fn: Optional function to call after a timeout.  If the function
+      returns True, then it means that no new checkpoints will be generated and
+      the iterator will exit.  The function is called with no arguments.
     hooks: A list of additional `SessionRunHook` objects to pass during
       repeated evaluations.
 
@@ -298,4 +302,5 @@ def evaluation_loop(master,
       hooks=all_hooks,
       config=session_config,
       max_number_of_evaluations=max_number_of_evaluations,
-      timeout=timeout)
+      timeout=timeout,
+      timeout_fn=timeout_fn)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index c24bd048512daaae116e732ac437f7c9b6f6d7fc..94fc12ca814721acf62f16b72ffa50473043cc8b 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -177,6 +177,17 @@ class EvaluationTest(test.TestCase):
     # The timeout kicked in.
     self.assertLess(end, start + 1.1)
 
+  def testTimeoutFnOnEvaluationLoop(self):
+    # We require a mutable object (e.g. list but not an int) to maintain state
+    # across calls of a nested function.
+    timeout_fn_calls = [0]
+    def _TimeoutFn():
+      timeout_fn_calls[0] += 1
+      return timeout_fn_calls[0] >= 3
+    # Need not do any evaluation, but should just call timeout_fn repeatedly.
+    evaluation.evaluation_loop('', '', '', timeout=0, timeout_fn=_TimeoutFn)
+    self.assertEqual(timeout_fn_calls[0], 3)
+
   def testMonitorCheckpointsLoopTimeout(self):
     ret = list(
         evaluation_lib.checkpoints_iterator(
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 6a200de1ea172b4ccb38c0f5d889566ccaeef893..8a2c74742a8ebbfdca702943ee2f631531c7b2ca 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -389,7 +389,7 @@ def create_train_op(total_loss,
     total_loss: A `Tensor` representing the total loss.
     optimizer: A tf.Optimizer to use for computing the gradients.
     global_step: A `Tensor` representing the global step variable. If left as
-      `_USE_GLOBAL_STEP`, then slim.variables.global_step() is used.
+      `_USE_GLOBAL_STEP`, then tf.contrib.framework.global_step() is used.
     update_ops: An optional list of updates to execute. If `update_ops` is
       `None`, then the update ops are set to the contents of the
       `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but
@@ -578,7 +578,8 @@ def train(train_op,
     is_chief: Specifies whether or not the training is being run by the primary
       replica during replica training.
     global_step: The `Tensor` representing the global step. If left as `None`,
-      then slim.variables.get_or_create_global_step() is used.
+      then training_util.get_or_create_global_step(), that is,
+      tf.contrib.framework.global_step() is used.
     number_of_steps: The max number of gradient steps to take during training,
       as measured by 'global_step': training will stop if global_step is
       greater than 'number_of_steps'. If the value is left as None, training
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 7f03aaf085cf26e3f5f940f4388828006a02ef42..8bbdf96384683c68648367c6433eeb89c64c22bf 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -317,15 +317,3 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
index 235a595de49f956e1df740fd821936c80eefaa55..11c4214176a8e3d69065066bb5ac4d668da10574 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
@@ -207,7 +207,7 @@ def resnet_v1(inputs,
         net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers.conv2d(
               net,
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
index 61665c9c8ba7817377a16bf3f2673447cab0518e..19e0538dd1e272b2bcaada3c83944e9c9c1f9eef 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
@@ -221,7 +221,7 @@ def resnet_v2(inputs,
             net, activation_fn=nn_ops.relu, scope='postnorm')
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers_lib.conv2d(
               net,
diff --git a/tensorflow/contrib/slim/python/slim/summaries.py b/tensorflow/contrib/slim/python/slim/summaries.py
index 358359d6ebeea209fe83f7282f47db8be63747ac..a7dc3f6723a0d1a55dd3e8dce006e6acce083e6f 100644
--- a/tensorflow/contrib/slim/python/slim/summaries.py
+++ b/tensorflow/contrib/slim/python/slim/summaries.py
@@ -144,7 +144,7 @@ def add_zero_fraction_summary(tensor, name=None, prefix=None,
     A scalar `Tensor` of type `string` whose contents are the serialized
     `Summary` protocol buffer.
   """
-  name = _get_summary_name(tensor, name, prefix, 'Fraction of Zero Values')
+  name = _get_summary_name(tensor, name, prefix, 'Fraction_of_Zero_Values')
   tensor = nn.zero_fraction(tensor)
   return add_scalar_summary(tensor, name, print_summary=print_summary)
 
diff --git a/tensorflow/contrib/solvers/BUILD b/tensorflow/contrib/solvers/BUILD
index 87b67486ad413ad537aa8cb68f9f7bef729dd488..5247288d54aaf4e3020d38618b74f1118a69a105 100644
--- a/tensorflow/contrib/solvers/BUILD
+++ b/tensorflow/contrib/solvers/BUILD
@@ -93,16 +93,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-# All files
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/solvers/python/ops/least_squares.py b/tensorflow/contrib/solvers/python/ops/least_squares.py
index fb7c0eb649c5216736b239d1a423cdaf7079f582..6e164f53420675d149ded6c1f42ca87bd89b158c 100644
--- a/tensorflow/contrib/solvers/python/ops/least_squares.py
+++ b/tensorflow/contrib/solvers/python/ops/least_squares.py
@@ -33,7 +33,7 @@ def cgls(operator, rhs, tol=1e-6, max_iter=20, name="cgls"):
   r"""Conjugate gradient least squares solver.
 
   Solves a linear least squares problem \\(||A x - rhs||_2\\) for a single
-  righ-hand side, using an iterative, matrix-free algorithm where the action of
+  right-hand side, using an iterative, matrix-free algorithm where the action of
   the matrix A is represented by `operator`. The CGLS algorithm implicitly
   applies the symmetric conjugate gradient algorithm to the normal equations
   \\(A^* A x = A^* rhs\\). The iteration terminates when either
diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py
index d791d467639b572e7831c1d1a582aa15585649b6..9305c6a11c4ec898c82553773e8e7277a54ab82e 100644
--- a/tensorflow/contrib/solvers/python/ops/linear_equations.py
+++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py
@@ -41,7 +41,7 @@ def conjugate_gradient(operator,
   r"""Conjugate gradient solver.
 
   Solves a linear system of equations `A*x = rhs` for selfadjoint, positive
-  definite matrix `A` and righ-hand side vector `rhs`, using an iterative,
+  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
   matrix-free algorithm where the action of the matrix A is represented by
   `operator`. The iteration terminates when either the number of iterations
   exceeds `max_iter` or when the residual norm has been reduced to `tol`
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index fcfaa2aba4e8ab086a9eac053188f8fbd4f6f39a..b729fff261192be22c6a56fa9ca0a641f302c570 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -65,15 +65,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/sparsemax/__init__.py b/tensorflow/contrib/sparsemax/__init__.py
index 19d213fb3e8f615190d67862b1928205f31146b4..7bc726f4a84d683517b73814193429220f864735 100644
--- a/tensorflow/contrib/sparsemax/__init__.py
+++ b/tensorflow/contrib/sparsemax/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Module that implements sparsemax and sparsemax loss, see [1].
 
-[1] https://arxiv.org/abs/1602.02068
+[1]: https://arxiv.org/abs/1602.02068
 
 ## Sparsemax
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 890ca20f4cabd65146e803e54e554a5c97e72427..e617af2ff1b731eddb5b72469a1cd67e7cfd163f 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -31,7 +31,7 @@ def sparsemax(logits, name=None):
   """Computes sparsemax activations [1].
 
   For each batch `i` and class `j` we have
-    sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)
+    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
 
   [1]: https://arxiv.org/abs/1602.02068
 
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index 084953a0a226cde46ebd9d2031d20cb839180ca8..055b04db8a5654ebf6fee45547d58f0375f9a554 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -60,15 +60,3 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/staging/BUILD b/tensorflow/contrib/staging/BUILD
index bc4a289468c257e7e5e2bd437b8d6d1235980495..0c86f3db1d5bc262f27440754c86f8c63e16b690 100644
--- a/tensorflow/contrib/staging/BUILD
+++ b/tensorflow/contrib/staging/BUILD
@@ -6,18 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "staging",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 5fd02efbf6327b20eade6785007930eed3fd4e03..30be14c10cd8576ded75b8489cc89d439a9cc282 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,16 +31,5 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = ["no_windows"],
 )
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index 6e259e1d32be64f3b593faf73e8af4f704d72349..dcbef2881df7b5543d664c4b385927f52ae2cbaa 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -38,15 +38,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
index ca937546f50df46b7e5b1144dcbdc380cb04ca9b..0cca40f071c889773736ce009b32ba17728041ce 100644
--- a/tensorflow/contrib/stateless/__init__.py
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -22,6 +22,7 @@ WARNING: These ops are in contrib, and are not stable.  They should be
 consistent across multiple runs on the same hardware, but only for the same
 version of the code.
 
+@@stateless_multinomial
 @@stateless_random_uniform
 @@stateless_random_normal
 @@stateless_truncated_normal
@@ -37,6 +38,7 @@ from tensorflow.contrib.stateless.gen_stateless_random_ops import *
 from tensorflow.python.framework import ops
 from tensorflow.python.util.all_util import remove_undocumented
 
+ops.NotDifferentiable("StatelessMultinomial")
 ops.NotDifferentiable("StatelessRandomNormal")
 ops.NotDifferentiable("StatelessRandomUniform")
 ops.NotDifferentiable("StatelessTruncatedNormal")
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index bea6341cfdcf7d56f255bec275b7861228e44e12..d724a5c014d2f9f5f6e3a6704341bcb8c429ae06 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -96,6 +96,52 @@ class StatelessOpsTest(test.TestCase):
               for s1, v1 in values:
                 self.assertEqual(s0 == s1, np.all(v0 == v1))
 
+  def testMatchStatefulMultinomial(self):
+    # Stateless ops should be the same as stateful ops on the first call
+    # after seed scrambling.
+    key = 0x3ec8f720, 0x02461e29
+    num_samples = 4
+    for logits_dtype in np.float16, np.float32, np.float64:
+      for output_dtype in dtypes.int32, dtypes.int64:
+        for seed in (7, 17), (11, 5), (2, 3):
+          preseed = invert_philox(key,
+                                  (seed[0], 0, seed[1], 0)).astype(np.uint64)
+          preseed = preseed[::2] | preseed[1::2] << 32
+          random_seed.set_random_seed(seed[0])
+          with self.test_session(use_gpu=True):
+            for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                      [0.25, 0.75]]):
+              logits_t = constant_op.constant(logits, dtype=logits_dtype)
+              stateful = random_ops.multinomial(
+                  logits_t,
+                  num_samples,
+                  seed=seed[1],
+                  output_dtype=output_dtype)
+              pure = stateless.stateless_multinomial(
+                  logits_t,
+                  num_samples,
+                  seed=preseed,
+                  output_dtype=output_dtype)
+              self.assertAllEqual(stateful.eval(), pure.eval())
+
+  def testDeterminismMultinomial(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    num_samples = 10
+    with self.test_session(use_gpu=True):
+      for seed_type in [dtypes.int32, dtypes.int64]:
+        seed_t = array_ops.placeholder(seed_type, shape=[2])
+        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+        for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                  [0.25, 0.75]]):
+          pure = stateless.stateless_multinomial(
+              logits, num_samples, seed=seed_t)
+          values = [
+              (seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds
+          ]
+          for s0, v0 in values:
+            for s1, v1 in values:
+              self.assertEqual(s0 == s1, np.all(v0 == v1))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index b58c83fdaf574fb349fac57c922f1178b7d13b66..f88b03ec4c2b1f250091594ea12d7d1862029fa2 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -10,18 +10,11 @@ load(
     "tf_gen_op_wrapper_py",
 )
 
-tf_gen_op_wrapper_py(
-    name = "gen_summary_ops",
-    out = "gen_summary_ops.py",
-    deps = ["//tensorflow/core:summary_ops_op_lib"],
-)
-
 py_test(
     name = "summary_ops_test",
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -29,6 +22,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -41,7 +35,6 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -50,31 +43,9 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "summary_ops",
-    srcs = ["summary_ops.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":gen_summary_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python:variables",
         "@six_archive//:six",
     ],
 )
@@ -85,22 +56,10 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":summary_ops",
+        "//tensorflow/python:summary_ops_v2",
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # NOTE: target cannot be testonly because it needs to be in the pip
 # package. Sigh.
 py_library(
@@ -110,8 +69,10 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:summary_ops_v2",
         "@org_sqlite//:python",
     ],
 )
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 2d6d7ea6a3eff2562ba8def4117e3aa6f818b6fd..99ced53e1167ec5486d0b75cff81ffbf857c2be7 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -61,23 +61,23 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
-from tensorflow.contrib.summary.summary_ops import all_summary_ops
-from tensorflow.contrib.summary.summary_ops import always_record_summaries
-from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_db_writer
-from tensorflow.contrib.summary.summary_ops import create_file_writer
-from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
-from tensorflow.contrib.summary.summary_ops import eval_dir
-from tensorflow.contrib.summary.summary_ops import flush
-from tensorflow.contrib.summary.summary_ops import generic
-from tensorflow.contrib.summary.summary_ops import graph
-from tensorflow.contrib.summary.summary_ops import histogram
-from tensorflow.contrib.summary.summary_ops import image
-from tensorflow.contrib.summary.summary_ops import import_event
-from tensorflow.contrib.summary.summary_ops import initialize
-from tensorflow.contrib.summary.summary_ops import never_record_summaries
-from tensorflow.contrib.summary.summary_ops import record_summaries_every_n_global_steps
-from tensorflow.contrib.summary.summary_ops import scalar
-from tensorflow.contrib.summary.summary_ops import should_record_summaries
-from tensorflow.contrib.summary.summary_ops import summary_writer_initializer_op
-from tensorflow.contrib.summary.summary_ops import SummaryWriter
+from tensorflow.python.ops.summary_ops_v2 import all_summary_ops
+from tensorflow.python.ops.summary_ops_v2 import always_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import audio
+from tensorflow.python.ops.summary_ops_v2 import create_db_writer
+from tensorflow.python.ops.summary_ops_v2 import create_file_writer
+from tensorflow.python.ops.summary_ops_v2 import create_summary_file_writer
+from tensorflow.python.ops.summary_ops_v2 import eval_dir
+from tensorflow.python.ops.summary_ops_v2 import flush
+from tensorflow.python.ops.summary_ops_v2 import generic
+from tensorflow.python.ops.summary_ops_v2 import graph
+from tensorflow.python.ops.summary_ops_v2 import histogram
+from tensorflow.python.ops.summary_ops_v2 import image
+from tensorflow.python.ops.summary_ops_v2 import import_event
+from tensorflow.python.ops.summary_ops_v2 import initialize
+from tensorflow.python.ops.summary_ops_v2 import never_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import record_summaries_every_n_global_steps
+from tensorflow.python.ops.summary_ops_v2 import scalar
+from tensorflow.python.ops.summary_ops_v2 import should_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import summary_writer_initializer_op
+from tensorflow.python.ops.summary_ops_v2 import SummaryWriter
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 2b7806f80d020e0064b0f5cf32fd765a9ee993d1..ae8336daaf8ea9113716b90b6ea9be9de7303596 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -16,27 +16,220 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
 
 get_all = summary_test_util.get_all
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class GraphFileTest(test_util.TensorFlowTestCase):
+
+  def testSummaryOps(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.generic('tensor', 1, step=1)
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.histogram('histogram', [1.0], step=1)
+      summary_ops.image('image', [[[[1.0]]]], step=1)
+      summary_ops.audio('audio', [[1.0]], 1.0, 1, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    # The working condition of the ops is tested in the C++ test so we just
+    # test here that we're calling them correctly.
+    self.assertTrue(gfile.Exists(logdir))
+
+  def testSummaryName(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scalar', events[1].summary.value[0].tag)
+
+  def testSummaryNameScope(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      with ops.name_scope('scope'):
+        summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scope/scalar', events[1].summary.value[0].tag)
+
+  def testSummaryGlobalStep(self):
+    training_util.get_or_create_global_step()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(summary_ops.summary_writer_initializer_op())
+      step, _ = sess.run(
+          [training_util.get_global_step(), summary_ops.all_summary_ops()])
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(step, events[1].step)
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=1, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=999999, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+      flush_op = summary_ops.flush()
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(flush_op)
+      self.assertEqual(2, get_total())
+      # Test "writer" parameter
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer))
+      self.assertEqual(3, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer._resource))  # pylint:disable=protected-access
+      self.assertEqual(4, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+      # Create with different shared name (should be separate resource/file)
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+
+    with self.test_session() as sess:
+      # Run init ops across writers sequentially to avoid race condition.
+      # TODO(nickfelt): fix race condition in resource manager lookup or create
+      sess.run(writer1.init())
+      sess.run(writer2.init())
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer3.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run([writer1.flush(), writer2.flush(), writer3.flush()])
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['one', 'two'], tags)
+
+    # Second file has tag "three"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['three'], tags)
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      # Running init() again while writer is open has no effect
+      sess.run(writer.init())
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Running close() should do an implicit flush
+      sess.run(writer.close())
+      self.assertEqual(2, get_total())
+      # Running init() on a closed writer should start a new file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(writer.close())
+      files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+      self.assertEqual(2, len(files))
+      self.assertEqual(2, len(summary_test_util.events_from_file(files[1])))
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(writer.flush())
+      self.assertEqual(2, get_total())
+
+
+class GraphDbTest(summary_test_util.SummaryDbTest):
 
   def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
     with self.assertRaises(TypeError):
@@ -85,6 +278,38 @@ class DbTest(summary_test_util.SummaryDbTest):
           self.assertEqual(len(events), 2)
           self.assertEqual(events[1].summary.value[0].tag, 'my_scalar')
 
+  def testScalarSummaryNameScope(self):
+    """Test record_summaries_every_n_global_steps and all_summaries()."""
+    with ops.Graph().as_default(), self.test_session() as sess:
+      global_step = training_util.get_or_create_global_step()
+      global_step.initializer.run()
+      with ops.device('/cpu:0'):
+        step_increment = state_ops.assign_add(global_step, 1)
+      sess.run(step_increment)  # Increment global step from 0 to 1
+
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(logdir, max_queue=0,
+                                          name='t2').as_default():
+        with summary_ops.record_summaries_every_n_global_steps(2):
+          summary_ops.initialize()
+          with ops.name_scope('scope'):
+            summary_op = summary_ops.scalar('my_scalar', 2.0)
+
+          # Neither of these should produce a summary because
+          # global_step is 1 and "1 % 2 != 0"
+          sess.run(summary_ops.all_summary_ops())
+          sess.run(summary_op)
+          events = summary_test_util.events_from_logdir(logdir)
+          self.assertEqual(len(events), 1)
+
+          # Increment global step from 1 to 2 and check that the summary
+          # is now written
+          sess.run(step_increment)
+          sess.run(summary_ops.all_summary_ops())
+          events = summary_test_util.events_from_logdir(logdir)
+          self.assertEqual(len(events), 2)
+          self.assertEqual(events[1].summary.value[0].tag, 'scope/my_scalar')
+
   def testSummaryGraphModeCond(self):
     with ops.Graph().as_default(), self.test_session():
       training_util.get_or_create_global_step()
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index bb7215f879411e91a1c47b87f5caede63fffea74..f1ef218e74bbd225071324a8269fdfeb5de0e038 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -16,12 +16,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import numpy as np
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -29,9 +30,11 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
@@ -56,7 +59,7 @@ _NUMPY_NUMERIC_TYPES = {
 }
 
 
-class TargetTest(test_util.TensorFlowTestCase):
+class EagerFileTest(test_util.TensorFlowTestCase):
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -107,6 +110,20 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
+  def testSummaryNameScope(self):
+    training_util.get_or_create_global_step()
+    logdir = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logdir, max_queue=0,
+        name='t2').as_default(), summary_ops.always_record_summaries():
+
+      with ops.name_scope('scope'):
+        summary_ops.scalar('scalar', 2.0)
+
+      events = summary_test_util.events_from_logdir(logdir)
+      self.assertEqual(len(events), 2)
+      self.assertEqual(events[1].summary.value[0].tag, 'scope/scalar')
+
   def testSummaryGlobalStep(self):
     step = training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
@@ -123,21 +140,22 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
-        logs, max_queue=2, flush_millis=999999,
+        logs, max_queue=1, flush_millis=999999,
         name='lol').as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
       summary_ops.scalar('scalar', 2.0, step=1)
       self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
       summary_ops.scalar('scalar', 2.0, step=2)
       self.assertEqual(3, get_total())
 
-  def testFlush(self):
+  def testFlushFunction(self):
     logs = tempfile.mkdtemp()
-    with summary_ops.create_file_writer(
-        logs, max_queue=999999, flush_millis=999999,
-        name='lol').as_default(), summary_ops.always_record_summaries():
+    writer = summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999, name='lol')
+    with writer.as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
@@ -146,9 +164,103 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(1, get_total())
       summary_ops.flush()
       self.assertEqual(3, get_total())
+      # Test "writer" parameter
+      summary_ops.scalar('scalar', 2.0, step=3)
+      summary_ops.flush(writer=writer)
+      self.assertEqual(4, get_total())
+      summary_ops.scalar('scalar', 2.0, step=4)
+      summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+      self.assertEqual(5, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        summary_ops.flush()
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+        summary_ops.flush()
+      # Create with different shared name (should be separate resource/file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+        summary_ops.flush()
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('one', next(events).summary.value[0].tag)
+    self.assertEqual('two', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file has tag "three"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('three', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, get_total())
+      try:
+        # Not using .as_default() to avoid implicit flush when exiting
+        writer.set_as_default()
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, get_total())
+        # Calling init() on a closed writer should start a new file
+        time.sleep(1.1)  # Ensure filename has a different timestamp
+        writer.init()
+        files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+        self.assertEqual(2, len(files))
+        get_total = lambda: len(summary_test_util.events_from_file(files[1]))
+        self.assertEqual(1, get_total())  # file_version Event
+        summary_ops.scalar('two', 2.0, step=2)
+        writer.close()
+        self.assertEqual(2, get_total())
+      finally:
+        # Clean up by resetting default writer
+        summary_ops.create_file_writer(None).set_as_default()
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.scalar('two', 2.0, step=2)
+      # Exiting the "as_default()" should do an implicit flush of the "two" tag
+      self.assertEqual(3, get_total())
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
deleted file mode 100644
index d0d3384735fb1eb1a048c7aa6da0037ee9fc6936..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/summary/summary_test_internal.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Internal helpers for tests in this directory."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-
-import sqlite3
-
-from tensorflow.contrib.summary import summary_ops
-from tensorflow.python.framework import test_util
-
-
-class SummaryDbTest(test_util.TensorFlowTestCase):
-  """Helper for summary database testing."""
-
-  def setUp(self):
-    super(SummaryDbTest, self).setUp()
-    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
-    if os.path.exists(self.db_path):
-      os.unlink(self.db_path)
-    self.db = sqlite3.connect(self.db_path)
-    self.create_db_writer = functools.partial(
-        summary_ops.create_db_writer,
-        db_uri=self.db_path,
-        experiment_name='experiment',
-        run_name='run',
-        user_name='user')
-
-  def tearDown(self):
-    self.db.close()
-    super(SummaryDbTest, self).tearDown()
-
-
-def get_one(db, q, *p):
-  return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
-  return unroll(db.execute(q, p).fetchall())
-
-
-def unroll(list_of_tuples):
-  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 8506c4be9c4ca8305b62da17c7246e6e18313bd3..b4ae43302cb22ad17c04050eb84433c470757bf1 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -24,10 +24,10 @@ import os
 
 import sqlite3
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.core.util import event_pb2
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 1e4cc3f0952ef74a1c89b7ed2d8c357fa8847ad5..136856c0156c41046f9af61cdd6e3d5f8213309e 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -16,20 +16,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "kernels/v4/*",
-            "proto/*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # ---------------------------------- V2 ops ------------------------------------------#
 filegroup(
     name = "v2_op_sources",
@@ -553,7 +539,6 @@ py_test(
     srcs = ["client/random_forest_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_windows",
         "nomac",  # b/63258195
         "notsan",
     ],
diff --git a/tensorflow/contrib/tensor_forest/README.md b/tensorflow/contrib/tensor_forest/README.md
index 8b24430c71c16c2ed6b2e1a530e19fbc9ebb1698..9e1491ea666b51ba0d367610778c659c543dacf6 100644
--- a/tensorflow/contrib/tensor_forest/README.md
+++ b/tensorflow/contrib/tensor_forest/README.md
@@ -116,7 +116,7 @@ a different `feature_bagging_fraction * num_features` sized subset of the
 input features.  Defaults to 1.0 (no feature bagging).
 
 * `base_random_seed`.  By default (`base_random_seed = 0`), the random number
-generator for each tree is seeded by the current time (in microseconds) when
+generator for each tree is seeded by a 64-bit random value when
 each tree is first created.  Using a non-zero value causes tree training to
 be deterministic, in that the i-th tree's random number generator is seeded
 with the value `base_random_seed + i`.
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index 90033015ebc5e44ea70fbf2bc9735d0aeb4ec27d..e893e1d1c836cc7feef15757dde79d0db362cbaf 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -37,7 +37,7 @@ def _top_k_generator(k):
   def _top_k(probabilities, targets):
     targets = math_ops.to_int32(targets)
     if targets.get_shape().ndims > 1:
-      targets = array_ops.squeeze(targets, squeeze_dims=[1])
+      targets = array_ops.squeeze(targets, axis=[1])
     return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k))
   return _top_k
 
@@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None):
 
 
 def _squeeze_and_onehot(targets, depth):
-  targets = array_ops.squeeze(targets, squeeze_dims=[1])
+  targets = array_ops.squeeze(targets, axis=[1])
   return array_ops.one_hot(math_ops.to_int32(targets), depth)
 
 
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 4abcc20ed334e706c8ae59e2127dfd6f4e152361..35e8c92aba325d9115c7ee566363a1625e6e76fc 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -399,7 +399,7 @@ def get_combined_model_fn(model_fns):
   training ops: tf.group them.
   loss: average them.
   predictions: concat probabilities such that predictions[*][0-C1] are the
-    probablities for output 1 (where C1 is the number of classes in output 1),
+    probabilities for output 1 (where C1 is the number of classes in output 1),
     predictions[*][C1-(C1+C2)] are the probabilities for output 2 (where C2
     is the number of classes in output 2), etc.  Also stack predictions such
     that predictions[i][j] is the class prediction for example i and output j.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD
index a2a3b485f6aa0ae827bbaa7812823730bd8db3b8..b7185e09c70fbeb33ed559cde1dfeaf348a7e126 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/BUILD
+++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD
@@ -11,18 +11,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index cf0db788a419f64ed891df8aa097fa8826f6de91..06bfe871fdff29fcdac1a3a50500b94dbd2998d7 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -80,7 +80,7 @@ REGISTER_OP("HardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index c9df09bfda44e665ed013da383e1e9a2c665c454..1a055756c084016b3862ad131937e4d400f26d55 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -85,7 +85,7 @@ REGISTER_OP("StochasticHardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index b0d8b832b5437db7a4b3026e80ae99d0391d7f7a..7d092bbc24d5c6611fed431c04200b2f950887f7 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -81,7 +81,7 @@ REGISTER_OP("StochasticHardRoutingGradient")
   tree_biases: `tree_biases[i]` gives the bias of the logistic
    regression model that translates from node features to
    probabilities.
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
index ff3ab21eaa9a4aa823f2ae7d3dd39674abea3d2a..745a5b1caf2fe348f1b276ccc245aa2ef350a62e 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
@@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer):
 
       # There is always one activation per instance by definition, so squeeze
       # away the extra dimension.
-      return array_ops.squeeze(nn_activations, squeeze_dims=[1])
+      return array_ops.squeeze(nn_activations, axis=[1])
 
 
 class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer):
diff --git a/tensorflow/contrib/tensor_forest/kernels/data_spec.h b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
index 0a3abe56dfc4f611ac8ed0815e4c74a639d2477e..bb33400214e5ef37be73b538455eecf5ae481db4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/data_spec.h
+++ b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
@@ -21,6 +21,7 @@
 
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace tensorforest {
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 44997ec5d6d5fdb9aab52ab7a50f46a731bfda66..cefcc960510293c15569391091e5c1b4611c8835 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -421,7 +421,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
                            const std::vector<float>& mu2) {
   // Math time!!
   // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface.
-  // Using Langrange multipliers, we get
+  // Using Lagrange multipliers, we get
   //   partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x
   //   partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y
   // or
@@ -485,7 +485,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
   }
 
   double sdiscrim = sqrt(discrim);
-  // TODO(thomaswc): Analyze whetever one of these is always closer.
+  // TODO(thomaswc): Analyze whatever one of these is always closer.
   double v1 = (-b + sdiscrim) / (2 * a);
   double v2 = (-b - sdiscrim) / (2 * a);
   double dist1 = getDistanceFromLambda3(v1, mu1, mu2);
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index edbac6700677633cbd4d41f7040b4859ca599c4a..03aab1b61ee58a647edb24f6b97e517a411e996c 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -123,7 +123,7 @@ bool BestSplitDominatesRegression(const Tensor& total_sums,
                                   const Tensor& split_squares,
                                   int32 accumulator);
 
-// Performs booststrap_samples bootstrap samples of the best split's class
+// Performs bootstrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
 // least dominate_fraction of the time, the former has a better (lower)
 // Gini impurity.  Does not take over ownership of *rand.
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
index 794b76d8583c3608d540d34a5aaf1d1a799f35e3..b1b1559383a1d26a80d4974e2773f5b27ce1f2be 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
@@ -11,11 +11,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(["**/*"]),
-)
-
 DECISION_TREE_RESOURCE_DEPS = [
     ":decision_node_evaluator",
     ":input_data",
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index 328af28725af016e90b30ae2d303ffba15c81c1f..d3edb43733761a906c6e5bf8b65f76e3e1ae56fc 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -60,7 +60,7 @@ class DecisionTreeResource : public ResourceBase {
   mutex* get_mutex() { return &mu_; }
 
   // Return the TreeNode for the leaf that the example ends up at according
-  // to decsion_tree_. Also fill in that leaf's depth if it isn't nullptr.
+  // to decision_tree_. Also fill in that leaf's depth if it isn't nullptr.
   int32 TraverseTree(const std::unique_ptr<TensorDataSet>& input_data,
                      int example, int32* depth, TreePath* path) const;
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index bf2b2aaa3c8f433ab4fc145217857112f7a0a579..3db351c328c73beb94d6994aa503e3e2c4c06390 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -60,7 +60,7 @@ class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
   bool include_equals_;
 };
 
-// Evalutor for splits with multiple weighted features.
+// Evaluator for splits with multiple weighted features.
 class ObliqueInequalityDecisionNodeEvaluator
     : public BinaryDecisionNodeEvaluator {
  public:
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
index da600d34eacdf27514709240723e5bb730cfe7f0..63d4d9ba50603f65cc822ea74c97b923c29fea35 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 namespace tensorforest {
@@ -122,9 +123,8 @@ ClassificationStats::ClassificationStats(const TensorForestParams& params,
     right_gini_.reset(new RunningGiniScores());
   }
 
-  uint64 time_seed = static_cast<uint64>(std::clock());
   single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-      new random::PhiloxRandom(time_seed));
+      new random::PhiloxRandom(random::New64()));
   rng_ = std::unique_ptr<random::SimplePhilox>(
       new random::SimplePhilox(single_rand_.get()));
 }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index c544a8c75e9bfe8fe6bbea8913e7be17d868bfef..95f75b4d7e6a961edf6b3da1dc1712e7ddaacf31 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -23,6 +23,7 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
 namespace tensorflow {
@@ -44,18 +45,20 @@ class TensorDataSet {
     int column_count = 0;
     for (int i = 0; i < input_spec_.dense_size(); ++i) {
       for (int j = 0; j < input_spec_.dense(i).size(); ++j) {
-        decision_trees::FeatureId id;
-        id.mutable_id()->set_value(strings::StrCat(column_count));
-        available_features_.push_back(id);
         ++column_count;
       }
     }
+    available_features_.reserve(column_count);
+    decision_trees::FeatureId id;
+    for (int i = 0; i < column_count; i++) {
+      id.mutable_id()->set_value(strings::StrCat(i));
+      available_features_.emplace_back(id);
+    }
 
     // Set up the random number generator.
     if (split_sampling_random_seed_ == 0) {
-      uint64 time_seed = static_cast<uint64>(std::clock());
       single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(time_seed));
+          new random::PhiloxRandom(random::New64()));
     } else {
       single_rand_ = std::unique_ptr<random::PhiloxRandom>(
           new random::PhiloxRandom(split_sampling_random_seed_));
diff --git a/tensorflow/contrib/tensor_forest/ops/model_ops.cc b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
index 3099cccdf8b1b7b1d49d12ec454f9a0dc7325e9b..98124d519c761907ed0e62ee2a396ef84716a911 100644
--- a/tensorflow/contrib/tensor_forest/ops/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
@@ -165,7 +165,7 @@ tree_handle: The handle to the tree.
 leaf_ids: `leaf_ids[i]` is the leaf id for input i.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 )doc");
 
diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
index e8b5c5d8a6efae83b9e408ed3d05ac72d848ef7f..5be581aaec4cab342ef3fd49fa0294e5e702ba1c 100644
--- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
@@ -75,7 +75,7 @@ REGISTER_OP("GrowTreeV4")
     .Attr("params: string")
     .Input("tree_handle: resource")
     .Input("stats_handle: resource")
-    .Input("finshed_nodes: int32")
+    .Input("finished_nodes: int32")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(R"doc(
 Grows the tree for finished nodes and allocates waiting nodes.
@@ -83,7 +83,7 @@ Grows the tree for finished nodes and allocates waiting nodes.
 params: A serialized TensorForestParams proto.
 tree_handle: The handle to the tree.
 stats_handle: The handle to the stats.
-finshed_nodes: A 1-d Tensor of finished node ids from ProcessInput.
+finished_nodes: A 1-d Tensor of finished node ids from ProcessInput.
 )doc");
 
 REGISTER_OP("ProcessInputV4")
@@ -119,7 +119,7 @@ sparse_input_values: The values tensor from the SparseTensor input.
 sparse_input_shape: The shape tensor from the SparseTensor input.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 finished_nodes: A 1-d tensor of node ids that have finished and are ready to
   grow.
diff --git a/tensorflow/contrib/tensor_forest/proto/BUILD b/tensorflow/contrib/tensor_forest/proto/BUILD
index 1cfef44af1aaee3c105664398200524f2770f7d7..04fd6a9839509d2d02b7cf947acc4505c28cbdcd 100644
--- a/tensorflow/contrib/tensor_forest/proto/BUILD
+++ b/tensorflow/contrib/tensor_forest/proto/BUILD
@@ -6,14 +6,6 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "fertile_stats_proto",
     srcs = ["fertile_stats.proto"],
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 3650b5d52fe8a1b87a239d41ecfa3de677fffc72..7a35a70bbe3112e0649cefd8116cc50565978da5 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -212,7 +212,7 @@ class ForestHParams(object):
     self.regression = getattr(self, 'regression', False)
 
     # Num_outputs is the actual number of outputs (a single prediction for
-    # classification, a N-dimenensional point for regression).
+    # classification, a N-dimensional point for regression).
     self.num_outputs = self.num_classes if self.regression else 1
 
     # Add an extra column to classes for storing counts, which is needed for
@@ -445,7 +445,7 @@ class RandomForestGraphs(object):
           mask = math_ops.less(
               r, array_ops.ones_like(r) * self.params.bagging_fraction)
           gather_indices = array_ops.squeeze(
-              array_ops.where(mask), squeeze_dims=[1])
+              array_ops.where(mask), axis=[1])
           # TODO(thomaswc): Calculate out-of-bag data and labels, and store
           # them for use in calculating statistics later.
           tree_data = array_ops.gather(processed_dense_features, gather_indices)
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index 2e0a46ffe432341a423ac159deb7745d9ef15374..2b6a2b2f3c711f48812063e98e05735e2d9b4141 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -9,11 +9,11 @@ exports_files(["LICENSE"])
 
 # For platform specific build config
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
-    go_api_version = 2,
     visibility = ["//visibility:public"],
 )
 
@@ -82,6 +82,7 @@ py_test(
     size = "small",
     srcs = ["plugins/trace/trace_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":trace",
         "//tensorflow/python:client_testlib",
@@ -89,15 +90,3 @@ py_test(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 4175d8adb58a85728519042a9870e8c4590232ba..3f6b4cdc9ad10f5089f28af35a8be408918c7f90 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -135,9 +135,3 @@ tf_cc_binary(
         "//tensorflow/core/lib/db:sqlite",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 85b3e7231bcb433e9510522597c03c5f764f06cf..3f24f58f03aac2ba6d368d7eccf8731f611a81b4 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -132,7 +132,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
   Status WriteEvent(std::unique_ptr<Event> event) override {
     mutex_lock ml(mu_);
     queue_.emplace_back(std::move(event));
-    if (queue_.size() >= max_queue_ ||
+    if (queue_.size() > max_queue_ ||
         env_->NowMicros() - last_flush_ > 1000 * flush_millis_) {
       return InternalFlush();
     }
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index c61b4655961664a6c9c22a5f6d6f26a55c34bfcd..cd3f712256f2293ed725745f8cbe48109856ef86 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/event.pb.h"
@@ -58,7 +59,7 @@ class SummaryFileWriterTest : public ::testing::Test {
     TF_CHECK_OK(env_.GetChildren(testing::TmpDir(), &files));
     bool found = false;
     for (const string& f : files) {
-      if (StringPiece(f).contains(test_name)) {
+      if (str_util::StrContains(f, test_name)) {
         if (found) {
           return errors::Unknown("Found more than one file for ", test_name);
         }
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index cf67c27b70f1a8c761b71074d3eb5cd962a68488..742be7baf0bab48affc33bcede5883486e6f17ca 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,6 +11,7 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "py_test",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
@@ -47,9 +48,11 @@ tf_cuda_cc_test(
 
 tf_custom_op_library(
     name = "python/ops/_trt_engine_op.so",
-    srcs = ["ops/trt_engine_op.cc"],
+    srcs = [
+        "ops/trt_calib_op.cc",
+        "ops/trt_engine_op.cc",
+    ],
     deps = [
-        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -71,25 +74,34 @@ tf_cuda_library(
 
 cc_library(
     name = "trt_engine_op_kernel",
-    srcs = ["kernels/trt_engine_op.cc"],
-    hdrs = ["kernels/trt_engine_op.h"],
+    srcs = [
+        "kernels/trt_calib_op.cc",
+        "kernels/trt_engine_op.cc",
+    ],
+    hdrs = [
+        "kernels/trt_calib_op.h",
+        "kernels/trt_engine_op.h",
+    ],
     copts = tf_copts(),
+    visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
+        ":trt_resources",
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:stream_executor_headers_lib",
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
     ]) + tf_custom_op_library_additional_deps(),
-    alwayslink = 1,
+    # TODO(laigd)
+    alwayslink = 1,  # buildozer: disable=alwayslink-with-hdrs
 )
 
 tf_gen_op_libs(
-    op_lib_names = ["trt_engine_op"],
-    deps = if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
+    op_lib_names = [
+        "trt_engine_op",
+        "trt_calib_op",
+    ],
 )
 
 tf_cuda_library(
@@ -106,7 +118,9 @@ tf_cuda_library(
 
 tf_gen_op_wrapper_py(
     name = "trt_engine_op",
+    gen_locally = True,
     deps = [
+        ":trt_calib_op_op_lib",
         ":trt_engine_op_op_lib",
         ":trt_logging",
         ":trt_shape_function",
@@ -121,8 +135,15 @@ tf_custom_op_py_library(
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
     ]),
+    kernels = [
+        ":trt_engine_op_kernel",
+        ":trt_engine_op_op_lib",
+        ":trt_calib_op_op_lib",
+        ":trt_shape_function",
+    ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:resources",
     ],
@@ -138,6 +159,7 @@ py_library(
     deps = [
         ":trt_convert_py",
         ":trt_ops_py",
+        "//tensorflow/python:errors",
     ],
 )
 
@@ -156,6 +178,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":wrap_conversion",
+        "//tensorflow/python:tf_optimizer",
     ],
 )
 
@@ -165,11 +188,33 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
 )
 
+tf_cuda_library(
+    name = "trt_resources",
+    srcs = [
+        "resources/trt_int8_calibrator.cc",
+        "resources/trt_resource_manager.cc",
+    ],
+    hdrs = [
+        "resources/trt_int8_calibrator.h",
+        "resources/trt_resource_manager.h",
+        "resources/trt_resources.h",
+    ],
+    deps = [
+        ":trt_logging",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
 # Library for the node-level conversion portion of TensorRT operation creation
 tf_cuda_library(
     name = "trt_conversion",
@@ -184,6 +229,7 @@ tf_cuda_library(
     deps = [
         ":segment",
         ":trt_logging",
+        ":trt_resources",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core:framework",
@@ -233,14 +279,18 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+py_test(
+    name = "tf_trt_integration_test",
+    srcs = ["test/tf_trt_integration_test.py"],
+    main = "test/tf_trt_integration_test.py",
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":init_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
 )
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index dfcce0fd00eedf3341850bbc23927dc3b2e2d2aa..687dee07e1327d50fabc4e14c25a357ae6c959e7 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,40 +1,29 @@
-Using TensorRT in TensorFlow
-============================
+# Using TensorRT in TensorFlow
 
-This module provides necessary bindings and introduces TRT_engine_op
-operator that wraps a subgraph in TensorRT.
+This module provides necessary bindings and introduces TRT_engine_op operator
+that wraps a subgraph in TensorRT. This is still a work in progress but should
+be useable with most common graphs.
 
-Compilation
------------
+## Compilation
 
-In order to compile the module, you need to have a local TensorRT
-installation (libnvinfer.so and respective include files). During the
-configuration step, TensorRT should be enabled and installation path
-should be set. If installed through package managers (deb,rpm),
-configure script should find the necessary components from the system
-automatically. If installed from tar packages, user has to set path to
-location where the library is installed during configuration.
+In order to compile the module, you need to have a local TensorRT installation
+(libnvinfer.so and respective include files). During the configuration step,
+TensorRT should be enabled and installation path should be set. If installed
+through package managers (deb,rpm), configure script should find the necessary
+components from the system automatically. If installed from tar packages, user
+has to set path to location where the library is installed during configuration.
 
-
-```
+```shell
 bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
-After the installation of tensorflow package, TensorRT transformation
-will be available. An example use is shown below.
+After the installation of tensorflow package, TensorRT transformation will be
+available. An example use can be found in test/test_tftrt.py script
 
-```python
-import tensorflow as tf
-import tensorflow.contrib.tensorrt as trt
-#... create and train or load model
-gdef = sess.graph.as_graph_def()
-trt_gdef = trt.create_inference_graph(
-    gdef, #original graph_def
-    ["output"], #name of output node(s)
-    max_batch_size, #maximum batch size to run the inference
-    max_workspace_size_bytes) # max memory for TensorRT to use
-tf.reset_default_graph()
-tf.import_graph_def(graph_def=trt_gdef)
-#...... run inference
-```
+## Installing TensorRT 3.0.4
+
+In order to make use of TensorRT integration, you will need a local installation
+of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow Installation page](https://www.tensorflow.org/install/install_linux#nvidia_requirements_to_run_tensorflow_with_gpu_support).
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
index fd551d70b4385b14b84b7b98a6d16b0c03733d38..140ad4828208ae4844a49bf664955b50cd9e51cd 100644
--- a/tensorflow/contrib/tensorrt/__init__.py
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -18,6 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import
-from tensorflow.contrib.tensorrt.python import *
-# pylint: enable=unused-import,wildcard-import
+from tensorflow.python.framework import errors
+
+# pylint: disable=unused-import,wildcard-import,g-import-not-at-top
+try:
+  from tensorflow.contrib.tensorrt.python import *
+except errors.NotFoundError as e:
+  no_trt_message = (
+      '**** Failed to initialize TensorRT. This is either because the TensorRT'
+      ' installation path is not in LD_LIBRARY_PATH, or because you do not have'
+      ' it installed. If not installed, please go to'
+      ' https://developer.nvidia.com/tensorrt to download and install'
+      ' TensorRT ****')
+  print(no_trt_message)
+  raise e
+# pylint: enable=unused-import,wildcard-import,g-import-not-at-top
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 899448004f917b36b35fb871a66a9d857736a338..07740277115fe4956f21e8db1dabfb10990a2095 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 
+#include <list>
 #include <map>
 #include <set>
 #include <unordered_map>
@@ -37,7 +38,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/protobuf/device_properties.pb.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"  // NOLINT
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -48,16 +49,33 @@ namespace tensorrt {
 namespace convert {
 namespace {
 
-static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+bool IsTensorRTCandidate(const tensorflow::Node* node) {
   // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
   static const std::set<string> candidate_ops = {
-      "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
-      "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
+      "Identity",
+      "Snapshot",
+      "Const",
+      "Conv2D",
+      "MaxPool",
+      "BiasAdd",
+      "Relu",
+      "Add",
+      "Mul",
+      "Sub",
+      "Rsqrt",
+      "Pad",
+      "Mean",
+      "AvgPool",
+      "ConcatV2",
+      "DepthwiseConv2dNative",
+      "FusedBatchNorm",
+      "FusedBatchNormV2",
+      // TODO(ben,jie): ...
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
-  return candidate_ops.count(node_def.op());
+  return candidate_ops.count(node->type_string());
 }
 
 void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
@@ -67,8 +85,10 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
     const tensorflow::Node* node = graph.FindNodeId(node_id);
     for (const tensorflow::Edge* edge : node->in_edges()) {
       if (!subgraph_node_ids.count(edge->src()->id()) &&
-          !edge->src()->IsSource()) {
+          !edge->src()->IsSource() && !edge->IsControlEdge()) {
         incoming_edges->insert(edge);
+      } else {
+        VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
       }
     }
   }
@@ -81,27 +101,32 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     const tensorflow::Node* node = graph.FindNodeId(node_id);
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
-          !edge->dst()->IsSink()) {
+          !edge->dst()->IsSink() && !edge->IsControlEdge()) {
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
         outgoing_edges->insert(edge);
+      } else {
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
       }
     }
   }
 }
 
-std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
+std::pair<string, int> ParseTensorName(const string& name,
+                                       int default_idx = 0) {
+  string name_no_idx = name;
   int idx = default_idx;
-  size_t sep = name.find_last_of(':');
+  const size_t sep = name_no_idx.find_last_of(':');
   if (sep != string::npos) {
-    name = name.substr(0, sep);
+    name_no_idx = name_no_idx.substr(0, sep);
     idx = std::stoi(name.substr(sep + 1));
   }
-  return std::make_pair(name, idx);
+  return std::make_pair(name_no_idx, idx);
 }
 
 std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
     const std::vector<string>& tensor_names) {
   std::unordered_map<string, std::vector<int>> result;
-  for (string const& tensor_name : tensor_names) {
+  for (const string& tensor_name : tensor_names) {
     string node_name;
     int index;
     std::tie(node_name, index) = ParseTensorName(tensor_name);
@@ -110,73 +135,150 @@ std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
   return result;
 }
 
-tensorflow::Status ConvertSubGraphToTensorRT(
-    const std::vector<string>& output_names,
-    const std::set<int>& subgraph_node_ids,
-    size_t max_batch_size,  // Max batch size that engine will be created for
-    // Max amount of memory that engine will be allowed to consume, in bytes
-    size_t max_workspace_size_bytes,
-    const tensorflow::grappler::GraphProperties& graph_properties,
-    tensorflow::Graph* graph) {
-  tensorflow::EdgeSet subgraph_incoming_edges;
-  GetSubGraphIncomingEdges(*graph, subgraph_node_ids, &subgraph_incoming_edges);
-
+// TODO(sami): convert references to pointers
+struct ConvertGraphParams {
+  ConvertGraphParams(
+      tensorflow::Graph& inp_graph,
+      const std::vector<string>& output_node_names,
+      const std::set<int>& subgraph_node_id_numbers,
+      size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
+      const tensorflow::grappler::GraphProperties& current_graph_properties,
+      std::unordered_map<string, std::pair<int, string>>* output_edges,
+      int engine_precision_mode)
+      : graph(inp_graph),
+        output_names(output_node_names),
+        subgraph_node_ids(subgraph_node_id_numbers),
+        max_batch_size(max_supported_batch_size),
+        max_workspace_size_bytes(max_consumed_workspace_size_bytes),
+        graph_properties(current_graph_properties),
+        output_edge_map(output_edges),
+        precision_mode(engine_precision_mode) {}
+  tensorflow::Graph& graph;
+  const std::vector<string>& output_names;
+  const std::set<int>& subgraph_node_ids;
+  size_t max_batch_size;
+  size_t max_workspace_size_bytes;
+  const tensorflow::grappler::GraphProperties& graph_properties;
+  std::unordered_map<string, std::pair<int, string>>* output_edge_map;
+  int precision_mode;
   std::vector<std::pair<int, int>> subgraph_inputs;
+  std::vector<std::pair<int, int>> subgraph_outputs;
+  tensorflow::EdgeSet subgraph_incoming_edges;
+  tensorflow::EdgeSet subgraph_outgoing_edges;
+};
 
-  // Collect inputs by looking for incoming edges
-  for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
-    subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
+static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
+  GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
+                           &p->subgraph_incoming_edges);
+  for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
+    p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
   }
+  auto output_name_to_index_map = BuildTensorNameMap(p->output_names);
   std::set<std::pair<int, int>> subgraph_outputs_set;
   // Collect outputs referenced from output_names
-  auto output_name_to_index_map = BuildTensorNameMap(output_names);
-  for (int node_id : subgraph_node_ids) {
-    tensorflow::Node* node = graph->FindNodeId(node_id);
+  for (int node_id : p->subgraph_node_ids) {
+    tensorflow::Node* node = p->graph.FindNodeId(node_id);
     if (output_name_to_index_map.count(node->name())) {
       for (int index : output_name_to_index_map.at(node->name())) {
         subgraph_outputs_set.insert({node_id, index});
       }
     }
   }
-  // Collect outputs referenced from outgoing edges
-  tensorflow::EdgeSet subgraph_outgoing_edges;
-  GetSubGraphOutgoingEdges(*graph, subgraph_node_ids, &subgraph_outgoing_edges);
-  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
+  GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
+                           &p->subgraph_outgoing_edges);
+  for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
     subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
-  // Impose an ordering on the outputs
-  std::vector<std::pair<int, int>> subgraph_outputs(
-      subgraph_outputs_set.begin(), subgraph_outputs_set.end());
-  // Build TensorRT node and add it to the graph
+  p->subgraph_outputs.reserve(subgraph_outputs_set.size());
+  p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
+                             subgraph_outputs_set.begin(),
+                             subgraph_outputs_set.end());
+  return tensorflow::Status::OK();
+};
+
+tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
+  TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
   tensorflow::NodeDef trt_node_def;
-  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
-      *graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
-      max_batch_size, max_workspace_size_bytes, graph_properties,
-      &trt_node_def));
+  SubGraphParams s(params->graph, params->subgraph_node_ids,
+                   params->subgraph_inputs, params->subgraph_outputs,
+                   params->max_batch_size, params->max_workspace_size_bytes,
+                   params->graph_properties, params->output_edge_map,
+                   &trt_node_def, params->precision_mode);
+  TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
   tensorflow::Status status;
-  tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
+  tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
+
+  TF_RETURN_IF_ERROR(status);
+
+  for (auto in_edge :
+       params->subgraph_incoming_edges) {  // loop over incoming edges and
+                                           // attach them to calib node
+    // tensorflow::Node* src_node = in_edge->src();
+    auto src_output = in_edge->src_output();
+    auto dst_node = in_edge->dst();
+    auto dst_input = in_edge->dst_input();
+    VLOG(1) << " update edge " << trt_node->name() << ":" << src_output
+            << " -> " << dst_node->name() << ":" << dst_input;
+    TF_RETURN_IF_ERROR(
+        params->graph.UpdateEdge(trt_node, src_output, dst_node, dst_input));
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
+  TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
+  tensorflow::NodeDef trt_node_def;
+
+  SubGraphParams s(params->graph, params->subgraph_node_ids,
+                   params->subgraph_inputs, params->subgraph_outputs,
+                   params->max_batch_size, params->max_workspace_size_bytes,
+                   params->graph_properties, params->output_edge_map,
+                   &trt_node_def, params->precision_mode);
+  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
+  tensorflow::Status status;
+  tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
+
+  // AddNode does not wire edges.
+  // Re-map incoming edges to use the new TRT node instead of the orig subgraph
+  std::map<std::pair<int, int>, int> subgraph_edge_to_input_map;
+  for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
+    subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
+  }
+  for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
+    std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
+    int new_src_output = subgraph_edge_to_input_map.at(old_src);
+    params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
+                          new_src_output);
+    params->graph.RemoveEdge(edge);
+  }
+
+  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
+  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
+    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
+  }
+
   TF_RETURN_IF_ERROR(status);
 
   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
   std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
-  for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
-    subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
+  for (size_t i = 0; i < params->subgraph_outputs.size(); ++i) {
+    subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i});
   }
   TF_RETURN_IF_ERROR(status);
-  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
+  for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
-    TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
-                                         edge->dst_input()));
+    TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
+        trt_node, new_src_output, edge->dst(), edge->dst_input()));
   }
   // Remove the original subgraph
-  for (int node_id : subgraph_node_ids) {
-    tensorflow::Node* node = graph->FindNodeId(node_id);
+  for (int node_id : params->subgraph_node_ids) {
+    tensorflow::Node* node = params->graph.FindNodeId(node_id);
     // Don't remove the input placeholders
     if (node->type_string() == "Placeholder") {
       continue;
     }
-    graph->RemoveNode(node);
+    params->graph.RemoveNode(node);
   }
   return tensorflow::Status::OK();
 }
@@ -194,12 +296,39 @@ tensorflow::Status BuildNodeMap(
 }
 
 }  // namespace
+tensorflow::Status ConvertCalibGraphToInferGraph(
+    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph) {
+  VLOG(0) << "Starting Calib Conversion";
+  tensorflow::Graph graph(tensorflow::OpRegistry::Global());
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
+      tensorflow::GraphConstructorOptions(), graph_def, &graph));
+  //  get calib nodes
+  std::vector<tensorflow::Node*> calib_nodes;
+  for (auto node : graph.op_nodes()) {
+    if (node->type_string() == "TRTCalibOp") {
+      VLOG(1) << "Found Calib Node";
+      calib_nodes.push_back(node);
+    }
+  }
+  VLOG(0) << "Num Calib nodes in graph= " << calib_nodes.size();
+  if (calib_nodes.size() == 0)
+    return tensorflow::errors::FailedPrecondition(
+        "Graph doesn't contain any calibration nodes!."
+        " Please generate calibration graph and run calibration first");
+  for (auto n : calib_nodes) {
+    TF_RETURN_IF_ERROR(
+        tensorrt::convert::ConvertCalibrationNodeToEngineNode(graph, n));
+  }
+  graph.ToGraphDef(infer_graph);
+  return tensorflow::Status::OK();
+}
 
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
-  // Optimization pass
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
+    int precision_mode = FP32MODE, int minimum_segment_size = 3) {
+  // optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
   tensorflow::GraphDef gdef;
@@ -209,16 +338,23 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   tensorflow::grappler::LayoutOptimizer optimizer;
   tensorflow::grappler::Cluster* cluster;
 
-  // Virtual cluster
+  // virtual cluster
   tensorflow::DeviceProperties device_properties;
+
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
   cluster =
       new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
+  // single machine
+  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
+  VLOG(2) << "cpu_cores: " << num_cpu_cores;
+  VLOG(2) << "gpus: " << num_gpus;
+
   TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
 
-  // Constant folding
+  // constant folding
   item.graph = gdef;
   tensorflow::grappler::ConstantFolding fold(nullptr);
   TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
@@ -226,7 +362,6 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
   TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
-
   // Build full graph
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
                                              gdef.library());
@@ -243,7 +378,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   }
 
   // TODO(sami): this should be passed as a knob!!!!
-  segment_options.minimum_segment_size = 2;
+  segment_options.minimum_segment_size = minimum_segment_size;
   tensorflow::tensorrt::segment::SegmentNodesVector segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
       gdef, IsTensorRTCandidate, segment_options, &segments));
@@ -252,14 +387,44 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   }
   std::unordered_map<string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
+  std::unordered_map<string, std::pair<int, string>> output_edge_map;
+  int count = 0;
+  float total_num_nodes_in_segments = 0.;
+  for (auto s : segments) {
+    total_num_nodes_in_segments += s.size();
+  }
   for (const std::set<string>& subgraph_node_names : segments) {
     std::set<int> subgraph_node_ids;
+    size_t max_mem_per_engine =
+        max_workspace_size_bytes *
+        ((float)subgraph_node_names.size() / total_num_nodes_in_segments);
+    std::stringstream oss;
     for (const string& node_name : subgraph_node_names) {
+      oss << " " << node_name;
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
-    TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
-        output_names, subgraph_node_ids, max_batch_size,
-        max_workspace_size_bytes, static_graph_properties, &graph));
+    VLOG(2) << "Subgraph nodes" << oss.str();
+    ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size,
+                         max_mem_per_engine, static_graph_properties,
+                         &output_edge_map, precision_mode);
+    if (precision_mode == INT8MODE) {
+      tensorflow::Status status = GetCalibNode(&p);
+      if (status != tensorflow::Status::OK()) {
+        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
+      }
+    } else {
+      tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
+      if (status != tensorflow::Status::OK()) {
+        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
+      }
+    }
+    count++;
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 154ad3f2e8fb0ae702448097fbdece510df30223..e01e4a5328061ad527b2dac6e2e4ef1559bd914d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -28,14 +28,20 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
+// This method converts an already generated calibration graph which was used in
+// calibration runs to an inference graph
+tensorflow::Status ConvertCalibGraphToInferGraph(
+    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def);
+
 // max_batch_size: maximum batch size which can be used for inference for
 //                 optimization targets inference run with max batch size.
-// max_workspace_size_bytes: The upper bound of memory allowence for
+// max_workspace_size_bytes: The upper bound of memory allowance for
 //                 engine building.
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
+    int precision_mode, int minimum_segment_size);
 
 }  // namespace convert
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 9ee717dd7fb1eff4a11fb104cf5806ec8ab853d2..b81ae9dc3eeed6f7b7c6eeac0186700bdd692245 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -24,6 +24,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
 #include "tensorflow/core/framework/types.h"
@@ -32,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tensor_coding.h"
@@ -39,7 +44,6 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorrt/include/NvInfer.h"
 
 //  Check if the types are equal. Cast to int first so that failure log message
@@ -49,7 +53,8 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
 namespace {
 
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
@@ -65,7 +70,8 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
       *trt_dtype = nvinfer1::DataType::kHALF;
       break;
     default:
-      return tensorflow::errors::InvalidArgument("Unsupported data type");
+      return tensorflow::errors::InvalidArgument(
+          "Unsupported data type " + tensorflow::DataTypeString(tf_dtype));
   }
   return tensorflow::Status::OK();
 }
@@ -112,6 +118,18 @@ static std::vector<std::pair<int, int>> CreateSamePadding(
   return padding;
 }
 
+string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
+  size_t last_scope_separator = 0;
+  for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) {
+    if (op_name_a[i] != op_name_b[i]) {
+      break;
+    } else if (op_name_a[i] == '/') {
+      last_scope_separator = i + 1;
+    }
+  }
+  return op_name_a.substr(0, last_scope_separator);
+}
+
 class TRT_ShapedWeights {
  public:
   TRT_ShapedWeights(tensorflow::DataType type, const void* values,
@@ -244,6 +262,11 @@ std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
   return std::vector<int>(attr.begin(), attr.end());
 }
 
+template <>
+std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
+  auto attr = this->at(key)->list().s();
+  return std::vector<string>(attr.begin(), attr.end());
+}
 template <>
 nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
   auto values = this->get<std::vector<int>>(key);
@@ -266,6 +289,17 @@ tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
   return this->at(key)->type();
 }
 
+template <>
+float TFAttrs::get<float>(string key) const {
+  return this->at(key)->f();
+}
+
+template <>
+bool TFAttrs::get<bool>(string key) const {
+  return this->at(key)->b();
+}
+
+// TODO(jie): reorder4 & reorder2 should be merged?
 template <typename T>
 void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
               nvinfer1::DimsNCHW istrides, T* odata,
@@ -283,29 +317,86 @@ void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
   }
 }
 
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+              T* odata, nvinfer1::DimsHW ostrides) {
+  for (int h = 0; h < shape.h(); ++h) {
+    for (int w = 0; w < shape.w(); ++w) {
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * ostrides.h() + w * ostrides.w()];
+    }
+  }
+}
+
+// TODO(jie): fallback to tensorflow!!
+void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
+                   TRT_ShapedWeights* oweights) {
+  int c = iweights.shape_.d[0];
+  int k = iweights.shape_.d[1];
+  oweights->shape_.d[0] = k;
+  oweights->shape_.d[1] = c;
+  nvinfer1::DimsHW istrides = {1, k};
+  nvinfer1::DimsHW ostrides = {c, 1};
+  switch (iweights.type_) {
+    case tensorflow::DataType::DT_FLOAT: {
+      Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()),
+               istrides,
+               static_cast<float*>(const_cast<void*>(oweights->GetValues())),
+               ostrides);
+      break;
+    }
+    case tensorflow::DataType::DT_HALF: {
+      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+               istrides, static_cast<Eigen::half*>(
+                             const_cast<void*>(oweights->GetValues())),
+               ostrides);
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported type in reorder expected fp32 or fp16 but got "
+                 << DataTypeString(iweights.type_);
+  }
+}
+
 void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
-                       TRT_ShapedWeights* oweights) {
+                       TRT_ShapedWeights* oweights, int num_groups) {
   CHECK_EQ(iweights.type_, oweights->type_);
   CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
   int r = iweights.shape_.d[0];
   int s = iweights.shape_.d[1];
-  int c = iweights.shape_.d[2];
-  int k = iweights.shape_.d[3];
-  oweights->shape_.d[0] = k;
-  oweights->shape_.d[1] = c;
+  // TRT requires GKcRS, while TF depthwise has RSCK
+  //   where c=1, C=G
+  VLOG(2) << "num_groups: " << num_groups;
+  int c = iweights.shape_.d[2] / num_groups;
+  VLOG(2) << "c" << iweights.shape_.d[2] << " then " << c;
+  int k = iweights.shape_.d[3] * num_groups;
+  VLOG(2) << "k" << iweights.shape_.d[3] << " then " << k;
+  oweights->shape_.d[0] = k / num_groups;
+  oweights->shape_.d[1] = c * num_groups;
   oweights->shape_.d[2] = r;
   oweights->shape_.d[3] = s;
   nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k};
   nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT:
+    case tensorflow::DataType::DT_FLOAT: {
       Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
                istrides,
                static_cast<float*>(const_cast<void*>(oweights->GetValues())),
                ostrides);
       break;
+    }
+    case tensorflow::DataType::DT_HALF: {
+      Reorder4(
+          {k, c, r, s}, static_cast<Eigen::half const*>(iweights.GetValues()),
+          istrides,
+          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
+          ostrides);
+      break;
+    }
+
     default:
-      LOG(FATAL) << "!!!!!!!!!!!!!!!!!!!!!!!!broke!!!!!!!!!!!!";
+      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
+                 << DataTypeString(iweights.type_);
   }
 }
 
@@ -323,12 +414,11 @@ inline std::shared_ptr<T> infer_object(T* obj) {
   return std::shared_ptr<T>(obj, InferDeleter());
 }
 
-// Logger for GIE info/warning/errors
 class Converter;
 
 using OpConverter =
     std::function<tensorflow::Status(Converter&, const tensorflow::NodeDef&,
-                                     std::vector<TRT_TensorOrWeights> const&,
+                                     const std::vector<TRT_TensorOrWeights>&,
                                      std::vector<TRT_TensorOrWeights>*)>;
 
 class Converter {
@@ -336,40 +426,69 @@ class Converter {
   std::unordered_map<string, OpConverter> op_registry_;
   nvinfer1::INetworkDefinition* trt_network_;
   std::list<std::vector<uint8_t>> temp_bufs_;
-
+  tensorflow::tensorrt::TRTWeightStore* weight_store_;
+  bool fp16_;
   void register_op_converters();
-
-  std::vector<TRT_TensorOrWeights> get_inputs(
-      const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs;
-    for (const auto& input_name : node_def.input()) {
-      VLOG(2) << "Retrieve input: " << input_name;
-      inputs.push_back(trt_tensors_.at(input_name));
+  tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights>* inputs) {
+    for (auto const& input_name : node_def.input()) {
+      /*************************************************************************
+       * TODO(jie) handle case 1) here
+       * Normalizes the inputs and extracts associated metadata:
+       * 1) Inputs can contain a colon followed by a suffix of characters.
+       *    That suffix may be a single number (e.g. inputName:1) or several
+       *    word characters separated from a number by a colon
+       *    (e.g. inputName:foo:1). The
+       *    latter case is used to denote inputs and outputs of functions.
+       * 2) Control dependency inputs contain caret at the beginning and we
+       *    remove this and annotate the edge as a control dependency.
+       ************************************************************************/
+      // skip control nodes
+      if (input_name[0] == '^') continue;
+      string name = input_name;
+      auto first = name.find_first_of(':');
+      if (first != string::npos && first + 2 == name.size() &&
+          name[first + 1] == '0')
+        name.erase(first);
+
+      VLOG(2) << "retrieve input: " << name;
+      if (trt_tensors_.count(name)) {
+        inputs->push_back(trt_tensors_.at(name));
+      } else {
+        string str("Node ");
+        StrAppend(&str, node_def.name(), " should have an input named '", name,
+                  "' but it is not available");
+        LOG(WARNING) << "input: " << name << " not available for node at "
+                     << node_def.name();
+        return tensorflow::errors::InvalidArgument(str);
+      }
     }
-    return inputs;
+    return tensorflow::Status::OK();
   }
 
  public:
-  explicit Converter(nvinfer1::INetworkDefinition* trt_network)
-      : trt_network_(trt_network) {
+  explicit Converter(nvinfer1::INetworkDefinition* trt_network,
+                     tensorflow::tensorrt::TRTWeightStore* ws, bool fp16)
+      : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) {
     this->register_op_converters();
   }
-
+  tensorflow::tensorrt::TRTWeightStore* weight_store() { return weight_store_; }
   TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
                                      nvinfer1::Dims shape) {
     TRT_ShapedWeights weights(type, nullptr, shape);
     // TODO(jie): check weights size_bytes. 0 means type error
-    temp_bufs_.push_back(std::vector<uint8_t>(weights.size_bytes()));
-    weights.SetValues(temp_bufs_.back().data());
+    weight_store_->store_.push_back(std::vector<uint8_t>(weights.size_bytes()));
+    weights.SetValues(weight_store_->store_.back().data());
     return weights;
   }
-
+  bool isFP16() { return fp16_; };
   TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
     return this->get_temp_weights(weights.type_, weights.shape_);
   }
 
   tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs = this->get_inputs(node_def);
+    std::vector<TRT_TensorOrWeights> inputs;
+    TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
     string op = node_def.op();
     if (!op_registry_.count(op)) {
       return tensorflow::errors::Unimplemented(
@@ -382,7 +501,7 @@ class Converter {
       TRT_TensorOrWeights output = outputs.at(i);
       // TODO(jie): tf protobuf seems to be omitting the :0 suffix
       string output_name = node_def.name();
-      if (i != 0) output_name = output_name + ":" + std::to_string(i);
+      if (i != 0) output_name = StrCat(output_name, ":", i);
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
@@ -434,6 +553,19 @@ class Converter {
   }
 };
 
+TRT_ShapedWeights ConvertFP32ToFP16(Converter& ctx,
+                                    const TRT_ShapedWeights& weights_src) {
+  auto dtype_new = tensorflow::DataType::DT_HALF;
+  TRT_ShapedWeights weights =
+      ctx.get_temp_weights(dtype_new, weights_src.shape_);
+  const float* src = static_cast<const float*>(weights_src.GetValues());
+  Eigen::half* dst = const_cast<Eigen::half*>(
+      static_cast<Eigen::half const*>(weights.GetValues()));
+  for (int64_t i = 0; i < weights_src.count(); i++) {
+    dst[i] = Eigen::half_impl::float_to_half_rtne(src[i]);
+  }
+  return weights;
+}
 // ****************************************************************************
 // Constant folding functions
 // TODO(jie): once optimizer kicks in, we should have done constant folding
@@ -448,7 +580,7 @@ struct LambdaFactory {
     switch (op) {
       case OP_CATEGORY::RSQRT: {
         VLOG(2) << "RSQRT GETS DONE";
-        return [](T t) -> T { return 1.0 / std::sqrt(t); };
+        return [](T t) -> T { return 1.0 / sqrt(t); };
       }
       case OP_CATEGORY::NEG:
         return [](T t) -> T { return -t; };
@@ -534,6 +666,22 @@ struct LambdaFactory {
   }
 };
 
+template <>
+std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
+  switch (op) {
+    case OP_CATEGORY::RSQRT: {
+      VLOG(2) << "RSQRT GETS DONE";
+      return [](Eigen::half t) -> Eigen::half {
+        return Eigen::half(1.0 / sqrt(float(t)));
+      };
+    }
+    case OP_CATEGORY::NEG:
+      return [](Eigen::half t) -> Eigen::half { return -t; };
+    default:
+      VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+      return nullptr;
+  }
+}
 tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
                                 TRT_ShapedWeights* oweights,
                                 LambdaFactory unary_op) {
@@ -545,6 +693,14 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
       std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
       break;
     }
+    case tensorflow::DataType::DT_HALF: {
+      auto inp = static_cast<Eigen::half const*>(iweights.GetValues());
+      auto oup =
+          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
+      std::transform(inp, inp + iweights.count(), oup,
+                     unary_op.unary<Eigen::half>());
+      break;
+    }
     default:
       return tensorflow::errors::Unimplemented(
           "Data type not supported: " +
@@ -588,6 +744,32 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
       }
       break;
     }
+    case tensorflow::DataType::DT_HALF: {
+      auto inp_l = static_cast<const Eigen::half*>(iweights_l.GetValues());
+      auto inp_r = static_cast<const Eigen::half*>(iweights_r.GetValues());
+      auto oup =
+          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
+
+      if (iweights_l.count() != iweights_r.count()) {
+        // We only supports broadcast of RankZero
+        if (iweights_l.count() == 1) {
+          VLOG(2) << "I bet it is not working!" << (*inp_l);
+          std::transform(inp_r, inp_r + iweights_r.count(), oup,
+                         binary_op.broadcast_l<Eigen::half>(*inp_l));
+        } else if (iweights_r.count() == 1) {
+          VLOG(2) << "I bet it is not working!" << (*inp_r);
+          std::transform(inp_l, inp_l + iweights_l.count(), oup,
+                         binary_op.broadcast_r<Eigen::half>(*inp_r));
+        } else {
+          return tensorflow::errors::Unimplemented(
+              "Binary op with non-rankZero broadcast not supported");
+        }
+      } else {
+        std::transform(inp_l, inp_l + iweights_l.count(), inp_r, oup,
+                       binary_op.binary<Eigen::half>());
+      }
+      break;
+    }
     default:
       return tensorflow::errors::Unimplemented(
           "Data type not supported: " +
@@ -599,7 +781,7 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
 
 tensorflow::Status ConstantFoldUnary(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    std::vector<TRT_TensorOrWeights> const& inputs,
+    const std::vector<TRT_TensorOrWeights>& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input = inputs.at(0).weights();
 
@@ -613,13 +795,12 @@ tensorflow::Status ConstantFoldUnary(
   CHECK_EQ(weights_input.type_,
            TFAttrs(node_def).get<tensorflow::DataType>("T"));
 
-  // Maybe I should do a switch
   LambdaFactory unary_op;
   if (node_def.op() == "Rsqrt") {
     // Compute rsqrt
     unary_op.op = LambdaFactory::OP_CATEGORY::RSQRT;
     auto ret = UnaryCompute(weights_input, &weights_output, unary_op);
-    // PAss the output
+    // Pass the output
     if (ret == tensorflow::Status::OK()) {
       outputs->push_back(TRT_TensorOrWeights(weights_output));
     }
@@ -631,11 +812,11 @@ tensorflow::Status ConstantFoldUnary(
 }
 
 // TODO(jie,ben) broadcast is needed yet not implemented
-// Let's get the simple stuff working first. Maybe we should fall bakc to TF
+// Let's get the simple stuff working first. Maybe we should fall back to TF
 //   approach for constant folding
 tensorflow::Status ConstantFoldBinary(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    std::vector<TRT_TensorOrWeights> const& inputs,
+    const std::vector<TRT_TensorOrWeights>& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input_l = inputs.at(0).weights();
   TRT_ShapedWeights weights_input_r = inputs.at(1).weights();
@@ -648,12 +829,12 @@ tensorflow::Status ConstantFoldBinary(
         "Binary op implicit broadcast not supported: " + node_def.op());
 
   // TODO(jie): constant fold should really fall back to TF.
-  int nb_dims = weights_input_l.shape_.nbDims;
+  int num_dims = weights_input_l.shape_.nbDims;
   nvinfer1::Dims output_shape;
-  output_shape.nbDims = nb_dims;
-  VLOG(2) << "nb_dims: " << nb_dims
+  output_shape.nbDims = num_dims;
+  VLOG(2) << "nb_dims: " << num_dims
           << ", the other: " << weights_input_r.shape_.nbDims;
-  for (int i = 0; i < nb_dims; i++) {
+  for (int i = 0; i < num_dims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
       output_shape.d[i] = weights_input_l.shape_.d[i];
     } else if (weights_input_l.shape_.d[i] == 1 ||
@@ -678,7 +859,6 @@ tensorflow::Status ConstantFoldBinary(
   // Allocate output weights
   TRT_ShapedWeights weights_output = ctx.get_temp_weights(dtype, output_shape);
 
-  // Maybe I should do a switch
   LambdaFactory binary_op;
   if (node_def.op() == "Sub") {
     binary_op.op = LambdaFactory::OP_CATEGORY::SUB;
@@ -712,48 +892,94 @@ tensorflow::Status BinaryTensorOpWeight(
   // Maybe this part has to be moved into the block of rsqrt later
 
   // Check type consistency
-  auto dtype = TFAttrs(node_def).get<nvinfer1::DataType>("T");
-  CHECK_EQ_TYPE(tensor->getType(), dtype);  // Cast to int for error messages
   nvinfer1::DataType ttype;
-  TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
-  CHECK_EQ_TYPE(ttype, dtype);  // Cast to int for error message
+  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype));
 
   // Check scale mode
   auto dims_w = weights.shape_;
   auto dims_t = tensor->getDimensions();
 
-  // Default to channel-wise
+  // default to element-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
+  // TODO(jie): maybe use a permutation instead to support more cases;
+  bool permutation_flag = false;
+
   if (weights.count() == 1) {
     VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // No broadcasting on Batch dimension;
-    assert(dims_w.d[0] == 1);
-
-    // Broadcasting on Channel dimension only allowed in kUNIFORM
-    assert(dims_w.d[1] == dims_t.d[0]);
-    assert(dims_w.nbDims == dims_t.nbDims);
-
-    // Default is element;
-    for (int i = 2; i < dims_w.nbDims; i++) {
-      if (dims_w.d[i] != dims_t.d[i - 1]) {
-        scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-        break;
+    // no broadcasting on Batch dimension;
+    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
+            << " tensor DIM: " << dims_t.nbDims;
+    if (dims_w.nbDims == dims_t.nbDims + 1) {
+      if (dims_w.d[0] == 1) {
+        for (int i = 1; i < dims_w.nbDims; i++) {
+          dims_w.d[i - 1] = dims_w.d[i];
+        }
+        dims_w.nbDims--;
+      } else {
+        return tensorflow::errors::InvalidArgument(
+            "Binary op cannot operate on batch, " + node_def.name());
       }
     }
-    if (scale_mode == nvinfer1::ScaleMode::kELEMENTWISE) {
+
+    if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
       scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      for (int i = 2; i < dims_w.nbDims; i++) {
-        if (dims_w.d[i] != 1)
-          return tensorflow::errors::InvalidArgument(
-              "Weight shape not compatible at, " + node_def.name());
+      // default is element;
+      for (int i = 1; i < dims_w.nbDims; i++) {
+        if (dims_w.d[i] != dims_t.d[i]) {
+          // if dimension does not match, switch back to channel;
+          VLOG(2) << "channel";
+          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+          break;
+        }
+      }
+      // if channel as candidate, validate it
+      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
+        for (int i = 1; i < dims_w.nbDims; i++) {
+          if (dims_w.d[i] != 1)
+            return tensorflow::errors::InvalidArgument(
+                "Weight shape not compatible at, " + node_def.name());
+        }
+      } else {
+        VLOG(2) << "elementwise";
+      }
+    } else if (dims_w.nbDims == 1 &&
+               dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
+      // channel wise and broadcast required;
+      permutation_flag = true;
+      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+    } else {
+      return tensorflow::errors::InvalidArgument(
+          "Weight shape not compatible at, " + node_def.name());
+    }
+  }
+
+  // transpose last dimension
+  std::vector<int> permutation(dims_t.nbDims + 1);
+  if (permutation_flag) {
+    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
+      // we swap the last dimension into channel for trt.
+      // because of tensorflow default broadcasting rules.
+      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
+        permutation[i] = i;
       }
+      permutation[1] = dims_t.nbDims;
+      permutation[dims_t.nbDims] = 1;
+      tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                   permutation);
+    } else {
+      return tensorflow::errors::InvalidArgument(
+          "Transpose cannot be applied, " + node_def.name());
     }
   }
 
-  // Prepare weights
+  if (ctx.isFP16()) {
+    weights = ConvertFP32ToFP16(ctx, weights);
+  }
+
+  // prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
@@ -779,88 +1005,24 @@ tensorflow::Status BinaryTensorOpWeight(
       scale_weights, power_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  // transpose back dimension
+  if (permutation_flag) {
+    output_tensor = ctx.TransposeTensor(output_tensor, permutation);
+  }
 
   // Pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status BinaryTensorOpTensor(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
-      {"Add", nvinfer1::ElementWiseOperation::kSUM},
-      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
-      // {"max", nvinfer1::ElementWiseOperation::kMAX},
-      // {"min", nvinfer1::ElementWiseOperation::kMIN},
-      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
-      {"Div", nvinfer1::ElementWiseOperation::kDIV},
-  };
-
-  // FIXME assume type matches input weights
-  // Get trt type & shape
-  TFAttrs attrs(node_def);
-  // Maybe this part has to be moved into the block of rsqrt later
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
-
-  // Check type consistency
-  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
-
-  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
-      *const_cast<nvinfer1::ITensor*>(tensor_l),
-      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  // Pass the output
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
+enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 
-tensorflow::Status ConvertPlaceholder(
+tensorflow::Status ConvertConv2DHelper(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    std::vector<TRT_TensorOrWeights> const& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  VLOG(2) << "Placeholder should have been replace already";
-  return tensorflow::errors::Unimplemented(", cannot convert Placeholder op");
-  // OK this make sense since we are supposed to replace it with input
-  TFAttrs attrs(node_def);
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
-  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
-
-  dims.nbDims--;
-  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
-
-  nvinfer1::ITensor* output =
-      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
-  if (!output) {
-    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
-  }
-  outputs->push_back(TRT_TensorOrWeights(output));
-  return tensorflow::Status::OK();
-}
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs, int group) {
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
-tensorflow::Status ConvertConv2D(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  // TODO(jie): handle NHWC/NCHW transpose;
-  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
-  ReorderRSCKToKCRS(weights_rsck, &weights);
-  TRT_ShapedWeights biases(weights.type_);
-  int noutput = weights.shape_.d[0];
-  nvinfer1::DimsHW kernel_size;
-  kernel_size.h() = weights.shape_.d[2];
-  kernel_size.w() = weights.shape_.d[3];
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -874,11 +1036,35 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     // TODO(jie): transpose it
   }
 
+  // tensor after transpose (NCHW)
+  auto tensor_dim = tensor->getDimensions();
+
+  int num_groups = group;
+  if (num_groups == 0)  // depthwise convolution
+    num_groups = tensor_dim.d[0];
+  VLOG(2) << "groups count: " << num_groups;
+
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  if (ctx.isFP16()) {
+    weights_rsck = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
+  }
+
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
+  ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
+  TRT_ShapedWeights biases(weights.type_);
+  int noutput = weights.shape_.d[0] * num_groups;
+  nvinfer1::DimsHW kernel_size;
+  kernel_size.h() = weights.shape_.d[2];
+  kernel_size.w() = weights.shape_.d[3];
+  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
+
   // TODO(jie): stride. (NHWC/NCHW)
   auto tf_stride = attrs.get<std::vector<int>>("strides");
+  VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
+  VLOG(2) << "stride!!!: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
+          << tf_stride[3];
   nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
-  auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
   // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
@@ -919,10 +1105,11 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
+  layer->setNbGroups(num_groups);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   auto dim_after = output_tensor->getDimensions();
-  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
+  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1] << ", "
           << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
@@ -935,11 +1122,101 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertConv2DHelper(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs, ConvolutionType type) {
+  switch (type) {
+    case ConvolutionType::DEFAULT:
+      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 1);
+    case ConvolutionType::DEPTHWISE_CONV:
+      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 0);
+  }
+  return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
+                                           node_def.name());
+}
+
+tensorflow::Status BinaryTensorOpTensor(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
+      {"Add", nvinfer1::ElementWiseOperation::kSUM},
+      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
+      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
+      {"Div", nvinfer1::ElementWiseOperation::kDIV},
+  };
+
+  // FIXME assume type matches input weights
+  // get trt type & shape
+  TFAttrs attrs(node_def);
+  // maybe this part has to be moved into the block of rsqrt later
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+
+  // check type consistency
+  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
+  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end())
+    return tensorflow::errors::Unimplemented(
+        "binary op: " + node_def.op() +
+        " not supported at: " + node_def.name());
+
+  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
+      *const_cast<nvinfer1::ITensor*>(tensor_l),
+      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // pass the output
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPlaceholder(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  VLOG(2) << "Placeholder should have been replace already";
+  return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
+  // OK this make sense since we are supposed to replace it with input
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
+  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
+
+  dims.nbDims--;
+  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
+
+  nvinfer1::ITensor* output =
+      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
+  if (!output) {
+    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
+  }
+  outputs->push_back(TRT_TensorOrWeights(output));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertConv2D(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
+                             ConvolutionType::DEFAULT);
+}
+
+tensorflow::Status ConvertConv2DDepthwise(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
+                             ConvolutionType::DEPTHWISE_CONV);
+}
+
 tensorflow::Status ConvertPool(Converter& ctx,
                                const tensorflow::NodeDef& node_def,
-                               std::vector<TRT_TensorOrWeights> const& inputs,
+                               const std::vector<TRT_TensorOrWeights>& inputs,
                                std::vector<TRT_TensorOrWeights>* outputs) {
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -957,6 +1234,8 @@ tensorflow::Status ConvertPool(Converter& ctx,
   // TODO(jie): support other pooling type
   if (node_def.op() == "MaxPool")
     type = nvinfer1::PoolingType::kMAX;
+  else if (node_def.op() == "AvgPool")
+    type = nvinfer1::PoolingType::kAVERAGE;
   else
     return tensorflow::errors::Unimplemented("Only supports Max pool");
 
@@ -1019,9 +1298,9 @@ tensorflow::Status ConvertPool(Converter& ctx,
 
 tensorflow::Status ConvertActivation(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    std::vector<TRT_TensorOrWeights> const& inputs,
+    const std::vector<TRT_TensorOrWeights>& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   nvinfer1::IActivationLayer* layer = ctx.network()->addActivation(
       *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ActivationType::kRELU);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
@@ -1031,17 +1310,20 @@ tensorflow::Status ConvertActivation(
 
 tensorflow::Status ConvertScale(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                std::vector<TRT_TensorOrWeights> const& inputs,
+                                const std::vector<TRT_TensorOrWeights>& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
     return tensorflow::errors::Unimplemented(
         "Only supports tensor op weight for now, at " + node_def.name());
   // Implement tensor binaryOp weight [channel wise] for now;
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
-  // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (ctx.isFP16()) {
+    weights = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
+  }
+
   TRT_ShapedWeights empty_weights(weights.type_);
 
   TFAttrs attrs(node_def);
@@ -1055,12 +1337,29 @@ tensorflow::Status ConvertScale(Converter& ctx,
   } else {
     VLOG(2) << "NCHW !!!!";
   }
-  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
-      weights, empty_weights, empty_weights);
 
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  if (data_format == "NHWC") {
+  auto dims = tensor->getDimensions();
+  VLOG(2) << "tensor dimensions: " << dims.nbDims;
+  for (int i = 0; i < dims.nbDims; i++) {
+    VLOG(2) << "i: " << dims.d[i];
+  }
+  dims = weights.shape_;
+  VLOG(2) << "tensor dimensions: " << dims.nbDims;
+  for (int i = 0; i < dims.nbDims; i++) {
+    VLOG(2) << "i: " << dims.d[i];
+  }
+
+  nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
+  if (weights.shape_.d[0] == 1) {
+    mode = nvinfer1::ScaleMode::kUNIFORM;
+  }
+
+  nvinfer1::IScaleLayer* layer =
+      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
+                              weights, empty_weights, empty_weights);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
     output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
@@ -1072,7 +1371,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
 
 tensorflow::Status ConvertConst(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                std::vector<TRT_TensorOrWeights> const& inputs,
+                                const std::vector<TRT_TensorOrWeights>& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   const auto& weights_tensor = node_def.attr().at("value").tensor();
 
@@ -1091,22 +1390,96 @@ tensorflow::Status ConvertConst(Converter& ctx,
     VLOG(2) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      VLOG(2) << "Dimensions: " << tensor.dims();
-      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
-                                  GetTensorShape(tensor));
+      VLOG(2) << "dimensions: " << tensor.dims();
+      VLOG(2) << "size: " << weights_tensor.float_val_size();
+      scalar_shape = GetTensorShape(tensor);
+      for (int i = 0; i < scalar_shape.nbDims; i++)
+        VLOG(2) << scalar_shape.d[i];
+      if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size()) {
+        if (weights_tensor.float_val_size() == 1 ||
+            scalar_shape.d[0] == weights_tensor.float_val_size()) {
+          scalar_shape.nbDims = 1;
+          // no dimension provided. flatten it
+          scalar_shape.d[0] = weights_tensor.float_val_size();
+          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+        } else {
+          LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
+                       << " kUNIFORM, at: " << node_def.name();
+          string err_str("Broadcast method is not supported for '");
+          StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
+          return tensorflow::errors::InvalidArgument(err_str);
+        }
+      }
     } else {
       VLOG(2) << "Dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
-      scalar_shape.d[0] = 1;
+      // no dimension provided. flatten it
+      scalar_shape.d[0] = weights_tensor.float_val_size();
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
       for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
         scalar_shape.d[i] = 0;
         scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
       }
-      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
-                                  scalar_shape);
     }
+    size_t len_data = tensorflow::DataTypeSize(dtype);
+    for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i];
+    ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
+    void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
+    std::vector<float> tensor_data(
+        weights_tensor.float_val().begin(),
+        weights_tensor.float_val()
+            .end());  //  make a local copy first to flatten
+    memcpy(dst, tensor_data.data(), len_data);  // store into weight store
+    weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
+  } else if (!weights_tensor.int_val().empty()) {
+    VLOG(2) << "int!!!" << node_def.name();
+    nvinfer1::Dims scalar_shape;
+    if (tensor.dims() > 0) {
+      VLOG(2) << "dimensions: " << tensor.dims();
+      scalar_shape = GetTensorShape(tensor);
+      if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size()) {
+        if (weights_tensor.int_val_size() == 1 ||
+            scalar_shape.d[0] == weights_tensor.int_val_size()) {
+          scalar_shape.nbDims = 1;
+          // no dimension provided. flatten it
+          scalar_shape.d[0] = weights_tensor.int_val_size();
+          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+        } else {
+          LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
+                       << " kUNIFORM, at: " << node_def.name();
+          string err_str("Broadcast method is not supported for '");
+          StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
+          return tensorflow::errors::InvalidArgument(err_str);
+        }
+      }
+    } else {
+      VLOG(2) << "dimensions: " << tensor.dims();
+      scalar_shape.nbDims = 1;
+      // no dimension provided. flatten it
+      scalar_shape.d[0] = weights_tensor.int_val_size();
+      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
+        scalar_shape.d[i] = 0;
+        scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
+      }
+    }
+    //  we should not have converted //if (ctx.isFP16()) {
+    size_t len_data = tensorflow::DataTypeSize(dtype);
+    for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i];
+    size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32);
+    len_data = std::max(len_data, len_tensor);
+    ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
+    void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
+    std::vector<int32> tensor_data(
+        weights_tensor.int_val().begin(),
+        weights_tensor.int_val().end());  //  make a local copy first to flatten
+                                          //  doesn't have to be contigous
+    memcpy(dst, tensor_data.data(), len_tensor);  // store into weight store
+    weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
   } else if (!weights_tensor.tensor_content().empty()) {
+    //  obsolete method.
+    //  After optimization path, we do not see weights in this format.
+    //  fp16 conversion technically should be needed here.
     VLOG(2) << "TENSOR!!!" << node_def.name();
     const auto& content = weights_tensor.tensor_content();
 
@@ -1130,7 +1503,7 @@ tensorflow::Status ConvertConst(Converter& ctx,
 
 tensorflow::Status ConvertIdentity(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    std::vector<TRT_TensorOrWeights> const& inputs,
+    const std::vector<TRT_TensorOrWeights>& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   outputs->push_back(inputs.at(0));
   return tensorflow::Status::OK();
@@ -1138,7 +1511,7 @@ tensorflow::Status ConvertIdentity(
 
 tensorflow::Status ConvertBinary(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
-                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2)
     return tensorflow::errors::FailedPrecondition(
@@ -1165,7 +1538,7 @@ tensorflow::Status ConvertBinary(Converter& ctx,
 
 tensorflow::Status ConvertUnary(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                std::vector<TRT_TensorOrWeights> const& inputs,
+                                const std::vector<TRT_TensorOrWeights>& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 1)
     return tensorflow::errors::FailedPrecondition(
@@ -1183,7 +1556,7 @@ tensorflow::Status ConvertUnary(Converter& ctx,
 
 tensorflow::Status ConvertReduce(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
-                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
@@ -1191,7 +1564,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
         "Input expects tensor and weights, at" + node_def.name());
 
   // Implement tensor binaryOp weight [channel wise] for now;
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
   int nb_dims = dims.nbDims + 1;
@@ -1229,6 +1602,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
       return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at" +
                                                  node_def.name());
     if (index_list_data[i] == 1) permuted_index = 1;
+
     idx_set.emplace(index_list_data[i]);
   }
 
@@ -1236,7 +1610,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   nvinfer1::DimsHW pool_kernel;
   if (permuted_index == 1) {
     for (int i = 2; i < nb_dims; i++) {
-      if (idx_set.count(i)) {
+      if (idx_set.count(i) == 0) {
         permuted_index = i;
         break;
       }
@@ -1271,12 +1645,13 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
   }
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConvertPad(Converter& ctx,
                               const tensorflow::NodeDef& node_def,
-                              std::vector<TRT_TensorOrWeights> const& inputs,
+                              const std::vector<TRT_TensorOrWeights>& inputs,
                               std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
@@ -1284,7 +1659,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
         "Input expects tensor and weights, at" + node_def.name());
 
   // Implement tensor binaryOp weight [channel wise] for now;
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
   int nb_dims = dims.nbDims + 1;
@@ -1371,19 +1746,318 @@ tensorflow::Status ConvertPad(Converter& ctx,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertConcat(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  // not including the last input (axis) here
+  int input_size = static_cast<int>(inputs.size()) - 1;
+
+  if (!inputs.at(0).is_tensor())
+    return tensorflow::errors::InvalidArgument(
+        "Concat in TRT support only Tensor input, at " + node_def.name());
+
+  // We are retrieving the axis
+  TRT_ShapedWeights axis = inputs.at(input_size).weights();
+
+  TFAttrs attrs(node_def);
+  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
+
+  // TODO(jie): handle data type
+  // Only expect to handle INT32 as index attributes for now
+  if (index_type != tensorflow::DataType::DT_INT32)
+    return tensorflow::errors::Unimplemented(
+        "Tidx supports only DT_INT32, at " + node_def.name());
+
+  int index = *(static_cast<int*>(const_cast<void*>(axis.GetValues())));
+
+  // TODO(jie): early termination with no-op (attr_size==1)
+
+  auto dim = inputs.at(0).tensor()->getDimensions();
+  // dimension check
+  if (index > dim.nbDims + 1)
+    return tensorflow::errors::InvalidArgument(
+        "Concatenate on axis out of dimension range, at " + node_def.name());
+
+  if (index == 0)
+    return tensorflow::errors::InvalidArgument(
+        "Concatenate on batch dimension not supported, at " + node_def.name());
+
+  // incase we need permutation;
+  std::vector<int> permutation_order(dim.nbDims + 1);
+
+  for (int i = 0; i < dim.nbDims + 1; i++) permutation_order[i] = i;
+
+  if (index != 1) {
+    permutation_order[1] = index - 1;
+    permutation_order[index - 1] = 1;
+  }
+
+  std::vector<nvinfer1::ITensor const*> inputs_vec;
+  // Shap chack (all input tensor should have same shape)
+  // starting from 0 since we are probably also doing transpose here;
+  for (int i = 0; i < input_size; i++) {
+    auto tensor_i = inputs.at(i).tensor();
+    auto dim_i = tensor_i->getDimensions();
+    if (dim_i.nbDims != dim.nbDims)
+      return tensorflow::errors::InvalidArgument(
+          "Concatenate receives inputs with inconsistent dimensions, at " +
+          node_def.name());
+
+    for (int j = 0; j < dim.nbDims; j++) {
+      // check dimension consistency on non-concatenate axis
+      if (j != index - 1 && dim_i.d[j] != dim.d[j])
+        return tensorflow::errors::InvalidArgument(
+            "Concatenate receives inputs with inconsistent shape, at" +
+            node_def.name());
+    }
+
+    // TRT does concatenation only on channel!
+    if (index != 1)
+      tensor_i = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor_i),
+                                     permutation_order);
+
+    inputs_vec.push_back(tensor_i);
+  }
+
+  // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  nvinfer1::IConcatenationLayer* layer = ctx.network()->addConcatenation(
+      const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
+      inputs_vec.size());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  if (index != 1) {
+    output_tensor = ctx.TransposeTensor(output_tensor, permutation_order);
+  }
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertFusedBatchNorm(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  TFAttrs attrs(node_def);
+  float epsilon = attrs.get<float>("epsilon");
+  auto data_format = attrs.get<string>("data_format");
+  if (data_format != "NCHW") {
+    return tensorflow::errors::Unimplemented(
+        "only data_format=NCHW is supported, at " + node_def.name());
+  }
+  bool is_training = attrs.get<bool>("is_training");
+  if (is_training) {
+    return tensorflow::errors::Unimplemented(
+        "only is_training=false is supported, at " + node_def.name());
+  }
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+
+  //  Check parameter types
+  auto parameter_type = inputs.at(1).weights().type_;
+  if ((parameter_type != tensorflow::DataType::DT_FLOAT) &&
+      (parameter_type != tensorflow::DataType::DT_HALF)) {
+    return tensorflow::errors::Unimplemented(
+        "only float32 or float16 weight data type is supported, for node " +
+        node_def.name() + " got " + tensorflow::DataTypeString(parameter_type));
+  }
+  for (int i = 1; i < 5; i++) {
+    if (inputs.at(i).weights().type_ != parameter_type) {
+      return tensorflow::errors::Unimplemented(
+          "Inconsistent parameter type for batchnormis not supported, at: " +
+          node_def.name());
+    }
+  }
+
+  TRT_ShapedWeights dummy_power_weights(parameter_type);
+  size_t nweight = 0;
+  for (int i = 1; i < 5; i++) {
+    nweight = std::max(nweight, (size_t)inputs.at(i).weights().count());
+  }
+  TRT_ShapedWeights* ptr_shape_weights = nullptr;
+  for (int i = 1; i < 5; i++) {
+    if (inputs.at(i).weights().count() == nweight) {
+      ptr_shape_weights =
+          const_cast<TRT_ShapedWeights*>(&(inputs.at(i).weights()));
+    } else if (inputs.at(i).weights().count() != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Inconsistent batchnorm parameter count, at: " + node_def.name());
+    }
+  }
+  //  We could technically have two weights with different shape.
+  //  that requires two addScale op, arguably less performant
+  TRT_ShapedWeights combined_scale_weights =
+      ctx.get_temp_weights_like(*ptr_shape_weights);
+  TRT_ShapedWeights combined_offset_weights =
+      ctx.get_temp_weights_like(*ptr_shape_weights);
+
+  const Eigen::half* cast_vals_array[4];
+  const float* vals_array[4];
+  for (int j = 0; j < 4; j++) {
+    cast_vals_array[j] =
+        static_cast<Eigen::half const*>(inputs.at(j + 1).weights().GetValues());
+    vals_array[j] =
+        static_cast<float const*>(inputs.at(j + 1).weights().GetValues());
+  }
+  Eigen::half* cast_combined_scale_vals = const_cast<Eigen::half*>(
+      static_cast<Eigen::half const*>(combined_scale_weights.GetValues()));
+  Eigen::half* cast_combined_offset_vals = const_cast<Eigen::half*>(
+      static_cast<Eigen::half const*>(combined_offset_weights.GetValues()));
+  float* combined_scale_vals = const_cast<float*>(
+      static_cast<float const*>(combined_scale_weights.GetValues()));
+  float* combined_offset_vals = const_cast<float*>(
+      static_cast<float const*>(combined_offset_weights.GetValues()));
+
+  for (size_t i = 0; i < nweight; ++i) {
+    float batchnorm_data[4];
+    for (int j = 0; j < 4; j++) {
+      if (inputs.at(j + 1).weights().count() != 1) {
+        if (parameter_type == tensorflow::DT_FLOAT) {
+          batchnorm_data[j] = vals_array[j][i];
+        } else if (parameter_type == tensorflow::DT_HALF) {
+          batchnorm_data[j] =
+              Eigen::half_impl::half_to_float(cast_vals_array[j][i]);
+        }
+      } else {
+        if (parameter_type == tensorflow::DT_FLOAT) {
+          batchnorm_data[j] = vals_array[j][0];
+        } else if (parameter_type == tensorflow::DT_HALF) {
+          batchnorm_data[j] =
+              Eigen::half_impl::half_to_float(cast_vals_array[j][0]);
+        }
+      }
+    }
+    float scale = batchnorm_data[0];
+    float offset = batchnorm_data[1];
+    float mean = batchnorm_data[2];
+    float variance = batchnorm_data[3];
+    float combined_scale_val = scale / sqrtf(variance + epsilon);
+    float combined_offset_val = offset - mean * combined_scale_val;
+    if (parameter_type == tensorflow::DT_FLOAT) {
+      combined_scale_vals[i] = combined_scale_val;
+      combined_offset_vals[i] = combined_offset_val;
+    } else if (parameter_type == tensorflow::DT_HALF) {
+      cast_combined_scale_vals[i] = Eigen::half(combined_scale_val);
+      cast_combined_offset_vals[i] = Eigen::half(combined_offset_val);
+    }
+  }
+
+  nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM
+                                          : nvinfer1::ScaleMode::kCHANNEL;
+  nvinfer1::IScaleLayer* layer =
+      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
+                              combined_offset_weights.GetWeightsForTRT(),
+                              combined_scale_weights.GetWeightsForTRT(),
+                              dummy_power_weights.GetWeightsForTRT());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertMatMul(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+
+  // TODO(jie): transpose!
+  TFAttrs attrs(node_def);
+
+  TRT_ShapedWeights weights_ck = inputs.at(1).weights();
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_ck);
+  ReorderCKtoKC(weights_ck, &weights);
+  TRT_ShapedWeights biases(weights.type_);
+
+  int noutput = weights.shape_.d[0];
+
+  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
+      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertReshape(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights())
+    return tensorflow::errors::InvalidArgument(
+        "Input expects tensor and weights, at" + node_def.name());
+
+  // implement tensor binaryOp weight [channel wise] for now;
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  auto dims = tensor->getDimensions();
+  // restore implicit batch dimension
+
+  TRT_ShapedWeights shape = inputs.at(1).weights();
+
+  TFAttrs attrs(node_def);
+
+  auto padding_type = attrs.get<tensorflow::DataType>("Tshape");
+
+  if (shape.shape_.nbDims != 1)
+    return tensorflow::errors::InvalidArgument(
+        "reshape new shape is not 1 dimensional, at " + node_def.name());
+
+  // Only expect to handle INT32 as attributes for now
+  if (padding_type != tensorflow::DataType::DT_INT32)
+    return tensorflow::errors::Unimplemented(
+        "reshape new shape supports only DT_INT32, at " + node_def.name());
+
+  auto shape_data = static_cast<int*>(const_cast<void*>(shape.GetValues()));
+
+  if (shape_data[0] != -1)
+    return tensorflow::errors::InvalidArgument(
+        "reshape new shape first dimension is not -1, at " + node_def.name());
+
+  auto shape_num_dims = shape.shape_.d[0];
+  VLOG(2) << "shape dimensions: " << shape_num_dims;
+  int volume_w = 1;
+  for (int i = 1; i < shape.shape_.d[0]; i++) volume_w *= shape_data[i];
+
+  int volume_t = 1;
+  for (int i = 0; i < dims.nbDims; i++) volume_t *= dims.d[i];
+
+  VLOG(2) << "volume: " << volume_t << " volume weights: " << volume_w;
+  if (volume_w != volume_t)
+    return tensorflow::errors::InvalidArgument(
+        "volume does not agree between tensor and new shape, at " +
+        node_def.name());
+
+  nvinfer1::IShuffleLayer* layer =
+      ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor));
+
+  nvinfer1::Dims reshape_dims;
+  VLOG(2) << "new dimension: " << shape_num_dims - 1;
+  reshape_dims.nbDims = shape_num_dims - 1;
+  for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
+    reshape_dims.d[i] = shape_data[i + 1];
+  }
+  layer->setReshapeDimensions(reshape_dims);
+  VLOG(2) << "new dimension: " << shape_num_dims - 1;
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  auto dims_output = output_tensor->getDimensions();
+  VLOG(2) << "output tensor dimension:" << dims_output.nbDims;
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
 void Converter::register_op_converters() {
   // vgg_16 slim implementation
   op_registry_["Placeholder"] = ConvertPlaceholder;
   op_registry_["Conv2D"] = ConvertConv2D;
+  op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
+  op_registry_["AvgPool"] = ConvertPool;
   // This could be really handled as ConvertBinary
   op_registry_["BiasAdd"] = ConvertScale;
   op_registry_["Const"] = ConvertConst;
-  // op_registry_["MatMul"] = ConvertFullyConnected;  // Not used in vgg
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
-  // op_registry_["AvgPool"] = ConvertPool;
+  op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
   // resnet_50_v1 slim implementation
   op_registry_["Add"] = ConvertBinary;
@@ -1393,26 +2067,407 @@ void Converter::register_op_converters() {
   op_registry_["Mean"] = ConvertReduce;
   op_registry_["Pad"] = ConvertPad;
   // TODO(ben,jie): Add more ops
+
+  op_registry_["ConcatV2"] = ConvertConcat;
+  op_registry_["MatMul"] = ConvertMatMul;
+  op_registry_["Reshape"] = ConvertReshape;
+  op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
+  op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
 }
 
 }  // namespace
+tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) {
+  return tensorflow::errors::Unimplemented("Not implemented yet");
+}
+tensorflow::Status ConvertCalibrationNodeToEngineNode(
+    tensorflow::Graph& graph, tensorflow::Node* c_node) {
+  const auto ndef = c_node->def();
+
+  TFAttrs attrs(ndef);
+  std::vector<string> segment_nodes(
+      attrs.get<std::vector<string>>("segment_nodes"));
+  std::vector<string> output_nodes(
+      attrs.get<std::vector<string>>("segment_output_names"));
+  std::vector<string> input_names(
+      attrs.get<std::vector<string>>("input_names"));
+  string res_name = attrs.get<string>("resource_name");
+  VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name;
+  string engine_name = "my_trt_op";
+  {
+    const auto node_id = tensorflow::str_util::Split(res_name, "_");
+    engine_name += node_id.back();
+  }
+  std::map<string, tensorflow::Node*> node_maps;
+
+  for (auto n : graph.op_nodes()) {
+    node_maps.insert({n->name(), n});
+  }
+  VLOG(1) << "Output Nodes:";
+  std::vector<tensorflow::DataType> out_types;
+  std::vector<const tensorflow::Edge*> out_edges;
+  for (auto& i : output_nodes) {
+    auto node_port = tensorflow::str_util::Split(i, ":");
+    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
+    auto out_node_name = node_port.at(0);
+    if (node_port.size() > 1) {
+      VLOG(1) << "Multi port output" << node_port.at(0) << " "
+              << node_port.at(1) << " size=" << node_port.size();
+    }
+    auto node_it = node_maps.find(out_node_name);
+    if (node_it != node_maps.end()) {
+      tensorflow::Node* out_node = node_it->second;
+      int port = 0;
+      if (node_port.size() == 2) {
+        port = std::strtoul(node_port.at(1).c_str(), nullptr, 10);
+        out_types.push_back(out_node->output_type(port));
+      } else {
+        out_types.push_back(out_node->output_type(0));
+      }
+      for (auto out_edge : out_node->out_edges()) {
+        if (out_edge->src_output() == port) {
+          out_edges.push_back(out_edge);
+          break;
+        }
+      }
+    } else {
+      LOG(WARNING) << " couldn't find output node " << out_node_name;
+    }
+  }
+  VLOG(1) << "Input Nodes:";
+  for (auto& i : input_names) {
+    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
+  }
+  auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
+  auto resmgr = trt_rm->getManager("TRTCalibOps");
+  tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr;
+  auto status = resmgr->Lookup(res_name, res_name, &calib_res);
+  if (!status.ok() || !calib_res->calibrator_) {
+    return tensorflow::errors::FailedPrecondition(
+        "You must run calibration"
+        " and inference conversion in the same proces");
+  }
+
+  calib_res->calibrator_->setDone();
+  calib_res->thr_->join();
+  delete calib_res->thr_;
+  if (!calib_res->engine_) {
+    LOG(ERROR) << "Calibration failed!, engine does not exist. Did you run "
+                  "calibration graph?";
+    return tensorflow::errors::FailedPrecondition(
+        "Calibration graph needs to be executed on"
+        " calibration data before convertsion to inference graph");
+  }
+  auto weight_rmgr = trt_rm->getManager("WeightStore");
+  TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
+      res_name, res_name));
+  auto engine_plan = calib_res->engine_->serialize();
+  calib_res->engine_->destroy();
+  calib_res->network_->destroy();
+  calib_res->builder_->destroy();
+  calib_res->thr_ = nullptr;
+  calib_res->engine_ = nullptr;
+  calib_res->builder_ = nullptr;
+  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
+  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  for (const auto in_edge : c_node->in_edges()) {
+    auto src = in_edge->src();
+    int dest_port = in_edge->dst_input();
+    income_edges.emplace_back(src->name(), in_edge->src_output(),
+                              c_node->input_type(dest_port));
+  }
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
+  op_builder.Input(input_list);
+  tensorflow::NodeDef engine_node;
+  const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
+  string engine_plan_string(engine_plan_data,
+                            engine_plan_data + engine_plan->size());
+  status = op_builder.Attr("serialized_engine", engine_plan_string)
+               .Attr("input_nodes", input_names)
+               .Attr("output_nodes", output_nodes)
+               .Attr("OutT", out_types)
+               .Finalize(&engine_node);
+  if (!status.ok()) {
+    LOG(ERROR) << "Engine Node creation failed";
+    return status;
+  }
+  auto trt_engine_node = graph.AddNode(engine_node, &status);
+  TF_RETURN_IF_ERROR(status);
+  for (size_t i = 0; i < out_edges.size(); i++) {
+    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
+            << out_edges.at(i)->dst()->name() << " port "
+            << out_edges.at(i)->dst_input();
+    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
+                                        out_edges.at(i)->dst(),
+                                        out_edges.at(i)->dst_input()));
+  }
+  VLOG(1) << "Segment nodes:";
+  for (auto& i : segment_nodes) {
+    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
+    auto it = node_maps.find(i);
+    if (it != node_maps.end()) {
+      graph.RemoveNode(it->second);
+    }
+  }
+  graph.RemoveNode(c_node);
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
+  // Visit nodes in reverse topological order and construct the TRT network.
+
+  // Toposort
+  std::vector<tensorflow::Node*> order_vec;
+  tensorflow::GetPostOrder(s.graph, &order_vec);
+  // Select just the subgraph
+  std::list<tensorflow::Node*> order;
+  for (tensorflow::Node* node : order_vec) {
+    if (s.subgraph_node_ids.count(node->id())) {
+      order.push_front(node);  // we want topological order to construct the
+      // network layer by layer
+    }
+  }
+  // topological order is needed to build TRT network
+  static int static_id = 0;
+  string subgraph_name_scope;
+  if (!order.empty()) {
+    subgraph_name_scope = order.front()->name();
+  }
+  for (const tensorflow::Node* node : order) {
+    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
+  }
+  // TODO(sami,ben,jie): proper naming!
+  string calib_op_name =
+      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
+  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
+  static_id++;
+  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
+  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
+  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
+  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
+  op_res->logger_ = new tensorflow::tensorrt::Logger();
+  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
+
+  if (!op_res->builder_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT builder object");
+  }
+
+  op_res->network_ = op_res->builder_->createNetwork();
+  if (!op_res->network_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT network object");
+  }
+
+  // Build the network
+  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
+  auto ws = new tensorflow::tensorrt::TRTWeightStore();
+  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
+  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
+  std::vector<string> input_names;
+  std::vector<tensorflow::DataType> input_dtypes;
+  for (const std::pair<int, int>& input : s.input_inds) {
+    VLOG(2) << "parsing input. Node id= " << input.first;
+    int node_id = input.first;
+    int output_idx = input.second;
+    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    auto node_name = node->name();
+    // input_names should use the node name in the graph
+    // here it should be the input tensor name -> matching the binding
+    // insert original node name without port
+    auto tensor_name = node_name;
+    if (output_idx != 0) {
+      tensor_name = StrCat(tensor_name, ":", output_idx);
+    }
+
+    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
+            << " idx: " << output_idx;
+
+    auto shape_inference_node_name = node_name;
+    auto shape_inference_output_idx = output_idx;
+    // rewire the shape inference to original node in the graph
+    if (s.output_edge_map->count(tensor_name)) {
+      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
+      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
+    }
+    if (shape_inference_output_idx < 0) continue;
+    VLOG(2) << "shapeinference name: " << shape_inference_node_name
+            << " idx: " << shape_inference_output_idx;
+
+    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
+      return tensorflow::errors::Internal("failed to find input node: " +
+                                          shape_inference_node_name);
+
+    auto op_info_vec =
+        s.graph_properties.GetOutputProperties(shape_inference_node_name);
+    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
+      return tensorflow::errors::Internal(
+          "accessing output index of: ", shape_inference_output_idx,
+          ", at node: ", shape_inference_node_name,
+          " with output entry from shape_map: ", op_info_vec.size());
+
+    auto op_info = op_info_vec.at(shape_inference_output_idx);
+    tensorflow::DataType tf_dtype = op_info.dtype();
+    input_dtypes.push_back(tf_dtype);
+
+    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
+    auto type_status = ConvertDType(tf_dtype, &dtype);
+    if (type_status != tensorflow::Status::OK()) {
+      LOG(WARNING) << "Data type conversion for input '" << node_name
+                   << "' failed";
+      return type_status;
+    }
+
+    VLOG(2) << "accessing output index of: " << output_idx
+            << ", at node: " << node_name
+            << "with output entry from shape_map: " << op_info_vec.size();
+    // TODO(ben,jie): update TRT input format/dimension
+    nvinfer1::DimsCHW input_dim_psuedo_chw;
+    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
+
+    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
+    //            update the code once TRT 4.0 comes out.
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
+
+    for (int i = 1; i < op_info.shape().dim_size(); i++) {
+      VLOG(2) << "dimension: " << i
+              << " , size: " << op_info.shape().dim(i).size();
+      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
+    }
+
+    // TODO(ben,jie): proper way to restore input tensor name?
+    auto input_tensor_name = node_name;
+    if (output_idx != 0) {
+      input_tensor_name = StrCat(node_name, ":", output_idx);
+    }
+
+    input_names.push_back(input_tensor_name);
+    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
+        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
+
+    if (!input_tensor)
+      return tensorflow::errors::InvalidArgument(
+          "Failed to create Input layer");
+    VLOG(2) << "input tensor name :" << input_tensor_name;
+
+    if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
+      return tensorflow::errors::AlreadyExists(
+          "output tensor already exists for op: " + input_tensor_name);
+  }
+
+  VLOG(2) << "finished sorting";
+
+  for (const tensorflow::Node* node : order) {
+    const tensorflow::NodeDef& node_def = node->def();
+    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
+    TF_RETURN_IF_ERROR(converter.convert_node(node_def));
+  }
+
+  VLOG(2) << "finished conversion";
+
+  // Gather output metadata
+  std::vector<string> output_names;
+  std::vector<tensorflow::DataType> output_dtypes;
+  int trt_engine_op_output_idx = 0;
+  for (const std::pair<int, int>& output : s.output_inds) {
+    int node_id = output.first;
+    int output_idx = output.second;
+    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    string op_name = node->name();
+    string tensor_name = op_name;
+
+    s.output_edge_map->insert(
+        {trt_engine_op_output_idx == 0
+             ? engine_name
+             : StrCat(engine_name, ":", trt_engine_op_output_idx),
+         {output_idx, tensor_name}});
+    trt_engine_op_output_idx++;
+    if (output_idx != 0) {
+      tensor_name = StrCat(tensor_name, ":", output_idx);
+    }
+    VLOG(1) << "output tensor name: " << tensor_name;
+    output_names.push_back(tensor_name);
+    auto tensor_or_weights = converter.get_tensor(tensor_name);
+    if (!tensor_or_weights.is_tensor()) {
+      return tensorflow::errors::InvalidArgument("Output node'" + tensor_name +
+                                                 "' is weights not tensor");
+    }
+    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
+    if (!tensor) {
+      return tensorflow::errors::NotFound("Output tensor not found: " +
+                                          tensor_name);
+    }
+    converter.network()->markOutput(*tensor);
+    tensorflow::DataType tf_dtype = node->output_type(output_idx);
+    output_dtypes.push_back(tf_dtype);
+    nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
+    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
+    tensor->setType(trt_dtype);
+  }
+
+  VLOG(2) << "Finished processing outputs";
+
+  // Build the engine
+  op_res->builder_->setMaxBatchSize(s.max_batch_size);
+  op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
+  VLOG(0) << "Max batch size= " << s.max_batch_size
+          << " max workspace size= " << s.max_workspace_size_bytes;
+
+  // Build the TRT op
+  // TODO(sami,ben,jie): proper naming!
+  tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
+  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    int output_idx = s.input_inds.at(i).second;
+    // we wired up the input here already, it is redundant to do it again in
+    //  ConvertSubGraphToTensorRT(convert_graph.cc)
+    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
+        input_names.at(i), output_idx, input_dtypes.at(i));
+    VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i)
+            << ":" << output_idx
+            << " dType= " << tensorflow::DataTypeString(input_dtypes.at(i));
+    income_edges.push_back(incoming_edge);
+  }
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
+  op_builder.Input(input_list);
+  std::vector<string> segment_names;
+  segment_names.reserve(s.subgraph_node_ids.size());
+  for (int i : s.subgraph_node_ids) {
+    auto node = s.graph.FindNodeId(i);
+    segment_names.push_back(node->name());
+  }
+  LOG(INFO) << "finished op preparation";
+
+  auto status = op_builder.Attr("segment_nodes", segment_names)
+                    .Attr("input_names", input_names)
+                    .Attr("segment_output_names", output_names)
+                    .Attr("resource_name", calib_op_name)
+                    .Finalize(s.trt_node);
+
+  LOG(INFO) << status.ToString();
+  LOG(INFO) << "finished op building";
+
+  return tensorflow::Status::OK();
+}
 
 tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
-    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
-    const std::vector<std::pair<int, int>>& input_inds,
-    const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size,
-    size_t max_workspace_size_bytes,
-    const tensorflow::grappler::GraphProperties& graph_properties,
-    tensorflow::NodeDef* trt_node) {
+    tensorrt::convert::SubGraphParams& s) {
   // Visit nodes in reverse topological order and construct the TRT network.
 
   // Toposort
   std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(graph, &order_vec);
+  tensorflow::GetPostOrder(s.graph, &order_vec);
   // Select just the subgraph
   std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
-    if (subgraph_node_ids.count(node->id())) {
+    if (s.subgraph_node_ids.count(node->id())) {
       // We want topological order to contstruct the
       // network layer by layer
       order.push_front(node);
@@ -1434,46 +2489,94 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         "Failed to create TensorRT network object");
   }
 
+  string subgraph_name_scope;
+  if (!order.empty()) {
+    subgraph_name_scope = order.front()->name();
+  }
+  for (const tensorflow::Node* node : order) {
+    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
+  }
+  static int static_id = 0;
+  // TODO(sami,ben,jie): proper naming!
+  string engine_name = StrCat(subgraph_name_scope, "my_trt_op");
+  engine_name = StrCat(engine_name, static_id++);
+  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
+  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
+  auto ws = new tensorflow::tensorrt::TRTWeightStore();
+  TF_CHECK_OK(weight_rmgr->Create(engine_name, engine_name, ws));
+
   // Build the network
-  Converter converter(trt_network.get());
+  Converter converter(trt_network.get(), ws, s.precision_mode == FP16MODE);
 
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
-  for (std::pair<int, int> const& input : input_inds) {
+  for (const std::pair<int, int>& input : s.input_inds) {
+    VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
     int output_idx = input.second;
-    tensorflow::Node* node = graph.FindNodeId(node_id);
+    tensorflow::Node* node = s.graph.FindNodeId(node_id);
     auto node_name = node->name();
-    input_names.push_back(node_name);  // Insert original node name without port
-    // TODO(jie): alternative :)
-    if (!graph_properties.HasOutputProperties(node_name))
-      return tensorflow::errors::Internal("Failed to find input node: " +
-                                          node_name);
-
-    auto op_info_vec = graph_properties.GetOutputProperties(node_name);
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
-      return tensorflow::errors::Internal(
-          "Accessing output index of: " + std::to_string(output_idx) +
-          ", at node: " + node_name + " with output entry from shape_map: " +
-          std::to_string(op_info_vec.size()));
+    // input_names should use the node name in the graph
+    // here it should be the input tensor name -> matching the binding
+    // insert original node name without port
+    auto tensor_name = node_name;
+    if (output_idx != 0) {
+      tensor_name = StrCat(tensor_name, ":", output_idx);
+    }
+
+    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
+            << " idx: " << output_idx;
+
+    auto shape_inference_node_name = node_name;
+    auto shape_inference_output_idx = output_idx;
+    // rewire the shape inference to original node in the graph
+    if (s.output_edge_map->count(tensor_name)) {
+      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
+      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
+    }
+    if (shape_inference_output_idx < 0) continue;
+    VLOG(2) << "shapeinference name: " << shape_inference_node_name
+            << " idx: " << shape_inference_output_idx;
+
+    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
+      return tensorflow::errors::Internal("failed to find input node: " +
+                                          shape_inference_node_name);
 
-    auto op_info = op_info_vec.at(output_idx);
+    auto op_info_vec =
+        s.graph_properties.GetOutputProperties(shape_inference_node_name);
+    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
+      return tensorflow::errors::Internal(
+          "accessing output index of: ", shape_inference_output_idx,
+          ", at node: ", shape_inference_node_name,
+          " with output entry from shape_map: ", op_info_vec.size());
 
+    auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
+    auto type_status = ConvertDType(tf_dtype, &dtype);
+    if (type_status != tensorflow::Status::OK()) {
+      LOG(WARNING) << "Type conversion failed for " << node_name;
+      return type_status;
+    }
 
-    VLOG(2) << "Accessing output index of: " << std::to_string(output_idx)
+    VLOG(2) << "Accessing output index of: " << output_idx
             << ", at node: " << node_name
-            << " with output entry from shape_map: "
-            << std::to_string(op_info_vec.size());
-
+            << " with output entry from shape_map: " << op_info_vec.size();
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
+    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
+    //            update the code once TRT 4.0 comes out.
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
+
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
@@ -1482,9 +2585,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
     // TODO(ben,jie): proper way to restore input tensor name?
     auto input_tensor_name = node_name;
-    if (output_idx != 0)
-      input_tensor_name = node_name + ":" + std::to_string(output_idx);
+    if (output_idx != 0) {
+      input_tensor_name = StrCat(node_name, ":", output_idx);
+    }
 
+    input_names.push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
 
@@ -1511,20 +2616,28 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   // Gather output metadata
   std::vector<string> output_names;
   std::vector<tensorflow::DataType> output_dtypes;
-  for (std::pair<int, int> const& output : output_inds) {
+  int trt_engine_op_output_idx = 0;
+  for (const std::pair<int, int>& output : s.output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
-    tensorflow::Node* node = graph.FindNodeId(node_id);
+    tensorflow::Node* node = s.graph.FindNodeId(node_id);
     string op_name = node->name();
     string tensor_name = op_name;
+
+    s.output_edge_map->insert(
+        {trt_engine_op_output_idx == 0
+             ? engine_name
+             : StrCat(engine_name, ":", trt_engine_op_output_idx),
+         {output_idx, tensor_name}});
+    trt_engine_op_output_idx++;
     if (output_idx != 0)
-      tensor_name = tensor_name + ":" + std::to_string(output_idx);
+      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
     VLOG(2) << "Output tensor name: " << tensor_name;
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
+      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
+                                                 "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (!tensor) {
@@ -1540,19 +2653,25 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
 
   VLOG(2) << "Finished output";
-  // TODO(jie): static_id is not thread safe.
-  static int static_id = 0;
 
   // Build the engine
-  trt_builder->setMaxBatchSize(max_batch_size);
-  trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes);
-  VLOG(0) << "Starting build engine " << static_id;
-  // TODO(ben,jie): half2 and int8 mode support
+  trt_builder->setMaxBatchSize(s.max_batch_size);
+  trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes);
+  VLOG(0) << "Max batch size= " << s.max_batch_size
+          << " max workspace size= " << s.max_workspace_size_bytes;
+  if (s.precision_mode == FP16MODE) {
+    trt_builder->setHalf2Mode(true);
+    VLOG(0) << "Using FP16 precision mode";
+  }
+  LOG(INFO) << "starting build engine";
   string engine_plan_string;
   {
     auto trt_engine =
         infer_object(trt_builder->buildCudaEngine(*converter.network()));
     VLOG(0) << "Built network";
+    if (trt_engine.get() == nullptr) {
+      return tensorflow::errors::Internal("Engine building failure");
+    }
     auto engine_plan = infer_object(trt_engine->serialize());
     VLOG(0) << "Serialized engine";
     const char* engine_plan_data =
@@ -1560,18 +2679,20 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     engine_plan_string =
         string(engine_plan_data, engine_plan_data + engine_plan->size());
   }
-
-  VLOG(0) << "Finished engine";
+  TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
+      engine_name, engine_name));
+  LOG(INFO) << "finished engine " << engine_name << " containing "
+            << s.subgraph_node_ids.size() << " nodes";
 
   // Build the TRT op
-  // TODO(sami,ben,jie): proper naming!
-  tensorflow::NodeDefBuilder op_builder(
-      tensorflow::strings::StrCat("my_trt_op", static_id++), "TRTEngineOp");
+  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  VLOG(2) << "input edge size: " << input_names.size();
   for (size_t i = 0; i < input_names.size(); ++i) {
-    int output_idx = input_inds.at(i).second;
-    // We wired up the input here already, it is redundant to do it again in
-    // ConvertSubGraphToTensorRT(convert_graph.cc)
+    VLOG(2) << "input edges: " << i << " " << input_names.at(i);
+    int output_idx = s.input_inds.at(i).second;
+    // we wired up the input here already, it is redundant to do it again in
+    //  ConvertSubGraphToTensorRT(convert_graph.cc)
     auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
         input_names.at(i), output_idx, input_dtypes.at(i));
     income_edges.push_back(incoming_edge);
@@ -1586,7 +2707,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
                     .Attr("input_nodes", input_names)
                     .Attr("output_nodes", output_names)
                     .Attr("OutT", output_dtypes)
-                    .Finalize(trt_node);
+                    .Finalize(s.trt_node);
 
   VLOG(0) << status.ToString() << " finished op building";
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 2e7fd19566e1ed3719b932c7443a9c3f652b2d3e..954a1e72f8604371fc00e088a67b4d411314dda6 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -17,6 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
 
 #include <set>
+#include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -32,16 +34,49 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
-    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
-    const std::vector<std::pair<int, int>>&
-        input_inds,  // {node_id, output_idx}
-    const std::vector<std::pair<int, int>>&
-        output_inds,  // {node_id, output_idx}
-    size_t max_batch_size, size_t max_workspace_size_bytes,
-    const tensorflow::grappler::GraphProperties& graph_prop,
-    tensorflow::NodeDef* trt_node);
+const int FP32MODE = 0;
+const int FP16MODE = 1;
+const int INT8MODE = 2;
 
+struct SubGraphParams {
+  SubGraphParams(
+      tensorflow::Graph& inp_graph,
+      const std::set<int>& subgraph_node_id_numbers,
+      const std::vector<std::pair<int, int>>& input_indices,
+      const std::vector<std::pair<int, int>>& output_indices,
+      size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
+      const tensorflow::grappler::GraphProperties& current_graph_properties,
+      std::unordered_map<string, std::pair<int, string>>* output_edges,
+      tensorflow::NodeDef* constructed_trt_node,
+      int engine_precision_mode = FP32MODE)
+      : graph(inp_graph),
+        subgraph_node_ids(subgraph_node_id_numbers),
+        input_inds(input_indices),
+        output_inds(output_indices),
+        max_batch_size(max_supported_batch_size),
+        max_workspace_size_bytes(max_consumed_workspace_size_bytes),
+        graph_properties(current_graph_properties),
+        output_edge_map(output_edges),
+        trt_node(constructed_trt_node),
+        precision_mode(engine_precision_mode) {}
+
+  tensorflow::Graph& graph;
+  const std::set<int>& subgraph_node_ids;
+  const std::vector<std::pair<int, int>>& input_inds;   // {node_id, output_idx}
+  const std::vector<std::pair<int, int>>& output_inds;  // {node_id, output_idx}
+  size_t max_batch_size;
+  size_t max_workspace_size_bytes;
+  const tensorflow::grappler::GraphProperties& graph_properties;
+  std::unordered_map<string, std::pair<int, string>>* output_edge_map;
+  tensorflow::NodeDef* trt_node;
+  const int precision_mode;
+};
+
+// TODO(sami): Replace references with const reference or pointers
+tensorflow::Status ConvertSubGraphToTensorRTNodeDef(SubGraphParams& params);
+tensorflow::Status InjectCalibrationNode(SubGraphParams& params);
+tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph& graph,
+                                                      tensorflow::Node* c_node);
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aea44fd8a2fcc4c359a6cb0c98ae34711708326e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
@@ -0,0 +1,136 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/kernels/trt_calib_op.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+TRTCalibOp::TRTCalibOp(OpKernelConstruction* context) : OpKernel(context) {
+  OP_REQUIRES_OK(context, context->GetAttr("segment_nodes", &segment_nodes_));
+  OP_REQUIRES_OK(context, context->GetAttr("input_names", &input_names_));
+  OP_REQUIRES_OK(context, context->GetAttr("resource_name", &resource_name_));
+};
+
+#define TYPECASE(dt, X, Y)                                                \
+  case dt: {                                                              \
+    return (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \
+  }
+
+void* GetTensorAddress(const Tensor* tensor_ptr) {
+  auto tensor_type = tensor_ptr->dtype();
+  switch (tensor_type) {
+    TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr);
+    TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr);
+    TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr);
+    default: {
+      LOG(FATAL) << "Unsupported Data type "
+                 << tensorflow::DataTypeString(tensor_type);
+      return nullptr;
+    }
+  }
+}
+
+void TRTCalibOp::Compute(tensorflow::OpKernelContext* ctx) {
+  // TODO(aaroey): make sure ctx->resource_mgr() is used in future PR.
+  auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
+  auto res_mgr = trt_rm->getManager("TRTCalibOps");
+  tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr;
+  auto status = res_mgr->Lookup(resource_name_, resource_name_, &calib_res);
+
+  if (!status.ok()) {
+    ctx->SetStatus(status);
+    return;
+  }
+  int num_inputs = ctx->num_inputs();
+  // first run instantiate calibrator
+  if (calib_res->calibrator_ == nullptr) {
+    dev_tensors_.resize(num_inputs);
+    int batch_size = ctx->input(0).dim_size(0);
+    VLOG(1) << " Constructing calibrator";
+    for (int i = 0; i < num_inputs; i++) {
+      // allocate workspace on device for inputs
+      const tensorflow::Tensor& t = ctx->input(i);
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_persistent(t.dtype(), t.shape(),
+                                              &dev_tensors_.at(i), nullptr));
+      const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+      CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
+      void* device_address = GetTensorAddress(device_tensor);
+      device_buffers_.emplace(input_names_.at(i),
+                              std::pair<void*, size_t>(
+                                  device_address, device_tensor->TotalBytes()));
+    }
+
+    calib_res->calibrator_ =
+        new TRTInt8Calibrator(device_buffers_, batch_size, resource_name_);
+    string label(resource_name_);
+    calib_res->thr_ = new std::thread([calib_res, label]() {
+      VLOG(1) << "Starting calibration thread, Calibration Resource @ "
+              << calib_res;
+      calib_res->builder_->setInt8Calibrator(calib_res->calibrator_);
+      calib_res->builder_->setInt8Mode(true);
+      calib_res->engine_ = calib_res->builder_->buildCudaEngine(
+          *calib_res->network_);  // will loop until we terminate calibrator
+      VLOG(1) << "Calibration loop terminated " << label;
+    });
+    VLOG(1) << "initialized calibrator resource";
+  }  //  calibrator initialized
+
+  // Pass input data to calibrator
+  std::unordered_map<string, void*> input_data;
+  for (int i = 0; i < num_inputs; i++) {
+    const Tensor& t = ctx->input(i);
+    void* data_address = GetTensorAddress(&t);
+    const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+    CHECK_EQ(t.TotalBytes(),
+             device_tensor->TotalBytes());  // use the tensor so FW keeps it
+    input_data.emplace(input_names_.at(i), data_address);
+    ctx->set_output(i, t);
+  }
+  VLOG(2) << "Filled map for sending";
+  // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
+  const cudaStream_t* stream = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+  calib_res->calibrator_->setBatch(input_data, *stream);
+  VLOG(2) << "Passed calibration data";
+  // TODO(aaroey): make sure we wait for the completion of calibration on the
+  // last batch in future PR.
+};
+
+#undef TYPECASE
+
+REGISTER_KERNEL_BUILDER(Name("TRTCalibOp").Device(DEVICE_GPU), TRTCalibOp);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+#endif
+#endif
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..23df9db32f077a080eaff7479fcbe90d6a504c42
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H
+#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+namespace tensorflow {
+namespace tensorrt {
+// TODO(sami): Convert this to async kernel!
+class TRTCalibOp : public OpKernel {
+ public:
+  explicit TRTCalibOp(OpKernelConstruction* context);
+
+  void Compute(OpKernelContext* context) override;
+
+ private:
+  string resource_name_;
+  std::vector<string> segment_nodes_;
+  std::vector<string> input_names_;
+  std::vector<tensorflow::TensorShape> shapes_;
+  std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
+  std::vector<tensorflow::PersistentTensor> dev_tensors_;
+};
+}  // namespace tensorrt
+}  // namespace tensorflow
+#endif
+#endif
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 8efdf63ebebc4d7a199c60635ca64348d2b30505..b8f881ceb16a48f2aeb5f73173ee996a10f7bfbe 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -24,8 +24,11 @@ limitations under the License.
 #include "cuda/include/cuda_runtime_api.h"
 
 namespace tensorflow {
-namespace tensorrt {
 static ::tensorflow::tensorrt::Logger logger;
+using IRuntime = nvinfer1::IRuntime;
+using Dims = nvinfer1::Dims;
+
+namespace tensorrt {
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // read serialized_engine
@@ -40,10 +43,21 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // TODO(samikama) runtime should be taken from a resourcemanager as well.
   // Only engine should be in the op and context and runtime should be taken
   // from resourcemanager
-  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
+  // TODO(jie): cudaSetDevice make sure trt engine is allocated on the same
+  // gpu where the input/output is also located.
+  int gpu_id = context->device()->tensorflow_gpu_device_info()->gpu_id;
+  cudaSetDevice(gpu_id);
+  int device;
+  cudaGetDevice(&device);
+  if (gpu_id != device) LOG(FATAL) << "set device failed!";
+
+  // TODO(samikama) runtime should be taken from a resourcemanager as well.
+  // Only engine should be in the op and context and runtime should be taken
+  // from resourcemanager
+
+  IRuntime* infer = nvinfer1::createInferRuntime(logger);
   trt_engine_ptr_.reset(infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr));
-
   trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
   // Runtime is safe to delete after engine creation
   infer->destroy();
@@ -55,7 +69,6 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 
   size_t binding_index;
   int num_batch = 0;
-  bool valid = true;
   for (int i = 0; i < context->num_inputs(); i++) {
     // Grab the input tensor
     binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
@@ -64,11 +77,16 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     const TensorShape& input_shape = input_tensor.shape();
     if (i == 0) {
       num_batch = input_shape.dim_size(0);
+      if (num_batch > trt_engine_ptr_->getMaxBatchSize()) {
+        LOG(FATAL) << "input tensor batch larger than max_batch_size: "
+                   << trt_engine_ptr_->getMaxBatchSize();
+      }
     } else if (num_batch != input_shape.dim_size(0)) {
-      valid = false;
+      LOG(FATAL) << "input data inconsistent batch size";
       break;
     }
-    switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
+    auto dtype = trt_engine_ptr_->getBindingDataType(binding_index);
+    switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
         break;
@@ -78,12 +96,12 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
       case nvinfer1::DataType::kINT8:
         LOG(FATAL) << "int8 is not supported yet!";
         break;
+      default:
+        LOG(FATAL) << "Unknown data type: " << int(dtype);
+        break;
     }
   }
 
-  // Might want a different way to inform the user of batch size inconsistency
-  if (!valid) LOG(WARNING) << "input data inconsistent batch size";
-
   for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
     // This is bad that we have to reallocate output buffer every run.
     // Create an output tensor
@@ -106,7 +124,8 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 
     OP_REQUIRES_OK(context,
                    context->allocate_output(i, output_shape, &output_tensor));
-    switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
+    auto dtype = trt_engine_ptr_->getBindingDataType(binding_index);
+    switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] =
             reinterpret_cast<void*>(output_tensor->flat<float>().data());
@@ -117,6 +136,9 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
       case nvinfer1::DataType::kINT8:
         LOG(FATAL) << "int8 is not supported yet!";
         break;
+      default:
+        LOG(FATAL) << "Unknown data type: " << int(dtype);
+        break;
     }
   }
   // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
@@ -126,9 +148,11 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                                                 ->implementation()
                                                 ->CudaStreamMemberHack()));
 
-  // execution handled by TF since we are getting stream from TF.
-  // it is safe for CPU pointer array (buffers) to go out of scope after enqueue
-  trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
+  // TODO(jie): trt enqueue does not return error
+  auto ret = trt_execution_context_ptr_->enqueue(num_batch, &buffers[0],
+                                                 *stream, nullptr);
+  VLOG(2) << "enqueue returns: " << ret;
+  // sync should be done by TF.
 }
 
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 7add8cb8b3d2a04206ee4174e79a1a4b86e05f30..dda0dc9e712eb726800abfb6084f4f708d04825b 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -27,19 +27,19 @@ void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
     case Severity::kINFO: {  // Mark TRT info messages as debug!
-      VLOG(2) << msg;
+      VLOG(2) << name_ << " " << msg;
       break;
     }
     case Severity::kWARNING: {
-      LOG(WARNING) << msg;
+      LOG(WARNING) << name_ << " " << msg;
       break;
     }
     case Severity::kERROR: {
-      LOG(ERROR) << msg;
+      LOG(ERROR) << name_ << " " << msg;
       break;
     }
     case Severity::kINTERNAL_ERROR: {
-      LOG(FATAL) << msg;
+      LOG(FATAL) << name_ << " " << msg;
       break;
     }
     // This is useless for now. But would catch it in future if enum changes. It
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index d71f66b933a8068a6276a7e070755e0075543bb5..7f3544f8cfda8dce13881e1f8f4388b640e315f4 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -27,9 +27,11 @@ namespace tensorrt {
 
 // Logger for GIE info/warning/errors
 class Logger : public nvinfer1::ILogger {
- private:
+ public:
+  Logger(string name = "DefaultLogger") : name_(name){};
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
+ private:
   string name_;
 };
 
diff --git a/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc b/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4835e5065068ec7a59995eb7f6126b31aecf6704
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+namespace tensorflow {
+
+REGISTER_OP("TRTCalibOp")
+    .Attr("segment_nodes: list(string)")         // names of the ops in segment
+    .Attr("segment_output_names: list(string)")  // names of the output ops in
+                                                 // segment
+    .Attr("input_names: list(string)")           // names of the inputs for
+                                                 // passing into tensorrt
+    .Attr("resource_name: string")
+    .Attr("InT: list({int8, float16, float32})")
+    .Input("in_tensor: InT")
+    .Output("out_tensor: InT")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_inputs(); i++) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
index 7e050a768ce97af1fc1d2c85cb52640b4c6a6a97..0b2321b5fc7bcbd53c01d1c97cafcfcb229a83ef 100644
--- a/tensorflow/contrib/tensorrt/python/__init__.py
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -20,5 +20,6 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
 from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph
 from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 9454862f857ab743712ce409ff007de55e72a68e..338475d90ea55ab2c1bb8df77f27a71a4a36a5dd 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -20,11 +20,17 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
 import six as _six
+from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert
 from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl as _impl
+from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.util import compat
+# pylint: enable=unused-import,line-too-long
 
 
 # TODO(skama): get outputs from session when implemented as c++
@@ -32,22 +38,33 @@ from tensorflow.python.framework import ops
 def create_inference_graph(input_graph_def,
                            outputs,
                            max_batch_size=1,
-                           max_workspace_size_bytes=2 << 20):
-  """Python wrapper for the TRT transormation.
-
+                           max_workspace_size_bytes=2 << 20,
+                           precision_mode="FP32",
+                           minimum_segment_size=3):
+  """Python wrapper for the TRT transformation.
 
   Args:
     input_graph_def: GraphDef object containing a model to be transformed.
-    outputs: List of tensors or node names for the model outputs.
+    outputs: list of tensors or node names for the model outputs.
     max_batch_size: max size for the input batch
     max_workspace_size_bytes: parameter to control memory allocation (in Bytes)
+    precision_mode: one of 'FP32', 'FP16' and 'INT8'
+    minimum_segment_size: the minimum number of nodes required for a subgraph to
+      be replaced by TRTEngineOp.
 
   Returns:
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
 
   Raises:
+    ValueError: if the provided precision mode is invalid.
     RuntimeError: if the returned status message is malformed.
   """
+  supported_precision_modes = {"FP32": 0, "FP16": 1, "INT8": 2}
+  if precision_mode.upper() not in supported_precision_modes:
+    raise ValueError(("precision mode '{}' is not supported."
+                      "It should be one of {}").format(
+                          precision_mode, "{'FP32', 'FP16', 'INT8'}"))
+  mode = supported_precision_modes[precision_mode.upper()]
 
   def py2bytes(inp):
     return inp
@@ -83,7 +100,7 @@ def create_inference_graph(input_graph_def,
   # pair or strings where first one is encoded status and the second
   # one is the transformed graphs protobuf string.
   out = trt_convert(input_graph_def_str, out_names, max_batch_size,
-                    max_workspace_size_bytes)
+                    max_workspace_size_bytes, mode, minimum_segment_size)
   status = to_string(out[0])
   output_graph_def_string = out[1]
   del input_graph_def_str  # Save some memory
@@ -101,3 +118,46 @@ def create_inference_graph(input_graph_def,
   output_graph_def.ParseFromString(output_graph_def_string)
   del output_graph_def_string  # Save some memory
   return output_graph_def
+
+
+def calib_graph_to_infer_graph(calibration_graph_def):
+  """Convert an existing calibration graph to inference graph.
+
+  Args:
+    calibration_graph_def: the calibration GraphDef object with calibration data
+  Returns:
+    New GraphDef with TRTEngineOps placed in graph replacing calibration nodes.
+  Raises:
+    RuntimeError: if the returned status message is malformed.
+  """
+
+  def py2string(inp):
+    return inp
+
+  def py3string(inp):
+    return inp.decode("utf-8")
+
+  if _six.PY2:
+    to_string = py2string
+  else:
+    to_string = py3string
+
+  graph_str = calibration_graph_def.SerializeToString()
+  out = calib_convert(graph_str)
+  status = to_string(out[0])
+  output_graph_def_string = out[1]
+  del graph_str  # Save some memory
+  if len(status) < 2:
+    raise _impl.UnknownError(None, None, status)
+  if status[:2] != "OK":
+    msg = status.split(";")
+    if len(msg) == 1:
+      raise RuntimeError("Status message is malformed {}".format(status))
+    # pylint: disable=protected-access
+    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
+                                         int(msg[0]))
+    # pylint: enable=protected-access
+  output_graph_def = graph_pb2.GraphDef()
+  output_graph_def.ParseFromString(output_graph_def_string)
+  del output_graph_def_string  # Save some memory
+  return output_graph_def
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c93f869f5ef7c8eaa2a87eed26cfe69597fdb
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+
+#include <atomic>
+#include <chrono>
+#include <unordered_map>
+
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// set the batch size before constructing the thread to execute engine
+int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }
+
+TRTInt8Calibrator::TRTInt8Calibrator(
+    const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers,
+    int batch_size, string engine_name)
+    : batch_size_(batch_size),
+      done_(false),
+      dev_buffers_(dev_buffers),
+      calib_running_(false),
+      batch_is_set_(false),
+      engine_name_(engine_name) {}
+
+bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
+                                 const cudaStream_t stream) {
+  tensorflow::mutex_lock lock(cond_mtx_);
+  while ((calib_running_ || batch_is_set_) &&
+         !done_) {  // wait while calibration is running
+    cond_.wait(lock);
+  }
+  if (done_) return false;
+  CHECK(!calib_running_ && !batch_is_set_);
+  VLOG(1) << "Set Batch Waiting finished";
+  for (const auto it : data) {
+    auto devptr = dev_buffers_.find(it.first);
+    if (devptr == dev_buffers_.end()) {
+      LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
+                 << "' does not match with the buffer names";
+    }
+    const auto& d = devptr->second;
+
+    // TODO(aaroey): we should not use sync copy on default stream. Make sure
+    // stream->ThenMemcpy() is used in future PRs.
+    // TODO(sami,aaroey): Need to figure out a way to ensure synchronization
+    // between stream, perhaps using a tensor?
+    auto status = cudaMemcpyAsync(d.first, it.second, d.second,
+                                  cudaMemcpyDeviceToDevice, stream);
+    if (status != cudaSuccess) {
+      LOG(FATAL) << "cudaMemcpy " << engine_name_ << " for '" << it.first
+                 << "' failed with " << status;
+    }
+  }
+
+  // TODO(Sami, aaorey): Find an alternative way!
+  cudaStreamSynchronize(
+      stream);  // we have to wait for the stream before returning!
+  batch_is_set_ = true;
+  cond_.notify_all();
+  return true;
+}
+
+bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
+                                 int num_bindings) {
+  tensorflow::mutex_lock lock(cond_mtx_);
+  calib_running_ = false;
+  cond_.notify_all();
+  while ((!batch_is_set_ && !done_)) {  // wait until new batch arrives
+    cond_.wait(lock);
+
+  }
+  if (done_) {
+    return false;
+  }
+
+  for (int i = 0; i < num_bindings; i++) {
+    auto it = dev_buffers_.find(names[i]);
+    if (it == dev_buffers_.end()) {
+      LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
+                 << names[i] << "' at position " << i;
+    }
+
+    bindings[i] = it->second.first;
+  }
+  batch_is_set_ = false;
+  calib_running_ = true;
+  return true;
+}
+
+const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
+  return nullptr;
+}
+
+void TRTInt8Calibrator::setDone() {
+  tensorflow::mutex_lock lock(cond_mtx_);
+  done_ = true;
+  cond_.notify_all();
+}
+
+void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
+                                              std::size_t length) {}
+TRTInt8Calibrator::~TRTInt8Calibrator() {
+  VLOG(1) << "Destroying calibrator for " << engine_name_;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif
+#endif
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
new file mode 100644
index 0000000000000000000000000000000000000000..d77aa2c5ab184756adaee38f88180b3c128ebe03
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "tensorflow/core/platform/mutex.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+// This class provides a 1 element queue to match TFs push model to
+// TRTs pull model for calibration. When TRT implements a means for
+// a push calibration This class should be updated accordingly
+
+struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
+ public:
+  TRTInt8Calibrator(
+      const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers,
+      int batch_size, string engine_name);
+  int getBatchSize() const override;
+  bool getBatch(void* bindings[], const char* names[],
+                int num_bindings) override;
+  bool setBatch(const std::unordered_map<string, void*>& data,
+                const cudaStream_t stream);
+  void setDone();
+  const void* readCalibrationCache(std::size_t& length) override;
+  void writeCalibrationCache(const void* ptr, std::size_t length) override;
+  ~TRTInt8Calibrator();
+
+ private:
+  const int batch_size_;
+  tensorflow::mutex cond_mtx_;           // mutex for condition_variable
+  tensorflow::condition_variable cond_;  // condition variable to implement
+                                         // producer-consumer queue for
+                                         // calibration
+  bool done_;
+  const std::unordered_map<string, std::pair<void*, size_t>>
+      dev_buffers_;  // map to keep tensorrt input buffers and sizes keyed with
+                     // buffer names
+  bool calib_running_;
+  bool batch_is_set_;
+  string engine_name_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif
+#endif
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c3698e5d1cc5d6d8d31a8fcaf03d103f1e1915d
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
+  return instance_;
+}
+
+std::shared_ptr<tensorflow::ResourceMgr>
+tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
+  // mutex is held for lookup only. Most instantiations where mutex will be held
+  // longer will be during op creation and should be ok.
+  tensorflow::mutex_lock lock(map_mutex_);
+  auto s = managers_.find(op_name);
+  if (s == managers_.end()) {
+    auto it = managers_.emplace(
+        op_name, std::make_shared<tensorflow::ResourceMgr>(op_name));
+    VLOG(1) << "Returning a new manager " << op_name;
+    return it.first->second;
+  }
+  VLOG(1) << "Returning old manager " << op_name;
+  return s->second;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc15b51e05ef743d0aa260bbd9bd21302a752ec0
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
+#include <memory>
+
+#include <string>
+#include <unordered_map>
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class TRTResourceManager {
+  TRTResourceManager() = default;
+
+ public:
+  static std::shared_ptr<TRTResourceManager> instance();
+  // returns a manager for given op, if it doesn't exists it creates one
+  std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
+
+ private:
+  std::unordered_map<string, std::shared_ptr<tensorflow::ResourceMgr>>
+      managers_;
+  tensorflow::mutex map_mutex_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCE_TRT_RESOURCE_MANAGER_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c85968ae7acf5c5fc567be6805a5d226b1094c7
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+
+#include <list>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+class TRTCalibrationResource : public tensorflow::ResourceBase {
+ public:
+  TRTCalibrationResource()
+      : calibrator_(nullptr),
+        builder_(nullptr),
+        network_(nullptr),
+        engine_(nullptr),
+        logger_(nullptr),
+        thr_(nullptr) {}
+  string DebugString() override {
+    std::stringstream oss;
+    oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl
+        << " Builder    = " << std::hex << builder_ << std::dec << std::endl
+        << " Network    = " << std::hex << network_ << std::dec << std::endl
+        << " Engine     = " << std::hex << engine_ << std::dec << std::endl
+        << " Logger     = " << std::hex << logger_ << std::dec << std::endl
+        << " Thread     = " << std::hex << thr_ << std::dec << std::endl;
+    return oss.str();
+  }
+  ~TRTCalibrationResource() {
+    VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+  }
+  TRTInt8Calibrator* calibrator_;
+  nvinfer1::IBuilder* builder_;
+  nvinfer1::INetworkDefinition* network_;
+  nvinfer1::ICudaEngine* engine_;
+  tensorflow::tensorrt::Logger* logger_;
+  // TODO(sami): Use threadpool threads!
+  std::thread* thr_;
+};
+
+class TRTWeightStore : public tensorflow::ResourceBase {
+ public:
+  TRTWeightStore() {}
+  std::list<std::vector<uint8_t>> store_;
+  string DebugString() override {
+    std::stringstream oss;
+    size_t lenBytes = 0;
+    for (const auto& v : store_) {
+      lenBytes += v.size() * sizeof(uint8_t);
+    }
+    oss << " Number of entries     = " << store_.size() << std::endl
+        << " Total number of bytes = "
+        << store_.size() * sizeof(std::vector<uint8_t>) + lenBytes << std::endl;
+    return oss.str();
+  }
+  virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
+};
+
+class TRTEngineResource : public tensorflow::ResourceBase {
+ public:
+  TRTEngineResource() : runtime_(nullptr), ctx_(nullptr){};
+  string DebugString() override { return string(""); }
+  nvinfer1::IRuntime* runtime_;
+  nvinfer1::IExecutionContext* ctx_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCEMGR_TRTRESOURCES_H_
+#endif
+#endif
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 6193f0b0a13f6985d5fc8dd4c6fc09b15f72f139..8fc4697c513057c668d31a341cb13f60dc107e81 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -80,13 +80,20 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
                                                 dst->in_edges().end());
   for (const tensorflow::Edge* in_edge : in_edges) {
-    if (in_edge->src() != src) {
-      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
-      if (e->src() == graph->source_node()) {
-        graph->AddEdge(e->src(), e->src_output(), src,
-                       tensorflow::Graph::kControlSlot);
-      } else {
-        graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+    if (in_edge->IsControlEdge()) {
+      if (in_edge->src() != src) {
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        graph->AddControlEdge(e->src(), src);
+      }
+    } else {
+      if (in_edge->src() != src) {
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        if (e->src() == graph->source_node()) {
+          graph->AddEdge(e->src(), e->src_output(), src,
+                         tensorflow::Graph::kControlSlot);
+        } else {
+          graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+        }
       }
     }
   }
@@ -94,12 +101,19 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
                                                  dst->out_edges().end());
   for (const tensorflow::Edge* out_edge : out_edges) {
-    tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
-    if (e->dst() == graph->sink_node()) {
-      graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
-                     e->dst_input());
+    if (out_edge->IsControlEdge()) {
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      graph->AddControlEdge(src, e->dst());
     } else {
-      graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      if (e->dst() == graph->sink_node()) {
+        VLOG(1) << " edge to sink node " << src->name() << " -> "
+                << e->dst()->name();
+        graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
+                       e->dst_input());
+      } else {
+        graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+      }
     }
   }
 
@@ -118,7 +132,7 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
 
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments) {
   // Create a Graph representation of the GraphDef.
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -136,7 +150,7 @@ tensorflow::Status SegmentGraph(
   for (int i = 0; i < graph.num_node_ids(); ++i) {
     tensorflow::Node* node = graph.FindNodeId(i);
     if (options.exclude_node_list.count(node->name()) != 0 ||
-        !candidate_fn(node->def())) {
+        !candidate_fn(node)) {
       node = nullptr;
     }
     node_segments.emplace_back(node);
@@ -155,7 +169,7 @@ tensorflow::Status SegmentGraph(
 
   for (const tensorflow::Node* node : order) {
     // All output nodes of 'node' have been visited...
-    VLOG(2) << "Trying node " << node->name();
+    VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
 
     // 'node' must be a TRT candidate...
     if (node_segments[node->id()].Value() == nullptr) {
@@ -169,8 +183,12 @@ tensorflow::Status SegmentGraph(
     while (true) {
       std::set<const tensorflow::Edge*> contract_edges;
       for (const tensorflow::Edge* out_edge : node->out_edges()) {
-        VLOG(2) << "... out node " << out_edge->dst()->name();
-
+        VLOG(2) << "... out node " << out_edge->dst()->name() << " ( "
+                << out_edge->dst()->id() << " <- " << node->id() << " )";
+        if (out_edge->IsControlEdge()) {
+          VLOG(2) << "... ... Control Edge, Skipping";
+          continue;
+        }
         // Out node must be TRT candidate...
         if (node_segments[out_edge->dst()->id()].Value() == nullptr) {
           VLOG(2) << "... ... not a TRT candidate";
@@ -196,7 +214,8 @@ tensorflow::Status SegmentGraph(
         const tensorflow::Node* src = contract_edge->src();
         const tensorflow::Node* dst = contract_edge->dst();
 
-        VLOG(2) << "Merge " << src->name() << " <- " << dst->name();
+        VLOG(2) << "Merge " << src->name() << " <- " << dst->name() << " ("
+                << src->id() << " <- " << dst->id();
         node_segments[src->id()].Merge(&node_segments[dst->id()]);
 
         // Contracting the edge leaves disconnected graph edges.
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index ee6e2b3ed26cd1fabc0e952d882d549046cd9a30..7e8685f44a8c8a20fd7159ee40a8835531e78e9f 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -20,10 +20,12 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+
 namespace tensorrt {
 namespace segment {
 
@@ -46,7 +48,7 @@ struct SegmentOptions {
 // @return the status.
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments);
 
 }  // namespace segment
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 74cbc5f2b376b76324eed06d251767da6f928e3e..7ddabec268d4ef7b5c679001e5fb99aa7d83aec0 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -35,7 +35,7 @@ class SegmentTest : public ::testing::Test {
   TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                     TF_Status* s, const char* name);
 
-  std::function<bool(const NodeDef&)> MakeCandidateFn(
+  std::function<bool(const Node*)> MakeCandidateFn(
       const std::set<string>& node_names);
 
  protected:
@@ -60,10 +60,10 @@ bool SegmentTest::GetGraphDef(TF_Graph* graph,
   return ret;
 }
 
-std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
+std::function<bool(const Node*)> SegmentTest::MakeCandidateFn(
     const std::set<string>& node_names) {
-  return [node_names](const NodeDef& node) -> bool {
-    return node_names.find(node.name()) != node_names.end();
+  return [node_names](const Node* node) -> bool {
+    return node_names.find(node->name()) != node_names.end();
   };
 }
 
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
index e11522ea5bda7f5a303d6ea332148dbd7b17f162..3712a9a6fe349d949ef2666652b9d750538d5535 100644
--- a/tensorflow/contrib/tensorrt/tensorrt_test.cc
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -95,9 +95,9 @@ nvinfer1::IHostMemory* CreateNetwork() {
 }
 
 // Executes the network.
-void Execute(nvinfer1::IExecutionContext& context, const float* input,
+void Execute(nvinfer1::IExecutionContext* context, const float* input,
              float* output) {
-  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  const nvinfer1::ICudaEngine& engine = context->getEngine();
 
   // We have two bindings: input and output.
   ASSERT_EQ(engine.getNbBindings(), 2);
@@ -118,7 +118,7 @@ void Execute(nvinfer1::IExecutionContext& context, const float* input,
   // could be removed.
   ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
                                cudaMemcpyHostToDevice, stream));
-  context.enqueue(1, buffers, stream, nullptr);
+  context->enqueue(1, buffers, stream, nullptr);
   ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
   cudaStreamSynchronize(stream);
@@ -143,7 +143,7 @@ TEST(TensorrtTest, BasicFunctions) {
   // Execute the network.
   float input = 1234;
   float output;
-  Execute(*context, &input, &output);
+  Execute(context, &input, &output);
   EXPECT_EQ(output, input * 2 + 3);
 
   // Destroy the engine.
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 18dba94acb3724cb2b5a1c53227bcf08bf9f8fcc..ad01bedd8fa066e914b05b20dbc47d9aabe790d9 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import nn_ops as nn_ops
 
 
 def get_simple_graph_def():
-  """Create a simple graph and return its graph_def"""
+  """Create a simple graph and return its graph_def."""
   g = ops.Graph()
   with g.as_default():
     a = aops.placeholder(
@@ -60,6 +60,7 @@ def get_simple_graph_def():
 
 
 def run_graph(gdef, dumm_inp):
+  """Run given graphdef once."""
   gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
   ops.reset_default_graph()
   g = ops.Graph()
@@ -74,15 +75,65 @@ def run_graph(gdef, dumm_inp):
   return val
 
 
+# Use real data that is representative of the inference dataset
+# for calibration. For this test script it is random data.
+def run_calibration(gdef, dumm_inp):
+  """Run given calibration graph multiple times."""
+  gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  ops.reset_default_graph()
+  g = ops.Graph()
+  with g.as_default():
+    inp, out = importer.import_graph_def(
+        graph_def=gdef, return_elements=["input", "output"])
+    inp = inp.outputs[0]
+    out = out.outputs[0]
+  with csess.Session(
+      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+    # run over real calibration data here, we are mimicking a calibration set of
+    # 30 different batches. Use as much calibration data as you want
+    for _ in range(30):
+      val = sess.run(out, {inp: dumm_inp})
+  return val
+
+
 if "__main__" in __name__:
   inp_dims = (100, 24, 24, 2)
   dummy_input = np.random.random_sample(inp_dims)
-  gdef = get_simple_graph_def()
+  orig_graph = get_simple_graph_def()  # use a frozen graph for inference
   # Get optimized graph
-  trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
-  o1 = run_graph(gdef, dummy_input)
+  trt_graph = trt.create_inference_graph(
+      input_graph_def=orig_graph,
+      outputs=["output"],
+      max_batch_size=inp_dims[0],
+      max_workspace_size_bytes=1 << 25,
+      precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
+      minimum_segment_size=2  # minimum number of nodes in an engine
+  )
+  o1 = run_graph(orig_graph, dummy_input)
   o2 = run_graph(trt_graph, dummy_input)
   o3 = run_graph(trt_graph, dummy_input)
   assert np.array_equal(o1, o2)
   assert np.array_equal(o3, o2)  # sanity check
+  fp16_graph = trt.create_inference_graph(
+      input_graph_def=orig_graph,
+      outputs=["output"],
+      max_batch_size=inp_dims[0],
+      max_workspace_size_bytes=1 << 25,
+      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
+      minimum_segment_size=2  # minimum number of nodes in an engine
+  )
+  int8_calib_gdef = trt.create_inference_graph(
+      input_graph_def=orig_graph,
+      outputs=["output"],
+      max_batch_size=inp_dims[0],
+      max_workspace_size_bytes=1 << 25,
+      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
+      minimum_segment_size=2  # minimum number of nodes in an engine
+  )
+  o4 = run_graph(fp16_graph, dummy_input)
+  _ = run_calibration(int8_calib_gdef, dummy_input)
+  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
+  o5 = run_graph(int8_graph, dummy_input)
+  assert np.allclose(o1, o4)
+  assert np.allclose(o1, o5)
   print("Pass")
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a4732876286a9484bc607242ae19a31941313db
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -0,0 +1,156 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+import numpy as np
+
+from tensorflow.contrib import tensorrt as trt
+from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.python.framework import constant_op as cop
+from tensorflow.python.framework import dtypes as dtypes
+from tensorflow.python.framework import importer as importer
+from tensorflow.python.framework import ops as ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import nn as nn
+from tensorflow.python.ops import nn_ops as nn_ops
+from tensorflow.python.platform import googletest
+
+
+@test_util.with_c_api
+class IntegrationTest(test_util.TensorFlowTestCase):
+  """Class to test Tensorflow-TensorRT integration."""
+
+  def setUp(self):
+    """Setup method."""
+    super(IntegrationTest, self).setUp()
+    warnings.simplefilter("always")
+    inp_dims = (100, 24, 24, 2)
+    self._input = np.random.random_sample(inp_dims)
+    self._original_graph = self.get_simple_graph_def()
+    self._gpu_options = cpb2.GPUOptions(
+        per_process_gpu_memory_fraction=0.50)
+    self._config = cpb2.ConfigProto(gpu_options=self._gpu_options)
+    self._reference = self.run_graph(self._original_graph, self._input)
+
+  def get_simple_graph_def(self):
+    """Create a simple graph and return its graph_def."""
+    g = ops.Graph()
+    with g.as_default():
+      a = aops.placeholder(
+          dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+      e = cop.constant(
+          [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+          name="weights",
+          dtype=dtypes.float32)
+      conv = nn.conv2d(
+          input=a,
+          filter=e,
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          name="conv")
+      b = cop.constant(
+          [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+      t = nn.bias_add(conv, b, name="biasAdd")
+      relu = nn.relu(t, "relu")
+      idty = aops.identity(relu, "ID")
+      v = nn_ops.max_pool(
+          idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+      aops.squeeze(v, name="output")
+    return g.as_graph_def()
+
+  def run_graph(self, gdef, dumm_inp):
+    """Run given graphdef once."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+    with self.test_session(
+        graph=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  # Use real data that is representative of the inference dataset
+  # for calibration. For this test script it is random data.
+  def run_calibration(self, gdef, dumm_inp):
+    """Run given calibration graph multiple times."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+      # run over real calibration data here, we are mimicking a calibration
+      # set of 30 different batches. Use as much calibration data as you want
+    with self.test_session(
+        graph=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      for _ in range(30):
+        val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  def get_trt_graph(self, mode):
+    """Return trt converted graph."""
+    if mode in  ["FP32", "FP16", "INT8"]:
+      return trt.create_inference_graph(
+          input_graph_def=self._original_graph,
+          outputs=["output"],
+          max_batch_size=self._input.shape[0],
+          max_workspace_size_bytes=1 << 25,
+          precision_mode=mode,  # TRT Engine precision "FP32","FP16" or "INT8"
+          minimum_segment_size=2  # minimum number of nodes in an engine
+          )
+    return None
+
+  def testFP32(self):
+    """Test FP32 conversion. Results should be identical to native case."""
+    trt_graph = self.get_trt_graph("FP32")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+  def testFP16(self):
+    """Test FP16 conversion. Results may be different from native case."""
+    trt_graph = self.get_trt_graph("FP16")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+  def testINT8(self):
+    """Test INT8 conversion. Results may be different from native case."""
+    calib_graph = self.get_trt_graph("INT8")
+    result = self.run_calibration(calib_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
+    result = self.run_graph(int8_graph, self._input)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
+    result1 = self.run_graph(int8_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index d679945d569c1784448b6cb09c2f431b9cda56d7..46480e99a113afb34702b0ecd71468d4bdc83f98 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -64,13 +64,17 @@ PyObject* pair_helper(std::pair<string, string>* in) {
 %ignoreall
 %unignore tensorflow;
 %unignore trt_convert;
+%unignore calib_convert;
 
 %{
+
 std::pair<string, string> trt_convert(
     string graph_def_string,  // The serialized GraphDef string.
     std::vector<string> output_names,
     size_t max_batch_size,
-    size_t max_workspace_size_bytes
+    size_t max_workspace_size_bytes,
+    int precision_mode,
+    int minimum_segment_size
     // Unfortunately we can't use TF_Status here since it
     // is in c/c_api and brings in a lot of other libraries
     // which in turn declare ops. These ops are included
@@ -90,16 +94,64 @@ std::pair<string, string> trt_convert(
     return std::pair<string, string>{out_status, ""};
   }
 
+  if(precision_mode < 0 || precision_mode > 2){
+    out_status = "InvalidArgument;Invalid precision_mode";
+    return std::pair<string, string>{out_status, ""};
+  }
   if (!output_names.size()) {
     out_status = "InvalidArgument;Size of the output_names vector is 0";
     return std::pair<string, string>{out_status, ""};
-    // return "";
   }
   tensorflow::GraphDef outGraph;
   tensorflow::Status conversion_status =
       tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
           graph_def, output_names, max_batch_size, max_workspace_size_bytes,
-          &outGraph);
+          &outGraph, precision_mode, minimum_segment_size);
+  if (!conversion_status.ok()) {
+    auto retCode = (int)conversion_status.code();
+    char buff[2000];
+    snprintf(buff, 2000, "%d;%s", retCode,
+             conversion_status.error_message().c_str());
+    out_status = buff;
+    return std::pair<string, string>{out_status, ""};
+  }
+  string result;
+  if (!outGraph.SerializeToString(&result)) {
+    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
+    return std::pair<string, string>{out_status, ""};
+  }
+  out_status = "OK;All good!";
+  return std::pair<string, string>{out_status, result};
+#else
+  // Returns FAILED_PRECONDITION.
+  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+}
+
+std::pair<string, string> calib_convert(string graph_def_string  //  const tensorflow::GraphDef&
+    // unfortunately we can't use TF_Status here since it
+    // is in c/c_api and brings in a lot of other libraries
+    // which in turn declare ops. These ops are included
+    // statically in our library and cause an abort when
+    // module is loaded due to double registration
+    // until Tensorflow properly exposes these headers
+    // we have to work around this by returning a string
+    // and converting it to exception on python side.
+    //,TF_Status* out_status) {
+) {
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+  string out_status;
+
+  tensorflow::GraphDef graph_def;
+  if (!graph_def.ParseFromString(graph_def_string)) {
+    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
+    return std::pair<string, string>{out_status, ""};
+  }
+
+  tensorflow::GraphDef outGraph;
+  tensorflow::Status conversion_status =
+      tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(graph_def,
+                                                                   &outGraph);
   if (!conversion_status.ok()) {
     auto retCode = (int)conversion_status.code();
     char buff[2000];
@@ -122,10 +174,13 @@ std::pair<string, string> trt_convert(
 }
 %}
 
+std::pair<string, string> calib_convert(string graph_def_string);
+
 std::pair<string, string> trt_convert(string graph_def_string,
                                       std::vector<string> output_names,
                                       size_t max_batch_size,
-                                      size_t max_workspace_size_bytes);
+                                      size_t max_workspace_size_bytes,
+                                      int precision_mode, int minimum_segment_size);
 
 
 %unignoreall
diff --git a/tensorflow/contrib/testing/BUILD b/tensorflow/contrib/testing/BUILD
index 0be6aa755bee50451f6717139fd8e1315789b389..8a40e111d7723b0d1c332b9d2381169c8bed510f 100644
--- a/tensorflow/contrib/testing/BUILD
+++ b/tensorflow/contrib/testing/BUILD
@@ -22,15 +22,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
index f2065c666255984c8ab770fc10f682b1eabad095..eac34afc4adb268e909076ec9c5fb379cc95537b 100644
--- a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
+++ b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.summary.writer import writer
 from tensorflow.python.summary.writer import writer_cache
 
@@ -51,6 +52,7 @@ class FakeSummaryWriter(object):
     self._added_graphs = []
     self._added_meta_graphs = []
     self._added_session_logs = []
+    self._added_run_metadata = {}
 
   @property
   def summaries(self):
@@ -85,7 +87,11 @@ class FakeSummaryWriter(object):
     if expected_added_graphs is not None:
       test_case.assertEqual(expected_added_graphs, self._added_graphs)
     if expected_added_meta_graphs is not None:
-      test_case.assertEqual(expected_added_meta_graphs, self._added_meta_graphs)
+      test_case.assertEqual(len(expected_added_meta_graphs),
+                            len(self._added_meta_graphs))
+      for expected, actual in zip(expected_added_meta_graphs,
+                                  self._added_meta_graphs):
+        test_util.assert_meta_graph_protos_equal(test_case, expected, actual)
     if expected_session_logs is not None:
       test_case.assertEqual(expected_session_logs, self._added_session_logs)
 
@@ -122,6 +128,11 @@ class FakeSummaryWriter(object):
     # pylint: disable=unused-argument
     self._added_session_logs.append(session_log)
 
+  def add_run_metadata(self, run_metadata, tag, global_step=None):
+    if (global_step is not None) and (global_step < 0):
+      raise ValueError('Invalid global_step %s.' % global_step)
+    self._added_run_metadata[tag] = run_metadata
+
   def flush(self):
     pass
 
diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD
index 698fdd830f57eb64c3c4119371f545908bf726e5..38d91f7e496d47ac74415da3bae91bad7f431dce 100644
--- a/tensorflow/contrib/text/BUILD
+++ b/tensorflow/contrib/text/BUILD
@@ -111,14 +111,3 @@ py_test(
         "//tensorflow/python:training",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/tfprof/BUILD b/tensorflow/contrib/tfprof/BUILD
index 28adce71d414d267bd53109751689c6f4d5d7b3b..e7f4ebdd36aa9d21ec1dc71ed200001eb0331704 100644
--- a/tensorflow/contrib/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/BUILD
@@ -20,15 +20,3 @@ py_library(
         "//tensorflow/python/profiler:tfprof_logger",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD
index 6ba069778ccf5bfba94921ac47db9233c63c0cfe..f2b8786a527289fe20de86447355fbf552cd265e 100644
--- a/tensorflow/contrib/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/BUILD
@@ -31,15 +31,3 @@ py_library(
         "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index bb86ecb2209f9bed3ad6c37f4b23bc7b361e1bd6..355303acf6ddf866ecf18815b394fcea8488d67d 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -8,14 +8,22 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+config_setting(
+    name = "empty_condition",
+    values = {"define": "UNUSED=unused"},
+)
+
 py_binary(
     name = "predict",
     srcs = ["predict.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -25,7 +33,10 @@ py_test(
     srcs = ["predict_test.py"],
     data = ["data/period_trend.csv"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67513579
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/67513579
+    ],
     deps = [
         ":predict",
         "//tensorflow/python:client_testlib",
@@ -38,9 +49,12 @@ py_binary(
     data = ["data/changepoints.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -61,9 +75,12 @@ py_binary(
     data = ["data/multivariate_level.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -86,11 +103,14 @@ py_binary(
     data = ["data/multivariate_periods.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
+        "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/timeseries/python/timeseries:estimators",
         "//tensorflow/contrib/timeseries/python/timeseries:model",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -106,15 +126,3 @@ py_test(
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
index b49a0662c29b1d810f4be31ca1f318f0571f533e..9b15b4f0b26f11ac3281ca4206654872984628b6 100644
--- a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
+++ b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
@@ -1,100 +1,100 @@
-0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867,1.,0.
-1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303,1.,0.
-2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864,1.,0.
-3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426,1.,0.
-4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223,1.,0.
-5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842,1.,0.
-6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606,1.,0.
-7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347,1.,0.
-8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951,1.,0.
-9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228,1.,0.
-10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897,1.,0.
-11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634,1.,0.
-12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594,1.,0.
-13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394,1.,0.
-14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609,1.,0.
-15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449,1.,0.
-16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251,1.,0.
-17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382,1.,0.
-18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767,1.,0.
-19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713,1.,0.
-20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251,1.,0.
-21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811,1.,0.
-22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681,1.,0.
-23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735,1.,0.
-24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436,1.,0.
-25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899,1.,0.
-26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814,1.,0.
-27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727,1.,0.
-28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582,1.,0.
-29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555,1.,0.
-30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696,1.,0.
-31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548,1.,0.
-32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627,1.,0.
-33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104,1.,0.
-34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156,1.,0.
-35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459,1.,0.
-36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576,1.,0.
-37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584,1.,0.
-38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577,1.,0.
-39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467,1.,0.
-40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566,1.,0.
-41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909,1.,0.
-42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021,1.,0.
-43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831,1.,0.
-44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905,1.,0.
-45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271,1.,0.
-46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094,1.,0.
-47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554,1.,0.
-48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769,1.,0.
-49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606,1.,0.
-50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629,1.,0.
-51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199,1.,0.
-52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961,1.,0.
-53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122,1.,0.
-54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454,1.,0.
-55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301,1.,0.
-56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182,1.,0.
-57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365,1.,0.
-58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011,1.,0.
-59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449,1.,0.
-60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229,1.,0.
-61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259,1.,0.
-62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272,1.,0.
-63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989,1.,0.
-64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496,1.,0.
-65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376,1.,0.
-66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206,1.,0.
-67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502,1.,0.
-68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219,1.,0.
-69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125,1.,0.
-70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514,1.,0.
-71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166,1.,0.
-72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832,1.,0.
-73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913,1.,0.
-74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188,1.,0.
-75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388,1.,0.
-76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136,1.,0.
-77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766,1.,0.
-78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959,1.,0.
-79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083,1.,0.
-80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483,1.,0.
-81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656,1.,0.
-82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107,1.,0.
-83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991,1.,0.
-84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527,1.,0.
-85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649,1.,0.
-86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788,1.,0.
-87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289,1.,0.
-88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298,1.,0.
-89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873,1.,0.
-90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669,1.,0.
-91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462,1.,0.
-92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232,1.,0.
-93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225,1.,0.
-94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288,1.,0.
-95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086,1.,0.
-96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161,1.,0.
-97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227,1.,0.
-98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937,1.,0.
-99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724,1.,0.
+0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867,1.,0.,strkeya
+1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303,1.,0.,strkeyb
+2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864,1.,0.,strkey
+3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426,1.,0.,strkey
+4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223,1.,0.,strkey
+5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842,1.,0.,strkey
+6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606,1.,0.,strkey
+7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347,1.,0.,strkey
+8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951,1.,0.,strkey
+9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228,1.,0.,strkey
+10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897,1.,0.,strkeyc
+11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634,1.,0.,strkey
+12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594,1.,0.,strkey
+13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394,1.,0.,strkey
+14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609,1.,0.,strkey
+15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449,1.,0.,strkey
+16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251,1.,0.,strkey
+17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382,1.,0.,strkey
+18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767,1.,0.,strkey
+19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713,1.,0.,strkey
+20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251,1.,0.,strkey
+21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811,1.,0.,strkey
+22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681,1.,0.,strkey
+23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735,1.,0.,strkey
+24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436,1.,0.,strkey
+25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899,1.,0.,strkey
+26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814,1.,0.,strkey
+27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727,1.,0.,strkey
+28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582,1.,0.,strkey
+29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555,1.,0.,strkey
+30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696,1.,0.,strkey
+31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548,1.,0.,strkey
+32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627,1.,0.,strkey
+33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104,1.,0.,strkey
+34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156,1.,0.,strkey
+35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459,1.,0.,strkey
+36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576,1.,0.,strkey
+37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584,1.,0.,strkeyd
+38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577,1.,0.,strkey
+39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467,1.,0.,strkey
+40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566,1.,0.,strkey
+41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909,1.,0.,strkey
+42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021,1.,0.,strkey
+43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831,1.,0.,strkey
+44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905,1.,0.,strkey
+45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271,1.,0.,strkey
+46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094,1.,0.,strkey
+47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554,1.,0.,strkey
+48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769,1.,0.,strkey
+49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606,1.,0.,strkey
+50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629,1.,0.,strkey
+51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199,1.,0.,strkey
+52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961,1.,0.,strkey
+53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122,1.,0.,strkey
+54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454,1.,0.,strkey
+55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301,1.,0.,strkey
+56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182,1.,0.,strkey
+57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365,1.,0.,strkey
+58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011,1.,0.,strkey
+59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449,1.,0.,strkey
+60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229,1.,0.,strkey
+61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259,1.,0.,strkey
+62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272,1.,0.,strkey
+63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989,1.,0.,strkey
+64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496,1.,0.,strkey
+65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376,1.,0.,strkey
+66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206,1.,0.,strkey
+67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502,1.,0.,strkey
+68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219,1.,0.,strkey
+69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125,1.,0.,strkey
+70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514,1.,0.,strkey
+71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166,1.,0.,strkey
+72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832,1.,0.,strkey
+73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913,1.,0.,strkey
+74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188,1.,0.,strkey
+75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388,1.,0.,strkey
+76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136,1.,0.,strkey
+77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766,1.,0.,strkey
+78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959,1.,0.,strkey
+79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083,1.,0.,strkey
+80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483,1.,0.,strkey
+81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656,1.,0.,strkey
+82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107,1.,0.,strkey
+83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991,1.,0.,strkey
+84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527,1.,0.,strkey
+85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649,1.,0.,strkey
+86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788,1.,0.,strkey
+87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289,1.,0.,strkey
+88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298,1.,0.,strkey
+89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873,1.,0.,strkey
+90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669,1.,0.,strkey
+91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462,1.,0.,strkey
+92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232,1.,0.,strkey
+93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225,1.,0.,strkey
+94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288,1.,0.,strkey
+95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086,1.,0.,strkey
+96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161,1.,0.,strkey
+97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227,1.,0.,strkey
+98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937,1.,0.,strkey
+99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724,1.,0.,strkey
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py
index 7659dd308a7ee1b70d6688b85e4f6157ddee0540..71621abc7190fae9973f78522e23f03d43e342c6 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py
@@ -41,32 +41,60 @@ _MODULE_PATH = path.dirname(__file__)
 _DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv")
 
 
-def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
-  """Training, evaluating, and predicting on a series with changepoints."""
+def state_space_esitmator(exogenous_feature_columns):
+  """Constructs a StructuralEnsembleRegressor."""
+
+  def _exogenous_update_condition(times, features):
+    del times  # unused
+    # Make exogenous updates sparse by setting an update condition. This in
+    # effect allows missing exogenous features: if the condition evaluates to
+    # False, no update is performed. Otherwise we sometimes end up with "leaky"
+    # updates which add unnecessary uncertainty to the model even when there is
+    # no changepoint.
+    return tf.equal(tf.squeeze(features["is_changepoint"], axis=-1), "yes")
+
+  return (
+      tf.contrib.timeseries.StructuralEnsembleRegressor(
+          periodicities=12,
+          # Extract a smooth period by constraining the number of latent values
+          # being cycled between.
+          cycle_num_latent_values=3,
+          num_features=1,
+          exogenous_feature_columns=exogenous_feature_columns,
+          exogenous_update_condition=_exogenous_update_condition),
+      # Use truncated backpropagation with a window size of 64, batching
+      # together 4 of these windows (random offsets) per training step. Training
+      # with exogenous features often requires somewhat larger windows.
+      4, 64)
+
+
+def autoregressive_esitmator(exogenous_feature_columns):
+  input_window_size = 8
+  output_window_size = 2
+  return (
+      tf.contrib.timeseries.ARRegressor(
+          periodicities=12,
+          num_features=1,
+          input_window_size=input_window_size,
+          output_window_size=output_window_size,
+          exogenous_feature_columns=exogenous_feature_columns),
+      64, input_window_size + output_window_size)
 
+
+def train_and_evaluate_exogenous(
+    estimator_fn, csv_file_name=_DATA_FILE, train_steps=300):
+  """Training, evaluating, and predicting on a series with changepoints."""
   # Indicate the format of our exogenous feature, in this case a string
   # representing a boolean value.
-  string_feature = tf.contrib.layers.sparse_column_with_keys(
-      column_name="is_changepoint", keys=["no", "yes"])
+  string_feature = tf.feature_column.categorical_column_with_vocabulary_list(
+      key="is_changepoint", vocabulary_list=["no", "yes"])
   # Specify the way this feature is presented to the model, here using a one-hot
   # encoding.
-  one_hot_feature = tf.contrib.layers.one_hot_column(
-      sparse_id_column=string_feature)
-
-  estimator = tf.contrib.timeseries.StructuralEnsembleRegressor(
-      periodicities=12,
-      # Extract a smooth period by constraining the number of latent values
-      # being cycled between.
-      cycle_num_latent_values=3,
-      num_features=1,
-      exogenous_feature_columns=[one_hot_feature],
-      # Make exogenous updates sparse by setting an update condition. This in
-      # effect allows missing exogenous features: if the condition evaluates to
-      # False, no update is performed. Otherwise we sometimes end up with
-      # "leaky" updates which add unnecessary uncertainty to the model even when
-      # there is no changepoint.
-      exogenous_update_condition=
-      lambda times, features: tf.equal(features["is_changepoint"], "yes"))
+  one_hot_feature = tf.feature_column.indicator_column(
+      categorical_column=string_feature)
+
+  estimator, batch_size, window_size = estimator_fn(
+      exogenous_feature_columns=[one_hot_feature])
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       # Indicate the format of our CSV file. First we have two standard columns,
@@ -82,10 +110,7 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
       # This CSV has a header line; here we just ignore it.
       skip_header_lines=1)
   train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
-      # Use truncated backpropagation with a window size of 64, batching
-      # together 4 of these windows (random offsets) per training step. Training
-      # with exogenous features often requires somewhat larger windows.
-      reader, batch_size=4, window_size=64)
+      reader, batch_size=batch_size, window_size=window_size)
   estimator.train(input_fn=train_input_fn, steps=train_steps)
   evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
   evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
@@ -142,7 +167,12 @@ def main(unused_argv):
   if not HAS_MATPLOTLIB:
     raise ImportError(
         "Please install matplotlib to generate a plot from this example.")
-  make_plot("Ignoring a known anomaly", *train_and_evaluate_exogenous())
+  make_plot("Ignoring a known anomaly (state space)",
+            *train_and_evaluate_exogenous(
+                estimator_fn=state_space_esitmator))
+  make_plot("Ignoring a known anomaly (autoregressive)",
+            *train_and_evaluate_exogenous(
+                estimator_fn=autoregressive_esitmator, train_steps=3000))
   pyplot.show()
 
 
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
index c3e307cad815d3c9c8556d0349d366d6f938101a..8c64f2e186a1aab0235f7cfbf1a942b872edd93b 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
@@ -23,12 +23,24 @@ from tensorflow.contrib.timeseries.examples import known_anomaly
 from tensorflow.python.platform import test
 
 
-class KnownAnaomalyExampleTest(test.TestCase):
+class KnownAnomalyExampleTest(test.TestCase):
 
-  def test_shapes_and_variance_structural(self):
+  def test_shapes_and_variance_structural_ar(self):
     (times, observed, all_times, mean, upper_limit, lower_limit,
      anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
-         train_steps=50)
+         train_steps=1, estimator_fn=known_anomaly.autoregressive_esitmator)
+    self.assertAllEqual(
+        anomaly_locations,
+        [25, 50, 75, 100, 125, 150, 175, 249])
+    self.assertAllEqual(all_times.shape, mean.shape)
+    self.assertAllEqual(all_times.shape, upper_limit.shape)
+    self.assertAllEqual(all_times.shape, lower_limit.shape)
+    self.assertAllEqual(times.shape, observed.shape)
+
+  def test_shapes_and_variance_structural_ssm(self):
+    (times, observed, all_times, mean, upper_limit, lower_limit,
+     anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
+         train_steps=50, estimator_fn=known_anomaly.state_space_esitmator)
     self.assertAllEqual(
         anomaly_locations,
         [25, 50, 75, 100, 125, 150, 175, 249])
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index f37cafcc502dc9415db0829b9b067b862f87dca7..b1c7475442c58b9a190c818b752760a4fb4fe6f0 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -59,10 +59,10 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       num_units: The number of units in the model's LSTMCell.
       num_features: The dimensionality of the time series (features per
         timestep).
-      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
-          objects representing features which are inputs to the model but are
-          not predicted by it. These must then be present for training,
-          evaluation, and prediction.
+      exogenous_feature_columns: A list of `tf.feature_column`s representing
+          features which are inputs to the model but are not predicted by
+          it. These must then be present for training, evaluation, and
+          prediction.
       dtype: The floating point data type to use.
     """
     super(_LSTMModel, self).__init__(
@@ -189,12 +189,16 @@ def train_and_predict(
     export_directory=None):
   """Train and predict using a custom time series model."""
   # Construct an Estimator from our LSTM model.
+  categorical_column = tf.feature_column.categorical_column_with_hash_bucket(
+      key="categorical_exogenous_feature", hash_bucket_size=16)
   exogenous_feature_columns = [
       # Exogenous features are not part of the loss, but can inform
       # predictions. In this example the features have no extra information, but
       # are included as an API example.
-      tf.contrib.layers.real_valued_column(
-          "2d_exogenous_feature", dimension=2)]
+      tf.feature_column.numeric_column(
+          "2d_exogenous_feature", shape=(2,)),
+      tf.feature_column.embedding_column(
+          categorical_column=categorical_column, dimension=10)]
   estimator = ts_estimators.TimeSeriesRegressor(
       model=_LSTMModel(num_features=5, num_units=128,
                        exogenous_feature_columns=exogenous_feature_columns),
@@ -205,7 +209,11 @@ def train_and_predict(
       csv_file_name,
       column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
                     + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5
-                    + ("2d_exogenous_feature",) * 2))
+                    + ("2d_exogenous_feature",) * 2
+                    + ("categorical_exogenous_feature",)),
+      # Data types other than for `times` need to be specified if they aren't
+      # float32. In this case one of our exogenous features has string dtype.
+      column_dtypes=((tf.int64,) + (tf.float32,) * 7 + (tf.string,)))
   train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
       reader, batch_size=4, window_size=32)
   estimator.train(input_fn=train_input_fn, steps=training_steps)
@@ -215,7 +223,9 @@ def train_and_predict(
   predict_exogenous_features = {
       "2d_exogenous_feature": numpy.concatenate(
           [numpy.ones([1, 100, 1]), numpy.zeros([1, 100, 1])],
-          axis=-1)}
+          axis=-1),
+      "categorical_exogenous_feature": numpy.array(
+          ["strkey"] * 100)[None, :, None]}
   (predictions,) = tuple(estimator.predict(
       input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
           evaluation, steps=100,
@@ -226,20 +236,36 @@ def train_and_predict(
       [evaluation["mean"][0], predictions["mean"]], axis=0))
   all_times = numpy.concatenate([times, predictions["times"]], axis=0)
 
-  # Export the model in SavedModel format.
+  # Export the model in SavedModel format. We include a bit of extra boilerplate
+  # for "cold starting" as if we didn't have any state from the Estimator, which
+  # is the case when serving from a SavedModel. If Estimator output is
+  # available, the result of "Estimator.evaluate" can be passed directly to
+  # `tf.contrib.timeseries.saved_model_utils.predict_continuation` as the
+  # `continue_from` argument.
+  with tf.Graph().as_default():
+    filter_feature_tensors, _ = evaluation_input_fn()
+    with tf.train.MonitoredSession() as session:
+      # Fetch the series to "warm up" our state, which will allow us to make
+      # predictions for its future values. This is just a dictionary of times,
+      # values, and exogenous features mapping to numpy arrays. The use of an
+      # input_fn is just a convenience for the example; they can also be
+      # specified manually.
+      filter_features = session.run(filter_feature_tensors)
   if export_directory is None:
     export_directory = tempfile.mkdtemp()
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_savedmodel(
       export_directory, input_receiver_fn)
-  # Predict using the SavedModel
+  # Warm up and predict using the SavedModel
   with tf.Graph().as_default():
     with tf.Session() as session:
       signatures = tf.saved_model.loader.load(
           session, [tf.saved_model.tag_constants.SERVING], export_location)
+      state = tf.contrib.timeseries.saved_model_utils.cold_start_filter(
+          signatures=signatures, session=session, features=filter_features)
       saved_model_output = (
           tf.contrib.timeseries.saved_model_utils.predict_continuation(
-              continue_from=evaluation, signatures=signatures,
+              continue_from=state, signatures=signatures,
               session=session, steps=100,
               exogenous_features=predict_exogenous_features))
       # The exported model gives the same results as the Estimator.predict()
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index ca56e38ca079f71b38cf29605a295a50929945e8..c58e24e6d9748868791d21b0ff4ec28ca2f646c3 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -36,17 +36,14 @@ class LSTMExampleTest(test.TestCase):
   def test_periodicity_learned(self):
     (observed_times, observed_values,
      all_times, predicted_values) = lstm.train_and_predict(
-         training_steps=100, estimator_config=_SeedRunConfig(),
+         training_steps=2, estimator_config=_SeedRunConfig(),
          export_directory=self.get_temp_dir())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
     self.assertAllEqual([200, 5], predicted_values.shape)
-    self.assertGreater(
-        predicted_values[100, 4]
-        - predicted_values[115, 4],  # Amplitude of fifth component
-        0.2)
-
+    # TODO(allenl): Make the model deterministic so you can check something
+    # substantive.
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index fff972c1f3277ad5d83673a202a50d1e6f7df210..d2746032a04946cdfab4b5ac968ea3add5f6b51d 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -88,10 +88,14 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:export",
+        "//tensorflow/python/feature_column",
     ],
 )
 
@@ -132,7 +136,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -140,11 +143,14 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:export",
         "//tensorflow/python/estimator:head",
+        "//tensorflow/python/estimator:metric_keys",
     ],
 )
 
@@ -154,23 +160,30 @@ py_test(
         "head_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip_gpu",  # b/63391119
-    ],
+    tags = ["no_pip_gpu"],  # b/63391119
     deps = [
+        ":estimators",
         ":feature_keys",
         ":head",
+        ":input_pipeline",
         ":model",
         ":state_management",
+        "//tensorflow/contrib/timeseries/examples:lstm",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -233,6 +246,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_pip",  # b/64527635
         "no_pip_gpu",  # b/63391119
     ],
@@ -425,6 +439,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip_gpu",  # b/63391119
+        "no_windows",  # TODO: needs investigation on Windows
     ],
     deps = [
         ":feature_keys",
@@ -440,15 +455,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index ff140efd48104e386826eab7abbc94bec220f9df..558d9480b495ca87e828e8b440370ec9c6e3be2f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -60,7 +60,8 @@ class ARModel(model.TimeSeriesModel):
                num_features,
                num_time_buckets=10,
                loss=NORMAL_LIKELIHOOD_LOSS,
-               hidden_layer_sizes=None):
+               hidden_layer_sizes=None,
+               exogenous_feature_columns=None):
     """Constructs an auto-regressive model.
 
     Args:
@@ -70,7 +71,7 @@ class ARModel(model.TimeSeriesModel):
       input_window_size: Number of past time steps of data to look at when doing
         the regression.
       output_window_size: Number of future time steps to predict. Note that
-        setting it to > 1 empiricaly seems to give a better fit.
+        setting it to > 1 empirically seems to give a better fit.
       num_features: number of input features per time step.
       num_time_buckets: Number of buckets into which to divide (time %
         periodicity) for generating time based features.
@@ -81,6 +82,11 @@ class ARModel(model.TimeSeriesModel):
         observations and predictions, while the training loss is computed on
         normalized data (if input statistics are available).
       hidden_layer_sizes: list of sizes of hidden layers.
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+          `tf.feature_column.embedding_column`) corresponding to exogenous
+          features which provide extra information to the model but are not part
+          of the series to be predicted. Passed to
+          `tf.feature_column.input_layer`.
     """
     self.input_window_size = input_window_size
     self.output_window_size = output_window_size
@@ -90,7 +96,12 @@ class ARModel(model.TimeSeriesModel):
     self.window_size = self.input_window_size + self.output_window_size
     self.loss = loss
     super(ARModel, self).__init__(
-        num_features=num_features)
+        num_features=num_features,
+        exogenous_feature_columns=exogenous_feature_columns)
+    if exogenous_feature_columns is not None:
+      self.exogenous_size = self._get_exogenous_embedding_shape()[-1]
+    else:
+      self.exogenous_size = 0
     assert num_time_buckets > 0
     self._buckets = int(num_time_buckets)
     if periodicities is None or not periodicities:
@@ -110,7 +121,10 @@ class ARModel(model.TimeSeriesModel):
     # that the serving input_receiver_fn gets placeholder shapes correct.
     return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64),
             array_ops.zeros(
-                [self.input_window_size, self.num_features], dtype=self.dtype))
+                [self.input_window_size, self.num_features], dtype=self.dtype),
+            array_ops.zeros(
+                [self.input_window_size, self.exogenous_size],
+                dtype=self.dtype))
 
   # TODO(allenl,agarwal): Support sampling for AR.
   def random_model_parameters(self, seed=None):
@@ -163,7 +177,7 @@ class ARModel(model.TimeSeriesModel):
       activations.append((activation, activation_size))
     return activations
 
-  def prediction_ops(self, times, values):
+  def prediction_ops(self, times, values, exogenous_regressors):
     """Compute model predictions given input data.
 
     Args:
@@ -173,6 +187,8 @@ class ARModel(model.TimeSeriesModel):
           prediction times.
       values: A [batch size, self.input_window_size, self.num_features] Tensor
           with input features.
+      exogenous_regressors: A [batch size, self.window_size,
+          self.exogenous_size] Tensor with exogenous features.
     Returns:
       Tuple (predicted_mean, predicted_covariance), where each element is a
       Tensor with shape [batch size, self.output_window_size,
@@ -183,25 +199,33 @@ class ARModel(model.TimeSeriesModel):
     if self.input_window_size:
       values.get_shape().assert_is_compatible_with(
           [None, self.input_window_size, self.num_features])
+    if exogenous_regressors is not None:
+      exogenous_regressors.get_shape().assert_is_compatible_with(
+          [None, self.window_size, self.exogenous_size])
     # Create input features.
+    activation_components = []
     if self._periods:
       _, time_features = self._compute_time_features(times)
       activation_size = self.window_size * self._buckets * len(self._periods)
-      activation = array_ops.reshape(time_features, [-1, activation_size])
+      activation_components.append(
+          array_ops.reshape(time_features, [-1, activation_size]))
     else:
       activation_size = 0
-      activation = None
-
     if self.input_window_size:
       inp = array_ops.slice(values, [0, 0, 0], [-1, self.input_window_size, -1])
       inp_size = self.input_window_size * self.num_features
       inp = array_ops.reshape(inp, [-1, inp_size])
-      if activation is not None:
-        activation = array_ops.concat([inp, activation], 1)
-      else:
-        activation = inp
+      activation_components.append(inp)
       activation_size += inp_size
+    if self.exogenous_size:
+      exogenous_size = self.window_size * self.exogenous_size
+      activation_size += exogenous_size
+      exogenous_flattened = array_ops.reshape(
+          exogenous_regressors, [-1, exogenous_size])
+      activation_components.append(exogenous_flattened)
     assert activation_size
+    assert activation_components
+    activation = array_ops.concat(activation_components, axis=1)
     activations.append((activation, activation_size))
     # Create hidden layers.
     activations += self._create_hidden_stack(activation, activation_size)
@@ -228,6 +252,19 @@ class ARModel(model.TimeSeriesModel):
         math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
     return loss_op
 
+  def _process_exogenous_features(self, times, features):
+    embedded = super(ARModel, self)._process_exogenous_features(
+        times=times, features=features)
+    if embedded is None:
+      assert self.exogenous_size == 0
+      # No embeddings. Return a zero-size [batch, times, 0] array so we don't
+      # have to special case it downstream.
+      return array_ops.zeros(
+          array_ops.concat([array_ops.shape(times), constant_op.constant([0])],
+                           axis=0))
+    else:
+      return embedded
+
   # TODO(allenl, agarwal): Consider better ways of warm-starting predictions.
   def predict(self, features):
     """Computes predictions multiple steps into the future.
@@ -243,6 +280,7 @@ class ARModel(model.TimeSeriesModel):
           segment of the time series before `TIMES`. This data is used
           to start of the autoregressive computation. This should have data for
           at least self.input_window_size timesteps.
+        And any exogenous features, with shapes prefixed by shape of `TIMES`.
     Returns:
       A dictionary with keys, "mean", "covariance". The
       values are Tensors of shape [batch_size, predict window size,
@@ -250,25 +288,39 @@ class ARModel(model.TimeSeriesModel):
     """
     predict_times = math_ops.cast(
         ops.convert_to_tensor(features[PredictionFeatures.TIMES]), dtypes.int32)
+    exogenous_regressors = self._process_exogenous_features(
+        times=predict_times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES,
+                                 PredictionFeatures.STATE_TUPLE]})
+    with ops.control_dependencies(
+        [check_ops.assert_equal(array_ops.shape(predict_times)[1],
+                                array_ops.shape(exogenous_regressors)[1])]):
+      exogenous_regressors = array_ops.identity(exogenous_regressors)
     batch_size = array_ops.shape(predict_times)[0]
     num_predict_values = array_ops.shape(predict_times)[1]
     prediction_iterations = ((num_predict_values + self.output_window_size - 1)
                              // self.output_window_size)
-    # Pad predict_times so as to have exact multiple of self.output_window_size
-    # values per example.
+    # Pad predict_times and exogenous regressors so as to have exact multiple of
+    # self.output_window_size values per example.
     padding_size = (prediction_iterations * self.output_window_size -
                     num_predict_values)
-    padding = array_ops.zeros([batch_size, padding_size], predict_times.dtype)
-    predict_times = control_flow_ops.cond(
-        padding_size > 0, lambda: array_ops.concat([predict_times, padding], 1),
-        lambda: predict_times)
+    predict_times = array_ops.pad(
+        predict_times, [[0, 0], [0, padding_size]])
+    exogenous_regressors = array_ops.pad(
+        exogenous_regressors, [[0, 0], [0, padding_size], [0, 0]])
     state = features[PredictionFeatures.STATE_TUPLE]
-    (state_times, state_values) = state
+    (state_times, state_values, state_exogenous_regressors) = state
     state_times = math_ops.cast(
         ops.convert_to_tensor(state_times), dtypes.int32)
     state_values = ops.convert_to_tensor(state_values, dtype=self.dtype)
+    state_exogenous_regressors = ops.convert_to_tensor(
+        state_exogenous_regressors, dtype=self.dtype)
 
     initial_input_times = predict_times[:, :self.output_window_size]
+    initial_input_exogenous_regressors = (
+        exogenous_regressors[:, :self.output_window_size, :])
     if self.input_window_size > 0:
       initial_input_times = array_ops.concat(
           [state_times[:, -self.input_window_size:], initial_input_times], 1)
@@ -279,6 +331,11 @@ class ARModel(model.TimeSeriesModel):
           check_ops.assert_equal(values_size, times_size)
       ]):
         initial_input_values = state_values[:, -self.input_window_size:, :]
+        initial_input_exogenous_regressors = array_ops.concat(
+            [state_exogenous_regressors[:, -self.input_window_size:, :],
+             initial_input_exogenous_regressors[
+                 :, :self.output_window_size, :]],
+            axis=1)
     else:
       initial_input_values = 0
 
@@ -288,9 +345,10 @@ class ARModel(model.TimeSeriesModel):
       return math_ops.less(iteration_number, prediction_iterations)
 
     def _while_body(iteration_number, input_times, input_values,
-                    mean_ta, covariance_ta):
+                    input_exogenous_regressors, mean_ta, covariance_ta):
       """Predict self.output_window_size values."""
-      prediction_ops = self.prediction_ops(input_times, input_values)
+      prediction_ops = self.prediction_ops(
+          input_times, input_values, input_exogenous_regressors)
       predicted_mean = prediction_ops["mean"]
       predicted_covariance = prediction_ops["covariance"]
       offset = self.output_window_size * gen_math_ops.minimum(
@@ -299,20 +357,33 @@ class ARModel(model.TimeSeriesModel):
         if self.output_window_size < self.input_window_size:
           new_input_values = array_ops.concat(
               [input_values[:, self.output_window_size:, :], predicted_mean], 1)
+          new_input_exogenous_regressors = array_ops.concat(
+              [input_exogenous_regressors[:, -self.input_window_size:, :],
+               exogenous_regressors[
+                   :, offset:offset + self.output_window_size, :]],
+              axis=1)
           new_input_times = array_ops.concat([
-              input_times[:, self.output_window_size:],
+              input_times[:, -self.input_window_size:],
               predict_times[:, offset:offset + self.output_window_size]
           ], 1)
         else:
           new_input_values = predicted_mean[:, -self.input_window_size:, :]
+          new_input_exogenous_regressors = exogenous_regressors[
+              :,
+              offset - self.input_window_size:offset + self.output_window_size,
+              :]
           new_input_times = predict_times[
               :,
               offset - self.input_window_size:offset + self.output_window_size]
       else:
         new_input_values = input_values
+        new_input_exogenous_regressors = exogenous_regressors[
+            :, offset:offset + self.output_window_size, :]
         new_input_times = predict_times[:,
                                         offset:offset + self.output_window_size]
       new_input_times.set_shape(initial_input_times.get_shape())
+      new_input_exogenous_regressors.set_shape(
+          initial_input_exogenous_regressors.get_shape())
       new_mean_ta = mean_ta.write(iteration_number, predicted_mean)
       if isinstance(covariance_ta, tensor_array_ops.TensorArray):
         new_covariance_ta = covariance_ta.write(iteration_number,
@@ -322,6 +393,7 @@ class ARModel(model.TimeSeriesModel):
       return (iteration_number + 1,
               new_input_times,
               new_input_values,
+              new_input_exogenous_regressors,
               new_mean_ta,
               new_covariance_ta)
 
@@ -332,9 +404,13 @@ class ARModel(model.TimeSeriesModel):
                           if self.loss != ARModel.SQUARED_LOSS else 0.)
     mean_ta_init = tensor_array_ops.TensorArray(
         dtype=self.dtype, size=prediction_iterations)
-    _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
+    _, _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
         _while_condition, _while_body, [
-            0, initial_input_times, initial_input_values, mean_ta_init,
+            0,
+            initial_input_times,
+            initial_input_values,
+            initial_input_exogenous_regressors,
+            mean_ta_init,
             covariance_ta_init
         ])
 
@@ -366,11 +442,11 @@ class ARModel(model.TimeSeriesModel):
     return {"mean": predicted_mean,
             "covariance": predicted_covariance}
 
-  def _process_window(self, features, mode):
+  def _process_window(self, features, mode, exogenous_regressors):
     """Compute model outputs on a single window of data."""
-    # TODO(agarwal): Use exogenous features
     times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtypes.int64)
     values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype)
+    exogenous_regressors = math_ops.cast(exogenous_regressors, dtype=self.dtype)
     original_values = values
 
     # Extra shape checking for the window size (above that in
@@ -395,7 +471,8 @@ class ARModel(model.TimeSeriesModel):
       input_values = values[:, :self.input_window_size, :]
     else:
       input_values = None
-    prediction_ops = self.prediction_ops(times, input_values)
+    prediction_ops = self.prediction_ops(
+        times, input_values, exogenous_regressors)
     prediction = prediction_ops["mean"]
     covariance = prediction_ops["covariance"]
     targets = array_ops.slice(values, [0, self.input_window_size, 0],
@@ -419,7 +496,8 @@ class ARModel(model.TimeSeriesModel):
     return model.ModelOutputs(
         loss=loss,
         end_state=(times[:, -self.input_window_size:],
-                   values[:, -self.input_window_size:, :]),
+                   values[:, -self.input_window_size:, :],
+                   exogenous_regressors[:, -self.input_window_size:, :]),
         predictions={"mean": prediction, "covariance": covariance,
                      "observed": original_values[:, -self.output_window_size:]},
         prediction_times=times[:, -self.output_window_size:])
@@ -454,17 +532,24 @@ class ARModel(model.TimeSeriesModel):
     """
     features = {feature_name: ops.convert_to_tensor(feature_value)
                 for feature_name, feature_value in features.items()}
+    times = features[TrainEvalFeatures.TIMES]
+    exogenous_regressors = self._process_exogenous_features(
+        times=times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES,
+                                 PredictionFeatures.STATE_TUPLE]})
     if mode == estimator_lib.ModeKeys.TRAIN:
       # For training, we require the window size to be self.window_size as
       # iterating sequentially on larger windows could introduce a bias.
-      return self._process_window(features, mode=mode)
+      return self._process_window(
+          features, mode=mode, exogenous_regressors=exogenous_regressors)
     elif mode == estimator_lib.ModeKeys.EVAL:
       # For evaluation, we allow the user to pass in a larger window, in which
       # case we try to cover as much of the window as possible without
       # overlap. Quantitative evaluation is more efficient/correct with fixed
       # windows matching self.window_size (as with training), but this looping
       # allows easy plotting of "in-sample" predictions.
-      times = features[TrainEvalFeatures.TIMES]
       times.get_shape().assert_has_rank(2)
       static_window_size = times.get_shape()[1].value
       if (static_window_size is not None
@@ -500,7 +585,9 @@ class ARModel(model.TimeSeriesModel):
                 feature_name:
                 feature_value[:, base_offset:base_offset + self.window_size]
                 for feature_name, feature_value in features.items()},
-            mode=mode)
+            mode=mode,
+            exogenous_regressors=exogenous_regressors[
+                :, base_offset:base_offset + self.window_size])
         # This code needs to be updated if new predictions are added in
         # self._process_window
         assert len(model_outputs.predictions) == 3
@@ -525,7 +612,9 @@ class ARModel(model.TimeSeriesModel):
       batch_size = array_ops.shape(times)[0]
       prediction_shape = [batch_size, self.output_window_size * num_iterations,
                           self.num_features]
-      previous_state_times, previous_state_values = state
+      (previous_state_times,
+       previous_state_values,
+       previous_state_exogenous_regressors) = state
       # Make sure returned state always has windows of self.input_window_size,
       # even if we were passed fewer than self.input_window_size points this
       # time.
@@ -540,14 +629,24 @@ class ARModel(model.TimeSeriesModel):
              self._scale_data(values)], axis=1)[:, -self.input_window_size:, :]
         new_state_values.set_shape((None, self.input_window_size,
                                     self.num_features))
+        new_exogenous_regressors = array_ops.concat(
+            [previous_state_exogenous_regressors,
+             exogenous_regressors], axis=1)[:, -self.input_window_size:, :]
+        new_exogenous_regressors.set_shape(
+            (None,
+             self.input_window_size,
+             self.exogenous_size))
       else:
         # There is no state to keep, and the strided slices above do not handle
         # input_window_size=0.
         new_state_times = previous_state_times
         new_state_values = previous_state_values
+        new_exogenous_regressors = previous_state_exogenous_regressors
       return model.ModelOutputs(
           loss=math_ops.reduce_mean(loss_ta.stack(), axis=0),
-          end_state=(new_state_times, new_state_values),
+          end_state=(new_state_times,
+                     new_state_values,
+                     new_exogenous_regressors),
           predictions={
               "mean": array_ops.reshape(
                   array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]),
@@ -604,7 +703,8 @@ class AnomalyMixtureARModel(ARModel):
                num_features,
                anomaly_distribution=GAUSSIAN_ANOMALY,
                num_time_buckets=10,
-               hidden_layer_sizes=None):
+               hidden_layer_sizes=None,
+               exogenous_feature_columns=None):
     assert (anomaly_prior_probability < 1.0 and
             anomaly_prior_probability > 0.0)
     self._anomaly_prior_probability = anomaly_prior_probability
@@ -619,7 +719,8 @@ class AnomalyMixtureARModel(ARModel):
         input_window_size=input_window_size,
         output_window_size=output_window_size,
         loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
-        hidden_layer_sizes=hidden_layer_sizes)
+        hidden_layer_sizes=hidden_layer_sizes,
+        exogenous_feature_columns=exogenous_feature_columns)
 
   def _create_anomaly_ops(self, times, values, prediction_ops_dict):
     anomaly_log_param = variable_scope.get_variable(
@@ -631,9 +732,9 @@ class AnomalyMixtureARModel(ARModel):
     # distribution.
     prediction_ops_dict["anomaly_params"] = gen_math_ops.exp(anomaly_log_param)
 
-  def prediction_ops(self, times, values):
+  def prediction_ops(self, times, values, exogenous_regressors):
     prediction_ops_dict = super(AnomalyMixtureARModel, self).prediction_ops(
-        times, values)
+        times, values, exogenous_regressors)
     self._create_anomaly_ops(times, values, prediction_ops_dict)
     return prediction_ops_dict
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
index 1e1ca4e77fc41bb418cf2521c2c7fbed9f27c6a8..d078ac8d46397d00efefc32e19aa9bd1133aebaa 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
@@ -155,12 +155,15 @@ class ARModelTest(test.TestCase):
     state_times = np.expand_dims(train_data_times[:input_window_size], 0)
     state_values = np.expand_dims(
         train_data_values[:input_window_size, :], 0)
+    state_exogenous = state_times[:, :, None][:, :, :0]
 
     def prediction_input_fn():
       return ({
           PredictionFeatures.TIMES: training.limit_epochs(
               predict_times, num_epochs=1),
-          PredictionFeatures.STATE_TUPLE: (state_times, state_values)
+          PredictionFeatures.STATE_TUPLE: (state_times,
+                                           state_values,
+                                           state_exogenous)
       }, {})
     (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn))
     predicted_mean = predictions["mean"][:, 0]
@@ -246,7 +249,8 @@ class ARModelTest(test.TestCase):
       with session.Session():
         predicted_values = model.predict({
             PredictionFeatures.TIMES: [[4, 6, 10]],
-            PredictionFeatures.STATE_TUPLE: ([[1, 2]], [[[1.], [2.]]])
+            PredictionFeatures.STATE_TUPLE: (
+                [[1, 2]], [[[1.], [2.]]], [[[], []]])
         })
         variables.global_variables_initializer().run()
         self.assertAllEqual(predicted_values["mean"].eval().shape,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index f8355f366fe8e191ab570fd271bbe4a8bf71c73d..f4608ca2d1cc286575f10f6922335f43c7ec0231 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.layers.python.layers import feature_column
-
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
@@ -31,18 +29,22 @@ from tensorflow.contrib.timeseries.python.timeseries.state_space_models.filterin
 
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.export import export_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import parsing_ops
 from tensorflow.python.training import training as train
+from tensorflow.python.util import nest
 
 
 class TimeSeriesRegressor(estimator_lib.Estimator):
   """An Estimator to fit and evaluate a time series model."""
 
   def __init__(self, model, state_manager=None, optimizer=None, model_dir=None,
-               config=None):
+               config=None, head_type=ts_head_lib.TimeSeriesRegressionHead):
     """Initialize the Estimator.
 
     Args:
@@ -53,6 +55,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
           from tf.train.Optimizer. Defaults to Adam with step size 0.02.
       model_dir: See `Estimator`.
       config: See `Estimator`.
+      head_type: The kind of head to use for the model (inheriting from
+          `TimeSeriesRegressionHead`).
     """
     input_statistics_generator = math_utils.InputStatisticsFromMiniBatch(
         dtype=model.dtype, num_features=model.num_features)
@@ -61,8 +65,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
     if optimizer is None:
       optimizer = train.AdamOptimizer(0.02)
     self._model = model
-    ts_regression_head = ts_head_lib.time_series_regression_head(
-        model, state_manager, optimizer,
+    ts_regression_head = head_type(
+        model=model, state_manager=state_manager, optimizer=optimizer,
         input_statistics_generator=input_statistics_generator)
     model_fn = ts_regression_head.create_estimator_spec
     super(TimeSeriesRegressor, self).__init__(
@@ -98,11 +102,11 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
     def _serving_input_receiver_fn():
       """A receiver function to be passed to export_savedmodel."""
       placeholders = {}
-      placeholders[feature_keys.TrainEvalFeatures.TIMES] = (
-          array_ops.placeholder(
-              name=feature_keys.TrainEvalFeatures.TIMES,
-              dtype=dtypes.int64,
-              shape=[default_batch_size, default_series_length]))
+      time_placeholder = array_ops.placeholder(
+          name=feature_keys.TrainEvalFeatures.TIMES,
+          dtype=dtypes.int64,
+          shape=[default_batch_size, default_series_length])
+      placeholders[feature_keys.TrainEvalFeatures.TIMES] = time_placeholder
       # Values are only necessary when filtering. For prediction the default
       # value will be ignored.
       placeholders[feature_keys.TrainEvalFeatures.VALUES] = (
@@ -117,36 +121,57 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
                   dtype=self._model.dtype),
               shape=(default_batch_size, default_series_length,
                      self._model.num_features)))
-      with ops.Graph().as_default():
-        # Default placeholders have only an unknown batch dimension. Make them
-        # in a separate graph, then splice in the series length to the shapes
-        # and re-create them in the outer graph.
-        exogenous_feature_shapes = {
-            key: (value.get_shape(), value.dtype) for key, value
-            in feature_column.make_place_holder_tensors_for_base_features(
-                self._model.exogenous_feature_columns).items()}
-      for feature_key, (batch_only_feature_shape, value_dtype) in (
-          exogenous_feature_shapes.items()):
-        batch_only_feature_shape = batch_only_feature_shape.with_rank_at_least(
-            1).as_list()
-        feature_shape = ([default_batch_size, default_series_length]
-                         + batch_only_feature_shape[1:])
-        placeholders[feature_key] = array_ops.placeholder(
-            dtype=value_dtype, name=feature_key, shape=feature_shape)
+      if self._model.exogenous_feature_columns:
+        with ops.Graph().as_default():
+          # Default placeholders have only an unknown batch dimension. Make them
+          # in a separate graph, then splice in the series length to the shapes
+          # and re-create them in the outer graph.
+          parsed_features = (
+              feature_column.make_parse_example_spec(
+                  self._model.exogenous_feature_columns))
+          placeholder_features = parsing_ops.parse_example(
+              serialized=array_ops.placeholder(
+                  shape=[None], dtype=dtypes.string),
+              features=parsed_features)
+          exogenous_feature_shapes = {
+              key: (value.get_shape(), value.dtype) for key, value
+              in placeholder_features.items()}
+        for feature_key, (batch_only_feature_shape, value_dtype) in (
+            exogenous_feature_shapes.items()):
+          batch_only_feature_shape = (
+              batch_only_feature_shape.with_rank_at_least(1).as_list())
+          feature_shape = ([default_batch_size, default_series_length]
+                           + batch_only_feature_shape[1:])
+          placeholders[feature_key] = array_ops.placeholder(
+              dtype=value_dtype, name=feature_key, shape=feature_shape)
       # Models may not know the shape of their state without creating some
       # variables/ops. Avoid polluting the default graph by making a new one. We
       # use only static metadata from the returned Tensors.
       with ops.Graph().as_default():
         self._model.initialize_graph()
-        model_start_state = self._model.get_start_state()
-      for prefixed_state_name, state_tensor in ts_head_lib.state_to_dictionary(
-          model_start_state).items():
+        # Evaluate the initial state as same-dtype "zero" values. These zero
+        # constants aren't used, but are necessary for feeding to
+        # placeholder_with_default for the "cold start" case where state is not
+        # fed to the model.
+        def _zeros_like_constant(tensor):
+          return tensor_util.constant_value(array_ops.zeros_like(tensor))
+        start_state = nest.map_structure(
+            _zeros_like_constant, self._model.get_start_state())
+      batch_size_tensor = array_ops.shape(time_placeholder)[0]
+      for prefixed_state_name, state in ts_head_lib.state_to_dictionary(
+          start_state).items():
         state_shape_with_batch = tensor_shape.TensorShape(
-            (default_batch_size,)).concatenate(state_tensor.get_shape())
-        placeholders[prefixed_state_name] = array_ops.placeholder(
+            (default_batch_size,)).concatenate(state.shape)
+        default_state_broadcast = array_ops.tile(
+            state[None, ...],
+            multiples=array_ops.concat(
+                [batch_size_tensor[None],
+                 array_ops.ones(len(state.shape), dtype=dtypes.int32)],
+                axis=0))
+        placeholders[prefixed_state_name] = array_ops.placeholder_with_default(
+            input=default_state_broadcast,
             name=prefixed_state_name,
-            shape=state_shape_with_batch,
-            dtype=state_tensor.dtype)
+            shape=state_shape_with_batch)
       return export_lib.ServingInputReceiver(placeholders, placeholders)
 
     return _serving_input_receiver_fn
@@ -165,7 +190,7 @@ class ARRegressor(TimeSeriesRegressor):
 
   def __init__(
       self, periodicities, input_window_size, output_window_size,
-      num_features, num_time_buckets=10,
+      num_features, exogenous_feature_columns=None, num_time_buckets=10,
       loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, hidden_layer_sizes=None,
       anomaly_prior_probability=None, anomaly_distribution=None,
       optimizer=None, model_dir=None, config=None):
@@ -180,7 +205,12 @@ class ARRegressor(TimeSeriesRegressor):
       output_window_size: Number of future time steps to predict. Note that
         setting it to > 1 empirically seems to give a better fit.
       num_features: The dimensionality of the time series (one for univariate,
-          more than one for multivariate).
+        more than one for multivariate).
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+        `tf.feature_column.embedding_column`) corresponding to exogenous
+        features which provide extra information to the model but are not part
+        of the series to be predicted. Passed to
+        `tf.feature_column.input_layer`.
       num_time_buckets: Number of buckets into which to divide (time %
         periodicity) for generating time based features.
       loss: Loss function to use for training. Currently supported values are
@@ -216,6 +246,7 @@ class ARRegressor(TimeSeriesRegressor):
         anomaly_distribution = ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY
       model = ar_model.ARModel(
           periodicities=periodicities, num_features=num_features,
+          exogenous_feature_columns=exogenous_feature_columns,
           num_time_buckets=num_time_buckets,
           input_window_size=input_window_size,
           output_window_size=output_window_size, loss=loss,
@@ -230,6 +261,7 @@ class ARRegressor(TimeSeriesRegressor):
           input_window_size=input_window_size,
           output_window_size=output_window_size,
           num_features=num_features,
+          exogenous_feature_columns=exogenous_feature_columns,
           num_time_buckets=num_time_buckets,
           hidden_layer_sizes=hidden_layer_sizes,
           anomaly_prior_probability=anomaly_prior_probability,
@@ -333,11 +365,11 @@ class StructuralEnsembleRegressor(StateSpaceRegressor):
           determine the model size. Learning autoregressive coefficients
           typically requires more steps and a smaller step size than other
           components.
-      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
-          objects (for example tf.contrib.layers.embedding_column) corresponding
-          to exogenous features which provide extra information to the model but
-          are not part of the series to be predicted. Passed to
-          tf.contrib.layers.input_from_feature_columns.
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+          `tf.feature_column.embedding_column`) corresponding to exogenous
+          features which provide extra information to the model but are not part
+          of the series to be predicted. Passed to
+          `tf.feature_column.input_layer`.
       exogenous_update_condition: A function taking two Tensor arguments,
           `times` (shape [batch size]) and `features` (a dictionary mapping
           exogenous feature keys to Tensors with shapes [batch size, ...]), and
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index f4304f2560a82b666f87f302a821a39b0e9e140e..eebee053f8e6000bdf17996abde03896bf4f32e1 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import tempfile
 
 import numpy
+import six
 
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import estimators
@@ -28,6 +29,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -47,12 +49,17 @@ class TimeSeriesRegressorTest(test.TestCase):
   def _fit_restore_fit_test_template(self, estimator_fn, dtype):
     """Tests restoring previously fit models."""
     model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    first_estimator = estimator_fn(model_dir)
+    exogenous_feature_columns = (
+        feature_column.numeric_column("exogenous"),
+    )
+    first_estimator = estimator_fn(model_dir, exogenous_feature_columns)
     times = numpy.arange(20, dtype=numpy.int64)
     values = numpy.arange(20, dtype=dtype.as_numpy_dtype)
+    exogenous = numpy.arange(20, dtype=dtype.as_numpy_dtype)
     features = {
         feature_keys.TrainEvalFeatures.TIMES: times,
-        feature_keys.TrainEvalFeatures.VALUES: values
+        feature_keys.TrainEvalFeatures.VALUES: values,
+        "exogenous": exogenous
     }
     train_input_fn = input_pipeline.RandomWindowInputFn(
         input_pipeline.NumpyReader(features), shuffle_seed=2, num_threads=1,
@@ -67,14 +74,19 @@ class TimeSeriesRegressorTest(test.TestCase):
     first_loss_after_fit = first_estimator.evaluate(
         input_fn=eval_input_fn, steps=1)["loss"]
     self.assertLess(first_loss_after_fit, first_loss_before_fit)
-    second_estimator = estimator_fn(model_dir)
+    second_estimator = estimator_fn(model_dir, exogenous_feature_columns)
     second_estimator.train(input_fn=train_input_fn, steps=2)
     whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn(
         input_pipeline.NumpyReader(features))
     whole_dataset_evaluation = second_estimator.evaluate(
         input_fn=whole_dataset_input_fn, steps=1)
+    exogenous_values_ten_steps = {
+        "exogenous": numpy.arange(
+            10, dtype=dtype.as_numpy_dtype)[None, :, None]
+    }
     predict_input_fn = input_pipeline.predict_continuation_input_fn(
         evaluation=whole_dataset_evaluation,
+        exogenous_features=exogenous_values_ten_steps,
         steps=10)
     # Also tests that limit_epochs in predict_continuation_input_fn prevents
     # infinite iteration
@@ -91,6 +103,7 @@ class TimeSeriesRegressorTest(test.TestCase):
         saved_prediction = saved_model_utils.predict_continuation(
             continue_from=whole_dataset_evaluation,
             steps=10,
+            exogenous_features=exogenous_values_ten_steps,
             signatures=signatures,
             session=sess)
         # Saved model predictions should be the same as Estimator predictions
@@ -103,7 +116,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             continue_from=whole_dataset_evaluation,
             features={
                 feature_keys.FilteringFeatures.TIMES: times[None, -1] + 2,
-                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.
+                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.,
+                "exogenous": values[None, -1, None] + 12.
             },
             signatures=signatures,
             session=sess)
@@ -111,6 +125,10 @@ class TimeSeriesRegressorTest(test.TestCase):
         second_saved_prediction = saved_model_utils.predict_continuation(
             continue_from=first_filtering,
             steps=1,
+            exogenous_features={
+                "exogenous": numpy.arange(
+                    1, dtype=dtype.as_numpy_dtype)[None, :, None]
+            },
             signatures=signatures,
             session=sess)
         self.assertEqual(
@@ -121,27 +139,63 @@ class TimeSeriesRegressorTest(test.TestCase):
             continue_from=first_filtering,
             features={
                 feature_keys.FilteringFeatures.TIMES: times[-1] + 3,
-                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.
+                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.,
+                "exogenous": values[-1, None] + 13.
+            },
+            signatures=signatures,
+            session=sess)
+
+        # Test cold starting
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES,
+             "exogenous"],
+            signatures.signature_def[
+                feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys())
+        batch_numpy_times = numpy.tile(
+            numpy.arange(30, dtype=numpy.int64)[None, :], (10, 1))
+        batch_numpy_values = numpy.ones([10, 30, 1])
+        state = saved_model_utils.cold_start_filter(
+            signatures=signatures,
+            session=sess,
+            features={
+                feature_keys.FilteringFeatures.TIMES: batch_numpy_times,
+                feature_keys.FilteringFeatures.VALUES: batch_numpy_values,
+                "exogenous": 10. + batch_numpy_values
+            }
+        )
+        predict_times = numpy.tile(
+            numpy.arange(30, 45, dtype=numpy.int64)[None, :], (10, 1))
+        predictions = saved_model_utils.predict_continuation(
+            continue_from=state,
+            times=predict_times,
+            exogenous_features={
+                "exogenous": numpy.tile(numpy.arange(
+                    15, dtype=dtype.as_numpy_dtype), (10,))[None, :, None]
             },
             signatures=signatures,
             session=sess)
+        self.assertAllEqual([10, 15, 1], predictions["mean"].shape)
 
   def test_fit_restore_fit_ar_regressor(self):
-    def _estimator_fn(model_dir):
+    def _estimator_fn(model_dir, exogenous_feature_columns):
       return estimators.ARRegressor(
           periodicities=10, input_window_size=10, output_window_size=6,
           num_features=1, model_dir=model_dir, config=_SeedRunConfig(),
           # This test is flaky with normal likelihood loss (could add more
           # training iterations instead).
-          loss=ar_model.ARModel.SQUARED_LOSS)
+          loss=ar_model.ARModel.SQUARED_LOSS,
+          exogenous_feature_columns=exogenous_feature_columns)
     self._fit_restore_fit_test_template(_estimator_fn, dtype=dtypes.float32)
 
   def test_fit_restore_fit_structural_ensemble_regressor(self):
     dtype = dtypes.float32
-    def _estimator_fn(model_dir):
+    def _estimator_fn(model_dir, exogenous_feature_columns):
       return estimators.StructuralEnsembleRegressor(
           num_features=1, periodicities=10, model_dir=model_dir, dtype=dtype,
-          config=_SeedRunConfig())
+          config=_SeedRunConfig(),
+          exogenous_feature_columns=exogenous_feature_columns)
     self._fit_restore_fit_test_template(_estimator_fn, dtype=dtype)
 
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py b/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
index 970b9aa8acd6f55db843a4e023052b122992baf4..56566ee2e3207abd81ef665da10f851c9dc98ccb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
@@ -72,3 +72,4 @@ class SavedModelLabels(object):
   """Names of signatures exported with export_savedmodel."""
   PREDICT = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
   FILTER = "filter"
+  COLD_START_FILTER = "cold_start_filter"
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index f0330bfbbd6e8067e5d085376acdf2e6bcaccb6a..a28a5872b850b51630240bdeb3ff22f372613523 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.export import export_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -35,29 +36,21 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
+from tensorflow.python.summary import summary
 
 
-def time_series_regression_head(model,
-                                state_manager,
-                                optimizer,
-                                input_statistics_generator=None):
-  """Creates a `_Head` for time series regression.
+class _NoStatePredictOutput(export_lib.PredictOutput):
 
-  Args:
-    model: A model for time series regression.
-    state_manager: A state manager.
-    optimizer: An optimizer.
-    input_statistics_generator: A input statistics generator.
-
-  Returns:
-    An instance of `_Head` for time series regression.
-  """
-  return _TimeSeriesRegressionHead(model, state_manager, optimizer,
-                                   input_statistics_generator)
+  def as_signature_def(self, receiver_tensors):
+    no_state_receiver_tensors = {
+        key: value for key, value in receiver_tensors.items()
+        if not key.startswith(feature_keys.State.STATE_PREFIX)}
+    return super(_NoStatePredictOutput, self).as_signature_def(
+        receiver_tensors=no_state_receiver_tensors)
 
 
-class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
-  """See `time_series_regression_head`."""
+class TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
+  """Determines input and output signatures for a time series model."""
 
   def __init__(self,
                model,
@@ -65,17 +58,49 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                optimizer,
                input_statistics_generator=None,
                name=None):
+    """Creates a `_Head` for time series regression.
+
+    Args:
+      model: A model for time series regression.
+      state_manager: A state manager.
+      optimizer: An optimizer.
+      input_statistics_generator: A input statistics generator.
+      name: An optional name for the model.
+    """
     self.model = model
     self.state_manager = state_manager
     self.optimizer = optimizer
     self.input_statistics_generator = input_statistics_generator
     self._name = name
 
+  @property
+  def name(self):
+    return self._name
+
+  # TODO(terrytangyuan): consolidate `model_outputs` and `_Head.LossSpec`
+  # once `_Head.create_loss` becomes extendable
+  def create_loss(self, features, mode, logits=None, labels=None):
+    """See `_Head`."""
+    model_outputs = self.state_manager.define_loss(
+        self.model, features, mode)
+    summary.scalar(
+        head_lib._summary_key(self._name, metric_keys.MetricKeys.LOSS),
+        model_outputs.loss)
+    return model_outputs
+
+  @property
+  def logits_dimension(self):
+    """See `_Head`."""
+    return 1
+
   def _train_ops(self, features):
     """Add training ops to the graph."""
-    with variable_scope.variable_scope("model"):
-      model_outputs = self.state_manager.define_loss(
-          self.model, features, estimator_lib.ModeKeys.TRAIN)
+    mode = estimator_lib.ModeKeys.TRAIN
+    with variable_scope.variable_scope(
+        "model",
+        # Use ResourceVariables to avoid race conditions.
+        use_resource=True):
+      model_outputs = self.create_loss(features, mode)
 
     train_op = optimizers.optimize_loss(
         model_outputs.loss,
@@ -85,31 +110,14 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
         learning_rate=None)
     return estimator_lib.EstimatorSpec(
         loss=model_outputs.loss,
-        mode=estimator_lib.ModeKeys.TRAIN,
+        mode=mode,
         train_op=train_op)
 
-  # TODO(terrytangyuan): suffix summary and metrics keys by `"/" + name`
-  @property
-  def name(self):
-    return self._name
-
-  # TODO(terrytangyuan): unused for now. Need to decouple
-  # `state_manager.define_loss` to satisfy the extendable return signature of
-  # `_Head.create_loss`.
-  def create_loss(self, features, mode, logits, labels):
-    """See `_Head`."""
-    return None
-
-  # TODO(terrytangyuan): check label dimension
-  @property
-  def logits_dimension(self):
-    return None
-
   def _evaluate_ops(self, features):
     """Add ops for evaluation (aka filtering) to the graph."""
-    with variable_scope.variable_scope("model"):
-      model_outputs = self.state_manager.define_loss(
-          self.model, features, estimator_lib.ModeKeys.EVAL)
+    mode = estimator_lib.ModeKeys.EVAL
+    with variable_scope.variable_scope("model", use_resource=True):
+      model_outputs = self.create_loss(features, mode)
     metrics = {}
     # Just output in-sample predictions for the last chunk seen
     for prediction_key, prediction_value in model_outputs.predictions.items():
@@ -122,13 +130,13 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                                 model_outputs.end_state))
     return estimator_lib.EstimatorSpec(
         loss=model_outputs.loss,
-        mode=estimator_lib.ModeKeys.EVAL,
+        mode=mode,
         eval_metric_ops=metrics,
         predictions={})
 
   def _predict_ops(self, features):
     """Add ops for prediction to the graph."""
-    with variable_scope.variable_scope("model"):
+    with variable_scope.variable_scope("model", use_resource=True):
       prediction = self.model.predict(features=features)
     prediction[feature_keys.PredictionResults.TIMES] = features[
         feature_keys.PredictionFeatures.TIMES]
@@ -137,12 +145,19 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
 
   def _serving_ops(self, features):
     """Add ops for serving to the graph."""
-    with variable_scope.variable_scope("model"):
+    with variable_scope.variable_scope("model", use_resource=True):
       prediction_outputs = self.model.predict(features=features)
     with variable_scope.variable_scope("model", reuse=True):
-      filtering_outputs = self.state_manager.define_loss(
-          self.model, features, estimator_lib.ModeKeys.EVAL)
-
+      filtering_outputs = self.create_loss(
+          features, estimator_lib.ModeKeys.EVAL)
+    with variable_scope.variable_scope("model", reuse=True):
+      no_state_features = {
+          k: v for k, v in features.items()
+          if not k.startswith(feature_keys.State.STATE_PREFIX)}
+      # Ignore any state management when cold-starting. The model's default
+      # start state is replicated across the batch.
+      cold_filtering_outputs = self.model.define_loss(
+          features=no_state_features, mode=estimator_lib.ModeKeys.EVAL)
     return estimator_lib.EstimatorSpec(
         mode=estimator_lib.ModeKeys.PREDICT,
         export_outputs={
@@ -150,7 +165,10 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                 export_lib.PredictOutput(prediction_outputs),
             feature_keys.SavedModelLabels.FILTER:
                 export_lib.PredictOutput(
-                    state_to_dictionary(filtering_outputs.end_state))
+                    state_to_dictionary(filtering_outputs.end_state)),
+            feature_keys.SavedModelLabels.COLD_START_FILTER:
+                _NoStatePredictOutput(
+                    state_to_dictionary(cold_filtering_outputs.end_state))
         },
         # Likely unused, but it is necessary to return `predictions` to satisfy
         # the Estimator's error checking.
@@ -191,7 +209,7 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
 
   def create_estimator_spec(self, features, mode, labels=None):
     """Performs basic error checking and returns an EstimatorSpec."""
-    with ops.name_scope("head"):
+    with ops.name_scope(self._name, "head"):
       if labels:
         raise ValueError(
             "The model received a `labels` dictionary, which is "
@@ -237,6 +255,58 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
         return self._serving_ops(features)
 
 
+class OneShotPredictionHead(TimeSeriesRegressionHead):
+  """A time series head which exports a single stateless serving signature.
+
+  The serving default signature exported by this head expects `times`, `values`,
+  and any exogenous features, but no state. `values` has shape `[batch_size,
+  filter_length, num_features]` and `times` has shape `[batch_size,
+  total_length]`, where `total_length > filter_length`. Any exogenous features
+  must have their shapes prefixed by the shape of the `times` feature.
+
+  When serving, first performs filtering on the series up to `filter_length`
+  starting from the default start state for the model, then computes predictions
+  on the remainder of the series, returning them.
+
+  Model state is neither accepted nor returned, so filtering must be performed
+  each time predictions are requested when using this head.
+  """
+
+  def _serving_ops(self, features):
+    """Add ops for serving to the graph."""
+    with variable_scope.variable_scope("model", use_resource=True):
+      filtering_features = {}
+      prediction_features = {}
+      values_length = array_ops.shape(
+          features[feature_keys.FilteringFeatures.VALUES])[1]
+      for key, value in features.items():
+        if key == feature_keys.State.STATE_TUPLE:
+          # Ignore state input. The model's default start state is replicated
+          # across the batch.
+          continue
+        if key == feature_keys.FilteringFeatures.VALUES:
+          filtering_features[key] = value
+        else:
+          filtering_features[key] = value[:, :values_length]
+          prediction_features[key] = value[:, values_length:]
+      cold_filtering_outputs = self.model.define_loss(
+          features=filtering_features, mode=estimator_lib.ModeKeys.EVAL)
+      prediction_features[feature_keys.State.STATE_TUPLE] = (
+          cold_filtering_outputs.end_state)
+    with variable_scope.variable_scope("model", reuse=True):
+      prediction_outputs = self.model.predict(
+          features=prediction_features)
+    return estimator_lib.EstimatorSpec(
+        mode=estimator_lib.ModeKeys.PREDICT,
+        export_outputs={
+            feature_keys.SavedModelLabels.PREDICT:
+                _NoStatePredictOutput(prediction_outputs),
+        },
+        # Likely unused, but it is necessary to return `predictions` to satisfy
+        # the Estimator's error checking.
+        predictions={})
+
+
 def _check_feature_shapes_compatible_with(features,
                                           compatible_with_name,
                                           compatible_with_value,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 3415061cfd87358cccaf36dcb301fb36986bbde6..c606db76a668235ab6a837159b9dec072b5fd801 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -18,12 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy
+import six
+
+from tensorflow.contrib.timeseries.examples import lstm as lstm_example
+from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import state_management
 
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -31,6 +39,9 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import adam
 from tensorflow.python.training import coordinator as coordinator_lib
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import training as train
@@ -90,7 +101,7 @@ class EvaluationMetricsTests(test.TestCase):
                       .count_up_to(10),
                       dtype=dtypes.float32), (1, 1, 1))
       }
-      model_fn = ts_head_lib.time_series_regression_head(
+      model_fn = ts_head_lib.TimeSeriesRegressionHead(
           model=_TickerModel(),
           state_manager=state_management.PassthroughStateManager(),
           optimizer=train.GradientDescentOptimizer(0.001)).create_estimator_spec
@@ -127,7 +138,7 @@ class _StubModel(object):
 
 
 def _stub_model_fn():
-  return ts_head_lib.time_series_regression_head(
+  return ts_head_lib.TimeSeriesRegressionHead(
       model=_StubModel(),
       state_manager=state_management.PassthroughStateManager(),
       optimizer=train.AdamOptimizer(0.001)).create_estimator_spec
@@ -263,5 +274,76 @@ class PredictFeatureCheckingTests(test.TestCase):
           mode=estimator_lib.ModeKeys.PREDICT)
 
 
+class OneShotTests(test.TestCase):
+
+  def test_one_shot_prediction_head_export(self):
+    model_dir = self.get_temp_dir()
+    categorical_column = feature_column.categorical_column_with_hash_bucket(
+        key="categorical_exogenous_feature", hash_bucket_size=16)
+    exogenous_feature_columns = [
+        feature_column.numeric_column(
+            "2d_exogenous_feature", shape=(2,)),
+        feature_column.embedding_column(
+            categorical_column=categorical_column, dimension=10)]
+    estimator = ts_estimators.TimeSeriesRegressor(
+        model=lstm_example._LSTMModel(
+            num_features=5, num_units=128,
+            exogenous_feature_columns=exogenous_feature_columns),
+        optimizer=adam.AdamOptimizer(0.001),
+        config=estimator_lib.RunConfig(tf_random_seed=4),
+        state_manager=state_management.ChainingStateManager(),
+        head_type=ts_head_lib.OneShotPredictionHead,
+        model_dir=model_dir)
+    train_features = {
+        feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
+            20, dtype=numpy.int64),
+        feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+            20, dtype=numpy.float32)[:, None], [1, 5]),
+        "2d_exogenous_feature": numpy.ones([20, 2]),
+        "categorical_exogenous_feature": numpy.array(
+            ["strkey"] * 20)[:, None]
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(train_features), shuffle_seed=2,
+        num_threads=1, batch_size=16, window_size=16)
+    estimator.train(input_fn=train_input_fn, steps=5)
+    input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+    export_location = estimator.export_savedmodel(self.get_temp_dir(),
+                                                  input_receiver_fn)
+    graph = ops.Graph()
+    with graph.as_default():
+      with session_lib.Session() as session:
+        signatures = loader.load(
+            session, [tag_constants.SERVING], export_location)
+        self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
+                         list(signatures.signature_def.keys()))
+        predict_signature = signatures.signature_def[
+            feature_keys.SavedModelLabels.PREDICT]
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES,
+             "2d_exogenous_feature",
+             "categorical_exogenous_feature"],
+            predict_signature.inputs.keys())
+        features = {
+            feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
+                numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
+            feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+                20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
+            "2d_exogenous_feature": numpy.ones([2, 35, 2]),
+            "categorical_exogenous_feature": numpy.tile(numpy.array(
+                ["strkey"] * 35)[None, :, None], [2, 1, 1])
+        }
+        feeds = {
+            graph.as_graph_element(input_value.name): features[input_key]
+            for input_key, input_value in predict_signature.inputs.items()}
+        fetches = {output_key: graph.as_graph_element(output_value.name)
+                   for output_key, output_value
+                   in predict_signature.outputs.items()}
+        output = session.run(fetches, feed_dict=feeds)
+        self.assertAllEqual((2, 15, 5), output["mean"].shape)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
index 04225333b9377447f46d32663df76aece97a51e7..403c6e2cb4aeb665fb112b6322109a6a90f7a261 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
@@ -492,8 +492,7 @@ class CSVReader(ReaderBaseTimeSeriesParser):
       features_lists.setdefault(column_name, []).append(value)
     features = {}
     for column_name, values in features_lists.items():
-      if (len(values) == 1 and
-          column_name != feature_keys.TrainEvalFeatures.VALUES):
+      if column_name == feature_keys.TrainEvalFeatures.TIMES:
         features[column_name] = values[0]
       else:
         features[column_name] = array_ops.stack(values, axis=1)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 23452a81c397da3516016d72b7bc9b80f7d6447f..9b593fecbb3fbc3b8b57848462c85dff4c3b7577 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -60,7 +60,7 @@ def clip_covariance(
   # TODO(allenl): Smarter scaling here so that correlations are preserved when
   # fiddling with diagonal elements.
   diagonal = array_ops.matrix_diag_part(covariance_matrix)
-  maximum = math_ops.reduce_max(diagonal, axis=-1, keep_dims=True)
+  maximum = math_ops.reduce_max(diagonal, axis=-1, keepdims=True)
   new_diagonal = gen_math_ops.maximum(
       diagonal, maximum / maximum_variance_ratio)
   return array_ops.matrix_set_diag(
@@ -185,7 +185,7 @@ def batch_matrix_pow(matrices, powers):
                     { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p
       power(A, 0) = I
 
-    The power(A, 0) = I case is handeled by starting with accumulator set to the
+    The power(A, 0) = I case is handled by starting with accumulator set to the
     identity matrix; matrices with zero residual powers are passed through
     unchanged.
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
index bac7d1ebf59b28d4688a3d1a69ecdc1fc12248e0..7644764a7459db3951fe9a2790389713dd412a8f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -21,18 +21,17 @@ from __future__ import print_function
 import abc
 import collections
 
-from tensorflow.contrib import layers
-from tensorflow.contrib.layers import feature_column
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
 
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 
@@ -66,11 +65,11 @@ class TimeSeriesModel(object):
 
     Args:
       num_features: Number of features for the time series
-      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
-          objects (for example tf.contrib.layers.embedding_column) corresponding
-          to exogenous features which provide extra information to the model but
-          are not part of the series to be predicted. Passed to
-          tf.contrib.layers.input_from_feature_columns.
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+           `tf.feature_column.embedding_column`) corresponding to exogenous
+           features which provide extra information to the model but are not
+           part of the series to be predicted. Passed to
+           `tf.feature_column.input_layer`.
       dtype: The floating point datatype to use.
     """
     if exogenous_feature_columns:
@@ -86,7 +85,7 @@ class TimeSeriesModel(object):
 
   @property
   def exogenous_feature_columns(self):
-    """`FeatureColumn` objects for features which are not predicted."""
+    """`tf.feature_colum`s for features which are not predicted."""
     return self._exogenous_feature_columns
 
   # TODO(allenl): Move more of the generic machinery for generating and
@@ -265,11 +264,14 @@ class TimeSeriesModel(object):
     if not self._exogenous_feature_columns:
       return (0,)
     with ops.Graph().as_default():
-      placeholder_features = (
-          feature_column.make_place_holder_tensors_for_base_features(
+      parsed_features = (
+          feature_column.make_parse_example_spec(
               self._exogenous_feature_columns))
-      embedded = layers.input_from_feature_columns(
-          columns_to_tensors=placeholder_features,
+      placeholder_features = parsing_ops.parse_example(
+          serialized=array_ops.placeholder(shape=[None], dtype=dtypes.string),
+          features=parsed_features)
+      embedded = feature_column.input_layer(
+          features=placeholder_features,
           feature_columns=self._exogenous_feature_columns)
       return embedded.get_shape().as_list()[1:]
 
@@ -308,13 +310,13 @@ class TimeSeriesModel(object):
         # Avoid shape warnings when embedding "scalar" exogenous features (those
         # with only batch and window dimensions); input_from_feature_columns
         # expects input ranks to match the embedded rank.
-        if tensor.get_shape().ndims == 1:
+        if tensor.get_shape().ndims == 1 and tensor.dtype != dtypes.string:
           exogenous_features_single_batch_dimension[name] = tensor[:, None]
         else:
           exogenous_features_single_batch_dimension[name] = tensor
       embedded_exogenous_features_single_batch_dimension = (
-          layers.input_from_feature_columns(
-              columns_to_tensors=exogenous_features_single_batch_dimension,
+          feature_column.input_layer(
+              features=exogenous_features_single_batch_dimension,
               feature_columns=self._exogenous_feature_columns,
               trainable=True))
       exogenous_regressors = array_ops.reshape(
@@ -381,8 +383,8 @@ class SequentialTimeSeriesModel(TimeSeriesModel):
           may use _scale_back_data or _scale_back_variance to return predictions
           to the input scale.
       dtype: The floating point datatype to use.
-      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
-          objects. See `TimeSeriesModel`.
+      exogenous_feature_columns: A list of `tf.feature_column`s objects. See
+          `TimeSeriesModel`.
       exogenous_update_condition: A function taking two Tensor arguments `times`
           (shape [batch size]) and `features` (a dictionary mapping exogenous
           feature keys to Tensors with shapes [batch size, ...]) and returning a
diff --git a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
index 97f6d36a879532c12684ffdd700ef40b72750567..0461abdc19c08767114e3d26d1134ea4bc5481f8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
@@ -15,6 +15,7 @@
 """Convenience functions for working with time series saved_models.
 
 @@predict_continuation
+@@cold_start_filter
 @@filter_continuation
 """
 
@@ -30,10 +31,12 @@ from tensorflow.contrib.timeseries.python.timeseries import model_utils as _mode
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-def _colate_features_to_feeds_and_fetches(continue_from, signature, features,
-                                          graph):
+def _colate_features_to_feeds_and_fetches(signature, features, graph,
+                                          continue_from=None):
   """Uses a saved model signature to construct feed and fetch dictionaries."""
-  if _feature_keys.FilteringResults.STATE_TUPLE in continue_from:
+  if continue_from is None:
+    state_values = {}
+  elif _feature_keys.FilteringResults.STATE_TUPLE in continue_from:
     # We're continuing from an evaluation, so we need to unpack/flatten state.
     state_values = _head.state_to_dictionary(
         continue_from[_feature_keys.FilteringResults.STATE_TUPLE])
@@ -115,6 +118,55 @@ def predict_continuation(continue_from,
   return output
 
 
+def cold_start_filter(signatures, session, features):
+  """Perform filtering using an exported saved model.
+
+  Filtering refers to updating model state based on new observations.
+  Predictions based on the returned model state will be conditioned on these
+  observations.
+
+  Starts from the model's default/uninformed state.
+
+  Args:
+    signatures: The `MetaGraphDef` protocol buffer returned from
+      `tf.saved_model.loader.load`. Used to determine the names of Tensors to
+      feed and fetch. Must be from the same model as `continue_from`.
+    session: The session to use. The session's graph must be the one into which
+      `tf.saved_model.loader.load` loaded the model.
+    features: A dictionary mapping keys to Numpy arrays, with several possible
+      shapes (requires keys `FilteringFeatures.TIMES` and
+      `FilteringFeatures.VALUES`):
+        Single example; `TIMES` is a scalar and `VALUES` is either a scalar or a
+          vector of length [number of features].
+        Sequence; `TIMES` is a vector of shape [series length], `VALUES` either
+          has shape [series length] (univariate) or [series length x number of
+          features] (multivariate).
+        Batch of sequences; `TIMES` is a vector of shape [batch size x series
+          length], `VALUES` has shape [batch size x series length] or [batch
+          size x series length x number of features].
+      In any case, `VALUES` and any exogenous features must have their shapes
+      prefixed by the shape of the value corresponding to the `TIMES` key.
+  Returns:
+    A dictionary containing model state updated to account for the observations
+    in `features`.
+  """
+  filter_signature = signatures.signature_def[
+      _feature_keys.SavedModelLabels.COLD_START_FILTER]
+  features = _input_pipeline._canonicalize_numpy_data(  # pylint: disable=protected-access
+      data=features,
+      require_single_batch=False)
+  output_tensors_by_name, feed_dict = _colate_features_to_feeds_and_fetches(
+      signature=filter_signature,
+      features=features,
+      graph=session.graph)
+  output = session.run(output_tensors_by_name, feed_dict=feed_dict)
+  # Make it easier to chain filter -> predict by keeping track of the current
+  # time.
+  output[_feature_keys.FilteringResults.TIMES] = features[
+      _feature_keys.FilteringFeatures.TIMES]
+  return output
+
+
 def filter_continuation(continue_from, signatures, session, features):
   """Perform filtering using an exported saved model.
 
@@ -124,8 +176,8 @@ def filter_continuation(continue_from, signatures, session, features):
 
   Args:
     continue_from: A dictionary containing the results of either an Estimator's
-      evaluate method or a previous filter_continuation. Used to determine the
-      model state to start filtering from.
+      evaluate method or a previous filter step (cold start or
+      continuation). Used to determine the model state to start filtering from.
     signatures: The `MetaGraphDef` protocol buffer returned from
       `tf.saved_model.loader.load`. Used to determine the names of Tensors to
       feed and fetch. Must be from the same model as `continue_from`.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
index d5dce30fda0353bd70f44ec567ac91acce1e9394..5f7e3da2db6da26f50aad9d500959238063a3e3c 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
@@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel):
     batch_end_values = array_ops.squeeze(
         array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0],
                         [-1, 1, -1]),
-        squeeze_dims=[1, 2])
+        axis=[1, 2])
     # A pretty odd but easy to think about loss: L1 loss on the batch end
     # values.
     loss = math_ops.reduce_sum(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index c86d06e9236962cbabbc56afa1cfe213e0c78bc0..5d33e23a427bd54fd02b0eb7489f84d189e05e35 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,6 +40,7 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
@@ -268,15 +269,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
index 1fcd3e391b63c2362d6187da9556e2c71836dbaa..a614386121e000961bf8b32625a28e1251654320 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -170,7 +170,7 @@ class KalmanFilter(object):
         math_ops.matmul(
             transition_matrices,
             prior_state[..., None]),
-        squeeze_dims=[-1])
+        axis=[-1])
     return advanced_state
 
   def predict_state_var(
@@ -254,7 +254,7 @@ class KalmanFilter(object):
             kalman_gain_transposed,
             array_ops.expand_dims(residual, -1),
             adjoint_a=True),
-        squeeze_dims=[-1])
+        axis=[-1])
     gain_obs = math_ops.matmul(
         kalman_gain_transposed, observation_model, adjoint_a=True)
     identity_extradim = linalg_ops.eye(
@@ -332,7 +332,7 @@ class KalmanFilter(object):
             array_ops.expand_dims(state_mean, 1),
             observation_model,
             adjoint_b=True),
-        squeeze_dims=[1])
+        axis=[1])
     observed_var = math_ops.matmul(
         math_ops.matmul(observation_model, state_var),
         observation_model,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
index 6257002647ed53bbde3ace11a6b45e4e2cdeb57d..951c6546d5fed77e0cfa98a4e774b804639d7dad 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
@@ -112,11 +112,11 @@ class StateSpaceModelConfiguration(
       exogenous_noise_decreases: If True, exogenous regressors can "set" model
           state, decreasing uncertainty. If both this parameter and
           exogenous_noise_increases are False, exogenous regressors are ignored.
-      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
-          objects (for example tf.contrib.layers.embedding_column) corresponding
-          to exogenous features which provide extra information to the model but
-          are not part of the series to be predicted. Passed to
-          tf.contrib.layers.input_from_feature_columns.
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+          `tf.feature_column.embedding_column`) corresponding to exogenous
+          features which provide extra information to the model but are not part
+          of the series to be predicted. Passed to
+          `tf.feature_column.input_layer`.
       exogenous_update_condition: A function taking two Tensor arguments `times`
           (shape [batch size]) and `features` (a dictionary mapping exogenous
           feature keys to Tensors with shapes [batch size, ...]) and returning a
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
index 1afc58cfb240c52a9f001da787addfb7fbb46789..6746dd7b433466c473402e0e8374377093a73492 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
@@ -107,7 +107,7 @@ class VARMA(state_space_model.StateSpaceModel):
 
     Returns:
       the state transition matrix. It has shape
-        [self.state_dimendion, self.state_dimension].
+        [self.state_dimension, self.state_dimension].
     """
     # Pad any unused AR blocks with zeros. The extra state is necessary if
     # ma_order >= ar_order.
@@ -127,7 +127,7 @@ class VARMA(state_space_model.StateSpaceModel):
 
     Returns:
       the state noise transform matrix. It has shape
-        [self.state_dimendion, self.num_features].
+        [self.state_dimension, self.num_features].
     """
     # Noise is broadcast, through the moving average coefficients, to
     # un-observed parts of the latent state.
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index c48e84ddfaac8ac9c07e061847315eab3fd72152..0bdf6f64c9eeef83198e28c6b9279bde24bd0ead 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -24,6 +24,8 @@ cc_library(
     name = "all_ops",
     deps = [
         ":cross_replica_ops_op_lib",
+        ":heartbeat_ops_op_lib",
+        ":host_compute_ops_op_lib",
         ":infeed_ops_op_lib",
         ":outfeed_ops_op_lib",
         ":replication_ops_op_lib",
@@ -45,7 +47,7 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
-        "//tensorflow/contrib/summary:summary_ops",
+        "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -55,6 +57,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
@@ -69,6 +72,8 @@ py_library(
 tf_gen_op_libs(
     op_lib_names = [
         "cross_replica_ops",
+        "heartbeat_ops",
+        "host_compute_ops",
         "infeed_ops",
         "outfeed_ops",
         "replication_ops",
@@ -78,6 +83,7 @@ tf_gen_op_libs(
     deps = [
         "//tensorflow/contrib/tpu/proto:tpu_embedding_config_proto_cc",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -85,6 +91,8 @@ tf_custom_op_library(
     name = "python/ops/_tpu_ops.so",
     srcs = [
         "ops/cross_replica_ops.cc",
+        "ops/heartbeat_ops.cc",
+        "ops/host_compute_ops.cc",
         "ops/infeed_ops.cc",
         "ops/outfeed_ops.cc",
         "ops/replication_ops.cc",
@@ -101,6 +109,8 @@ tf_gen_op_wrapper_py(
     name = "tpu_ops",
     deps = [
         ":cross_replica_ops_op_lib",
+        ":heartbeat_ops_op_lib",
+        ":host_compute_ops_op_lib",
         ":infeed_ops_op_lib",
         ":outfeed_ops_op_lib",
         ":replication_ops_op_lib",
@@ -114,6 +124,8 @@ py_library(
     srcs = ["python/profiler/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_pb2_grpc",
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_proto_py",
         "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
         "//tensorflow/python:util",
     ],
@@ -152,7 +164,10 @@ py_library(
     name = "tpu_lib",
     srcs = [
         "python/tpu/__init__.py",
+        "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
+        "python/tpu/keras_support.py",
+        "python/tpu/session_support.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
         "python/tpu/tpu_feed.py",
@@ -163,6 +178,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":datasets",
         ":profiler",
         ":tpu_py",
         "//tensorflow/contrib/tpu/proto:topology_proto_py",
@@ -181,6 +197,35 @@ py_library(
     ],
 )
 
+py_library(
+    name = "datasets",
+    srcs = [
+        "python/tpu/datasets.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:batching",
+        "//tensorflow/contrib/data/python/ops:interleave_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "datasets_test",
+    srcs = ["python/tpu/datasets_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        ":datasets",
+    ],
+    grpc_enabled = True,
+    tags = ["no_windows"],
+)
+
 tf_py_test(
     name = "tpu_test",
     size = "small",
@@ -192,6 +237,7 @@ tf_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:layers",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 tf_py_test(
@@ -205,6 +251,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["python/tpu/bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
 tf_py_test(
     name = "tpu_infeed_test",
     size = "small",
@@ -238,15 +295,13 @@ tf_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
+tf_py_test(
+    name = "tpu_estimator_signals_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_estimator_signals_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
 )
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index ea6e874f2d952b03e8cdabeee00ccfe1b076a0d0..dc9066855990f372c28dc481959117daa4c2da97 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -43,6 +43,7 @@
 @@TPUEstimator
 @@TPUEstimatorSpec
 @@RunConfig
+@@InputPipelineConfig
 @@TPUConfig
 """
 
@@ -53,6 +54,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.bfloat16 import *
 from tensorflow.contrib.tpu.python.tpu.device_assignment import *
 from tensorflow.contrib.tpu.python.tpu.topology import *
 from tensorflow.contrib.tpu.python.tpu.tpu import *
diff --git a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc b/tensorflow/contrib/tpu/ops/heartbeat_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca0f5bc0e562cd9e27b4c456b53fb9f51f1cb1f8
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/heartbeat_ops.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+REGISTER_OP("WorkerHeartbeat")
+    .Input("request: string")
+    .Output("response: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Worker heartbeat op.
+
+Heartbeats may be sent periodically to indicate the coordinator is still active,
+to retrieve the current worker status and to expedite shutdown when necessary.
+
+request: A string tensor containing a serialized WorkerHeartbeatRequest
+response: A string tensor containing a serialized WorkerHeartbeatResponse
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/host_compute_ops.cc b/tensorflow/contrib/tpu/ops/host_compute_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48aeb81ac1311d3acd4972810f0a27a382f8b136
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/host_compute_ops.cc
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("_XlaSendFromHost")
+    .Input("inputs: Tinputs")
+    .Input("dynamic_key: string")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("key: string")
+    .Attr("device_ordinal: int")
+    .SetIsStateful()
+    .SetShapeFn(::tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+A placeholder op for multiple values that will be sent from TensorFlow to a
+running XLA computation.
+
+inputs: A list of tensors that will be sent to the XLA computation.
+dynamic_key: The key sent at runtime by the compile node to identify which
+execution the transfer corresponds to.
+Tinputs: The element types of each element in `inputs`.
+key: A key that is unique in the computation and associates the send with the consumer in
+the XLA computation.
+device_ordinal: The device to use.
+)doc");
+
+REGISTER_OP("_XlaRecvAtHost")
+    .Input("dynamic_key: string")
+    .Output("outputs: Toutputs")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .Attr("device_ordinal: int")
+    .SetIsStateful()
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape)
+    .Doc(R"doc(
+A placeholder op for multiple values that will be sent to TensorFlow from a
+running XLA computation.
+
+dynamic_key: The key sent at runtime by the compile node to identify which
+execution the transfer corresponds to.
+outputs: A list of tensors that will be received from the XLA computation.
+Toutputs: The element types of each element in `outputs`.
+key: A key that is unique in the computation and associates the send with the consumer in
+the XLA computation.
+device_ordinal: The device to use.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index 5900c61a38726551391c212f92b9b9eacd4a465b..b05c76ca64fbaedc205ab06cc31616787ccc84b8 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -26,6 +26,7 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits a single Tensor value from an XLA computation.
 
@@ -36,6 +37,7 @@ REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits multiple Tensor values from an XLA computation.
 
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index cba71c6b98e1079de6c6c4c32fa2ffc44a9ce71e..3bdf7c2f83b037984a45cea99910df87c967aa40 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -27,6 +27,7 @@ REGISTER_OP("TPUReplicateMetadata")
     .Attr("topology: string = \"\"")
     .Attr("device_assignment: list(int) = []")
     .Attr("computation_shape: list(int) = []")
+    .Attr("host_compute_core: list(string) = []")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -68,6 +69,7 @@ REGISTER_OP("TPUReplicate")
     .Attr("num_replicas: int >= 1")
     .Attr("topology: string = \"\"")
     .Attr("device_assignment: list(int) = []")
+    .Attr("host_compute_core: list(string) = []")
     .Attr("computation_shape: list(int) = []")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Tbroadcast_inputs: list(type) >= 0")
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
index f8de8baa65339383c7f92284ee274a434f12f8c2..d5600eef4a9dc69fcfd931a083f86d7941ba8fb4 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
@@ -191,6 +191,7 @@ REGISTER_OP("ConfigureDistributedTPU")
     .Output("topology: string")
     .Attr("embedding_config: string = ''")
     .Attr("tpu_embedding_config: string = ''")
+    .Attr("is_global_init: bool = false")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -202,6 +203,7 @@ topology.
 tpu_embedding_config: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
 describes the embedding lookups of the program.
 embedding_config: Reserved. Do not use.
+is_global_init: Reserved. Do not use.
 )doc");
 
 REGISTER_OP("ShutdownDistributedTPU")
@@ -212,20 +214,4 @@ An op that shuts down a running distributed TPU system. The Op returns
 an error if no system is running.
 )doc");
 
-REGISTER_OP("SessionStatus")
-    .Input("fetch_start_timestamp: double")
-    .Output("status: string")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Not for public usage.
-
-Returns messages from the current session as a serialized SessionStatusProto.
-
-This includes the current state of the compiler, along with any critical
-logging or warning messages.
-
-fetch_start_timestamp: any messages earlier than this will be excluded from the
-returned proto.
-)doc");
-
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index cc32a265286951a1e4d59228da6b3ac83a75c5e9..72d37f774cc518c559b5953561957a799a7da568 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -50,7 +50,7 @@ namespace tensorflow {
 // TPU Embeddings use dedicated ops to enforce Host/TPU consistency in the
 // state of embedding table variables. Before beginning training or inference,
 // the model must Load the optimizer parameters into the TPU memories. Before
-// saving a checkpoint, the model must Retreieve the parameters back into the
+// saving a checkpoint, the model must Retrieve the parameters back into the
 // host CPU memory.
 
 REGISTER_OP("TPUEmbeddingLoadGradientDescentParameters")
@@ -263,7 +263,7 @@ REGISTER_OP("TPUEmbeddingReceiveActivations")
     .SetIsStateful()
     .SetShapeFn(tpu_embedding_config_util::ActivationShapes)
     .Doc(R"doc(
-An op that receives embeddng activations on the TPU.
+An op that receives embedding activations on the TPU.
 
 The TPU system performs the embedding lookups and aggregations specified by
 the arguments to TPUEmbeddingEnqueueSparseBatch. The results of these
@@ -293,7 +293,7 @@ REGISTER_OP("TPUEmbeddingActivations")
 An op enabling differentiation of TPU Embeddings.
 
 This op simply returns its first input, which is assumed to have been sliced
-from the Tensors returnd by TPUEmbeddingDequeueActivations. The presence of this
+from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of this
 op, and its first argument being a trainable Variable, enables automatic
 differentiation of graphs containing embeddings via the TPU Embedding Python
 libraries.
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 198da0203a7d17249c4f50110713121b74d5ca4f..dbf1ab6bbf0ddc7429d8e19279451eb862981e0c 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -6,19 +6,7 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-tf_proto_library_cc(
+tf_proto_library(
     name = "tpu_profiler_proto",
     srcs = ["tpu_profiler.proto"],
     has_services = 1,
@@ -58,6 +46,7 @@ tf_cc_binary(
     visibility = ["//visibility:public"],
     deps = [
         ":dump_tpu_profile",
+        ":tpu_profiler_analysis_proto_cc",
         ":tpu_profiler_proto_cc",
         ":version",
         "//tensorflow/core:framework_internal",
@@ -98,16 +87,34 @@ tf_cc_test(
     ],
 )
 
-tf_proto_library_cc(
+tf_proto_library(
     name = "op_profile_proto",
     srcs = ["op_profile.proto"],
     cc_api_version = 2,
     visibility = ["//visibility:public"],
 )
 
-tf_proto_library_cc(
+tf_proto_library(
     name = "tf_op_stats_proto",
     srcs = ["tf_op_stats.proto"],
     cc_api_version = 2,
     visibility = ["//visibility:public"],
 )
+
+tf_proto_library(
+    name = "tpu_profiler_analysis_proto",
+    srcs = ["tpu_profiler_analysis.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = [":tpu_profiler_proto"] + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "tpu_profiler_analysis_pb2_grpc",
+    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [":tpu_profiler_analysis_proto_py"],
+)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index b1ef9fde37fe0647965f0818895be37d2d56d207..816897499b7a49365060c026d60b977990f3ecdc 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -26,9 +26,13 @@ limitations under the License.
 
 #include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
 #include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -37,6 +41,7 @@ namespace tensorflow {
 namespace tpu {
 namespace {
 
+using ::tensorflow::TPUProfileAnalysis;
 using ::tensorflow::TPUProfiler;
 
 constexpr uint64 kMaxEvents = 1000000;
@@ -61,16 +66,35 @@ Status ValidateHostPortPair(const string& host_port) {
   return Status::OK();
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms,
-                        const ProfileOptions& opts) {
+ProfileRequest PopulateProfileRequest(int duration_ms,
+                                      const string& repository_root,
+                                      const string& session_id,
+                                      const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
+  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
+    // For backward compatibilities, only generate tracetable etc when the
+    // user provide a GCS path for model directory.
+    request.set_repository_root(repository_root);
+    request.set_session_id(session_id);
+  }
   request.add_tools("input_pipeline");
   request.add_tools("overview_page");
   *request.mutable_opts() = opts;
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
+  return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+             const string& repository_root, const string& session_id,
+             const ProfileOptions& opts) {
+  ProfileRequest request =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
   // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
@@ -84,7 +108,60 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
           channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-  return response;
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
+        logdir, session_id, "", response, &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  return response.encoded_trace().empty();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+bool NewSession(const string& service_addr,
+                const std::vector<tensorflow::string>& hostnames,
+                int duration_ms, const string& repository_root,
+                const string& session_id, const ProfileOptions& opts) {
+  NewProfileSessionRequest new_session_request;
+  *new_session_request.mutable_request() =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+  new_session_request.set_repository_root(repository_root);
+  new_session_request.set_session_id(session_id);
+  for (const auto& hostname : hostnames) {
+    new_session_request.add_hosts(hostname);
+  }
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  // TODO(jiesun): GRPC support following relevant naming scheme:
+  // 1. dns:///host:port
+  // 2. ipv4:host:port or ipv6:[host]:port
+  // We might need to change the prefix which depends on what TPU name resolver
+  // will give us.
+  std::unique_ptr<TPUProfileAnalysis::Stub> stub =
+      TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  NewProfileSessionResponse new_session_response;
+  TF_QCHECK_OK(FromGrpcStatus(
+      stub->NewSession(&context, new_session_request, &new_session_response)));
+
+  std::cout << "Profile session succeed for host(s):"
+            << str_util::Join(hostnames, ",") << std::endl;
+  return new_session_response.empty_trace();
 }
 
 }  // namespace
@@ -94,12 +171,16 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
+  tensorflow::string FLAGS_workers_list;
   int FLAGS_duration_ms = 2000;
   int FLAGS_num_tracing_attempts = 3;
   bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
+      tensorflow::Flag("workers_list", &FLAGS_workers_list,
+                       "The list of worker TPUs that we are about to profile "
+                       "in the current session."),
       tensorflow::Flag("logdir", &FLAGS_logdir,
                        "Path of TensorBoard log directory e.g. /tmp/tb_log, "
                        "gs://tb_bucket"),
@@ -137,17 +218,36 @@ int main(int argc, char** argv) {
   opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
   tensorflow::ProfileResponse response;
 
+  // Use the current timestamp as the run name.
+  tensorflow::string session_id =
+      tensorflow::tpu::GetCurrentTimeStampAsString();
+  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+  tensorflow::string repository_root =
+      ::tensorflow::io::JoinPath(FLAGS_logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(FLAGS_workers_list, ",");
+
+  bool empty_trace = false;
   while (true) {
     std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
               << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
-    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    if (hostnames.empty()) {
+      empty_trace = tensorflow::tpu::Profile(FLAGS_service_addr, FLAGS_logdir,
+                                             duration_ms, repository_root,
+                                             session_id, opts);
+    } else {
+      tensorflow::string tpu_master = FLAGS_service_addr;
+      empty_trace =
+          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
+                                      repository_root, session_id, opts);
+    }
+    if (remaining_attempts <= 0 || !empty_trace) break;
     std::cout << "No trace event is collected. Automatically retrying."
               << std::endl
               << std::endl;
   }
 
-  if (response.encoded_trace().empty()) {
+  if (empty_trace) {
     std::cout << "No trace event is collected after "
               << FLAGS_num_tracing_attempts << " attempt(s). "
               << "Perhaps, you want to try again (with more attempts?)."
@@ -158,15 +258,5 @@ int main(int argc, char** argv) {
     return 0;
   }
 
-  // Use the current timestamp as the run name.
-  tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString();
-  TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-      FLAGS_logdir, run, response, &std::cout));
-  // Print this at the end so that it's not buried in irrelevant LOG messages.
-  std::cout
-      << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl
-      << "Set an appropriate duration (with --duration_ms) if you "
-         "don't see a full step in your trace or the captured trace is too "
-         "large."
-      << std::endl;
+  return 0;
 }
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ebd6185faad28ae7a22eb33f6b358eb2344c9c22..5e85a967ad4ea373e213fa90c3640e9ab1f92d25 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -41,6 +41,7 @@ namespace {
 using ::tensorflow::io::JoinPath;
 using ::tensorflow::protobuf::util::JsonOptions;
 using ::tensorflow::protobuf::util::MessageToJsonString;
+using ::tensorflow::strings::StrCat;
 
 constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
 constexpr char kJsonOpProfileFileName[] = "op_profile.json";
@@ -61,28 +62,34 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
   return Status::OK();
 }
 
-Status DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace,
-                               std::ostream* os) {
-  string proto_path = JoinPath(run_dir, kProtoTraceFileName);
+Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
+                               const string& encoded_trace, std::ostream* os) {
+  string proto_path =
+      JoinPath(run_dir, StrCat(host_prefix, kProtoTraceFileName));
   TF_RETURN_IF_ERROR(
       WriteStringToFile(Env::Default(), proto_path, encoded_trace));
   LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
 
-  string json_path = JoinPath(run_dir, kJsonTraceFileName);
+  string json_path = JoinPath(run_dir, StrCat(host_prefix, kJsonTraceFileName));
   Trace trace;
   trace.ParseFromString(encoded_trace);
-  *os << "Trace contains " << trace.trace_events_size() << " events."
-      << std::endl;
+  if (os) {
+    *os << "Trace contains " << trace.trace_events_size() << " events."
+        << std::endl;
+  }
   TF_RETURN_IF_ERROR(
       WriteGzippedDataToFile(json_path, TraceEventsToJson(trace)));
-  *os << "Dumped JSON trace data to " << json_path << std::endl;
+  if (os) {
+    *os << "Dumped JSON trace data to " << json_path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
+                                   const string& host_prefix,
                                    const tpu::op_profile::Profile& profile,
                                    std::ostream* os) {
-  string path = JoinPath(run_dir, kJsonOpProfileFileName);
+  string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
   JsonOptions options;
   options.always_print_primitive_fields = true;
@@ -93,49 +100,20 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
         string(status.error_message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
-  *os << "Dumped json op profile data to " << path << std::endl;
+  if (os) {
+    *os << "Dumped json op profile data to " << path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
+                                  const string& host_prefix,
                                   const tensorflow::ProfileToolData& tool,
                                   std::ostream* os) {
-  string path = JoinPath(run_dir, tool.name());
+  string path = JoinPath(run_dir, StrCat(host_prefix, tool.name()));
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, tool.data()));
-  *os << "Dumped tool data for " << tool.name() << " to " << path << std::endl;
-  return Status::OK();
-}
-
-Status DumpGraphEvents(const string& logdir, const string& run,
-                       const ProfileResponse& response, std::ostream* os) {
-  int num_graphs = response.computation_graph_size();
-  if (response.computation_graph_size() == 0) return Status::OK();
-  // The server might generates multiple graphs for one program; we simply
-  // pick the first one.
-  if (num_graphs > 1) {
-    *os << num_graphs
-        << " TPU program variants observed over the profiling period. "
-        << "One computation graph will be chosen arbitrarily." << std::endl;
-  }
-  // The graph plugin expects the graph in <logdir>/<run>/<event.file>.
-  string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run));
-  TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir));
-  EventsWriter event_writer(JoinPath(run_dir, "events"));
-  Event event;
-  // Add the computation graph.
-  event.set_graph_def(response.computation_graph(0).SerializeAsString());
-  event_writer.WriteEvent(event);
-  *os << "Wrote a HLO graph to " << event_writer.FileName() << std::endl;
-
-  if (response.has_hlo_metadata()) {
-    tensorflow::TaggedRunMetadata tagged_run_metadata;
-    tagged_run_metadata.set_tag(run);
-    tagged_run_metadata.set_run_metadata(
-        response.hlo_metadata().SerializeAsString());
-    tensorflow::Event meta_event;
-    *meta_event.mutable_tagged_run_metadata() = tagged_run_metadata;
-    event_writer.WriteEvent(meta_event);
-    *os << "Wrote HLO ops run metadata to " << event_writer.FileName()
+  if (os) {
+    *os << "Dumped tool data for " << tool.name() << " to " << path
         << std::endl;
   }
   return Status::OK();
@@ -144,27 +122,30 @@ Status DumpGraphEvents(const string& logdir, const string& run,
 }  // namespace
 
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os) {
   // Dumps profile data to <logdir>/plugins/profile/<run>/.
+  string host_prefix = host.empty() ? "" : StrCat(host, ".");
   string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
+  *os << "Creating directory: " << profile_run_dir;
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
   if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
-    TF_RETURN_IF_ERROR(
-        DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
+    TF_RETURN_IF_ERROR(DumpTraceToLogDirectory(profile_run_dir, host_prefix,
+                                               response.encoded_trace(), os));
   }
   if (response.has_op_profile() &&
       (response.op_profile().has_by_program_structure() ||
        response.op_profile().has_by_category())) {
-    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
+    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, host_prefix,
                                                    response.op_profile(), os));
   }
   for (const auto& tool_data : response.tool_data()) {
-    TF_RETURN_IF_ERROR(
-        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
+    TF_RETURN_IF_ERROR(DumpToolDataToLogDirectory(profile_run_dir, host_prefix,
+                                                  tool_data, os));
   }
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 29ef977bacfd61e163be49558c5b94277ed479c1..ecf21b1de2219e8896d5e8b79325a193de0b0fa1 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -32,6 +32,7 @@ namespace tpu {
 // Note: this function creates a directory even when all fields in
 // ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index a730d6142d890cc41f72176cf617ac0b0434192c..508c7a842fb82ec080082d7e7f02f8d2f2a79447 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -37,12 +37,17 @@ flags.DEFINE_string(
     'will attempt to automatically detect the GCE project from metadata.')
 flags.DEFINE_string('tpu_name', None,
                     'Name of the Cloud TPU for Cluster Resolvers. You must '
-                    'specify either this flag or --master.')
+                    'specify either this flag or --service_addr.')
 
 # Tool specific parameters
 flags.DEFINE_string(
     'service_addr', None, 'Address of TPU profiler service e.g. '
     'localhost:8466, you must specify either this flag or --tpu_name.')
+flags.DEFINE_string(
+    'workers_list', None, 'The list of worker TPUs that we are about to profile'
+    ' e.g. 10.0.1.2, 10.0.1.3. You can specify this flag with --tpu_name or '
+    '--service_addr to profile a subset of tpu nodes. You can also use only'
+    '--tpu_name and leave this flag unspecified to profile all the tpus.')
 flags.DEFINE_string('logdir', None,
                     'Path of TensorBoard log directory e.g. /tmp/tb_log, '
                     'gs://tb_bucket')
@@ -56,18 +61,25 @@ flags.DEFINE_boolean('include_dataset_ops', True,
 
 FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
+JOB_NAME = 'worker'
 
+def get_workers_list(cluster_resolver):
+  cluster_spec = cluster_resolver.cluster_spec()
+  task_indices = cluster_spec.task_indices(JOB_NAME)
+  workers_list = [cluster_spec.task_address(JOB_NAME, i).split(':')[0]
+                  for i in task_indices]
+  return ','.join(workers_list)
 
 def run_main():
   tf.app.run(main)
 
-
 def main(unused_argv=None):
   tf.logging.set_verbosity(tf.logging.INFO)
 
   if FLAGS.service_addr is None and FLAGS.tpu_name is None:
     sys.exit('You must specify either --service_addr or --tpu_name.')
 
+  tpu_cluster_resolver = None
   if FLAGS.service_addr is not None:
     if FLAGS.tpu_name is not None:
       tf.logging.warn('Both --service_addr and --tpu_name are set. Ignoring '
@@ -76,12 +88,18 @@ def main(unused_argv=None):
   else:
     tpu_cluster_resolver = (
         tf.contrib.cluster_resolver.TPUClusterResolver(
-            tpu_names=[FLAGS.tpu_name],
+            [FLAGS.tpu_name],
             zone=FLAGS.tpu_zone,
             project=FLAGS.gcp_project))
     service_addr = tpu_cluster_resolver.get_master()
   service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466')
 
+  workers_list = ""
+  if FLAGS.workers_list is not None:
+    workers_list = FLAGS.workers_list
+  elif tpu_cluster_resolver is not None:
+    workers_list = get_workers_list(tpu_cluster_resolver)
+
   if not FLAGS.logdir:
     sys.exit('logdir must be provided.')
   executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE)
@@ -89,6 +107,7 @@ def main(unused_argv=None):
   cmd = [executable_path]
   cmd.append('--logdir=' + logdir)
   cmd.append('--service_addr=' + service_addr)
+  cmd.append('--workers_list=' + workers_list)
   cmd.append('--duration_ms=' + str(FLAGS.duration_ms))
   cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts))
   cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower())
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 8d99835b64152629c66607e6792495eb36319eb8..ebd478fd02295108b9d2454963eb06165828b523 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.6.0-rc1'
+_VERSION = '1.6.0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 2094294baad63ae73712c8648b588accd4551ef8..b9ac1a550c87e055fd5d555c346d5ec545bbe634 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -66,6 +66,10 @@ message OpMetricsDbResult {
   // The total of the difference between the start times of two
   // consecutive infeed-enqueues (per host) in picoseconds.
   optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
+  // The total device time in microseconds.
+  optional double total_device_time_in_us = 4;
+  // The total host time in microseconds.
+  optional double total_host_time_in_us = 5;
 }
 
 // Result proto for StepInfo.
@@ -77,6 +81,12 @@ message StepInfoResult {
   // The infeed duration in picoseconds.
   // Can turn into a map if we want a variable number of ops.
   optional uint64 infeed_duration_ps = 3;
+  // The start time of this step in picoseconds.
+  optional uint64 begin_ps = 4;
+  // The waiting time within this step in picoseconds.
+  optional uint64 wait_duration_ps = 5;
+  // The time spent on cross-replica-sum in picoseconds.
+  optional uint64 crs_duration_ps = 6;
 }
 
 // Result proto for a sequence of steps.
@@ -155,6 +165,66 @@ message RunEnvironmentResult {
   repeated HostDependentJobInfoResult host_dependent_job_info = 6;
 }
 
+// The types of host operations that are tracked.
+enum HostOp {
+  // Invalid host op.
+  kINVALIDHostOp = 0;
+  // Each of host op type has two parts:
+  // (1) the stage where the op happens and (2) the op name.
+  // stage = Input Data Producer, op = Get Next Batch.
+  kInputDataProducerGetNextBatch = 1;
+  // stage = Input Data Producer, op = Session Run.
+  kInputDataProducerSessionRun = 2;
+  // stage = Input Data Producer, op = Forward Batch.
+  kInputDataProducerForwardBatch = 3;
+  // stage = Infeed Thread, op = Get Next Batch.
+  kInfeedThreadGetNextBatch = 4;
+  // stage = Infeed Thread, op = Session Run.
+  kInfeedThreadSessionRun = 5;
+  // stage = Infeed Thread, op = Forward Batch.
+  kInfeedThreadForwardBatch = 6;
+  // stage = Outfeed Thread, op = Get Next Batch.
+  kOutfeedThreadGetNextBatch = 7;
+  // stage = Outfeed Thread, op = Session Run.
+  kOutfeedThreadSessionRun = 8;
+  // stage = Outfeed Thread, op = Forward Batch.
+  kOutfeedThreadForwardBatch = 9;
+}
+
+// Result proto for the host ops per TPU step.
+message HostOpsPerTpuStep {
+  // Whether the data in this message is valid.
+  optional bool valid = 1 [default = false];
+  // The current TPU step number.
+  optional uint32 tpu_step_num = 2;
+  // The beginning time of the current TPU step on the device in picoseconds.
+  optional uint64 tpu_step_begin_ps = 3;
+  // The ending time of the current TPU step on the device in picoseconds.
+  optional uint64 tpu_step_end_ps = 4;
+  // For each possible host operation, maps to the difference between the TPU
+  // step number that the host op targets and the current TPU step number.
+  // The key is HostOp, value is the step difference.
+  map<int32, int32> step_diffs = 5;
+}
+
+message HostOpsDetailsPerCore {
+  // Map from core id to HostOpsPerTpuStep.
+  map<int32, HostOpsPerTpuStep> core_map = 1;
+}
+
+message HostOpsDetailsPerHost {
+  // Map from hostname to a map from core id to HostOpsPerTpuStep.
+  map<string, HostOpsDetailsPerCore> host_map = 1;
+}
+
+// Result proto for the host ops for all TPU steps.
+message HostOpsResult {
+  reserved 1;  // (was repeated HostOpsPerTpuStep host_op_sequence)
+  // A sequence of records with one for each TPU step. Each record
+  // is a map from hostname to a map from core id to HostOpsPerTpuStep.
+  repeated HostOpsDetailsPerHost hostops_details = 2;
+}
+
 // Result proto for TfStatsHelper.
 message TfOpStats {
   // The result for the TF-metric database.
@@ -171,4 +241,10 @@ message TfOpStats {
   optional double matrix_unit_utilization_percent = 6;
   // The run environment of this profiling session.
   optional RunEnvironmentResult run_environment = 7;
+  // The result for the host operations.
+  optional HostOpsResult host_ops = 8;
+  // A map from core ID to name.
+  map<uint32, string> core_id_to_name_map = 9;
+  // The result for hw unit b stats.
+  optional bytes unit_b_stats = 10;
 }
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index f3f3302ceb3d27dbb21bdce753aeb2d7fcd77448..7be694e866729c58efae4ccf7932dd929c03ed91 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -21,6 +21,17 @@ message ProfileOptions {
   // next-field: 2
 }
 
+message ToolRequestOptions {
+  // Required formats for the tool, it should be one of "json", "proto", "raw"
+  // etc. If not specified (backward compatible), use default format, i.e. most
+  // tools use json format.
+  string output_formats = 2;
+
+  // Whether save the result directly to repository or pass it back to caller.
+  // Default to false for backward compatibilities.
+  bool save_to_repo = 3;
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -30,16 +41,30 @@ message ProfileRequest {
   // events.
   uint64 max_events = 2;
 
-  // required profiling tools name such as "input_pipeline_analyzer" etc
+  // Required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Specifies the requirement for each tools.
+  map<string, ToolRequestOptions> tool_options = 8;
+
   // Optional profiling options that control how a TF session will be profiled.
   ProfileOptions opts = 4;
 
+  // The place where we will dump profile data. We will normally use
+  // MODEL_DIR/plugin/profile/ as our repository root.
+  string repository_root = 5;
+
+  // The user provided profile session identifier.
+  string session_id = 6;
+
+  // The hostname of system where the profile should happen.
+  // We use it as identifier in part of our output filename.
+  string host_name = 7;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 5
+  // next-field: 9
 }
 
 message ProfileToolData {
@@ -71,5 +96,10 @@ message ProfileResponse {
 
   // Data payload for each required tools.
   repeated ProfileToolData tool_data = 6;
-  // next-field: 7
+
+  // When we write profiling data directly to repository directory, we need a
+  // way to figure out whether the captured trace is empty (due to idle TPU).
+  bool empty_trace = 7;
+
+  // next-field: 8
 }
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
new file mode 100644
index 0000000000000000000000000000000000000000..8b0bbde98e6a1dee8ade789328f3ba0624049562
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
@@ -0,0 +1,75 @@
+syntax = "proto3";
+package tensorflow;
+
+import "tensorflow/contrib/tpu/profiler/tpu_profiler.proto";
+
+message NewProfileSessionRequest {
+  ProfileRequest request = 1;
+  string repository_root = 2;
+  repeated string hosts = 3;
+  string session_id = 4;
+}
+
+message NewProfileSessionResponse {
+  // Auxiliary error_message.
+  string error_message = 1;
+
+  // Whether all hosts had returned a empty trace.
+  bool empty_trace = 2;
+}
+
+message EnumProfileSessionsAndToolsRequest {
+  string repository_root = 1;
+}
+
+message ProfileSessionInfo {
+  string session_id = 1;
+  // Which tool data is available for consumption.
+  repeated string available_tools = 2;
+}
+
+message EnumProfileSessionsAndToolsResponse {
+  // Auxiliary error_message.
+  string error_message = 1;
+  // If success, the returned sessions information are stored here.
+  repeated ProfileSessionInfo sessions = 2;
+}
+
+message ProfileSessionDataRequest {
+  string repository_root = 1;
+  string session_id = 2;
+  // Which tool
+  string tool_name = 3;
+  // Tool's specific parameters. e.g. TraceViewer's viewport etc
+  map<string, string> parameters = 4;
+}
+
+message ProfileSessionDataResponse {
+  // Auxiliary error_message.
+  string error_message = 1;
+
+  // Output format. e.g. "json" or "proto" or "blob"
+  string output_format = 2;
+
+  // TODO(jiesun): figure out whether to put bytes or oneof tool specific proto.
+  bytes output = 3;
+}
+////////////////////////////////////////////////////////////////////////////////
+// TPUProfileAnalysis service provide entry point for profiling TPU and for
+// serving profiled data to Tensorboard through GRPC
+////////////////////////////////////////////////////////////////////////////////
+service TPUProfileAnalysis {
+  // Starts a profiling session, blocks until it completes.
+  // TPUProfileAnalysis service delegate this to TPUProfiler service.
+  // Populate the profiled data in repository, then return status to caller.
+  rpc NewSession(NewProfileSessionRequest) returns (NewProfileSessionResponse) {
+  }
+  // Enumerate existing sessions and return available profile tools.
+  rpc EnumSessions(EnumProfileSessionsAndToolsRequest)
+      returns (EnumProfileSessionsAndToolsResponse) {
+  }
+  // Retrieve specific tool's data for specific session.
+  rpc GetSessionToolData(ProfileSessionDataRequest)
+      returns (ProfileSessionDataResponse) {
+  }
+}
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f5148828878b1c03bf35d1d11dc11942128b20c
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+#
+# Do not use pylint on generated code.
+# pylint: disable=missing-docstring,g-short-docstring-punctuation,g-no-space-after-docstring-summary,invalid-name,line-too-long,unused-argument,g-doc-args
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import grpc
+
+from tensorflow.contrib.tpu.profiler import tpu_profiler_analysis_pb2 as third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2
+
+
+class TPUProfileAnalysisStub(object):
+  """//////////////////////////////////////////////////////////////////////////////
+
+  TPUProfileAnalysis service provide entry point for profiling TPU and for
+  serving profiled data to Tensorboard through GRPC
+  //////////////////////////////////////////////////////////////////////////////
+  """
+
+  def __init__(self, channel):
+    """Constructor.
+
+    Args:
+      channel: A grpc.Channel.
+    """
+    self.NewSession = channel.unary_unary(
+        '/tensorflow.TPUProfileAnalysis/NewSession',
+        request_serializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        NewProfileSessionRequest.SerializeToString,
+        response_deserializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        NewProfileSessionResponse.FromString,
+    )
+    self.EnumSessions = channel.unary_unary(
+        '/tensorflow.TPUProfileAnalysis/EnumSessions',
+        request_serializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        EnumProfileSessionsAndToolsRequest.SerializeToString,
+        response_deserializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        EnumProfileSessionsAndToolsResponse.FromString,
+    )
+    self.GetSessionToolData = channel.unary_unary(
+        '/tensorflow.TPUProfileAnalysis/GetSessionToolData',
+        request_serializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        ProfileSessionDataRequest.SerializeToString,
+        response_deserializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        ProfileSessionDataResponse.FromString,
+    )
+
+
+class TPUProfileAnalysisServicer(object):
+  """//////////////////////////////////////////////////////////////////////////////
+
+  TPUProfileAnalysis service provide entry point for profiling TPU and for
+  serving profiled data to Tensorboard through GRPC
+  //////////////////////////////////////////////////////////////////////////////
+  """
+
+  def NewSession(self, request, context):
+    """Starts a profiling session, blocks until it completes.
+    TPUProfileAnalysis service delegate this to TPUProfiler service.
+    Populate the profiled data in repository, then return status to caller.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def EnumSessions(self, request, context):
+    """Enumerate existing sessions and return available profile tools.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def GetSessionToolData(self, request, context):
+    """Retrieve specific tool's data for specific session.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+
+def add_TPUProfileAnalysisServicer_to_server(servicer, server):
+  rpc_method_handlers = {
+      'NewSession':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.NewSession,
+              request_deserializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              NewProfileSessionRequest.FromString,
+              response_serializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              NewProfileSessionResponse.SerializeToString,
+          ),
+      'EnumSessions':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.EnumSessions,
+              request_deserializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              EnumProfileSessionsAndToolsRequest.FromString,
+              response_serializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              EnumProfileSessionsAndToolsResponse.SerializeToString,
+          ),
+      'GetSessionToolData':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.GetSessionToolData,
+              request_deserializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              ProfileSessionDataRequest.FromString,
+              response_serializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              ProfileSessionDataResponse.SerializeToString,
+          ),
+  }
+  generic_handler = grpc.method_handlers_generic_handler(
+      'tensorflow.TPUProfileAnalysis', rpc_method_handlers)
+  server.add_generic_rpc_handlers((generic_handler,))
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index dc6a934891138018d32d511750120453bdf290cf..618479e1a6ccf26a4103ea1f182b662d7d9998da 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.5.0"
+#define TPU_PROFILER_VERSION "1.6.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
index e1660985676e8c2efe3b01e32b48b211391885b7..fcfbbe1a213b6959b82c20beff02df48517b5e98 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "tpu_embedding_config_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 97876216793e0e6b20b7c072cac4f575b8fd48be..14c63a79763300dcfe8d6c8e09b90f8e9c772358 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -47,7 +47,7 @@ if platform.system() != "Windows":
   # types are supported.
 
   _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.bool, dtypes.int32, dtypes.bfloat16, dtypes.float32,
+      dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
       dtypes.complex64
   ])
 
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
index bde13f0527a1d8c5f71dd9684b93144ae07d60e4..15ce6aceec299adacd7025f0021cf8b6f6ef765b 100644
--- a/tensorflow/contrib/tpu/python/profiler/__init__.py
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.contrib.tpu.profiler.tpu_profiler_analysis_pb2 import *
 from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
 # pylint: enable=wildcard-import,unused-import
 
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa74f651aa63c72d14eb78c8af479263810e9b7d
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      '', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..26fd3768278cacd076e5fee8bdad75d0486678d0
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for bfloat16 helper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import bfloat16
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.platform import test
+
+
+class BFloat16ScopeTest(test.TestCase):
+
+  def testScopeName(self):
+    """Test if name for the variable scope is propogated correctly.
+    """
+    with bfloat16.bfloat16_scope() as bf:
+      self.assertEqual(bf.name, "")
+
+  def testRequestedDType(self):
+    """Test if requested dtype is honored in the getter.
+    """
+    with bfloat16.bfloat16_scope() as scope:
+      v1 = variable_scope.get_variable("v1", [])
+      self.assertEqual(v1.dtype.base_dtype, dtypes.float32)
+      v2 = variable_scope.get_variable("v2", [], dtype=dtypes.bfloat16)
+      self.assertEqual(v2.dtype.base_dtype, dtypes.bfloat16)
+      self.assertEqual([dtypes.float32, dtypes.float32],
+                       [v.dtype.base_dtype for v in scope.global_variables()])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e472a2805f98b15505f56af403aa6223e28c667
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -0,0 +1,184 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of Cloud TPU helper functions for data loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import functional_ops
+
+
+def _TextLineDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+def _TFRecordDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+_FILETYPE_MAP = {
+    'tfrecord': _TFRecordDataset,
+    'textline': _TextLineDataset,
+    'text': _TextLineDataset,
+}
+
+
+def StreamingFilesDataset(files,
+                          filetype=None,
+                          file_reader_job=None,
+                          worker_job=None,
+                          num_epochs=None,
+                          filename_shuffle_buffer_size=None,
+                          num_parallel_reads=None,
+                          batch_transfer_size=None,
+                          sloppy=None):
+  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
+
+  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
+  files local to your GCE VM. In order to train using files stored on your local
+  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
+  helper to generate a dataset to feed your Cloud TPU with files from your GCE
+  VM.
+
+  The resulting dataset may return an OutOfRangeError if there are no files
+  found as a result of the fileglob expansion.
+
+  Note: StreamingFilesDataset assumes that the session is using a
+  TPUClusterResolver and has therefore a worker and a coordinator job. File
+  loading will be done on the coordinator job.
+
+  Args:
+    files: A string glob to match files, or a `tf.data.Dataset` generating file
+      names.
+    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
+      TensorFlow function that when given a filename returns a dataset.
+    file_reader_job: An optional string that corresponds to the job that should
+      perform the file reads.
+    worker_job: An optional string that corresponds to the job that should
+      process the tensors (i.e. your GPU or TPU worker).
+    num_epochs: The number of epochs through the training set that should be
+      generated. By default, it will repeat infinitely.
+    filename_shuffle_buffer_size: An optional integer whose value controls the
+      shuffling of the file names. If you would like to read from the files in
+      the same order, set to 0 or False.
+    num_parallel_reads: An optional integer controlling the number of files to
+      read from concurrently. (Set to 1 for no parallelism.)
+    batch_transfer_size: An optional integer controlling the batching used to
+      amortize the remote function invocation overhead. Set to a very large
+      number to increase throughput. Set to a very small number to reduce memory
+      consumption. Set to False to skip batching.
+    sloppy: (Optional.) If `False`, read input data while maintaining a
+      deterministic order. (This may have significant performance impacts.)
+      sloppy defaults to: True.
+  Returns:
+    A `tf.data.Dataset` with an infinite stream of elements generated by a
+    parallel interleaving of the set of files matched (or generated) by `files`
+    with a type is the output of the dataset specified by `filetype`.
+
+  Raises:
+    ValueError: if any argument is not of the expected type.
+  """
+  if filetype is None:
+    filetype = 'tfrecord'
+
+  if isinstance(filetype, str):
+    if filetype not in _FILETYPE_MAP:
+      raise ValueError('Unexpected filetype: %s' % filetype)
+    reader_fn = _FILETYPE_MAP[filetype]
+  elif callable(filetype):
+    reader_fn = filetype
+  else:
+    raise ValueError('filetype should be a string or a callable')
+
+  file_reader_job = file_reader_job or 'coordinator'
+
+  worker_job = worker_job or 'worker'
+
+  if filename_shuffle_buffer_size is None:
+    filename_shuffle_buffer_size = 4096
+
+  num_parallel_reads = num_parallel_reads or 8
+
+  if batch_transfer_size is None:
+    batch_transfer_size = 256
+
+  if sloppy is None:
+    sloppy = True
+
+  with ops.device('/job:%s' % file_reader_job):
+    if isinstance(files, str):
+      source_dataset = dataset_ops.Dataset.list_files(files)
+    elif isinstance(files, dataset_ops.Dataset):
+      source_dataset = files
+    else:
+      raise ValueError('files was not a string or a dataset: %s' % files)
+
+    if filename_shuffle_buffer_size:
+      source_dataset = source_dataset.shuffle(
+          buffer_size=filename_shuffle_buffer_size)
+
+    # NOTE: We perform the `repeat` on the source dataset, because the output
+    # dataset does not currently have enough information to recreate an iterator
+    # over the source dataset when it reaches the end.
+    source_dataset = source_dataset.repeat(num_epochs)
+
+    source_dataset = source_dataset.apply(
+        interleave_ops.parallel_interleave(
+            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+    if batch_transfer_size:
+      source_dataset = source_dataset.batch(batch_transfer_size)
+
+    source_dataset = source_dataset.prefetch(1)
+
+    source_iterator = source_dataset.make_one_shot_iterator()
+    source_handle = source_iterator.string_handle()
+
+  @function.Defun(dtypes.string)
+  def LoadingFunc(h):
+    remote_iterator = iterator_ops.Iterator.from_string_handle(
+        h, source_dataset.output_types, source_dataset.output_shapes)
+    return remote_iterator.get_next()
+
+  def MapFn(unused_input):
+    return functional_ops.remote_call(
+        args=[source_handle],
+        Tout=[dtypes.string],
+        f=LoadingFunc,
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0]
+
+  with ops.device('/job:%s' % worker_job):
+    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
+        MapFn, num_parallel_calls=4 if sloppy else None)
+    output_dataset = output_dataset.prefetch(1)
+
+    if batch_transfer_size:
+      # Undo the batching used during the transfer.
+      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
+
+  return output_dataset
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..918cf0ed8e513de0d4207f7d2aac61ad886c8288
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -0,0 +1,181 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU datasets tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.tpu.python.tpu import datasets
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
+
+_NUM_FILES = 10
+_NUM_ENTRIES = 20
+
+
+class DatasetsTest(test.TestCase):
+
+  def setUp(self):
+    super(DatasetsTest, self).setUp()
+    self._coord = server_lib.Server.create_local_server()
+    self._worker = server_lib.Server.create_local_server()
+
+    self._cluster_def = cluster_pb2.ClusterDef()
+    worker_job = self._cluster_def.job.add()
+    worker_job.name = 'worker'
+    worker_job.tasks[0] = self._worker.target[len('grpc://'):]
+    coord_job = self._cluster_def.job.add()
+    coord_job.name = 'coordinator'
+    coord_job.tasks[0] = self._coord.target[len('grpc://'):]
+
+    session_config = config_pb2.ConfigProto(cluster_def=self._cluster_def)
+
+    self._sess = session.Session(self._worker.target, config=session_config)
+
+  def testTextLineDataset(self):
+    all_contents = []
+    for i in range(_NUM_FILES):
+      filename = os.path.join(self.get_temp_dir(), 'text_line.%d.txt' % i)
+      contents = []
+      for j in range(_NUM_ENTRIES):
+        contents.append(compat.as_bytes('%d: %d' % (i, j)))
+      with open(filename, 'wb') as f:
+        f.write(b'\n'.join(contents))
+      all_contents.extend(contents)
+
+    dataset = datasets.StreamingFilesDataset(
+        os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text')
+
+    iterator = dataset.make_initializable_iterator()
+    self._sess.run(iterator.initializer)
+    get_next = iterator.get_next()
+
+    retrieved_values = []
+    for _ in range(4 * len(all_contents)):
+      retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))
+
+    self.assertEqual(set(all_contents), set(retrieved_values))
+
+  def testTFRecordDataset(self):
+    all_contents = []
+    for i in range(_NUM_FILES):
+      filename = os.path.join(self.get_temp_dir(), 'tf_record.%d' % i)
+      writer = python_io.TFRecordWriter(filename)
+      for j in range(_NUM_ENTRIES):
+        record = compat.as_bytes('Record %d of file %d' % (j, i))
+        writer.write(record)
+        all_contents.append(record)
+      writer.close()
+
+    dataset = datasets.StreamingFilesDataset(
+        os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord')
+
+    iterator = dataset.make_initializable_iterator()
+    self._sess.run(iterator.initializer)
+    get_next = iterator.get_next()
+
+    retrieved_values = []
+    for _ in range(4 * len(all_contents)):
+      retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))
+
+    self.assertEqual(set(all_contents), set(retrieved_values))
+
+  def testTFRecordDatasetFromDataset(self):
+    filenames = []
+    all_contents = []
+    for i in range(_NUM_FILES):
+      filename = os.path.join(self.get_temp_dir(), 'tf_record.%d' % i)
+      filenames.append(filename)
+      writer = python_io.TFRecordWriter(filename)
+      for j in range(_NUM_ENTRIES):
+        record = compat.as_bytes('Record %d of file %d' % (j, i))
+        writer.write(record)
+        all_contents.append(record)
+      writer.close()
+
+    filenames = dataset_ops.Dataset.from_tensor_slices(filenames)
+
+    dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')
+
+    iterator = dataset.make_initializable_iterator()
+    self._sess.run(iterator.initializer)
+    get_next = iterator.get_next()
+
+    retrieved_values = []
+    for _ in range(4 * len(all_contents)):
+      retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))
+
+    self.assertEqual(set(all_contents), set(retrieved_values))
+
+  def testArbitraryReaderFunc(self):
+
+    def MakeRecord(i, j):
+      return compat.as_bytes('%04d-%04d' % (i, j))
+
+    record_bytes = len(MakeRecord(10, 200))
+
+    all_contents = []
+    for i in range(_NUM_FILES):
+      filename = os.path.join(self.get_temp_dir(), 'fixed_length.%d' % i)
+      with open(filename, 'wb') as f:
+        for j in range(_NUM_ENTRIES):
+          record = MakeRecord(i, j)
+          f.write(record)
+          all_contents.append(record)
+
+    def FixedLengthFile(filename):
+      return readers.FixedLengthRecordDataset(filename, record_bytes)
+
+    dataset = datasets.StreamingFilesDataset(
+        os.path.join(self.get_temp_dir(), 'fixed_length*'),
+        filetype=FixedLengthFile)
+
+    iterator = dataset.make_initializable_iterator()
+    self._sess.run(iterator.initializer)
+    get_next = iterator.get_next()
+
+    retrieved_values = []
+    for _ in range(4 * len(all_contents)):
+      retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))
+
+    self.assertEqual(set(all_contents), set(retrieved_values))
+
+  def testUnexpectedFiletypeString(self):
+    with self.assertRaises(ValueError):
+      datasets.StreamingFilesDataset(
+          os.path.join(self.get_temp_dir(), '*'), filetype='foo')
+
+  def testUnexpectedFiletypeType(self):
+    with self.assertRaises(ValueError):
+      datasets.StreamingFilesDataset(
+          os.path.join(self.get_temp_dir(), '*'), filetype=3)
+
+  def testUnexpectedFilesType(self):
+    with self.assertRaises(ValueError):
+      datasets.StreamingFilesDataset(123, filetype='tfrecord')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index bdd9b88af55fa4fb483ddbdbe5c51d7076cce675..726b2d248e3086e1882004827076ed3e563d960d 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -191,9 +191,9 @@ class DeviceAssignment(object):
       logical_core: A tuple of three integers which represents a logical core.
     Returns:
       A sorted list of the replicas that are attached to that task and
-      loical_core.
+      logical_core.
     Raises:
-      ValueError: If no replica exisis in the task which contains the logical
+      ValueError: If no replica exists in the task which contains the logical
       core.
     """
     try:
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1d8d38a9a0e68719380d062a2357930b2f5f167
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -0,0 +1,426 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""*Experimental* support for running Keras models on the TPU.
+
+To use, wrap your model with the `keras_support.tpu_model` function.
+
+Example usage:
+
+```
+# Must activate before building TPU models
+keras_support.setup_tpu_session(master_address)
+
+image = tf.keras.layers.Input(shape=(28, 28, 3), name='image')
+c1 = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3))( image)
+flattened = tf.keras.layers.Flatten()(c1)
+logits = tf.keras.layers.Dense(10, activation='softmax')(flattened)
+model = tf.keras.Model(inputs=[image], outputs=[logits])
+model = keras_support.tpu_model(model)
+
+# Only TF optimizers are currently supported.
+model.compile(optimizer=tf.train.AdamOptimizer(), ...)
+
+# `images` and `labels` should be Numpy arrays.  Support for tensor input
+# (e.g. datasets) is planned.
+model.fit(images, labels)
+
+# Invoke before shutting down
+keras_support.shutdown_tpu_session()
+```
+"""
+
+# pylint: disable=protected-access
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import layers
+from tensorflow.python.keras._impl.keras import models
+from tensorflow.python.keras._impl.keras import optimizers as keras_optimizers
+from tensorflow.python.keras._impl.keras.layers import embeddings
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+
+
+class TPUEmbedding(embeddings.Embedding):
+  """TPU compatible embedding layer.
+
+  The default Keras layer is not TPU compatible.  This layer is a drop-in
+  replacement: it has the same behavior and will work on CPU and GPU devices.
+  """
+
+  def __init__(self, *args, **kw):
+    super(TPUEmbedding, self).__init__(*args, **kw)
+
+  def build(self, input_shape):
+    if input_shape[0] is None:
+      raise ValueError(
+          'TPUEmbeddings must have a fixed input_length or input shape.')
+    return super(TPUEmbedding, self).build(input_shape)
+
+  def call(self, inputs):
+    if K.dtype(inputs) != 'int32':
+      inputs = math_ops.cast(inputs, 'int32')
+
+    inputs = array_ops.one_hot(inputs, self.input_dim)
+    return math_ops.tensordot(inputs, self.embeddings, 1)
+
+
+class CompiledTPUOp(
+    collections.namedtuple(
+        'CompiledTPUOp',
+        ['tpu_execute_op', 'infeed_tensors', 'infeed_op', 'outfeed_op'])):
+  pass
+
+
+def _valid_name(tensor_name):
+  """Return a valid tensor name (strips '/', ':', etc)."""
+  return re.sub('[^a-zA-Z0-9_-]+', '', tensor_name)
+
+
+class TPUFunction(object):
+  """K.function compatible interface for invoking a TPU compiled function.
+
+  Recompilation is triggered on-demand for each set of new inputs shapes: the
+  results are cached for future execution.  We expect most computations will
+  be dominated by a standard batch-size, followed by a straggler batch for
+  the end of training or evaluation.
+
+  All `inputs` and `outputs` will be loaded via the infeed and outfeed queues
+  instead of being injected as `feed_dict` items or fetches.
+  """
+
+  def __init__(self, model, execution_mode):
+    self.model = model
+    self.execution_mode = execution_mode
+    self._compilation_cache = {}
+
+  def _specialize_model(self, input_specs):
+    """Specialize `self.model` (a Keras model) for the given input shapes."""
+    # Re-create our input and output layers inside our subgraph.  They will be
+    # attached to the true computation when we clone our model in `tpu_fn`.
+    K.set_learning_phase(
+        self.execution_mode == model_fn_lib.ModeKeys.TRAIN
+    )
+
+    # functools.partial and callable objects are not supported by tpu.rewrite
+    def _model_fn():
+      """Compute fit/eval/predict for the TPU."""
+      is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN
+      is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL
+      is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT
+
+      # During train/eval, we infeed our features as well as labels.
+      if is_training or is_test:
+        infeed_layers = self.model._input_layers + self.model._output_layers
+      else:
+        infeed_layers = self.model._input_layers
+
+      # Generate our infeed operation to read features & labels.
+      infeed_tensors = tpu_ops.infeed_dequeue_tuple(
+          dtypes=[spec.dtype for spec in input_specs],
+          shapes=[spec.shape for spec in input_specs],
+          name='infeed-%s' % self.execution_mode)
+
+      assert len(infeed_tensors) == len(infeed_layers), (
+          'Infeed inputs did not match model: %s vs %s', (infeed_layers,
+                                                          infeed_tensors))
+
+      tpu_targets = []
+      tpu_inputs = []
+
+      # Sort infeed outputs into inputs and labels for calling our Keras model.
+      for tensor, layer in zip(infeed_tensors, infeed_layers):
+        if layer in self.model._input_layers:
+          tpu_inputs.append(layers.Input(name=layer.name, tensor=tensor))
+        if layer in self.model._output_layers:
+          tpu_targets.append(tensor)
+
+      # Call our model with our infeed inputs (re-using the weights).
+      model_outputs = self.model(tpu_inputs)
+      child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs)
+      if is_training or is_test:
+        child_model.compile(
+            optimizer=self.model.optimizer,
+            loss=self.model.loss,
+            loss_weights=self.model.loss_weights,
+            metrics=self.model.metrics,
+            weighted_metrics=self.model.weighted_metrics,
+            target_tensors=tpu_targets,
+        )
+
+      # Compute our outfeed depending on the execution mode
+      if is_training:
+        child_model._make_train_function()
+        self._outfeed_spec = [
+            tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
+            for tensor in child_model.train_function.outputs
+        ]
+        return [
+            child_model.train_function.updates_op,
+            tpu_ops.outfeed_enqueue_tuple(
+                child_model.train_function.outputs, name='oufeed-enqueue-train')
+        ]
+      elif is_test:
+        child_model._make_test_function()
+        self._outfeed_spec = [
+            tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
+            for tensor in child_model.test_function.outputs
+        ]
+        return [
+            tpu_ops.outfeed_enqueue_tuple(
+                child_model.test_function.outputs, name='outfeed-enqueue-test')
+        ]
+      elif is_predict:
+        child_model._make_predict_function()
+        self._outfeed_spec = [
+            tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
+            for tensor in child_model.predict_function.outputs
+        ]
+        return [
+            tpu_ops.outfeed_enqueue_tuple(
+                child_model.predict_function.outputs,
+                name='outfeed-enqueue-predict',
+            )
+        ]
+      else:
+        assert False, 'Unexpected execution mode: %s' % self.execution_mode
+
+    # Capture outfeed metadata computed during the rewrite.
+    self._outfeed_spec = None
+
+    tpu_execute_op = tpu.rewrite(_model_fn)
+
+    # Generate CPU side operations to enqueue features/labels and dequeue
+    # outputs from the model call.
+    with ops.device('/device:TPU:0'):
+      infeed_tensors = []
+      for spec in input_specs:
+        infeed_tensors.append(
+            array_ops.placeholder(
+                dtype=spec.dtype,
+                shape=spec.shape,
+                name='infeed-enqueue-%s' % spec.name))
+
+      infeed_op = tpu_ops.infeed_enqueue_tuple(
+          infeed_tensors, [spec.shape for spec in input_specs],
+          name='infeed-enqueue-%s' % self.execution_mode)
+
+      outfeed_op = tpu_ops.outfeed_dequeue_tuple(
+          dtypes=[spec.dtype for spec in self._outfeed_spec],
+          shapes=[spec.shape for spec in self._outfeed_spec],
+          name='outfeed-dequeue-%s' % self.execution_mode)
+
+    return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op, outfeed_op)
+
+  def __call__(self, inputs):
+    assert isinstance(inputs, list)
+
+    # Strip sample weight from inputs
+    if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or
+        self.execution_mode == model_fn_lib.ModeKeys.EVAL):
+      input_tensors = self.model._feed_inputs + self.model._feed_targets
+      inputs = inputs[:len(input_tensors)]
+    else:
+      input_tensors = self.model._feed_inputs
+
+    # Compute an input specification (used to generate infeed enqueue and
+    # dequeue operations).  We use the shape from our input array and the
+    # dtype from our model.  A user may pass in a float64 for a float32
+    # input: for model compatibility we still must generate a float32 infeed.
+    input_specs = []
+    for tensor, ary in zip(input_tensors, inputs):
+      input_specs.append(
+          tensor_spec.TensorSpec(ary.shape, tensor.dtype,
+                                 _valid_name(tensor.name)))
+
+    # XLA requires every operation in the graph has a fixed shape.  To
+    # handle varying batch sizes we recompile a new sub-graph for each
+    # unique input shape.
+    shape_key = tuple([tuple(spec.shape.as_list()) for spec in input_specs])
+
+    if shape_key not in self._compilation_cache:
+      logging.info('New input shapes; (re-)compiling: mode=%s, %s',
+                   self.execution_mode, input_specs)
+      self._compilation_cache[shape_key] = self._specialize_model(input_specs)
+
+    compiled_model = self._compilation_cache[shape_key]
+
+    infeed_dict = {}
+    for tensor, value in zip(compiled_model.infeed_tensors, inputs):
+      infeed_dict[tensor] = value
+
+    session = K.get_session()
+    _, _, outfeed_outputs = session.run([
+        compiled_model.infeed_op, compiled_model.tpu_execute_op,
+        compiled_model.outfeed_op
+    ], infeed_dict)
+
+    return outfeed_outputs
+
+
+@experimental
+def setup_tpu_session(master):
+  """Initializes and returns a Keras/TF session connected the TPU `master`."""
+  session = tf_session.Session(
+      target=master, config=config_pb2.ConfigProto(isolate_session_state=True))
+  K.set_session(session)
+  K.get_session().run(tpu.initialize_system())
+  return session
+
+
+@experimental
+def shutdown_tpu_session(session=None):
+  """Shutdown the TPU attached to session.
+
+  This should be called to cleanly shut down the TPU system before the client
+  exits.
+
+  Args:
+    session: Session to shutdown, or None to use the default session.
+
+  Returns:
+
+  """
+  if session is None:
+    session = K.get_session()
+
+  session.run(tpu.shutdown_system())
+
+
+class KerasTPUModel(models.Model):
+  """TPU compatible Keras model wrapper."""
+
+  def __init__(self, inputs, outputs, name=None):
+    super(models.Model, self).__init__(
+        inputs=inputs,
+        outputs=outputs,
+        name=name,
+    )
+    self.predict_function = None
+    self.test_function = None
+    self.train_function = None
+
+  def compile(self,
+              optimizer,
+              loss=None,
+              metrics=None,
+              loss_weights=None,
+              sample_weight_mode=None,
+              weighted_metrics=None,
+              target_tensors=None,
+              **kwargs):
+    if sample_weight_mode:
+      raise ValueError('sample_weight_mode not supported for TPU execution.')
+    if weighted_metrics:
+      raise ValueError('weighted_metrics not supported for TPU execution.')
+    if target_tensors:
+      raise ValueError('target_tensors is not supported for TPU execution.')
+
+    super(KerasTPUModel, self).compile(optimizer, loss, metrics, loss_weights,
+                                       sample_weight_mode, weighted_metrics,
+                                       target_tensors, **kwargs)
+
+    # Keras optimizers are not compatible with TPU rewrite
+    if not isinstance(self.optimizer, keras_optimizers.TFOptimizer):
+      raise ValueError(
+          'Optimizer must be a TFOptimizer, got: %s' % self.optimizer)
+
+  def _make_train_function(self):
+    if not self.train_function:
+      self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN)
+
+    return self.train_function
+
+  def _make_test_function(self):
+    if not self.test_function:
+      self.test_function = TPUFunction(self, model_fn_lib.ModeKeys.EVAL)
+    return self.test_function
+
+  def _make_predict_function(self):
+    if not self.predict_function:
+      self.predict_function = TPUFunction(self, model_fn_lib.ModeKeys.PREDICT)
+    return self.predict_function
+
+  def cpu_model(self):
+    cpu_model = models.Model(
+        inputs=self.inputs,
+        outputs=self.outputs,
+        name=self.name,
+    )
+
+    if self.optimizer:
+      cpu_model.compile(
+          optimizer=self.optimizer,
+          loss=self.loss,
+          metrics=self.metrics,
+          loss_weights=self.loss_weights,
+      )
+
+    return cpu_model
+
+
+def _validate_shapes(model):
+  """Validate that all layers in `model` have constant shape."""
+  for layer in model.layers:
+    if isinstance(layer.input_shape, tuple):
+      input_shapes = [layer.input_shape]
+    else:
+      input_shapes = layer.input_shape
+
+    if isinstance(layer.output_shape, tuple):
+      output_shapes = [layer.output_shape]
+    else:
+      output_shapes = layer.output_shape
+
+    for shape in input_shapes + output_shapes:
+      for dim in shape[1:]:
+        if dim is None:
+          raise ValueError(
+              """
+Layer %(layer)s has a variable shape in a non-batch dimension.  TPU models must
+have constant shapes for all operations.
+
+You may have to specify `input_length` for RNN/TimeDistributed layers.
+
+Layer: %(layer)s
+Input shape: %(input_shape)s
+Output shape: %(output_shape)s
+  """ % {
+      'layer': layer,
+      'input_shape': layer.input_shape,
+      'output_shape': layer.output_shape
+      })
+
+
+@experimental
+def tpu_model(model):
+  _validate_shapes(model)
+  return KerasTPUModel(
+      inputs=model.inputs, outputs=model.outputs, name=model.name)
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c25f6693cd27df3daee59882b22bc2abdbe6ad5
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -0,0 +1,311 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Operations for handling session logging and shutdown notifications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import time
+from google.protobuf import text_format
+
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+
+
+class CoordinatorShutdownException(Exception):
+  """Raised when the coordinator needs to shutdown."""
+  pass
+
+
+class WorkerHeartbeatManager(object):
+  """Manages the status/heartbeat monitor for a set of workers."""
+
+  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
+    """Construct a new WorkerHeartbeatManager.
+
+    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
+
+    Args:
+      session: `tf.Session`, session to use for heartbeat operations.
+      devices: `list[string]` Set of devices to connect to.
+      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
+      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
+        the WorkerHeartbeatRequest protocol buffer.
+    """
+    self._session = session
+    self._devices = devices
+    self._ops = heartbeat_ops
+    self._request_placeholder = request_placeholder
+
+  @staticmethod
+  def from_devices(session, devices):
+    """Construct a heartbeat manager for the given devices."""
+    if not devices:
+      logging.error('Trying to create heartbeat manager with no devices?')
+
+    logging.info('Creating heartbeat manager for %s', devices)
+    request_placeholder = array_ops.placeholder(
+        name='worker_heartbeat_request', dtype=dtypes.string)
+
+    heartbeat_ops = []
+    for device in devices:
+      with ops.device(device):
+        heartbeat_ops.append(tpu_ops.worker_heartbeat(request_placeholder))
+
+    return WorkerHeartbeatManager(session, devices, heartbeat_ops,
+                                  request_placeholder)
+
+  def configure(self, message):
+    """Configure heartbeat manager for all devices.
+
+    Args:
+      message: `event_pb2.WorkerHeartbeatRequest`
+
+    Returns: `None`
+
+    """
+    logging.info('Configuring worker heartbeat: %s',
+                 text_format.MessageToString(message))
+    self._session.run(self._ops,
+                      {self._request_placeholder: message.SerializeToString()})
+
+  def ping(self, request=None, timeout_in_ms=5000):
+    """Ping all workers, returning the parsed status results."""
+    if request is None:
+      request = event_pb2.WorkerHeartbeatRequest()
+
+    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
+    results = self._session.run(
+        self._ops,
+        feed_dict={self._request_placeholder: request.SerializeToString()},
+        options=options)
+    parsed_results = [
+        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
+        for res_pb in results
+    ]
+    logging.info('Results: %s', parsed_results)
+    return parsed_results
+
+  def lame_workers(self):
+    """Ping all workers, returning manager containing lame workers (or None)."""
+    ping_results = self.ping()
+    lame_workers = []
+
+    for ping_response, device, op in zip(ping_results, self._devices,
+                                         self._ops):
+      if ping_response.health_status != event_pb2.OK:
+        lame_workers.append((device, op))
+
+    if not lame_workers:
+      return None
+
+    bad_devices, bad_ops = zip(*lame_workers)
+    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
+                                  self._request_placeholder)
+
+  def shutdown(self, timeout_ms=10000):
+    """Shutdown all workers after `shutdown_timeout_secs`."""
+    req = event_pb2.WorkerHeartbeatRequest(
+        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms))
+    self.configure(req)
+
+
+def all_worker_devices(session):
+  """Return a list of devices for each worker in the system."""
+  devices = session.list_devices()
+  return [device.name for device in devices if 'CPU' in device.name]
+
+
+class WatchdogManager(threading.Thread):
+  """Configures worker watchdog timer and handles periodic pings.
+
+  Usage:
+    # Ping workers every minute, shutting down workers if they haven't received
+    # a ping after 1 hour.
+    watchdog_manager = WatchdogManager(
+      ping_interval=60, shutdown_timeout=3600
+    )
+
+    # Use as a context manager, resetting watchdog on context exit:
+    with watchdog_manager:
+      session.run(...)
+
+    # Or setup globally; watchdog will remain active until program exit.
+    watchdog_manager.configure_and_run()
+  """
+
+  def __init__(self,
+               session,
+               devices=None,
+               ping_interval=60,
+               shutdown_timeout=3600):
+    """Initialize a watchdog manager.
+
+    Args:
+
+      session: Session connected to worker devices.  A cloned session and graph
+        will be created for managing worker pings.
+      devices: Set of devices to monitor.  If none, all workers will be
+        monitored.
+      ping_interval: Time, in seconds, between watchdog pings.
+      shutdown_timeout: Time, in seconds, before watchdog timeout.
+    """
+    threading.Thread.__init__(self)
+    self.ping_interval = ping_interval
+    self.shutdown_timeout = shutdown_timeout
+    self.daemon = True
+    self._running = False
+    self._graph = ops.Graph()
+    self._session = session_lib.Session(
+        target=session.sess_str, graph=self._graph)
+
+    with self._graph.as_default():
+      if devices is None:
+        devices = all_worker_devices(self._session)
+      self._worker_manager = WorkerHeartbeatManager.from_devices(
+          self._session, devices)
+
+  def configure_and_run(self):
+    logging.info('Enabling worker watchdog.')
+    self._running = True
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(
+                timeout_ms=self.shutdown_timeout * 1000,)))
+
+    self.start()
+
+  def __enter__(self):
+    self.configure_and_run()
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    logging.info('Disabling worker watchdog.')
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,)))
+    self._running = False
+    self.join()
+
+  def run(self):
+    # Don't fetch logs or adjust timing: just ping the watchdog.
+    while self._running:
+      self._worker_manager.ping(request=None)
+      time.sleep(self.ping_interval)
+
+
+class GracefulShutdownHook(session_run_hook.SessionRunHook):
+  """Session hook that watches for shutdown events.
+
+  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
+  SystemShutdown exception is raised to terminate the main session.  If `saver`
+  is None the `SAVERS` collection will be read to find a saver.
+
+  `on_shutdown_hooks` is an optional list of functions that should be called
+  after checkpointing.  The function is called with (`run_context`,
+  `all_workers`, `lame_workers`).
+
+  If `heartbeat_group` is not specified, it will default to all CPU workers
+  in the system.
+  """
+
+  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
+    self._saver = saver
+    self._checkpoint_prefix = checkpoint_prefix
+    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
+
+    # Worker heartbeats are managed independently of the main training graph.
+    self._graph = ops.Graph()
+    self._workers = None
+    self._session = None
+
+  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
+    # N.B. We have to pull the global step here to avoid it being unavailable
+    # at checkpoint time; the graph has been frozen at that point.
+    if training_util.get_global_step() is None and self.saver() is not None:
+      raise ValueError(
+          'Saver defined but no global step.  Run `get_or_create_global_step()`'
+          ' in your model definition to allow checkpointing.')
+
+    with self._graph.as_default():
+      self._session = session_lib.Session(
+          target=training_session.sess_str, graph=self._graph)
+      self._workers = WorkerHeartbeatManager.from_devices(
+          self._session, all_worker_devices(self._session))
+
+      self._workers.configure(
+          event_pb2.WorkerHeartbeatRequest(
+              shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+
+  def saver(self):
+    if self._saver:
+      return self._saver
+
+    savers = ops.get_collection(ops.GraphKeys.SAVERS)[0]
+    if not savers:
+      return None
+
+    if not isinstance(savers, list):
+      return savers
+
+    assert len(savers) == 1, 'Only one saver supported.'
+    return savers[0]
+
+  def after_run(self, run_context, run_values):
+    del run_values
+
+    lame_workers = self._workers.lame_workers()
+    if lame_workers:
+      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
+
+      if self.saver():
+        logging.info('ShutdownHook: saving checkpoint to %s',
+                     self._checkpoint_prefix)
+        self.saver().save(
+            run_context.session,
+            self._checkpoint_prefix,
+            global_step=training_util.get_global_step(),
+            write_state=True,
+        )
+      else:
+        logging.info('ShutdownHook: no Saver defined.')
+
+      for fn in self._on_shutdown_hooks:
+        fn(run_context, self._workers, lame_workers)
+
+
+def restart_computation(run_context, all_workers, lame_workers):
+  del run_context, lame_workers
+  logging.info('Shutting down all workers.')
+  all_workers.shutdown()
+
+  logging.info('Terminating coordinator.')
+  raise CoordinatorShutdownException()
+
+
+def shutdown_lame_workers(run_context, all_workers, lame_workers):
+  del run_context, all_workers
+  logging.info('Shutting down %s', lame_workers)
+  lame_workers.shutdown()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index d5f54ff4fd278f0c84f79e0079bfb7a409dfba8d..7b8786304ccf7e707712f178838cafb8346d2941 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -25,6 +25,8 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -56,6 +58,7 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
 def _tpu_system_device_name(job):
@@ -121,8 +124,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name):
+  def __init__(self, name, num_replicas):
     super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
 
@@ -136,6 +147,125 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
         logging.warning("... and %d more" %
                         (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an
+          # outside_compilation cluster C in a forward computation we
+          # would like to put the ops corresponding to the gradient of
+          # X into a new outside_compilation cluster C'. However, if
+          # we take the gradient of X twice, the second one should get
+          # yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is
+          # the cluster that X was in before we took gradients, and a
+          # 'gradient_uid' which is different for every invocation of
+          # gradients, and put the gradient of X in cluster
+          # 'root_cluster.gradient_uid'.
+          #
+          # When taking a gradient of a gradient, some ops will be
+          # colocated with Op in the forward pass (e.g., cluster
+          # root_cluster) and some in the backward pass (e.g., cluster
+          # root_cluster.initial_gradient_uid). We need all of the
+          # grad-of-grad ops to be in the same cluster to avoid cyclic
+          # dependencies between clusters. We adopt a heuristic that
+          # puts any op clustered with root_cluster.<xxx> in
+          # root_cluster.gradient_uid, even if xxx was
+          # initial_gradient_uid.
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          cluster = parts[0] + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        self._device = device.to_string()
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def Exit(self):
+    super(TPUReplicateContext, self).Exit()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
   def AddOp(self, op):
     self._AddOpInternal(op)
 
@@ -157,9 +287,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise ValueError("TPU computations cannot be nested")
     op._set_attr(_TPU_REPLICATE_ATTR,
                  attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
-    # pylint: enable=protected-access
-    op.graph.prevent_feeding(op)
-    op.graph.prevent_fetching(op)
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
 
   def AddValue(self, val):
     result = val
@@ -181,6 +318,45 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
+def outside_compilation(computation, args=None):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    args: Inputs to pass to computation.
+  Returns:
+    The Tensors returned by computation.
+  """
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
@@ -201,7 +377,7 @@ def replicate(computation,
       `DeviceAssignment` may be omitted if each replica of the computation uses
       only one core, and there is either only one replica, or the number of
       replicas is equal to the number of cores in the TPU system.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of lists of output tensors, indexed by `[replica_num][output_num]`.
   Raises:
@@ -209,8 +385,7 @@ def replicate(computation,
     ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
   """
-  if name is None:
-    name = "TPUReplicate"
+  del name
   inputs = [[]] if inputs is None else inputs
 
   metadata_kwargs = {}
@@ -274,118 +449,124 @@ def replicate(computation,
 
   graph = ops.get_default_graph()
 
-  with ops.name_scope(name, "replicate"):
-    # Fan-in: Builds a TPUReplicatedInput node for each input.
-    computation_inputs = []
-    for i in range(0, input_arity):
-      replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
-      computation_inputs.append(
-          tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+  # Fan-in: Builds a TPUReplicatedInput node for each input.
+  computation_inputs = []
+  for i in range(0, input_arity):
+    replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
+    computation_inputs.append(
+        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+
+  context = TPUReplicateContext(
+      name=graph.unique_name("cluster"), num_replicas=num_replicas)
+  try:
+    context.Enter()
+
+    metadata = tpu_ops.tpu_replicate_metadata(
+        num_replicas=num_replicas, **metadata_kwargs)
+
+    with tpu_function.tpu_shard_context(
+        num_replicas), ops.control_dependencies([metadata]):
+
+      # The EncapsulateTPUComputations rewrite needs to identify the
+      # replicated arguments inside each computation. Adds identity operators
+      # tagged with an attribute _tpu_replicated_input to identify the
+      # replicated inputs.
+      # pylint: disable=protected-access
+      with graph._attr_scope({"_tpu_replicated_input":
+                              attr_value_pb2.AttrValue(b=True)}):
+        computation_inputs = [
+            array_ops.identity(x, name="replicated_input_{}".format(i))
+            for i, x in enumerate(computation_inputs)]
+      # pylint: enable=protected-access
+
+      # If there is an infeed queue, adds the dequeued values to the
+      # computation's inputs.
+      if infeed_queue is not None:
+        infeed_queue.set_number_of_shards(num_replicas)
+        for t in infeed_queue.generate_dequeue_op():
+          computation_inputs.append(t)
+
+      # Only resource variables work inside a TPU computation, so turn on
+      # resource variables for the computation.
+      # TODO(phawkins): consider removing this code. It will
+      # be less confusing to clients if they knowingly choose to use resource
+      # variables.
+      vscope = variable_scope.get_variable_scope()
+      saved_use_resource = vscope.use_resource
+      vscope.set_use_resource(True)
+
+      outputs = computation(*computation_inputs)
+
+      vscope.set_use_resource(saved_use_resource)
+
+    # If the computation only returned one value, makes it a tuple.
+    if not isinstance(outputs, (list, tuple)):
+      outputs = (outputs,)
 
-    context = TPUReplicateContext(name=graph.unique_name("cluster"))
     try:
-      context.Enter()
-
-      metadata = tpu_ops.tpu_replicate_metadata(
-          num_replicas=num_replicas, **metadata_kwargs)
-
-      with tpu_function.tpu_shard_context(
-          num_replicas), ops.control_dependencies([metadata]):
-
-        # The EncapsulateTPUComputations rewrite needs to identify the
-        # replicated arguments inside each computation. Adds identity operators
-        # tagged with an attribute _tpu_replicated_input to identify the
-        # replicated inputs.
-        # pylint: disable=protected-access
-        with graph._attr_scope({"_tpu_replicated_input":
-                                attr_value_pb2.AttrValue(b=True)}):
-          computation_inputs = [
-              array_ops.identity(x, name="replicated_input_{}".format(i))
-              for i, x in enumerate(computation_inputs)]
-        # pylint: enable=protected-access
-
-        # If there is an infeed queue, adds the dequeued values to the
-        # computation's inputs.
-        if infeed_queue is not None:
-          infeed_queue.set_number_of_shards(num_replicas)
-          for t in infeed_queue.generate_dequeue_op():
-            computation_inputs.append(t)
-
-        # Only resource variables work inside a TPU computation, so turn on
-        # resource variables for the computation.
-        # TODO(phawkins): consider removing this code. It will
-        # be less confusing to clients if they knowingly choose to use resource
-        # variables.
-        vscope = variable_scope.get_variable_scope()
-        saved_use_resource = vscope.use_resource
-        vscope.set_use_resource(True)
-
-        outputs = computation(*computation_inputs)
-
-        vscope.set_use_resource(saved_use_resource)
-
-      # If the computation only returned one value, makes it a tuple.
-      if not isinstance(outputs, (list, tuple)):
-        outputs = (outputs,)
-
-      try:
-        with ops.device(core(0)):
-          outputs = [
-              o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-              for o in outputs
-          ]
-      except Exception as e:
-        raise ValueError(
-            "TPU function return values must all either be Operations or "
-            "convertible to Tensors. Got '%s'" % str(e))
-
-      # Separates the returned Operations and Tensors.
-      output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-      output_tensors = [o for o in outputs
-                        if not isinstance(o, ops.Operation)]
-
-      if outputs != output_tensors + output_operations:
-        raise ValueError(
-            "TPU functions must return zero-or more Tensor values followed by "
-            "zero or more Operations.")
-      output_arity = len(output_tensors)
-
-      # Wraps outputs in Identity ops. Otherwise a replicated input copied
-      # straight to an output would bypass the replicate(). This would be bad
-      # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-      # be rewritten away, leading to a runtime error.
-      # TODO(phawkins): extend the rewrite to elide these nodes instead.
-      new_output_tensors = []
-      for t in output_tensors:
-        with ops.device(t.device if t.device else core(0)):
-          new_output_tensors.append(array_ops.identity(t))
-      output_tensors = new_output_tensors
-    finally:
-      context.report_unsupported_operations()
-      context.Exit()
-
-    # Fan-out: Builds a TPUReplicatedOutput node for each output.
-    outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
-                                             name="output{}".format(i))
-               for i in xrange(output_arity)]
-
-    with ops.control_dependencies(output_operations):
-      if output_arity == 0:
-        # Returns a list of NoOps dependent on the replication Op, indexed by
-        # [replica_num].
-        return [
-            control_flow_ops.no_op(name="%s_shard_%d" % (name, i))
-            for i in range(num_replicas)
-        ]
-      else:
-        # Wraps the outputs in identity operators so the names of any possible
-        # `fetch` nodes are preserved by the replication rewrite.
-        return [
-            [array_ops.identity(outputs[out][replica],
-                                name="output_%d_shard_%d" % (out, replica))
-             for out in xrange(output_arity)]
-            for replica in xrange(num_replicas)
+      with ops.device(core(0)):
+        outputs = [
+            o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+            for o in outputs
         ]
+    except Exception as e:
+      raise ValueError(
+          "TPU function return values must all either be Operations or "
+          "convertible to Tensors. Got '%s'" % str(e))
+
+    # Separates the returned Operations and Tensors.
+    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs
+                      if not isinstance(o, ops.Operation)]
+
+    if outputs != output_tensors + output_operations:
+      raise ValueError(
+          "TPU functions must return zero-or more Tensor values followed by "
+          "zero or more Operations.")
+    output_arity = len(output_tensors)
+
+    # Wraps outputs in Identity ops. Otherwise a replicated input copied
+    # straight to an output would bypass the replicate(). This would be bad
+    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+    # be rewritten away, leading to a runtime error.
+    # TODO(phawkins): extend the rewrite to elide these nodes instead.
+    new_output_tensors = []
+    for t in output_tensors:
+      with ops.device(t.device if t.device else core(0)):
+        new_output_tensors.append(array_ops.identity(t))
+    output_tensors = new_output_tensors
+  finally:
+    context.report_unsupported_operations()
+    context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
+
+  # Fan-out: Builds a TPUReplicatedOutput node for each output.
+  outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
+                                           name="output{}".format(i))
+             for i in xrange(output_arity)]
+
+  with ops.control_dependencies(output_operations):
+    if output_arity == 0:
+      # Returns a list of NoOps dependent on the replication Op, indexed by
+      # [replica_num].
+      return [
+          control_flow_ops.no_op(name="shard_%d" % i)
+          for i in range(num_replicas)
+      ]
+    else:
+      # Wraps the outputs in identity operators so the names of any possible
+      # `fetch` nodes are preserved by the replication rewrite.
+      return [
+          [array_ops.identity(outputs[out][replica],
+                              name="output_%d_shard_%d" % (out, replica))
+           for out in xrange(output_arity)]
+          for replica in xrange(num_replicas)
+      ]
 
 
 def shard(computation,
@@ -450,7 +631,7 @@ def shard(computation,
       `DeviceAssignment` may be omitted if each shard of the computation uses
       only one core, and there is either only one shard, or the number of shards
       is equal to the number of cores in the TPU system.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of output tensors.
   Raises:
@@ -579,7 +760,7 @@ def batch_parallel(computation,
       `DeviceAssignment` may be omitted if each shard of the computation uses
       only one core, and there is either only one shard, or the number of shards
       is equal to the number of cores in the TPU system.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of output tensors.
   Raises:
@@ -613,7 +794,7 @@ def rewrite(computation,
       mapping between logical cores in the computation with physical cores in
       the TPU topology. May be omitted for a single-core computation, in which
       case the core attached to task 0, TPU device 0 is used.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of output tensors.
   """
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 644070218214643923b9ca3ee138615ec568e8b5..6d7331e3c79ade9c12c15de79f550cf3973c4e6c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -26,6 +26,7 @@ import os
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.platform import tf_logging as logging
 
@@ -34,10 +35,16 @@ _TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
 _SERVICE_KEY = run_config_lib._SERVICE_KEY
 _TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
 _NUM_CORES_PER_HOST = 8
-
 # pylint: enable=protected-access
 
 
+class InputPipelineConfig(object):
+  r"""Please see the definition of these values in TPUConfig."""
+  PER_SHARD_V1 = 1
+  PER_HOST_V1 = 2
+  PER_HOST_V2 = 3
+
+
 # TODO(b/72511246) Provide a simplified api to configure model parallelism.
 class TPUConfig(
     collections.namedtuple('TPUConfig', [
@@ -65,15 +72,18 @@ class TPUConfig(
       cores. This is required by model-parallelism which enables partitioning
       the model to multiple cores. For example, [2, 2, 1] means the model is
       partitioned across 4 cores which span two cores in both x and y
-      coordinates.  Please refer to ${tf.contrib.tpu.TopologyProto} for the
+      coordinates.  Please refer to @{tf.contrib.tpu.Topology} for the
       geometry of a TPU mesh.
-    per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
-      rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
-      is invoked once on each host. With Per-Core input pipeline deployment, it
-      is invoked once for each core. To be precise, with a global batch size
-      `train_batch_size` in `TPUEstimator` constructor, the batch size for each
-      shard is `train_batch_size` // #hosts. With Per-Core input pipeline
-      deployment, the shard batch size is `train_batch_size` // #cores.
+    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
+      `input_fn` is invoked per-host rather than per-core. With per-host input
+      pipeline configuration, `input_fn` is invoked once on each host. With the
+      per-core input pipeline configuration, it is invoked once for each core.
+      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
+      the batch size for each shard is `train_batch_size` // #hosts in the
+      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
+      `train_batch_size` // #cores. With the per-core input pipeline
+      configuration, the shard batch size is also `train_batch_size` // #cores.
+      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
     tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
       within TPUEstimator, however when using ClusterSpec propagation in more
       esoteric cluster configurations, you may need to specify the job name as a
@@ -116,6 +126,13 @@ class TPUConfig(
         raise ValueError('computation_shape elements can only be 1 or 2; got '
                          'computation_shape={}'.format(computation_shape))
 
+    # per_host_input_for_training may be True, False, or integer in [1..3].
+    # Map legacy values (True, False) to numeric values.
+    if per_host_input_for_training is False:
+      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
+    elif per_host_input_for_training is True:
+      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
+
     # Check initial_infeed_sleep_secs.
     if initial_infeed_sleep_secs:
       util_lib.check_positive_integer(initial_infeed_sleep_secs,
@@ -140,6 +157,7 @@ class RunConfig(run_config_lib.RunConfig):
                tpu_config=None,
                evaluation_master=None,
                master=None,
+               cluster=None,
                **kwargs):
     """Constructs a RunConfig.
 
@@ -148,15 +166,26 @@ class RunConfig(run_config_lib.RunConfig):
       evaluation_master: a string. The address of the master to use for eval.
         Defaults to master if not set.
       master: a string. The address of the master to use for training.
+      cluster: a ClusterResolver
       **kwargs: keyword config parameters.
+
+    Raises:
+      ValueError: if cluster is not None and the provided session_config has a
+        cluster_def already.
     """
     super(RunConfig, self).__init__(**kwargs)
     self._tpu_config = tpu_config or TPUConfig()
+    self._cluster = cluster
 
-    # If user sets master and/or evaluation_master explicilty, including empty
+    # If user sets master and/or evaluation_master explicitly, including empty
     # string '', take it. Otherwise, take the values set by parent class.
     if master is not None:
+      if cluster is not None:
+        raise ValueError('Both master and cluster are set.')
       self._master = master
+    else:
+      if cluster:
+        self._master = cluster.master()
 
     if evaluation_master is not None:
       self._evaluation_master = evaluation_master
@@ -170,6 +199,21 @@ class RunConfig(run_config_lib.RunConfig):
       # evaluation_master to master, unless user overwrites it.
       self._evaluation_master = self._master
 
+    # Set the ClusterSpec to use
+    if cluster:
+      self._cluster_spec = cluster.cluster_spec()
+
+      # Merge the cluster_def into the ConfigProto.
+      if self._session_config is None:  # pylint: disable=access-member-before-definition
+        self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+      if self._session_config.HasField('cluster_def'):
+        raise ValueError(
+            'You cannot provide a ClusterResolver and '
+            'session_config.cluster_def.')
+      if self._cluster_spec:
+        self._session_config.cluster_def.CopyFrom(
+            self._cluster_spec.as_cluster_def())
+
   @property
   def evaluation_master(self):
     return self._evaluation_master
@@ -182,6 +226,10 @@ class RunConfig(run_config_lib.RunConfig):
   def tpu_config(self):
     return self._tpu_config
 
+  @property
+  def cluster(self):
+    return self._cluster
+
   def replace(self, **kwargs):
     if 'tpu_config' not in kwargs:
       return super(RunConfig, self).replace(**kwargs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index c5c46ea741ea64ca37089431f8ed66cad7bc31fb..fbc1173e49fd6e8912f6bfae8a88198eda4f6d5b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -24,6 +24,7 @@ import copy
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
+from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.platform import tf_logging as logging
@@ -39,7 +40,7 @@ class _TPUContext(object):
 
   This immutable object holds TPUEstimator config, train/eval batch size, and
   `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, basded on the current state, to determine other
+  provides utility functions, based on the current state, to determine other
   information commonly required by TPU computation, such as TPU device names,
   TPU hosts, shard batch size, etc.
 
@@ -205,7 +206,13 @@ class _TPUContext(object):
     """Return true if input_fn is invoked per-core (other than per-host)."""
     mode = self._assert_mode()
     return (mode == model_fn_lib.ModeKeys.TRAIN and
-            not self._config.tpu_config.per_host_input_for_training)
+            (self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.PER_SHARD_V1))
+
+  def is_input_per_host_with_iterators(self):
+    """Return true if input_fn should be run in the per-host v2 config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.PER_HOST_V2)
 
   def is_running_on_cpu(self, is_export_mode=False):
     """Determines whether the input_fn and model_fn should be invoked on CPU.
@@ -218,7 +225,7 @@ class _TPUContext(object):
         model, when mode == PREDICT. Only with this bool, we could
         tell whether user is calling the Estimator.predict or
         Estimator.export_savedmodel, which are running on TPU and CPU
-        respectively. Parent class Estimator does not distingush these two.
+        respectively. Parent class Estimator does not distinguish these two.
 
     Returns:
       bool, whether current input_fn or model_fn should be running on CPU.
@@ -271,7 +278,8 @@ class _TPUContext(object):
       return global_batch_size
 
     # On TPU
-    if self.is_input_sharded_per_core():
+    if self.is_input_sharded_per_core() or (
+        self.is_input_per_host_with_iterators()):
       # We prohibit per core input sharding for the model parallelism case,
       # therefore it is safe to use num_cores here.
       return global_batch_size // self.num_cores
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index ff53fe4f5d0e219f56d77d3476640bb023c7535a..eb537b7b6ad2c7e6f2374ad427d6a50884df52b4 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -20,23 +20,27 @@ from __future__ import print_function
 
 import collections
 import copy
+import os
 import signal
 import threading
 import time
 import traceback
 
+import numpy as np
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import session_support
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.contrib.training.python.training import hparam
+from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
@@ -48,10 +52,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -61,6 +68,7 @@ from tensorflow.python.training import evaluation
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 _INITIAL_LOSS = 1e7
@@ -69,6 +77,9 @@ _TPU_ESTIMATOR = 'tpu_estimator'
 _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+_ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
 
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
@@ -81,6 +92,13 @@ _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -133,7 +151,7 @@ def _increase_eval_step_op(iterations_per_loop):
   """Returns an op to increase the eval step for TPU evaluation.
 
   Args:
-    iterations_per_loop: Tensor. The number of eval steps runnining in TPU
+    iterations_per_loop: Tensor. The number of eval steps running in TPU
         system before returning to CPU host for each `Session.run`.
 
   Returns:
@@ -605,17 +623,17 @@ class _StoppingPredictHook(session_run_hook.SessionRunHook):
       # batch. And we append one more batch to signal the system it should stop.
       # The data flow might look like
       #
-      #  batch   0: images, labels, stop = 0  (user provideded)
-      #  batch   1: images, labels, stop = 0  (user provideded)
+      #  batch   0: images, labels, stop = 0  (user provided)
+      #  batch   1: images, labels, stop = 0  (user provided)
       #  ...
-      #  batch  99: images, labels, stop = 0  (user provideded)
+      #  batch  99: images, labels, stop = 0  (user provided)
       #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
       #
       # where the final batch (id = 100) is appended by TPUEstimator, so we
       # should drop it before returning the predictions to user.
       # To achieve that, we throw the OutOfRangeError in after_run. Once
       # Monitored Session sees this error in SessionRunHook.after_run, the
-      # "current" prediciton, i.e., batch with id=100, will be discarded
+      # "current" prediction, i.e., batch with id=100, will be discarded
       # immediately
       raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
 
@@ -676,8 +694,11 @@ def generate_per_host_enqueue_ops_fn_for_host(
         raise TypeError(
             'For mode PREDICT, `input_fn` must return `Dataset` instead of '
             '`features` and `labels`.')
+      if batch_axis is not None:
+        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
       inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn)
+          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
 
     if is_dataset:
       hooks.append(inputs.dataset_initializer_hook())
@@ -733,6 +754,61 @@ def generate_per_host_enqueue_ops_fn_for_host(
   return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
 
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  del host_id  # unused
+  captured_infeed_queue = _CapturedObject()
+  hooks = []
+
+  with ops.device(device):
+    inputs = _Inputs.from_input_fn(input_fn())
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      # TODO(b/XXX): Add predict support for PER_HOST_V2
+      raise TypeError('Most PREDICT not yet supported in PER_HOST_V2 mode.')
+
+    hooks.append(inputs.dataset_initializer_hook())
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    infeed_queue.set_configuration_from_sharded_input_tensors(
+        per_host_sharded_inputs)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=ctx.tpu_ordinal_function)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
+
+
 class _InputPipeline(object):
   """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
 
@@ -751,7 +827,7 @@ class _InputPipeline(object):
   2. (features, labels)
 
   Internally, form 1 is reformed to `(features, None)` as features and labels
-  are passed separatedly to underlying methods. For TPU training, TPUEstimator
+  are passed separately to underlying methods. For TPU training, TPUEstimator
   may expect multiple `features` and `labels` tuples one for each core.
 
   TPUEstimator allows various different structures for inputs (namely `features`
@@ -784,7 +860,8 @@ class _InputPipeline(object):
       def _extract_key_names(tensor_or_dict):
         if tensor_or_dict is None:
           return []
-        return tensor_or_dict.keys() if isinstance(tensor_or_dict, dict) else []
+        return sorted(tensor_or_dict.keys()) if isinstance(
+            tensor_or_dict, dict) else []
 
       # Extract structure.
       has_labels = labels is not None
@@ -923,8 +1000,7 @@ class _InputPipeline(object):
       # In the model-parallel case, both the host-side and device-side
       # computations must agree on the core on which infeed takes place. We
       # choose to perform infeed on logical core 0 of each replica.
-      with ops.device(tpu.core(0)):
-        values = self._infeed_queue.generate_dequeue_op()
+      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
       # The unflatten process uses the structure information recorded above.
       return self._inputs_structure_recorder.unflatten_features_and_labels(
           values)
@@ -968,10 +1044,17 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
-                generate_per_host_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    self._batch_axis, host_device, host_id))
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
             all_hooks.extend(hooks)
 
             # NOTE(xiejw): We dispatch here based on the return type of the
@@ -1036,8 +1119,8 @@ class _ModelFnWrapper(object):
     self._params = params
     self._ctx = ctx
 
-  def call_without_tpu(self, features, labels):
-    return self._call_model_fn(features, labels)
+  def call_without_tpu(self, features, labels, is_export_mode):
+    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
 
   def convert_to_single_tpu_train_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single train step on TPU.
@@ -1196,7 +1279,7 @@ class _ModelFnWrapper(object):
 
     return predict_step, host_calls, captured_scaffold_fn
 
-  def _call_model_fn(self, features, labels, is_export_mode=True):
+  def _call_model_fn(self, features, labels, is_export_mode=False):
     """Calls the model_fn with required parameters."""
     model_fn_args = util.fn_args(self._model_fn)
     kwargs = {}
@@ -1222,9 +1305,16 @@ class _ModelFnWrapper(object):
                        'required by TPUEstimator to pass batch size as '
                        'params[\'batch_size\']'.format(self._model_fn))
 
-    batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+    if is_export_mode:
+      batch_size_for_model_fn = None
+    else:
+      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+
     if batch_size_for_model_fn is not None:
-      params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
+      if isinstance(params, hparam.HParams):
+        params.add_hparam(_BATCH_SIZE_KEY, batch_size_for_model_fn)
+      else:
+        params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
     estimator_spec = self._model_fn(features=features, **kwargs)
     if (self._ctx.is_running_on_cpu(is_export_mode) and
@@ -1463,7 +1553,7 @@ class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
 
 
 class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
-  """Count examples during runtime."""
+  """"Calculate and report the number of examples/sec during training."""
 
   def __init__(self,
                batch_size,
@@ -1516,14 +1606,20 @@ class TPUEstimator(estimator_lib.Estimator):
   size when calling the `input_fn` and `model_fn`. Users should specify
   global batch size in constructor, and then get the batch size for each shard
   in `input_fn` and `model_fn` by `params['batch_size']`.
-  For training, `model_fn` gets per-core batch size; `input_fn` may get
-  per-core or per-host batch size depending on
-  `per_host_input_for_training` in `TPUConfig`.
-  For evaluation, `model_fn` gets per-core batch size and `input_fn` get
-  per-host batch size.
+
+  - For training, `model_fn` gets per-core batch size; `input_fn` may get
+    per-core or per-host batch size depending on `per_host_input_for_training`
+    in `TPUConfig` (See docstring for TPUConfig for details).
+
+  - For evaluation and prediction, `model_fn` gets per-core batch size and
+    `input_fn` get per-host batch size.
+
+  Evaluation
+  ==========
 
   `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
   for TPU evaluation.
+
   `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
   `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See
   `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
@@ -1535,12 +1631,17 @@ class TPUEstimator(estimator_lib.Estimator):
   `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
 
   Current limitations:
+  --------------------
 
-  1. TPU evaluation only works on single host.
-  2. `input_fn` for evaluation should not throw OutOfRange error for all
-  evaluation steps and all batches should have the same size.
+  1. TPU evaluation only works on a single host (one TPU worker).
+
+  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
+     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
+     batches should have the same size.
 
   Example (MNIST):
+  ----------------
+
   ```
   # The metric Fn which runs on CPU.
   def metric_fn(labels, logits):
@@ -1576,8 +1677,83 @@ class TPUEstimator(estimator_lib.Estimator):
           }))
   ```
 
-  Predict support on TPU is not yet implemented. So, `predict` and
-  `export_savedmodel` are executed on CPU, even if `use_tpu` is true.
+  Prediction
+  ==========
+
+  Prediction on TPU is an experimental feature to support large batch inference.
+  It is not designed for latency-critical system. In addition, due to some
+  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
+  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
+  convenient.
+
+  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
+  *should* raise an end-of-input exception (`OutOfRangeError` or
+  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
+  precise, the ops created by `input_fn` produce one batch of the data.
+  The `predict()` API processes one batch at a time. When reaching the end of
+  the data source, an end-of-input exception should be raised by one of these
+  operations. The user usually does not need to do this manually. As long as the
+  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
+  exception automatically after the last batch has been produced.
+
+  Note: Estimator.predict returns a Python generator. Please consume all the
+  data from the generator so that TPUEstimator can shutdown the TPU system
+  properly for user.
+
+  Current limitations:
+  --------------------
+  1. TPU prediction only works on a single host (one TPU worker).
+
+  2. `input_fn` must return a `Dataset` instance rather than `features`. In
+  fact, .train() and .evaluate() also support Dataset as return value.
+
+  Example (MNIST):
+  ----------------
+  ```
+  height = 32
+  width = 32
+  total_examples = 100
+
+  def predict_input_fn(params):
+    batch_size = params['batch_size']
+
+    images = tf.random_uniform(
+        [total_examples, height, width, 3], minval=-1, maxval=1)
+
+    dataset = tf.data.Dataset.from_tensor_slices(images)
+    dataset = dataset.map(lambda images: {'image': images})
+
+    dataset = dataset.batch(batch_size)
+    return dataset
+
+  def model_fn(features, labels, params, mode):
+     # Generate predictions, called 'output', from features['image']
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      return tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          predictions={
+              'predictions': output,
+              'is_padding': features['is_padding']
+          })
+
+  tpu_est = TPUEstimator(
+      model_fn=model_fn,
+      ...,
+      predict_batch_size=16)
+
+  # Fully consume the generator so that TPUEstimator can shutdown the TPU
+  # system.
+  for item in tpu_est.predict(input_fn=input_fn):
+    # Filter out item if the `is_padding` is 1.
+    # Process the 'predictions'
+  ```
+
+  Exporting
+  =========
+
+  Exporting `SavedModel` support on TPU is not yet implemented. So,
+  `export_savedmodel` is executed on CPU, even if `use_tpu` is true.
   """
 
   def __init__(self,
@@ -1627,7 +1803,7 @@ class TPUEstimator(estimator_lib.Estimator):
         labels to match up with the corresponding images. If None is supplied,
         and per_host_input_for_training is True, batches will be sharded based
         on the major dimension. If tpu_config.per_host_input_for_training is
-        False, batch_axis is ignored.
+        False or `PER_HOST_V2`, batch_axis is ignored.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -1647,7 +1823,8 @@ class TPUEstimator(estimator_lib.Estimator):
         raise ValueError('`train_batch_size` cannot be `None`')
       util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
 
-      if (not config.tpu_config.per_host_input_for_training and
+      if (config.tpu_config.per_host_input_for_training is
+          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
           config.tpu_config.computation_shape):
         raise ValueError(
             'Model parallelism only supports per host input for training. '
@@ -1684,6 +1861,8 @@ class TPUEstimator(estimator_lib.Estimator):
         eval_batch_size, predict_batch_size,
         use_tpu)
 
+    self._is_input_fn_invoked = None
+
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
 
@@ -1763,12 +1942,21 @@ class TPUEstimator(estimator_lib.Estimator):
     if 'config' in input_fn_args:
       kwargs['config'] = config
 
+    if 'mode' in input_fn_args:
+      kwargs['mode'] = mode
+
+    # Records the fact input_fn has been invoked.
+    self._is_input_fn_invoked = True
+
     with self._ctx.with_mode(mode) as ctx:
       # Setting the batch size in params first. This helps user to have same
       # input_fn for use_tpu=True/False.
       batch_size_for_input_fn = ctx.batch_size_for_input_fn
       if batch_size_for_input_fn is not None:
-        kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
+        if isinstance(kwargs['params'], hparam.HParams):
+          kwargs['params'].add_hparam(_BATCH_SIZE_KEY, batch_size_for_input_fn)
+        else:
+          kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
 
       # For export_savedmodel, input_fn is never passed to Estimator. So,
       # `is_export_mode` must be False.
@@ -1791,6 +1979,17 @@ class TPUEstimator(estimator_lib.Estimator):
 
       return _input_fn
 
+  def _validate_features_in_predict_input(self, result):
+    """Skip the validation.
+
+    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
+    has stronger check. Parent class's check generates confusing warning msg.
+
+    Args:
+      result: `features` returned by input_fn.
+    """
+    pass
+
   def _augment_model_fn(self, model_fn, batch_axis):
     """Returns a new model_fn, which wraps the TPU support."""
 
@@ -1799,15 +1998,24 @@ class TPUEstimator(estimator_lib.Estimator):
       with self._ctx.with_mode(mode) as ctx:
         model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
 
-        # For export_savedmodel, input_fn is never passed to Estimator. So,
-        # if features is callable, it means it is the input_fn passed by
-        # TPUEstimator._call_input_fn. Then we can know if the mode == PREDICT,
-        # it implies, it is the .predict API, not export_savedmodel API.
-        is_export_mode = not callable(features)
+        if mode != model_fn_lib.ModeKeys.PREDICT:
+          is_export_mode = False
+        else:
+          # For export_savedmodel, input_fn is never passed to Estimator. So, by
+          # checking the self._is_input_fn_invoked bit, we can know, given the
+          # mode == PREDICT, it is the .predict API, not export_savedmodel API.
+          if self._is_input_fn_invoked:
+            is_export_mode = False
+          else:
+            is_export_mode = True
+
+        # Clear the bit.
+        self._is_input_fn_invoked = None
 
         if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
           logging.info('Running %s on CPU', mode)
-          return model_fn_wrapper.call_without_tpu(features, labels)
+          return model_fn_wrapper.call_without_tpu(
+              features, labels, is_export_mode=is_export_mode)
 
         assert labels is None, '`labels` passed to `model_fn` must be `None`.'
         # TPUEstimator._call_input_fn passes `input_fn` as features to here.
@@ -1818,12 +2026,24 @@ class TPUEstimator(estimator_lib.Estimator):
         enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
         if mode == model_fn_lib.ModeKeys.TRAIN:
           loss, host_call, scaffold = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
           host_ops = host_call.create_tpu_hostcall()
           if host_ops is None:
             host_ops = []
+
+          shutdown_hooks = []
+          if os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN', '0') != '0':
+            shutdown_hooks.append(session_support.GracefulShutdownHook())
+
           hooks = [
               TPUInfeedOutfeedSessionHook(
                   ctx,
@@ -1831,7 +2051,8 @@ class TPUEstimator(estimator_lib.Estimator):
                   host_ops,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator)),
-              ExamplesPerSecondHook(ctx.global_batch_size),
+              ExamplesPerSecondHook(
+                  ctx.global_batch_size, output_dir=self.model_dir),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
@@ -1839,7 +2060,18 @@ class TPUEstimator(estimator_lib.Estimator):
                       'step': training.get_global_step()
                   },
                   every_n_secs=30)
-          ] + input_hooks
+          ] + input_hooks + shutdown_hooks
+
+          chief_hooks = []
+          if (self._config.save_checkpoints_secs or
+              self._config.save_checkpoints_steps):
+            chief_hooks.append(
+                training.CheckpointSaverHook(
+                    self.model_dir,
+                    save_secs=self._config.save_checkpoints_secs,
+                    save_steps=self._config.save_checkpoints_steps,
+                    steps_per_run=self._config.tpu_config.iterations_per_loop,
+                    scaffold=scaffold))
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -1847,11 +2079,15 @@ class TPUEstimator(estimator_lib.Estimator):
           # Validate the TPU training graph to catch basic errors
           _validate_tpu_training_graph()
 
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
+              training_chief_hooks=chief_hooks,
               training_hooks=hooks,
-              train_op=control_flow_ops.group(*update_ops),
+              train_op=train_op,
               scaffold=scaffold)
 
         if mode == model_fn_lib.ModeKeys.EVAL:
@@ -1945,12 +2181,18 @@ class TPUEstimator(estimator_lib.Estimator):
           host_ops = host_call_ret['host_call']
 
         predictions = host_call_ret['predictions']
-        stopping_signals = host_call_ret['signals']
+        _verify_cross_hosts_transfer_size(
+            predictions, message=(
+                'The estimated size for TPUEstimatorSpec.predictions is too '
+                'large.'))
+        signals = host_call_ret['signals']
 
         with ops.control_dependencies(host_ops):
           host_ops = []  # Empty, we do do not need it anymore.
           scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
-              stopping_signals)
+              signals)
+          predictions = _PaddingSignals.slice_tensor_or_dict(
+              predictions, signals)
 
         hooks = [
             _StoppingPredictHook(scalar_stopping_signal),
@@ -1977,8 +2219,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   def multi_tpu_eval_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_eval_step, [_ZERO_LOSS],
-        name='loop')
+        single_tpu_eval_step, [_ZERO_LOSS])
 
   (loss,) = tpu.shard(
       multi_tpu_eval_steps_on_single_shard,
@@ -2001,8 +2242,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_train_step, [_INITIAL_LOSS],
-        name=b'loop')
+        single_tpu_train_step, [_INITIAL_LOSS])
 
   (loss,) = tpu.shard(
       multi_tpu_train_steps_on_single_shard,
@@ -2233,6 +2473,10 @@ class _Inputs(object):
   def features_and_labels(self):
     """Gets `features` and `labels`."""
     if self.is_dataset:
+      if self._iterator is None:
+        raise RuntimeError('Internal error: Must call dataset_initializer_hook '
+                           'before calling features_and_labels(). Please file '
+                           'a bug!')
       return _Inputs._parse_inputs(self._iterator.get_next())
 
     return (self._features, self._labels)
@@ -2245,20 +2489,19 @@ class _Inputs(object):
     return self._dataset
 
 
-# TODO(xiejw): Extend this to support final partial batch.
 class _InputsWithStoppingSignals(_Inputs):
   """Inputs with `_StopSignals` inserted into the dataset."""
 
-  def __init__(self, dataset, batch_size):
+  def __init__(self, dataset, batch_size, add_padding=False):
 
     assert dataset is not None
 
     user_provided_dataset = dataset.map(
         _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=False, batch_size=batch_size))
+            stop=False, batch_size=batch_size, add_padding=add_padding))
     final_batch_dataset = dataset.take(1).map(
         _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=True, batch_size=batch_size))
+            stop=True, batch_size=batch_size, add_padding=add_padding))
     dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
 
     super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
@@ -2288,7 +2531,7 @@ class _InputsWithStoppingSignals(_Inputs):
     return signals
 
   @staticmethod
-  def insert_stopping_signal(stop, batch_size):
+  def insert_stopping_signal(stop, batch_size, add_padding=False):
     """Inserts stopping_signal into dataset via _map_fn.
 
     Here we change the data structure in the dataset, such that the return value
@@ -2299,19 +2542,39 @@ class _InputsWithStoppingSignals(_Inputs):
     Args:
       stop: bool, state of current stopping signals.
       batch_size: int, batch size.
+      add_padding: bool, whether to pad the tensor to full batch size.
 
     Returns:
       A map_fn passed to dataset.map API.
     """
 
     def _map_fn(*args):
+      """The map fn to insert signals."""
+      if len(args) == 1:
+        # Unpack the single Tensor/dict argument as features. This is required
+        # for the input_fn returns no labels.
+        args = args[0]
       features, labels = _Inputs._parse_inputs(args)
       new_input_dict = {}
-      new_input_dict['features'] = features
-      if labels is not None:
-        new_input_dict['labels'] = labels
+
+      if add_padding:
+        padding_mask, features, labels = (
+            _PaddingSignals.pad_features_and_labels(
+                features, labels, batch_size))
+
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+
+      else:
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+        padding_mask = None
+
       new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size).as_dict()
+          stop=stop, batch_size=batch_size, padding_mask=padding_mask).as_dict()
+
       return new_input_dict
 
     return _map_fn
@@ -2320,23 +2583,28 @@ class _InputsWithStoppingSignals(_Inputs):
 class _StopSignals(object):
   """Signals class holding all logic to handle TPU stopping condition."""
 
-  NON_STOPPING_SIGNAL = 0.0
-  STOPPING_SIGNAL = 1.0
+  NON_STOPPING_SIGNAL = False
+  STOPPING_SIGNAL = True
 
-  def __init__(self, stop, batch_size):
+  def __init__(self, stop, batch_size, padding_mask=None):
     self._stop = stop
     self._batch_size = batch_size
+    self._padding_mask = padding_mask
 
   def as_dict(self):
+    """Returns the signals as Python dict."""
     shape = [self._batch_size, 1]
-    dtype = dtypes.float32
+    dtype = dtypes.bool
 
     if self._stop:
       stopping = array_ops.ones(shape=shape, dtype=dtype)
     else:
       stopping = array_ops.zeros(shape=shape, dtype=dtype)
 
-    return {'stopping': stopping}
+    signals = {'stopping': stopping}
+    if self._padding_mask is not None:
+      signals['padding_mask'] = self._padding_mask
+    return signals
 
   @staticmethod
   def as_scalar_stopping_signal(signals):
@@ -2344,7 +2612,118 @@ class _StopSignals(object):
 
   @staticmethod
   def should_stop(scalar_stopping_signal):
-    return scalar_stopping_signal >= _StopSignals.STOPPING_SIGNAL
+    if isinstance(scalar_stopping_signal, ops.Tensor):
+      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
+      # way to express the bool check whether scalar_stopping_signal is True.
+      return math_ops.logical_and(
+          scalar_stopping_signal, _StopSignals.STOPPING_SIGNAL)
+    else:
+      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
+      # the graph anymore. Here, we use pure Python.
+      return bool(scalar_stopping_signal)
+
+
+class _PaddingSignals(object):
+  """Signals class holding all logic to handle padding."""
+
+  @staticmethod
+  def pad_features_and_labels(features, labels, batch_size):
+    """Pads out the batch dimension of features and labels."""
+    real_batch_size = array_ops.shape(
+        _PaddingSignals._find_any_tensor(features))[0]
+
+    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
+
+    check_greater = check_ops.assert_greater_equal(
+        batch_size_tensor, real_batch_size,
+        data=(batch_size_tensor, real_batch_size),
+        message='The real batch size should not be greater than batch_size.')
+
+    with ops.control_dependencies([check_greater]):
+      missing_count = batch_size_tensor - real_batch_size
+
+    def pad_single_tensor(tensor):
+      """Pads out the batch dimension of a tensor to the complete batch_size."""
+      rank = len(tensor.shape)
+      assert rank > 0
+      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
+      padded_tensor = array_ops.pad(tensor, padding)
+      padded_tensor.set_shape(padded_shape)
+      return padded_tensor
+
+    def nest_pad(tensor_or_dict):
+      return nest.map_structure(pad_single_tensor, tensor_or_dict)
+
+    features = nest_pad(features)
+    if labels is not None:
+      labels = nest_pad(labels)
+
+    padding_mask = _PaddingSignals._padding_mask(
+        real_batch_size, missing_count, batch_size)
+
+    return padding_mask, features, labels
+
+  @staticmethod
+  def slice_tensor_or_dict(tensor_or_dict, signals):
+    """Slice the real Tensors according to padding mask in signals."""
+
+    padding_mask = signals['padding_mask']
+    batch_size = array_ops.shape(padding_mask)[0]
+
+    def verify_batch_size(tensor):
+      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
+      with ops.control_dependencies([check_batch_size]):
+        return array_ops.identity(tensor)
+
+    def slice_single_tensor(tensor):
+      rank = len(tensor.shape)
+      assert rank > 0
+      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
+      return verify_batch_size(tensor)[0:real_batch_size]
+
+    # As we split the Tensors to all TPU cores and concat them back, it is
+    # important to ensure the real data is placed before padded ones, i.e.,
+    # order is preserved. By that, the sliced padding mask should have all 0's.
+    # If this assertion failed, # the slice logic here would not hold.
+    sliced_padding_mask = slice_single_tensor(padding_mask)
+    assert_padding_mask = math_ops.equal(
+        math_ops.reduce_sum(sliced_padding_mask), 0)
+
+    with ops.control_dependencies([assert_padding_mask]):
+      should_stop = _StopSignals.should_stop(
+          _StopSignals.as_scalar_stopping_signal(signals))
+
+    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
+
+    def slice_fn(tensor):
+      # If the current batch is full batch or part of stopping signals, we do
+      # not need to slice to save performance.
+      return control_flow_ops.cond(
+          math_ops.logical_or(should_stop, is_full_batch),
+          (lambda: verify_batch_size(tensor)),
+          (lambda: slice_single_tensor(tensor)))
+
+    return nest.map_structure(slice_fn, tensor_or_dict)
+
+  @staticmethod
+  def _find_any_tensor(batch_features):
+    tensors = [x for x in nest.flatten(batch_features)
+               if isinstance(x, ops.Tensor)]
+    if not tensors:
+      raise ValueError('Cannot find any Tensor in features dict.')
+    return tensors[0]
+
+  @staticmethod
+  def _padding_mask(real_batch_size, missing_count, batch_size):
+    padding_mask = array_ops.concat(
+        [
+            array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
+            array_ops.ones((missing_count,), dtype=dtypes.int32)
+        ],
+        axis=0)
+    padding_mask.set_shape((batch_size,))
+    return padding_mask
 
 
 class _SignalsHelper(object):
@@ -2365,3 +2744,21 @@ class _SignalsHelper(object):
   @staticmethod
   def as_tensor_list(signals):
     return [signals[key] for key in sorted(signals.iterkeys())]
+
+
+def _verify_cross_hosts_transfer_size(tensor_dict, message):
+  total_size = 0
+  tensor_structure = {}
+  for key, tensor in tensor_dict.items():
+    shape = tensor.shape
+    size = np.product(shape) * tensor.dtype.size
+    tensor_structure[key] = shape
+    total_size += size
+  if total_size >= _ONE_GIGABYTE:
+    raise ValueError(
+        '{} The transfer size is larger than the protobuf limit. Please '
+        'consider to use Tensors with smaller shapes or reduce batch '
+        'size. Given:\n'
+        '{}'.format(message, '\n'.join([
+            ' -- Key: {}, Shape: {}'.format(k, v)
+            for k, v in tensor_structure.items()])))
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e90957e6dea7ff1777dd3e26cdf1c6fdb340dd3
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
@@ -0,0 +1,291 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU Estimator Signalling Tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.python import data as dataset_lib
+from tensorflow.python.client import session
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+def make_input_fn(num_samples):
+  a = np.linspace(0, 100.0, num=num_samples)
+  b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1))
+
+  def input_fn(params):
+    batch_size = params['batch_size']
+    da1 = dataset_lib.Dataset.from_tensor_slices(a)
+    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+
+    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb})
+    dataset = dataset.batch(batch_size)
+    return dataset
+  return input_fn, (a, b)
+
+
+def make_input_fn_with_labels(num_samples):
+  a = np.linspace(0, 100.0, num=num_samples)
+  b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1))
+
+  def input_fn(params):
+    batch_size = params['batch_size']
+    da1 = dataset_lib.Dataset.from_tensor_slices(a)
+    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+
+    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb))
+    dataset = dataset.batch(batch_size)
+    return dataset
+  return input_fn, (a, b)
+
+
+class TPUEstimatorStoppingSignalsTest(test.TestCase):
+
+  def test_normal_output_without_signals(self):
+    num_samples = 4
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      features = dataset.make_one_shot_iterator().get_next()
+
+      # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
+      self.assertIsNone(features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        result = sess.run(features)
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+
+        # This run should work as num_samples / batch_size = 2.
+        result = sess.run(features)
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          # Given num_samples and batch_size, this run should fail.
+          sess.run(features)
+
+  def test_output_with_stopping_signals(self):
+    num_samples = 4
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
+      self.assertIsNone(features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This run should work as num_samples / batch_size = 2.
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(features)
+
+
+class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
+
+  def test_num_samples_divisible_by_batch_size(self):
+    num_samples = 4
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
+                                                        add_padding=True)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      # With padding, all shapes are static now.
+      self.assertEqual(batch_size, features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This run should work as num_samples / batch_size = 2.
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(features)
+
+  def test_num_samples_not_divisible_by_batch_size(self):
+    num_samples = 5
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn_with_labels(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
+                                                        add_padding=True)
+      hook = inputs.dataset_initializer_hook()
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      # With padding, all shapes are static.
+      self.assertEqual(batch_size, features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        evaluated_features, evaluated_labels, evaluated_signals = (
+            sess.run([features, labels, signals]))
+        self.assertAllEqual(a[:batch_size], evaluated_features['a'])
+        self.assertAllEqual(b[:batch_size], evaluated_labels)
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This run should work as num_samples / batch_size >= 2.
+        evaluated_features, evaluated_labels, evaluated_signals = (
+            sess.run([features, labels, signals]))
+        self.assertAllEqual(a[batch_size:2*batch_size], evaluated_features['a'])
+        self.assertAllEqual(b[batch_size:2*batch_size], evaluated_labels)
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This is the final partial batch.
+        evaluated_features, evaluated_labels, evaluated_signals = (
+            sess.run([features, labels, signals]))
+        real_batch_size = num_samples % batch_size
+
+        # Assert the real part.
+        self.assertAllEqual(a[2*batch_size:num_samples],
+                            evaluated_features['a'][:real_batch_size])
+        self.assertAllEqual(b[2*batch_size:num_samples],
+                            evaluated_labels[:real_batch_size])
+        # Assert the padded part.
+        self.assertAllEqual([0.0] * (batch_size - real_batch_size),
+                            evaluated_features['a'][real_batch_size:])
+        self.assertAllEqual([[0.0]] * (batch_size - real_batch_size),
+                            evaluated_labels[real_batch_size:])
+
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        padding = ([.0] * real_batch_size
+                   + [1.] * (batch_size - real_batch_size))
+        self.assertAllEqual(padding, evaluated_signals['padding_mask'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(features)
+
+  def test_slice(self):
+    num_samples = 3
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
+                                                        add_padding=True)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      sliced_features = (
+          tpu_estimator._PaddingSignals.slice_tensor_or_dict(
+              features, signals))
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This is the final partial batch.
+        result, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertEqual(1, len(result['a']))
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(sliced_features)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index 42ac6eb680437ec82287468bcba2b770ac0e5749..604e6600c81a4136a1f10e79a725a887a96f4d86 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_sharding
 
 from tensorflow.python.framework import dtypes
@@ -368,13 +369,20 @@ class InfeedQueue(object):
       policy.freeze()
     self._validate()
 
-  def generate_dequeue_op(self):
+  def generate_dequeue_op(self, tpu_device=0):
     """Generates the device-side Op to dequeue a tuple from the queue.
 
     Implicitly freezes the queue configuration if it is not already
     frozen, which will raise errors if the shapes and types have not
     been fully specified.
 
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed. If None, no explicit placement will be performed, and it is up
+        to the user to call this API from within a proper TPU device scope.
+        The XLA code will fail if the TPU dequeue instruction is not bound to
+        any device.
+
     Returns:
       A list of Outputs corresponding to a shard of infeed dequeued
       into XLA, suitable for use within a replicated block.
@@ -392,8 +400,13 @@ class InfeedQueue(object):
         policy.get_sharded_shape(shape)
         for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
     ]
-    return tpu_ops.infeed_dequeue_tuple(
-        dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    if tpu_device is not None:
+      with ops.device(tpu.core(tpu_device)):
+        return tpu_ops.infeed_dequeue_tuple(
+            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    else:
+      return tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
 
   def _generate_enqueue_op(self,
                            inputs,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index 493d1848c072caa5254fc87c67badc2e99ec16ee..894f21d0635ca47d3da1c0d2c3f5c37bac690920 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -60,7 +60,7 @@ def _query_tpu_system_metadata(master_address, run_config,
       with ops.Graph().as_default():
         with session_lib.Session(
             master_address,
-            config=_get_session_config_with_timeout(
+            config=get_session_config_with_timeout(
                 _PINGING_MASTER_TIMEOUT_IN_MS, run_config)) as sess:
           devices = sess.list_devices()
           for device in devices:
@@ -72,9 +72,9 @@ def _query_tpu_system_metadata(master_address, run_config,
               tpu_core_count += 1
           break
     except errors.DeadlineExceededError:
-      msg = ('Fail to connect Tensorflow master. It could be the TPU worker is '
-             'not ready (still under scheduling) or Tensorflow '
-             'master address is correct: got (%s).' %
+      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
+             'not be ready (still scheduling) or the Tensorflow master address '
+             'is incorrect: got (%s).' %
              (master_address))
 
       # TODO(xiejw): For local or grpc master we might not need retry logic
@@ -120,7 +120,8 @@ def _query_tpu_system_metadata(master_address, run_config,
     logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
     logging.info('*** Num TPU Cores Per Worker: %d',
                  metadata.num_of_cores_per_host)
-    logging.info('*** Available Devices: %s', metadata.devices)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
   else:
     logging.info('Failed to find TPU: %s', metadata)
   return metadata
@@ -132,7 +133,7 @@ def _obtain_topology(master_address, run_config):
                  'for model parallelism. This might take a while.',
                  master_address)
     with ops.Graph().as_default():
-      session_config = _get_session_config_with_timeout(
+      session_config = get_session_config_with_timeout(
           _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, run_config)
       with session_lib.Session(
           master_address, config=session_config) as sess:
@@ -145,7 +146,7 @@ def _obtain_topology(master_address, run_config):
             master_address))
 
 
-def _get_session_config_with_timeout(timeout_in_secs, run_config):
+def get_session_config_with_timeout(timeout_in_secs, run_config):
   cluster_def = None
   if run_config.session_config and run_config.session_config.cluster_def.job:
     cluster_def = run_config.session_config.cluster_def
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index 336d8260c3c8a5c30efa603e3faeabcc0944b8d0..c3882b8a27bc835f906c47dc5219f280c53800b8 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -37,7 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context")
+    context = tpu.TPUReplicateContext(b"context", 1)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index 3d7896127a99653167f164873331a2cc95f656e8..10a8bccf3b23add75188e16eb3591c32eb8621ee 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -44,7 +44,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
       None (equivalent to an empty list).
     infeed_queue: if not None, the infeed queue from which to append a tuple
       of arguments as inputs to condition.
-    name: an optional name for the loop.
+    name: (Deprecated) Does nothing.
 
   Returns:
     The final values of the loop-carried tensors.
@@ -52,7 +52,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   Raises:
     TypeError: if body or condition has the wrong signature.
   """
-
+  del name
   # Converts inputs to Tensors.
   inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
                                       x in inputs]
@@ -166,11 +166,11 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   if input_arity == 0:
     inputs = [array_ops.constant(0)]
   return control_flow_ops.while_loop(condition_wrapper, body_wrapper, inputs,
-                                     name=name)
+                                     name="")
 
 
 def repeat(n, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop that executes a fixed number of interations.
+  """Builds a training loop that executes a fixed number of iterations.
 
   The set of loop-carried tensors correspond to `inputs`.
   `body` must be a function that takes and returns the values of the
@@ -183,7 +183,7 @@ def repeat(n, body, inputs=None, infeed_queue=None, name=None):
       None (equivalent to an empty list).
     infeed_queue: if not None, the infeed queue from which to append a tuple
       of arguments as inputs to condition.
-    name: an optional name for the loop.
+    name: (Deprecated) Does nothing.
   Returns:
     The final values of the loop-carried tensors.
   Raises:
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index 4ef8f9eebdb165e5fe221be8670276bf943159b3..639e70816926aaed850cee62dca6aa819b38de8b 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -172,7 +172,7 @@ It is always recommended to port a small, simple model first to make sure that
 you are familiar with the basic concepts of `TPUEstimator` and test end-to-end
 behavior. Once your simple model runs, gradually add more functionality.
 In addition, there are several sample models, available at
-[github.com/tensorflow/tpu-demos](https://github.com/tensorflow/tpu-demos).
+[github.com/tensorflow/tpu](https://github.com/tensorflow/tpu).
 
 To convert your code from the vanilla `Estimator` class to use TPUs, change the
 following (note some of the details may change over time):
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 6db373d2d5e20ea7da449530b2730403c3bb64cc..5de55b5f7f2a41ac6edd27e5a102e565f33df12c 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data",
         "//tensorflow/python/estimator:inputs_queues",
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -308,23 +309,10 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
     cc_api_version = 2,
-    go_api_version = 2,
     java_api_version = 2,
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index da2de3e421b841937e4125168ea1ecea066ff841..edd71fb2502cf6c965a97485e074d20f876fd504 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -57,6 +57,8 @@ from tensorflow.contrib.training.python.training.hparam import *
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
+from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset
+from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset
 from tensorflow.contrib.training.python.training.training import add_gradients_summaries
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn
@@ -75,6 +77,7 @@ _allowed_symbols = [
     'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
     'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
     'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op',
-    'multiply_gradients', 'train']
+    'multiply_gradients', 'enqueue_in_queue_dataset',
+    'prepend_from_queue_and_padded_batch_dataset', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index dbdbb08a8252c799924812c83fff7f0631424761..f305197c190b67355338c407a7895a0507941ddb 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -517,6 +518,7 @@ class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
     ops._USE_C_API = self._prev_value
 
 
+@test_util.with_c_api
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 1a5fb45be0ad463f2b4189d97ce4bd41a67a1937..f7fd66d33fc0c329db7daaf87373385156d84217 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -36,9 +36,8 @@ out the metrics values to stdout:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-        predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
@@ -81,9 +80,8 @@ more summaries and call the evaluate_repeatedly method:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-          predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
@@ -140,7 +138,6 @@ from __future__ import print_function
 
 import time
 
-from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
@@ -300,7 +297,7 @@ class SummaryAtEndHook(session_run_hook.SessionRunHook):
   def begin(self):
     if self._replace_summary_op:
       self._summary_op = summary.merge_all()
-    self._global_step = variables.get_or_create_global_step()
+    self._global_step = training_util.get_or_create_global_step()
 
   def after_create_session(self, session, coord):
     if self._summary_writer is None and self._log_dir:
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index b07039916c203940039732c12938e7f342fa72a3..c36d00e8425ccbfe9338b50fc492dc1334d59731 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -27,7 +27,6 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.losses.python.losses import loss_ops
-from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.training.python.training import evaluation
 from tensorflow.contrib.training.python.training import training
 from tensorflow.core.protobuf import config_pb2
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
@@ -196,7 +196,8 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     checkpoint_path = evaluation.wait_for_new_checkpoint(checkpoint_dir)
 
@@ -311,7 +312,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     final_values = evaluation.evaluate_repeatedly(
         checkpoint_dir=checkpoint_dir,
@@ -365,7 +367,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     timeout_fn_calls = [0]
     def timeout_fn():
@@ -417,9 +420,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     self.assertEqual(final_values['my_var'], expected_value)
 
   def _create_names_to_metrics(self, predictions, labels):
-    accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
-    accuracy1, update_op1 = metric_ops.streaming_accuracy(
-        predictions + 1, labels)
+    accuracy0, update_op0 = metrics.accuracy(labels, predictions)
+    accuracy1, update_op1 = metrics.accuracy(labels, predictions + 1)
 
     names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1}
     names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1}
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index fdfd27d6a414933b0bec824bae512c45dac24d3c..f0418f04ba2c5c12c882d0b678f182058f25a94f 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -315,7 +315,7 @@ class HParams(object):
 
   Hyperparameters have type, which is inferred from the type of their value
   passed at construction type.   The currently supported types are: integer,
-  float, string, and list of integer, float, or string.
+  float, boolean, string, and list of integer, float, boolean, or string.
 
   You can override hyperparameter values by calling the
   [`parse()`](#HParams.parse) method, passing a string of comma separated
@@ -358,6 +358,8 @@ class HParams(object):
   ```
   """
 
+  _HAS_DYNAMIC_ATTRIBUTES = True  # Required for pytype checks.
+
   def __init__(self, hparam_def=None, model_structure=None, **kwargs):
     """Create an instance of `HParams` from keyword arguments.
 
@@ -500,6 +502,16 @@ class HParams(object):
             'Must pass a list for multi-valued parameter: %s.' % name)
       setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
 
+  def del_hparam(self, name):
+    """Removes the hyperparameter with key 'name'.
+
+    Args:
+      name: Name of the hyperparameter.
+    """
+    if hasattr(self, name):
+      delattr(self, name)
+      del self._hparam_types[name]
+
   def parse(self, values):
     """Override hyperparameter values, parsing new values from a string.
 
@@ -628,6 +640,9 @@ class HParams(object):
   def __str__(self):
     return str(sorted(self.values().items()))
 
+  def __repr__(self):
+    return '%s(%s)' % (type(self).__name__, self.__str__())
+
   @staticmethod
   def _get_kind_name(param_type, is_list):
     """Returns the field name given parameter type and is_list.
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 16397622edd382bc6dcb12870de5fa22130a2c2b..11fd15b5275a3c00b85bf986b2ff1ba0e2638aed 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -38,40 +38,60 @@ class HParamsTest(test.TestCase):
     self.assertFalse('bar' in hparams)
 
   def testSomeValues(self):
-    hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6')
-    self.assertDictEqual({'aaa': 1, 'b': 2.0, 'c_c': 'relu6'}, hparams.values())
-    expected_str = '[(\'aaa\', 1), (\'b\', 2.0), (\'c_c\', \'relu6\')]'
+    hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d='/a/b=c/d')
+    self.assertDictEqual(
+        {'aaa': 1, 'b': 2.0, 'c_c': 'relu6', 'd': '/a/b=c/d'},
+        hparams.values())
+    expected_str = ('[(\'aaa\', 1), (\'b\', 2.0), (\'c_c\', \'relu6\'), '
+                    '(\'d\', \'/a/b=c/d\')]')
     self.assertEqual(expected_str, str(hparams.__str__()))
     self.assertEqual(expected_str, str(hparams))
     self.assertEqual(1, hparams.aaa)
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('relu6', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('aaa=12')
     self.assertDictEqual({
         'aaa': 12,
         'b': 2.0,
-        'c_c': 'relu6'
+        'c_c': 'relu6',
+        'd': '/a/b=c/d'
     }, hparams.values())
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('relu6', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('c_c=relu4, b=-2.0e10')
     self.assertDictEqual({
         'aaa': 12,
         'b': -2.0e10,
-        'c_c': 'relu4'
+        'c_c': 'relu4',
+        'd': '/a/b=c/d'
     }, hparams.values())
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(-2.0e10, hparams.b)
     self.assertEqual('relu4', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('c_c=,b=0,')
-    self.assertDictEqual({'aaa': 12, 'b': 0, 'c_c': ''}, hparams.values())
+    self.assertDictEqual({'aaa': 12, 'b': 0, 'c_c': '', 'd': '/a/b=c/d'},
+                         hparams.values())
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(0.0, hparams.b)
     self.assertEqual('', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('c_c=2.3",b=+2,')
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('2.3"', hparams.c_c)
+    hparams.parse('d=/a/b/c/d,aaa=11,')
+    self.assertEqual(11, hparams.aaa)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('2.3"', hparams.c_c)
+    self.assertEqual('/a/b/c/d', hparams.d)
+    hparams.parse('b=1.5,d=/a=b/c/d,aaa=10,')
+    self.assertEqual(10, hparams.aaa)
+    self.assertEqual(1.5, hparams.b)
+    self.assertEqual('2.3"', hparams.c_c)
+    self.assertEqual('/a=b/c/d', hparams.d)
     with self.assertRaisesRegexp(ValueError, 'Unknown hyperparameter'):
       hparams.parse('x=123')
     with self.assertRaisesRegexp(ValueError, 'Could not parse'):
@@ -84,17 +104,19 @@ class HParamsTest(test.TestCase):
       hparams.parse('b=relu')
     with self.assertRaisesRegexp(ValueError, 'Must not pass a list'):
       hparams.parse('aaa=[123]')
-    self.assertEqual(12, hparams.aaa)
-    self.assertEqual(2.0, hparams.b)
+    self.assertEqual(10, hparams.aaa)
+    self.assertEqual(1.5, hparams.b)
     self.assertEqual('2.3"', hparams.c_c)
+    self.assertEqual('/a=b/c/d', hparams.d)
     # Exports to proto.
     hparam_def = hparams.to_proto()
     # Imports from proto.
     hparams2 = hparam.HParams(hparam_def=hparam_def)
     # Verifies that all hparams are restored.
-    self.assertEqual(12, hparams2.aaa)
-    self.assertEqual(2.0, hparams2.b)
+    self.assertEqual(10, hparams2.aaa)
+    self.assertEqual(1.5, hparams2.b)
     self.assertEqual('2.3"', hparams2.c_c)
+    self.assertEqual('/a=b/c/d', hparams2.d)
 
   def testSetFromMap(self):
     hparams = hparam.HParams(a=1, b=2.0, c='tanh')
@@ -417,6 +439,22 @@ class HParamsTest(test.TestCase):
     self.assertEqual(123, hparams.get('unknown', 123))
     self.assertEqual([1, 2, 3], hparams.get('unknown', [1, 2, 3]))
 
+  def testDel(self):
+    hparams = hparam.HParams(aaa=1, b=2.0)
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('aaa', 'will fail')
+
+    with self.assertRaises(ValueError):
+      hparams.add_hparam('aaa', 'will fail')
+
+    hparams.del_hparam('aaa')
+    hparams.add_hparam('aaa', 'will work')
+    self.assertEqual('will work', hparams.get('aaa'))
+
+    hparams.set_hparam('aaa', 'still works')
+    self.assertEqual('still works', hparams.get('aaa'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/training/python/training/resample.py b/tensorflow/contrib/training/python/training/resample.py
index b16159bc16bb5f3ccd9b5cc7f4af64d9c24b22af..7b8332b1d672dfdfb233e24c0e4f760e49382bc0 100644
--- a/tensorflow/contrib/training/python/training/resample.py
+++ b/tensorflow/contrib/training/python/training/resample.py
@@ -77,7 +77,7 @@ def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False):
 
   Args:
     inputs: A list of tensors, each of which has a shape of `[batch_size, ...]`
-    rates: A tensor of shape `[batch_size]` contiaining the resampling rates
+    rates: A tensor of shape `[batch_size]` containing the resampling rates
        for each input.
     scope: Scope for the op.
     seed: Random seed to use.
diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py
index ba888f87dc8c12788c1160f7199c0b762be186c1..7140f2a46d57f0f3b76ff4f1ea9d0d73808405c8 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops.py
@@ -123,7 +123,7 @@ def rejection_sample(tensors,
         batch_size=batch_size,
         num_threads=queue_threads)
 
-    # Queues return a single tensor if the list of enqued tensors is one. Since
+    # Queues return a single tensor if the list of enqueued tensors is one. Since
     # we want the type to always be the same, always return a list.
     if isinstance(minibatch, ops.Tensor):
       minibatch = [minibatch]
@@ -312,7 +312,7 @@ def _verify_input(tensor_list, labels, probs_list):
   """Verify that batched inputs are well-formed."""
   checked_probs_list = []
   for probs in probs_list:
-    # Since number of classes shouldn't change at runtime, probalities shape
+    # Since number of classes shouldn't change at runtime, probabilities shape
     # should be fully defined.
     probs.get_shape().assert_is_fully_defined()
 
@@ -407,7 +407,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs):
   ```
 
 
-  A solution for a_i in terms of the other variabes is the following:
+  A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
   """
   # Make list of t_i / p_i.
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 72231948856b38edd3d022a99a62e6d4c8c5649e..39d75a080604e3a7ae93391652d4c03be9857218 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -876,7 +876,7 @@ class SequenceQueueingStateSaver(object):
         ]):
           self._length = array_ops.identity(self._length)
 
-        # Only create barrier; enqueu and dequeue operations happen when you
+        # Only create barrier; enqueue and dequeue operations happen when you
         # access prefetch_op and next_batch.
         self._create_barrier()
         self._scope = scope
@@ -1574,8 +1574,9 @@ def _padding(sequences, num_unroll):
   if not sequences:
     return 0, {}
 
-  sequences_dict = {}
-  for key, value in sequences.items():
+  # Sort 'sequences_dict' so 'length' will have a predictable value below.
+  sequences_dict = collections.OrderedDict()
+  for key, value in sorted(sequences.items()):
     if not (isinstance(value, sparse_tensor.SparseTensor) or
             isinstance(value, sparse_tensor.SparseTensorValue)):
       sequences_dict[key] = ops.convert_to_tensor(value)
@@ -1636,7 +1637,7 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
 
   For `key, value` pairs in `input_context` with `SparseTensor` `value` removes
   them from `input_context` and transforms the `value` into a sequence and
-  then adding `key`, transformed `value` into `input_seuqences`.
+  then adding `key`, transformed `value` into `input_sequences`.
   The transformation is done by adding a new first dimension of `value_length`
   equal to that of the other values in input_sequences` and tiling the `value`
   every `num_unroll` steps.
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index 6c766e4f1c04fd9cca0b6e03382737d42b6fda20..d9ccda8e89a4c9a1b3f3d24915b9ad3fb4d9be5f 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -75,15 +75,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/util/loader.py b/tensorflow/contrib/util/loader.py
index f4283cd9ed6eb185ec56bdfef45f6ddc5253d128..dca01d26f4c6291074d855322c8215027cfd2ace 100644
--- a/tensorflow/contrib/util/loader.py
+++ b/tensorflow/contrib/util/loader.py
@@ -42,9 +42,10 @@ def load_op_library(path):
     plugin.
   """
   if os.name == 'nt':
-    # To avoid makeing every user_ops aware of windows, re-write
-    # the file extension from .so to .dll.
-    path = re.sub(r'\.so$', '.dll', path)
+    # To avoid making every user_ops aware of windows, re-write
+    # the file extension from .so to .dll if .so file doesn't exist.
+    if not os.path.exists(path):
+      path = re.sub(r'\.so$', '.dll', path)
 
     # Currently we have only some user_ops as dlls on windows - don't try
     # to load them if the dll is not found.
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 80a5d07ea43531ed2532443b6ff9327b9ece6df7..9720fd6e8657de18cf8d7565f834568ae52fdbda 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -11,18 +11,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 58fed4e5cb4c24b0f21dfe9b99cf4c665d2591c7..4b6104a8b4d542b1d8a9cb3e48eeed4950d791cd 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -93,7 +93,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
 
 1. When the sender receives a tensor request, the source tensor may or may not be ready yet. The situation is handled through a process of tag matching:
 	* If the request arrives before the tensor is ready, then a callback is put in a local table, and will be invoked once the tensor arrives.
-	* If the tensor is ready before the request arives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediately.
+	* If the tensor is ready before the request arrives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediately.
    In code it is done by calling **RecvLocalAsync()**, which receives the tensor's key, step-id, and the callback.
 2. When the callback is invoked, the relevant tensor is removed from the tag matching table. In the case where we need to send the tensor's meta-data, the **RdmaTensorResponse** will store a copy of the tensor until the re-request arrives.
 3. The sending of protocol messages (**RDMA_MESSAGE_TENSOR_REQUEST**, **RDMA_MESSAGE_META_DATA_RESPONSE** and **RDMA_MESSAGE_TENSOR_RE_REQUEST**) is done by the class **RdmaMessageBuffer**. All messages are sent using RDMA writes from/to fixed messages buffers. This implies that we cannot send on a specific channel more than one message at a time. In order to synchronize the messages, the **RdmaMessageBuffer** holds the a local and remote buffer statuses which can be either busy or idle. When a write is issued, both statuses will be changed to busy. When the write-complete event is received, the local status is changed to idle. When the write is received on the remote side, the remote side will parse the message, and return an ACK back to the sending side on which the sending side will update the remote status to idle. When both the local and remote statuses are idle, the next message can be sent.
diff --git a/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
index 956b8f2147cf8154b6f1ade006d7bff194864c9b..da6fdd48e19e9d1503d1537926b1c464a0e77589 100644
--- a/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
+++ b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
@@ -64,7 +64,7 @@ The protocol messages themselves will remain mostly unchanged at the first stage
 	* type - The message type.
 	* request_index - Request index.
 	* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
-* **RDMA_MESSAGE_BUFFER_RESPONSE** - (receiver ==> sender) Tensor re-requset after meta-data update and reallocation of result/proxy tensors.
+* **RDMA_MESSAGE_BUFFER_RESPONSE** - (receiver ==> sender) Tensor re-request after meta-data update and reallocation of result/proxy tensors.
 	* type - The message type.
 	* name (name_size) - Name of the requested tensor.
 	* step_id - Step ID.
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 94203ee2b3654bffe82d203cde8780a64f63ba2a..c9df6beb6b1d67f14d26d7f0420cb53b6347bf99 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -262,7 +262,7 @@ class RdmaTensorRequest {
   // Receive tensor content (RDMA write was completed).
   //
   // Decode proto if required and/or move to GPU if the content was not
-  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // written to it directly (GPU direct is not available). Afterwards,
   // invoke Done().
   void RecvTensorContent();
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 30ac270109dbbb77cd6a400d1feaa1ac116456c1..4b86d6ef4752b191d04114a5ee5b547c644debcb 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -100,6 +100,8 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 # For platform specific build config
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_platform_hdrs",
+    "tf_platform_srcs",
     "tf_proto_library",
     "tf_proto_library_cc",
     "tf_additional_all_protos",
@@ -119,8 +121,6 @@ load(
     "tf_additional_libdevice_srcs",
     "tf_additional_test_deps",
     "tf_additional_test_srcs",
-    "tf_env_time_hdrs",
-    "tf_env_time_srcs",
     "tf_kernel_tests_linkstatic",
     "tf_additional_cloud_op_deps",
     "tf_additional_cloud_kernel_deps",
@@ -144,10 +144,15 @@ load(
     "tf_cuda_tests_tags",
     "if_static",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
+load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
+
+exports_files(["ops/ops.pbtxt"])
 
 # -----------------------------------------------------------------------------
 # Public targets
@@ -156,7 +161,8 @@ load(
 #
 # Note that some protos are in neither additional_core_proto_srcs nor this
 # filegroup; e.g.  ones with individual proto_library targets.
-CORE_PROTO_SRCS = [
+# LINT.IfChange
+COMMON_PROTO_SRCS = [
     "example/example.proto",
     "example/feature.proto",
     "framework/allocation_description.proto",
@@ -184,7 +190,6 @@ CORE_PROTO_SRCS = [
     "framework/types.proto",
     "framework/variable.proto",
     "framework/versions.proto",
-    "lib/core/error_codes.proto",
     "protobuf/config.proto",
     "protobuf/cluster.proto",
     "protobuf/debug.proto",
@@ -198,6 +203,13 @@ CORE_PROTO_SRCS = [
     "util/saved_tensor_slice.proto",
 ]
 
+ERROR_CODES_PROTO_SRCS = [
+    "lib/core/error_codes.proto",
+]
+# LINT.ThenChange(//tensorflow/core/android_proto_config.asciipb)
+
+CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
+
 # Protos which are not needed on mobile builds, but should be included in
 # protos_all.
 #
@@ -205,6 +217,7 @@ CORE_PROTO_SRCS = [
 # ones with individual proto_library targets.
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
+    "protobuf/checkpointable_object_graph.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
@@ -217,13 +230,16 @@ ADDITIONAL_CORE_PROTO_SRCS = [
 
 tf_proto_library(
     name = "protos_all",
-    srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
+    srcs = [],
     cc_api_version = 2,
     default_header = True,
-    go_api_version = 2,
     j2objc_api_version = 1,
     java_api_version = 2,
     js_api_version = 2,
+    protodeps = [
+        ":protos_all_proto",
+        ":error_codes_proto",
+    ],
     visibility = ["//visibility:public"],
 )
 
@@ -242,6 +258,15 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
+proto_library(
+    name = "example_protos",
+    srcs = [
+        "example/example.proto",
+        "example/feature.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 exports_files([
     "framework/types.proto",
 ])
@@ -254,7 +279,7 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
-# Minimal lib to detect plafrom
+# Minimal lib to detect platform
 cc_library(
     name = "lib_platform",
     hdrs = [
@@ -262,11 +287,66 @@ cc_library(
     ],
 )
 
+PLATFORM_BASE_HDRS = [
+    "platform/env_time.h",
+    "platform/logging.h",
+    "platform/macros.h",
+    "platform/types.h",
+    "platform/byte_order.h",
+]
+
+PLATFORM_OTHER_HDRS = [
+    "platform/abi.h",
+    "platform/stacktrace.h",
+    "platform/stacktrace_handler.h",
+    "platform/context.h",
+    "platform/cpu_info.h",
+    "platform/cpu_feature_guard.h",
+    "platform/dynamic_annotations.h",
+    "platform/env.h",
+    "platform/file_system.h",
+    "platform/file_system_helper.h",
+    "platform/fingerprint.h",
+    "platform/init_main.h",
+    "platform/mem.h",
+    "platform/mutex.h",
+    "platform/net.h",
+    "platform/notification.h",
+    "platform/null_file_system.h",
+    "platform/prefetch.h",
+    "platform/profile_utils/clock_cycle_profiler.h",
+    "platform/profile_utils/cpu_utils.h",
+    "platform/protobuf.h",
+    "platform/strong_hash.h",
+    "platform/subprocess.h",
+    "platform/thread_annotations.h",
+]
+
+# Smaller platform libraries that don't depend on "lib" or "lib_internal".
+cc_library(
+    name = "platform_base",
+    srcs = tf_platform_hdrs([
+        "integral_types.h",
+        "logging.h",
+    ]) + tf_platform_srcs([
+        "logging.cc",
+        "env_time.cc",
+    ]) + [
+        "platform/env_time.cc",
+    ],
+    hdrs = PLATFORM_BASE_HDRS,
+    copts = tf_copts(),
+    deps = [
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:base",
+    ],
+)
+
 # Minimal lib so that tools used for mobile compilation
 # don't have to depend on lib/platformlib.
 cc_library(
     name = "lib_proto_parsing",
-    srcs = glob(tf_additional_proto_srcs()) + tf_env_time_srcs(),
+    srcs = glob(tf_additional_proto_srcs()),
     hdrs = [
         "lib/core/errors.h",
         "lib/core/status.h",
@@ -281,9 +361,12 @@ cc_library(
         "platform/types.h",
         "platform/windows/cpu_info.h",
         "lib/bfloat16/bfloat16.h",
-    ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
+    ] + tf_additional_proto_hdrs(),
     copts = tf_copts(),
-    deps = tf_lib_proto_parsing_deps(),
+    deps = tf_lib_proto_parsing_deps() + [
+        ":platform_base",
+        "@double_conversion//:double-conversion",
+    ],
 )
 
 # This build rule (along with :lib_internal, :framework, and
@@ -292,7 +375,8 @@ cc_library(
 # tf_cc_test and tf_cc_binary will include the necessary symbols.
 cc_library(
     name = "lib",
-    hdrs = [
+    hdrs = PLATFORM_BASE_HDRS +
+           PLATFORM_OTHER_HDRS + [
         "lib/bfloat16/bfloat16.h",
         "lib/core/arena.h",
         "lib/core/bitmap.h",
@@ -314,6 +398,7 @@ cc_library(
         "lib/gtl/optional.h",
         "lib/gtl/priority_queue_util.h",
         "lib/hash/crc32c.h",
+        "lib/hash/hash.h",
         "lib/histogram/histogram.h",
         "lib/io/buffered_inputstream.h",
         "lib/io/compression.h",
@@ -338,31 +423,6 @@ cc_library(
         "lib/strings/str_util.h",
         "lib/strings/strcat.h",
         "lib/strings/stringprintf.h",
-        "platform/abi.h",
-        "platform/cpu_feature_guard.h",
-        "platform/cpu_info.h",
-        "platform/dynamic_annotations.h",
-        "platform/env.h",
-        "platform/env_time.h",
-        "platform/file_system.h",
-        "platform/fingerprint.h",
-        "platform/init_main.h",
-        "platform/logging.h",
-        "platform/macros.h",
-        "platform/mem.h",
-        "platform/mutex.h",
-        "platform/net.h",
-        "platform/notification.h",
-        "platform/prefetch.h",
-        "platform/profile_utils/clock_cycle_profiler.h",
-        "platform/profile_utils/cpu_utils.h",
-        "platform/protobuf.h",
-        "platform/stacktrace.h",
-        "platform/strong_hash.h",
-        "platform/subprocess.h",
-        "platform/thread_annotations.h",
-        "platform/types.h",
-        "platform/windows/cpu_info.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -377,13 +437,13 @@ cc_library(
 )
 
 cc_library(
-    name = "session_message",
-    srcs = ["util/session_message.cc"],
-    hdrs = ["util/session_message.h"],
+    name = "stacktrace",
+    srcs = glob(["platform/*/stacktrace.h"]),
+    hdrs = ["platform/stacktrace.h"],
     deps = [
-        ":framework",
-        ":lib",
-        ":protos_all_cc",
+        ":abi",
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:stacktrace",
     ],
 )
 
@@ -392,11 +452,23 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
     hdrs = ["platform/stacktrace_handler.h"],
     deps = [
-        ":lib",
+        ":abi",
         ":lib_platform",
+        ":stacktrace",
     ],
 )
 
+# Libraries that will eventually be moved into lib/core
+# Note that stringpiece_test can't be place here yet, because we are
+# required to use tf_cc_test, and that rule will change / into _
+cc_library(
+    name = "core_stringpiece",
+    srcs = ["lib/core/stringpiece.cc"],
+    hdrs = ["lib/core/stringpiece.h"],
+    copts = tf_copts(),
+    deps = [":platform_base"],
+)
+
 # Test support library needed for all tests
 # This is currently public, but may be made internal in the
 # future.  Try to avoid depending on it.
@@ -424,6 +496,27 @@ cc_library(
     ] + tf_additional_test_deps(),
 )
 
+# Testing libraries - lite versions that don't depend on all of "lib" or
+# "lib_internal". Instead, they only need a much smaller set of support
+# libraries such as ":platform_base" and ":core_stringpiece".
+cc_library(
+    name = "test_lite",
+    testonly = 1,
+    srcs = [
+        "platform/test.cc",
+    ],
+    hdrs = [
+        "platform/test.h",
+        "platform/test_benchmark.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        ":lib_platform",
+        ":platform_base",
+        "//tensorflow/core/platform/default/build_config:gtest",
+    ],
+)
+
 # This build rule (along with :framework_internal, :lib, and :lib_internal)
 # purposefully omits the definitions of many declared symbols, which are
 # included in //tensorflow:libtensorflow_framework.so. Using tf_cc_test and tf_cc_binary
@@ -441,6 +534,7 @@ tf_cuda_library(
         "framework/attr_value_util.h",
         "framework/bfloat16.h",
         "framework/cancellation.h",
+        "framework/collective.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
         "framework/dataset.h",
@@ -448,6 +542,7 @@ tf_cuda_library(
         "framework/device_base.h",
         "framework/function.h",
         "framework/graph_def_util.h",
+        "framework/graph_to_functiondef.h",
         "framework/kernel_def_builder.h",
         "framework/log_memory.h",
         "framework/lookup_interface.h",
@@ -471,6 +566,7 @@ tf_cuda_library(
         "framework/selective_registration.h",
         "framework/session_state.h",
         "framework/shape_inference.h",
+        "framework/stats_aggregator.h",
         "framework/tensor.h",
         "framework/tensor_shape.h",
         "framework/tensor_slice.h",
@@ -579,6 +675,7 @@ cc_library(
         "framework/tensor_types.h",
         "framework/type_traits.h",
         "lib/bfloat16/bfloat16.h",
+        "platform/byte_order.h",
         "platform/default/dynamic_annotations.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -592,6 +689,7 @@ cc_library(
         "platform/prefetch.h",
         "platform/thread_annotations.h",
         "platform/types.h",
+        "platform/cpu_info.h",
     ] + if_windows(["platform/windows/integral_types.h"]),
     visibility = ["//visibility:public"],
     deps =
@@ -609,12 +707,16 @@ tf_gen_op_libs(
     op_lib_names = [
         "batch_ops",
         "bitwise_ops",
+        "boosted_trees_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
+        "collective_ops",
         "control_flow_ops",
         "ctc_ops",
         "data_flow_ops",
         "dataset_ops",
+        "decode_proto_ops",
+        "encode_proto_ops",
         "function_ops",
         "functional_ops",
         "image_ops",
@@ -631,6 +733,8 @@ tf_gen_op_libs(
         "random_ops",
         "remote_fused_graph_ops",
         "resource_variable_ops",
+        "rpc_ops",
+        "scoped_allocator_ops",
         "sdca_ops",
         "set_ops",
         "script_ops",
@@ -684,6 +788,34 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "cudnn_rnn_ops",
+    srcs = [
+        "ops/cudnn_rnn_ops.cc",
+    ],
+    linkstatic = 1,
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//third_party/eigen3",
+        "@farmhash_archive//:farmhash",
+    ],
+    alwayslink = 1,
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "cudnn_rnn_ops",
+    ],
+    deps = [
+        ":lib",
+    ],
+)
+
 cc_library(
     name = "ops",
     visibility = ["//visibility:public"],
@@ -692,12 +824,17 @@ cc_library(
         ":audio_ops_op_lib",
         ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
+        ":boosted_trees_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
+        ":collective_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
+        ":cudnn_rnn_ops_op_lib",
         ":data_flow_ops_op_lib",
         ":dataset_ops_op_lib",
+        ":decode_proto_ops_op_lib",
+        ":encode_proto_ops_op_lib",
         ":function_ops_op_lib",
         ":functional_ops_op_lib",
         ":image_ops_op_lib",
@@ -714,11 +851,14 @@ cc_library(
         ":random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
+        ":rpc_ops_op_lib",
+        ":scoped_allocator_ops_op_lib",
         ":script_ops_op_lib",
         ":sdca_ops_op_lib",
         ":sendrecv_ops_op_lib",
         ":set_ops_op_lib",
         ":sparse_ops_op_lib",
+        ":summary_ops_op_lib",
         ":spectral_ops_op_lib",
         ":state_ops_op_lib",
         ":stateless_random_ops_op_lib",
@@ -830,12 +970,17 @@ cc_library(
         "//tensorflow/core/kernels:audio",
         "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
+        "//tensorflow/core/kernels:boosted_trees_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
+        "//tensorflow/core/kernels:collective_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
+        "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
         "//tensorflow/core/kernels:dataset_ops",
+        "//tensorflow/core/kernels:decode_proto_op",
+        "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:functional_ops",
@@ -852,11 +997,14 @@ cc_library(
         "//tensorflow/core/kernels:nn",
         "//tensorflow/core/kernels:parameterized_truncated_normal_op",
         "//tensorflow/core/kernels:parsing",
+        "//tensorflow/core/kernels:partitioned_function_ops",
         "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:random_poisson_op",
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
         "//tensorflow/core/kernels:resource_variable_ops",
+        "//tensorflow/core/kernels:rpc_op",
+        "//tensorflow/core/kernels:scoped_allocator_ops",
         "//tensorflow/core/kernels:sdca_ops",
         "//tensorflow/core/kernels:set_kernels",
         "//tensorflow/core/kernels:sparse",
@@ -886,6 +1034,9 @@ cc_library(
         "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
         "//tensorflow/core/kernels:mkl_aggregate_ops",
+    ]) + if_cuda([
+        "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels",
+        "//tensorflow/core/grappler/optimizers:gpu_swapping_ops",
     ]),
 )
 
@@ -926,6 +1077,7 @@ cc_library(
     hdrs = [
         "common_runtime/function_testlib.h",
         "common_runtime/kernel_benchmark_testlib.h",
+        "common_runtime/test_collective_executor_mgr.h",
         "framework/fake_input.h",
         "framework/function_testlib.h",
         "framework/shape_inference_testutil.h",
@@ -988,22 +1140,16 @@ filegroup(
 
 # Core sources for Android builds.
 filegroup(
-    name = "mobile_srcs",
+    name = "mobile_srcs_no_runtime",
     srcs = [
-        ":proto_text_srcs_all",
-        "//tensorflow/core/kernels:android_srcs",
+        ":protos_all_proto_text_srcs",
+        ":error_codes_proto_text_srcs",
         "//tensorflow/core/platform/default/build_config:android_srcs",
-        "//tensorflow/core/util/ctc:android_srcs",
-        "//tensorflow/core/util/tensor_bundle:android_srcs",
     ] + glob(
         [
             "client/**/*.cc",
-            "common_runtime/**/*.h",
-            "common_runtime/**/*.cc",
             "framework/**/*.h",
             "framework/**/*.cc",
-            "graph/**/*.h",
-            "graph/**/*.cc",
             "lib/**/*.h",
             "lib/**/*.cc",
             "platform/**/*.h",
@@ -1019,7 +1165,6 @@ filegroup(
             "**/*main.cc",
             "debug/**/*",
             "framework/op_gen_*",
-            "graph/dot.*",
             "lib/jpeg/**/*",
             "lib/png/**/*",
             "lib/gif/**/*",
@@ -1036,13 +1181,54 @@ filegroup(
             "platform/stream_executor.*",
             "platform/windows/**/*",
             "user_ops/**/*.cu.cc",
+            "util/ctc/*.h",
+            "util/ctc/*.cc",
+            "util/tensor_bundle/*.h",
+            "util/tensor_bundle/*.cc",
             "common_runtime/gpu/**/*",
+            "common_runtime/eager/*",
             "common_runtime/gpu_device_factory.*",
         ],
     ),
     visibility = ["//visibility:public"],
 )
 
+filegroup(
+    name = "mobile_srcs_only_runtime",
+    srcs = [
+        "//tensorflow/core/kernels:android_srcs",
+        "//tensorflow/core/util/ctc:android_srcs",
+        "//tensorflow/core/util/tensor_bundle:android_srcs",
+    ] + glob(
+        [
+            "common_runtime/**/*.h",
+            "common_runtime/**/*.cc",
+            "graph/**/*.h",
+            "graph/**/*.cc",
+        ],
+        exclude = [
+            "**/*test.*",
+            "**/*testutil*",
+            "**/*testlib*",
+            "**/*main.cc",
+            "common_runtime/gpu/**/*",
+            "common_runtime/eager/*",
+            "common_runtime/gpu_device_factory.*",
+            "graph/dot.*",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "mobile_srcs",
+    srcs = [
+        ":mobile_srcs_no_runtime",
+        ":mobile_srcs_only_runtime",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 # Native library support for Android applications.  Does not contain
 # operators, use :android_tensorflow_lib if you want full operator
 # support.
@@ -1067,6 +1253,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1106,6 +1293,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1169,6 +1357,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1191,6 +1380,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1370,6 +1560,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "protobuf/device_properties_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "protobuf/device_properties.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "protobuf/meta_graph_pyclif",
     proto_lib = ":protos_all_cc",
@@ -1378,9 +1575,9 @@ tf_pyclif_proto_library(
 )
 
 tf_pyclif_proto_library(
-    name = "protobuf/device_properties_pyclif",
+    name = "protobuf/saved_model_pyclif",
     proto_lib = ":protos_all_cc",
-    proto_srcfile = "protobuf/device_properties.proto",
+    proto_srcfile = "protobuf/saved_model.proto",
     visibility = ["//visibility:public"],
 )
 
@@ -1431,6 +1628,18 @@ tf_proto_library_cc(
     ],
 )
 
+tf_proto_library_cc(
+    name = "eager_service_proto",
+    srcs = ["protobuf/eager_service.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_stubby_versions = ["2"],
+    protodeps = tf_additional_all_protos(),
+    visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
 LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob(
     [
         "lib/**/*.h",
@@ -1486,6 +1695,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
     "lib/strings/base64.h",
     "lib/strings/ordered_code.h",
     "lib/strings/proto_text_util.h",
+    "lib/strings/proto_serialization.h",
     "lib/strings/scanner.h",
     "lib/wav/wav_io.h",
     "platform/demangle.h",
@@ -1539,23 +1749,29 @@ cc_library(
         exclude = [
             "**/*test*",
             "framework/variant.cc",
+            "lib/core/stringpiece.cc",
             "lib/hash/crc32c_accelerate.cc",
             "lib/gif/**/*",
             "lib/jpeg/**/*",
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
+            "platform/**/logging.cc",
+            "platform/abi.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ],
     ) + tf_additional_lib_srcs(
         exclude = [
             "**/*test*",
+            "lib/core/stringpiece.cc",
             "platform/**/cuda.h",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
+            "platform/**/logging.cc",
+            "platform/abi.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ] +
@@ -1569,10 +1785,13 @@ cc_library(
     deps = tf_additional_lib_deps() + [
         ":lib_hash_crc32c_accelerate_internal",
         ":lib_proto_parsing",
+        ":abi",
+        ":core_stringpiece",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
         "@zlib_archive//:zlib",
+        "@double_conversion//:double-conversion",
         "@protobuf_archive//:protobuf",
     ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
 )
@@ -1631,6 +1850,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tflite_portable_logging",
+    srcs = [],
+    hdrs = [
+        "lib/bfloat16/bfloat16.h",
+        "platform/default/integral_types.h",
+        "platform/default/logging.h",
+        "platform/logging.h",
+        "platform/macros.h",
+        "platform/platform.h",
+        "platform/types.h",
+    ],
+    copts = tf_copts(),
+    linkopts = ["-ldl"],
+    deps = [
+        "//tensorflow/core/platform/default/build_config:logging",
+    ],
+)
+
 cc_library(
     name = "android_jpeg_internal",
     srcs = if_android([
@@ -1699,6 +1937,7 @@ cc_library(
         "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
+        "platform/byte_order.h",
         "platform/cpu_info.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -1714,15 +1953,58 @@ cc_library(
     ],
 )
 
-proto_text_hdrs_and_srcs = tf_generate_proto_text_sources(
-    name = "proto_text_srcs_all",
-    srcs = CORE_PROTO_SRCS,
+tf_proto_library(
+    name = "error_codes_proto",
+    srcs = ERROR_CODES_PROTO_SRCS,
+    cc_api_version = 2,
+    default_header = True,
+    j2objc_api_version = 1,
+    java_api_version = 2,
+    js_api_version = 2,
+)
+
+tf_generate_proto_text_sources(
+    name = "error_codes_proto_text",
+    srcs = ERROR_CODES_PROTO_SRCS,
+    protodeps = [],
+    srcs_relative_dir = "tensorflow/core/",
+    deps = [
+        ":error_codes_proto_cc",
+        ":lib_internal",
+    ],
+)
+
+tf_proto_library(
+    name = "protos_all_proto",
+    srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
+    cc_api_version = 2,
+    default_header = True,
+    j2objc_api_version = 1,
+    java_api_version = 2,
+    js_api_version = 2,
+    protodeps = [
+        ":error_codes_proto",
+    ],
+)
+
+tf_generate_proto_text_sources(
+    name = "protos_all_proto_text",
+    srcs = COMMON_PROTO_SRCS,
+    protodeps = ERROR_CODES_PROTO_SRCS,
     srcs_relative_dir = "tensorflow/core/",
+    deps = [
+        ":error_codes_proto_text",
+        ":lib_internal",
+        ":protos_all_proto_cc",
+    ],
 )
 
 cc_library(
     name = "proto_text",
-    hdrs = proto_text_hdrs_and_srcs.hdrs,
+    hdrs = [
+        ":error_codes_proto_text_hdrs",
+        ":protos_all_proto_text_hdrs",
+    ],
     deps = [
         ":lib",
         ":lib_internal",
@@ -1821,6 +2103,13 @@ cc_header_only_library(
     ],
 )
 
+cc_header_only_library(
+    name = "core_cpu_headers_lib",
+    deps = [
+        ":core_cpu_lib",
+    ],
+)
+
 tf_cuda_library(
     name = "framework_internal_impl",
     srcs = FRAMEWORK_INTERNAL_PRIVATE_HEADERS + [
@@ -1850,7 +2139,6 @@ tf_cuda_library(
             "framework/resource_handle.cc",
             "util/memmapped_file_system.*",
             "util/memmapped_file_system_writer.*",
-            "util/session_message.cc",
             "util/version_info.cc",
         ],
     ) + select({
@@ -1860,7 +2148,7 @@ tf_cuda_library(
             "util/memmapped_file_system.cc",
             "util/memmapped_file_system_writer.cc",
         ],
-    }) + proto_text_hdrs_and_srcs.srcs + tf_additional_framework_srcs(),
+    }) + tf_additional_framework_srcs(),
     hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS,
     copts = tf_copts(),
     linkopts = select({
@@ -1874,7 +2162,8 @@ tf_cuda_library(
     deps = [
         ":lib",
         ":lib_internal",
-        ":proto_text",
+        ":protos_all_proto_text",
+        ":error_codes_proto_text",
         ":protos_all_cc",
         ":version_lib",
         "//tensorflow/core/platform/default/build_config:platformlib",
@@ -1886,7 +2175,7 @@ tf_cuda_library(
     ) + if_mkl(
         [
             "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn//:mkl_dnn",
+            "@mkl_dnn",
         ],
     ),
     alwayslink = 1,
@@ -1898,7 +2187,6 @@ cc_header_only_library(
     deps = [
         ":framework",
         ":reader_base",
-        "@nsync//:nsync_headers",
     ],
 )
 
@@ -2005,14 +2293,21 @@ tf_cuda_library(
 
 CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
     "common_runtime/device.h",
+    "common_runtime/device_factory.h",
+    "common_runtime/device_mgr.h",
+    "common_runtime/device_set.h",
+    "common_runtime/eval_const_tensor.h",
     "common_runtime/graph_runner.h",
     "common_runtime/shape_refiner.h",
     "framework/versions.h",
+    "common_runtime/process_function_library_runtime.h",
+    "common_runtime/function.h",
 ]
 
 tf_cuda_library(
     name = "core_cpu_base",
     srcs = [
+        "common_runtime/eval_const_tensor.cc",
         "common_runtime/shape_refiner.cc",
         "common_runtime/shape_refiner.h",
         "framework/versions.h",
@@ -2046,33 +2341,39 @@ tf_cuda_library(
 
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
+    "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/broadcaster.h",
+    "common_runtime/buf_rendezvous.h",
     "common_runtime/build_graph_options.h",
+    "common_runtime/collective_executor_mgr.h",
+    "common_runtime/collective_param_resolver_local.h",
+    "common_runtime/collective_rma_local.h",
     "common_runtime/constant_folding.h",
     "common_runtime/copy_tensor.h",
     "common_runtime/costmodel_manager.h",
     "common_runtime/debugger_state_interface.h",
-    "common_runtime/device_factory.h",
-    "common_runtime/device_mgr.h",
-    "common_runtime/device_set.h",
+    "common_runtime/device_resolver_local.h",
     "common_runtime/dma_helper.h",
     "common_runtime/eigen_thread_pool.h",
     "common_runtime/executor.h",
-    "common_runtime/function.h",
     "common_runtime/graph_optimizer.h",
     "common_runtime/local_device.h",
     "common_runtime/memory_types.h",
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
-    "common_runtime/process_function_library_runtime.h",
+    "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
     "common_runtime/renamed_device.h",
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
+    "common_runtime/ring_reducer.h",
+    "common_runtime/scoped_allocator.h",
+    "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
-    "common_runtime/placer.h",
+    "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
@@ -2086,8 +2387,14 @@ tf_cuda_library(
     srcs = [
         "common_runtime/accumulate_n_optimizer.cc",
         "common_runtime/allocator_retry.cc",
+        "common_runtime/base_collective_executor.cc",
         "common_runtime/bfc_allocator.cc",
+        "common_runtime/broadcaster.cc",
+        "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
+        "common_runtime/collective_executor_mgr.cc",
+        "common_runtime/collective_param_resolver_local.cc",
+        "common_runtime/collective_rma_local.cc",
         "common_runtime/constant_folding.cc",
         "common_runtime/copy_tensor.cc",
         "common_runtime/costmodel_manager.cc",
@@ -2095,6 +2402,7 @@ tf_cuda_library(
         "common_runtime/device.cc",
         "common_runtime/device_factory.cc",
         "common_runtime/device_mgr.cc",
+        "common_runtime/device_resolver_local.cc",
         "common_runtime/device_set.cc",
         "common_runtime/executor.cc",
         "common_runtime/function.cc",
@@ -2102,6 +2410,7 @@ tf_cuda_library(
         "common_runtime/graph_runner.cc",
         "common_runtime/local_device.cc",
         "common_runtime/memory_types.cc",
+        "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
         "common_runtime/parallel_concat_optimizer.cc",
         "common_runtime/placer.cc",
@@ -2110,6 +2419,9 @@ tf_cuda_library(
         "common_runtime/renamed_device.cc",
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
+        "common_runtime/ring_reducer.cc",
+        "common_runtime/scoped_allocator.cc",
+        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/session.cc",
         "common_runtime/session_factory.cc",
         "common_runtime/session_options.cc",
@@ -2141,6 +2453,7 @@ tf_cuda_library(
     ] + if_mkl(
         [
             "//third_party/mkl:intel_binary_blob",
+            "@mkl_dnn",
         ],
     ),
     alwayslink = 1,
@@ -2185,14 +2498,12 @@ tf_cuda_library(
     ] + if_mkl(
         [
             "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn//:mkl_dnn",
+            "@mkl_dnn",
         ],
     ) + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
     alwayslink = 1,
 )
 
-# This library is deprecated and no longer publicly available.
-# Do not add more uses of it.
 cc_library(
     name = "regexp_internal",
     hdrs = [
@@ -2264,6 +2575,19 @@ tf_cuda_library(
 
 cc_library(
     name = "gpu_id",
+    hdrs = [
+        "common_runtime/gpu/gpu_id.h",
+        "common_runtime/gpu/gpu_id_manager.h",
+    ],
+    deps = [
+        ":lib",
+    ] + if_static([
+        ":gpu_id_impl",
+    ]),
+)
+
+cc_library(
+    name = "gpu_id_impl",
     srcs = ["common_runtime/gpu/gpu_id_manager.cc"],
     hdrs = [
         "common_runtime/gpu/gpu_id.h",
@@ -2313,7 +2637,7 @@ tf_cuda_library(
         ":core_cpu_lib",
         ":framework",
         ":framework_internal",
-        ":gpu_id",
+        ":gpu_id_impl",
         ":gpu_init_impl",
         ":gpu_lib",
         ":graph",
@@ -2471,6 +2795,26 @@ cc_library(
     alwayslink = 1,
 )
 
+# This is the lite version of a main() for tests. It does not include any
+# support for reporting benchmark results when running on TPUs.
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    srcs = ["platform/test_main.cc"],
+    copts = tf_copts(),
+    deps = [
+        # TODO(ahentz): we don't want to depend on "lib" here. It used to be
+        # that "core_stringpiece" was enough but that recently changed and
+        # we now need at least "str_util".
+        ":lib",
+        ":lib_platform",
+        ":stacktrace_handler",
+        ":test_lite",
+        "//tensorflow/core/platform/default/build_config:test_lite_main",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_tests(
     name = "low_level_library_tests",
     size = "small",
@@ -2519,7 +2863,6 @@ tf_cc_tests(
         "lib/monitoring/sampler_test.cc",
         "lib/random/distribution_sampler_test.cc",
         "lib/random/philox_random_test.cc",
-        "lib/random/random_distributions_test.cc",
         "lib/random/random_test.cc",
         "lib/random/simple_philox_test.cc",
         "lib/strings/base64_test.cc",
@@ -2549,6 +2892,21 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "lib_random_random_distributions_test",
+    srcs = ["lib/random/random_distributions_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "platform_env_test",
     size = "small",
@@ -2700,6 +3058,11 @@ tf_cc_tests(
     name = "higher_level_tests",
     size = "small",
     srcs = [
+        "common_runtime/buf_rendezvous_test.cc",
+        "common_runtime/collective_executor_mgr_test.cc",
+        "common_runtime/collective_param_resolver_local_test.cc",
+        "common_runtime/collective_rma_local_test.cc",
+        "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
         "common_runtime/optimization_registry_test.cc",
         "common_runtime/pending_counts_test.cc",
@@ -2713,6 +3076,7 @@ tf_cc_tests(
         "framework/common_shape_fns_test.cc",
         "framework/function_test.cc",
         "framework/graph_def_util_test.cc",
+        "framework/graph_to_functiondef_test.cc",
         "framework/kernel_def_builder_test.cc",
         "framework/memory_types_test.cc",
         "framework/node_def_builder_test.cc",
@@ -2791,6 +3155,8 @@ tf_cc_tests(
         ":testlib",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
         "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/cc:while_loop",
@@ -2835,6 +3201,79 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "cudnn_rnn_ops_test_cc",
+    size = "small",
+    srcs = [
+        "ops/cudnn_rnn_ops_test.cc",
+    ],
+    deps = [
+        ":cudnn_rnn_ops",
+        "//tensorflow/core",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+tf_cc_tests_gpu(
+    name = "ring_reducer_test",
+    size = "medium",
+    srcs = [
+        "common_runtime/ring_reducer_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
+tf_cc_tests_gpu(
+    name = "broadcaster_test",
+    size = "small",
+    srcs = [
+        "common_runtime/broadcaster_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
@@ -3095,6 +3534,7 @@ tf_cc_test(
         ":core_cpu",
         ":core_cpu_internal",
         ":framework",
+        ":lib",
         ":test",
         ":test_main",
         ":testlib",
@@ -3105,6 +3545,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_process_util_test",
+    size = "small",
+    srcs = ["common_runtime/process_util_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":core_cpu_internal",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "common_runtime_rendezvous_util_test",
     size = "small",
@@ -3143,6 +3595,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:dense_update_ops",
         "//tensorflow/core/kernels:fifo_queue_op",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:identity_n_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:matmul_op",
         "//tensorflow/core/kernels:ops_util",
@@ -3185,6 +3638,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:fifo_queue_op",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:identity_op",
+        "//tensorflow/core/kernels:identity_n_op",
         "//tensorflow/core/kernels:matmul_op",
         "//tensorflow/core/kernels:ops_util",
         "//tensorflow/core/kernels:queue_ops",
@@ -3259,6 +3713,10 @@ tf_cc_test(
     size = "small",
     srcs = ["common_runtime/function_test.cc"],
     linkstatic = tf_kernel_tests_linkstatic(),
+    tags = [
+        "manual",
+        "no_oss",
+    ],
     deps = [
         ":core",
         ":core_cpu",
@@ -3287,6 +3745,54 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_function_threadpool_test",
+    size = "small",
+    srcs = ["common_runtime/function_threadpool_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
+        "//tensorflow/core/kernels:cast_op",
+        "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:random_ops",
+        "//tensorflow/core/kernels:shape_ops",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_cc_test(
+    name = "common_runtime_scoped_allocator_mgr_test",
+    size = "small",
+    srcs = ["common_runtime/scoped_allocator_mgr_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":lib",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test_gpu(
     name = "gpu_allocator_retry_test",
     size = "medium",
@@ -3484,6 +3990,7 @@ tf_cc_tests(
         "ops/parsing_ops_test.cc",
         "ops/random_ops_test.cc",
         "ops/set_ops_test.cc",
+        "ops/shape_function_test.cc",
         "ops/sparse_ops_test.cc",
         "ops/spectral_ops_test.cc",
         "ops/state_ops_test.cc",
@@ -3590,6 +4097,13 @@ filegroup(
         "lib/gif/testdata/optimized.gif",
         # BMP data
         "lib/bmp/testdata/lena.bmp",
+        # SSIM, PSNR data
+        "lib/ssim/testdata/checkerboard1.png",
+        "lib/ssim/testdata/checkerboard2.png",
+        "lib/ssim/testdata/checkerboard3.png",
+        "lib/psnr/testdata/cat_q20.jpg",
+        "lib/psnr/testdata/cat_q72.jpg",
+        "lib/psnr/testdata/cat_q95.jpg",
     ],
     visibility = ["//visibility:public"],
 )
@@ -3630,16 +4144,16 @@ cc_library(
 # -----------------------------------------------------------------------------
 # Google-internal targets go here (must be at the end).
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+alias(
+    name = "android_srcs_no_runtime",
+    actual = ":mobile_srcs_no_runtime",
+    visibility = ["//visibility:public"],
+)
+
+alias(
+    name = "android_srcs_only_runtime",
+    actual = ":mobile_srcs_only_runtime",
+    visibility = ["//visibility:public"],
 )
 
 alias(
@@ -3647,3 +4161,9 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
+
+closure_proto_library(
+    name = "example_protos_closure",
+    visibility = ["//visibility:public"],
+    deps = [":example_protos"],
+)
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 58dbac4e8edac7079d315fbfcdafbd136793df0b..19d643880966f7607405539a5ad43d8e03dc13fb 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -17,18 +17,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "base_api_def",
     srcs = glob(["base_api/*"]),
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..145d05de59a16ed67af4b978e72593ae88da3d05
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AdaMax algorithm."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+v_t <- max(beta2 * v_{t-1}, abs(g))
+variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index c2858a1bfbb5a30e5e72bcfad694f696e08d2346..b90f5473c89cbe3afe38f0283025e7273817d0e4 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3f181e91ce48c4d070eea620cb6b5d0039e5b3d2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -0,0 +1,93 @@
+op {
+  graph_op_name: "BoostedTreesCalculateBestGainsPerFeature"
+  visibility: HIDDEN
+  in_arg {
+    name: "node_id_range"
+    description: <<END
+A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+END
+  }
+  in_arg {
+    name: "stats_summary_list"
+    description: <<END
+A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+l1 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+l2 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "tree_complexity"
+    description: <<END
+adjustment to the gain, per leaf based.
+END
+  }
+  in_arg {
+    name: "min_node_weight"
+    description: <<END
+mininum avg of hessians in a node before required for the node to be considered for splitting.
+END
+  }
+  out_arg {
+    name: "node_ids_list"
+    description: <<END
+An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "gains_list"
+    description: <<END
+An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "thresholds_list"
+    description: <<END
+An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "left_node_contribs_list"
+    description: <<END
+A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "right_node_contribs_list"
+    description: <<END
+A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+END
+  }
+  attr {
+    name: "max_splits"
+    description: <<END
+the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+inferred from the size of `stats_summary_list`; the number of total features.
+END
+  }
+  summary: "Calculates gains for each feature and returns the best possible split information for the feature."
+  description: <<END
+The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+
+It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+
+In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+
+The length of output lists are all of the same length, `num_features`.
+The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+END
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aee73b910f0ae8b542b3741ceeeadb9624126a27
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "BoostedTreesCreateEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble resource to be created.
+END
+  }
+  in_arg {
+    name: "stamp_token"
+    description: <<END
+Token to use as the initial value of the resource stamp.
+END
+  }
+  in_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the tree ensemble.
+END
+  }
+  summary: "Creates a tree ensemble model and returns a handle to it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1602ba045b95db060295a6b05f17dd8a06924d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "BoostedTreesDeserializeEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  in_arg {
+    name: "stamp_token"
+    description: <<END
+Token to use as the new value of the resource stamp.
+END
+  }
+  in_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the ensemble.
+END
+  }
+  summary: "Deserializes a serialized tree ensemble config and replaces current tree"
+  description: <<END
+ensemble.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bce5639a2049dba897fe45680fe98fa45f76c24
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "BoostedTreesEnsembleResourceHandleOp"
+  visibility: HIDDEN
+  summary: "Creates a handle to a BoostedTreesEnsembleResource"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4377125224979a4a499750af19267005c0f19e59
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "BoostedTreesGetEnsembleStates"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  out_arg {
+    name: "stamp_token"
+    description: <<END
+Stamp token of the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "num_trees"
+    description: <<END
+The number of trees in the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "num_finalized_trees"
+    description: <<END
+The number of trees that were finished successfully.
+END
+  }
+  out_arg {
+    name: "num_attempted_layers"
+    description: <<END
+The number of layers we attempted to build (but not necessarily succeeded).
+END
+  }
+  out_arg {
+    name: "last_layer_nodes_range"
+    description: <<END
+Rank size 2 tensor that contains start and end ids of the nodes in the latest
+layer.
+END
+
+  }
+  summary: "Retrieves the tree ensemble resource stamp token, number of trees and growing statistics."
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc0856c900d1b1238d0641fef3f2f57b95a209fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "BoostedTreesMakeStatsSummary"
+  visibility: HIDDEN
+  in_arg {
+    name: "node_ids"
+    description: <<END
+int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
+END
+  }
+  in_arg {
+    name: "gradients"
+    description: <<END
+float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+END
+  }
+  in_arg {
+    name: "hessians"
+    description: <<END
+float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+END
+  }
+  in_arg {
+    name: "bucketized_features_list"
+    description: <<END
+int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+END
+  }
+  out_arg {
+    name: "stats_summary"
+    description: <<END
+output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
+END
+  }
+  attr {
+    name: "max_splits"
+    description: <<END
+int; the maximum number of splits possible in the whole tree.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+int; equals to the maximum possible value of bucketized feature.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+int; inferred from the size of bucketized_features_list; the number of features.
+END
+  }
+  summary: "Makes the summary of accumulated stats for the batch."
+  description: <<END
+The summary stats contains gradients and hessians accumulated into the corresponding node and bucket for each example.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60ad9b4640f77a840c9de4c86cc42766ec79d387
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "BoostedTreesPredict"
+  visibility: HIDDEN
+  in_arg {
+    name: "bucketized_features"
+    description: <<END
+A list of rank 1 Tensors containing bucket id for each
+feature.
+END
+  }
+  out_arg {
+    name: "logits"
+    description: <<END
+Output rank 2 Tensor containing logits for each example.
+END
+  }
+  attr {
+    name: "num_bucketized_features"
+    description: <<END
+Inferred.
+END
+  }
+  attr {
+    name: "logits_dimension"
+    description: <<END
+scalar, dimension of the logits, to be used for partial logits
+shape.
+END
+  }
+  summary: "Runs multiple additive regression ensemble predictors on input instances and"
+  description: <<END
+computes the logits. It is designed to be used during prediction.
+It traverses all the trees and calculates the final score for each instance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0b3688d8a3f23d24178343d5b5e6b3ab5741b9f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "BoostedTreesSerializeEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  out_arg {
+    name: "stamp_token"
+    description: <<END
+Stamp token of the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the ensemble.
+END
+  }
+  summary: "Serializes the tree ensemble to a proto."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f8a3639c9b715c646056ac79ed3f01cbb96330c3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -0,0 +1,63 @@
+op {
+  graph_op_name: "BoostedTreesTrainingPredict"
+  visibility: HIDDEN
+  in_arg {
+    name: "cached_tree_ids"
+    description: <<END
+Rank 1 Tensor containing cached tree ids which is the starting
+tree of prediction.
+END
+  }
+  in_arg {
+    name: "cached_node_ids"
+    description: <<END
+Rank 1 Tensor containing cached node id which is the starting
+node of prediction.
+END
+  }
+  in_arg {
+    name: "bucketized_features"
+    description: <<END
+A list of rank 1 Tensors containing bucket id for each
+feature.
+END
+  }
+  out_arg {
+    name: "partial_logits"
+    description: <<END
+Rank 2 Tensor containing logits update (with respect to cached
+values stored) for each example.
+END
+  }
+  out_arg {
+    name: "tree_ids"
+    description: <<END
+Rank 1 Tensor containing new tree ids for each example.
+END
+  }
+  out_arg {
+    name: "node_ids"
+    description: <<END
+Rank 1 Tensor containing new node ids in the new tree_ids.
+END
+  }
+  attr {
+    name: "num_bucketized_features"
+    description: <<END
+Inferred.
+END
+  }
+  attr {
+    name: "logits_dimension"
+    description: <<END
+scalar, dimension of the logits, to be used for partial logits
+shape.
+END
+  }
+  summary: "Runs multiple additive regression ensemble predictors on input instances and"
+  description: <<END
+computes the update to cached logits. It is designed to be used during training.
+It traverses the trees starting from cached tree id and cached node id and
+calculates the updates to be pushed to the cache.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cf486d087d5183dc9cfd234a9a566a62583cf70
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -0,0 +1,82 @@
+op {
+  graph_op_name: "BoostedTreesUpdateEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the ensemble variable.
+END
+  }
+  in_arg {
+    name: "feature_ids"
+    description: <<END
+Rank 1 tensor with ids for each feature. This is the real id of
+the feature that will be used in the split.
+END
+  }
+  in_arg {
+    name: "node_ids"
+    description: <<END
+List of rank 1 tensors representing the nodes for which this feature
+has a split.
+END
+  }
+  in_arg {
+    name: "gains"
+    description: <<END
+List of rank 1 tensors representing the gains for each of the feature's
+split.
+END
+  }
+  in_arg {
+    name: "thresholds"
+    description: <<END
+List of rank 1 tensors representing the thesholds for each of the
+feature's split.
+END
+  }
+  in_arg {
+    name: "left_node_contribs"
+    description: <<END
+List of rank 2 tensors with left leaf contribs for each of
+the feature's splits. Will be added to the previous node values to constitute
+the values of the left nodes.
+END
+  }
+  in_arg {
+    name: "right_node_contribs"
+    description: <<END
+List of rank 2 tensors with right leaf contribs for each
+of the feature's splits. Will be added to the previous node values to constitute
+the values of the right nodes.
+END
+  }
+  in_arg {
+    name: "max_depth"
+    description: <<END
+Max depth of the tree to build.
+END
+  }
+  in_arg {
+    name: "learning_rate"
+    description: <<END
+shrinkage const for each new tree.
+END
+  }
+  attr {
+    name: "pruning_mode"
+    description: <<END
+0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+Number of features that have best splits returned. INFERRED.
+END
+  }
+  summary: "Updates the tree ensemble by either adding a layer to the last tree being grown"
+  description: <<END
+or by starting a new tree.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..763760176a722051640d4497e280e11f871f8011
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "BroadcastTo"
+  in_arg {
+    name: "input"
+    description: <<END
+A Tensor to broadcast.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+An 1-D `int` Tensor. The shape of the desired output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor.
+END
+  }
+  summary: "Broadcast an array for a compatible shape."
+  description: <<END
+Broadcasting is the process of making arrays to have compatible shapes
+for arithmetic operations. Two shapes are compatible if for each
+dimension pair they are either equal or one of them is one. When trying
+to broadcast a Tensor to a shape, it starts with the trailing dimensions,
+and works its way forward.
+
+For example,
+```
+>>> x = tf.constant([1, 2, 3])
+>>> y = tf.broadcast_to(x, [3, 3])
+>>> sess.run(y)
+array([[1, 2, 3],
+       [1, 2, 3],
+       [1, 2, 3]], dtype=int32)
+```
+In the above example, the input Tensor with the shape of `[1, 3]`
+is broadcasted to output Tensor with shape of `[3, 3]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..803d8970ab78de347936a8dbbd2f39d8d9915f1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ClipByValue"
+  in_arg {
+    name: "t"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "clip_value_min"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The minimum value to clip by.
+END
+  }
+  in_arg {
+    name: "clip_value_max"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The maximum value to clip by.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A clipped `Tensor` with the same shape as input 't'.
+END
+  }
+  summary: "Clips tensor values to a specified min and max."
+  description: <<END
+Given a tensor `t`, this operation returns a tensor of the same type and
+shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+greater than `clip_value_max` are set to `clip_value_max`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CloseSummaryWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_CloseSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6fd7d93169306fdf5ca62d27635e1f86f37bc4d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CloseSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CloseSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88049bca365f4a738cac9975d0e14340e1ae401d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastRecv"
+  visibility: SKIP
+  summary: "Receives a tensor value broadcast from another device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ff70f5b178af117e694cf7e998423b5ea58ac5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastSend"
+  visibility: SKIP
+  summary: "Broadcasts a tensor value to one or more other devices."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10d9771d46d408d9c0414dab4ae5954a75dfc47e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveReduce"
+  visibility: SKIP
+  summary: "Mutually reduces multiple tensors of identical type and shape."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConsumeMutexLock.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConsumeMutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9db8274dea5d904dbbc687927673e0c7f7fa649
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConsumeMutexLock.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "ConsumeMutexLock"
+  in_arg {
+    name: "mutex_lock"
+    description: <<END
+A tensor returned by `MutexLock`.
+END
+  }
+  summary: "This op consumes a lock created by `MutexLock`."
+  description: <<END
+This op exists to consume a tensor created by `MutexLock` (other than
+direct control dependencies).  It should be the only that consumes the tensor,
+and will raise an error if it is not.  Its only purpose is to keep the
+mutex lock tensor alive until it is consumed by this op.
+
+**NOTE**: This operation must run on the same device as its input.  This may
+be enforced via the `colocate_with` mechanism.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CreateSummaryDbWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_CreateSummaryDbWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28da46a0f8e452f65d06a13c4b0d0b03b2a75757
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CreateSummaryDbWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryDbWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CreateSummaryFileWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_CreateSummaryFileWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2c4d37e5001681ffa733bf4726c6bea652029
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CreateSummaryFileWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryFileWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CriticalSectionOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_CriticalSectionOp.pbtxt
deleted file mode 100644
index 5027fa861e7d8914b1e8ae06cd1ffa2ed06b6ad2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_CriticalSectionOp.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-op {
-  graph_op_name: "CriticalSectionOp"
-  attr {
-    name: "container"
-    description: <<END
-the container this critical section is placed in.
-END
-  }
-  attr {
-    name: "shared_name"
-    description: <<END
-the name by which this critical section is referred to.
-END
-  }
-  summary: "Creates a handle to a CriticalSection resource."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..461b498662d4b149b49e4c1723f6b96dc274c740
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "CudnnRNN"
+  summary: "A RNN backed by cuDNN."
+  description: <<END
+Computes the RNN from the input and initial states, with respect to the params
+buffer.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+  the actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape has input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+is_training: Indicates whether this operation is used for inferenece or
+  training.
+reserve_space: An opaque tensor that can be used in backprop calculation. It
+  is only produced if is_training is false.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cd5ae637b4dbfd67bf7925fc0e58d97e1329318
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "CudnnRNNBackprop"
+  summary: "Backprop step of CudnnRNN."
+  description: <<END
+Compute the backprop of both data and weights in a RNN.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+    the actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape has input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+    pass.
+output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+    pass.
+reserve_space: The same reserve_space produced in for forward operation.
+input_backprop: The backprop to input in the forward pass. Has the same shape
+    as input.
+input_h_backprop: The backprop to input_h in the forward pass. Has the same
+    shape as input_h.
+input_c_backprop: The backprop to input_c in the forward pass. Has the same
+    shape as input_c.
+params_backprop: The backprop to the params buffer in the forward pass. Has the
+    same shape as params.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03aa9cc250d3d55dfd0595edb9f2daab6d65da6e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "CudnnRNNBackpropV2"
+  visibility: HIDDEN
+  summary: "Backprop step of CudnnRNN."
+  description: <<END
+Compute the backprop of both data and weights in a RNN. Takes an extra
+    "host_reserved" inupt than CudnnRNNBackprop, which is used to determine RNN
+    cudnnRNNAlgo_t and cudnnMathType_t.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicates whether there is a linear projection between the input and
+    the actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape has input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+    pass.
+output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+    pass.
+reserve_space: The same reserve_space produced in the forward operation.
+host_reserved: The same host_reserved produced in the forward operation.
+input_backprop: The backprop to input in the forward pass. Has the same shape
+    as input.
+input_h_backprop: The backprop to input_h in the forward pass. Has the same
+    shape as input_h.
+input_c_backprop: The backprop to input_c in the forward pass. Has the same
+    shape as input_c.
+params_backprop: The backprop to the params buffer in the forward pass. Has the
+    same shape as params.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNCanonicalToParams.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNCanonicalToParams.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..abf81a2071172c5b00fec662e1401a46fc49c450
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNCanonicalToParams.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "CudnnRNNCanonicalToParams"
+  summary: "Converts CudnnRNN params from canonical form to usable form."
+  description: <<END
+Writes a set of weights into the opaque params buffer so they can be used in
+upcoming training or inferences.
+
+Note that the params buffer may not be compatible across different GPUs. So any
+save and restoration should be converted to and from the canonical weights and
+biases.
+
+num_layers: Specifies the number of layers in the RNN model.
+num_units: Specifies the size of the hidden state.
+input_size: Specifies the size of the input state.
+weights: the canonical form of weights that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+biases: the canonical form of biases that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+num_params: number of parameter sets for all layers.
+    Each layer may contain multiple parameter sets, with each set consisting of
+    a weight matrix and a bias vector.
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+    The actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+    dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31fb85d4fb3f59ae82737128cc88d2cbdbc996ea
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsSize.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "CudnnRNNParamsSize"
+  summary: "Computes size of weights that can be used by a Cudnn RNN model."
+  description: <<END
+Return the params size that can be used by the Cudnn RNN model. Subsequent
+weight allocation and initialization should use this size.
+
+num_layers: Specifies the number of layers in the RNN model.
+num_units: Specifies the size of the hidden state.
+input_size: Specifies the size of the input state.
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+  The actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+  dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+params_size: The size of the params buffer that should be allocated and
+  initialized for this RNN model. Note that this params buffer may not be
+  compatible across GPUs. Please use CudnnRNNParamsWeights and
+  CudnnRNNParamsBiases to save and restore them in a way that is compatible
+  across different runs.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsToCanonical.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsToCanonical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47753bf8fcf9aa3b1d0938b974aa788f1e2c5df1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsToCanonical.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "CudnnRNNParamsToCanonical"
+  summary: "Retrieves CudnnRNN params in canonical form."
+  description: <<END
+Retrieves a set of weights from the opaque params buffer that can be saved and
+restored in a way compatible with future runs.
+
+Note that the params buffer may not be compatible across different GPUs. So any
+save and restoration should be converted to and from the canonical weights and
+biases.
+
+num_layers: Specifies the number of layers in the RNN model.
+num_units: Specifies the size of the hidden state.
+input_size: Specifies the size of the input state.
+num_params: number of parameter sets for all layers.
+    Each layer may contain multiple parameter sets, with each set consisting of
+    a weight matrix and a bias vector.
+weights: the canonical form of weights that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+biases: the canonical form of biases that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+    The actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+    dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8a39de68cf15223b1a4788a937ef15b27829b83
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "CudnnRNNV2"
+  visibility: HIDDEN
+  summary: "A RNN backed by cuDNN."
+  description: <<END
+Computes the RNN from the input and initial states, with respect to the params
+buffer. Produces one extra output "host_reserved" than CudnnRNN.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicates whether there is a linear projection between the input and
+  the actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape has input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+is_training: Indicates whether this operation is used for inferenece or
+  training.
+reserve_space: An opaque tensor that can be used in backprop calculation. It
+  is only produced if is_training is true.
+host_reserved: An opaque tensor that can be used in backprop calculation. It is
+  only produced if is_training is true. It is output on host memory rather than
+  device memory.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1b8a9abdd2bec0fda690f96d266569b2fb2fcab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "DatasetToTFRecord"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to write.
+END
+  }
+  in_arg {
+    name: "filename"
+    description: <<END
+A scalar string tensor representing the filename to use.
+END
+  }
+  in_arg {
+    name: "compression_type"
+    description: <<END
+A scalar string tensor containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  summary: "Writes the given dataset to the given file using the TFRecord format."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8152f53c4ded035140abd24ba006bf391641cf1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
@@ -0,0 +1,116 @@
+op {
+  graph_op_name: "DecodeProtoV2"
+  in_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  out_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+Each entry is the number of values found for the corresponding field.
+Optional fields may have 0 or 1 values.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+`values[i]` has datatype `output_types[i]`
+and shape `[batch_shape, max(sizes[...,i])]`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+List of TF types to use for the respective field in field_names.
+END
+  }
+  attr {
+    name: "descriptor_source"
+    description: <<END
+Either the special value `local://` or a path to a file containing
+a serialized `FileDescriptorSet`.
+END
+  }
+  attr {
+    name: "message_format"
+    description: <<END
+Either `binary` or `text`.
+END
+  }
+  attr {
+    name: "sanitize"
+    description: <<END
+Whether to sanitize the result or not.
+END
+  }
+  summary: <<END
+The op extracts fields from a serialized protocol buffers message into tensors.
+END
+  description: <<END
+The `decode_proto` op extracts fields from a serialized protocol buffers
+message into tensors.  The fields in `field_names` are decoded and converted
+to the corresponding `output_types` if possible.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+Each output tensor is a dense tensor. This means that it is padded to
+hold the largest number of repeated elements seen in the input
+minibatch. (The shape is also padded by one to prevent zero-sized
+dimensions). The actual repeat counts for each example in the
+minibatch can be found in the `sizes` output. In many cases the output
+of `decode_proto` is fed immediately into tf.squeeze if missing values
+are not a concern. When using tf.squeeze, always pass the squeeze
+dimension explicitly to avoid surprises.
+
+For the most part, the mapping between Proto field types and
+TensorFlow dtypes is straightforward. However, there are a few
+special cases:
+
+- A proto field that contains a submessage or group can only be converted
+to `DT_STRING` (the serialized submessage). This is to reduce the
+complexity of the API. The resulting string can be used as input
+to another instance of the decode_proto op.
+
+- TensorFlow lacks support for unsigned integers. The ops represent uint64
+types as a `DT_INT64` with the same twos-complement bit pattern
+(the obvious way). Unsigned int32 values can be represented exactly by
+specifying type `DT_INT64`, or using twos-complement if the caller
+specifies `DT_INT32` in the `output_types` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+Both binary and text proto serializations are supported, and can be
+chosen using the `format` attribute.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe0fc3823ff724641298c03f74c115dd6211f385
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "DeepCopy"
+  in_arg {
+    name: "x"
+    description: "The source tensor of type `T`."
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+    y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+      is not an alias of `x`.
+END
+  }
+  summary: "Makes a copy of `x`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..746f561e9251c34c327123efd349bfb57682d7aa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "Empty"
+  in_arg {
+    name: "shape"
+    description: "1-D. Represents the shape of the output tensor."
+  }
+  attr {
+    name: "init"
+    description:
+        "If True, initialize the returned tensor with the default value "
+        "of dtype.  Otherwise, the implementation is free not to initialize"
+        "the tensor's content."
+  }
+  out_arg {
+    name: "output"
+    description: "A `Tensor` of type `T`."
+  }
+  summary: <<END
+Creates a tensor with the given shape.
+
+This operation creates a tensor of `shape` and `dtype`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fdbe47f23615a1bbac30346d46241c4d321bc649
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
@@ -0,0 +1,81 @@
+op {
+  graph_op_name: "EncodeProto"
+  in_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+END
+  }
+  out_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "Tinput_types"
+    description: <<END
+The input types.
+END
+  }
+  summary: <<END
+The op serializes protobuf messages provided in the input tensors.
+END
+  description: <<END
+The types of the tensors in `values` must match the schema for the
+fields specified in `field_names`. All the tensors in `values` must
+have a common shape prefix, *batch_shape*.
+
+The `sizes` tensor specifies repeat counts for each field.  The repeat
+count (last dimension) of a each tensor in `values` must be greater
+than or equal to corresponding repeat count in `sizes`.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+There are a few special cases in the value mapping:
+
+Submessage and group fields must be pre-serialized as TensorFlow strings.
+
+TensorFlow lacks support for unsigned int64s, so they must be
+represented as `tf.int64` with the same twos-complement bit pattern
+(the obvious way).
+
+Unsigned int32 values can be represented exactly with `tf.int64`, or
+with sign wrapping if the input is of type `tf.int32`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExecuteInCriticalSection.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExecuteInCriticalSection.pbtxt
deleted file mode 100644
index cd5fc84a74faa209262da0402c546bcc3b4256fe..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExecuteInCriticalSection.pbtxt
+++ /dev/null
@@ -1,49 +0,0 @@
-op {
-  graph_op_name: "ExecuteInCriticalSection"
-  in_arg {
-    name: "critical_section"
-    description: <<END
-The handle of the `critical_section`.
-END
-  }
-  in_arg {
-    name: "arguments"
-    description: <<END
-Arguments for `f`, including any captured inputs appended at the end.
-END
-  }
-  out_arg {
-    name: "outputs"
-    description: <<END
-The outputs of `f`.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-The `Function` to execute.
-END
-  }
-  summary: "Executes function `f` within critical section `critical_section`."
-  description: <<END
-While `f` is running in `critical_section`, no other functions which wish to
-use this critical section may run.
-
-Often the use case is that two executions of the same graph, in parallel,
-wish to run `f`; and we wish to ensure that only one of them executes
-at a time.  This is especially important if `f` modifies one or more
-variables at a time.
-
-It is also useful if two separate functions must share a resource, but we
-wish to ensure the usage is exclusive.
-
-The signature of `f` is expected to be:
-
-```
-  outputs <- F(arguments)
-```
-Typically, but this is not required, `arguments` contain resources.  The
-primary purpose of this op is to limit access to these resources to one
-execution of `F` at a time.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
index 561c86ddf68a4fb093d263d076fb6ccc8d408733..599bbce65f44a3c6798be2d5cee3b8f6f2e2635a 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
@@ -6,7 +6,7 @@ Attributes `[min; max]` define the clamping range for the `inputs` data.
 `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 
 Quantization is called fake since the output is still in floating point.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
index 2713c01b27f6bc45eb6117047243f06873d4dd87..1976ffb8aac29f6aaac307c6f126b78f17dcadb8 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
@@ -8,7 +8,7 @@ and `max` to 'outputs' tensor of same shape as `inputs`.
 `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max`
 values.
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
index e293d4d084bc90f24ee0cc1111f750ddfa46465b..c0fac6a445895eb1ebbfd621d9afd49d99ed80f4 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
@@ -9,7 +9,7 @@ to 'outputs' tensor of same shape as `inputs`.
 `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max`
 values.
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
index 8a4ab368b5a8c4d8ac756513da14796ff3a41551..2051903f6dae6bb51490dbec137e7cd7592fb6c6 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
@@ -40,7 +40,7 @@ END
   attr {
     name: "num_bits"
     description: <<END
-The bitwidth of the quantization; between 2 and 8, inclusive.
+The bitwidth of the quantization; between 2 and 16, inclusive.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_FlushSummaryWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_FlushSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ada43c9b8b5e25b72fa6e6d7b0a313965dd9d5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FlushSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FlushSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_For.pbtxt b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7cd8e1a26e2c75c5d6aaea65699c0545c1be445
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "For"
+  in_arg { name: "start" description: "The lower bound. An int32" }
+  in_arg { name: "limit" description: "The upper bound. An int32" }
+  in_arg { name: "delta" description: "The increment. An int32" }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "A list of dtypes." }
+  attr {
+    name: "body"
+    description: <<END
+    A function that takes a list of tensors (int32, T) and returns another
+    list of tensors (T).
+END
+  }
+  summary: <<END
+  ```python
+   output = input;
+   for i in range(start, limit, delta)
+     output = body(i, output);
+  ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f1cf3e6867a06df1f39774bc389fbe35a994ab4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GeneratorDataset"
+  summary: "Creates a dataset that invokes a function to generate elements."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_If.pbtxt b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ba5a3f37e1cd55e9f13b3c8f8f11f1f20346de7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "If"
+  in_arg { name: "cond"  description: "The predicate." }
+  in_arg {
+    name: "cond"
+    description: <<END
+      A Tensor. If the tensor is a scalar of non-boolean type, the
+      scalar is converted to a boolean according to the
+      following rule: if the scalar is a numerical value, non-zero means
+      `True` and zero means False; if the scalar is a string, non-empty
+      means `True` and empty means `False`. If the tensor is not a scalar,
+      being empty means False and being non-empty means True.
+END
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "then_branch"
+    description: <<END
+      A function that takes 'inputs' and returns a list of tensors, whose
+      types are the same as what else_branch returns.
+END
+  }
+  attr {
+    name: "else_branch"
+    description: <<END
+    A function that takes 'inputs' and returns a list of tensors, whose
+    types are the same as what then_branch returns.
+END
+  }
+  summary: "output = cond ? then_branch(input) : else_branch(input)"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
index 9b00f5b19d97500f01916d7e78bdaf9c64b8b66e..56a3658fa02aada5a3a5c29d80c714fa2ba6de42 100644
--- a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
@@ -61,7 +61,7 @@ build the `tag` of the summary values:
    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 
 The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 Each element must be in the range `[0, 255]` (It represents the value of a
 pixel in the output image).  Non-finite values in the input tensor are
 replaced by this tensor in the output image.  The default value is the color
diff --git a/tensorflow/core/api_def/base_api/api_def_ImportEvent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImportEvent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8813b58f3e53e5916edcabafc1fd28388fea8d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ImportEvent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ImportEvent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3654286cc350995f8bed497cd662fce3b4150872
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceAdd"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Adds v into specified rows of x.
+
+    Computes y = x; y[i, :] += v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9480b4a3837893395168785c5b5b9ba74b643d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceSub"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Subtracts `v` into specified rows of `x`.
+
+    Computes y = x; y[i, :] -= v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fcd3659dc771077d34d2ba833a40e7d6be68f53
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  in_arg {
+    name: "x"
+    description: "A tensor of type `T`."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Updates specified rows with values in `v`.
+
+    Computes `x[i, :] = v; return x`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d54b7ef32a3237607c6d31934aa43f11859a248b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "IsBoostedTreesEnsembleInitialized"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble resouce.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+output boolean on whether it is initialized or not.
+END
+  }
+  summary: "Checks whether a tree ensemble has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
deleted file mode 100644
index c6f2212cd4fa5fe81ecc97c33ebe17d18ac7c616..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IteratorSetStatsAggregator"
-  summary: "Associates the given iterator with the given statistics aggregator."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
index 51d91399f8a53325b03e67e643c1375c2bd7cf22..e667c328ae58092c7059d1466f90a5c0935f3c89 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
@@ -49,14 +49,14 @@ in the batch:
 If `fast` is `True`, then the solution is computed by solving the normal
 equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
 \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-\lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\). 
+If \\(m \lt n\\) then `output` is computed as
 \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
 minimum-norm solution to the under-determined linear system, i.e.
 \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
 subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
 when \\(A\\) is numerically full rank and has a condition number
-\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
 sufficiently large.
 
 If `fast` is `False` an algorithm based on the numerically robust complete
diff --git a/tensorflow/core/api_def/base_api/api_def_MutexLock.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd3eb434868a43602158fa263c9215eee7d25124
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutexLock.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "MutexLock"
+  in_arg {
+    name: "mutex"
+    description: <<END
+The mutex resource to lock.
+END
+  }
+  out_arg {
+    name: "mutex_lock"
+    description: <<END
+A tensor that keeps a shared pointer to a lock on the mutex;
+when the Tensor is destroyed, the use count on the shared pointer is decreased
+by 1.  When it reaches 0, the lock is released.
+END
+  }
+  summary: "Locks a mutex resource.  The output is the lock.  So long as the lock tensor"
+  description: <<END
+is alive, any other request to use `MutexLock` with this mutex will wait.
+
+This is particularly useful for creating a critical section when used in
+conjunction with `MutexLockIdentity`:
+
+```python
+
+mutex = mutex_v2(
+  shared_name=handle_name, container=container, name=name)
+
+def execute_in_critical_section(fn, *args, **kwargs):
+  lock = gen_resource_variable_ops.mutex_lock(mutex)
+
+  with ops.control_dependencies([lock]):
+    r = fn(*args, **kwargs)
+
+  with ops.control_dependencies(nest.flatten(r)):
+    with ops.colocate_with(mutex):
+      ensure_lock_exists = mutex_lock_identity(lock)
+
+    # Make sure that if any element of r is accessed, all of
+    # them are executed together.
+    r = nest.map_structure(tf.identity, r)
+
+  with ops.control_dependencies([ensure_lock_exists]):
+    return nest.map_structure(tf.identity, r)
+```
+
+While `fn` is running in the critical section, no other functions which wish to
+use this critical section may run.
+
+Often the use case is that two executions of the same graph, in parallel,
+wish to run `fn`; and we wish to ensure that only one of them executes
+at a time.  This is especially important if `fn` modifies one or more
+variables at a time.
+
+It is also useful if two separate functions must share a resource, but we
+wish to ensure the usage is exclusive.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutexV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutexV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22295ecbd847c7b918ecb337a75492c1164a1e5d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutexV2.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "MutexV2"
+  out_arg {
+    name: "resource"
+    description: <<END
+The mutex resource.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this variable is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this variable is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "Creates a Mutex resource that can be locked by `MutexLock`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
index e45e2375eb9eb732712ab3bde5b33ac6de884e09..ee4aad78993e41f83c4f72dd017d861de06f13f0 100644
--- a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
@@ -24,5 +24,6 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
                       [0, 0, 2, 2, 0, 0]
                       [0, 0, 0, 0, 0, 0]]
 ```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..caf8172a529b42b5b2aec972d349d31141201902
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "PartitionedCall"
+  in_arg {
+    name: "args"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "f"
+    description: <<END
+      A function that takes 'args', a list of tensors, and returns 'output',
+      another list of tensors. Input and output types are specified by 'Tin'
+      and 'Tout'. The function body of f will be placed and partitioned across
+      devices, setting this op apart from the regular Call op.
+END
+  }
+  summary: "returns `f(inputs)`, where `f`'s body is placed and partitioned."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
index b9e75caf02b3b557e632a8137cf4780178b8ad8a..37ac10dddb7fc583d270fd6a75af6ab82f9206a5 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
@@ -44,6 +44,7 @@ In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
 if T == qint8, out[i] -= (range(T) + 1) / 2.0
 ```
+
 here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 
 *MIN_COMBINED Mode Example*
@@ -87,6 +88,7 @@ choosing to elide the lowest possible value for symmetry (e.g., output range is
 
 We first find the range of values in our tensor. The
 range we use is always centered on 0, so we find m such that
+
 ```c++
   m = max(abs(input_min), abs(input_max))
 ```
@@ -95,6 +97,7 @@ Our input tensor range is then `[-m, m]`.
 
 Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
 If T is signed, this is
+
 ```
   num_bits = sizeof(T) * 8
   [min_fixed, max_fixed] =
@@ -102,16 +105,19 @@ If T is signed, this is
 ```
 
 Otherwise, if T is unsigned, the fixed-point range is
+
 ```
   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 ```
 
 From this we compute our scaling factor, s:
+
 ```c++
   s = (max_fixed - min_fixed) / (2 * m)
 ```
 
 Now we can quantize the elements of our tensor:
+
 ```c++
 result = round(input * s)
 ```
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70ad5219267fcc84368f072a6f5a122b6cc11a89
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "RegexReplace"
+  in_arg {
+    name: "input"
+    description: "The text to be processed."
+  }
+  in_arg {
+    name: "pattern"
+    description: "The regular expression to match the input."
+  }
+  in_arg {
+    name: "rewrite"
+    description: "The rewrite to be applied to the matched expresion."
+  }
+  out_arg {
+    name: "output"
+    description: "The text after applying pattern and rewrite."
+  }
+  attr {
+    name: "replace_global"
+    description: "If True, the replacement is global, otherwise the replacement\nis done only on the first match."
+  }
+  summary: "Replaces the match of pattern in input with rewrite."
+  description: "It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3f2188ba50723cafab560f417da3ebf793a8661
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AdaMax algorithm."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+v_t <- max(beta2 * v_{t-1}, abs(g))
+variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index bea1fd67627cc69354f81da7cf36d20babb9f38d..ad0aeac00426b5f7af4e9e51239c3717ee76756b 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,9 +76,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
index 9e0de08267288e32e34cef323761cf4566fce128..4eb6eb4e4da44115bc1889241881b39e8e066fde 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
@@ -34,7 +34,7 @@ This operation computes
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions add.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47148f7b03e1e2029e3151133165c694ce8e7110
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterDiv.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterDiv"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Divides sparse updates into the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] /= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] /= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions multiply.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71f06d9a4349ecc36f1d4d276caee5f167d8c999
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMax.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterMax"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Reduces sparse updates into the variable referenced by `resource` using the `max` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = max(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions are combined.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08e40ee2a8039c3afbf98b534c7fd6f10faabeff
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMin.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterMin"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Reduces sparse updates into the variable referenced by `resource` using the `min` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = min(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions are combined.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c63549d81009f3b6d54795325196ce87c396cf4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMul.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterMul"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Multiplies sparse updates into the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] *= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] *= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions multiply.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71e60cbee5c69a44852ddbf835072fbdbd623eb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterSub.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterSub"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Subtracts sparse updates from the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] -= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] -= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions add.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..344ef191fd580657acd5ebf75c3b5969f1af1fd2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
@@ -0,0 +1,108 @@
+op {
+  graph_op_name: "Rpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the RPC method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+If the connection fails or the remote worker returns an error
+status, the op reraises this exception locally.
+
+See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
index 4b5201f025b438a1e6bba41035004b82ab876de7..9da9d09ea693036ea21b5e89b0a9a4d59f67b834 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
@@ -51,7 +51,7 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions add.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
index 771cf0b591367e18f007e91bf66bc1cfd02ab459..8e99718c7e3751c1bf4ef4d03e558be3c0ada51e 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
@@ -53,6 +53,6 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions divide.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b52dad4a163643af659320f324ce6558fcffcd8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMax.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ScatterMax"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to reduce into `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the update will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Reduces sparse updates into a variable reference using the `max` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = max(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions combine.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..721ac0ff35f934583e227317515b0ba3298de747
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMin.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ScatterMin"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to reduce into `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the update will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Reduces sparse updates into a variable reference using the `min` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = min(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions combine.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
index a51f571b00d7fc68a24dbfc4a0104522f8c0f559..b9e293ba9efba10de9ccd774111899adf4342c90 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
@@ -53,6 +53,6 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions multiply.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4cb8c064fce615ace8e971505518e85f303d4c12..58753a651a17d163da5e0e5affb4025ce9530bbd 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,12 +25,12 @@ A new tensor with the given shape and updates applied according
 to the indices.
 END
   }
-  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  summary: "Scatter `updates` into a new tensor according to `indices`."
   description: <<END
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
+Creates a new tensor by applying sparse `updates` to individual values or
+slices within a tensor (initially zero for numeric, empty for string) of
+the given `shape` according to indices.  This operator is the inverse of the
+@{tf.gather_nd} operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
index c0d3a4a1337ee1e1a32114adc51c930e014bc268..d12b3e68c25c22825349bf7affbb09de8fdf98ac 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
@@ -51,7 +51,7 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their (negated) contributions add.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
index c44dbbd2332828242792d9cdd4a218e7457c7d2b..4804908afc61356db76391a4d425b0857c52412d 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
@@ -54,7 +54,7 @@ If values in `ref` is to be updated more than once, because there are
 duplicate entries in `indices`, the order at which the updates happen
 for each value is undefined.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt b/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt
index b0b58ac00e6709922ed517ad2c9efebbedf450a3..9da0e124ebe02f1cfb6450b96471d7d9d146bd20 100644
--- a/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt
@@ -97,8 +97,11 @@ END
   }
   attr {
     name: "adaptative"
+    default_value {
+      b: True
+    }
     description: <<END
-Whether to use Adapative SDCA for the inner loop.
+Whether to use Adaptive SDCA for the inner loop.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
index 51d63eeb5695d6a428e990ba43e54102db58b58e..7be9a958ab55d27b4b9fe3dd023e44ae828e042c 100644
--- a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
@@ -19,6 +19,7 @@ form square matrices, with the same constraints as the single matrix
 SelfAdjointEig.
 
 The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+are sorted in non-decreasing order.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
index 4a5e1252586ea8b3e03b2545e0d8646288ddc408..fae9e84fc85be06184b19308d87c90632347e2f6 100644
--- a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
@@ -31,7 +31,8 @@ END
   summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
   description: <<END
 Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+are sorted in non-decreasing order.
 
 ```python
 # a is a tensor.
diff --git a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77123e143b200fc079879bc0e891a771a7cb67e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9fabe7863e4bf89a09a9bfcc9ce6c0a00d8cc6db
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "SlideDataset"
+  in_arg {
+    name: "window_size"
+    description: <<END
+A scalar representing the number of elements in the
+sliding window.
+END
+  }
+  in_arg {
+    name: "stride"
+    description: <<END
+A scalar representing the steps moving the sliding window
+forward in one iteration. It must be in `[1, window_size)`.
+END
+  }
+  summary: "Creates a dataset that passes a sliding window over `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4e6c1fddd6202be4380fa7994ac1e4e60338217
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "StatelessMultinomial"
+  in_arg {
+    name: "logits"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+represents the unnormalized log probabilities for all classes.
+END
+  }
+  in_arg {
+    name: "num_samples"
+    description: <<END
+0-D.  Number of independent samples to draw for each row slice.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+contains the drawn class labels with range `[0, num_classes)`.
+END
+  }
+  summary: "Draws samples from a multinomial distribution."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12fbdfdf3fb656a49f28e6a1c035c3dc56ca373e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "StringStrip"
+  in_arg {
+    name: "input"
+    description: <<END
+A string `Tensor` of any shape.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A string `Tensor` of the same shape as the input.
+END
+  }
+  summary: "Strip leading and trailing whitespaces from the Tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SummaryWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_SummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fe57ecf195c85217bd174dbc503b28f26adade9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3fa6265e1082369a9c42c3286b44da800496de6b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListConcatLists"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c47208fa0525ccc7f91711bade66b0c86b914a7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGetItem"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f33d4926018f6ebad79c7e2e69fca9a1966eb5f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBackBatch"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..002e2a9bd37c2e6a2b41ba43237278bc42119bf7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListSetItem"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Timestamp.pbtxt b/tensorflow/core/api_def/base_api/api_def_Timestamp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf2d07bcf58a51ae5dc7f52006f4a911430bf421
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Timestamp.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "Timestamp"
+  summary: "Provides the time since epoch in seconds."
+  description: <<END
+Returns the timestamp as a `float64` for seconds since the Unix epoch.
+
+Note: the timestamp is computed when the op is executed, not when it is added
+to the graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bded00e83c7de09da8eb06d353925a83bb4e7134
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
@@ -0,0 +1,123 @@
+op {
+  graph_op_name: "TryRpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  out_arg {
+    name: "status_code"
+    description: <<END
+Same shape as `request`.  Values correspond to tensorflow Status enum codes.
+END
+  }
+  out_arg {
+    name: "status_message"
+    description: <<END
+Same shape as `request`.  Values correspond to Status messages
+returned from the RPC calls.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+Unlike the standard `Rpc` op, if the connection fails or the remote worker
+returns an error status, this op does **not** reraise the exception.
+Instead, the `status_code` and `status_message` entry for the corresponding RPC
+call is set with the error returned from the RPC call.  The `response` tensor
+will contain valid response values for those minibatch entries whose RPCs did
+not fail; the rest of the entries will have empty strings.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..324fadac0af5088e86e61beaaa27f2111cfd4b82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  summary: "A dataset that splits the elements of its input into multiple elements."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueDataset.pbtxt
deleted file mode 100644
index 009256916908c412fdebd0775387a7f7f4d30a25..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_UniqueDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "UniqueDataset"
-  summary: "Creates a dataset that contains the unique elements of `input_dataset`."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e21f56ba5b926826a79f10d154bb7b0c2253af2f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,85 @@
+op {
+  graph_op_name: "UniqueWithCountsV2"
+  in_arg {
+    name: "x"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+A `Tensor` of type `int32` (default: None). The axis of the Tensor to
+find the unique elements.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+END
+  }
+  out_arg {
+    name: "idx"
+    description: <<END
+A 1-D Tensor. Has the same type as x that contains the index of each
+value of x in the output y.
+END
+  }
+  out_arg {
+    name: "count"
+    description: <<END
+A 1-D Tensor. The count of each value of x in the output y.
+END
+  }
+  summary: "Finds unique elements along an axis of a tensor."
+  description: <<END
+This operation either returns a tensor `y` containing unique elements
+along the `axis` of a tensor. The returned unique elements is sorted
+in the same order as they occur along `axis` in `x`.
+This operation also returns a tensor `idx` and a tensor `count`
+that are the same size as the number of the elements in `x` along the
+`axis` dimension. The `idx` contains the index in the unique output `y`
+and the `count` contains the count in the unique output `y`.
+In other words, for an `1-D` tensor `x` with `axis = None:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx, count = unique_with_counts(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+count ==> [2, 1, 3, 1, 2]
+```
+
+For an `2-D` tensor `x` with `axis = 0`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+#                [1, 0, 0],
+#                [2, 0, 0]]
+y, idx, count = unique_with_counts(x, axis=0)
+y ==> [[1, 0, 0],
+       [2, 0, 0]]
+idx ==> [0, 0, 1]
+count ==> [2, 1]
+```
+
+For an `2-D` tensor `x` with `axis = 1`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+#                [1, 0, 0],
+#                [2, 0, 0]]
+y, idx, count = unique_with_counts(x, axis=1)
+y ==> [[1, 0],
+       [1, 0],
+       [2, 0]]
+idx ==> [0, 1, 1]
+count ==> [1, 2]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_While.pbtxt b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95a19c6dff99a51aa3228923b6408b3d7a995835
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "While"
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "dtype in use." }
+  attr {
+    name: "cond"
+    description: <<END
+      A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+END
+  }
+  attr {
+    name: "body"
+    description: <<END
+      A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+END
+  }
+  summary: "output = input; While (Cond(output)) { output = Body(output) }"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteAudioSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteAudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..520952cd4117867f61cd3c536b8a7cc5beeeab62
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteAudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteAudioSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteGraphSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteGraphSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3653477b2067875ee772dedc5015bc550de1ec12
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteGraphSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteGraphSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteHistogramSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteHistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26e1482630596d9ecb80917ff91adb2bd1131692
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteHistogramSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteHistogramSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78db8700f0c7231f24fd1db3a0eedbcc4f43deeb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteImageSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteImageSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteScalarSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bae8638d258b6fdf217fbdbd8705369f57e0bb3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteScalarSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteScalarSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db86883e21ea47a9e74788f8d76a166a235674da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Abort.pbtxt b/tensorflow/core/api_def/python_api/api_def_Abort.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3f95aaf12c65383b1425fd4063a79afff63480a6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Abort.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Abort"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_AccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e76d6dadcde5083dba8e2ef78740256fd45dc63
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AccumulatorApplyGradient.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AccumulatorApplyGradient"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AccumulatorNumAccumulated.pbtxt b/tensorflow/core/api_def/python_api/api_def_AccumulatorNumAccumulated.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbe971ab2e221bd01e991f9c80e1d527736e59bf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AccumulatorNumAccumulated.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AccumulatorNumAccumulated"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AccumulatorSetGlobalStep.pbtxt b/tensorflow/core/api_def/python_api/api_def_AccumulatorSetGlobalStep.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0047b25af6a52d02b5b4f1e87fa16fce56a90a29
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AccumulatorSetGlobalStep.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AccumulatorSetGlobalStep"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_AccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..860fbe124506c7db95e3f1603b9f3878a2d4b84b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AccumulatorTakeGradient.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AccumulatorTakeGradient"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AdjustContrast.pbtxt b/tensorflow/core/api_def/python_api/api_def_AdjustContrast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0311ad92b7e40c67969bf14193a8b2f98659558a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AdjustContrast.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustContrast"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AdjustHue.pbtxt b/tensorflow/core/api_def/python_api/api_def_AdjustHue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4411677118b002bd751a492aadf30b6fb0f4ac8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AdjustHue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustHue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AdjustSaturation.pbtxt b/tensorflow/core/api_def/python_api/api_def_AdjustSaturation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..893219e17a70c5d4fdd24b46986b6ed33303448c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AdjustSaturation.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustSaturation"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Angle.pbtxt b/tensorflow/core/api_def/python_api/api_def_Angle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..771e861fd17171ae886fabcf47218923ab5451b6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Angle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Angle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e49a355b81ca543418d813e9cdedceda7efb85a9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdadelta.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8776b19f1a28dca7f9f067154d51cdb02599d79
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdadelta.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdadelta"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e659c1bb30ab2f80a9bb010c55a4426b12f9d5b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdagradDA.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d647c5eb0a23346f25407630d24d25321e3282a3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdagradDA.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdagradDA"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66d9095c8fd0eb729f3b3c3ca5938ebe045723d7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdam.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdam"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAddSign.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b7fe1aa6542e874c071bb3d069d50a37a4779754
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAddSign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAddSign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56003c5e6fda60fb43c4a5784e2ccdf564ff1ed8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyCenteredRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyFtrl.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..680b3ef480f54da4e13331cc47589810ccecb54e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyFtrl"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ab3bb6efd2bd483a5223acbb9a16b0b9ab3d001
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyFtrlV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyFtrlV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyGradientDescent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..467bf7db558000d7a15bf83b33011690a34a107a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyGradientDescent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyGradientDescent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c3f0fef95f55c18170343d6f1cc081612dd68ee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyPowerSign.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f376b1dc6e531113d580406df26a5b30210b07ad
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyPowerSign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyPowerSign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c6e2a4bb1ede274433bd297c7f39b2c58186923
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyProximalAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyProximalAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90c1655fe913504f55b910d826e834e70a7c4acc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyProximalGradientDescent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18cce1915a5eb25f68a227968c0819977d02467f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApproximateEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApproximateEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..707f6716f9604994f5d92651ecacc7608f10742d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApproximateEqual.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApproximateEqual"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ArgMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ArgMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c23a432f2b747a8a406c6152d36fc0ba5f1118f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ArgMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ArgMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ArgMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_ArgMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daa14f638663ef42fc50667e1bb1d5236e3d3361
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ArgMin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ArgMin"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Assign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Assign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34062ede91a836fa88f4680cdea91a5e23917158
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Assign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Assign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AssignAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_AssignAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4553c6c6e72f50d047310c7df723d7d6e4b8491c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AssignAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AssignAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_AssignAddVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e30ec092e68bf9204b34e6d06b3b4c1cfbeab02b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AssignAddVariableOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AssignAddVariableOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AssignSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_AssignSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aec68d5c21e598a4dd28673bda698e71bd808f51
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AssignSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AssignSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_AssignSubVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81290a56ec1cd89b15a875c24ec31a20faa11bb8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AssignSubVariableOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AssignSubVariableOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AssignVariableOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_AssignVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ffa4a11c41f2cf38b9251a0fd030ea0fc511058
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AssignVariableOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AssignVariableOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc16523a1567e8d7f2d0146c1c44d9ef11b6c6d5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool3D"
+  endpoint {
+    name: "nn.avg_pool3d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4289c1daf96583943b8dfad84aeca3351657bee4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixBandPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a699e20506e47177722a57249f53cb1b80cf1b2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixBandPart.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixBandPart"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixDiag.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40be51ecccd3d5f6ceb3a1dc245e925e666d8ac5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixDiag.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixDiag"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixDiagPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ef78fa5ece9a6ff1147b70d4660769954e67301
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixDiagPart.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixDiagPart"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixSetDiag.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..644c1270a2385871d9dc4f429e39de4c0382c27a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixSetDiag.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixSetDiag"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BiasAddGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_BiasAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9226c6791c82fcddb1ed0d54db155d20ff44d18e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BiasAddGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BiasAddGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Bincount.pbtxt b/tensorflow/core/api_def/python_api/api_def_Bincount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..551b51db26251cbf15b38ac3e48b5024fae0ec72
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Bincount.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Bincount"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..083eeced81dfc04d01c8721e3fb65727ef13176a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BroadcastTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fcf541f9036baaef1590f06da0d7471b0558b4c7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BytesProducedStatsDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bbb4ff9e3b08d0dd11c7444e5d00feb514e81c0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CacheDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Cast.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..428aa62c462a2361c519243a8b8a6bdb4f42cb9d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Cast.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Cast"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CholeskyGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_CholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3538afb2a7108763b1986615f8c738e0c3113c96
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CholeskyGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CholeskyGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cacdd5c2ca0838701aff1c085f06d81319612832
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ClipByValue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CloseSummaryWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_CloseSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6fd7d93169306fdf5ca62d27635e1f86f37bc4d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CloseSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CloseSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CompareAndBitpack.pbtxt b/tensorflow/core/api_def/python_api/api_def_CompareAndBitpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493a7e48665954b647e99a5fe9d06a7a42755494
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CompareAndBitpack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CompareAndBitpack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c005a4da0f866c1d1106effabbaa22f1abecf422
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConcatenateDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConditionalAccumulator.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4663e8eb3a20b6d809242ec9a44376247b8b3ca
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ConditionalAccumulator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConditionalAccumulator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConsumeMutexLock.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConsumeMutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9559947490e6fbcb88ab8a359e8416fd11a8b165
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ConsumeMutexLock.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConsumeMutexLock"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ControlTrigger.pbtxt b/tensorflow/core/api_def/python_api/api_def_ControlTrigger.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33941493af7bf038503dad2379821d0c36929cf8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ControlTrigger.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ControlTrigger"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ae75d6da222d84245bb2a912942522eb52047bc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2D"
+  endpoint {
+    name: "nn.conv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f21d8c8802f9a18c9357dbe68d3c65407bff923
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2DBackpropFilter"
+  endpoint {
+    name: "nn.conv2d_backprop_filter"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea976799cbc73bc9164a15e781a051f03e14275b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2DBackpropInput"
+  endpoint {
+    name: "nn.conv2d_backprop_input"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba8d178263c94574c0aaac8f1f24fb1424a50275
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3D"
+  endpoint {
+    name: "nn.conv3d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..634545f427c906edd94297f9e5291be4021462ad
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropFilter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1da8ee3a25f36a0b44f6458a351854190fe7830f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3DBackpropFilterV2"
+  endpoint {
+    name: "nn.conv3d_backprop_filter_v2"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2b0a0d19f4a31b05771133c52446078a7e938c8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropInput"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropInputV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropInputV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e5c4f74fe90c148e11be98a2e343a41511d1d1d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropInputV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropInputV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CountUpTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_CountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f41be2f540d241776bee3fcb1bba496d4baebeab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CountUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CountUpTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CreateSummaryDbWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_CreateSummaryDbWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28da46a0f8e452f65d06a13c4b0d0b03b2a75757
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CreateSummaryDbWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryDbWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CreateSummaryFileWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_CreateSummaryFileWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2c4d37e5001681ffa733bf4726c6bea652029
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CreateSummaryFileWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryFileWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CropAndResizeGradBoxes.pbtxt b/tensorflow/core/api_def/python_api/api_def_CropAndResizeGradBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac4449419314a1fe09e9a2b17e815a741b960b1d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CropAndResizeGradBoxes.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CropAndResizeGradBoxes"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CropAndResizeGradImage.pbtxt b/tensorflow/core/api_def/python_api/api_def_CropAndResizeGradImage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eecd0536f29bc189705c7a7311a79eb5ffff02dc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CropAndResizeGradImage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CropAndResizeGradImage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b13586b63ba3418c452e44b0f007c42885498f9f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNN"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNBackprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81c4efc60b7f6338a0197e5898c6e7eddd5069bf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNBackprop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNBackprop"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNCanonicalToParams.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNCanonicalToParams.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..164a306034af2b85fb803b431367e337bc65b34f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNCanonicalToParams.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNCanonicalToParams"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00f97f05b11d3bdb049c55beba2fe9ce18e14ff0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNParamsSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsToCanonical.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsToCanonical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..841bc0cf55e0800e3350f0eb68d37f42c788d79e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsToCanonical.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNParamsToCanonical"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Cumprod.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cumprod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f5e2f061bd281d22e91b0922899eda3a641d68f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Cumprod.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Cumprod"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Cumsum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cumsum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..715f26fcac2bb03b729d58f5c5f7cfe6802660fd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Cumsum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Cumsum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DataFormatDimMap.pbtxt b/tensorflow/core/api_def/python_api/api_def_DataFormatDimMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82a39cfc5981f14edfe39cee363abf169f89245e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DataFormatDimMap.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DataFormatDimMap"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/python_api/api_def_DataFormatVecPermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ec292df8f670cfbae6488545979354d751e5d41
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DataFormatVecPermute.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DataFormatVecPermute"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e3d34cc15be752b466aa03f6805cd687698f74fa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DatasetToSingleElement"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2d5ed2b432d8ac5e60414409311308dcce7a486d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeepCopy"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DenseToDenseSetOperation.pbtxt b/tensorflow/core/api_def/python_api/api_def_DenseToDenseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c47ec09c5ee16d37ac57c211c2409cd8f8c6970
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DenseToDenseSetOperation.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DenseToDenseSetOperation"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a8e068afb744ce8b472111d19cf743d39ac44ef
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DenseToSparseBatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DenseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/python_api/api_def_DenseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a30757df4d0326159c180c4be14309f9150fff00
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DenseToSparseSetOperation.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DenseToSparseSetOperation"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthToSpace.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd0766b36556d86b2fc99f8b2ac19480832546e1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DepthToSpace.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DepthToSpace"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeserializeIterator.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeserializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..170d37be4e268c2829a5fa01fcaa48be082c2e0e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeserializeIterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeserializeIterator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DestroyResourceOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_DestroyResourceOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9dde0080a8c3533a1b1837ddd4aaeb05e7a180e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DestroyResourceOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DestroyResourceOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d73ecf1bb06895017b2d2ac2a16c702681eb217
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2D"
+  endpoint {
+    name: "nn.dilation2d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2DBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..feb9f083db691c55832997509ff6455a6584f486
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2DBackpropFilter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Dilation2DBackpropFilter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2DBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a6b09f5cc653ba456bfb9fb66757c48963503e4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2DBackpropInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Dilation2DBackpropInput"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Div.pbtxt b/tensorflow/core/api_def/python_api/api_def_Div.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e5537c8bfea638b585c04c514264d930054fde5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Div.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Div"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b863520e987b69df680c84efcbdfca44518c6e8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Empty"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..051cf14c0ec2b32779be8b9c297b93abd1bc1318
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EnqueueInQueueDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..391167254edb69725c778e6319bf8a9f6038589f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Erf"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ed1341dfe2d0c4f57e0fa3c2d14378bce452be3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "FFT2D"
+  endpoint {
+    name: "spectral.fft2d"
+  }
+  endpoint {
+    name: "fft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a4e1d6adf9b9c2bf68c6375de6aebfdfcf5bfb3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "FFT3D"
+  endpoint {
+    name: "spectral.fft3d"
+  }
+  endpoint {
+    name: "fft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f91b842181c769d0a2f921f1d7566c4d8522541
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FilterDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d0703471d38c94a8c37da6f0a65ebd165c23a820
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9de61ac263cd82a0893aa2e27b9d7532490ca441
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FlatMapDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FlushSummaryWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_FlushSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ada43c9b8b5e25b72fa6e6d7b0a313965dd9d5a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FlushSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FlushSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_For.pbtxt b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58ddf56fe1a8516c57ca203f14ea76414ab55f5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "For" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_FusedBatchNormGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_FusedBatchNormGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56409f32d8d58b923980f78b3662f196e7954e14
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FusedBatchNormGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNormGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FusedBatchNormGradV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FusedBatchNormGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5a4200b76c884c0f24335df1716f85b0666b589
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FusedBatchNormGradV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNormGradV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FusedPadConv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FusedPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03b5fdd5a11844af209c86d9ef8e362c4d286ea6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FusedPadConv2D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedPadConv2D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FusedResizeAndPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52165d9b4d991d1636cdc08d5cb2f9efe2f7754f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedResizeAndPadConv2D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Gather.pbtxt b/tensorflow/core/api_def/python_api/api_def_Gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f956930e0f5bc9a9160974ee4c4a177102942fa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Gather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Gather"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_GatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..029bc59b51cb5463579dbf867e3a1927cb3577f7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GatherV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GatherV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9dcfa0f7d210012aa5c2d43349239a953ea3739e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GeneratorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d40208e613e6b7ee1522c2990afea1345cc5de1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GroupByWindowDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6b36a314b8d8a197651ee3c68b1376a9bbed669
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "IFFT2D"
+  endpoint {
+    name: "spectral.ifft2d"
+  }
+  endpoint {
+    name: "ifft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6def5b36da17766c5342703fcefe2b377028f330
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "IFFT3D"
+  endpoint {
+    name: "spectral.ifft3d"
+  }
+  endpoint {
+    name: "ifft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IRFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_IRFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8fa74a4317fe635cb10ca226f5516834370275c2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IRFFT.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IRFFT"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IRFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IRFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2021cad63911d2bafb159e1a1f2f11ed2a1d372e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IRFFT2D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IRFFT2D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IRFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IRFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d1eab6003ece3b1eed22200743a28de185d1299
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IRFFT3D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IRFFT3D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Identity.pbtxt b/tensorflow/core/api_def/python_api/api_def_Identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00f2afde271ebade0ac7d1ae75dc9dff6f692ab5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Identity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Identity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_If.pbtxt b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a44db5da081692ee26a7931850236d31d2231627
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "If" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_Imag.pbtxt b/tensorflow/core/api_def/python_api/api_def_Imag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5632fd4365f718ccf079e1c75b962b011c0253f6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Imag.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Imag"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ImmutableConst.pbtxt b/tensorflow/core/api_def/python_api/api_def_ImmutableConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..997013914b89fb489eaa3c8f96f001b093aa23e0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ImmutableConst.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ImmutableConst"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ImportEvent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ImportEvent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8813b58f3e53e5916edcabafc1fd28388fea8d8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ImportEvent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ImportEvent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..390e3bbf97340472608414af23ad5e6d8ee300ae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af9634f9b2b0cfe4e050005b8b05ca127d0523d9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fa9d778ea6c937e7b8502b6db32d15bfa2ca90d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef1b06b19cc6a0c62f6e9f451aceed8aeabed553
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InterleaveDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Inv.pbtxt b/tensorflow/core/api_def/python_api/api_def_Inv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ed58a276f69e46bbf3d14fbd4b921ad2f0d7a2df
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Inv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Inv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IsVariableInitialized.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsVariableInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7b0789090c96fa2db968edbc885258aecf34d7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IsVariableInitialized.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IsVariableInitialized"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Iterator.pbtxt b/tensorflow/core/api_def/python_api/api_def_Iterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a021db1534834a5c248e750b9e56e334a20d3949
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Iterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Iterator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IteratorFromStringHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_IteratorFromStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9efe2d1446330aa78405329b017ce0c81d3a20c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IteratorFromStringHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorFromStringHandle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IteratorGetNext.pbtxt b/tensorflow/core/api_def/python_api/api_def_IteratorGetNext.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7066484ceaa2c0dce7a9ccba8c71838e79e85c0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IteratorGetNext.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorGetNext"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IteratorGetNextSync.pbtxt b/tensorflow/core/api_def/python_api/api_def_IteratorGetNextSync.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d94edbc71de2295e4a83bda4a0616cbb6c3ebe41
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IteratorGetNextSync.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorGetNextSync"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IteratorToStringHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_IteratorToStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a4251f76bd078903cbdf4b2d8419815dab2742e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IteratorToStringHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorToStringHandle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94bf6106ad8459767d31a345a17483b255dfc02b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LatencyStatsDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LoopCond.pbtxt b/tensorflow/core/api_def/python_api/api_def_LoopCond.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4cfa295b2a33446e3646fc1d000ecefd78d64291
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LoopCond.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LoopCond"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MakeIterator.pbtxt b/tensorflow/core/api_def/python_api/api_def_MakeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..acc3342c9ba98d1e5022d99a17fe51c9f4af0ce6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MakeIterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MakeIterator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cffd2910fb404bc7f75e55e42b9ebba1635db134
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapAndBatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapClear.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67c1c3e2dd3191d9e37ea40e6d8cf00e5f888550
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapClear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapClear"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b1d2f2c730ff8b8b928fcd97c4fe3bdc704e470
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapIncompleteSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db7921e13b97eebf09260f985b311175bf5b67a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapIncompleteSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapIncompleteSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapPeek.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85fab1722948b9b5e0d2e74794bd98b9dd7de37e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapPeek.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapPeek"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b6ed1a0cf460ad9050af7b3bea7a2ef9bd5c1e4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapStage.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ae70d5d5791ec58642fff759c53d56338670540
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapStage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapStage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapUnstage.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5f92e37db41b2528961f1dde322e3a1938539b3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapUnstage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapUnstage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapUnstageNoKey.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c2a25db2139db94c3a541188fa17021a2492738
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MapUnstageNoKey.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapUnstageNoKey"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8576c9ff2e0729235d9bca70c369536dacaa08e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3D"
+  endpoint {
+    name: "nn.max_pool3d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGradV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..534cc90e41ea33fda876a907bc1dfe7eae1bcc15
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGradV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGradGradV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolGradV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e79f839686a425a5648f569d05a2cce60d46edcb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGradV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MergeV2Checkpoints.pbtxt b/tensorflow/core/api_def/python_api/api_def_MergeV2Checkpoints.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca9f74e0c19081446fdaa2d13413d2817e00f402
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MergeV2Checkpoints.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MergeV2Checkpoints"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Mod.pbtxt b/tensorflow/core/api_def/python_api/api_def_Mod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48d828ca72a5abfebf1815980e82e1a3f471c175
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Mod.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Mod"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Multinomial.pbtxt b/tensorflow/core/api_def/python_api/api_def_Multinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b654335806407602938e43850d2165e3c952032
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Multinomial.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Multinomial"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutexLock.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74e6e1035771484adbfdabd1720c260a39e5f519
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutexLock.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutexLock"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutexV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutexV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..013f42d8550cac92aad2539f766deb3e97abaeaf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutexV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutexV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NextIteration.pbtxt b/tensorflow/core/api_def/python_api/api_def_NextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28ac301e4169fa4302124a3e554cae6f8f1e13db
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NextIteration.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NextIteration"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NthElement.pbtxt b/tensorflow/core/api_def/python_api/api_def_NthElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec838585103345748ef5332b032af7a522393fb0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NthElement.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NthElement"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OneShotIterator.pbtxt b/tensorflow/core/api_def/python_api/api_def_OneShotIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee9d777b4e4c7104ea919bfe3fa6e48aa0928b20
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OneShotIterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OneShotIterator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OnesLike.pbtxt b/tensorflow/core/api_def/python_api/api_def_OnesLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c058e5b1ab19790b8aa4049412f937282bd14abb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OnesLike.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OnesLike"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapClear.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8276b964a58855e3ab92d026ebb0fc00e67f2e7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapClear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapClear"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapIncompleteSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ba6c5b2fc7f461d05cf944a1152d249de0217f0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapIncompleteSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapIncompleteSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapPeek.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f0c7afd465358c35ea8c1fd3b33eb4d4a76ef87
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapPeek.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapPeek"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e155726da6bd697ef422c53d96cec086df511b3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapStage.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6222c1fc4c998174b65e861ef1aeb4375d58c05e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapStage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapStage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapUnstage.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5cca8d9f93d065271a71ea23bf953e73a1cd6e58
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapUnstage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapUnstage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_OrderedMapUnstageNoKey.pbtxt b/tensorflow/core/api_def/python_api/api_def_OrderedMapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d67b95b65b7e00475a1f8f422e3df529b3747ea0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_OrderedMapUnstageNoKey.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapUnstageNoKey"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6223b3132ed0d6878995d3c5e657275fac0cc4f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddedBatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelDynamicStitch.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelDynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a36ad273646a97aaadbe74718800e5fb1fc27dae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParallelDynamicStitch.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParallelDynamicStitch"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..93cd5719feb613cd3de2e422e23cc3d690bdef08
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParallelInterleaveDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09d200dd24c828af85d1505bb17086dbfa688ee8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParallelMapDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseSingleExample.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseSingleExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4193bdd091e015eca8cca85034255d36ba27a67b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParseSingleExample.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParseSingleExample"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c443acd2e9dd9a795e80ae1072d4e07aa01d3df4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "PartitionedCall" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_PlaceholderV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_PlaceholderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a30360d2de4e36f47f3c7564db5ec9ca045034b9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PlaceholderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PlaceholderV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PopulationCount.pbtxt b/tensorflow/core/api_def/python_api/api_def_PopulationCount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d35550236a317c6581b6e8b91f8843b5cc24977f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PopulationCount.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PopulationCount"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec4e214eb5e082c8f732cbef9db69524c48d80a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PrefetchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..228c4047d2e0b7ddfec1d8cd4fad478aa6c4c1a7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PreventGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_PreventGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9565f5632b9ffdbaf1879dc1c18092838143d06b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PreventGradient.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PreventGradient"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantize.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2468f1b243f318dcc3a8fb45524c6b548f378fa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15e181be20948128a7f970f024e6cc8dfe28c96c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1edc6f5faecfbedc0b9b873484b160551b0f2d2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizeDownAndShrinkRange.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizeDownAndShrinkRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a2a86d25dad916208e9a666b5ffaa15f1513c4a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizeDownAndShrinkRange.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeDownAndShrinkRange"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40673234ed02fa49601b83ce4f587b9051295315
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b952d6eccbb30df85582848f7f7e7869eea367a8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e009ada5535993ab5c6eefe8e0b8858e04735824
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedBatchNormWithGlobalNormalization"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedBiasAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedBiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3432962e593d6777a62723d25bffd22b8001cc68
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedBiasAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedBiasAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2409d12abeff922cca92f9ae609764a27f651356
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedConv2D"
+  endpoint {
+    name: "nn.quantized_conv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedInstanceNorm.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedInstanceNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47a4931a05ab0f9f8b746103667c64f2cc27fbae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedInstanceNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedInstanceNorm"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedMatMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ca9d2ae0774cda244db4843e86372cbe40e2ecb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedMatMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedMatMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c026fba194c0a0fa6208799248b7269d09a5623b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedRelu.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5da4f25f0e51b1b73b9bb96c9b5b18c2ee54d60
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedRelu6.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedRelu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef1e64831270955219d409c80f865a16713cbcc3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedRelu6.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedRelu6"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedReshape.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e6d9ed718386f77c5f28ed164803e2d7f148eaf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedReshape.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedReshape"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a8da4128c260644db022183683e2dc362d82d39e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedResizeBilinear"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueIsClosed.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueIsClosed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1d2ef63f1a8849befe42341e15c1630f730ec04
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueIsClosed.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueIsClosed"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueIsClosedV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueIsClosedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07cf1a7497a40ff435f40eaaa31d22e8785bd20c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueIsClosedV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueIsClosedV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_RFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9719255aebe6c665f9178c6b652230dd4542d13
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RFFT.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RFFT"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_RFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1336a64408e8135284d9cafd6ca057572950bdf5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RFFT2D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RFFT2D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_RFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..978b5814ff652246cca7630f9a2df22985bbb28e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RFFT3D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RFFT3D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5f6f8c6f1db344c480e2bd452362d977dc15000
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomPoissonV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomPoissonV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cc217c50ea74af0413a804c8e2b726b3e5f1a91
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomPoissonV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomPoissonV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4cd8296b2233ac58c12e6573d2194f7d976d9137
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RangeDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Rank.pbtxt b/tensorflow/core/api_def/python_api/api_def_Rank.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05aa12f2fa238f540f653e42576899c3e1b799da
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Rank.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Rank"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReadVariableOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReadVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e250b78effcd998b3d26804522858b2386df9b46
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReadVariableOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReadVariableOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Real.pbtxt b/tensorflow/core/api_def/python_api/api_def_Real.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52a9089f4a75018cb1a6a551aecef7b1795e9f4f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Real.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Real"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RecordInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_RecordInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..29f798050e7c868a13a13b3c123ecbc2c5f70de1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RecordInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RecordInput"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReduceJoin.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReduceJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fde5942abee41797db084e1b34b8202532db1a5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReduceJoin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReduceJoin"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RefNextIteration.pbtxt b/tensorflow/core/api_def/python_api/api_def_RefNextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9dfcf5e97e9fd4f0676cdb59503947c1a1972f9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RefNextIteration.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RefNextIteration"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RefSelect.pbtxt b/tensorflow/core/api_def/python_api/api_def_RefSelect.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f9909aa86a861e5b1bfb95aa96e3fbd925f0c4a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RefSelect.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RefSelect"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RefSwitch.pbtxt b/tensorflow/core/api_def/python_api/api_def_RefSwitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68b0f4a694aa1f059ca85b5218b40c99e1d21d28
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RefSwitch.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RefSwitch"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RemoteCall.pbtxt b/tensorflow/core/api_def/python_api/api_def_RemoteCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc069d857d0b40bda75dedc4d881359419ce8b6b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RemoteCall.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RemoteCall"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be301da8386af0fbd98c9b02d2cfc0fe79178990
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RepeatDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/python_api/api_def_RequantizationRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e327595a3898c69e3b060a821345cb8d863b4587
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RequantizationRange.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RequantizationRange"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/python_api/api_def_Requantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f26f0611bae9544aab74f014690db9ceb4606241
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Requantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Requantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca679e6889f102bf377dc44ecfc217c602642858
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdadelta.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e0413a67a3c949e8d34311b26acd81a251100ca4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdadelta.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdadelta"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52b8ba0b0e4db79a65fc47cc14a66f7469db6328
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdagradDA.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..edfc0a733f84542601ce95a3bad1b99db629c2a1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdagradDA.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdagradDA"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca2713b533804e8ecc9ed76798744c0b07bb5e24
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdam.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdam"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAddSign.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50dd6439536ae38c5de377d5636262f5cdb906a6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAddSign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAddSign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20592e38c812aeb45cd03bae300b5d6667aee7dd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyCenteredRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyFtrl.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72b49e09d6217ce9fdc110f91f4e10fe86124e09
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyFtrl"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyFtrlV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af1d24c344d81f94216c0517011b387ab93965eb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyFtrlV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyFtrlV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyGradientDescent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75d6afd426a0bfcd324542e6fdae70bf7f4b53b9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyGradientDescent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyGradientDescent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e499cf72e254ed5d0e6e73da8f88a9de4392605
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyPowerSign.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b23ad0d061a42a054e9116a25792f3d73a40caf1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyPowerSign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyPowerSign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ad124c59005285d2c9a9f894d53147cf0823c86
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyProximalAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyProximalAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d684a5dd6720333e334b8afe462224437e32e248
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyProximalGradientDescent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4c20e1382f6759993511e7eb4fd846c63575611
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceCountUpTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceCountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87376b74475d3191dd0d2be3a80c8da57087a88b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceCountUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceCountUpTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceGather.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..714ba4a7ca9a1f05dc34cefe9e2430ebdc6f284a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceGather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceGather"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d4601cafdeb8af4821887ac0d354b1a4a7844b9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56b5a46d107e25e6ee501c9389dafda20dd34a04
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8119bcc6c6504de57e2c3a53dbb4c7bd03003ad4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d874aef3fe33415e69f483841a6851f3b8c30523
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterMin"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..365a37fa0d73406c20e6908b84cc8b7d005f36c1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54c66708aeb1263b01ed90b792b1331e909416ec
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterNdUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72dc5bf8893b249a7dc9b58f53222213d26faca1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30f885bee0cf27801b56918e82cdef8c644afe1b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e4dad13878112512fc7b915cb4fec9bd47b76a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdadelta.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdadelta"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1388da789c08d54bf61ace44791280586d0ec6aa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5beaa4f580fb22cd23a7bcefe9009f4168c36b4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagradDA"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3de3d93df3d658d02663537d6cc4c404a315d27
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyCenteredRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyFtrl.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f83833d3511cf3f42c0257cd61677684da86b35f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrl"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71adbb0bcd6f63e2de2ecf63d7c8d56265b29ae2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrlV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28a19caaccfabc58444e5963ff1a1c6446e67255
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8cda7f4edd1217baca8ed84b7c9ae96a22e3b6f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fa1ade6696ba464a3da44b05f35c67e8ada4fc8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalGradientDescent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86cc9a41ae9db89aa61b2225aeb15c42461dda45
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceStridedSliceAssign.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceStridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef6e19fea0d35ad6410f1001d3a683581bd545ea
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceStridedSliceAssign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceStridedSliceAssign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RestoreV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_RestoreV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34d07239a1a18e85e2534db6607a89c12cc670a1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RestoreV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RestoreV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReverseSequence.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReverseSequence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3fc2578dfd2af59077f611bce137f98f6af38ee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReverseSequence.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReverseSequence"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/python_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cc919f36fef69b28fc20873f1a635f5dd1644cf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Roll.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Roll"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Round.pbtxt b/tensorflow/core/api_def/python_api/api_def_Round.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74428e2f58323e47ca672d50f5193bac2977b1f9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Round.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Round"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SaveV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_SaveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..617897ee44e8351bd95d9f44ca2b660894617b88
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SaveV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SaveV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71b655c22fbcbf1524433fc65a392e4d80c5c43
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScanDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f5b6decf6d1cfc6b3fbd8492824ab95958b060b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterNdNonAliasingAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterNdNonAliasingAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ecf71cd6257b5630566cb6fb92110f6f738f91f4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterNdNonAliasingAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterNdNonAliasingAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ccf4a9cce807c4cbc5fe2fdc1e2a7057a0bc5464
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterNdUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterNdUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4c41c1226674fbb21899ea31be8668b0d8f6ece
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SerializeIterator.pbtxt b/tensorflow/core/api_def/python_api/api_def_SerializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07d2f200fee0dee55cb813389f672a914a10e0f2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SerializeIterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SerializeIterator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee9c71036bdcf6c3d64d468cfe5a4793e522335d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SetSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SetSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3a8c1036ca34233b245a92110dc6e81ac348942d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Shape.pbtxt b/tensorflow/core/api_def/python_api/api_def_Shape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd7b5ad36c6e1d7f3292cbd4ca13a1242bf09e8c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Shape.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Shape"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShapeN.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShapeN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2dbe74b09689b6c4fb1c54640205c7281d23780
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ShapeN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShapeN"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b0d2994f0711f440fb6623aa2322c86bd3859f8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShuffleAndRepeatDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f0be9197adeb23b2d5047c5d69916df0e2c1eda
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShuffleDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2ee91dd12ed16ba27a9c4ae45b48194bc5a8b03
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Size.pbtxt b/tensorflow/core/api_def/python_api/api_def_Size.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f76173a5d870910edead637d3493e75ba651b67
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Size.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Size"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96a551c5b6669a8d019e3c705507aba768ab9d21
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SkipDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..867116c5da718f66205132d70a93c39464096df6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SlideDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SpaceToDepth.pbtxt b/tensorflow/core/api_def/python_api/api_def_SpaceToDepth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d56a7384eb9c75f2f90420a1a742733b364e770c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SpaceToDepth.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SpaceToDepth"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseAccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseAccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e158c9ca0ca4620cb18c7e98f969598180df7c0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseAccumulatorApplyGradient.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseAccumulatorApplyGradient"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseAccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseAccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5326f23def4637a178e8af1aff972f4ad1d982c7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseAccumulatorTakeGradient.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseAccumulatorTakeGradient"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d30a8676e03172b852a1a3c6d50f77722ed25625
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyAdadelta.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyAdadelta"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb5ddef2128bbaa3239a279df614a4e3512dcf41
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3b87b09536ee5d36f5d8b1c83b025e5857d13ab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyAdagradDA.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyAdagradDA"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db4732873845424e21a14506b84723268a963eea
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyCenteredRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyFtrl.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..14e37b8ba209374d6478cc047229e55314edcd81
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyFtrl"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d307af9b497739d76d1d875751ef16323a1d56b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyFtrlV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyFtrlV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ed34c0485d4b597dbe366ad69503be1393e079e8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff2d3b673141a571b6b1d816bf29fb5ba9880232
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyProximalAdagrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f342a611bb2aed3e4c9eae07a4802ed59bae76e8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyProximalGradientDescent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f337d50e5b0a995a2a3765782bf229d967ee9b4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseApplyRMSProp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyRMSProp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseConditionalAccumulator.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bad4120795ef1fc6411a3bcacc209efe2e7b6841
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseConditionalAccumulator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseConditionalAccumulator"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5e7c9851f15b6748e48eadd26b26925fbe2ed94
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseDenseCwiseAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f72031cf684b4515bbf580203f2ea4e5714eab58
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseDenseCwiseDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a87004ee5f9c9d445acb1d49138639d91da7b44f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseDenseCwiseMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseDenseCwiseMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseReduceMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseReduceMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a885e97d238101857a3c45a11ba0dbacb1a21fd8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseReduceMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseReduceMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseReduceMaxSparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseReduceMaxSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7d697880178cc6252edd0672553d8bf44eb6bea
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseReduceMaxSparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseReduceMaxSparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseReduceSum.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseReduceSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee30d7aaf17bc122797394b61cb9eeba0387928d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseReduceSum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseReduceSum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseReduceSumSparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseReduceSumSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0dd89fb4c50a955f75d38dd999467d467eea8d54
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseReduceSumSparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseReduceSumSparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f12c2e207368d4c585137bfeac523e2fada0ed50
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentMean.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentMean"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentMeanGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentMeanGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..771083cd513c4d3deba21aaa2abaa090362c5684
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentMeanGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentMeanGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fcb029535c36c6be0e7e08f36ac7b39a5e126df0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentMeanWithNumSegments"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7daaa81482ff7f9e84ee7ba8a3768d0b19ef38e0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentSqrtN"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtNGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0682a597bb038abebab4198a44079486b60eb799
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtNGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7311a093df9e593dbebac764cd28e55c556ae6da
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNWithNumSegments"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7028efce268d3e6e80328908c3a9e2dfc3d4343
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentSum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81c2b8554e8f62e5ffffbdf410389776bbcf9035
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSegmentSumWithNumSegments"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSlice.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..716f8781d0ce698deb9b1f70ade8f1988b418749
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSlice.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSlice"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSoftmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc29dd6513a9abce3f5ab39a3e8a5a79aed138d4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSoftmax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSoftmax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSparseMaximum.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSparseMaximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0dbadc01edc539658e3066a9d720c71bc50ecae9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSparseMaximum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSparseMaximum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSparseMinimum.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSparseMinimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0e3ffcbddf3bccdae0e28cc15527566ddf2ff03a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSparseMinimum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSparseMinimum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19c0c7f199dfd24d24a56c3766733f9e55957c12
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseTensorSliceDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..735ee18e149aca56cd82c9bb2b3fd8d3870a3188
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseToSparseSetOperation.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseToSparseSetOperation"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ab4c3e441dd51f50a2796ef9d6fa0d21b727ffa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SqlDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59e2dfe8366813242337c9490d74ca317e525636
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sqrt"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b39ae25fa062b4271dcc2aee6523847c97b1e4d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Square"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Stage.pbtxt b/tensorflow/core/api_def/python_api/api_def_Stage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66de5901bc9b604d92693e2affb75ea8555bfc4e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Stage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Stage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StageClear.pbtxt b/tensorflow/core/api_def/python_api/api_def_StageClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f54a1c1c0428753d93cc19abbd4fbc961d8eb988
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StageClear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StageClear"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StagePeek.pbtxt b/tensorflow/core/api_def/python_api/api_def_StagePeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..710394d30d34cece63c148f41b6a18a3e4d99b7b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StagePeek.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StagePeek"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StageSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_StageSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..472032ac42af197e32a437a112f6c39704193ad0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StageSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StageSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7bed36602f40602313157c20677acbbf592d7be
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatsAggregatorHandle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b1bab2440f1934f1fd0194b76b7907fb0fb142d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatsAggregatorSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StridedSlice.pbtxt b/tensorflow/core/api_def/python_api/api_def_StridedSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a55fa9887797ed9fa6900f9e77f9d1fa70de5aa2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StridedSlice.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StridedSlice"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StridedSliceAssign.pbtxt b/tensorflow/core/api_def/python_api/api_def_StridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bcf1df228e879fbde73fed7d0e955f67ea494663
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StridedSliceAssign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StridedSliceAssign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StridedSliceGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_StridedSliceGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05d7d57511e8dd485d40a5168ca0866c4b6c481a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StridedSliceGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StridedSliceGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SummaryWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_SummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fe57ecf195c85217bd174dbc503b28f26adade9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c270ada3c219b03715e0cd651a4b56fe5ebc227
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TFRecordDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..711b335dc1926d32071637b3c986727c339736a3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TakeDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5bc3920c56360f2348805db1db79ab2b630f379d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..89ad016483fa392a302915d588d32201237c717a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorSliceDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08d785191b6a4bddce2ac43fd4c0188b4d74548e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TextLineDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Transpose.pbtxt b/tensorflow/core/api_def/python_api/api_def_Transpose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e22b6a040e4011f87b1c945ffd7df050bcbdea76
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Transpose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Transpose"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e5415749f0d3abad8f6f5c632a0bc59b11e8de2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71b35eaab5f4a251ebebf9ddb7baf2ecd0a12401
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithCounts.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UniqueWithCounts"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7876e55cf3e2c24e19507cefb01f9f61abd0a2bc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UniqueWithCountsV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Unstage.pbtxt b/tensorflow/core/api_def/python_api/api_def_Unstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65eb756b870d4a6b8d767de3876d9353a192c1d5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Unstage.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Unstage"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_VarHandleOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_VarHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c93a6db93cf62aab345bb9044e4acddd01da7d9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_VarHandleOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "VarHandleOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_VarIsInitializedOp.pbtxt b/tensorflow/core/api_def/python_api/api_def_VarIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de5d9850acb1a3adbc59e554aedd819d870c5442
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_VarIsInitializedOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "VarIsInitializedOp"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_VariableShape.pbtxt b/tensorflow/core/api_def/python_api/api_def_VariableShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b317152ddf925b3f0b5b24c95bcb44bed6b718a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_VariableShape.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "VariableShape"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Where.pbtxt b/tensorflow/core/api_def/python_api/api_def_Where.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4dd25a20655a036c0fba33c14133389a532bf8e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Where.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Where"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_While.pbtxt b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f47a9b0fceb77af3dc9fcad3569f8e18b7f44188
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "While" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteAudioSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteAudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..520952cd4117867f61cd3c536b8a7cc5beeeab62
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteAudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteAudioSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteGraphSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteGraphSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3653477b2067875ee772dedc5015bc550de1ec12
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteGraphSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteGraphSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteHistogramSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteHistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26e1482630596d9ecb80917ff91adb2bd1131692
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteHistogramSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteHistogramSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteImageSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78db8700f0c7231f24fd1db3a0eedbcc4f43deeb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteImageSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteImageSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteScalarSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bae8638d258b6fdf217fbdbd8705369f57e0bb3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteScalarSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteScalarSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db86883e21ea47a9e74788f8d76a166a235674da
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd1459521ff70fc4b3adce7fbb1251b45106b439
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ZipDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index 832a55f2556f46efe6a94fb62d0420330917faac..822d0065b6713dbc6692ed11b7a938a784b0d597 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -114,19 +114,43 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
 
     const string accumulator_name =
         strings::StrCat(n->name(), "/Internal/Accumulator");
+    TensorShapeProto variable_shape;
+    variable_shape.add_dim()->set_size(0);
     TF_RETURN_IF_ERROR(make_node("TemporaryVariable")
-                           .Attr("shape", shape)
+                           .Attr("shape", variable_shape)
                            .Attr("dtype", dtype)
                            .Attr("var_name", accumulator_name)
                            .Finalize(g, &create_accumulator));
-    TF_RETURN_IF_ERROR(make_node("Const")
-                           .Attr("value", make_zeros(dtype, shape))
-                           .Attr("dtype", dtype)
-                           .Finalize(g, &initial_val));
+    if (PartialTensorShape(shape).IsFullyDefined()) {
+      // For fully defined shapes make a constant zero tensor.
+      TF_RETURN_IF_ERROR(make_node("Const")
+                             .Attr("value", make_zeros(dtype, shape))
+                             .Attr("dtype", dtype)
+                             .Finalize(g, &initial_val));
+    } else {
+      // For partial shapes make a Fill operation to make a zero tensor with the
+      // shape of the first input.
+      Node* shape_node;
+      TF_RETURN_IF_ERROR(
+          make_node("Shape")
+              .Input(data_edges[0]->src(), data_edges[0]->src_output())
+              .Finalize(g, &shape_node));
+      Node* zero;
+      TF_RETURN_IF_ERROR(
+          make_node("Const")
+              .Attr("value", make_zeros(dtype, TensorShapeProto()))
+              .Attr("dtype", dtype)
+              .Finalize(g, &zero));
+      TF_RETURN_IF_ERROR(make_node("Fill")
+                             .Input(shape_node)
+                             .Input(zero)
+                             .Finalize(g, &initial_val));
+    }
     TF_RETURN_IF_ERROR(make_node("Assign")
                            .Attr("T", dtype)
                            .Input(create_accumulator)  // ref: Ref(T)
                            .Input(initial_val)         // value: T
+                           .Attr("validate_shape", false)
                            .Finalize(g, &initialize_accumulator));
     for (int i = 0; i < data_edges.size(); ++i) {
       Node* assignAdd;
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..637b43c844b6938db457f49bfc423304907a889f
--- /dev/null
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -0,0 +1,314 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+
+#include "tensorflow/core/common_runtime/broadcaster.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+#include "tensorflow/core/lib/core/notification.h"
+
+#define VALUE_IN_DEBUG_STRING false
+
+namespace tensorflow {
+/*static*/
+int64 CollectiveAdapter::AlignedChunkElts(int64 elt_bytes, int64 total_elts,
+                                          int64 num_chunks) {
+  DCHECK_GT(num_chunks, 0);
+  int64 base_chunk_elts = (total_elts + (num_chunks - 1)) / num_chunks;
+  if (EIGEN_MAX_ALIGN_BYTES == 0) return base_chunk_elts;
+  if (EIGEN_MAX_ALIGN_BYTES <= elt_bytes) {
+    // Tolerate weird small values of EIGEN_MAX_ALIGN_BYTES
+    DCHECK_EQ(0, elt_bytes % EIGEN_MAX_ALIGN_BYTES);
+    return base_chunk_elts;
+  }
+  // elt_bytes < EIGEN_MAX_ALIGN_BYTES, which
+  // must be a common multiple of the various atomic data types.
+  DCHECK_EQ(0, EIGEN_MAX_ALIGN_BYTES % elt_bytes)
+      << "total_elts=" << total_elts << " num_chunks=" << num_chunks
+      << " EIGEN_MAX_ALIGN_BYTES=" << EIGEN_MAX_ALIGN_BYTES
+      << " elt_bytes=" << elt_bytes;
+  // Round bytes per chunk up to the next multiple of EIGEN_MAX_ALIGN_BYTES.
+  int64 chunk_bytes = base_chunk_elts * elt_bytes;
+  int64 diff =
+      (chunk_bytes < EIGEN_MAX_ALIGN_BYTES)
+          ? (EIGEN_MAX_ALIGN_BYTES - chunk_bytes)
+          : (EIGEN_MAX_ALIGN_BYTES - (chunk_bytes % EIGEN_MAX_ALIGN_BYTES));
+  CHECK_EQ(0, diff % elt_bytes);
+  base_chunk_elts += (diff / elt_bytes);
+  DCHECK_EQ(0, ((base_chunk_elts * elt_bytes) % EIGEN_MAX_ALIGN_BYTES))
+      << "total_elts=" << total_elts << " num_chunks=" << num_chunks
+      << " EIGEN_MAX_ALIGN_BYTES=" << EIGEN_MAX_ALIGN_BYTES
+      << " base_chunk_elts=" << base_chunk_elts << " elt_bytes=" << elt_bytes;
+  return base_chunk_elts;
+}
+
+namespace {
+template <typename T>
+class CollectiveAdapterImpl : public CollectiveAdapter {
+ public:
+  // Takes ownership of output and prepares to properly alias its chunks.
+  // Ownership is taken because the shape may temporarily change.
+  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator)
+      : output_(std::move(*output)),
+        dt_(output_.dtype()),
+        old_shape_(output_.shape()),
+        num_chunks_(num_chunks),
+        allocator_(allocator),
+        total_elts_(output_.NumElements()),
+        chunk_elts_(AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)),
+        data_start_(reinterpret_cast<T*>(DMAHelper::base(&output_))),
+        data_end_(data_start_ + total_elts_) {
+    CHECK_GT(chunk_elts_, 0);
+    Flatten();
+  }
+
+  ~CollectiveAdapterImpl() override {}
+
+  const Tensor& Value() const override { return output_; }
+
+  // If necessary, flatten output.
+  void Flatten() {
+    if (old_shape_.dims() > 1) {
+      TensorShape new_shape = TensorShape({old_shape_.num_elements()});
+      DMAHelper::UnsafeSetShape(&output_, new_shape);
+    }
+  }
+
+  void ConsumeFinalValue(Tensor* output) override {
+    if (old_shape_ != output_.shape()) {
+      DMAHelper::UnsafeSetShape(&output_, old_shape_);
+    }
+    *output = std::move(output_);
+  }
+
+  // Number of T elements in a particular chunk.
+  inline int64 ChunkElts(int i) const {
+    DCHECK_LT(i, num_chunks_);
+    const T* chunk_start = std::min(data_end_, data_start_ + i * chunk_elts_);
+    const T* chunk_end = std::min(data_end_, chunk_start + chunk_elts_);
+    return chunk_end - chunk_start;
+  }
+
+  int64 ChunkBytes(int i) const override { return sizeof(T) * ChunkElts(i); }
+
+  // Returns a new Tensor that aliases the required chunk.
+  Tensor ChunkAlias(int i) override {
+    int64 start = chunk_elts_ * i;
+    int64 num_elts = ChunkElts(i);
+    // If this chunk is empty the prior chunk might also be short
+    // so always take an empty slice from the front of the tensor
+    // to avoid an illegal offset check failure somewhere.
+    return (num_elts > 0) ? output_.Slice(start, start + num_elts)
+                          : output_.Slice(0, 0);
+  }
+
+  Tensor TempChunk(int i) const override {
+    AllocationAttributes empty;
+    return Tensor(allocator_, dt_, {ChunkElts(i)}, empty);
+  }
+
+  string DebugString() const override {
+    return strings::StrCat(
+        "base addr ", reinterpret_cast<int64>(DMAHelper::base(&output_)),
+        " num_chunks ", num_chunks_, " total_elts ", total_elts_, " chunk_elts",
+        chunk_elts_, " value ",
+        VALUE_IN_DEBUG_STRING ? output_.SummarizeValue(1024) : "<hidden>");
+  }
+
+  string TBounds(const Tensor& t) const override {
+    int64 base_addr = reinterpret_cast<int64>(DMAHelper::base(&t));
+    return strings::StrCat("(", base_addr, ", ", (base_addr + t.TotalBytes()),
+                           ")");
+  }
+
+  Tensor Scalar(int v) const override {
+    Tensor t(dt_, TensorShape({}));
+    t.scalar<T>()() = v;
+    return t;
+  }
+
+  Tensor Scalar(Allocator* a) const override {
+    Tensor t(a, dt_, TensorShape({}));
+    return t;
+  }
+
+  Tensor output_;
+  const DataType dt_;
+  const TensorShape old_shape_;
+  const int64 num_chunks_;
+  Allocator* allocator_;
+  const int64 total_elts_;
+  const int64 chunk_elts_;
+  const T* data_start_;
+  const T* data_end_;
+};
+
+}  // namespace
+
+CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
+                                         Allocator* allocator) {
+  switch (output->dtype()) {
+    case DT_FLOAT:
+      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator);
+      break;
+    case DT_DOUBLE:
+      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator);
+      break;
+    case DT_INT32:
+      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator);
+      break;
+    case DT_INT64:
+      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported type " << output->dtype()
+                 << " to MakeCollectiveAdapter";
+      return nullptr;
+  }
+}
+
+BaseCollectiveExecutor::~BaseCollectiveExecutor() {}
+
+void BaseCollectiveExecutor::StartAbort(const Status& s) {
+  LOG(WARNING) << "BaseCollectiveExecutor::StartAbort " << s;
+  remote_access_->StartAbort(s);
+}
+
+void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
+                                          const CollectiveParams& col_params,
+                                          const string& exec_key,
+                                          StatusCallback done) {
+  // On any individual collective Op failure we need to abort the
+  // BufRendezvous so that other Ops in the instance don't hang
+  // waiting for transmissions that will never happen.  Do so after a
+  // delay so that the original error status is more likely to
+  // propagate up, and peers are unlikely to re-create the purged
+  // BufRendezvous by late-arriving requests.
+  StatusCallback done_safe = [this, done](const Status& s) {
+    if (!s.ok()) {
+      Ref();  // Ensure this lasts until the closure executes.
+      SchedNonBlockingClosureAfter(1000000, [this, s] {
+        remote_access_->buf_rendezvous()->StartAbort(s);
+        Unref();
+      });
+    }
+    done(s);
+  };
+
+  Tensor* output = ctx->mutable_output(0);
+  string error;
+  switch (col_params.instance.type) {
+    case REDUCTION_COLLECTIVE: {
+      // TODO(tucker): support other reduction algorithms,
+      // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc.
+      const Tensor* input = &ctx->input(0);
+      RingReducer* reducer =
+          CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_,
+                        input, output, &error);
+      if (!reducer) {
+        done_safe(errors::Internal(error));
+        return;
+      }
+      // Run in an I/O thread, so as not to starve the executor threads.
+      // TODO(tucker): Instead of forking every per-device Collective
+      // Op off into its own thread, consider queuing them on a
+      // fixed-size thread-pool dedicated to running CollectiveOps.
+      SchedClosure([reducer, done_safe]() {
+        reducer->Run([reducer, done_safe](const Status& s) {
+          done_safe(s);
+          delete reducer;
+        });
+      });
+    } break;
+
+    case BROADCAST_COLLECTIVE: {
+      Broadcaster* broadcaster = CreateBroadcaster(
+          ctx, CtxParams(ctx), col_params, exec_key, step_id_, output, &error);
+      if (!broadcaster) {
+        done_safe(errors::Internal(error));
+        return;
+      }
+      // Run in an I/O thread, so as not to starve the executor threads.
+      SchedClosure([broadcaster, done_safe]() {
+        broadcaster->Run([broadcaster, done_safe](const Status& s) {
+          done_safe(s);
+          delete broadcaster;
+        });
+      });
+    } break;
+
+    default:
+      done_safe(errors::Internal("Unimplemented CollectiveType ",
+                                 col_params.instance.type));
+  }
+}
+
+RingReducer* BaseCollectiveExecutor::CreateReducer(
+    OpKernelContext* ctx, OpKernelContext::Params* params,
+    const CollectiveParams& col_params, const string& exec_key, int64 step_id,
+    const Tensor* input, Tensor* output, string* error) {
+  switch (col_params.instance.data_type) {
+    case DT_INT32:
+      if (col_params.group.device_type == DEVICE_GPU) {
+        *error =
+            "Collective Reduce does not support datatype DT_INT32 on "
+            "DEVICE_GPU";
+        return nullptr;
+      }
+      TF_FALLTHROUGH_INTENDED;
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT64:
+      return new RingReducer(this, dev_mgr_, ctx, params, col_params, exec_key,
+                             step_id, input, output);
+      break;
+    default:
+      *error = strings::StrCat("Collective Reduce does not support datatype ",
+                               col_params.instance.data_type);
+      return nullptr;
+  }
+}
+
+Broadcaster* BaseCollectiveExecutor::CreateBroadcaster(
+    OpKernelContext* ctx, OpKernelContext::Params* params,
+    const CollectiveParams& col_params, const string& exec_key, int64 step_id,
+    Tensor* output, string* error) {
+  switch (col_params.instance.data_type) {
+    case DT_INT32:
+      if (col_params.group.device_type == DEVICE_GPU) {
+        *error =
+            "Collective Broadcast does not support datatype DT_INT32 on "
+            "DEVICE_GPU";
+        return nullptr;
+      }
+      TF_FALLTHROUGH_INTENDED;
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT64: {
+      return new Broadcaster(this, dev_mgr_, ctx, params, col_params, exec_key,
+                             step_id, output);
+    } break;
+    default:
+      *error =
+          strings::StrCat("Collective Broadcast does not support datatype ",
+                          DataTypeString(col_params.instance.data_type));
+      return nullptr;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..462d6b75331ca552ca0db50538af23cb5868e3f6
--- /dev/null
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -0,0 +1,151 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
+
+#include <string>
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class Broadcaster;
+class DeviceMgr;
+class RingReducer;
+
+// Helper interface that aliases regular subfields of a Tensor as separate
+// Tensors for in-place update.
+class CollectiveAdapter {
+ public:
+  virtual ~CollectiveAdapter() {}
+
+  // Move the backing tensor to 'output' with its original storage and
+  // shape. After this call this CollectiveAdapter object should be
+  // deleted immediately without calling any of its other methods.
+  virtual void ConsumeFinalValue(Tensor* output) = 0;
+
+  // const access to entire intermediate value for debugging
+  virtual const Tensor& Value() const = 0;
+
+  // Returns tensor for chunk i which aliases the backing buffer.
+  virtual Tensor ChunkAlias(int i) = 0;
+
+  // Returns tensor allocated on the same device but with its own
+  // separate backing buffer.  Will have same type and size as
+  // chunk i.
+  virtual Tensor TempChunk(int i) const = 0;
+
+  // Bytes in chunk i
+  virtual int64 ChunkBytes(int i) const = 0;
+
+  // Generate a CPU RAM scalar tensor of the same DataType as the
+  // backing tensor with the given integer value.
+  virtual Tensor Scalar(int v) const = 0;
+
+  // Generate a scalar tensor of same DataType and on the same device
+  // as the backing tensor.
+  virtual Tensor Scalar(Allocator* a) const = 0;
+
+  // Debugging string describing buffer location
+  virtual string TBounds(const Tensor& t) const = 0;
+
+  virtual string DebugString() const = 0;
+
+  // Computes the number of elements per alias chunk tensor.
+  //
+  // A CHECK in tensor.cc expects that the memory buffer backing a
+  // Tensor will be aligned according to EIGEN_MAX_ALIGN_BYTES.  To
+  // ensure that all chunk aliasing Tensors maintain this alignment we
+  // need to pick a chunk size that preserves it.  Note than in extreme
+  // cases (impractical, but possible with very small tensors) one or
+  // more tail chunks can end up emptby.
+  static int64 AlignedChunkElts(int64 elt_bytes, int64 total_elts,
+                                int64 num_chunks);
+};
+
+// Create a CollectiveAdaptor wrapping 'output', specialized to its
+// data-type and shape.
+CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
+                                         Allocator* allocator);
+
+// Default implementation of CollectiveExecutor.  Delegates the actual
+// work of moving data to a class specialized for the operation type,
+// arguments and device+interconnect topology.
+class BaseCollectiveExecutor : public CollectiveExecutor {
+ public:
+  BaseCollectiveExecutor(CollectiveExecutorMgrInterface* cem,
+                         PerStepCollectiveRemoteAccess* remote_access,
+                         int64 step_id, const DeviceMgr* dev_mgr)
+      : CollectiveExecutor(cem),
+        step_id_(step_id),
+        dev_mgr_(dev_mgr),
+        remote_access_(remote_access) {}
+
+  ~BaseCollectiveExecutor() override;
+
+  void StartAbort(const Status& s) override;
+
+  void ExecuteAsync(OpKernelContext* ctx, const CollectiveParams& col_params,
+                    const string& exec_key, StatusCallback done) override;
+
+  PerStepCollectiveRemoteAccess* remote_access() override {
+    return remote_access_.get();
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    remote_access_->RecvFromPeer(peer_device, peer_task, peer_is_local, key,
+                                 to_device, to_device_ctx, to_alloc_attr,
+                                 to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    remote_access_->PostToPeer(peer_device, peer_task, key, from_device,
+                               from_device_ctx, from_alloc_attr, from_tensor,
+                               client_locality, done);
+  }
+
+ protected:
+  const int64 step_id_;
+  const DeviceMgr* dev_mgr_;  // Not owned.
+  std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
+
+ private:
+  RingReducer* CreateReducer(OpKernelContext* ctx,
+                             OpKernelContext::Params* params,
+                             const CollectiveParams& col_params,
+                             const string& exec_key, int64 step_id,
+                             const Tensor* input, Tensor* output,
+                             string* error);
+
+  Broadcaster* CreateBroadcaster(OpKernelContext* ctx,
+                                 OpKernelContext::Params* params,
+                                 const CollectiveParams& col_params,
+                                 const string& exec_key, int64 step_id,
+                                 Tensor* output, string* error);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index b8e773503c7a2f8024e8a6f58247ad343a762f71..ba5a3eea3ac07115b2f3ea57528c7baed2008dfe 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -378,7 +378,7 @@ class BFCAllocator : public VisitableAllocator {
   inline int Log2FloorNonZero(uint64 n) {
 #if defined(__GNUC__)
     return 63 ^ __builtin_clzll(n);
-#elif defined(PLATFORM_WINDOWS)
+#elif defined(PLATFORM_WINDOWS) && (_WIN64)
     unsigned long index;
     _BitScanReverse64(&index, n);
     return index;
diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e8af8653dc011e926218761f113888cb8caf926
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.cc
@@ -0,0 +1,249 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/env.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+
+namespace tensorflow {
+
+namespace {
+// Key to be used for BufRendezvous by Broadcaster.
+string BroadcastBufKey(const string& exec_key, int src_rank, int dst_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("broadcast(", exec_key, "):src(", src_rank, "):dst(",
+                           dst_rank, ")");
+  } else {
+    // TODO(tucker): Try a denser format, e.g. a 64 or 128 bit hash.
+    return strings::StrCat(exec_key, ":", src_rank, ":", dst_rank);
+  }
+}
+}  // namespace
+
+Broadcaster::Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx, OpKernelContext::Params* params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      rank_(col_params.subdiv_rank[0]),
+      is_source_(col_params.is_source),
+      output_(output),
+      done_(nullptr),
+      device_(nullptr) {}
+
+void Broadcaster::Run(StatusCallback done) {
+  // The optimal data transfer choreography is going to very platform dependent.
+  // That will be addressed by later improvements here or by platform-specific
+  // overrides of collective broadcast. The initial version is simply
+  // a binary tree that completely ignores DeviceLocality.
+  done_ = std::move(done);
+
+  // Get the device for which we're executing and look up its locality.
+  status_ = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status_.ok()) {
+    done_(status_);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  RunTree();
+}
+
+// Binary tree parent/child relations are trivial to calculate, i.e.
+// device at rank r is the parent of 2r+1 and 2r+2.  The one exception
+// is if the source is not rank 0.  We treat that case as though the
+// source is appended to the front of the rank ordering as well as
+// continuing to occupy its current position.  Hence we calculate as
+// though each device's rank is actually r+1, then subtract 1 again to
+// get the descendent ranks.  If the source is not rank 0 then its
+// decendents include both {0,1} and the descendents of its current
+// position.  Where a non-0-rank source is a descendent of another
+// device, no send to it is necessary.
+
+/* static*/
+int Broadcaster::TreeRecvFrom(const CollectiveParams& cp) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  if (cp.is_source) return -1;
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int my_rank = cp.subdiv_rank[0];
+  if (source_rank == 0) {
+    return (my_rank - 1) / 2;
+  } else {
+    int predecessor_rank = (my_rank / 2) - 1;
+    return (predecessor_rank < 0) ? source_rank : predecessor_rank;
+  }
+}
+
+/* static */
+void Broadcaster::TreeSendTo(const CollectiveParams& cp,
+                             std::vector<int>* targets) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  targets->clear();
+  int my_rank = cp.subdiv_rank[0];
+  DCHECK_EQ(1, cp.instance.impl_details.subdiv_source_rank.size());
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int successor_rank = 0;
+  if (source_rank == 0) {
+    successor_rank = (2 * my_rank) + 1;
+  } else {
+    successor_rank = (2 * (my_rank + 1));
+  }
+  DCHECK_NE(successor_rank, my_rank);
+  if (cp.is_source && source_rank != 0) {
+    // The source sends to rank 0,1 in addition to its positional
+    // decendents.
+    if (cp.group.group_size > 1) {
+      targets->push_back(0);
+    }
+    if (cp.group.group_size > 2 && source_rank != 1) {
+      targets->push_back(1);
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (successor_rank < cp.group.group_size && successor_rank != source_rank) {
+      targets->push_back(successor_rank);
+    }
+    ++successor_rank;
+  }
+}
+
+// Execute a tree broadcast, i.e. each non-source device receives from
+// one other and sends to up-to two others.
+void Broadcaster::RunTree() {
+  mutex mu;
+  int pending_count = 0;  // GUARDED_BY(mu)
+  condition_variable all_done;
+  std::vector<int> send_to_ranks;
+  TreeSendTo(col_params_, &send_to_ranks);
+
+  if (!is_source_) {
+    // Begin by receiving the value.
+    int recv_from_rank = TreeRecvFrom(col_params_);
+    Notification note;
+    DispatchRecv(recv_from_rank, output_,
+                 [this, recv_from_rank, &mu, &note](const Status& s) {
+                   mutex_lock l(mu);
+                   status_.Update(s);
+                   note.Notify();
+                 });
+    note.WaitForNotification();
+  }
+
+  // Then forward value to all descendent devices.
+  if (status_.ok()) {
+    for (int i = 0; i < send_to_ranks.size(); ++i) {
+      int target_rank = send_to_ranks[i];
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DispatchSend(
+          target_rank, output_,
+          [this, target_rank, &mu, &pending_count, &all_done](const Status& s) {
+            status_.Update(s);
+            {
+              mutex_lock l(mu);
+              --pending_count;
+              if (pending_count == 0) {
+                all_done.notify_all();
+              }
+            }
+          });
+    }
+  }
+
+  if (status_.ok() && is_source_) {
+    // Meanwhile, copy input to output if we weren't lucky enough to
+    // be able to reuse input as output.
+    const Tensor* input = &ctx_->input(0);
+    if (input != output_ &&
+        (DMAHelper::base(input) != DMAHelper::base(output_))) {
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      CollectiveRemoteAccessLocal::MemCpyAsync(
+          op_dev_ctx, op_dev_ctx, device_, device_, ctx_->input_alloc_attr(0),
+          ctx_->output_alloc_attr(0), input, output_,
+          [this, &mu, &pending_count, &all_done](const Status& s) {
+            status_.Update(s);
+            {
+              mutex_lock l(mu);
+              --pending_count;
+              if (0 == pending_count) {
+                all_done.notify_all();
+              }
+            }
+          });
+    }
+  }
+
+  // Then wait for all pending actions to complete.
+  {
+    mutex_lock l(mu);
+    if (pending_count > 0) {
+      all_done.wait(l);
+    }
+  }
+
+  VLOG(2) << "return status " << status_;
+  done_(status_);
+}
+
+void Broadcaster::DispatchSend(int dst_rank, const Tensor* src_tensor,
+                               const StatusCallback& done) {
+  string send_buf_key = BroadcastBufKey(exec_key_, rank_, dst_rank);
+  VLOG(1) << "DispatchSend " << send_buf_key << " from_device "
+          << device_->name();
+  int dst_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][dst_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[dst_idx],
+                        col_params_.instance.task_names[dst_idx], send_buf_key,
+                        device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), src_tensor,
+                        device_locality_, done);
+}
+
+void Broadcaster::DispatchRecv(int src_rank, Tensor* dst_tensor,
+                               const StatusCallback& done) {
+  string recv_buf_key = BroadcastBufKey(exec_key_, src_rank, rank_);
+  int src_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][src_rank];
+  VLOG(1) << "DispatchRecv " << recv_buf_key << " from_device "
+          << col_params_.instance.device_names[src_idx];
+  int dst_idx = col_params_.instance.impl_details.subdiv_permutations[0][rank_];
+  CHECK_EQ(col_params_.instance.device_names[dst_idx], device_->name());
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[src_idx],
+                          col_params_.instance.task_names[src_idx],
+                          col_params_.task.is_local[src_idx], recv_buf_key,
+                          device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/broadcaster.h b/tensorflow/core/common_runtime/broadcaster.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdf68f19abdf4cf6639011f3c02bee6cea100364
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+
+#include <vector>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+
+// Tree-algorithm implementation of collective broadcast.
+class Broadcaster {
+ public:
+  Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, Tensor* output);
+
+  void Run(StatusCallback done);
+
+  // Returns the rank of the device from which this device should receive
+  // its value, -1 if no value should be received.
+  static int TreeRecvFrom(const CollectiveParams& cp);
+
+  // Populates targets with the ranks of the devices to which this device
+  // should forward the value.
+  static void TreeSendTo(const CollectiveParams& cp, std::vector<int>* targets);
+
+ private:
+  void DispatchSend(int dst_rank, const Tensor* src_tensor,
+                    const StatusCallback& done);
+  void DispatchRecv(int src_rank, Tensor* dst_tensor,
+                    const StatusCallback& done);
+  void RunTree();
+
+  Status status_;
+  CollectiveExecutor* col_exec_;  // Not owned
+  const DeviceMgr* dev_mgr_;      // Not owned
+  OpKernelContext* ctx_;          // Not owned
+  const CollectiveParams& col_params_;
+  const string exec_key_;
+  const int rank_;
+  const bool is_source_;
+  Tensor* output_;  // Not owned
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;
+  Device* device_;  // The device for which this instance labors
+  DeviceLocality device_locality_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
diff --git a/tensorflow/core/common_runtime/broadcaster_test.cc b/tensorflow/core/common_runtime/broadcaster_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89d39144b3d212f267846e4d48a075ff383e4e47
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster_test.cc
@@ -0,0 +1,741 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+static int64 kStepId = 123;
+static int32 kNumSubdivs = 1;  // Subdiv not yet meaningful for broadcast
+
+// The test harness won't allow a mixture of fixture and non-fixture
+// tests in one file, so this is a trival fixture for tests that don't
+// need the heavy-weight BroadcasterTest fixture.
+class TrivialTest : public ::testing::Test {
+ protected:
+  TrivialTest() {}
+};
+
+// Tests of static TreeSendTo() and TreeRecvFrom() functions.
+// D = number of devices
+// S = source rank
+// R = tested rank
+// RF = receive-from rank
+// ST = send_to rank vector
+#define DEF_TL_TEST(D, S, R, RF, ST)                               \
+  TEST_F(TrivialTest, TreeLinks_##D##Devs_##S##Source_##R##Rank) { \
+    CollectiveParams cp;                                           \
+    cp.group.group_size = D;                                       \
+    cp.instance.impl_details.subdiv_source_rank = {S};             \
+    cp.subdiv_rank = {R};                                          \
+    cp.is_source = (S == R);                                       \
+    EXPECT_EQ(RF, Broadcaster::TreeRecvFrom(cp));                  \
+    std::vector<int> expected = ST;                                \
+    std::vector<int> send_to;                                      \
+    Broadcaster::TreeSendTo(cp, &send_to);                         \
+    ASSERT_EQ(expected.size(), send_to.size());                    \
+    for (int i = 0; i < expected.size(); ++i) {                    \
+      EXPECT_EQ(expected[i], send_to[i]);                          \
+    }                                                              \
+  }
+
+#define V(...) std::vector<int>({__VA_ARGS__})
+
+//          D  S  R  RF  ST
+// 2 device cases
+DEF_TL_TEST(2, 0, 0, -1, V(1))
+DEF_TL_TEST(2, 1, 0, 1, V())
+DEF_TL_TEST(2, 0, 1, 0, V())
+DEF_TL_TEST(2, 1, 1, -1, V(0))
+// 3 device cases
+DEF_TL_TEST(3, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(3, 0, 1, 0, V())
+DEF_TL_TEST(3, 0, 2, 0, V())
+DEF_TL_TEST(3, 1, 0, 1, V(2))
+DEF_TL_TEST(3, 1, 1, -1, V(0))
+DEF_TL_TEST(3, 1, 2, 0, V())
+DEF_TL_TEST(3, 2, 0, 2, V())
+DEF_TL_TEST(3, 2, 1, 2, V())
+DEF_TL_TEST(3, 2, 2, -1, V(0, 1))
+// 4 device cases
+DEF_TL_TEST(4, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(4, 0, 1, 0, V(3))
+DEF_TL_TEST(4, 0, 2, 0, V())
+DEF_TL_TEST(4, 0, 3, 1, V())
+DEF_TL_TEST(4, 1, 0, 1, V(2, 3))
+DEF_TL_TEST(4, 1, 1, -1, V(0))
+DEF_TL_TEST(4, 1, 2, 0, V())
+DEF_TL_TEST(4, 1, 3, 0, V())
+DEF_TL_TEST(4, 2, 0, 2, V(3))
+DEF_TL_TEST(4, 2, 1, 2, V())
+DEF_TL_TEST(4, 2, 2, -1, V(0, 1))
+DEF_TL_TEST(4, 2, 3, 0, V())
+DEF_TL_TEST(4, 3, 0, 3, V(2))
+DEF_TL_TEST(4, 3, 1, 3, V())
+DEF_TL_TEST(4, 3, 2, 0, V())
+DEF_TL_TEST(4, 3, 3, -1, V(0, 1))
+// 8 device cases
+//          D  S  R  RF  ST
+DEF_TL_TEST(8, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(8, 0, 1, 0, V(3, 4))
+DEF_TL_TEST(8, 0, 2, 0, V(5, 6))
+DEF_TL_TEST(8, 0, 3, 1, V(7))
+DEF_TL_TEST(8, 0, 4, 1, V())
+DEF_TL_TEST(8, 0, 5, 2, V())
+DEF_TL_TEST(8, 0, 6, 2, V())
+DEF_TL_TEST(8, 0, 7, 3, V())
+DEF_TL_TEST(8, 7, 0, 7, V(2, 3))
+DEF_TL_TEST(8, 7, 1, 7, V(4, 5))
+DEF_TL_TEST(8, 7, 2, 0, V(6))
+DEF_TL_TEST(8, 7, 3, 0, V())
+DEF_TL_TEST(8, 7, 4, 1, V())
+DEF_TL_TEST(8, 7, 5, 1, V())
+DEF_TL_TEST(8, 7, 6, 2, V())
+DEF_TL_TEST(8, 7, 7, -1, V(0, 1))
+#undef DEF_TL_TEST
+#undef V
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+// TODO(tucker): factor out of this file and ring_reducer_test.cc
+// into a single common source.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      auto error = errors::Internal("Deliberate failure");
+      LOG(INFO) << "triggering failure " << error;
+      SchedNonBlockingClosureAfter(
+          1000, [this, error] { buf_rendezvous()->StartAbort(error); });
+      done(error);
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+class BroadcasterTest : public ::testing::Test {
+ protected:
+  BroadcasterTest() : device_type_(DEVICE_CPU) {}
+
+  ~BroadcasterTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name = strings::StrCat("/job:worker/replica:0/task:", wi,
+                                            "/device:CPU:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    col_params_.instance.data_type = dtype;
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    col_params_.instance.impl_details.subdiv_permutations.resize(kNumSubdivs);
+    col_params_.subdiv_rank.resize(kNumSubdivs);
+    int subdiv_stride = num_devices / kNumSubdivs;
+    for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    broadcast_dev_id_ = local_ring_order[0];
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/device:CPU:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name = strings::StrCat(task_name, "/device:GPU:0");
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  typedef std::function<void(Tensor*)> InitFunc;
+
+  void Broadcast() {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoBroadcast();
+        ++done;
+      });
+    }
+    while (done < instances_.size()) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                      const DeviceType& device_type,
+                                      DeviceBase* device) {
+    Status status;
+    std::unique_ptr<OpKernel> k = CreateOpKernel(
+        device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+        TF_GRAPH_DEF_VERSION, &status);
+    if (!status.ok()) {
+      LOG(FATAL) << status;
+    }
+    return k;
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastSend(
+      const CollectiveParams& params, Tensor* input,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_send_", bcast_send_counter_++),
+        "CollectiveBcastSend");
+    TF_CHECK_OK(builder.Attr("T", input->dtype())
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", input->shape())
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastRecv(
+      const CollectiveParams& params, const TensorShape& shape,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_recv_", bcast_recv_counter_++),
+        "CollectiveBcastRecv");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", shape)
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void BuildColParams() {}
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int tensor_len, int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, fail_after);
+
+    // Initialize each instance tensor with distinct values.
+    for (int di = 0; di < instances_.size(); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [di, dtype](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
+              float value = pow(10, static_cast<double>(di)) * i;
+              t->flat<T>()(i) = value;
+            }
+          });
+    }
+
+    // Copy the expected value from the broadcast source tensor
+    std::vector<T> expected(tensor_len, 0.0);
+    const CollectiveParams& cp = instances_[0]->col_params_;
+    int broadcast_dev_id =
+        cp.instance.impl_details.subdiv_permutations
+            [0][cp.instance.impl_details.subdiv_source_rank[0]];
+    const Tensor* t = &instances_[broadcast_dev_id]->tensor_;
+    Tensor cpu_copy(dtype, TensorShape({tensor_len}));
+    if (device_type == DEVICE_GPU) {
+      Notification notification;
+      Device* dev = instances_[broadcast_dev_id]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      CHECK(dev_info);
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          t, "" /*tensor_name*/, dev, &cpu_copy,
+          [this, &notification](Status s) {
+            TF_CHECK_OK(s);
+            notification.Notify();
+          });
+      notification.WaitForNotification();
+      t = &cpu_copy;
+    }
+    for (size_t i = 0; i < t->NumElements(); ++i) {
+      expected[i] = t->flat<T>()(i);
+    }
+
+    Broadcast();
+
+    // At this point all of the ops have terminated.
+    for (int di = 0; di < instances_.size(); ++di) {
+      if (!instances_[di]->status_.ok()) {
+        ASSERT_GT(fail_after, 0);
+        ASSERT_EQ(instances_[di]->status_.error_message(),
+                  "Deliberate failure");
+        mutex_lock l(mu_);
+        ++failure_count_;
+        continue;
+      }
+      Tensor* inst = &instances_[di]->tensor_;
+      Tensor actual(dtype, TensorShape({tensor_len}));
+      if (device_type_ == DEVICE_CPU) {
+        CHECK(actual.CopyFrom(*inst, inst->shape()));
+      } else if (device_type_ == DEVICE_GPU) {
+        Notification notification;
+        Device* dev = instances_[di]->device_;
+        auto* dev_info = dev->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyDeviceTensorToCPU(
+            inst, "" /*tensor_name*/, dev, &actual,
+            [this, &notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      }
+      for (int i = 0; i < tensor_len; ++i) {
+        switch (dtype) {
+          case DT_FLOAT:
+            EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_DOUBLE:
+            EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_INT32:
+          case DT_INT64:
+            EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          default:
+            LOG(FATAL) << "unimplemented";
+        }
+      }
+    }
+
+    // Note that the order of operations during broadcast is
+    // non-deterministic and unlike the reduce case some Ops in the
+    // instance may succeed while others fail, even if a transmission
+    // failure occurs early in the operation chain.  So, when an abort
+    // is specified we need to verify that at least one Op fails with
+    // the expected status and any Op that succeeds yeilds the correct
+    // value.
+    if (fail_after > 0) {
+      mutex_lock l(mu_);
+      EXPECT_GT(failure_count_, 0);
+    }
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, BroadcasterTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_));
+      col_params_.name = parent_->col_params_.name;
+      col_params_.instance.data_type = parent_->col_params_.instance.data_type;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.instance.instance_key =
+          parent_->col_params_.instance.instance_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance.device_names =
+          parent_->col_params_.instance.device_names;
+      col_params_.instance.task_names =
+          parent_->col_params_.instance.task_names;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.instance.impl_details.subdiv_permutations =
+          parent_->col_params_.instance.impl_details.subdiv_permutations;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size, col_params_.instance.device_names.size());
+      // Default rank is order in device_names.
+      col_params_.default_rank = rank;
+      // perm_rank is order in subdiv[0]:
+      int perm_rank = -1;
+      for (int i = 0;
+           i < col_params_.instance.impl_details.subdiv_permutations[0].size();
+           ++i) {
+        if (rank ==
+            col_params_.instance.impl_details.subdiv_permutations[0][i]) {
+          perm_rank = i;
+          break;
+        }
+      }
+      CHECK_GE(perm_rank, 0);
+      col_params_.instance.impl_details.subdiv_source_rank.resize(1, 0);
+      col_params_.is_source =
+          (perm_rank ==
+           col_params_.instance.impl_details.subdiv_source_rank[0]);
+      // Set rank in all subdivs by finding that default_rank.
+      for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+        for (int r = 0;
+             r <
+             col_params_.instance.impl_details.subdiv_permutations[sdi].size();
+             ++r) {
+          if (col_params_.default_rank ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            CHECK_EQ(0, sdi);
+            CHECK_EQ(perm_rank, col_params_.subdiv_rank[sdi]);
+            break;
+          }
+        }
+      }
+      CHECK_EQ(group_size, col_params_.task.is_local.size());
+      CHECK_EQ(group_size, col_params_.instance.task_names.size());
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const InitFunc& f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        f(&tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        f(&cpu_tensor);
+        Notification notification;
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [this, &notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoBroadcast() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = parent_->step_id_;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from[] = {0};
+      if (col_params_.is_source) {
+        op_params.forward_from_array = &forward_from[0];
+      }
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          col_params_.is_source
+              ? parent_->GetCollectiveBcastSend(col_params_, &tensor_,
+                                                DEVICE_CPU, device_)
+              : parent_->GetCollectiveBcastRecv(col_params_, tensor_.shape(),
+                                                DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      Tensor* output_tensor_ptr = nullptr;
+      if (col_params_.is_source) {
+        TF_CHECK_OK(ctx.forward_input_or_allocate_output(
+            {0}, 0, tensor_.shape(), &output_tensor_ptr));
+      } else {
+        TF_CHECK_OK(
+            ctx.allocate_output(0, tensor_.shape(), &output_tensor_ptr));
+      }
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a Broadcaster instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      Broadcaster broadcaster(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                              &op_params, col_params_, exec_key, kStepId,
+                              output_tensor_ptr);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      broadcaster.Run([this, &notification](Status s) {
+        status_ = s;
+        notification.Notify();
+      });
+      notification.WaitForNotification();
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    BroadcasterTest* parent_;
+    string dev_name_;
+    DeviceType device_type_ = DEVICE_CPU;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };  // class DeviceInstance
+
+  bool stop_ = false;
+  int64 step_id_ = kStepId;
+  int broadcast_dev_id_ = 0;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_ = nullptr;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<tensorflow::Device*> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int bcast_recv_counter_ GUARDED_BY(mu_) = 0;
+  int bcast_send_counter_ GUARDED_BY(mu_) = 0;
+  int failure_count_ GUARDED_BY(mu_) = 0;
+};
+
+// Tests of full broadcast algorithm, with different device and
+// data types.
+// B = data element type
+// T = device type
+// W = number of workers
+// D = number of devices per worker
+// L = tensor length
+// A = abort after count
+#define DEF_TEST(B, T, W, D, L, A)                                 \
+  TEST_F(BroadcasterTest,                                          \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A) { \
+    DataType dtype = DT_##B;                                       \
+    switch (dtype) {                                               \
+      case DT_FLOAT: {                                             \
+        RunTest<float>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      case DT_DOUBLE: {                                            \
+        RunTest<double>(dtype, DEVICE_##T, W, D, L, A);            \
+      } break;                                                     \
+      case DT_INT32: {                                             \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      case DT_INT64: {                                             \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      default:                                                     \
+        LOG(FATAL) << "Unimplemented";                             \
+    }                                                              \
+  }
+
+#ifndef GOOGLE_CUDA
+//       B      T    W  D  L  A
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 4095, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1045991, 0)
+
+DEF_TEST(DOUBLE, CPU, 2, 4, 128, 0)
+DEF_TEST(INT32, CPU, 2, 4, 128, 0)
+DEF_TEST(INT64, CPU, 2, 4, 128, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 1)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 5)
+#endif
+
+#ifdef GOOGLE_CUDA
+// Can only set W=1 for GPU tests.
+//       B      T    W  D  L  A
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 33, 0)
+DEF_TEST(FLOAT, GPU, 1, 3, 64, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1045991, 0)
+
+DEF_TEST(DOUBLE, GPU, 1, 8, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 8, 1001, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, GPU, 1, 8, 128, 6)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.cc b/tensorflow/core/common_runtime/buf_rendezvous.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b57eb2943a7fac79804587043e1958c279923daf
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous.cc
@@ -0,0 +1,166 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+
+namespace tensorflow {
+
+BufRendezvous::~BufRendezvous() {
+  mutex_lock l(mu_);
+  if (!hook_table_.empty()) {
+    PurgeTable(errors::Internal("Delete called on non-empty BufRendezvous"),
+               &hook_table_);
+  }
+}
+
+void BufRendezvous::StartAbort(const Status& s) {
+  CHECK(!s.ok());
+  HookTable dummy_table;
+  {
+    mutex_lock l(mu_);
+    status_.Update(s);
+    hook_table_.swap(dummy_table);
+  }
+  PurgeTable(s, &dummy_table);
+}
+
+void BufRendezvous::PurgeTable(const Status& s, HookTable* table) {
+  for (auto& it : *table) {
+    Hook* h = it.second;
+    if (h->cons_cb != nullptr) {
+      h->cons_cb(s, nullptr);
+    }
+    if (h->prod_cb != nullptr) {
+      h->prod_cb(s);
+    }
+    delete h;
+  }
+  table->clear();
+}
+
+string BufRendezvous::Hook::DebugString() const {
+  return strings::StrCat("[dev:", (prod_dev ? prod_dev->name() : "none"),
+                         ", ctx:", reinterpret_cast<uint64>(prod_ctx),
+                         ", val:", reinterpret_cast<uint64>(prod_value),
+                         ", pcb:", reinterpret_cast<uint64>(&prod_cb),
+                         ", ccb:", reinterpret_cast<uint64>(&cons_cb), "]");
+}
+
+void BufRendezvous::ProvideBuf(const string& key, Device* dev,
+                               DeviceContext* dev_ctx, const Tensor* v,
+                               const AllocatorAttributes& attr,
+                               const ProducerCallback& done) {
+  Hook* h = nullptr;
+  Status providebuf_status;
+  do {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      providebuf_status = status_;
+      break;
+    } else {
+      auto it = hook_table_.find(key);
+      if (it == hook_table_.end()) {
+        h = new Hook;
+        it = hook_table_.insert(std::make_pair(key, h)).first;
+      } else {
+        if (it->second->prod_cb != nullptr) {
+          providebuf_status = errors::Internal(
+              "BufRendezvous::ProvideBuf already called for key ", key);
+          break;
+        }
+        h = it->second;
+      }
+      // Populate Hook with all of the prod values.
+      h->prod_dev = dev;
+      h->prod_ctx = dev_ctx;
+      h->prod_value = v;
+      h->prod_attr = attr;
+      h->prod_cb = done;
+      // If consumer is waiting, kick off right away, removing Hook from table.
+      if (h->cons_cb != nullptr) {
+        hook_table_.erase(it);
+      } else {
+        h = nullptr;
+      }
+    }
+  } while (false);
+  if (h) {
+    h->cons_cb(Status::OK(), h);
+  }
+  if (!providebuf_status.ok()) {
+    done(providebuf_status);
+  }
+}
+
+void BufRendezvous::ConsumeBuf(const string& key,
+                               const ConsumerCallback& done) {
+  Hook* existing_hook = nullptr;
+  Status consumebuf_status;
+  do {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      consumebuf_status = status_;
+      break;
+    }
+    auto it = hook_table_.find(key);
+    if (it != hook_table_.end()) {
+      // Prepare to consume immediately.
+      if (it->second->cons_cb) {
+        consumebuf_status =
+            errors::Internal("Second consumer arrived for key ", key);
+        break;
+      }
+      existing_hook = it->second;
+      hook_table_.erase(it);
+      existing_hook->cons_cb = done;
+    } else {
+      // Hang consumer callback on the Hook.
+      Hook* h = new Hook;
+      hook_table_[key] = h;
+      h->cons_cb = done;
+      return;
+    }
+  } while (false);
+  if (existing_hook) {
+    existing_hook->cons_cb(Status::OK(), existing_hook);
+    return;
+  }
+  if (!consumebuf_status.ok()) {
+    done(consumebuf_status, nullptr);
+    return;
+  }
+}
+
+/*static*/
+void BufRendezvous::DoneWithHook(Hook* h) {
+  h->prod_cb(Status::OK());
+  delete h;
+}
+
+void BufRendezvous::LogContents() {
+  mutex_lock l(mu_);
+  LOG(INFO) << strings::StrCat("BufRendezvous ",
+                               strings::Hex(reinterpret_cast<uint64>(this)),
+                               " step_id=", step_id_, " current contents:");
+  for (auto it : hook_table_) {
+    LOG(INFO) << it.first << ":" << it.second->DebugString();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.h b/tensorflow/core/common_runtime/buf_rendezvous.h
new file mode 100644
index 0000000000000000000000000000000000000000..e94e88b323ec74a36948ffff4e5718f211efbbb6
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous.h
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+#define TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+
+#include <functional>
+#include <string>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+class Device;
+class DeviceContext;
+class Tensor;
+
+// EXPERIMENTAL: RDMA oriented producer/consumer rendezvous on a local
+// Tensor value for which DMAHelper::CanUseDMA() is true, i.e. dense
+// numeric types.  Similar to Rendezvous but never owns a Ref on the
+// tensor, instead it uses an explicit callback to the producer when
+// the consumer side is finished with the value.  This allows the
+// producer to perform in-place updates on the source buffer or to take
+// other actions that depend on knowing the consumer has passed a certain
+// execution point.
+class BufRendezvous {
+ public:
+  explicit BufRendezvous(uint64 step_id) : step_id_(step_id) {}
+
+  ~BufRendezvous();
+
+  // Inform all all waiting parties that this BufRendezvous is defunct
+  // because of an error Status interrupting the Step.
+  void StartAbort(const Status& s);
+
+  struct Hook;
+  // Provided by the consumer to be called when access to the buffer
+  // is available.  If the Status arg is not OK, then hook will not
+  // be populated.  Ownership of Hook passes to consumer with the
+  // callback.
+  typedef std::function<void(const Status&, Hook*)> ConsumerCallback;
+  // Provided by the producer to be called when the consumer has finished
+  // reading the buffer and will no longer access it.
+  typedef std::function<void(const Status&)> ProducerCallback;
+
+  struct Hook {
+    Device* prod_dev;
+    DeviceContext* prod_ctx;
+    const Tensor* prod_value;
+    AllocatorAttributes prod_attr;
+    ProducerCallback prod_cb;
+    ConsumerCallback cons_cb;
+    Hook()
+        : prod_dev(nullptr),
+          prod_ctx(nullptr),
+          prod_value(nullptr),
+          prod_cb(nullptr),
+          cons_cb(nullptr) {}
+    string DebugString() const;
+  };
+
+  // Called to advertise availability of a Tensor value corresponding
+  // to key.  That value must stay valid until done is called.
+  void ProvideBuf(const string& key, Device* dev, DeviceContext* dev_ctx,
+                  const Tensor* v, const AllocatorAttributes& attr,
+                  const ProducerCallback& done);
+
+  // Called to request access to a Tensor value corresponding to key.
+  // Consumer is provide with a Hook as soon as availble.
+  void ConsumeBuf(const string& key, const ConsumerCallback& done);
+
+  // Consumer must call this function when it's done reading the Hook provided
+  // by the ConsumerCallback.  This function will invoke the producer callback
+  // and then delete h.
+  static void DoneWithHook(Hook* h);
+
+  // Write the current contents of the table to the INFO log.
+  void LogContents();
+
+ protected:
+  const uint64 step_id_;
+  mutex mu_;
+  Status status_ GUARDED_BY(mu_);
+  typedef gtl::FlatMap<string, Hook*> HookTable;
+  HookTable hook_table_ GUARDED_BY(mu_);
+
+  void PurgeTable(const Status& s, HookTable* table);
+};
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e798235bf0649428409a2fa72ac3067736c347a
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -0,0 +1,197 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class BufRendezvousTest : public ::testing::Test {
+ protected:
+  BufRendezvousTest() {
+    br_.reset(new BufRendezvous(123));
+    fake_dev_ptr_ = reinterpret_cast<Device*>(512LLU);
+    fake_dev_ctx_ = reinterpret_cast<DeviceContext*>(1024LLU);
+    a_ = Tensor(DT_FLOAT, TensorShape({24}));
+    b_ = Tensor(DT_FLOAT, TensorShape({24}));
+  }
+
+  Device* fake_dev_ptr_ = nullptr;
+  DeviceContext* fake_dev_ctx_ = nullptr;
+  Tensor a_;
+  Tensor b_;
+  AllocatorAttributes aa_;
+  std::unique_ptr<BufRendezvous> br_;
+};
+
+TEST_F(BufRendezvousTest, CorrectUseProducerFirst) {
+  Status prod_status;
+  Status cons_status;
+  bool prod_callback_called = false;
+  bool cons_callback_called = false;
+  Notification note;
+  br_->ProvideBuf(
+      "key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+      [&note, &prod_status, &prod_callback_called](const Status& s) {
+        prod_status = s;
+        prod_callback_called = true;
+        note.Notify();
+      });
+  EXPECT_FALSE(prod_callback_called);
+  br_->ConsumeBuf("key0", [this, &cons_status, &cons_callback_called](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_callback_called = true;
+    ASSERT_TRUE(h != nullptr);
+    EXPECT_EQ(h->prod_dev, fake_dev_ptr_);
+    EXPECT_EQ(h->prod_ctx, fake_dev_ctx_);
+    EXPECT_EQ(h->prod_value, &a_);
+    br_->DoneWithHook(h);
+  });
+  EXPECT_TRUE(cons_callback_called);
+  note.WaitForNotification();
+  EXPECT_TRUE(prod_callback_called);
+  TF_EXPECT_OK(cons_status);
+  TF_EXPECT_OK(prod_status);
+}
+
+TEST_F(BufRendezvousTest, CorrectUseConsumerFirst) {
+  Status prod_status;
+  Status cons_status;
+  bool prod_callback_called = false;
+  bool cons_callback_called = false;
+  Notification note;
+  br_->ConsumeBuf("key0", [this, &cons_status, &cons_callback_called](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_callback_called = true;
+    ASSERT_TRUE(h != nullptr);
+    EXPECT_EQ(h->prod_dev, fake_dev_ptr_);
+    EXPECT_EQ(h->prod_ctx, fake_dev_ctx_);
+    EXPECT_EQ(h->prod_value, &a_);
+    br_->DoneWithHook(h);
+  });
+  EXPECT_FALSE(cons_callback_called);
+  br_->ProvideBuf(
+      "key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+      [&note, &prod_status, &prod_callback_called](const Status& s) {
+        prod_status = s;
+        prod_callback_called = true;
+        note.Notify();
+      });
+  EXPECT_TRUE(cons_callback_called);
+  note.WaitForNotification();
+  EXPECT_TRUE(prod_callback_called);
+  TF_EXPECT_OK(cons_status);
+  TF_EXPECT_OK(prod_status);
+}
+
+TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
+  bool prod_callback_called = false;
+  br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_callback_called](const Status& s) {
+                    prod_callback_called = true;
+                  });
+  Status bad_status;
+  Notification note;
+  br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [&bad_status, &note](const Status& s) {
+                    bad_status = s;
+                    note.Notify();
+                  });
+  note.WaitForNotification();
+  EXPECT_FALSE(bad_status.ok());
+  EXPECT_EQ("BufRendezvous::ProvideBuf already called for key key0",
+            bad_status.error_message());
+  EXPECT_FALSE(prod_callback_called);
+  br_.reset();
+}
+
+TEST_F(BufRendezvousTest, ErrorDeleteNonEmpty) {
+  Status cons_status;
+  br_->ConsumeBuf(
+      "key0", [this, &cons_status](const Status& s, BufRendezvous::Hook* h) {
+        cons_status = s;
+        EXPECT_EQ(h, nullptr);
+      });
+  EXPECT_TRUE(cons_status.ok());
+  br_.reset();
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ("Delete called on non-empty BufRendezvous",
+            cons_status.error_message());
+}
+
+TEST_F(BufRendezvousTest, AbortNonEmpty) {
+  Status cons_status;
+  Status prod_status;
+  Notification prod_note;
+  Notification cons_note;
+  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_note.Notify();
+  });
+  br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_note, &prod_status](const Status& s) {
+                    prod_status = s;
+                    prod_note.Notify();
+                  });
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+  prod_note.WaitForNotification();
+  cons_note.WaitForNotification();
+  EXPECT_FALSE(prod_status.ok());
+  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+}
+
+TEST_F(BufRendezvousTest, AbortEmpty) {
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+}
+
+TEST_F(BufRendezvousTest, UseAfterAbort) {
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+  Status cons_status;
+  Status prod_status;
+  Notification prod_note;
+  Notification cons_note;
+  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_note.Notify();
+  });
+  br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_note, &prod_status](const Status& s) {
+                    prod_status = s;
+                    prod_note.Notify();
+                  });
+  prod_note.WaitForNotification();
+  cons_note.WaitForNotification();
+  EXPECT_FALSE(prod_status.ok());
+  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/build_graph_options.cc b/tensorflow/core/common_runtime/build_graph_options.cc
index 811d459758a826c4c1b2d14cca7a96ea07d0a799..a9dc6ca6cda9443ae9737267aca5f361e492d22d 100644
--- a/tensorflow/core/common_runtime/build_graph_options.cc
+++ b/tensorflow/core/common_runtime/build_graph_options.cc
@@ -21,15 +21,15 @@ namespace tensorflow {
 
 string BuildGraphOptions::DebugString() const {
   string rv = "Feed endpoints: ";
-  for (auto& s : feed_endpoints) {
+  for (auto& s : callable_options.feed()) {
     strings::StrAppend(&rv, s, ", ");
   }
   strings::StrAppend(&rv, "\nFetch endpoints: ");
-  for (auto& s : fetch_endpoints) {
+  for (auto& s : callable_options.fetch()) {
     strings::StrAppend(&rv, s, ", ");
   }
   strings::StrAppend(&rv, "\nTarget nodes: ");
-  for (auto& s : target_nodes) {
+  for (auto& s : callable_options.target()) {
     strings::StrAppend(&rv, s, ", ");
   }
   return rv;
diff --git a/tensorflow/core/common_runtime/build_graph_options.h b/tensorflow/core/common_runtime/build_graph_options.h
index 5f0e8f170b9e9b0c6a3094e475fcc3bbf47756ea..5ca170e922ce348fb3f76b1129b22ca01804054c 100644
--- a/tensorflow/core/common_runtime/build_graph_options.h
+++ b/tensorflow/core/common_runtime/build_graph_options.h
@@ -19,25 +19,18 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/protobuf/debug.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 
 struct BuildGraphOptions {
-  std::vector<string> feed_endpoints;
-  std::vector<string> fetch_endpoints;
-
-  // TODO(vrv): Remove this when we unify target_nodes and fetch_endpoint,
-  // the former via "ref" fetch_endpoints.
-  std::vector<string> target_nodes;
+  CallableOptions callable_options;
 
   // If `true`, uses Arg/Retval to implement feeds/fetches; otherwise
   // uses Recv/Send to implement feeds/fetches.
   // TODO(mrry): Remove this when the distributed runtime supports Arg/Retval.
   bool use_function_convention = false;
 
-  DebugOptions debug_options;
-
   string DebugString() const;
 };
 
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e07829b286741e18db21e3c491973ec8f4b973dc
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/build_graph_options.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+CollectiveExecutorMgr::CollectiveExecutorMgr(
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    DeviceResolverInterface* dev_resolver,
+    ParamResolverInterface* param_resolver)
+    : dev_mgr_(dev_mgr),
+      dev_resolver_(dev_resolver),
+      param_resolver_(param_resolver) {}
+
+CollectiveExecutorMgr::~CollectiveExecutorMgr() {
+  for (auto iter : executor_table_) {
+    iter.second->Unref();
+  }
+}
+
+CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
+  CollectiveExecutor* ce = nullptr;
+  {
+    mutex_lock l(exec_mu_);
+    auto it = executor_table_.find(step_id);
+    if (it != executor_table_.end()) {
+      ce = it->second;
+    } else {
+      CollectiveRemoteAccessLocal* rma = new CollectiveRemoteAccessLocal(
+          dev_mgr_, dev_resolver_.get(), step_id);
+      ce = new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
+      executor_table_[step_id] = ce;
+    }
+    ce->Ref();
+  }
+  return ce;
+}
+
+void CollectiveExecutorMgr::Cleanup(int64 step_id) {
+  CollectiveExecutor* ce = nullptr;
+  {
+    mutex_lock l(exec_mu_);
+    auto it = executor_table_.find(step_id);
+    if (it != executor_table_.end()) {
+      ce = it->second;
+      executor_table_.erase(it);
+    }
+  }
+  if (ce) ce->Unref();
+}
+
+void CollectiveExecutorMgr::GetStepSequenceAsync(
+    const GetStepSequenceRequest* request, GetStepSequenceResponse* response,
+    const StatusCallback& done) {
+  done(errors::Internal(
+      "CollectiveExecutorMgr does not implement GetStepSequence."));
+}
+
+void CollectiveExecutorMgr::RefreshStepIdSequenceAsync(
+    int64 graph_key, const StatusCallback& done) {
+  done(errors::Internal(
+      "CollectiveExecutorMgr does not implement RefreshStepIdSequence."));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.h b/tensorflow/core/common_runtime/collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b42e2b4d16c5804e0660079c7a149442b47edb0
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.h
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+class ConfigProto;
+class DeviceMgr;
+
+class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
+ public:
+  CollectiveExecutorMgr(const ConfigProto& config, const DeviceMgr* dev_mgr,
+                        DeviceResolverInterface* dev_resolver,
+                        ParamResolverInterface* param_resolver);
+
+  virtual ~CollectiveExecutorMgr();
+
+  CollectiveExecutor* FindOrCreate(int64 step_id) override;
+
+  void Cleanup(int64 step_id) override;
+
+  ParamResolverInterface* GetParamResolver() const override {
+    return param_resolver_.get();
+  }
+
+  DeviceResolverInterface* GetDeviceResolver() const override {
+    return dev_resolver_.get();
+  }
+
+  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                            GetStepSequenceResponse* response,
+                            const StatusCallback& done) override;
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override;
+
+  int64 NextStepId(int64 graph_key) override {
+    return CollectiveExecutor::kInvalidId;
+  }
+
+  void RetireStepId(int64 graph_key, int64 step_id) override {}
+
+ protected:
+  const DeviceMgr* dev_mgr_;
+  std::unique_ptr<DeviceResolverInterface> dev_resolver_;
+  std::unique_ptr<ParamResolverInterface> param_resolver_;
+  CollectiveRemoteAccess* remote_access_;
+  string task_name_;
+  mutex exec_mu_;
+  // Map from step_id to CollectiveExecutor
+  gtl::FlatMap<int64, CollectiveExecutor*> executor_table_ GUARDED_BY(exec_mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34c9163d6a40ba47323afc306cc2803b643e1d8b
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class CollectiveExecutorMgrTest : public ::testing::Test {
+ protected:
+  CollectiveExecutorMgrTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    string task_name = "/job:localhost/replica:0/task:0";
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    DeviceResolverLocal* drl = new DeviceResolverLocal(device_mgr_.get());
+    cme_.reset(new CollectiveExecutorMgr(
+        cp, device_mgr_.get(), drl,
+        new CollectiveParamResolverLocal(device_mgr_.get(), drl, task_name)));
+  }
+
+  std::unique_ptr<CollectiveExecutorMgr> cme_;
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+};
+
+TEST_F(CollectiveExecutorMgrTest, FindOrCreate) {
+  CollectiveExecutor::Handle* h =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_TRUE(h->get());
+  CollectiveExecutor::Handle* h2 =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(h->get(), h2->get());
+  CollectiveExecutor* ce = h->get();
+  delete h;
+  delete h2;
+  CollectiveExecutor::Handle h3(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(ce, h3.get());
+  cme_->Cleanup(1);
+}
+
+TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
+  EXPECT_EQ(CollectiveExecutor::kInvalidId, cme_->NextStepId(123));
+  Notification ss_note;
+  Status ss_status;
+  cme_->RefreshStepIdSequenceAsync(
+      123, [this, &ss_status, &ss_note](const Status& s) {
+        ss_status = s;
+        ss_note.Notify();
+      });
+  ss_note.WaitForNotification();
+  EXPECT_FALSE(ss_status.ok());
+  EXPECT_EQ(ss_status.error_message(),
+            "CollectiveExecutorMgr does not implement RefreshStepIdSequence.");
+  Notification gs_note;
+  Status gs_status;
+  GetStepSequenceRequest* req = nullptr;
+  GetStepSequenceResponse* resp = nullptr;
+  cme_->GetStepSequenceAsync(req, resp,
+                             [this, &gs_status, &gs_note](const Status& s) {
+                               gs_status = s;
+                               gs_note.Notify();
+                             });
+  gs_note.WaitForNotification();
+  EXPECT_FALSE(gs_status.ok());
+  EXPECT_EQ(gs_status.error_message(),
+            "CollectiveExecutorMgr does not implement GetStepSequence.");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bdddf927d89152481e0d952ba1e08d68c7b38a09
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -0,0 +1,701 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+
+namespace tensorflow {
+
+CollectiveParamResolverLocal::CollectiveParamResolverLocal(
+    const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+    const string& task_name)
+    : dev_mgr_(dev_mgr), dev_resolver_(dev_resolver), task_name_(task_name) {}
+
+void CollectiveParamResolverLocal::CompleteGroupAsync(
+    const CompleteGroupRequest* request, CompleteGroupResponse* response,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
+  done(
+      errors::Internal("CompleteGroup is not implemented by "
+                       "CollectiveParamResolverLocal which is "
+                       "intended only for non-distributed deployment."));
+}
+
+void CollectiveParamResolverLocal::CompleteGroupLocal(
+    const string& device, CollectiveParams* cp, const GroupRecCallback& done) {
+  VLOG(1) << "CompleteGroupLocal " << cp << ": " << cp->ToString();
+  std::vector<StatusCallback> to_be_called;
+  GroupRec* gr = nullptr;
+  {
+    mutex_lock l(group_mu_);
+    auto it = group_table_.find(cp->group.group_key);
+    if (it == group_table_.end()) {
+      gr = new GroupRec;
+      gr->group.group_key = cp->group.group_key;
+      gr->group.group_size = cp->group.group_size;
+      gr->group.device_type = cp->group.device_type;
+      group_table_[gr->group.group_key].reset(gr);
+      VLOG(2) << "New group_key=" << gr->group.group_key
+              << " group_size=" << gr->group.group_size;
+    } else {
+      gr = it->second.get();
+    }
+  }
+  Status status;
+  {
+    mutex_lock gr_lock(gr->mu);
+    if (!gr->device_set.empty()) {
+      // Check for consistency with existing GroupRec.
+      if (cp->group.device_type != gr->group.device_type) {
+        status = errors::Internal(
+            "Collective Op ", cp->name, " is assigned to device ", device,
+            " with type ", cp->group.device_type.type_string(),
+            " and group_key ", cp->group.group_key, " but that group has type ",
+            gr->group.device_type.type_string());
+      } else if (cp->group.group_size != gr->group.group_size) {
+        status = errors::Internal(
+            "Collective Op ", cp->name, " has group_size ",
+            cp->group.group_size, " and group_key", cp->group.group_key,
+            " but that group has size ", gr->group.group_size);
+      }
+    }
+    if (status.ok()) {
+      // Insert device if not already present.
+      auto it = gr->device_set.find(device);
+      if (it == gr->device_set.end()) {
+        if (gr->device_set.size() == gr->group.group_size) {
+          // The group is already full.
+          status = errors::Internal(
+              "Collective Op ", cp->name, " is assigned to device ", device,
+              " and group_key ", cp->group.group_key,
+              " but that group doesn't contain that device.");
+        } else {
+          // This is a new device that has not yet joined the group.
+          gr->device_set.insert(device);
+          gr->device_list.push_back(device);
+          DeviceNameUtils::ParsedName parsed_device;
+          DeviceNameUtils::ParseFullName(device, &parsed_device);
+          string task_name = strings::StrCat("/job:", parsed_device.job,
+                                             "/replica:", parsed_device.replica,
+                                             "/task:", parsed_device.task);
+          gr->task_set.insert(task_name);
+          gr->task_list.push_back(task_name);
+          gr->group.num_tasks = static_cast<int32>(gr->task_set.size());
+          VLOG(1) << "group_key=" << gr->group.group_key
+                  << " group_size=" << gr->group.group_size
+                  << " dev_set=" << gr->device_set.size();
+        }
+      }
+    }
+
+    if (status.ok()) {
+      // If the group is not yet complete, queue to wait for it.
+      VLOG(2) << "group_size " << gr->group.group_size << " set size "
+              << gr->device_set.size() << " gr " << gr;
+
+      if (gr->device_set.size() < gr->group.group_size) {
+        gr->waiting.push_back(std::bind(done, std::placeholders::_1, gr));
+        return;
+      }
+      CHECK_EQ(gr->device_set.size(), gr->group.group_size);
+      if (!gr->waiting.empty()) {
+        std::swap(to_be_called, gr->waiting);
+      }
+    }
+  }
+  done(status, gr);
+  for (int i = 0; i < to_be_called.size(); ++i) {
+    to_be_called[i](Status::OK());
+  }
+}
+
+namespace {
+
+struct DevRec {
+  string task;
+  string device;
+  int original_rank;
+  int local_rank;
+  int global_rank;
+  const DeviceLocality* locality;
+};
+typedef std::unordered_map<string, DevRec> TaskDeviceMap;
+typedef std::unordered_map<string, TaskDeviceMap> GlobalDeviceMap;
+
+// Create a populated GlobalDeviceMap from CollInstanceParams and localities.
+GlobalDeviceMap BuildDevRecs(const CollInstanceParams& ip,
+                             const std::vector<DeviceLocality>& localities) {
+  GlobalDeviceMap gdm;
+  CHECK_EQ(ip.device_names.size(), ip.task_names.size());
+  CHECK_EQ(ip.device_names.size(), localities.size());
+  for (int i = 0; i < ip.device_names.size(); ++i) {
+    TaskDeviceMap& tdm = gdm[ip.task_names[i]];
+    DevRec* dr = &tdm[ip.device_names[i]];
+    dr->task = ip.task_names[i];
+    dr->device = ip.device_names[i];
+    dr->original_rank = i;
+    dr->local_rank = 0;   // Will be populated later by OrderTaskDeviceMap.
+    dr->global_rank = 0;  // Will be populated later by EstablishGlobalRank.
+    dr->locality = &localities[i];
+  }
+  return gdm;
+}
+
+void OrderTaskDeviceMap(TaskDeviceMap* tdm) {
+  CHECK_GT(tdm->size(), 0);  // Should never be called with 0 devices
+  int least_rank = -1;
+  string next_device;
+  std::set<string> selected;
+  // Starting device is one with the least initial rank.
+  for (const auto& it : *tdm) {
+    if (least_rank < 0 || it.second.original_rank < least_rank) {
+      least_rank = it.second.original_rank;
+      next_device = it.second.device;
+    }
+  }
+  CHECK_GE(least_rank, 0);
+  DeviceNameUtils::ParsedName parsed_name;
+  CHECK(DeviceNameUtils::ParseFullName(next_device, &parsed_name));
+  // NOTE: InterconnectLink has only a device_id, nothing more, so for
+  // the time being if there's more than one device at a task we
+  // assume they're all GPUs.
+
+  int next_rank = 0;
+  while (true) {
+    selected.insert(next_device);
+    DevRec* dr = &(*tdm)[next_device];
+    dr->local_rank = next_rank;
+    ++next_rank;
+    if (selected.size() == tdm->size()) {
+      break;
+    }
+    // For the present time we assume Locality links only cover GPUs.
+    // For multiple CPUs, just take them in order.
+    const InterconnectLink* best_link = nullptr;
+    if (parsed_name.type == "GPU") {
+      for (const InterconnectLink& il : dr->locality->links().link()) {
+        parsed_name.id = il.device_id();
+        string endpoint_device =
+            DeviceNameUtils::ParsedNameToString(parsed_name);
+        if (selected.find(endpoint_device) != selected.end()) {
+          continue;
+        }
+        if (best_link == nullptr || il.strength() > best_link->strength()) {
+          best_link = &il;
+        }
+      }
+    }
+    if (best_link != nullptr) {
+      // Follow the best edge
+      parsed_name.id = best_link->device_id();
+      next_device = DeviceNameUtils::ParsedNameToString(parsed_name);
+    } else {
+      // No good edges, alas. Pick the lowest initial rank among remaining
+      // devices.
+      least_rank = -1;
+      for (const auto& it : *tdm) {
+        if (selected.find(it.second.device) != selected.end()) {
+          continue;
+        }
+        if (least_rank < 0 || it.second.original_rank < least_rank) {
+          least_rank = it.second.original_rank;
+          next_device = it.second.device;
+        }
+      }
+      CHECK_GE(least_rank, 0);
+    }
+  }
+}
+
+// The first time a shared CollectiveParams is established for a
+// shared set of instances we compute a good rank order for all the
+// devices in the group, that is appropriate for a ring algorithm.
+// This order need not be the same across different instance groups
+// sharing the same device group where there is more than one good
+// order.
+GlobalDeviceMap EstablishGlobalRank(
+    CollectiveParams* cp, const std::vector<DeviceLocality>& localities) {
+  VLOG(1) << "EstablishGlobalRank";
+  GlobalDeviceMap gdm = BuildDevRecs(cp->instance, localities);
+  for (auto& iter : gdm) {
+    TaskDeviceMap& tdm = iter.second;
+    OrderTaskDeviceMap(&tdm);
+  }
+  // Connect the global rank order by the order in which tasks first appear.
+  std::set<string> ordered_tasks;
+  int next_rank = 0;
+  for (int i = 0; i < cp->instance.task_names.size(); ++i) {
+    const string& task_name = cp->instance.task_names[i];
+    if (ordered_tasks.find(task_name) != ordered_tasks.end()) {
+      continue;
+    }
+    ordered_tasks.insert(task_name);
+    TaskDeviceMap* tdm = &gdm[task_name];
+    for (auto& it : *tdm) {
+      it.second.global_rank = it.second.local_rank + next_rank;
+    }
+    next_rank += tdm->size();
+  }
+  return gdm;
+}
+
+// Count the devices associated with each task and set
+// cp->same_num_devices_per_task.  Requires cp->instance.task_names
+// be sorted.
+void SetDevPerTask(CollectiveParams* cp) {
+  cp->instance.same_num_devices_per_task = false;
+  if (cp->instance.task_names.empty()) return;
+  int dev_per_task = -1;
+  int count = 0;
+  const string* last_task_name = &cp->instance.task_names[0];
+  for (const string& task_name : cp->instance.task_names) {
+    if (task_name != *last_task_name) {
+      CHECK_GT(count, 0);
+      if (dev_per_task < 0) {
+        dev_per_task = count;
+      } else {
+        CHECK_GT(dev_per_task, 0);
+        if (count != dev_per_task) return;
+      }
+      count = 1;
+      last_task_name = &task_name;
+    } else {
+      ++count;
+    }
+  }
+  CHECK_GT(count, 0);
+  if ((dev_per_task > 0) && (count != dev_per_task)) {
+    return;
+  }
+  cp->instance.same_num_devices_per_task = true;
+  CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
+}
+
+// Sort cp->instance.device_names lexicographically, but do by first
+// computing a reordering permutation so we can keep cp->instance.task_names
+// in corresponding order.
+void SortDevicesAndTasks(CollectiveParams* cp) {
+  VLOG(1) << "SortDevicesAndTasks " << cp << " instance " << &cp->instance;
+  CHECK(cp);
+  CHECK_EQ(cp->group.group_size, cp->instance.device_names.size());
+  CHECK_EQ(cp->group.group_size, cp->instance.task_names.size());
+  std::vector<int> perm(cp->group.group_size);
+  // TODO(tucker): substitute std::iota when the windows build supports it.
+  // std::iota(perm.begin(), perm.end(), 0);
+  for (int i = 0; i < perm.size(); ++i) {
+    perm[i] = i;
+  }
+  std::sort(perm.begin(), perm.end(), [cp](const int& a, const int& b) {
+    return cp->instance.device_names[a] < cp->instance.device_names[b];
+  });
+  std::vector<string> new_devs;
+  std::vector<string> new_tasks;
+  new_devs.reserve(cp->group.group_size);
+  new_tasks.reserve(cp->group.group_size);
+  for (int pi : perm) {
+    new_devs.push_back(cp->instance.device_names[pi]);
+    new_tasks.push_back(cp->instance.task_names[pi]);
+  }
+  cp->instance.device_names = std::move(new_devs);
+  cp->instance.task_names = std::move(new_tasks);
+  VLOG(1) << "Modified device_names on " << cp;
+  SetDevPerTask(cp);
+}
+
+// Establish the requested number of subdivision permutations based on the
+// ring order implicit in the device order.
+void GenerateSubdivPerms(const string& device, int source_rank,
+                         CollectiveParams* cp) {
+  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
+  cp->instance.impl_details.subdiv_permutations.resize(
+      cp->instance.impl_details.subdiv_offsets.size());
+  // Each subdiv permutation is a ring formed by rotating each
+  // single-task subsequence of devices by an offset.  This makes most
+  // sense when each task has the same number of devices but we can't
+  // depend on that being the case so we'll compute something that
+  // works in any case.
+
+  // Start by counting the devices in each task.
+  // Precondition: device_names must be sorted so that all devices in
+  // the same task are adjacent.
+  VLOG(2) << "Sorted task names: "
+          << str_util::Join(cp->instance.task_names, ", ");
+  std::vector<int> dev_per_task;
+  const string* prior_task_name = &cp->instance.task_names[0];
+  int dev_count = 1;
+  for (int di = 1; di < cp->group.group_size; ++di) {
+    if (cp->instance.task_names[di] != *prior_task_name) {
+      dev_per_task.push_back(dev_count);
+      dev_count = 1;
+      prior_task_name = &cp->instance.task_names[di];
+    } else {
+      ++dev_count;
+    }
+  }
+  dev_per_task.push_back(dev_count);
+  CHECK_EQ(cp->group.num_tasks, dev_per_task.size());
+
+  // Generate a ring permutation for each requested offset.
+  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
+  VLOG(2) << "Setting up perms for cp " << cp << " subdiv_permutations "
+          << &cp->instance.impl_details.subdiv_permutations;
+  cp->instance.impl_details.subdiv_permutations.resize(
+      cp->instance.impl_details.subdiv_offsets.size());
+  cp->subdiv_rank.resize(cp->instance.impl_details.subdiv_offsets.size(), -1);
+  for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_offsets.size();
+       ++sdi) {
+    std::vector<int>& perm = cp->instance.impl_details.subdiv_permutations[sdi];
+    CHECK_EQ(perm.size(), 0);
+    int offset = cp->instance.impl_details.subdiv_offsets[sdi];
+    int prior_dev_count = 0;
+    for (int ti = 0; ti < cp->group.num_tasks; ++ti) {
+      for (int di = 0; di < dev_per_task[ti]; ++di) {
+        int offset_di = (di + offset) % dev_per_task[ti];
+        int permuted_di = prior_dev_count + offset_di;
+        perm.push_back(permuted_di);
+        if (cp->instance.device_names[prior_dev_count + di] == device) {
+          CHECK_EQ(prior_dev_count + di, cp->default_rank);
+          cp->subdiv_rank[sdi] = permuted_di;
+        }
+      }
+      prior_dev_count += dev_per_task[ti];
+    }
+    CHECK_EQ(cp->group.group_size, perm.size());
+  }
+
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    CHECK_GE(source_rank, 0);
+    cp->instance.impl_details.subdiv_source_rank.resize(
+        cp->instance.impl_details.subdiv_offsets.size(), -1);
+    for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_source_rank.size();
+         ++sdi) {
+      for (int j = 0; j < cp->group.group_size; ++j) {
+        if (cp->instance.impl_details.subdiv_permutations[sdi][j] ==
+            source_rank) {
+          cp->instance.impl_details.subdiv_source_rank[sdi] = j;
+          break;
+        }
+      }
+      CHECK_GE(cp->instance.impl_details.subdiv_source_rank[sdi], 0);
+    }
+  }
+
+  if (VLOG_IS_ON(1)) {
+    // Log the computed ring order for each subdiv.
+    string buf;
+    for (int sdi = 0;
+         sdi < cp->instance.impl_details.subdiv_permutations.size(); ++sdi) {
+      buf = strings::StrCat("Subdiv ", sdi, " device order:\n");
+      for (int di = 0;
+           di < cp->instance.impl_details.subdiv_permutations[sdi].size();
+           ++di) {
+        int idx = cp->instance.impl_details.subdiv_permutations[sdi][di];
+        strings::StrAppend(&buf, cp->instance.device_names[idx], "\n");
+      }
+      strings::StrAppend(&buf, " subdiv_offsets: ");
+      for (auto o : cp->instance.impl_details.subdiv_offsets)
+        strings::StrAppend(&buf, o, " ");
+      strings::StrAppend(&buf, " SubdivRank: ");
+      for (auto d : cp->subdiv_rank) strings::StrAppend(&buf, d, " ");
+      VLOG(1) << buf;
+    }
+  }
+}
+
+}  // namespace
+
+void CollectiveParamResolverLocal::CompleteTaskIsLocal(const string& task_name,
+                                                       CollectiveParams* cp) {
+  cp->task.is_local.resize(cp->group.group_size, false);
+  for (int i = 0; i < cp->group.group_size; ++i) {
+    cp->task.is_local[i] = (cp->instance.task_names[i] == task_name);
+  }
+}
+
+void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
+                                                  CollectiveParams* cp) {
+  CHECK_EQ(cp->group.group_size, cp->instance.device_names.size()) << cp;
+  for (int i = 0; i < cp->group.group_size; ++i) {
+    if (cp->instance.device_names[i] == device) {
+      cp->default_rank = i;
+      break;
+    }
+  }
+}
+
+Status CollectiveParamResolverLocal::InitInstanceSharedParams(
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
+  VLOG(1) << "InitInstanceSharedParams " << ir;
+  ir->shared.instance = cp->instance;
+  {
+    mutex_lock gl(gr->mu);
+    ir->shared.group = gr->group;
+    ir->shared.instance.device_names.assign(gr->device_list.begin(),
+                                            gr->device_list.end());
+    ir->shared.instance.task_names.assign(gr->task_list.begin(),
+                                          gr->task_list.end());
+    VLOG(2) << "Initialized names for instance: "
+            << ir->shared.instance.ToString();
+  }
+  ir->shared.default_rank = -1;
+
+  // Sort devce_names lexicographcally, keeping task_names in
+  // corresponding order.
+  SortDevicesAndTasks(&ir->shared);
+
+  // Get Locality data for all devices.
+
+  // Set is_local and task_names in *shared prior to invoking
+  // GetDeviceLocalitiesAsync.  In a distributed context this function can be
+  // called by a derived class, some of the devices may be non-local and
+  // GetDeviceLocalitiesAsync will use those fields to launch RPCs.
+  CompleteTaskIsLocal(task_name_, &ir->shared);
+  std::vector<DeviceLocality> localities;
+  Notification note;
+  Status status;
+  dev_resolver_->GetDeviceLocalitiesAsync(ir->shared.instance, &localities,
+                                          [&note, &status](const Status& s) {
+                                            status = s;
+                                            note.Notify();
+                                          });
+  note.WaitForNotification();
+  if (status.ok()) {
+    CompleteDefaultRanking(gr, cp, ir, localities);
+  }
+  return status;
+}
+
+void CollectiveParamResolverLocal::CompleteDefaultRanking(
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const std::vector<DeviceLocality>& localities) {
+  // Establish an instance-specific default rank order for devices
+  // based on localities.  This rank order should be a good ring
+  // order, if possible.
+  GlobalDeviceMap gdm = EstablishGlobalRank(&ir->shared, localities);
+  // Reflect the new global ranking on shared
+  size_t num_devices = ir->shared.group.group_size;
+  std::vector<string> new_device_names(num_devices, "");
+  std::vector<string> new_task_names(num_devices, "");
+  for (const auto& git : gdm) {
+    const TaskDeviceMap& tdm = git.second;
+    for (const auto& tit : tdm) {
+      const DevRec& dr = tit.second;
+      new_device_names[dr.global_rank] =
+          ir->shared.instance.device_names[dr.original_rank];
+      new_task_names[dr.global_rank] =
+          ir->shared.instance.task_names[dr.original_rank];
+    }
+  }
+
+  ir->shared.instance.device_names = new_device_names;
+  ir->shared.instance.task_names = new_task_names;
+  if (VLOG_IS_ON(2)) {
+    string buf;
+    for (const auto& d : cp->instance.device_names)
+      strings::StrAppend(&buf, "\n", d);
+    VLOG(2) << "Optimized device order for " << ir->shared.name << ": " << buf;
+  }
+}
+
+void CollectiveParamResolverLocal::CallbackWithStatus(
+    const InstanceRecCallback& done, InstanceRec* irec) {
+  Status s;
+  {
+    mutex_lock l(irec->out_mu);
+    s = irec->status;
+  }
+  done(s, irec);
+}
+
+void CollectiveParamResolverLocal::FindInstanceRec(
+    const GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
+  InstanceRec* irec = nullptr;
+  bool exit_outside_locks = false;
+  {
+    mutex_lock l(instance_mu_);
+    auto it = instance_table_.find(cp->instance.instance_key);
+    if (it != instance_table_.end()) {
+      irec = it->second.get();
+      {
+        mutex_lock l(irec->in_mu);
+        if (irec->is_init) {
+          exit_outside_locks = true;
+        } else {
+          irec->init_waiters.push_back([this, gr, cp, done](InstanceRec* irec) {
+            CallbackWithStatus(done, irec);
+          });
+          return;
+        }
+      }
+    } else {
+      // Create new InstanceRec.
+      irec = new InstanceRec;
+      instance_table_[cp->instance.instance_key].reset(irec);
+    }
+  }
+  if (exit_outside_locks) {
+    CallbackWithStatus(done, irec);
+    return;
+  }
+  // Initialize the new InstanceRec while holding out_mu.
+  {
+    mutex_lock il(irec->out_mu);
+    irec->known.resize(cp->group.group_size, false);
+    irec->status = InitInstanceSharedParams(gr, cp, irec);
+  }
+  // Prepare to invoke any waiters that accumlated during initialization.
+  std::vector<IRConsumer> init_waiters;
+  {
+    mutex_lock tl(instance_mu_);
+    {
+      mutex_lock l(irec->in_mu);
+      irec->is_init = true;
+      if (!irec->init_waiters.empty()) {
+        std::swap(init_waiters, irec->init_waiters);
+      }
+    }
+  }
+  CallbackWithStatus(done, irec);
+  for (auto& f : init_waiters) {
+    f(irec);
+  }
+}
+
+void CollectiveParamResolverLocal::CompleteParamsAsync(
+    const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
+    const StatusCallback& done) {
+  VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
+          << cp->ToString();
+  CompleteGroupLocal(
+      device, cp,
+      [this, device, cp, done](const Status& s, const GroupRec* gr) {
+        if (s.ok()) {
+          CompleteInstanceLocal(device, gr, cp, cp->is_source, done);
+        } else {
+          done(s);
+        }
+      });
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceAsync(
+    const CompleteInstanceRequest* request, CompleteInstanceResponse* response,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
+  done(
+      errors::Internal("CompleteInstance is not implemented by "
+                       "CollectiveParamResolverLocal which is "
+                       "intended only for non-distributed deployment."));
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceLocal(
+    const string& device, const GroupRec* gr, CollectiveParams* cp,
+    bool is_source, const StatusCallback& done) {
+  VLOG(1) << "CompleteInstanceLocal " << device
+          << " instance_key: " << cp->instance.instance_key << " gr " << gr;
+
+  // Populate the group portion of *cp from *gr.  Most of it should already
+  // match.
+  DCHECK_EQ(cp->group.group_key, gr->group.group_key);
+  DCHECK_EQ(cp->group.group_size, gr->group.group_size);
+  DCHECK_EQ(cp->group.device_type, gr->group.device_type);
+  cp->group = gr->group;
+
+  // Get the shared InstanceRec for this instance.
+  FindInstanceRec(gr, cp,
+                  [this, device, gr, cp, is_source, done](const Status& s,
+                                                          InstanceRec* ir) {
+                    if (s.ok()) {
+                      CompleteInstanceFromInitializedIRec(device, gr, cp, ir,
+                                                          is_source, done);
+                    } else {
+                      done(s);
+                    }
+                  });
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
+    const string& device, const GroupRec* gr, CollectiveParams* cp,
+    InstanceRec* ir, bool is_source, const StatusCallback& done) {
+  // Populate the fields common across instance.
+  {
+    mutex_lock l(ir->out_mu);
+    // custom operator= does a deep copy.
+    cp->instance = ir->shared.instance;
+  }
+  // Populate the fields common across task, also default_rank.
+  SetDefaultRank(device, cp);
+  CompleteTaskIsLocal(task_name_, cp);
+  // If broadcast, may need to wait for source discovery.
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    CompleteInstanceSource(ir, cp, is_source,
+                           [this, ir, device, cp, done](InstanceRec* irec) {
+                             CHECK_EQ(ir, irec);
+                             Status s;
+                             int source_rank;
+                             {
+                               mutex_lock l(irec->out_mu);
+                               s = irec->status;
+                               source_rank = ir->source_rank;
+                             }
+                             if (s.ok()) {
+                               GenerateSubdivPerms(device, source_rank, cp);
+                             }
+                             done(s);
+                           });
+    return;
+  } else {
+    GenerateSubdivPerms(device, 0, cp);
+  }
+  done(Status::OK());
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
+                                                          CollectiveParams* cp,
+                                                          bool is_source,
+                                                          const IRConsumer& f) {
+  std::vector<IRConsumer> ready_waiters;
+  {
+    mutex_lock l(ir->out_mu);
+    CHECK_EQ(cp->group.group_size, ir->known.size());
+    CHECK_GE(cp->default_rank, 0);
+    if (!ir->known[cp->default_rank]) {
+      ir->known[cp->default_rank] = true;
+      ++ir->known_count;
+      if (is_source) {
+        if (ir->source_rank >= 0) {
+          ir->status = errors::Internal("Instance ", cp->instance.instance_key,
+                                        " already has source ", ir->source_rank,
+                                        ", recevied second claim from ",
+                                        cp->default_rank);
+        } else {
+          ir->source_rank = cp->default_rank;
+        }
+      }
+    }
+    if (ir->known_count < ir->shared.group.group_size) {
+      ir->known_waiters.push_back(f);
+      return;
+    }
+    CHECK_EQ(ir->known_count, ir->shared.group.group_size);
+    CHECK_GE(ir->source_rank, 0);
+    if (!ir->known_waiters.empty()) {
+      ready_waiters = std::move(ir->known_waiters);
+    }
+  }
+  f(ir);
+  for (auto& f : ready_waiters) {
+    f(ir);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b2946e9368518f7e19a9507eeda913d5e1202a9
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -0,0 +1,211 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+class CompleteGroupRequest;
+class CompleteGroupResponse;
+class CompleteInstanceRequest;
+class CompleteInstanceResponse;
+class DeviceMgr;
+
+// Implements ParamResolverInterface for a single-task context.
+// It also implements the functionality necessary to serve as the
+// group leader for param resolution in a multi-task context.
+class CollectiveParamResolverLocal : public ParamResolverInterface {
+ public:
+  CollectiveParamResolverLocal(const DeviceMgr* dev_mgr,
+                               DeviceResolverInterface* dev_resolver,
+                               const string& task_name);
+
+  ~CollectiveParamResolverLocal() override {}
+
+  void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                           CancellationManager* cancel_mgr,
+                           const StatusCallback& done) override;
+
+  void CompleteGroupAsync(const CompleteGroupRequest* request,
+                          CompleteGroupResponse* response,
+                          CancellationManager* cancel_mgr,
+                          const StatusCallback& done) override;
+
+  void CompleteInstanceAsync(const CompleteInstanceRequest* request,
+                             CompleteInstanceResponse* response,
+                             CancellationManager* cancel_mgr,
+                             const StatusCallback& done) override;
+
+ protected:
+  // Used to complete/verify CollGroup.
+  struct GroupRec {
+    CollGroupParams group;
+    mutable mutex mu;
+    Status status GUARDED_BY(mu);
+    std::set<string> device_set GUARDED_BY(mu);
+    std::vector<string> device_list GUARDED_BY(mu);
+    std::set<string> task_set GUARDED_BY(mu);
+    std::vector<string> task_list GUARDED_BY(mu);
+    std::vector<StatusCallback> waiting GUARDED_BY(mu);
+  };
+
+  // Finds the GroupRec that corresponds to cp->group_key.
+  // Also populates cp->group from that group_rec.
+  // Will wait until GroupRec is fully populated or an error arises before
+  // calling done.  Callback GroupRec* arg is only valid if status is ok.
+  // Ownership of GroupRec stays with this object and does not pass to the
+  // callback.
+  typedef std::function<void(const Status& s, const GroupRec* gr)>
+      GroupRecCallback;
+  void CompleteGroupLocal(const string& device, CollectiveParams* cp,
+                          const GroupRecCallback& done)
+      LOCKS_EXCLUDED(group_mu_);
+
+  // Used to complete/verify CollInstance.
+  struct InstanceRec;
+  typedef std::function<void(InstanceRec*)> IRConsumer;
+  struct InstanceRec {
+    // This structure has two mutexes so that a possibly long
+    // initialization can be done without holding the instance_mu_
+    // table lock the whole time (which can cause an excessive number
+    // of threads to block on it), and because the compiler may not
+    // permit mutex locks to be taken in more than one order.
+    //
+    // out_mu guards access to most of the fields.
+    // in_mu guards access to a queue of comsumer callbacks wanting to
+    // read the fields guarded by out_mu.
+    //
+    // The in_mu should be locked only while holding instance_mu_; the
+    // out_mu should be locked only while not holding
+    // instance_mu_.
+    //
+    // When is_init is false (the initial value) any potential user
+    // other than the creator should queue a callback on init_waiters.
+    // As soon as the shared member of this structure is fully
+    // initialized is_init will be set true and those callbacks will
+    // be invoked.
+    //
+    // Once inserted in the table this structure will never be replaced
+    // so users can capture the pointer while holding instance_mu_,
+    // drop that lock, then take a lock on out_mu before
+    // reading/modifying its values.
+    mutex in_mu;
+    bool is_init GUARDED_BY(in_mu);
+    std::vector<IRConsumer> init_waiters GUARDED_BY(in_mu);
+
+    // Values to be shared by all instances, constant after initialization.
+    mutex out_mu;
+    CollectiveParams shared GUARDED_BY(out_mu);
+    // If an error occurs during initialization this structure stays in
+    // the table with a non-OK status.  Purging the table and restarting
+    // needs to be done at a higher level.
+    Status status GUARDED_BY(out_mu);
+
+    // These fields are used to count the instances that have called
+    // in and become known while resolving broadcast source identity.
+    int source_rank GUARDED_BY(out_mu);
+    int known_count GUARDED_BY(out_mu);
+    std::vector<bool> known GUARDED_BY(out_mu);
+    std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu);
+
+    InstanceRec() : is_init(false), source_rank(-1), known_count(0) {}
+  };
+
+  // Find the InstanceRec with the same instance_key as cp.  If it doesn't
+  // already exist, create and initialize from gr and cp.
+  //
+  // Precondition: *gr must be a complete GroupRec, i.e. the value set
+  // by CompleteGroupLocal. *cp must be populated with all the fields
+  // required by InitInstanceSharedParams.  Ownership of InstanceRec stays
+  // with this object and does not pass to the callback.
+  typedef std::function<void(const Status& s, InstanceRec* ir)>
+      InstanceRecCallback;
+  void FindInstanceRec(const GroupRec* gr, CollectiveParams* cp,
+                       const InstanceRecCallback& done)
+      LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
+
+  // Populate *ir with device membership from gr, then initialize to be specific
+  // to cp->instance_key, i.e. order the devices and tasks.
+  //
+  // Preconditions:
+  //  cp is populated with all DeviceLocalities
+  Status InitInstanceSharedParams(const GroupRec* gr,
+                                  const CollectiveParams* cp, InstanceRec* ir)
+      EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) LOCKS_EXCLUDED(gr->mu);
+
+  // Establishes the final order of ir->shared.instance.device_names and
+  // ir->shared.instance.task_names by considering localities of all devices.
+  void CompleteDefaultRanking(const GroupRec* gr, const CollectiveParams* cp,
+                              InstanceRec* ir,
+                              const std::vector<DeviceLocality>& localities)
+      EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu);
+
+  // Finish populating *cp.
+  // Precondition: *gr has been fully populated by CompleteGroupLocal.
+  void CompleteInstanceLocal(const string& device, const GroupRec* gr,
+                             CollectiveParams* cp, bool is_source,
+                             const StatusCallback& done)
+      LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
+
+  // Finish populating *cp from fully initialized *ir.
+  // Precondition: *gr and *ir are fully populated.
+  void CompleteInstanceFromInitializedIRec(const string& device,
+                                           const GroupRec* gr,
+                                           CollectiveParams* cp,
+                                           InstanceRec* ir, bool is_source,
+                                           const StatusCallback& done)
+      LOCKS_EXCLUDED(ir->out_mu);
+
+  // Complete source data for a broadcast instance.
+  // Precondition: *cp has complete group data and default_rank.
+  void CompleteInstanceSource(InstanceRec* ir, CollectiveParams* cp,
+                              bool is_source, const IRConsumer& f)
+      LOCKS_EXCLUDED(ir->out_mu);
+
+  // If cp.device_names contains only devices local to this process
+  // populates *localities, else returns an error.
+  Status GetLocalDeviceLocalities(const CollectiveParams& cp,
+                                  std::vector<DeviceLocality>* localities);
+
+  // Sets CollTaskParams.is_local and CollectiveParams.default_rank.
+  // Precondition: cp->device_names is fully populated and in final order.
+  void CompleteTaskIsLocal(const string& task_name, CollectiveParams* cp);
+
+  // Sets cp->instance_default_rank according to location of device in
+  // current ordering of cp->instance.device_names.
+  void SetDefaultRank(const string& device, CollectiveParams* cp);
+
+  // Helper to grab status under lock, invoke callback out of lock.
+  void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
+      LOCKS_EXCLUDED(irec->out_mu);
+
+  const DeviceMgr* dev_mgr_;
+  DeviceResolverInterface* dev_resolver_;
+  string task_name_;
+  mutex group_mu_;
+  gtl::FlatMap<int32, std::unique_ptr<GroupRec>> group_table_
+      GUARDED_BY(group_mu_);
+  mutex instance_mu_;
+  gtl::FlatMap<int32, std::unique_ptr<InstanceRec>> instance_table_
+      GUARDED_BY(instance_mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e33c4779a3fb78bcb8d850d834060dfc69c6df5
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class CollectiveParamResolverLocalTest : public ::testing::Test {
+ protected:
+  CollectiveParamResolverLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    string task_name = "/job:localhost/replica:0/task:0";
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
+                                                task_name));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+  std::unique_ptr<CollectiveParamResolverLocal> prl_;
+};
+
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    cp->group.group_key = 1;
+    cp->group.group_size = 3;
+    cp->group.device_type = DeviceType("CPU");
+    cp->group.num_tasks = 1;
+    cp->instance.instance_key = 7;
+    cp->instance.type = REDUCTION_COLLECTIVE;
+    cp->instance.data_type = DataType(DT_FLOAT);
+    cp->instance.shape = TensorShape({5});
+    cp->instance.device_names.push_back(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
+    cp->instance.impl_details.subdiv_offsets.push_back(0);
+    cp->is_source = false;
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [this, &statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    TF_ASSERT_OK(statuses[i]);
+    ASSERT_EQ(cps[i].instance.device_names.size(), 3);
+    for (int j = 0; j < NUM_DEVS; ++j) {
+      EXPECT_EQ(
+          strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", j),
+          cps[i].instance.device_names[j]);
+      EXPECT_TRUE(cps[i].task.is_local[j]);
+    }
+    EXPECT_EQ(cps[i].subdiv_rank[0], i);
+    EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank.size(), 0);
+    EXPECT_FALSE(cps[i].is_source);
+    EXPECT_EQ(cps[i].default_rank, i);
+    EXPECT_TRUE(cps[i].instance.same_num_devices_per_task);
+  }
+}
+
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    cp->group.group_key = 1;
+    cp->group.group_size = 3;
+    cp->group.device_type = DeviceType("CPU");
+    cp->group.num_tasks = 1;
+    cp->instance.instance_key = 3;
+    cp->instance.type = BROADCAST_COLLECTIVE;
+    cp->instance.data_type = DataType(DT_FLOAT);
+    cp->instance.shape = TensorShape({5});
+    cp->instance.device_names.push_back(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
+    cp->instance.impl_details.subdiv_offsets.push_back(0);
+    cp->is_source = (i == 1);
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [this, &statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    TF_ASSERT_OK(statuses[i]);
+    ASSERT_EQ(cps[i].instance.device_names.size(), 3);
+    for (int j = 0; j < NUM_DEVS; ++j) {
+      EXPECT_EQ(
+          strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", j),
+          cps[i].instance.device_names[j]);
+      EXPECT_TRUE(cps[i].task.is_local[j]);
+    }
+    ASSERT_GT(cps[i].subdiv_rank.size(), 0);
+    EXPECT_EQ(cps[i].subdiv_rank[0], i);
+    ASSERT_GT(cps[i].instance.impl_details.subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank[0], 1);
+    EXPECT_EQ(cps[i].is_source, (i == 1));
+    EXPECT_EQ(cps[i].default_rank, i);
+    EXPECT_TRUE(cps[i].instance.same_num_devices_per_task);
+  }
+}
+
+// TEST_F(CollectiveParamResolverLocalTest,
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ad9b32ce3514dcfb29662d781ca6f1febd406c89
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+
+namespace tensorflow {
+
+void CollectiveRemoteAccessLocal::StartAbort(const Status& s) {
+  buf_rendezvous_.StartAbort(s);
+}
+
+void CollectiveRemoteAccessLocal::RecvFromPeer(
+    const string& peer_device, const string& peer_task, bool peer_is_local,
+    const string& key, Device* to_device, DeviceContext* to_device_ctx,
+    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  VLOG(1) << "RecvFromPeer " << this << " from " << peer_device << " key "
+          << key;
+  if (!peer_is_local) {
+    done(
+        errors::Internal("CollectiveRemoteAccessLocal::RecvFromPeer "
+                         "called with peer_is_local=false"));
+    return;
+  }
+  buf_rendezvous_.ConsumeBuf(
+      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr, done](
+               const Status& s, BufRendezvous::Hook* hook) {
+        if (!s.ok()) {
+          done(s);
+          delete hook;
+        } else {
+          int64 recv_bytes = to_tensor->TotalBytes();
+          CHECK_EQ(recv_bytes, hook->prod_value->TotalBytes());
+          MemCpyAsync(hook->prod_ctx,    // src DeviceContext
+                      to_device_ctx,     // dst DeviceContext
+                      hook->prod_dev,    // src Device
+                      to_device,         // dst Device
+                      hook->prod_attr,   // src AllocatorAttributes
+                      to_alloc_attr,     // dst AllocatorAttributes
+                      hook->prod_value,  // src Tensor*
+                      to_tensor,         // dst Tensor*
+                      [hook, done](const Status& s) {
+                        done(s);
+                        hook->prod_cb(s);
+                        delete hook;
+                      });
+        }
+      });
+}
+
+void CollectiveRemoteAccessLocal::PostToPeer(
+    const string& peer_device, const string& peer_task, const string& key,
+    Device* from_device, DeviceContext* from_device_ctx,
+    const AllocatorAttributes& from_alloc_attr, const Tensor* from_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  VLOG(1) << "PostToPeer " << this << " key " << key
+          << " step_id_=" << step_id_;
+  buf_rendezvous_.ProvideBuf(key, from_device, from_device_ctx, from_tensor,
+                             from_alloc_attr, done);
+}
+
+/*static*/
+void CollectiveRemoteAccessLocal::MemCpyAsync(
+    DeviceContext* src_dev_ctx, DeviceContext* dst_dev_ctx, Device* src_dev,
+    Device* dst_dev, const AllocatorAttributes& src_attr,
+    const AllocatorAttributes& dst_attr, const Tensor* src, Tensor* dst,
+    const StatusCallback& done) {
+  // We want a real copy to happen, i.e. the bytes inside of src should be
+  // transferred to the buffer backing dst.  If src and dst are on different
+  // devices then CopyTensor::ViaDMA will do just that.  But if they're both
+  // the same CPU, then it will actually just reset dst to point to src.
+  // Since this routine is used for copying between devices and within a
+  // device, we need to detect and bypass the wrong-semantics case.
+  const DeviceType src_device_type(
+      src_attr.on_host() ? DEVICE_CPU : src_dev->attributes().device_type());
+  const DeviceType dst_device_type(
+      dst_attr.on_host() ? DEVICE_CPU : dst_dev->attributes().device_type());
+  const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+  const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+  if (non_cpu_src) CHECK(src_dev_ctx);
+  if (non_cpu_dst) CHECK(dst_dev_ctx);
+  if (non_cpu_src || non_cpu_dst) {
+    CopyTensor::ViaDMA("",  // edge name (non-existent)
+                       src_dev_ctx, dst_dev_ctx, src_dev, dst_dev, src_attr,
+                       dst_attr, src, dst, done);
+  } else {
+    int64 bytes = src->TotalBytes();
+    DCHECK_EQ(dst->TotalBytes(), bytes);
+    memcpy(DMAHelper::base(dst), DMAHelper::base(src), bytes);
+    done(Status::OK());
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..716e23bfa16e7bc76cd82b993ca458378ebaf5f1
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/rendezvous.h"
+
+namespace tensorflow {
+
+// Basic implementation of PerStepCollectiveRemoteAccess.
+class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess {
+ public:
+  CollectiveRemoteAccessLocal(const DeviceMgr* dev_mgr,
+                              DeviceResolverInterface* dev_resolver,
+                              int64 step_id)
+      : dev_mgr_(dev_mgr),
+        dev_resolver_(dev_resolver),
+        buf_rendezvous_(step_id),
+        step_id_(step_id) {}
+
+  virtual ~CollectiveRemoteAccessLocal() {}
+
+  void StartAbort(const Status& s);
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override;
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override;
+
+  void GetDeviceLocalitiesAsync(const CollInstanceParams& ci_params,
+                                std::vector<DeviceLocality>* localities,
+                                const StatusCallback& done) override {
+    dev_resolver_->GetDeviceLocalitiesAsync(ci_params, localities, done);
+  }
+
+  void GetLocalityAsync(const string& device, const string& task,
+                        DeviceLocality* locality,
+                        const StatusCallback& done) override {
+    dev_resolver_->GetLocalityAsync(device, task, locality, done);
+  }
+
+  void ClearTask(const string& task) override {
+    dev_resolver_->ClearTask(task);
+  }
+
+  BufRendezvous* buf_rendezvous() override { return &buf_rendezvous_; }
+
+  // Copy utility that always copies bytes from src to dst even if
+  // they are on the same device, unlike CopyTensor::ViaDMA which will
+  // just change the dst buffer pointer in that case.
+  static void MemCpyAsync(DeviceContext* src_dev_ctx,
+                          DeviceContext* dst_dev_ctx, Device* src_dev,
+                          Device* dst_dev, const AllocatorAttributes& src_attr,
+                          const AllocatorAttributes& dst_attr,
+                          const Tensor* src, Tensor* dst,
+                          const StatusCallback& done);
+
+ protected:
+  const DeviceMgr* dev_mgr_;               // not owned
+  DeviceResolverInterface* dev_resolver_;  // not owned
+  BufRendezvous buf_rendezvous_;
+  int64 step_id_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcd4272d96b5f855660509bf69de4585128f836c
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+static const int kStepId = 123;
+
+class CollectiveRemoteAccessLocalTest : public ::testing::Test {
+ protected:
+  const string kTaskName = "/job:localhost/replica:0/task:0";
+
+  CollectiveRemoteAccessLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
+                                                kTaskName));
+    rma_.reset(new CollectiveRemoteAccessLocal(device_mgr_.get(), drl_.get(),
+                                               kStepId));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+  std::unique_ptr<CollectiveParamResolverLocal> prl_;
+  std::unique_ptr<CollectiveRemoteAccessLocal> rma_;
+};
+
+TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
+  Device* cpu0 = nullptr;
+  AllocatorAttributes attr;
+  DeviceLocality dev_locality;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:0", &cpu0));
+  Tensor sink_tensor(DT_FLOAT, TensorShape({8}));
+  Notification recv_note;
+  Status recv_status;
+  rma_->RecvFromPeer(kTaskName + "/device:CPU:0", kTaskName, true /*is_local*/,
+                     "key_0", cpu0 /*to_device*/, nullptr /*to_device_ctx*/,
+                     attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
+                     [this, &recv_note, &recv_status](const Status& s) {
+                       recv_status = s;
+                       recv_note.Notify();
+                     });
+  Tensor source_tensor(DT_FLOAT, TensorShape({8}));
+  for (int i = 0; i < 8; ++i) {
+    source_tensor.flat<float>()(i) = i / 2;
+  }
+  // Tensors have distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+  Notification send_note;
+  Status send_status;
+  rma_->PostToPeer(kTaskName + "/device:CPU:0", kTaskName, "key_0",
+                   cpu0 /*from_device*/, nullptr /*from_device_ctx*/,
+                   attr /*to_alloc_attr*/, &source_tensor, dev_locality,
+                   [this, &send_note, &send_status](const Status& s) {
+                     send_status = s;
+                     send_note.Notify();
+                   });
+  recv_note.WaitForNotification();
+  send_note.WaitForNotification();
+  TF_EXPECT_OK(recv_status);
+  TF_EXPECT_OK(send_status);
+  // Sink tensor gets the source tensor values.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(sink_tensor.flat<float>()(i), i / 2);
+  }
+  // And still has distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+}
+
+TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
+  Device* cpu2 = nullptr;
+  AllocatorAttributes attr;
+  DeviceLocality dev_locality;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:2", &cpu2));
+  Tensor sink_tensor(DT_FLOAT, TensorShape({8}));
+  Notification recv_note;
+  Status recv_status;
+  rma_->RecvFromPeer(kTaskName + "/device:CPU:1", kTaskName, true /*is_local*/,
+                     "key_0", cpu2 /*to_device*/, nullptr /*to_device_ctx*/,
+                     attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
+                     [this, &recv_note, &recv_status](const Status& s) {
+                       recv_status = s;
+                       recv_note.Notify();
+                     });
+  Tensor source_tensor(DT_FLOAT, TensorShape({8}));
+  for (int i = 0; i < 8; ++i) {
+    source_tensor.flat<float>()(i) = i / 2;
+  }
+  // Tensors have distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+  Device* cpu1 = nullptr;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:1", &cpu1));
+  Notification send_note;
+  Status send_status;
+  rma_->PostToPeer(kTaskName + "/device:CPU:2", kTaskName, "key_0",
+                   cpu1 /*from_device*/, nullptr /*from_device_ctx*/,
+                   attr /*to_alloc_attr*/, &source_tensor, dev_locality,
+                   [this, &send_note, &send_status](const Status& s) {
+                     send_status = s;
+                     send_note.Notify();
+                   });
+  recv_note.WaitForNotification();
+  send_note.WaitForNotification();
+  TF_EXPECT_OK(recv_status);
+  TF_EXPECT_OK(send_status);
+  // Sink tensor gets the source tensor values.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(sink_tensor.flat<float>()(i), i / 2);
+  }
+  // And still has distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index b1e1fb831963bccb81731752ec76b9d5be123d9f..84598880bb20e74570fb79de8e9e0d75fa341658 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/platform/env.h"
 
+// TODO(skyewm): can this be combined with EvaluateConstantTensor?
+
 namespace tensorflow {
 
 // This generator type is used to generate a name for the newly folded node
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 6ac9319ad1e2c4953c2d82257dac6a3aeeffcd5c..16b61315f29322565492da8c168c6fbc89d6daf1 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
index e35548729b993c68f6e58180e0c2dc18b4eea801..08d120c7a5bed69616cdb4b8e641e600edccaeac 100644
--- a/tensorflow/core/common_runtime/copy_tensor.cc
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -237,7 +237,7 @@ void CopyTensor::ViaDMA(StringPiece edge_name, DeviceContext* send_dev_context,
                         const AllocatorAttributes dst_alloc_attr,
                         const Tensor* input, Tensor* output,
                         StatusCallback done) {
-  port::Tracing::ScopedAnnotation annotation(edge_name);
+  tracing::ScopedAnnotation annotation(edge_name);
   VLOG(1) << "Copy " << edge_name;
 
   const DeviceType src_device_type(
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 1f0cc5e83bdb8520cb49387ea34e69491c71f61e..a77601ba79bf29998e1bb78b5e39dbdbd48434e9 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -94,8 +94,8 @@ Status DeviceMgr::LookupDevice(StringPiece name, Device** device) const {
     for (auto&& itr : device_map_) {
       device_names.push_back(itr.first);
     }
-    LOG(WARNING) << "Unknown device: " << name
-                 << " all devices: " << str_util::Join(device_names, ", ");
+    VLOG(1) << "Unknown device: " << name
+            << " all devices: " << str_util::Join(device_names, ", ");
     return errors::InvalidArgument(name, " unknown device.");
   }
   *device = iter->second;
diff --git a/tensorflow/core/common_runtime/device_resolver_local.cc b/tensorflow/core/common_runtime/device_resolver_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17ef4a228449560c619b51dabbac0e67ec1d4db8
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+
+namespace tensorflow {
+
+void DeviceResolverLocal::GetDeviceLocalitiesAsync(
+    const CollInstanceParams& ci_params,
+    std::vector<DeviceLocality>* localities, const StatusCallback& done) {
+  localities->clear();
+  for (const string& device_name : ci_params.device_names) {
+    Device* dev;
+    Status s = dev_mgr_->LookupDevice(device_name, &dev);
+    if (!s.ok()) {
+      done(s);
+      return;
+    }
+    localities->push_back(dev->attributes().locality());
+  }
+  done(Status::OK());
+}
+
+void DeviceResolverLocal::GetLocalityAsync(const string& device,
+                                           const string& task,
+                                           DeviceLocality* locality,
+                                           const StatusCallback& done) {
+  Device* dev;
+  Status s = dev_mgr_->LookupDevice(device, &dev);
+  if (s.ok()) {
+    *locality = dev->attributes().locality();
+  }
+  done(s);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device_resolver_local.h b/tensorflow/core/common_runtime/device_resolver_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..098eccdf842ea754c445e9cb83a2b270ec82e386
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+#define TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+
+// Implements DeviceResolverInterface in a single-task context.
+class DeviceResolverLocal : public DeviceResolverInterface {
+ public:
+  DeviceResolverLocal(const DeviceMgr* dev_mgr) : dev_mgr_(dev_mgr) {}
+
+  virtual ~DeviceResolverLocal() {}
+
+  void GetDeviceLocalitiesAsync(const CollInstanceParams& ci_params,
+                                std::vector<DeviceLocality>* localities,
+                                const StatusCallback& done) override;
+
+  void GetLocalityAsync(const string& device, const string& task,
+                        DeviceLocality* locality,
+                        const StatusCallback& done) override;
+
+  void ClearTask(const string& task) override {}
+
+ protected:
+  const DeviceMgr* dev_mgr_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5a6471ff731578d377ccfc9ad146847ae3f221c
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class DeviceResolverLocalTest : public ::testing::Test {
+ protected:
+  DeviceResolverLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    string task_name = "/job:localhost/replica:0/task:0";
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+};
+
+TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesKnown) {
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities;
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:CPU:1");
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:CPU:2");
+  Notification note;
+  Status status;
+  drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
+                                 [this, &note, &status](const Status& s) {
+                                   status = s;
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(2, localities.size());
+}
+
+TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesUnknown) {
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities;
+  // In some builds there may be 1 GPU, but there should never be 9.
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:GPU:9");
+  Notification note;
+  Status status;
+  drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
+                                 [this, &note, &status](const Status& s) {
+                                   status = s;
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(0, localities.size());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index ecbffcbf6c4030bde82f2abe0e7779bf9c5a9870..0afbd02e866c279134d5c3576127f6d174068f60 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -53,7 +54,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -69,20 +70,6 @@ auto* direct_session_runs = monitoring::Counter<0>::New(
     "/tensorflow/core/direct_session_runs",
     "The number of times DirectSession::Run() has been called.");
 
-int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
-  const int32 t = options.config.inter_op_parallelism_threads();
-  if (t != 0) return t;
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
-}
-
-thread::ThreadPool* NewThreadPoolFromSessionOptions(
-    const SessionOptions& options) {
-  const int32 num_threads = NumInterOpThreadsFromSessionOptions(options);
-  VLOG(1) << "Direct session inter op parallelism threads: " << num_threads;
-  return new thread::ThreadPool(options.env, "Compute", num_threads);
-}
-
 Status NewThreadPoolFromThreadPoolOptions(
     const SessionOptions& options,
     const ThreadPoolOptionProto& thread_pool_options, int pool_number,
@@ -318,6 +305,7 @@ DirectSession::~DirectSession() {
   for (auto& it : executors_) {
     it.second.reset();
   }
+  callables_.clear();
   for (auto d : device_mgr_->ListDevices()) {
     d->op_segment()->RemoveHold(session_handle_);
   }
@@ -409,16 +397,21 @@ Status DirectSession::Run(const NamedTensorList& inputs,
 }
 
 Status DirectSession::CreateDebuggerState(
-    const DebugOptions& debug_options, int64 session_run_index,
-    int64 executor_step_index, const std::vector<string>& input_names,
-    const std::vector<string>& output_names,
-    const std::vector<string>& target_names,
+    const CallableOptions& callable_options, int64 global_step,
+    int64 session_run_index, int64 executor_step_index,
     std::unique_ptr<DebuggerStateInterface>* debugger_state) {
-  TF_RETURN_IF_ERROR(
-      DebuggerStateRegistry::CreateState(debug_options, debugger_state));
+  TF_RETURN_IF_ERROR(DebuggerStateRegistry::CreateState(
+      callable_options.run_options().debug_options(), debugger_state));
+  std::vector<string> input_names(callable_options.feed().begin(),
+                                  callable_options.feed().end());
+  std::vector<string> output_names(callable_options.fetch().begin(),
+                                   callable_options.fetch().end());
+  std::vector<string> target_names(callable_options.target().begin(),
+                                   callable_options.target().end());
+
   TF_RETURN_IF_ERROR(debugger_state->get()->PublishDebugMetadata(
-      debug_options.global_step(), session_run_index, executor_step_index,
-      input_names, output_names, target_names));
+      global_step, session_run_index, executor_step_index, input_names,
+      output_names, target_names));
   return Status::OK();
 }
 
@@ -433,84 +426,23 @@ Status DirectSession::DecorateAndPublishGraphForDebug(
   return Status::OK();
 }
 
-Status DirectSession::Run(const RunOptions& run_options,
-                          const NamedTensorList& inputs,
-                          const std::vector<string>& output_names,
-                          const std::vector<string>& target_nodes,
-                          std::vector<Tensor>* outputs,
-                          RunMetadata* run_metadata) {
-  TF_RETURN_IF_ERROR(CheckNotClosed());
-  direct_session_runs->GetCell()->IncrementBy(1);
-  {
-    mutex_lock l(graph_def_lock_);
-    if (!graph_created_) {
-      return errors::InvalidArgument(
-          "Session was not created with a graph before Run()!");
-    }
-  }
-
-  // Extract the inputs names for this run of the session.
-  std::vector<string> input_tensor_names;
-  input_tensor_names.reserve(inputs.size());
-  for (const auto& it : inputs) {
-    input_tensor_names.push_back(it.first);
-  }
-
-  if (run_options.inter_op_thread_pool() < 0 ||
-      run_options.inter_op_thread_pool() >= thread_pools_.size()) {
-    return errors::InvalidArgument("Invalid inter_op_thread_pool: ",
-                                   run_options.inter_op_thread_pool());
-  }
-  thread::ThreadPool* pool =
-      thread_pools_[run_options.inter_op_thread_pool()].first;
-
-  // Check if we already have an executor for these arguments.
-  ExecutorsAndKeys* executors_and_keys;
-  RunStateArgs run_state_args(run_options.debug_options());
-
-  Executor::Args args;
-  args.step_id = step_id_counter_.fetch_add(1);
-
-  TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names,
-                                          target_nodes, &executors_and_keys,
-                                          &run_state_args));
+Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
+                                  CallFrameInterface* call_frame,
+                                  ExecutorsAndKeys* executors_and_keys,
+                                  RunMetadata* run_metadata) {
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
   std::unique_ptr<DebuggerStateInterface> debugger_state;
   if (!run_options.debug_options().debug_tensor_watch_opts().empty()) {
-    TF_RETURN_IF_ERROR(CreateDebuggerState(
-        run_options.debug_options(), args.step_id, executor_step_count,
-        input_tensor_names, output_names, target_nodes, &debugger_state));
-  }
-
-  // Configure a call frame for the step, which we use to feed and
-  // fetch values to and from the executors.
-  FunctionCallFrame call_frame(executors_and_keys->input_types,
-                               executors_and_keys->output_types);
-  gtl::InlinedVector<Tensor, 4> feed_args(inputs.size());
-  for (const auto& it : inputs) {
-    if (it.second.dtype() == DT_RESOURCE) {
-      Tensor tensor_from_handle;
-      TF_RETURN_IF_ERROR(
-          ResourceHandleToInputTensor(it.second, &tensor_from_handle));
-      feed_args[executors_and_keys->input_name_to_index[it.first]] =
-          tensor_from_handle;
-    } else {
-      feed_args[executors_and_keys->input_name_to_index[it.first]] = it.second;
-    }
-  }
-  const Status s = call_frame.SetArgs(feed_args);
-  if (errors::IsInternal(s)) {
-    return errors::InvalidArgument(s.error_message());
-  } else if (!s.ok()) {
-    return s;
+    TF_RETURN_IF_ERROR(
+        CreateDebuggerState(executors_and_keys->callable_options,
+                            run_options.debug_options().global_step(), step_id,
+                            executor_step_count, &debugger_state));
   }
 
   // Create a run state and start execution.
-  RunState run_state(args.step_id, &devices_);
+  RunState run_state(step_id, &devices_);
   run_state.rendez = new IntraProcessRendezvous(device_mgr_.get());
-  CancellationManager step_cancellation_manager;
-  args.call_frame = &call_frame;
 
   // Start parallel Executors.
   const size_t num_executors = executors_and_keys->items.size();
@@ -523,15 +455,15 @@ Status DirectSession::Run(const RunOptions& run_options,
         run_state.executors_done.Notify();
       });
 
+  Executor::Args args;
+  args.step_id = step_id;
+  args.call_frame = call_frame;
   args.rendezvous = run_state.rendez;
+  CancellationManager step_cancellation_manager;
   args.cancellation_manager = &step_cancellation_manager;
-
   args.session_state = &session_state_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
-  if (LogMemory::IsEnabled()) {
-    LogMemory::RecordStep(args.step_id, run_state_args.handle);
-  }
   args.sync_on_finish = sync_on_finish_;
 
   const bool do_trace = (run_options.trace_level() > RunOptions::NO_TRACE);
@@ -569,6 +501,14 @@ Status DirectSession::Run(const RunOptions& run_options,
     }
   }
 
+  if (run_options.inter_op_thread_pool() < 0 ||
+      run_options.inter_op_thread_pool() >= thread_pools_.size()) {
+    run_state.executors_done.Notify();
+    delete barrier;
+    return errors::InvalidArgument("Invalid inter_op_thread_pool: ",
+                                   run_options.inter_op_thread_pool());
+  }
+
   // Register this step with session's cancellation manager, so that
   // `Session::Close()` will cancel the step.
   const CancellationToken cancellation_token =
@@ -586,6 +526,9 @@ Status DirectSession::Run(const RunOptions& run_options,
     return errors::Cancelled("Run call was cancelled");
   }
 
+  thread::ThreadPool* pool =
+      thread_pools_[run_options.inter_op_thread_pool()].first;
+
   Executor::Args::Runner default_runner = [this,
                                            pool](Executor::Args::Closure c) {
     SchedClosure(pool, std::move(c));
@@ -628,6 +571,111 @@ Status DirectSession::Run(const RunOptions& run_options,
     TF_RETURN_IF_ERROR(run_state.status);
   }
 
+  // Save the output tensors of this run we choose to keep.
+  if (!run_state.tensor_store.empty()) {
+    TF_RETURN_IF_ERROR(run_state.tensor_store.SaveTensors(
+        {executors_and_keys->callable_options.fetch().begin(),
+         executors_and_keys->callable_options.fetch().end()},
+        &session_state_));
+  }
+
+  if (args.stats_collector) {
+    args.stats_collector->Finalize();
+  }
+
+  // Build and return the cost model as instructed.
+  if (update_cost_model) {
+    // Build the cost model
+    std::unordered_map<string, const Graph*> device_to_graph;
+    for (const PerPartitionExecutorsAndLib& partition :
+         executors_and_keys->items) {
+      const Graph* graph = partition.graph;
+      const string device = partition.flib->device()->name();
+      device_to_graph[device] = graph;
+    }
+
+    mutex_lock l(executor_lock_);
+    args.stats_collector->BuildCostModel(&cost_model_manager_, device_to_graph);
+
+    // annotate stats onto cost graph.
+    CostGraphDef* cost_graph = run_metadata->mutable_cost_graph();
+    for (const auto& item : executors_and_keys->items) {
+      TF_RETURN_IF_ERROR(
+          cost_model_manager_.AddToCostGraphDef(item.graph, cost_graph));
+    }
+  }
+
+  // If requested via RunOptions, output the partition graphs.
+  if (run_options.output_partition_graphs()) {
+    protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+        run_metadata->mutable_partition_graphs();
+    for (const PerPartitionExecutorsAndLib& exec_and_lib :
+         executors_and_keys->items) {
+      GraphDef* partition_graph_def = partition_graph_defs->Add();
+      exec_and_lib.graph->ToGraphDef(partition_graph_def);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DirectSession::Run(const RunOptions& run_options,
+                          const NamedTensorList& inputs,
+                          const std::vector<string>& output_names,
+                          const std::vector<string>& target_nodes,
+                          std::vector<Tensor>* outputs,
+                          RunMetadata* run_metadata) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  TF_RETURN_IF_ERROR(CheckGraphCreated("Run()"));
+  direct_session_runs->GetCell()->IncrementBy(1);
+
+  // Extract the inputs names for this run of the session.
+  std::vector<string> input_tensor_names;
+  input_tensor_names.reserve(inputs.size());
+  for (const auto& it : inputs) {
+    input_tensor_names.push_back(it.first);
+  }
+
+  // Check if we already have an executor for these arguments.
+  ExecutorsAndKeys* executors_and_keys;
+  RunStateArgs run_state_args(run_options.debug_options());
+
+  TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names,
+                                          target_nodes, &executors_and_keys,
+                                          &run_state_args));
+
+  // Configure a call frame for the step, which we use to feed and
+  // fetch values to and from the executors.
+  FunctionCallFrame call_frame(executors_and_keys->input_types,
+                               executors_and_keys->output_types);
+  gtl::InlinedVector<Tensor, 4> feed_args(inputs.size());
+  for (const auto& it : inputs) {
+    if (it.second.dtype() == DT_RESOURCE) {
+      Tensor tensor_from_handle;
+      TF_RETURN_IF_ERROR(
+          ResourceHandleToInputTensor(it.second, &tensor_from_handle));
+      feed_args[executors_and_keys->input_name_to_index[it.first]] =
+          tensor_from_handle;
+    } else {
+      feed_args[executors_and_keys->input_name_to_index[it.first]] = it.second;
+    }
+  }
+  const Status s = call_frame.SetArgs(feed_args);
+  if (errors::IsInternal(s)) {
+    return errors::InvalidArgument(s.error_message());
+  } else if (!s.ok()) {
+    return s;
+  }
+
+  const int64 step_id = step_id_counter_.fetch_add(1);
+
+  if (LogMemory::IsEnabled()) {
+    LogMemory::RecordStep(step_id, run_state_args.handle);
+  }
+
+  TF_RETURN_IF_ERROR(RunInternal(step_id, run_options, &call_frame,
+                                 executors_and_keys, run_metadata));
+
   // Receive outputs.
   if (outputs) {
     std::vector<Tensor> sorted_outputs;
@@ -667,45 +715,6 @@ Status DirectSession::Run(const RunOptions& run_options,
     }
   }
 
-  // Save the output tensors of this run we choose to keep.
-  TF_RETURN_IF_ERROR(
-      run_state.tensor_store.SaveTensors(output_names, &session_state_));
-  if (args.stats_collector) {
-    args.stats_collector->Finalize();
-  }
-
-  // Build and return the cost model as instructed.
-  mutex_lock l(executor_lock_);
-  if (update_cost_model) {
-    // Build the cost model
-    std::unordered_map<string, const Graph*> device_to_graph;
-    for (const PerPartitionExecutorsAndLib& partition :
-         executors_and_keys->items) {
-      const Graph* graph = partition.graph;
-      const string device = partition.flib->device()->name();
-      device_to_graph[device] = graph;
-    }
-    args.stats_collector->BuildCostModel(&cost_model_manager_, device_to_graph);
-
-    // annotate stats onto cost graph.
-    CostGraphDef* cost_graph = run_metadata->mutable_cost_graph();
-    for (const auto& item : executors_and_keys->items) {
-      TF_RETURN_IF_ERROR(
-          cost_model_manager_.AddToCostGraphDef(item.graph, cost_graph));
-    }
-  }
-
-  // If requested via RunOptions, output the partition graphs.
-  if (run_options.output_partition_graphs()) {
-    protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
-        run_metadata->mutable_partition_graphs();
-    for (const PerPartitionExecutorsAndLib& exec_and_lib :
-         executors_and_keys->items) {
-      GraphDef* partition_graph_def = partition_graph_defs->Add();
-      exec_and_lib.graph->ToGraphDef(partition_graph_def);
-    }
-  }
-
   return Status::OK();
 }
 
@@ -714,13 +723,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
                                 const std::vector<string>& target_nodes,
                                 string* handle) {
   TF_RETURN_IF_ERROR(CheckNotClosed());
-  {
-    mutex_lock l(graph_def_lock_);
-    if (!graph_created_) {
-      return errors::InvalidArgument(
-          "Session was not created with a graph before PRunSetup()!");
-    }
-  }
+  TF_RETURN_IF_ERROR(CheckGraphCreated("PRunSetup()"));
 
   // RunOptions is not available in PRunSetup, so use thread pool 0.
   thread::ThreadPool* pool = thread_pools_[0].first;
@@ -1061,92 +1064,20 @@ Status DirectSession::CheckFetch(const NamedTensorList& feeds,
   return Status::OK();
 }
 
-Status DirectSession::GetOrCreateExecutors(
-    gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs,
-    gtl::ArraySlice<string> target_nodes, ExecutorsAndKeys** executors_and_keys,
+Status DirectSession::CreateExecutors(
+    const CallableOptions& callable_options,
+    std::unique_ptr<ExecutorsAndKeys>* out_executors_and_keys,
+    std::unique_ptr<FunctionInfo>* out_func_info,
     RunStateArgs* run_state_args) {
-  int64 handle_name_counter_value = -1;
-  if (LogMemory::IsEnabled() || run_state_args->is_partial_run) {
-    handle_name_counter_value = handle_name_counter_.fetch_add(1);
-  }
-
-  string debug_tensor_watches_summary;
-  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
-    debug_tensor_watches_summary = SummarizeDebugTensorWatches(
-        run_state_args->debug_options.debug_tensor_watch_opts());
-  }
-
-  // Fast lookup path, no sorting.
-  const string key = strings::StrCat(
-      str_util::Join(inputs, ","), "->", str_util::Join(outputs, ","), "/",
-      str_util::Join(target_nodes, ","), "/", run_state_args->is_partial_run,
-      "/", debug_tensor_watches_summary);
-  // Set the handle, if it's needed to log memory or for partial run.
-  if (handle_name_counter_value >= 0) {
-    run_state_args->handle =
-        strings::StrCat(key, ";", handle_name_counter_value);
-  }
-
-  // See if we already have the executors for this run.
-  {
-    mutex_lock l(executor_lock_);  // could use reader lock
-    auto it = executors_.find(key);
-    if (it != executors_.end()) {
-      *executors_and_keys = it->second.get();
-      return Status::OK();
-    }
-  }
-
-  // Slow lookup path, the unsorted key missed the cache.
-  // Sort the inputs and outputs, and look up with the sorted key in case an
-  // earlier call used a different order of inputs and outputs.
-  //
-  // We could consider some other signature instead of sorting that
-  // preserves the same property to avoid the sort in the future.
-  std::vector<string> inputs_sorted(inputs.begin(), inputs.end());
-  std::sort(inputs_sorted.begin(), inputs_sorted.end());
-  std::vector<string> outputs_sorted(outputs.begin(), outputs.end());
-  std::sort(outputs_sorted.begin(), outputs_sorted.end());
-  std::vector<string> tn_sorted(target_nodes.begin(), target_nodes.end());
-  std::sort(tn_sorted.begin(), tn_sorted.end());
-
-  const string sorted_key = strings::StrCat(
-      str_util::Join(inputs_sorted, ","), "->",
-      str_util::Join(outputs_sorted, ","), "/", str_util::Join(tn_sorted, ","),
-      "/", run_state_args->is_partial_run, "/", debug_tensor_watches_summary);
-  // Set the handle, if its needed to log memory or for partial run.
-  if (handle_name_counter_value >= 0) {
-    run_state_args->handle =
-        strings::StrCat(sorted_key, ";", handle_name_counter_value);
-  }
-
-  // See if we already have the executors for this run.
-  {
-    mutex_lock l(executor_lock_);
-    auto it = executors_.find(sorted_key);
-    if (it != executors_.end()) {
-      *executors_and_keys = it->second.get();
-      // Insert this under the original key.
-      executors_.emplace(key, it->second);
-      return Status::OK();
-    }
-  }
-
-  // Nothing found, so create the executors and store in the cache.
   BuildGraphOptions options;
-  options.feed_endpoints = inputs_sorted;
-  options.fetch_endpoints = outputs_sorted;
-  options.target_nodes = tn_sorted;
+  options.callable_options = callable_options;
   options.use_function_convention = !run_state_args->is_partial_run;
-  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
-    options.debug_options = run_state_args->debug_options;
-  }
 
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
-  std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
+  std::unique_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
+
+  ek->callable_options = callable_options;
 
-  // The executor_lock_ is intentionally released while executor is
-  // being created.
   std::unordered_map<string, std::unique_ptr<Graph>> graphs;
   TF_RETURN_IF_ERROR(CreateGraphs(options, &graphs, &func_info->flib_def,
                                   run_state_args, &ek->input_types,
@@ -1155,11 +1086,11 @@ Status DirectSession::GetOrCreateExecutors(
   if (run_state_args->is_partial_run) {
     ek->graph = std::move(run_state_args->graph);
     std::unordered_set<StringPiece, StringPieceHasher> names;
-    for (const string& input : inputs) {
+    for (const string& input : callable_options.feed()) {
       TensorId id(ParseTensorName(input));
       names.emplace(id.first);
     }
-    for (const string& output : outputs) {
+    for (const string& output : callable_options.fetch()) {
       TensorId id(ParseTensorName(output));
       names.emplace(id.first);
     }
@@ -1181,7 +1112,7 @@ Status DirectSession::GetOrCreateExecutors(
   }
   func_info->proc_flr.reset(new ProcessFunctionLibraryRuntime(
       device_mgr_.get(), options_.env, graph_def_version,
-      func_info->flib_def.get(), optimizer_opts));
+      func_info->flib_def.get(), optimizer_opts, thread_pools_[0].first));
 
   GraphOptimizer optimizer(optimizer_opts);
   for (auto iter = graphs.begin(); iter != graphs.end(); ++iter) {
@@ -1236,9 +1167,11 @@ Status DirectSession::GetOrCreateExecutors(
                        /*shape_map=*/nullptr);
 
     // EXPERIMENTAL: tfdbg inserts debug nodes in the graph.
-    if (!options.debug_options.debug_tensor_watch_opts().empty()) {
+    const DebugOptions& debug_options =
+        options.callable_options.run_options().debug_options();
+    if (!debug_options.debug_tensor_watch_opts().empty()) {
       TF_RETURN_IF_ERROR(DecorateAndPublishGraphForDebug(
-          options.debug_options, partition_graph.get(), params.device));
+          debug_options, partition_graph.get(), params.device));
     }
 
     TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device->device_type()),
@@ -1260,12 +1193,12 @@ Status DirectSession::GetOrCreateExecutors(
     // For regular `Run()`, we use the function calling convention, and so
     // maintain a mapping from input/output names to
     // argument/return-value ordinal index.
-    for (size_t i = 0; i < inputs_sorted.size(); ++i) {
-      const string& input = inputs_sorted[i];
+    for (int i = 0; i < callable_options.feed().size(); ++i) {
+      const string& input = callable_options.feed(i);
       ek->input_name_to_index[input] = i;
     }
-    for (size_t i = 0; i < outputs_sorted.size(); ++i) {
-      const string& output = outputs_sorted[i];
+    for (int i = 0; i < callable_options.fetch().size(); ++i) {
+      const string& output = callable_options.fetch(i);
       ek->output_name_to_index[output] = i;
     }
   } else {
@@ -1274,26 +1207,123 @@ Status DirectSession::GetOrCreateExecutors(
     //
     // We always use the first device as the device name portion of the
     // key, even if we're feeding another graph.
-    for (size_t i = 0; i < inputs_sorted.size(); ++i) {
-      const string& input = inputs_sorted[i];
+    for (int i = 0; i < callable_options.feed().size(); ++i) {
+      const string& input = callable_options.feed(i);
       ek->input_name_to_rendezvous_key[input] = GetRendezvousKey(
           input, device_set_.client_device()->attributes(), FrameAndIter(0, 0));
     }
-    for (size_t i = 0; i < outputs_sorted.size(); ++i) {
-      const string& output = outputs_sorted[i];
+    for (int i = 0; i < callable_options.fetch().size(); ++i) {
+      const string& output = callable_options.fetch(i);
       ek->output_name_to_rendezvous_key[output] =
           GetRendezvousKey(output, device_set_.client_device()->attributes(),
                            FrameAndIter(0, 0));
     }
   }
 
+  *out_executors_and_keys = std::move(ek);
+  *out_func_info = std::move(func_info);
+  return Status::OK();
+}
+
+Status DirectSession::GetOrCreateExecutors(
+    gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs,
+    gtl::ArraySlice<string> target_nodes, ExecutorsAndKeys** executors_and_keys,
+    RunStateArgs* run_state_args) {
+  int64 handle_name_counter_value = -1;
+  if (LogMemory::IsEnabled() || run_state_args->is_partial_run) {
+    handle_name_counter_value = handle_name_counter_.fetch_add(1);
+  }
+
+  string debug_tensor_watches_summary;
+  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
+    debug_tensor_watches_summary = SummarizeDebugTensorWatches(
+        run_state_args->debug_options.debug_tensor_watch_opts());
+  }
+
+  // Fast lookup path, no sorting.
+  const string key = strings::StrCat(
+      str_util::Join(inputs, ","), "->", str_util::Join(outputs, ","), "/",
+      str_util::Join(target_nodes, ","), "/", run_state_args->is_partial_run,
+      "/", debug_tensor_watches_summary);
+  // Set the handle, if it's needed to log memory or for partial run.
+  if (handle_name_counter_value >= 0) {
+    run_state_args->handle =
+        strings::StrCat(key, ";", handle_name_counter_value);
+  }
+
+  // See if we already have the executors for this run.
+  {
+    mutex_lock l(executor_lock_);  // could use reader lock
+    auto it = executors_.find(key);
+    if (it != executors_.end()) {
+      *executors_and_keys = it->second.get();
+      return Status::OK();
+    }
+  }
+
+  // Slow lookup path, the unsorted key missed the cache.
+  // Sort the inputs and outputs, and look up with the sorted key in case an
+  // earlier call used a different order of inputs and outputs.
+  //
+  // We could consider some other signature instead of sorting that
+  // preserves the same property to avoid the sort in the future.
+  std::vector<string> inputs_sorted(inputs.begin(), inputs.end());
+  std::sort(inputs_sorted.begin(), inputs_sorted.end());
+  std::vector<string> outputs_sorted(outputs.begin(), outputs.end());
+  std::sort(outputs_sorted.begin(), outputs_sorted.end());
+  std::vector<string> tn_sorted(target_nodes.begin(), target_nodes.end());
+  std::sort(tn_sorted.begin(), tn_sorted.end());
+
+  const string sorted_key = strings::StrCat(
+      str_util::Join(inputs_sorted, ","), "->",
+      str_util::Join(outputs_sorted, ","), "/", str_util::Join(tn_sorted, ","),
+      "/", run_state_args->is_partial_run, "/", debug_tensor_watches_summary);
+  // Set the handle, if its needed to log memory or for partial run.
+  if (handle_name_counter_value >= 0) {
+    run_state_args->handle =
+        strings::StrCat(sorted_key, ";", handle_name_counter_value);
+  }
+
+  // See if we already have the executors for this run.
+  {
+    mutex_lock l(executor_lock_);
+    auto it = executors_.find(sorted_key);
+    if (it != executors_.end()) {
+      *executors_and_keys = it->second.get();
+      // Insert this under the original key.
+      executors_.emplace(key, it->second);
+      return Status::OK();
+    }
+  }
+
+  // Nothing found, so create the executors and store in the cache.
+  // The executor_lock_ is intentionally released while executors are
+  // being created.
+  CallableOptions callable_options;
+  for (const string& input : inputs_sorted) {
+    callable_options.add_feed(input);
+  }
+  for (const string& output : outputs_sorted) {
+    callable_options.add_fetch(output);
+  }
+  for (const string& target : tn_sorted) {
+    callable_options.add_target(target);
+  }
+  *callable_options.mutable_run_options()->mutable_debug_options() =
+      run_state_args->debug_options;
+  std::unique_ptr<ExecutorsAndKeys> ek;
+  std::unique_ptr<FunctionInfo> func_info;
+  TF_RETURN_IF_ERROR(
+      CreateExecutors(callable_options, &ek, &func_info, run_state_args));
+
   // Reacquire the lock, try to insert into the map.
   mutex_lock l(executor_lock_);
   functions_.push_back(std::move(func_info));
 
   // Another thread may have created the entry before us, in which case we will
   // reuse the already created one.
-  auto insert_result = executors_.emplace(sorted_key, ek);
+  auto insert_result = executors_.emplace(
+      sorted_key, std::shared_ptr<ExecutorsAndKeys>(std::move(ek)));
   // Insert the value under the original key, so the fast path lookup will work
   // if the user uses the same order of inputs, outputs, and targets again.
   executors_.emplace(key, insert_result.first->second);
@@ -1332,19 +1362,19 @@ Status DirectSession::CreateGraphs(
         execution_state->BuildGraph(subgraph_options, &client_graph));
   }
 
-  if (subgraph_options.feed_endpoints.size() !=
+  if (subgraph_options.callable_options.feed_size() !=
       client_graph->feed_types.size()) {
     return errors::Internal(
         "Graph pruning failed: requested number of feed endpoints = ",
-        subgraph_options.feed_endpoints.size(),
+        subgraph_options.callable_options.feed_size(),
         " versus number of pruned feed endpoints = ",
         client_graph->feed_types.size());
   }
-  if (subgraph_options.fetch_endpoints.size() !=
+  if (subgraph_options.callable_options.fetch_size() !=
       client_graph->fetch_types.size()) {
     return errors::Internal(
         "Graph pruning failed: requested number of fetch endpoints = ",
-        subgraph_options.fetch_endpoints.size(),
+        subgraph_options.callable_options.fetch_size(),
         " versus number of pruned fetch endpoints = ",
         client_graph->fetch_types.size());
   }
@@ -1562,4 +1592,156 @@ void DirectSession::WaitForNotification(RunState* run_state,
   return Status::OK();
 }
 
+Status DirectSession::MakeCallable(const CallableOptions& callable_options,
+                                   CallableHandle* out_handle) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  TF_RETURN_IF_ERROR(CheckGraphCreated("MakeCallable()"));
+
+  if (!callable_options.run_options()
+           .debug_options()
+           .debug_tensor_watch_opts()
+           .empty()) {
+    return errors::Unimplemented(
+        "Debug options are not currently supported via the C++ MakeCallable "
+        "interface.");
+  }
+
+  std::unique_ptr<ExecutorsAndKeys> ek;
+  std::unique_ptr<FunctionInfo> func_info;
+  RunStateArgs run_state_args(callable_options.run_options().debug_options());
+  TF_RETURN_IF_ERROR(
+      CreateExecutors(callable_options, &ek, &func_info, &run_state_args));
+  {
+    mutex_lock l(callables_lock_);
+    *out_handle = next_callable_handle_++;
+    callables_[*out_handle] = {std::move(ek), std::move(func_info)};
+  }
+  return Status::OK();
+}
+
+class DirectSession::RunCallableCallFrame : public CallFrameInterface {
+ public:
+  RunCallableCallFrame(DirectSession* session,
+                       ExecutorsAndKeys* executors_and_keys,
+                       const std::vector<Tensor>* feed_tensors,
+                       std::vector<Tensor>* fetch_tensors)
+      : session_(session),
+        executors_and_keys_(executors_and_keys),
+        feed_tensors_(feed_tensors),
+        fetch_tensors_(fetch_tensors) {}
+
+  size_t num_args() const override {
+    return executors_and_keys_->input_types.size();
+  }
+  size_t num_retvals() const override {
+    return executors_and_keys_->output_types.size();
+  }
+
+  Status GetArg(int index, Tensor* val) const override {
+    if (index > feed_tensors_->size()) {
+      return errors::Internal("Args index out of bounds: ", index);
+    } else if (executors_and_keys_->input_types[index] == DT_RESOURCE) {
+      TF_RETURN_IF_ERROR(
+          session_->ResourceHandleToInputTensor((*feed_tensors_)[index], val));
+    } else {
+      *val = (*feed_tensors_)[index];
+    }
+    return Status::OK();
+  }
+
+  Status SetRetval(int index, const Tensor& val) override {
+    if (index > fetch_tensors_->size()) {
+      return errors::Internal("RetVal index out of bounds: ", index);
+    }
+    (*fetch_tensors_)[index] = val;
+    return Status::OK();
+  }
+
+ private:
+  DirectSession* const session_;                   // Not owned.
+  ExecutorsAndKeys* const executors_and_keys_;     // Not owned.
+  const std::vector<Tensor>* const feed_tensors_;  // Not owned.
+  std::vector<Tensor>* const fetch_tensors_;       // Not owned.
+};
+
+::tensorflow::Status DirectSession::RunCallable(
+    CallableHandle handle, const std::vector<Tensor>& feed_tensors,
+    std::vector<Tensor>* fetch_tensors, RunMetadata* run_metadata) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  TF_RETURN_IF_ERROR(CheckGraphCreated("RunCallable()"));
+  direct_session_runs->GetCell()->IncrementBy(1);
+
+  // Check if we already have an executor for these arguments.
+  std::shared_ptr<ExecutorsAndKeys> executors_and_keys;
+  const int64 step_id = step_id_counter_.fetch_add(1);
+
+  {
+    tf_shared_lock l(callables_lock_);
+    if (handle >= next_callable_handle_) {
+      return errors::InvalidArgument("No such callable handle: ", handle);
+    }
+    executors_and_keys = callables_[handle].executors_and_keys;
+  }
+
+  if (!executors_and_keys) {
+    return errors::InvalidArgument(
+        "Attempted to run callable after handle was released: ", handle);
+  }
+
+  // NOTE(mrry): Debug options are not currently supported in the
+  // callable interface.
+  DebugOptions debug_options;
+  RunStateArgs run_state_args(debug_options);
+
+  // Configure a call frame for the step, which we use to feed and
+  // fetch values to and from the executors.
+  if (feed_tensors.size() != executors_and_keys->input_types.size()) {
+    return errors::InvalidArgument(
+        "Expected ", executors_and_keys->input_types.size(),
+        " feed tensors, but got ", feed_tensors.size());
+  }
+  if (fetch_tensors != nullptr) {
+    fetch_tensors->resize(executors_and_keys->output_types.size());
+  } else if (!executors_and_keys->output_types.empty()) {
+    return errors::InvalidArgument(
+        "`fetch_tensors` must be provided when the callable has one or more "
+        "outputs.");
+  }
+
+  // A specialized CallFrame implementation that takes advantage of the
+  // optimized RunCallable interface.
+
+  RunCallableCallFrame call_frame(this, executors_and_keys.get(), &feed_tensors,
+                                  fetch_tensors);
+
+  if (LogMemory::IsEnabled()) {
+    LogMemory::RecordStep(step_id, run_state_args.handle);
+  }
+
+  TF_RETURN_IF_ERROR(
+      RunInternal(step_id, executors_and_keys->callable_options.run_options(),
+                  &call_frame, executors_and_keys.get(), run_metadata));
+
+  return Status::OK();
+}
+
+::tensorflow::Status DirectSession::ReleaseCallable(CallableHandle handle) {
+  mutex_lock l(callables_lock_);
+  if (handle >= next_callable_handle_) {
+    return errors::InvalidArgument("No such callable handle: ", handle);
+  }
+  callables_.erase(handle);
+  return Status::OK();
+}
+
+DirectSession::Callable::~Callable() {
+  // We must delete the fields in this order, because the destructor
+  // of `executors_and_keys` will call into an object owned by
+  // `function_info` (in particular, when deleting a kernel, it relies
+  // on the `FunctionLibraryRuntime` to know if the kernel is stateful
+  // or not).
+  executors_and_keys.reset();
+  function_info.reset();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 45d765f8498e5e12eef3a47cd4a7ff0ad22aa495..6f9c1b980b30213373d82ded639fc2e614b3752d 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -107,6 +107,14 @@ class DirectSession : public Session {
     cost_model_manager_.ExportCostModels(cost_models);
   }
 
+  ::tensorflow::Status MakeCallable(const CallableOptions& callable_options,
+                                    CallableHandle* out_handle) override;
+  ::tensorflow::Status RunCallable(CallableHandle handle,
+                                   const std::vector<Tensor>& feed_tensors,
+                                   std::vector<Tensor>* fetch_tensors,
+                                   RunMetadata* run_metadata) override;
+  ::tensorflow::Status ReleaseCallable(CallableHandle handle) override;
+
  private:
   // We create one executor and its dependent library runtime for
   // every partition.
@@ -139,6 +147,8 @@ class DirectSession : public Session {
 
     DataTypeVector input_types;
     DataTypeVector output_types;
+
+    CallableOptions callable_options;
   };
 
   // A FunctionInfo object is created for every unique set of feeds/fetches.
@@ -206,6 +216,14 @@ class DirectSession : public Session {
       gtl::ArraySlice<string> target_nodes,
       ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args);
 
+  // Creates a set of executors to run the subgraph defined by
+  // `callable_options`.
+  ::tensorflow::Status CreateExecutors(
+      const CallableOptions& callable_options,
+      std::unique_ptr<ExecutorsAndKeys>* out_executors_and_keys,
+      std::unique_ptr<FunctionInfo>* out_func_info,
+      RunStateArgs* run_state_args);
+
   // Creates several graphs given the existing graph_def_ and the
   // input feeds and fetches, given 'devices'. The graphs share a common
   // function library 'flib_def'.
@@ -216,6 +234,11 @@ class DirectSession : public Session {
       RunStateArgs* run_state_args, DataTypeVector* input_types,
       DataTypeVector* output_types);
 
+  ::tensorflow::Status RunInternal(int64 step_id, const RunOptions& run_options,
+                                   CallFrameInterface* call_frame,
+                                   ExecutorsAndKeys* executors_and_keys,
+                                   RunMetadata* run_metadata);
+
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_def_lock_);
 
@@ -257,11 +280,18 @@ class DirectSession : public Session {
     return ::tensorflow::Status::OK();
   }
 
+  ::tensorflow::Status CheckGraphCreated(const char* method) {
+    mutex_lock l(graph_def_lock_);
+    if (!graph_created_) {
+      return errors::InvalidArgument(
+          "Session was not created with a graph before ", method, "!");
+    }
+    return ::tensorflow::Status::OK();
+  }
+
   ::tensorflow::Status CreateDebuggerState(
-      const DebugOptions& debug_options, int64 session_run_index,
-      int64 executor_step_index, const std::vector<string>& input_names,
-      const std::vector<string>& output_names,
-      const std::vector<string>& target_names,
+      const CallableOptions& options, int64 global_step,
+      int64 session_run_index, int64 executor_step_index,
       std::unique_ptr<DebuggerStateInterface>* debugger_state);
 
   ::tensorflow::Status DecorateAndPublishGraphForDebug(
@@ -303,6 +333,16 @@ class DirectSession : public Session {
   std::unordered_map<string, std::shared_ptr<ExecutorsAndKeys>> executors_
       GUARDED_BY(executor_lock_);
 
+  class RunCallableCallFrame;
+  struct Callable {
+    std::shared_ptr<ExecutorsAndKeys> executors_and_keys;
+    std::shared_ptr<FunctionInfo> function_info;
+    ~Callable();
+  };
+  mutex callables_lock_;
+  int64 next_callable_handle_ GUARDED_BY(callables_lock_) = 0;
+  std::unordered_map<int64, Callable> callables_ GUARDED_BY(callables_lock_);
+
   // Holds mappings from handle to partial run state.
   std::unordered_map<string, std::unique_ptr<RunState>> partial_runs_
       GUARDED_BY(executor_lock_);
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index b75a4f76d94f704cf38a6c4657b6089a863c085f..8ddc9958b2259f4da6dc1750c6c79a706c804be8 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -48,6 +50,22 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+CallableOptions MakeCallableOptions(gtl::ArraySlice<string> feeds,
+                                    gtl::ArraySlice<string> fetches,
+                                    gtl::ArraySlice<string> targets) {
+  CallableOptions ret;
+  for (const string& feed : feeds) {
+    ret.add_feed(feed);
+  }
+  for (const string& fetch : fetches) {
+    ret.add_fetch(fetch);
+  }
+  for (const string& target : targets) {
+    ret.add_target(target);
+  }
+  return ret;
+}
+
 std::unique_ptr<Session> CreateSession() {
   SessionOptions options;
   (*options.config.mutable_device_count())["CPU"] = 2;
@@ -63,6 +81,7 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     test::FillValues<float>(&a_tensor, a_values);
     Node* a = test::graph::Constant(&graph, a_tensor);
     a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
+    a_ = a->name();
 
     Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
     test::FillValues<float>(&x_tensor, {1, 1});
@@ -79,12 +98,18 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     y_neg_ = y_neg->name();
     y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
 
+    Node* z = test::graph::Unary(&graph, "Identity", y_neg);
+    z_ = z->name();
+    z->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
+
     test::graph::ToGraphDef(&graph, &def_);
   }
 
+  string a_;
   string x_;
   string y_;
   string y_neg_;
+  string z_;
   GraphDef def_;
 };
 
@@ -110,6 +135,205 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork) {
   EXPECT_FLOAT_EQ(5.0, mat(0, 0));
 }
 
+TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  // Run the test twice to ensure that the Make/Run/Release cycle is hermetic.
+  for (int i = 0; i < 2; ++i) {
+    // Request two targets: one fetch output and one non-fetched output.
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(
+        MakeCallableOptions({}, {y_ + ":0"}, {y_neg_}), &handle));
+
+    for (int i = 0; i < 2; ++i) {
+      std::vector<Tensor> outputs;
+      TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+
+      ASSERT_EQ(1, outputs.size());
+      // The first output should be initialized and have the correct
+      // output.
+      auto mat = outputs[0].matrix<float>();
+      ASSERT_TRUE(outputs[0].IsInitialized());
+      EXPECT_FLOAT_EQ(5.0, mat(0, 0));
+    }
+
+    Status s = session->RunCallable(handle, {}, nullptr, nullptr);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                      "`fetch_tensors` must be provided"));
+
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+
+    std::vector<Tensor> outputs;
+    s = session->RunCallable(handle, {}, &outputs, nullptr);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Attempted to run callable after handle was released"));
+
+    s = session->RunCallable(handle + 1, {}, &outputs, nullptr);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "No such callable handle"));
+  }
+}
+
+TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);".
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(1, outputs.size());
+    auto mat = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);"; also fetch the result of a.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(a_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(2, outputs.size());
+    auto mat_a = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(3.0, mat_a(0, 0));
+    EXPECT_FLOAT_EQ(2.0, mat_a(0, 1));
+    EXPECT_FLOAT_EQ(-1.0, mat_a(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_a(1, 1));
+
+    auto mat_y_neg = outputs[1].matrix<float>();
+    ASSERT_TRUE(outputs[1].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat_y_neg(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat_y_neg(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat_y_neg(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_y_neg(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Wire the output of "Neg(Matmul(a, x))" to the output of "a",
+    // creating an invalid cycle.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(y_ + ":0");
+    c->set_to_tensor(a_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "would create a cycle"));
+  }
+
+  {
+    // Attempt to wire a non-existent node to a node that does exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor("unknown_node:0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown node"));
+  }
+
+  {
+    // Attempt to wire a non-existent output from a node that does
+    // exist to another node.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":17");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown edge"));
+  }
+
+  {
+    // Attempt to wire a tensor to a node that doesn't exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor("unknown_node:0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsNotFound(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "unable to find feed output"));
+  }
+
+  {
+    // Attempt to wire two tensors to the same tensor.
+    CallableOptions callable_options;
+    TensorConnection* c1 = callable_options.add_tensor_connection();
+    c1->set_from_tensor(a_ + ":0");
+    c1->set_to_tensor(y_neg_ + ":0");
+    TensorConnection* c2 = callable_options.add_tensor_connection();
+    c2->set_from_tensor(x_ + ":0");
+    c2->set_to_tensor(y_neg_ + ":0");
+    callable_options.add_fetch(z_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+  }
+
+  {
+    // Attempt to wire a tensor to a tensor that is also being fed.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_feed(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+  }
+}
+
 TEST_F(DirectSessionMinusAXTest, TestFeed) {
   Initialize({1, 2, 3, 4});
   auto session = CreateSession();
@@ -139,6 +363,39 @@ TEST_F(DirectSessionMinusAXTest, TestFeed) {
   EXPECT_FLOAT_EQ(39.0, mat(1, 0));
 }
 
+TEST_F(DirectSessionMinusAXTest, TestFeed_Callable) {
+  Initialize({1, 2, 3, 4});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+
+  TF_ASSERT_OK(session->Create(def_));
+
+  // Fill in the input and ask for the output
+  //
+  // Note that the input being fed is on the second device.
+  CallableOptions callable_options;
+  callable_options.add_feed(x_);
+  callable_options.add_fetch(y_ + ":0");
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(session->MakeCallable(MakeCallableOptions({x_}, {y_ + ":0"}, {}),
+                                     &handle));
+  Tensor t(DT_FLOAT, TensorShape({2, 1}));
+  t.matrix<float>()(0, 0) = 5;
+  t.matrix<float>()(1, 0) = 6;
+  std::vector<Tensor> inputs = {t};
+  std::vector<Tensor> outputs;
+
+  // Run the callable
+  TF_ASSERT_OK(session->RunCallable(handle, inputs, &outputs, nullptr));
+
+  ASSERT_EQ(1, outputs.size());
+  auto mat = outputs[0].matrix<float>();
+
+  // Expect outputs to be; 1*5 + 2*6, 3*5 + 4*6
+  EXPECT_FLOAT_EQ(17.0, mat(0, 0));
+  EXPECT_FLOAT_EQ(39.0, mat(1, 0));
+}
+
 TEST_F(DirectSessionMinusAXTest, TestConcurrency) {
   Initialize({1, 2, 3, 4});
   auto session = CreateSession();
@@ -171,6 +428,39 @@ TEST_F(DirectSessionMinusAXTest, TestConcurrency) {
   delete tp;
 }
 
+TEST_F(DirectSessionMinusAXTest, TestConcurrency_Callable) {
+  Initialize({1, 2, 3, 4});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  // Fill in the input and ask for the output
+  thread::ThreadPool* tp = new thread::ThreadPool(Env::Default(), "test", 4);
+
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(
+      session->MakeCallable(MakeCallableOptions({}, {y_ + ":0"}, {}), &handle));
+
+  // Run the callable 1000 times in 4 different threads concurrently.
+  auto fn = [&session, handle]() {
+    for (int i = 0; i < 1000; ++i) {
+      std::vector<Tensor> outputs;
+      // Run the graph
+      TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+      ASSERT_EQ(1, outputs.size());
+      auto mat = outputs[0].matrix<float>();
+      EXPECT_FLOAT_EQ(3.0, mat(0, 0));
+    }
+  };
+
+  for (int i = 0; i < 4; ++i) {
+    tp->Schedule(fn);
+  }
+
+  // Wait for the functions to finish.
+  delete tp;
+}
+
 TEST_F(DirectSessionMinusAXTest, TestPerSessionThreads) {
   Initialize({1, 2, 3, 4});
 
@@ -296,6 +586,38 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetworkWithOpts) {
   EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 2);
 }
 
+TEST_F(DirectSessionMinusAXTest, RunSimpleNetworkWithOpts_Callable) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  // Request two targets: one fetch output and one non-fetched output.
+  Session::CallableHandle handle;
+  CallableOptions callable_options =
+      MakeCallableOptions({}, {y_ + ":0"}, {y_neg_});
+  callable_options.mutable_run_options()->set_trace_level(
+      RunOptions::FULL_TRACE);
+  TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+
+  RunMetadata run_metadata;
+  EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 0);
+
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, &run_metadata));
+
+  ASSERT_EQ(1, outputs.size());
+  // The first output should be initialized and have the correct
+  // output.
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(5.0, mat(0, 0));
+
+  // Checks RunMetadata is well-formed
+  ASSERT_TRUE(run_metadata.has_step_stats());
+  EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 2);
+}
+
 TEST(DirectSessionTest, KeepsStateAcrossRunsOfSession) {
   GraphDef def;
   Graph g(OpRegistry::Global());
@@ -405,7 +727,139 @@ TEST(DirectSessionTest, MultipleFeedTest) {
       {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
       &outputs);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+}
+
+TEST(DirectSessionTest, MultipleFeedTest_Callable) {
+  GraphDef def;
+  Graph g(OpRegistry::Global());
+
+  Tensor first_value(DT_FLOAT, TensorShape({}));
+  first_value.scalar<float>()() = 1.0;
+  Node* first_const = test::graph::Constant(&g, first_value);
+  Node* first_identity = test::graph::Identity(&g, first_const);
+
+  Tensor second_value(DT_FLOAT, TensorShape({}));
+  second_value.scalar<float>()() = 2.0;
+  Node* second_const = test::graph::Constant(&g, second_value);
+  Node* second_identity = test::graph::Identity(&g, second_const);
+
+  test::graph::ToGraphDef(&g, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  Session::CallableHandle handle;
+  std::vector<Tensor> outputs;
+
+  // Fetch without feeding.
+  TF_ASSERT_OK(session->MakeCallable(
+      MakeCallableOptions(
+          {}, {first_identity->name() + ":0", second_identity->name() + ":0"},
+          {}),
+      &handle));
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(1.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(2.0, outputs[1].flat<float>()(0));
+
+  TF_ASSERT_OK(session->MakeCallable(
+      MakeCallableOptions(
+          {}, {second_identity->name() + ":0", first_identity->name() + ":0"},
+          {}),
+      &handle));
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(2.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(1.0, outputs[1].flat<float>()(0));
+
+  Tensor value_11(DT_FLOAT, TensorShape({}));
+  value_11.scalar<float>()() = 11.0;
+  Tensor value_22(DT_FLOAT, TensorShape({}));
+  value_22.scalar<float>()() = 22.0;
+
+  // Feed [first_const, second_const]
+  TF_ASSERT_OK(session->MakeCallable(
+      MakeCallableOptions(
+          {first_const->name(), second_const->name()},
+          {first_identity->name() + ":0", second_identity->name() + ":0"}, {}),
+      &handle));
+  TF_ASSERT_OK(
+      session->RunCallable(handle, {value_11, value_22}, &outputs, nullptr));
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(11.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(22.0, outputs[1].flat<float>()(0));
+
+  // Feed [second_const, first_const]
+  TF_ASSERT_OK(session->MakeCallable(
+      MakeCallableOptions(
+          {second_const->name(), first_const->name()},
+          {first_identity->name() + ":0", second_identity->name() + ":0"}, {}),
+      &handle));
+  TF_ASSERT_OK(
+      session->RunCallable(handle, {value_22, value_11}, &outputs, nullptr));
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(11.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(22.0, outputs[1].flat<float>()(0));
+
+  // Feed [first_const, first_const]
+  Status s = session->MakeCallable(
+      MakeCallableOptions(
+          {first_const->name(), first_const->name()},
+          {first_identity->name() + ":0", second_identity->name() + ":0"}, {}),
+      &handle);
+  EXPECT_TRUE(errors::IsInvalidArgument(s));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+}
+
+TEST(DirectSessionTest, TestTensorConnectionUseTwice) {
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a_tensor, {1.0, 2.0, 3.0, 4.0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+
+  Tensor dummy_tensor(DT_FLOAT, TensorShape({1}));
+  test::FillValues<float>(&dummy_tensor, {-1.0});
+
+  Node* left = test::graph::Constant(&graph, dummy_tensor);
+  Node* right = test::graph::Constant(&graph, dummy_tensor);
+
+  // y = A * x
+  Node* y = test::graph::Add(&graph, left, right);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  CallableOptions callable_options;
+  // Directly wire the output of node a to the outputs of nodes left
+  // and right, making the callable graph into "a + a;".
+  TensorConnection* c_left = callable_options.add_tensor_connection();
+  c_left->set_from_tensor(a->name() + ":0");
+  c_left->set_to_tensor(left->name() + ":0");
+  TensorConnection* c_right = callable_options.add_tensor_connection();
+  c_right->set_from_tensor(a->name() + ":0");
+  c_right->set_to_tensor(right->name() + ":0");
+
+  callable_options.add_fetch(y->name() + ":0");
+
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+  ASSERT_EQ(1, outputs.size());
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(2.0, mat(0, 0));
+  EXPECT_FLOAT_EQ(4.0, mat(0, 1));
+  EXPECT_FLOAT_EQ(6.0, mat(1, 0));
+  EXPECT_FLOAT_EQ(8.0, mat(1, 1));
+  TF_ASSERT_OK(session->ReleaseCallable(handle));
 }
 
 TEST(DirectSessionTest, FetchMultipleTimes) {
@@ -600,8 +1054,8 @@ TEST(DirectSessionTest, PartialRunMissingFeed) {
   s = session->PRun(handle, {{first_const->name(), value_11}},
                     {third_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 }
 
 TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
@@ -630,8 +1084,8 @@ TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
   // Fetch fourth_identity without feeds.
   s = session->PRun(handle, {}, {fourth_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 
   // Feed switch_node:1 and fetch fourth_identity.
   s = session->PRun(handle, {{switch_node->name() + ":1", bool_value}},
@@ -694,6 +1148,59 @@ TEST(DirectSessionTest, RunHandleTest) {
   ASSERT_TRUE(s.ok());
 }
 
+TEST(DirectSessionTest, RunHandleTest_Callable) {
+  GraphDef def;
+  Graph g(OpRegistry::Global());
+
+  Tensor value0(DT_FLOAT, TensorShape({}));
+  value0.scalar<float>()() = 1.0;
+  Node* const0 = test::graph::Constant(&g, value0);
+  Node* identity0 = test::graph::Identity(&g, const0);
+
+  Tensor value1(DT_FLOAT, TensorShape({}));
+  value1.scalar<float>()() = 2.0;
+  Node* const1 = test::graph::Constant(&g, value1);
+  Node* node3 = test::graph::Add(&g, identity0, const1);
+  Node* node4 = test::graph::Unary(&g, "GetSessionHandleV2", node3);
+
+  Tensor value2(DT_STRING, TensorShape({}));
+  Node* const2 = test::graph::Constant(&g, value2);
+  Node* node5 = test::graph::GetSessionTensor(&g, const2);
+  Node* node6 = test::graph::Add(&g, node5, const1);
+
+  Node* node7 = test::graph::Unary(&g, "DeleteSessionTensor", const2);
+
+  test::graph::ToGraphDef(&g, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  // First run call: Create a handle.
+  std::vector<Tensor> outputs;
+  Status s = session->Run({}, {node4->name() + ":0"}, {}, &outputs);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ(1, outputs.size());
+
+  ResourceHandle resource_handle = outputs[0].scalar<ResourceHandle>()();
+  Tensor string_handle(DT_STRING, {});
+  string_handle.flat<string>().setConstant(resource_handle.name());
+
+  // Second run call: Use a handle.
+  std::vector<Tensor> outputs1;
+  s = session->Run({{const2->name(), string_handle}}, {node6->name() + ":0"},
+                   {}, &outputs1);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ(1, outputs1.size());
+  ASSERT_EQ(5.0, outputs1[0].flat<float>()(0));
+
+  // Third run call: Delete a handle.
+  std::vector<Tensor> outputs2;
+  s = session->Run({{const2->name(), string_handle}}, {}, {node7->name()},
+                   &outputs2);
+  ASSERT_TRUE(s.ok());
+}
+
 TEST(DirectSessionTest, CreateGraphFailsWhenAssigningAFedVar) {
   Graph graph(OpRegistry::Global());
 
@@ -868,59 +1375,14 @@ TEST(DirectSessionTest, TestTimeoutCleanShutdown) {
   TF_ASSERT_OK(session->Close());
 }
 
-class BlockingOpState {
- public:
-  void AwaitState(int awaiting_state) {
-    mutex_lock ml(mu_);
-    while (state_ != awaiting_state) {
-      cv_.wait(ml);
-    }
-  }
-  void MoveToState(int expected_current, int next) {
-    mutex_lock ml(mu_);
-    CHECK_EQ(expected_current, state_);
-    state_ = next;
-    cv_.notify_all();
-  }
-
- private:
-  mutex mu_;
-  condition_variable cv_;
-  int state_ = 0;
-};
-static BlockingOpState* blocking_op_state = nullptr;
-
-// BlockingOp blocks on the global <blocking_op_state's> state,
-// and also updates it when it is unblocked and finishing computation.
-class BlockingOp : public OpKernel {
- public:
-  explicit BlockingOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) override {
-    blocking_op_state->MoveToState(0, 1);
-    blocking_op_state->AwaitState(2);
-    blocking_op_state->MoveToState(2, 3);
-
-    Tensor* out = nullptr;
-    const Tensor& in = ctx->input(0);
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in.shape(), &out));
-    out->flat<float>() = in.flat<float>();
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
-REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc("");
-
 static void TestSessionInterOpThreadsImpl(bool use_function_lib,
                                           bool use_global_pools) {
+  using test::function::blocking_op_state;
+  using test::function::BlockingOpState;
+
   FunctionDefLibrary library_graph_def;
   if (use_function_lib) {
-    const string lib = R"proto(
-        signature: {
-          name: "BlockingOpFn" input_arg: { name: "x" type: DT_FLOAT }
-                               output_arg: { name: "y" type: DT_FLOAT }}
-        node_def: { name: "y" op: "BlockingOp" input: "x" }
-        ret: { key: "y" value: "y:y:0" } )proto";
-    CHECK(protobuf::TextFormat::ParseFromString(
-        lib, library_graph_def.add_function()));
+    *library_graph_def.add_function() = test::function::BlockingOpFn();
   }
 
   FunctionLibraryDefinition flib(OpRegistry::Global(), library_graph_def);
@@ -1153,6 +1615,11 @@ TEST(DirectSessionTest, TestDirectSessionRunClose) {
   EXPECT_EQ(t.scalar<float>()(), outputs[0].scalar<float>()());
   outputs.clear();
 
+  // Make a callable handle before closing the session.
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(session->MakeCallable(
+      MakeCallableOptions({}, {}, {var_assign->name()}), &handle));
+
   // Close the session.
   TF_ASSERT_OK(session->Close());
 
@@ -1160,6 +1627,10 @@ TEST(DirectSessionTest, TestDirectSessionRunClose) {
   Status s = session->Run({} /* inputs */, {},
                           {var_assign->name()} /* target_nodes */, nullptr);
   EXPECT_EQ("Cancelled: Session has been closed.", s.ToString());
+
+  // Run the read as a callable to verify that we get the same error.
+  s = session->RunCallable(handle, {}, {}, nullptr);
+  EXPECT_EQ("Cancelled: Session has been closed.", s.ToString());
 }
 
 TEST(DirectSessionTest, TestDirectSessionPRunClose) {
@@ -1261,7 +1732,8 @@ TEST(DirectSessionTest, LocalDeviceManager) {
 
 // A simple benchmark for the overhead of `DirectSession::Run()` calls
 // with varying numbers of feeds/fetches.
-void FeedFetchBenchmarkHelper(int iters, int num_feeds) {
+void FeedFetchBenchmarkHelper(int iters, int num_feeds,
+                              bool use_make_callable) {
   testing::StopTiming();
 
   Tensor value(DT_FLOAT, TensorShape());
@@ -1297,29 +1769,55 @@ void FeedFetchBenchmarkHelper(int iters, int num_feeds) {
   SessionOptions opts;
   std::unique_ptr<Session> session(NewSession(opts));
   TF_CHECK_OK(session->Create(gd));
-  {
-    // NOTE(mrry): Ignore the first run, which will incur the graph
-    // partitioning/pruning overhead and skew the results.
-    //
-    // Note that we should also optimize and monitor the overhead on
-    // the first run, which will impact application startup times, but
-    // that is not the object of study in this benchmark.
-    std::vector<Tensor> output_values;
-    TF_CHECK_OK(session->Run(inputs, outputs, {}, &output_values));
-  }
-  testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    std::vector<Tensor> output_values;
-    TF_CHECK_OK(session->Run(inputs, outputs, {}, &output_values));
+  if (use_make_callable) {
+    Session::CallableHandle handle;
+    CallableOptions callable_options;
+    std::vector<Tensor> input_tensors;
+    for (const auto& input : inputs) {
+      callable_options.add_feed(input.first);
+      input_tensors.push_back(input.second);
+    }
+    for (const string& output : outputs) {
+      callable_options.add_fetch(output);
+    }
+    TF_CHECK_OK(session->MakeCallable(callable_options, &handle));
+
+    testing::StartTiming();
+    for (int i = 0; i < iters; ++i) {
+      std::vector<Tensor> output_values;
+      TF_CHECK_OK(
+          session->RunCallable(handle, input_tensors, &output_values, nullptr));
+    }
+    testing::StopTiming();
+  } else {
+    {
+      // NOTE(mrry): Ignore the first run, which will incur the graph
+      // partitioning/pruning overhead and skew the results.
+      //
+      // Note that we should also optimize and monitor the overhead on
+      // the first run, which will impact application startup times, but
+      // that is not the object of study in this benchmark.
+      std::vector<Tensor> output_values;
+      TF_CHECK_OK(session->Run(inputs, outputs, {}, &output_values));
+    }
+    testing::StartTiming();
+    for (int i = 0; i < iters; ++i) {
+      std::vector<Tensor> output_values;
+      TF_CHECK_OK(session->Run(inputs, outputs, {}, &output_values));
+    }
+    testing::StopTiming();
   }
-  testing::StopTiming();
 }
 
 void BM_FeedFetch(int iters, int num_feeds) {
-  FeedFetchBenchmarkHelper(iters, num_feeds);
+  FeedFetchBenchmarkHelper(iters, num_feeds, /* use_make_callable */ false);
+}
+void BM_FeedFetchCallable(int iters, int num_feeds) {
+  FeedFetchBenchmarkHelper(iters, num_feeds, /* use_make_callable */ true);
 }
 
 BENCHMARK(BM_FeedFetch)->Arg(1)->Arg(2)->Arg(5)->Arg(10);
+BENCHMARK(BM_FeedFetchCallable)->Arg(1)->Arg(2)->Arg(5)->Arg(10);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 31fb128f937ae46eefb309fc9bab8167e54846a7..b4dd521bbc80c8ff4fd921b4cb285cb2178c1ed2 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -102,9 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
         if (node->name() == y->name()) {
-          EXPECT_EQ(3, cm->AllocationId(node, 0));
+          EXPECT_EQ(7, cm->AllocationId(node, 0));
         } else {
-          EXPECT_EQ(4, cm->AllocationId(node, 0));
+          EXPECT_EQ(8, cm->AllocationId(node, 0));
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/dma_helper.h b/tensorflow/core/common_runtime/dma_helper.h
index 1cc8b9e723a26ba5848291a75029871d559a6b5b..cdfce1f366be66785a63a169c2107c2aaede1396 100644
--- a/tensorflow/core/common_runtime/dma_helper.h
+++ b/tensorflow/core/common_runtime/dma_helper.h
@@ -28,6 +28,9 @@ class DMAHelper {
   static void* base(Tensor* t) { return t->base<void>(); }
   static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
   static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
+  static void UnsafeSetShape(Tensor* t, const TensorShape& s) {
+    t->set_shape(s);
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..13d6b021b549e88c49ca5d38f1e3d3cbeff95e56
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -0,0 +1,173 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "eager_executor",
+    srcs = [
+        "eager_executor.cc",
+    ],
+    hdrs = [
+        "eager_executor.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cuda_library(
+    name = "context",
+    srcs = [
+        "context.cc",
+    ],
+    hdrs = [
+        "context.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":eager_executor",
+        ":kernel_and_device",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cuda_library(
+    name = "eager_operation",
+    srcs = [
+        "eager_operation.cc",
+    ],
+    hdrs = [
+        "eager_operation.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":tensor_handle",
+        "//tensorflow/c/eager:runtime",
+    ],
+)
+
+tf_cuda_library(
+    name = "tensor_handle",
+    srcs = [
+        "tensor_handle.cc",
+    ],
+    hdrs = [
+        "tensor_handle.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":eager_executor",
+        ":kernel_and_device",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cuda_library(
+    name = "copy_to_device_node",
+    hdrs = [
+        "copy_to_device_node.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":eager_executor",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cuda_library(
+    name = "kernel_and_device",
+    srcs = [
+        "kernel_and_device.cc",
+    ],
+    hdrs = [
+        "kernel_and_device.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_and_device_test",
+    srcs = ["kernel_and_device_test.cc"],
+    deps = [
+        ":kernel_and_device",
+        "//tensorflow/c/eager:runtime",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "execute",
+    srcs = ["execute.cc"],
+    hdrs = [
+        "execute.h",
+        "execute_node.h",
+    ],
+    deps = [
+        ":context",
+        ":copy_to_device_node",
+        ":eager_executor",
+        ":eager_operation",
+        ":kernel_and_device",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3fe6a7edeabecdaba0d894cca700608b896026d
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+
+#include "tensorflow/core/common_runtime/process_util.h"
+
+namespace tensorflow {
+
+EagerContext::EagerContext(const SessionOptions& opts,
+                           ContextDevicePlacementPolicy default_policy,
+                           bool async, std::unique_ptr<DeviceMgr> device_mgr,
+                           Rendezvous* rendezvous)
+    : policy_(default_policy),
+      device_manager_(std::move(device_mgr)),
+      devices_(device_manager_->ListDevices()),
+      rendezvous_(rendezvous),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
+      pflr_(new ProcessFunctionLibraryRuntime(
+          device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+          {}, thread_pool_.get())),
+      log_device_placement_(opts.config.log_device_placement()),
+      async_default_(async) {
+  if (async_default_) {
+    executor_.EnableAsync();
+  }
+
+  for (auto* device : devices_) {
+    devices_map_[device->name()] = device;
+  }
+}
+
+bool EagerContext::Async() const {
+  mutex_lock l(async_map_mu_);
+  return gtl::FindWithDefault(thread_local_async_, std::this_thread::get_id(),
+                              async_default_);
+}
+
+Status EagerContext::SetAsyncForThread(bool async) {
+  {
+    tensorflow::mutex_lock l(async_map_mu_);
+    thread_local_async_[std::this_thread::get_id()] = async;
+  }
+  if (async) {
+    executor_.EnableAsync();
+  } else {
+    // TODO(agarwal): Currently we add a wait here to handle cases where a
+    // sync op has a control dependency on an async op, and the latter has not
+    // executed yet. This wait can be removed by storing all the control
+    // inputs and waiting for them when executing ops.
+    return executor_.WaitForAllPendingNodes();
+  }
+  return Status::OK();
+}
+
+void EagerContext::ClearCaches() {
+  mutex_lock ml(cache_mu_);
+  gtl::STLDeleteValues(&kernel_cache_);
+}
+
+void EagerContext::SetThreadLocalDevicePlacementPolicy(
+    ContextDevicePlacementPolicy policy) {
+  mutex_lock ml(policy_map_mu_);
+  thread_local_policies_[std::this_thread::get_id()] = policy;
+}
+
+ContextDevicePlacementPolicy EagerContext::GetDevicePlacementPolicy() {
+  mutex_lock ml(policy_map_mu_);
+  auto policy_map_it = thread_local_policies_.find(std::this_thread::get_id());
+  if (policy_map_it != thread_local_policies_.end()) {
+    return policy_map_it->second;
+  }
+  return policy_;
+}
+
+EagerContext::~EagerContext() {
+  executor_.WaitForAllPendingNodes().IgnoreError();
+  ClearCaches();
+  rendezvous_->Unref();
+}
+
+bool EagerContext::FindFunctionByName(const string& name) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.Find(name) != nullptr;
+}
+
+Status EagerContext::FindFunctionOpData(
+    const string& name, const tensorflow::OpRegistrationData** op_data) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.LookUp(name, op_data);
+}
+
+const FunctionDef* EagerContext::FindFunctionDef(const string& name) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.Find(name);
+}
+
+Status EagerContext::FindDeviceByName(const string& name, Device** result) {
+  auto it = devices_map_.find(name);
+  if (it == devices_map_.end()) {
+    return errors::InvalidArgument(name, " unknown device.");
+  }
+  *result = it->second;
+  return Status::OK();
+}
+
+Status EagerContext::AddFunctionDef(const FunctionDef& fdef) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.AddFunctionDef(fdef);
+}
+
+KernelAndDevice* EagerContext::GetCachedKernel(Fprint128 cache_key) {
+  tf_shared_lock l(cache_mu_);
+  return gtl::FindPtrOrNull(kernel_cache_, cache_key);
+}
+
+void EagerContext::AddKernelToCache(Fprint128 cache_key,
+                                    KernelAndDevice* kernel) {
+  mutex_lock ml(cache_mu_);
+  gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
+}
+
+void EagerContext::SetShouldStoreMetadata(bool value) {
+  should_store_metadata_.store(value);
+  if (!value) {
+    mutex_lock ml(metadata_mu_);
+    run_metadata_.Clear();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..6665df27d09a73d4c30756cf01e383834fcae339
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -0,0 +1,193 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Note: there's a copy enum in eager/c_api.h. It should be kept in sync.
+enum ContextDevicePlacementPolicy {
+  // Running operations with input tensors on the wrong device will fail.
+  DEVICE_PLACEMENT_EXPLICIT = 0,
+  // Copy the tensor to the right device but log a warning.
+  DEVICE_PLACEMENT_WARN = 1,
+  // Silently copy the tensor, which has a performance cost since the operation
+  // will be blocked till the copy completes. This is the default policy.
+  DEVICE_PLACEMENT_SILENT = 2,
+  // Default placement policy which silently copies int32 tensors but not other
+  // dtypes.
+  DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
+};
+
+class EagerContext {
+ public:
+  explicit EagerContext(const SessionOptions& opts,
+                        ContextDevicePlacementPolicy default_policy, bool async,
+                        std::unique_ptr<DeviceMgr> device_mgr,
+                        Rendezvous* rendezvous);
+
+  ~EagerContext();
+
+  // Returns the function library runtime for the given device.
+  FunctionLibraryRuntime* func_lib(Device* d) const {
+    return pflr_->GetFLR(d->name());
+  }
+
+  // True if running in asynchronous mode.
+  bool Async() const;
+
+  EagerExecutor* Executor() { return &executor_; }
+
+  // Sets whether this thread should run in synchronous or asynchronous mode.
+  Status SetAsyncForThread(bool async);
+
+  // TODO(apassos) make this return a constant reference
+  gtl::FlatMap<string, Device*, StringPieceHasher>* device_map() {
+    return &devices_map_;
+  }
+
+  // TODO(apassos) make this return a constant reference
+  std::vector<Device*>* devices() { return &devices_; }
+
+  // Clears the kernel caches.
+  void ClearCaches();
+
+  // Sets the device placement policy for the current thread.
+  void SetThreadLocalDevicePlacementPolicy(ContextDevicePlacementPolicy policy);
+
+  // Returns the device placement policy for the current thread.
+  ContextDevicePlacementPolicy GetDevicePlacementPolicy();
+
+  Status AsyncWait() { return executor_.WaitForAllPendingNodes(); }
+
+  Status GetStatus() { return executor_.status(); }
+
+  void ClearAsyncError() { executor_.ClearError(); }
+
+  bool FindFunctionByName(const string& name);
+
+  Status FindFunctionOpData(const string& name,
+                            const tensorflow::OpRegistrationData** op_data);
+
+  const FunctionDef* FindFunctionDef(const string& name);
+
+  Status FindDeviceByName(const string& name, Device** result);
+
+  Device* HostCPU() { return devices_[0]; }
+
+  uint64 NextId() { return executor_.NextId(); }
+
+  void ExecutorAdd(EagerNode* node) { executor_.Add(node); }
+
+  Status AddFunctionDef(const FunctionDef& fdef);
+
+  KernelAndDevice* GetCachedKernel(Fprint128 cache_key);
+
+  void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel);
+
+  bool LogDevicePlacement() { return log_device_placement_; }
+
+  Rendezvous* GetRendezvous() { return rendezvous_; }
+
+  mutex* FunctionsMu() { return &functions_mu_; }
+
+  tensorflow::DeviceMgr* device_mgr() { return device_manager_.get(); }
+
+  // TODO(apassos) remove the need for this
+  void ReleaseDeviceMgr() { device_manager_.release(); }
+
+  // TODO(apassos) clean up RunMetadata storage.
+  mutex* MetadataMu() { return &metadata_mu_; }
+  bool ShouldStoreMetadata() { return should_store_metadata_.load(); }
+  void SetShouldStoreMetadata(bool value);
+  RunMetadata* RunMetadataProto() { return &run_metadata_; }
+
+  FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; }
+
+ private:
+  const ContextDevicePlacementPolicy policy_;
+
+  // Note: we cannot use C++11 thread_local here as there is no concept of a
+  // thread-local-object-local variable in C++11.
+  mutex policy_map_mu_;
+  std::unordered_map<std::thread::id, ContextDevicePlacementPolicy>
+      thread_local_policies_ GUARDED_BY(policy_map_mu_);
+
+  std::unique_ptr<DeviceMgr> device_manager_;
+  // Devices owned by device_manager
+  std::vector<Device*> devices_;
+  // All devices are not owned.
+  gtl::FlatMap<string, Device*, StringPieceHasher> devices_map_;
+  Rendezvous* const rendezvous_;
+
+  mutex functions_mu_;
+  FunctionLibraryDefinition func_lib_def_ GUARDED_BY(functions_mu_){
+      OpRegistry::Global(), {}};
+
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+
+  // One FunctionLibraryRuntime per device.
+  // func_libs[i] is the FunctionLibraryRuntime corresponding to
+  // session->devices[i].
+  const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+
+  mutex cache_mu_;
+  std::unordered_map<Fprint128, KernelAndDevice*, Fprint128Hasher> kernel_cache_
+      GUARDED_BY(cache_mu_);
+
+  // Whether we should compute RunMetadata.
+  std::atomic<bool> should_store_metadata_{false};
+  mutex metadata_mu_;
+  RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
+  const bool log_device_placement_;
+  // EagerExecutor for async execution.
+  EagerExecutor executor_;
+
+  // True if the default value for execution mode is async. Note that this value
+  // can be overridden per thread based on `thread_local_async` overrides.
+  const bool async_default_;
+  mutable mutex async_map_mu_;
+  std::unordered_map<std::thread::id, bool> thread_local_async_
+      GUARDED_BY(async_map_mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a887540b066055fc1f59e64e0cead9f2512178e
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class CopyToDeviceNode : public EagerNode {
+ public:
+  CopyToDeviceNode(TensorHandle* src, Device* dstd, EagerContext* ctx)
+      : EagerNode(ctx->NextId()),
+        src_(src),
+        dstd_(dstd),
+        ctx_(ctx),
+        dst_(new TensorHandle(id, src_->dtype, ctx)) {
+    src_->Ref();
+    dst_->Ref();
+  }
+
+  ~CopyToDeviceNode() override {
+    src_->Unref();
+    dst_->Unref();
+  }
+
+  Status Run() override {
+    TensorHandle* temp = nullptr;
+    TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &temp));
+    const Tensor* tensor = nullptr;
+    Device* device = nullptr;
+    Device* op_device = nullptr;
+    Status status = temp->TensorAndDevice(&tensor, &device, &op_device);
+    // `temp` is a ready handle. So the following call should return OK.
+    TF_DCHECK_OK(status) << status.error_message();
+    DCHECK(tensor);
+    dst_->SetTensorAndDevice(*tensor, device, op_device);
+    temp->Unref();
+    return Status::OK();
+  }
+
+  TensorHandle* dst() { return dst_; }
+
+ private:
+  TensorHandle* src_;
+  Device* dstd_;
+  EagerContext* ctx_;
+  TensorHandle* dst_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_
diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b699036e9697576adca403d5919b341d8f919db0
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_executor.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+
+namespace tensorflow {
+
+EagerNode::EagerNode(tensorflow::uint64 id) : id(id) {}
+
+EagerExecutor::~EagerExecutor() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  thread_done_ = true;
+  nodes_pending_.notify_all();
+}
+
+tensorflow::uint64 EagerExecutor::NextId() {
+  tensorflow::mutex_lock l(next_id_mutex_);
+  return next_id_++;
+}
+
+void EagerExecutor::EnableAsync() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  if (thread_ == nullptr) {
+    thread_.reset(tensorflow::Env::Default()->StartThread(
+        tensorflow::ThreadOptions(), "eager_async_executor",
+        std::bind(&EagerExecutor::Run, this)));
+  }
+}
+
+void EagerExecutor::Add(EagerNode* node) {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  DCHECK(thread_) << "EnableAsync should have been called before Add";
+  if (!status_.ok()) {
+    delete node;
+    return;
+  }
+  int64 qlen = node_queue_.size();
+  if (qlen > 0) {
+    if (node_queue_.back()->id >= node->id) {
+      status_ = tensorflow::errors::InvalidArgument(
+          "Inserting EagerNode with non-increasing ids:",
+          node_queue_.back()->id, " vs ", node->id);
+      delete node;
+      return;
+    }
+    node_queue_.push(node);
+  } else {
+    node_queue_.push(node);
+    nodes_pending_.notify_all();
+  }
+}
+
+tensorflow::Status EagerExecutor::WaitFor(tensorflow::uint64 node_id) {
+  return WaitImpl(false, node_id);
+}
+
+tensorflow::Status EagerExecutor::WaitForAllPendingNodes() {
+  return WaitImpl(true, 0);
+}
+
+tensorflow::Status EagerExecutor::WaitImpl(bool wait_all,
+                                           tensorflow::uint64 node_id) {
+  tensorflow::condition_variable cond;
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  // Don't wait if an error is already set.
+  if (!status_.ok()) return status_;
+  if (node_queue_.empty()) return tensorflow::Status::OK();
+  if (wait_all) {
+    node_id = node_queue_.back()->id;
+  } else if (node_id < node_queue_.front()->id) {
+    // Note that we are relying on the ops being dispatched sequentially from
+    // the queue.
+    return tensorflow::Status::OK();
+  }
+  node_done_notifications_.insert(std::make_pair(node_id, &cond));
+  cond.wait(l);
+  // Note that we could be woken up if an error occurs, even though the node has
+  // not actually executed.
+  return status_;
+}
+
+void EagerExecutor::ClearError() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  if (status_.ok()) return;
+  // If an error was set, node_done_notifications_ and node_queue_ should have
+  // been cleared, and no new entries should have been added since.
+  DCHECK(node_done_notifications_.empty());
+  DCHECK(node_queue_.empty());
+  status_ = tensorflow::Status::OK();
+  nodes_pending_.notify_all();
+}
+
+tensorflow::Status EagerExecutor::status() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  return status_;
+}
+
+void EagerExecutor::Run() {
+  while (true) {
+    std::unique_ptr<EagerNode> curr_node;
+    {
+      tensorflow::mutex_lock l(node_queue_mutex_);
+      while (node_queue_.empty() || !status_.ok()) {
+        if (thread_done_) return;
+        nodes_pending_.wait(l);
+      }
+      curr_node.reset(node_queue_.front());
+    }
+    tensorflow::Status status = curr_node->Run();
+    const bool ok = status.ok();
+    tensorflow::mutex_lock l(node_queue_mutex_);
+    node_queue_.pop();
+    if (!ok) {
+      status_ = status;
+      // TODO(agarwal): mark all affected handles as corrupted before clearing
+      // this queue.
+      // We remove any pending ops so that we don't try to execute them if
+      // ClearError is called.
+      for (int i = 0; i < node_queue_.size(); ++i) {
+        delete node_queue_.front();
+        node_queue_.pop();
+      }
+    }
+    if (!node_done_notifications_.empty()) {
+      tensorflow::uint64 node_id = curr_node->id;
+      // Note that we notify all waiting threads in case an error has occurred.
+      // These calling threads are responsible for checking status_ before
+      // proceeding.
+      const auto range = ok ? node_done_notifications_.equal_range(node_id)
+                            : make_pair(node_done_notifications_.begin(),
+                                        node_done_notifications_.end());
+      for (auto it = range.first; it != range.second; ++it) {
+        it->second->notify_all();
+      }
+      node_done_notifications_.erase(range.first, range.second);
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_executor.h b/tensorflow/core/common_runtime/eager/eager_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..021daeb21d2ecb033b8017f012a148dafa092c01
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_executor.h
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// A unit of execution for the EagerExecutor class below. Example subclasses
+// encapsulate execution of a TFE_Op, or copying a TFE_TensorHandle from one
+// device to another.
+class EagerNode {
+ public:
+  explicit EagerNode(uint64 id);
+
+  virtual ~EagerNode() {}
+
+  // Runs the computation corresponding to this node and blocks till the
+  // execution is done.
+  virtual Status Run() = 0;
+
+  // An id unique to the TFE_Context under which this node is created. Allocated
+  // monotonically.
+  const uint64 id;
+};
+
+// A class for handling async execution (see TFE_ContextSetAsync).
+// Note that this class is thread-safe.
+// TODO(agarwal): TFE_OpAddInput may currently block if it tries to access the
+// device of the input handle. Fix that.
+// TODO(agarwal): On error, mark all affected handles as corrupted.
+// TODO(agarwal): Implement support for control dependencies.
+// TODO(agarwal): Support out-of-order execution and dispatching multiple
+// EagerNode in parallel.
+// TODO(agarwal): Implement optimizations over EagerNode traces.
+class EagerExecutor {
+ public:
+  ~EagerExecutor();
+
+  // This is called whenever async mode is enabled. Note that it may be called
+  // multiple times as different calling threads may switch async mode on or off
+  // independently.
+  void EnableAsync();
+
+  // Helper function to create monotonically increasing ids unique to this
+  // object.
+  uint64 NextId();
+
+  // Schedules `node` for execution.
+  // Note that Add must be called in monotonically increasing order of node->id.
+  void Add(EagerNode* node);
+
+  // Causes the caller to block till node with id `node_id` has finished
+  // execution.
+  Status WaitFor(uint64 node_id);
+
+  // Blocks till all currently pending ops are done.
+  Status WaitForAllPendingNodes();
+
+  // Clears all currently set errors which re-enables async execution.
+  void ClearError();
+
+  // Returns Status based on any errors that occurred during async execution.
+  Status status();
+
+ private:
+  // Starts execution of pending EagerNodes. This function loops till
+  // thread_done_ is set to true. If any errors are encontered, these are set
+  // inside `status_`. The loop blocks anytime there are no pending nodes, or if
+  // `status_` is not ok.
+  void Run();
+
+  Status WaitImpl(bool wait_all, uint64 node_id);
+
+  mutex node_queue_mutex_;
+
+  // Used to signal that some EagerNodes are pending execution.
+  condition_variable nodes_pending_ GUARDED_BY(node_queue_mutex_);
+
+  // Queue of pending EagerNodes.
+  std::queue<EagerNode*> node_queue_ GUARDED_BY(node_queue_mutex_);
+
+  // `status_` is set based on any errors raised during execution of a
+  // EagerNode.  It remains set until ClearError is called.
+  Status status_ GUARDED_BY(node_queue_mutex_);
+
+  // Map from id of a EagerNode to condition_variables (not owned by the map).
+  // These condition_variables are notified and removed when that EagerNode is
+  // done executing, or if an error is found in execution of any EagerNode.
+  std::multimap<uint64, condition_variable*> node_done_notifications_
+      GUARDED_BY(node_queue_mutex_);
+
+  // Thread object that calls the `Run` method. Currently we use only one thread
+  // for executing the EagerNodes one-by-one.
+  std::unique_ptr<Thread> thread_ GUARDED_BY(node_queue_mutex_);
+
+  // Indicates that `thread_` should stop as soon as it is done executing the
+  // current EagerNode.
+  bool thread_done_ GUARDED_BY(node_queue_mutex_) = false;
+
+  mutex next_id_mutex_;
+  uint64 next_id_ GUARDED_BY(next_id_mutex_) = 1;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..381b05ada8594fde1aa917053acd0371167f66ed
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+
+namespace tensorflow {
+tensorflow::Status EagerOperation::SetDevice(const char* device) {
+  auto status = Status::OK();
+  tensorflow::Device* d = nullptr;
+  if (device != nullptr && strlen(device) > 0) {
+    status.Update(ctx_->FindDeviceByName(device, &d));
+  }
+  device_ = d;
+  return status;
+}
+
+void EagerOperation::AddInput(tensorflow::TensorHandle* h) {
+  h->Ref();
+  inputs_.push_back(h);
+  attrs_.NumInputs(static_cast<int>(inputs_.size()));
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b6e53da87acb88ab8662fbc89024480108e97e0
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+
+#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+
+namespace tensorflow {
+class EagerOperation {
+ public:
+  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
+  // instead of a primitive operation.
+  EagerOperation(tensorflow::EagerContext* ctx, const char* op,
+                 const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+
+  ~EagerOperation() {
+    for (tensorflow::TensorHandle* h : inputs_) {
+      h->Unref();
+    }
+  }
+
+  bool is_function() const { return attr_types_ == nullptr; }
+
+  tensorflow::EagerContext* EagerContext() { return ctx_; }
+
+  tensorflow::AttrBuilder* MutableAttrs() { return &attrs_; }
+  const tensorflow::AttrBuilder& Attrs() const { return attrs_; }
+
+  const tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>& Inputs()
+      const {
+    return inputs_;
+  }
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>*
+  MutableInputs() {
+    return &inputs_;
+  }
+  void AddInput(tensorflow::TensorHandle* h);
+
+  const tensorflow::string& Name() const { return name_; }
+  const tensorflow::AttrTypeMap* AttrTypes() const { return attr_types_; }
+
+  tensorflow::Device* Device() const { return device_; }
+  tensorflow::Status SetDevice(const char* device);
+  void SetDevice(tensorflow::Device* device) { device_ = device; }
+
+  void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
+
+ private:
+  tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
+  const tensorflow::string name_;
+  tensorflow::AttrBuilder attrs_;
+  const tensorflow::AttrTypeMap* attr_types_;
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
+  tensorflow::Device* device_;
+  bool use_xla_ = false;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a514f81e146f20c8d2fe828ae15bd650f60c5e40
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -0,0 +1,619 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/execute.h"
+
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/execute_node.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Initializes the step stats if needed.
+void MaybeInitializeStepStats(StepStats* step_stats, EagerContext* ctx) {
+  // Lazily initialize the RunMetadata with information about all devices if
+  // this is the first call.
+  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+    int device_idx = step_stats->dev_stats_size();
+    auto* dev_stats = step_stats->add_dev_stats();
+    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
+  }
+}
+
+int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
+                         Device* device) {
+  // Find the current device's index.
+  if (device == nullptr) {
+    device = ctx->HostCPU();
+  }
+  for (int i = 0; i < ctx->devices()->size(); ++i) {
+    if (ctx->devices()->at(i) == device ||
+        ctx->devices()->at(i)->name() == device->name()) {
+      return i;
+    }
+  }
+  // TODO(apassos) do not fall back to host CPU if device is unknown.
+  return 0;
+}
+
+Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
+                                     EagerOperation* op, const OpKernel* kernel,
+                                     RunMetadata* run_metadata) {
+  Device* host_device = ctx->HostCPU();
+  const MemoryTypeVector& memtypes = kernel->input_memory_types();
+  if (memtypes.size() != op->Inputs().size()) {
+    return errors::InvalidArgument("expected ", memtypes.size(),
+                                   " inputs, got ", op->Inputs().size());
+  }
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    const Device* expected_device =
+        memtypes[i] == HOST_MEMORY ? host_device : op_device;
+    TensorHandle* handle = op->Inputs()[i];
+    Device* handle_device = nullptr;
+    TF_RETURN_IF_ERROR(handle->Device(&handle_device));
+    const Device* actual_device =
+        handle_device == nullptr ? host_device : handle_device;
+    if (expected_device != actual_device) {
+      switch (ctx->GetDevicePlacementPolicy()) {
+        case DEVICE_PLACEMENT_SILENT_FOR_INT32:
+          // TODO(xpan): See if we could bubble python related error up
+          // to python level.
+          if (handle->dtype == DT_INT32) {
+            // Note: enabling silent copies of int32 tensors to match behavior
+            // of graph mode.
+            break;
+          }
+          TF_FALLTHROUGH_INTENDED;
+        case DEVICE_PLACEMENT_EXPLICIT:
+          return errors::InvalidArgument(
+              "Tensors on conflicting devices:"
+              " cannot compute ",
+              op->Name(), " as input #", i, " was expected to be on ",
+              expected_device->name(), " but is actually on ",
+              actual_device->name(), " (operation running on ",
+              op_device->name(), ")",
+              " Tensors can be copied explicitly using .gpu() or .cpu() "
+              "methods,"
+              " or transparently copied by using tf.enable_eager_execution("
+              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+              "between devices"
+              " may slow down your model");
+        case DEVICE_PLACEMENT_WARN:
+          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
+                       << " was expected to be on " << expected_device->name()
+                       << " but is actually on " << actual_device->name()
+                       << " (operation running on " << op_device->name()
+                       << "). This triggers a copy which can be a performance "
+                          "bottleneck.";
+          break;
+        case DEVICE_PLACEMENT_SILENT:  // Do nothing.
+          break;
+      }
+      // We are only here if the policy is warn or silent copies, so we should
+      // trigger a copy.
+      auto pre_time = Env::Default()->NowMicros();
+      TensorHandle* copied_tensor = nullptr;
+      Status status = EagerCopyToDevice(
+          handle, ctx, expected_device->name().c_str(), &copied_tensor);
+      if (run_metadata != nullptr) {
+        auto* step_stats = run_metadata->mutable_step_stats();
+        MaybeInitializeStepStats(step_stats, ctx);
+        // Record the sending on the source device for now.
+        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        auto* node_stats = dev_stats->add_node_stats();
+        node_stats->set_node_name("_Send");
+        node_stats->set_all_start_micros(pre_time);
+        node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                          pre_time);
+      }
+      if (!status.ok()) {
+        if (copied_tensor != nullptr) copied_tensor->Unref();
+        return errors::Internal("Failed copying input tensor from ",
+                                actual_device->name(), " to ",
+                                expected_device->name(), " in order to run ",
+                                op->Name(), ": ", status.error_message());
+      }
+      handle->Unref();
+      handle = copied_tensor;
+      (*op->MutableInputs())[i] = copied_tensor;
+    }
+    if (handle->dtype != kernel->input_type(i)) {
+      return errors::InvalidArgument(
+          "cannot compute ", op->Name(), " as input #", i,
+          " was expected to be a ", DataTypeString(kernel->input_type(i)),
+          " tensor but is a ", DataTypeString(handle->dtype), " tensor");
+    }
+  }
+  return Status::OK();
+}
+
+Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
+  DeviceSet ds;
+  for (Device* d : *ctx->devices()) {
+    ds.AddDevice(d);
+  }
+  DeviceTypeVector final_devices;
+  auto status = SupportedDeviceTypesForNode(ds.PrioritizedDeviceTypeList(),
+                                            ndef, &final_devices);
+  if (!status.ok()) return status;
+  if (final_devices.empty()) {
+    return errors::Internal("Could not find valid device for node ",
+                            ndef.DebugString());
+  }
+  for (Device* d : *ctx->devices()) {
+    if (d->device_type() == final_devices[0].type_string()) {
+      *device = d;
+      return Status::OK();
+    }
+  }
+  return errors::Unknown("Could not find a device for node ",
+                         ndef.DebugString());
+}
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+// Synthesizes and returns a wrapper function over `op`, which must be a
+// primitive op (e.g. matmul).
+//
+// The wrapper function conforms to the function signature expected by
+// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// resources>. For example, if the op has input params <Const1, Arg2, Const3,
+// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
+// Resource4> as the input params to the synthesized function.
+//
+// It populates `const_input_types`, `arg_input_types` and
+// `op_input_to_func_input` based on the reordering results, that the caller can
+// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// `status` accordingly.
+const FunctionDef* OpToFunction(TFE_Op* op,
+                                std::vector<TF_DataType>* const_input_types,
+                                std::vector<TF_DataType>* arg_input_types,
+                                gtl::FlatMap<int, int>* op_input_to_func_input,
+                                TF_Status* status) {
+  DCHECK(!op->operation.is_function());
+
+  FunctionDef fdef;
+
+  // Get the OpDef of the op we are trying to encapsulate.
+  TFE_Context* ctx = op->operation.ctx;
+  const OpRegistrationData* op_data;
+  {
+    status = ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
+    if (!status.ok()) {
+      return nullptr;
+    }
+  }
+  const OpDef& op_def = op_data->op_def;
+
+  OpDef* signature = fdef.mutable_signature();
+
+  // Handle constant inputs.
+  const std::unordered_set<string> const_inputs(
+      *XlaOpRegistry::CompileTimeConstantInputs(op->operation.Name()));
+
+  // First add place holders for the input args, so that we can refer to them by
+  // position in the next loop. Also tally up the resource inputs.
+  int num_resource_inputs = 0;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).type() == DT_RESOURCE) {
+      ++num_resource_inputs;
+    }
+    signature->add_input_arg();
+  }
+
+  // Now we map the input params from `op_def` to `signature`, where the param
+  // ordering for `signature` is: <constants, args, resources>.
+  int const_index = 0;
+  int arg_index = const_inputs.size();
+  int resource_index = op_def.input_arg_size() - num_resource_inputs;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    const OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
+    OpDef::ArgDef* func_input_arg = nullptr;
+    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
+      VLOG(1) << "For const input, mapping op input " << i << " to func input "
+              << const_index;
+      (*op_input_to_func_input)[i] = const_index;
+      func_input_arg = signature->mutable_input_arg(const_index++);
+      const_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    } else if (op_input_arg.type() == DT_RESOURCE) {
+      VLOG(1) << "For resource input, mapping op input " << i
+              << " to func input " << resource_index;
+      (*op_input_to_func_input)[i] = resource_index;
+      func_input_arg = signature->mutable_input_arg(resource_index++);
+    } else {
+      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
+              << arg_index;
+      (*op_input_to_func_input)[i] = arg_index;
+      func_input_arg = signature->mutable_input_arg(arg_index++);
+      arg_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    }
+
+    func_input_arg->set_name(op_input_arg.name());
+    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
+  }
+  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
+
+  // Resources args are at the end of the function input params, and we should
+  // have iterated over all of them.
+  DCHECK_EQ(signature->input_arg_size(), resource_index);
+
+  // Make the synthesized function's name unique.
+  signature->set_name(
+      strings::StrCat(op_def.name(), func_id_generator.fetch_add(1)));
+
+  // Add the node def and set its input names to match op_def's names.
+  const NodeDef& ndef = op->operation.MutableAttrs()->BuildNodeDef();
+  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
+  *fdef.add_node_def() = ndef;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
+  }
+  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
+
+  // Fix the output names and set output types.
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    OpDef::ArgDef* arg = signature->add_output_arg();
+    const OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
+    const string& out_tensor_name =
+        strings::StrCat(ndef.name(), ":", op_def_arg.name(), ":", 0);
+    arg->set_name(op_def_arg.name());
+    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
+    const string& type_attr = op_def_arg.type_attr();
+    if (!type_attr.empty()) {
+      auto i = ndef.attr().find(type_attr);
+      if (i == ndef.attr().end()) {
+        status = errors::InvalidArgument(
+            strings::StrCat("Could not find attr ", type_attr, " in NodeDef ",
+                            ndef.DebugString()));
+        return nullptr;
+      }
+      arg->set_type(i->second.type());
+    }
+  }
+  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
+
+  status = ctx->context.AddFunctionDef(fdef);
+  if (!status.ok()) return nullptr;
+  const auto ret = ctx->context.FindFunctionDef(signature->name());
+  DCHECK(ret != nullptr);
+  return ret;
+}
+
+// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// via XLA.
+std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  auto launch_op = std::unique_ptr<TFE_Op>(
+      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  if (op->operation.device) {
+    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
+                    status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+
+  const FunctionDef* fdef;
+  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
+  std::vector<TF_DataType> const_input_types;
+  std::vector<TF_DataType> arg_input_types;
+  gtl::FlatMap<int, int> op_input_to_func_input;
+  if (fdef == nullptr) {
+    // See if this is a primitive op, and if so create a function for it, so
+    // that _XlaLaunchOp can access it.
+    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
+                        &op_input_to_func_input, status);
+    if (!status.ok()) return nullptr;
+  } else {
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
+    // functions, so we need to find another way to handle constant inputs.
+    for (int i = const_input_types.size();
+         i < fdef->signature().input_arg_size(); ++i) {
+      VLOG(1) << "Adding Targs from input arg " << i;
+      const OpDef::ArgDef& arg = fdef->signature().input_arg(i);
+      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
+    }
+  }
+  DCHECK(fdef != nullptr);
+
+  // Copy inputs and their devices.
+  // Since input param reordering may have occurred between `op` and `launch_op`
+  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  *launch_op->operation.MutableInputs() = op->operation.Inputs();
+  for (TensorHandle* h : launch_op->operation.Inputs()) {
+    h->Ref();
+  }
+  if (!op_input_to_func_input.empty()) {
+    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
+    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
+      VLOG(1) << "mapping op input " << i << " to func input "
+              << op_input_to_func_input[i];
+
+      (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] =
+          op->operation.Inputs()[i];
+    }
+  }
+  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
+
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
+                        const_input_types.size());
+
+  // Set Targs and Nresources attrs.
+  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
+                        arg_input_types.size());
+  const int num_resource_inputs = fdef->signature().input_arg_size() -
+                                  const_input_types.size() -
+                                  arg_input_types.size();
+  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
+
+  // Set Tresults attr.
+  std::vector<TF_DataType> tresults;
+  for (const OpDef::ArgDef& arg : fdef->signature().output_arg()) {
+    tresults.push_back(static_cast<TF_DataType>(arg.type()));
+  }
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
+                        tresults.size());
+
+  // Set function attr.
+  AttrValue attr_value;
+  NameAttrList* func = attr_value.mutable_func();
+  func->set_name(fdef->signature().name());
+  launch_op->attrs.Set("function", attr_value);
+
+  return launch_op;
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  EagerContext* ctx = op->EagerContext();
+  auto status = ctx->GetStatus();
+  if (!status.ok()) return status;
+#ifdef TENSORFLOW_EAGER_USE_XLA
+  std::unique_ptr<TFE_Op> xla_launch_op;
+  if (op->UseXla() && op->Name() != "_XlaLaunch") {
+    xla_launch_op = BuildXlaLaunch(op, status);
+    if (!status.ok()) return status;
+    op = xla_launch_op.get();
+  }
+#endif  // TENSORFLOW_EAGER_USE_XLA
+  // Ensure all resource-touching ops run in the device the resource is,
+  // regardless of anything else that has been specified. This is identical to
+  // the graph mode behavior.
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    Device* input_op_device = nullptr;
+    status = op->Inputs()[i]->OpDevice(&input_op_device);
+    if (!status.ok()) return status;
+    VLOG(2) << "for op " << op->Name() << " input " << i << " "
+            << DataTypeString(op->Inputs()[i]->dtype) << " "
+            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
+    if (op->Inputs()[i]->dtype == DT_RESOURCE &&
+        (input_op_device != op->Device() || input_op_device == nullptr)) {
+      Device* d = input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
+      VLOG(1) << "Changing device of operation " << op->Name() << " to "
+              << d->name() << " because input #" << i
+              << " is a resource in this device.";
+      op->SetDevice(d);
+    }
+  }
+  Device* device = op->Device();
+
+  Fprint128 cache_key = op->MutableAttrs()->CacheKey(
+      device == nullptr ? "unspecified" : device->name());
+  KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
+  if (kernel == nullptr) {
+    const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
+    if (device == nullptr) {
+      status = SelectDevice(ndef, ctx, &device);
+      if (!status.ok()) return status;
+    }
+    CHECK(device != nullptr);
+    if (ctx->LogDevicePlacement()) {
+      LOG(INFO) << "Executing op " << ndef.op() << " in device "
+                << device->name();
+    }
+    kernel = new KernelAndDevice(ctx->GetRendezvous());
+    // Knowledge of the implementation of Init (and in-turn
+    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
+    // will be accessed, so grab on to the lock.
+    // See WARNING comment in Execute (before kernel->Run) - would be nice to
+    // rework to avoid this subtlety.
+    tf_shared_lock l(*ctx->FunctionsMu());
+    status = KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
+    if (!status.ok()) {
+      delete kernel;
+      return status;
+    }
+    // Update output_dtypes inside `kernel`.
+    const OpDef* op_def = nullptr;
+    const FunctionDef* function_def = ctx->FuncLibDef()->Find(ndef.op());
+    if (function_def != nullptr) {
+      op_def = &(function_def->signature());
+    }
+    if (op_def == nullptr) {
+      status = OpDefForOp(ndef.op().c_str(), &op_def);
+      if (!status.ok()) return status;
+    }
+    DataTypeVector input_dtypes;
+    status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
+                               kernel->mutable_output_dtypes());
+    if (!status.ok()) return status;
+    ctx->AddKernelToCache(cache_key, kernel);
+  }
+  const DataTypeVector& output_dtypes = kernel->output_dtypes();
+  const int output_dtypes_size = static_cast<int>(output_dtypes.size());
+  if (output_dtypes_size > *num_retvals) {
+    return errors::InvalidArgument("Expecting ", output_dtypes.size(),
+                                   " outputs, but *num_retvals is ",
+                                   *num_retvals);
+  }
+  *num_retvals = output_dtypes_size;
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+  status = ValidateInputTypeAndPlacement(
+      ctx, device, op, kernel->kernel(),
+      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+  if (!status.ok()) return status;
+  std::unique_ptr<NodeExecStats> maybe_stats;
+  if (ctx->ShouldStoreMetadata()) {
+    maybe_stats.reset(new NodeExecStats);
+    maybe_stats->set_node_name(op->Name());
+    maybe_stats->set_all_start_micros(Env::Default()->NowMicros());
+    maybe_stats->set_op_start_rel_micros(0);
+    maybe_stats->set_scheduled_micros(Env::Default()->NowMicros());
+    // TODO(apassos) track referenced tensors
+  }
+  retvals->resize(*num_retvals);
+  if (ctx->Async()) {
+    // Note that for async mode, execution order will make sure that all
+    // input handles are ready before executing them.
+    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
+    tensorflow::uint64 id = ctx->NextId();
+    for (int i = 0; i < *num_retvals; ++i) {
+      (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx);
+    }
+    EagerNode* node =
+        new ExecuteNode(id, ctx, op->Device(), op->Inputs(), kernel,
+                        maybe_stats.release(), output_dtypes, *retvals);
+    ctx->ExecutorAdd(node);
+  } else {
+    // Execute checks if retvals[i] is nullptr or not to figure if it needs to
+    // allocate it.
+    status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
+                          maybe_stats.get(), retvals->data(), *num_retvals);
+  }
+
+  return status;
+}
+
+Status EagerExecute(EagerContext* ctx, Device* device,
+                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                    TensorHandle** retvals, int num_retvals) {
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+
+  std::vector<Tensor> outputs(1);
+  const MemoryTypeVector* output_memory_types = nullptr;
+  output_memory_types = &kernel->kernel()->output_memory_types();
+  std::vector<Tensor> inputs(op_inputs.size());
+  for (int i = 0; i < op_inputs.size(); ++i) {
+    const Tensor* input_tensor = nullptr;
+    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
+    inputs[i] = *input_tensor;
+  }
+  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
+  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def.
+  // But knowledge of the implementation
+  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
+  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
+  // This is quite subtle. Re-work things to make this better?  (Would it make
+  // sense for FunctionLibraryRuntime to ensure thread-safe access to
+  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
+  // for ops which are a part of functions.
+  // TODO(agarwal): change Run to take vector of handles ?
+  TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
+  if (maybe_stats != nullptr) {
+    maybe_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                       maybe_stats->all_start_micros());
+    mutex_lock ml(*ctx->MetadataMu());
+    if (ctx->ShouldStoreMetadata()) {
+      auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
+      // Lazily initialize the RunMetadata with information about all devices if
+      // this is the first call.
+      while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+        step_stats->add_dev_stats();
+      }
+      // Find the current device's index.
+      int device_idx = 0;
+      for (int i = 0; i < ctx->devices()->size(); ++i) {
+        if (ctx->devices()->at(i) == device) {
+          device_idx = i;
+          break;
+        }
+      }
+      // Populate the device stats for this device.
+      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+      dev_stats->set_device(device->name());
+      *dev_stats->add_node_stats() = *maybe_stats;
+    }
+  }
+  DCHECK_EQ(num_retvals, outputs.size());
+  Device* op_device = device;
+  for (int i = 0; i < num_retvals; ++i) {
+    Device* d = op_device;
+    if (d != nullptr && output_memory_types != nullptr &&
+        (*output_memory_types)[i] == HOST_MEMORY) {
+      d = nullptr;
+    }
+    if (retvals[i] == nullptr) {
+      retvals[i] = new TensorHandle(outputs[i], d, op_device, ctx);
+    } else {
+      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
+    }
+  }
+  return Status::OK();
+}
+
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result) {
+  TF_RETURN_IF_ERROR(ctx->GetStatus());
+  Device* dstd = ctx->HostCPU();
+  if (device_name != nullptr && strlen(device_name) > 0) {
+    TF_RETURN_IF_ERROR(ctx->device_mgr()->LookupDevice(device_name, &dstd));
+  }
+  if (ctx->Async()) {
+    // Note that `h` may not be currently ready. However execution order will
+    // make sure that `h` is ready before the copy is actually done.
+    CopyToDeviceNode* node = new CopyToDeviceNode(h, dstd, ctx);
+    TensorHandle* output = node->dst();
+    // Note that calling Add makes `node` accessible by the EagerExecutor
+    // thread. So further accesses need to be thread-safe.
+    ctx->ExecutorAdd(node);
+    *result = output;
+    return Status::OK();
+  } else {
+    TF_RETURN_IF_ERROR(h->CopyToDevice(ctx, dstd, result));
+    return Status::OK();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c8d7e164d0b8061c9f8380fb238f17bcd6d74d3
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace tensorflow {
+
+// Utility function that executes a fully constructed EagerOperation.
+Status EagerExecute(
+    EagerOperation* op,
+    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
+    int* num_retvals);
+
+// Low-level utility to execute the kernel specified by kernel on device device,
+// with the inputs op_inputs, in the context ctx.
+Status EagerExecute(EagerContext* ctx, Device* device,
+                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                    TensorHandle** retvals, int num_retvals);
+
+// Low-level utility to copy a tensor handle from one device to another.
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..93018dd96914c0d091c7242a9c053fabce434e78
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace tensorflow {
+
+class ExecuteNode : public EagerNode {
+ public:
+  ExecuteNode(uint64 id, EagerContext* ctx, Device* op_device,
+              const tensorflow::gtl::InlinedVector<TensorHandle*, 4>& inputs,
+              KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+              const DataTypeVector& output_dtypes,
+              const tensorflow::gtl::InlinedVector<TensorHandle*, 2>& retvals)
+      : EagerNode(id),
+        ctx_(ctx),
+        op_device_(op_device),
+        inputs_(inputs),
+        kernel_(kernel),
+        maybe_stats_(maybe_stats),
+        retvals_(retvals) {
+    for (auto handle : inputs_) {
+      handle->Ref();
+    }
+    for (auto handle : retvals_) {
+      handle->Ref();
+    }
+  }
+
+  ~ExecuteNode() override {
+    for (auto handle : inputs_) {
+      handle->Unref();
+    }
+    for (auto handle : retvals_) {
+      handle->Unref();
+    }
+  }
+
+  tensorflow::Status Run() override {
+    const Status status =
+        EagerExecute(ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
+                     retvals_.begin(), retvals_.size());
+    if (status.ok()) {
+      return status;
+    } else {
+      return Status(status.code(),
+                    strings::StrCat("Got error, \"", status.error_message(),
+                                    "\" while executing kernel ",
+                                    kernel_->kernel()->def().DebugString()));
+    }
+  }
+
+ private:
+  tensorflow::EagerContext* ctx_;
+  tensorflow::Device* op_device_;
+  tensorflow::gtl::InlinedVector<TensorHandle*, 4> inputs_;
+  tensorflow::KernelAndDevice* kernel_;
+  std::unique_ptr<NodeExecStats> maybe_stats_;
+  tensorflow::gtl::InlinedVector<TensorHandle*, 2> retvals_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a4895a938a72a41cf3ad494ecca2ef9fb3e9648
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -0,0 +1,132 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+
+namespace tensorflow {
+
+// static
+Status KernelAndDevice::InitOp(Device* device, const NodeDef& ndef,
+                               KernelAndDevice* out) {
+  OpKernel* k = nullptr;
+  Status s = CreateOpKernel(device->device_type().c_str(), device,
+                            device->GetAllocator(AllocatorAttributes()),
+                            nullptr, ndef, TF_GRAPH_DEF_VERSION, &k);
+  out->device_ = device;
+  out->kernel_.reset(k);
+  out->flib_ = nullptr;
+  return s;
+}
+
+// static
+Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
+                             KernelAndDevice* out) {
+  OpKernel* k = nullptr;
+  Status s = flib->CreateKernel(ndef, &k);
+  out->device_ = flib->device();
+  out->kernel_.reset(k);
+  out->flib_ = flib;
+  return s;
+}
+
+Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
+                            std::vector<Tensor>* output_tensors,
+                            NodeExecStats* stats) {
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (Tensor& t : *input_tensors) {
+    inputs.push_back(TensorValue(&t));
+  }
+
+  std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
+  for (size_t i = 0; i < out_attrs.size(); ++i) {
+    out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
+                             tensorflow::HOST_MEMORY);
+  }
+
+  OpKernelContext::Params params;
+  params.device = device_;
+  params.frame_iter = FrameAndIter(0, 0);
+  params.inputs = &inputs;
+  params.op_kernel = kernel_.get();
+  params.resource_manager = device_->resource_manager();
+  params.output_attr_array = gtl::vector_as_array(&out_attrs);
+  params.function_library = flib_;
+  params.slice_reader_cache = &slice_reader_cache_;
+  params.rendezvous = rendez_;
+  if (stats != nullptr) {
+    params.track_allocations = true;
+  }
+  // TODO(apassos): use a thread pool.
+  std::function<void(std::function<void()>)> runner =
+      [](std::function<void()> f) { f(); };
+  params.runner = &runner;
+
+  OpKernelContext context(&params);
+
+  if (kernel_->def().op() == "_Recv") {
+    // TODO(apassos) do not special-case _Recv. Currently the GPU device fails
+    // if trying to run _Recv->Compute(), specifically checking for _Recv. To go
+    // around this we call _Recv->ComputeAsync, to mimic graph mode behavior.
+    AsyncOpKernel* async = kernel_->AsAsync();
+    Notification done;
+    device_->ComputeAsync(async, &context, [&done]() { done.Notify(); });
+    done.WaitForNotification();
+  } else {
+    device_->Compute(kernel_.get(), &context);
+  }
+  if (!context.status().ok()) return context.status();
+
+  output_tensors->clear();
+  for (int i = 0; i < context.num_outputs(); ++i) {
+    output_tensors->push_back(Tensor(*context.mutable_output(i)));
+  }
+  if (stats != nullptr) {
+    for (const auto& allocator_pair : context.wrapped_allocators()) {
+      AllocatorMemoryUsed* memory = stats->add_memory();
+      memory->set_allocator_name(allocator_pair.first->Name());
+      auto sizes = allocator_pair.second->GetSizes();
+      memory->set_total_bytes(std::get<0>(sizes));
+      memory->set_peak_bytes(std::get<1>(sizes));
+      memory->set_live_bytes(std::get<2>(sizes));
+
+      AllocatorStats allocator_stats;
+      allocator_pair.first->GetStats(&allocator_stats);
+      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
+      allocator_pair.second->GetRecordsAndUnRef();
+    }
+    auto* ms = stats->mutable_memory_stats();
+    ms->set_temp_memory_size(context.temp_memory_allocated());
+    for (const auto& alloc_id : context.persistent_alloc_ids()) {
+      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
+    }
+
+    ms->set_persistent_memory_size(context.persistent_memory_allocated());
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..46ec550c780aaa3cd5cad5f02a4dfe9a75572277
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_
+
+// Support for eager execution of TensorFlow kernels.
+
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+
+namespace tensorflow {
+
+// KernelAndDevice encapsulates an instantiated kernel and the device it is on.
+//
+// Also see:
+// https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+// and
+// https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h
+class KernelAndDevice {
+ public:
+  // Populates 'out' with a kernel appropriate for 'ndef'.
+  //
+  // The provided FunctionLibraryRuntime MUST outlive all calls to
+  // Run() on the returned KernelAndDevice.
+  //
+  // TODO(ashankar): Figure out thread-safety concerns around
+  // FunctionLibraryRuntime (in particular, how the underlying
+  // FunctionLibraryDefinition might be mutated by another thread as new
+  // functions are registered with it).  Conservatively, thread-safe usage of
+  // the FunctionLibraryRuntime is pushed on to the caller (see locking in
+  // c_api.cc).
+  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
+                     KernelAndDevice* out);
+  // TODO(ashankar): Remove this
+  static Status InitOp(Device* device, const NodeDef& ndef,
+                       KernelAndDevice* out);
+
+  KernelAndDevice(tensorflow::Rendezvous* rendez)
+      : device_(nullptr), flib_(nullptr), rendez_(rendez) {}
+
+  // TODO(ashankar): Handle list-valued inputs.
+  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
+             NodeExecStats* stats);
+
+  const OpKernel* kernel() const { return kernel_.get(); }
+
+  Device* device() const { return device_; }
+
+  DataTypeVector* mutable_output_dtypes() { return &output_dtypes_; }
+  const DataTypeVector& output_dtypes() { return output_dtypes_; }
+
+ private:
+  std::unique_ptr<OpKernel> kernel_;
+  Device* device_;
+  FunctionLibraryRuntime* flib_;
+  checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
+  Rendezvous* rendez_;
+  DataTypeVector output_dtypes_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd055c3c3eb6df3eb440a78b7a8d3e72ff9335bd
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+class TestEnv {
+ public:
+  TestEnv() : flib_def_(OpRegistry::Global(), {}) {
+    Device* device =
+        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
+    device_mgr_.reset(new DeviceMgr({device}));
+    flib_runtime_ = NewFunctionLibraryRuntime(device_mgr_.get(), Env::Default(),
+                                              device, TF_GRAPH_DEF_VERSION,
+                                              &flib_def_, nullptr, {}, nullptr);
+  }
+
+  FunctionLibraryRuntime* function_library_runtime() const {
+    return flib_runtime_.get();
+  }
+
+ private:
+  FunctionLibraryDefinition flib_def_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+};
+
+void BM_CreateGraph(int iters) {
+  for (int i = 0; i < iters; ++i) {
+    Scope root = Scope::NewRootScope();
+    auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
+    auto M = ops::MatMul(root, C, C);
+    TF_CHECK_OK(root.status());
+  }
+}
+BENCHMARK(BM_CreateGraph);
+
+void BM_RunGraph(int iters) {
+  tensorflow::testing::StopTiming();
+  Scope root = Scope::NewRootScope();
+  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
+  auto M = ops::MatMul(root, C, C);
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(1);
+  opts.config.set_intra_op_parallelism_threads(1);
+  ClientSession sess(root, opts);
+  std::vector<Tensor> outputs;
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    outputs.clear();
+    TF_CHECK_OK(sess.Run({M}, &outputs));
+  }
+}
+BENCHMARK(BM_RunGraph);
+
+void BM_CreateAndDestroySession(int iters) {
+  tensorflow::testing::StopTiming();
+  Scope root = Scope::NewRootScope();
+  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
+  auto M = ops::MatMul(root, C, C);
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    ClientSession sess(root);
+  }
+}
+BENCHMARK(BM_CreateAndDestroySession);
+
+void BM_KernelAndDeviceInit(int iters) {
+  tensorflow::testing::StopTiming();
+  NodeDef ndef(AttrBuilder("MatMul")
+                   .Set("T", DT_FLOAT)
+                   .Set("transpose_a", false)
+                   .Set("transpose_b", false)
+                   .NumInputs(2)
+                   .BuildNodeDef());
+  TestEnv env;
+  KernelAndDevice k(nullptr);
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    TF_CHECK_OK(
+        KernelAndDevice::Init(ndef, env.function_library_runtime(), &k));
+  }
+}
+BENCHMARK(BM_KernelAndDeviceInit);
+
+void BM_KernelAndDeviceRun(int iters) {
+  tensorflow::testing::StopTiming();
+  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
+  std::vector<Tensor> inputs;
+  inputs.push_back(t);
+  inputs.push_back(t);
+  std::vector<Tensor> outputs;
+  NodeDef ndef(AttrBuilder("MatMul")
+                   .Set("T", DT_FLOAT)
+                   .Set("transpose_a", false)
+                   .Set("transpose_b", false)
+                   .NumInputs(inputs.size())
+                   .BuildNodeDef());
+  TestEnv env;
+  KernelAndDevice kernel(nullptr);
+  TF_CHECK_OK(
+      KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel));
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr));
+  }
+}
+BENCHMARK(BM_KernelAndDeviceRun);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e11f7b7104ce22cc585e2b03fcfd914e0eb80aa
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -0,0 +1,179 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+bool TensorHandle::IsReady() {
+  if (node_id == 0) return true;
+  mutex_lock l(ctx_mutex_);
+  return is_ready_;
+}
+
+Status TensorHandle::WaitReady() {
+  if (node_id == 0) return Status::OK();
+  EagerExecutor* executor = nullptr;
+  {
+    mutex_lock l(ctx_mutex_);
+    if (is_ready_) return Status::OK();
+    executor = ctx_->Executor();
+  }
+  return executor->WaitFor(node_id);
+}
+
+Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *t = &tensor_;
+  return Status::OK();
+}
+
+Status TensorHandle::Device(tensorflow::Device** d) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *d = device_;
+  return Status::OK();
+}
+
+Status TensorHandle::OpDevice(tensorflow::Device** d) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *d = op_device_;
+  return Status::OK();
+}
+
+Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
+                                     tensorflow::Device** device,
+                                     tensorflow::Device** op_device) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *tensor = &tensor_;
+  *device = device_;
+  *op_device = op_device_;
+  return Status::OK();
+}
+
+void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
+                                      tensorflow::Device* device,
+                                      tensorflow::Device* op_device) {
+  mutex_lock l(ctx_mutex_);
+  DCHECK(node_id > 0 && !is_ready_)
+      << "SetTensorAndDevice should be only called  "
+      << "on non-ready handles.";
+  is_ready_ = true;
+  tensor_ = tensor;
+  device_ = device;
+  op_device_ = op_device;
+}
+
+Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
+                                  TensorHandle** output) {
+  const tensorflow::Tensor* src = nullptr;
+  tensorflow::Device* srcd = nullptr;
+  // TODO(agarwal): src_opd is unused. Perhaps allow TensorAndDevice to accept
+  // nullptr.
+  tensorflow::Device* src_opd = nullptr;
+  TF_RETURN_IF_ERROR(TensorAndDevice(&src, &srcd, &src_opd));
+  if (srcd == nullptr) srcd = ctx->HostCPU();
+  bool is_same_device = (srcd == dstd) || (srcd->name() == dstd->name());
+  const bool dst_cpu = dstd->tensorflow_gpu_device_info() == nullptr;
+  const bool src_cpu = srcd->tensorflow_gpu_device_info() == nullptr;
+  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
+  // has device type XLA_CPU, and the other CPU.
+  const bool both_on_cpu = src_cpu && dst_cpu;
+  if (is_same_device || both_on_cpu) {
+    dstd = dst_cpu ? nullptr : dstd;
+    *output = new tensorflow::TensorHandle(*src, dstd, dstd, ctx);
+    return tensorflow::Status::OK();
+  }
+  if (!dst_cpu && (src->dtype() != tensorflow::DT_VARIANT &&
+                   !tensorflow::DataTypeCanUseMemcpy(src->dtype()))) {
+    return tensorflow::errors::InvalidArgument(
+        "Can't copy Tensor with type ",
+        tensorflow::DataTypeString(src->dtype()), " to device ", dstd->name(),
+        ".");
+  }
+  tensorflow::AllocatorAttributes attr;
+  if (src->dtype() == tensorflow::DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  tensorflow::Tensor dst(dstd->GetAllocator(attr), src->dtype(), src->shape());
+  if (src->shape().num_elements() == 0) {
+    dstd = dst_cpu ? nullptr : dstd;
+    *output = new tensorflow::TensorHandle(dst, dstd, dstd, ctx);
+    return tensorflow::Status::OK();
+  }
+  tensorflow::DeviceContext* src_device_context = nullptr;
+  if (!src_cpu) {
+    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
+  }
+  tensorflow::DeviceContext* dst_device_context = nullptr;
+  if (!dst_cpu) {
+    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
+  }
+  // TODO(ashankar): The Sync() call below may be more aggressive than
+  // necessary. It is based on knowledge of implementation details - that
+  // GPU devices are implemented using 3 streams - one for host->device copies,
+  // one for device->host copies and one for sending operations to the GPU.
+  // With that setup, Sync()ing across all 3 streams should be sufficient
+  // but more than necessary (since it waits for operations that might have
+  // nothing to do with this tensor to complete).
+  TF_RETURN_IF_ERROR(srcd->Sync());
+  tensorflow::Notification n;
+  tensorflow::Status status;
+  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
+                                 srcd, dstd, tensorflow::AllocatorAttributes(),
+                                 tensorflow::AllocatorAttributes(), src, &dst,
+                                 [&status, &n](const tensorflow::Status& s) {
+                                   status = s;
+                                   n.Notify();
+                                 });
+  n.WaitForNotification();
+  if (status.ok()) {
+    dstd = dst_cpu ? nullptr : dstd;
+    *output = new tensorflow::TensorHandle(dst, dstd, dstd, ctx);
+  }
+  return status;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..d66c4d95e2a5513680f81e3f7c1875266b2dfb02
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Associates a Tensor and a Device, used in the eager runtime. Internal version
+// executor_of the TFE_TensorHandle struct and the python EagerTensor class
+// (unrelated to python TensorHandle).
+class TensorHandle : public core::RefCounted {
+ public:
+  TensorHandle(const Tensor& t, Device* d, Device* op_device, EagerContext* ctx)
+      : dtype(t.dtype()),
+        node_id(0),
+        tensor_(t),
+        device_(d),
+        op_device_(op_device),
+        ctx_(ctx),
+        is_ready_(true) {}
+
+  TensorHandle(uint64 node_id, DataType dtype, EagerContext* ctx)
+      : dtype(dtype),
+        node_id(node_id),
+        tensor_(dtype),
+        device_(nullptr),
+        op_device_(nullptr),
+        ctx_(ctx),
+        is_ready_(ctx == nullptr) {
+    DCHECK_GT(node_id, 0);
+  }
+
+  ~TensorHandle() override {}
+
+  Status Tensor(const tensorflow::Tensor** t);
+
+  Status Device(tensorflow::Device** d);
+
+  Status OpDevice(tensorflow::Device** d);
+
+  Status TensorAndDevice(const tensorflow::Tensor** tensor,
+                         tensorflow::Device** device,
+                         tensorflow::Device** op_device);
+
+  // Note that this can be called at most once, and only on non-ready handles,
+  // and makes them ready.
+  void SetTensorAndDevice(const tensorflow::Tensor& tensor,
+                          tensorflow::Device* device,
+                          tensorflow::Device* op_device);
+
+  Status CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
+                      TensorHandle** output);
+
+  // Warning: can return nullptr for CPU tensors.
+  EagerContext* Context() {
+    mutex_lock ml(ctx_mutex_);
+    return ctx_;
+  }
+
+  // dtype for the handle. It must be the same as t.dtype() once the handle is
+  // ready.
+  const DataType dtype;
+
+ private:
+  // If the contents of the Tensor pointed to by this handle is yet to be
+  // computed by a EagerNode, this function will block till that compuatation is
+  // done and the handle is "ready".
+  Status WaitReady();
+
+  bool IsReady();
+
+  // Id for the EagerNode that will compute the value pointed to by this handle.
+  // If the value is 0, the handle is already ready, but not vice-versa.
+  const uint64 node_id;
+
+  tensorflow::Tensor tensor_;
+
+  // TODO(ashankar): device_ == nullptr iff local CPU
+  // This was expedient, but perhaps worth revisiting ('device_' should always
+  // be a valid pointer?)
+  // This can be done if TFE_NewOp() and the TFE_TensorHandle constructors are
+  // provided with the appropriate TFE_Context.
+  //
+  // TODO(ashankar): Reference count TFE_Context to ensure that 'device_' of a
+  // TFE_TensorHandle does not outlive the TFE_Context from which it came?
+  tensorflow::Device* device_;
+
+  // Device in which the op producing this tensor was executed. Equals to
+  // device_ for constant tensors.
+  tensorflow::Device* op_device_;
+
+  mutex ctx_mutex_;
+
+  // `ctx` is only guaranteed to be set if the handle is not "ready". This is
+  // typically true when the handle was produced during async execution.
+  // `ctx` object is not owned and should outlive this handle.
+  EagerContext* ctx_ GUARDED_BY(ctx_mutex_);
+  bool is_ready_ GUARDED_BY(ctx_mutex_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
diff --git a/tensorflow/core/common_runtime/eval_const_tensor.cc b/tensorflow/core/common_runtime/eval_const_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1542f1f57fd06a9b8fa0d6e36a5592772404c0e
--- /dev/null
+++ b/tensorflow/core/common_runtime/eval_const_tensor.cc
@@ -0,0 +1,361 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eval_const_tensor.h"
+
+#include <deque>
+
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+namespace {
+
+// Tries to infer tensor output based on the input shapes of the node. In some
+// cases, the shapes of the inputs are sufficient for inferring the contents of
+// the output tensor. For example, a Shape op with fully defined input shapes
+// can have its output tensor inferred.
+Status TryToInferTensorOutputFromInputShapes(const Edge& edge,
+                                             const ShapeRefiner& refiner,
+                                             Tensor* output, bool* success) {
+  *success = false;
+  const Node* node = edge.src();
+  InferenceContext* c = refiner.GetContext(node);
+  if (c == nullptr) {
+    return errors::FailedPrecondition("Node does not have context.");
+  }
+
+  if (node->type_string() == "Shape") {
+    // If input shapes to the shape op are fully defined,
+    // we can infer the shape op's output tensor.
+    bool fully_defined_inputs = c->FullyDefined(c->input(0));
+    if (fully_defined_inputs) {
+      int input_rank = c->Rank(c->input(0));
+      Tensor t(node->output_type(0), TensorShape({input_rank}));
+      if (node->output_type(0) == DT_INT32) {
+        auto flat = t.flat<int>();
+        for (int i = 0; i < input_rank; i++) {
+          int64 dimension = c->Value(c->Dim(c->input(0), i));
+          if (!FastBoundsCheck(dimension, std::numeric_limits<int32>::max())) {
+            return errors::InvalidArgument(
+                "Shape has output type int32, but dimension exceeds maximum "
+                "int32 value");
+          }
+          flat(i) = static_cast<int32>(dimension);
+        }
+      } else if (node->output_type(0) == DT_INT64) {
+        auto flat = t.flat<int64>();
+        for (int i = 0; i < input_rank; i++) {
+          flat(i) = c->Value(c->Dim(c->input(0), i));
+        }
+      } else {
+        return errors::FailedPrecondition(
+            "Shape has output type that is not int32 or int64");
+      }
+      *output = t;
+      *success = true;
+    }
+  } else if (node->type_string() == "Rank") {
+    bool rank_known = c->RankKnown(c->input(0));
+    if (rank_known) {
+      int32 input_rank = c->Rank(c->input(0));
+      Tensor t(node->output_type(0), TensorShape({}));
+      t.flat<int32>()(0) = input_rank;
+      *output = t;
+      *success = true;
+    }
+  } else if (node->type_string() == "Size") {
+    bool fully_defined_inputs = c->FullyDefined(c->input(0));
+    if (fully_defined_inputs) {
+      int32 rank = c->Rank(c->input(0));
+      Tensor t(node->output_type(0), TensorShape({}));
+      int64 size = 1;
+      for (int i = 0; i < rank; i++) {
+        size *= c->Value(c->Dim(c->input(0), i));
+      }
+      if (node->output_type(0) == DT_INT32) {
+        if (!FastBoundsCheck(size, std::numeric_limits<int32>::max())) {
+          return errors::InvalidArgument(
+              "Size has output type int32, but size exceeds maximum int32 "
+              "value");
+        }
+        t.flat<int32>()(0) = static_cast<int32>(size);
+      } else if (node->output_type(0) == DT_INT64) {
+        t.flat<int64>()(0) = size;
+      } else {
+        return errors::FailedPrecondition(
+            "Size has output type that is not int32 or int64");
+      }
+      *output = t;
+      *success = true;
+    }
+  }
+  return Status::OK();
+}
+
+// Extracts the subgraph ending at 'target_node' that is statically computable
+// and inserts into 'out_graph'. If statically computable, 'is_constant_graph'
+// will be set to true.
+Status ExtractConstantSubgraph(
+    const Node& target_node, const ShapeRefiner& refiner,
+    const std::unordered_map<string, Tensor>* cached_values, Graph* out_graph,
+    bool* is_constant_graph,
+    std::vector<std::pair<string, Tensor>>* const_inputs) {
+  *is_constant_graph = false;
+  std::unordered_set<string> const_inputs_added;
+
+  if (target_node.op_def().is_stateful()) {
+    return Status::OK();
+  }
+
+  if (IsMerge(&target_node)) {
+    return Status::OK();
+  }
+
+  if (target_node.type_string() == "PlaceholderWithDefault") {
+    return Status::OK();
+  }
+
+  // TODO(skyewm): should more of the filtering applied in input nodes below be
+  // applied to target_node here?
+
+  // Identify the possibly constant subgraph by recursively iterating backwards
+  // through the inputs to 'target_node' until we either 1) find an already
+  // existing input to our subgraph 'const_inputs', 2) Discover our graph is not
+  // constant, or 3) Hit a root node.
+
+  struct NodeAndRecursed {
+    Node* new_node = nullptr;
+    bool recursed = false;
+  };
+
+  std::map<const Node*, NodeAndRecursed> old_to_new_and_recursed;
+  Node* target_node_copy = out_graph->CopyNode(&target_node);
+  old_to_new_and_recursed[&target_node].new_node = target_node_copy;
+  old_to_new_and_recursed[&target_node].recursed = true;
+
+  // Add the target node's inputs to seed the recursion.
+  std::deque<const Edge*> edges_to_visit;
+  for (const Edge* e : target_node.in_edges()) {
+    // TODO(skyewm): control edges will be meaningful if/when we handle control
+    // flow (e.g. constants in cond branches are triggered via control edges).
+    if (e->IsControlEdge()) continue;
+    edges_to_visit.push_back(e);
+  }
+
+  *is_constant_graph = true;
+
+  // Iterate over the set of edges to visit (backwards).
+  while (!edges_to_visit.empty()) {
+    const Edge* current_edge = edges_to_visit.front();
+    edges_to_visit.pop_front();
+    Node* current_node = current_edge->src();
+
+    // If the node is stateful, assume the graph is not constant.
+    if (current_node->op_def().is_stateful()) {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
+    // During construction or import from GraphConstructor, back edges may not
+    // be filled in. In addition, control flow constructs may depend on control
+    // edges which aren't handled by this method. Don't constant fold through
+    // merges at all for now.
+    if (IsMerge(current_node)) {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
+    // Don't constant fold enter/exit currently either, as it's easy to end
+    // up with a partial frame.
+    if (IsEnter(current_node) || IsExit(current_node)) {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
+    // Placeholders should never be constant folded because their outputs are
+    // fed by the user. Note that "Placeholder" nodes have no inputs so are
+    // handled below.
+    if (current_node->type_string() == "PlaceholderWithDefault") {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
+    // If there is nothing more to recurse down, see if
+    // the generator node is a constant.
+    if (current_node->num_inputs() == 0) {
+      if (!current_node->IsConstant()) {
+        // Generator node is not a constant, so subgraph is not
+        // constant.
+        *is_constant_graph = false;
+        return Status::OK();
+      }
+    }
+
+    // Either the node is a constant, or the node is a potential
+    // intermediate node on the path from a constant.
+    //
+    // Add a copy of its node and a new edge to the new subgraph.
+
+    // Get or create the version of 'current_node' in the new graph.
+    Node* current_node_copy;
+    // This gets or creates the NodeAndRecursed entry for current_node.
+    NodeAndRecursed* node_and_recursed = &old_to_new_and_recursed[current_node];
+    if (node_and_recursed->new_node == nullptr) {
+      // First time processing this node.
+      current_node_copy = out_graph->CopyNode(current_node);
+      // Track the mapping from the original node to the new one.
+      node_and_recursed->new_node = current_node_copy;
+    } else {
+      current_node_copy = node_and_recursed->new_node;
+    }
+
+    // Add the edge to the destination node.
+    {
+      auto it = old_to_new_and_recursed.find(current_edge->dst());
+      if (it == old_to_new_and_recursed.end()) {
+        return errors::Internal(
+            "Could not find mapping from old to new copy of destination node: ",
+            current_edge->dst()->name());
+      }
+      Node* dst_copy = it->second.new_node;
+
+      out_graph->AddEdge(current_node_copy, current_edge->src_output(),
+                         dst_copy, current_edge->dst_input());
+    }
+
+    const string& output_tensor_name =
+        strings::StrCat(current_node->name(), ":", current_edge->src_output());
+
+    // Some tensor values can be inferred. For example, a shape op
+    // with input shapes fully defined can have its output tensor inferred.
+    Tensor tensor_inferred;
+    bool successfully_inferred_tensor = false;
+    TF_RETURN_IF_ERROR(TryToInferTensorOutputFromInputShapes(
+        *current_edge, refiner, &tensor_inferred,
+        &successfully_inferred_tensor));
+    if (successfully_inferred_tensor) {
+      const_inputs->emplace_back(output_tensor_name, tensor_inferred);
+      const_inputs_added.insert(output_tensor_name);
+      continue;
+    }
+
+    // If we have a copy of the input tensor materialized already,
+    // then add to the list of inputs to feed and do not recurse further.
+    if (cached_values != nullptr) {
+      auto it = cached_values->find(output_tensor_name);
+      if (it != cached_values->end() &&
+          const_inputs_added.count(output_tensor_name) == 0) {
+        const_inputs->emplace_back(output_tensor_name, it->second);
+        const_inputs_added.insert(output_tensor_name);
+        continue;
+      }
+    }
+
+    // If this node's inputs have not been processed already, do so now.
+    if (!node_and_recursed->recursed) {
+      node_and_recursed->recursed = true;
+      for (const Edge* e : current_node->in_edges()) {
+        if (e->IsControlEdge()) continue;
+        edges_to_visit.push_back(e);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+Status EvaluateConstantTensor(OutputTensor tensor, const ShapeRefiner& refiner,
+                              const OpRegistryInterface& ops,
+                              int32 graph_def_version, bool* evaluated,
+                              Tensor* result, GraphRunner* graph_runner,
+                              std::unordered_map<string, Tensor>* cached_values,
+                              int64 max_cached_value_size,
+                              bool disable_constant_propagation) {
+  *evaluated = false;
+  const Node* src = tensor.node;
+
+  // Simple case: the source node is a constant
+  if (src->IsConstant()) {
+    if (result->FromProto(src->def().attr().at("value").tensor())) {
+      *evaluated = true;
+      return Status::OK();
+    }
+  }
+
+  if (disable_constant_propagation) {
+    return Status::OK();
+  }
+
+  bool is_constant_graph = false;
+  Graph subgraph(&ops);
+  auto versions = subgraph.versions();
+  versions.set_producer(graph_def_version);
+  subgraph.set_versions(versions);
+
+  std::vector<std::pair<string, Tensor>> const_inputs;
+  TF_RETURN_IF_ERROR(ExtractConstantSubgraph(*src, refiner, cached_values,
+                                             &subgraph, &is_constant_graph,
+                                             &const_inputs));
+  if (!is_constant_graph) {
+    return Status::OK();
+  }
+  const string output_tensor_name =
+      strings::StrCat(src->name(), ":", tensor.index);
+  std::vector<Tensor> outputs;
+
+  std::unique_ptr<GraphRunner> graph_runner_storage;
+  if (graph_runner == nullptr) {
+    // TODO(skyewm): Convert to std::make_unique when available.
+    graph_runner_storage.reset(new GraphRunner(Env::Default()));
+    graph_runner = graph_runner_storage.get();
+  }
+
+  // NOTE; we should pass in a function library runtime if we want
+  // to support constant-expression evaluation on functions.
+  Status s = graph_runner->Run(&subgraph, nullptr /* function_library */,
+                               const_inputs, {output_tensor_name}, &outputs);
+
+  // If all kernels in the constant graph are not registered
+  // in the process, GraphRunner::Run may fail, in which case
+  // we cannot propagate constants, so this is best-effort.
+  if (s.ok()) {
+    *result = outputs[0];
+    *evaluated = true;
+
+    // We memoize (small) constants evaluated so far, so
+    // ExtractConstantSubgraph can avoid extracting the full
+    // subgraph.  As we build up large graphs, this avoids
+    // repeated computation of the early parts of a constant
+    // graph.
+    if (cached_values != nullptr &&
+        outputs[0].TotalBytes() <= max_cached_value_size) {
+      (*cached_values)[output_tensor_name] = outputs[0];
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eval_const_tensor.h b/tensorflow/core/common_runtime/eval_const_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..fca5a235695ce121552656bced5f4001b146cfd0
--- /dev/null
+++ b/tensorflow/core/common_runtime/eval_const_tensor.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EVAL_CONST_TENSOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EVAL_CONST_TENSOR_H_
+
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+// TODO(skyewm): can this be combined with ConstantFold?
+
+namespace tensorflow {
+
+class GraphRunner;
+class OpRegistryInterface;
+class ShapeRefiner;
+class Tensor;
+
+// Attempts to evaluate `tensor`. This will only be possible if `tensor` doesn't
+// depend on any graph inputs (this function is safe to call if this isn't the
+// case though).
+//
+// If the evaluation is successful, `evaluated` will be set to true and
+// `tensor`s value returned in `result`. Otherwise `evaluated` will be set to
+// false. An error status is returned if something is wrong with the graph or
+// input. Note that `evaluated` may set to false if Status::OK() is returned.
+//
+// Params:
+//   tensor - the tensor to be evaluated.
+//   refiner - used to fetch the InferenceContexts for nodes in the graph.
+//   ops - the OpRegistryInterface for the graph.
+//   graph_def_version - the producer version of the graph.
+//   evaluated - output param indicating whether evaluation was successful.
+//   result - output param containing the result if evaluated is true.
+//   graph_runner - optional. If not set, a GraphRunner will be created for
+//     evaluating tensor. This can be set to avoid creating a new GraphRunner
+//     for every call.
+//   cached_values - optional. This can be used to cache evaluated results
+//     across calls, to avoid evaluating the same parts of the graph multiple
+//     times.
+//   max_cached_value_size - optional. If `cached_values` is set, the maximum
+//     result size to cache.
+//   disable_constant_propagation - if true, only Const node values will be
+//     returned.
+Status EvaluateConstantTensor(
+    OutputTensor tensor, const ShapeRefiner& refiner,
+    const OpRegistryInterface& ops, int32 graph_def_version, bool* evaluated,
+    Tensor* result, GraphRunner* graph_runner = nullptr,
+    std::unordered_map<string, Tensor>* cached_values = nullptr,
+    int64 max_cached_value_size = 1024,
+    bool disable_constant_propagation = false);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EVAL_CONST_TENSOR_H_
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index b06b75d6585f01640374eb7ab9842bf441cf9411..0c461a9ee98ca61fb3d3f165d93adf0e5cec7ee7 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -258,6 +258,13 @@ struct NodeItem {
   // Return array of per-output allocator attributes.
   const AllocatorAttributes* output_attrs() const { return output_attr_base(); }
 
+  // Return array of expected input index from which each output should
+  // be forwarded:
+  // kNeverForward (-2) for DO NOT FORWARD (must allocate).
+  // kNoReservation (-1) for no expected forwarding.
+  // 0... for forward from that input.
+  const int* forward_from() const { return forward_from_base(); }
+
  private:
   friend class GraphView;
 
@@ -267,6 +274,7 @@ struct NodeItem {
   //   AllocatorAttributes output_attr[num_outputs];
   //   uint8               input_type[num_inputs];
   //   uint8               output_type[num_outputs];
+  //   int                 forward_from[num_outputs];
 
   // Return pointer to variable length section.
   char* var() const {
@@ -292,6 +300,13 @@ struct NodeItem {
         sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);
   }
 
+  int* forward_from_base() const {
+    return reinterpret_cast<int*>(var() + sizeof(EdgeInfo) * num_output_edges +
+                                  sizeof(AllocatorAttributes) * num_outputs +
+                                  sizeof(uint8) * num_inputs +
+                                  sizeof(uint8) * num_outputs);
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(NodeItem);
 };
 
@@ -466,7 +481,8 @@ size_t GraphView::NodeItemBytes(const Node* n) {
       + num_output_edges * sizeof(EdgeInfo)        // output_edges[...]
       + num_outputs * sizeof(AllocatorAttributes)  // output_attr[...]
       + num_inputs * sizeof(uint8)                 // input_type[num_inputs]
-      + num_outputs * sizeof(uint8);               // output_type[num_outputs]
+      + num_outputs * sizeof(uint8)                // output_type[num_outputs]
+      + num_outputs * sizeof(int);                 // forward_from[num_outputs]
   static constexpr size_t kItemAlignment = sizeof(NodeItem*);
   static_assert(kItemAlignment % alignof(NodeItem) == 0,
                 "NodeItem must be aligned with kItemAlignment");
@@ -737,8 +753,8 @@ Status InferAllocAttr(const Node* n, const Node* dst,
       VLOG(2) << "node " << n->name() << " is the sink of an RPC in";
     } else if ((local_dev_name.type == "CPU" || n->IsHostRecv()) &&
                parsed_src_name.type != "CPU") {
-      // Value is going to be the sink of a local DMA from GPU to CPU (or other
-      // types of accelerators).
+      // Value is going to be the sink of a local DMA from GPU to CPU (or
+      // other types of accelerators).
       attr->set_gpu_compatible(true);
       VLOG(2) << "node " << n->name() << " is the sink of a gpu->cpu copy";
     } else {
@@ -1022,7 +1038,8 @@ class ExecutorState {
     int total_input_tensors = 0;
     std::vector<const Node*>* nodes = nullptr;
 
-    // Lock ordering: ExecutorState.mu_ < mu.
+    // Lock ordering: ExecutorState.mu_ < mu;
+    // during structured traversal: parent_frame->mu < mu.
     mutex mu;
 
     void InitializeFrameInfo(const string& enter_name) {
@@ -1090,7 +1107,8 @@ class ExecutorState {
     void ActivateLoopInvs(const GraphView* gview, int64 iter,
                           TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
-    // Add a new loop invariant and make it available to all active iterations.
+    // Add a new loop invariant and make it available to all active
+    // iterations.
     void AddLoopInv(const NodeItem* item, const Entry& value,
                     TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
@@ -1147,8 +1165,8 @@ class ExecutorState {
         if (front_index_ == ready_.size()) {
           ready_.clear();
         } else {
-          // Lots of unused entries at beginning of vector: move everything down
-          // to start of vector.
+          // Lots of unused entries at beginning of vector: move everything
+          // down to start of vector.
           ready_.erase(ready_.begin(), ready_.begin() + front_index_);
         }
         front_index_ = 0;
@@ -1596,6 +1614,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
       params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter);
       params.is_input_dead = is_input_dead;
       params.output_attr_array = item.output_attrs();
+      params.forward_from_array = nullptr;  // later: item.forward_from();
 
       if (item.kernel_is_async) {
         // Asynchronous computes.
@@ -2333,8 +2352,9 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) {
   FrameState* parent_frame = frame->parent_frame;
   const int64 parent_iter = frame->parent_iter;
   if (parent_frame != nullptr) {
-    mutex_lock paranet_frame_lock(parent_frame->mu);
+    mutex_lock parent_frame_lock(parent_frame->mu);
     // Propagate all the dead exits to the parent frame.
+    mutex_lock this_frame_lock(frame->mu);
     for (const Node* node : frame->dead_exits) {
       auto parent_iter_state = parent_frame->GetIteration(parent_iter);
       for (const Edge* e : node->out_edges()) {
@@ -2603,7 +2623,7 @@ void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) {
   (new ExecutorState(args, this))->RunAsync(std::move(done));
 }
 
-}  // end namespace
+}  // namespace
 
 Status NewLocalExecutor(const LocalExecutorParams& params,
                         std::unique_ptr<const Graph> graph,
@@ -2629,4 +2649,4 @@ Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
 
 void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index b941819838a7b155d8c8f54985bd6ae8bc15ce9d..a6f637b48837a097e613fc8aa51485b26a94efdb 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/graph/gradients.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/optimizer_cse.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -42,11 +43,8 @@ limitations under the License.
 namespace tensorflow {
 
 // A few string constant used throughout this module.
-//
-// TODO(zhifengc): Dedup some of these constants into
-// framework/function.h
-static constexpr const char* const kArgOp = "_Arg";
-static constexpr const char* const kRetOp = "_Retval";
+static constexpr const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static constexpr const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
 static constexpr const char* const kGradientOp =
     FunctionLibraryDefinition::kGradientOp;
 static constexpr const char* const kNodeLabel = "Func";
@@ -144,6 +142,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   FunctionLibraryRuntimeImpl(const DeviceMgr* dmgr, Env* env, Device* device,
                              int graph_def_version,
                              const FunctionLibraryDefinition* lib_def,
+                             thread::ThreadPool* default_thread_pool,
                              const OptimizerOptions& optimizer_options,
                              CustomKernelCreator custom_kernel_creator,
                              ProcessFunctionLibraryRuntime* parent);
@@ -177,6 +176,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   }
 
   Device* device() override { return device_; }
+  const DeviceMgr* device_mgr() const override { return device_mgr_; }
   Env* env() override { return env_; }
   int graph_def_version() override { return graph_def_version_; }
 
@@ -196,6 +196,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   const FunctionLibraryDefinition* const base_lib_def_;
   GraphOptimizer optimizer_;
   const CustomKernelCreator custom_kernel_creator_;
+  Executor::Args::Runner default_runner_;
   const string device_name_;
 
   std::function<Status(const string&, const OpDef**)> get_func_sig_;
@@ -208,6 +209,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
+    bool invalidated = false;
     const Graph* graph = nullptr;                            // Owned by exec.
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
@@ -245,6 +247,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
     const DeviceMgr* dmgr, Env* env, Device* device, int graph_def_version,
     const FunctionLibraryDefinition* lib_def,
+    thread::ThreadPool* default_thread_pool,
     const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     ProcessFunctionLibraryRuntime* parent)
@@ -255,6 +258,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
       base_lib_def_(lib_def),
       optimizer_(optimizer_options),
       custom_kernel_creator_(std::move(custom_kernel_creator)),
+      default_runner_(nullptr),
       device_name_(device_ == nullptr
                        ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice
                        : device_->name()),
@@ -266,18 +270,22 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
   create_kernel_ = [this](const NodeDef& ndef, OpKernel** kernel) {
     return CreateKernel(ndef, kernel);
   };
+  thread::ThreadPool* pool = nullptr;
+  if (device_ != nullptr) {
+    pool = device_->tensorflow_device_thread_pool();
+  }
+  if (pool == nullptr) {
+    pool = default_thread_pool;
+  }
+  if (pool != nullptr) {
+    default_runner_ = [pool](Executor::Args::Closure c) {
+      pool->Schedule(std::move(c));
+    };
+  }
 }
 
 FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
-  // The most common patterns of FLR usage don't require the caller to
-  // explicitly release handles. As a result, we try to unref each item until
-  // it's erased.
-  for (auto item : items_) {
-    if (item.second) {
-      while (!item.second->Unref()) {
-      }
-    }
-  }
+  for (auto p : items_) p.second->Unref();
 }
 
 // An asynchronous op kernel which executes an instantiated function
@@ -481,11 +489,32 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
   InstantiateOptions options_copy(options);
   options_copy.target = device_name_;
   const string key = Canonicalize(function_name, attrs, options_copy);
-  *handle = parent_->GetHandle(key);
-  if (*handle != kInvalidHandle) {
+
+  Handle found_handle = kInvalidHandle;
+  {
     mutex_lock l(mu_);
-    items_[parent_->GetHandleOnDevice(device_name_, *handle)]->Ref();
-    return Status::OK();
+    found_handle = parent_->GetHandle(key);
+    if (found_handle != kInvalidHandle) {
+      FunctionLibraryRuntime::LocalHandle handle_on_device =
+          parent_->GetHandleOnDevice(device_name_, found_handle);
+      if (handle_on_device == kInvalidLocalHandle) {
+        return errors::Internal("LocalHandle not found for handle ", *handle,
+                                ".");
+      }
+      auto iter = items_.find(handle_on_device);
+      if (iter == items_.end()) {
+        return errors::Internal("LocalHandle ", handle_on_device,
+                                " for handle ", found_handle,
+                                " not found in items.");
+      }
+      Item* item = iter->second;
+      if (!item->invalidated) {
+        *handle = found_handle;
+        return Status::OK();
+      }
+      // *item is invalidated. Fall through and instantiate the given
+      // function_name/attrs/option again.
+    }
   }
 
   Status s;
@@ -516,10 +545,10 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
 
   {
     mutex_lock l(mu_);
-    *handle = parent_->GetHandle(key);
-    if (*handle != kInvalidHandle) {
+    Handle found_handle_again = parent_->GetHandle(key);
+    if (found_handle_again != found_handle) {
       delete fbody;
-      items_[parent_->GetHandleOnDevice(device_name_, *handle)]->Ref();
+      *handle = found_handle_again;
     } else {
       *handle = parent_->AddHandle(key, device_name_, next_handle_);
       Item* item = new Item;
@@ -536,15 +565,12 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
   if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
     return parent_->ReleaseHandle(handle);
   }
-
   LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
+  CHECK_NE(h, kInvalidLocalHandle);
   mutex_lock l(mu_);
   CHECK_EQ(1, items_.count(h));
   Item* item = items_[h];
-  if (item->Unref()) {
-    items_.erase(h);
-    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
-  }
+  item->invalidated = true;  // Reinstantiate later.
   return Status::OK();
 }
 
@@ -705,6 +731,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   // computation is done and stored in *rets, we send the return values back
   // to the source_device (caller) so that the ProcFLR can receive them later.
   std::vector<Tensor>* remote_args = new std::vector<Tensor>;
+  item->Ref();
   ProcessFunctionLibraryRuntime::ReceiveTensorsAsync(
       source_device, target_device, "arg_", src_incarnation, args.size(),
       device_context, {}, rendezvous, remote_args,
@@ -716,6 +743,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           s = frame->SetArgs(*remote_args);
         }
         if (!s.ok()) {
+          item->Unref();
           delete frame;
           delete remote_args;
           delete exec_args;
@@ -726,6 +754,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
             *exec_args, [item, frame, rets, done, source_device, target_device,
                          target_incarnation, rendezvous, device_context,
                          remote_args, exec_args](const Status& status) {
+              core::ScopedUnref unref(item);
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets);
@@ -765,13 +794,17 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
       done(status);
     };
   }
+
+  if (run_opts.runner == nullptr) {
+    run_opts.runner = &default_runner_;
+  }
+  DCHECK(run_opts.runner != nullptr);
+
   if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
 
-  DCHECK(run_opts.runner != nullptr);
-
   Executor::Args* exec_args = new Executor::Args;
   // Inherit the step_id from the caller.
   exec_args->step_id = run_opts.step_id;
@@ -807,11 +840,13 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     return;
   }
 
+  item->Ref();
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
       [item, frame, rets, done, exec_args](const Status& status) {
+        core::ScopedUnref unref(item);
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets);
@@ -856,6 +891,9 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     done(s);
     return;
   }
+  if (run_opts.runner == nullptr) {
+    run_opts.runner = &default_runner_;
+  }
   DCHECK(run_opts.runner != nullptr);
 
   Executor::Args* exec_args = new Executor::Args;
@@ -868,6 +906,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   exec_args->runner = *run_opts.runner;
   exec_args->call_frame = frame;
 
+  item->Ref();
   item->exec->RunAsync(
       // Executor args
       *exec_args,
@@ -876,6 +915,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
           [item, frame, exec_args](DoneCallback done,
                                    // Start unbound arguments.
                                    const Status& status) {
+            core::ScopedUnref unref(item);
             delete exec_args;
             done(status);
           },
@@ -944,21 +984,21 @@ void RegisterDefaultCustomKernelCreator(CustomKernelCreator cb) {
 std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, Device* device,
     int graph_def_version, const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options,
+    thread::ThreadPool* thread_pool, const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     ProcessFunctionLibraryRuntime* parent) {
   return std::unique_ptr<FunctionLibraryRuntime>(new FunctionLibraryRuntimeImpl(
-      device_mgr, env, device, graph_def_version, lib_def, optimizer_options,
-      std::move(custom_kernel_creator), parent));
+      device_mgr, env, device, graph_def_version, lib_def, thread_pool,
+      optimizer_options, std::move(custom_kernel_creator), parent));
 }
 
 std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, Device* device,
     int graph_def_version, const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options,
+    thread::ThreadPool* thread_pool, const OptimizerOptions& optimizer_options,
     ProcessFunctionLibraryRuntime* parent) {
   return NewFunctionLibraryRuntime(device_mgr, env, device, graph_def_version,
-                                   lib_def, optimizer_options,
+                                   lib_def, thread_pool, optimizer_options,
                                    GetCustomCreatorSingleton()->Get(), parent);
 }
 
@@ -1580,9 +1620,6 @@ Status FunctionDefToBodyHelper(
 
   // Call BuildControlFlowInfo to validate that this function body has
   // well-formed control flow.
-  // NOTE(skyewm): this is usually done in Partition(), but we don't partition
-  // function bodies. This should be removed if function bodies ever go through
-  // the Partition() path.
   std::vector<ControlFlowInfo> dummy;
   TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph.get(), &dummy));
 
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index 477340d87a3c6ae311b3f5125af9b4fb0f66c6ad..a0f9fcae0aaf63c62ef194f5cb8e84d2d53b321a 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -55,7 +55,7 @@ void RegisterDefaultCustomKernelCreator(CustomKernelCreator cb);
 std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, Device* device,
     int graph_def_version, const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options,
+    thread::ThreadPool* thread_pool, const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     ProcessFunctionLibraryRuntime* parent);
 
@@ -65,7 +65,7 @@ std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
 std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, Device* device,
     int graph_def_version, const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options,
+    thread::ThreadPool* thread_pool, const OptimizerOptions& optimizer_options,
     ProcessFunctionLibraryRuntime* parent);
 
 // FunctionLibraryRuntime::GetFunctionBody returns a description of an
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 63ad0d231c28a5af144b61e967a73e8ecfe6049a..373fc64007e43e87d1a4af1263cabbd5757773c1 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -38,6 +38,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -52,8 +54,8 @@ Status GetOpSig(const string& op, const OpDef** sig) {
   return OpRegistry::Global()->LookUpOpDef(op, sig);
 }
 
-void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+void HasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
@@ -135,7 +137,8 @@ TEST_F(FunctionTest, WXPlusB) {
 
 class FunctionLibraryRuntimeTest : public ::testing::Test {
  protected:
-  void Init(const std::vector<FunctionDef>& flib) {
+  void Init(const std::vector<FunctionDef>& flib,
+            thread::ThreadPool* default_thread_pool = nullptr) {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 3});
@@ -149,7 +152,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     device_mgr_.reset(new DeviceMgr(devices_));
     pflr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
-        opts, nullptr /* cluster_flr */));
+        opts, default_thread_pool, nullptr /* cluster_flr */));
     flr0_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
     flr1_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:1");
     flr2_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:2");
@@ -158,16 +161,20 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
              FunctionLibraryRuntime::Options opts,
-             const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
+             const std::vector<Tensor>& args, std::vector<Tensor*> rets,
+             bool add_runner = true) {
     std::atomic<int32> call_count(0);
     std::function<void(std::function<void()>)> runner =
         [&call_count](std::function<void()> fn) {
           ++call_count;
           test::function::FunctionTestSchedClosure(fn);
         };
-
+    if (add_runner) {
+      opts.runner = &runner;
+    } else {
+      opts.runner = nullptr;
+    }
     Notification done;
-    opts.runner = &runner;
     std::vector<Tensor> out;
     Status status;
     flr->Run(opts, handle, args, &out, [&status, &done](const Status& s) {
@@ -183,7 +190,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       *rets[i] = out[i];
     }
 
-    EXPECT_GE(call_count, 1);  // Test runner is used.
+    if (add_runner) {
+      EXPECT_GE(call_count, 1);  // Test runner is used.
+    }
 
     return Status::OK();
   }
@@ -204,49 +213,43 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
   Status InstantiateAndRun(FunctionLibraryRuntime* flr, const string& name,
                            test::function::Attrs attrs,
                            const std::vector<Tensor>& args,
-                           std::vector<Tensor*> rets) {
+                           std::vector<Tensor*> rets, bool add_runner = true) {
     return InstantiateAndRun(flr, name, attrs,
                              FunctionLibraryRuntime::InstantiateOptions(), args,
-                             std::move(rets));
+                             std::move(rets), add_runner);
   }
 
   Status InstantiateAndRun(
       FunctionLibraryRuntime* flr, const string& name,
       test::function::Attrs attrs,
       const FunctionLibraryRuntime::InstantiateOptions& options,
-      const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
+      const std::vector<Tensor>& args, std::vector<Tensor*> rets,
+      bool add_runner = true) {
     FunctionLibraryRuntime::Handle handle;
     Status status = flr->Instantiate(name, attrs, options, &handle);
     if (!status.ok()) {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    status = Run(flr, handle, opts, args, rets);
-    if (!status.ok()) return status;
-
-    // Release the handle and try running again. It should not succeed.
-    status = flr->ReleaseHandle(handle);
-    if (!status.ok()) return status;
-
-    Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
-
-    return status;
+    TF_RETURN_IF_ERROR(Run(flr, handle, opts, args, rets, add_runner));
+    return flr->ReleaseHandle(handle);
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
-             FunctionLibraryRuntime::Options opts, CallFrameInterface* frame) {
+             FunctionLibraryRuntime::Options opts, CallFrameInterface* frame,
+             bool add_runner = true) {
     std::atomic<int32> call_count(0);
     std::function<void(std::function<void()>)> runner =
         [&call_count](std::function<void()> fn) {
           ++call_count;
           test::function::FunctionTestSchedClosure(fn);
         };
-
+    if (add_runner) {
+      opts.runner = &runner;
+    } else {
+      opts.runner = nullptr;
+    }
     Notification done;
-    opts.runner = &runner;
     std::vector<Tensor> out;
     Status status;
     flr->Run(opts, handle, frame, [&status, &done](const Status& s) {
@@ -258,7 +261,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
 
-    EXPECT_GE(call_count, 1);  // Test runner is used.
+    if (add_runner) {
+      EXPECT_GE(call_count, 1);  // Test runner is used.
+    }
 
     return Status::OK();
   }
@@ -288,16 +293,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       *rets[i] = retvals[i];
     }
 
-    // Release the handle and try running again. It should not succeed.
-    status = flr->ReleaseHandle(handle);
-    if (!status.ok()) return status;
-
-    Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
-
-    return status;
+    // Release the handle.
+    return flr->ReleaseHandle(handle);
   }
 
   std::unique_ptr<Graph> GetFuncBody(FunctionLibraryRuntime* flr,
@@ -447,7 +444,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
   {
     // Simple case: instantiating with no state_handle.
     for (int32 expected : {6, 4}) {
-      TF_CHECK_OK(Run(flr0_, handle, opts, {}, {&y}));
+      TF_CHECK_OK(Run(flr0_, handle, opts, {}, {&y}, true));
       test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
     }
   }
@@ -460,7 +457,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
         Instantiate(flr0_, "RandomUniformWrapper", {}, &handle_non_isolated));
     EXPECT_EQ(handle, handle_non_isolated);
     for (int32 expected : {0, 1}) {
-      TF_CHECK_OK(Run(flr0_, handle_non_isolated, opts, {}, {&y}));
+      TF_CHECK_OK(Run(flr0_, handle_non_isolated, opts, {}, {&y}, true));
       test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
     }
   }
@@ -475,7 +472,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
                             &handle_isolated));
     EXPECT_NE(handle, handle_isolated);
     for (int32 expected : {6, 4, 0, 1}) {
-      TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}));
+      TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}, true));
       test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
     }
   }
@@ -490,7 +487,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
                             &handle_isolated));
     EXPECT_NE(handle, handle_isolated);
     for (int32 expected : {6, 4, 0, 1}) {
-      TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}));
+      TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}, true));
       test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
     }
   }
@@ -507,7 +504,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
                               &handle_isolated));
       EXPECT_NE(handle, handle_isolated);
       for (int32 expected : {6, 4, 0, 1}) {
-        TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}));
+        TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}, true));
         test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
       }
       TF_CHECK_OK(flr0_->ReleaseHandle(handle_isolated));
@@ -1247,14 +1244,14 @@ TEST_F(FunctionLibraryRuntimeTest, CrossDevice) {
   opts.rendezvous = new IntraProcessRendezvous(device_mgr_.get());
   opts.source_device = "/device:CPU:1";
   // Run on flr1_, flr2_ and make sure that the device it ran on was cpu:1.
-  TF_CHECK_OK(Run(flr1_, handle, opts, {}, {&y}));
+  TF_CHECK_OK(Run(flr1_, handle, opts, {}, {&y}, true));
   test::ExpectTensorEqual<string>(
       y,
       test::AsTensor<string>({"/job:localhost/replica:0/task:0/device:CPU:1"},
                              TensorShape({})));
   opts.remote_execution = true;
   opts.source_device = "/job:localhost/replica:0/task:0/cpu:2";
-  TF_CHECK_OK(Run(flr2_, handle, opts, {}, {&y}));
+  TF_CHECK_OK(Run(flr2_, handle, opts, {}, {&y}, true));
   test::ExpectTensorEqual<string>(
       y,
       test::AsTensor<string>({"/job:localhost/replica:0/task:0/device:CPU:1"},
diff --git a/tensorflow/core/common_runtime/function_testlib.cc b/tensorflow/core/common_runtime/function_testlib.cc
index 87c2476b04af7300d7138d59b3261496eb38c482..1720ee64c07ae744ebb6f0d4ac89738a4e4419e7 100644
--- a/tensorflow/core/common_runtime/function_testlib.cc
+++ b/tensorflow/core/common_runtime/function_testlib.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function_testlib.h"
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,7 +40,9 @@ class FindDeviceOpKernel : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("FindDeviceOp").Device(tensorflow::DEVICE_CPU),
                         FindDeviceOpKernel);
-REGISTER_OP("FindDeviceOp").Output("device_name: string");
+REGISTER_OP("FindDeviceOp")
+    .Output("device_name: string")
+    .SetShapeFn(shape_inference::UnknownShape);
 
 FunctionDef FindDevice() {
   return FDH::Define(
@@ -55,6 +58,59 @@ FunctionDef FindDevice() {
       {{{"device_name"}, "FindDeviceOp", {}, {}}});
 }
 
+void BlockingOpState::AwaitState(int awaiting_state) {
+  mutex_lock ml(mu_);
+  while (state_ != awaiting_state) {
+    cv_.wait(ml);
+  }
+}
+
+void BlockingOpState::MoveToState(int expected_current, int next) {
+  mutex_lock ml(mu_);
+  CHECK_EQ(expected_current, state_);
+  state_ = next;
+  cv_.notify_all();
+}
+
+BlockingOpState* blocking_op_state = nullptr;
+
+// BlockingOp blocks on the global <blocking_op_state's> state,
+// and also updates it when it is unblocked and finishing computation.
+class BlockingOp : public OpKernel {
+ public:
+  explicit BlockingOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    blocking_op_state->MoveToState(0, 1);
+    blocking_op_state->AwaitState(2);
+    blocking_op_state->MoveToState(2, 3);
+
+    Tensor* out = nullptr;
+    const Tensor& in = ctx->input(0);
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in.shape(), &out));
+    out->flat<float>() = in.flat<float>();
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
+REGISTER_OP("BlockingOp")
+    .Input("x: float")
+    .Output("y: float")
+    .Doc("")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+FunctionDef BlockingOpFn() {
+  return FDH::Define(
+      // Name
+      "BlockingOpFn",
+      // Args
+      {"x: float"},
+      // Return values
+      {"y: float"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"y"}, "BlockingOp", {"x"}, {}}});
+}
+
 // TODO(phawkins): replace with C++ API for calling functions, when that exists.
 Output Call(Scope* scope, const string& op_name, const string& fn_name,
             gtl::ArraySlice<Input> inputs) {
diff --git a/tensorflow/core/common_runtime/function_testlib.h b/tensorflow/core/common_runtime/function_testlib.h
index 3ddb26de929dc19792142dffde345672aafaadce..fb967a612336ee28f9e5abf186a380203dd4f0c3 100644
--- a/tensorflow/core/common_runtime/function_testlib.h
+++ b/tensorflow/core/common_runtime/function_testlib.h
@@ -25,6 +25,22 @@ namespace function {
 // {} -> y:DT_STRING (device where this op runs).
 FunctionDef FindDevice();
 
+class BlockingOpState {
+ public:
+  void AwaitState(int awaiting_state);
+
+  void MoveToState(int expected_current, int next);
+
+ private:
+  mutex mu_;
+  condition_variable cv_;
+  int state_ = 0;
+};
+
+extern BlockingOpState* blocking_op_state;
+
+FunctionDef BlockingOpFn();
+
 // Adds a function call to the given scope and returns the output for the node.
 // TODO(phawkins): replace with C++ API for calling functions, when that exists.
 Output Call(Scope* scope, const string& op_name, const string& fn_name,
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98dac38a8cb903695506714ff07ab23f66594027
--- /dev/null
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -0,0 +1,247 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/function.h"
+
+#include <atomic>
+#include <utility>
+
+#include "tensorflow/cc/ops/array_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/executor.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/equal_graph_def.h"
+
+namespace tensorflow {
+namespace {
+
+class FunctionLibraryRuntimeTest : public ::testing::Test {
+ protected:
+  void Init(const std::vector<FunctionDef>& flib,
+            thread::ThreadPool* default_thread_pool = nullptr) {
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", 3});
+    TF_CHECK_OK(DeviceFactory::AddDevices(
+        options, "/job:localhost/replica:0/task:0", &devices_));
+
+    FunctionDefLibrary proto;
+    for (const auto& fdef : flib) *(proto.add_function()) = fdef;
+    lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
+    OptimizerOptions opts;
+    device_mgr_.reset(new DeviceMgr(devices_));
+    pflr_.reset(new ProcessFunctionLibraryRuntime(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
+        opts, default_thread_pool, nullptr /* cluster_flr */));
+    flr0_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+    flr1_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:1");
+    flr2_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:2");
+    fdef_lib_ = lib_def_->ToProto();
+  }
+
+  Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
+             FunctionLibraryRuntime::Options opts,
+             const std::vector<Tensor>& args, std::vector<Tensor*> rets,
+             bool add_runner = true) {
+    std::atomic<int32> call_count(0);
+    std::function<void(std::function<void()>)> runner =
+        [&call_count](std::function<void()> fn) {
+          ++call_count;
+          test::function::FunctionTestSchedClosure(fn);
+        };
+    if (add_runner) {
+      opts.runner = &runner;
+    } else {
+      opts.runner = nullptr;
+    }
+    Notification done;
+    std::vector<Tensor> out;
+    Status status;
+    flr->Run(opts, handle, args, &out, [&status, &done](const Status& s) {
+      status = s;
+      done.Notify();
+    });
+    done.WaitForNotification();
+    if (!status.ok()) {
+      return status;
+    }
+    CHECK_EQ(rets.size(), out.size());
+    for (size_t i = 0; i < rets.size(); ++i) {
+      *rets[i] = out[i];
+    }
+
+    if (add_runner) {
+      EXPECT_GE(call_count, 1);  // Test runner is used.
+    }
+
+    return Status::OK();
+  }
+
+  Status Instantiate(FunctionLibraryRuntime* flr, const string& name,
+                     test::function::Attrs attrs,
+                     FunctionLibraryRuntime::Handle* handle) {
+    return flr->Instantiate(name, attrs, handle);
+  }
+
+  Status Instantiate(FunctionLibraryRuntime* flr, const string& name,
+                     test::function::Attrs attrs,
+                     const FunctionLibraryRuntime::InstantiateOptions& options,
+                     FunctionLibraryRuntime::Handle* handle) {
+    return flr->Instantiate(name, attrs, options, handle);
+  }
+
+  Status InstantiateAndRun(FunctionLibraryRuntime* flr, const string& name,
+                           test::function::Attrs attrs,
+                           const std::vector<Tensor>& args,
+                           std::vector<Tensor*> rets, bool add_runner = true) {
+    return InstantiateAndRun(flr, name, attrs,
+                             FunctionLibraryRuntime::InstantiateOptions(), args,
+                             std::move(rets), add_runner);
+  }
+
+  Status InstantiateAndRun(
+      FunctionLibraryRuntime* flr, const string& name,
+      test::function::Attrs attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      const std::vector<Tensor>& args, std::vector<Tensor*> rets,
+      bool add_runner = true) {
+    FunctionLibraryRuntime::Handle handle;
+    Status status = flr->Instantiate(name, attrs, options, &handle);
+    if (!status.ok()) {
+      return status;
+    }
+    FunctionLibraryRuntime::Options opts;
+    return Run(flr, handle, opts, args, std::move(rets), add_runner);
+  }
+
+  Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
+             FunctionLibraryRuntime::Options opts, CallFrameInterface* frame,
+             bool add_runner = true) {
+    std::atomic<int32> call_count(0);
+    std::function<void(std::function<void()>)> runner =
+        [&call_count](std::function<void()> fn) {
+          ++call_count;
+          test::function::FunctionTestSchedClosure(fn);
+        };
+    if (add_runner) {
+      opts.runner = &runner;
+    } else {
+      opts.runner = nullptr;
+    }
+    Notification done;
+    std::vector<Tensor> out;
+    Status status;
+    flr->Run(opts, handle, frame, [&status, &done](const Status& s) {
+      status = s;
+      done.Notify();
+    });
+    done.WaitForNotification();
+    if (!status.ok()) {
+      return status;
+    }
+
+    if (add_runner) {
+      EXPECT_GE(call_count, 1);  // Test runner is used.
+    }
+
+    return Status::OK();
+  }
+
+  FunctionLibraryRuntime* flr0_;
+  FunctionLibraryRuntime* flr1_;
+  FunctionLibraryRuntime* flr2_;
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionDefLibrary fdef_lib_;
+};
+
+TEST_F(FunctionLibraryRuntimeTest, DefaultThreadpool) {
+  using test::function::blocking_op_state;
+  using test::function::BlockingOpState;
+
+  thread::ThreadPool* tp = new thread::ThreadPool(Env::Default(), "FLRTest", 1);
+  Init({test::function::BlockingOpFn(), test::function::XTimesTwo()}, tp);
+
+  auto x = test::AsScalar<float>(1.3);
+  Tensor y;
+  blocking_op_state = new BlockingOpState();
+
+  thread::ThreadPool* tp1 = new thread::ThreadPool(Env::Default(), "tp1", 5);
+  bool finished_running = false;
+  tp1->Schedule([&x, &y, &finished_running, this]() {
+    TF_CHECK_OK(InstantiateAndRun(flr0_, "BlockingOpFn", {}, {x}, {&y},
+                                  false /* add_runner */));
+    finished_running = true;
+  });
+
+  // InstantiateAndRun shouldn't finish because BlockingOpFn should be blocked.
+  EXPECT_FALSE(finished_running);
+
+  FunctionLibraryRuntime::Handle h;
+  TF_CHECK_OK(Instantiate(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, &h));
+
+  auto x1 = test::AsTensor<float>({1, 2, 3, 4});
+  std::atomic<int32> num_done(0);
+  FunctionLibraryRuntime::Options opts;
+  for (int i = 0; i < 4; ++i) {
+    tp1->Schedule([&h, &x1, &opts, &num_done, this]() {
+      Tensor y1;
+      TF_CHECK_OK(Run(flr0_, h, opts, {x1}, {&y1}, false /* add_runner */));
+      num_done.fetch_add(1);
+    });
+  }
+  // All the 4 Run() calls should be blocked because the runner is occupied.
+  EXPECT_EQ(0, num_done.load());
+
+  blocking_op_state->AwaitState(1);
+  blocking_op_state->MoveToState(1, 2);
+  // Now the runner should be unblocked and all the other Run() calls should
+  // proceed.
+  blocking_op_state->AwaitState(3);
+  blocking_op_state->MoveToState(3, 0);
+  delete tp1;
+  EXPECT_TRUE(finished_running);
+  EXPECT_EQ(4, num_done.load());
+
+  delete blocking_op_state;
+  blocking_op_state = nullptr;
+  delete tp;
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index c2c0b020c7409e7be168d42e83579a2ff3c29a60..ad142e9982a9ed255f6551526b6e7c0e6300d708 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 // A GPU memory allocator that implements a 'best-fit with coalescing'
@@ -52,7 +50,7 @@ class GPUBFCAllocator : public BFCAllocator {
 class GPUMemAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(perftools::gputools::StreamExecutor* stream_exec)
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec)
       : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
@@ -68,13 +66,13 @@ class GPUMemAllocator : public SubAllocator {
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
-      gpu::DeviceMemoryBase gpu_ptr(ptr);
+      se::DeviceMemoryBase gpu_ptr(ptr);
       stream_exec_->Deallocate(&gpu_ptr);
     }
   }
 
  private:
-  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null
+  se::StreamExecutor* stream_exec_;  // not owned, non-null
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 08961fc1055b67902e85887f69158ece5dd68e76..934a57a5fb92eda9617bde7d610c69afbb3409bd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -38,7 +38,7 @@ GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; }
 void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 #ifdef GOOGLE_CUDA
   // allocate with cudaMalloc
-  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
+  se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
   CUresult res = cuMemAlloc(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 208697361d2dfc4f3b8290ea511d15c9bd86857b..5043fac79741e1db8db4de255e07c153bf14b98f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -44,7 +44,7 @@ class GPUcudaMallocAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUcudaMallocAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 63ed0b8be16ecb187113311db5283c8d4f3b1a5e..e4c834b30d1eb251bfe2eb5d02441da9e6e5ec46 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -40,9 +40,8 @@ int64* NewMask(int64 word) {
 int64* before_mask = NewMask(0xabababababababab);
 int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
 
-bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
-               int64* mask) {
-  gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
+bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
+  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
   if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
@@ -62,9 +61,8 @@ bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
   return ok;
 }
 
-void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
-              int64* mask) {
-  gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
+void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
+  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
     LOG(FATAL) << "Could not copy debug mask";
   }
@@ -85,8 +83,8 @@ GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
 
 void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   num_bytes += (2 * MASK_BYTES);
-
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Return the pointer after the header
   void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;
@@ -102,11 +100,13 @@ void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return rv;
 }
 void GPUDebugAllocator::DeallocateRaw(void* ptr) {
-  CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
-  CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
+  if (ptr != nullptr) {
+    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
+    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
 
-  // Backtrack to the beginning of the header.
-  ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+    // Backtrack to the beginning of the header.
+    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+  }
   // Deallocate the memory
   base_allocator_->DeallocateRaw(ptr);
 }
@@ -168,12 +168,14 @@ GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
 
 void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Initialize the buffer to Nans
   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
-  gpu::DeviceMemory<float> nan_ptr{
-      gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
+  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                          std::nanf(""));
+  se::DeviceMemory<float> nan_ptr{
+      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
   if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
     LOG(ERROR) << "Could not initialize to NaNs";
@@ -182,13 +184,16 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return allocated_ptr;
 }
 void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
-  // Reset the buffer to Nans
-  size_t req_size = base_allocator_->RequestedSize(ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
-  gpu::DeviceMemory<float> nan_ptr{
-      gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  if (ptr != nullptr) {
+    // Reset the buffer to Nans
+    size_t req_size = base_allocator_->RequestedSize(ptr);
+    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                            std::nanf(""));
+    se::DeviceMemory<float> nan_ptr{
+        se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
+      LOG(ERROR) << "Could not initialize to NaNs";
+    }
   }
 
   // Deallocate the memory
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index adce3a84368ced958002443721016778cb6df028..c49ec2a5662c0b803ac87daa8e8cb01a5ce1ea59 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -55,7 +55,7 @@ class GPUDebugAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUDebugAllocator);
 };
@@ -81,7 +81,7 @@ class GPUNanResetAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUNanResetAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index d34f0cb3c28af5d2720c61cc7c5016622b1c0876..236a0afa0b0b0902ac817da044e2df377efdad0b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -43,7 +43,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
     std::vector<int64> cpu_array(s);
     memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
     int64* gpu_array = a.Allocate<int64>(cpu_array.size());
-    gpu::DeviceMemory<int64> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+    se::DeviceMemory<int64> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
     ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0],
                                                s * sizeof(int64)));
     EXPECT_TRUE(a.CheckHeader(gpu_array));
@@ -68,13 +68,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
           int64* gpu_array = a.Allocate<int64>(cpu_array.size());
 
-          gpu::DeviceMemory<int64> gpu_array_ptr{
-              gpu::DeviceMemoryBase{gpu_array}};
+          se::DeviceMemory<int64> gpu_array_ptr{
+              se::DeviceMemoryBase{gpu_array}};
           ASSERT_TRUE(stream_exec->SynchronousMemcpy(
               &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64)));
 
-          gpu::DeviceMemory<int64> gpu_hdr_ptr{
-              gpu::DeviceMemoryBase{gpu_array - 1}};
+          se::DeviceMemory<int64> gpu_hdr_ptr{
+              se::DeviceMemoryBase{gpu_array - 1}};
           // Clobber first word of the header.
           float pi = 3.1417;
           ASSERT_TRUE(
@@ -101,14 +101,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
           int64* gpu_array = a.Allocate<int64>(cpu_array.size());
 
-          gpu::DeviceMemory<int64> gpu_array_ptr{
-              gpu::DeviceMemoryBase{gpu_array}};
+          se::DeviceMemory<int64> gpu_array_ptr{
+              se::DeviceMemoryBase{gpu_array}};
           ASSERT_TRUE(stream_exec->SynchronousMemcpy(
               &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64)));
 
           // Clobber word of the footer.
-          gpu::DeviceMemory<int64> gpu_ftr_ptr{
-              gpu::DeviceMemoryBase{gpu_array + s}};
+          se::DeviceMemory<int64> gpu_ftr_ptr{
+              se::DeviceMemoryBase{gpu_array + s}};
           float pi = 3.1417;
           ASSERT_TRUE(
               stream_exec->SynchronousMemcpy(&gpu_ftr_ptr, &pi, sizeof(float)));
@@ -131,7 +131,7 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 
   // Allocate 1024 floats
   float* gpu_array = a.Allocate<float>(cpu_array.size());
-  gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+  se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
   ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
                                              cpu_array.size() * sizeof(float)));
   for (float f : cpu_array) {
@@ -174,7 +174,7 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 
   // Allocate 1024 floats
   float* gpu_array = a.Allocate<float>(cpu_array.size());
-  gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+  se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
   ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
                                              cpu_array.size() * sizeof(float)));
   for (float f : cpu_array) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 15ff15fd5ab28605c4ab0904e62305edc3815adb..9b434e5e2fdb94d070d655b93f3132074a6d0d72 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -200,27 +200,27 @@ class BaseGPUDevice::StreamGroupFactory {
   // This function is thread safe.
   BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                           int stream_group_within_gpu,
-                                          gpu::StreamExecutor* executor) {
+                                          se::StreamExecutor* executor) {
     mutex_lock guard(lock_);
     StreamGroup* group =
         &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
     if (!group->compute) {
-      group->compute = new gpu::Stream(executor);
+      group->compute = new se::Stream(executor);
       group->compute->Init();
       VLOG(2) << "Created stream[" << stream_group_within_gpu
               << "] = " << group->compute;
 
-      group->host_to_device = new gpu::Stream(executor);
+      group->host_to_device = new se::Stream(executor);
       group->host_to_device->Init();
       VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
               << "] = " << group->host_to_device;
 
-      group->device_to_host = new gpu::Stream(executor);
+      group->device_to_host = new se::Stream(executor);
       group->device_to_host->Init();
       VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
               << "] = " << group->device_to_host;
 
-      group->device_to_device = new gpu::Stream(executor);
+      group->device_to_device = new se::Stream(executor);
       group->device_to_device->Init();
       VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
               << "] = " << group->device_to_host;
@@ -257,6 +257,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                                                          physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
       tf_gpu_id_(tf_gpu_id),
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
@@ -265,6 +266,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
 
 BaseGPUDevice::~BaseGPUDevice() {
   delete gpu_device_info_;
+  for (auto sb : scratch_) gpu_allocator_->DeallocateRaw(sb);
   for (auto ctx : device_contexts_) ctx->Unref();
 }
 
@@ -296,9 +298,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
     }
     scratch_.push_back(static_cast<char*>(scratch_buffer));
 
-    perftools::gputools::DeviceMemory<char> mem(
-        perftools::gputools::DeviceMemoryBase(scratch_buffer,
-                                              scratch_buffer_size));
+    se::DeviceMemory<char> mem(
+        se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
 
     bool ok = executor_->SynchronousMemZero(
         &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
@@ -405,12 +406,8 @@ Status BaseGPUDevice::FillContextMap(const Graph* graph,
 }
 
 void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
-  // ScopedActivity is cheap when tracing is not active, but we
-  // can avoid computing the Hash64.
-  // TODO(pbar) This would no longer be needed if Ops have a unique id.
-  const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0;
-  port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
-                                       id);
+  tracing::ScopedRegion region(tracing::EventCategory::kCompute,
+                               op_kernel->name());
 
   // NOTE(tucker): We need to discriminate between Eigen GPU
   // operations and all others.  If an operation is Eigen
@@ -424,11 +421,9 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
     context->SetStatus(errors::Internal(
         "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
-  } else if (port::Tracing::ScopedAnnotation::Enabled()) {
-    port::Tracing::ScopedAnnotation annotation(op_kernel->name(),
-                                               op_kernel->type_string());
-    ComputeHelper(op_kernel, context);
   } else {
+    tracing::ScopedAnnotation annotation(op_kernel->name(),
+                                         op_kernel->type_string());
     ComputeHelper(op_kernel, context);
   }
 }
@@ -440,7 +435,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
     gpu_device_context =
         static_cast<GPUDeviceContext*>(context->op_device_context());
   }
-  gpu::Stream* stream = gpu_device_context->stream();
+  se::Stream* stream = gpu_device_context->stream();
   const auto stream_id = gpu_device_context->stream_id();
 
   const bool vlog_1 = VLOG_IS_ON(1);
@@ -484,7 +479,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
     }
   }
-  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
+  se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->Compute(context);
   if (context->status().ok()) {
     if (sync_every_op_) {
@@ -503,7 +498,7 @@ void BaseGPUDevice::ConsumeListOfAccessedTensors(
   if (device_context != nullptr) {
     gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
   }
-  gpu::Stream* stream = gpu_device_context->stream();
+  se::Stream* stream = gpu_device_context->stream();
   em_->ThenDeleteTensors(stream, tensor_refs);
 }
 
@@ -519,19 +514,18 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
     gpu_device_context =
         static_cast<GPUDeviceContext*>(context->op_device_context());
   }
-  gpu::Stream* stream = gpu_device_context->stream();
+  se::Stream* stream = gpu_device_context->stream();
   const auto stream_id = gpu_device_context->stream_id();
 
   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
           << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
           << stream_id << "]";
 
-  // When TraceMe profiling is off (which is the default), the
-  // following TraceMe constructor is simply a conditional test of
-  // false value. Measurements show that its overhead is negligible.
-  port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
-                                  op_kernel->IsExpensive());
-  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
+  // When Xprof profiling is off (which is the default), constructing the
+  // activity is simple enough that its overhead is negligible.
+  tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
+                                   op_kernel->IsExpensive());
+  se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->ComputeAsync(context, done);
 }
 
@@ -572,7 +566,7 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
         },
         std::move(done), std::placeholders::_1);
 
-    port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
+    tracing::ScopedAnnotation annotation("MakeTensorFromProto");
     device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
                                                std::move(wrapped_done));
     return Status::OK();
@@ -665,7 +659,7 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 Status ParseVisibleDeviceList(const string& visible_device_list,
                               std::vector<CudaGpuId>* visible_gpu_order) {
   visible_gpu_order->clear();
-  gpu::Platform* gpu_manager = GPUMachineManager();
+  se::Platform* gpu_manager = GPUMachineManager();
 
   // If the user wants to remap the visible to virtual GPU mapping,
   // check for that here.
@@ -784,7 +778,7 @@ Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
                                       int64* memory_limit) {
   int64 total_memory = 0;
   int64 available_memory = 0;
-  gpu::StreamExecutor* se =
+  se::StreamExecutor* se =
       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
   if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
     return errors::Unknown("Failed to query available memory for GPU ",
@@ -840,6 +834,17 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   }
 }
 
+Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
+                                             int64 step_id) {
+  if (attr.scope_id > 0) {
+    return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
+        attr.scope_id);
+  }
+  LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator "
+             << "attr.scope_id = " << attr.scope_id;
+  return gpu_allocator_;
+}
+
 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
 
@@ -847,7 +852,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                            const string& name_prefix,
                                            std::vector<Device*>* devices) {
   TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
-  gpu::Platform* gpu_manager = GPUMachineManager();
+  se::Platform* gpu_manager = GPUMachineManager();
   if (gpu_manager == nullptr) {
     return Status::OK();
   }
@@ -986,7 +991,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
 }
 
 static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
-                                        const gpu::DeviceDescription& desc) {
+                                        const se::DeviceDescription& desc) {
   int cc_major;
   int cc_minor;
   if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
@@ -1013,21 +1018,34 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
   CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
   int numa_node = dev_locality.numa_node();
-  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
 
-  gpu::StreamExecutor* se =
+  se::StreamExecutor* se =
       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
-  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
-  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
-            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
-            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
+  const se::DeviceDescription& desc = se->GetDeviceDescription();
   ProcessState* process_state = ProcessState::singleton();
+  Allocator* gpu_allocator = process_state->GetGPUAllocator(
+      options.config.gpu_options(), tf_gpu_id, memory_limit);
+  if (gpu_allocator == nullptr) {
+    return errors::Internal("Failed to get memory allocator for TF GPU ",
+                            tf_gpu_id.value(), " with ", memory_limit,
+                            " bytes of memory.");
+  }
+  AllocatorStats stats;
+  gpu_allocator->GetStats(&stats);
+  // 'memory_limit' is the required memory size, but if the allocator with given
+  // tf_gpu_id was created before, we'll use it instead of creating a new one
+  // (as TF gpu device is a shared resource), in which case the actual memory
+  // limit represented by 'stats.bytes_limit' used by that allocator may be
+  // different (which should be an error).
+  //
+  // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
   BaseGPUDevice* gpu_device = CreateGPUDevice(
-      options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
-      GetShortDeviceDescription(cuda_gpu_id, desc),
-      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
-                                     memory_limit),
+      options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
+      tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator,
       process_state->GetCPUAllocator(numa_node));
+  LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
+            << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
+            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(gpu_device);
 
@@ -1036,15 +1054,15 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
 
 namespace {
 std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
-GetPeerAccessMap(gpu::Platform* platform,
+GetPeerAccessMap(se::Platform* platform,
                  const std::vector<CudaGpuId>& visible_gpu_order) {
   std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
       new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
   for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
     for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
-      gpu::StreamExecutor* from =
+      se::StreamExecutor* from =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
-      gpu::StreamExecutor* to =
+      se::StreamExecutor* to =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
       (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
     }
@@ -1056,7 +1074,7 @@ GetPeerAccessMap(gpu::Platform* platform,
 }  // namespace
 
 Status BaseGPUDeviceFactory::GetInterconnectMaps(
-    const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager,
+    const std::vector<CudaGpuId>& visible_gpu_order, se::Platform* gpu_manager,
     std::vector<InterconnectMap>* maps) {
   // The default interconnect map is obtained from the StreamExecutor.
   auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
@@ -1087,9 +1105,9 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
     // virtualized in some environments, we can't just use the GPU id.
     // NUMA locales are indexed from 0, buses are indexed from 1.
-    gpu::StreamExecutor* se =
+    se::StreamExecutor* se =
         GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
-    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    const se::DeviceDescription& desc = se->GetDeviceDescription();
     int numa_node = desc.numa_node();
     if (numa_node < 0) {
       // For some reason the StreamExecutor couldn't get the NUMA
@@ -1145,7 +1163,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
 }
 
 static int GetDefaultMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager,
+    se::Platform* gpu_manager,
     const std::vector<CudaGpuId>& visible_gpu_order) {
   static const int kDefaultMinGPUMultiprocessorCount = 8;
 
@@ -1158,8 +1176,8 @@ static int GetDefaultMinGPUMultiprocessorCount(
       continue;
     }
 
-    gpu::StreamExecutor* se = exec_status.ValueOrDie();
-    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    se::StreamExecutor* se = exec_status.ValueOrDie();
+    const se::DeviceDescription& desc = se->GetDeviceDescription();
     max_count = std::max(max_count, desc.core_count());
   }
 
@@ -1171,7 +1189,7 @@ static int GetDefaultMinGPUMultiprocessorCount(
 }
 
 static int GetMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager,
+    se::Platform* gpu_manager,
     const std::vector<CudaGpuId>& visible_gpu_order) {
   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
 
@@ -1249,7 +1267,7 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
   return cuda_caps;
 }
 
-Status EnablePeerAccess(gpu::Platform* platform,
+Status EnablePeerAccess(se::Platform* platform,
                         const std::vector<CudaGpuId>& visible_gpu_order) {
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
@@ -1258,9 +1276,9 @@ Status EnablePeerAccess(gpu::Platform* platform,
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
       const CudaGpuId cuda_gpu_j = visible_gpu_order[j];
       // We have already validated that ExecutorForDevice() calls return OK.
-      gpu::StreamExecutor* from =
+      se::StreamExecutor* from =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
-      gpu::StreamExecutor* to =
+      se::StreamExecutor* to =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
 
       if (from->CanEnablePeerAccessTo(to)) {
@@ -1294,7 +1312,7 @@ Status EnablePeerAccess(gpu::Platform* platform,
 Status BaseGPUDeviceFactory::GetValidDeviceIds(
     const std::vector<CudaGpuId>& visible_gpu_order,
     std::vector<CudaGpuId>* ids) {
-  gpu::Platform* gpu_manager = GPUMachineManager();
+  se::Platform* gpu_manager = GPUMachineManager();
   bool new_gpu_found = false;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
     const CudaGpuId cuda_gpu_id = visible_gpu_order[i];
@@ -1309,7 +1327,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
 
     auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
     if (!executor.ok()) {
-      return StreamExecutorUtil::ConvertStatus(executor.status());
+      return executor.status();
     }
 
     auto stream_exec = executor.ValueOrDie();
@@ -1364,8 +1382,8 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
                 << exec_status.status().ToString();
       continue;
     }
-    gpu::StreamExecutor* se = exec_status.ValueOrDie();
-    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    se::StreamExecutor* se = exec_status.ValueOrDie();
+    const se::DeviceDescription& desc = se->GetDeviceDescription();
     CudaVersion device_capability;
     if (!desc.cuda_compute_capability(&device_capability.major_part,
                                       &device_capability.minor_part)) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index c88daa8ff87589a3fc48f4c7693d073d6adf9a5a..b754ffd2db00a2d6c57e7d38692f3629734648c1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -17,8 +17,8 @@ limitations under the License.
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
 
 #include <memory>
 #include <string>
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -68,7 +69,7 @@ class BaseGPUDevice : public LocalDevice {
       const TensorReferenceVector& tensor_refs) override;
 
   Status FillContextMap(const Graph* graph,
-                        DeviceContextMap* device_context_map);
+                        DeviceContextMap* device_context_map) override;
 
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
 
@@ -93,20 +94,28 @@ class BaseGPUDevice : public LocalDevice {
 
   // The executor that provides control for the device; e.g., for CUDA this
   // corresponds to the cuda context.
-  gpu::StreamExecutor* executor() const { return executor_; }
+  se::StreamExecutor* executor() const { return executor_; }
+
+  Allocator* GetScopedAllocator(AllocatorAttributes attr,
+                                int64 step_id) override;
+
+  ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
+    return scoped_allocator_mgr_.get();
+  }
 
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
 
-  gpu::StreamExecutor* executor_;  // not owned
+  se::StreamExecutor* executor_;  // not owned
+  std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
 
  private:
   struct StreamGroup {
-    gpu::Stream* compute = nullptr;
-    gpu::Stream* host_to_device = nullptr;
-    gpu::Stream* device_to_host = nullptr;
-    gpu::Stream* device_to_device = nullptr;
+    se::Stream* compute = nullptr;
+    se::Stream* host_to_device = nullptr;
+    se::Stream* device_to_host = nullptr;
+    se::Stream* device_to_device = nullptr;
   };
   class StreamGroupFactory;
 
@@ -159,7 +168,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // pathways between GPUs.
   virtual Status GetInterconnectMaps(
       const std::vector<CudaGpuId>& visible_gpu_order,
-      gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
+      se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
 
   struct TfGpuIdHash {
     std::size_t operator()(const TfGpuId& id) const noexcept {
@@ -205,4 +214,4 @@ class BaseGPUDeviceFactory : public DeviceFactory {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index b56823204afe8ee52e0ea376b1a79d91d6932fa0..f3935f6ba26c49a9967d0848bfb6d965c73d2fab 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -18,42 +18,48 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
 
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-namespace {
 const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
 
-static SessionOptions MakeSessionOptions(
-    const string& visible_device_list = "",
-    double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
-    const std::vector<std::vector<float>>& memory_limit_mb = {}) {
-  SessionOptions options;
-  ConfigProto* config = &options.config;
-  (*config->mutable_device_count())["GPU"] = gpu_device_count;
-  GPUOptions* gpu_options = config->mutable_gpu_options();
-  gpu_options->set_visible_device_list(visible_device_list);
-  gpu_options->set_per_process_gpu_memory_fraction(
-      per_process_gpu_memory_fraction);
-  for (const auto& v : memory_limit_mb) {
-    auto virtual_devices =
-        gpu_options->mutable_experimental()->add_virtual_devices();
-    for (float mb : v) {
-      virtual_devices->add_memory_limit_mb(mb);
+class GPUDeviceTest : public ::testing::Test {
+ public:
+  void TearDown() { ProcessState::singleton()->TestOnlyReset(); }
+
+ protected:
+  static SessionOptions MakeSessionOptions(
+      const string& visible_device_list = "",
+      double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
+      const std::vector<std::vector<float>>& memory_limit_mb = {}) {
+    SessionOptions options;
+    ConfigProto* config = &options.config;
+    (*config->mutable_device_count())["GPU"] = gpu_device_count;
+    GPUOptions* gpu_options = config->mutable_gpu_options();
+    gpu_options->set_visible_device_list(visible_device_list);
+    gpu_options->set_per_process_gpu_memory_fraction(
+        per_process_gpu_memory_fraction);
+    for (const auto& v : memory_limit_mb) {
+      auto virtual_devices =
+          gpu_options->mutable_experimental()->add_virtual_devices();
+      for (float mb : v) {
+        virtual_devices->add_memory_limit_mb(mb);
+      }
     }
+    return options;
   }
-  return options;
-}
 
-static bool StartsWith(const string& lhs, const string& rhs) {
-  if (rhs.length() > lhs.length()) return false;
-  return lhs.substr(0, rhs.length()) == rhs;
-}
+  static bool StartsWith(const string& lhs, const string& rhs) {
+    if (rhs.length() > lhs.length()) return false;
+    return lhs.substr(0, rhs.length()) == rhs;
+  }
+};
 
-TEST(GPUDeviceTest, FailedToParseVisibleDeviceList) {
+TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,abc");
   std::vector<tensorflow::Device*> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
@@ -63,7 +69,7 @@ TEST(GPUDeviceTest, FailedToParseVisibleDeviceList) {
       << status;
 }
 
-TEST(GPUDeviceTest, InvalidGpuId) {
+TEST_F(GPUDeviceTest, InvalidGpuId) {
   SessionOptions opts = MakeSessionOptions("100");
   std::vector<tensorflow::Device*> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
@@ -74,7 +80,7 @@ TEST(GPUDeviceTest, InvalidGpuId) {
       << status;
 }
 
-TEST(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
+TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,0");
   std::vector<tensorflow::Device*> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
@@ -85,7 +91,7 @@ TEST(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
       << status;
 }
 
-TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
+TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
   SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}});
   std::vector<tensorflow::Device*> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
@@ -96,7 +102,7 @@ TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
       << status;
 }
 
-TEST(GPUDeviceTest, GpuDeviceCountTooSmall) {
+TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) {
   // device_count is 0, but with one entry in visible_device_list and one
   // (empty) VirtualDevices messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}});
@@ -109,7 +115,7 @@ TEST(GPUDeviceTest, GpuDeviceCountTooSmall) {
       << status;
 }
 
-TEST(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
+TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
   // Single entry in visible_device_list with two (empty) VirtualDevices
   // messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}});
@@ -122,7 +128,7 @@ TEST(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
       << status;
 }
 
-TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
+TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
   // This test requires at least two visible GPU hardware.
   if (GPUMachineManager()->VisibleDeviceCount() < 2) return;
   // Three entries in visible_device_list with two (empty) VirtualDevices
@@ -139,7 +145,7 @@ TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
       << status;
 }
 
-TEST(GPUDeviceTest, EmptyVirtualDeviceConfig) {
+TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
   // It'll create single virtual device when the virtual device config is empty.
   SessionOptions opts = MakeSessionOptions("0");
   std::vector<tensorflow::Device*> devices;
@@ -150,7 +156,7 @@ TEST(GPUDeviceTest, EmptyVirtualDeviceConfig) {
   for (auto d : devices) delete d;
 }
 
-TEST(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
   // It'll create single virtual device for the gpu in question when
   // memory_limit_mb is unset.
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{}});
@@ -162,7 +168,7 @@ TEST(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
   for (auto d : devices) delete d;
 }
 
-TEST(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
   std::vector<tensorflow::Device*> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
@@ -172,7 +178,7 @@ TEST(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
   for (auto d : devices) delete d;
 }
 
-TEST(GPUDeviceTest, MultipleVirtualDevices) {
+TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
   std::vector<tensorflow::Device*> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
@@ -195,7 +201,6 @@ TEST(GPUDeviceTest, MultipleVirtualDevices) {
   for (auto d : devices) delete d;
 }
 
-}  // namespace
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 2452efc77952a89dd7b01989f684ac04a8a5ca90..48984484760f86ce2edd9d84d85776e503a019b5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -18,11 +18,9 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
-EventMgr::EventMgr(gpu::StreamExecutor* se, const GPUOptions& gpu_options)
+EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
     : exec_(se),
       deferred_bytes_threshold_(gpu_options.deferred_deletion_bytes()
                                     ? gpu_options.deferred_deletion_bytes()
@@ -30,10 +28,6 @@ EventMgr::EventMgr(gpu::StreamExecutor* se, const GPUOptions& gpu_options)
       polling_active_delay_usecs_(gpu_options.polling_active_delay_usecs()
                                       ? gpu_options.polling_active_delay_usecs()
                                       : 10),
-      polling_inactive_delay_msecs_(
-          gpu_options.polling_inactive_delay_msecs()
-              ? gpu_options.polling_inactive_delay_msecs()
-              : 1),
       accumulated_stream_(nullptr),
       accumulated_tensors_(new TensorReferenceVector),
       accumulated_tensor_bytes_(0),
@@ -78,21 +72,27 @@ EventMgr::~EventMgr() {
 
 void EventMgr::StartPollingLoop() {
   CHECK(polling_stopped_ == nullptr);
-  stop_polling_.reset(new Notification);
+  {
+    mutex_lock l(mu_);
+    stop_polling_ = false;
+  }
   polling_stopped_.reset(new Notification);
   threadpool_.Schedule([this]() { PollLoop(); });
 }
 
 void EventMgr::StopPollingLoop() {
-  if (stop_polling_) {
-    stop_polling_->Notify();
+  if (polling_stopped_) {
+    {
+      mutex_lock l(mu_);
+      stop_polling_ = true;
+      events_pending_.notify_all();
+    }
     polling_stopped_->WaitForNotification();
-    stop_polling_.reset(nullptr);
     polling_stopped_.reset(nullptr);
   }
 }
 
-void EventMgr::ThenDeleteTensors(perftools::gputools::Stream* stream,
+void EventMgr::ThenDeleteTensors(se::Stream* stream,
                                  const TensorReferenceVector& tensors) {
   mutex_lock l(mu_);
   // TODO(jeff): We currently keep one accumulated_tensors_ object.
@@ -121,42 +121,45 @@ void EventMgr::FlushAccumulatedTensors() {
   accumulated_stream_ = nullptr;
 }
 
-// A polling loop to detect completion of GPU events.  There's a
-// tradeoff between achieving low latency detection, which argues for
-// little delay between calls, and minimizing CPU use and lock
-// contention, which argue for longer delay.  The current strategy is
-// to poll frequently when the queue is non-empty, and infrequently
-// otherwise.
+// A polling loop to detect completion of GPU events.
+//
+// While one or more events is outstanding, poll for completed events.  When no
+// events are outstanding, we sleep until one is enqueued.
 void EventMgr::PollLoop() {
-  bool queue_empty = false;
-  while (!stop_polling_->HasBeenNotified()) {
-    if (queue_empty) {
-      mutex_lock l(mu_);
-      WaitForMilliseconds(&l, &events_pending_, polling_inactive_delay_msecs_);
-    } else {
-      Env::Default()->SleepForMicroseconds(polling_active_delay_usecs_);
-    }
-    ToFreeVector to_free;
+  ToFreeVector to_free;
+  while (true) {
+    bool events_still_pending;
     {
       mutex_lock l(mu_);
+      if (stop_polling_) {
+        break;
+      }
+      if (used_events_.empty()) {
+        events_pending_.wait(l);
+      }
       PollEvents(true, &to_free);
-      queue_empty = used_events_.empty();
+      events_still_pending = !used_events_.empty();
     }
     FreeMemory(to_free);
+    to_free.clear();
+
+    if (events_still_pending) {
+      Env::Default()->SleepForMicroseconds(polling_active_delay_usecs_);
+    }
   }
   polling_stopped_->Notify();
 }
 
-void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
+void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
   VLOG(2) << "QueueInUse  free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Events are created on demand, and repeatedly reused.  There is no
   // limit placed here on the number of allocated Events.
   if (free_events_.empty()) {
-    free_events_.push_back(new gpu::Event(exec_));
+    free_events_.push_back(new se::Event(exec_));
     free_events_.back()->Init();
   }
-  gpu::Event* e = free_events_.back();
+  se::Event* e = free_events_.back();
   free_events_.pop_back();
   stream->ThenRecordEvent(e);
   iu.event = e;
@@ -194,18 +197,18 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
   // the first non-complete record that is still pending.
   for (auto& iu : used_events_) {
     if (iu.event == nullptr) continue;
-    gpu::Event::Status s = iu.event->PollForStatus();
+    se::Event::Status s = iu.event->PollForStatus();
     switch (s) {
-      case gpu::Event::Status::kUnknown:
-      case gpu::Event::Status::kError:
+      case se::Event::Status::kUnknown:
+      case se::Event::Status::kError:
         // We don't expect to see these.  Someday maybe propagate
         // a Status error, but for now fail hard.
         LOG(FATAL) << "Unexpected Event status: " << static_cast<int>(s);
         break;
-      case gpu::Event::Status::kPending:
+      case se::Event::Status::kPending:
         if (!is_dedicated_poller) return;  // quit processing queue
         break;
-      case gpu::Event::Status::kComplete:
+      case se::Event::Status::kComplete:
         // Make a copy of the InUse record so we can free it after releasing
         // the lock
         to_free->push_back(iu);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index 9692b24084ab577c3d27ed32248d430fd0d65fa0..b26f88a201c15720aa1ea5e3bbd135296d934f12 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -29,13 +29,11 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Event;
 class Stream;
 class StreamExecutor;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -46,14 +44,13 @@ class GPUOptions;
 // Events are recorded.
 class EventMgr {
  public:
-  EventMgr(perftools::gputools::StreamExecutor* se,
-           const GPUOptions& gpu_options);
+  EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options);
 
   ~EventMgr();
 
   // Releases the references on the elements of "tensors" as soon as
   // all events currently enqueued on "stream" have completed.
-  void ThenDeleteTensors(perftools::gputools::Stream* stream,
+  void ThenDeleteTensors(se::Stream* stream,
                          const TensorReferenceVector& tensors);
 
   struct BufRec {
@@ -67,8 +64,7 @@ class EventMgr {
 
   // Takes ownership of *bufrec.buf and calls bufrec.alloc->DeallocateRaw()
   // on it as soon as all events currently enqueued on *stream have completed.
-  inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
-                               BufRec bufrec) {
+  inline void ThenDeleteBuffer(se::Stream* stream, BufRec bufrec) {
     ToFreeVector to_free;
     {
       mutex_lock l(mu_);
@@ -78,8 +74,7 @@ class EventMgr {
     FreeMemory(to_free);
   }
 
-  inline void ThenExecute(perftools::gputools::Stream* stream,
-                          std::function<void()> func) {
+  inline void ThenExecute(se::Stream* stream, std::function<void()> func) {
     ToFreeVector to_free;
     {
       mutex_lock l(mu_);
@@ -91,17 +86,16 @@ class EventMgr {
 
  private:
   friend class TEST_EventMgrHelper;
-  perftools::gputools::StreamExecutor* const exec_;
+  se::StreamExecutor* const exec_;
   const int64 deferred_bytes_threshold_;
   const int32 polling_active_delay_usecs_;
-  const int32 polling_inactive_delay_msecs_;
   mutex mu_;
   condition_variable events_pending_ GUARDED_BY(mu_);
 
   void FlushAccumulatedTensors() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   struct InUse {
-    perftools::gputools::Event* event;
+    se::Event* event;
     TensorReferenceVector* mem;
     BufRec bufrec;
     std::function<void()> func;
@@ -133,22 +127,21 @@ class EventMgr {
   // Stream-enqueue an unused Event and save with it a collection of
   // Tensors and/or a BufRec to be deleted only after the Event
   // records.
-  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
+  void QueueInUse(se::Stream* stream, InUse in_use)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  void QueueTensors(perftools::gputools::Stream* stream,
-                    TensorReferenceVector* tensors)
+  void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
   }
 
-  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
+  void QueueBuffer(se::Stream* stream, BufRec bufrec)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
   }
 
-  void QueueFunc(perftools::gputools::Stream* stream,
-                 std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+  void QueueFunc(se::Stream* stream, std::function<void()> func)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, nullptr, BufRec(), std::move(func)});
   }
 
@@ -169,10 +162,10 @@ class EventMgr {
   void StopPollingLoop();
 
   // A stack of unused events
-  std::vector<perftools::gputools::Event*> free_events_ GUARDED_BY(mu_);
+  std::vector<se::Event*> free_events_ GUARDED_BY(mu_);
 
   // Buffered list of tensors waiting to have an event queued for deletion
-  perftools::gputools::Stream* accumulated_stream_ GUARDED_BY(mu_);
+  se::Stream* accumulated_stream_ GUARDED_BY(mu_);
   TensorReferenceVector* accumulated_tensors_ GUARDED_BY(mu_);
   // Sum of the TotalBytes() of the tensors in "accumulated_tensors_"
   int64 accumulated_tensor_bytes_ GUARDED_BY(mu_);
@@ -180,7 +173,7 @@ class EventMgr {
   // A FIFO queue of InUse events and associated tensors.
   std::deque<InUse> used_events_ GUARDED_BY(mu_);
 
-  std::unique_ptr<Notification> stop_polling_;
+  bool stop_polling_ GUARDED_BY(mu_);
   std::unique_ptr<Notification> polling_stopped_;
 
   // The main PollLoop for the event manager runs in this threadpool.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index 3ad0b0eb85feaa7dc11bc0836b1186d857e7ce48..1d4ad957b94bfcc0381fd6250c08ac8d34f90ff6 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 class TEST_EventMgrHelper {
@@ -47,8 +45,7 @@ class TEST_EventMgrHelper {
     return em_->free_events_.size();
   }
 
-  void QueueTensors(perftools::gputools::Stream* stream,
-                    TensorReferenceVector* tensors) {
+  void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors) {
     mutex_lock l(em_->mu_);
     em_->QueueTensors(stream, tensors);
   }
@@ -121,7 +118,7 @@ TEST(EventMgr, DelayedPolling) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   TensorReferenceVector* v = nullptr;
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -153,7 +150,7 @@ TEST(EventMgr, FlushLargeTensorImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -170,7 +167,7 @@ TEST(EventMgr, ManySmallTensorsFlushedImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -189,8 +186,8 @@ TEST(EventMgr, StreamSwitchingFlushesImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream1(new gpu::Stream(stream_exec));
-  std::unique_ptr<gpu::Stream> stream2(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream1(new se::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream2(new se::Stream(stream_exec));
   stream1->Init();
   stream2->Init();
   TensorReferenceVector v1;
@@ -211,7 +208,7 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -234,7 +231,7 @@ TEST(EventMgr, NonEmptyShutdown) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(0, th.free_size());
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
index 207afdca75642b14c1617c8abae4fd5e9916f020..7dfff3269cf91582adf783dcd15dd55d1c4e1451 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
@@ -18,7 +18,10 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
@@ -27,8 +30,8 @@ namespace {
 class TfToCudaGpuIdMap {
  public:
   static TfToCudaGpuIdMap* singleton() {
-    static auto* manager = new TfToCudaGpuIdMap;
-    return manager;
+    static auto* id_map = new TfToCudaGpuIdMap;
+    return id_map;
   }
 
   void InsertOrDie(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id)
@@ -47,18 +50,41 @@ class TfToCudaGpuIdMap {
     }
   }
 
-  int32 FindOrDie(TfGpuId tf_gpu_id) const LOCKS_EXCLUDED(mu_) {
+  CudaGpuId FindOrDie(TfGpuId tf_gpu_id) const LOCKS_EXCLUDED(mu_) {
     mutex_lock lock(mu_);
+    return FindOrDieLocked(tf_gpu_id);
+  }
+
+  bool Find(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) const
+      LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    if (id_map_.count(tf_gpu_id.value()) == 0) return false;
+    *cuda_gpu_id = FindOrDieLocked(tf_gpu_id);
+    return true;
+  }
+
+ private:
+  TfToCudaGpuIdMap() = default;
+
+  CudaGpuId FindOrDieLocked(TfGpuId tf_gpu_id) const
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     auto result = id_map_.find(tf_gpu_id.value());
     CHECK(result != id_map_.end())
         << "Could not find the mapping for TfGpuId: " << tf_gpu_id;
-    return result->second;
+    return CudaGpuId(result->second);
+  }
+
+  void TestOnlyReset() LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    id_map_.clear();
   }
 
- private:
   using IdMapType = std::unordered_map<int32, int32>;
   mutable mutex mu_;
   IdMapType id_map_ GUARDED_BY(mu_);
+
+  friend class ::tensorflow::GpuIdManager;
+  TF_DISALLOW_COPY_AND_ASSIGN(TfToCudaGpuIdMap);
 };
 }  // namespace
 
@@ -67,8 +93,20 @@ void GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
   TfToCudaGpuIdMap::singleton()->InsertOrDie(tf_gpu_id, cuda_gpu_id);
 }
 
+Status GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) {
+  if (TfToCudaGpuIdMap::singleton()->Find(tf_gpu_id, cuda_gpu_id)) {
+    return Status::OK();
+  }
+  return errors::NotFound("TF GPU device with id ", tf_gpu_id.value(),
+                          " was not registered");
+}
+
 CudaGpuId GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id) {
-  return CudaGpuId(TfToCudaGpuIdMap::singleton()->FindOrDie(tf_gpu_id));
+  return TfToCudaGpuIdMap::singleton()->FindOrDie(tf_gpu_id);
+}
+
+void GpuIdManager::TestOnlyReset() {
+  TfToCudaGpuIdMap::singleton()->TestOnlyReset();
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
index 33925d8c36f44a9d2c7abc8f2801f3f203bcb982..2b54cc184ca508b94e2a715642cdb13fe8a4c3e1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
@@ -17,15 +17,25 @@ limitations under the License.
 #define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
-// Class that manages the translation between Tensorflow GPU ids and CUDA GPU
-// ids.
+// Class that maintains a map from TfGpuId to CudaGpuId, and manages the
+// translation between them.
 class GpuIdManager {
  public:
+  // Adds a mapping from tf_gpu_id to cuda_gpu_id.
   static void InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
+
+  // Gets the cuda_gpu_id associated with tf_gpu_id. Returns OK if found.
+  static Status TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id);
+  // Similar to the above version, but returns the result, and checks fail if
+  // no result is found.
   static CudaGpuId TfToCudaGpuId(TfGpuId tf_gpu_id);
+
+  // Clears the map. Used in unit tests only.
+  static void TestOnlyReset();
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 2e90687fe8854460dc2ec683d8587ab2ceadf42e..42bf074e630363ba20b360c270cd13367c13f254 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -23,22 +23,21 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
-namespace gpu = ::perftools::gputools;
 
 // Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
 class GpuIdUtil {
  public:
   // Convenient methods for getting the associated executor given a TfGpuId or
   // CudaGpuId.
-  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
-      gpu::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId(
+      se::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
     return gpu_manager->ExecutorForDevice(cuda_gpu_id.value());
   }
-  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId(
       CudaGpuId cuda_gpu_id) {
     return ExecutorForCudaGpuId(GPUMachineManager(), cuda_gpu_id);
   }
-  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForTfGpuId(
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForTfGpuId(
       TfGpuId tf_gpu_id) {
     return ExecutorForCudaGpuId(GpuIdManager::TfToCudaGpuId(tf_gpu_id));
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index aa23e3cc6144481f43089f20aa86b8e9aab3cc07..e0ec93a98e2e4f3ba566dd591a84b2c088cbeea0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -26,21 +26,14 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 Status ValidateGPUMachineManager() {
-  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
-  if (!result.ok()) {
-    return StreamExecutorUtil::ConvertStatus(result.status());
-  }
-
-  return Status::OK();
+  return se::MultiPlatformManager::PlatformWithName("CUDA").status();
 }
 
-gpu::Platform* GPUMachineManager() {
-  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+se::Platform* GPUMachineManager() {
+  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
   if (!result.ok()) {
     LOG(FATAL) << "Could not find Platform with name CUDA";
     return nullptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index 927d05d5ba919509d2718fd13bad621a63a495bb..bfd7a77f8339256c313daf2aa6aa48ce1587698f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -18,11 +18,9 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Platform;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -34,7 +32,7 @@ Status ValidateGPUMachineManager();
 // initializing the GPUs on the machine if needed the first time it is
 // called.  Must only be called when there is a valid GPU environment
 // in the process (e.g., ValidateGPUMachineManager() returns OK).
-perftools::gputools::Platform* GPUMachineManager();
+stream_executor::Platform* GPUMachineManager();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index a0f5877d62f0c889c2a598b8e03771e4bb49e0a9..d38413d79c9cf964909c9cd1ea65b1d3b63fe12f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -55,17 +55,15 @@ limitations under the License.
 const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
 extern bool FLAGS_brain_gpu_record_mem_types;
 
-using perftools::gputools::DeviceMemoryBase;
-using perftools::gputools::Stream;
-
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
+using se::DeviceMemoryBase;
+using se::Stream;
 
 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                    const Tensor* dst,
                    const DeviceBase::GpuDeviceInfo** dev_info,
-                   gpu::Stream** stream) {
+                   se::Stream** stream) {
   if (device == nullptr) {
     return errors::Internal("Unexpected null device.");
   }
@@ -120,7 +118,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                               StatusCallback done) {
   VLOG(1) << "SetProtoFromGPU device_context " << device_context;
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info,
                          &send_stream);
   if (!s.ok()) {
@@ -151,7 +149,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
   char* buf = nullptr;
   const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
   if (total_bytes > 0) {
-    port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
+    tracing::ScopedAnnotation annotation("SetProtoFromGPU");
     alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
     buf = alloc->Allocate<char>(total_bytes);
     if (LogMemory::IsEnabled()) {
@@ -195,7 +193,7 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
                                  const Tensor* input, Tensor* output,
                                  StatusCallback done) {
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
                          &send_stream);
   if (!s.ok()) {
@@ -262,7 +260,7 @@ void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                  StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToCPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
                          &dev_info, &send_stream);
   if (!s.ok()) {
@@ -307,7 +305,7 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  StatusCallback done) {
   VLOG(1) << "CopyCPUTensorToGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* recv_stream = nullptr;
+  se::Stream* recv_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
                          &dev_info, &recv_stream);
   if (!s.ok()) {
@@ -430,7 +428,7 @@ void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
                                      StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToSameGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
                          dst_gpu_tensor, &dev_info, &send_stream);
   if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index d99a0b1f611abcf4bd3f56ea0cb6d6df746b08cc..237b0044dafa832ff667ce8d48f0ef4ce5f2cc70 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -27,8 +27,6 @@ namespace tensorflow {
 class RecvTensorResponse;
 class TensorProto;
 
-namespace gpu = ::perftools::gputools;
-
 class GPUUtil {
  public:
   // "tensor" is GPU-local.  "dev" is the hosting GPU.
@@ -72,10 +70,9 @@ class GPUUtil {
   // NOTE: will be removed soon, see StreamExecutorUtil::AsDeviceMemory
   // instead.
   template <typename T>
-  static perftools::gputools::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
+  static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<void*>(DMAHelper::base(&t)));
-    return perftools::gputools::DeviceMemory<T>(
-        perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
+    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
   }
 
   // Computes a checksum over the contents of "tensor", which is allocated
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 91ce830df8521e7fe8284dd3c52d1bbf667891cd..310158aba1b3240c887343dedbea9f285df6b441 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -181,7 +181,7 @@ class BasicCPUAllocator : public SubAllocator {
 class CUDAHostAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
+  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
       : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
@@ -206,7 +206,7 @@ class CUDAHostAllocator : public SubAllocator {
   }
 
  private:
-  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null
+  se::StreamExecutor* stream_exec_;  // not owned, non-null
 
   TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 85555955e37f95b9652241dc6776f349014e5f0a..a4c8d5fe86c23a9c62321602d73fe84155e6a89a 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -20,18 +20,16 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 namespace {
 
 TEST(PoolAllocatorTest, ZeroSizeBuffers) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -44,12 +42,12 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
 }
 
 TEST(PoolAllocatorTest, ZeroSizePool) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -77,12 +75,12 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
 }
 
 TEST(PoolAllocatorTest, Alignment) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
@@ -123,12 +121,12 @@ TEST(PoolAllocatorTest, AutoResize) {
 }
 
 TEST(PoolAllocatorTest, CudaHostAllocator) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -200,12 +198,12 @@ TEST(PoolAllocatorTest, Pow2Rounder) {
 }
 
 TEST(PoolAllocatorTest, Name) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 61013bd1acd254b6e927a8d41accaeda424d6ebc..5ed01278c137576661ed90073d1e968cade61696 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -145,7 +146,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options,
 
     // If there are any pending AllocVisitors for this bus, add
     // them now.
-    gpu::StreamExecutor* se =
+    se::StreamExecutor* se =
         GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
     int bus_id = se->GetDeviceDescription().numa_node();
     if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
@@ -256,7 +257,7 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // better source of information about which executor to use.  For
   // example, process_state could maybe save the first stream executor
   // it knows is valid.
-  gpu::StreamExecutor* se = nullptr;
+  se::StreamExecutor* se = nullptr;
   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
     if (gpu_allocators_[i] != nullptr) {
       se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
@@ -304,7 +305,7 @@ void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
 #if GOOGLE_CUDA
   mutex_lock lock(mu_);
   for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
-    gpu::StreamExecutor* se =
+    se::StreamExecutor* se =
         GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
     if (gpu_allocators_[i] &&
         (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
@@ -318,4 +319,17 @@ void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
 #endif  // GOOGLE_CUDA
 }
 
+void ProcessState::TestOnlyReset() {
+  mutex_lock lock(mu_);
+  gpu_device_enabled_ = false;
+  gpu_visitors_.clear();
+  mem_desc_map_.clear();
+  gtl::STLDeleteElements(&cpu_allocators_);
+  gtl::STLDeleteElements(&gpu_allocators_);
+  gtl::STLDeleteElements(&cuda_host_allocators_);
+  gtl::STLDeleteElements(&cpu_al_);
+  gtl::STLDeleteElements(&gpu_al_);
+  gtl::STLDeleteElements(&cuda_al_);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/process_state.h b/tensorflow/core/common_runtime/gpu/process_state.h
index f6e234967306476542cec3038ea2e271cca2dc8c..bc2c4182d72334e26d387397e564dbf02cfa3ae4 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.h
+++ b/tensorflow/core/common_runtime/gpu/process_state.h
@@ -114,6 +114,10 @@ class ProcessState {
  protected:
   ProcessState();
 
+  // Helper method for unit tests to reset the ProcessState singleton by
+  // cleaning up everything. Never use in production.
+  virtual void TestOnlyReset();
+
   static ProcessState* instance_;
   bool gpu_device_enabled_;
 
@@ -132,6 +136,8 @@ class ProcessState {
   std::vector<Allocator*> cpu_al_ GUARDED_BY(mu_);
   std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
   std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+
+  friend class GPUDeviceTest;
 };
 
 namespace internal {
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 8b1430f02193b986bef0ca36685433277acf09aa..c92c5d1af36e4c84e1917a6fc8847a1b6d4aa56d 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -19,23 +19,19 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/device_base.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
-
 class GPUDeviceContext : public DeviceContext {
  public:
   // Does not take ownership of streams.
-  GPUDeviceContext(int stream_id, gpu::Stream* stream,
-                   gpu::Stream* host_to_device_stream,
-                   gpu::Stream* device_to_host_stream,
-                   gpu::Stream* device_to_device_stream)
+  GPUDeviceContext(int stream_id, se::Stream* stream,
+                   se::Stream* host_to_device_stream,
+                   se::Stream* device_to_host_stream,
+                   se::Stream* device_to_device_stream)
       : stream_id_(stream_id),
         stream_(stream),
         host_to_device_stream_(host_to_device_stream),
@@ -44,10 +40,10 @@ class GPUDeviceContext : public DeviceContext {
 
   ~GPUDeviceContext() override {}
 
-  gpu::Stream* stream() const override { return stream_; }
-  gpu::Stream* host_to_device_stream() const { return host_to_device_stream_; }
-  gpu::Stream* device_to_host_stream() const { return device_to_host_stream_; }
-  gpu::Stream* device_to_device_stream() const {
+  se::Stream* stream() const override { return stream_; }
+  se::Stream* host_to_device_stream() const { return host_to_device_stream_; }
+  se::Stream* device_to_host_stream() const { return device_to_host_stream_; }
+  se::Stream* device_to_device_stream() const {
     return device_to_device_stream_;
   }
   int stream_id() const { return stream_id_; }
@@ -60,20 +56,20 @@ class GPUDeviceContext : public DeviceContext {
                              Device* device, Tensor* cpu_tensor,
                              StatusCallback done) override;
 
-  void MaintainLifetimeOnStream(
-      const Tensor* t, perftools::gputools::Stream* stream) const override {}
+  void MaintainLifetimeOnStream(const Tensor* t,
+                                se::Stream* stream) const override {}
 
  private:
   int stream_id_;
   // The default primary stream to use for this context.
   // All the memory belongs to this stream.
-  gpu::Stream* stream_;
+  se::Stream* stream_;
   // The stream to use for copy data from host into GPU.
-  gpu::Stream* host_to_device_stream_;
+  se::Stream* host_to_device_stream_;
   // The stream to use for copy data from GPU to host.
-  gpu::Stream* device_to_host_stream_;
+  se::Stream* device_to_host_stream_;
   // The stream to use for copy data between GPU.
-  gpu::Stream* device_to_device_stream_;
+  se::Stream* device_to_device_stream_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 33a5d60eb7ec4de829d3c0784f909ef42cf994d1..eb710bdbc504f48b8ddd69ac8963657f5aa87a70 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
@@ -73,6 +75,10 @@ GraphExecutionState::~GraphExecutionState() {
 /* static */ Status GraphExecutionState::MakeForBaseGraph(
     GraphDef* graph_def, const GraphExecutionStateOptions& options,
     std::unique_ptr<GraphExecutionState>* out_state) {
+#ifndef __ANDROID__
+  VLOG(4) << "Graph proto is " << graph_def->DebugString();
+#endif  // __ANDROID__
+
   std::unique_ptr<GraphExecutionState> ret(
       new GraphExecutionState(graph_def, options));
 
@@ -233,6 +239,122 @@ void GraphExecutionState::RestoreStatefulNodes(Graph* graph) {
   }
 }
 
+namespace {
+
+class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
+ public:
+  TensorConnectionPruneRewrite(const string* endpoint_name,
+                               NodeBuilder::NodeOut from_tensor)
+      : subgraph::PruneRewrite(endpoint_name, nullptr /* device_info */),
+        from_tensor_(std::move(from_tensor)) {}
+
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override {
+    Status s;
+    auto check_no_cycle_fn = [this, feed_tensor, &s](Node* n) {
+      if (n == feed_tensor.node) {
+        s.Update(errors::InvalidArgument(
+            "Requested Tensor connection between nodes \"",
+            feed_tensor.node->name(), "\" and \"", from_tensor_.node->name(),
+            "\" would create a cycle."));
+      }
+    };
+    ReverseDFSFrom(*g, {from_tensor_.node}, std::move(check_no_cycle_fn),
+                   nullptr);
+    TF_RETURN_IF_ERROR(s);
+
+    TF_RETURN_IF_ERROR(
+        NodeBuilder(strings::StrCat("_identity_", feed_tensor.node->name(), "_",
+                                    feed_tensor.index),
+                    "Identity")
+            .Input(from_tensor_)
+            .Attr("T",
+                  BaseType(from_tensor_.node->output_type(from_tensor_.index)))
+            .Finalize(g, out_node));
+
+    (*out_node)->set_assigned_device_name(
+        feed_tensor.node->assigned_device_name());
+    return Status::OK();
+  }
+
+ private:
+  NodeBuilder::NodeOut from_tensor_;
+};
+
+}  // namespace
+
+Status GraphExecutionState::PruneGraph(
+    const BuildGraphOptions& options, Graph* graph,
+    subgraph::RewriteGraphMetadata* out_rewrite_metadata) {
+  std::vector<std::unique_ptr<subgraph::PruneRewrite>> feed_rewrites;
+  feed_rewrites.reserve(options.callable_options.feed_size());
+  std::vector<std::unique_ptr<subgraph::PruneRewrite>> fetch_rewrites;
+  fetch_rewrites.reserve(options.callable_options.fetch_size());
+  const DeviceAttributes* device_info =
+      &device_set_->client_device()->attributes();
+  if (options.use_function_convention) {
+    for (int i = 0; i < options.callable_options.feed_size(); ++i) {
+      feed_rewrites.emplace_back(new subgraph::ArgFeedRewrite(
+          &options.callable_options.feed(i), device_info, i));
+    }
+    for (int i = 0; i < options.callable_options.fetch_size(); ++i) {
+      fetch_rewrites.emplace_back(new subgraph::RetvalFetchRewrite(
+          &options.callable_options.fetch(i), device_info, i));
+    }
+  } else {
+    for (const string& feed : options.callable_options.feed()) {
+      feed_rewrites.emplace_back(
+          new subgraph::RecvFeedRewrite(&feed, device_info));
+    }
+    for (const string& fetch : options.callable_options.fetch()) {
+      fetch_rewrites.emplace_back(
+          new subgraph::SendFetchRewrite(&fetch, device_info));
+    }
+  }
+
+  for (const TensorConnection& tensor_connection :
+       options.callable_options.tensor_connection()) {
+    Node* from_node = nullptr;
+    TensorId from_id(ParseTensorName(tensor_connection.from_tensor()));
+
+    for (Node* n : graph->nodes()) {
+      if (n->name() == from_id.first) {
+        from_node = n;
+        break;
+      }
+    }
+    if (from_node == nullptr) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown node: \"",
+          tensor_connection.to_tensor(), "\".");
+    }
+    if (from_id.second >= from_node->num_outputs()) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown edge: \"",
+          tensor_connection.to_tensor(),
+          "\" (actual number of outputs = ", from_node->num_outputs(), ").");
+    }
+
+    feed_rewrites.emplace_back(new TensorConnectionPruneRewrite(
+        &tensor_connection.to_tensor(), {from_node, from_id.second}));
+  }
+
+  std::vector<string> target_node_names(
+      options.callable_options.target().begin(),
+      options.callable_options.target().end());
+  TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
+      graph, feed_rewrites, fetch_rewrites, target_node_names,
+      out_rewrite_metadata));
+
+  CHECK_EQ(out_rewrite_metadata->feed_types.size(),
+           options.callable_options.feed_size() +
+               options.callable_options.tensor_connection_size());
+  for (int i = 0; i < options.callable_options.tensor_connection_size(); ++i) {
+    out_rewrite_metadata->feed_types.pop_back();
+  }
+  return Status::OK();
+}
+
 Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   const GraphDef* graph_def = &original_graph_def_;
 
@@ -247,10 +369,8 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
       session_options_->config.graph_options().place_pruned_graph()) {
     // Rewrite the graph before placement.
     rewrite_metadata_.reset(new subgraph::RewriteGraphMetadata);
-    TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
-        new_graph.get(), options.feed_endpoints, options.fetch_endpoints,
-        options.target_nodes, device_set_->client_device()->attributes(),
-        options.use_function_convention, rewrite_metadata_.get()));
+    TF_RETURN_IF_ERROR(
+        PruneGraph(options, new_graph.get(), rewrite_metadata_.get()));
   }
 
   // Save stateful placements before placing.
@@ -278,7 +398,8 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
 }
 
 Status GraphExecutionState::OptimizeGraph(
-    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph) {
+    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph,
+    std::unique_ptr<FunctionLibraryDefinition>* optimized_flib) {
 #ifndef IS_MOBILE_PLATFORM
   if (session_options_->config.graph_options().place_pruned_graph()) {
     return errors::InvalidArgument("Can't optimize a pruned graph");
@@ -295,19 +416,37 @@ Status GraphExecutionState::OptimizeGraph(
     item.id = "tf_graph";
     graph_->ToGraphDef(&item.graph);
 
-    item.fetch = options.fetch_endpoints;
-    item.fetch.insert(item.fetch.end(), options.target_nodes.begin(),
-                      options.target_nodes.end());
+    item.fetch.insert(item.fetch.end(),
+                      options.callable_options.fetch().begin(),
+                      options.callable_options.fetch().end());
+    item.fetch.insert(item.fetch.end(),
+                      options.callable_options.target().begin(),
+                      options.callable_options.target().end());
 
-    if (!options.feed_endpoints.empty()) {
+    for (const TensorConnection& tensor_connection :
+         options.callable_options.tensor_connection()) {
+      item.fetch.push_back(tensor_connection.from_tensor());
+    }
+
+    if (!(options.callable_options.feed().empty() &&
+          options.callable_options.tensor_connection().empty())) {
       std::unordered_set<string> feeds;
-      for (const string& feed : options.feed_endpoints) {
+      for (const string& feed : options.callable_options.feed()) {
         TensorId id = ParseTensorName(feed);
         if (id.second != 0) {
           return errors::InvalidArgument("Unsupported feed: ", feed);
         }
         feeds.insert(id.first.ToString());
       }
+      for (const TensorConnection& tensor_connection :
+           options.callable_options.tensor_connection()) {
+        TensorId id = ParseTensorName(tensor_connection.to_tensor());
+        if (id.second != 0) {
+          return errors::InvalidArgument("Unsupported feed: ",
+                                         tensor_connection.to_tensor());
+        }
+        feeds.insert(id.first.ToString());
+      }
       for (const NodeDef& node : original_graph_def_.node()) {
         if (feeds.find(node.name()) == feeds.end()) {
           continue;
@@ -351,13 +490,34 @@ Status GraphExecutionState::OptimizeGraph(
         cpu_device = device;
       }
     }
-    grappler::VirtualCluster cluster(device_map);
+    grappler::VirtualCluster cluster(device_map, device_set_);
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
         item, rewrite_options, cpu_device, &cluster, &new_graph));
+
+    // Merge optimized graph function library with an original library.
+    // Optimized graph might have new functions specialized for it's
+    // instantiation context (see Grappler function optimizer), and modified
+    // function body for the existing functions.
+    optimized_flib->reset(new FunctionLibraryDefinition(*flib_def_));
+
+    for (const FunctionDef& fdef : new_graph.library().function()) {
+      const string& func_name = fdef.signature().name();
+
+      if ((*optimized_flib)->Find(func_name)) {
+        VLOG(3) << "Replace function: name=" << func_name;
+        TF_RETURN_IF_ERROR((*optimized_flib)->RemoveFunction(func_name));
+        TF_RETURN_IF_ERROR((*optimized_flib)->AddFunctionDef(fdef));
+      } else {
+        VLOG(3) << "Add new function: name=" << func_name;
+        TF_RETURN_IF_ERROR((*optimized_flib)->AddFunctionDef(fdef));
+      }
+    }
+
+    optimized_graph->reset(new Graph(OpRegistry::Global()));
+
     GraphConstructorOptions opts;
     opts.allow_internal_ops = true;
-    optimized_graph->reset(new Graph(OpRegistry::Global()));
     TF_RETURN_IF_ERROR(
         ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get()));
     // The graph conversion sets the requested device names but not the assigned
@@ -386,23 +546,26 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
         "Attempted to prune a graph that has not been fully initialized.");
   }
 
-  std::unique_ptr<Graph> ng;
-  Status s = OptimizeGraph(options, &ng);
+  // Grappler optimization might change the structure of a graph itself, and
+  // also it can add/prune functions to/from the library.
+  std::unique_ptr<Graph> optimized_graph;
+  std::unique_ptr<FunctionLibraryDefinition> optimized_flib;
+
+  Status s = OptimizeGraph(options, &optimized_graph, &optimized_flib);
   if (!s.ok()) {
-    // Simply copy the original graph if we couldn't optimize it.
-    ng.reset(new Graph(flib_def_.get()));
-    CopyGraph(*graph_, ng.get());
+    VLOG(2) << "Grappler optimization failed. Error: " << s.error_message();
+    // Simply copy the original graph and the function library if we couldn't
+    // optimize it.
+    optimized_graph.reset(new Graph(flib_def_.get()));
+    CopyGraph(*graph_, optimized_graph.get());
+    optimized_flib.reset(new FunctionLibraryDefinition(*flib_def_));
   }
 
   subgraph::RewriteGraphMetadata rewrite_metadata;
   if (session_options_ == nullptr ||
       !session_options_->config.graph_options().place_pruned_graph()) {
-    // Extract the subset of the graph that needs to be run, adding feed/fetch
-    // ops as needed.
-    TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
-        ng.get(), options.feed_endpoints, options.fetch_endpoints,
-        options.target_nodes, device_set_->client_device()->attributes(),
-        options.use_function_convention, &rewrite_metadata));
+    TF_RETURN_IF_ERROR(
+        PruneGraph(options, optimized_graph.get(), &rewrite_metadata));
   } else {
     // This GraphExecutionState represents a graph that was
     // pruned when this was constructed, so we copy the metadata from
@@ -411,18 +574,16 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
     rewrite_metadata = *rewrite_metadata_;
   }
 
-  CHECK_EQ(options.feed_endpoints.size(), rewrite_metadata.feed_types.size());
-  CHECK_EQ(options.fetch_endpoints.size(), rewrite_metadata.fetch_types.size());
-
-  // Make a fresh copy of the function library for the client graph.
-  std::unique_ptr<FunctionLibraryDefinition> flib(
-      new FunctionLibraryDefinition(*flib_def_));
+  CHECK_EQ(options.callable_options.feed_size(),
+           rewrite_metadata.feed_types.size());
+  CHECK_EQ(options.callable_options.fetch_size(),
+           rewrite_metadata.fetch_types.size());
 
   // TODO(andydavis): Clarify optimization pass requirements around CostModel.
   GraphOptimizationPassOptions optimization_options;
   optimization_options.session_options = session_options_;
-  optimization_options.graph = &ng;
-  optimization_options.flib_def = flib.get();
+  optimization_options.graph = &optimized_graph;
+  optimization_options.flib_def = optimized_flib.get();
   optimization_options.device_set = device_set_;
 
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
@@ -432,9 +593,9 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   // since the local CostModel used to record its stats is sized by
   // the largest node id.
   std::unique_ptr<ClientGraph> dense_copy(
-      new ClientGraph(std::move(flib), rewrite_metadata.feed_types,
+      new ClientGraph(std::move(optimized_flib), rewrite_metadata.feed_types,
                       rewrite_metadata.fetch_types));
-  CopyGraph(*ng, &dense_copy->graph);
+  CopyGraph(*optimized_graph, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 2312e1a89fd1fd5734fab4316c25ca2e39f16ae5..d44a24c87ba04deaf6fcbf042679c99f1597ba82 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -177,8 +177,14 @@ class GraphExecutionState {
   void SaveStatefulNodes(Graph* graph);
   void RestoreStatefulNodes(Graph* graph);
 
-  Status OptimizeGraph(const BuildGraphOptions& options,
-                       std::unique_ptr<Graph>* optimized_graph);
+  // Extract the subset of the graph that needs to be run, adding feed/fetch
+  // ops as needed.
+  Status PruneGraph(const BuildGraphOptions& options, Graph* graph,
+                    subgraph::RewriteGraphMetadata* out_rewrite_metadata);
+
+  Status OptimizeGraph(
+      const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph,
+      std::unique_ptr<FunctionLibraryDefinition>* optimized_flib);
 
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index f1082a60030fb3c289de35b4cab397c527f8afca..790f2eaa1e9de96b5cd399dd53a1e49696035f21 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// TODO(skyewm): this is necessary to make the single_threaded_cpu_device.h
+// include work. Some other include must be including eigen without defining
+// this. Consider defining in this in a BUILD rule.
+#define EIGEN_USE_THREADS
+
 #include "tensorflow/core/common_runtime/graph_runner.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -20,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_util.h"
@@ -36,18 +42,6 @@ namespace tensorflow {
 
 namespace {
 
-std::unique_ptr<Device> GetCPUDevice(Env* env) {
-  std::vector<Device*> devices;
-  SessionOptions session_options;
-  session_options.env = env;
-  Status s = DeviceFactory::GetFactory(DEVICE_CPU)
-                 ->CreateDevices(session_options, "", &devices);
-  if (s.ok() && !devices.empty()) {
-    return std::unique_ptr<Device>(devices[0]);
-  }
-  return nullptr;
-}
-
 // A simple rendezvous class.
 // Assumes a single sender and a single receiver, no duplicate sends, and no
 // sends of dead tensors.
@@ -97,7 +91,10 @@ class SimpleRendezvous : public Rendezvous {
 
 }  // namespace
 
-GraphRunner::GraphRunner(Env* env) : cpu_device_(GetCPUDevice(env)) {}
+GraphRunner::GraphRunner(Env* env)
+    : device_deleter_(new SingleThreadedCpuDevice(env)),
+      device_(device_deleter_.get()) {}
+GraphRunner::GraphRunner(Device* device) : device_(device) {}
 
 GraphRunner::~GraphRunner() {}
 
@@ -105,17 +102,18 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
                         const NamedTensorList& inputs,
                         const std::vector<string>& output_names,
                         std::vector<Tensor>* outputs) {
-  if (cpu_device_ == nullptr) {
+  if (device_ == nullptr) {
     return errors::NotFound("Cannot find a device for GraphRunner.");
   }
 
   if (function_library && function_library->device() &&
-      function_library->device()->device_type() != cpu_device_->device_type()) {
-    // We are running on a CPU but the function library is for a non-CPU device,
-    // so just ignore the function_library.
+      function_library->device()->device_type() != device_->device_type()) {
+    // Mismatch between function_library's device_type and device_'s
+    // device_type.
     // TODO(matthewmurray) Can we create a new FunctionLibraryRuntime that is
-    // identical to function_library except that it uses CPU?
-    VLOG(1) << "Cannot run on CPU device with a function library for a "
+    // identical to function_library except that it uses the given 'device_'?
+    VLOG(1) << "Cannot run on: " << device_->device_type()
+            << " with a function library for a "
             << function_library->device()->device_type() << " device.";
     function_library = nullptr;
   }
@@ -146,8 +144,7 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
   subgraph::RewriteGraphMetadata metadata;
   TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
       graph_to_run.get(), input_names, output_names, {} /* target nodes */,
-      cpu_device_->attributes(), false /* use_function_convention */,
-      &metadata));
+      device_->attributes(), false /* use_function_convention */, &metadata));
 
   // Create the local executor and the Rendezvous for fetching back the
   // constants.
@@ -158,13 +155,12 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
 
   LocalExecutorParams params;
   // The ownership of the output tensors are bound to this device's lifetime.
-  params.device = cpu_device_.get();
+  params.device = device_;
   params.function_library = function_library;
   const int producer = graph_to_run->versions().producer();
   params.create_kernel = [this, producer](const NodeDef& ndef,
                                           OpKernel** kernel) {
-    return CreateNonCachedKernel(cpu_device_.get(), nullptr, ndef, producer,
-                                 kernel);
+    return CreateNonCachedKernel(device_, nullptr, ndef, producer, kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) { delete kernel; };
 
diff --git a/tensorflow/core/common_runtime/graph_runner.h b/tensorflow/core/common_runtime/graph_runner.h
index 1e4ae7722794ca527bcea023d992d92839ee46c9..1c4b2b719cd52a4cce16453601112e710b3c3e9d 100644
--- a/tensorflow/core/common_runtime/graph_runner.h
+++ b/tensorflow/core/common_runtime/graph_runner.h
@@ -36,12 +36,14 @@ namespace tensorflow {
 // This class is only meant for internal use where one needs to
 // partially evaluate inexpensive nodes in a graph, such as for shape
 // inference or for constant folding.  Because of its limited, simple
-// use-cases, it executes all computation on the CPU and is not meant
-// to be particularly lightweight, fast, or efficient.
+// use-cases, it executes all computation on the given device (CPU by default)
+// and is not meant to be particularly lightweight, fast, or efficient.
 class GraphRunner {
  public:
   // REQUIRES: `env` is not nullptr.
   GraphRunner(Env* env);
+  // REQUIRES: 'device' is not nullptr. Not owned.
+  GraphRunner(Device* device);
   ~GraphRunner();
 
   // Function semantics for `inputs`, `output_names` and `outputs`
@@ -59,7 +61,8 @@ class GraphRunner {
              std::vector<Tensor>* outputs);
 
  private:
-  std::unique_ptr<Device> cpu_device_;
+  std::unique_ptr<Device> device_deleter_;
+  Device* const device_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 64d884947568381eb2e5f60ab181b3c8c709d53b..7de1b80e2d6b3f2e36b3fdd56fc30e962e38bada 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index ca7f1614f1f4e7c076a1569fa809d61a5c412857..873182371e097cf0929cd6886b3ec70dfb9b3ab2 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 090a16ebeb10007261666aeb6491a1785dd2e5c4..116750fbfd60f74ff49390de56f659308aa50f5c 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -92,7 +92,7 @@ static Status ProcessMemoryTypes(
 
 Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g) {
   return ProcessMemoryTypes(
-      device_type, g, [g](const Edge* e, MemoryType sm, MemoryType dm) {
+      device_type, g, [](const Edge* e, MemoryType sm, MemoryType dm) {
         if (sm == dm) {
           return Status::OK();
         }
@@ -155,7 +155,7 @@ Status EnsureMemoryTypes(const DeviceType& device_type,
   };
   std::vector<Item> edges;
   TF_RETURN_IF_ERROR(ProcessMemoryTypes(
-      device_type, g, [g, &edges](const Edge* e, MemoryType sm, MemoryType dm) {
+      device_type, g, [&edges](const Edge* e, MemoryType sm, MemoryType dm) {
         if (sm == dm) {
           return Status::OK();
         }
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43a909466ed4b6fe6ea32b1ad72a1154390288ac
--- /dev/null
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+
+namespace tensorflow {
+
+constexpr const char* MklCPUAllocator::kMaxLimitStr;
+constexpr const size_t MklCPUAllocator::kDefaultMaxLimit;
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 71e0de9724680cfcc012ae04782b90b867e0095b..245320c8964e1767af3da5b6a4e11fdbb67164f6 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -31,6 +31,10 @@ limitations under the License.
 
 #include "i_malloc.h"
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 namespace tensorflow {
 
 class MklSubAllocator : public SubAllocator {
@@ -53,7 +57,7 @@ class MklCPUAllocator : public VisitableAllocator {
   static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES";
 
   /// Default upper limit on allocator size - 64GB
-  static const size_t kDefaultMaxLimit = 64LL << 30;
+  static constexpr size_t kDefaultMaxLimit = 64LL << 30;
 
   MklCPUAllocator() { TF_CHECK_OK(Initialize()); }
 
@@ -158,7 +162,7 @@ class MklCPUAllocator : public VisitableAllocator {
   static constexpr const char* kName = "mklcpu";
 
   /// The alignment that we need for the allocations
-  static const size_t kAlignment = 64;
+  static constexpr const size_t kAlignment = 64;
 
   VisitableAllocator* allocator_;  // owned by this class
 };
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index a913f2075181a3896015579d79093395d67101ff..86851c2c075a60a57c6f169cbc7ad81253a94227 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -151,7 +152,8 @@ class ColocationGraph {
       if (attr_value != nullptr && attr_value->has_list()) {
         for (const string& class_spec : attr_value->list().s()) {
           StringPiece spec(class_spec);
-          if (spec.Consume(kColocationGroupPrefixStringPiece)) {
+          if (str_util::ConsumePrefix(&spec,
+                                      kColocationGroupPrefixStringPiece)) {
             found_spec = true;
             TF_RETURN_IF_ERROR(
                 ColocateNodeToGroup(&colocation_group_root, node, spec));
@@ -464,6 +466,7 @@ class ColocationGraph {
     // the user can see why an unsatisfiable placement occurred.
 
     std::unordered_map<string, string> type_to_devices;
+    std::vector<const Node*> colocation_nodes;
     int num_nodes_found = 0;
 
     for (const Node* node : graph_->nodes()) {
@@ -475,6 +478,7 @@ class ColocationGraph {
         continue;
       }
       ++num_nodes_found;
+      colocation_nodes.push_back(node);
       const string& op_type = node->type_string();
       string devices_registered;
       for (const auto& device_type : members_[id].supported_device_types) {
@@ -488,6 +492,13 @@ class ColocationGraph {
     for (const auto& td : type_to_devices) {
       strings::StrAppend(&text, "\n", td.first, ": ", td.second);
     }
+    strings::StrAppend(&text,
+                       "\n\nColocation members and user-requested devices:");
+    for (const Node* node : colocation_nodes) {
+      strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
+                         ") ", node->requested_device());
+    }
+    strings::StrAppend(&text, "\n");
 
     if (num_nodes_found <= 1) {
       text.clear();
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 098024d2195aad8ef651120181ab271be168f92a..5ad251c892f175dceccc0304bceedc1405bc0123 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -262,9 +263,9 @@ class PlacerTest : public ::testing::Test {
                 ->attributes()                                          \
                 .device_type())
 
-#define EXPECT_DEVICE_CONTAINS(g, name, device_substr)                        \
-  EXPECT_TRUE(StringPiece(GetNodeByName((g), (name))->assigned_device_name()) \
-                  .contains(device_substr))
+#define EXPECT_DEVICE_CONTAINS(g, name, device_substr) \
+  EXPECT_TRUE(::tensorflow::str_util::StrContains(     \
+      GetNodeByName((g), (name))->assigned_device_name(), device_substr))
 
 // Test that a graph with no constraints will successfully assign nodes to the
 // "best available" device (i.e. prefer GPU over CPU).
@@ -488,11 +489,10 @@ TEST_F(PlacerTest, TestAssignedGpuDeviceToCpuDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
-              "does not have registered OpKernel support for TestInput"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
+      "does not have registered OpKernel support for TestInput"));
 }
 
 // Test that graphs with reference connections are correctly placed.
@@ -541,15 +541,15 @@ TEST_F(PlacerTest, TestReferenceConnection) {
   {
     Status s = ReferenceTestHelper("VariableCPU", "AssignGPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "TestAssign", "FakeGPU"));
   {
     Status s = ReferenceTestHelper("VariableGPU", "AssignCPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "AssignGPU", "FakeGPU"));
 }
@@ -760,8 +760,9 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'foo' and 'in' because no "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "Cannot colocate nodes 'foo' and 'in' because no "
                             "device type supports both of those nodes and the "
                             "other nodes colocated with them"));
 }
@@ -824,11 +825,11 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Cannot colocate nodes 'var3' and 'assign3' because no "
-                    "device type supports both of those nodes and the other "
-                    "nodes colocated with them."));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Cannot colocate nodes 'var3' and 'assign3' because no "
+      "device type supports both of those nodes and the other "
+      "nodes colocated with them."));
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -888,7 +889,7 @@ TEST_F(PlacerTest, TestEmptyDeviceSet) {
 
   Status s = Place(&g, &empty);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("No devices are registered"));
+      str_util::StrContains(s.error_message(), "No devices are registered"));
 }
 
 // Test that placement fails when the requested device forces an
@@ -913,16 +914,17 @@ TEST_F(PlacerTest, TestHeterogeneousDeviceSetFailure) {
   heterogeneous.AddDevice(cpu.get());
   Status s = Place(&g, &heterogeneous);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("colocated with a group of nodes that required "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "colocated with a group of nodes that required "
                             "incompatible device"));
 
   // The error message should contain information that indicates which
   // op types have which registered device types.
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: FakeGPU"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "VariableGPU: FakeGPU"))
       << s;
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("TestAssign: FakeGPU FakeCPU"))
+      str_util::StrContains(s.error_message(), "TestAssign: FakeGPU FakeCPU"))
       << s;
 }
 
@@ -937,7 +939,7 @@ TEST_F(PlacerTest, TestUnknownDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -952,7 +954,7 @@ TEST_F(PlacerTest, TestUnknownMergedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -969,9 +971,9 @@ TEST_F(PlacerTest, TestUnknownAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:foo' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:foo' does not match any device"));
 }
 
 // Test that placement fails when an op with no registered kernels is
@@ -986,12 +988,11 @@ TEST_F(PlacerTest, TestNoKernelsRegistered) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableNoKernels'"));
   EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No OpKernel was registered to support Op 'VariableNoKernels'"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("<no registered kernels>"));
+      str_util::StrContains(s.error_message(), "<no registered kernels>"));
 }
 
 // Test that placement fails when a kernel is registered but no known
@@ -1011,10 +1012,10 @@ TEST_F(PlacerTest, TestNoDevicesRegistered) {
 
   Status s = Place(&g, &cpu_only);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("No OpKernel was registered to support "
-                            "Op 'VariableGPU'"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("device='FakeGPU'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableGPU'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "device='FakeGPU'"));
 }
 
 // Test that placement fails when a requested device is malformed.
@@ -1028,8 +1029,8 @@ TEST_F(PlacerTest, TestMalformedDeviceSpecification) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed device specification '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Malformed device specification '/foo:bar'"));
 }
 
 // Test that placement fails when a previously-assigned device is malformed.
@@ -1045,8 +1046,8 @@ TEST_F(PlacerTest, TestMalformedAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed assigned device '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "Malformed assigned device '/foo:bar'"));
 }
 
 // Test that placement fails when a device was previously assigned to
@@ -1063,9 +1064,8 @@ TEST_F(PlacerTest, TestNonUniqueAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:a' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Assigned device '/job:a' does not match any device"));
 }
 
 // Test that ops request to be placed on non-existent devices will be relocated
@@ -1099,7 +1099,7 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakegpu:11"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakegpu:11"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1116,10 +1116,10 @@ TEST_F(PlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakecpu:0"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("no supported kernel for fakecpu devices is available"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakecpu:0"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "no supported kernel for fakecpu devices is available"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1137,9 +1137,9 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("was explicitly assigned to /job:foo/replica:17 "
-                            "but available devices"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "was explicitly assigned to /job:foo/replica:17 but available devices"));
 }
 
 TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
@@ -1205,8 +1205,8 @@ TEST_F(PlacerTest, TestUnsatisfiableConstraintWithReferenceConnections) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'var' and 'assign'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Cannot colocate nodes 'var' and 'assign'"));
 }
 
 // Test that a generator node follows its consumers (where there are several
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index e205e34aa0f6afb1363d65bd23403d4b50f056eb..e61ed8c4794883ccc2fdd78b63faf7ca149c20de 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -25,25 +25,40 @@ namespace tensorflow {
 
 const char ProcessFunctionLibraryRuntime::kDefaultFLRDevice[] = "null";
 
+Status ProcessFunctionLibraryRuntime::FunctionData::DistributedInit(
+    DistributedFunctionLibraryRuntime* parent, const string& function_name,
+    const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
+    const FunctionLibraryRuntime::InstantiateOptions& options) {
+  mutex_lock l(mu_);
+  if (!init_started_) {
+    init_started_ = true;
+    init_result_ = parent->Instantiate(function_name, lib_def, attrs, options,
+                                       &local_handle_);
+  }
+  return init_result_;
+}
+
 ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, int graph_def_version,
     const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options,
+    thread::ThreadPool* default_thread_pool,
     DistributedFunctionLibraryRuntime* parent)
     : device_mgr_(device_mgr),
       lib_def_(lib_def),
+      default_thread_pool_(default_thread_pool),
       next_handle_(0),
       parent_(parent) {
   if (device_mgr == nullptr) {
-    flr_map_[nullptr] =
-        NewFunctionLibraryRuntime(nullptr, env, nullptr, graph_def_version,
-                                  lib_def, optimizer_options, this);
+    flr_map_[nullptr] = NewFunctionLibraryRuntime(
+        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        optimizer_options, this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
-    flr_map_[d] =
-        NewFunctionLibraryRuntime(device_mgr, env, d, graph_def_version,
-                                  lib_def, optimizer_options, this);
+    flr_map_[d] = NewFunctionLibraryRuntime(
+        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        optimizer_options, this);
   }
 }
 
@@ -52,21 +67,23 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
+    thread::ThreadPool* default_thread_pool,
     DistributedFunctionLibraryRuntime* parent)
     : device_mgr_(device_mgr),
       lib_def_(lib_def),
+      default_thread_pool_(default_thread_pool),
       next_handle_(0),
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, optimizer_options,
-        std::move(custom_kernel_creator), this);
+        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        optimizer_options, std::move(custom_kernel_creator), this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, optimizer_options,
-        custom_kernel_creator, this);
+        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        optimizer_options, custom_kernel_creator, this);
   }
 }
 
@@ -127,7 +144,10 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
   }
   Device* device = flr->device();
   string device_type = device->parsed_name().type;
-  if (device_type == "CPU") return Status::OK();
+  if (device_type == "CPU" || device_type == "TPU_SYSTEM") {
+    // "TPU_SYSTEM" indicates that `device` is a CPU.
+    return Status::OK();
+  }
   if (device_type == "GPU") {
     auto* dev_info = flr->device()->tensorflow_gpu_device_info();
     if (dev_info) {
@@ -145,7 +165,7 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
   Device* device = nullptr;
   if (device_name != kDefaultFLRDevice) {
     if (!device_mgr_->LookupDevice(device_name, &device).ok()) {
-      LOG(ERROR) << "Could not find device: " << device_name;
+      VLOG(1) << "Could not find device: " << device_name;
       return nullptr;
     }
   }
@@ -161,13 +181,9 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
     const string& function_key, const string& device_name,
     FunctionLibraryRuntime::LocalHandle local_handle) {
   mutex_lock l(mu_);
-  FunctionLibraryRuntime::Handle h =
-      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
-  if (h != kInvalidHandle) {
-    if (function_data_.count(h) != 0) return h;
-  }
-  h = next_handle_;
-  function_data_.insert({h, FunctionData(device_name, local_handle)});
+  auto h = next_handle_;
+  FunctionData* fd = new FunctionData(device_name, local_handle);
+  function_data_[h] = std::unique_ptr<FunctionData>(fd);
   table_[function_key] = h;
   next_handle_++;
   return h;
@@ -176,12 +192,7 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
 FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
     const string& function_key) const {
   mutex_lock l(mu_);
-  FunctionLibraryRuntime::Handle h =
-      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
-  if (h != kInvalidHandle) {
-    if (function_data_.count(h) == 0) return kInvalidHandle;
-  }
-  return h;
+  return gtl::FindWithDefault(table_, function_key, kInvalidHandle);
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
@@ -196,19 +207,19 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
   if (function_data_.count(handle) == 0) {
     return kInvalidLocalHandle;
   }
-  const FunctionData& function_data = function_data_[handle];
-  if (function_data.target_device != device_name) {
+  FunctionData* function_data = function_data_[handle].get();
+  if (function_data->target_device() != device_name) {
     return kInvalidLocalHandle;
   }
-  return function_data.local_handle;
+  return function_data->local_handle();
 }
 
 string ProcessFunctionLibraryRuntime::GetDeviceName(
     FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
   CHECK_EQ(1, function_data_.count(handle));
-  const FunctionData& function_data = function_data_[handle];
-  return function_data.target_device;
+  FunctionData* function_data = function_data_[handle].get();
+  return function_data->target_device();
 }
 
 Status ProcessFunctionLibraryRuntime::Instantiate(
@@ -225,18 +236,29 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
         "Currently don't support instantiating functions on device: ",
         options.target);
   }
-  FunctionLibraryRuntime::Handle cluster_handle;
-  TF_RETURN_IF_ERROR(parent_->Instantiate(function_name, *lib_def_, attrs,
-                                          options, &cluster_handle));
-  string function_key = Canonicalize(function_name, attrs);
-  *handle = AddHandle(function_key, options.target, cluster_handle);
-  return Status::OK();
-}
-
-Status ProcessFunctionLibraryRuntime::RemoveHandle(
-    FunctionLibraryRuntime::Handle handle) {
-  mutex_lock l(mu_);
-  function_data_.erase(handle);
+  VLOG(1) << "ProcessFLR Instantiate: " << function_name
+          << " on: " << options.target;
+  string function_key = Canonicalize(function_name, attrs, options);
+  FunctionData* f;
+  {
+    mutex_lock l(mu_);
+    FunctionLibraryRuntime::Handle h =
+        gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+    if (h == kInvalidHandle || function_data_.count(h) == 0) {
+      h = next_handle_;
+      FunctionData* fd = new FunctionData(options.target, kInvalidHandle);
+      function_data_[h] = std::unique_ptr<FunctionData>(fd);
+      table_[function_key] = h;
+      next_handle_++;
+    }
+    f = function_data_[h].get();
+    *handle = h;
+  }
+  TF_RETURN_IF_ERROR(
+      f->DistributedInit(parent_, function_name, *lib_def_, attrs, options));
+  VLOG(1) << "ProcessFLR Instantiate [success]: " << function_name
+          << " on: " << options.target << " with handle: " << *handle
+          << " (this: " << this << ")";
   return Status::OK();
 }
 
@@ -247,7 +269,7 @@ Status ProcessFunctionLibraryRuntime::ReleaseHandle(
   {
     mutex_lock l(mu_);
     CHECK_EQ(1, function_data_.count(handle)) << " handle: " << handle;
-    target_device = function_data_[handle].target_device;
+    target_device = function_data_[handle]->target_device();
   }
   flr = GetFLR(target_device);
   if (flr != nullptr) {
@@ -276,8 +298,8 @@ void ProcessFunctionLibraryRuntime::Run(
       done(errors::NotFound("Handle: ", handle, " not found."));
       return;
     }
-    target_device = function_data_[handle].target_device;
-    local_handle = function_data_[handle].local_handle;
+    target_device = function_data_[handle]->target_device();
+    local_handle = function_data_[handle]->local_handle();
   }
   flr = GetFLR(target_device);
   if (flr != nullptr) {
@@ -341,7 +363,8 @@ Status ProcessFunctionLibraryRuntime::Clone(
   out_lib_def->reset(new FunctionLibraryDefinition(*lib_def_));
   out_pflr->reset(new ProcessFunctionLibraryRuntime(
       device_mgr_, env, graph_def_version, out_lib_def->get(),
-      optimizer_options, std::move(custom_kernel_creator), parent_));
+      optimizer_options, std::move(custom_kernel_creator), default_thread_pool_,
+      parent_));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 0473e16d242814930a9de17c88d4851d0d73edbe..05e57708993a93512c359b2742863564dd76a51a 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -33,6 +33,7 @@ class ProcessFunctionLibraryRuntime {
       const DeviceMgr* device_mgr, Env* env, int graph_def_version,
       const FunctionLibraryDefinition* lib_def,
       const OptimizerOptions& optimizer_options,
+      thread::ThreadPool* thread_pool = nullptr,
       DistributedFunctionLibraryRuntime* parent = nullptr);
 
   // With `custom_kernel_creator`.
@@ -41,6 +42,7 @@ class ProcessFunctionLibraryRuntime {
                                 const FunctionLibraryDefinition* lib_def,
                                 const OptimizerOptions& optimizer_options,
                                 CustomKernelCreator custom_kernel_creator,
+                                thread::ThreadPool* thread_pool,
                                 DistributedFunctionLibraryRuntime* parent);
 
   // Sends `tensors_to_send` from `source_device` to `target_device` using
@@ -132,9 +134,6 @@ class ProcessFunctionLibraryRuntime {
   // of the device where the function is registered.
   string GetDeviceName(FunctionLibraryRuntime::Handle handle);
 
-  // Removes handle from the state owned by this object.
-  Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
-
   Status Clone(Env* env, int graph_def_version,
                const OptimizerOptions& optimizer_options,
                CustomKernelCreator custom_kernel_creator,
@@ -145,22 +144,44 @@ class ProcessFunctionLibraryRuntime {
 
   mutable mutex mu_;
 
-  struct FunctionData {
-    const string target_device;
-    const FunctionLibraryRuntime::LocalHandle local_handle;
-
+  class FunctionData {
+   public:
     FunctionData(const string& target_device,
                  FunctionLibraryRuntime::LocalHandle local_handle)
-        : target_device(target_device), local_handle(local_handle) {}
-    FunctionData() : FunctionData("", -1) {}
+        : target_device_(target_device), local_handle_(local_handle) {}
+
+    string target_device() { return target_device_; }
+
+    FunctionLibraryRuntime::LocalHandle local_handle() {
+      mutex_lock l(mu_);
+      return local_handle_;
+    }
+
+    // Initializes the FunctionData object by potentially making an Initialize
+    // call to the DistributedFunctionLibraryRuntime.
+    Status DistributedInit(
+        DistributedFunctionLibraryRuntime* parent, const string& function_name,
+        const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
+        const FunctionLibraryRuntime::InstantiateOptions& options);
+
+   private:
+    mutex mu_;
+
+    const string target_device_;
+    FunctionLibraryRuntime::LocalHandle local_handle_ GUARDED_BY(mu_);
+    bool init_started_ GUARDED_BY(mu_) = false;
+    Status init_result_ GUARDED_BY(mu_);
+    Notification init_done_;
   };
 
   const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
+  thread::ThreadPool* default_thread_pool_;
   // Holds all the function invocations here.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
-  std::unordered_map<FunctionLibraryRuntime::Handle, FunctionData>
+  std::unordered_map<FunctionLibraryRuntime::Handle,
+                     std::unique_ptr<FunctionData>>
       function_data_ GUARDED_BY(mu_);
   std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
   int next_handle_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 439ba1ce965ebe4addb525cd3d17d794feaecd1f..cc10e77ad2e9528a41db24cc86f7149eae5beb5c 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -19,9 +19,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function_testlib.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -29,8 +32,32 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+class TestClusterFLR : public DistributedFunctionLibraryRuntime {
+ public:
+  TestClusterFLR() {}
+
+  Status Instantiate(const string& function_name,
+                     const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
+                     const FunctionLibraryRuntime::InstantiateOptions& options,
+                     FunctionLibraryRuntime::LocalHandle* handle) {
+    mutex_lock l(mu_);
+    *handle = next_handle_;
+    next_handle_++;
+    return Status::OK();
+  }
+
+  void Run(const FunctionLibraryRuntime::Options& opts,
+           FunctionLibraryRuntime::LocalHandle handle,
+           gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
+           FunctionLibraryRuntime::DoneCallback done) {}
+
+ private:
+  mutex mu_;
+  int next_handle_ GUARDED_BY(mu_) = 0;
+};
+
 class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
- protected:
+ public:
   void Init(const std::vector<FunctionDef>& flib) {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
@@ -42,12 +69,20 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
+    cluster_flr_.reset(new TestClusterFLR());
     proc_flr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
-        opts, nullptr /* cluster_flr */));
+        opts, nullptr, cluster_flr_.get()));
     rendezvous_ = new IntraProcessRendezvous(device_mgr_.get());
   }
 
+  Status Instantiate(
+      const string& name, test::function::Attrs attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts,
+      FunctionLibraryRuntime::Handle* handle) {
+    return proc_flr_->Instantiate(name, attrs, instantiate_opts, handle);
+  }
+
   Status Run(const string& name, FunctionLibraryRuntime::Options opts,
              test::function::Attrs attrs,
              const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts,
@@ -84,12 +119,13 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
 
     EXPECT_GE(call_count, 1);  // Test runner is used.
 
-    // Release the handle and then try running the function. It shouldn't
-    // succeed.
+    // Release the handle and then try running the function.  It
+    // should still succeed.
     status = proc_flr_->ReleaseHandle(handle);
     if (!status.ok()) {
       return status;
     }
+
     Notification done2;
     proc_flr_->Run(opts, handle, args, &out,
                    [&status, &done2](const Status& s) {
@@ -97,15 +133,13 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                      done2.Notify();
                    });
     done2.WaitForNotification();
-    EXPECT_TRUE(errors::IsNotFound(status));
-    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
-
-    return Status::OK();
+    return status;
   }
 
   std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<TestClusterFLR> cluster_flr_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr_;
   IntraProcessRendezvous* rendezvous_;
 };
@@ -118,7 +152,7 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, GetFLRNull) {
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr(
       new ProcessFunctionLibraryRuntime(
           nullptr /* device_mgr */, Env::Default(), TF_GRAPH_DEF_VERSION,
-          lib_def.get(), opts, nullptr /* cluster_flr */));
+          lib_def.get(), opts, nullptr, nullptr /* cluster_flr */));
   FunctionLibraryRuntime* flr =
       proc_flr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
   EXPECT_NE(flr, nullptr);
@@ -250,5 +284,60 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsDiffDeviceFindDevice) {
   rendezvous_->Unref();
 }
 
+TEST_F(ProcessFunctionLibraryRuntimeTest, ClusterFLRSerialTest) {
+  Init({test::function::FindDevice()});
+  FunctionLibraryRuntime::Options opts;
+  opts.source_device = "/job:a/replica:0/task:0/cpu:0";
+  opts.rendezvous = rendezvous_;
+  opts.remote_execution = true;
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:b/replica:0/task:0/device:CPU:0";
+  FunctionLibraryRuntime::Handle h;
+  TF_CHECK_OK(Instantiate("FindDevice",
+                          {{"_target", "/job:b/replica:0/task:0/device:CPU:0"}},
+                          instantiate_opts, &h));
+  EXPECT_EQ(0, proc_flr_->GetHandleOnDevice(
+                   "/job:b/replica:0/task:0/device:CPU:0", h));
+  TF_CHECK_OK(Instantiate("FindDevice",
+                          {{"_target", "/job:b/replica:0/task:0/device:CPU:0"}},
+                          instantiate_opts, &h));
+  EXPECT_EQ(0, proc_flr_->GetHandleOnDevice(
+                   "/job:b/replica:0/task:0/device:CPU:0", h));
+  instantiate_opts.target = "/job:c/replica:0/task:0/device:CPU:0";
+  TF_CHECK_OK(Instantiate("FindDevice",
+                          {{"_target", "/job:c/replica:0/task:0/device:CPU:0"}},
+                          instantiate_opts, &h));
+  EXPECT_EQ(1, proc_flr_->GetHandleOnDevice(
+                   "/job:c/replica:0/task:0/device:CPU:0", h));
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, ClusterFLRParallelTest) {
+  Init({test::function::FindDevice()});
+  FunctionLibraryRuntime::Options opts;
+  opts.source_device = "/job:a/replica:0/task:0/cpu:0";
+  opts.rendezvous = rendezvous_;
+  opts.remote_execution = true;
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:b/replica:0/task:0/device:CPU:0";
+
+  thread::ThreadPool* tp = new thread::ThreadPool(Env::Default(), "test", 4);
+  auto fn = [this, &instantiate_opts]() {
+    FunctionLibraryRuntime::Handle h;
+    TF_CHECK_OK(Instantiate(
+        "FindDevice", {{"_target", "/job:b/replica:0/task:0/device:CPU:0"}},
+        instantiate_opts, &h));
+    EXPECT_EQ(0, proc_flr_->GetHandleOnDevice(
+                     "/job:b/replica:0/task:0/device:CPU:0", h));
+  };
+
+  for (int i = 0; i < 100; ++i) {
+    tp->Schedule(fn);
+  }
+  delete tp;
+
+  rendezvous_->Unref();
+}
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index d738fef7be5b04f166651644542b2eadbe38715d..21912236d079bdd76f1a3c1f5deb2d2cc1fc443e 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -15,9 +15,13 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 
+#ifdef INTEL_MKL
+#include <omp.h>
+#endif
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
@@ -46,22 +50,47 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
   return compute_pool;
 }
 
+int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
+  const int32 inter_op = options.config.inter_op_parallelism_threads();
+  if (inter_op != 0) return inter_op;
+#ifdef INTEL_MKL
+  // MKL library executes ops in parallel using OMP threads
+  // Set inter_op conservatively to avoid thread oversubscription that could
+  // lead to severe perf degradations and OMP resource exhaustion
+  const int mkl_intra_op = omp_get_max_threads();
+  CHECK_GE(mkl_intra_op, 1);
+  const int32 mkl_inter_op = std::max(
+      (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+  VLOG(0) << "Creating new thread pool with default inter op setting: "
+          << mkl_inter_op
+          << ". Tune using inter_op_parallelism_threads for best performance.";
+  return mkl_inter_op;
+#else
+  // Default to using the number of cores available in the process.
+  return port::NumSchedulableCPUs();
+#endif
+}
+
+thread::ThreadPool* NewThreadPoolFromSessionOptions(
+    const SessionOptions& options) {
+  const int32 num_threads = NumInterOpThreadsFromSessionOptions(options);
+  VLOG(1) << "Direct session inter op parallelism threads: " << num_threads;
+  return new thread::ThreadPool(options.env, "Compute", num_threads);
+}
+
 void SchedClosure(std::function<void()> closure) {
-  if (port::Tracing::IsActive()) {
-    const uint64 id = port::Tracing::UniqueId();
-    port::Tracing::RecordEvent(port::Tracing::EventCategory::kScheduleClosure,
-                               id);
-    std::function<void()> wrapper = std::bind(
-        [id](std::function<void()> closure) {
-          port::Tracing::ScopedActivity region(
-              port::Tracing::EventCategory::kRunClosure, id);
-          closure();
-        },
-        std::move(closure));
-    Env::Default()->SchedClosure(std::move(wrapper));
-  } else {
-    Env::Default()->SchedClosure(std::move(closure));
+  if (!tracing::EventCollector::IsEnabled()) {
+    return Env::Default()->SchedClosure(std::move(closure));
   }
+  uint64 id = tracing::GetUniqueArg();
+  tracing::RecordEvent(tracing::EventCategory::kScheduleClosure, id);
+
+  Env::Default()->SchedClosure(std::bind(
+      [id](std::function<void()> closure) {
+        tracing::ScopedRegion region(tracing::EventCategory::kRunClosure, id);
+        closure();
+      },
+      std::move(closure)));
 }
 
 void SchedNonBlockingClosureAfter(int64 micros, std::function<void()> closure) {
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index fc3a262fe1c14856819361f29ea9066193181695..5d9266671617320eea4cea60de1ebd7210f3b674 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -30,6 +30,13 @@ namespace tensorflow {
 // using 'options'.  Caller does not take ownership over threadpool.
 thread::ThreadPool* ComputePool(const SessionOptions& options);
 
+// Returns number of inter op threads.
+int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
+
+// Creates a thread pool with number of inter op threads.
+thread::ThreadPool* NewThreadPoolFromSessionOptions(
+    const SessionOptions& options);
+
 // Schedule "closure" in the default thread queue.
 void SchedClosure(std::function<void()> closure);
 
diff --git a/tensorflow/core/common_runtime/process_util_test.cc b/tensorflow/core/common_runtime/process_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46672ac92eef1c1da608720f17a30417f92d04bd
--- /dev/null
+++ b/tensorflow/core/common_runtime/process_util_test.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/process_util.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(ProcessUtilTest, NumThreads) {
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(10);
+  EXPECT_EQ(10, NumInterOpThreadsFromSessionOptions(opts));
+}
+
+TEST(ProcessUtilTest, ThreadPool) {
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(10);
+
+  thread::ThreadPool* pool = NewThreadPoolFromSessionOptions(opts);
+  EXPECT_EQ(10, pool->NumThreads());
+  delete pool;
+}
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 60263d1471943fed4392a4dc0704992589ce3b69..93f24a3217ef08fc7368365c9a43a913810f211b 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -121,27 +121,36 @@ void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
   // Recv the tensor from local_.
   local_->RecvAsync(
       parsed, recv_args,
-      [this, parsed, done](
-          const Status& status, const Rendezvous::Args& send_args,
-          const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) {
-        // If "in" is an uninitialized tensor, do copy-construction to preserve
-        // the uninitialized state, along with data type and shape info, which
-        // is useful for debugger purposes.
-        Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
-
-        StatusCallback final_callback = [done, send_args, recv_args, out,
-                                         is_dead](const Status& s) {
-          done(s, send_args, recv_args, *out, is_dead);
-          delete out;
-        };
-
-        if (status.ok() && in.IsInitialized()) {
-          SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
-                             std::move(final_callback));
-        } else {
-          final_callback(status);
-        }
-      });
+      std::bind(
+          [this, parsed](DoneCallback done,
+                         // Begin unbound arguments.
+                         const Status& status,
+                         const Rendezvous::Args& send_args,
+                         const Rendezvous::Args& recv_args, const Tensor& in,
+                         bool is_dead) {
+            // If "in" is an uninitialized tensor, do copy-construction to
+            // preserve the uninitialized state, along with data type and shape
+            // info, which is useful for debugger purposes.
+            Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
+
+            auto final_callback = std::bind(
+                [send_args, recv_args, out, is_dead](DoneCallback done,
+                                                     // Begin unbound arguments.
+                                                     const Status& s) {
+                  done(s, send_args, recv_args, *out, is_dead);
+                  delete out;
+                },
+                std::move(done), std::placeholders::_1);
+
+            if (status.ok() && in.IsInitialized()) {
+              SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
+                                 std::move(final_callback));
+            } else {
+              final_callback(status);
+            }
+          },
+          std::move(done), std::placeholders::_1, std::placeholders::_2,
+          std::placeholders::_3, std::placeholders::_4, std::placeholders::_5));
 }
 
 void IntraProcessRendezvous::StartAbort(const Status& s) {
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a17281835ea5f5851b2304cdd953f7820b275395
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -0,0 +1,546 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+
+namespace tensorflow {
+namespace {
+// Each CollectiveOp implementation is free to define its own
+// BufRendezvous key format.  This function produces the key used by
+// RingReducer.
+string RingReduceBufKey(const string& exec_key, int pass, int section,
+                        int source_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(",
+                           section, "):srcrank(", source_rank, ")");
+  } else {
+    // TODO(tucker): Try out some kind of denser encoding, e.g. 128 bit hash.
+    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
+  }
+}
+
+}  // namespace
+
+void RingReducer::PCQueue::Enqueue(RingField* rf) {
+  mutex_lock l(pcq_mu_);
+  deque_.push_back(rf);
+  if (waiter_count_ > 0) {
+    cv_.notify_one();
+  }
+}
+
+RingReducer::RingField* RingReducer::PCQueue::Dequeue() {
+  mutex_lock l(pcq_mu_);
+  if (deque_.empty()) {
+    ++waiter_count_;
+    while (deque_.empty()) {
+      cv_.wait(l);
+    }
+    --waiter_count_;
+  }
+  RingField* rf = deque_.front();
+  deque_.pop_front();
+  return rf;
+}
+
+RingReducer::RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx,
+                         OpKernelContext::Params* op_params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id,
+                         const Tensor* input, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      op_params_(op_params),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      input_(input),
+      output_(output),
+      rank_(col_params.subdiv_rank[0]),
+      step_id_(step_id),
+      group_size_(col_params.group.group_size),
+      num_subdivs_(static_cast<int>(
+          col_params.instance.impl_details.subdiv_permutations.size())),
+      done_(nullptr),
+      device_(nullptr),
+      device_name_(
+          col_params_.instance.device_names[col_params_.default_rank]) {
+  CHECK_GT(group_size_, 0);
+  CHECK_GT(num_subdivs_, 0);
+}
+
+RingReducer::~RingReducer() { group_size_tensor_ready_.WaitForNotification(); }
+
+string RingReducer::TensorDebugString(Tensor tensor) {
+  const DeviceBase::GpuDeviceInfo* gpu_device_info =
+      ctx_->device()->tensorflow_gpu_device_info();
+  if (gpu_device_info) {
+    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
+    Notification note;
+    gpu_device_info->default_context->CopyDeviceTensorToCPU(
+        &tensor, "" /*tensor_name*/, device_, &cpu_tensor,
+        [&note](const Status& s) {
+          CHECK(s.ok());
+          note.Notify();
+        });
+    note.WaitForNotification();
+    return cpu_tensor.SummarizeValue(64);
+  } else {
+    return tensor.SummarizeValue(64);
+  }
+}
+
+void RingReducer::Run(StatusCallback done) {
+  done_ = std::move(done);
+
+  // Get local execution device.
+  if (VLOG_IS_ON(1)) {
+    string buf;
+    for (int r = 0; r < col_params_.instance.device_names.size(); ++r) {
+      strings::StrAppend(&buf, "dev ", r, " : ",
+                         col_params_.instance.device_names[r], "\n");
+    }
+    for (int sd = 0;
+         sd < col_params_.instance.impl_details.subdiv_permutations.size();
+         ++sd) {
+      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      for (auto x : col_params_.instance.impl_details.subdiv_permutations[sd]) {
+        strings::StrAppend(&buf, x, ", ");
+      }
+    }
+    VLOG(1) << "RingReducer::Run for device " << device_name_
+            << " default_rank " << col_params_.default_rank << "\n"
+            << buf;
+  }
+  CHECK(dev_mgr_);
+  Status status = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to find device "
+               << col_params_.instance.device_names[col_params_.default_rank];
+    for (auto d : dev_mgr_->ListDevices()) {
+      LOG(ERROR) << "Available device " << d->name();
+    }
+    done_(status);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  VLOG(1) << this << " default_rank " << col_params_.default_rank << " cp "
+          << &col_params_ << ": " << col_params_.ToString();
+
+  // Start by copying input to output if they're not already the same, i.e. if
+  // we're not computing in-place on the input tensor.
+  if ((input_ != output_) &&
+      (DMAHelper::base(input_) != DMAHelper::base(output_))) {
+    CollectiveRemoteAccessLocal::MemCpyAsync(
+        ctx_->input_device_context(0), ctx_->op_device_context(), device_,
+        device_, ctx_->input_alloc_attr(0), ctx_->output_alloc_attr(0), input_,
+        output_, [this](const Status& s) {
+          if (!s.ok()) {
+            done_(s);
+          } else {
+            ContinueAfterInputCopy();
+          }
+        });
+  } else {
+    ContinueAfterInputCopy();
+  }
+}
+
+void RingReducer::ContinueAfterInputCopy() {
+  AllocatorAttributes attr = ctx_->output_alloc_attr(0);
+  ca_.reset(MakeCollectiveAdapter(output_, group_size_ * num_subdivs_,
+                                  device_->GetAllocator(attr)));
+
+  if (col_params_.final_op) {
+    // Create an on-device scalar value from group_size_ that may be needed
+    // later.
+    // TODO(tucker): Cache and reuse across invocations? Or maybe the scalar
+    // can be provided to the kernel in host memory?
+    Tensor group_size_val = ca_->Scalar(group_size_);
+    if (col_params_.group.device_type != "CPU") {
+      group_size_tensor_ =
+          ca_->Scalar(device_->GetAllocator(ctx_->input_alloc_attr(0)));
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      op_dev_ctx->CopyCPUTensorToDevice(&group_size_val, device_,
+                                        &group_size_tensor_,
+                                        [this](const Status& s) {
+                                          if (!s.ok()) {
+                                            StartAbort(s);
+                                          }
+                                          group_size_tensor_ready_.Notify();
+                                        });
+    } else {
+      group_size_tensor_ = group_size_val;
+      group_size_tensor_ready_.Notify();
+    }
+  }
+  Finish(RunAsyncParts());
+}
+
+void RingReducer::StartAbort(const Status& s) {
+  // In abort mode we stop issuing additional ProvideBuf
+  // and ConsumeBuf calls, but we need to wait for all of the
+  // outstanding callbacks to be invoked before quitting.
+  bool abort_started = false;
+  {
+    mutex_lock l(status_mu_);
+    if (status_.ok()) {
+      LOG(ERROR) << "Aborting RingReduce with " << s;
+      abort_started = true;
+      status_.Update(s);
+    }
+  }
+  // If this is the initial entry to abort mode then invoke StartAbort
+  // on the CollectiveExecutor that invoked us.  That should start
+  // cancellation on all of the outstanding CollectiveRemoteAccess
+  // actions.
+  if (abort_started) {
+    col_exec_->StartAbort(s);
+  }
+}
+
+void RingReducer::Finish(bool ok) {
+  if (ok) {
+    // Recover the output from the adaptor.
+    ca_->ConsumeFinalValue(output_);
+  }
+  Status s;
+  {
+    mutex_lock l(status_mu_);
+    s = status_;
+  }
+  done_(s);
+}
+
+RingReducer::SubContext::SubContext(OpKernelContext* ctx,
+                                    OpKernelContext::Params* params,
+                                    OpKernel* op, Tensor* output, Tensor* input)
+    : sub_params_(*params),
+      sub_inputs_({output, input}),
+      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
+      sub_input_dc_(
+          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
+  sub_params_.op_kernel = op;
+  sub_params_.inputs = &sub_inputs_;
+  sub_params_.input_alloc_attrs = &sub_input_attr_;
+  sub_params_.input_device_contexts = &sub_input_dc_;
+  sub_params_.eigen_gpu_device = nullptr;
+  sub_params_.ensure_eigen_gpu_device();
+  sub_ctx_ = new OpKernelContext(&sub_params_, 1);
+}
+
+Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
+                                 Tensor* input) {
+  // Prepare an OpKernelContext that is identical to that of the original Op
+  // (i.e. the collective), except for the input output sizes and identities and
+  // the Op itself.
+  // TODO(tucker): Is it possible to cache and reuse these objects?  They're
+  // mostly identical inside one device execution.
+  std::unique_ptr<SubContext> sub_ctx(
+      new SubContext(ctx_, op_params_, op, output, input));
+  device->Compute(op, sub_ctx->sub_ctx_);
+  return sub_ctx->sub_ctx_->status();
+}
+
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
+void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                                int field_idx) {
+  // Note on field indexing: There are group_size_ devices in the
+  // instance, implying the same number of chunks per tensor, where a
+  // chunk is the unit of data transferred in a time step.  However, if
+  // a device can simultaenously send data by 2 or more independent
+  // channels we can speed up the transfer by subdividing chunks and
+  // processing multiple subdivisions at once.  So the actual number
+  // of RingFields is group_size_ * num_subdivs_.
+  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
+  rf->chunk_idx = chunk_idx;
+  rf->subdiv_idx = subdiv_idx;
+  rf->sc_idx = field_idx;
+  rf->rank = col_params_.subdiv_rank[subdiv_idx];
+  rf->second_pass = false;
+  rf->action = RF_INIT;
+  // Recv from the device with preceding rank within the subdivision.
+  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  rf->recv_dev_idx = col_params_.instance.impl_details
+                         .subdiv_permutations[subdiv_idx][recv_from_rank];
+  int send_dev_idx = col_params_.instance.impl_details
+                         .subdiv_permutations[subdiv_idx][send_to_rank];
+  rf->recv_is_remote = !col_params_.task.is_local[rf->recv_dev_idx];
+  rf->send_is_remote = !col_params_.task.is_local[send_dev_idx];
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 0 we skip Recv when rank = chunk_idx
+    rf->do_recv = (rf->chunk_idx != rf->rank);
+    // In pass 0 we skip Send when rank = chunk_idx-1
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  if (rf->do_send || rf->do_recv) {
+    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
+    CHECK(rf->chunk.IsAligned()) << rf->DebugString();
+  }
+  if (rf->do_recv) {
+    rf->tmp_chunk = ca_->TempChunk(rf->sc_idx);
+    CHECK(rf->tmp_chunk.IsAligned()) << rf->DebugString();
+  }
+  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
+          << ca_->TBounds(rf->chunk);
+}
+
+// When a RingField transitions from first to second recompute the
+// do_send and do_recv values.
+void RingReducer::AdvanceToSecondPass(RingField* rf) {
+  VLOG(3) << "IncrRingField old value " << rf->DebugString();
+  CHECK(!rf->second_pass);
+  rf->second_pass = true;
+  rf->action = RF_INIT;
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 1 the send/no-send boundary moves down 1 place.
+    rf->do_recv =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  VLOG(3) << "IncrRingField new value " << rf->DebugString();
+}
+
+string RingReducer::RingField::DebugString() const {
+  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
+                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
+                              " action=", action);
+  strings::StrAppend(&rv, " pass=", second_pass);
+  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
+                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
+                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
+  return rv;
+}
+
+void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) {
+  CHECK(rf->do_send);
+  string send_buf_key =
+      RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx, rf->rank);
+  VLOG(3) << "DispatchSend rank=" << col_params_.default_rank << " send key "
+          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
+          << rf->sc_idx;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  int send_to_dev_idx = col_params_.instance.impl_details
+                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[send_to_dev_idx],
+                        col_params_.instance.task_names[send_to_dev_idx],
+                        send_buf_key, device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), &rf->chunk,
+                        device_locality_, done);
+}
+
+void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) {
+  CHECK(rf->do_recv);
+  string recv_buf_key =
+      RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx,
+                       (rf->rank + (group_size_ - 1)) % group_size_);
+  VLOG(3) << "DispatchRecv rank=" << col_params_.default_rank << " recv key "
+          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
+          << ((col_params_.merge_op != nullptr) ? "tmp_chunk" : "chunk");
+  Tensor* dst_tensor = (!rf->second_pass && (col_params_.merge_op != nullptr))
+                           ? &rf->tmp_chunk
+                           : &rf->chunk;
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[rf->recv_dev_idx],
+                          col_params_.instance.task_names[rf->recv_dev_idx],
+                          col_params_.task.is_local[rf->recv_dev_idx],
+                          recv_buf_key, device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+string RingReducer::FieldState() {
+  string s = strings::StrCat("RingReducer ",
+                             strings::Hex(reinterpret_cast<uint64>(this)),
+                             " exec ", exec_key_, " step_id=", step_id_,
+                             " state of all ", rfv_.size(), " fields:");
+  for (int i = 0; i < rfv_.size(); ++i) {
+    s.append("\n");
+    s.append(rfv_[i].DebugString());
+  }
+  return s;
+}
+
+bool RingReducer::RunAsyncParts() {
+  // This function orchestrates RingReduce actions on behalf of a
+  // single device. It is entered by a blockable thread that
+  // loops within it until all actions assigned to that device
+  // complete. Hence function local variables are accessible only by that
+  // one thread and do not require an explicit mutex.
+  rfv_.clear();
+  rfv_.resize(group_size_ * num_subdivs_);
+  PCQueue ready_queue;
+  int field_done_count = 0;
+  int send_pending_count = 0;
+  int recv_pending_count = 0;
+  std::atomic<bool> aborted(false);
+  field_done_count = 0;
+  send_pending_count = 0;
+  recv_pending_count = 0;
+  for (int chunk_idx = 0; chunk_idx < group_size_; ++chunk_idx) {
+    for (int subdiv_idx = 0; subdiv_idx < num_subdivs_; ++subdiv_idx) {
+      int rf_index = (chunk_idx * num_subdivs_) + subdiv_idx;
+      InitRingField(&rfv_[rf_index], chunk_idx, subdiv_idx, rf_index);
+      ready_queue.Enqueue(&rfv_[rf_index]);
+    }
+  }
+
+  // Loop until all RingFields have advanced to completion.
+  while (field_done_count < rfv_.size()) {
+    VLOG(4) << FieldState();
+    // Wait for a RingField to appear in the ready_queue.
+    RingField* rf = ready_queue.Dequeue();
+    // Advance the RingField to its next action and execute, repeating
+    // until either an async action has been started or the RingField
+    // is done.
+    bool dispatched = false;  // true if async action was initiated
+    do {
+      if (aborted) {
+        // Requeue this RingField to be counted off below.
+        ready_queue.Enqueue(rf);
+        break;
+      }
+      switch (rf->action) {
+        case RF_INIT:
+          if (rf->do_recv) {
+            rf->action = RF_RECV;
+            auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
+              const bool bad_status = !s.ok();
+              if (bad_status) aborted = true;
+              ready_queue.Enqueue(rf);
+              if (bad_status) StartAbort(s);
+            };
+            DispatchRecv(rf, requeue);
+            dispatched = true;
+            ++recv_pending_count;
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_RECV:
+          CHECK_GT(recv_pending_count, 0);
+          --recv_pending_count;
+          if (!rf->second_pass) {
+            rf->action = RF_REDUCE;
+            Status s = ComputeBinOp(device_, col_params_.merge_op.get(),
+                                    &rf->chunk, &rf->tmp_chunk);
+            if (!s.ok()) {
+              aborted = true;
+              StartAbort(s);
+            }
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_REDUCE:
+          if (!rf->second_pass && col_params_.final_op.get() && rf->is_final) {
+            rf->action = RF_FINALIZE;
+            group_size_tensor_ready_.WaitForNotification();
+            Status s = ComputeBinOp(device_, col_params_.final_op.get(),
+                                    &rf->chunk, &group_size_tensor_);
+            if (!s.ok()) {
+              aborted = true;
+              StartAbort(s);
+            }
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_FINALIZE:
+          rf->action = RF_DONE;
+          break;
+        case RF_SEND_READY:
+          if (rf->do_send) {
+            rf->action = RF_SEND;
+            auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
+              const bool bad_status = !s.ok();
+              if (bad_status) aborted = true;
+              ready_queue.Enqueue(rf);
+              if (bad_status) StartAbort(s);
+            };
+            DispatchSend(rf, send_complete);
+            dispatched = true;
+            ++send_pending_count;
+          } else {
+            rf->action = RF_DONE;
+          }
+          break;
+        case RF_SEND:
+          CHECK_GT(send_pending_count, 0);
+          --send_pending_count;
+          rf->action = RF_DONE;
+          break;
+        case RF_DONE:
+          break;
+      }
+      if (rf->action == RF_DONE) {
+        if (rf->second_pass) {
+          ++field_done_count;
+          break;  // from do while(!dispatched)
+        } else {
+          AdvanceToSecondPass(rf);
+        }
+      }
+    } while (!dispatched);
+    if (aborted) break;
+  }  // while (field_done_count < number of fields)
+
+  if (aborted) {
+    // All of the pending data actions should be aborted; field the
+    // callbacks and clear the queue before quitting.
+    while ((send_pending_count > 0) || (recv_pending_count > 0)) {
+      RingField* rf = ready_queue.Dequeue();
+      switch (rf->action) {
+        case RF_RECV:
+          --recv_pending_count;
+          break;
+        case RF_SEND:
+          --send_pending_count;
+          break;
+        default: {}  // Ignore any other actions
+      }
+    }
+  }
+
+  CHECK_EQ(send_pending_count, 0);
+  CHECK_EQ(recv_pending_count, 0);
+
+  VLOG(2) << this << " rank=" << rank_ << " finish;"
+          << " final value " << TensorDebugString(ca_->Value());
+  return !aborted;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e1988e78706fc40f4f3c924d9612aa263f7f416
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
+
+#include <deque>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+
+// Ring-algorithm implementation of collective all-reduce.
+class RingReducer {
+ public:
+  RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* op_params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, const Tensor* input, Tensor* output);
+
+  virtual ~RingReducer();
+
+  void Run(StatusCallback done);
+
+ private:
+  // Called when a bad status is received that implies we should terminate
+  // execution and return a bad status.
+  void StartAbort(const Status& s);
+  void ContinueAfterInputCopy();
+  void Finish(bool ok);
+  Status ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
+                      Tensor* input);
+  bool RunAsyncParts();
+
+  // Used for executing a sub-operation, e.g. a merge_op instance, with
+  // an OpKernelContext based on the one passed into this Op.
+  class SubContext {
+   public:
+    OpKernelContext::Params sub_params_;
+    gtl::InlinedVector<TensorValue, 4> sub_inputs_;
+    gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
+    gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
+    // Used only for Binary and Unary Ops for which we require
+    // the calculation to be in-place on the first input.
+    int forward_from_ = 0;
+    OpKernelContext* sub_ctx_;
+    SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+               OpKernel* op, Tensor* output, Tensor* input);
+    ~SubContext() { delete sub_ctx_; }
+  };
+
+  // Current status of a RingField
+  enum RingFieldAction {
+    RF_INIT = 0,    // Just initialized for a pass
+    RF_RECV,        // Recv pending
+    RF_REDUCE,      // Reduce pending
+    RF_FINALIZE,    // FinalOp pending
+    RF_SEND_READY,  // Ready to send
+    RF_SEND,        // Send pending
+    RF_DONE,        // No more work
+  };
+
+  // Tracks progress of actions on a single subfield of the entire tensor.
+  struct RingField {
+    int16 chunk_idx;     // major division index
+    int16 subdiv_idx;    // minor division index
+    int16 sc_idx;        // subchunk index
+    int16 rank;          // rank within subdiv permutation
+    int16 recv_dev_idx;  // dev from which value should be recv'd
+    RingFieldAction action;
+    bool second_pass;
+    bool recv_is_remote = false;
+    bool send_is_remote = false;
+    bool do_send = false;   // is the value sent in this pass?
+    bool do_recv = false;   // is the value recv'd in this pass?
+    bool is_final = false;  // is the last field in the pass for this rank
+    Tensor chunk;           // alias to field values
+    Tensor tmp_chunk;
+    Status status;
+    string DebugString() const;
+  };
+  void AdvanceToSecondPass(RingField* rf);
+  void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                     int field_idx);
+  void DispatchSend(RingField* rf, const StatusCallback& done);
+  void DispatchRecv(RingField* rf, const StatusCallback& done);
+
+  // For constructing log messages for debugging.
+  string FieldState();
+  string TensorDebugString(Tensor tensor);
+
+  // Producer/Consumer Queue of RingField structs.
+  class PCQueue {
+   public:
+    void Enqueue(RingField* rf);
+    RingField* Dequeue();
+
+   private:
+    mutex pcq_mu_;
+    condition_variable cv_;
+    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
+    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
+  };
+
+  CollectiveExecutor* col_exec_;        // Not owned
+  const DeviceMgr* dev_mgr_;            // Not owned
+  OpKernelContext* ctx_;                // Not owned
+  OpKernelContext::Params* op_params_;  // Not owned
+  const CollectiveParams& col_params_;
+  const string exec_key_;
+  const Tensor* input_;  // Not owned
+  Tensor* output_;       // Not owned
+  const int rank_;
+  const int64 step_id_;
+  const int group_size_;
+  const int num_subdivs_;
+  Tensor group_size_tensor_;
+  Notification group_size_tensor_ready_;
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;
+  Device* device_;  // The device for which this instance labors
+  const string device_name_;
+  DeviceLocality device_locality_;
+
+  mutex status_mu_;
+  Status status_ GUARDED_BY(status_mu_);
+
+  std::vector<RingField> rfv_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4387a074af79f97e17d1f9f1d828157b738fa40
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -0,0 +1,606 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      done(errors::Internal("Deliberate failure"));
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                    const DeviceType& device_type,
+                                    DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) {
+    LOG(FATAL) << status;
+  }
+  return k;
+}
+
+std::unique_ptr<OpKernel> GetAdd(DataType dtype, const DeviceType& device_type,
+                                 DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Add");
+  TF_CHECK_OK(builder.Attr("T", dtype)
+                  .Input(FakeInput(dtype))
+                  .Input(FakeInput(dtype))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device_type, device);
+}
+
+std::unique_ptr<OpKernel> GetDiv(DataType dtype, const DeviceType& device_type,
+                                 DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Div");
+  TF_CHECK_OK(builder.Attr("T", dtype)
+                  .Input(FakeInput(dtype))
+                  .Input(FakeInput(dtype))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device_type, device);
+}
+
+static int64 kStepId = 123;
+
+class RingReducerTest : public ::testing::Test {
+ protected:
+  RingReducerTest() : device_type_(DEVICE_CPU) {}
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  ~RingReducerTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int num_subdivs, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name =
+              strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
+                 << " devices: ";
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.data_type = dtype;
+    col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs);
+    col_params_.subdiv_rank.resize(num_subdivs);
+    int subdiv_stride = num_devices / num_subdivs;
+    for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/cpu:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name =
+              strings::StrCat(task_name, "/gpu:", di % gpu_devices_.size());
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  void Reduce() {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoReduce();
+        ++done;
+      });
+    }
+    while (done < static_cast<int>(instances_.size())) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int num_subdivs, int tensor_len,
+               int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, num_subdivs, fail_after);
+    std::vector<T> expected(tensor_len, 0.0);
+    for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [&expected, dtype, di](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
+              float value = pow(10, static_cast<double>(di)) * i;
+              if (dtype == DT_INT32 || dtype == DT_INT64) {
+                value = di * 10 + i;
+              }
+              t->flat<T>()(i) = static_cast<T>(value);
+              expected[i] += value;
+            }
+          });
+    }
+    Reduce();
+    if (fail_after > 0) {
+      // Confirm that every device terminated with the expected error status.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        EXPECT_EQ("Deliberate failure",
+                  instances_[di]->status_.error_message());
+      }
+    } else {
+      // Confirm that every device computed the same correct reduction value.
+      for (int i = 0; i < tensor_len; ++i) {
+        expected[i] /= (num_workers * num_devices);
+      }
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        TF_EXPECT_OK(instances_[di]->status_);
+        Tensor* inst = &instances_[di]->tensor_;
+        CHECK(inst);
+        Tensor actual(dtype, TensorShape({tensor_len}));
+        if (device_type_ == DEVICE_CPU) {
+          CHECK(actual.CopyFrom(*inst, inst->shape()));
+          VLOG(1) << "actual " << actual.SummarizeValue(100);
+        } else if (device_type_ == DEVICE_GPU) {
+          Notification note;
+          Device* dev = instances_[di]->device_;
+          auto* dev_info = dev->tensorflow_gpu_device_info();
+          CHECK(dev_info);
+          dev_info->default_context->CopyDeviceTensorToCPU(
+              inst, "" /*tensor_name*/, dev, &actual, [&note](const Status& s) {
+                CHECK(s.ok());
+                note.Notify();
+              });
+          note.WaitForNotification();
+        }
+
+        for (int i = 0; i < tensor_len; ++i) {
+          switch (dtype) {
+            case DT_FLOAT:
+              EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_DOUBLE:
+              EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_INT32:
+            case DT_INT64:
+              EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            default:
+              LOG(FATAL) << "unimplemented";
+          }
+        }
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
+                                                Tensor* input,
+                                                const DeviceType& device_type,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_reduce_", reduce_counter_++),
+        "CollectiveReduce");
+    TF_CHECK_OK(
+        builder.Attr("T", params.instance.data_type)
+            .Attr("merge_op", "Add")
+            .Attr("final_op", "Id")
+            .Attr("group_size", params.group.group_size)
+            .Attr("group_key", params.group.group_key)
+            .Attr("instance_key", params.instance.instance_key)
+            .Attr("subdiv_offsets", params.instance.impl_details.subdiv_offsets)
+            .Input(FakeInput(params.instance.data_type))
+            .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, RingReducerTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_))
+          << "Couldn't find device " << dev_name
+          << " existing devices: " << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int num_subdivs = static_cast<int>(col_params_.subdiv_rank.size());
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size,
+               static_cast<int>(col_params_.instance.device_names.size()));
+      // Id of this device is at rank position in first subdiv perm.
+      int my_device_id =
+          col_params_.instance.impl_details.subdiv_permutations[0][rank];
+      col_params_.default_rank = my_device_id;
+      // Set rank for all other subdivs by finding that device_id.
+      for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+        for (int r = 0; r < static_cast<int>(col_params_.instance.impl_details
+                                                 .subdiv_permutations[sdi]
+                                                 .size());
+             ++r) {
+          if (my_device_id ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            break;
+          }
+        }
+      }
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        init_f(&tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        init_f(&cpu_tensor);
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        Notification note;
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [&note](const Status& s) {
+              CHECK(s.ok());
+              note.Notify();
+            });
+        note.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoReduce() {
+      col_params_.merge_op =
+          GetAdd(col_params_.instance.data_type, device_type_, device_);
+      col_params_.final_op =
+          GetDiv(col_params_.instance.data_type, device_type_, device_);
+
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from = 0;
+      op_params.forward_from_array = &forward_from;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op = parent_->GetCollectiveReduce(
+          col_params_, &tensor_, DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the
+      // output allocation that it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(),
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a RingReducer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      RingReducer rr(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                     &op_params, col_params_, exec_key, kStepId, &tensor_,
+                     &tensor_);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      SchedClosure([this, &notification, &rr]() {
+        rr.Run([this, &notification](Status s) {
+          status_ = s;
+          notification.Notify();
+        });
+      });
+      notification.WaitForNotification();
+      CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+
+      dev_ctx->Unref();
+    }
+
+    const Tensor& tensor() { return tensor_; }
+
+    RingReducerTest* parent_;
+    string dev_name_;
+    DeviceType device_type_;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };
+
+  bool stop_ = false;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<tensorflow::Device*> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+};
+
+#define DEF_TEST(B, T, W, D, S, L, A)                                         \
+  TEST_F(RingReducerTest,                                                     \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \
+    DataType dtype = DT_##B;                                                  \
+    switch (dtype) {                                                          \
+      case DT_FLOAT: {                                                        \
+        RunTest<float>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_DOUBLE: {                                                       \
+        RunTest<double>(dtype, DEVICE_##T, W, D, S, L, A);                    \
+      } break;                                                                \
+      case DT_INT32: {                                                        \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_INT64: {                                                        \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      default:                                                                \
+        LOG(FATAL) << "Unimplemented";                                        \
+    }                                                                         \
+  }
+
+#ifndef GOOGLE_CUDA
+// Success tests
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 3, 1045991, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 4, 1045991, 0)
+DEF_TEST(DOUBLE, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(DOUBLE, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(INT32, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+#endif
+
+#ifdef GOOGLE_CUDA
+// GPU tests.  So long as the device names are all in a single tasks we
+// bypass inter-worker routing code and can fake multiple GPUs with a single
+// GPU, from the perspective of the RingReducer logic.  So these tests
+// are all single-worker.
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 3, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 3, 1045991, 0)
+DEF_TEST(FLOAT, GPU, 1, 4, 4, 1045991, 0)
+DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
+// INT32 values are never on the GPU.
+// DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/scoped_allocator.cc b/tensorflow/core/common_runtime/scoped_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a26672b79dab87d66ac98a8436cc7e2df7473677
--- /dev/null
+++ b/tensorflow/core/common_runtime/scoped_allocator.cc
@@ -0,0 +1,210 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+
+namespace tensorflow {
+
+ScopedAllocator::ScopedAllocator(const Tensor& backing_tensor, int32 scope_id,
+                                 const string& name,
+                                 const gtl::ArraySlice<Field>& fields,
+                                 int32 expected_call_count,
+                                 ScopedAllocatorContainer* container)
+    : backing_tensor_(backing_tensor),
+      tbuf_(backing_tensor_.buf_),
+      id_(scope_id),
+      name_(name),
+      container_(container),
+      fields_(fields.begin(), fields.end()),
+      expected_call_count_(expected_call_count),
+      live_alloc_count_(0) {
+  // Hold this until all aliases have been deallocated.
+  tbuf_->Ref();
+  // Hold this until all expected_calls have been made.
+  container->Ref();
+  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes);
+}
+
+ScopedAllocator::~ScopedAllocator() {
+  mutex_lock l(mu_);
+  VLOG(1) << "~ScopedAllocator " << this << " tbuf_ " << tbuf_ << " data "
+          << static_cast<void*>(tbuf_->data());
+  // In the absence of incomplete graph execution situations
+  // (interruption by error status or control flow branch crossing
+  // ScopedAllocation region) we expect expected_call_count_ == 0 at
+  // exit.
+  if (VLOG_IS_ON(1)) {
+    if (expected_call_count_ > 0)
+      VLOG(1) << "expected_call_count_ = " << expected_call_count_
+              << " at deallocation";
+  }
+  if (tbuf_) tbuf_->Unref();
+}
+
+void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
+  VLOG(1) << "ScopedAllocator index " << id_ << " AllocateRaw "
+          << "field " << field_index << " num_bytes " << num_bytes;
+  mutex_lock l(mu_);
+  if (expected_call_count_ <= 0) {
+    LOG(ERROR) << "Scoped allocator " << name_
+               << " could not satisfy request for " << num_bytes
+               << " bytes, expected uses exhausted. ";
+    return nullptr;
+  }
+
+  int32_t num_fields = static_cast<int32>(fields_.size());
+  if (field_index >= num_fields) {
+    LOG(ERROR) << "ScopedAllocator " << name_
+               << " received unexpected field number " << field_index;
+    return nullptr;
+  }
+
+  const Field& f = fields_[field_index];
+  if (num_bytes != f.bytes) {
+    LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
+               << num_bytes << " bytes from field " << field_index
+               << " which has precalculated size " << f.bytes << " and offset "
+               << f.offset;
+    return nullptr;
+  }
+
+  void* ptr = static_cast<void*>((tbuf_->template base<char>() + f.offset));
+
+  ++live_alloc_count_;
+  --expected_call_count_;
+  if (0 == expected_call_count_) {
+    for (auto& f : fields_) {
+      container_->Drop(f.scope_id, this);
+    }
+    container_->Drop(id_, this);
+    container_->Unref();
+    container_ = nullptr;
+  }
+  VLOG(1) << "AllocateRaw returning " << ptr;
+  return ptr;
+}
+
+void ScopedAllocator::DeallocateRaw(void* p) {
+  CHECK(VerifyPointer(p));
+
+  bool dead = false;
+  {
+    mutex_lock l(mu_);
+    CHECK_GT(live_alloc_count_, 0);
+    if (0 == --live_alloc_count_) {
+      if (0 == expected_call_count_) {
+        dead = true;
+      }
+    }
+  }
+  if (dead) {
+    delete this;
+  }
+}
+
+bool ScopedAllocator::VerifyPointer(const void* p) {
+  void* base = tbuf_->data();
+  CHECK_GE(p, base);
+  for (auto& f : fields_) {
+    void* f_ptr = static_cast<void*>(static_cast<char*>(base) + f.offset);
+    if (f_ptr == p) {
+      return true;
+      break;
+    }
+  }
+  VLOG(1) << "ScopedAllocator index " << id_ << " VerifyPointer for p=" << p
+          << " failed.";
+  return false;
+}
+
+bool ScopedAllocator::VerifyTensor(const Tensor* t) {
+  return VerifyPointer(t->buf_->data());
+}
+
+ScopedAllocatorInstance::ScopedAllocatorInstance(ScopedAllocator* sa,
+                                                 int32 field_index)
+    : scoped_allocator_(sa),
+      field_index_(field_index),
+      allocated_(false),
+      deallocated_(false),
+      in_table_(true) {
+  VLOG(1) << "new ScopedAllocatorInstance " << this << " on SA " << sa
+          << " field_index " << field_index;
+}
+
+void ScopedAllocatorInstance::DropFromTable() {
+  bool del = false;
+  {
+    mutex_lock l(mu_);
+    CHECK(in_table_);
+    in_table_ = false;
+    VLOG(2) << "ScopedAllocatorInstance::DropFromTable " << this
+            << " allocated_ " << allocated_ << " deallocated_ " << deallocated_
+            << " in_table_ " << in_table_;
+    // Single use is complete when it is allocated and deallocated.
+    // This check prevents a race between Allocating the tensor slice and
+    // Dropping it from the parent container's table.
+    if (allocated_ && deallocated_) {
+      del = true;
+    }
+  }
+  if (del) delete this;
+}
+
+void* ScopedAllocatorInstance::AllocateRaw(size_t alignment, size_t num_bytes) {
+  void* ptr = scoped_allocator_->AllocateRaw(field_index_, num_bytes);
+  {
+    mutex_lock l(mu_);
+    if (nullptr == ptr) {
+      VLOG(2) << "ScopedAllocatorInstance::AllocateRaw " << this
+              << " call to underlying ScopedAllocator unsuccessful,"
+              << " allocated_ " << allocated_ << " deallocated_ "
+              << deallocated_ << " in_table_ " << in_table_
+              << " returning nullptr.";
+    } else {
+      allocated_ = true;
+      VLOG(2) << "ScopedAllocatorInstance::AllocateRaw " << this
+              << " allocated_ " << allocated_ << " deallocated_ "
+              << deallocated_ << " in_table_ " << in_table_
+              << " returning ptr = " << ptr;
+    }
+  }
+  return ptr;
+}
+
+void ScopedAllocatorInstance::DeallocateRaw(void* p) {
+  scoped_allocator_->DeallocateRaw(p);
+  bool del = false;
+  {
+    mutex_lock l(mu_);
+    CHECK(allocated_);
+    deallocated_ = true;
+    VLOG(2) << "ScopedAllocatorInstance::DeallocateRaw " << this
+            << " allocated_ " << allocated_ << " deallocated_ " << deallocated_
+            << " in_table_ " << in_table_;
+    // Single use is now complete, but only delete this instance when it is
+    // no longer in a ScopedAllocatorContainer's table.
+    if (!in_table_) {
+      del = true;
+    }
+  }
+  if (del) delete this;
+}
+
+string ScopedAllocatorInstance::Name() {
+  return strings::StrCat(scoped_allocator_->name(), "_field_", field_index_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/scoped_allocator.h b/tensorflow/core/common_runtime/scoped_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..71a7001217352cae2c374ad1f11a13482e940be2
--- /dev/null
+++ b/tensorflow/core/common_runtime/scoped_allocator.h
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_H_
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+class ScopedAllocatorContainer;
+class ScopedAllocatorInstance;
+
+// Manages a single backing tensor and a collection of aliases.
+class ScopedAllocator {
+ public:
+  static const int32 kInvalidId = 0;
+  static const size_t kMaxAlignment = 64;
+
+  // A subrange of the TensorBuffer associated with this object that
+  // will be the backing memory for one aliased tensor.
+  struct Field {
+    int32 scope_id;
+    size_t offset;
+    size_t bytes;
+  };
+  // Field index that refers to backing tensor, not any aliased field.
+  static const int32 kBackingIndex = -1;
+
+  // backing_tensor is expected to be newly allocated by a ScopedAllocatorOp
+  // instance.  It must be large enough to back all of the specified
+  // (offset, byte) ranges of the fields.
+  ScopedAllocator(const Tensor& backing_tensor, int32 scope_id,
+                  const string& name, const gtl::ArraySlice<Field>& fields,
+                  int32 expected_call_count,
+                  ScopedAllocatorContainer* container);
+
+  // Automatically deletes when last use expires, or when
+  // ScopedAllocatorContainer decides to delete.
+  ~ScopedAllocator() LOCKS_EXCLUDED(mu_);
+
+  // For debugging: returns true iff p is a pointer that could have
+  // been returned by AllocateRaw.
+  bool VerifyPointer(const void* p);
+  bool VerifyTensor(const Tensor* t);
+
+  const Tensor& tensor() const { return backing_tensor_; }
+
+  const string& name() const { return name_; }
+
+ private:
+  friend class ScopedAllocatorInstance;
+  // Only ScopedAllocatorInstances can call AllocateRaw and DeallocateRaw on a
+  // ScopedAllocator
+  void* AllocateRaw(int32 field_index, size_t num_bytes) LOCKS_EXCLUDED(mu_);
+  void DeallocateRaw(void* p) LOCKS_EXCLUDED(mu_);
+  Tensor backing_tensor_;
+  TensorBuffer* tbuf_;
+  int32 id_;
+  string name_;
+  ScopedAllocatorContainer* container_;
+  std::vector<Field> fields_;
+  mutex mu_;
+  int32 expected_call_count_ GUARDED_BY(mu_);
+  int32 live_alloc_count_ GUARDED_BY(mu_);
+};
+
+// An Allocator that will return a pointer into the backing buffer of
+// a previously allocated tensor, allowing creation of an alias
+// tensor.  There is a one-to-one mapping between the fields of a
+// ScopedAllocator and ScopedAllocatorInstances.  There is also a one-to-one
+// mapping between scope_ids and ScopedAllocatorInstances.  It should be
+// discarded immediately after a single use.
+class ScopedAllocatorInstance : public Allocator {
+ public:
+  explicit ScopedAllocatorInstance(ScopedAllocator* sa, int32 field_index);
+
+ private:
+  ~ScopedAllocatorInstance() { VLOG(1) << "~ScopedAllocatorInstance " << this; }
+
+ public:
+  // When a ScopedAllocatorContainer "Drops" a scope_id, it calls DropFromTable
+  // on the underlying ScopedAllocatorInstance.  If this instance has already
+  // deallocated the tensor slice, we can safely delete this.
+  void DropFromTable() LOCKS_EXCLUDED(mu_);
+  void* AllocateRaw(size_t alignment, size_t num_bytes)
+      LOCKS_EXCLUDED(mu_) override;
+  void* AllocateRaw(size_t alignment, size_t num_bytes,
+                    const AllocationAttributes& allocator_attr) override {
+    return AllocateRaw(alignment, num_bytes);
+  }
+  void DeallocateRaw(void* p) LOCKS_EXCLUDED(mu_) override;
+  bool TracksAllocationSizes() override { return false; }
+  bool ShouldAllocateEmptyTensors() override { return false; }
+  size_t RequestedSize(const void* ptr) override { return 0; }
+  size_t AllocatedSize(const void* ptr) override { return 0; }
+  int64 AllocationId(const void* ptr) override { return 0; }
+  size_t AllocatedSizeSlow(const void* ptr) override { return 0; }
+  string Name() override;
+
+ private:
+  mutex mu_;
+  ScopedAllocator* scoped_allocator_;
+  int32 field_index_;
+  bool allocated_ GUARDED_BY(mu_);
+  bool deallocated_ GUARDED_BY(mu_);
+  bool in_table_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be79cc4507124fbbef8104f87773045c07005ef6
--- /dev/null
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
+
+namespace tensorflow {
+
+Status ScopedAllocatorContainer::AddScopedAllocator(
+    const Tensor& backing_tensor, int32 scope_id, const string& scope_name,
+    const gtl::ArraySlice<ScopedAllocator::Field>& fields,
+    int32 expected_call_count) {
+  VLOG(1) << "AddScopedAllocator " << mgr_->device_name()
+          << " step_id_=" << step_id_ << " scope_id=" << scope_id;
+  mutex_lock l(mu_);
+  // Ensure none of the new scope_ids are in use.
+  auto it = allocators_.find(scope_id);
+  if (it != allocators_.end()) {
+    return errors::Internal("Cannot create ScopedAllocator because scope_id ",
+                            scope_id, " for name ", scope_name,
+                            " already exists");
+  }
+  for (auto& f : fields) {
+    if (allocators_.find(f.scope_id) != allocators_.end()) {
+      return errors::Internal(
+          "Cannot create ScopedAllocator because field scope_id ", f.scope_id,
+          " for name ", scope_name, " already exists");
+    }
+  }
+  VLOG(2) << " container " << this << " step_id " << step_id_;
+  ScopedAllocator* sa = new ScopedAllocator(
+      backing_tensor, scope_id, scope_name, fields, expected_call_count, this);
+  allocators_[scope_id] =
+      ScopedAllocatorContainer::SAField(ScopedAllocator::kBackingIndex, sa);
+  VLOG(2) << "#fields " << fields.size();
+  for (int i = 0; i < fields.size(); ++i) {
+    const ScopedAllocator::Field& f = fields[i];
+    VLOG(2) << "Adding instance with for " << mgr_->device_name()
+            << " scope_id=" << f.scope_id;
+    allocators_[f.scope_id] = ScopedAllocatorContainer::SAField(
+        i, new ScopedAllocatorInstance(sa, i));
+  }
+  return Status::OK();
+}
+
+ScopedAllocator* ScopedAllocatorContainer::GetAllocator(int32 scope_id) {
+  mutex_lock l(mu_);
+  auto it = allocators_.find(scope_id);
+  if (it != allocators_.end()) {
+    CHECK_EQ(ScopedAllocator::kBackingIndex, it->second.field_index);
+    return it->second.scoped_allocator;
+  } else {
+    LOG(ERROR) << "Failed to find ScopedAllocator for " << scope_id
+               << " in container for step " << step_id_ << " on "
+               << mgr_->device_name();
+    return nullptr;
+  }
+}
+
+ScopedAllocatorInstance* ScopedAllocatorContainer::GetInstance(int32 scope_id) {
+  VLOG(2) << "GetInstance " << scope_id << " step " << step_id_ << " on "
+          << mgr_->device_name();
+  mutex_lock l(mu_);
+  auto it = allocators_.find(scope_id);
+  if (it != allocators_.end()) {
+    return it->second.instance;
+  }
+  LOG(FATAL) << "Failed to find instance " << scope_id << " in container "
+             << step_id_ << " on " << mgr_->device_name();
+  return nullptr;
+}
+
+void ScopedAllocatorContainer::Drop(int32 scope_id, ScopedAllocator* sa) {
+  VLOG(2) << "Drop " << scope_id << " from container " << this << " step "
+          << step_id_ << " on " << mgr_->device_name();
+  mutex_lock l(mu_);
+  auto it = allocators_.find(scope_id);
+  if (it != allocators_.end()) {
+    if (it->second.field_index != ScopedAllocator::kBackingIndex) {
+      it->second.instance->DropFromTable();
+    }
+    allocators_.erase(it);
+  }
+}
+
+ScopedAllocatorContainer::~ScopedAllocatorContainer() {
+  VLOG(2) << "~ScopedAllocatorContainer " << this << " step " << step_id_
+          << " on " << mgr_->device_name();
+  mutex_lock l(mu_);
+  // In normal execution the table should be empty and all of its
+  // contents deleted via Drop.  When when a step ends early
+  // (e.g. through abnormal termination) we need to clean up
+  // explicitly.  So long as graph execution of the associated step has
+  // completey terminated this should be safe.
+  for (auto& it : allocators_) {
+    if (it.second.field_index == ScopedAllocator::kBackingIndex) {
+      delete it.second.scoped_allocator;
+    } else {
+      it.second.instance->DropFromTable();
+    }
+  }
+}
+
+ScopedAllocatorMgr::~ScopedAllocatorMgr() {
+  mutex_lock l(mu_);
+  for (auto it : per_step_map_) {
+    // In normal execution the associated ScopedAllocatorContainer is
+    // empty and gone by the end of the step.  But in abnormal termination,
+    // such as when an error has interrupted execution or in a unittest,
+    // we need to remove all of its Refs here to avoid memory leaks.
+    // This is safe so long as graph execution has ceased.
+    while (!it.second->Unref()) {
+    }
+  }
+}
+
+void ScopedAllocatorMgr::Cleanup(int64 step_id) {
+  mutex_lock l(mu_);
+  auto it = per_step_map_.find(step_id);
+  if (it != per_step_map_.end()) {
+    it->second->Unref();
+    per_step_map_.erase(it);
+  }
+}
+
+ScopedAllocatorContainer* ScopedAllocatorMgr::GetContainer(int64 step_id) {
+  VLOG(2) << "GetContainer " << step_id << " on " << device_name();
+  ScopedAllocatorContainer* sac = nullptr;
+  mutex_lock l(mu_);
+  auto it = per_step_map_.find(step_id);
+  if (it == per_step_map_.end()) {
+    sac = new ScopedAllocatorContainer(this, step_id);
+    per_step_map_[step_id] = sac;
+  } else {
+    sac = it->second;
+  }
+  return sac;
+}
+
+Status ScopedAllocatorMgr::AddScopedAllocator(
+    const Tensor& backing_tensor, int64 step_id, int32 scope_id,
+    const string& scope_name,
+    const gtl::ArraySlice<ScopedAllocator::Field>& fields,
+    int32 expected_call_count) {
+  ScopedAllocatorContainer* sac = GetContainer(step_id);
+  return sac->AddScopedAllocator(backing_tensor, scope_id, scope_name, fields,
+                                 expected_call_count);
+}
+
+void ScopedAllocatorMgr::PopulateFields(
+    int32 scope_id, const gtl::ArraySlice<TensorShape>& shapes,
+    const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
+  const int32 num_fields = static_cast<int32>(shapes.size());
+  fields->resize(num_fields);
+  size_t offset = 0;
+  for (int32 i = 0; i < num_fields; ++i) {
+    size_t bytes = shapes[i].num_elements() * DataTypeSize(dtype);
+    (*fields)[i].scope_id = scope_id + 1 + i;
+    (*fields)[i].bytes = bytes;
+    (*fields)[i].offset = offset;
+    VLOG(1) << "field=" << i << " scope_id=" << (*fields)[i].scope_id
+            << " bytes=" << (*fields)[i].bytes
+            << " offset=" << (*fields)[i].offset;
+    offset += bytes;
+    size_t overshoot = offset % Allocator::kAllocatorAlignment;
+    if (overshoot > 0) {
+      offset += (Allocator::kAllocatorAlignment - overshoot);
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.h b/tensorflow/core/common_runtime/scoped_allocator_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..effc5f2d775336621a783d83b7dd5eece6d42292
--- /dev/null
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.h
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_MGR_H_
+
+#include <string>
+#include <unordered_map>
+
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+class ScopedAllocatorMgr;
+
+// At most one of these exists per <device, step_id> pair.
+// A Ref is held by every ScopedAllocator and also by the ScopedAllocatorMgr.
+class ScopedAllocatorContainer : public core::RefCounted {
+ public:
+  // Establishes a reachable ScopedAllocator.
+  Status AddScopedAllocator(
+      const Tensor& backing_tensor, int32 scope_id, const string& scope_name,
+      const gtl::ArraySlice<ScopedAllocator::Field>& fields,
+      int32 expected_call_count);
+
+  ScopedAllocatorInstance* GetInstance(int32 scope_id);
+  ScopedAllocator* GetAllocator(int32 scope_id);
+
+  // Retire the scope_id.
+  void Drop(int32 scope_id, ScopedAllocator* sa);
+
+ protected:
+  friend class ScopedAllocatorMgr;
+  ScopedAllocatorContainer(const ScopedAllocatorMgr* mgr, int64 step_id)
+      : mgr_(mgr), step_id_(step_id) {}
+  ~ScopedAllocatorContainer();
+
+ private:
+  const ScopedAllocatorMgr* mgr_;
+  int64 step_id_;
+  mutex mu_;
+  struct SAField {
+    int32 field_index;
+    union {
+      ScopedAllocator* scoped_allocator;
+      ScopedAllocatorInstance* instance;
+    };
+    SAField(int32 fi, ScopedAllocatorInstance* sai)
+        : field_index(fi), instance(sai) {}
+    SAField(int32 fi, ScopedAllocator* sa)
+        : field_index(fi), scoped_allocator(sa) {}
+    SAField()
+        : field_index(ScopedAllocator::kBackingIndex),
+          scoped_allocator(nullptr) {}
+  };
+  std::unordered_map<int32, SAField> allocators_ GUARDED_BY(mu_);
+};
+
+// At most one of these exists per device.
+class ScopedAllocatorMgr {
+ public:
+  explicit ScopedAllocatorMgr(const string& device_name)
+      : device_name_(device_name) {}
+  ~ScopedAllocatorMgr();
+
+  ScopedAllocatorContainer* GetContainer(int64 step_id);
+
+  // Establishes a reachable ScopedAllocator.
+  Status AddScopedAllocator(
+      const Tensor& backing_tensor, int64 step_id, int32 scope_id,
+      const string& scope_name,
+      const gtl::ArraySlice<ScopedAllocator::Field>& fields,
+      int32 expected_call_count);
+
+  void Cleanup(int64 step_id);
+
+  // Populate the bytes and offset members of Field.  Instance allocaters get
+  // consecutive scope_id values following that of the base ScopedAllocator.
+  static void PopulateFields(int32 scope_id,
+                             const gtl::ArraySlice<TensorShape>& shapes,
+                             const DataType dtype,
+                             std::vector<ScopedAllocator::Field>* fields);
+
+  const string& device_name() const { return device_name_; }
+
+ private:
+  string device_name_;
+  mutex mu_;
+  std::unordered_map<int64, ScopedAllocatorContainer*> per_step_map_
+      GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SCOPED_ALLOCATOR_MGR_H_
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38e07e47f24d6808e858721f8e7832668a556164
--- /dev/null
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
@@ -0,0 +1,227 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class ScopedAllocatorMgrTest : public ::testing::Test {
+ public:
+  ScopedAllocatorMgrTest() : sam_("CPU0") {}
+
+  void InitTensor() {
+    backing_tensor_ = Tensor(cpu_allocator(), DT_FLOAT, backing_tensor_shape_);
+  }
+
+  void PopulateFields() {
+    ScopedAllocatorMgr::PopulateFields(scope_id_, fields_shapes_, DT_FLOAT,
+                                       &fields_);
+  }
+
+  Status AddScopedAllocator(int expected_use_count, int scope_id) {
+    VLOG(2) << "Adding ScopedAllocator step_id " << step_id_ << " scope_id "
+            << scope_id_ << " #fields " << fields_.size()
+            << " expected_use_count " << expected_use_count;
+    return sam_.AddScopedAllocator(backing_tensor_, step_id_, scope_id,
+                                   "tensor_shape_599", fields_,
+                                   expected_use_count);
+  }
+
+  Status PrepScopedAllocatorMgr(int expected_use_count) {
+    InitTensor();
+    PopulateFields();
+    return AddScopedAllocator(expected_use_count, scope_id_);
+  }
+
+  void SaveInstances(int num_instances) {
+    sa_instances_.clear();
+    sa_instances_.resize(num_instances);
+    ScopedAllocatorContainer* sac = sam_.GetContainer(step_id_);
+    for (int i = 0; i < num_instances; i++) {
+      sa_instances_[i] = sac->GetInstance(scope_id_ + 1 + i);
+    }
+  }
+
+  // For the specific case when the backing tensor is of shape
+  // {512 + 9 + 512 + 16} and the fields_shapes are {{512}, {3,3}, {2, 256}}
+  // This method computes the padding between the second and third slice of the
+  // backing tensor.  This example is reused across multiple tests.
+  int AlignmentPadding() {
+    int alignment_padding =
+        (Allocator::kAllocatorAlignment -
+         (521 * sizeof(float)) % Allocator::kAllocatorAlignment) %
+        Allocator::kAllocatorAlignment;
+    return alignment_padding;
+  }
+
+  // Debug
+  void PrintShapes() {
+    VLOG(2) << "tensor_shape=" << backing_tensor_shape_.DebugString();
+    for (int i = 0; i < fields_shapes_.size(); i++) {
+      VLOG(2) << "fields_shapes[" << i
+              << "]=" << fields_shapes_[i].DebugString();
+    }
+  }
+
+ protected:
+  TensorShape backing_tensor_shape_;
+  Tensor backing_tensor_;
+  std::vector<TensorShape> fields_shapes_;
+  std::vector<ScopedAllocator::Field> fields_;
+  ScopedAllocatorMgr sam_;
+  const int step_id_ = 101;
+  const int scope_id_ = 599;
+  std::vector<ScopedAllocatorInstance*> sa_instances_;
+};
+
+TEST_F(ScopedAllocatorMgrTest, ContainerAllocation) {
+  ScopedAllocatorContainer* sac_101 = sam_.GetContainer(101);
+  EXPECT_TRUE(sac_101 != nullptr);
+  ScopedAllocatorContainer* sac_201 = sam_.GetContainer(201);
+  EXPECT_TRUE(sac_201 != nullptr);
+  EXPECT_NE(sac_101, sac_201);
+  ScopedAllocatorContainer* also_sac_101 = sam_.GetContainer(101);
+  EXPECT_EQ(sac_101, also_sac_101);
+  sam_.Cleanup(101);
+  // 201 should be cleaned up by the destructor.
+}
+
+TEST_F(ScopedAllocatorMgrTest, PopulateFields) {
+  backing_tensor_shape_ = TensorShape({512 + 9 + 512 + 16});
+  fields_shapes_ = std::vector<TensorShape>({{512}, {3, 3}, {2, 256}});
+  InitTensor();
+  PopulateFields();
+  EXPECT_EQ(0, fields_[0].offset);
+  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes);
+  EXPECT_EQ(scope_id_ + 1, fields_[0].scope_id);
+  EXPECT_EQ(512 * sizeof(float), fields_[1].offset);
+  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes);
+  EXPECT_EQ(scope_id_ + 2, fields_[1].scope_id);
+  EXPECT_EQ(521 * sizeof(float) + AlignmentPadding(), fields_[2].offset);
+  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes);
+  EXPECT_EQ(scope_id_ + 3, fields_[2].scope_id);
+}
+
+TEST_F(ScopedAllocatorMgrTest, ContainerAddAllocator) {
+  backing_tensor_shape_ = TensorShape({1024});
+  fields_shapes_ = std::vector<TensorShape>({{512}, {512}});
+  Status s = PrepScopedAllocatorMgr(2);
+  EXPECT_TRUE(s.ok());
+  // Need to call Allocate and Deallocate in order to use up the expected uses
+  // for this allocator.  Save the instances for now.
+  SaveInstances(fields_shapes_.size());
+
+  s = AddScopedAllocator(2, scope_id_);
+  EXPECT_FALSE(s.ok());
+  fields_[0].scope_id = scope_id_ + 1;
+  s = AddScopedAllocator(2, scope_id_ + 3);
+  EXPECT_FALSE(s.ok());
+
+  // Cleanup the instances by invoking allocate and deallocate.
+  void* ptr0 =
+      sa_instances_[0]->AllocateRaw(0 /* alignment */, 512 * sizeof(float));
+  void* ptr1 =
+      sa_instances_[1]->AllocateRaw(0 /* alignment */, 512 * sizeof(float));
+  sa_instances_[0]->DeallocateRaw(ptr0);
+  sa_instances_[1]->DeallocateRaw(ptr1);
+}
+
+TEST_F(ScopedAllocatorMgrTest, AllocatorSuccess) {
+  ScopedAllocatorContainer* sac = sam_.GetContainer(step_id_);
+  ScopedAllocator* other = sac->GetAllocator(scope_id_);
+  EXPECT_EQ(other, nullptr);
+  backing_tensor_shape_ = TensorShape({512 + 9 + 512 + 16});
+  fields_shapes_ = std::vector<TensorShape>({{512}, {3, 3}, {2, 256}});
+  Status s = PrepScopedAllocatorMgr(3);
+  other = sac->GetAllocator(scope_id_);
+
+  ScopedAllocatorInstance* inst0 = sac->GetInstance(scope_id_ + 1);
+  char* ptr0 = static_cast<char*>(inst0->AllocateRaw(0, 512 * sizeof(float)));
+  const char* base =
+      static_cast<const char*>(DMAHelper::base(&backing_tensor_));
+  EXPECT_EQ(ptr0, base);
+
+  ScopedAllocatorInstance* inst1 = sac->GetInstance(scope_id_ + 2);
+  char* ptr1 = static_cast<char*>(inst1->AllocateRaw(0, 9 * sizeof(float)));
+  EXPECT_EQ(ptr1, ptr0 + (512 * sizeof(float)));
+
+  ScopedAllocatorInstance* inst2 = sac->GetInstance(scope_id_ + 3);
+  char* ptr2 = static_cast<char*>(inst2->AllocateRaw(0, 512 * sizeof(float)));
+  EXPECT_EQ(ptr2, ptr1 + AlignmentPadding() + (9 * sizeof(float)));
+
+  // At this point the scopes should be gone from the container
+  EXPECT_EQ(nullptr, sac->GetAllocator(scope_id_));
+
+  // The ScopedAllocatorInstances automatically delete when their memory
+  // is returned and they are out of table.
+  inst0->DeallocateRaw(ptr0);
+  inst1->DeallocateRaw(ptr1);
+  inst2->DeallocateRaw(ptr2);
+}
+
+// ScopedAllocator initialization should fail because backing_tensor is not
+// large enough to hold all the fields
+TEST_F(ScopedAllocatorMgrTest, AllocatorInitFail) {
+  backing_tensor_shape_ = TensorShape({8});
+  InitTensor();
+  fields_.resize(1);
+  fields_[0].scope_id = scope_id_ + 1;
+  fields_[0].offset = 0;
+  fields_[0].bytes = backing_tensor_shape_.num_elements() * 2 * sizeof(float);
+  // fields[0].offset + fields[0].bytes is larger than the size of the backing
+  // tensor, so this check should fail
+  EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
+}
+
+// ScopedAllocator allocation should fail because we called more times than
+// expected, or we deallocated a non-existent pointer, or we requested more
+// or less than the exact size of an instance buffer.
+TEST_F(ScopedAllocatorMgrTest, AllocatorFail) {
+  backing_tensor_shape_ = TensorShape({1024});
+  fields_shapes_ = std::vector<TensorShape>({{512}, {512}});
+  Status s = PrepScopedAllocatorMgr(2);
+  EXPECT_TRUE(s.ok());
+  // Save instances so that we can explicitly delete later on.  In normal
+  // operation the instances will be automatically deleted after single use, but
+  // in this test we are invoking the ScopedAllocator's Alloc/Dealloc interface,
+  // so we need to explicitly delete the instances to avoid a memleak.
+  SaveInstances(fields_shapes_.size());
+
+  char* ptr0 =
+      static_cast<char*>(sa_instances_[0]->AllocateRaw(0, 512 * sizeof(float)));
+  VLOG(2) << "Should fail because we deallocate ptr="
+          << static_cast<void*>(ptr0 + 8) << " which we never allocated.";
+  EXPECT_DEATH(sa_instances_[0]->DeallocateRaw(ptr0 + 8), "");
+  VLOG(2) << "Should fail because we allocate smaller than the size of the "
+          << "field.";
+  EXPECT_EQ(nullptr, sa_instances_[1]->AllocateRaw(0, 256 * sizeof(float)));
+  VLOG(2) << "Should fail because we allocate larger than the size of the "
+          << "field.";
+  EXPECT_EQ(nullptr, sa_instances_[1]->AllocateRaw(0, 1024 * sizeof(float)));
+  void* ptr1 = sa_instances_[1]->AllocateRaw(0, 512 * sizeof(float));
+  VLOG(2) << "Should fail because we exceed expected_use_count.";
+  EXPECT_EQ(nullptr, sa_instances_[0]->AllocateRaw(0, 512 * sizeof(float)));
+  sa_instances_[0]->DeallocateRaw(ptr0);
+  sa_instances_[1]->DeallocateRaw(ptr1);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/session_test.cc b/tensorflow/core/common_runtime/session_test.cc
index a074154450694e2135e07f345082393773d97084..feaf29c7bb528c6019da3ae273681997173fd372 100644
--- a/tensorflow/core/common_runtime/session_test.cc
+++ b/tensorflow/core/common_runtime/session_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/public/session.h"
 
 #include "tensorflow/core/common_runtime/session_factory.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -31,10 +32,9 @@ TEST(SessionTest, InvalidTargetReturnsNull) {
   Session* session;
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::NOT_FOUND);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No session factory registered for the given session options"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No session factory registered for the given session options"));
 }
 
 // Register a fake session factory to test error handling paths in
@@ -44,7 +44,7 @@ class FakeSessionFactory : public SessionFactory {
   FakeSessionFactory() {}
 
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with("fake");
+    return str_util::StartsWith(options.target, "fake");
   }
 
   Session* NewSession(const SessionOptions& options) override {
@@ -68,9 +68,9 @@ TEST(SessionTest, MultipleFactoriesForTarget) {
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::INTERNAL);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("Multiple session factories"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_1"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_2"));
+      str_util::StrContains(s.error_message(), "Multiple session factories"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_1"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_2"));
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 45cdab98e0642a3fbfee3dfa415696b98251600a..06dbe049868b2f85e8ebcabe4df5cec2170486b4 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/eval_const_tensor.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -211,14 +212,14 @@ Status ShapeRefiner::AddNode(const Node* node) {
   // For each 'input' of this node, fetch the corresponding shape
   // from 'input's InferenceContext, and store into a vector
   // indexed by 'node's input.
-  std::vector<Node*> input_nodes(node->num_inputs());
+  std::vector<const Node*> input_nodes(node->num_inputs());
   std::vector<ShapeHandle> input_shapes(node->num_inputs());
   std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
       input_handle_shapes_and_types(node->num_inputs());
   for (const Edge* e : node->in_edges()) {
     if (e->IsControlEdge()) continue;
 
-    Node* input = e->src();
+    const Node* input = e->src();
     auto it = node_to_context_.find(input);
     if (it == node_to_context_.end()) {
       return errors::FailedPrecondition(
@@ -350,6 +351,11 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
         }
       }
     }
+    if (node_context->requested_input_tensor_as_partial_shape(dst_input)) {
+      // The input value may have changed. Since we have no way to know if
+      // that's indeed the case, err on the safe side.
+      *refined = true;
+    }
 
     // Also propagate handle shape and dtype of edges which are carrying
     // resource handles.
@@ -407,301 +413,13 @@ Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node,
                                                    int dst_idx, bool* evaluated,
                                                    Tensor* result) {
   *evaluated = false;
-
   const Edge* input_edge;
   TF_RETURN_IF_ERROR(node->input_edge(dst_idx, &input_edge));
-
-  // Simple case: the source node is a constant
-  const Node* src = input_edge->src();
-  if (src->IsConstant()) {
-    if (result->FromProto(src->def().attr().at("value").tensor())) {
-      *evaluated = true;
-      return Status::OK();
-    }
-  }
-
-  if (disable_constant_propagation_) {
-    return Status::OK();
-  }
-
-  bool is_constant_graph = false;
-  Graph subgraph(ops_registry_);
-  auto versions = subgraph.versions();
-  versions.set_producer(graph_def_version_);
-  subgraph.set_versions(versions);
-
-  // We identify the possibly constant subgraph to evaluate by
-  // recursively iterating backwards through the inputs to 'node'
-  // until we either 1) find an already existing input to our subgraph
-  // (filled in `const_inputs`), 2) Discover our graph is not constant,
-  // or 3) Hit a root node.
-  std::vector<std::pair<string, Tensor>> const_inputs;
-  TF_RETURN_IF_ERROR(ExtractConstantSubgraph(
-      input_edge->src(), &subgraph, &is_constant_graph, &const_inputs));
-  if (!is_constant_graph) {
-    return Status::OK();
-  }
-  const string output_tensor_name =
-      strings::StrCat(input_edge->src()->name(), ":", input_edge->src_output());
-  std::vector<Tensor> outputs;
-
-  // NOTE; we should pass in a function library runtime if we want
-  // to support constant-expression evaluation on functions.
-  Status s = graph_runner_.Run(&subgraph, nullptr /* function_library */,
-                               const_inputs, {output_tensor_name}, &outputs);
-
-  // If all kernels in the constant graph are not registered
-  // in the process, GraphRunner::Run may fail, in which case
-  // we cannot propagate constants, so this is best-effort.
-  if (s.ok()) {
-    *result = outputs[0];
-    *evaluated = true;
-
-    // We memoize (small) constants evaluated so far, so
-    // ExtractConstantSubgraph can avoid extracting the full
-    // subgraph.  As we build up large graphs, this avoids
-    // repeated computation of the early parts of a constant
-    // graph.
-    if (outputs[0].TotalBytes() <= kMaxTensorSize) {
-      const_tensor_map_[output_tensor_name] = outputs[0];
-    }
-  }
-  return Status::OK();
-}
-
-Status ShapeRefiner::TryToInferTensorOutputFromInputShapes(const Edge* edge,
-                                                           Tensor* output,
-                                                           bool* success) {
-  *success = false;
-  const Node* node = edge->src();
-  auto it = node_to_context_.find(node);
-  if (it == node_to_context_.end()) {
-    return errors::FailedPrecondition("Node does not have context.");
-  }
-  InferenceContext* c = it->second->get_context();
-
-  if (node->type_string() == "Shape") {
-    // If input shapes to the shape op are fully defined,
-    // we can infer the shape op's output tensor.
-    bool fully_defined_inputs = c->FullyDefined(c->input(0));
-    if (fully_defined_inputs) {
-      int input_rank = c->Rank(c->input(0));
-      Tensor t(node->output_type(0), TensorShape({input_rank}));
-      if (node->output_type(0) == DT_INT32) {
-        auto flat = t.flat<int>();
-        for (int i = 0; i < input_rank; i++) {
-          int64 dimension = c->Value(c->Dim(c->input(0), i));
-          if (!FastBoundsCheck(dimension, std::numeric_limits<int32>::max())) {
-            return errors::FailedPrecondition(
-                "Shape has output type int32, but dimension exceeds maximum "
-                "int32 value");
-          }
-          flat(i) = static_cast<int32>(dimension);
-        }
-      } else if (node->output_type(0) == DT_INT64) {
-        auto flat = t.flat<int64>();
-        for (int i = 0; i < input_rank; i++) {
-          flat(i) = c->Value(c->Dim(c->input(0), i));
-        }
-      } else {
-        return errors::FailedPrecondition(
-            "Shape has output type that is not int32 or int64");
-      }
-      *output = t;
-      *success = true;
-    }
-  } else if (node->type_string() == "Rank") {
-    bool rank_known = c->RankKnown(c->input(0));
-    if (rank_known) {
-      int32 input_rank = c->Rank(c->input(0));
-      Tensor t(node->output_type(0), TensorShape({}));
-      t.flat<int32>()(0) = input_rank;
-      *output = t;
-      *success = true;
-    }
-  } else if (node->type_string() == "Size") {
-    bool fully_defined_inputs = c->FullyDefined(c->input(0));
-    if (fully_defined_inputs) {
-      int32 rank = c->Rank(c->input(0));
-      Tensor t(node->output_type(0), TensorShape({}));
-      int64 size = 1;
-      for (int i = 0; i < rank; i++) {
-        size *= c->Value(c->Dim(c->input(0), i));
-      }
-      if (node->output_type(0) == DT_INT32) {
-        if (!FastBoundsCheck(size, std::numeric_limits<int32>::max())) {
-          return errors::FailedPrecondition(
-              "Size has output type int32, but size exceeds maximum int32 "
-              "value");
-        }
-        t.flat<int32>()(0) = static_cast<int32>(size);
-      } else if (node->output_type(0) == DT_INT64) {
-        t.flat<int64>()(0) = size;
-      } else {
-        return errors::FailedPrecondition(
-            "Size has output type that is not int32 or int64");
-      }
-      *output = t;
-      *success = true;
-    }
-  }
-  return Status::OK();
-}
-
-Status ShapeRefiner::ExtractConstantSubgraph(
-    Node* target_node, Graph* out_graph, bool* is_constant_graph,
-    std::vector<std::pair<string, Tensor>>* const_inputs) {
-  *is_constant_graph = false;
-  std::unordered_set<string> const_inputs_added;
-
-  if (target_node->op_def().is_stateful()) {
-    return Status::OK();
-  }
-
-  if (target_node->type_string() == "PlaceholderWithDefault") {
-    return Status::OK();
-  }
-
-  // TODO(skyewm): more of the filtering applied in input nodes below should be
-  // applied to target_node here
-
-  struct NodeAndRecursed {
-    Node* new_node = nullptr;
-    bool recursed = false;
-  };
-
-  std::map<Node*, NodeAndRecursed> old_to_new_and_recursed;
-  Node* target_node_copy = out_graph->CopyNode(target_node);
-  old_to_new_and_recursed[target_node].new_node = target_node_copy;
-  old_to_new_and_recursed[target_node].recursed = true;
-
-  // Add the target node's inputs to seed the recursion.
-  std::deque<const Edge*> edges_to_visit;
-  for (const Edge* e : target_node->in_edges()) {
-    // TODO(vrv): What do we do about control edges?  Based on our
-    // definition of a constant graph, we should be free to ignore
-    // control edges since the order in which a constant graph is
-    // executed should be the same regardless of when nodes run: we
-    // should only need to recurse down data edges.
-    if (e->IsControlEdge()) continue;
-    edges_to_visit.push_back(e);
-  }
-
-  *is_constant_graph = true;
-
-  // Iterate over the set of edges to visit (backwards).
-  while (!edges_to_visit.empty()) {
-    const Edge* current_edge = edges_to_visit.front();
-    edges_to_visit.pop_front();
-    Node* current_node = current_edge->src();
-
-    // If the node is stateful, assume the graph is not constant.
-    if (current_node->op_def().is_stateful()) {
-      *is_constant_graph = false;
-      return Status::OK();
-    }
-
-    // During construction or import from GraphConstructor, back edges may not
-    // be filled in.  Don't constant fold through merges at all for now.
-    if (IsMerge(current_node)) {
-      *is_constant_graph = false;
-      return Status::OK();
-    }
-
-    // Don't constant fold enter/exit currently either, as it's easy to end
-    // up with a partial frame.
-    if (IsEnter(current_node) || IsExit(current_node)) {
-      *is_constant_graph = false;
-      return Status::OK();
-    }
-
-    // Placeholders should never be constant folded because their outputs are
-    // fed by the user. Note that "Placeholder" nodes have no inputs so are
-    // handled below.
-    if (current_node->type_string() == "PlaceholderWithDefault") {
-      *is_constant_graph = false;
-      return Status::OK();
-    }
-
-    // If there is nothing more to recurse down, see if
-    // the generator node is a constant.
-    if (current_node->num_inputs() == 0) {
-      if (!current_node->IsConstant()) {
-        // Generator node is not a constant, so subgraph is not
-        // constant.
-        *is_constant_graph = false;
-        return Status::OK();
-      }
-    }
-
-    // Either the node is a constant, or the node is a potential
-    // intermediate node on the path from a constant.
-    //
-    // Add a copy of its node and a new edge to the new subgraph.
-
-    // Get or create the version of 'current_node' in the new graph.
-    Node* current_node_copy;
-    // This gets or creates the NodeAndRecursed entry for current_node.
-    NodeAndRecursed* node_and_recursed = &old_to_new_and_recursed[current_node];
-    if (node_and_recursed->new_node == nullptr) {
-      // First time processing this node.
-      current_node_copy = out_graph->CopyNode(current_node);
-      // Track the mapping from the original node to the new one.
-      node_and_recursed->new_node = current_node_copy;
-    } else {
-      current_node_copy = node_and_recursed->new_node;
-    }
-
-    // Add the edge to the destination node.
-    {
-      auto it = old_to_new_and_recursed.find(current_edge->dst());
-      if (it == old_to_new_and_recursed.end()) {
-        return errors::Internal(
-            "Could not find mapping from old to new copy of destination node: ",
-            current_edge->dst()->name());
-      }
-      Node* dst_copy = it->second.new_node;
-
-      out_graph->AddEdge(current_node_copy, current_edge->src_output(),
-                         dst_copy, current_edge->dst_input());
-    }
-
-    const string& output_tensor_name =
-        strings::StrCat(current_node->name(), ":", current_edge->src_output());
-
-    // Some tensor values can be inferred. For example, a shape op
-    // with input shapes fully defined can have its output tensor inferred.
-    Tensor tensor_inferred;
-    bool successfully_inferred_tensor = false;
-    TF_RETURN_IF_ERROR(TryToInferTensorOutputFromInputShapes(
-        current_edge, &tensor_inferred, &successfully_inferred_tensor));
-    if (successfully_inferred_tensor) {
-      const_inputs->emplace_back(output_tensor_name, tensor_inferred);
-      const_inputs_added.insert(output_tensor_name);
-      continue;
-    }
-
-    // If we have a copy of the input tensor materialized already,
-    // then add to the list of inputs to feed and do not recurse further.
-    auto it = const_tensor_map_.find(output_tensor_name);
-    if (it != const_tensor_map_.end() &&
-        const_inputs_added.count(output_tensor_name) == 0) {
-      const_inputs->emplace_back(output_tensor_name, it->second);
-      const_inputs_added.insert(output_tensor_name);
-      continue;
-    }
-
-    // If this node's inputs have not been processed already, do so now.
-    if (!node_and_recursed->recursed) {
-      node_and_recursed->recursed = true;
-      for (const Edge* e : current_node->in_edges()) {
-        if (e->IsControlEdge()) continue;
-        edges_to_visit.push_back(e);
-      }
-    }
-  }
-
-  return Status::OK();
+  OutputTensor tensor(input_edge->src(), input_edge->src_output());
+  return EvaluateConstantTensor(tensor, *this, *ops_registry_,
+                                graph_def_version_, evaluated, result,
+                                &graph_runner_, &const_tensor_map_,
+                                kMaxTensorSize, disable_constant_propagation_);
 }
 
 Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
@@ -713,6 +431,32 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   InferenceContext* src_context = GetContext(input_edge->src());
   if (src_context == nullptr) return errors::Internal("Missing src context");
   ShapeHandle src_shape = src_context->output(input_edge->src_output());
+
+  if (src_context->Value(src_context->Rank(src_shape)) == 0) {
+    Tensor t;
+    bool evaluated = false;
+    TF_RETURN_IF_ERROR(
+        EvaluateConstantTensorForEdge(node, dst_idx, &evaluated, &t));
+    if (!evaluated) {
+      return errors::InvalidArgument(
+          "Received a shape scalar with unknown static value.  A static value "
+          "of '-1' is required to represent an unknown shape.");
+    }
+    if (t.dims() == 0) {
+      if (t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      } else if (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      }
+    }
+    return errors::InvalidArgument(
+        "Received an invalid shape scalar with a static value that is not "
+        "'-1': ",
+        t.DebugString());
+  }
+
   TF_RETURN_IF_ERROR(src_context->WithRank(src_shape, 1, &src_shape));
 
   const string& src_op = input_edge->src()->type_string();
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index 75eb5bf0d2972e6bccdd9c2c265f3494821210cc..d49c4373f0b8c8721bb5f151dfe4d8774fbceb50 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -215,20 +215,6 @@ class ShapeRefiner {
                                 bool keep_nested_shapes,
                                 ExtendedInferenceContext* outer_context);
 
-  // Tries to infer tensor output based on the input shapes of the node. In some
-  // cases, the shapes of the inputs are sufficient for inferring the contents
-  // of the output tensor. For example, a Shape op with fully defined input
-  // shapes can have its output tensor inferred.
-  Status TryToInferTensorOutputFromInputShapes(const Edge* edge, Tensor* output,
-                                               bool* success);
-
-  // Extracts the subgraph ending at 'node' that is statically
-  // computable and inserts into 'out_graph'. If statically computable,
-  // 'is_constant_graph' will be true.
-  Status ExtractConstantSubgraph(
-      Node* node, Graph* out_graph, bool* is_constant_graph,
-      std::vector<std::pair<string, Tensor>>* const_inputs) TF_MUST_USE_RESULT;
-
   Status EvaluateConstantTensorForEdge(const Node* node, int dst_idx,
                                        bool* evaluated, Tensor* result);
 
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index adf5a9afff2ebc6848db8811506ebd4a031df2bb..f48638afc0f602e4b0c1376f7e5732f3c637d025 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -143,8 +144,8 @@ TEST_F(ShapeRefinerTest, BadShapes) {
   // an error.
   Status s = m.AddNode(mm.node());
   ASSERT_FALSE(s.ok());
-  ASSERT_TRUE(StringPiece(s.error_message())
-                  .contains("Dimensions must be equal, but are 1 and 2"));
+  ASSERT_TRUE(str_util::StrContains(
+      s.error_message(), "Dimensions must be equal, but are 1 and 2"));
 }
 
 TEST_F(ShapeRefinerTest, SetShape) {
@@ -1032,8 +1033,8 @@ TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
     TF_ASSERT_OK(m.AddNode(input.node()));
   }
   TF_ASSERT_OK(m.AddNode(pack.node()));
-  EXPECT_TRUE(
-      StringPiece(m.AddNode(result).error_message()).contains("but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(m.AddNode(result).error_message(),
+                                    "but is rank 2"));
 }
 
 TEST_F(ShapeRefinerTest, ConstantValueAsShape_Concat) {
diff --git a/tensorflow/core/common_runtime/single_threaded_cpu_device.h b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..04d5af9087059232097e7aebeb32141a3046ee63
--- /dev/null
+++ b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+
+class Env;
+
+// A simple single-threaded CPU device. This can be used to run inexpensive
+// computations. In particular, using this avoids initializing the global thread
+// pools in LocalDevice.
+class SingleThreadedCpuDevice : public Device {
+ public:
+  SingleThreadedCpuDevice(Env* env)
+      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
+                                                  Bytes(256 << 20),
+                                                  DeviceLocality())) {
+    eigen_worker_threads_.num_threads = 1;
+    eigen_worker_threads_.workers = new thread::ThreadPool(
+        env, "graph_runner", eigen_worker_threads_.num_threads);
+    eigen_threadpool_wrapper_.reset(
+        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    eigen_device_.reset(new Eigen::ThreadPoolDevice(
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
+    set_eigen_cpu_device(eigen_device_.get());
+  }
+
+  ~SingleThreadedCpuDevice() override {
+    eigen_threadpool_wrapper_.reset();
+    eigen_device_.reset();
+    delete eigen_worker_threads_.workers;
+  }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    Tensor parsed(tensor_proto.dtype());
+    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
+      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
+    }
+    *tensor = parsed;
+    return Status::OK();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return cpu_allocator();
+  }
+
+ private:
+  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
+  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index cb900db10af98496cfdfafa5a38296bfdc4e996b..f21536d586edcca2cec9257579db9ca616f36a6c 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -226,13 +226,14 @@ void StepStatsCollector::BuildCostModel(
       if (node) {
         for (int i = 0; i < stats.output_size(); ++i) {
           const auto& output = stats.output(i);
-          cm->RecordMaxMemorySize(node, i,
+          int output_slot = output.slot();
+          cm->RecordMaxMemorySize(node, output_slot,
                                   Bytes(output.tensor_description()
                                             .allocation_description()
                                             .allocated_bytes()),
-                                  stats.output(i).tensor_description().shape(),
-                                  node->output_types()[i]);
-          cm->RecordAllocationId(node, i,
+                                  output.tensor_description().shape(),
+                                  node->output_types()[output_slot]);
+          cm->RecordAllocationId(node, output_slot,
                                  output.tensor_description()
                                      .allocation_description()
                                      .allocation_id());
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index 6e1a45b3efa8b5975de6352d5ea403e4afc9da76..f3bd72f697cde16bec7948f75ebdfd37b6854ce1 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -27,12 +27,11 @@ SYCLDevice::~SYCLDevice() {}
 
 void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   assert(context);
-  if (port::Tracing::IsActive()) {
-    // TODO(pbar) We really need a useful identifier of the graph node.
-    const uint64 id = Hash64(op_kernel->name());
-    port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
-                                         id);
-  }
+  // When ThreadScape profiling is off (which is the default), constructing the
+  // following code is simple enough that its overhead is negligible.
+  tracing::ScopedRegion region(tracing::EventCategory::kCompute,
+                               op_kernel->name());
+
   op_kernel->Compute(context);
 }
 
diff --git a/tensorflow/core/common_runtime/test_collective_executor_mgr.h b/tensorflow/core/common_runtime/test_collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0d4f24b111ed340f754ff4ab77223e8b19d68ab
--- /dev/null
+++ b/tensorflow/core/common_runtime/test_collective_executor_mgr.h
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+
+// Mock objects that can't actually execute a Collective, but satisfy
+// general infrastructure expectations within tests that don't require
+// full functionality.
+
+class TestCollectiveExecutor : public CollectiveExecutor {
+ public:
+  explicit TestCollectiveExecutor(CollectiveExecutorMgrInterface* cem)
+      : CollectiveExecutor(cem) {}
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,  //???
+                    const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+};
+
+class TestCollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
+ public:
+  TestCollectiveExecutorMgr() {}
+
+  ~TestCollectiveExecutorMgr() override {
+    for (auto& iter : table_) {
+      iter.second->Unref();
+    }
+  }
+
+  CollectiveExecutor* FindOrCreate(int64 step_id) override {
+    mutex_lock l(mu_);
+    CollectiveExecutor* ce = nullptr;
+    auto iter = table_.find(step_id);
+    if (iter != table_.end()) {
+      ce = iter->second;
+    } else {
+      ce = new TestCollectiveExecutor(this);
+      table_[step_id] = ce;
+    }
+    ce->Ref();
+    return ce;
+  }
+
+  void Cleanup(int64 step_id) override {
+    mutex_lock l(mu_);
+    auto iter = table_.find(step_id);
+    if (iter != table_.end()) {
+      iter->second->Unref();
+      table_.erase(iter);
+    }
+  }
+
+  ParamResolverInterface* GetParamResolver() const override {
+    LOG(FATAL);
+    return nullptr;
+  }
+
+  DeviceResolverInterface* GetDeviceResolver() const override {
+    LOG(FATAL);
+    return nullptr;
+  }
+
+  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                            GetStepSequenceResponse* response,
+                            const StatusCallback& done) override {
+    done(errors::Internal("unimplemented"));
+  }
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override {
+    done(errors::Internal("unimplemented"));
+  }
+
+  int64 NextStepId(int64 graph_key) override {
+    return CollectiveExecutor::kInvalidId;
+  }
+
+  void RetireStepId(int64 graph_key, int64 step_id) override {}
+
+  mutex mu_;
+  gtl::FlatMap<int64, CollectiveExecutor*> table_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 5aa01376ab047e7613ba7403bb32859a83a09f5a..f7a07fe503f26b89a70843f536400684579422e0 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 
 #include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -40,31 +42,37 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
                                    Allocator* allocator)
     : LocalDevice(options, Device::BuildDeviceAttributes(
                                name, DEVICE_CPU, memory_limit, locality)),
-      allocator_(allocator) {}
+      allocator_(allocator),
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
 
 void ThreadPoolDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
-  // When TraceMe profiling is off (which is the default), the
-  // following TraceMe constructor is simply a conditional test of
-  // false value. Measurements show that its overhead is negligible.
-  port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string(),
-                                  op_kernel->IsExpensive());
-  if (port::Tracing::IsActive()) {
-    // TODO(pbar) We really need a useful identifier of the graph node.
-    const uint64 id = Hash64(op_kernel->name());
-    port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
-                                         id);
-    op_kernel->Compute(context);
-  } else {
-    op_kernel->Compute(context);
-  }
+  // When Xprof/ThreadScape profiling is off (which is the default), the
+  // following code is simple enough that its overhead is negligible.
+  tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
+                                   op_kernel->IsExpensive());
+  tracing::ScopedRegion region(tracing::EventCategory::kCompute,
+                               op_kernel->name());
+
+  op_kernel->Compute(context);
 }
 
 Allocator* ThreadPoolDevice::GetAllocator(AllocatorAttributes attr) {
   return allocator_;
 }
 
+Allocator* ThreadPoolDevice::GetScopedAllocator(AllocatorAttributes attr,
+                                                int64 step_id) {
+  if (attr.scope_id > 0) {
+    return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
+        attr.scope_id);
+  }
+  LOG(FATAL) << "Unexpected call to ThreadPoolDevice::GetScopedAllocator "
+             << "attr.scope_id = " << attr.scope_id;
+  return allocator_;
+}
+
 Status ThreadPoolDevice::MakeTensorFromProto(
     const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs,
     Tensor* tensor) {
diff --git a/tensorflow/core/common_runtime/threadpool_device.h b/tensorflow/core/common_runtime/threadpool_device.h
index 37cb745a0aa89b9aeae2c289d347faac2ae177dd..afc5d15ebc39883f3d24c91b42d86c46576883c0 100644
--- a/tensorflow/core/common_runtime/threadpool_device.h
+++ b/tensorflow/core/common_runtime/threadpool_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
-#define TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -31,6 +31,11 @@ class ThreadPoolDevice : public LocalDevice {
 
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
   Allocator* GetAllocator(AllocatorAttributes attr) override;
+  Allocator* GetScopedAllocator(AllocatorAttributes attr,
+                                int64 step_id) override;
+  ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
+    return scoped_allocator_mgr_.get();
+  }
   Status MakeTensorFromProto(const TensorProto& tensor_proto,
                              const AllocatorAttributes alloc_attrs,
                              Tensor* tensor) override;
@@ -39,8 +44,9 @@ class ThreadPoolDevice : public LocalDevice {
 
  private:
   Allocator* allocator_;  // Not owned
+  std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
 };
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 40cb8353cdccb4307f09b537ff7016e3dca5a8da..5fab740e920519abe8ec109615b75555593ec4c8 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -298,6 +298,9 @@ tf_cc_test(
     size = "small",
     srcs = ["debug_grpc_io_utils_test.cc"],
     linkstatic = tf_kernel_tests_linkstatic(),
+    tags = [
+        "no_oss",  # b/73962011
+    ],
     deps = [
         ":debug_graph_utils",
         ":debug_grpc_testlib",
@@ -336,18 +339,3 @@ cc_library(
 #     ],
 #     visibility = ["//visibility:public"],
 # )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 9e152aa0823b67fceb7f103cc6e090f00870f88a..343dd5d4560adb9814e005431d2c493696090f4c 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -7,18 +7,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
@@ -145,6 +133,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
@@ -156,6 +145,7 @@ tf_cc_test(
     deps = [
         ":session_mgr",
         ":worker_env",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
@@ -595,6 +585,7 @@ tf_cc_test(
     srcs = ["recent_request_ids_test.cc"],
     deps = [
         ":recent_request_ids",
+        ":request_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 049eec347c672de8e12f44eda9a6bebccb68043c..5f6931e008879f331eb225bc25fdd457555572ad 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -144,9 +144,9 @@ BaseRemoteRendezvous::~BaseRemoteRendezvous() {
 // Returns true if "device_name" is a valid full name of local device
 // of the "worker".  This helper is purely based on the worker name
 // and device name and does no lookups in the worker->device_mgr.
-static bool IsLocalDevice(const string& worker_name,
+static bool IsLocalDevice(const StringPiece worker_name,
                           const StringPiece device_name) {
-  return device_name.starts_with(worker_name);
+  return str_util::StartsWith(device_name, worker_name);
 }
 
 Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
@@ -253,13 +253,13 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
 
   WorkerSession* sess = session();
   Device* src_device;
-  Status s = sess->device_mgr->LookupDevice(parsed.src_device, &src_device);
+  Status s = sess->device_mgr()->LookupDevice(parsed.src_device, &src_device);
   if (!s.ok()) {
     done(s);
     return;
   }
   Device* dst_device;
-  s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+  s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
   if (!s.ok()) {
     done(s);
     return;
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 3a8d5912369525253904bd700dfdc6e3eb26e0ae..6edc2ec5ed58265c73654377655ea6312f4451d1 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -121,6 +121,8 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
     const string& function_name, const FunctionLibraryDefinition& lib_def,
     AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::LocalHandle* handle) {
+  VLOG(1) << "CFLR::Instantiate: " << function_name << " on " << options.target
+          << " (this: " << this << ")";
   WorkerInterface* wi =
       worker_session_->worker_cache->CreateWorker(options.target);
 
@@ -143,6 +145,7 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
 
   RegisterGraphRequest req;
   req.set_session_handle(worker_session_->session_name);
+  req.set_create_worker_session_called(create_worker_session_called_);
   *req.mutable_graph_def() = gdef;
   req.mutable_graph_options()
       ->mutable_optimizer_options()
@@ -154,6 +157,9 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
   *handle = function_data_.size();
   function_data_.push_back(FunctionData(resp.graph_handle(), options.target, wi,
                                         send_keys, recv_keys));
+  VLOG(1) << "CFLR::Instantiate: [Success] " << function_name << " on "
+          << options.target << " (this: " << this << ")"
+          << " with handle: " << *handle;
   return Status::OK();
 }
 
@@ -175,32 +181,34 @@ void ClusterFunctionLibraryRuntime::Run(
     return;
   }
 
-  RunGraphRequest req;
-  req.set_session_handle(worker_session_->session_name);
-  req.set_graph_handle(function_data->graph_handle);
+  RunGraphRequest* req = new RunGraphRequest;
+  req->set_session_handle(worker_session_->session_name);
+  req->set_create_worker_session_called(create_worker_session_called_);
+  req->set_graph_handle(function_data->graph_handle);
   // Borrowed from master_session.cc
   const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
-  req.set_step_id(step_id);
+  req->set_step_id(step_id);
   int i = 0;
   for (const auto& send_key : function_data->send_keys) {
-    NamedTensorProto* send = req.add_send();
+    NamedTensorProto* send = req->add_send();
     send->set_name(send_key);
     args[i].AsProtoTensorContent(send->mutable_tensor());
     i++;
   }
   const std::vector<string>& recv_keys = function_data->recv_keys;
   for (const auto& recv_key : recv_keys) {
-    req.add_recv_key(recv_key);
+    req->add_recv_key(recv_key);
   }
 
   RunGraphResponse* resp = new RunGraphResponse();
   CallOptions* call_options = new CallOptions();
   wi->RunGraphAsync(
-      call_options, &req, resp,
-      [call_options, resp, rets, recv_keys, done](const Status& status) {
+      call_options, req, resp,
+      [call_options, req, resp, rets, recv_keys, done](const Status& status) {
         if (!status.ok()) {
           done(status);
           delete call_options;
+          delete req;
           delete resp;
           return;
         }
@@ -212,25 +220,28 @@ void ClusterFunctionLibraryRuntime::Run(
         for (const auto& recv_key : recv_keys) {
           TensorProto* tp = mapped_recvs[recv_key];
           if (tp == nullptr) {
+            done(errors::Internal("Could not find key: ", recv_key));
             delete call_options;
+            delete req;
             delete resp;
-            done(errors::Internal("Could not find key: ", recv_key));
             return;
           }
           Tensor t;
           if (t.FromProto(*tp)) {
             rets->push_back(t);
           } else {
-            delete call_options;
-            delete resp;
             done(errors::Internal("Could not convert tensor proto: ",
                                   tp->DebugString()));
+            delete call_options;
+            delete req;
+            delete resp;
             return;
           }
         }
+        done(status);
         delete call_options;
+        delete req;
         delete resp;
-        done(status);
       });
 }
 
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index d3ca350e3659ffa9f8248d2be80a1b1f0303addc..1ea0a3ad515e8284ddd683f1c1fd2af53a272ba8 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -27,8 +27,10 @@ struct WorkerSession;
 // functions across processes by making RPCs.
 class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
  public:
-  ClusterFunctionLibraryRuntime(WorkerSession* worker_session)
-      : worker_session_(worker_session) {}
+  ClusterFunctionLibraryRuntime(WorkerSession* worker_session,
+                                bool create_worker_session_called)
+      : worker_session_(worker_session),
+        create_worker_session_called_(create_worker_session_called) {}
 
   ~ClusterFunctionLibraryRuntime() override;
 
@@ -51,6 +53,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
   mutable mutex mu_;
   WorkerSession* const worker_session_ = nullptr;  // not owned.
+  const bool create_worker_session_called_;
 
   struct FunctionData {
     const string graph_handle;
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 1810996ab8c2a8e4901c007517ae276829b4fc2a..cd6e13501408a0f0325989234b7ec9023efb9d61 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -36,7 +36,8 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
     ChannelCreationFunction channel_func =
         ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
     std::unique_ptr<WorkerCacheInterface> worker_cache(
-        NewGrpcWorkerCache(NewGrpcChannelCache(spec, channel_func)));
+        NewGrpcWorkerCache(std::shared_ptr<GrpcChannelCache>(
+            NewGrpcChannelCache(spec, channel_func))));
 
     worker_session_.reset(new WorkerSession(
         "cluster_test_session", "/job:localhost/replica:0/task:0",
@@ -44,7 +45,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
         std::unique_ptr<GraphMgr>()));
 
     cluster_flr_.reset(
-        new ClusterFunctionLibraryRuntime(worker_session_.get()));
+        new ClusterFunctionLibraryRuntime(worker_session_.get(), true));
   }
 
   Status ConstructFunctionGraphHelper(
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 7878ebb5f06db0f64e9216250da2a79352274ab3..8447c55bf43d8b2f7b40685932474b075d6cf696 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -134,7 +134,8 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 
   item->proc_flr.reset(new ProcessFunctionLibraryRuntime(
       device_mgr_, worker_env_->env, gdef.versions().producer(),
-      item->lib_def.get(), graph_options.optimizer_options(), cluster_flr));
+      item->lib_def.get(), graph_options.optimizer_options(),
+      worker_env_->compute_pool, cluster_flr));
 
   // Constructs the graph out of "gdef".
   Graph graph(OpRegistry::Global());
@@ -437,7 +438,7 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
 
   StartParallelExecutors(handle, step_id, item, rendezvous, collector,
                          cost_graph, cancellation_manager,
-                         [this, item, rendezvous, done](const Status& s) {
+                         [item, rendezvous, done](const Status& s) {
                            done(s);
                            rendezvous->Unref();
                            item->Unref();
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index aaa4cfa7341c42bf9f7302e8ef30a28b68e6213c..76315462a738b7e70ce8c1f9ca5776d0037e22f9 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -157,6 +157,47 @@ Status LocalMaster::Reset(CallOptions* call_options,
   return ret;
 }
 
+Status LocalMaster::MakeCallable(CallOptions* call_options,
+                                 const MakeCallableRequest* request,
+                                 MakeCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->MakeCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::RunCallable(CallOptions* call_options,
+                                const RunCallableRequest* request,
+                                RunCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->RunCallable(call_options, request, response,
+                            [&n, &ret](const Status& s) {
+                              ret.Update(s);
+                              n.Notify();
+                            });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::ReleaseCallable(CallOptions* call_options,
+                                    const ReleaseCallableRequest* request,
+                                    ReleaseCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->ReleaseCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+
 namespace {
 mutex* get_local_master_registry_lock() {
   static mutex local_master_registry_lock(LINKER_INITIALIZED);
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index c20b40329ab1712b3dd0cae673d337481ee40196..cad6babad82b9b2ac2953f5497e46bb471699b10 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -71,6 +71,16 @@ class LocalMaster : public MasterInterface {
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override;
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override;
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override;
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response);
+
   // Registers the mapping from the given `target` to the given `master`.
   //
   // WARNING: The `master` pointer remains owned by the caller. It is
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 1a488303ac73b8628b9d3fe4050ad9144724348e..e60386fd34acc1d94565014b5e3ca920fe336ea3 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -167,13 +167,16 @@ class DeviceFinder {
     }
     // Enumerates all known workers' target. A target name is a
     // prefix of a device name. E.g., /job:mnist/replica:0/task:10.
+    CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided.";
+    const string& local_device_name = env_->local_devices[0]->name();
     std::vector<string> workers;
     worker_cache->ListWorkers(&workers);
     if (filters_.empty()) {
       std::swap(workers, targets_);
     } else {
       for (const string& name : workers) {
-        if (MatchFilters(name)) {
+        if (MatchFilters(name) ||
+            DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) {
           targets_.push_back(name);
         }
       }
@@ -417,9 +420,13 @@ void Master::CreateSession(const CreateSessionRequest* req,
     SessionOptions options;
     options.config = req->config();
 
+    std::vector<string> filtered_worker_list;
+    DeviceFinder::GetRemoteWorkers(req->config().device_filters(), env_,
+                                   worker_cache, &filtered_worker_list);
+
     MasterSession* session = env_->master_session_factory(
         options, env_, std::move(remote_devices), std::move(worker_cache_ptr),
-        std::move(device_set));
+        std::move(device_set), std::move(filtered_worker_list));
 
     GraphDef* gdef =
         const_cast<CreateSessionRequest*>(req)->mutable_graph_def();
@@ -611,4 +618,55 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
   });
 }
 
+void Master::MakeCallable(const MakeCallableRequest* req,
+                          MakeCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->MakeCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                         RunCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, opts, req, resp](MyClosure done) {
+        Status s = session->RunCallable(opts, *req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::ReleaseCallable(const ReleaseCallableRequest* req,
+                             ReleaseCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->ReleaseCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index 678fc46bd7f4644022c5811f8a1c7a9f606be111..dbb337fd484960fbd3bfe47d0bfe0497985de66f 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -61,6 +61,13 @@ class Master {
   // See tensorflow::Reset() and the comment on ResetRequest.
   void Reset(const ResetRequest* req, ResetResponse* resp, MyClosure done);
 
+  void MakeCallable(const MakeCallableRequest* req, MakeCallableResponse* resp,
+                    MyClosure done);
+  void RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                   RunCallableResponse* resp, MyClosure done);
+  void ReleaseCallable(const ReleaseCallableRequest* req,
+                       ReleaseCallableResponse* resp, MyClosure done);
+
  private:
   typedef Master ME;
 
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index 178c5b40ee15505bc5557dd2c084fc0ce616ac7f..16f4d93c8b4a753f42c74fc51c6196b2d229ee1d 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -83,7 +83,8 @@ struct MasterEnv {
       SessionOptions, MasterEnv*,
       std::unique_ptr<std::vector<std::unique_ptr<Device>>>,
       std::unique_ptr<WorkerCacheInterface>,
-      std::unique_ptr<DeviceSet> device_set)>
+      std::unique_ptr<DeviceSet> device_set,
+      std::vector<string> filtered_worker_list)>
       master_session_factory;
 
   std::function<Status(const WorkerCacheFactoryOptions&,
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index bf6a2db3e27b301c01ca7d5073d175b24417220f..a8ae3cba3cdd3f02aae823d893e027b2bccae2c9 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -89,6 +89,16 @@ class MasterInterface {
   virtual Status Reset(CallOptions* call_options, const ResetRequest* request,
                        ResetResponse* response) = 0;
 
+  virtual Status MakeCallable(CallOptions* call_options,
+                              const MakeCallableRequest* request,
+                              MakeCallableResponse* response) = 0;
+  virtual Status RunCallable(CallOptions* call_options,
+                             const RunCallableRequest* request,
+                             RunCallableResponse* response) = 0;
+  virtual Status ReleaseCallable(CallOptions* call_options,
+                                 const ReleaseCallableRequest* request,
+                                 ReleaseCallableResponse* response) = 0;
+
  protected:
   // NOTE: This should only be called by implementations of this
   // interface whose CreateRunStepResponse() method returns a
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 878a1398c9d382a4b2018712ca9f9e48c11a9345..83afc5b1a4676bfa23d03c1b94d8ed85732750ed 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -66,13 +66,13 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     std::unique_ptr<ClientGraph> cg,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
-                    GraphExecutionState* execution_state, bool is_partial,
-                    WorkerCacheInterface* worker_cache, bool should_deregister)
+                    bool is_partial, WorkerCacheInterface* worker_cache,
+                    bool should_deregister)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
-        debug_opts_(bopts.debug_options),
+        callable_opts_(bopts.callable_options),
         worker_cache_(worker_cache),
         should_deregister_(should_deregister) {
     VLOG(1) << "Created ReffedClientGraph for node with "
@@ -80,8 +80,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
-    // Initialize a name to node map for testing that fetches are reachable.
-    for (Node* n : execution_state->full_graph()->nodes()) {
+    // Initialize a name to node map for processing device stats.
+    for (Node* n : client_graph_->graph.nodes()) {
       name_to_node_.insert({n->name(), n});
     }
   }
@@ -89,17 +89,27 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
+    } else {
+      for (Part& part : partitions_) {
+        worker_cache_->ReleaseWorker(part.name, part.worker);
+      }
     }
   }
 
   const ClientGraph* client_graph() { return client_graph_.get(); }
 
+  const CallableOptions& callable_options() { return callable_opts_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
     return stats_publisher_->GetProfileHandler(step, execution_count, ropts);
   }
 
+  int64 get_and_increment_execution_count() {
+    return execution_count_.fetch_add(1);
+  }
+
   // Turn RPC logging on or off, both at the WorkerCache used by this
   // master process, and at each remote worker in use for the current
   // partitions.
@@ -178,6 +188,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                        CallOptions* opts, const RunStepRequestWrapper& req,
                        MutableRunStepResponseWrapper* resp,
                        CancellationManager* cm, const bool is_last_partial_run);
+  Status RunPartitions(const MasterEnv* env, int64 step_id,
+                       int64 execution_count, PerStepState* pss,
+                       CallOptions* call_opts, const RunCallableRequest& req,
+                       RunCallableResponse* resp, CancellationManager* cm);
 
   // Calls workers to cleanup states for the step "step_id".  Calls
   // `done` when all cleanup RPCs have completed.
@@ -211,10 +225,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const std::unique_ptr<ClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
-  const DebugOptions& debug_opts_;
+  const CallableOptions callable_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
   const bool should_deregister_;
+  std::atomic<int64> execution_count_ = {0};
 
   // Graph partitioned into per-location subgraphs.
   struct Part {
@@ -269,6 +284,17 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
       const PartitionOptions& popts,
       std::unordered_map<string, GraphDef> graph_partitions);
 
+  // Prepares a number of calls to workers. One call per partition.
+  // This is a generic method that handles Run, PartialRun, and RunCallable.
+  template <class FetchListType, class ClientRequestType,
+            class ClientResponseType>
+  Status RunPartitionsHelper(
+      const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+      const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+      int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+      const ClientRequestType& req, ClientResponseType* resp,
+      CancellationManager* cm, bool is_last_partial_run);
+
   // Deregisters the partitions on the workers.  Called in the
   // destructor and does not wait for the rpc completion.
   void DeregisterPartitions();
@@ -394,6 +420,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
   if (!s.ok()) {
     for (Part& part : partitions_) {
       worker_cache_->ReleaseWorker(part.name, part.worker);
+      part.worker = nullptr;
     }
     return s;
   }
@@ -409,9 +436,11 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     const Part& part = partitions_[i];
     Call* c = &calls[i];
     c->req.set_session_handle(session_handle_);
+    c->req.set_create_worker_session_called(!should_deregister_);
     c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]);
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
-    *c->req.mutable_debug_options() = debug_opts_;
+    *c->req.mutable_debug_options() =
+        callable_opts_.run_options().debug_options();
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -490,24 +519,46 @@ class RunManyGraphs {
   TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
 };
 
-Status MasterSession::ReffedClientGraph::RunPartitions(
-    const MasterEnv* env, int64 step_id, int64 execution_count,
-    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
-    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
-    const bool is_last_partial_run) {
-  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
-          << execution_count;
-  // Maps the names of fed tensors to their index in `req`.
-  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+namespace {
+Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunStepRequest(client_req, index, send_key);
+}
 
-  for (size_t i = 0; i < req.num_feeds(); ++i) {
-    if (!feeds.insert({req.feed_name(i), i}).second) {
-      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
-    }
-  }
+Status AddSendFromClientRequest(const RunCallableRequest& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunCallableRequest(client_req, index, send_key);
+}
 
-  // Prepares a number of calls to workers. One call per partition.
+// TODO(mrry): Add a full-fledged wrapper that avoids TensorProto copies for
+// in-process messages.
+struct RunCallableResponseWrapper {
+  RunCallableResponse* resp;  // Not owned.
+  std::unordered_map<string, TensorProto> fetch_key_to_protos;
+
+  RunMetadata* mutable_metadata() { return resp->mutable_metadata(); }
+
+  Status AddTensorFromRunGraphResponse(
+      const string& tensor_name, MutableRunGraphResponseWrapper* worker_resp,
+      size_t index) {
+    // TODO(b/74355905): Add a specialized implementation that avoids
+    // copying the tensor into the RunCallableResponse when at least
+    // two of the {client, master, worker} are in the same process.
+    return worker_resp->RecvValue(index, &fetch_key_to_protos[tensor_name]);
+  }
+};
+}  // namespace
 
+template <class FetchListType, class ClientRequestType,
+          class ClientResponseType>
+Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
+    const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+    const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+    int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+    const ClientRequestType& req, ClientResponseType* resp,
+    CancellationManager* cm, bool is_last_partial_run) {
   // Collect execution cost stats on a smoothly decreasing frequency.
   ExecutorOpts exec_opts;
   if (pss->report_tensor_allocations_upon_oom) {
@@ -542,6 +593,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       c->req->set_is_last_partial_run(is_last_partial_run);
     }
     c->req->set_session_handle(session_handle_);
+    c->req->set_create_worker_session_called(!should_deregister_);
     c->req->set_graph_handle(part.graph_handle);
     c->req->set_step_id(step_id);
     *c->req->mutable_exec_opts() = exec_opts;
@@ -553,28 +605,19 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
     // We keep these as separate paths for now, to ensure we aren't
     // inadvertently slowing down the normal run path.
     if (is_partial_) {
-      for (size_t i = 0; i < req.num_feeds(); ++i) {
-        const string& name = req.feed_name(i);
-        const auto iter = part.feed_key.find(name);
+      for (const auto& name_index : feeds) {
+        const auto iter = part.feed_key.find(name_index.first.ToString());
         if (iter == part.feed_key.end()) {
           // The provided feed must be for a different partition.
           continue;
         }
         const string& key = iter->second;
-        auto feeds_iter = feeds.find(name);
-        if (feeds_iter == feeds.end()) {
-          return errors::InvalidArgument("No feed is provided for feed=", name,
-                                         ", key=", key);
-        } else if (feeds_iter->second != static_cast<size_t>(i)) {
-          return errors::Internal("Cannot find feed named \"", name,
-                                  " in request.");
-        }
-        TF_RETURN_IF_ERROR(c->req->AddSendFromRunStepRequest(req, i, key));
+        TF_RETURN_IF_ERROR(AddSendFromClientRequest(req, c->req.get(),
+                                                    name_index.second, key));
       }
       // TODO(suharshs): Make a map from feed to fetch_key to make this faster.
       // For now, we just iterate through partitions to find the matching key.
-      for (int i = 0; static_cast<size_t>(i) < req.num_fetches(); ++i) {
-        const string& req_fetch = req.fetch_name(i);
+      for (const string& req_fetch : fetches) {
         for (const auto& key_fetch : part.key_fetch) {
           if (key_fetch.second == req_fetch) {
             c->req->add_recv_key(key_fetch.first);
@@ -586,9 +629,13 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       for (const auto& feed_key : part.feed_key) {
         const string& feed = feed_key.first;
         const string& key = feed_key.second;
-        const int64 feed_index = feeds[feed];
+        auto iter = feeds.find(feed);
+        if (iter == feeds.end()) {
+          return errors::Internal("No feed index found for feed: ", feed);
+        }
+        const int64 feed_index = iter->second;
         TF_RETURN_IF_ERROR(
-            c->req->AddSendFromRunStepRequest(req, feed_index, key));
+            AddSendFromClientRequest(req, c->req.get(), feed_index, key));
       }
       for (const auto& key_fetch : part.key_fetch) {
         const string& key = key_fetch.first;
@@ -622,50 +669,115 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   } else {
     return errors::Cancelled("Step was cancelled");
   }
+  TF_RETURN_IF_ERROR(calls.status());
 
-  // Collects fetches.
-  Status status = calls.status();
-  if (status.ok()) {
-    for (int i = 0; i < num; ++i) {
-      const Part& part = partitions_[i];
-      MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
-      for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
-        auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
-        if (iter == part.key_fetch.end()) {
-          status.Update(errors::Internal("Unexpected fetch key: ",
-                                         run_graph_resp->recv_key(j)));
-          break;
-        }
-        const string& fetch = iter->second;
-        status.Update(
-            resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
-        if (!status.ok()) {
-          break;
-        }
+  // Collects fetches and metadata.
+  Status status;
+  for (int i = 0; i < num; ++i) {
+    const Part& part = partitions_[i];
+    MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
+    for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
+      auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
+      if (iter == part.key_fetch.end()) {
+        status.Update(errors::Internal("Unexpected fetch key: ",
+                                       run_graph_resp->recv_key(j)));
+        break;
       }
-      if (pss->collect_timeline) {
-        pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+      const string& fetch = iter->second;
+      status.Update(
+          resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
+      if (!status.ok()) {
+        break;
       }
-      if (pss->collect_costs) {
-        CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
-        for (int j = 0; j < cost_graph->node_size(); ++j) {
-          resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
-              cost_graph->mutable_node(j));
-        }
+    }
+    if (pss->collect_timeline) {
+      pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+    }
+    if (pss->collect_costs) {
+      CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
+      for (int j = 0; j < cost_graph->node_size(); ++j) {
+        resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
+            cost_graph->mutable_node(j));
       }
-      if (pss->collect_partition_graphs) {
-        protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
-            resp->mutable_metadata()->mutable_partition_graphs();
-        for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
-          partition_graph_defs->Add()->Swap(
-              run_graph_resp->mutable_partition_graph(i));
-        }
+    }
+    if (pss->collect_partition_graphs) {
+      protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+          resp->mutable_metadata()->mutable_partition_graphs();
+      for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
+        partition_graph_defs->Add()->Swap(
+            run_graph_resp->mutable_partition_graph(i));
       }
     }
   }
   return status;
 }
 
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
+    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
+    const bool is_last_partial_run) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < req.num_feeds(); ++i) {
+    if (!feeds.insert({req.feed_name(i), i}).second) {
+      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
+    }
+  }
+
+  std::vector<string> fetches;
+  fetches.reserve(req.num_fetches());
+  for (size_t i = 0; i < req.num_fetches(); ++i) {
+    fetches.push_back(req.fetch_name(i));
+  }
+
+  return RunPartitionsHelper(feeds, fetches, env, step_id, execution_count, pss,
+                             call_opts, req, resp, cm, is_last_partial_run);
+}
+
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunCallableRequest& req,
+    RunCallableResponse* resp, CancellationManager* cm) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < callable_opts_.feed_size(); ++i) {
+    if (!feeds.insert({callable_opts_.feed(i), i}).second) {
+      // MakeCallable will fail if there are two feeds with the same name.
+      return errors::Internal("Duplicated feeds in callable: ",
+                              callable_opts_.feed(i));
+    }
+  }
+
+  // Create a wrapped response object to collect the fetched values and
+  // rearrange them for the RunCallableResponse.
+  RunCallableResponseWrapper wrapped_resp;
+  wrapped_resp.resp = resp;
+
+  TF_RETURN_IF_ERROR(RunPartitionsHelper(
+      feeds, callable_opts_.fetch(), env, step_id, execution_count, pss,
+      call_opts, req, &wrapped_resp, cm, false /* is_last_partial_run */));
+
+  // Collects fetches.
+  // TODO(b/74355905): Add a specialized implementation that avoids
+  // copying the tensor into the RunCallableResponse when at least
+  // two of the {client, master, worker} are in the same process.
+  for (const string& fetch : callable_opts_.fetch()) {
+    TensorProto* fetch_proto = resp->mutable_fetch()->Add();
+    auto iter = wrapped_resp.fetch_key_to_protos.find(fetch);
+    if (iter == wrapped_resp.fetch_key_to_protos.end()) {
+      return errors::Internal("Worker did not return a value for fetch: ",
+                              fetch);
+    }
+    fetch_proto->Swap(&iter->second);
+  }
+  return Status::OK();
+}
+
 namespace {
 
 class CleanupBroadcastHelper {
@@ -829,8 +941,6 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
 // TODO(suharsh,mrry): Build a map from fetch target to set of feeds it depends
 // on once at setup time to prevent us from computing the dependencies
 // everytime.
-// TODO(suharshs,mrry): Consider removing the need for execution_state to reduce
-// contention.
 Status MasterSession::ReffedClientGraph::CheckFetches(
     const RunStepRequestWrapper& req, const RunState* run_state,
     GraphExecutionState* execution_state) {
@@ -840,8 +950,8 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
     // Skip if already fed.
     if (input.second) continue;
     TensorId id(ParseTensorName(input.first));
-    const auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Feed ", input.first, ": not found");
     }
     pending_feeds.insert(id);
@@ -856,11 +966,11 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
   for (size_t i = 0; i < req.num_fetches(); ++i) {
     const string& fetch = req.fetch_name(i);
     const TensorId id(ParseTensorName(fetch));
-    auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Fetch ", fetch, ": not found");
     }
-    stack.push_back(it->second);
+    stack.push_back(n);
   }
 
   // Any tensor needed for fetches can't be in pending_feeds.
@@ -900,6 +1010,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
     if (!part.graph_handle.empty()) {
       Call* c = new Call;
       c->req.set_session_handle(session_handle_);
+      c->req.set_create_worker_session_called(!should_deregister_);
       c->req.set_graph_handle(part.graph_handle);
       // NOTE(mrry): We must capture `worker_cache_` since `this`
       // could be deleted before the callback is called.
@@ -921,61 +1032,70 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
   }
 }
 
+namespace {
+void CopyAndSortStrings(size_t size,
+                        const std::function<string(size_t)>& input_accessor,
+                        protobuf::RepeatedPtrField<string>* output) {
+  std::vector<string> temp;
+  temp.reserve(size);
+  for (size_t i = 0; i < size; ++i) {
+    output->Add(input_accessor(i));
+  }
+  std::sort(output->begin(), output->end());
+}
+}  // namespace
+
 void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
                             BuildGraphOptions* opts) {
-  for (size_t i = 0; i < req.num_feeds(); ++i) {
-    opts->feed_endpoints.push_back(req.feed_name(i));
-  }
-  for (size_t i = 0; i < req.num_fetches(); ++i) {
-    opts->fetch_endpoints.push_back(req.fetch_name(i));
-  }
-  for (size_t i = 0; i < req.num_targets(); ++i) {
-    opts->target_nodes.push_back(req.target_name(i));
-  }
+  CallableOptions* callable_opts = &opts->callable_options;
+  CopyAndSortStrings(req.num_feeds(),
+                     [&req](size_t i) { return req.feed_name(i); },
+                     callable_opts->mutable_feed());
+  CopyAndSortStrings(req.num_fetches(),
+                     [&req](size_t i) { return req.fetch_name(i); },
+                     callable_opts->mutable_fetch());
+  CopyAndSortStrings(req.num_targets(),
+                     [&req](size_t i) { return req.target_name(i); },
+                     callable_opts->mutable_target());
 
   if (!req.options().debug_options().debug_tensor_watch_opts().empty()) {
-    opts->debug_options = req.options().debug_options();
+    *callable_opts->mutable_run_options()->mutable_debug_options() =
+        req.options().debug_options();
   }
-
-  std::sort(opts->feed_endpoints.begin(), opts->feed_endpoints.end());
-  std::sort(opts->target_nodes.begin(), opts->target_nodes.end());
-  std::sort(opts->fetch_endpoints.begin(), opts->fetch_endpoints.end());
 }
 
 void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
                             BuildGraphOptions* opts) {
-  for (const auto& feed : req.feed()) {
-    opts->feed_endpoints.push_back(feed);
-  }
-  for (const auto& fetch : req.fetch()) {
-    opts->fetch_endpoints.push_back(fetch);
-  }
-  for (const auto& target : req.target()) {
-    opts->target_nodes.push_back(target);
-  }
+  CallableOptions* callable_opts = &opts->callable_options;
+  CopyAndSortStrings(req.feed_size(), [&req](size_t i) { return req.feed(i); },
+                     callable_opts->mutable_feed());
+  CopyAndSortStrings(req.fetch_size(),
+                     [&req](size_t i) { return req.fetch(i); },
+                     callable_opts->mutable_fetch());
+  CopyAndSortStrings(req.target_size(),
+                     [&req](size_t i) { return req.target(i); },
+                     callable_opts->mutable_target());
 
   // TODO(cais): Add TFDBG support to partial runs.
-
-  std::sort(opts->feed_endpoints.begin(), opts->feed_endpoints.end());
-  std::sort(opts->target_nodes.begin(), opts->target_nodes.end());
-  std::sort(opts->fetch_endpoints.begin(), opts->fetch_endpoints.end());
 }
 
 uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
   uint64 h = 0x2b992ddfa23249d6ull;
-  for (const string& name : opts.feed_endpoints) {
+  for (const string& name : opts.callable_options.feed()) {
     h = Hash64(name.c_str(), name.size(), h);
   }
-  for (const string& name : opts.target_nodes) {
+  for (const string& name : opts.callable_options.target()) {
     h = Hash64(name.c_str(), name.size(), h);
   }
-  for (const string& name : opts.fetch_endpoints) {
+  for (const string& name : opts.callable_options.fetch()) {
     h = Hash64(name.c_str(), name.size(), h);
   }
 
-  if (!opts.debug_options.debug_tensor_watch_opts().empty()) {
-    const string watch_summary = SummarizeDebugTensorWatches(
-        opts.debug_options.debug_tensor_watch_opts());
+  const DebugOptions& debug_options =
+      opts.callable_options.run_options().debug_options();
+  if (!debug_options.debug_tensor_watch_opts().empty()) {
+    const string watch_summary =
+        SummarizeDebugTensorWatches(debug_options.debug_tensor_watch_opts());
     h = Hash64(watch_summary.c_str(), watch_summary.size(), h);
   }
 
@@ -984,15 +1104,15 @@ uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
 
 string BuildGraphOptionsString(const BuildGraphOptions& opts) {
   string buf;
-  for (const string& name : opts.feed_endpoints) {
+  for (const string& name : opts.callable_options.feed()) {
     strings::StrAppend(&buf, " FdE: ", name);
   }
   strings::StrAppend(&buf, "\n");
-  for (const string& name : opts.target_nodes) {
+  for (const string& name : opts.callable_options.target()) {
     strings::StrAppend(&buf, " TN: ", name);
   }
   strings::StrAppend(&buf, "\n");
-  for (const string& name : opts.fetch_endpoints) {
+  for (const string& name : opts.callable_options.fetch()) {
     strings::StrAppend(&buf, " FeE: ", name);
   }
   strings::StrAppend(&buf, "\n");
@@ -1004,6 +1124,7 @@ MasterSession::MasterSession(
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     std::unique_ptr<DeviceSet> device_set,
+    std::vector<string> filtered_worker_list,
     StatsPublisherFactory stats_publisher_factory)
     : session_opts_(opt),
       env_(env),
@@ -1011,6 +1132,7 @@ MasterSession::MasterSession(
       remote_devs_(std::move(remote_devs)),
       worker_cache_(std::move(worker_cache)),
       devices_(std::move(device_set)),
+      filtered_worker_list_(std::move(filtered_worker_list)),
       stats_publisher_factory_(std::move(stats_publisher_factory)),
       graph_version_(0),
       run_graphs_(5),
@@ -1056,21 +1178,14 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  // TODO(b/36574172): Remove these conditions when ClusterSpec
-  // propagation is supported in all servers.
-  if (options.cluster_def != nullptr ||
-      session_opts_.config.isolate_session_state()) {
-    should_delete_worker_sessions_ = true;
-    return CreateWorkerSessions(options);
-  }
-  return Status::OK();
+  should_delete_worker_sessions_ = true;
+  return CreateWorkerSessions(options);
 }
 
 Status MasterSession::CreateWorkerSessions(
     const WorkerCacheFactoryOptions& options) {
-  std::vector<string> worker_names;
+  const std::vector<string> worker_names = filtered_worker_list_;
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  worker_cache->ListWorkers(&worker_names);
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
@@ -1102,17 +1217,6 @@ Status MasterSession::CreateWorkerSessions(
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
-    if (options.cluster_def) {
-      *workers[i].request.mutable_server_def()->mutable_cluster() =
-          *options.cluster_def;
-      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
-      // Session state is always isolated when ClusterSpec propagation
-      // is in use.
-      workers[i].request.set_isolate_session_state(true);
-    } else {
-      workers[i].request.set_isolate_session_state(
-          session_opts_.config.isolate_session_state());
-    }
 
     DeviceNameUtils::ParsedName name;
     if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
@@ -1126,8 +1230,21 @@ Status MasterSession::CreateWorkerSessions(
       return status;
     }
 
-    workers[i].request.mutable_server_def()->set_job_name(name.job);
-    workers[i].request.mutable_server_def()->set_task_index(name.task);
+    if (options.cluster_def) {
+      *workers[i].request.mutable_server_def()->mutable_cluster() =
+          *options.cluster_def;
+      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+      workers[i].request.mutable_server_def()->set_job_name(name.job);
+      workers[i].request.mutable_server_def()->set_task_index(name.task);
+      // Session state is always isolated when ClusterSpec propagation
+      // is in use.
+      workers[i].request.set_isolate_session_state(true);
+    } else {
+      // NOTE(mrry): Do not set any component of the ServerDef,
+      // because the worker will use its local configuration.
+      workers[i].request.set_isolate_session_state(
+          session_opts_.config.isolate_session_state());
+    }
   }
 
   for (size_t i = 0; i < worker_names.size(); ++i) {
@@ -1148,8 +1265,7 @@ Status MasterSession::CreateWorkerSessions(
 
 Status MasterSession::DeleteWorkerSessions() {
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  std::vector<string> worker_names;
-  worker_cache->ListWorkers(&worker_names);
+  const std::vector<string>& worker_names = filtered_worker_list_;
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
@@ -1158,6 +1274,8 @@ Status MasterSession::DeleteWorkerSessions() {
     // The worker referenced by name. (Not owned.)
     WorkerInterface* worker = nullptr;
 
+    CallOptions call_opts;
+
     // Request and responses used for a given worker.
     DeleteWorkerSessionRequest request;
     DeleteWorkerSessionResponse response;
@@ -1181,6 +1299,9 @@ Status MasterSession::DeleteWorkerSessions() {
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
+    // Since the worker may have gone away, set a timeout to avoid blocking the
+    // session-close operation.
+    workers[i].call_opts.SetTimeout(10000);
   }
 
   for (size_t i = 0; i < worker_names.size(); ++i) {
@@ -1188,8 +1309,8 @@ Status MasterSession::DeleteWorkerSessions() {
       workers[i].status = s;
       done.DecrementCount();
     };
-    workers[i].worker->DeleteWorkerSessionAsync(&workers[i].request,
-                                                &workers[i].response, cb);
+    workers[i].worker->DeleteWorkerSessionAsync(
+        &workers[i].call_opts, &workers[i].request, &workers[i].response, cb);
   }
 
   done.Wait();
@@ -1259,15 +1380,11 @@ WorkerCacheInterface* MasterSession::get_worker_cache() const {
   return env_->worker_cache;
 }
 
-Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
-                                ReffedClientGraph** rcg, bool is_partial) {
+Status MasterSession::StartStep(const BuildGraphOptions& opts, bool is_partial,
+                                ReffedClientGraph** out_rcg, int64* out_count) {
   const uint64 hash = HashBuildGraphOptions(opts);
   {
     mutex_lock l(mu_);
-    // Keep track of how many times this subgraph has been executed in
-    // this session.
-    int64* c = &subgraph_execution_counts_[hash];
-    *count = (*c)++;
     // TODO(suharshs): We cache partial run graphs and run graphs separately
     // because there is preprocessing that needs to only be run for partial
     // run calls.
@@ -1284,13 +1401,14 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       WorkerCacheInterface* worker_cache = get_worker_cache();
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
-          stats_publisher_factory_, execution_state_.get(), is_partial,
-          worker_cache, !should_delete_worker_sessions_);
+          stats_publisher_factory_, is_partial, worker_cache,
+          !should_delete_worker_sessions_);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
-    *rcg = iter->second;
-    (*rcg)->Ref();
+    *out_rcg = iter->second;
+    (*out_rcg)->Ref();
+    *out_count = (*out_rcg)->get_and_increment_execution_count();
   }
   return Status::OK();
 }
@@ -1309,6 +1427,12 @@ void MasterSession::ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
   rcg_map->clear();
 }
 
+namespace {
+uint64 MakeStepId() {
+  return (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+}
+}  // namespace
+
 Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
                                       PartialRunSetupResponse* resp) {
   std::vector<string> inputs, outputs, targets;
@@ -1325,15 +1449,15 @@ Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
   string handle = std::to_string(partial_run_handle_counter_.fetch_add(1));
 
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
 
   // Prepare.
   BuildGraphOptions opts;
   BuildBuildGraphOptions(*req, &opts);
-  TF_RETURN_IF_ERROR(StartStep(opts, &count, &rcg, true));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(opts, true, &rcg, &count));
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
   rcg->Ref();
@@ -1578,6 +1702,73 @@ Status MasterSession::CreateDebuggerState(
   return Status::OK();
 }
 
+void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                                     const RunOptions& run_options,
+                                     uint64 step_id, int64 count,
+                                     PerStepState* out_pss,
+                                     std::unique_ptr<ProfileHandler>* out_ph) {
+  out_pss->collect_timeline =
+      run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->collect_rpcs = run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->report_tensor_allocations_upon_oom =
+      run_options.report_tensor_allocations_upon_oom();
+  // Build the cost model every 'build_cost_model_every' steps after skipping an
+  // initial 'build_cost_model_after' steps.
+  const int64 build_cost_model_after =
+      session_opts_.config.graph_options().build_cost_model_after();
+  const int64 build_cost_model_every =
+      session_opts_.config.graph_options().build_cost_model();
+  out_pss->collect_costs =
+      build_cost_model_every > 0 &&
+      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
+  out_pss->collect_partition_graphs = run_options.output_partition_graphs();
+
+  *out_ph = rcg->GetProfileHandler(step_id, count, run_options);
+  if (*out_ph) {
+    out_pss->collect_timeline = true;
+    out_pss->collect_rpcs = (*out_ph)->should_collect_rpcs();
+  }
+}
+
+Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
+                                     uint64 step_id,
+                                     const RunOptions& run_options,
+                                     PerStepState* pss,
+                                     const std::unique_ptr<ProfileHandler>& ph,
+                                     const Status& run_status,
+                                     RunMetadata* out_run_metadata) {
+  Status s = run_status;
+  if (s.ok()) {
+    pss->end_micros = Env::Default()->NowMicros();
+
+    // Schedule post-processing and cleanup to be done asynchronously.
+    rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
+  } else if (errors::IsCancelled(s)) {
+    mutex_lock l(mu_);
+    if (closed_) {
+      if (garbage_collected_) {
+        s = errors::Cancelled(
+            "Step was cancelled because the session was garbage collected due "
+            "to inactivity.");
+      } else {
+        s = errors::Cancelled(
+            "Step was cancelled by an explicit call to `Session::Close()`.");
+      }
+    }
+  }
+  Ref();
+  rcg->Ref();
+  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
+    if (!s.ok()) {
+      LOG(ERROR) << "Cleanup partition error: " << s;
+    }
+    rcg->Unref();
+    MarkRunCompletion();
+    Unref();
+  });
+  return s;
+}
+
 Status MasterSession::DoRunWithLocalExecution(
     CallOptions* opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp) {
@@ -1590,8 +1781,8 @@ Status MasterSession::DoRunWithLocalExecution(
   BuildGraphOptions bgopts;
   BuildBuildGraphOptions(req, &bgopts);
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
-  TF_RETURN_IF_ERROR(StartStep(bgopts, &count, &rcg, false));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(bgopts, false, &rcg, &count));
 
   // Unref "rcg" when out of scope.
   core::ScopedUnref unref(rcg);
@@ -1607,64 +1798,133 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
-  pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.report_tensor_allocations_upon_oom =
-      req.options().report_tensor_allocations_upon_oom();
-  // Build the cost model every 'build_cost_model_every' steps after skipping an
-  // initial 'build_cost_model_after' steps.
-  const int64 build_cost_model_after =
-      session_opts_.config.graph_options().build_cost_model_after();
-  const int64 build_cost_model_every =
-      session_opts_.config.graph_options().build_cost_model();
-  pss.collect_costs =
-      build_cost_model_every > 0 &&
-      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
-  pss.collect_partition_graphs = req.options().output_partition_graphs();
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, req.options(), step_id, count, &pss, &ph);
 
-  std::unique_ptr<ProfileHandler> ph =
-      rcg->GetProfileHandler(step_id, count, req.options());
-  if (ph) {
-    pss.collect_timeline = true;
-    pss.collect_rpcs = ph->should_collect_rpcs();
+  Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
+                                &cancellation_manager_, false);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, req.options(), &pss, ph, s,
+                        resp->mutable_metadata());
+}
+
+Status MasterSession::MakeCallable(const MakeCallableRequest& req,
+                                   MakeCallableResponse* resp) {
+  UpdateLastAccessTime();
+
+  BuildGraphOptions opts;
+  opts.callable_options = req.options();
+  opts.use_function_convention = false;
+
+  ReffedClientGraph* callable;
+
+  {
+    mutex_lock l(mu_);
+    if (closed_) {
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    std::unique_ptr<ClientGraph> client_graph;
+    TF_RETURN_IF_ERROR(execution_state_->BuildGraph(opts, &client_graph));
+    callable = new ReffedClientGraph(handle_, opts, std::move(client_graph),
+                                     session_opts_, stats_publisher_factory_,
+                                     false /* is_partial */, get_worker_cache(),
+                                     !should_delete_worker_sessions_);
+  }
+
+  Status s = BuildAndRegisterPartitions(callable);
+  if (!s.ok()) {
+    callable->Unref();
+    return s;
+  }
+
+  uint64 handle;
+  {
+    mutex_lock l(mu_);
+    handle = next_callable_handle_++;
+    callables_[handle] = callable;
+  }
+
+  resp->set_handle(handle);
+  return Status::OK();
+}
+
+Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                                    const RunCallableRequest& req,
+                                    RunCallableResponse* resp) {
+  VLOG(2) << "DoRunCallable req: " << req.DebugString();
+  PerStepState pss;
+  pss.start_micros = Env::Default()->NowMicros();
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
+
+  // Prepare.
+  int64 count = rcg->get_and_increment_execution_count();
+
+  // Keeps the highest 8 bits 0x01: we reserve some bits of the
+  // step_id for future use.
+  const uint64 step_id = MakeStepId();
+  TRACEPRINTF("stepid %llu", step_id);
+
+  const RunOptions& run_options = rcg->callable_options().run_options();
+
+  if (run_options.timeout_in_ms() != 0) {
+    opts->SetTimeout(run_options.timeout_in_ms());
   }
 
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, run_options, step_id, count, &pss, &ph);
   Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
-                                &cancellation_manager_, false);
-  if (s.ok()) {
-    pss.end_micros = Env::Default()->NowMicros();
+                                &cancellation_manager_);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, run_options, &pss, ph, s,
+                        resp->mutable_metadata());
+}
 
-    // Schedule post-processing and cleanup to be done asynchronously.
-    rcg->ProcessStats(step_id, &pss, ph.get(), req.options(),
-                      resp->mutable_metadata());
-  } else if (errors::IsCancelled(s)) {
+Status MasterSession::RunCallable(CallOptions* opts,
+                                  const RunCallableRequest& req,
+                                  RunCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* callable;
+  {
     mutex_lock l(mu_);
     if (closed_) {
-      if (garbage_collected_) {
-        s = errors::Cancelled(
-            "Step was cancelled because the session was garbage collected due "
-            "to inactivity.");
-      } else {
-        s = errors::Cancelled(
-            "Step was cancelled by an explicit call to `Session::Close()`.");
-      }
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    int64 handle = req.handle();
+    if (handle >= next_callable_handle_) {
+      return errors::InvalidArgument("No such callable handle: ", handle);
+    }
+    auto iter = callables_.find(req.handle());
+    if (iter == callables_.end()) {
+      return errors::InvalidArgument(
+          "Attempted to run callable after handle was released: ", handle);
     }
+    callable = iter->second;
+    callable->Ref();
+    ++num_running_;
   }
-  Ref();
-  rcg->Ref();
-  cleanup.release();  // MarkRunCompletion called in done closure.
-  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
-    if (!s.ok()) {
-      LOG(ERROR) << "Cleanup partition error: " << s;
+  core::ScopedUnref unref_callable(callable);
+  return DoRunCallable(opts, callable, req, resp);
+}
+
+Status MasterSession::ReleaseCallable(const ReleaseCallableRequest& req,
+                                      ReleaseCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* to_unref = nullptr;
+  {
+    mutex_lock l(mu_);
+    auto iter = callables_.find(req.handle());
+    if (iter != callables_.end()) {
+      to_unref = iter->second;
+      callables_.erase(iter);
     }
-    rcg->Unref();
-    MarkRunCompletion();
-    Unref();
-  });
-  return s;
+  }
+  if (to_unref != nullptr) {
+    to_unref->Unref();
+  }
+  return Status::OK();
 }
 
 Status MasterSession::Close() {
@@ -1681,6 +1941,7 @@ Status MasterSession::Close() {
     }
     ClearRunsTable(&to_unref, &run_graphs_);
     ClearRunsTable(&to_unref, &partial_run_graphs_);
+    ClearRunsTable(&to_unref, &callables_);
   }
   for (ReffedClientGraph* rcg : to_unref) rcg->Unref();
   if (should_delete_worker_sessions_) {
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 4bd4e1367aa75730df829a2909005a221b9ab780..ec34e20b79afe9e3b0eef3aa70639ef411977e6f 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -52,6 +52,7 @@ class MasterSession : public core::RefCounted {
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
       std::unique_ptr<DeviceSet> device_set,
+      std::vector<string> filtered_worker_list,
       StatsPublisherFactory stats_publisher_factory);
 
   // Initialize the MasterSession for "def".  Must be called before Extend(),
@@ -89,6 +90,15 @@ class MasterSession : public core::RefCounted {
 
   Status ListDevices(ListDevicesResponse* resp) const;
 
+  Status MakeCallable(const MakeCallableRequest& req,
+                      MakeCallableResponse* resp);
+
+  Status RunCallable(CallOptions* opts, const RunCallableRequest& req,
+                     RunCallableResponse* resp);
+
+  Status ReleaseCallable(const ReleaseCallableRequest& req,
+                         ReleaseCallableResponse* resp);
+
   // Close this session and delete "*this". Returns OK if all known
   // states are cleanup successfully.
   //
@@ -121,6 +131,10 @@ class MasterSession : public core::RefCounted {
   // The device set used by this session.
   std::unique_ptr<DeviceSet> devices_;
 
+  // The (partial device) names of remote worker tasks that this
+  // session will contact.
+  const std::vector<string> filtered_worker_list_;
+
   StatsPublisherFactory stats_publisher_factory_;
 
   std::atomic_ulong last_access_time_usec_;
@@ -140,6 +154,8 @@ class MasterSession : public core::RefCounted {
   typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
   RCGMap run_graphs_ GUARDED_BY(mu_);
   RCGMap partial_run_graphs_ GUARDED_BY(mu_);
+  int64 next_callable_handle_ GUARDED_BY(mu_) = 0;
+  RCGMap callables_ GUARDED_BY(mu_);
 
   struct PerStepState {
     bool collect_costs = false;
@@ -201,19 +217,31 @@ class MasterSession : public core::RefCounted {
   // workers.
   Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def);
 
-  // TODO(b/36574172): Always use Create/DeleteWorkerSession.
   bool should_delete_worker_sessions_ = false;
   Status DeleteWorkerSessions();
 
-  Status StartStep(const BuildGraphOptions& opts, int64* count,
-                   ReffedClientGraph** graph, bool is_partial);
+  Status StartStep(const BuildGraphOptions& opts, bool is_partial,
+                   ReffedClientGraph** out_rcg, int64* out_count);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
                       RCGMap* rcg_map) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  void FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                        const RunOptions& run_options, uint64 step_id,
+                        int64 count, PerStepState* out_pss,
+                        std::unique_ptr<ProfileHandler>* out_ph);
   Status DoRunWithLocalExecution(CallOptions* opts,
                                  const RunStepRequestWrapper& req,
                                  MutableRunStepResponseWrapper* resp);
   Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req,
                       MutableRunStepResponseWrapper* resp);
+  Status DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                       const RunCallableRequest& req,
+                       RunCallableResponse* resp);
+  Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg, uint64 step_id,
+                        const RunOptions& run_options, PerStepState* pss,
+                        const std::unique_ptr<ProfileHandler>& ph,
+                        const Status& run_status,
+                        RunMetadata* out_run_metadata);
+
   void MarkRunCompletion();
   void UpdateLastAccessTime();
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 66ebb3080af7cd01021362b5ea0c0b54458aebfc..40bf564cab6fe465ed66639f42fe0daeb149f132 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -282,10 +282,18 @@ const string& InMemoryRunGraphRequest::session_handle() const {
   return session_handle_;
 }
 
+bool InMemoryRunGraphRequest::create_worker_session_called() const {
+  return create_worker_session_called_;
+}
+
 void InMemoryRunGraphRequest::set_session_handle(const string& handle) {
   session_handle_ = handle;
 }
 
+void InMemoryRunGraphRequest::set_create_worker_session_called(bool called) {
+  create_worker_session_called_ = called;
+}
+
 const string& InMemoryRunGraphRequest::graph_handle() const {
   return graph_handle_;
 }
@@ -326,6 +334,20 @@ Status InMemoryRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  Tensor tensor;
+  if (!ParseTensorProtoToTensor(run_callable_request.feed(i), &tensor)) {
+    return errors::InvalidArgument("Invalid TensorProto for feed value ", i);
+  }
+  sends_.emplace_back(send_key, std::move(tensor));
+  return Status::OK();
+}
+
 size_t InMemoryRunGraphRequest::num_recvs() const { return recvs_.size(); }
 
 const string& InMemoryRunGraphRequest::recv_key(size_t i) const {
@@ -364,6 +386,8 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   if (!proto_version_) {
     proto_version_.reset(new RunGraphRequest);
     proto_version_->set_session_handle(session_handle());
+    proto_version_->set_create_worker_session_called(
+        create_worker_session_called());
     proto_version_->set_graph_handle(graph_handle());
     proto_version_->set_step_id(step_id());
     *proto_version_->mutable_exec_opts() = exec_opts();
@@ -389,6 +413,15 @@ void MutableProtoRunGraphRequest::set_session_handle(const string& handle) {
   request_.set_session_handle(handle);
 }
 
+bool MutableProtoRunGraphRequest::create_worker_session_called() const {
+  return request_.create_worker_session_called();
+}
+
+void MutableProtoRunGraphRequest::set_create_worker_session_called(
+    bool called) {
+  request_.set_create_worker_session_called(called);
+}
+
 const string& MutableProtoRunGraphRequest::graph_handle() const {
   return request_.graph_handle();
 }
@@ -439,6 +472,18 @@ Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status MutableProtoRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  NamedTensorProto* send = request_.add_send();
+  send->set_name(send_key);
+  *send->mutable_tensor() = run_callable_request.feed(i);
+  return Status::OK();
+}
+
 size_t MutableProtoRunGraphRequest::num_recvs() const {
   return request_.recv_key_size();
 }
@@ -488,6 +533,10 @@ const string& ProtoRunGraphRequest::session_handle() const {
   return request_->session_handle();
 }
 
+bool ProtoRunGraphRequest::create_worker_session_called() const {
+  return request_->create_worker_session_called();
+}
+
 const string& ProtoRunGraphRequest::graph_handle() const {
   return request_->graph_handle();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 79fa6f926ea6afb351eacf279d3cf493b6d4713f..72a0c7edd8ecd8828099672e8cfa490385da3383 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -246,6 +246,9 @@ class RunGraphRequestWrapper {
   // namespace is used.
   virtual const string& session_handle() const = 0;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  virtual bool create_worker_session_called() const = 0;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   virtual const string& graph_handle() const = 0;
@@ -293,6 +296,7 @@ class RunGraphRequestWrapper {
 class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
  public:
   virtual void set_session_handle(const string& handle) = 0;
+  virtual void set_create_worker_session_called(bool called) = 0;
   virtual void set_graph_handle(const string& handle) = 0;
   virtual void set_step_id(int64 step_id) = 0;
   virtual ExecutorOpts* mutable_exec_opts() = 0;
@@ -302,6 +306,9 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
   virtual Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) = 0;
+  virtual Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) = 0;
 
   virtual void add_recv_key(const string& recv_key) = 0;
   virtual void set_is_partial(bool is_partial) = 0;
@@ -314,6 +321,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
   const string& graph_handle() const override;
+  bool create_worker_session_called() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
   size_t num_sends() const override;
@@ -328,12 +336,16 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
+  void set_create_worker_session_called(bool called) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
@@ -341,6 +353,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
  private:
   string session_handle_;
+  bool create_worker_session_called_ = false;
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
@@ -364,6 +377,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
+  bool create_worker_session_called() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -379,12 +393,16 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
+  void set_create_worker_session_called(bool called) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
@@ -400,6 +418,7 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
 
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
+  bool create_worker_session_called() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc b/tensorflow/core/distributed_runtime/recent_request_ids.cc
index c30879406c6924aa85ad4bf8279b278eaf5d29fd..4f6866c5d154ba023b0923af67fe00a7a69b459d 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.cc
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 
+#include <utility>
+
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -29,12 +31,14 @@ RecentRequestIds::RecentRequestIds(int num_tracked_request_ids)
 Status RecentRequestIds::TrackUnique(int64 request_id,
                                      const string& method_name,
                                      const protobuf::Message& request) {
-  mutex_lock l(mu_);
   if (request_id == 0) {
     // For backwards compatibility, allow all requests with request_id 0.
     return Status::OK();
   }
-  if (set_.count(request_id) > 0) {
+
+  mutex_lock l(mu_);
+  const bool inserted = set_.insert(request_id).second;
+  if (!inserted) {
     // Note: RecentRequestIds is not strict LRU because we don't update
     // request_id's age in the circular_buffer_ if it's tracked again. Strict
     // LRU is not useful here because returning this error will close the
@@ -49,7 +53,6 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
   // when the buffer is not yet full.
   set_.erase(circular_buffer_[next_index_]);
   circular_buffer_[next_index_] = request_id;
-  set_.insert(request_id);
   next_index_ = (next_index_ + 1) % circular_buffer_.size();
   return Status::OK();
 }
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
index e8e45331dd5a26e2230bb92e8ce73888d3f28505..11cf937c94659d85e3dc88350f20e107a27fab62 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.h
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -16,11 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
 
+#include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -64,7 +66,7 @@ class RecentRequestIds {
   // request_id.
   int next_index_ GUARDED_BY(mu_) = 0;
   std::vector<int64> circular_buffer_ GUARDED_BY(mu_);
-  gtl::FlatSet<int64> set_ GUARDED_BY(mu_);
+  std::unordered_set<int64> set_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids_test.cc b/tensorflow/core/distributed_runtime/recent_request_ids_test.cc
index 9a0facf5404bb4e6d0d57f55bcd1f2a4f4f99dba..8910a50e9cda691984d712ebfc5aea1d4f904d3f 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids_test.cc
+++ b/tensorflow/core/distributed_runtime/recent_request_ids_test.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
@@ -93,4 +95,15 @@ TEST(RecentRequestIds, Ordered3) { TestOrdered(3); }
 TEST(RecentRequestIds, Ordered4) { TestOrdered(4); }
 TEST(RecentRequestIds, Ordered5) { TestOrdered(5); }
 
+void BM_TrackUnique(int iters) {
+  RecentRequestIds recent_request_ids(100000);
+  RecvTensorRequest request;
+  for (int i = 0; i < iters; ++i) {
+    TF_CHECK_OK(recent_request_ids.TrackUnique(GetUniqueRequestId(),
+                                               "BM_TrackUnique", request));
+  }
+}
+
+BENCHMARK(BM_TrackUnique);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index dade26abc6a3c58f24c759ad863600a156985708..e973a22f45e9087ca97ec3d484ed9cb4563de687 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -5,18 +5,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
@@ -201,7 +189,7 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
+        ":grpc_util",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
@@ -247,21 +235,11 @@ cc_library(
     srcs = ["grpc_master_service_impl.cc"],
     hdrs = ["grpc_master_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:master_proto_cc",
         "@grpc//:grpc++_unsecure",
     ],
 )
 
-cc_library(
-    name = "grpc_serialization_traits",
-    srcs = [],
-    hdrs = ["grpc_serialization_traits.h"],
-    deps = [
-        "@grpc//:grpc++_unsecure",
-    ],
-)
-
 cc_library(
     name = "rpc_rendezvous_mgr",
     srcs = ["rpc_rendezvous_mgr.cc"],
@@ -381,6 +359,7 @@ tf_cuda_library(
     data = [
         ":grpc_testlib_server",
     ],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":grpc_session",
         ":grpc_testlib_ops",
@@ -509,3 +488,33 @@ tf_cuda_cc_test(
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
+
+cc_library(
+    name = "grpc_rpc_factory",
+    srcs = [
+        "grpc_rpc_factory.cc",
+    ],
+    hdrs = ["grpc_rpc_factory.h"],
+    deps = [
+        ":grpc_state",
+        ":grpc_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+    ],
+)
+
+cc_library(
+    name = "grpc_rpc_factory_registration",
+    srcs = [
+        "grpc_rpc_factory_registration.cc",
+    ],
+    deps = [
+        ":grpc_rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index b4d18d8607eaddd75f4e395e71fbd75554645a61..e025e555dd065674edff3492df1cc4bd040fc741 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -47,11 +47,11 @@ namespace tensorflow {
 
 class GrpcMasterService : public AsyncServiceInterface {
  public:
-  GrpcMasterService(Master* master, int64 default_timeout_in_ms,
+  GrpcMasterService(Master* master, const ConfigProto& default_session_config,
                     ::grpc::ServerBuilder* builder)
       : master_impl_(master),
-        default_timeout_in_ms_(default_timeout_in_ms),
-        is_shutdown_(false) {
+        is_shutdown_(false),
+        default_session_config_(default_session_config) {
     builder->RegisterService(&master_service_);
     cq_ = builder->AddCompletionQueue();
   }
@@ -111,6 +111,11 @@ class GrpcMasterService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(CloseSession, false);
     ENQUEUE_REQUEST(ListDevices, false);
     ENQUEUE_REQUEST(Reset, false);
+    ENQUEUE_REQUEST(MakeCallable, false);
+    for (int i = 0; i < 100; ++i) {
+      ENQUEUE_REQUEST(RunCallable, true);
+    }
+    ENQUEUE_REQUEST(ReleaseCallable, false);
 
     void* tag;
     bool ok;
@@ -129,12 +134,12 @@ class GrpcMasterService : public AsyncServiceInterface {
 
  private:
   Master* master_impl_ = nullptr;  // Not owned.
-  const int64 default_timeout_in_ms_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   grpc::MasterService::AsyncService master_service_;
 
   mutex mu_;
   bool is_shutdown_ GUARDED_BY(mu_);
+  const ConfigProto default_session_config_;
   ::grpc::Alarm* shutdown_alarm_ = nullptr;
 
   template <class RequestMessage, class ResponseMessage>
@@ -144,9 +149,13 @@ class GrpcMasterService : public AsyncServiceInterface {
   // RPC handler for creating a session.
   void CreateSessionHandler(
       MasterCall<CreateSessionRequest, CreateSessionResponse>* call) {
-    master_impl_->CreateSession(&call->request, &call->response,
-                                [call](const Status& status) {
+    CreateSessionRequest* rewritten_req = new CreateSessionRequest;
+    rewritten_req->mutable_config()->MergeFrom(default_session_config_);
+    rewritten_req->MergeFrom(call->request);
+    master_impl_->CreateSession(rewritten_req, &call->response,
+                                [call, rewritten_req](const Status& status) {
                                   call->SendResponse(ToGrpcStatus(status));
+                                  delete rewritten_req;
                                 });
     ENQUEUE_REQUEST(CreateSession, true);
   }
@@ -178,7 +187,7 @@ class GrpcMasterService : public AsyncServiceInterface {
     if (call->request.options().timeout_in_ms() > 0) {
       call_opts->SetTimeout(call->request.options().timeout_in_ms());
     } else {
-      call_opts->SetTimeout(default_timeout_in_ms_);
+      call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
     }
     RunStepRequestWrapper* wrapped_request =
         new ProtoRunStepRequest(&call->request);
@@ -232,10 +241,51 @@ class GrpcMasterService : public AsyncServiceInterface {
                         });
     ENQUEUE_REQUEST(Reset, false);
   }
+
+  // RPC handler for making a callable.
+  void MakeCallableHandler(
+      MasterCall<MakeCallableRequest, MakeCallableResponse>* call) {
+    master_impl_->MakeCallable(&call->request, &call->response,
+                               [call](const Status& status) {
+                                 call->SendResponse(ToGrpcStatus(status));
+                               });
+    ENQUEUE_REQUEST(MakeCallable, false);
+  }
+
+  // RPC handler for running a callable.
+  void RunCallableHandler(
+      MasterCall<RunCallableRequest, RunCallableResponse>* call) {
+    auto* trace = TraceRpc("RunCallable/Server", call->client_metadata());
+    CallOptions* call_opts = new CallOptions;
+    // The timeout may be overridden by a non-zero timeout in the
+    // callable's `RunOptions`; this overriding will happen inside the
+    // `MasterSession` implementation.
+    call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
+    call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+    master_impl_->RunCallable(call_opts, &call->request, &call->response,
+                              [call, call_opts, trace](const Status& status) {
+                                call->ClearCancelCallback();
+                                delete call_opts;
+                                delete trace;
+                                call->SendResponse(ToGrpcStatus(status));
+                              });
+    ENQUEUE_REQUEST(RunCallable, false);
+  }
+
+  // RPC handler for making a callable.
+  void ReleaseCallableHandler(
+      MasterCall<ReleaseCallableRequest, ReleaseCallableResponse>* call) {
+    master_impl_->ReleaseCallable(&call->request, &call->response,
+                                  [call](const Status& status) {
+                                    call->SendResponse(ToGrpcStatus(status));
+                                  });
+    ENQUEUE_REQUEST(ReleaseCallable, false);
+  }
+
 #undef ENQUEUE_REQUEST
 
   // Start tracing, including the ID attached to the RPC.
-  port::Tracing::TraceMe* TraceRpc(
+  tracing::ScopedActivity* TraceRpc(
       StringPiece name,
       const std::multimap<::grpc::string_ref, ::grpc::string_ref>& metadata) {
     StringPiece id;
@@ -243,16 +293,16 @@ class GrpcMasterService : public AsyncServiceInterface {
     if (it != metadata.end()) {
       id = StringPiece(it->second.data(), it->second.size());
     }
-    return new port::Tracing::TraceMe(name, id);
+    return new tracing::ScopedActivity(name, id);
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
 };
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder) {
-  return new GrpcMasterService(master, default_timeout_in_ms, builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder) {
+  return new GrpcMasterService(master, default_session_config, builder);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 473604f257607456d0fb4dcb6d9189f2f6dba135..f0fe5b0c4e96941f2774392273c3e1457219796a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/master.pb.h"
 
 namespace grpc {
 class ServerBuilder;
@@ -28,9 +29,9 @@ namespace tensorflow {
 class AsyncServiceInterface;
 class Master;
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index e2016e824c0bf504af4c624cad253963b223eb35..c832adbbbf8eba1ec512d62470025fb56a39b8a4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -36,6 +36,9 @@ static const char* grpcMasterService_method_names[] = {
     "/tensorflow.MasterService/CloseSession",
     "/tensorflow.MasterService/ListDevices",
     "/tensorflow.MasterService/Reset",
+    "/tensorflow.MasterService/MakeCallable",
+    "/tensorflow.MasterService/RunCallable",
+    "/tensorflow.MasterService/ReleaseCallable",
 };
 
 std::unique_ptr<MasterService::Stub> MasterService::NewStub(
@@ -64,7 +67,14 @@ MasterService::Stub::Stub(
       rpcmethod_ListDevices_(grpcMasterService_method_names[5],
                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_Reset_(grpcMasterService_method_names[6],
-                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
+                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_MakeCallable_(grpcMasterService_method_names[7],
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_RunCallable_(grpcMasterService_method_names[8],
+                             ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_ReleaseCallable_(grpcMasterService_method_names[9],
+                                 ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                 channel) {}
 
 ::grpc::Status MasterService::Stub::CreateSession(
     ::grpc::ClientContext* context, const CreateSessionRequest& request,
@@ -115,8 +125,29 @@ MasterService::Stub::Stub(
                                              context, request, response);
 }
 
+::grpc::Status MasterService::Stub::MakeCallable(
+    ::grpc::ClientContext* context, const MakeCallableRequest& request,
+    MakeCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_MakeCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::RunCallable(
+    ::grpc::ClientContext* context, const RunCallableRequest& request,
+    RunCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_RunCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::ReleaseCallable(
+    ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+    ReleaseCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReleaseCallable_, context, request, response);
+}
+
 MasterService::AsyncService::AsyncService() {
-  for (int i = 0; i < 7; ++i) {
+  for (int i = 0; i < 10; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 6ae94b74417c3fb6c4da1589bb9f532cb6d79930..8f1b589698276d5df7aa0245d57bc5bdb4a9f0db 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -25,18 +25,8 @@ limitations under the License.
 #include "grpc++/impl/codegen/stub_options.h"
 #include "grpc++/impl/codegen/sync_stream.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::CreateSessionRequest);
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::ExtendSessionRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepResponse);
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
@@ -79,6 +69,15 @@ class MasterService final {
     virtual ::grpc::Status Reset(::grpc::ClientContext* context,
                                  const ResetRequest& request,
                                  ResetResponse* response) = 0;
+    virtual ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                        const MakeCallableRequest& request,
+                                        MakeCallableResponse* response) = 0;
+    virtual ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                                       const RunCallableRequest& request,
+                                       RunCallableResponse* response) = 0;
+    virtual ::grpc::Status ReleaseCallable(
+        ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+        ReleaseCallableResponse* response) = 0;
   };
   class Stub final : public StubInterface {
    public:
@@ -104,6 +103,15 @@ class MasterService final {
     ::grpc::Status Reset(::grpc::ClientContext* context,
                          const ResetRequest& request,
                          ResetResponse* response) override;
+    ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                const MakeCallableRequest& request,
+                                MakeCallableResponse* response) override;
+    ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                               const RunCallableRequest& request,
+                               RunCallableResponse* response) override;
+    ::grpc::Status ReleaseCallable(::grpc::ClientContext* context,
+                                   const ReleaseCallableRequest& request,
+                                   ReleaseCallableResponse* response) override;
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
@@ -114,6 +122,9 @@ class MasterService final {
     const ::grpc::internal::RpcMethod rpcmethod_CloseSession_;
     const ::grpc::internal::RpcMethod rpcmethod_ListDevices_;
     const ::grpc::internal::RpcMethod rpcmethod_Reset_;
+    const ::grpc::internal::RpcMethod rpcmethod_MakeCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_RunCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReleaseCallable_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -179,6 +190,30 @@ class MasterService final {
       ::grpc::Service::RequestAsyncUnary(6, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+    void RequestMakeCallable(
+        ::grpc::ServerContext* context, MakeCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<MakeCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(7, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestRunCallable(
+        ::grpc::ServerContext* context, RunCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<RunCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(8, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestReleaseCallable(
+        ::grpc::ServerContext* context, ReleaseCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<ReleaseCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(9, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
   };
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 1088e9be66ceb7fbddfaed0691423745f362343f..b832a2115cb809b5561fc55ab8d9057f2274dcd8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -95,13 +95,35 @@ class GrpcRemoteMaster : public MasterInterface {
                 &MasterServiceStub::Reset);
   }
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::MakeCallable);
+  }
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::RunCallable);
+  }
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::ReleaseCallable);
+  }
+
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
-  port::Tracing::TraceMe TraceRpc(StringPiece name,
-                                  ::grpc::ClientContext* ctx) {
-    string trace_id = strings::StrCat(port::Tracing::UniqueId());
+  tracing::ScopedActivity TraceRpc(StringPiece name,
+                                   ::grpc::ClientContext* ctx) {
+    string trace_id = strings::StrCat(tracing::GetUniqueArg());
     ctx->AddMetadata(GrpcIdKey(), trace_id);
-    return port::Tracing::TraceMe(name, trace_id);
+    return tracing::ScopedActivity(name, trace_id);
   }
 
   void SetDeadline(::grpc::ClientContext* ctx, int64 time_in_ms) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index b3b05408b15e20ceb934267ccb66134133aff2fd..895bbd97b769219ab81180bfa63002958088276d 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -72,10 +72,12 @@ class GrpcRemoteWorker : public WorkerInterface {
     IssueRequest(request, response, createworkersession_, std::move(done));
   }
 
-  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+  void DeleteWorkerSessionAsync(CallOptions* call_opts,
+                                const DeleteWorkerSessionRequest* request,
                                 DeleteWorkerSessionResponse* response,
                                 StatusCallback done) override {
-    IssueRequest(request, response, deleteworkersession_, std::move(done));
+    IssueRequest(request, response, deleteworkersession_, std::move(done),
+                 call_opts);
   }
 
   void RegisterGraphAsync(const RegisterGraphRequest* request,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde6b785dc6e351ba0d51bef9b23d6bd05742320
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -0,0 +1,216 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+
+namespace tensorflow {
+
+namespace internal {
+class GrpcCall {
+ public:
+  explicit GrpcCall(CallContainer<GrpcCall>* container, int index, bool try_rpc,
+                    const string* request_msg, string* response_msg,
+                    int32* status_code, string* status_message)
+      : container_(container),
+        index_(index),
+        try_rpc_(try_rpc),
+        request_msg_(request_msg),
+        response_msg_(response_msg),
+        status_code_(status_code),
+        status_message_(status_message) {}
+
+  void StartCancel() { call_opts_.StartCancel(); }
+
+  void Done(const Status& s) {
+    DCHECK(container_ != nullptr);
+    if (!s.ok() && try_rpc_) {
+      DCHECK(status_code_ != nullptr);
+      DCHECK(status_message_ != nullptr);
+      *status_code_ = s.code();
+      *status_message_ = s.error_message();
+    }
+    container_->Done(s, index_);
+  }
+
+  CallOptions* call_opts() { return &call_opts_; }
+  int index() { return index_; }
+  const string& request() const { return *request_msg_; }
+  string* response() const { return response_msg_; }
+
+ private:
+  CallContainer<GrpcCall>* const container_;
+  const int index_;
+  bool try_rpc_;
+  CallOptions call_opts_;
+  const string* request_msg_;
+  string* response_msg_;
+  int* status_code_;
+  string* status_message_;
+};
+
+}  // namespace internal
+
+using internal::GrpcCall;
+
+GrpcRPCFactory::GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                               int64 timeout_in_ms)
+    : RPCFactory(), fail_fast_(fail_fast), timeout_in_ms_(timeout_in_ms) {
+  // TODO(ebrevdo): Investigate possible performance improvements by
+  // replacing this thread with a threadpool.
+  polling_thread_ =
+      ctx->env()->StartThread(ThreadOptions(), "rpc_op_grpc_factory", [this]() {
+        void* tag;
+        bool ok;
+        while (completion_queue_.Next(&tag, &ok)) {
+          GrpcClientCQTag* callback_tag = static_cast<GrpcClientCQTag*>(tag);
+          callback_tag->OnCompleted(ok);
+        }
+      });
+}
+
+GrpcRPCFactory::~GrpcRPCFactory() {
+  // The amount of time we wait depends on several parameters, including:
+  //   - the value of the fail_fast attribute.
+  //   - the timeout option of the rpc call in the proto declaration.
+  //   - the network roundtrip time and service's execution time.
+  //
+  // If a connection is made but the service doesn't ever respond, and
+  // there is no timeout option set for this rpc call, then it is
+  // possible the RPC request will wait forever.
+  //
+  completion_queue_.Shutdown();
+  delete polling_thread_;
+}
+
+void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements,
+                          const Tensor& address_t, const Tensor& method_t,
+                          const Tensor& request_t, const bool try_rpc,
+                          Tensor* response_t, Tensor* status_code_t,
+                          Tensor* status_message_t,
+                          AsyncOpKernel::DoneCallback done) {
+  if (try_rpc) {
+    // In this case status_code will never be set in the response,
+    // so we just set it to OK.
+    DCHECK(status_code_t != nullptr);
+    status_code_t->flat<int32>().setConstant(
+        static_cast<int>(errors::Code::OK));
+  }
+
+  CallContainer<GrpcCall>::CreateCallFn create_call_fn =
+      [this, &request_t, &try_rpc, response_t, status_code_t, status_message_t](
+          CallContainer<GrpcCall>* container, int index) {
+        CreateCall(request_t, try_rpc, index, container, response_t,
+                   status_code_t, status_message_t);
+      };
+
+  CallContainer<GrpcCall>::StartCallFn start_call_fn =
+      [this, &address_t, &method_t](GrpcCall* call) {
+        StartCall(address_t, method_t, call);
+      };
+
+  // This object will delete itself when done.
+  new CallContainer<GrpcCall>(ctx, num_elements, fail_fast_, try_rpc,
+                              std::move(done), std::move(create_call_fn),
+                              std::move(start_call_fn));
+}
+
+::grpc::GenericStub* GrpcRPCFactory::GetOrCreateStubForAddress(
+    const string& address) {
+  mutex_lock lock(mu_);
+
+  auto stub = stubs_.find(address);
+  if (stub != stubs_.end()) return stub->second.get();
+
+  ChannelPtr channel = CreateChannelForAddress(address);
+  auto* created = new ::grpc::GenericStub(channel);
+  stubs_[address].reset(created);
+  return created;
+}
+
+GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
+    const string& address) {
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+
+  // Set a standard backoff timeout of 1s instead of the
+  // (sometimes default) 20s.
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  return ::grpc::CreateCustomChannel(
+      /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
+}
+
+void GrpcRPCFactory::CreateCall(const Tensor& request_t, const bool try_rpc,
+                                int index, CallContainer<GrpcCall>* container,
+                                Tensor* response_t, Tensor* status_code_t,
+                                Tensor* status_message_t) {
+  auto request = request_t.flat<string>();
+  auto get_request_ptr = [&request](int64 ix) -> const string* {
+    return (request.size() > 1) ? &(request(ix)) : &(request(0));
+  };
+  auto response = response_t->flat<string>();
+  int32* status_code_ptr = nullptr;
+  string* status_message_ptr = nullptr;
+  if (try_rpc) {
+    status_code_ptr = status_code_t->flat<int32>().data();
+    status_message_ptr = status_message_t->flat<string>().data();
+  }
+  container->RegisterCall(container, index, try_rpc, get_request_ptr(index),
+                          &response(index),
+                          (try_rpc) ? &status_code_ptr[index] : nullptr,
+                          (try_rpc) ? &status_message_ptr[index] : nullptr);
+}
+
+void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
+                               GrpcCall* call) {
+  auto address = address_t.flat<string>();
+  auto method = method_t.flat<string>();
+  // Stubs are maintained by the GrpcRPCFactory class and will be
+  // deleted when the class is destroyed.
+  ::grpc::GenericStub* singleton_stub = nullptr;
+  if (address.size() == 1) {
+    singleton_stub = GetOrCreateStubForAddress(address(0));
+  }
+  auto get_stub = [&address, this,
+                   singleton_stub](int64 ix) -> ::grpc::GenericStub* {
+    return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix))
+                                : singleton_stub;
+  };
+  auto get_method_ptr = [&method](int64 ix) -> const string* {
+    return (method.size() > 1) ? &(method(ix)) : &(method(0));
+  };
+
+  int index = call->index();
+  // This object will delete itself when done.
+  new RPCState<string>(get_stub(index), &completion_queue_,
+                       *get_method_ptr(index), call->request(),
+                       call->response(),
+                       /*done=*/[call](const Status& s) { call->Done(s); },
+                       call->call_opts(), fail_fast_, timeout_in_ms_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..29394c84b551841eb9823fdea5c6f9d309cba347
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
@@ -0,0 +1,77 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+// Forward declaration of GrpcCall.
+namespace internal {
+class GrpcCall;
+}  // namespace internal
+
+class GrpcRPCFactory : public RPCFactory {
+ public:
+  explicit GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                          int64 timeout_in_ms);
+
+  // Explicit destructor to control destruction order.
+  ~GrpcRPCFactory() override;
+
+  void Call(OpKernelContext* ctx, int64 num_elements, const Tensor& address_t,
+            const Tensor& method_t, const Tensor& request_t, const bool try_rpc,
+            Tensor* response_t, Tensor* status_code_t, Tensor* status_message_t,
+            AsyncOpKernel::DoneCallback done) override;
+
+ protected:
+  typedef std::shared_ptr<::grpc::Channel> ChannelPtr;
+  virtual ChannelPtr CreateChannelForAddress(const string& address);
+
+ private:
+  // Creates a call and registers it with given `container`. The `index` is used
+  // to index into the tensor arguments.
+  void CreateCall(const Tensor& request_t, const bool try_rpc, int index,
+                  CallContainer<internal::GrpcCall>* container,
+                  Tensor* response_t, Tensor* status_code_t,
+                  Tensor* status_message_t);
+
+  // Asynchronously invokes the given `call`. The call completion is handled
+  // by the call container the call was previously registered with.
+  void StartCall(const Tensor& address_t, const Tensor& method_t,
+                 internal::GrpcCall* call);
+
+  ::grpc::GenericStub* GetOrCreateStubForAddress(const string& address);
+
+  bool fail_fast_;
+  int64 timeout_in_ms_;
+  ::grpc::CompletionQueue completion_queue_;
+  Thread* polling_thread_;  // Owned.
+
+  mutex mu_;
+  typedef std::unique_ptr<::grpc::GenericStub> StubPtr;
+  std::unordered_map<string, StubPtr> stubs_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b884489378464d7271e31e0ae1d180134becc6dc
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Used for adding the grpc factory to the RPC factory registry.
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return new GrpcRPCFactory(ctx, fail_fast, timeout_in_ms);
+  }
+};
+
+REGISTER_RPC_FACTORY("grpc", Value::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
deleted file mode 100644
index 730124c25e9a3e8d102a9dd39e4c4a17f2ce39d1..0000000000000000000000000000000000000000
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/support/slice.h"
-
-namespace grpc {
-
-namespace tensorflow_helper {
-
-const int kGrpcBufferWriterMaxBufferLength = 8192;
-
-class GrpcBufferWriter final
-    : public ::grpc::protobuf::io::ZeroCopyOutputStream {
- public:
-  explicit GrpcBufferWriter(grpc_byte_buffer** bp, int block_size)
-      : block_size_(block_size), byte_count_(0), have_backup_(false) {
-    *bp = g_core_codegen_interface->grpc_raw_byte_buffer_create(NULL, 0);
-    slice_buffer_ = &(*bp)->data.raw.slice_buffer;
-  }
-
-  ~GrpcBufferWriter() override {
-    if (have_backup_) {
-      g_core_codegen_interface->grpc_slice_unref(backup_slice_);
-    }
-  }
-
-  bool Next(void** data, int* size) override {
-    if (have_backup_) {
-      slice_ = backup_slice_;
-      have_backup_ = false;
-    } else {
-      slice_ = g_core_codegen_interface->grpc_slice_malloc(block_size_);
-    }
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    g_core_codegen_interface->grpc_slice_buffer_add(slice_buffer_, slice_);
-    return true;
-  }
-
-  void BackUp(int count) override {
-    g_core_codegen_interface->grpc_slice_buffer_pop(slice_buffer_);
-    if (count == block_size_) {
-      backup_slice_ = slice_;
-    } else {
-      backup_slice_ = g_core_codegen_interface->grpc_slice_split_tail(
-          &slice_, GRPC_SLICE_LENGTH(slice_) - count);
-      g_core_codegen_interface->grpc_slice_buffer_add(slice_buffer_, slice_);
-    }
-    // It's dangerous to keep an inlined grpc_slice as the backup slice, since
-    // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an address held by
-    // slice_buffer_.
-    have_backup_ = backup_slice_.refcount != NULL;
-    byte_count_ -= count;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override { return byte_count_; }
-
- private:
-  const int block_size_;
-  int64_t byte_count_;
-  grpc_slice_buffer* slice_buffer_;
-  bool have_backup_;
-  grpc_slice backup_slice_;
-  grpc_slice slice_;
-};
-
-class GrpcBufferReader final
-    : public ::grpc::protobuf::io::ZeroCopyInputStream {
-  typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
-      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
-  typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
-      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
-  void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
-                  grpc_byte_buffer* buffer) {
-    (g_core_codegen_interface->*ptr)(reader, buffer);
-  }
-  void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
-                  grpc_byte_buffer* buffer) {
-    int result = (g_core_codegen_interface->*ptr)(reader, buffer);
-    (void)result;
-  }
-
- public:
-  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
-      : byte_count_(0), backup_count_(0) {
-    ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
-               buffer);
-  }
-  ~GrpcBufferReader() override {
-    g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
-  }
-
-  bool Next(const void** data, int* size) override {
-    if (backup_count_ > 0) {
-      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
-              backup_count_;
-      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
-      *size = (int)backup_count_;
-      backup_count_ = 0;
-      return true;
-    }
-    if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
-                                                                &slice_)) {
-      return false;
-    }
-    g_core_codegen_interface->grpc_slice_unref(slice_);
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    return true;
-  }
-
-  void BackUp(int count) override { backup_count_ = count; }
-
-  bool Skip(int count) override {
-    const void* data;
-    int size;
-    while (Next(&data, &size)) {
-      if (size >= count) {
-        BackUp(size - count);
-        return true;
-      }
-      // size < count;
-      count -= size;
-    }
-    // error or we have too large count;
-    return false;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override {
-    return byte_count_ - backup_count_;
-  }
-
- private:
-  int64_t byte_count_;
-  int64_t backup_count_;
-  grpc_byte_buffer_reader reader_;
-  grpc_slice slice_;
-};
-
-}  // namespace tensorflow_helper
-
-// Defines specialized serialization/deserialization routines that
-// default to allowing a 2GB max message size.
-//
-// To instantiate this template for a particular type `T`, use
-// `TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(T)`, as defined below.
-template <typename T>
-class UnlimitedSizeProtoSerializationTraits {
- public:
-  static Status Serialize(const T& msg, grpc_byte_buffer** bp,
-                          bool* own_buffer) {
-    *own_buffer = true;
-    int byte_size = msg.ByteSize();
-    if (byte_size < 0) {
-      return Status(StatusCode::INTERNAL, "Message length was negative");
-    } else if (byte_size <=
-               tensorflow_helper::kGrpcBufferWriterMaxBufferLength) {
-      grpc_slice slice = g_core_codegen_interface->grpc_slice_malloc(byte_size);
-      GPR_CODEGEN_ASSERT(
-          GRPC_SLICE_END_PTR(slice) ==
-          msg.SerializeWithCachedSizesToArray(GRPC_SLICE_START_PTR(slice)));
-      *bp = g_core_codegen_interface->grpc_raw_byte_buffer_create(&slice, 1);
-      g_core_codegen_interface->grpc_slice_unref(slice);
-      return g_core_codegen_interface->ok();
-    } else {
-      tensorflow_helper::GrpcBufferWriter writer(
-          bp, tensorflow_helper::kGrpcBufferWriterMaxBufferLength);
-      return msg.SerializeToZeroCopyStream(&writer)
-                 ? g_core_codegen_interface->ok()
-                 : Status(StatusCode::INTERNAL, "Failed to serialize message");
-    }
-  }
-
-  static Status Deserialize(grpc_byte_buffer* buffer, T* msg,
-                            int max_message_size = INT_MAX) {
-    if (buffer == nullptr) {
-      return Status(StatusCode::INTERNAL, "No payload");
-    }
-    Status result = g_core_codegen_interface->ok();
-    {
-      tensorflow_helper::GrpcBufferReader reader(buffer);
-      ::grpc::protobuf::io::CodedInputStream decoder(&reader);
-      if (max_message_size == 0) {
-        // NOTE(mrry): Override maximum message size to 2GB.
-        decoder.SetTotalBytesLimit(INT_MAX, INT_MAX);
-      } else {
-        decoder.SetTotalBytesLimit(max_message_size, max_message_size);
-      }
-      if (!msg->ParseFromCodedStream(&decoder)) {
-        result = Status(StatusCode::INTERNAL, msg->InitializationErrorString());
-      }
-      if (!decoder.ConsumedEntireMessage()) {
-        result = Status(StatusCode::INTERNAL, "Did not read entire message");
-      }
-    }
-    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
-    return result;
-  }
-};
-
-}  // namespace grpc
-
-// For the given protobuf message type `MessageType`, specializes the
-// gRPC serialization and deserialization such that the default
-// maximum message size is 2GB.
-#define TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(MessageType)             \
-  namespace grpc {                                                    \
-  template <>                                                         \
-  class SerializationTraits<MessageType>                              \
-      : public UnlimitedSizeProtoSerializationTraits<MessageType> {}; \
-  }  // namespace grpc
-
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index c4ac92d809627e7134b5d4ae694f9978cd5390b4..99b6bda6b145fa89cd00bd1e1e9cca1b34b5b3a6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -106,7 +106,8 @@ GrpcServer::~GrpcServer() {
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
     const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const WorkerCreationFunction& worker_func) {
+    const WorkerCreationFunction& worker_func,
+    const StatsPublisherFactory& stats_factory) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
@@ -182,8 +183,7 @@ Status GrpcServer::Init(
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   master_impl_ = CreateMaster(&master_env_);
-  master_service_ = NewGrpcMasterService(
-      master_impl_.get(), config.operation_timeout_in_ms(), &builder);
+  master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
   worker_impl_ =
       worker_func ? worker_func(&worker_env_) : NewGrpcWorker(&worker_env_);
   worker_service_ =
@@ -218,15 +218,17 @@ Status GrpcServer::Init(
   master_env_.ops = OpRegistry::Global();
   master_env_.worker_cache = worker_cache;
   master_env_.master_session_factory =
-      [config](
+      [config, stats_factory](
           SessionOptions options, const MasterEnv* env,
           std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
           std::unique_ptr<WorkerCacheInterface> worker_cache,
-          std::unique_ptr<DeviceSet> device_set) {
+          std::unique_ptr<DeviceSet> device_set,
+          std::vector<string> filtered_worker_list) {
         options.config.MergeFrom(config);
         return new MasterSession(options, env, std::move(remote_devs),
                                  std::move(worker_cache), std::move(device_set),
-                                 CreateNoOpStatsPublisher);
+                                 std::move(filtered_worker_list),
+                                 stats_factory);
       };
   master_env_.worker_cache_factory =
       [this](const WorkerCacheFactoryOptions& options,
@@ -241,6 +243,14 @@ Status GrpcServer::Init(
   return Status::OK();
 }
 
+Status GrpcServer::Init(
+    ServiceInitFunction service_func,
+    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+    const WorkerCreationFunction& worker_func) {
+  return Init(std::move(service_func), rendezvous_mgr_func, worker_func,
+              CreateNoOpStatsPublisher);
+}
+
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
     const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
@@ -286,7 +296,7 @@ Status GrpcServer::WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(options, &channel_spec));
 
-  std::unique_ptr<GrpcChannelCache> channel_cache(
+  std::shared_ptr<GrpcChannelCache> channel_cache(
       NewGrpcChannelCache(channel_spec, GetChannelCreationFunction()));
 
   string name_prefix = strings::StrCat("/job:", *options.job_name, "/replica:0",
@@ -306,7 +316,7 @@ Status GrpcServer::WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
   }
 
   *worker_cache = NewGrpcWorkerCacheWithLocalWorker(
-      channel_cache.release(), worker_impl_.get(), name_prefix);
+      channel_cache, worker_impl_.get(), name_prefix);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 8b12ac1461d6b1fa3098197aa7697031a5d3075b..7c2f06f618a85c901ce7a7902cb8b1bc4e57be40 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "grpc++/security/credentials.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/stats_publisher_interface.h"
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
@@ -68,6 +69,11 @@ class GrpcServer : public ServerInterface {
   const string target() const override;
 
  protected:
+  Status Init(ServiceInitFunction service_func,
+              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+              const WorkerCreationFunction& worker_func,
+              const StatsPublisherFactory& stats_factory);
+
   Status Init(ServiceInitFunction service_func,
               const RendezvousMgrCreationFunction& rendezvous_mgr_func,
               const WorkerCreationFunction& worker_func);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 120a33f17b0d1f81e50dfbc844f56e3d85def096..fd1c150fa7aab95bee0c492ce553b9c7f58cd487 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -90,6 +91,15 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+Status GrpcSession::Handle(string* out_handle) {
+  mutex_lock l(mu_);
+  if (handle_.empty()) {
+    return errors::InvalidArgument("A session is not created yet....");
+  }
+  *out_handle = handle_;
+  return Status::OK();
+}
+
 Status GrpcSession::CreateImpl(CallOptions* call_options,
                                const GraphDef& graph) {
   {
@@ -273,14 +283,9 @@ Status GrpcSession::Run(const std::vector<std::pair<string, Tensor>>& inputs,
 Status GrpcSession::RunProto(CallOptions* call_options,
                              MutableRunStepRequestWrapper* req,
                              MutableRunStepResponseWrapper* resp) {
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req->set_session_handle(handle_);
-  }
+  string handle;
+  TF_RETURN_IF_ERROR(Handle(&handle));
+  req->set_session_handle(handle);
   return master_->RunStep(call_options, req, resp);
 }
 
@@ -292,14 +297,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   PartialRunSetupRequest req;
   PartialRunSetupResponse resp;
   CallOptions call_options;
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req.set_session_handle(handle_);
-  }
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   for (const string& feed : input_names) {
     req.add_feed(feed);
   }
@@ -399,10 +397,59 @@ Status GrpcSession::Reset(const SessionOptions& options,
   return ret;
 }
 
+Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
+                                 CallableHandle* out_handle) {
+  MakeCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  *req.mutable_options() = callable_options;
+  MakeCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->MakeCallable(&call_options, &req, &resp));
+  *out_handle = resp.handle();
+  return Status::OK();
+}
+
+Status GrpcSession::RunCallable(CallableHandle handle,
+                                const std::vector<Tensor>& feed_tensors,
+                                std::vector<Tensor>* fetch_tensors,
+                                RunMetadata* run_metadata) {
+  RunCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  for (const Tensor& feed : feed_tensors) {
+    feed.AsProtoTensorContent(req.mutable_feed()->Add());
+  }
+
+  RunCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->RunCallable(&call_options, &req, &resp));
+  for (const TensorProto& fetch : resp.fetch()) {
+    Tensor fetch_tensor;
+    if (!fetch_tensor.FromProto(cpu_allocator(), fetch)) {
+      return errors::Internal(
+          "Could not parse fetched tensor data in response from master.");
+    }
+    fetch_tensors->push_back(std::move(fetch_tensor));
+  }
+  return Status::OK();
+}
+
+Status GrpcSession::ReleaseCallable(CallableHandle handle) {
+  ReleaseCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  ReleaseCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  return master_->ReleaseCallable(&call_options, &req, &resp);
+}
+
 class GrpcSessionFactory : public SessionFactory {
  public:
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with(kSchemePrefix);
+    return str_util::StartsWith(options.target, kSchemePrefix);
   }
 
   Session* NewSession(const SessionOptions& options) override {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index d87956a13515fde533e746d2abd04e4a2f4959ae..63795117f9763434f5ff331d3d2d3bdb99413e81 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -82,20 +82,27 @@ class GrpcSession : public Session {
   Status Close() override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRunSetup(const std::vector<string>& input_names,
-                                 const std::vector<string>& output_names,
-                                 const std::vector<string>& target_nodes,
-                                 string* handle) override;
+  Status PRunSetup(const std::vector<string>& input_names,
+                   const std::vector<string>& output_names,
+                   const std::vector<string>& target_nodes,
+                   string* handle) override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRun(
-      const string& handle,
-      const std::vector<std::pair<string, Tensor> >& inputs,
-      const std::vector<string>& output_names,
-      std::vector<Tensor>* outputs) override;
+  Status PRun(const string& handle,
+              const std::vector<std::pair<string, Tensor> >& inputs,
+              const std::vector<string>& output_names,
+              std::vector<Tensor>* outputs) override;
 
   Status ListDevices(std::vector<DeviceAttributes>* response) override;
 
+  Status MakeCallable(const CallableOptions& callable_options,
+                      CallableHandle* out_handle) override;
+  Status RunCallable(CallableHandle handle,
+                     const std::vector<Tensor>& feed_tensors,
+                     std::vector<Tensor>* fetch_tensors,
+                     RunMetadata* run_metadata) override;
+  Status ReleaseCallable(CallableHandle handle) override;
+
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
@@ -111,6 +118,8 @@ class GrpcSession : public Session {
   // The current version of the graph.
   int64 current_graph_version_ GUARDED_BY(mu_);
 
+  Status Handle(string* out_handle) LOCKS_EXCLUDED(mu_);
+
   Status RunHelper(const RunOptions& run_options,
                    const std::vector<std::pair<string, Tensor> >& inputs,
                    const std::vector<string>& output_tensor_names,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index 335c3febe20e17e5b5ea57dc68c69e616997e14b..45b15a54a29b481b4888515f18bd913d71c1013c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -120,6 +120,49 @@ TEST(GrpcSessionTest, BasicNonProtoAPI) {
   }
 }
 
+TEST(GrpcSessionTest, BasicCallable) {
+  GraphDef graph;
+  string node_names[3];
+  // c = a * b
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  for (int iters = 0; iters < 25; ++iters) {
+    TF_CHECK_OK(session->Create(graph));
+    {
+      // Just run to target node
+      CallableOptions opts;
+      opts.add_target(node_names[2]);
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      TF_CHECK_OK(session->RunCallable(handle, {}, nullptr, nullptr));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+    {
+      // Run to a target node and a real tensor
+      CallableOptions opts;
+      opts.add_target(node_names[1]);
+      opts.add_fetch(node_names[2] + ":0");
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      std::vector<Tensor> outputs;
+      TF_CHECK_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+      ASSERT_EQ(1, outputs.size());
+      ASSERT_TRUE(outputs[0].IsInitialized());
+      ASSERT_EQ(4.0, outputs[0].flat<float>()(0));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+
+    TF_CHECK_OK(session->Close());
+  }
+}
+
 TEST(GrpcSessionTest, BasicNonProtoAPIConsistentOrder) {
   GraphDef graph;
   string node_names[3];
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 0b6f9474dd9e520b21c1915578cd8071a28ac7fd..59dbb7ae04f63989204ba24755700fb40a22b943 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -56,7 +56,11 @@ class RPCState : public GrpcClientCQTag {
     }
 
     response_ = response;
-    GrpcMaybeUnparseProto(request, &request_buf_);
+    ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_);
+    if (!s.ok()) {
+      LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: "
+                 << s.error_message();
+    }
     call_ =
         std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
     call_->StartCall();
@@ -73,7 +77,7 @@ class RPCState : public GrpcClientCQTag {
       // to Finish for client-side unary calls, ok should never be false
       s.Update(errors::Internal("unexpected ok value at rpc completion"));
     }
-    if (s.ok() && !GrpcMaybeParseProto(response_buf_, response_)) {
+    if (s.ok() && !GrpcMaybeParseProto(&response_buf_, response_)) {
       s.Update(errors::Internal("could not parse rpc response"));
     }
     if (!s.ok()) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
index c237f2dce43b9391c340736f45166c9adc2a5b78..89f83f9f24d570d96704ea0b2d09da13147b1d6c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
@@ -57,7 +57,7 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n,
          tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i),
          strings::StrCat("--num_cpus=", num_cpus),
          strings::StrCat("--num_gpus=", num_gpus)});
-    ret->subprocesses_.emplace_back(testing::CreateSubProcess(argv));
+    ret->subprocesses_.emplace_back(CreateSubProcess(argv));
     bool success = ret->subprocesses_[i]->Start();
     if (!success) {
       return errors::Internal("Could not start subprocess");
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
index 4b3a03b1d708744bded25ff4d320979bb7eb38b2..d5baaae353a99b2681ae5e0873a4cef7161845f3 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index c80728544b089016aa58ed9e4db7275eac03fd4a..e211c33732b26777697f11178909edaf6c9b65ed 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -18,115 +18,42 @@ limitations under the License.
 
 namespace tensorflow {
 
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-grpc::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-void GrpcMaybeUnparseProto(const protobuf::Message& src,
-                           grpc::ByteBuffer* dst) {
-  // TODO(sanjay): For bigger protos, serialize into a ZeroCopyOutputStream.
-  ::grpc::Slice s(src.ByteSizeLong());
-  src.SerializeWithCachedSizesToArray(
-      const_cast<uint8*>(reinterpret_cast<const uint8*>(s.begin())));
-  ::grpc::ByteBuffer buffer(&s, 1);
-  dst->Swap(&buffer);
+::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
+                                     grpc::ByteBuffer* dst) {
+  bool own_buffer;
+  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter,
+                                  protobuf::Message>(src, dst, &own_buffer);
 }
 
 // GrpcMaybeUnparseProto from a string simply copies the string to the
 // ByteBuffer.
-void GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
+::grpc::Status GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
   ::grpc::Slice s(src.data(), src.size());
   ::grpc::ByteBuffer buffer(&s, 1);
   dst->Swap(&buffer);
+  return ::grpc::Status::OK;
 }
 
-bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, protobuf::Message* dst) {
-  GrpcByteBufferSource stream;
-  if (!stream.Init(src)) return false;
-  return dst->ParseFromZeroCopyStream(&stream);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst) {
+  ::grpc::ProtoBufferReader reader(src);
+  return dst->ParseFromZeroCopyStream(&reader);
 }
 
 // Overload of GrpcParseProto so we can decode a TensorResponse without
 // extra copying.  This overload is used by the RPCState class in
 // grpc_state.h.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst) {
-  struct ByteSource : public TensorResponse::Source {
-    const ::grpc::ByteBuffer* buffer;
-    GrpcByteBufferSource src;
-    bool ok;
-
-    ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() override {
-      ok = src.Init(*buffer);
-      return &src;
-    }
-  };
-  ByteSource bs;
-  bs.buffer = &src;
-  return dst->ParseFrom(&bs).ok() && bs.ok;
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, TensorResponse* dst) {
+  ::tensorflow::GrpcByteSource byte_source(src);
+  auto s = dst->ParseFrom(&byte_source);
+  return s.ok();
 }
 
 // GrpcMaybeParseProto into a string simply copies bytes into the string.
-bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, string* dst) {
+bool GrpcMaybeParseProto(grpc::ByteBuffer* src, string* dst) {
   dst->clear();
-  dst->reserve(src.Length());
+  dst->reserve(src->Length());
   std::vector<::grpc::Slice> slices;
-  if (!src.Dump(&slices).ok()) {
+  if (!src->Dump(&slices).ok()) {
     return false;
   }
   for (const ::grpc::Slice& s : slices) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index d5e7e9f5b39e9f1ab9704de3f8ec7964096ae569..4b58781b543981c0176cdec1296f40b770829e44 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -29,6 +29,33 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse an
+// efficient byte reader from which to decode a RecvTensorResponse.
+class GrpcByteSource : public TensorResponse::Source {
+ public:
+  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
+  ~GrpcByteSource() override { DeleteStream(); }
+
+  typedef ::grpc::ProtoBufferReader Reader;
+
+  protobuf::io::ZeroCopyInputStream* contents() override {
+    DeleteStream();
+    stream_ = new (&space_) Reader(buffer_);
+    return stream_;
+  }
+
+ private:
+  void DeleteStream() {
+    if (stream_) {
+      stream_->~Reader();
+    }
+  }
+
+  ::grpc::ByteBuffer* buffer_;  // Not owned
+  Reader* stream_ = nullptr;    // Points into space_ if non-nullptr
+  char space_[sizeof(Reader)];
+};
+
 constexpr char kStreamRemovedMessage[] = "Stream removed";
 
 // Identify if the given grpc::Status corresponds to an HTTP stream removed
@@ -79,38 +106,21 @@ typedef std::shared_ptr<::grpc::Channel> SharedGrpcChannelPtr;
 inline string GrpcIdKey() { return "tf-rpc"; }
 
 // Serialize src and store in *dst.
-void GrpcMaybeUnparseProto(const protobuf::Message& src,
-                           ::grpc::ByteBuffer* dst);
+::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
+                                     ::grpc::ByteBuffer* dst);
 
 // Parse contents of src and initialize *dst with them.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, protobuf::Message* dst);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst);
 
 // Specialization for TensorResponse
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, TensorResponse* dst);
 
 // Copy string src to grpc buffer *dst.
-void GrpcMaybeUnparseProto(const string& src, ::grpc::ByteBuffer* dst);
+::grpc::Status GrpcMaybeUnparseProto(const string& src,
+                                     ::grpc::ByteBuffer* dst);
 
 // Copy grpc buffer src to string *dst.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, string* dst);
-
-// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
-class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
- public:
-  GrpcByteBufferSource();
-  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
-  bool Next(const void** data, int* size) override;
-  void BackUp(int count) override;
-  bool Skip(int count) override;
-  ::grpc::protobuf::int64 ByteCount() const override;
-
- private:
-  std::vector<::grpc::Slice> slices_;
-  int cur_;          // Current slice index.
-  int left_;         // Number of bytes in slices_[cur_] left to yield.
-  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
-  ::grpc::protobuf::int64 byte_count_;
-};
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, string* dst);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
index 5356fb36e4bb214513b5da8b1c7ac841af8db045..6eaa0b183317ee15e32de73c8acbe804d4825549 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
@@ -67,7 +67,7 @@ TEST(GrpcProto, Unparse) {
   proto.add_container("hello");
   proto.add_container("world");
   grpc::ByteBuffer buf;
-  GrpcMaybeUnparseProto(proto, &buf);
+  ASSERT_TRUE(GrpcMaybeUnparseProto(proto, &buf).ok());
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -80,7 +80,7 @@ TEST(GrpcProto, UnparseToString) {
   string str;
   CHECK(proto.SerializeToString(&str));
   grpc::ByteBuffer buf;
-  GrpcMaybeUnparseProto(str, &buf);
+  ASSERT_TRUE(GrpcMaybeUnparseProto(str, &buf).ok());
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -103,7 +103,7 @@ TEST(GrpcProto, Parse) {
     CleanupAllRequest proto = MakeProto(c.length);
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed))
+    ASSERT_TRUE(GrpcMaybeParseProto(&src, &parsed))
         << c.length << " " << c.slices;
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
   }
@@ -127,7 +127,7 @@ TEST(GrpcProto, ParseFromString) {
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     string parsed_str;
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed_str))
+    ASSERT_TRUE(GrpcMaybeParseProto(&src, &parsed_str))
         << c.length << " " << c.slices;
     ASSERT_TRUE(parsed.ParseFromString(parsed_str));
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -140,7 +140,7 @@ static void BM_UnparseGrpc(int iters, int size) {
   testing::StartTiming();
   for (int i = 0; i < iters; i++) {
     grpc::ByteBuffer buf;
-    GrpcMaybeUnparseProto(proto, &buf);
+    CHECK(GrpcMaybeUnparseProto(proto, &buf).ok());
   }
   testing::StopTiming();
 }
@@ -167,7 +167,7 @@ static void BM_ParseGrpc(int iters, int size, int num_slices) {
   testing::StartTiming();
 
   for (int i = 0; i < iters; i++) {
-    CHECK(GrpcMaybeParseProto(buf, &proto));
+    CHECK(GrpcMaybeParseProto(&buf, &proto));
   }
 
   testing::StopTiming();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index 2ed07e3669a3badd82b8ef27f45bac2b712c8978..18998bbccbb44dfb37ba8f143914d60ecfbcbf28 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -34,9 +34,9 @@ namespace {
 class GrpcWorkerCache : public WorkerCachePartial {
  public:
   // TODO(ncteisen): consider adding a config var or flag for this
-  static constexpr const size_t kGrpcWorkerCacheThreadCount = 2;
+  static constexpr const size_t kGrpcWorkerCacheThreadCount = 8;
 
-  explicit GrpcWorkerCache(GrpcChannelCache* channel_cache,
+  explicit GrpcWorkerCache(std::shared_ptr<GrpcChannelCache> channel_cache,
                            WorkerInterface* local_worker,
                            const string& local_target)
       : local_target_(local_target),
@@ -48,7 +48,6 @@ class GrpcWorkerCache : public WorkerCachePartial {
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
     threads_.clear();  // Blocks until threads exit.
-    delete channel_cache_;
   }
 
   void ListWorkers(std::vector<string>* workers) const override {
@@ -130,7 +129,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
 
   const string local_target_;
   WorkerInterface* const local_worker_;  // Not owned.
-  GrpcChannelCache* channel_cache_;      // Owned.
+  std::shared_ptr<GrpcChannelCache> channel_cache_;
   WorkerCacheLogger logger_;
   std::vector<GrpcWorkerCacheThread> threads_;
 
@@ -142,12 +141,12 @@ class GrpcWorkerCache : public WorkerCachePartial {
 
 }  // namespace
 
-WorkerCacheInterface* NewGrpcWorkerCache(GrpcChannelCache* cc) {
+WorkerCacheInterface* NewGrpcWorkerCache(std::shared_ptr<GrpcChannelCache> cc) {
   return new GrpcWorkerCache(cc, nullptr, "");
 }
 
 WorkerCacheInterface* NewGrpcWorkerCacheWithLocalWorker(
-    GrpcChannelCache* cc, WorkerInterface* local_worker,
+    std::shared_ptr<GrpcChannelCache> cc, WorkerInterface* local_worker,
     const string& local_target) {
   return new GrpcWorkerCache(cc, local_worker, local_target);
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
index 7a35fdbca08e1f7a79e77418f69efb3e4fa80e0a..d63fca74c15a5ff4dccb70f49084d2bfffdc7bcb 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
@@ -22,10 +22,10 @@ limitations under the License.
 namespace tensorflow {
 
 // The returned WorkerCacheInterface object takes the ownership of "cc".
-WorkerCacheInterface* NewGrpcWorkerCache(GrpcChannelCache* cc);
+WorkerCacheInterface* NewGrpcWorkerCache(std::shared_ptr<GrpcChannelCache> cc);
 
 WorkerCacheInterface* NewGrpcWorkerCacheWithLocalWorker(
-    GrpcChannelCache* cc, WorkerInterface* local_worker,
+    std::shared_ptr<GrpcChannelCache> cc, WorkerInterface* local_worker,
     const string& local_target);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 1beb198732ad40ed9e21f66c665ff82a231eebb6..b20e744a97160a17cd1621b38475a7c9c4f81d8f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -52,7 +52,7 @@ namespace {
 
 class GrpcWorkerService : public AsyncServiceInterface {
   // TODO(ncteisen): consider adding a config var or flag for this
-  static constexpr const size_t kGrpcWorkerServiceThreadCount = 2;
+  static constexpr const size_t kGrpcWorkerServiceThreadCount = 8;
 
  public:
   GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder)
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 1a5e2edfb240198c50d3b5d00bec1127fceff725..a54ea9379628ed1d437440c54ccd8fe25b527902 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,44 +26,10 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RegisterGraphRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphResponse);
-
-namespace tensorflow {
-class GrpcByteSource : public TensorResponse::Source {
- public:
-  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
-  ~GrpcByteSource() override { DeleteStream(); }
-
-  typedef ::grpc::tensorflow_helper::GrpcBufferReader Reader;
-
-  protobuf::io::ZeroCopyInputStream* contents() override {
-    DeleteStream();
-    stream_ = new (&space_) Reader(buffer_);
-    return stream_;
-  }
-
- private:
-  void DeleteStream() {
-    if (stream_) {
-      stream_->~Reader();
-    }
-  }
-
-  grpc_byte_buffer* buffer_;  // Not owned
-  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
-  char space_[sizeof(Reader)];
-};
-}  // namespace tensorflow
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
@@ -74,21 +40,19 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::TensorResponse.
 // Wire-format is identical to RecvTensorResponse.
 template <>
-class SerializationTraits<tensorflow::TensorResponse>
-    : public UnlimitedSizeProtoSerializationTraits<tensorflow::TensorResponse> {
+class SerializationTraits<tensorflow::TensorResponse> {
  public:
-  static Status Serialize(const tensorflow::TensorResponse& msg,
-                          grpc_byte_buffer** bp, bool* own_buffer) {
+  static Status Serialize(const tensorflow::TensorResponse& msg, ByteBuffer* bp,
+                          bool* own_buffer) {
     LOG(FATAL) << "TODO(sanjay,jeff): Implement";
     return Status();
   }
-  static Status Deserialize(grpc_byte_buffer* buffer,
-                            tensorflow::TensorResponse* msg,
-                            int max_message_size = INT_MAX) {
+  static Status Deserialize(ByteBuffer* buffer,
+                            tensorflow::TensorResponse* msg) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
-    Status result = g_core_codegen_interface->ok();
+    Status result = Status::OK;
     if (result.ok()) {
       ::tensorflow::GrpcByteSource source(buffer);
       auto s = msg->ParseFrom(&source);
@@ -98,7 +62,7 @@ class SerializationTraits<tensorflow::TensorResponse>
                             "TensorResponse parse error", s.ToString()));
       }
     }
-    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
+    buffer->Clear();
     return result;
   }
 };
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 067dc5dff5bb81f8cc1da883d226ee3cfa5638f2..b8cb5385038ed2c01d15cb5a571cd2d5ec6505c8 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -227,7 +227,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
 
   Device* dst_device;
   if (s.ok()) {
-    s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
   }
   if (!s.ok()) {
     if (rwi != nullptr) {
diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc
index 9dae5b3b926fab14c2b36955436d3956baa29fdd..84036361971b73f9fb7fe990833d5018f6321e27 100644
--- a/tensorflow/core/distributed_runtime/scheduler.cc
+++ b/tensorflow/core/distributed_runtime/scheduler.cc
@@ -80,7 +80,7 @@ Microseconds SlackAnalysis::ComputeAsap(std::vector<Microseconds>* asap_times) {
   std::vector<int> pending_count(graph_->num_node_ids());
   InitializePending(graph_, &pending_count);
 
-  std::deque<Node*> queue;
+  std::deque<const Node*> queue;
   Node* srcNode = graph_->source_node();
   queue.push_back(srcNode);
   (*asap_times)[srcNode->id()] = 0;
@@ -92,7 +92,7 @@ Microseconds SlackAnalysis::ComputeAsap(std::vector<Microseconds>* asap_times) {
     for (const Edge* out_edge : curr->out_edges()) {
       // The time needed for 'out' to get its input from 'curr'.
       Microseconds copy_time(0);
-      Node* out = out_edge->dst();
+      const Node* out = out_edge->dst();
       if (!out_edge->IsControlEdge() &&
           curr->assigned_device_name() != out->assigned_device_name()) {
         // Add an arbitrary 10microsecs for each copy.
@@ -137,7 +137,7 @@ Microseconds SlackAnalysis::ComputeAlap(std::vector<Microseconds>* alap_times) {
     }
   }
 
-  std::deque<Node*> queue;
+  std::deque<const Node*> queue;
   Node* sinkNode = graph_->sink_node();
   queue.push_back(sinkNode);
   (*alap_times)[sinkNode->id()] = 0;
@@ -148,7 +148,7 @@ Microseconds SlackAnalysis::ComputeAlap(std::vector<Microseconds>* alap_times) {
     for (const Edge* in_edge : curr->in_edges()) {
       // The time needed for 'curr' to get its input from 'src'.
       Microseconds copy_time(0);
-      Node* src = in_edge->src();
+      const Node* src = in_edge->src();
       if (!in_edge->IsControlEdge() &&
           src->assigned_device_name() != curr->assigned_device_name()) {
         // TODO(yuanbyu): Use the real cost model
@@ -236,7 +236,7 @@ Microseconds GreedyScheduler::ComputeSchedule(
 
       for (const Edge* out_edge : event.node->out_edges()) {
         Microseconds copy_time(0);
-        Node* out = out_edge->dst();
+        const Node* out = out_edge->dst();
         if (!out_edge->IsControlEdge() &&
             event.node->assigned_device_name() != out->assigned_device_name()) {
           // TODO(yuanbyu): Use below with the real cost model.
@@ -277,11 +277,11 @@ Microseconds GreedyScheduler::ComputeSchedule(
   return max_completion;
 }
 
-Node* GreedyScheduler::GetNodeWithHighestPriority(
-    const std::vector<Node*>& nodes) {
-  Node* curr_node = nullptr;
+const Node* GreedyScheduler::GetNodeWithHighestPriority(
+    const std::vector<const Node*>& nodes) {
+  const Node* curr_node = nullptr;
   int64 curr_priority = kint64max;
-  for (Node* n : nodes) {
+  for (const Node* n : nodes) {
     if ((*priority_)[n->id()] < curr_priority) {
       curr_node = n;
       curr_priority = (*priority_)[n->id()];
diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h
index ef87b9834dba50cf628a8c29c70b0266661d6227..bf9d0d1bec33284a44f69412477edb4a0963e8a1 100644
--- a/tensorflow/core/distributed_runtime/scheduler.h
+++ b/tensorflow/core/distributed_runtime/scheduler.h
@@ -57,11 +57,11 @@ class GreedyScheduler {
   struct Sim {
     int degree_parallelism;
     int num_running;
-    std::vector<Node*> ready_nodes;
+    std::vector<const Node*> ready_nodes;
   };
 
   struct Event {
-    Node* node;
+    const Node* node;
     Microseconds time;
     bool is_completion;
 
@@ -79,7 +79,7 @@ class GreedyScheduler {
 
  private:
   // Returns the ready node with the highest priority for a sim.
-  Node* GetNodeWithHighestPriority(const std::vector<Node*>& nodes);
+  const Node* GetNodeWithHighestPriority(const std::vector<const Node*>& nodes);
 
   const DeviceSet* devices_;
   const CostModel* cost_model_;
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 51b9547f53ba687c863b0fd11647e7bb82d80e03..7ef4206c780d138943851de7c66d55adbe392384 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/cluster.pb.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
@@ -33,15 +34,16 @@ SessionMgr::SessionMgr(
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
       default_worker_cache_(std::move(default_worker_cache)),
-      legacy_session_(new WorkerSession(
+      legacy_session_(WorkerSession::CreateWithBorrowedDeviceMgr(
           "", default_worker_name,
           std::unique_ptr<WorkerCacheInterface>(
               new WorkerCacheWrapper(default_worker_cache_.get())),
-          std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
+          worker_env->device_mgr,
           std::unique_ptr<GraphMgr>(
               new GraphMgr(worker_env, worker_env->device_mgr)))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
+/* static */
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
   return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
                          server_def.task_index());
@@ -55,13 +57,14 @@ Status SessionMgr::CreateSession(const string& session,
     return errors::InvalidArgument("Session must be non-empty.");
   }
 
-  const string worker_name = WorkerNameFromServerDef(server_def);
-
   WorkerCacheInterface* worker_cache = nullptr;
+  string worker_name;
   if (server_def.cluster().job().empty()) {
     worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
+    worker_name = legacy_session_->worker_name;
   } else {
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+    worker_name = WorkerNameFromServerDef(server_def);
   }
 
   if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
@@ -71,19 +74,32 @@ Status SessionMgr::CreateSession(const string& session,
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
 
-  std::vector<Device*> renamed_devices;
-  for (Device* d : worker_env_->local_devices) {
-    renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
-        worker_name, d, false, isolate_session_state));
-  }
-  std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
+  std::shared_ptr<WorkerSession> worker_session;
 
-  std::unique_ptr<GraphMgr> graph_mgr(
-      new GraphMgr(worker_env_, device_mgr.get()));
+  if (isolate_session_state) {
+    // Create a private copy of the DeviceMgr for the WorkerSession.
+    std::vector<Device*> renamed_devices;
+    for (Device* d : worker_env_->local_devices) {
+      renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
+          worker_name, d, false, isolate_session_state));
+    }
 
-  std::shared_ptr<WorkerSession> worker_session(new WorkerSession(
-      session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(device_mgr), std::move(graph_mgr)));
+    auto device_mgr = MakeUnique<DeviceMgr>(renamed_devices);
+    auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
+    worker_session.reset(
+        new WorkerSession(session, worker_name,
+                          std::unique_ptr<WorkerCacheInterface>(worker_cache),
+                          std::move(device_mgr), std::move(graph_mgr)));
+  } else {
+    // Borrown the WorkerEnv's DeviceMgr for the WorkerSession, so
+    // that resources using it can use its devices after the
+    // WorkerSession has been deleted.
+    auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr);
+    worker_session = WorkerSession::CreateWithBorrowedDeviceMgr(
+        session, worker_name,
+        std::unique_ptr<WorkerCacheInterface>(worker_cache),
+        worker_env_->device_mgr, std::move(graph_mgr));
+  }
 
   sessions_.insert(std::make_pair(session, std::move(worker_session)));
   return Status::OK();
@@ -98,20 +114,26 @@ Status SessionMgr::DeleteSession(const string& session) {
   return Status::OK();
 }
 
-std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSessionUnlocked(
-    const string& session) {
-  auto it = sessions_.find(session);
-  if (it == sessions_.end()) {
-    return legacy_session_;
+Status SessionMgr::WorkerSessionForSessionLocked(
+    const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
+  if (session_handle.empty()) {
+    *out_session = legacy_session_;
   } else {
-    return it->second;
+    auto it = sessions_.find(session_handle);
+    if (it == sessions_.end()) {
+      return errors::Aborted("Session handle is not found: ", session_handle,
+                             ". Possibly this worker just restarted.");
+    } else {
+      *out_session = it->second;
+    }
   }
+  return Status::OK();
 }
 
-std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSession(
-    const string& session) {
+Status SessionMgr::WorkerSessionForSession(
+    const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
   mutex_lock l(mu_);
-  return WorkerSessionForSessionUnlocked(session);
+  return WorkerSessionForSessionLocked(session_handle, out_session);
 }
 
 std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 4c9702d522cede454d5efd15669eaec2b0c1c1b1..04d1d614098818675cd3bc5be6bdd55c752de53e 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -50,7 +50,8 @@ class SessionMgr {
                        bool isolate_session_state);
 
   // Locates the worker session for a given session handle
-  std::shared_ptr<WorkerSession> WorkerSessionForSession(const string& session);
+  Status WorkerSessionForSession(const string& session_handle,
+                                 std::shared_ptr<WorkerSession>* out_session);
   std::shared_ptr<WorkerSession> LegacySession();
 
   Status DeleteSession(const string& session);
@@ -64,7 +65,7 @@ class SessionMgr {
   void ClearLogs();
 
  private:
-  const WorkerEnv* const worker_env_;  // Not owned.
+  WorkerEnv* const worker_env_;  // Not owned.
 
   // A note about destruction:
   // We must delete graph_mgr before device_mgr, due to shared
@@ -86,8 +87,9 @@ class SessionMgr {
 
   const WorkerCacheFactory worker_cache_factory_;
 
-  std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
-      const string& session) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  Status WorkerSessionForSessionLocked(
+      const string& session_handle, std::shared_ptr<WorkerSession>* out_session)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   // A map from session identifier to internal session structure.
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 4d028f7f4a9e5eea7cd52b67ac41b03de3f0078f..99192119a63e2553bc107eff3f79a436c455b9e3 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
 
 namespace tensorflow {
 
@@ -43,15 +44,17 @@ class FakeDevice : public Device {
 class SessionMgrTest : public ::testing::Test {
  protected:
   SessionMgrTest()
-      : device_(FakeDevice::MakeCPU(
-            "/job:mnist/replica:0/task:0/device:fakecpu:0")),
-        mgr_(&env_, "/job:mnist/replica:0/task:0",
-             std::unique_ptr<WorkerCacheInterface>(), factory_),
-        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {
-    env_.local_devices = {device_.get()};
+      : mgr_(&env_, "/job:mnist/replica:0/task:0",
+             std::unique_ptr<WorkerCacheInterface>(), factory_) {
+    Device* device =
+        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0")
+            .release();
+    env_.local_devices = {device};
+    device_mgr_.reset(new DeviceMgr(env_.local_devices));
+    env_.device_mgr = device_mgr_.get();
   }
 
-  std::unique_ptr<Device> device_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
   WorkerEnv env_;
   SessionMgr::WorkerCacheFactory factory_ =
       [](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
@@ -59,7 +62,6 @@ class SessionMgrTest : public ::testing::Test {
         return Status::OK();
       };
   SessionMgr mgr_;
-  std::shared_ptr<WorkerSession> legacy_session_;
 };
 
 TEST_F(SessionMgrTest, CreateSessionSimple) {
@@ -69,35 +71,68 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
 
   string session_handle = "test_session_handle";
   TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
-  auto session = mgr_.WorkerSessionForSession(session_handle);
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
   EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, CreateSessionClusterDefWorkerName) {
+  ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+  auto job = server_def.mutable_cluster()->add_job();
+  job->set_name("worker");
+  job->mutable_tasks()->insert({3, "localhost:3333"});
+
+  string session_handle = "test_session_handle";
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
+  EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
+  EXPECT_EQ("/job:worker/replica:0/task:3", session->worker_name);
+  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
+}
+
+TEST_F(SessionMgrTest, CreateSessionDefaultWorkerName) {
+  ServerDef server_def;
+  string session_handle = "test_session_handle";
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
+  EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
+  EXPECT_EQ("/job:mnist/replica:0/task:0", session->worker_name);
+  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
+}
+
 TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   ServerDef server_def;
   server_def.set_job_name("worker");
   server_def.set_task_index(3);
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
-  auto session_1 = mgr_.WorkerSessionForSession("handle_1");
-  std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
+  std::shared_ptr<WorkerSession> session_1;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_1", &session_1));
+  std::vector<Device*> devices_1 = session_1->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_1.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
-  auto session_2 = mgr_.WorkerSessionForSession("handle_2");
-  std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
+  std::shared_ptr<WorkerSession> session_2;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_2", &session_2));
+  std::vector<Device*> devices_2 = session_2->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_2.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
-  auto session_3 = mgr_.WorkerSessionForSession("handle_3");
-  std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
+  std::shared_ptr<WorkerSession> session_3;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_3", &session_3));
+  std::vector<Device*> devices_3 = session_3->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_3.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
-  auto session_4 = mgr_.WorkerSessionForSession("handle_4");
-  std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
+  std::shared_ptr<WorkerSession> session_4;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_4", &session_4));
+  std::vector<Device*> devices_4 = session_4->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_4.size());
 
   EXPECT_EQ(devices_1[0]->resource_manager(), devices_2[0]->resource_manager());
@@ -109,12 +144,23 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
 TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
   string session_handle = "";
-  auto session = mgr_.WorkerSessionForSession(session_handle);
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
   EXPECT_EQ(mgr_.LegacySession(), session);
 
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, UnknownSessionHandle) {
+  ServerDef server_def;
+  string session_handle = "unknown_session_handle";
+  std::shared_ptr<WorkerSession> session;
+  Status s = mgr_.WorkerSessionForSession(session_handle, &session);
+  EXPECT_TRUE(errors::IsAborted(s));
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(), "Session handle is not found"));
+}
+
 TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
   ServerDef server_def;
   server_def.set_job_name("worker");
@@ -124,7 +170,7 @@ TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
 }
 
 TEST_F(SessionMgrTest, DeleteLegacySession) {
-  TF_EXPECT_OK(mgr_.DeleteSession("legacy_session"));
+  TF_EXPECT_OK(mgr_.DeleteSession(""));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index 34a4013547b5feef12b49198bff4e733f1b9e932..fe2d1a12934dde814344b70f52fbc972f74347e0 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -81,7 +81,7 @@ void TensorResponse::InitPartial(const RecvTensorResponse& response) {
 Status TensorResponse::ParseFrom(Source* source) {
   if (!on_host_) {
     protobuf::io::CodedInputStream input(source->contents());
-    input.SetTotalBytesLimit(INT_MAX);  // Unlimited
+    input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
 
     // Pre-parse into local storage, then delegate to device.
     if (!meta_.ParseFromCodedStream(&input) || !input.ConsumedEntireMessage()) {
@@ -217,7 +217,7 @@ bool TensorResponse::ParseTensorSubmessage(
 
 bool TensorResponse::ParseFast(Source* source) {
   protobuf::io::CodedInputStream input(source->contents());
-  input.SetTotalBytesLimit(INT_MAX);  // Unlimited
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
   while (true) {
     auto p = input.ReadTagWithCutoff(127);
     int tag = GetTagFieldNumber(p.first);
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 63455493671fcd1f4282bc804f8f2a521c056dce..e9073ef9f66d5e8e861f4a985ef9b515739b3fc6 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -49,7 +49,8 @@ void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request,
   done(s);
 }
 
-void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+void Worker::DeleteWorkerSessionAsync(CallOptions* opts,
+                                      const DeleteWorkerSessionRequest* request,
                                       DeleteWorkerSessionResponse* response,
                                       StatusCallback done) {
   Status s = env_->session_mgr->DeleteSession(request->session_handle());
@@ -59,21 +60,37 @@ void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
 void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
                                 RegisterGraphResponse* response,
                                 StatusCallback done) {
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
-  Status s = session->graph_mgr->Register(
-      request->session_handle(), request->graph_def(), request->graph_options(),
-      request->debug_options(), session->cluster_flr.get(),
-      response->mutable_graph_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (s.ok()) {
+    s = session->graph_mgr->Register(
+        request->session_handle(), request->graph_def(),
+        request->graph_options(), request->debug_options(),
+        session->cluster_flr.get(), response->mutable_graph_handle());
+  }
   done(s);
 }
 
 void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
                                   DeregisterGraphResponse* response,
                                   StatusCallback done) {
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
-  Status s = session->graph_mgr->Deregister(request->graph_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (s.ok()) {
+    s = session->graph_mgr->Deregister(request->graph_handle());
+  }
 
   done(s);
 }
@@ -135,11 +152,21 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
                         StatusCallback done) {
   const int64 step_id = request->step_id();
   TRACEPRINTF("RunGraph: %lld", step_id);
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
-  Status s = PrepareRunGraph(request, &in, out);
+  s = PrepareRunGraph(request, &in, out);
   if (!s.ok()) {
     delete out;
     done(s);
@@ -209,13 +236,24 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   const int64 step_id = request->step_id();
   const string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+  std::shared_ptr<WorkerSession> session;
+
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
 
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
-  Status s = PrepareRunGraph(request, &in, out);
-  auto finish = [this, done, out, opts](const Status& s) {
+  s = PrepareRunGraph(request, &in, out);
+  auto finish = [done, out, opts](const Status& s) {
     opts->ClearCancelCallback();
     delete out;
     done(s);
@@ -247,7 +285,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
     session->graph_mgr->ExecuteAsync(
         graph_handle, step_id, session.get(), request->exec_opts(),
         nullptr /* collector */, nullptr /* response */, cm, in,
-        [this, token, step_id, session, cm](Status s) {
+        [this, token, step_id, session](Status s) {
           {
             mutex_lock l(mu_);
             cancellation_manager_->DeregisterCallback(token);
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index 62fa5f3cf54202c91b27ae03d9d34fc09b8392ec..19aeeb752c43d765c8792a88f77d1448f87ab910 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -52,7 +52,8 @@ class Worker : public WorkerInterface {
                                 CreateWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
-  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+  void DeleteWorkerSessionAsync(CallOptions* opts,
+                                const DeleteWorkerSessionRequest* request,
                                 DeleteWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index 4c58bf41a461160a6ea258aee207fffff01aa99d..a1597ee798ff27cf9e2a73f4264b548d72b85688 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -45,7 +45,7 @@ class WorkerInterface {
       CreateWorkerSessionResponse* response, StatusCallback done) = 0;
 
   virtual void DeleteWorkerSessionAsync(
-      const DeleteWorkerSessionRequest* request,
+      CallOptions* opts, const DeleteWorkerSessionRequest* request,
       DeleteWorkerSessionResponse* response, StatusCallback done) = 0;
 
   virtual void RegisterGraphAsync(const RegisterGraphRequest* request,
@@ -124,7 +124,8 @@ class WorkerInterface {
 
   Status DeleteWorkerSession(const DeleteWorkerSessionRequest* request,
                              DeleteWorkerSessionResponse* response) {
-    return CallAndWait(&ME::DeleteWorkerSessionAsync, request, response);
+    return CallAndWaitWithOptions(&ME::DeleteWorkerSessionAsync, request,
+                                  response);
   }
 
   Status RegisterGraph(const RegisterGraphRequest* request,
@@ -183,6 +184,19 @@ class WorkerInterface {
     n.WaitForNotification();
     return ret;
   }
+
+  template <typename Method, typename Req, typename Resp>
+  Status CallAndWaitWithOptions(Method func, const Req* req, Resp* resp) {
+    CallOptions call_opts;
+    Status ret;
+    Notification n;
+    (this->*func)(&call_opts, req, resp, [&ret, &n](const Status& s) {
+      ret = s;
+      n.Notify();
+    });
+    n.WaitForNotification();
+    return ret;
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index cb7059b36e98d511a8b338010096225424099a62..ca6dc1b1deaa94cb414da1e957a9f1f3b9e6b457 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -95,8 +95,43 @@ WorkerSession::WorkerSession(const string& session_name,
     : session_name(session_name),
       worker_name(worker_name),
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
-      device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)),
-      cluster_flr(new ClusterFunctionLibraryRuntime(this)) {}
+      cluster_flr(
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      device_mgr_(std::move(device_mgr)),
+      borrowed_device_mgr_(nullptr) {}
+
+/* static */
+std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr(
+    const string& session_name, const string& worker_name,
+    std::unique_ptr<WorkerCacheInterface> worker_cache,
+    DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr) {
+  return std::shared_ptr<WorkerSession>(
+      new WorkerSession(session_name, worker_name, std::move(worker_cache),
+                        borrowed_device_mgr, std::move(graph_mgr)));
+}
+
+WorkerSession::WorkerSession(const string& session_name,
+                             const string& worker_name,
+                             std::unique_ptr<WorkerCacheInterface> worker_cache,
+                             DeviceMgr* borrowed_device_mgr,
+                             std::unique_ptr<GraphMgr> graph_mgr)
+    : session_name(session_name),
+      worker_name(worker_name),
+      worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
+      graph_mgr(std::move(graph_mgr)),
+      cluster_flr(
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      device_mgr_(nullptr),
+      borrowed_device_mgr_(borrowed_device_mgr) {}
+
+WorkerSession::~WorkerSession() {
+  if (graph_mgr) {
+    Status s = graph_mgr->DeregisterAll();
+    if (!s.ok()) {
+      LOG(WARNING) << "Error during worker session deletion: " << s;
+    }
+  }
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index 0fd19ac27f20edbf8a2ed85d1c3d97eaabab3347..f1faf4936479359830bfe4dabd51e035acfa7f44 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -40,10 +40,14 @@ struct WorkerSession {
   // Object from which WorkerInterface instances can be obtained.
   const std::unique_ptr<WorkerCacheInterface> worker_cache;
 
-  // Collection of local devices. These devices are typically RenamedDevices
-  // in all except the SessionMgr.legacy_session_. legacy_session_.device_mgr
-  // == worker_env_.device_mgr, which holds the true devices.
-  const std::unique_ptr<DeviceMgr> device_mgr;
+  // Collection of local devices. These devices are typically
+  // RenamedDevices in all except the SessionMgr.legacy_session_ and
+  // sessions created with `isolate_session_state == false`. In the
+  // those cases, this method returns a pointer to a borrowed
+  // DeviceMgr (typically the `worker_env.device_mgr`).
+  DeviceMgr* device_mgr() {
+    return device_mgr_ ? device_mgr_.get() : borrowed_device_mgr_;
+  }
 
   // graph_mgr keeps track of the registered graphs of this session.
   //
@@ -57,6 +61,22 @@ struct WorkerSession {
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 std::unique_ptr<DeviceMgr> device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr);
+
+  static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr(
+      const string& session_name, const string& worker_name,
+      std::unique_ptr<WorkerCacheInterface> worker_cache,
+      DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr);
+
+  ~WorkerSession();
+
+ private:
+  WorkerSession(const string& session_name, const string& worker_name,
+                std::unique_ptr<WorkerCacheInterface> worker_cache,
+                DeviceMgr* borrowed_device_mgr,
+                std::unique_ptr<GraphMgr> graph_mgr);
+
+  const std::unique_ptr<DeviceMgr> device_mgr_;
+  DeviceMgr* const borrowed_device_mgr_;  // Not owned.
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index d977935b8a392adf1f78c38955f77f6f364502c9..2265498b5e2794bdd2782ac25fa067a7aa8b9557 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -182,13 +182,25 @@ struct FeatureTrait<
 // Returns true if sequence_example has a feature_list with the specified key.
 bool HasFeatureList(const string& key, const SequenceExample& sequence_example);
 
+template <typename T>
+struct TypeHasFeatures : std::false_type {};
+
+template <>
+struct TypeHasFeatures<Example> : std::true_type {};
+
+template <>
+struct TypeHasFeatures<Features> : std::true_type {};
+
 // A family of template functions to return mutable Features proto from a
 // container proto. Supported ProtoTypes: Example, Features.
 template <typename ProtoType>
-Features* GetFeatures(ProtoType* proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value, Features*>::type
+GetFeatures(ProtoType* proto);
 
 template <typename ProtoType>
-const Features& GetFeatures(const ProtoType& proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value,
+                        const Features&>::type
+GetFeatures(const ProtoType& proto);
 
 // Base declaration of a family of template functions to return a read only
 // repeated field of feature values.
@@ -300,7 +312,7 @@ bool HasFeature(const string& key, const Features& features);
 template <typename... FeatureType>
 bool HasFeature(const string& key, const Example& example) {
   return HasFeature<FeatureType...>(key, GetFeatures(example));
-};
+}
 
 // DEPRECATED: use HasFeature instead.
 // TODO(gorban): update all clients in a followup CL.
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 94bf34afa49f586e1bb61c1654865a5abc9abe19..1c62d37955b3135eb691460ee57e08eb9524cab5 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -61,6 +61,29 @@ static bool cpu_allocator_collect_stats = false;
 // If true, cpu allocator collects full stats.
 static bool cpu_allocator_collect_full_stats = false;
 
+// Individual allocations large than this amount will trigger a warning.
+static const double kLargeAllocationWarningThreshold = 0.1;
+
+// If cpu_allocator_collect_stats is true, warn when the total allocated memory
+// exceeds this threshold.
+static const double kTotalAllocationWarningThreshold = 0.5;
+
+static const int kMaxSingleAllocationWarnings = 5;
+static const int kMaxTotalAllocationWarnings = 1;
+
+// Cache first invocation to port::AvailableRam, as it can be expensive.
+static int64_t LargeAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kLargeAllocationWarningThreshold);
+  return value;
+}
+
+static int64_t TotalAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kTotalAllocationWarningThreshold);
+  return value;
+}
+
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
@@ -70,13 +93,23 @@ void EnableCPUAllocatorFullStats(bool enable) {
 
 class CPUAllocator : public Allocator {
  public:
-  CPUAllocator() {}
+  CPUAllocator()
+      : single_allocation_warning_count_(0),
+        total_allocation_warning_count_(0) {}
 
   ~CPUAllocator() override {}
 
   string Name() override { return "cpu"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    if (num_bytes > LargeAllocationWarningBytes() &&
+        single_allocation_warning_count_ < kMaxSingleAllocationWarnings) {
+      ++single_allocation_warning_count_;
+      LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
+                   << 100 * kLargeAllocationWarningThreshold
+                   << "% of system memory.";
+    }
+
     void* p = port::AlignedMalloc(num_bytes, alignment);
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
@@ -87,6 +120,14 @@ class CPUAllocator : public Allocator {
           std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
       stats_.max_alloc_size =
           std::max<int64>(stats_.max_alloc_size, alloc_size);
+
+      if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
+          total_allocation_warning_count_ < kMaxTotalAllocationWarnings) {
+        ++total_allocation_warning_count_;
+        LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use
+                     << "exceeds " << 100 * kTotalAllocationWarningThreshold
+                     << "% of system memory";
+      }
     }
     return p;
   }
@@ -121,6 +162,11 @@ class CPUAllocator : public Allocator {
   mutex mu_;
   AllocatorStats stats_ GUARDED_BY(mu_);
 
+  // Use <atomic> for single allocations to avoid mutex contention when
+  // statistics are disabled.
+  std::atomic<int> single_allocation_warning_count_;
+  int total_allocation_warning_count_ GUARDED_BY(mu_);
+
   TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
 };
 
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 3ce1b612464291eceb6e08d9b0f2deca70cda27a..2c87156dca61188a3a8deabf9ad483c9180ccd55 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
-#define TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
 
 #include <stdlib.h>
 
@@ -359,7 +359,12 @@ struct AllocatorAttributes {
   bool nic_compatible() const { return value & (0x1 << 1); }
   void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
   bool gpu_compatible() const { return value & (0x1 << 2); }
-  void Merge(AllocatorAttributes other) { value |= other.value; }
+  void Merge(AllocatorAttributes other) {
+    value |= other.value;
+    scope_id = (scope_id > 0 && other.scope_id == 0)
+                   ? scope_id
+                   : ((scope_id == 0) ? other.scope_id : 0);
+  }
   // Returns true if the fields set in *this is a subset of or equal to
   // those set in other.
   bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
@@ -371,6 +376,9 @@ struct AllocatorAttributes {
   // upper 8 bits in device-specific ways, and ops implemented for those
   // devices are responsible for setting those 8 bits appropriately.
   uint32 value = 0;
+  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to
+  // a named special-purpose allocator on the same device.
+  int32 scope_id = 0;
 };
 
 // Returns a trivial implementation of Allocator which uses the system
@@ -396,4 +404,4 @@ class SubAllocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index a1c39d2a7a78354239f2cdbb718160906b233ddd..87c1ddd15df4f89e29b1d073f4380e65dae531f9 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -185,7 +186,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   // check if has_list is false and some other field in attr_value is
   // set to flag the error.  This test can be made more strict once
   // support for GraphDef versions <= 4 is dropped.
-  if (StringPiece(type).starts_with("list(") && !attr_value.has_list()) {
+  if (str_util::StartsWith(type, "list(") && !attr_value.has_list()) {
     if (num_set) {
       return errors::InvalidArgument(
           "AttrValue missing value with expected type '", type, "'");
@@ -196,7 +197,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   }
 
   // Okay to have an empty list, but not to be missing a non-list value.
-  if (num_set == 0 && !StringPiece(type).starts_with("list(")) {
+  if (num_set == 0 && !str_util::StartsWith(type, "list(")) {
     return errors::InvalidArgument(
         "AttrValue missing value with expected type '", type, "'");
   }
@@ -240,29 +241,29 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
 bool ParseAttrValue(StringPiece type, StringPiece text, AttrValue* out) {
   // Parse type.
   string field_name;
-  bool is_list = type.Consume("list(");
-  if (type.Consume("string")) {
+  bool is_list = str_util::ConsumePrefix(&type, "list(");
+  if (str_util::ConsumePrefix(&type, "string")) {
     field_name = "s";
-  } else if (type.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&type, "int")) {
     field_name = "i";
-  } else if (type.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&type, "float")) {
     field_name = "f";
-  } else if (type.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&type, "bool")) {
     field_name = "b";
-  } else if (type.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&type, "type")) {
     field_name = "type";
-  } else if (type.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&type, "shape")) {
     field_name = "shape";
-  } else if (type.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&type, "tensor")) {
     field_name = "tensor";
-  } else if (type.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&type, "func")) {
     field_name = "func";
-  } else if (type.Consume("placeholder")) {
+  } else if (str_util::ConsumePrefix(&type, "placeholder")) {
     field_name = "placeholder";
   } else {
     return false;
   }
-  if (is_list && !type.Consume(")")) {
+  if (is_list && !str_util::ConsumePrefix(&type, ")")) {
     return false;
   }
 
diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc
index e4fad917ffe1d4a0790bf1fd56e3c72f841523d8..1a3994736cb5627c590c3029c7b9e163dff2351c 100644
--- a/tensorflow/core/framework/attr_value_util_test.cc
+++ b/tensorflow/core/framework/attr_value_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 
+#include <numeric>
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index 968c18bdd2159fee4eb6982c62697951d79b706c..2f79d0fa7089088955b842c3f1208875655cfcec 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_BFLOAT16_H_
 
 #include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 17e6209f8e5ad5240dfc8ca1def75c178da45c27..206396a25ab784e93daa227bcf79fe608f5df706 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -37,19 +37,27 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
 
 struct Bfloat16TestParam {
   float input;
-  float expected;
+  float expected_truncation;
+  float expected_rounding;
 };
 
 class Bfloat16Test : public ::testing::Test,
                      public ::testing::WithParamInterface<Bfloat16TestParam> {};
 
 TEST_P(Bfloat16Test, TruncateTest) {
-  bfloat16 a(GetParam().input);
+  bfloat16 truncated(GetParam().input);
   if (std::isnan(GetParam().input)) {
-    EXPECT_TRUE(std::isnan(float(a)) || std::isinf(float(a)));
+    EXPECT_TRUE(std::isnan(float(truncated)) || std::isinf(float(truncated)));
     return;
   }
-  EXPECT_EQ(GetParam().expected, float(a));
+  EXPECT_EQ(GetParam().expected_truncation, float(truncated));
+
+  bfloat16 rounded = bfloat16::round_to_bfloat16((GetParam().input));
+  if (std::isnan(GetParam().input)) {
+    EXPECT_TRUE(std::isnan(float(rounded)) || std::isinf(float(rounded)));
+    return;
+  }
+  EXPECT_EQ(GetParam().expected_rounding, float(rounded));
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -57,37 +65,48 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b1111010111000011),
-            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001001, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(1, 0b10000000, 0b1001000, 0b1111010111000011),
-            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000)},
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(1, 0b10000000, 0b1001001, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000001),
-            BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000000)},
+            BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000000),
+            BinaryToFloat(0, 0b11111111, 0b1000000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b11111111, 0b1111111, 0b1111111111111111),
-            BinaryToFloat(0, 0b11111111, 0b1111111, 0b0000000000000000)},
+            BinaryToFloat(0, 0b11111111, 0b1111111, 0b0000000000000000),
+            BinaryToFloat(0, 0b11111111, 0b1000000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(1, 0b10000000, 0b1001000, 0b1100000000000000),
-            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000)},
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(1, 0b10000000, 0b1001001, 0b0000000000000000)},
         Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0100000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b00000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b00000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b00000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b00000000, 0b1111111, 0b1100000000000000),
-            BinaryToFloat(0, 0b00000000, 0b1111111, 0b0000000000000000)}));
+            BinaryToFloat(0, 0b00000000, 0b1111111, 0b0000000000000000),
+            BinaryToFloat(0, 0b00000001, 0b0000000, 0b0000000000000000)}));
 
 TEST(Bfloat16Test, Conversion) {
   float a[100];
diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc
index 9da4828bbad7b6333336dd1215441f5c5f62151a..1258e40c9344ab9b41b6c8e0e29dce5d8da3bbf9 100644
--- a/tensorflow/core/framework/cancellation.cc
+++ b/tensorflow/core/framework/cancellation.cc
@@ -89,6 +89,10 @@ bool CancellationManager::DeregisterCallback(CancellationToken token) {
   }
 }
 
-CancellationManager::~CancellationManager() { StartCancel(); }
+CancellationManager::~CancellationManager() {
+  if (!callbacks_.empty()) {
+    StartCancel();
+  }
+}
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4ac50cbbe65862294b8526769922c2fb59c1501
--- /dev/null
+++ b/tensorflow/core/framework/collective.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/collective.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+
+string CollGroupParams::ToString() const {
+  return strings::StrCat("CollGroupParams {group_key=", group_key,
+                         " group_size=", group_size,
+                         " device_type=", device_type.type_string(),
+                         " num_tasks=", num_tasks, "}");
+}
+
+CollInstanceParams& CollInstanceParams::operator=(
+    const CollInstanceParams& other) {
+  if (this != &other) {
+    instance_key = other.instance_key;
+    type = other.type;
+    data_type = other.data_type;
+    shape = other.shape;
+    device_names.clear();
+    device_names.assign(other.device_names.begin(), other.device_names.end());
+    task_names.assign(other.task_names.begin(), other.task_names.end());
+    same_num_devices_per_task = other.same_num_devices_per_task;
+    impl_details.subdiv_offsets.assign(
+        other.impl_details.subdiv_offsets.begin(),
+        other.impl_details.subdiv_offsets.end());
+    impl_details.subdiv_permutations.clear();
+    for (auto p : other.impl_details.subdiv_permutations) {
+      impl_details.subdiv_permutations.push_back(
+          std::vector<int>(p.begin(), p.end()));
+    }
+    impl_details.subdiv_source_rank.assign(
+        other.impl_details.subdiv_source_rank.begin(),
+        other.impl_details.subdiv_source_rank.end());
+  }
+  return *this;
+}
+
+string CollInstanceParams::ToString() const {
+  string v = strings::StrCat("CollInstanceParams { instance_key=", instance_key,
+                             " type=", type, " data_type=", data_type,
+                             " shape=", shape.DebugString(), " devices {");
+  for (const auto& d : device_names) {
+    strings::StrAppend(&v, d, ",");
+  }
+  strings::StrAppend(&v, "} task_names={");
+  for (const auto& n : task_names) {
+    strings::StrAppend(&v, n, ", ");
+  }
+  strings::StrAppend(&v, "}, subdiv_offsets={");
+  for (const auto& d : impl_details.subdiv_offsets) {
+    strings::StrAppend(&v, d, ",");
+  }
+  strings::StrAppend(&v, "}, subdiv_perms={");
+  for (const auto& p : impl_details.subdiv_permutations) {
+    strings::StrAppend(&v, "{");
+    for (const auto& i : p) {
+      strings::StrAppend(&v, i, ",");
+    }
+    strings::StrAppend(&v, "}");  // one subdiv
+  }
+  if (!impl_details.subdiv_source_rank.empty()) {
+    strings::StrAppend(&v, " subdiv_source_rank={");
+    for (const auto& r : impl_details.subdiv_source_rank) {
+      strings::StrAppend(&v, r, ",");
+    }
+    strings::StrAppend(&v, "}");
+  }
+  strings::StrAppend(&v, "}");  // all subdivs
+  return v;
+}
+
+string CollTaskParams::ToString() const {
+  string v = strings::StrCat("CollTaskParams {is_local={");
+  for (const auto& b : is_local) {
+    strings::StrAppend(&v, static_cast<int>(b), ",");
+  }
+  strings::StrAppend(&v, "}}");
+  return v;
+}
+
+string CollectiveParams::ToString() const {
+  string v = strings::StrCat("CollectiveParams ", name, " {", group.ToString());
+  strings::StrAppend(&v, " ", instance.ToString());
+  strings::StrAppend(&v, " ", task.ToString());
+  strings::StrAppend(&v, " default_rank=", default_rank,
+                     " is_source=", is_source, " subdiv_rank={");
+  for (const auto& r : subdiv_rank) {
+    strings::StrAppend(&v, r, ",");
+  }
+  strings::StrAppend(&v, "}}");
+  return v;
+}
+
+/*static*/ OpKernelContext::Params* CollectiveExecutor::CtxParams(
+    OpKernelContext* ctx) {
+  return ctx->params_;
+}
+
+/*static*/
+int64 CollectiveExecutor::kInvalidId = -1;
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6fe12e7ef6da75f77966164ce801f9a8d7d9bfa
--- /dev/null
+++ b/tensorflow/core/framework/collective.h
@@ -0,0 +1,308 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+#define TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+class BufRendezvous;
+class CancellationManager;
+class CompleteGroupRequest;
+class CompleteGroupResponse;
+class CompleteInstanceRequest;
+class CompleteInstanceResponse;
+class DeviceLocality;
+class GetStepSequenceRequest;
+class GetStepSequenceResponse;
+class Op;
+class Tensor;
+
+// Types of supported collective operations.
+enum CollectiveType {
+  REDUCTION_COLLECTIVE = 0,
+  BROADCAST_COLLECTIVE,
+  UNDEFINED_COLLECTIVE,
+};
+
+// Data common to all members of a device group.
+// All members share the same device set but its order is
+// particular to an instance so it is stored there.
+struct CollGroupParams {
+  int32 group_key;
+  int32 group_size;
+  DeviceType device_type;
+  int32 num_tasks;  // number of distinct tasks in group
+  string ToString() const;
+  CollGroupParams() : device_type(DEVICE_CPU) {}
+};
+
+// The best implementation of a collective op depends on many factors
+// including the number of devices involved, the topology of
+// interconnects between them and the sizes of inputs.  This structure
+// is used in generating and representing data movement choreography
+// for each specific algorithm, hence it does not have a single, fixed
+// interpretation.  On first execution the runtime will update this
+// structure with decisions that will guide all subsequent executions.
+struct CollImplDetails {
+  std::vector<std::vector<int>> subdiv_permutations;
+  std::vector<int> subdiv_offsets;
+  // broadcast only: rank of source in each subdiv
+  std::vector<int> subdiv_source_rank;
+};
+
+// Data common to all members of a collective instance.
+struct CollInstanceParams {
+  int32 instance_key;  // Identifies all participating graph nodes.
+  CollectiveType type;
+  DataType data_type;
+  TensorShape shape;
+  // Fully qualified name of device for each member, in default rank order.
+  std::vector<string> device_names;
+  // Task name prefix of corresponding device name.
+  std::vector<string> task_names;
+  // True if every task has the same number of devices.
+  bool same_num_devices_per_task = false;
+  CollImplDetails impl_details;
+  string ToString() const;
+  CollInstanceParams& operator=(const struct CollInstanceParams& other);
+};
+
+// Data common to all instance members in the same task.
+struct CollTaskParams {
+  // True for devices that are local to the process, i.e. no RPC needed.
+  std::vector<bool> is_local;
+  string ToString() const;
+};
+
+// Unique to a single CollectiveOp node.
+struct CollectiveParams {
+  CollGroupParams group;
+  CollInstanceParams instance;
+  CollTaskParams task;
+
+  string name;             // node name used only for log or error messages
+  int default_rank;        // index of this op within device_names
+  bool is_source = false;  // broadcast only
+  // Rank of this device in each subdivision permutation.
+  std::vector<int> subdiv_rank;
+  std::unique_ptr<OpKernel> merge_op;  // reduction only
+  std::unique_ptr<OpKernel> final_op;  // reduction only
+  string ToString() const;
+};
+
+class CollectiveExecutor;
+
+// Interface that provides resolution of device localities.
+class DeviceResolverInterface {
+ public:
+  virtual ~DeviceResolverInterface() {}
+
+  // Collects DeviceLocality protobufs from all of the devices identified
+  // in 'col_params'.
+  virtual void GetDeviceLocalitiesAsync(const CollInstanceParams& inst_params,
+                                        std::vector<DeviceLocality>* localities,
+                                        const StatusCallback& done) = 0;
+
+  // Populate *locality with the DeviceLocality of the specified
+  // device.
+  virtual void GetLocalityAsync(const string& device, const string& task,
+                                DeviceLocality* locality,
+                                const StatusCallback& done) = 0;
+
+  // Clear the cache of device data belonging
+  // to the specified task.
+  virtual void ClearTask(const string& task) = 0;
+};
+
+// Interface that provides resolution of shared CollectiveParams fields.
+class ParamResolverInterface {
+ public:
+  virtual ~ParamResolverInterface() {}
+
+  // Called by each collective op at first execution in order to fill out
+  // the CollectiveParams structure with data gathered from the full
+  // (maybe distributed) collection of peer nodes.
+  virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                                   CancellationManager* cancel_mgr,
+                                   const StatusCallback& done) = 0;
+
+  // Used within a distributed implementation to discover/verify
+  // data shared across a device group.
+  virtual void CompleteGroupAsync(const CompleteGroupRequest* request,
+                                  CompleteGroupResponse* response,
+                                  CancellationManager* cancel_mgr,
+                                  const StatusCallback& done) = 0;
+
+  // Used within a distributed implementation to discover/verify data
+  // shared across an instance group.
+  virtual void CompleteInstanceAsync(const CompleteInstanceRequest* request,
+                                     CompleteInstanceResponse* response,
+                                     CancellationManager* cancel_mgr,
+                                     const StatusCallback& done) = 0;
+};
+
+// Graphs which utilize Collective Ops in a common instance must
+// execute with identical step_ids even if they are disjoint graphs
+// run by otherwise independent tasks.  This interface supplies
+// coordinated step_ids to use in such cases.
+class StepSequenceInterface {
+ public:
+  virtual ~StepSequenceInterface() {}
+
+  // Used with a distributed implementation to coordinate step_id
+  // sequences across tasks.
+  virtual void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                                    GetStepSequenceResponse* response,
+                                    const StatusCallback& done) = 0;
+
+  // Refresh the local per-graph_key step_id sequence from collective
+  // group leader, if applicable.
+  virtual void RefreshStepIdSequenceAsync(int64 graph_key,
+                                          const StatusCallback& done) = 0;
+
+  // Returns the step_id that should be used for initiating a new execution
+  // on the specified graph. May return the same step_id multiple times if
+  // RetireStepId or RefreshStepIdReservation is not called.
+  virtual int64 NextStepId(int64 graph_key) = 0;
+
+  // Reports that execution of the given step has completed successfully.
+  // Should be called immediately after a step completes with OK status,
+  // prior to calling NextStepId().  If the step fails, don't call.
+  virtual void RetireStepId(int64 graph_key, int64 step_id) = 0;
+};
+
+// Interface that provides access to per-step CollectiveExecutor
+// instances and various distributed resolution capabilities.
+class CollectiveExecutorMgrInterface : public StepSequenceInterface {
+ public:
+  virtual ~CollectiveExecutorMgrInterface() {}
+
+  // Returns the step-specific CollectiveExecutor, creating if one does not
+  // already exist.  The caller assumes ownership of one Ref on the object.
+  virtual CollectiveExecutor* FindOrCreate(int64 step_id) = 0;
+
+  // If there is a CollectiveExecutor for step_id, remove it from the
+  // table.
+  virtual void Cleanup(int64 step_id) = 0;
+
+  virtual ParamResolverInterface* GetParamResolver() const = 0;
+
+  virtual DeviceResolverInterface* GetDeviceResolver() const = 0;
+};
+
+// Interface that a Collective Op implementation uses to exchange data
+// with peers.  Note that data exchange is currently limited to types
+// for which DMAHelper::CanUseDMA() returns true, i.e.  dense numeric
+// types.
+class PeerAccessInterface {
+ public:
+  virtual ~PeerAccessInterface() {}
+
+  virtual void RecvFromPeer(const string& peer_device, const string& peer_task,
+                            bool peer_is_local, const string& key,
+                            Device* to_device, DeviceContext* to_device_ctx,
+                            const AllocatorAttributes& to_alloc_attr,
+                            Tensor* to_tensor,
+                            const DeviceLocality& client_locality,
+                            const StatusCallback& done) = 0;
+
+  virtual void PostToPeer(const string& peer_device, const string& peer_task,
+                          const string& key, Device* from_device,
+                          DeviceContext* from_device_ctx,
+                          const AllocatorAttributes& from_alloc_attr,
+                          const Tensor* from_tensor,
+                          const DeviceLocality& client_locality,
+                          const StatusCallback& done) = 0;
+};
+
+class PerStepCollectiveRemoteAccess;
+
+// A step-specific object that can execute a collective operation completely
+// described by a CollectiveParams object.
+class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
+ public:
+  virtual void StartAbort(const Status& s) {}
+
+  virtual void ExecuteAsync(OpKernelContext* ctx,
+                            const CollectiveParams& col_params,
+                            const string& exec_key, StatusCallback done) {
+    done(errors::Internal(
+        "A collective Op has been called in a context in which "
+        "a CollectiveExecutor has not been provided."));
+  }
+
+  virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                                   CancellationManager* cancel_mgr,
+                                   StatusCallback done) {
+    cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, done);
+  }
+
+  virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
+
+  // Used to designate an invalid group or instance key.
+  static int64 kInvalidId;
+
+  // Lexically scoped handle for Ref.
+  class Handle {
+   public:
+    explicit Handle(CollectiveExecutor* ce, bool inherit_ref) : ce_(ce) {
+      if (!inherit_ref) ce->Ref();
+    }
+    ~Handle() { ce_->Unref(); }
+    CollectiveExecutor* get() const { return ce_; }
+
+   private:
+    CollectiveExecutor* ce_;
+  };
+
+ protected:
+  explicit CollectiveExecutor(CollectiveExecutorMgrInterface* cem)
+      : cem_(cem) {}
+
+  // For use only by derived classes
+  static OpKernelContext::Params* CtxParams(OpKernelContext* ctx);
+  CollectiveExecutorMgrInterface* cem_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
+};
+
+// Interface of a helper object that provides a CollectiveExecutor with
+// all of the remote access it needs.
+class CollectiveRemoteAccess : public PeerAccessInterface,
+                               public DeviceResolverInterface {
+ public:
+  virtual ~CollectiveRemoteAccess() {}
+
+  virtual BufRendezvous* buf_rendezvous() = 0;
+};
+
+// A per-step version of CollectiveRemoteAccess that cleans up outstanding
+// communications in case step execution is abandoned.
+class PerStepCollectiveRemoteAccess : public CollectiveRemoteAccess {
+ public:
+  virtual ~PerStepCollectiveRemoteAccess() {}
+  virtual void StartAbort(const Status& s) = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 623248b6ce6adff8ed323acad7dae300742f8eba..0916c9b7a85122da7b4067b82b7b7cc1e7c3fb0a 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -487,6 +487,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   string data_format;
   Status s = c->GetAttr("data_format", &data_format);
 
+  std::vector<int32> dilations;
+  TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations));
+
+  if (dilations.size() != 5) {
+    return errors::InvalidArgument(
+        "Conv3D requires the dilation attribute to contain 5 values, but got: ",
+        dilations.size());
+  }
+
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
   if (strides.size() != 5) {
@@ -496,6 +505,7 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   }
 
   int32 stride_planes, stride_rows, stride_cols;
+  int32 dilation_planes, dilation_rows, dilation_cols;
   if (s.ok() && data_format == "NCDHW") {
     // Convert input_shape to NDHWC.
     auto dim = [&](char dimension) {
@@ -504,12 +514,18 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
     input_shape =
         c->MakeShape({{dim('N'), dim('0'), dim('1'), dim('2'), dim('C')}});
     stride_planes = strides[2];
-    stride_cols = strides[3];
-    stride_rows = strides[4];
+    stride_rows = strides[3];
+    stride_cols = strides[4];
+    dilation_planes = dilations[2];
+    dilation_cols = dilations[3];
+    dilation_rows = dilations[4];
   } else {
     stride_planes = strides[1];
     stride_rows = strides[2];
     stride_cols = strides[3];
+    dilation_planes = dilations[1];
+    dilation_cols = dilations[2];
+    dilation_rows = dilations[3];
   }
 
   DimensionHandle batch_size_dim = c->Dim(input_shape, 0);
@@ -530,13 +546,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
   DimensionHandle output_planes, output_rows, output_cols;
 
-  TF_RETURN_IF_ERROR(
-      GetWindowedOutputSizeFromDims(c, in_planes_dim, filter_planes_dim,
-                                    stride_planes, padding, &output_planes));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_rows_dim, filter_rows_dim, stride_rows, padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_cols_dim, filter_cols_dim, stride_cols, padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes,
+      padding, &output_planes));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding,
+      &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding,
+      &output_cols));
 
   ShapeHandle output_shape;
   if (data_format == "NCDHW") {
@@ -1210,7 +1228,7 @@ Status ConcatV2Shape(InferenceContext* c) {
                            c->num_inputs() - 1 /* dim_index */);
 }
 
-Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
+Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c, int output_index) {
   ShapeHandle shape_x = c->input(0);
   ShapeHandle shape_y = c->input(1);
   if (!c->RankKnown(shape_x) || !c->RankKnown(shape_y)) {
@@ -1272,7 +1290,7 @@ Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
     }
   }
 
-  c->set_output(0, c->MakeShape(dims));
+  c->set_output(output_index, c->MakeShape(dims));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 293c40e04d6ad9b57aabfda678216b1805a006f4..789746b4037fbf9f11d34c425272bfc44d8623be 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -265,9 +265,15 @@ Status ConcatShape(shape_inference::InferenceContext* c,
 // Shape function for concat operations.
 Status ConcatV2Shape(shape_inference::InferenceContext* c);
 
+// Shape function for binary operators that broadcast their inputs
+// and with output to output_index.
+Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c, int output_index);
+
 // Shape function for binary operators that broadcast their inputs.
 // Tested by ops/math_ops_test.cc.
-Status BroadcastBinaryOpShapeFn(InferenceContext* c);
+inline Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
+  return BroadcastBinaryOpOutputShapeFn(c, 0);
+}
 
 // Shape function for random operations.
 Status RandomShape(shape_inference::InferenceContext* c);
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 5f3e5ad45731750bfd73181c41cd029f23aab55f..919e0967c03a24eda9a22e931c2708a412159420 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -140,9 +141,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 1"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 1"));
   }
 
   {
@@ -161,10 +161,9 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5}), S({3, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains(
-                "Invalid argument: Dimensions must be equal, but are 5 and 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Invalid argument: Dimensions must be equal, but are 5 and 3"));
   }
 
   {
@@ -173,9 +172,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5, 3}), S({3, 5, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 3"));
   }
 
   {
@@ -646,15 +644,19 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) {
                     .Finalize(&op.node_def));
   };
 
-  // 1x1x1 filter
-  set_op({{1, 1, 1, 1, 1}}, "VALID");
-  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
-
   // Invalid rank for input
   INFER_ERROR("must be rank 5", op, "[4,4];[2,1,1,1]");
   // Invalid rank for filter
   INFER_ERROR("must be rank 5", op, "[1,4,4,1];[2,1,1]");
 
+  // Invalid value for strides
+  set_op({{1, 1, 1, 0, 1}}, "VALID");
+  INFER_ERROR("must be > 0", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // 1x1x1 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
   // unknown dims in the critical fields give partial inference.
   INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
   INFER_OK(op, "[1,?,2,2,1];[1,1,1,1,1]", "[d0_0,?,2,2,d1_4]");
@@ -714,6 +716,49 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) {
   INFER_OK(op, "[1,4,9,4,1];[2,2,2,1,?]", "[d0_0,2,3,1,d1_4]");
 }
 
+TEST(CommonShapeFnsTest, Conv3DDilatedShapeTest) {
+  ShapeInferenceTestOp op("Conv3D");
+  auto set_op = [&op](const std::vector<int32>& dilations,
+                      const std::vector<int32>& strides,
+                      const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv3D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("dilations", dilations)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid rank for dilation
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_ERROR("contain 5 values", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // Invalid value for dilation
+  set_op({{1, 2, 0, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_ERROR("must be >= 1", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // 2x1x1 dilation 1x1x1 filter
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 2x1x1 dilation 2x2x2 filter
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,2,2,1];[2,2,2,1,1]", "[d0_0,1,1,1,d1_4]");
+
+  // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x2x2 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 2, 2, 2, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 2, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,3,3,d1_4]");
+
+  // 2x1x1 dilation 4x4x4 input, 2x2x2 filter, 1x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "SAME");
+  INFER_OK(op, "[1,4,4,4,1];[2,2,2,1,1]", "[d0_0,d0_1,d0_2,d0_3,d1_4]");
+}
+
 TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) {
   ShapeInferenceTestOp op("DepthwiseConv2dNative");
   std::vector<int32> strides = {{1, 1, 1, 1}};
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 6ab23d92a421df8b5fb9bcf637ad805d67577aa1..775d9f6eb6a4e24e74b44988ada95eed5239f725 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -201,7 +201,7 @@ class GraphDefBuilderWrapper {
   // Also looks up the `op_def->name` in the global
   // `WhitelistedStatefulOpRegistry`.
   bool IsOpWhitelisted(const OpDef* op_def) const {
-    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+    return (str_util::EndsWith(op_def->name(), "Dataset") &&
             op_def->output_arg_size() == 1 &&
             op_def->output_arg(0).type() == DT_VARIANT) ||
            dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
@@ -305,6 +305,14 @@ class IteratorContext {
     return params_.allocator_getter(attrs);
   }
 
+  std::function<Allocator*(AllocatorAttributes)> allocator_getter() {
+    return params_.allocator_getter;
+  }
+
+  std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter() {
+    return params_.stats_aggregator_getter;
+  }
+
  private:
   Params params_;
 };
@@ -356,7 +364,7 @@ class IteratorBase {
  protected:
   // This is needed so that sub-classes of IteratorBase can call
   // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
+  // `RepeatDatasetOp::Dataset`.
   Status SaveParent(IteratorStateWriter* writer,
                     const std::unique_ptr<IteratorBase>& parent) {
     return parent->SaveInternal(writer);
@@ -364,7 +372,7 @@ class IteratorBase {
 
   // This is needed so that sub-classes of IteratorBase can call
   // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
+  // `RepeatDatasetOp::Dataset`.
   Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
                        const std::unique_ptr<IteratorBase>& parent) {
     return parent->RestoreInternal(ctx, reader);
@@ -466,11 +474,11 @@ class GraphDatasetBase : public DatasetBase {
   }
 
   // Key for storing the Dataset graph in the serialized format.
-  static const char kDatasetGraphKey[];
+  TF_EXPORT static const char kDatasetGraphKey[];
 
   // Key for storing the output node of the Dataset graph in the serialized
   // format.
-  static const char kDatasetGraphOutputNodeKey[];
+  TF_EXPORT static const char kDatasetGraphOutputNodeKey[];
 
  private:
   Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
@@ -513,7 +521,7 @@ class DatasetIterator : public IteratorBase {
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) final {
-    port::Tracing::TraceMe activity(params_.prefix);
+    tracing::ScopedActivity activity(params_.prefix);
     Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
     if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) {
       s = errors::Internal(
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 1838a8ad02d2bd5522ce3162fea53e3f5afc0309..223b74857d04593b27f7b66efb0cadbad7e3cf4c 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_DEVICE_BASE_H_
-#define TENSORFLOW_FRAMEWORK_DEVICE_BASE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
 
 #include <memory>
 #include <string>
@@ -34,11 +34,9 @@ struct SyclDevice;
 #endif
 }  // end namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -48,6 +46,7 @@ class Env;
 class EventMgr;
 class OpKernelContext;
 class ResourceMgr;
+class ScopedAllocatorMgr;
 class TensorProto;
 
 namespace thread {
@@ -68,9 +67,10 @@ class PerOpGpuDevice {
 class DeviceContext : public core::RefCounted {
  public:
   ~DeviceContext() override {}
-  virtual perftools::gputools::Stream* stream() const { return nullptr; }
-  virtual void MaintainLifetimeOnStream(
-      const Tensor* t, perftools::gputools::Stream* stream) const {}
+  virtual stream_executor::Stream* stream() const { return nullptr; }
+  virtual void MaintainLifetimeOnStream(const Tensor* t,
+                                        stream_executor::Stream* stream) const {
+  }
 
   // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
   // "device_tensor" which is on a GPU device "device". "device_tensor"
@@ -128,9 +128,11 @@ class DeviceBase {
   // using a single stream.)
   // "event_mgr" is used to delay deallocation of temporary GPU buffers.
   // TODO(pbar) Work out how to move this out of DeviceBase.
+  // GpuDeviceInfo name is an unfortunate legacy, it is used not only by GPUs
+  // but also by TPU devices (to provide default device context).
   struct GpuDeviceInfo {
     // Make sure all the defaults are NULL, so we can spot missing assignments.
-    perftools::gputools::Stream* stream = nullptr;
+    stream_executor::Stream* stream = nullptr;
     DeviceContext* default_context = nullptr;
     EventMgr* event_mgr = nullptr;
     int gpu_id = -1;
@@ -177,6 +179,16 @@ class DeviceBase {
     return GetAllocator(attr);
   }
 
+  // Return an Allocator prepared for use in particular places by graph
+  // optimization
+  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
+                                        int64 step_id) {
+    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
+    return nullptr;
+  }
+
+  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
+
   virtual const Eigen::ThreadPoolDevice* eigen_cpu_device() {
     CHECK(eigen_cpu_device_ != nullptr);
     return eigen_cpu_device_;
@@ -230,6 +242,7 @@ class DeviceBase {
  private:
   Env* const env_;
   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
+  // Set by GPUs as well as by TPU devices.
   GpuDeviceInfo* gpu_device_info_ = nullptr;
   thread::ThreadPool* device_thread_pool_ = nullptr;
   Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr;
@@ -240,4 +253,4 @@ class DeviceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_DEVICE_BASE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index eae8e6c3c10c4b49081aed0e253d9a6f382f562b..bdc1af9fdaeb2e2ba7605b6f67ea55fa0bb7977a 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -168,7 +169,7 @@ class FunctionInstantiationHelper {
         strings::StrAppend(&name, "_", i);
       }
       NodeDef* gnode = AddNode(name);
-      gnode->set_op("_Arg");
+      gnode->set_op(FunctionLibraryDefinition::kArgOp);
       AddAttr("T", dtypes[i], gnode);
       AddAttr("index", arg_index, gnode);
       result_.arg_types.push_back(dtypes[i]);
@@ -278,7 +279,7 @@ class FunctionInstantiationHelper {
       auto it = index_.lower_bound(node_name);
       while (it != index_.end() && it->first <= node_colon_bound) {
         if (it->first == node_name ||
-            tensorflow::StringPiece(it->first).starts_with(node_colon)) {
+            tensorflow::str_util::StartsWith(it->first, node_colon)) {
           nid = it->second.nid;
           break;
         }
@@ -328,7 +329,7 @@ class FunctionInstantiationHelper {
         strings::StrAppend(&name, "_", i);
       }
       NodeDef* gnode = AddNode(name);
-      gnode->set_op("_Retval");
+      gnode->set_op(FunctionLibraryDefinition::kRetOp);
       AddInput(nodes_.size() - 1, item->nid, item->idx + i);
       AddAttr("T", dtypes[i], gnode);
       AddAttr("index", (*ret_index)++, gnode);
@@ -502,7 +503,7 @@ string Print(const NodeDef& n) {
   std::vector<StringPiece> dat;
   std::vector<string> dep;
   for (StringPiece s : n.input()) {
-    if (s.Consume("^")) {
+    if (str_util::ConsumePrefix(&s, "^")) {
       dep.push_back(s.ToString());
     } else {
       dat.push_back(s);
@@ -558,9 +559,9 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
   std::vector<const NodeDef*> ret;
   std::vector<const NodeDef*> body;
   for (const NodeDef* n : nodes) {
-    if (n->op() == "_Arg") {
+    if (n->op() == FunctionLibraryDefinition::kArgOp) {
       arg.push_back(n);
-    } else if (n->op() == "_Retval") {
+    } else if (n->op() == FunctionLibraryDefinition::kRetOp) {
       ret.push_back(n);
     } else {
       body.push_back(n);
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index e27001133bbb5056abf1a3e1f5b9d69c8e01bc56..e00399f97de42ca6c683202fdec9142310fa6e2d 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -344,6 +344,11 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   Status LookUp(const string& op_type_name,
                 const OpRegistrationData** op_reg_data) const override;
 
+  // Ops created for function arguments bear the name given by `kArgOp`; those
+  // created for return values bear the name given by `kRetOp`.
+  static constexpr const char* const kArgOp = "_Arg";
+  static constexpr const char* const kRetOp = "_Retval";
+
   static constexpr const char* const kGradientOp = "SymbolicGradient";
   static constexpr const char* const kFuncAttr = "f";
 
@@ -404,6 +409,8 @@ struct FunctionBody;
 
 // Forward declare. Defined in common_runtime/device.h
 class Device;
+// Forward declare. Defined in common_runtime/device_mgr.h
+class DeviceMgr;
 
 class FunctionLibraryRuntime {
  public:
@@ -518,6 +525,9 @@ class FunctionLibraryRuntime {
   // Returns the device on which the function executes.
   virtual Device* device() = 0;
 
+  // Get the DeviceMgr from which the device was obtained.
+  virtual const DeviceMgr* device_mgr() const = 0;
+
   // Returns the function library definition that backs this runtime.
   // NOTE(mrry): The returned library definition is the default function library
   // for this runtime. The runtime may instantiate functions from separate
diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto
index bd01e86da3a646b7e6331301a03e80e90d2ce6ee..72e3c438314bd80391c38c7fab4acc994eeb92a4 100644
--- a/tensorflow/core/framework/function.proto
+++ b/tensorflow/core/framework/function.proto
@@ -30,7 +30,8 @@ message FunctionDef {
   // Attributes specific to this function definition.
   map<string, AttrValue> attr = 5;
 
-  // NOTE: field id 2 deleted on Jan 11, 2016, GraphDef version 21.
+  // NOTE: field id 2 deleted on Jan 11, 2017, GraphDef version 21.
+  reserved 2;
 
   // In both of the following fields, there is the need to specify an
   // output that is used as either the input to another node (in
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 23685e9c536a67ca33fbabdab438e7192c8a47fc..44e1383719c9c903f956fca0b1ba93ec5df4adb4 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -496,7 +496,7 @@ MySelect(x:float) -> (z:float) {
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index 1f670535d575e9bbc4196fb1f1e1c381d33ae204..f7539d37be08ce1235f35dcc0a8fd0bfcb12b434 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb_text.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -53,6 +54,12 @@ Status ValidateExternalGraphDefSyntax(const GraphDef& graph_def) {
 Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
                                  const OpRegistryInterface& op_registry,
                                  int node_offset) {
+  return AddDefaultAttrsToGraphDef(graph_def, op_registry, node_offset, false);
+}
+
+Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
+                                 const OpRegistryInterface& op_registry,
+                                 int node_offset, bool skip_unknown_ops) {
   if (node_offset > graph_def->node_size()) {
     return errors::InvalidArgument(
         "Tried to add default attrs to GraphDef "
@@ -63,8 +70,12 @@ Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
   for (int i = node_offset; i < graph_def->node_size(); ++i) {
     NodeDef* node_def = graph_def->mutable_node(i);
     const OpDef* op_def;
-    TF_RETURN_IF_ERROR(op_registry.LookUpOpDef(node_def->op(), &op_def));
-    AddDefaultsToNodeDef(*op_def, node_def);
+    Status s = op_registry.LookUpOpDef(node_def->op(), &op_def);
+    if (s.ok()) {
+      AddDefaultsToNodeDef(*op_def, node_def);
+    } else if (!skip_unknown_ops) {
+      return s;
+    }
   }
 
   return Status::OK();
@@ -84,7 +95,7 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
   std::vector<string> to_remove;
   for (const auto& attr : node_def->attr()) {
     // If the attr is not in consumer_op_def and doesn't start with '_'...
-    if (!StringPiece(attr.first).starts_with("_") &&
+    if (!str_util::StartsWith(attr.first, "_") &&
         FindAttr(attr.first, *consumer_op_def) == nullptr) {
       const OpDef::AttrDef* producer_attr_def =
           FindAttr(attr.first, *producer_op_def);
diff --git a/tensorflow/core/framework/graph_def_util.h b/tensorflow/core/framework/graph_def_util.h
index 0c6542a9f258a28f42a9caa9821ea3faf8010342..525e84a989fb0edbc8fd57ff3f3b0d0ed4b13e16 100644
--- a/tensorflow/core/framework/graph_def_util.h
+++ b/tensorflow/core/framework/graph_def_util.h
@@ -56,6 +56,12 @@ Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
                                  const OpRegistryInterface& op_registry,
                                  int node_offset);
 
+// Same as above, except for the fact that it skips nodes that aren't found in
+// op_registry if skip_unknown_ops is true.
+Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
+                                 const OpRegistryInterface& op_registry,
+                                 int node_offset, bool skip_unknown_ops);
+
 // Remove attrs from 'graph_def' that have the default value according
 // to 'producer_op_registry', but don't exist according to
 // 'consumer_op_registry'. This can allow 'graph_def' to run on the
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc
similarity index 97%
rename from tensorflow/compiler/jit/graph_to_functiondef.cc
rename to tensorflow/core/framework/graph_to_functiondef.cc
index 6fa21fa6204dcc9446081d07e2a59ccace216713..4ffa5033792e64c15323b9dc95b0f52c550006df 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/core/framework/graph_to_functiondef.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 
 #include <unordered_map>
 #include <unordered_set>
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -110,7 +111,7 @@ string NodeNameMapping::Renormalize(const string& name) const {
 }  // anonymous namespace
 
 // Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in third_party/tensorflow/python/framework/function.py.
+// code in tensorflow/python/framework/function.py.
 
 Status GraphToFunctionDef(const Graph& graph, const string& name,
                           FunctionDef* fdef) {
@@ -229,7 +230,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
   for (int n_index = 0; n_index < fdef->node_def_size(); ++n_index) {
     NodeDef* node_def = fdef->mutable_node_def(n_index);
     for (int i = 0; i < node_def->input_size(); ++i) {
-      if (StringPiece(node_def->input(i)).starts_with("^")) {
+      if (str_util::StartsWith(node_def->input(i), "^")) {
         // Control input
         const string normalized =
             node_names.Renormalize(node_def->input(i).substr(1));
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.h b/tensorflow/core/framework/graph_to_functiondef.h
similarity index 79%
rename from tensorflow/compiler/jit/graph_to_functiondef.h
rename to tensorflow/core/framework/graph_to_functiondef.h
index 3e1ae7bbbe286e464899c26f197665905fb50aec..cb0e2b2fbd5dd66656830fb8f21ab5148a2f5517 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.h
+++ b/tensorflow/core/framework/graph_to_functiondef.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_JIT_GRAPH_TO_FUNCTIONDEF_H_
-#define TENSORFLOW_COMPILER_JIT_GRAPH_TO_FUNCTIONDEF_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
+#define TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
 
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
@@ -23,11 +23,10 @@ limitations under the License.
 namespace tensorflow {
 
 // Converts 'graph' to a FunctionDef 'fdef', with name 'name'.
-// Closely modeled on the Python code in
-// third_party/tensorflow/python/framework/function.py
+// Closely modeled on the Python code in tensorflow/python/framework/function.py
 Status GraphToFunctionDef(const Graph& graph, const string& name,
                           FunctionDef* fdef);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_JIT_GRAPH_TO_FUNCTIONDEF_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
diff --git a/tensorflow/compiler/jit/graph_to_functiondef_test.cc b/tensorflow/core/framework/graph_to_functiondef_test.cc
similarity index 98%
rename from tensorflow/compiler/jit/graph_to_functiondef_test.cc
rename to tensorflow/core/framework/graph_to_functiondef_test.cc
index 676db7c4dd2fd7047e8ae9bb190daf18af6ac7cf..587e2c07ac046e7476a2da53a9ef4d8b3651410a 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef_test.cc
+++ b/tensorflow/core/framework/graph_to_functiondef_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
diff --git a/tensorflow/core/framework/graph_transfer_info.proto b/tensorflow/core/framework/graph_transfer_info.proto
index 016259ddbf5254a96086d4813a312b27593f10d9..41dd54d78c0395047ca2045e5007edc3b1b72b07 100644
--- a/tensorflow/core/framework/graph_transfer_info.proto
+++ b/tensorflow/core/framework/graph_transfer_info.proto
@@ -8,6 +8,46 @@ option java_package = "org.tensorflow.framework";
 
 import "tensorflow/core/framework/types.proto";
 
+message GraphTransferNodeInput {
+  int32 node_id = 1;
+  int32 output_port = 2;
+}
+message GraphTransferNodeInfo {
+  string name = 1;
+  int32 node_id = 2;
+  string type_name = 3;
+  int32 soc_op_id = 4;
+  int32 padding_id = 5;
+  int32 input_count = 6;
+  int32 output_count = 7;
+};
+message GraphTransferConstNodeInfo {
+  string name = 1;
+  int32 node_id = 2;
+  repeated int64 shape = 3;
+  bytes data = 4;
+  DataType dtype = 5;
+};
+message GraphTransferNodeInputInfo {
+  int32 node_id = 1;
+  repeated GraphTransferNodeInput node_input = 2;
+};
+message GraphTransferNodeOutputInfo {
+  int32 node_id = 1;
+  repeated int32 max_byte_size = 2;
+};
+message GraphTransferGraphInputNodeInfo {
+  string name = 1;
+  repeated int64 shape = 2;
+  DataType dtype = 3;
+}
+
+message GraphTransferGraphOutputNodeInfo {
+  string name = 1;
+  repeated int64 shape = 2;
+  DataType dtype = 3;
+}
+
 // Protocol buffer representing a handle to a tensorflow resource. Handles are
 // not valid across executions, but can be serialized back and forth from within
 // a single run.
@@ -16,53 +56,14 @@ message GraphTransferInfo {
     NOP = 0;
     HEXAGON = 1;
   }
-  message NodeInput {
-    int32 node_id = 1;
-    int32 output_port = 2;
-  }
-  message NodeInfo {
-    string name = 1;
-    int32 node_id = 2;
-    string type_name = 3;
-    int32 soc_op_id = 4;
-    int32 padding_id = 5;
-    int32 input_count = 6;
-    int32 output_count = 7;
-  };
-  message ConstNodeInfo {
-    string name = 1;
-    int32 node_id = 2;
-    repeated int64 shape = 3;
-    bytes data = 4;
-    DataType dtype = 5;
-  };
-  message NodeInputInfo {
-    int32 node_id = 1;
-    repeated NodeInput node_input = 2;
-  };
-  message NodeOutputInfo {
-    int32 node_id = 1;
-    repeated int32 max_byte_size = 2;
-  };
-  message GraphInputNodeInfo {
-    string name = 1;
-    repeated int64 shape = 2;
-    DataType dtype = 3;
-  }
-
-  message GraphOutputNodeInfo {
-    string name = 1;
-    repeated int64 shape = 2;
-    DataType dtype = 3;
-  }
 
-  repeated NodeInfo node_info = 1;
-  repeated ConstNodeInfo const_node_info = 2;
-  repeated NodeInputInfo node_input_info = 3;
-  repeated NodeOutputInfo node_output_info = 4;
+  repeated GraphTransferNodeInfo node_info = 1;
+  repeated GraphTransferConstNodeInfo const_node_info = 2;
+  repeated GraphTransferNodeInputInfo node_input_info = 3;
+  repeated GraphTransferNodeOutputInfo node_output_info = 4;
   // Input Node parameters of transferred graph
-  repeated GraphInputNodeInfo graph_input_node_info = 5;
-  repeated GraphOutputNodeInfo graph_output_node_info = 6;
+  repeated GraphTransferGraphInputNodeInfo graph_input_node_info = 5;
+  repeated GraphTransferGraphOutputNodeInfo graph_output_node_info = 6;
   // Destination of graph transfer
   Destination destination = 7;
 };
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index e836873f667a6971b2c12d44860e5436a04cb93c..cc583df348b8d4d5416e428698fe1a49c29f3637 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,7 +83,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
     for (const string& message : messages) {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
           << status << ", " << message;
     }
   }
@@ -103,7 +104,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     }
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
-    EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
         << "Actual error: " << status.error_message()
         << "\nDoes not contain: " << message;
   }
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 95fb3863144e8150d78f5d21722f6bc102c451ea..bad92ca9b3d8c981a5dc56485d218179190e83d0 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -131,7 +132,7 @@ Status AttrSlice::Find(StringPiece attr_name,
   // Skip AttachDef for internal attrs since it is a little bit
   // expensive and it is common for them to correctly not be included
   // in a NodeDef.
-  if (!attr_name.starts_with("_") && ndef_ != nullptr) {
+  if (!str_util::StartsWith(attr_name, "_") && ndef_ != nullptr) {
     s = AttachDef(s, *ndef_);
   }
   return s;
@@ -399,7 +400,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   size_t num_inputs = 0;
   // TODO(josh11b): Unify the input field validation.
   for (const string& input : node_def.input()) {
-    if (StringPiece(input).starts_with("^")) {
+    if (str_util::StartsWith(input, "^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
         return errors::InvalidArgument(
@@ -425,7 +426,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   }
   for (const auto& attr : node_def.attr()) {
     // Allow internal optional attributes with names starting with "_".
-    if (StringPiece(attr.first).starts_with("_")) {
+    if (str_util::StartsWith(attr.first, "_")) {
       continue;
     }
     auto iter = op_attrs.find(attr.first);
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index ae3a93eafeefb2be3a85e546c085691a72caf2e1..2a49425dba9edeacf71b0ba41b78c082809ab2ae 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -65,7 +66,7 @@ void ExpectFailure(const NodeDef& bad, const OpDef& op_def,
       << "; OpDef: " << SummarizeOpDef(op_def);
 
   LOG(INFO) << "Message: " << status.error_message();
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
       << "NodeDef: " << SummarizeNodeDef(bad)
       << "; OpDef: " << SummarizeOpDef(op_def) << "\nActual error: " << status
       << "\nDoes not contain: " << message;
@@ -265,7 +266,7 @@ void ExpectInvalidSyntax(const NodeDef& bad, const string& message) {
   EXPECT_TRUE(errors::IsInvalidArgument(status))
       << status << "; NodeDef: " << SummarizeNodeDef(bad);
 
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.ToString()), message))
       << "NodeDef: " << SummarizeNodeDef(bad) << ", " << status << ", "
       << message;
 }
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 4c38fbbe591a5d07ba4cbbea00dcbfb41ca2f403..b1d01278098b5126aa974c5c2b55868fe8810e95 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint"
 // clang-format on
 
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -110,7 +111,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 abs(
 }  // namespace numext
 }  // namespace Eigen
 
-#if defined(COMPILER_MSVC) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 namespace std {
 template <>
 struct hash<Eigen::half> {
@@ -119,6 +120,6 @@ struct hash<Eigen::half> {
   }
 };
 }  // namespace std
-#endif  // COMPILER_MSVC
+#endif  // _MSC_VER
 
 #endif  // TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index fadb60d744217daa0c569601c437146a70f9b4d5..0873d4e47bd406e3234d8f18724b45a4e3aee4b1 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -90,11 +91,15 @@ Status OpRegistry::LookUp(const string& op_type_name,
         }
       }
     }
-    Status status =
-        errors::NotFound("Op type not registered '", op_type_name,
-                         "' in binary running on ", port::Hostname(), ". ",
-                         "Make sure the Op and Kernel are registered in the "
-                         "binary running in this process.");
+    Status status = errors::NotFound(
+        "Op type not registered '", op_type_name, "' in binary running on ",
+        port::Hostname(), ". ",
+        "Make sure the Op and Kernel are registered in the "
+        "binary running in this process. Note that if you "
+        "are loading a saved graph which used ops from "
+        "tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done"
+        "before importing the graph, as contrib ops are lazily registered "
+        "when the module is first accessed.");
     VLOG(1) << status.ToString();
     return status;
   }
@@ -110,6 +115,15 @@ void OpRegistry::GetRegisteredOps(std::vector<OpDef>* op_defs) {
   }
 }
 
+void OpRegistry::GetOpRegistrationData(
+    std::vector<OpRegistrationData>* op_data) {
+  mutex_lock lock(mu_);
+  MustCallDeferred();
+  for (const auto& p : registry_) {
+    op_data->push_back(*p.second);
+  }
+}
+
 Status OpRegistry::SetWatcher(const Watcher& watcher) {
   mutex_lock lock(mu_);
   if (watcher_ && watcher) {
@@ -133,7 +147,7 @@ void OpRegistry::Export(bool include_internal, OpList* ops) const {
   out->Reserve(sorted.size());
 
   for (const auto& item : sorted) {
-    if (include_internal || !StringPiece(item.first).starts_with("_")) {
+    if (include_internal || !str_util::StartsWith(item.first, "_")) {
       *out->Add() = item.second->op_def;
     }
   }
@@ -236,10 +250,15 @@ Status OpListOpRegistry::LookUp(const string& op_type_name,
   auto iter = index_.find(op_type_name);
   if (iter == index_.end()) {
     *op_reg_data = nullptr;
-    return errors::NotFound("Op type not registered '", op_type_name,
-                            "' in binary running on ", port::Hostname(), ". ",
-                            "Make sure the Op and Kernel are registered in the "
-                            "binary running in this process.");
+    return errors::NotFound(
+        "Op type not registered '", op_type_name, "' in binary running on ",
+        port::Hostname(), ". ",
+        "Make sure the Op and Kernel are registered in the "
+        "binary running in this process. Note that if you "
+        "are loading a saved graph which used ops from "
+        "tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done"
+        "before importing the graph, as contrib ops are lazily registered "
+        "when the module is first accessed.");
   }
   *op_reg_data = iter->second;
   return Status::OK();
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index f7f1ed2a886548c39fa38239d65aa2a73564c3c4..3ccca4090d9804050c484d64a62826665b94d4d2 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -89,6 +89,9 @@ class OpRegistry : public OpRegistryInterface {
   // Get all registered ops.
   void GetRegisteredOps(std::vector<OpDef>* op_defs);
 
+  // Get all `OpRegistrationData`s.
+  void GetOpRegistrationData(std::vector<OpRegistrationData>* op_data);
+
   // Watcher, a function object.
   // The watcher, if set by SetWatcher(), is called every time an op is
   // registered via the Register function. The watcher is passed the Status
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index b57bdcb841592578de4a2026d70b0e91bae66b02..c782480f1fa859715c46785faa22d01675c3c16e 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -96,7 +97,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeOpDef(old_op_def) << " vs. "
                     << SummarizeOpDef(new_op_def);
     } else {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(error))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
           << status << " does not contain " << error;
     }
   }
@@ -118,7 +119,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeNodeDef(*node_def());
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(validation_error))
+          str_util::StrContains(status.error_message(), validation_error))
           << status << " does not contain " << validation_error;
     }
 
@@ -179,7 +180,7 @@ class OpCompatibilityTest : public OpsTestBase {
                     << SummarizeOpDef(*new_op_def);
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(compatibility_error))
+          str_util::StrContains(status.error_message(), compatibility_error))
           << status << " does not contain " << compatibility_error;
     }
   }
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index ba545a19949e5574086756dc2092033341be4b30..ca0e5e7133af61a4c8e15be7cf8df903eeb648b0 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -126,6 +126,12 @@ message OpDef {
   // -------------------------------------------------------------------------
   // Optimization constraints.
 
+  // Ops are marked as stateful if their behavior depends on some state beyond
+  // their input tensors (e.g. variable reading op) or if they have
+  // a side-effect (e.g. printing or asserting ops). Equivalently, stateless ops
+  // must always produce the same output for the same input and have
+  // no side-effects.
+  //
   // By default Ops may be moved between devices.  Stateful ops should
   // either not be moved, or should only be moved if that state can also
   // be moved (e.g. via some sort of save / restore).
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 962bc11ccbd2b9abdd4ce26dc3e75c45862cdc74..403bd0b5e22a314309ad0994a879d588c124fe54 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -112,9 +112,11 @@ bool ConsumeAttrNumber(StringPiece* sp, int64* out) {
 
 bool ConsumeCompoundAttrType(StringPiece* sp, StringPiece* out) {
   auto capture_begin = sp->begin();
-  if (sp->Consume("numbertype") || sp->Consume("numerictype") ||
-      sp->Consume("quantizedtype") || sp->Consume("realnumbertype") ||
-      sp->Consume("realnumberictype")) {
+  if (str_util::ConsumePrefix(sp, "numbertype") ||
+      str_util::ConsumePrefix(sp, "numerictype") ||
+      str_util::ConsumePrefix(sp, "quantizedtype") ||
+      str_util::ConsumePrefix(sp, "realnumbertype") ||
+      str_util::ConsumePrefix(sp, "realnumberictype")) {
     *out = StringPiece(capture_begin, sp->begin() - capture_begin);
     return true;
   }
@@ -155,32 +157,32 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   bool is_list = ConsumeListPrefix(&spec);
   string type;
   StringPiece type_string;  // Used if type == "type"
-  if (spec.Consume("string")) {
+  if (str_util::ConsumePrefix(&spec, "string")) {
     type = "string";
-  } else if (spec.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&spec, "int")) {
     type = "int";
-  } else if (spec.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&spec, "float")) {
     type = "float";
-  } else if (spec.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&spec, "bool")) {
     type = "bool";
-  } else if (spec.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&spec, "type")) {
     type = "type";
-  } else if (spec.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&spec, "shape")) {
     type = "shape";
-  } else if (spec.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&spec, "tensor")) {
     type = "tensor";
-  } else if (spec.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&spec, "func")) {
     type = "func";
   } else if (ConsumeCompoundAttrType(&spec, &type_string)) {
     type = "type";
     AttrValue* allowed = attr->mutable_allowed_values();
     VERIFY(ProcessCompoundType(type_string, allowed),
            "Expected to see a compound type, saw: ", type_string);
-  } else if (spec.Consume("{")) {
+  } else if (str_util::ConsumePrefix(&spec, "{")) {
     // e.g. "{ int32, float, bool }" or "{ \"foo\", \"bar\" }"
     AttrValue* allowed = attr->mutable_allowed_values();
     str_util::RemoveLeadingWhitespace(&spec);
-    if (spec.starts_with("\"") || spec.starts_with("'")) {
+    if (str_util::StartsWith(spec, "\"") || str_util::StartsWith(spec, "'")) {
       type = "string";  // "{ \"foo\", \"bar\" }" or "{ 'foo', 'bar' }"
       while (true) {
         StringPiece escaped_string;
@@ -193,11 +195,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                "Trouble unescaping \"", escaped_string,
                "\", got error: ", error);
         allowed->mutable_list()->add_s(unescaped);
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after strings in list, not: '", spec, "'");
           break;
         }
@@ -215,11 +218,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                  "Unrecognized type string '", type_string, "'");
           allowed->mutable_list()->add_type(dt);
         }
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after types in list, not: '", spec, "'");
           break;
         }
@@ -232,7 +236,8 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
 
   // Write the type into *attr.
   if (is_list) {
-    VERIFY(spec.Consume(")"), "Expected ) to close 'list(', not: '", spec, "'");
+    VERIFY(str_util::ConsumePrefix(&spec, ")"),
+           "Expected ) to close 'list(', not: '", spec, "'");
     str_util::RemoveLeadingWhitespace(&spec);
     attr->set_type(strings::StrCat("list(", type, ")"));
   } else {
@@ -240,7 +245,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Read optional minimum constraint at the end.
-  if ((is_list || type == "int") && spec.Consume(">=")) {
+  if ((is_list || type == "int") && str_util::ConsumePrefix(&spec, ">=")) {
     int64 min_limit = -999;
     VERIFY(ConsumeAttrNumber(&spec, &min_limit),
            "Could not parse integer lower limit after '>=', found '", spec,
@@ -250,7 +255,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Parse default value, if present.
-  if (spec.Consume("=")) {
+  if (str_util::ConsumePrefix(&spec, "=")) {
     str_util::RemoveLeadingWhitespace(&spec);
     VERIFY(ParseAttrValue(attr->type(), spec, attr->mutable_default_value()),
            "Could not parse default value '", spec, "'");
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 2d035ab90d0f4493f6b6f572d0dd8550f5098e7e..9be0dc69d2c190274b3f8d473df170f3b4ed3660 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -26,7 +26,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -238,7 +240,7 @@ static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
 Status ValidateOpDef(const OpDef& op_def) {
   using ::tensorflow::strings::Scanner;
 
-  if (!StringPiece(op_def.name()).starts_with("_")) {
+  if (!str_util::StartsWith(op_def.name(), "_")) {
     VALIDATE(Scanner(op_def.name())
                  .One(Scanner::UPPERLETTER)
                  .Any(Scanner::LETTER_DIGIT)
@@ -258,11 +260,11 @@ Status ValidateOpDef(const OpDef& op_def) {
 
     // Validate type
     StringPiece type(attr.type());
-    bool is_list = type.Consume("list(");
+    bool is_list = str_util::ConsumePrefix(&type, "list(");
     bool found = false;
     for (StringPiece valid : {"string", "int", "float", "bool", "type", "shape",
                               "tensor", "func"}) {
-      if (type.Consume(valid)) {
+      if (str_util::ConsumePrefix(&type, valid)) {
         found = true;
         break;
       }
@@ -270,8 +272,9 @@ Status ValidateOpDef(const OpDef& op_def) {
     VALIDATE(found, "Unrecognized type '", type, "' in attr '", attr.name(),
              "'");
     if (is_list) {
-      VALIDATE(type.Consume(")"), "'list(' is missing ')' in attr ",
-               attr.name(), "'s type ", attr.type());
+      VALIDATE(str_util::ConsumePrefix(&type, ")"),
+               "'list(' is missing ')' in attr ", attr.name(), "'s type ",
+               attr.type());
     }
     VALIDATE(type.empty(), "Extra '", type, "' at the end of attr ",
              attr.name(), "'s type ", attr.type());
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 2b9812d4fcbc145540155959b19dd37cf902c1a2..4514d92e387b9de90b767d0a775272469006cf04 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -57,7 +57,7 @@ class ValidateOpDefTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << "Did not see error with: " << message;
     if (!status.ok()) {
       LOG(INFO) << "message: " << status;
-      EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
           << "Actual: " << status << "\nExpected to contain: " << message;
     }
   }
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 5f2eb9d99ab11f9862bd277d93af61c05e2517f4..7f23272871abe96dfa2fd7240bfc82015178bda6 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -50,10 +50,10 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
     StringPiece to_append = str.substr(0, space);
     str.remove_prefix(space + 1);
     // Remove spaces at break.
-    while (to_append.ends_with(" ")) {
+    while (str_util::EndsWith(to_append, " ")) {
       to_append.remove_suffix(1);
     }
-    while (str.Consume(" ")) {
+    while (str_util::ConsumePrefix(&str, " ")) {
     }
 
     // Go on to the next line.
@@ -65,8 +65,9 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
 }
 
 bool ConsumeEquals(StringPiece* description) {
-  if (description->Consume("=")) {
-    while (description->Consume(" ")) {  // Also remove spaces after "=".
+  if (str_util::ConsumePrefix(description, "=")) {
+    while (str_util::ConsumePrefix(description,
+                                   " ")) {  // Also remove spaces after "=".
     }
     return true;
   }
@@ -98,7 +99,7 @@ static bool StartsWithFieldName(StringPiece line,
                                 const std::vector<string>& multi_line_fields) {
   StringPiece up_to_colon;
   if (!SplitAt(':', &line, &up_to_colon)) return false;
-  while (up_to_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&up_to_colon, " "))
     ;  // Remove leading spaces.
   for (const auto& field : multi_line_fields) {
     if (up_to_colon == field) {
@@ -119,9 +120,9 @@ static bool ConvertLine(StringPiece line,
   StringPiece up_to_colon;
   StringPiece after_colon = line;
   SplitAt(':', &after_colon, &up_to_colon);
-  while (after_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&after_colon, " "))
     ;  // Remove leading spaces.
-  if (!after_colon.Consume("\"")) {
+  if (!str_util::ConsumePrefix(&after_colon, "\"")) {
     // We only convert string fields, so don't convert this line.
     return false;
   }
@@ -181,9 +182,9 @@ string PBTxtToMultiline(StringPiece pbtxt,
 static bool FindMultiline(StringPiece line, size_t colon, string* end) {
   if (colon == StringPiece::npos) return false;
   line.remove_prefix(colon + 1);
-  while (line.Consume(" ")) {
+  while (str_util::ConsumePrefix(&line, " ")) {
   }
-  if (line.Consume("<<")) {
+  if (str_util::ConsumePrefix(&line, "<<")) {
     *end = line.ToString();
     return true;
   }
@@ -228,7 +229,7 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
     string suffix;
     while (!multiline_pbtxt.empty()) {
       SplitAt('\n', &multiline_pbtxt, &line);
-      if (line.Consume(end)) break;
+      if (str_util::ConsumePrefix(&line, end)) break;
       if (first) {
         first = false;
       } else {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 8654437059ca449432e6381b9eb3c4ba15e56f48..ca91d68f79f9e3d2bf658e8cb240ed6de93ff983 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -96,7 +96,7 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       output_memory_types_(context->output_memory_types().begin(),
                            context->output_memory_types().end()),
       graph_def_version_(context->graph_def_version()),
-      is_internal_(StringPiece(type_string()).starts_with("_")),
+      is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
@@ -282,8 +282,13 @@ OpKernelContext::~OpKernelContext() {
 }
 
 Allocator* OpKernelContext::get_allocator(AllocatorAttributes attr) {
-  Allocator* allocator =
-      params_->device->GetStepAllocator(attr, resource_manager());
+  Allocator* allocator = nullptr;
+  if (attr.scope_id > 0) {
+    allocator = params_->device->GetScopedAllocator(attr, step_id());
+    CHECK(allocator);
+  } else {
+    allocator = params_->device->GetStepAllocator(attr, resource_manager());
+  }
   if (track_allocations()) {
     mutex_lock lock(mu_);
     for (const auto& wrapped : wrapped_allocators_) {
@@ -360,7 +365,7 @@ Status OpKernelContext::input_ref_mutex(StringPiece name, mutex** out_mutex) {
 
 const Tensor& OpKernelContext::input(int index) {
   DCHECK_GE(index, 0);
-  DCHECK_LT(index, num_inputs());
+  DCHECK_LT(index, num_inputs()) << " name: " << op_kernel().name();
   DCHECK(!input_is_ref(index));
   const Tensor& tensor = *((*params_->inputs)[index].tensor);
   record_tensor_reference(tensor);
@@ -415,8 +420,8 @@ bool OpKernelContext::forward_input_to_output_with_shape(
                                ? AllocatorAttributes()
                                : output_alloc_attr(output_index);
   std::unique_ptr<Tensor> new_tensor = forward_input(
-      input_index, expected_output_dtype(output_index), output_shape,
-      output_memory_type(output_index), output_attr);
+      input_index, output_index, expected_output_dtype(output_index),
+      output_shape, output_memory_type(output_index), output_attr);
   if (new_tensor != nullptr) {
     // Transfer ownership to the output slot in OpKernelContext.
     outputs_[output_index] = TensorValue(new_tensor.release());
@@ -456,35 +461,66 @@ Status OpKernelContext::forward_input_to_output_with_shape(
 }
 
 std::unique_ptr<Tensor> OpKernelContext::forward_input(
-    int input_index, DataType output_dtype, const TensorShape& output_shape,
-    MemoryType output_memory_type, const AllocatorAttributes& output_attr) {
+    int input_index, int output_index, DataType output_dtype,
+    const TensorShape& output_shape, MemoryType output_memory_type,
+    const AllocatorAttributes& output_attr) {
   DCHECK_GE(input_index, 0);
   DCHECK_LT(input_index, num_inputs());
   const TensorValue& input = (*params_->inputs)[input_index];
-  // Check that input tensor exists, is not a ref, and has no other consumers.
-  if (input.tensor == nullptr || input.is_ref() || !input->RefCountIsOne()) {
+  // Check whether at graph construction time this output was marked
+  // either for no forwarding or with a reservation for this input.
+  // If it's reserved for this input we'll skip the refcount and
+  // AllocatorAttribute checks.
+  // TODO(tucker): Maybe we should skip all of the checks?
+  bool never_forward =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == Params::kNeverForward);
+  if (never_forward) return nullptr;
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == input_index);
+  if (!forward_expected && params_->forward_from_array != nullptr) {
+    // Check for possibly conflicting forward.
+    for (int i = 0; i < num_outputs(); ++i) {
+      if (params_->forward_from_array[i] == input_index) {
+        // This input is reserved for output i.
+        return nullptr;
+      }
+    }
+  }
+  // Check that input tensor exists and is not a ref.
+  if (input.tensor == nullptr || input.is_ref()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input type matches.
   if (input_dtype(input_index) != output_dtype) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that the input and output sizes are compatible.
   if (input.tensor->shape().num_elements() != output_shape.num_elements()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input and output memory types match, i.e.
   // that they either both live in host or both live in device memory.
   if (input_memory_type(input_index) != output_memory_type) {
+    CHECK(!forward_expected);
     return nullptr;
   }
-  // Check that output allocator attributes are not more restrictive than
-  // input allocator attributes.
-  const auto input_attr = params_->input_alloc_attrs == nullptr
-                              ? AllocatorAttributes()
-                              : input_alloc_attr(input_index);
-  if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
-    return nullptr;
+  if (!forward_expected) {
+    if (!input->RefCountIsOne()) {
+      return nullptr;
+    }
+    // Check that output allocator attributes are not more restrictive than
+    // input allocator attributes.
+    const auto input_attr = params_->input_alloc_attrs == nullptr
+                                ? AllocatorAttributes()
+                                : input_alloc_attr(input_index);
+    if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
+      return nullptr;
+    }
   }
   // TODO(rmlarsen): Use MakeUnique here. There is already a copy in
   // tensorflow/compiler/xla/ptr_util.h. Perhaps this should be part of
@@ -500,7 +536,8 @@ Status OpKernelContext::forward_input_or_allocate_temp(
     Tensor* out_temp) {
   for (int input_index : candidate_input_indices) {
     std::unique_ptr<Tensor> new_tensor =
-        forward_input(input_index, type, shape, DEVICE_MEMORY, allocator_attr);
+        forward_input(input_index, Params::kNoReservation /*output_index*/,
+                      type, shape, DEVICE_MEMORY, allocator_attr);
     if (new_tensor != nullptr) {
       *out_temp = std::move(*new_tensor);
       return Status::OK();
@@ -590,6 +627,14 @@ Status OpKernelContext::allocate_output(int index, const TensorShape& shape,
                                         Tensor** output) {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, num_outputs());
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && index >= 0 &&
+       params_->forward_from_array[index] >= 0);
+  if (forward_expected) {
+    return errors::Internal(
+        "Explicit allocate_output call where input forwarding required.  Try "
+        "turning off the ScopedAllocator optimizer.");
+  }
   AllocatorAttributes attr = output_alloc_attr(index);
   return allocate_output(index, shape, output, attr);
 }
@@ -1228,51 +1273,59 @@ const Eigen::SyclDevice& OpKernelContext::eigen_device() const {
 }
 #endif
 
+namespace {
+template <class OpKernelT>
+void CtxFailureInternal(OpKernelT* op_kernel, const char* file, int line,
+                        const Status& s) {
+  const string logging_prefix =
+      file == nullptr ? "CtxFailure: "
+                      : strings::StrCat("CtxFailure at ", io::Basename(file),
+                                        ":", line, ": ");
+
+  if (errors::IsOutOfRange(s)) {
+    // VLOG OutOfRange errors. Dataset ops create OutOfRange errors when they
+    // reach end-of-sequence.
+    VLOG(1) << logging_prefix << s;
+  } else {
+    LOG(WARNING) << logging_prefix << s;
+  }
+  op_kernel->SetStatus(s);
+}
+}  // anonymous namespace
+
 void OpKernelConstruction::CtxFailure(const Status& s) {
-  VLOG(1) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelConstruction::CtxFailureWithWarning(const Status& s) {
-  LOG(WARNING) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelConstruction::CtxFailure(const char* file, int line,
                                       const Status& s) {
-  VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-          << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 void OpKernelConstruction::CtxFailureWithWarning(const char* file, int line,
                                                  const Status& s) {
-  LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-               << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 void OpKernelContext::CtxFailure(const Status& s) {
-  VLOG(1) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelContext::CtxFailureWithWarning(const Status& s) {
-  LOG(WARNING) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelContext::CtxFailure(const char* file, int line, const Status& s) {
-  VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-          << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 void OpKernelContext::CtxFailureWithWarning(const char* file, int line,
                                             const Status& s) {
-  LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-               << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 5ccd45efc980393aa02582595dde873be7426e26..67943377b9f5cd2dcb5f9dc347011db22fac1726 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -64,10 +64,11 @@ class AsyncOpKernel;
 class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
-class OpKernelContext;       // declared below
+class OpKernelContext;       // declared below,
 class OpRegistryInterface;
 class ResourceMgr;
 class ScopedStepContainer;
+class CollectiveExecutor;
 class StepStatsCollector;
 
 class OpKernel {
@@ -532,6 +533,10 @@ class OpKernelContext {
     // computations running on other devices.
     Rendezvous* rendezvous = nullptr;
 
+    // Mechanism for executing a collective op that needs to coordinate
+    // with parallel instances runing on other devices.
+    CollectiveExecutor* collective_executor = nullptr;
+
     // The session state for this op.
     SessionState* session_state = nullptr;
 
@@ -565,6 +570,12 @@ class OpKernelContext {
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
+
+    // Support for forwarding reservations (used by ScopedAllocator).
+    static const int kNeverForward = -2;
+    static const int kNoReservation = -1;
+    // Values in [0,...) represent reservations for the indexed output.
+    const int* forward_from_array = nullptr;
   };
 
   // params must outlive the OpKernelContext.
@@ -707,14 +718,31 @@ class OpKernelContext {
   //     input[input_index] are compatible with those given in dtype, shape,
   //     memory_type, and attr,
   //   * refcount on the underlying buffer is one.
+  //   * Either there is no forwarding reservation for either input_index
+  //     or output_index or the specified input is reserved for the specified
+  //     output. More precisely:
+  //
+  //     These cases mean neither input nor output has a reservation:
+  //        forward_from_array = nullptr
+  //     OR (input_index is not in forward_from_array AND
+  //         (output_index == kNoReservation OR
+  //          forward_from_array[output_index] == kNoReservation))
+  //
+  //     This case means that input_index is reserved for output_index:
+  //        forward_from_array[output_index] == input_index
+  //
+  //     This case means the output is reserved to always be allocated,
+  //     never assigned a forwarded input:
+  //        forward_from_array[output_index] == kNeverForward
+  //
   // Otherwise returns nullptr.
   // NOTE: For Cuda kernels that read inputs using the __ldg() intrinsic,
   // forwarding is only safe if there are no reads via __ldg() after writes
   // to the same address.
   std::unique_ptr<Tensor> forward_input(
-      int input_index, DataType dtype, const TensorShape& shape,
-      MemoryType memory_type,
-      const AllocatorAttributes& attr) TF_MUST_USE_RESULT;
+      int input_index, int output_index, DataType output_dtype,
+      const TensorShape& output_shape, MemoryType output_memory_type,
+      const AllocatorAttributes& output_attr) TF_MUST_USE_RESULT;
 
   // Tries to forward one of the inputs given in input_indices to
   // output[output_index]. If none of the given inputs can be forwarded, calls
@@ -934,6 +962,10 @@ class OpKernelContext {
   // Rendezvous Send() and Recv().
   Rendezvous* rendezvous() const { return params_->rendezvous; }
 
+  CollectiveExecutor* collective_executor() const {
+    return params_->collective_executor;
+  }
+
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
@@ -1101,7 +1133,8 @@ class OpKernelContext {
   void NotifyUseOfPersistentTensor(const Tensor& tensor);
 
   Status status_;
-  Params* params_;    // not owned
+  friend class CollectiveExecutor;  // for access to params_
+  Params* params_;                  // not owned
   mutable mutex mu_;  // mutable so const accessors can acquire the lock
   gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators_ GUARDED_BY(mu_);
   gtl::InlinedVector<TensorValue, 4> outputs_;
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index b53b877f28d2c80e969fb418aa316ad96c6e2eaa..bcd409e5c54b7d63137dd9d236d21bb3ec7b4f56 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -546,9 +546,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
                                             {"T|list(type)|[DT_FLOAT]"}));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
-  EXPECT_TRUE(
-      StringPiece(GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}))
-          .contains("Invalid argument: "));
+  EXPECT_TRUE(str_util::StrContains(
+      GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}),
+      "Invalid argument: "));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {"T|int|7"},
                 error::INVALID_ARGUMENT);
@@ -565,8 +565,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernel", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
 }
@@ -585,8 +585,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernelForT", DEVICE_CPU, {"T|type|DT_FLOAT"},
                 error::INVALID_ARGUMENT);
@@ -606,8 +606,9 @@ TEST_F(OpKernelBuilderTest, BadConstraint) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("OpKernel 'BadConstraint' has constraint on attr "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "OpKernel 'BadConstraint' has constraint on attr "
                             "'T' not in NodeDef"));
 
   ExpectFailure("BadConstraint", DEVICE_CPU, {"dtype|type|DT_FLOAT"},
diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
index 389a08ac2f3f23080bfb80555da2777eaba24404..946da40d0e315ad62528b7163e5417eec17ebaa4 100644
--- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto
+++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
@@ -14,14 +14,6 @@ import "tensorflow/core/framework/types.proto";
 // not valid across executions, but can be serialized back and forth from within
 // a single run.
 message RemoteFusedGraphExecuteInfo {
-  enum NodeType {
-    UNUSED = 0;
-    GRAPH_INPUT = 1;
-    GRAPH_OUTPUT = 2;
-    FUSED_NODE = 3;
-    BORDER_INPUT = 4;
-    BORDER_OUTPUT = 5;
-  }
 
   message TensorShapeTypeProto {
     DataType dtype = 1;
diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc
index 90756a4f2fceb366f2ec0eb991adc31dcf884d99..e84143f1b9ac2e2ef1b0d7121fd8167fe7328d1b 100644
--- a/tensorflow/core/framework/rendezvous.cc
+++ b/tensorflow/core/framework/rendezvous.cc
@@ -296,7 +296,9 @@ class LocalRendezvousImpl : public Rendezvous {
   Status status_ GUARDED_BY(mu_);
 
   ~LocalRendezvousImpl() override {
-    StartAbort(errors::Cancelled("LocalRendezvousImpl deleted"));
+    if (!table_.empty()) {
+      StartAbort(errors::Cancelled("LocalRendezvousImpl deleted"));
+    }
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(LocalRendezvousImpl);
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 9a458431e7c1b038c3177b2aa58e21dfa3e4e837..c84ea3b034cc20329b20af111f6b08ceebbfb80b 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -319,14 +319,13 @@ class IsResourceInitialized : public OpKernel {
 // specified type. The type will be a part of the generated op name.
 // TODO(apassos): figure out how to get non-cpu-allocated tensors to work
 // through constant folding so this doesn't have to be marked as stateful.
-#define REGISTER_RESOURCE_HANDLE_OP(Type)                   \
-  REGISTER_OP(#Type "HandleOp")                             \
-      .Attr("container: string = ''")                       \
-      .Attr("shared_name: string = ''")                     \
-      .Output("resource: resource")                         \
-      .SetIsStateful()                                      \
-      .SetShapeFn(tensorflow::shape_inference::ScalarShape) \
-      .Doc("Creates a handle to a " #Type)
+#define REGISTER_RESOURCE_HANDLE_OP(Type) \
+  REGISTER_OP(#Type "HandleOp")           \
+      .Attr("container: string = ''")     \
+      .Attr("shared_name: string = ''")   \
+      .Output("resource: resource")       \
+      .SetIsStateful()                    \
+      .SetShapeFn(tensorflow::shape_inference::ScalarShape)
 
 // Utility op kernel to produce a handle to a resource of type T.
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 07272e2374cbf4fb46c5b8da5df73ef4d6858c62..798220d4c35c502df61c93b78ccd100d7c4b5ad5 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -71,7 +72,7 @@ string LookupOrCreate(ResourceMgr* rm, const string& container,
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
new file mode 100644
index 0000000000000000000000000000000000000000..872b8f8b304affe8ab8d3fd5470611408dced9bd
--- /dev/null
+++ b/tensorflow/core/framework/resource_var.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+
+namespace tensorflow {
+
+// Resource stored by variables in the resource manager
+// (new, resource-style version).
+class Var : public ResourceBase {
+ public:
+  explicit Var(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  Var(const Var&) = delete;
+  Var& operator=(const Var&) = delete;
+
+  // TODO(ebrevdo): Use LockSet instead of exposing mu.
+  mutex* mu() { return &mu_; }
+  Tensor* tensor() { return &tensor_; }
+
+  string DebugString() override {
+    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+                           tensor_.shape().DebugString());
+  }
+
+  // Only used in the resource variable path. In resource variables,
+  // tensor.IsInitialized() can be true (i.e. have memory allocated to it) while
+  // there is not a good value there due to a race condition, and it's possible
+  // to stumble upon this during variable.initialized_value(). So it's best to
+  // just store directly whether the variable is initialized.
+  bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
+                                // it.
+
+ private:
+  mutex mu_;
+  Tensor tensor_;
+
+  ~Var() override {}
+};
+
+}  //  end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
diff --git a/tensorflow/core/framework/session_state.h b/tensorflow/core/framework/session_state.h
index 8fbe940f6aefe70cad3016ecc99528bae1b761dd..653a661dd234a9f9739c0fe7254dd0939ce63223 100644
--- a/tensorflow/core/framework/session_state.h
+++ b/tensorflow/core/framework/session_state.h
@@ -74,6 +74,12 @@ class TensorStore {
   Status SaveTensors(const std::vector<string>& output_names,
                      SessionState* session_state);
 
+  // Returns true if no tensors have been added to this store.
+  bool empty() {
+    mutex_lock l(lock_);
+    return tensors_.empty();
+  }
+
  private:
   mutex lock_;
 
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 641681973a1004f15163217684001c96592731d8..2b995e8b5e8c845078de6f0f82f204f6fa781ee8 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -40,6 +40,7 @@ InferenceContext::InferenceContext(
     : graph_def_version_(graph_def_version),
       node_def_(CHECK_NOTNULL(node_def)) {
   std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  input_tensors_as_shape_handles.reserve(input_tensors_as_shapes.size());
   for (const TensorShapeProto& p : input_tensors_as_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromShapeProto(p, &shape));
@@ -50,6 +51,7 @@ InferenceContext::InferenceContext(
   }
   PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
   if (!construction_status_.ok()) return;
+  inputs_.reserve(input_shapes.size());
   for (const TensorShapeProto& p : input_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromShapeProto(p, &shape));
@@ -93,6 +95,7 @@ InferenceContext::InferenceContext(
     : graph_def_version_(graph_def_version),
       node_def_(CHECK_NOTNULL(node_def)) {
   std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  input_tensors_as_shape_handles.reserve(input_tensors_as_shapes.size());
   for (const PartialTensorShape& p : input_tensors_as_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
@@ -103,6 +106,7 @@ InferenceContext::InferenceContext(
   }
   PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
   if (!construction_status_.ok()) return;
+  inputs_.reserve(input_shapes.size());
   for (const PartialTensorShape& p : input_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
@@ -153,8 +157,10 @@ InferenceContext::~InferenceContext() {}
 
 Status InferenceContext::Run(
     const std::function<Status(shape_inference::InferenceContext* c)>& fn) {
+  ForgetMerges();
   Status s = fn(this);
   if (!s.ok()) {
+    ForgetMerges();
     return AttachContext(s);
   }
 #ifndef NDEBUG
@@ -229,9 +235,7 @@ void InferenceContext::PreInputInit(
   for (const auto& e : output_name_map_) {
     num_outputs = std::max(num_outputs, e.second.second);
   }
-  for (int i = 0; i < num_outputs; ++i) {
-    outputs_.push_back(nullptr);
-  }
+  outputs_.assign(num_outputs, nullptr);
   output_handle_shapes_and_types_.resize(num_outputs);
 }
 
@@ -298,13 +302,23 @@ bool InferenceContext::FullyDefined(ShapeHandle s) {
 DimensionHandle InferenceContext::NumElements(ShapeHandle s) {
   const auto rank = Rank(s);
   if (rank == kUnknownRank) return UnknownDim();
+  bool found_unknown = false;
   int64 size = 1;
   for (int i = 0; i < rank; ++i) {
     int64 dim_val = Value(Dim(s, i));
-    if (dim_val == kUnknownDim) return UnknownDim();
-    size *= dim_val;
+    if (dim_val == kUnknownDim) {
+      found_unknown = true;
+    } else if (dim_val == 0) {
+      return MakeDim(0);
+    } else {
+      size *= dim_val;
+    }
+  }
+  if (found_unknown) {
+    return UnknownDim();
+  } else {
+    return MakeDim(size);
   }
-  return MakeDim(size);
 }
 
 string InferenceContext::DebugString(ShapeHandle s) {
@@ -459,13 +473,15 @@ Status InferenceContext::MergePrefix(ShapeHandle s, ShapeHandle prefix,
   TF_RETURN_IF_ERROR(WithRankAtLeast(s, rank, &s));
 
   // Merge the prefix dims and create the new output shapes.
+  const int32 rank_s = Rank(s);
   std::vector<DimensionHandle> dims;
+  dims.reserve(std::max(rank, rank_s));
   dims.resize(rank);
   for (int i = 0; i < rank; ++i) {
     TF_RETURN_IF_ERROR(Merge(Dim(s, i), Dim(prefix, i), &dims[i]));
   }
   *prefix_out = MakeShape(dims);
-  for (int i = rank; i < Rank(s); ++i) dims.push_back(Dim(s, i));
+  for (int i = rank; i < rank_s; ++i) dims.push_back(Dim(s, i));
   *s_out = MakeShape(dims);
   return Status::OK();
 }
@@ -716,6 +732,24 @@ ShapeHandle InferenceContext::Matrix(DimensionOrConstant dim1,
   return MakeShape({dim1, dim2});
 }
 
+Status InferenceContext::MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+    int input_idx, ShapeHandle* out) {
+  ShapeHandle input_shape;
+  TF_RETURN_IF_ERROR(WithRankAtMost(input(input_idx), 1, &input_shape));
+
+  requested_input_tensor_as_partial_shape_[input_idx] = true;
+  if (input_idx < input_tensors_as_shapes_.size() &&
+      input_tensors_as_shapes_[input_idx].IsSet() &&
+      RankKnown(input_tensors_as_shapes_[input_idx])) {
+    *out = input_tensors_as_shapes_[input_idx];
+    return Status::OK();
+  }
+
+  return InternalMakeShapeFromTensor(
+      true /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
+}
+
 Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
                                                   ShapeHandle* out) {
   ShapeHandle input_shape;
@@ -729,13 +763,31 @@ Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
     return Status::OK();
   }
 
-  return MakeShapeFromTensor(input_tensor(input_idx), input_shape, out);
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
 }
 
 Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
                                              ShapeHandle tensor_shape,
                                              ShapeHandle* out) {
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */, t, tensor_shape,
+      out);
+}
+
+Status InferenceContext::InternalMakeShapeFromTensor(
+    bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+    ShapeHandle tensor_shape, ShapeHandle* out) {
+  // Only callers who have set
+  if (!treat_unknown_scalar_tensor_as_unknown_shape) {
+    TF_RETURN_IF_ERROR(WithRank(tensor_shape, 1, &tensor_shape));
+  }
   if (t == nullptr) {
+    // This is guarded by the check above.
+    if (Rank(tensor_shape) == 0) {
+      return ReturnUnknownShape(out);
+    }
     // Shape tensor is not known, but if the shape of the shape tensor is then
     // the right number of unknown dims can be created.
     DimensionHandle shape_dim = Dim(tensor_shape, 0);
@@ -749,10 +801,46 @@ Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
     return ReturnCreatedShape(dims, out);
   }
 
+  if (t->shape().dims() == 0) {
+    if (t->dtype() == DataType::DT_INT32) {
+      auto flat_t = t->scalar<int32>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else if (t->dtype() == DataType::DT_INT64) {
+      auto flat_t = t->scalar<int64>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else {
+      *out = nullptr;
+      return errors::InvalidArgument(
+          "Input tensor must be int32 or int64, but was ",
+          DataTypeString(t->dtype()));
+    }
+  }
+
   if (t->shape().dims() != 1) {
     *out = nullptr;
-    return errors::InvalidArgument("Input tensor must be rank 1, but was rank ",
-                                   t->shape().dims());
+    return errors::InvalidArgument(
+        "Input tensor must be rank 1, but was rank ", t->shape().dims(), ".",
+        ((t->shape().dims() == 0)
+             ? "If it is rank 0 rank 0 it must have statically known value -1 "
+               "(representing an unknown shape). "
+             : " "),
+        "Saw tensor shape ", t->shape().DebugString());
   }
   std::vector<DimensionHandle> dims;
   if (t->dtype() == DataType::DT_INT32) {
@@ -1023,6 +1111,7 @@ Status InferenceContext::Max(DimensionHandle first, DimensionOrConstant second,
 
 Status InferenceContext::AttachContext(const Status& status) {
   std::vector<string> input_shapes;
+  input_shapes.reserve(inputs_.size());
   for (const ShapeHandle& input_shape : inputs_) {
     input_shapes.emplace_back(DebugString(input_shape));
   }
@@ -1030,6 +1119,7 @@ Status InferenceContext::AttachContext(const Status& status) {
   // Add information about the input tensors and partial tensor shapes used.
   std::vector<string> input_from_tensors_str;
   std::vector<string> input_from_tensors_as_shape_str;
+  input_from_tensors_as_shape_str.reserve(inputs_.size());
   for (int i = 0; i < inputs_.size(); ++i) {
     if (requested_input_tensor_as_partial_shape_[i] &&
         i < input_tensors_as_shapes_.size() &&
@@ -1151,9 +1241,7 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
   if (!refined) {
     return false;
   }
-  for (int i = 0; i < new_values.size(); ++i) {
-    (*to_update)[i] = new_values[i];
-  }
+  to_update->swap(new_values);
   return true;
 }
 
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index e3cc848a169bd848b8f3617d552938ba1ced3663..9431a62abefd1a98aa95df2b58f343dfad2c7bee 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -285,6 +285,8 @@ class InferenceContext {
     return true;
   }
 
+  void SetInput(int idx, ShapeHandle shape) { inputs_[idx] = shape; }
+
   ShapeHandle input(int64 idx) const { return inputs_[idx]; }
   Status input(StringPiece input_name, std::vector<ShapeHandle>* output) const;
   int num_inputs() const { return inputs_.size(); }
@@ -317,6 +319,11 @@ class InferenceContext {
     input_tensors_as_shapes_ = input_tensors_as_shapes;
   }
 
+  const std::vector<ShapeHandle>& input_tensors_as_shapes() const {
+    return input_tensors_as_shapes_;
+  }
+
+  ShapeHandle output(int64 idx) const { return outputs_[idx]; }
   void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; }
   Status set_output(StringPiece output_name,
                     const std::vector<ShapeHandle>& shapes);
@@ -462,6 +469,12 @@ class InferenceContext {
   // the input tensor is NULL, then an unknown shape is returned.
   Status MakeShapeFromShapeTensor(int input_idx, ShapeHandle* out);
 
+  // Like the function above, but treats scalar values as unknown
+  // shapes.  **NOTE** If the scalar is statically known, its value
+  // must be -1 or an error is returned.
+  Status MakeShapeFromShapeTensorTreatScalarAsUnknownShape(int input_idx,
+                                                           ShapeHandle* out);
+
   // Returns in <out> a new shape corresponding to <proto>.
   Status MakeShapeFromShapeProto(const TensorShapeProto& proto,
                                  ShapeHandle* out);
@@ -580,6 +593,12 @@ class InferenceContext {
       int idx,
       const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
 
+  void set_input_handle_shapes_and_types(
+      int idx, const std::vector<ShapeAndType>& shapes_and_types) {
+    input_handle_shapes_and_types_[idx].reset(
+        new std::vector<ShapeAndType>(shapes_and_types));
+  }
+
   // Returns the output handle shapes and types, for the resource tensor output
   // at index <idx>. Returns NULL if the shape and types were never set.
   const std::vector<ShapeAndType>* output_handle_shapes_and_types(int idx) {
@@ -707,6 +726,11 @@ class InferenceContext {
     merged_dims_.clear();
   }
 
+  // Helper method for MakeShapeFromTensor and MakeShapeFromShapeTensor.
+  Status InternalMakeShapeFromTensor(
+      bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+      ShapeHandle tensor_shape, ShapeHandle* out);
+
   ShapeManager shape_manager_;
 
   // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index f48a7b9c47df3cfa93434ccf585dda8c5a29a2ba..586c38e43bbe75fa0710b11bb7290ee7b3f627d9 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -152,10 +153,9 @@ TEST_F(ShapeInferenceTest, Run) {
     };
     Status s = c.Run(fn);
     // Extra error message is attached when Run fails.
-    EXPECT_TRUE(StringPiece(s.ToString())
-                    .contains("Shape must be at most rank 0 but "
-                              "is rank 1 for 'foo' (op: "
-                              "'foo_op')"))
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Shape must be at most rank 0 but is rank 1 for 'foo' (op: 'foo_op')"))
         << s;
   }
 }
@@ -367,10 +367,9 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtMost(in1, 2, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at most rank 2 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtMost(in1, 2, &s1).ToString(),
+      "Invalid argument: Shape must be at most rank 2 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtMost(in1, 3, &s1).ok());
@@ -406,10 +405,9 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtLeast(in1, 4, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 4 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtLeast(in1, 4, &s1).ToString(),
+      "Invalid argument: Shape must be at least rank 4 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtLeast(in1, 3, &s1).ok());
@@ -449,12 +447,14 @@ TEST_F(ShapeInferenceTest, WithValue) {
   // WithValue on dimension with known size.
   out1 = d0;
 
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 0, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 0 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 0, &out1).ToString(),
+                            "Invalid argument: Dimension must be 0 but is 1"));
   EXPECT_FALSE(IsSet(out1));
   out1 = d0;
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 2, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 2 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 2, &out1).ToString(),
+                            "Invalid argument: Dimension must be 2 but is 1"));
 
   EXPECT_FALSE(IsSet(out1));
   EXPECT_TRUE(c.WithValue(d0, 1, &out1).ok());
@@ -513,16 +513,14 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_EQ(3, merged_dims.size());
 
   // Merging unequal values is an error.
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d2, d1, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 2 and 1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d2, d1, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 2 and 1"));
 
   EXPECT_FALSE(IsSet(out));
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d1, d2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d1, d2, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -729,26 +727,23 @@ TEST_F(ShapeInferenceTest, MergeShape) {
 
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_u_2, s_1_3, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 2 and 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_u_2, s_1_3, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 2 and 3"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1_3, s_u_2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 3 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1_3, s_u_2, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 3 and 2"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1, s_1_2, &out).ToString())
-          .contains(
-              "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1, s_1_2, &out).ToString(),
+      "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -795,22 +790,18 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   // Incompatible merges give errors and set outs to nullptr.
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 3 but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Shape must be at least rank 3 but is rank 2"));
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 }
@@ -868,24 +859,21 @@ TEST_F(ShapeInferenceTest, Subshape) {
 
   // Errors.
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 6, -3, &out).ToString())
-                  .contains("Invalid argument: Subshape must have computed "
-                            "start <= end, but is 5 "
-                            "and 2 (computed from start 6 and end -3 over "
-                            "shape with rank 5)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subshape(in0, 6, -3, &out).ToString(),
+      "Invalid argument: Subshape must have computed start <= end, but is 5 "
+      "and 2 (computed from start 6 and end -3 over shape with rank 5)"));
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, -50, 100, &out).ToString())
-                  .contains("Invalid argument: Subshape start out of "
-                            "bounds: -50, for shape with "
-                            "rank 5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, -50, 100, &out).ToString(),
+                                    "Invalid argument: Subshape start out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 0, -50, &out).ToString())
-                  .contains("Invalid argument: Subshape end out of bounds: "
-                            "-50, for shape with rank "
-                            "5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, 0, -50, &out).ToString(),
+                                    "Invalid argument: Subshape end out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1093,28 +1081,36 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) {
   t = ::tensorflow::test::AsTensor<int64>({});
   EXPECT_EQ("[]", create(&t));
 
+  // Test negative scalar
+  t = ::tensorflow::test::AsScalar<int32>(-1);
+  EXPECT_EQ("?", create(&t));
+
   t = ::tensorflow::test::AsTensor<float>({1, 2, 3});
-  EXPECT_TRUE(
-      StringPiece(create(&t))
-          .contains("Input tensor must be int32 or int64, but was float"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be int32 or int64, but was float"));
 
   t = ::tensorflow::test::AsScalar<int32>(1);
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 0"));
+  auto s_scalar = create(&t);
+  EXPECT_TRUE(str_util::StrContains(
+      s_scalar,
+      "Input tensor must be rank 1, or if its rank 0 it must have value -1"))
+      << s_scalar;
 
   t = ::tensorflow::test::AsTensor<int32>({1, 2}, TensorShape{2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 2"));
+  auto s_matrix = create(&t);
+  EXPECT_TRUE(str_util::StrContains(
+      s_matrix, "Input tensor must be rank 1, but was rank 2"))
+      << s_matrix;
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int64>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int32>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test when the input shape is wrong.
   {
@@ -1172,9 +1168,9 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
   EXPECT_TRUE(c.MakeShapeFromShapeProto(proto, &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
   proto.add_dim()->set_size(0);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("An unknown shape must not have any dimensions set."));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "An unknown shape must not have any dimensions set."));
   EXPECT_FALSE(IsSet(out));
 
   // With known rank.
@@ -1188,10 +1184,10 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
 
   // With invalid dimension value.
   proto.add_dim()->set_size(-2);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("Shape [0,?,1000,-2] has dimensions with values below -1 "
-                    "(where -1 means unknown)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "Shape [0,?,1000,-2] has dimensions with values below -1 "
+      "(where -1 means unknown)"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1257,9 +1253,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 
   // Same tests, with int64 values.
   t1 = tensorflow::test::AsScalar<int64>(20);
@@ -1267,9 +1264,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, GetAttr) {
@@ -1322,33 +1320,33 @@ TEST_F(ShapeInferenceTest, Divide) {
   EXPECT_TRUE(c.Divide(d_6, d_2, evenly_divisible, &out).ok());
   EXPECT_EQ("3", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 5, evenly_divisible, &out).error_message())
-          .contains("Dimension size must be evenly divisible by 5 but is 6"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 5, evenly_divisible, &out).error_message(),
+      "Dimension size must be evenly divisible by 5 but is 6"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, d_0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, d_0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 
   // Repeat error cases above with evenly_divisible=false.
   evenly_divisible = false;
   EXPECT_TRUE(c.Divide(d_6, 5, evenly_divisible, &out).ok());
   EXPECT_EQ("1", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, Add) {
@@ -1396,11 +1394,9 @@ TEST_F(ShapeInferenceTest, Add) {
   EXPECT_TRUE(c.Add(d_0, d_6, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out)
-                      .error_message())
-          .contains(
-              "Dimension size overflow from adding 6 and 9223372036854775802"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out).error_message(),
+      "Dimension size overflow from adding 6 and 9223372036854775802"));
 }
 
 TEST_F(ShapeInferenceTest, Subtract) {
@@ -1448,9 +1444,9 @@ TEST_F(ShapeInferenceTest, Subtract) {
   EXPECT_TRUE(c.Subtract(d_6, d_0, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Subtract(d_5, d_6, &out).error_message())
-          .contains("Negative dimension size caused by subtracting 6 from 5"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subtract(d_5, d_6, &out).error_message(),
+      "Negative dimension size caused by subtracting 6 from 5"));
 }
 
 TEST_F(ShapeInferenceTest, Multiply) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc
index b4765ab0b2c41a1b510364d755984b6ae68dd07a..b54dd220ab919a640c9cd58e112459999762e4d1 100644
--- a/tensorflow/core/framework/shape_inference_testutil.cc
+++ b/tensorflow/core/framework/shape_inference_testutil.cc
@@ -100,7 +100,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
       }
     }
 
-    if (expected.starts_with("in")) {
+    if (str_util::StartsWith(expected, "in")) {
       if (in_index == -1) {
         return Unknown(err_prefix,
                        " should have matched an input shape by "
@@ -135,7 +135,9 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
     }
 
     // Verify the dimensions.
-    CHECK(expected.starts_with("[") && expected.ends_with("]")) << expected;
+    CHECK(str_util::StartsWith(expected, "[") &&
+          str_util::EndsWith(expected, "]"))
+        << expected;
     expected.remove_prefix(1);
     expected.remove_suffix(1);
 
@@ -176,7 +178,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
           return Unknown(err_prefix, " expected to be unknown but was ",
                          c.Value(out_dim), err_suffix);
         }
-      } else if (expected_dim.starts_with("d")) {
+      } else if (str_util::StartsWith(expected_dim, "d")) {
         // Compare the dimension values.
         auto v = str_util::Split(expected_dim, '|');
         if (in_dim_idx.first == -1) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index 7977841482efa396c8e0797d8c80a40c11b4df56..2a99af7659d9be0dbab505fc7147e7fcc15d67c9 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/version.h"
 
@@ -83,17 +84,17 @@ class ShapeInferenceTestutil {
       "", ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
               op, i, o)                                                       \
               .error_message())
-#define INFER_ERROR(error_substring, op, i)                                 \
-  {                                                                         \
-    string error_message =                                                  \
-        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
-            op, i, "e")                                                     \
-            .error_message();                                               \
-    const string& substring = error_substring;                              \
-    EXPECT_NE("", error_message);                                           \
-    EXPECT_TRUE(StringPiece(error_message).contains(substring))             \
-        << "Expected to see '" << substring << "' in '" << error_message    \
-        << "'";                                                             \
+#define INFER_ERROR(error_substring, op, i)                                    \
+  {                                                                            \
+    string error_message =                                                     \
+        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes(    \
+            op, i, "e")                                                        \
+            .error_message();                                                  \
+    const string& substring = error_substring;                                 \
+    EXPECT_NE("", error_message);                                              \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(error_message, substring)) \
+        << "Expected to see '" << substring << "' in '" << error_message       \
+        << "'";                                                                \
   }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/shape_inference_testutil_test.cc b/tensorflow/core/framework/shape_inference_testutil_test.cc
index 20a6807064bea96f41cbd6035327d7a6db2f73b8..a4405b502cb68444fd43ad21af5922e3bd42ec42 100644
--- a/tensorflow/core/framework/shape_inference_testutil_test.cc
+++ b/tensorflow/core/framework/shape_inference_testutil_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -25,10 +26,11 @@ namespace shape_inference {
 
 namespace {
 
-#define EXPECT_CONTAINS(str, substr)                                 \
-  do {                                                               \
-    string s = (str);                                                \
-    EXPECT_TRUE(StringPiece(s).contains(substr)) << "String: " << s; \
+#define EXPECT_CONTAINS(str, substr)                            \
+  do {                                                          \
+    string s = (str);                                           \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(s, substr)) \
+        << "String: " << s;                                     \
   } while (false)
 
 static OpShapeInferenceFn* global_fn_ptr = nullptr;
@@ -97,8 +99,8 @@ TEST(ShapeInferenceTestutilTest, Failures) {
   auto error_message = ShapeInferenceTestutil::InferShapes(
                            ShapeInferenceTestOp("NoSuchOp"), "", "")
                            .error_message();
-  EXPECT_TRUE(StringPiece(error_message)
-                  .starts_with("Op type not registered 'NoSuchOp'"));
+  EXPECT_TRUE(
+      str_util::StartsWith(error_message, "Op type not registered 'NoSuchOp'"));
 
   // Wrong shape error messages.
   EXPECT_CONTAINS(RunInferShapes(op, "[1];[2];[1]", "?", fn_copy_input_0),
diff --git a/tensorflow/core/kernels/data/stats_aggregator.h b/tensorflow/core/framework/stats_aggregator.h
similarity index 90%
rename from tensorflow/core/kernels/data/stats_aggregator.h
rename to tensorflow/core/framework/stats_aggregator.h
index 076a56b0bf100161fe2cf4384e6be0809eb251fe..8002d9291c2e0f68b029e570d09f2de41266a8d5 100644
--- a/tensorflow/core/kernels/data/stats_aggregator.h
+++ b/tensorflow/core/framework/stats_aggregator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
 
 #include <memory>
 #include <string>
@@ -47,6 +47,10 @@ class StatsAggregator {
   virtual void AddToHistogram(const string& name,
                               gtl::ArraySlice<double> values) = 0;
 
+  // TODO(shivaniagarawal): consistency in double and float usage.
+  // Add the given `value` as Scalar with the given `name`.
+  virtual void AddScalar(const string& name, float value) = 0;
+
   // Stores a protocol buffer representation of the aggregator state in the
   // given `out_summary`.
   // TODO(mrry): Consider separating this method from the `StatsAggregator`
@@ -81,4 +85,4 @@ class StatsAggregatorResource : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 5d32b71628263fe89d6f54fd07b2fe18bbb55e53..d5a45c73c37bf0807e9437a4c886ca0d96dc5c67 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -610,11 +610,15 @@ bool Tensor::IsInitialized() const {
 }
 
 void Tensor::CheckType(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
 }
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
   CHECK(IsAligned()) << "CheckTypeAndIsAligned";
 }
 
@@ -884,6 +888,20 @@ bool Tensor::CanUseDMA() const {
 #undef CASE
 
 namespace {
+
+// StrCat and StrAppend don't support Eigen::half directly at the moment, and
+// we would like to keep them compatible with their absl counterparts, for ease
+// of migration. We could rely on errors::internal::PrepareForStrCat() but the
+// logic is so simple we can just replicate it here, where it is close to its
+// usage and easy to change later. And there's the extra benefit of not
+// accessing an 'internal' namespace.
+inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a) {
+  return a;
+}
+inline float PrintOneElement(const Eigen::half& h) {
+  return static_cast<float>(h);
+}
+
 // Print from left dim to right dim recursively.
 template <typename T>
 void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
@@ -896,7 +914,7 @@ void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
     for (int64 i = 0; i < element_count; i++) {
       if (*data_index >= limit) return;
       if (i > 0) strings::StrAppend(result, " ");
-      strings::StrAppend(result, data[(*data_index)++]);
+      strings::StrAppend(result, PrintOneElement(data[(*data_index)++]));
     }
     return;
   }
@@ -927,7 +945,7 @@ string SummarizeArray(int64 limit, int64 num_elts,
   if (shape.empty()) {
     for (int64 i = 0; i < limit; ++i) {
       if (i > 0) strings::StrAppend(&ret, " ");
-      strings::StrAppend(&ret, array[i]);
+      strings::StrAppend(&ret, PrintOneElement(array[i]));
     }
     if (num_elts > limit) strings::StrAppend(&ret, "...");
     return ret;
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 62c42ba652356a5128d4a337e34a3b449781b445..58fbced606c9888241a038bba9613d9dc51f6021 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -44,6 +44,7 @@ class TensorProto;
 class VariantTensorData;
 namespace batch_util {
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
 }  // namespace batch_util
 
 /// @ingroup core
@@ -482,6 +483,9 @@ class Tensor {
   friend class AutoReloadVariableOp;  // For access to set_shape
   friend class TensorTestHelper;      // For access to set_shape
   friend class OpKernelContext;       // For access to RefCountIsOne().
+  friend class ScopedAllocator;       // For access to buf_.
+  friend class XlaTensorBuffer;  // For access to the private constructor taking
+                                 // the buffer
   template <typename Device, typename T>
   friend class AssignVariableOp;  // For access to RefCountIsOne().
   template <typename Device, typename T>
@@ -490,6 +494,10 @@ class Tensor {
   friend Status batch_util::CopyElementToSlice(
       Tensor element, Tensor* parent,
       int64 index);                // For access to RefCountIsOne().
+  friend Status batch_util::MaybeMoveSliceToElement(
+      Tensor* parent, Tensor* element,
+      int64 index);  // For access to RefCountIsOne().
+
   friend class NumpyTensorBuffer;  // For access to the private constructor
                                    // taking the buffer.
 
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index fe2ba375aa0c5c50009b3155338cd8860070d47a..be7e740c335ced3ec6826e804090927962d57285 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
@@ -271,6 +271,12 @@ class TensorShapeBase : public TensorShapeRep {
   friend Status MakeShapeHelper(const T*, int64, S*);
 };
 
+/// Outputs `TensorShapeBase` to `std::ostream`.
+template <typename Shape>
+std::ostream& operator<<(std::ostream& os, const TensorShapeBase<Shape>& tsb) {
+  return os << tsb.DebugString();
+}
+
 /// Represents the shape of a Tensor.
 ///
 /// A tensor's shape is denoted by its number of dimensions and a size for each
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index d7517bb311d517351f4dd2a59438780482485dff..6329aa6d8edf3795ed8018b7802661749683fe41 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -198,6 +198,13 @@ TEST(TensorShapeTest, DataType) {
   EXPECT_EQ(TensorShapeTestHelper::data_type(&s2), DT_INVALID);
 }
 
+TEST(TensorShapeTest, ostream) {
+  TensorShape s({10, 5, 4});
+  std::stringstream ss;
+  ss << s;
+  EXPECT_EQ(ss.str(), "[10,5,4]");
+}
+
 // -----------------------------------------------------------------------
 // An old implementation of TensorShape using a different representation,
 // preserved here in the unittest to allow us to have a randomized unittest
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index adf4e1bae307d81d91e7e597fc882caf4c87601f..2280114de5110630a0b64742e1f050e589d00bd0 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -114,7 +114,7 @@ string DataTypeString(DataType dtype) {
 }
 
 bool DataTypeFromString(StringPiece sp, DataType* dt) {
-  if (sp.ends_with("_ref")) {
+  if (str_util::EndsWith(sp, "_ref")) {
     sp.remove_suffix(4);
     DataType non_ref;
     if (DataTypeFromString(sp, &non_ref) && !IsRefType(non_ref)) {
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 60f2b4135a68c4eed618e3efb07758fbab85fa07..16b069c70a7640b4859680a630920990dea087ce 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 
 #include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -140,9 +141,8 @@ TEST(TypesTest, ComplexTypes) {
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
-    const StringPiece n = name;
-    EXPECT_EQ(DataTypeIsInteger(dt),
-              n.starts_with("int") || n.starts_with("uint"))
+    EXPECT_EQ(DataTypeIsInteger(dt), str_util::StartsWith(name, "int") ||
+                                         str_util::StartsWith(name, "uint"))
         << "DataTypeInteger failed for " << name;
   }
 }
diff --git a/tensorflow/core/framework/variant_op_copy_test.cc b/tensorflow/core/framework/variant_op_copy_test.cc
index 85e014f80434d2a2de2851d2cb361f4b0a0c9433..60fa7bd55937b81555d18dab455640326d98a73d 100644
--- a/tensorflow/core/framework/variant_op_copy_test.cc
+++ b/tensorflow/core/framework/variant_op_copy_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/port.h"
 
@@ -259,8 +260,8 @@ TEST(VariantOpCopyTest, CreateConstOnGPUFailsGracefully) {
   ClientSession session(root);
   std::vector<Tensor> outputs;
   Status s = session.Run({create_const}, &outputs);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("GPU copy from non-DMA string tensor"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "GPU copy from non-DMA string tensor"))
       << s.ToString();
 }
 
@@ -365,8 +366,9 @@ TEST(VariantOpCopyTest, CreateCopyCPUToGPUStringFailsSafely) {
   std::vector<Tensor> outputs;
   Status err = session.Run({create_op, identity}, &outputs);
   EXPECT_EQ(err.code(), errors::Code::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(err.error_message())
-                  .contains("During Variant Host->Device Copy: non-DMA-copy "
+  EXPECT_TRUE(
+      str_util::StrContains(err.error_message(),
+                            "During Variant Host->Device Copy: non-DMA-copy "
                             "attempted of tensor type: string"))
       << err.error_message();
 }
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index 06ca211c762748b1dacd4eb9623ffd2d72762cca..7055e62c0e745f61e072914ff2af4d4ff582963a 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include "tensorflow/core/lib/strings/str_util.h"
 
 #define EIGEN_USE_THREADS
 
@@ -130,7 +131,7 @@ TEST(VariantOpShapeRegistryTest, TestBasic) {
   Variant v = vv_early_exit;
   Status s0 = (*shape_fn)(v, &shape);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit!"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
 
   VariantValue vv_ok{false /* early_exit */};
   v = vv_ok;
@@ -229,7 +230,7 @@ TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -254,7 +255,7 @@ TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -299,7 +300,7 @@ TEST(VariantOpAddRegistryTest, TestBasicCPU) {
   Status s0 = BinaryOpVariants<CPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
@@ -325,7 +326,7 @@ TEST(VariantOpAddRegistryTest, TestBasicGPU) {
   Status s0 = BinaryOpVariants<GPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index db6683d1e74512e37a40773b7642cf33eb888782..30ff19cd7eae794e0e9875ca0825b647b44b02af 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -24,23 +24,24 @@ limitations under the License.
 
 namespace tensorflow {
 
-Status BuildControlFlowInfo(Graph* g, std::vector<ControlFlowInfo>* info) {
+Status BuildControlFlowInfo(const Graph* g,
+                            std::vector<ControlFlowInfo>* info) {
   info->clear();
   info->resize(g->num_node_ids());
 
   std::vector<const Node*> parent_nodes;
   parent_nodes.resize(g->num_node_ids());
 
-  Node* src_node = g->source_node();
+  const Node* src_node = g->source_node();
   ControlFlowInfo& src_info = (*info)[src_node->id()];
   src_info.frame = src_node;
   src_info.parent_frame = src_node;
 
   string frame_name;
-  std::deque<Node*> ready;
+  std::deque<const Node*> ready;
   ready.push_back(src_node);
   while (!ready.empty()) {
-    Node* curr_node = ready.front();
+    const Node* curr_node = ready.front();
     ready.pop_front();
     const ControlFlowInfo& curr_info = (*info)[curr_node->id()];
     const Node* frame = curr_info.frame;
@@ -56,7 +57,7 @@ Status BuildControlFlowInfo(Graph* g, std::vector<ControlFlowInfo>* info) {
     }
 
     for (const Edge* out_edge : curr_node->out_edges()) {
-      Node* out = out_edge->dst();
+      const Node* out = out_edge->dst();
       int out_id = out->id();
       ControlFlowInfo* out_info = &(*info)[out_id];
       const Node* out_parent = out_info->parent_frame;
diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h
index 372044f538f9428e1979ba80bbb18a9742fc014e..79e2be0d4b9db6dd70d339ee07faf25c85376386 100644
--- a/tensorflow/core/graph/control_flow.h
+++ b/tensorflow/core/graph/control_flow.h
@@ -30,14 +30,14 @@ struct ControlFlowInfo {
   string frame_name;                   // frame name of a node
 };
 
-// Assign to each node the name of the frame and the level it belongs to.
-// We check the well-formedness of the graph: All inputs to a node must
-// come from the same frame and have the same "static" iteration level.
-// `info` is cleared and populated by this function.
-// NOTE(yuanbyu): For now, we require all sends/recvs have iteration level
-// 0. This essentially means there can't be multiple serial Nexts in
-// an iteration, which all sane front-ends should satisfy.
-Status BuildControlFlowInfo(Graph* g, std::vector<ControlFlowInfo>* info);
+// Clear and populate `info` with each node's frame and the level it belongs to.
+// We check the well-formedness of the graph: All inputs to a node must come
+// from the same frame and have the same "static" iteration level.
+//
+// NOTE(yuanbyu): For now, we require all sends/recvs have iteration level 0.
+// This essentially means there can't be multiple serial Nexts in an iteration,
+// which all sane front-ends should satisfy.
+Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index 4f3a6ec38cb88213c7127df41823bc16e9834d09..1df45d9b893fdb2807c5e6ab63dd4a8577d7feb6 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -427,7 +427,7 @@ static void AssignSizes(const Graph& g, CostModel* cost_model) {
     if (e->IsControlEdge()) {
       continue;
     }
-    Node* src = e->src();
+    const Node* src = e->src();
 
     // TODO(josh11b): Get an estimate from the Op
     Bytes size(1);
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 9b56216f1f97a9598dd7ae8b70786e32bb7e0f4b..fb8a6c39e6786c9dbf3f14c68b8af66b01a20f29 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -339,7 +339,7 @@ Node* Graph::AddNode(const NodeDef& node_def, Status* status) {
   return node;
 }
 
-Node* Graph::CopyNode(Node* node) {
+Node* Graph::CopyNode(const Node* node) {
   DCHECK(!node->IsSource());
   DCHECK(!node->IsSink());
   Node* copy = AllocateNode(node->props_, node);
@@ -567,6 +567,11 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
         inputs[edge->dst_input()] = edge;
       }
     }
+    // Sort the control inputs for more predictable serialization.
+    std::sort(inputs.begin() + node->num_inputs(), inputs.end(),
+              [](const Edge* a, const Edge* b) -> bool {
+                return a->src()->name() < b->src()->name();
+              });
     node_def->clear_input();
     node_def->mutable_input()->Reserve(inputs.size());
 
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 9d96cd4654bbf1fd65c5135d6a8bdc271c6e9443..f7ca7d0620f4d483d31eaad66dfb8ef10dcab027 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -124,7 +124,8 @@ class Node {
   // Inputs requested by the NodeDef.  For the actual inputs, use in_edges.
   const protobuf::RepeatedPtrField<string>& requested_inputs() const;
 
-  // Get the neighboring nodes via edges either in or out of this node.
+  // Get the neighboring nodes via edges either in or out of this node.  This
+  // includes control edges.
   gtl::iterator_range<NeighborIter> in_nodes() const;
   gtl::iterator_range<NeighborIter> out_nodes() const;
   const EdgeSet& in_edges() const { return in_edges_; }
@@ -422,7 +423,7 @@ class Graph {
   // Copies *node, which may belong to another graph, to a new node,
   // which is returned.  Does not copy any edges.  *this owns the
   // returned instance.
-  Node* CopyNode(Node* node);
+  Node* CopyNode(const Node* node);
 
   // Removes a node from this graph, including all edges from or to it.
   // *node should not be accessed after calling this function.
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 0629ff32d00cf7fad00c39f07810aa4a9d57f14f..c678283fce1768888cb7730aac8f306336c78c4b 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/version.h"
 
@@ -73,7 +74,7 @@ class GraphConstructor {
     Options(const ImportGraphDefOptions& in)  // NOLINT(runtime/explicit)
         : allow_internal_ops(false),
           expect_device_spec(false),
-          prefix(in.prefix.empty() || StringPiece(in.prefix).ends_with("/")
+          prefix(in.prefix.empty() || str_util::EndsWith(in.prefix, "/")
                      ? in.prefix
                      : in.prefix + "/"),
           uniquify_names(in.uniquify_names),
@@ -436,7 +437,7 @@ Status GraphConstructor::BuildNodeIndex() {
     bool in_control_dependence = false;
     for (int i = 0; i < node_def.input_size(); ++i) {
       StringPiece input_name = node_def.input(i);
-      if (!input_name.empty() && input_name.starts_with("^")) {
+      if (!input_name.empty() && str_util::StartsWith(input_name, "^")) {
         in_control_dependence = true;
       } else if (in_control_dependence) {
         return errors::InvalidArgument(
@@ -484,7 +485,7 @@ Status GraphConstructor::InitFromEdges() {
       bool has_loop_back_edge = false;
       for (int i = 0; i < node_def.input_size(); ++i) {
         StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(input_name, "^")) {
           num_control_edges++;
         } else {
           TensorId id(ParseTensorName(input_name));
@@ -534,7 +535,7 @@ Status GraphConstructor::ValidateColocationConstraints(
   if (iter == node_def.attr().end()) return Status::OK();
   for (const string& c : iter->second.list().s()) {
     StringPiece s(c);
-    if (s.Consume(kColocationGroupPrefix) &&
+    if (str_util::ConsumePrefix(&s, kColocationGroupPrefix) &&
         gdef_nodes_.find(s) == gdef_nodes_.end()) {
       return errors::InvalidArgument(
           "Node '", node_def.name(),
@@ -568,13 +569,22 @@ Status GraphConstructor::ValidateShape(Node* node) {
   auto* ic = refiner_->GetContext(node);
   DCHECK(ic != nullptr)
       << "ShapeRefiner::AddNode() should have created the InferenceContext";
-  if (shape_attrs.size() != node->num_outputs()) {
+  if (shape_attrs.size() < node->num_outputs()) {
     return errors::InvalidArgument(
         "Node '", node->name(), "' has ", node->num_outputs(),
         " outputs but the ", kAttrName, " attribute specifies shapes for ",
         shape_attrs.size(), " outputs");
   }
-  for (int i = 0; i < shape_attrs.size(); ++i) {
+  // NOTE(skyewm): we don't raise an error here because some users depend on
+  // this behavior, even though it's unsafe.
+  // TODO(b/74619486): raise an error.
+  if (shape_attrs.size() > node->num_outputs()) {
+    LOG(WARNING) << "Node '" << node->name() << "' has " << node->num_outputs()
+                 << " outputs but the " << kAttrName
+                 << " attribute specifies shapes for " << shape_attrs.size()
+                 << " outputs. Output shapes may be inaccurate.";
+  }
+  for (int i = 0; i < node->num_outputs(); ++i) {
     const TensorShapeProto& p = shape_attrs[i];
     shape_inference::ShapeHandle h;
     Status s = ic->MakeShapeFromShapeProto(p, &h);
@@ -656,20 +666,17 @@ Status GraphConstructor::ModifyNodeDefForImport(NodeDef* node_def) {
 void RemoveInputs(const std::vector<int>& inputs_to_remove, NodeDef* node_def,
                   std::vector<bool>* input_already_exists) {
   // Remove 'inputs_to_remove' from 'node_def'
-  // TODO(skyewm): is there a better way to do this?
-  std::vector<string> inputs;
-  inputs.reserve(node_def->input_size());
-  for (int i = 0; i < node_def->input_size(); ++i) {
-    inputs.push_back(node_def->input(i));
-  }
-  node_def->clear_input();
-  for (int i = 0, j = 0; i < inputs.size(); ++i) {
+  NodeDef copy;
+  copy.mutable_input()->Reserve(node_def->input_size() -
+                                inputs_to_remove.size());
+  for (int i = 0, j = 0; i < node_def->input_size(); ++i) {
     if (j < inputs_to_remove.size() && i == inputs_to_remove[j]) {
       ++j;
     } else {
-      node_def->add_input(inputs[i]);
+      copy.add_input()->swap(*node_def->mutable_input(i));
     }
   }
+  node_def->mutable_input()->Swap(copy.mutable_input());
   // Remove 'inputs_to_remove' from 'input_already_exists'
   for (int idx : inputs_to_remove) {
     input_already_exists->erase(input_already_exists->begin() + idx);
@@ -735,9 +742,21 @@ void GraphConstructor::AddControlDependencies(
   // dependencies
   for (const string& control_dep : opts_.control_dependencies) {
     string input = TensorId(control_dep, Graph::kControlSlot).ToString();
-    const protobuf::RepeatedPtrField<string>& inputs = node_def->input();
-    if (std::find(inputs.begin(), inputs.end(), input) != inputs.end()) {
-      // Control dependency already exists
+    bool found = false;
+    for (int i = node_def->input_size() - 1; i >= 0; --i) {
+      const string& node_input = node_def->input(i);
+      if (node_input[0] != '^') {
+        // Control inputs are at the end. Break when we reach the non-control
+        // inputs.
+        break;
+      }
+      if (node_input == input) {
+        // Control dependency already exists
+        found = true;
+        break;
+      }
+    }
+    if (found) {
       continue;
     }
     node_def->add_input(input);
@@ -751,11 +770,11 @@ void GraphConstructor::AddPrefixToNodeDef(
   node_def->set_name(strings::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
-    StringPiece input(node_def->input(i));
     // Skip remapped inputs (which already exist in g_ and are not being
     // imported).
     if (input_already_exists[i]) continue;
-    if (input.Consume("^")) {
+    StringPiece input(node_def->input(i));
+    if (str_util::ConsumePrefix(&input, "^")) {
       node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
       node_def->set_input(i, strings::StrCat(prefix_, input));
@@ -767,7 +786,7 @@ void GraphConstructor::AddPrefixToNodeDef(
         node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
-      if (v.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&v, kColocationGroupPrefix)) {
         list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
@@ -810,7 +829,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     bool updated = false;
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
-      if (val.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
         const auto& name_pair = uniquified_names_.find(val.ToString());
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
@@ -923,10 +942,10 @@ Status GraphConstructor::Convert() {
         }
       }
 
-      // TODO(ashankar): The line below means an additional copy of the NodeDef,
-      // which can be expensive if the NodeDef contains large tensors in it.
-      // Might make sense to change the API for ImportGraphDef to take a mutable
-      // GraphDef* and avoid the copying.
+      // TODO(ashankar): The line below means an additional copy of the
+      // NodeDef, which can be expensive if the NodeDef contains large tensors
+      // in it. Might make sense to change the API for ImportGraphDef to take
+      // a mutable GraphDef* and avoid the copying.
       imported_node_def = original_node_def;
       if (!opts_.input_map.empty()) {
         // Note that input_already_exists can shrink here
@@ -970,7 +989,7 @@ Status GraphConstructor::Convert() {
             src_node->num_outputs(), " outputs");
       }
 
-      inputs.push_back(InputInfo(id.first.ToString(), src_node, src_index));
+      inputs.emplace_back(id.first.ToString(), src_node, src_index);
     }
 
     if (has_data_back_edge && !IsMerge(*node_def)) {
@@ -1000,8 +1019,7 @@ Status GraphConstructor::Convert() {
       if (inputs[i].node == nullptr) {
         // Record this back edge, which will be added after all nodes
         // are created.
-        back_edges_.push_back(
-            EdgeInfo(inputs[i].name, inputs[i].index, node, i));
+        back_edges_.emplace_back(inputs[i].name, inputs[i].index, node, i);
       } else if (inputs[i].index == Graph::kControlSlot) {
         g_->AddControlEdge(inputs[i].node, node);
       } else {
@@ -1009,12 +1027,7 @@ Status GraphConstructor::Convert() {
       }
     }
 
-    // Function shape inference is supported on an opt-in basis per
-    // ShapeRefiner.
-    if (refiner_->function_shape_inference_supported() ||
-        g_->flib_def().Find(node_def->name()) == nullptr) {
-      TF_RETURN_IF_ERROR(ValidateShape(node));
-    }
+    TF_RETURN_IF_ERROR(ValidateShape(node));
 
     // Update pending_count_ for outputs.
     UpdatePendingCountAndReady(outputs_, o, &pending_count_, &ready_);
@@ -1271,7 +1284,7 @@ void CopyGraph(const Graph& src, Graph* dest) {
   dest->set_versions(src.versions());
 
   // Copy the nodes
-  std::unordered_map<Node*, Node*>
+  std::unordered_map<const Node*, Node*>
       node_map;  // "Node in src" -> "Node in *dest"
   node_map[src.source_node()] = dest->source_node();
   node_map[src.sink_node()] = dest->sink_node();
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 963c1dc024b4265e14314c610399fc92331f053c..b513778de9c86e12c8c80b51fbb4e5801f3514e2 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -156,7 +156,9 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     StringPiece loc(value[0]);
-    return loc.Consume(kColocationGroupPrefix) ? loc.ToString() : "";
+    return str_util::ConsumePrefix(&loc, kColocationGroupPrefix)
+               ? loc.ToString()
+               : "";
   }
 
   string GraphDebugString() const {
@@ -3158,5 +3160,20 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ValidateColationConstraints) {
   TF_EXPECT_OK(ImportGraphDef(options, def, &graph_, nullptr));
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDef_UnknownOps) {
+  const string pb_ascii = "node { name: 'op_from_contrib' op: 'OpFromContrib'}";
+  // Try load twice to check for two parts of the error message. We cannot check
+  // for the whole thing in one go because the message includes the hostname.
+  ExpectError(pb_ascii, {"Op type not registered 'OpFromContrib'"});
+  ExpectError(
+      pb_ascii,
+      {"Make sure the Op and Kernel are registered in the "
+       "binary running in this process. Note that if you "
+       "are loading a saved graph which used ops from "
+       "tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done"
+       "before importing the graph, as contrib ops are lazily registered "
+       "when the module is first accessed."});
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index add80eda23d7887fb06902c0b123c03db8f4cccf..877e4f1b44e005b310667f48dcc0bfd0d0a7e1d5 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -123,8 +124,8 @@ bool NeedSameDeviceSendRecv(const Edge* edge, const GraphInfo& info) {
     return false;
   }
 
-  Node* src = edge->src();
-  Node* dst = edge->dst();
+  const Node* src = edge->src();
+  const Node* dst = edge->dst();
   if (src->assigned_device_name() == dst->assigned_device_name()) {
     int src_port = edge->src_output();
     int dst_port = edge->dst_input();
@@ -141,7 +142,7 @@ bool NeedSameDeviceSendRecv(const Edge* edge, const GraphInfo& info) {
 
 // Return true iff (dst, dst_input) is specified on host memory.
 bool IsDstInputOnHost(const Edge* edge, const GraphInfo& info) {
-  Node* dst = edge->dst();
+  const Node* dst = edge->dst();
   int dst_port = edge->dst_input();
   if (info.device_types[dst->id()] != DEVICE_CPU) {
     if (edge->IsControlEdge()) return false;
@@ -372,7 +373,7 @@ string ControlLoopName(const string& name) {
 
 bool IsControlLoop(const Node* node) {
   const string& name = node->name();
-  return StringPiece(name).starts_with("_cloop");
+  return str_util::StartsWith(name, "_cloop");
 }
 
 // An enter node for control flow.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 6841f2914989b22d6aef91831ac6101b0ba6555f..83b24cafe2cb364b2afd5dcb6533bf662dc40a1b 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -120,7 +121,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
       if (ndef.op() == "_Recv") {
         bool has_control = false;
         for (const string& input_name : ndef.input()) {
-          if (StringPiece(input_name).starts_with("^")) {
+          if (str_util::StartsWith(input_name, "^")) {
             has_control = true;
             break;
           }
@@ -128,7 +129,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
         EXPECT_TRUE(has_control);
       }
       // Must have a control loop
-      if (StringPiece(ndef.name()).starts_with("_cloop")) {
+      if (str_util::StartsWith(ndef.name(), "_cloop")) {
         if (ndef.op() == "Enter") {
           has_control_enter = true;
         }
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e2ce0ba046f26b69bdb8f427afeb480727977844..c8c2b225fea721bd19683fbdb805601bb9be494b 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -408,7 +409,7 @@ TEST_F(GraphTest, NewName) {
   EXPECT_NE(a1, a2);
   EXPECT_NE(a1, b1);
   EXPECT_NE(a2, b1);
-  EXPECT_TRUE(StringPiece(a1).starts_with("A")) << a1;
+  EXPECT_TRUE(str_util::StartsWith(a1, "A")) << a1;
 }
 
 TEST_F(GraphTest, IsValidNode) {
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 1b99d54e8e33fd5155913a78ee833343bf92b905..5f51d6083b1ae17d8c4dee2434f4b57de5f18d06 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -90,7 +90,7 @@ inline string GetMklOpName(const string& name) {
 // @input: name of the op
 // @input: T datatype to be used for checking op
 // @return: true if opname is registered as Mkl op; false otherwise
-static inline bool IsMklOp(const std::string& op_name, DataType T) {
+static inline bool IsMklOp(const string& op_name, DataType T) {
   string kernel = KernelsRegisteredForOp(op_name);
   bool result =
       kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
@@ -104,7 +104,7 @@ static inline bool IsMklOp(const std::string& op_name, DataType T) {
 // @input: T datatype to be used for checking op
 // @return: true if opname is registered as element-wise Mkl op;
 // false otherwise
-static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
+static inline bool IsMklElementWiseOp(const string& op_name, DataType T) {
   if (!IsMklOp(op_name, T)) {
     return false;
   }
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 7d3be152991351533a6185ea088503032f720b47..72a13d4da7a70a7dc22df8ef382575ac78ad78d3 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// TODO(intel): Improve error handling in this file; instead of CHECK failing
+// all over the place, we should log an error and execute the original graph.
 #ifdef INTEL_MKL
 
 #include <algorithm>
@@ -545,14 +547,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -1030,8 +1032,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorProto proto;
   proto.set_dtype(dt);
   uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           8);
+  proto.set_tensor_content(string(reinterpret_cast<const char*>(zero), 8));
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
@@ -1144,7 +1145,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     // For that let's first find filter node that is 2nd input (slot 1)
     // of BackpropInput.
     Node* filter_node = nullptr;
-    old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+    TF_CHECK_OK(old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx,
+                                     &filter_node));
     CHECK_NOTNULL(filter_node);
 
     // Now check which nodes receive from filter_node. Filter feeds as
@@ -1323,8 +1325,7 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorProto proto;
   proto.set_dtype(dt);
   float zero[1] = {0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           4);
+  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 4));
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
@@ -1829,7 +1830,7 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
 
     // Create node.
     Node* new_node;
-    nb.Finalize(&**g, &new_node);
+    TF_CHECK_OK(nb.Finalize(&**g, &new_node));
     CHECK_NOTNULL(new_node);
 
     // Set the Mkl layer label for this op.
@@ -2491,10 +2492,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
-                      CopyAttrsLRN, AlwaysRewrite});
+                      CopyAttrsLRN, LrnRewrite});
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
-                      CopyAttrsLRN, AlwaysRewrite});
+                      CopyAttrsLRN, LrnRewrite});
     rinfo_.push_back({csinfo_.max_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool),
                       CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
@@ -2690,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -2864,6 +2865,28 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
+  // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized 
+  // path. The unoptimized path is slow. Thus we dont rewrite the node 
+  // and use default Eigen. But for depth_radius=2, MKL DNN optimized 
+  // path is taken, i.e., eigen node is rewritten by MKl DNN node.
+  static bool LrnRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    int depth_radius;
+    CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true);
+
+    // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN
+    // and use eigen node instead 
+    if (depth_radius == 2) {
+      return true;
+    }
+    VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which"
+            << "case is not optimized by Intel MKL, thus using Eigen op"
+            << "for LRN " ; 
+
+    return false;
+  }
+
   static bool AddNRewrite(const Node* n) {
     CHECK_NOTNULL(n);
 
@@ -3080,8 +3103,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorProto proto;
   proto.set_dtype(dt);
   uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           8);
+  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 8));
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
@@ -3196,7 +3218,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     // For that let's first find filter node that is 2nd input (slot 1)
     // of BackpropInput.
     Node* filter_node = nullptr;
-    old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+    TF_CHECK_OK(old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx,
+                                     &filter_node));
     CHECK_NOTNULL(filter_node);
 
     // Now check which nodes receive from filter_node. Filter feeds as
@@ -3376,8 +3399,7 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorProto proto;
   proto.set_dtype(dt);
   float zero[1] = {0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           4);
+  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 4));
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
@@ -3527,11 +3549,13 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
   string data_format;
   string padding;
   std::vector<int32> strides;
+  std::vector<int32> dilations;
   bool use_cudnn_on_gpu;
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
   TF_CHECK_OK(
@@ -3540,6 +3564,7 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
   nb->Attr("data_format", data_format);
   nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
@@ -3777,12 +3802,14 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   DataType T_pred, T_succ;
   string padding;
   std::vector<int32> strides;
+  std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
   bool use_cudnn_on_gnu;
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
@@ -3848,7 +3875,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
 
   // Create node.
   Node* new_node;
-  nb.Finalize(&**g, &new_node);
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
   CHECK_NOTNULL(new_node);
 
   // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
@@ -3959,7 +3986,7 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
 
   // Create node.
   Node* new_node;
-  nb.Finalize(&**g, &new_node);
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
   CHECK_NOTNULL(new_node);
 
   // Incoming data edges from BiasAddGrad node and Conv2DBackpropFilter node to
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.h b/tensorflow/core/graph/mkl_tfconversion_pass.h
index 0562d8b3cd4ffa69c2e4ab5eb3875abc2b25c14f..84e50ee6e0d57f629f90fa456ba8cf3ace018bfb 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.h
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.h
@@ -24,6 +24,10 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/core/graph/graph.h"
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 namespace tensorflow {
 // Interface to invoke the pass for unit test
 //
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index 138952dcb33e7b1e57cf013147581d20f509e85d..114962c0e4f2969fe539d5b168aaf62d577a7024 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -88,7 +88,7 @@ NodeBuilder& NodeBuilder::ControlInput(Node* src_node) {
 NodeBuilder& NodeBuilder::ControlInputs(gtl::ArraySlice<Node*> src_nodes) {
   control_inputs_.insert(control_inputs_.end(), src_nodes.begin(),
                          src_nodes.end());
-  for (Node* src_node : src_nodes) {
+  for (const Node* src_node : src_nodes) {
     def_builder_.ControlInput(src_node->name());
   }
   return *this;
@@ -127,7 +127,7 @@ Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const {
   return Status::OK();
 }
 
-void NodeBuilder::AddIndexError(Node* node, int i) {
+void NodeBuilder::AddIndexError(const Node* node, int i) {
   if (node == nullptr) {
     errors_.emplace_back(
         strings::StrCat("Attempt to add nullptr Node to node with type ",
@@ -140,7 +140,7 @@ void NodeBuilder::AddIndexError(Node* node, int i) {
   }
 }
 
-bool NodeBuilder::GetOutputType(Node* node, int i, DataType* dt) {
+bool NodeBuilder::GetOutputType(const Node* node, int i, DataType* dt) {
   bool error;
   *dt = SafeGetOutput(node, i, &error);
   if (error) AddIndexError(node, i);
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index 86647a49c12085b6850a0e6d2622ef1bb58c513d..f6b7b5674b032cd2b19d69765e7c3b6b6613b3bd 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -120,7 +120,7 @@ class NodeBuilder {
   const OpDef& op_def() const { return def_builder_.op_def(); }
 
  private:
-  static DataType SafeGetOutput(Node* node, int i, bool* error) {
+  static DataType SafeGetOutput(const Node* node, int i, bool* error) {
     if (node != nullptr && i >= 0 && i < node->num_outputs()) {
       *error = false;
       return node->output_type(i);
@@ -131,11 +131,11 @@ class NodeBuilder {
   }
 
   // If SafeGetOutput indicates a range error, add it to errors_.
-  void AddIndexError(Node* node, int i);
+  void AddIndexError(const Node* node, int i);
 
   // Set *dt and returns true if i is in range. Combines
   // SafeGetOutput() and AddIndexError().
-  bool GetOutputType(Node* node, int i, DataType* dt);
+  bool GetOutputType(const Node* node, int i, DataType* dt);
 
   NodeDefBuilder def_builder_;
   std::vector<NodeOut> inputs_;
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index 6b452a1d5dca0a636264a3483e4ee9d027fd2e06..4073255db3f7cbcd697f3cb2781e04b3b01634c1 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -65,8 +65,8 @@ class OptimizerCSE {
 };
 
 static void FillInputs(const Node* n,
-                       gtl::InlinedVector<Node*, 4>* control_edges,
-                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+                       gtl::InlinedVector<const Node*, 4>* control_edges,
+                       gtl::InlinedVector<std::pair<const Node*, int>, 4>* in) {
   DCHECK_EQ(in->size(), n->num_inputs());
   control_edges->clear();
   for (const Edge* e : n->in_edges()) {
@@ -96,8 +96,8 @@ size_t OptimizerCSE::NodeHash(const Node* n) {
 
   const int N_in = n->num_inputs();
   strings::StrAppend(&str_to_hash, N_in);
-  gtl::InlinedVector<Node*, 4> control_edges;
-  gtl::InlinedVector<std::pair<Node*, int>, 4> in(N_in);
+  gtl::InlinedVector<const Node*, 4> control_edges;
+  gtl::InlinedVector<std::pair<const Node*, int>, 4> in(N_in);
   FillInputs(n, &control_edges, &in);
   for (const auto& edge : in) {
     strings::StrAppend(&str_to_hash, edge.first->id(), edge.second);
@@ -147,10 +147,10 @@ bool OptimizerCSE::Equivalent(const Node* a, const Node* b,
   // Compare input sources
   if (a->num_inputs() != b->num_inputs()) return false;
   const int N_in = a->num_inputs();
-  gtl::InlinedVector<Node*, 4> a_control_edges;
-  gtl::InlinedVector<Node*, 4> b_control_edges;
-  gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
-  gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
+  gtl::InlinedVector<const Node*, 4> a_control_edges;
+  gtl::InlinedVector<const Node*, 4> b_control_edges;
+  gtl::InlinedVector<std::pair<const Node*, int>, 4> a_in(N_in);
+  gtl::InlinedVector<std::pair<const Node*, int>, 4> b_in(N_in);
   FillInputs(a, &a_control_edges, &a_in);
   FillInputs(b, &b_control_edges, &b_in);
   if (a_in != b_in) return false;
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index cb0fc8a1547a8498aa0bd089a2c9395119de2789..3b6e8cc2339a42285a68c6898c99b1ec4b585917 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -259,8 +259,14 @@ Status AddRestoreVariableSubgraphs(Graph* graph, Node* save_op,
   const string restore_op_name = strings::StrCat(name_prefix, "/RestoreV2");
   const string assign_op_name = strings::StrCat(name_prefix, "/Assign");
   for (Node* var : variables) {
-    string new_restore_op_name = graph->NewName(restore_op_name);
-    string new_assign_op_name = graph->NewName(assign_op_name);
+    // Add an extra prefix after calling graph->NewName because the "unique"
+    // name may conflict with names generated for Send nodes.
+    // TODO(b/77547936): fix this more generally and get rid of the extra prefix
+    // here.
+    string new_restore_op_name =
+        strings::StrCat(graph->NewName(restore_op_name), "_qt");
+    string new_assign_op_name =
+        strings::StrCat(graph->NewName(assign_op_name), "_qt");
     string tensor_names_op_name =
         strings::StrCat(new_restore_op_name, "/tensor_names");
     string shape_and_slices_op_name =
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index 2ad69dbd0c608fa79354c73e01167c3b02ff4fc2..e46f92bc24de9fc7d7923e4b9ebe0f04882beae4 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -215,7 +216,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
@@ -269,7 +270,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc
index 2a08bf8ca019185cf9d13dd83ca1880c3741090f..193cf88aed3da8c871f457c02d8dbb714b926737 100644
--- a/tensorflow/core/graph/subgraph.cc
+++ b/tensorflow/core/graph/subgraph.cc
@@ -28,13 +28,13 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace subgraph {
 
 // ----------------------------------------------------------------------------
 // Subgraph construction-related routines
@@ -44,6 +44,8 @@ namespace tensorflow {
 
 namespace {
 
+typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameIndex;
+
 // Rewrite graph by replacing the output tensors specified in
 // "fed_outputs" with special feed nodes for each specified output
 // tensor, and removing any nodes that are now disconnected from the
@@ -53,59 +55,33 @@ namespace {
 // Return true on success.  On error, return false and sets *error to
 // an appropriate error message (and *g is left in an indeterminate
 // state).
-static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
-                         const gtl::ArraySlice<string>& fed_outputs,
-                         bool use_function_convention,
-                         subgraph::NameIndex* name_index,
-                         DataTypeVector* out_feed_types) {
+Status FeedInputs(
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
+    NameIndex* name_index, DataTypeVector* out_feed_types) {
   out_feed_types->clear();
-  out_feed_types->reserve(fed_outputs.size());
-  for (size_t i = 0; i < fed_outputs.size(); ++i) {
-    const string& t = fed_outputs[i];
+  out_feed_types->reserve(feed_rewrites.size());
+  for (size_t i = 0; i < feed_rewrites.size(); ++i) {
+    const string& t = feed_rewrites[i]->endpoint_name();
     TensorId id(ParseTensorName(t));
 
     auto iter = name_index->find(id.first);
     if (iter == name_index->end()) {
       return errors::NotFound("FeedInputs: unable to find feed output ", t);
     }
-    const Node* n = iter->second;
+    Node* n = iter->second;
     DCHECK_EQ(n->name(), id.first);
     if (id.second >= n->num_outputs()) {
       return errors::InvalidArgument(
           "FeedInputs: ", t, " should have output index < ", n->num_outputs());
     }
 
-    Node* recv_node;
-
-    if (!use_function_convention) {
-      TF_RETURN_IF_ERROR(
-          NodeBuilder(strings::StrCat("_recv_", id.first, "_", id.second),
-                      "_Recv")
-              .Attr("tensor_type", BaseType(n->output_type(id.second)))
-              .Attr("tensor_name", t)
-              .Attr("send_device", device_info.name())
-              .Attr("recv_device", device_info.name())
-              .Attr("send_device_incarnation",
-                    static_cast<int64>(device_info.incarnation()))
-              .Attr("client_terminated", true)
-              .Finalize(g, &recv_node));
-    } else {
-      // NOTE(mrry): We must include the index as part of the node
-      // name, because _Arg is a "stateful" kernel and therefore
-      // its name must uniquely identify a kernel instance across all
-      // graphs in the same session.
-      TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("_arg_", id.first, "_",
-                                                     id.second, "_", i),
-                                     "_Arg")
-                             .Attr("T", BaseType(n->output_type(id.second)))
-                             .Attr("index", static_cast<int32>(i))
-                             .Finalize(g, &recv_node));
-    }
-    recv_node->set_assigned_device_name(device_info.name());
+    Node* feed_node;
+    TF_RETURN_IF_ERROR(
+        feed_rewrites[i]->AddNode(g, {n, id.second}, &feed_node));
 
     // Update name_index
-    (*name_index)[recv_node->name()] = recv_node;
-    g->AddControlEdge(g->source_node(), recv_node);
+    (*name_index)[feed_node->name()] = feed_node;
+    g->AddControlEdge(g->source_node(), feed_node);
 
     // Look through edges coming out of "n" for edges whose src_output() index
     // matches "output_index".  If found, replace the edges with a connection
@@ -119,7 +95,7 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
                   n->type_string() == "PlaceholderV2")) {
         // When feeding a Placeholder node, any outgoing control edges
         // will be replaced with a control edge from the replacement
-        // recv_node.
+        // feed_node.
         // TODO(josh11b,mrry): Come up with a more elegant way of addressing
         // the general version of this problem.
         to_remove.emplace_back(e);
@@ -128,10 +104,10 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
 
     for (const Edge* e : to_remove) {
       if (e->src_output() == id.second) {
-        g->AddEdge(recv_node, 0, e->dst(), e->dst_input());
+        g->AddEdge(feed_node, 0, e->dst(), e->dst_input());
       } else {
         CHECK_EQ(Graph::kControlSlot, e->src_output());
-        g->AddControlEdge(recv_node, e->dst());
+        g->AddControlEdge(feed_node, e->dst());
       }
       g->RemoveEdge(e);
     }
@@ -140,9 +116,61 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
   return Status::OK();
 }
 
-static bool AddNodeToTargets(const string& node_or_tensor_name,
-                             const subgraph::NameIndex& name_index,
-                             std::unordered_set<const Node*>* targets) {
+Status FetchOutputs(
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
+    NameIndex* name_index, std::vector<Node*>* out_fetch_nodes,
+    DataTypeVector* out_fetch_types) {
+  out_fetch_nodes->clear();
+  out_fetch_nodes->reserve(fetch_rewrites.size());
+  for (size_t i = 0; i < fetch_rewrites.size(); ++i) {
+    const string& t = fetch_rewrites[i]->endpoint_name();
+
+    // Parse t into node_name and output_index.
+    TensorId id(ParseTensorName(t));
+
+    // Find node in graph with that name.
+    auto iter = name_index->find(id.first);
+    if (iter == name_index->end()) {
+      return errors::NotFound("FetchOutputs node ", t, ": not found");
+    }
+    Node* n = iter->second;
+    DCHECK_EQ(n->name(), id.first);
+    VLOG(2) << "Found fetch node for " << t;
+
+    // Validate output_index
+    if (n->num_outputs() == 0) {
+      return errors::InvalidArgument(
+          "Tried to fetch data for '", t,
+          "', which produces no output.  To run to a node but not fetch any "
+          "data, pass '",
+          t,
+          "' as an argument to the 'target_node_names' argument of the "
+          "Session::Run API.");
+    } else if (id.second >= n->num_outputs()) {
+      return errors::InvalidArgument("FetchOutputs ", t,
+                                     ": output index too large, must be < ",
+                                     n->num_outputs());
+    }
+
+    // Create the fetch Node and connect it up
+    Node* fetch_node;
+    TF_RETURN_IF_ERROR(
+        fetch_rewrites[i]->AddNode(g, {n, id.second}, &fetch_node));
+
+    // Update the index.
+    (*name_index)[fetch_node->name()] = fetch_node;
+
+    g->AddControlEdge(fetch_node, g->sink_node());
+    out_fetch_nodes->push_back(fetch_node);
+    out_fetch_types->push_back(BaseType(n->output_type(id.second)));
+  }
+
+  return Status::OK();
+}
+
+bool AddNodeToTargets(const string& node_or_tensor_name,
+                      const NameIndex& name_index,
+                      std::unordered_set<const Node*>* targets) {
   TensorId id = ParseTensorName(node_or_tensor_name);
   auto iter = name_index.find(id.first);
   if (iter == name_index.end()) {
@@ -154,9 +182,9 @@ static bool AddNodeToTargets(const string& node_or_tensor_name,
   return true;
 }
 
-static Status PruneForTargets(Graph* g, const subgraph::NameIndex& name_index,
-                              const std::vector<Node*>& fetch_nodes,
-                              const gtl::ArraySlice<string>& target_nodes) {
+Status PruneForTargets(Graph* g, const NameIndex& name_index,
+                       const std::vector<Node*>& fetch_nodes,
+                       const gtl::ArraySlice<string>& target_nodes) {
   string not_found;
   std::unordered_set<const Node*> targets;
   for (Node* n : fetch_nodes) {
@@ -183,108 +211,149 @@ static Status PruneForTargets(Graph* g, const subgraph::NameIndex& name_index,
 
 }  // namespace
 
-namespace subgraph {
+Status ArgFeedRewrite::AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                               Node** out_node) {
+  // NOTE(mrry): We must include the index as part of the node
+  // name, because _Arg is a "stateful" kernel and therefore
+  // its name must uniquely identify a kernel instance across all
+  // graphs in the same session.
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_arg_", feed_tensor.node->name(), "_",
+                                  feed_tensor.index, "_", arg_index_),
+                  "_Arg")
+          .Attr("T", BaseType(feed_tensor.node->output_type(feed_tensor.index)))
+          .Attr("index", arg_index_)
+          .Finalize(g, out_node));
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-Status FetchOutputs(Graph* g, const DeviceAttributes& device_info,
-                    const gtl::ArraySlice<string>& fetch_outputs,
-                    bool use_function_convention, NameIndex* name_index,
-                    std::vector<Node*>* out_fetch_nodes,
-                    DataTypeVector* out_fetch_types) {
-  out_fetch_nodes->clear();
-  out_fetch_nodes->reserve(fetch_outputs.size());
-  for (size_t i = 0; i < fetch_outputs.size(); ++i) {
-    const string& t = fetch_outputs[i];
+Status RecvFeedRewrite::AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                                Node** out_node) {
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_recv_", feed_tensor.node->name(), "_",
+                                  feed_tensor.index),
+                  "_Recv")
+          .Attr("tensor_type",
+                BaseType(feed_tensor.node->output_type(feed_tensor.index)))
+          .Attr("tensor_name", endpoint_name())
+          .Attr("send_device", device_info().name())
+          .Attr("recv_device", device_info().name())
+          .Attr("send_device_incarnation",
+                static_cast<int64>(device_info().incarnation()))
+          .Attr("client_terminated", true)
+          .Finalize(g, out_node));
+
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-    // Parse t into node_name and output_index.
-    TensorId id(ParseTensorName(t));
+Status RetvalFetchRewrite::AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                                   Node** out_node) {
+  // NOTE(mrry): We must include the index as part of the node
+  // name, because _Retval is a "stateful" kernel and therefore
+  // its name must uniquely identify a kernel instance across all
+  // graphs in the same session.
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_retval_", fetch_tensor.node->name(), "_",
+                                  fetch_tensor.index, "_", retval_index_),
+                  "_Retval")
+          .Input(fetch_tensor.node, fetch_tensor.index)
+          .Attr("T",
+                BaseType(fetch_tensor.node->output_type(fetch_tensor.index)))
+          .Attr("index", retval_index_)
+          .Finalize(g, out_node));
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-    // Find node in graph with that name.
-    auto iter = name_index->find(id.first);
-    if (iter == name_index->end()) {
-      return errors::NotFound("FetchOutputs node ", t, ": not found");
-    }
-    Node* n = iter->second;
-    DCHECK_EQ(n->name(), id.first);
-    VLOG(2) << "Found fetch node for " << t;
+Status SendFetchRewrite::AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                                 Node** out_node) {
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_send_", fetch_tensor.node->name(), "_",
+                                  fetch_tensor.index),
+                  "_Send")
+          .Input(fetch_tensor.node, fetch_tensor.index)
+          .Attr("tensor_name", endpoint_name())
+          .Attr("send_device", device_info().name())
+          .Attr("recv_device", device_info().name())
+          .Attr("send_device_incarnation",
+                static_cast<int64>(device_info().incarnation()))
+          .Attr("client_terminated", true)
+          .Finalize(g, out_node));
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-    // Validate output_index
-    if (n->num_outputs() == 0) {
-      return errors::InvalidArgument(
-          "Tried to fetch data for '", t,
-          "', which produces no output.  To run to a node but not fetch any "
-          "data, pass '",
-          t,
-          "' as an argument to the 'target_node_names' argument of the "
-          "Session::Run API.");
-    } else if (id.second >= n->num_outputs()) {
-      return errors::InvalidArgument("FetchOutputs ", t,
-                                     ": output index too large, must be < ",
-                                     n->num_outputs());
+Status RewriteGraphForExecution(
+    Graph* g, const gtl::ArraySlice<string>& fed_outputs,
+    const gtl::ArraySlice<string>& fetch_outputs,
+    const gtl::ArraySlice<string>& target_node_names,
+    const DeviceAttributes& device_info, bool use_function_convention,
+    RewriteGraphMetadata* out_metadata) {
+  std::vector<std::unique_ptr<PruneRewrite>> feed_rewrites;
+  feed_rewrites.reserve(fed_outputs.size());
+  if (use_function_convention) {
+    for (size_t i = 0; i < fed_outputs.size(); ++i) {
+      feed_rewrites.emplace_back(new ArgFeedRewrite(
+          &fed_outputs[i], &device_info, static_cast<int32>(i)));
     }
-
-    // Create the fetch Node and connect it up
-    Node* send_node;
-    if (!use_function_convention) {
-      TF_RETURN_IF_ERROR(
-          NodeBuilder(strings::StrCat("_send_", id.first, "_", id.second),
-                      "_Send")
-              .Input(n, id.second)
-              .Attr("tensor_name", t)
-              .Attr("send_device", device_info.name())
-              .Attr("recv_device", device_info.name())
-              .Attr("send_device_incarnation",
-                    static_cast<int64>(device_info.incarnation()))
-              .Attr("client_terminated", true)
-              .Finalize(g, &send_node));
-    } else {
-      // NOTE(mrry): We must include the index as part of the node
-      // name, because _Retval is a "stateful" kernel and therefore
-      // its name must uniquely identify a kernel instance across all
-      // graphs in the same session.
-      TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("_retval_", id.first, "_",
-                                                     id.second, "_", i),
-                                     "_Retval")
-                             .Input(n, id.second)
-                             .Attr("T", BaseType(n->output_type(id.second)))
-                             .Attr("index", static_cast<int32>(i))
-                             .Finalize(g, &send_node));
+  } else {
+    for (const string& fed_output : fed_outputs) {
+      feed_rewrites.emplace_back(
+          new RecvFeedRewrite(&fed_output, &device_info));
     }
-    send_node->set_assigned_device_name(device_info.name());
-
-    // Update the index.
-    (*name_index)[send_node->name()] = send_node;
+  }
 
-    g->AddControlEdge(send_node, g->sink_node());
-    out_fetch_nodes->push_back(send_node);
-    out_fetch_types->push_back(BaseType(n->output_type(id.second)));
+  std::vector<std::unique_ptr<PruneRewrite>> fetch_rewrites;
+  fetch_rewrites.reserve(fetch_outputs.size());
+  if (use_function_convention) {
+    for (size_t i = 0; i < fetch_outputs.size(); ++i) {
+      fetch_rewrites.emplace_back(new RetvalFetchRewrite(
+          &fetch_outputs[i], &device_info, static_cast<int32>(i)));
+    }
+  } else {
+    for (const string& fetch_output : fetch_outputs) {
+      fetch_rewrites.emplace_back(
+          new SendFetchRewrite(&fetch_output, &device_info));
+    }
   }
 
-  return Status::OK();
+  return RewriteGraphForExecution(g, feed_rewrites, fetch_rewrites,
+                                  target_node_names, out_metadata);
+}
+
+namespace {
+template <typename StringContainer>
+std::vector<string> ConvertToVector(StringContainer field) {
+  return std::vector<string>(field.begin(), field.end());
 }
+}  // namespace
 
 Status RewriteGraphForExecution(
-    Graph* g, const gtl::ArraySlice<string>& fed_outputs,
-    const gtl::ArraySlice<string>& fetch_outputs,
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
+    const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
     const gtl::ArraySlice<string>& target_node_names,
-    const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata) {
-  if (fetch_outputs.empty() && target_node_names.empty()) {
+  if (fetch_rewrites.empty() && target_node_names.empty()) {
     return errors::InvalidArgument(
         "Must specify at least one target to fetch or execute.");
   }
 
   std::unordered_set<string> endpoints;
-  for (const string& endpoint_name : fed_outputs) {
-    auto result = endpoints.insert(endpoint_name);
+  for (const auto& feed_rewrite : feed_rewrites) {
+    auto result = endpoints.insert(feed_rewrite->endpoint_name());
     if (!result.second) {
-      return errors::InvalidArgument("Endpoint \"", endpoint_name,
+      return errors::InvalidArgument("Endpoint \"",
+                                     feed_rewrite->endpoint_name(),
                                      "\" fed more than once.");
     }
   }
 
-  for (const auto& fetch : fetch_outputs) {
-    if (endpoints.count(fetch) > 0) {
-      return errors::InvalidArgument(fetch, " is both fed and fetched.");
+  for (const auto& fetch_rewrite : fetch_rewrites) {
+    if (endpoints.count(fetch_rewrite->endpoint_name()) > 0) {
+      return errors::InvalidArgument(fetch_rewrite->endpoint_name(),
+                                     " is both fed and fetched.");
     }
   }
 
@@ -297,19 +366,17 @@ Status RewriteGraphForExecution(
   }
 
   // Add the feeds.  This may replace nodes in the graph, including the nodes
-  // currently listed in "fetch_nodes".  We pass "name_index" so the index is
+  // currently listed in "fetch_rewrites".  We pass "name_index" so the index is
   // kept up to date.
-  if (!fed_outputs.empty()) {
-    TF_RETURN_IF_ERROR(FeedInputs(g, device_info, fed_outputs,
-                                  use_function_convention, &name_index,
-                                  &out_metadata->feed_types));
+  if (!feed_rewrites.empty()) {
+    TF_RETURN_IF_ERROR(
+        FeedInputs(g, feed_rewrites, &name_index, &out_metadata->feed_types));
   }
 
   // Add the fetch nodes, also updating "name_index".
   std::vector<Node*> fetch_nodes;
-  if (!fetch_outputs.empty()) {
-    TF_RETURN_IF_ERROR(FetchOutputs(g, device_info, fetch_outputs,
-                                    use_function_convention, &name_index,
+  if (!fetch_rewrites.empty()) {
+    TF_RETURN_IF_ERROR(FetchOutputs(g, fetch_rewrites, &name_index,
                                     &fetch_nodes, &out_metadata->fetch_types));
   }
 
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index 3c1f8870f57f6d585f795cc92c320927e1a29315..ba35846d937bfeeeab825be2a2897aa6f3a195b7 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -20,8 +20,10 @@ limitations under the License.
 
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace subgraph {
@@ -38,6 +40,37 @@ struct RewriteGraphMetadata {
   DataTypeVector fetch_types;
 };
 
+// Describes the action to take on a particular tensor endpoint (described by
+// a "<node_name>:<output_index>" pair) when pruning the graph.
+//
+// The `AddNode()` method must be overridden to describe this action. The method
+// will be invoked once during `RewriteGraphForExecution()` with tensor endpoint
+// named by `endpoint_name`, and it may either create a single new node, or fail
+// with an error if the resulting graph would be invalid.
+class PruneRewrite {
+ public:
+  // `endpoint_name` and `device_info` must outlive this object.
+  PruneRewrite(const string* endpoint_name, const DeviceAttributes* device_info)
+      : endpoint_name_(endpoint_name), device_info_(device_info) {}
+  virtual ~PruneRewrite() {}
+
+  // Creates a new node whose output replaces the given `tensor` in graph `g`.
+  // The node will be assigned to the device named in `device_info`.
+  virtual Status AddNode(Graph* g, NodeBuilder::NodeOut tensor,
+                         Node** out_node) = 0;
+
+  // Returns the name of the tensor to which this rewrite applies.
+  const string& endpoint_name() { return *endpoint_name_; }
+
+ protected:
+  // The device on which the new node will be created.
+  const DeviceAttributes& device_info() { return *device_info_; }
+
+ private:
+  const string* const endpoint_name_;          // Not owned.
+  const DeviceAttributes* const device_info_;  // Not owned.
+};
+
 // Rewrite the graph structure of "*g" to deal with feeding node
 // outputs, fetching node outputs, and only running a subset of the
 // graph.  "fed_outputs" and "fetch_outputs" are both lists of
@@ -48,7 +81,7 @@ struct RewriteGraphMetadata {
 // In the resulting graph "*g", output edges in "fed_outputs" have
 // been redirected to special "_recv" nodes introduced into the graph.
 // If these fed nodes are not needed in order to compute the effects
-// of the nodes in "targets_nodes" and "fetch_outputs", then these may
+// of the nodes in "target_node_names" and "fetch_outputs", then these may
 // be omitted from the graph.
 //
 // In the resulting graph "*g", additional "_send" nodes are connected
@@ -71,19 +104,60 @@ Status RewriteGraphForExecution(
     const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata);
 
-typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameIndex;
+// A more general version of the above function that supports
+// customizable rewriting actions for each fed and fetched tensor.
+Status RewriteGraphForExecution(
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
+    const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
+    const gtl::ArraySlice<string>& target_node_names,
+    RewriteGraphMetadata* out_metadata);
 
-// Augment "*g" by adding special "fetch" nodes that connect to the
-// tensor outputs specified in "fetch_outputs" to retrieve the output
-// of the tensors.  The new nodes added are set up to execute on
-// "client_device_name", and are returned in "*fetch_nodes".
-//
-// Return OK on success.  On error, return false and sets *error to
-// an appropriate error message (and *g is left in an indeterminate
-// state).
-Status FetchOutputs(Graph* g, const DeviceAttributes& device_info,
-                    const gtl::ArraySlice<string>& fetch_outputs,
-                    NameIndex* name_index, std::vector<Node*>* fetch_nodes);
+/////////////////////////////////////////////////////////
+// Custom rewrite actions for fed and fetched tensors. //
+/////////////////////////////////////////////////////////
+
+// A rewrite action that adds an _Arg node for a fed tensor.
+class ArgFeedRewrite : public PruneRewrite {
+ public:
+  ArgFeedRewrite(const string* endpoint_name,
+                 const DeviceAttributes* device_info, int32 arg_index)
+      : PruneRewrite(endpoint_name, device_info), arg_index_(arg_index) {}
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override;
+
+ private:
+  const int32 arg_index_;
+};
+
+// A rewrite action that adds a client-terminated _Recv node for a fed tensor.
+class RecvFeedRewrite : public PruneRewrite {
+ public:
+  using PruneRewrite::PruneRewrite;
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override;
+};
+
+// A rewrite action that adds a _Retval node for a fetched tensor.
+class RetvalFetchRewrite : public PruneRewrite {
+ public:
+  RetvalFetchRewrite(const string* endpoint_name,
+                     const DeviceAttributes* device_info, int32 retval_index)
+      : PruneRewrite(endpoint_name, device_info), retval_index_(retval_index) {}
+  Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                 Node** out_node) override;
+
+ private:
+  const int32 retval_index_;
+};
+
+// A rewrite action that adds a client-terminated _Send node for a
+// fetched tensor.
+class SendFetchRewrite : public PruneRewrite {
+ public:
+  using PruneRewrite::PruneRewrite;
+  Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                 Node** out_node) override;
+};
 
 }  // namespace subgraph
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index 7219d9812f3e4a01cffa4b6b17d38781f7d5e2b0..6c014a8d44388eaeff80fb0850ac1575d3ec023a 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -312,8 +312,8 @@ TEST_F(SubgraphTest, ChainOfFools) {
   EXPECT_TRUE(HasEdge("e", 0, "_send_e_0", 0));
 }
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/core/graph/tensor_id.cc b/tensorflow/core/graph/tensor_id.cc
index 089ea5e527ab18322af01b6e80154cd759b9e980..8af1936d64e503d0cdcf10b7a492847b494c8664 100644
--- a/tensorflow/core/graph/tensor_id.cc
+++ b/tensorflow/core/graph/tensor_id.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -45,7 +46,7 @@ TensorId ParseTensorName(StringPiece name) {
   if (p > base && *p == ':' && mul > 1) {
     id.first = StringPiece(base, p - base);
     id.second = index;
-  } else if (name.starts_with("^")) {
+  } else if (str_util::StartsWith(name, "^")) {
     // Control edge
     id.first = StringPiece(base + 1);
     id.second = Graph::kControlSlot;
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 0d88d1ff723b94783693559926c51c6726a2341b..67b252cb6c576b84de7f823ace2a1c7750d0c35b 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 
 #include <vector>
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -50,7 +51,8 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_OP("HostConst")
     .Output("output: dtype")
     .Attr("value: tensor")
-    .Attr("dtype: type");
+    .Attr("dtype: type")
+    .SetShapeFn(shape_inference::UnknownShape);
 
 namespace test {
 namespace graph {
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index cb6d107cadc153930d99ea13cad985dd60c8b393..d58cdc3c5baf02f89cff52ef0396816cb00b48a3 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -60,7 +61,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -83,7 +84,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -91,7 +92,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   // Validation should still fail.
   s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 }
 
 TEST(ValidateGraphDefAgainstOpListTest, GraphWithOpOnlyInOpList) {
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 2ca9b720ee127b892c06230efb3517f5afabea45..9dcc6765f5b356438c325f84c4891d70e0089efd 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -3,18 +3,6 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "op_types",
     srcs = ["op_types.cc"],
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index b8f8e13c9a6830658e2b53388e1f91fbc8a22eab..30c6126fbb58c16813f923cec2f9551482ade6df 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -1,18 +1,11 @@
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "if_cuda")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
 )
 
 config_setting(
@@ -26,13 +19,12 @@ config_setting(
 tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
-    hdrs = [
-        "utils.h",
-    ],
+    hdrs = ["utils.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ] + select({
@@ -41,6 +33,21 @@ tf_cuda_library(
     }),
 )
 
+tf_cc_test(
+    name = "utils_test",
+    srcs = ["utils_test.cc"],
+    linkstatic = if_cuda(1, 0),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":utils",
+        "//tensorflow/core:gpu_id",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "cluster",
     srcs = ["cluster.cc"],
@@ -49,6 +56,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -66,6 +74,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cluster",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/costs:op_level_cost_estimator",
@@ -104,6 +113,7 @@ cc_library(
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 39bfca244ed2d40544dd2a17a019dadbe50f6d29..8d8c6084ec9743dea4b45820a6d4a5b2d938979b 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -62,6 +62,10 @@ void Cluster::DisableOptimizer(bool disable) {
         options_.config.mutable_graph_options()->mutable_rewrite_options();
     rewriter_config->set_layout_optimizer(RewriterConfig::OFF);
     rewriter_config->set_disable_model_pruning(true);
+    rewriter_config->set_function_optimization(RewriterConfig::OFF);
+    rewriter_config->set_arithmetic_optimization(RewriterConfig::OFF);
+    rewriter_config->set_loop_optimization(RewriterConfig::OFF);
+    rewriter_config->set_dependency_optimization(RewriterConfig::OFF);
     rewriter_config->set_constant_folding(RewriterConfig::OFF);
     rewriter_config->set_memory_optimization(RewriterConfig::NO_MEM_OPT);
     rewriter_config->mutable_auto_parallel()->set_enable(false);
diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h
index 5068f72b30d49850ab445318d1f67d0f4e0e618a..d33aaa7e4c16cd909894fccea597557cb20f0e54 100644
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
-#define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_CLUSTER_H_
+#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_CLUSTER_H_
 
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -92,6 +93,10 @@ class Cluster {
   // sorted alphabetically.
   const std::vector<string> GetDeviceNames() const;
 
+  // The DeviceSet is not always available, but when it is it contains a
+  // superset of the devices listed in GetDevices/GetDeviceNames().
+  const DeviceSet* GetDeviceSet() const { return device_set_; }
+
   // Enables collecting the allocator stats. Call with enable=true must be made
   // before Provision().
   virtual Status EnablePeakMemoryStats(bool enable) {
@@ -119,6 +124,7 @@ class Cluster {
 
  protected:
   std::unordered_map<string, DeviceProperties> devices_;
+  const DeviceSet* device_set_ = nullptr;  // Not owned
   const int timeout_s_;
   SessionOptions options_;
   RunOptions run_options_;
@@ -127,4 +133,4 @@ class Cluster {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_CLUSTER_H_
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index cc7f418d49816d64ffc51704d2f127a441815d7b..313ef90d81981ce43ab818873efa4da908e7dcfa 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/cc/training/queue_runner.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -80,13 +82,24 @@ Status SingleMachine::Provision() {
 
   std::vector<DeviceAttributes> devices;
   TF_RETURN_IF_ERROR(session_->ListDevices(&devices));
-  int gpu_id = 0;
   for (const auto& dev : devices) {
     DeviceProperties attr;
     if (dev.device_type() == "CPU") {
       attr = GetLocalCPUInfo();
     } else if (dev.device_type() == "GPU") {
-      attr = GetLocalGPUInfo(gpu_id++);
+      DeviceNameUtils::ParsedName parsed;
+      if (!DeviceNameUtils::ParseFullName(dev.name(), &parsed)) {
+        return errors::InvalidArgument(
+            strings::StrCat("Not able to parse GPU device name: ", dev.name()));
+      }
+      TfGpuId tf_gpu_id(parsed.id);
+      CudaGpuId cuda_gpu_id;
+      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      if (!s.ok()) {
+        return errors::Unavailable("Unknown TF GPU device with id ",
+                                   tf_gpu_id.value(), ": ", s.ToString());
+      }
+      attr = GetLocalGPUInfo(cuda_gpu_id);
     } else if (dev.device_type().find("XLA") == string::npos) {
       // Filter out the fake XLA devices to avoid double counting the actual
       // hardware resources that are available.
@@ -365,10 +378,15 @@ void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
                                        init_costs.node_size() +
                                        queue_costs.node_size());
   std::unordered_set<string> nodes_seen;
+  int queue_costs_id_offset = graph_costs->node_size();
   for (const auto& node : graph_costs->node()) {
     nodes_seen.insert(node.name());
+    if (node.id() >= queue_costs_id_offset) {
+      queue_costs_id_offset = node.id() + 1;
+    }
   }
 
+  int init_costs_id_offset = queue_costs_id_offset + queue_costs.node_size();
   // The costs obtained by running the main graph could be more stable than
   // the one we get from the queue runners since the queue runners run
   // asynchronously.
@@ -376,7 +394,22 @@ void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
     if (nodes_seen.find(node.name()) != nodes_seen.end()) {
       continue;
     }
-    graph_costs->add_node()->MergeFrom(node);
+
+    auto* new_node = graph_costs->add_node();
+    new_node->MergeFrom(node);
+
+    new_node->set_id(node.id() + queue_costs_id_offset);
+    if (new_node->id() >= init_costs_id_offset) {
+      init_costs_id_offset = new_node->id() + 1;
+    }
+
+    for (auto& input_info : *new_node->mutable_input_info()) {
+      input_info.set_preceding_node(input_info.preceding_node() +
+                                    queue_costs_id_offset);
+    }
+    for (auto& control_input : *new_node->mutable_control_input()) {
+      control_input += queue_costs_id_offset;
+    }
   }
 
   // Don't overwrite the costs with that generated during initialization since
@@ -385,7 +418,18 @@ void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
     if (nodes_seen.find(node.name()) != nodes_seen.end()) {
       continue;
     }
-    graph_costs->add_node()->MergeFrom(node);
+
+    auto* new_node = graph_costs->add_node();
+    new_node->MergeFrom(node);
+
+    new_node->set_id(node.id() + init_costs_id_offset);
+    for (auto& input_info : *new_node->mutable_input_info()) {
+      input_info.set_preceding_node(input_info.preceding_node() +
+                                    init_costs_id_offset);
+    }
+    for (auto& control_input : *new_node->mutable_control_input()) {
+      control_input += init_costs_id_offset;
+    }
   }
 }
 
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index 90d6a04cab650178db0dc14ac94564690b0d7bbb..0ae188e0d62e386db02e5f6945b348c4f2ce6445 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
-#define TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
+#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
 
 #include "tensorflow/cc/training/coordinator.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -85,4 +85,4 @@ class SingleMachine : public Cluster {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index c6352c1448bb38ece78530007e2534d475ef7fb6..352f08fedecd426c06c8668ff8f3910286e6900a 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -196,10 +196,19 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
   std::set<string> cost_nodes;
   for (const auto& node : metadata.cost_graph().node()) {
+#ifdef INTEL_MKL
+    // Skip the special nodes inserted by TF (and MKL): these are either
+    // prefixed with an underscore or contain "/_".
+    if (node.name()[0] == '_' || node.name().find("/_") != string::npos) {
+      continue;
+    }
+    cost_nodes.insert(node.name());
+#else
     // Skip nodes added by TF internally.
     if (node.name()[0] != '_') {
       cost_nodes.insert(node.name());
     }
+#endif
   }
   const std::set<string> expected_cost_nodes = {
       "zero",      "one",      "add",         "square",
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index aacd2ccb72df07ac6b31c9bd5b96deca499038e4..a7519725a54c7529192a2862c48fa251a395e2e5 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -27,8 +27,12 @@ limitations under the License.
 #include "include/libxsmm.h"
 #endif
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/mem.h"
 
@@ -66,36 +70,40 @@ DeviceProperties GetLocalCPUInfo() {
   return device;
 }
 
-DeviceProperties GetLocalGPUInfo(int gpu_id) {
+DeviceProperties GetLocalGPUInfo(CudaGpuId cuda_gpu_id) {
   DeviceProperties device;
   device.set_type("GPU");
 
 #if GOOGLE_CUDA
   cudaDeviceProp properties;
-  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
-  if (error == cudaSuccess) {
-    device.set_vendor("NVidia");
-    device.set_model(properties.name);
-    device.set_frequency(properties.clockRate * 1e-3);
-    device.set_num_cores(properties.multiProcessorCount);
-    device.set_num_registers(properties.regsPerMultiprocessor);
-    // For compute capability less than 5, l1 cache size is configurable to
-    // either 16 KB or 48 KB. We use the initial configuration 16 KB here. For
-    // compute capability larger or equal to 5, l1 cache (unified with texture
-    // cache) size is 24 KB. This number may need to be updated for future
-    // compute capabilities.
-    device.set_l1_cache_size((properties.major < 5) ? 16 * 1024 : 24 * 1024);
-    device.set_l2_cache_size(properties.l2CacheSize);
-    device.set_l3_cache_size(0);
-    device.set_shared_memory_size_per_multiprocessor(
-        properties.sharedMemPerMultiprocessor);
-    device.set_memory_size(properties.totalGlobalMem);
-    // 8 is the number of bits per byte. 2 is accounted for
-    // double data rate (DDR).
-    device.set_bandwidth(properties.memoryBusWidth / 8 *
-                         properties.memoryClockRate * 2);
+  cudaError_t error = cudaGetDeviceProperties(&properties, cuda_gpu_id.value());
+  if (error != cudaSuccess) {
+    device.set_type("UNKNOWN");
+    LOG(ERROR) << "Failed to get device properties, error code: " << error;
+    return device;
   }
 
+  device.set_vendor("NVIDIA");
+  device.set_model(properties.name);
+  device.set_frequency(properties.clockRate * 1e-3);
+  device.set_num_cores(properties.multiProcessorCount);
+  device.set_num_registers(properties.regsPerMultiprocessor);
+  // For compute capability less than 5, l1 cache size is configurable to
+  // either 16 KB or 48 KB. We use the initial configuration 16 KB here. For
+  // compute capability larger or equal to 5, l1 cache (unified with texture
+  // cache) size is 24 KB. This number may need to be updated for future
+  // compute capabilities.
+  device.set_l1_cache_size((properties.major < 5) ? 16 * 1024 : 24 * 1024);
+  device.set_l2_cache_size(properties.l2CacheSize);
+  device.set_l3_cache_size(0);
+  device.set_shared_memory_size_per_multiprocessor(
+      properties.sharedMemPerMultiprocessor);
+  device.set_memory_size(properties.totalGlobalMem);
+  // 8 is the number of bits per byte. 2 is accounted for
+  // double data rate (DDR).
+  device.set_bandwidth(properties.memoryBusWidth / 8 *
+                       properties.memoryClockRate * 2);
+
   (*device.mutable_environment())["architecture"] =
       strings::StrCat(properties.major, ".", properties.minor);
   (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
@@ -106,18 +114,26 @@ DeviceProperties GetLocalGPUInfo(int gpu_id) {
 }
 
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
+  DeviceProperties unknown;
+  unknown.set_type("UNKNOWN");
+
   if (device.type == "CPU") {
     return GetLocalCPUInfo();
   } else if (device.type == "GPU") {
     if (device.has_id) {
-      return GetLocalGPUInfo(device.id);
+      TfGpuId tf_gpu_id(device.id);
+      CudaGpuId cuda_gpu_id;
+      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      if (!s.ok()) {
+        LOG(ERROR) << s;
+        return unknown;
+      }
+      return GetLocalGPUInfo(cuda_gpu_id);
     } else {
-      return GetLocalGPUInfo(0);
+      return GetLocalGPUInfo(CudaGpuId(0));
     }
   }
-  DeviceProperties result;
-  result.set_type("UNKNOWN");
-  return result;
+  return unknown;
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/clusters/utils.h b/tensorflow/core/grappler/clusters/utils.h
index 191942040a1fdd276bb50f799ce314389c2cb0fe..ca15c48006df13d3ef3e7806a2980adbbda2a471 100644
--- a/tensorflow/core/grappler/clusters/utils.h
+++ b/tensorflow/core/grappler/clusters/utils.h
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
-#define TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_UTILS_H_
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -27,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
 
 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-DeviceProperties GetLocalGPUInfo(int gpu_id);
+DeviceProperties GetLocalGPUInfo(CudaGpuId cuda_gpu_id);
 
 // Returns the DeviceProperties of the specified device
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
@@ -35,4 +36,4 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_UTILS_H_
diff --git a/tensorflow/core/grappler/clusters/utils_test.cc b/tensorflow/core/grappler/clusters/utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..74218adbac4eda3a7a780933b8116cfd2b7a1b18
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/utils_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/clusters/utils.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(UtilsTest, GetLocalGPUInfo) {
+  GpuIdManager::TestOnlyReset();
+#if GOOGLE_CUDA
+  LOG(INFO) << "CUDA is enabled.";
+  DeviceProperties properties;
+
+  // Invalid CUDA GPU ID.
+  properties = GetLocalGPUInfo(CudaGpuId(100));
+  EXPECT_EQ("UNKNOWN", properties.type());
+
+  // Succeed when a valid CUDA GPU id was inserted.
+  properties = GetLocalGPUInfo(CudaGpuId(0));
+  EXPECT_EQ("GPU", properties.type());
+  EXPECT_EQ("NVIDIA", properties.vendor());
+#else
+  LOG(INFO) << "CUDA is not enabled.";
+  DeviceProperties properties;
+
+  properties = GetLocalGPUInfo(CudaGpuId(0));
+  EXPECT_EQ("GPU", properties.type());
+
+  properties = GetLocalGPUInfo(CudaGpuId(100));
+  EXPECT_EQ("GPU", properties.type());
+#endif
+}
+
+TEST(UtilsTest, GetDeviceInfo) {
+  GpuIdManager::TestOnlyReset();
+  DeviceNameUtils::ParsedName device;
+  DeviceProperties properties;
+
+  // Invalid type.
+  properties = GetDeviceInfo(device);
+  EXPECT_EQ("UNKNOWN", properties.type());
+
+  // Cpu info.
+  device.type = "CPU";
+  properties = GetDeviceInfo(device);
+  EXPECT_EQ("CPU", properties.type());
+
+  // No TF GPU id provided.
+  device.type = "GPU";
+  device.has_id = false;
+  properties = GetDeviceInfo(device);
+  EXPECT_EQ("GPU", properties.type());
+#if GOOGLE_CUDA
+  EXPECT_EQ("NVIDIA", properties.vendor());
+#endif
+
+  // TF to CUDA GPU id mapping entry doesn't exist.
+  device.has_id = true;
+  device.id = 0;
+  properties = GetDeviceInfo(device);
+  EXPECT_EQ("UNKNOWN", properties.type());
+
+#if GOOGLE_CUDA
+  // Invalid CUDA GPU id.
+  GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId(0), CudaGpuId(100));
+  properties = GetDeviceInfo(device);
+  EXPECT_EQ("UNKNOWN", properties.type());
+
+  // Valid CUDA GPU id.
+  GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId(1), CudaGpuId(0));
+  device.id = 1;
+  properties = GetDeviceInfo(device);
+  EXPECT_EQ("GPU", properties.type());
+  EXPECT_EQ("NVIDIA", properties.vendor());
+#endif
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index ae70c9860823dae1a85ba20e00afe15b218cd2b4..5c9b2320b5bbf40ad545dba582f2276fda6aac60 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -37,6 +37,14 @@ VirtualCluster::VirtualCluster(
     : Cluster(0), node_estimator_(node_estimator), node_manager_(node_manager) {
   devices_ = devices;
 }
+
+VirtualCluster::VirtualCluster(
+    const std::unordered_map<string, DeviceProperties>& devices,
+    const DeviceSet* device_set)
+    : VirtualCluster(devices) {
+  device_set_ = device_set;
+}
+
 VirtualCluster::~VirtualCluster() {}
 
 Status VirtualCluster::Provision() { return Status::OK(); }
@@ -66,6 +74,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
   }
 
   Costs node_costs;
+  int node_id = 0;
   do {
     OpContext op_context = scheduler.GetCurrNode();
     node_costs = node_estimator_->PredictCosts(op_context);
@@ -73,6 +82,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
       CostGraphDef::Node* cost_node =
           metadata->mutable_cost_graph()->add_node();
       const string& op_name = op_context.name;
+      cost_node->set_id(node_id++);
       cost_node->set_name(op_name);
       cost_node->set_device(op_context.device_name);
       cost_node->set_compute_cost(
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
index dde70bab7a391e7573560b3202e9f0f7a0d69cae..eebac68e1b5acf336051a85a7898b7894bbbe0b2 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.h
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
-#define TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
 
 #include <unordered_map>
+
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
@@ -34,6 +36,8 @@ class VirtualCluster : public Cluster {
   VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
                  OpLevelCostEstimator* node_estimator,
                  ReadyNodeManager* node_manager);
+  VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
+                 const DeviceSet* device_set);
 
   ~VirtualCluster() override;
 
@@ -53,4 +57,4 @@ class VirtualCluster : public Cluster {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 0fe01e9c9e094ebfa7fd1e6200d775ef61775184..35f11eac2955e504c4edcecbb2cde78727343528 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -6,18 +6,6 @@ load(
     "tf_protos_grappler",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "graph_properties_testdata",
     srcs = glob([
@@ -53,8 +41,12 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
@@ -142,6 +134,7 @@ tf_cuda_library(
         "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index 9e01ec5ff5b48b9f979695b0a4b7b089245145c0..fe8a876f8ac3e97cd33f3eee0389eb700e845fda 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_COSTS_COST_ESTIMATOR_H_
-#define TENSORFLOW_GRAPPLER_COSTS_COST_ESTIMATOR_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
 
 #include <chrono>
 #include <unordered_map>
@@ -180,4 +180,4 @@ class CostEstimator {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_COSTS_COST_ESTIMATOR_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc
index 3604de392f803b8b2eb65e796848c2c3ec6a90e5..a5736d40b13fc6d38a6ffd64f5daa0f46bd3ba75 100644
--- a/tensorflow/core/grappler/costs/graph_memory.cc
+++ b/tensorflow/core/grappler/costs/graph_memory.cc
@@ -14,7 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/graph_memory.h"
-#include <list>
+
+#include <deque>
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -120,7 +121,7 @@ int64 GraphMemory::InferMemUsageForNeighbors(
 static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
     const string& node_name, int output_id,
     std::unordered_map<string, GraphMemory::LiveTensor*>* live_tensors,
-    std::list<GraphMemory::LiveTensor>* device_tensors) {
+    std::deque<GraphMemory::LiveTensor>* device_tensors) {
   string name = strings::StrCat(node_name, ":", output_id);
   GraphMemory::LiveTensor* live;
   auto it = live_tensors->find(name);
@@ -141,6 +142,10 @@ static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
 
 namespace {
 struct Event {
+  Event(int64 _timestamp, bool _allocated,
+        const GraphMemory::LiveTensor* _tensor)
+      : timestamp(_timestamp), allocated(_allocated), tensor(_tensor) {}
+
   int64 timestamp;
   bool allocated;
   const GraphMemory::LiveTensor* tensor;
@@ -160,13 +165,15 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
   }
 
   std::unordered_map<string, LiveTensor*> live_tensors;
-  std::unordered_map<string, std::list<LiveTensor>> live_tensors_per_device;
-
-  NodeMap node_map(&item_.graph);
+  std::unordered_map<string, std::deque<LiveTensor>> live_tensors_per_device;
+  std::unordered_map<string, const NodeDef*> node_map;
+  for (const NodeDef& node : item_.graph.node()) {
+    node_map[node.name()] = &node;
+  }
   for (const auto& dev_stats : timeline.dev_stats()) {
     const string& device_name = dev_stats.device();
     const bool is_gpu = (device_name.find("GPU:") || device_name.find("gpu:"));
-    std::list<LiveTensor>& device_tensors =
+    std::deque<LiveTensor>& device_tensors =
         live_tensors_per_device[dev_stats.device()];
     for (const auto& node_stats : dev_stats.node_stats()) {
       for (int i = 0; i < node_stats.output_size(); ++i) {
@@ -191,12 +198,13 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
                                     node_stats.op_end_rel_micros()));
       }
 
-      const NodeDef* node = node_map.GetNode(node_stats.node_name());
-      if (!node) {
+      auto it = node_map.find(node_stats.node_name());
+      if (it == node_map.end()) {
         // Skip nodes inserted by TF since they don't exist in the original
         // graph (e.g _Send/_Recv nodes).
         continue;
       }
+      const NodeDef* node = it->second;
       std::unordered_set<int> swapped_inputs;
       if (is_gpu) {
         auto it = node->attr().find("_swap_to_host");
@@ -237,14 +245,16 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
     std::vector<Event> events;
     events.reserve(2 * live_per_device.second.size());
     for (const auto& live : live_per_device.second) {
-      events.push_back(Event{live.allocation_time.count(), true, &live});
-      events.push_back(Event{live.deallocation_time.count(), false, &live});
+      events.emplace_back(static_cast<int64>(live.allocation_time.count()),
+                          true, &live);
+      events.emplace_back(static_cast<int64>(live.deallocation_time.count()),
+                          false, &live);
     }
     std::stable_sort(events.begin(), events.end());
     size_t peak = 0;
-    std::set<const LiveTensor*> live_at_peak;
+    std::unordered_set<const LiveTensor*> live_at_peak;
     size_t current = 0;
-    std::set<const LiveTensor*> currently_live;
+    std::unordered_set<const LiveTensor*> currently_live;
     for (int i = 0; i < events.size(); ++i) {
       const auto& event = events[i];
 
diff --git a/tensorflow/core/grappler/costs/graph_memory.h b/tensorflow/core/grappler/costs/graph_memory.h
index 859e4c012c84c9c77836e1cc16cf5ca578839cf8..a8ae4cc49f0cdc4218d6bf78021002fae61cdbc9 100644
--- a/tensorflow/core/grappler/costs/graph_memory.h
+++ b/tensorflow/core/grappler/costs/graph_memory.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_COSTS_GRAPH_MEMORY_H_
-#define TENSORFLOW_GRAPPLER_COSTS_GRAPH_MEMORY_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_MEMORY_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_MEMORY_H_
 
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -78,4 +78,4 @@ class GraphMemory {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_COSTS_GRAPH_MEMORY_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_MEMORY_H_
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 243ca9121c70d91631b474da62281bc56a476d8a..313f63149d54329ab609dc4c63a15fe8b1775af3 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -18,11 +18,17 @@ limitations under the License.
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
-#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -152,7 +158,7 @@ struct Processor<DimensionHandle> {
 template <typename Handle>
 class DisjointSet {
  public:
-  DisjointSet(const Processor<Handle>& processor) : processor_(processor) {}
+  DisjointSet() {}
   ~DisjointSet() {
     for (auto rep : nodes_) {
       delete rep.second;
@@ -250,17 +256,16 @@ typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
   return root;
 }
 
-bool IsQueue(const Node& node) {
-  StringPiece type(node.type_string());
-  return type.ends_with("QueueV2");
+bool IsQueue(const NodeDef& node) {
+  return str_util::EndsWith(node.op(), "QueueV2");
 }
 
 // Returns true if the node is an Enter op AND its input is a Queue.
-bool IsEnterWithQueue(const Node& node) {
-  if (node.IsEnter()) {
-    const Node* in_node;
-    TF_CHECK_OK(node.input_node(0, &in_node));
-    return IsQueue(*in_node);
+bool IsEnterWithQueue(const NodeDef& node, const GraphView& graph) {
+  if (IsEnter(node)) {
+    GraphView::InputPort input(&node, 0);
+    GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+    return IsQueue(*fanin.node);
   }
   return false;
 }
@@ -277,8 +282,9 @@ bool HasAnyUnknownDimensions(const TensorShapeProto& proto) {
   return false;
 }
 
+// This really should be done in an external debugging tool
 void VerboseLogUnknownDimensionSources(
-    const Graph& graph,
+    const GraphDef& graph,
     const std::map<string, std::vector<OpInfo::TensorProperties>>&
         input_properties_map,
     const std::map<string, std::vector<OpInfo::TensorProperties>>&
@@ -293,17 +299,13 @@ void VerboseLogUnknownDimensionSources(
   // do not have any unknown dimensions in their inputs, but
   // we have some unknown dimensions in their outputs.
   std::map<string, int> op_to_count;
-  for (const Node* const node : graph.nodes()) {
-    if (node->num_outputs() == 0) {
-      continue;
-    }
-
-    const auto& input_properties = input_properties_map.at(node->name());
-    const auto& output_properties = output_properties_map.at(node->name());
+  for (const NodeDef& node : graph.node()) {
+    const auto& input_properties = input_properties_map.at(node.name());
+    const auto& output_properties = output_properties_map.at(node.name());
 
     bool has_unknown_inputs = false;
-    for (int i = 0; i < node->num_inputs(); ++i) {
-      if (HasAnyUnknownDimensions(input_properties[i].shape())) {
+    for (const auto& input_prop : input_properties) {
+      if (HasAnyUnknownDimensions(input_prop.shape())) {
         has_unknown_inputs = true;
         break;
       }
@@ -313,26 +315,24 @@ void VerboseLogUnknownDimensionSources(
       continue;
     }
 
-    for (int i = 0; i < node->num_outputs(); ++i) {
-      if (HasAnyUnknownDimensions(output_properties[i].shape())) {
+    for (const auto& output_prop : output_properties) {
+      if (HasAnyUnknownDimensions(output_prop.shape())) {
         string inputs = "input_shapes=[";
-        for (int i = 0; i < node->num_inputs(); ++i) {
-          inputs +=
-              PartialTensorShape::DebugString(input_properties[i].shape());
+        for (const auto& input_prop : input_properties) {
+          inputs += PartialTensorShape::DebugString(input_prop.shape());
         }
         inputs += "]";
 
         string outputs = "output_shapes=[";
-        for (int i = 0; i < node->num_outputs(); ++i) {
-          outputs +=
-              PartialTensorShape::DebugString(output_properties[i].shape());
+        for (const auto& output_prop : output_properties) {
+          outputs += PartialTensorShape::DebugString(output_prop.shape());
         }
         outputs += "]";
 
-        VLOG(2) << "Node: " << node->name() << ", Op: " << node->def().op()
-                << ", " << inputs << ", " << outputs;
+        VLOG(2) << "Node: " << node.name() << ", Op: " << node.op() << ", "
+                << inputs << ", " << outputs;
 
-        op_to_count[node->def().op()]++;
+        op_to_count[node.op()]++;
 
         // don't log again for this node
         break;
@@ -355,11 +355,13 @@ void VerboseLogUnknownDimensionSources(
 // information is refined.
 class TopoQueue {
  public:
-  void push(const Node* n) { queue_.insert(n); }
-  const Node* pop() {
+  explicit TopoQueue(const std::unordered_map<const NodeDef*, int>& topo_order)
+      : queue_(CompareNodes(topo_order)) {}
+  void push(const NodeDef* n) { queue_.insert(n); }
+  const NodeDef* pop() {
     CHECK(!empty());
     auto it = queue_.begin();
-    const Node* n = *it;
+    const NodeDef* n = *it;
     queue_.erase(it);
     return n;
   }
@@ -371,11 +373,17 @@ class TopoQueue {
   // Graph nodes are created in (roughly) topological order. Therefore we can
   // use their id to ensure they're sorted topologically.
   struct CompareNodes {
-    bool operator()(const Node* lhs, const Node* rhs) const {
-      return lhs->id() < rhs->id();
+    explicit CompareNodes(
+        const std::unordered_map<const NodeDef*, int>& topo_ordering)
+        : topo_order(topo_ordering) {}
+    bool operator()(const NodeDef* lhs, const NodeDef* rhs) const {
+      return topo_order.at(lhs) < topo_order.at(rhs);
     }
+
+   private:
+    const std::unordered_map<const NodeDef*, int>& topo_order;
   };
-  std::set<const Node*, CompareNodes> queue_;
+  std::set<const NodeDef*, CompareNodes> queue_;
 };
 
 // Merge and relax symbolic shapes.
@@ -385,16 +393,146 @@ class TopoQueue {
 // unknown shape/dimension of a given node.
 class SymbolicShapeRefiner {
  public:
-  explicit SymbolicShapeRefiner(ShapeRefiner* shape_refiner)
-      : shape_refiner_(shape_refiner) {}
+  explicit SymbolicShapeRefiner(
+      const GraphView& graph,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports)
+      : graph_(graph),
+        function_library_(OpRegistry::Global(), graph.GetGraph()->library()),
+        fed_ports_(fed_ports) {
+    graph_def_version_ = graph.GetGraph()->versions().producer();
+    node_to_context_.reserve(graph.GetGraph()->node_size());
+  }
+
+  const GraphView& graph() const { return graph_; }
 
-  InferenceContext* GetContext(const Node* node) {
-    return shape_refiner_->GetContext(node);
+  struct NodeContext {
+    const OpRegistrationData* op_data;
+    DataTypeVector input_types;
+    DataTypeVector output_types;
+    std::unique_ptr<InferenceContext> inference_context;
+    std::vector<ShapeHandle> output_tensors_as_shapes;
+  };
+
+  NodeContext* GetNodeContext(const NodeDef* node) {
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return &it->second;
   }
-  Status UpdateNode(const Node* node, bool relax, bool* refined) {
-    return shape_refiner_->UpdateNode(node, relax, refined);
+
+  InferenceContext* GetContext(const NodeDef* node) {
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return it->second.inference_context.get();
   }
-  Status SetUnknownShape(const Node* node, int output_port) {
+  Status UpdateNode(const NodeDef* node, bool relax, bool* refined) {
+    NodeContext* node_context = GetNodeContext(node);
+    if (node_context == nullptr) {
+      TF_RETURN_IF_ERROR(AddNode(node));
+      node_context = CHECK_NOTNULL(GetNodeContext(node));
+      *refined = true;
+    }
+    // Check if the shapes of the nodes in the fan-in of this node have changed,
+    // and if they have, update the node input shapes.
+    InferenceContext* inference_context = node_context->inference_context.get();
+    std::vector<Tensor> const_values(inference_context->num_inputs());
+    std::vector<const Tensor*> input_tensors(inference_context->num_inputs(),
+                                             nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes(
+        inference_context->num_inputs());
+
+    for (int dst_input = 0; dst_input < inference_context->num_inputs();
+         ++dst_input) {
+      GraphView::InputPort port(node, dst_input);
+      for (const GraphView::OutputPort fanin : graph_.GetFanin(port)) {
+        int src_output = fanin.port_id;
+        const NodeDef* input = fanin.node;
+        NodeContext* c = GetNodeContext(input);
+        if (c == nullptr) {
+          return errors::FailedPrecondition(
+              "Input ", dst_input, " ('", input->name(), "') for '",
+              node->name(), "' was not previously added to ShapeRefiner.");
+        }
+
+        if (IsConstant(*input)) {
+          // Convert constant value into tensors.
+          if (const_values[dst_input].FromProto(
+                  input->attr().at("value").tensor())) {
+            input_tensors[dst_input] = &const_values[dst_input];
+            // Integer tensors of rank one can also be interpreted as a shape
+            // provided all their values are >= -1.
+            if (const_values[dst_input].dims() == 1 &&
+                (const_values[dst_input].dtype() == DT_INT32 ||
+                 const_values[dst_input].dtype() == DT_INT64)) {
+              ShapeHandle tensor_shape = inference_context->Vector(
+                  const_values[dst_input].NumElements());
+              ShapeHandle shp;
+              if (inference_context
+                      ->MakeShapeFromTensor(input_tensors[dst_input],
+                                            tensor_shape, &shp)
+                      .ok()) {
+                input_tensors_as_shapes[dst_input] = shp;
+              }
+            }
+          }
+        }
+
+        if (c->output_tensors_as_shapes.size() > src_output) {
+          input_tensors_as_shapes[dst_input] =
+              c->output_tensors_as_shapes[src_output];
+        }
+
+        DCHECK_GE(dst_input, 0);
+        if (!*refined && !inference_context->input(dst_input).SameHandle(
+                             c->inference_context->output(src_output))) {
+          *refined = true;
+        }
+        inference_context->SetInput(dst_input,
+                                    c->inference_context->output(src_output));
+
+        if (!*refined &&
+            inference_context->requested_input_tensor_as_partial_shape(
+                dst_input)) {
+          // The input value may have changed. Since we have no way to know if
+          // that's indeed the case, err on the safe side.
+          *refined = true;
+        }
+
+        // Also propagate handle shape and dtype of edges which are carrying
+        // resource handles.
+        if (node_context->input_types[dst_input] == DT_RESOURCE) {
+          auto* outputs =
+              c->inference_context->output_handle_shapes_and_types(src_output);
+          if (!outputs) continue;
+          auto* inputs =
+              inference_context->input_handle_shapes_and_types(dst_input);
+
+          if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
+            *refined = true;
+          }
+          inference_context->set_input_handle_shapes_and_types(dst_input,
+                                                               *outputs);
+        }
+      }
+    }
+
+    if (!*refined) {
+      // No input shape has changed, we're done
+      return Status::OK();
+    }
+
+    node_context->inference_context->set_input_tensors(input_tensors);
+    node_context->inference_context->set_input_tensors_as_shapes(
+        input_tensors_as_shapes);
+
+    // Update the shapes of the outputs.
+    return InferShapes(*node, node_context);
+  }
+
+  Status SetUnknownShape(const NodeDef* node, int output_port) {
     shape_inference::ShapeHandle shape =
         GetUnknownOutputShape(node, output_port);
     InferenceContext* ctx = GetContext(node);
@@ -406,7 +544,7 @@ class SymbolicShapeRefiner {
   }
 
   struct ShapeId {
-    const Node* node;
+    const NodeDef* node;
     int port_id;
     bool operator==(const ShapeId& other) const {
       return node == other.node && port_id == other.port_id;
@@ -414,12 +552,12 @@ class SymbolicShapeRefiner {
   };
   struct HashShapeId {
     std::size_t operator()(const ShapeId& shp) const {
-      return std::hash<const Node*>{}(shp.node) + shp.port_id;
+      return std::hash<const NodeDef*>{}(shp.node) + shp.port_id;
     }
   };
 
   struct DimId {
-    const Node* node;
+    const NodeDef* node;
     int port_id;
     int dim_index;
     bool operator==(const DimId& other) const {
@@ -430,18 +568,19 @@ class SymbolicShapeRefiner {
 
   struct HashDimId {
     std::size_t operator()(const DimId& dim) const {
-      return std::hash<const Node*>{}(dim.node) + dim.port_id + dim.dim_index;
+      return std::hash<const NodeDef*>{}(dim.node) + dim.port_id +
+             dim.dim_index;
     }
   };
 
   // Compute the shape of the tensors outputed by node 'node' at output port
   // 'port_index' as the intersection of shape1 and shape2.
-  ShapeHandle OutputAsIntersection(const Node* node, int port_index,
+  ShapeHandle OutputAsIntersection(const NodeDef* node, int port_index,
                                    ShapeHandle shape1, ShapeHandle shape2) {
     if (shape1.SameHandle(shape2)) {
       return shape1;
     }
-    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    InferenceContext* ctx = GetContext(node);
     ShapeHandle merged = shape1;
     if (!ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
       // Return either one since they're expected to represent the same value.
@@ -481,12 +620,12 @@ class SymbolicShapeRefiner {
 
   // Compute the shape of the tensors outputed by node 'node' at output port
   // 'port_index' as the union of shape1 and shape2.
-  ShapeHandle OutputAsUnion(const Node* node, int port_index,
+  ShapeHandle OutputAsUnion(const NodeDef* node, int port_index,
                             ShapeHandle shape1, ShapeHandle shape2) {
     if (shape1.SameHandle(shape2)) {
       return shape1;
     }
-    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    InferenceContext* ctx = GetContext(node);
     ShapeHandle relaxed = shape1;
     const int rank = ctx->Rank(shape1);
     if (!ctx->RankKnown(shape2) || ctx->Rank(shape2) != rank) {
@@ -551,38 +690,149 @@ class SymbolicShapeRefiner {
     return true;
   }
 
+  Status AddNode(const NodeDef* node) {
+    NodeContext& node_ctx = node_to_context_[node];
+    TF_RETURN_IF_ERROR(function_library_.LookUp(node->op(), &node_ctx.op_data));
+
+    TF_RETURN_IF_ERROR(InOutTypesForNode(*node, node_ctx.op_data->op_def,
+                                         &node_ctx.input_types,
+                                         &node_ctx.output_types));
+
+    // Create the inference context for this node.
+    const int num_inputs = node_ctx.input_types.size();
+    std::vector<ShapeHandle> input_shapes(num_inputs);
+    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
+        input_handle_shapes_and_types(num_inputs);
+    std::vector<const Tensor*> input_tensors(num_inputs, nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes;
+
+    node_ctx.inference_context.reset(new InferenceContext(
+        graph_def_version_, node, node_ctx.op_data->op_def, input_shapes,
+        input_tensors, input_tensors_as_shapes,
+        std::move(input_handle_shapes_and_types)));
+    const Status s = node_ctx.inference_context->construction_status();
+    if (!s.ok()) {
+      node_ctx.inference_context.reset(nullptr);
+    }
+    return s;
+  }
+
  private:
   // Return the one ShapeHandle used to denote a fully unknown shape for a node
   // output.
-  ShapeHandle GetUnknownOutputShape(const Node* node, int index) {
+  ShapeHandle GetUnknownOutputShape(const NodeDef* node, int index) {
     ShapeId id{node, index};
     auto it = unknown_shapes_.find(id);
     if (it != unknown_shapes_.end()) {
       return it->second;
     }
-    InferenceContext* c = shape_refiner_->GetContext(node);
+    InferenceContext* c = GetContext(node);
     ShapeHandle shp = c->UnknownShape();
     unknown_shapes_[id] = shp;
     return shp;
   }
   // Return the one ShapeHandle used to denote a fully unknown dimension for a
   // node output.
-  DimensionHandle GetUnknownOutputDim(const Node* node, int index, int dim_id) {
+  DimensionHandle GetUnknownOutputDim(const NodeDef* node, int index,
+                                      int dim_id) {
     DimId id{node, index, dim_id};
     auto it = unknown_dims_.find(id);
     if (it != unknown_dims_.end()) {
       return it->second;
     }
-    InferenceContext* c = shape_refiner_->GetContext(node);
+    InferenceContext* c = GetContext(node);
     DimensionHandle dim = c->UnknownDim();
     unknown_dims_[id] = dim;
     return dim;
   }
 
-  ShapeRefiner* shape_refiner_;
+  Status InferShapes(const NodeDef& node, NodeContext* c) {
+    InferenceContext* ic = c->inference_context.get();
+
+    auto it = fed_ports_.find(node.name());
+    const bool is_fed = it != fed_ports_.end();
+
+    // Propagate shape tensors unless the node is fed.
+    // TODO(bsteiner) We should still propagate the shapes to the ports that
+    // aren't fed in the case of a ShapeN node.
+    if (!is_fed) {
+      if (IsShape(node)) {
+        c->output_tensors_as_shapes.resize(1);
+        c->output_tensors_as_shapes[0] = c->inference_context->input(0);
+      } else if (IsShapeN(node)) {
+        c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
+        for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
+          c->output_tensors_as_shapes[i] = c->inference_context->input(i);
+        }
+      } else if (node.op() == "ConcatV2") {
+        bool valid = true;
+        ShapeHandle result;
+        for (int i = 0; i < ic->num_inputs() - 1; ++i) {
+          ShapeHandle input = ic->input_tensors_as_shapes()[i];
+          if (!ic->RankKnown(input)) {
+            valid = false;
+            break;
+          } else if (i == 0) {
+            result = input;
+          } else {
+            TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
+          }
+        }
+        if (valid) {
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
+      } else if (IsSlice(node)) {
+        ShapeHandle input = ic->input_tensors_as_shapes()[0];
+        bool valid = ic->RankKnown(input);
+        const Tensor* slice_offset = ic->input_tensor(1);
+        valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
+        const Tensor* slice_size = ic->input_tensor(2);
+        valid &= slice_size != nullptr && slice_size->NumElements() == 1;
+        if (valid) {
+          int64 start = slice_offset->dtype() == DT_INT32
+                            ? slice_offset->flat<int32>()(0)
+                            : slice_offset->flat<int64>()(0);
+          int64 end = start + (slice_size->dtype() == DT_INT32
+                                   ? slice_size->flat<int32>()(0)
+                                   : slice_size->flat<int64>()(0));
+          ShapeHandle result;
+          TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
+      }
+    }
+
+    // Infer the shapes of output tensors.
+    if (!c->op_data || c->op_data->shape_inference_fn == nullptr) {
+      // There is nothing more we can infer, annotate outputs with unknown
+      // shapes
+      return c->inference_context->Run(shape_inference::UnknownShape);
+    }
+
+    TF_RETURN_IF_ERROR(
+        c->inference_context->Run(c->op_data->shape_inference_fn));
 
+    Status status = Status::OK();
+    if (is_fed) {
+      // It is possible to feed node output ports with tensors of any shape: as
+      // a result, the shape of a fed port is completely unknown.
+      for (const int output_port : it->second) {
+        status.Update(SetUnknownShape(&node, output_port));
+      }
+    }
+    return status;
+  }
+
+ private:
+  const GraphView& graph_;
+  int graph_def_version_;
+  std::unordered_map<const NodeDef*, NodeContext> node_to_context_;
   std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
   std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
+  FunctionLibraryDefinition function_library_;
+  const std::unordered_map<string, std::unordered_set<int>>& fed_ports_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -590,7 +840,7 @@ class SymbolicShapeRefiner {
 // dims, and consolidate the information globally.
 class SymbolicShapeManager {
  public:
-  SymbolicShapeManager() : shapes_(shape_processor_), dims_(dim_processor_) {}
+  SymbolicShapeManager() {}
 
   Status Merge(ShapeHandle s1, ShapeHandle s2) {
     if (!s1.IsSet() || !s2.IsSet()) {
@@ -630,14 +880,12 @@ class SymbolicShapeManager {
   }
 
  private:
-  Processor<ShapeHandle> shape_processor_;
   DisjointSet<shape_inference::ShapeHandle> shapes_;
-  Processor<DimensionHandle> dim_processor_;
   DisjointSet<shape_inference::DimensionHandle> dims_;
 };
 
 Status GraphProperties::MergeEnqueueShapesAndTypes(
-    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
     const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
@@ -660,7 +908,7 @@ Status GraphProperties::MergeEnqueueShapesAndTypes(
 }
 
 Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
-    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
     const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
@@ -688,35 +936,41 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
 // inputs are UnknownShapes. So we need to ignore the input from NextIteration
 // nodes to propagate any known shape from the Merge node.
 Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                                        const Node* node, bool relax,
-                                        TopoQueue* new_shapes) {
+                                        const NodeDef* node, bool relax,
+                                        bool* new_shapes) const {
   InferenceContext* c = shape_refiner->GetContext(node);
-  CHECK_NE(c, nullptr);
-
-  ShapeHandle out1;
-  TF_RETURN_IF_ERROR(c->WithRank(c->output(1), 0, &out1));
-  c->set_output(1, out1);
+  if (!c) {
+    // Now we can run shape inference
+    TF_RETURN_IF_ERROR(shape_refiner->AddNode(node));
+    c = CHECK_NOTNULL(shape_refiner->GetContext(node));
+    *new_shapes = true;
+
+    // Infer the shape of the second output once and for all since it never
+    // changes.
+    ShapeHandle out1 = c->Scalar();
+    c->set_output(1, out1);
+  }
 
   ShapeHandle out;
   bool out_initialized = false;
-  for (const Edge* e : node->in_edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
+  for (const GraphView::Edge fanin :
+       shape_refiner->graph().GetFaninEdges(*node, false)) {
     // Skip back edges during the initial propagation phase. This is equivalent
     // to assuming that all the inputs to the merge nodes are fed by the same
     // shape, and will be corrected as needed in the relaxation phase.
-    if (!relax && e->src()->IsNextIteration()) {
+    if (!relax && IsNextIteration(*fanin.src.node)) {
       continue;
     }
 
-    InferenceContext* in = shape_refiner->GetContext(e->src());
-    ShapeHandle input = in->output(e->src_output());
-    if (relax) {
-      c->RelaxInput(e->dst_input(), input);
-    } else {
-      c->MergeInput(e->dst_input(), input);
+    InferenceContext* in = shape_refiner->GetContext(fanin.src.node);
+    if (!relax && !in) {
+      // Handling a loop for the first time, the back edge won't have any shape
+      // info.
+      continue;
     }
+    ShapeHandle input = in->output(fanin.src.port_id);
+    CHECK_EQ(fanin.tgt.node, node);
+    c->SetInput(fanin.tgt.port_id, input);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
@@ -729,67 +983,46 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
     }
   }
 
-  if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
+  if (*new_shapes || !shape_refiner->EquivalentShapes(out, c->output(0))) {
     c->set_output(0, out);
-    new_shapes->push(node);
+    *new_shapes = true;
   }
 
   return Status::OK();
 }
 
-Status GraphProperties::OverwriteFedPorts(
-    SymbolicShapeRefiner* shape_refiner,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-    const Node* node, TopoQueue* new_shapes) const {
-  auto it = fed_ports.find(node->name());
-  Status status;
-  if (it != fed_ports.end()) {
-    // It is possible to feed node output ports with tensors of any shape: as a
-    // result, the shape of a fed port is completely unknown.
-    for (const int output_port : it->second) {
-      status.Update(shape_refiner->SetUnknownShape(node, output_port));
-    }
-    new_shapes->push(node);
-  }
-  return status;
-}
-
 // Manually propagate the input shape for Enter nodes and update any Merge node
 // outputs.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
-                                    const Node* node, bool relax,
-                                    TopoQueue* new_shapes) {
+                                    const NodeDef* node, bool relax,
+                                    bool* new_shapes) {
   auto enter_ctx = shape_refiner->GetContext(node);
-  CHECK_NE(enter_ctx, nullptr);
+  if (!enter_ctx) {
+    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, relax, new_shapes));
+    enter_ctx = shape_refiner->GetContext(node);
+  }
 
-  for (const Edge* e : node->in_edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-    InferenceContext* in = shape_refiner->GetContext(e->src());
-    ShapeHandle input = in->output(e->src_output());
-    if (!enter_ctx->output(0).SameHandle(input)) {
-      if (relax) {
-        enter_ctx->RelaxInput(0, input);
-      } else {
-        enter_ctx->MergeInput(0, input);
-      }
-      enter_ctx->set_output(0, input);
-      new_shapes->push(node);
-    }
+  GraphView::InputPort inp(node, 0);
+  GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(inp);
+
+  InferenceContext* in = shape_refiner->GetContext(fanin.node);
+  ShapeHandle input = in->output(fanin.port_id);
+  if (!enter_ctx->output(0).SameHandle(input)) {
+    enter_ctx->SetInput(0, input);
+    enter_ctx->set_output(0, input);
+    *new_shapes = true;
   }
   return Status::OK();
 }
 
-Status GraphProperties::UpdateShapes(
-    SymbolicShapeRefiner* shape_refiner, bool relax,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-    const Node* n, TopoQueue* new_shapes) const {
-  if (n->IsEnter()) {
+Status GraphProperties::UpdateShapes(SymbolicShapeRefiner* shape_refiner,
+                                     bool relax, const NodeDef* n,
+                                     bool* new_shapes) const {
+  if (IsEnter(*n)) {
     // The Enter shape function always forwards an UnknownShape, so do the right
     // thing here.
     TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, relax, new_shapes));
-  } else if (n->IsMerge()) {
+  } else if (IsMerge(*n)) {
     // Properly handle merge nodes.
     TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, relax, new_shapes));
   } else {
@@ -799,22 +1032,19 @@ Status GraphProperties::UpdateShapes(
     if (updated) {
       // We want to avoid propagating through loops on the merge pass because
       // the shapes are not guaranteed to converge.
-      if (relax || !n->IsNextIteration()) {
-        new_shapes->push(n);
+      if (relax || !IsNextIteration(*n)) {
+        *new_shapes = true;
       }
     }
   }
-  // Nodes can be fed with any shape. The TensorFlow shape inference code can't
-  // handle this properly, so overwrite its behavior here.
-  return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes);
+  return Status::OK();
 }
 
 // Propagates the shapes in the transitive fan-out of <new_shapes>.
 Status GraphProperties::PropagateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
-    const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
-        resources,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const std::unordered_map<const NodeDef*,
+                             std::unordered_set<const NodeDef*>>& resources,
     int num_loops) const {
   // Limit the number of iterations to prevent infinite loops in the presence of
   // incorrect shape functions. The algoritm should converge in at most
@@ -836,12 +1066,13 @@ Status GraphProperties::PropagateShapes(
     int64 num_loop_iterations = 0;
     while (!new_shapes->empty() &&
            num_loop_iterations++ < max_loop_iterations) {
-      const Node* n = new_shapes->pop();
-      for (const Edge* e : n->out_edges()) {
-        if (!e->IsControlEdge()) {
-          const Node* fanout = e->dst();
-          TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, fed_ports,
-                                          fanout, new_shapes));
+      const NodeDef* n = new_shapes->pop();
+      bool updated = false;
+      TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, n, &updated));
+      if (updated) {
+        for (const GraphView::InputPort fanout :
+             shape_refiner->graph().GetFanouts(*n, false)) {
+          new_shapes->push(fanout.node);
         }
       }
     }
@@ -851,7 +1082,7 @@ Status GraphProperties::PropagateShapes(
       // fanout of the queues, we need to manually propagate the shapes from
       // enqueue node to the corresponding queue.
       TF_RETURN_IF_ERROR(UpdateResource(resource.first, resource.second,
-                                        shape_refiner, relax, new_shapes));
+                                        shape_refiner, new_shapes));
     }
   } while (!new_shapes->empty() &&
            num_resource_iterations++ < max_resource_iterations);
@@ -864,10 +1095,11 @@ Status GraphProperties::PropagateShapes(
 }
 
 Status GraphProperties::UpdateResource(
-    const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
-    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes) {
+    const NodeDef* qnode,
+    const std::unordered_set<const NodeDef*>& queue_inputs,
+    SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes) {
   // Proceed only if qnode is a queue or an Enter with queue input.
-  if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
+  if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode, shape_refiner->graph())) {
     return Status::OK();
   }
   auto qctx = shape_refiner->GetContext(qnode);
@@ -879,31 +1111,24 @@ Status GraphProperties::UpdateResource(
   // Merge all inputs into the enqueue node, regardless of which phase we
   // are in.
   std::vector<ShapeAndType> queue_shapes_and_types;
-  if (queue_handle_data) {
-    queue_shapes_and_types = *queue_handle_data;
-  }
   for (const auto& node : queue_inputs) {
-    auto ctx = shape_refiner->GetContext(node);
+    auto ctx = shape_refiner->GetNodeContext(node);
     if (!ctx) {
       continue;
     }
     // TODO(bsteiner): handle EnqueueMany as well.
-    if (node->type_string().find("Enqueue") != std::string::npos &&
-        node->type_string().find("EnqueueMany") == std::string::npos) {
+    if (node->op().find("Enqueue") != std::string::npos &&
+        node->op().find("EnqueueMany") == std::string::npos) {
       std::vector<ShapeAndType> shapes_and_types;
-      for (int i = 1; i < ctx->num_inputs(); ++i) {
-        shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
+      for (int i = 1; i < ctx->input_types.size(); ++i) {
+        shapes_and_types.push_back(
+            {ctx->inference_context->input(i), ctx->input_types[i]});
       }
       if (queue_shapes_and_types.empty()) {
         queue_shapes_and_types = shapes_and_types;
       } else {
-        if (relax) {
-          TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
-              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
-        } else {
-          TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
-        }
+        TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
+            shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
       }
     }
   }
@@ -913,29 +1138,18 @@ Status GraphProperties::UpdateResource(
                                                queue_shapes_and_types)) {
     qctx->set_output_handle_shapes_and_types(0, queue_shapes_and_types);
 
-    new_shapes->push(qnode);
+    for (const GraphView::InputPort fanout :
+         shape_refiner->graph().GetFanouts(*qnode, false)) {
+      new_shapes->push(fanout.node);
+    }
   }
 
   return Status::OK();
 }
 
 Status GraphProperties::InferStatically(bool assume_valid_feeds) {
-  Graph graph(OpRegistry::Global());
-  FunctionLibraryDefinition function_library(graph.op_registry(),
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
-  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
-  shape_refiner.set_require_shape_inference_fns(false);
-  shape_refiner.set_disable_constant_propagation(true);
-  shape_refiner.set_function_library_for_shape_inference(&function_library);
-  ImportGraphDefOptions options;
-  // Graph optimization happens at the late stage of graph execution,
-  // when colocation constraints are already validated previously and
-  // the device placement of nodes has also completed, so there
-  // is no need to validate colocation constraints again.
-  options.validate_colocation_constraints = false;
-  Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
-  TF_RETURN_IF_ERROR(s);
-
   std::unordered_map<string, std::unordered_set<int>> fed_ports;
   if (!assume_valid_feeds) {
     for (const auto& feed : item_.feed) {
@@ -945,34 +1159,48 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  // List the resources and the nodes using them. Also collect the Enter and
-  // Merge nodes.
-  std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
-  std::unordered_set<const Node*> enter_nodes;
-  std::unordered_set<const Node*> merge_nodes;
-  std::unordered_set<const Node*> fed_nodes;
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order));
+
+  GraphView graph_view(&item_.graph);
+
+  // List the resources and the nodes using them. Also collect the Merge nodes,
+  // fed nodes, and primary inputs.
+  std::unordered_map<const NodeDef*, std::unordered_set<const NodeDef*>>
+      resources;
+  std::unordered_set<const NodeDef*> merge_nodes;
+  std::unordered_set<const NodeDef*> fed_nodes;
+  std::unordered_set<const NodeDef*> primary_inputs;
   int num_loops = 0;
-  for (const Node* const node : graph.nodes()) {
-    for (int i = 0; i < node->num_inputs(); ++i) {
-      if (node->input_type(i) == DataType::DT_RESOURCE) {
-        const Node* resource;
-        TF_CHECK_OK(node->input_node(i, &resource));
-        resources[resource].insert(node);
-      }
-    }
-    if (node->IsEnter()) {
-      enter_nodes.insert(node);
-    } else if (node->IsMerge()) {
-      merge_nodes.insert(node);
-    } else if (node->IsNextIteration()) {
+  for (const NodeDef& node : item_.graph.node()) {
+    if (NumNonControlInputs(node) == 0) {
+      primary_inputs.insert(&node);
+    } else if (IsMerge(node)) {
+      merge_nodes.insert(&node);
+    } else if (IsNextIteration(node)) {
       ++num_loops;
+    } else {
+      const OpRegistrationData* op_data;
+      TF_RETURN_IF_ERROR(function_library.LookUp(node.op(), &op_data));
+      DataTypeVector input_types;
+      DataTypeVector output_types;
+      TF_RETURN_IF_ERROR(InOutTypesForNode(node, op_data->op_def, &input_types,
+                                           &output_types));
+      for (int i = 0; i < input_types.size(); ++i) {
+        if (input_types[i] == DataType::DT_RESOURCE) {
+          GraphView::InputPort input(&node, i);
+          const GraphView::OutputPort resource =
+              graph_view.GetRegularFanin(input);
+          resources[resource.node].insert(&node);
+        }
+      }
     }
-    if (fed_ports.find(node->name()) != fed_ports.end()) {
-      fed_nodes.insert(node);
+    if (fed_ports.find(node.name()) != fed_ports.end()) {
+      fed_nodes.insert(&node);
     }
   }
 
-  SymbolicShapeRefiner refiner(&shape_refiner);
+  SymbolicShapeRefiner refiner(graph_view, fed_ports);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
   // exclusively merge shapes but we do not propagate shapes through the
@@ -980,38 +1208,37 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // we exclusively relax shapes and propagate shapes through loops until
   // reaching fixed point.
   for (int relax = 0; relax < 2; relax++) {
-    TopoQueue new_shapes;
-    // Force the propagation of shapes of Enter nodes manually (the Enter shape
-    // function always forwards an UnknownShape).
-    for (const Node* node : enter_nodes) {
-      TF_RETURN_IF_ERROR(
-          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
-    }
+    TopoQueue new_shapes(topo_order);
     // Seed the propagation of shapes through merge nodes.
-    for (const Node* node : merge_nodes) {
-      TF_RETURN_IF_ERROR(
-          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
+    if (relax) {
+      for (const NodeDef* node : merge_nodes) {
+        new_shapes.push(node);
+      }
+    }
+    // Also seed the propagation of shapes in the fanout of primary inputs.
+    for (const NodeDef* node : primary_inputs) {
+      new_shapes.push(node);
     }
     // Also seed the propagation of shapes in the fanout of fed nodes.
-    for (const Node* node : fed_nodes) {
-      TF_RETURN_IF_ERROR(
-          OverwriteFedPorts(&refiner, fed_ports, node, &new_shapes));
+    for (const NodeDef* node : fed_nodes) {
+      new_shapes.push(node);
     }
     // Propagate shapes normally.
-    TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
-                                       fed_ports, num_loops));
+    TF_RETURN_IF_ERROR(
+        PropagateShapes(&refiner, relax, &new_shapes, resources, num_loops));
   }
 
   // Track shapes globally across the graph.
   SymbolicShapeManager shape_manager;
   bool found_error = false;
-  for (const Node* const node : graph.nodes()) {
-    auto node_ctx = shape_refiner.GetContext(node);
+  for (const NodeDef& node : item_.graph.node()) {
+    auto node_ctx = refiner.GetContext(&node);
     if (!node_ctx) {
       continue;
     }
     // Skip any information that comes from fed nodes.
-    if (fed_ports.find(node->name()) != fed_ports.end()) {
+    if (fed_ports.find(node.name()) != fed_ports.end()) {
+      VLOG(2) << "Skipping feed node shape: " << node.name();
       continue;
     }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
@@ -1035,61 +1262,56 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  for (const Node* const node : graph.nodes()) {
-    VLOG(3) << "Filling in graph properties for node: " << node->name();
-    auto ctx = shape_refiner.GetContext(node);
+  for (const NodeDef& node : item_.graph.node()) {
+    VLOG(3) << "Filling in graph properties for node: " << node.name();
+    auto ctx = refiner.GetNodeContext(&node);
     if (!ctx) {
       continue;
     }
 
     // Fill input properties.
     {
-      CHECK_EQ(ctx->num_inputs(), node->num_inputs());
-      auto& input_properties = input_properties_[node->name()];
+      // CHECK_EQ(ctx->num_inputs(), node.num_inputs());
+      auto& input_properties = input_properties_[node.name()];
 
       // Should always be empty, node names in graph are supposed to be unique.
       CHECK_EQ(input_properties.size(), 0);
 
-      input_properties.resize(ctx->num_inputs());
-      for (int i = 0; i < ctx->num_inputs(); ++i) {
-        shape_manager.AsTensorProperties(ctx->input(i), node->input_type(i),
+      input_properties.resize(ctx->inference_context->num_inputs());
+      GraphView::InputPort input(&node, -1);
+      for (int i = 0; i < ctx->inference_context->num_inputs(); ++i) {
+        shape_manager.AsTensorProperties(ctx->inference_context->input(i),
+                                         ctx->input_types[i],
                                          &input_properties[i]);
-      }
-      for (const auto& edge : node->in_edges()) {
-        if (edge->IsControlEdge()) {
-          continue;
-        }
-        if (!edge->src()->IsConstant()) {
+        input.port_id = i;
+        GraphView::OutputPort fanin = graph_view.GetRegularFanin(input);
+        if (!IsConstant(*fanin.node)) {
           continue;
         }
-        const int input_id = edge->dst_input();
-        if (input_id >= input_properties.size()) {
-          continue;
-        }
-        const NodeDef& node = edge->src()->def();
-        const TensorProto& raw_val = node.attr().at("value").tensor();
-        *input_properties[input_id].mutable_value() = raw_val;
+        const TensorProto& raw_val = fanin.node->attr().at("value").tensor();
+        *input_properties[i].mutable_value() = raw_val;
       }
     }
 
     // Fill output properties.
     {
-      CHECK_EQ(ctx->num_outputs(), node->num_outputs());
-      auto& output_properties = output_properties_[node->name()];
+      // CHECK_EQ(ctx->num_outputs(), node->num_outputs());
+      auto& output_properties = output_properties_[node.name()];
 
       // Should always be empty, node names in graph are supposed to be unique.
       CHECK_EQ(output_properties.size(), 0);
 
-      output_properties.resize(ctx->num_outputs());
-      for (int i = 0; i < ctx->num_outputs(); ++i) {
-        shape_manager.AsTensorProperties(ctx->output(i), node->output_type(i),
+      output_properties.resize(ctx->inference_context->num_outputs());
+      for (int i = 0; i < ctx->inference_context->num_outputs(); ++i) {
+        shape_manager.AsTensorProperties(ctx->inference_context->output(i),
+                                         ctx->output_types[i],
                                          &output_properties[i]);
       }
     }
   }
 
   // Help trace the unknown dimensions to their origins.
-  VerboseLogUnknownDimensionSources(graph, input_properties_,
+  VerboseLogUnknownDimensionSources(item_.graph, input_properties_,
                                     output_properties_);
 
   return Status::OK();
@@ -1182,5 +1404,12 @@ GraphProperties::GetOutputProperties(const string& node_name) const {
   return missing_properties_;
 }
 
+void GraphProperties::ClearInputProperties(const string& node_name) {
+  input_properties_.erase(node_name);
+}
+void GraphProperties::ClearOutputProperties(const string& node_name) {
+  output_properties_.erase(node_name);
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 6fc53a7f2e7da7bae7b6f49c7b32291c981fef53..7d685b58337213d0d3d6b43ebc22d9d6c9147b44 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_
-#define TENSORFLOW_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_
 
 #include <unordered_map>
 #include <vector>
@@ -24,14 +24,18 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 
 namespace tensorflow {
+
 namespace grappler {
 
 class SymbolicShapeRefiner;
 class TopoQueue;
 
-// A TensorFlow model to optimize.
-// Models are represented by the combination of a graph, one of more fetch
-// nodes, and potentially a set of nodes to feed.
+// Infer OpInfo::TensorProperties for graph nodes inputs/outputs.
+//
+// Typical use case, is to infer tensor properties from a graph, before doing
+// optimization pass. Nodes modified during optimization pass have to be
+// invalidated, to prevent further incorrect optimizations based on wrong shape
+// and data type properties.
 class GraphProperties {
  public:
   explicit GraphProperties(const GrapplerItem& item) : item_(item) {}
@@ -64,60 +68,51 @@ class GraphProperties {
       const string& node_name) const;
   const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
       const string& node_name) const;
-
-  static void FillTensorPropertiesFromContext(
-      const shape_inference::ShapeHandle&, const DataType&,
-      shape_inference::InferenceContext*,
-      std::unordered_map<const shape_inference::Dimension*, int>* dim_ids,
-      OpInfo::TensorProperties*);
+  // Invalidate input/output properties for nodes modified during graph
+  // optimization pass, to prevent potential optimizations, based on incorrect
+  // shape information.
+  void ClearInputProperties(const string& node_name);
+  void ClearOutputProperties(const string& node_name);
 
  private:
   // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
   static Status MergeEnqueueShapesAndTypes(
-      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+      SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
   // Relaxes shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
   static Status RelaxEnqueueShapesAndMergeTypes(
-      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+      SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
 
   // Update the shapes for qnode. If output shapes of qnode have changed,
   // enqueue its fanout in 'new_shapes'.
   static Status UpdateResource(
-      const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
-      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes);
+      const NodeDef* qnode,
+      const std::unordered_set<const NodeDef*>& queue_inputs,
+      SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes);
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.
-  static Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                                const Node* node, bool relax,
-                                TopoQueue* new_shapes);
+  Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
+                         const NodeDef* node, bool relax,
+                         bool* new_shapes) const;
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
-                            const Node* node, bool relax,
-                            TopoQueue* new_shapes);
-  // Process a node that is used to feed the model.
-  Status OverwriteFedPorts(
-      SymbolicShapeRefiner* shape_refiner,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-      const Node* node, TopoQueue* new_shapes) const;
+                            const NodeDef* node, bool relax, bool* new_shapes);
   // Update the shapes for node 'n'. If output shapes for n have changed,
   // enqueue its fanout in 'new_shapes'.
-  Status UpdateShapes(
-      SymbolicShapeRefiner* shape_refiner, bool relax,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-      const Node* n, TopoQueue* new_shapes) const;
+  Status UpdateShapes(SymbolicShapeRefiner* shape_refiner, bool relax,
+                      const NodeDef* n, bool* new_shapes) const;
   // Propagate the shapes for the nodes enqueued in new_shapes and their
   // transitive fanout until a fixed point is reached.
   Status PropagateShapes(
       SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
-      const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
-          resources,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const std::unordered_map<const NodeDef*,
+                               std::unordered_set<const NodeDef*>>& resources,
       int num_loops) const;
 
   // Data members
@@ -130,4 +125,4 @@ class GraphProperties {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 5012069118fbe0b3d90d2e99690b2988c45a2843..afe334dfa2fafd7b24ac076fde1d2c5532b6e8b8 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -113,6 +114,33 @@ TEST_F(GraphPropertiesTest, StaticProperties) {
   }
 }
 
+TEST_F(GraphPropertiesTest, ClearProperties) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  GraphProperties properties(item);
+  Status s = properties.InferStatically(true);
+  TF_CHECK_OK(s);
+
+  for (const auto& node : item.graph.node()) {
+    if (node.op() == "RandomStandardNormal") {
+      EXPECT_EQ(1, properties.GetInputProperties(node.name()).size());
+      const auto props = properties.GetOutputProperties(node.name());
+      properties.ClearOutputProperties(node.name());
+      const auto cleared_props = properties.GetOutputProperties(node.name());
+      EXPECT_TRUE(cleared_props.empty());
+    } else if (node.op() == "AddN") {
+      const auto in_props = properties.GetInputProperties(node.name());
+      EXPECT_EQ(1, in_props.size());
+      properties.ClearInputProperties(node.name());
+      const auto cleared_props = properties.GetInputProperties(node.name());
+      EXPECT_TRUE(cleared_props.empty());
+    }
+  }
+}
+
 TEST_F(GraphPropertiesTest, DynamicProperties) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
                                           cluster_->GetDeviceNames());
@@ -276,9 +304,9 @@ TEST_F(GraphPropertiesTest, Queues) {
       root.WithOpName("Queue5"),
       {DataType::DT_FLOAT, DataType::DT_DOUBLE, DataType::DT_FLOAT});
   Output rnd2 =
-      ops::RandomNormal(root.WithOpName("rnd"), {10}, DataType::DT_DOUBLE);
+      ops::RandomNormal(root.WithOpName("rnd2"), {10}, DataType::DT_DOUBLE);
   Output rnd3 =
-      ops::RandomNormal(root.WithOpName("rnd"), {1, 2, 3}, DataType::DT_FLOAT);
+      ops::RandomNormal(root.WithOpName("rnd3"), {1, 2, 3}, DataType::DT_FLOAT);
   auto enqueue5 =
       ops::QueueEnqueue(root.WithOpName("Enqueue5"), q5, {rnd, rnd2, rnd3});
   auto dequeue5 = ops::QueueDequeue(
@@ -728,25 +756,25 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
       z = MyAdd(x, y)
       z = MyAdd(x, z)
   */
-  // Check that the shape of the second MyAdd node propagates
-  // correctly.
+  // Check that the shape inference code infers what it can.
   GrapplerItem item;
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
-  const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
-  const OpInfo::TensorProperties& prop = props[0];
-  EXPECT_EQ(DT_FLOAT, prop.dtype());
-  EXPECT_FALSE(prop.shape().unknown_rank());
-  EXPECT_EQ(2, prop.shape().dim_size());
-  EXPECT_EQ(1, prop.shape().dim(0).size());
-  EXPECT_EQ(2, prop.shape().dim(1).size());
-
-  PartialTensorShape shape(prop.shape());
-  EXPECT_TRUE(shape.IsFullyDefined());
-  EXPECT_FALSE(shape.unknown_rank());
+  const auto out_props = properties.GetOutputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& out_prop = out_props[0];
+  EXPECT_EQ(DT_FLOAT, out_prop.dtype());
+  EXPECT_TRUE(out_prop.shape().unknown_rank());
+
+  const auto in_props = properties.GetInputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& in_prop = in_props[0];
+  EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+  EXPECT_FALSE(in_prop.shape().unknown_rank());
+  EXPECT_EQ(2, in_prop.shape().dim_size());
+  EXPECT_EQ(1, in_prop.shape().dim(0).size());
+  EXPECT_EQ(2, in_prop.shape().dim(1).size());
 }
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
@@ -928,6 +956,11 @@ TEST_F(GraphPropertiesTest, Performance) {
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
                                  "large_graph.pbtxt.html");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+  TF_CHECK_OK(AddDefaultAttrsToGraphDef(
+      &item.graph,
+      FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()), 0,
+      true));
+
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
 }
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index ea4320687af366ccdd82e46cf28adf4ee9c100c0..833205ac6f12a73d96c93455bb355ee511d6700a 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/core/framework/cost_graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -52,6 +53,8 @@ Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
 Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
                                             CostGraphDef* cost_graph,
                                             Costs* costs) const {
+  const bool running_simulation = (cluster_->type() == "virtual");
+
   std::vector<double> times(measurement_steps_);
   BlockingCounter barrier(measurement_steps_);
 
@@ -80,9 +83,23 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
 
     const Costs::MicroSeconds finish = Env::Default()->NowMicros();
-    const double time = (finish - start).count() * 1e3;
-    times[step] = time;
-
+    if (running_simulation) {
+      // When running simulation, return the estimated runtime, not the time it
+      // takes to run the simulation.
+      double time = 0.0;
+      for (const DeviceStepStats& stepstats :
+           metadata.step_stats().dev_stats()) {
+        for (const NodeExecStats& node_stats : stepstats.node_stats()) {
+          const double completion_time =
+              node_stats.all_end_rel_micros() + node_stats.all_start_micros();
+          time = std::max(time, completion_time * 1e3);
+        }
+      }
+      times[step] = time;
+    } else {
+      const double time = (finish - start).count() * 1e3;
+      times[step] = time;
+    }
     if (cost_graph && (step + 1 == measurement_steps_)) {
       metadata.mutable_cost_graph()->Swap(cost_graph);
     }
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
index 1b3edb4c27b325d03884624bf48d57fe09768df7..3e741c91997403e7eae438d2dd72c9a70da9316a 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
-#define TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
 
 #include <string>
 #include <utility>
@@ -73,4 +73,4 @@ class MeasuringCostEstimator : public CostEstimator {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index a57cfdd9891b1d654092f9b896af248fa40eb88f..b35873ce385974178967ef3700e2309cf90c1d10 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 
 namespace tensorflow {
@@ -29,10 +30,12 @@ constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
+constexpr char kFusedConv2dBiasActivation[] = "FusedConv2DBiasActivation";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
 constexpr char kIdentity[] = "Identity";
+constexpr char kIdentityN[] = "IdentityN";
 constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
 constexpr char kReshape[] = "Reshape";
@@ -46,6 +49,15 @@ constexpr char kShape[] = "Shape";
 constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
+constexpr char kGather[] = "Gather";
+constexpr char kGatherV2[] = "GatherV2";
+constexpr char kSlice[] = "Slice";
+constexpr char kMaxPool[] = "MaxPool";
+constexpr char kMaxPoolGrad[] = "MaxPoolGrad";
+constexpr char kAvgPool[] = "AvgPool";
+constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
+constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
+constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
 
 static const Costs::Duration kMinComputeTime(1);
 
@@ -67,14 +79,39 @@ Padding GetPadding(const OpInfo& op_features) {
   return Padding::SAME;  // Default padding.
 }
 
+bool IsTraining(const OpInfo& op_info) {
+  if (op_info.attr().find("is_training") != op_info.attr().end() &&
+      op_info.attr().at("is_training").b()) {
+    return true;
+  }
+  return false;
+}
+
+// TODO(dyoon): support non-4D tensors in the c ost functions of convolution
+// related ops (Conv, Pool, BatchNorm, and their backprops) and the related
+// helper functions.
 std::vector<int64> GetStrides(const OpInfo& op_features) {
   if (op_features.attr().find("strides") != op_features.attr().end()) {
     const auto strides = op_features.attr().at("strides").list().i();
+    CHECK(strides.size() == 4) << "Attr strides is not a length-4 vector: "
+                               << op_features.DebugString();
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
 }
 
+std::vector<int64> GetKernelSize(const OpInfo& op_info) {
+  if (op_info.attr().find("ksize") != op_info.attr().end()) {
+    const auto ksize = op_info.attr().at("ksize").list().i();
+    CHECK(ksize.size() == 4)
+        << "Attr ksize is not a length-4 vector: " << op_info.DebugString();
+    return {ksize[0], ksize[1], ksize[2], ksize[3]};
+  }
+  // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
+  // {1, 1, 1, 1} in that case.
+  return {1, 1, 1, 1};
+}
+
 int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
                     const Padding& padding) {
   // Logic for calculating output shape is from GetWindowedOutputSizeVerbose()
@@ -161,14 +198,21 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
       {kConv2dBackpropInput,
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
+      {kFusedConv2dBiasActivation,
+       wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
 
+      {kGather, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
+      {kGatherV2, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
+      {kSlice, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
+
       {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kIdentityN, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
@@ -182,107 +226,86 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
-      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
-
-  elementwise_ops_ = {
-      // Unary ops alphabetically sorted
-      {"Acos", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_acos_op<float>>::Cost},
-      {"Asin", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_asin_op<float>>::Cost},
-      {"Atan", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_atan_op<float>>::Cost},
-      {"Atan2", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_quotient_op<float>>::Cost +
-                    Eigen::internal::functor_traits<
-                        Eigen::internal::scalar_atan_op<float>>::Cost},
-      {"Ceil", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_ceil_op<float>>::Cost},
-      {"Cos", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_cos_op<float>>::Cost},
-      {"Erf", 1},
-      {"Erfc", 1},
-      {"Exp", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_exp_op<float>>::Cost},
-      {"Expm1", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_expm1_op<float>>::Cost},
-      {"Floor", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_floor_op<float>>::Cost},
-      {"Inv", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_inverse_op<float>>::Cost},
-      {"InvGrad", 1},
-      {"Lgamma", 1},
-      {"Log", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_log_op<float>>::Cost},
-      {"Log1p", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_log1p_op<float>>::Cost},
-      {"Neg", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_opposite_op<float>>::Cost},
-      {"Reciprocal", Eigen::internal::functor_traits<
-                         Eigen::internal::scalar_inverse_op<float>>::Cost},
-      {"Rint", 1},
-      {"Round", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_round_op<float>>::Cost},
-      {"Rsqrt", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_rsqrt_op<float>>::Cost},
-      {"Sqrt", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_sqrt_op<float>>::Cost},
-      {"Square", Eigen::internal::functor_traits<
-                     Eigen::internal::scalar_square_op<float>>::Cost},
-      {"Tanh", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_tanh_op<float>>::Cost},
-      {"Relu", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_max_op<float>>::Cost},
-      {"Sigmoid", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_sigmoid_op<float>>::Cost},
-      {"Sign", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_sign_op<float>>::Cost},
-      {"Sin", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_sin_op<float>>::Cost},
-      {"Tan", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_tan_op<float>>::Cost},
-      // Binary ops alphabetically sorted
-      {"Add", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_sum_op<float>>::Cost},
-      {"ApproximateEqual", 1},
-      {"Div", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"Equal", 1},
-      {"FloorDiv", Eigen::internal::functor_traits<
-                       Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"FloorMod", Eigen::internal::functor_traits<
-                       Eigen::internal::scalar_mod_op<float>>::Cost},
-      {"Greater", 1},
-      {"GreaterEqual", 1},
-      {"Less", 1},
-      {"LessEqual", 1},
-      {"LogicalAnd", Eigen::internal::functor_traits<
-                         Eigen::internal::scalar_boolean_and_op>::Cost},
-      {"LogicalNot", 1},
-      {"LogicalOr", Eigen::internal::functor_traits<
-                        Eigen::internal::scalar_boolean_or_op>::Cost},
-      {"Maximum", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_max_op<float>>::Cost},
-      {"Minimum", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_min_op<float>>::Cost},
-      {"Mod", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_mod_op<float>>::Cost},
-      {"Mul", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_product_op<float>>::Cost},
-      {"NotEqual", 1},
-      {"QuantizedAdd", Eigen::internal::functor_traits<
-                           Eigen::internal::scalar_sum_op<float>>::Cost},
-      {"QuantizedMul", Eigen::internal::functor_traits<
-                           Eigen::internal::scalar_product_op<float>>::Cost},
-      {"RealDiv", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"SquareDifference", 1},
-      {"Sub", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_difference_op<float>>::Cost},
-      {"TruncateDiv", Eigen::internal::functor_traits<
-                          Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"TruncateMod", Eigen::internal::functor_traits<
-                          Eigen::internal::scalar_mod_op<float>>::Cost}};
+      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kMaxPool, wrap(&OpLevelCostEstimator::PredictMaxPool)},
+      {kMaxPoolGrad, wrap(&OpLevelCostEstimator::PredictMaxPoolGrad)},
+      {kAvgPool, wrap(&OpLevelCostEstimator::PredictAvgPool)},
+      {kAvgPoolGrad, wrap(&OpLevelCostEstimator::PredictAvgPoolGrad)},
+      {kFusedBatchNorm, wrap(&OpLevelCostEstimator::PredictFusedBatchNorm)},
+      {kFusedBatchNormGrad,
+       wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad)},
+  };
+
+#define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
+
+  // Quantize = apply min and max bounds, multiply by scale factor and round.
+  const int quantize_v2_cost =
+      EIGEN_COST(scalar_product_op<float>) + EIGEN_COST(scalar_max_op<float>) +
+      EIGEN_COST(scalar_min_op<float>) + EIGEN_COST(scalar_round_op<float>);
+
+  elementwise_ops_ = {// Unary ops alphabetically sorted
+                      {"Acos", EIGEN_COST(scalar_acos_op<float>)},
+                      {"Asin", EIGEN_COST(scalar_asin_op<float>)},
+                      {"Atan", EIGEN_COST(scalar_atan_op<float>)},
+                      {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
+                                    EIGEN_COST(scalar_atan_op<float>)},
+                      {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
+                      {"Cos", EIGEN_COST(scalar_cos_op<float>)},
+                      {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
+                      {"Erf", 1},
+                      {"Erfc", 1},
+                      {"Exp", EIGEN_COST(scalar_exp_op<float>)},
+                      {"Expm1", EIGEN_COST(scalar_expm1_op<float>)},
+                      {"Floor", EIGEN_COST(scalar_floor_op<float>)},
+                      {"Inv", EIGEN_COST(scalar_inverse_op<float>)},
+                      {"InvGrad", 1},
+                      {"Lgamma", 1},
+                      {"Log", EIGEN_COST(scalar_log_op<float>)},
+                      {"Log1p", EIGEN_COST(scalar_log1p_op<float>)},
+                      {"Neg", EIGEN_COST(scalar_opposite_op<float>)},
+                      {"QuantizeV2", quantize_v2_cost},
+                      {"Reciprocal", EIGEN_COST(scalar_inverse_op<float>)},
+                      {"Rint", 1},
+                      {"Round", EIGEN_COST(scalar_round_op<float>)},
+                      {"Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>)},
+                      {"Sqrt", EIGEN_COST(scalar_sqrt_op<float>)},
+                      {"Square", EIGEN_COST(scalar_square_op<float>)},
+                      {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
+                      {"Relu", EIGEN_COST(scalar_max_op<float>)},
+                      {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+                      {"Sign", EIGEN_COST(scalar_sign_op<float>)},
+                      {"Sin", EIGEN_COST(scalar_sin_op<float>)},
+                      {"Tan", EIGEN_COST(scalar_tan_op<float>)},
+                      // Binary ops alphabetically sorted
+                      {"Add", EIGEN_COST(scalar_sum_op<float>)},
+                      {"ApproximateEqual", 1},
+                      {"BiasAdd", EIGEN_COST(scalar_sum_op<float>)},
+                      {"Div", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"Equal", 1},
+                      {"FloorDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"FloorMod", EIGEN_COST(scalar_mod_op<float>)},
+                      {"Greater", 1},
+                      {"GreaterEqual", 1},
+                      {"Less", 1},
+                      {"LessEqual", 1},
+                      {"LogicalAnd", EIGEN_COST(scalar_boolean_and_op)},
+                      {"LogicalNot", 1},
+                      {"LogicalOr", EIGEN_COST(scalar_boolean_or_op)},
+                      {"Maximum", EIGEN_COST(scalar_max_op<float>)},
+                      {"Minimum", EIGEN_COST(scalar_min_op<float>)},
+                      {"Mod", EIGEN_COST(scalar_mod_op<float>)},
+                      {"Mul", EIGEN_COST(scalar_product_op<float>)},
+                      {"NotEqual", 1},
+                      {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
+                      {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
+                      {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
+                      {"SquareDifference", 1},
+                      {"Sub", EIGEN_COST(scalar_difference_op<float>)},
+                      {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"TruncateMod", EIGEN_COST(scalar_mod_op<float>)}};
+
+#undef EIGEN_COST
 
   // By default, use sum of memory_time and compute_time for execution_time.
   compute_memory_overlap_ = false;
@@ -409,40 +432,39 @@ Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
 }
 
 Costs OpLevelCostEstimator::PredictOpCountBasedCost(
-    double operations, const OpInfo& op_features) const {
-  DeviceInfo device_perf = GetDeviceInfo(op_features.device());
-  if (device_perf.gigaops <= 0 || device_perf.gb_per_sec <= 0) {
-    VLOG(1) << "BAD DEVICE. Op:" << op_features.op()
-            << " device type:" << op_features.device().type()
-            << " device model:" << op_features.device().model();
-  }
+    double operations, const OpInfo& op_info) const {
+  bool unknown_shapes = false;
+  const double input_size = CalculateInputSize(op_info, &unknown_shapes);
+  const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
+  const double total_io_bytes = input_size + output_size;
+  Costs costs = PredictOpCountBasedCost(operations, total_io_bytes, op_info);
+  costs.inaccurate = unknown_shapes;
+  costs.max_memory = output_size;
+  return costs;
+}
 
-  Costs::NanoSeconds compute_cost(std::ceil(operations / device_perf.gigaops));
-  VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
-          << " Execution Time (ns):" << compute_cost.count();
+Costs OpLevelCostEstimator::PredictOpCountBasedCost(
+    double operations, double total_io_bytes, const OpInfo& op_info) const {
+  const DeviceInfo device_info = GetDeviceInfo(op_info.device());
+  if (device_info.gigaops <= 0 || device_info.gb_per_sec <= 0) {
+    VLOG(1) << "BAD DEVICE. Op:" << op_info.op()
+            << " device type:" << op_info.device().type()
+            << " device model:" << op_info.device().model();
+  }
 
-  bool found_unknown_shapes = false;
-  const double total_input_size =
-      CalculateInputSize(op_features, &found_unknown_shapes);
-  const double total_output_size =
-      CalculateOutputSize(op_features, &found_unknown_shapes);
-  const double total_io_size = total_input_size + total_output_size;
+  Costs::NanoSeconds compute_cost(std::ceil(operations / device_info.gigaops));
+  VLOG(1) << "Op:" << op_info.op() << " GOps:" << operations / 1e9
+          << " Compute Time (ns):" << compute_cost.count();
 
   Costs::NanoSeconds memory_cost(
-      std::ceil(total_io_size / device_perf.gb_per_sec));
-  VLOG(1) << "Op:" << op_features.op() << " Size (KB):" << (total_io_size) / 1e3
+      std::ceil(total_io_bytes / device_info.gb_per_sec));
+  VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
           << " Memory Time (ns):" << memory_cost.count();
 
   Costs costs;
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
-  if (compute_memory_overlap_) {
-    costs.execution_time = std::max(compute_cost, memory_cost);
-  } else {
-    costs.execution_time = compute_cost + memory_cost;
-  }
-  costs.inaccurate = found_unknown_shapes;
-  costs.max_memory = total_output_size;
+  CombineCostsAndUpdateExecutionTime(&costs);
   return costs;
 }
 
@@ -528,7 +550,6 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   ops *= conv_dims.kx * conv_dims.ky;
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for Conv2D " << ops;
 
   if (conv_info != nullptr) {
     *conv_info = conv_dims;
@@ -718,6 +739,56 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
   return ops;
 }
 
+bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
+                                        TensorShapeProto* tensor_shape_proto) {
+  tensor_shape_proto->Clear();
+  // First convert TensorProto into Tensor class so that it correctly parses
+  // data values within TensorProto (whether it's in int_val, int64_val,
+  // tensor_content, or anything.
+  Tensor tensor(tensor_proto.dtype());
+  if (!tensor.FromProto(tensor_proto)) {
+    LOG(WARNING) << "GetTensorShapeProtoFromTensorProto() -- "
+                 << "failed to parse TensorProto: "
+                 << tensor_proto.DebugString();
+    return false;
+  }
+  if (tensor.dims() != 1) {
+    LOG(WARNING) << "GetTensorShapeProtoFromTensorProto() -- "
+                 << "tensor is not 1D: " << tensor.dims();
+    return false;
+  }
+  // Then, convert it back to TensorProto using AsProtoField, which makes sure
+  // the data is in int_val, int64_val, or such repeated data fields, not in
+  // tensor_content.
+  TensorProto temp_tensor;
+  tensor.AsProtoField(&temp_tensor);
+
+#define TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(type)        \
+  do {                                                   \
+    for (const auto& value : temp_tensor.type##_val()) { \
+      tensor_shape_proto->add_dim()->set_size(value);    \
+    }                                                    \
+  } while (0)
+
+  if (tensor.dtype() == DT_INT32 || tensor.dtype() == DT_INT16 ||
+      tensor.dtype() == DT_INT8 || tensor.dtype() == DT_UINT8) {
+    TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(int);
+  } else if (tensor.dtype() == DT_INT64) {
+    TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(int64);
+  } else if (tensor.dtype() == DT_UINT32) {
+    TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(uint32);
+  } else if (tensor.dtype() == DT_UINT64) {
+    TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(uint64);
+  } else {
+    LOG(WARNING) << "GetTensorShapeProtoFromTensorProto() -- "
+                 << "Unsupported dtype: " << tensor.dtype();
+    return false;
+  }
+#undef TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO
+
+  return true;
+}
+
 // TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
 int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
     const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
@@ -732,21 +803,18 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
 
   TensorShapeProto input_shape;
+  bool shape_found = false;
   if (op_features.inputs(0).has_value()) {
     const TensorProto& value = op_features.inputs(0).value();
-    if (value.int64_val_size() > 0) {
-      for (int i = 0; i < value.int64_val_size(); ++i) {
-        input_shape.add_dim()->set_size(value.int64_val(i));
-      }
-    } else {
-      for (int i = 0; i < value.int_val_size(); ++i) {
-        input_shape.add_dim()->set_size(value.int_val(i));
-      }
-    }
-  } else if (op_features.outputs_size() == 1) {
+    shape_found = GetTensorShapeProtoFromTensorProto(value, &input_shape);
+  }
+  if (!shape_found && op_features.outputs_size() == 1) {
     input_shape = op_features.outputs(0).shape();
-  } else {
+    shape_found = true;
+  }
+  if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    input_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       input_shape.add_dim()->set_size(1);
     }
@@ -778,21 +846,18 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
   DCHECK_EQ(kConv2dBackpropFilter, op_features.op());
 
   TensorShapeProto filter_shape;
+  bool shape_found = false;
   if (op_features.inputs_size() >= 2 && op_features.inputs(1).has_value()) {
     const TensorProto& value = op_features.inputs(1).value();
-    if (value.int64_val_size() > 0) {
-      for (int i = 0; i < value.int64_val_size(); ++i) {
-        filter_shape.add_dim()->set_size(value.int64_val(i));
-      }
-    } else {
-      for (int i = 0; i < value.int_val_size(); ++i) {
-        filter_shape.add_dim()->set_size(value.int_val(i));
-      }
-    }
-  } else if (op_features.outputs_size() == 1) {
+    shape_found = GetTensorShapeProtoFromTensorProto(value, &filter_shape);
+  }
+  if (!shape_found && op_features.outputs_size() == 1) {
     filter_shape = op_features.outputs(0).shape();
-  } else {
+    shape_found = true;
+  }
+  if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    filter_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       filter_shape.add_dim()->set_size(1);
     }
@@ -823,7 +888,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
 
 int64 OpLevelCostEstimator::CalculateTensorElementCount(
     const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) const {
-  VLOG(2) << "   with " << tensor.dtype() << " tensor of shape "
+  VLOG(2) << "   with " << DataTypeString(tensor.dtype()) << " tensor of shape "
           << tensor.shape().DebugString();
   int64 tensor_size = 1;
   int num_dims = std::max(1, tensor.shape().dim_size());
@@ -922,6 +987,91 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
+    const OpContext& op_context) const {
+  // FusedConv2DBiasActivation computes a fused kernel which implements:
+  // 2D convolution, adds side input with separate scaling on convolution and
+  // side inputs, then adds bias, and finally applies the ReLU activation
+  // function to the result:
+  //
+  // Input -> Conv2D  ->  Add  -> BiasAdd  -> ReLU
+  //            ^          ^         ^
+  //          Filter   Side Input   Bias
+  //
+  // Note that when adding the side input, the operation multiplies the output
+  // of Conv2D by conv_input_scale, confusingly, and the side_input by
+  // side_input_scale.
+  //
+  // Note that in the special case that side_input_scale is 0, which we infer
+  // from side_input having dimensions [], we skip that addition operation.
+  //
+  // For more information, see
+  // contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+  auto& conv_input = op_context.op_info.inputs(0);
+  auto& filter = op_context.op_info.inputs(1);
+  auto& bias = op_context.op_info.inputs(2);
+  auto& side_input = op_context.op_info.inputs(3);
+  auto& conv_input_scale = op_context.op_info.inputs(4);
+  auto& side_input_scale = op_context.op_info.inputs(5);
+
+  // Manually compute our convolution dimensions.
+  bool found_unknown_shapes = false;
+  auto dims = ConvolutionDimensionsFromInputs(
+      conv_input.shape(), filter.shape(), op_context.op_info,
+      &found_unknown_shapes);
+
+  // Construct the shape of our output tensor from our convolution dimensions
+  // and format, as it may not be available yet.
+  //
+  // TODO(varomodt): should we centralize the Conv2D input/output shapes?
+  bool unknown_conv_format = false;
+  OpInfo::TensorProperties output;
+  switch (GetConvolutionFormat(op_context)) {
+    case NCHW:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.oz, dims.ox, dims.oy});
+      break;
+    case NHWC:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      break;
+    default:
+      // TODO(b/77722245): support cost estimation for NCHW_VECT_C.
+      LOG(WARNING) << "unsupported data format: "
+                   << GetDataFormat(op_context.op_info)
+                   << " Defaulting to NHWC.";
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      unknown_conv_format = true;
+      break;
+  }
+
+  // Add the operations the fused op always computes.
+  std::vector<OpContext> component_ops = {
+      FusedChildContext(op_context, "Conv2D", output, {conv_input, filter}),
+      FusedChildContext(op_context, "Mul", output, {output, conv_input_scale}),
+      FusedChildContext(op_context, "BiasAdd", output, {output, bias}),
+      FusedChildContext(op_context, "Relu", output, {output})};
+
+  // Add our side_input iff it's non-empty.
+  if (side_input.shape().dim_size() > 0) {
+    component_ops.push_back(FusedChildContext(op_context, "Mul", side_input,
+                                              {side_input, side_input_scale}));
+    component_ops.push_back(
+        FusedChildContext(op_context, "Add", output, {side_input, output}));
+  }
+
+  // Construct an op_context which definitely has our output shape.
+  auto op_context_with_output = op_context;
+  op_context_with_output.op_info.mutable_outputs()->Clear();
+  *op_context_with_output.op_info.mutable_outputs()->Add() = output;
+
+  // Construct component operations and run the cost computation.
+  auto costs = PredictFusedOp(op_context_with_output, component_ops);
+  costs.inaccurate |= found_unknown_shapes || unknown_conv_format;
+  return costs;
+}
+
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
   bool found_unknown_shapes = false;
@@ -984,5 +1134,414 @@ Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictGatherOrSlice(
+    const OpContext& op_context) const {
+  // Gather & Slice ops can have a very large input, but only access a small
+  // part of it. For these op the size of the output determines the memory cost.
+  const auto& op_info = op_context.op_info;
+
+  const int inputs_needed = op_info.op() == "Slice" ? 3 : 2;
+  if (op_info.outputs_size() == 0 || op_info.inputs_size() < inputs_needed) {
+    Costs costs = Costs::ZeroCosts();
+    costs.inaccurate = true;
+    return costs;
+  }
+
+  bool unknown_shapes = false;
+
+  // Each output element is a copy of some element from input.
+  // For roofline estimate we assume each copy has a unit cost.
+  const int64 op_count =
+      CalculateTensorElementCount(op_info.outputs(0), &unknown_shapes);
+
+  const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
+  double input_size = output_size;
+  if (op_info.op() == "Slice") {
+    // Add 'begin' & 'size' tensors sizes.
+    input_size +=
+        CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes) +
+        CalculateTensorElementCount(op_info.inputs(2), &unknown_shapes);
+  } else {
+    // Assuming this is "Gather" or "GatherV2" op, add 'indices' size.
+    input_size +=
+        CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes);
+  }
+
+  const double total_io = input_size + output_size;
+  Costs costs = PredictOpCountBasedCost(op_count, total_io, op_info);
+  costs.inaccurate = unknown_shapes;
+  costs.max_memory = output_size;
+
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedOp(
+    const OpContext& op_context,
+    const std::vector<OpContext>& fused_op_contexts) const {
+  // Note that PredictOpCountBasedCost will get the correct memory_time from
+  // the node's inputs and outputs; but we don't want to have to re-implement
+  // the logic for computing the operation count of each of our component
+  // operations here; so we simply add the compute times of each component
+  // operation, then update the execution time.
+  Costs fused_cost = PredictOpCountBasedCost(0, op_context.op_info);
+  fused_cost.compute_time = 0;
+  fused_cost.inaccurate = false;
+  for (auto& fused_op : fused_op_contexts) {
+    auto op_cost = PredictCosts(fused_op);
+    fused_cost.compute_time += op_cost.compute_time;
+    fused_cost.inaccurate |= op_cost.inaccurate;
+  }
+
+  CombineCostsAndUpdateExecutionTime(&fused_cost);
+  return fused_cost;
+}
+
+/* static */
+OpContext OpLevelCostEstimator::FusedChildContext(
+    const OpContext& parent, const string& op_name,
+    const OpInfo::TensorProperties& output,
+    const std::vector<OpInfo::TensorProperties>& inputs) {
+  // Setup the base parameters of our new context.
+  OpContext new_context;
+  new_context.name = op_name;
+  new_context.device_name = parent.device_name;
+  new_context.op_info = parent.op_info;
+  new_context.op_info.set_op(op_name);
+
+  // Setup the inputs of our new context.
+  new_context.op_info.mutable_inputs()->Clear();
+  for (const auto& input : inputs) {
+    *new_context.op_info.mutable_inputs()->Add() = input;
+  }
+
+  // Setup the output of our new context.
+  new_context.op_info.mutable_outputs()->Clear();
+  *new_context.op_info.mutable_outputs()->Add() = output;
+
+  return new_context;
+}
+
+/* static */
+OpInfo::TensorProperties OpLevelCostEstimator::DescribeTensor(
+    DataType type, const std::vector<int64>& dims) {
+  OpInfo::TensorProperties ret;
+  ret.set_dtype(type);
+
+  auto shape = ret.mutable_shape();
+  for (const int dim : dims) {
+    shape->add_dim()->set_size(dim);
+  }
+
+  return ret;
+}
+
+/* static */
+OpLevelCostEstimator::ConvolutionDimensions
+OpLevelCostEstimator::OpDimensionsFromInputs(
+    const TensorShapeProto& original_image_shape, const OpInfo& op_info,
+    bool* found_unknown_shapes) {
+  VLOG(2) << "op features: " << op_info.DebugString();
+  VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
+  auto image_shape =
+      MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
+  VLOG(2) << "Image shape: " << image_shape.DebugString();
+
+  int x_index, y_index, channel_index;
+  const string& data_format = GetDataFormat(op_info);
+  if (data_format == "NCHW") {
+    x_index = 2;
+    y_index = 3;
+    channel_index = 1;
+  } else {
+    x_index = 1;
+    y_index = 2;
+    channel_index = 3;
+  }
+  int64 batch = image_shape.dim(0).size();
+  int64 ix = image_shape.dim(x_index).size();
+  int64 iy = image_shape.dim(y_index).size();
+  int64 iz = image_shape.dim(channel_index).size();
+
+  // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
+  // {1, 1, 1, 1} in that case.
+  std::vector<int64> ksize = GetKernelSize(op_info);
+  int64 kx = ksize[x_index];
+  int64 ky = ksize[y_index];
+
+  std::vector<int64> strides = GetStrides(op_info);
+  int64 sx = strides[x_index];
+  int64 sy = strides[y_index];
+  const auto padding = GetPadding(op_info);
+
+  int64 ox = GetOutputSize(ix, kx, sx, padding);
+  int64 oy = GetOutputSize(iy, ky, sy, padding);
+  int64 oz = iz;
+
+  OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
+      batch, ix, iy, iz, kx, ky, oz, ox, oy, sx, sy, padding};
+  return conv_dims;
+}
+
+Costs OpLevelCostEstimator::PredictMaxPool(const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+  // kx * ky - 1 comparisons per output (kx * xy > 1)
+  // or 1 copy per output (kx * k1 = 1).
+  int per_output_ops = dims.kx * dims.ky == 1 ? 1 : dims.kx * dims.ky - 1;
+  int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * per_output_ops;
+
+  double total_input_size = 0;
+  if (dims.ky >= dims.sy) {
+    total_input_size =
+        CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  } else {  // dims.ky < dims.sy
+    // Vertical stride is larger than vertical kernel; assuming row-major
+    // format, skip unnecessary rows (or read every kx rows per sy rows, as the
+    // others are not used for output).
+    const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
+    total_input_size =
+        data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
+  }
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictMaxPoolGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  // y: op_info.inputs(1)
+  // y_grad: op_info.inputs(2)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  if (dims.kx == 1 && dims.ky == 1) {
+    // 1x1 window. No need to know which input was max.
+    ops = dims.batch * dims.ix * dims.iy * dims.iz;
+  } else if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
+    // Non-overlapping window: re-run maxpool, then assign zero or y_grad.
+    ops = dims.batch * dims.iz *
+          (dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy);
+  } else {
+    // Overlapping window: initialize with zeros, re-run maxpool, then
+    // accumulate y_gad to proper x_grad locations.
+    ops = dims.batch * dims.iz *
+          (dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy * 2);
+  }
+
+  // Just read x and y_grad; no need to read y as we assume MaxPoolGrad re-run
+  // MaxPool internally.
+  double total_input_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  total_input_size +=
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  // Write x_grad; size equal to x.
+  const double total_output_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictAvgPool(const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  // kx * ky - 1 additions and 1 multiplication per output.
+  int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * dims.kx * dims.ky;
+
+  double total_input_size = 0;
+  if (dims.ky >= dims.sy) {
+    total_input_size =
+        CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  } else {  // dims.ky < dims.sy
+    // vertical stride is larger than vertical kernel; assuming row-major
+    // format, skip unnecessary rows (or read every kx rows per sy rows, as the
+    // others are not used for output).
+    const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
+    total_input_size =
+        data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
+  }
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictAvgPoolGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x's shape: op_info.inputs(0)
+  // y_grad: op_info.inputs(1)
+
+  // Extract x_shape from op_info.inputs(0).value() or op_info.outputs(0).
+  bool shape_found = false;
+  TensorShapeProto x_shape;
+  if (op_info.inputs_size() >= 1 && op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
+    shape_found = GetTensorShapeProtoFromTensorProto(value, &x_shape);
+  }
+  if (!shape_found && op_info.outputs_size() > 0) {
+    x_shape = op_info.outputs(0).shape();
+    shape_found = true;
+  }
+  if (!shape_found) {
+    // Set the minimum shape that's feasible.
+    x_shape.Clear();
+    for (int i = 0; i < 4; ++i) {
+      x_shape.add_dim()->set_size(1);
+    }
+    found_unknown_shapes = true;
+  }
+
+  ConvolutionDimensions dims =
+      OpDimensionsFromInputs(x_shape, op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
+    // Non-overlapping window.
+    ops = dims.batch * dims.iz * (dims.ix * dims.iy + dims.ox * dims.oy);
+  } else {
+    // Overlapping window.
+    ops = dims.batch * dims.iz *
+          (dims.ix * dims.iy + dims.ox * dims.oy * (dims.kx * dims.ky + 1));
+  }
+
+  const double total_input_size =
+      CalculateInputSize(op_info, &found_unknown_shapes);
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedBatchNorm(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  // scale: op_info.inputs(1)
+  // offset: op_info.inputs(2)
+  // mean: op_info.inputs(3)  --> only for inference
+  // variance: op_info.inputs(4) --> only for inference
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+  const bool is_training = IsTraining(op_info);
+
+  int64 ops = 0;
+  const auto rsqrt_cost = Eigen::internal::functor_traits<
+      Eigen::internal::scalar_rsqrt_op<float>>::Cost;
+  if (is_training) {
+    ops = dims.iz * (dims.batch * dims.ix * dims.iy * 4 + 6 + rsqrt_cost);
+  } else {
+    ops = dims.batch * dims.ix * dims.iy * dims.iz * 2;
+  }
+
+  const double size_nhwc =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  const double size_c =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  double total_input_size = 0.0;
+  double total_internal_read_size = 0.0;
+  double total_output_size = 0.0;
+  if (is_training) {
+    total_input_size = size_nhwc + size_c * 2;
+    total_output_size = size_nhwc + size_c * 4;
+    total_internal_read_size = size_nhwc;
+  } else {
+    total_input_size = size_nhwc + size_c * 4;
+    total_output_size = size_nhwc;
+  }
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size + total_internal_read_size,
+      op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // y_backprop: op_info.inputs(0)
+  // x: op_info.inputs(1)
+  // scale: op_info.inputs(2)
+  // mean: op_info.inputs(3)
+  // variance or inverse of variance: op_info.inputs(4)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(1).shape(), op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  const auto rsqrt_cost = Eigen::internal::functor_traits<
+      Eigen::internal::scalar_rsqrt_op<float>>::Cost;
+  ops = dims.iz * (dims.batch * dims.ix * dims.iy * 11 + 5 + rsqrt_cost);
+
+  const double size_nhwc =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  const double size_c =
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  double total_input_size = size_nhwc * 2 + size_c * 2;
+  double total_internal_read_size = size_nhwc;
+  double total_output_size = size_nhwc * 1 + size_c * 2;
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size + total_internal_read_size,
+      op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+/* static */
+OpLevelCostEstimator::ConvolutionFormat
+OpLevelCostEstimator::GetConvolutionFormat(const OpContext& op_context) {
+  auto data_format = GetDataFormat(op_context.op_info);
+  if (data_format == "NCHW") {
+    return NCHW;
+  } else if (data_format == "NHWC") {
+    return NHWC;
+  } else if (data_format == "NCHW_VECT_C") {
+    return NCHW_VECT_C;
+  }
+
+  return UNKNOWN_CONVOLUTION_FORMAT;
+}
+
+void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
+    Costs* costs) const {
+  if (compute_memory_overlap_) {
+    costs->execution_time = std::max(costs->compute_time, costs->memory_time);
+  } else {
+    costs->execution_time = costs->compute_time + costs->memory_time;
+  }
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index a292e5e97fe52383648d74b08bb7a384b6278446..35649f7ee959a292dbf68246221bc98c52f2db37 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -28,6 +28,9 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
+                                        TensorShapeProto* tensor_shape_proto);
+
 class OpLevelCostEstimator {
  public:
   OpLevelCostEstimator();
@@ -48,10 +51,15 @@ class OpLevelCostEstimator {
   // Predict cost of an op for which no accurate estimator is defined.
   Costs PredictCostOfAnUnknownOp(const OpContext& op_context) const;
 
-  // Naive cost estimate based on operations divided by device ops/sec,
-  // and input/output tensor sizes.
-  Costs PredictOpCountBasedCost(double operations,
-                                const OpInfo& op_features) const;
+  // Naive cost estimate based on the given operations count and total
+  // input/output tensor sizes of the given op_info combined.
+  Costs PredictOpCountBasedCost(double operations, const OpInfo& op_info) const;
+
+  // Naive cost estimate based on the given operations count and the given total
+  // io size in bytes. Sizes of op_info inputs and outputs are not taken into
+  // consideration.
+  Costs PredictOpCountBasedCost(double operations, double total_io_bytes,
+                                const OpInfo& op_info) const;
 
   // This family of routines counts the number of operations to perform the
   // specified TensorFlow Op.
@@ -74,6 +82,13 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
+  enum ConvolutionFormat {
+    UNKNOWN_CONVOLUTION_FORMAT,
+    NHWC,
+    NCHW,
+    NCHW_VECT_C,
+    NCHW_VECT_W,
+  };
   int64 CountConv2DOperations(const OpInfo& op_features,
                               bool* found_unknown_shapes) const;
   int64 CountConv2DOperations(const OpInfo& op_features,
@@ -122,7 +137,7 @@ class OpLevelCostEstimator {
   // implementation just divides the operations to
   // perform the op (from the "Count" routines,
   // above) by the device peak operations per
-  // second. Override to supply a better estimate.
+  // second.
   // Implementation of costs other than
   // execution_time is optional, depending on the
   // device.
@@ -130,12 +145,24 @@ class OpLevelCostEstimator {
   Costs PredictCwiseOp(const OpContext& op_context) const;
   Costs PredictConv2DBackpropInput(const OpContext& op_context) const;
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
+  Costs PredictFusedConv2DBiasActivation(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
   Costs PredictIdentity(const OpContext& op_context) const;
   Costs PredictVariable(const OpContext& op_context) const;
   Costs PredictBatchMatMul(const OpContext& op_context) const;
   Costs PredictMetadata(const OpContext& op_context) const;
+  Costs PredictGatherOrSlice(const OpContext& op_context) const;
+  Costs PredictMaxPool(const OpContext& op_context) const;
+  Costs PredictMaxPoolGrad(const OpContext& op_context) const;
+  Costs PredictAvgPool(const OpContext& op_context) const;
+  Costs PredictAvgPoolGrad(const OpContext& op_context) const;
+  Costs PredictFusedBatchNorm(const OpContext& op_context) const;
+  Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
+
+  // Generic cost prediction method for fused operations.
+  Costs PredictFusedOp(const OpContext& op_context,
+                       const std::vector<OpContext>& fused_op_contexts) const;
 
   // Utility function for safe division. Returns 0
   // if rhs is 0 or negative.
@@ -147,11 +174,36 @@ class OpLevelCostEstimator {
     }
   }
 
+  // For convolution and its grad ops.
   static ConvolutionDimensions ConvolutionDimensionsFromInputs(
       const TensorShapeProto& original_image_shape,
-      const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+      const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
+      bool* found_unknown_shapes);
+
+  // For Pooling, FusedBatchNorm, and their grad ops.
+  static ConvolutionDimensions OpDimensionsFromInputs(
+      const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
+  // Helper to construct child operation contexts for the component operations
+  // of fused ops.
+  static OpContext FusedChildContext(
+      const OpContext& parent, const string& op_name,
+      const OpInfo::TensorProperties& output,
+      const std::vector<OpInfo::TensorProperties>& inputs);
+
+  // Helper to construct tensor shapes.
+  static OpInfo::TensorProperties DescribeTensor(
+      DataType type, const std::vector<int64>& dims);
+
+  // Returns the Conv2D format for this operation.
+  static ConvolutionFormat GetConvolutionFormat(const OpContext& op_context);
+
+  // This method calculates the execution time depending on whether IO can
+  // overlap with computation. It assumes the memory and the compute times have
+  // already been calculated.
+  void CombineCostsAndUpdateExecutionTime(Costs* costs) const;
+
  protected:
   std::map<string, int> elementwise_ops_;
   typedef std::function<Costs(const OpContext& op_context)> CostImpl;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 60fc783472d2b6a1d50eb52e912da1fccbe8cf08..13ea43bed692828f00e89b7f964c3abcdcdb6483 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -14,6 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"
@@ -53,28 +57,11 @@ OpContext DescribeMatMul(int m, int n, int l, int k) {
   return op_context;
 }
 
-// Returns an OpInfo for MatMul with unknown input shapes.
-OpContext DescribeMatMulUnknownShape() {
-  OpContext op_context;
-  SetCpuDevice(&op_context.op_info);
-  op_context.op_info.set_op("MatMul");
-
-  auto input = op_context.op_info.add_inputs();
-  auto shape = input->mutable_shape();
-  shape->set_unknown_rank(true);
-
-  input = op_context.op_info.add_inputs();
-  shape = input->mutable_shape();
-  shape->set_unknown_rank(true);
-
-  return op_context;
-}
-
 // Wrangles the minimum number of proto fields to set up an input of
 // arbitrary rank and type.
 void DescribeArbitraryRankInput(const std::vector<int>& dims, DataType dtype,
-                                OpInfo* op_features) {
-  auto input = op_features->add_inputs();
+                                OpInfo* op_info) {
+  auto input = op_info->add_inputs();
   input->set_dtype(dtype);
   auto shape = input->mutable_shape();
   for (auto d : dims) {
@@ -82,6 +69,18 @@ void DescribeArbitraryRankInput(const std::vector<int>& dims, DataType dtype,
   }
 }
 
+// Wrangles the minimum number of proto fields to set up an output of
+// arbitrary rank and type.
+void DescribeArbitraryRankOutput(const std::vector<int>& dims, DataType dtype,
+                                 OpInfo* op_info) {
+  auto output = op_info->add_outputs();
+  output->set_dtype(dtype);
+  auto shape = output->mutable_shape();
+  for (auto d : dims) {
+    shape->add_dim()->set_size(d);
+  }
+}
+
 // Returns an OpInfo for a BatchMatMul
 OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
                               const std::vector<int>& dims_b) {
@@ -94,48 +93,279 @@ OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
   return op_context;
 }
 
+// Wrangles the minimum number of proto fields to set up a 1D Tensor for cost
+// estimation purposes.
+void DescribeTensor1D(int dim0, OpInfo::TensorProperties* tensor) {
+  auto shape = tensor->mutable_shape();
+  shape->add_dim()->set_size(dim0);
+  tensor->set_dtype(DT_FLOAT);
+}
+
 // Wrangles the minimum number of proto fields to set up a 4D Tensor for cost
 // estimation purposes.
 void DescribeTensor4D(int dim0, int dim1, int dim2, int dim3,
-                      OpInfo* op_features) {
-  auto input = op_features->add_inputs();
-  auto shape = input->mutable_shape();
+                      OpInfo::TensorProperties* tensor) {
+  auto shape = tensor->mutable_shape();
   shape->add_dim()->set_size(dim0);
   shape->add_dim()->set_size(dim1);
   shape->add_dim()->set_size(dim2);
   shape->add_dim()->set_size(dim3);
-  input->set_dtype(DT_FLOAT);
+  tensor->set_dtype(DT_FLOAT);
 }
 
-// Returns an OpInfo for Conv2D with the minimum set of fields set up.
+// DescribeConvolution constructs an OpContext for a Conv2D applied to an input
+// tensor with shape (batch, ix, iy, iz1) and a kernel tensor with shape
+// (kx, ky, iz2, oz).
 OpContext DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2,
                               int kx, int ky, int oz) {
   OpContext op_context;
   SetCpuDevice(&op_context.op_info);
   op_context.op_info.set_op("Conv2D");
 
-  DescribeTensor4D(batch, ix, iy, iz1, &op_context.op_info);
-  DescribeTensor4D(kx, ky, iz2, oz, &op_context.op_info);
+  DescribeTensor4D(batch, ix, iy, iz1, op_context.op_info.add_inputs());
+  DescribeTensor4D(kx, ky, iz2, oz, op_context.op_info.add_inputs());
+
+  return op_context;
+}
+
+// DescribeFusedConv2DBiasActivation constructs an OpContext for a
+// FusedConv2DBiasActivation applied to a convolution input tensor with shape
+// (batch, ix, iy, iz1), a kernel tensor with shape (kx, ky, iz2, oz), a
+// bias tensor with shape (oz), a side input tensor with shape
+// (batch, ox, oy, oz) if has_side_input is set, and two scaling tensors with
+// shape (1).
+//
+// Note that this assumes the NHWC data format.
+OpContext DescribeFusedConv2DBiasActivation(int batch, int ix, int iy, int iz1,
+                                            int iz2, int kx, int ky, int ox,
+                                            int oy, int oz,
+                                            bool has_side_input) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("FusedConv2DBiasActivation");
+  DescribeTensor4D(batch, ix, iy, iz1, op_context.op_info.add_inputs());
+  DescribeTensor4D(kx, ky, iz2, oz, op_context.op_info.add_inputs());
+  DescribeTensor1D(oz, op_context.op_info.add_inputs());
+
+  // Add the side_input, if any.
+  auto side_input = op_context.op_info.add_inputs();
+  if (has_side_input) {
+    DescribeTensor4D(batch, ox, oy, oz, side_input);
+  }
+
+  // Add the scaling tensors.
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+
   return op_context;
 }
 
-OpContext DescribeOp(const string& op, int size1, int size2) {
+// DescribeUnaryOp constructs an OpContext for the given operation applied to
+// a 4-tensor with shape (size1, 1, 1, 1).
+OpContext DescribeUnaryOp(const string& op, int size1) {
   OpContext op_context;
   SetCpuDevice(&op_context.op_info);
   op_context.op_info.set_op(op);
 
-  DescribeTensor4D(size1, 1, 1, 1, &op_context.op_info);
-  DescribeTensor4D(2 * size1, size2, 1, 1, &op_context.op_info);
+  DescribeTensor4D(size1, 1, 1, 1, op_context.op_info.add_inputs());
+  DescribeTensor4D(size1, 1, 1, 1, op_context.op_info.add_outputs());
 
-  auto output = op_context.op_info.add_outputs();
-  auto shape = output->mutable_shape();
-  shape->add_dim()->set_size(2 * size1);
-  shape->add_dim()->set_size(size2);
-  shape->add_dim()->set_size(1);
-  shape->add_dim()->set_size(1);
-  output->set_dtype(DT_FLOAT);
+  return op_context;
+}
 
+// DescribeBinaryOp constructs an OpContext for the given operation applied to
+// a 4-tensor with dimensions (size1, 1, 1, 1) and a 4-tensor with dimensions
+// (2 * size1, size2, 1, 1).
+//
+// The choice of dimension here is arbitrary, and is used strictly to test the
+// cost model for applying elementwise operations to tensors with unequal
+// dimension values.
+OpContext DescribeBinaryOp(const string& op, int size1, int size2) {
+  OpContext op_context;
   SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op(op);
+
+  DescribeTensor4D(size1, 1, 1, 1, op_context.op_info.add_inputs());
+  DescribeTensor4D(2 * size1, size2, 1, 1, op_context.op_info.add_inputs());
+  DescribeTensor4D(2 * size1, size2, 1, 1, op_context.op_info.add_outputs());
+
+  return op_context;
+}
+
+// DescribeBiasAdd constructs an OpContext for a BiasAdd applied to a 4-tensor
+// with dimensions (1, 1, size2, size1) and a bias with dimension (size1),
+// according to the constraint that the bias must be 1D with size equal to that
+// of the last dimension of the input value.
+OpContext DescribeBiasAdd(int size1, int size2) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("BiasAdd");
+
+  DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_inputs());
+  DescribeTensor1D(size1, op_context.op_info.add_inputs());
+  DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_outputs());
+
+  return op_context;
+}
+
+int GetOutputSize(const int x, const int k, const int s,
+                  const string& padding) {
+  if (padding == "SAME") {
+    return (x + s - 1) / s;
+  } else {
+    return (x - k + s) / s;
+  }
+}
+
+std::vector<int> GetPoolingOutputSize(const std::vector<int>& input,
+                                      const std::vector<int>& ksize,
+                                      const std::vector<int>& strides,
+                                      const string& data_format,
+                                      const string& padding) {
+  // h, w, and c indices: default with NHWC.
+  int h_index = 1;
+  int w_index = 2;
+  int c_index = 3;
+  if (data_format == "NCHW") {
+    h_index = 2;
+    w_index = 3;
+    c_index = 1;
+  }
+  // Extract parameters.
+  int n = input[0];
+  int h = input[h_index];
+  int w = input[w_index];
+  int c = input[c_index];
+  int sx = strides[h_index];
+  int sy = strides[w_index];
+  int kx = ksize[h_index];
+  int ky = ksize[w_index];
+
+  // Output activation size: default with VALID padding.
+  int ho = GetOutputSize(h, kx, sx, padding);
+  int wo = GetOutputSize(w, ky, sy, padding);
+
+  std::vector<int> output;
+  if (data_format == "NHWC") {
+    output = {n, ho, wo, c};
+  } else {
+    output = {n, c, ho, wo};
+  }
+  return output;
+}
+
+// Helper functions for testing GetTensorShapeProtoFromTensorProto().
+void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
+                    const std::vector<int64> values, const bool tensor_content,
+                    TensorProto* tensor_proto) {
+  tensor_proto->Clear();
+  TensorProto temp_tensor_proto;
+  temp_tensor_proto.set_dtype(dtype);
+  for (const auto& x : shape) {
+    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
+  }
+  for (const auto& x : values) {
+    if (dtype == DT_INT64) {
+      temp_tensor_proto.add_int64_val(x);
+    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
+               dtype == DT_UINT8) {
+      temp_tensor_proto.add_int_val(x);
+    } else if (dtype == DT_UINT32) {
+      temp_tensor_proto.add_uint32_val(x);
+    } else if (dtype == DT_UINT64) {
+      temp_tensor_proto.add_uint64_val(x);
+    } else {
+      CHECK(false) << "Unsupported dtype: " << dtype;
+    }
+  }
+  Tensor tensor(dtype);
+  CHECK(tensor.FromProto(temp_tensor_proto));
+  if (tensor_content) {
+    tensor.AsProtoTensorContent(tensor_proto);
+  } else {
+    tensor.AsProtoField(tensor_proto);
+  }
+}
+
+OpContext DescribePoolingOp(const string& op_name, const std::vector<int>& x,
+                            const std::vector<int>& ksize,
+                            const std::vector<int>& strides,
+                            const string& data_format, const string& padding) {
+  OpContext op_context;
+  auto& op_info = op_context.op_info;
+  SetCpuDevice(&op_info);
+  op_info.set_op(op_name);
+
+  const std::vector<int> y =
+      GetPoolingOutputSize(x, ksize, strides, data_format, padding);
+  if (op_name == "AvgPool" || op_name == "MaxPool") {
+    // input: x, output: y.
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_outputs());
+  } else if (op_name == "AvgPoolGrad") {
+    // input: x's shape, y_grad, output: x_grad.
+    DescribeArbitraryRankInput({4}, DT_INT32, &op_info);
+    auto* tensor_proto = op_info.mutable_inputs(0)->mutable_value();
+    GetTensorProto(DT_INT32, {4}, {x[0], x[1], x[2], x[3]},
+                   /*tensor_content=*/false, tensor_proto);
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
+  } else if (op_name == "MaxPoolGrad") {
+    // input: x, y, y_grad, output: x_grad.
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
+  }
+  auto* attr = op_info.mutable_attr();
+  SetAttrValue(data_format, &(*attr)["data_format"]);
+  SetAttrValue(padding, &(*attr)["padding"]);
+  SetAttrValue(strides, &(*attr)["strides"]);
+  SetAttrValue(ksize, &(*attr)["ksize"]);
+  return op_context;
+}
+
+OpContext DescribeFusedBatchNorm(const bool is_training, const bool is_grad,
+                                 const std::vector<int>& x,
+                                 const string& data_format) {
+  // First, get MaxPool op info with unit stride and unit window.
+  OpContext op_context = DescribePoolingOp("MaxPool", x, {1, 1, 1, 1},
+                                           {1, 1, 1, 1}, data_format, "SAME");
+  auto& op_info = op_context.op_info;
+  // Override op name.
+  if (is_grad) {
+    op_info.set_op("FusedBatchNormGrad");
+  } else {
+    op_info.set_op("FusedBatchNorm");
+  }
+
+  // Add additional input output tensors.
+  if (is_grad) {
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+  }
+  int num_1d_inputs = is_grad ? 3 : 4;
+  for (int i = 0; i < num_1d_inputs; i++) {
+    auto* tensor = op_info.add_inputs();
+    auto* shape = tensor->mutable_shape();
+    shape->add_dim()->set_size(x[3]);
+    tensor->set_dtype(DT_FLOAT);
+  }
+  for (int i = 0; i < 4; i++) {
+    auto* tensor = op_info.add_outputs();
+    auto* shape = tensor->mutable_shape();
+    shape->add_dim()->set_size(x[3]);
+    tensor->set_dtype(DT_FLOAT);
+  }
+
+  // Delete unnecessary attr.
+  auto* attr = op_context.op_info.mutable_attr();
+  attr->erase("ksize");
+  attr->erase("strides");
+  attr->erase("padding");
+
+  // Additional attrs for FusedBatchNorm.
+  SetAttrValue(is_training, &(*attr)["is_training"]);
+
   return op_context;
 }
 }  // namespace
@@ -161,11 +391,122 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
     estimator_.compute_memory_overlap_ = value;
   }
 
+  void ValidateOpDimensionsFromImputs(const int n, const int h, const int w,
+                                      const int c, const int kx, const int ky,
+                                      const int sx, const int sy,
+                                      const string& data_format,
+                                      const string& padding) {
+    OpContext op_context;
+    int ho;
+    int wo;
+    if (data_format == "NHWC") {
+      op_context = DescribePoolingOp("MaxPool", {n, h, w, c}, {1, kx, ky, 1},
+                                     {1, sx, sy, 1}, "NHWC", padding);
+      ho = op_context.op_info.outputs(0).shape().dim(1).size();
+      wo = op_context.op_info.outputs(0).shape().dim(2).size();
+    } else {
+      op_context = DescribePoolingOp("MaxPool", {n, c, h, w}, {1, 1, kx, ky},
+                                     {1, 1, sx, sy}, "NCHW", padding);
+      ho = op_context.op_info.outputs(0).shape().dim(2).size();
+      wo = op_context.op_info.outputs(0).shape().dim(3).size();
+    }
+
+    bool found_unknown_shapes;
+    auto dims = OpLevelCostEstimator::OpDimensionsFromInputs(
+        op_context.op_info.inputs(0).shape(), op_context.op_info,
+        &found_unknown_shapes);
+    Padding padding_enum;
+    if (padding == "VALID") {
+      padding_enum = Padding::VALID;
+    } else {
+      padding_enum = Padding::SAME;
+    }
+    EXPECT_EQ(n, dims.batch);
+    EXPECT_EQ(h, dims.ix);
+    EXPECT_EQ(w, dims.iy);
+    EXPECT_EQ(c, dims.iz);
+    EXPECT_EQ(kx, dims.kx);
+    EXPECT_EQ(ky, dims.ky);
+    EXPECT_EQ(sx, dims.sx);
+    EXPECT_EQ(sy, dims.sy);
+    EXPECT_EQ(ho, dims.ox);
+    EXPECT_EQ(wo, dims.oy);
+    EXPECT_EQ(c, dims.oz);
+    EXPECT_EQ(padding_enum, dims.padding);
+  }
+
   OpLevelCostEstimator estimator_;
 };
 
+TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
+
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankOutput({16, 10}, DT_FLOAT, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(130), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(16), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(146), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, TestGatherCostsWithoutOutput) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
+
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_TRUE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, TestSliceCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Slice");
+
+  // Huge first input shouldn't affect Slice execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankOutput({10, 10}, DT_FLOAT, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(81), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(10), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(91), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, BiasAddExecutionTime) {
+  auto cost = PredictCosts(DescribeBiasAdd(1000, 10));
+  EXPECT_EQ(Costs::Duration(8400), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(1000), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(9400), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, Conv2DExecutionTime) {
+  auto cost = PredictCosts(DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(233780), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(354877440), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(355111220), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
-  auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
+  auto cost = PredictCosts(DescribeBinaryOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);
@@ -174,7 +515,7 @@ TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
 
 TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(true);
-  auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
+  auto cost = PredictCosts(DescribeBinaryOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);  // max(2000, 200)
@@ -182,8 +523,27 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(false);  // Set it back to default.
 }
 
+TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true));
+  EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest,
+       FusedConv2DBiasActivationNoSideInputExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ false));
+  EXPECT_EQ(Costs::Duration(825345), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355321038), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(356146383), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
-  auto cost = PredictCosts(DescribeOp("Mul", 1000, 1));
+  auto cost = PredictCosts(DescribeBinaryOp("Mul", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(200), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2200), cost.execution_time);
@@ -191,7 +551,7 @@ TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
 }
 
 TEST_F(OpLevelCostEstimatorTest, MulBroadcastExecutionTime) {
-  auto cost = PredictCosts(DescribeOp("Mul", 1000, 2));
+  auto cost = PredictCosts(DescribeBinaryOp("Mul", 1000, 2));
   EXPECT_EQ(Costs::Duration(3600), cost.memory_time);
   EXPECT_EQ(Costs::Duration(400), cost.compute_time);
   EXPECT_EQ(Costs::Duration(4000), cost.execution_time);
@@ -199,13 +559,21 @@ TEST_F(OpLevelCostEstimatorTest, MulBroadcastExecutionTime) {
 }
 
 TEST_F(OpLevelCostEstimatorTest, ModExecutionTime) {
-  auto cost = PredictCosts(DescribeOp("Mod", 1000, 1));
+  auto cost = PredictCosts(DescribeBinaryOp("Mod", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(1600), cost.compute_time);
   EXPECT_EQ(Costs::Duration(3600), cost.execution_time);
   EXPECT_FALSE(cost.inaccurate);
 }
 
+TEST_F(OpLevelCostEstimatorTest, ReluExecutionTime) {
+  auto cost = PredictCosts(DescribeUnaryOp("Relu", 1000));
+  EXPECT_EQ(Costs::Duration(800), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(100), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(900), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, UnknownOrPartialShape) {
   EXPECT_FALSE(PredictCosts(DescribeMatMul(2, 4, 7, 7)).inaccurate);
   EXPECT_TRUE(PredictCosts(DescribeMatMul(-1, 4, 7, 7)).inaccurate);
@@ -247,5 +615,296 @@ TEST_F(OpLevelCostEstimatorTest, BatchMatMul) {
   EXPECT_NE(matmul_inaccurate, batch_matmul_inaccurate);
 }
 
+void ExpectTensorShape(const std::vector<int64>& expected,
+                       const TensorShapeProto& tensor_shape_proto) {
+  TensorShape tensor_shape_expected(expected);
+  TensorShape tensor_shape(tensor_shape_proto);
+
+  LOG(INFO) << "Expected: " << tensor_shape_expected.DebugString();
+  LOG(INFO) << "TensorShape: " << tensor_shape.DebugString();
+  EXPECT_TRUE(tensor_shape_expected == tensor_shape);
+}
+
+TEST_F(OpLevelCostEstimatorTest, GetTensorShapeProtoFromTensorProto) {
+  TensorProto tensor_proto;
+  TensorShapeProto tensor_shape_proto;
+
+  // Dimension larger than max value; should fail while converting to Tensor
+  // class.
+  tensor_proto.mutable_tensor_shape()->add_dim()->set_size(255);
+  EXPECT_FALSE(
+      GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+
+  tensor_proto.Clear();
+  // Expect only 1D shape.
+  tensor_proto.mutable_tensor_shape()->add_dim()->set_size(1);
+  tensor_proto.mutable_tensor_shape()->add_dim()->set_size(2);
+  EXPECT_FALSE(
+      GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+
+  // Expect only handle integer data types.
+  GetTensorProto(DT_FLOAT, {}, {}, /*tensor_content=*/false, &tensor_proto);
+  EXPECT_FALSE(
+      GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+
+  // Check GetTensorShapeProtoFromTensorProto() resturns correct values.
+  {
+    std::vector<int64> shape_expected = {10, 20, 30, 40};
+    GetTensorProto(DT_INT32, {4}, shape_expected, /*tensor_content=*/false,
+                   &tensor_proto);
+    EXPECT_TRUE(
+        GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+    ExpectTensorShape(shape_expected, tensor_shape_proto);
+  }
+
+  {
+    std::vector<int64> shape_expected = {40, 20, 90, 40};
+    GetTensorProto(DT_INT64, {4}, shape_expected, /*tensor_content=*/false,
+                   &tensor_proto);
+    EXPECT_TRUE(
+        GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+    ExpectTensorShape(shape_expected, tensor_shape_proto);
+  }
+
+  {
+    std::vector<int64> shape_expected = {10, 20, 30, 40};
+    GetTensorProto(DT_INT32, {4}, shape_expected, /*tensor_content=*/true,
+                   &tensor_proto);
+    EXPECT_TRUE(
+        GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+    ExpectTensorShape(shape_expected, tensor_shape_proto);
+  }
+
+  {
+    std::vector<int64> shape_expected = {40, 20, 90, 40};
+    GetTensorProto(DT_INT64, {4}, shape_expected, /*tensor_content=*/true,
+                   &tensor_proto);
+    EXPECT_TRUE(
+        GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
+    ExpectTensorShape(shape_expected, tensor_shape_proto);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, OpDimensionsFromInputs) {
+  std::vector<string> paddings = {"VALID", "SAME"};
+  std::vector<string> formats = {"NHWC", "NCHW"};
+  for (const auto& p : paddings) {
+    for (const auto& f : formats) {
+      // n, h, w, c, kx, ky, sx, sy, data_format, padding.
+      ValidateOpDimensionsFromImputs(10, 20, 20, 100, 3, 3, 2, 2, f, p);
+      ValidateOpDimensionsFromImputs(10, 20, 20, 100, 1, 1, 3, 3, f, p);
+      ValidateOpDimensionsFromImputs(10, 200, 200, 100, 5, 5, 3, 3, f, p);
+      ValidateOpDimensionsFromImputs(10, 14, 14, 3840, 3, 3, 2, 2, f, p);
+    }
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictMaxPool) {
+  auto predict_max_pool = [this](const int n, const int in, const int c,
+                                 const int k, const int s,
+                                 const string& padding) -> Costs {
+    OpContext op_context = DescribePoolingOp(
+        "MaxPool", {n, in, in, c}, {1, k, k, 1}, {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_max_pool(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1075200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(307200), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_max_pool(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_max_pool(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(561792), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(56448), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictMaxPoolGrad) {
+  auto predict_max_pool_grad = [this](const int n, const int in, const int c,
+                                      const int k, const int s,
+                                      const string& padding) -> Costs {
+    OpContext op_context =
+        DescribePoolingOp("MaxPoolGrad", {n, in, in, c}, {1, k, k, 1},
+                          {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_max_pool_grad(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_max_pool_grad(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1536000), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(153600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_max_pool_grad(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(1514112), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(210048), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1304064), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictAvgPool) {
+  auto predict_avg_pool = [this](const int n, const int in, const int c,
+                                 const int k, const int s,
+                                 const string& padding) -> Costs {
+    OpContext op_context = DescribePoolingOp(
+        "AvgPool", {n, in, in, c}, {1, k, k, 1}, {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_avg_pool(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_avg_pool(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_avg_pool(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(580608), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(75264), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictAvgPoolGrad) {
+  auto predict_avg_pool_grad = [this](const int n, const int in, const int c,
+                                      const int k, const int s,
+                                      const string& padding) -> Costs {
+    OpContext op_context =
+        DescribePoolingOp("AvgPoolGrad", {n, in, in, c}, {1, k, k, 1},
+                          {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1305602), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(537600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(960002), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(192000), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(862082), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(172416), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(689666), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNorm) {
+  auto predict_fused_bn = [this](const int n, const int in, const int c,
+                                 const bool is_training) -> Costs {
+    OpContext op_context = DescribeFusedBatchNorm(
+        is_training, /*is_grad=*/false, {n, in, in, c}, "NHWC");
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    auto costs = predict_fused_bn(10, 20, 96, /*is_training=*/true);
+    EXPECT_EQ(Costs::Duration(614737), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(153706), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(461031), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 32, /*is_training=*/true);
+    EXPECT_EQ(Costs::Duration(204913), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(51236), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(153677), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 96, /*is_training=*/false);
+    EXPECT_EQ(Costs::Duration(384154), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(76800), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(307354), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 32, /*is_training=*/false);
+    EXPECT_EQ(Costs::Duration(128052), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(25600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(102452), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNormGrad) {
+  auto predict_fused_bn_grad = [this](const int n, const int in,
+                                      const int c) -> Costs {
+    OpContext op_context = DescribeFusedBatchNorm(
+        /*is_training=*/false, /*is_grad=*/true, {n, in, in, c}, "NHWC");
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    auto costs = predict_fused_bn_grad(10, 20, 96);
+    EXPECT_EQ(Costs::Duration(1037050), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(422496), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(614554), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn_grad(128, 7, 384);
+    EXPECT_EQ(Costs::Duration(6503809), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(2649677), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(3854132), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index 37f9ebd6a146c8c0089857c7a41ba863b4c2fb1f..5ef5fd927b8580b55539d98014ba1af4c079ae0d 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -24,6 +24,11 @@ import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/attr_value.proto";
 import "tensorflow/core/protobuf/device_properties.proto";
 
+// Description of the session when an op is run.
+message SessionInfo {
+  int64 intra_op_parallelism = 1;
+}
+
 // Description of an operation as well as the parameters expected to impact its
 // performance.
 message OpInfo {
@@ -46,6 +51,9 @@ message OpInfo {
 
   // Device on which the operation is run.
   DeviceProperties device = 4;
+
+  // Information about the session configs.
+  SessionInfo session_info = 6;
 }
 
 message NormalDistribution {
@@ -58,17 +66,13 @@ message LogNormalDistribution {
   double sigma = 2;
 }
 
-message SessionInfo {
-  int64 intra_op_parallelism = 1;
-}
-
 // Performance data for tensorflow operations
 message OpPerformance {
   // The op
   OpInfo op = 1;
 
   // Information about the session configs.
-  SessionInfo session_info = 12;
+  SessionInfo session_info = 12 [deprecated = true];
 
   // The node name (optional). Makes it easier to associate the performance data
   // with a specific graph node.
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
index 9866bc86887e2fa1a1fcfe95e3e9673b7df1a8f3..5151b87c59cc09934871b225c70e785c8f9093dd 100644
--- a/tensorflow/core/grappler/costs/robust_stats.cc
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include <algorithm>
 #include <cmath>
+#include <utility>
 
 namespace tensorflow {
 namespace grappler {
diff --git a/tensorflow/core/grappler/costs/robust_stats.h b/tensorflow/core/grappler/costs/robust_stats.h
index 9d8f5bc970ad9cde6e5c31ce0df72272e35d1662..f247eb940cef381cc1a42b1b7bf88cf03e1cf10c 100644
--- a/tensorflow/core/grappler/costs/robust_stats.h
+++ b/tensorflow/core/grappler/costs/robust_stats.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
-#define TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_ROBUST_STATS_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_ROBUST_STATS_H_
 
 #include <vector>
 namespace tensorflow {
@@ -39,4 +39,4 @@ class RobustStats {
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_ROBUST_STATS_H_
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 602f69f12ea9d24ebd94da73a2a76d1992f3bfb1..be54d98534e25954edd9d2f53f4882f1ee12a566 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -26,6 +26,8 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
@@ -42,7 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -200,17 +202,25 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
 }
 
 DeviceProperties GetDeviceInfo(const string& device_str) {
+  DeviceProperties unknown;
+  unknown.set_type("UNKNOWN");
+
   DeviceNameUtils::ParsedName parsed;
   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
-      return GetLocalGPUInfo(parsed.id);
+      TfGpuId tf_gpu_id(parsed.id);
+      CudaGpuId cuda_gpu_id;
+      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      if (!s.ok()) {
+        // We are probably running simulation without linking cuda libraries.
+        cuda_gpu_id = CudaGpuId(parsed.id);
+      }
+      return GetLocalGPUInfo(cuda_gpu_id);
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }
   }
-  DeviceProperties device;
-  device.set_type("UNKNOWN");
-  return device;
+  return unknown;
 }
 
 DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
diff --git a/tensorflow/core/grappler/costs/utils.h b/tensorflow/core/grappler/costs/utils.h
index 409f07b28b16ca6acc032304c9dac02733cdd39f..d2c7c676667c50944cb0c8a9beb9f2ecb3d742fa 100644
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_COSTS_UTILS_H_
-#define TENSORFLOW_GRAPPLER_COSTS_UTILS_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_UTILS_H_
 
 #include <string>
 #include <unordered_map>
@@ -111,4 +111,4 @@ string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_COSTS_UTILS_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_UTILS_H_
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 14b4ed7507f6237ea6255f46e060aa3d0f60b34d..7f6827295079c6cd8665378dcd076c2195b1106f 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -44,6 +44,8 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  result.compute_time += right.compute_time;
+  result.memory_time += right.memory_time;
   if (right.inaccurate) {
     result.inaccurate = true;
   }
@@ -325,7 +327,7 @@ Status VirtualScheduler::Init() {
 
   // Get the nodes that would run to output fetch_nodes.
   bool ill_formed = false;
-  std::vector<const NodeDef*> nodes =
+  const std::vector<const NodeDef*> fetch_fanin_nodes =
       ComputeTransitiveFanin(graph, fetch_nodes, &ill_formed);
   if (ill_formed) {
     return errors::InvalidArgument(
@@ -339,7 +341,7 @@ Status VirtualScheduler::Init() {
   // exactly the same as those executed for real. One possible discrepancy could
   // be the control flow nodes, where tf only executes one path.
   std::unordered_map<string, const NodeDef*> name_to_node;
-  for (const auto& node : nodes) {
+  for (const auto& node : fetch_fanin_nodes) {
     name_to_node[node->name()] = node;
   }
 
@@ -360,14 +362,22 @@ Status VirtualScheduler::Init() {
 
   // Build node_map; for each node, create its NodeState and connect its inputs
   // and outputs.
-  for (const auto* curr_node : nodes) {
+  for (const auto* curr_node : fetch_fanin_nodes) {
     auto& curr_node_state = GetNodeStateOrCreateIt(curr_node);
     const string curr_node_device = DeviceName(curr_node);
     std::vector<string> inputs;
     if (IsRecv(*curr_node)) {
       const auto& attr = curr_node->attr();
-      const NodeDef* send = name_to_send[attr.at("tensor_name").s()];
-      inputs = {send->name()};
+      if (attr.count("tensor_name")) {
+        const auto& send_node_name = attr.at("tensor_name").s();
+        auto it = name_to_send.find(send_node_name);
+        // If there is a _Send associated with the curr_node (_Recv), add it as
+        // input.
+        if (it != name_to_send.end()) {
+          const NodeDef* send = it->second;
+          inputs = {send->name()};
+        }
+      }
     } else {
       for (const string& input : curr_node->input()) {
         inputs.push_back(input);
@@ -426,9 +436,11 @@ Status VirtualScheduler::Init() {
         feed_nodes.find(curr_node->name()) != feed_nodes.end();
 
     // Default case: node without inputs are ready at time 0.
-    const bool has_no_inputs = curr_node->input().empty();
+    // Note that we check inputs vector which may be different to
+    // curr_node->input(); e.g., we add Send as input to Recv.
+    const bool has_no_inputs = inputs.empty();
 
-    if (!IsRecv(*curr_node) && (given_as_feed || has_no_inputs)) {
+    if (given_as_feed || has_no_inputs) {
       curr_node_state.time_ready = Costs::Duration();
       ready_nodes_->AddNode(curr_node);
       VLOG(3) << "Added ready node: " << curr_node->name();
@@ -451,9 +463,11 @@ Status VirtualScheduler::Init() {
   }
 
   if (!feed_nodes.empty()) {
-    return errors::InvalidArgument(
-        strings::StrCat("Some feed nodes were not found in the graph: ",
-                        str_util::Join(feed_nodes, ",")));
+    // This isn't always a bug: when the caller hasn't specified the exact list
+    // of feed and fetch nodes, by default we consider all placeholders as feed
+    // nodes, but some of them may not be needed for the default fetch node.
+    VLOG(1) << "Some feed nodes were not consumed by the fetch fanin: "
+            << str_util::Join(feed_nodes, ",");
   }
   initialized_ = true;
   return Status::OK();
@@ -829,19 +843,23 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
 Costs VirtualScheduler::Summary() const {
   // Print out basic execution summary.
   VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected compute time: " << graph_costs_.compute_time.count();
+  VLOG(1) << "Expected memory time: " << graph_costs_.memory_time.count();
   VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
   VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
   VLOG(1) << "Expected max per-op streaming buffers: "
           << graph_costs_.max_per_op_streaming;
 
-  VLOG(1) << "Per-op execution time:";
+  VLOG(1) << "Per-op execution time / compute time / memory time:";
   for (const auto& op_cost_pair : op_to_cost_) {
     const auto& op = op_cost_pair.first;
     const auto& cost = op_cost_pair.second.execution_time.count();
+    const auto& compute_cost = op_cost_pair.second.compute_time.count();
+    const auto& memory_cost = op_cost_pair.second.memory_time.count();
     const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
       VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
-              << cost;
+              << cost << " / " << compute_cost << " / " << memory_cost;
     }
   }
 
@@ -882,7 +900,8 @@ Costs VirtualScheduler::Summary() const {
             << ", at the end: "
             << strings::HumanReadableNumBytes(state.memory_usage);
 
-    VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):";
+    VLOG(1) << "Per-op execution time compute time / memory time "
+               "(and memory usage at peak memory usage):";
 
     // Profile non-persistent op memory usage.
     for (const auto& node_port : state.mem_usage_snapshot_at_peak) {
@@ -896,6 +915,8 @@ Costs VirtualScheduler::Summary() const {
     for (const auto& op_cost_pair : state.op_to_cost) {
       const auto& op = op_cost_pair.first;
       const auto& cost = op_cost_pair.second.execution_time.count();
+      const auto& compute_cost = op_cost_pair.second.compute_time.count();
+      const auto& memory_cost = op_cost_pair.second.memory_time.count();
       total_compute_time_ns += op_cost_pair.second.execution_time;
       const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
       if (!is_op_cost_accurate) {
@@ -914,8 +935,9 @@ Costs VirtualScheduler::Summary() const {
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
         VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
-                << cost << " (" << strings::HumanReadableNumBytes(op_mem_usage)
-                << " [" << mem_usage_percent << "%] "
+                << cost << " / " << compute_cost << " / " << memory_cost << " ("
+                << strings::HumanReadableNumBytes(op_mem_usage) << " ["
+                << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
     }
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 5116c8183cb4c51dc833988cbeb75a4a184e4c40..67bf1e6980e5508780afac438324293b3b971d50 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -199,7 +199,7 @@ class FirstReadyManager : public ReadyNodeManager {
   // current node.
   std::vector<const NodeDef*> nodes_;
   // Newly added nodes are added to waiting_queue_. That way, GetCurrNode(),
-  // wihch returns the front of the nodes_, always returns the same node,
+  // which returns the front of the nodes_, always returns the same node,
   // even if any of new nodes has time_ready smaller than the current node's.
   std::vector<const NodeDef*> waiting_queue_;
   // Comparator functor for heap; stl heap is max heap, so we use "greater than"
@@ -212,7 +212,7 @@ class FirstReadyManager : public ReadyNodeManager {
 };
 
 // CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
-// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv
+// ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv
 // ops, and then it chooses FirstReady among the ops chosen from each
 // internal NodeManagers. The objective is to maximize producer-consumer
 // locality within device, while processing nodes across devices, including
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 53dcb497a6453dfa70c1215352e74e96796ebeb7..f9154e42f984c8dd8e774b83750b41a48087d7bb 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -205,6 +205,25 @@ class VirtualSchedulerTest : public ::testing::Test {
     dependency_["out"] = {"x", "y", "z", "w"};
   }
 
+  // Graph with some placeholder feed nodes that are not in the fetch fan-in.
+  void CreateGrapplerItemWithUnnecessaryPlaceholderNodes() {
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
+    auto unnecessary = ops::Placeholder(s.WithOpName("unnecessary"), DT_FLOAT);
+    auto x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT);
+
+    GraphDef def;
+    TF_CHECK_OK(s.ToGraphDef(&def));
+
+    grappler_item_.reset(new GrapplerItem);
+    grappler_item_->id = "test_extra_placeholders";
+    grappler_item_->graph = def;
+    grappler_item_->fetch = {"x"};
+
+    // Grappler Item Builder puts all placeholder nodes into the feed
+    // list by default.
+    grappler_item_->feed = {{"x", Tensor()}, {"unnecessary", Tensor()}};
+  }
+
   // NoOp that takes 7 NoOps as control dependency.
   void CreateGrapplerItemWithControlDependency() {
     Scope s = Scope::NewRootScope().WithDevice(kCPU0);
@@ -394,6 +413,63 @@ versions {
     grappler_item_->fetch = {"Recv"};
   }
 
+  void CreateGrapplerItemWithRecvWithoutSend() {
+    const string gdef_ascii = R"EOF(
+node {
+  name: "Recv"
+  op: "_Recv"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "client_terminated"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "test"
+    }
+  }
+  attr {
+    key: "tensor_type"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 24
+}
+    )EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"Recv"};
+  }
+
   // A simple while loop
   void CreateGrapplerItemWithLoop() {
     // Test graph produced in python using:
@@ -1700,6 +1776,16 @@ TEST_F(VirtualSchedulerTest, MemoryUsage) {
                               cpu_state.mem_usage_snapshot_at_peak);
 }
 
+TEST_F(VirtualSchedulerTest, UnnecessaryFeedNodes) {
+  CreateGrapplerItemWithUnnecessaryPlaceholderNodes();
+  InitScheduler();
+
+  // Test that scheduler can run graphs with extra unnecessary feed nodes.
+  auto ops_executed = RunScheduler("");
+  ASSERT_EQ(1, ops_executed.size());
+  ASSERT_EQ(ops_executed.count("x"), 1);
+}
+
 TEST_F(VirtualSchedulerTest, ControlDependency) {
   // Init.
   CreateGrapplerItemWithControlDependency();
@@ -2015,5 +2101,17 @@ TEST_F(VirtualSchedulerTest, GraphWithSendRecvDifferentDevice) {
             0);
   EXPECT_GT(ops_executed.count("Recv"), 0);
 }
+
+TEST_F(VirtualSchedulerTest, GraphWihtOnlyRecv) {
+  // Init.
+  CreateGrapplerItemWithRecvWithoutSend();
+  InitScheduler();
+
+  // Run the scheduler.
+  auto ops_executed = RunScheduler("");
+
+  // Recv without Send will be treated as initially ready node.
+  EXPECT_GT(ops_executed.count("Recv"), 0);
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index b318ac22d4babe96968e94730691dc0d0664d585..3268697671b9ba47e489d5037af9a7267353b448 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 
 #if GOOGLE_CUDA
@@ -30,15 +31,14 @@ int GetNumAvailableGPUs() {
   int num_eligible_gpus = 0;
 #if GOOGLE_CUDA
   if (ValidateGPUMachineManager().ok()) {
-    perftools::gputools::Platform* gpu_manager = GPUMachineManager();
+    se::Platform* gpu_manager = GPUMachineManager();
     if (gpu_manager != nullptr) {
       int num_gpus = gpu_manager->VisibleDeviceCount();
       for (int i = 0; i < num_gpus; i++) {
         auto exec_status = gpu_manager->ExecutorForDevice(i);
         if (exec_status.ok()) {
-          perftools::gputools::StreamExecutor* se = exec_status.ValueOrDie();
-          const perftools::gputools::DeviceDescription& desc =
-              se->GetDeviceDescription();
+          se::StreamExecutor* se = exec_status.ValueOrDie();
+          const se::DeviceDescription& desc = se->GetDeviceDescription();
           int min_gpu_core_count = 8;
           if (desc.core_count() >= min_gpu_core_count) {
             num_eligible_gpus++;
@@ -56,10 +56,9 @@ int GetNumAvailableGPUs() {
 int64 AvailableGPUMemory(int gpu_id) {
 #if GOOGLE_CUDA
   // Look up the device, to see its attributes.
-  perftools::gputools::Platform* gpu_platform = GPUMachineManager();
+  se::Platform* gpu_platform = GPUMachineManager();
   CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
-  perftools::gputools::StreamExecutor* se =
-      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+  se::StreamExecutor* se = gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
   int64 total_memory, available_memory;
   CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
 
diff --git a/tensorflow/core/grappler/devices.h b/tensorflow/core/grappler/devices.h
index 2d6c41888d92e044fbfd06104f2079562c37fcfc..1e60117b2d1649176b4a942911df7f75609fa74e 100644
--- a/tensorflow/core/grappler/devices.h
+++ b/tensorflow/core/grappler/devices.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_DEVICES_H_
-#define TENSORFLOW_GRAPPLER_DEVICES_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_DEVICES_H_
+#define TENSORFLOW_CORE_GRAPPLER_DEVICES_H_
 
 #include <functional>
 
@@ -39,4 +39,4 @@ int GetNumAvailableLogicalCPUCores();
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_DEVICES_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_DEVICES_H_
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 0d3f94854b65cfc06c3d68fc5ac7bc3aa68f9a34..3e448216f900f28927b7be69708df43d6e02177b 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -173,5 +173,54 @@ int GraphView::NumFanins(const NodeDef& node,
   return count;
 }
 
+std::unordered_set<GraphView::Edge, GraphView::HashEdge>
+GraphView::GetFanoutEdges(const NodeDef& node,
+                          bool include_controlled_edges) const {
+  std::unordered_set<Edge, HashEdge> result;
+  OutputPort port;
+  port.node = const_cast<NodeDef*>(&node);
+  const int first_port_id = include_controlled_edges ? -1 : 0;
+  auto it = num_regular_outputs_.find(&node);
+  const int last_port_id = (it != num_regular_outputs_.end()) ? it->second : -1;
+
+  for (int i = first_port_id; i <= last_port_id; ++i) {
+    port.port_id = i;
+    auto it = fanouts_.find(port);
+    if (it != fanouts_.end()) {
+      Edge fanout;
+      fanout.src.node = const_cast<NodeDef*>(&node);
+      fanout.src.port_id = i;
+      for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
+        fanout.tgt = *itr;
+        result.insert(fanout);
+      }
+    }
+  }
+  return result;
+}
+
+std::unordered_set<GraphView::Edge, GraphView::HashEdge>
+GraphView::GetFaninEdges(const NodeDef& node,
+                         bool include_controlling_edges) const {
+  std::unordered_set<Edge, HashEdge> result;
+  for (int i = 0; i < node.input_size(); ++i) {
+    Edge fanin;
+    fanin.tgt.node = const_cast<NodeDef*>(&node);
+    fanin.tgt.port_id = i;
+    string fanin_name = ParseNodeName(node.input(i), &fanin.src.port_id);
+    if (fanin.src.port_id < 0) {
+      if (!include_controlling_edges) {
+        break;
+      }
+    }
+    auto it = nodes_.find(fanin_name);
+    if (it != nodes_.end()) {
+      fanin.src.node = it->second;
+      result.insert(fanin);
+    }
+  }
+  return result;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 173ce9c09c2fd98d855a801131ed16a796d9caac..584cb9048b64fde5a6d790fe93748e06d83d3b26 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_GRAPH_VIEW_H_
-#define TENSORFLOW_GRAPPLER_GRAPH_VIEW_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_VIEW_H_
 
 #include <unordered_map>
 #include <unordered_set>
@@ -29,6 +29,8 @@ namespace grappler {
 class GraphView {
  public:
   struct Port {
+    Port() : node(nullptr), port_id(-1) {}
+    Port(NodeDef* n, int port) : node(n), port_id(port) {}
     NodeDef* node = nullptr;
     int port_id = -1;
 
@@ -36,8 +38,16 @@ class GraphView {
       return node == other.node && port_id == other.port_id;
     }
   };
-  struct InputPort : public Port {};
-  struct OutputPort : public Port {};
+  struct InputPort : public Port {
+    InputPort() = default;
+    InputPort(NodeDef* n, int port_id) : Port(n, port_id) {}
+    InputPort(const NodeDef* n, int port_id)
+        : Port(const_cast<NodeDef*>(n), port_id) {}
+  };
+  struct OutputPort : public Port {
+    OutputPort() = default;
+    OutputPort(NodeDef* n, int port_id) : Port(n, port_id) {}
+  };
 
   struct HashPort {
     std::size_t operator()(const Port& port) const {
@@ -45,6 +55,20 @@ class GraphView {
     }
   };
 
+  struct Edge {
+    OutputPort src;
+    InputPort tgt;
+
+    bool operator==(const Edge& other) const {
+      return src == other.src && tgt == other.tgt;
+    }
+  };
+  struct HashEdge {
+    std::size_t operator()(const Edge& edge) const {
+      return HashPort()(edge.src) + HashPort()(edge.tgt);
+    }
+  };
+
   explicit GraphView(GraphDef* graph);
   GraphDef* GetGraph() const { return graph_; }
   NodeDef* GetNode(const string& node_name) const;
@@ -63,6 +87,7 @@ class GraphView {
       const OutputPort& port) const;
   std::unordered_set<OutputPort, HashPort> GetFanin(
       const InputPort& port) const;
+
   // Special case: regular (i.e. non-control) input ports can only have one
   // fanin.
   const OutputPort GetRegularFanin(const InputPort& port) const;
@@ -79,6 +104,13 @@ class GraphView {
   // controlling nodes iff include_controlling_nodes is true.
   int NumFanins(const NodeDef& node, bool include_controlling_nodes) const;
 
+  // Get all the edge in the immediate fanout (resp fanin) of a node. Include
+  // the control edges iff include_controlling_edges is true.
+  std::unordered_set<Edge, HashEdge> GetFanoutEdges(
+      const NodeDef& node, bool include_controlled_edges) const;
+  std::unordered_set<Edge, HashEdge> GetFaninEdges(
+      const NodeDef& node, bool include_controlling_edges) const;
+
  private:
   GraphDef* graph_;
   std::unordered_map<string, NodeDef*> nodes_;
@@ -92,4 +124,4 @@ class GraphView {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_GRAPH_VIEW_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_VIEW_H_
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 2f8549cf395f6b78154f7a6faf3fea06ea6c56c4..bbc0fedd22ba640c38556eb3267cf0a82686d0ea 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -27,17 +27,18 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef) {
+GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
   id = other.id;
   feed = other.feed;
   fetch = other.fetch;
   init_ops = other.init_ops;
+  keep_ops = other.keep_ops;
   expected_init_time = other.expected_init_time;
   save_op = other.save_op;
   restore_op = other.restore_op;
   save_restore_loc_tensor = other.save_restore_loc_tensor;
   queue_runners = other.queue_runners;
-  graph.Swap(&graphDef);
+  graph.Swap(graph_def);
 }
 
 std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
@@ -82,6 +83,9 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
   for (const auto& node : init_ops) {
     result.insert(NodeName(node));
   }
+  for (const auto& node : keep_ops) {
+    result.insert(NodeName(node));
+  }
   if (!save_op.empty()) {
     result.insert(NodeName(save_op));
   }
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 302685972a7f2908278a881112db9dbfb53c1c1a..939e5fa04692fdef8c3b9898eb1478b94a774275 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_H_
-#define TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_H_
 
 #include <memory>
 #include <string>
@@ -33,10 +33,13 @@ namespace grappler {
 // A TensorFlow model to optimize.
 // Models are represented by the combination of a graph, one of more fetch
 // nodes, and potentially a set of nodes to feed.
-// TODO(volunteer_needed): turn this struct into a class.
 struct GrapplerItem {
-  GrapplerItem() {}
-  GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef);
+  GrapplerItem() = default;
+  GrapplerItem(const GrapplerItem& other, GraphDef&& graph_def)
+      : GrapplerItem(other, &graph_def) {}
+  // Swaps *graph_def with an empty GraphDef.
+  GrapplerItem(const GrapplerItem& other, GraphDef* graph_def);
+  virtual ~GrapplerItem() = default;
 
   string id;  // A unique id for this item
 
@@ -58,6 +61,11 @@ struct GrapplerItem {
   // Queue runner(s) required to run the queue(s) of this model.
   std::vector<QueueRunnerDef> queue_runners;
 
+  // List of op names to keep in the graph. This includes nodes that are
+  // referenced in various collections, and therefore must be preserved to
+  // ensure that the optimized metagraph can still be loaded.
+  std::vector<string> keep_ops;
+
   // Return the set of node evaluated during a regular train/inference step.
   std::vector<const NodeDef*> MainOpsFanin() const;
   // Return the set of node run to populate the queues (if any).
@@ -66,7 +74,8 @@ struct GrapplerItem {
   std::vector<const NodeDef*> InitOpsFanin() const;
   // Return the set of variables accessed during a regular train/inference step.
   std::vector<const NodeDef*> MainVariables() const;
-  // Return a set of node names that must be preserved.
+  // Return a set of node names that must be preserved. This includes feed and
+  // fetch nodes, keep_ops, init_ops.
   std::unordered_set<string> NodesToPreserve() const;
 };
 
@@ -84,4 +93,4 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_H_
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 7ba498dd06409635d7dfc282ab29f1133e299c9b..288587ce9b357d0056de428f5abc653cc4b91ea2 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/protobuf_internal.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
@@ -77,7 +78,7 @@ void InitializeTensor(DataType type, Tensor* tensor) {
 // correct optimizations.
 Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
                      const ItemConfig& cfg) {
-  if (!cfg.apply_optimizations && !cfg.inline_functions) {
+  if (!cfg.apply_optimizations && !cfg.erase_noinline_attributes) {
     return Status::OK();
   }
 
@@ -87,7 +88,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   // Make a local copy of graph def, because we need to change some things.
   GraphDef graph_def(graph_def_arg);
 
-  if (cfg.inline_functions && cfg.erase_noinline_attributes) {
+  if (cfg.erase_noinline_attributes) {
     // TF optimizer doesn't inline functions with "_noinline" attribute,
     // so let's go over the function library and erase it.
     for (auto& func : *graph_def.mutable_library()->mutable_function()) {
@@ -112,7 +113,6 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   } else {
     optimizer_opts->set_opt_level(::tensorflow::OptimizerOptions_Level_L0);
   }
-  optimizer_opts->set_do_function_inlining(cfg.inline_functions);
 
   // Create the function library runtime.
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
@@ -138,7 +138,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   // The default values of attributes might have been stripped by the optimizer.
   // Add them back.
   return AddDefaultAttrsToGraphDef(output_graph_def, *graphptr->op_registry(),
-                                   0);
+                                   0, true);
 }
 
 // Applies the same graph pruning logic to the graph as Session.Run in TF.
@@ -152,6 +152,27 @@ Status PruneGraph(GrapplerItem* item) {
   return Status::OK();
 }
 
+// Replace any unknown dimensions in a shape with
+// cfg.placeholder_unknown_output_shape_dim if it is no less than 0.
+Status ReplaceUnknownShapeDim(const ItemConfig& cfg,
+                              const TensorShapeProto& shape_pb_in,
+                              TensorShapeProto* shape_pb_out,
+                              TensorShape* shape_out) {
+  std::vector<int32> dims;
+  for (const auto& dim_proto : shape_pb_in.dim()) {
+    if (cfg.placeholder_unknown_output_shape_dim >= 0 &&
+        dim_proto.size() == -1) {
+      dims.push_back(cfg.placeholder_unknown_output_shape_dim);
+      shape_pb_out->add_dim()->set_size(
+          cfg.placeholder_unknown_output_shape_dim);
+    } else {
+      dims.push_back(std::max<int32>(1, dim_proto.size()));
+      shape_pb_out->add_dim()->set_size(dim_proto.size());
+    }
+  }
+  return TensorShapeUtils::MakeShape(dims.data(), dims.size(), shape_out);
+}
+
 }  // namespace
 
 // static
@@ -168,12 +189,6 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   // Fill in feed nodes from config, if any provided.
   for (const auto& feed_node : cfg.feed_nodes) {
     const string feed_name = NodeName(feed_node);
-    if (feed_name.empty()) {
-      LOG(ERROR) << "Invalid feed node name " << feed_node
-                 << ", skipping this input.";
-      return nullptr;
-    }
-    VLOG(1) << "Will use feed node " << feed_name;
     new_item->feed.emplace_back(feed_name, Tensor());
   }
 
@@ -182,17 +197,119 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     const CollectionDef& nodes = meta_graph.collection_def().at("train_op");
     if (nodes.has_node_list()) {
       for (const auto& node : nodes.node_list().value()) {
-        const string name = NodeName(node);
-        if (name.empty()) {
-          LOG(ERROR) << "Invalid fetch node name " << node
-                     << ", skipping this input";
-          return nullptr;
+        new_item->fetch.push_back(NodeName(node));
+      }
+    }
+  }
+
+  // Detect feed and fetch nodes from signature defs. Signatures may share same
+  // inputs or outputs.
+  std::unordered_set<string> signature_feed_nodes;
+  std::unordered_set<string> signature_fetch_nodes;
+  for (const auto& name_and_signature : meta_graph.signature_def()) {
+    for (const auto& name_and_input : name_and_signature.second.inputs()) {
+      const TensorInfo& input = name_and_input.second;
+      if (input.has_coo_sparse()) {
+        // Define the shapes following the comment of CooSparse.
+        // TODO(yuefengz): we probably want to use different dim values for the
+        // three tensors of a SparseTensor.
+        int64 dim = std::max(1, cfg.placeholder_unknown_output_shape_dim);
+        TensorShape shape_1d({dim});
+        TensorShape shape_2d({dim, dim});
+
+        if (gtl::InsertIfNotPresent(
+                &signature_feed_nodes,
+                NodeName(input.coo_sparse().values_tensor_name()))) {
+          Tensor value_tensor(input.dtype(), shape_1d);
+          InitializeTensor(input.dtype(), &value_tensor);
+          new_item->feed.emplace_back(
+              NodeName(input.coo_sparse().values_tensor_name()), value_tensor);
+        }
+        if (gtl::InsertIfNotPresent(
+                &signature_feed_nodes,
+                NodeName(input.coo_sparse().indices_tensor_name()))) {
+          Tensor indices_tensor(DT_INT64, shape_2d);
+          InitializeTensor(input.dtype(), &indices_tensor);
+          new_item->feed.emplace_back(
+              NodeName(input.coo_sparse().indices_tensor_name()),
+              indices_tensor);
+        }
+        if (gtl::InsertIfNotPresent(
+                &signature_feed_nodes,
+                NodeName(input.coo_sparse().dense_shape_tensor_name()))) {
+          Tensor dense_shape_tensor(DT_INT64, shape_1d);
+          InitializeTensor(input.dtype(), &dense_shape_tensor);
+          new_item->feed.emplace_back(
+              NodeName(input.coo_sparse().dense_shape_tensor_name()),
+              dense_shape_tensor);
+        }
+      } else {
+        if (gtl::InsertIfNotPresent(&signature_feed_nodes,
+                                    NodeName(input.name()))) {
+          TensorShape shape;
+          TensorShapeProto shape_proto;
+          Status s = ReplaceUnknownShapeDim(cfg, input.tensor_shape(),
+                                            &shape_proto, &shape);
+          if (!s.ok()) {
+            LOG(ERROR) << "Invalid shape for signature input " << input.name()
+                       << ": " << s << ", skipping this input";
+            return nullptr;
+          }
+
+          Tensor fake_input(input.dtype(), shape);
+          InitializeTensor(input.dtype(), &fake_input);
+          new_item->feed.emplace_back(NodeName(input.name()), fake_input);
+        }
+      }
+    }
+    for (const auto& name_and_output : name_and_signature.second.outputs()) {
+      const TensorInfo& output = name_and_output.second;
+      if (output.has_coo_sparse()) {
+        if (gtl::InsertIfNotPresent(
+                &signature_fetch_nodes,
+                NodeName(output.coo_sparse().values_tensor_name()))) {
+          new_item->fetch.push_back(
+              NodeName(output.coo_sparse().values_tensor_name()));
+        }
+        if (gtl::InsertIfNotPresent(
+                &signature_fetch_nodes,
+                NodeName(output.coo_sparse().indices_tensor_name()))) {
+          new_item->fetch.push_back(
+              NodeName(output.coo_sparse().indices_tensor_name()));
+        }
+        if (gtl::InsertIfNotPresent(
+                &signature_fetch_nodes,
+                NodeName(output.coo_sparse().dense_shape_tensor_name()))) {
+          new_item->fetch.push_back(
+              NodeName(output.coo_sparse().dense_shape_tensor_name()));
+        }
+      } else {
+        if (gtl::InsertIfNotPresent(&signature_fetch_nodes,
+                                    NodeName(output.name()))) {
+          new_item->fetch.push_back(NodeName(output.name()));
         }
-        VLOG(1) << "Will use fetch node " << name;
-        new_item->fetch.push_back(name);
       }
     }
   }
+
+  for (const auto& feed : new_item->feed) {
+    if (feed.first.empty()) {
+      LOG(ERROR) << "Invalid feed node name skipping this input";
+      return nullptr;
+    } else {
+      VLOG(1) << "Will use feed node " << feed.first;
+    }
+  }
+
+  for (const auto& fetch : new_item->fetch) {
+    if (fetch.empty()) {
+      LOG(ERROR) << "Invalid fetch node name skipping this input";
+      return nullptr;
+    } else {
+      VLOG(1) << "Will use fetch node " << fetch;
+    }
+  }
+
   if (new_item->fetch.empty()) {
     LOG(ERROR) << "Failed to detect the fetch node(s), skipping this input";
     return nullptr;
@@ -296,6 +413,14 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     }
   }
 
+  // Add each node referenced in a collection to the list of nodes to keep.
+  for (const auto& col : meta_graph.collection_def()) {
+    const CollectionDef& collection = col.second;
+    for (const string& node : collection.node_list().value()) {
+      new_item->keep_ops.push_back(NodeName(node));
+    }
+  }
+
   for (auto& node : *new_item->graph.mutable_node()) {
     if (IsPlaceholder(node) && node.op() != "PlaceholderWithDefault") {
       if (node.attr().count("dtype") == 0) {
@@ -317,20 +442,8 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
       // shape is not empty if the shape is partially defined.
       TensorShape shape;
       TensorShapeProto shape_proto;
-      std::vector<int32> dims;
-      for (const auto& dim_proto : node.attr().at("shape").shape().dim()) {
-        if (cfg.placeholder_unknown_output_shape_dim >= 0 &&
-            dim_proto.size() == -1) {
-          dims.push_back(cfg.placeholder_unknown_output_shape_dim);
-          shape_proto.add_dim()->set_size(
-              cfg.placeholder_unknown_output_shape_dim);
-        } else {
-          dims.push_back(std::max<int32>(1, dim_proto.size()));
-          shape_proto.add_dim()->set_size(dim_proto.size());
-        }
-      }
-      Status make_shape_status =
-          TensorShapeUtils::MakeShape(dims.data(), dims.size(), &shape);
+      Status make_shape_status = ReplaceUnknownShapeDim(
+          cfg, node.attr().at("shape").shape(), &shape_proto, &shape);
       if (!make_shape_status.ok()) {
         LOG(ERROR) << "Invalid shape for placeholder " << node.name() << ": "
                    << make_shape_status << ", skipping this input";
@@ -370,7 +483,9 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
 
       if (cfg.feed_nodes.empty()) {
         // No specific feed nodes were given. Assume all placeholders are fed.
-        new_item->feed.emplace_back(node.name(), fake_input);
+        if (signature_feed_nodes.count(node.name()) == 0) {
+          new_item->feed.emplace_back(node.name(), fake_input);
+        }
       } else if (cfg.feed_nodes.count(node.name()) > 0) {
         // If specific feed nodes were given, only update their tensors.
         auto it = find_if(new_item->feed.begin(), new_item->feed.end(),
@@ -454,7 +569,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
       &new_item->graph,
       FunctionLibraryDefinition(OpRegistry::Global(),
                                 new_item->graph.library()),
-      0);
+      0, true);
   if (!attr_status.ok()) {
     LOG(ERROR) << "Failed to instantiate default attribute values: "
                << attr_status.error_message();
@@ -510,113 +625,5 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   return new_item;
 }
 
-std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionDefLibrary& library) {
-  if (func.signature().name().empty()) {
-    LOG(ERROR) << "function name must be specified.";
-    return nullptr;
-  }
-  std::unique_ptr<GrapplerItem> new_item(new GrapplerItem());
-  new_item->id = func.signature().name();
-
-  std::unordered_map<string, string> port_map;
-
-  // Add the function inputs as placeholder
-  for (const auto& inp : func.signature().input_arg()) {
-    NodeDef* ph = new_item->graph.add_node();
-    ph->set_name(inp.name());
-    ph->set_op("Placeholder");
-    if (inp.type() != DT_INVALID) {
-      (*ph->mutable_attr())["T"].set_type(inp.type());
-    } else {
-      auto it = func_attr.find(inp.type_attr());
-      if (it == func_attr.end()) {
-        LOG(ERROR) << "Unknown type attribute " << inp.type_attr()
-                   << " for function input " << inp.name();
-        return nullptr;
-      } else {
-        (*ph->mutable_attr())["T"] = it->second;
-      }
-    }
-    port_map[inp.name()] = inp.name();
-  }
-
-  // Add the function body to the graph.
-  FunctionLibraryDefinition func_def(OpRegistry::Global(), library);
-
-  for (const NodeDef& node : func.node_def()) {
-    NodeDef* new_node = new_item->graph.add_node();
-    *new_node = node;
-    // Replace the placeholder attribute values with the specified value.
-    for (auto& attr : *new_node->mutable_attr()) {
-      const string& ph_name = attr.second.placeholder();
-      auto it = func_attr.find(ph_name);
-      if (it != func_attr.end()) {
-        attr.second = it->second;
-      }
-    }
-
-    // Functions use a custom format to encode connectivity. Map these custom
-    // strings to regular ones.
-    const OpRegistrationData* registration;
-    Status status = func_def.LookUp(node.op(), &registration);
-    if (!status.ok()) {
-      LOG(ERROR) << "Op " << node.op() << " not registered: " << status;
-      return nullptr;
-    }
-
-    tensorflow::NameRangeMap inputs;
-    tensorflow::NameRangeMap outputs;
-    status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs,
-                                           &outputs);
-    if (!status.ok()) {
-      LOG(ERROR) << "Op " << node.op() << " invalid: " << status;
-      return nullptr;
-    }
-    for (const auto& name_range : outputs) {
-      string port_prefix =
-          strings::StrCat(node.name(), ":", name_range.first, ":");
-      int index_start = name_range.second.first;
-      int index_end = name_range.second.second;
-      for (int i = index_start; i < index_end; ++i) {
-        string port_id = strings::StrCat(port_prefix, i - index_start);
-        string port_name = strings::StrCat(node.name(), ":", i);
-        port_map[port_id] = port_name;
-      }
-    }
-  }
-
-  for (auto& node : *new_item->graph.mutable_node()) {
-    // Rewrite the inputs to use the normal naming convention.
-    for (int i = 0; i < node.input_size(); ++i) {
-      const string& input = node.input(i);
-      if (IsControlInput(input)) {
-        // No need to remap control dependencies.
-        continue;
-      } else {
-        auto it = port_map.find(input);
-        if (it == port_map.end()) {
-          LOG(ERROR) << "Unknown input: " << input;
-          return nullptr;
-        }
-        node.set_input(i, it->second);
-      }
-    }
-  }
-
-  // Add the function outputs to the list of fetch nodes.
-  for (const auto& out : func.signature().output_arg()) {
-    new_item->fetch.emplace_back(out.name());
-  }
-  // Add the function inputs to the list of feeds.
-  for (const auto& inp : func.signature().input_arg()) {
-    new_item->feed.emplace_back(inp.name(), Tensor());
-  }
-
-  return new_item;
-}
-
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index e892a3f556f7e9ccba91d5ce672a12d2eac49f5a..aafd2fdcdaf9204cd7f2def0faeb9ac38245d0cd 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
-#define TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
 
 #include <memory>
 #include <set>
@@ -40,8 +40,6 @@ struct ItemConfig {
   int placeholder_unknown_output_shape_dim = -1;
   // If true, does L1 optimizations.
   bool apply_optimizations = false;
-  // If true, does inlining.
-  bool inline_functions = false;
   // If true, erases all "_noinline" attributes from user-defined functions.
   // Has no effect if "inline_functions" is disabled.
   bool erase_noinline_attributes = false;
@@ -58,14 +56,7 @@ struct ItemConfig {
 std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     const string& id, const MetaGraphDef& meta_graph, const ItemConfig& cfg);
 
-// Factory method for creating a GrapplerItem from a FunctionDef.
-// Returns nullptr if the given function def cannot be converted.
-std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionDefLibrary& library);
-
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 68437b60419f73419bca4467b409818bc0b11650..4b90bf3038df2900315aa32e32a6635d834e4403 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -35,96 +35,6 @@ namespace {
 
 class GrapplerItemBuilderTest : public ::testing::Test {};
 
-// Create a sample graph with a symbolic gradient for sum.
-void SampleSumSymbolicGradientGraphdef(
-    GraphDef *def, CollectionDef *fetches,
-    std::vector<string> *names_of_ops_of_inline) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-
-  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-
-  auto dummy_variable = Variable(scope, {2, 2}, DT_FLOAT);
-  auto x = Const(scope, 1.0f);
-  auto y = Const(scope, 2);
-  auto z = Const(scope, 3.0f);
-  TF_ASSERT_OK(scope.status());
-
-  NameAttrList fn;
-  fn.set_name("Sum");
-  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
-  auto g0 = SymbolicGradient(scope, std::initializer_list<Input>{x, y, z},
-                             {DT_FLOAT, DT_INT32}, fn);
-
-  // TODO(bsteiner): we should rewrite the feed/fetch nodes to reflect the
-  // inlining that's done in the item builder
-  // fetches->mutable_node_list()->add_value(g0[0].name());
-  fetches->mutable_node_list()->add_value("SymbolicGradient/dx");
-  fetches->mutable_node_list()->add_value("SymbolicGradient/dy_reshaped");
-
-  TF_CHECK_OK(scope.ToGraphDef(def));
-
-  // Add names of the ops that replace the Mul symbolic gradient during
-  // inlining. This is for validation.
-  *names_of_ops_of_inline = {
-      "SymbolicGradient/dx",          "SymbolicGradient/tile_scaling",
-      "SymbolicGradient/dy_reshaped", "SymbolicGradient/y_shape",
-      "SymbolicGradient/x_shape",     "SymbolicGradient/stitch_idx0",
-      "SymbolicGradient/x_rank",      "SymbolicGradient/stitch_val1",
-      "SymbolicGradient/i_shape",     "SymbolicGradient/di",
-      "SymbolicGradient/zero",        "SymbolicGradient/one"};
-}
-
-std::unique_ptr<GrapplerItem> CreateGrapplerItem(const GraphDef &def,
-                                                 const CollectionDef &fetches) {
-  MetaGraphDef meta_def;
-  ItemConfig cfg;
-  cfg.inline_functions = true;
-  *meta_def.mutable_graph_def() = def;
-  (*meta_def.mutable_collection_def())["train_op"] = fetches;
-  return GrapplerItemFromMetaGraphDef("0", meta_def, cfg);
-}
-
-int CountSymbolicGradientOps(const std::unique_ptr<GrapplerItem> &item) {
-  int n_symb_grads = 0;
-  for (const auto &node : item->graph.node()) {
-    if (node.op() == FunctionLibraryDefinition::kGradientOp) {
-      n_symb_grads++;
-    }
-  }
-  return n_symb_grads;
-}
-
-int CountOpsWithNames(const std::unique_ptr<GrapplerItem> &item,
-                      const std::vector<string> &names) {
-  std::set<string> names_set(names.begin(), names.end());
-  int n_with_names = 0;
-  for (const auto &node : item->graph.node()) {
-    if (names_set.find(node.name()) != names_set.end()) {
-      n_with_names++;
-    }
-  }
-  return n_with_names;
-}
-
-TEST_F(GrapplerItemBuilderTest, SymbolicGradientInlining) {
-  // Create sample sum symbolic gradient graph.
-  GraphDef def;
-  CollectionDef fetches;
-  std::vector<string> ops_of_inline;
-  SampleSumSymbolicGradientGraphdef(&def, &fetches, &ops_of_inline);
-
-  // Create the inlined graph.
-  std::unique_ptr<GrapplerItem> with_inline = CreateGrapplerItem(def, fetches);
-
-  // For the inlined graph, there should be 0 symbolic gradient ops.
-  EXPECT_EQ(0, CountSymbolicGradientOps(with_inline));
-
-  // For the inlined graph, make sure all the required expanded op’s are in the
-  // graph.
-  EXPECT_EQ(ops_of_inline.size(),
-            CountOpsWithNames(with_inline, ops_of_inline));
-}
-
 TEST_F(GrapplerItemBuilderTest, AssetFilepathOverrideTest) {
   MetaGraphDef meta_graph;
 
@@ -273,210 +183,134 @@ TEST_F(GrapplerItemBuilderTest, GraphWithFunctions) {
   (*meta_graph.mutable_collection_def())["train_op"] = train_op;
 
   ItemConfig cfg;
-  cfg.inline_functions = false;
 
   std::unique_ptr<GrapplerItem> item =
       GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
   ASSERT_TRUE(item != nullptr);
 }
 
-TEST_F(GrapplerItemBuilderTest, FromSimpleFunctionDef) {
-  const Tensor kTwo = test::AsScalar<int64>(2);
-  FunctionDef func = FunctionDefHelper::Define(
-      // Name
-      "XTimesTwo",
-      // Args
-      {"x: T"},
-      // Return values
-      {"y: T"},
-      // Attr def
-      {"T: {float, double, int32, int64}"},
-      // Nodes
-      {
-          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
-          {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
-          {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
-      });
+TEST_F(GrapplerItemBuilderTest, GraphWithCustomOps) {
+  MetaGraphDef meta_graph;
+  // y = XTimesTwo(x)
+  constexpr char device[] = "/cpu:0";
+  *meta_graph.mutable_graph_def() = test::function::GDef(
+      {test::function::NDef("x", "Const", {}, {{"dtype", DT_FLOAT}}, device),
+       test::function::NDef("y", "CustomOp", {"x"}, {{"T", DT_FLOAT}}, device)},
+      {});
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
-  CHECK(item);
-  EXPECT_EQ("XTimesTwo", item->id);
-  EXPECT_EQ(4, item->graph.node_size());
-  EXPECT_EQ(std::vector<string>({"y"}), item->fetch);
-  EXPECT_EQ(1, item->feed.size());
-  EXPECT_EQ("x", item->feed[0].first);
+  CollectionDef train_op;
+  train_op.mutable_node_list()->add_value("y");
+  (*meta_graph.mutable_collection_def())["train_op"] = train_op;
 
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "x") {
-      EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
-      EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "two") {
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "scale") {
-      EXPECT_EQ("Cast", node.op());
-      EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("two:0", node.input(0));
-    } else if (node.name() == "y") {
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("scale:0", node.input(1));
-    }
-  }
+  ItemConfig cfg;
+
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
+  ASSERT_TRUE(item != nullptr);
 }
 
-TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithMultiOutputNodes) {
-  // Gradient graph for the Subtract operation
-  std::vector<FunctionDefHelper::Node> nodes = {
-      {{"sx"}, "Shape", {"x"}},
-      {{"sy"}, "Shape", {"y"}},
-      {{"gx"}, "Identity", {"dz"}},
-      {{"gy"}, "Neg", {"dz"}},
-      {{"rx", "ry"}, "BroadcastGradientArgs", {"sx", "sy"}},
-      {{"sum_gx"}, "Sum", {"gx", "rx"}},
-      {{"dx"}, "Reshape", {"sum_gx", "sx"}},
-      {{"sum_gy"}, "Sum", {"gy", "ry"}},
-      {{"dy"}, "Reshape", {"sum_gy", "sy"}},
-  };
-
-  for (auto &n : nodes) {
-    // "BroadcastGradientArgs" doesn't need any attrs.
-    if (n.attr.empty() && n.op != "BroadcastGradientArgs") {
-      n.attr = {{"T", "$T"}};
-    }
-  }
-  FunctionDef func = FunctionDefHelper::Define(
-      // Name
-      "SubGrad",
-      // Arg defs
-      {"x: T", "y: T", "dz: T"},
-      // Ret val defs
-      {"dx: T", "dy: T"},
-      // Attr defs
-      {{"T: {half, float, double}"}},
-      // Nodes
-      nodes);
-
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
+TEST_F(GrapplerItemBuilderTest, FromGraphWithSignatureDef) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 0);
+  auto y = ops::Const(s.WithOpName("y"), 1);
+  auto z = ops::Add(s.WithOpName("z"), x, y);
+
+  MetaGraphDef meta_graph;
+  TF_CHECK_OK(s.ToGraphDef(meta_graph.mutable_graph_def()));
+
+  TensorInfo input, output;
+  input.set_name("x");
+  input.set_dtype(DT_FLOAT);
+  output.set_name("z");
+  SignatureDef serving_signature;
+  (*serving_signature.mutable_inputs())["input"] = input;
+  (*serving_signature.mutable_outputs())["output"] = output;
+  (*meta_graph.mutable_signature_def())["serving"] = serving_signature;
+
+  // It should be able to dedup the input and output with same names.
+  TensorInfo input2, output2;
+  input.set_name("x");
+  input.set_dtype(DT_FLOAT);
+  output.set_name("z");
+  SignatureDef serving_signature2;
+  (*serving_signature.mutable_inputs())["input2"] = input2;
+  (*serving_signature.mutable_outputs())["output2"] = output2;
+  (*meta_graph.mutable_signature_def())["serving2"] = serving_signature2;
+
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
-  CHECK(item);
-  EXPECT_EQ("SubGrad", item->id);
-  EXPECT_EQ(12, item->graph.node_size());
-  EXPECT_EQ(std::vector<string>({"dx", "dy"}), item->fetch);
-  EXPECT_EQ(3, item->feed.size());
-  EXPECT_EQ("x", item->feed[0].first);
-  EXPECT_EQ("y", item->feed[1].first);
-  EXPECT_EQ("dz", item->feed[2].first);
+      GrapplerItemFromMetaGraphDef("0", meta_graph, ItemConfig());
+  ASSERT_TRUE(item != nullptr);
 
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "x" || node.name() == "y" || node.name() == "dz") {
-      EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
-      EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "rx") {
-      EXPECT_EQ("BroadcastGradientArgs", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("sx:0", node.input(0));
-      EXPECT_EQ("sy:0", node.input(1));
-    } else if (node.name() == "sum_gx") {
-      EXPECT_EQ("Sum", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("gx:0", node.input(0));
-      EXPECT_EQ("rx:0", node.input(1));
-    } else if (node.name() == "sum_gy") {
-      EXPECT_EQ("Sum", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("gy:0", node.input(0));
-      EXPECT_EQ("rx:1", node.input(1));
-    }
-  }
+  EXPECT_EQ(item->feed.size(), 1);
+  EXPECT_EQ(item->fetch.size(), 1);
+  EXPECT_EQ(item->feed[0].first, "x");
+  EXPECT_EQ(item->fetch[0], "z");
 }
 
-TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithNestedFuncs) {
-  FunctionDefLibrary library;
-  *library.add_function() = FunctionDefHelper::Define(
-      // Name
-      "Swap",
-      // Args
-      {"i0: T", "i1: T"},
-      // Return values
-      {"o0: T", "o1: T"},
-      // Attr def
-      {"T: {float, double}"},
-      // Nodes
-      {{{"o0"}, "Identity", {"i1"}, {{"T", "$T"}}},
-       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
-
-  FunctionDef func = FunctionDefHelper::Create(
-      // Name
-      "ManySwapsFirst",
-      // Args
-      {"x: float", "y: float"},
-      // Return values
-      {"o: float"},
-      // attr def
-      {},
-      // Nodes
-      // o = x*x + y*y.  Furthermore, The 1st swap depends on x2, and
-      // y2 depends on the 2nd swap.  The 2nd swap has data dependency
-      // on the 1st swap.
-      {{{"a0"}, "Swap", {"x", "y"}, {{"T", DT_FLOAT}}, {"x2"}},
-       {{"a1"}, "Swap", {"a0:o0:0", "a0:o1:0"}, {{"T", DT_FLOAT}}},
-       {{"x2"}, "Mul", {"x", "x"}, {{"T", DT_FLOAT}}},
-       {{"y2"}, "Mul", {"y", "y"}, {{"T", DT_FLOAT}}, {"a1"}},
-       {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
-      {{"o", "o:z:0"}});
-
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+TEST_F(GrapplerItemBuilderTest, FromGraphWithIncompleteSignatureDef) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 0);
+  auto y = ops::Const(s.WithOpName("y"), 1);
+
+  MetaGraphDef meta_graph;
+  TF_CHECK_OK(s.ToGraphDef(meta_graph.mutable_graph_def()));
+
+  CollectionDef train_op;
+  train_op.mutable_node_list()->add_value("y");
+  (*meta_graph.mutable_collection_def())["train_op"] = train_op;
+
+  TensorInfo input, output;
+  input.set_name("x");
+  input.set_dtype(DT_FLOAT);
+  // Its coo_sparse proto is incomplete.
+  output.mutable_coo_sparse()->set_values_tensor_name("z");
+  SignatureDef serving_signature;
+  (*serving_signature.mutable_inputs())["input"] = input;
+  (*serving_signature.mutable_outputs())["output"] = output;
+  (*meta_graph.mutable_signature_def())["serving"] = serving_signature;
+
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+      GrapplerItemFromMetaGraphDef("0", meta_graph, ItemConfig());
+  ASSERT_TRUE(item == nullptr);
+}
 
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "x" || node.name() == "y") {
-      EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
-      EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "a0") {
-      EXPECT_EQ("Swap", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("y", node.input(1));
-      EXPECT_EQ("^x2", node.input(2));
-    } else if (node.name() == "a1") {
-      EXPECT_EQ("Swap", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("a0:0", node.input(0));
-      EXPECT_EQ("a0:1", node.input(1));
-    } else if (node.name() == "x2") {
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("x", node.input(1));
-    } else if (node.name() == "y2") {
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("y", node.input(0));
-      EXPECT_EQ("y", node.input(1));
-      EXPECT_EQ("^a1", node.input(2));
-    } else if (node.name() == "o") {
-      EXPECT_EQ("Add", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x2:0", node.input(0));
-      EXPECT_EQ("y2:0", node.input(1));
-    }
-  }
+TEST_F(GrapplerItemBuilderTest, FromGraphWithUnknownDimInSignatureInput) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto shape_1d = PartialTensorShape({-1});
+  auto x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                            ops::Placeholder::Shape(shape_1d));
+  auto y = ops::Const(s.WithOpName("y"), static_cast<float>(1.0));
+  auto z = ops::Add(s.WithOpName("z"), x, y);
+
+  MetaGraphDef meta_graph;
+  TF_CHECK_OK(s.ToGraphDef(meta_graph.mutable_graph_def()));
+
+  TensorInfo input, output;
+  input.set_name("x");
+  input.set_dtype(DT_FLOAT);
+  shape_1d.AsProto(input.mutable_tensor_shape());
+  output.set_name("z");
+
+  SignatureDef serving_signature;
+  (*serving_signature.mutable_inputs())["input"] = input;
+  (*serving_signature.mutable_outputs())["output"] = output;
+  (*meta_graph.mutable_signature_def())["serving"] = serving_signature;
+
+  ItemConfig cfg;
+  cfg.placeholder_unknown_output_shape_dim = 64;
+  std::unique_ptr<GrapplerItem> item1 =
+      GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
+  ASSERT_TRUE(item1 != nullptr);
+
+  ASSERT_EQ(item1->feed.size(), 1);
+  EXPECT_EQ(item1->feed[0].second.NumElements(), 64);
+
+  std::unique_ptr<GrapplerItem> item2 =
+      GrapplerItemFromMetaGraphDef("0", meta_graph, ItemConfig());
+  ASSERT_TRUE(item2 != nullptr);
+
+  ASSERT_EQ(item2->feed.size(), 1);
+  EXPECT_EQ(item2->feed[0].second.NumElements(), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index b683216590ede887d9c65003a23e712e0d612622..ffa204028cca828147810c99277fdcd9cb05f5ee 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "utils",
     srcs = [
diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h
index b597319261011e2537848a34167f69cf1e3002f0..f3e9ecb677fdf8dd9ec0866d06ddd2745f3d5006 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.h
@@ -18,8 +18,8 @@ limitations under the License.
 // that may be stored in the checkpoint are not restored in order to speedup the
 // initialization.
 
-#ifndef TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
-#define TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#define TENSORFLOW_CORE_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
 
 #include <stddef.h>
 #include <limits>
@@ -53,4 +53,4 @@ class FileInputYielder : public InputYielder {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
diff --git a/tensorflow/core/grappler/inputs/input_yielder.h b/tensorflow/core/grappler/inputs/input_yielder.h
index c9f90820a9928a405cce047b6563e1b8b2af0626..06f642c513018dd36e71164e71085d72cf42785b 100644
--- a/tensorflow/core/grappler/inputs/input_yielder.h
+++ b/tensorflow/core/grappler/inputs/input_yielder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_INPUTS_INPUT_YIELDER_H_
-#define TENSORFLOW_GRAPPLER_INPUTS_INPUT_YIELDER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_INPUT_YIELDER_H_
+#define TENSORFLOW_CORE_GRAPPLER_INPUTS_INPUT_YIELDER_H_
 
 namespace tensorflow {
 namespace grappler {
@@ -32,4 +32,4 @@ class InputYielder {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_INPUTS_INPUT_YIELDER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_INPUTS_INPUT_YIELDER_H_
diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h
index 434b660614b4267df7344222b449c5224db35996..74e5080a30f9c6804012c64ac2df7f1ddf33d654 100644
--- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
-#define TENSORFLOW_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
+#define TENSORFLOW_CORE_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
 
 #include <string>
 #include <vector>
@@ -44,4 +44,4 @@ class TrivialTestGraphInputYielder : public InputYielder {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
diff --git a/tensorflow/core/grappler/inputs/utils.h b/tensorflow/core/grappler/inputs/utils.h
index 00fcfa7a3f4b6295ce361f0841288fc146d1e43b..627dd5359fe181364d497c4bf4f36c136feb78af 100644
--- a/tensorflow/core/grappler/inputs/utils.h
+++ b/tensorflow/core/grappler/inputs/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_INPUTS_UTILS_H_
-#define TENSORFLOW_GRAPPLER_INPUTS_UTILS_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_INPUTS_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_INPUTS_UTILS_H_
 
 #include <set>
 #include <vector>
@@ -37,4 +37,4 @@ Status ReadGraphDefFromFile(const std::string& graph_def_pbtxt_path,
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_INPUTS_UTILS_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_INPUTS_UTILS_H_
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index fdf4540540b4b9f3d64ea767240ca4ea0c353d48..7a89c263744d60e4d758840e196025e84ed0d2d6 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -68,16 +70,28 @@ bool IsBitcast(const NodeDef& node) { return node.op() == "Bitcast"; }
 
 bool IsCast(const NodeDef& node) { return node.op() == "Cast"; }
 
+bool IsCheckNumerics(const NodeDef& node) {
+  return node.op() == "CheckNumerics";
+}
+
 bool IsComplex(const NodeDef& node) { return node.op() == "Complex"; }
 
 bool IsComplexAbs(const NodeDef& node) { return node.op() == "ComplexAbs"; }
 
+bool IsConcat(const NodeDef& node) {
+  return node.op() == "Concat" || node.op() == "ConcatV2";
+}
+
 bool IsConcatOffset(const NodeDef& node) { return node.op() == "ConcatOffset"; }
 
 bool IsConstant(const NodeDef& node) { return node.op() == "Const"; }
 
 bool IsConj(const NodeDef& node) { return node.op() == "Conj"; }
 
+bool IsConjugateTranspose(const NodeDef& node) {
+  return node.op() == "ConjugateTranspose";
+}
+
 bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
 
 bool IsConv2DBackpropFilter(const NodeDef& node) {
@@ -144,6 +158,9 @@ bool IsHistogramSummary(const NodeDef& node) {
 
 bool IsIdentity(const NodeDef& node) {
   const auto& op = node.op();
+  if (op == "IdentityN" && node.attr().at("T").list().type_size() == 1) {
+    return true;
+  }
   return op == "Identity" || op == "RefIdentity";
 }
 
@@ -201,6 +218,8 @@ bool IsMod(const NodeDef& node) { return node.op() == "Mod"; }
 
 bool IsMul(const NodeDef& node) { return node.op() == "Mul"; }
 
+bool IsNeg(const NodeDef& node) { return node.op() == "Neg"; }
+
 bool IsNoOp(const NodeDef& node) { return node.op() == "NoOp"; }
 
 bool IsNotEqual(const NodeDef& node) { return node.op() == "NotEqual"; }
@@ -210,6 +229,8 @@ bool IsNextIteration(const NodeDef& node) {
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
+bool IsPack(const NodeDef& node) { return node.op() == "Pack"; }
+
 bool IsPad(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Pad" || op == "PadV2";
@@ -225,8 +246,14 @@ bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
 
 bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
 
+bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
+
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
+bool IsRandomShuffle(const NodeDef& node) {
+  return node.op() == "RandomShuffle";
+}
+
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
 
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
@@ -256,8 +283,14 @@ bool IsRestore(const NodeDef& node) {
           node.op() == "RestoreSlice");
 }
 
+bool IsReverse(const NodeDef& node) {
+  return node.op() == "Reverse" || node.op() == "ReverseV2";
+}
+
 bool IsReverseV2(const NodeDef& node) { return node.op() == "ReverseV2"; }
 
+bool IsRsqrt(const NodeDef& node) { return node.op() == "Rsqrt"; }
+
 bool IsRsqrtGrad(const NodeDef& node) { return node.op() == "RsqrtGrad"; }
 
 bool IsSelect(const NodeDef& node) { return node.op() == "Select"; }
@@ -272,6 +305,8 @@ bool IsShape(const NodeDef& node) { return node.op() == "Shape"; }
 
 bool IsShapeN(const NodeDef& node) { return node.op() == "ShapeN"; }
 
+bool IsShuffle(const NodeDef& node) { return node.op() == "Shuffle"; }
+
 bool IsSigmoidGrad(const NodeDef& node) { return node.op() == "SigmoidGrad"; }
 
 bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; }
@@ -284,14 +319,31 @@ bool IsSplit(const NodeDef& node) { return node.op() == "Split"; }
 
 bool IsSplitV(const NodeDef& node) { return node.op() == "SplitV"; }
 
+bool IsSqrt(const NodeDef& node) { return node.op() == "Sqrt"; }
+
 bool IsSqrtGrad(const NodeDef& node) { return node.op() == "SqrtGrad"; }
 
+bool IsSquare(const NodeDef& node) { return node.op() == "Square"; }
+
 bool IsSquaredDifference(const NodeDef& node) {
   return node.op() == "SquaredDifference";
 }
 
 bool IsSqueeze(const NodeDef& node) { return node.op() == "Squeeze"; }
 
+bool IsStackOp(const NodeDef& node) {
+  return node.op() == "Stack" || node.op() == "StackV2";
+}
+bool IsStackCloseOp(const NodeDef& node) {
+  return node.op() == "StackClose" || node.op() == "StackCloseV2";
+}
+bool IsStackPushOp(const NodeDef& node) {
+  return node.op() == "StackPush" || node.op() == "StackPushV2";
+}
+bool IsStackPopOp(const NodeDef& node) {
+  return node.op() == "StackPop" || node.op() == "StackPopV2";
+}
+
 bool IsStopGradient(const NodeDef& node) {
   const auto& op = node.op();
   return op == "StopGradient" || op == "PreventGradient";
@@ -322,6 +374,8 @@ bool IsTruncateDiv(const NodeDef& node) { return node.op() == "TruncateDiv"; }
 
 bool IsTruncateMod(const NodeDef& node) { return node.op() == "TruncateMod"; }
 
+bool IsUnpack(const NodeDef& node) { return node.op() == "Unpack"; }
+
 bool IsVariable(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
@@ -346,7 +400,8 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
     return false;
   }
   const OpDef* op_def = nullptr;
-  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+  const string& op_name = node.op();
+  Status status = OpRegistry::Global()->LookUpOpDef(op_name, &op_def);
   if (!status.ok()) {
     return false;
   }
@@ -359,11 +414,27 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
       return false;
     }
   }
+  return !ModifiesInputsInPlace(node);
+}
+
+bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
-  if (GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace")) {
+  string op_name = node.op();
+
+  // Ops that modify resource variables effectively modify one of their inputs.
+  if (op_name == "AssignVariableOp" || op_name == "AssignAddVariableOp" ||
+      op_name == "AssignSubVariableOp" || op_name == "ResourceScatterUpdate" ||
+      op_name == "ResourceScatterAdd" || op_name == "ResourceScatterSub" ||
+      op_name == "ResourceScatterMul" || op_name == "ResourceScatterDiv" ||
+      op_name == "ResourceScatterMin" || op_name == "ResourceScatterMax") {
     return false;
   }
-  return true;
+
+  std::transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
+  if (str_util::StrContains(op_name, "inplace")) {
+    return true;
+  }
+  return GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace");
 }
 
 bool ModifiesFrameInfo(const NodeDef& node) {
@@ -387,20 +458,101 @@ OPDEF_PROPERTY_HELPER(Aggregate, aggregate)
 OPDEF_PROPERTY_HELPER(Commutative, commutative)
 
 bool IsInvolution(const NodeDef& node) {
-  const std::unordered_set<string> involution_ops{
-      "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"};
-  return involution_ops.count(node.op()) > 0;
+  static const std::unordered_set<string>* involution_ops =
+      CHECK_NOTNULL((new std::unordered_set<string>{
+          "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"}));
+  return involution_ops->count(node.op()) > 0;
 }
 
-bool IsValuePreserving(const NodeDef& node) {
+bool IsValueAndOrderPreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
-  const std::unordered_set<string> value_preserving_ops{
-      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
-      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
-      "ExpandDims", "Squeeze"};
-  return value_preserving_ops.count(node.op()) > 0;
+  static const std::unordered_set<string>* value_and_order_preserving_ops =
+      CHECK_NOTNULL((new const std::unordered_set<string>{
+          "CheckNumerics",
+          "DebugGradientIdentity",
+          "DeepCopy"
+          "Enter",
+          "Exit",
+          "ExpandDims",
+          "Identity",
+          "IdentityN",
+          "PreventGradient",
+          "Print",
+          "Reshape",
+          "Snapshot",
+          "Squeeze",
+          "StopGradient",
+      }));
+  return value_and_order_preserving_ops->count(node.op()) > 0;
+}
+
+bool IsValuePreserving(const NodeDef& node) {
+  static const std::unordered_set<string>* value_preserving_ops =
+      CHECK_NOTNULL((new std::unordered_set<string>{
+          "InvertPermutation",
+          "Reverse",
+          "Roll",
+          "Transpose",
+      }));
+  return IsValueAndOrderPreserving(node) ||
+         value_preserving_ops->count(node.op()) > 0;
+}
+
+bool IsUnaryElementWise(const NodeDef& node) {
+  static const std::unordered_set<string>* element_wise_ops =
+      CHECK_NOTNULL((new std::unordered_set<string>{
+          "Abs",
+          "Acos",
+          "Acosh",
+          "Asin",
+          "Asinh",
+          "Atan",
+          "Atan2",
+          "Atanh",
+          "Ceil",
+          "ComplexAbs",
+          "Conj",
+          "Cos",
+          "Cosh",
+          "Digamma",
+          "Elu"
+          "Erf",
+          "Erfc",
+          "Exp",
+          "Expm1",
+          "Floor",
+          "Inv",
+          "Invert",
+          "Isinf",
+          "Isnan",
+          "Isfinite",
+          "Lgamma",
+          "Log",
+          "Log1p",
+          "LogicalNot",
+          "Neg",
+          "Reciprocal",
+          "Relu",
+          "Relu6",
+          "Rint",
+          "Round",
+          "Selu",
+          "Rsqrt",
+          "Sigmoid",
+          "Sign",
+          "Sin",
+          "SinH",
+          "Softplus",
+          "Softsign",
+          "Sqrt",
+          "Square",
+          "Tan"
+          "Tanh",
+      }));
+  return element_wise_ops->count(node.op()) > 0 ||
+         (!IsIdentityN(node) && IsValueAndOrderPreserving(node));
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 9cda40c0a6515caa9754d0c2f4f50a32f9fe8d98..976d23e52795ba771e0fe1985b49a1a21801e1ae 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OP_TYPES_H_
-#define TENSORFLOW_GRAPPLER_OP_TYPES_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_
+#define TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_
 
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -37,9 +37,12 @@ bool IsBiasAdd(const NodeDef& node);
 bool IsBiasAddGrad(const NodeDef& node);
 bool IsBitcast(const NodeDef& node);
 bool IsCast(const NodeDef& node);
+bool IsCheckNumerics(const NodeDef& node);
 bool IsComplex(const NodeDef& node);
 bool IsComplexAbs(const NodeDef& node);
 bool IsConj(const NodeDef& node);
+bool IsConjugateTranspose(const NodeDef& node);
+bool IsConcat(const NodeDef& node);
 bool IsConcatOffset(const NodeDef& node);
 bool IsConstant(const NodeDef& node);
 bool IsConv2D(const NodeDef& node);
@@ -84,13 +87,18 @@ bool IsMod(const NodeDef& node);
 bool IsMul(const NodeDef& node);
 bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
+bool IsPack(const NodeDef& node);
 bool IsPad(const NodeDef& node);
+bool IsPack(const NodeDef& node);
+bool IsNeg(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
 bool IsPolygamma(const NodeDef& node);
+bool IsPrint(const NodeDef& node);
 bool IsProd(const NodeDef& node);
 bool IsPow(const NodeDef& node);
+bool IsRandomShuffle(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
 bool IsRelu6Grad(const NodeDef& node);
@@ -100,7 +108,9 @@ bool IsRecv(const NodeDef& node);
 bool IsReduction(const NodeDef& node);
 bool IsReshape(const NodeDef& node);
 bool IsRestore(const NodeDef& node);
+bool IsReverse(const NodeDef& node);
 bool IsReverseV2(const NodeDef& node);
+bool IsRsqrt(const NodeDef& node);
 bool IsRsqrtGrad(const NodeDef& node);
 bool IsSelect(const NodeDef& node);
 bool IsSeluGrad(const NodeDef& node);
@@ -108,14 +118,21 @@ bool IsSend(const NodeDef& node);
 bool IsSlice(const NodeDef& node);
 bool IsShape(const NodeDef& node);
 bool IsShapeN(const NodeDef& node);
+bool IsShuffle(const NodeDef& node);
 bool IsSigmoidGrad(const NodeDef& node);
 bool IsSoftplusGrad(const NodeDef& node);
 bool IsSoftsignGrad(const NodeDef& node);
 bool IsSplit(const NodeDef& node);
 bool IsSplitV(const NodeDef& node);
+bool IsSqrt(const NodeDef& node);
 bool IsSqrtGrad(const NodeDef& node);
+bool IsSquare(const NodeDef& node);
 bool IsSquaredDifference(const NodeDef& node);
 bool IsSqueeze(const NodeDef& node);
+bool IsStackOp(const NodeDef& node);
+bool IsStackCloseOp(const NodeDef& node);
+bool IsStackPushOp(const NodeDef& node);
+bool IsStackPopOp(const NodeDef& node);
 bool IsStopGradient(const NodeDef& node);
 bool IsStridedSlice(const NodeDef& node);
 bool IsStridedSliceGrad(const NodeDef& node);
@@ -127,6 +144,7 @@ bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsTruncateDiv(const NodeDef& node);
 bool IsTruncateMod(const NodeDef& node);
+bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
@@ -143,21 +161,31 @@ bool IsCommutative(const NodeDef& node);
 bool IsPersistent(const NodeDef& node);
 
 bool IsFreeOfSideEffect(const NodeDef& node);
+
 bool ModifiesFrameInfo(const NodeDef& node);
 
+// Returns true if the op is known to write to one or more of its inputs.
+bool ModifiesInputsInPlace(const NodeDef& node);
+
 // Returns true if the op is an element-wise involution, i.e. if it is its
 // own inverse such that f(f(x)) == x.
 bool IsInvolution(const NodeDef& node);
 
+// Returns true if the op preserves the order and value of elements in its
+// first input tensor and possible changes its shape.
+bool IsValueAndOrderPreserving(const NodeDef& node);
+
 // Returns true if the op in node only rearranges the order of elements in its
 // first input tensor and possible changes its shape. More precisely, this
 // function returns true if the op commutes with all element-wise operations.
 bool IsValuePreserving(const NodeDef& node);
 
+bool IsUnaryElementWise(const NodeDef& node);
+
 // Returns true if we can find an opdef corresponding to the op of the node.
 bool HasOpDef(const NodeDef& node);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OP_TYPES_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index e839630605a96f1528114f98b88e90a7a20b0a3a..5b5e1e024e8cfaae638c453b469ae4881bcd3df8 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -1,17 +1,19 @@
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+# Platform specific build config
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_protos_grappler",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
 )
 
 cc_library(
@@ -35,7 +37,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "static_schedule_test",
     srcs = ["static_schedule_test.cc"],
     deps = [
@@ -70,7 +72,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "auto_parallel_test",
     srcs = ["auto_parallel_test.cc"],
     deps = [
@@ -129,6 +131,50 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "function_optimizer",
+    srcs = ["function_optimizer.cc"],
+    hdrs = [
+        "function_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_optimizer",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/utils:functions",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "function_optimizer_test",
+    srcs = ["function_optimizer_test.cc"],
+    deps = [
+        ":function_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:functional_ops",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 cc_library(
     name = "graph_rewriter",
     srcs = ["graph_rewriter.cc"],
@@ -157,6 +203,50 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "graph_optimizer_stage",
+    srcs = ["graph_optimizer_stage.cc"],
+    hdrs = ["graph_optimizer_stage.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:frame",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "graph_optimizer_stage_test",
+    size = "small",
+    srcs = ["graph_optimizer_stage_test.cc"],
+    deps = [
+        ":graph_optimizer_stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
+cc_library(
+    name = "custom_graph_optimizer",
+    hdrs = [
+        "custom_graph_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_optimizer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "arithmetic_optimizer",
     srcs = ["arithmetic_optimizer.cc"],
@@ -167,6 +257,8 @@ cc_library(
     deps = [
         ":constant_folding",
         ":graph_optimizer",
+        ":graph_optimizer_stage",
+        ":symbolic_shapes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -176,10 +268,11 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "arithmetic_optimizer_test",
     size = "small",
     srcs = ["arithmetic_optimizer_test.cc"],
@@ -188,12 +281,19 @@ tf_cc_test(
         ":constant_folding",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -219,7 +319,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "dependency_optimizer_test",
     size = "small",
     srcs = ["dependency_optimizer_test.cc"],
@@ -255,7 +355,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "model_pruner_test",
     srcs = ["model_pruner_test.cc"],
     deps = [
@@ -264,15 +364,47 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
+tf_kernel_library(
+    name = "gpu_swapping_kernels",
+    srcs = [
+        "gpu_swapping_kernels.cc",
+    ],
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
     ],
 )
 
+cc_library(
+    name = "gpu_swapping_ops",
+    srcs = [
+        "gpu_swapping_ops.cc",
+    ],
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "memory_optimizer",
-    srcs = ["memory_optimizer.cc"],
+    srcs = [
+        "memory_optimizer.cc",
+    ],
     hdrs = [
         "memory_optimizer.h",
     ],
@@ -282,6 +414,7 @@ cc_library(
         ":graph_rewriter",
         ":static_schedule",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
@@ -295,10 +428,13 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_only_cc_test(
     name = "memory_optimizer_test",
     srcs = ["memory_optimizer_test.cc"],
+    tags = ["no_cuda_on_cpu_tap"],  # Do not re-enable again without actually testing.
     deps = [
+        ":gpu_swapping_kernels",
+        ":gpu_swapping_ops",
         ":memory_optimizer",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:ops",
@@ -336,7 +472,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "layout_optimizer_test",
     srcs = ["layout_optimizer_test.cc"],
     deps = [
@@ -351,9 +487,13 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:single_machine",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:virtual_placer",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -368,20 +508,88 @@ cc_library(
         ":arithmetic_optimizer",
         ":auto_parallel",
         ":constant_folding",
+        ":custom_graph_optimizer",
+        ":custom_graph_optimizer_registry",
+        ":debug_stripper",
         ":dependency_optimizer",
+        ":function_optimizer",
         ":graph_optimizer",
         ":layout_optimizer",
         ":loop_optimizer",
         ":memory_optimizer",
         ":model_pruner",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:colocation",
+        "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
+tf_cuda_cc_test(
+    name = "meta_optimizer_test",
+    srcs = ["meta_optimizer_test.cc"],
+    deps = [
+        ":custom_graph_optimizer",
+        ":custom_graph_optimizer_registry",
+        ":meta_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
+# This rule is header-only unless the build is static (--config=monolithic). Its
+# implementation is included directly in the framework shared object.
+cc_library(
+    name = "custom_graph_optimizer_registry",
+    hdrs = ["custom_graph_optimizer_registry.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":custom_graph_optimizer",
+        "//tensorflow/core:lib",
+    ] + if_static(
+        [":custom_graph_optimizer_registry_impl"],
+    ),
+)
+
+# This rule contains static variables for the optimizer registry. Do not depend
+# on it directly; use :custom_graph_optimizer_registry, and link against
+# libtensorflow_framework.so for the registry symbols.
+cc_library(
+    name = "custom_graph_optimizer_registry_impl",
+    srcs = ["custom_graph_optimizer_registry.cc"],
+    hdrs = ["custom_graph_optimizer_registry.h"],
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        ":custom_graph_optimizer",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "custom_graph_optimizer_registry_test",
+    size = "small",
+    srcs = ["custom_graph_optimizer_registry_test.cc"],
+    deps = [
+        ":custom_graph_optimizer",
+        ":custom_graph_optimizer_registry",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "loop_optimizer",
     srcs = ["loop_optimizer.cc"],
@@ -390,6 +598,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":constant_folding",
         ":graph_optimizer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -399,12 +608,12 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:frame",
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "loop_optimizer_test",
-    size = "small",
     srcs = ["loop_optimizer_test.cc"],
     deps = [
         ":loop_optimizer",
@@ -415,5 +624,65 @@ tf_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
+cc_library(
+    name = "symbolic_shapes",
+    srcs = ["symbolic_shapes.cc"],
+    hdrs = ["symbolic_shapes.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+    ] + tf_protos_grappler(),
+)
+
+tf_cc_test(
+    name = "symbolic_shapes_test",
+    srcs = ["symbolic_shapes_test.cc"],
+    deps = [
+        ":symbolic_shapes",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "debug_stripper",
+    srcs = ["debug_stripper.cc"],
+    hdrs = [
+        "debug_stripper.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:graph_optimizer",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "debug_stripper_test",
+    size = "small",
+    srcs = ["debug_stripper_test.cc"],
+    deps = [
+        ":debug_stripper",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 9c544c82bf7f77760e5a2090ca947fd7185e27b7..18076eee96e33aafd01247e75c38886cf66dd702 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
@@ -30,10 +31,13 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -45,58 +49,6 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-template <typename T>
-bool SafeSetTensorValue(double value, Tensor* tensor) {
-  using RealType = typename Eigen::NumTraits<T>::Real;
-  if (value > std::numeric_limits<RealType>::max() ||
-      value < std::numeric_limits<RealType>::min()) {
-    return false;
-  }
-  tensor->flat<T>()(0) = static_cast<T>(value);
-  return true;
-}
-
-#define HANDLE_CASE(DTYPE)                                          \
-  case DTYPE:                                                       \
-    if (!SafeSetTensorValue<EnumToDataType<DTYPE>::Type>(           \
-            static_cast<double>(value), tensor)) {                  \
-      return errors::InvalidArgument("Cannot store value ", value,  \
-                                     " in tensor of type " #DTYPE); \
-    }                                                               \
-    break
-
-Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
-  switch (dtype) {
-    //    HANDLE_CASE(DT_HALF);
-    HANDLE_CASE(DT_FLOAT);
-    HANDLE_CASE(DT_DOUBLE);
-    HANDLE_CASE(DT_UINT8);
-    HANDLE_CASE(DT_INT8);
-    HANDLE_CASE(DT_UINT16);
-    HANDLE_CASE(DT_INT16);
-    HANDLE_CASE(DT_INT32);
-    HANDLE_CASE(DT_INT64);
-    HANDLE_CASE(DT_COMPLEX64);
-    HANDLE_CASE(DT_COMPLEX128);
-    default:
-      return errors::InvalidArgument("Unexpected type ", DataTypeString(dtype));
-  }
-  return Status::OK();
-}
-
-template <typename T>
-bool AreInversePermutations(const std::vector<T>& a, const std::vector<T>& b) {
-  if (a.size() != b.size()) {
-    return false;
-  }
-  for (int i = 0; i < a.size(); ++i) {
-    if (a[b[i]] != i) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Extract values from a Const op to `values`. Returns true if succeeds.
 template <typename T>
 bool ValuesFromConstNode(const NodeDef& node, std::vector<T>* values) {
@@ -247,45 +199,22 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
 
 bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 
-const char kOutputShapesAttr[] = "_output_shapes";
-
-PartialTensorShape GetInputShape(const string& input, const NodeMap& node_map) {
-  int output_pos;
-  string node_name = ParseNodeName(input, &output_pos);
-  const NodeDef* input_node = node_map.GetNode(node_name);
-  return input_node->attr().at(kOutputShapesAttr).list().shape(output_pos);
-}
-
-bool ShapesEqual(const string& input_x, const string& input_y,
-                 const NodeMap& node_map) {
-  PartialTensorShape x_shape = GetInputShape(input_x, node_map);
-  PartialTensorShape y_shape = GetInputShape(input_y, node_map);
-  if (x_shape.unknown_rank() || y_shape.unknown_rank() ||
-      x_shape.dims() != y_shape.dims()) {
-    return false;
-  }
-  for (int i = 0; i < x_shape.dims(); ++i) {
-    if (x_shape.dim_size(i) == -1 || y_shape.dim_size(i) == -1 ||
-        x_shape.dim_size(i) != y_shape.dim_size(i)) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
-                       const int output_pos) {
-  if (!reshape.attr().count(kOutputShapesAttr) ||
-      !input.attr().count(kOutputShapesAttr)) {
+                       const int output_pos,
+                       const GraphProperties& graph_properties) {
+  const std::vector<OpInfo::TensorProperties>& reshape_props =
+      graph_properties.GetOutputProperties(reshape.name());
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      graph_properties.GetOutputProperties(input.name());
+  if (reshape_props.empty() || input_props.size() <= output_pos) {
     return false;
   }
 
-  PartialTensorShape src_shape(
-      input.attr().at(kOutputShapesAttr).list().shape(output_pos));
-  PartialTensorShape dst_shape(
-      reshape.attr().at(kOutputShapesAttr).list().shape(0));
+  const PartialTensorShape& src_shape = input_props[output_pos].shape();
+  const PartialTensorShape& dst_shape = reshape_props[0].shape();
+
   if (src_shape.unknown_rank() || dst_shape.unknown_rank()) {
     return false;
   }
@@ -298,7 +227,8 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
   // sizes.
   auto num_unknown_dim_sizes = [](const PartialTensorShape& partial_shape) {
     auto dim_sizes = partial_shape.dim_sizes();
-    return std::count(dim_sizes.begin(), dim_sizes.end(), -1);
+    return std::count_if(dim_sizes.begin(), dim_sizes.end(),
+                         [](int dim) { return dim < 0; });
   };
   int src_num_unknown_dim_sizes = num_unknown_dim_sizes(src_shape);
   int dst_num_unknown_dim_sizes = num_unknown_dim_sizes(dst_shape);
@@ -306,31 +236,1315 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
     return false;
   }
 
-  // Now, src_shape and dst_shape have at most one dimension with unknown
-  // sizes, and are compatible. Therefore, the reshape is a no-op when
-  //
-  // 1. at least one of them is fully-defined, or
-  // 2. both are partially defined and the -1 appears on the same dimension,
-  //    i.e., IsIdenticalTo returns true.
-  if (src_num_unknown_dim_sizes == 1 && dst_num_unknown_dim_sizes == 1) {
-    return dst_shape.IsIdenticalTo(src_shape);
+  // If dst_num_unknown_dim_sizes != src_num_unknown_dim_sizes we would weaken
+  // shape inference in subsequent passes if we removed this reshape.
+  if (src_num_unknown_dim_sizes != dst_num_unknown_dim_sizes) {
+    return false;
   }
 
-  return true;
+  // Remove the reshape if both are fully defined or partially defined and the
+  // unknown or symbolic shape appears on the same dimension, i.e., if
+  // IsIdenticalTo returns true.
+  return dst_shape.IsIdenticalTo(src_shape);
 }
 
 NodeDef* GetTailOfValuePreservingChain(
     const NodeDef& node, const NodeMap& node_map,
     const std::unordered_set<string>& nodes_to_preserve) {
   auto is_value_preserving_non_branching = [&](const NodeDef& node) {
-    return IsValuePreserving(node) &&
-           NumNonControlOutputs(node, node_map) == 1 &&
-           nodes_to_preserve.count(node.name()) == 0;
+    return nodes_to_preserve.find(node.name()) == nodes_to_preserve.end() &&
+           IsValuePreserving(node) && NumNonControlOutputs(node, node_map) == 1;
   };
   return GetTailOfChain(node, node_map, /*follow_control_input=*/false,
                         is_value_preserving_non_branching);
 }
 
+// Graph optimizer context extension specific to ArithmeticOptimizer.
+struct ArithmeticOptimizerContext {
+  explicit ArithmeticOptimizerContext(SetVector<NodeDef*>* nodes_to_simplify)
+      : nodes_to_simplify(nodes_to_simplify) {}
+  SetVector<NodeDef*>* nodes_to_simplify;
+};
+
+// Base class for single arithmetic optimization: e.g. Bitcast optimization,
+// AddOps optimization, etc...
+class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
+ public:
+  explicit ArithmeticOptimizerStage(const string& name,
+                                    const GraphOptimizerContext& ctx,
+                                    const ArithmeticOptimizerContext ctx_ext)
+      : GraphOptimizerStage("ArithmeticOptimizer", name, ctx),
+        ctx_ext_(ctx_ext) {}
+  virtual ~ArithmeticOptimizerStage() = default;
+
+ protected:
+  // Simplification graph rewrite can create additional nodes that are inputs
+  // to final simplified node, they can be also added to the arithmetic
+  // optimizer queue for further optimization.
+  void AddToOptimizationQueue(NodeDef* node) {
+    ctx_ext_.nodes_to_simplify->PushBack(node);
+  }
+
+  // TODO(ezhulenev): remove this method from ArithmeticOptimizer when all
+  // optimizations will be migrated to stages
+  void ForwardControlDependencies(
+      NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+    for (const auto& src : src_nodes) {
+      for (int i = src->input_size() - 1; i >= 0; --i) {
+        if (IsControlInput(src->input(i))) {
+          *target_node->add_input() = src->input(i);
+          ctx().node_map->AddOutput(NodeName(src->input(i)),
+                                    target_node->name());
+        } else {
+          break;
+        }
+      }
+    }
+  }
+
+ private:
+  // Extended context required for ArithmeticOptimizer.
+  const ArithmeticOptimizerContext ctx_ext_;
+};
+
+// Subtype of ArithmeticOptimizerStage that does optimization by rewriting a
+// group of nodes from the optimized graph.
+//
+// * AddOpsRewrite:
+//   Rewrite a group of Add/AddN with compact Add/AddN tree
+//
+// * MinimizeBroadcasts:
+//   Rewrite a group of binary associative ops, reordering
+//   inputs, to minimize the cost of broadcast
+class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
+ public:
+  explicit ArithmeticNodesGroupOptimizerStage(
+      const string& name, const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext ctx_ext)
+      : ArithmeticOptimizerStage(name, ctx, ctx_ext), optimized_nodes_{} {}
+  ~ArithmeticNodesGroupOptimizerStage() override = default;
+
+  // Input name with a statically inferred shape from GraphProperties
+  struct InputAndShape {
+    InputAndShape(const string& input, const TensorShapeProto& shape)
+        : input(input), shape(shape) {}
+    string input;
+    TensorShapeProto shape;
+  };
+
+  // Subgraph (subtree) of nodes, that we want to optimize in "one shot" (e.g.
+  // all the Add nodes that we plan to rewrite with a single AddN). Subgraph is
+  // obtained by graph traversal, starting from a root node.
+  struct OptimizedNodesGroup {
+    NodeDef* root_node;
+    TensorShapeProto root_shape;
+    // Optimized nodes that will be updated or removed by rewrite
+    std::vector<NodeDef*> optimized_nodes;
+    // Inputs to optimized nodes
+    std::vector<InputAndShape> inputs;
+  };
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    OptimizedNodesGroup group;
+    TF_RETURN_IF_ERROR(CreateOptimizedNodesGroup(node, &group));
+
+    if (!group.optimized_nodes.empty()) {
+      *simplified_node_name = RewriteOptimizedNodesGroup(group);
+    }
+
+    return Status::OK();
+  }
+
+ protected:
+  // Modify the optimized graph after nodes group was successfully identified
+  virtual string RewriteOptimizedNodesGroup(
+      const OptimizedNodesGroup& group) = 0;
+
+  // Check if input can become a part of current optimized nodes group.
+  virtual bool IsAbsorbableByOptimizedNodesGroup(
+      const OptimizedNodesGroup& group, const NodeDef& node) const = 0;
+
+  Status AbsorbInputByOptimizedNodesGroup(const string& input,
+                                          OptimizedNodesGroup* group) const {
+    std::deque<const string*> input_tensors;
+    input_tensors.push_front(&input);
+
+    while (!input_tensors.empty()) {
+      const string* input_tensor = input_tensors.front();
+      input_tensors.pop_front();
+
+      // Get a node for the input tensor.
+      NodeDef* input_node;
+      TF_RETURN_IF_ERROR(GetInputNode(*input_tensor, &input_node));
+
+      if (IsAbsorbableByOptimizedNodesGroup(*group, *input_node)) {
+        group->optimized_nodes.push_back(input_node);
+        for (int i = input_node->input_size() - 1; i >= 0; --i) {
+          const string& absorbed_node_input = input_node->input(i);
+          // TODO(ezhulenev): support control inputs
+          if (IsControlInput(absorbed_node_input)) continue;
+          input_tensors.push_front(&absorbed_node_input);
+        }
+      } else {
+        // If input node can't be absorbed, add it to OptimizedNodesGroup input.
+        OpInfo::TensorProperties properties;
+        TF_RETURN_IF_ERROR(GetTensorProperties(*input_tensor, &properties));
+        group->inputs.emplace_back(*input_tensor, properties.shape());
+      }
+    }
+
+    return Status::OK();
+  }
+
+  Status CreateOptimizedNodesGroup(NodeDef* root_node,
+                                   OptimizedNodesGroup* group) const {
+    OpInfo::TensorProperties root_node_output_properties;
+    TF_RETURN_IF_ERROR(
+        GetTensorProperties(root_node->name(), &root_node_output_properties));
+
+    group->root_node = root_node;
+    group->root_shape = root_node_output_properties.shape();
+
+    group->optimized_nodes.reserve(root_node->input_size());
+    for (int i = 0; i < root_node->input_size(); ++i) {
+      const string& input_i = root_node->input(i);
+      // TODO(ezhulenev): add support for control inputs
+      if (IsControlInput(input_i)) continue;
+      TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+    }
+
+    return Status::OK();
+  }
+
+  // Check if all inputs can be broadcasted to the same shape
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool HasAllInputsBroadcastableToShape(
+      const NodeDef& node, const OpInfo::TensorProperties& properties) const {
+    auto is_broadcastable = [this, &properties](const string& input) {
+      OpInfo::TensorProperties input_props;
+      Status has_input_properties = GetTensorProperties(input, &input_props);
+      return has_input_properties.ok() &&
+             ShapesBroadcastable(properties, input_props);
+    };
+    return std::all_of(node.input().begin(), node.input().end(),
+                       is_broadcastable);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool IsDrivenByControlDependency(const NodeDef& node) const {
+    return std::any_of(node.input().begin(), node.input().end(),
+                       IsControlInput);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool DrivesControlDependency(const NodeDef& node) const {
+    int position;
+    for (const NodeDef* output : ctx().node_map->GetOutputs(node.name())) {
+      for (int i = 0; i < output->input_size(); ++i) {
+        auto input = output->input(i);
+        string name = ParseNodeName(input, &position);
+        if (name == node.name() && /*control input*/ position < 0) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  string ShapeSignature(const TensorShapeProto& shape) const {
+    string signature = strings::StrCat("rank:", shape.dim_size(), ":dim");
+    for (int i = 0; i < shape.dim_size(); ++i)
+      strings::StrAppend(&signature, ":", shape.dim(i).size());
+    return signature;
+  }
+
+  void AddToOptimizedNodes(const NodeDef* node) {
+    optimized_nodes_.insert(node->name());
+  }
+
+  void AddAllMembersToOptimizedNodes(const OptimizedNodesGroup& group) {
+    AddToOptimizedNodes(group.root_node);
+    for (const NodeDef* opt : group.optimized_nodes) AddToOptimizedNodes(opt);
+  }
+
+  bool IsOnTheSameDevice(const OptimizedNodesGroup& group,
+                         const NodeDef& node) const {
+    return group.root_node->device() == node.device();
+  }
+
+  bool IsInPreserveSet(const NodeDef& node) const {
+    return ctx().nodes_to_preserve->find(node.name()) !=
+           ctx().nodes_to_preserve->end();
+  }
+
+  bool IsAlreadyOptimized(const NodeDef& node) const {
+    return optimized_nodes_.find(node.name()) != optimized_nodes_.end();
+  }
+
+ private:
+  // set of nodes already processed by this optimizer stage
+  std::unordered_set<string> optimized_nodes_;
+};
+
+// Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
+// original inputs of absorbed nodes.
+//
+// 1) All nodes must have the same device placement.
+//
+// 2) If All nodes in a Add/AddN subgraph have symbolically equal shape, tree is
+//    optimized to a single AddN node.
+//
+//                AddN_1
+//             /    |    \
+//          Add_1   z   Add_2       -> AddN(x, y, z, w, q, e)
+//          /  \        /  \
+//         x    y      w    Add_3
+//                          / \
+//                         q   e
+//
+// 3) If some nodes have different shape (it needs to be broadcastable to the
+//    shape of a "root), tree is optimized to AddNs for symbolically equal
+//    shapes, and a tree of Add ops, that minimize broadcasts.
+//
+//                AddN_1                                 Add
+//             /    |    \                              /  \
+//          Add_1   z   Add_2       ->               Add    w
+//          /  \        /  \                        /   \
+//         x    y      w    Add_3      AddN(x, y, q, e)  z
+//                          / \
+//                         q   e
+class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
+ public:
+  explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticNodesGroupOptimizerStage("AddOpsRewrite", ctx, ctx_ext) {}
+  ~AddOpsRewriteStage() override = default;
+
+  // Check if a node can become a root of AddOpsGroup
+  bool IsSupported(const NodeDef* node) const override {
+    if (!CanOptimize(*node)) return false;
+
+    // shape must be symbolically defined and all inputs compatible with it
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+ protected:
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const NodeDef& node) const override {
+    if (!CanOptimize(node)) return false;
+
+    if (!IsOnTheSameDevice(group, node)) {
+      return false;
+    }
+    // with a single output data consumer (presumably if we reach this node from
+    // previously absorbed or a root node, it means that this node is not used
+    // as an input to any other op, outside of the group)
+    if (NumNonControlDataOutputs(node, *ctx().node_map) != 1) {
+      return false;
+    }
+    // All input shapes must be broadcastable to the node shape
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node.name(), &properties);
+    return has_properties.ok() &&
+           HasAllInputsBroadcastableToShape(node, properties);
+  }
+
+  // Node requirements both for a root node and an absorbed node
+  bool CanOptimize(const NodeDef& node) const {
+    // TODO(ezhulenev): check if AccumulateNV2 can be supported too
+    if (!IsAdd(node) && !IsAddN(node)) {
+      return false;
+    }
+    if (IsInPreserveSet(node) || IsAlreadyOptimized(node)) {
+      return false;
+    }
+    // TODO(ezhulenev): relax this condition for root node
+    return !(IsDrivenByControlDependency(node) ||
+             DrivesControlDependency(node));
+  }
+
+  // Rewrite a group of add ops into a single AddN if all input shapes are
+  // symbolically equal. If not, create AddN for equal shapes first, and then
+  // build an Add tree, minimizing the cost of broadcasts.
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    VLOG(2) << "Collapse Add/AddN: root=" << group.root_node->name()
+            << " op=" << group.root_node->op()
+            << " num_optimized_nodes=" << group.optimized_nodes.size()
+            << " num_inputs=" << group.inputs.size();
+
+    // Do not optimize any of the nodes that are part of this group.
+    AddAllMembersToOptimizedNodes(group);
+
+    // All new nodes will be placed under the scope of a root node.
+    auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
+
+    // Find what shapes are present in the inputs of absorbed nodes.
+    std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
+    for (const auto& input : group.inputs) {
+      shape_sig_to_inputs[ShapeSignature(input.shape)].push_back(input);
+    }
+
+    using SigKV = decltype(shape_sig_to_inputs)::value_type;
+    VLOG(3) << "Add/AddN group has " << shape_sig_to_inputs.size()
+            << " unique shapes: "
+            << str_util::Join(shape_sig_to_inputs, ", ",
+                              [](string* out, SigKV p) {
+                                strings::StrAppend(out, p.first);
+                              });
+
+    // Collect all the shapes from representative elements.
+    std::vector<TensorShapeProto> shapes;
+    shapes.reserve(shape_sig_to_inputs.size());
+    for (const auto& el : shape_sig_to_inputs)
+      shapes.push_back(el.second[0].shape);
+
+    // If all inputs have the same shape, rewrite whole group with a single AddN
+    if (shapes.size() == 1) {
+      string node_name = OptimizedNodeName(root_scope_and_name);
+      AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
+                                        group.inputs);
+      return node_name;
+    }
+
+    // For inputs of different shapes:
+    // 1. Rewrite inputs of the same shape using AddN (leaf nodes)
+    // 2. Build a tree of Add nodes, minimizing cost of broadcast
+    std::sort(shapes.begin(), shapes.end(),
+              [](const TensorShapeProto& left, const TensorShapeProto& right) {
+                return CompareSymbolicallyShapedTensorSizes(left, right);
+              });
+
+    // optimized name for leaf AddN nodes
+    auto leaf_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Leaf_", i));
+    };
+    // optimized name for internal nodes of a tree built up from AddN leaves
+    auto internal_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Internal_", i));
+    };
+
+    // Add/AddN nodes that must be added to the tree
+    std::deque<InputAndShape> add_ops;
+
+    // Prepare leaf AddN nodes for inputs of equal shape
+    for (int i = 0; i < shapes.size(); ++i) {
+      const auto node_name = leaf_node_name(i);
+      const auto& inputs = shape_sig_to_inputs[ShapeSignature(shapes[i])];
+      add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node,
+                                                          node_name, inputs));
+    }
+
+    // Build up a tree of Add ops
+    int internal_nodes = 0;
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+      string name = add_ops.empty() ? OptimizedNodeName(root_scope_and_name)
+                                    : internal_node_name(internal_nodes++);
+      InputAndShape add = AddAggregatedInputs(*group.root_node, name, lhs, rhs);
+      add_ops.push_front(add);
+    } while (add_ops.size() > 1);
+
+    InputAndShape optimized_root_node = add_ops.front();
+    return optimized_root_node.input;
+  }
+
+  // Add 'AddN' node to aggregate inputs of symbolically equal shape
+  InputAndShape AddInputsOfSymbolicallyEqualShape(
+      const NodeDef& root_node, const string& node_name,
+      const std::vector<InputAndShape>& inputs) {
+    CHECK(!inputs.empty()) << "Inputs must be non-empty";
+
+    // Do not create redundant AddN nodes
+    if (inputs.size() == 1) {
+      return inputs[0];
+    }
+
+    // get shape from representative element
+    auto shape = inputs[0].shape;
+
+    // copy attributes from a root node
+    DataType dtype = root_node.attr().at("T").type();
+
+    // add new AddN node
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("AddN");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+    (*node->mutable_attr())["N"].set_i(inputs.size());
+
+    for (const auto& inputAndShape : inputs) {
+      ctx().node_map->AddOutput(inputAndShape.input, node_name);
+      node->add_input(inputAndShape.input);
+    }
+
+    AddToOptimizedNodes(node);
+    return InputAndShape(node_name, shape);
+  }
+
+  // Add a single 'Add' node to sum two inputs
+  InputAndShape AddAggregatedInputs(const NodeDef& root_node,
+                                    const string& node_name,
+                                    const InputAndShape& left,
+                                    const InputAndShape& right) {
+    // copy attributes from a root node
+    DataType dtype = root_node.attr().at("T").type();
+
+    // add new Add node
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("Add");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+
+    ctx().node_map->AddOutput(left.input, node_name);
+    ctx().node_map->AddOutput(right.input, node_name);
+
+    node->add_input(left.input);
+    node->add_input(right.input);
+
+    AddToOptimizedNodes(node);
+    return InputAndShape(
+        node_name, TensorShapeProto());  // shape is not important at this point
+  }
+};
+
+// Use the distributive property of multiplication and division over addition,
+// along with commutativity of the former, to hoist common factors/denominators
+// out of aggregate nodes where ALL the inputs are Mul/Div nodes.
+// This pattern occurs frequently in regularization terms for the gradients
+// during training.
+//
+// For example, we can rewrite an expression of the form:
+//   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
+// to the following:
+//   Mul(x, AddN(y1, y2, y3, ... yn))
+// For division, we can rewrite
+//   AddN(Div(y1, x), Div(y2, x), Div(y3, x), ... Div(yn, x))
+// to:
+//   Div(AddN(y1, y2, y3, ... yn), x)
+class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
+ public:
+  explicit HoistCommonFactorOutOfAggregation(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("HoistCommonFactor", ctx, ctx_ext) {}
+  ~HoistCommonFactorOutOfAggregation() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsAggregate(*node) && NumNonControlInputs(*node) > 1 &&
+           !IsRewritten(node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    bool common_factor_is_denominator = false;
+    std::set<string> common_factors;
+    std::vector<string> ctrl_deps;
+    TF_RETURN_IF_ERROR(GetCommonFactors(
+        node, &common_factors, &common_factor_is_denominator, &ctrl_deps));
+
+    if (common_factors.size() == 1) {
+      const string& common_factor = *common_factors.begin();
+
+      // Gather up the non-shared factors
+      bool shapes_match = true;
+      std::vector<string> unique_factors;
+      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor,
+                                          common_factor_is_denominator,
+                                          &shapes_match, &unique_factors));
+
+      if (shapes_match) {
+        NodeDef* input_0;
+        TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input_0));
+
+        // Use a copy of the first node for the outer multiplication/division.
+        NodeDef* new_outer_node = AddCopyNode(
+            OuterNodeName(node, common_factor_is_denominator), input_0);
+        // And a copy of aggregation node as one of the inner operands
+        NodeDef* new_add_node = AddCopyNode(InnerAddNodeName(node), node);
+
+        new_outer_node->set_device(node->device());
+        if (common_factor_is_denominator) {
+          new_outer_node->set_input(0, new_add_node->name());
+          new_outer_node->set_input(1, common_factor);
+        } else {
+          new_outer_node->set_input(0, common_factor);
+          new_outer_node->set_input(1, new_add_node->name());
+        }
+
+        ctx().node_map->AddOutput(common_factor, new_outer_node->name());
+        ctx().node_map->AddOutput(new_add_node->name(), new_outer_node->name());
+
+        // Hoist non-shared factors up into the new AddN node.
+        for (int i = 0; i < unique_factors.size(); ++i) {
+          const string& unique_factor_i = unique_factors[i];
+          new_add_node->set_input(i, unique_factor_i);
+          ctx().node_map->AddOutput(unique_factor_i, new_add_node->name());
+        }
+
+        // Add control deps on add node
+        for (const string& ctrl_dep : ctrl_deps) {
+          *new_add_node->add_input() = ctrl_dep;
+          ctx().node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
+        }
+
+        // optimize new inner aggregation node
+        AddToOptimizationQueue(new_add_node);
+        // do not optimize the same node twice
+        rewritten_nodes_.insert(node->name());
+        *simplified_node_name = new_outer_node->name();
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Get a name for new outer node
+  string OuterNodeName(const NodeDef* node, bool is_div) const {
+    auto scope_and_name = ParseNodeScopeAndName(node->name());
+    return is_div ? OptimizedNodeName(scope_and_name, "Div")
+                  : OptimizedNodeName(scope_and_name, "Mul");
+  }
+
+  // Get a name new inner Add node
+  string InnerAddNodeName(const NodeDef* node) const {
+    auto scope_and_name = ParseNodeScopeAndName(node->name());
+    return OptimizedNodeName(scope_and_name, "Add");
+  }
+
+  // Determine the set of common factors if the input nodes are all Mul or
+  // Div nodes.
+  Status GetCommonFactors(const NodeDef* node, std::set<string>* common_factors,
+                          bool* common_factor_is_denominator,
+                          std::vector<string>* ctrl_deps) const {
+    CHECK(common_factors->empty());
+    CHECK_NOTNULL(common_factor_is_denominator);
+    *common_factor_is_denominator = false;
+
+    bool has_mul = false;
+    bool has_div = false;
+    for (int i = 0; i < node->input_size(); ++i) {
+      if (i > 0 && common_factors->empty()) break;
+      if (IsControlInput(node->input(i))) {
+        ctrl_deps->push_back(node->input(i));
+        continue;
+      }
+      NodeDef* input;
+      TF_RETURN_IF_ERROR(GetInputNode(node->input(i), &input));
+
+      if ((!IsMul(*input) && !IsAnyDiv(*input)) || (IsMul(*input) && has_div) ||
+          (IsAnyDiv(*input) && has_mul)) {
+        // Break if input is neither a Mul or Div, or if there are both Mul &
+        // Div Ops.
+        common_factors->clear();
+        break;
+      } else if (IsAnyDiv(*input)) {
+        has_div = true;
+        // In case of possible common dividers, we avoid hoisting out if any
+        // input is not float/double, since integer division is not distributive
+        // over addition.
+        OpInfo::TensorProperties properties0, properties1;
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(0), &properties0));
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(1), &properties1));
+        if (properties0.dtype() != DT_FLOAT &&
+            properties0.dtype() != DT_DOUBLE &&
+            properties1.dtype() != DT_FLOAT &&
+            properties1.dtype() != DT_DOUBLE) {
+          common_factors->clear();
+          break;
+        }
+      } else if (IsMul(*input)) {
+        has_mul = true;
+      }
+
+      // We only focus on common factors from denominators if any Op is a
+      // Div.
+      std::set<string> factors_i =
+          has_mul ? std::set<string>{input->input(0), input->input(1)}
+                  : std::set<string>{input->input(1)};
+      if (i == 0) {
+        std::swap(*common_factors, factors_i);
+      } else {
+        std::set<string> intersection;
+        std::set_intersection(
+            factors_i.begin(), factors_i.end(), common_factors->begin(),
+            common_factors->end(),
+            std::inserter(intersection, intersection.begin()));
+        std::swap(*common_factors, intersection);
+      }
+      for (int i = 2; i < input->input_size(); ++i) {
+        ctrl_deps->push_back(input->input(i));
+      }
+    }
+
+    *common_factor_is_denominator = has_div;
+    return Status::OK();
+  }
+
+  // Gather up the non-shared factors (the y's in the example).
+  // Unless the aggregation is Add, we have to make sure that all the y's
+  // have the same shape since the other aggregation ops do not support
+  // broadcasting.
+  Status GetUniqueFactors(const NodeDef* node, const string& common_factor,
+                          const bool common_factor_is_denominator,
+                          bool* shapes_match,
+                          std::vector<string>* unique_factors) const {
+    *shapes_match = true;
+    unique_factors->reserve(node->input_size());
+
+    for (int i = 0; i < node->input_size() && shapes_match; ++i) {
+      const string& input = node->input(i);
+      if (IsControlInput(input)) {
+        break;
+      }
+      NodeDef* inner_node;
+      TF_RETURN_IF_ERROR(GetInputNode(input, &inner_node));
+      const int unique_factor_index =
+          common_factor_is_denominator
+              ? 0
+              : (inner_node->input(0) == common_factor ? 1 : 0);
+      unique_factors->push_back(inner_node->input(unique_factor_index));
+      if (i > 0 && !IsAdd(*node)) {
+        OpInfo::TensorProperties lhs;
+        OpInfo::TensorProperties rhs;
+        TF_RETURN_IF_ERROR(GetTensorProperties(unique_factors->front(), &lhs));
+        TF_RETURN_IF_ERROR(GetTensorProperties(unique_factors->back(), &rhs));
+        *shapes_match = ShapesSymbolicallyEqual(lhs, rhs);
+      }
+    }
+    return Status::OK();
+  }
+
+  bool IsRewritten(const NodeDef* node) const {
+    // if graph rewrite happens in multiple passes without graph pruning between
+    // them, it's possible that rewritten node already exists in a graph
+    return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() ||
+           ctx().node_map->NodeExists(OuterNodeName(node, false)) ||
+           ctx().node_map->NodeExists(OuterNodeName(node, true));
+  }
+
+  // keep names of the nodes that were optimized by this stage
+  std::unordered_set<string> rewritten_nodes_;
+};
+
+// Binary associative ops can be re-ordered to minimize the number of broadcasts
+// and the size of a temporary tensors.
+//
+// Example: [a, c] - scalars, [b, d] - matrices
+//   @ - binary associative op (Add or Mul)
+//   @* - broadcast
+//
+//           @                      @*
+//        /     \                /      \
+//      @*       @*      ->     @        @
+//    /   \    /   \          /   \    /   \
+//   a     b  c     d        a     c  b     d
+class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
+ public:
+  explicit MinimizeBroadcasts(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticNodesGroupOptimizerStage("MinimizeBroadcasts", ctx, ctx_ext) {
+  }
+  ~MinimizeBroadcasts() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsBinaryAssociative(*node)) return false;
+    if (IsAlreadyOptimized(*node)) return false;
+
+    // has a symbolically defined shape with broadcastable inputs
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+ protected:
+  bool IsBinaryAssociative(const NodeDef& node) const {
+    return IsMul(node) || IsAdd(node);
+  }
+
+  bool IsSameOp(const OptimizedNodesGroup& group, const NodeDef& node) const {
+    return group.root_node->op() == node.op();
+  }
+
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const NodeDef& node) const override {
+    if (!IsSameOp(group, node)) {
+      return false;
+    }
+    if (IsInPreserveSet(node) || IsAlreadyOptimized(node)) {
+      return false;
+    }
+    if (IsDrivenByControlDependency(node) || DrivesControlDependency(node)) {
+      return false;
+    }
+    if (!IsOnTheSameDevice(group, node)) {
+      return false;
+    }
+    // Optimized nodes updated in place, and that would break the graph, if the
+    // node has multiple output consumers
+    if (NumNonControlOutputs(node, *ctx().node_map) != 1) {
+      return false;
+    }
+    // All input shapes must be broadcastable to the node shape
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node.name(), &properties);
+    return has_properties.ok() &&
+           HasAllInputsBroadcastableToShape(node, properties);
+  }
+
+  std::size_t CountUniqueShapes(const std::vector<InputAndShape>& inputs) {
+    std::set<string> sigs;
+    for (const auto& ias : inputs) {
+      sigs.insert(ShapeSignature(ias.shape));
+    }
+    return sigs.size();
+  }
+
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    VLOG(2) << "Minimize broadcast: root=" << group.root_node->name()
+            << " op=" << group.root_node->op()
+            << " num_optimized_nodes=" << group.optimized_nodes.size();
+
+    // Do not optimize any of the nodes that are part of this group.
+    AddAllMembersToOptimizedNodes(group);
+
+    if (CountUniqueShapes(group.inputs) <= 1) {
+      VLOG(3) << "Skip min-bcast group with single unique shape";
+      // nothing to optimize when all shapes are the same
+      return group.root_node->name();
+    }
+
+    auto num_nodes = /*root*/ 1 + group.optimized_nodes.size();
+    auto num_inputs = group.inputs.size();
+    CHECK_EQ(num_nodes, num_inputs - 1)
+        << "Can't build a tree with " << num_inputs << " inputs, using "
+        << num_nodes << "binary op nodes.";
+
+    std::deque<InputAndShape> add_ops(group.inputs.begin(), group.inputs.end());
+    std::deque<NodeDef*> optimized_nodes(group.optimized_nodes.begin(),
+                                         group.optimized_nodes.end());
+
+    // sort inputs by it's shape from smallest to largest
+    std::stable_sort(add_ops.begin(), add_ops.end(),
+                     [](const InputAndShape& lhs, const InputAndShape& rhs) {
+                       return CompareSymbolicallyShapedTensorSizes(lhs.shape,
+                                                                   rhs.shape);
+                     });
+
+    // If there is an odd number of inputs, last one is the largest, and we want
+    // to attach it to the root node, to build a well balanced tree.
+    std::deque<InputAndShape> add_ops_leftover;
+    if (add_ops.size() % 2 != 0) {
+      add_ops_leftover.push_back(add_ops.back());
+      add_ops.pop_back();
+    }
+
+    // At this point it's guaranteed that add_ops have even number of inputs.
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+
+      NodeDef* node;
+      if (!optimized_nodes.empty()) {
+        // re-purpose optimized nodes to build a new tree
+        node = optimized_nodes.back();
+        optimized_nodes.pop_back();
+      } else {
+        // or use root node if none optimized nodes left
+        node = group.root_node;
+      }
+      InputAndShape updated_node = UpdateInputs(lhs.input, rhs.input, node);
+
+      // Pushing updated node to the back of a deque will create a wide and
+      // short tree, pushing to the front will create a tall tree. We prefer to
+      // get a wide tree, it minimizes the potential number of temporary tensors
+      // required to keep in memory, though sometimes we can go up to prevent
+      // propagating a brodcast from leaves to the root. Example:
+      //
+      // inputs: [s, s, s, M] (s - scalar, M - matrix)
+      // @* - op with broadcast
+      //
+      //  (only push_back)           @*     (push_front first op)
+      //                            /  \
+      //       @*                  @    M
+      //     /   \                / \
+      //    @     @*      ->     @   s
+      //   / \   / \            / \
+      //  s   s s   M          s   s
+      if (add_ops.size() >= 2 &&
+          CompareSymbolicallyShapedTensorSizes(add_ops.at(0).shape,
+                                               add_ops.at(1).shape)) {
+        add_ops.push_front(updated_node);
+      } else {
+        add_ops.push_back(updated_node);
+      }
+    } while (add_ops.size() > 1);
+    CHECK_EQ(1, add_ops.size());
+
+    // attach the largest tensor to the root op
+    if (!add_ops_leftover.empty()) {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops_leftover.front();
+      InputAndShape updated_node =
+          UpdateInputs(lhs.input, rhs.input, group.root_node);
+      add_ops.push_back(updated_node);
+    }
+
+    return add_ops.front().input;
+  }
+
+  InputAndShape UpdateInputs(const string& input_0, const string& input_1,
+                             NodeDef* node) {
+    string old_input_0 = node->input(0);
+    string old_input_1 = node->input(1);
+
+    // Update inputs only if they changed
+    if (old_input_0 != input_0 || old_input_1 != input_1) {
+      node->set_input(0, input_0);
+      node->set_input(1, input_1);
+      // Invalidate node properties (shape)
+      ctx().graph_properties->ClearOutputProperties(node->name());
+      ctx().graph_properties->ClearInputProperties(node->name());
+      // Update the node map
+      ctx().node_map->RemoveOutput(NodeName(old_input_0), node->name());
+      ctx().node_map->RemoveOutput(NodeName(old_input_1), node->name());
+      ctx().node_map->AddOutput(NodeName(input_0), node->name());
+      ctx().node_map->AddOutput(NodeName(input_1), node->name());
+      // Add updated node to optimization queue
+      AddToOptimizationQueue(node);
+    }
+
+    TensorShapeProto shape;  // shape is not important at this point
+    return InputAndShape(node->name(), shape);
+  }
+};
+
+// Removes inverse transpose nodes
+class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveIdentityTranspose(const GraphOptimizerContext& ctx,
+                                   const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveIdentityTranspose", ctx, ctx_ext) {}
+  ~RemoveIdentityTranspose() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsTranspose(*node) || IsConjugateTranspose(*node);
+  }
+
+  // TODO(rmlarsen): Forward control dependencies on the bypassed
+  // transpose nodes.
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    NodeDef* input;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
+    NodeDef* node_perm;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
+    if (!IsConstant(*node_perm)) {
+      return Status::OK();
+    }
+    std::vector<int64> node_perm_values;
+    TF_RETURN_IF_ERROR(GetPermutation(*node_perm, &node_perm_values));
+    if (input->op() == node->op()) {
+      // Remove pairs of transposes that cancel each other.
+      NodeDef* input_perm;
+      TF_RETURN_IF_ERROR(GetInputNode(input->input(1), &input_perm));
+      if (!IsConstant(*input_perm)) {
+        return Status::OK();
+      }
+      std::vector<int64> input_perm_values;
+      TF_RETURN_IF_ERROR(GetPermutation(*input_perm, &input_perm_values));
+      if (AreInversePermutations(node_perm_values, input_perm_values)) {
+        *simplified_node_name = input->input(0);
+      }
+    } else {
+      // Remove simple identity transposes.
+      if (IsIdentityPermutation(node_perm_values)) {
+        *simplified_node_name = node->input(0);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  Status GetPermutation(const NodeDef& node_perm,
+                        std::vector<int64>* perm64) const {
+    std::vector<int> perm32;
+    if (ValuesFromConstNode(node_perm, &perm32)) {
+      perm64->reserve(perm32.size());
+      for (int val : perm32) {
+        perm64->push_back(static_cast<int64>(val));
+      }
+      return Status::OK();
+    }
+    if (ValuesFromConstNode(node_perm, perm64)) {
+      return Status::OK();
+    }
+    return errors::InvalidArgument("Couldn't extract permutation from ",
+                                   node_perm.name());
+  }
+
+  bool AreInversePermutations(const std::vector<int64>& a,
+                              const std::vector<int64>& b) {
+    if (a.size() != b.size()) {
+      return false;
+    }
+    for (int i = 0; i < a.size(); ++i) {
+      if (a[b[i]] != i) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool IsIdentityPermutation(const std::vector<int64>& perm) {
+    for (int64 i = 0; i < perm.size(); ++i) {
+      if (i != perm[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+// Remove redundant Bitcasts.
+// 1) Remove Bitcast whose source type and destination type are equal
+// 2) Rewrite Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
+class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveRedundantBitcastStage(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveRedundantBitcast", ctx, ctx_ext) {}
+  ~RemoveRedundantBitcastStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsBitcast(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    // Bypass Bitcast whose source type and destination type are equal.
+    if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
+      *simplified_node_name = node->input(0);
+      return Status::OK();
+    }
+
+    NodeDef* bitcast;
+    TF_RETURN_IF_ERROR(GetInputNode(node->name(), &bitcast));
+    NodeDef* operand;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &operand));
+
+    if (IsBitcast(*operand)) {
+      // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
+      bitcast->set_input(0, operand->input(0));
+      SetSourceDataType(GetSourceDataType(*operand), bitcast);
+      ctx().node_map->UpdateInput(bitcast->name(), bitcast->input(0),
+                                  operand->input(0));
+      AddToOptimizationQueue(bitcast);
+      *simplified_node_name = bitcast->name();
+    }
+
+    return Status::OK();
+  }
+};
+
+// Remove Casts whose source type and destination type are equal.
+class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveRedundantCastStage(const GraphOptimizerContext& ctx,
+                                    const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveRedundantCast", ctx, ctx_ext) {}
+  ~RemoveRedundantCastStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override { return IsCast(*node); }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    // Bypass Cast whose source type and destination type are equal.
+    if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
+      *simplified_node_name = node->input(0);
+    }
+    return Status::OK();
+  }
+};
+
+class RemoveNegationStage : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveNegationStage(const GraphOptimizerContext& ctx,
+                               const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveNegation", ctx, ctx_ext) {}
+  ~RemoveNegationStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsAdd(*node) || IsSub(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const string node_name = node->name();
+    NodeDef* x;
+    NodeDef* y;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &x));
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &y));
+    bool updated = false;
+    if (IsAdd(*node)) {
+      if (IsNeg(*x)) {
+        // (-a) + b = b - a
+        node->set_op("Sub");
+        node->mutable_input()->SwapElements(0, 1);
+        node->set_input(1, x->input(0));
+        node->add_input(AsControlDependency(x->name()));
+        ctx().node_map->AddOutput(NodeName(x->input(0)), node_name);
+        updated = true;
+      } else if (IsNeg(*y)) {
+        // a + (-b) = a - b
+        node->set_op("Sub");
+        node->set_input(1, y->input(0));
+        node->add_input(AsControlDependency(y->name()));
+        ctx().node_map->AddOutput(NodeName(y->input(0)), node_name);
+        updated = true;
+      }
+    } else if (IsSub(*node)) {
+      if (IsNeg(*y)) {
+        // a - (-b) = a + b
+        node->set_op("Add");
+        node->set_input(1, y->input(0));
+        node->add_input(AsControlDependency(y->name()));
+        ctx().node_map->AddOutput(NodeName(y->input(0)), node_name);
+        updated = true;
+      }
+    }
+    if (updated) {
+      AddToOptimizationQueue(node);
+    }
+    return Status::OK();
+  }
+};
+
+// This optimization hoists the common prefix of unary ops of the inputs to
+// concat out of the concat.
+// For example: Concat([Exp(Sin(x)), Exp(Sin(y)), Exp(Sin(z))]) ->
+// Exp(Sin(Concat([x, y, z]))).
+// TODO(rmlarsen): Support casting. We would have to change the type attribute
+// on the concat node.
+class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
+ public:
+  explicit HoistCWiseUnaryFromConcatStage(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("", ctx, ctx_ext) {}
+
+  ~HoistCWiseUnaryFromConcatStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsConcat(*node)) return false;
+    const int n = node->attr().at("N").i();
+    return n > 1;
+  }
+
+  Status TrySimplify(NodeDef* concat_node,
+                     string* simplified_node_name) override {
+    int prefix_length;
+    std::set<string> ctrl_inputs;
+    TF_RETURN_IF_ERROR(
+        FindCommonUnaryOpPrefix(*concat_node, &prefix_length, &ctrl_inputs));
+    if (prefix_length > 0) {
+      TF_RETURN_IF_ERROR(
+          HoistUnaryOpPrefix(prefix_length, &ctrl_inputs, concat_node));
+      AddToOptimizationQueue(concat_node);
+    }
+    return Status::OK();
+  }
+
+ private:
+  void RemoveControlInputs(std::set<string>* removed_ctrl_inputs,
+                           NodeDef* node) const {
+    const int num_inputs = node->input_size();
+    for (int idx = num_inputs - 1; idx >= 0; --idx) {
+      const string& input = node->input(idx);
+      if (IsControlInput(input)) {
+        removed_ctrl_inputs->insert(input);
+        ctx().node_map->RemoveOutput(NodeName(input), node->name());
+        node->mutable_input()->RemoveLast();
+      } else {
+        break;
+      }
+    }
+  }
+
+  void AddControlInputs(std::set<string>* new_ctrl_inputs,
+                        NodeDef* node) const {
+    for (int idx = node->input_size() - 1; idx >= 0; --idx) {
+      const string& existing_input = node->input(idx);
+      if (IsControlInput(existing_input)) {
+        new_ctrl_inputs->erase(existing_input);
+      } else {
+        break;
+      }
+    }
+    for (const string& new_input : *new_ctrl_inputs) {
+      ctx().node_map->AddOutput(NodeName(new_input), node->name());
+      node->add_input(new_input);
+    }
+  }
+
+  // Returns the length of the common unary prefix chain of ops that can be
+  // hoisted out of concat.
+  Status FindCommonUnaryOpPrefix(const NodeDef& concat_node, int* prefix_length,
+                                 std::set<string>* ctrl_inputs) const {
+    *prefix_length = 0;
+    const int n = concat_node.attr().at("N").i();
+    // Follow the chains backwards from each concat input as long as all the
+    // following conditions hold:
+    //   1. The ops in all chains are the same.
+    //   2. The op is a unary elemenwise op.
+    //   3. The op output has only a single consumer.
+    std::vector<NodeDef*> tail(n, nullptr);
+    const int start = concat_node.op() == "Concat" ? 1 : 0;
+    const int end = start + n;
+    // Set up tail pointers to point to the immediate inputs to Concat.
+    for (int i = start; i < end; ++i) {
+      if (IsControlInput(concat_node.input(i))) {
+        return errors::FailedPrecondition("Got control input ",
+                                          concat_node.input(i),
+                                          " where normal input was expected.");
+      }
+      TF_RETURN_IF_ERROR(GetInputNode(concat_node.input(i), &tail[i - start]));
+    }
+
+    bool stop = false;
+    ctrl_inputs->clear();
+    while (!stop) {
+      const NodeDef* tail0 = tail[0];
+      if (!IsUnaryElementWise(*tail0)) break;
+      for (int chain = 0; chain < n; ++chain) {
+        // TODO(rmlarsen): Allow and hoist outgoing control edges.
+        if (tail[chain]->op() != tail0->op() ||
+            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1) {
+          stop = true;
+          break;
+        }
+      }
+      if (stop) break;
+      // We found one more op that can be hoisted.
+      ++(*prefix_length);
+      for (int chain = 0; chain < n; ++chain) {
+        RemoveControlInputs(ctrl_inputs, tail[chain]);
+      }
+      // Advance tail pointers to the next level.
+      for (int chain = 0; chain < n; ++chain) {
+        if (tail[chain]->input_size() == 0 ||
+            IsControlInput(tail[chain]->input(0))) {
+          stop = true;
+          break;
+        } else {
+          NodeDef* new_tail = nullptr;
+          TF_RETURN_IF_ERROR(GetInputNode(tail[chain]->input(0), &new_tail));
+          tail[chain] = new_tail;
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  Status HoistUnaryOpPrefix(const int prefix_length,
+                            std::set<string>* ctrl_inputs,
+                            NodeDef* concat_node) {
+    const int n = concat_node->attr().at("N").i();
+    const int start = concat_node->op() == "Concat" ? 1 : 0;
+    const int end = start + n;
+    const std::set<NodeDef*> consumers =
+        ctx().node_map->GetOutputs(concat_node->name());
+    AddControlInputs(ctrl_inputs, concat_node);
+    for (int chain = 0; chain < (end - start); ++chain) {
+      NodeDef* tail = nullptr;
+      const string concat_input = concat_node->input(chain + start);
+      for (int distance = 0; distance < prefix_length; ++distance) {
+        if (distance == 0) {
+          TF_RETURN_IF_ERROR(GetInputNode(concat_input, &tail));
+        } else {
+          TF_RETURN_IF_ERROR(GetInputNode(tail->input(0), &tail));
+        }
+      }
+
+      // Hook the node following tail directly into the concat node.
+      const string tail_input = tail->input(0);
+      concat_node->set_input(chain + start, tail_input);
+      ctx().node_map->UpdateInput(concat_node->name(), concat_input,
+                                  tail_input);
+
+      if (chain == 0) {
+        // Reuse nodes in the first chain to process output of concat.
+        tail->set_input(0, concat_node->name());
+        ctx().node_map->UpdateInput(tail->name(), tail_input,
+                                    concat_node->name());
+
+        // Update the consumers of concat to consume the end of the chain
+        // instead.
+        for (NodeDef* consumer : consumers) {
+          for (int idx = 0; idx < consumer->input_size(); ++idx) {
+            if (consumer->input(idx) == concat_node->name()) {
+              consumer->set_input(idx, concat_input);
+              ctx().node_map->UpdateInput(consumer->name(), concat_node->name(),
+                                          concat_input);
+            }
+          }
+          AddToOptimizationQueue(consumer);
+        }
+      }
+    }
+    return Status::OK();
+  }
+};
+
+// Performs the conversion:
+// Div(x, Sqrt(y)) => Mul(x, Rsqrt(y))
+// TODO(srjoglekar): Generalize to optimize cases like (x / pow(y, z)).
+class SqrtDivToRsqrtMulStage : public ArithmeticOptimizerStage {
+ public:
+  explicit SqrtDivToRsqrtMulStage(const GraphOptimizerContext& ctx,
+                                  const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("SqrtDivToRsqrtMul", ctx, ctx_ext) {}
+  ~SqrtDivToRsqrtMulStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsAnyDiv(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    NodeDef* y;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &y));
+    // Optimize only if divisor is a Sqrt whose output is not being consumed
+    // elsewhere.
+    if (IsSqrt(*y) && (NumNonControlOutputs(*y, *ctx().node_map) == 1)) {
+      // a / sqrt(b) = a * rsqrt(b)
+      node->set_op("Mul");
+      y->set_op("Rsqrt");
+      AddToOptimizationQueue(node);
+      AddToOptimizationQueue(y);
+    }
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -418,6 +1632,9 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   }
 
   // Compare attributes.
+  if (node1.attr().size() != node2.attr().size()) {
+    return false;
+  }
   for (const auto& attr1 : node1.attr()) {
     auto it = node2.attr().find(attr1.first);
     if (it == node2.attr().end()) {
@@ -463,6 +1680,25 @@ bool ArithmeticOptimizer::OptimizedNodeExists(const NodeDef& node,
   return node_map_->NodeExists(OptimizedNodeName(node, suffix));
 }
 
+namespace {
+
+bool FeedsInPlaceOp(const SimpleGraphView& graph_view, const NodeDef& node) {
+  const std::unordered_set<string> op_types_to_traverse = {
+      node.op(),    "Identity", "IdentityN", "Reshape",
+      "ExpandDims", "Enter",    "Switch",    "Merge"};
+  int node_idx = graph_view.index(node.name());
+  std::set<int> node_fanout;
+  graph_view.DepthFirstSearch(op_types_to_traverse, node_idx, &node_fanout);
+  for (int fanout : node_fanout) {
+    if (ModifiesInputsInPlace(graph_view.graph()->node(fanout))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
 bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
@@ -482,6 +1718,11 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
 
 void ArithmeticOptimizer::DedupComputations() {
   bool stop = true;
+  SimpleGraphView graph_view;
+  if (!graph_view.Initialize(*optimized_graph_).ok()) {
+    LOG(WARNING) << "Failed to build SimpleGraphView.";
+    return;
+  }
   std::set<int> duplicates;
   do {
     stop = true;
@@ -498,19 +1739,28 @@ void ArithmeticOptimizer::DedupComputations() {
       if (rep == node) {
         continue;
       }
+      // If either node feeds an inplace op, deduping them may cause data races.
+      // For example: If we dedup nodes initializing two independent inplace
+      // accumulations, they will write to the same buffer, clobbering each
+      // other's results.
+      if (FeedsInPlaceOp(graph_view, *rep) ||
+          FeedsInPlaceOp(graph_view, *node)) {
+        continue;
+      }
       const std::set<NodeDef*>& fanouts = node_map_->GetOutputs(node->name());
       for (NodeDef* fanout : fanouts) {
-        for (string& name : *fanout->mutable_input()) {
+        for (int i = 0; i < fanout->input_size(); ++i) {
+          string* name = fanout->mutable_input(i);
           int position;
-          const string nodename = ParseNodeName(name, &position);
+          const string nodename = ParseNodeName(*name, &position);
           if (nodename == node->name()) {
             // Update name in-place.
             if (position > 0) {
-              name = StrCat(rep->name(), ":", position);
+              *name = StrCat(rep->name(), ":", position);
             } else if (position == 0) {
-              name = rep->name();
+              *name = rep->name();
             } else {
-              name = StrCat("^", rep->name());
+              *name = StrCat("^", rep->name());
             }
             node_map_->AddOutput(rep->name(), fanout->name());
           }
@@ -536,25 +1786,22 @@ void ArithmeticOptimizer::DedupComputations() {
   }
 }
 
-void ArithmeticOptimizer::AddFrameControlDeps(
-    const NodeDef* old_node, const std::vector<NodeDef*>& new_nodes,
-    const string& source_for_ctrl_dep,
-    const std::vector<NodeDef*>& sinks_for_control_dep) {
-  const auto frame_it = frame_map_.find(old_node);
-  if (frame_it != frame_map_.end()) {
-    for (auto node : new_nodes) {
-      frame_map_.emplace(node, frame_it->second);
-    }
-    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-      const string ctrl_dep = ConstantFolding::AddControlDependency(
-          source_for_ctrl_dep, optimized_graph_, node_map_.get());
-      for (auto node : sinks_for_control_dep) {
-        MaybeAddControlInput(ctrl_dep, node, optimized_graph_, node_map_.get());
+void ArithmeticOptimizer::ForwardControlDependencies(
+    NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+  for (const auto& src : src_nodes) {
+    for (int i = src->input_size() - 1; i >= 0; --i) {
+      if (IsControlInput(src->input(i))) {
+        *target_node->add_input() = src->input(i);
+        node_map_->AddOutput(NodeName(src->input(i)), target_node->name());
+      } else {
+        break;
       }
     }
   }
 }
 
+// TODO(ezhulenev): extract each individual simplify rewrite into separate
+// ArithmeticOptimizerStage
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
   // Remove involutions applied twice.
@@ -582,31 +1829,6 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     }
   }
 
-  // Remove inverse transposes.
-  if (node->op() == "Transpose" || node->op() == "ConjugateTranspose") {
-    NodeDef* input = node_map_->GetNode(node->input(0));
-    if (input->op() == node->op()) {
-      const NodeDef* node_perm = node_map_->GetNode(node->input(1));
-      const NodeDef* input_perm = node_map_->GetNode(input->input(1));
-      // Try 32-bit indices.
-      std::vector<int> node_perm_values;
-      std::vector<int> input_perm_values;
-      if (ValuesFromConstNode(*node_perm, &node_perm_values) &&
-          ValuesFromConstNode(*input_perm, &input_perm_values) &&
-          AreInversePermutations(node_perm_values, input_perm_values)) {
-        return input->input(0);
-      }
-      // Try 64-bit indices.
-      std::vector<int64> node_perm_values64;
-      std::vector<int64> input_perm_values64;
-      if (ValuesFromConstNode(*node_perm, &node_perm_values64) &&
-          ValuesFromConstNode(*input_perm, &input_perm_values64) &&
-          AreInversePermutations(node_perm_values64, input_perm_values64)) {
-        return input->input(0);
-      }
-    }
-  }
-
   if (node->op() == "Reshape") {
     //   Reshape
     //      ^
@@ -624,23 +1846,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     //      ^      |
     //      |      |
     //    input ---+
-    NodeDef* reshape = node_map_->GetNode(node->name());
+    NodeDef* reshape = const_cast<NodeDef*>(node);
     int output_pos = 0;
-    string input_node_name = ParseNodeName(node->input(0), &output_pos);
+    string input_node_name = ParseNodeName(reshape->input(0), &output_pos);
     const NodeDef* input = node_map_->GetNode(input_node_name);
-    if (input->op() == "Reshape") {
+    if (input->op() == "Reshape" && !HasControlInputs(*input)) {
       reshape->set_input(0, input->input(0));
       node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
       nodes_to_simplify->PushBack(reshape);
       return reshape->name();
     }
 
-    // If the reshape is a no-op, forward its input to its consumers. This is
-    // considered aggressive, because users may state that the placeholder
-    // outputs tensors of shape [M, N] while feeding it with tensors of shape
-    // [M*N] (or worse). The reshape nodes are then necessary to update the
-    // tensor metadata to the required shape.
-    if (ReshapeIsIdentity(*reshape, *input, output_pos)) {
+    // If the reshape is a no-op, forward its input to its consumers, unless it
+    // anchors a control dependency since we want to make sure that control
+    // dependency is triggered.
+    if (ReshapeIsIdentity(*reshape, *input, output_pos, *graph_properties_) &&
+        !HasControlInputs(*reshape)) {
       return reshape->input(0);
     }
   }
@@ -668,8 +1889,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     // with image.type than with dst_type.
     if (DeviceNameUtils::SplitDeviceName(transpose->device(), &dontcare,
                                          &device) &&
-        (StringPiece(device).contains(DEVICE_CPU) ||
-         StringPiece(device).contains(DEVICE_GPU))) {
+        (str_util::StrContains(device, DEVICE_CPU) ||
+         str_util::StrContains(device, DEVICE_GPU))) {
       const NodeDef* cast = node_map_->GetNode(transpose->input(0));
       if (cast->op() == "Cast") {
         const NodeDef* input = node_map_->GetNode(cast->input(0));
@@ -693,42 +1914,13 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
           node_map_->AddOutput(new_transpose->name(), new_cast->name());
 
           nodes_to_simplify->PushBack(new_transpose);
-          //  Add frame dependencies that the original node might have had.
-          AddFrameControlDeps(node, {new_transpose, new_cast},
-                              new_transpose->input(0), {new_transpose});
-
+          ForwardControlDependencies(new_transpose, {cast, node});
           return new_cast->name();
         }
       }
     }
   }
 
-  if (node->op() == "Bitcast") {
-    NodeDef* bitcast = node_map_->GetNode(node->name());
-    // Bypass bitcasts whose source type and destination type are equal.
-    if (GetSourceDataType(*bitcast) == GetDestinationDataType(*bitcast)) {
-      return bitcast->input(0);
-    }
-
-    const NodeDef* operand = node_map_->GetNode(bitcast->input(0));
-    if (operand->op() == bitcast->op()) {
-      // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
-      bitcast->set_input(0, operand->input(0));
-      SetSourceDataType(GetSourceDataType(*operand), bitcast);
-      node_map_->UpdateInput(bitcast->name(), bitcast->input(0),
-                             operand->input(0));
-      nodes_to_simplify->PushBack(bitcast);
-      return bitcast->name();
-    }
-  }
-
-  if (node->op() == "Cast") {
-    // Bypass casts whose source type and destination type are equal.
-    if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
-      return node->input(0);
-    }
-  }
-
   // Fold a multiply of a scalar into the following convolution. This folding
   // can jump across nodes that merely reorders data (such as reshape and
   // transpose). For example, we can optimize
@@ -796,7 +1988,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
             node_map_->AddOutput(weights->name(), scaled_weights->name());
             scaled_weights->add_input(mul->input(1));
             node_map_->AddOutput(scale->name(), scaled_weights->name());
-            AddFrameControlDeps(node, {scaled_weights}, "", {});
+            ForwardControlDependencies(scaled_weights, {source});
 
             // Update `conv`'s weights to `scaled_weights`.
             conv->set_input(1, scaled_weights->name());
@@ -822,17 +2014,26 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
 
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
       !OptimizedNodeExists(*node, "square")) {
-    NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
-    new_square_node->set_op("Square");
-    for (int i = 1; i < new_square_node->input_size(); ++i) {
-      new_square_node->set_input(i - 1, new_square_node->input(i));
+    const DataType type = GetDataTypeFromAttr(*node, "T");
+    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    string dontcare;
+    string device;
+    bool is_on_cpu =
+        DeviceNameUtils::SplitDeviceName(node->device(), &dontcare, &device) &&
+        str_util::StrContains(device, DEVICE_CPU);
+    if (!is_complex || is_on_cpu) {
+      NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
+      new_square_node->set_op("Square");
+      for (int i = 1; i < new_square_node->input_size(); ++i) {
+        new_square_node->set_input(i - 1, new_square_node->input(i));
+      }
+      new_square_node->mutable_input()->RemoveLast();
+      return new_square_node->name();
     }
-    new_square_node->mutable_input()->RemoveLast();
-    return new_square_node->name();
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-    // Discard aggregate nodes with a single input.
+    // Discard aggregate nodes with a single input and no control dependencies.
     if (node->input_size() == 1) {
       return node->input(0);
     }
@@ -870,9 +2071,16 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
       }
       TensorValue value(&t);
       NodeDef* new_const_node = AddNode(*node, "const", /*copy_node=*/false);
-      *new_const_node =
-          ConstantFolding::CreateNodeDef(new_const_node->name(), value);
+      status = ConstantFolding::CreateNodeDef(new_const_node->name(), value,
+                                              new_const_node);
+      if (!status.ok()) {
+        LOG(WARNING) << "Failed to create const node: "
+                     << status.error_message();
+        return "";
+      }
       new_const_node->set_device(node->device());
+      MaybeAddControlInput(NodeName(node->input(0)), new_const_node,
+                           optimized_graph_, node_map_.get());
       nodes_to_simplify->PushBack(new_const_node);
 
       // 2. Replace the aggregate node with Mul(Const(N), x).
@@ -885,105 +2093,11 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
       new_mul_node->add_input(node->input(0));
       node_map_->AddOutput(node->input(0), new_mul_node->name());
 
-      CopyControlInputs(*node, new_mul_node, optimized_graph_, node_map_.get());
-      AddFrameControlDeps(node, {new_const_node, new_mul_node}, node->input(0),
-                          {new_const_node});
+      ForwardControlDependencies(new_mul_node, {node});
       return new_mul_node->name();
     }
   }
 
-  // Use the commutativity and (left- and right-) distributive property of
-  // multiplication over addition to hoist common factors out of aggregate nodes
-  // where all the inputs are Mul nodes. This pattern occurs frequently in
-  // regularization terms for the gradients during training.
-  // For example, we can rewrite an expression of the form:
-  //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
-  // to the following:
-  //   Mul(x, AddN(y1, y2, y3, ... yn))
-  if (IsAggregate(*node) && NumNonControlInputs(*node) > 1 &&
-      !OptimizedNodeExists(*node, "hoist_add") &&
-      !OptimizedNodeExists(*node, "hoist_mul")) {
-    // Determine the set of common factors if the input nodes are all Mul nodes.
-    std::set<string> common_factors;
-    for (int i = 0; i < node->input_size(); ++i) {
-      if (i > 0 && common_factors.empty()) {
-        break;
-      }
-      if (IsControlInput(node->input(i))) {
-        break;
-      }
-      const NodeDef* input = node_map_->GetNode(node->input(i));
-      if (input->op() == "Mul") {
-        std::set<string> factors_i{input->input(0), input->input(1)};
-        if (i == 0) {
-          std::swap(common_factors, factors_i);
-        } else {
-          std::set<string> intersection;
-          std::set_intersection(
-              factors_i.begin(), factors_i.end(), common_factors.begin(),
-              common_factors.end(),
-              std::inserter(intersection, intersection.begin()));
-          std::swap(common_factors, intersection);
-        }
-      } else {
-        common_factors.clear();
-      }
-    }
-    if (common_factors.size() == 1) {
-      const string& common_factor = *common_factors.begin();
-
-      // Gather up the non-shared factors (the y's in the example).
-      // Unless the aggregation is Add, we have to make sure that all the y's
-      // have the same shape since the other aggregation ops do not support
-      // broadcasting.
-      std::vector<string> unique_factors;
-      unique_factors.reserve(node->input_size());
-      bool shapes_match = true;
-      for (int i = 0; i < node->input_size() && shapes_match; ++i) {
-        const string& input = node->input(i);
-        if (IsControlInput(input)) {
-          break;
-        }
-        const NodeDef* mul_node = node_map_->GetNode(input);
-        const int unique_factor_index =
-            mul_node->input(0) == common_factor ? 1 : 0;
-        unique_factors.push_back(mul_node->input(unique_factor_index));
-        if (i > 0 && !IsAdd(*node)) {
-          shapes_match = ShapesEqual(unique_factors.front(),
-                                     unique_factors.back(), *node_map_);
-        }
-      }
-
-      if (shapes_match) {
-        // 1. Use a copy of the first Mul node for the outer multiplication.
-        NodeDef* new_mul_node = AddNode(OptimizedNodeName(*node, "hoist_mul"),
-                                        node_map_->GetNode(node->input(0)));
-        NodeDef* new_add_node = AddNode(*node, "hoist_add", /*copy_node=*/true);
-        new_mul_node->set_device(node->device());
-        new_mul_node->set_input(0, common_factor);
-        node_map_->AddOutput(common_factor, new_mul_node->name());
-        new_mul_node->set_input(1, new_add_node->name());
-        node_map_->AddOutput(new_add_node->name(), new_mul_node->name());
-
-        // 2. Hoist non-shared factors up into the new AddN node.
-        nodes_to_simplify->PushBack(new_add_node);
-        for (int i = 0; i < node->input_size(); ++i) {
-          const string& input = node->input(i);
-          if (IsControlInput(input)) {
-            break;
-          }
-          new_add_node->set_input(i, unique_factors[i]);
-        }
-
-        // 3. Add frame dependencies that the original node might have had.
-        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
-                            {new_add_node});
-
-        return new_mul_node->name();
-      }
-    }
-  }
-
   // Fold Transpose into matrix multiplication.
   if ((node->op() == "MatMul" || node->op() == "SparseMatMul" ||
        node->op() == "BatchMatMul") &&
@@ -1012,7 +2126,6 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_a, new_op);
         new_op->set_input(0, a->input(0));
         node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
-        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op});
       }
       if (b_is_foldable) {
         const string attr_b =
@@ -1020,10 +2133,15 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_b, new_op);
         new_op->set_input(1, b->input(0));
         node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
-        if (!a_is_foldable) {
-          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op});
-        }
       }
+      std::vector<const NodeDef*> deps_to_forward({node});
+      if (a_is_foldable) {
+        deps_to_forward.push_back(a);
+      }
+      if (b_is_foldable) {
+        deps_to_forward.push_back(b);
+      }
+      ForwardControlDependencies(new_op, deps_to_forward);
     }
   }
 
@@ -1045,7 +2163,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                                                        : "Transpose");
       new_op->set_input(0, input->input(0));
       node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
-      AddFrameControlDeps(node, {new_op}, "", {});
+      ForwardControlDependencies(new_op, {node, input});
       return new_op->name();
     }
   }
@@ -1053,20 +2171,61 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   return "";
 }
 
-Status ArithmeticOptimizer::SimplifyArithmeticOps() {
+Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   SetVector<NodeDef*> nodes_to_simplify;
   nodes_to_simplify.Reserve(optimized_graph_->node_size());
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
     nodes_to_simplify.PushBack(optimized_graph_->mutable_node(i));
   }
+
+  const GraphOptimizerContext ctx(&nodes_to_preserve_, optimized_graph_,
+                                  graph_properties_.get(), node_map_.get());
+  const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
+
+  // Stop pipeline after first stage returning non-empty simplified tensor name.
+  const auto stop = [](const string& result) { return !result.empty(); };
+  GraphOptimizerStagePipeline<string> pipeline(stop);
+
+  if (options_.combine_add_to_addn && can_use_shapes)
+    pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
+  if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
+    pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
+  if (options_.minimize_broadcasts && can_use_shapes)
+    pipeline.AddStage<MinimizeBroadcasts>(ctx, ctx_ext);
+  if (options_.remove_identity_transpose && can_use_shapes)
+    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
+  if (options_.remove_redundant_bitcast)
+    pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
+  if (options_.remove_redundant_cast)
+    pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
+  if (options_.remove_negation)
+    pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+  if (options_.hoist_unary_out_of_concat)
+    pipeline.AddStage<HoistCWiseUnaryFromConcatStage>(ctx, ctx_ext);
+  if (options_.convert_sqrt_div_to_rsqrt_mul)
+    pipeline.AddStage<SqrtDivToRsqrtMulStage>(ctx, ctx_ext);
+
+  VLOG(1) << "Run " << pipeline.NumStages() << " arithmetic optimizer stages: "
+          << str_util::Join(pipeline.StageNames(), ", ");
+
   while (!nodes_to_simplify.Empty()) {
-    const NodeDef* node = nodes_to_simplify.PopBack();
-    const string simplified_tensor =
-        TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
-    if (simplified_tensor.empty()) {
-      continue;
+    NodeDef* node = nodes_to_simplify.PopBack();
+
+    // TODO(ezhulenev): move all rewrites into separate stages
+    string simplified_tensor = "";
+    if (options_.enable_try_simplify_and_replace) {
+      simplified_tensor = TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
+    }
+
+    // if it was not simplified try to run it through all configured stages
+    if (!stop(simplified_tensor)) {
+      bool optimized = pipeline.PassThroughAllStages(node, &simplified_tensor);
+      if (!optimized) {
+        continue;
+      }
     }
 
+    // re-wire consumers of an old node to the new one
     if (NodeName(simplified_tensor) != node->name()) {
       // Always consider simplified_tensor for further optimizations.
       NodeDef* simplified_node = node_map_->GetNode(simplified_tensor);
@@ -1077,7 +2236,12 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
       // consumers of `node` are already redirected to `simplified_tensor`.
       // Re-push the consumers into `nodes_to_simplify` for further
       // optimizations.
-      std::set<NodeDef*> consumers = node_map_->GetOutputs(node->name());
+      const std::set<NodeDef*> outputs = node_map_->GetOutputs(node->name());
+      std::vector<NodeDef*> consumers(outputs.begin(), outputs.end());
+      std::sort(consumers.begin(), consumers.end(),
+                [](const NodeDef* n1, const NodeDef* n2) {
+                  return n1->name() < n2->name();
+                });
       for (NodeDef* consumer : consumers) {
         // Update `consumer`'s use of `node` to `input`'s operand.
         for (int i = 0; i < consumer->input_size(); ++i) {
@@ -1103,30 +2267,33 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  optimized_graph_ = optimized_graph;
-  *optimized_graph_ = item.graph;
-
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
+  *optimized_graph = item.graph;
+  GrapplerItem optimized_item(item, optimized_graph);
+  optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map_, &num_frames));
-  // Shapes are only needed in aggressive mode.
-  graph_properties_.reset(new GraphProperties(item));
-  TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
-  TF_RETURN_IF_ERROR(graph_properties_->AnnotateOutputShapes(optimized_graph_));
 
-  // Perform the optimizations.
-  DedupComputations();
-  TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
+  if (options_.dedup_computations) {
+    DedupComputations();
+  }
 
-  // Clear output shapes.
-  for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    optimized_graph_->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
+  // Perform topological sort on the graph in order to help AddOpsRewrite to
+  // optimize larger subgraphs starting from the roots with more inputs.
+  TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
+
+  graph_properties_.reset(new GraphProperties(optimized_item));
+  const Status status = graph_properties_->InferStatically(false);
+  const bool can_use_shapes = status.ok();
+  if (!can_use_shapes) {
+    VLOG(1) << "Shape inference failed." << status.error_message();
   }
 
+  // Perform the optimizations.
+  TF_RETURN_IF_ERROR(SimplifyArithmeticOps(can_use_shapes));
+
+  optimized_graph->Swap(optimized_graph_);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index afd538db408aa859a108e08b2de9efad635d515c..24a2a50719531ca6f52886caad3e6b11c82362ce 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -13,14 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
 
 #include <unordered_set>
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -32,9 +31,14 @@ constexpr char kArithmeticOptimizer[] = "ArithmeticOptimizer";
 // run a model.
 class ArithmeticOptimizer : public GraphOptimizer {
  public:
-  ArithmeticOptimizer() : opt_level_(RewriterConfig::ON) {}
+  ArithmeticOptimizer()
+      : opt_level_(RewriterConfig::ON),
+        options_(ArithmeticOptimizerOptions::Default(RewriterConfig::ON)) {}
+
   explicit ArithmeticOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+      : opt_level_(opt_level),
+        options_(ArithmeticOptimizerOptions::Default(opt_level)) {}
+
   ~ArithmeticOptimizer() override {}
 
   string name() const override { return "arithmetic_optimizer"; };
@@ -46,6 +50,36 @@ class ArithmeticOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  friend class ArithmeticOptimizerTest;
+
+  // Granular control for arithmetic optimizer stages
+  struct ArithmeticOptimizerOptions {
+    // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests.
+    // Remove when all optimizers will be migrated to separate stages.
+    bool dedup_computations = true;
+    bool enable_try_simplify_and_replace = true;
+    bool combine_add_to_addn = true;
+    bool hoist_common_factor_out_of_aggregation = true;
+    bool minimize_broadcasts = true;
+    bool remove_identity_transpose = true;
+    bool remove_redundant_bitcast = true;
+    bool remove_redundant_cast = true;
+    bool remove_negation = true;
+    bool hoist_unary_out_of_concat = false;
+    bool convert_sqrt_div_to_rsqrt_mul = false;
+
+    // Choose which arithmetic optimizer stages will be enabled for a given
+    // optimization level by default.
+    static ArithmeticOptimizerOptions Default(
+        RewriterConfig::Toggle opt_level) {
+      ArithmeticOptimizerOptions options;
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.hoist_unary_out_of_concat = true;
+      }
+      return options;
+    }
+  };
+
   // Returns true is a node with given name and the optimizer prefix already
   // exists.
   string OptimizedNodeName(const NodeDef& node, StringPiece suffix) const;
@@ -67,20 +101,16 @@ class ArithmeticOptimizer : public GraphOptimizer {
   // Dedup redundant nodes in the graph.
   void DedupComputations();
 
-  // Fix frame dependencies by adding control dependencies from old_input to
-  // nodes in new_nodes_for_control_dep, and update frame_map for all nodes in
-  // new_nodes.
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep);
+  // Forward the control dependencies anchored on src_nodes to the target_nodes.
+  void ForwardControlDependencies(NodeDef* target_node,
+                                  const std::vector<const NodeDef*>& src_nodes);
 
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
-  Status SimplifyArithmeticOps();
+  Status SimplifyArithmeticOps(bool can_use_shapes);
   // Tries to simplify the expression that roots at `node` and replaces the uses
   // of `node` to the simplified expression. Returns the name of the simplified
-  // tensor (e.g. "split:1") or an emtpy string if no simplification is
+  // tensor (e.g. "split:1") or an empty string if no simplification is
   // performed.
   //
   // `node_map` stores the mapping from node names to NodeDef*, and will be
@@ -97,16 +127,16 @@ class ArithmeticOptimizer : public GraphOptimizer {
                                    SetVector<NodeDef*>* nodes_to_simplify);
 
   RewriterConfig::Toggle opt_level_;
+  ArithmeticOptimizerOptions options_;
 
-  bool fetch_nodes_known_;
+  bool fetch_nodes_known_ = false;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
   std::unique_ptr<GraphProperties> graph_properties_;
-  GraphDef* optimized_graph_;  // Not owned.
+  GraphDef* optimized_graph_ = nullptr;  // Not owned.
 };
 
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 2a82b250586783759608db75bf9e383f4b0322cb..7485d99c3bd7acc06f456d31d61fca2685047a13 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -16,18 +16,45 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
+
 namespace {
 
+constexpr char kHoistFactorOptimizerDiv[] =
+    "ArithmeticOptimizer/HoistCommonFactor_Div_";
+
+constexpr char kHoistFactorOptimizerMul[] =
+    "ArithmeticOptimizer/HoistCommonFactor_Mul_";
+
+constexpr char kHoistFactorOptimizerAdd[] =
+    "ArithmeticOptimizer/HoistCommonFactor_Add_";
+
+// Optimized name of outer Mul node by HoistCommonFactorOutOfAggregation
+string HoistMulName(const string& name) {
+  return AddPrefixToNodeName(name, kHoistFactorOptimizerMul, "");
+}
+
+// Optimized name of outer Div node by HoistCommonFactorOutOfAggregation
+string HoistDivName(const string& name) {
+  return AddPrefixToNodeName(name, kHoistFactorOptimizerDiv, "");
+}
+
+// Optimized name of inner Add node by HoistCommonFactorOutOfAggregation
+string HoistAddName(const string& name) {
+  return AddPrefixToNodeName(name, kHoistFactorOptimizerAdd, "");
+}
+
 string OptimizedName(const string& name) {
   return AddPrefixToNodeName(name, kArithmeticOptimizer);
 }
@@ -46,8 +73,92 @@ void VerifyGraphsMatch(const GraphDef& original_graph,
     }
   }
 }
+}  // namespace
+
+class ArithmeticOptimizerTest : public GrapplerTest {
+ protected:
+  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
+  // longer have any output consumers.
+  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                        GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                     GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+  }
+
+  // TODO(ezhulenev): Make private. After migration to stages each test
+  // should explicitly enable required optimization for tests isolation
+  void DisableAllStages(ArithmeticOptimizer* optimizer) {
+    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
+    options.dedup_computations = false;
+    options.enable_try_simplify_and_replace = false;
+    options.combine_add_to_addn = false;
+    options.hoist_common_factor_out_of_aggregation = false;
+    options.minimize_broadcasts = false;
+    options.remove_identity_transpose = false;
+    options.remove_redundant_bitcast = false;
+    options.remove_redundant_cast = false;
+    options.remove_negation = false;
+    optimizer->options_ = options;
+  }
+
+  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    optimizer->options_.combine_add_to_addn = false;
+  }
+
+  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.combine_add_to_addn = true;
+  }
 
-class ArithmeticOptimizerTest : public ::testing::Test {};
+  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
+  }
+
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
+  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_identity_transpose = true;
+  }
+
+  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_bitcast = true;
+  }
+
+  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_cast = true;
+  }
+
+  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_negation = true;
+  }
+
+  void EnableOnlyHoistCWiseUnaryFromConcat(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_unary_out_of_concat = true;
+  }
+
+  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
+  }
+};
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
   // This trivial graph is so basic there's nothing to optimize.
@@ -71,23 +182,26 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
 
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
   EXPECT_EQ(2, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_div = output.node(1);
-  EXPECT_EQ("div", new_div.name());
-  EXPECT_EQ(2, new_div.input_size());
-  EXPECT_EQ("c1", new_div.input(0));
-  EXPECT_EQ("c1", new_div.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(2, new_div->input_size());
+  EXPECT_EQ("c1", new_div->input(0));
+  EXPECT_EQ("c1", new_div->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
@@ -104,23 +218,30 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
+  Tensor bool_t(DT_BOOL, TensorShape({}));
+  bool_t.scalar<bool>().setConstant(true);
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_div = output.node(3);
-  EXPECT_EQ(4, new_div.input_size());
-  EXPECT_EQ("check1", new_div.input(0));
-  EXPECT_EQ("check1", new_div.input(1));
-  EXPECT_EQ("^assert1", new_div.input(2));
-  EXPECT_EQ("^assert1", new_div.input(3));
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(4, new_div->input_size());
+  EXPECT_EQ("check1", new_div->input(0));
+  EXPECT_EQ("check1", new_div->input(1));
+  EXPECT_EQ("^assert1", new_div->input(2));
+  EXPECT_EQ("^assert1", new_div->input(3));
+
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
@@ -132,32 +253,34 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch = {"div"};
+  item.fetch = {"div1"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(4, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_c2 = output.node(1);
-  EXPECT_EQ("c2", new_c2.name());
-  const NodeDef& new_mul1 = output.node(2);
-  EXPECT_EQ("mul1", new_mul1.name());
-  EXPECT_EQ(2, new_mul1.input_size());
-  EXPECT_EQ("c1", new_mul1.input(0));
-  EXPECT_EQ("c2", new_mul1.input(1));
-  const NodeDef& new_div1 = output.node(3);
-  EXPECT_EQ("div1", new_div1.name());
-  EXPECT_EQ(2, new_div1.input_size());
-  EXPECT_EQ("mul1", new_div1.input(0));
-  EXPECT_EQ("mul1", new_div1.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+  const NodeDef* new_c2 = node_map.GetNode("c2");
+  ASSERT_NE(new_c2, nullptr);
+  const NodeDef* new_mul1 = node_map.GetNode("mul1");
+  ASSERT_NE(new_mul1, nullptr);
+  EXPECT_EQ(2, new_mul1->input_size());
+  EXPECT_EQ("c1", new_mul1->input(0));
+  EXPECT_EQ("c2", new_mul1->input(1));
+  const NodeDef* new_div1 = node_map.GetNode("div1");
+  ASSERT_NE(new_div1, nullptr);
+  EXPECT_EQ(2, new_div1->input_size());
+  EXPECT_EQ("mul1", new_div1->input(0));
+  EXPECT_EQ("mul1", new_div1->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, MulToSquare) {
@@ -168,6 +291,9 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   Output id = ops::Identity(s.WithOpName("id"), mul);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -182,6 +308,10 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   EXPECT_EQ(2, output.node(4).input_size());
   EXPECT_EQ("c", output.node(4).input(0));
   EXPECT_EQ("^d", output.node(4).input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
@@ -194,6 +324,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   Output id = ops::Identity(s.WithOpName("id"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -204,6 +337,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   EXPECT_EQ("c", output.node(1).input(0));
   EXPECT_EQ("c", output.node(3).input(0));
   EXPECT_EQ("c", output.node(5).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
@@ -216,6 +353,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -229,6 +369,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   EXPECT_EQ(6, output.node_size());
   EXPECT_EQ("squeeze", output.node(5).input(0));
   EXPECT_EQ("c", output.node(2).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
@@ -243,6 +387,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -260,6 +408,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
       EXPECT_EQ(original.input(j), optimized.input(j));
     }
   }
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
@@ -271,28 +423,35 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_const = output.node(3);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(4);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  const NodeDef& new_id = output.node(2);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
@@ -305,29 +464,36 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(6, output.node_size());
-  const NodeDef& new_const = output.node(4);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(5);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  EXPECT_EQ("^y", new_mul.input(2));
-  const NodeDef& new_id = output.node(3);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+  EXPECT_EQ("^y", new_mul->input(2));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
@@ -343,6 +509,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   const std::vector<string> devices{
       "/device:CPU:0", "/device:GPU:0", "/device:CPU:0", "/device:GPU:1",
       "/device:CPU:0", "/device:CPU:0", "/device:CPU:0",
@@ -350,61 +517,68 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   for (int i = 0; i < item.graph.node_size(); ++i) {
     item.graph.mutable_node(i)->set_device(devices[i]);
   }
+
   ArithmeticOptimizer optimizer;
+  DisableAddToAddNCombining(&optimizer);
+
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
 
-  EXPECT_EQ(17, output.node_size());
-  // The graph gets optimized to
+  // We expect the following rewrite(s) to occur:
+  //
   // Mul(p,
-  //     Add(Add(Const(2), Const(2)),
-  //         Add(Const(2), Const(2))))
+  //     Add_6(Add_4(Const(2), Const(2)),
+  //           Add_5(Const(2), Const(2))))
+  NodeMap node_map(&output);
+
   EXPECT_EQ(17, output.node_size());
-  for (const auto& node : output.node()) {
-    if ("id" == node.name()) {
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ(OptimizedName("Add_6_hoist_mul"), node.input(0));
-    } else if (OptimizedName("Add_6_hoist_mul") == node.name()) {
-      EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("Placeholder", node.input(0));
-      EXPECT_EQ(OptimizedName("Add_6_hoist_add"), node.input(1));
-    } else if (OptimizedName("Add_6_hoist_add") == node.name()) {
-      EXPECT_EQ("Add", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ(OptimizedName("Add_4_hoist_add"), node.input(0));
-      EXPECT_EQ(OptimizedName("Add_5_hoist_add"), node.input(1));
-      EXPECT_EQ("^Placeholder", node.input(2));
-    } else if (OptimizedName("Add_4_hoist_add") == node.name()) {
-      EXPECT_EQ("Add", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ(OptimizedName("Add_const"), node.input(0));
-      EXPECT_EQ(OptimizedName("Add_1_const"), node.input(1));
-      EXPECT_EQ("^Placeholder", node.input(2));
-    } else if (OptimizedName("Add_5_hoist_add") == node.name()) {
-      EXPECT_EQ("Add", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ(OptimizedName("Add_const"), node.input(0));
-      EXPECT_EQ(OptimizedName("Add_1_const"), node.input(1));
-      EXPECT_EQ("^Placeholder", node.input(2));
-    } else if (OptimizedName("Add_const") == node.name()) {
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^Placeholder", node.input(0));
-    } else if (OptimizedName("Add_1_const") == node.name()) {
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^Placeholder", node.input(0));
-    }
-  }
+
+  const NodeDef* id_node = node_map.GetNode("id");
+  ASSERT_NE(id_node, nullptr);
+  EXPECT_EQ(1, id_node->input_size());
+  EXPECT_EQ(HoistMulName("Add_6"), id_node->input(0));
+
+  const NodeDef* mul_node = node_map.GetNode(HoistMulName("Add_6"));
+  ASSERT_NE(mul_node, nullptr);
+  EXPECT_EQ(2, mul_node->input_size());
+  EXPECT_EQ("Placeholder", mul_node->input(0));
+  EXPECT_EQ(HoistAddName("Add_6"), mul_node->input(1));
+
+  const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6"));
+  ASSERT_NE(add_6_node, nullptr);
+  EXPECT_EQ(2, add_6_node->input_size());
+  EXPECT_EQ(HoistAddName("Add_4"), add_6_node->input(0));
+  EXPECT_EQ(HoistAddName("Add_5"), add_6_node->input(1));
+
+  const NodeDef* add_4_node = node_map.GetNode(HoistAddName("Add_4"));
+  ASSERT_NE(add_4_node, nullptr);
+  EXPECT_EQ("Add", add_4_node->op());
+  EXPECT_EQ(2, add_4_node->input_size());
+  EXPECT_EQ(OptimizedName("Add_const"), add_4_node->input(0));
+  EXPECT_EQ(OptimizedName("Add_1_const"), add_4_node->input(1));
+
+  const NodeDef* add_5_node = node_map.GetNode(HoistAddName("Add_5"));
+  ASSERT_NE(add_5_node, nullptr);
+  EXPECT_EQ("Add", add_5_node->op());
+  EXPECT_EQ(2, add_5_node->input_size());
+  EXPECT_EQ(OptimizedName("Add_const"), add_5_node->input(0));
+  EXPECT_EQ(OptimizedName("Add_1_const"), add_5_node->input(1));
+
+  const NodeDef* add_const_node = node_map.GetNode(OptimizedName("Add_const"));
+  ASSERT_NE(add_const_node, nullptr);
+  EXPECT_EQ("Const", add_const_node->op());
+  EXPECT_EQ(1, add_const_node->input_size());
+  EXPECT_EQ("^Placeholder", add_const_node->input(0));
+
+  const NodeDef* add_1_const_node =
+      node_map.GetNode(OptimizedName("Add_1_const"));
+  ASSERT_NE(add_1_const_node, nullptr);
+  EXPECT_EQ("Const", add_1_const_node->op());
+  EXPECT_EQ(1, add_1_const_node->input_size());
+  EXPECT_EQ("^Placeholder", add_1_const_node->input(0));
 }
 
-TEST_F(ArithmeticOptimizerTest, HoistFactor) {
+TEST_F(ArithmeticOptimizerTest, HoistFactorMul) {
   for (bool matching_shapes : {true, false}) {
     for (bool use_addn : {true, false}) {
       tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -422,31 +596,125 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
                                    ops::Add(s.WithOpName("add"), mul1, mul2));
 
       GrapplerItem item;
+      item.fetch = {"id"};
       TF_CHECK_OK(s.ToGraphDef(&item.graph));
+      auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+      EXPECT_EQ(1, tensors_expected.size());
       ArithmeticOptimizer optimizer;
+      EnableOnlyHoistCommonFactor(&optimizer);
+
       GraphDef output;
-      Status status = optimizer.Optimize(nullptr, item, &output);
-      TF_EXPECT_OK(status);
-      // Run the optimizer twice to make sure the rewrite is idempotent.
-      item.graph.Swap(&output);
-      status = optimizer.Optimize(nullptr, item, &output);
-      TF_EXPECT_OK(status);
+      OptimizeTwice(&optimizer, &item, &output);
+
+      // We expect the following rewrite(s) to occur:
+      //
+      //        Add                 Mul
+      //      /    \               /   \
+      //    Mul    Mul       ->   x    Add
+      //    / \    / \                 / \
+      //   x  y1  y2  x              y1   y2
+      //
+      // If "root" op is AddN and shapes does not match, this rewrite is not
+      // possible and graph should stay intact.
+      NodeMap node_map(&output);
 
       if (use_addn && !matching_shapes) {
         VerifyGraphsMatch(item.graph, output, __LINE__);
       } else {
         EXPECT_EQ(9, output.node_size());
-        const NodeDef& new_add = output.node(8);
-        EXPECT_EQ(OptimizedName("add_hoist_add"), new_add.name());
-        EXPECT_EQ("y1", new_add.input(0));
-        EXPECT_EQ("y2", new_add.input(1));
-        const NodeDef& new_mul = output.node(7);
-        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_mul.name());
-        EXPECT_EQ("x", new_mul.input(0));
-        EXPECT_EQ(OptimizedName("add_hoist_add"), new_mul.input(1));
-        const NodeDef& new_id = output.node(6);
-        EXPECT_EQ("id", new_id.name());
-        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_id.input(0));
+
+        const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
+        ASSERT_NE(new_add_node, nullptr) << "Hoisted Add node not found";
+        EXPECT_EQ("y1", new_add_node->input(0));
+        EXPECT_EQ("y2", new_add_node->input(1));
+
+        const NodeDef* new_mul_node = node_map.GetNode(HoistMulName("add"));
+        ASSERT_NE(new_mul_node, nullptr) << "Hoisted Mul node not found";
+        EXPECT_EQ("x", new_mul_node->input(0));
+        EXPECT_EQ(new_add_node->name(), new_mul_node->input(1));
+
+        const NodeDef* id_node = node_map.GetNode("id");
+        ASSERT_NE(id_node, nullptr) << "Id node not found";
+        EXPECT_EQ("id", id_node->name());
+        EXPECT_EQ(HoistMulName("add"), id_node->input(0));
+      }
+      auto tensors = EvaluateNodes(output, item.fetch);
+      EXPECT_EQ(1, tensors.size());
+      test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, HoistFactorDiv) {
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      for (bool use_ints : {true, false}) {
+        tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+        Output x = use_ints
+                       ? ops::Const(s.WithOpName("x"), {1, 2}, {1, 2})
+                       : ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+        Output y1 = use_ints
+                        ? ops::Const(s.WithOpName("y1"), {3, 4}, {1, 2})
+                        : ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+        Output y2;
+        if (matching_shapes) {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5, 6}, {1, 2})
+                        : ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2});
+        } else {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5}, {1, 1})
+                        : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+        }
+        Output div1 = ops::Div(s.WithOpName("div1"), y1, x);
+        Output div2 = ops::Div(s.WithOpName("div2"), y2, x);
+        Output id =
+            use_addn
+                ? ops::Identity(s.WithOpName("id"),
+                                ops::AddN(s.WithOpName("add"), {div1, div2}))
+                : ops::Identity(s.WithOpName("id"),
+                                ops::Add(s.WithOpName("add"), div1, div2));
+
+        GrapplerItem item;
+        item.fetch = {"id"};
+        TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+        ArithmeticOptimizer optimizer;
+        EnableOnlyHoistCommonFactor(&optimizer);
+
+        GraphDef output;
+        OptimizeTwice(&optimizer, &item, &output);
+
+        // We expect the following rewrite(s) to occur:
+        //
+        //        Add                 Div
+        //      /    \               /   \
+        //    Div    Div       ->  Add    x
+        //    / \    / \           / \
+        //   y1  x  y2  x         y1  y2
+        //
+        // If "root" op is AddN and shapes does not match, this rewrite is not
+        // possible and graph should stay intact.
+        NodeMap node_map(&output);
+
+        if ((use_addn && !matching_shapes) || use_ints) {
+          VerifyGraphsMatch(item.graph, output, __LINE__);
+        } else {
+          EXPECT_EQ(9, output.node_size());
+
+          const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
+          ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+          EXPECT_EQ("y1", new_add_node->input(0));
+          EXPECT_EQ("y2", new_add_node->input(1));
+
+          const NodeDef* new_div_node = node_map.GetNode(HoistDivName("add"));
+          ASSERT_TRUE(new_div_node != nullptr) << "Hoisted Div node not found";
+          EXPECT_EQ(new_add_node->name(), new_div_node->input(0));
+          EXPECT_EQ("x", new_div_node->input(1));
+
+          const NodeDef* id_node = node_map.GetNode("id");
+          ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+          EXPECT_EQ("id", id_node->name());
+          EXPECT_EQ(HoistDivName("add"), id_node->input(0));
+        }
       }
     }
   }
@@ -454,35 +722,40 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
   Output transp = ops::Transpose(s.WithOpName("trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  std::vector<string> fetch = {"trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("trans_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* trans_fused_node =
+      node_map.GetNode(OptimizedName("trans_fused"));
+  ASSERT_NE(trans_fused_node, nullptr);
+  EXPECT_EQ("ConjugateTranspose", trans_fused_node->op());
+  EXPECT_EQ("z", trans_fused_node->input(0));
+  EXPECT_EQ("perm", trans_fused_node->input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
@@ -490,44 +763,56 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
       ops::ConjugateTranspose(s.WithOpName("conjugate_trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conjugate_trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conjugate_trans_fused"), output.node(6).name());
-  EXPECT_EQ("Transpose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conjugate_trans_fused_node =
+      node_map.GetNode(OptimizedName("conjugate_trans_fused"));
+  EXPECT_EQ("Transpose", conjugate_trans_fused_node->op());
+  EXPECT_EQ("z", conjugate_trans_fused_node->input(0));
+  EXPECT_EQ("perm", conjugate_trans_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output trans = ops::Transpose(s.WithOpName("trans"), z, perm);
   Output conj = ops::Conj(s.WithOpName("conj"), trans);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conj"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conj_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conj_fused_node =
+      node_map.GetNode(OptimizedName("conj_fused"));
+  EXPECT_EQ("ConjugateTranspose", conj_fused_node->op());
+  EXPECT_EQ("z", conj_fused_node->input(0));
+  EXPECT_EQ("perm", conj_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
@@ -549,27 +834,32 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
     }
     GrapplerItem item;
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    std::vector<string> fetch = {"matmul"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
 
     ArithmeticOptimizer optimizer;
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
-    // Run the optimizer twice to make sure the rewrite is idempotent.
-    item.graph.Swap(&output);
-    status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
+    OptimizeTwice(&optimizer, &item, &output);
+    NodeMap node_map(&output);
 
     EXPECT_EQ(7, output.node_size());
-    EXPECT_EQ(OptimizedName("matmul_fused"), output.node(6).name());
-    EXPECT_EQ("a", output.node(6).input(0));
-    EXPECT_EQ("b", output.node(6).input(1));
+
+    const NodeDef* matmul_fused_node =
+        node_map.GetNode(OptimizedName("matmul_fused"));
+    ASSERT_NE(matmul_fused_node, nullptr);
+    EXPECT_EQ("a", matmul_fused_node->input(0));
+    EXPECT_EQ("b", matmul_fused_node->input(1));
     if (matmul_type == "BatchMatMul") {
-      EXPECT_TRUE(output.node(6).attr().at("adj_x").b());
-      EXPECT_TRUE(output.node(6).attr().at("adj_y").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_x").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_y").b());
     } else {
-      EXPECT_TRUE(output.node(6).attr().at("transpose_a").b());
-      EXPECT_TRUE(output.node(6).attr().at("transpose_b").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_a").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_b").b());
     }
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
   }
 }
 
@@ -591,6 +881,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   Output matmul = ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"matmul"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -603,6 +896,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   EXPECT_EQ("b", output.node(10).input(1));
   EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
   EXPECT_TRUE(output.node(10).attr().at("adj_y").b());
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<complex64>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
@@ -623,16 +919,20 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
   item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(0, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Reshape"; }));
+  EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
@@ -647,16 +947,20 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({8, 3, 28, 28}));
+  item.feed = {{"Placeholder", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
   item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(1, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Reshape"; }));
+  EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
@@ -669,16 +973,13 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
   item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(1, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Reshape"; }));
+  EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
 }
 
 TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
@@ -702,16 +1003,20 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_INT8>(TensorShape({8, 3, 28, 28, 4}));
+  item.feed = {{"nchw_vect_c", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
   item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(1, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Reshape"; }));
+  EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
@@ -780,7 +1085,7 @@ TEST_F(ArithmeticOptimizerTest, NoReorderTransposeCast) {
   EXPECT_EQ(1, num_transposes);
 }
 
-TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposes) {
+TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs_shape =
       ops::Const(s.WithOpName("inputs_shape"), {8, 3, 28, 28}, {4});
@@ -788,30 +1093,32 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposes) {
       ops::RandomUniform(s.WithOpName("inputs"), inputs_shape, DT_FLOAT);
   Output perm1 = ops::Const(s.WithOpName("perm1"), {0, 2, 3, 1}, {4});
   Output perm2 = ops::Const(s.WithOpName("perm2"), {0, 3, 1, 2}, {4});
+  Output perm3 = ops::Const(s.WithOpName("perm2"), {0, 1, 2, 3}, {4});
   Output transpose1 = ops::Transpose(s.WithOpName("transpose1"), inputs, perm1);
   Output transpose2 =
       ops::Transpose(s.WithOpName("transpose2"), transpose1, perm2);
-  Output outputs = ops::Identity(s.WithOpName("outputs"), transpose2);
+  Output transpose3 = ops::Transpose(s.WithOpName("transpose3"), inputs, perm3);
+  Output id1 = ops::Identity(s.WithOpName("id1"), transpose2);
+  Output id2 = ops::Identity(s.WithOpName("id2"), transpose3);
 
   GrapplerItem item;
-  item.fetch = {"outputs"};
+  item.fetch = {"id1", "id2"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
 
   std::set<string> nodes_after_optimization;
   for (const NodeDef& node : output.node()) {
     nodes_after_optimization.insert(node.name());
   }
   EXPECT_EQ(nodes_after_optimization,
-            std::set<string>({"inputs_shape", "inputs", "outputs"}));
+            std::set<string>({"id1", "id2", "inputs_shape", "inputs"}));
 }
 
-TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposesMultipleOutputs) {
+TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposesMultipleOutputs) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs_shape =
       ops::Const(s.WithOpName("inputs_shape"), {8, 9, 28, 28}, {4});
@@ -831,10 +1138,9 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposesMultipleOutputs) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
 
   for (const NodeDef& node : output.node()) {
     if (node.op() == "Concat") {
@@ -858,10 +1164,11 @@ TEST_F(ArithmeticOptimizerTest, RemoveTransposesWithControlDependency) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
 
   NodeMap node_map(&output);
   const NodeDef* outputs_node = node_map.GetNode("outputs");
@@ -887,10 +1194,9 @@ TEST_F(ArithmeticOptimizerTest, NotRemoveTransposes) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(6, output.node_size());
 }
@@ -1105,10 +1411,10 @@ TEST_F(ArithmeticOptimizerTest, OptimizeMultipleMulTransposeConv) {
 
 TEST_F(ArithmeticOptimizerTest, CombineBitcasts) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output inputs =
-      ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({2, 3}));
-  Output bc1 = ops::Bitcast(s, inputs, DT_QINT8);
-  Output bc2 = ops::Bitcast(s, bc1, DT_INT8);
+  Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_UINT8,
+                                   ops::Placeholder::Shape({2, 3}));
+  Output bc1 = ops::Bitcast(s.WithOpName("bc1"), inputs, DT_QINT8);
+  Output bc2 = ops::Bitcast(s.WithOpName("bc2"), bc1, DT_INT8);
   Output outputs = ops::Identity(s.WithOpName("outputs"), bc2);
 
   GrapplerItem item;
@@ -1116,18 +1422,22 @@ TEST_F(ArithmeticOptimizerTest, CombineBitcasts) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantBitcast(&optimizer);
 
-  EXPECT_EQ(1, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Bitcast"; }));
+  OptimizeAndPrune(&optimizer, &item, &output);
+  NodeMap node_map(&output);
+
+  // Bitcasts combined into a single op and inputs redirected to updated Bitcast
+  EXPECT_EQ(3, output.node_size());
+  EXPECT_EQ(1, CountOpNodes(output, "Bitcast"));
+  EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "bc2"));
 }
 
 TEST_F(ArithmeticOptimizerTest, CombineAndRemoveBitcasts) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output inputs = ops::Placeholder(s, DT_INT8, ops::Placeholder::Shape({2, 3}));
+  Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_INT8,
+                                   ops::Placeholder::Shape({2, 3}));
   Output bc1 = ops::Bitcast(s, inputs, DT_QINT8);
   Output bc2 = ops::Bitcast(s, bc1, DT_INT8);
   Output outputs = ops::Identity(s.WithOpName("outputs"), bc2);
@@ -1135,35 +1445,791 @@ TEST_F(ArithmeticOptimizerTest, CombineAndRemoveBitcasts) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantBitcast(&optimizer);
 
-  EXPECT_EQ(0, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Bitcast"; }));
+  OptimizeAndPrune(&optimizer, &item, &output);
+  NodeMap node_map(&output);
+
+  // Bitcasts removed and inputs redirected to outputs
+  EXPECT_EQ(2, output.node_size());
+  EXPECT_EQ(0, CountOpNodes(output, "Bitcast"));
+  EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "outputs"));
 }
 
 TEST_F(ArithmeticOptimizerTest, RemoveRedundantCast) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output inputs = ops::Placeholder(s, DT_INT8, ops::Placeholder::Shape({2, 3}));
+  Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_INT8,
+                                   ops::Placeholder::Shape({2, 3}));
   Output cast = ops::Cast(s, inputs, DT_INT8);
   Output outputs = ops::Identity(s.WithOpName("outputs"), cast);
 
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantCast(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
-  EXPECT_EQ(0, std::count_if(
-                   output.node().begin(), output.node().end(),
-                   [](const NodeDef& node) { return node.op() == "Cast"; }));
+  // Cast removed and inputs redirected to outputs
+  EXPECT_EQ(2, output.node_size());
+  EXPECT_EQ(0, CountOpNodes(output, "Cast"));
+  EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "outputs"));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  tensorflow::Scope sx = s.NewSubScope("x");
+  tensorflow::Scope sy = s.NewSubScope("y");
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {2, 2}, DT_FLOAT);
+  auto add_ab = ops::Add(sx.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(sy.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     +
+  //    / \
+  //   +   c      -->    AddN(a, b, c)
+  //  / \
+  // a   b
+  EXPECT_EQ(5, output.node_size());
+
+  NodeMap node_map(&output);
+
+  // check add tree was replaced with AddN
+  const NodeDef* collapsed_add =
+      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
+
+  EXPECT_EQ("AddN", collapsed_add->op());
+  EXPECT_EQ(3, collapsed_add->input_size());
+  EXPECT_EQ("a", collapsed_add->input(0));
+  EXPECT_EQ("b", collapsed_add->input(1));
+  EXPECT_EQ("c", collapsed_add->input(2));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+
+  EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {2, 2}, DT_FLOAT);
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto x = ops::Variable(s.WithOpName("x"), {2, 2}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {2, 2}, DT_FLOAT);
+  auto z = ops::Variable(s.WithOpName("z"), {2, 2}, DT_FLOAT);
+  auto add_xy = ops::Add(s.WithOpName("Add_xy"), x, y);
+  auto add_xyz = ops::Add(s.WithOpName("Add_xyz"), add_xy, z);
+
+  auto mul = ops::Multiply(s.WithOpName("Mul"), add_abc, add_xyz);
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //         *
+  //      /     \
+  //     +       +                        *
+  //    / \     / \                    /     \
+  //   +   c   x   + -->    AddN(a, b, c)  AddN(x, y, z))
+  //  / \         / \
+  // a   b       y   z
+  EXPECT_EQ(10, output.node_size());
+
+  NodeMap node_map(&output);
+
+  // check left Add subtree replaced with AddN
+  const NodeDef* collapsed_left =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_left, nullptr);
+
+  EXPECT_EQ("AddN", collapsed_left->op());
+  EXPECT_EQ(3, collapsed_left->input_size());
+  EXPECT_EQ("a", collapsed_left->input(0));
+  EXPECT_EQ("b", collapsed_left->input(1));
+  EXPECT_EQ("c", collapsed_left->input(2));
+
+  // check right Add subtree replaced with AddN
+  const NodeDef* collapsed_right =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz");
+  ASSERT_NE(collapsed_right, nullptr);
+
+  EXPECT_EQ("AddN", collapsed_right->op());
+  EXPECT_EQ(3, collapsed_right->input_size());
+  EXPECT_EQ("x", collapsed_right->input(0));
+  EXPECT_EQ("y", collapsed_right->input(1));
+  EXPECT_EQ("z", collapsed_right->input(2));
+
+  // check that Mul inputs re-wired to new Nodes
+  const NodeDef* updated_mul = node_map.GetNode("Mul");
+  ASSERT_NE(updated_mul, nullptr);
+
+  EXPECT_EQ("Mul", updated_mul->op());
+  EXPECT_EQ(2, updated_mul->input_size());
+  EXPECT_EQ(collapsed_left->name(), updated_mul->input(0));
+  EXPECT_EQ(collapsed_right->name(), updated_mul->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {2, 2}, DT_FLOAT);
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_bc = ops::Add(s.WithOpName("Add_bc"), b, c);
+  auto add_all = ops::Add(s.WithOpName("Add_all"), add_ab, add_bc);
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_all);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     +
+  //    / \
+  //   +   +     -->    AddN(a, b, b, c)
+  //  / \ / \                   ^
+  // a   b   c                  b added twice!
+  EXPECT_EQ(5, output.node_size());
+
+  NodeMap node_map(&output);
+
+  // check Add tree replaced with AddN
+  const NodeDef* collapsed_add =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_all");
+  ASSERT_NE(collapsed_add, nullptr);
+
+  EXPECT_EQ("AddN", collapsed_add->op());
+  EXPECT_EQ(4, collapsed_add->input_size());
+  EXPECT_EQ("a", collapsed_add->input(0));
+  EXPECT_EQ("b", collapsed_add->input(1));
+  EXPECT_EQ("b", collapsed_add->input(2));
+  EXPECT_EQ("c", collapsed_add->input(3));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // unknown input shape propagated symbolically through the graph
+  auto input = ops::Variable(s.WithOpName("input"), {-1, 2}, DT_FLOAT);
+
+  // [a, b, c] have symbolically equal shapes
+  auto a = ops::Sqrt(s.WithOpName("a"), input);
+  auto b = ops::Square(s.WithOpName("b"), input);
+  auto c = ops::Round(s.WithOpName("c"), input);
+
+  // [add_ab, add_abc] shape must be inferred from inputs
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     +
+  //    / \
+  //   +   c      -->    AddN(a, b, c)
+  //  / \
+  // a   b
+  EXPECT_EQ(6, output.node_size());
+
+  NodeMap node_map(&output);
+
+  // check add tree was replaced with AddN
+  const NodeDef* collapsed_add =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
+  EXPECT_EQ("AddN", collapsed_add->op());
+  EXPECT_EQ(3, collapsed_add->input_size());
+  EXPECT_EQ("a", collapsed_add->input(0));
+  EXPECT_EQ("b", collapsed_add->input(1));
+  EXPECT_EQ("c", collapsed_add->input(2));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32, 32, 32}, DT_FLOAT);
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto x = ops::Variable(s.WithOpName("x"), {32}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {32, 32}, DT_FLOAT);
+  auto z = ops::Variable(s.WithOpName("z"), {32, 32, 32}, DT_FLOAT);
+  auto add_xy = ops::Add(s.WithOpName("Add_xy"), x, y);
+  auto add_xyz = ops::Add(s.WithOpName("Add_xyz"), add_xy, z);
+
+  auto add_all = ops::Add(s.WithOpName("AddAll"), add_abc, add_xyz);
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_all);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //  1) [a, x], [b, y], [c, z] - aggregate same shapes first
+  //  2) Build an aggregation tree minimizing cost of broadcast
+  //
+  //         +                              +
+  //      /     \                       /       \
+  //     +       +                     +       AddN(c, z)
+  //    / \     / \                 /     \
+  //   +   c   x   + -->    AddN(a, x)  AddN(b, y)
+  //  / \         / \
+  // a   b       y   z
+  EXPECT_EQ(12, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_AddAll";
+  string outer_0_add_name =
+      "ArithmeticOptimizer/AddOpsRewrite_Internal_0_AddAll";
+  string inner_0_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_AddAll";
+  string inner_1_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_1_AddAll";
+  string inner_2_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_2_AddAll";
+
+  // Add [a, x] first
+  const NodeDef* add_ax_node = node_map.GetNode(inner_0_add_name);
+  ASSERT_NE(add_ax_node, nullptr);
+  EXPECT_EQ("AddN", add_ax_node->op());
+  EXPECT_EQ(2, add_ax_node->input_size());
+  EXPECT_EQ("a", add_ax_node->input(0));
+  EXPECT_EQ("x", add_ax_node->input(1));
+
+  // Then add [b, y]
+  const NodeDef* add_by_node = node_map.GetNode(inner_1_add_name);
+  ASSERT_NE(add_by_node, nullptr);
+  EXPECT_EQ("AddN", add_by_node->op());
+  EXPECT_EQ(2, add_by_node->input_size());
+  EXPECT_EQ("b", add_by_node->input(0));
+  EXPECT_EQ("y", add_by_node->input(1));
+
+  // Then add [c, z]
+  const NodeDef* add_cz_node = node_map.GetNode(inner_2_add_name);
+  ASSERT_NE(add_cz_node, nullptr);
+  EXPECT_EQ("AddN", add_cz_node->op());
+  EXPECT_EQ(2, add_cz_node->input_size());
+  EXPECT_EQ("c", add_cz_node->input(0));
+  EXPECT_EQ("z", add_cz_node->input(1));
+
+  // Then add results together starting from smaller shapes [a, x] + [b, y]
+  const NodeDef* outer_0_node = node_map.GetNode(outer_0_add_name);
+  ASSERT_NE(outer_0_node, nullptr);
+  EXPECT_EQ("Add", outer_0_node->op());
+  EXPECT_EQ(2, outer_0_node->input_size());
+  EXPECT_EQ(inner_0_add_name, outer_0_node->input(0));
+  EXPECT_EQ(inner_1_add_name, outer_0_node->input(1));
+
+  // And finally top level Add node
+  const NodeDef* outer_node = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_node, nullptr);
+  EXPECT_EQ("Add", outer_node->op());
+  EXPECT_EQ(2, outer_node->input_size());
+  EXPECT_EQ(outer_0_add_name, outer_node->input(0));
+  EXPECT_EQ(inner_2_add_name, outer_node->input(1));
+
+  // And outputs reading new top level Add node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // We have a small input with one unknown dimension
+  auto small = ops::Variable(s.WithOpName("small"), {-1, 1, 1}, DT_FLOAT);
+
+  // And second input which is larger, but has the same unknown dimension
+  // device spec prevents this node from rewriting
+  auto d = "/job:do_not_rewrite_me";
+  auto v = ops::Variable(s.WithOpName("v"), {1, 32, 32}, DT_FLOAT);
+  auto large = ops::Add(s.WithOpName("large").WithDevice(d), small, v);
+
+  // [a, c] have {?, 1, 1} shape, [b] has {?, 32, 32}
+  auto a = ops::Sqrt(s.WithOpName("a"), small);
+  auto b = ops::Square(s.WithOpName("b"), large);
+  auto c = ops::Round(s.WithOpName("c"), small);
+
+  // [add_ab, add_abc] shape must be inferred from inputs
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: it's much cheaper to add small
+  // tensors, and do the broadcast just once
+  //
+  //     +                  +
+  //    / \                / \
+  //   +   c      -->     +   b
+  //  / \                / \
+  // a   b              a   c
+  EXPECT_EQ(9, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_Add_abc";
+  string inner_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_Add_abc";
+
+  // outer Add node
+  const NodeDef* outer_add = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_add, nullptr);
+  EXPECT_EQ("Add", outer_add->op());
+  EXPECT_EQ(inner_add_name, outer_add->input(0));
+  EXPECT_EQ("b", outer_add->input(1));
+
+  // inner AddN node
+  const NodeDef* inner_add = node_map.GetNode(inner_add_name);
+  ASSERT_NE(inner_add, nullptr);
+  EXPECT_EQ(2, inner_add->input_size());
+  EXPECT_EQ("a", inner_add->input(0));
+  EXPECT_EQ("c", inner_add->input(1));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Variable(s.WithOpName("x"), {2, 2}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {2, 2}, DT_FLOAT);
+  Output neg_x = ops::Neg(s.WithOpName("Neg_x"), x);
+  Output neg_y = ops::Neg(s.WithOpName("Neg_y"), y);
+  Output add_x_y = ops::Add(s.WithOpName("Add_x_y"), x, y);
+  Output add_negx_y = ops::Add(s.WithOpName("Add_negx_y"), neg_x, y);
+  Output add_x_negy = ops::Add(s.WithOpName("Add_x_negy"), x, neg_y);
+  Output add_negx_negy = ops::Add(s.WithOpName("Add_negx_negy"), neg_x, neg_y);
+  Output sub_x_y = ops::Sub(s.WithOpName("Sub_x_y"), x, y);
+  Output sub_negx_y = ops::Sub(s.WithOpName("Sub_negx_y"), neg_x, y);
+  Output sub_x_negy = ops::Sub(s.WithOpName("Sub_x_negy"), x, neg_y);
+  Output sub_negx_negy = ops::Sub(s.WithOpName("Sub_negx_negy"), neg_x, neg_y);
+  auto add_all = ops::AddN(s.WithOpName("add_all"),
+                           {add_x_y, add_negx_y, add_x_negy, add_negx_negy,
+                            sub_x_y, sub_negx_y, sub_x_negy, sub_negx_negy});
+
+  GrapplerItem item;
+  item.fetch = {"add_all"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveNegation(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  int found = 0;
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "Add_negx_y") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("^Neg_x", node.input(2));
+    } else if (node.name() == "Add_x_negy") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Neg_y", node.input(2));
+    } else if (node.name() == "Add_negx_negy") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("Neg_y", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("^Neg_x", node.input(2));
+    } else if (node.name() == "Sub_x_negy") {
+      ++found;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Neg_y", node.input(2));
+    } else if (node.name() == "Sub_negx_negy") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("^Neg_y", node.input(2));
+      EXPECT_EQ("^Neg_x", node.input(3));
+    }
+  }
+  EXPECT_EQ(5, found);
+}
+
+TEST_F(ArithmeticOptimizerTest, ConvertSqrtDivToRsqrtMul) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  auto y = ops::Const(s.WithOpName("y"), {3.0f, 4.0f}, {1, 2});
+  Output sqrt_y = ops::Sqrt(s.WithOpName("sqrt_y"), y);
+  Output div_x_sqrt_y = ops::Div(s.WithOpName("output"), x, sqrt_y);
+
+  GrapplerItem item;
+  item.fetch = {"output"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlySqrtDivToRsqrtMul(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "output") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("sqrt_y", node.input(1));
+    } else if (node.name() == "sqrt_y") {
+      EXPECT_EQ("Rsqrt", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul2);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     *                  *
+  //    / \                / \
+  //   *   c      -->     *   b
+  //  / \                / \
+  // a   b              a   c
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("b", mul2_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("d"), {32}, DT_FLOAT);
+  auto e = ops::Variable(s.WithOpName("e"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul2, d);
+  auto mul4 = ops::Mul(s.WithOpName("mul4"), mul3, e);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul4);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: Graph is "flattened" and
+  // largest shape pushed to the top.
+  //
+  //          *
+  //        /   \
+  //       *     e                *
+  //      /  \                  /   \
+  //     *    d               *      b
+  //    / \                 /  \
+  //   *   c      -->     *      *
+  //  / \                / \    / \
+  // a   b              a   c  d   e
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("d", mul2_node->input(0));
+  EXPECT_EQ("e", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("mul1", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+
+  const NodeDef* mul4_node = node_map.GetNode("mul4");
+  ASSERT_NE(mul4_node, nullptr);
+  EXPECT_EQ("mul3", mul4_node->input(0));
+  EXPECT_EQ("b", mul4_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // [a, b, c] - scalars, [d] - matrix
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("D"), {32, 32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), c, d);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul1, mul2);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul3);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //                              *
+  //                            /  \
+  //       *                   *    D
+  //     /   \                / \
+  //    *     *      ->      *   c
+  //   / \   / \            / \
+  //  a   b c   D          a   b
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("b", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul2", mul2_node->input(0));
+  EXPECT_EQ("c", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("D", mul3_node->input(0));
+  EXPECT_EQ("mul1", mul3_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, HoistCWiseUnaryFromConcat) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  Output b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT);
+  Output c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  Output axis = ops::Const(s.WithOpName("axis"), 0, {});
+  Output ctrl1 = ops::Const(s.WithOpName("ctrl1"), 1, {});
+  Output ctrl2 = ops::Const(s.WithOpName("ctrl2"), 2, {});
+  Output ctrl3 = ops::Const(s.WithOpName("ctrl3"), 3, {});
+  // Test case with chains of length 1.
+  Output sin_a =
+      ops::Sin(s.WithOpName("sin_a").WithControlDependencies(ctrl3), a);
+  Output exp_a =
+      ops::Exp(s.WithOpName("exp_a").WithControlDependencies(ctrl1), sin_a);
+  Output exp_b = ops::Exp(s.WithOpName("exp_b"), b);
+  Output exp_c =
+      ops::Exp(s.WithOpName("exp_c").WithControlDependencies(ctrl2), c);
+  Output concat =
+      ops::Concat(s.WithOpName("concat"), {exp_a, exp_b, exp_c}, axis);
+  Output id = ops::Identity(s.WithOpName("id"), concat);
+
+  // Test case with chains of length 2.
+  Output exp_a2 =
+      ops::Exp(s.WithOpName("exp_a2").WithControlDependencies(ctrl1), sin_a);
+  Output exp_b2 = ops::Exp(s.WithOpName("exp_b2"), b);
+  Output exp_c2 =
+      ops::Exp(s.WithOpName("exp_c2").WithControlDependencies(ctrl2), c);
+  Output cos_exp_a2 = ops::Cos(
+      s.WithOpName("cos_exp_a2").WithControlDependencies(ctrl1), exp_a2);
+  Output cos_exp_b2 = ops::Cos(
+      s.WithOpName("cos_exp_b2").WithControlDependencies(ctrl3), exp_b2);
+  Output cos_exp_c2 = ops::Cos(s.WithOpName("cos_exp_c2"), exp_c2);
+  Output concat2 = ops::Concat(s.WithOpName("concat2"),
+                               {cos_exp_a2, cos_exp_b2, cos_exp_c2}, axis);
+  Output id2 = ops::Identity(s.WithOpName("id2"), concat2);
+  GrapplerItem item;
+  item.fetch = {"id", "id2"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyHoistCWiseUnaryFromConcat(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "concat") {
+      EXPECT_EQ(6, node.input_size());
+      EXPECT_EQ("sin_a", node.input(0));
+      EXPECT_EQ("b", node.input(1));
+      EXPECT_EQ("c", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+      EXPECT_EQ("^ctrl1", node.input(4));
+      EXPECT_EQ("^ctrl2", node.input(5));
+      found++;
+    }
+    if (node.name() == "exp_a") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("concat", node.input(0));
+      found++;
+    }
+    if (node.name() == "id") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("exp_a", node.input(0));
+      found++;
+    }
+
+    if (node.name() == "concat2") {
+      EXPECT_EQ(7, node.input_size());
+      EXPECT_EQ("sin_a", node.input(0));
+      EXPECT_EQ("b", node.input(1));
+      EXPECT_EQ("c", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+      EXPECT_EQ("^ctrl1", node.input(4));
+      EXPECT_EQ("^ctrl2", node.input(5));
+      EXPECT_EQ("^ctrl3", node.input(6));
+      found++;
+    }
+    if (node.name() == "exp_a2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("concat2", node.input(0));
+      found++;
+    }
+    if (node.name() == "cos_exp_a2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("exp_a2", node.input(0));
+      found++;
+    }
+    if (node.name() == "id2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("cos_exp_a2", node.input(0));
+      found++;
+    }
+  }
+  EXPECT_EQ(7, found);
 }
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index 8d1098d87755c1257dfebe016a3baf86bfece677..63f6fe5b9db87017c1a76a56c697a3c85555c232 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
 #include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
@@ -63,4 +63,4 @@ class AutoParallel : public GraphOptimizer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index b8a21ea5a15ec1556db47b13db43d19bf070c266..4801f18619e672009a9ed26c50118b3129fb497c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -31,11 +31,15 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/setround.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/bcast.h"
@@ -51,7 +55,14 @@ class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
   explicit EigenThreadPoolWrapper(thread::ThreadPool* pool) : pool_(pool) {}
   ~EigenThreadPoolWrapper() override {}
   void Schedule(std::function<void()> fn) override {
-    pool_->Schedule(std::move(fn));
+    auto wrapped = [=]() {
+      // TensorFlow flushes denormals to zero and rounds to nearest, so we do
+      // the same here.
+      port::ScopedFlushDenormal flush;
+      port::ScopedSetRound round(FE_TONEAREST);
+      fn();
+    };
+    pool_->Schedule(std::move(wrapped));
   }
   int NumThreads() const override { return pool_->NumThreads(); }
   int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
@@ -99,52 +110,37 @@ class DeviceSimple : public DeviceBase {
 };
 
 template <typename T>
-bool AllValuesAre(const TensorProto& tensor, const T& value) {
-  // TensorProto represents the content of the tensor in either <type>_val or
-  // tensor_content.
-  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
-      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
-  if (!tensor_values->empty()) {
-    for (const T& tensor_value : *tensor_values) {
-      if (tensor_value != value) {
-        return false;
-      }
-    }
-    return true;
+bool AllValuesAre(const TensorProto& proto, const T& value) {
+  Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    return false;
   }
-  const auto tensor_content_size = tensor.tensor_content().size();
-  if (tensor_content_size > 0) {
-    CHECK_EQ(0, tensor_content_size % sizeof(T));
-    std::vector<T> raw_values(tensor_content_size / sizeof(T));
-    port::CopyToArray(tensor.tensor_content(),
-                      reinterpret_cast<char*>(raw_values.data()));
-    for (int i = 0; i < tensor_content_size / sizeof(T); ++i) {
-      if (raw_values[i] != value) {
-        return false;
-      }
+  auto values = tensor.flat<T>();
+  for (int i = 0; i < tensor.NumElements(); ++i) {
+    if (values(i) != value) {
+      return false;
     }
-    return true;
   }
-  return false;
+  return true;
 }
 
 // Add new_input as a control input to node if it does not already depend on it.
 // TODO(rmlarsen): Move the following two utility functions to utils.{h,cc} and
 // clean up code that should be using them.
-bool MaybeAddControlInput(const string& new_input, NodeDef* node,
+bool MaybeAddControlInput(const string& ctrl_input, NodeDef* node,
                           GraphDef* graph, NodeMap* node_map) {
   bool already_exists = false;
   for (const string& input : node->input()) {
-    if (input == new_input || AsControlDependency(input) == new_input) {
+    if (input == ctrl_input || AsControlDependency(input) == ctrl_input) {
       already_exists = true;
       break;
     }
   }
   if (!already_exists) {
     const string ctrl_dep =
-        ConstantFolding::AddControlDependency(new_input, graph, node_map);
+        ConstantFolding::AddControlDependency(ctrl_input, graph, node_map);
     node->add_input(ctrl_dep);
-    node_map->AddOutput(NodeName(new_input), node->name());
+    node_map->AddOutput(NodeName(ctrl_input), node->name());
   }
   return !already_exists;
 }
@@ -152,16 +148,27 @@ bool MaybeAddControlInput(const string& new_input, NodeDef* node,
 // Remove old_input as a control input to node.
 bool MaybeRemoveControlInput(const string& old_input, NodeDef* node,
                              GraphDef* graph, NodeMap* node_map) {
+  bool removed_input = false;
+  bool update_node_map = true;
+  const string old_input_ctrl_dep = AsControlDependency(NodeName(old_input));
   for (int i = 0; i < node->input_size(); ++i) {
     const string& input = node->input(i);
-    if (IsControlInput(input) && AsControlDependency(old_input) == input) {
-      node->mutable_input()->SwapElements(i, node->input_size() - 1);
-      node->mutable_input()->RemoveLast();
-      node_map->RemoveOutput(NodeName(old_input), node->name());
-      return true;
+    if (old_input_ctrl_dep == input) {
+      if (IsControlInput(input)) {
+        node->mutable_input()->SwapElements(i, node->input_size() - 1);
+        node->mutable_input()->RemoveLast();
+        removed_input = true;
+      } else {
+        // There is a non-control input from the same node.
+        // Don't remove the output from the NodeMap.
+        update_node_map = false;
+      }
     }
   }
-  return false;
+  if (update_node_map) {
+    node_map->RemoveOutput(NodeName(old_input), node->name());
+  }
+  return removed_input;
 }
 
 }  // namespace
@@ -224,44 +231,41 @@ string ConstantFolding::AddControlDependency(const string& input_name,
   }
 }
 
-Status ConvertShapeToConstant(const string& op, const DataType& type,
-                              const PartialTensorShape& shp, Tensor* value) {
+// Puts the given value into the tensor at the given "flat" index.
+static Status PutValueIntoTensor(const int64 value, const DataType& type,
+                                 const int index, Tensor* tensor) {
+  if (type == DT_INT32) {
+    if (value >= INT_MAX) {
+      return Status(error::INVALID_ARGUMENT, "int32 overflow");
+    }
+    tensor->flat<int32>()(index) = static_cast<int32>(value);
+  } else {
+    tensor->flat<int64>()(index) = value;
+  }
+  return Status::OK();
+}
+
+// Writes the given tensor shape into the given tensor.
+// Op is assumed to be Shape, ShapeN, Size or Rank.
+static Status ConvertShapeToConstant(const string& op, const DataType& type,
+                                     const PartialTensorShape& shp,
+                                     Tensor* tensor) {
   if (op == "Shape" || op == "ShapeN") {
-    *value = Tensor(type, TensorShape({shp.dims()}));
+    *tensor = Tensor(type, TensorShape({shp.dims()}));
     for (int i = 0; i < shp.dims(); ++i) {
-      if (type == DT_INT32) {
-        if (shp.dim_size(i) >= INT_MAX) {
-          return Status(error::INVALID_ARGUMENT, "Invalid dimension size");
-        }
-        value->flat<int32>()(i) = shp.dim_size(i);
-      } else {
-        value->flat<int64>()(i) = shp.dim_size(i);
-      }
+      TF_RETURN_IF_ERROR(PutValueIntoTensor(shp.dim_size(i), type, i, tensor));
     }
   } else if (op == "Size") {
     int64 size = 1;
     for (int i = 0; i < shp.dims(); ++i) {
       size *= shp.dim_size(i);
     }
-    *value = Tensor(type, TensorShape({}));
-    if (type == DT_INT32) {
-      if (size >= INT_MAX) {
-        return Status(error::INVALID_ARGUMENT, "Invalid dimension size");
-      }
-      value->flat<int32>()(0) = size;
-    } else {
-      value->flat<int64>()(0) = size;
-    }
+    *tensor = Tensor(type, TensorShape({}));
+    TF_RETURN_IF_ERROR(PutValueIntoTensor(size, type, 0, tensor));
   } else {
-    *value = Tensor(type, TensorShape({}));
-    if (type == DT_INT32) {
-      if (shp.dims() >= INT_MAX) {
-        return Status(error::INVALID_ARGUMENT, "Invalid dimension size");
-      }
-      value->flat<int32>()(0) = shp.dims();
-    } else {
-      value->flat<int64>()(0) = shp.dims();
-    }
+    CHECK_EQ(op, "Rank");
+    *tensor = Tensor(type, TensorShape({}));
+    TF_RETURN_IF_ERROR(PutValueIntoTensor(shp.dims(), type, 0, tensor));
   }
   return Status::OK();
 }
@@ -286,99 +290,160 @@ bool ConstantFolding::IsReallyConstant(const NodeDef& node) const {
   return feed_nodes_.find(node.name()) == feed_nodes_.end();
 }
 
+// Materialize the shapes using constants whenever possible.
 Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
-  // We may add some nodes to the graph to encode control dependencies: there is
-  // no need to process these, so only iterate over the nodes of the input
-  // graph.
+  // We may add some nodes to the graph to encode control dependencies and hold
+  // the materialized shapes: there is no need to process these added nodes, so
+  // only iterate over the nodes of the input graph.
   const int node_count = graph_->node_size();
-  for (int i = 0; i < node_count; ++i) {
-    NodeDef& node = *graph_->mutable_node(i);
-    const string op = node.op();
-    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
+  for (int node_idx = 0; node_idx < node_count; ++node_idx) {
+    NodeDef* node = graph_->mutable_node(node_idx);
+    const string op = node->op();
+    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN" &&
+        op != "TensorArraySizeV3") {
       continue;
     }
 
     const std::vector<OpInfo::TensorProperties>& output =
-        properties.GetOutputProperties(node.name());
+        properties.GetOutputProperties(node->name());
     const std::vector<OpInfo::TensorProperties>& input =
-        properties.GetInputProperties(node.name());
+        properties.GetInputProperties(node->name());
     if (input.empty() || output.empty()) {
       continue;
     }
+
     if (op == "Shape" || op == "Size" || op == "Rank") {
       CHECK_EQ(1, output.size());
       CHECK_EQ(1, input.size());
-    }
-    CHECK_EQ(input.size(), output.size());
 
-    for (int j = 0; j < output.size(); ++j) {
-      const DataType type = output[j].dtype();
+      const DataType type = output[0].dtype();
       CHECK(type == DT_INT32 || type == DT_INT64);
-      const TensorShapeProto shape = input[j].shape();
-      // Materialize the shapes using constants whenever possible.
-      PartialTensorShape shp(shape);
-      if (shp.IsFullyDefined() || (!shp.unknown_rank() && op == "Rank")) {
-        Tensor value(type);
-        auto status = ConvertShapeToConstant(op, type, shp, &value);
-        if (!status.ok()) {
+      const PartialTensorShape shape(input[0].shape());
+
+      if ((op != "Rank" && !shape.IsFullyDefined()) ||
+          (op == "Rank" && shape.unknown_rank())) {
+        continue;
+      }
+
+      Tensor constant_value(type);
+      if (!ConvertShapeToConstant(op, type, shape, &constant_value).ok()) {
+        continue;
+      }
+
+      // Repurpose the existing node to be the constant.
+      // Device placement is preserved.
+      node->set_op("Const");
+      node->clear_attr();
+      (*node->mutable_attr())["dtype"].set_type(type);
+      constant_value.AsProtoTensorContent(
+          (*node->mutable_attr())["value"].mutable_tensor());
+
+      // Turn the data input into a control dependency: this is needed to
+      // ensure that the constant value will only be run in the
+      // cases where the shape/rank/size would have been run in
+      // the original graph.
+      string ctrl_dep =
+          AddControlDependency(node->input(0), graph_, node_map_.get());
+      node->set_input(0, ctrl_dep);
+      node_map_->AddOutput(NodeName(ctrl_dep), node->name());
+
+      // Done with the Shape/Size/Rank node, move to the next node.
+      continue;
+    }
+
+    if (op == "TensorArraySizeV3") {
+      const NodeDef* array = node_map_->GetNode(node->input(0));
+      if (array->attr().count("dynamic_size") != 0 &&
+          array->attr().at("dynamic_size").b()) {
+        continue;
+      }
+      const NodeDef* array_size = node_map_->GetNode(array->input(0));
+      if (IsReallyConstant(*array_size)) {
+        // Don't materialize 0 sizes to avoid triggering incorrect static
+        // checks. A 0 sized array that can't grow isn't useful anyway.
+        const TensorProto& raw_val = array_size->attr().at("value").tensor();
+        if (raw_val.dtype() != DT_INT32) {
           continue;
         }
-        // We rewrite the existing node for the first const output and
-        // create new nodes for the remaining const outputs (Note that ShapeN
-        // could have multiple outputs).
-        if (op == "Shape" || op == "Size" || op == "Rank") {
-          // Replace the node with the corresponding constant.
-          node.set_op("Const");
-          node.clear_attr();
-          (*node.mutable_attr())["dtype"].set_type(type);
-          value.AsProtoTensorContent(
-              (*node.mutable_attr())["value"].mutable_tensor());
-
-          // Turn the data input into a control dependency: this is needed to
-          // ensure that the constant value will only be run in the
-          // cases where the shape/rank/size would have been run in
-          // the original graph. Additional inputs are extra control
-          string ctrl_dep =
-              AddControlDependency(node.input(0), graph_, node_map_.get());
-          node.set_input(0, ctrl_dep);
-          node_map_->AddOutput(NodeName(ctrl_dep), node.name());
-        } else {
-          auto outputs = node_map_->GetOutputs(node.name());
-          for (const auto& output : outputs) {
-            for (int k = 0; k < output->input_size(); ++k) {
-              int port;
-              string node_name = ParseNodeName(output->input(k), &port);
-              if (node_name == node.name() && port == j) {
-                // Create a const node as ShapeN's output if not already.
-                const string const_name =
-                    OptimizedNodeName(node, strings::StrCat("-matshapes-", j));
-                if (node_map_->GetNode(const_name) == nullptr) {
-                  NodeDef* added_node = graph_->add_node();
-                  added_node->set_name(const_name);
-                  added_node->set_op("Const");
-                  added_node->set_device(node.device());
-                  node_map_->AddNode(added_node->name(), added_node);
-                  (*added_node->mutable_attr())["dtype"].set_type(type);
-                  value.AsProtoTensorContent(
-                      (*added_node->mutable_attr())["value"].mutable_tensor());
-                  // We add a control dependency to the original ShapeN node,
-                  // so that the node will only be run if all inputs of the
-                  // original ShapeN node are run.
-                  string ctrl_dep = AddControlDependency(node.name(), graph_,
-                                                         node_map_.get());
-                  *added_node->add_input() = ctrl_dep;
-                  node_map_->AddOutput(NodeName(ctrl_dep), added_node->name());
-                }
-                node_map_->UpdateInput(output->name(),
-                                       NodeName(output->input(k)), const_name);
-                *output->mutable_input(k) = const_name;
-              }
+        Tensor value(raw_val.dtype(), raw_val.tensor_shape());
+        if (!value.FromProto(raw_val)) {
+          continue;
+        }
+        if (value.flat<int32>()(0) == 0) {
+          continue;
+        }
+        node->set_op("Const");
+        *node->mutable_attr() = array_size->attr();
+        node->set_input(0, AsControlDependency(NodeName(node->input(0))));
+        node->set_input(1, AddControlDependency(NodeName(node->input(1)),
+                                                graph_, node_map_.get()));
+      }
+      continue;
+    }
+
+    // Handle ShapeN materialization case.
+    // It's possible that not all input tensors have known shapes.
+    CHECK_EQ(op, "ShapeN");
+    CHECK_EQ(input.size(), output.size());
+    const NodeDef* const shape_n_node = node;
+    for (int port_idx = 0; port_idx < output.size(); ++port_idx) {
+      const DataType type = output[port_idx].dtype();
+      CHECK(type == DT_INT32 || type == DT_INT64);
+      const PartialTensorShape shape(input[port_idx].shape());
+      if (!shape.IsFullyDefined()) {
+        continue;
+      }
+      Tensor constant_value(type);
+      auto status = ConvertShapeToConstant(op, type, shape, &constant_value);
+      if (!status.ok()) {
+        continue;
+      }
+
+      // Find all nodes consuming this shape and connect them through the new
+      // constant node instead.
+      auto outputs = node_map_->GetOutputs(shape_n_node->name());
+      for (NodeDef* output : outputs) {
+        // Track whether there are any direct edges left between shape_n_node
+        // and this output node after the transformation.
+        bool direct_edges_exist = false;
+        for (int k = 0; k < output->input_size(); ++k) {
+          int port;
+          const string node_name = ParseNodeName(output->input(k), &port);
+          if (node_name == shape_n_node->name() && port == port_idx) {
+            // Create a const node as ShapeN's output if not already.
+            const string const_name = OptimizedNodeName(
+                *shape_n_node, strings::StrCat("-matshapes-", port_idx));
+            if (node_map_->GetNode(const_name) == nullptr) {
+              NodeDef* added_node = graph_->add_node();
+              added_node->set_name(const_name);
+              added_node->set_op("Const");
+              added_node->set_device(shape_n_node->device());
+              node_map_->AddNode(added_node->name(), added_node);
+              (*added_node->mutable_attr())["dtype"].set_type(type);
+              constant_value.AsProtoTensorContent(
+                  (*added_node->mutable_attr())["value"].mutable_tensor());
+              // We add a control dependency to the original ShapeN node,
+              // so that the node will only be run if all inputs of the
+              // original ShapeN node are run.
+              string ctrl_dep = AddControlDependency(shape_n_node->name(),
+                                                     graph_, node_map_.get());
+              *added_node->add_input() = ctrl_dep;
+              node_map_->AddOutput(NodeName(ctrl_dep), added_node->name());
             }
+            *output->mutable_input(k) = const_name;
+            node_map_->AddOutput(const_name, output->name());
           }
+          if (node_name == shape_n_node->name() && port != port_idx) {
+            direct_edges_exist = true;
+          }
+        }
+        if (!direct_edges_exist) {
+          node_map_->RemoveOutput(node->name(), output->name());
         }
       }
     }
   }
+
   return Status::OK();
 }
 
@@ -450,6 +515,11 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
     return Status::OK();
   }
 
+  // Don't optimize this again if it was already optimized and folded.
+  if (OptimizedNodeExists(node, "-folded-1") ||
+      OptimizedNodeExists(node, "-folded-2")) {
+    return Status::OK();
+  }
   int64 min_id = 0;
   BCast::Vec shape1;
   if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
@@ -529,7 +599,8 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
     out[j] = node_map_->GetNode(const_name);
     if (out[j] == nullptr) {
       out[j] = graph_->add_node();
-      *out[j] = CreateNodeDef(const_name, TensorValue(&value));
+      TF_RETURN_IF_ERROR(
+          CreateNodeDef(const_name, TensorValue(&value), out[j]));
       out[j]->set_device(node.device());
       node_map_->AddNode(const_name, out[j]);
       string ctrl_dep =
@@ -637,7 +708,8 @@ Status ConstantFolding::MaterializeReductionIndices(
       value.vec<int64>()(i) = i;
     }
   }
-  *reduction_indices = CreateNodeDef(const_name, TensorValue(&value));
+  TF_RETURN_IF_ERROR(
+      CreateNodeDef(const_name, TensorValue(&value), reduction_indices));
   reduction_indices->set_device(node->device());
   string ctrl_dep =
       AddControlDependency(node->input(1), graph_, node_map_.get());
@@ -677,7 +749,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
       nodes_whitelist_.find(node.name()) == nodes_whitelist_.end()) {
     return false;
   }
-  // Skip control flow nodes, they can't be folded
+  // Skip control flow nodes, they can't be folded.
   if (ModifiesFrameInfo(node)) {
     return false;
   }
@@ -686,12 +758,16 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     return false;
   }
 
-  // Skips ops that don't benefit from folding.
-  const string& op = node.op();
+  // Don't fold stateful ops such as TruncatedNormal.
+  if (!IsFreeOfSideEffect(node)) {
+    return false;
+  }
 
-  if (op.find("Placeholder") == 0) {
+  // Skips ops that don't benefit from folding.
+  if (IsPlaceholder(node)) {
     return false;
   }
+  const string& op = node.op();
   if (op.find("Save") != string::npos || op.find("Restore") != string::npos ||
       op.find("Reader") != string::npos) {
     return false;
@@ -699,20 +775,13 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (op.find("Quantized") != string::npos || op.find("Sparse") == 0) {
     return false;
   }
-  if (node.attr().count("_XlaCompile") > 0) {
-    return false;
-  }
 
-  // Don't fold stateful ops such as TruncatedNormal.
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
   if (!status.ok()) {
     return false;
   }
-  if (op_def->is_stateful()) {
-    return false;
-  }
-
+  // Don't fold ops without outputs.
   if (op_def->output_arg_size() == 0) {
     return false;
   }
@@ -732,7 +801,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   // the case of a merge node that propagate the first inputs that becomes
   // available, and therefore only requires a single constant input to be
   // foldable.
-  bool has_constant_input = false;
+  bool merge_has_constant_input = false;
   const bool is_merge = IsMerge(node);
   for (const auto& input : node.input()) {
     if (IsControlInput(input)) {
@@ -743,21 +812,20 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
       return false;
     }
     bool is_const = IsReallyConstant(*input_node);
-    if (!is_const && !is_merge) {
-      return false;
-    }
-    // Don't fold strings constants for now since this causes problems with
-    // checkpointing.
-    if (is_const && input_node->attr().at("dtype").type() == DT_STRING) {
+    if (is_const) {
+      // Don't fold strings constants for now since this causes problems with
+      // checkpointing.
+      if (input_node->attr().at("dtype").type() == DT_STRING) {
+        return false;
+      }
+      // Special case: If a Merge node has at least one constant input that
+      // does not depend on a control input, we can fold it.
+      merge_has_constant_input |= !HasControlInputs(*input_node);
+    } else if (!is_merge) {
       return false;
     }
-    has_constant_input |= is_const;
-  }
-  if (is_merge) {
-    return has_constant_input;
   }
-
-  return true;
+  return !is_merge || merge_has_constant_input;
 }
 
 namespace {
@@ -774,14 +842,23 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
   t->set_dtype(type);
   *t->mutable_tensor_shape() = shape;
   switch (type) {
-    SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
-    SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
-    SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
-    SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
-    SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
-    SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
-    SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
+    case DT_HALF:
+      t->add_half_val(static_cast<Eigen::half>(value).x);
+      break;
+    case DT_BFLOAT16:
+      t->add_half_val(static_cast<bfloat16>(value).value);
+      break;
+      SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
+      SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
+      SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
+      SET_TENSOR_VAL_CASE(DT_UINT64, int64, int64);
+      SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
     default:
       return errors::InvalidArgument("Unsupported type: ", type);
   }
@@ -792,59 +869,74 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
 }  // namespace
 
 // static
-NodeDef ConstantFolding::CreateNodeDef(const string& name,
-                                       const TensorValue& tensor) {
-  NodeDef node;
-  node.set_name(name);
-  node.set_op("Const");
+Status ConstantFolding::CreateNodeDef(const string& name,
+                                      const TensorValue& tensor,
+                                      NodeDef* node) {
+  node->set_name(name);
+  node->set_op("Const");
 
   AttrValue attr_type;
   attr_type.set_type(tensor->dtype());
-  node.mutable_attr()->insert({"dtype", attr_type});
+  node->mutable_attr()->insert({"dtype", attr_type});
 
   AttrValue attr_tensor;
   TensorProto* t = attr_tensor.mutable_tensor();
   bool optimized = false;
+  size_t encoded_size;
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                \
-  const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
-  TYPE last = *val_ptr;                                             \
-  int64 last_index = 0;                                             \
-  for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
-    TYPE cur = *val_ptr++;                                          \
-    if (cur != last) {                                              \
-      last = cur;                                                   \
-      last_index = i;                                               \
-    }                                                               \
-  }                                                                 \
-  if (last_index < kint32max) {                                     \
-    optimized = true;                                               \
-    t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
-    t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
-    val_ptr = tensor->flat<TYPE>().data();                          \
-    for (int64 i = 0; i <= last_index; ++i) {                       \
-      t->set_##NAME##_val(i, *val_ptr++);                           \
-    }                                                               \
-  }
-
-    if (tensor->dtype() == DT_FLOAT) {
-      POPULATE_TENSOR_PROTO(tensor, t, float, float)
-    } else if (tensor->dtype() == DT_DOUBLE) {
-      POPULATE_TENSOR_PROTO(tensor, t, double, double)
-    } else if (tensor->dtype() == DT_INT64) {
-      POPULATE_TENSOR_PROTO(tensor, t, int64, int64)
-    } else if (tensor->dtype() == DT_INT32) {
-      POPULATE_TENSOR_PROTO(tensor, t, int32, int)
-    } else if (tensor->dtype() == DT_INT16) {
-      POPULATE_TENSOR_PROTO(tensor, t, int16, int)
-    } else if (tensor->dtype() == DT_INT8) {
-      POPULATE_TENSOR_PROTO(tensor, t, int8, int)
-    } else if (tensor->dtype() == DT_UINT8) {
-      POPULATE_TENSOR_PROTO(tensor, t, uint8, int)
-    } else if (tensor->dtype() == DT_BOOL) {
-      POPULATE_TENSOR_PROTO(tensor, t, bool, bool)
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                  \
+  {                                                                   \
+    const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
+    TYPE last = *val_ptr;                                             \
+    int64 last_index = 0;                                             \
+    for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
+      TYPE cur = *val_ptr++;                                          \
+      if (cur != last) {                                              \
+        last = cur;                                                   \
+        last_index = i;                                               \
+      }                                                               \
+    }                                                                 \
+    if (last_index < kint32max) {                                     \
+      optimized = true;                                               \
+      encoded_size = (last_index + 1) * sizeof(NAME);                 \
+      t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
+      t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+      val_ptr = tensor->flat<TYPE>().data();                          \
+      for (int64 i = 0; i <= last_index; ++i) {                       \
+        t->set_##NAME##_val(i, *val_ptr++);                           \
+      }                                                               \
+    }                                                                 \
+  }                                                                   \
+  break
+
+    switch (tensor->dtype()) {
+      case DT_FLOAT:
+        POPULATE_TENSOR_PROTO(tensor, t, float, float);
+      case DT_DOUBLE:
+        POPULATE_TENSOR_PROTO(tensor, t, double, double);
+      case DT_INT64:
+        POPULATE_TENSOR_PROTO(tensor, t, int64, int64);
+      case DT_UINT64:
+        POPULATE_TENSOR_PROTO(tensor, t, uint64, int64);
+      case DT_INT32:
+        POPULATE_TENSOR_PROTO(tensor, t, int32, int);
+      case DT_UINT32:
+        POPULATE_TENSOR_PROTO(tensor, t, uint32, int);
+      case DT_INT16:
+        POPULATE_TENSOR_PROTO(tensor, t, int16, int);
+      case DT_UINT16:
+        POPULATE_TENSOR_PROTO(tensor, t, uint16, int);
+      case DT_INT8:
+        POPULATE_TENSOR_PROTO(tensor, t, int8, int);
+      case DT_UINT8:
+        POPULATE_TENSOR_PROTO(tensor, t, uint8, int);
+      case DT_BOOL:
+        POPULATE_TENSOR_PROTO(tensor, t, bool, bool);
+      default:
+        /* Do nothing. */
+        break;
     }
   }
   if (optimized) {
@@ -853,9 +945,15 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
     tensor->shape().AsProto(t->mutable_tensor_shape());
   } else {
     tensor->AsProtoTensorContent(t);
+    encoded_size = t->tensor_content().size();
   }
-  node.mutable_attr()->insert({"value", attr_tensor});
-  return node;
+  node->mutable_attr()->insert({"value", attr_tensor});
+
+  if (encoded_size < 10 * 1024 * 1024) {
+    return Status::OK();
+  }
+  return errors::InvalidArgument(
+      strings::StrCat("Can't fold ", name, ", its size would be too large"));
 }
 
 Status ConstantFolding::EvaluateNode(const NodeDef& node,
@@ -907,7 +1005,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
 
   for (const auto& input : node.input()) {
     int port = 0;
-    ParseNodeName(input, &port);
+    ParseNodeNameAsStringPiece(input, &port);
     if (port < 0) {
       // Control dependency
       break;
@@ -929,17 +1027,19 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     return Status(error::INVALID_ARGUMENT, "Expected at least one output.");
   }
 
+  outputs->resize(output_tensors.size());
   for (size_t i = 0; i < output_tensors.size(); i++) {
     string node_name = OptimizedNodeName(node, "-folded");
     if (output_tensors.size() > 1) {
       node_name = strings::StrCat(node_name, "-", i);
     }
     if (output_tensors[i].tensor) {
-      outputs->push_back(CreateNodeDef(node_name, output_tensors[i]));
+      TF_RETURN_IF_ERROR(
+          CreateNodeDef(node_name, output_tensors[i], &outputs->at(i)));
     } else {
       // Create an empty NodeDef to identify dead outputs (e.g. the output of a
       // switch that's not selected by the switch predicate).
-      outputs->push_back(NodeDef());
+      outputs->at(i) = NodeDef();
     }
   }
   return Status::OK();
@@ -1014,7 +1114,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
       node_map_->AddOutput(node->name(), const_index->name());
 
       auto outputs = node_map_->GetOutputs(node->name());
-      for (auto& output : outputs) {
+      for (NodeDef* output : outputs) {
         for (int i = 0; i < output->input_size(); i++) {
           int port;
           string node_name = ParseNodeName(output->input(i), &port);
@@ -1105,7 +1205,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
 
   if (const_nodes.size() > 1) {
     auto outputs = node_map_->GetOutputs(node->name());
-    for (const auto& output : outputs) {
+    for (NodeDef* output : outputs) {
       for (int i = 0; i < output->input_size(); i++) {
         int port;
         string node_name = ParseNodeName(output->input(i), &port);
@@ -1147,9 +1247,8 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   std::unordered_set<string> processed_nodes;
   std::deque<NodeDef*> queue;
   for (int i = 0; i < graph_->node_size(); i++) {
-    auto node = graph_->mutable_node(i);
-    if (IsFoldable(*node)) {
-      queue.push_back(node);
+    if (IsFoldable(graph_->node(i))) {
+      queue.push_back(graph_->mutable_node(i));
     }
   }
   while (!queue.empty()) {
@@ -1159,14 +1258,21 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
       continue;
     }
     // We need to record a copy of output nodes before FoldNode() modifies it.
-    std::set<NodeDef*> outputs = node_map_->GetOutputs(node->name());
+    // We also need to ensure that the fanout is sorted deterministically.
+    const std::set<NodeDef*>& outputs = node_map_->GetOutputs(node->name());
+    std::vector<NodeDef*> fanout(outputs.begin(), outputs.end());
+    std::sort(fanout.begin(), fanout.end(),
+              [](const NodeDef* n1, const NodeDef* n2) {
+                return n1->name() < n2->name();
+              });
 
     Status s = FoldNode(node, output);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
-      VLOG(1) << "Failed to fold node " << node->name() << ": " << s;
+      VLOG(1) << "Failed to fold node " << node->DebugString()
+              << "\nError message: " << s;
     } else {
-      for (auto& output : outputs) {
+      for (auto& output : fanout) {
         if (IsFoldable(*output)) {
           queue.push_back(output);
         }
@@ -1178,8 +1284,8 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   int last = output->node_size() - 1;
   for (int i = output->node_size() - 1; i >= 0; --i) {
     const NodeDef& node = output->node(i);
-    auto outputs = node_map_->GetOutputs(node.name());
-    if (outputs.empty()) {
+    auto fanout = node_map_->GetOutputs(node.name());
+    if (fanout.empty()) {
       output->mutable_node()->SwapElements(i, last);
       last--;
     }
@@ -1191,8 +1297,8 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
     // If no fetch nodes is provided, we conservatively
     // keep all nodes in the original graph in case users need to fetch
     // their values.
-    auto outputs = node_map_->GetOutputs(node.name());
-    if (!outputs.empty() || !has_fetch_ ||
+    auto fanout = node_map_->GetOutputs(node.name());
+    if (!fanout.empty() || !has_fetch_ ||
         nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
       auto added_node = output->add_node();
       *added_node = node;
@@ -1297,23 +1403,27 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   if (node.op() == "OnesLike") {
     return true;
   }
+  if (node.op() == "Fill") {
+    NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
+    return values != nullptr && IsOnes(*values);
+  }
   if (node.op() != "Const") {
     return false;
   }
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
-    // TODO(rmlarsen): Make DT_HALF case compile.
-    //    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_BFLOAT16);
     IS_ONES_CASE(DT_FLOAT);
     IS_ONES_CASE(DT_DOUBLE);
+    IS_ONES_CASE(DT_COMPLEX64);
+    IS_ONES_CASE(DT_COMPLEX128);
     IS_ONES_CASE(DT_UINT8);
     IS_ONES_CASE(DT_INT8);
     IS_ONES_CASE(DT_UINT16);
     IS_ONES_CASE(DT_INT16);
     IS_ONES_CASE(DT_INT32);
     IS_ONES_CASE(DT_INT64);
-    IS_ONES_CASE(DT_COMPLEX64);
-    IS_ONES_CASE(DT_COMPLEX128);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1328,23 +1438,27 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   if (node.op() == "ZerosLike") {
     return true;
   }
+  if (node.op() == "Fill") {
+    NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
+    return values != nullptr && IsZeros(*values);
+  }
   if (!IsConstant(node)) {
     return false;
   }
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
-    // TODO(rmlarsen): Make DT_HALF case compile.
-    //    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_BFLOAT16);
     IS_ZEROS_CASE(DT_FLOAT);
     IS_ZEROS_CASE(DT_DOUBLE);
+    IS_ZEROS_CASE(DT_COMPLEX64);
+    IS_ZEROS_CASE(DT_COMPLEX128);
     IS_ZEROS_CASE(DT_UINT8);
     IS_ZEROS_CASE(DT_INT8);
     IS_ZEROS_CASE(DT_UINT16);
     IS_ZEROS_CASE(DT_INT16);
     IS_ZEROS_CASE(DT_INT32);
     IS_ZEROS_CASE(DT_INT64);
-    IS_ZEROS_CASE(DT_COMPLEX64);
-    IS_ZEROS_CASE(DT_COMPLEX128);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1409,10 +1523,20 @@ void ConstantFolding::ReplaceDivisionOfOnesByReciprocal(NodeDef* node,
   graph_modified_ = true;
 }
 
+void ConstantFolding::ReplaceSubtractionFromZeroByNegation(NodeDef* node,
+                                                           GraphDef* graph) {
+  node->set_op("Neg");
+  node->mutable_input()->SwapElements(0, 1);
+  const string ctrl_dep =
+      AddControlDependency(node->input(1), graph, node_map_.get());
+  node_map_->UpdateInput(node->name(), node->input(1), ctrl_dep);
+  node->set_input(1, ctrl_dep);
+  graph_modified_ = true;
+}
+
 Status ConstantFolding::ReplaceOperationWithConstant(
-    double value, const TensorShapeProto& shape, NodeDef* node,
-    GraphDef* graph) {
-  AttrValue dtype_attr = node->attr().at("T");
+    double value, const AttrValue& dtype_attr, const TensorShapeProto& shape,
+    NodeDef* node, GraphDef* graph) {
   AttrValue tensor_attr;
   TF_RETURN_IF_ERROR(CreateConstantTensorAttrValue(dtype_attr.type(), value,
                                                    shape, &tensor_attr));
@@ -1434,12 +1558,417 @@ Status ConstantFolding::ReplaceOperationWithConstant(
   return Status::OK();
 }
 
-Status ConstantFolding::SimplifyGraph(GraphDef* output,
-                                      const GraphProperties& properties,
+Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
+                                      GraphProperties* properties,
                                       bool use_shape_info) {
   const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
-  for (int i = 0; i < output->node_size(); ++i) {
-    NodeDef* node = output->mutable_node(i);
+  for (int i = 0; i < optimized_graph->node_size(); ++i) {
+    NodeDef* node = optimized_graph->mutable_node(i);
+
+    if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(1, node, optimized_graph);
+      continue;
+    }
+
+    if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(0, node, optimized_graph);
+      continue;
+    }
+
+    // Remove Shuffle or Transpose op over dimensions of size 1.
+    if (use_shape_info && (IsShuffle(*node) || IsTranspose(*node)) &&
+        properties->GetInputProperties(node->name()).size() >= 2) {
+      const auto& shape =
+          properties->GetInputProperties(node->name())[0].shape();
+      if (shape.unknown_rank()) {
+        // Not optimizable.
+        continue;
+      }
+      const auto& p = properties->GetInputProperties(node->name())[1];
+      if (TensorShape::IsValid(p.shape()) && p.has_value()) {
+        Tensor perm(p.dtype(), p.shape());
+        if (!perm.FromProto(p.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         p.value().DebugString());
+        }
+        std::vector<int> permutation;
+        for (int j = 0; j < perm.NumElements(); ++j) {
+          if (perm.dtype() == DT_INT64) {
+            permutation.push_back(perm.vec<int64>()(j));
+          } else {
+            permutation.push_back(perm.vec<int>()(j));
+          }
+        }
+        if (permutation.size() != shape.dim_size()) {
+          // Number of elements in perm should be same as dim_size. Skip if not.
+          continue;
+        }
+        // The node is replaceable iff
+        // dim_size == 0 || all dims have size 1 ||
+        // all dims with > 1 size are not permuted.
+        bool replaceable = true;
+        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+          replaceable &= shape.dim(j).size() == 1 || j == permutation[j];
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
+    // Remove RandomShuffle op if it is scalar or first dimension is of size 1.
+    if (use_shape_info && IsRandomShuffle(*node) &&
+        !properties->GetInputProperties(node->name()).empty()) {
+      const auto& shape =
+          properties->GetInputProperties(node->name())[0].shape();
+      // The node is replaceable iff
+      // unknown_rank == false && (dim_size == 0 || first dim is of size 1)
+      if (!shape.unknown_rank() &&
+          (shape.dim_size() == 0 || shape.dim(0).size() == 1)) {
+        ReplaceOperationWithIdentity(0, node, optimized_graph);
+        continue;
+      }
+    }
+
+    // Remove Reverse op over dimensions with size 1.
+    if (use_shape_info && node->op() == "ReverseV2" &&
+        properties->GetInputProperties(node->name()).size() >= 2) {
+      const auto& shape =
+          properties->GetInputProperties(node->name())[0].shape();
+      if (shape.unknown_rank()) {
+        // Not optimizable.
+        continue;
+      }
+      const auto& a = properties->GetInputProperties(node->name())[1];
+      if (TensorShape::IsValid(a.shape()) && a.has_value()) {
+        Tensor axis(a.dtype(), a.shape());
+        if (!axis.FromProto(a.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         a.value().DebugString());
+        }
+        std::set<int> target_axes;
+        for (int j = 0; j < axis.NumElements(); ++j) {
+          // value of axis can be negative.
+          if (axis.dtype() == DT_INT64) {
+            target_axes.insert(
+                (axis.vec<int64>()(j) + shape.dim_size()) % shape.dim_size());
+          } else {
+            target_axes.insert(
+                (axis.vec<int>()(j) + shape.dim_size()) % shape.dim_size());
+          }
+        }
+
+        // The node is replaceable iff
+        // unknown_rank == false &&
+        // (dim_size == 0 || all dims have size 1 ||
+        //  all dims with > 1 size are not in target_axes)
+        bool replaceable = !shape.unknown_rank();
+        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+          replaceable &= shape.dim(j).size() == 1 ||
+                         target_axes.find(j) == target_axes.end();
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
+    if (use_shape_info && IsSlice(*node) &&
+        properties->GetInputProperties(node->name()).size() == 3) {
+      const auto& input = properties->GetInputProperties(node->name())[0];
+      const auto& b = properties->GetInputProperties(node->name())[1];
+      const auto& s = properties->GetInputProperties(node->name())[2];
+      if (TensorShape::IsValid(b.shape()) && b.has_value() &&
+          TensorShape::IsValid(s.shape()) && s.has_value()) {
+        Tensor begin(b.dtype(), b.shape());
+        if (!begin.FromProto(b.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         b.value().DebugString());
+        }
+        Tensor size(s.dtype(), s.shape());
+        if (!size.FromProto(s.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         s.value().DebugString());
+        }
+        // The node is replaceable iff unknown_rank == false &&
+        // begin == 0 && (size == -1 || size == input_shape) for all dimensions
+        bool replaceable = !input.shape().unknown_rank();
+        for (int j = 0; replaceable && j < input.shape().dim_size(); ++j) {
+          if (begin.dtype() == DT_INT32) {
+            replaceable &= begin.vec<int>()(j) == 0;
+          } else {
+            replaceable &= begin.vec<int64>()(j) == 0;
+          }
+          if (size.dtype() == DT_INT32) {
+            replaceable &= (size.vec<int>()(j) == -1 ||
+                            size.vec<int>()(j) == input.shape().dim(j).size());
+          } else {
+            replaceable &=
+                (size.vec<int64>()(j) == -1 ||
+                 size.vec<int64>()(j) == input.shape().dim(j).size());
+          }
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
+    if (use_shape_info && IsTile(*node) &&
+        properties->GetInputProperties(node->name()).size() == 2) {
+      const auto& m = properties->GetInputProperties(node->name())[1];
+      if (TensorShape::IsValid(m.shape()) && m.has_value()) {
+        Tensor multiplies(m.dtype(), m.shape());
+        if (!multiplies.FromProto(m.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         m.value().DebugString());
+        }
+        // The node is replaceable iff all values in multiplies are 1.
+        bool replaceable = true;
+        if (multiplies.dtype() == DT_INT32) {
+          for (int j = 0; replaceable && j < multiplies.vec<int>().size();
+               ++j) {
+            replaceable &= multiplies.vec<int>()(j) == 1;
+          }
+        } else {
+          for (int j = 0; replaceable && j < multiplies.vec<int64>().size();
+               ++j) {
+            replaceable &= multiplies.vec<int64>()(j) == 1;
+          }
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
+    if (use_shape_info && IsPad(*node) &&
+        properties->GetInputProperties(node->name()).size() >= 2) {
+      const auto& p = properties->GetInputProperties(node->name())[1];
+      if (TensorShape::IsValid(p.shape()) && p.has_value()) {
+        Tensor paddings(p.dtype(), p.shape());
+        if (!paddings.FromProto(p.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         p.value().DebugString());
+        }
+        // The node is replaceable iff all values in paddings are 0.
+        bool replaceable = true;
+        // The operation requires it to be int32 value so we don't check for
+        // 1nt64.
+        const auto flatten = paddings.flat<int32>();
+        for (int j = 0; replaceable && j < flatten.size(); ++j) {
+          replaceable &= flatten(j) == 0;
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
+    if (use_shape_info && IsSqueeze(*node) &&
+        !properties->GetInputProperties(node->name()).empty()) {
+      // https://www.tensorflow.org/api_docs/python/tf/squeeze mentions it's
+      // error to squeeze a dimension that is not 1, so we only need to check
+      // whether the input has > 1 size for each dimension.
+      const auto& shape =
+          properties->GetInputProperties(node->name())[0].shape();
+      // The node is replaceable iff
+      // unknown_rank == false && (dim_size == 0 || all dims have size > 1)
+      bool replaceable = !shape.unknown_rank();
+      for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+        replaceable &= shape.dim(j).size() > 1;
+      }
+      if (replaceable) {
+        ReplaceOperationWithIdentity(0, node, optimized_graph);
+        continue;
+      }
+    }
+
+    if (IsPack(*node) && NumNonControlInputs(*node) == 1 &&
+        !OptimizedNodeExists(*node, "_const_axis")) {
+      // Create constant axis node.
+      Tensor axis_t(DT_INT32, TensorShape({}));
+      NodeDef* axis_node = optimized_graph->add_node();
+      axis_node->set_name(OptimizedNodeName(*node, "_const_axis"));
+      const int axis = node->attr().at("axis").i();
+      if (!SetTensorValue(DT_INT32, axis, &axis_t).ok() ||
+          !CreateNodeDef(axis_node->name(), TensorValue(&axis_t), axis_node)
+               .ok()) {
+        continue;
+      }
+      // Add a control dependency to make sure axis_node is in the right frame.
+      const string ctrl_dep = ConstantFolding::AddControlDependency(
+          node->input(0), graph_, node_map_.get());
+      axis_node->add_input(ctrl_dep);
+      axis_node->set_device(node->device());
+      node->set_op("ExpandDims");
+      if (node->attr().count("axis") != 0) {
+        node->mutable_attr()->erase("axis");
+      }
+      if (node->attr().count("N") != 0) {
+        node->mutable_attr()->erase("N");
+      }
+      (*node->mutable_attr())["Tdim"].set_type(DT_INT32);
+      node->add_input(axis_node->name());
+      if (node->input_size() > 2) {
+        node->mutable_input()->SwapElements(1, node->input_size() - 1);
+      }
+      graph_modified_ = true;
+      continue;
+    }
+
+    // Move constants past Enter.
+    if (IsEnter(*node) && node->input_size() > 0) {
+      if (node->attr().count("is_constant") == 0 ||
+          !node->attr().at("is_constant").b()) {
+        continue;
+      }
+      const string& node_name = node->name();
+      const NodeDef* input = node_map_->GetNode(node->input(0));
+      if (input != nullptr && IsReallyConstant(*input) &&
+          !OptimizedNodeExists(*input, "_enter")) {
+        auto fanouts = node_map_->GetOutputs(node_name);
+        // Find non-constant nodes that consume the output of *node.
+        std::vector<NodeDef*> consumers;
+        for (NodeDef* fanout : fanouts) {
+          if (!IsConstant(*fanout)) {
+            for (int i = 0; i < fanout->input_size(); ++i) {
+              if (fanout->input(i) == node_name) {
+                consumers.push_back(fanout);
+                break;
+              }
+            }
+          }
+        }
+        if (!consumers.empty()) {
+          NodeDef* new_node = optimized_graph->add_node();
+          *new_node = *input;
+          new_node->set_name(OptimizedNodeName(*input, "_enter"));
+          new_node->set_device(node->device());
+          new_node->clear_input();
+          new_node->add_input(AsControlDependency(node_name));
+          node_map_->AddNode(new_node->name(), new_node);
+          node_map_->AddOutput(node_name, new_node->name());
+          for (NodeDef* consumer : consumers) {
+            for (int i = 0; i < consumer->input_size(); ++i) {
+              if (NodeName(consumer->input(i)) == node_name) {
+                node_map_->UpdateInput(consumer->name(), node_name,
+                                       new_node->name());
+                consumer->set_input(i, new_node->name());
+              }
+            }
+          }
+          graph_modified_ = true;
+          continue;
+        }
+      }
+    }
+
+    // Switch(x, x) will always feed false to its false branch and true to
+    // its true branch. By rewriting the graph a bit, we can propagate these
+    // constants down the two output branches, and just use control dependencies
+    // to trigger the selected one at runtime. For example,
+    //
+    //     +------+
+    // x-->|Switch|-->a  (in practice there may be multiple consumers of each
+    // x-->|      |-->b   output branch.)
+    //     +------+
+    //
+    // Is rewritten as
+    //
+    //     +------+
+    // x-->|Switch|-->Identity--^>Const(false)-->a
+    // x-->|      |-->Identity--^>Const(true)-->b
+    //     +------+
+    if (node->op() == "Switch" && node->input(0) == node->input(1) &&
+        !OptimizedNodeExists(*node, "_const_false") &&
+        !OptimizedNodeExists(*node, "_const_true")) {
+      bool already_optimized = true;
+      // If the optimization was already applied, the switch would have exactly
+      // one Identity node consuming each of its outputs, each without any
+      // non-control outputs.
+      auto fanouts = node_map_->GetOutputs(node->name());
+      if (fanouts.size() == 2) {
+        for (NodeDef* fanout : fanouts) {
+          if (!IsIdentity(*fanout) ||
+              NumNonControlOutputs(*fanout, *node_map_) > 0) {
+            already_optimized = false;
+            break;
+          }
+        }
+      }
+      Tensor false_t(DT_BOOL, TensorShape({}));
+      Tensor true_t(DT_BOOL, TensorShape({}));
+      // Make sure we don't proceed if this switch node was already optimized.
+      if (!already_optimized && SetTensorValue(DT_BOOL, true, &true_t).ok() &&
+          SetTensorValue(DT_BOOL, false, &false_t).ok()) {
+        // Copy the set of consumers of the switch as they will be manipulated
+        // below.
+        const std::set<NodeDef*>& consumer_set =
+            node_map_->GetOutputs(node->name());
+        std::vector<NodeDef*> consumers(consumer_set.begin(),
+                                        consumer_set.end());
+        std::sort(consumers.begin(), consumers.end(),
+                  [](const NodeDef* n1, const NodeDef* n2) {
+                    return n1->name() < n2->name();
+                  });
+        // Create constant false & true nodes.
+        NodeDef* false_node = optimized_graph->add_node();
+        false_node->set_name(OptimizedNodeName(*node, "_const_false"));
+        if (!CreateNodeDef(false_node->name(), TensorValue(&false_t),
+                           false_node)
+                 .ok()) {
+          continue;
+        }
+        false_node->set_device(node->device());
+
+        NodeDef* true_node = optimized_graph->add_node();
+        true_node->set_name(OptimizedNodeName(*node, "_const_true"));
+        if (!CreateNodeDef(true_node->name(), TensorValue(&true_t), true_node)
+                 .ok()) {
+          continue;
+        }
+        true_node->set_device(node->device());
+
+        // Add controls from the switch ports to the constants, and connect the
+        // constants to the original switch outputs.
+        const string false_port = node->name();
+        const string true_port = strings::StrCat(node->name(), ":1");
+        const string false_ctrl_dep =
+            AddControlDependency(false_port, optimized_graph, node_map_.get());
+        false_node->add_input(false_ctrl_dep);
+        const string true_ctrl_dep =
+            AddControlDependency(true_port, optimized_graph, node_map_.get());
+        true_node->add_input(true_ctrl_dep);
+
+        node_map_->AddNode(false_node->name(), false_node);
+        node_map_->AddNode(true_node->name(), true_node);
+        node_map_->AddOutput(NodeName(false_ctrl_dep), false_node->name());
+        node_map_->AddOutput(NodeName(true_ctrl_dep), true_node->name());
+
+        for (NodeDef* consumer : consumers) {
+          for (int i = 0; i < consumer->input_size(); ++i) {
+            const string& input = consumer->input(i);
+            if (input == false_port) {
+              consumer->set_input(i, false_node->name());
+              node_map_->UpdateInput(consumer->name(), false_port,
+                                     false_node->name());
+            } else if (input == true_port) {
+              consumer->set_input(i, true_node->name());
+              node_map_->UpdateInput(consumer->name(), true_port,
+                                     true_node->name());
+            }
+          }
+        }
+        graph_modified_ = true;
+        continue;
+      }
+    }
     if (IsSimplifiableReduction(*node)) {
       // Replace the reduction node with an identity node, that can be further
       // optimized by the model pruner.
@@ -1457,7 +1986,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       graph_modified_ = true;
       continue;
     }
-    if (use_shape_info && IsSimplifiableReshape(*node, properties)) {
+    if (use_shape_info && IsSimplifiableReshape(*node, *properties)) {
       DataType output_type = node->attr().at("T").type();
       node->set_op("Identity");
       node->clear_attr();
@@ -1475,8 +2004,8 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
     // Simplify arithmetic operations with ones or zeros.
     if (use_shape_info &&
         (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
-        properties.HasInputProperties(node->name()) &&
-        properties.HasOutputProperties(node->name())) {
+        properties->HasInputProperties(node->name()) &&
+        properties->HasOutputProperties(node->name())) {
       const NodeDef* x = node_map_->GetNode(node->input(0));
       const NodeDef* y = node_map_->GetNode(node->input(1));
       if (x == nullptr || y == nullptr) {
@@ -1484,20 +2013,25 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
                                        node->DebugString());
       }
       const TensorShapeProto& output_shape =
-          properties.GetOutputProperties(node->name())[0].shape();
+          properties->GetOutputProperties(node->name())[0].shape();
 
       // Simplify element-wise multiplication by ones or addition/subtraction
       // of zeros.
       const TensorShapeProto& y_shape =
-          properties.GetInputProperties(node->name())[1].shape();
+          properties->GetInputProperties(node->name())[1].shape();
       const bool x_is_zero = IsZeros(*x);
-      const bool x_is_one = IsOnes(*x);
+      const bool x_is_one = x_is_zero ? false : IsOnes(*x);
       const bool y_matches_output_shape = ShapesEqual(output_shape, y_shape);
       if (y_matches_output_shape &&
           ((is_mul && x_is_one) || (is_add && x_is_zero))) {
-        // TODO(rmlarsen): Handle subtraction 0 - y.
         // 1 * y = y or 0 + y = y.
-        ReplaceOperationWithSnapshot(1, node, output);
+        ReplaceOperationWithSnapshot(1, node, optimized_graph);
+        continue;
+      }
+
+      if (y_matches_output_shape && (is_sub && x_is_zero)) {
+        // Replace 0 - y with Neg(y).
+        ReplaceSubtractionFromZeroByNegation(node, optimized_graph);
         continue;
       }
 
@@ -1505,21 +2039,20 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       if (y_matches_output_shape && is_any_div && x_is_one) {
         DataType type = node->attr().at("T").type();
         if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
-          ReplaceDivisionOfOnesByReciprocal(node, output);
+          ReplaceDivisionOfOnesByReciprocal(node, optimized_graph);
           continue;
         }
       }
 
       const TensorShapeProto& x_shape =
-          properties.GetInputProperties(node->name())[0].shape();
+          properties->GetInputProperties(node->name())[0].shape();
       const bool y_is_zero = IsZeros(*y);
-      const bool y_is_one = IsOnes(*y);
+      const bool y_is_one = y_is_zero ? false : IsOnes(*y);
       const bool x_matches_output_shape = ShapesEqual(output_shape, x_shape);
-      if (x_matches_output_shape &&
-          (((is_mul || is_any_div) && y_is_one) ||
-           ((is_add || is_sub) && y_is_zero))) {
+      if (x_matches_output_shape && (((is_mul || is_any_div) && y_is_one) ||
+                                     ((is_add || is_sub) && y_is_zero))) {
         // x * 1 = x or x / 1 = x or x +/- 0 = x
-        ReplaceOperationWithSnapshot(0, node, output);
+        ReplaceOperationWithSnapshot(0, node, optimized_graph);
         continue;
       }
 
@@ -1532,18 +2065,24 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
           (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
         const PartialTensorShape shp(output_shape);
         if (shp.IsFullyDefined()) {
-          TF_RETURN_IF_ERROR(
-              ReplaceOperationWithConstant(0, output_shape, node, output));
+          AttrValue dtype_attr;
+          if (node->op() == "SparseMatMul") {
+            dtype_attr.set_type(DT_FLOAT);
+          } else {
+            dtype_attr = node->attr().at("T");
+          }
+          TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
+              0, dtype_attr, output_shape, node, optimized_graph));
           continue;
         }
         // Even if an input shape is only partially known, we may known that it
         // matches the output shape and thus forward the corresponding zero
         // input.
         if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
-          ReplaceOperationWithIdentity(0, node, output);
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
           continue;
         } else if (is_mul && y_is_zero && y_matches_output_shape) {
-          ReplaceOperationWithIdentity(1, node, output);
+          ReplaceOperationWithIdentity(1, node, optimized_graph);
           continue;
         }
       }
@@ -1568,9 +2107,8 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         continue;
       }
       // Insert new reciprocal op and change node from Div to Mul.
-      NodeDef* reciprocal_node = output->add_node();
-      reciprocal_node->set_name(AddPrefixToNodeName(
-          strings::StrCat(node->name(), "_recip"), kConstantFoldingConst));
+      NodeDef* reciprocal_node = optimized_graph->add_node();
+      reciprocal_node->set_name(OptimizedNodeName(*node, "_recip"));
       reciprocal_node->set_op("Reciprocal");
       reciprocal_node->set_device(node->device());
       node->set_op("Mul");
@@ -1582,6 +2120,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       node_map_->UpdateOutput(node->name(), const_input,
                               reciprocal_node->name());
       graph_modified_ = true;
+      continue;
     }
 
     // Consider the transformation
@@ -1628,9 +2167,9 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
           left_child_is_constant ? left_child : right_child;
       // Make sure that it is safe to change the value of the child node->
       if (op_child_node->input_size() < 2 ||
-          NumNonControlOutputs(*op_child_node, *node_map_) > 1 ||
           nodes_to_preserve_.find(op_child_node->name()) !=
-              nodes_to_preserve_.end()) {
+              nodes_to_preserve_.end() ||
+          NumNonControlOutputs(*op_child_node, *node_map_) > 1) {
         continue;
       }
 
@@ -1667,14 +2206,257 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       std::swap(*node->mutable_input(parent_const_input),
                 *op_child_node->mutable_input(non_const_leaf_input));
       graph_modified_ = true;
+      continue;
+    }
+
+    // Partial constant propagation through IdentityN.
+    if (IsIdentityN(*node) && NumNonControlInputs(*node) > 0) {
+      const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
+      const std::vector<NodeDef*> consumers(tmp.begin(), tmp.end());
+      bool updated_graph = false;
+      for (int input_idx = 0; input_idx < node->input_size(); ++input_idx) {
+        const string& input = node->input(input_idx);
+        if (IsControlInput(input)) {
+          break;
+        }
+        const NodeDef* input_node = node_map_->GetNode(NodeName(input));
+        if (input_node == nullptr) {
+          LOG(ERROR) << "Bad input: " << input;
+          break;
+        }
+        // Forward constant inputs to outputs and add a control dependency on
+        // the IdentityN node.
+        if (IsReallyConstant(*input_node)) {
+          // Update each consumer.
+          for (NodeDef* consumer : consumers) {
+            bool add_dep = false;
+            for (int consumer_input_idx = 0;
+                 consumer_input_idx < consumer->input_size();
+                 ++consumer_input_idx) {
+              const string& consumer_input =
+                  consumer->input(consumer_input_idx);
+              if (IsControlInput(consumer_input)) {
+                break;
+              }
+              int output_idx;
+              const string input_node_name =
+                  ParseNodeName(consumer_input, &output_idx);
+              if (input_node_name == node->name() && output_idx == input_idx) {
+                consumer->set_input(consumer_input_idx, input);
+                // We will keep the input from IdentityN through a control
+                // dependency, so we only need to add the consumer as an output
+                // for the constant input node.
+                node_map_->AddOutput(NodeName(input), consumer->name());
+                add_dep = true;
+              }
+            }
+            if (add_dep) {
+              consumer->add_input(AsControlDependency(node->name()));
+              updated_graph = true;
+            }
+          }
+        }
+      }
+
+      if (updated_graph) {
+        for (NodeDef* consumer : consumers) {
+          DedupControlInputs(consumer);
+        }
+        graph_modified_ = true;
+        continue;
+      }
+    }
+
+    // Partial constant folding for associative operators:
+    // Split AddN/AccumulateNV2 to enable partial
+    // folding of ops when more than one but not all inputs are constant.
+    // For AddN and AccumulateNV2, we may furthermore reorder inputs, since
+    // addition is commutative.
+    const int num_non_control_inputs = NumNonControlInputs(*node);
+    if (IsAggregate(*node) && IsCommutative(*node) &&
+        num_non_control_inputs > 2) {
+      const int num_control_inputs =
+          node->input_size() - num_non_control_inputs;
+      std::vector<int> const_inputs;
+      std::vector<int> nonconst_inputs;
+      for (int i = 0; i < node->input_size(); ++i) {
+        const string& input = node->input(i);
+        const NodeDef* input_node = node_map_->GetNode(NodeName(input));
+        CHECK(input_node != nullptr) << input;
+        if (!IsControlInput(input) && IsReallyConstant(*input_node)) {
+          const_inputs.push_back(i);
+        } else {
+          // Non-const and control inputs.
+          nonconst_inputs.push_back(i);
+        }
+      }
+      // Promote AccumulateNV2 with all constant inputs to AddN, since it is
+      // a fake node that cannot be constant folded by itself.
+      if (const_inputs.size() == num_non_control_inputs &&
+          node->op() == "AccumulateNV2") {
+        node->set_op("AddN");
+        node->mutable_attr()->erase("shape");
+        graph_modified_ = true;
+        continue;
+      }
+      const string new_node_name = OptimizedNodeName(
+          *node, strings::StrCat("_partial_split_", const_inputs.size()));
+      if (1 < const_inputs.size() &&
+          const_inputs.size() < num_non_control_inputs &&
+          !node_map_->NodeExists(new_node_name)) {
+        NodeDef* added_node = optimized_graph->add_node();
+        *added_node = *node;
+        // Always use AddN for the constant node, since AccumulateNV2 is a fake
+        // node that cannot be constant folded, since it does not have a kernel.
+        added_node->set_op("AddN");
+        added_node->mutable_attr()->erase("shape");
+        added_node->set_name(new_node_name);
+        node_map_->AddNode(added_node->name(), added_node);
+        added_node->clear_input();
+        for (int i : const_inputs) {
+          added_node->add_input(node->input(i));
+          node_map_->UpdateOutput(NodeName(node->input(i)), node->name(),
+                                  added_node->name());
+        }
+
+        // Overwrite the first const input with the added node.
+        node->set_input(const_inputs[0], added_node->name());
+        node_map_->AddOutput(added_node->name(), node->name());
+        nonconst_inputs.push_back(const_inputs[0]);
+        // Compact the remaining inputs to the original node.
+        std::sort(nonconst_inputs.begin(), nonconst_inputs.end());
+        int idx = 0;
+        for (int i : nonconst_inputs) {
+          if (idx != i) {
+            node->set_input(idx, node->input(i));
+          }
+          ++idx;
+        }
+        node->mutable_input()->DeleteSubrange(nonconst_inputs.size(),
+                                              const_inputs.size() - 1);
+        (*node->mutable_attr())["N"].set_i(node->input_size() -
+                                           num_control_inputs);
+        properties->ClearInputProperties(node->name());
+        (*added_node->mutable_attr())["N"].set_i(const_inputs.size());
+        graph_modified_ = true;
+        continue;
+      }
+    }
+
+    // Partial constant folding for Concat which is not commutative, so
+    // we have to preserve order and can only push consecutive runs of constant
+    // inputs into sub-nodes.
+    if (IsConcat(*node) && num_non_control_inputs > 3 &&
+        node->name().rfind("_partial_split_") == string::npos) {
+      int axis_arg = -1;
+      int begin = 0;
+      int end = num_non_control_inputs;
+      if (node->op() == "Concat") {
+        begin = 1;
+        axis_arg = 0;
+      } else if (node->op() == "ConcatV2") {
+        end = num_non_control_inputs - 1;
+        axis_arg = num_non_control_inputs - 1;
+      } else {
+        continue;
+      }
+
+      const NodeDef* axis_arg_node =
+          node_map_->GetNode(NodeName(node->input(axis_arg)));
+      if (axis_arg_node == nullptr || !IsReallyConstant(*axis_arg_node)) {
+        // We cannot constant fold Concat unless we the axis argument is
+        // constant. Skip node.
+        continue;
+      }
+
+      // We search for consecutive runs of constant inputs in the range
+      // [begin:end[ and push then down into child nodes.
+      std::vector<std::pair<int, int>> constant_input_runs;
+      int first = begin;
+      int last = begin;
+      while (last < end) {
+        while (first < end && !IsReallyConstant(*node_map_->GetNode(
+                                  NodeName(node->input(first))))) {
+          ++first;
+        }
+        // Invariant: node[first] is constant || first >= end.
+        last = first + 1;
+        while (last < end && IsReallyConstant(*node_map_->GetNode(
+                                 NodeName(node->input(last))))) {
+          ++last;
+        }
+        // Invariant: node[last] is not constant || last >= end
+        // Discard intervals shorter than 2 elements.
+        if (first < end && (last - first) > 1) {
+          constant_input_runs.emplace_back(first, last);
+        }
+        first = last;
+      }
+
+      // Skip if all inputs are constant, and let constant folding take over.
+      if (constant_input_runs.size() == 1 &&
+          constant_input_runs[0].first == begin &&
+          constant_input_runs[0].second == end) {
+        continue;
+      }
+      std::set<int> inputs_to_delete;
+      for (auto interval : constant_input_runs) {
+        // Push the constant inputs in the interval to a child node than can be
+        // constant folded.
+        const string new_node_name = OptimizedNodeName(
+            *node, strings::StrCat("_partial_split_", interval.first));
+        if (node_map_->NodeExists(new_node_name)) {
+          break;
+        }
+        NodeDef* added_node = optimized_graph->add_node();
+        *added_node = *node;
+        added_node->set_name(new_node_name);
+        node_map_->AddNode(added_node->name(), added_node);
+        added_node->clear_input();
+        for (int i = interval.first; i < interval.second; ++i) {
+          added_node->add_input(node->input(i));
+          node_map_->UpdateOutput(NodeName(node->input(i)), node->name(),
+                                  added_node->name());
+          if (i != interval.first) {
+            inputs_to_delete.insert(i);
+          }
+        }
+        added_node->add_input(node->input(axis_arg));
+        (*added_node->mutable_attr())["N"].set_i(interval.second -
+                                                 interval.first);
+        node_map_->AddOutput(NodeName(node->input(axis_arg)),
+                             added_node->name());
+
+        // Overwrite the first constant input with the result of the added
+        // child node.
+        node->set_input(interval.first, added_node->name());
+        node_map_->AddOutput(added_node->name(), node->name());
+      }
+      if (!constant_input_runs.empty()) {
+        graph_modified_ = true;
+        if (!inputs_to_delete.empty()) {
+          // Fix up the inputs to the original node.
+          std::vector<string> tmp(node->input().begin(), node->input().end());
+          node->clear_input();
+          for (int i = 0; i < tmp.size(); ++i) {
+            if (inputs_to_delete.find(i) == inputs_to_delete.end()) {
+              node->add_input(tmp[i]);
+            }
+          }
+          (*node->mutable_attr())["N"].set_i(node->input_size() - 1);
+          properties->ClearInputProperties(node->name());
+        }
+        continue;
+      }
     }
   }
+
   return Status::OK();
 }
 
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
-                                            GraphDef* output) {
+                                            GraphDef* optimized_graph) {
   node_map_.reset(new NodeMap(graph_));
   nodes_whitelist_.clear();
   // Fold fetch nodes iff it has a single fanout. Note that if a fetch node
@@ -1703,15 +2485,20 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
     TF_RETURN_IF_ERROR(MaterializeShapes(properties));
     TF_RETURN_IF_ERROR(MaterializeConstants(properties));
   }
+  TF_RETURN_IF_ERROR(FoldGraph(optimized_graph));
+  node_map_.reset(new NodeMap(optimized_graph));
+  TF_RETURN_IF_ERROR(
+      SimplifyGraph(optimized_graph, &properties, can_use_shape_info));
 
-  TF_RETURN_IF_ERROR(FoldGraph(output));
-  node_map_.reset(new NodeMap(output));
-  TF_RETURN_IF_ERROR(SimplifyGraph(output, properties, can_use_shape_info));
   return Status::OK();
 }
 
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* output) {
+                                 GraphDef* optimized_graph) {
+  // TensorFlow flushes denormals to zero and rounds to nearest, so we do
+  // the same here.
+  port::ScopedFlushDenormal flush;
+  port::ScopedSetRound round(FE_TONEAREST);
   nodes_to_preserve_ = item.NodesToPreserve();
   for (const auto& feed : item.feed) {
     feed_nodes_.insert(NodeName(feed.first));
@@ -1723,20 +2510,20 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   has_fetch_ = !item.fetch.empty();
-
   GrapplerItem item_to_optimize = item;
-  *output = item.graph;
+  *optimized_graph = item.graph;
   int64 node_count;
   do {
     graph_modified_ = false;
-    item_to_optimize.graph.Swap(output);
+    item_to_optimize.graph.Swap(optimized_graph);
     graph_ = &item_to_optimize.graph;
-    *output = GraphDef();
+    *optimized_graph = GraphDef();
     node_count = graph_->node_size();
-    TF_RETURN_IF_ERROR(RunOptimizationPass(cluster, item_to_optimize, output));
-  } while (graph_modified_ || output->node_size() != node_count);
-  *output->mutable_library() = item.graph.library();
-  *output->mutable_versions() = item.graph.versions();
+    TF_RETURN_IF_ERROR(
+        RunOptimizationPass(cluster, item_to_optimize, optimized_graph));
+  } while (graph_modified_ || optimized_graph->node_size() != node_count);
+  *optimized_graph->mutable_library() = item.graph.library();
+  *optimized_graph->mutable_versions() = item.graph.versions();
 
   return Status::OK();
 }
@@ -1746,5 +2533,5 @@ void ConstantFolding::Feedback(Cluster* cluster, const GrapplerItem& item,
   // Nothing to do for ConstantFolding.
 }
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index e4078514af11174788bc5a436125efeb3fa37177..eb06cd081f7f3eca31542614d93fa7dbda10d93c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
 
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -33,11 +33,12 @@ const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl";
 // Constant folding optimization for a graph.
 class ConstantFolding : public GraphOptimizer {
  public:
-  static NodeDef CreateNodeDef(const string& name, const TensorValue& tensor);
+  static Status CreateNodeDef(const string& name, const TensorValue& tensor,
+                              NodeDef* node);
   static string AddControlDependency(const string& input_name, GraphDef* graph,
                                      NodeMap* node_map);
 
-  ConstantFolding(DeviceBase* cpu_device);
+  explicit ConstantFolding(DeviceBase* cpu_device);
   ConstantFolding(RewriterConfig::Toggle opt_level, DeviceBase* cpu_device);
 
   ~ConstantFolding() override {}
@@ -81,7 +82,8 @@ class ConstantFolding : public GraphOptimizer {
                                     GraphDef* graph);
   void ReplaceOperationWithSnapshot(int input_to_forward, NodeDef* node,
                                     GraphDef* graph);
-  Status ReplaceOperationWithConstant(double value,
+  void ReplaceSubtractionFromZeroByNegation(NodeDef* node, GraphDef* graph);
+  Status ReplaceOperationWithConstant(double value, const AttrValue& dtype_attr,
                                       const TensorShapeProto& shape,
                                       NodeDef* node, GraphDef* graph);
   void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph);
@@ -90,7 +92,7 @@ class ConstantFolding : public GraphOptimizer {
   bool IsSimplifiableReduction(const NodeDef& node) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
-  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties,
+  Status SimplifyGraph(GraphDef* output, GraphProperties* properties,
                        bool use_shape_info);
 
   Status RunOptimizationPass(Cluster* cluster, const GrapplerItem& item,
@@ -114,4 +116,4 @@ class ConstantFolding : public GraphOptimizer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index d8df19fe6a0a5daafefd11e5ac39c8e3bc50e6e1..306ddd22d739d46385a00f590770ba5e63b795a4 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -22,13 +22,66 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ConstantFoldingTest : public GrapplerTest {};
+class ConstantFoldingTest : public GrapplerTest {
+ protected:
+  template <DataType DTYPE>
+  void SimpleNeutralElementTest() {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DTYPE,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Tensor zeros_t(DTYPE, TensorShape({2, 2}));
+    Tensor ones_t(DTYPE, TensorShape({2, 2}));
+    Tensor x_t(DTYPE, TensorShape({2, 2}));
+    for (int i = 0; i < 4; ++i) {
+      zeros_t.flat<T>()(i) = T(0);
+      ones_t.flat<T>()(i) = T(1);
+      x_t.flat<T>()(i) = T(i + 1);
+    }
+    Output zeros = ops::Const(s.WithOpName("zeros"), zeros_t);
+    Output ones = ops::Const(s.WithOpName("ones"), ones_t);
+    Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+    Output mul2 = ops::Mul(s.WithOpName("mul2"), x, ones);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"mul1", "mul2"};
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+    LOG(INFO) << output.DebugString();
+    EXPECT_EQ(5, output.node_size());
+    for (int i = 0; i < output.node_size(); ++i) {
+      const NodeDef& node = output.node(i);
+      const string& name = node.name();
+      if (name == "mul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "mul2") {
+        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      }
+    }
+    auto tensors_expected =
+        EvaluateNodes(item.graph, {"mul1", "mul2"}, {{"x", x_t}});
+    auto tensors = EvaluateNodes(output, {"mul1", "mul2"}, {{"x", x_t}});
+    EXPECT_EQ(2, tensors_expected.size());
+    EXPECT_EQ(2, tensors.size());
+    for (int i = 0; i < 2; ++i) {
+      test::ExpectTensorEqual<T>(tensors_expected[i], tensors[i]);
+    }
+  }
+};
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
   // Build a simple graph with a few trivially prunable ops.
@@ -43,9 +96,9 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   item.fetch.push_back("d");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(1, output.node_size());
@@ -89,9 +142,9 @@ TEST_F(ConstantFoldingTest, AddTree) {
   item.fetch = {"add_parent", "mul_parent", "addmul_parent"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // We expect the following rewrite(s) to occur:
@@ -152,7 +205,10 @@ TEST_F(ConstantFoldingTest, AddTree) {
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement) {
-  for (bool use_const : {true, false}) {
+  int kConst = 0;
+  int kLike = 1;
+  int kFill = 2;
+  for (int const_type : {kConst, kLike, kFill}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
     Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
                                 ops::Placeholder::Shape(TensorShape({2, 2})));
@@ -164,11 +220,19 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
                                 ops::Placeholder::Shape(TensorShape({2, 3})));
     Output bias = ops::Placeholder(s.WithOpName("bias"), DT_FLOAT,
                                    ops::Placeholder::Shape(TensorShape({2})));
-    Output zeros = !use_const ? ops::ZerosLike(s.WithOpName("zeros"), x)
-                              : ops::Const(s.WithOpName("zeros"), 0.0f, {2, 2});
     Output zeros_1d = ops::Const(s.WithOpName("zeros_1d"), 0.0f, {2});
-    Output ones = !use_const ? ops::OnesLike(s.WithOpName("ones"), x)
-                             : ops::Const(s.WithOpName("ones"), 1.0f, {2, 2});
+    Output zeros_const = ops::Const(s.WithOpName("zeros_const"), 0.0f, {2, 2});
+    Output zeros_like = ops::ZerosLike(s.WithOpName("zeros_like"), x);
+    Output zeros_fill = ops::Fill(s.WithOpName("zeros_fill"), {2, 2}, 0.0f);
+    Output zeros = const_type == kConst
+                       ? zeros_const
+                       : (const_type == kLike ? zeros_like : zeros_fill);
+    Output ones_const = ops::Const(s.WithOpName("ones_const"), 1.0f, {2, 2});
+    Output ones_like = ops::OnesLike(s.WithOpName("ones_like"), x);
+    Output ones_fill = ops::Fill(s.WithOpName("ones_fill"), {2, 2}, 1.0f);
+    Output ones = const_type == kConst
+                      ? ones_const
+                      : (const_type == kLike ? ones_like : ones_fill);
     Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
     Output mul2 = ops::Mul(s.WithOpName("mul2"), zeros, y);
     Output mul3 = ops::Mul(s.WithOpName("mul3"), x, ones);
@@ -187,19 +251,26 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
     Output bias_add2 = ops::BiasAdd(s.WithOpName("bias_add2"), zeros, bias);
     Output sub1 = ops::Sub(s.WithOpName("sub1"), x, zeros);
     Output sub2 = ops::Sub(s.WithOpName("sub2"), zeros, y);
-    Output addn =
-        ops::AddN(s.WithOpName("addn"),
-                  {mul1, mul2, mul3, mul4, mul5, mul6, div1, div2, matmul1,
-                   matmul2, add1, add2, bias_add1, bias_add2, sub1, sub2});
+    Output concat =
+        ops::Stack(s.WithOpName("stack"),
+                   {mul1, mul2, mul3, mul4, mul5, mul6, div1, div2, matmul1,
+                    matmul2, add1, add2, bias_add1, bias_add2, sub1, sub2});
     GrapplerItem item;
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
-    item.fetch = {"addn", "matmul3", "matmul4"};
+    item.fetch = {"stack", "matmul3", "matmul4"};
 
     ConstantFolding optimizer(nullptr /* cpu_device */);
     GraphDef output;
     Status status = optimizer.Optimize(nullptr, item, &output);
     TF_EXPECT_OK(status);
 
+    const string suffix =
+        (const_type == kConst ? "_const"
+                              : (const_type == kLike ? "_like" : "_fill"));
+    const string zeros_name = strings::StrCat("zeros", suffix);
+    const string ones_name = strings::StrCat("ones", suffix);
+    const string ctrl_zeros_name = strings::StrCat("^zeros", suffix);
+    const string ctrl_ones_name = strings::StrCat("^ones", suffix);
     EXPECT_EQ(27, output.node_size());
     for (int i = 0; i < output.node_size(); ++i) {
       const NodeDef& node = output.node(i);
@@ -207,19 +278,19 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
       if (name == "mul1") {
         EXPECT_EQ("Const", node.op());
         EXPECT_EQ("^x", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "mul2") {
         EXPECT_EQ("Const", node.op());
-        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ(ctrl_zeros_name, node.input(0));
         EXPECT_EQ("^y", node.input(1));
       } else if (name == "mul3") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("^ones", node.input(1));
+        EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "mul4") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("y", node.input(0));
-        EXPECT_EQ("^ones", node.input(1));
+        EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "mul5") {
         EXPECT_EQ("Const", node.op());
         EXPECT_EQ("^x", node.input(0));
@@ -231,23 +302,23 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
       } else if (name == "div1") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("^ones", node.input(1));
+        EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "div2") {
         EXPECT_EQ("Reciprocal", node.op());
         EXPECT_EQ("y", node.input(0));
-        EXPECT_EQ("^ones", node.input(1));
+        EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "matmul1") {
         EXPECT_EQ("Const", node.op());
         EXPECT_EQ("^x", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "matmul2") {
         EXPECT_EQ("Const", node.op());
-        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ(ctrl_zeros_name, node.input(0));
         EXPECT_EQ("^y", node.input(1));
       } else if (name == "matmul3") {
         EXPECT_EQ("Const", node.op());
         EXPECT_EQ("^a", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
         TensorProto t = node.attr().at("value").tensor();
         EXPECT_EQ(1, t.float_val_size());
         EXPECT_EQ(0, t.float_val(0));
@@ -256,7 +327,7 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(2, t.tensor_shape().dim(1).size());
       } else if (name == "matmul4") {
         EXPECT_EQ("Const", node.op());
-        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ(ctrl_zeros_name, node.input(0));
         EXPECT_EQ("^b", node.input(1));
         TensorProto t = node.attr().at("value").tensor();
         EXPECT_EQ(1, t.float_val_size());
@@ -267,11 +338,11 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
       } else if (name == "add1") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "add2") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("y", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "bias_add1") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("x", node.input(0));
@@ -279,17 +350,16 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
       } else if (name == "bias_add2") {
         // We don't eliminate this one, because it requires broadcasting.
         EXPECT_EQ("BiasAdd", node.op());
-        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ(zeros_name, node.input(0));
         EXPECT_EQ("bias", node.input(1));
       } else if (name == "sub1") {
         EXPECT_EQ("Snapshot", node.op());
         EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "sub2") {
-        // We don't handle this case yet.
-        EXPECT_EQ("Sub", node.op());
-        EXPECT_EQ("zeros", node.input(0));
-        EXPECT_EQ("y", node.input(1));
+        EXPECT_EQ("Neg", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ(ctrl_zeros_name, node.input(1));
       }
       const std::set<string> square_zero_const{"mul1", "mul2",    "mul5",
                                                "mul6", "matmul1", "matmul2"};
@@ -302,9 +372,31 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(2, t.tensor_shape().dim(1).size());
       }
     }
+    auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 2}));
+    auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 3}));
+    auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto bias_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+
+    auto tensors_expected = EvaluateNodes(
+        item.graph, item.fetch,
+        {{"x", x_t}, {"y", y_t}, {"a", a_t}, {"b", b_t}, {"bias", bias_t}});
+    EXPECT_EQ(item.fetch.size(), tensors_expected.size());
+    auto tensors = EvaluateNodes(
+        output, item.fetch,
+        {{"x", x_t}, {"y", y_t}, {"a", a_t}, {"b", b_t}, {"bias", bias_t}});
+    EXPECT_EQ(item.fetch.size(), tensors.size());
+    for (int i = 0; i < item.fetch.size(); ++i) {
+      test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
+    }
   }
 }
 
+TEST_F(ConstantFoldingTest, NeutralElement_ShortFloats) {
+  SimpleNeutralElementTest<DT_HALF>();
+  SimpleNeutralElementTest<DT_BFLOAT16>();
+}
+
 TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output cf_half = ops::Const(s.WithOpName("cf_half"), 0.5f, {1});
@@ -415,7 +507,6 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  LOG(INFO) << output.DebugString();
 
   EXPECT_EQ(15, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
@@ -429,6 +520,25 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
+
+  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
+  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_known", x_known_t},
+                     {"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_known", x_known_t},
+                                {"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -469,7 +579,6 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  LOG(INFO) << output.DebugString();
 
   EXPECT_EQ(10, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
@@ -482,6 +591,20 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
+  const std::vector<string> fetch = {"addn1"};
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -510,9 +633,9 @@ TEST_F(ConstantFoldingTest, CreateConstNodes) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(24, output.node_size());
@@ -559,9 +682,9 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   item.fetch.push_back("f");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(2, output.node_size());
@@ -600,9 +723,9 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "e"};
@@ -643,9 +766,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "c",
@@ -699,10 +822,11 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
   GrapplerItem item;
   item.fetch.push_back("i2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
-  ConstantFolding fold(nullptr /* cpu_device */);
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "i2"};
@@ -718,6 +842,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
       EXPECT_EQ("^p2", node.input(1));
     }
   }
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
@@ -774,9 +901,9 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
   }
 
   item.fetch = outputs;
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int constant_folded = 0;
@@ -812,9 +939,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
   item.fetch.push_back("p2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -835,6 +962,17 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
     }
   }
   EXPECT_EQ(1, found);
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>({5, 7});
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>({11, 13});
+
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, item.fetch.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, item.fetch.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
@@ -851,9 +989,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -886,6 +1024,18 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({11, 13}));
+  std::vector<string> fetch_nodes = {"p2"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
@@ -905,9 +1055,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
   int found = 0;
   for (const auto& node : output.node()) {
@@ -947,6 +1097,81 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
+                                           "i2c", "i3a", "i3b"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+}
+
+TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output v1 = ops::Variable(scope.WithOpName("v1"), {3, -1}, DT_FLOAT);
+  Output v2 = ops::Variable(scope.WithOpName("v2"), {4, 6}, DT_FLOAT);
+  auto s = ops::ShapeN(scope.WithOpName("s"), {v1, v2});
+  auto id_n = ops::IdentityN(scope.WithOpName("id_n"), {s[0], s[1]});
+  Output ia = ops::Identity(scope.WithOpName("ia"), id_n[0]);
+  Output ib = ops::Identity(scope.WithOpName("ib"), id_n[1]);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("ia");
+  item.fetch.push_back("ib");
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int found = 0;
+  for (const auto& node : output.node()) {
+    EXPECT_NE(AddPrefixToNodeName("s-matshapes-0", kConstantFoldingConst),
+              node.name());
+    if (node.name() == "s") {
+      ++found;
+      EXPECT_EQ("ShapeN", node.op());
+      EXPECT_EQ("v1", node.input(0));
+      EXPECT_EQ("v2", node.input(1));
+    }
+    if (node.name() == "id_n") {
+      ++found;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ("s", node.input(0));
+      EXPECT_EQ(AddPrefixToNodeName("s-matshapes-1", kConstantFoldingConst),
+                node.input(1));
+    }
+    if (node.name() == "ia") {
+      ++found;
+      EXPECT_EQ("id_n", node.input(0));
+    }
+    if (node.name() == "ib") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ("^s", node.input(0));
+      EXPECT_EQ("^id_n", node.input(1));
+    }
+  }
+  EXPECT_EQ(4, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"v1", v1_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch, {{"v1", v1_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
@@ -974,9 +1199,9 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
@@ -991,8 +1216,10 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   EXPECT_EQ(present_nodes.size(), output.node_size());
   int found = 0;
   for (const auto& node : output.node()) {
-    EXPECT_TRUE(present_nodes.find(node.name()) != present_nodes.end());
-    EXPECT_TRUE(not_present_nodes.find(node.name()) == not_present_nodes.end());
+    EXPECT_TRUE(present_nodes.find(node.name()) != present_nodes.end())
+        << node.name();
+    EXPECT_TRUE(not_present_nodes.find(node.name()) == not_present_nodes.end())
+        << node.name();
     present_nodes.erase(node.name());
     not_present_nodes.erase(node.name());
     if (node.name() == "rank") {
@@ -1020,6 +1247,30 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+
+  v_ctrl_t.flat<bool>()(0) = true;
+  std::vector<string> fetch_nodes = {"m", "m2"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+
+  v_ctrl_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, fetch_nodes,
+                                   {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  tensors = EvaluateNodes(output, fetch_nodes,
+                          {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SwitchNodes) {
@@ -1050,9 +1301,9 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
 
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
                                     "switch",   "i",
@@ -1084,6 +1335,28 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
     }
   }
   EXPECT_EQ(2, found);
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+  v_ctrl_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+
+  v_ctrl_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, item.fetch,
+                                   {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  tensors = EvaluateNodes(output, item.fetch,
+                          {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, MergeNodes) {
@@ -1106,6 +1379,10 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Merge m1(scope.WithOpName("m1"), {x, const1, const2});
   ops::Merge m2(scope.WithOpName("m2"), {const1, const3});
   ops::Merge m3(scope.WithOpName("m3"), {x, y});
+  // m4 is not foldable because the only constant input
+  // has a control input, so we cannot know if it will be
+  // triggered.
+  ops::Merge m4(scope.WithOpName("m4"), {x, const1});
 
   ops::Identity out1(scope.WithOpName("out1"), m1.output);
   ops::Identity idx1(scope.WithOpName("idx1"), m1.value_index);
@@ -1113,16 +1390,19 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Identity idx2(scope.WithOpName("idx2"), m2.value_index);
   ops::Identity out3(scope.WithOpName("out3"), m3.output);
   ops::Identity idx3(scope.WithOpName("idx3"), m3.value_index);
+  ops::Identity out4(scope.WithOpName("out4"), m4.output);
+  ops::Identity idx4(scope.WithOpName("idx4"), m4.value_index);
 
   GrapplerItem item;
-  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3"};
+  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3", "out4", "idx4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  EXPECT_EQ(19, output.node_size());
   int found_nodes = 0;
   for (const auto& node : output.node()) {
     if (node.name() == "out1") {
@@ -1159,10 +1439,18 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("m3:1", node.input(0));
       ++found_nodes;
+    } else if (node.name() == "out4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4:1", node.input(0));
+      ++found_nodes;
     }
   }
   // Make sure the graph contains all the nodes we're expecting.
-  EXPECT_EQ(6, found_nodes);
+  EXPECT_EQ(8, found_nodes);
 
   std::vector<string> fetch = {"out1", "idx1"};
   auto tensors = EvaluateNodes(output, fetch);
@@ -1177,175 +1465,741 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   EXPECT_EQ(2, out_idx.flat<int32>()(0));
 }
 
-TEST_F(ConstantFoldingTest, NoOpReduction) {
-  // Build a simple graph with a reduction that can be reduced to the identity.
+TEST_F(ConstantFoldingTest, SplitRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
-  Output v = ops::Variable(scope.WithOpName("v"), {3, 5, 7}, DT_FLOAT);
-  Output c =
-      ops::Const(scope.WithOpName("c").WithControlDependencies(v), 0, {0});
-  Output i = ops::Identity(scope.WithOpName("i"), c);
-  Output p = ops::Prod(scope.WithOpName("p"), v, i);
-  Output s = ops::Square(scope.WithOpName("s"), p);
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({4}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  ops::Split s1(scope.WithOpName("s1"), split_dim, in1, 1);
+  ops::Split s2(scope.WithOpName("s2"), split_dim, in2, 2);
+
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
 
   GrapplerItem item;
-  item.fetch.push_back("s");
+  item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
-  GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
-  bool found = false;
-  for (const auto& node : output.node()) {
-    if (node.name() == "p") {
-      found = true;
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("v", node.input(0));
-      EXPECT_EQ("^i", node.input(1));
-    }
-  }
-  EXPECT_TRUE(found);
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
+          &want);
+  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
-TEST_F(ConstantFoldingTest, NoOpReshape) {
-  // Build a simple graph with a reshape that can be reduced to the identity.
+TEST_F(ConstantFoldingTest, SplitVRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
-  // A reshape than can be optimized
-  Output d1 = ops::Const(scope.WithOpName("d1"), 3.14f, {17});
-  Output v1 = ops::Variable(scope.WithOpName("v1"), {17}, DT_FLOAT);
-  Output c1 =
-      ops::Const(scope.WithOpName("c1").WithControlDependencies(v1), 17, {1});
-  Output i1 = ops::Identity(scope.WithOpName("i1"), c1);
-  Output r1 =
-      ops::Reshape(scope.WithOpName("r1").WithControlDependencies(d1), v1, i1);
-  Output s1 = ops::Square(scope.WithOpName("s1"), r1);
-
-  // A multi dimensional reshape than can be optimized
-  Output v3 = ops::Variable(scope.WithOpName("v3"), {5, 5, 5}, DT_FLOAT);
-  Output c3 =
-      ops::Const(scope.WithOpName("c3").WithControlDependencies(v3), 5, {3});
-  Output i3 = ops::Identity(scope.WithOpName("i3"), c3);
-  Output r3 = ops::Reshape(scope.WithOpName("r3"), v3, i3);
-  Output s3 = ops::Square(scope.WithOpName("s3"), r3);
-
-  // A multi dimensional partially defined reshape than can be optimized
-  Output v4 = ops::Variable(scope.WithOpName("v4"), {5, 5, 5}, DT_FLOAT);
-  Output c4 = ops::Const(scope.WithOpName("c4").WithControlDependencies(v4),
-                         {5, -1, 5}, {3});
-  Output i4 = ops::Identity(scope.WithOpName("i4"), c4);
-  Output r4 = ops::Reshape(scope.WithOpName("r4"), v4, i4);
-  Output s4 = ops::Square(scope.WithOpName("s4"), r4);
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({5}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  auto size_splits1 = ops::Const(scope.WithOpName("size_splits1"), {2}, {1});
+  auto size_splits2 = ops::Const(scope.WithOpName("size_splits2"), {2, 3}, {2});
+  ops::SplitV s1(scope.WithOpName("s1"), in1, size_splits1, split_dim, 1);
+  ops::SplitV s2(scope.WithOpName("s2"), in2, size_splits2, split_dim, 2);
 
-  // A reshape that can't be optimized
-  Output v2 = ops::Variable(scope.WithOpName("v2"), {17, 1}, DT_FLOAT);
-  Output c2 =
-      ops::Const(scope.WithOpName("c2").WithControlDependencies(v2), 17, {1});
-  Output r2 = ops::Reshape(scope.WithOpName("r2"), v2, c2);
-  Output s2 = ops::Square(scope.WithOpName("s2"), r2);
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
 
   GrapplerItem item;
-  item.fetch = {"s1", "s2", "s3", "s4"};
+  item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
-  GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
-  int found = 0;
-  for (const auto& node : output.node()) {
-    if (node.name() == "r1") {
-      ++found;
-      EXPECT_EQ("Identity", node.op());
-      ASSERT_EQ(3, node.input_size());
-      EXPECT_EQ("v1", node.input(0));
-      EXPECT_EQ("^i1", node.input(1));
-      EXPECT_EQ("^d1", node.input(2));
-    } else if (node.name() == "r3") {
-      ++found;
-      EXPECT_EQ("Identity", node.op());
-      ASSERT_EQ(2, node.input_size());
-      EXPECT_EQ("v3", node.input(0));
-      EXPECT_EQ("^i3", node.input(1));
-    } else if (node.name() == "r4") {
-      ++found;
-      EXPECT_EQ("Identity", node.op());
-      ASSERT_EQ(2, node.input_size());
-      EXPECT_EQ("v4", node.input(0));
-      EXPECT_EQ("^i4", node.input(1));
-    } else if (node.name() == "r2") {
-      ++found;
-      EXPECT_EQ("Reshape", node.op());
-      ASSERT_EQ(2, node.input_size());
-      EXPECT_EQ("v2", node.input(0));
-      EXPECT_EQ("c2", node.input(1));
-    }
-  }
-  EXPECT_EQ(4, found);
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("size_splits1", "Const", {}, {}, &want);
+  AddNode("size_splits2", "Const", {}, {}, &want);
+  AddNode("s1", "Identity",
+          {"in1", AsControlDependency("size_splits1"),
+           AsControlDependency("split_dim")},
+          {}, &want);
+  AddNode("s2", "SplitV", {"in2", "size_splits2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
-TEST_F(ConstantFoldingTest, Packing) {
-  // Build a simple graph with a large constant that can be folded.
+TEST_F(ConstantFoldingTest, TransposeOnSize1DimsRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-  Output c = ops::Const(scope.WithOpName("c"), 3.14f, {1000});
-  Output i1 = ops::Identity(scope.WithOpName("i1"), c);
-  Output i2 = ops::Identity(scope.WithOpName("i2"), c);
+
+  Output in1 = ops::Variable(scope.WithOpName("in1"), TensorShape({1, 2, 4, 1}),
+                             DT_FLOAT);
+  Output p1 = ops::Const(scope.WithOpName("p1"), {3, 2, 1, 0}, {4});
+  Output in2 = ops::Variable(scope.WithOpName("in2"), TensorShape({1, 4, 2, 1}),
+                             DT_FLOAT);
+  Output p2 = ops::Const(scope.WithOpName("p2"), {3, 1, 2, 0}, {4});
+  ops::Transpose t1(scope.WithOpName("t1"), in1, p1);
+  ops::Transpose t2(scope.WithOpName("t2").WithControlDependencies({in1}), in2,
+                    p2);
+
+  ops::Add out1(scope.WithOpName("out1"), t1, t2);
 
   GrapplerItem item;
+  item.fetch = {"out1"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
-  GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
-  // Make sure that the representation of the folded constant is space
-  // efficient: in particular, the whole message should be smaller than 8k (the
-  // size needed to naively encode 1000 floats folded twice).
-  EXPECT_GT(8000, output.ByteSizeLong());
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("p1", "Const", {}, {}, &want);
+  AddNode("p2", "Const", {}, {}, &want);
+  AddNode("t1", "Transpose", {"in1", "p1"}, {}, &want);
+  AddNode("t2", "Identity",
+          {"in2", AsControlDependency("in1"), AsControlDependency("p2")}, {},
+          &want);
+  AddNode("out1", "Add", {"t1", "t2"}, {}, &want);
+
+  CompareGraphs(want, got);
 }
 
-TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output a =
-      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
-                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
-  Output b = ops::Square(s.WithOpName("b"), a);
-  Output c = ops::Mul(s.WithOpName("c"), a, b);
-  Output d = ops::Shape(s.WithOpName("d"), a);
-  Output e = ops::Shape(s.WithOpName("e"), b);
+TEST_F(ConstantFoldingTest, RandomShuffleOnScalarRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
-  auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
-  Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
-  Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({}), DT_FLOAT);
+  ops::RandomShuffle s1(scope.WithOpName("s1"), in1);
+  ops::RandomShuffle s2(scope.WithOpName("s2").WithControlDependencies({in1}),
+                        in2);
 
-  Output g = ops::Placeholder(s.WithOpName("g"), DT_FLOAT,
-                              ops::Placeholder::Shape(PartialTensorShape({1})));
-  Output h = ops::Shape(s.WithOpName("h"), g);
-  auto i = ops::internal::BroadcastGradientArgs(s.WithOpName("i"), d, h);
-  Output p1 = ops::Identity(s.WithOpName("p1"), i.r0);
-  Output p2 = ops::Identity(s.WithOpName("p2"), i.r1);
+  ops::Add out1(scope.WithOpName("out1"), s1, s2);
+  ops::Identity out2(scope.WithOpName("out2"), s2);
 
   GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"out1", "out2"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
-  GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1"}, {}, &want);
+  AddNode("s2", "Identity", {"in2", AsControlDependency("in1")}, {}, &want);
+  AddNode("out1", "Add", {"s1", "s2"}, {}, &want);
+  AddNode("out2", "Identity", {"s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(2, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+}
+
+TEST_F(ConstantFoldingTest, ReverseOnSize1DimsRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 = ops::Variable(scope.WithOpName("in1"), TensorShape({1, 2, 4, 1}),
+                             DT_FLOAT);
+  Output a1 = ops::Const(scope.WithOpName("a1"), {3, 2, 1, 0}, {4});
+  Output in2 = ops::Variable(scope.WithOpName("in2"), TensorShape({1, 2, 4, 1}),
+                             DT_FLOAT);
+  Output a2 = ops::Const(scope.WithOpName("a2"), {0, 3}, {2});
+  ops::Reverse r1(scope.WithOpName("r1"), in1, a1);
+  ops::Reverse r2(scope.WithOpName("r2").WithControlDependencies({in1}), in2,
+                  a2);
+
+  ops::Add out1(scope.WithOpName("out1"), r1, r2);
+
+  GrapplerItem item;
+  item.fetch = {"out1"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("a1", "Const", {}, {}, &want);
+  AddNode("a2", "Const", {}, {}, &want);
+  AddNode("r1", "ReverseV2", {"in1", "a1"}, {}, &want);
+  AddNode("r2", "Identity",
+          {"in2", AsControlDependency("in1"), AsControlDependency("a2")}, {},
+          &want);
+  AddNode("out1", "Add", {"r1", "r2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
+TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
+  {  // size = {3, 5}
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    auto in1 = ops::Variable(scope.WithOpName("in1"), {3, 5}, DT_FLOAT);
+    auto begin = ops::Const(scope.WithOpName("begin"), {0, 0}, {2});
+    auto size = ops::Const(scope.WithOpName("size"), {3, 5}, {2});
+    Output in2 = ops::Variable(scope.WithOpName("in2"), {4, 6}, DT_FLOAT);
+    ops::Slice s1(scope.WithOpName("s1"), in1, begin, size);
+    ops::Slice s2(scope.WithOpName("s2"), in2, begin, size);
+
+    ops::Add out(scope.WithOpName("out"), s1, s2);
+
+    GrapplerItem item;
+    item.fetch = {"out"};
+    TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef got;
+    Status status = optimizer.Optimize(nullptr, item, &got);
+    TF_EXPECT_OK(status);
+
+    GraphDef want;
+    AddNode("in1", "VariableV2", {}, {}, &want);
+    AddNode("in2", "VariableV2", {}, {}, &want);
+    AddNode("begin", "Const", {}, {}, &want);
+    AddNode("size", "Const", {}, {}, &want);
+    AddNode("s1", "Identity",
+            {"in1", AsControlDependency("begin"), AsControlDependency("size")},
+            {}, &want);
+    AddNode("s2", "Slice", {"in2", "begin", "size"}, {}, &want);
+    AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+    CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+  }
+  {  // size = {-1, -1}
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    auto in1 =
+        ops::Variable(scope.WithOpName("in1"), {3, 5}, DataType::DT_FLOAT);
+    auto begin1 = ops::Const(scope.WithOpName("begin1"), {0, 0}, {2});
+    auto begin2 = ops::Const(scope.WithOpName("begin2"), {1, 1}, {2});
+    auto size = ops::Const(scope.WithOpName("size"), {-1, -1}, {2});
+    Output in2 =
+        ops::Variable(scope.WithOpName("in2"), {4, 6}, DataType::DT_FLOAT);
+    ops::Slice s1(scope.WithOpName("s1"), in1, begin1, size);
+    ops::Slice s2(scope.WithOpName("s2"), in2, begin2, size);
+
+    ops::Add out(scope.WithOpName("out"), s1, s2);
+
+    GrapplerItem item;
+    item.fetch = {"out"};
+    TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef got;
+    Status status = optimizer.Optimize(nullptr, item, &got);
+    TF_EXPECT_OK(status);
+
+    GraphDef want;
+    AddNode("in1", "VariableV2", {}, {}, &want);
+    AddNode("in2", "VariableV2", {}, {}, &want);
+    AddNode("begin1", "Const", {}, {}, &want);
+    AddNode("begin2", "Const", {}, {}, &want);
+    AddNode("size", "Const", {}, {}, &want);
+    AddNode("s1", "Identity",
+            {"in1", AsControlDependency("begin1"), AsControlDependency("size")},
+            {}, &want);
+    AddNode("s2", "Slice", {"in2", "begin2", "size"}, {}, &want);
+    AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+    CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+  }
+}
+
+TEST_F(ConstantFoldingTest, TileWithMultipliesBeingOne) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  auto in1 = ops::Variable(scope.WithOpName("in1"), {4, 6}, DT_FLOAT);
+  auto in2 = ops::Variable(scope.WithOpName("in2"), {4, 3}, DT_FLOAT);
+  auto multiplies1 = ops::Const(scope.WithOpName("multiplies1"), {1, 1}, {2});
+  auto multiplies2 = ops::Const(scope.WithOpName("multiplies2"), {1, 2}, {2});
+
+  ops::Tile t1(scope.WithOpName("t1"), in1, multiplies1);
+  ops::Tile t2(scope.WithOpName("t2"), in2, multiplies2);
+
+  ops::Add out(scope.WithOpName("out"), t1, t2);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("multiplies1", "Const", {}, {}, &want);
+  AddNode("multiplies2", "Const", {}, {}, &want);
+  AddNode("t1", "Identity", {"in1", AsControlDependency("multiplies1")}, {},
+          &want);
+  AddNode("t2", "Tile", {"in2", "multiplies2"}, {}, &want);
+  AddNode("out", "Add", {"t1", "t2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
+TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  auto in1 = ops::Variable(scope.WithOpName("in1"), {4, 6}, DT_INT32);
+  auto in2 = ops::Variable(scope.WithOpName("in2"), {2, 2}, DT_INT32);
+  auto paddings1 =
+      ops::Const(scope.WithOpName("paddings1"), {0, 0, 0, 0}, {2, 2});
+  auto paddings2 =
+      ops::Const(scope.WithOpName("paddings2"), {1, 1, 2, 2}, {2, 2});
+  auto c1 = ops::Const(scope.WithOpName("c1"), 1);
+  auto c2 = ops::Const(scope.WithOpName("c2"), 1);
+
+  ops::PadV2 p1(scope.WithOpName("p1"), in1, paddings1, c1);
+  ops::PadV2 p2(scope.WithOpName("p2"), in2, paddings2, c2);
+
+  ops::Add out(scope.WithOpName("out"), p1, p2);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("paddings1", "Const", {}, {}, &want);
+  AddNode("paddings2", "Const", {}, {}, &want);
+  AddNode("c1", "Const", {}, {}, &want);
+  AddNode("c2", "Const", {}, {}, &want);
+  AddNode("p1", "Identity",
+          {"in1", AsControlDependency("paddings1"), AsControlDependency("c1")},
+          {}, &want);
+  AddNode("p2", "PadV2", {"in2", "paddings2", "c2"}, {}, &want);
+  AddNode("out", "Add", {"p1", "p2"}, {}, &want);
+
+  CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_INT32>(TensorShape({4, 6}));
+  auto in2_t = GenerateRandomTensor<DT_INT32>(TensorShape({2, 2}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  auto in1 = ops::Variable(scope.WithOpName("in1"), {2, 3}, DT_INT32);
+  auto in2 = ops::Variable(scope.WithOpName("in2"), {1, 2, 3, 1}, DT_INT32);
+
+  ops::Squeeze s1(scope.WithOpName("s1"), in1);
+  ops::Squeeze s2(scope.WithOpName("s2"), in2);
+
+  ops::Add out(scope.WithOpName("out"), s1, s2);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1"}, {}, &want);
+  AddNode("s2", "Squeeze", {"in2"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_INT32>(TensorShape({2, 3}));
+  auto in2_t = GenerateRandomTensor<DT_INT32>(TensorShape({1, 2, 3, 1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ConstantFoldingTest, NoOpReduction) {
+  // Build a simple graph with a reduction that can be reduced to the
+  // identity.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output v = ops::Variable(scope.WithOpName("v"), {3, 5, 7}, DT_FLOAT);
+  Output c =
+      ops::Const(scope.WithOpName("c").WithControlDependencies(v), 0, {0});
+  Output i = ops::Identity(scope.WithOpName("i"), c);
+  Output p = ops::Prod(scope.WithOpName("p"), v, i);
+  Output s = ops::Square(scope.WithOpName("s"), p);
+
+  GrapplerItem item;
+  item.fetch.push_back("s");
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  bool found = false;
+  for (const auto& node : output.node()) {
+    if (node.name() == "p") {
+      found = true;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("v", node.input(0));
+      EXPECT_EQ("^i", node.input(1));
+    }
+  }
+  EXPECT_TRUE(found);
+
+  auto v_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 7}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"v", v_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"v", v_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+}
+
+TEST_F(ConstantFoldingTest, NoOpReshape) {
+  // Build a simple graph with a reshape that can be reduced to the identity.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  // A reshape than can be optimized
+  Output d1 = ops::Const(scope.WithOpName("d1"), 3.14f, {17});
+  Output v1 = ops::Variable(scope.WithOpName("v1"), {17}, DT_FLOAT);
+  Output c1 =
+      ops::Const(scope.WithOpName("c1").WithControlDependencies(v1), 17, {1});
+  Output i1 = ops::Identity(scope.WithOpName("i1"), c1);
+  Output r1 =
+      ops::Reshape(scope.WithOpName("r1").WithControlDependencies(d1), v1, i1);
+  Output s1 = ops::Square(scope.WithOpName("s1"), r1);
+
+  // A multi dimensional reshape than can be optimized
+  Output v3 = ops::Variable(scope.WithOpName("v3"), {5, 5, 5}, DT_FLOAT);
+  Output c3 =
+      ops::Const(scope.WithOpName("c3").WithControlDependencies(v3), 5, {3});
+  Output i3 = ops::Identity(scope.WithOpName("i3"), c3);
+  Output r3 = ops::Reshape(scope.WithOpName("r3"), v3, i3);
+  Output s3 = ops::Square(scope.WithOpName("s3"), r3);
+
+  // A multi dimensional partially defined reshape than can be optimized
+  Output v4 = ops::Variable(scope.WithOpName("v4"), {5, 5, 5}, DT_FLOAT);
+  Output c4 = ops::Const(scope.WithOpName("c4").WithControlDependencies(v4),
+                         {5, -1, 5}, {3});
+  Output i4 = ops::Identity(scope.WithOpName("i4"), c4);
+  Output r4 = ops::Reshape(scope.WithOpName("r4"), v4, i4);
+  Output s4 = ops::Square(scope.WithOpName("s4"), r4);
+
+  // A reshape that can't be optimized
+  Output v2 = ops::Variable(scope.WithOpName("v2"), {17, 1}, DT_FLOAT);
+  Output c2 =
+      ops::Const(scope.WithOpName("c2").WithControlDependencies(v2), 17, {1});
+  Output r2 = ops::Reshape(scope.WithOpName("r2"), v2, c2);
+  Output s2 = ops::Square(scope.WithOpName("s2"), r2);
+
+  GrapplerItem item;
+  item.fetch = {"s1", "s2", "s3", "s4"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "r1") {
+      ++found;
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(3, node.input_size());
+      EXPECT_EQ("v1", node.input(0));
+      EXPECT_EQ("^i1", node.input(1));
+      EXPECT_EQ("^d1", node.input(2));
+    } else if (node.name() == "r3") {
+      ++found;
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("v3", node.input(0));
+      EXPECT_EQ("^i3", node.input(1));
+    } else if (node.name() == "r4") {
+      ++found;
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("v4", node.input(0));
+      EXPECT_EQ("^i4", node.input(1));
+    } else if (node.name() == "r2") {
+      ++found;
+      EXPECT_EQ("Reshape", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("v2", node.input(0));
+      EXPECT_EQ("c2", node.input(1));
+    }
+  }
+  EXPECT_EQ(4, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({17}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({17, 1}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 5, 5}));
+  auto v4_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 5, 5}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch,
+                    {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}, {"v4", v4_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch,
+                    {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}, {"v4", v4_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+}
+
+TEST_F(ConstantFoldingTest, Packing) {
+  // Build a simple graph with a large constant that can be folded.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(scope.WithOpName("c"), 3.14f, {1000});
+  Output i1 = ops::Identity(scope.WithOpName("i1"), c);
+  Output i2 = ops::Identity(scope.WithOpName("i2"), c);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  const std::vector<string> fetch_nodes = {"i1", "i2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+
+  // Make sure that the representation of the folded constant is space
+  // efficient: in particular, the whole message should be smaller than 8k
+  // (the size needed to naively encode 1000 floats folded twice).
+  EXPECT_GT(8000, output.ByteSizeLong());
+}
+
+TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output b = ops::Square(s.WithOpName("b"), a);
+  Output c = ops::Mul(s.WithOpName("c"), a, b);
+  Output d = ops::Shape(s.WithOpName("d"), a);
+  Output e = ops::Shape(s.WithOpName("e"), b);
+
+  auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
+  Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
+  Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
+
+  Output g = ops::Placeholder(s.WithOpName("g"), DT_FLOAT,
+                              ops::Placeholder::Shape(PartialTensorShape({1})));
+  Output h = ops::Shape(s.WithOpName("h"), g);
+  auto i = ops::internal::BroadcastGradientArgs(s.WithOpName("i"), d, h);
+  Output p1 = ops::Identity(s.WithOpName("p1"), i.r0);
+  Output p2 = ops::Identity(s.WithOpName("p2"), i.r1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
+  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
+  // Run a second time to make sure the optimization is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "o1") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-bcastargs-0", node.input(0));
+    } else if (node.name() == "o2") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-bcastargs-1", node.input(0));
+    } else if (node.name() == "ConstantFolding/f-bcastargs-0") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^f", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "ConstantFolding/f-bcastargs-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^f", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "p1") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("i", node.input(0));
+    } else if (node.name() == "p2") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("i:1", node.input(0));
+    }
+  }
+  EXPECT_EQ(6, found);
+
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+}
+
+TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({2, 2})));
+  Output b = ops::Square(s.WithOpName("b"), a);
+  Output c = ops::Mul(s.WithOpName("c"), a, b);
+  Output d = ops::Shape(s.WithOpName("d"), a);
+  Output e = ops::Shape(s.WithOpName("e"), b);
+
+  auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
+  Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
+  Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  std::vector<string> fetch_nodes = {"o1", "o2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = fold.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  EXPECT_EQ(11, output.node_size());
   int found = 0;
   for (const auto& node : output.node()) {
-    if (node.name() == "o1") {
+    if (node.name() == "ConstantFolding/f-folded-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^a", node.input(0));
+      EXPECT_EQ("^b", node.input(1));
+    } else if (node.name() == "d") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^a", node.input(0));
+    } else if (node.name() == "e") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^b", node.input(0));
+    } else if (node.name() == "o1") {
       ++found;
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("ConstantFolding/f-bcastargs-0", node.input(0));
@@ -1357,27 +2211,23 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
       ++found;
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^f", node.input(0));
+      EXPECT_EQ("^ConstantFolding/f-folded-1", node.input(0));
       EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
                        .num_elements());
     } else if (node.name() == "ConstantFolding/f-bcastargs-1") {
       ++found;
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^f", node.input(0));
+      EXPECT_EQ("^ConstantFolding/f-folded-1", node.input(0));
       EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
                        .num_elements());
-    } else if (node.name() == "p1") {
-      ++found;
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("i", node.input(0));
-    } else if (node.name() == "p2") {
-      ++found;
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("i:1", node.input(0));
     }
   }
-  EXPECT_EQ(6, found);
+  EXPECT_EQ(7, found);
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -1394,14 +2244,22 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch.push_back("reshape");
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  Tensor indices_t(DT_INT32, TensorShape({2}));
+  indices_t.flat<int>()(0) = 0;
+  indices_t.flat<int>()(1) = 1;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = fold.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1420,10 +2278,531 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+}
+
+TEST_F(ConstantFoldingTest, LargeConstant) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  // Generate a 4k by 4k constant matrix.
+  Output mat_diag =
+      ops::Const(scope.WithOpName("mat_diag"), 3.14f, TensorShape({1024 * 4}));
+  Output mat = ops::Diag(scope.WithOpName("mat"), mat_diag);
+  Output out = ops::Identity(scope.WithOpName("out"), mat);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("out");
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Make sure the diag node hasn't been folded, since it would use too much
+  // memory to encode the corresponding constant.
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "out") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("mat", node.input(0));
+      ++found;
+    } else if (node.name() == "mat") {
+      EXPECT_EQ("Diag", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("mat_diag", node.input(0));
+      ++found;
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  EXPECT_GT(1024 * 1024, output.ByteSizeLong());
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_BOOL,
+                              ops::Placeholder::Shape(TensorShape({})));
+  ops::Switch sw = ops::Switch(s.WithOpName("switch"), x, x);
+  Output id_false = ops::LogicalNot(s.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::LogicalNot(s.WithOpName("id_true"), sw.output_true);
+
+  GrapplerItem item;
+  item.fetch.push_back("id_false");
+  item.fetch.push_back("id_true");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(6, output.node_size());
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "switch" || node.name() == "x") {
+      ++found;
+    }
+    if (node.name() == "id_false") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFoldingCtrl/switch_0", node.input(0));
+      ++found;
+    }
+    if (node.name() == "id_true") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFoldingCtrl/switch_1", node.input(0));
+      ++found;
+    }
+    if (node.name() == "ConstantFoldingCtrl/switch_0") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("switch", node.input(0));
+      ++found;
+    }
+    if (node.name() == "ConstantFoldingCtrl/switch_1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      ++found;
+    }
+  }
+  EXPECT_EQ(6, found);
+
+  // Evaluate id_true when input tensor x is true.
+  Tensor x_t(DT_BOOL, TensorShape({}));
+  x_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(item.graph, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
+
+  // Evalute id_false when input tensor is false.
+  x_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  tensors = EvaluateNodes(output, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
+  std::function<Output(const Scope&, InputList)> addn_fun =
+      [](const Scope& scope, InputList inputs) {
+        return ops::AddN(scope, inputs);
+      };
+  std::function<Output(const Scope&, InputList)> accumulate_fun =
+      [](const Scope& scope, InputList inputs) {
+        return ops::AccumulateNV2(scope, inputs, TensorShape({2, 2}));
+      };
+  for (bool use_add_n : {true, false}) {
+    auto fun = use_add_n ? addn_fun : accumulate_fun;
+    const string op_name = use_add_n ? "AddN" : "AccumulateNV2";
+    Scope s = Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output z = ops::Placeholder(s.WithOpName("z"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output c1 = ops::Const(s.WithOpName("c1"), 1.0f, {2, 2});
+    Output c2 = ops::Const(s.WithOpName("c2"), 2.0f, {2, 2});
+    Output c3 = ops::Const(s.WithOpName("c3"), 3.0f, {2, 2});
+    Output acc0 = fun(s.WithOpName("acc0"), {c1, c2, c3});
+    Output acc1 = fun(s.WithOpName("acc1"), {x, y, z});
+    Output acc2 = fun(s.WithOpName("acc2"), {c1, x, y});
+    Output acc3 = fun(s.WithOpName("acc3"), {c1, c2, z});
+    Output acc4 = fun(s.WithOpName("acc4"), {c1, y, c2});
+    Output acc5 = fun(s.WithOpName("acc5"), {x, c1, c2});
+    Output acc6 = fun(s.WithOpName("acc6"), {x, c1, y, c2});
+    Output stack = ops::Stack(s.WithOpName("stack"),
+                              {acc0, acc1, acc2, acc3, acc4, acc5, acc6});
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"stack"};
+
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(16, output.node_size());
+    for (const NodeDef& node : output.node()) {
+      if (node.name() == "acc0") {
+        EXPECT_EQ("Const", node.op());
+      }
+      if (node.name() == "acc1") {
+        EXPECT_EQ(op_name, node.op());
+        EXPECT_EQ(3, node.input_size());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("y", node.input(1));
+        EXPECT_EQ("z", node.input(2));
+      }
+      if (node.name() == "acc2") {
+        EXPECT_EQ(op_name, node.op());
+        EXPECT_EQ(3, node.input_size());
+        EXPECT_EQ("c1", node.input(0));
+        EXPECT_EQ("x", node.input(1));
+        EXPECT_EQ("y", node.input(2));
+      }
+      if (node.name() == "acc3") {
+        EXPECT_EQ(op_name, node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("ConstantFolding/acc3_partial_split_2", node.input(0));
+        EXPECT_EQ("z", node.input(1));
+      }
+      if (node.name() == "acc4") {
+        EXPECT_EQ(op_name, node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("ConstantFolding/acc4_partial_split_2", node.input(0));
+        EXPECT_EQ("y", node.input(1));
+      }
+      if (node.name() == "acc5") {
+        EXPECT_EQ(op_name, node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("ConstantFolding/acc5_partial_split_2", node.input(1));
+      }
+      if (node.name() == "acc6") {
+        EXPECT_EQ(op_name, node.op());
+        EXPECT_EQ(3, node.input_size());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("ConstantFolding/acc6_partial_split_2", node.input(1));
+        EXPECT_EQ("y", node.input(2));
+      }
+      if (str_util::StartsWith(node.name(), "ConstantFolding/")) {
+        EXPECT_EQ("Const", node.op());
+      }
+    }
+
+    std::vector<string> fetch = {"acc0"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  }
+}
+
+TEST_F(ConstantFoldingTest, PartialFolding_Concat) {
+  Scope s = Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output z = ops::Placeholder(s.WithOpName("z"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output axis = ops::Const(s.WithOpName("axis"), 0, {});
+  Output c1 = ops::Const(s.WithOpName("c1"), 1.0f, {2, 2});
+  Output c2 = ops::Const(s.WithOpName("c2"), 2.0f, {2, 2});
+  Output concat0 = ops::Concat(s.WithOpName("concat0"), {c1, c2, c1}, axis);
+  Output concat1 = ops::Concat(s.WithOpName("concat1"), {x, y, z}, axis);
+  Output concat2 = ops::Concat(s.WithOpName("concat2"), {c1, x, y}, axis);
+  Output concat3 = ops::Concat(s.WithOpName("concat3"), {c1, c2, z}, axis);
+  Output concat4 = ops::Concat(s.WithOpName("concat4"), {c1, y, c2}, axis);
+  Output concat5 = ops::Concat(s.WithOpName("concat5"), {x, c1, c2}, axis);
+  Output concat6 = ops::Concat(s.WithOpName("concat6"), {x, c1, y, c2}, axis);
+  Output concat7 = ops::Concat(s.WithOpName("concat7"), {x, y, c1, c2}, axis);
+  Output concat8 = ops::Concat(s.WithOpName("concat8"), {x, c1, c2, y}, axis);
+  Output concat9 = ops::Concat(s.WithOpName("concat9"), {c1, c2, x, y}, axis);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"concat0", "concat1", "concat2", "concat3", "concat4",
+                "concat5", "concat6", "concat7", "concat8", "concat9"};
+
+  auto tensors_expected = EvaluateNodes(item.graph, {"concat0"});
+  EXPECT_EQ(1, tensors_expected.size());
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(21, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "concat0") {
+      EXPECT_EQ("Const", node.op());
+    } else if (node.name() == "concat3") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("ConstantFolding/concat3_partial_split_0", node.input(0));
+      EXPECT_EQ("z", node.input(1));
+      EXPECT_EQ("axis", node.input(2));
+    } else if (node.name() == "concat5") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("ConstantFolding/concat5_partial_split_1", node.input(1));
+      EXPECT_EQ("axis", node.input(2));
+    } else if (node.name() == "concat7") {
+      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("ConstantFolding/concat7_partial_split_2", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+    } else if (node.name() == "concat8") {
+      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("ConstantFolding/concat8_partial_split_1", node.input(1));
+      EXPECT_EQ("y", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+    } else if (node.name() == "concat9") {
+      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ("ConstantFolding/concat9_partial_split_0", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("y", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+    } else if (str_util::StartsWith(node.name(), "ConstantFolding/")) {
+      EXPECT_EQ("Const", node.op());
+    } else {
+      EXPECT_EQ(item.graph.node(i).DebugString(), node.DebugString());
+    }
+  }
+
+  auto tensors = EvaluateNodes(output, {"concat0"});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(scope.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({})));
+  Output c1 = ops::Const(scope.WithOpName("c1"), 1.0f, {2, 2});
+  Output c2 = ops::Const(scope.WithOpName("c2"), 2.0f, {2, 2});
+  auto id_n = ops::IdentityN(scope.WithOpName("id_n"), {c1, x, c2});
+  auto id0 = ops::Identity(scope.WithOpName("id0"), id_n[0]);
+  auto id1 = ops::Identity(scope.WithOpName("id1"), id_n[1]);
+  auto add0 = ops::Add(scope.WithOpName("add0"), id_n[0], id_n[1]);
+  auto add1 = ops::Add(scope.WithOpName("add1"), id_n[0], id_n[2]);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("id0");
+  item.fetch.push_back("id1");
+  item.fetch.push_back("add0");
+  item.fetch.push_back("add1");
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(8, output.node_size());
+  for (const auto& node : output.node()) {
+    // id_n should remain unchanged.
+    if (node.name() == "id_n") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("c1", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("c2", node.input(2));
+    }
+    // id0 should be constant folded, and a control dependency from id_n.
+    if (node.name() == "id0") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^id_n", node.input(0));
+    }
+    // id1 is unchanged.
+    if ("id1" == node.name()) {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("id_n:1", node.input(0));
+    }
+
+    if ("add0" == node.name()) {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("c1", node.input(0));
+      EXPECT_EQ("id_n:1", node.input(1));
+    }
+    // add1 should bo constant folded and have a control dependency from id_n.
+    if ("add1" == node.name()) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^id_n", node.input(0));
+    }
+  }
+
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+  }
+}
+
+TEST_F(ConstantFoldingTest, TrivialPack) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output x =
+      ops::RandomNormal(scope.WithOpName("x"), {2, 2}, DataType::DT_FLOAT);
+  Output y = ops::Const(scope.WithOpName("y"), {2.0f}, {});
+  auto stack =
+      ops::Stack(scope.WithOpName("stack").WithControlDependencies({y}), {x},
+                 ops::Stack::Axis(1));
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("stack");
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(5, output.node_size());
+  for (const auto& node : output.node()) {
+    if (node.name() == "stack") {
+      EXPECT_EQ("stack", node.name());
+      EXPECT_EQ("ExpandDims", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("ConstantFolding/stack_const_axis", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    } else if (node.name() == "ConstantFolding/stack_const_axis") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^x", node.input(0));
+    }
+  }
+
+  std::vector<string> fetch = {"stack"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  EXPECT_EQ(tensors_expected[0].shape(), tensors[0].shape());
+}
+
+// The test does not evalute the optimized and original graphs to check if their
+// outputs are the same. See b/78233179.
+TEST_F(ConstantFoldingTest, Enter) {
+  GrapplerItem item;
+  AttrValue frame_name;
+  frame_name.set_s("foo");
+  AttrValue is_constant_true;
+  is_constant_true.set_b(true);
+  AttrValue is_constant_false;
+  is_constant_false.set_b(false);
+  AttrValue type;
+  type.set_type(DT_FLOAT);
+  AttrValue value;
+  Tensor value_tensor(DT_FLOAT, TensorShape({}));
+  value_tensor.flat<float>()(0) = 1;
+  value_tensor.AsProtoTensorContent(value.mutable_tensor());
+
+  GraphDef& graph = item.graph;
+  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
+  AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
+  AddNode("enter1", "Enter", {"x"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
+          &graph);
+  AddNode("enter2", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
+          &graph);
+  AddNode("enter3", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_false}},
+          &graph);
+  AddNode("id1", "Identity", {"enter1"}, {{"T", type}}, &graph);
+  AddNode("id2", "Identity", {"enter2"}, {{"T", type}}, &graph);
+  AddNode("id3", "Identity", {"enter2"}, {{"T", type}}, &graph);
+  AddNode("id4", "Identity", {"enter3"}, {{"T", type}}, &graph);
+  item.fetch.push_back("id1");
+  item.fetch.push_back("id2");
+  item.fetch.push_back("id3");
+  item.fetch.push_back("id4");
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(9, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "id1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("enter1", node.input(0));
+    }
+    if (node.name() == "id2" || node.name() == "id3") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^enter2", node.input(0));
+    }
+    if (node.name() == "id4") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("enter3", node.input(0));
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, TensorArraySize) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output size = ops::Const(scope.WithOpName("size"), 5, TensorShape({}));
+  auto dynamic_array =
+      ops::TensorArray(scope.WithOpName("dynamic"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(true));
+  auto static_array =
+      ops::TensorArray(scope.WithOpName("static"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(false));
+  auto dynamic_sz = ops::TensorArraySize(
+      scope.WithOpName("dynamic_sz"), dynamic_array.handle, dynamic_array.flow);
+  auto static_sz = ops::TensorArraySize(scope.WithOpName("static_sz"),
+                                        static_array.handle, static_array.flow);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  auto tensors_expected =
+      EvaluateNodes(item.graph, {"dynamic_sz", "static_sz"});
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ("dynamic_sz", output.node(3).name());
+  EXPECT_EQ("TensorArraySizeV3", output.node(3).op());
+  EXPECT_EQ("static_sz", output.node(4).name());
+  EXPECT_EQ("Const", output.node(4).op());
+
+  auto tensors_actual = EvaluateNodes(output, {"dynamic_sz", "static_sz"});
+  EXPECT_EQ(2, tensors_expected.size());
+  EXPECT_EQ(2, tensors_actual.size());
+  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_actual[0]);
+  test::ExpectTensorEqual<int32>(tensors_expected[1], tensors_actual[1]);
 }
 
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-
-//  LocalWords:  NewRootScope
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab9af5acff413b851bbeb823d5b2971ce39a5d92
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A custom optimizer that can be registered.
+class CustomGraphOptimizer : public GraphOptimizer {
+ public:
+  virtual ~CustomGraphOptimizer() {}
+  virtual Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer*
+                          config = nullptr) = 0;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6eed43c2b132c02b58a0088c30dd5648fe80d212
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc
@@ -0,0 +1,61 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+
+#include <string>
+#include <unordered_map>
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+typedef std::unordered_map<string, CustomGraphOptimizerRegistry::Creator>
+    RegistrationMap;
+RegistrationMap* registered_optimizers = nullptr;
+RegistrationMap* GetRegistrationMap() {
+  if (registered_optimizers == nullptr)
+    registered_optimizers = new RegistrationMap;
+  return registered_optimizers;
+}
+}  // namespace
+
+std::unique_ptr<CustomGraphOptimizer>
+CustomGraphOptimizerRegistry::CreateByNameOrNull(const string& name) {
+  const auto it = GetRegistrationMap()->find(name);
+  if (it == GetRegistrationMap()->end()) return nullptr;
+  return std::unique_ptr<CustomGraphOptimizer>(it->second());
+}
+
+std::vector<string> CustomGraphOptimizerRegistry::GetRegisteredOptimizers() {
+  std::vector<string> optimizer_names;
+  optimizer_names.reserve(GetRegistrationMap()->size());
+  for (const auto& opt : *GetRegistrationMap())
+    optimizer_names.emplace_back(opt.first);
+  return optimizer_names;
+}
+
+void CustomGraphOptimizerRegistry::RegisterOptimizerOrDie(
+    const Creator& optimizer_creator, const string& name) {
+  const auto it = GetRegistrationMap()->find(name);
+  if (it != GetRegistrationMap()->end()) {
+    LOG(FATAL) << "CustomGraphOptimizer is registered twice: " << name;
+  }
+  GetRegistrationMap()->insert({name, optimizer_creator});
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..3148a5f809f0dffe333058ea08bc1a6118e31306
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_REGISTRY_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_REGISTRY_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class CustomGraphOptimizerRegistry {
+ public:
+  static std::unique_ptr<CustomGraphOptimizer> CreateByNameOrNull(
+      const string& name);
+
+  static std::vector<string> GetRegisteredOptimizers();
+
+  typedef std::function<CustomGraphOptimizer*()> Creator;
+  // Register graph optimizer which can be called during program initialization.
+  // This class is not thread-safe.
+  static void RegisterOptimizerOrDie(const Creator& optimizer_creator,
+                                     const string& name);
+};
+
+class CustomGraphOptimizerRegistrar {
+ public:
+  explicit CustomGraphOptimizerRegistrar(
+      const CustomGraphOptimizerRegistry::Creator& creator,
+      const string& name) {
+    CustomGraphOptimizerRegistry::RegisterOptimizerOrDie(creator, name);
+  }
+};
+
+#define REGISTER_GRAPH_OPTIMIZER_AS(MyCustomGraphOptimizerClass, name) \
+  namespace {                                                          \
+  static CustomGraphOptimizerRegistrar                                 \
+      MyCustomGraphOptimizerClass##_registrar(                         \
+          []() { return new MyCustomGraphOptimizerClass; }, (name));   \
+  }  // namespace
+
+#define REGISTER_GRAPH_OPTIMIZER(MyCustomGraphOptimizerClass) \
+  REGISTER_GRAPH_OPTIMIZER_AS(MyCustomGraphOptimizerClass,    \
+                              #MyCustomGraphOptimizerClass)
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CUSTOM_GRAPH_OPTIMIZER_REGISTRY_H_
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bdb1ae853210bc972c08d0b79d8cc552bfdf12a5
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+static const char* kTestOptimizerName = "Test";
+
+class TestGraphOptimizer : public CustomGraphOptimizer {
+ public:
+  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
+                  nullptr) override {
+    return Status::OK();
+  }
+  string name() const override { return kTestOptimizerName; }
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override {
+    return Status::OK();
+  }
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override {}
+};
+
+REGISTER_GRAPH_OPTIMIZER_AS(TestGraphOptimizer, "StaticRegister");
+
+TEST(CustomGraphOptimizerRegistryTest, DynamicRegistration) {
+  std::vector<string> optimizers =
+      CustomGraphOptimizerRegistry::GetRegisteredOptimizers();
+  std::unique_ptr<const CustomGraphOptimizer> test_optimizer;
+  ASSERT_EQ(
+      0, std::count(optimizers.begin(), optimizers.end(), "DynamicRegister"));
+  test_optimizer =
+      CustomGraphOptimizerRegistry::CreateByNameOrNull("DynamicRegister");
+  EXPECT_EQ(nullptr, test_optimizer);
+  CustomGraphOptimizerRegistry::RegisterOptimizerOrDie(
+      []() { return new TestGraphOptimizer; }, "DynamicRegister");
+  optimizers = CustomGraphOptimizerRegistry::GetRegisteredOptimizers();
+  ASSERT_EQ(
+      1, std::count(optimizers.begin(), optimizers.end(), "DynamicRegister"));
+  test_optimizer =
+      CustomGraphOptimizerRegistry::CreateByNameOrNull("DynamicRegister");
+  ASSERT_NE(nullptr, test_optimizer);
+  EXPECT_EQ(kTestOptimizerName, test_optimizer->name());
+}
+
+TEST(CustomGraphOptimizerRegistryTest, StaticRegistration) {
+  const std::vector<string> optimizers =
+      CustomGraphOptimizerRegistry::GetRegisteredOptimizers();
+  EXPECT_EQ(1,
+            std::count(optimizers.begin(), optimizers.end(), "StaticRegister"));
+  std::unique_ptr<const CustomGraphOptimizer> test_optimizer =
+      CustomGraphOptimizerRegistry::CreateByNameOrNull("StaticRegister");
+  ASSERT_NE(nullptr, test_optimizer);
+  EXPECT_EQ(kTestOptimizerName, test_optimizer->name());
+}
+
+TEST(GraphOptimizerRegistryTest, CrashesOnDuplicateRegistration) {
+  const auto creator = []() { return new TestGraphOptimizer; };
+  EXPECT_DEATH(CustomGraphOptimizerRegistry::RegisterOptimizerOrDie(
+                   creator, "StaticRegister"),
+               "twice");
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9701a038d0287db7745b9181b429bd81b1cdd854
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/debug_stripper.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* output) {
+  *output = item.graph;
+  for (NodeDef& node : *output->mutable_node()) {
+    if (IsAssert(node)) {
+      // Convert this node into a no-op.
+      node.set_op("NoOp");
+      node.clear_attr();
+      // Convert all its inputs into control dependency, which will then
+      // be optimized away by dependency optimizer.
+      for (string& inp : *node.mutable_input()) {
+        if (!IsControlInput(inp)) {
+          inp = AsControlDependency(inp);
+        }
+      }
+    } else if (IsCheckNumerics(node) || IsPrint(node)) {
+      // Replace with Identity op which will be pruned later.
+      node.set_op("Identity");
+      // Only preserve T attribute.
+      protobuf::Map<string, AttrValue> new_attr;
+      if (node.attr().find("T") != node.attr().end()) {
+        new_attr.insert({"T", node.attr().at("T")});
+      }
+      node.mutable_attr()->swap(new_attr);
+      // As Identity op only takes one input, mark redundant inputs as control
+      // input.
+      for (size_t i = 1; i < node.input_size(); ++i) {
+        if (!IsControlInput(node.input(i))) {
+          *node.mutable_input(i) = AsControlDependency(node.input(i));
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void DebugStripper::Feedback(Cluster* cluster, const GrapplerItem& item,
+                             const GraphDef& optimize_output, double result) {
+  // Takes no feedback.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.h b/tensorflow/core/grappler/optimizers/debug_stripper.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fe25aa1c38c1c33c235b78c1a3763631d4c74d4
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.h
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// DebugStripper strips off debug-related nodes (e.g.
+// Assert, CheckNumerics, Print) from the graph.
+class DebugStripper : public GraphOptimizer {
+ public:
+  DebugStripper() {}
+  ~DebugStripper() override {}
+
+  string name() const override { return "debug_stripper"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..96ceee791f8f15c3cba1d8a6a5ae5e1f1106597c
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -0,0 +1,205 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/debug_stripper.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class DebugStripperTest : public GrapplerTest {};
+
+TEST_F(DebugStripperTest, OutputEqualToInput) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output add = ops::Add(s, x, y);
+  Output result = ops::Identity(s, add);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(DebugStripperTest, StripAssertFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto assert = ops::Assert(s.WithOpName("Assert"), greaterequal, {x, y});
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({assert.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+      EXPECT_EQ("GreaterEqual", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    } else if (node.name() == "Assert") {
+      count++;
+      EXPECT_EQ("NoOp", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("^GreaterEqual", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Assert", node.input(2));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(DebugStripperTest, StripCheckNumericsFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto check1 = ops::CheckNumerics(s.WithOpName("CheckNumerics1"), x, "foo");
+  auto check2 = ops::CheckNumerics(s.WithOpName("CheckNumerics2"), y, "foo");
+  Output add = ops::Add(s.WithOpName("z"), check1, check2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "CheckNumerics1") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "CheckNumerics2") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("CheckNumerics1", node.input(0));
+      EXPECT_EQ("CheckNumerics2", node.input(1));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(DebugStripperTest, StripPrintFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output print = ops::Print(s.WithOpName("Print"), x, {x});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Print") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ(1, node.attr_size());
+    }
+  }
+
+  EXPECT_EQ(2, output.node_size());
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"Print"}, {{"x", x_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"Print"}, {{"x", x_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index edb0db65e987318e1e64bf0288b6ef18a7b9d662..7b7fd8115588a6dceb4a74d502e2883a84f57199 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -87,7 +88,7 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
     // Don't turn Identity nodes following Switch into NoOp or remove them
     // if it requires anchoring a control dependencies the Switch node, which
     // is not valid.
-    if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) {
+    if (str_util::StartsWith(node.name(), kConstantFoldingCtrl)) {
       // TODO(rmlarsen): Try to remove this artificial contraint.
       return false;
     }
@@ -108,23 +109,12 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
 }
 
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
-  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
-    return false;
-  }
-  if (!fetch_nodes_known_ || NumNonControlOutputs(node, *node_map_) > 0) {
-    // The output values of this node may be needed.
-    return false;
-  }
-  if (IsMerge(node) || IsSwitch(node)) {
-    return false;
-  }
-  if (ModifiesFrameInfo(node)) {
-    return false;
-  }
-  if (!IsFreeOfSideEffect(node)) {
+  if (!fetch_nodes_known_ ||
+      nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-  if (node.op() == "ControlTrigger") {
+  if (IsMerge(node) || IsSwitch(node) || ModifiesFrameInfo(node) ||
+      !IsFreeOfSideEffect(node)) {
     return false;
   }
   if (node.op().rfind("Submodel", 0) == 0) {
@@ -135,16 +125,21 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (!status.ok() || op_def->output_arg_size() == 0) {
     return false;
   }
-
+  const std::unordered_set<string> do_not_rewrite_ops{
+      "Assert",      "CheckNumerics",         "_Retval",
+      "_Arg",        "_ParallelConcatUpdate", "_TPUExecute",
+      "_TPUCompile", "ControlTrigger"};
+  if (do_not_rewrite_ops.find(node.op()) != do_not_rewrite_ops.end()) {
+    return false;
+  }
   if (!SafeToRemoveIdentity(node)) {
     return false;
   }
-
-  const std::unordered_set<string> do_not_rewrite_ops{
-      "Assert",     "CheckNumerics",         "_Retval",
-      "_Arg",       "_ParallelConcatUpdate", "_TPUExecute",
-      "_TPUCompile"};
-  return do_not_rewrite_ops.find(node.op()) == do_not_rewrite_ops.end();
+  if (NumNonControlOutputs(node, *node_map_) > 0) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  return true;
 }
 
 void DependencyOptimizer::OptimizeNode(int node_idx,
@@ -163,7 +158,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
         int pos;
-        string input_name = ParseNodeName(fanout->input(i), &pos);
+        StringPiece input_name =
+            ParseNodeNameAsStringPiece(fanout->input(i), &pos);
         if (input_name == node_name) {
           if (pos < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
@@ -274,19 +270,27 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
   //           +----------+             y --^> b
 
   if (is_noop || is_identity) {
+    if (is_identity && !SafeToRemoveIdentity(*node)) {
+      return;
+    }
+
     const auto& output_node_set = node_map_->GetOutputs(node_name);
     const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
                                              output_node_set.end());
     const int num_outputs = output_nodes.size();
     const int num_inputs = node->input_size();
 
+    // Don't increase the number of edges in the graph.
     if (num_inputs * num_outputs > num_inputs + num_outputs) {
       return;
     }
     std::vector<NodeDef*> input_nodes;
     for (int i = 0; i < num_inputs; ++i) {
       NodeDef* input_node = node_map_->GetNode(node->input(i));
-      CHECK_NE(input_node, nullptr);
+      if (input_node == nullptr) {
+        LOG(ERROR) << "Invalid input " << node->input(i);
+        return;
+      }
       input_nodes.push_back(input_node);
     }
 
@@ -298,31 +302,36 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
          output_nodes[0]->device() != node->device())) {
       return;
     }
-    if (num_inputs == 2 && num_outputs == 2) {
-      const string& noop_dev = node->device();
-      const string& in0_dev = input_nodes[0]->device();
-      const string& in1_dev = input_nodes[1]->device();
-      const string& out0_dev = output_nodes[0]->device();
-      const string& out1_dev = output_nodes[1]->device();
-      const int num_cross_before = static_cast<int>(in0_dev != noop_dev) +
-                                   static_cast<int>(in1_dev != noop_dev) +
-                                   static_cast<int>(out0_dev != noop_dev) +
-                                   static_cast<int>(out1_dev != noop_dev);
-      const int num_cross_after = static_cast<int>(in0_dev != out0_dev) +
-                                  static_cast<int>(in0_dev != out1_dev) +
-                                  static_cast<int>(in1_dev != out0_dev) +
-                                  static_cast<int>(in1_dev != out1_dev);
-      if (num_cross_after > num_cross_before) {
-        return;
-      }
-      // To avoid potentially removing Identity nodes following _Recv nodes,
-      // we require that no device crossings occur in that case.
-      // TODO(rmlarsen): See if we can relax this condition.
-      if (is_identity && (num_cross_after > 0 || num_cross_before > 0)) {
-        return;
+
+    // TODO(rmlarsen): Not all device crossings are equally expensive.
+    // Assign a cost to each based on device affinity and compute a
+    // cost before and after.
+    const string& node_dev = node->device();
+    int num_cross_in = 0;
+    for (NodeDef* input_node : input_nodes) {
+      num_cross_in += static_cast<int>(input_node->device() != node_dev);
+    }
+    int num_cross_out = 0;
+    for (NodeDef* output_node : output_nodes) {
+      num_cross_out += static_cast<int>(output_node->device() != node_dev);
+    }
+    if (is_identity && num_cross_in > 0 && num_cross_out > 0) {
+      // This identity node follows a device crossing, so it might be
+      // following a _Recv node after partioning. Do not remove such nodes,
+      // unless they only have consumers on the same device as themselves.
+      return;
+    }
+
+    // Make sure we do not increase the number of device crossings.
+    const int num_cross_before = num_cross_in + num_cross_out;
+    int num_cross_after = 0;
+    for (NodeDef* input_node : input_nodes) {
+      for (NodeDef* output_node : output_nodes) {
+        num_cross_after +=
+            static_cast<int>(input_node->device() != output_node->device());
       }
     }
-    if (is_identity && !SafeToRemoveIdentity(*node)) {
+    if (num_cross_after > num_cross_before) {
       return;
     }
 
@@ -343,16 +352,23 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
           CHECK(!IsControlInput(input_to_forward));
           for (int j = 0; j < consumer->input_size(); ++j) {
             const string& old_input = consumer->input(j);
-            if (old_input == node_name) {
-              new_input = input_to_forward;
-              node_map_->UpdateInput(consumer->name(), old_input, new_input);
-              consumer->set_input(j, new_input);
-              found_input = true;
-            } else if (old_input == AsControlDependency(NodeName(node_name))) {
-              new_input = AsControlDependency(NodeName(input_to_forward));
-              node_map_->UpdateInput(consumer->name(), old_input, new_input);
-              consumer->set_input(j, new_input);
-              found_input = true;
+            int old_input_pos;
+            StringPiece old_input_node_name =
+                ParseNodeNameAsStringPiece(old_input, &old_input_pos);
+            if (old_input_node_name == node_name) {
+              if (old_input_pos >= 0) {
+                // Regular input
+                new_input = input_to_forward;
+                node_map_->UpdateInput(consumer->name(), old_input, new_input);
+                consumer->set_input(j, new_input);
+                found_input = true;
+              } else {
+                // Control dependency
+                new_input = AsControlDependency(NodeName(input_to_forward));
+                node_map_->UpdateInput(consumer->name(), old_input, new_input);
+                consumer->set_input(j, new_input);
+                found_input = true;
+              }
             }
           }
           CHECK(found_input);
@@ -508,10 +524,6 @@ Status DependencyOptimizer::TransitiveReduction() {
       if (longest_distance[target] > 1) {
         const int input_slot = control_output.second;
         control_edges_to_remove[target].emplace(input_slot, source);
-        //        VLOG(1) << "Removing edge from:\n"
-        //                << optimized_graph_->node(source).DebugString() <<
-        //                "\n\nto:\n\n"
-        //                << optimized_graph_->node(target).DebugString();
       }
     }
   }
@@ -566,7 +578,9 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // Remove redundant control dependencies.
       TF_RETURN_IF_ERROR(TransitiveReduction());
     } else {
-      LOG(ERROR) << topo_sort_status.error_message();
+      LOG(ERROR) << "Iteration = " << iteration
+                 << ", topological sort failed with message: "
+                 << topo_sort_status.error_message();
     }
     // Turn nodes with only control outputs into NoOps, prune NoOp and Identity
     // nodes.
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 61ed15479370614bc79c15b450039f0cbf30908d..b4db98125aa740b5d261e8f9ad0ea5bfd8102877 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -29,9 +29,8 @@ namespace grappler {
 // optimizations, such as removing nodes that are effectively noops.
 class DependencyOptimizer : public GraphOptimizer {
  public:
-  DependencyOptimizer() : opt_level_(RewriterConfig::ON) {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  DependencyOptimizer() {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -63,7 +62,6 @@ class DependencyOptimizer : public GraphOptimizer {
   // Main driver of dependency optimizations.
   Status OptimizeDependencies();
 
-  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 33d6b992d21212fe325c642b87d3c3736185c445..6a297da52d075ea9bdae4584b7646ee44b950012 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -285,6 +285,38 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  // Identity with a single input- and two output dependencies.
+  auto id_a = ops::Identity(s.WithOpName("id_a").WithDevice("/CPU:1"), x);
+  // Identity with a two input- and a single output dependency.
+  auto id_b = ops::Identity(
+      s.WithOpName("id_b").WithControlDependencies(y).WithDevice("/CPU:0"), x);
+
+  Output id =
+      ops::Identity(s.WithControlDependencies(id_a).WithDevice("/CPU:1"), id_b);
+  Output id_1 = ops::Identity(s.WithDevice("/CPU:1"), id_a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+  item.fetch.push_back("Identity_1");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The optimization should be disabled to prevent increasing the number of
+  // nodes crossing device boundaries.
+  TF_CHECK_OK(TopologicalSort(&item.graph));
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
@@ -515,6 +547,181 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, IdentityInputs) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output b = ops::Placeholder(scope.WithOpName("b"), DT_BOOL);
+  Output x = ops::RandomUniform(scope.WithOpName("x"), {1, 2}, DT_FLOAT);
+  auto s = ops::Switch(scope.WithOpName("s"), x, b);
+
+  // Identity nodes to be removed.
+  auto id_f = ops::Identity(scope.WithOpName("id_f"), s.output_false);
+  auto id_t = ops::Identity(scope.WithOpName("id_t"), s.output_true);
+
+  // Output
+  Output out1 = ops::Identity(scope.WithOpName("out1"), id_f);
+  Output out2 = ops::Identity(scope.WithOpName("out2"), id_t);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"out1", "out2"};
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(6, output.node_size());
+  EXPECT_EQ("out1", output.node(4).name());
+  EXPECT_EQ(1, output.node(4).input_size());
+  EXPECT_EQ("s", output.node(4).input(0));
+
+  EXPECT_EQ("out2", output.node(5).name());
+  EXPECT_EQ(1, output.node(5).input_size());
+  EXPECT_EQ("s:1", output.node(5).input(0));
+}
+
+TEST_F(DependencyOptimizerTest, IdentityN) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output b = ops::Placeholder(scope.WithOpName("b"), DT_BOOL);
+  Output x = ops::RandomUniform(scope.WithOpName("x"), {1, 2}, DT_FLOAT);
+  auto s = ops::Switch(scope.WithOpName("s"), x, b);
+
+  // IdentityN nodes to be removed.
+  auto id_f = ops::IdentityN(scope.WithOpName("id_f"), {s.output_false});
+  auto id_t = ops::IdentityN(scope.WithOpName("id_t"), {s.output_true});
+
+  // IdentityN node that can't be removed.
+  auto id_b =
+      ops::IdentityN(scope.WithOpName("id_b"), {s.output_false, s.output_true});
+
+  // Outputs
+  Output out1 = ops::Identity(scope.WithOpName("out1"), id_f[0]);
+  Output out2 = ops::Identity(scope.WithOpName("out2"), id_t[0]);
+  Output out3 = ops::Identity(scope.WithOpName("out3"), id_b[0]);
+  Output out4 = ops::Identity(scope.WithOpName("out4"), id_b[1]);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"out1", "out2", "out3", "out4"};
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(9, output.node_size());
+  EXPECT_EQ("out1", output.node(5).name());
+  EXPECT_EQ(1, output.node(5).input_size());
+  EXPECT_EQ("s", output.node(5).input(0));
+
+  EXPECT_EQ("out2", output.node(6).name());
+  EXPECT_EQ(1, output.node(6).input_size());
+  EXPECT_EQ("s:1", output.node(6).input(0));
+
+  EXPECT_EQ("out3", output.node(7).name());
+  EXPECT_EQ(1, output.node(7).input_size());
+  EXPECT_EQ("id_b", output.node(7).input(0));
+
+  EXPECT_EQ("out4", output.node(8).name());
+  EXPECT_EQ(1, output.node(8).input_size());
+  EXPECT_EQ("id_b:1", output.node(8).input(0));
+}
+
+TEST_F(DependencyOptimizerTest,
+       Identity_DeviceCrossing_ConsumerOnDifferentDevice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x_on_1 =
+      ops::Const(s.WithOpName("x_on_1").WithDevice("/gpu:1"), {1.0f}, {});
+  Output one_on_3 =
+      ops::Const(s.WithOpName("one_on_3").WithDevice("/gpu:3"), {1.0f}, {});
+  Output x_on_2 =
+      ops::Identity(s.WithOpName("x_on_2").WithDevice("/gpu:2"), x_on_1);
+  Output result =
+      ops::Add(s.WithOpName("result").WithDevice("/gpu:3"), x_on_2, one_on_3);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"result"};
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
+TEST_F(DependencyOptimizerTest, Identity_DeviceCrossing_ConsumerOnSameDevice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x_on_1 =
+      ops::Const(s.WithOpName("x_on_1").WithDevice("/gpu:1"), {1.0f}, {});
+  Output one_on_2 =
+      ops::Const(s.WithOpName("one_on_2").WithDevice("/gpu:2"), {1.0f}, {});
+  Output x_on_2 =
+      ops::Identity(s.WithOpName("x_on_2").WithDevice("/gpu:2"), x_on_1);
+  Output result =
+      ops::Add(s.WithOpName("result").WithDevice("/gpu:2"), x_on_2, one_on_2);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"result"};
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+  EXPECT_EQ(3, output.node_size());
+  for (const auto& node : output.node()) {
+    EXPECT_NE("x_on_2", node.name());
+    if (node.name() == "result") {
+      EXPECT_EQ("x_on_1", node.input(0));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, RemoveGreaterEqualWithNoOp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto noop =
+      ops::NoOp(s.WithOpName("NoOp").WithControlDependencies(greaterequal));
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({noop.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  item.fetch.push_back("z");
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+    } else if (node.name() == "NoOp") {
+      count++;
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+  }
+  EXPECT_EQ(3, count);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a6de9e3b29e5dc5a2737d547f6babf930a816d0
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -0,0 +1,618 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include <unordered_map>
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+// Mark functions that were created as a result of function specialization.
+constexpr char kGrapplerSpecializedFuncAttr[] = "_GrapplerSpecializedFunc";
+
+constexpr char kNoInlineAttr[] = "_noinline";
+
+bool AttrIsTrue(const FunctionDef& func, const string& attr) {
+  return func.attr().count(attr) != 0 && func.attr().at(attr).b();
+}
+
+bool MarkedSpecialized(const FunctionDef& func) {
+  return AttrIsTrue(func, kGrapplerSpecializedFuncAttr);
+}
+
+bool MarkedNoInline(const FunctionDef& func) {
+  return AttrIsTrue(func, kNoInlineAttr);
+}
+
+// Find unique name for the specialized function. Collision can happen if
+// specialized function is instantiated for the nodes with the same name (e.g.
+// inside function body of two different functions).
+string UniqueSpecializedFunctionName(const FunctionDef& func,
+                                     const NodeDef& func_node,
+                                     const FunctionLibraryDefinition& flib) {
+  using str_util::StringReplace;
+  using strings::StrCat;
+
+  string specialized_name = StrCat(func.signature().name(), "_specialized_for_",
+                                   StringReplace(func_node.name(), "/", "_",
+                                                 /*replace_all*/ true));
+  string unique_name = specialized_name;
+
+  int idx = 0;
+  while (flib.Find(unique_name)) {
+    unique_name = strings::StrCat(specialized_name, "_", ++idx);
+  }
+  return unique_name;
+}
+
+class FunctionOptimizerContext {
+ public:
+  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
+                                    const GrapplerItem& item)
+      : function_library_(OpRegistry::Global(), item.graph.library()) {
+    InitializeInlinedFunctions(opt_level, item);
+  }
+
+  const FunctionLibraryDefinition& function_library() const {
+    return function_library_;
+  }
+
+  FunctionLibraryDefinition& mutable_function_library() {
+    return function_library_;
+  }
+
+  bool IsInlinedFunction(const string& name) const {
+    return inlined_functions_.count(name) > 0;
+  }
+
+  // Find inlining candidate by name. Return nullptr if not found.
+  const FunctionDef* FindInlinedFunction(const string& name) const {
+    return gtl::FindWithDefault(inlined_functions_, name, nullptr);
+  }
+
+ private:
+  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
+                                  const GrapplerItem& item) {
+    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
+
+    for (const FunctionDef& func : item.graph.library().function()) {
+      // Can't create IdentityN nodes with no input or output: skip these
+      // functions for now.
+      if (func.signature().input_arg_size() == 0 ||
+          func.signature().output_arg_size() == 0) {
+        continue;
+      }
+      bool marked_noinline = MarkedNoInline(func);
+      bool marked_specialized = MarkedSpecialized(func);
+
+      if (!marked_specialized && (!marked_noinline || aggressive)) {
+        inlined_functions_[func.signature().name()] = &func;
+      }
+    }
+  }
+
+  FunctionLibraryDefinition function_library_;
+  // Functions that can be inlined into optimized graph.
+  std::unordered_map<string, const FunctionDef*> inlined_functions_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
+};
+
+// Return trimmed FunctionDefLibrary with functions that are reachable from
+// the optimized graph.
+FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib,
+                                       const GraphDef& optimized_graph) {
+  // Functions that are reachable from the optimized graph.
+  std::unordered_set<string> keep_funcs;
+
+  std::vector<const FunctionDef*> func_queue;
+  func_queue.reserve(flib.num_functions());
+
+  // Add registered and not already processed functions to the queue by name.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && keep_funcs.find(func_name) == keep_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Find all the functions that are reachable from the given node.
+  const auto add_node_to_func_queue = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  const auto& graph_nodes = optimized_graph.node();
+  std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue);
+
+  // Process all reachable functions.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    keep_funcs.insert(func_name);
+
+    // Find all the functions called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  FunctionDefLibrary lib;
+  for (const string& func_name : keep_funcs) {
+    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
+    *lib.add_function() = *func;
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef* gd = lib.add_gradient();
+      gd->set_function_name(func_name);
+      gd->set_gradient_func(grad_func_name);
+    }
+  }
+
+  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
+          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
+
+  return lib;
+}
+
+Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
+                          FunctionOptimizerContext* ctx,
+                          GraphDef* optimized_graph) {
+  VLOG(2) << "Specialize function instantiation: "
+          << SummarizeNodeDef(func_node);
+
+  const std::unordered_map<string, AttrValue> func_attr(
+      func_node.attr().begin(), func_node.attr().end());
+
+  const auto& flib = ctx->function_library();
+
+  // Make a GrapplerFunctionItem and immediately convert it back to FunctionDef.
+  GrapplerFunctionItem item;
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  // TODO(ezhulenev): Push down const inputs and known input shapes.
+  FunctionDef specialized_func;
+  TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
+
+  // Find a name for specialized function.
+  const string specialized_func_name =
+      UniqueSpecializedFunctionName(func, func_node, flib);
+
+  specialized_func.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized_func.mutable_attr();
+  (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
+
+  // Add specialized function to the library.
+  TF_RETURN_IF_ERROR(
+      ctx->mutable_function_library().AddFunctionDef(specialized_func));
+
+  // Add a function call node for the specialized function.
+  NodeDef* specialized_func_node = optimized_graph->add_node();
+  *specialized_func_node = func_node;
+  specialized_func_node->set_op(specialized_func_name);
+
+  return Status::OK();
+}
+
+// Copy input/output argument type to the type_list. Return error if argument
+// type is not explicitly defined, and not specified in function attributes.
+Status CopyArgType(const NodeDef& func_node,
+                   const std::unordered_map<string, AttrValue>& func_attr,
+                   const string& arg_kind, const OpDef::ArgDef& arg,
+                   AttrValue::ListValue* type_list) {
+  if (arg.type() != DT_INVALID) {
+    type_list->add_type(arg.type());
+  } else {
+    auto it = func_attr.find(arg.type_attr());
+    if (it == func_attr.end() || it->second.type() == DT_INVALID) {
+      return errors::InvalidArgument(
+          "Invalid ", arg_kind, " argument ", arg.name(), " for function ",
+          func_node.op(), " instantiated by ", func_node.name());
+    }
+    type_list->add_type(it->second.type());
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function inputs to: this ensures that
+// they're all evaluated before the evaluation of the function body starts.
+Status HookInlinedFunctionInputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr, NodeDef* inputs) {
+  inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
+  inputs->set_op("IdentityN");
+  inputs->set_device(func_node.device());
+  *inputs->mutable_input() = func_node.input();
+  AttrValue::ListValue* type_list =
+      (*inputs->mutable_attr())["T"].mutable_list();
+  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "input", arg, type_list));
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function outputs to: this ensures that the
+// function body is fully evaluated before its fanout gets scheduled.
+Status HookInlinedFunctionOutputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const gtl::ArraySlice<string> fetch, NodeDef* outputs) {
+  outputs->set_name(func_node.name());
+  outputs->set_op("IdentityN");
+  outputs->set_device(func_node.device());
+  AttrValue::ListValue* type_list =
+      (*outputs->mutable_attr())["T"].mutable_list();
+  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
+    const OpDef::ArgDef& arg = func.signature().output_arg(i);
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "output", arg, type_list));
+    // Use the fetch names since they take into account the output mapping.
+    outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i]));
+  }
+  return Status::OK();
+}
+
+Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
+                      const FunctionOptimizerContext& ctx,
+                      GraphDef* optimized_graph) {
+  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
+
+  const std::unordered_map<string, AttrValue> func_attr(
+      func_node.attr().begin(), func_node.attr().end());
+
+  GrapplerFunctionItem item;
+  Status item_status =
+      MakeGrapplerFunctionItem(func, func_attr, ctx.function_library(), &item);
+
+  if (!item_status.ok()) {
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name(),
+                                   ". Error: ", item_status.error_message());
+  }
+
+  std::unordered_map<string, int> input_nodes;
+  for (int i = 0; i < func.signature().input_arg_size(); ++i) {
+    const OpDef::ArgDef& arg = func.signature().input_arg(i);
+    input_nodes[arg.name()] = i;
+  }
+
+  // Hook inlined function inputs to IdentityN node
+  NodeDef* func_inputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(
+      HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
+
+  for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
+    if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
+      CHECK_EQ(0, func_body_node.input_size());
+      // Turn input placeholders into identity nodes
+      if (IsPlaceholder(func_body_node)) {
+        func_body_node.set_op("Identity");
+      }
+      int input_id = input_nodes[func_body_node.name()];
+      func_body_node.add_input(
+          strings::StrCat(func_inputs->name(), ":", input_id));
+    } else {
+      // Update the input names if any.
+      for (string& input : *func_body_node.mutable_input()) {
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+      }
+      // If the node has no input, make hook it up to the func_inputs node to
+      // ensure it runs in the same frame as the other nodes of the function
+      // body.
+      if (func_body_node.input_size() == 0) {
+        *func_body_node.add_input() = AsControlDependency(func_inputs->name());
+      }
+    }
+
+    // Add the node name as a prefix to avoid collisions after inlining
+    func_body_node.set_name(
+        strings::StrCat(func_node.name(), "/", func_body_node.name()));
+
+    // Make sure the node is placed
+    func_body_node.set_device(func_node.device());
+
+    // Check if a body node is itself a function
+    const FunctionDef* func_body_node_func =
+        ctx.FindInlinedFunction(func_body_node.op());
+    if (func_body_node_func != nullptr) {
+      // Recursively inline function calls
+      TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
+                                        ctx, optimized_graph));
+    } else {
+      // Annotate the node with the function attributes.
+      for (const auto& attr : func.attr()) {
+        func_body_node.mutable_attr()->insert(attr);
+      }
+      // Move the node to the main graph
+      optimized_graph->add_node()->Swap(&func_body_node);
+    }
+  }
+
+  // Hook inlined function outputs to IdentityN node
+  NodeDef* func_outputs = optimized_graph->add_node();
+  std::vector<string> fetch = OutputTensors(item);
+  TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
+                                                fetch, func_outputs));
+
+  return Status::OK();
+}
+
+class FakeCPUDevice : public Device {
+ public:
+  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
+  Status Sync() override { return Status::OK(); }
+};
+
+class SymbolicGradientEnv {
+ public:
+  SymbolicGradientEnv(int graph_version, const FunctionDefLibrary& library)
+      : graph_version_(graph_version), library_(library) {}
+
+  FunctionLibraryDefinition* function_library() {
+    InitializeIfNeeded();
+    return fld_.get();
+  }
+  FunctionLibraryRuntime* function_library_runtime() {
+    InitializeIfNeeded();
+    return flr_;
+  }
+
+ private:
+  // This initialization is expensive. Do it lazily to avoid paying for it
+  // unless it's needed.
+  void InitializeIfNeeded() {
+    if (flr_) {
+      return;
+    }
+    Env* env = Env::Default();
+    DeviceAttributes attr;
+    attr.set_name("/device:CPU:0");
+    attr.set_device_type("CPU");
+    FakeCPUDevice* dev = new FakeCPUDevice(env, attr);
+    std::vector<Device*> devices;
+    devices.push_back(dev);
+    dvc_mgr_.reset(new DeviceMgr(devices));
+    fld_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), library_));
+    OptimizerOptions optimizer_opts;
+    optimizer_opts.set_do_function_inlining(true);
+    pflr_.reset(new ProcessFunctionLibraryRuntime(
+        dvc_mgr_.get(), env, graph_version_, fld_.get(), optimizer_opts));
+    flr_ = pflr_->GetFLR(dev->name());
+  }
+
+  const int graph_version_;
+  const FunctionDefLibrary& library_;
+  std::unique_ptr<DeviceMgr> dvc_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> fld_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* flr_ = nullptr;
+};
+
+Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
+                              GraphDef* inlined_graph) {
+  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
+
+  GraphDef graph_def;
+
+  // Create a node to anchor the gradient inputs
+  NodeDef* inlined_input = graph_def.add_node();
+  inlined_input->set_name("FunctionInputs");
+  inlined_input->set_op("IdentityN");
+  AttrValue::ListValue* type_list =
+      (*inlined_input->mutable_attr())["T"].mutable_list();
+  for (const auto& type : node.attr().at("Tin").list().type()) {
+    type_list->add_type(static_cast<DataType>(type));
+  }
+
+  // Add the gradient node
+  NodeDef* inlined = graph_def.add_node();
+  *inlined = node;
+  inlined->clear_input();
+  for (int i = 0; i < node.attr().at("Tin").list().type_size(); ++i) {
+    inlined->add_input(strings::StrCat(inlined_input->name(), ":", i));
+  }
+
+  // Create a node to anchor the gradient outputs
+  NodeDef* inlined_output = graph_def.add_node();
+  inlined_output->set_name("FunctionOutputs");
+  inlined_output->set_op("IdentityN");
+  type_list = (*inlined_output->mutable_attr())["T"].mutable_list();
+  for (const auto& type : node.attr().at("Tout").list().type()) {
+    type_list->add_type(static_cast<DataType>(type));
+  }
+  for (int i = 0; i < node.attr().at("Tout").list().type_size(); ++i) {
+    inlined_output->add_input(strings::StrCat(inlined->name(), ":", i));
+  }
+
+  // Convert the graphdef to a graph
+  GraphConstructorOptions graph_ctor_opts;
+  graph_ctor_opts.allow_internal_ops = true;
+  graph_ctor_opts.expect_device_spec = false;
+  Graph graph(env->function_library());
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
+
+  // Recursively inline the functions until there is nothing more to inline. We
+  // should at least expand one function.
+  int counter = 0;
+  while (counter < 50 &&
+         ExpandInlineFunctions(env->function_library_runtime(), &graph)) {
+    ++counter;
+  }
+
+  GraphDef inlined_graph_def;
+  graph.ToGraphDef(&inlined_graph_def);
+
+  // Add the default values of attributes to the nodes that have been inlined.
+  TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def,
+                                               *graph.op_registry(), 0, true));
+
+  // Add the inlined nodes to the graph
+  for (NodeDef& inlined_node : *inlined_graph_def.mutable_node()) {
+    if (inlined_node.name() == "FunctionOutputs") {
+      inlined_node.set_name(node.name());
+      for (int i = 0; i < inlined_node.input_size(); ++i) {
+        inlined_node.set_input(
+            i, AddPrefixToNodeName(inlined_node.input(i), node.name()));
+      }
+    } else if (inlined_node.name() == "FunctionInputs") {
+      inlined_node.set_name(
+          AddPrefixToNodeName(inlined_node.name(), node.name()));
+      inlined_node.clear_input();
+      for (int i = 0; i < node.input_size(); ++i) {
+        inlined_node.add_input(node.input(i));
+      }
+    } else {
+      inlined_node.set_name(
+          AddPrefixToNodeName(inlined_node.name(), node.name()));
+      for (int i = 0; i < inlined_node.input_size(); ++i) {
+        inlined_node.set_input(
+            i, AddPrefixToNodeName(inlined_node.input(i), node.name()));
+      }
+      // If the node has no input, hook it up to the function input node to make
+      // sure it runs in the same frame as the other nodes of the function body.
+      if (inlined_node.input_size() == 0) {
+        *inlined_node.add_input() = AsControlDependency(
+            AddPrefixToNodeName("FunctionInputs", node.name()));
+      }
+    }
+    inlined_node.set_device(node.device());
+    inlined_graph->add_node()->Swap(&inlined_node);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                   GraphDef* optimized_graph) {
+  VLOG(1) << "Optimize Grappler item: id=" << item.id;
+
+  // Nothing to do here.
+  if (item.graph.library().function_size() == 0) {
+    VLOG(3) << "Skip Grappler item with empty function library";
+    *optimized_graph = item.graph;
+    return Status::OK();
+  }
+
+  FunctionOptimizerContext ctx(opt_level_, item);
+  SymbolicGradientEnv env(item.graph.versions().producer(),
+                          item.graph.library());
+
+  bool inline_gradients = options_.enable_symbolic_gradient_inlining;
+  bool inline_func = options_.enable_function_inlining;
+  bool specialize_func = options_.enable_function_specialization;
+
+  for (const NodeDef& node : item.graph.node()) {
+    const string func_name = node.op();
+
+    if (func_name == "SymbolicGradient" && inline_gradients) {
+      // Inline symbolic gradients only if the corresponding function is inlined
+      const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
+      string f_name = f_attr != nullptr ? f_attr->func().name() : "";
+      if (ctx.IsInlinedFunction(f_name)) {
+        TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
+        continue;
+      }
+    }
+
+    const FunctionDef* func = ctx.function_library().Find(func_name);
+    if (func != nullptr) {
+      if (inline_func && ctx.IsInlinedFunction(func_name)) {
+        // Inline function body into the optimized graph}
+        TF_RETURN_IF_ERROR(InlineFunction(node, *func, ctx, optimized_graph));
+        continue;
+      }
+
+      // Do not specialize if function has custom gradient.
+      const string grad_func = ctx.function_library().FindGradient(func_name);
+
+      if (specialize_func && grad_func.empty() && IsParametrized(*func)) {
+        // TODO(ezhulenev): Specialize function call if input is a Const or has
+        // a known shape. Const input tensors can be pushed into the function
+        // body and removed from function inputs.
+
+        // Specialize function body for its instantiation attributes and inputs.
+        TF_RETURN_IF_ERROR(
+            SpecializeFunction(node, *func, &ctx, optimized_graph));
+        continue;
+      }
+    }
+
+    // If we reached this point, node was not handled by any of the stages
+    // (inline, specialize), simply add a copy to the graph.
+    *optimized_graph->add_node() = node;
+  }
+
+  *optimized_graph->mutable_versions() = item.graph.versions();
+  *optimized_graph->mutable_library() =
+      options_.enable_trim_function_library
+          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
+          : ctx.function_library().ToProto();
+
+  return Status::OK();
+}
+
+void FunctionOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                 const GraphDef& optimized_graph,
+                                 double result) {
+  // Nothing to do for FunctionOptimizer.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4352555064c43c8db40157ace2fca9479907df8e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Remap TensorFlow subgraphs onto alternative operations or collection of
+// operations to make the overall graph more efficient.
+class FunctionOptimizer : public GraphOptimizer {
+ public:
+  explicit FunctionOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~FunctionOptimizer() override = default;
+
+  string name() const override { return "function_optimizer"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override;
+
+ private:
+  friend class FunctionOptimizerTest;
+
+  struct FunctionOptimizerOptions {
+    bool enable_function_inlining = true;
+    bool enable_function_specialization = true;
+    bool enable_symbolic_gradient_inlining = true;
+    bool enable_trim_function_library = true;
+  };
+
+  RewriterConfig::Toggle opt_level_;
+  FunctionOptimizerOptions options_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6147e8a27c0e1831cc4dab2c481f621fcd37441e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -0,0 +1,661 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+constexpr char kDevice[] = "/device:CPU:0";
+}  // namespace
+
+class FunctionOptimizerTest : public GrapplerTest {
+ protected:
+  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
+    optimizer->options_.enable_function_specialization = false;
+  }
+};
+
+TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Build a graph to compute y = XTimesTwo(x)
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y/inlined_inputs") {
+      count++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    } else if (node.name() == "y/x") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "y/two") {
+      count++;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^y/inlined_inputs", node.input(0));
+    } else if (node.name() == "y/scale") {
+      count++;
+      EXPECT_EQ("Cast", node.op());
+      EXPECT_EQ(kDevice, node.device());
+    } else if (node.name() == "y/y") {
+      count++;
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("y/x", node.input(0));
+      EXPECT_EQ("y/scale", node.input(1));
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/y", node.input(0));
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+    }
+  }
+  EXPECT_EQ(7, count);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Create and instantiate a version of the XTimesTwo function that only
+  // accepts floats a inputs.
+  const Tensor kTwo = test::AsScalar<float>(2.0f);
+  FunctionDef x_times_two = FunctionDefHelper::Define(
+      // Name
+      "XTimesTwo",
+      // Args
+      {"x: float"},
+      // Return values
+      {"y: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"y"}, "Mul", {"x", "two"}, {{"T", DT_FLOAT}}},
+      });
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {
+          x_times_two,
+      });
+
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y/inlined_inputs") {
+      count++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    } else if (node.name() == "y/x") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "y/two") {
+      count++;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^y/inlined_inputs", node.input(0));
+      EXPECT_EQ(kDevice, node.device());
+    } else if (node.name() == "y/y") {
+      count++;
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("y/x", node.input(0));
+      EXPECT_EQ("y/two", node.input(1));
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/y", node.input(0));
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+    }
+  }
+  EXPECT_EQ(6, count);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithOutputMapping) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "Exp_func",
+      // Args
+      {"in: float"},
+      // Return values
+      {"out: float"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"Linear_func"}, "Identity", {"in"}, {{"T", DT_FLOAT}}},
+       {{"Exp"}, "Exp", {"Linear_func:output:0"}, {{"T", DT_FLOAT}}}},
+      // Mapping
+      {{"out", "Exp:y:0"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "Exp_func", {"x"}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {
+          func,
+      });
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y/inlined_inputs") {
+      count++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    } else if (node.name() == "y/in") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "y/Linear_func") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/in", node.input(0));
+    } else if (node.name() == "y/Exp") {
+      count++;
+      EXPECT_EQ("Exp", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/Linear_func", node.input(0));
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y/Exp", node.input(0));
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+    }
+  }
+  EXPECT_EQ(6, count);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithInputForwarding) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "ForwardInputs",
+      // Args
+      {"in0: float", "in1: float", "arg2: float", "arg3: int32", "arg4: float"},
+      // Return values
+      {"out0: float", "arg2: float", "arg3: int32"},
+      // Attr def
+      {},
+      // Nodes
+      {},
+      // Mapping
+      {{"out0", "in0"}, {"arg2", "arg2"}, {"arg3", "arg3"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x2", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x3", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       NDef("x4", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "ForwardInputs", {"x0", "x1", "x2", "x3", "x4"}, {}, kDevice),
+       NDef("z0", "Identity", {"y:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z1", "Identity", {"y:1"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z2", "Identity", {"y:2"}, {{"T", DT_INT32}}, kDevice)},
+      // FunctionLib
+      {
+          func,
+      });
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  item.fetch = {"z0", "z1", "z2"};
+  item.feed.emplace_back("x0", test::AsScalar<float>(3.14f));
+  item.feed.emplace_back("x1", test::AsScalar<float>(2.7f));
+  item.feed.emplace_back("x2", test::AsScalar<float>(1.0f));
+  item.feed.emplace_back("x4", test::AsScalar<float>(-1.0f));
+  item.feed.emplace_back("x3", test::AsScalar<int>(1234));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
+  test::ExpectTensorEqual<int>(tensors_expected[2], tensors[2]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
+
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "GenerateTwo",
+      // Args
+      {},
+      // Return value
+      {"o: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {{{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+       {{"o"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("y", "GenerateTwo", {}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {
+          func,
+      });
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // For now we won't inline the function.
+  EXPECT_EQ(item.graph.DebugString(), output.DebugString());
+}
+
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Define square via function library:
+  //   MySquare(x) = MyMul(x, x)
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("outputs", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func});
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("a", node.input(0));
+    } else if (node.name() == "square/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/x", node.input(0));
+      EXPECT_EQ("square/x", node.input(1));
+    } else if (node.name() == "square/output/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/y" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:1", node.input(0));
+    } else if (node.name() == "square/output/output" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/output/x", node.input(0));
+      EXPECT_EQ("square/output/y", node.input(1));
+    } else if (node.name() == "square/output" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/output", node.input(0));
+    } else if (node.name() == "square" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output", node.input(0));
+    } else if (node.name() == "outputs" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(9, count);
+
+  item.fetch = {"outputs"};
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_TestFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  FunctionDef func = FunctionDefHelper::Define(
+      "TestFunc", {"x:float", "y:float"}, {"l:float"}, {},
+      {
+          {{"z"}, "Add", {"x", "y"}, {{"T", DT_FLOAT}}},
+          FunctionDefHelper::Const("zero", 0),
+          FunctionDefHelper::Const("one", 1),
+          {{"r"}, "Rank", {"z"}, {{"T", DT_FLOAT}}},
+          {{"indices"}, "Range", {"zero", "r", "one"}},
+          {{"l"}, "Sum", {"z", "indices"}, {{"T", DT_FLOAT}}},
+      });
+
+  auto x = ops::Const(scope, 1.0f);
+  auto y = ops::Const(scope, 2.0f);
+  auto dl = ops::Const(scope, 3.0f);
+
+  NameAttrList fn;
+  fn.set_name("TestFunc");
+  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+  auto g0 = ops::SymbolicGradient(scope, std::initializer_list<Input>{x, y, dl},
+                                  {DT_FLOAT, DT_FLOAT}, fn);
+  auto out1 = ops::Identity(scope.WithOpName("out1"), g0.output[0]);
+  auto out2 = ops::Identity(scope.WithOpName("out2"), g0.output[1]);
+
+  GrapplerItem item;
+  TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
+  *item.graph.mutable_library()->add_function() = func;
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"out1", "out2"}, {});
+  std::vector<Tensor> optimized = EvaluateNodes(output, {"out1", "out2"}, {});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+  test::ExpectTensorEqual<float>(expected[1], optimized[1]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "Identity_func",
+      // Args
+      {"in: float"},
+      // Return values
+      {"out: float"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"Identity"}, "Identity", {"in"}, {{"T", DT_FLOAT}}}},
+      // Mapping
+      {{"out", "Identity:output:0"}});
+
+  auto x = ops::Const(scope, 1.0f, {3, 5, 7});
+  auto z = ops::Const(scope, 3.0f, {3, 5, 7});
+
+  NameAttrList fn;
+  fn.set_name("Identity_func");
+  auto g0 = ops::SymbolicGradient(scope, std::initializer_list<Input>{x, z},
+                                  {DT_FLOAT}, fn);
+  auto out = ops::Identity(scope.WithOpName("out"), g0.output[0]);
+
+  GrapplerItem item;
+  TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
+  *item.graph.mutable_library()->add_function() = func;
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_EQ(13, output.node_size());
+  EXPECT_EQ("Const", output.node(0).name());
+  EXPECT_EQ("Const_1", output.node(1).name());
+  EXPECT_EQ("SymbolicGradient/FunctionInputs", output.node(2).name());
+  EXPECT_EQ("SymbolicGradient", output.node(3).name());
+  EXPECT_EQ("SymbolicGradient/SymbolicGradient/Identity",
+            output.node(4).name());
+  EXPECT_EQ("SymbolicGradient/Func/_0", output.node(5).name());
+  EXPECT_EQ("SymbolicGradient/Func/_1", output.node(6).name());
+  EXPECT_EQ("SymbolicGradient/Func/_2", output.node(7).name());
+  EXPECT_EQ("SymbolicGradient/SymbolicGradient/Func/_1/dx",
+            output.node(8).name());
+  EXPECT_EQ("SymbolicGradient/Func/_3", output.node(9).name());
+  EXPECT_EQ("SymbolicGradient/Func/_4", output.node(10).name());
+  EXPECT_EQ("SymbolicGradient/Func/_5", output.node(11).name());
+  EXPECT_EQ("out", output.node(12).name());
+  for (int i = 2; i < 4; ++i) {
+    EXPECT_EQ("IdentityN", output.node(i).op());
+  }
+  for (int i = 4; i < 11; ++i) {
+    EXPECT_EQ("Identity", output.node(i).op());
+  }
+
+  std::vector<Tensor> expected = EvaluateNodes(item.graph, {"out"}, {});
+  std::vector<Tensor> optimized = EvaluateNodes(output, {"out"}, {});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
+  FunctionDef func = FunctionDefHelper::Define(
+      "TestFunc", {"x:float", "y:float"}, {"l:float"}, {},
+      {
+          {{"z"}, "Add", {"x", "y"}, {{"T", DT_FLOAT}}},
+          FunctionDefHelper::Const("zero", 0),
+          FunctionDefHelper::Const("one", 1),
+          {{"r"}, "Rank", {"z"}, {{"T", DT_FLOAT}}},
+          {{"indices"}, "Range", {"zero", "r", "one"}},
+          {{"l"}, "Sum", {"z", "indices"}, {{"T", DT_FLOAT}}},
+      });
+  (*func.mutable_attr())["_noinline"].set_b(true);
+
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(scope, 1.0f);
+  auto y = ops::Const(scope, 2.0f);
+  auto dl = ops::Const(scope, 3.0f);
+
+  NameAttrList fn;
+  fn.set_name("TestFunc");
+  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+  auto g0 = ops::SymbolicGradient(scope, std::initializer_list<Input>{x, y, dl},
+                                  {DT_FLOAT, DT_FLOAT}, fn);
+  auto out1 = ops::Identity(scope.WithOpName("out1"), g0.output[0]);
+  auto out2 = ops::Identity(scope.WithOpName("out2"), g0.output[1]);
+
+  GrapplerItem item;
+  TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
+  *item.graph.mutable_library()->add_function() = func;
+
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  // The optimizer should succeed but the graphs should be the same.
+  TF_EXPECT_OK(status);
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Mark XTimesTwo as noinline.
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  (*x_times_two.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {x_times_two};
+
+  // Build a graph to compute y = XTimesTwo(x).
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  EXPECT_EQ(1, output.library().function_size());
+  EXPECT_EQ("XTimesTwo_specialized_for_y",
+            output.library().function(0).signature().name());
+
+  // And 'y' node is calling specialized function.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y" && count++) {
+      EXPECT_EQ("XTimesTwo_specialized_for_y", node.op());
+    }
+  }
+  EXPECT_EQ(1, count);
+
+  // And that graph evaluation yields the same result.
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/gpu_swapping_kernels.cc b/tensorflow/core/grappler/optimizers/gpu_swapping_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1820af6844215475d2bfccba93891a52029218b2
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/gpu_swapping_kernels.cc
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Op kernels used to swap data in and out of GPU memory.
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace {
+
+class CopyFromGpuToHostKernel : public AsyncOpKernel {
+ public:
+  explicit CopyFromGpuToHostKernel(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {}
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor& input = ctx->input(0);
+    OP_REQUIRES_ASYNC(
+        ctx, !ctx->input_alloc_attr(0).on_host(),
+        errors::Internal("The input tensor to the _CopyFromGpuToHost kernel "
+                         "must reside on the device."),
+        done);
+
+    AllocatorAttributes alloc_attrs;
+    alloc_attrs.set_gpu_compatible(true);
+    alloc_attrs.set_on_host(true);
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, ctx->allocate_output(0, input.shape(), &output, alloc_attrs),
+        done);
+
+    ctx->op_device_context()->CopyDeviceTensorToCPU(
+        &input, "CopyFromGpuToHost", static_cast<Device*>(ctx->device()),
+        output, [ctx, done](const Status& s) {
+          ctx->SetStatus(s);
+          done();
+        });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("_CopyFromGpuToHost").Device(DEVICE_GPU).HostMemory("output"),
+    CopyFromGpuToHostKernel);
+
+class CopyFromHostToGpuKernel : public AsyncOpKernel {
+ public:
+  explicit CopyFromHostToGpuKernel(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {}
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor& input = ctx->input(0);
+    OP_REQUIRES_ASYNC(
+        ctx, ctx->input_alloc_attr(0).on_host(),
+        errors::Internal("The input tensor to the _CopyFromHostToGpu kernel "
+                         "must reside on the host."),
+        done);
+
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->allocate_output(0, input.shape(), &output),
+                         done);
+
+    ctx->op_device_context()->CopyCPUTensorToDevice(
+        &input, static_cast<Device*>(ctx->device()), output,
+        [ctx, done](const Status& s) {
+          ctx->SetStatus(s);
+          done();
+        });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("_CopyFromHostToGpu").Device(DEVICE_GPU).HostMemory("input"),
+    CopyFromHostToGpuKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/gpu_swapping_ops.cc b/tensorflow/core/grappler/optimizers/gpu_swapping_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46828346da608a237528da2a2a8070c57946f762
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/gpu_swapping_ops.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Definition for the ops used to swap data in and out of GPU memory.
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace {
+
+// The _CopyFromGpuToHost op copies its input tensor to the host. The input must
+// reside on GPU. The op itself must be placed on GPU.
+REGISTER_OP("_CopyFromGpuToHost")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    })
+    .Doc("Copies the input tensor from gpu to the host.");
+
+// The _CopyFromHostToGpu op copies its input tensor from the host to the GPU.
+// The input must reside on CPU. The op itself must be placed on GPU.
+REGISTER_OP("_CopyFromHostToGpu")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    })
+    .Doc("Copies the input tensor from the host to the GPU.");
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index 42d9837312d25f3504c85f12883c4ac818157cdd..765dd13263f0298a850c8598d499bfeeda2f186c 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -50,4 +50,4 @@ class GraphOptimizer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ea57f7b4f003e8a98fe187f6325e39ebe30e9e7
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+
+namespace tensorflow {
+namespace grappler {
+
+const NodeScopeAndName ParseNodeScopeAndName(const string& node_name) {
+  auto pos = node_name.find_last_of("/");
+  if (pos == string::npos) {
+    return {"", node_name};
+  } else {
+    return {node_name.substr(0, pos), node_name.substr(pos + 1)};
+  }
+};
+
+Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
+                    NodeDef** node) {
+  string node_name = NodeName(input);
+  NodeDef* node_by_name = ctx.node_map->GetNode(node_name);
+  if (node_by_name == nullptr) {
+    return errors::FailedPrecondition("Node ", node_name,
+                                      " doesn't exists in a node map");
+  }
+  *node = node_by_name;
+  return Status::OK();
+}
+
+Status GetTensorProperties(const GraphOptimizerContext& ctx,
+                           const string& tensor,
+                           OpInfo::TensorProperties* properties) {
+  if (ctx.graph_properties == nullptr) {
+    return errors::InvalidArgument("Graph properties are unknown.");
+  }
+
+  int port;
+  string tensor_node_name = ParseNodeName(tensor, &port);
+  if (port < 0) {
+    return errors::InvalidArgument(
+        "Can't get tensor properties of control dependency ", tensor);
+  }
+
+  const auto& output_properties =
+      ctx.graph_properties->GetOutputProperties(tensor_node_name);
+  auto num_outputs = output_properties.size();
+
+  if (num_outputs == 0 || port > num_outputs - 1) {
+    return errors::InvalidArgument(
+        "Node ", tensor_node_name,
+        " is missing output properties at position :", port,
+        " (num_outputs=", num_outputs, ")");
+  }
+
+  properties->CopyFrom(output_properties[port]);
+  return Status::OK();
+}
+
+NodeDef* AddCopyNode(const GraphOptimizerContext& ctx, const string& name,
+                     const NodeDef* node_to_copy) {
+  CHECK(node_to_copy != nullptr);
+  CHECK(!ctx.node_map->NodeExists(name))
+      << "Node " << name << " already exists in a graph";
+  NodeDef* new_node = ctx.optimized_graph->add_node();
+  *new_node = *node_to_copy;
+  new_node->set_name(name);
+  ctx.node_map->AddNode(name, new_node);
+  return new_node;
+}
+
+NodeDef* AddEmptyNode(const GraphOptimizerContext& ctx, const string& name) {
+  CHECK(!ctx.node_map->NodeExists(name))
+      << "Node " << name << " already exists in a graph";
+  NodeDef* new_node = ctx.optimized_graph->add_node();
+  new_node->set_name(name);
+  ctx.node_map->AddNode(name, new_node);
+  return new_node;
+}
+
+const string MakeOptimizedNodeName(const NodeScopeAndName& node,
+                                   const string& sub_scope,
+                                   const string& prefix) {
+  CHECK(!sub_scope.empty() || !prefix.empty())
+      << "Either optimized node name prefix or sub-scope must be non-empty";
+  string optimized_node_name;
+  if (!node.scope.empty()) {
+    strings::StrAppend(&optimized_node_name, node.scope, "/");
+  }
+  if (!sub_scope.empty()) {
+    strings::StrAppend(&optimized_node_name, sub_scope, "/");
+  }
+  if (!prefix.empty()) {
+    strings::StrAppend(&optimized_node_name, prefix, "_");
+  }
+  strings::StrAppend(&optimized_node_name, node.name);
+  return optimized_node_name;
+}
+
+const string MakeOptimizedNodeName(const NodeScopeAndName& root,
+                                   const std::vector<string> node_names,
+                                   const string& sub_scope,
+                                   const string& prefix) {
+  string optimized_node_name = MakeOptimizedNodeName(root, sub_scope, prefix);
+  for (const string& node_name : node_names) {
+    auto name_and_scope = ParseNodeScopeAndName(node_name);
+    strings::StrAppend(&optimized_node_name, "_", name_and_scope.name);
+  }
+  return optimized_node_name;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0ec967473bbec782fdcfe7dfed9848a791d26fc
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -0,0 +1,263 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_STAGE_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_STAGE_H_
+
+#include <unordered_map>
+#include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+struct NodeScopeAndName {
+  string scope;
+  string name;
+};
+
+// Parse scope and name: "a/b/c/Add_1" -> {"a/b/c", "Add_1"}
+const NodeScopeAndName ParseNodeScopeAndName(const string& node_name);
+
+// Context owned by GraphOptimizer, and passed to every stage at construction
+// time. Each optimizer stage is responsible for updating it according to the
+// changes it made to the graph.
+//
+// If an optimizer needs access to some helper class that is not present in this
+// context, consider creating an extension context, specific to that
+// optimizer (see example of ArithmeticOptimizerContext). GraphOptimizerContext
+// should only have members that are useful to almost all optimizers.
+struct GraphOptimizerContext {
+  GraphOptimizerContext(const std::unordered_set<string>* nodes_to_preserve,
+                        GraphDef* optimized_graph,
+                        GraphProperties* graph_properties, NodeMap* node_map)
+      : nodes_to_preserve(nodes_to_preserve),
+        optimized_graph(optimized_graph),
+        graph_properties(graph_properties),
+        node_map(node_map) {}
+
+  const std::unordered_set<string>* nodes_to_preserve;
+  GraphDef* optimized_graph;
+  GraphProperties* graph_properties;
+  NodeMap* node_map;
+};
+
+Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
+                    NodeDef** node);
+Status GetTensorProperties(const GraphOptimizerContext& ctx,
+                           const string& tensor,
+                           OpInfo::TensorProperties* properties);
+
+NodeDef* AddCopyNode(const GraphOptimizerContext& ctx, const string& name,
+                     const NodeDef* node_to_copy);
+NodeDef* AddEmptyNode(const GraphOptimizerContext& ctx, const string& name);
+
+// WARNING:
+// Optimizer stage must try to re-use original nodes of a graph and
+// make all updates in place. This helps to make robust node placement
+// decisions. Create new nodes only if there is a reason for that.
+
+// Make a name for a new node obtained by optimizing a single node of the
+// original graph. The optimized node is placed under the original node scope.
+//
+// Node name uniqueness is guaranteed by unique name of an original node in
+// a same scope.
+//
+// Empty sub_scope or prefix ignored. At least one of them must be non-empty.
+//
+// Example: a/b/c/Add -> a/b/c/${sub_scope}/${prefix}_Add.
+const string MakeOptimizedNodeName(const NodeScopeAndName& node,
+                                   const string& sub_scope,
+                                   const string& prefix);
+// Make a name for a new node obtained by optimizing multiple nodes of the
+// original graph, starting from "root". The optimized node is placed under
+// the original scope of a "root" node.
+//
+// Example: [a/b/c/Add, x/y/z/Mul] -> a/b/c/${sub_scope}/${prefix}_Add_Mul
+const string MakeOptimizedNodeName(const NodeScopeAndName& root,
+                                   const std::vector<string> node_names,
+                                   const string& sub_scope,
+                                   const string& prefix);
+
+// Base class for multi-stage GraphOptimizers (ArithmeticOptimizer, etc...).
+//
+// If a graph optimizer consists of large number of small independent
+// rewrites, each of them should be implemented as a separate stage.
+//
+// * Result:
+// Each graph optimizer choose what result is reported by each stage
+// (e.g. each stage can fill in the name of optimized nodes, or have more
+// complex result).
+template <typename Result>
+class GraphOptimizerStage {
+ public:
+  explicit GraphOptimizerStage(const string& optimizer_name,
+                               const string& stage_name,
+                               const GraphOptimizerContext& ctx)
+      : optimizer_name_(optimizer_name), stage_name_(stage_name), ctx_(ctx) {}
+  virtual ~GraphOptimizerStage() = default;
+
+  const string& stage_name() const { return stage_name_; }
+  const string& optimizer_name() const { return optimizer_name_; }
+
+  // Check if we should try to simplify node. Returning true doesn't
+  // guarantee that node will be simplified.
+  //
+  // Should implement just a basic sanity check, without any expensive graph
+  // traversals.
+  virtual bool IsSupported(const NodeDef* node) const = 0;
+
+  // Try to simplify the given node.
+  //
+  // Return error status only if some precondition is failed, or got an
+  // incorrect graph. In every other case return Status:OK(), even if didn't
+  // simplify anything.
+  //
+  // Report result using output argument. Each GraphOptimizer can choose it's
+  // own Result type.
+  // TODO(ezhulenev): if it will appear that Result output parameter is not
+  // sufficiently useful (used with a reason by most optimizers), get rid of it,
+  // and remove template parameter.
+  virtual Status TrySimplify(NodeDef* node, Result* result) = 0;
+
+  // Return InvalidArgumentError if node is not supported by the optimizer
+  // stage.
+  // TODO(ezhulenev): make this check part of non-virtual public API
+  // (TrySimplify), and make virtual implementation protected.
+  Status EnsureNodeIsSupported(const NodeDef* node) const {
+    return IsSupported(node)
+               ? Status::OK()
+               : errors::InvalidArgument(
+                     "Node ", node->name(), " is not supported by optimizer ",
+                     optimizer_name_, " and stage ", stage_name_);
+  }
+
+  // Get a name for a new node, created by this stage, based on one or multiple
+  // nodes of an original graph.
+  const string OptimizedNodeName(const NodeScopeAndName& node) const {
+    return MakeOptimizedNodeName(node, optimizer_name_, stage_name_);
+  }
+  const string OptimizedNodeName(const NodeScopeAndName& root,
+                                 const std::vector<string>& nodes) const {
+    return MakeOptimizedNodeName(root, nodes, optimizer_name_, stage_name_);
+  }
+  const string OptimizedNodeName(const NodeScopeAndName& node,
+                                 const string& rewrite_rule) const {
+    const string prefix = strings::StrCat(stage_name_, "_", rewrite_rule);
+    return MakeOptimizedNodeName(node, optimizer_name_, prefix);
+  }
+
+  // Get a node by input name from a node map. Return an error if node was not
+  // found.
+  Status GetInputNode(const string& input, NodeDef** node) const {
+    return ::tensorflow::grappler::GetInputNode(ctx_, input, node);
+  }
+  // Lookup tensor properties by name. Tensor name might have non-zero port
+  // number. Return an error if tensor node doesn't exists in a graph, or it
+  // doesn't have properties defined for requested port.
+  Status GetTensorProperties(const string& tensor,
+                             OpInfo::TensorProperties* properties) const {
+    return ::tensorflow::grappler::GetTensorProperties(ctx_, tensor,
+                                                       properties);
+  }
+
+  NodeDef* AddCopyNode(const string& name, const NodeDef* node_to_copy) {
+    return ::tensorflow::grappler::AddCopyNode(ctx_, name, node_to_copy);
+  }
+  NodeDef* AddEmptyNode(const string& name) {
+    return ::tensorflow::grappler::AddEmptyNode(ctx_, name);
+  }
+
+ protected:
+  const GraphOptimizerContext& ctx() const { return ctx_; }
+
+ private:  // Data members
+  const string optimizer_name_;
+  const string stage_name_;
+  const GraphOptimizerContext ctx_;
+};
+
+template <typename Result>
+class GraphOptimizerStagePipeline {
+ public:
+  // Break predicate specifies if a pipeline should stop early, and not pass
+  // a node to the next registered optimizer stage, typically that should be the
+  // case when a stage successfully optimized a node, and it wants to yield
+  // control to the optimizer.
+  explicit GraphOptimizerStagePipeline(
+      const std::function<bool(const Result&)> break_predicate)
+      : break_predicate_(break_predicate) {}
+
+  // Add a stage to the pipeline. It should be called with the arguments for the
+  // stage constructor:
+  //
+  //   pipeline.AddStage<FooStage>(constructor_arg1, constructor_arg2);
+  //
+  // Returns a reference to the added stage.
+  template <typename T, typename... Args>
+  T& AddStage(Args&&... args) {
+    auto stage = new T(std::forward<Args>(args)...);
+    stages_.push_back(std::unique_ptr<T>(stage));
+    return *stage;
+  }
+
+  // Pass a node through all registered optimizer stages, until break predicate
+  // is true.
+  //
+  // Return true, if pipeline exited after a break predicate was evaluated as
+  // 'true', which typically means that a node was optimized by one of the
+  // registered stages.
+  //
+  // Return false, if node was not optimized by any of registered stages.
+  bool PassThroughAllStages(NodeDef* node, Result* result) {
+    for (auto& stage : stages_) {
+      if (stage->IsSupported(node)) {
+        const Status stage_status = stage->TrySimplify(node, result);
+        // Each stage must be "error safe" (just like exception safe). In
+        // case of any error it must leave optimized graph unmodified.
+        if (!stage_status.ok()) {
+          LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
+                       << ", stage " << stage->stage_name()
+                       << ". Error: " << stage_status.error_message();
+        }
+        if (break_predicate_(*result)) return true;
+      }
+    }
+    return false;
+  }
+
+  std::size_t NumStages() { return stages_.size(); }
+
+  std::vector<string> StageNames() {
+    std::vector<string> names;
+    for (const auto& stage : stages_) {
+      names.push_back(stage->stage_name());
+    }
+    return names;
+  }
+
+ private:
+  std::vector<std::unique_ptr<GraphOptimizerStage<Result>>> stages_;
+  std::function<bool(const Result&)> break_predicate_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GraphOptimizerStagePipeline);
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_STAGE_H_
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f5ab87a5a372a0dc954aa5a9ae57241635d5594
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
@@ -0,0 +1,166 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class GraphOptimizerStageTest : public ::testing::Test {};
+
+struct FakeResult {};
+
+// NoOp optimizer stage that supports all the node types and does nothing
+class FakeOptimizerStage : public GraphOptimizerStage<FakeResult> {
+ public:
+  explicit FakeOptimizerStage(const string& optimizer_name,
+                              const string& stage_name,
+                              const GraphOptimizerContext& ctx)
+      : GraphOptimizerStage(optimizer_name, stage_name, ctx) {}
+  ~FakeOptimizerStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override { return true; }
+  Status TrySimplify(NodeDef* node, FakeResult* result) override {
+    return Status::OK();
+  }
+};
+
+TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InRoot) {
+  const auto scope_and_name = ParseNodeScopeAndName("Add");
+  EXPECT_EQ("", scope_and_name.scope);
+  EXPECT_EQ("Add", scope_and_name.name);
+}
+
+TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InScope) {
+  const auto scope_and_name = ParseNodeScopeAndName("a/b/c/Add");
+  EXPECT_EQ("a/b/c", scope_and_name.scope);
+  EXPECT_EQ("Add", scope_and_name.name);
+}
+
+TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ nullptr,
+                            /*graph_properties*/ nullptr,
+                            /*node_name*/ nullptr);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  const auto node = ParseNodeScopeAndName("a/b/c/Add");
+
+  // Without rewrite rule
+  EXPECT_EQ("a/b/c/my_opt/my_stg_Add", stage.OptimizedNodeName(node));
+  EXPECT_EQ(
+      "a/b/c/my_opt/my_stg_Add_Mul_Sqrt",
+      stage.OptimizedNodeName(node, std::vector<string>({"Mul", "Sqrt"})));
+
+  // With rewrite rule
+  const string rewrite = "my_rewrite";
+  EXPECT_EQ("a/b/c/my_opt/my_stg_my_rewrite_Add",
+            stage.OptimizedNodeName(node, rewrite));
+}
+
+TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto add = ops::Add(s.WithOpName("Add"), a, b);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(/*assume_valid_feeds*/ false));
+
+  NodeMap node_map(&item.graph);
+
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ &item.graph,
+                            /*graph_properties*/ &properties,
+                            /*node_name*/ &node_map);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  NodeDef* add_node;
+  TF_CHECK_OK(stage.GetInputNode("Add", &add_node));
+  EXPECT_EQ("a", add_node->input(0));
+  EXPECT_EQ("b", add_node->input(1));
+
+  OpInfo::TensorProperties add_properties;
+  TF_CHECK_OK(stage.GetTensorProperties("Add", &add_properties));
+  EXPECT_EQ(DT_FLOAT, add_properties.dtype());
+
+  OpInfo::TensorProperties a_properties;
+  TF_CHECK_OK(stage.GetTensorProperties("a:0", &a_properties));
+  EXPECT_EQ(DT_FLOAT_REF, a_properties.dtype());
+
+  OpInfo::TensorProperties b_properties;
+  TF_CHECK_OK(stage.GetTensorProperties("b:0", &b_properties));
+  EXPECT_EQ(DT_FLOAT_REF, b_properties.dtype());
+}
+
+TEST_F(GraphOptimizerStageTest, AddNodes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto add = ops::Add(s.WithOpName("Add"), a, b);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(/*assume_valid_feeds*/ false));
+
+  NodeMap node_map(&item.graph);
+
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ &item.graph,
+                            /*graph_properties*/ &properties,
+                            /*node_name*/ &node_map);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  NodeDef* add_node;
+  TF_CHECK_OK(stage.GetInputNode("Add", &add_node));
+
+  // Add a new copy node
+  NodeDef* add_node_copy = stage.AddCopyNode("Add_1", add_node);
+  EXPECT_EQ("Add_1", add_node_copy->name());
+  EXPECT_EQ("Add", add_node_copy->op());
+  EXPECT_EQ("a", add_node_copy->input(0));
+  EXPECT_EQ("b", add_node_copy->input(1));
+
+  // It must be available for by-name lookup
+  NodeDef* add_node_copy_by_name;
+  TF_CHECK_OK(stage.GetInputNode("Add_1", &add_node_copy_by_name));
+  EXPECT_EQ(add_node_copy, add_node_copy_by_name);
+
+  // Add new empty node
+  NodeDef* empty_node = stage.AddEmptyNode("Add_2");
+  EXPECT_EQ("Add_2", empty_node->name());
+
+  // It must be available for by-name lookup
+  NodeDef* empty_node_by_name;
+  TF_CHECK_OK(stage.GetInputNode("Add_2", &empty_node_by_name));
+  EXPECT_EQ(empty_node, empty_node_by_name);
+}
+
+}  // namespace
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
index 3d48d628e203e3d1ab6c8ee3bda9575facbd129f..4a5a150dc9234ffdeed9b991c828e8ec30befde8 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
 
 #include <unordered_map>
 #include <unordered_set>
@@ -99,4 +99,4 @@ class GraphRewriter {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 826f00209b15705f2a9b8b43f78134498a19d167..87ab4608627e7b2f5d0a1f122bf2faf135eedd7c 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -17,9 +17,13 @@ limitations under the License.
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -301,10 +305,6 @@ bool IsComparisonOp(const NodeDef& node) {
   return is_compare;
 }
 
-bool IsLogicalOp(const NodeDef& node) {
-  return IsLogicalAnd(node) || IsLogicalNot(node) || IsLogicalOr(node);
-}
-
 bool IsReduceOp(const NodeDef& node) {
   return IsSum(node) || IsMean(node) || IsProd(node) || IsMax(node) ||
          IsMin(node) || IsAll(node) || IsAny(node);
@@ -355,7 +355,7 @@ std::vector<int> DataInputPos(const NodeDef& node) {
   if (IsBetainc(node) || IsSelect(node)) {
     return {0, 1, 2};
   }
-  if (IsShapeN(node) || IsIdentityN(node) || IsAddN(node)) {
+  if (IsShapeN(node) || IsIdentityN(node) || IsAddN(node) || IsMerge(node)) {
     return NonControlInputs(node);
   }
   if (IsConcat(node)) {
@@ -367,6 +367,28 @@ std::vector<int> DataInputPos(const NodeDef& node) {
   return {};
 }
 
+bool IsHostMemory(const NodeDef& node, int output_port) {
+  DeviceNameUtils::ParsedName parsed_name;
+  if (DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) {
+    DeviceType device_type(parsed_name.type);
+    Status s = FindKernelDef(device_type, node, nullptr, nullptr);
+    if (s.ok()) {
+      tensorflow::MemoryTypeVector in_mtypes;
+      tensorflow::MemoryTypeVector out_mtypes;
+      s = tensorflow::MemoryTypesForNode(OpRegistry::Global(), device_type,
+                                         node, &in_mtypes, &out_mtypes);
+      if (s.ok()) {
+        if (out_mtypes[output_port] == HOST_MEMORY) {
+          return true;
+        }
+      }
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
 class GraphProcessor {
  public:
   GraphProcessor(const GraphProperties& graph_properties,
@@ -551,8 +573,8 @@ class NodeProcessor : public GraphProcessor {
     string device;
     string not_used;
     if (DeviceNameUtils::SplitDeviceName(device_name, &not_used, &device) &&
-        (StringPiece(str_util::Lowercase(device)))
-            .contains(str_util::Lowercase(DEVICE_GPU))) {
+        str_util::StrContains(str_util::Lowercase(device),
+                              str_util::Lowercase(DEVICE_GPU))) {
       return true;
     }
     return false;
@@ -887,6 +909,22 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(3, w);
   }
 
+  bool IsInputOnHost(const string& input_name) const {
+    string device = node_->device();
+    DeviceNameUtils::ParsedName parsed_name;
+    if (DeviceNameUtils::ParseFullName(device, &parsed_name)) {
+      if (parsed_name.type != "CPU") {
+        NodeDef* input = node_map_->GetNode(input_name);
+        int port;
+        ParseNodeName(input_name, &port);
+        if (IsHostMemory(*input, port)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
   NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
                                const string& op, DataType dtype,
                                bool nhwc_to_nchw) {
@@ -895,6 +933,13 @@ class NodeProcessor : public GraphProcessor {
     added_node->set_op(op);
     node_map_->AddNode(added_node->name(), added_node);
     added_node->set_device(node_->device());
+    // The inputs of a DataFormat op could be in host memory for ops such as
+    // Reshape. In such cases, run the kernel on the host too.
+    if (IsInputOnHost(input_name)) {
+      AttrValue attr_kernel;
+      attr_kernel.set_s("host");
+      added_node->mutable_attr()->insert({"_kernel", attr_kernel});
+    }
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     added_node->mutable_attr()->insert({"T", attr_data_type});
@@ -1160,9 +1205,11 @@ class AgnosticNodeProcessor : public NodeProcessor {
     std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
     std::deque<NodeDef*> queue;
     auto data_node_pos = DataInputPos(node);
+    std::unordered_set<string> visited;
     for (const auto& pos : data_node_pos) {
       auto input_node = node_map_->GetNode(node.input(pos));
       queue.push_back(input_node);
+      visited.insert(input_node->name());
     }
     // The code will exit this while loop in one iteration in most cases, as the
     // graph is already topologically sorted.
@@ -1181,7 +1228,10 @@ class AgnosticNodeProcessor : public NodeProcessor {
         auto current_node_pos = DataInputPos(*current_node);
         for (const auto& pos : current_node_pos) {
           auto input_node = node_map_->GetNode(current_node->input(pos));
-          queue.push_back(input_node);
+          if (visited.find(input_node->name()) == visited.end()) {
+            queue.push_back(input_node);
+            visited.insert(input_node->name());
+          }
         }
       }
     }
@@ -2085,14 +2135,7 @@ int GetNumGPUs(const Cluster& cluster) {
   int num_gpus = 0;
   for (const auto& device : devices) {
     if (device.second.type() == "GPU") {
-      if (device.second.environment().find("architecture") !=
-          device.second.environment().end()) {
-        const string arch = device.second.environment().at("architecture");
-        // TODO(yaozhang): Enable for Volta GPUs (compute capability version 7).
-        if (arch < "7") {
-          num_gpus++;
-        }
-      }
+      num_gpus++;
     }
   }
   return num_gpus;
@@ -2118,6 +2161,10 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
 
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
+  if (cluster == nullptr) {
+    return errors::InvalidArgument("cluster == nullptr");
+  }
+
   if (GetNumGPUs(*cluster) < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     *output = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.h b/tensorflow/core/grappler/optimizers/layout_optimizer.h
index 357205828ddea3f35a6dd202606a5b59d8baa5a5..49b697bb75b6b86870f633f24494bfb9d3d45c72 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
@@ -57,4 +57,4 @@ class LayoutOptimizer : public GraphOptimizer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 5cb366df2dccee2260c6f407e992e73296712ccc..dad49cd74f8d26fde58c8d11c8707a7c62e94dab 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -17,10 +17,15 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
@@ -29,15 +34,25 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class LayoutOptimizerTest : public ::testing::Test {
+class LayoutOptimizerTest : public GrapplerTest {
  protected:
   void SetUp() override {
-    DeviceProperties device_properties;
-    device_properties.set_type("GPU");
-    device_properties.mutable_environment()->insert({"architecture", "6"});
-    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+    gpu_available_ = GetNumAvailableGPUs() > 0;
+
+    if (gpu_available_) {
+      virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
+    } else {
+      DeviceProperties device_properties;
+      device_properties.set_type("GPU");
+      device_properties.mutable_environment()->insert({"architecture", "6"});
+      virtual_cluster_.reset(
+          new VirtualCluster({{"/GPU:1", device_properties}}));
+    }
+    TF_CHECK_OK(virtual_cluster_->Provision());
   }
 
+  void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }
+
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
                       const string& padding) {
     return SimpleConv2D(s, input_size, filter_size, padding, "");
@@ -93,10 +108,8 @@ class LayoutOptimizerTest : public ::testing::Test {
 
     TensorShape filter_shape(
         {filter_size, filter_size, input_depth, filter_count});
-    Tensor filter_data(DT_FLOAT, filter_shape);
-    test::FillIota<float>(&filter_data, 1.0f);
     Output filter =
-        ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
+        ops::Variable(s->WithOpName("Filter"), filter_shape, DT_FLOAT);
 
     int output_height = input_height;
     int output_width = input_width;
@@ -128,6 +141,10 @@ class LayoutOptimizerTest : public ::testing::Test {
     return tensor;
   }
 
+  TensorShape GetAttrShape(const NodeDef& node) {
+    return TensorShape(node.attr().at({"shape"}).shape());
+  }
+
   Output SimpleFusedBatchNormGrad(tensorflow::Scope* s, bool is_training) {
     int batch_size = 16;
     int input_height = 8;
@@ -158,7 +175,8 @@ class LayoutOptimizerTest : public ::testing::Test {
     return output.x_backprop;
   }
 
-  std::unique_ptr<VirtualCluster> virtual_cluster_;
+  std::unique_ptr<Cluster> virtual_cluster_;
+  bool gpu_available_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -182,6 +200,18 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   Tensor input_sizes_expected(DT_INT32, {4});
   test::FillValues<int>(&input_sizes_expected, {128, 3, 7, 7});
   test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
+
+  if (gpu_available_) {
+    TensorShape filter_shape = GetAttrShape(*node_map.GetNode("Filter"));
+    Tensor filter_data = GenerateRandomTensor<DT_FLOAT>(filter_shape);
+    std::vector<string> fetch = {"Fetch"};
+    auto tensors_expected =
+        EvaluateNodes(item.graph, fetch, {{"Filter", filter_data}});
+    auto tensors = EvaluateNodes(output, fetch, {{"Filter", filter_data}});
+    EXPECT_EQ(1, tensors_expected.size());
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  }
 }
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
@@ -1102,6 +1132,55 @@ TEST_F(LayoutOptimizerTest, IdentityNWithInputsVectorAnd4D) {
   EXPECT_EQ(add_node->input(1),
             "identity_n-0-1-TransposeNCHWToNHWC-LayoutOptimizer");
 }
+
+TEST_F(LayoutOptimizerTest, LoopNoLiveLock) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto c = ops::Const(s.WithOpName("const"), 3.0f, {8, 3, 3, 2});
+  auto merge = ops::Merge(s.WithOpName("merge"), {c, c});
+  auto i0 = ops::Identity(s.WithOpName("i0"), merge.output);
+  ops::Variable v_ctrl(s.WithOpName("v_ctrl"), {}, DT_BOOL);
+  auto sw = ops::Switch(s.WithOpName("switch"), i0, v_ctrl);
+  auto next = ops::NextIteration(s.WithOpName("next"), sw.output_true);
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, sw.output_false);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  NodeMap node_map_original(&item.graph);
+  auto merge_node = node_map_original.GetNode("merge");
+  // Modify the graph to create a loop
+  merge_node->set_input(1, "next");
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->input(0),
+            "Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer");
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0),
+            "Conv2D-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
+}
+
+TEST_F(LayoutOptimizerTest, DevicePlacement) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto shape = ops::Shape(s.WithOpName("s"), conv);
+  auto i = ops::Identity(s.WithOpName("i"), shape);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  VirtualPlacer virtual_placer(virtual_cluster_.get());
+  for (auto& node : *item.graph.mutable_node()) {
+    string device = virtual_placer.get_canonical_device_name(node);
+    node.set_device(device);
+  }
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto vec_permute =
+      node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(vec_permute->attr().at("_kernel").s(), "host");
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 102526e22f4742cb90757a1daf55467dd16afc3e..f7994221bb30cc0b019e3cf3ee251afb25dcaf22 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -15,23 +15,510 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 
+#include <algorithm>
+#include <deque>
+#include <limits>
 #include <unordered_map>
 #include <unordered_set>
+#include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
+
+using tensorflow::strings::StrCat;
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+
+class LoopInvariantNodeMotionOptimizer {
+ public:
+  explicit LoopInvariantNodeMotionOptimizer(GraphDef* optimized_graph)
+      : optimized_graph_(optimized_graph) {}
+  virtual ~LoopInvariantNodeMotionOptimizer() = default;
+  Status Optimize();
+
+ private:
+  Status FindInvariantNodes(NodeDef* node);
+  Status RevertInvariantNodes();
+  Status MoveInvariantNodes(const int frame_id);
+  Status HandleInvariantNode(NodeDef* node, const int num_outputs,
+                             const int frame_id);
+  Status HandleConst(NodeDef* node, const int num_outputs, const int frame_id);
+  Status HandleInvariantEnter(NodeDef* node, const int num_outputs);
+
+  GraphDef* optimized_graph_;  // Not owned.
+  std::unique_ptr<NodeMap> node_map_;
+  std::map<NodeDef*, int> invariant_nodes_;
+  std::set<int> empty_set_;
+  // TODO(rmlarsen): Use vector instead of map, since frames ids are dense.
+  std::map<int, std::set<int>> frame_children_;
+  std::map<int, int> frame_parent_;
+  std::map<int, const NodeDef*> loop_cond_;
+  std::map<int, std::vector<NodeDef*>> invariant_enters_;
+  int new_enter_id_;
+};
+
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantEnter(
+    NodeDef* node, const int num_outputs) {
+  auto consumers = node_map_->GetOutputs(node->name());
+  std::vector<string> enter_control_inputs;
+  string enter_input;
+  for (auto& input : node->input()) {
+    if (IsControlInput(input)) {
+      enter_control_inputs.push_back(input);
+    } else {
+      enter_input = input;
+    }
+  }
+  for (auto* consumer : consumers) {
+    if (invariant_nodes_.count(consumer)) {
+      for (int i = 0; i < consumer->input_size(); ++i) {
+        if (NodeName(consumer->input(i)) == node->name()) {
+          consumer->set_input(i, enter_input);
+          node_map_->AddOutput(NodeName(enter_input), consumer->name());
+          node_map_->RemoveOutput(node->name(), consumer->name());
+        }
+      }
+      for (auto& control_input : enter_control_inputs) {
+        consumer->add_input(control_input);
+        node_map_->AddOutput(NodeName(control_input), consumer->name());
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status LoopInvariantNodeMotionOptimizer::HandleConst(NodeDef* node,
+                                                     const int num_outputs,
+                                                     const int frame_id) {
+  NodeDef* const_node = nullptr;
+  if (num_outputs == 0) {
+    // all successor nodes are invariant
+    // Remove the control inputs from this frame to the const node,
+    // when moving it out of the frame (in parent frame)
+    const_node = node;
+    node_map_->RemoveInputs(node->name());
+    node->clear_input();
+  } else {
+    // some successor nodes are variant
+    // Have to keep the const node in the frame,
+    // so create a new one outside the frame (in parent frame)
+    const string const_node_name =
+        AddPrefixToNodeName(node->name(), kLoopOptimizer);
+    const_node = node_map_->GetNode(const_node_name);
+    if (const_node == nullptr) {
+      const_node = optimized_graph_->add_node();
+      const_node->set_name(const_node_name);
+      const_node->set_op("Const");
+      const_node->set_device(node->device());
+      *const_node->mutable_attr() = node->attr();
+      node_map_->AddNode(const_node->name(), const_node);
+    }
+    auto consumers = node_map_->GetOutputs(node->name());
+    for (auto* consumer : consumers) {
+      if (invariant_nodes_.count(consumer)) {
+        for (int i = 0; i < consumer->input_size(); ++i) {
+          if (NodeName(consumer->input(i)) == node->name()) {
+            if (IsControlInput(consumer->input(i))) {
+              *consumer->mutable_input(i) = AsControlDependency(*const_node);
+            } else {
+              *consumer->mutable_input(i) = const_node->name();
+            }
+            node_map_->AddOutput(const_node->name(), consumer->name());
+            node_map_->RemoveOutput(node->name(), consumer->name());
+          }
+        }
+      }
+    }
+  }
+  // add a control input from the parent frame
+  auto parent_it = frame_parent_.find(frame_id);
+  if (parent_it != frame_parent_.end()) {
+    int parent_id = parent_it->second;
+    auto loop_cond_it = loop_cond_.find(parent_id);
+    if (loop_cond_it == loop_cond_.end()) {
+      return errors::InvalidArgument("Frame ", frame_id,
+                                     " doesn't have a LoopCond node");
+    }
+    auto& loop_cond_name = loop_cond_it->second->name();
+    NodeDef* switch_node = nullptr;
+    for (auto* node : node_map_->GetOutputs(loop_cond_name)) {
+      if (node->op() == "Switch") {
+        switch_node = node;
+        break;
+      }
+    }
+    if (!switch_node) {
+      return errors::InvalidArgument("LoopCond node of Frame ", frame_id,
+                                     " doesn't connect to any Switch node");
+    }
+    string switch_output = StrCat(switch_node->name(), ":1");
+    const string ctrl_dep = ConstantFolding::AddControlDependency(
+        switch_output, optimized_graph_, node_map_.get());
+    const_node->add_input(ctrl_dep);
+    node_map_->AddOutput(NodeName(ctrl_dep), const_node->name());
+  }
+  return Status::OK();
+}
+
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantNode(
+    NodeDef* node, const int num_outputs, const int frame_id) {
+  // have to remove control inputs to the invariant node from the same frame
+  // when moving this node out of this frame
+  for (int i = 0; i < node->input_size(); ++i) {
+    if (IsControlInput(node->input(i))) {
+      node->mutable_input()->SwapElements(i, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+    }
+  }
+  if (num_outputs == 0) {
+    return Status::OK();
+  }
+
+  DataTypeVector input_types;
+  DataTypeVector output_types;
+  OpRegistryInterface* op_registry = OpRegistry::Global();
+  const OpRegistrationData* op_reg_data = nullptr;
+  TF_RETURN_IF_ERROR(op_registry->LookUp(node->op(), &op_reg_data));
+  TF_RETURN_IF_ERROR(InOutTypesForNode(*node, op_reg_data->op_def, &input_types,
+                                       &output_types));
+
+  auto consumers = node_map_->GetOutputs(node->name());
+  string fname = invariant_enters_[frame_id][0]->attr().at("frame_name").s();
+  int piterations =
+      invariant_enters_[frame_id][0]->attr().at("parallel_iterations").i();
+  for (auto* consumer : consumers) {
+    if (!invariant_nodes_.count(consumer)) {
+      for (int i = 0; i < consumer->input_size(); ++i) {
+        int port;
+        string node_name = ParseNodeName(consumer->input(i), &port);
+        if (node_name != node->name()) {
+          continue;
+        }
+        if (port < 0) {
+          return errors::InvalidArgument(
+              "Invariant node should not have control outputs "
+              "to variant node");
+        }
+        DataType output_type = output_types[port];
+        NodeDef* new_enter = optimized_graph_->add_node();
+        new_enter->set_op("Enter");
+        new_enter->set_device(node->device());
+        new_enter->set_name(AddPrefixToNodeName(
+            StrCat(fname, "_enter_", new_enter_id_++), kLoopOptimizer));
+        AttrValue data_type;
+        data_type.set_type(output_type);
+        new_enter->mutable_attr()->insert({"T", data_type});
+        AttrValue frame_name;
+        frame_name.set_s(fname);
+        new_enter->mutable_attr()->insert({"frame_name", frame_name});
+        AttrValue is_const;
+        is_const.set_b(true);
+        new_enter->mutable_attr()->insert({"is_constant", is_const});
+        AttrValue parallel_iterations;
+        parallel_iterations.set_i(piterations);
+        new_enter->mutable_attr()->insert(
+            {"parallel_iterations", parallel_iterations});
+        new_enter->add_input(consumer->input(i));
+        *consumer->mutable_input(i) = new_enter->name();
+        node_map_->AddNode(new_enter->name(), new_enter);
+        node_map_->AddOutput(node->name(), new_enter->name());
+        node_map_->AddOutput(new_enter->name(), consumer->name());
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status LoopInvariantNodeMotionOptimizer::MoveInvariantNodes(
+    const int frame_id) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();
+       ++iter) {
+    auto* invariant_node = iter->first;
+    const int num_outputs = iter->second;
+    if (IsEnter(*invariant_node)) {
+      TF_RETURN_IF_ERROR(HandleInvariantEnter(invariant_node, num_outputs));
+    } else if (IsConstant(*invariant_node)) {
+      TF_RETURN_IF_ERROR(HandleConst(invariant_node, num_outputs, frame_id));
+    } else {
+      TF_RETURN_IF_ERROR(
+          HandleInvariantNode(invariant_node, num_outputs, frame_id));
+    }
+  }
+  return Status::OK();
+}
+
+Status LoopInvariantNodeMotionOptimizer::RevertInvariantNodes() {
+  std::deque<const NodeDef*> reverted_nodes;
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
+    bool erased = false;
+    const auto* node = iter->first;
+    if (!IsConstant(*node) && !IsEnter(*node) && iter->second > 0) {
+      auto& consumers = node_map_->GetOutputs(node->name());
+      for (auto* consumer : consumers) {
+        if (!invariant_nodes_.count(consumer)) {
+          for (const auto& input : consumer->input()) {
+            if (IsControlInput(input) && NodeName(input) == node->name()) {
+              reverted_nodes.push_back(node);
+              invariant_nodes_.erase(iter++);
+              erased = true;
+              break;
+            }
+          }
+          if (erased) break;
+        }
+      }
+    }
+    if (!erased) ++iter;
+  }
+  while (!reverted_nodes.empty()) {
+    const auto* node = reverted_nodes.front();
+    reverted_nodes.pop_front();
+    std::set<NodeDef*> producers;
+    for (const auto& input : node->input()) {
+      auto* producer = node_map_->GetNode(input);
+      auto iter = invariant_nodes_.find(producer);
+      if (iter != invariant_nodes_.end()) {
+        if (IsControlInput(input) && !IsConstant(*producer) &&
+            !IsEnter(*producer)) {
+          reverted_nodes.push_back(producer);
+          invariant_nodes_.erase(iter);
+        } else {
+          producers.insert(producer);
+        }
+      }
+    }
+    for (auto* producer : producers) {
+      auto iter = invariant_nodes_.find(producer);
+      if (iter != invariant_nodes_.end()) {
+        ++iter->second;
+      }
+    }
+    for (auto* consumer : node_map_->GetOutputs(node->name())) {
+      auto iter = invariant_nodes_.find(consumer);
+      if (iter != invariant_nodes_.end()) {
+        reverted_nodes.push_back(consumer);
+        invariant_nodes_.erase(iter);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(
+    NodeDef* start_node) {
+  std::vector<NodeDef*> stack;
+  stack.reserve(32);
+  stack.push_back(start_node);
+  while (!stack.empty()) {
+    NodeDef* node = stack.back();
+    stack.pop_back();
+    auto consumers = node_map_->GetOutputs(node->name());
+    invariant_nodes_.emplace(node, consumers.size());
+    for (auto* consumer : consumers) {
+      if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
+        continue;
+      }
+      bool is_invariant = true;
+      for (const auto& input : consumer->input()) {
+        if (!IsControlInput(input)) {
+          const string name = NodeName(input);
+          auto* producer = node_map_->GetNode(name);
+          if (!invariant_nodes_.count(producer)) {
+            if (IsConstant(*producer)) {
+              invariant_nodes_.insert(
+                  std::make_pair(producer, node_map_->GetOutputs(name).size()));
+            } else {
+              is_invariant = false;
+              break;
+            }
+          }
+        }
+      }
+      if (is_invariant) {
+        std::set<NodeDef*> producers;
+        for (const auto& input : consumer->input()) {
+          auto* producer = node_map_->GetNode(input);
+          producers.insert(producer);
+        }
+        for (auto* producer : producers) {
+          auto iter = invariant_nodes_.find(producer);
+          if (iter != invariant_nodes_.end()) {
+            --iter->second;
+          }
+        }
+        stack.push_back(consumer);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status LoopInvariantNodeMotionOptimizer::Optimize() {
+  node_map_.reset(new NodeMap(optimized_graph_));
+  FrameMap frame_map;
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map, &num_frames));
+  std::deque<int> worklist;
+  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
+    auto* node = iter->first;
+    auto& frame_ids = iter->second;
+    if (frame_ids.size() >= 3) {
+      for (unsigned int i = 1; i < frame_ids.size() - 1; ++i) {
+        frame_parent_[frame_ids[i]] = frame_ids[i - 1];
+        frame_children_[frame_ids[i]].insert(frame_ids[i + 1]);
+      }
+    }
+    if (frame_ids.size() >= 2) {
+      frame_children_[frame_ids[0]].insert(frame_ids[1]);
+      frame_parent_[frame_ids.back()] = frame_ids[frame_ids.size() - 2];
+    }
+    if (frame_ids.size() >= 1) {
+      frame_children_.insert(std::make_pair(frame_ids.back(), empty_set_));
+      if (node->op() == "LoopCond") {
+        if (loop_cond_.count(frame_ids.back())) {
+          return errors::InvalidArgument(
+              "Loop ", frame_ids.back(),
+              " has more than one LoopCond node: ", node->name(), " and ",
+              loop_cond_[frame_ids.back()]->name());
+        }
+        loop_cond_[frame_ids.back()] = node;
+      }
+      if (IsEnter(*node) && node->attr().at("is_constant").b()) {
+        invariant_enters_[frame_ids.back()].push_back(
+            const_cast<NodeDef*>(node));
+      }
+    }
+  }
+
+  for (auto it = frame_children_.begin(); it != frame_children_.end(); ++it) {
+    if (it->second.size() == 0) {
+      worklist.push_back(it->first);
+    }
+  }
+
+  while (!worklist.empty()) {
+    int frame_id = worklist.front();
+    new_enter_id_ = 0;
+    worklist.pop_front();
+    auto parent_it = frame_parent_.find(frame_id);
+    if (parent_it != frame_parent_.end()) {
+      int parent_id = parent_it->second;
+      frame_children_[parent_id].erase(frame_id);
+      if (frame_children_[parent_id].size() == 0) {
+        worklist.push_back(parent_id);
+      }
+    }
+
+    if (invariant_enters_[frame_id].empty()) {
+      continue;
+    }
+    invariant_nodes_.clear();
+    for (auto* enter : invariant_enters_[frame_id]) {
+      TF_RETURN_IF_ERROR(FindInvariantNodes(enter));
+    }
+
+    // revert invariant nodes that have control outputs to variant nodes
+    TF_RETURN_IF_ERROR(RevertInvariantNodes());
+
+    TF_RETURN_IF_ERROR(MoveInvariantNodes(frame_id));
+  }
+  return Status::OK();
+}
+
+std::vector<int> GetStackPushNodesToConvert(
+    const SimpleGraphView& graph_view,
+    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
+  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+  const std::unordered_set<string> op_types_to_traverse(
+      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
+       "Identity", "RefIdentity"});
+  std::vector<int> nodes_to_convert;
+  std::set<int> fanout;
+  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
+  for (int fanout_idx : fanout) {
+    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
+    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
+    if (IsStackPushOp(fanout_node)) {
+      nodes_to_convert.push_back(fanout_idx);
+    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
+               op_types_to_traverse.find(fanout_node.op()) !=
+                   op_types_to_traverse.end()) {
+      continue;
+    } else if (!IsStackPopOp(fanout_node) ||
+               (!graph_view.outputs(fanout_idx).empty() ||
+                nodes_to_preserve.find(fanout_node.name()) !=
+                    nodes_to_preserve.end())) {
+      // The node is either a stack pop with consumers or something unexpected
+      // so we leave the graph alone.
+      nodes_to_convert.clear();
+      break;
+    }
+  }
+  return nodes_to_convert;
+}
+
+Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
+  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const GraphDef& graph = item.graph;
+  *optimized_graph = graph;
+  NodeMap node_map(optimized_graph);
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
+  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
+    if (IsStackOp(graph.node(node_idx))) {
+      for (int push_node_idx : GetStackPushNodesToConvert(
+               graph_view, nodes_to_preserve, node_idx)) {
+        // We found push nodes without corresponding pops. Convert them to
+        // Identity passing the data through and add a control dependency from
+        // the op supplying the stack handle.
+        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
+        VLOG(1) << "Converting " << push_node_idx << " : "
+                << push_node->DebugString();
+        if (push_node->attr().count("swap_memory") != 0) {
+          push_node->mutable_attr()->erase("swap_memory");
+        }
+        push_node->set_op("Identity");
+        push_node->mutable_input()->SwapElements(0, 1);
+        const string ctrl_dep = ConstantFolding::AddControlDependency(
+            push_node->input(1), optimized_graph, &node_map);
+        push_node->set_input(1, ctrl_dep);
+        VLOG(1) << "After converting: " << push_node->DebugString();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
 
 Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
+  // Set up helper data structures.
+  if (options_.enable_loop_invariant_node_motion) {
+    LoopInvariantNodeMotionOptimizer linm_optimizer(optimized_graph);
+    TF_RETURN_IF_ERROR(linm_optimizer.Optimize());
+  }
+  if (options_.enable_stack_push_removal) {
+    TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 106d4628ae68f3c92ab597f903f96a6af8a64b8d..a422505d23c197a6fd677c97e326b529a4bd57b2 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -17,18 +17,26 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_
 
 #include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
+constexpr char kLoopOptimizer[] = "LoopOptimizer";
+
 class LoopOptimizer : public GraphOptimizer {
  public:
-  LoopOptimizer() : opt_level_(RewriterConfig::ON) {}
+  LoopOptimizer()
+      : opt_level_(RewriterConfig::ON),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
   explicit LoopOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+      : opt_level_(opt_level),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
+
   ~LoopOptimizer() override {}
 
   string name() const override { return "loop_optimizer"; };
@@ -40,7 +48,21 @@ class LoopOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  friend class LoopOptimizerTest;
+
+  // Granular control for loop optimizer stages.
+  struct LoopOptimizerOptions {
+    bool enable_loop_invariant_node_motion = true;
+    bool enable_stack_push_removal = true;
+
+    static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) {
+      LoopOptimizerOptions options;
+      return options;
+    }
+  };
+
   RewriterConfig::Toggle opt_level_;
+  LoopOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index c09434f60916b9bf269b0f5006b8a3732afaa5fc..10ec544424e651a1c0d39ef6af9a8f824de6c99e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -19,14 +19,462 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
-class LoopOptimizerTest : public ::testing::Test {};
+class LoopOptimizerTest : public GrapplerTest {
+ protected:
+  // These helpers always sets T=DT_FLOAT.
+  void AddEnterNode(const string& name, const string& frame,
+                    const bool is_constant, const int piterations,
+                    const std::vector<string>& inputs, GraphDef* graph) const {
+    std::vector<std::pair<string, AttrValue>> attributes;
+    AttrValue type;
+    type.set_type(DT_FLOAT);
+    attributes.emplace_back("T", type);
+    AttrValue frame_name;
+    frame_name.set_s(frame);
+    attributes.emplace_back("frame_name", frame_name);
+    AttrValue is_const;
+    is_const.set_b(is_constant);
+    attributes.emplace_back("is_constant", is_const);
+    AttrValue parallel_iterations;
+    parallel_iterations.set_i(piterations);
+    attributes.emplace_back("parallel_iterations", parallel_iterations);
+    AddNode(name, "Enter", inputs, attributes, graph);
+  }
+
+  void AddSimpleNode(const string& name, const string& op,
+                     const std::vector<string>& inputs, GraphDef* graph) const {
+    std::vector<std::pair<string, AttrValue>> attributes;
+    AttrValue type;
+    type.set_type(DT_FLOAT);
+    attributes.emplace_back("T", type);
+    AddNode(name, op, inputs, attributes, graph);
+  }
+
+  void DisableAllStages(LoopOptimizer* optimizer) {
+    LoopOptimizer::LoopOptimizerOptions options;
+    options.enable_loop_invariant_node_motion = false;
+    options.enable_stack_push_removal = false;
+    optimizer->options_ = options;
+  }
+
+  void EnableOnlyLoopInvariantNodeMotion(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_loop_invariant_node_motion = true;
+  }
+
+  void EnableOnlyStackPushRemoval(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_stack_push_removal = true;
+  }
+};
+
+TEST_F(LoopOptimizerTest, Basic) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"VariantAdd", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"VariantAdd"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+}
+
+TEST_F(LoopOptimizerTest, Const) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("Const", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "Const"}, &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"VariantAdd", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"VariantAdd"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const")).back(), 0);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 0);
+}
+
+TEST_F(LoopOptimizerTest, ControlOutput) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"VariantAdd", "Less/y", "^InvariantAdd"},
+                &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"VariantAdd"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoop1) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"VariantAdd"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"},
+                &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoop2) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"InvariantAdd"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"},
+                &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoopConst1) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"VariantAdd"}, &graph);
+  AddSimpleNode("Const2", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "Const2"}, &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 0);
+}
+
+TEST_F(LoopOptimizerTest, NestedLoopConst2) {
+  GraphDef graph;
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"InvariantAdd"}, &graph);
+  AddSimpleNode("Const2", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "Const2"}, &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::unique_ptr<NodeMap> node_map;
+  std::unordered_map<const NodeDef*, std::vector<int>> frames;
+  int num_frames;
+
+  node_map.reset(new NodeMap(&graph));
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
+
+  node_map.reset(new NodeMap(&output));
+  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
+  EXPECT_EQ(num_frames, 2);
+  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
+  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 0);
+}
 
 void VerifyGraphsEqual(const GraphDef& original_graph,
                        const GraphDef& optimized_graph, const string& func) {
@@ -50,6 +498,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   CHECK(fake_input.NextItem(&item));
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -57,6 +506,88 @@ TEST_F(LoopOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-}  // namespace
+TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
+  GrapplerItem item;
+  GraphDef& graph = item.graph;
+  AddSimpleNode("c", "Const", {}, &graph);
+  // Stack with corresponding push/pop.
+  AddSimpleNode("stack1", "StackV2", {}, &graph);
+  AddSimpleNode("push1", "StackPushV2", {"stack1", "c"}, &graph);
+  AddSimpleNode("pop1", "StackPopV2", {"stack1"}, &graph);
+  AddSimpleNode("id1", "Identity", {"pop1"}, &graph);
+  // Stack with corresponding push/pop behind Enter.
+  AddSimpleNode("stack2", "StackV2", {}, &graph);
+  AddEnterNode("enter2_c", "frame_name", false, 1, {"c"}, &graph);
+  AddEnterNode("enter2_stack2", "frame_name", false, 1, {"stack2"}, &graph);
+  AddSimpleNode("push2", "StackPushV2", {"enter2_stack2", "enter2_c"}, &graph);
+  AddSimpleNode("pop2", "StackPopV2", {"enter2_stack2"}, &graph);
+  AddSimpleNode("id2", "Identity", {"pop2"}, &graph);
+  // Stack with unexpected op type in fanout of Stack.
+  AddSimpleNode("stack3", "StackV2", {}, &graph);
+  AddSimpleNode("push3", "StackPushV2", {"stack3", "c"}, &graph);
+  AddSimpleNode("stop", "StopGradient", {"stack3"}, &graph);
+
+  LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
+TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
+  GrapplerItem item;
+  GraphDef& graph = item.graph;
+  AddSimpleNode("c", "Const", {}, &graph);
+  // Push without Pop.
+  AddSimpleNode("stack1", "StackV2", {}, &graph);
+  AddSimpleNode("push1", "StackPushV2", {"stack1", "c"}, &graph);
+  // Push without Pop behind Enter.
+  AddSimpleNode("stack2", "StackV2", {}, &graph);
+  AddEnterNode("enter_c", "frame_name", false, 1, {"c"}, &graph);
+  AddEnterNode("enter_stack2", "frame_name", false, 1, {"stack2"}, &graph);
+  AddSimpleNode("push2", "StackPushV2", {"enter_stack2", "enter_c"}, &graph);
+  // Pop without consumer.
+  AddSimpleNode("stack3", "StackV2", {}, &graph);
+  AddSimpleNode("push3", "StackPushV2", {"stack3", "c"}, &graph);
+  AddSimpleNode("pop3", "StackPopV2", {"stack3"}, &graph);
+  // Push for a Pop without consumer that is fetched should not be removed.
+  AddSimpleNode("stack4", "StackV2", {}, &graph);
+  AddSimpleNode("push4", "StackPushV2", {"stack4", "c"}, &graph);
+  AddSimpleNode("pop4", "StackPopV2", {"stack4"}, &graph);
+
+  item.fetch.push_back("pop4");
+
+  LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(13, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "push1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("c", node.input(0));
+      EXPECT_EQ("^stack1", node.input(1));
+    } else if (node.name() == "push2") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("enter_c", node.input(0));
+      EXPECT_EQ("^enter_stack2", node.input(1));
+    } else if (node.name() == "push3") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("c", node.input(0));
+      EXPECT_EQ("^stack3", node.input(1));
+    } else {
+      const NodeDef& orig_node = item.graph.node(i);
+      EXPECT_EQ(orig_node.ShortDebugString(), node.ShortDebugString());
+    }
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 3057ee5fa14bd209ad4bb6a9ad690d57435601f4..c1fee0e993dd18578fec561b5bfc2b7b8987d31f 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -413,7 +413,7 @@ void RecomputeSubgraph(
 }
 
 void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
-                                const string& recomputation_targets_name_prefix,
+                                const string& recomputation_targets_name_scope,
                                 GraphDef* graph, const GrapplerItem& item) {
   if (optimization_level != RewriterConfig::RECOMPUTATION_HEURISTICS &&
       optimization_level != RewriterConfig::HEURISTICS &&
@@ -438,15 +438,14 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
     feeds.insert(NodeName(feed.first));
   }
   std::function<bool(const NodeDef&)> is_target =
-      [&recomputation_targets_name_prefix](const NodeDef& node) {
-        // Nodes whose inputs we may want to recompute. Typically targets will
-        // be gradients (recomputation_targets_name_prefix="gradients/"),
-        // although the prefix is configurable since gradients may be created
-        // in a name scope.
-        // TODO(allenl): Use a static schedule
-        // (grappler::EstimateEarliestExecutionTimes) to recompute only nodes
-        // whose outputs will sit around for a while.
-        return node.name().find(recomputation_targets_name_prefix) == 0;
+      [&recomputation_targets_name_scope](const NodeDef& node) {
+        // Nodes whose inputs we may want to recompute. This matches node names
+        // that contain recomputation_targets_name_scope as a name scope,
+        // meaning it either begins with or contains the name scope.
+        // Defaults to "gradients/" which will match any node names that begins
+        // with "gradients/" or contains "/gradients/".
+        return node.name().find(recomputation_targets_name_scope) == 0 ||
+               node.name().find("/" + recomputation_targets_name_scope) != -1;
       };
 
   if (optimization_level == RewriterConfig::RECOMPUTATION_HEURISTICS ||
@@ -720,18 +719,19 @@ Status BuildSwapPair(NodeDef* node, int input_to_swap,
   // Force the tensor to be copied to cpu.
   NodeDef* swap_out_node = graph->add_node();
   swap_out_node->set_name(swap_out_name);
-  swap_out_node->set_op("Identity");
-  swap_out_node->set_device("/device:CPU:0");
+  swap_out_node->set_op("_CopyFromGpuToHost");
 
   // Force the tensor to be restored to the device.
   NodeDef* swap_in_node = graph->add_node();
   swap_in_node->set_name(swap_in_name);
-  swap_in_node->set_op("Identity");
+  swap_in_node->set_op("_CopyFromHostToGpu");
   *swap_in_node->add_input() = swap_out_node->name();
 
-  // Colocate the swap_in_ node with the node itself.
+  // Colocate the swap_out_ and swap_in_ nodes with the node itself.
+  swap_out_node->set_device(node->device());
   swap_in_node->set_device(node->device());
   string coloc_group = strings::StrCat("loc@", tensor_to_swap);
+  (*swap_out_node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
   (*swap_in_node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
   (*node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
 
@@ -1104,7 +1104,8 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
                   Cluster* cluster, GrapplerItem* item,
                   std::unordered_set<string>* skip_list) {
   std::unordered_map<NodeDef*, SwapInfo> nodes_to_swap;
-  if (optimization_level == RewriterConfig::SWAPPING_HEURISTICS ||
+  if (optimization_level == RewriterConfig::DEFAULT_MEM_OPT ||
+      optimization_level == RewriterConfig::SWAPPING_HEURISTICS ||
       optimization_level == RewriterConfig::HEURISTICS) {
     // Use heuristics to figure out what needs to be swapped;
     IdentifySwappingCandidates(cluster, item, skip_list, &nodes_to_swap);
@@ -1223,10 +1224,10 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   *optimized_graph = item.graph;
 
   RecomputationRewritingPass(optimization_level_,
-                             recomputation_targets_name_prefix_,
-                             optimized_graph, item);
+                             recomputation_targets_name_scope_, optimized_graph,
+                             item);
 
-  GrapplerItem optimized_item(item, std::move(*optimized_graph));
+  GrapplerItem optimized_item(item, optimized_graph);
   std::unordered_set<string> skip_list;
   // Bound the number of rewrite passes to avoid long processing times on graphs
   // that simply won't fit in memory.
@@ -1240,7 +1241,8 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       updated_graph |= SchedulingPass(cluster, &optimized_item);
     }
 
-    if ((optimization_level_ == RewriterConfig::SWAPPING_HEURISTICS ||
+    if ((optimization_level_ == RewriterConfig::DEFAULT_MEM_OPT ||
+         optimization_level_ == RewriterConfig::SWAPPING_HEURISTICS ||
          optimization_level_ == RewriterConfig::HEURISTICS ||
          optimization_level_ == RewriterConfig::MANUAL) &&
         cluster != nullptr) {
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.h b/tensorflow/core/grappler/optimizers/memory_optimizer.h
index c3dd0c45c6c524ef850ce7cfb9f6543d22e783ec..653ffaec4c206cbb925bc40359f3950b17255a75 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -27,14 +27,14 @@ class MemoryOptimizer : public GraphOptimizer {
  public:
   // optimization_level: Controls the level of autonomy for the memory
   //   optimizer. See RewriterConfig::memory_optimization.
-  // recomputation_targets_name_prefix: Name prefix for potential outputs of
+  // recomputation_targets_name_scope: Name scope for potential outputs of
   //   recomputations. See
-  //   RewriterConfig::memory_optimizer_target_node_name_prefix.
+  //   RewriterConfig::memory_optimizer_target_node_name_scope.
   explicit MemoryOptimizer(
       RewriterConfig::MemOptType optimization_level,
-      const string& recomputation_targets_name_prefix = "gradients/")
+      const string& recomputation_targets_name_scope = "gradients/")
       : optimization_level_(optimization_level),
-        recomputation_targets_name_prefix_(recomputation_targets_name_prefix) {}
+        recomputation_targets_name_scope_(recomputation_targets_name_scope) {}
   ~MemoryOptimizer() override {}
 
   string name() const override { return "memory_optimizer"; };
@@ -47,10 +47,10 @@ class MemoryOptimizer : public GraphOptimizer {
 
  private:
   RewriterConfig::MemOptType optimization_level_;
-  string recomputation_targets_name_prefix_;
+  string recomputation_targets_name_scope_;
 };
 
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 5d7913e0c018ecf14cc09ab91d3a71125c720aa5..a1f80802ddc2b3c959a74e010f6b45cb421864cf 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -221,16 +221,20 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   // Build a simple graph with an op that's marked for swapping.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Variable(s.WithOpName("a"), {10, 10}, DT_FLOAT);
-  Output b = ops::AddN(s.WithOpName("b"), {a});
-  Output c = ops::AddN(s.WithOpName("c"), {b});
-  Output d = ops::AddN(s.WithOpName("d"), {c});
-  Output e = ops::AddN(s.WithOpName("e"), {b, d});
+  Output a =
+      ops::Variable(s.WithOpName("a").WithDevice("/gpu:0"), {10, 10}, DT_FLOAT);
+  Output b = ops::AddN(s.WithOpName("b").WithDevice("/gpu:0"), {a});
+  Output c = ops::AddN(s.WithOpName("c").WithDevice("/gpu:0"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithDevice("/gpu:0"), {c});
+  Output e = ops::AddN(s.WithOpName("e").WithDevice("/gpu:0"), {b, d});
+
+  Output constant = ops::Const(s.WithOpName("constant"), 0.0f, {10, 10});
+  Output init = ops::Assign(s.WithOpName("init"), a, constant);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  EXPECT_EQ(5, item.graph.node_size());
+  EXPECT_EQ(7, item.graph.node_size());
   EXPECT_EQ(NodeName(e.name()), item.graph.node(4).name());
   AttrValue& val =
       (*item.graph.mutable_node(4)->mutable_attr())["_swap_to_host"];
@@ -243,32 +247,43 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   Status status = optimizer.Optimize(cluster.get(), item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(7, output.node_size());
-  const NodeDef& new_e = output.node(4);
+  EXPECT_EQ(9, output.node_size());
+  const NodeDef& new_e = output.node(6);
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
   EXPECT_EQ(2, new_e.input_size());
   EXPECT_EQ(NodeName(d.name()), new_e.input(1));
   EXPECT_EQ("swap_in_e_0", new_e.input(0));
 
-  const NodeDef& swap_out = output.node(5);
+  const NodeDef& swap_out = output.node(7);
   EXPECT_EQ("swap_out_e_0", swap_out.name());
+  EXPECT_EQ("_CopyFromGpuToHost", swap_out.op());
 
-  const NodeDef& swap_in = output.node(6);
+  const NodeDef& swap_in = output.node(8);
   EXPECT_EQ("swap_in_e_0", swap_in.name());
+  EXPECT_EQ("_CopyFromHostToGpu", swap_in.op());
 
   EXPECT_EQ(NodeName(b.name()), swap_out.input(0));
   EXPECT_EQ(NodeName(swap_out.name()), swap_in.input(0));
   EXPECT_EQ("^c", swap_in.input(1));
 
-  const NodeDef& new_c = output.node(2);
+  const NodeDef& new_c = output.node(4);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
   EXPECT_EQ("^swap_out_e_0", new_c.input(1));
 
   // Run the optimizer a second time to ensure it's idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(cluster.get(), item, &output);
+  GrapplerItem item_copy(item, std::move(output));
+  status = optimizer.Optimize(cluster.get(), item_copy, &output);
   TF_EXPECT_OK(status);
+
+#if GOOGLE_CUDA
+  item.fetch = {"e"};
+  item.init_ops = {init.name()};
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+#endif
 }
 
 TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
@@ -287,9 +302,13 @@ TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
   Output h = ops::Exp(s.WithOpName("h").WithDevice("/gpu:0"), c);
   Output i = ops::Log(s.WithOpName("i").WithDevice("/gpu:0"), d);
 
+  Output constant = ops::Const(s.WithOpName("constant"), 0.0f, {128, 128, 8});
+  Output init = ops::Assign(s.WithOpName("init"), v, constant);
+
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"e", "f", "g", "h", "i"};
+  item.init_ops = {init.name()};
 
   std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
 
@@ -308,6 +327,15 @@ TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
       EXPECT_EQ("axis", node.input(4));
     }
   }
+
+#if GOOGLE_CUDA
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  for (int i = 0; i < item.fetch.size(); ++i) {
+    test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+  }
+#endif
 }
 
 TEST_F(MemoryOptimizerTest, UnswappableInputs) {
@@ -325,9 +353,13 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
   Output e =
       ops::Concat(s.WithOpName("e").WithDevice("/gpu:0"), {b, c, d}, axis);
 
+  Output constant = ops::Const(s.WithOpName("constant"), 0.0f, {128, 128, 8});
+  Output init = ops::Assign(s.WithOpName("init"), v, constant);
+
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"e"};
+  item.init_ops = {init.name()};
 
   std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
 
@@ -344,6 +376,13 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
       EXPECT_EQ("^swap_out_d_2", node.input(4));
     }
   }
+
+#if GOOGLE_CUDA
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+#endif
 }
 
 TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
@@ -387,7 +426,7 @@ TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
   EXPECT_EQ(4, count);
 
   std::vector<string> fetch = {"a", "b", "c", "e"};
-  auto tensors = EvaluateNodes(output, fetch);
+  auto tensors = EvaluateNodes(output, fetch, {});
   EXPECT_EQ(4, tensors.size());
 
   for (int i = 0; i < tensors[0].NumElements(); ++i) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index e27b9df6206c652e4503bb064366201a2b90f13a..5230177dcab2962890565fad355972080619212e 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,17 +14,23 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/colocation.h"
+#include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -32,6 +38,9 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+constexpr int kDefaultNumberOfIterations = 2;
+
 int64 NumEdges(const GraphDef& graph) {
   int64 num_edges = 0;
   for (const auto& node : graph.node()) {
@@ -41,164 +50,287 @@ int64 NumEdges(const GraphDef& graph) {
 }
 
 string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
-  return strings::StrCat("Graph size before: ", before.node_size(), " nodes, ",
-                         NumEdges(before),
-                         " edges. Graph size after: ", after.node_size(),
-                         " nodes, ", NumEdges(after), " edges.");
+  return strings::StrCat("Graph size after: ", after.node_size(), " nodes (",
+                         after.node_size() - before.node_size(), "), ",
+                         NumEdges(after), " edges (",
+                         NumEdges(after) - NumEdges(before), ")");
+}
+
+int NumIterations(const RewriterConfig& cfg) {
+  return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+             ? kDefaultNumberOfIterations
+             : cfg.meta_optimizer_iterations();
+}
+
+// Check if optimizer is allowed to run only once.
+bool IsRunOnceOptimizer(const string& name) {
+  return name == "layout" || name == "memory_optimizer" ||
+         name == "arithmetic_optimizer" || name == "loop_optimizer";
 }
+
 }  // namespace
 
-std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
-    const string& optimizer) {
-  VLOG(1) << "Adding graph optimization pass: " << optimizer;
-  std::unique_ptr<GraphOptimizer> graph_optimizer;
-  if (optimizer == "pruning") {
-    graph_optimizer.reset(new ModelPruner());
+#define MK_OPT(NAME, VALUE) \
+  if (optimizer == NAME) return std::unique_ptr<GraphOptimizer>(VALUE)
+
+std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
+    const string& optimizer) const {
+  MK_OPT("pruning", new ModelPruner());
+  MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
+  MK_OPT("constfold", new ConstantFolding(cpu_device_));
+  MK_OPT("layout", new LayoutOptimizer());
+  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
+  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization()));
+  MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization()));
+  MK_OPT("debug_stripper", new DebugStripper());
+
+  return std::unique_ptr<GraphOptimizer>();
+}
+
+#undef MK_OPT
+
+Status MetaOptimizer::InitializeOptimizers(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  if (!cfg_.disable_model_pruning()) {
+    optimizers->emplace_back(new ModelPruner());
   }
-  if (optimizer == "constfold") {
-    graph_optimizer.reset(new ConstantFolding(cpu_device_));
+  if (cfg_.function_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new FunctionOptimizer(cfg_.function_optimization()));
   }
-  if (optimizer == "layout") {
-    graph_optimizer.reset(new LayoutOptimizer());
+  if (cfg_.debug_stripper() == RewriterConfig::ON) {
+    optimizers->emplace_back(new DebugStripper());
   }
-  if (optimizer == "memory") {
-    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
+  if (cfg_.constant_folding() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
   }
-  if (optimizer == "arithmetic") {
-    graph_optimizer.reset(
+  if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
   }
-  if (optimizer == "autoparallel") {
-    graph_optimizer.reset(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization()));
   }
-  if (optimizer == "dependency") {
-    graph_optimizer.reset(
+  if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new DependencyOptimizer(cfg_.dependency_optimization()));
   }
-  if (optimizer == "loop") {
-    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
+  if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LayoutOptimizer());
+  }
+  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+    if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
+      optimizers->emplace_back(
+          // Use the default target node name prefix "gradients/"
+          new MemoryOptimizer(cfg_.memory_optimization()));
+    } else {
+      optimizers->emplace_back(
+          new MemoryOptimizer(cfg_.memory_optimization(),
+                              cfg_.memory_optimizer_target_node_name_scope()));
+    }
   }
-  return graph_optimizer;
+  if (cfg_.auto_parallel().enable()) {
+    optimizers->emplace_back(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  }
+  return Status::OK();
 }
 
-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
-  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
-  if (cfg_.optimizers().empty()) {
-    if (!cfg_.disable_model_pruning()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
+Status MetaOptimizer::InitializeOptimizersByName(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  for (const string& optimizer_name : cfg_.optimizers()) {
+    auto optimizer = MakeNewOptimizer(optimizer_name);
+    if (optimizer) {
+      VLOG(2) << "Registered default graph optimizer: " << optimizer_name;
+      optimizers->push_back(std::move(optimizer));
+      continue;
     }
-    if (cfg_.constant_folding() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
-    }
-    if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
-    }
-    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new DependencyOptimizer(cfg_.dependency_optimization())));
-    }
-    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new LoopOptimizer(cfg_.loop_optimization())));
-    }
-    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
-    }
-    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
-      if (cfg_.memory_optimizer_target_node_name_prefix().empty()) {
-        optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-            // Use the default target node name prefix "gradients/"
-            new MemoryOptimizer(cfg_.memory_optimization())));
-      } else {
-        optimizers.push_back(
-            std::unique_ptr<GraphOptimizer>(new MemoryOptimizer(
-                cfg_.memory_optimization(),
-                cfg_.memory_optimizer_target_node_name_prefix())));
-      }
+
+    auto custom_optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+
+    if (custom_optimizer) {
+      VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
+      TF_RETURN_IF_ERROR(custom_optimizer->Init());
+      optimizers->push_back(std::move(custom_optimizer));
+    } else {
+      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
     }
-    if (cfg_.auto_parallel().enable()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new AutoParallel(cfg_.auto_parallel().num_replicas())));
+  }
+  for (const auto& optimizer_config : cfg_.custom_optimizers()) {
+    auto custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
+        optimizer_config.name());
+    if (custom_optimizer) {
+      VLOG(2) << "Registered custom configurable graph optimizer: "
+              << optimizer_config.name();
+      TF_RETURN_IF_ERROR(custom_optimizer->Init(&optimizer_config));
+      optimizers->push_back(std::move(custom_optimizer));
+    } else {
+      VLOG(2) << "Can't register an optimizer by name: "
+              << optimizer_config.name();
     }
+  }
+  return Status::OK();
+}
+
+Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                                    GraphDef* optimized_graph) {
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  if (cfg_.optimizers().empty() && cfg_.custom_optimizers().empty()) {
+    TF_RETURN_IF_ERROR(InitializeOptimizers(&optimizers));
   } else {
-    std::set<string> available_optimizers = {
-        "pruning",      "constfold",  "layout",     "memory",
-        "autoparallel", "arithmetic", "dependency", "loop"};
-    for (const auto& optimizer : cfg_.optimizers()) {
-      if (available_optimizers.find(optimizer) != available_optimizers.end()) {
-        optimizers.push_back(NewOptimizer(optimizer));
-      }
-    }
+    TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
   }
 
+  VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id
+          << " num_optimizers=" << optimizers.size();
+
   if (optimizers.empty()) {
+    VLOG(3) << "Skip graph optimization, no optimizers registered";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  bool already_optimized = false;
-  for (const auto& optimizer : optimizers) {
-    if (!already_optimized) {
-      Status status = optimizer->Optimize(cluster, item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        result = status.ToString();
-      } else {
-        already_optimized = true;
-        result = strings::StrCat(
-            "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
-      }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
-    } else {
-      GrapplerItem optimized_item(item, std::move(*optimized_graph));
+  // Invariant: optimized_graph contains the most recently optimized version of
+  // the graph.
+  GrapplerItem optimized_item = item;
+  optimized_graph->Swap(&optimized_item.graph);
+
+  bool is_optimized = false;
+  GraphOptimizationResult optimization_result(item.id);
+
+  for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
+    VLOG(4) << "Starting optimization iteration " << iteration + 1;
+
+    for (const auto& optimizer : optimizers) {
+      // Some optimizers can run only once.
+      if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
+
+      uint64 start_us = Env::Default()->NowMicros();
+      // This swaps the current optimized_graph into optimized item and
+      // resets optimized_graph to an empty graph.
+      optimized_graph->Swap(&optimized_item.graph);
+      *optimized_graph = GraphDef();
       Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
+      uint64 end_us = Env::Default()->NowMicros();
+
       string result;
       if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
         optimized_graph->Swap(&optimized_item.graph);
         result = status.ToString();
       } else {
+        is_optimized = true;
+        float duration_ms = (end_us - start_us) / 1000.0f;
         result = strings::StrCat(
-            "OK. ",
-            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
+            ", time = ", duration_ms, "ms.");
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
+      VLOG(4) << optimizer->name() << ": " << result;
+
+      OptimizerResult optimizer_result{optimizer->name(), result};
+      optimization_result.results.push_back(optimizer_result);
     }
   }
 
-  if (already_optimized) {
+  // Record graph optimization result.
+  optimization_results_.push_back(optimization_result);
+
+  if (is_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
-    // Make sure that the optimizers preserved the graph version and library.
-    DCHECK_GE(optimized_graph->library().function_size(),
-              item.graph.library().function_size());
-    DCHECK_GE(optimized_graph->library().gradient_size(),
-              item.graph.library().gradient_size());
+    ReassignColocation(optimized_graph);
+    // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
-  } else {
-    *optimized_graph = item.graph;
   }
 
   return Status::OK();
 }
 
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  optimization_results_.clear();
+
+  // 1. Optimize main graph
+  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
+
+  // 2. Optimize function library
+  FunctionLibraryDefinition flib(OpRegistry::Global(),
+                                 optimized_graph->library());
+
+  // Optimize each function only once.
+  std::unordered_set<string> optimized_funcs;
+  bool optimize_function_library = true;
+
+  while (optimize_function_library) {
+    optimize_function_library = false;
+
+    for (const FunctionDef& func : optimized_graph->library().function()) {
+      const string& func_name = func.signature().name();
+
+      // Skip already optimized functions.
+      if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue;
+
+      // Skip parametrized functions (function type or body is defined only at
+      // function call time by caller node attributes).
+      if (IsParametrized(func)) continue;
+
+      VLOG(3) << "Optimize function: function=" << func_name;
+
+      // Function optimization might specialize nested function calls, so we
+      // have to reset the flag and do at least one more pass over the library.
+      optimize_function_library = true;
+      optimized_funcs.insert(func_name);
+
+      // Make a GrapplerItem from a FunctionDef.
+      GrapplerFunctionItem func_item;
+      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item));
+
+      // Optimize function body graph.
+      GraphDef optimized_func_graph;
+      TF_RETURN_IF_ERROR(
+          OptimizeGraph(cluster, func_item, &optimized_func_graph));
+
+      // Function body optimization might have created new specialized
+      // functions for each instantiation context. Add them to the library.
+      for (const FunctionDef& func_def :
+           optimized_func_graph.library().function()) {
+        if (flib.Find(func_def.signature().name()) == nullptr) {
+          TF_RETURN_IF_ERROR(flib.AddFunctionDef(func_def));
+        }
+      }
+
+      // Convert optimized graph back to FunctionDef.
+      FunctionDef optimized_func;
+      func_item.SwapFunctionBody(std::move(optimized_func_graph));
+      TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func));
+
+      // Replace optimized function with a new FunctionDef.
+      TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name));
+      TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func));
+    }
+
+    // If optimized at least one function, update the graph library.
+    if (optimize_function_library) {
+      *optimized_graph->mutable_library() = flib.ToProto();
+    }
+  }
+
+  VLOG(3) << "Optimized " << optimized_funcs.size()
+          << " functions: " << str_util::Join(optimized_funcs, ", ");
+
+  return Status::OK();
+}
+
 void MetaOptimizer::PrintResult() {
-  for (const auto& result : result_) {
-    LOG(INFO) << "Return status of optimizer " << result.first << ": "
-              << result.second;
+  for (const GraphOptimizationResult& graph_result : optimization_results_) {
+    LOG(INFO) << "Optimization results for grappler item: " << graph_result.id;
+    for (const OptimizerResult& result : graph_result.results) {
+      LOG(INFO) << "  " << result.optimizer_name << ": " << result.result;
+    }
   }
 }
 
@@ -210,13 +342,15 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
   return !cfg.disable_model_pruning() ||
          cfg.layout_optimizer() != RewriterConfig::OFF ||
+         cfg.function_optimization() != RewriterConfig::OFF ||
          cfg.constant_folding() != RewriterConfig::OFF ||
-         cfg.dependency_optimization() != RewriterConfig::OFF ||
-         cfg.loop_optimization() == RewriterConfig::ON ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
+         cfg.loop_optimization() != RewriterConfig::OFF ||
+         cfg.dependency_optimization() != RewriterConfig::OFF ||
          cfg.auto_parallel().enable() ||
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
-         !cfg.optimizers().empty();
+         cfg.debug_stripper() == RewriterConfig::ON ||
+         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 382cfe51d42439691fcedd5b765c9ef13e055ae5..e736dd174ed96cd615bf7eb1e477a461c2568de3 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer {
  public:
   MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
       : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override {}
+  ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
 
@@ -43,10 +43,36 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
+  std::unique_ptr<GraphOptimizer> MakeNewOptimizer(
+      const string& optimizer) const;
+
+  // Initialize active optimizers from RewriterConfig toggles.
+  Status InitializeOptimizers(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+  // Initialize active optimizers from RewriterConfig optimizer names.
+  Status InitializeOptimizersByName(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+
+  // Run optimization pass over a single GrapplerItem. Meta optimizer might run
+  // multiple such passes: 1) for the main graph 2) for the function library
+  Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                       GraphDef* optimized_graph);
+
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
-  std::vector<std::pair<string, string>> result_;
+
+  struct OptimizerResult {
+    string optimizer_name;
+    string result;
+  };
+
+  struct GraphOptimizationResult {
+    explicit GraphOptimizationResult(const string& id) : id(id) {}
+    string id;
+    std::vector<OptimizerResult> results;
+  };
+
+  std::vector<GraphOptimizationResult> optimization_results_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
@@ -64,4 +90,4 @@ Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..887a988af9afed58c903f312a8c83f769474a634
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -0,0 +1,262 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+constexpr char kDevice[] = "/device:CPU:0";
+
+class TestOptimizer : public CustomGraphOptimizer {
+ public:
+  static void SetOptimized(const bool flag_value) { optimized_ = flag_value; }
+  static bool IsOptimized() { return optimized_; }
+
+  TestOptimizer() {}
+  string name() const override { return "test_optimizer"; }
+
+  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
+                  nullptr) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override {
+    optimized_ = true;
+    *optimized_graph = item.graph;
+    return Status::OK();
+  }
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override {}
+
+ private:
+  static bool optimized_;
+};
+
+bool TestOptimizer::optimized_;
+
+REGISTER_GRAPH_OPTIMIZER(TestOptimizer);
+
+class MetaOptimizerTest : public GrapplerTest {};
+
+TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  TestOptimizer::SetOptimized(false);
+  RewriterConfig rewriter_config;
+  rewriter_config.add_optimizers("TestOptimizer");
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_TRUE(TestOptimizer::IsOptimized());
+}
+
+TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
+  using test::function::NDef;
+
+  // Enable ony function optimization.
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.set_function_optimization(RewriterConfig::ON);
+  rewriter_config.add_optimizers("function");
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+
+  // Define function library:
+  //
+  //   MyMul(x, y)    = x * y
+  //  *MySquare(x)    = MyMul(x, x)
+  //  *MyQuadratic(x) = MySquare(MySquare(x))
+  //
+  //  * - marked as noinline
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "my_mul:z:0"}});
+  (*square_func.mutable_attr())["_noinline"].set_b(true);
+
+  FunctionDef quadratic_func = FunctionDefHelper::Create(
+      "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
+       {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "quadratic:z:0"}});
+  (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
+
+  // Tensorflow graph:
+  //
+  //   a = tf.Placeholder(tf.float);
+  //   b = tf.Placeholder(tf.int32);
+  //
+  //   square = MySquare(a);        // a^2
+  //   quadratic = MyQuadratic(b);  // b^4
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       // Calls into function library
+       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("quadratic", "MyQuadratic", {"b"}, {{"T", DT_INT32}}, kDevice),
+       // Forward outputs
+       NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func, quadratic_func});
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized and optimized functions should be added to the graph.
+  EXPECT_EQ(6, optimized_flib.num_functions());
+
+  // MyQuadratic should be specialized once:
+  //   0. 'quadratic' node in the main graph
+  const string optimized_0 = "MyQuadratic_specialized_for_quadratic";
+
+  // MySquare should be specialized and optimized for 3 instantiations:
+  //   1. 'square' node in the main graph
+  //   2. 'square' node in the MyQuadratic specialization
+  //   3. 'quadratic' node in the MyQuadratic specialization
+
+  const string optimized_1 = "MySquare_specialized_for_square";
+  const string optimized_2 = "MySquare_specialized_for_square_1";
+  const string optimized_3 = "MySquare_specialized_for_quadratic";
+
+  const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
+  const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
+  const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
+  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
+
+  ASSERT_NE(optimized_func_0, nullptr);
+  ASSERT_NE(optimized_func_1, nullptr);
+  ASSERT_NE(optimized_func_2, nullptr);
+  ASSERT_NE(optimized_func_3, nullptr);
+
+  // Graph should call optimized function.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square" && count++) {
+      EXPECT_EQ("MySquare_specialized_for_square", node.op());
+    } else if (node.name() == "quadratic" && count++) {
+      EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  // Specialized MySquare should call specialized functions.
+  count = 0;
+  for (const NodeDef& node : optimized_func_0->node_def()) {
+    if (node.name() == "square" && count++) {
+      EXPECT_EQ(optimized_2, node.op());
+    } else if (node.name() == "quadratic" && count++) {
+      EXPECT_EQ(optimized_3, node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  const std::vector<const FunctionDef*> optimized_funcs = {
+      optimized_func_1, optimized_func_1, optimized_func_3};
+
+  // MyMul should be inlined into all optimized versions of MySquare.
+  for (const FunctionDef* optimized_func : optimized_funcs) {
+    count = 0;
+    for (const NodeDef& node : optimized_func->node_def()) {
+      if (node.name() == "my_mul/inlined_inputs" && count++) {
+        EXPECT_EQ("IdentityN", node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("x:0", node.input(0));
+        EXPECT_EQ("x:0", node.input(1));
+      } else if (node.name() == "my_mul/x" && count++) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0));
+      } else if (node.name() == "my_mul/y" && count++) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0));
+      } else if (node.name() == "my_mul/mul" && count++) {
+        EXPECT_EQ("Mul", node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("my_mul/x:output:0", node.input(0));
+        EXPECT_EQ("my_mul/y:output:0", node.input(1));
+      } else if (node.name() == "my_mul" && count++) {
+        EXPECT_EQ("IdentityN", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/mul:z:0", node.input(0));
+      }
+      EXPECT_TRUE(node.device().empty());
+    }
+    EXPECT_EQ(5, count);
+  }
+
+  item.fetch = {"out_s", "out_q"};
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<int>(4));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index f52a2ab86288adacefec6796ceed4cea73d9b632..3311e970108d94d34a92842d51aca8f0c99d904c 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -50,7 +50,7 @@ bool IsTrivialOp(const NodeDef& node, const GraphRewriter& rewriter) {
 
 Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* pruned_graph) {
-  const std::unordered_set<string>& nodes_to_preserve = item.NodesToPreserve();
+  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
 
   // Prune all the nodes that won't be executed, ie all the nodes that aren't in
   // the fanin of a fetch node. If fetch nodes aren't specified, we'll assume
@@ -59,6 +59,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
   if (!nodes_to_preserve.empty()) {
     std::vector<string> terminal_nodes(nodes_to_preserve.begin(),
                                        nodes_to_preserve.end());
+    std::sort(terminal_nodes.begin(), terminal_nodes.end());
     bool ill_formed = false;
     std::vector<const NodeDef*> keep =
         ComputeTransitiveFanin(item.graph, terminal_nodes, &ill_formed);
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.h b/tensorflow/core/grappler/optimizers/model_pruner.h
index 3d76aebef433f126c660a2861c3ee4a1d18b4c6f..76cc792a45404c1822c66add13efd0beb5e0d7ba 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.h
+++ b/tensorflow/core/grappler/optimizers/model_pruner.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_
-#define TENSORFLOW_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 
@@ -41,4 +41,4 @@ class ModelPruner : public GraphOptimizer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MODEL_PRUNER_H_
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 8480a74572883a4657e11606b4cb8dcd5532ea3a..cf5b990377f7c6a1b7206260bf8a11bc7788e30a 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -16,9 +16,12 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -26,7 +29,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ModelPrunerTest : public ::testing::Test {};
+class ModelPrunerTest : public GrapplerTest {};
 
 TEST_F(ModelPrunerTest, NoPruning) {
   // This trivial graph is so basic there's nothing to prune.
@@ -86,6 +89,13 @@ TEST_F(ModelPrunerTest, StopGradientPruning) {
   EXPECT_EQ(NodeName(b.name()), new_e.input(0));
   EXPECT_EQ(1, new_d.input_size());
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, IdentityPruning) {
@@ -124,6 +134,13 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
   EXPECT_EQ(1, new_c.input_size());
   EXPECT_EQ(NodeName(b.name()), new_c.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, NoOpPruning) {
@@ -162,6 +179,13 @@ TEST_F(ModelPrunerTest, NoOpPruning) {
       EXPECT_EQ("a", new_node.input(0));
     }
   }
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, PreserveIdentities) {
@@ -192,6 +216,19 @@ TEST_F(ModelPrunerTest, PreserveIdentities) {
 
   TF_EXPECT_OK(status);
   EXPECT_EQ(item.graph.node_size(), output.node_size());
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+  v_ctrl_t.flat<bool>()(0) = true;
+  auto expected_tensors = EvaluateNodes(
+      item.graph, {"merge", "id2"}, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  auto actual_tensors = EvaluateNodes(output, {"merge", "id2"},
+                                      {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, expected_tensors.size());
+  EXPECT_EQ(2, actual_tensors.size());
+  for (int i = 0; i < expected_tensors.size(); i++) {
+    test::ExpectTensorEqual<float>(expected_tensors[i], actual_tensors[i]);
+  }
 }
 
 TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
@@ -232,6 +269,14 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
   EXPECT_EQ("b", new_c.input(0));
   EXPECT_EQ("b", new_d.input(0));
   EXPECT_EQ("b", new_e.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto a_t = GenerateRandomTensor<DT_INT64>(TensorShape({}));
+  auto expected_tensors = EvaluateNodes(item.graph, fetch, {{"a", a_t}});
+  auto actual_tensors = EvaluateNodes(output, fetch, {{"a", a_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<int64>(expected_tensors[0], actual_tensors[0]);
 }
 
 // TODO(rmlarsen): Reenable this test when the issues with
@@ -307,6 +352,12 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   EXPECT_EQ(NodeName(b.name()), new_b.name());
   const NodeDef& new_c = output.node(2);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
+
+  auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+  auto actual_tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
@@ -339,6 +390,16 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
       EXPECT_EQ("c", node.input(0));
     }
   }
+  if (GetNumAvailableGPUs() > 0) {
+    auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+    auto actual_tensors = EvaluateNodes(output, item.fetch);
+    EXPECT_EQ(4, expected_tensors.size());
+    EXPECT_EQ(4, actual_tensors.size());
+    for (int i = 0; i < expected_tensors.size(); i++) {
+      test::ExpectTensorNear<float>(expected_tensors[i], actual_tensors[i],
+                                    1e-6);
+    }
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
index 450e85340796fdde9afdfebbd0eb9a724cb9440a..5206e9957dc75c13a03dfcb060b8b3b3dc732ad8 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -40,6 +40,12 @@ static Costs::NanoSeconds PredictExecutionTime(
     op_context.op_info.add_inputs()->Swap(&input);
   }
 
+  std::vector<OpInfo::TensorProperties> outputs =
+      properties.GetOutputProperties(node.name());
+  for (auto& output : outputs) {
+    op_context.op_info.add_outputs()->Swap(&output);
+  }
+
   DeviceProperties device = placer.get_device(node);
   op_context.op_info.mutable_device()->Swap(&device);
 
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index 08580d92842377c2dd999950b2e01bef01e2fee6..d632e460e7ccfc092945805a9a0b6b4b4c2215d1 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -64,17 +64,17 @@ TEST_F(StaticScheduleTest, BasicGraph) {
     if (time.first->name() == "Const/Const") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(1500001), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(1500004), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(4000004), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(2750007), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(6500007), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(4000010), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(9000010), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(5250013), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(11500013), time.second);
     } else if (time.first->name() == "y") {
-      EXPECT_EQ(Costs::NanoSeconds(6500013), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(14000013), time.second);
     }
   }
 }
@@ -110,13 +110,13 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
     if (time.first->name() == "a") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "b") {
-      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000001), time.second);
     } else if (time.first->name() == "c") {
-      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000002), time.second);
     } else if (time.first->name() == "d") {
-      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
-    } else if (time.first->name() == "e") {
       EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
+    } else if (time.first->name() == "e") {
+      EXPECT_EQ(Costs::NanoSeconds(50000003), time.second);
     }
   }
 }
@@ -142,17 +142,17 @@ TEST_F(StaticScheduleTest, RequiredTimes) {
 
   for (auto time : required_times) {
     if (time.first->name() == "Const/Const") {
-      EXPECT_EQ(Costs::NanoSeconds(-6500012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-14000012), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(-6250012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-12500012), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(-5000009), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-10000009), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(-3750006), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-7500006), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(-2500003), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-5000003), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(-1250000), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-2500000), time.second);
     } else if (time.first->name() == "y") {
       EXPECT_EQ(Costs::NanoSeconds(0), time.second);
     }
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.cc b/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfca2dc0d38480240c9b158ecbb3cc718bfa1ad2
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+BCast::Vec ShapeDims(const TensorShapeProto& shape) {
+  BCast::Vec dims;
+  dims.reserve(shape.dim_size());
+  for (int i = 0; i < shape.dim_size(); ++i)
+    dims.push_back(shape.dim(i).size());
+  return dims;
+}
+
+}  // namespace
+
+bool IsKnown(const TensorShapeProto::Dim& dim) { return dim.size() >= 0; }
+
+bool IsKnownSymbolically(const TensorShapeProto::Dim& dim) {
+  return dim.size() <= -2;
+}
+
+bool IsUnknown(const TensorShapeProto::Dim& dim) { return dim.size() == -1; }
+
+bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape) {
+  return !shape.unknown_rank() &&
+         std::all_of(
+             shape.dim().begin(), shape.dim().end(),
+             [](const TensorShapeProto::Dim& dim) { return !IsUnknown(dim); });
+}
+
+bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties) {
+  return ShapeIsSymbolicallyDefined(properties.shape());
+}
+
+bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
+                             const TensorShapeProto& right) {
+  if (left.unknown_rank() || right.unknown_rank() ||
+      left.dim_size() != right.dim_size()) {
+    return false;
+  }
+  for (int i = 0; i < left.dim_size(); ++i) {
+    const auto& ldim = left.dim(i);
+    const auto& rdim = right.dim(i);
+    if (IsUnknown(ldim) || IsUnknown(rdim) || ldim.size() != rdim.size()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
+                             const OpInfo::TensorProperties& right) {
+  return ShapesSymbolicallyEqual(left.shape(), right.shape());
+}
+
+bool ShapesBroadcastable(const TensorShapeProto& left,
+                         const TensorShapeProto& right) {
+  if (!ShapeIsSymbolicallyDefined(left) || !ShapeIsSymbolicallyDefined(right)) {
+    return false;
+  }
+  BCast bcast(ShapeDims(left), ShapeDims(right),
+              /*fewer_dims_optimization*/ false);
+  return bcast.IsValid();
+}
+
+bool ShapesBroadcastable(const OpInfo::TensorProperties& left,
+                         const OpInfo::TensorProperties& right) {
+  return ShapesBroadcastable(left.shape(), right.shape());
+}
+
+bool CompareSymbolicallyShapedTensorSizes(const TensorShapeProto& left,
+                                          const TensorShapeProto& right) {
+  // if one of the ranks is unknown, it's impossible to compare tensor sizes
+  if (left.unknown_rank() || right.unknown_rank()) {
+    return false;
+  }
+
+  // Tensor size, computed as a product of defined dimensions
+  int64 left_defined_size = 1;
+  int64 right_defined_size = 1;
+
+  // Keep how many times each unknown dimension appeared on the left and right
+  std::unordered_map<int64, int64> left_unknown_dims;
+  std::unordered_map<int64, int64> right_unknown_dims;
+
+  // Assign unique id to every unknown dimension (-1). We are going to
+  // assign positive ids, because negative values are already used by
+  // symbolic dimensions.
+  int64 unknown_dim_id = 1;
+
+  // For each shape dimension update "defined tensor size", if shape is defined,
+  // or increment a counter for unknown dim.
+  auto process_dimensions =
+      [&unknown_dim_id](const TensorShapeProto& shape, int64* defined_size,
+                        std::unordered_map<int64, int64>* unknown_dims) {
+        for (int i = 0; i < shape.dim_size(); ++i) {
+          const auto& dim = shape.dim(i);
+          int64 dim_size = dim.size();
+          if (dim_size > 0) {
+            *defined_size *= dim_size;
+          } else if (IsUnknown(dim)) {
+            ++(*unknown_dims)[unknown_dim_id++];
+          } else if (IsKnownSymbolically(dim)) {
+            ++(*unknown_dims)[dim_size];
+          }
+        }
+      };
+
+  process_dimensions(left, &left_defined_size, &left_unknown_dims);
+  process_dimensions(right, &right_defined_size, &right_unknown_dims);
+
+  // Compute a union of unknown dimension ids appeared in both shapes
+  std::set<int64> unknown_dims;
+  for (const auto& el : left_unknown_dims) unknown_dims.insert(el.first);
+  for (const auto& el : right_unknown_dims) unknown_dims.insert(el.first);
+
+  // Cancel unknown dimensions that appeared in both shapes
+  for (int64 unknown_dim : unknown_dims) {
+    int64 co_occurrence = std::min(left_unknown_dims[unknown_dim],
+                                   right_unknown_dims[unknown_dim]);
+    left_unknown_dims[unknown_dim] -= co_occurrence;
+    right_unknown_dims[unknown_dim] -= co_occurrence;
+  }
+
+  // Count unbalanced unknown dimensions
+  int64 left_unbalanced_unknown_dims = 0;
+  int64 right_unbalanced_unknown_dims = 0;
+  for (const auto& el : left_unknown_dims)
+    left_unbalanced_unknown_dims += el.second;
+  for (const auto& el : right_unknown_dims)
+    right_unbalanced_unknown_dims += el.second;
+
+  if (left_unbalanced_unknown_dims == 0 && right_unbalanced_unknown_dims == 0) {
+    // If unknown dimensions cancelled each other, compare tensor sizes
+    // represented by defined dimensions
+    return left_defined_size < right_defined_size;
+  }
+
+  if (left_defined_size <= right_defined_size &&
+      left_unbalanced_unknown_dims == 0 && right_unbalanced_unknown_dims > 0) {
+    // If size of a 'left" tensor computed from defined dimensions less or
+    // equal, and shape on the right has unbalanced unknown dimensions, we can
+    // guarantee that shape on the left is strictly smaller (assuming that
+    // unknown dimension size is larger than 1)
+    return true;
+  }
+
+  // In every other case, assuming that unknown dimensions can be arbitrary
+  // large in size, we can't guarantee any ordering
+  return false;
+}
+
+bool CompareSymbolicallyShapedTensorSizes(
+    const OpInfo::TensorProperties& left,
+    const OpInfo::TensorProperties& right) {
+  return CompareSymbolicallyShapedTensorSizes(left.shape(), right.shape());
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.h b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb79bab3141579132ea2e2d2afc5733f0013a0d5
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SYMBOLIC_SHAPES_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SYMBOLIC_SHAPES_H_
+
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+bool IsKnown(const TensorShapeProto::Dim& dim);
+bool IsKnownSymbolically(const TensorShapeProto::Dim& dim);
+bool IsUnknown(const TensorShapeProto::Dim& dim);
+
+// Shape is symbolically defined, if it has a known rank, and each dimension is
+// known (dim_size >= 0), or is a symbolic dimension size (dim_size <= -2).
+bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape);
+bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties);
+
+// Shapes are symbolically equal, if they have the same rank, they are known or
+// symbolically defined, and have matching dimensions.
+bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
+                             const TensorShapeProto& right);
+bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
+                             const OpInfo::TensorProperties& right);
+
+// Check if two shapes can be broadcasted to each other. Both shapes must be at
+// least symbolically defined, and the have valid BCast instance.
+bool ShapesBroadcastable(const TensorShapeProto& left,
+                         const TensorShapeProto& right);
+bool ShapesBroadcastable(const OpInfo::TensorProperties& left,
+                         const OpInfo::TensorProperties& right);
+
+// Return true if can prove, that tensor of size 'left' is smaller than tensor
+// of size 'right'. Return false if it's larger or equal, or it's impossible to
+// compare because of unknown dimensions, or mismatch in symbolic dimensions.
+bool CompareSymbolicallyShapedTensorSizes(const TensorShapeProto& left,
+                                          const TensorShapeProto& right);
+bool CompareSymbolicallyShapedTensorSizes(
+    const OpInfo::TensorProperties& left,
+    const OpInfo::TensorProperties& right);
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SYMBOLIC_SHAPES_H_
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc b/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ef9f6592571062564d16e5fb282b1dd85d074ef
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class SymbolicShapesTest : public ::testing::Test {
+ protected:
+  TensorShapeProto MakeUnknown() {
+    TensorShapeProto shape;
+    shape.set_unknown_rank(true);
+    return shape;
+  }
+
+  TensorShapeProto MakeShape(std::vector<int> dims) {
+    TensorShapeProto shape;
+    for (int dim_size : dims) {
+      TensorShapeProto::Dim dim;
+      dim.set_size(dim_size);
+      *shape.add_dim() = dim;
+    }
+    return shape;
+  }
+};
+
+bool operator<(const TensorShapeProto& lhs, const TensorShapeProto& rhs) {
+  return CompareSymbolicallyShapedTensorSizes(lhs, rhs);
+}
+
+TEST_F(SymbolicShapesTest, ShapeIsSymbolicallyDefined) {
+  EXPECT_FALSE(ShapeIsSymbolicallyDefined(MakeUnknown()));
+  EXPECT_FALSE(ShapeIsSymbolicallyDefined(MakeShape({-1, 2})));
+
+  EXPECT_TRUE(ShapeIsSymbolicallyDefined(MakeShape({1, 2})));
+  EXPECT_TRUE(ShapeIsSymbolicallyDefined(MakeShape({-2, 2})));
+}
+
+TEST_F(SymbolicShapesTest, ShapesSymbolicallyEqual) {
+  EXPECT_FALSE(ShapesSymbolicallyEqual(MakeUnknown(), MakeUnknown()));
+  EXPECT_FALSE(ShapesSymbolicallyEqual(MakeShape({-1, 2}), MakeShape({-1, 2})));
+  EXPECT_FALSE(ShapesSymbolicallyEqual(MakeShape({-2, 2}), MakeShape({-3, 2})));
+
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({1, 2}), MakeShape({1, 2})));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, 2}), MakeShape({-2, 2})));
+}
+
+TEST_F(SymbolicShapesTest, ShapesBroadcastable) {
+  EXPECT_FALSE(ShapesBroadcastable(MakeUnknown(), MakeUnknown()));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-2}), MakeShape({1, -3})));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-1, 2}), MakeShape({-1, 2})));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-2, 2}), MakeShape({-3, 2})));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-2, 4}), MakeShape({-2, 8})));
+
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({1, 2}), MakeShape({1, 2})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 2}), MakeShape({-2, 2})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 32}), MakeShape({-2, 1})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 1}), MakeShape({1, -2})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 1}), MakeShape({1, -3})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-3}), MakeShape({-2, -3})));
+}
+
+TEST_F(SymbolicShapesTest, CompareSymbolicallyShapedTensorSizes) {
+  EXPECT_TRUE(MakeShape({1, 1, 32}) < MakeShape({32, 32}));
+  EXPECT_TRUE(MakeShape({1, 32, 32}) < MakeShape({2048}));
+  EXPECT_TRUE(MakeShape({1, -2, 32}) < MakeShape({-2, 32, 32}));
+  EXPECT_TRUE(MakeShape({1, 32, 32}) < MakeShape({-2, 32, 32}));
+  EXPECT_TRUE(MakeShape({1, 32, 32}) < MakeShape({-1, 32, 32}));
+  EXPECT_TRUE(MakeShape({1, -2, 32}) < MakeShape({-2, -2, 32}));
+
+  EXPECT_FALSE(MakeShape({1, -2, 32}) < MakeShape({-3, 32, 32}));
+  EXPECT_FALSE(MakeShape({1, -1, 32}) < MakeShape({1, -1, 32}));
+  EXPECT_FALSE(MakeShape({1, -1, 32}) < MakeShape({-1, -1, 32}));
+  EXPECT_FALSE(MakeShape({-1, -1, 32}) < MakeShape({1, -1, 32}));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index eb5a2c48dc8b12f7b4090e80c403e238a526e122..7398d2c896dc01ca35a949be3942509b9b35608d 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -29,6 +29,28 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+template <typename T>
+bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
+  using RealType = typename Eigen::NumTraits<T>::Real;
+  if (value > static_cast<double>(std::numeric_limits<RealType>::max()) ||
+      value < static_cast<double>(std::numeric_limits<RealType>::min())) {
+    return false;
+  }
+  tensor->flat<T>()(0) = static_cast<T>(value);
+  return true;
+}
+
+// Is 'node' an operator that consumes only the shape of its input, not the
+// data itself?
+// TODO(ezhulenev): move to op_types.h. Requires to break circular dependency.
+// TODO(ezhulenev): what about Identity passing tensor to Shape consumer?
+bool IsShapeConsumer(const NodeDef& node) {
+  const string& op = node.op();
+  return op == "Shape" || op == "ShapeN" || op == "Rank" || op == "Size";
+}
+
+}  // namespace
 
 NodeMap::NodeMap(GraphDef* graph) {
   CHECK(graph != nullptr);
@@ -120,38 +142,12 @@ bool IsSameInput(const string& name1, const string& name2) {
     return true;
   }
   int position1;
-  string node1 = ParseNodeName(name1, &position1);
+  StringPiece node1 = ParseNodeNameAsStringPiece(name1, &position1);
   int position2;
-  string node2 = ParseNodeName(name2, &position2);
+  StringPiece node2 = ParseNodeNameAsStringPiece(name2, &position2);
   return (position1 == position2) && (node1 == node2);
 }
 
-string ParseNodeName(const string& name, int* position) {
-  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
-  // to get a node name.
-  strings::Scanner scan(name);
-  scan.ZeroOrOneLiteral("^")
-      .RestartCapture()
-      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
-      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
-  StringPiece capture;
-  StringPiece remaining;
-  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
-    *position = 0;
-    return "";
-  } else {
-    if (name[0] == '^') {
-      *position = -1;
-    } else if (remaining.empty()) {
-      *position = 0;
-    } else {
-      // Skip the first ':' character.
-      CHECK(strings::safe_strto32(remaining.substr(1), position));
-    }
-    return capture.ToString();
-  }
-}
-
 bool IsControlInput(const string& name) {
   return !name.empty() && name[0] == '^';
 }
@@ -163,7 +159,7 @@ string NodeName(const string& name) {
 
 int NodePosition(const string& name) {
   int position;
-  ParseNodeName(name, &position);
+  ParseNodeNameAsStringPiece(name, &position);
   return position;
 }
 
@@ -233,6 +229,14 @@ int NumOutputs(const NodeDef& node, GraphDef* graph) {
   return num_outputs;
 }
 
+bool HasControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  if (num_inputs > 0 && IsControlInput(node.input(num_inputs - 1))) {
+    return true;
+  }
+  return false;
+}
+
 int NumNonControlInputs(const NodeDef& node) {
   int num_inputs = node.input_size();
   for (const string& input : node.input()) {
@@ -245,19 +249,42 @@ int NumNonControlInputs(const NodeDef& node) {
 
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
   int num_outputs = 0;
+  int pos;
   for (const NodeDef* output : node_map.GetOutputs(node.name())) {
     for (const string& node_as_input : output->input()) {
       if (IsControlInput(node_as_input)) {
         break;
       }
-      if (NodeName(node_as_input) == node.name()) {
+      if (node_as_input == node.name()) {
         ++num_outputs;
+      } else {
+        const StringPiece name =
+            ParseNodeNameAsStringPiece(node_as_input, &pos);
+        if (name == node.name()) {
+          ++num_outputs;
+        }
       }
     }
   }
   return num_outputs;
 }
 
+int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map) {
+  int num_data_outputs = 0;
+  for (const NodeDef* output : node_map.GetOutputs(node.name())) {
+    if (IsShapeConsumer(*output)) continue;
+
+    for (int i = 0; i < output->input_size(); ++i) {
+      const string& input = output->input(i);
+      if (!IsControlInput(input) && NodeName(input) == node.name()) {
+        ++num_data_outputs;
+        break;
+      }
+    }
+  }
+  return num_data_outputs;
+}
+
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
 DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
@@ -336,6 +363,7 @@ inline void STLSortAndRemoveDuplicates(T* v) {
 
 Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
                                    bool dedup_outputs) {
+  graph_ = &graph;
   const int num_nodes = graph.node_size();
   inputs_.clear();
   inputs_.resize(num_nodes);
@@ -382,6 +410,32 @@ Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
   return Status::OK();
 }
 
+void SimpleGraphView::DepthFirstSearch(
+    const std::unordered_set<string>& op_types_to_traverse, int root_node,
+    std::set<int>* nodes_found) const {
+  nodes_found->clear();
+  const string& op_type = graph_->node(root_node).op();
+  if (op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
+    return;
+  }
+  std::vector<int> stack;
+  stack.reserve(32);
+  stack.push_back(root_node);
+  while (!stack.empty()) {
+    const int node_idx = stack.back();
+    stack.pop_back();
+    nodes_found->insert(node_idx);
+    const string& op_type = graph_->node(node_idx).op();
+    if (op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
+      for (auto output_idx : this->outputs(node_idx)) {
+        if (nodes_found->find(output_idx) == nodes_found->end()) {
+          stack.push_back(output_idx);
+        }
+      }
+    }
+  }
+}
+
 string SimpleGraphView::PrintToString() const {
   string str;
   for (int i = 0; i < num_nodes(); ++i) {
@@ -402,5 +456,43 @@ string SimpleGraphView::PrintToString() const {
   return str;
 }
 
+#define HANDLE_CASE(DTYPE)                                          \
+  case DTYPE:                                                       \
+    if (!SafeSetScalarTensorValue<EnumToDataType<DTYPE>::Type>(     \
+            static_cast<double>(value), tensor)) {                  \
+      return errors::InvalidArgument("Cannot store value ", value,  \
+                                     " in tensor of type " #DTYPE); \
+    }                                                               \
+    break
+
+Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
+  // TODO(rmlarsen): Support more general shapes.
+  if (tensor->NumElements() != 1) {
+    return errors::InvalidArgument(
+        "Expected scalar tensor, got num_elements = ", tensor->NumElements());
+  }
+  switch (dtype) {
+    HANDLE_CASE(DT_HALF);
+    HANDLE_CASE(DT_BFLOAT16);
+    HANDLE_CASE(DT_BOOL);
+    HANDLE_CASE(DT_FLOAT);
+    HANDLE_CASE(DT_DOUBLE);
+    HANDLE_CASE(DT_UINT8);
+    HANDLE_CASE(DT_INT8);
+    HANDLE_CASE(DT_UINT16);
+    HANDLE_CASE(DT_INT16);
+    HANDLE_CASE(DT_INT32);
+    HANDLE_CASE(DT_INT64);
+    HANDLE_CASE(DT_COMPLEX64);
+    HANDLE_CASE(DT_COMPLEX128);
+    default:
+      return errors::InvalidArgument("Unsupported type ",
+                                     DataTypeString(dtype));
+  }
+  return Status::OK();
+}
+
+#undef HANDLE_CASE
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 4ecb28f681507f50ad5909f15cf1b408ed6e2979..54cb26bafa9c4a30a1c2f5695d00f9d5858dafb0 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_UTILS_H_
-#define TENSORFLOW_GRAPPLER_UTILS_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_H_
 
 #include <functional>
 #include <unordered_map>
@@ -23,10 +23,13 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -106,8 +109,38 @@ string NodeName(const string& name);
 // Get the trailing position number ":{digits}" (if any) of a node name.
 int NodePosition(const string& name);
 
+inline StringPiece ParseNodeNameAsStringPiece(const string& name,
+                                              int* position) {
+  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
+  // to get a node name.
+  strings::Scanner scan(name);
+  scan.ZeroOrOneLiteral("^")
+      .RestartCapture()
+      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
+      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
+  StringPiece capture;
+  StringPiece remaining;
+  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
+    *position = 0;
+    static const string empty;
+    return StringPiece(empty);
+  } else {
+    if (name[0] == '^') {
+      *position = -1;
+    } else if (remaining.empty()) {
+      *position = 0;
+    } else {
+      // Skip the first ':' character.
+      CHECK(strings::safe_strto32(remaining.substr(1), position));
+    }
+    return capture;
+  }
+}
+
 // Returns the node name and position in a single call.
-string ParseNodeName(const string& name, int* position);
+inline string ParseNodeName(const string& name, int* position) {
+  return ParseNodeNameAsStringPiece(name, position).ToString();
+}
 
 // Add a prefix to a node name with a custom delimiter.
 string AddPrefixToNodeName(const string& name, const string& prefix,
@@ -137,12 +170,19 @@ string AsControlDependency(const string& node);
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
 
+// Returns true iff the node has at least one control input.
+bool HasControlInputs(const NodeDef& node);
+
 // Number of connected non-control inputs.
 int NumNonControlInputs(const NodeDef& node);
 
 // Number of connected non-control outputs.
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map);
 
+// Number of connected non-control data outputs (Ops that consume output tensor
+// data, not just it's shape).
+int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map);
+
 // Removes redundant control inputs from node.
 void DedupControlInputs(NodeDef* node);
 
@@ -167,6 +207,8 @@ NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
 void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
                          bool invert_permutation);
 
+Status SetTensorValue(DataType dtype, int value, Tensor* tensor);
+
 class SimpleGraphView {
  public:
   Status Initialize(const GraphDef& graph) {
@@ -175,6 +217,7 @@ class SimpleGraphView {
   Status Initialize(const GraphDef& graph, bool dedup_inputs,
                     bool dedup_outputs);
 
+  const GraphDef* graph() const { return graph_; }
   inline int num_nodes() const { return index_to_name_.size(); }
   inline const int index(const string& node_name) const {
     const auto& it = name_to_index_.find(node_name);
@@ -191,9 +234,17 @@ class SimpleGraphView {
     return outputs_[node_idx];
   }
 
+  // Traverse the graph starting at `node_idx`, collecting indices of nodes
+  // visited in nodes_found. If a node has an op in `op_types_to_traverse`, the
+  // walk continues to its children. It is assumed that *graph_ was not modified
+  // after the call to Initialize().
+  void DepthFirstSearch(const std::unordered_set<string>& op_types_to_traverse,
+                        int node_idx, std::set<int>* nodes_found) const;
+
   string PrintToString() const;
 
  private:
+  const GraphDef* graph_;  // Not owned.
   std::vector<string> index_to_name_;
   std::unordered_map<string, int> name_to_index_;
   std::vector<gtl::InlinedVector<int, 4>> inputs_;
@@ -203,4 +254,4 @@ class SimpleGraphView {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_UTILS_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_H_
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 0a9dbe22cfe3cd01c2c61661adcdd4839a957f03..44ef4a965b59a5198ca1ea38f94e5adf09821606 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "scc",
     srcs = ["scc.cc"],
@@ -140,8 +128,87 @@ cc_library(
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "grappler_test_test",
+    size = "small",
+    srcs = ["grappler_test_test.cc"],
+    deps = [
+        ":grappler_test",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+cc_library(
+    name = "functions",
+    srcs = [
+        "functions.cc",
+    ],
+    hdrs = ["functions.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
+
+tf_cc_test(
+    name = "functions_test",
+    srcs = ["functions_test.cc"],
+    deps = [
+        ":functions",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "colocation",
+    srcs = ["colocation.cc"],
+    hdrs = ["colocation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "colocation_test",
+    size = "small",
+    srcs = ["colocation_test.cc"],
+    deps = [
+        ":colocation",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/colocation.cc b/tensorflow/core/grappler/utils/colocation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0573e0a8309d7525733013d27befce53a0ecc44b
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include <cstring>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+// Find root node of the colocation group.
+// The map is mapping from one node name to its parent. node_name is the
+// starting node to search. By iteratively following the path from child to
+// parent, we can find the root node for the colocation group that node_name
+// belongs to.
+string GetColocationGroupRoot(std::unordered_map<string, string>* map,
+                              const string& node_name) {
+  if (map->find(node_name) == map->end()) {
+    // If node_name is not in the map, we create a new root node which points
+    // to itself.
+    map->insert({node_name, node_name});
+    return node_name;
+  }
+  string cur = node_name;
+  while ((*map)[cur] != cur) {
+    // Backtracing the map until we reach the root node.
+    cur = (*map)[cur];
+  }
+  return cur;
+}
+
+// Merge two colocation groups into one.
+// left and right is the root node of two colocation groups respectively.
+void MergeColocationGroup(std::unordered_map<string, string>* map,
+                          const string& left, const string& right) {
+  // Do nothing if left or right node is not in the map.
+  if (map->find(left) == map->end() || map->find(right) == map->end()) {
+    return;
+  }
+  if (left != right) {
+    // Make the right node a child of the left node, which merges the two
+    // groups.
+    map->at(right) = left;
+  }
+}
+}  // namespace
+
+// Use of disjoint set algorithm to build the colocation groups from the input
+// graph. The core data structure in use is a hash map from one node to its
+// parent node. Whenever we see two nodes colocate with each other, we merge
+// their colocation groups together. After we traverse all colocation pairs
+// in the graph, we will have several disjoint sets. Then we pick the root node
+// of each disjoint set as the representative node, and let all other nodes in
+// the group colocate with the representative node.
+void ReassignColocation(GraphDef* graph) {
+  constexpr char kClassAttr[] = "_class";
+  constexpr char kColocPrefix[] = "loc:@";
+
+  // A hashmap that maps from a node name to its parent node name.
+  std::unordered_map<string, string> coloc_groups;
+  NodeMap node_map(graph);
+  for (const auto& node : graph->node()) {
+    auto iter = node.attr().find(kClassAttr);
+    if (iter != node.attr().end() && iter->second.has_list()) {
+      for (const auto& str : iter->second.list().s()) {
+        size_t pos = str.find(kColocPrefix);
+        if (pos == 0) {
+          // After we find a colocation, update the colocation groups.
+          string colocate_node = str.substr(pos + strlen(kColocPrefix));
+          MergeColocationGroup(
+              &coloc_groups, GetColocationGroupRoot(&coloc_groups, node.name()),
+              GetColocationGroupRoot(&coloc_groups, colocate_node));
+        }
+      }
+    }
+  }
+
+  // We use the root node of each colocation groups as its representative
+  // node. For each node in one group, colocate with the representative node
+  // if the node is in the graph.
+  for (const auto& pair : coloc_groups) {
+    if (pair.first != pair.second) {
+      // This is a child node.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {
+        // Colocate this node with the root node.
+        AttrValue new_value;
+        new_value.mutable_list()->add_s(
+            kColocPrefix + GetColocationGroupRoot(&coloc_groups, pair.first));
+        node->mutable_attr()->erase(kClassAttr);
+        node->mutable_attr()->insert({kClassAttr, new_value});
+      }
+    } else {
+      // This is a root node. Clear the _class attribute.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {  // root node should always exist in the graph as guaranteed
+                   // by order of merging. Just put check here to ensure safety.
+        node->mutable_attr()->erase(kClassAttr);
+      }
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/colocation.h b/tensorflow/core/grappler/utils/colocation.h
new file mode 100644
index 0000000000000000000000000000000000000000..6062db6102c50853145b15dae08994e971cca83d
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+
+#include <unordered_map>
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Evaluates the colocation relation in the graph and rewrites the new
+// colocation relation in the graph. We scan the graph nodes sequentially, and
+// builds a disjoint-sets of nodes (within each disjoint-set the nodes are
+// colocated with each other). We then select the root node of each set as a
+// representative node, and then colocate each node within the set (should also
+// exist in graph) with the representative node.
+// Note that there is current one situation this function can't handle:
+// Node A colocates with X, node B colocates with Y, X colocates with Y but
+// X, Y are removed from graph. In this case we can't know A colocates with B.
+void ReassignColocation(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
diff --git a/tensorflow/core/grappler/utils/colocation_test.cc b/tensorflow/core/grappler/utils/colocation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6638364240fe4a85a9bb066b812b087fe407db6e
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ColocationTest : public ::testing::Test {};
+
+bool VerifyNodeHasColocation(const NodeDef& ndef, const string& coloc) {
+  if (ndef.attr().empty()) {
+    return false;
+  }
+  if (ndef.attr().find("_class") == ndef.attr().end()) {
+    return false;
+  }
+  return ndef.attr().at("_class").list().s(0) == coloc;
+}
+
+TEST(ColocationTest, ReassignColocation_SingleNode) {
+  // Node A colocates with B, but node B is not in the graph.
+  //   A
+  //   |
+  //   |
+  //  [B]
+
+  NodeDef ndef;
+  const Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@B"}).Finalize(&ndef);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef({ndef});
+
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(1, gdef.node(0).attr_size());
+
+  ReassignColocation(&gdef);
+
+  // Validates that node A's colocation info is cleared.
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(0, gdef.node(0).attr_size());
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_SingleGroup) {
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node X is not in the graph.
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  // After re-assign of colocation, A, B, C, D should colocate with E.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  GraphDef gdef =
+      test::function::GDef({ndef_a, ndef_b, ndef_c, ndef_d, ndef_e});
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_MultiGroup) {
+  // Before re-assign:
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node U, V colocates with W. Node X, W are not in the graph:
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  //
+  //  U       V
+  //  |       |
+  //  |       |
+  //  +--[W]--+
+  //
+  // After re-assign:
+  // A, B, C, D should colocate with E. U should colocate with V.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+  //
+  // U
+  // |
+  // |
+  // V
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("U", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_u);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("V", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_v);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef(
+      {ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v});
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@W"));  // U
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(6), "loc:@W"));  // V
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@V"));  // U
+  EXPECT_EQ(0, gdef.node(6).attr_size());                        // V
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79b823fa2da1297c1b53efe78e544d855fbfa67b
--- /dev/null
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -0,0 +1,685 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/grappler/utils/functions.h"
+
+#include <unordered_map>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/scanner.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+Status RegisterFunctionBodyOutputs(const OpRegistrationData& registration,
+                                   const NodeDef& node,
+                                   GrapplerFunctionConnectivity* connectivity) {
+  tensorflow::NameRangeMap outputs_range_map;
+  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(
+      node, registration.op_def, nullptr, &outputs_range_map));
+  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
+  return Status::OK();
+}
+
+Status RegisterFunctionBodyOutputs(const FunctionLibraryDefinition& flib,
+                                   const NodeDef& node,
+                                   GrapplerFunctionConnectivity* connectivity) {
+  const OpRegistrationData* registration;
+  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
+  return RegisterFunctionBodyOutputs(*registration, node, connectivity);
+}
+
+// Replace the placeholder attribute values with the values specified in
+// instantiation attributes.
+Status ResolveFunctionBodyNodeAttrPlaceholders(
+    const AttrValueMap& func_instantiation_attr, NodeDef* node) {
+  for (auto& attr : *node->mutable_attr()) {
+    const string& placeholder = attr.second.placeholder();
+    if (placeholder.empty()) continue;
+
+    auto it = func_instantiation_attr.find(placeholder);
+    if (it != func_instantiation_attr.end()) {
+      attr.second = it->second;
+    } else {
+      return errors::InvalidArgument("Can't resolve placeholder: ",
+                                     placeholder);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
+    const InputArgExpansion& input_arg_expansion) {
+  const auto& input_name = input_arg_expansion.input_name;
+  const auto& placeholders = input_arg_expansion.placeholders;
+  input_arg_expansions_.emplace(input_name, input_arg_expansion);
+  for (int i = 0; i < placeholders.size(); ++i) {
+    const string& placeholder = input_arg_expansion.placeholders[i];
+    input_arg_placeholders_.emplace(
+        placeholder, InputArgPlaceholder{input_name, /*position=*/i});
+  }
+}
+
+void GrapplerFunctionConnectivity::RegisterFunctionBodyOutputs(
+    const string& node_name, const tensorflow::NameRangeMap& outputs) {
+  function_body_outputs_[node_name] = outputs;
+}
+
+Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
+    const string& func_def_input, std::vector<string>* graph_def_inputs) const {
+  using ::tensorflow::strings::Scanner;
+
+  if (IsControlInput(func_def_input)) {
+    graph_def_inputs->push_back(func_def_input);
+    return Status::OK();
+  }
+
+  // Parse input format: "node_name[:node_output][:position]"
+  string node_name;
+  string node_output;
+  int position = -1;
+
+  StringPiece capture;
+  StringPiece remaining;
+
+  // Parse "node_name"
+  if (Scanner(func_def_input)
+          .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
+          .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE)
+          .GetResult(&remaining, &capture)) {
+    node_name = string(capture.data(), capture.size());
+  }
+
+  // Parse "node_output" if it exists
+  if (Scanner(remaining)
+          .OneLiteral(":")
+          .RestartCapture()
+          .One(strings::Scanner::LOWERLETTER)
+          .Any(strings::Scanner::LETTER_DIGIT_UNDERSCORE)
+          .GetResult(&remaining, &capture)) {
+    node_output = string(capture.data(), capture.size());
+  }
+
+  // Parse "position" if it exists
+  if (Scanner(remaining)
+          .OneLiteral(":")
+          .RestartCapture()
+          .Many(strings::Scanner::DIGIT)
+          .GetResult(nullptr, &capture)) {
+    CHECK(strings::safe_strto32(capture, &position));
+  }
+
+  // If "node_output" is not empty, it must be an output of a function body node
+  bool is_function_body_output = !node_output.empty();
+
+  // Function input argument: "node_name[:position]"
+  if (!is_function_body_output) {
+    auto input_arg = input_arg_expansions_.find(node_name);
+    if (input_arg != input_arg_expansions_.end()) {
+      const InputArgExpansion& input_arg_expansion = input_arg->second;
+      const auto& placeholders = input_arg_expansion.placeholders;
+
+      if (position == -1) {
+        // If position is not defined use all placeholders
+        graph_def_inputs->reserve(placeholders.size());
+        for (const string& placeholder : placeholders) {
+          graph_def_inputs->push_back(placeholder);
+        }
+      } else {
+        if (position > input_arg_expansion.placeholders.size() - 1) {
+          return errors::InvalidArgument("Invalid input ", node_name,
+                                         "position: ", position,
+                                         " (out of range)");
+        }
+        graph_def_inputs->push_back(input_arg_expansion.placeholders[position]);
+      }
+
+      return Status::OK();
+    }
+  }
+
+  // Function body output: "node_name:node_output[:position]"
+  if (is_function_body_output) {
+    auto function_body_outputs = function_body_outputs_.find(node_name);
+    if (function_body_outputs != function_body_outputs_.end()) {
+      const tensorflow::NameRangeMap& outputs = function_body_outputs->second;
+      auto output = outputs.find(node_output);
+      if (output != outputs.end()) {
+        const auto& output_range = output->second;
+
+        if (position == -1) {
+          // If position is not defined expand node output range
+          for (int i = output_range.first; i < output_range.second; ++i) {
+            i == 0 ? graph_def_inputs->push_back(node_name)
+                   : graph_def_inputs->push_back(
+                         strings::StrCat(node_name, ":", i));
+          }
+        } else {
+          if (position > (output_range.second - output_range.first)) {
+            return errors::InvalidArgument(
+                "Invalid node ", node_name, " output ", node_output,
+                " position: ", position, " (out of range)");
+          }
+          int pos = output_range.first + position;
+          pos == 0 ? graph_def_inputs->push_back(node_name)
+                   : graph_def_inputs->push_back(
+                         strings::StrCat(node_name, ":", pos));
+        }
+
+        return Status::OK();
+      }
+    }
+  }
+
+  return errors::InvalidArgument("Failed to expand a function def input: ",
+                                 func_def_input);
+}
+
+Status GrapplerFunctionConnectivity::ExpandNodeInputs(
+    NodeDef* function_body_node) const {
+  std::vector<string> expanded_inputs;
+
+  for (const string& function_def_input : function_body_node->input()) {
+    TF_RETURN_IF_ERROR(
+        ExpandFunctionDefInput(function_def_input, &expanded_inputs));
+  }
+
+  function_body_node->clear_input();
+  for (const string& expanded_input : expanded_inputs)
+    function_body_node->add_input(expanded_input);
+  return Status::OK();
+}
+
+Status GrapplerFunctionConnectivity::AsFunctionDefInput(
+    const string& graph_def_input, string* func_def_input) const {
+  using gtl::FindOrNull;
+
+  if (IsControlInput(graph_def_input)) {
+    *func_def_input = graph_def_input;
+    return Status::OK();
+  }
+
+  int position;
+  string node_name = ParseNodeName(graph_def_input, &position);
+  CHECK_GE(position, 0);
+
+  // Check if it's an input arg placeholder
+  if (position == 0) {
+    const InputArgPlaceholder* placeholder =
+        FindOrNull(input_arg_placeholders_, node_name);
+    if (placeholder != nullptr) {
+      *func_def_input =
+          strings::StrCat(placeholder->input_name, ":", placeholder->position);
+      return Status::OK();
+    }
+  }
+
+  // It must be output from one of the function body nodes
+  const tensorflow::NameRangeMap* outputs_range_map =
+      FindOrNull(function_body_outputs_, node_name);
+  if (outputs_range_map != nullptr) {
+    for (const auto& el : *outputs_range_map) {
+      const auto& output_name = el.first;
+      const auto& output_range = el.second;
+      if (position >= output_range.first && position < output_range.second) {
+        int pos = position - output_range.first;
+        *func_def_input =
+            strings::StrCat(node_name, ":", output_name, ":", pos);
+        return Status::OK();
+      }
+    }
+  }
+
+  return errors::InvalidArgument("Unknown graph def input: ", graph_def_input);
+}
+
+Status GrapplerFunctionConnectivity::AsFunctionDefNode(
+    NodeDef* function_body_node) const {
+  string func_def_input;
+
+  for (int i = 0; i < function_body_node->input_size(); ++i) {
+    TF_RETURN_IF_ERROR(
+        AsFunctionDefInput(function_body_node->input(i), &func_def_input));
+    function_body_node->set_input(i, func_def_input);
+  }
+
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemInstantiation::GetTypeAttr(
+    const string& type_attr_name, DataType* data_type) const {
+  auto it = func_instantiation_attr_->find(type_attr_name);
+  if (it == func_instantiation_attr_->end()) {
+    return errors::InvalidArgument("Type attribute ", type_attr_name,
+                                   " is not defined");
+  } else if (it->second.type() == DT_INVALID) {
+    return errors::InvalidArgument("Type attribute ", type_attr_name,
+                                   " is not defined with a valid type");
+  } else {
+    *data_type = it->second.type();
+  }
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemInstantiation::GetArgType(
+    const OpDef::ArgDef& arg, DataType* data_type) const {
+  if (arg.type() != DT_INVALID) {
+    *data_type = arg.type();
+  } else {
+    if (!arg.type_list_attr().empty() || !arg.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Arguments with sequence of tensors are not supported. Unsupported "
+          "argument name: ",
+          arg.name());
+    }
+    TF_RETURN_IF_ERROR(GetTypeAttr(arg.type_attr(), data_type));
+  }
+  return Status::OK();
+}
+
+GrapplerFunctionItem::GrapplerFunctionItem(
+    const string& func_name, const AttrValueMap& func_attr,
+    const std::vector<InputArgExpansion>& input_arg_expansions,
+    const std::vector<OutputArgExpansion>& output_arg_expansions,
+    const std::vector<string>& keep_nodes, bool is_stateful,
+    GraphDef&& function_body)
+    : func_attr_(func_attr),
+      input_arg_expansions_(input_arg_expansions),
+      output_arg_expansions_(output_arg_expansions),
+      is_stateful_(is_stateful) {
+  id = func_name;
+  keep_ops = keep_nodes;
+  // Swap the graph body.
+  graph.Swap(&function_body);
+  // Fill the feed nodes with input placeholders.
+  for (const InputArgExpansion& input_arg : input_arg_expansions_) {
+    for (const string& placeholder : input_arg.placeholders) {
+      feed.emplace_back(placeholder, Tensor());
+      input_arg_placeholders_.insert(placeholder);
+    }
+  }
+  // Fill the fetch nodes with outputs.
+  for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
+    for (const string& output_tensor : output_arg.output_tensors) {
+      fetch.push_back(output_tensor);
+    }
+  }
+  // Stateful and Send (it's not stateful) nodes must be preserved in the graph.
+  for (const NodeDef& node : graph.node()) {
+    if (IsSend(node)) {
+      keep_ops.push_back(node.name());
+    }
+  }
+}
+
+const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
+  return input_arg_expansions_;
+}
+
+const InputArgExpansion& GrapplerFunctionItem::input(int i) const {
+  return input_arg_expansions_[i];
+}
+
+const std::size_t GrapplerFunctionItem::input_size() const {
+  return input_arg_expansions_.size();
+}
+
+bool GrapplerFunctionItem::IsInputPlaceholder(const string& node_name) const {
+  return input_arg_placeholders_.find(node_name) !=
+         input_arg_placeholders_.end();
+}
+
+const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
+  return output_arg_expansions_;
+}
+
+const OutputArgExpansion& GrapplerFunctionItem::output(int i) const {
+  return output_arg_expansions_[i];
+}
+
+const std::size_t GrapplerFunctionItem::output_size() const {
+  return output_arg_expansions_.size();
+}
+
+const AttrValueMap& GrapplerFunctionItem::func_attr() const {
+  return func_attr_;
+}
+
+const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
+
+GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; }
+
+bool GrapplerFunctionItem::is_stateful() const { return is_stateful_; }
+
+GrapplerFunctionItem& GrapplerFunctionItem::SwapFunctionBody(GraphDef&& other) {
+  graph.Swap(&other);
+  return *this;
+}
+
+std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
+  std::vector<string> output_tensors;
+  for (const OutputArgExpansion& output : item.outputs()) {
+    for (const string& tensor : output.output_tensors) {
+      output_tensors.push_back(tensor);
+    }
+  }
+  return output_tensors;
+}
+
+bool HasParametrizedType(const FunctionDef& func) {
+  const auto is_type_parametrized = [](const OpDef::ArgDef& arg) {
+    return !arg.type_attr().empty() || !arg.number_attr().empty() ||
+           !arg.type_list_attr().empty();
+  };
+
+  const auto& input = func.signature().input_arg();
+  const auto& output = func.signature().output_arg();
+  return std::any_of(input.begin(), input.end(), is_type_parametrized) ||
+         std::any_of(output.begin(), output.end(), is_type_parametrized);
+}
+
+bool HasParametrizedBody(const FunctionDef& func) {
+  const auto is_parametrized = [&](const NodeDef& node) {
+    for (const auto& attr : node.attr()) {
+      if (!attr.second.placeholder().empty()) return true;
+    }
+    return false;
+  };
+  return std::any_of(func.node_def().begin(), func.node_def().end(),
+                     is_parametrized);
+}
+
+bool IsParametrized(const FunctionDef& func) {
+  return HasParametrizedType(func) || HasParametrizedBody(func);
+}
+
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const AttrValueMap& func_instantiation_attr,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  const OpDef& signature = func.signature();
+
+  if (signature.name().empty()) {
+    return errors::InvalidArgument("Function name must be specified");
+  }
+
+  // Function types will be resolved from function instantiation attributes. All
+  // other attributes will be lost during conversion to FunctionDef.
+  for (const OpDef::AttrDef& attr : signature.attr()) {
+    if (attr.type() != "type") {
+      return errors::InvalidArgument(
+          "Function signature must have only type attributes");
+    }
+  }
+
+  // Helper methods to lookup function instantiation attributes
+  GrapplerFunctionItemInstantiation instantiation(&func_instantiation_attr);
+
+  // Mapping from FunctionDef input format (name[:output][:position]) to
+  // GraphDef input format (name[:position])
+  GrapplerFunctionConnectivity connectivity;
+
+  std::vector<InputArgExpansion> inputs;
+  std::vector<OutputArgExpansion> outputs;
+  std::vector<string> keep_nodes;
+
+  // Function body shares the library with the graph that instantiated it.
+  GraphDef function_body;
+  *function_body.mutable_library() = flib.ToProto();
+
+  // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
+
+  // Make sure that there is no tensor sequences in outputs
+  for (const OpDef::ArgDef& output : signature.output_arg()) {
+    if (!output.type_list_attr().empty() || !output.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Outputs with sequence of tensors are not supported. Unsupported "
+          "output: ",
+          output.name());
+    }
+  }
+
+  // For each input argument create a placeholder in function body.
+  for (const OpDef::ArgDef& input : signature.input_arg()) {
+    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Inputs with sequence of tensors are not supported. Unsupported "
+          "input: ",
+          input.name());
+    }
+
+    DataType input_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(input, &input_data_type));
+
+    NodeDef* placeholder = function_body.add_node();
+    placeholder->set_name(input.name());
+    placeholder->set_op("Placeholder");
+    (*placeholder->mutable_attr())["T"].set_type(input_data_type);
+
+    InputArgExpansion input_expansion{/*input_name=*/input.name(),
+                                      /*data_type=*/input_data_type,
+                                      /*is_ref*/ input.is_ref(),
+                                      /*placeholders=*/{input.name()}};
+    connectivity.RegisterInputArgExpansion(input_expansion);
+    inputs.push_back(input_expansion);
+  }
+
+  // Add all function nodes to the function body
+  for (const NodeDef& func_def_node : func.node_def()) {
+    NodeDef* new_node = function_body.add_node();
+    *new_node = func_def_node;
+
+    const OpRegistrationData* registration;
+    TF_RETURN_IF_ERROR(flib.LookUp(func_def_node.op(), &registration));
+
+    // Resolve all placeholder values using function instantiation attributes.
+    TF_RETURN_IF_ERROR(ResolveFunctionBodyNodeAttrPlaceholders(
+        func_instantiation_attr, new_node));
+
+    // Register node output range in a function connectivity.
+    TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
+                                                   &connectivity));
+
+    // Stateful and Send nodes must be preserved in a function body
+    if (registration->op_def.is_stateful() || IsSend(func_def_node)) {
+      keep_nodes.push_back(func_def_node.name());
+    }
+  }
+
+  // Rewrite inputs to use GraphDef format
+  for (NodeDef& node : *function_body.mutable_node()) {
+    TF_RETURN_IF_ERROR(connectivity.ExpandNodeInputs(&node));
+  }
+
+  // Add function outputs
+  for (const OpDef::ArgDef& out : signature.output_arg()) {
+    std::vector<string> output_tensors;
+    auto ret = func.ret().find(out.name());
+    TF_RETURN_IF_ERROR(
+        ret != func.ret().end()
+            // Expand outputs using provided output mapping
+            ? connectivity.ExpandFunctionDefInput(ret->second, &output_tensors)
+            // Otherwise output must be one of the function inputs
+            : connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
+
+    DataType output_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+
+    OutputArgExpansion output{/*output_name=*/out.name(),
+                              /*data_type=*/output_data_type,
+                              /*is_ref=*/out.is_ref(),
+                              /*output_tensors=*/output_tensors};
+    outputs.push_back(output);
+  }
+
+  bool is_stateful = signature.is_stateful();
+
+  *item = GrapplerFunctionItem(
+      /*func_name=*/signature.name(),
+      /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
+      inputs, outputs, keep_nodes, is_stateful, std::move(function_body));
+  return Status::OK();
+}
+
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
+}
+
+// Register GrapplerFunctionItem input arg expansion and function body outputs
+// in the GrapplerFunctionConnectivity.
+Status RegisterGrapplerFunctionConnectivity(
+    const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
+    GrapplerFunctionConnectivity* connectivity) {
+  for (const InputArgExpansion& input : item.inputs()) {
+    connectivity->RegisterInputArgExpansion(input);
+  }
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    TF_RETURN_IF_ERROR(
+        RegisterFunctionBodyOutputs(flib, func_body_node, connectivity));
+  }
+  return Status::OK();
+}
+
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+                             GrapplerFunctionItem* item) {
+  if (!IsConstant(input_const)) {
+    return errors::InvalidArgument("Input node ", input_const.name(),
+                                   " is not a constant");
+  }
+
+  auto& inputs = item->input_arg_expansions_;
+
+  // Find input arg expansion and input placeholder position in it for the
+  // given function input position.
+  InputArgExpansion* input_arg_expansion = nullptr;
+  int placeholder_idx = input_position;
+
+  for (InputArgExpansion& input : inputs) {
+    if (placeholder_idx < input.placeholders.size()) {
+      input_arg_expansion = &input;
+      break;
+    }
+    placeholder_idx -= input.placeholders.size();
+  }
+
+  if (input_arg_expansion == nullptr) {
+    return errors::InvalidArgument(
+        "Input placeholder not found: input_position=", input_position,
+        " function=", item->id);
+  }
+
+  // Delete placeholder from input expansion.
+  string placeholder_name = input_arg_expansion->placeholders[placeholder_idx];
+  item->input_arg_placeholders_.erase(placeholder_name);
+  input_arg_expansion->placeholders.erase(
+      input_arg_expansion->placeholders.begin() + placeholder_idx);
+
+  // Delete empty input expansions.
+  inputs.erase(std::remove_if(inputs.begin(), inputs.end(),
+                              [](const InputArgExpansion& input) {
+                                return input.placeholders.empty();
+                              }),
+               inputs.end());
+
+  // Replace placeholder node in the function body with a const node.
+  for (NodeDef& node : *item->graph.mutable_node()) {
+    if (node.name() == placeholder_name) {
+      node = input_const;
+      node.set_name(placeholder_name);
+      node.clear_input();   // remove potential control inputs
+      node.clear_device();  // device placement is defined by instantiating node
+    }
+  }
+
+  return Status::OK();
+}
+
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func) {
+  func->mutable_signature()->set_name(item.id);
+  func->mutable_signature()->set_is_stateful(item.is_stateful());
+
+  // Build a GrapplerFunctionConnectivity from inputs and new function body.
+  GrapplerFunctionConnectivity connectivity;
+  TF_RETURN_IF_ERROR(
+      RegisterGrapplerFunctionConnectivity(item, flib, &connectivity));
+
+  // Add function input arguments.
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    CHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
+        << "Inputs of tensor sequences are not supported";
+
+    OpDef::ArgDef arg_def;
+    arg_def.set_name(input_arg.input_name);
+    arg_def.set_type(input_arg.data_type);
+    arg_def.set_is_ref(input_arg.is_ref);
+    *func->mutable_signature()->add_input_arg() = arg_def;
+  }
+
+  // Add function output arguments.
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
+        << "Outputs of tensor sequences are not supported";
+
+    OpDef::ArgDef arg_def;
+    arg_def.set_name(output_arg.output_name);
+    arg_def.set_type(output_arg.data_type);
+    arg_def.set_is_ref(output_arg.is_ref);
+    *func->mutable_signature()->add_output_arg() = arg_def;
+
+    string ret;
+    for (const string& output_tensor : output_arg.output_tensors) {
+      TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(output_tensor, &ret));
+      (*func->mutable_ret())[output_arg.output_name] = ret;
+    }
+  }
+
+  // Copy function definition specific attributes.
+  for (const auto& attr : item.func_attr()) {
+    const auto& attr_name = attr.first;
+    const auto& attr_value = attr.second;
+    (*func->mutable_attr())[attr_name] = attr_value;
+  }
+
+  // Copy function body nodes to the FunctionDef and update input format
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    // Do not copy input placeholders
+    if (item.IsInputPlaceholder(func_body_node.name())) continue;
+
+    NodeDef* func_def_node = func->add_node_def();
+    *func_def_node = func_body_node;
+    TF_RETURN_IF_ERROR(connectivity.AsFunctionDefNode(func_def_node));
+  }
+
+  return Status::OK();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9d71b80ebcf4d0f65622e1a8364d9718275922d
--- /dev/null
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -0,0 +1,231 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_FUNCTIONS_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_FUNCTIONS_H_
+
+#include <memory>
+#include <string>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+using AttrValueMap = std::unordered_map<string, AttrValue>;
+
+// Depending on the function instantiation attributes, input argument to the
+// function might be a single tensor, list of tensors of the same type, or a
+// list of tensors of different types.
+//
+// InputArgExpansion keeps track of the placeholders that were added to the
+// function body in place of function inputs and a resolved input data type.
+struct InputArgExpansion {
+  // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized inputs?
+  string input_name;                 // name of the function input argument
+  DataType data_type;                // input data type
+  bool is_ref;                       // if true, inputs are required to be refs
+  std::vector<string> placeholders;  // names of placeholder nodes in the
+                                     // function body
+};
+
+// Depending on the function instantiation attributes, output argument is mapped
+// to one or more outputs of one of the function body nodes.
+//
+// OutputArgExpansion keeps mapping from a function output arg to the output
+// tensors of a function body nodes and a resolved output data type
+struct OutputArgExpansion {
+  // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized outputs?
+  string output_name;                  // name of the function output argument
+  DataType data_type;                  // output data type
+  bool is_ref;                         // if true, outputs are refs
+  std::vector<string> output_tensors;  // names of output tensor from the
+                                       // function body nodes
+};
+
+// FunctionDef uses different connectivity encoding for the function body nodes,
+// then a GraphDef (see function.proto for details). Input name in FunctionDef
+// can potentially represent a sequence of tensors (instead just one tensor in
+// GraphDef), we need to expand it when converting from FunctionDef to GraphDef,
+// and fold it back when doing backward conversion.
+class GrapplerFunctionConnectivity {
+ public:
+  void RegisterInputArgExpansion(const InputArgExpansion& input_arg_expansion);
+  void RegisterFunctionBodyOutputs(const string& node_name,
+                                   const tensorflow::NameRangeMap& outputs);
+
+  // Expand input encoded in FunctionDef format (name[:output][:position]) into
+  // multiple inputs in GraphDef format (name[:position]).
+  Status ExpandFunctionDefInput(const string& func_def_input,
+                                std::vector<string>* graph_def_inputs) const;
+
+  // Update Node inputs from FunctionDef to GraphDef format.
+  Status ExpandNodeInputs(NodeDef* function_body_node) const;
+
+  // When expanding inputs in function def format, single input might be
+  // expanded into multiple tensors. When converting back to the function def
+  // format from graph def format, it's always a 1-to-1 relationship.
+  // FunctionDef built from GrapplerFunctionItem is always specialized to it's
+  // instantiation attributes and length of input args (and node def outputs) is
+  // known.
+
+  // Map from GraphDef input format to FunctionDef input format using registered
+  // input arg expansion and function body outputs.
+  Status AsFunctionDefInput(const string& graph_def_input,
+                            string* func_def_input) const;
+
+  // Update Node inputs from GraphDef to FunctionDef format.
+  Status AsFunctionDefNode(NodeDef* function_body_node) const;
+
+ private:
+  // Mapping from input name to input arg expansion.
+  std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  // Mapping from function body node name to output names range map.
+  std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+
+  struct InputArgPlaceholder {
+    string input_name;
+    int position;
+  };
+
+  // Mapping from input arg placeholder to the function input tensor.
+  std::unordered_map<string, InputArgPlaceholder> input_arg_placeholders_;
+};
+
+// Get Function type attributes using attributes of a node that instantiated
+// a function.
+class GrapplerFunctionItemInstantiation {
+ public:
+  explicit GrapplerFunctionItemInstantiation(
+      const AttrValueMap* func_instantiation_attr)
+      : func_instantiation_attr_(func_instantiation_attr) {}
+
+  // Get DataType from attributes by name. Return error if attribute is missing,
+  // or it doesn't define a valid data type.
+  Status GetTypeAttr(const string& type_attr_name, DataType* data_type) const;
+
+  // Get argument data type. If data type is not explicitly defined, uses
+  // provided attribute name to look it up in function attributes.
+  Status GetArgType(const OpDef::ArgDef& arg, DataType* data_type) const;
+
+ private:
+  const AttrValueMap* func_instantiation_attr_;  // do not own
+};
+
+// A special case of GrapplerItem, constructed from a TensorFlow Function.
+class GrapplerFunctionItem : public GrapplerItem {
+ public:
+  GrapplerFunctionItem() = default;
+  GrapplerFunctionItem(
+      const string& func_name, const AttrValueMap& func_attr,
+      const std::vector<InputArgExpansion>& input_arg_expansions,
+      const std::vector<OutputArgExpansion>& output_arg_expansions,
+      const std::vector<string>& keep_nodes, bool is_stateful,
+      GraphDef&& function_body);
+
+  bool IsInputPlaceholder(const string& node_name) const;
+
+  const std::vector<InputArgExpansion>& inputs() const;
+  const InputArgExpansion& input(int i) const;
+  const std::size_t input_size() const;
+
+  const std::vector<OutputArgExpansion>& outputs() const;
+  const OutputArgExpansion& output(int i) const;
+  const std::size_t output_size() const;
+
+  const AttrValueMap& func_attr() const;
+  const GraphDef& function_body() const;
+  GraphDef& mutable_function_body();
+
+  bool is_stateful() const;
+
+  GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
+
+ private:
+  friend Status ReplaceInputWithConst(const NodeDef&, int,
+                                      GrapplerFunctionItem*);
+
+  AttrValueMap func_attr_;  // Attributes specific to function definition that
+                            // produced this item (FuncDef.attr field).
+
+  std::vector<InputArgExpansion> input_arg_expansions_;
+  std::vector<OutputArgExpansion> output_arg_expansions_;
+
+  std::set<string> input_arg_placeholders_;
+
+  bool is_stateful_;
+};
+
+// Return all output tensors referenced by item output args.
+std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
+
+// Check if function input/output types are fully defined only at instantiation
+// time (parametrized by it's instantiation node).
+bool HasParametrizedType(const FunctionDef& func);
+
+// Check if a function body is parametrized by it's instantiation node. Function
+// body is parametrized, if it has at least one node with a 'placeholder'
+// attribute.
+bool HasParametrizedBody(const FunctionDef& func);
+
+// Check if function has parametrized type or body.
+bool IsParametrized(const FunctionDef& func);
+
+// Register GrapplerFunctionItem input arg expansion and function body outputs
+// in the GrapplerFunctionConnectivity. Use function library definition to
+// lookup function body nodes output names and ranges.
+Status RegisterGrapplerFunctionConnectivity(
+    const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
+    GrapplerFunctionConnectivity* connectivity);
+
+// Replace one of the function inputs with a constant.
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+                             GrapplerFunctionItem* item);
+
+// Make a GrapplerFunctionItem from the function definition and function
+// instantiation attributes (caller node attributes). Returns error if the given
+// function def cannot be converted (e.g. not all attributes are defined).
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Make a GrapplerFunction item from the function definition. Function must be
+// fully defined (no type or body parametrization).
+// TODO(ezhulenev): Support parametrized functions without fully defined
+// instantiation attributes? Do we ever want to optimize parametrized function
+// without specializing it to it's instantiation attributes (at least types)?
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item);
+
+// Make a FunctionDef from the GrapplerFunctionItem. Use function library
+// definition to lookup function body nodes output names and ranges.
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_FUNCTIONS_H_
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fa6fec70ff974410fa68f6713b1adb39fd68c281
--- /dev/null
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -0,0 +1,702 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class FunctionsTest : public ::testing::Test {};
+
+TEST_F(FunctionsTest, IsParametrized) {
+  // Function is defined for multiple input types.
+  FunctionDef parametrized_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  // Function is defined just for float inputs.
+  FunctionDef non_parametrized_func = FunctionDefHelper::Create(
+      "MyMul", {"x:float", "y:float"}, {"z:float"}, {},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  EXPECT_TRUE(HasParametrizedType(parametrized_func));
+  EXPECT_TRUE(HasParametrizedBody(parametrized_func));
+  EXPECT_TRUE(IsParametrized(parametrized_func));
+
+  EXPECT_FALSE(HasParametrizedType(non_parametrized_func));
+  EXPECT_FALSE(HasParametrizedBody(non_parametrized_func));
+  EXPECT_FALSE(IsParametrized(non_parametrized_func));
+}
+
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion(
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
+
+  connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
+  connectivity.RegisterFunctionBodyOutputs("Func",
+                                           {{"o1", {0, 2}}, {"o2", {2, 4}}});
+
+  std::vector<string> inputs;
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputA", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("inputA", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputB", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("inputB_0", inputs[0]);
+  EXPECT_EQ("inputB_1", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputB:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("inputB_1", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Add:z", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Add", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("Func", inputs[0]);
+  EXPECT_EQ("Func:1", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("Func:2", inputs[0]);
+  EXPECT_EQ("Func:3", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1:0", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:1", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2:0", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:2", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:3", inputs[0]);
+}
+
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion(
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
+
+  connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
+  connectivity.RegisterFunctionBodyOutputs("Func",
+                                           {{"o1", {0, 2}}, {"o2", {2, 4}}});
+
+  string input;
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputA", &input));
+  EXPECT_EQ("inputA:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputB_0", &input));
+  EXPECT_EQ("inputB:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputB_1", &input));
+  EXPECT_EQ("inputB:1", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Add", &input));
+  EXPECT_EQ("Add:z:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func", &input));
+  EXPECT_EQ("Func:o1:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:1", &input));
+  EXPECT_EQ("Func:o1:1", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:2", &input));
+  EXPECT_EQ("Func:o2:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:3", &input));
+  EXPECT_EQ("Func:o2:1", input);
+}
+
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandNodeInputs) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion(
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
+
+  NodeDef node;
+  node.add_input("inputA:0");
+  node.add_input("inputB");
+
+  TF_EXPECT_OK(connectivity.ExpandNodeInputs(&node));
+
+  EXPECT_EQ(3, node.input_size());
+  EXPECT_EQ("inputA", node.input(0));
+  EXPECT_EQ("inputB_0", node.input(1));
+  EXPECT_EQ("inputB_1", node.input(2));
+}
+
+TEST_F(FunctionsTest, FromSimpleFunctionDef) {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "XTimesTwo",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
+      });
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ("XTimesTwo", item.id);
+  EXPECT_EQ(4, item.function_body().node_size());
+
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ("x", item.input(0).input_name);
+  EXPECT_EQ(std::vector<string>{"x"}, item.input(0).placeholders);
+
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("y", item.output(0).output_name);
+  EXPECT_EQ("y", item.output(0).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "x" && count++) {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "two" && count++) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "scale" && count++) {
+      EXPECT_EQ("Cast", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("two", node.input(0));
+    } else if (node.name() == "y" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("scale", node.input(1));
+    }
+  }
+  EXPECT_EQ(4, count);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
+  // Gradient graph for the Subtract operation
+  std::vector<FunctionDefHelper::Node> nodes = {
+      {{"sx"}, "Shape", {"x"}},
+      {{"sy"}, "Shape", {"y"}},
+      {{"gx"}, "Identity", {"dz"}},
+      {{"gy"}, "Neg", {"dz"}},
+      {{"rx", "ry"}, "BroadcastGradientArgs", {"sx", "sy"}},
+      {{"sum_gx"}, "Sum", {"gx", "rx"}},
+      {{"dx"}, "Reshape", {"sum_gx", "sx"}},
+      {{"sum_gy"}, "Sum", {"gy", "ry"}},
+      {{"dy"}, "Reshape", {"sum_gy", "sy"}},
+  };
+
+  for (auto &n : nodes) {
+    // "BroadcastGradientArgs" doesn't need any attrs.
+    if (n.attr.empty() && n.op != "BroadcastGradientArgs") {
+      n.attr = {{"T", "$T"}};
+    }
+  }
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "SubGrad",
+      // Arg defs
+      {"x: T", "y: T", "dz: T"},
+      // Ret val defs
+      {"dx: T", "dy: T"},
+      // Attr defs
+      {{"T: {half, float, double}"}},
+      // Nodes
+      nodes);
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ("SubGrad", item.id);
+  EXPECT_EQ(12, item.function_body().node_size());
+
+  ASSERT_EQ(3, item.input_size());
+  EXPECT_EQ("x", item.input(0).input_name);
+  EXPECT_EQ("y", item.input(1).input_name);
+  EXPECT_EQ("dz", item.input(2).input_name);
+
+  ASSERT_EQ(2, item.output_size());
+  EXPECT_EQ("dx", item.output(0).output_tensors[0]);
+  EXPECT_EQ("dy", item.output(1).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "x" || node.name() == "y" || node.name() == "dz") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "rx" && count++) {
+      EXPECT_EQ("BroadcastGradientArgs", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("sx", node.input(0));
+      EXPECT_EQ("sy", node.input(1));
+    } else if (node.name() == "sum_gx" && count++) {
+      EXPECT_EQ("Sum", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("gx", node.input(0));
+      EXPECT_EQ("rx", node.input(1));
+    } else if (node.name() == "sum_gy" && count++) {
+      EXPECT_EQ("Sum", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("gy", node.input(0));
+      EXPECT_EQ("rx:1", node.input(1));
+    }
+  }
+  EXPECT_EQ(6, count);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+  TF_ASSERT_OK(flib.AddFunctionDef(FunctionDefHelper::Define(
+      // Name
+      "Swap",
+      // Args
+      {"i0: T", "i1: T"},
+      // Return values
+      {"o0: T", "o1: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {{{"o0"}, "Identity", {"i1"}, {{"T", "$T"}}},
+       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}})));
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "ManySwapsFirst",
+      // Args
+      {"x: float", "y: float"},
+      // Return values
+      {"o: float"},
+      // attr def
+      {},
+      // Nodes
+      // o = x*x + y*y.  Furthermore, The 1st swap depends on x2, and
+      // y2 depends on the 2nd swap.  The 2nd swap has data dependency
+      // on the 1st swap.
+      {{{"a0"}, "Swap", {"x", "y"}, {{"T", DT_FLOAT}}, {"x2"}},
+       {{"a1"}, "Swap", {"a0:o0:0", "a0:o1:0"}, {{"T", DT_FLOAT}}},
+       {{"x2"}, "Mul", {"x", "x"}, {{"T", DT_FLOAT}}},
+       {{"y2"}, "Mul", {"y", "y"}, {{"T", DT_FLOAT}}, {"a1"}},
+       {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
+      // Output Mapping
+      {{"o", "o:z:0"}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "x" || node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "a0" && count++) {
+      EXPECT_EQ("Swap", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^x2", node.input(2));
+    } else if (node.name() == "a1" && count++) {
+      EXPECT_EQ("Swap", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("a0", node.input(0));
+      EXPECT_EQ("a0:1", node.input(1));
+    } else if (node.name() == "x2" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+    } else if (node.name() == "y2" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^a1", node.input(2));
+    } else if (node.name() == "o" && count++) {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x2", node.input(0));
+      EXPECT_EQ("y2", node.input(1));
+    }
+  }
+  EXPECT_EQ(7, count);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "Exp_func",
+      // Args
+      {"in: float"},
+      // Return values
+      {"out: float"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"Linear_func"}, "Identity", {"in"}, {{"T", DT_FLOAT}}},
+       {{"Exp"}, "Exp", {"Linear_func:output:0"}, {{"T", DT_FLOAT}}}},
+      // Mapping
+      {{"out", "Exp:y:0"}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "in" && count++) {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Linear_func" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("in", node.input(0));
+    } else if (node.name() == "Exp" && count++) {
+      EXPECT_EQ("Exp", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("Linear_func", node.input(0));
+    }
+  }
+  EXPECT_EQ(3, count);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "ForwardInputs",
+      // Args
+      {"in0: float", "in1: float", "arg2: float", "arg3: int32", "arg4: float"},
+      // Return values
+      {"out0: float", "arg2: float", "arg3: int32"},
+      // Attr def
+      {},
+      // Nodes
+      {},
+      // Mapping
+      {{"out0", "in0"}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ("ForwardInputs", item.id);
+  EXPECT_EQ(5, item.function_body().node_size());
+
+  EXPECT_EQ(3, item.output_size());
+  EXPECT_EQ("in0", item.output(0).output_tensors[0]);
+  EXPECT_EQ("arg2", item.output(1).output_tensors[0]);
+  EXPECT_EQ("arg3", item.output(2).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    EXPECT_TRUE(node.name() == "in0" || node.name() == "in1" ||
+                node.name() == "arg2" || node.name() == "arg3" ||
+                node.name() == "arg4");
+    count++;
+    EXPECT_EQ("Placeholder", node.op());
+    if (node.name() == "arg3") {
+      EXPECT_EQ(DT_INT32, node.attr().at("T").type());
+    } else {
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+    }
+  }
+  EXPECT_EQ(5, count);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "GenerateTwo",
+      // Args
+      {},
+      // Return value
+      {"o: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {{{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+       {{"o"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ(0, item.input_size());
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("o", item.output(0).output_tensors[0]);
+
+  EXPECT_EQ(2, item.function_body().node_size());
+  const NodeDef &two = item.function_body().node(0);
+  EXPECT_EQ("two", two.name());
+  EXPECT_EQ(0, two.input_size());
+  const NodeDef &cast = item.function_body().node(1);
+  EXPECT_EQ("o", cast.name());
+  EXPECT_EQ(1, cast.input_size());
+  EXPECT_EQ("two", cast.input(0));
+}
+
+TEST_F(FunctionsTest, MakeFunctionDef) {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "XTimesTwo",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
+      });
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
+
+  // Input and output types are resolved based on instantiation attributes.
+  EXPECT_EQ("x", specialized.signature().input_arg(0).name());
+  EXPECT_EQ(DT_FLOAT, specialized.signature().input_arg(0).type());
+  EXPECT_EQ("y", specialized.signature().output_arg(0).name());
+  EXPECT_EQ(DT_FLOAT, specialized.signature().output_arg(0).type());
+
+  // Function body specialized for instantiation types
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "scale" && count++) {
+      EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
+    } else if (node.name() == "y" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("x:0", node.input(0));
+      EXPECT_EQ("scale:y:0", node.input(1));
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+    }
+  }
+  EXPECT_EQ(2, count);
+}
+
+TEST_F(FunctionsTest, ReplaceInputWithConst) {
+  FunctionDef func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ(2, item.input_size());
+  EXPECT_EQ(1, item.output_size());
+
+  ASSERT_EQ(3, item.function_body().node_size());
+
+  const NodeDef &input_x = item.function_body().node(0);
+  const NodeDef &input_y = item.function_body().node(1);
+
+  // Initially inputs added to the graph as placeholders.
+  EXPECT_EQ("Placeholder", input_x.op());
+  EXPECT_EQ("Placeholder", input_y.op());
+
+  // Replace inputs x and y with constants.
+  NodeDef const_input_x;
+  const_input_x.set_op("Const");
+  AddNodeAttr("Tag", "const_input_x", &const_input_x);
+
+  NodeDef const_input_y;
+  const_input_y.set_op("Const");
+  AddNodeAttr("Tag", "const_input_y", &const_input_y);
+
+  // Replace input x.
+  TF_EXPECT_OK(ReplaceInputWithConst(const_input_x, 0, &item));
+
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ("Const", input_x.op());
+  EXPECT_EQ("const_input_x", input_x.attr().at("Tag").s());
+
+  // Replace input y.
+  TF_EXPECT_OK(ReplaceInputWithConst(const_input_y, 0, &item));
+
+  EXPECT_EQ(0, item.input_size());
+  EXPECT_EQ("Const", input_y.op());
+  EXPECT_EQ("const_input_y", input_y.attr().at("Tag").s());
+
+  // Make a function from const-specialized function item.
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
+
+  EXPECT_EQ(0, specialized.signature().input_arg_size());
+  EXPECT_EQ(1, specialized.signature().output_arg_size());
+  EXPECT_EQ(3, specialized.node_def_size());
+
+  // Check that graph has const nodes pushed into function body.
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "x" && count++) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ("const_input_x", node.attr().at("Tag").s());
+    } else if (node.name() == "y" && count++) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ("const_input_y", node.attr().at("Tag").s());
+    } else if (node.name() == "output" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("x:output:0", node.input(0));
+      EXPECT_EQ("y:output:0", node.input(1));
+    }
+  }
+  EXPECT_EQ(3, count);
+}
+
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
+  using test::function::NDef;
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GraphDef id_func_body = test::function::GDef(
+      {/* pass input to output through identity */
+       NDef("output", "Identity", {"x"}, {{"T", "float"}})});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+
+  FunctionDefLibrary lib_def;
+  *lib_def.add_function() = func;
+  *lib_def.add_function() = mul_func;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def);
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  // Replace function body with identity function
+  item.SwapFunctionBody(std::move(id_func_body));
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
+
+  // Check that graph body was updated.
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "output" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ("x:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(1, count);
+
+  // And return tensor mapping was updated with a new output name (z->output).
+  EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 813f65f825759ca22dba2bdfd8433d946b7dd852..910b0acaefbb10e1da24ab9ec5bfa95b1b5710d4 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -17,23 +17,121 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 
+GrapplerTest::GrapplerTest() {
+  // Turn off all the automatic optimizations to ensure that we run the graph
+  // exactly as it is given to us. This ensures that we can compare the results
+  // before and after manual optimization, without any of the automatic
+  // optimizations interfering in the comparison.
+  RewriterConfig* cfg =
+      options_.config.mutable_graph_options()->mutable_rewrite_options();
+  cfg->set_constant_folding(RewriterConfig::OFF);
+  cfg->set_arithmetic_optimization(RewriterConfig::OFF);
+  cfg->set_dependency_optimization(RewriterConfig::OFF);
+  cfg->set_loop_optimization(RewriterConfig::OFF);
+  cfg->set_function_optimization(RewriterConfig::OFF);
+  cfg->set_layout_optimizer(RewriterConfig::OFF);
+  cfg->set_debug_stripper(RewriterConfig::OFF);
+}
+
+std::vector<Tensor> GrapplerTest::EvaluateNodes(
+    const GraphDef& graph, const std::vector<string>& node_names) const {
+  return EvaluateNodes(graph, node_names, {});
+}
+
 std::vector<Tensor> GrapplerTest::EvaluateNodes(
-    const GraphDef& graph, const std::vector<string>& node_names) {
-  SessionOptions options;
-  std::unique_ptr<tensorflow::Session> session(NewSession(options));
+    const GraphDef& graph, const std::vector<string>& node_names,
+    const std::vector<std::pair<string, Tensor>>& inputs) const {
+  std::unique_ptr<tensorflow::Session> session(NewSession(options_));
   TF_CHECK_OK(session->Create(graph));
   RunOptions run_options;
   std::vector<Tensor> output_tensors;
-  TF_CHECK_OK(session->Run(run_options, {}, node_names, node_names,
+  TF_CHECK_OK(session->Run(run_options, inputs, node_names, node_names,
+                           &output_tensors, nullptr));
+  TF_CHECK_OK(session->Close());
+  return output_tensors;
+}
+
+std::vector<Tensor> GrapplerTest::EvaluateFetchNodes(
+    const GrapplerItem& item) const {
+  std::unique_ptr<tensorflow::Session> session(NewSession(options_));
+  TF_CHECK_OK(session->Create(item.graph));
+  RunOptions run_options;
+  if (!item.init_ops.empty()) {
+    std::vector<Tensor> dummy;
+    TF_CHECK_OK(
+        session->Run(run_options, {}, {}, item.init_ops, &dummy, nullptr));
+  }
+  std::vector<Tensor> output_tensors;
+  TF_CHECK_OK(session->Run(run_options, item.feed, item.fetch, {},
                            &output_tensors, nullptr));
   TF_CHECK_OK(session->Close());
   return output_tensors;
 }
 
+NodeDef* GrapplerTest::AddNode(
+    const string& name, const string& op, const std::vector<string>& inputs,
+    const std::vector<std::pair<string, AttrValue>>& attributes,
+    GraphDef* graph) const {
+  NodeDef* node = graph->add_node();
+  node->set_name(name);
+  node->set_op(op);
+  for (const string& input : inputs) {
+    node->add_input(input);
+  }
+  for (auto attr : attributes) {
+    (*node->mutable_attr())[attr.first] = attr.second;
+  }
+  return node;
+}
+
+void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
+  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
+    return n1.name() < n2.name();
+  };
+  std::sort(want.mutable_node()->begin(), want.mutable_node()->end(),
+            comparator);
+  std::sort(got.mutable_node()->begin(), got.mutable_node()->end(), comparator);
+
+  for (int i = 0; i < want.node_size(); ++i) {
+    std::sort(want.mutable_node(i)->mutable_input()->begin(),
+              want.mutable_node(i)->mutable_input()->end());
+  }
+  for (int i = 0; i < got.node_size(); ++i) {
+    std::sort(got.mutable_node(i)->mutable_input()->begin(),
+              got.mutable_node(i)->mutable_input()->end());
+  }
+
+  ASSERT_EQ(want.node_size(), got.node_size());
+  for (int i = 0; i < want.node_size(); ++i) {
+    EXPECT_EQ(want.node(i).op(), got.node(i).op());
+    EXPECT_EQ(want.node(i).name(), got.node(i).name());
+    ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
+    for (int j = 0; j < want.node(i).input_size(); ++j) {
+      EXPECT_TRUE(IsSameInput(want.node(i).input(j), got.node(i).input(j)));
+    }
+  }
+}
+
+bool GrapplerTest::IsNodesDirectlyConnected(const NodeMap& node_map,
+                                            const string& src,
+                                            const string& dst, int position) {
+  const NodeDef* src_node = node_map.GetNode(src);
+  const NodeDef* dst_node = node_map.GetNode(dst);
+  EXPECT_TRUE(src_node != nullptr) << src << " node not found";
+  EXPECT_TRUE(dst_node != nullptr) << dst << " node not found";
+  return src_node && dst_node && dst_node->input(position) == src_node->name();
+}
+
+int GrapplerTest::CountOpNodes(const GraphDef& graph, const string& op) {
+  return std::count_if(graph.node().begin(), graph.node().end(),
+                       [&op](const NodeDef& node) { return node.op() == op; });
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index 46ce47c8c3b6bc18b6eac76bbdb8ec1f8a58fab2..bd4d7f2a7e89ad502f3b139f62e00dba198bb751 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -13,25 +13,66 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
-#define TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPPLER_TEST_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPPLER_TEST_H_
 
 #include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace grappler {
 
 class GrapplerTest : public ::testing::Test {
+ public:
+  GrapplerTest();
+
  protected:
-  std::vector<Tensor> EvaluateNodes(const GraphDef& graph,
-                                    const std::vector<string>& node_names);
+  std::vector<Tensor> EvaluateNodes(
+      const GraphDef& graph, const std::vector<string>& node_names) const;
+
+  std::vector<Tensor> EvaluateNodes(
+      const GraphDef& graph, const std::vector<string>& node_names,
+      const std::vector<std::pair<string, Tensor>>& inputs) const;
+
+  std::vector<Tensor> EvaluateFetchNodes(const GrapplerItem& item) const;
+
+  NodeDef* AddNode(const string& name, const string& op,
+                   const std::vector<string>& inputs,
+                   const std::vector<std::pair<string, AttrValue>>& attributes,
+                   GraphDef* graph) const;
+
+  void CompareGraphs(GraphDef want, GraphDef got) const;
+
+  // Check if node 'src' is directly connected to the input($position) of 'dst'.
+  bool IsNodesDirectlyConnected(const NodeMap& node_map, const string& src,
+                                const string& dst, int position = 0);
+
+  // Count nodes of the given op-type in a graph.
+  int CountOpNodes(const GraphDef& graph, const string& op);
+
+  // Get a random tansor with given shape.
+  template <DataType DTYPE>
+  Tensor GenerateRandomTensor(const TensorShape& shape) const {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    Tensor tensor(DTYPE, shape);
+    for (auto i = 0; i < tensor.NumElements(); i++)
+      tensor.flat<T>()(i) = i + random::New64() % 10;
+    return tensor;
+  }
+
+ private:
+  SessionOptions options_;
 };
 
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_GRAPPLER_TEST_H_
diff --git a/tensorflow/core/grappler/utils/grappler_test_test.cc b/tensorflow/core/grappler/utils/grappler_test_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..677fa5a79896cf82831812a90242d1e2dd9d302c
--- /dev/null
+++ b/tensorflow/core/grappler/utils/grappler_test_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+// TODO(ezhulenev): add tests for all methods in GrapplerTest
+class GrapplerTestTest : public GrapplerTest {};
+
+TEST_F(GrapplerTestTest, CompareIdenticalGraphs) {
+  tensorflow::Scope s1 = tensorflow::Scope::NewRootScope();
+  auto s1_a = ops::Variable(s1.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto s1_b = ops::Variable(s1.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto s1_add = ops::Add(s1.WithOpName("Add_1"), s1_a, s1_b);
+
+  tensorflow::Scope s2 = tensorflow::Scope::NewRootScope();
+  auto s2_a = ops::Variable(s2.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto s2_b = ops::Variable(s2.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto s2_add = ops::Add(s2.WithOpName("Add_1"), s2_a, s2_b);
+
+  GraphDef graph1;
+  TF_ASSERT_OK(s1.ToGraphDef(&graph1));
+
+  GraphDef graph2;
+  TF_ASSERT_OK(s2.ToGraphDef(&graph2));
+
+  CompareGraphs(graph1, graph2);
+}
+
+TEST_F(GrapplerTestTest, CheckNodesConnectivity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto add_1 = ops::Add(s.WithOpName("Add_1"), a, b);
+  auto add_2 = ops::Add(s.WithOpName("Add_2"), add_1, b);
+
+  GraphDef graph;
+  TF_ASSERT_OK(s.ToGraphDef(&graph));
+
+  NodeMap node_map(&graph);
+
+  EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "a", "Add_1", 0));
+  EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "b", "Add_1", 1));
+  EXPECT_FALSE(IsNodesDirectlyConnected(node_map, "a", "Add_2", 0));
+  EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "b", "Add_2", 1));
+}
+
+TEST_F(GrapplerTestTest, CountOpNodes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {2, 2}, DT_FLOAT);
+
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_bc = ops::Add(s.WithOpName("Add_bc"), b, c);
+
+  auto mul_ab = ops::Mul(s.WithOpName("Mull_ab"), a, b);
+  auto mul_bc = ops::Mul(s.WithOpName("Mull_bc"), a, b);
+
+  InputList inputs{
+      Output(add_ab),
+      Output(add_bc),
+      Output(mul_ab),
+      Output(mul_bc),
+  };
+  auto add_all = ops::AddN(s.WithOpName("Add_all"), inputs);
+
+  GraphDef graph;
+  TF_ASSERT_OK(s.ToGraphDef(&graph));
+
+  EXPECT_EQ(2, CountOpNodes(graph, "Add"));
+  EXPECT_EQ(2, CountOpNodes(graph, "Mul"));
+  EXPECT_EQ(1, CountOpNodes(graph, "AddN"));
+  EXPECT_EQ(0, CountOpNodes(graph, "Transpose"));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 8d8ff4da3a8df5a2868f1a3a0ac6a5d0c2fd66ad..a8e464d09d6c2e6730bea6c92d769c378cb8ac86 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -26,24 +26,24 @@ namespace grappler {
 
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
-Status TopologicalSort(GraphDef* graph) {
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<int>* ready_nodes) {
   SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*graph));
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
 
-  std::vector<int> ready_nodes;
-  ready_nodes.reserve(graph_view.num_nodes());
+  ready_nodes->reserve(graph_view.num_nodes());
 
   int front = 0;
   int back = 0;
   std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
   for (int i = 0; i < graph_view.num_nodes(); i++) {
     if (graph_view.inputs(i).empty()) {
-      ready_nodes.push_back(i);
+      ready_nodes->push_back(i);
       back++;
     }
-    if (IsMerge(graph->node(i))) {
+    if (IsMerge(graph.node(i))) {
       for (int input : graph_view.inputs(i)) {
-        if (IsNextIteration(graph->node(input))) {
+        if (IsNextIteration(graph.node(input))) {
           num_ready_inputs[i]++;
         }
       }
@@ -51,11 +51,11 @@ Status TopologicalSort(GraphDef* graph) {
   }
 
   while (front != back) {
-    int ready_node = ready_nodes[front];
+    int ready_node = (*ready_nodes)[front];
     for (int fanout : graph_view.outputs(ready_node)) {
       ++num_ready_inputs[fanout];
       if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
-        ready_nodes.push_back(fanout);
+        ready_nodes->push_back(fanout);
         ++back;
       }
     }
@@ -66,7 +66,24 @@ Status TopologicalSort(GraphDef* graph) {
     return errors::InvalidArgument(
         "The graph couldn't be sorted in topological order.");
   }
+  return Status::OK();
+}
 
+Status ComputeTopologicalOrder(
+    const GraphDef& graph,
+    std::unordered_map<const NodeDef*, int>* topo_order) {
+  std::vector<int> ready_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(graph, &ready_nodes));
+  topo_order->reserve(graph.node_size());
+  for (int i = 0; i < ready_nodes.size(); ++i) {
+    (*topo_order)[&graph.node(ready_nodes[i])] = i;
+  }
+  return Status::OK();
+}
+
+Status TopologicalSort(GraphDef* graph) {
+  std::vector<int> ready_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes));
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index 7700fe41e40e6d1111c9e84aabfd2a05968ef882..668c88dc751c8723c624822233af2a1b5b2fedf7 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -22,6 +22,10 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+// Compute a topological ordering for the graph nodes.
+Status ComputeTopologicalOrder(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order);
+
 // Sort a graph in topological order.
 Status TopologicalSort(GraphDef* graph);
 
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index c96f15b0e8424d70e8dd1393cf254b52f69200d2..f5c95009d240f321d700c4123486fc3f2b49c3f4 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -52,8 +52,19 @@ TEST_F(TopologicalSortTest, NoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
+
+  const std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
+  for (const auto& topo : topo_order) {
+    const string& node_name = topo.first->name();
+    const int topo_order = topo.second;
+    std::cout << "Node " << node_name << " at order " << topo_order
+              << std::endl;
+    EXPECT_EQ(node_name, order[topo_order]);
+  }
+
   TF_EXPECT_OK(TopologicalSort(&graph));
-  std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
@@ -68,8 +79,17 @@ TEST_F(TopologicalSortTest, WithLoop) {
   *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
   *graph.add_node() = CreateNode("1", {});
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
+
+  const std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (const auto& topo : topo_order) {
+    const string& node_name = topo.first->name();
+    const int topo_order = topo.second;
+    EXPECT_EQ(node_name, order[topo_order]);
+  }
+
   TF_EXPECT_OK(TopologicalSort(&graph));
-  std::vector<string> order = {"1", "2", "3", "4", "5"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index eabce5b5ee7b037b7bc429abfa86ee8735bdbede..49a1996d25e78d17908b1eae04c9acbeb7e2c788 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -292,6 +292,47 @@ TEST_F(UtilsTest, DedupControlInputs) {
   EXPECT_EQ("gnu", foo.input(1));
 }
 
+TEST_F(UtilsTest, NumNonControlOutputs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  //  *) Round node has control dependency edge from Add, which
+  //     is not on this scheme (ASCII graphics limitation).
+  //
+  //   *Round    [Sqrt, Shape]
+  //      |           |
+  //      |   ctrl    |
+  //     Mul ------> Add
+  //     / \         / \
+  //    x   y       a   b
+  auto x = ops::Variable(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  auto a = ops::Variable(s.WithOpName("a"), {1, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {1, 2}, DT_FLOAT);
+
+  auto mul = ops::Multiply(s.WithOpName("mul"), x, y);
+  auto add = ops::Add(s.WithOpName("add").WithControlDependencies(mul), a, b);
+
+  auto shape = ops::Shape(s.WithOpName("shape"), add);
+  auto sqrt = ops::Sqrt(s.WithOpName("sqrt"), add);
+
+  auto round =
+      ops::Round(s.WithOpName("round").WithControlDependencies(add), mul);
+
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  NodeMap node_map(&graph);
+
+  const NodeDef* add_node = node_map.GetNode("add");
+  ASSERT_TRUE(add_node != nullptr);
+
+  // [a, b] are only non-control inputs
+  EXPECT_EQ(2, NumNonControlInputs(*add_node));
+  // [sqrt, shape] are non control outputs
+  EXPECT_EQ(2, NumNonControlOutputs(*add_node, node_map));
+  // sqrt is the only data output
+  EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
+}
+
 TEST_F(UtilsTest, DeleteNodes) {}
 
 }  // namespace
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index dc93c76eaee6c3408453a74bac98f5e365364247..6355f136545ed2664a49eed00a8ee48f80cec03c 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -56,8 +56,8 @@ load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
-    # convolutions (and possibly more in the future). You will also need
-    # appropriate -mavx*, as required by specific op you use.
+    # sparse matrix multiplications. You will also need appropriate -mavx*
+    # options, as required by specific op you use.
     name = "xsmm",
     values = {
         "define": "tensorflow_xsmm=1",
@@ -65,12 +65,23 @@ config_setting(
 )
 
 config_setting(
-    # Add "--define tensorflow_xsmm_backward=1" to your build command to use
-    # libxsmm for backward convolutions (and possibly more in the future). You
-    # will also need appropriate -mavx*, as required by specific op you use.
-    name = "xsmm_backward",
+    # Add "--define tensorflow_xsmm_convolutions=1" to your build command to
+    # use libxsmm for forward convolutions. You will also need appropriate
+    # -mavx* # options, as required by specific op you use.
+    name = "xsmm_convolutions",
     values = {
-        "define": "tensorflow_xsmm_backward=1",
+        "define": "tensorflow_xsmm_convolutions=1",
+    },
+)
+
+config_setting(
+    # Add "--define tensorflow_xsmm_convolutions=1 --define
+    # tensorflow_xsmm_backward_convolutions=1" to your build command to use libxsmm for
+    # backward convolutions (and possibly more in the future). You will also
+    # need appropriate -mavx* options, as required by specific op you use.
+    name = "xsmm_backward_convolutions",
+    values = {
+        "define": "tensorflow_xsmm_backward_convolutions=1",
     },
 )
 
@@ -120,6 +131,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "collective_ops",
+    prefix = "collective_ops",
+    deps = [
+        "//tensorflow/core:collective_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_kernel_library(
     name = "concat_lib",
     srcs = [
@@ -595,6 +617,7 @@ cc_library(
         ":batch_space_ops",
         ":bcast_ops",
         ":bitcast_op",
+        ":broadcast_to_op",
         ":concat_op",
         ":constant_op",
         ":depth_space_ops",
@@ -646,6 +669,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "broadcast_to_op",
+    prefix = "broadcast_to_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "concat_op",
     prefix = "concat_op",
@@ -868,7 +897,7 @@ tf_kernel_library(
     hdrs = ["transpose_op.h"],
     deps = ARRAY_DEPS + if_mkl([
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
+        "@mkl_dnn",
     ]),
 )
 
@@ -909,6 +938,23 @@ tf_kernel_library(
     ]) + ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "cudnn_rnn_kernels",
+    srcs = ["cudnn_rnn_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gpu_util_hdrs",
+        "//tensorflow/core:cudnn_rnn_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//third_party/eigen3",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
 tf_cc_test(
     name = "batch_norm_op_test",
     size = "small",
@@ -1017,7 +1063,7 @@ tf_cc_test(
     name = "xsmm_conv2d_test",
     size = "small",
     srcs = select({
-        ":xsmm": ["xsmm_conv2d_test.cc"],
+        ":xsmm_convolutions": ["xsmm_conv2d_test.cc"],
         "//conditions:default": [],
     }),
     deps = [
@@ -1032,7 +1078,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ] + select({
-        ":xsmm": [
+        ":xsmm_convolutions": [
             "@libxsmm_archive//:xsmm_avx",
         ],
         "//conditions:default": [],
@@ -1368,6 +1414,7 @@ tf_kernel_library(
     visibility = [":friends"],
     deps = [
         ":bounds_check",
+        ":dense_update_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -1655,6 +1702,43 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "scoped_allocator_ops",
+    prefix = "scoped_allocator_ops",
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:scoped_allocator_ops_op_lib",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "scoped_allocator_ops_test",
+    srcs = ["scoped_allocator_ops_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),  #Required for benchmarking
+    deps = [
+        ":cwise_op",
+        ":dense_update_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":scoped_allocator_ops",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "session_ops",
     prefix = "session_ops",
@@ -1891,9 +1975,9 @@ tf_kernel_library(
     srcs = ["resource_variable_ops.cc"],
     deps = [
         ":bounds_check",
-        ":critical_section",
         ":dense_update_functor",
         ":gather_functor",
+        ":mutex_ops",
         ":scatter_functor",
         ":state",
         ":training_op_helpers",
@@ -1940,6 +2024,7 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -1954,6 +2039,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "partitioned_function_ops",
+    prefix = "partitioned_function_ops",
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "image",
     deps = [
@@ -2200,6 +2296,7 @@ tf_cc_tests(
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -2463,13 +2560,13 @@ tf_kernel_library(
 tf_kernel_library(
     name = "self_adjoint_eig_op",
     prefix = "self_adjoint_eig_op",
-    deps = LINALG_DEPS,
+    deps = LINALG_DEPS + ["//tensorflow/core:lib_internal"],
 )
 
 tf_kernel_library(
     name = "self_adjoint_eig_v2_op",
     prefix = "self_adjoint_eig_v2_op",
-    deps = LINALG_DEPS + if_cuda([
+    deps = LINALG_DEPS + ["//tensorflow/core:lib_internal"] + if_cuda([
         ":cast_op",
         ":cwise_op",
     ]),
@@ -2799,7 +2896,7 @@ tf_kernel_library(
         "//conditions:default": [],
     }) + if_mkl([
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
+        "@mkl_dnn",
     ]) + if_cuda([
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
@@ -3138,7 +3235,7 @@ tf_kernel_library(
         "conv_grad_ops_3d.cc",
         "deep_conv2d.cc",
     ] + select({
-        ":xsmm": ["xsmm_conv2d.cc"],
+        ":xsmm_convolutions": ["xsmm_conv2d.cc"],
         "//conditions:default": [],
     }),
     hdrs = [
@@ -3148,7 +3245,7 @@ tf_kernel_library(
         "gemm_functors.h",
         "winograd_transform.h",
     ] + select({
-        ":xsmm": ["xsmm_conv2d.h"],
+        ":xsmm_convolutions": ["xsmm_conv2d.h"],
         "//conditions:default": [],
     }),
     # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
@@ -3156,13 +3253,15 @@ tf_kernel_library(
     # on Windows. See https://github.com/tensorflow/tensorflow/issues/10521
     copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
-        ":xsmm": [
-            "TENSORFLOW_USE_LIBXSMM",
-            "EIGEN_USE_LIBXSMM",
+        ":xsmm_convolutions": [
+            "TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS",
         ],
         "//conditions:default": [],
     }) + select({
-        ":xsmm_backward": ["TENSORFLOW_USE_LIBXSMM_BACKWARD"],
+        ":xsmm": ["EIGEN_USE_LIBXSMM"],
+        "//conditions:default": [],
+    }) + select({
+        ":xsmm_backward_convolutions": ["TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS"],
         "//conditions:default": [],
     }),
     prefix = "conv_ops",
@@ -3179,7 +3278,7 @@ tf_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
     ] + select({
-        ":xsmm": [
+        ":xsmm_convolutions": [
             "@libxsmm_archive//:xsmm_avx",
         ],
         "//conditions:default": [],
@@ -3469,6 +3568,7 @@ tf_kernel_library(
         "pooling_ops_3d_gpu.cu.cc",
     ],
     deps = [
+        ":bounds_check",
         ":conv_2d",
         ":conv_3d",
         ":conv_ops",
@@ -3479,6 +3579,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:stream_executor",
         "//third_party/eigen3",
     ],
 )
@@ -4094,9 +4195,9 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "critical_section",
-    prefix = "critical_section",
-    deps = STATE_DEPS + [":captured_function"],
+    name = "mutex_ops",
+    prefix = "mutex_ops",
+    deps = STATE_DEPS + [":ops_util"],
 )
 
 tf_cc_test(
@@ -4142,8 +4243,10 @@ cc_library(
         ":as_string_op",
         ":base64_ops",
         ":reduce_join_op",
+        ":regex_replace_op",
         ":string_join_op",
         ":string_split_op",
+        ":string_strip_op",
         ":string_to_hash_bucket_op",
         ":substr_op",
     ],
@@ -4176,12 +4279,24 @@ tf_kernel_library(
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "regex_replace_op",
+    prefix = "regex_replace_op",
+    deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
+)
+
 tf_kernel_library(
     name = "string_split_op",
     prefix = "string_split_op",
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "string_strip_op",
+    prefix = "string_strip_op",
+    deps = STRING_DEPS,
+)
+
 tf_kernel_library(
     name = "substr_op",
     prefix = "substr_op",
@@ -4236,6 +4351,7 @@ tf_kernel_library(
     deps = [
         ":random_op",
         ":random_ops",
+        ":stateless_random_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -4868,7 +4984,7 @@ filegroup(
         "winograd_transform.h",
         ":android_extended_ops_headers",
     ] + select({
-        ":xsmm": [
+        ":xsmm_convolutions": [
             "xsmm_conv2d.h",
             "xsmm_conv2d.cc",
         ],
@@ -5021,6 +5137,7 @@ filegroup(
             # not used on Android. Those ops also do not compile if included,
             # unless we add the additional deps they need.
             "tf_record_reader_op.*",
+            "cudnn_rnn_ops.*",
             "lmdb_reader_op.*",
             "string_to_hash_bucket_op.*",
             "sdca_ops.*",
@@ -5045,11 +5162,16 @@ filegroup(
             "summary_interface.*",
             "summary_kernels.*",
             "spectrogram_convert_test_data.cc",
+            "decode_proto_op.cc",
+            "encode_proto_op.cc",
+            "rpc_op.cc",
+            "partitioned_function_ops.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
-            "critical_section.*",
+            "mutex_ops.*",
             "batch_kernels.*",
+            "regex_replace_op.cc",
         ],
     ),
     visibility = ["//visibility:public"],
@@ -5115,7 +5237,6 @@ tf_kernel_library(
     srcs = [
         "dequantize_op.cc",
         "meta_support.cc",
-        "quantization_utils.cc",
         "quantize_down_and_shrink_range.cc",
         "quantize_op.cc",
         "quantized_activation_ops.cc",
@@ -5136,7 +5257,6 @@ tf_kernel_library(
     ],
     hdrs = [
         "meta_support.h",
-        "quantization_utils.h",
         "reference_gemm.h",
     ],
     deps = [
@@ -5147,6 +5267,7 @@ tf_kernel_library(
         ":image_resizer_state",
         ":ops_util",
         ":pooling_ops",
+        ":quantization_utils",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -5210,6 +5331,7 @@ tf_cc_test(
     name = "quantization_utils_test",
     srcs = ["quantization_utils_test.cc"],
     deps = [
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
@@ -5272,6 +5394,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5333,6 +5456,7 @@ tf_cc_test(
         ":math",
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
@@ -5355,6 +5479,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
@@ -5419,6 +5544,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5439,6 +5565,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5478,6 +5605,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5534,6 +5662,7 @@ tf_cc_test(
         ":math",
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
@@ -5556,6 +5685,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5592,6 +5722,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
@@ -5613,6 +5744,7 @@ tf_cc_test(
     deps = [
         ":batch_norm_op",
         ":ops_testutil",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
@@ -5693,6 +5825,16 @@ tf_kernel_library(
     ],
 )
 
+cc_library(
+    name = "quantization_utils",
+    srcs = ["quantization_utils.cc"],
+    hdrs = ["quantization_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "@gemmlowp",
+    ],
+)
+
 cc_library(
     name = "remote_fused_graph_execute_utils",
     srcs = [
@@ -5809,6 +5951,8 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -5829,10 +5973,8 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-    ] + if_mkl([
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5846,10 +5988,8 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-    ] + if_mkl([
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5865,7 +6005,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5877,6 +6017,7 @@ tf_mkl_kernel_library(
     ],
     hdrs = ["mkl_pooling_ops_common.h"],
     deps = [
+        ":bounds_check",
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -5884,7 +6025,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5898,10 +6039,9 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-    ] + if_mkl([
+        "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5915,19 +6055,17 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-    ] + if_mkl([
+        "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_fused_batch_norm_op",
     srcs = ["mkl_fused_batch_norm_op.cc"],
-    deps = NN_DEPS + if_mkl([
+    deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5935,16 +6073,15 @@ tf_mkl_kernel_library(
     prefix = "mkl_aggregate_ops",
     deps = MATH_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_concat_op",
     prefix = "mkl_concat_op",
-    deps = ARRAY_DEPS + if_mkl([
+    deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5952,25 +6089,23 @@ tf_mkl_kernel_library(
     prefix = "mkl_reshape_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_identity_op",
     prefix = "mkl_identity_op",
-    deps = ARRAY_DEPS + if_mkl([
+    deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_lrn_op",
     prefix = "mkl_lrn_op",
-    deps = NN_DEPS + if_mkl([
+    deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn//:mkl_dnn",
-    ]),
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5997,6 +6132,13 @@ cc_library(
     ],
 )
 
+tf_kernel_library(
+    name = "boosted_trees_ops",
+    deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_ops",
+    ],
+)
+
 cc_library(
     name = "captured_function",
     hdrs = ["captured_function.h"],
@@ -6045,21 +6187,53 @@ tf_kernel_library(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
+tf_kernel_library(
+    name = "decode_proto_op",
+    srcs = [
+        "decode_proto_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:decode",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+tf_kernel_library(
+    name = "encode_proto_op",
+    srcs = ["encode_proto_op.cc"],
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
 )
 
+tf_kernel_library(
+    name = "rpc_op",
+    srcs = [
+        "rpc_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:rpc_ops_op_lib",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+        "//third_party/eigen3",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
 # Library to link with when compiling the cwise_op kernels directly,
 # e.g. for selective registration.
 # should not be linked by projects that also link the cwise_op library.
@@ -6068,7 +6242,6 @@ cc_library(
     srcs = [
         "cwise_ops_common.cc",
         "meta_support.cc",
-        "quantization_utils.cc",
     ],
     hdrs = [
         "cwise_ops.h",
@@ -6077,13 +6250,22 @@ cc_library(
         "cwise_ops_gpu_gradients.cu.h",
         "cwise_ops_gradients.h",
         "meta_support.h",
-        "quantization_utils.h",
     ],
     deps = [
         ":bounds_check",
+        ":quantization_utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
         "@gemmlowp",
     ],
 )
+
+# Header-only version of cwise_lib for clients that want to use the cwise_ops
+# functionality in their own custom ops.
+cc_header_only_library(
+    name = "cwise_lib_hdrs",
+    deps = [
+        ":cwise_lib",
+    ],
+)
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index a312e8e8a420f7f909b20b28f84bf55597a58aba..2ed1628bf1a84bf8729a949ca1b6d66ce58bdcdc 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -77,7 +77,8 @@ class AssignOp : public OpKernel {
 
       // 1. Try to reuse the rhs.
       std::unique_ptr<Tensor> input_alias = context->forward_input(
-          1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
+          1, OpKernelContext::Params::kNoReservation /*output_index*/,
+          old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
       if (input_alias != nullptr) {
         // Transfer ownership to the ref.
         context->replace_ref_input(0, *input_alias.release(),
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index ec9cbc2a9b5d4c1ac6d91913fc015e139fa2a068..ba38e1a188f48118e0d5acd782679bc80925a6ac 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -102,6 +102,9 @@ class AvgPoolingOp : public UnaryOp<T> {
   TensorFormat data_format_;
 };
 
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<double>("T"),
+    AvgPoolingOp<CPUDevice, double>);
 REGISTER_KERNEL_BUILDER(
     Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
     AvgPoolingOp<CPUDevice, float>);
@@ -153,10 +156,10 @@ class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
     TensorShape output_shape = params.forward_output_shape();
 
     if (data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, tensor_in, output_shape,
-          /*propagate_nans=*/false);
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               output_shape,
+                               /*propagate_nans=*/false);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context,
@@ -189,6 +192,7 @@ namespace functor {
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -198,6 +202,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     AvgPoolingOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    AvgPoolingOp<GPUDevice, double>);
 #endif  // GOOGLE_CUDA
 
 // The operation to compute AvgPool gradients.
@@ -410,10 +417,10 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
       output_shape.AddDim(shape_vec(i));
     }
 
-    DnnPoolingGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-        stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-        output_shape, /*propagate_nans=*/false);
+    DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
+                                 ksize_, stride_, padding_, data_format_,
+                                 nullptr, nullptr, out_backprop, output_shape,
+                                 /*propagate_nans=*/false);
   }
 
  private:
@@ -423,6 +430,12 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
   TensorFormat data_format_;
 };
 
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("orig_input_shape")
+                            .Label("cudnn"),
+                        AvgPoolingGradOp<GPUDevice, double>);
 REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
@@ -534,10 +547,10 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
                                 output->flat<T>().data(),       // bottom_diff
                                 context->eigen_gpu_device());   // d
     } else {
-      DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-          output_shape, /*propagate_nans=*/false);
+      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
+                                   ksize_, stride_, padding_, data_format_,
+                                   nullptr, nullptr, out_backprop, output_shape,
+                                   /*propagate_nans=*/false);
     }
   }
 
@@ -553,6 +566,11 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .TypeConstraint<float>("T")
                             .HostMemory("orig_input_shape"),
                         AvgPoolingGradOpCustomGPUKernel<float>);
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("orig_input_shape"),
+                        AvgPoolingGradOpCustomGPUKernel<double>);
 REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<Eigen::half>("T")
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 6537b42f1ed8856a5f701023eb5fc55ded278ec8..35511d5c313fb4b3794d00bd685ec4249580daa3 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -35,6 +35,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 DEFINE_GPU_KERNELS(Eigen::half)
 DEFINE_GPU_KERNELS(float)
+DEFINE_GPU_KERNELS(double)
 
 #undef DEFINE_GPU_KERNELS
 
@@ -99,6 +100,12 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
   return d.ok();
 }
 
+template bool RunAvePoolBackwardNHWC(
+    const double* const top_diff, const int num, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    double* const bottom_diff, const GPUDevice& d);
 template bool RunAvePoolBackwardNHWC(
     const float* const top_diff, const int num, const int height,
     const int width, const int channels, const int pooled_height,
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 546e51be53cee1833e8e1d4a15ea9b5be8a31506..8c99ded0a89e8065f4a7112db3b14eb2b27010c1 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -146,7 +146,7 @@ Status SplitCPU(OpKernelContext* context, const Tensor& input,
     suffix_dim_size *= input.shape().dim_size(i);
   }
   auto input_reshaped =
-      input.shaped<T, 3>({1, input.shape().dim_size(0), suffix_dim_size});
+      input.shaped<T, 2>({input.shape().dim_size(0), suffix_dim_size});
 
   int64 position = 0;
   for (const int64 size : sizes) {
@@ -155,13 +155,13 @@ Status SplitCPU(OpKernelContext* context, const Tensor& input,
     Tensor output;
     TF_RETURN_IF_ERROR(
         context->allocate_temp(input.dtype(), output_shape, &output));
-    auto output_shaped = output.shaped<T, 3>({1, size, suffix_dim_size});
+    auto output_shaped = output.shaped<T, 2>({size, suffix_dim_size});
 
-    Eigen::DSizes<Eigen::DenseIndex, 3> slice_indices{0, position, 0};
-    Eigen::DSizes<Eigen::DenseIndex, 3> slice_sizes{1, size, suffix_dim_size};
-    functor::Split<CPUDevice, T>()(context->eigen_device<CPUDevice>(),
-                                   output_shaped, input_reshaped, slice_indices,
-                                   slice_sizes);
+    Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{position, 0};
+    Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{size, suffix_dim_size};
+    functor::Split<CPUDevice, T, 2>()(context->eigen_device<CPUDevice>(),
+                                      output_shaped, input_reshaped,
+                                      slice_indices, slice_sizes);
 
     outputs->emplace_back(output);
 
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 43e716c542ac42835baabde057e45534d5442010..a1c03f99181a6cbb86295ddc948d41d9e72c0a4b 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -245,35 +245,35 @@ struct LaunchBatchMatMul<CPUDevice, Scalar> {
 
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
-class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
+class CublasScratchAllocator : public se::ScratchAllocator {
  public:
-  using Stream = ::perftools::gputools::Stream;
-  using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
+  using Stream = se::Stream;
+  using DeviceMemoryBytes = se::DeviceMemory<uint8>;
 
   CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
 
   int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
 
-  perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
+  se::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
       Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
 
     Status allocation_status(context_->allocate_temp(
         DT_UINT8, TensorShape({byte_size}), &temporary_memory));
     if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
+      return se::port::StatusOr<DeviceMemoryBytes>(
           DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
-    return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
+    return se::port::StatusOr<DeviceMemoryBytes>(
         DeviceMemoryBytes::MakeFromByteSize(
             temporary_memory.flat<uint8>().data(),
             temporary_memory.flat<uint8>().size()));
@@ -289,12 +289,11 @@ template <typename Scalar>
 struct LaunchBatchMatMul<GPUDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    constexpr perftools::gputools::blas::Transpose kTranspose =
-        is_complex<Scalar>::value
-            ? perftools::gputools::blas::Transpose::kConjugateTranspose
-            : perftools::gputools::blas::Transpose::kTranspose;
-    perftools::gputools::blas::Transpose trans[] = {
-        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
+    constexpr se::blas::Transpose kTranspose =
+        is_complex<Scalar>::value ? se::blas::Transpose::kConjugateTranspose
+                                  : se::blas::Transpose::kTranspose;
+    se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
+                                   kTranspose};
     const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
     const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
     const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
@@ -305,7 +304,7 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
     auto* stream = context->op_device_context()->stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
-    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+    typedef se::DeviceMemory<Scalar> DeviceMemoryType;
     std::vector<DeviceMemoryType> a_device_memory;
     std::vector<DeviceMemoryType> b_device_memory;
     std::vector<DeviceMemoryType> c_device_memory;
@@ -340,19 +339,16 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
       // This is a regular matrix*matrix or matrix*vector multiply. Avoid the
       // overhead of the scratch allocator and the batch interface.
       if (n == 1 &&
-          blas_transpose_b !=
-              perftools::gputools::blas::Transpose::kConjugateTranspose &&
-          blas_transpose_a !=
-              perftools::gputools::blas::Transpose::kConjugateTranspose) {
+          blas_transpose_b != se::blas::Transpose::kConjugateTranspose &&
+          blas_transpose_a != se::blas::Transpose::kConjugateTranspose) {
         // This is a matrix*vector multiply so use GEMV to compute A * b.
         // Here we are multiplying in the natural order, so we have to flip
         // the transposition flag to compensate for the tensor being stored
         // row-major. Since GEMV doesn't provide a way to just conjugate an
         // argument, we have to defer those cases to GEMM below.
-        auto gemv_trans_a =
-            blas_transpose_a == perftools::gputools::blas::Transpose::kTranspose
-                ? perftools::gputools::blas::Transpose::kNoTranspose
-                : perftools::gputools::blas::Transpose::kTranspose;
+        auto gemv_trans_a = blas_transpose_a == se::blas::Transpose::kTranspose
+                                ? se::blas::Transpose::kNoTranspose
+                                : se::blas::Transpose::kTranspose;
         bool blas_launch_status =
             stream
                 ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m,
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 1a45212ad29a7b8a578ce176db20eaf3d2193afd..1182ed42e7a9adacb6059e1e9da9b510b911f11f 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -78,14 +78,44 @@ Status HandleElementToSlice<Variant>(Tensor element, Tensor* parent,
   return Status::OK();
 }
 
-// TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
-// the data when possible.
-
+// TODO(b/78245576): Consider removing this overload.
 template <typename T>
-static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
-                                   int64 index) {
+void HandleSliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   element->flat<T>() = parent.flat_outer_dims<T>().chip(index, 0);
-  return Status::OK();
+}
+
+template <typename T>
+void HandleSliceToElement(Tensor* parent, Tensor* element, int64 index,
+                          bool can_move) {
+  element->flat<T>() = parent->flat_outer_dims<T>().chip(index, 0);
+}
+
+template <>
+void HandleSliceToElement<string>(Tensor* parent, Tensor* element, int64 index,
+                                  bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<string>();
+  auto element_flat = element->flat<string>();
+  if (can_move) {
+    for (int64 i = 0; i < element->NumElements(); ++i) {
+      element_flat(i) = std::move(parent_as_matrix(index, i));
+    }
+  } else {
+    element_flat = parent_as_matrix.chip(index, 0);
+  }
+}
+
+template <>
+void HandleSliceToElement<Variant>(Tensor* parent, Tensor* element, int64 index,
+                                   bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<Variant>();
+  auto element_flat = element->flat<Variant>();
+  if (can_move) {
+    for (int64 i = 0; i < element->NumElements(); ++i) {
+      element_flat(i) = std::move(parent_as_matrix(index, i));
+    }
+  } else {
+    element_flat = parent_as_matrix.chip(index, 0);
+  }
 }
 
 }  // namespace
@@ -104,6 +134,8 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
   switch (element.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+    TF_CALL_uint32(HANDLE_TYPE);
+    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
@@ -115,9 +147,10 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   TF_RETURN_IF_ERROR(ValidateInput(parent, *element, index));
 
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleSliceToElement<T>(parent, element, index); \
+#define HANDLE_TYPE(T)                               \
+  case DataTypeToEnum<T>::value: {                   \
+    HandleSliceToElement<T>(parent, element, index); \
+    return Status::OK();                             \
   }
 
   switch (parent.dtype()) {
@@ -130,6 +163,30 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   }
 }
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+//
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(*parent, *element, index));
+  bool can_move = parent->RefCountIsOne();
+
+#define HANDLE_TYPE(T)                                         \
+  case DataTypeToEnum<T>::value: {                             \
+    HandleSliceToElement<T>(parent, element, index, can_move); \
+    return Status::OK();                                       \
+  }
+
+  switch (parent->dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "MaybeMoveSliceToElement Unhandled data type: ", element->dtype());
+  }
+}
+
 // The following five functions are copied from padding_fifo_queue.cc.
 // TODO(mrry): Reconcile these functions with the similar methods in the
 // queue implementation.
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index a47bf1935db611417cea1d98ed8aff496efbf689..69098fbd1d8dccc2446edf502abd0d57c228640e 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -32,6 +32,12 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
 // Copies the index^th slice of parent (in the 0th dimension) into element.
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+//
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
+
 // Zero-initializes the tensor `element` using the scalar stored in `padding`.
 // Both `element` and `padding` must have matching `dtype`.
 Status SetElementZero(Tensor* element, const Tensor& padding);
diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index 4397410a5cee839a70bde69f34ca72e31530565f..de05c647d6bfc80a0368ee3edba8f31bccff33f9 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/google_*",
-        ],
-    ),
-)
-
 cc_library(
     name = "periodic_function_dynamic",
     srcs = ["periodic_function.cc"],
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 25c5f9cf424fdb286922548ea7ab0a35157a3502..f5ced95febfdfa7d8956d15066f42712d741a1c2 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 #include <memory>
-#include <queue>
 #include <random>
 #include <unordered_map>
 #include <vector>
@@ -29,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -44,49 +44,31 @@ template <typename TaskType>
 class ASBSQueue;
 }  // namespace internal
 
-// EXPERIMENTAL: API MAY BE SUBJECTED TO SUDDEN CHANGES.
-//
 // Shared batch scheduler designed to minimize latency. The scheduler keeps
 // track of a number of queues (one per model or model version) which are
 // continuously enqueuing requests. The scheduler groups the requests into
 // batches which it periodically sends off for processing (see
-// shared_batch_scheduler.h for more details). The AdaptiveSharedBatchScheduler
-// prioritizes batches by age (i.e. the batch's oldest request) irrespective of
-// queue or batch size.
-//
-// The scheduling decision currently exists in two flavors, controlled by the
-// option use_in_flight_batches_implementation. It is expected that setting this
-// option to true will give universally better results; after a period of
-// testing to confirm, the old implementation will be removed.
+// shared_batch_scheduler.h for more details). AdaptiveSharedBatchScheduler
+// (ASBS) prioritizes batches primarily by age (i.e. the batch's oldest request)
+// along with a configurable preference for scheduling larger batches first.
 //
-// If use_in_flight_batches_implementation is set to true, the scheduler
-// limits the number of batches which can be processed concurrently.  If a new
-// batch is created, and the number of in flight batches is below the limit,
-// the next (i.e. oldest) batch is immediately scheduled.  Similarly, when a
-// batch finishes processing, the limit is rechecked, and another batch may be
-// scheduled.  To avoid the need to carefully tune the limit for workload,
-// model type, platform, etc, it is dynamically adjusted in order to provide the
-// lowest latency.
 //
-// If use_in_flight_batches_implementation is set to false, the scheduler will
-// process the oldest batch at an adjustable rate, regardless of batch size.
-// The user can provide feedback to help set this rate to achieve some goal
-// (i.e. minimize overall latency, limit cpu usage, etc). The rate (or rather,
-// the corresponding period) is adjusted each time a batch is processed, using
-// an exponentially weighted moving average to smooth noisy feedback:
-// ewma_feedback = ((N - 1) * ewma_feedback + feedback()) / N
-// period *= (1 + K * emwa_feedback)
+// ASBS tries to keep the system busy by maintaining an adjustable number of
+// concurrently processed batches.  If a new batch is created, and the number of
+// in flight batches is below the target, the next (i.e. oldest) batch is
+// immediately scheduled.  Similarly, when a batch finishes processing, the
+// target is rechecked, and another batch may be scheduled.  To avoid the need
+// to carefully tune the target for workload, model type, platform, etc, it is
+// dynamically adjusted in order to provide the lowest average latency.
 //
 // Some potential use cases:
 // Hardware Accelerators (GPUs & TPUs) - If some phase of batch processing
 //   involves serial processing by a device, from a latency perspective it is
 //   desirable to keep the device evenly loaded, avoiding the need to wait for
 //   the device to process prior batches.
-//   feedback = num_pending_on_device() - desired_pending.
 // CPU utilization - If the batch processing is cpu dominated, you can reap
 //   latency gains when underutilized by increasing the processing rate, but
 //   back the rate off when the load increases to avoid overload.
-//   feedback = cpu_rate() - desired_cpu_rate.
 
 template <typename TaskType>
 class AdaptiveSharedBatchScheduler
@@ -101,13 +83,24 @@ class AdaptiveSharedBatchScheduler
   struct Options {
     // The name to use for the pool of batch threads.
     string thread_pool_name = {"batch_threads"};
-    // Number of batch processing threads; equivalently the maximum number of
-    // concurrently running batches.
+    // Number of batch processing threads - the maximum value of
+    // in_flight_batches_limit_.  It is recommended that this value be set by
+    // running the system under load, observing the learned value for
+    // in_flight_batches_limit_, and setting this maximum to ~ 2x the value.
+    // Under low load, in_flight_batches_limit_ has no substantial effect on
+    // latency and therefore undergoes a random walk.  Unreasonably large values
+    // for num_batch_threads allows for large in_flight_batches_limit_, which
+    // will harm latency for some time once load increases again.
     int64 num_batch_threads = port::NumSchedulableCPUs();
+    // Although batch selection is primarily based on age, this parameter
+    // specifies a preference for larger batches.  A full batch will be
+    // scheduled before an older, nearly empty batch as long as the age gap is
+    // less than full_batch_scheduling_boost_micros.  The optimal value for this
+    // parameter should be of order the batch processing latency, but must be
+    // chosen carefully, as too large a value will harm tail latency.
+    int64 full_batch_scheduling_boost_micros = 0;
     // The environment to use (typically only overridden by test code).
     Env* env = Env::Default();
-    // Which implementation to use (described in class comments above).
-    bool use_in_flight_batches_implementation = false;
     // Initial limit for number of batches being concurrently processed.
     // Non-integer values correspond to probabilistic limits - i.e. a value of
     // 3.2 results in an actual cap of 3 80% of the time, and 4 20% of the time.
@@ -116,28 +109,6 @@ class AdaptiveSharedBatchScheduler
     // numbers will give less noisy latency measurements, but will be less
     // responsive to changes in workload.
     int64 batches_to_average_over = 1000;
-
-    // TODO(kte): remove the rate based implementation and corresponding options
-    // below once testing confirms the superiority of the in flight batches
-    // implementation.
-    // Initial batch scheduling period in microseconds. Will be altered for
-    // non-zero rate_feedback.
-    double initial_scheduling_period_micros = 500;
-    // Minimum batch scheduling period in microseconds. Recommend setting this
-    // value greater than 0, otherwise it may take a while to recover from a
-    // sustained time of negative scheduling_period_feedback (which may occur
-    // under low load).
-    double min_scheduling_period_micros = 100;
-    // Maximum batch scheduling period in microseconds.
-    double max_scheduling_period_micros = 10000;
-    // Feedback function used to modify the scheduling period each time a batch
-    // is scheduled.  Should return values roughly O(1), with positive values
-    // resulting in an increased period.
-    std::function<double()> scheduling_period_feedback{[] { return 0.; }};
-    // To handle potentially noisy scheduling_period_feedback, the period is
-    // adjusted using an exponentially weighted moving average over the previous
-    // feedback_smoothing_batches batches.  Must be greater than 0.
-    int64 feedback_smoothing_batches = 10;
   };
 
   // Ownership is shared between the caller of Create() and any queues created
@@ -171,17 +142,11 @@ class AdaptiveSharedBatchScheduler
 
   explicit AdaptiveSharedBatchScheduler(const Options& options);
 
-  // Batch scheduling function which runs every scheduling_period_ microseconds.
-  // Only used when options_.use_in_flight_batches_implementation == false.
-  void ProcessOneBatch();
-
   // Tracks processing latency and adjusts in_flight_batches_limit to minimize.
-  // Only used when options_.use_in_flight_batches_implementation == true.
   void CallbackWrapper(const internal::ASBSBatch<TaskType>* batch,
                        BatchProcessor callback);
 
   // Schedules batch if in_flight_batches_limit_ is not met.
-  // Only used when options_.use_in_flight_batches_implementation == true.
   void MaybeScheduleNextBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // Notifies scheduler of non-empty batch which is eligible for processing.
@@ -194,17 +159,9 @@ class AdaptiveSharedBatchScheduler
 
   const Options options_;
 
-  struct BatchCompare {
-    bool operator()(const internal::ASBSBatch<TaskType>* a,
-                    const internal::ASBSBatch<TaskType>* b);
-  };
-
   // Collection of batches added by AddBatch, ordered by age. Owned by scheduler
   // until they are released for processing.
-  std::priority_queue<const internal::ASBSBatch<TaskType>*,
-                      std::vector<const internal::ASBSBatch<TaskType>*>,
-                      BatchCompare>
-      batches_ GUARDED_BY(mu_);
+  std::vector<const internal::ASBSBatch<TaskType>*> batches_ GUARDED_BY(mu_);
 
   // Unowned queues and callbacks added by AddQueue.
   std::unordered_map<const internal::ASBSQueue<TaskType>*, BatchProcessor>
@@ -212,41 +169,22 @@ class AdaptiveSharedBatchScheduler
 
   mutex mu_;
 
-  // Responsible for running ProcessOneBatch. PeriodicFunction was used in order
-  // to check for deletion so that the thread can be shut down.
-  // Only used when options_.use_in_flight_batches_implementation == false.
-  std::unique_ptr<PeriodicFunction> scheduling_thread_;
-
   // Responsible for running the batch processing callbacks.
   std::unique_ptr<thread::ThreadPool> batch_thread_pool_;
 
-  // Time interval in microseconds between successive ProcessOneBatch calls.
-  // Only used when options_.use_in_flight_batches_implementation == false.
-  double scheduling_period_;
-
-  // Exponentially weighted moving average of
-  // options_.scheduling_period_feedback() evaluated in each ProcessOneBatch
-  // call.
-  // Only used when options_.use_in_flight_batches_implementation == false.
-  double ewma_feedback_ = 0;
-
   // Limit on number of batches which can be concurrently processed.
   // Non-integer values correspond to probabilistic limits - i.e. a value of 3.2
   // results in an actual cap of 3 80% of the time, and 4 20% of the time.
-  // Only used when options_.use_in_flight_batches_implementation == true.
   double in_flight_batches_limit_ GUARDED_BY(mu_);
 
   // Number of batches currently being processed.
-  // Only used when options_.use_in_flight_batches_implementation == true.
   int64 in_flight_batches_ GUARDED_BY(mu_) = 0;
 
   // RNG engine and distribution.
-  // Only used when options_.use_in_flight_batches_implementation == true.
   std::default_random_engine rand_engine_;
   std::uniform_real_distribution<double> rand_double_;
 
   // Fields controlling the dynamic adjustment of in_flight_batches_limit_.
-  // Only used when options_.use_in_flight_batches_implementation == true.
   // Number of batches since the last in_flight_batches_limit_ adjustment.
   int64 batch_count_ GUARDED_BY(mu_) = 0;
   // Sum of processing latency for batches counted by batch_count_.
@@ -348,31 +286,10 @@ Status AdaptiveSharedBatchScheduler<TaskType>::Create(
     return errors::InvalidArgument("num_batch_threads must be positive; was ",
                                    options.num_batch_threads);
   }
-  if (options.min_scheduling_period_micros < 0) {
+  if (options.full_batch_scheduling_boost_micros < 0) {
     return errors::InvalidArgument(
-        "min_scheduling_period_micros must be >= 0; was ",
-        options.min_scheduling_period_micros);
-  }
-  if (options.min_scheduling_period_micros >
-      options.initial_scheduling_period_micros) {
-    return errors::InvalidArgument(
-        "initial_scheduling_period_micros (",
-        options.initial_scheduling_period_micros,
-        ") must be >= min_scheduling_period_micros (",
-        options.min_scheduling_period_micros, ")");
-  }
-  if (options.initial_scheduling_period_micros >
-      options.max_scheduling_period_micros) {
-    return errors::InvalidArgument(
-        "initial_scheduling_period_micros (",
-        options.initial_scheduling_period_micros,
-        ") must be <= max_scheduling_period_micros (",
-        options.max_scheduling_period_micros, ")");
-  }
-  if (options.feedback_smoothing_batches < 1) {
-    return errors::InvalidArgument(
-        "feedback_smoothing_batches must be positive; was ",
-        options.feedback_smoothing_batches);
+        "full_batch_scheduling_boost_micros can't be negative; was ",
+        options.full_batch_scheduling_boost_micros);
   }
   if (options.initial_in_flight_batches_limit > options.num_batch_threads) {
     return errors::InvalidArgument(
@@ -401,20 +318,12 @@ template <typename TaskType>
 AdaptiveSharedBatchScheduler<TaskType>::AdaptiveSharedBatchScheduler(
     const Options& options)
     : options_(options),
-      scheduling_period_(options.initial_scheduling_period_micros),
       in_flight_batches_limit_(options.initial_in_flight_batches_limit),
       rand_double_(0.0, 1.0) {
   std::random_device device;
   rand_engine_.seed(device());
-  PeriodicFunction::Options opts;
-  opts.thread_name_prefix = "scheduling_thread";
-  opts.env = GetEnv();
   batch_thread_pool_.reset(new thread::ThreadPool(
       GetEnv(), options.thread_pool_name, options.num_batch_threads));
-  if (!options.use_in_flight_batches_implementation) {
-    scheduling_thread_.reset(
-        new PeriodicFunction([this] { ProcessOneBatch(); }, 0, opts));
-  }
 }
 
 template <typename TaskType>
@@ -442,10 +351,8 @@ template <typename TaskType>
 void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
     const internal::ASBSBatch<TaskType>* batch) {
   mutex_lock l(mu_);
-  batches_.push(batch);
-  if (options_.use_in_flight_batches_implementation) {
-    MaybeScheduleNextBatch();
-  }
+  batches_.push_back(batch);
+  MaybeScheduleNextBatch();
 }
 
 template <typename TaskType>
@@ -462,10 +369,26 @@ void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
   // Non-integer limit handled probabilistially.
   if (in_flight_batches_limit_ - in_flight_batches_ < 1 &&
       rand_double_(rand_engine_) >
-          (in_flight_batches_limit_ - in_flight_batches_))
+          in_flight_batches_limit_ - in_flight_batches_) {
     return;
-  const internal::ASBSBatch<TaskType>* batch = batches_.top();
-  batches_.pop();
+  }
+  auto best_it = batches_.begin();
+  double best_score =
+      (*best_it)->creation_time_micros() -
+      options_.full_batch_scheduling_boost_micros * (*best_it)->size() /
+          static_cast<double>((*best_it)->queue()->max_task_size());
+  for (auto it = batches_.begin() + 1; it != batches_.end(); it++) {
+    const double score =
+        (*it)->creation_time_micros() -
+        options_.full_batch_scheduling_boost_micros * (*it)->size() /
+            static_cast<double>((*it)->queue()->max_task_size());
+    if (score < best_score) {
+      best_score = score;
+      best_it = it;
+    }
+  }
+  const internal::ASBSBatch<TaskType>* batch = *best_it;
+  batches_.erase(best_it);
   // Queue may destroy itself after ReleaseBatch is called.
   batch->queue()->ReleaseBatch(batch);
   batch_thread_pool_->Schedule(
@@ -523,51 +446,6 @@ void AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper(
   MaybeScheduleNextBatch();
 }
 
-template <typename TaskType>
-void AdaptiveSharedBatchScheduler<TaskType>::ProcessOneBatch() {
-  static const double kFeedbackMultiplier = .001;
-  const internal::ASBSBatch<TaskType>* batch = nullptr;
-  BatchProcessor callback;
-  const int64 start_time_micros = GetEnv()->NowMicros();
-  {
-    mutex_lock l(mu_);
-    if (!batches_.empty()) {
-      batch = batches_.top();
-      batches_.pop();
-      callback = queues_and_callbacks_[batch->queue()];
-    }
-  }
-  if (batch != nullptr) {
-    double feedback = options_.scheduling_period_feedback();
-    const int64 N = options_.feedback_smoothing_batches;
-    ewma_feedback_ = ((N - 1) * ewma_feedback_ + feedback) / N;
-    scheduling_period_ *= (1 + kFeedbackMultiplier * ewma_feedback_);
-    if (scheduling_period_ < options_.min_scheduling_period_micros) {
-      scheduling_period_ = options_.min_scheduling_period_micros;
-    } else if (scheduling_period_ > options_.max_scheduling_period_micros) {
-      scheduling_period_ = options_.max_scheduling_period_micros;
-    }
-    // Queue may destroy itself after ReleaseBatch is called.
-    batch->queue()->ReleaseBatch(batch);
-    batch_thread_pool_->Schedule([callback, batch] {
-      callback(std::unique_ptr<Batch<TaskType>>(
-          const_cast<internal::ASBSBatch<TaskType>*>(batch)));
-    });
-  }
-  const int64 sleep_time =
-      scheduling_period_ - (GetEnv()->NowMicros() - start_time_micros);
-  if (sleep_time > 0) {
-    GetEnv()->SleepForMicroseconds(sleep_time);
-  }
-}
-
-template <typename TaskType>
-bool AdaptiveSharedBatchScheduler<TaskType>::BatchCompare::operator()(
-    const internal::ASBSBatch<TaskType>* a,
-    const internal::ASBSBatch<TaskType>* b) {
-  return a->creation_time_micros() > b->creation_time_micros();
-}
-
 // ---------------- ASBSQueue ----------------
 
 namespace internal {
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
index 8ae8ca02eca20b5d1184e6e588f013d59d10464a..1be0c1f5c65cad3245a9392e2a2db61cfb2dd904 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
@@ -64,59 +64,6 @@ std::unique_ptr<Thread> CreateFakeClockAdvancerThread(
       }));
 }
 
-TEST(AdaptiveSharedBatchSchedulerTest, Basic) {
-  for (const bool delete_scheduler_early : {false, true}) {
-    for (const bool delete_queue_1_early : {false, true}) {
-      int queue_0_tasks = 0;
-      auto queue_0_callback =
-          [&queue_0_tasks](std::unique_ptr<Batch<FakeTask>> batch) {
-            ASSERT_TRUE(batch->IsClosed());
-            EXPECT_GT(batch->num_tasks(), 0);
-            for (int i = 0; i < batch->num_tasks(); i++) {
-              queue_0_tasks += batch->task(i).size();
-            }
-          };
-      int queue_1_tasks = 0;
-      auto queue_1_callback =
-          [&queue_1_tasks](std::unique_ptr<Batch<FakeTask>> batch) {
-            ASSERT_TRUE(batch->IsClosed());
-            EXPECT_GT(batch->num_tasks(), 0);
-            for (int i = 0; i < batch->num_tasks(); i++) {
-              queue_1_tasks += batch->task(i).size();
-            }
-          };
-      {
-        std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
-        TF_ASSERT_OK(
-            AdaptiveSharedBatchScheduler<FakeTask>::Create({}, &scheduler));
-
-        // Create two queues.
-        std::unique_ptr<BatchScheduler<FakeTask>> queue_0;
-        TF_ASSERT_OK(scheduler->AddQueue({}, queue_0_callback, &queue_0));
-        std::unique_ptr<BatchScheduler<FakeTask>> queue_1;
-        TF_ASSERT_OK(scheduler->AddQueue({}, queue_1_callback, &queue_1));
-
-        if (delete_scheduler_early) {
-          // Delete our copy of the scheduler. The queues should keep it alive
-          // under the covers.
-          scheduler = nullptr;
-        }
-        // Submit tasks to the two queues, and (optionally) remove the queues.
-        TF_ASSERT_OK(ScheduleTask(1, queue_0.get()));
-        TF_ASSERT_OK(ScheduleTask(2, queue_1.get()));
-        TF_ASSERT_OK(ScheduleTask(3, queue_0.get()));
-        TF_ASSERT_OK(ScheduleTask(4, queue_1.get()));
-        if (delete_queue_1_early) {
-          queue_1 = nullptr;
-        }
-        TF_ASSERT_OK(ScheduleTask(5, queue_0.get()));
-      }
-      EXPECT_EQ(queue_0_tasks, 9);
-      EXPECT_EQ(queue_1_tasks, 6);
-    }
-  }
-}
-
 TEST(AdaptiveSharedBatchSchedulerTest, BadOptions) {
   using Scheduler = AdaptiveSharedBatchScheduler<FakeTask>;
   std::shared_ptr<Scheduler> scheduler;
@@ -124,24 +71,6 @@ TEST(AdaptiveSharedBatchSchedulerTest, BadOptions) {
   options.num_batch_threads = 0;
   EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
   options = Scheduler::Options();
-  options.min_scheduling_period_micros = 50;
-  options.max_scheduling_period_micros = 100;
-  options.initial_scheduling_period_micros = 1;
-  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
-  options = Scheduler::Options();
-  options.min_scheduling_period_micros = 50;
-  options.max_scheduling_period_micros = 100;
-  options.initial_scheduling_period_micros = 1000;
-  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
-  options = Scheduler::Options();
-  options.min_scheduling_period_micros = 100;
-  options.max_scheduling_period_micros = 50;
-  options.initial_scheduling_period_micros = 75;
-  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
-  options = Scheduler::Options();
-  options.feedback_smoothing_batches = 0;
-  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
-  options = Scheduler::Options();
   options.initial_in_flight_batches_limit = 0.5;
   EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
   options = Scheduler::Options();
@@ -153,301 +82,8 @@ TEST(AdaptiveSharedBatchSchedulerTest, BadOptions) {
   EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
 }
 
-TEST(AdaptiveSharedBatchSchedulerTest, ObeysQueueOptions) {
-  test_util::FakeClockEnv env(Env::Default());
-  Notification start_teardown, stop_teardown;
-  std::unique_ptr<Thread> teardown_thread =
-      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
-  {
-    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
-    options.initial_scheduling_period_micros = 1000;
-    options.env = &env;
-    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
-    TF_ASSERT_OK(
-        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
-    std::unique_ptr<BatchScheduler<FakeTask>> queue_0;
-    std::unique_ptr<BatchScheduler<FakeTask>> queue_1;
-    int queue_0_tasks = 0;
-    int queue_1_tasks = 0;
-    auto queue_0_callback = [&queue_0_tasks,
-                             &env](std::unique_ptr<Batch<FakeTask>> batch) {
-      ASSERT_TRUE(batch->IsClosed());
-      EXPECT_GT(batch->num_tasks(), 0);
-      for (int i = 0; i < batch->num_tasks(); i++) {
-        queue_0_tasks += batch->task(i).size();
-      }
-      env.SleepForMicroseconds(1);
-    };
-    auto queue_1_callback = [&queue_1_tasks,
-                             &env](std::unique_ptr<Batch<FakeTask>> batch) {
-      ASSERT_TRUE(batch->IsClosed());
-      EXPECT_GT(batch->num_tasks(), 0);
-      for (int i = 0; i < batch->num_tasks(); i++) {
-        queue_1_tasks += batch->task(i).size();
-      }
-      env.SleepForMicroseconds(1);
-    };
-    AdaptiveSharedBatchScheduler<FakeTask>::QueueOptions queue_options;
-    queue_options.max_batch_size = 10;
-    queue_options.max_enqueued_batches = 0;
-    // Queue must have max_enqueued_batchs > 1.
-    EXPECT_FALSE(
-        scheduler->AddQueue(queue_options, queue_0_callback, &queue_0).ok());
-    queue_options.max_enqueued_batches = 2;
-    TF_ASSERT_OK(
-        scheduler->AddQueue(queue_options, queue_0_callback, &queue_0));
-    EXPECT_EQ(10, queue_0->max_task_size());
-    queue_options.max_batch_size = 0;
-    // Queue must have max_batch_size > 0.
-    EXPECT_FALSE(
-        scheduler->AddQueue(queue_options, queue_1_callback, &queue_1).ok());
-    queue_options.max_batch_size = 2;
-    queue_options.max_enqueued_batches = 1;
-    TF_ASSERT_OK(
-        scheduler->AddQueue(queue_options, queue_1_callback, &queue_1));
-
-    // Wait for scheduling_thread to sleep.
-    env.BlockUntilThreadsAsleep(1);
-    // Task larger than max_batch_size shouldn't schedule.
-    EXPECT_FALSE(ScheduleTask(15, queue_0.get()).ok());
-    TF_ASSERT_OK(ScheduleTask(5, queue_0.get()));
-    TF_ASSERT_OK(ScheduleTask(5, queue_0.get()));
-    env.AdvanceByMicroseconds(1);
-
-    // Task larger than max_batch_size shouldn't schedule.
-    EXPECT_FALSE(ScheduleTask(3, queue_1.get()).ok());
-    TF_ASSERT_OK(ScheduleTask(1, queue_1.get()));
-    TF_ASSERT_OK(ScheduleTask(1, queue_1.get()));
-    env.AdvanceByMicroseconds(1);
-    // Exceeds max_enqueued_batches, shouldn't schedule.
-    EXPECT_FALSE(ScheduleTask(1, queue_1.get()).ok());
-
-    TF_ASSERT_OK(ScheduleTask(5, queue_0.get()));
-    // Exceeds max_enqueued_batches, shouldn't schedule.
-    EXPECT_FALSE(ScheduleTask(6, queue_0.get()).ok());
-    TF_ASSERT_OK(ScheduleTask(4, queue_0.get()));
-
-    // Batches should be processed in order from oldest to newest.
-    env.AdvanceByMicroseconds(1000);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(queue_0_tasks, 10);
-    EXPECT_EQ(queue_1_tasks, 0);
-
-    env.AdvanceByMicroseconds(1000);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(queue_0_tasks, 10);
-    EXPECT_EQ(queue_1_tasks, 2);
-
-    env.AdvanceByMicroseconds(1000);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(queue_0_tasks, 19);
-    EXPECT_EQ(queue_1_tasks, 2);
-    start_teardown.Notify();
-  }
-  stop_teardown.Notify();
-}
-
-TEST(AdaptiveSharedBatchSchedulerTest, RateFeedback) {
-  test_util::FakeClockEnv env(Env::Default());
-  Notification start_teardown, stop_teardown;
-  std::unique_ptr<Thread> teardown_thread =
-      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
-  {
-    double feedback = 0;
-    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
-    options.initial_scheduling_period_micros = 1000;
-    options.min_scheduling_period_micros = 200;
-    options.max_scheduling_period_micros = 2000;
-    options.env = &env;
-    options.scheduling_period_feedback = [&feedback] { return feedback; };
-    options.feedback_smoothing_batches = 1;
-    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
-    TF_ASSERT_OK(
-        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
-    std::unique_ptr<BatchScheduler<FakeTask>> queue;
-    int scheduled_items = 0;
-    auto queue_callback = [&scheduled_items,
-                           &env](std::unique_ptr<Batch<FakeTask>> batch) {
-      ASSERT_TRUE(batch->IsClosed());
-      EXPECT_GT(batch->num_tasks(), 0);
-      scheduled_items = 0;
-      for (int i = 0; i < batch->num_tasks(); i++) {
-        scheduled_items += batch->task(i).size();
-      }
-      env.SleepForMicroseconds(1);
-    };
-
-    TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
-
-    // Wait for scheduling_thread to sleep.
-    env.BlockUntilThreadsAsleep(1);
-    // Enqueue 6 batches.
-    for (int i = 0; i < 6; i++) {
-      TF_ASSERT_OK(ScheduleTask(900 + i, queue.get()));
-      env.AdvanceByMicroseconds(1);
-    }
-    feedback = -500;
-    env.AdvanceByMicroseconds(994);
-    env.BlockUntilThreadsAsleep(2);  // scheduling period = 500 usec.
-    EXPECT_EQ(scheduled_items, 900);
-    env.AdvanceByMicroseconds(500);
-    env.BlockUntilThreadsAsleep(2);  // scheduling period = 250 usec.
-    EXPECT_EQ(scheduled_items, 901);
-    feedback = 0;
-    env.AdvanceByMicroseconds(250);
-    env.BlockUntilThreadsAsleep(2);  // scheduling period = 250 usec.
-    EXPECT_EQ(scheduled_items, 902);
-    feedback = 10000;  // large feedback should hit max_scheduling_period.
-    env.AdvanceByMicroseconds(250);
-    env.BlockUntilThreadsAsleep(2);  // scheduling period = 2000 usec.
-    EXPECT_EQ(scheduled_items, 903);
-    feedback = -10000;  // large feedback should hit min_scheduling_period.
-    env.AdvanceByMicroseconds(1999);
-    // No callback scheduled, only scheduling thread sleeping.
-    env.BlockUntilThreadsAsleep(1);
-    EXPECT_EQ(scheduled_items, 903);
-    env.AdvanceByMicroseconds(1);
-    env.BlockUntilThreadsAsleep(2);  // scheduling period = 200 usec.
-    EXPECT_EQ(scheduled_items, 904);
-    env.AdvanceByMicroseconds(200);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(scheduled_items, 905);
-    start_teardown.Notify();
-  }
-  stop_teardown.Notify();
-}
-
-TEST(AdaptiveSharedBatchSchedulerTest, FeedbackSmoothing) {
-  test_util::FakeClockEnv env(Env::Default());
-  Notification start_teardown, stop_teardown;
-  std::unique_ptr<Thread> teardown_thread =
-      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
-  {
-    double feedback = 0;
-    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
-    options.initial_scheduling_period_micros = 1000;
-    options.env = &env;
-    options.scheduling_period_feedback = [&feedback] { return feedback; };
-    options.feedback_smoothing_batches = 3;
-    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
-    TF_ASSERT_OK(
-        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
-    std::unique_ptr<BatchScheduler<FakeTask>> queue;
-    int scheduled_items = 0;
-    auto queue_callback = [&scheduled_items,
-                           &env](std::unique_ptr<Batch<FakeTask>> batch) {
-      ASSERT_TRUE(batch->IsClosed());
-      EXPECT_GT(batch->num_tasks(), 0);
-      scheduled_items = 0;
-      for (int i = 0; i < batch->num_tasks(); i++) {
-        scheduled_items += batch->task(i).size();
-      }
-      env.SleepForMicroseconds(1);
-    };
-
-    TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
-
-    // Wait for scheduling_thread to sleep.
-    env.BlockUntilThreadsAsleep(1);
-    // Enqueue 4 batches.
-    for (int i = 0; i < 4; i++) {
-      TF_ASSERT_OK(ScheduleTask(900 + i, queue.get()));
-      env.AdvanceByMicroseconds(1);
-    }
-    feedback = -300;
-    env.AdvanceByMicroseconds(996);
-    env.BlockUntilThreadsAsleep(2);
-    // ewma_feedback = 100, scheduling_period = 900.
-    EXPECT_EQ(scheduled_items, 900);
-    env.AdvanceByMicroseconds(899);
-    // No callback scheduled, only scheduling thread sleeping.
-    env.BlockUntilThreadsAsleep(1);
-    EXPECT_EQ(scheduled_items, 900);
-    env.AdvanceByMicroseconds(1);
-    env.BlockUntilThreadsAsleep(2);
-    // ewma_feedback = 167, scheduling_period = 750.
-    EXPECT_EQ(scheduled_items, 901);
-    env.AdvanceByMicroseconds(749);
-    // No callback scheduled, only scheduling thread sleeping.
-    env.BlockUntilThreadsAsleep(1);
-    EXPECT_EQ(scheduled_items, 901);
-    feedback = 1000 / 3.;
-    env.AdvanceByMicroseconds(1);
-    env.BlockUntilThreadsAsleep(2);
-    // emwa_feedback = 0, scheduling_period = 750.
-    EXPECT_EQ(scheduled_items, 902);
-    env.AdvanceByMicroseconds(749);
-    // No callback scheduled, only scheduling thread sleeping.
-    env.BlockUntilThreadsAsleep(1);
-    EXPECT_EQ(scheduled_items, 902);
-    env.AdvanceByMicroseconds(1);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(scheduled_items, 903);
-    start_teardown.Notify();
-  }
-  stop_teardown.Notify();
-}
-
-TEST(AdaptiveSharedBatchSchedulerTest, QueueCapacityInfo) {
-  test_util::FakeClockEnv env(Env::Default());
-  Notification start_teardown, stop_teardown;
-  std::unique_ptr<Thread> teardown_thread =
-      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
-  {
-    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
-    options.initial_scheduling_period_micros = 1000;
-    options.env = &env;
-    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
-    TF_ASSERT_OK(
-        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
-    std::unique_ptr<BatchScheduler<FakeTask>> queue;
-    int scheduled_items = 0;
-    auto queue_callback = [&scheduled_items,
-                           &env](std::unique_ptr<Batch<FakeTask>> batch) {
-      ASSERT_TRUE(batch->IsClosed());
-      EXPECT_GT(batch->num_tasks(), 0);
-      scheduled_items = 0;
-      for (int i = 0; i < batch->num_tasks(); i++) {
-        scheduled_items += batch->task(i).size();
-      }
-      env.SleepForMicroseconds(1);
-    };
-    AdaptiveSharedBatchScheduler<FakeTask>::QueueOptions queue_options;
-    queue_options.max_batch_size = 10;
-    queue_options.max_enqueued_batches = 10;
-    TF_ASSERT_OK(scheduler->AddQueue(queue_options, queue_callback, &queue));
-
-    // Wait for scheduling_thread to sleep.
-    env.BlockUntilThreadsAsleep(1);
-    // Enqueue 3 tasks.
-    EXPECT_EQ(queue->NumEnqueuedTasks(), 0);
-    EXPECT_EQ(queue->SchedulingCapacity(), 100);
-    TF_ASSERT_OK(ScheduleTask(5, queue.get()));
-    EXPECT_EQ(queue->NumEnqueuedTasks(), 1);
-    EXPECT_EQ(queue->SchedulingCapacity(), 95);
-    env.AdvanceByMicroseconds(1);
-    TF_ASSERT_OK(ScheduleTask(6, queue.get()));
-    EXPECT_EQ(queue->NumEnqueuedTasks(), 2);
-    EXPECT_EQ(queue->SchedulingCapacity(), 84);
-    env.AdvanceByMicroseconds(1);
-    TF_ASSERT_OK(ScheduleTask(1, queue.get()));
-    EXPECT_EQ(queue->NumEnqueuedTasks(), 3);
-    EXPECT_EQ(queue->SchedulingCapacity(), 83);
-
-    env.AdvanceByMicroseconds(998);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(scheduled_items, 5);
-    env.AdvanceByMicroseconds(1000);
-    env.BlockUntilThreadsAsleep(2);
-    EXPECT_EQ(scheduled_items, 7);
-    start_teardown.Notify();
-  }
-  stop_teardown.Notify();
-}
-
-TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesImplementation) {
+TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesLimit) {
   AdaptiveSharedBatchScheduler<FakeTask>::Options options;
-  options.use_in_flight_batches_implementation = true;
   options.initial_in_flight_batches_limit = 2;
   options.batches_to_average_over = 1000;
   mutex mu;
@@ -476,7 +112,7 @@ TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesImplementation) {
   std::unique_ptr<BatchScheduler<FakeTask>> queue;
   TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
 
-  // Enqueue 3 batches.
+  // Enqueue 3 tasks, should result in 3 batches.
   for (int i = 0; i < 3; i++) {
     TF_ASSERT_OK(ScheduleTask(100, queue.get()));
   }
@@ -490,7 +126,6 @@ TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesLimitTuning) {
   {
     AdaptiveSharedBatchScheduler<FakeTask>::Options options;
     options.env = &env;
-    options.use_in_flight_batches_implementation = true;
     options.initial_in_flight_batches_limit = 2;
     options.batches_to_average_over = 1;
     auto queue_callback = [&env](std::unique_ptr<Batch<FakeTask>> batch) {
@@ -544,6 +179,196 @@ TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesLimitTuning) {
   }
   stop_teardown.Notify();
 }
+
+TEST(AdaptiveSharedBatchSchedulerTest, FullBatchSchedulingBoostMicros) {
+  test_util::FakeClockEnv env(Env::Default());
+  Notification start_teardown, stop_teardown;
+  std::unique_ptr<Thread> teardown_thread =
+      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
+  {
+    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+    options.env = &env;
+    options.initial_in_flight_batches_limit = 1;
+    options.batches_to_average_over = 1000;
+    options.full_batch_scheduling_boost_micros = 100;
+    mutex mu;
+    int processed_batches = 0;
+    Notification finish_processing;
+    auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                              std::unique_ptr<Batch<FakeTask>> batch) {
+      ASSERT_TRUE(batch->IsClosed());
+      finish_processing.WaitForNotification();
+      mutex_lock l(mu);
+      processed_batches++;
+      switch (processed_batches) {
+        case 1:
+          EXPECT_EQ(100, batch->size());
+          break;
+        case 2:
+          EXPECT_EQ(50, batch->size());
+          break;
+        case 3:
+          EXPECT_EQ(900, batch->size());
+          break;
+        case 4:
+          EXPECT_EQ(200, batch->size());
+          break;
+        default:
+          EXPECT_TRUE(false) << "Should only have 4 batches";
+      }
+    };
+    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+    TF_ASSERT_OK(
+        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+    AdaptiveSharedBatchScheduler<FakeTask>::QueueOptions queue_options;
+    std::unique_ptr<BatchScheduler<FakeTask>> queue1;
+    std::unique_ptr<BatchScheduler<FakeTask>> queue2;
+    queue_options.max_batch_size = 1000;
+    TF_ASSERT_OK(scheduler->AddQueue(queue_options, queue_callback, &queue1));
+    queue_options.max_batch_size = 100;
+    TF_ASSERT_OK(scheduler->AddQueue(queue_options, queue_callback, &queue2));
+
+    // First batch immediately processed.
+    TF_ASSERT_OK(ScheduleTask(100, queue1.get()));
+
+    TF_ASSERT_OK(ScheduleTask(100, queue1.get()));
+    env.AdvanceByMicroseconds(10);
+    TF_ASSERT_OK(ScheduleTask(100, queue1.get()));
+    env.AdvanceByMicroseconds(10);
+
+    TF_ASSERT_OK(ScheduleTask(50, queue2.get()));
+    env.AdvanceByMicroseconds(45);
+
+    TF_ASSERT_OK(ScheduleTask(900, queue1.get()));
+
+    // Second batch - creation time: 0, fullness: 0.2, sched score: -20
+    // Third batch - creation time: 20, fullness: 0.5, sched score: -30
+    // Fourth batch - creation time: 65, fullness: 0.9, sched score: -25
+
+    finish_processing.Notify();
+    start_teardown.Notify();
+  }
+  stop_teardown.Notify();
+}
+
+TEST(AdaptiveSharedBatchSchedulerTest, DeleteQueue) {
+  AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1000;
+  mutex mu;
+  int processed_batches = 0;
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    finish_processing.WaitForNotification();
+    mu.lock();
+    processed_batches++;
+    mu.unlock();
+  };
+
+  std::unique_ptr<Thread> queue_deleter;
+  std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 2 tasks, should result in 2 batches.
+  for (int i = 0; i < 2; i++) {
+    TF_ASSERT_OK(ScheduleTask(100, queue.get()));
+  }
+  // Delete queue, should be kept alive until empty.
+  queue_deleter.reset(Env::Default()->StartThread(
+      {}, "QueueDeleterThread", [&queue, &mu, &processed_batches] {
+        queue.reset();
+        mutex_lock l(mu);
+        EXPECT_EQ(processed_batches, 2);
+      }));
+  // Give queue_deleter thread time to delete queue.
+  Env::Default()->SleepForMicroseconds(1000);
+  finish_processing.Notify();
+}
+
+TEST(AdaptiveSharedBatchSchedulerTest, DeleteScheduler) {
+  AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1000;
+  mutex mu;
+  int processed_batches = 0;
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    finish_processing.WaitForNotification();
+    mu.lock();
+    processed_batches++;
+    mu.unlock();
+  };
+
+  std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 2 tasks, should result in 2 batches.
+  for (int i = 0; i < 2; i++) {
+    TF_ASSERT_OK(ScheduleTask(100, queue.get()));
+  }
+  // Delete scheduler, should be kept alive until queues are empty.
+  scheduler.reset();
+  finish_processing.Notify();
+  while (true) {
+    mutex_lock l(mu);
+    if (processed_batches == 2) break;
+  }
+}
+
+TEST(AdaptiveSharedBatchSchedulerTest, QueueCapacityInfo) {
+  AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1000;
+  mutex mu;
+  int processed_batches = 0;
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    mu.lock();
+    int batch_num = ++processed_batches;
+    mu.unlock();
+    if (batch_num == 1) {
+      finish_processing.WaitForNotification();
+    }
+  };
+  std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 2 tasks, should result in 2 batches.
+  for (int i = 0; i < 2; i++) {
+    TF_ASSERT_OK(ScheduleTask(100, queue.get()));
+  }
+  // First batch was immediately processed, no longer counts as enqueued.
+  EXPECT_EQ(queue->NumEnqueuedTasks(), 1);
+  EXPECT_EQ(queue->SchedulingCapacity(), 9 * 1000 + 900);
+  // Enqueue 2 more tasks, should fall in same batch.
+  TF_ASSERT_OK(ScheduleTask(100, queue.get()));
+  TF_ASSERT_OK(ScheduleTask(200, queue.get()));
+  EXPECT_EQ(queue->NumEnqueuedTasks(), 3);
+  EXPECT_EQ(queue->SchedulingCapacity(), 9 * 1000 + 600);
+  // Enqueue 1 more task, should create new batch.
+  TF_ASSERT_OK(ScheduleTask(700, queue.get()));
+  EXPECT_EQ(queue->NumEnqueuedTasks(), 4);
+  EXPECT_EQ(queue->SchedulingCapacity(), 8 * 1000 + 300);
+  finish_processing.Notify();
+}
 }  // namespace anonymous
 }  // namespace serving
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index b77289aded437b2e6955ced3f7eca2aa5bd182dd..b4bce90841ff3d383377c5949f1df5ce3a0ee048 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -135,7 +136,7 @@ class SharedBatchScheduler
     // (inclusive). If there is a need to quantize the batch sizes, i.e. only
     // submit batches whose size is in a small set of allowed sizes, that can be
     // done by adding padding in the process-batch callback.
-    int max_batch_size = 1000;
+    size_t max_batch_size = 1000;
 
     // If a task has been enqueued for this amount of time (in microseconds),
     // and a thread is available, the scheduler will immediately form a batch
@@ -156,7 +157,7 @@ class SharedBatchScheduler
     // If this limit is reached, Schedule() will return an UNAVAILABLE error.
     // See the class documentation above for guidelines on how to tune this
     // parameter.
-    int max_enqueued_batches = 10;
+    size_t max_enqueued_batches = 10;
   };
   Status AddQueue(const QueueOptions& options,
                   std::function<void(std::unique_ptr<Batch<TaskType>>)>
@@ -393,7 +394,7 @@ Status SharedBatchScheduler<TaskType>::AddQueue(
     std::function<void(std::unique_ptr<Batch<TaskType>>)>
         process_batch_callback,
     std::unique_ptr<BatchScheduler<TaskType>>* queue) {
-  if (options.max_batch_size <= 0) {
+  if (options.max_batch_size == 0) {
     return errors::InvalidArgument("max_batch_size must be positive; was ",
                                    options.max_batch_size);
   }
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index 368993c8271a1c83339eb5a55ccb2cc6851e201a..9fda7169a8bf3b6543c0a539a19e01899fea9571 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -393,8 +393,8 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     if (channel == 0) return;
     auto* stream = context->op_device_context()->stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-    perftools::gputools::DeviceMemoryBase output_ptr(
-        output->flat<T>().data(), output->NumElements() * sizeof(T));
+    se::DeviceMemoryBase output_ptr(output->flat<T>().data(),
+                                    output->NumElements() * sizeof(T));
     stream->ThenMemZero(&output_ptr, output->NumElements() * sizeof(T));
     if (output_backprop.NumElements() > 0) {
       BiasGradGPU<T>::compute(context->template eigen_device<Device>(),
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..62327dfe1d044bd05966d420e557fc39edd84afd
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -0,0 +1,89 @@
+# Description:
+#   OpKernels for boosted trees ops.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library",
+)
+
+tf_proto_library(
+    name = "boosted_trees_proto",
+    srcs = ["boosted_trees.proto"],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
+tf_kernel_library(
+    name = "prediction_ops",
+    srcs = ["prediction_ops.cc"],
+    deps = [
+        ":resource_ops",
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "resources",
+    srcs = ["resources.cc"],
+    hdrs = ["resources.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "resource_ops",
+    srcs = ["resource_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.cc"],
+    deps = [
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "training_ops",
+    srcs = ["training_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "boosted_trees_ops",
+    deps = [
+        ":prediction_ops",
+        ":resource_ops",
+        ":stats_ops",
+        ":training_ops",
+    ],
+)
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
new file mode 100644
index 0000000000000000000000000000000000000000..55599de7315a4610cdbc5937e719c0bd2b4d9c34
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -0,0 +1,117 @@
+syntax = "proto3";
+
+package tensorflow.boosted_trees;
+option cc_enable_arenas = true;
+option java_outer_classname = "BoostedTreesProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// Node describes a node in a tree.
+message Node {
+  oneof node {
+    Leaf leaf = 1;
+    BucketizedSplit bucketized_split = 2;
+  }
+  NodeMetadata metadata = 777;
+}
+
+// NodeMetadata encodes metadata associated with each node in a tree.
+message NodeMetadata {
+  // The gain associated with this node.
+  float gain = 1;
+
+  // The original leaf node before this node was split.
+  Leaf original_leaf = 2;
+}
+
+// Leaves can either hold dense or sparse information.
+message Leaf {
+  oneof leaf {
+    // See third_party/tensorflow/contrib/decision_trees/
+    // proto/generic_tree_model.proto
+    // for a description of how vector and sparse_vector might be used.
+    Vector vector = 1;
+    SparseVector sparse_vector = 2;
+  }
+  float scalar = 3;
+}
+
+message Vector {
+  repeated float value = 1;
+}
+
+message SparseVector {
+  repeated int32 index = 1;
+  repeated float value = 2;
+}
+
+message BucketizedSplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_id = 1;
+  int32 threshold = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
+// Tree describes a list of connected nodes.
+// Node 0 must be the root and can carry any payload including a leaf
+// in the case of representing the bias.
+// Note that each node id is implicitly its index in the list of nodes.
+message Tree {
+  repeated Node nodes = 1;
+}
+
+message TreeMetadata {
+  // Number of layers grown for this tree.
+  int32 num_layers_grown = 2;
+
+  // Whether the tree is finalized in that no more layers can be grown.
+  bool is_finalized = 3;
+
+  // If tree was finalized and post pruning happened, it is possible that cache
+  // still refers to some nodes that were deleted or that the node ids changed
+  // (e.g. node id 5 became node id 2 due to pruning of the other branch).
+  // The mapping below allows us to understand where the old ids now map to and
+  // how the values should be adjusted due to post-pruning.
+  // The size of the list should be equal to the number of nodes in the tree
+  // before post-pruning happened.
+  // If the node was pruned, it will have new_node_id equal to the id of a node
+  // that this node was collapsed into. For a node that didn't get pruned, it is
+  // possible that its id still changed, so new_node_id will have the
+  // corresponding id in the pruned tree.
+  // If post-pruning didn't happen, or it did and it had no effect (e.g. no
+  // nodes got pruned), this list will be empty.
+  repeated PostPruneNodeUpdate post_pruned_nodes_meta = 4;
+
+  message PostPruneNodeUpdate {
+    int32 new_node_id = 1;
+    float logit_change = 2;
+  }
+}
+
+message GrowingMetadata {
+  // Number of trees that we have attempted to build. After pruning, these
+  // trees might have been removed.
+  int64 num_trees_attempted = 1;
+  // Number of layers that we have attempted to build. After pruning, these
+  // layers might have been removed.
+  int64 num_layers_attempted = 2;
+  // The start (inclusive) and end (exclusive) ids of the nodes in the latest
+  // layer of the latest tree.
+  int32 last_layer_node_start = 3;
+  int32 last_layer_node_end = 4;
+}
+
+// TreeEnsemble describes an ensemble of decision trees.
+message TreeEnsemble {
+  repeated Tree trees = 1;
+  repeated float tree_weights = 2;
+
+  repeated TreeMetadata tree_metadata = 3;
+  // Metadata that is used during the training.
+  GrowingMetadata growing_metadata = 4;
+}
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b5ce32b7bed896ef8c990c79db402ff171eca94
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -0,0 +1,263 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+// The Op used during training time to get the predictions so far with the
+// current ensemble being built.
+// Expect some logits are cached from the previous step and passed through
+// to be reused.
+class BoostedTreesTrainingPredictOp : public OpKernel {
+ public:
+  explicit BoostedTreesTrainingPredictOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
+                                             &num_bucketized_features_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+    OP_REQUIRES(context, logits_dimension_ == 1,
+                errors::InvalidArgument(
+                    "Currently only one dimensional outputs are supported."));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    BoostedTreesEnsembleResource* resource;
+    // Get the resource.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(resource);
+
+    // Get the inputs.
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> batch_bucketized_features;
+    batch_bucketized_features.reserve(bucketized_features_list.size());
+    for (const Tensor& tensor : bucketized_features_list) {
+      batch_bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+    const int batch_size = batch_bucketized_features[0].size();
+
+    const Tensor* cached_tree_ids_t;
+    OP_REQUIRES_OK(context,
+                   context->input("cached_tree_ids", &cached_tree_ids_t));
+    const auto cached_tree_ids = cached_tree_ids_t->vec<int32>();
+
+    const Tensor* cached_node_ids_t;
+    OP_REQUIRES_OK(context,
+                   context->input("cached_node_ids", &cached_node_ids_t));
+    const auto cached_node_ids = cached_node_ids_t->vec<int32>();
+
+    // Allocate outputs.
+    Tensor* output_partial_logits_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("partial_logits",
+                                            {batch_size, logits_dimension_},
+                                            &output_partial_logits_t));
+    auto output_partial_logits = output_partial_logits_t->matrix<float>();
+
+    Tensor* output_tree_ids_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output("tree_ids", {batch_size},
+                                                     &output_tree_ids_t));
+    auto output_tree_ids = output_tree_ids_t->vec<int32>();
+
+    Tensor* output_node_ids_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output("node_ids", {batch_size},
+                                                     &output_node_ids_t));
+    auto output_node_ids = output_node_ids_t->vec<int32>();
+
+    // Indicate that the latest tree was used.
+    const int32 latest_tree = resource->num_trees() - 1;
+
+    if (latest_tree < 0) {
+      // Ensemble was empty. Nothing changes.
+      output_node_ids = cached_node_ids;
+      output_tree_ids = cached_tree_ids;
+      // All the predictions are zeros.
+      output_partial_logits.setZero();
+    } else {
+      output_tree_ids.setConstant(latest_tree);
+      auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
+                      &cached_node_ids, &output_partial_logits,
+                      &output_node_ids, batch_size,
+                      latest_tree](int32 start, int32 end) {
+        for (int32 i = start; i < end; ++i) {
+          int32 tree_id = cached_tree_ids(i);
+          int32 node_id = cached_node_ids(i);
+          float partial_tree_logit = 0.0;
+
+          // If the tree was pruned, returns the node id into which the
+          // current_node_id was pruned, as well the correction of the cached
+          // logit prediction.
+          resource->GetPostPruneCorrection(tree_id, node_id, &node_id,
+                                           &partial_tree_logit);
+
+          // Logic in the loop adds the cached node value again if it is a leaf.
+          // If it is not a leaf anymore we need to subtract the old node's
+          // value. The following logic handles both of these cases.
+          partial_tree_logit -= resource->node_value(tree_id, node_id);
+          float partial_all_logit = 0.0;
+          while (true) {
+            if (resource->is_leaf(tree_id, node_id)) {
+              partial_tree_logit += resource->node_value(tree_id, node_id);
+
+              // Tree is done
+              partial_all_logit +=
+                  resource->GetTreeWeight(tree_id) * partial_tree_logit;
+              partial_tree_logit = 0.0;
+              // Stop if it was the latest tree.
+              if (tree_id == latest_tree) {
+                break;
+              }
+              // Move onto other trees.
+              ++tree_id;
+              node_id = 0;
+            } else {
+              node_id = resource->next_node(tree_id, node_id, i,
+                                            batch_bucketized_features);
+            }
+          }
+          output_node_ids(i) = node_id;
+          output_partial_logits(i, 0) = partial_all_logit;
+        }
+      };
+      // 30 is the magic number. The actual value might be a function of (the
+      // number of layers) * (cpu cycles spent on each layer), but this value
+      // would work for many cases. May be tuned later.
+      const int64 cost = 30;
+      thread::ThreadPool* const worker_threads =
+          context->device()->tensorflow_cpu_worker_threads()->workers;
+      Shard(worker_threads->NumThreads(), worker_threads, batch_size,
+            /*cost_per_unit=*/cost, do_work);
+    }
+  }
+
+ private:
+  int32 logits_dimension_;         // the size of the output prediction vector.
+  int32 num_bucketized_features_;  // Indicates the number of features.
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesTrainingPredict").Device(DEVICE_CPU),
+                        BoostedTreesTrainingPredictOp);
+
+// The Op to get the predictions at the evaluation/inference time.
+class BoostedTreesPredictOp : public OpKernel {
+ public:
+  explicit BoostedTreesPredictOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
+                                             &num_bucketized_features_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+    OP_REQUIRES(context, logits_dimension_ == 1,
+                errors::InvalidArgument(
+                    "Currently only one dimensional outputs are supported."));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    BoostedTreesEnsembleResource* resource;
+    // Get the resource.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(resource);
+
+    // Get the inputs.
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> batch_bucketized_features;
+    batch_bucketized_features.reserve(bucketized_features_list.size());
+    for (const Tensor& tensor : bucketized_features_list) {
+      batch_bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+    const int batch_size = batch_bucketized_features[0].size();
+
+    // Allocate outputs.
+    Tensor* output_logits_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "logits", {batch_size, logits_dimension_},
+                                &output_logits_t));
+    auto output_logits = output_logits_t->matrix<float>();
+
+    const int32 latest_tree = resource->num_trees() - 1;
+
+    auto do_work = [&resource, &batch_bucketized_features, &output_logits,
+                    batch_size, latest_tree](int32 start, int32 end) {
+      for (int32 i = start; i < end; ++i) {
+        float tree_logit = 0.0;
+        int32 tree_id = 0;
+        int32 node_id = 0;
+        while (true) {
+          if (resource->is_leaf(tree_id, node_id)) {
+            tree_logit += resource->GetTreeWeight(tree_id) *
+                          resource->node_value(tree_id, node_id);
+
+            // Stop if it was the latest tree.
+            if (tree_id == latest_tree) {
+              break;
+            }
+            // Move onto other trees.
+            ++tree_id;
+            node_id = 0;
+          } else {
+            node_id = resource->next_node(tree_id, node_id, i,
+                                          batch_bucketized_features);
+          }
+        }
+        output_logits(i, 0) = tree_logit;
+      }
+    };
+    // 10 is the magic number. The actual number might depend on (the number of
+    // layers in the trees) and (cpu cycles spent on each layer), but this
+    // value would work for many cases. May be tuned later.
+    const int64 cost = (latest_tree + 1) * 10;
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    Shard(worker_threads->NumThreads(), worker_threads, batch_size,
+          /*cost_per_unit=*/cost, do_work);
+  }
+
+ private:
+  int32
+      logits_dimension_;  // Indicates the size of the output prediction vector.
+  int32 num_bucketized_features_;  // Indicates the number of features.
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesPredict").Device(DEVICE_CPU),
+                        BoostedTreesPredictOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..563f7b8b08c8969f7fb14c9e59b9fcef166312ce
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -0,0 +1,201 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_KERNEL(BoostedTreesEnsembleResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("IsBoostedTreesEnsembleInitialized").Device(DEVICE_CPU),
+    IsResourceInitialized<BoostedTreesEnsembleResource>);
+
+// Creates a tree ensemble resource.
+class BoostedTreesCreateEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesCreateEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble proto.
+    const Tensor* tree_ensemble_serialized_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
+                                           &tree_ensemble_serialized_t));
+    std::unique_ptr<BoostedTreesEnsembleResource> result(
+        new BoostedTreesEnsembleResource());
+    if (!result->InitFromSerialized(
+            tree_ensemble_serialized_t->scalar<string>()(), stamp_token)) {
+      result->Unref();
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unable to parse tree ensemble proto."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status =
+        CreateResource(context, HandleFromInput(context, 0), result.release());
+    if (status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES_OK(context, status);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesCreateEnsemble").Device(DEVICE_CPU),
+                        BoostedTreesCreateEnsembleOp);
+
+// Op for retrieving some model states (needed for training).
+class BoostedTreesGetEnsembleStatesOp : public OpKernel {
+ public:
+  explicit BoostedTreesGetEnsembleStatesOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Looks up the resource.
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    tf_shared_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+
+    // Sets the outputs.
+    const int num_trees = tree_ensemble_resource->num_trees();
+    const int num_finalized_trees =
+        (num_trees <= 0 ||
+         tree_ensemble_resource->IsTreeFinalized(num_trees - 1))
+            ? num_trees
+            : num_trees - 1;
+    const int num_attempted_layers =
+        tree_ensemble_resource->GetNumLayersAttempted();
+
+    // growing_metadata
+    Tensor* output_stamp_token_t = nullptr;
+    Tensor* output_num_trees_t = nullptr;
+    Tensor* output_num_finalized_trees_t = nullptr;
+    Tensor* output_num_attempted_layers_t = nullptr;
+    Tensor* output_last_layer_nodes_range_t = nullptr;
+
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape(),
+                                                     &output_num_trees_t));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(2, TensorShape(),
+                                            &output_num_finalized_trees_t));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(3, TensorShape(),
+                                            &output_num_attempted_layers_t));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                4, {2}, &output_last_layer_nodes_range_t));
+
+    output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
+    output_num_trees_t->scalar<int32>()() = num_trees;
+    output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
+    output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+
+    int32 range_start;
+    int32 range_end;
+    tree_ensemble_resource->GetLastLayerNodesRange(&range_start, &range_end);
+
+    output_last_layer_nodes_range_t->vec<int32>()(0) = range_start;
+    // For a completely empty ensemble, this will be 0. To make it a valid range
+    // we add this max cond.
+    output_last_layer_nodes_range_t->vec<int32>()(1) = std::max(1, range_end);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesGetEnsembleStates").Device(DEVICE_CPU),
+    BoostedTreesGetEnsembleStatesOp);
+
+// Op for serializing a model.
+class BoostedTreesSerializeEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesSerializeEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    tf_shared_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+    Tensor* output_stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
+    Tensor* output_proto_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape(), &output_proto_t));
+    output_proto_t->scalar<string>()() =
+        tree_ensemble_resource->SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesSerializeEnsemble").Device(DEVICE_CPU),
+    BoostedTreesSerializeEnsembleOp);
+
+// Op for deserializing a tree ensemble variable from a checkpoint.
+class BoostedTreesDeserializeEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesDeserializeEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    mutex_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble proto.
+    const Tensor* tree_ensemble_serialized_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
+                                           &tree_ensemble_serialized_t));
+    // Deallocate all the previous objects on the resource.
+    tree_ensemble_resource->Reset();
+    OP_REQUIRES(
+        context,
+        tree_ensemble_resource->InitFromSerialized(
+            tree_ensemble_serialized_t->scalar<string>()(), stamp_token),
+        errors::InvalidArgument("Unable to parse tree ensemble proto."));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesDeserializeEnsemble").Device(DEVICE_CPU),
+    BoostedTreesDeserializeEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c410748c27e182a177a17442ba776534d3c30c46
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -0,0 +1,439 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Constructor.
+BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
+    : tree_ensemble_(
+          protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
+              &arena_)) {}
+
+string BoostedTreesEnsembleResource::DebugString() {
+  return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
+                         "]");
+}
+
+bool BoostedTreesEnsembleResource::InitFromSerialized(const string& serialized,
+                                                      const int64 stamp_token) {
+  CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
+  if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
+    set_stamp(stamp_token);
+    return true;
+  }
+  return false;
+}
+
+string BoostedTreesEnsembleResource::SerializeAsString() const {
+  return tree_ensemble_->SerializeAsString();
+}
+
+int32 BoostedTreesEnsembleResource::num_trees() const {
+  return tree_ensemble_->trees_size();
+}
+
+int32 BoostedTreesEnsembleResource::next_node(
+    const int32 tree_id, const int32 node_id, const int32 index_in_batch,
+    const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  const auto& split = node.bucketized_split();
+  if (bucketized_features[split.feature_id()](index_in_batch) <=
+      split.threshold()) {
+    return split.left_id();
+  } else {
+    return split.right_id();
+  }
+}
+
+float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
+                                               const int32 node_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  if (node.node_case() == boosted_trees::Node::kLeaf) {
+    return node.leaf().scalar();
+  } else {
+    return node.metadata().original_leaf().scalar();
+  }
+}
+
+int32 BoostedTreesEnsembleResource::GetNumLayersGrown(
+    const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
+}
+
+void BoostedTreesEnsembleResource::SetNumLayersGrown(
+    const int32 tree_id, int32 new_num_layers) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
+      new_num_layers);
+}
+
+void BoostedTreesEnsembleResource::UpdateLastLayerNodesRange(
+    const int32 node_range_start, int32 node_range_end) const {
+  tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+      node_range_start);
+  tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+      node_range_end);
+}
+
+void BoostedTreesEnsembleResource::GetLastLayerNodesRange(
+    int32* node_range_start, int32* node_range_end) const {
+  *node_range_start =
+      tree_ensemble_->growing_metadata().last_layer_node_start();
+  *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+}
+
+int64 BoostedTreesEnsembleResource::GetNumNodes(const int32 tree_id) {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->trees(tree_id).nodes_size();
+}
+
+int32 BoostedTreesEnsembleResource::GetNumLayersAttempted() {
+  return tree_ensemble_->growing_metadata().num_layers_attempted();
+}
+
+bool BoostedTreesEnsembleResource::is_leaf(const int32 tree_id,
+                                           const int32 node_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  return node.node_case() == boosted_trees::Node::kLeaf;
+}
+
+int32 BoostedTreesEnsembleResource::feature_id(const int32 tree_id,
+                                               const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().feature_id();
+}
+
+int32 BoostedTreesEnsembleResource::bucket_threshold(
+    const int32 tree_id, const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().threshold();
+}
+
+int32 BoostedTreesEnsembleResource::left_id(const int32 tree_id,
+                                            const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().left_id();
+}
+
+int32 BoostedTreesEnsembleResource::right_id(const int32 tree_id,
+                                             const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().right_id();
+}
+
+std::vector<float> BoostedTreesEnsembleResource::GetTreeWeights() const {
+  return {tree_ensemble_->tree_weights().begin(),
+          tree_ensemble_->tree_weights().end()};
+}
+
+float BoostedTreesEnsembleResource::GetTreeWeight(const int32 tree_id) const {
+  return tree_ensemble_->tree_weights(tree_id);
+}
+
+float BoostedTreesEnsembleResource::IsTreeFinalized(const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).is_finalized();
+}
+
+float BoostedTreesEnsembleResource::IsTreePostPruned(
+    const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size() >
+         0;
+}
+
+void BoostedTreesEnsembleResource::SetIsFinalized(const int32 tree_id,
+                                                  const bool is_finalized) {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
+      is_finalized);
+}
+
+// Sets the weight of i'th tree.
+void BoostedTreesEnsembleResource::SetTreeWeight(const int32 tree_id,
+                                                 const float weight) {
+  DCHECK_GE(tree_id, 0);
+  DCHECK_LT(tree_id, num_trees());
+  tree_ensemble_->set_tree_weights(tree_id, weight);
+}
+
+void BoostedTreesEnsembleResource::UpdateGrowingMetadata() const {
+  tree_ensemble_->mutable_growing_metadata()->set_num_layers_attempted(
+      tree_ensemble_->growing_metadata().num_layers_attempted() + 1);
+
+  const int n_trees = num_trees();
+
+  if (n_trees <= 0 ||
+      // Checks if we are building the first layer of the dummy empty tree
+      ((n_trees == 1 || IsTreeFinalized(n_trees - 2)) &&
+       (tree_ensemble_->trees(n_trees - 1).nodes_size() == 1))) {
+    tree_ensemble_->mutable_growing_metadata()->set_num_trees_attempted(
+        tree_ensemble_->growing_metadata().num_trees_attempted() + 1);
+  }
+}
+
+// Add a tree to the ensemble and returns a new tree_id.
+int32 BoostedTreesEnsembleResource::AddNewTree(const float weight) {
+  const int32 new_tree_id = tree_ensemble_->trees_size();
+  auto* node = tree_ensemble_->add_trees()->add_nodes();
+  node->mutable_leaf()->set_scalar(0.0);
+  tree_ensemble_->add_tree_weights(weight);
+  tree_ensemble_->add_tree_metadata();
+
+  return new_tree_id;
+}
+
+void BoostedTreesEnsembleResource::AddBucketizedSplitNode(
+    const int32 tree_id, const int32 node_id, const int32 feature_id,
+    const int32 threshold, const float gain, const float left_contrib,
+    const float right_contrib, int32* left_node_id, int32* right_node_id) {
+  auto* tree = tree_ensemble_->mutable_trees(tree_id);
+  auto* node = tree->mutable_nodes(node_id);
+  DCHECK_EQ(node->node_case(), boosted_trees::Node::kLeaf);
+  float prev_node_value = node->leaf().scalar();
+  *left_node_id = tree->nodes_size();
+  *right_node_id = *left_node_id + 1;
+  auto* left_node = tree->add_nodes();
+  auto* right_node = tree->add_nodes();
+  if (node_id != 0) {
+    // Save previous leaf value if it is not the first leaf in the tree.
+    node->mutable_metadata()->mutable_original_leaf()->Swap(
+        node->mutable_leaf());
+  }
+  node->mutable_metadata()->set_gain(gain);
+  auto* new_split = node->mutable_bucketized_split();
+  new_split->set_feature_id(feature_id);
+  new_split->set_threshold(threshold);
+  new_split->set_left_id(*left_node_id);
+  new_split->set_right_id(*right_node_id);
+  // TODO(npononareva): this is LAYER-BY-LAYER boosting; add WHOLE-TREE.
+  left_node->mutable_leaf()->set_scalar(prev_node_value + left_contrib);
+  right_node->mutable_leaf()->set_scalar(prev_node_value + right_contrib);
+}
+
+void BoostedTreesEnsembleResource::Reset() {
+  // Reset stamp.
+  set_stamp(-1);
+
+  // Clear tree ensemle.
+  arena_.Reset();
+  CHECK_EQ(0, arena_.SpaceAllocated());
+  tree_ensemble_ =
+      protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(&arena_);
+}
+
+void BoostedTreesEnsembleResource::PostPruneTree(const int32 current_tree) {
+  // No-op if tree is empty.
+  auto* tree = tree_ensemble_->mutable_trees(current_tree);
+  int32 num_nodes = tree->nodes_size();
+  if (num_nodes == 0) {
+    return;
+  }
+
+  std::vector<int32> nodes_to_delete;
+  // If a node was pruned, we need to save the change of the prediction from
+  // this node to its parent, as well as the parent id.
+  std::vector<std::pair<int32, float>> nodes_changes;
+  nodes_changes.reserve(num_nodes);
+  for (int32 i = 0; i < num_nodes; ++i) {
+    nodes_changes.emplace_back(i, 0.0);
+  }
+  // Prune the tree recursively starting from the root. Each node that has
+  // negative gain and only leaf children will be pruned recursively up from
+  // the bottom of the tree. This method returns the list of nodes pruned, and
+  // updates the nodes in the tree not to refer to those pruned nodes.
+  RecursivelyDoPostPrunePreparation(current_tree, 0, &nodes_to_delete,
+                                    &nodes_changes);
+
+  if (nodes_to_delete.empty()) {
+    // No pruning happened, and no post-processing needed.
+    return;
+  }
+
+  // Sort node ids so they are in asc order.
+  std::sort(nodes_to_delete.begin(), nodes_to_delete.end());
+
+  // We need to
+  // - update split left and right children ids with new indices
+  // - actually remove the nodes that need to be removed
+  // - save the information about pruned node so we could recover the
+  // predictions from cache. Build a map for old node index=>new node index.
+  // nodes_to_delete contains nodes who's indices should be skipped, in
+  // ascending order. Save the information about new indices into meta.
+  std::map<int32, int32> old_to_new_ids;
+  int32 new_index = 0;
+  int32 index_for_deleted = 0;
+  auto* post_prune_meta = tree_ensemble_->mutable_tree_metadata(current_tree)
+                              ->mutable_post_pruned_nodes_meta();
+
+  for (int32 i = 0; i < num_nodes; ++i) {
+    if (index_for_deleted < nodes_to_delete.size() &&
+        i == nodes_to_delete[index_for_deleted]) {
+      // Node i will get removed,
+      ++index_for_deleted;
+      // Update meta info that will allow us to use cached predictions from
+      // those nodes.
+      int32 new_id;
+      float logit_change;
+      CalculateParentAndLogitUpdate(i, nodes_changes, &new_id, &logit_change);
+      auto* meta = post_prune_meta->Add();
+      meta->set_new_node_id(old_to_new_ids[new_id]);
+      meta->set_logit_change(logit_change);
+    } else {
+      old_to_new_ids[i] = new_index++;
+      auto* meta = post_prune_meta->Add();
+      // Update meta info that will allow us to use cached predictions from
+      // those nodes.
+      meta->set_new_node_id(old_to_new_ids[i]);
+      meta->set_logit_change(0.0);
+    }
+  }
+  index_for_deleted = 0;
+  int32 i = 0;
+  protobuf::RepeatedPtrField<boosted_trees::Node> new_nodes;
+  new_nodes.Reserve(old_to_new_ids.size());
+  for (auto node : *(tree->mutable_nodes())) {
+    if (index_for_deleted < nodes_to_delete.size() &&
+        i == nodes_to_delete[index_for_deleted]) {
+      ++index_for_deleted;
+      ++i;
+      continue;
+    } else {
+      if (node.node_case() == boosted_trees::Node::kBucketizedSplit) {
+        node.mutable_bucketized_split()->set_left_id(
+            old_to_new_ids[node.bucketized_split().left_id()]);
+        node.mutable_bucketized_split()->set_right_id(
+            old_to_new_ids[node.bucketized_split().right_id()]);
+      }
+      *new_nodes.Add() = std::move(node);
+    }
+    ++i;
+  }
+  // Replace all the nodes in a tree with the ones we keep.
+  *tree->mutable_nodes() = std::move(new_nodes);
+
+  // Note that if the whole tree got pruned, we will end up with one node.
+  // We can't remove that tree because it will cause problems with cache.
+}
+
+void BoostedTreesEnsembleResource::GetPostPruneCorrection(
+    const int32 tree_id, const int32 initial_node_id, int32* current_node_id,
+    float* logit_update) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  if (IsTreeFinalized(tree_id) && IsTreePostPruned(tree_id)) {
+    DCHECK_LT(
+        initial_node_id,
+        tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size());
+    const auto& meta =
+        tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta(
+            initial_node_id);
+    *current_node_id = meta.new_node_id();
+    *logit_update += meta.logit_change();
+  }
+}
+
+bool BoostedTreesEnsembleResource::IsTerminalSplitNode(
+    const int32 tree_id, const int32 node_id) const {
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  const int32 left_id = node.bucketized_split().left_id();
+  const int32 right_id = node.bucketized_split().right_id();
+  return is_leaf(tree_id, left_id) && is_leaf(tree_id, right_id);
+}
+
+// For each pruned node, finds the leaf where it finally ended up and
+// calculates the total update from that pruned node prediction.
+void BoostedTreesEnsembleResource::CalculateParentAndLogitUpdate(
+    const int32 start_node_id,
+    const std::vector<std::pair<int32, float>>& nodes_change, int32* parent_id,
+    float* change) const {
+  *change = 0.0;
+  int32 node_id = start_node_id;
+  int32 parent = nodes_change[node_id].first;
+
+  while (parent != node_id) {
+    (*change) += nodes_change[node_id].second;
+    node_id = parent;
+    parent = nodes_change[node_id].first;
+  }
+  *parent_id = parent;
+}
+
+void BoostedTreesEnsembleResource::RecursivelyDoPostPrunePreparation(
+    const int32 tree_id, const int32 node_id,
+    std::vector<int32>* nodes_to_delete,
+    std::vector<std::pair<int32, float>>* nodes_meta) {
+  auto* node = tree_ensemble_->mutable_trees(tree_id)->mutable_nodes(node_id);
+  DCHECK_NE(node->node_case(), boosted_trees::Node::NODE_NOT_SET);
+  // Base case when we reach a leaf.
+  if (node->node_case() == boosted_trees::Node::kLeaf) {
+    return;
+  }
+
+  // Traverse node children first and recursively prune their sub-trees.
+  RecursivelyDoPostPrunePreparation(tree_id, node->bucketized_split().left_id(),
+                                    nodes_to_delete, nodes_meta);
+  RecursivelyDoPostPrunePreparation(tree_id,
+                                    node->bucketized_split().right_id(),
+                                    nodes_to_delete, nodes_meta);
+
+  // Two conditions must be satisfied to prune the node:
+  // 1- The split gain is negative.
+  // 2- After depth-first pruning, the node only has leaf children.
+  const auto& node_metadata = node->metadata();
+  if (node_metadata.gain() < 0 && IsTerminalSplitNode(tree_id, node_id)) {
+    const int32 left_id = node->bucketized_split().left_id();
+    const int32 right_id = node->bucketized_split().right_id();
+
+    // Save children that need to be deleted.
+    nodes_to_delete->push_back(left_id);
+    nodes_to_delete->push_back(right_id);
+
+    // Change node back into leaf.
+    *node->mutable_leaf() = node_metadata.original_leaf();
+    const float parent_value = node_value(tree_id, node_id);
+
+    // Save the old values of weights of children.
+    (*nodes_meta)[left_id].first = node_id;
+    (*nodes_meta)[left_id].second = parent_value - node_value(tree_id, left_id);
+
+    (*nodes_meta)[right_id].first = node_id;
+    (*nodes_meta)[right_id].second =
+        parent_value - node_value(tree_id, right_id);
+
+    // Clear gain for leaf node.
+    node->clear_metadata();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..df78d3f275bbf825c99285530208854f7c05cea9
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
+#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Forward declaration for proto class TreeEnsemble
+namespace boosted_trees {
+class TreeEnsemble;
+}  // namespace boosted_trees
+
+// A StampedResource is a resource that has a stamp token associated with it.
+// Before reading from or applying updates to the resource, the stamp should
+// be checked to verify that the update is not stale.
+class StampedResource : public ResourceBase {
+ public:
+  StampedResource() : stamp_(-1) {}
+
+  bool is_stamp_valid(int64 stamp) const { return stamp_ == stamp; }
+
+  int64 stamp() const { return stamp_; }
+  void set_stamp(int64 stamp) { stamp_ = stamp; }
+
+ private:
+  int64 stamp_;
+};
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class BoostedTreesEnsembleResource : public StampedResource {
+ public:
+  BoostedTreesEnsembleResource();
+
+  string DebugString() override;
+
+  bool InitFromSerialized(const string& serialized, const int64 stamp_token);
+
+  string SerializeAsString() const;
+
+  int32 num_trees() const;
+
+  // Find the next node to which the example (specified by index_in_batch)
+  // traverses down from the current node indicated by tree_id and node_id.
+  // Args:
+  //   tree_id: the index of the tree in the ensemble.
+  //   node_id: the index of the node within the tree.
+  //   index_in_batch: the index of the example within the batch (relevant to
+  //       the index of the row to read in each bucketized_features).
+  //   bucketized_features: vector of feature Vectors.
+  int32 next_node(
+      const int32 tree_id, const int32 node_id, const int32 index_in_batch,
+      const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const;
+
+  float node_value(const int32 tree_id, const int32 node_id) const;
+
+  int32 GetNumLayersGrown(const int32 tree_id) const;
+
+  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const;
+
+  void UpdateLastLayerNodesRange(const int32 node_range_start,
+                                 int32 node_range_end) const;
+
+  void GetLastLayerNodesRange(int32* node_range_start,
+                              int32* node_range_end) const;
+
+  int64 GetNumNodes(const int32 tree_id);
+
+  void UpdateGrowingMetadata() const;
+
+  int32 GetNumLayersAttempted();
+
+  bool is_leaf(const int32 tree_id, const int32 node_id) const;
+
+  int32 feature_id(const int32 tree_id, const int32 node_id) const;
+
+  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const;
+
+  int32 left_id(const int32 tree_id, const int32 node_id) const;
+
+  int32 right_id(const int32 tree_id, const int32 node_id) const;
+
+  // Add a tree to the ensemble and returns a new tree_id.
+  int32 AddNewTree(const float weight);
+
+  // Grows the tree by adding a split and leaves.
+  void AddBucketizedSplitNode(const int32 tree_id, const int32 node_id,
+                              const int32 feature_id, const int32 threshold,
+                              const float gain, const float left_contrib,
+                              const float right_contrib, int32* left_node_id,
+                              int32* right_node_id);
+
+  // Retrieves tree weights and returns as a vector.
+  // It involves a copy, so should be called only sparingly (like once per
+  // iteration, not per example).
+  std::vector<float> GetTreeWeights() const;
+
+  float GetTreeWeight(const int32 tree_id) const;
+
+  float IsTreeFinalized(const int32 tree_id) const;
+
+  float IsTreePostPruned(const int32 tree_id) const;
+
+  void SetIsFinalized(const int32 tree_id, const bool is_finalized);
+
+  // Sets the weight of i'th tree.
+  void SetTreeWeight(const int32 tree_id, const float weight);
+
+  // Resets the resource and frees the protos in arena.
+  // Caller needs to hold the mutex lock while calling this.
+  virtual void Reset();
+
+  void PostPruneTree(const int32 current_tree);
+
+  // For a given node, returns the id in a pruned tree, as well as correction
+  // to the cached prediction that should be applied. If tree was not
+  // post-pruned, current_node_id will be equal to initial_node_id and logit
+  // update will be equal to zero.
+  void GetPostPruneCorrection(const int32 tree_id, const int32 initial_node_id,
+                              int32* current_node_id,
+                              float* logit_update) const;
+  mutex* get_mutex() { return &mu_; }
+
+ private:
+  // Helper method to check whether a node is a terminal node in that it
+  // only has leaf nodes as children.
+  bool IsTerminalSplitNode(const int32 tree_id, const int32 node_id) const;
+
+  // For each pruned node, finds the leaf where it finally ended up and
+  // calculates the total update from that pruned node prediction.
+  void CalculateParentAndLogitUpdate(
+      const int32 start_node_id,
+      const std::vector<std::pair<int32, float>>& nodes_change,
+      int32* parent_id, float* change) const;
+
+  // Helper method to collect the information to be used to prune some nodes in
+  // the tree.
+  void RecursivelyDoPostPrunePreparation(
+      const int32 tree_id, const int32 node_id,
+      std::vector<int32>* nodes_to_delete,
+      std::vector<std::pair<int32, float>>* nodes_meta);
+
+ protected:
+  protobuf::Arena arena_;
+  mutex mu_;
+  boosted_trees::TreeEnsemble* tree_ensemble_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6dfcd63ab310a2b682054c6921d4425d344ada15
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -0,0 +1,309 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+const float kEps = 1e-15;
+}  // namespace
+
+class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
+ public:
+  explicit BoostedTreesCalculateBestGainsPerFeatureOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // node_id_range
+    const Tensor* node_id_range_t;
+    OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
+    const auto node_id_range = node_id_range_t->vec<int32>();
+    const int32 node_id_first = node_id_range(0);  // inclusive
+    const int32 node_id_last = node_id_range(1);   // exclusive
+    // stats_summary_list
+    OpInputList stats_summary_list;
+    OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
+                                                &stats_summary_list));
+    const int64 num_buckets = stats_summary_list[0].dim_size(1);
+    std::vector<TTypes<float, 3>::ConstTensor> stats_summary;
+    stats_summary.reserve(stats_summary_list.size());
+    for (const auto& tensor : stats_summary_list) {
+      stats_summary.emplace_back(tensor.tensor<float, 3>());
+    }
+    const Tensor* l1_t;
+    OP_REQUIRES_OK(context, context->input("l1", &l1_t));
+    const auto l1 = l1_t->scalar<float>()();
+    const Tensor* l2_t;
+    OP_REQUIRES_OK(context, context->input("l2", &l2_t));
+    const auto l2 = l2_t->scalar<float>()();
+    const Tensor* tree_complexity_t;
+    OP_REQUIRES_OK(context,
+                   context->input("tree_complexity", &tree_complexity_t));
+    const auto tree_complexity = tree_complexity_t->scalar<float>()();
+    const Tensor* min_node_weight_t;
+    OP_REQUIRES_OK(context,
+                   context->input("min_node_weight", &min_node_weight_t));
+    const auto min_node_weight = min_node_weight_t->scalar<float>()();
+
+    // Allocate output lists of tensors:
+    OpOutputList output_node_ids_list;
+    OP_REQUIRES_OK(
+        context, context->output_list("node_ids_list", &output_node_ids_list));
+    OpOutputList output_gains_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("gains_list", &output_gains_list));
+    OpOutputList output_thresholds_list;
+    OP_REQUIRES_OK(context, context->output_list("thresholds_list",
+                                                 &output_thresholds_list));
+    OpOutputList output_left_node_contribs_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("left_node_contribs_list",
+                                        &output_left_node_contribs_list));
+    OpOutputList output_right_node_contribs_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("right_node_contribs_list",
+                                        &output_right_node_contribs_list));
+
+    // Get the best split info per node for each feature.
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      std::vector<float> cum_grad;
+      std::vector<float> cum_hess;
+      cum_grad.reserve(num_buckets);
+      cum_hess.reserve(num_buckets);
+
+      std::vector<int32> output_node_ids;
+      std::vector<float> output_gains;
+      std::vector<int32> output_thresholds;
+      std::vector<float> output_left_node_contribs;
+      std::vector<float> output_right_node_contribs;
+      for (int node_id = node_id_first; node_id < node_id_last; ++node_id) {
+        // Calculate gains.
+        cum_grad.clear();
+        cum_hess.clear();
+        float total_grad = 0.0;
+        float total_hess = 0.0;
+        for (int bucket = 0; bucket < num_buckets; ++bucket) {
+          // TODO(nponomareva): Consider multi-dimensional gradients/hessians.
+          total_grad += stats_summary[feature_idx](node_id, bucket, 0);
+          total_hess += stats_summary[feature_idx](node_id, bucket, 1);
+          cum_grad.push_back(total_grad);
+          cum_hess.push_back(total_hess);
+        }
+        // Check if node has enough of average hessian.
+        if (total_hess < min_node_weight) {
+          // Do not split the node because not enough avg hessian.
+          continue;
+        }
+        float best_gain = std::numeric_limits<float>::lowest();
+        float best_bucket = 0;
+        float best_contrib_for_left = 0.0;
+        float best_contrib_for_right = 0.0;
+        // Parent gain.
+        float parent_gain;
+        float unused;
+        CalculateWeightsAndGains(total_grad, total_hess, l1, l2, &unused,
+                                 &parent_gain);
+
+        for (int bucket = 0; bucket < num_buckets; ++bucket) {
+          const float cum_grad_bucket = cum_grad[bucket];
+          const float cum_hess_bucket = cum_hess[bucket];
+          // Left child.
+          float contrib_for_left;
+          float gain_for_left;
+          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket, l1, l2,
+                                   &contrib_for_left, &gain_for_left);
+          // Right child.
+          float contrib_for_right;
+          float gain_for_right;
+          CalculateWeightsAndGains(total_grad - cum_grad_bucket,
+                                   total_hess - cum_hess_bucket, l1, l2,
+                                   &contrib_for_right, &gain_for_right);
+
+          if (gain_for_left + gain_for_right > best_gain) {
+            best_gain = gain_for_left + gain_for_right;
+            best_bucket = bucket;
+            best_contrib_for_left = contrib_for_left;
+            best_contrib_for_right = contrib_for_right;
+          }
+        }  // for bucket
+        output_node_ids.push_back(node_id);
+        // Remove the parent gain for the parent node.
+        output_gains.push_back(best_gain - parent_gain);
+        output_thresholds.push_back(best_bucket);
+        output_left_node_contribs.push_back(best_contrib_for_left);
+        output_right_node_contribs.push_back(best_contrib_for_right);
+      }  // for node_id
+      const int num_nodes = output_node_ids.size();
+      // output_node_ids
+      Tensor* output_node_ids_t;
+      OP_REQUIRES_OK(context,
+                     output_node_ids_list.allocate(feature_idx, {num_nodes},
+                                                   &output_node_ids_t));
+      auto output_node_ids_vec = output_node_ids_t->vec<int32>();
+      // output_gains
+      Tensor* output_gains_t;
+      OP_REQUIRES_OK(context, output_gains_list.allocate(
+                                  feature_idx, {num_nodes}, &output_gains_t));
+      auto output_gains_vec = output_gains_t->vec<float>();
+      // output_thresholds
+      Tensor* output_thresholds_t;
+      OP_REQUIRES_OK(context,
+                     output_thresholds_list.allocate(feature_idx, {num_nodes},
+                                                     &output_thresholds_t));
+      auto output_thresholds_vec = output_thresholds_t->vec<int32>();
+      // output_left_node_contribs
+      Tensor* output_left_node_contribs_t;
+      OP_REQUIRES_OK(context, output_left_node_contribs_list.allocate(
+                                  feature_idx, {num_nodes, 1},
+                                  &output_left_node_contribs_t));
+      auto output_left_node_contribs_matrix =
+          output_left_node_contribs_t->matrix<float>();
+      // output_right_node_contribs
+      Tensor* output_right_node_contribs_t;
+      OP_REQUIRES_OK(context, output_right_node_contribs_list.allocate(
+                                  feature_idx, {num_nodes, 1},
+                                  &output_right_node_contribs_t));
+      auto output_right_node_contribs_matrix =
+          output_right_node_contribs_t->matrix<float>();
+      // Sets output tensors from vectors.
+      for (int i = 0; i < num_nodes; ++i) {
+        output_node_ids_vec(i) = output_node_ids[i];
+        // Adjust the gains to penalize by tree complexity.
+        output_gains_vec(i) = output_gains[i] - tree_complexity;
+        output_thresholds_vec(i) = output_thresholds[i];
+        // Logits are 1-dimensional for now.
+        // TODO(nponomareva): Consider multi-dimensional logits.
+        output_left_node_contribs_matrix(i, 0) = output_left_node_contribs[i];
+        output_right_node_contribs_matrix(i, 0) = output_right_node_contribs[i];
+      }
+    }  // for f
+  }
+
+ private:
+  void CalculateWeightsAndGains(const float g, const float h, const float l1,
+                                const float l2, float* weight, float* gain) {
+    //
+    // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
+    // (g+l1*sgn(w))^2/(h+l2).
+    // This is because for each leaf we optimize
+    // 1/2(h+l2)*w^2+g*w+l1*abs(w)
+    float g_with_l1 = g;
+    // Apply L1 regularization.
+    // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
+    // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
+    // For g from (-l1, l1), thus there is no solution => set to 0.
+    if (l1 > 0) {
+      if (g > l1) {
+        g_with_l1 -= l1;
+      } else if (g < -l1) {
+        g_with_l1 += l1;
+      } else {
+        *weight = 0.0;
+        *gain = 0.0;
+        return;
+      }
+    }
+    // Apply L2 regularization.
+    if (h + l2 <= kEps) {
+      // Avoid division by 0 or infinitesimal.
+      *weight = 0;
+      *gain = 0;
+    } else {
+      *weight = -g_with_l1 / (h + l2);
+      *gain = -g_with_l1 * (*weight);
+    }
+  }
+
+  int max_splits_;
+  int num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesCalculateBestGainsPerFeature").Device(DEVICE_CPU),
+    BoostedTreesCalculateBestGainsPerFeatureOp);
+
+class BoostedTreesMakeStatsSummaryOp : public OpKernel {
+ public:
+  explicit BoostedTreesMakeStatsSummaryOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // node_ids
+    const Tensor* node_ids_t;
+    OP_REQUIRES_OK(context, context->input("node_ids", &node_ids_t));
+    const auto node_ids = node_ids_t->vec<int32>();
+    // gradients
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+    const auto gradients = gradients_t->matrix<float>();
+    // hessians
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+    const auto hessians = hessians_t->matrix<float>();
+    // bucketized_features
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features_list",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> bucketized_features;
+    bucketized_features.reserve(num_features_);
+    for (const Tensor& tensor : bucketized_features_list) {
+      bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+
+    // Infer batch size.
+    const int64 batch_size = node_ids_t->dim_size(0);
+    // Allocate output stats tensor (Rank 4).
+    Tensor* output_stats_summary_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "stats_summary",
+                                {num_features_, max_splits_, num_buckets_, 2},
+                                &output_stats_summary_t));
+    auto output_stats_summary = output_stats_summary_t->tensor<float, 4>();
+    output_stats_summary.setZero();
+
+    // Partition by node, and then bucketize.
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& features = bucketized_features[feature_idx];
+      for (int i = 0; i < batch_size; ++i) {
+        const int32 node = node_ids(i);
+        const int32 bucket = features(i);
+        output_stats_summary(feature_idx, node, bucket, 0) += gradients(i, 0);
+        output_stats_summary(feature_idx, node, bucket, 1) += hessians(i, 0);
+      }
+    }
+  }
+
+ private:
+  int max_splits_;
+  int num_buckets_;
+  int num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesMakeStatsSummary").Device(DEVICE_CPU),
+                        BoostedTreesMakeStatsSummaryOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a14fd4a133d2089c7c42c1bed4fbcaa0458b9f84
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -0,0 +1,230 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+
+namespace tensorflow {
+
+namespace {
+constexpr float kLayerByLayerTreeWeight = 1.0;
+
+// TODO(nponomareva, youngheek): consider using vector.
+struct SplitCandidate {
+  SplitCandidate() {}
+
+  // Index in the list of the feature ids.
+  int64 feature_idx;
+
+  // Index in the tensor of node_ids for the feature with idx feature_idx.
+  int64 candidate_idx;
+
+  float gain;
+};
+
+enum PruningMode { kNoPruning = 0, kPrePruning = 1, kPostPruning = 2 };
+
+}  // namespace
+
+class BoostedTreesUpdateEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+
+    int32 pruning_index;
+    OP_REQUIRES_OK(context, context->GetAttr("pruning_mode", &pruning_index));
+    pruning_mode_ = static_cast<PruningMode>(pruning_index);
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Get decision tree ensemble.
+    BoostedTreesEnsembleResource* ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &ensemble_resource));
+    core::ScopedUnref unref_me(ensemble_resource);
+    mutex_lock l(*ensemble_resource->get_mutex());
+    // Increase the ensemble stamp.
+    ensemble_resource->set_stamp(ensemble_resource->stamp() + 1);
+
+    // Read node ids, gains, thresholds and node contribs.
+    OpInputList node_ids_list;
+    OpInputList gains_list;
+    OpInputList thresholds_list;
+    OpInputList left_node_contribs;
+    OpInputList right_node_contribs;
+    OP_REQUIRES_OK(context, context->input_list("node_ids", &node_ids_list));
+    OP_REQUIRES_OK(context, context->input_list("gains", &gains_list));
+    OP_REQUIRES_OK(context,
+                   context->input_list("thresholds", &thresholds_list));
+    OP_REQUIRES_OK(context, context->input_list("left_node_contribs",
+                                                &left_node_contribs));
+    OP_REQUIRES_OK(context, context->input_list("right_node_contribs",
+                                                &right_node_contribs));
+
+    const Tensor* feature_ids_t;
+    OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+    const auto feature_ids = feature_ids_t->vec<int32>();
+
+    const Tensor* max_depth_t;
+    OP_REQUIRES_OK(context, context->input("max_depth", &max_depth_t));
+    const auto max_depth = max_depth_t->scalar<int32>()();
+
+    const Tensor* learning_rate_t;
+    OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    const auto learning_rate = learning_rate_t->scalar<float>()();
+
+    // Find best splits for each active node.
+    std::map<int32, SplitCandidate> best_splits;
+    FindBestSplitsPerNode(context, node_ids_list, gains_list, &best_splits);
+
+    int32 current_tree =
+        UpdateGlobalAttemptsAndRetrieveGrowableTree(ensemble_resource);
+
+    // No-op if no new splits can be considered.
+    if (best_splits.empty()) {
+      LOG(WARNING) << "Not growing tree ensemble as no good splits were found.";
+      return;
+    }
+
+    const int32 new_num_layers =
+        ensemble_resource->GetNumLayersGrown(current_tree) + 1;
+    VLOG(1) << "Adding layer #" << new_num_layers - 1 << " to tree #"
+            << current_tree << " of ensemble of " << current_tree + 1
+            << " trees.";
+    bool split_happened = false;
+    int32 node_id_start = ensemble_resource->GetNumNodes(current_tree);
+    // Add the splits to the tree.
+    for (auto& split_entry : best_splits) {
+      const int32 node_id = split_entry.first;
+      const SplitCandidate& candidate = split_entry.second;
+
+      const int64 feature_idx = candidate.feature_idx;
+      const int64 candidate_idx = candidate.candidate_idx;
+
+      const int32 feature_id = feature_ids(feature_idx);
+      const int32 threshold =
+          thresholds_list[feature_idx].vec<int32>()(candidate_idx);
+      const float gain = gains_list[feature_idx].vec<float>()(candidate_idx);
+
+      if (pruning_mode_ == kPrePruning) {
+        // Don't consider negative splits if we're pre-pruning the tree.
+        // Note that zero-gain splits are acceptable.
+        if (gain < 0) {
+          continue;
+        }
+      }
+      // For now assume that the weights vectors are one dimensional.
+      // TODO(nponomareva): change here for multiclass.
+      const float left_contrib =
+          learning_rate *
+          left_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
+      const float right_contrib =
+          learning_rate *
+          right_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
+
+      // unused.
+      int32 left_node_id;
+      int32 right_node_id;
+
+      ensemble_resource->AddBucketizedSplitNode(
+          current_tree, node_id, feature_id, threshold, gain, left_contrib,
+          right_contrib, &left_node_id, &right_node_id);
+      split_happened = true;
+    }
+    int32 node_id_end = ensemble_resource->GetNumNodes(current_tree);
+    if (split_happened) {
+      // Update growable tree metadata.
+      ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
+      // Finalize the tree if needed.
+      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth) {
+        // If the tree is finalized, next growing will start from node 0;
+        node_id_start = 0;
+        node_id_end = 1;
+        ensemble_resource->SetIsFinalized(current_tree, true);
+        if (pruning_mode_ == kPostPruning) {
+          ensemble_resource->PostPruneTree(current_tree);
+        }
+        if (ensemble_resource->num_trees() > 0) {
+          // Create a dummy new tree with an empty node.
+          ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
+        }
+      }
+      // If we managed to split, update the node range. If we didn't, don't
+      // update as we will try to split the same nodes with new instances.
+      ensemble_resource->UpdateLastLayerNodesRange(node_id_start, node_id_end);
+    }
+  }
+
+ private:
+  int32 UpdateGlobalAttemptsAndRetrieveGrowableTree(
+      BoostedTreesEnsembleResource* const ensemble_resource) {
+    int32 num_trees = ensemble_resource->num_trees();
+    int32 current_tree = num_trees - 1;
+
+    // Increment global attempt stats.
+    ensemble_resource->UpdateGrowingMetadata();
+
+    // Note we don't set tree weight to be equal to learning rate, since we
+    // apply learning rate to leaf weights instead, when doing layer-by-layer
+    // boosting.
+    if (num_trees <= 0) {
+      // Create a new tree with a no-op leaf.
+      current_tree = ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
+    }
+    return current_tree;
+  }
+
+  // Helper method which effectively does a reduce over all split candidates
+  // and finds the best split for each node.
+  void FindBestSplitsPerNode(
+      OpKernelContext* const context, const OpInputList& node_ids_list,
+      const OpInputList& gains_list,
+      std::map<int32, SplitCandidate>* best_split_per_node) {
+    // Find best split per node going through every feature candidate.
+    for (int64 feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& node_ids = node_ids_list[feature_idx].vec<int32>();
+      const auto& gains = gains_list[feature_idx].vec<float>();
+
+      for (size_t candidate_idx = 0; candidate_idx < node_ids.size();
+           ++candidate_idx) {
+        // Get current split candidate.
+        const auto& node_id = node_ids(candidate_idx);
+        const auto& gain = gains(candidate_idx);
+
+        auto best_split_it = best_split_per_node->find(node_id);
+        SplitCandidate candidate;
+        candidate.feature_idx = feature_idx;
+        candidate.candidate_idx = candidate_idx;
+        candidate.gain = gain;
+
+        if (best_split_it == best_split_per_node->end() ||
+            gain > best_split_it->second.gain) {
+          (*best_split_per_node)[node_id] = candidate;
+        }
+      }
+    }
+  }
+
+ private:
+  int32 num_features_;
+  PruningMode pruning_mode_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesUpdateEnsemble").Device(DEVICE_CPU),
+                        BoostedTreesUpdateEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2810925bbcd645f60af0e6025a74043cd45f21e7
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BroadcastToOp : public OpKernel {
+ public:
+  explicit BroadcastToOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input_tensor = ctx->input(0);
+    const TensorShape& input_shape = input_tensor.shape();
+
+    const Tensor& shape_tensor = ctx->input(1);
+
+    TensorShape output_shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
+
+    const Device& d = ctx->eigen_device<Device>();
+    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape);
+  }
+};
+
+// As MakeShape is able to handle both DT_INT32 and DT_INT64,
+// no need to have TypeConstraint for `Tidx`
+#define REGISTER_KERNEL(type)                                           \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BroadcastTo").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      BroadcastToOp<CPUDevice, type>);
+
+TF_CALL_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+
+namespace functor {
+#define DECLARE_GPU_TEMPLATE(Type)                              \
+  template <>                                                   \
+  void BroadcastTo<GPUDevice, Type>::operator()(                \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
+      const TensorShape& output_shape, const Tensor& input,     \
+      const TensorShape& input_shape);                          \
+  extern template struct BroadcastTo<GPUDevice, Type>;
+
+TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
+#undef DECLARE_GPU_KERNEL
+}  // namespace functor
+
+#define REGISTER_KERNEL(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("BroadcastTo")            \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("shape"),      \
+                          BroadcastToOp<GPUDevice, type>);
+
+TF_CALL_GPU_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..608e9b6ac9c161ca2fb95897da371d355f0efe9f
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -0,0 +1,220 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+#define TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct BroadcastTo {
+  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
+                  const TensorShape &output_shape, const Tensor &input_tensor,
+                  const TensorShape &input_shape) {
+#define BROADCAST_SHAPE(broadcast, reshape, NDIMS, input_shape, output_shape) \
+  for (int i = 0; i < NDIMS; i++) {                                           \
+    OP_REQUIRES(ctx, (broadcast[i] % reshape[i] == 0),                        \
+                errors::InvalidArgument("invalid shape to broadcast from ",   \
+                                        input_shape.DebugString(), " to ",    \
+                                        output_shape.DebugString()));         \
+    broadcast[i] = broadcast[i] / reshape[i];                                 \
+  }
+
+    switch (output_shape.dims()) {
+      case 1: {
+        auto reshape = AsEigenDSizesWithPrefix<1>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<1>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 1, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 1>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 2: {
+        auto reshape = AsEigenDSizesWithPrefix<2>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<2>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 2, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 2>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 3: {
+        auto reshape = AsEigenDSizesWithPrefix<3>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<3>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 3, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 3>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 4: {
+        auto reshape = AsEigenDSizesWithPrefix<4>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<4>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 4, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 4>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 5: {
+        auto reshape = AsEigenDSizesWithPrefix<5>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<5>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 5, input_shape, output_shape);
+        auto output = output_tensor.tensor<T, 5>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 5: {
+            auto input = input_tensor.tensor<T, 5>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      default:
+        ctx->CtxFailure(errors::InvalidArgument(
+            "invalid shape to broadcast from ", input_shape.DebugString(),
+            " to ", output_shape.DebugString()));
+        break;
+    }
+  }
+
+ private:
+  template <int NDIMS>
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizesWithPrefix(
+      const TensorShape &shape) const {
+    Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+    for (int d = 0; d < NDIMS - shape.dims(); d++) {
+      dsizes[d] = 1;
+    }
+    for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
+      dsizes[d] = shape.dim_size(d - (NDIMS - shape.dims()));
+    }
+    return dsizes;
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
diff --git a/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..64595710853da0d6fc277c25f0700ab8fda6c526
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define INSTANTIATE_GPU_KERNEL(Type) \
+  template class functor::BroadcastTo<GPUDevice, Type>;
+TF_CALL_GPU_ALL_TYPES(INSTANTIATE_GPU_KERNEL);
+#undef INSTANTIATE_GPU_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index fd4e75d26f02dc75e13c8781049c904587d10afd..16d2e0e0a56d1f2f45a9394979b8cdcec1391154 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 3ae9f2ab4d9c102941927215441b4c02625387f0..382e5440e14954eec6e81fe7eabc2017706fe678 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -58,10 +58,14 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
   FN(arg0, arg1, std::complex<float>);       \
   FN(arg0, arg1, std::complex<double>)
 
-#define CURRY_TYPES3(FN, arg0, arg1)   \
-  CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
+#define CURRY_TYPES3_NO_BF16(FN, arg0, arg1) \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)       \
   FN(arg0, arg1, Eigen::half);
 
+#define CURRY_TYPES3(FN, arg0, arg1)   \
+  CURRY_TYPES3_NO_BF16(FN, arg0, arg1) \
+  FN(arg0, arg1, bfloat16);
+
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
   if (DataTypeToEnum<OUT>::value == dst_dtype) {                           \
     return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {      \
diff --git a/tensorflow/core/kernels/cast_op_impl_bfloat.cc b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
index a06f815899016a9f1267fefb504a6ed6684a0101..bfa7ba0d4770e7a4ac1493482d90b166a4fcd3a2 100644
--- a/tensorflow/core/kernels/cast_op_impl_bfloat.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
@@ -24,17 +24,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetCpuCastFromBfloat(DataType dst_dtype) {
-  if (dst_dtype == DT_FLOAT) {
-    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
-      int64 N = out->NumElements();
-      auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      auto work = [&inp, &out](int64 start, int64 end) {
-        BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
-                        out->flat<float>().data() + start, end - start);
-      };
-      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
-    };
-  }
+  CURRY_TYPES3(CAST_CASE, CPUDevice, bfloat16);
   return nullptr;
 }
 
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index 5cd63f2458d2a01ec3d2c1621e6f86ecc6f5347f..c5c7394b43c92069aec4a46c9a712da1f606f6a8 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -29,7 +29,7 @@ GetCpuCastFromBool(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromBool(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, bool);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, bool);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_complex128.cc b/tensorflow/core/kernels/cast_op_impl_complex128.cc
index c428679d7c4983fe8b25461bc8bd431076fc9226..52899d58cdcff2df7fca07d223cc060ba080be82 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex128.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex128.cc
@@ -29,7 +29,7 @@ GetCpuCastFromComplex128(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromComplex128(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, std::complex<double>);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<double>);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_complex64.cc b/tensorflow/core/kernels/cast_op_impl_complex64.cc
index 07b46551b2ea5dbf214d167f9a9469fa658d8f4b..617bda53d5822f67186088c0251caf9c108e6a7d 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex64.cc
@@ -29,7 +29,7 @@ GetCpuCastFromComplex64(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromComplex64(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, std::complex<float>);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<float>);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index 1203f066a2db366d38ae99c5ff1e1f385979a8af..7dc485ddad275d6fcdcc54506fabce2a90819645 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -29,7 +29,7 @@ GetCpuCastFromDouble(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromDouble(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, double);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, double);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 2ff9af21f2413b16e1d38a64594e0dcf89e14bcb..1c933914fde14987562b1d796ebf6621d7980b28 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -25,18 +25,6 @@ typedef Eigen::GpuDevice GPUDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetCpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, float);
-  if (dst_dtype == DT_BFLOAT16) {
-    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
-      int64 N = out->NumElements();
-      auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      auto work = [&inp, &out](int64 start, int64 end) {
-        FloatToBFloat16(inp.flat<float>().data() + start,
-                        out->flat<bfloat16>().data() + start, end - start);
-      };
-      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
-    };
-  }
-
   return nullptr;
 }
 
@@ -44,7 +32,6 @@ GetCpuCastFromFloat(DataType dst_dtype) {
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, GPUDevice, float);
-  CAST_CASE(GPUDevice, float, bfloat16);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_half.cc b/tensorflow/core/kernels/cast_op_impl_half.cc
index e89d4646d71c5938543f626ffb1bca1f84b6b66f..ef4b94e3263054f46bbe5b7c5487cc7b30990995 100644
--- a/tensorflow/core/kernels/cast_op_impl_half.cc
+++ b/tensorflow/core/kernels/cast_op_impl_half.cc
@@ -29,7 +29,7 @@ GetCpuCastFromHalf(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromHalf(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, Eigen::half);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, Eigen::half);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int16.cc b/tensorflow/core/kernels/cast_op_impl_int16.cc
index f12d852e957550e17521a4dc5b70d3807dacecd3..59360f744573803f44cf7c31d5acf836e580fdc4 100644
--- a/tensorflow/core/kernels/cast_op_impl_int16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int16.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt16(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt16(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int16);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int16);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index 2a4b27a12dcba71f904f980c13d82fbc0dfc3ae2..a867392fde1c4aa45960bd5a490c2664c0ba0005 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt32(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt32(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int32);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int32);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index 065defabbba6c926640b01eb68e4c0b20a8e241d..467a8f6c89b35ea1f1c8da1327f6d204b7e876b0 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt64(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt64(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int64);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int64);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int8.cc b/tensorflow/core/kernels/cast_op_impl_int8.cc
index 8d678c47335f6f985c8e93bb72f61c95fbda834b..21002a4321be4474fdd6f60e61afce539d2ea177 100644
--- a/tensorflow/core/kernels/cast_op_impl_int8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int8.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt8(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt8(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int8);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int8);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_uint16.cc b/tensorflow/core/kernels/cast_op_impl_uint16.cc
index c917aaf7bde6eccf9acb556152ee854c897143fa..cd829bae2a90af6daecf1a6f67be96cdb1854140 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint16.cc
@@ -29,7 +29,7 @@ GetCpuCastFromUint16(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromUint16(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, uint16);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint16);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_uint8.cc b/tensorflow/core/kernels/cast_op_impl_uint8.cc
index 377c8ca9536d69a49364e8f5f2b6ffb969ce234e..2d1a6f3a4edc72bfea53a2b95113d32e5b76c913 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint8.cc
@@ -29,7 +29,7 @@ GetCpuCastFromUint8(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromUint8(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, uint8);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint8);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index 057e209a71903ad24e2d4f757e4d2a3bc4357a76..7da9d28a3daf175e3cf6f2a667ea1213f83ab003 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -75,7 +75,8 @@ class CastOpTest : public OpsTestBase {
   TEST_CAST(in, int64);         \
   TEST_CAST(in, half);          \
   TEST_CAST(in, float);         \
-  TEST_CAST(in, double)
+  TEST_CAST(in, double);        \
+  TEST_CAST(in, bfloat16);
 
 TEST_ALL_CASTS_FROM(uint8)
 TEST_ALL_CASTS_FROM(uint16)
@@ -85,6 +86,7 @@ TEST_ALL_CASTS_FROM(int64)
 TEST_ALL_CASTS_FROM(half)
 TEST_ALL_CASTS_FROM(float)
 TEST_ALL_CASTS_FROM(double)
+TEST_ALL_CASTS_FROM(bfloat16)
 
 #undef TEST_ALL_CASTS_FROM
 #undef TEST_CAST
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
index 534527c6bdc9ab971cd4c6001dcef8ee59a13a8d..c3c0c50007601c015a677705d452bd5fba63467e 100644
--- a/tensorflow/core/kernels/check_numerics_op.cc
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
 #include <math.h>
 #include <algorithm>
 #include <numeric>
@@ -47,6 +49,8 @@ template <typename Device, typename T>
 class CheckNumericsOp;
 
 // Partial specialization for CPU
+// TODO(jeff,rmlarsen): We should make this variant be an AsyncOpKernel, as
+// was done for the GPU case below.
 template <typename T>
 class CheckNumericsOp<CPUDevice, T> : public OpKernel {
  public:
@@ -67,28 +71,32 @@ class CheckNumericsOp<CPUDevice, T> : public OpKernel {
     int fp_props =
         std::accumulate(data, data + size, 0, [](const int& x, const T& y) {
           int result = x;
-          if (Eigen::numext::isinf(y)) {
+          if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
+            // Do nothing: common case
+          } else if (Eigen::numext::isinf(y)) {
             result |= kInfBit;
           } else if (Eigen::numext::isnan(y)) {
             result |= kNaNBit;
           }
           return result;
         });
-    string status;
-    if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
-      status = "Inf and NaN";
-    } else {
-      if (fp_props & kInfBit) {
-        status = "Inf";
+    if (fp_props != 0) {
+      string status;
+      if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
+        status = "Inf and NaN";
+      } else {
+        if (fp_props & kInfBit) {
+          status = "Inf";
+        }
+        if (fp_props & kNaNBit) {
+          status = "NaN";
+        }
       }
-      if (fp_props & kNaNBit) {
-        status = "NaN";
+      if (!status.empty()) {
+        context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
+                                                   status, " values"));
       }
     }
-    if (!status.empty()) {
-      context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
-                                                 status, " values"));
-    }
   }
 
  private:
@@ -131,7 +139,7 @@ class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
     OP_REQUIRES_ASYNC(context, stream != nullptr,
                       errors::Internal("No GPU stream available."), done);
 
-    perftools::gputools::DeviceMemoryBase abnormal_detected_ptr(
+    se::DeviceMemoryBase abnormal_detected_ptr(
         abnormal_detected.flat<int>().data(),
         abnormal_detected.flat<int>().size());
     stream->ThenMemset32(&abnormal_detected_ptr, 0,
@@ -166,8 +174,8 @@ class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
     TensorReference abnormal_detected_ref(abnormal_detected);
     auto check_cb = [this, stream, abnormal_detected_ref,
                      abnormal_detected_host, context, done]() {
-      ::perftools::gputools::cuda::ScopedActivateExecutorContext
-          scoped_activation{stream->parent()};
+      se::cuda::ScopedActivateExecutorContext scoped_activation{
+          stream->parent()};
       auto abnormal_detected_host_flat = abnormal_detected_host.flat<int>();
       int is_nan = abnormal_detected_host_flat(0);
       int is_inf = abnormal_detected_host_flat(1);
@@ -213,6 +221,7 @@ class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
       Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       CheckNumericsOp<CPUDevice, T>);
 TF_CALL_half(REGISTER_CPU_KERNEL);
+TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5de41bac723ce2e62258c521a34d4775426643bd
--- /dev/null
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -0,0 +1,266 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace {
+class CollectiveOpKernel : public AsyncOpKernel {
+ public:
+  explicit CollectiveOpKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {}
+
+  // A string encoding instance, frame and iter to be handed off to
+  // the implementation for use in generating RecvBuf keys.
+  string GetCollectiveKey(OpKernelContext* c) {
+    return strings::StrCat(col_params_.instance.instance_key, ":",
+                           c->frame_iter().frame_id, ":",
+                           c->frame_iter().iter_id);
+  }
+
+  // Returns false if calling invocation of ComputeAsync should return
+  // immediately.
+  bool CanProceedWithCompute(OpKernelContext* c, CollectiveExecutor* col_exec,
+                             const DoneCallback& done) {
+    if (col_params_.group.group_size >
+        col_params_.instance.device_names.size()) {
+      // This is the first invocation: Finish initializing col_params_.
+      // Call in a blockable thread because it's not guaranteed that
+      // this call cannot block.
+      c->env()->SchedClosure([this, c, done, col_exec]() {
+        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
+                                      c->cancellation_manager(),
+                                      [this, c, done](const Status& s) {
+                                        if (s.ok()) {
+                                          ComputeAsync(c, done);
+                                        } else {
+                                          c->SetStatus(s);
+                                          done();
+                                        }
+                                      });
+      });
+      return false;
+    }
+    return true;
+  }
+
+  CollectiveParams col_params_;
+};
+
+class CollectiveReduceOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("subdiv_offsets",
+                      &col_params_.instance.impl_details.subdiv_offsets));
+    string merge_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("merge_op", &merge_op_name));
+    OP_REQUIRES(c, merge_op_name == "Add" || merge_op_name == "Mul",
+                errors::InvalidArgument(
+                    "merge_op must be one of {\"Add\", \"Mul\"} but got ",
+                    merge_op_name));
+    string final_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("final_op", &final_op_name));
+    OP_REQUIRES(c, final_op_name == "Id" || final_op_name == "Div",
+                errors::InvalidArgument(
+                    "final_op must be one of {\"Id\", \"Div\"} but got ",
+                    final_op_name));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
+                                       merge_op_name, ",", final_op_name, ")");
+    col_params_.group.device_type = c->device_type();
+
+    // Find the OpKernels by name, type and device type.
+    NodeDef sub_node;
+    // The merge_op takes two inputs
+    sub_node.add_input(real_node.input(0));
+    sub_node.add_input(real_node.input(0));
+    sub_node.set_device(real_node.device());
+    SetAttrValue(col_params_.instance.data_type,
+                 &(*sub_node.mutable_attr())["T"]);
+    col_params_.merge_op = BuildOpKernel(c, merge_op_name, &sub_node);
+    col_params_.final_op = BuildOpKernel(c, final_op_name, &sub_node);
+  }
+
+  std::unique_ptr<OpKernel> BuildOpKernel(OpKernelConstruction* c,
+                                          const string& name,
+                                          NodeDef* sub_node) {
+    std::unique_ptr<OpKernel> k;
+    if (name.empty() || name == "Id") return k;
+    sub_node->set_name(name);
+    sub_node->set_op(name);
+    Status status;
+    k = CreateOpKernel(c->device_type(), c->device(),
+                       c->device()->GetAllocator(AllocatorAttributes()),
+                       *sub_node, c->graph_def_version(), &status);
+    if (!status.ok()) {
+      c->CtxFailureWithWarning(errors::Internal("Failed to build OpKernel for ",
+                                                name, " : ",
+                                                status.error_message()));
+    }
+    return k;
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // Allocate the output tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->forward_input_or_allocate_output(
+                             {0}, 0, c->input(0).shape(), &output),
+                         done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveReduceOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_CPU),
+                        CollectiveReduceOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_GPU),
+                        CollectiveReduceOpKernel);
+
+class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastSendOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = true;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    OP_REQUIRES_ASYNC(
+        c, shape_.IsSameSize(c->input(0).shape()),
+        errors::Internal("Declared shape of op ", col_params_.name,
+                         " does not match shape of input"),
+        done);
+    // Allocate the output Tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(
+        c, c->forward_input_or_allocate_output({0}, 0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastSendOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_CPU),
+                        CollectiveBcastSendOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_GPU),
+                        CollectiveBcastSendOpKernel);
+
+class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastRecvOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = false;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // No input, so must allocate output.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastRecvOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_CPU),
+                        CollectiveBcastRecvOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_GPU),
+                        CollectiveBcastRecvOpKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index d8643c0b2fb2633f6b640b4f54dc2f8c92da654d..93e392d3032405ea848bd2f147653c9a5c7a1818 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -118,6 +118,7 @@ TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
+TF_CALL_uint8(REGISTER);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 0f7adaf24a8eff76c27109eb91389dffdca31380..a561d918bd36f711d1b813dfb533ec6d690af8ee 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -202,6 +202,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
 
@@ -209,6 +210,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
 
@@ -216,6 +218,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
 
@@ -223,6 +226,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
 
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 7011550f7e161c9727b8d31eff0917964b09044e..a87b63f913c279d35f625b096bb7ac947cb9230b 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -53,17 +53,38 @@ class ConcatBaseOp : public OpKernel {
   void Compute(OpKernelContext* c) override {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
-        AxisArgName == NAME_IS_AXIS
-            ? "axis"
-            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+        AxisArgName == NAME_IS_AXIS ? "axis" : AxisArgName == NAME_IS_CONCAT_DIM
+                                                   ? "concat_dim"
+                                                   : "<invalid>";
     OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
     OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
                 errors::InvalidArgument(
                     axis_attribute_name,
                     " tensor should be a scalar integer, but got shape ",
                     concat_dim_tensor->shape().DebugString()));
-    const int32 concat_dim =
-        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    int64 concat_dim;
+    // In case of ConcatV2, "axis" could be int32 or int64
+    if (AxisArgName == NAME_IS_AXIS) {
+      OP_REQUIRES(
+          c, (concat_dim_tensor->dtype() == DT_INT32 ||
+              concat_dim_tensor->dtype() == DT_INT64),
+          errors::InvalidArgument(axis_attribute_name,
+                                  " tensor should be int32 or int64, but got ",
+                                  concat_dim_tensor->dtype()));
+    } else {
+      OP_REQUIRES(c, (concat_dim_tensor->dtype() == DT_INT32),
+                  errors::InvalidArgument(axis_attribute_name,
+                                          " tensor should be int32, but got ",
+                                          concat_dim_tensor->dtype()));
+    }
+    if (concat_dim_tensor->dtype() == DT_INT32) {
+      concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    } else {
+      concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor->scalar<int64>()());
+    }
+
     OpInputList values;
     OP_REQUIRES_OK(c, c->input_list("values", &values));
     const int N = values.size();
@@ -154,17 +175,16 @@ using ConcatOp = ConcatBaseOp<Device, T, NAME_IS_CONCAT_DIM>;
 template <typename Device, typename T>
 using ConcatV2Op = ConcatBaseOp<Device, T, NAME_IS_AXIS>;
 
-#define REGISTER_CONCAT(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
-                              .Device(DEVICE_CPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .HostMemory("concat_dim"),     \
-                          ConcatOp<CPUDevice, type>)         \
-  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
-                              .Device(DEVICE_CPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("axis"),           \
+#define REGISTER_CONCAT(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                 \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("concat_dim"), \
+                          ConcatOp<CPUDevice, type>)     \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")               \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
                           ConcatV2Op<CPUDevice, type>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
@@ -178,21 +198,21 @@ REGISTER_CONCAT(qint32);
 
 #if GOOGLE_CUDA
 
-#define REGISTER_GPU(type)                                   \
-  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
-                              .Device(DEVICE_GPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .HostMemory("concat_dim"),     \
-                          ConcatOp<GPUDevice, type>)         \
-  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
-                              .Device(DEVICE_GPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("axis"),           \
+#define REGISTER_GPU(type)                               \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                 \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("concat_dim"), \
+                          ConcatOp<GPUDevice, type>)     \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")               \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
                           ConcatV2Op<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
+TF_CALL_uint8(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
@@ -212,7 +232,6 @@ REGISTER_KERNEL_BUILDER(Name("Concat")
 REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Tidx")
                             .HostMemory("values")
                             .HostMemory("axis")
                             .HostMemory("output"),
@@ -221,17 +240,16 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                                  \
-  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
-                              .Device(DEVICE_SYCL)           \
-                              .TypeConstraint<type>("T")     \
-                              .HostMemory("concat_dim"),     \
-                          ConcatOp<SYCLDevice, type>)        \
-  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
-                              .Device(DEVICE_SYCL)           \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("axis"),           \
+#define REGISTER_SYCL(type)                              \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                 \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("concat_dim"), \
+                          ConcatOp<SYCLDevice, type>)    \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")               \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
                           ConcatV2Op<SYCLDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
@@ -246,7 +264,6 @@ REGISTER_KERNEL_BUILDER(Name("Concat")
 REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Tidx")
                             .HostMemory("values")
                             .HostMemory("axis")
                             .HostMemory("output"),
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index e3ba8ae9f691c8ec9be79952d7f97801552b2a56..39b44b2fcc8eb8336bfcf721919201536ed56133 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -78,6 +78,9 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
 BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 
+static void BM_ConcatDim1uint8(int iters, int dim2) {
+  ConcatHelper<uint8>(iters, 1, dim2);
+}
 static void BM_ConcatDim1int16(int iters, int dim2) {
   ConcatHelper<int16>(iters, 1, dim2);
 }
@@ -85,6 +88,7 @@ static void BM_ConcatDim1bfloat16(int iters, int dim2) {
   ConcatHelper<bfloat16>(iters, 1, dim2);
 }
 
+BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index fdb03a5aae8bd9dbe180e9b722fc5847c740801b..fe1a1ba5a306422d410a7b4646078b7b5e4c31eb 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -105,7 +105,12 @@ REGISTER_KERNEL(GPU, int8);
 REGISTER_KERNEL(GPU, qint8);
 REGISTER_KERNEL(GPU, uint16);
 REGISTER_KERNEL(GPU, int16);
+REGISTER_KERNEL(GPU, qint16);
+REGISTER_KERNEL(GPU, quint16);
+REGISTER_KERNEL(GPU, uint32);
+REGISTER_KERNEL(GPU, qint32);
 REGISTER_KERNEL(GPU, int64);
+REGISTER_KERNEL(GPU, uint64);
 REGISTER_KERNEL(GPU, complex64);
 REGISTER_KERNEL(GPU, complex128);
 REGISTER_KERNEL(GPU, bool);
@@ -122,9 +127,15 @@ REGISTER_SYCL_KERNEL(SYCL, float);
 REGISTER_SYCL_KERNEL(SYCL, double);
 REGISTER_SYCL_KERNEL(SYCL, uint8);
 REGISTER_SYCL_KERNEL(SYCL, int8);
+REGISTER_SYCL_KERNEL(SYCL, qint8);
 REGISTER_SYCL_KERNEL(SYCL, uint16);
 REGISTER_SYCL_KERNEL(SYCL, int16);
+REGISTER_SYCL_KERNEL(SYCL, qint16);
+REGISTER_SYCL_KERNEL(SYCL, quint16);
+REGISTER_SYCL_KERNEL(SYCL, uint32);
+REGISTER_SYCL_KERNEL(SYCL, qint32);
 REGISTER_SYCL_KERNEL(SYCL, int64);
+REGISTER_SYCL_KERNEL(SYCL, uint64);
 REGISTER_SYCL_KERNEL(SYCL, bool);
 #undef REGISTER_SYCL_KERNEL
 #endif
@@ -247,13 +258,15 @@ REGISTER_KERNEL(GPU, Eigen::half);
 REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
 REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, complex64);
+REGISTER_KERNEL(GPU, complex128);
 REGISTER_KERNEL(GPU, uint8);
 REGISTER_KERNEL(GPU, int8);
 REGISTER_KERNEL(GPU, uint16);
 REGISTER_KERNEL(GPU, int16);
 REGISTER_KERNEL(GPU, int64);
 REGISTER_KERNEL(GPU, bool);
-// Currently we do not support filling strings and complex64 on GPU
+// Currently we do not support filling strings on GPU
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index b8a5ae6a08e5c22fb5d69112b216b3c342b1bb1a..ef1e73e5ab1cbc28b31d1d47d89c6537ffebfdc1 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/fill_functor.h"
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 #include "tensorflow/core/kernels/xsmm_conv2d.h"
 #endif
 #include "tensorflow/core/kernels/ops_util.h"
@@ -106,7 +106,7 @@ struct LaunchConv2DBackpropFilterOp<CPUDevice, T> {
   }
 };
 
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 template <typename Device, class T>
 struct LaunchXsmmBackwardFilter {
   bool operator()(OpKernelContext* context, const Device& d,
@@ -243,7 +243,8 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
       return;
     }
 
-#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
+    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
     OP_REQUIRES_OK(
@@ -371,7 +372,8 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
             dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
             dims.spatial_dims[1].stride, padding_,
             &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
+    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
     if (pad_left == pad_right && pad_top == pad_bottom) {
       if (LaunchXsmmBackwardFilter<Device, T>()(
               context, context->eigen_device<Device>(), input.tensor<T, 4>(),
@@ -518,6 +520,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 // GPU definitions.
@@ -529,7 +532,7 @@ struct ConvBackwardFilterAutoTuneGroup {
   static string name() { return "ConvBwdFilter"; }
 };
 typedef AutoTuneSingleton<ConvBackwardFilterAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConvBwdFilter;
 
 // Backprop for filter.
@@ -633,9 +636,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     const Tensor& out_backprop, const Tensor& input, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* filter_backprop, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
 
   std::vector<int32> dilations(4, 1);
   dilations[GetTensorDimIndex(data_format, 'H')] = row_dilation;
@@ -718,9 +721,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
 
     bool blas_launch_status =
         stream
-            ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                           perftools::gputools::blas::Transpose::kTranspose, n,
-                           m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+            ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                           se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                           a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
             .ok();
     if (!blas_launch_status) {
       ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -748,9 +751,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
 
     bool blas_launch_status =
         stream
-            ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                           perftools::gputools::blas::Transpose::kTranspose, n,
-                           m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
+            ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                           se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                           b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
             .ok();
     if (!blas_launch_status) {
       ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -784,24 +787,24 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input, data_format, 'H'))
       .set_width(GetTensorDim(compatible_input, data_format, 'W'))
       .set_feature_map_count(dims.in_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(dims.batch_size)
       .set_height(dims.spatial_dims[0].output_size)
       .set_width(dims.spatial_dims[1].output_size)
       .set_feature_map_count(dims.out_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
       .set_input_filter_width(dims.spatial_dims[1].filter_size)
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
@@ -909,7 +912,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
                                 conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
@@ -1015,11 +1019,17 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("filter_sizes"),
+                        Conv2DSlowBackpropFilterOp<GPUDevice, double>);
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index b87c7899c00ab79c60bdbd85ce28399d103d271d..35f2676023afb71fed0e3a8d060213ae38b9f45c 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/conv_2d.h"
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 #include "tensorflow/core/kernels/xsmm_conv2d.h"
 #endif
 #include "tensorflow/core/kernels/ops_util.h"
@@ -111,7 +111,7 @@ struct LaunchConv2DBackpropInputOp<CPUDevice, T> {
   }
 };
 
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 template <typename Device, class T>
 struct LaunchXsmmBackwardInputConvolution {
   bool operator()(OpKernelContext* context, const Device& d,
@@ -246,7 +246,8 @@ class Conv2DFastBackpropInputOp : public OpKernel {
       return;
     }
 
-#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
+    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
     OP_REQUIRES_OK(
@@ -363,7 +364,8 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
 
 // TODO(andydavis) Consider moving code shared with
 // Conv2DCustomBackpropFilterOp into a shared helper function.
-#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
+    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
     OP_REQUIRES_OK(
@@ -590,6 +592,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 // GPU definitions.
@@ -601,7 +604,7 @@ struct ConvBackwardDataAutoTuneGroup {
   static string name() { return "ConvBwdData"; }
 };
 typedef AutoTuneSingleton<ConvBackwardDataAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConvBwdData;
 
 // Backprop for input.
@@ -702,9 +705,9 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     const Tensor& out_backprop, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* in_backprop, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
 
   std::vector<int32> strides(4, 1);
   std::vector<int32> dilations(4, 1);
@@ -775,8 +778,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                 in_backprop->template flat<T>().size());
 
-    auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto transpose = se::blas::Transpose::kTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
 
     bool blas_launch_status =
         stream
@@ -807,8 +810,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                 in_backprop->template flat<T>().size());
 
-    auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto transpose = se::blas::Transpose::kTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
 
     bool blas_launch_status =
         stream
@@ -838,24 +841,24 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
       .set_width(GetTensorDim(compatible_input_shape, data_format, 'W'))
       .set_feature_map_count(dims.in_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(dims.batch_size)
       .set_height(dims.spatial_dims[0].output_size)
       .set_width(dims.spatial_dims[1].output_size)
       .set_feature_map_count(dims.out_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
       .set_input_filter_width(dims.spatial_dims[1].filter_size)
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
@@ -958,7 +961,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
                                 conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
@@ -1088,11 +1092,17 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("input_sizes"),
+                        Conv2DSlowBackpropInputOp<GPUDevice, double>);
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 3650ab53b2533e3c95a764ead2d1318c4006c9e7..9edc6d416e33d66514f60ec6dac0619776a7ea15 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -35,7 +35,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
-using perftools::gputools::dnn::DimIndex;
+using stream_executor::dnn::DimIndex;
 #endif
 
 namespace tensorflow {
@@ -79,13 +79,18 @@ typedef Eigen::GpuDevice GPUDevice;
       context, out_depth == GetTensorDim(out_backprop, data_format_, 'C'),     \
       errors::InvalidArgument(                                                 \
           label, ": filter and out_backprop must have the same out_depth"));   \
+  const std::array<int64, 3> dilations = {                                     \
+      {GetTensorDim(dilation_, data_format_, '0'),                             \
+       GetTensorDim(dilation_, data_format_, '1'),                             \
+       GetTensorDim(dilation_, data_format_, '2')}};                           \
   const std::array<int64, 3> strides = {                                       \
       {GetTensorDim(stride_, data_format_, '0'),                               \
        GetTensorDim(stride_, data_format_, '1'),                               \
        GetTensorDim(stride_, data_format_, '2')}};                             \
   std::array<int64, 3> out, padding;                                           \
-  OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides,    \
-                                          padding_, &out, &padding));          \
+  OP_REQUIRES_OK(                                                              \
+      context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,  \
+                                 padding_, &out, &padding));                   \
   OP_REQUIRES(context, output_planes == out[0],                                \
               errors::InvalidArgument(                                         \
                   label,                                                       \
@@ -151,6 +156,26 @@ class Conv3DBackpropInputOp : public OpKernel {
               "Conv3DBackpropInputOpV2 only supports NDHWC on the CPU."));
     }
 
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+
+    // TODO(yangzihao): Add CPU version of dilated conv 3D.
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, '0') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '1') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '2') == 1),
+                errors::InvalidArgument(
+                    "Current CPU implementation does not yet support "
+                    "dilation rates larger than 1."));
+
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -223,6 +248,7 @@ class Conv3DBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -261,6 +287,26 @@ class Conv3DBackpropFilterOp : public OpKernel {
               "Conv3DBackpropFilterOpV2 only supports NDHWC on the CPU."));
     }
 
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+
+    // TODO(yangzihao): Add CPU version of dilated conv 3D.
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, '0') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '1') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '2') == 1),
+                errors::InvalidArgument(
+                    "Current CPU implementation does not yet support "
+                    "dilation rates larger than 1."));
+
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -370,6 +416,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -421,7 +468,7 @@ struct Conv3dBackwardDataAutoTuneGroup {
   static string name() { return "Conv3dBwdData"; }
 };
 typedef AutoTuneSingleton<Conv3dBackwardDataAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
 
     AutoTuneConv3dBwdData;
 template <typename T>
@@ -438,6 +485,22 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
     }
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -448,6 +511,12 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
          GetTensorDim(stride_, data_format_, 'N') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -471,6 +540,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
     if (filter_size[0] == 1 && filter_size[1] == 1 && filter_size[2] == 1 &&
+        dilation_[0] == 1 && dilation_[1] == 1 && dilation_[2] == 1 &&
         stride_[0] == 1 && stride_[1] == 1 && stride_[2] == 1 &&
         data_format_ == FORMAT_NHWC) {
       const uint64 m = batch * input_size[0] * input_size[1] * input_size[2];
@@ -484,8 +554,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                   in_backprop->template flat<T>().size());
 
-      auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto transpose = se::blas::Transpose::kTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
 
       bool blas_launch_status =
           stream
@@ -512,8 +582,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                   in_backprop->template flat<T>().size());
 
-      auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto transpose = se::blas::Transpose::kTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
 
       bool blas_launch_status =
           stream
@@ -559,28 +629,31 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0)
         << "Negative paddings: (" << padding_rows << ", " << padding_cols
         << ", " << padding_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, compatible_input_shape.dim_size(4))
         .set_spatial_dim(DimIndex::Y, compatible_input_shape.dim_size(3))
         .set_spatial_dim(DimIndex::Z, compatible_input_shape.dim_size(2))
         .set_feature_map_count(in_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, output_cols)
         .set_spatial_dim(DimIndex::Y, output_rows)
         .set_spatial_dim(DimIndex::Z, output_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_size[2])
         .set_spatial_dim(DimIndex::Y, filter_size[1])
         .set_spatial_dim(DimIndex::Z, filter_size[0])
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    se::dnn::ConvolutionDescriptor conv_desc(3);
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, padding_cols / 2)
@@ -645,24 +718,24 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
-        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
-        // conv is supported.
-        /*dilation=*/{{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
     AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find(
                                    conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
@@ -753,6 +826,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -765,7 +839,7 @@ struct Conv3dBackwardFilterAutoTuneGroup {
   static string name() { return "Conv3dBwdFilter"; }
 };
 typedef AutoTuneSingleton<Conv3dBackwardFilterAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv3dBwdFilter;
 
 template <typename T>
@@ -782,6 +856,22 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
     }
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -792,6 +882,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
          GetTensorDim(stride_, data_format_, 'N') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -818,6 +914,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
     if (filter_size[1] == 1 && filter_size[2] == 1 && filter_size[0] == 1 &&
+        dilations[2] == 1 && dilations[1] == 1 && dilations[0] == 1 &&
         strides[2] == 1 && strides[1] == 1 && strides[0] == 1 &&
         data_format_ == FORMAT_NHWC) {
       const uint64 m = in_depth;
@@ -844,9 +941,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
 
       bool blas_launch_status =
           stream
-              ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                             perftools::gputools::blas::Transpose::kTranspose,
-                             n, m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+              ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                             se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                             a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
               .ok();
       if (!blas_launch_status) {
         context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -870,9 +967,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
 
       bool blas_launch_status =
           stream
-              ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                             perftools::gputools::blas::Transpose::kTranspose,
-                             n, m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
+              ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                             se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                             b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
               .ok();
       if (!blas_launch_status) {
         context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -917,7 +1014,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0)
         << "Negative paddings: (" << padding_rows << ", " << padding_cols
         << ", " << padding_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X,
                          GetTensorDim(compatible_input, data_format_, '2'))
@@ -926,22 +1023,25 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         .set_spatial_dim(DimIndex::Z,
                          GetTensorDim(compatible_input, data_format_, '0'))
         .set_feature_map_count(in_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, output_cols)
         .set_spatial_dim(DimIndex::Y, output_rows)
         .set_spatial_dim(DimIndex::Z, output_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_size[2])
         .set_spatial_dim(DimIndex::Y, filter_size[1])
         .set_spatial_dim(DimIndex::Z, filter_size[0])
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    se::dnn::ConvolutionDescriptor conv_desc(3);
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, padding_cols / 2)
@@ -1014,22 +1114,24 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
-        {{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
     AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find(
                                    conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
@@ -1098,6 +1200,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 2b81e14f95b1b3f4c04e02d50180a9adda9e51e0..c6d36b40fe7129b58618798bd375bf0682894464 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -32,7 +32,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/deep_conv2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 #include "tensorflow/core/kernels/xsmm_conv2d.h"
 #endif
 #include "tensorflow/core/lib/core/errors.h"
@@ -185,7 +185,7 @@ class LaunchDeepConvOp<CPUDevice, float> {
   }
 };
 
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 template <typename Device, typename T>
 class LaunchXsmmConvOp {
  public:
@@ -401,7 +401,7 @@ class Conv2DOp : public BinaryOp<T> {
       return;
     }
 
-#ifdef TENSORFLOW_USE_LIBXSMM
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
     if (LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
@@ -446,10 +446,11 @@ class Conv2DOp : public BinaryOp<T> {
 #if !defined(USE_GEMM_FOR_CONV)
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 #endif  // USE_GEMM_FOR_CONV
 
 // To be used inside depthwise_conv_op.cc.
-template class LaunchConv2DOp<CPUDevice, float>;
+template struct LaunchConv2DOp<CPUDevice, float>;
 
 #if GOOGLE_CUDA
 int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
@@ -474,7 +475,7 @@ struct ConvAutoTuneGroup {
   static string name() { return "Conv"; }
 };
 typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv;
 
 template <typename T>
@@ -483,9 +484,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     const Tensor& input_param, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* output, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
@@ -513,7 +514,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                 output->template flat<T>().size());
 
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
     bool blas_launch_status =
         stream
             ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
@@ -542,7 +543,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                 output->template flat<T>().size());
 
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
     bool blas_launch_status =
         stream
             ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
@@ -628,24 +629,24 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
       .set_height(in_rows)
       .set_width(in_cols)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(out_batch)
       .set_height(out_rows)
       .set_width(out_cols)
       .set_feature_map_count(out_depths)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(filter.dim_size(0))
       .set_input_filter_width(filter.dim_size(1))
       .set_input_feature_map_count(filter.dim_size(2))
       .set_output_feature_map_count(filter.dim_size(3));
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(row_dilation)
       .set_horizontal_dilation_rate(col_dilation)
       .set_vertical_filter_stride(row_stride)
@@ -709,7 +710,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
@@ -810,6 +812,7 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);     \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
@@ -822,6 +825,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv2DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    Conv2DOp<GPUDevice, double>);
 
 // To be used inside depthwise_conv_op.cc.
 template class LaunchConv2DOp<GPUDevice, float>;
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 21c84b2a0ed15eaada88e308e1761dcb58cb07b3..9ec16be67d8c0d6ea228898c14dbb575dd78cbe9 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -34,7 +34,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
-using perftools::gputools::dnn::DimIndex;
+using stream_executor::dnn::DimIndex;
 #endif
 
 namespace tensorflow {
@@ -49,12 +49,18 @@ template <typename T>
 struct LaunchConvOp<CPUDevice, T> {
   static void launch(OpKernelContext* context, bool cudnn_use_autotune,
                      const Tensor& input, const Tensor& filter,
+                     const std::array<int64, 3>& dilations,
                      const std::array<int64, 3>& strides, const Padding padding,
                      TensorFormat data_format, Tensor* output) {
     OP_REQUIRES(context, data_format == FORMAT_NHWC,
                 errors::InvalidArgument("CPU implementation of Conv3D "
                                         "currently only supports the NHWC "
                                         "tensor format."));
+    OP_REQUIRES(context,
+                dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1,
+                errors::InvalidArgument("CPU implementation of Conv3D "
+                                        "currently only supports dilated rates "
+                                        "of 1."));
     functor::CuboidConvolution<CPUDevice, T>()(
         context->eigen_device<CPUDevice>(), output->tensor<T, 5>(),
         input.tensor<T, 5>(), filter.tensor<T, 5>(), strides[2], strides[1],
@@ -80,6 +86,28 @@ class Conv3DOp : public BinaryOp<T> {
          GetTensorDim(stride_, data_format_, 'C') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'N') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'C') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -115,13 +143,18 @@ class Conv3DOp : public BinaryOp<T> {
          GetTensorDim(input, data_format_, '2')}};
     std::array<int64, 3> filter_size = {
         {filter.dim_size(0), filter.dim_size(1), filter.dim_size(2)}};
+    std::array<int64, 3> dilations = {
+        {GetTensorDim(dilation_, data_format_, '0'),
+         GetTensorDim(dilation_, data_format_, '1'),
+         GetTensorDim(dilation_, data_format_, '2')}};
     std::array<int64, 3> strides = {{GetTensorDim(stride_, data_format_, '0'),
                                      GetTensorDim(stride_, data_format_, '1'),
                                      GetTensorDim(stride_, data_format_, '2')}};
     std::array<int64, 3> out, padding;
 
-    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides,
-                                            padding_, &out, &padding));
+    OP_REQUIRES_OK(
+        context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,
+                                   padding_, &out, &padding));
     TensorShape out_shape = ShapeFromFormat(
         data_format_, in_batch, {{out[0], out[1], out[2]}}, out_depth);
     Tensor* output;
@@ -131,10 +164,12 @@ class Conv3DOp : public BinaryOp<T> {
     if (out_shape.num_elements() == 0) return;
 
     LaunchConvOp<Device, T>::launch(context, cudnn_use_autotune_, input, filter,
-                                    strides, padding_, data_format_, output);
+                                    dilations, strides, padding_, data_format_,
+                                    output);
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -157,7 +192,7 @@ struct Conv3dAutoTuneGroup {
   static string name() { return "Conv3d"; }
 };
 typedef AutoTuneSingleton<Conv3dAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv3d;
 
 // TODO(mjanusz): Share logic with 2d implementation as much as possible.
@@ -165,6 +200,7 @@ template <typename T>
 struct LaunchConvOp<GPUDevice, T> {
   static void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
                      const Tensor& input_param, const Tensor& filter,
+                     const std::array<int64, 3>& dilations,
                      const std::array<int64, 3>& strides, const Padding padding,
                      TensorFormat data_format, Tensor* output) {
     auto* stream = ctx->op_device_context()->stream();
@@ -199,6 +235,7 @@ struct LaunchConvOp<GPUDevice, T> {
 
     // NOTE: This only works in NHWC.
     if (filter_planes == 1 && filter_rows == 1 && filter_cols == 1 &&
+        dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 &&
         strides[0] == 1 && strides[1] == 1 && strides[2] == 1 &&
         data_format == FORMAT_NHWC) {
       // 1x1 filter, so call cublas directly.
@@ -213,7 +250,7 @@ struct LaunchConvOp<GPUDevice, T> {
       auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                   output->template flat<T>().size());
 
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
       bool blas_launch_status =
           stream
               ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
@@ -240,7 +277,7 @@ struct LaunchConvOp<GPUDevice, T> {
       auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                   output->template flat<T>().size());
 
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
       bool blas_launch_status =
           stream
               ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
@@ -309,28 +346,31 @@ struct LaunchConvOp<GPUDevice, T> {
     CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0)
         << "Negative paddings: (" << pad_rows << ", " << pad_cols << ", "
         << pad_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(in_batch)
         .set_feature_map_count(in_depth)
         .set_spatial_dim(DimIndex::X, in_cols)
         .set_spatial_dim(DimIndex::Y, in_rows)
         .set_spatial_dim(DimIndex::Z, in_planes)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(in_batch)
         .set_spatial_dim(DimIndex::X, out_cols)
         .set_spatial_dim(DimIndex::Y, out_rows)
         .set_spatial_dim(DimIndex::Z, out_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_cols)
         .set_spatial_dim(DimIndex::Y, filter_rows)
         .set_spatial_dim(DimIndex::Z, filter_planes)
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    se::dnn::ConvolutionDescriptor conv_desc(3);
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, pad_cols / 2)
@@ -377,18 +417,16 @@ struct LaunchConvOp<GPUDevice, T> {
         {{in_planes, in_rows, in_cols}},
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
-        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
-        // conv is supported.
-        /*dilation=*/{{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
 
     AlgorithmConfig algorithm_config;
 
@@ -396,7 +434,9 @@ struct LaunchConvOp<GPUDevice, T> {
                                   conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index f0085be3a53b71af85d4c5f4bbcc6b07cd982ca8..d2c8020bb629c7c8ccbc3e06bdbe4f6af25833f2 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -36,25 +36,23 @@ int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
 // A class to provide scratch-space allocator for Stream-Executor Cudnn
 // callback. TensorFlow is responsible for releasing the temporary buffers after
 // the kernel finishes.
-class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
+class CudnnScratchAllocator : public se::ScratchAllocator {
  public:
   virtual ~CudnnScratchAllocator() {}
   CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
   }
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override {
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size < 0) {
-      return perftools::gputools::port::Status{
-          perftools::gputools::port::error::INVALID_ARGUMENT,
-          "Requested negative byte size!"};
+      return se::port::Status{se::port::error::INVALID_ARGUMENT,
+                              "Requested negative byte size!"};
     }
     if (byte_size > memory_limit_) {
-      return perftools::gputools::port::StatusOr<
-          perftools::gputools::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.no_retry_on_failure = true;
@@ -62,15 +60,13 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<
-          perftools::gputools::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return perftools::gputools::port::StatusOr<
-        perftools::gputools::DeviceMemory<uint8>>(
+    return se::port::StatusOr<se::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
@@ -137,20 +133,17 @@ class ConvParameters {
     // clang-format on
   }
 
-  // TODO(yangzihao): The purpose of this function is to disable winograd
-  // nonfused conv algorithm for certain input parameters so as to avoid a bug
-  // in cuDNNv5 and cuDNNv6. Remove this once switch to cuDNNv7.
+  // The purpose of this function is to disable winograd nonfused conv algorithm
+  // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
   template <typename T>
-  bool ShouldIncludeWinogradNonfusedAlgo() const {
-    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
-                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
-                       sizeof(T);
-    int64 threshold = 1LL << 31;
-    if (total_size >= threshold) {
-      return false;
-    } else {
+  bool ShouldIncludeWinogradNonfusedAlgo(
+      se::StreamExecutor* stream_exec) const {
+    // Skip this check for cuDNN 7 and newer.
+    auto version = stream_exec->AsDnn()->GetVersion();
+    if (version.ok() && version.ValueOrDie().major_version() >= 7) {
       return true;
     }
+    return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
   }
 
  protected:
@@ -166,6 +159,21 @@ class ConvParameters {
   uint64 hash_code_;
 
  private:
+  friend struct ConvParametersPeer;  // For testing purposes.
+
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
+    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
+                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
+                       sizeof(T);
+    int64 threshold = 1LL << 31;
+    if (total_size >= threshold) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
   int64 batch_;
   int64 in_depths_;
   int64 out_depths_;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
index b5dd26a9e47578619945da21c85b4c5b40a55132..52859af950e3c346536acd246bafde830b405ee5 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
@@ -25,6 +25,9 @@ limitations under the License.
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4, int>;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4,
+                                              Eigen::DenseIndex>;
 template struct functor::InflatePadAndShuffle<GPUDevice, float, 4, int>;
 template struct functor::InflatePadAndShuffle<GPUDevice, float, 4,
                                               Eigen::DenseIndex>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index a376534badc73065e3ec01972dde85da7bbdb0f8..2503b475dc10e631863e06b1e4d6931928fb4321 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -1039,9 +1039,11 @@ template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
                                                      /*conjugate=*/true>;
 
 // For 2d ops.
+template struct functor::TransformFilter<GPUDevice, double, int, 4>;
 template struct functor::TransformFilter<GPUDevice, float, int, 4>;
 template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
 
+template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
 template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
 template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
 
@@ -1054,6 +1056,7 @@ template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
 template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
 
 template struct functor::PadInput<GPUDevice, int, int, 4>;
+template struct functor::PadInput<GPUDevice, double, int, 4>;
 template struct functor::PadInput<GPUDevice, float, int, 4>;
 template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
 
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 666bca265c95febf3753e71bf010a7caf95c0541..8afe6a2cbdf9dd551aac9b7c5b590c7267f849bb 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -22,20 +22,28 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
 
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
-
 namespace tensorflow {
 
 #if GOOGLE_CUDA
 
+struct ConvParametersPeer {
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() {
+    return params.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
+  }
+
+  ConvParameters params;
+};
+
 TEST(ConvParameters, WinogradNonfusedAlgoSize) {
-  ConvParameters conv_params_small = {
+  ConvParametersPeer conv_params_small = {{
       1,         // batch
       32,        // in_depths
       {{300,     // in_rows
@@ -51,10 +59,11 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
         0}},     // padding_cols
       DT_FLOAT,  // tensor datatype
       0,         // device_id
-  };
-  EXPECT_TRUE(conv_params_small.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_TRUE(
+      conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 
-  ConvParameters conv_params_large = {
+  ConvParametersPeer conv_params_large = {{
       1,         // batch
       128,       // in_depths
       {{300,     // in_rows
@@ -70,8 +79,9 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
         0}},     // padding_cols
       DT_FLOAT,  // tensor datatype
       0,         // device_id
-  };
-  EXPECT_FALSE(conv_params_large.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_FALSE(
+      conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 }
 
 #endif  // GOOGLE_CUDA
@@ -401,7 +411,7 @@ class ConvOpTest : public OpsTestBase {
     // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
     // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
     // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
-    // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
+    // (1*7)+(4*8)+(7*0)+(2*11)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
     // This means we should end up with this matrix:
     // |  105  |  150  |  183  |   95  |
     // |  235  |  312  |  357  |  178  |
diff --git a/tensorflow/core/kernels/critical_section.cc b/tensorflow/core/kernels/critical_section.cc
deleted file mode 100644
index 30a9abf4ee78cdb336e4c25c217239daf89bae11..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/critical_section.cc
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#define EIGEN_USE_THREADS
-
-#include <deque>
-#include <utility>
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-
-class CriticalSection : public ResourceBase {
- public:
-  explicit CriticalSection() : is_locked_(false) {}
-  ~CriticalSection() override {
-    // Wait for all closures to finish running.
-    mutex_lock lock(mu_);
-    while (!closures_.empty()) {
-      queue_empty_cv_.wait(lock);
-    }
-  }
-
- private:
-  friend class ExecuteInCriticalSectionOp;
-
-  void Acquire(std::function<void()> closure) {
-    std::function<void()> next;
-    {
-      mutex_lock ml(mu_);
-      if (is_locked_) {
-        closures_.push_back(std::move(closure));
-      } else {
-        // This branch is the common case.  Avoid the queue.
-        is_locked_ = true;
-        next = std::move(closure);
-      }
-    }
-    if (next) {
-      next();
-    }
-  }
-
-  void Release() {
-    std::function<void()> next;
-    {
-      mutex_lock ml(mu_);
-      CHECK(is_locked_);
-      if (!closures_.empty()) {
-        // if queue is not empty, start the next entry off the queue.
-        std::swap(next, closures_.front());
-        closures_.pop_front();
-      } else {
-        is_locked_ = false;
-        queue_empty_cv_.notify_all();
-      }
-    }
-    if (next) {
-      next();
-    }
-  }
-
-  string DebugString() override {
-    tf_shared_lock ml(mu_);
-    return strings::StrCat("CriticalSection(locked: ", is_locked_,
-                           " queue_size: ", closures_.size(), ")");
-  }
-
- private:
-  mutex mu_;
-  std::deque<std::function<void()>> closures_ GUARDED_BY(mu_);
-  bool is_locked_ GUARDED_BY(mu_);
-  condition_variable queue_empty_cv_ GUARDED_BY(mu_);
-};
-
-class ExecuteInCriticalSectionOp : public AsyncOpKernel {
- public:
-  explicit ExecuteInCriticalSectionOp(OpKernelConstruction* c)
-      : AsyncOpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("f", &func_));
-  }
-
- public:
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    CriticalSection* critical_section = nullptr;
-    OP_REQUIRES_OK_ASYNC(c,
-                         LookupOrCreateResource<CriticalSection>(
-                             c, HandleFromInput(c, 0), &critical_section,
-                             [this, c](CriticalSection** ptr) {
-                               *ptr = new CriticalSection;
-                               return Status::OK();
-                             }),
-                         done);
-    // No need to Unref critical_section; the Closure below will take
-    // care of the Unref associated with this execution.
-
-    auto* execution = new Closure{std::move(done), c, critical_section, &func_};
-    execution->Start();
-  }
-
- private:
-  class Closure {
-   public:
-    AsyncOpKernel::DoneCallback done_;
-    OpKernelContext* ctx_;
-    CriticalSection* cs_;
-    FunctionLibraryRuntime::Handle handle_;
-    FunctionLibraryRuntime::Options opts_;
-    std::vector<Tensor> arguments_t_;
-    std::vector<Tensor> output_t_;
-    NameAttrList* func_;
-
-    explicit Closure(AsyncOpKernel::DoneCallback done, OpKernelContext* ctx,
-                     CriticalSection* critical_section, NameAttrList* func)
-        : done_(std::move(done)),
-          ctx_(ctx),
-          cs_(critical_section),
-          handle_(-1),
-          func_(func) {}
-
-    ~Closure();
-
-    void Start() {
-      // Perform ExecuteFunction isnide a separate thread to avoid
-      // having lightweight Functions be inlined in this thread.
-      // That inlining would in turn inline DoneAndDelete inside the
-      // same thread.  Since DoneAndDelete can call the next
-      // ExecuteFunction in the CriticalSection, this can cause a
-      // stack overflow.
-      cs_->Acquire(
-          [this]() { (*ctx_->runner())([this]() { ExecuteFunction(); }); });
-    }
-
-   private:
-    void ExecuteFunction();
-    void DoneAndDelete(const Status& status);
-  };
-
-  NameAttrList func_;
-};
-
-void ExecuteInCriticalSectionOp::Closure::ExecuteFunction() {
-  // Arguments to a Function are in the order:
-  //   concat(<formal arguments>, <captured arguments>)
-  OpInputList arguments;
-  Status s = ctx_->input_list("arguments", &arguments);
-  if (!s.ok()) {
-    DoneAndDelete(s);
-    return;
-  }
-
-  arguments_t_.reserve(arguments.size());
-  for (const Tensor& t : arguments) {
-    arguments_t_.push_back(t);
-  }
-
-  auto* function_library = ctx_->function_library();
-  s = function_library->Instantiate(func_->name(), AttrSlice(&func_->attr()),
-                                    &handle_);
-  if (!s.ok()) {
-    DoneAndDelete(s);
-    return;
-  }
-
-  opts_.step_id = CapturedFunction::generate_step_id();
-  auto* step_container =
-      new ScopedStepContainer(opts_.step_id, [this](const string& name) {
-        ctx_->resource_manager()->Cleanup(name).IgnoreError();
-      });
-  opts_.cancellation_manager = ctx_->cancellation_manager();
-  opts_.step_container = step_container;
-  opts_.runner = ctx_->runner();
-
-  function_library->Run(opts_, handle_, arguments_t_, &output_t_,
-                        [this](const Status& s) { DoneAndDelete(s); });
-}
-
-void ExecuteInCriticalSectionOp::Closure::DoneAndDelete(const Status& status) {
-  cs_->Release();
-
-  if (!status.ok()) {
-    ctx_->SetStatus(status);
-  } else {
-    OpOutputList output;
-    const Status s = ctx_->output_list("outputs", &output);
-    if (!s.ok()) {
-      ctx_->SetStatus(s);
-    } else if (output_t_.size() != output.size()) {
-      ctx_->SetStatus(errors::Internal(
-          "Could not set all outputs.  Expected output size is ", output.size(),
-          " but function set ", output_t_.size(), " output values."));
-    } else {
-      for (int i = 0; i < output_t_.size(); ++i) {
-        output.set(i, output_t_[i]);
-      }
-    }
-  }
-
-  delete opts_.step_container;
-  opts_.step_container = nullptr;
-  done_();
-  cs_->Unref();
-  delete this;
-}
-
-ExecuteInCriticalSectionOp::Closure::~Closure() {
-  CHECK(!opts_.step_container)
-      << "Initialized closure destroyed without calling Done";
-}
-
-REGISTER_KERNEL_BUILDER(Name("ExecuteInCriticalSection").Device(DEVICE_CPU),
-                        ExecuteInCriticalSectionOp);
-
-REGISTER_KERNEL_BUILDER(Name("CriticalSectionOp").Device(DEVICE_CPU),
-                        ResourceHandleOp<CriticalSection>);
-
-// TODO(ebrevdo): Re-enable once the cross-device function execution works.
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("ExecuteInCriticalSection")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("critical_section"),
-                        ExecuteInCriticalSectionOp);
-REGISTER_KERNEL_BUILDER(
-    Name("CriticalSectionOp").Device(DEVICE_GPU).HostMemory("resource"),
-    ResourceHandleOp<CriticalSection>);
-#endif  // GOOGLE_CUDA
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 45cc2fbbb8be8401c54f8024920f309178365747..54ef9c6fb48cb9f50b99a4d8065b909f35cf4805 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -39,17 +39,16 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
+namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 using Callback = std::function<void()>;
 
-namespace {
-
 static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
                                            const Tensor& box_index,
                                            int* num_boxes) {
@@ -753,8 +752,7 @@ inline void RunIfBoxIndexIsValid<GPUDevice>(
       context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
                              &isvalid_host_tensor, alloc_attr),
       done);
-  perftools::gputools::DeviceMemoryBase wrapped(isvalid_dev.data(),
-                                                sizeof(bool));
+  se::DeviceMemoryBase wrapped(isvalid_dev.data(), sizeof(bool));
   const bool status =
       stream
           ->ThenMemcpy(
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index a35e1b0788dbc60d6609faf1dfb97d5e7e4f515b..709082e79903d041771cc19235d4dee76fce66b3 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -242,7 +243,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input image must be 4-D"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "input image must be 4-D"))
       << s;
 }
 
@@ -255,7 +256,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
+      str_util::StrContains(s.ToString(), "box_index has incompatible shape"))
       << s;
 }
 
@@ -267,8 +268,8 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   AddInputFromArray<int32>(TensorShape({2}), {3, 3});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_index has values outside [0, batch_size)"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "box_index has values outside [0, batch_size)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 96bdb6a241b1d88c7b14f22fc618ea9c95fb7642..8cadeac68d7907443d860e67b26bdedaf3634e5e 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
@@ -213,20 +214,29 @@ class CTCGreedyDecoderOp : public OpKernel {
 
     // Perform best path decoding
     std::vector<std::vector<std::vector<int> > > sequences(batch_size);
-    for (int b = 0; b < batch_size; ++b) {
-      sequences[b].resize(1);
-      auto& sequence = sequences[b][0];
-      int prev_indices = -1;
-      for (int t = 0; t < seq_len_t(b); ++t) {
-        int max_class_indices;
-        log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
-        if (max_class_indices != blank_index &&
-            !(merge_repeated_ && max_class_indices == prev_indices)) {
-          sequence.push_back(max_class_indices);
+    auto decode = [&](const int64 begin, const int64 end) {
+      for (int b = begin; b < end; ++b) {
+        sequences[b].resize(1);
+        auto &sequence = sequences[b][0];
+        int prev_indices = -1;
+        for (int t = 0; t < seq_len_t(b); ++t) {
+          int max_class_indices;
+          log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
+          if (max_class_indices != blank_index &&
+              !(merge_repeated_ && max_class_indices == prev_indices)) {
+            sequence.push_back(max_class_indices);
+          }
+          prev_indices = max_class_indices;
         }
-        prev_indices = max_class_indices;
       }
-    }
+    };
+
+    const int64 kCostPerUnit = 50 * max_time * num_classes;
+    const int64 total = batch_size;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *ctx->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, total,
+          kCostPerUnit, decode);
 
     OP_REQUIRES_OK(
         ctx, decode_helper_.StoreAllDecodedSequences(
diff --git a/tensorflow/core/kernels/cuda_device_array.h b/tensorflow/core/kernels/cuda_device_array.h
index e7a5db0683eba48295dca96c6c7599126e436536..74dc298c7a5dc5395af015d2a56df60de2fc2db2 100644
--- a/tensorflow/core/kernels/cuda_device_array.h
+++ b/tensorflow/core/kernels/cuda_device_array.h
@@ -80,7 +80,7 @@ class CudaDeviceArrayOnHost {
     TensorReference tensor_ref(out_of_line_values_on_host_);
     TF_RETURN_IF_ERROR(context_->allocate_temp(
         DT_INT8, TensorShape{total_bytes_}, &out_of_line_values_on_gpu_));
-    perftools::gputools::DeviceMemoryBase output_values_base{
+    se::DeviceMemoryBase output_values_base{
         out_of_line_values_on_gpu_.flat<int8>().data(),
         static_cast<uint64>(total_bytes_)};
     stream->ThenMemcpy(&output_values_base,
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 6cec032f9492020fa55c468fcd6a5b09effb0e81..a857bd3ce4c3cd0a0c92f12ad9f75b52771d9345 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -35,8 +35,6 @@
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
-
 // The CUDA cublas_api.h API contains const-correctness errors. Instead of
 // casting away constness on our data, we instead reinterpret the CuBLAS
 // functions as what they were clearly meant to be, and thus we can call
@@ -80,10 +78,12 @@ using matinv_Z = cublasStatus_t(cublasContext*, int, const double2* const*, int,
 namespace tensorflow {
 namespace {
 
+using se::cuda::ScopedActivateExecutorContext;
+
 inline bool CopyHostToDevice(OpKernelContext* context, void* dst,
                              const void* src, uint64 bytes) {
   auto stream = context->op_device_context()->stream();
-  perftools::gputools::DeviceMemoryBase wrapped_dst(dst);
+  se::DeviceMemoryBase wrapped_dst(dst);
   return stream->ThenMemcpy(&wrapped_dst, src, bytes).ok();
 }
 
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index ecfa23750c213361bc2d0be8df0091ed6ea26dd9..b2e8ee23a9c7a2737dffa584ce43025a943952c4 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -398,7 +398,7 @@ class DeviceLapackInfo : public ScratchSpace<int> {
     CHECK(success != nullptr);
     HostLapackInfo copy(context(), size(), debug_info());
     auto stream = context()->op_device_context()->stream();
-    perftools::gputools::DeviceMemoryBase wrapped_src(
+    se::DeviceMemoryBase wrapped_src(
         static_cast<void*>(const_cast<int*>(this->data())));
     *success =
         stream->ThenMemcpy(copy.mutable_data(), wrapped_src, this->bytes())
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
index 5939ecdf62bc32d90d53e80c1cc0fd57bca7b931..d2b9c9edaabb8b9595a30a141003a75e51c58576 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -31,12 +31,13 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 
 template <typename T>
-void DnnPooling3dOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
-    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
-    const std::array<int64, 3>& padding, TensorFormat data_format,
-    const Tensor& tensor_in, Tensor* output) {
+void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
+                                se::dnn::PoolingMode pooling_mode,
+                                const std::array<int64, 3>& window,
+                                const std::array<int64, 3>& stride,
+                                const std::array<int64, 3>& padding,
+                                TensorFormat data_format,
+                                const Tensor& tensor_in, Tensor* output) {
   const auto in_shape = tensor_in.shape();
   const auto out_shape = output->shape();
 
@@ -67,18 +68,18 @@ void DnnPooling3dOp<T>::Compute(
     transformed_output = *output;
   }
 
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
+  se::dnn::PoolingDescriptor pooling_desc(3);
   pooling_desc.set_pooling_mode(pooling_mode);
-  perftools::gputools::dnn::BatchDescriptor input_desc(3);
+  se::dnn::BatchDescriptor input_desc(3);
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc(3);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc(3);
   output_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
   for (size_t i = 0; i < window.size(); ++i) {
-    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
+    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
     pooling_desc.set_window(dim_i, window[i]);
     pooling_desc.set_stride(dim_i, stride[i]);
     pooling_desc.set_padding(dim_i, padding[i]);
@@ -115,14 +116,13 @@ void DnnPooling3dOp<T>::Compute(
 
 template <typename T>
 void DnnPooling3dGradOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
     const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
     const std::array<int64, 3>& padding,
     const std::array<int64, 3>& output_size, TensorFormat data_format,
     const Tensor& out_backprop, const TensorShape& tensor_in_shape,
     const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) {
-  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
          "specified";
@@ -186,21 +186,21 @@ void DnnPooling3dGradOp<T>::Compute(
         transformed_output_backprop.tensor<T, 5>());
   }
 
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
+  se::dnn::PoolingDescriptor pooling_desc(3);
   pooling_desc.set_pooling_mode(pooling_mode);
 
-  perftools::gputools::dnn::BatchDescriptor orig_output_desc(3);
+  se::dnn::BatchDescriptor orig_output_desc(3);
   orig_output_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor orig_input_desc(3);
+  se::dnn::BatchDescriptor orig_input_desc(3);
   orig_input_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   for (size_t i = 0; i < window.size(); ++i) {
-    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
+    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
     pooling_desc.set_window(dim_i, window[i]);
     pooling_desc.set_stride(dim_i, stride[i]);
     pooling_desc.set_padding(dim_i, padding[i]);
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.h b/tensorflow/core/kernels/cudnn_pooling_gpu.h
index ff4de758451eb420786e3af46b4ec3ecab4cdcf3..280d697fc2a61e8f1e34b702b99121f92214a011 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.h
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.h
@@ -38,7 +38,7 @@ template <typename T>
 class DnnPooling3dOp {
  public:
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::array<int64, 3>& size,
                       const std::array<int64, 3>& stride,
                       const std::array<int64, 3>& padding,
@@ -52,7 +52,7 @@ template <typename T>
 class DnnPooling3dGradOp {
  public:
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::array<int64, 3>& window,
                       const std::array<int64, 3>& stride,
                       const std::array<int64, 3>& padding,
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25560b7c28273fafb5af1aa98062a4d0312cc574
--- /dev/null
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -0,0 +1,1681 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#define EIGEN_USE_THREADS
+
+#include <stddef.h>
+#include <atomic>
+#include <cmath>
+#include <functional>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/use_cudnn.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/stream_executor_util.h"
+#endif  // GOOGLE_CUDA
+
+/*
+ * This module implements ops that fuse a multi-layer multi-step RNN/LSTM model
+ * using the underlying Cudnn library.
+ *
+ * Cudnn RNN library exposes an opaque parameter buffer with unknown layout and
+ * format. And it is very likely that if saved, they cannot be used across
+ * different GPUs. So users need to first query the size of the opaque
+ * parameter buffer, and convert it to and from its canonical forms. But each
+ * actual training step is carried out with the parameter buffer.
+ *
+ * Similar to many other ops, the forward op has two flavors: training and
+ * inference. When training is specified, additional data in reserve_space will
+ * be produced for the backward pass. So there is a performance penalty.
+ *
+ * In addition to the actual data and reserve_space, Cudnn also needs more
+ * memory as temporary workspace. The memory management to and from
+ * stream-executor is done through ScratchAllocator. In general,
+ * stream-executor is responsible for creating the memory of proper size. And
+ * TensorFlow is responsible for making sure the memory is alive long enough
+ * and recycles afterwards.
+ *
+ */
+namespace tensorflow {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+#if GOOGLE_CUDA
+
+using GPUDevice = Eigen::GpuDevice;
+using se::Stream;
+using se::StreamExecutor;
+using se::dnn::RnnDescriptor;
+
+template <typename Device, typename T, typename Index>
+class CudnnRNNParamsSizeOp;
+
+template <typename Device, typename T>
+class CudnnRNNParamsToCanonical;
+
+template <typename Device, typename T>
+class CudnnRNNCanonicalToParams;
+
+template <typename Device, typename T>
+class CudnnRNNForwardOp;
+
+template <typename Device, typename T>
+class CudnnRNNBackwardOp;
+
+template <typename Device, typename T>
+class CudnnRNNForwardOpV2;
+
+template <typename Device, typename T>
+class CudnnRNNBackwardOpV2;
+
+enum class TFRNNInputMode {
+  kRNNLinearInput = 0,
+  kRNNSkipInput = 1,
+  kAutoSelect = 9999999
+};
+
+namespace {
+using se::DeviceMemory;
+using se::DeviceMemoryBase;
+using se::ScratchAllocator;
+using se::dnn::AlgorithmConfig;
+using se::dnn::AlgorithmDesc;
+using se::dnn::ProfileResult;
+using se::dnn::RnnDirectionMode;
+using se::dnn::RnnInputMode;
+using se::dnn::RnnMode;
+using se::dnn::RnnSequenceTensorDescriptor;
+using se::dnn::RnnStateTensorDescriptor;
+using se::dnn::ToDataType;
+using se::port::StatusOr;
+
+uint64 HashList(const std::vector<int>& list) {
+  if (list.empty()) {
+    return 0;
+  }
+  uint64 hash_code = list[0];
+  for (int i = 1; i < list.size(); i++) {
+    hash_code = Hash64Combine(hash_code, list[i]);
+  }
+  return hash_code;
+}
+
+// Encapsulate all the shape information that is used in both forward and
+// backward rnn operations.
+class CudnnRnnParameters {
+ public:
+  CudnnRnnParameters(int num_layers, int input_size, int num_units,
+                     int seq_length, int batch_size, int dir_count,
+                     bool has_dropout, bool is_training, RnnMode rnn_mode,
+                     TFRNNInputMode rnn_input_mode, DataType dtype)
+      : num_layers_(num_layers),
+        input_size_(input_size),
+        num_units_(num_units),
+        seq_length_(seq_length),
+        batch_size_(batch_size),
+        dir_count_(dir_count),
+        has_dropout_(has_dropout),
+        is_training_(is_training),
+        rnn_mode_(rnn_mode),
+        rnn_input_mode_(rnn_input_mode),
+        dtype_(dtype) {
+    hash_code_ = HashList(
+        {num_layers, input_size, num_units, seq_length, batch_size, dir_count,
+         static_cast<int>(has_dropout), static_cast<int>(is_training),
+         static_cast<int>(rnn_mode), static_cast<int>(rnn_input_mode), dtype});
+  }
+
+  bool operator==(const CudnnRnnParameters& other) const {
+    return this->get_data_as_tuple() == other.get_data_as_tuple();
+  }
+
+  bool operator!=(const CudnnRnnParameters& other) const {
+    return !(*this == other);
+  }
+  uint64 hash() const { return hash_code_; }
+
+  string ToString() const {
+    std::vector<string> fields = {
+        std::to_string(num_layers_),
+        std::to_string(input_size_),
+        std::to_string(num_units_),
+        std::to_string(seq_length_),
+        std::to_string(batch_size_),
+        std::to_string(dir_count_),
+        std::to_string(has_dropout_),
+        std::to_string(is_training_),
+        std::to_string(static_cast<int>(rnn_mode_)),
+        std::to_string(static_cast<int>(rnn_input_mode_)),
+        std::to_string(static_cast<int>(dtype_))};
+    return str_util::Join(fields, ", ");
+  }
+
+ private:
+  using ParameterDataType = std::tuple<int, int, int, int, int, int, bool, bool,
+                                       RnnMode, TFRNNInputMode, DataType>;
+
+  ParameterDataType get_data_as_tuple() const {
+    return std::make_tuple(num_layers_, input_size_, num_units_, seq_length_,
+                           batch_size_, dir_count_, has_dropout_, is_training_,
+                           rnn_mode_, rnn_input_mode_, dtype_);
+  }
+
+  const int num_layers_;
+  const int input_size_;
+  const int num_units_;
+  const int seq_length_;
+  const int batch_size_;
+  const int dir_count_;
+  const bool has_dropout_;
+  const bool is_training_;
+  const RnnMode rnn_mode_;
+  const TFRNNInputMode rnn_input_mode_;
+  const DataType dtype_;
+  uint64 hash_code_;
+};
+
+struct RnnAutoTuneGroup {
+  static string name() { return "Rnn"; }
+};
+
+using AutoTuneRnnConfigMap =
+    AutoTuneSingleton<RnnAutoTuneGroup, CudnnRnnParameters, AlgorithmConfig>;
+
+Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
+  if (str == "rnn_relu") {
+    *rnn_mode = RnnMode::kRnnRelu;
+    return Status::OK();
+  } else if (str == "rnn_tanh") {
+    *rnn_mode = RnnMode::kRnnTanh;
+    return Status::OK();
+  } else if (str == "lstm") {
+    *rnn_mode = RnnMode::kRnnLstm;
+    return Status::OK();
+  } else if (str == "gru") {
+    *rnn_mode = RnnMode::kRnnGru;
+    return Status::OK();
+  }
+  return errors::InvalidArgument("Invalid RNN mode: ", str);
+}
+
+Status ParseTFRNNInputMode(const string& str, TFRNNInputMode* rnn_input_mode) {
+  if (str == "linear_input") {
+    *rnn_input_mode = TFRNNInputMode::kRNNLinearInput;
+    return Status::OK();
+  } else if (str == "skip_input") {
+    *rnn_input_mode = TFRNNInputMode::kRNNSkipInput;
+    return Status::OK();
+  } else if (str == "auto_select") {
+    *rnn_input_mode = TFRNNInputMode::kAutoSelect;
+    return Status::OK();
+  }
+  return errors::InvalidArgument("Invalid RNN input mode: ", str);
+}
+
+Status ParseRNNDirectionMode(const string& str,
+                             RnnDirectionMode* rnn_dir_mode) {
+  if (str == "unidirectional") {
+    *rnn_dir_mode = RnnDirectionMode::kRnnUnidirectional;
+    return Status::OK();
+  } else if (str == "bidirectional") {
+    *rnn_dir_mode = RnnDirectionMode::kRnnBidirectional;
+    return Status::OK();
+  }
+  return errors::InvalidArgument("Invalid RNN direction mode: ", str);
+}
+
+Status ToRNNInputMode(TFRNNInputMode tf_input_mode, int num_units,
+                      int input_size, RnnInputMode* input_mode) {
+  switch (tf_input_mode) {
+    case TFRNNInputMode::kRNNLinearInput:
+      *input_mode = RnnInputMode::kRnnLinearSkip;
+      break;
+    case TFRNNInputMode::kRNNSkipInput:
+      *input_mode = RnnInputMode::kRnnSkipInput;
+      break;
+    case TFRNNInputMode::kAutoSelect:
+      *input_mode = (input_size == num_units) ? RnnInputMode::kRnnSkipInput
+                                              : RnnInputMode::kRnnLinearSkip;
+      break;
+    default:
+      return errors::InvalidArgument("Invalid TF input mode: ",
+                                     static_cast<int>(tf_input_mode));
+  }
+  return Status::OK();
+}
+
+// TODO(zhengxq): Merge those into stream_executor_util.h.
+template <typename T>
+const DeviceMemory<T> AsDeviceMemory(const Tensor* tensor) {
+  return DeviceMemory<T>::MakeFromByteSize(
+      const_cast<T*>(tensor->template flat<T>().data()),
+      tensor->template flat<T>().size() * sizeof(T));
+}
+
+template <typename T>
+DeviceMemory<T> AsDeviceMemory(Tensor* tensor) {
+  return DeviceMemory<T>::MakeFromByteSize(
+      tensor->template flat<T>().data(),
+      tensor->template flat<T>().size() * sizeof(T));
+}
+
+template <typename U, typename T>
+DeviceMemory<U> CastDeviceMemory(Tensor* tensor) {
+  return DeviceMemory<U>::MakeFromByteSize(
+      tensor->template flat<T>().data(),
+      tensor->template flat<T>().size() * sizeof(T));
+}
+
+DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
+                                   int64 offset, int64 size) {
+  const void* base_ptr = device_memory.opaque();
+  void* offset_ptr =
+      const_cast<char*>(reinterpret_cast<const char*>(base_ptr) + offset);
+  CHECK(offset + size <= device_memory.size())
+      << "The slice is not within the region of DeviceMemory.";
+  return DeviceMemoryBase(offset_ptr, size);
+}
+
+inline Status FromExecutorStatus(const se::port::Status& s) {
+  return s.ok() ? Status::OK()
+                : Status(static_cast<error::Code>(static_cast<int>(s.code())),
+                         s.error_message());
+}
+
+template <typename T>
+inline Status FromExecutorStatus(const se::port::StatusOr<T>& s) {
+  return FromExecutorStatus(s.status());
+}
+
+inline se::port::Status ToExecutorStatus(const Status& s) {
+  return s.ok() ? se::port::Status::OK()
+                : se::port::Status(static_cast<se::port::error::Code>(
+                                       static_cast<int>(s.code())),
+                                   s.error_message());
+}
+
+template <typename>
+struct ToTFDataType;
+
+template <>
+struct ToTFDataType<Eigen::half> : std::integral_constant<DataType, DT_HALF> {};
+
+template <>
+struct ToTFDataType<float> : std::integral_constant<DataType, DT_FLOAT> {};
+
+template <>
+struct ToTFDataType<double> : std::integral_constant<DataType, DT_DOUBLE> {};
+
+template <>
+struct ToTFDataType<uint8> : std::integral_constant<DataType, DT_UINT8> {};
+
+// A helper to allocate temporary scratch memory for Cudnn RNN models. It
+// takes the ownership of the underlying memory. The expectation is that the
+// memory should be alive for the span of the Cudnn RNN itself.
+template <typename T>
+class CudnnRnnAllocatorInTemp : public ScratchAllocator {
+ public:
+  ~CudnnRnnAllocatorInTemp() = default;
+
+  explicit CudnnRnnAllocatorInTemp(OpKernelContext* context)
+      : context_(context) {}
+  int64 GetMemoryLimitInBytes(Stream* stream) override {
+    return std::numeric_limits<int64>::max();
+  }
+
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(Stream* stream,
+                                              int64 byte_size) override {
+    Tensor temporary_memory;
+    const DataType tf_data_type = ToTFDataType<T>::value;
+    int64 allocate_count =
+        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
+    Status allocation_status(context_->allocate_temp(
+        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
+    if (!allocation_status.ok()) {
+      return ToExecutorStatus(allocation_status);
+    }
+    // Hold the reference of the allocated tensors until the end of the
+    // allocator.
+    allocated_tensors_.push_back(temporary_memory);
+    total_byte_size_ += byte_size;
+    return DeviceMemory<uint8>::MakeFromByteSize(
+        temporary_memory.template flat<T>().data(),
+        temporary_memory.template flat<T>().size() * sizeof(T));
+  }
+
+  int64 TotalByteSize() const { return total_byte_size_; }
+
+  Tensor get_allocated_tensor(int index) const {
+    return allocated_tensors_[index];
+  }
+
+ private:
+  int64 total_byte_size_ = 0;
+  OpKernelContext* context_;  // not owned
+  std::vector<Tensor> allocated_tensors_;
+};
+
+// A helper to allocate memory for Cudnn RNN models as a kernel output. It is
+// used by forward pass kernel to feed the output to the backward pass.
+// The memory is expected to live long enough after the backward pass is
+// finished.
+template <typename T>
+class CudnnRnnAllocatorInOutput : public ScratchAllocator {
+ public:
+  ~CudnnRnnAllocatorInOutput() override {}
+  CudnnRnnAllocatorInOutput(OpKernelContext* context, int output_index)
+      : context_(context), output_index_(output_index) {}
+  int64 GetMemoryLimitInBytes(Stream* stream) override {
+    return std::numeric_limits<int64>::max();
+  }
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(Stream* stream,
+                                              int64 byte_size) override {
+    CHECK(total_byte_size_ == 0)
+        << "Reserve space allocator can only be called once";
+    int64 allocate_count =
+        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
+
+    Tensor* temporary_memory = nullptr;
+    Status allocation_status(context_->allocate_output(
+        output_index_, TensorShape({allocate_count}), &temporary_memory));
+    if (!allocation_status.ok()) {
+      return ToExecutorStatus(allocation_status);
+    }
+    total_byte_size_ += byte_size;
+    auto memory_uint8 = DeviceMemory<uint8>::MakeFromByteSize(
+        temporary_memory->template flat<T>().data(),
+        temporary_memory->template flat<T>().size() * sizeof(T));
+    return StatusOr<DeviceMemory<uint8>>(memory_uint8);
+  }
+  int64 TotalByteSize() { return total_byte_size_; }
+
+ private:
+  int64 total_byte_size_ = 0;
+  OpKernelContext* context_;  // not owned
+  int output_index_;
+};
+
+// A helper to allocate persistent memory for Cudnn RNN models, which is
+// expected to live between kernel invocations.
+// This class is not thread-safe.
+class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
+ public:
+  explicit CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
+      : context_(context) {}
+
+  ~CudnnRNNPersistentSpaceAllocator() override {}
+
+  int64 GetMemoryLimitInBytes(Stream* stream) override {
+    return std::numeric_limits<int64>::max();
+  }
+
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(Stream* stream,
+                                              int64 byte_size) override {
+    if (total_byte_size_ != 0) {
+      return Status(error::FAILED_PRECONDITION,
+                    "Persistent space allocator can only be called once");
+    }
+
+    Status allocation_status = context_->allocate_persistent(
+        DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
+    if (!allocation_status.ok()) {
+      return ToExecutorStatus(allocation_status);
+    }
+    total_byte_size_ += byte_size;
+    return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
+  }
+  int64 TotalByteSize() { return total_byte_size_; }
+
+ private:
+  int64 total_byte_size_ = 0;
+  PersistentTensor handle_;
+  OpKernelContext* context_;  // not owned
+};
+
+struct CudnnModelTypes {
+  RnnMode rnn_mode;
+  TFRNNInputMode rnn_input_mode;
+  RnnDirectionMode rnn_direction_mode;
+  bool HasInputC() const {
+    // For Cudnn 5.0, only LSTM has input-c. All other models use only
+    // input-h.
+    return rnn_mode == RnnMode::kRnnLstm;
+  }
+
+  string DebugString() const {
+    return strings::Printf(
+        "[rnn_mode, rnn_input_mode, rnn_direction_mode]: %d, %d, %d ",
+        static_cast<int>(rnn_mode), static_cast<int>(rnn_input_mode),
+        static_cast<int>(rnn_direction_mode));
+  }
+};
+
+// A helper class that collects the shapes to describe a RNN model.
+struct CudnnRnnModelShapes {
+  int num_layers;
+  int input_size;
+  int num_units;
+  int dir_count;
+  int seq_length;
+  int batch_size;
+  TensorShape input_shape;
+  TensorShape output_shape;
+  TensorShape hidden_state_shape;
+  // At present only fields related to cached RnnDescriptor are concerned.
+  bool IsCompatibleWith(const CudnnRnnModelShapes& rhs) const {
+    return num_layers == rhs.num_layers && input_size == rhs.input_size &&
+           num_units == rhs.num_units && dir_count == rhs.dir_count;
+  }
+  string DebugString() const {
+    return strings::Printf(
+        "[num_layers, input_size, num_units, dir_count, seq_length, "
+        "batch_size]: [%d, %d, %d, %d, %d, %d] ",
+        num_layers, input_size, num_units, dir_count, seq_length, batch_size);
+  }
+};
+
+// Utility class for using CudnnRnnConfig and AlgorithmDesc pair a hash table
+// key.
+struct CudnnRnnConfigHasher {
+  uint64 operator()(
+      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& to_hash) const {
+    auto& shapes = to_hash.first;
+    auto& algo_desc = to_hash.second;
+
+    uint64 hash =
+        HashList({shapes.num_layers, shapes.input_size, shapes.num_units,
+                  shapes.dir_count, shapes.batch_size});
+    hash = Hash64Combine(hash, algo_desc.hash());
+    return hash;
+  }
+};
+
+// Utility class for using CudnnRnnModelShapes and AlgorithmDesc pair as a hash
+// table key.
+struct CudnnRnnConfigComparator {
+  bool operator()(
+      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& lhs,
+      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& rhs) const {
+    return lhs.first.IsCompatibleWith(rhs.first) && lhs.second == rhs.second;
+  }
+};
+
+// Pointers to RNN scratch space for a specific set of shape parameters (used as
+// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp).
+struct RnnScratchSpace {
+  std::unique_ptr<RnnDescriptor> rnn_desc;
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator;
+};
+
+// Extract and checks the forward input tensors, parameters, and shapes from the
+// OpKernelContext.
+Status ExtractForwardInput(OpKernelContext* context,
+                           const CudnnModelTypes& model_types,
+                           const Tensor** input, const Tensor** input_h,
+                           const Tensor** input_c, const Tensor** params,
+                           CudnnRnnModelShapes* model_shapes) {
+  TF_RETURN_IF_ERROR(context->input("input", input));
+  TF_RETURN_IF_ERROR(context->input("input_h", input_h));
+  if (model_types.HasInputC()) {
+    TF_RETURN_IF_ERROR(context->input("input_c", input_c));
+  }
+  TF_RETURN_IF_ERROR(context->input("params", params));
+
+  if ((*input)->dims() != 3) {
+    return errors::InvalidArgument("RNN input must be a 3-D vector.");
+  }
+  model_shapes->seq_length = (*input)->dim_size(0);
+  model_shapes->batch_size = (*input)->dim_size(1);
+  model_shapes->input_size = (*input)->dim_size(2);
+  model_shapes->input_shape = (*input)->shape();
+  model_shapes->dir_count =
+      (model_types.rnn_direction_mode == RnnDirectionMode::kRnnBidirectional)
+          ? 2
+          : 1;
+
+  if ((*input_h)->dims() != 3) {
+    return errors::InvalidArgument("RNN input must be a 3-D vector.");
+  }
+  model_shapes->num_layers = (*input_h)->dim_size(0) / model_shapes->dir_count;
+  model_shapes->num_units = (*input_h)->dim_size(2);
+
+  model_shapes->hidden_state_shape =
+      TensorShape({model_shapes->dir_count * model_shapes->num_layers,
+                   model_shapes->batch_size, model_shapes->num_units});
+  if ((*input_h)->shape() != model_shapes->hidden_state_shape) {
+    return errors::InvalidArgument(
+        "Invalid input_h shape: ", (*input_h)->shape().DebugString(), " ",
+        model_shapes->hidden_state_shape.DebugString());
+  }
+  if (model_types.HasInputC()) {
+    if ((*input_h)->shape() != (*input_c)->shape()) {
+      return errors::InvalidArgument(
+          "input_h and input_c must have the same shape: ",
+          (*input_h)->shape().DebugString(), " ",
+          (*input_c)->shape().DebugString());
+    }
+  }
+  model_shapes->output_shape =
+      TensorShape({model_shapes->seq_length, model_shapes->batch_size,
+                   model_shapes->dir_count * model_shapes->num_units});
+  return Status::OK();
+}
+
+template <typename T>
+Status CreateForwardAndBackwardIODescriptors(
+    OpKernelContext* context, const CudnnRnnModelShapes& model_shapes,
+    std::unique_ptr<RnnSequenceTensorDescriptor>* input_desc,
+    std::unique_ptr<RnnStateTensorDescriptor>* state_desc,
+    std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc) {
+  StreamExecutor* executor = context->op_device_context()->stream()->parent();
+  se::dnn::DataType data_type = ToDataType<T>::value;
+
+  const TensorShape& input_shape = model_shapes.input_shape;
+  const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+  const TensorShape& output_shape = model_shapes.output_shape;
+
+  DCHECK_EQ(input_shape.dims(), 3);
+  auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
+      input_shape.dim_size(0), input_shape.dim_size(1), input_shape.dim_size(2),
+      data_type);
+  TF_RETURN_IF_ERROR(input_desc_s.status());
+  *input_desc = input_desc_s.ConsumeValueOrDie();
+
+  DCHECK_EQ(hidden_state_shape.dims(), 3);
+  auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
+      hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
+      hidden_state_shape.dim_size(2), data_type);
+  TF_RETURN_IF_ERROR(hidden_state_desc_s.status());
+  *state_desc = hidden_state_desc_s.ConsumeValueOrDie();
+
+  DCHECK_EQ(output_shape.dims(), 3);
+  auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
+      output_shape.dim_size(0), output_shape.dim_size(1),
+      output_shape.dim_size(2), data_type);
+  TF_RETURN_IF_ERROR(output_desc_s.status());
+  *output_desc = output_desc_s.ConsumeValueOrDie();
+  return Status::OK();
+}
+
+template <typename T>
+Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
+                 const CudnnModelTypes& model_types,
+                 const CudnnRnnModelShapes& model_shapes,
+                 /* forward inputs */
+                 const Tensor* input, const Tensor* input_h,
+                 const Tensor* input_c, const Tensor* params,
+                 const bool is_training,
+                 /* forward outputs, outputs of the function */
+                 Tensor* output, Tensor* output_h, Tensor* output_c,
+                 ScratchAllocator* reserve_space_allocator,
+                 ScratchAllocator* workspace_allocator,
+                 ProfileResult* output_profile_result) {
+  std::unique_ptr<RnnSequenceTensorDescriptor> input_desc;
+  std::unique_ptr<RnnStateTensorDescriptor> state_desc;
+  std::unique_ptr<RnnSequenceTensorDescriptor> output_desc;
+
+  TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
+      context, model_shapes, &input_desc, &state_desc, &output_desc));
+
+  auto input_data = AsDeviceMemory<T>(input);
+  auto input_h_data = AsDeviceMemory<T>(input_h);
+  DeviceMemory<T> input_c_data;
+  if (model_types.HasInputC()) {
+    input_c_data = AsDeviceMemory<T>(input_c);
+  }
+  auto params_data = AsDeviceMemory<T>(params);
+  auto output_data = AsDeviceMemory<T>(output);
+  auto output_h_data = AsDeviceMemory<T>(output_h);
+  DeviceMemory<T> output_c_data;
+  if (model_types.HasInputC()) {
+    output_c_data = AsDeviceMemory<T>(output_c);
+  }
+
+  Stream* stream = context->op_device_context()->stream();
+  bool launch_success =
+      stream
+          ->ThenRnnForward(rnn_desc, *input_desc, input_data, *state_desc,
+                           input_h_data, *state_desc, input_c_data, params_data,
+                           *output_desc, &output_data, *state_desc,
+                           &output_h_data, *state_desc, &output_c_data,
+                           is_training, reserve_space_allocator,
+                           workspace_allocator, output_profile_result)
+          .ok();
+  return launch_success
+             ? Status::OK()
+             : errors::Internal(
+                   "Failed to call ThenRnnForward with model config: ",
+                   model_types.DebugString(), ", ", model_shapes.DebugString());
+}
+
+template <typename T>
+Status DoBackward(
+    OpKernelContext* context, const RnnDescriptor& rnn_desc,
+    const CudnnModelTypes& model_types, const CudnnRnnModelShapes& model_shapes,
+    /* forward inputs */
+    const Tensor* input, const Tensor* input_h, const Tensor* input_c,
+    const Tensor* params,
+    /* forward outptus */
+    const Tensor* output, const Tensor* output_h, const Tensor* output_c,
+    /* backprop inputs */
+    const Tensor* output_backprop, const Tensor* output_h_backprop,
+    const Tensor* output_c_backprop, const Tensor* reserve_space,
+    /* backprop outputs, output of the function */
+    Tensor* input_backprop, Tensor* input_h_backprop, Tensor* input_c_backprop,
+    Tensor* params_backprop, ScratchAllocator* workspace_allocator,
+    ProfileResult* output_profile_result) {
+  std::unique_ptr<RnnSequenceTensorDescriptor> input_desc;
+  std::unique_ptr<RnnStateTensorDescriptor> state_desc;
+  std::unique_ptr<RnnSequenceTensorDescriptor> output_desc;
+
+  TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
+      context, model_shapes, &input_desc, &state_desc, &output_desc));
+
+  auto input_data = AsDeviceMemory<T>(input);
+  auto input_h_data = AsDeviceMemory<T>(input_h);
+  DeviceMemory<T> input_c_data;
+  if (model_types.HasInputC()) {
+    input_c_data = AsDeviceMemory<T>(input_c);
+  }
+  auto params_data = AsDeviceMemory<T>(params);
+  auto output_data = AsDeviceMemory<T>(output);
+  auto output_h_data = AsDeviceMemory<T>(output_h);
+  DeviceMemory<T> output_c_data;
+  if (model_types.HasInputC()) {
+    output_c_data = AsDeviceMemory<T>(output_c);
+  }
+  auto output_backprop_data = AsDeviceMemory<T>(output_backprop);
+  auto output_h_backprop_data = AsDeviceMemory<T>(output_h_backprop);
+  DeviceMemory<T> output_c_backprop_data;
+  if (model_types.HasInputC()) {
+    output_c_backprop_data = AsDeviceMemory<T>(output_c_backprop);
+  }
+  auto input_backprop_data = AsDeviceMemory<T>(input_backprop);
+  auto input_h_backprop_data = AsDeviceMemory<T>(input_h_backprop);
+  DeviceMemory<T> input_c_backprop_data;
+  if (model_types.HasInputC()) {
+    input_c_backprop_data = AsDeviceMemory<T>(input_c_backprop);
+  }
+  auto params_backprop_data = AsDeviceMemory<T>(params_backprop);
+  auto reserve_space_uint8 =
+      CastDeviceMemory<uint8, T>(const_cast<Tensor*>(reserve_space));
+
+  // Creates a memory callback for the workspace. The memory lives to the end
+  // of this kernel calls.
+  Stream* stream = context->op_device_context()->stream();
+  bool launch_success =
+      stream
+          ->ThenRnnBackward(rnn_desc, *input_desc, input_data, *state_desc,
+                            input_h_data, *state_desc, input_c_data,
+                            params_data, *output_desc, output_data, *state_desc,
+                            output_h_data, *state_desc, output_c_data,
+                            output_backprop_data, output_h_backprop_data,
+                            output_c_backprop_data, &input_backprop_data,
+                            &input_h_backprop_data, &input_c_backprop_data,
+                            &params_backprop_data, &reserve_space_uint8,
+                            workspace_allocator, output_profile_result)
+          .ok();
+  return launch_success
+             ? Status::OK()
+             : errors::Internal(
+                   "Failed to call ThenRnnBackward with model config: ",
+                   model_types.DebugString(), ", ", model_shapes.DebugString());
+}
+
+template <typename T>
+void RestoreParams(const OpInputList params_input,
+                   const std::vector<RnnDescriptor::ParamsRegion>& params,
+                   DeviceMemoryBase* data_dst, Stream* stream) {
+  int num_params = params.size();
+  CHECK(params_input.size() == num_params)
+      << "Number of params mismatch. Expected " << params_input.size()
+      << ", got " << num_params;
+  for (int i = 0; i < params.size(); i++) {
+    int64 size_in_bytes = params[i].size;
+    int64 size = size_in_bytes / sizeof(T);
+    CHECK(size == params_input[i].NumElements())
+        << "Params size mismatch. Expected " << size << ", got "
+        << params_input[i].NumElements();
+    auto data_src_ptr = StreamExecutorUtil::AsDeviceMemory<T>(params_input[i]);
+    DeviceMemoryBase data_dst_ptr =
+        SliceDeviceMemory(*data_dst, params[i].offset, size_in_bytes);
+    stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
+  }
+}
+
+}  // namespace
+
+// Note: all following kernels depend on a RnnDescriptor instance, which
+// according to Cudnn official doc should be kept around and reused across all
+// Cudnn kernels in the same model.
+// In Tensorflow, we don't pass the reference across different OpKernels,
+// rather, recreate it separately in each OpKernel, which does no cause issue:
+// CudnnDropoutDescriptor keeps a reference to a memory for
+// random number generator state. During recreation, this state is lost.
+// However, only forward-pass Cudnn APIs make use of the state.
+
+// A common base class for RNN kernels. It extracts common attributes and
+// shape validations.
+class CudnnRNNKernelCommon : public OpKernel {
+ protected:
+  explicit CudnnRNNKernelCommon(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
+    string str;
+    OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
+    OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
+    OP_REQUIRES_OK(context, context->GetAttr("input_mode", &str));
+    OP_REQUIRES_OK(context,
+                   ParseTFRNNInputMode(str, &model_types_.rnn_input_mode));
+    OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
+    OP_REQUIRES_OK(
+        context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
+    // Reset CudnnRnnDescriptor and related random number generate states in
+    // every Compute() call.
+    OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
+                                               false, &reset_rnd_gen_state_));
+  }
+
+  bool HasInputC() const { return model_types_.HasInputC(); }
+  RnnMode rnn_mode() const { return model_types_.rnn_mode; }
+  TFRNNInputMode rnn_input_mode() const { return model_types_.rnn_input_mode; }
+  RnnDirectionMode rnn_direction_mode() const {
+    return model_types_.rnn_direction_mode;
+  }
+  const CudnnModelTypes& model_types() const { return model_types_; }
+  float dropout() const { return dropout_; }
+  uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
+  bool ResetRndGenState() { return reset_rnd_gen_state_; }
+
+  template <typename T>
+  Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
+                                   std::unique_ptr<RnnDescriptor>* rnn_desc) {
+    const Tensor* num_layers_t = nullptr;
+    TF_RETURN_IF_ERROR(context->input("num_layers", &num_layers_t));
+    if (!TensorShapeUtils::IsScalar(num_layers_t->shape())) {
+      return errors::InvalidArgument("num_layers is not a scalar");
+    }
+    int num_layers = num_layers_t->scalar<int>()();
+    const Tensor* num_units_t = nullptr;
+    TF_RETURN_IF_ERROR(context->input("num_units", &num_units_t));
+    if (!TensorShapeUtils::IsScalar(num_units_t->shape())) {
+      return errors::InvalidArgument("num_units is not a scalar");
+    }
+    int num_units = num_units_t->scalar<int>()();
+    const Tensor* input_size_t = nullptr;
+    TF_RETURN_IF_ERROR(context->input("input_size", &input_size_t));
+    if (!TensorShapeUtils::IsScalar(input_size_t->shape())) {
+      return errors::InvalidArgument("input_size is not a scalar");
+    }
+    int input_size = input_size_t->scalar<int>()();
+
+    RnnInputMode input_mode;
+    TF_RETURN_IF_ERROR(
+        ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
+
+    Stream* stream = context->op_device_context()->stream();
+    // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
+    // random number generator, therefore set state_allocator to nullptr.
+    const AlgorithmConfig algo_config;
+    auto rnn_desc_s = stream->parent()->createRnnDescriptor(
+        num_layers, num_units, input_size, /*batch_size=*/0, input_mode,
+        rnn_direction_mode(), rnn_mode(), ToDataType<T>::value, algo_config,
+        dropout(), seed(), /* state_allocator=*/nullptr);
+    if (!rnn_desc_s.ok()) {
+      return FromExecutorStatus(rnn_desc_s);
+    }
+    *rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status CreateRnnDescriptor(OpKernelContext* context,
+                             const CudnnRnnModelShapes& model_shapes,
+                             const RnnInputMode& input_mode,
+                             const AlgorithmConfig& algo_config,
+                             ScratchAllocator* dropout_state_allocator,
+                             std::unique_ptr<RnnDescriptor>* rnn_desc) {
+    StreamExecutor* executor = context->op_device_context()->stream()->parent();
+    se::dnn::DataType data_type = ToDataType<T>::value;
+    auto rnn_desc_s = executor->createRnnDescriptor(
+        model_shapes.num_layers, model_shapes.num_units,
+        model_shapes.input_size, model_shapes.batch_size, input_mode,
+        rnn_direction_mode(), rnn_mode(), data_type, algo_config, dropout(),
+        seed(), dropout_state_allocator);
+    TF_RETURN_IF_ERROR(rnn_desc_s.status());
+
+    *rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    return Status::OK();
+  }
+
+  using RnnStateCache =
+      gtl::FlatMap<std::pair<CudnnRnnModelShapes, AlgorithmDesc>,
+                   RnnScratchSpace, CudnnRnnConfigHasher,
+                   CudnnRnnConfigComparator>;
+  // Returns a raw rnn descriptor pointer. The cache owns the rnn descriptor and
+  // should outlive the returned pointer.
+  template <typename T>
+  Status GetCachedRnnDescriptor(OpKernelContext* context,
+                                const CudnnRnnModelShapes& model_shapes,
+                                const RnnInputMode& input_mode,
+                                const AlgorithmConfig& algo_config,
+                                RnnStateCache* cache,
+                                RnnDescriptor** rnn_desc) {
+    auto key = std::make_pair(model_shapes, algo_config.algorithm());
+    RnnScratchSpace& rnn_state = (*cache)[key];
+    if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
+      CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
+          new CudnnRNNPersistentSpaceAllocator(context);
+      rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+      Status status =
+          CreateRnnDescriptor<T>(context, model_shapes, input_mode, algo_config,
+                                 dropout_state_allocator, &rnn_state.rnn_desc);
+      TF_RETURN_IF_ERROR(status);
+    }
+    *rnn_desc = rnn_state.rnn_desc.get();
+    return Status::OK();
+  }
+
+ private:
+  int seed_;
+  int seed2_;
+  float dropout_;
+  bool reset_rnd_gen_state_;
+
+  CudnnModelTypes model_types_;
+};
+
+// A class that returns the size of the opaque parameter buffer. The user should
+// use that to create the actual parameter buffer for training. However, it
+// should not be used for saving and restoring.
+template <typename T, typename Index>
+class CudnnRNNParamsSizeOp<GPUDevice, T, Index> : public CudnnRNNKernelCommon {
+ public:
+  explicit CudnnRNNParamsSizeOp(OpKernelConstruction* context)
+      : CudnnRNNKernelCommon(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    std::unique_ptr<RnnDescriptor> rnn_desc;
+    OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
+    int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes();
+    CHECK(params_size_in_bytes % sizeof(T) == 0)
+        << "params_size_in_bytes must be multiple of element size";
+    int64 params_size = params_size_in_bytes / sizeof(T);
+
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, {1}, &output_t));
+    *output_t->template flat<Index>().data() = params_size;
+  }
+};
+
+#define REGISTER_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsSize")       \
+                              .Device(DEVICE_GPU)          \
+                              .HostMemory("num_layers")    \
+                              .HostMemory("num_units")     \
+                              .HostMemory("input_size")    \
+                              .HostMemory("params_size")   \
+                              .TypeConstraint<T>("T")      \
+                              .TypeConstraint<int32>("S"), \
+                          CudnnRNNParamsSizeOp<GPUDevice, T, int32>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// Convert weight and bias params from a platform-specific layout to the
+// canonical form.
+template <typename T>
+class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
+ public:
+  explicit CudnnRNNParamsToCanonical(OpKernelConstruction* context)
+      : CudnnRNNKernelCommon(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_params", &num_params_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(3);
+    auto input_ptr = StreamExecutorUtil::AsDeviceMemory<T>(input);
+    Stream* stream = context->op_device_context()->stream();
+
+    std::unique_ptr<RnnDescriptor> rnn_desc;
+    OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
+    int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes();
+    CHECK(params_size_in_bytes % sizeof(T) == 0)
+        << "params_size_in_bytes must be multiple of element size";
+
+    const Tensor* num_units_t = nullptr;
+    OP_REQUIRES_OK(context, context->input("num_units", &num_units_t));
+    CHECK(TensorShapeUtils::IsScalar(num_units_t->shape()))
+        << "num_units is not a scalar";
+    int num_units = num_units_t->scalar<int>()();
+
+    const Tensor* input_size_t = nullptr;
+    OP_REQUIRES_OK(context, context->input("input_size", &input_size_t));
+    CHECK(TensorShapeUtils::IsScalar(input_size_t->shape()))
+        << "input_size is not a scalar";
+    int input_size = input_size_t->scalar<int>()();
+
+    const Tensor* num_layers_t = nullptr;
+    OP_REQUIRES_OK(context, context->input("num_layers", &num_layers_t));
+    CHECK(TensorShapeUtils::IsScalar(num_layers_t->shape()))
+        << "num_layers is not a scalar";
+    int num_layers = num_layers_t->scalar<int>()();
+    int num_dirs = 1;
+    if (rnn_direction_mode() == RnnDirectionMode::kRnnBidirectional) {
+      num_dirs = 2;
+    }
+    const int num_params_per_layer = num_params_ / num_layers / num_dirs;
+    // Number of params applied on inputs. The rest are applied on recurrent
+    // hidden states.
+    const int num_params_input_state = num_params_per_layer / 2;
+    CHECK(num_params_ % (num_layers * num_dirs) == 0)
+        << "Number of params is not a multiple of num_layers * num_dirs.";
+    CHECK(num_params_per_layer % 2 == 0)
+        << "Number of params per layer is not a even number.";
+
+    CHECK(num_params_ == rnn_desc->ParamsWeightRegions().size())
+        << "Number of params mismatch. Expected " << num_params_ << ", got "
+        << rnn_desc->ParamsWeightRegions().size();
+    for (int i = 0; i < rnn_desc->ParamsWeightRegions().size(); i++) {
+      int64 size_in_bytes = rnn_desc->ParamsWeightRegions()[i].size;
+      int64 size = size_in_bytes / sizeof(T);
+      const int layer_idx = i / num_params_per_layer;
+      const int index_within_layer = i % num_params_per_layer;
+      int width = 0, height = num_units;
+      // In CuDNN layout, each layer has num_params_per_layer params, with the
+      // first half a.k.a num_params_input_state params applied on the inputs,
+      // and the second half on the recurrent hidden states.
+      bool apply_on_input_state = index_within_layer < num_params_input_state;
+      if (rnn_direction_mode() == RnnDirectionMode::kRnnUnidirectional) {
+        if (layer_idx == 0 && apply_on_input_state) {
+          width = input_size;
+        } else {
+          width = num_units;
+        }
+      } else {
+        if (apply_on_input_state) {
+          if (layer_idx <= 1) {
+            // First fwd or bak layer.
+            width = input_size;
+          } else {
+            // Following layers, cell inputs are concatenated outputs of
+            // its prior layer.
+            width = 2 * num_units;
+          }
+        } else {
+          width = num_units;
+        }
+      }
+      CHECK(size == width * height) << "Params size mismatch. Expected "
+                                    << width * height << ", got " << size;
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  i, TensorShape({height, width}), &output));
+      DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
+          input_ptr, rnn_desc->ParamsWeightRegions()[i].offset, size_in_bytes);
+      auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
+      stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
+    }
+
+    OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
+                errors::InvalidArgument("Number of params mismatch. Expected ",
+                                        num_params_, ", got ",
+                                        rnn_desc->ParamsBiasRegions().size()));
+    for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
+      int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
+      int64 size = size_in_bytes / sizeof(T);
+      OP_REQUIRES(context, size == num_units,
+                  errors::InvalidArgument("Params size mismatch. Expected ",
+                                          num_units, ", got ", size));
+
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(num_params_ + i,
+                                              TensorShape({size}), &output));
+      DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
+          input_ptr, rnn_desc->ParamsBiasRegions()[i].offset, size_in_bytes);
+      auto data_dst_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
+      stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes);
+    }
+  }
+
+ private:
+  int num_params_;
+};
+
+#define REGISTER_GPU(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsToCanonical") \
+                              .Device(DEVICE_GPU)           \
+                              .HostMemory("num_layers")     \
+                              .HostMemory("num_units")      \
+                              .HostMemory("input_size")     \
+                              .TypeConstraint<T>("T"),      \
+                          CudnnRNNParamsToCanonical<GPUDevice, T>);
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// Convert weight and bias params from the canonical form to a
+// platform-specific layout.
+template <typename T>
+class CudnnRNNCanonicalToParams<GPUDevice, T> : public CudnnRNNKernelCommon {
+ public:
+  explicit CudnnRNNCanonicalToParams(OpKernelConstruction* context)
+      : CudnnRNNKernelCommon(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    std::unique_ptr<RnnDescriptor> rnn_desc;
+    OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
+    int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes();
+    CHECK(params_size_in_bytes % sizeof(T) == 0)
+        << "params_size_in_bytes must be multiple of element size";
+    Tensor* output = nullptr;
+    int params_size = params_size_in_bytes / sizeof(T);
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, {params_size}, &output));
+    auto output_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
+    Stream* stream = context->op_device_context()->stream();
+
+    OpInputList weights;
+    OP_REQUIRES_OK(context, context->input_list("weights", &weights));
+    RestoreParams<T>(weights, rnn_desc->ParamsWeightRegions(), &output_ptr,
+                     stream);
+
+    OpInputList biases;
+    OP_REQUIRES_OK(context, context->input_list("biases", &biases));
+    RestoreParams<T>(biases, rnn_desc->ParamsBiasRegions(), &output_ptr,
+                     stream);
+  }
+};
+
+#define REGISTER_GPU(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNCanonicalToParams") \
+                              .Device(DEVICE_GPU)           \
+                              .HostMemory("num_layers")     \
+                              .HostMemory("num_units")      \
+                              .HostMemory("input_size")     \
+                              .TypeConstraint<T>("T"),      \
+                          CudnnRNNCanonicalToParams<GPUDevice, T>);
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// Run the forward operation of the RNN model.
+template <typename T>
+class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
+ public:
+  explicit CudnnRNNForwardOp(OpKernelConstruction* context)
+      : CudnnRNNKernelCommon(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+
+    // Read debug env variables.
+    is_debug_mode_ = DebugCudnnRnn();
+    debug_cudnn_rnn_algo_ = DebugCudnnRnnAlgo();
+    debug_use_tensor_ops_ = DebugCudnnRnnUseTensorOps();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    AlgorithmConfig algo_config;
+    ComputeAndReturnAlgorithm(context, &algo_config);
+  }
+
+ protected:
+  virtual void ComputeAndReturnAlgorithm(OpKernelContext* context,
+                                         AlgorithmConfig* output_algo_config) {
+    CHECK_NE(output_algo_config, nullptr);
+
+    const Tensor* input = nullptr;
+    const Tensor* input_h = nullptr;
+    const Tensor* input_c = nullptr;
+    const Tensor* params = nullptr;
+    CudnnRnnModelShapes model_shapes;
+    OP_REQUIRES_OK(context,
+                   ExtractForwardInput(context, model_types(), &input, &input_h,
+                                       &input_c, &params, &model_shapes));
+    RnnInputMode input_mode;
+    OP_REQUIRES_OK(context,
+                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
+                                  model_shapes.input_size, &input_mode));
+
+    Tensor* output = nullptr;
+    Tensor* output_h = nullptr;
+    Tensor* output_c = nullptr;
+    OP_REQUIRES_OK(context, AllocateOutputs(context, model_shapes, &output,
+                                            &output_h, &output_c));
+
+    // Creates a memory callback for the reserve_space. The memory lives in the
+    // output of this kernel. And it will be fed into the backward pass when
+    // needed.
+    CudnnRnnAllocatorInOutput<T> reserve_space_allocator(context, 3);
+    // Creates a memory callback for the workspace. The memory lives to the end
+    // of this kernel calls.
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
+
+    if (is_debug_mode_) {
+      AlgorithmDesc algo_desc(debug_cudnn_rnn_algo_, debug_use_tensor_ops_);
+      output_algo_config->set_algorithm(algo_desc);
+    } else {
+      OP_REQUIRES_OK(context,
+                     MaybeAutoTune(context, model_shapes, input_mode, input,
+                                   input_h, input_c, params, output, output_h,
+                                   output_c, output_algo_config));
+    }
+
+    Status launch_status;
+    {
+      mutex_lock l(mu_);
+      RnnDescriptor* rnn_desc_ptr = nullptr;
+      OP_REQUIRES_OK(
+          context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
+                                             *output_algo_config,
+                                             &rnn_state_cache_, &rnn_desc_ptr));
+      launch_status = DoForward<T>(
+          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+          input_c, params, is_training_, output, output_h, output_c,
+          &reserve_space_allocator, &workspace_allocator,
+          /*output_profile_result=*/nullptr);
+    }
+    OP_REQUIRES_OK(context, launch_status);
+  }
+
+ protected:
+  virtual Status MaybeAutoTune(OpKernelContext* context,
+                               const CudnnRnnModelShapes& model_shapes,
+                               const RnnInputMode& input_mode,
+                               const Tensor* input, const Tensor* input_h,
+                               const Tensor* input_c, const Tensor* params,
+                               Tensor* output, Tensor* output_h,
+                               Tensor* output_c,
+                               AlgorithmConfig* best_algo_config) {
+    CHECK_NE(best_algo_config, nullptr);
+    *best_algo_config = AlgorithmConfig();
+    return Status::OK();
+  }
+
+  bool is_training() const { return is_training_; }
+  bool is_debug_mode_;
+  bool debug_use_tensor_ops_;
+  int64 debug_cudnn_rnn_algo_;
+
+ private:
+  Status AllocateOutputs(OpKernelContext* context,
+                         const CudnnRnnModelShapes& model_shapes,
+                         Tensor** output, Tensor** output_h,
+                         Tensor** output_c) {
+    const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+    const TensorShape& output_shape = model_shapes.output_shape;
+
+    TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, output));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(1, hidden_state_shape, output_h));
+    if (HasInputC()) {
+      TF_RETURN_IF_ERROR(
+          context->allocate_output(2, hidden_state_shape, output_c));
+    } else {
+      // Only LSTM uses input_c and output_c. So for all other models, we only
+      // need to create dummy outputs.
+      TF_RETURN_IF_ERROR(context->allocate_output(2, {}, output_c));
+    }
+    if (!is_training_) {
+      Tensor* dummy_reserve_space = nullptr;
+      TF_RETURN_IF_ERROR(context->allocate_output(3, {}, &dummy_reserve_space));
+    }
+    return Status::OK();
+  }
+
+  mutex mu_;
+  bool is_training_;
+  RnnStateCache rnn_state_cache_ GUARDED_BY(mu_);
+};
+
+#define REGISTER_GPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("CudnnRNN").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      CudnnRNNForwardOp<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+template <typename T>
+class CudnnRNNForwardOpV2<GPUDevice, T>
+    : public CudnnRNNForwardOp<GPUDevice, T> {
+ private:
+  using CudnnRNNForwardOp<GPUDevice, T>::is_training;
+  using CudnnRNNKernelCommon::CreateRnnDescriptor;
+  using CudnnRNNKernelCommon::dropout;
+  using CudnnRNNKernelCommon::HasInputC;
+  using CudnnRNNKernelCommon::model_types;
+
+ public:
+  explicit CudnnRNNForwardOpV2(OpKernelConstruction* context)
+      : CudnnRNNForwardOp<GPUDevice, T>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    AlgorithmConfig best_algo_config;
+    CudnnRNNForwardOp<GPUDevice, T>::ComputeAndReturnAlgorithm(
+        context, &best_algo_config);
+    if (!context->status().ok()) {
+      return;
+    }
+
+    Tensor* output_host_reserved = nullptr;
+    // output_host_reserved stores opaque info used for backprop when running
+    // in training mode. At present, it includes a serialization of the best
+    // AlgorithmDesc picked during rnn forward pass autotune.
+    // int8 algorithm_id
+    // int8 use_tensor_op
+    // If autotune is not enabled, the algorithm_id is
+    // stream_executor::dnn::kDefaultAlgorithm and use_tensor_op is false. If
+    // running in inference mode, the output_host_reserved is currently not
+    // populated.
+    if (is_training()) {
+      OP_REQUIRES_OK(context, context->allocate_output(4, TensorShape({2}),
+                                                       &output_host_reserved));
+      auto output_host_reserved_int8 = output_host_reserved->vec<int8>();
+      output_host_reserved_int8(0) = best_algo_config.algorithm().algo_id();
+      output_host_reserved_int8(1) =
+          best_algo_config.algorithm().tensor_ops_enabled();
+    } else {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(4, {}, &output_host_reserved));
+    }
+  }
+
+ protected:
+  Status MaybeAutoTune(OpKernelContext* context,
+                       const CudnnRnnModelShapes& model_shapes,
+                       const RnnInputMode& input_mode, const Tensor* input,
+                       const Tensor* input_h, const Tensor* input_c,
+                       const Tensor* params, Tensor* output, Tensor* output_h,
+                       Tensor* output_c,
+                       AlgorithmConfig* algo_config) override {
+    CHECK_NE(algo_config, nullptr);
+    if (!CudnnRnnUseAutotune() || this->is_debug_mode_) {
+      *algo_config = AlgorithmConfig();
+      return Status::OK();
+    }
+
+    std::vector<AlgorithmDesc> algorithms;
+    auto* stream = context->op_device_context()->stream();
+    CHECK(stream->parent()->GetRnnAlgorithms(&algorithms));
+    if (algorithms.empty()) {
+      LOG(WARNING) << "No Rnn algorithm found";
+      return Status::OK();
+    }
+
+    const auto& modeltypes = model_types();
+    CudnnRnnParameters rnn_params(
+        model_shapes.num_layers, model_shapes.input_size,
+        model_shapes.num_units, model_shapes.seq_length,
+        model_shapes.batch_size, model_shapes.dir_count,
+        /*has_dropout=*/std::abs(dropout()) > 1e-8, is_training(),
+        modeltypes.rnn_mode, modeltypes.rnn_input_mode, input->dtype());
+
+    if (AutoTuneRnnConfigMap::GetInstance()->Find(rnn_params, algo_config)) {
+      return Status::OK();
+    }
+
+    // Create temp tensors when profiling backprop pass.
+    auto data_type = input->dtype();
+    Tensor output_backprop;
+    Tensor output_h_backprop;
+    Tensor output_c_backprop;
+    Tensor input_backprop;
+    Tensor input_h_backprop;
+    Tensor input_c_backprop;
+    Tensor params_backprop;
+    if (is_training()) {
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.output_shape, &output_backprop));
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.hidden_state_shape, &output_h_backprop));
+
+      TF_RETURN_IF_ERROR(
+          context->allocate_temp(data_type, params->shape(), &params_backprop));
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.input_shape, &input_backprop));
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.hidden_state_shape, &input_h_backprop));
+      if (HasInputC()) {
+        TF_RETURN_IF_ERROR(context->allocate_temp(
+            data_type, model_shapes.hidden_state_shape, &output_c_backprop));
+        TF_RETURN_IF_ERROR(context->allocate_temp(
+            data_type, model_shapes.hidden_state_shape, &input_c_backprop));
+      }
+    }
+    ProfileResult best_result;
+    for (auto& algo : algorithms) {
+      Status status;
+      ProfileResult final_profile_result;
+
+      ProfileResult fwd_profile_result;
+      ProfileResult bak_profile_result;
+
+      // RnnDescriptor is algorithm-dependent, thus not reusable.
+      std::unique_ptr<RnnDescriptor> rnn_desc;
+      // Use a temp scratch allocator for the random num generator.
+      CudnnRnnAllocatorInTemp<uint8> dropout_state_allocator(context);
+      if (!this->template CreateRnnDescriptor<T>(
+                   context, model_shapes, input_mode, AlgorithmConfig(algo),
+                   &dropout_state_allocator, &rnn_desc)
+               .ok()) {
+        continue;
+      }
+
+      // Again use temp scratch allocator during profiling.
+      CudnnRnnAllocatorInTemp<T> reserve_space_allocator(context);
+      CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
+      status = DoForward<T>(
+          context, *rnn_desc.get(), model_types(), model_shapes, input, input_h,
+          input_c, params, is_training(), output, output_h, output_c,
+          &reserve_space_allocator, &workspace_allocator, &fwd_profile_result);
+      if (!status.ok()) {
+        continue;
+      }
+
+      if (is_training()) {
+        // Get reserve space from the forward pass.
+        Tensor reserve_space = reserve_space_allocator.get_allocated_tensor(0);
+        status = DoBackward<T>(
+            context, *rnn_desc.get(), model_types(), model_shapes, input,
+            input_h, input_c, params, output, output_h, output_c,
+            &output_backprop, &output_h_backprop, &output_c_backprop,
+            &reserve_space, &input_backprop, &input_h_backprop,
+            &input_c_backprop, &params_backprop, &workspace_allocator,
+            &bak_profile_result);
+        if (!status.ok()) {
+          continue;
+        }
+        final_profile_result.set_elapsed_time_in_ms(
+            fwd_profile_result.elapsed_time_in_ms() +
+            bak_profile_result.elapsed_time_in_ms());
+      } else {
+        final_profile_result = fwd_profile_result;
+      }
+
+      auto total_time = final_profile_result.elapsed_time_in_ms();
+      VLOG(1) << "Profile Cudnn RNN algo " << algo.algo_id()
+              << " run time: " << total_time << " ms";
+      if (total_time < best_result.elapsed_time_in_ms()) {
+        best_result.set_elapsed_time_in_ms(total_time);
+        best_result.set_algorithm(algo);
+      }
+    }
+
+    if (!best_result.is_valid()) {
+      return Status(error::Code::INTERNAL, "No algorithm worked!");
+    }
+    algo_config->set_algorithm(best_result.algorithm());
+    AutoTuneRnnConfigMap::GetInstance()->Insert(rnn_params, *algo_config);
+    return Status::OK();
+  }
+};
+
+#define REGISTER_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNV2")               \
+                              .Device(DEVICE_GPU)          \
+                              .HostMemory("host_reserved") \
+                              .TypeConstraint<T>("T"),     \
+                          CudnnRNNForwardOpV2<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// Run the backward operation of the RNN model.
+template <typename T>
+class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
+ public:
+  explicit CudnnRNNBackwardOp(OpKernelConstruction* context)
+      : CudnnRNNKernelCommon(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* input = nullptr;
+    const Tensor* input_h = nullptr;
+    const Tensor* input_c = nullptr;
+    const Tensor* params = nullptr;
+    CudnnRnnModelShapes model_shapes;
+    OP_REQUIRES_OK(context,
+                   ExtractForwardInput(context, model_types(), &input, &input_h,
+                                       &input_c, &params, &model_shapes));
+    RnnInputMode input_mode;
+    OP_REQUIRES_OK(context,
+                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
+                                  model_shapes.input_size, &input_mode));
+
+    const Tensor* output = nullptr;
+    const Tensor* output_h = nullptr;
+    const Tensor* output_c = nullptr;
+    const Tensor* output_backprop = nullptr;
+    const Tensor* output_h_backprop = nullptr;
+    const Tensor* output_c_backprop = nullptr;
+    const Tensor* reserve_space = nullptr;
+    OP_REQUIRES_OK(context,
+                   ExtractBackwardInputs(context, model_shapes, model_types(),
+                                         &output, &output_h, &output_c,
+                                         &output_backprop, &output_h_backprop,
+                                         &output_c_backprop, &reserve_space));
+
+    Tensor* input_backprop = nullptr;
+    Tensor* input_h_backprop = nullptr;
+    Tensor* input_c_backprop = nullptr;
+    Tensor* params_backprop = nullptr;
+    OP_REQUIRES_OK(context,
+                   AllocateOutputs(context, model_shapes, params->shape(),
+                                   &input_backprop, &input_h_backprop,
+                                   &input_c_backprop, &params_backprop));
+
+    // Creates a memory callback for the workspace. The memory lives to the end
+    // of this kernel calls.
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
+    AlgorithmConfig algo_config;
+    OP_REQUIRES_OK(context, GetAlgorithm(context, &algo_config));
+    Status launch_status;
+    {
+      mutex_lock l(mu_);
+      RnnDescriptor* rnn_desc_ptr = nullptr;
+      OP_REQUIRES_OK(
+          context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
+                                             algo_config, &rnn_state_cache_,
+                                             &rnn_desc_ptr));
+      launch_status = DoBackward<T>(
+          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+          input_c, params, output, output_h, output_c, output_backprop,
+          output_h_backprop, output_c_backprop, reserve_space, input_backprop,
+          input_h_backprop, input_c_backprop, params_backprop,
+          &workspace_allocator, /*output_profile_result=*/nullptr);
+    }
+    OP_REQUIRES_OK(context, launch_status);
+  }
+
+ protected:
+  virtual Status GetAlgorithm(OpKernelContext* context,
+                              AlgorithmConfig* algo_config) {
+    CHECK_NE(algo_config, nullptr);
+    *algo_config = AlgorithmConfig();
+    return Status::OK();
+  }
+
+ private:
+  mutex mu_;
+  RnnStateCache rnn_state_cache_ GUARDED_BY(mu_);
+
+  Status ExtractBackwardInputs(
+      OpKernelContext* context, const CudnnRnnModelShapes& model_shapes,
+      const CudnnModelTypes& model_types, const Tensor** output,
+      const Tensor** output_h, const Tensor** output_c,
+      const Tensor** output_backprop, const Tensor** output_h_backprop,
+      const Tensor** output_c_backprop, const Tensor** reserve_space) {
+    TF_RETURN_IF_ERROR(context->input("output", output));
+    TF_RETURN_IF_ERROR(context->input("output_backprop", output_backprop));
+    TF_RETURN_IF_ERROR(context->input("output_h", output_h));
+    TF_RETURN_IF_ERROR(context->input("output_h_backprop", output_h_backprop));
+    if (model_types.HasInputC()) {
+      TF_RETURN_IF_ERROR(context->input("output_c", output_c));
+      TF_RETURN_IF_ERROR(
+          context->input("output_c_backprop", output_c_backprop));
+    }
+    TF_RETURN_IF_ERROR(context->input("reserve_space", reserve_space));
+    const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+    const TensorShape& output_shape = model_shapes.output_shape;
+
+    if (output_shape != (*output)->shape()) {
+      return errors::InvalidArgument(
+          "Invalid output shape: ", (*output)->shape().DebugString(), " ",
+          output_shape.DebugString());
+    }
+    if (hidden_state_shape != (*output_h)->shape()) {
+      return errors::InvalidArgument(
+          "Invalid output_h shape: ", (*output_h)->shape().DebugString(), " ",
+          hidden_state_shape.DebugString());
+    }
+
+    if (output_shape != (*output_backprop)->shape()) {
+      return errors::InvalidArgument("Invalid output_backprop shape: ",
+                                     (*output_backprop)->shape().DebugString(),
+                                     " ", output_shape.DebugString());
+    }
+    if (hidden_state_shape != (*output_h_backprop)->shape()) {
+      return errors::InvalidArgument(
+          "Invalid output_h_backprop shape: ",
+          (*output_h_backprop)->shape().DebugString(), " ",
+          hidden_state_shape.DebugString());
+    }
+
+    if (model_types.HasInputC()) {
+      if (hidden_state_shape != (*output_c)->shape()) {
+        return errors::InvalidArgument(
+            "Invalid output_c shape: ", (*output_c)->shape().DebugString(), " ",
+            hidden_state_shape.DebugString());
+      }
+      if (hidden_state_shape != (*output_c_backprop)->shape()) {
+        return errors::InvalidArgument(
+            "Invalid output_c_backprop shape: ",
+            (*output_c_backprop)->shape().DebugString(), " ",
+            hidden_state_shape.DebugString());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status AllocateOutputs(OpKernelContext* context,
+                         const CudnnRnnModelShapes& model_shapes,
+                         const TensorShape& params_shape,
+                         Tensor** input_backprop, Tensor** input_h_backprop,
+                         Tensor** input_c_backprop, Tensor** params_backprop) {
+    const TensorShape& input_shape = model_shapes.input_shape;
+    const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(0, input_shape, input_backprop));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(1, hidden_state_shape, input_h_backprop));
+    if (HasInputC()) {
+      TF_RETURN_IF_ERROR(
+          context->allocate_output(2, hidden_state_shape, input_c_backprop));
+    } else {
+      // Only LSTM uses input_c and output_c. So for all other models, we only
+      // need to create dummy outputs.
+      TF_RETURN_IF_ERROR(context->allocate_output(2, {}, input_c_backprop));
+    }
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(3, params_shape, params_backprop));
+    return Status::OK();
+  }
+};
+
+#define REGISTER_GPU(T)                                                   \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("CudnnRNNBackprop").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      CudnnRNNBackwardOp<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+template <typename T>
+class CudnnRNNBackwardOpV2<GPUDevice, T>
+    : public CudnnRNNBackwardOp<GPUDevice, T> {
+ public:
+  explicit CudnnRNNBackwardOpV2(OpKernelConstruction* context)
+      : CudnnRNNBackwardOp<GPUDevice, T>(context) {}
+
+ protected:
+  Status GetAlgorithm(OpKernelContext* context,
+                      AlgorithmConfig* algo_config) override {
+    CHECK_NE(algo_config, nullptr);
+    const Tensor* host_reserved = nullptr;
+    TF_RETURN_IF_ERROR(context->input("host_reserved", &host_reserved));
+
+    auto host_reserved_int8 = host_reserved->vec<int8>();
+    const AlgorithmDesc algo_desc(host_reserved_int8(0), host_reserved_int8(1));
+    algo_config->set_algorithm(algo_desc);
+    return Status::OK();
+  }
+};
+
+#define REGISTER_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNBackpropV2")       \
+                              .Device(DEVICE_GPU)          \
+                              .HostMemory("host_reserved") \
+                              .TypeConstraint<T>("T"),     \
+                          CudnnRNNBackwardOpV2<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
+// TODO(zhengxq): Add the conversion of Cudnn RNN Params from and to
+// its canonical form.
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 1466f24202fea4200f752985d620f1fbea61d35a..1920c54e80759735686d8ac3e17feb4fb4310337 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -18,9 +18,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
           int64);
-#if !defined(IS_MOBILE_PLATFORM)
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
-#endif
 
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index bf32c8a54b34586e43d34cf8890ed37fe64b8c34..9e4ffe950c9a88d22a3bfc081adc4703fd9e6b65 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
-          int64);
-REGISTER5(BinaryOp, CPU, "AddV2", functor::add, float, Eigen::half, double,
-          int32, int64);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
+          int64, bfloat16);
+REGISTER6(BinaryOp, CPU, "AddV2", functor::add, float, Eigen::half, double,
+          int32, int64, bfloat16);
 
 #if GOOGLE_CUDA
 REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
new file mode 100644
index 0000000000000000000000000000000000000000..14d889e8e3b7c03554388cd51054ee52966329cd
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -0,0 +1,225 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Basic coefficient-wise tenary operations.
+// This is the case for example of the clip_by_value.
+//   Device: E.g., CPUDevice, GPUDevice.
+//   Functor: defined above. E.g., functor::clip.
+template <typename Device, typename T>
+class ClipOp : public OpKernel {
+ public:
+  explicit ClipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& in0 = ctx->input(0);
+    const Tensor& in1 = ctx->input(1);
+    const Tensor& in2 = ctx->input(2);
+
+    auto in0_flat = in0.flat<T>();
+    auto in1_flat = in1.flat<T>();
+    auto in2_flat = in2.flat<T>();
+    const Device& d = ctx->eigen_device<Device>();
+
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    auto out_flat = out->flat<T>();
+    if (in1.shape() == in2.shape()) {
+      if (in0.shape() == in1.shape()) {
+        functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                            out_flat);
+      } else {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                          out_flat);
+      }
+    } else {
+      if (in0.shape() == in1.shape()) {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                               out_flat);
+      } else {
+        OP_REQUIRES(ctx,
+                    (in0.shape() == in2.shape() &&
+                     TensorShapeUtils::IsScalar(in1.shape())),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                                out_flat);
+      }
+    }
+  }
+};
+
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipFunc {
+  UnaryClipFunc(const T& value_min, const T& value_max)
+      : value_min(value_min), value_max(value_max) {}
+  const T operator()(const T& value) const {
+    return std::max(std::min(value, value_max), value_min);
+  }
+  T value_min;
+  T value_max;
+};
+template <typename T>
+struct UnaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat = in0_flat.unaryExpr(UnaryClipFunc<T>(in1_flat(0), in2_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipFunc {
+  explicit BinaryRightClipFunc(const T& value_min) : value_min(value_min) {}
+  const T operator()(const T& value, const T& value_max) const {
+    return std::max(std::min(value, value_max), value_min);
+  }
+  T value_min;
+};
+template <typename T>
+struct BinaryRightClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in2_flat, BinaryRightClipFunc<T>(in1_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipFunc {
+  explicit BinaryLeftClipFunc(const T& value_max) : value_max(value_max) {}
+  const T operator()(const T& value, const T& value_min) const {
+    return std::max(std::min(value, value_max), value_min);
+  }
+  T value_max;
+};
+template <typename T>
+struct BinaryLeftClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in1_flat, BinaryLeftClipFunc<T>(in2_flat(0)));
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_CPU(T)                         \
+  template struct UnaryClipOp<CPUDevice, T>;       \
+  template struct BinaryRightClipOp<CPUDevice, T>; \
+  template struct BinaryLeftClipOp<CPUDevice, T>;  \
+  template struct TernaryClipOp<CPUDevice, T>;
+INSTANTIATE_CPU(Eigen::half);
+INSTANTIATE_CPU(float);
+INSTANTIATE_CPU(double);
+INSTANTIATE_CPU(int8);
+INSTANTIATE_CPU(int16);
+INSTANTIATE_CPU(int32);
+INSTANTIATE_CPU(int64);
+INSTANTIATE_CPU(uint8);
+INSTANTIATE_CPU(uint16);
+#undef INSTANTIATE_CPU
+}  // namespace functor
+
+#define REGISTER_CPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ClipOp<CPUDevice, type>);
+
+REGISTER_CPU_KERNEL(Eigen::half);
+REGISTER_CPU_KERNEL(float);
+REGISTER_CPU_KERNEL(double);
+REGISTER_CPU_KERNEL(int8);
+REGISTER_CPU_KERNEL(int16);
+REGISTER_CPU_KERNEL(int32);
+REGISTER_CPU_KERNEL(int64);
+REGISTER_CPU_KERNEL(uint8);
+REGISTER_CPU_KERNEL(uint16);
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      ClipOp<GPUDevice, type>);
+REGISTER_GPU_KERNEL(Eigen::half);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int64);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(uint16);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("ClipByValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("t")
+                            .HostMemory("clip_value_min")
+                            .HostMemory("clip_value_max")
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("T"),
+                        ClipOp<CPUDevice, int32>);
+
+#undef REGISTER_GPU_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_clip.h b/tensorflow/core/kernels/cwise_op_clip.h
new file mode 100644
index 0000000000000000000000000000000000000000..171b6932c2d7935e2cdec76aebe4643926df7580
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.h
@@ -0,0 +1,61 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename Device, typename T>
+struct UnaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename Device, typename T>
+struct BinaryRightClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename Device, typename T>
+struct BinaryLeftClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename Device, typename T>
+struct TernaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44dea7dee90822a332abbbd39b1d07a06a02b521
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+template <typename T>
+__global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
+                                      const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
+                                            const T *in1, const T *in2,
+                                            T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[i] < in0[i] ? in2[i] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
+                                           const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[i] ? in1[i] : value;
+  }
+}
+
+namespace functor {
+
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    UnaryClipCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+            out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryRightClipCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+            out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryLeftClipCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+            out_flat.data());
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_GPU(T)                         \
+  template struct UnaryClipOp<GPUDevice, T>;       \
+  template struct BinaryRightClipOp<GPUDevice, T>; \
+  template struct BinaryLeftClipOp<GPUDevice, T>;  \
+  template struct TernaryClipOp<GPUDevice, T>;
+INSTANTIATE_GPU(Eigen::half);
+INSTANTIATE_GPU(float);
+INSTANTIATE_GPU(double);
+INSTANTIATE_GPU(int8);
+INSTANTIATE_GPU(int16);
+INSTANTIATE_GPU(int32);
+INSTANTIATE_GPU(int64);
+INSTANTIATE_GPU(uint8);
+INSTANTIATE_GPU(uint16);
+#undef INSTANTIATE_GPU
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index c71c756e4461d4ed36628ea8a4f8a0922896302c..b12652f7fba4ea8a9bd4ec18b79469ad69e79902 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -16,14 +16,14 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Div", functor::div, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Div", functor::div, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 REGISTER5(BinaryOp, CPU, "Div", functor::safe_div, uint8, uint16, int16, int32,
           int64);
 REGISTER5(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, int16,
           int32, int64);
-REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
           uint16, int16, int64, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index fecbf859897bd1560da00f54756d4a1ffb7660d4..24da61fdf6ceed554d9ac4d68efdaa1f952c2835 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
-          int16, int32, int64);
+REGISTER6(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
+          int8, int16, int32, int64);
 REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
           Eigen::half, double);
 
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index aa180c247e7d01ef0f2898b4a50a71c3c3bc6941..707dc9e49ca1dd5b9872c2e5d3184e11eddd7a1f 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -16,7 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
+REGISTER4(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double,
+          bfloat16);
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 00cdecdbd184b84b6601eda76dd5dfded5aa1e1b..575968126fa82d585fcda9490da5cd69332366c6 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER8(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double,
-          int32, int64, uint8, int8, int16);
+REGISTER9(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double,
+          bfloat16, int32, int64, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER7(BinaryOp, GPU, "Less", functor::less, float, Eigen::half, double,
           int64, uint8, int8, int16);
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 11806c5fc774dc3a37abc733127e4b6660f27f9c..499200d0546ccf1d9119b63a9e552908de3d1ae1 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER8(BinaryOp, CPU, "LessEqual", functor::less_equal, float, Eigen::half,
-          double, int32, int64, uint8, int8, int16);
+REGISTER9(BinaryOp, CPU, "LessEqual", functor::less_equal, float, Eigen::half,
+          bfloat16, double, int32, int64, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER7(BinaryOp, GPU, "LessEqual", functor::less_equal, float, Eigen::half,
           double, int64, uint8, int8, int16);
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 98936e0f960f1f407c2187746ca80d3db0a93412..5d17c890cfec77cd3f50ee649adf4af6e20b5ed7 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index dff83df828f076a076a8f220d04974344d8ffafc..9bc37003879f077288dfc058996e9b0b4162d16e 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Minimum", functor::minimum, float, Eigen::half,
-          double, int32, int64);
+REGISTER6(BinaryOp, CPU, "Minimum", functor::minimum, float, Eigen::half,
+          bfloat16, double, int32, int64);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Minimum", functor::minimum, float, Eigen::half,
           double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index 0e8d2e37350dbbb942bd5ed6b16392b6288313fe..cff0407b83a4bafd27573325615322f92e594d46 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
-          int32);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32, bfloat16);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 497756133d05249141823481e6ef43b73a84660b..205070761f13cbfe6b509eea2d6b36c2f0f37f04 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
@@ -27,8 +27,8 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
 #endif  // TENSORFLOW_USE_SYCL
 
-REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
-          Eigen::half, double, complex64, complex128);
+REGISTER6(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
+          Eigen::half, bfloat16, double, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index 7fc2f6bf08b2c825f471123e1ab58bd060f6070a..84f695ddc29d7f8d3afc12ea81515e80a8a75255 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
-          int32, int64, complex64, complex128);
+REGISTER8(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
+          int32, int64, complex64, complex128, bfloat16);
 
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Square", functor::square, float, Eigen::half, double,
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index 025041946ac71f0e8f4724f9432d5e2901e348cc..eb27bddb78dfd8679b010f7f2cb67d2049a22a4b 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32,
-          int64, complex64, complex128);
+REGISTER8(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32,
+          int64, bfloat16, complex64, complex128);
 #if !defined(__ANDROID_TYPES_SLIM__)
 // Sub op for int8, uint8, int16, uint16
 REGISTER4(BinaryOp, CPU, "Sub", functor::sub, int8, uint8, int16, uint16);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 06918075a42648a3cf7135376d728fa466e7c469..a80905d1450cc38619bb27c2e27eda58b3cf169d 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -27,27 +27,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
-namespace numext {
-#if GOOGLE_CUDA
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
-    const std::complex<float>& x) {
-  auto com = ::expf(x.real());
-  auto res_real = com * ::cosf(x.imag());
-  auto res_imag = com * ::sinf(x.imag());
-  return std::complex<float>(res_real, res_imag);
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
-    const std::complex<double>& x) {
-  auto com = ::exp(x.real());
-  auto res_real = com * ::cos(x.imag());
-  auto res_imag = com * ::sin(x.imag());
-  return std::complex<double>(res_real, res_imag);
-}
-#endif
-}  // namespace numext
-
 namespace internal {
 
 template <typename T>
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index 8295fa939ee1aabf78a7d7b7f4677d851b407573..e32eccf547e07b71678abf0e75ac20973ecbf380 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/kernels/cwise_ops_sycl_common.h"
 #endif
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 39f497e71612fc08a085e410edae73669fc9993a..696d5840e8ce39c1bf210b54b9f28ae83cf232c7 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -231,14 +231,22 @@ BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
 
 Graph* BcastAdd(int rows, int cols, int dim) {
   Graph* g = new Graph(OpRegistry::Global());
-  Tensor lhs(DT_FLOAT, TensorShape({rows, cols}));
-  lhs.flat<float>().setRandom();
-  TensorShape rhs_shape;
-  if (dim == 0) {
+  TensorShape lhs_shape, rhs_shape;
+  if (dim == 0) {  // row
+    lhs_shape = TensorShape({rows, cols});
     rhs_shape = TensorShape({rows, 1});
-  } else {
+  } else if (dim == 1) {  // col
+    lhs_shape = TensorShape({rows, cols});
     rhs_shape = TensorShape({cols});
+  } else if (dim == 2) {  // cross_rc
+    lhs_shape = TensorShape({rows, 1});
+    rhs_shape = TensorShape({1, cols});
+  } else {  // cross_cr
+    lhs_shape = TensorShape({1, cols});
+    rhs_shape = TensorShape({rows, 1});
   }
+  Tensor lhs(DT_FLOAT, lhs_shape);
+  lhs.flat<float>().setRandom();
   Tensor rhs(DT_FLOAT, rhs_shape);
   rhs.flat<float>().setRandom();
   test::graph::Binary(g, "Add", test::graph::Constant(g, lhs),
@@ -298,5 +306,59 @@ BM_BCAST_ADD_COL_ALL(sycl);
 #undef BM_BCAST_ADD_COL_ALL
 #undef BM_BCAST_ADD_COL
 
+#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C)                            \
+  void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(int iters, int arg) { \
+    const int rows = RowsFromArg(arg);                                 \
+    const int cols = ColsFromArg(arg);                                 \
+    const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::ItemsProcessed(tot);                                      \
+    testing::BytesProcessed(tot * sizeof(float));                      \
+    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters);      \
+  }                                                                    \
+  BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C)                 \
+      ->Arg(RowsAndColsArg(R, C));
+
+#define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE)   \
+  BM_BCAST_ADD_CROSS_RC(DEVICE, 512, 2048); \
+  BM_BCAST_ADD_CROSS_RC(DEVICE, 512, 4096); \
+  BM_BCAST_ADD_CROSS_RC(DEVICE, 2048, 512); \
+  BM_BCAST_ADD_CROSS_RC(DEVICE, 4096, 512);
+BM_BCAST_ADD_CROSS_RC_ALL(cpu);
+#if GOOGLE_CUDA
+BM_BCAST_ADD_CROSS_RC_ALL(gpu);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BCAST_ADD_CROSS_RC_ALL(sycl);
+#endif  // TENSORFLOW_USE_SYCL
+#undef BM_BCAST_ADD_CROSS_RC_ALL
+#undef BM_BCAST_ADD_CROSS_RC
+
+#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C)                            \
+  void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(int iters, int arg) { \
+    const int rows = RowsFromArg(arg);                                 \
+    const int cols = ColsFromArg(arg);                                 \
+    const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::ItemsProcessed(tot);                                      \
+    testing::BytesProcessed(tot * sizeof(float));                      \
+    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters);      \
+  }                                                                    \
+  BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C)                 \
+      ->Arg(RowsAndColsArg(R, C));
+
+#define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE)   \
+  BM_BCAST_ADD_CROSS_CR(DEVICE, 512, 2048); \
+  BM_BCAST_ADD_CROSS_CR(DEVICE, 512, 4096); \
+  BM_BCAST_ADD_CROSS_CR(DEVICE, 2048, 512); \
+  BM_BCAST_ADD_CROSS_CR(DEVICE, 4096, 512);
+BM_BCAST_ADD_CROSS_CR_ALL(cpu);
+#if GOOGLE_CUDA
+BM_BCAST_ADD_CROSS_CR_ALL(gpu);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BCAST_ADD_CROSS_CR_ALL(sycl);
+#endif  // TENSORFLOW_USE_SYCL
+#undef BM_BCAST_ADD_CROSS_CR_ALL
+#undef BM_BCAST_ADD_CROSS_CR
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 1e3b0c231f35c12d2e9e23d8d503b3a7492ab676..c78e0aff83331998c172ebf572ec9954759d19f4 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -10,34 +10,13 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_kernel_library",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-cc_library(
-    name = "stats_aggregator",
-    hdrs = ["stats_aggregator.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
+    "tf_cc_test",
 )
 
 tf_kernel_library(
     name = "stats_aggregator_ops",
     srcs = ["stats_aggregator_ops.cc"],
     deps = [
-        ":stats_aggregator",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -49,14 +28,7 @@ cc_library(
     name = "dataset",
     srcs = [],
     hdrs = ["dataset.h"],
-    deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    deps = ["//tensorflow/core:framework"],
 )
 
 cc_library(
@@ -113,6 +85,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "slide_dataset_op",
+    srcs = ["slide_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
 tf_kernel_library(
     name = "padded_batch_dataset_op",
     srcs = ["padded_batch_dataset_op.cc"],
@@ -162,6 +147,7 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
     ],
 )
@@ -209,6 +195,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "generator_dataset_op",
+    srcs = ["generator_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "scan_dataset_op",
     srcs = ["scan_dataset_op.cc"],
@@ -265,6 +264,26 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "prefetch_autotuner",
+    srcs = ["prefetch_autotuner.cc"],
+    hdrs = ["prefetch_autotuner.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "prefetch_autotuner_test",
+    srcs = ["prefetch_autotuner_test.cc"],
+    deps = [
+        ":prefetch_autotuner",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
 
@@ -273,6 +292,7 @@ tf_kernel_library(
     srcs = ["prefetch_dataset_op.cc"],
     deps = [
         ":dataset",
+        ":prefetch_autotuner",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -323,7 +343,6 @@ tf_kernel_library(
     srcs = ["stats_dataset_ops.cc"],
     deps = [
         ":dataset",
-        ":stats_aggregator",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -331,6 +350,16 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "stats_aggregator_dataset_op",
+    srcs = ["stats_aggregator_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "random_dataset_op",
     srcs = ["random_dataset_op.cc"],
@@ -417,6 +446,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "unbatch_dataset_op",
+    srcs = ["unbatch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -473,7 +515,7 @@ tf_kernel_library(
     srcs = ["iterator_ops.cc"],
     deps = [
         ":dataset",
-        ":stats_aggregator",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -498,18 +540,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "unique_dataset_op",
-    srcs = ["unique_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "dataset_ops",
     deps = [
@@ -519,6 +549,7 @@ tf_kernel_library(
         ":dense_to_sparse_batch_dataset_op",
         ":filter_dataset_op",
         ":flat_map_dataset_op",
+        ":generator_dataset_op",
         ":group_by_window_dataset_op",
         ":interleave_dataset_op",
         ":iterator_ops",
@@ -535,15 +566,31 @@ tf_kernel_library(
         ":scan_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
+        ":slide_dataset_op",
         ":sparse_tensor_slice_dataset_op",
         ":sql_dataset_ops",
+        ":stats_aggregator_dataset_op",
         ":stats_aggregator_ops",
         ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
         ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
-        ":unique_dataset_op",
+        ":unbatch_dataset_op",
+        ":writer_ops",
         ":zip_dataset_op",
     ],
 )
+
+tf_kernel_library(
+    name = "writer_ops",
+    srcs = ["writer_ops.cc"],
+    deps = [
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index f0a2192826e051586e4999d729c24ed5495be0ea..4b4728dab68523aa54176bdce6222a7aa5f8e9d3 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -308,6 +308,21 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             cache_(new std::vector<std::vector<Tensor>>) {}
 
+      ~MemoryWriterIterator() override {
+        mutex_lock l(mu_);
+        if (cache_) {
+          LOG(ERROR)
+              << "The calling iterator did not fully read the dataset we were "
+                 "attempting to cache. In order to avoid unexpected truncation "
+                 "of the sequence, the current [partially cached] sequence "
+                 "will be dropped. This can occur if you have a sequence "
+                 "similar to `dataset.cache().take(k).repeat()`. Instead, swap "
+                 "the order (i.e. `dataset.take(k).cache().repeat()`)";
+          mutex_lock l2(dataset()->mu_);
+          dataset()->writer_iterator_created_ = false;
+        }
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
@@ -318,7 +333,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           // Guard on cache_ to not crash if GetNext is called a second time
           // after *end_of_sequence == true
           if (cache_) {
-            mutex_lock l2(dataset()->mu_);
+            mutex_lock l(dataset()->mu_);
             DCHECK(dataset()->writer_iterator_created_);
             DCHECK(!dataset()->cache_);
             cache_.swap(dataset()->cache_);
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index c4aa9ec26545a2792c1e741af69f61a292fcc216..dd61b7daee153bf2f3be3c72dd5c8e6032d0080b 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -256,6 +256,62 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
   return frame.ConsumeRetvals(rets);
 }
 
+Status CapturedFunction::Instantiate(IteratorContext* ctx) {
+  FunctionLibraryRuntime::Handle unused_handle;
+  TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &unused_handle));
+  mutex_lock l(mu_);
+  if (captured_runner_ == nullptr) {
+    captured_runner_ = *ctx->runner();
+  }
+  return Status::OK();
+}
+
+Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
+                                         std::vector<Tensor>* rets) {
+  FunctionLibraryRuntime* lib;
+  FunctionLibraryRuntime::Handle handle;
+  std::function<void(std::function<void()>)>* runner;
+  {
+    tf_shared_lock l(mu_);
+    if (lib_ == nullptr) {
+      return errors::FailedPrecondition(
+          "`CapturedFunction::Instantiate()` must be called before a call to "
+          "`CapturedFunction::RunInstantiated()`.");
+    }
+    lib = lib_;
+    handle = f_handle_;
+    runner = &captured_runner_;
+  }
+
+  FunctionLibraryRuntime::Options f_opts;
+  f_opts.step_id = CapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(f_opts.step_id, [lib](const string& name) {
+    lib->device()->resource_manager()->Cleanup(name).IgnoreError();
+  });
+  f_opts.step_container = &step_container;
+  f_opts.runner = runner;
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  CancellationManager c_mgr;
+  f_opts.cancellation_manager = &c_mgr;
+
+  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  Notification n;
+  Status s;
+
+  lib->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+    s.Update(func_status);
+    n.Notify();
+  });
+  n.WaitForNotification();
+  TF_RETURN_IF_ERROR(s);
+  return frame.ConsumeRetvals(rets);
+}
+
 void CapturedFunction::RunAsync(IteratorContext* ctx,
                                 std::vector<Tensor>&& args,
                                 std::vector<Tensor>* rets,
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index 32d2bc3aaebf440584934231a8555199026074ae..490f5cd1e3b6676decc6646df9dfb722524d58e8 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -64,6 +64,21 @@ class CapturedFunction {
                              const std::vector<Tensor>& args,
                              std::vector<Tensor>* rets);
 
+  // Explicitly instantiate this function for use in the given
+  // context. This method, and the context-less overload
+  // `RunInstantiated()` below can be useful for calling a captured
+  // function in cases where an `IteratorContext*` is not available
+  // (such as a destructor).
+  Status Instantiate(IteratorContext* ctx);
+
+  // Synchronously runs the captured function on the given `args`, and stores
+  // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
+  // possible.
+  //
+  // REQUIRES: `this->Instantiate()` must have been called before this method.
+  Status RunInstantiated(const std::vector<Tensor>& args,
+                         std::vector<Tensor>* rets);
+
   // Asynchronously runs the captured function on the given `args`, stores
   // the results in `*rets`, and calls the given `done` callback when the
   // function returns. This method takes ownership of the tensors in `args`,
@@ -99,6 +114,7 @@ class CapturedFunction {
   FunctionLibraryRuntime::Handle f_handle_ GUARDED_BY(mu_);
   const std::vector<Tensor> captured_inputs_;
   DataTypeSlice ret_types_;
+  std::function<void(std::function<void()>)> captured_runner_ = nullptr;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
 };
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index e3a3601ee847148c459ab33decb8528f8b96521d..67ddb52d577edb6609764b55c66f1885e6e4d60b 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/common_runtime/device.h"
 
 namespace tensorflow {
 
@@ -45,6 +46,18 @@ Status MakeIteratorFromInputElement(
   return Status::OK();
 }
 
+IteratorContext MakeIteratorContext(OpKernelContext* ctx) {
+  IteratorContext::Params params;
+  params.env = ctx->env();
+  params.runner = *(ctx->runner());
+  params.lib = ctx->function_library();
+  DeviceBase* device = ctx->function_library()->device();
+  params.allocator_getter = [device](AllocatorAttributes attrs) {
+    return device->GetAllocator(attrs);
+  };
+  return IteratorContext(params);
+}
+
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 6c4191c2be6c55bfde7c5e8bd2e3b1e92edbaf27..e5ca71dd99d7f5cba095f9ce3daa016150c4a6c9 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -28,6 +28,8 @@ Status MakeIteratorFromInputElement(
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
     std::unique_ptr<IteratorBase>* out_iterator);
 
+IteratorContext MakeIteratorContext(OpKernelContext* ctx);
+
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index d16b5b7d416b85695287ccbab4bc4398a222c139..186b1e1c6c5a3d5a4aa8e4eb31d474aac4156243 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -44,21 +45,45 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.push_back(t);
     }
 
+    FunctionLibraryRuntime::Handle pred_handle;
+    OP_REQUIRES_OK(ctx,
+                   ctx->function_library()->Instantiate(
+                       func_.name(), AttrSlice(&func_.attr()), &pred_handle));
+    auto cleanup = gtl::MakeCleanup([ctx, pred_handle]() {
+      OP_REQUIRES_OK(ctx, ctx->function_library()->ReleaseHandle(pred_handle));
+    });
+
+    const FunctionBody* pred_body =
+        ctx->function_library()->GetFunctionBody(pred_handle);
+    OP_REQUIRES(ctx, pred_body->ret_nodes.size() == 1,
+                errors::InvalidArgument(
+                    "predicate function must have a single return value."));
+    Node* ret_node = pred_body->ret_nodes[0];
+    Node* ret_input_node;
+    OP_REQUIRES_OK(ctx, ret_node->input_node(0, &ret_input_node));
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(
                             func_, std::move(other_arguments), &captured_func));
 
-    *output = new Dataset(ctx, input, func_, std::move(captured_func));
+    if (ret_input_node->def().op() == "_Arg") {
+      int32 index = -1;
+      OP_REQUIRES_OK(ctx, GetNodeAttr(ret_input_node->def(), "index", &index));
+      *output = new FilterTensorDataset(ctx, input, func_,
+                                        std::move(captured_func), index);
+    } else {
+      *output = new FilterFunctionDataset(ctx, input, func_,
+                                          std::move(captured_func));
+    }
   }
 
  private:
   const int graph_def_version_;
 
-  class Dataset : public GraphDatasetBase {
+  class FilterDatasetBase : public GraphDatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func)
+    FilterDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
+                      const NameAttrList& func,
+                      std::unique_ptr<CapturedFunction> captured_func)
         : GraphDatasetBase(ctx),
           input_(input),
           func_(func),
@@ -66,7 +91,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       input_->Ref();
     }
 
-    ~Dataset() override { input_->Unref(); }
+    ~FilterDatasetBase() override { input_->Unref(); }
 
     std::unique_ptr<IteratorBase> MakeIterator(
         const string& prefix) const override {
@@ -112,11 +137,15 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
+    virtual Status EvaluatePredicate(IteratorContext* ctx,
+                                     const std::vector<Tensor>& element,
+                                     bool* out_matched) const = 0;
+
    private:
-    class Iterator : public DatasetIterator<Dataset> {
+    class Iterator : public DatasetIterator<FilterDatasetBase> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
+          : DatasetIterator<FilterDatasetBase>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -143,18 +172,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-          // stack-rip the iterators and use async kernels.
-          std::vector<Tensor> result;
-          TF_RETURN_IF_ERROR(dataset()->captured_func_->RunWithBorrowedArgs(
-              ctx, *out_tensors, &result));
-
-          if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-              result[0].NumElements() != 1) {
-            return errors::InvalidArgument(
-                "Filter predicate `f` must return a scalar bool.");
-          }
-          matched = result[0].scalar<bool>()();
+          TF_RETURN_IF_ERROR(
+              dataset()->EvaluatePredicate(ctx, *out_tensors, &matched));
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -192,9 +211,61 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     const DatasetBase* const input_;
     const NameAttrList func_;
+
+   protected:
     const std::unique_ptr<CapturedFunction> captured_func_;
   };
 
+  class FilterFunctionDataset : public FilterDatasetBase {
+   public:
+    using FilterDatasetBase::FilterDatasetBase;
+
+   protected:
+    Status EvaluatePredicate(IteratorContext* ctx,
+                             const std::vector<Tensor>& element,
+                             bool* out_matched) const override {
+      // TODO(mrry): Avoid blocking a threadpool thread. We will need to
+      // stack-rip the iterators and use async kernels.
+      std::vector<Tensor> result;
+      TF_RETURN_IF_ERROR(
+          captured_func_->RunWithBorrowedArgs(ctx, element, &result));
+
+      if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+          result[0].NumElements() != 1) {
+        return errors::InvalidArgument(
+            "Filter predicate `f` must return a scalar bool.");
+      }
+      *out_matched = result[0].scalar<bool>()();
+      return Status::OK();
+    }
+  };
+
+  class FilterTensorDataset : public FilterDatasetBase {
+   public:
+    FilterTensorDataset(OpKernelContext* ctx, const DatasetBase* input,
+                        const NameAttrList& func,
+                        std::unique_ptr<CapturedFunction> captured_func,
+                        int32 index)
+        : FilterDatasetBase(ctx, input, func, std::move(captured_func)),
+          index_(index) {}
+
+   protected:
+    Status EvaluatePredicate(IteratorContext* ctx,
+                             const std::vector<Tensor>& element,
+                             bool* out_matched) const override {
+      const Tensor& predicate = element[index_];
+      if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+        return errors::InvalidArgument(
+            "Filter predicate `f` must return a scalar bool.");
+      }
+      *out_matched = predicate.scalar<bool>()();
+      return Status::OK();
+    }
+
+   private:
+    const int32 index_;
+  };
+
  private:
   NameAttrList func_;
 };
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f1e441b91d0102b112523a46ac75ce415eacdd7
--- /dev/null
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -0,0 +1,201 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <vector>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class GeneratorDatasetOp : public DatasetOpKernel {
+ public:
+  explicit GeneratorDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init_func", &init_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("next_func", &next_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("finalize_func", &finalize_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    OpInputList init_func_other_args_input;
+    OP_REQUIRES_OK(ctx, ctx->input_list("init_func_other_args",
+                                        &init_func_other_args_input));
+    std::vector<Tensor> init_func_other_args;
+    init_func_other_args.reserve(init_func_other_args_input.size());
+    for (const Tensor& t : init_func_other_args_input) {
+      init_func_other_args.push_back(t);
+    }
+    std::unique_ptr<CapturedFunction> init_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(
+                 init_func_, std::move(init_func_other_args), &init_func));
+
+    OpInputList next_func_other_args_input;
+    OP_REQUIRES_OK(ctx, ctx->input_list("next_func_other_args",
+                                        &next_func_other_args_input));
+    std::vector<Tensor> next_func_other_args;
+    next_func_other_args.reserve(next_func_other_args_input.size());
+    for (const Tensor& t : next_func_other_args_input) {
+      next_func_other_args.push_back(t);
+    }
+    std::unique_ptr<CapturedFunction> next_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(
+                 next_func_, std::move(next_func_other_args), &next_func));
+
+    OpInputList finalize_func_other_args_input;
+    OP_REQUIRES_OK(ctx, ctx->input_list("finalize_func_other_args",
+                                        &finalize_func_other_args_input));
+    std::vector<Tensor> finalize_func_other_args;
+    finalize_func_other_args.reserve(finalize_func_other_args_input.size());
+    for (const Tensor& t : finalize_func_other_args_input) {
+      finalize_func_other_args.push_back(t);
+    }
+    std::unique_ptr<CapturedFunction> finalize_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            finalize_func_, std::move(finalize_func_other_args),
+                            &finalize_func));
+
+    *output =
+        new Dataset(ctx, std::move(init_func), std::move(next_func),
+                    std::move(finalize_func), output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::unique_ptr<CapturedFunction> init_func,
+            std::unique_ptr<CapturedFunction> next_func,
+            std::unique_ptr<CapturedFunction> finalize_func,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : GraphDatasetBase(ctx),
+          init_func_(std::move(init_func)),
+          next_func_(std::move(next_func)),
+          finalize_func_(std::move(finalize_func)),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Generator")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "GeneratorDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      ~Iterator() override {
+        if (!finalized_) {
+          std::vector<Tensor> ignored;
+          Status s =
+              dataset()->finalize_func_->RunInstantiated(state_, &ignored);
+          if (!s.ok()) {
+            LOG(WARNING)
+                << "Error occurred when finalizing GeneratorDataset iterator: "
+                << s;
+          }
+        }
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        if (!initialized_) {
+          TF_RETURN_IF_ERROR(
+              dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
+          // Explicitly instantiate the finalize function here so that
+          // we can invoke it in the destructor.
+          TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
+          initialized_ = true;
+        }
+
+        if (finalized_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        Status s = dataset()->next_func_->RunWithBorrowedArgs(ctx, state_,
+                                                              out_tensors);
+        if (s.ok()) {
+          *end_of_sequence = false;
+        } else if (errors::IsOutOfRange(s)) {
+          // `next_func` may deliberately raise `errors::OutOfRange`
+          // to indicate that we should terminate the iteration.
+          s = Status::OK();
+          *end_of_sequence = true;
+
+          // NOTE(mrry): We ignore any tensors returned by the
+          // finalize function.
+          std::vector<Tensor> ignored;
+          TF_RETURN_IF_ERROR(
+              dataset()->finalize_func_->RunInstantiated(state_, &ignored));
+          finalized_ = true;
+        }
+        return s;
+      }
+
+     private:
+      mutex mu_;
+      bool initialized_ GUARDED_BY(mu_) = false;
+      bool finalized_ GUARDED_BY(mu_) = false;
+      std::vector<Tensor> state_ GUARDED_BY(mu_);
+    };
+
+    const std::unique_ptr<CapturedFunction> init_func_;
+    const std::unique_ptr<CapturedFunction> next_func_;
+    const std::unique_ptr<CapturedFunction> finalize_func_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList init_func_;
+  NameAttrList next_func_;
+  NameAttrList finalize_func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("GeneratorDataset").Device(DEVICE_CPU),
+                        GeneratorDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 834c06bb930d1c723c5b3f880dcc13a892bb44f7..46f43dd1b1dcd79e1fc1f8fadc24858f3f7eae9f 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -263,6 +263,11 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 }
                 const int64 window_size =
                     window_size_func_output[0].scalar<int64>()();
+                if (window_size <= 0) {
+                  return errors::InvalidArgument(
+                      "Window size must be greater than zero, but got ",
+                      window_size, ".");
+                }
                 window_sizes_[key] = window_size;
               }
 
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index d7d4ad5cf7f6d5a3386be524c7a227006da0b3f4..a2f6c5fe2c3a4b92e67ba2785b2509e4f5029213 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -19,11 +19,12 @@ limitations under the License.
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -141,14 +142,20 @@ class IteratorResource : public ResourceBase {
     std::vector<Tensor> outputs;
     GraphRunner graph_runner(ctx->env());
 
-    // Build a new FLR that knows about the functions in the graph.
-    std::shared_ptr<FunctionLibraryDefinition> flib_def(
-        new FunctionLibraryDefinition(
-            *ctx->function_library()->GetFunctionLibraryDefinition()));
+    // Build a new FLR that knows about the functions in the graph, and use
+    // it for all operations on the restored iterator.
+    // NOTE(mrry): We clone the existing FLR and use it in the GraphRunner
+    // because some of the OpKernels in the graph might call functions that are
+    // only defined in the loaded GraphDef.
+    FunctionLibraryRuntime* lib;
+    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+    TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
     TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
 
     TF_RETURN_IF_ERROR(
-        graph_runner.Run(&graph, lib_, {}, {output_node}, &outputs));
+        graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
     TF_RETURN_IF_ERROR(set_iterator(dataset->MakeIterator("Iterator")));
@@ -158,9 +165,8 @@ class IteratorResource : public ResourceBase {
       IteratorContext::Params params;
       params.env = ctx->env();
       params.runner = *(ctx->runner());
-      params.function_library = flib_def;
-      params.lib = lib_;
-      DeviceBase* device = lib_->device();
+      params.lib = lib;
+      DeviceBase* device = lib->device();
       params.allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
       };
@@ -168,7 +174,10 @@ class IteratorResource : public ResourceBase {
 
       TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader));
       mutex_lock l(mu_);
+      device_mgr_ = std::move(device_mgr);
       lib_def_ = std::move(flib_def);
+      pflr_ = std::move(pflr);
+      lib_ = lib;
       return Status::OK();
     } else {
       return errors::FailedPrecondition(
@@ -195,10 +204,6 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-  void set_stats_aggregator(std::shared_ptr<StatsAggregator> stats_aggregator) {
-    mutex_lock l(mu_);
-    stats_aggregator_ = std::move(stats_aggregator);
-  }
 
   std::shared_ptr<StatsAggregator> stats_aggregator() {
     tf_shared_lock l(mu_);
@@ -579,9 +584,9 @@ class MakeIteratorOp : public OpKernel {
     IteratorResource* iterator_resource;
     OP_REQUIRES_OK(
         ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
+    core::ScopedUnref unref(iterator_resource);
     OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(
                             dataset->MakeIterator("Iterator")));
-    iterator_resource->Unref();
   }
 };
 
@@ -605,17 +610,7 @@ class ToSingleElementOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       auto iterator = dataset->MakeIterator("SingleElementIterator");
 
-      IteratorContext::Params params;
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
-      params.lib = ctx->function_library();
-      DeviceBase* device = ctx->function_library()->device();
-      params.allocator_getter = [device](AllocatorAttributes attrs) {
-        return device->GetAllocator(attrs);
-      };
-
-      IteratorContext iter_ctx(std::move(params));
-
+      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
@@ -859,9 +854,7 @@ class IteratorGetNextOp : public AsyncOpKernel {
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
     thread_pool_->Schedule(std::bind(
-        [this, ctx, iterator](DoneCallback done) {
-          core::ScopedUnref unref_iterator(iterator);
-
+        [ctx, iterator](DoneCallback done) {
           std::vector<Tensor> components;
           bool end_of_sequence = false;
 
@@ -878,17 +871,22 @@ class IteratorGetNextOp : public AsyncOpKernel {
           };
           IteratorContext iter_ctx(std::move(params));
 
-          OP_REQUIRES_OK_ASYNC(
-              ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
-              done);
-          OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
-                            errors::OutOfRange("End of sequence"), done);
-
-          for (int i = 0; i < components.size(); ++i) {
-            // TODO(mrry): Check that the shapes match the shape attrs.
-            ctx->set_output(i, components[i]);
+          Status s =
+              iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+          // NOTE(mrry): We must unref the iterator before calling `done()`, to
+          // avoid destruction races.
+          iterator->Unref();
+
+          if (!s.ok()) {
+            ctx->SetStatus(s);
+          } else if (end_of_sequence) {
+            ctx->SetStatus(errors::OutOfRange("End of sequence"));
+          } else {
+            for (int i = 0; i < components.size(); ++i) {
+              // TODO(mrry): Check that the shapes match the shape attrs.
+              ctx->set_output(i, components[i]);
+            }
           }
-
           done();
         },
         std::move(done)));
@@ -1064,30 +1062,6 @@ class DeserializeIteratorOp : public OpKernel {
   }
 };
 
-class IteratorSetStatsAggregatorOp : public OpKernel {
- public:
-  explicit IteratorSetStatsAggregatorOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    IteratorResource* iterator_resource;
-    OP_REQUIRES_OK(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
-    core::ScopedUnref unref_iterator(iterator_resource);
-
-    StatsAggregatorResource* stats_aggregator_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
-                                       &stats_aggregator_resource));
-    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
-    // TODO(mrry): Consider allowing multiple StatsAggregator ops to
-    // subscribe to updates, and/or unsubscribing.
-    OP_REQUIRES(ctx, !iterator_resource->stats_aggregator(),
-                errors::FailedPrecondition(
-                    "Iterator already associated with a StatsAggregator"));
-    iterator_resource->set_stats_aggregator(
-        stats_aggregator_resource->stats_aggregator());
-  }
-};
 
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
@@ -1108,8 +1082,6 @@ REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("DeserializeIterator").Device(DEVICE_CPU),
                         DeserializeIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorSetStatsAggregator").Device(DEVICE_CPU),
-                        IteratorSetStatsAggregatorOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 9ce263732f6e6c907dfdc89692455daa5dca86d1..7bc43e2072546118a2630bb6acfc170e3348798c 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -66,28 +66,37 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 errors::InvalidArgument(
                     "num_parallel_batches must be greater than zero."));
 
+    bool drop_remainder;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
+
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(
                             func_, std::move(other_arguments), &captured_func));
 
-    *output = new Dataset(input, batch_size, num_parallel_batches,
-                          output_types_, output_shapes_,
+    *output = new Dataset(ctx, input, batch_size, num_parallel_batches,
+                          drop_remainder, output_types_, output_shapes_, func_,
                           std::move(captured_func), &ctx->eigen_cpu_device());
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input, int64 batch_size,
-            int64 num_parallel_batches, const DataTypeVector& output_types,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
+            int64 num_parallel_batches, bool drop_remainder,
+            const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
           batch_size_(batch_size),
           num_parallel_batches_(num_parallel_batches),
+          drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
+          map_fn_(func),
           captured_func_(std::move(captured_func)),
           device_(device) {
       input_->Ref();
@@ -111,6 +120,48 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "MapAndBatchDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* batch_size_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
+      Node* num_parallel_batches_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_batches_, &num_parallel_batches_node));
+      Node* drop_remainder_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(map_fn_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(2, batch_size_node),
+           std::make_pair(3, num_parallel_batches_node),
+           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -177,13 +228,21 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           batch_results_[current_batch_index_].output.clear();
         } else {
           if (num_elements < dataset()->batch_size_) {
+            if (dataset()->drop_remainder_) {
+              // Deallocate tensors allocated for the output.
+              batch_results_[current_batch_index_].output.clear();
+              *end_of_sequence = true;
+              return Status::OK();
+            }
             const std::vector<Tensor>& output =
                 batch_results_[current_batch_index_].output;
             for (size_t i = 0; i < output.size(); ++i) {
               TensorShape component_shape(
                   batch_results_[current_batch_index_].output[i].shape());
               component_shape.set_dim(0, num_elements);
-              Tensor component(ctx->allocator({}), output[i].dtype(),
+              AllocatorAttributes attr;
+              attr.set_gpu_compatible(true);
+              Tensor component(ctx->allocator(attr), output[i].dtype(),
                                component_shape);
               TF_RETURN_IF_ERROR(
                   CopyPartialBatch(&component, output[i], num_elements));
@@ -203,9 +262,83 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return status;
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (current_batch_index_ == -1) {
+          // Iterator has not been used. Nothing to save.
+          return Status::OK();
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_batch_index"),
+                                               current_batch_index_));
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("invocation_results_size"), invocation_results_.size()));
+        for (size_t i = 0; i < invocation_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteInvocationResultLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
+                                               batch_results_.size()));
+        for (size_t i = 0; i < batch_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteBatchResultLocked(writer, i));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("current_batch_index"))) {
+          // Iterator was never used so nothing to restore.
+          return Status::OK();
+        }
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_batch_index"), &temp));
+          current_batch_index_ = static_cast<int32>(temp);
+          if (current_batch_index_ != temp) {
+            return errors::Internal("Invalid value for current_batch_index ",
+                                    temp);
+          }
+        }
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        size_t invocation_results_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("invocation_results_size"), &temp));
+          invocation_results_size = static_cast<size_t>(temp);
+          if (invocation_results_size != temp) {
+            return errors::Internal(
+                "Invalid value for invocation_results_size ", temp);
+          }
+        }
+        CHECK_EQ(invocation_results_.size(), invocation_results_size);
+        for (size_t i = 0; i < invocation_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadInvocationResultLocked(reader, i));
+        }
+        size_t batch_results_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("batch_results_size"), &temp));
+          batch_results_size = static_cast<size_t>(temp);
+          if (batch_results_size != temp) {
+            return errors::Internal("Invalid value for batch_results_size ",
+                                    temp);
+          }
+        }
+        CHECK_EQ(batch_results_.size(), batch_results_size);
+        for (size_t i = 0; i < batch_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadBatchResultLocked(ctx, reader, i));
+        }
+        return Status::OK();
+      }
+
      private:
       struct BatchResult {
-        mutex mu;
+        mutex mu ACQUIRED_AFTER(mu_);
         bool output_allocated GUARDED_BY(mu);
         std::vector<Tensor> output;
         std::unique_ptr<BlockingCounter> counter;
@@ -255,7 +388,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < num_components; ++i) {
           TensorShape component_shape({dataset()->batch_size_});
           component_shape.AppendShape(return_values[i].shape());
-          Tensor component(ctx->allocator({}), return_values[i].dtype(),
+          AllocatorAttributes attr;
+          attr.set_gpu_compatible(true);
+          Tensor component(ctx->allocator(attr), return_values[i].dtype(),
                            component_shape);
           batch_result->output.emplace_back(std::move(component));
         }
@@ -333,7 +468,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       void StartInvocationBatch(IteratorContext* ctx, int64 batch_index)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        port::Tracing::TraceMe activity(strings::StrCat(prefix(), "::Start"));
+        tracing::ScopedActivity activity(strings::StrCat(prefix(), "::Start"));
         // Initialize batch result.
         {
           mutex_lock l(batch_results_[batch_index].mu);
@@ -358,7 +493,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status WaitForBatch(int64 batch_index, int64* num_elements)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        port::Tracing::TraceMe activity(strings::StrCat(prefix(), "::Wait"));
+        tracing::ScopedActivity activity(strings::StrCat(prefix(), "::Wait"));
         batch_results_[batch_index].counter->Wait();
         Status status = Status::OK();
         for (size_t i = 0; i < dataset()->batch_size_; ++i, ++*num_elements) {
@@ -377,6 +512,177 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return status;
       }
 
+      Status WriteInvocationResultLocked(IteratorStateWriter* writer,
+                                         size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        const InvocationResult& result = invocation_results_[index];
+        string prefix = strings::StrCat("invocation_results_", index);
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, full_name(strings::StrCat(prefix, "_status")),
+            result.status));
+        if (result.end_of_input) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_return_values_size")),
+            result.return_values.size()));
+        for (size_t i = 0; i < result.return_values.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_return_values_", i)),
+              result.return_values[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadInvocationResultLocked(IteratorStateReader* reader,
+                                        size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        InvocationResult* result = &invocation_results_[index];
+        string prefix = strings::StrCat("invocation_results_", index);
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, full_name(strings::StrCat(prefix, "_status")),
+            &result->status));
+        result->end_of_input = reader->Contains(
+            full_name(strings::StrCat(prefix, "_end_of_input")));
+        size_t return_values_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_return_values_size")),
+              &temp));
+          return_values_size = static_cast<size_t>(temp);
+          if (temp != return_values_size) {
+            return errors::Internal("Invalid value for return_values_size ",
+                                    return_values_size);
+          }
+        }
+        result->return_values.reserve(return_values_size);
+        for (size_t i = 0; i < return_values_size; i++) {
+          result->return_values.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_return_values_", i)),
+              &result->return_values.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteBatchResultLocked(IteratorStateWriter* writer, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Wait for the map_fn dispatches made in `InvokeFunctionLocked` to
+        // finish. This may delay saving a checkpoint by a bit but keeps the
+        // code clean and also saves us from checkpointing the state of the
+        // `BlockingCounter`.
+        int64 num_elements = 0;
+        WaitForBatch(index, &num_elements).IgnoreError();
+
+        const BatchResult& result = batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        {
+          mutex_lock l(batch_results_[index].mu);
+          if (result.output_allocated) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat(prefix, "_output_allocated")), ""));
+          }
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_output_size")),
+            result.output.size()));
+        for (size_t i = 0; i < result.output.size(); i++) {
+          // If the batch is not full, we only store the first
+          // `num_elements` values. The rest of the batch tensor is
+          // *uninitialized* and accessing that will raise msan errors.
+          if (num_elements < dataset()->batch_size_) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result.output[i].Slice(0, num_elements)));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result.output[i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status ReadBatchResultLocked(IteratorContext* ctx,
+                                   IteratorStateReader* reader, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        BatchResult* result = &batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        {
+          mutex_lock l(batch_results_[index].mu);
+          result->output_allocated = reader->Contains(
+              full_name(strings::StrCat(prefix, "_output_allocated")));
+          // Simulate that the batch was fully generated.
+          batch_results_[index].counter.reset(new BlockingCounter(0));
+        }
+        size_t output_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_output_size")), &temp));
+          output_size = static_cast<size_t>(temp);
+          if (temp != output_size) {
+            return errors::Internal("Invalid value for output_size ",
+                                    output_size);
+          }
+        }
+        result->output.reserve(output_size);
+        for (size_t i = 0; i < output_size; i++) {
+          Tensor t;
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_output_", i)), &t));
+          // If the batch was not full, we may have stored only the relevant
+          // slice. Since tensors in `BatchResult.output` are expected to
+          // have the leading dimension of size batch_size, we build a larger
+          // tensor and copy the slice read from the checkpoint into it.
+          if (t.dim_size(0) < dataset()->batch_size_) {
+            TensorShape component_shape(t.shape());
+            component_shape.set_dim(0, dataset()->batch_size_);
+            AllocatorAttributes attr;
+            attr.set_gpu_compatible(true);
+            Tensor new_t(ctx->allocator(attr), t.dtype(), component_shape);
+            TF_RETURN_IF_ERROR(CopyPartialBatch(&new_t, t, t.dim_size(0)));
+            result->output.emplace_back(std::move(new_t));
+          } else {
+            result->output.emplace_back(std::move(t));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
       mutex mu_;
       int32 current_batch_index_ GUARDED_BY(mu_) = -1;
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
@@ -388,8 +694,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const NameAttrList func_;
     const int64 batch_size_;
     const int64 num_parallel_batches_;
+    const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const NameAttrList map_fn_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
   };
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 3f88d6dee80ea7b07ef1ce88ee76edba65cddcde..fa33867ec1181257931715f42478e8518b27db6e 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -35,7 +36,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
   }
@@ -80,24 +81,28 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
-                            func_, std::move(other_arguments), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(
+                 interleave_func_, std::move(other_arguments), &captured_func));
 
     *output =
-        new Dataset(input, std::move(captured_func), cycle_length, block_length,
-                    sloppy, buffer_output_elements, prefetch_input_elements,
-                    output_types_, output_shapes_);
+        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
+                    cycle_length, block_length, sloppy, buffer_output_elements,
+                    prefetch_input_elements, output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, bool sloppy, int64 buffer_output_elements,
             int64 prefetch_input_elements, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          interleave_func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -128,6 +133,52 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       return "ParallelInterleaveDatasetOp::Dataset";
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      Node* sloppy_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
+      Node* buffer_output_elements_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
+      Node* prefetch_input_elements_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
+                                      &prefetch_input_elements_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(interleave_func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node},
+           {2, cycle_length_node},
+           {3, block_length_node},
+           {4, sloppy_node},
+           {5, buffer_output_elements_node},
+           {6, prefetch_input_elements_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
    private:
     int64 num_threads() const {
       return cycle_length_ + prefetch_input_elements_;
@@ -156,17 +207,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     // that a caller will block waiting for an element to be produced.
     //
     // Pointers to these worker states are kept in 2 disjoint data structures:
-    //  1. `interleave_` is a vector containing pointers to `WorkerState`s that
-    //  we
-    //     are interleaving. Worker threads backing these WorkerStates should
-    //     be regularly producing values.
-    //  2. `staging_` is a deque containing pointers to WorkerStates that we
-    //     will move to `interleave_` when an iterator in `interleave_` is
-    //     exhausted.
+    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
+    //     in `workers_` that we are interleaving. Worker threads backing these
+    //     WorkerStates should be regularly producing values.
+    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
+    //     `workers_` that we will move to `interleave_indices_` when an
+    //     iterator in `interleave_indices_` is exhausted.
     //
     // The client calls `GetNext[Internal]()` to retrieve an output element. The
-    // internal implementation updates the state of `interleave_` and `staging_`
-    // as output iterators (run by the worker threads) are exhausted.
+    // internal implementation updates the state of `interleave_indices_` and
+    // `staging_indices_` as output iterators (run by the worker threads) are
+    // exhausted.
     //
     // `input_impl_` is the input iterator that generates arguments for the
     // flat-map function (`captured_func_`). It is set to an iterator at
@@ -175,18 +226,19 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     // memory.
     //
     // A few invariants are maintained:
-    //  1. No element in interleave_ should be a nullptr unless `staging_` is
-    //     empty and `input_impl_` is empty.
+    //  1. No element in interleave_indices_ should be a -1 unless
+    //     `staging_indices_` is empty and `input_impl_` is empty.
     //  2. Every `worker_` element is pointed to by at most one element of the
-    //     union of `interleave_` and `staging_`.
+    //     union of `interleave_indices_` and `staging_indices_`.
     //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
-    //     an element in `interleave_` or `staging_`.
+    //     an element in `interleave_indices_` or `staging_indices_`.
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            workers_(dataset()->num_threads()) {}
+            workers_(dataset()->num_threads()),
+            worker_thread_states_(dataset()->num_threads()) {}
 
       ~Iterator() override {
         mutex_lock l(mu_);
@@ -211,10 +263,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           // not have an item readily available.
           bool can_produce_elements = false;
           bool must_wait_for_input = true;
-          for (int64 i = 0; i < interleave_.size(); ++i) {
-            int64 index = (next_index_ + i) % interleave_.size();
-            WorkerState* current_worker = interleave_[index];
-            if (!current_worker) continue;  // Empty interleave elements.
+          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
+            int64 index = (next_index_ + i) % interleave_indices_.size();
+            int64 current_worker_index = interleave_indices_[index];
+            if (current_worker_index < 0) {
+              continue;  // Empty interleave elements.
+            }
+            WorkerState* current_worker = &workers_[current_worker_index];
             can_produce_elements |= current_worker->MayHaveElements();
             if (!current_worker->outputs.empty()) {
               // We have an element!
@@ -222,7 +277,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
               if (i == 0) {
                 block_count_++;
                 if (block_count_ == dataset()->block_length_) {
-                  next_index_ = (index + 1) % interleave_.size();
+                  next_index_ = (index + 1) % interleave_indices_.size();
                   block_count_ = 0;
                 }
               } else {
@@ -245,7 +300,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
               break;
             } else if (!current_worker->is_producing) {
               // This iterator has reached end of input.
-              interleave_[index] = nullptr;
+              interleave_indices_[index] = -1;
               if (input_impl_) {
                 // Start prefetching a new iterator.
                 std::vector<Tensor> args;
@@ -255,16 +310,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                   input_impl_.reset();
                 } else {
                   current_worker->SetInputs(s, std::move(args));
-                  staging_.emplace_back(current_worker);
+                  staging_indices_.emplace_back(current_worker_index);
                 }
               }
 
-              if (!staging_.empty()) {
-                // Move a worker from `staging_` to `interleave_`.
-                interleave_[index] = staging_.front();
-                staging_.pop_front();
+              if (!staging_indices_.empty()) {
+                // Move a worker from `staging_indices_` to
+                // `interleave_indices_`.
+                interleave_indices_[index] = staging_indices_.front();
+                staging_indices_.pop_front();
 
-                next_index_ = (index + 1) % interleave_.size();
+                next_index_ = (index + 1) % interleave_indices_.size();
                 block_count_ = 0;
                 // Restart the inner [for] loop
                 can_produce_elements = true;
@@ -285,7 +341,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             if (dataset()->sloppy_) {
               sloppy_cond_var_.wait(l);
             } else {
-              interleave_[next_index_]->cond_var.wait(l);
+              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
             }
           }
         }
@@ -293,6 +349,137 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("next_index"), next_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_count"), block_count_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("workers_size"), workers_.size()));
+        for (int i = 0; i < workers_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
+        }
+        for (int i = 0; i < worker_thread_states_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
+                                               interleave_indices_.size()));
+        for (int i = 0; i < interleave_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("interleave_indices_", i)),
+              interleave_indices_[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
+                                               staging_indices_.size()));
+        for (int i = 0; i < staging_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("staging_indices_", i)),
+              staging_indices_[i]));
+        }
+        if (!worker_threads_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("worker_threads_running"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (!reader->Contains(full_name("input_exhausted"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        int64 temp;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
+        next_index_ = size_t(temp);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
+        block_count_ = size_t(temp);
+
+        // Restore WorkerStates.
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("workers_size"), &temp));
+        if (temp != dataset()->num_threads()) {
+          return errors::Internal("Expected ", dataset()->num_threads(),
+                                  " worker states but found ", temp, ".");
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
+        }
+
+        // Restore `interleave_indices_`.
+        std::set<int64> all_indices;
+        {
+          int64 interleave_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
+                                                &interleave_size));
+          interleave_indices_.reserve(interleave_size);
+          for (int64 i = 0; i < interleave_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("interleave_indices_", i)), &temp));
+            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            interleave_indices_.emplace_back(temp);
+          }
+        }
+
+        // Restore `staging_indices_`.
+        {
+          int64 staging_size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("staging_size"), &staging_size));
+          for (int i = 0; i < staging_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("staging_indices_", i)), &temp));
+            if (all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            staging_indices_.emplace_back(temp);
+          }
+        }
+
+        // Start Worker threads.
+        if (reader->Contains(full_name("worker_threads_running"))) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                std::bind(&Iterator::WorkerThread, this,
+                          new IteratorContext(*ctx), i)));
+          }
+        }
+        return Status::OK();
+      }
+
      private:
       // OutputElem contains the information from a call to GetNext by an output
       // iterator.
@@ -345,6 +532,31 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         }
       };
 
+      // The internal state of a worker thread that is not already captured
+      // in its `WorkerState`.
+      //
+      // This is needed only for checkpointing purposes. We keep this
+      // separate from `WorkerState` and guard its fields using a separate
+      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
+      struct WorkerThreadState {
+        // The output element that has been produced from the input iterator
+        // and is waiting to be added to `WorkerState.outputs`.
+        OutputElem output_elem;
+
+        // Whether the input iterator returned an `end_of_sequence`.
+        bool end_of_sequence = false;
+
+        // Status returned from `MakeIteratorFromInputElement`.
+        Status iterator_creation_status;
+
+        // The arguments to be used to construct `iterator`.
+        std::vector<Tensor> input;
+
+        std::unique_ptr<IteratorBase> iterator;
+
+        WorkerThreadState() : output_elem(Status::OK()) {}
+      };
+
       Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (worker_threads_.empty()) {
@@ -363,19 +575,38 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                 std::bind(&Iterator::WorkerThread, this,
                           new IteratorContext(*ctx), i)));
             if (i < dataset()->cycle_length_) {
-              interleave_.push_back(&workers_[i]);
+              interleave_indices_.push_back(i);
             } else {
-              staging_.push_back(&workers_[i]);
+              staging_indices_.push_back(i);
             }
           }
-          DCHECK(interleave_.size() == dataset()->cycle_length_);
-          DCHECK(staging_.size() == dataset()->prefetch_input_elements_);
+          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
+          DCHECK(staging_indices_.size() ==
+                 dataset()->prefetch_input_elements_);
         }
         return Status::OK();
       }
 
       // Produces elements into the worker's output buffers.
       void WorkerThread(IteratorContext* ctx_ptr, const int64 thread_index) {
+        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
+        //
+        // 1. Any local state that may need to be checkpointed should be kept
+        //    in `worker_thread_states_[thread_index]`.
+        // 2. `WorkerThreadState` should contain state that is needed only for
+        //    checkpointing, i.e., if we were to remove checkpointing support,
+        //    we could keep that state as local variables in this thread.
+        // 3. This thread should only read/write state at `thread_index`
+        //    and should not access other thread states.
+        // 4. When restoring from checkpoint, threads are started only after
+        //    the restore is complete.
+        // 5. Once restored from a checkpoint, the local state is edited only
+        //    by this thread. 3 & 4 allow making assumptions like temporarily
+        //    caching local state in this thread and using it outside a lock
+        //    e.g. `make_new_iterator`.
+        // 6. `ckpt_mu_` should be wisely used to create *consistent*
+        //    checkpoint markers.
+
         // std::function arguments are copy-constructable, so we pass raw
         // pointers, and then immediately wrap them to ensure correct ownership.
         std::unique_ptr<IteratorContext> ctx(ctx_ptr);
@@ -383,38 +614,135 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           mutex_lock l(mu_);
           workers_[thread_index].cond_var.notify_all();
         });
-
+        bool make_new_iterator;
+        {
+          tf_shared_lock l(ckpt_mu_);
+          // Decide whether a new iterator should be built.
+          // 1. If there is an existing iterator, we use it.
+          // 2. If there was an error in iterator creation that could not be
+          //    notified to the client we attempt to send that to the client
+          //    first.
+          make_new_iterator =
+              worker_thread_states_[thread_index].iterator == nullptr &&
+              worker_thread_states_[thread_index].iterator_creation_status.ok();
+        }
+        // Even though `make_new_iterator` has cached values from
+        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
+        // it is safe to *read* `make_new_iterator`outside of a lock without
+        // worrying about concurrent changes to values in
+        // `worker_thread_states_[thread_index]`. See comment at the start of
+        // this function for details.
         while (true) {
-          // 1. Wait for input.
-          std::vector<Tensor> input;
-          {
-            mutex_lock l(mu_);
-            while (!cancelled_ && !workers_[thread_index].is_producing) {
-              workers_[thread_index].cond_var.wait(l);
+          // Whether creation of the iterator succeeded.
+          Status iterator_creation_status;
+          // 1. Build a new iterator or use the existing one.
+          if (make_new_iterator) {
+            // 1a. Get new input tensors or use the exiting ones.
+
+            bool read_new_input;
+
+            {
+              tf_shared_lock l(ckpt_mu_);
+              // worker_thread_states_[thread_index].input will be non-empty
+              // if checkpointing happened at CHECKPOINT_MARKER_A.
+              read_new_input =
+                  worker_thread_states_[thread_index].input.empty();
             }
-            if (cancelled_) return;
-            input.swap(workers_[thread_index].input);
-          }
 
-          // 2. Run the user defined function to produce a new iterator.
-          std::unique_ptr<IteratorBase> iterator;
-          Status s = dataset::MakeIteratorFromInputElement(
-              ctx.get(), input, thread_index, dataset()->captured_func_.get(),
-              prefix(), &iterator);
-          input.clear();  // Release memory as early as possible.
+            if (read_new_input) {
+              mutex_lock l(mu_);
+              while (!cancelled_ && !workers_[thread_index].is_producing) {
+                workers_[thread_index].cond_var.wait(l);
+              }
+              if (cancelled_) return;
+              // Copy the input tensors so that we do not need to block on `mu_`
+              // when building the iterator.
+              // We keep a copy of the input tensors in
+              // `WorkerThreadState.input` till the iterator is in use. This is
+              // used in `RestoreInternal` to re-build the iterator.
+              // TODO(b/78046638): Explore ways to avoid tracking the input
+              // tensors.
+              tf_shared_lock ckpt_l(ckpt_mu_);
+              worker_thread_states_[thread_index].input.swap(
+                  workers_[thread_index].input);
+              // CHECKPOINT_MARKER_A
+              // We have the input tensors but have not built the iterator yet.
+            }
 
-          if (!s.ok()) {
+            // 1b. Run the user defined function to produce a new iterator.
+            {
+              tf_shared_lock l(ckpt_mu_);
+              worker_thread_states_[thread_index].iterator_creation_status =
+                  dataset::MakeIteratorFromInputElement(
+                      ctx.get(), worker_thread_states_[thread_index].input,
+                      thread_index, dataset()->captured_func_.get(), prefix(),
+                      &worker_thread_states_[thread_index].iterator);
+              iterator_creation_status =
+                  worker_thread_states_[thread_index].iterator_creation_status;
+              if (!iterator_creation_status.ok()) {
+                worker_thread_states_[thread_index].input.clear();
+              }
+              // CHECKPOINT_MARKER_B
+              // Either an iterator has been successfully built and placed in
+              // `worker_thread_states_[thread_index].iterator` or it failed and
+              // a non-OK status has been put in
+              // `worker_thread_states_[thread_index].iterator_creation_status`.
+            }
+          } else {
+            tf_shared_lock l(ckpt_mu_);
+            iterator_creation_status =
+                worker_thread_states_[thread_index].iterator_creation_status;
+            // Mark that we have used up the restored iterator.
+            make_new_iterator = true;
+          }
+          // 2. Start producing elements or send error state to client if
+          //    iterator creation failed.
+          if (!iterator_creation_status.ok()) {
             mutex_lock l(mu_);
-            workers_[thread_index].outputs.emplace_back(s);
+            // Wait for space in the prefetch queue.
+            while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                      dataset()->buffer_output_elements_) {
+              workers_[thread_index].cond_var.wait(l);
+            }
+            if (cancelled_) return;
+            tf_shared_lock ckpt_l(ckpt_mu_);
+            workers_[thread_index].outputs.emplace_back(
+                iterator_creation_status);
             workers_[thread_index].is_producing = false;
+            worker_thread_states_[thread_index].iterator_creation_status =
+                Status::OK();
+            // CHECKPOINT_MARKER_C
+            // Non-OK iterator creation status has been notified to the
+            // client.
             workers_[thread_index].cond_var.notify_one();
           } else {
-            // 3. Produce elements
             bool end_of_sequence = false;
             while (!end_of_sequence) {
               // 3.a Produce an element!
-              std::vector<Tensor> output_elem;
-              s = iterator->GetNext(ctx.get(), &output_elem, &end_of_sequence);
+              {
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                if (worker_thread_states_[thread_index]
+                        .output_elem.status.ok() &&
+                    worker_thread_states_[thread_index]
+                        .output_elem.output.empty() &&
+                    !worker_thread_states_[thread_index].end_of_sequence) {
+                  worker_thread_states_[thread_index].output_elem.status =
+                      worker_thread_states_[thread_index].iterator->GetNext(
+                          ctx.get(),
+                          &worker_thread_states_[thread_index]
+                               .output_elem.output,
+                          &worker_thread_states_[thread_index].end_of_sequence);
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                } else {
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                }
+                // CHECKPOINT_MARKER_D
+                // An element has been read or an error or end_of_sequence has
+                // been received from the input iterator and is waiting to be
+                // sent to client.
+              }
 
               // 3.b Make it available to the client.
               {
@@ -427,30 +755,255 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                 }
                 if (cancelled_) return;
 
-                // Output the element.
+                tf_shared_lock ckpt_l(ckpt_mu_);
                 workers_[thread_index].is_producing = !end_of_sequence;
-                if (!end_of_sequence) {
-                  workers_[thread_index].outputs.emplace_back(s);
+
+                // Output the element.
+
+                // Move the temporary state in WorkerThreadState to WorkerState
+                // and mark it as used.
+                if (end_of_sequence) {
+                  worker_thread_states_[thread_index].iterator.reset();
+                  worker_thread_states_[thread_index].input.clear();
+                  worker_thread_states_[thread_index].end_of_sequence = false;
+                } else {
+                  workers_[thread_index].outputs.emplace_back(
+                      worker_thread_states_[thread_index].output_elem.status);
                   workers_[thread_index].outputs.back().output.swap(
-                      output_elem);
+                      worker_thread_states_[thread_index].output_elem.output);
                 }
+                worker_thread_states_[thread_index].output_elem.status =
+                    Status::OK();
                 if (dataset()->sloppy_) {
                   sloppy_cond_var_.notify_one();
                 } else {
                   workers_[thread_index].cond_var.notify_one();
                 }
+                // CHECKPOINT_MARKER_E
+                // Output element or iterator status has been sent to the
+                // client.
               }
             }
           }
         }
       }
 
+      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_", index);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            workers_[index].input.size()));
+        for (int i = 0; i < workers_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              workers_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_outputs_size")),
+            workers_[index].outputs.size()));
+        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+              writer, workers_[index].outputs[i],
+              full_name(strings::StrCat(prefix, "_outputs_", i))));
+        }
+        if (workers_[index].is_producing) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_is_producing")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
+                                   IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        workers_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          workers_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &workers_[index].input.back()));
+        }
+        int64 outputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
+            &outputs_size));
+        for (int i = 0; i < outputs_size; ++i) {
+          workers_[index].outputs.emplace_back(Status::OK());
+          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+              reader, &workers_[index].outputs.back(),
+              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
+        }
+        if (reader->Contains(
+                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
+          workers_[index].is_producing = true;
+        } else {
+          workers_[index].is_producing = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
+                                          int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_thread_", index);
+        if (worker_thread_states_[index].iterator != nullptr) {
+          TF_RETURN_IF_ERROR(
+              SaveParent(writer, worker_thread_states_[index].iterator));
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            worker_thread_states_[index].input.size()));
+        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              worker_thread_states_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_iterator_creation_status"),
+            worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+            writer, worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(prefix, "_output"))));
+        if (worker_thread_states_[index].end_of_sequence) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
+                                         IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_thread_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        worker_thread_states_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          worker_thread_states_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &worker_thread_states_[index].input.back()));
+        }
+        // Restore iterator.
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
+          worker_thread_states_[index].iterator.reset();
+        } else {
+          std::unique_ptr<IteratorBase> iterator;
+          Status s = dataset::MakeIteratorFromInputElement(
+              ctx, worker_thread_states_[index].input, index,
+              dataset()->captured_func_.get(), prefix(), &iterator);
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, iterator));
+          worker_thread_states_[index].iterator.swap(iterator);
+        }
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
+            &worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+            reader, &worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(worker_prefix, "_output"))));
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
+          worker_thread_states_[index].end_of_sequence = true;
+        } else {
+          worker_thread_states_[index].end_of_sequence = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteOutputElemLocked(IteratorStateWriter* writer,
+                                   const OutputElem& output_elem,
+                                   const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_status"), output_elem.status));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
+                                output_elem.output.size()));
+        for (int i = 0; i < output_elem.output.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadOutputElemLocked(IteratorStateReader* reader,
+                                  OutputElem* output_elem, const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            strings::StrCat(prefix, "_output_size"), &output_size));
+        output_elem->output.reserve(output_size);
+        for (int i = 0; i < output_size; ++i) {
+          output_elem->output.emplace_back();
+          TF_RETURN_IF_ERROR(
+              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
+                                 &output_elem->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
       // Mutex & condition variable to guard mutable iterator internals and
       // coordinate among worker threads and client thread[s].
-      mutex mu_;
+      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
       // The main thread waits on this condition variable if running in sloppy
       // mode and no values are available.
       condition_variable sloppy_cond_var_;
+      // Mutex used to wait for a consistent state while checkpointing.
+      // Only Save and Restore require an exclusive lock on this mutex. In
+      // other scenarios we just acquire a shared lock so the pipeline's
+      // performance should not be affected in the absence of checkpointing.
+      // A thread must not wait on any condition variable while holding
+      // `ckpt_mu_` in either shared or exclusive modes.
+      mutex ckpt_mu_;
 
       // The iterator producing elements which are converted to datasets by
       // the dataset()->captured_func_ then interleaved together.
@@ -461,10 +1014,14 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // workers_ elements are in at most one of interleave_ and staging_.
       std::vector<WorkerState> workers_ GUARDED_BY(mu_);
 
-      // The iterators to interleave
-      std::vector<WorkerState*> interleave_ GUARDED_BY(mu_);
-      // Prefetched iterators
-      std::deque<WorkerState*> staging_ GUARDED_BY(mu_);
+      // Stores the temporary state of WorkerThreads which is not stored in
+      // WorkerState. This is used for checkpointing purposes only.
+      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
+
+      // Indices in `workers_` of iterators to interleave.
+      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
+      // Indices in `workers_` of prefetched iterators.
+      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
 
       // The index into output_elements_ for next element to produce.
       size_t next_index_ GUARDED_BY(mu_) = 0;
@@ -479,6 +1036,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
+    const NameAttrList interleave_func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
@@ -492,7 +1050,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
   const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
+  NameAttrList interleave_func_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index bc4426a9fdbab971a4e49d57ffcea6896fc037a7..7e373f25686899ec8599fc064f9cf7beb3ebfe95 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -199,7 +199,14 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           }
         }
         ++num_outputs_consumed_;
-        return result->status;
+        if (errors::IsOutOfRange(result->status)) {
+          // `f` may deliberately raise `errors::OutOfRange` to indicate
+          // that we should terminate the iteration early.
+          *end_of_sequence = true;
+          return Status::OK();
+        } else {
+          return result->status;
+        }
       }
 
      protected:
@@ -311,7 +318,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
         // Get the next input element.
         std::vector<Tensor> input_element;
-        bool end_of_input;
+        bool end_of_input = false;
         result->status =
             input_impl_->GetNext(ctx, &input_element, &end_of_input);
         if (end_of_input) {
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.cc b/tensorflow/core/kernels/data/prefetch_autotuner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3272f6bcde56029d878f3e61a7809594db86b24
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+namespace tensorflow {
+
+PrefetchAutotuner::PrefetchAutotuner(int64 initial_buffer_size)
+    : buffer_limit_(initial_buffer_size) {
+  if (initial_buffer_size == kAutoTune) {
+    mode_ = Mode::kUpswing;
+    buffer_limit_ = 1;
+  }
+}
+
+void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) {
+  switch (mode_) {
+    case Mode::kDisabled:
+      return;
+    case Mode::kUpswing:
+      if (current_buffer_size == buffer_limit_) {
+        mode_ = Mode::kDownswing;
+      }
+      return;
+    case Mode::kDownswing:
+      if (current_buffer_size == 0) {
+        buffer_limit_ *= 2;  // Increase the buffer size.
+        mode_ = Mode::kUpswing;
+      }
+      return;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.h b/tensorflow/core/kernels/data/prefetch_autotuner.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa8a1840723ec8af1d1314af1b89bed7f120abc0
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner.h
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// PrefetchAutotuner dynamically adjusts the buffer size of a prefetch iterator.
+//
+// PrefetchAutotuner attempts to find the minimum buffer size such that there is
+// always at least 1 element in the prefetch queue every time the downstream
+// iterator calls GetNext().
+//
+// One common failure mode of input pipelines is being throughput bound. No
+// amount of prefetching can address that performance mode. In order to guard
+// against this condition, PrefetchAutotuner will only increase the buffer_limit
+// if the prefetching thread is able to successfully fill the buffer at its
+// current size.
+//
+// Note: in the current implementation, we never decrease the buffer_limit().
+// This should change in the future!
+//
+// PrefetchAutotuner is NOT thread safe.
+class PrefetchAutotuner {
+ public:
+  static const int64 kAutoTune = -1;
+
+  explicit PrefetchAutotuner(int64 initial_buffer_size);
+
+  int64 buffer_limit() const { return buffer_limit_; }
+
+  void RecordConsumption(size_t current_buffer_size);
+  void RecordEmpty() { RecordConsumption(0); }
+
+ private:
+  // PrefetchAutotuner operates as a state machine.
+  enum class Mode {
+    // Disables the autotuning.
+    kDisabled,
+
+    // We have increased the size of the buffer, and will transition to
+    // kDownswing if we successfully fill the buffer.
+    kUpswing,
+
+    // We have successfully filled a buffer of this size. If we ever block the
+    // downstream iterator, we should increase the buffer size.
+    kDownswing,
+  };
+
+  int64 buffer_limit_;
+  Mode mode_ = Mode::kDisabled;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner_test.cc b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2f573dfb3555b2466d84c6341eaa77e69414d103
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(PrefetchAutotuner, Disabled) {
+  PrefetchAutotuner t(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);
+  t.RecordConsumption(2);
+  t.RecordConsumption(0);
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+}
+
+TEST(PrefetchAutotuner, Enabled) {
+  PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(4, t.buffer_limit());
+  t.RecordConsumption(4);
+  EXPECT_EQ(4, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(8, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same!
+  EXPECT_EQ(8, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same!
+  EXPECT_EQ(8, t.buffer_limit());
+}
+
+TEST(PrefetchAutotuner, EnabledSteady) {
+  PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(4, t.buffer_limit());
+
+  // Never reach zero again.
+  std::vector<size_t> consumption_values = {2, 3, 1, 4, 1, 2, 3, 1};
+  for (int i = 0; i < consumption_values.size(); ++i) {
+    t.RecordConsumption(consumption_values[i]);
+    EXPECT_EQ(4, t.buffer_limit())
+        << "Failed at index " << i << " with value: " << consumption_values[i];
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 1c548a30d2c8e7f33db85000d0f480b3151d6ecf..536de81fd891f1849cd285d6be4ddefb79fd3386 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
@@ -37,7 +38,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
     int64 buffer_size;
     OP_REQUIRES_OK(
         ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
-    OP_REQUIRES(ctx, buffer_size > 0,
+    OP_REQUIRES(ctx,
+                buffer_size > 0 || buffer_size == PrefetchAutotuner::kAutoTune,
                 errors::InvalidArgument("buffer_size must be > 0"));
 
     *output = new Dataset(ctx, input, buffer_size);
@@ -85,7 +87,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            auto_tuner_(params.dataset->buffer_size_) {}
 
       ~Iterator() override {
         // Signal the prefetch thread to terminate it. We will then
@@ -113,6 +116,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           // Wait until the next element in the buffer has been
           // produced, or we are shutting down.
           while (!cancelled_ && !prefetch_thread_finished_ && buffer_.empty()) {
+            auto_tuner_.RecordEmpty();
             cond_var_.wait(l);
           }
 
@@ -129,6 +133,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
             if (s.ok()) {
               *out_tensors = std::move(buffer_.front().value);
             }
+            auto_tuner_.RecordConsumption(buffer_.size());
             buffer_.pop_front();
             *end_of_sequence = false;
 
@@ -242,7 +247,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           // 1. Wait for a slot in the buffer.
           {
             mutex_lock l(mu_);
-            while (!cancelled_ && buffer_.size() == dataset()->buffer_size_) {
+            while (!cancelled_ &&
+                   buffer_.size() == auto_tuner_.buffer_limit()) {
               cond_var_.wait(l);
             }
 
@@ -323,6 +329,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
       mutex parent_mu_ ACQUIRED_BEFORE(mu_);
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
       condition_variable cond_var_;
+      PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
       std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f3537b6912bffb1c76d8a315790b4e5585a02d7
--- /dev/null
+++ b/tensorflow/core/kernels/data/slide_dataset_op.cc
@@ -0,0 +1,252 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class SlideDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SlideDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 window_size = 0;
+    int64 stride = 1;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "window_size", &window_size));
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "stride", &stride));
+    OP_REQUIRES(
+        ctx, window_size > 0,
+        errors::InvalidArgument("Window size must be greater than zero."));
+    OP_REQUIRES(
+        ctx, stride > 0 && stride < window_size,
+        errors::InvalidArgument("Stride must be in [1, window_size)."));
+
+    *output = new Dataset(ctx, window_size, stride, input);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 window_size, int64 stride, const DatasetBase* input)
+        : GraphDatasetBase(ctx), window_size_(window_size), stride_(stride), input_(input) {
+      input_->Ref();
+
+      const auto& input_shapes = input_->output_shapes();
+      output_shapes_.reserve(input_shapes.size());
+      for (const auto& input_shape : input_shapes) {
+        output_shapes_.emplace_back(
+            PartialTensorShape({-1}).Concatenate(input_shape));
+      }
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          Iterator::Params{this, strings::StrCat(prefix, "::Slide")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("SlideDatasetOp(", window_size_, ", ", stride_, ")::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* window_size = nullptr;
+      Node* stride = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(window_size_, &window_size));
+      TF_RETURN_IF_ERROR(b->AddScalar(stride_, &stride));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, window_size, stride}, output));
+      return Status::OK();
+    }
+
+   private:
+
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        const int64 window_size = dataset()->window_size_;
+        const int64 stride = dataset()->stride_;
+        std::vector<std::vector<Tensor>> batch_elements;
+        {
+          mutex_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          batch_elements.reserve(window_size);
+          const bool first_call = cache_.empty();
+          if (first_call) {
+            cache_.reserve(window_size);
+          } else {
+            // Reuse cache in the previous iteration.
+            cache_.swap(batch_elements);
+          }
+          // Fill up with new elements.
+          *end_of_sequence = false;
+          for (size_t i = batch_elements.size(); i < window_size && !*end_of_sequence;
+              ++i) {
+            std::vector<Tensor> batch_element_tuple;
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+                                                    end_of_sequence));
+            if (!*end_of_sequence) {
+              batch_elements.push_back(std::move(batch_element_tuple));
+            } else {
+              input_impl_.reset();
+            }
+          }
+          // Drop the final smaller blocks.
+          if (batch_elements.size() < window_size) {
+            DCHECK(*end_of_sequence);
+            return Status::OK();
+          }
+          // Cache the data used for the next iteration.
+          for (size_t i = stride; i < window_size; ++i) {
+            cache_.emplace_back(batch_elements[i]);
+          }
+        }
+
+        // Construct output tensors.
+        // Those codes below are copied from batch_dataset_op.cc.
+        const size_t num_tuple_components = batch_elements[0].size();
+        const int64 num_batch_elements = batch_elements.size();
+        for (size_t component_index = 0; component_index < num_tuple_components;
+             ++component_index) {
+          const Tensor& first_element = batch_elements[0][component_index];
+          TensorShape batch_component_shape({num_batch_elements});
+          batch_component_shape.AppendShape(first_element.shape());
+          Tensor batch_component(cpu_allocator(), first_element.dtype(),
+                                 batch_component_shape);
+          // Build the output tuple component by copying one slice
+          // from each input element in the batch.
+          for (size_t i = 0; i < num_batch_elements; ++i) {
+            if (batch_elements[i][component_index].shape() !=
+                first_element.shape()) {
+              return errors::InvalidArgument(
+                  "Cannot batch tensors with different shapes in component ",
+                  component_index, ". First element had shape ",
+                  first_element.shape().DebugString(), " and element ", i,
+                  " had shape ",
+                  batch_elements[i][component_index].shape().DebugString(),
+                  ".");
+            }
+            TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                std::move(batch_elements[i][component_index]), &batch_component,
+                i));
+          }
+          out_tensors->emplace_back(std::move(batch_component));
+        }
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        }
+        // Save cache.
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat("cache_size"), cache_.size()));
+        for (int64 i = 0; i < cache_.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              strings::StrCat("cache[", i, "]_size"), cache_[i].size()));
+          for (int64 j = 0; j < cache_[i].size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                strings::StrCat("cache[", i, "][", j, "]"), cache_[i][j]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        // Restore cache.
+        int64 cache_size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(strings::StrCat("cache_size"), &cache_size));
+        cache_.resize(cache_size);
+        for (int64 i = 0; i < cache_size; i++) {
+          int64 vector_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              strings::StrCat("cache[", i, "]_size"), &vector_size));
+          cache_[i].resize(vector_size);
+          for (int64 j = 0; j < vector_size; j++) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                strings::StrCat("cache[", i, "][", j, "]"), &cache_[i][j]));
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::vector<std::vector<Tensor>> cache_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 window_size_;
+    const int64 stride_;
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("SlideDataset").Device(DEVICE_CPU),
+                        SlideDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
index f4698bdaf7ae9767e068e49dad61d2a3d9f739a8..dc591208752c52d3f53484f5a1c564666727bb16 100644
--- a/tensorflow/core/kernels/data/sql/BUILD
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -7,18 +7,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "sql",
     srcs = [
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb96b8a872cae71a55ca5de232df2ea402db4561
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+namespace {
+
+class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SetStatsAggregatorDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    StatsAggregatorResource* stats_aggregator_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &stats_aggregator_resource));
+    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+
+    *output = new Dataset(ctx, input, stats_aggregator_resource);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
+                     StatsAggregatorResource* stats_aggregator_resource)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          stats_aggregator_resource_(stats_aggregator_resource) {
+      input_->Ref();
+      stats_aggregator_resource_->Ref();
+    }
+
+    ~Dataset() override {
+      input_->Unref();
+      stats_aggregator_resource_->Unref();
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return "SetStatsAggregatorDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(
+          "Cannot currently serialize the `stats_aggregator` for a "
+          "SetStatsAggregatorDataset.");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        StatsAggregatorResource* stats_aggregator_resource =
+            dataset()->stats_aggregator_resource_;
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        params.stats_aggregator_getter = [stats_aggregator_resource]() {
+          return stats_aggregator_resource->stats_aggregator();
+        };
+        params.lib = ctx->lib();
+        params.function_library = ctx->function_library();
+        params.allocator_getter = ctx->allocator_getter();
+        IteratorContext set_stats_aggregator_ctx(params);
+        return input_impl_->GetNext(&set_stats_aggregator_ctx, out_tensors,
+                                    end_of_sequence);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    StatsAggregatorResource* stats_aggregator_resource_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("SetStatsAggregatorDataset").Device(DEVICE_CPU),
+                        SetStatsAggregatorDatasetOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
index 5a2dd9c43dbcbf5250d4dcd4bd803ed4979999e0..33a56b2eb567a24f9586c386827588e19b04e877 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 
 #include <memory>
 
@@ -38,6 +38,11 @@ class StatsAggregatorImpl : public StatsAggregator {
     }
   }
 
+  void AddScalar(const string& name, float value) override {
+    mutex_lock l(mu_);
+    scalars_[name] = value;
+  }
+
   void EncodeToProto(Summary* out_summary) override {
     mutex_lock l(mu_);
     for (const auto& pair : histograms_) {
@@ -47,13 +52,19 @@ class StatsAggregatorImpl : public StatsAggregator {
       Summary::Value* value = out_summary->add_value();
       value->set_tag(name);
       histogram.EncodeToProto(value->mutable_histo(),
-                              true /* preserve_zero_buckets */);
+                              false /* doesn't preserve zero buckets */);
+    }
+    for (const auto& pair : scalars_) {
+      Summary::Value* value = out_summary->add_value();
+      value->set_tag(pair.first);
+      value->set_simple_value(pair.second);
     }
   }
 
  private:
   mutex mu_;
   std::unordered_map<string, histogram::Histogram> histograms_ GUARDED_BY(mu_);
+  std::unordered_map<string, float> scalars_ GUARDED_BY(mu_);
   TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorImpl);
 };
 
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 4dc1343e21faf947afc4e49539a45cdd1b38c0e9..633cd8545114e839e6ffdf9307236a79936957ad 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..241b615aca11ccdbf015bcab6634a243ba086fea
--- /dev/null
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -0,0 +1,204 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class UnbatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit UnbatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(ctx, input);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
+      input_->Ref();
+      for (const PartialTensorShape& shape : input->output_shapes()) {
+        gtl::InlinedVector<int64, 4> partial_dim_sizes;
+        for (int i = 1; i < shape.dims(); ++i) {
+          partial_dim_sizes.push_back(shape.dim_size(i));
+        }
+        shapes_.emplace_back(std::move(partial_dim_sizes));
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Unbatch")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return shapes_;
+    }
+
+    string DebugString() override { return "UnbatchDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            current_index_(0),
+            current_batch_size_(0),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            shapes_(params.dataset->output_shapes().size()) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+        *end_of_sequence = false;
+        while (!*end_of_sequence) {
+          if (current_index_ < current_batch_size_) {
+            out_tensors->clear();
+            out_tensors->reserve(tensors_.size());
+            for (int i = 0; i < tensors_.size(); ++i) {
+              out_tensors->emplace_back(ctx->allocator({}), tensors_[i].dtype(),
+                                        shapes_[i]);
+              TF_RETURN_IF_ERROR(batch_util::MaybeMoveSliceToElement(
+                  &tensors_[i], &out_tensors->back(), current_index_));
+            }
+            ++current_index_;
+            *end_of_sequence = false;
+            return Status::OK();
+          }
+          current_index_ = 0;
+          current_batch_size_ = 0;
+          tensors_.clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &tensors_, end_of_sequence));
+          if (!*end_of_sequence) {
+            for (size_t i = 0; i < tensors_.size(); ++i) {
+              if (tensors_[i].dims() == 0) {
+                return errors::InvalidArgument(
+                    "Input element must have a non-scalar value in each "
+                    "component.");
+              }
+              if (tensors_[i].dim_size(0) != tensors_[0].dim_size(0)) {
+                return errors::InvalidArgument(
+                    "Input element must have the same batch size in each "
+                    "component. Component 0 had size ",
+                    tensors_[0].dim_size(0), " but component ", i,
+                    " had size, ", tensors_[i].dim_size(0), ".");
+              }
+              shapes_[i] = tensors_[i].shape();
+              shapes_[i].RemoveDim(0);
+            }
+            current_batch_size_ = tensors_[0].dim_size(0);
+          }
+        }
+        input_impl_.reset();
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("current_index"), current_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("n"), current_batch_size_));
+        if (current_index_ < current_batch_size_) {
+          for (size_t i = 0; i < tensors_.size(); ++i) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat("tensors[", i, "]")), tensors_[i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_index"), &current_index_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("n"), &current_batch_size_));
+        tensors_.clear();
+        tensors_.resize(dataset()->output_dtypes().size());
+        if (current_index_ < current_batch_size_) {
+          for (size_t i = 0; i < tensors_.size(); ++i) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat("tensors[", i, "]")), &tensors_[i]));
+            shapes_[i] = tensors_[i].shape();
+            shapes_[i].RemoveDim(0);
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 current_index_ GUARDED_BY(mu_);
+      int64 current_batch_size_ GUARDED_BY(mu_);
+      std::vector<Tensor> tensors_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::vector<TensorShape> shapes_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnbatchDataset").Device(DEVICE_CPU),
+                        UnbatchDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..656fee1e856d213cd496ae8a6386230ad48efc3f
--- /dev/null
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/record_writer.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+namespace {
+
+class ToTFRecordOp : public AsyncOpKernel {
+ public:
+  explicit ToTFRecordOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    thread_pool_->Schedule([this, ctx, done]() {
+      string filename;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
+      string compression_type;
+      OP_REQUIRES_OK_ASYNC(ctx,
+                           ParseScalarArgument<string>(ctx, "compression_type",
+                                                       &compression_type),
+                           done);
+      std::unique_ptr<WritableFile> file;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
+                           done);
+      std::unique_ptr<io::RecordWriter> writer;
+      writer.reset(new io::RecordWriter(
+          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
+                          compression_type)));
+
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      auto iterator = dataset->MakeIterator("ToTFRecordOpIterator");
+
+      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+      std::vector<Tensor> components;
+      components.reserve(dataset->output_dtypes().size());
+      bool end_of_sequence;
+      do {
+        OP_REQUIRES_OK_ASYNC(
+            ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+            done);
+
+        if (!end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(
+              ctx, writer->WriteRecord(components[0].scalar<string>()()), done);
+        }
+        components.clear();
+      } while (!end_of_sequence);
+      done();
+    });
+  }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
+                        ToTFRecordOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index fa67545a0dad0332cce55c173fc39ba25c055902..23319e6d0c56788e875eba0720006e2843d78a9d 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -37,25 +37,37 @@ class DataFormatDimMapOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context, src_format.size() == 4,
+                errors::InvalidArgument(strings::StrCat(
+                    "Source format must of length 4, received src_format = ",
+                    src_format)));
     OP_REQUIRES(
-        context, src_format == "NHWC",
+        context, dst_format.size() == 4,
         errors::InvalidArgument(strings::StrCat(
-            "Current implementation doesn't support source data format ",
-            src_format)));
-    OP_REQUIRES(context, dst_format == "NCHW",
-                errors::InvalidArgument(strings::StrCat(
-                    "Current implementation doesn't support dst data format ",
-                    dst_format)));
+            "Destination format must of length 4, received dst_format = ",
+            dst_format)));
+    dst_idx_ = Tensor(DT_INT32, {static_cast<int64>(src_format.size())});
+    for (int i = 0; i < src_format.size(); ++i) {
+      for (int j = 0; j < dst_format.size(); ++j) {
+        if (dst_format[j] == src_format[i]) {
+          dst_idx_.vec<int>()(i) = j;
+          break;
+        }
+      }
+    }
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    Tensor* output = nullptr;
+    Tensor* output;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
     functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
-                                           input.flat<T>(), output->flat<T>());
+                                           input.flat<T>(), output->flat<T>(),
+                                           dst_idx_.vec<int>());
   }
+
+  Tensor dst_idx_;
 };
 
 template <typename Device, typename T>
@@ -67,14 +79,8 @@ class DataFormatVecPermuteOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
-    OP_REQUIRES(context,
-                (src_format == "NHWC" && dst_format == "NCHW") ||
-                    (src_format == "NCHW" && dst_format == "NHWC"),
-                errors::InvalidArgument(strings::StrCat(
-                    "Current implementation only supports NCHW-to-NHWC and "
-                    "NHWC-to-NCHW format conversion; got source format ",
-                    src_format, " and destination format ", dst_format)));
-    nhwc_to_nchw_ = (src_format == "NHWC") ? true : false;
+    src_format_ = src_format;
+    dst_format_ = dst_format;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -104,13 +110,34 @@ class DataFormatVecPermuteOp : public OpKernel {
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
-    functor::DataFormatVecPermute<Device, T>()(
-        context->eigen_device<Device>(), input.flat<T>(), output->flat<T>(),
-        nhwc_to_nchw_);
+    // Support 1D and 2D cases.
+    Eigen::DSizes<Eigen::DenseIndex, 8> dst_idx;
+    ComputeDstIndex(input.dims(), &dst_idx);
+
+    functor::DataFormatVecPermute<Device, T>()(context->eigen_device<Device>(),
+                                               input.flat<T>(),
+                                               output->flat<T>(), dst_idx);
   }
 
  private:
-  bool nhwc_to_nchw_;
+  // Finds out the destination index. Support 1D and 2D cases.
+  // Example: HWNC --> NHWC
+  // 1D: dst = [1, 2, 0, 3],
+  // 2D: dst = [2, 3, 4, 5, 0, 1, 6, 7]
+  void ComputeDstIndex(int num_dim, Eigen::DSizes<Eigen::DenseIndex, 8>* dst) {
+    for (int i = 0; i < src_format_.size(); ++i) {
+      for (int j = 0; j < dst_format_.size(); ++j) {
+        if (dst_format_[j] != src_format_[i]) continue;
+        // Found the dst index. Set output based on the number of dims.
+        for (int k = 0; k < num_dim; ++k) {
+          (*dst)[i * num_dim + k] = j * num_dim + k;
+        }
+      }
+    }
+  }
+
+  string src_format_;
+  string dst_format_;
 };
 
 #define REGISTER_KERNEL(T)                                                \
@@ -132,11 +159,11 @@ TF_CALL_int64(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                \
-  template <>                                              \
-  void DataFormatDimMap<GPUDevice, T>::operator()(         \
-      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
-      typename TTypes<T>::Flat y);                         \
+#define DECLARE_GPU_SPEC(T)                                    \
+  template <>                                                  \
+  void DataFormatDimMap<GPUDevice, T>::operator()(             \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x,     \
+      typename TTypes<T>::Flat y, const TTypes<int>::Vec dst); \
   extern template struct DataFormatDimMap<GPUDevice, T>;
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 TF_CALL_int32(DECLARE_GPU_SPECS);
@@ -147,7 +174,8 @@ TF_CALL_int64(DECLARE_GPU_SPECS);
   template <>                                              \
   void DataFormatVecPermute<GPUDevice, T>::operator()(     \
       const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
-      typename TTypes<T>::Vec y, bool nhwc_to_nchw);       \
+      typename TTypes<T>::Vec y,                           \
+      const Eigen::DSizes<Eigen::DenseIndex, 8>& dst_idx); \
   extern template struct DataFormatVecPermute<GPUDevice, T>;
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 TF_CALL_int32(DECLARE_GPU_SPECS);
@@ -167,7 +195,14 @@ TF_CALL_int64(REGISTER_GPU_KERNEL);
 #define REGISTER_GPU_KERNEL(T)                                                \
   REGISTER_KERNEL_BUILDER(                                                    \
       Name("DataFormatVecPermute").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-      DataFormatVecPermuteOp<GPUDevice, T>);
+      DataFormatVecPermuteOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("DataFormatVecPermute")                        \
+                              .Device(DEVICE_GPU)                             \
+                              .HostMemory("x")                                \
+                              .HostMemory("y")                                \
+                              .Label("host")                                  \
+                              .TypeConstraint<T>("T"),                        \
+                          DataFormatVecPermuteOp<CPUDevice, T>);
 TF_CALL_int32(REGISTER_GPU_KERNEL);
 TF_CALL_int64(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
index bf704cc35cf2ff18b38202db5d192b460b415fbb..1ca144cb400ff828d334495b57572b67f60e28ef 100644
--- a/tensorflow/core/kernels/data_format_ops.h
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -27,49 +27,31 @@ namespace functor {
 template <typename Device, typename T>
 struct DataFormatDimMap {
   void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
-                  typename TTypes<T>::Flat y) {
+                  typename TTypes<T>::Flat y, const TTypes<int>::Vec dst) {
     auto zero = x.constant(0);
     auto one = x.constant(1);
-    auto three = x.constant(3);
+    auto two = x.constant(2);
+
+    auto f_zero = x.constant(dst(0));
+    auto f_one = x.constant(dst(1));
+    auto f_two = x.constant(dst(2));
+    auto f_three = x.constant(dst(3));
+
     auto four = x.constant(4);
     auto x_mod = (x + four) % 4;
+
     auto is_zero = (x_mod == zero);
-    auto is_three = (x_mod == three);
-    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
-  }
-};
+    auto is_one = (x_mod == one);
+    auto is_two = (x_mod == two);
 
-template <typename T>
-struct VecPermuteNHWCToNCHW {
-  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
-      typename TTypes<T>::ConstFlat input) const {
-    Eigen::DSizes<Eigen::DenseIndex, 1> result;
-    result[0] = input.dimension(0);
-    return result;
-  }
-  template <typename Output, typename Device>
-  void eval(typename TTypes<T>::ConstFlat input, Output& output,
-            const Device& d) const {
-    if (input.size() == 8) {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(1);
-      output.template chip<0>(2).device(d) = input.template chip<0>(6);
-      output.template chip<0>(3).device(d) = input.template chip<0>(7);
-      output.template chip<0>(4).device(d) = input.template chip<0>(2);
-      output.template chip<0>(5).device(d) = input.template chip<0>(3);
-      output.template chip<0>(6).device(d) = input.template chip<0>(4);
-      output.template chip<0>(7).device(d) = input.template chip<0>(5);
-    } else {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(3);
-      output.template chip<0>(2).device(d) = input.template chip<0>(1);
-      output.template chip<0>(3).device(d) = input.template chip<0>(2);
-    }
+    y.device(d) = is_zero.select(
+        f_zero, is_one.select(f_one, is_two.select(f_two, f_three)));
   }
 };
 
 template <typename T>
-struct VecPermuteNCHWToNHWC {
+struct VecPermute {
+  VecPermute(const Eigen::DSizes<Eigen::DenseIndex, 8>& dst) : dst_(dst) {}
   Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
       typename TTypes<T>::ConstFlat input) const {
     Eigen::DSizes<Eigen::DenseIndex, 1> result;
@@ -79,34 +61,22 @@ struct VecPermuteNCHWToNHWC {
   template <typename Output, typename Device>
   void eval(typename TTypes<T>::ConstFlat input, Output& output,
             const Device& d) const {
-    if (input.size() == 8) {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(1);
-      output.template chip<0>(2).device(d) = input.template chip<0>(4);
-      output.template chip<0>(3).device(d) = input.template chip<0>(5);
-      output.template chip<0>(4).device(d) = input.template chip<0>(6);
-      output.template chip<0>(5).device(d) = input.template chip<0>(7);
-      output.template chip<0>(6).device(d) = input.template chip<0>(2);
-      output.template chip<0>(7).device(d) = input.template chip<0>(3);
-    } else {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(2);
-      output.template chip<0>(2).device(d) = input.template chip<0>(3);
-      output.template chip<0>(3).device(d) = input.template chip<0>(1);
+    for (int i = 0; i < input.size(); ++i) {
+      output.template chip<0>(dst_[i]).device(d) = input.template chip<0>(i);
     }
   }
+
+ private:
+  Eigen::DSizes<Eigen::DenseIndex, 8> dst_;
 };
 
 // Functor used by DataFormatVecPermuteOp to do the computations.
 template <typename Device, typename T>
 struct DataFormatVecPermute {
   void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
-                  typename TTypes<T>::Flat y, bool nhwc_to_nchw) {
-    if (nhwc_to_nchw) {
-      y.device(d) = x.customOp(VecPermuteNHWCToNCHW<T>());
-    } else {
-      y.device(d) = x.customOp(VecPermuteNCHWToNHWC<T>());
-    }
+                  typename TTypes<T>::Flat y,
+                  const Eigen::DSizes<Eigen::DenseIndex, 8>& dst) {
+    y.device(d) = x.customOp(VecPermute<T>(dst));
   }
 };
 
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 0c42f632521dd86760e791626c8978c0b1e82709..3eed847c16229f20df7495e0f17b4e5e35a64a8f 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -34,6 +34,19 @@ class DecodeCSVOp : public OpKernel {
                 errors::InvalidArgument("Out type too large"));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_quote_delim", &use_quote_delim_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("select_cols", &select_cols_));
+    OP_REQUIRES(
+        ctx, out_type_.size() == select_cols_.size() || select_cols_.empty(),
+        errors::InvalidArgument("select_cols should match output size"));
+    select_all_cols_ = select_cols_.empty();
+    for (int i = 1; i < select_cols_.size(); i++) {
+      OP_REQUIRES(ctx, select_cols_[i - 1] < select_cols_[i],
+                  errors::InvalidArgument(
+                      "select_cols should be strictly increasing indices"));
+    }
+    OP_REQUIRES(
+        ctx, select_cols_.empty() || select_cols_.front() >= 0,
+        errors::InvalidArgument("select_cols should be non-negative indices"));
     OP_REQUIRES(ctx, delim.size() == 1,
                 errors::InvalidArgument("field_delim should be only 1 char"));
     delim_ = delim[0];
@@ -183,13 +196,18 @@ class DecodeCSVOp : public OpKernel {
 
  private:
   std::vector<DataType> out_type_;
+  std::vector<int64> select_cols_;
   char delim_;
   bool use_quote_delim_;
+  bool select_all_cols_;
   string na_value_;
 
   void ExtractFields(OpKernelContext* ctx, StringPiece input,
                      std::vector<string>* result) {
     int64 current_idx = 0;
+    int64 num_fields_parsed = 0;
+    int64 selector_idx = 0;  // Keep track of index into select_cols
+
     if (!input.empty()) {
       while (static_cast<size_t>(current_idx) < input.size()) {
         if (input[current_idx] == '\n' || input[current_idx] == '\r') {
@@ -198,6 +216,10 @@ class DecodeCSVOp : public OpKernel {
         }
 
         bool quoted = false;
+        bool include =
+            (select_all_cols_ || select_cols_[selector_idx] ==
+                                     static_cast<size_t>(num_fields_parsed));
+
         if (use_quote_delim_ && input[current_idx] == '"') {
           quoted = true;
           current_idx++;
@@ -214,7 +236,7 @@ class DecodeCSVOp : public OpKernel {
                             input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
-            field += input[current_idx];
+            if (include) field += input[current_idx];
             current_idx++;
           }
 
@@ -226,14 +248,14 @@ class DecodeCSVOp : public OpKernel {
               (static_cast<size_t>(current_idx) < input.size() - 1) &&
               (input[current_idx] != '"' || input[current_idx + 1] != delim_)) {
             if (input[current_idx] != '"') {
-              field += input[current_idx];
+              if (include) field += input[current_idx];
               current_idx++;
             } else {
               OP_REQUIRES(
                   ctx, input[current_idx + 1] == '"',
                   errors::InvalidArgument("Quote inside a string has to be "
                                           "escaped by another quote"));
-              field += '"';
+              if (include) field += '"';
               current_idx += 2;
             }
           }
@@ -250,11 +272,20 @@ class DecodeCSVOp : public OpKernel {
           current_idx += 2;
         }
 
-        result->push_back(field);
+        num_fields_parsed++;
+        if (include) {
+          result->push_back(field);
+          selector_idx++;
+          if (selector_idx == select_cols_.size()) return;
+        }
       }
 
+      bool include =
+          (select_all_cols_ || select_cols_[selector_idx] ==
+                                   static_cast<size_t>(num_fields_parsed));
       // Check if the last field is missing
-      if (input[input.size() - 1] == delim_) result->push_back(string());
+      if (include && input[input.size() - 1] == delim_)
+        result->push_back(string());
     }
   }
 };
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 912d04c1536600348e8263f03709f2305607d11f..2cafa44f37a76116386f0d37d50cb66ff37f1ed8 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -41,9 +41,9 @@ enum FileFormat {
 // Classify the contents of a file based on starting bytes (the magic number).
 FileFormat ClassifyFileFormat(StringPiece data) {
   // The 4th byte of JPEG is '\xe0' or '\xe1', so check just the first three
-  if (data.starts_with("\xff\xd8\xff")) return kJpgFormat;
-  if (data.starts_with("\x89PNG\r\n\x1a\n")) return kPngFormat;
-  if (data.starts_with("\x47\x49\x46\x38")) return kGifFormat;
+  if (str_util::StartsWith(data, "\xff\xd8\xff")) return kJpgFormat;
+  if (str_util::StartsWith(data, "\x89PNG\r\n\x1a\n")) return kPngFormat;
+  if (str_util::StartsWith(data, "\x47\x49\x46\x38")) return kGifFormat;
   return kUnknownFormat;
 }
 
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24f8a4f72fd55167a61de9b8e0f8808b93584cf9
--- /dev/null
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -0,0 +1,1151 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// DecodeProto is a TensorFlow Op which extracts arbitrary fields
+// from protos serialized as strings.
+//
+// See docs in ../ops/decode_proto_op.cc.
+//
+// This implementation reads the serialized format using a handful of
+// calls from the WireFormatLite API used by generated proto code.
+// WireFormatLite is marked as an "internal" proto API but is widely
+// used in practice and highly unlikely to change.
+// This will be much faster than the previous implementation based on
+// constructing a temporary dynamic message in memory and using the
+// proto reflection api to read it.
+// It can be used with any proto whose descriptors are available at
+// runtime but should be competitive in speed with approaches that
+// compile in the proto definitions.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/decode.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::MakeUnique;
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::DynamicMessageFactory;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::Message;
+using ::tensorflow::protobuf::TextFormat;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedInputStream;
+
+const bool kFailOnDecodeError = true;
+
+// Returns true if the proto field type can be converted to the
+// tensorflow::DataType.
+bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) {
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return output_type == tensorflow::DT_FLOAT ||
+             output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_BOOL:
+      return output_type == tensorflow::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_GROUP:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_MESSAGE:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_BYTES:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_UINT32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_ENUM:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_SINT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SINT64:
+      return output_type == tensorflow::DT_INT64;
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// Used to store the default value of a protocol message field, casted to the
+// type of the output tensor.
+//
+// TODO(paskin): Use absl::variant once TensorFlow gets absl dependencies.
+struct DefaultValue {
+  DataType dtype = DataType::DT_INVALID;
+  union Value {
+    bool v_bool;           // DT_BOOL
+    uint8 v_uint8;         // DT_UINT8
+    int8 v_int8;           // DT_INT8
+    int32 v_int32;         // DT_INT32
+    int64 v_int64;         // DT_INT64
+    float v_float;         // DT_FLOAT
+    double v_double;       // DT_DOUBLE
+    const char* v_string;  // DT_STRING
+  };
+  Value value;
+};
+
+// Initializes a DefaultValue object.  This generic template handles numeric
+// types and strings are handled by a template specialization below.
+//
+// Args:
+//   dtype: the type of the output tensor
+//   value: the default value as obtained from the FieldDescriptor
+//   result: the object to initialize
+template <typename T>
+Status InitDefaultValue(DataType dtype, const T value, DefaultValue* result) {
+  result->dtype = dtype;
+  switch (dtype) {
+    case DT_BOOL:
+      result->value.v_bool = static_cast<bool>(value);
+      break;
+    case DT_INT32:
+      result->value.v_int32 = static_cast<int32>(value);
+      break;
+    case DT_INT8:
+      result->value.v_int8 = static_cast<int8>(value);
+      break;
+    case DT_UINT8:
+      result->value.v_uint8 = static_cast<uint8>(value);
+      break;
+    case DT_INT64:
+      result->value.v_int64 = static_cast<int64>(value);
+      break;
+    case DT_FLOAT:
+      result->value.v_float = static_cast<float>(value);
+      break;
+    case DT_DOUBLE:
+      result->value.v_double = static_cast<double>(value);
+      break;
+    default:
+      // We should never get here, given the type checking that occurs earlier.
+      return errors::Internal(
+          "Cannot initialize default value for unsupported type: ",
+          DataTypeString(dtype));
+  }
+  return Status::OK();
+}
+
+template <>
+Status InitDefaultValue(DataType dtype, const char* value,
+                        DefaultValue* result) {
+  // These are sanity checks that should never trigger given the code that
+  // leads here.
+  if (TF_PREDICT_FALSE(dtype != DT_STRING)) {
+    return errors::InvalidArgument(
+        "Cannot cast field to anything but DT_STRING");
+  }
+  if (TF_PREDICT_FALSE(value == nullptr)) {
+    return errors::InvalidArgument("Null default string value.");
+  }
+  result->dtype = DT_STRING;
+  result->value.v_string = value;
+  return Status::OK();
+}
+
+// Initializes a default value from the output data type and the field
+// descriptor.
+Status InitDefaultValueFromFieldDescriptor(DataType dtype,
+                                           const FieldDescriptor* field_desc,
+                                           DefaultValue* result) {
+  switch (field_desc->type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return InitDefaultValue(dtype, field_desc->default_value_double(),
+                              result);
+    case WireFormatLite::TYPE_FLOAT:
+      return InitDefaultValue(dtype, field_desc->default_value_float(), result);
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SINT64:
+    case WireFormatLite::TYPE_SFIXED64:
+      return InitDefaultValue(dtype, field_desc->default_value_int64(), result);
+    case WireFormatLite::TYPE_FIXED64:
+    case WireFormatLite::TYPE_UINT64:
+      return InitDefaultValue(dtype, field_desc->default_value_uint64(),
+                              result);
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_SINT32:
+    case WireFormatLite::TYPE_SFIXED32:
+      return InitDefaultValue(dtype, field_desc->default_value_int32(), result);
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return InitDefaultValue(dtype, field_desc->default_value_uint32(),
+                              result);
+    case WireFormatLite::TYPE_BOOL:
+      return InitDefaultValue(dtype, field_desc->default_value_bool(), result);
+    case WireFormatLite::TYPE_BYTES:
+    case WireFormatLite::TYPE_STRING:
+      // Manipulating default string values as C-style pointers should be OK
+      // for typical code-generated protocol messages.  It is possible in
+      // principle to register a message descriptor on the fly, and these
+      // pointers may not be stable if that descriptor has a weird
+      // implementation.  (But the return type of default_value_string() is
+      // const string&, so it'd have to be very weird.)
+      return InitDefaultValue(dtype, field_desc->default_value_string().c_str(),
+                              result);
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+      return InitDefaultValue(dtype, "", result);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return Status::OK();
+}
+
+// A FieldInfo holds a handful of information from the FieldDescriptor
+// and user attributes.
+struct FieldInfo {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index,
+            DefaultValue def_value)
+      : output_index(user_index), default_value(def_value) {
+    // Without this intermediate data structure, the profile had hotspots
+    // calling methods of FieldDescriptor.
+    number = field_desc->number();
+
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    type = static_cast<WireFormatLite::FieldType>(field_desc->type());
+    is_repeated = field_desc->is_repeated();
+  }
+
+  // Disable copy and move.
+  FieldInfo(const FieldInfo&) = delete;
+  FieldInfo& operator=(const FieldInfo&) = delete;
+
+  // Internally we sort field descriptors by wire number for
+  // fast lookup. In general this is different from the order
+  // given by the user. Output_index gives the index into
+  // the field_names and output_types attributes and into
+  // the output tensor list.
+  int output_index = -1;
+
+  // This is a cache of the relevant fields from `FieldDescriptorProto`.
+  // This was added after noticing that FieldDescriptor->type() was
+  // using 6% of the cpu profile.
+  WireFormatLite::FieldType type;
+  int number;
+  bool is_repeated;
+  DefaultValue default_value;
+};
+
+// A CountCollector counts sizes of repeated and optional fields in a proto.
+//
+// Each field is tracked by a single CountCollector instance. The
+// instance manages a single count, which is stored as a pointer (it
+// is intended to be a reference to the `sizes` output which is being
+// filled in). The pointer is passed in at initialization.
+//
+// Counting is done as a separate pass in order to allocate output tensors
+// all at once. This allows the TensorFlow runtime to optimize allocation
+// for the consumer, while removing the need for copying inside this op.
+// After this pass, the DenseCollector class (below) gathers the data:
+// It is more complex and provides better motivation for the API here.
+class CountCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  CountCollector() = default;
+
+  // The count may be stored inside an Eigen Tensor to eliminate copying.
+  explicit CountCollector(int32* count) : count_ptr_(count) {}
+
+  // Reads (in this case counts) a single value.
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // Only repeated fields can have count > 1.
+    if (*count_ptr_ == 0 || field.is_repeated) {
+      (*count_ptr_)++;
+    }
+    // We expect a wire type based on the schema field_type, to allow
+    // a little more checking.
+    if (!SkipValue(input, field)) {
+      return errors::DataLoss("ReadValue: Failed skipping field when counting");
+    }
+    return Status::OK();
+  }
+
+  // Reads (in this case counts) a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          size_t buf_size) {
+    if (buf_size == 0) {
+      return Status::OK();
+    }
+
+    const void* tmpbuf;
+    int unused_max_buf_size;
+
+    input->GetDirectBufferPointerInline(&tmpbuf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    const uint8* buf = reinterpret_cast<const uint8*>(tmpbuf);
+
+    // Important: we skipped the input->{Push,Pop}Limit() calls for speed,
+    // so the bounds check on buf_size inside Skip() is critical, and
+    // must be done before scanning the contents.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss("ReadPackedValues: Skipping packed field failed");
+    }
+
+    // Dispatch to the appropriately typed field reader based on the
+    // schema type.
+    Status st;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        st = CountPackedFixed<double>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FLOAT:
+        st = CountPackedFixed<float>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_UINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED64:
+        st = CountPackedFixed<uint64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED32:
+        st = CountPackedFixed<uint32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_BOOL:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_STRING:
+        st = errors::DataLoss("TYPE_STRING encountered as packed");
+        break;
+      case WireFormatLite::TYPE_GROUP:
+        st = errors::DataLoss("TYPE_GROUP encountered as packed");
+        break;
+      case WireFormatLite::TYPE_MESSAGE:
+        st = errors::DataLoss("TYPE_MESSAGE encountered as packed");
+        break;
+      case WireFormatLite::TYPE_BYTES:
+        st = errors::DataLoss("TYPE_BYTES encountered as packed");
+        break;
+      case WireFormatLite::TYPE_UINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_ENUM:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED32:
+        st = CountPackedFixed<int32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED64:
+        st = CountPackedFixed<int64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+        // default: intentionally omitted in order to enable static checking.
+    }
+    if (!st.ok()) {
+      return st;
+    }
+
+    if (!field.is_repeated && *count_ptr_ > 1) {
+      *count_ptr_ = 1;
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Skips a length-delimited value.
+  static bool SkipBytes(CodedInputStream* input) {
+    uint32 length;
+    if (!input->ReadVarint32(&length)) {
+      return false;
+    }
+    return input->Skip(length);
+  }
+
+  // Counts the number of packed varints in an array.
+  // The end of a varint is signaled by a value < 0x80,
+  // so counting them requires parsing the bytestream.
+  // It is the caller's responsibility to ensure that len > 0.
+  Status CountPackedVarint(const uint8* buf, size_t len) {
+    const uint8* bound = buf + len;
+    int count;
+
+    // The last byte in a valid encoded varint is guaranteed to have
+    // the high bit unset. We rely on this property to prevent
+    // ReadVarint64FromArray from going out of bounds, so validate
+    // the end of the buf before scanning anything.
+    if (bound[-1] & 0x80) {
+      return errors::DataLoss("Corrupt packed varint");
+    }
+
+    // Now we can trust ReadVarint64FromArray to stay in bounds.
+    for (count = 0; buf < bound; ++count) {
+      uint64 temp;
+      bool ok;
+      buf = internal::ReadVarint64FromArray(buf, &ok, &temp);
+      if (!ok) {
+        return errors::DataLoss("Corrupt packed varint");
+      }
+    }
+
+    *count_ptr_ += count;
+    return Status::OK();
+  }
+
+  // Counts the number of fixed-size values in a packed field.
+  // This can be done without actually parsing anything.
+  template <typename T>
+  Status CountPackedFixed(const uint8* unused_buf, size_t len) {
+    int count = len / sizeof(T);
+    if (count * sizeof(T) != len) {
+      return errors::DataLoss(
+          "Illegal data length for packed fixed-size type: ", len);
+    }
+    *count_ptr_ += len / sizeof(T);
+    return Status::OK();
+  }
+
+  // Skips a single value in the input stream.
+  // Dispatches to the appropriately typed field skipper based on the
+  // schema type tag.
+  // This is not as permissive as just handling the wire type.
+  static bool SkipValue(CodedInputStream* input, const FieldInfo& field) {
+    uint32 tmp32;
+    protobuf_uint64 tmp64;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FLOAT:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_INT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_UINT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_INT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_FIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_BOOL:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_STRING:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_GROUP:
+        return WireFormatLite::SkipField(
+            input, WireFormatLite::MakeTag(
+                       field.number, WireFormatLite::WIRETYPE_START_GROUP));
+      case WireFormatLite::TYPE_MESSAGE:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_BYTES:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_UINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_ENUM:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_SINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SINT64:
+        return input->ReadVarint64(&tmp64);
+        // default: intentionally omitted in order to enable static checking.
+    }
+  }
+
+  int32* count_ptr_ = nullptr;
+};
+
+// A DenseCollector accumulates values from a proto into a tensor.
+//
+// There is an instance of DenseCollector for each field of each
+// proto. The DenseCollector deserializes the value from the wire
+// directly into the preallocated output Tensor.
+//
+// This class is named DenseCollector because in the future there should
+// be a SparseCollector that accumulates field data into sparse tensors if
+// the user requests it.
+class DenseCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  DenseCollector() = default;
+
+  // A DenseCollector applies to one field of a serialized message.
+  // Note that default_value.dtype is the type of the output tensor.
+  DenseCollector(uint8* datap, DefaultValue default_value, int max_repeat_count)
+      : datap_(datap),
+        default_value_(default_value),
+        max_repeat_count_(max_repeat_count) {}
+
+  // Reads a value from the input stream and stores it.
+  //
+  // Always inlining gave a ~50% speedup on microbenchmarks at one point.
+  // TODO(nix): try removing it to see if that still holds.
+  // TODO(jsimsa): ABSL_ATTRIBUTE_ALWAYS_INLINE
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // For required and optional fields, we overwrite values[0] with
+    // the latest one in the wire stream.
+    // See https://developers.google.com/protocol-buffers/docs/encoding#optional
+    // Only for repeated fields do we advance the next_repeat_index_ past 1.
+    // TODO(nix): to handle oneof we must also zero out any previous values
+    //  seen on the wire.
+    int32 index = 0;
+    if (field.is_repeated) {
+      index = next_repeat_index_;
+    }
+    next_repeat_index_ = index + 1;
+
+    return internal::ReadValue(input, field.type, field.number,
+                               default_value_.dtype, index, datap_);
+  }
+
+  // Reads and stores a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          const size_t buf_size) {
+    const void* buf;
+    int unused_max_buf_size;
+    input->GetDirectBufferPointerInline(&buf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss(
+          "ReadPackedValues: Skipping packed field failed.  Field tag: ",
+          field.number);
+    }
+
+    // Setting stride=0 causes new values to overwrite old ones for
+    // non-repeated fields.
+    const int stride = field.is_repeated ? 1 : 0;
+
+    if (next_repeat_index_ >= max_repeat_count_) {
+      return errors::DataLoss(
+          "ReadPackedValues: Tried to write more entries than allowed.  "
+          "Field tag: ",
+          field.number, ", Max entries allowed: ", max_repeat_count_);
+    } else {
+      return internal::ReadPackedFromArray(buf, buf_size, field.type,
+                                           field.number, default_value_.dtype,
+                                           stride, &next_repeat_index_, datap_);
+    }
+  }
+
+  // Fills in any missing values in the output array with defaults.
+  // Dispatches to the appropriately typed field default based on the
+  // runtime type tag.
+  Status FillWithDefaults() {
+    switch (default_value_.dtype) {
+      case DataType::DT_FLOAT:
+        return FillDefault<float>(default_value_.value.v_float);
+      case DataType::DT_DOUBLE:
+        return FillDefault<double>(default_value_.value.v_double);
+      case DataType::DT_INT32:
+        return FillDefault<int32>(default_value_.value.v_int32);
+      case DataType::DT_UINT8:
+        return FillDefault<uint8>(default_value_.value.v_uint8);
+      case DataType::DT_INT8:
+        return FillDefault<int8>(default_value_.value.v_int8);
+      case DataType::DT_STRING:
+        return FillDefault<string>(default_value_.value.v_string);
+      case DataType::DT_INT64:
+        return FillDefault<int64>(default_value_.value.v_int64);
+      case DataType::DT_BOOL:
+        return FillDefault<bool>(default_value_.value.v_bool);
+      default:
+        // There are many tensorflow dtypes not handled here, but they
+        // should not come up unless type casting is added to the Op.
+        // Chaining with tf.cast() should do the right thing until then.
+        return errors::DataLoss(
+            "Failed filling defaults in unknown tf::DataType");
+    }
+  }
+
+ private:
+  // Fills empty values in the dense representation with a
+  // default value. This uses next_repeat_index_ which counts the number
+  // of parsed values for the field.
+  template <class T>
+  Status FillDefault(const T& default_value) {
+    for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
+      reinterpret_cast<T*>(datap_)[i] = default_value;
+    }
+    return Status::OK();
+  }
+
+  int32 next_repeat_index_ = 0;
+
+  // This is a pointer to data_[message_index_].
+  // There is no bounds checking at this level: we computed the max
+  // repeat size for each field in CountCollector and use the same
+  // code to traverse it here, so we are guaranteed not to be called
+  // for more items than we have allocated space.
+  void* const datap_ = nullptr;
+
+  const DefaultValue default_value_;
+  const int max_repeat_count_ = 0;
+};
+
+class DecodeProtoOp : public OpKernel {
+ public:
+  explicit DecodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    std::vector<string> field_names;
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names));
+    std::vector<DataType> output_types;
+    OP_REQUIRES_OK(context, context->GetAttr("output_types", &output_types));
+    OP_REQUIRES(
+        context, field_names.size() == output_types.size(),
+        errors::InvalidArgument("field_names and output_types attributes must "
+                                "have the same length"));
+
+    // Gather the field descriptors and check that requested output types match.
+
+    int field_index = 0;
+    std::vector<const FieldDescriptor*> field_descs;
+    for (const string& name : field_names) {
+      auto fd = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, fd != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+      OP_REQUIRES(context,
+                  CheckOutputType(fd->type(), output_types[field_index]),
+                  // Many TensorFlow types don't have corresponding proto types
+                  // and the user will get an error if they are requested. It
+                  // would be nice to allow conversions here, but tf.cast
+                  // already exists so we don't duplicate the functionality.
+                  // Known unhandled types:
+                  //   DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+                  //   DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+                  errors::InvalidArgument("Unexpected output type for ",
+                                          fd->full_name(), ": ", fd->cpp_type(),
+                                          " to ", output_types[field_index]));
+
+      field_index++;
+      field_descs.push_back(fd);
+    }
+
+    // Internally we want the field_descs sorted by their number on the wire.
+    // But the output tensors are allocated in the order given by the caller.
+    // Build a mapping i->j, where field_descs[i] corresponds to outputs[j].
+    std::vector<int> output_indices;
+    output_indices.reserve(field_names.size());
+    for (int i = 0; i < field_names.size(); i++) {
+      output_indices.push_back(i);
+    }
+    std::sort(output_indices.begin(), output_indices.end(),
+              [field_descs](int a, int b) {
+                return field_descs[a]->number() < field_descs[b]->number();
+              });
+
+    // Now store the fields in sorted order.
+    for (int i = 0; i < field_names.size(); i++) {
+      const int output_index = output_indices[i];
+      const DataType dtype = output_types[output_index];
+      const FieldDescriptor* field_descriptor = field_descs[output_index];
+      DefaultValue default_value;
+      OP_REQUIRES_OK(context, InitDefaultValueFromFieldDescriptor(
+                                  dtype, field_descriptor, &default_value));
+      fields_.push_back(
+          MakeUnique<FieldInfo>(field_descriptor, output_index, default_value));
+    }
+
+    message_prototype_ = message_factory_.GetPrototype(message_desc);
+    OP_REQUIRES(context, message_prototype_ != nullptr,
+                errors::InvalidArgument("Couldn't get prototype message: ",
+                                        message_desc->full_name()));
+    string format;
+    OP_REQUIRES_OK(context, context->GetAttr("message_format", &format));
+    OP_REQUIRES(
+        context, format == "binary" || format == "text",
+        errors::InvalidArgument("format must be one of binary or text"));
+    is_binary_ = format == "binary";
+
+    // Enable the initial protobuf sanitizer, which is much
+    // more expensive than the decoder.
+    // TODO(nix): Remove this once the fast decoder
+    // has passed security review.
+    OP_REQUIRES_OK(context, context->GetAttr("sanitize", &sanitize_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& buf_tensor = ctx->input(0);
+    int message_count = buf_tensor.NumElements();
+    OP_REQUIRES(ctx, message_count >= 1,
+                errors::InvalidArgument(
+                    "Bufs argument must contain at least one value"));
+
+    int field_count = fields_.size();
+
+    // Save the argument shape for later, then flatten the input
+    // Tensor since we are working componentwise. We will restore
+    // the same shape in the returned Tensor.
+    const TensorShape& shape_prefix = buf_tensor.shape();
+
+    TensorShape sizes_shape = shape_prefix;
+    sizes_shape.AddDim(field_count);
+    Tensor* sizes_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, sizes_shape, &sizes_tensor));
+
+    // This is used to allocate binary bufs if used. It serves only
+    // to define memory ownership.
+    std::vector<string> tmp_binary_bufs(message_count);
+
+    // These are the actual buffers to use, which may be in tmp_binary_bufs
+    // or may be pointers into the buf_tensor. Either way they are not owned
+    // here.
+    std::vector<const string*> bufs;
+
+    if (is_binary_ && !sanitize_) {
+      // Fast path.
+      for (int mi = 0; mi < message_count; ++mi) {
+        const string* buf = &buf_tensor.flat<string>()(mi);
+        bufs.push_back(buf);
+      }
+    } else {
+      // We will have to allocate a copy, either to convert from text to
+      // binary or to sanitize a binary proto.
+      for (int mi = 0; mi < message_count; ++mi) {
+        ReserializeMessage(ctx, buf_tensor.flat<string>()(mi),
+                           &tmp_binary_bufs[mi]);
+        if (!ctx->status().ok()) {
+          return;
+        }
+        bufs.push_back(&tmp_binary_bufs[mi]);
+      }
+    }
+
+    // Walk through all the strings in the input tensor, counting
+    // the number of fields in each.
+    // We can't allocate our actual output Tensor until we know the
+    // maximum repeat count, so we do a first pass through the serialized
+    // proto just counting fields.
+    // We always allocate at least one value so that optional fields
+    // are populated with default values - this avoids a TF
+    // conditional when handling the output data.
+    // The caller can distinguish between real data and defaults
+    // using the repeat count matrix that is returned by decode_proto.
+    std::vector<int32> max_sizes(field_count, 1);
+    for (int mi = 0; mi < message_count; ++mi) {
+      CountFields(ctx, mi, *bufs[mi], sizes_tensor, &max_sizes);
+      if (!ctx->status().ok()) {
+        return;
+      }
+    }
+
+    // Allocate the output tensors now that we've seen the max size.
+    // TODO(nix): Use allocate_output_or_forward_input for the largest
+    //   output tensor. This can avoid one large allocation by re-using
+    //   the memory of the input tensor.
+    std::vector<Tensor*> outputs(field_count);
+    for (int fi = 0; fi < field_count; ++fi) {
+      TensorShape flat_shape = {static_cast<int64>(message_count),
+                                max_sizes[fi]};
+      TensorShape out_shape = shape_prefix;
+      out_shape.AddDim(max_sizes[fi]);
+
+      // Surprisingly we don't specify the types from the output_types
+      // attribute: that is done for us based on the Op declaration:
+      //  REGISTER_OP(...)
+      //    .Attr("output_types: list(type) >= 0")
+      //    .Output("values: output_types")
+      OP_REQUIRES_OK(ctx,
+                     // ctx->allocate_output(output_indices_[fi] + 1,
+                     ctx->allocate_output(fields_[fi]->output_index + 1,
+                                          out_shape, &outputs[fi]));
+    }
+
+    // Make the second pass through the serialized proto, decoding
+    // into preallocated tensors.
+    AccumulateFields(ctx, bufs, outputs);
+  }
+
+ private:
+  // Copy a serialized message to binary, e.g. to handle text proto inputs.
+  void ReserializeMessage(OpKernelContext* ctx, const string& buf,
+                          string* binary_buf) {
+    // Handle text protos by translating them to binary.
+    std::unique_ptr<Message> message(message_prototype_->New());
+    OP_REQUIRES(ctx, message, errors::DataLoss("Initializing message failed"));
+
+    if (is_binary_) {
+      // If we get here we are sanitizing the input protobuf by parsing
+      // and reserializing it with a trusted (but very slow) library.
+      OP_REQUIRES(ctx, message->ParseFromString(buf),
+                  errors::DataLoss("Unable to parse binary protobuf"));
+    } else {
+      OP_REQUIRES(ctx, TextFormat::ParseFromString(buf, message.get()),
+                  errors::DataLoss("Unable to parse text protobuf"));
+    }
+
+    OP_REQUIRES(ctx, message->SerializeToString(binary_buf),
+                errors::DataLoss("Unable to reserialize text proto as binary"));
+  }
+
+  // Count the number of occurrences of each requested field in a message batch.
+  void CountFields(OpKernelContext* ctx, int message_index, const string& buf,
+                   Tensor* sizes_tensor, std::vector<int32>* max_sizes) {
+    int field_count = fields_.size();
+
+    CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                           buf.size());
+
+    std::vector<int32> field_sizes(field_count, 0);
+    std::vector<CountCollector> counters;
+    counters.reserve(field_count);
+    for (int i = 0; i < field_count; i++) {
+      counters.emplace_back(&field_sizes[i]);
+    }
+
+    Status st = Collect(&input, &counters);
+    if (st.ok() && !input.ConsumedEntireMessage()) {
+      st = errors::DataLoss("CountFields: Failed to consume entire buffer");
+    }
+    if (kFailOnDecodeError) {
+      OP_REQUIRES_OK(ctx, st);  // NOLINT
+    }
+    if (!st.ok()) {
+      // This code suppresses the corrupt proto, treating it as empty
+      // to avoid crashing the process.
+      LOG(WARNING) << "Proto counting error for message type " << message_type_
+                   << ": " << st;
+
+      for (int fi = 0; fi < field_count; fi++) {
+        field_sizes[fi] = 0;
+      }
+      // Finished decoding this message.
+      return;
+    }
+
+    // Update the size tensor and max repeat size for each field.
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+    for (int fi = 0; fi < field_count; fi++) {
+      int32 size = field_sizes[fi];
+      sizes(message_index, fields_[fi]->output_index) = size;
+      if ((*max_sizes)[fi] < size) {
+        (*max_sizes)[fi] = size;
+      }
+    }
+  }
+
+  // Parse fields from a serialized message into preallocated tensors.
+  void AccumulateFields(OpKernelContext* ctx,
+                        const std::vector<const string*>& bufs,
+                        std::vector<Tensor*> outputs) {
+    struct TensorInfo {
+      explicit TensorInfo(Tensor* tensor) {
+        // Note that we can decode only max_repeat_count values before overflow.
+        // No other bounds checking is done for repeated fields. For
+        // optional fields there is a check to make sure that only the last
+        // value on the wire appears in the output tensor.
+        dtype = tensor->dtype();
+        last_dim_size = tensor->dim_size(tensor->dims() - 1);
+
+        if (dtype != DT_STRING) {
+          const int element_size = DataTypeSize(dtype);
+          CHECK_GT(element_size, 0);
+          stride = last_dim_size * element_size;
+
+          const int64 flatshape[1] = {tensor->NumElements() * element_size};
+          data = tensor->bit_casted_shaped<uint8, 1>(flatshape).data();
+        } else {
+          // DataTypeSize() returns 0 for string types.
+          stride = last_dim_size * sizeof(string);
+          data = reinterpret_cast<uint8*>(tensor->flat<string>().data());
+        }
+      }
+
+      DataType dtype;
+      int last_dim_size;
+      int stride;
+      uint8* data;
+    };
+
+    int field_count = fields_.size();
+
+    std::vector<TensorInfo> tensors;
+    tensors.reserve(field_count);
+    for (int fi = 0; fi < field_count; fi++) {
+      tensors.emplace_back(outputs[fi]);
+    }
+
+    for (int message_index = 0; message_index < bufs.size(); ++message_index) {
+      const string& buf = *bufs[message_index];
+
+      std::vector<DenseCollector> collectors;
+      collectors.reserve(field_count);
+      for (int output_index = 0; output_index < field_count; ++output_index) {
+        const TensorInfo& info = tensors[output_index];
+        const FieldInfo* field_info = fields_[output_index].get();
+        DCHECK(field_info != nullptr);
+        const DefaultValue default_value = field_info->default_value;
+        collectors.emplace_back(info.data + message_index * info.stride,
+                                default_value, info.last_dim_size);
+      }
+
+      // Fill in output tensors from the wire.
+      CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                             buf.size());
+      Status st = Collect(&input, &collectors);
+      if (st.ok() && !input.ConsumedEntireMessage()) {
+        st = errors::DataLoss(
+            "AccumulateFields: Failed to consume entire buffer");
+      }
+      if (kFailOnDecodeError) {
+        OP_REQUIRES_OK(ctx, st);  // NOLINT
+      }
+      if (!st.ok()) {
+        // This code suppresses the corrupt proto, treating it as empty
+        // to avoid crashing training.
+        LOG(WARNING) << "Proto counting error for message type "
+                     << message_type_ << ": " << st;
+      }
+
+      // Fill the remainder of the dense outputs with default values.
+      for (auto& collector : collectors) {
+        OP_REQUIRES_OK(ctx, collector.FillWithDefaults());
+      }
+    }
+  }
+
+  // Look up the FieldDescriptor for a particular field number.
+  bool LookupField(int field_number, int* field_index) {
+    // Look up the FieldDescriptor using linear search.
+    // TODO(nix): this could be sped up with binary search, but we are
+    // already way off the fastpath at this point. If you see a hotspot
+    // here, somebody is sending you very inefficient protos.
+    for (int fi = fields_.size() - 1; fi >= 0; fi--) {
+      if (field_number == fields_[fi]->number) {
+        *field_index = fi;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Traverses a serialized protobuf, dispatching values to the collectors.
+  template <class CollectorClass>
+  Status Collect(CodedInputStream* input,
+                 std::vector<CollectorClass>* collectors) {
+    int last_good_field_index = -1;
+    bool fields_disordered = false;
+    int prev_field_number = -1;
+    int field_number = -1;
+    int last_good_field_number = -1;
+    int next_good_field_number = fields_[0]->number;
+
+    // The 'tag' variable should always be treated as tainted.
+    for (uint32 tag = input->ReadTag();
+         tag != 0 && WireFormatLite::GetTagWireType(tag) !=
+                         WireFormatLite::WIRETYPE_END_GROUP;
+         tag = input->ReadTag(), prev_field_number = field_number) {
+      field_number = WireFormatLite::GetTagFieldNumber(tag);
+      const FieldInfo* field = nullptr;
+
+      // This takes advantage of the sorted field numbers in most serialized
+      // protos: it tries the next expected field first rather than doing
+      // a lookup by field number.
+      // TODO(nix): haberman@ suggests a hybrid approach with a lookup table
+      // for small field numbers and a hash table for larger ones. This would
+      // be a simpler approach that should offer comparable speed in most
+      // cases.
+      if (field_number == last_good_field_number) {
+        field = fields_[last_good_field_index].get();
+      } else {
+        if (field_number < prev_field_number) {
+          fields_disordered = true;
+        }
+
+        // If fields are out of order, fall back to slow lookup.
+        if (fields_disordered) {
+          int field_index;
+          if (LookupField(field_number, &field_index)) {
+            field = fields_[field_index].get();
+            last_good_field_index = field_index;
+          }
+        } else {
+          // If we see a field that is past the next field we want,
+          // it was empty. Look for the one after that.
+          // Repeat until we run out of fields that we care about.
+          while (field_number >= next_good_field_number) {
+            if (field_number == next_good_field_number) {
+              last_good_field_number = field_number;
+              field = fields_[last_good_field_index + 1].get();
+            }
+
+            // Start looking for the field after the current one.
+            ++last_good_field_index;
+            if (last_good_field_index < fields_.size() - 1) {
+              next_good_field_number =
+                  fields_[last_good_field_index + 1]->number;
+            } else {
+              // Saw something past the last field we care about.
+              // Continue parsing the message just in case there
+              // are disordered fields later, but any remaining
+              // ordered fields will have no effect.
+              next_good_field_number = INT_MAX;
+            }
+          }
+        }
+      }
+
+      if (!field) {
+        // Unknown and unrequested fields are skipped.
+        if (!WireFormatLite::SkipField(input, tag)) {
+          return errors::DataLoss("Failed skipping unrequested field");
+        }
+        continue;
+      }
+
+      Status st = CollectField(*field, WireFormatLite::GetTagWireType(tag),
+                               input, &(*collectors)[last_good_field_index]);
+      if (!st.ok()) {
+        return st;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Collects values for a single field.
+  template <class CollectorClass>
+  Status CollectField(const FieldInfo& field,
+                      WireFormatLite::WireType wire_type,
+                      CodedInputStream* input, CollectorClass* collector) {
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    WireFormatLite::WireType schema_wire_type =
+        WireFormatLite::WireTypeForFieldType(field.type);
+
+    // Handle packed repeated fields. SkipField would skip the
+    // whole length-delimited blob without letting us count the
+    // values, so we have to scan them ourselves.
+    if (wire_type == WireFormatLite::WIRETYPE_LENGTH_DELIMITED &&
+        schema_wire_type != WireFormatLite::WIRETYPE_LENGTH_DELIMITED) {
+      // Handle packed repeated primitives.
+      int length;
+      if (!input->ReadVarintSizeAsInt(&length)) {
+        return errors::DataLoss("CollectField: Failed reading packed size");
+      }
+      Status st = collector->ReadPackedValues(input, field, length);
+      if (!st.ok()) {
+        return st;
+      }
+      return Status::OK();
+    }
+
+    // Read ordinary values, including strings, bytes, and messages.
+    if (wire_type != schema_wire_type) {
+      if (!WireFormatLite::SkipField(
+              input, WireFormatLite::MakeTag(field.number, wire_type))) {
+        return errors::DataLoss(
+            "CollectField: Failed skipping malformed field");
+      }
+      return Status::OK();
+    }
+    return collector->ReadValue(input, field);
+  }
+
+  string message_type_;
+  // Note that fields are sorted by increasing field number,
+  // which is not in general the order given by the user-specified
+  // field_names and output_types Op attributes.
+  std::vector<std::unique_ptr<const FieldInfo>> fields_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+  DynamicMessageFactory message_factory_;
+  const Message* message_prototype_;
+
+  // True if decoding binary format, false if decoding text format.
+  bool is_binary_;
+
+  // True if the protos should be sanitized before parsing.
+  // Enables the initial protobuf sanitizer, which is much
+  // more expensive than the decoder. The flag defaults to true
+  // but can be set to false for trusted sources.
+  // TODO(nix): flip the default to false when the fast decoder
+  // has passed security review.
+  bool sanitize_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DecodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeProtoV2").Device(DEVICE_CPU),
+                        DecodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index bacacb94ae4384151bc4282960dd810cbf1299a0..eaef5a6097ff5a7235caba37edf6ef94d5860931 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc
index a878fe9a97059e2d55164600923c4a2e1312161b..3ed3794e01d63d49e5be0406e3f892bfbec2c8c8 100644
--- a/tensorflow/core/kernels/dense_update_functor.cc
+++ b/tensorflow/core/kernels/dense_update_functor.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/dense_update_functor.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -70,4 +71,59 @@ struct DenseUpdate<CPUDevice, string, ASSIGN> {
 
 }  // namespace functor
 
+#define CPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+
+#define INSTANTIATE_GET_VARIANT_COPY_FN(DEVICE, TYPE_CALLER, TYPE_DENSE_COPY) \
+  template <>                                                                 \
+  Status VariantCopyFn<DEVICE>(OpKernelContext * context, const Tensor& from, \
+                               Tensor* to) {                                  \
+    PersistentTensor tmp;                                                     \
+    Tensor* tensor;                                                           \
+    AllocatorAttributes attr;                                                 \
+    attr.set_gpu_compatible(true);                                            \
+    attr.set_nic_compatible(true);                                            \
+    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
+        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
+    switch (from.dtype()) {                                                   \
+      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
+      default:                                                                \
+        return errors::InvalidArgument(                                       \
+            "VariantCopyFn: Could not perform a deep copy of variant "        \
+            "element of type: ",                                              \
+            DataTypeString(from.dtype()),                                     \
+            " using device: ", context->device()->name());                    \
+    }                                                                         \
+    *to = *tensor;                                                            \
+    return Status::OK();                                                      \
+  }
+
+INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
+
+#if GOOGLE_CUDA
+#define GPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
+  TF_CALL_GPU_ALL_TYPES(T);                 \
+  TF_CALL_int32(T);                         \
+  TF_CALL_int64(T);
+INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
+                                GPU_DENSE_COPY);
+#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
+#undef GPU_DENSE_COPY
+#endif  // GOOGLE_CUDA
+
+#undef CPU_DENSE_COPY
+#undef INSTANTIATE_GET_VARIANT_COPY_FN
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.h b/tensorflow/core/kernels/dense_update_functor.h
index 4aefe26c545ee5eaf3868b73cd9ace38fd135f53..240c13261eaf1da256a326329c8eb72cce2cbcab 100644
--- a/tensorflow/core/kernels/dense_update_functor.h
+++ b/tensorflow/core/kernels/dense_update_functor.h
@@ -19,11 +19,14 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
@@ -89,6 +92,17 @@ struct DenseUpdate<SYCLDevice, T, ASSIGN> {
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
+
+template <typename Device>
+Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
+
+template <>
+Status VariantCopyFn<CPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+template <>
+Status VariantCopyFn<GPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/depthtospace_op.cc b/tensorflow/core/kernels/depthtospace_op.cc
index 39aa3e9eb0772076b25922520995174fc45efc4e..b74a09e2cb8b83d791d7ae5e9bbecaa07ead7266 100644
--- a/tensorflow/core/kernels/depthtospace_op.cc
+++ b/tensorflow/core/kernels/depthtospace_op.cc
@@ -187,6 +187,9 @@ TF_CALL_ALL_TYPES(REGISTER);
 REGISTER_KERNEL_BUILDER(
     Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     DepthToSpaceOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    DepthToSpaceOp<GPUDevice, Eigen::half>);
 REGISTER_KERNEL_BUILDER(
     Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
     DepthToSpaceOp<GPUDevice, qint8>);
diff --git a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
index 7a66285383368bb28dd3d0cd2fc6ff360eb82f5b..0656081177e8673bdc8e603a832d96a8884bff45 100644
--- a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
@@ -158,6 +158,9 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NHWC> {
 
     const int total_count =
         batch_size * output_height * output_width * output_depth;
+    if (total_count == 0) {
+      return;
+    }
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
     D2S_NHWC<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
         config.virtual_thread_count, input.data(), block_size, batch_size,
@@ -188,6 +191,9 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
       const int output_width = output.dimension(3);
       const int output_depth_by_input_area = output_depth * input_area;
       const int total_count = batch_size * output_depth_by_input_area;
+      if (total_count == 0) {
+        return;
+      }
       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
       switch (block_size) {
         case 2:
@@ -213,6 +219,9 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
 
     // Other block sizes are processed by the generic kernel.
     const int total_count = batch_size * input_depth_by_input_area;
+    if (total_count == 0) {
+      return;
+    }
     auto config = GetCudaLaunchConfig(total_count, d);
     D2S_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
         config.virtual_thread_count, input.data(), block_size, input_width,
@@ -229,6 +238,12 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
 template struct functor::DepthToSpaceOpFunctor<GPUDevice, float, FORMAT_NCHW>;
 template struct functor::DepthToSpaceOpFunctor<GPUDevice, float, FORMAT_NHWC>;
 
+// Instantiate the GPU implementations for Eigen::half.
+template struct functor::DepthToSpaceOpFunctor<GPUDevice, Eigen::half,
+                                               FORMAT_NCHW>;
+template struct functor::DepthToSpaceOpFunctor<GPUDevice, Eigen::half,
+                                               FORMAT_NHWC>;
+
 // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
 template struct functor::DepthToSpaceOpFunctor<GPUDevice, int32, FORMAT_NCHW>;
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index c060b2e14d2f03f990af5267260bd88fa01a2c81..6dedb1a61ef47ccc1fa902e7f69ea21db3392f39 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -241,7 +241,7 @@ struct LaunchDepthwiseConvOp<CPUDevice, T> {
 };
 
 // Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<CPUDevice, float>;
+extern template struct LaunchConv2DOp<CPUDevice, float>;
 
 #if GOOGLE_CUDA
 
@@ -251,7 +251,7 @@ extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
 
 // Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<GPUDevice, float>;
+extern template struct LaunchConv2DOp<GPUDevice, float>;
 
 #endif
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 94989089ec9cdf9314860b43f67691f39f33c31f..0abd64030fbaecb4f04810b0cfdbe1ddfe47f439 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -1708,8 +1708,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
   // Initialize the results to 0.
   int num_filter_backprop =
       args.filter_rows * args.filter_cols * args.out_depth;
-  perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop,
-                                                      num_filter_backprop);
+  se::DeviceMemoryBase filter_bp_ptr(filter_backprop, num_filter_backprop);
   stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T));
 
   if (args.filter_rows == 3 && args.filter_cols == 3) {
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 9dfeccff0e8d2488fec5a1dc7b93f83d2cfedca5..862a97723fd644c38895ae3c194deb5307844303 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -285,8 +285,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         c->allocate_temp(partition_count.dtype(), partition_count.shape(),
                          &cpu_tensor, alloc_attr),
         done);
-    perftools::gputools::DeviceMemoryBase wrapped(
-        partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
+    se::DeviceMemoryBase wrapped(partition_count.flat<int32>().data(),
+                                 num_partitions_ * sizeof(int32));
     const bool status =
         stream
             ->ThenMemcpy(cpu_tensor.flat<int32>().data(), wrapped,
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
index 9a7ed0af217b1c31fa14917f10128bb229b18dfd..17eb4e24b71043e10076731a48034e9e6289abe3 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -153,8 +154,8 @@ TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "partitions[2] = 99 is not in [0, 4)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index f018499f6cc1a1ba77f8482a7e0ef424a47203ef..b01db91720ae7077d2d5795ba413b9c8067aa903 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -326,6 +326,7 @@ struct ParallelDynamicStitchOpCPU : DynamicStitchOpImplCPU<T, true> {
                           ParallelDynamicStitchOpCPU<type>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH);
+TF_CALL_variant(REGISTER_DYNAMIC_STITCH);
 #undef REGISTER_DYNAMIC_STITCH
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
index 6775893ce636786218ae03b72085e9e7926b38d1..7fa6e320f5732cac68376b522e99b1d04bb9c3ac 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -88,9 +89,9 @@ TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [5] does not start with "
-                            "indices[1].shape = [1,5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [5] does not start with indices[1].shape = [1,5]"))
       << s;
 }
 
@@ -103,9 +104,9 @@ TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [1,5] does not start with "
-                            "indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [1,5] does not start with indices[1].shape = [5]"))
       << s;
 }
 
@@ -119,9 +120,10 @@ TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) {
   AddInputFromArray<float>(TensorShape({4, 2}),
                            {10, 11, 60, 61, 20, 21, 30, 31});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Need data[0].shape[1:] = data[1].shape[1:], "
-                            "got data[0].shape = [3,1], data[1].shape = [4,2]"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Need data[0].shape[1:] = data[1].shape[1:], got "
+                            "data[0].shape = [3,1], data[1].shape = [4,2]"))
       << s;
 }
 
@@ -134,10 +136,9 @@ TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "data[1].shape = [4] does not start with indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [4] does not start with indices[1].shape = [5]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 896c9957616037da4ead2dbda8cb2393eaea226f..2f83780525090c90a0a9cfa3268115daa6fbc89b 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -334,7 +334,8 @@ struct AvgPoolMeanReducer {
   }
 
   template <typename Packet>
-  void reducePacketWithType(T, const Packet& p, Packet* accum) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacketWithType(
+      T, const Packet& p, Packet* accum) {
     Packet skip_mask =
         pequal(p, pset1<Packet>(-Eigen::NumTraits<T>::highest()));
     (*accum) = padd<Packet>(*accum, psel(p, pset1<Packet>(0), skip_mask));
@@ -480,11 +481,9 @@ SpatialAvgPooling(const Input& input, DenseIndex patchRows,
                              Eigen::type2index<3> > >::type reduction_dims;
 #endif
   return input
-      .extract_image_patches(
-          patchRows, patchCols, strideRows, strideCols, in_strideRows,
-          in_strideCols, padding_type,
-          -Eigen::NumTraits<typename internal::remove_const<
-              typename internal::traits<Input>::Scalar>::type>::highest())
+      .extract_image_patches(patchRows, patchCols, strideRows, strideCols,
+                             in_strideRows, in_strideCols, padding_type,
+                             -Eigen::NumTraits<CoeffReturnType>::highest())
       .reduce(reduction_dims, mean_with_nan)
       .reshape(post_reduce_dims);
 }
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 1acbe3a658070222e99ff874815db9a6b07d4565..a4dff4b91c5c7a991b432f113cae2e29ecdcab31 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -797,6 +797,188 @@ struct gemm_pack_rhs<
   }
 };
 
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
+template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+
+  EIGEN_DEVICE_FUNC
+  static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typedef typename packet_traits<Scalar>::type Packet;
+
+    const int packet_size = 2;
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if (!non_standard_patches) {
+        const Index patch_depth = rhs.patchDepth();
+        if ((patch_depth % packet_size) == 0) {
+          const Index patch_cols = rhs.patchCols();
+          const Index patch_rows = rhs.patchRows();
+
+          const Index startCol = rhs.colOffset();
+          const Index max_cols = std::min<Index>(
+              ceil_div(peeled_k, patch_rows * patch_depth) + startCol,
+              patch_cols);
+
+          for (Index c = startCol; c < max_cols; ++c) {
+            eigen_assert(k < peeled_k);
+            const Index startRow = (c == startCol) ? rhs.rowOffset() : 0;
+            const Index max_rows = std::min<Index>(
+                ceil_div(peeled_k - c * patch_rows * patch_depth, patch_depth) +
+                    startRow,
+                patch_rows);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+            for (Index r = startRow; r < max_rows; ++r) {
+              eigen_assert(k < peeled_k);
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index startDepth =
+                  ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0;
+              const Index max_depth =
+                  std::min<Index>(peeled_k - c * patch_rows * patch_depth -
+                                      r * patch_depth + startDepth,
+                                  patch_depth);
+              eigen_assert((max_depth - startDepth) % packet_size == 0);
+              for (Index d = startDepth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketFast(k);
+            kernel0.packet[1] = dm1.loadPacketFast(k);
+            kernel1.packet[0] = dm2.loadPacketFast(k);
+            kernel1.packet[1] = dm3.loadPacketFast(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        } else {
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketStandard(k);
+            kernel0.packet[1] = dm1.loadPacketStandard(k);
+            kernel1.packet[0] = dm2.loadPacketStandard(k);
+            kernel1.packet[1] = dm3.loadPacketStandard(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+      if (!rhs.nonStandardPatches()) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
 // Special case for non-vectorized types such as float16.
 template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
           typename ArgType, typename Device, typename Scalar, typename Index,
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b02ae52a23aeabe55e6233e34b15cffb2073ded
--- /dev/null
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -0,0 +1,591 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// EncodeProto is a TensorFlow Op which serializes tensors into
+// arbitrary protobufs.
+//
+// See the docstring in ../ops/encode_proto_op.cc for usage of the op.
+//
+// This implementation writes the serialized format using a handful of
+// calls from the WireFormatLite API.
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedOutputStream;
+using ::tensorflow::protobuf::io::StringOutputStream;
+
+// Computes the total serialized size for a packed repeated field.
+// For fixed-size types this can just multiply, but for variable-sized
+// types it has to iterate through the values in the tensor.
+template <WireFormatLite::FieldType FieldType, typename TensorT>
+size_t TotalPackedSize(const Tensor& input, int message_index, int size);
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_DOUBLE, double>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kDoubleSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, double>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, float>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT64, int64>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT32, int32>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED64, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int32>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_BOOL, bool>(const Tensor& input,
+                                                        int message_index,
+                                                        int size) {
+  return size * WireFormatLite::kBoolSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_ENUM, int32>(const Tensor& input,
+                                                         int message_index,
+                                                         int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size +=
+        WireFormatLite::EnumSize(input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED32, int32>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED64, int64>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+// Writes a possibly repeated primitive field.
+// TensorFlow does not have unsigned types, so we decode them to signed and
+// encode them back to unsigned.
+template <typename TensorT, typename ProtoT,
+          WireFormatLite::FieldType FieldType,
+          void Writer(ProtoT, CodedOutputStream*)>
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto wire_type = WireFormatLite::WireTypeForFieldType(
+      WireFormatLite::FieldType(field_desc.type()));
+
+  auto input_t = input.flat_inner_dims<TensorT>();
+  if (field_desc.options().packed()) {
+    // Write the tag for the packed field.
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
+
+    // Write the total packed length.
+    size_t data_size =
+        TotalPackedSize<FieldType, TensorT>(input, message_index, size);
+    output->WriteVarint32(data_size);
+
+    // Write individual values.
+    for (int64 i = 0; i < size; i++) {
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  } else {
+    for (int64 i = 0; i < size; i++) {
+      WireFormatLite::WriteTag(field_desc.number(), wire_type, output);
+
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  }
+}
+
+// Writes a possibly repeated string, bytes, or message field.
+template <typename T, void Writer(int, const T&, CodedOutputStream*)>
+void WriteVarLenField(const FieldDescriptor& field_desc, const Tensor& input,
+                      int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<T>();
+  for (int64 i = 0; i < size; i++) {
+    const T& value = input_t(static_cast<int64>(message_index), i);
+    // TODO(nix): there doesn't seem to be an inlined version of
+    // WireFormatLite::WriteString or its relatives, which might allow a
+    // small speedup.
+    Writer(field_desc.number(), value, output);
+  }
+}
+
+// Writes a group field.
+// Groups are treated like submessages, but tag-delimited
+// instead of length-delimited. WireFormatLite handles this
+// differently so we code it ourselves.
+void WriteGroup(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<string>();
+  for (int64 i = 0; i < size; i++) {
+    const string& value = input_t(static_cast<int64>(message_index), i);
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_START_GROUP, output);
+    // Note the use of WriteRaw instead of WriteString to skip the length.
+    output->WriteRaw(value.data(), value.size());
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_END_GROUP, output);
+  }
+}
+
+// Writes a (possibly repeated) field into an output stream.
+// It is the caller's responsibility to ensure that the type of
+// the input tensor is compatible with the type of the proto
+// field descriptor, and that (message_index, size-1) is within
+// bounds.
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  DataType tf_type = input.dtype();
+
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return WriteField<double, double, WireFormatLite::TYPE_DOUBLE,
+                        WireFormatLite::WriteDoubleNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FLOAT:
+      switch (tf_type) {
+        case DataType::DT_FLOAT:
+          return WriteField<float, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_DOUBLE:
+          return WriteField<double, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_INT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_INT64,
+                        WireFormatLite::WriteInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_UINT64,
+                        WireFormatLite::WriteUInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_INT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_INT32,
+                        WireFormatLite::WriteInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_FIXED64,
+                        WireFormatLite::WriteFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_BOOL:
+      return WriteField<bool, bool, WireFormatLite::TYPE_BOOL,
+                        WireFormatLite::WriteBoolNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_STRING:
+      return WriteVarLenField<string, WireFormatLite::WriteString>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_GROUP:
+      return WriteGroup(field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_MESSAGE:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_BYTES:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_ENUM:
+      return WriteField<int32, int32, WireFormatLite::TYPE_ENUM,
+                        WireFormatLite::WriteEnumNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SFIXED32,
+                        WireFormatLite::WriteSFixed32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SFIXED64,
+                        WireFormatLite::WriteSFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SINT32,
+                        WireFormatLite::WriteSInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SINT64,
+                        WireFormatLite::WriteSInt64NoTag>(
+          field_desc, input, message_index, size, output);
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// Checks that a Protobuf field is compatible with a TensorFlow datatype.
+// This is separated from WriteField to lift it out of the inner loop.
+bool IsCompatibleType(const FieldDescriptor& field_desc, DataType tf_type) {
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return tf_type == DataType::DT_FLOAT || tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SFIXED64:
+    case WireFormatLite::TYPE_SINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_SFIXED32:
+    case WireFormatLite::TYPE_SINT32:
+      return tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return tf_type == DataType::DT_INT64 || tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_BOOL:
+      return tf_type == DataType::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return tf_type == DataType::DT_STRING;
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return false;
+}
+
+class EncodeProtoOp : public OpKernel {
+ public:
+  explicit EncodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names_));
+
+    // Gather the field descriptors for the given field_names.
+    field_descs_.resize(field_names_.size());
+    for (int i = 0; i < field_names_.size(); i++) {
+      const string& name = field_names_[i];
+      auto field_desc = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, field_desc != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+
+      field_descs_[i] = field_desc;
+    }
+
+    // Build a list of indices into field_descs sorted by increasing
+    // field_number. This will be used to output fields in sorted order,
+    // which is strongly encouraged when serializing protobufs.
+    sorted_field_index_.resize(field_names_.size());
+    // Start with the fields sorted by current index.
+    for (int i = 0; i < field_names_.size(); i++) sorted_field_index_[i] = i;
+    // Then sort the field indices by their proto field number.
+    std::sort(sorted_field_index_.begin(), sorted_field_index_.end(),
+              [this](int a, int b) -> bool {
+                return field_descs_[a]->number() < field_descs_[b]->number();
+              });
+  }
+
+  void Compute(OpKernelContext* cx) override {
+    const Tensor* sizes_tensor;
+    OP_REQUIRES_OK(cx, cx->input("sizes", &sizes_tensor));
+
+    OpInputList values;
+    OP_REQUIRES_OK(cx, cx->input_list("values", &values));
+
+    OP_REQUIRES(cx, field_descs_.size() == values.size(),
+                errors::InvalidArgument(
+                    "Length of inputs list must match field_names"));
+
+    // Check the arguments for consistency.
+    TensorShape common_prefix;
+    int message_count;
+    for (int i = 0; i < field_descs_.size(); i++) {
+      const Tensor& v = values[i];
+
+      // The type of each value tensor must match the corresponding field.
+      OP_REQUIRES(cx, IsCompatibleType(*field_descs_[i], v.dtype()),
+                  errors::InvalidArgument(
+                      "Incompatible type for field " + field_names_[i] +
+                          ".  Saw dtype: ",
+                      DataTypeString(v.dtype()),
+                      " but field type is: ", field_descs_[i]->type_name()));
+
+      // All value tensors must have the same shape prefix (i.e. batch size).
+      TensorShape shape_prefix = v.shape();
+      shape_prefix.RemoveDim(shape_prefix.dims() - 1);
+
+      // Do some initialization on the first input value. The rest will
+      // have to match this one.
+      if (i == 0) {
+        OP_REQUIRES(cx, v.dims() >= 1,
+                    errors::InvalidArgument(
+                        "Expected value to be at least a vector, saw shape: ",
+                        v.shape().DebugString()));
+        common_prefix = shape_prefix;
+        message_count = common_prefix.num_elements();
+      } else {
+        OP_REQUIRES(cx, shape_prefix == common_prefix,
+                    errors::InvalidArgument(
+                        "Values must match up to the last dimension"));
+      }
+    }
+
+    TensorShape expected_sizes_shape = common_prefix;
+    expected_sizes_shape.AddDim(field_descs_.size());
+
+    OP_REQUIRES(cx, sizes_tensor->shape() == expected_sizes_shape,
+                errors::InvalidArgument(
+                    "sizes should be batch_size + [len(field_names)].  Saw: ",
+                    sizes_tensor->shape().DebugString(),
+                    " but expected: ", expected_sizes_shape.DebugString()));
+
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+
+    for (int i = 0; i < field_descs_.size(); ++i) {
+      const Tensor& v = values[i];
+      int max_size = v.dim_size(v.dims() - 1);
+
+      // The last dimension of a value tensor must be greater than the
+      // corresponding
+      // size in the sizes tensor.
+      for (int message_index = 0; message_index < message_count;
+           message_index++) {
+        OP_REQUIRES(
+            cx, sizes(message_index, i) <= max_size,
+            errors::InvalidArgument(
+                "Size to write must not be larger than value tensor; but saw: ",
+                sizes(message_index, i), " > ", max_size, " at message ",
+                message_index, " field ", i));
+      }
+    }
+
+    // This pointer is owned by the context.
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(cx, cx->allocate_output(0, common_prefix, &output_tensor));
+
+    auto bufs = output_tensor->flat<string>();
+    for (int message_index = 0; message_index < message_count;
+         message_index++) {
+      // TODO(nix): possibly optimize allocation here by calling
+      //   bufs(message_index).reserve(DEFAULT_BUF_SIZE);
+      StringOutputStream output_string(&bufs(message_index));
+      CodedOutputStream out(&output_string);
+      // Write fields in ascending field_number order.
+      for (int i : sorted_field_index_) {
+        auto& field_desc = *field_descs_[i];
+        const Tensor& v = values[i];
+        int size = sizes(message_index, i);
+        if (!size) continue;
+        WriteField(field_desc, v, message_index, size, &out);
+      }
+    }
+  }
+
+ private:
+  std::vector<string> field_names_;
+  std::vector<const FieldDescriptor*> field_descs_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+
+  // Contains indices into field_names_, sorted by field number since
+  // that's the order of writing.
+  std::vector<int> sorted_field_index_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EncodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("EncodeProto").Device(DEVICE_CPU), EncodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index ab5af8caada587805f3cf936c0bbef95ead98076..661bf5fc5fb43efdc84ea3ff270ab5394375b193 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -277,20 +277,19 @@ REGISTER_KERNEL_BUILDER(Name("IRFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
 #undef FFT_LABEL
 
 #if GOOGLE_CUDA
-namespace gpu = ::perftools::gputools;
 
 namespace {
 template <typename T>
-gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  gpu::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
 template <typename T>
-gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
-  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
-  gpu::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
@@ -299,19 +298,19 @@ gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
 // the kernel finishes.
 // TODO(yangzihao): Refactor redundant code in subclasses of ScratchAllocator
 // into base class.
-class CufftScratchAllocator : public gpu::ScratchAllocator {
+class CufftScratchAllocator : public se::ScratchAllocator {
  public:
   ~CufftScratchAllocator() override {}
   CufftScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  int64 GetMemoryLimitInBytes(gpu::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
   }
-  gpu::port::StatusOr<gpu::DeviceMemory<uint8>> AllocateBytes(
-      gpu::Stream* stream, int64 byte_size) override {
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size > memory_limit_) {
-      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.no_retry_on_failure = true;
@@ -319,13 +318,13 @@ class CufftScratchAllocator : public gpu::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>(
+    return se::port::StatusOr<se::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
@@ -394,9 +393,9 @@ class FFTGPUBase : public FFTBase {
 
     constexpr bool kInPlaceFft = false;
     const auto kFftType =
-        IsReal() ? (IsForward() ? gpu::fft::Type::kR2C : gpu::fft::Type::kC2R)
-                 : (IsForward() ? gpu::fft::Type::kC2CForward
-                                : gpu::fft::Type::kC2CInverse);
+        IsReal() ? (IsForward() ? se::fft::Type::kR2C : se::fft::Type::kC2R)
+                 : (IsForward() ? se::fft::Type::kC2CForward
+                                : se::fft::Type::kC2CInverse);
 
     CufftScratchAllocator scratch_allocator(CufftScratchSize, ctx);
     auto plan =
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 3487606778eabde386335f8450d627b7bf74ad42..050c95cf40d4b29bde66b6b6e72b1b48a7199965 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -76,7 +76,7 @@ struct FillFunctor<GPUDevice, T> {
 };
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
-TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
+TF_CALL_NUMBER_TYPES(DEFINE_FILL_GPU);
 TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 9d4bc35ba890c251b0800f266e7845e411e7a835..8f66f0a7b970f20641f18c9579bb9d2a5ec7ae57 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -28,11 +28,14 @@ limitations under the License.
 #include "tensorflow/core/graph/gradients.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 
-static const char* const kGradientOp = "SymbolicGradient";
+static const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
+static const char* const kGradientOp = FunctionLibraryDefinition::kGradientOp;
 
 class ArgOp : public OpKernel {
  public:
@@ -89,26 +92,25 @@ class RetvalOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
 };
 
-REGISTER_SYSTEM_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp);
-REGISTER_SYSTEM_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name(kArgOp).Device(DEVICE_CPU), ArgOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name(kRetOp).Device(DEVICE_CPU), RetvalOp);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
-      Name("_Arg").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ArgOp);
+      Name(kArgOp).Device(DEVICE_SYCL).TypeConstraint<type>("T"), ArgOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
-TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg")
+TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kArgOp)
                                                    .Device(DEVICE_SYCL)
                                                    .HostMemory("output")
                                                    .TypeConstraint<int32>("T"),
                                                ArgOp);
 #undef REGISTER
-#define REGISTER(type)                                               \
-  REGISTER_KERNEL_BUILDER(                                           \
-      Name("_Retval").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
-      RetvalOp);
+#define REGISTER(type)     \
+  REGISTER_KERNEL_BUILDER( \
+      Name(kRetOp).Device(DEVICE_SYCL).TypeConstraint<type>("T"), RetvalOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
-TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval")
+TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .Device(DEVICE_SYCL)
                                                    .HostMemory("input")
                                                    .TypeConstraint<int32>("T"),
@@ -118,16 +120,16 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval")
 
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
-      Name("_Arg").Device(DEVICE_GPU).TypeConstraint<type>("T"), ArgOp);
+      Name(kArgOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), ArgOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
-TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg")
+TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kArgOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("output")
                                                    .TypeConstraint<int32>("T"),
                                                ArgOp);
 #undef REGISTER
 
-REGISTER_KERNEL_BUILDER(Name("_Arg")
+REGISTER_KERNEL_BUILDER(Name(kArgOp)
                             .Device(DEVICE_GPU)
                             .HostMemory("output")
                             .TypeConstraint<ResourceHandle>("T"),
@@ -135,13 +137,18 @@ REGISTER_KERNEL_BUILDER(Name("_Arg")
 
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
-      Name("_Retval").Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
+      Name(kRetOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
-TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval")
+TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("input")
                                                    .TypeConstraint<int32>("T"),
                                                RetvalOp);
+REGISTER_KERNEL_BUILDER(Name(kRetOp)
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<ResourceHandle>("T")
+                            .HostMemory("input"),
+                        RetvalOp);
 #undef REGISTER
 
 class PassOn : public OpKernel {
@@ -287,7 +294,8 @@ REGISTER_KERNEL_BUILDER(Name(kGradientOp).Device(DEVICE_SYCL),
 class RemoteCallOp : public AsyncOpKernel {
  public:
   explicit RemoteCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, &func_));
   }
 
   ~RemoteCallOp() override {}
@@ -305,11 +313,30 @@ class RemoteCallOp : public AsyncOpKernel {
     AttrValueMap attr_values = func_.attr();
     FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
     instantiate_opts.target = target_device;
+
+    FunctionTarget function_target = {target_device, lib};
+
     FunctionLibraryRuntime::Handle handle;
-    OP_REQUIRES_OK_ASYNC(ctx,
-                         lib->Instantiate(func_.name(), AttrSlice(&attr_values),
-                                          instantiate_opts, &handle),
-                         done);
+    {
+      mutex_lock l(mu_);
+      auto cached_entry = handle_cache_.find(function_target);
+      if (cached_entry != handle_cache_.end()) {
+        handle = cached_entry->second;
+      } else {
+        VLOG(1) << "Instantiating " << func_.name() << " on " << target_device;
+        tracing::ScopedActivity activity(strings::StrCat(
+            "RemoteCall: Instantiate: ", func_.name(), " on ", target_device));
+        OP_REQUIRES_OK_ASYNC(
+            ctx,
+            lib->Instantiate(func_.name(), AttrSlice(&attr_values),
+                             instantiate_opts, &handle),
+            done);
+        auto insert_result = handle_cache_.insert({function_target, handle});
+        CHECK(insert_result.second) << "Insert unsuccessful.";
+        VLOG(1) << "Instantiated " << func_.name() << " on " << target_device
+                << ", resulting in handle: " << handle << " flr: " << lib;
+      }
+    }
 
     OpInputList arguments;
     OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
@@ -328,22 +355,33 @@ class RemoteCallOp : public AsyncOpKernel {
       args.push_back(argument);
     }
     auto* rets = new std::vector<Tensor>;
-    lib->Run(opts, handle, args, rets, [rets, done, ctx](const Status& status) {
-      if (!status.ok()) {
-        ctx->SetStatus(status);
-      } else {
-        for (size_t i = 0; i < rets->size(); ++i) {
-          ctx->set_output(i, (*rets)[i]);
-        }
-      }
-      delete rets;
-      done();
-    });
+    auto* activity = new tracing::ScopedActivity(strings::StrCat(
+        "RemoteCall: Run: ", func_.name(), " on ", target_device));
+    VLOG(1) << "Running " << func_.name() << " on " << target_device
+            << " with handle: " << handle;
+    lib->Run(opts, handle, args, rets,
+             [rets, activity, done, ctx](const Status& status) {
+               if (!status.ok()) {
+                 ctx->SetStatus(status);
+               } else {
+                 for (size_t i = 0; i < rets->size(); ++i) {
+                   ctx->set_output(i, (*rets)[i]);
+                 }
+               }
+               delete rets;
+               delete activity;
+               done();
+             });
   }
 
  private:
-  string target_;
   NameAttrList func_;
+
+  mutex mu_;
+  typedef std::pair<string, FunctionLibraryRuntime*> FunctionTarget;
+  std::map<FunctionTarget, FunctionLibraryRuntime::Handle> handle_cache_
+      GUARDED_BY(mu_);
+
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteCallOp);
 };
 
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index b687088db16a31d8ecb74a7a483c35d2c65a74f9..911aa3a78fff2f6f7272e7408388e6625df52037 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -21,10 +20,12 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/mutex.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
 
+namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef FunctionLibraryRuntime::Handle FHandle;
@@ -106,11 +107,9 @@ void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
   opts->runner = ctx->runner();
 }
 
-}  // end namespace
-
-class FunctionalIf : public AsyncOpKernel {
+class IfOp : public AsyncOpKernel {
  public:
-  explicit FunctionalIf(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit IfOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     auto lib = ctx->function_library();
     OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
     const NameAttrList* func;
@@ -120,7 +119,7 @@ class FunctionalIf : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &else_handle_));
   }
 
-  ~FunctionalIf() override {}
+  ~IfOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     bool cond;
@@ -134,8 +133,7 @@ class FunctionalIf : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalIf* kernel, OpKernelContext* ctx, bool cond,
-          DoneCallback done)
+    State(IfOp* kernel, OpKernelContext* ctx, bool cond, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
           cond_(cond),
@@ -168,7 +166,7 @@ class FunctionalIf : public AsyncOpKernel {
     }
 
    private:
-    FunctionalIf* const kernel_;
+    IfOp* const kernel_;
     OpKernelContext* const ctx_;
     const bool cond_;
     const DoneCallback done_;
@@ -179,18 +177,22 @@ class FunctionalIf : public AsyncOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), FunctionalIf);
+// TODO(drpng): remove this.
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
-                        FunctionalIf);
+                        IfOp);
+
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
-class FunctionalWhile : public AsyncOpKernel {
+class WhileOp : public AsyncOpKernel {
  public:
-  explicit FunctionalWhile(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit WhileOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &cond_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &body_func_));
   }
 
-  ~FunctionalWhile() override {}
+  ~WhileOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     auto lib = ctx->function_library();
@@ -234,7 +236,7 @@ class FunctionalWhile : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalWhile* kernel, OpKernelContext* ctx, FHandle cond_handle,
+    State(WhileOp* kernel, OpKernelContext* ctx, FHandle cond_handle,
           FHandle body_handle, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
@@ -253,7 +255,7 @@ class FunctionalWhile : public AsyncOpKernel {
     void Start() { EvalCond(); }
 
    private:
-    FunctionalWhile* const kernel_;
+    WhileOp* const kernel_;
     OpKernelContext* const ctx_;
     const FHandle cond_handle_;
     const FHandle body_handle_;
@@ -316,7 +318,152 @@ class FunctionalWhile : public AsyncOpKernel {
     }
   };
 };
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), FunctionalWhile);
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), FunctionalWhile);
+// TODO(drpng): remove these.
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), WhileOp);
+
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_GPU), WhileOp);
+
+Status GetScalar(OpKernelContext* ctx, int index, int32* value,
+                 const char* label) {
+  Tensor t = ctx->input(index);
+  if (!TensorShapeUtils::IsScalar(t.shape())) {
+    return errors::InvalidArgument(label, " must be a scalar, but ",
+                                   t.shape().DebugString());
+  }
+  *value = t.scalar<int32>()();
+  return Status::OK();
+}
+
+class ForOp : public AsyncOpKernel {
+ public:
+  explicit ForOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &body_handle_));
+  }
+
+  ~ForOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    (new State(this, ctx, done))->Start();
+  }
+
+ private:
+  FHandle body_handle_;
+
+  class State {
+   public:
+    State(ForOp* kernel, OpKernelContext* ctx, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())),
+          args_(1 + ctx_->num_inputs() - 3) {
+      args_[0] = Tensor(DT_INT32, {});
+      iter_ = &args_[0].scalar<int32>()();
+
+      const int32 num_loop_inputs = ctx_->num_inputs() - 3;
+      rets_.reserve(num_loop_inputs);
+      for (int i = 0; i < num_loop_inputs; ++i) {
+        rets_.push_back(ctx_->input(3 + i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      Status s = StartLoop();
+      if (!s.ok()) Finish(s);
+    }
+
+   private:
+    ForOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+
+    int32* iter_;  // points to args_[0].
+    int32 limit_;
+    int32 delta_;
+
+    // If an error e is returned, caller must call Finish(e).
+    // If OK is returned, the async loop execution has been started.
+    Status StartLoop() {
+      SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
+
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 0, iter_, "start"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 1, &limit_, "limit"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 2, &delta_, "delta"));
+
+      if ((delta_ > 0 && *iter_ <= limit_) ||
+          (delta_ < 0 && *iter_ >= limit_) ||
+          (delta_ == 0 && *iter_ == limit_)) {
+        RunNext();
+        return Status::OK();
+      } else {
+        return errors::InvalidArgument("Invalid start/limit/delta: ", *iter_,
+                                       " ", limit_, " ", delta_);
+      }
+    }
+
+    void RunNext() {
+      bool done_loop;
+      if (delta_ > 0) {
+        done_loop = *iter_ >= limit_;
+      } else {
+        done_loop = *iter_ <= limit_;
+      }
+      if (done_loop) {
+        Finish(Status::OK());
+        return;
+      }
+
+      if (rets_.size() >= args_.size()) {
+        Finish(errors::InvalidArgument(
+            "For loop body returned ", rets_.size(),
+            " arguments. Expected: ", args_.size() - 1));
+        return;
+      }
+      for (int i = 0; i < rets_.size(); ++i) {
+        args_[1 + i] = std::move(rets_[i]);
+      }
+      rets_.clear();
+      lib_->Run(opts_, kernel_->body_handle_, args_, &rets_,
+                [this](const Status& s) {
+                  if (s.ok()) {
+                    *iter_ += delta_;
+                    RunNext();
+                  } else {
+                    Finish(s);
+                  }
+                });
+    }
+
+    void Finish(Status s) {
+      if (s.ok()) {
+        s = SetOutputs(kernel_, ctx_, rets_);
+      }
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    }
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("For").Device(DEVICE_CPU), ForOp);
+REGISTER_KERNEL_BUILDER(Name("For")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("start")
+                            .HostMemory("limit")
+                            .HostMemory("delta"),
+                        ForOp);
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 9b4dca851138235d7b4a95906f3c8a0e5d592aa7..f99dd643f76e641490acdee8e68a2b0198730b69 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -251,7 +251,7 @@ struct FusedBatchNorm<GPUDevice, T, U> {
     Tensor x_maybe_transformed = x;
     Tensor x_transformed;
     Tensor y_transformed;
-    perftools::gputools::DeviceMemory<T> y_ptr;
+    se::DeviceMemory<T> y_ptr;
 
     if (tensor_format == FORMAT_NCHW) {
       y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*y);
@@ -279,19 +279,19 @@ struct FusedBatchNorm<GPUDevice, T, U> {
       return;
     }
 
-    perftools::gputools::dnn::BatchDescriptor x_desc;
+    se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-    perftools::gputools::dnn::BatchDescriptor scale_offset_desc;
+    se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
         .set_feature_map_count(channels)
         .set_height(1)
         .set_width(1)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
     auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed);
     auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<U>(scale);
@@ -308,7 +308,7 @@ struct FusedBatchNorm<GPUDevice, T, U> {
         StreamExecutorUtil::AsDeviceMemory<U>(*saved_inv_var);
 
     GPUDevice d = context->eigen_device<GPUDevice>();
-    using perftools::gputools::DeviceMemory;
+    using se::DeviceMemory;
     Tensor inv_var;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DataTypeToEnum<U>::value,
@@ -390,7 +390,7 @@ struct FusedBatchNormGrad<GPUDevice, T, U> {
 
     // Outputs
     Tensor x_backprop_transformed;
-    perftools::gputools::DeviceMemory<T> x_backprop_ptr;
+    se::DeviceMemory<T> x_backprop_ptr;
 
     if (tensor_format == FORMAT_NCHW) {
       x_backprop_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*x_backprop);
@@ -433,19 +433,19 @@ struct FusedBatchNormGrad<GPUDevice, T, U> {
       return;
     }
 
-    perftools::gputools::dnn::BatchDescriptor x_desc;
+    se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-    perftools::gputools::dnn::BatchDescriptor scale_offset_desc;
+    se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
         .set_feature_map_count(channels)
         .set_height(1)
         .set_width(1)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
     auto y_backprop_ptr =
         StreamExecutorUtil::AsDeviceMemory<T>(y_backprop_maybe_transformed);
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 9a7eca03ce276d26321f01f80ad7f1a0a254e4db..8bfa40304eb642ca68ed0d85f5f4f5a42a51373d 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -17,18 +17,6 @@ cc_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib")
 
 tf_ops_fuzz_target_lib("identity")
@@ -49,6 +37,8 @@ tf_ops_fuzz_target_lib("decode_png")
 
 tf_ops_fuzz_target_lib("decode_jpeg")
 
+tf_ops_fuzz_target_lib("decode_wav")
+
 tf_ops_fuzz_target_lib("example_proto_fast_parsing")
 
 tf_ops_fuzz_target_lib("parse_tensor_op")
diff --git a/tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33a11d8e131c0f22b1410ee111554d10dad77076
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzDecodeWav : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeWav);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeWav);
+
+}  // namespace fuzzing
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 16ccb03b8502dd626c0dc4f0c10fcfe50224c7b8..2c6e8bf3bcbd9270ed47d37eec6c88d7b3cfdb1c 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
@@ -50,7 +51,7 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   }
   // Compute slice_bytes here so that static knowledge is available
   const size_t slice_bytes = slice_elems * sizeof(T);
-  auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+  auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
   // Store the value of invalidate index for printing error information, it's a
   // shared variable.
@@ -162,6 +163,16 @@ struct GatherFunctor<CPUDevice, T, Index> {
   }
 };
 
+template <typename Index>
+struct GatherFunctor<GPUDevice, Variant, Index> {
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<Variant, 3>::ConstTensor params,
+                   typename TTypes<Index>::ConstFlat indices,
+                   typename TTypes<Variant, 3>::Tensor out) {
+    return GatherFunctorCPU<Variant, Index>()(ctx, params, indices, out);
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 08adf4badbcd9c8baf664b13098f23dfb0584e24..ef332ebee39cc31f71d6a03ce61d7bded39292de 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -143,6 +143,8 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
+TF_CALL_uint32(REGISTER_GATHER_CPU);
+TF_CALL_uint64(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
index 3edcb34bca3eaa28249cffaa8b0a79f90cdfb7dd..0409cadb67f38aea0ae5b9f1859b9e721a67637b 100644
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -171,7 +172,7 @@ TEST_F(GatherOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index ffc733e6bb6b45ab463f319de39dfd175e83e5c1..2f64619afc10364ac1c7229502caf3d379eb02ad 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -29,11 +29,9 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename T>
-inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                           uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 108d59db2c21cad6ff3136a0271f9ba1f7d7a237..66aeec51050c9fe16f54f5bb5e29c93ae4b2b8df 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -13,18 +13,6 @@ load(
     "tf_kernel_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_test(
     name = "graph_transferer_test",
     size = "small",
@@ -45,6 +33,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:quantization_utils",
         "//tensorflow/core/kernels:quantized_ops",
         "//tensorflow/core/kernels:reduction_ops",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
@@ -81,6 +70,7 @@ tf_kernel_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index 4040bf52bffe638d601f954f9a81d9eda78346a6..40bf5a4dc711960ee7a7f82470c012fb12cf3ef3 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/const_op.h"
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
index 352d548bd3e08dcf5d73691e64535ead1be34983..ada96ae4ea86a49d996392c1f5ed67e48346dc83 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
@@ -20,14 +20,14 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/hexagon/graph_transferer.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
+class RemoteFusedGraphExecuteInfo;
+
 class GraphTransferUtils {
  public:
   static std::priority_queue<std::tuple<float, int, string>>
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index 0963dff5fa08776f5edbafc67c5a146ca1683db1..7960cb4b0552de2f32cfb17db285b4a9ce42c7a1 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <algorithm>
 #include <cinttypes>
 
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -73,6 +75,12 @@ static Node* FindMutableNodeByName(const string& name, Graph* graph) {
   return nullptr;
 }
 
+GraphTransferer::GraphTransferer() {
+  graph_transfer_info_ = new GraphTransferInfo();
+}
+
+GraphTransferer::~GraphTransferer() { delete graph_transfer_info_; }
+
 /**
  * graph loading functions
  * - LoadGraphFromProto
@@ -142,8 +150,8 @@ Status GraphTransferer::LoadGraphFromProto(
 
   for (const std::pair<string, Tensor>& input_node_info :
        input_node_info_list) {
-    GraphTransferInfo::GraphInputNodeInfo& graph_input_node_info =
-        *graph_transfer_info_.add_graph_input_node_info();
+    GraphTransferGraphInputNodeInfo& graph_input_node_info =
+        *graph_transfer_info_->add_graph_input_node_info();
     graph_input_node_info.set_name(input_node_info.first);
     graph_input_node_info.set_dtype(input_node_info.second.dtype());
     for (const int64 dim : ToTensorShapeArray(input_node_info.second.shape())) {
@@ -159,8 +167,8 @@ Status GraphTransferer::LoadGraphFromProto(
     const Node* node = node_name_cache_list_.at(node_id);
     CHECK_NOTNULL(node);
 
-    GraphTransferInfo::GraphOutputNodeInfo& graph_output_node_info =
-        *graph_transfer_info_.add_graph_output_node_info();
+    GraphTransferGraphOutputNodeInfo& graph_output_node_info =
+        *graph_transfer_info_->add_graph_output_node_info();
     graph_output_node_info.set_name(strings::StrCat(node_name, ":", port));
 
     // Get output tensor shape type
@@ -231,17 +239,17 @@ Status GraphTransferer::LoadGraphFromProtoFile(
 
 void GraphTransferer::SortParams(const std::vector<string>& output_node_names) {
   // TODO(satok): optimize complexity
-  std::unordered_map<int, GraphTransferInfo::NodeInputInfo*> input_map;
-  for (GraphTransferInfo::NodeInputInfo& input :
-       *graph_transfer_info_.mutable_node_input_info()) {
+  std::unordered_map<int, GraphTransferNodeInputInfo*> input_map;
+  for (GraphTransferNodeInputInfo& input :
+       *graph_transfer_info_->mutable_node_input_info()) {
     input_map.emplace(input.node_id(), &input);
   }
 
   // Setup dependency map placeholder
   std::vector<int> output_node_ids;
   std::unordered_map<int, std::unordered_set<int>> dependency_map;
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info_.node_info()) {
+  for (const GraphTransferNodeInfo& params :
+       graph_transfer_info_->node_info()) {
     const int node_id = params.node_id();
     for (const string& output_node_name : output_node_names) {
       if (params.name() == output_node_name) {
@@ -255,7 +263,7 @@ void GraphTransferer::SortParams(const std::vector<string>& output_node_names) {
       continue;
     }
     CHECK_EQ(input_map.count(node_id), 1);
-    for (const GraphTransferInfo::NodeInput& node_input :
+    for (const GraphTransferNodeInput& node_input :
          input_map.at(node_id)->node_input()) {
       dependency_map.at(node_id).emplace(node_input.node_id());
     }
@@ -267,8 +275,8 @@ void GraphTransferer::SortParams(const std::vector<string>& output_node_names) {
     FillDependencyRec(output_node_id, dependency_map, completed);
   }
 
-  std::sort(graph_transfer_info_.mutable_node_info()->begin(),
-            graph_transfer_info_.mutable_node_info()->end(),
+  std::sort(graph_transfer_info_->mutable_node_info()->begin(),
+            graph_transfer_info_->mutable_node_info()->end(),
             TransferParamsComparator(dependency_map));
 }
 
@@ -278,15 +286,15 @@ void GraphTransferer::EnableStrictCheckMode(const bool enable) {
 
 void GraphTransferer::SetSerializedGraphTransferInfo(
     const string& serialized_proto) {
-  graph_transfer_info_.ParseFromString(serialized_proto);
+  graph_transfer_info_->ParseFromString(serialized_proto);
 }
 
 const GraphTransferInfo& GraphTransferer::GetGraphTransferInfo() const {
-  return graph_transfer_info_;
+  return *graph_transfer_info_;
 }
 
 GraphTransferInfo& GraphTransferer::GetMutableGraphTransferInfo() {
-  return graph_transfer_info_;
+  return *graph_transfer_info_;
 }
 
 void GraphTransferer::CacheNode(const Node& node) {
@@ -473,8 +481,8 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
   data_size = max_bytes_per_data * num_output_elements;
   shape_array = BuildShapeArray(shape_handle, context);
 
-  GraphTransferInfo::ConstNodeInfo& const_node_info =
-      *graph_transfer_info_.add_const_node_info();
+  GraphTransferConstNodeInfo& const_node_info =
+      *graph_transfer_info_->add_const_node_info();
   const_node_info.set_name(node.name());
   const_node_info.set_node_id(id);
   // TODO(satok): Make this generic. Never assume rank is 4.
@@ -505,8 +513,8 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
     node_name_cache_list_.emplace_back(nullptr);
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(shape_name, id);
-    GraphTransferInfo::ConstNodeInfo& const_node_info =
-        *graph_transfer_info_.add_const_node_info();
+    GraphTransferConstNodeInfo& const_node_info =
+        *graph_transfer_info_->add_const_node_info();
     const_node_info.set_name(shape_name);
     const_node_info.set_node_id(id);
     // TODO(satok): Make this generic. Never assume rank is 5.
@@ -528,8 +536,8 @@ int GraphTransferer::RegisterConstTensor(const Tensor& tensor,
     node_name_cache_list_.emplace_back(nullptr);
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(node_name, id);
-    GraphTransferInfo::ConstNodeInfo& const_node_info =
-        *graph_transfer_info_.add_const_node_info();
+    GraphTransferConstNodeInfo& const_node_info =
+        *graph_transfer_info_->add_const_node_info();
     const_node_info.set_name(node_name);
     const_node_info.set_node_id(id);
     CHECK_EQ(4, SHAPE_ARRAY_SIZE);
@@ -558,8 +566,8 @@ int GraphTransferer::RegisterConstScalar(const DataType dt, const int val,
     node_name_cache_list_.emplace_back(nullptr);
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(val_name, id);
-    GraphTransferInfo::ConstNodeInfo& const_node_info =
-        *graph_transfer_info_.add_const_node_info();
+    GraphTransferConstNodeInfo& const_node_info =
+        *graph_transfer_info_->add_const_node_info();
     const_node_info.set_name(val_name);
     const_node_info.set_node_id(id);
     // TODO(satok): Do not assume rank is 4 here.
@@ -715,8 +723,8 @@ void GraphTransferer::RegisterPadNode(
 
   CHECK_EQ(2, node.num_inputs());
 
-  GraphTransferInfo::NodeInputInfo& node_input_info =
-      *graph_transfer_info_.add_node_input_info();
+  GraphTransferNodeInputInfo& node_input_info =
+      *graph_transfer_info_->add_node_input_info();
   node_input_info.set_node_id(id);
 
   AddNodeInputByInputIndex(node, 0, &node_input_info);
@@ -761,8 +769,7 @@ void GraphTransferer::RegisterPadNode(
         new_const_tensor,
         strings::StrCat(input_node->name(), "_", node.name(), "_1"));
 
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
+    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
     node_input.set_node_id(id);
     node_input.set_output_port(0);
   } else {
@@ -849,8 +856,7 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
                                        const int padding, const int inputs_size,
                                        const std::vector<int>& extra_inputs,
                                        const int outputs_size) {
-  GraphTransferInfo::NodeInfo& node_info =
-      *graph_transfer_info_.add_node_info();
+  GraphTransferNodeInfo& node_info = *graph_transfer_info_->add_node_info();
   node_info.set_name(name);
   node_info.set_node_id(id);
   node_info.set_type_name(type);
@@ -863,7 +869,7 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
 
 void GraphTransferer::AddNodeInputByInputIndex(
     const Node& node, const int idx,
-    GraphTransferInfo::NodeInputInfo* node_input_info) {
+    GraphTransferNodeInputInfo* node_input_info) {
   const Edge* edge = nullptr;
   TF_CHECK_OK(node.input_edge(idx, &edge));
   const Node* input_node = edge->src();
@@ -873,7 +879,7 @@ void GraphTransferer::AddNodeInputByInputIndex(
   const std::string& op_name = input_node->name();
   CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name;
   const int src_id = node_name_to_id_cache_map_[op_name];
-  GraphTransferInfo::NodeInput& node_input = *node_input_info->add_node_input();
+  GraphTransferNodeInput& node_input = *node_input_info->add_node_input();
   node_input.set_node_id(src_id);
   node_input.set_output_port(port);
 }
@@ -882,15 +888,14 @@ void GraphTransferer::AppendNodeInputParams(
     const int id, const Node& node, const std::vector<int>& extra_inputs) {
   VLOG(1) << "Append input params: " << node.name() << ", " << node.num_inputs()
           << ", " << extra_inputs.size();
-  GraphTransferInfo::NodeInputInfo& node_input_info =
-      *graph_transfer_info_.add_node_input_info();
+  GraphTransferNodeInputInfo& node_input_info =
+      *graph_transfer_info_->add_node_input_info();
   node_input_info.set_node_id(id);
   for (int i = 0; i < node.num_inputs(); ++i) {
     AddNodeInputByInputIndex(node, i, &node_input_info);
   }
   for (const int extra_input : extra_inputs) {
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
+    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
     node_input.set_node_id(extra_input);
     node_input.set_output_port(0);
   }
@@ -900,8 +905,8 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
                                              const int id, const Node& node) {
   VLOG(1) << "Append output params: " << node.name() << ", "
           << node.num_outputs();
-  GraphTransferInfo::NodeOutputInfo& node_output_info =
-      *graph_transfer_info_.add_node_output_info();
+  GraphTransferNodeOutputInfo& node_output_info =
+      *graph_transfer_info_->add_node_output_info();
   node_output_info.set_node_id(id);
 
   std::vector<DataType> data_types;
@@ -1030,8 +1035,7 @@ GraphTransferer::TransferParamsComparator::TransferParamsComparator(
     : dependency_map_(dep_map) {}
 
 bool GraphTransferer::TransferParamsComparator::operator()(
-    const GraphTransferInfo::NodeInfo& obj0,
-    const GraphTransferInfo::NodeInfo& obj1) {
+    const GraphTransferNodeInfo& obj0, const GraphTransferNodeInfo& obj1) {
   const int node_id0 = obj0.node_id();
   const int node_id1 = obj1.node_id();
   bool obj0_uses_obj1 = false;
@@ -1114,8 +1118,8 @@ void GraphTransferer::ClearCache() {
 
 void GraphTransferer::DumpNodeTransferParams() const {
   LOG(INFO) << "*** Const Nodes ***";
-  for (const GraphTransferInfo::ConstNodeInfo& params :
-       graph_transfer_info_.const_node_info()) {
+  for (const GraphTransferConstNodeInfo& params :
+       graph_transfer_info_->const_node_info()) {
     // TODO(satok): Stop assuming shape size is 4.
     CHECK_EQ(params.shape_size(), 4);
     LOG(INFO) << "[ " << params.node_id() << " \"" << params.name()
@@ -1131,8 +1135,8 @@ void GraphTransferer::DumpNodeTransferParams() const {
   }
   LOG(INFO) << "******\n";
   LOG(INFO) << "*** Op Nodes ***";
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info_.node_info()) {
+  for (const GraphTransferNodeInfo& params :
+       graph_transfer_info_->node_info()) {
     LOG(INFO) << "[ " << params.node_id() << " \"" << params.name();
     LOG(INFO) << "  type: " << params.type_name();
     LOG(INFO) << "  padding: " << ToPaddingDebugString(params.padding_id());
@@ -1146,18 +1150,18 @@ void GraphTransferer::DumpNodeTransferParams() const {
   }
   LOG(INFO) << "******\n";
   LOG(INFO) << "*** Node input params ***";
-  for (const GraphTransferInfo::NodeInputInfo& params :
-       graph_transfer_info_.node_input_info()) {
+  for (const GraphTransferNodeInputInfo& params :
+       graph_transfer_info_->node_input_info()) {
     LOG(INFO) << "[ " << params.node_id() << " ]";
-    for (const GraphTransferInfo::NodeInput& node_input : params.node_input()) {
+    for (const GraphTransferNodeInput& node_input : params.node_input()) {
       LOG(INFO) << "    src node id = " << node_input.node_id()
                 << ", output port = " << node_input.output_port();
     }
   }
   LOG(INFO) << "******\n";
   LOG(INFO) << "*** Node output params ***";
-  for (const GraphTransferInfo::NodeOutputInfo& params :
-       graph_transfer_info_.node_output_info()) {
+  for (const GraphTransferNodeOutputInfo& params :
+       graph_transfer_info_->node_output_info()) {
     LOG(INFO) << "[ " << params.node_id() << " ]";
     for (const int max_size : params.max_byte_size()) {
       LOG(INFO) << "    max_size = " << max_size;
@@ -1167,8 +1171,8 @@ void GraphTransferer::DumpNodeTransferParams() const {
 }
 
 void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
-  for (const GraphTransferInfo::ConstNodeInfo& params :
-       graph_transfer_info_.const_node_info()) {
+  for (const GraphTransferConstNodeInfo& params :
+       graph_transfer_info_->const_node_info()) {
     std::stringstream sstream;
     // TODO(satok): Stop assuming shape size is 4.
     CHECK_EQ(params.shape_size(), 4);
@@ -1182,9 +1186,9 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Const node count = "
-            << graph_transfer_info_.const_node_info_size();
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info_.node_info()) {
+            << graph_transfer_info_->const_node_info_size();
+  for (const GraphTransferNodeInfo& params :
+       graph_transfer_info_->node_info()) {
     std::stringstream sstream;
     sstream << "---(OP) [" << params.name().c_str() << "," << std::hex
             << params.node_id() << std::dec << "," << params.soc_op_id() << ","
@@ -1197,12 +1201,12 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
             << "," << params.output_count() << "," << params.type_name() << "]";
     LOG(INFO) << sstream.str();
   }
-  LOG(INFO) << "Op node count = " << graph_transfer_info_.node_info_size();
-  for (const GraphTransferInfo::NodeInputInfo& params :
-       graph_transfer_info_.node_input_info()) {
+  LOG(INFO) << "Op node count = " << graph_transfer_info_->node_info_size();
+  for (const GraphTransferNodeInputInfo& params :
+       graph_transfer_info_->node_input_info()) {
     std::stringstream sstream;
     sstream << "---(INPUT) [" << std::hex << params.node_id() << std::dec;
-    for (const GraphTransferInfo::NodeInput& node_input : params.node_input()) {
+    for (const GraphTransferNodeInput& node_input : params.node_input()) {
       sstream << "," << std::hex << node_input.node_id() << std::dec << ","
               << node_input.output_port();
     }
@@ -1210,9 +1214,9 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Input params count = "
-            << graph_transfer_info_.node_input_info_size();
-  for (const GraphTransferInfo::NodeOutputInfo& params :
-       graph_transfer_info_.node_output_info()) {
+            << graph_transfer_info_->node_input_info_size();
+  for (const GraphTransferNodeOutputInfo& params :
+       graph_transfer_info_->node_output_info()) {
     std::stringstream sstream;
     sstream << "---(OUTPUT) [" << std::hex << params.node_id() << std::dec;
     for (const int max_size : params.max_byte_size()) {
@@ -1222,7 +1226,7 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Output params count = "
-            << graph_transfer_info_.node_output_info_size();
+            << graph_transfer_info_->node_output_info_size();
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index 0d43d028cdbea02b820d8ac0c48378524e875e78..86c1c5625facb3420a8b5e8699a5f12285871b06 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -22,8 +22,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/shape_refiner.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
@@ -34,6 +32,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+class GraphTransferInfo;
+class GraphTransferNodeInfo;
+class GraphTransferNodeInputInfo;
+
 // GraphTransferer transfers graph definitions into SoC memory.
 // This functionality is effective if SoC is capable to run
 // the graph on that chip.
@@ -47,7 +49,9 @@ class GraphTransferer {
   static constexpr int SHAPE_ARRAY_SIZE = MAX_SUPPORTED_RANK;
   using TensorShapeMap = RemoteFusedGraphExecuteUtils::TensorShapeMap;
 
-  GraphTransferer() = default;
+  GraphTransferer();
+
+  ~GraphTransferer();
 
   // Load graph structure into GraphTransferer
   // TODO(satok): Pass a pair of TensorShape and DataType instead of
@@ -96,8 +100,8 @@ class GraphTransferer {
    public:
     TransferParamsComparator(
         const std::unordered_map<int, std::unordered_set<int>>& dep_map);
-    bool operator()(const GraphTransferInfo::NodeInfo& obj0,
-                    const GraphTransferInfo::NodeInfo& obj1);
+    bool operator()(const GraphTransferNodeInfo& obj0,
+                    const GraphTransferNodeInfo& obj1);
     const std::unordered_map<int, std::unordered_set<int>>& dependency_map_;
   };
 
@@ -174,9 +178,8 @@ class GraphTransferer {
                         const std::vector<int>& extra_inputs,
                         const int outputs_size);
 
-  void AddNodeInputByInputIndex(
-      const Node& node, const int idx,
-      GraphTransferInfo::NodeInputInfo* node_input_info);
+  void AddNodeInputByInputIndex(const Node& node, const int idx,
+                                GraphTransferNodeInputInfo* node_input_info);
 
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
@@ -211,7 +214,7 @@ class GraphTransferer {
   // Dump pretty print of parameters
   void DumpNodeTransferParams() const;
 
-  GraphTransferInfo graph_transfer_info_{};
+  GraphTransferInfo* graph_transfer_info_;
 
   std::vector<const Node*> node_name_cache_list_{};
   std::unordered_map<string, int> node_name_to_id_cache_map_{};
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 20b09f144bab5482f2cf1bfa86cf22f0b7ff815e..765795b1f4a3db3ef6f8068e0ce05a9d3e7250cc 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -191,9 +191,9 @@ static GraphDef CreatePoolGraphDef() {
   return def;
 }
 
-static const GraphTransferInfo::ConstNodeInfo* FindConstNodeInfo(
+static const GraphTransferConstNodeInfo* FindConstNodeInfo(
     const GraphTransferer& gt, const string& name) {
-  for (const GraphTransferInfo::ConstNodeInfo& params :
+  for (const GraphTransferConstNodeInfo& params :
        gt.GetGraphTransferInfo().const_node_info()) {
     if (params.name() == name) {
       return &params;
@@ -202,9 +202,9 @@ static const GraphTransferInfo::ConstNodeInfo* FindConstNodeInfo(
   return nullptr;
 }
 
-static const GraphTransferInfo::NodeInfo* FindNodeInfo(
-    const GraphTransferer& gt, const string& name) {
-  for (const GraphTransferInfo::NodeInfo& params :
+static const GraphTransferNodeInfo* FindNodeInfo(const GraphTransferer& gt,
+                                                 const string& name) {
+  for (const GraphTransferNodeInfo& params :
        gt.GetGraphTransferInfo().node_info()) {
     if (params.name() == name) {
       return &params;
@@ -213,9 +213,9 @@ static const GraphTransferInfo::NodeInfo* FindNodeInfo(
   return nullptr;
 }
 
-static const GraphTransferInfo::NodeInputInfo* FindNodeInputInfo(
+static const GraphTransferNodeInputInfo* FindNodeInputInfo(
     const GraphTransferer& gt, const int node_id) {
-  for (const GraphTransferInfo::NodeInputInfo& params :
+  for (const GraphTransferNodeInputInfo& params :
        gt.GetGraphTransferInfo().node_input_info()) {
     if (params.node_id() == node_id) {
       return &params;
@@ -224,9 +224,9 @@ static const GraphTransferInfo::NodeInputInfo* FindNodeInputInfo(
   return nullptr;
 }
 
-static const GraphTransferInfo::NodeOutputInfo* FindNodeOutputInfo(
+static const GraphTransferNodeOutputInfo* FindNodeOutputInfo(
     const GraphTransferer& gt, const int node_id) {
-  for (const GraphTransferInfo::NodeOutputInfo& params :
+  for (const GraphTransferNodeOutputInfo& params :
        gt.GetGraphTransferInfo().node_output_info()) {
     if (params.node_id() == node_id) {
       return &params;
@@ -236,21 +236,21 @@ static const GraphTransferInfo::NodeOutputInfo* FindNodeOutputInfo(
 }
 
 static void SanityCheckNodes(const GraphTransferer& gt) {
-  for (const GraphTransferInfo::NodeInfo& params :
+  for (const GraphTransferNodeInfo& params :
        gt.GetGraphTransferInfo().node_info()) {
     if (params.input_count() > 0) {
-      const GraphTransferInfo::NodeInputInfo* input_params =
+      const GraphTransferNodeInputInfo* input_params =
           FindNodeInputInfo(gt, params.node_id());
       ASSERT_NE(nullptr, input_params);
       EXPECT_EQ(params.input_count(), input_params->node_input_size());
       EXPECT_EQ(params.node_id(), input_params->node_id());
-      for (const GraphTransferInfo::NodeInput& node_input :
+      for (const GraphTransferNodeInput& node_input :
            input_params->node_input()) {
         EXPECT_GE(node_input.output_port(), 0);
       }
     }
     if (params.output_count() > 0) {
-      const GraphTransferInfo::NodeOutputInfo* output_params =
+      const GraphTransferNodeOutputInfo* output_params =
           FindNodeOutputInfo(gt, params.node_id());
       ASSERT_NE(nullptr, output_params);
       EXPECT_EQ(params.output_count(), output_params->max_byte_size_size());
@@ -273,8 +273,7 @@ TEST_F(GraphTransfererTest, LoadAddGraph) {
   const int const_node_count =
       gt_.GetGraphTransferInfo().const_node_info_size();
   ASSERT_EQ(2, const_node_count);
-  const GraphTransferInfo::ConstNodeInfo* params_a =
-      FindConstNodeInfo(gt_, NAME_A);
+  const GraphTransferConstNodeInfo* params_a = FindConstNodeInfo(gt_, NAME_A);
   ASSERT_TRUE(params_a != nullptr);
   EXPECT_EQ(NAME_A, params_a->name());
   ASSERT_EQ(4, params_a->shape_size());
@@ -284,8 +283,7 @@ TEST_F(GraphTransfererTest, LoadAddGraph) {
   EXPECT_EQ(1, params_a->shape(3));
   EXPECT_EQ(4, params_a->data().length());
 
-  const GraphTransferInfo::ConstNodeInfo* params_b =
-      FindConstNodeInfo(gt_, NAME_B);
+  const GraphTransferConstNodeInfo* params_b = FindConstNodeInfo(gt_, NAME_B);
   ASSERT_TRUE(params_b != nullptr);
   ASSERT_EQ(4, params_b->shape_size());
   EXPECT_EQ(1, params_b->shape(0));
@@ -328,7 +326,7 @@ TEST_F(GraphTransfererTest, LoadConvGraph) {
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
   ASSERT_EQ(4, op_node_count);
-  const GraphTransferInfo::NodeInfo* params_conv = FindNodeInfo(gt_, "conv");
+  const GraphTransferNodeInfo* params_conv = FindNodeInfo(gt_, "conv");
   ASSERT_TRUE(params_conv != nullptr);
   const int id = params_conv->node_id();
   EXPECT_GE(id, 0);
@@ -354,8 +352,7 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
   ASSERT_EQ(4, op_node_count);
-  const GraphTransferInfo::NodeInfo* params_max_pool =
-      FindNodeInfo(gt_, "maxpool");
+  const GraphTransferNodeInfo* params_max_pool = FindNodeInfo(gt_, "maxpool");
   ASSERT_TRUE(params_max_pool != nullptr);
   const int id = params_max_pool->node_id();
   EXPECT_GE(id, 0);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 9c2e1e123cacb0dfd4cec3ef395e30ae9fa1bd34..3810cbe5b55a4402a7be2cf325668c5735d06ca7 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 
+#include "tensorflow/core/framework/graph_transfer_info.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 #include "tensorflow/core/kernels/hexagon/soc_interface.h"
@@ -54,9 +56,9 @@ static uint8* FindAlignedPointer(uint8* ptr) {
   return data_ptr;
 }
 
-/* static */ GraphTransferInfo::NodeInfo* HexagonControlWrapper::FindNodeInfo(
+/* static */ GraphTransferNodeInfo* HexagonControlWrapper::FindNodeInfo(
     const string& name, GraphTransferInfo* graph_transfer_info) {
-  for (GraphTransferInfo::NodeInfo& node_info :
+  for (GraphTransferNodeInfo& node_info :
        *graph_transfer_info->mutable_node_info()) {
     if (node_info.name() == name) {
       return &node_info;
@@ -138,9 +140,9 @@ bool HexagonControlWrapper::SetupGraph() {
       graph_transferer_.GetMutableGraphTransferInfo();
 
   // Overwrite op type of input nodes for hexagon
-  for (const GraphTransferInfo::GraphInputNodeInfo& graph_input :
+  for (const GraphTransferGraphInputNodeInfo& graph_input :
        graph_transfer_info.graph_input_node_info()) {
-    GraphTransferInfo::NodeInfo* node_info =
+    GraphTransferNodeInfo* node_info =
         FindNodeInfo(graph_input.name(), &graph_transfer_info);
     CHECK_NE(node_info, nullptr);
   }
@@ -148,13 +150,13 @@ bool HexagonControlWrapper::SetupGraph() {
   // Generate a new output node which is connected to graph output node
   // TODO(satok): Support multiple output nodes
   CHECK_EQ(graph_transfer_info.graph_output_node_info_size(), 1);
-  for (const GraphTransferInfo::GraphOutputNodeInfo& graph_output :
+  for (const GraphTransferGraphOutputNodeInfo& graph_output :
        graph_transfer_info.graph_output_node_info()) {
     const int new_output_node_id = graph_transfer_info.node_info_size() +
                                    graph_transfer_info.const_node_info_size() +
                                    2 /* offset for ids */;
     // Register a new output node
-    GraphTransferInfo::NodeInfo& new_output_node_info =
+    GraphTransferNodeInfo& new_output_node_info =
         *graph_transfer_info.add_node_info();
     new_output_node_info.set_name(OUTPUT_OP_NAME);
     new_output_node_info.set_node_id(new_output_node_id);
@@ -169,14 +171,13 @@ bool HexagonControlWrapper::SetupGraph() {
     const string node_name = tid.first.ToString();
     const int port = tid.second;
     // Register node input for the new output node
-    const GraphTransferInfo::NodeInfo* node_info =
+    const GraphTransferNodeInfo* node_info =
         FindNodeInfo(node_name, &graph_transfer_info);
     CHECK_NE(node_info, nullptr);
-    GraphTransferInfo::NodeInputInfo& node_input_info =
+    GraphTransferNodeInputInfo& node_input_info =
         *graph_transfer_info.add_node_input_info();
     node_input_info.set_node_id(new_output_node_id);
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
+    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
     node_input.set_node_id(node_info->node_id());
     node_input.set_output_port(port);
   }
@@ -189,12 +190,12 @@ bool HexagonControlWrapper::SetupGraph() {
 
   int inputs_count = 0;
   int outputs_count = 0;
-  for (const GraphTransferInfo::NodeInputInfo& input_params :
+  for (const GraphTransferNodeInputInfo& input_params :
        graph_transfer_info.node_input_info()) {
     inputs_count += input_params.node_input_size();
   }
 
-  for (const GraphTransferInfo::NodeOutputInfo& output_params :
+  for (const GraphTransferNodeOutputInfo& output_params :
        graph_transfer_info.node_output_info()) {
     outputs_count += output_params.max_byte_size_size();
   }
@@ -204,15 +205,14 @@ bool HexagonControlWrapper::SetupGraph() {
 
   // Construct node input parameters
   std::unordered_map<int, std::tuple<void*, int>> inputs_map;
-  for (const GraphTransferInfo::NodeInputInfo& input_params :
+  for (const GraphTransferNodeInputInfo& input_params :
        graph_transfer_info.node_input_info()) {
     const int count = input_params.node_input_size();
     CHECK(count <= MAX_IN_OUT_COUNT);
     int node_ids[MAX_IN_OUT_COUNT];
     int ports[MAX_IN_OUT_COUNT];
     for (int i = 0; i < count; ++i) {
-      const GraphTransferInfo::NodeInput& node_input =
-          input_params.node_input(i);
+      const GraphTransferNodeInput& node_input = input_params.node_input(i);
       node_ids[i] = node_input.node_id() + NODE_ID_OFFSET;
       ports[i] = node_input.output_port();
     }
@@ -224,7 +224,7 @@ bool HexagonControlWrapper::SetupGraph() {
 
   // Construct node output parameters
   std::unordered_map<int, std::tuple<void*, int>> outputs_map;
-  for (const GraphTransferInfo::NodeOutputInfo& output_params :
+  for (const GraphTransferNodeOutputInfo& output_params :
        graph_transfer_info.node_output_info()) {
     const int count = output_params.max_byte_size_size();
     CHECK(count <= MAX_IN_OUT_COUNT);
@@ -244,7 +244,7 @@ bool HexagonControlWrapper::SetupGraph() {
 
   // Initialize graph
   // 1. Setup const nodes
-  for (const GraphTransferInfo::ConstNodeInfo& params :
+  for (const GraphTransferConstNodeInfo& params :
        graph_transfer_info.const_node_info()) {
     const int node_id = params.node_id();
     // TODO(satok): Stop assuming shape size is 4.
@@ -267,8 +267,7 @@ bool HexagonControlWrapper::SetupGraph() {
   }
 
   // 2. Setup op nodes
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info.node_info()) {
+  for (const GraphTransferNodeInfo& params : graph_transfer_info.node_info()) {
     const int node_id = params.node_id();
     const int op_id = params.soc_op_id();
     CHECK(inputs_map.count(node_id) == 1);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index dca1f94a9b156bc9199064a72efc69b34956e59f..132cfde2db0bdfab3289a7c44ea6f4a54a5e5cdd 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -67,8 +67,8 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   // CAVEAT: Need offset as HVX library reserves some ids
   static constexpr int NODE_ID_OFFSET = 0x10000;
 
-  static GraphTransferInfo::NodeInfo* FindNodeInfo(
-      const string& node_name, GraphTransferInfo* graph_transfer_info);
+  static GraphTransferNodeInfo* FindNodeInfo(
+      const string& name, GraphTransferInfo* graph_transfer_info);
 
   const RemoteFusedGraphExecuteInfo* execute_info_{};
   GraphTransferer graph_transferer_{};
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 3f794dfb1a04cfdd6f7c114e0b2c7c0aac319a61..d53977703e4716561dc57ff1ad4ef8bd861eeea7 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -29,6 +29,8 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 
 #include <memory>
 
+#include "tensorflow/core/framework/graph_transfer_info.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
@@ -209,7 +211,7 @@ BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
     const GraphTransferInfo& graph_transfer_info) {
   RemoteFusedGraphExecuteInfo execute_info;
   execute_info.set_executor_name("build_hexagon_remote_fused_graph_executor");
-  for (const GraphTransferInfo::GraphInputNodeInfo& input :
+  for (const GraphTransferGraphInputNodeInfo& input :
        graph_transfer_info.graph_input_node_info()) {
     execute_info.add_graph_input_node_name(input.name());
     RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& tensor_shape_type =
@@ -221,7 +223,7 @@ BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
     }
   }
 
-  for (const GraphTransferInfo::GraphOutputNodeInfo& output :
+  for (const GraphTransferGraphOutputNodeInfo& output :
        graph_transfer_info.graph_output_node_info()) {
     execute_info.add_graph_output_node_name(output.name());
     RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& tensor_shape_type =
@@ -325,8 +327,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 1. check node_info
   ASSERT_EQ(gfi0.node_info_size(), gfi1.node_info_size());
   for (int i = 0; i < gfi0.node_info_size(); ++i) {
-    const GraphTransferInfo::NodeInfo& ni0 = gfi0.node_info(i);
-    const GraphTransferInfo::NodeInfo& ni1 = gfi1.node_info(i);
+    const GraphTransferNodeInfo& ni0 = gfi0.node_info(i);
+    const GraphTransferNodeInfo& ni1 = gfi1.node_info(i);
     EXPECT_EQ(ni0.DebugString(), ni1.DebugString());
     EXPECT_EQ(ni0.ByteSizeLong(), ni1.ByteSizeLong());
   }
@@ -334,8 +336,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 2. check const_node_info
   ASSERT_EQ(gfi0.const_node_info_size(), gfi1.const_node_info_size());
   for (int i = 0; i < gfi0.const_node_info_size(); ++i) {
-    const GraphTransferInfo::ConstNodeInfo& cni0 = gfi0.const_node_info(i);
-    const GraphTransferInfo::ConstNodeInfo& cni1 = gfi1.const_node_info(i);
+    const GraphTransferConstNodeInfo& cni0 = gfi0.const_node_info(i);
+    const GraphTransferConstNodeInfo& cni1 = gfi1.const_node_info(i);
     ASSERT_EQ(cni0.shape_size(), cni1.shape_size());
     for (int j = 0; j < cni0.shape_size(); ++j) {
       EXPECT_EQ(cni0.shape(j), cni1.shape(j));
@@ -347,8 +349,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 3. check node_input_info
   ASSERT_EQ(gfi0.node_input_info_size(), gfi1.node_input_info_size());
   for (int i = 0; i < gfi0.node_input_info_size(); ++i) {
-    const GraphTransferInfo::NodeInputInfo& nii0 = gfi0.node_input_info(i);
-    const GraphTransferInfo::NodeInputInfo& nii1 = gfi1.node_input_info(i);
+    const GraphTransferNodeInputInfo& nii0 = gfi0.node_input_info(i);
+    const GraphTransferNodeInputInfo& nii1 = gfi1.node_input_info(i);
     EXPECT_EQ(nii0.ByteSizeLong(), nii1.ByteSizeLong());
     EXPECT_EQ(nii0.DebugString(), nii1.DebugString());
   }
@@ -356,8 +358,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 4. check node_output_info
   ASSERT_EQ(gfi0.node_output_info_size(), gfi1.node_output_info_size());
   for (int i = 0; i < gfi0.node_output_info_size(); ++i) {
-    const GraphTransferInfo::NodeOutputInfo& noi0 = gfi0.node_output_info(i);
-    const GraphTransferInfo::NodeOutputInfo& noi1 = gfi1.node_output_info(i);
+    const GraphTransferNodeOutputInfo& noi0 = gfi0.node_output_info(i);
+    const GraphTransferNodeOutputInfo& noi1 = gfi1.node_output_info(i);
     ASSERT_EQ(noi0.max_byte_size_size(), noi1.max_byte_size_size());
     for (int j = 0; j < noi0.max_byte_size_size(); ++j) {
       EXPECT_EQ(noi0.max_byte_size(j), noi1.max_byte_size(j));
@@ -370,9 +372,9 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   ASSERT_EQ(gfi0.graph_input_node_info_size(),
             gfi1.graph_input_node_info_size());
   for (int i = 0; i < gfi0.graph_input_node_info_size(); ++i) {
-    const GraphTransferInfo::GraphInputNodeInfo& gini0 =
+    const GraphTransferGraphInputNodeInfo& gini0 =
         gfi0.graph_input_node_info(i);
-    const GraphTransferInfo::GraphInputNodeInfo& gini1 =
+    const GraphTransferGraphInputNodeInfo& gini1 =
         gfi0.graph_input_node_info(i);
     EXPECT_EQ(gini0.ByteSizeLong(), gini1.ByteSizeLong());
     EXPECT_EQ(gini0.DebugString(), gini1.DebugString());
@@ -382,9 +384,9 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   ASSERT_EQ(gfi0.graph_output_node_info_size(),
             gfi1.graph_output_node_info_size());
   for (int i = 0; i < gfi0.graph_output_node_info_size(); ++i) {
-    const GraphTransferInfo::GraphOutputNodeInfo& goni0 =
+    const GraphTransferGraphOutputNodeInfo& goni0 =
         gfi0.graph_output_node_info(i);
-    const GraphTransferInfo::GraphOutputNodeInfo& goni1 =
+    const GraphTransferGraphOutputNodeInfo& goni1 =
         gfi0.graph_output_node_info(i);
     EXPECT_EQ(goni0.ByteSizeLong(), goni1.ByteSizeLong());
     EXPECT_EQ(goni0.DebugString(), goni1.DebugString());
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index eb6b64da583f0ea9e4bb462925ebdf1bcf8dc1e3..607241268929382f6e574b433d821028148118e4 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -16,13 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
 #define TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
 
-#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
+class GraphDef;
+class RemoteFusedGraphExecuteInfo;
+
 class IRemoteFusedGraphExecutor {
  public:
   using TensorAllocatorFunc = std::function<Tensor*(const TensorShape& shape)>;
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index a18a72c66dc659ffd372c231524dbf038df6ac22..dffb4d71713f54307097fe6600622992e6b8977e 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -101,6 +101,10 @@ REGISTER_SYCL_HOST_KERNEL(bool);
   REGISTER_KERNEL_BUILDER(Name("DebugGradientIdentity")                     \
                               .Device(DEVICE_GPU)                           \
                               .TypeConstraint<type>("T"),                   \
+                          IdentityOp);                                      \
+  REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault")                    \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("dtype"),               \
                           IdentityOp)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
@@ -112,18 +116,30 @@ REGISTER_GPU_KERNEL(Variant);
 // A special GPU kernel for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-#define REGISTER_GPU_HOST_KERNEL(type)                    \
-  REGISTER_KERNEL_BUILDER(Name("Identity")                \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("input")        \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          IdentityOp);                    \
-  REGISTER_KERNEL_BUILDER(Name("RefIdentity")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("input")        \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
+#define REGISTER_GPU_HOST_KERNEL(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("Identity")                    \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("T"),     \
+                          IdentityOp);                        \
+  REGISTER_KERNEL_BUILDER(Name("RefIdentity")                 \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("T"),     \
+                          IdentityOp);                        \
+  REGISTER_KERNEL_BUILDER(Name("StopGradient")                \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("T"),     \
+                          IdentityOp);                        \
+  REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault")      \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("dtype"), \
                           IdentityOp)
 
 REGISTER_GPU_HOST_KERNEL(int32);
diff --git a/tensorflow/core/kernels/immutable_constant_op_test.cc b/tensorflow/core/kernels/immutable_constant_op_test.cc
index b3814331ee7f42a63af93cb35e943463724cf5a6..b2dc16d5d729fff71ee6651dd87970a8c6b4bb66 100644
--- a/tensorflow/core/kernels/immutable_constant_op_test.cc
+++ b/tensorflow/core/kernels/immutable_constant_op_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc
index 9c428cdedc94a7f2851b5e7d7a8b43aa52fd0ee2..06d53eba305f98fe937839fc7261a950de9db7db 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.cc
+++ b/tensorflow/core/kernels/initializable_lookup_table.cc
@@ -44,7 +44,7 @@ Status InitializableLookupTable::Initialize(InitTableIterator& iter) {
     return errors::FailedPrecondition("Table already initialized.");
   }
 
-  TF_RETURN_IF_ERROR(DoPrepare(iter.total_size()));
+  TF_RETURN_IF_ERROR(DoLazyPrepare([&iter]() { return iter.total_size(); }));
   while (iter.Valid()) {
     TF_RETURN_IF_ERROR(DoInsert(iter.keys(), iter.values()));
     iter.Next();
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index e9eae9f863b136fdd217792946437f4755cf55bb..990cbceac26e4748e4dfa96a525d5ffd8b0ec9c6 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -51,7 +51,7 @@ class InitializableLookupTable : public LookupInterface {
         "Insert not supported by InitializableLookupTable implementations");
   }
 
-  Status ExportValues(OpKernelContext* context) final {
+  Status ExportValues(OpKernelContext* context) {
     return errors::Unimplemented(
         "ExportValues not supported by InitializableLookupTable "
         "implementations");
@@ -92,6 +92,8 @@ class InitializableLookupTable : public LookupInterface {
   //
   // Then the iterator is exhausted, valid returns false and status returns
   // Status::OutOfRange.
+  //
+  // This class is Thread-unsafe.
   class InitTableIterator {
    public:
     InitTableIterator() {}
@@ -114,6 +116,7 @@ class InitializableLookupTable : public LookupInterface {
     virtual Status status() const = 0;
 
     // Returns the total number of elements that the iterator will produce.
+    // It might return -1 in case of error.
     virtual int64 total_size() const = 0;
 
    private:
@@ -129,6 +132,17 @@ class InitializableLookupTable : public LookupInterface {
   // number of expected elements.
   virtual Status DoPrepare(size_t expected_num_elements) = 0;
 
+  // Same as DoPrepare() but derived implementations might choose to skip
+  // calling get_expected_num_elements if size is not needed for DoPrepare.
+  virtual Status DoLazyPrepare(
+      std::function<int64(void)> get_expected_num_elements) {
+    int64 expected_num_elements = get_expected_num_elements();
+    if (expected_num_elements < 0) {
+      return errors::FailedPrecondition("Got negative expected_num_elements.");
+    }
+    return DoPrepare(expected_num_elements);
+  }
+
   // Populates the table in batches given keys and values as tensors into the
   // underlying data structure.
   virtual Status DoInsert(const Tensor& keys, const Tensor& values) = 0;
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index a71d047ed1a381bfc0311f86987f585f51b02536..ef6ce0546b0811edda3331de69906237cca76dd4 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -213,13 +213,13 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_EMPTY(type)                                  \
+#define REGISTER_PARALLEL_CONCAT_START(type)                  \
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatStart")        \
                               .Device(DEVICE_GPU)             \
                               .TypeConstraint<type>("dtype"), \
                           ParallelConcatStart<GPUDevice, type>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_EMPTY)
-#undef REGISTER_EMPTY
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_PARALLEL_CONCAT_START)
+#undef REGISTER_PARALLEL_CONCAT_START
 
 #define REGISTER_PARALLEL_CONCAT(type)                                     \
   REGISTER_KERNEL_BUILDER(                                                 \
@@ -248,5 +248,295 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                         ParallelConcatUpdate<CPUDevice>);
 #endif
 
+class InplaceOpBase : public OpKernel {
+ public:
+  explicit InplaceOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    auto i = ctx->input(1);
+    auto v = ctx->input(2);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(i.shape()),
+                errors::InvalidArgument("i must be a vector. ",
+                                        i.shape().DebugString()));
+    OP_REQUIRES(ctx, x.dims() == v.dims(),
+                errors::InvalidArgument(
+                    "x and v shape doesn't match (ranks differ): ",
+                    x.shape().DebugString(), " vs. ", v.shape().DebugString()));
+    for (int i = 1; i < x.dims(); ++i) {
+      OP_REQUIRES(
+          ctx, x.dim_size(i) == v.dim_size(i),
+          errors::InvalidArgument("x and v shape doesn't match at index ", i,
+                                  " : ", x.shape().DebugString(), " vs. ",
+                                  v.shape().DebugString()));
+    }
+    OP_REQUIRES(ctx, i.dim_size(0) == v.dim_size(0),
+                errors::InvalidArgument(
+                    "i and x shape doesn't match at index 0: ",
+                    i.shape().DebugString(), " vs. ", v.shape().DebugString()));
+
+    Tensor y = x;  // This creates an alias intentionally.
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, i, v, &y));
+    ctx->set_output(0, y);
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& i,
+                           const Tensor& v, Tensor* y) = 0;
+};
+
+}  // end namespace
+
+namespace functor {
+
+template <typename T>
+void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<T>();
+  auto Ty = y->flat_outer_dims<T>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    switch (op) {
+      case I_UPDATE:
+        Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+        break;
+      case I_ADD:
+        Ty.template chip<0>(r).device(d) += Tv.template chip<0>(j);
+        break;
+      case I_SUB:
+        Ty.template chip<0>(r).device(d) -= Tv.template chip<0>(j);
+        break;
+    }
+  }
+}
+
+// String type only supports inplace update.
+void DoInplaceStringUpdateOp(const CPUDevice& d, const Tensor& i,
+                             const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<string>();
+  auto Ty = y->flat_outer_dims<string>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+  }
+}
+
+template <>
+Status DoInplace(const CPUDevice& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  if (op == I_UPDATE) {
+    if (v.dtype() == DT_STRING) {
+      DoInplaceStringUpdateOp(device, i, v, y);
+      return Status::OK();
+    } else if (v.dtype() == DT_BOOL) {
+      DoInplaceOp<bool>(device, op, i, v, y);
+      return Status::OK();
+    }
+  }
+  switch (v.dtype()) {
+#define CASE(type)                          \
+  case DataTypeToEnum<type>::value:         \
+    DoInplaceOp<type>(device, op, i, v, y); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, functor::InplaceOpType op>
+class InplaceOp : public InplaceOpBase {
+ public:
+  explicit InplaceOp(OpKernelConstruction* ctx) : InplaceOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& i, const Tensor& v,
+                   Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoInplace(d, op, i, v, y);
+  }
+};
+
+class CopyOpBase : public OpKernel {
+ public:
+  explicit CopyOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    Tensor* y;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x.shape(), &y));
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, x, y));
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& x,
+                           Tensor* y) = 0;
+};
+
+template <typename Device>
+class CopyOp : public CopyOpBase {
+ public:
+  explicit CopyOp(OpKernelConstruction* ctx) : CopyOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& x, Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoCopy(d, x, y);
+  }
+};
+
+}  // end namespace
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <>
+Status DoCopy(const CPUDevice& device, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                                   \
+  case DataTypeToEnum<type>::value:                  \
+    y->flat<type>().device(device) = x.flat<type>(); \
+    break;
+
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_bool(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", x.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, typename T>
+class EmptyOp : public OpKernel {
+ public:
+  explicit EmptyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(shape.shape()),
+        errors::InvalidArgument("shape must be a vector of int32, got shape ",
+                                shape.shape().DebugString()));
+    auto dims = shape.flat<int32>();
+    TensorShape out_shape;
+    OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                            reinterpret_cast<const int32*>(dims.data()),
+                            dims.size(), &out_shape));
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+    if (init_) {
+      functor::SetZeroFunctor<Device, T>()(ctx->eigen_device<Device>(),
+                                           out->flat<T>());
+    }
+  }
+
+ private:
+  bool init_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+REGISTER_KERNEL_BUILDER(Name("DeepCopy").Device(DEVICE_CPU), CopyOp<CPUDevice>);
+
+#define REGISTER_EMPTY(type, dev)                             \
+  REGISTER_KERNEL_BUILDER(Name("Empty")                       \
+                              .Device(DEVICE_##dev)           \
+                              .HostMemory("shape")            \
+                              .TypeConstraint<type>("dtype"), \
+                          EmptyOp<dev##Device, type>)
+
+REGISTER_EMPTY(float, CPU)
+REGISTER_EMPTY(double, CPU)
+REGISTER_EMPTY(Eigen::half, CPU)
+REGISTER_EMPTY(string, CPU)
+REGISTER_EMPTY(int32, CPU)
+REGISTER_EMPTY(int64, CPU)
+REGISTER_EMPTY(bool, CPU)
+
+#if GOOGLE_CUDA
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceUpdate").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      InplaceOp<GPUDevice, functor::I_UPDATE>);                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceAdd").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_ADD>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceSub").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_SUB>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DeepCopy").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),      \
+      CopyOp<GPUDevice>);
+
+REGISTER(float);
+REGISTER(double);
+REGISTER(Eigen::half);
+REGISTER(int64);
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+
+REGISTER_KERNEL_BUILDER(Name("DeepCopy")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        CopyOp<CPUDevice>);
+REGISTER_EMPTY(float, GPU);
+REGISTER_EMPTY(double, GPU);
+REGISTER_EMPTY(Eigen::half, GPU);
+REGISTER_EMPTY(int64, GPU);
+
+#endif  // GOOGLE_CUDA
+
 }  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops_functor.h b/tensorflow/core/kernels/inplace_ops_functor.h
index 53529f51653f35f0815bb640ec244e4acccade2a..b806787e91c39d0add8ec6bb386a56d12a3b4b24 100644
--- a/tensorflow/core/kernels/inplace_ops_functor.h
+++ b/tensorflow/core/kernels/inplace_ops_functor.h
@@ -26,6 +26,23 @@ template <typename Device>
 Status DoParallelConcat(const Device& device, const Tensor& value, int32 loc,
                         Tensor* output);
 
+// Inplace update/add/sub values in 'y'. It computes
+//   y[i, :] = v if op is I_UPDATE
+//   y[i, :] += v if op is I_ADD
+//   y[i, :] -= v if op is I_SUB
+// Returns an error if the operation fails.
+enum InplaceOpType {
+  I_UPDATE,  // x = y
+  I_ADD,     // x += y
+  I_SUB,     // x -= y
+};
+template <typename Device>
+Status DoInplace(const Device& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y);
+// Copies x into y.
+template <typename Device>
+Status DoCopy(const Device& device, const Tensor& x, Tensor* y);
+
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 8467360435af7b2a267a14e62d432808ec39e239..f1616b1ea88c93fc8ce039c8afd0be0d13504317 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -77,6 +77,103 @@ Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
   return Status::OK();
 }
 
+template <typename T, InplaceOpType op>
+__global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
+                                  const int64 cols, const int64 n, const T* src,
+                                  const int32* rowids, T* dst) {
+  CUDA_1D_KERNEL_LOOP(idx, nthreads) {
+    int64 r = idx / cols;
+    int64 c = idx % cols;
+    r = (rowids[r] % rows + rows) % rows;  // Guard index range.
+    T* p = dst + r * cols + c;
+    const T* q = src + idx;
+    switch (op) {
+      case I_UPDATE:
+        *p = ldg(q);
+        break;
+      case I_ADD:
+        *p += ldg(q);
+        break;
+      case I_SUB:
+        *p -= ldg(q);
+        break;
+    }
+  }
+}
+
+template <typename T>
+void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  const int64 nelem = v.NumElements();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
+  auto Ty = y->flat_outer_dims<T>();
+  const int64 nrows = Ty.dimension(0);
+  const int64 ncols = Ty.dimension(1);
+  const int64 n = i.NumElements();
+  const T* src = v.flat<T>().data();
+  // TODO(sjhwang): Check that first dimension fits in int32 range.
+  const int32* rowids = i.flat<int32>().data();
+  T* dst = y->flat<T>().data();
+  switch (op) {
+    case I_UPDATE:
+      DoInplaceOpKernel<T, I_UPDATE>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_ADD:
+      DoInplaceOpKernel<T, I_ADD>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_SUB:
+      DoInplaceOpKernel<T, I_SUB>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+  }
+}
+
+template <>
+Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  switch (v.dtype()) {
+#define CASE(type)                     \
+  case DataTypeToEnum<type>::value:    \
+    DoInplaceOp<type>(d, op, i, v, y); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+template <>
+Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                              \
+  case DataTypeToEnum<type>::value:             \
+    y->flat<type>().device(d) = x.flat<type>(); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported dtype: ", x.dtype());
+  }
+  return Status::OK();
+}
+
 }  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index baf0a4abe48ea0c5a5fed5d7ef3e53925e393b10..84fa63fc001efcca1aa9ee73a86fd08233bd7535 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -112,6 +112,7 @@ bool TensorList::Decode(const VariantTensorData& data) {
       dims.push_back(scratch);
     }
   }
+  element_shape = PartialTensorShape(dims);
   return true;
 }
 
@@ -474,6 +475,115 @@ REGISTER_KERNEL_BUILDER(
 
 #endif  // GOOGLE_CUDA
 
+class TensorListConcatLists : public OpKernel {
+ public:
+  explicit TensorListConcatLists(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorShape& tl_a_shape = c->input(0).shape();
+    const TensorShape& tl_b_shape = c->input(1).shape();
+    OP_REQUIRES(
+        c, tl_a_shape == tl_b_shape,
+        errors::InvalidArgument("Incompatible input TensorList tensor shapes: ",
+                                tl_a_shape.DebugString(), " vs. ",
+                                tl_b_shape.DebugString()));
+    AllocatorAttributes attr;
+    std::unique_ptr<Tensor> tl_alias = c->forward_input(
+        0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tl_a_shape,
+        DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr);
+
+    // tl_a may be aliased by tl_alias.
+    const Tensor& tl_a = c->input(0);
+    const Tensor& tl_b = c->input(1);
+
+    Tensor* output;
+    if (tl_alias) {
+      c->set_output(0, *tl_alias);
+      output = tl_alias.get();
+    } else {
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(c, c->allocate_output(0, tl_a_shape, &output, attr));
+    }
+
+    auto output_t = output->flat<Variant>();
+    auto tl_a_t = tl_a.flat<Variant>();
+    auto tl_b_t = tl_b.flat<Variant>();
+
+    for (int64 b = 0; b < tl_a.NumElements(); ++b) {
+      const TensorList* l_a = tl_a_t(b).get<TensorList>();
+      const TensorList* l_b = tl_b_t(b).get<TensorList>();
+      OP_REQUIRES(
+          c, l_a != nullptr,
+          errors::InvalidArgument("input_a is not a TensorList at index ", b,
+                                  ".  Saw: '", tl_a_t(b).DebugString(), "'"));
+      OP_REQUIRES(
+          c, l_b != nullptr,
+          errors::InvalidArgument("input_b is not a TensorList at index ", b,
+                                  ".  Saw: '", tl_b_t(b).DebugString(), "'"));
+      OP_REQUIRES(c, l_a->element_dtype == element_dtype_,
+                  errors::InvalidArgument(
+                      "input_a[", b, "].dtype != element_dtype.  Saw: ",
+                      DataTypeString(l_a->element_dtype), " vs. ",
+                      DataTypeString(element_dtype_)));
+      OP_REQUIRES(c, l_b->element_dtype == element_dtype_,
+                  errors::InvalidArgument(
+                      "input_b[", b, "].dtype != element_dtype.  Saw: ",
+                      DataTypeString(l_b->element_dtype), " vs. ",
+                      DataTypeString(element_dtype_)));
+      OP_REQUIRES(c, l_a->element_shape.IsIdenticalTo(l_b->element_shape),
+                  errors::InvalidArgument(
+                      "input_a and input_b TensorList element shapes are not "
+                      "identical at index ",
+                      b, ".  Saw ", l_a->element_shape.DebugString(), " vs. ",
+                      l_b->element_shape.DebugString()));
+      if (tl_alias) {
+        TensorList* out = output_t(b).get<TensorList>();
+        DCHECK(out != nullptr) << "Expected output to alias input_a, but it "
+                                  "doesn't contain a TensorList at index "
+                               << b;
+        std::copy(l_b->tensors.begin(), l_b->tensors.end(),
+                  std::back_inserter(out->tensors));
+      } else {
+        TensorList out = *l_a;
+        std::copy(l_b->tensors.begin(), l_b->tensors.end(),
+                  std::back_inserter(out.tensors));
+        output_t(b) = std::move(out);
+      }
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_CPU),
+                        TensorListConcatLists);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
+                        TensorListConcatLists);
+
+#endif  // GOOGLE_CUDA
+
+#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListPushBackBatch<CPUDevice, T>)
+
+TF_CALL_ALL_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint8);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint8);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint16);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint16);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint32);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
+
+#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU
+
 #define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
   REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 935f892dd0515025e97e02c8e941b96f21ed3b3e..0ea9362cbe4da46d531086aef71618c3382a25e7 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -51,6 +51,21 @@ REGISTER_TENSOR_LIST_STACK_GPU(bool);
 
 #undef REGISTER_TENSOR_LIST_STACK_GPU
 
+#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(T)               \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU),                \
+                          TensorListPushBackBatch<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU
+
 #define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
   REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 9733883001d4ce7888b4893ecb43047b621a3eba..42871c611301be2671a9c25e1e46abb0dc0a7b13 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -34,6 +34,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
 // Variant compatible type for a list of tensors. This is mutable but instances
 // should never be mutated after stored in a variant tensor.
 struct TensorList {
@@ -83,7 +85,8 @@ class TensorListStack : public OpKernel {
                                         DataTypeString(l->element_dtype)));
     OP_REQUIRES(c, l->element_shape.IsFullyDefined(),
                 errors::InvalidArgument("Tried to stack elements from a list "
-                                        "with non-fully-defined shape."));
+                                        "with non-fully-defined shape: ",
+                                        l->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, l->tensors.size() == num_elements_,
                   errors::InvalidArgument("Operation expected a list with ",
@@ -145,6 +148,10 @@ class TensorListFromTensor : public OpKernel {
     TensorList output_list;
     const Tensor& t = c->input(0);
     output_list.element_dtype = t.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    t.shape().DebugString()));
     TensorShape output_shape(t.shape());
     output_shape.RemoveDim(0);
     OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
@@ -159,15 +166,13 @@ class TensorListFromTensor : public OpKernel {
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
                   errors::Unknown("Unexpected shape error."));
-      if (tmp.IsAligned() || !DataTypeCanUseMemcpy(DataTypeToEnum<T>::value)) {
-        output_list.tensors.push_back(tmp);
-      } else {
-        Tensor aligned;
-        OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
-        aligned.flat<T>().device(c->eigen_device<Device>()) =
-            tmp.unaligned_flat<T>();
-        output_list.tensors.push_back(aligned);
-      }
+      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+      // prevent this.
+      Tensor aligned;
+      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+      aligned.flat<T>().device(c->eigen_device<Device>()) =
+          tmp.unaligned_flat<T>();
+      output_list.tensors.push_back(aligned);
     }
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
@@ -268,6 +273,121 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
   return Status::OK();
 }
 
+template <typename Device, typename T>
+class TensorListPushBackBatch : public OpKernel {
+ public:
+  explicit TensorListPushBackBatch(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListPushBackBatch() override {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input = c->input(1);
+    OP_REQUIRES(c, element_dtype_ == input.dtype(),
+                errors::InvalidArgument("Invalid data types; list elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but tried to append ",
+                                        DataTypeString(input.dtype())));
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument(
+                    "Expected tensor to be at least a vector, but saw shape: ",
+                    input.shape().DebugString()));
+
+    const TensorShape& tls_shape = c->input(0).shape();
+
+    // For purposes of input forwarding, we want the least restrictive
+    // AllocatorAttributes possible.  If we need to allocate later,
+    // we'll request the DT_VARIANT be allocated on host.
+    AllocatorAttributes attr;
+
+    std::unique_ptr<Tensor> tls_alias = c->forward_input(
+        0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tls_shape,
+        DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr);
+
+    const Tensor& tls = tls_alias ? *tls_alias : c->input(0);
+
+    OP_REQUIRES(c, tls.dtype() == DT_VARIANT,
+                errors::InvalidArgument(
+                    "Expected input_handles dtype to be Variant, but saw: ",
+                    DataTypeString(tls.dtype())));
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(tls_shape),
+                errors::InvalidArgument(
+                    "Expected input_handles to be a vector, but saw shape: ",
+                    tls_shape.DebugString()));
+    const int64 batch_size = tls.NumElements();
+    OP_REQUIRES(c, input.dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "Expected tensor.shape[0] == input_handles.size, but saw ",
+                    input.dim_size(0), " vs. ", batch_size));
+    auto tls_t = tls.vec<Variant>();
+
+    TensorShape input_element_shape = input.shape();
+    input_element_shape.RemoveDim(0);
+    std::vector<const TensorList*> tl_batch;
+    for (int64 b = 0; b < batch_size; ++b) {
+      const TensorList* l = tls_t(b).get<TensorList>();
+      OP_REQUIRES(c, l != nullptr,
+                  errors::InvalidArgument("Input handle at index ", b,
+                                          " is not a list. Saw: '",
+                                          tls_t(b).DebugString(), "'"));
+      OP_REQUIRES(
+          c, l->element_shape.IsCompatibleWith(input_element_shape),
+          errors::InvalidArgument(
+              "Tried to append a tensor with incompatible shape to a "
+              "list at index ",
+              b, ". Op element shape: ", input_element_shape.DebugString(),
+              " list shape: ", l->element_shape.DebugString()));
+      OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                  errors::InvalidArgument(
+                      "Invalid data type at index ", b, "; op elements ",
+                      DataTypeString(element_dtype_), " but list elements ",
+                      DataTypeString(l->element_dtype)));
+      tl_batch.push_back(l);
+    }
+
+    Tensor* result;
+
+    if (tls_alias) {
+      result = tls_alias.get();
+      c->set_output(0, *result);
+    } else {
+      // DT_VARIANT tensors always allocated on host.
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(
+          c, c->allocate_output(0, TensorShape{batch_size}, &result, attr));
+    }
+
+    if (batch_size == 0) {
+      return;
+    }
+
+    auto input_t = input.flat_outer_dims<T, 2>();
+    auto result_t = result->vec<Variant>();
+
+    for (int64 b = 0; b < batch_size; ++b) {
+      if (!tls_alias) {
+        result_t(b) = *tl_batch[b];
+      }
+      TensorList* output = result_t(b).get<TensorList>();
+      DCHECK(output != nullptr);
+      Tensor* frame;
+      PersistentTensor tmp;
+      OP_REQUIRES_OK(c, c->allocate_persistent(
+                            element_dtype_, input_element_shape, &tmp, &frame));
+      if (input_element_shape.num_elements() > 0) {
+        auto frame_t = frame->flat<T>();
+        frame_t.device(c->eigen_device<Device>()) = input_t.template chip<0>(b);
+      }
+      output->tensors.push_back(std::move(*frame));
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index bacf3e77408a12a8a95bf7e7ab8f3a580e675675..6b6a14e9a7383b0a0720782acf69e0896df2444e 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -90,4 +90,23 @@ class PrintOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp);
 
+class TimestampOp : public OpKernel {
+ public:
+  explicit TimestampOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorShape output_shape;  // Default shape is 0 dim, 1 element
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output_tensor));
+
+    auto output_scalar = output_tensor->scalar<double>();
+    double now_us = static_cast<double>(Env::Default()->NowMicros());
+    double now_s = now_us / 1000000;
+    output_scalar() = now_s;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Timestamp").Device(DEVICE_CPU), TimestampOp);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc
index 9cf669a7efc973a7be4f3139b2180d4e3b07797b..5e6958f364dbbfd6ff6cf112a6cef544202ee955 100644
--- a/tensorflow/core/kernels/logging_ops_test.cc
+++ b/tensorflow/core/kernels/logging_ops_test.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <chrono>
+#include <thread>
+
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -96,5 +99,27 @@ TEST_F(PrintingGraphTest, FirstNSuccess) {
   test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
 }
 
+class TimestampTest : public OpsTestBase {
+ protected:
+  Status Init() {
+    TF_CHECK_OK(NodeDefBuilder("op", "Timestamp").Finalize(node_def()));
+    return InitOp();
+  }
+};
+
+TEST_F(TimestampTest, WaitAtLeast) {
+  TF_ASSERT_OK(Init());
+  TF_ASSERT_OK(RunOpKernel());
+  double ts1 = *((*GetOutput(0)).flat<double>().data());
+
+  // wait 1 second
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+
+  TF_ASSERT_OK(RunOpKernel());
+  double ts2 = *((*GetOutput(0)).flat<double>().data());
+
+  EXPECT_LE(1.0, ts2 - ts1);
+}
+
 }  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e3872fee0edcae543b9193e0dcf6850d194ef067..57b7798ba04eab5d1a869d4782dfe7d0dc727df4 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -62,8 +63,7 @@ class MutableHashTableOfScalars final : public LookupInterface {
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
 
     return Status::OK();
@@ -78,9 +78,8 @@ class MutableHashTableOfScalars final : public LookupInterface {
       table_.clear();
     }
     for (int64 i = 0; i < key_values.size(); ++i) {
-      gtl::InsertOrUpdate(&table_,
-                          SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-                          SubtleMustCopyUnlessStringOrFloat(value_values(i)));
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          SubtleMustCopyIfIntegral(value_values(i)));
     }
     return Status::OK();
   }
@@ -172,8 +171,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
 
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
-      ValueArray* value_vec = gtl::FindOrNull(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)));
+      ValueArray* value_vec =
+          gtl::FindOrNull(table_, SubtleMustCopyIfIntegral(key_values(i)));
       if (value_vec != nullptr) {
         for (int64 j = 0; j < value_dim; j++) {
           value_values(i, j) = value_vec->at(j);
@@ -203,8 +202,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
         V value = value_values(i, j);
         value_vec.push_back(value);
       }
-      gtl::InsertOrUpdate(
-          &table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)), value_vec);
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          value_vec);
     }
     return Status::OK();
   }
@@ -379,15 +378,14 @@ class MutableDenseHashTable final : public LookupInterface {
           for (int64 j = 0; j < value_size; ++j) {
             // TODO(andreasst): check if we can get rid of SubtleMustCopy
             // here and elsewhere in this file.
-            value_matrix(i, j) = SubtleMustCopyUnlessStringOrFloat(
-                value_buckets_matrix(bucket_index, j));
+            value_matrix(i, j) =
+                SubtleMustCopyIfIntegral(value_buckets_matrix(bucket_index, j));
           }
           break;
         }
         if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_matrix, 0)) {
           for (int64 j = 0; j < value_size; ++j) {
-            value_matrix(i, j) =
-                SubtleMustCopyUnlessStringOrFloat(default_flat(j));
+            value_matrix(i, j) = SubtleMustCopyIfIntegral(default_flat(j));
           }
           break;
         }
@@ -531,7 +529,7 @@ class MutableDenseHashTable final : public LookupInterface {
         if (IsEqualKey(key_buckets_matrix, bucket_index, key_matrix, i)) {
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -539,11 +537,11 @@ class MutableDenseHashTable final : public LookupInterface {
           ++num_entries_;
           for (int64 j = 0; j < key_size; ++j) {
             key_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(key_matrix(i, j));
+                SubtleMustCopyIfIntegral(key_matrix(i, j));
           }
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -849,6 +847,7 @@ REGISTER_KERNEL(string, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
@@ -899,6 +898,7 @@ REGISTER_KERNEL(int64, double);
 REGISTER_KERNEL(string, float);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, bool);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 5ba9b936e4ea309ceda645f63e9630f01a99c985..3977f16299fb74ed2121d7fd21180af1c1935154 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -125,19 +125,21 @@ namespace lookup {
 // integral types. However non-integer variables are not allowed and therefore
 // the local copy is unnecessary.
 template <typename T>
-T SubtleMustCopyUnlessStringOrFloat(const T& value) {
+T SubtleMustCopyIfIntegral(const T& value) {
   return internal::SubtleMustCopy(value);
 }
 
-inline const string& SubtleMustCopyUnlessStringOrFloat(const string& value) {
+inline const string& SubtleMustCopyIfIntegral(const string& value) {
   return value;
 }
 
-inline const float SubtleMustCopyUnlessStringOrFloat(const float value) {
+inline const float SubtleMustCopyIfIntegral(const float value) { return value; }
+
+inline const double SubtleMustCopyIfIntegral(const double value) {
   return value;
 }
 
-inline const double SubtleMustCopyUnlessStringOrFloat(const double value) {
+inline const Variant& SubtleMustCopyIfIntegral(const Variant& value) {
   return value;
 }
 
@@ -175,6 +177,30 @@ class HashTable : public InitializableLookupTable {
     return table_ ? table_->size() : 0;
   }
 
+  Status ExportValues(OpKernelContext* context) override {
+    if (!is_initialized_) {
+      return errors::Aborted("HashTable is not initialized.");
+    }
+
+    const int64 size = table_->size();
+
+    Tensor* keys;
+    Tensor* values;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output("keys", TensorShape({size}), &keys));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output("values", TensorShape({size}), &values));
+
+    auto keys_data = keys->flat<K>();
+    auto values_data = values->flat<V>();
+    int64 i = 0;
+    for (auto it = table_->begin(); it != table_->end(); ++it, ++i) {
+      keys_data(i) = it->first;
+      values_data(i) = it->second;
+    }
+    return Status::OK();
+  }
+
   DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
 
   DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
@@ -191,6 +217,11 @@ class HashTable : public InitializableLookupTable {
     return Status::OK();
   };
 
+  Status DoLazyPrepare(std::function<int64(void)> unused) override {
+    constexpr size_t kUnusedSize = 0;
+    return DoPrepare(kUnusedSize);
+  }
+
   Status DoInsert(const Tensor& keys, const Tensor& values) override {
     if (!table_) {
       return errors::FailedPrecondition("HashTable is not prepared.");
@@ -199,8 +230,8 @@ class HashTable : public InitializableLookupTable {
     const auto key_values = keys.flat<K>();
     const auto value_values = values.flat<V>();
     for (int64 i = 0; i < key_values.size(); ++i) {
-      const K key = SubtleMustCopyUnlessStringOrFloat(key_values(i));
-      const V value = SubtleMustCopyUnlessStringOrFloat(value_values(i));
+      const K key = SubtleMustCopyIfIntegral(key_values(i));
+      const V value = SubtleMustCopyIfIntegral(value_values(i));
       const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
       if (previous_value != value) {
         return errors::FailedPrecondition(
@@ -219,8 +250,7 @@ class HashTable : public InitializableLookupTable {
 
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          *table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          *table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index c7ce1c3747ea9f329f96d62af27708b0c9f4eb68..77386a16e01352a7691c744ee882c5c6e1b0d5d9 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -75,9 +75,6 @@ class TextFileLineIterator
   Status Init(const string& filename, int64 vocab_size, char delimiter,
               DataType key_dtype, int64 key_index, DataType value_dtype,
               int64 value_index, Env* env) {
-    if (vocab_size == -1) {
-      TF_RETURN_IF_ERROR(GetNumLinesInTextFile(env, filename, &vocab_size));
-    }
     filename_ = filename;
     vocab_size_ = vocab_size;
     delimiter_ = delimiter;
@@ -85,6 +82,7 @@ class TextFileLineIterator
     value_ = Tensor(value_dtype, TensorShape({}));
     key_index_ = key_index;
     value_index_ = value_index;
+    env_ = env;
 
     status_ = env->NewRandomAccessFile(filename_, &file_);
     if (!status_.ok()) return status_;
@@ -103,7 +101,8 @@ class TextFileLineIterator
     string line;
     status_ = input_buffer_->ReadLine(&line);
     if (!status_.ok()) {
-      if (errors::IsOutOfRange(status_) && next_id_ != vocab_size_) {
+      if (errors::IsOutOfRange(status_) && vocab_size_ != -1 &&
+          next_id_ != vocab_size_) {
         status_ = errors::InvalidArgument("Invalid vocab_size in ", filename_,
                                           ": expected ", vocab_size_,
                                           " but got ", next_id_);
@@ -111,7 +110,7 @@ class TextFileLineIterator
       valid_ = false;
       return;
     }
-    if (next_id_ >= vocab_size_) {
+    if (vocab_size_ != -1 && next_id_ >= vocab_size_) {
       LOG(WARNING) << "Truncated " << filename_ << " before its end at "
                    << vocab_size_ << " records.";
       LOG(WARNING) << "next_id_  : " << next_id_;
@@ -162,7 +161,18 @@ class TextFileLineIterator
 
   Status status() const override { return status_; }
 
-  int64 total_size() const override { return vocab_size_; }
+  int64 total_size() const override {
+    if (vocab_size_ == -1) {
+      int64 new_size;
+      Status status = GetNumLinesInTextFile(env_, filename_, &new_size);
+      if (!status.ok()) {
+        LOG(WARNING) << "Unable to get line count: " << status;
+        new_size = -1;
+      }
+      *const_cast<int64*>(&vocab_size_) = new_size;
+    }
+    return vocab_size_;
+  }
 
  private:
   Tensor key_;
@@ -170,6 +180,7 @@ class TextFileLineIterator
   bool valid_;  // true if the iterator points to an existing range.
   int64 key_index_;
   int64 value_index_;
+  Env* env_;
   int64 next_id_;
   int64 vocab_size_;
   string filename_;
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c3a59c95762ad03f217768a9b14e31d6f501d789..b4252eb04446895a7c293b62473dba28a06845a1 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -187,14 +187,14 @@ struct LaunchLRN<GPUDevice, T> {
     const int cols = static_cast<int>(in.dim_size(2));
     const int depth = static_cast<int>(in.dim_size(3));
 
-    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    se::dnn::BatchDescriptor dimensions_desc;
     dimensions_desc.set_count(batch)
         .set_height(rows)
         .set_width(cols)
         .set_feature_map_count(depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+        .set_layout(se::dnn::DataLayout::kBatchYXDepth);
 
-    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    se::dnn::NormalizeDescriptor normalize_desc;
     normalize_desc.set_bias(bias_)
         .set_range(depth_radius_)
         .set_alpha(alpha_)
@@ -404,14 +404,14 @@ struct LaunchLRNGrad<GPUDevice, T> {
     const int64 cols = in_grads.dim_size(2);
     const int64 depth = in_grads.dim_size(3);
 
-    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    se::dnn::BatchDescriptor dimensions_desc;
     dimensions_desc.set_count(batch)
         .set_height(rows)
         .set_width(cols)
         .set_feature_map_count(depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+        .set_layout(se::dnn::DataLayout::kBatchYXDepth);
 
-    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    se::dnn::NormalizeDescriptor normalize_desc;
     normalize_desc.set_bias(bias_)
         .set_range(depth_radius_)
         .set_alpha(alpha_)
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index cdff7bad5fe222b6f0824a742caa0a4e5d939f71..7912ca1563c7ef27bb5eb5e5016a8b5dec9eb222 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -60,6 +60,7 @@ class MatchingFilesOp : public OpKernel {
         output(index++) = all_fnames[i][j];
       }
     }
+    std::sort(&output(0), &output(0) + num_files);
   }
 };
 
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index f499ce6519d097c7fea05e8175d08d102880f7fd..3664f95c3b1ed44fdc5d82390aa8d661ab2cfd30 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -112,7 +112,7 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>(
 template <typename Device, typename T>
 struct LaunchMatMulBase {
 #if GOOGLE_CUDA
-  typedef perftools::gputools::blas::AlgorithmType AlgorithmType;
+  typedef se::blas::AlgorithmType AlgorithmType;
 #else
   typedef int64 AlgorithmType;
 #endif  // GOOGLE_CUDA
@@ -160,15 +160,12 @@ namespace {
 
 template <typename T>
 struct LaunchBlasGemv {
-  static void Compute(
-      OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
-      uint64 m, uint64 n, const perftools::gputools::DeviceMemory<T>& a,
-      const perftools::gputools::DeviceMemory<T>& b,
-      perftools::gputools::DeviceMemory<T>* c,
-      perftools::gputools::blas::ProfileResult* output_profile) {
-    const auto blas_trans =
-        trans ? perftools::gputools::blas::Transpose::kTranspose
-              : perftools::gputools::blas::Transpose::kNoTranspose;
+  static void Compute(OpKernelContext* ctx, se::Stream* stream, bool trans,
+                      uint64 m, uint64 n, const se::DeviceMemory<T>& a,
+                      const se::DeviceMemory<T>& b, se::DeviceMemory<T>* c,
+                      se::blas::ProfileResult* output_profile) {
+    const auto blas_trans = trans ? se::blas::Transpose::kTranspose
+                                  : se::blas::Transpose::kNoTranspose;
     if (output_profile == nullptr) {
       bool blas_launch_status =
           stream
@@ -198,11 +195,10 @@ struct LaunchBlasGemv {
 
 template <>
 void LaunchBlasGemv<Eigen::half>::Compute(
-    OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
-    uint64 m, uint64 n, const perftools::gputools::DeviceMemory<Eigen::half>& a,
-    const perftools::gputools::DeviceMemory<Eigen::half>& b,
-    perftools::gputools::DeviceMemory<Eigen::half>* c,
-    perftools::gputools::blas::ProfileResult* output_profile) {
+    OpKernelContext* ctx, se::Stream* stream, bool trans, uint64 m, uint64 n,
+    const se::DeviceMemory<Eigen::half>& a,
+    const se::DeviceMemory<Eigen::half>& b, se::DeviceMemory<Eigen::half>* c,
+    se::blas::ProfileResult* output_profile) {
   ctx->SetStatus(errors::Internal(
       "Blas GEMV launch failed: GEMV is not implemented for float16."));
 }
@@ -219,10 +215,9 @@ bool ShouldUseGemv(uint64 n) {
 
 }  // namespace
 
-bool GetCublasAutotuneComputationType(
-    const DataType& dtype,
-    perftools::gputools::blas::ComputationType* compute_type) {
-  using perftools::gputools::blas::ComputationType;
+bool GetCublasAutotuneComputationType(const DataType& dtype,
+                                      se::blas::ComputationType* compute_type) {
+  using se::blas::ComputationType;
   bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input();
   switch (dtype) {
     case DT_HALF:
@@ -250,7 +245,7 @@ struct MatmulAutoTuneGroup {
   static string name() { return "Matmul"; }
 };
 typedef AutoTuneSingleton<MatmulAutoTuneGroup, MatmulParameters,
-                          perftools::gputools::blas::AlgorithmConfig>
+                          se::blas::AlgorithmConfig>
     AutoTuneMatmul;
 
 template <typename T>
@@ -259,14 +254,14 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
       OpKernelContext* ctx, const Tensor& a, const Tensor& b,
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
-    using perftools::gputools::blas::AlgorithmConfig;
-    using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::kDefaultAlgorithm;
-    using perftools::gputools::blas::kDefaultBlasGemm;
-    using perftools::gputools::blas::kDefaultBlasGemv;
-    using perftools::gputools::blas::kNoAlgorithm;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
+    using se::blas::AlgorithmConfig;
+    using se::blas::ComputationType;
+    using se::blas::kDefaultAlgorithm;
+    using se::blas::kDefaultBlasGemm;
+    using se::blas::kDefaultBlasGemv;
+    using se::blas::kNoAlgorithm;
+    using se::blas::ProfileResult;
+    using se::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op.cc b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
index 6f7e6a7496840f027c08afa65d9381afe1f78a76..5de0d1118af1a00a00e8d80424810dec213f75fb 100644
--- a/tensorflow/core/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
@@ -34,11 +34,9 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 namespace {
 template <typename Scalar>
-perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-    const Scalar* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(
-      const_cast<Scalar*>(cuda_memory));
-  perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
+  se::DeviceMemory<Scalar> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -204,18 +202,17 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     // output' = rhs' / matrix' (' stands for transpose)
     // Upper/lower needs to be swapped for this.
 
-    perftools::gputools::blas::UpperLower upper_lower_matrix;
-    perftools::gputools::blas::Transpose transpose_matrix;
+    se::blas::UpperLower upper_lower_matrix;
+    se::blas::Transpose transpose_matrix;
     if (lower_) {
-      upper_lower_matrix = perftools::gputools::blas::UpperLower::kUpper;
+      upper_lower_matrix = se::blas::UpperLower::kUpper;
     } else {
-      upper_lower_matrix = perftools::gputools::blas::UpperLower::kLower;
+      upper_lower_matrix = se::blas::UpperLower::kLower;
     }
     if (adjoint_) {
-      transpose_matrix =
-          perftools::gputools::blas::Transpose::kConjugateTranspose;
+      transpose_matrix = se::blas::Transpose::kConjugateTranspose;
     } else {
-      transpose_matrix = perftools::gputools::blas::Transpose::kNoTranspose;
+      transpose_matrix = se::blas::Transpose::kNoTranspose;
     }
     uint64 leading_dim_matrix = matrix.cols();
     uint64 leading_dim_output = output.cols();
@@ -224,11 +221,11 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     bool blas_launch_status =
         stream
             ->ThenBlasTrsm(
-                perftools::gputools::blas::Side::kRight /*side*/,
-                upper_lower_matrix /*uplo*/, transpose_matrix /*trans*/,
-                perftools::gputools::blas::Diagonal::kNonUnit /*diag*/,
-                colmajor_rows /*m*/, colmajor_cols /*n*/, Scalar(1.0) /*alpha*/,
-                matrix_ptr, leading_dim_matrix /*lda*/, &out_ptr,
+                se::blas::Side::kRight /*side*/, upper_lower_matrix /*uplo*/,
+                transpose_matrix /*trans*/,
+                se::blas::Diagonal::kNonUnit /*diag*/, colmajor_rows /*m*/,
+                colmajor_cols /*n*/, Scalar(1.0) /*alpha*/, matrix_ptr,
+                leading_dim_matrix /*lda*/, &out_ptr,
                 leading_dim_output /*ldb*/)
             .ok();
     if (!blas_launch_status) {
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 9be7408012bb81e80c73c29a6ee9bb6763c04490..507fc9983776d2fd54ca66cc70aa7695886b4b5e 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/eigen_pooling.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -56,7 +57,7 @@ template <typename Device, typename T>
 static void SpatialMaxPoolWithArgMaxHelper(
     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
-    const PoolParameters& params, const Padding& padding) {
+    const PoolParameters& params) {
   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
       ConstEigenMatrixMap;
   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -151,7 +152,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
       }
     }
 
-    {
+    if (input_backprop != nullptr) {
       auto input_backprop_flat = input_backprop->flat<T>();
       auto out_arg_max_flat = output_arg_max->flat<int64>();
       auto out_backprop_flat = out_backprop.flat<T>();
@@ -173,9 +174,9 @@ static void SpatialMaxPoolWithArgMaxHelper(
         // Although this check is in the inner loop, it is worth its value
         // so we don't end up with memory corruptions. Our benchmark shows that
         // the performance impact is quite small
-        CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
-            << "Invalid input backprop index: " << input_backprop_index << ", "
-            << in_start << ", " << in_end;
+        // CHECK(input_backprop_index >= in_start && input_backprop_index <
+        // in_end)
+        FastBoundsCheck(input_backprop_index - in_start, in_end - in_start);
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
@@ -293,7 +294,7 @@ class MaxPoolingGradOp : public OpKernel {
 
     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
-        out_backprop, params, padding_);
+        out_backprop, params);
   }
 
  private:
@@ -403,10 +404,10 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
                     "Pooling is not yet supported on the batch dimension."));
 
     if (use_dnn_) {
-      DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
-          stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
-          output_shape, propagate_nans_);
+      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+                                   ksize, stride, padding_, data_format_,
+                                   &tensor_in, &tensor_out, out_backprop,
+                                   output_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
@@ -869,6 +870,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& input, Tensor* output, Tensor* argmax,
+                     bool propagate_nans) {
+    Tensor unused;
+    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
+        context, output, argmax, nullptr, input, unused, params);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingWithArgmaxOp : public OpKernel {
  public:
@@ -921,6 +933,53 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingGradWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
+  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+      EigenMatrixMap;
+
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& grad_in, const Tensor& argmax,
+                     Tensor* grad_out) {
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *(context->device()->tensorflow_cpu_worker_threads());
+
+    auto shard = [&grad_in, &argmax, &grad_out](int64 start, int64 limit) {
+      const int64 batch_size =
+          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+      const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
+      const int64 input_size_per_batch = grad_in.NumElements() / batch_size;
+
+      {
+        auto grad_out_flat = grad_out->flat<T>();
+        auto argmax_flat = argmax.flat<int64>();
+        auto grad_in_flat = grad_in.flat<T>();
+
+        const int64 output_start = start * output_size_per_batch;
+        const int64 output_end = limit * output_size_per_batch;
+        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
+                                  output_end - output_start);
+        inputShard.setConstant(T(0));
+
+        const int input_start = start * input_size_per_batch;
+        const int input_end = limit * input_size_per_batch;
+        for (int64 index = input_start; index < input_end; index++) {
+          const int64 grad_out_index = argmax_flat(index);
+          CHECK(grad_out_index >= output_start && grad_out_index < output_end)
+              << "Invalid output gradient index: " << grad_out_index << ", "
+              << output_start << ", " << output_end;
+          grad_out_flat(grad_out_index) += grad_in_flat(index);
+        }
+      }
+    };
+
+    const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+    const int64 shard_cost = grad_out->NumElements() / batch_size;
+    Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
+          shard_cost, shard);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingGradWithArgmaxOp : public OpKernel {
  public:
@@ -1077,10 +1136,9 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(context,
-                               perftools::gputools::dnn::PoolingMode::kMaximum,
-                               ksize_, stride_, padding_, data_format_,
-                               tensor_in, out_shape, propagate_nans_);
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
@@ -1181,9 +1239,8 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                         params.out_width, params.depth);
     if (use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(context,
-                               perftools::gputools::dnn::PoolingMode::kMaximum,
-                               ksize, stride, padding_, data_format_, tensor_in,
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
+                               stride, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
@@ -1309,7 +1366,17 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
                               .HostMemory("ksize")                       \
                               .HostMemory("strides")                     \
                               .TypeConstraint<T>("T"),                   \
-                          MaxPoolingGradGradOp<D##Device, T>);
+                          MaxPoolingGradGradOp<D##Device, T>)            \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<int64>("Targmax")          \
+                              .TypeConstraint<T>("T"),                   \
+                          MaxPoolingWithArgmaxOp<D##Device, T>);         \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .TypeConstraint<int64>("Targmax"),         \
+                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
 
 // Below kernels implemented only for CPU device.
 #define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
@@ -1374,16 +1441,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
                               .HostMemory("strides")                 \
                               .TypeConstraint<T>("T"),               \
                           MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<int64>("Targmax")      \
-                              .TypeConstraint<T>("T"),               \
-                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .TypeConstraint<int64>("Targmax"),     \
-                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
                               .Device(DEVICE_GPU)                    \
                               .TypeConstraint<T>("T")                \
diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc
index 26e1082989f317a35d55826a466cb8d9ef306c4c..1c85c744fc8c0f9c3025bd15ac87752c8af56367 100644
--- a/tensorflow/core/kernels/mirror_pad_op.cc
+++ b/tensorflow/core/kernels/mirror_pad_op.cc
@@ -173,6 +173,7 @@ namespace functor {
   DECLARE_CPU_SPEC(T, int64, 5);
 
 TF_CALL_POD_TYPES(DECLARE_CPU_SPECS);
+TF_CALL_string(DECLARE_CPU_SPECS);
 
 #undef DECLARE_CPU_SPEC
 #undef DECLARE_CPU_SPECS
@@ -194,6 +195,7 @@ TF_CALL_POD_TYPES(DECLARE_CPU_SPECS);
 
 // Note that we do register for bool type, but not in the gradient op.
 TF_CALL_POD_TYPES(REGISTER_KERNEL);
+TF_CALL_string(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
index 6716a26fac2c77ee1ee5306cc26cf802585dcfc4..f27ca139c9d4a62114b9f7a261e1d7dc7f766123 100644
--- a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
+++ b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
@@ -29,6 +29,7 @@ using CpuDevice = Eigen::ThreadPoolDevice;
   template struct functor::MirrorPad<CpuDevice, T, int32, CPU_PROVIDED_IXDIM>; \
   template struct functor::MirrorPad<CpuDevice, T, int64, CPU_PROVIDED_IXDIM>;
 TF_CALL_POD_TYPES(DEFINE_CPU_SPECS);
+TF_CALL_string(DEFINE_CPU_SPECS);
 #undef DEFINE_CPU_SPECS
 
 #define DEFINE_CPU_SPECS(T)                                   \
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index c48a2038f921986635795575a69606cbab24f12a..723b445a7568775a13b89c9fbf0e7dc70c4b8b8c 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -198,12 +198,10 @@ class BatchMatMulMkl : public OpKernel {
   void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
                          const bool TransB, const MKL_INT *M_Array,
                          const MKL_INT *N_Array, const MKL_INT *K_Array,
-                         const complex128 **A_Array,
-                         const MKL_INT *lda_Array,
-                         const complex128 **B_Array,
-                         const MKL_INT *ldb_Array, complex128 **C_Array,
-                         const MKL_INT *ldc_Array, const MKL_INT group_count,
-                         const MKL_INT *group_size) {
+                         const complex128 **A_Array, const MKL_INT *lda_Array,
+                         const complex128 **B_Array, const MKL_INT *ldb_Array,
+                         complex128 **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
     std::vector<CBLAS_TRANSPOSE> TransA_array(
         group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
     std::vector<CBLAS_TRANSPOSE> TransB_array(
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index f1f267e849aa39b43c153b857493160e0d103970..9ab95d765c39d70448e5a99aeb3fad6101827daf 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -519,9 +519,11 @@ class MklConcatOp : public OpKernel {
     mkl_tensor_tf_shape.AddDim(
         SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
     int tf_output_index = 0;
-    context->allocate_output(
+    // TODO(jktomer): replace this with OP_REQUIRES_OK and clean up this file
+    // to propagate the status up the call stack.
+    TF_CHECK_OK(context->allocate_output(
         GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
-        mkl_tensor_tf_shape, &mkl_tensor);
+        mkl_tensor_tf_shape, &mkl_tensor));
     mkl_tensor_mkl_shape.SerializeMklShape(
         mkl_tensor->flat<uint8>().data(),
         mkl_tensor->flat<uint8>().size() * sizeof(uint8));
@@ -549,9 +551,11 @@ class MklConcatOp : public OpKernel {
     mkl_tensor_tf_shape.AddDim(
         SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
     int tf_output_index = 0;
-    context->allocate_output(
+    // TODO(jktomer): replace this with OP_REQUIRES_OK and clean up this file
+    // to propagate the status up the call stack.
+    TF_CHECK_OK(context->allocate_output(
         GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
-        mkl_tensor_tf_shape, &mkl_tensor);
+        mkl_tensor_tf_shape, &mkl_tensor));
     mkl_tensor_mkl_shape.SerializeMklShape(
         mkl_tensor->flat<uint8>().data(),
         mkl_tensor->flat<uint8>().size() * sizeof(uint8));
@@ -799,8 +803,10 @@ class MklConcatOp : public OpKernel {
     Tensor* output_tensor = nullptr;
     TensorShape tf_shape_output;
     tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize());
-    context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()),
-                             tf_shape_output, &output_tensor);
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       GetTensorMetaDataIndex(0, context->num_outputs()),
+                       tf_shape_output, &output_tensor));
     dnn_shape_output.SerializeMklDnnShape(
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 25c2573741265d4d33c9c91474792be241dd3b32..d23027a54d169b5e597bd26a63f26d38a23239ae 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -79,8 +79,9 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel {
     } else if (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW) {
       mkl_context.c_size = GetTensorDim(input, data_format_, 'C');
     } else {
-      errors::InvalidArgument("Unknown format ",
-                              " Format must be either NCHW or NHWC. ");
+      context->CtxFailure(errors::InvalidArgument(
+          "Unknown format ", " Format must be either NCHW or NHWC. "));
+      return;
     }
     TensorShape output_shape{mkl_context.c_size};
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 1401bc65a45bd80ed78230840cf0b9958b1f012e..e0706568b15204312445446a161d0aa9911f9e33 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -444,6 +444,7 @@ class MklConv2DCustomBackpropFilterOp
   ~MklConv2DCustomBackpropFilterOp() {}
 
  private:
+  const int kDilationH = 0, kDilationW = 1;
   void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
@@ -492,7 +493,9 @@ class MklConv2DCustomBackpropFilterOp
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor, const memory::dims& strides,
+                       Tensor** output_tensor,
+                       const memory::dims& strides,
+                       const memory::dims& dilations,
                        const memory::dims& padding_l,
                        const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
@@ -518,31 +521,32 @@ class MklConv2DCustomBackpropFilterOp
       bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
     }
 
-    // Create convolution backward weights primitive.
-    auto bwd_desc =
-        (biasEnabled && (bias_grad != nullptr))
-            ? convolution_backward_weights::desc(
-                  convolution_direct, input->GetOpMemDesc(),
-                  output->GetOpMemDesc(), bias_grad->GetOpMemDesc(),
-                  outbackprop->GetOpMemDesc(), strides, padding_l, padding_r,
-                  padding)
-            : convolution_backward_weights::desc(
-                  convolution_direct, input->GetOpMemDesc(),
-                  output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides,
-                  padding_l, padding_r, padding);
-
-    auto bwd_pd = convolution_backward_weights::primitive_desc(
-        bwd_desc, cpu_engine, conv_fwd_pd);
-
-    // Allocate output tensor.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
-                         output_tensor);
-
-    CHECK_NOTNULL(*output_tensor);
-    // Set buffer handle using allocated output tensor.
-    output->SetUsrMemDataHandle(*output_tensor);
-
     if (biasEnabled && (bias_grad != nullptr)) {
+      // Create convolution backward weights with bias primitive.
+      // Use dilated convolution in case dilate rates are greater than zero.
+      auto bwd_desc = (dilations[kDilationH] > 0 || dilations[kDilationW] > 0) ?
+        convolution_backward_weights::desc(convolution_direct,
+                                  input->GetOpMemDesc(), output->GetOpMemDesc(),
+                                  bias_grad->GetOpMemDesc(),
+                                  outbackprop->GetOpMemDesc(), strides,
+                                  dilations, padding_l, padding_r, padding) :
+        convolution_backward_weights::desc(convolution_direct,
+                                  input->GetOpMemDesc(), output->GetOpMemDesc(),
+                                  bias_grad->GetOpMemDesc(),
+                                  outbackprop->GetOpMemDesc(),
+                                  strides, padding_l, padding_r, padding);
+      auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
+                                                            cpu_engine,
+                                                            conv_fwd_pd);
+
+      // Allocate output tensor.
+      AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+                           bwd_output_format, output_tensor);
+
+      CHECK_NOTNULL(*output_tensor);
+      // Set buffer handle using allocated output tensor.
+      output->SetUsrMemDataHandle(*output_tensor);
+
       // Allocate bias_grad tensor
       TensorShape bias_grad_shape({depth});
       Tensor* bias_grad_tensor = nullptr;
@@ -553,11 +557,32 @@ class MklConv2DCustomBackpropFilterOp
           memory::desc({bias_grad_dims}, MklDnnType<T>(), memory::format::x);
       bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
       bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
-    }
 
-    if (biasEnabled && (bias_grad != nullptr)) {
-      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output, bias_grad);
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output,
+                                  bias_grad);
     } else {
+      // Create convolution backward weights primitive.
+      // Use dilated convolution in case dilate rates are greater than zero.
+      auto bwd_desc = (dilations[kDilationH] > 0 || dilations[kDilationW] > 0) ?
+        convolution_backward_weights::desc(convolution_direct,
+                                  input->GetOpMemDesc(), output->GetOpMemDesc(),
+                                  outbackprop->GetOpMemDesc(), strides,
+                                  dilations, padding_l, padding_r, padding) :
+        convolution_backward_weights::desc(convolution_direct,
+                                  input->GetOpMemDesc(), output->GetOpMemDesc(),
+                                  outbackprop->GetOpMemDesc(),
+                                  strides, padding_l, padding_r, padding);
+      auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
+                                                            cpu_engine,
+                                                            conv_fwd_pd);
+
+      // Allocate output tensor.
+      AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+                           bwd_output_format, output_tensor);
+
+      CHECK_NOTNULL(*output_tensor);
+      // Set buffer handle using allocated output tensor.
+      output->SetUsrMemDataHandle(*output_tensor);
       PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output);
     }
   }
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index eeed0095310280997ebb2ec3e848451df378c4fa..d203c04934131ee56fbca169d4c3e5e534d7986f 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -369,6 +369,7 @@ class MklConv2DCustomBackpropInputOp
  private:
   const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0,
             kInputIndex_OutBackProp = 2;
+  const int kDilationH = 0, kDilationW = 1;
   void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
@@ -419,7 +420,9 @@ class MklConv2DCustomBackpropInputOp
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor, const memory::dims& strides,
+                       Tensor** output_tensor,
+                       const memory::dims& strides,
+                       const memory::dims& dilations,
                        const memory::dims& padding_l,
                        const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
@@ -432,9 +435,16 @@ class MklConv2DCustomBackpropInputOp
     CHECK_NOTNULL(output_tensor);
 
     // Create convolution backward data primitive.
-    auto bwd_desc = convolution_backward_data::desc(
-        convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(),
-        outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding);
+    // Use dilated convolution in case dilate rates are greater than zero.
+    auto bwd_desc = (dilations[kDilationH] > 0 || dilations[kDilationW] > 0) ?
+        convolution_backward_data::desc(convolution_direct,
+                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
+                      outbackprop->GetOpMemDesc(), strides,
+                      dilations, padding_l, padding_r, padding):
+        convolution_backward_data::desc(convolution_direct,
+                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
+                      outbackprop->GetOpMemDesc(),
+                      strides, padding_l, padding_r, padding);
 
     auto bwd_pd = convolution_backward_data::primitive_desc(
         bwd_desc, cpu_engine, conv_fwd_pd);
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 2953426d5824064952858124882126c154fe6725..f2b14f12789d0aff02fb728cf843813ab45d6662 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <map>
 #include <string>
 #include <vector>
+#include <memory>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -42,14 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
-
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
 using mkldnn::stream;
-
-using mkldnn::convolution_direct;
 using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
+
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -57,11 +57,232 @@ using mkldnn::convolution_forward;
 
 namespace tensorflow {
 
+#ifndef INTEL_MKL_ML
+
+struct ConvFwdDimensions {
+  memory::dims src_dims;
+  memory::dims filter_dims;
+  memory::dims bias_dims;
+  memory::dims dst_dims;
+  memory::dims strides;
+  memory::dims dilations;
+  memory::dims padding_left;
+  memory::dims padding_right;
+
+  ConvFwdDimensions(memory::dims src_dims,
+    memory::dims filter_dims, memory::dims bias_dims,
+    memory::dims dst_dims, memory::dims strides,
+    memory::dims dilations, memory::dims padding_left,
+    memory::dims padding_right) :
+      src_dims(src_dims), filter_dims(filter_dims),
+      bias_dims(bias_dims), dst_dims(dst_dims),
+      strides(strides), dilations(dilations),
+      padding_left(padding_left), padding_right(padding_right) {
+  }
+};
+
+template <typename T>
+class Conv2DFwd : public DnnOp {
+ public:
+  explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    fwd_stream_.reset(new stream(stream::kind::eager));
+    // create conv primitive
+    if (conv_fwd_ == nullptr) {
+      Setup(convFwdDims);
+    }
+  }
+
+  ~Conv2DFwd() {}
+
+  // Convolution forward execute with bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   bias_data:   input data buffer of bias
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    bias_mem_->set_data_handle(static_cast<void*>(bias_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    bias_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // Convolution forward execute without bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // expected memory format for this primitive instance
+  memory::format src_fmt_;
+  memory::format filter_fmt_;
+
+  // convolution primitive
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::primitive> conv_fwd_;
+
+ private:
+  void Setup(const ConvFwdDimensions& convFwdDims) {
+    // create memory descriptors for convolution data w/ no specified format
+    src_md_.reset(new memory::desc({convFwdDims.src_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    filter_md_.reset(new memory::desc({convFwdDims.filter_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    dst_md_.reset(new memory::desc({convFwdDims.dst_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    if (!convFwdDims.bias_dims.empty())
+        bias_md_.reset(new memory::desc({convFwdDims.bias_dims},
+            MklDnnType<T>(), memory::format::any));
+
+    // create a convolution
+    if (!convFwdDims.bias_dims.empty()) {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    } else {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    }
+
+    fwd_pd_.reset(new convolution_forward::primitive_desc(
+        *fwd_desc_, cpu_engine_));
+
+    // store the expected memory format
+    src_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->src_primitive_desc().desc().data.format);
+
+    filter_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->weights_primitive_desc().desc().data.format);
+
+    // create memory primitive based on dummy data
+    src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData));
+    filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(),
+                      DummyData));
+    dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData));
+
+    // create convolution primitive and add it to net
+    if (!convFwdDims.bias_dims.empty()) {
+        bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(),
+                        memory::format::x}, cpu_engine_}, DummyData));
+        conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+                        *filter_mem_, *bias_mem_, *dst_mem_));
+    } else {
+        conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+                        *filter_mem_, *dst_mem_));
+    }
+
+    fwd_primitives_.push_back(*conv_fwd_);
+    return;
+  }
+
+  // MKLDNN memory
+  std::shared_ptr<mkldnn::memory> src_mem_;
+  std::shared_ptr<mkldnn::memory> filter_mem_;
+  std::shared_ptr<mkldnn::memory> bias_mem_;
+  std::shared_ptr<mkldnn::memory> dst_mem_;
+
+  std::shared_ptr<mkldnn::stream> fwd_stream_;
+  std::vector<mkldnn::primitive> fwd_primitives_;
+
+  // desc & prmitive desc
+  std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
+
+  // memory desc
+  std::shared_ptr<mkldnn::memory::desc> src_md_;
+  std::shared_ptr<mkldnn::memory::desc> filter_md_;
+  std::shared_ptr<mkldnn::memory::desc> bias_md_;
+  std::shared_ptr<mkldnn::memory::desc> dst_md_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+template <typename T>
+class Conv2DFwdFactory : public DnnOpFactory<T> {
+ public:
+  static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) {
+     Conv2DFwd<T>* conv2d_fwd = nullptr;
+
+     // try to find a suitable one in pool
+     conv2d_fwd = dynamic_cast<Conv2DFwd<T>*> (
+       Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims));
+
+     if (conv2d_fwd == nullptr) {
+       conv2d_fwd = new Conv2DFwd<T>(convFwdDims);
+       Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd(
+           convFwdDims, conv2d_fwd);
+     }
+     return conv2d_fwd;
+  }
+
+ private:
+  Conv2DFwdFactory() {}
+  ~Conv2DFwdFactory() {}
+
+  static const int kDilationH = 0, kDilationW = 1;
+
+  static Conv2DFwdFactory& GetInstance() {
+    static Conv2DFwdFactory instance_;
+    return instance_;
+  }
+
+  static std::string CreateKey(const ConvFwdDimensions& convFwdDims) {
+    std::string prefix = "conv2d_fwd_";
+    FactoryKeyCreator key_creator;
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(convFwdDims.src_dims);
+    key_creator.AddAsKey(convFwdDims.filter_dims);
+    key_creator.AddAsKey(convFwdDims.bias_dims);
+    key_creator.AddAsKey(convFwdDims.dst_dims);
+    key_creator.AddAsKey(convFwdDims.strides);
+    key_creator.AddAsKey(convFwdDims.dilations);
+    key_creator.AddAsKey(convFwdDims.padding_left);
+    key_creator.AddAsKey(convFwdDims.padding_right);
+    return key_creator.GetKey();
+  }
+
+  DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    std::string key = CreateKey(convFwdDims);
+    return this->GetOp(key);
+  }
+
+  void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) {
+    std::string key = CreateKey(convFwdDims);
+    this->SetOp(key, op);
+  }
+};
+
+#endif
+
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// MKL-DNN is now default. MKL-ML must be specified explicitly.
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
 #ifdef INTEL_MKL_ML
-
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
  public:
@@ -294,8 +515,10 @@ class MklConv2DOp : public OpKernel {
     mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd,
                                              dnnResourceFilter);
 
-    size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1),
-                              filter.dim_size(2), filter.dim_size(3)};
+    size_t filter_sizes[4] = {static_cast<size_t>(filter.dim_size(0)),
+                              static_cast<size_t>(filter.dim_size(1)),
+                              static_cast<size_t>(filter.dim_size(2)),
+                              static_cast<size_t>(filter.dim_size(3))};
     mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes,
                                             mkl_context.filter_strides);
 
@@ -491,6 +714,7 @@ class MklConv2DOp : public OpKernel {
   ~MklConv2DOp() {}
 
   explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -507,12 +731,24 @@ class MklConv2DOp : public OpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
   }
 
   void Compute(OpKernelContext* context) override {
     try {
-      auto cpu_engine = engine(engine::cpu, 0);
-
       // Input tensors
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
@@ -521,45 +757,46 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-                  errors::InvalidArgument("Filter should not be in "
-                                          "Mkl Layout"));
+            errors::InvalidArgument("Filter should not be in "
+            "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);  // output
 
-      memory::dims src_dims, filter_dims, padding_l, padding_r, strides;
-      memory::dims output_dims_tf_order, output_dims_mkl_order;
+      memory::dims src_dims, filter_dims, padding_left, padding_right,
+                   dilations, strides;
+      memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
       // Get shapes of input tensors in MKL-DNN order
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
+      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
+                             dilations_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
       conv_utl.GetConvFwdSizesInMklOrder(
-          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
-          &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
-          &padding_r);
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
+          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
+          &padding_left, &padding_right);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
-      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
+      TensorShape dst_tf_shape = MklDnnDimsToTFShape(dst_dims_tf_order);
 
       // Corner cases: output with 0 elements and 0 batch size.
-      Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
-        // TODO(jbobba): Verify correctness here
-        //               Need semantics for Null MKL tensor
-        MklDnnShape output_mkl_shape;
-        output_mkl_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                  src_tf_shape, output_mkl_shape);
+      Tensor* dst_tensor = nullptr;
+      if (dst_tf_shape.num_elements() == 0 ||
+          dst_dims_tf_order[0] == 0) {
+        MklDnnShape dst_mkl_shape;
+        dst_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
+                    &dst_tensor, src_tf_shape, dst_mkl_shape);
 
         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor, filter_tf_shape,
-                                  filter_mkl_shape);
+                                  &output_filter_tensor,
+                                  filter_tf_shape, filter_mkl_shape);
         return;
       }
 
@@ -567,6 +804,7 @@ class MklConv2DOp : public OpKernel {
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
       auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
@@ -575,6 +813,7 @@ class MklConv2DOp : public OpKernel {
                         ? src_mkl_shape.GetMklLayout()
                         : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
+
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
@@ -583,83 +822,82 @@ class MklConv2DOp : public OpKernel {
                                           memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);
 
-      // Set output shape (output_dims) required in MKL-DNN order.
-      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
-      // depending on data format). But later we propagate Mkl layout of the
-      // output to the next op directly.
-      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
+      // MKLDNN dilation starts from 0.
+      dilations[kDilationH] -= 1;
+      dilations[kDilationW] -= 1;
 
-      // Create memory descriptors for convolution data w/ no specified format.
-      src.SetOpMemDesc(src_dims, memory::format::any);
-      filter.SetOpMemDesc(filter_dims, memory::format::any);
-      output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+      // get a conv2d fwd from primitive pool
+      Conv2DFwd<T> *conv2d_fwd = nullptr;
+      if (biasEnabled) {
+        memory::dims bias_dims = {};
+        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+      } else {
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+      }
 
-      // If bias is enabled, then do the same steps as above for bias.
+      // allocate output tensors output_tensor and filter_out_tensor
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
+      conv_fwd_pd = conv2d_fwd->fwd_pd_;
+      AllocateOutputTensor(context, *conv_fwd_pd,
+                       dst_dims_mkl_order, tf_fmt, &dst_tensor);
+      Tensor* filter_out_tensor = nullptr;
+      AllocateFilterOutputTensor(context, *conv_fwd_pd,
+                                 TFShapeToMklDnnDims(filter_tf_shape),
+                                 &filter_out_tensor);
+
+      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());
+
+      // check whether src/filter need reorder
+      std::vector<primitive> net;
+      if (src_md.data.format != conv2d_fwd->src_fmt_)
+          src.CheckReorderToOpMem(
+              conv_fwd_pd.get()->src_primitive_desc(), &net);
+
+      if (filter_md.data.format != conv2d_fwd->filter_fmt_)
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc(),
+              filter.GetTensorBuffer(filter_out_tensor), &net);
+      stream(stream::kind::eager).submit(net).wait();
+
+      T* src_data = static_cast<T*>(
+                src.GetOpMem().get_data_handle());
+      T* filter_data = static_cast<T*>(
+                filter.GetOpMem().get_data_handle());
+
+      // execute convolution
       if (biasEnabled) {
-        MklDnnData<T> bias(&cpu_engine);
-        memory::dims bias_size;
-        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_size);
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
-        bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
-        bias.SetOpMemDesc(bias_size, memory::format::any);
-
-        // Create convolution primitive with Bias.
-        auto conv_desc = convolution_forward::desc(
-            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
-            filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc =
-            convolution_forward::primitive_desc(conv_desc, cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
-                             tf_fmt, &output_tensor);
-        // Set data handle for output.
-        output.SetUsrMemDataHandle(output_tensor);
-
-        Tensor* filter_out_tensor = nullptr;
-        AllocateFilterOutputTensor(context, conv_prim_desc,
-                                   TFShapeToMklDnnDims(filter_tf_shape),
-                                   &filter_out_tensor);
-
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
-                             filter_out_tensor);
+        T* bias_data = static_cast<T*>(const_cast<T*>(
+            bias_tensor.flat<T>().data()));
+
+        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
-        // Create convolution primitive without Bias.
-        auto conv_desc = convolution_forward::desc(
-            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
-            filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
-            padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc =
-            convolution_forward::primitive_desc(conv_desc, cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
-                             tf_fmt, &output_tensor);
-        // Set data handle for output.
-        output.SetUsrMemDataHandle(output_tensor);
-
-        Tensor* filter_out_tensor = nullptr;
-        AllocateFilterOutputTensor(context, conv_prim_desc,
-                                   TFShapeToMklDnnDims(filter_tf_shape),
-                                   &filter_out_tensor);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output,
-                             filter_out_tensor);
+        conv2d_fwd->Execute(src_data, filter_data, dst_data);
       }
-    } catch (mkldnn::error& e) {
+    } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + std::string(e.message) + ", in file " +
-                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
+                       ", message: " + std::string(e.message) +
+                       ", in file " + std::string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+        errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
   std::vector<int32> strides_;
+  std::vector<int32> dilations_;
   Padding padding_;
   TensorFormat data_format_;
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
+  const int kDilationH = 0, kDilationW = 1;
+  engine cpu_engine = engine(engine::cpu, 0);
 
   // Allocate output tensor.
   void AllocateOutputTensor(
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 9dd88221a84671e1f69df13cca1b62b2ce65bb4e..8333a09316c2147e79a610eeb6c4d7aafde6e2bf 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -58,13 +58,19 @@ class MklDnnConvUtil {
  protected:
   OpKernelContext* context_;  // We don't own this.
   std::vector<int32> strides_;
+  std::vector<int32> dilations_;
   Padding padding_;
   TensorFormat data_format_;
 
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
-                 Padding pad, TensorFormat fm)
-      : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
+                 Padding pad, TensorFormat fm,
+                 const std::vector<int32>& dilations)
+      : context_(context),
+        strides_(strides),
+        dilations_(dilations),
+        padding_(pad),
+        data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
@@ -78,6 +84,16 @@ class MklDnnConvUtil {
     *strides = {stride_rows, stride_cols};
   }
 
+  // Calculate Convolution dilations
+  virtual inline void GetDilationsInMklOrder(memory::dims *dilations) {
+    // For now we take the dilation from the second and third dimensions only
+    // (we do not support dilation on the batch or depth dimension).
+    CHECK_NOTNULL(dilations);
+    int dilations_rows = GetTensorDim(dilations_, data_format_, 'H');
+    int dilations_cols = GetTensorDim(dilations_, data_format_, 'W');
+    *dilations = {dilations_rows, dilations_cols};
+  }
+
   // Calculate Convolution input size in MKL-DNN order. MKL-DNN
   // requires input in NCHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
@@ -213,7 +229,8 @@ class MklDnnConvUtil {
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
   virtual inline void GetOutputAndPadSizeInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
-      const memory::dims& strides, memory::dims* output_dims_tf_order,
+      const memory::dims& strides, const memory::dims& dilations,
+      memory::dims* output_dims_tf_order,
       memory::dims* output_dims_mkl_order, memory::dims* pad_l,
       memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
@@ -232,6 +249,8 @@ class MklDnnConvUtil {
     // Stride is vector of 2 elements: {s_r, s_c}
     int stride_rows = strides[0];
     int stride_cols = strides[1];
+    int dilation_rows = dilations[0];
+    int dilation_cols = dilations[1];
 
     // Output batch is same as input batch.
     int out_batch = GetTensorDim(input_shape, data_format_, 'N');
@@ -241,11 +260,13 @@ class MklDnnConvUtil {
     int64 out_rows = 0, out_cols = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
 
-    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
-                                 input_rows, filter_rows, stride_rows, padding_,
+    OP_REQUIRES_OK(context_,
+            GetWindowedOutputSizeVerboseV2(input_rows, filter_rows,
+                                 dilation_rows, stride_rows, padding_,
                                  &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
-                                 input_cols, filter_cols, stride_cols, padding_,
+    OP_REQUIRES_OK(context_,
+            GetWindowedOutputSizeVerboseV2(input_cols, filter_cols,
+                                 dilation_cols, stride_cols, padding_,
                                  &out_cols, &pad_left, &pad_right));
 
     // Tensorflow output is in data_format order. (NHWC or NCHW)
@@ -271,7 +292,8 @@ class MklDnnConvUtil {
   //
   // Function does not return anything, but sets error in context status.
   inline void GetOutputAndPadSizeInMklOrder(
-      size_t src_index, size_t filter_index, const memory::dims& strides,
+      size_t src_index, size_t filter_index,
+      const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
       memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
@@ -286,9 +308,9 @@ class MklDnnConvUtil {
                 errors::InvalidArgument("input must be 4-dimensional",
                                         input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
-                                  output_dims_tf_order, output_dims_mkl_order,
-                                  pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
+                                  strides, dilations, output_dims_tf_order,
+                                  output_dims_mkl_order, pad_l, pad_r);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -300,12 +322,14 @@ class MklDnnConvUtil {
   inline void GetConvFwdSizesInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
       memory::dims* input_dims, memory::dims* filter_dims,
-      memory::dims* strides, memory::dims* output_dims_tf_order,
+      memory::dims* strides, memory::dims *dilations,
+      memory::dims* output_dims_tf_order,
       memory::dims* output_dims_mkl_order, memory::dims* pad_l,
       memory::dims* pad_r) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
+    CHECK_NOTNULL(dilations);
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -316,7 +340,9 @@ class MklDnnConvUtil {
     GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims);
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
-    GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
+    GetDilationsInMklOrder(dilations);
+    GetOutputAndPadSizeInMklOrder(input_shape, filter_shape,
+                                  *strides, *dilations,
                                   output_dims_tf_order, output_dims_mkl_order,
                                   pad_l, pad_r);
     if (!context_->status().ok()) return;
@@ -344,7 +370,21 @@ class MklConv2DBackpropCommonOp : public OpKernel {
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
-
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
@@ -406,15 +446,16 @@ class MklConv2DBackpropCommonOp : public OpKernel {
       // By default, all dims are in MKL order. Only dims in TF order
       // are those with prefix tf_order.
       memory::dims outbprop_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
+      memory::dims padding_l, padding_r, dilations, strides, fwd_output_dims;
       memory::dims fwd_output_dims_tf_order;
 
       // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
+      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
+                             dilations_);
       conv_utl.GetConvFwdSizesInMklOrder(
           input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims,
-          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
-          &padding_r);
+          &strides, &dilations, &fwd_output_dims_tf_order, &fwd_output_dims,
+          &padding_l, &padding_r);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
@@ -437,10 +478,21 @@ class MklConv2DBackpropCommonOp : public OpKernel {
                                               memory::format::hwio);
       // Tensorflow Output of Conv2D is in data_format order.
       auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
-      auto fwd_desc = convolution_forward::desc(
-          prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md,
-          fwd_out_md, strides, padding_l, padding_r,
-          TFPaddingToMklDnnPadding(padding_));
+
+      const int kDilationH = 0, kDilationW = 1;
+      dilations[kDilationH] -= 1;
+      dilations[kDilationW] -= 1;
+      auto fwd_desc = (dilations[kDilationH] > 0 || dilations[kDilationW] > 0)?
+              convolution_forward::desc(prop_kind::forward,
+                     convolution_direct, fwd_input_md,
+                     fwd_filter_md, fwd_out_md,
+                     strides, dilations, padding_l, padding_r,
+                     TFPaddingToMklDnnPadding(padding_)) :
+              convolution_forward::desc(prop_kind::forward,
+                     convolution_direct, fwd_input_md,
+                     fwd_filter_md, fwd_out_md,
+                     strides, padding_l, padding_r,
+                     TFPaddingToMklDnnPadding(padding_));
       auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
 
       // Create memory for user data. Describe how the inputs and outputs of
@@ -485,8 +537,9 @@ class MklConv2DBackpropCommonOp : public OpKernel {
 
       // Operator-specific call to create and execute primitive.
       CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
-                      &outbackprop, &output, &output_tensor, strides, padding_l,
-                      padding_r, TFPaddingToMklDnnPadding(padding_),
+                      &outbackprop, &output, &output_tensor,
+                      strides, dilations, padding_l, padding_r,
+                      TFPaddingToMklDnnPadding(padding_),
                       bwd_output_dims, bwd_output_format);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
@@ -535,20 +588,21 @@ class MklConv2DBackpropCommonOp : public OpKernel {
   virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
 
   /// Create and execute the primitive storing output in the output_tensor.
-  virtual void CreatePrimitive(
-      OpKernelContext* context, const engine& cpu_engine,
-      const convolution_forward::primitive_desc& conv_fwd_pd,
-      MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
-      MklDnnData<T>* output, Tensor** output_tensor,
-      const memory::dims& strides, const memory::dims& padding_l,
-      const memory::dims& padding_r, padding_kind padding,
-      const memory::dims& bwd_output_dims,
-      memory::format bwd_output_format) = 0;
+  virtual void CreatePrimitive(OpKernelContext* context,
+    const engine& cpu_engine,
+    const convolution_forward::primitive_desc& conv_fwd_pd,
+    MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+    MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
+    const memory::dims& dilations, const memory::dims& padding_l,
+    const memory::dims& padding_r, padding_kind padding,
+    const memory::dims& bwd_output_dims,
+    memory::format bwd_output_format) = 0;
 
   // Get the data_format {NCHW, NHWC}
   TensorFormat GetTFDataFormat() { return data_format_; }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 8313224d7fe3e2d307d3642ced5b277b95c85cdb..62aafa793056f233ac84d1c8ca49bba1e73035c9 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -262,7 +262,6 @@ class MklFusedBatchNormOp : public OpKernel {
     }
 
     void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
       bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
       if (input_in_mkl_format) {
         mkl_lt_input =
@@ -818,8 +817,8 @@ class MklFusedBatchNormOp : public OpKernel {
       // set weights primitive
       // MKL-DNN packs scale & shift as "weights":
       // <scale>...<scale><shift>...<shift>
-      auto weights_desc =
-          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto weights_desc = memory::desc({2, static_cast<int>(depth_)},
+                                       MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
       T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
@@ -834,8 +833,8 @@ class MklFusedBatchNormOp : public OpKernel {
       }
 
       // set mean primitive
-      auto mean_desc =
-          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto mean_desc = memory::desc({1, static_cast<int>(depth_)},
+                                    MklDnnType<T>(), memory::format::nc);
       auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
       char* saved_mean_data_tf =
           reinterpret_cast<char*>(saved_mean_tensor->flat<T>().data());
@@ -845,8 +844,8 @@ class MklFusedBatchNormOp : public OpKernel {
           memory(mean_pd, reinterpret_cast<void*>(saved_mean_data_tf));
 
       // set variance primitive
-      auto variance_desc =
-          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto variance_desc = memory::desc({1, static_cast<int>(depth_)},
+                                        MklDnnType<T>(), memory::format::nc);
       auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
       char* saved_variance_data_tf =
           reinterpret_cast<char*>(saved_variance_tensor->flat<T>().data());
@@ -934,7 +933,7 @@ class MklFusedBatchNormOp : public OpKernel {
   bool is_training_;
   T* mean_values_;
   T* variance_values_;
-  size_t depth_;  // batch normalization is done for per channel.
+  int depth_;  // batch normalization is done for per channel.
 
   void ExtractParams(OpKernelContext* context) {
     const Tensor& input = MklGetInput(context, 0);
@@ -1110,19 +1109,12 @@ class MklFusedBatchNormGradOp : public OpKernel {
         return;
       }
 
-      if (dnn_shape_src.IsMklTensor())
-        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
-      else
-        ExtractParams(context);
-
-      memory::format format_m;
       if (dnn_shape_src.IsMklTensor()) {
-        if (dnn_shape_src.IsTensorInNCHWFormat())
-          format_m = memory::format::nchw;
-        else
-          format_m = memory::format::nhwc;
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      } else if (dnn_shape_diff_dst.IsMklTensor()) {
+        depth_ = dnn_shape_diff_dst.DimSize(MklDnnDims::Dim_C);
       } else {
-        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+        ExtractParams(context);
       }
 
       MklDnnData<T> src(&cpu_engine);
@@ -1146,20 +1138,20 @@ class MklFusedBatchNormGradOp : public OpKernel {
         diff_dst_dims =
             TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_);
 
-      // set src and diff_dst primitives
+      // set src and diff_dst primitives according to input layout
       memory::desc src_md({}, memory::data_undef, memory::format_undef);
       memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
-      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
-        if (dnn_shape_src.IsMklTensor()) {
-          src_md = dnn_shape_src.GetMklLayout();
-          diff_dst_md = src_md;
-        } else {
-          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
-          src_md = diff_dst_md;
-        }
+      if (dnn_shape_src.IsMklTensor()) {
+        src_md = dnn_shape_src.GetMklLayout();
       } else {
-        src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
-        diff_dst_md = src_md;
+        src_md =  memory::desc(src_dims, MklDnnType<T>(),
+                TFDataFormatToMklDnnDataFormat(tensor_format_));
+      }
+      if (dnn_shape_diff_dst.IsMklTensor()) {
+        diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+      } else {
+        diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
+                TFDataFormatToMklDnnDataFormat(tensor_format_));
       }
       src.SetUsrMem(src_md, &src_tensor);
       diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
@@ -1211,28 +1203,64 @@ class MklFusedBatchNormGradOp : public OpKernel {
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
       TensorShape tf_shape_diff_src;
-      if (dnn_shape_src.IsMklTensor()) {
+
+      // MKL-DNN's BN primitive not provide API to fetch internal format
+      // set common_md as OpMem
+      // src and diff_dst will reorder to common_md
+      // diff_src will set as common_md
+      memory::desc common_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        if (dnn_shape_src.IsMklTensor()) {
+          common_md = dnn_shape_src.GetMklLayout();
+        } else {
+          common_md = dnn_shape_diff_dst.GetMklLayout();
+        }
+      } else {
+        common_md = memory::desc(src_dims, MklDnnType<T>(),
+                TFDataFormatToMklDnnDataFormat(tensor_format_));
+      }
+      // if any of src and diff_dst as mkl layout,
+      // then we set diff_src as mkl layout
+      if (dnn_shape_src.IsMklTensor() ||
+              dnn_shape_diff_dst.IsMklTensor()) {
         dnn_shape_diff_src.SetMklTensor(true);
-        auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
+        // set diff_src's mkl layout as common_md
+        auto diff_src_pd = memory::primitive_desc(common_md, cpu_engine);
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
         dnn_shape_diff_src.SetElemType(MklDnnType<T>());
-        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
-                                       format_m);
-        dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(),
-                                         tensor_format_);
+        if (dnn_shape_src.IsMklTensor()) {
+          dnn_shape_diff_src.SetTfLayout(
+                  dnn_shape_src.GetDimension(),
+                  src_dims,
+                  dnn_shape_src.GetTfDataFormat());
+          dnn_shape_diff_src.SetTfDimOrder(
+                  dnn_shape_src.GetDimension(),
+                  tensor_format_);
+        } else {
+          dnn_shape_diff_src.SetTfLayout(
+                  dnn_shape_diff_dst.GetDimension(),
+                  src_dims,
+                  dnn_shape_diff_dst.GetTfDataFormat());
+          dnn_shape_diff_src.SetTfDimOrder(
+                  dnn_shape_diff_dst.GetDimension(),
+                  tensor_format_);
+        }
         tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
+        // both src and diff_dst are TensorFlow layout,
+        // so it is OK to get TensorFlow shape.
         tf_shape_diff_src = src_tensor.shape();
       }
       AllocateOutputSetMklShape(context, kDiffSrcIndex, &diff_src_tensor,
                                 tf_shape_diff_src, dnn_shape_diff_src);
 
-      diff_src.SetUsrMem(src_md, diff_src_tensor);
+      // set diff_src
+      diff_src.SetUsrMem(common_md, diff_src_tensor);
 
       prop_kind pk = prop_kind::backward;
       auto bnrm_bwd_desc = batch_normalization_backward::desc(
-          pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_,
+          pk, common_md, common_md, epsilon_,
           /* for inference, specify use_global_stats
              1. on fwd prop, use mean and variance
                 provided as inputs
@@ -1245,11 +1273,16 @@ class MklFusedBatchNormGradOp : public OpKernel {
       auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
           bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd);
 
+      std::vector<primitive> net;
+      src.CheckReorderToOpMem(memory::primitive_desc(common_md,
+                                   cpu_engine), &net);
+      diff_dst.CheckReorderToOpMem(memory::primitive_desc(common_md,
+                                   cpu_engine), &net);
+
       auto bnrm_bwd_op = batch_normalization_backward(
           bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(),
           diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m);
 
-      std::vector<primitive> net;
       net.push_back(bnrm_bwd_op);
       stream(stream::kind::eager).submit(net).wait();
 
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index e9a2376b545fcec97e1ced5c592351203abadd69..cda1402b035cdccc4677a8d074178dfc79170798 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
@@ -263,21 +264,18 @@ class MklInputConversionOp : public OpKernel {
 
  private:
   void Compute(OpKernelContext* context) override {
-    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    const int kInputIndex_0 = 0, kInputIndex_1 = 1;
+    const Tensor& input_tensor_0 = MklGetInput(context, kInputIndex_0);
     MklDnnShape input_shape_0;
-    GetMklShape(context, 0, &input_shape_0);
+    GetMklShape(context, kInputIndex_0, &input_shape_0);
 
-    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    const Tensor& input_tensor_1 = MklGetInput(context, kInputIndex_1);
     MklDnnShape input_shape_1;
-    GetMklShape(context, 1, &input_shape_1);
-
-    bool tf_shapes_are_same =
-        context->input(0).shape() == context->input(1).shape();
+    GetMklShape(context, kInputIndex_1, &input_shape_1);
 
-    VLOG(1) << "MklInputConversionOp: Input shapes are "
-            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
-            << context->input(0).shape().DebugString() << " and "
-            << context->input(1).shape().DebugString();
+    VLOG(1) << "MklInputConversionOp: Input shapes are: "
+            << context->input(kInputIndex_0).shape().DebugString() << " and "
+            << context->input(kInputIndex_1).shape().DebugString();
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // if both inputs are in TF format, just copy input tensors to output.
@@ -285,15 +283,20 @@ class MklInputConversionOp : public OpKernel {
       VLOG(1) << "MklInputConversionOp: No conversion needed, "
               << "copying TF inputs to output";
 
-      ForwardTfTensorInToOut(context, 0, 0);
-      ForwardTfTensorInToOut(context, 1, 1);
+      ForwardTfTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+      ForwardTfTensorInToOut(context, kInputIndex_1, kInputIndex_1);
       return;
     }
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      if (tf_shapes_are_same) {
+      // It is safer to compare the original TensorFlow shapes than to compare
+      // Mkl shapes since element wise ops are forwarded to Eigen
+      // implementation.
+      TensorShape tf_shape0 = input_shape_0.GetTfShape();
+      TensorShape tf_shape1 = input_shape_1.GetTfShape();
+      if (tf_shape0 == tf_shape1) {
         auto input0_md = input_shape_0.GetMklLayout();
         auto input1_md = input_shape_1.GetMklLayout();
 
@@ -302,16 +305,15 @@ class MklInputConversionOp : public OpKernel {
           VLOG(1) << "MklInputConversionOp: No conversion needed, "
                   << "copying MKL inputs with identical shapes to output";
 
-          ForwardMklTensorInToOut(context, 0, 0);
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         } else {
           VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
                      "different, "
                   << "need to convert to same format";
-
-          // Convert input0, and keep input1 unchanged
-          // Create MklDnnShape for output mkl tensor based on input0
+          // TODO: For now, input0 is converted and input1 is unchanged
+          //       we should choose the optimal MKL format to convert to.
           Tensor* tensor_out;
           MklDnnShape mkl_output_mkl_shape;
           mkl_output_mkl_shape.SetMklTensor(true);
@@ -324,7 +326,7 @@ class MklInputConversionOp : public OpKernel {
           mkl_output_mkl_shape.SetMklLayout(&input1_md);
 
           // Create output Mkl tensor for index 0
-          AllocateOutputSetMklShape(context, 0, &tensor_out,
+          AllocateOutputSetMklShape(context, kInputIndex_0, &tensor_out,
                                     input_tensor_0.shape(),
                                     mkl_output_mkl_shape);
 
@@ -342,7 +344,7 @@ class MklInputConversionOp : public OpKernel {
           stream(stream::kind::eager).submit(net).wait();
 
           // Input1 will be passed through
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         }
       }
@@ -359,13 +361,16 @@ class MklInputConversionOp : public OpKernel {
       // with MKL tensors)
       VLOG(1) << "MklInputConversionOp: Broadcast needed, "
               << "converted MKL inputs to TF format";
-
+      // TODO: Cleanup op_data_type and has_avx512f_ after these two parameters
+      //       are removed from ConvertMklToTf
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 0);
+                                           op_data_type, has_avx512f_,
+                                           kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 1);
-      SetDummyMklShapeOutput(context, 0);
-      SetDummyMklShapeOutput(context, 1);
+                                           op_data_type, has_avx512f_,
+                                           kInputIndex_1);
+      SetDummyMklShapeOutput(context, kInputIndex_0);
+      SetDummyMklShapeOutput(context, kInputIndex_1);
       return;
     }
 
@@ -377,7 +382,6 @@ class MklInputConversionOp : public OpKernel {
     const Tensor* mkl_tensor;
     const MklDnnShape* mkl_shape;
     const Tensor* tf_tensor;
-    MklDnnShape* tf_mkl_shape;
     uint mkl_tensor_index;
     uint tf_tensor_index;
     if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
@@ -385,14 +389,12 @@ class MklInputConversionOp : public OpKernel {
       mkl_shape = &input_shape_0;
       mkl_tensor_index = 0;
       tf_tensor = &input_tensor_1;
-      tf_mkl_shape = &input_shape_1;
       tf_tensor_index = 1;
     } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
       mkl_tensor = &input_tensor_1;
       mkl_shape = &input_shape_1;
       mkl_tensor_index = 1;
       tf_tensor = &input_tensor_0;
-      tf_mkl_shape = &input_shape_0;
       tf_tensor_index = 0;
     } else {
       CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
@@ -401,19 +403,7 @@ class MklInputConversionOp : public OpKernel {
     }
 
     // Broadcast is needed if the shapes are not the same
-    bool broadcast_needed;
-
-    size_t in0_size = 1;
-    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
-      in0_size *= mkl_shape->TfDimSize(i);
-
-    size_t in1_size = 1;
-    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
-      in1_size *= tf_tensor->shape().dim_size(i);
-
-    broadcast_needed = (in0_size != in1_size);
-
-    if (!broadcast_needed) {
+    if (mkl_shape->GetTfShape().num_elements() == tf_tensor->shape().num_elements() ) {
       // Both shapes are same, convert the TF input to MKL
       VLOG(1) << "MklInputConversionOp: No broadcast needed.";
       VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
@@ -442,13 +432,21 @@ class MklInputConversionOp : public OpKernel {
       auto input_tf_md = mkl_output_mkl_shape.GetTfLayout();
       tf_input.SetUsrMem(input_tf_md, tf_tensor);
 
-      // Create reorder between tensorflow layout and Mkl layout.
+      // Create reorder between tensorflow layout and Mkl layout if necessary
       std::vector<primitive> net;
-      CHECK_EQ(tf_input.CheckReorderToOpMem(
+      bool reordered = tf_input.CheckReorderToOpMem(
                    memory::primitive_desc(output_mkl_md, cpu_engine),
-                   tensor_out, &net),
-               true);
-      stream(stream::kind::eager).submit(net).wait();
+                   tensor_out, &net);
+      if(!reordered) {
+        // This is the case that the TF tensor has the same shape and format of
+        // mkl tensor. However, tf_tensor can not be simply forwarded to the output
+        // tensor since mkl data tensor is always one dimensional tensor. 
+        // Tensor::CopyFrom shares the buffer of the other tensor while set its shape
+        // to the other tensor. 
+        tensor_out->CopyFrom(*tf_tensor, tensor_out->shape());
+      }
+      else  
+        stream(stream::kind::eager).submit(net).wait();
 
       // -- The tensor in MKL format passes through --
       ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
@@ -467,8 +465,9 @@ class MklInputConversionOp : public OpKernel {
     }
 
     VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(0)->shape().DebugString() << " and "
-            << context->mutable_output(1)->shape().DebugString();
+            << context->mutable_output(kInputIndex_0)->shape().DebugString()
+            << " and "
+            << context->mutable_output(kInputIndex_1)->shape().DebugString();
 
     VLOG(1) << "MklInputConversion completed successfully.";
   }
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 5f0a12a1fb9bff3086e05918e23b8396196eb389..eef254cdadbde377c463ea2c5dad693d890d1dc5 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -88,7 +88,8 @@ class MklLRNOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
     OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
     workspace_enabled_ = false;
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -357,7 +358,8 @@ class MklLRNGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
     OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
     workspace_enabled_ = false;
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -535,7 +537,6 @@ class MklLRNGradOp : public OpKernel {
                                 Tensor* mkl_tmp_outimage_buf_tensor) {
       const Tensor& in_grads = MklGetInput(context, 0);
       const Tensor& in_image = MklGetInput(context, 1);
-      const Tensor& out_image = MklGetInput(context, 2);
       const Tensor& workspace = MklGetInput(
           context,
           3); /*Worskpsace is enabled, get the buffer to the workspace */
@@ -544,8 +545,6 @@ class MklLRNGradOp : public OpKernel {
           static_cast<const void*>(in_grads.flat<T>().data()));
       void* user_fwd_input = const_cast<void*>(
           static_cast<const void*>(in_image.flat<T>().data()));
-      void* user_fwd_output = const_cast<void*>(
-          static_cast<const void*>(out_image.flat<T>().data()));
       void* workspace_buffer = const_cast<void*>(
           static_cast<const void*>(workspace.flat<T>().data()));
 
@@ -753,7 +752,8 @@ class MklLRNOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
     OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
     workspace_enabled_ = false;
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1002,7 +1002,8 @@ class MklLRNGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
     OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
     workspace_enabled_ = false;
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1044,7 +1045,6 @@ class MklLRNGradOp : public OpKernel {
       // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor.
       const Tensor& input_grad_tensor = MklGetInput(context, kIdxGradient);
       const Tensor& orig_input_tensor = MklGetInput(context, kIdxOrigInput);
-      const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
 
       // Get input sizes in MKL-DNN required NCHW format.
       // LRN does not have data_format attribute. But by default it has
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
index 25ad8c94a78a82cc7e4a6f98903aecf1d5a0d1b4..dfa6cecc9bdc231ebf35e587183b5f84b17489e0 100644
--- a/tensorflow/core/kernels/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -171,13 +171,13 @@ class MklMatMulOp : public OpKernel {
   // For detailed info about parameters, look at FP32 function description.
   void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                    const int k, const complex64* a, const int lda,
-                   const complex64* b, const int ldb,
-                   complex64* c, int const ldc) {
+                   const complex64* b, const int ldb, complex64* c,
+                   int const ldc) {
     const MKL_Complex8 alpha = {1.0f, 0.0f};
     const MKL_Complex8 beta = {0.0f, 0.0f};
     cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
-                transb ? CblasTrans : CblasNoTrans,
-                m, n, k, &alpha, reinterpret_cast<const MKL_Complex8*>(a), lda,
+                transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
+                reinterpret_cast<const MKL_Complex8*>(a), lda,
                 reinterpret_cast<const MKL_Complex8*>(b), ldb, &beta,
                 reinterpret_cast<MKL_Complex8*>(c), ldc);
   }
@@ -187,13 +187,13 @@ class MklMatMulOp : public OpKernel {
   // description.
   void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                    const int k, const complex128* a, const int lda,
-                   const complex128* b, const int ldb,
-                   complex128* c, const int ldc) {
+                   const complex128* b, const int ldb, complex128* c,
+                   const int ldc) {
     const MKL_Complex16 alpha = {1.0, 0.0};
     const MKL_Complex16 beta = {0.0, 0.0};
     cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
-                transb ? CblasTrans : CblasNoTrans,
-                m, n, k, &alpha, reinterpret_cast<const MKL_Complex16*>(a), lda,
+                transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
+                reinterpret_cast<const MKL_Complex16*>(a), lda,
                 reinterpret_cast<const MKL_Complex16*>(b), ldb, &beta,
                 reinterpret_cast<MKL_Complex16*>(c), ldc);
   }
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 14607f26e0ccd1028dd62343000d90ac8451d7bb..ea537524b11ef1362ff08b79ae25ca6e7048a9cd 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -69,7 +69,8 @@ class MklMaxPoolingOp : public OpKernel {
     // We may not get this attribute for this node if it does not go through
     // graph rewrite pass. So we do not check for error while retrieving this
     // attribute value.
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -118,7 +119,6 @@ class MklMaxPoolingOp : public OpKernel {
                               mkl_out_shape);
 
     Tensor* workspace_tensor;
-    void* workspace_buf = nullptr;
 
     TensorShape workspace_shape;
     mkl_workspace_shape.SetMklTensor(false);
@@ -226,7 +226,8 @@ class MklMaxPoolingGradOp : public OpKernel {
     // We may not get this attribute for this node if it does not go through
     // graph rewrite pass. So we do not check for error while retrieving this
     // attribute value.
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 51db3991e2a24f087771f571cd91fc9fbb26040b..1ed43834dd8803679d07a59b2d3a14e3c4953995 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
-#include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
@@ -368,8 +367,11 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.MklCleanup();
 }
 
+
+
 #else  // INTEL_MKL_ML
 
+
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
  public:
@@ -390,7 +392,7 @@ class MklReluOpBase : public OpKernel {
 
       Tensor* dst_tensor = nullptr;
       if (src_tensor.dims() == 0) {
-        Compute_Scalar(context);
+        Compute_Scalar(context); // scalar case doesn't use in-place operation
         return;
       }
 
@@ -435,11 +437,17 @@ class MklReluOpBase : public OpKernel {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, dst_index, &dst_tensor, tf_shape_dst,
-                                dnn_shape_dst);
+      
+      // Allocate output and MklDnnShape tensors separately for possible
+      // in-place operation
+      OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                      {static_cast<const int>(src_index)},
+                                      static_cast<const int>(dst_index),
+                                      tf_shape_dst, &dst_tensor));
+      AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst);
 
       // Destination memory descriptor is same as source memory descriptor.
-      auto dst_md = src_md;
+      auto &dst_md = src_md;
       dst.SetUsrMem(dst_md, dst_tensor);
 
       // execute net
@@ -490,7 +498,7 @@ class MklReluGradOpBase : public OpKernel {
 
       int src_dims_size = src_tensor.dims();
       if (src_dims_size == 0) {
-        Compute_Scalar(context);
+        Compute_Scalar(context); // scalar case doesn't use in-place operation
         return;
       }
 
@@ -579,21 +587,37 @@ class MklReluGradOpBase : public OpKernel {
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
       TensorShape tf_shape_diff_src;
-      if (dnn_shape_src.IsMklTensor()) {
+      if (dnn_shape_src.IsMklTensor() ||
+              dnn_shape_diff_dst.IsMklTensor()) {
         dnn_shape_diff_src.SetMklTensor(true);
         auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc();
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
         dnn_shape_diff_src.SetElemType(MklDnnType<T>());
-        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
-                                       dnn_shape_src.GetSizesAsMklDnnDims(),
-                                       dnn_shape_src.GetTfDataFormat());
+        if (dnn_shape_src.IsMklTensor()) {
+          dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
+                                         dnn_shape_src.GetSizesAsMklDnnDims(),
+                                         dnn_shape_src.GetTfDataFormat());
+        } else {
+          dnn_shape_diff_src.SetTfLayout(dnn_shape_diff_dst.GetDimension(),
+                                 dnn_shape_diff_dst.GetSizesAsMklDnnDims(),
+                                 dnn_shape_diff_dst.GetTfDataFormat());
+        }
         tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
+        // both src and diff_dst are TensorFlow layout,
+        // so it is ok to get TensorFlow shape.
         tf_shape_diff_src = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
-                                tf_shape_diff_src, dnn_shape_diff_src);
+
+      // Allocate diff_src and MklDnnShape tensors separately for possible
+      // in-place operation
+      OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                      {static_cast<const int>(diff_dst_index)},
+                                      static_cast<const int>(diff_src_index),
+                                      tf_shape_diff_src,
+                                      &diff_src_tensor));
+      AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src);
 
       // diff_src memory descriptor is same as memory descriptor for both
       // inputs.
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 5dbc4a2709e2bc379ae3b9aa68ed14f3d6893e7c..2cfde1f6fd4112ea1b4e489be3d9ce0014cbaa6a 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -266,7 +266,9 @@ class MklReshapeOp : public OpKernel {
                                                    &net)) {
               stream(stream::kind::eager).submit(net).wait();
             } else {
-              output_tensor->CopyFrom(input_tensor, shape_to);
+              OP_REQUIRES(
+                  context, output_tensor->CopyFrom(input_tensor, shape_to),
+                  errors::InvalidArgument("invalid input tensor shape"));
             }
             return;
           } else {
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index aceef1e234eff3660b33f5a091a2cd10e25ea2f9..f79e18cff29de5682ac2db445160d9346425414f 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -27,7 +27,6 @@ limitations under the License.
 
 #include "mkldnn.h"
 #include "mkldnn_types.h"
-#include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 #include "mkldnn.hpp"
@@ -103,7 +102,7 @@ class MklSoftmaxOp : public OpKernel {
       // Softmax MklDnn output layout is same as input layout.
       auto dst_pd = src.GetUsrMemPrimDesc();
 
-      // if input is MKL shape, ouput is also MKL shape.
+      // if input is MKL shape, output is also MKL shape.
       // if input is TF shape, output is also TF shape
       if (src_mkl_shape.IsMklTensor()) {
         output_mkl_shape.SetMklTensor(true);
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index ddea9e281b2fbcf0e061fd2bf2758984833a3727..4120f013acd7a714d3015d4c6cdce53d0161d677 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
index b44b4d6f542ed0128a83d20eedf6629f67427867..3f07b317c4d915fd7d304dbbab966837da64757a 100644
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -63,25 +63,31 @@ INSTANTIATE(double, d)
 #undef INSTANTIATE
 
 template <>
-Status MKLTranspose2D<complex64>(const char trans, const Tensor& in, Tensor* out) {
-    const MKL_Complex8 alpha = { 1.0f, 0.0f };
-    mkl_comatcopy('R', trans, in.dim_size(0), in.dim_size(1), alpha,
-                  reinterpret_cast<const MKL_Complex8*>(in.flat<complex64>().data()),
-                  in.dim_size(1),
-                  reinterpret_cast<MKL_Complex8*>(const_cast<complex64*>(out->flat<complex64>().data())),
-                  in.dim_size(0));
-    return Status::OK();
+Status MKLTranspose2D<complex64>(const char trans, const Tensor& in,
+                                 Tensor* out) {
+  const MKL_Complex8 alpha = {1.0f, 0.0f};
+  mkl_comatcopy(
+      'R', trans, in.dim_size(0), in.dim_size(1), alpha,
+      reinterpret_cast<const MKL_Complex8*>(in.flat<complex64>().data()),
+      in.dim_size(1),
+      reinterpret_cast<MKL_Complex8*>(
+          const_cast<complex64*>(out->flat<complex64>().data())),
+      in.dim_size(0));
+  return Status::OK();
 }
 
 template <>
-Status MKLTranspose2D<complex128>(const char trans, const Tensor& in, Tensor* out) {
-    const MKL_Complex16 alpha = { 1.0, 0.0 };
-    mkl_zomatcopy('R', trans, in.dim_size(0), in.dim_size(1), alpha,
-                  reinterpret_cast<const MKL_Complex16*>(in.flat<complex128>().data()),
-                  in.dim_size(1),
-                  reinterpret_cast<MKL_Complex16*>(const_cast<complex128*>(out->flat<complex128>().data())),
-                  in.dim_size(0));
-	return Status::OK();
+Status MKLTranspose2D<complex128>(const char trans, const Tensor& in,
+                                  Tensor* out) {
+  const MKL_Complex16 alpha = {1.0, 0.0};
+  mkl_zomatcopy(
+      'R', trans, in.dim_size(0), in.dim_size(1), alpha,
+      reinterpret_cast<const MKL_Complex16*>(in.flat<complex128>().data()),
+      in.dim_size(1),
+      reinterpret_cast<MKL_Complex16*>(
+          const_cast<complex128*>(out->flat<complex128>().data())),
+      in.dim_size(0));
+  return Status::OK();
 }
 
 static const char kMKLTranspose = 'T';
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index d086abb24760f1ab946605fd422a4fd0d5fc866d..7a647884486b52943c068efce6c5cccc73a97855 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/stateless_random_ops.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
@@ -127,18 +128,16 @@ struct MultinomialFunctor<CPUDevice, T, OutputType> {
 
 }  // namespace functor
 
+namespace {
+
 // Samples from a multinomial distribution.
 template <typename Device, typename T, typename OutputType>
 class MultinomialOp : public OpKernel {
  public:
-  explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, generator_.Init(context));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& logits_t = ctx->input(0);
-    const Tensor& num_samples_t = ctx->input(1);
+  explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {}
 
+  void DoCompute(OpKernelContext* ctx, const Tensor& logits_t,
+                 const Tensor& num_samples_t, GuardedPhiloxRandom* generator) {
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_t.shape()),
                 errors::InvalidArgument("logits should be a matrix, got shape ",
                                         logits_t.shape().DebugString()));
@@ -194,7 +193,7 @@ class MultinomialOp : public OpKernel {
       // CPU generates doubles = 2 samples per number.
       if (std::is_same<Device, CPUDevice>::value) num_samples_ceil_4 *= 2;
       auto rng =
-          generator_.ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
+          generator->ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
       functor::MultinomialFunctor<Device, T, OutputType>()(
           ctx, ctx->eigen_device<Device>(), logits_t.matrix<T>(),
           noises.flat<float>(), scores.flat<float>(), scratch.flat<float>(),
@@ -202,24 +201,38 @@ class MultinomialOp : public OpKernel {
           samples_t->matrix<OutputType>());
     }
   }
+};
+
+template <typename Device, typename T, typename OutputType>
+class StatefulMultinomialOp : public MultinomialOp<Device, T, OutputType> {
+ public:
+  explicit StatefulMultinomialOp(OpKernelConstruction* ctx)
+      : MultinomialOp<Device, T, OutputType>(ctx) {
+    OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& logits_t = ctx->input(0);
+    const Tensor& num_samples_t = ctx->input(1);
+    this->DoCompute(ctx, logits_t, num_samples_t, &generator_);
+  }
 
  private:
   GuardedPhiloxRandom generator_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(MultinomialOp);
 };
 
-#define REGISTER(TYPE)                                                   \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<TYPE>("T")                 \
-                              .TypeConstraint("output_dtype", DT_INT32), \
-                          MultinomialOp<CPUDevice, TYPE, int32>);        \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<TYPE>("T")                 \
-                              .TypeConstraint("output_dtype", DT_INT64), \
-                          MultinomialOp<CPUDevice, TYPE, int64>);
+// TODO(b/77906027): Add a TPU implementation.
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                             \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT32),  \
+                          StatefulMultinomialOp<CPUDevice, TYPE, int32>); \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                             \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT64),  \
+                          StatefulMultinomialOp<CPUDevice, TYPE, int64>);
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -233,13 +246,83 @@ TF_CALL_double(REGISTER);
                               .HostMemory("num_samples")                 \
                               .TypeConstraint<TYPE>("T")                 \
                               .TypeConstraint("output_dtype", DT_INT32), \
-                          MultinomialOp<GPUDevice, TYPE, int32>)         \
+                          StatefulMultinomialOp<GPUDevice, TYPE, int32>) \
   REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
                               .Device(DEVICE_GPU)                        \
                               .HostMemory("num_samples")                 \
                               .TypeConstraint<TYPE>("T")                 \
                               .TypeConstraint("output_dtype", DT_INT64), \
-                          MultinomialOp<GPUDevice, TYPE, int64>)
+                          StatefulMultinomialOp<GPUDevice, TYPE, int64>)
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#endif  // GOOGLE_CUDA
+
+template <typename Device, typename T, typename OutputType>
+class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
+ public:
+  explicit StatelessMultinomialOp(OpKernelConstruction* ctx)
+      : MultinomialOp<Device, T, OutputType>(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& logits_t = ctx->input(0);
+    const Tensor& num_samples_t = ctx->input(1);
+
+    const Tensor& seed_t = ctx->input(2);
+    OP_REQUIRES(ctx, seed_t.dims() == 1 && seed_t.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_t.shape().DebugString()));
+
+    random::PhiloxRandom::Key key;
+    random::PhiloxRandom::ResultType counter;
+    OP_REQUIRES_OK(ctx, GenerateKey(seed_t, &key, &counter));
+
+    GuardedPhiloxRandom generator;
+    generator.Init(counter, key);
+
+    this->DoCompute(ctx, logits_t, num_samples_t, &generator);
+  }
+
+ private:
+  GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER(TYPE)                                                     \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                     \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<TYPE>("T")                   \
+                              .TypeConstraint("output_dtype", DT_INT32),   \
+                          StatelessMultinomialOp<CPUDevice, TYPE, int32>); \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                     \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<TYPE>("T")                   \
+                              .TypeConstraint("output_dtype", DT_INT64),   \
+                          StatelessMultinomialOp<CPUDevice, TYPE, int64>);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#if GOOGLE_CUDA
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("num_samples")                  \
+                              .HostMemory("seed")                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT32),  \
+                          StatelessMultinomialOp<GPUDevice, TYPE, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("num_samples")                  \
+                              .HostMemory("seed")                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT64),  \
+                          StatelessMultinomialOp<GPUDevice, TYPE, int64>)
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -248,4 +331,6 @@ TF_CALL_double(REGISTER);
 
 #endif  // GOOGLE_CUDA
 
+}  // end namespace
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ddb7a606c1a7f0264c7c4a9cbb2f97095d9fee01
--- /dev/null
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -0,0 +1,249 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <deque>
+#include <utility>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace {
+
+class Mutex : public ResourceBase {
+ public:
+  explicit Mutex(OpKernelContext* c, const string& name)
+      : locked_(false),
+        thread_pool_(new thread::ThreadPool(
+            c->env(), ThreadOptions(),
+            strings::StrCat("mutex_lock_thread_", SanitizeThreadSuffix(name)),
+            1 /* num_threads */, false /* low_latency_hint */)),
+        name_(name) {
+    VLOG(2) << "Creating mutex with name " << name << ": " << this;
+  }
+
+  string DebugString() override { return strings::StrCat("Mutex ", name_); }
+
+  class LockReleaser {
+   public:
+    explicit LockReleaser(Mutex* mutex) : mutex_(mutex) {}
+
+    LockReleaser(const LockReleaser&) = delete;
+    LockReleaser& operator=(const LockReleaser&) = delete;
+
+    virtual ~LockReleaser() {
+      VLOG(3) << "Destroying LockReleaser " << this << " for mutex: " << mutex_;
+      if (mutex_) {
+        mutex_lock lock(mutex_->mu_);
+        mutex_->locked_ = false;
+        mutex_->cv_.notify_all();
+        VLOG(3) << "Destroying LockReleaser " << this
+                << ": sent notifications.";
+      }
+    }
+
+   private:
+    Mutex* mutex_;
+  };
+
+  struct SharedLockReleaser {
+    std::shared_ptr<LockReleaser> shared_lock;
+
+    explicit SharedLockReleaser(std::shared_ptr<LockReleaser>&& lock)
+        : shared_lock(std::forward<decltype(lock)>(lock)) {
+      VLOG(3) << "Creating shared_ptr of " << shared_lock.get()
+              << " count is: " << shared_lock.use_count();
+    }
+
+    SharedLockReleaser(SharedLockReleaser&& rhs)
+        : shared_lock(std::move(rhs.shared_lock)) {
+      VLOG(3) << "Moving SharedLockReleaser of " << shared_lock.get()
+              << " count is: " << shared_lock.use_count();
+    }
+
+    SharedLockReleaser(const SharedLockReleaser& rhs)
+        : shared_lock(rhs.shared_lock) {
+      VLOG(3) << "Copying SharedLockReleaser of " << shared_lock.get()
+              << " count is: " << shared_lock.use_count();
+    }
+
+    ~SharedLockReleaser() {
+      VLOG(3) << "Destroying SharedLockReleaser of " << shared_lock.get()
+              << " count is: " << shared_lock.use_count();
+    }
+
+    void Encode(VariantTensorData*) const {
+      // Not supported.
+    }
+
+    bool Decode(const VariantTensorData&) {
+      return false;  // Not supported.
+    }
+  };
+
+  void AcquireAsync(
+      OpKernelContext* c,
+      std::function<void(const Status& s, SharedLockReleaser lock)> fn) {
+    CancellationManager* cm = c->cancellation_manager();
+    CancellationToken token{};
+    bool* cancelled = nullptr;
+    if (cm) {
+      cancelled = new bool(false);  // GUARDED_BY(mu_);
+      token = cm->get_cancellation_token();
+      const bool already_cancelled =
+          !cm->RegisterCallback(token, [this, cancelled]() {
+            mutex_lock lock(mu_);
+            *cancelled = true;
+            cv_.notify_all();
+          });
+      if (already_cancelled) {
+        delete cancelled;
+        fn(errors::Cancelled("Lock acquisition cancelled."),
+           SharedLockReleaser{nullptr});
+        return;
+      }
+    }
+    thread_pool_->Schedule(std::bind(
+        [this, cm, cancelled,
+         token](std::function<void(const Status& s, SharedLockReleaser&& lock)>
+                    fn_) {
+          bool local_locked;
+          {
+            mutex_lock lock(mu_);
+            while (locked_ && !(cancelled && *cancelled)) {
+              cv_.wait(lock);
+            }
+            local_locked = locked_ = !(cancelled && *cancelled);
+          }
+          if (cm) {
+            cm->DeregisterCallback(token);
+            delete cancelled;
+          }
+          if (local_locked) {  // Not cancelled.
+            fn_(Status::OK(),
+                SharedLockReleaser{std::make_shared<LockReleaser>(this)});
+          } else {
+            fn_(errors::Cancelled("Lock acqusition cancelled."),
+                SharedLockReleaser{nullptr});
+          }
+        },
+        std::move(fn)));
+  }
+
+ private:
+  mutex mu_;
+  condition_variable cv_ GUARDED_BY(mu_);
+  bool locked_ GUARDED_BY(mu_);
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+  string name_;
+};
+
+}  // namespace
+
+class MutexLockOp : public AsyncOpKernel {
+ public:
+  explicit MutexLockOp(OpKernelConstruction* c) : AsyncOpKernel(c) {}
+
+ public:
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    Mutex* mutex = nullptr;
+    OP_REQUIRES_OK_ASYNC(
+        c,
+        LookupOrCreateResource<Mutex>(c, HandleFromInput(c, 0), &mutex,
+                                      [c](Mutex** ptr) {
+                                        *ptr = new Mutex(
+                                            c, HandleFromInput(c, 0).name());
+                                        return Status::OK();
+                                      }),
+        done);
+
+    Tensor* variant;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, TensorShape({}), &variant),
+                         done);
+
+    mutex->AcquireAsync(
+        c, std::bind(
+               [c, variant, mutex](DoneCallback done_,
+                                   // End of bound arguments.
+                                   const Status& s,
+                                   Mutex::SharedLockReleaser&& lock) {
+                 VLOG(2) << "Finished locking mutex " << mutex
+                         << " with lock: " << lock.shared_lock.get()
+                         << " status: " << s.ToString();
+                 if (s.ok()) {
+                   variant->scalar<Variant>()() = std::move(lock);
+                 } else {
+                   c->SetStatus(s);
+                 }
+                 mutex->Unref();
+                 done_();
+               },
+               std::move(done), std::placeholders::_1, std::placeholders::_2));
+  }
+};
+
+class ConsumeMutexLockOp : public OpKernel {
+ public:
+  explicit ConsumeMutexLockOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* c) override {
+    VLOG(2) << "Executing ConsumeMutexLockOp";
+    const Tensor& lock_t = c->input(0);
+    OP_REQUIRES(
+        c, lock_t.dims() == 0,
+        errors::InvalidArgument("Expected input to be a scalar, saw shape: ",
+                                lock_t.shape().DebugString()));
+    OP_REQUIRES(
+        c, lock_t.dtype() == DT_VARIANT,
+        errors::InvalidArgument("Expected input to be a variant, saw type: ",
+                                DataTypeString(lock_t.dtype())));
+    const auto* lock =
+        lock_t.scalar<Variant>()().get<Mutex::SharedLockReleaser>();
+    OP_REQUIRES(c, lock,
+                errors::InvalidArgument(
+                    "Expected input to contain a SharedLockReleaser "
+                    "object, but saw variant: '",
+                    lock_t.scalar<Variant>()().DebugString(), "'"));
+    const int use_count = lock->shared_lock.use_count();
+    OP_REQUIRES(
+        c, use_count == 1,
+        errors::InvalidArgument("Expected use count of lock to be 1, but saw: ",
+                                use_count));
+  }
+
+  bool IsExpensive() override { return false; }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MutexLock").Device(DEVICE_CPU), MutexLockOp);
+
+REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_CPU),
+                        ResourceHandleOp<Mutex>);
+
+REGISTER_KERNEL_BUILDER(Name("ConsumeMutexLock").Device(DEVICE_CPU),
+                        ConsumeMutexLockOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index c3d24e50effb3fe5184e264064393a7f339105f0..313d40c082b3e334a01ba97eaf4449e1940b013a 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -12,18 +12,6 @@ load(
     "tf_kernel_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_kernel_library(
     name = "neon_depthwise_conv_op",
     hdrs = [
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 67d9217b9502a30f5727b6a91fbf36da872ab972..9387fb13bc252418d247b25464c67b90a864901d 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -147,7 +148,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -160,7 +161,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
@@ -308,7 +309,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -322,7 +323,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index eff3e4d92cc3ecc3b172a970564225c8b204cdd4..41494f56c5ea6b099f8eb7e81d50c83269aa278f 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -70,7 +70,7 @@ class PadOp : public OpKernel {
             "The first dimension of paddings must be the rank of inputs",
             in1.shape().DebugString(), " ", in0.shape().DebugString()));
 
-    T pad_value(0);
+    T pad_value = T();
     if (context->num_inputs() == 3) {
       const Tensor& constant_values = context->input(2);
       OP_REQUIRES(
@@ -104,42 +104,147 @@ class PadOp : public OpKernel {
       return;
     }
 
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    TensorShape collapsed_input_shape;
+    TensorShape collapsed_output_shape;
+    Tensor collapsed_paddings;
+    if (fixed_dims > 1 &&
+        CollapseAdjacentNonPaddedDimensions(
+            in0.shape(), in1, output_shape, &collapsed_input_shape,
+            &collapsed_paddings, &collapsed_output_shape)) {
+      Tensor collapsed_input;
+      CHECK(collapsed_input.CopyFrom(in0, collapsed_input_shape));
+      Tensor collapsed_output;
+      AllocatorAttributes alloc_attrs;
+      alloc_attrs.set_on_host(context->input_memory_type(0) == HOST_MEMORY);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(collapsed_input.dtype(),
+                                            collapsed_output_shape,
+                                            &collapsed_output, alloc_attrs));
+      const Tensor& collapsed_paddings_ref = collapsed_paddings;
+      typename TTypes<Tpadding>::ConstMatrix collapsed_paddings_matrix =
+          collapsed_paddings_ref.matrix<Tpadding>();
 
+      OperateWithVariableRank(context, collapsed_input_shape.dims(),
+                              collapsed_input, collapsed_paddings_matrix,
+                              pad_value, &collapsed_output);
+
+      Tensor output;
+      CHECK(output.CopyFrom(collapsed_output, output_shape));
+      context->set_output(0, output);
+    } else {
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, output_shape, &output));
+      OperateWithVariableRank(context, fixed_dims, in0, paddings, pad_value,
+                              output);
+    }
+  }
+
+ private:
+  // Collapses adjacent dimensions that are not padded to one dimension for
+  // speed. Returns true if any two dimensions are collapsed. For example,
+  //
+  //   Pad(input_shape=[8, 28, 28, 3],
+  //       paddings=[[0, 0], [0, 0], [0, 0], [0, 1]]
+  // is equivalent to
+  //   Pad(input_shape=[6272, 3],
+  //       paddings=[[0, 0], [0, 1]])
+  //
+  // input_shape: the original input shape.
+  // paddings_as_tensor: the original paddings.
+  // output_shape: the original output shape.
+  // collapsed_input_shape: the input shape after collapsing.
+  // collapsed_paddings_as_tensor: the paddings after collapsing.
+  // collapsed_output_shape: the output shape after collapsing.
+  static bool CollapseAdjacentNonPaddedDimensions(
+      const TensorShape& input_shape, const Tensor& paddings_as_tensor,
+      const TensorShape& output_shape, TensorShape* collapsed_input_shape,
+      Tensor* collapsed_paddings_as_tensor,
+      TensorShape* collapsed_output_shape) {
+    bool collapsed = false;
+    typename TTypes<Tpadding>::ConstMatrix paddings =
+        paddings_as_tensor.matrix<Tpadding>();
+    std::vector<std::pair<int, int>> collapsed_paddings;
+    int i = 0;
+    while (i < paddings.dimension(0)) {
+      if (paddings(i, 0) != 0 || paddings(i, 1) != 0) {
+        // If padded, copy the original dimension over.
+        collapsed_input_shape->InsertDim(collapsed_input_shape->dims(),
+                                         input_shape.dim_size(i));
+        collapsed_output_shape->InsertDim(collapsed_output_shape->dims(),
+                                          output_shape.dim_size(i));
+        collapsed_paddings.push_back({paddings(i, 0), paddings(i, 1)});
+        ++i;
+      } else {
+        // If not padded, find the next dimension that is padded and collapse
+        // all dimensions in between to one dimension.
+        int64 collapsed_input_dim_size = input_shape.dim_size(i);
+        int64 collapsed_output_dim_size = output_shape.dim_size(i);
+        ++i;
+        while (i < paddings.dimension(0) && paddings(i, 0) == 0 &&
+               paddings(i, 1) == 0) {
+          collapsed = true;
+          collapsed_input_dim_size *= input_shape.dim_size(i);
+          collapsed_output_dim_size *= output_shape.dim_size(i);
+          ++i;
+        }
+        collapsed_input_shape->InsertDim(collapsed_input_shape->dims(),
+                                         collapsed_input_dim_size);
+        collapsed_output_shape->InsertDim(collapsed_output_shape->dims(),
+                                          collapsed_output_dim_size);
+        collapsed_paddings.push_back({0, 0});
+      }
+    }
+
+    // Copy collapsed_paddings to collapsed_paddings_as_tensor.
+    *collapsed_paddings_as_tensor =
+        Tensor(paddings_as_tensor.dtype(),
+               TensorShape({static_cast<int64>(collapsed_paddings.size()), 2}));
+    auto collapsed_paddings_as_matrix =
+        collapsed_paddings_as_tensor->matrix<Tpadding>();
+    for (size_t i = 0; i < collapsed_paddings.size(); ++i) {
+      collapsed_paddings_as_matrix(i, 0) = collapsed_paddings[i].first;
+      collapsed_paddings_as_matrix(i, 1) = collapsed_paddings[i].second;
+    }
+    return collapsed;
+  }
+
+  void OperateWithVariableRank(OpKernelContext* context, int fixed_dims,
+                               const Tensor& input,
+                               typename TTypes<Tpadding>::ConstMatrix paddings,
+                               T pad_value, Tensor* output) {
     // Invoke the dims-specific implementation.
     switch (fixed_dims) {
       case 0:
-        Operate<0>(context, in0.tensor<T, 0>(), paddings, pad_value, output);
+        Operate<0>(context, input.tensor<T, 0>(), paddings, pad_value, output);
         break;
       case 1:
         // TODO(irving): Once Pad doesn't need a scalar special case,
         // change flat to tensor.  That is, once !allow_legacy_scalars().
-        Operate<1>(context, in0.flat<T>(), paddings, pad_value, output);
+        Operate<1>(context, input.flat<T>(), paddings, pad_value, output);
         break;
       case 2:
-        Operate<2>(context, in0.tensor<T, 2>(), paddings, pad_value, output);
+        Operate<2>(context, input.tensor<T, 2>(), paddings, pad_value, output);
         break;
       case 3:
-        Operate<3>(context, in0.tensor<T, 3>(), paddings, pad_value, output);
+        Operate<3>(context, input.tensor<T, 3>(), paddings, pad_value, output);
         break;
       case 4:
-        Operate<4>(context, in0.tensor<T, 4>(), paddings, pad_value, output);
+        Operate<4>(context, input.tensor<T, 4>(), paddings, pad_value, output);
         break;
       case 5:
-        Operate<5>(context, in0.tensor<T, 5>(), paddings, pad_value, output);
+        Operate<5>(context, input.tensor<T, 5>(), paddings, pad_value, output);
         break;
       case 6:
-        Operate<6>(context, in0.tensor<T, 6>(), paddings, pad_value, output);
+        Operate<6>(context, input.tensor<T, 6>(), paddings, pad_value, output);
         break;
       default:
         OP_REQUIRES(context, false,
                     errors::InvalidArgument("Only ranks up to 6 supported: ",
-                                            in0.shape().DebugString()));
+                                            input.shape().DebugString()));
     }
   }
 
- private:
   template <int Dims>
   void Operate(OpKernelContext* context,
                typename TTypes<T, Dims>::ConstTensor input,
@@ -186,6 +291,7 @@ class PadOp : public OpKernel {
                           PadOp<CPUDevice, type, int64>);
 
 TF_CALL_POD_TYPES(REGISTER_KERNEL);
+TF_CALL_string(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -215,6 +321,7 @@ namespace functor {
   DECLARE_GPU_SPEC(T, 6);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+TF_CALL_int8(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
@@ -247,6 +354,7 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
                           PadOp<GPUDevice, T, int64>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_int8(REGISTER_GPU_KERNEL);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
index 613ad628251915951be7a99ce687ceeef89d7aef..8e13e19e2ee03557e51ab21dc813ed33b75210dc 100644
--- a/tensorflow/core/kernels/pad_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -40,6 +40,7 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_PAD_SPECS(T, int64)
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+TF_CALL_int8(DEFINE_GPU_SPECS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d66b1ba66399f84edfc9380d25b0814e5b9745fc
--- /dev/null
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -0,0 +1,279 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+typedef FunctionLibraryRuntime::Handle FHandle;
+
+namespace {
+
+// A `PartitionedCallOp` asynchronously executes a function, potentially across
+// multiple devices but within a single process. The kernel places and
+// partitions a given function's underlying graph, and executes each of the
+// partitioned subgraphs as a function.
+//
+// TODO(akshayka): Support distributed execution.
+class PartitionedCallOp : public AsyncOpKernel {
+ public:
+  explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+  }
+
+  ~PartitionedCallOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    FunctionLibraryRuntime* lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library is provided."),
+                      done);
+
+    // The function body's graph is placed and partitioned the first time
+    // `ComputeAsync` is invoked; every subsequent invocation calls each
+    // of the function shards yielded by partitioning.
+    //
+    // The partitioning step yields a set of devices on which to run the
+    // function, and exactly one function shard is created for each device
+    // Inputs and outputs are pinned to the local device, for simplicity.
+    //
+    // TODO(akshayka): Support re-sharding the function on subsequent calls,
+    // via, e.g., virtual device annotations and a list of device names supplied
+    // through an attribute.
+    //
+    // TODO(akshayka): Lift the constraint pinning inputs and outputs to the
+    // local device.
+    //
+    // TODO(akshayka): Add a fastpath for functions that execute on a single
+    // device.
+    {
+      mutex_lock l(mu_);
+      if (!partitioned_) {
+        // Instantiate the function to obtain its underlying graph, complete
+        // with nodes for arguments and return values.
+        FunctionLibraryRuntime::InstantiateOptions opts;
+        FHandle handle;
+        OP_REQUIRES_OK_ASYNC(
+            ctx,
+            lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts,
+                             &handle),
+            done);
+        Graph* graph = lib->GetFunctionBody(handle)->graph;
+
+        // Pin the inputs and outputs to the local device to simplify the
+        // function-dispatching logic.
+        local_device_name_ = lib->device()->name();
+        for (Node* node : graph->op_nodes()) {
+          string node_type = node->type_string();
+          if (node_type == FunctionLibraryDefinition::kArgOp ||
+              node_type == FunctionLibraryDefinition::kRetOp) {
+            node->set_assigned_device_name(local_device_name_);
+          }
+        }
+
+        // Place the graph, i.e,. assign a device to every node in it.
+        DeviceSet device_set;
+        for (auto d : lib->device_mgr()->ListDevices()) {
+          device_set.AddDevice(d);
+        }
+        Placer placer(graph, &device_set);
+        OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
+
+        // Partition the graph into subgraphs: exactly one subgraph per device.
+        //
+        // TODO(akshayka): Let devices rewrite their graphs.
+        PartitionOptions partition_options;
+        partition_options.node_to_loc = [](const Node* node) {
+          // TODO(akshayka): To better support the distributed case, first split
+          // the graph by worker (e.g,. using the master session's
+          // `SplitByWorker` policy), and then recursively partition the
+          // per-worker shards at the remote worker(s).
+          return node->assigned_device_name();
+        };
+        int64 edge_name_counter = 0;
+        partition_options.new_name =
+            [&edge_name_counter](const string& prefix) {
+              return strings::StrCat(prefix, "/_", ++edge_name_counter);
+            };
+        partition_options.get_incarnation =
+            [&device_set](const string& name) -> int64 {
+          const Device* d = device_set.FindDeviceByName(name);
+          if (d == nullptr) {
+            return PartitionOptions::kIllegalIncarnation;
+          } else {
+            return d->attributes().incarnation();
+          }
+        };
+        partition_options.control_flow_added = false;
+        std::unordered_map<string, GraphDef> partitions;
+        OP_REQUIRES_OK_ASYNC(
+            ctx, Partition(partition_options, graph, &partitions), done);
+
+        VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
+                << partitions.size() << " shards.";
+
+        // `subgraphs` is a map from devices to their corresponding subgraphs.
+        gtl::FlatMap<string, std::unique_ptr<Graph>> subgraphs;
+        const FunctionLibraryDefinition* flib_def = &graph->flib_def();
+        for (const auto& partition : partitions) {
+          std::unique_ptr<Graph> subgraph(new Graph(flib_def));
+          GraphConstructorOptions opts;
+          opts.allow_internal_ops = true;
+          opts.expect_device_spec = true;
+          const string& device = partition.first;
+          const GraphDef& graph_def = partition.second;
+          OP_REQUIRES_OK_ASYNC(
+              ctx, ConvertGraphDefToGraph(opts, graph_def, subgraph.get()),
+              done);
+          subgraphs.emplace(device, std::move(subgraph));
+        }
+
+        // The FunctionLibraryRuntime's library cannot be mutated from within
+        // an OpKernel, so the functions are instantiated in an overlay library.
+        overlay_lib_.reset(new FunctionLibraryDefinition(
+            *lib->GetFunctionLibraryDefinition()));
+        for (const auto& pair : subgraphs) {
+          const string& target = pair.first;
+          Graph* subgraph = pair.second.get();
+          FunctionDef shard;
+          string unique_name = UniquifyFunctionName(func_.name());
+          OP_REQUIRES_OK_ASYNC(
+              ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
+          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib_->AddFunctionDef(shard), done);
+          FunctionLibraryRuntime::InstantiateOptions opts;
+          opts.target = target;
+          opts.overlay_lib = overlay_lib_.get();
+          FHandle handle;
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              lib->Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
+                               &handle),
+              done);
+          device_handle_map_.emplace(target, handle);
+        }
+        partitioned_ = true;
+      }
+    }
+
+    FunctionLibraryRuntime::Options opts;
+    opts.step_id = ctx->step_id();
+    opts.step_container = ctx->step_container();
+    opts.cancellation_manager = ctx->cancellation_manager();
+    opts.stats_collector = ctx->stats_collector();
+    // TODO(akshayka): Consider selecting a runner on a per-device basis, i.e.,
+    // using device-specific threadpools when available.
+    opts.runner = ctx->runner();
+    opts.source_device = local_device_name_;
+    // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
+    // constructed rendezvous to a rendezvous manager.
+    Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
+    opts.rendezvous = rendez;
+
+    OpInputList arguments;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
+    // Dummy args vector for the remote shards, which do not have inputs.
+    std::vector<Tensor> dummy_args;
+
+    StatusCallback callback = std::bind(
+        [](Rendezvous* rendez, DoneCallback& done, const Status& status) {
+          rendez->Unref();
+          done();
+        },
+        rendez, std::move(done), std::placeholders::_1);
+    auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
+    for (int i = 1; i < device_handle_map_.size(); ++i) {
+      refcounted_done->Ref();
+    }
+
+    for (const auto& pair : device_handle_map_) {
+      const string& target_device = pair.first;
+      FHandle handle = pair.second;
+      VLOG(3) << "Running function shard on device " << target_device;
+      if (target_device == local_device_name_) {
+        opts.remote_execution = false;
+        std::vector<Tensor> args;
+        args.reserve(arguments.size());
+        for (const Tensor& argument : arguments) {
+          args.push_back(argument);
+        }
+        auto* rets = new std::vector<Tensor>;
+        lib->Run(opts, handle, args, rets,
+                 [rets, refcounted_done, ctx](const Status& status) {
+                   if (!status.ok()) {
+                     ctx->SetStatus(status);
+                   } else {
+                     for (int i = 0; i < rets->size(); ++i) {
+                       ctx->set_output(i, (*rets)[i]);
+                     }
+                   }
+                   delete rets;
+                   refcounted_done->Unref();
+                 });
+      } else {
+        opts.remote_execution = true;
+        std::vector<Tensor>* dummy_rets = new std::vector<Tensor>;
+        lib->Run(opts, handle, dummy_args, dummy_rets,
+                 [dummy_rets, refcounted_done, ctx](const Status& status) {
+                   if (!status.ok()) {
+                     ctx->SetStatus(status);
+                   }
+                   delete dummy_rets;
+                   refcounted_done->Unref();
+                 });
+      }
+    }
+  }
+
+ private:
+  string UniquifyFunctionName(const string& name) {
+    for (;; ++suffix_) {
+      const string candidate = strings::StrCat(name, "_", suffix_);
+      if (overlay_lib_->Find(candidate) == nullptr) {
+        return candidate;
+      }
+    }
+  }
+
+  // `func_` encapsulates the original, unsharded function.
+  NameAttrList func_;
+  string local_device_name_;
+  // Function shards are added to `overlay_lib_`.
+  std::unique_ptr<FunctionLibraryDefinition> overlay_lib_;
+  // A map from device names to handles of function shards.
+  gtl::FlatMap<string, FHandle> device_handle_map_;
+
+  mutex mu_;
+  bool partitioned_ GUARDED_BY(mu_) = false;
+
+  // Used to uniquify function names in `overlay_lib_`.
+  uint32 suffix_ = 0;
+};
+REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
+                        PartitionedCallOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 01bcfede1e8d1f1a71059c5171f8a4d7290d7a5b..2180c4eb97711ccd704796c1431204154e18c159 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -748,9 +748,8 @@ struct LaunchPoolingOp<GPUDevice, T, AVG> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
-    DnnPooling3dOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, window,
-        stride, padding, data_format, tensor_in, output);
+    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
+                               stride, padding, data_format, tensor_in, output);
   }
 };
 
@@ -762,9 +761,8 @@ struct LaunchPoolingOp<GPUDevice, T, MAX> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
-    DnnPooling3dOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kMaximum, window,
-        stride, padding, data_format, tensor_in, output);
+    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
+                               stride, padding, data_format, tensor_in, output);
   }
 };
 
@@ -778,10 +776,10 @@ struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Tensor* input_backprop) {
     const TensorShape output_shape = tensor_in.shape();
-    DnnPooling3dGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kMaximum, window,
-        stride, padding, out, data_format, out_backprop, output_shape,
-        &tensor_in, &tensor_out, input_backprop);
+    DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+                                   window, stride, padding, out, data_format,
+                                   out_backprop, output_shape, &tensor_in,
+                                   &tensor_out, input_backprop);
   }
 };
 
@@ -796,9 +794,8 @@ struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Tensor* output) {
     DnnPooling3dGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, window,
-        stride, padding, out, data_format, out_backprop, tensor_in_shape,
-        nullptr, nullptr, output);
+        context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
+        data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
   }
 };
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index d4241b58090e4e4c1300fdcdc0e46411aa5a88f3..e583f7feb4df9605115cd16aec54d1f3e9bb8b9c 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -114,11 +114,9 @@ TensorShape PoolParameters::forward_output_shape() {
 
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                    uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -138,12 +136,13 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
 }  // namespace functor
 
 template <typename T>
-void DnnPoolingOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
-    const std::vector<int32>& size, const std::vector<int32>& stride,
-    Padding padding, TensorFormat data_format, const Tensor& tensor_in,
-    const TensorShape& tensor_out_shape, bool propagate_nans) {
+void DnnPoolingOp<T>::Compute(OpKernelContext* context,
+                              se::dnn::PoolingMode pooling_mode,
+                              const std::vector<int32>& size,
+                              const std::vector<int32>& stride, Padding padding,
+                              TensorFormat data_format, const Tensor& tensor_in,
+                              const TensorShape& tensor_out_shape,
+                              bool propagate_nans) {
   Tensor* tensor_out = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_out_shape, &tensor_out));
@@ -184,7 +183,7 @@ void DnnPoolingOp<T>::Compute(
   }
 
   /// Get ready to call cudnn
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
       .set_window_height(params.window_rows)
       .set_window_width(params.window_cols)
@@ -194,19 +193,19 @@ void DnnPoolingOp<T>::Compute(
       .set_horizontal_padding(params.pad_cols)
       .set_propagate_nans(propagate_nans);
 
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(params.tensor_in_batch)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(params.tensor_in_batch)
       .set_height(params.out_height)
       .set_width(params.out_width)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                    transformed_input.template flat<T>().size());
@@ -236,13 +235,12 @@ void DnnPoolingOp<T>::Compute(
 
 template <typename T>
 void DnnPoolingGradOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
     const std::vector<int32>& size, const std::vector<int32>& stride,
     Padding padding, TensorFormat data_format, const Tensor* tensor_in,
     const Tensor* tensor_out, const Tensor& out_backprop,
     const TensorShape& tensor_in_shape, bool propagate_nans) {
-  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
          "specified";
@@ -327,7 +325,7 @@ void DnnPoolingGradOp<T>::Compute(
   }
 
   /// Get ready to call cudnn
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
       .set_window_height(params.window_rows)
       .set_window_width(params.window_cols)
@@ -337,19 +335,19 @@ void DnnPoolingGradOp<T>::Compute(
       .set_horizontal_padding(params.pad_cols)
       .set_propagate_nans(propagate_nans);
 
-  perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+  se::dnn::BatchDescriptor orig_output_desc;
   orig_output_desc.set_count(params.tensor_in_batch)
       .set_height(params.out_height)
       .set_width(params.out_width)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+  se::dnn::BatchDescriptor orig_input_desc;
   orig_input_desc.set_count(params.tensor_in_batch)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   auto orig_output_data =
       AsDeviceMemory(transformed_output.template flat<T>().data(),
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
index 14584565857087b5c2479ff9d5ad513bd283a4a7..7362c5275f7ffbcbfe34df51e5ac3fe953c383ac 100644
--- a/tensorflow/core/kernels/pooling_ops_common_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -40,7 +40,7 @@ class DnnPoolingOp {
  public:
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor& tensor_in,
@@ -55,7 +55,7 @@ class DnnPoolingGradOp {
  public:
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor* tensor_in,
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index 5ffcc7d65d1c83580adafe8761a4f5d8227e4327..e41df12d914d3971c0a577eb4ff4ee1b74ac8a46 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
@@ -379,8 +380,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
@@ -401,8 +402,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) {
   AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index ad606803ee7017380b33819dca7718023daa3900..6c19f9841cdd886a614e537d75cefee4c2e892d8 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -43,6 +43,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
 
   void Compute(OpKernelContext* context) override {
     ResourceOpKernel<QueueInterface>::Compute(context);
+    mutex_lock l(mu_);
     if (resource_ && context->track_allocations()) {
       context->record_persistent_memory_allocation(resource_->MemoryUsed());
     }
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 78ff7948fbf1b6406b2faca1d94acd7ea3325437..e37232539f2f32cba74cde354dade0efe8bf719a 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -495,6 +495,7 @@ class RandomGammaOp : public OpKernel {
                           RandomUniformIntOp<CPUDevice, IntType>);
 
 TF_CALL_half(REGISTER);
+TF_CALL_bfloat16(REGISTER);
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 TF_CALL_int32(REGISTER_INT);
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 15ae4c1fc53b2b9bfe1d6085d2ecbc3659705b47..0de2ebb5907caa13e0c1b2a4e11d218bd9701bae 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -244,6 +244,33 @@ __global__ void RowReduceKernel(
   if (row < num_rows && lane == 0) out[row] = sum;
 }
 
+template <typename T1>
+struct storage_type {
+  T1 val;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator T1() { return val; }
+  __host__ __device__ storage_type<T1>& operator=(const T1& in) {
+    val = in;
+    return *this;
+  }
+};
+
+template <typename T2>
+struct storage_type<std::complex<T2>> {
+  T2 real;
+  T2 imag;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator std::complex<T2>() {
+    return std::complex<T2>(real, imag);
+  }
+  __host__ __device__ storage_type<std::complex<T2>>& operator=(
+      const std::complex<T2>& in) {
+    real = in.real();
+    imag = in.imag();
+    return *this;
+  }
+};
+
 // Works only if there are <= 16 columns
 // each warps sums over multiple rows at once
 template <typename T, typename outT, typename Op>
@@ -268,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += rows_per_warp * gridDim.y * blockDim.y;
   for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) {
@@ -280,8 +307,8 @@ __global__ void ColumnReduceMax16ColumnsKernel(
   const int rows_in_this_warp = min(rows_per_warp, num_rows - start_row_warp);
   // not the most efficient way to do this sum
   for (int i = 1; i < rows_in_this_warp; ++i) {
-    value_type tmp =
-        cub::ShuffleIndex(sum, threadIdx.x + i * num_cols, 32, 0xffffffff);
+    value_type tmp = cub::ShuffleIndex<32, value_type>(
+        sum, static_cast<int>(threadIdx.x + i * num_cols), 0xffffffff);
     if (lane < num_cols) sum = op(sum, tmp);
   }
 
@@ -294,7 +321,8 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
     if (blockDim.y > 1) {
       for (int row = 1; row < blockDim.y; ++row) {
-        s = op(s, partial_sums[threadIdx.x * 33 + row]);
+        value_type t = partial_sums[threadIdx.x * 33 + row];
+        s = op(s, t);
       }
     }
 
@@ -316,7 +344,7 @@ __global__ void ColumnReduceKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += gridDim.y * blockDim.y;
 
@@ -347,7 +375,8 @@ __global__ void ColumnReduceKernel(
         min(blockDim.y, num_rows - blockIdx.y * blockDim.y);
 
     for (int row = 1; row < numRowsThisBlock; ++row) {
-      s = op(s, partial_sums[threadIdx.x * 33 + row]);
+      value_type t = partial_sums[threadIdx.x * 33 + row];
+      s = op(s, t);
     }
 
     out[col * gridDim.y + blockIdx.y] = s;
diff --git a/tensorflow/core/kernels/regex_replace_op.cc b/tensorflow/core/kernels/regex_replace_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59ec854a79c90424966e4c7f19f8e5c10dfe17d4
--- /dev/null
+++ b/tensorflow/core/kernels/regex_replace_op.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "re2/re2.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class RegexReplaceOp : public OpKernel {
+ public:
+  explicit RegexReplaceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("replace_global", &replace_global_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<string>();
+
+    const Tensor* pattern_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("pattern", &pattern_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()),
+                errors::InvalidArgument("Pattern must be scalar, but received ",
+                                        pattern_tensor->shape().DebugString()));
+    const string pattern = pattern_tensor->flat<string>()(0);
+    const RE2 match(pattern);
+    OP_REQUIRES(ctx, match.ok(),
+                errors::InvalidArgument("Invalid pattern: ", pattern,
+                                        ", error: ", match.error()));
+
+    const Tensor* rewrite_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("rewrite", &rewrite_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rewrite_tensor->shape()),
+                errors::InvalidArgument("Rewrite must be scalar, but received ",
+                                        rewrite_tensor->shape().DebugString()));
+    const string rewrite = rewrite_tensor->flat<string>()(0);
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", input_tensor->shape(),
+                                             &output_tensor));
+    auto output_flat = output_tensor->flat<string>();
+    for (size_t i = 0; i < input_flat.size(); ++i) {
+      output_flat(i) = input_flat(i);
+      if (replace_global_) {
+        RE2::GlobalReplace(&output_flat(i), match, rewrite);
+      } else {
+        RE2::Replace(&output_flat(i), match, rewrite);
+      }
+    }
+  }
+
+ private:
+  bool replace_global_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RegexReplace").Device(DEVICE_CPU),
+                        RegexReplaceOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index ec09d8dfea519a70474dca7d3167ba20d3d16d69..6e46c979f33496f8da2c561683723728e28a610e 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -19,15 +19,104 @@ limitations under the License.
 
 #include <stdio.h>
 
-#include "tensorflow/core/kernels/relu_op_functor.h"
-
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/relu_op_functor.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace functor {
+#ifdef TF_HAS_CUDA_FP16
+
+// This kernel computes ReluGrad by processing one half2, two fp16, at a time.
+// It effectively does: backdrops = (feature > 0) ? gradient : 0
+// It also tries to use native half2 primitives as much as possible.
+__global__ void ReluGradHalfKernel(const Eigen::half* gradient,
+                                   const Eigen::half* feature,
+                                   Eigen::half* backprop, int32 count) {
+  int32 half2_count = count >> 1;
+  int32 index = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_device_threads = gridDim.x * blockDim.x;
+
+  while (index < half2_count) {
+    // The fast branch.
+    // One half2, two fp16, is fetched and processed at a time.
+    half2 gradient_h2 = reinterpret_cast<const half2*>(gradient)[index];
+    half2 feature_h2 = reinterpret_cast<const half2*>(feature)[index];
+    half2* p_backprop_h2 = reinterpret_cast<half2*>(backprop) + index;
+
+#if __CUDA_ARCH__ >= 530
+    // Fast path, when half2 primitives are available.
+    const half2 kZeroH2 = __float2half2_rn(0.f);
+    // mask = (feature > 0)
+    half2 mask_h2 = __hgt2(feature_h2, kZeroH2);
+    // backprop = mask * gradient
+    half2 backprop_h2 = __hmul2(mask_h2, gradient_h2);
+#else
+    // Fall back: convert half2 to float2 for processing.
+    float2 feature_f2 = __half22float2(feature_h2);
+    float2 gradient_f2 = __half22float2(gradient_h2);
+    float2 backprop_f2 = make_float2((feature_f2.x > 0) ? gradient_f2.x : 0,
+                                     (feature_f2.y > 0) ? gradient_f2.y : 0);
+    // Convert back to half2.
+    half2 backprop_h2 = __float22half2_rn(backprop_f2);
+#endif
+
+    // Write back the result.
+    *p_backprop_h2 = backprop_h2;
+
+    index += total_device_threads;
+  }
+
+  if ((count & 0x1) == 1 && index == half2_count) {
+    // If the total number of the elements is odd, process the last element.
+    Eigen::half grad_h = gradient[count - 1];
+    Eigen::half feature_h = feature[count - 1];
+
+    float grad_f = static_cast<float>(grad_h);
+    float feature_f = static_cast<float>(feature_h);
+    float backprop_f = (feature_f > 0) ? grad_f : 0;
+
+    Eigen::half backprop_h(backprop_f);
+    backprop[count - 1] = backprop_h;
+  }
+}
+
+template <typename Device>
+struct ReluGrad<Device, Eigen::half> {
+  // Computes ReluGrad backprop.
+  //
+  // gradient: gradient backpropagated to the Relu op.
+  // feature: either the inputs that were passed to the Relu, or its outputs
+  //           (using either one yields the same result here).
+  // backprop: gradient to backpropagate to the Relu inputs.
+  void operator()(const Device& d,
+                  typename TTypes<Eigen::half>::ConstTensor gradient,
+                  typename TTypes<Eigen::half>::ConstTensor feature,
+                  typename TTypes<Eigen::half>::Tensor backprop) {
+    // NOTE: When the activation is exactly zero, we do not propagate the
+    // associated gradient value. This allows the output of the Relu to be used,
+    // as well as its input.
+    int32 count = gradient.size();
+    if (count == 0) return;
+    int32 half2_count = Eigen::divup(count, 2);
+    const int32 kThreadInBlock = 512;
+    CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
+        half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock);
+    ReluGradHalfKernel<<<config.block_count, config.thread_per_block, 0,
+                         d.stream()>>>(gradient.data(), feature.data(),
+                                       backprop.data(), count);
+  }
+};
+
+#endif  // TF_HAS_CUDA_FP16
+}  // namespace functor
+
 // Definition of the GPU implementations declared in relu_op.cc.
 #define DEFINE_GPU_KERNELS(T)                       \
   template struct functor::Relu<GPUDevice, T>;      \
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
index e2709c117dcee86138e3aee550b9f0dde5930fc6..cc4d9a49a00139865f6a7bf2da360eb0b153add8 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -1125,46 +1127,43 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
     for (size_t i = 0; i < inputs.size(); ++i) {
       if (IsSameNodeName(node_def, inputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_INPUT,
-                                      tid.second, i, remote_graph_executor_name,
+        attr_str += BuildNodeTypeAttr(GRAPH_INPUT, tid.second, i,
+                                      remote_graph_executor_name,
                                       remote_fused_graph_node_name);
       }
     }
     for (size_t i = 0; i < outputs.size(); ++i) {
       if (IsSameNodeName(node_def, outputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT,
-                                      tid.second, i);
+        attr_str += BuildNodeTypeAttr(GRAPH_OUTPUT, tid.second, i);
       }
     }
     for (const string& fused_node_name : fused_node_names) {
       if (fused_node_name == node_def.name()) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE);
+        attr_str += BuildNodeTypeAttr(FUSED_NODE);
       }
     }
     for (const string& fused_node_name : fused_nodes_filtered_by_op_types) {
       if (fused_node_name == node_def.name()) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE);
+        attr_str += BuildNodeTypeAttr(FUSED_NODE);
       }
     }
     for (size_t i = 0; i < border_inputs.size(); ++i) {
       if (IsSameNodeName(node_def, border_inputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::BORDER_INPUT,
-                                      tid.second, i);
+        attr_str += BuildNodeTypeAttr(BORDER_INPUT, tid.second, i);
       }
     }
     for (size_t i = 0; i < border_outputs.size(); ++i) {
       if (IsSameNodeName(node_def, border_outputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(
-            RemoteFusedGraphExecuteInfo::BORDER_OUTPUT, tid.second, i);
+        attr_str += BuildNodeTypeAttr(BORDER_OUTPUT, tid.second, i);
       }
     }
     if (attr_str.empty()) {
-      attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::UNUSED);
+      attr_str += BuildNodeTypeAttr(UNUSED);
     }
     AddNodeAttr(ATTR_NODE_TYPE, attr_str, &node_def);
   }
@@ -1200,14 +1199,14 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments(
       }
       int node_type_int;
       CHECK(strings::safe_strto32(attr.at(0), &node_type_int)) << attr.at(0);
-      const RemoteFusedGraphExecuteInfo::NodeType node_type =
-          static_cast<RemoteFusedGraphExecuteInfo::NodeType>(node_type_int);
+      const RemoteFusedGraphNodeType node_type =
+          static_cast<RemoteFusedGraphNodeType>(node_type_int);
       const string& name = node_def.name();
       int port;
       int index;
 
       switch (node_type) {
-        case RemoteFusedGraphExecuteInfo::GRAPH_INPUT:
+        case GRAPH_INPUT:
           VLOG(2) << "Graph input: " << name;
           CHECK_EQ(5, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
@@ -1224,33 +1223,33 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments(
             return Status::OK();
           }
           break;
-        case RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT:
+        case GRAPH_OUTPUT:
           VLOG(2) << "Graph output: " << name;
           CHECK_EQ(3, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
           CHECK(strings::safe_strto32(attr.at(2), &index));
           output_map.emplace(index, strings::StrCat(name, ":", port));
           break;
-        case RemoteFusedGraphExecuteInfo::FUSED_NODE:
+        case FUSED_NODE:
           VLOG(2) << "Fused node: " << name;
           CHECK_EQ(1, attr.size());
           fused_node_names.emplace(name);
           break;
-        case RemoteFusedGraphExecuteInfo::BORDER_INPUT:
+        case BORDER_INPUT:
           VLOG(2) << "Border input: " << name;
           CHECK_EQ(3, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
           CHECK(strings::safe_strto32(attr.at(2), &index));
           border_input_map.emplace(index, strings::StrCat(name, ":", port));
           break;
-        case RemoteFusedGraphExecuteInfo::BORDER_OUTPUT:
+        case BORDER_OUTPUT:
           VLOG(2) << "Border output: " << name;
           CHECK_EQ(3, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
           CHECK(strings::safe_strto32(attr.at(2), &index));
           border_output_map.emplace(index, strings::StrCat(name, ":", port));
           break;
-        case RemoteFusedGraphExecuteInfo::UNUSED:
+        case UNUSED:
           // do nothing
           break;
         default:
@@ -1461,20 +1460,19 @@ RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions(
 }
 
 /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr(
-    const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-    const int index, const string& executor_name, const string& node_name) {
+    const RemoteFusedGraphNodeType node_type, const int port, const int index,
+    const string& executor_name, const string& node_name) {
   return strings::StrCat(static_cast<int>(node_type), ",", port, ",", index,
                          ",", executor_name, ",", node_name);
 }
 
 /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr(
-    const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-    const int index) {
+    const RemoteFusedGraphNodeType node_type, const int port, const int index) {
   return strings::StrCat(static_cast<int>(node_type), ",", port, ",", index);
 }
 
 /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr(
-    const RemoteFusedGraphExecuteInfo::NodeType node_type) {
+    const RemoteFusedGraphNodeType node_type) {
   return strings::StrCat(static_cast<int>(node_type));
 }
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index f0471442781de7c901e1c1cec69b840186015ce3..ea6b6a10154f3c056bfb0844429cc61afc2b7ea6 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -19,8 +19,6 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
@@ -30,6 +28,17 @@ limitations under the License.
 
 namespace tensorflow {
 
+enum RemoteFusedGraphNodeType {
+  UNUSED = 0,
+  GRAPH_INPUT = 1,
+  GRAPH_OUTPUT = 2,
+  FUSED_NODE = 3,
+  BORDER_INPUT = 4,
+  BORDER_OUTPUT = 5,
+};
+
+class RemoteFusedGraphExecuteInfo;
+
 // RemoteFusedGraphExecuteUtils provides APIs to register and get builder
 // functions for IRemoteFusedGraphExecutor.
 class RemoteFusedGraphExecuteUtils {
@@ -297,16 +306,15 @@ class RemoteFusedGraphExecuteUtils {
 
   static ExecutorBuildRegistry* GetExecutorBuildRegistry();
 
-  static string BuildNodeTypeAttr(
-      const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-      const int index, const string& executor_name, const string& node_name);
+  static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type,
+                                  const int port, const int index,
+                                  const string& executor_name,
+                                  const string& node_name);
 
-  static string BuildNodeTypeAttr(
-      const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-      const int index);
+  static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type,
+                                  const int port, const int index);
 
-  static string BuildNodeTypeAttr(
-      const RemoteFusedGraphExecuteInfo::NodeType node_type);
+  static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type);
 
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteUtils);
 };
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
index aca8ddfae9a91d0de40aafc7d5df43867bc9c7af..44251e6ff8e882d93faf8f57d3cb918b5d8421d4 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
index d5b37b1ce1279ef69f103bf2968801b641dfc1f5..1e0731e540cfe9138e61f219b6f0d24a0fbd9f98 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/default_device.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -181,7 +183,7 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
     int cluster_count = 0;
     for (const NodeDef& node_def : output_graph_def_.node()) {
       const string& name = node_def.name();
-      if (StringPiece(name).starts_with(REMOTE_FUSED_GRAPH_NODE_NAME)) {
+      if (str_util::StartsWith(name, REMOTE_FUSED_GRAPH_NODE_NAME)) {
         ++cluster_count;
         RemoteFusedGraphExecuteInfo info;
         string serialized_proto;
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 25a37d5e1af5835d56dedb50922967704500ad46..c23570d885be2419a8e045633a31250e0b99703d 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -218,9 +219,8 @@ TEST_F(ResizeBicubicOpTest, TestBicubic2x2To0x0) {
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index a920e6028104b69799e29e1fd9526baa9d85fc08..6d57892828593e30a0da5ea90b01b6742a71019f 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -457,9 +458,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidOutputSize) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
@@ -467,8 +467,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) {
   AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: input must be 4-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: input must be 4-dimensional"))
       << s;
 }
 
@@ -476,8 +476,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must be 1-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must be 1-dimensional"))
       << s;
 }
 
@@ -485,8 +485,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must have two elements"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must have two elements"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 702fb89aac9afe577cf7e4cd72616f7136a63b0b..916869fb566f11495bc2c6e86ff9213913347b2e 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -250,9 +250,11 @@ class AssignVariableOp : public OpKernel {
 
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
-    std::unique_ptr<Tensor> input_alias =
-        context->forward_input(1, dtype_, value.shape(), DEVICE_MEMORY, attr);
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, dtype_,
+        value.shape(), DEVICE_MEMORY, attr);
     mutex_lock ml(*variable->mu());
+    variable->is_initialized = true;
     if (input_alias) {
       *variable->tensor() = *input_alias;
       return;
@@ -277,64 +279,6 @@ class AssignVariableOp : public OpKernel {
   DataType dtype_;
 };
 
-template <typename Device>
-Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
-
-#define CPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-
-#define INSTANTIATE_GET_VARIANT_COPY_FN(Device, TYPE_CALLER, TYPE_DENSE_COPY) \
-  template <>                                                                 \
-  Status VariantCopyFn<Device>(OpKernelContext * context, const Tensor& from, \
-                               Tensor* to) {                                  \
-    PersistentTensor tmp;                                                     \
-    Tensor* tensor;                                                           \
-    AllocatorAttributes attr;                                                 \
-    attr.set_gpu_compatible(true);                                            \
-    attr.set_nic_compatible(true);                                            \
-    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
-        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
-    switch (from.dtype()) {                                                   \
-      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
-      default:                                                                \
-        return errors::InvalidArgument(                                       \
-            "VariantCopyFn: Could not perform a deep copy of variant "        \
-            "element of type: ",                                              \
-            DataTypeString(from.dtype()),                                     \
-            " using device: ", context->device()->name());                    \
-    }                                                                         \
-    *to = *tensor;                                                            \
-    return Status::OK();                                                      \
-  }
-
-INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
-
-#if GOOGLE_CUDA
-#define GPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
-  TF_CALL_GPU_ALL_TYPES(T);                 \
-  TF_CALL_int32(T);                         \
-  TF_CALL_int64(T);
-INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
-                                GPU_DENSE_COPY);
-#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
-#undef GPU_DENSE_COPY
-#endif  // GOOGLE_CUDA
-
-#undef CPU_DENSE_COPY
-#undef INSTANTIATE_GET_VARIANT_COPY_FN
-
 template <typename Device>
 class AssignVariableOp<Device, Variant> : public OpKernel {
  public:
@@ -350,7 +294,7 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
     Var* variable = nullptr;
     OP_REQUIRES_OK(context, LookupOrCreateResource<Var>(
                                 context, HandleFromInput(context, 0), &variable,
-                                [this, context](Var** ptr) {
+                                [](Var** ptr) {
                                   // Created on host.
                                   *ptr = new Var(DT_VARIANT);
                                   return Status::OK();
@@ -362,18 +306,51 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(DT_VARIANT)));
 
-    mutex_lock ml(*variable->mu());
+    // For purposes of forwarding DT_VARIANT, we want the least
+    // restrictive attr; we already know the input is on host.
+    AllocatorAttributes attr;
 
+    // Copying is unnecessary if we are the last user of the value
+    // tensor, we can just adopt the input tensor's buffer instead.
+    // Note that Variant objects themselves always reside on host.
+    //
+    // We nevertheless want to signal to the runtime that the tensor
+    // should reside in memory of the associated device, as Variant
+    // tensors may be marked as sitting on either CPU or GPU.  This
+    // helps to elide one or more copies.
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
+        value.shape(),
+        DEVICE_MEMORY /* HOST_MEMORY is only reserved for special cases */,
+        attr);
+
+    mutex_lock ml(*variable->mu());
+    variable->is_initialized = true;
     *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+
+    if (input_alias) {
+      *variable->tensor() = *input_alias;
+      return;
+    }
+
+    // Need to copy, but maybe we can re-use variable's buffer?
+    if (!variable->tensor()->RefCountIsOne() ||
+        !variable->tensor()->shape().IsSameSize(value.shape())) {
+      PersistentTensor unused;
+      Tensor* tmp;
+      // Allocation of DT_VARIANT is always on host.
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(context,
+                     context->allocate_persistent(DT_VARIANT, value.shape(),
+                                                  &unused, &tmp, attr));
+      *variable->tensor() = *tmp;
+    }
+
     const auto elements_in = value.flat<Variant>();
     auto elements_out = variable->tensor()->flat<Variant>();
-    auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
-                             std::placeholders::_1, std::placeholders::_2);
     for (int64 i = 0; i < elements_in.size(); ++i) {
-      OP_REQUIRES_OK(context, VariantDeviceCopy(
-                                  VariantDeviceCopyDirection::DEVICE_TO_DEVICE,
-                                  elements_in(i), &elements_out(i), copy_fn));
-    };
+      elements_out(i) = elements_in(i);
+    }
   }
 
  private:
@@ -462,8 +439,29 @@ TF_CALL_int64(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
+class VarIsInitializedOp : public OpKernel {
+ public:
+  explicit VarIsInitializedOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* context) override {
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+    auto output_tensor = output->tensor<bool, 0>();
+    Var* variable = nullptr;
+    Status s = LookupResource(context, HandleFromInput(context, 0), &variable);
+    if (!s.ok()) {
+      output_tensor() = false;
+      return;
+    }
+    core::ScopedUnref su(variable);
+    mutex_lock ml(*variable->mu());
+    output_tensor() = variable->is_initialized;
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("VarIsInitializedOp").Device(DEVICE_CPU),
-                        IsResourceInitialized<Var>);
+                        VarIsInitializedOp);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("VarIsInitializedOp")
@@ -481,6 +479,7 @@ class ResourceGatherOp : public OpKernel {
   void Compute(OpKernelContext* c) override {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+    core::ScopedUnref su(v);
     // NOTE: We hold the lock for the whole gather operation instead
     // of increasing the reference count of v->tensor() to avoid a
     // situation where a write to the same variable will see a
@@ -509,7 +508,14 @@ class ResourceGatherOp : public OpKernel {
     }
 
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    Tensor tmp;
+    if (params.dtype() == DT_VARIANT) {
+      tmp = Tensor(DT_VARIANT, result_shape);
+      c->set_output(0, tmp);
+      out = &tmp;
+    } else {
+      OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    }
     if (N > 0) {
       const int64 gather_dim_size = params.dim_size(0);
       int64 inner_size = 1;
@@ -554,7 +560,24 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 #if GOOGLE_CUDA
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GATHER_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
+
+// Variant objects themselves sit on CPU, even if they contain data
+// pointing to a device.
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int64>)
 
 #endif  // GOOGLE_CUDA
 
@@ -586,7 +609,7 @@ class ResourceScatterUpdateOp : public OpKernel {
                                 DataTypeString(DataTypeToEnum<Index>::v()),
                                 " indexing: ", N_big, " > ",
                                 std::numeric_limits<Index>::max()));
-    const Index N = static_cast<Index>(indices.NumElements());
+    const Index N = static_cast<Index>(N_big);
     OP_REQUIRES(
         c, params->dim_size(0) <= std::numeric_limits<Index>::max(),
         errors::InvalidArgument("params.shape[0] too large for ",
@@ -597,16 +620,35 @@ class ResourceScatterUpdateOp : public OpKernel {
     if (N > 0) {
       auto indices_flat = indices.flat<Index>();
       auto params_flat = params->flat_outer_dims<T>();
-      auto updates_flat = updates.shaped<T, 2>({N, updates.NumElements() / N});
-
-      functor::ScatterFunctor<Device, T, Index, op> functor;
-      const Index bad_i = functor(c, c->template eigen_device<Device>(),
-                                  params_flat, updates_flat, indices_flat);
-      OP_REQUIRES(c, bad_i < 0,
-                  errors::InvalidArgument(
-                      "indices", SliceDebugString(indices.shape(), bad_i),
-                      " = ", indices_flat(bad_i), " is not in [0, ",
-                      params->dim_size(0), ")"));
+      if (TensorShapeUtils::IsScalar(updates.shape())) {
+        const auto update = updates.scalar<T>();
+
+        functor::ScatterScalarFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, update, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params->dim_size(0), ")"));
+      } else {
+        int64 num_updates = updates.NumElements();
+        OP_REQUIRES(c, num_updates % N == 0,
+                    errors::InvalidArgument(
+                        "shape of indices (", indices.shape().DebugString(),
+                        ") is not compatible with the shape of updates (",
+                        updates.shape().DebugString(), ")"));
+        auto updates_flat = updates.shaped<T, 2>({N, num_updates / N});
+
+        functor::ScatterFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, updates_flat, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params->dim_size(0), ")"));
+      }
     }
   }
 };
@@ -624,35 +666,70 @@ class ResourceScatterUpdateOp : public OpKernel {
   REGISTER_SCATTER_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_KERNEL_INDEX(type, int64, dev, name, op);
 
-// TODO(apassos) add the other types here.
-#define REGISTER_SCATTER_ARITHEMTIC(type, dev)                \
+#define REGISTER_SCATTER_ARITHMETIC(type, dev)                \
   REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterAdd",    \
                           scatter_op::UpdateOp::ADD);         \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterSub",    \
+                          scatter_op::UpdateOp::SUB);         \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterMul",    \
+                          scatter_op::UpdateOp::MUL);         \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterDiv",    \
+                          scatter_op::UpdateOp::DIV);         \
   REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterUpdate", \
                           scatter_op::UpdateOp::ASSIGN);
+#define REGISTER_SCATTER_MINMAX(type, dev)                 \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterMin", \
+                          scatter_op::UpdateOp::MIN);      \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterMax", \
+                          scatter_op::UpdateOp::MAX);
 
 // Registers CPU kernels.
-#define REGISTER_SCATTER_ARITHEMTIC_CPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, CPU);
+#define REGISTER_SCATTER_ARITHMETIC_CPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, CPU);
+#define REGISTER_SCATTER_MINMAX_CPU(type) REGISTER_SCATTER_MINMAX(type, CPU);
 
-TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHMETIC_CPU);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
 
 REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
+REGISTER_SCATTER_KERNEL(Variant, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
-#define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, GPU);
+#define REGISTER_SCATTER_ARITHMETIC_GPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, GPU);
+#define REGISTER_SCATTER_MINMAX_GPU(type) REGISTER_SCATTER_MINMAX(type, GPU);
 
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
+
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int32,
+                                                scatter_op::UpdateOp::ASSIGN>)
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int64,
+                                                scatter_op::UpdateOp::ASSIGN>)
 
 #endif  // GOOGLE_CUDA
 
-#undef REGISTER_SCATTER_ARITHEMTIC
-#undef REGISTER_SCATTER_ARITHEMTIC_CPU
+#undef REGISTER_SCATTER_ARITHMETIC
+#undef REGISTER_SCATTER_ARITHMETIC_CPU
+#undef REGISTER_SCATTER_MINMAX
+#undef REGISTER_SCATTER_MINMAX_CPU
 #undef REGISTER_SCATTER_KERNEL
 #undef REGISTER_SCATTER_KERNEL_INDEX
 
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index bcbdbee058b4fdb587f2099c54545b8a6aec8ca9..4b630809c5a85496dc57476c6291729f54abc5a7 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -254,8 +254,11 @@ class RollOp : public OpKernel {
     // total modulo sum of shifts for each dimension
     gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
     for (int i = 0; i < num_shifts; i++) {
-      const int axis = axis_flat(i);
-      OP_REQUIRES(context, axis < num_dims,
+      int axis = axis_flat(i);
+      if (axis < 0) {
+        axis += num_dims;
+      }
+      OP_REQUIRES(context, 0 <= axis && axis < num_dims,
                   errors::InvalidArgument("axis ", axis, " is out of range"));
       const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
       const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
index 90b6f8d0f3094224ca694b59c851c14bb424d120..e431226aa634d34a13a67785d0cec0d894d04b67 100644
--- a/tensorflow/core/kernels/roll_op_test.cc
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -372,7 +373,8 @@ TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(), "input must be 1-D or higher"))
       << s;
 }
 
@@ -384,8 +386,8 @@ TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("axis must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "axis must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -397,8 +399,8 @@ TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -410,8 +412,8 @@ TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
   AddInputFromArray<int32>(TensorShape({1}), {1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift and axis must have the same size"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift and axis must have the same size"))
       << s;
 }
 
@@ -423,7 +425,7 @@ TEST_F(RollOpTest, Error_AxisOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "is out of range")) << s;
 }
 
 // isd - (inner shift dimension) The inner most dimension to be shifted.
diff --git a/tensorflow/core/kernels/rpc_op.cc b/tensorflow/core/kernels/rpc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2447ef504044e2289a99d19630112d33b0147a8a
--- /dev/null
+++ b/tensorflow/core/kernels/rpc_op.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// RpcOp is a TensorFlow op that sends and receives arbitrary messages.
+//
+// See docs in ../ops/rpc_op.cc.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+class RpcOp : public AsyncOpKernel {
+ public:
+  explicit RpcOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("protocol", &protocol_));
+    OP_REQUIRES(context, !protocol_.empty(),
+                errors::InvalidArgument("protocol must be non-empty."));
+    bool fail_fast;
+    OP_REQUIRES_OK(context, context->GetAttr("fail_fast", &fail_fast));
+    int64 timeout_in_ms;
+    OP_REQUIRES_OK(context, context->GetAttr("timeout_in_ms", &timeout_in_ms));
+
+    RPCFactoryRegistry::RPCFactoryFn* rpc_factory_fn =
+        RPCFactoryRegistry::Global()->Get(protocol_);
+    OP_REQUIRES(context, rpc_factory_fn != nullptr,
+                errors::InvalidArgument("The protocol ", protocol_,
+                                        " was not recognized."));
+
+    rpc_factory_.reset((*rpc_factory_fn)(context, fail_fast, timeout_in_ms));
+  }
+
+  ~RpcOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor& address_t = ctx->input(0);
+    const Tensor& method_t = ctx->input(1);
+    const Tensor& request_t = ctx->input(2);
+
+    OP_REQUIRES_ASYNC(
+        ctx, address_t.dims() == 0 || address_t.dims() == 1,
+        errors::InvalidArgument("address must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, method_t.dims() == 0 || method_t.dims() == 1,
+        errors::InvalidArgument("method must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, request_t.dims() == 0 || request_t.dims() == 1,
+        errors::InvalidArgument("request must be a scalar or vector."), done);
+
+    TensorShape output_shape({});
+    for (const Tensor& t : {address_t, method_t, request_t}) {
+      if (t.dims() == 1) {
+        OP_REQUIRES_ASYNC(
+            ctx,
+            output_shape.dims() == 0 ||
+                output_shape.dim_size(0) == t.dim_size(0),
+            errors::InvalidArgument(
+                "Input vector shapes don't match: ", output_shape.DebugString(),
+                " vs. ", t.shape().DebugString()),
+            done);
+        output_shape = t.shape();
+      }
+    }
+
+    Tensor* response_t;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, ctx->allocate_output(0, output_shape, &response_t), done);
+
+    const bool try_rpc = (ctx->num_outputs() > 1);
+
+    Tensor* status_code_t = nullptr;
+    Tensor* status_message_t = nullptr;
+    if (try_rpc) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(1, output_shape, &status_code_t), done);
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(2, output_shape, &status_message_t), done);
+    }
+
+    if (request_t.NumElements() == 0) {
+      // Special case, we finished early!
+      done();
+      return;
+    }
+
+    int64 num_elements = output_shape.num_elements();
+
+    rpc_factory_->Call(ctx, num_elements, address_t, method_t, request_t,
+                       try_rpc, response_t, status_code_t, status_message_t,
+                       std::move(done));
+  }
+
+ private:
+  string protocol_;
+  std::unique_ptr<RPCFactory> rpc_factory_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RpcOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Rpc").Device(DEVICE_CPU), RpcOp);
+REGISTER_KERNEL_BUILDER(Name("TryRpc").Device(DEVICE_CPU), RpcOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor.cc b/tensorflow/core/kernels/scatter_functor.cc
index 7eba82899fe1d5e84d08cdd129e6c25ca4da15f1..cf5408123fb495c2f540595590a7cce92b39dea5 100644
--- a/tensorflow/core/kernels/scatter_functor.cc
+++ b/tensorflow/core/kernels/scatter_functor.cc
@@ -26,21 +26,30 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_OP(T, Index, op)                   \
-  template <>                                                \
-  Index ScatterFunctor<GPUDevice, T, Index, op>::operator()( \
-      OpKernelContext* c, const GPUDevice& d,                \
-      typename TTypes<T>::Matrix params,                     \
-      typename TTypes<T>::ConstMatrix updates,               \
-      typename TTypes<Index>::ConstFlat indices);            \
-  extern template struct ScatterFunctor<GPUDevice, T, Index, op>;
+#define DECLARE_GPU_SPECS_OP(T, Index, op)                         \
+  template <>                                                      \
+  Index ScatterFunctor<GPUDevice, T, Index, op>::operator()(       \
+      OpKernelContext* c, const GPUDevice& d,                      \
+      typename TTypes<T>::Matrix params,                           \
+      typename TTypes<T>::ConstMatrix updates,                     \
+      typename TTypes<Index>::ConstFlat indices);                  \
+  extern template struct ScatterFunctor<GPUDevice, T, Index, op>;  \
+  template <>                                                      \
+  Index ScatterScalarFunctor<GPUDevice, T, Index, op>::operator()( \
+      OpKernelContext* c, const GPUDevice& d,                      \
+      typename TTypes<T>::Matrix params,                           \
+      const typename TTypes<T>::ConstScalar update,                \
+      typename TTypes<Index>::ConstFlat indices);                  \
+  extern template struct ScatterScalarFunctor<GPUDevice, T, Index, op>;
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                       \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ASSIGN); \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ADD);    \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::SUB);    \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MUL);    \
-  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);
+  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);    \
+  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MIN);    \
+  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MAX);
 
 #define DECLARE_GPU_SPECS(T)         \
   DECLARE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 079f15e101308867389745ee42146086af91c47c..ebaa2bd9c6253abf975c74338125529282dd7850 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -18,8 +18,13 @@ limitations under the License.
 
 #include <type_traits>
 
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -33,7 +38,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 namespace scatter_op {
 
-enum class UpdateOp { ASSIGN, ADD, SUB, MUL, DIV };
+enum class UpdateOp { ASSIGN, ADD, SUB, MUL, DIV, MIN, MAX };
 
 namespace internal {
 
@@ -45,6 +50,10 @@ struct Assign<scatter_op::UpdateOp::ASSIGN> {
   static void Run(Params p, Update u) {
     p = u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p.setConstant(u);
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::ADD> {
@@ -52,6 +61,10 @@ struct Assign<scatter_op::UpdateOp::ADD> {
   static void Run(Params p, Update u) {
     p += u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p + u;
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::SUB> {
@@ -59,6 +72,10 @@ struct Assign<scatter_op::UpdateOp::SUB> {
   static void Run(Params p, Update u) {
     p -= u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p + static_cast<Update>(-u);
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::MUL> {
@@ -66,6 +83,10 @@ struct Assign<scatter_op::UpdateOp::MUL> {
   static void Run(Params p, Update u) {
     p *= u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p * u;
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::DIV> {
@@ -73,6 +94,34 @@ struct Assign<scatter_op::UpdateOp::DIV> {
   static void Run(Params p, Update u) {
     p /= u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p / u;
+  }
+};
+template <>
+struct Assign<scatter_op::UpdateOp::MIN> {
+  // This method requires that Params and Update are tensor types.
+  template <typename Params, typename Update>
+  static void Run(Params p, Update u) {
+    p = p.cwiseMin(u);
+  }
+  // Same thing, but for Update being a scalar type.
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p.cwiseMin(u);
+  }
+};
+template <>
+struct Assign<scatter_op::UpdateOp::MAX> {
+  template <typename Params, typename Update>
+  static void Run(Params p, Update u) {
+    p = p.cwiseMax(u);
+  }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p.cwiseMax(u);
+  }
 };
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -117,6 +166,22 @@ struct AssignSYCL<scatter_op::UpdateOp::DIV> {
     p.device(d) = p / u;
   }
 };
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::MIN> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = p.cwiseMin(u);
+  }
+};
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::MAX> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = p.cwiseMax(u);
+  }
+};
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace internal
@@ -141,9 +206,9 @@ struct ScatterFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -154,6 +219,42 @@ struct ScatterFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   typename TTypes<Variant>::ConstMatrix updates,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    DCHECK_EQ(N, updates.dimension(0));
+    DCHECK_EQ(cols, updates.dimension(1));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Copy last Ndim-1 dimensions of updates[i] to params[index]
+      for (int j = 0; j < cols; ++j) {
+        const Variant& to_scatter = updates(i, j);
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterFunctor<CPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<CPUDevice, Index> {};
+
+template <typename Index>
+struct ScatterFunctor<GPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
@@ -165,9 +266,9 @@ struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -190,9 +291,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
     const Index limit = static_cast<Index>(params.dimension(0));
     if (!std::is_same<T, string>::value) {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         memmove(params.data() + index * params.dimension(1),
@@ -201,9 +303,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
       }
     } else {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -241,6 +344,147 @@ struct ScatterFunctorSYCL {
 };
 #endif  // TENSORFLOW_USE_SYCL
 
+template <typename Device, typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctor {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices);
+};
+
+template <typename Device, typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctorBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::Assign<op>::RunScalar(
+          params.template chip<0>(index), update());
+    }
+    return -1;
+  }
+};
+
+template <typename Device, typename Index>
+struct ScatterScalarFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   const typename TTypes<Variant>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    const Variant& to_scatter = update();
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      for (Index j = 0; j < cols; ++j) {
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterScalarFunctor<CPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<CPUDevice, Index> {};
+template <typename Index>
+struct ScatterScalarFunctor<GPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<GPUDevice, Index> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
+  Index operator()(OpKernelContext* c, const SYCLDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::AssignSYCL<op>::RunScalar(
+          d, params.template chip<0>(index), update);
+    }
+    return -1;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
+template <typename T, typename Index>
+struct ScatterScalarFunctorBase<CPUDevice, T, Index,
+                                scatter_op::UpdateOp::ASSIGN> {
+  Index operator()(OpKernelContext* c, const CPUDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::Assign<scatter_op::UpdateOp::ASSIGN>::RunScalar(
+          params.template chip<0>(index), update());
+    }
+    return -1;
+  }
+};
+
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctor<CPUDevice, T, Index, op>
+    : ScatterScalarFunctorBase<CPUDevice, T, Index, op> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctorSYCL {
+  Index operator()(OpKernelContext* c, const SYCLDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::Flat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::AssignSYCL<op>::Run(
+          d, params.template chip<0>(index), update());
+    }
+    return -1;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.cc b/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
index 52972997cc7858177a6d1b13c2d237d644a4d82d..59911bf0d26afe57a902b1533b75b76797070c06 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
@@ -23,15 +23,18 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPECS_OP(T, Index, op) \
-  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>;
+#define DEFINE_GPU_SPECS_OP(T, Index, op)                           \
+  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>; \
+  template struct functor::ScatterScalarFunctor<GPUDevice, T, Index, op>;
 
 #define DEFINE_GPU_SPECS_INDEX(T, Index)                       \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ASSIGN); \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ADD);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::SUB);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MUL);    \
-  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MIN);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MAX);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index be18658543ea330e3196d0f372154df32e4e1dfc..70809e4dcf93d80d562196d3515a305cf35fa8ba 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -29,12 +29,53 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace scatter_op_gpu {
+
+template <typename T, scatter_op::UpdateOp op>
+struct ScatterOpKernelBody;
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::ASSIGN> {
+  __device__ void operator()(T* dest, T src) const { *dest = src; }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::ADD> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicAdd(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::SUB> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicSub(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::MUL> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicMul(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::DIV> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicDiv(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::MIN> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicMin(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::MAX> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicMax(dest, src); }
+};
+
 template <typename T, typename Index, scatter_op::UpdateOp op>
 __global__ void ScatterOpCustomKernel(T* params, const T* updates,
                                       const Index* indices,
                                       Index first_dim_size, Index updates_size,
                                       Index indices_size) {
   Index update_block = updates_size / indices_size;
+  ScatterOpKernelBody<T, op> body;
   CUDA_1D_KERNEL_LOOP(i, updates_size) {
     int indices_i = i / update_block;
     int updates_i = i;
@@ -44,31 +85,33 @@ __global__ void ScatterOpCustomKernel(T* params, const T* updates,
       continue;
     }
     int params_i = param_first_index * update_block + (i % update_block);
-    switch (op) {
-      case scatter_op::UpdateOp::ASSIGN: {
-        params[params_i] = ldg(updates + updates_i);
-        break;
-      }
-      case scatter_op::UpdateOp::ADD: {
-        CudaAtomicAdd(params + params_i, ldg(updates + updates_i));
-        break;
-      }
-      case scatter_op::UpdateOp::SUB: {
-        CudaAtomicSub(params + params_i, ldg(updates + updates_i));
-        break;
-      }
-      case scatter_op::UpdateOp::MUL: {
-        CudaAtomicMul(params + params_i, ldg(updates + updates_i));
-        break;
-      }
-      case scatter_op::UpdateOp::DIV: {
-        CudaAtomicDiv(params + params_i, ldg(updates + updates_i));
-        break;
-      }
+    body(&params[params_i], ldg(updates + updates_i));
+  }
+}
+
+template <typename T, typename Index, scatter_op::UpdateOp op>
+__global__ void ScatterScalarOpCustomKernel(T* params, const T* update,
+                                            const Index* indices,
+                                            Index first_dim_size,
+                                            Index indices_size,
+                                            Index synthesized_updates_size) {
+  Index update_block = synthesized_updates_size / indices_size;
+  ScatterOpKernelBody<T, op> body;
+  CUDA_1D_KERNEL_LOOP(i, synthesized_updates_size) {
+    int indices_i = i / update_block;
+    int param_first_index = indices[indices_i];
+    const T update_val = *update;
+    if (!(param_first_index >= 0 && param_first_index < first_dim_size)) {
+      // Ignore indices that are out of range.
+      continue;
     }
+    int params_i = param_first_index * update_block + (i % update_block);
+    body(&params[params_i], update_val);
   }
 }
 
+}  // namespace scatter_op_gpu
+
 namespace functor {
 // Specialization for a GPU device.
 template <typename T, typename Index, scatter_op::UpdateOp op>
@@ -84,7 +127,7 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     const Index indices_size = indices.size();
     const Index updates_size = updates.size();
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
-    ScatterOpCustomKernel<T, Index, op>
+    scatter_op_gpu::ScatterOpCustomKernel<T, Index, op>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             params.data(), updates.data(), indices.data(), first_dim_size,
             updates_size, indices_size);
@@ -92,6 +135,27 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
   }
 };
 
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctor<GPUDevice, T, Index, op> {
+  Index operator()(OpKernelContext* c, const GPUDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // TODO(b/31801742): Implement indices range check. The hardest part is
+    // with returning a value after the range check, as we do not want to do
+    // device to host memcpy during a stream.
+    const Index first_dim_size = params.dimension(0);
+    const Index indices_size = indices.size();
+    const Index synthesized_updates_size = indices_size * params.dimension(1);
+    CudaLaunchConfig config = GetCudaLaunchConfig(synthesized_updates_size, d);
+    scatter_op_gpu::ScatterScalarOpCustomKernel<T, Index, op>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            params.data(), update.data(), indices.data(), first_dim_size,
+            indices_size, synthesized_updates_size);
+    return -1;
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 3a95dd1773398509e81a514f07fd79f5cb9a0928..0caa7bd3179a794b0c03fe0ca50729b6b3fa05bd 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -241,6 +241,7 @@ class ScatterNdUpdateOp : public OpKernel {
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_ADD_SUB_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_UPDATE_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU);
+TF_CALL_string(REGISTER_SCATTER_ND_CPU);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index e82660dcc1dcf9dbb7d531c0223e211ce46a8635..7cfffa20c5a491356d5172ec4346052c67328ff9 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -160,6 +160,7 @@ struct ScatterNdFunctor<CPUDevice, T, Index, OP, IXDIM> {
   REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::SUB);
 
 TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE);
+REGISTER_SCATTER_ND_INDEX(string, scatter_nd_op::UpdateOp::ADD);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH)
 
 #undef REGISTER_SCATTER_ND_MATH
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index ae81efa31d4193e6bbb173a1bd6866889037b76e..c134a8dd5bcdb06445b063d4083c18e76c5f4265 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -183,9 +184,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid indices: [2,0] = [99] does not index into [5,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid indices: [2,0] = [99] does not index into [5,3]"))
       << s;
 }
 
@@ -198,10 +198,10 @@ TEST_F(ScatterNdUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("The outermost dimension of updates and indices "
-                            "must match. Got indices.shape [1,3,1], "
-                            "updates.shape [3,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match. Got "
+      "indices.shape [1,3,1], updates.shape [3,3]"))
       << s;
 }
 
@@ -216,10 +216,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Must have updates.shape = indices.shape[:batch_dim]"))
-
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Must have updates.shape = indices.shape[:batch_dim]"))
       << s;
 }
 
@@ -233,10 +231,9 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "The outermost dimension of updates and indices must match."))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match."))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 282165349f316144d261859d5a3a992f047e0df3..0fbde764d57eb661314b699ef9902238ad38b2cf 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -38,6 +38,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 // Check whether updates.shape = indices.shape + params.shape[1:]
 static bool ValidShapes(const Tensor& params, const Tensor& updates,
                         const Tensor& indices) {
+  if (updates.dims() == 0) return true;
   if (updates.dims() != indices.dims() + params.dims() - 1) return false;
   for (int d = 0; d < indices.dims(); d++) {
     if (updates.dim_size(d) != indices.dim_size(d)) {
@@ -61,11 +62,11 @@ static void DoValidationChecking(OpKernelContext* c, const Tensor& params,
                                       params.shape().DebugString()));
   OP_REQUIRES(
       c, ValidShapes(params, updates, indices),
-      errors::InvalidArgument(
-          "Must have updates.shape = indices.shape + params.shape[1:], got ",
-          "updates.shape ", updates.shape().DebugString(), ", indices.shape ",
-          indices.shape().DebugString(), ", params.shape ",
-          params.shape().DebugString()));
+      errors::InvalidArgument("Must have updates.shape = indices.shape + "
+                              "params.shape[1:] or updates.shape = [], got ",
+                              "updates.shape ", updates.shape().DebugString(),
+                              ", indices.shape ", indices.shape().DebugString(),
+                              ", params.shape ", params.shape().DebugString()));
 }
 
 template <typename Device, typename T, typename Index, scatter_op::UpdateOp op>
@@ -122,16 +123,31 @@ class ScatterUpdateOp : public OpKernel {
     if (N > 0) {
       auto indices_flat = indices.flat<Index>();
       auto params_flat = params.flat_outer_dims<T>();
-      auto updates_flat = updates.shaped<T, 2>({N, updates.NumElements() / N});
-
-      functor::ScatterFunctor<Device, T, Index, op> functor;
-      const Index bad_i = functor(c, c->template eigen_device<Device>(),
-                                  params_flat, updates_flat, indices_flat);
-      OP_REQUIRES(
-          c, bad_i < 0,
-          errors::InvalidArgument(
-              "indices", SliceDebugString(indices.shape(), bad_i), " = ",
-              indices_flat(bad_i), " is not in [0, ", params.dim_size(0), ")"));
+
+      if (TensorShapeUtils::IsScalar(updates.shape()) ||
+          IsLegacyScalar(updates.shape())) {
+        const auto update = updates.scalar<T>();
+        functor::ScatterScalarFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, update, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      } else {
+        auto updates_flat =
+            updates.shaped<T, 2>({N, updates.NumElements() / N});
+
+        functor::ScatterFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, updates_flat, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      }
     }
   }
 };
@@ -195,16 +211,31 @@ class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
 
       auto indices_flat = indices_host.flat<Index>();
       auto params_flat = params.flat_outer_dims<T>();
-      auto updates_flat = updates.shaped<T, 2>({N, updates.NumElements() / N});
-
-      functor::ScatterFunctorSYCL<T, Index, op> functor;
-      const Index bad_i = functor(c, c->template eigen_device<SYCLDevice>(),
-                                  params_flat, updates_flat, indices_flat);
-      OP_REQUIRES(
-          c, bad_i < 0,
-          errors::InvalidArgument(
-              "indices", SliceDebugString(indices.shape(), bad_i), " = ",
-              indices_flat(bad_i), " is not in [0, ", params.dim_size(0), ")"));
+
+      if (TensorShapeUtils::IsScalar(updates.shape())) {
+        const auto update = updates.scalar<T>();
+
+        functor::ScatterScalarFunctorSYCL<T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<SYCLDevice>(),
+                                    params_flat, update, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      } else {
+        auto updates_flat =
+            updates.shaped<T, 2>({N, updates.NumElements() / N});
+
+        functor::ScatterFunctorSYCL<T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<SYCLDevice>(),
+                                    params_flat, updates_flat, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      }
     }
   }
 };
@@ -221,54 +252,71 @@ class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
   REGISTER_SCATTER_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_KERNEL_INDEX(type, int64, dev, name, op);
 
-#define REGISTER_SCATTER_ARITHEMTIC(type, dev)                                 \
+#define REGISTER_SCATTER_ARITHMETIC(type, dev)                                 \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterAdd", scatter_op::UpdateOp::ADD); \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterDiv", scatter_op::UpdateOp::DIV); \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterMul", scatter_op::UpdateOp::MUL); \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterSub", scatter_op::UpdateOp::SUB);
 
+#define REGISTER_SCATTER_MINMAX(type, dev)                                     \
+  REGISTER_SCATTER_KERNEL(type, dev, "ScatterMin", scatter_op::UpdateOp::MIN); \
+  REGISTER_SCATTER_KERNEL(type, dev, "ScatterMax", scatter_op::UpdateOp::MAX);
+
 #define REGISTER_SCATTER_UPDATE(type, dev)            \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterUpdate", \
                           scatter_op::UpdateOp::ASSIGN);
 
 // Registers CPU kernels.
-#define REGISTER_SCATTER_ARITHEMTIC_CPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, CPU);
+#define REGISTER_SCATTER_ARITHMETIC_CPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, CPU);
+
+#define REGISTER_SCATTER_MINMAX_CPU(type) REGISTER_SCATTER_MINMAX(type, CPU);
 
 #define REGISTER_SCATTER_UPDATE_CPU(type) REGISTER_SCATTER_UPDATE(type, CPU);
 
-TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHMETIC_CPU);
 TF_CALL_ALL_TYPES(REGISTER_SCATTER_UPDATE_CPU);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
-#define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, GPU);
+#define REGISTER_SCATTER_ARITHMETIC_GPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, GPU);
+
+#define REGISTER_SCATTER_MINMAX_GPU(type) REGISTER_SCATTER_MINMAX(type, GPU);
 
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
 
 #endif  // GOOGLE_CUDA
 
 // Registers GPU kernels.
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SCATTER_ARITHEMTIC_SYCL(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, SYCL);
+#define REGISTER_SCATTER_ARITHMETIC_SYCL(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, SYCL);
+
+#define REGISTER_SCATTER_MINMAX_SYCL(type) REGISTER_SCATTER_MINMAX(type, SYCL);
 
 #define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL);
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL);
 
-#undef REGISTER_SCATTER_ARITHEMTIC_SYCL
+#undef REGISTER_SCATTER_ARITHMETIC_SYCL
+#undef REGISTER_SCATTER_MINMAX_SYCL
 #undef REGISTER_SCATTER_UPDATE_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
-#undef REGISTER_SCATTER_ARITHEMTIC
-#undef REGISTER_SCATTER_ARITHEMTIC_CPU
-#undef REGISTER_SCATTER_ARITHEMTIC_GPU
+#undef REGISTER_SCATTER_ARITHMETIC
+#undef REGISTER_SCATTER_ARITHMETIC_CPU
+#undef REGISTER_SCATTER_ARITHMETIC_GPU
+#undef REGISTER_SCATTER_MINMAX
+#undef REGISTER_SCATTER_MINMAX_CPU
+#undef REGISTER_SCATTER_MINMAX_GPU
 #undef REGISTER_SCATTER_UPDATE
 #undef REGISTER_SCATTER_UPDATE_CPU
 #undef REGISTER_SCATTER_UPDATE_GPU
diff --git a/tensorflow/core/kernels/scatter_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
index 0b43704846d6958cbfc71e1b0b498bbada8e01e1..0df329310f0dc51bbe91b784a40fd7bf68b012f0 100644
--- a/tensorflow/core/kernels/scatter_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
@@ -24,15 +24,18 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 // Instantiates functor specializations for GPU.
-#define DEFINE_GPU_SPECS_OP(T, Index, op) \
-  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>;
+#define DEFINE_GPU_SPECS_OP(T, Index, op)                           \
+  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>; \
+  template struct functor::ScatterScalarFunctor<GPUDevice, T, Index, op>;
 
 #define DEFINE_GPU_SPECS_INDEX(T, Index)                       \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ASSIGN); \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ADD);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::SUB);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MUL);    \
-  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MIN);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MAX);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
index 0b8645a2ae98edc263eccd51f2bd7377c78930da..2ec8c422336551d79ffb800a882ac91db9d5bdf9 100644
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -170,7 +171,7 @@ TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) {
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
@@ -183,9 +184,10 @@ TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
-                            "params.shape[1:], got "))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
+                            "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
 
@@ -200,9 +202,10 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
-                            "params.shape[1:], got "))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
+                            "params.shape[1:] or updates.shape = [], got "))
 
       << s;
 }
@@ -217,9 +220,10 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
-                            "params.shape[1:], got "))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
+                            "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
 
@@ -300,6 +304,20 @@ static void BM_ScatterDivInt64(int iters, int embedding_size) {
   BM_ScatterHelper<int64>(iters, embedding_size, "ScatterDiv");
 }
 
+static void BM_ScatterMinInt32(int iters, int embedding_size) {
+  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMin");
+}
+static void BM_ScatterMinInt64(int iters, int embedding_size) {
+  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMin");
+}
+
+static void BM_ScatterMaxInt32(int iters, int embedding_size) {
+  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMax");
+}
+static void BM_ScatterMaxInt64(int iters, int embedding_size) {
+  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMax");
+}
+
 BENCHMARK(BM_ScatterUpdateInt32)
     ->Arg(1)
     ->Arg(10)
@@ -332,5 +350,11 @@ BENCHMARK(BM_ScatterMulInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
 BENCHMARK(BM_ScatterDivInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
 BENCHMARK(BM_ScatterDivInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
 
+BENCHMARK(BM_ScatterMinInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterMinInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
+BENCHMARK(BM_ScatterMaxInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterMaxInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scoped_allocator_ops.cc b/tensorflow/core/kernels/scoped_allocator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d7b25ffad0408383a63d5127f85ce41f40890e87
--- /dev/null
+++ b/tensorflow/core/kernels/scoped_allocator_ops.cc
@@ -0,0 +1,216 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class ScopedAllocatorOp : public OpKernel {
+ public:
+  explicit ScopedAllocatorOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype_));
+    OP_REQUIRES_OK(context, context->GetAttr("shapes", &shapes_));
+    OP_REQUIRES_OK(context, context->GetAttr("sa_name", &name_));
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+    OP_REQUIRES_OK(context, context->GetAttr("expected_call_count",
+                                             &expected_call_count_));
+    device_ = context->device();
+    // Precalculate the size of the backing tensor and the offsets of
+    // the subtensors to be allocated from it, taking into account
+    // alignment considerations.
+    ScopedAllocatorMgr::PopulateFields(id_, shapes_, dtype_, &fields_);
+    size_t num_bytes = fields_.back().offset + fields_.back().bytes;
+    num_elements_ = num_bytes / DataTypeSize(dtype_);
+    OP_REQUIRES(context, num_bytes % DataTypeSize(dtype_) == 0,
+                errors::InvalidArgument(
+                    "Number of bytes ", num_bytes,
+                    " must be divisible by size of datatype ", dtype_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    ScopedAllocatorMgr* sam = device_->GetScopedAllocatorMgr();
+    if (!sam) {
+      context->SetStatus(errors::Internal(
+          "ScopedAllocatorMgr not supported on device ", device_->name()));
+      return;
+    }
+    Tensor* backing_tensor = nullptr;
+    AllocatorAttributes attr = context->output_alloc_attr(0);
+    Status s =
+        context->allocate_output(0, {num_elements_}, &backing_tensor, attr);
+    VLOG(1) << "_ScopedAllocatorOp new backing tensor size "
+            << backing_tensor->TotalBytes() << " num_elements_ "
+            << num_elements_ << " buffer " << DMAHelper::buffer(backing_tensor)
+            << " base addr " << DMAHelper::base(backing_tensor);
+    if (s.ok()) {
+      s = sam->AddScopedAllocator(*backing_tensor, context->step_id(), id_,
+                                  name_, fields_, expected_call_count_);
+    }
+    if (!s.ok()) {
+      context->SetStatus(s);
+    }
+  }
+
+ private:
+  std::vector<TensorShape> shapes_;
+  DataType dtype_;
+  int64 num_elements_;
+  std::vector<ScopedAllocator::Field> fields_;
+  string name_;
+  int32 id_;
+  int32 expected_call_count_;
+  DeviceBase* device_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocator").Device(DEVICE_CPU),
+                        ScopedAllocatorOp);
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocator").Device(DEVICE_GPU),
+                        ScopedAllocatorOp);
+
+class ScopedAllocatorConcatOp : public OpKernel {
+ public:
+  explicit ScopedAllocatorConcatOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype_));
+    // This stuff is just for debugging
+    OP_REQUIRES_OK(context, context->GetAttr("sa_name", &name_));
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+    device_ = context->device();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& backing_tensor = context->input(0);
+    // Check that type matches.
+    OP_REQUIRES(
+        context, backing_tensor.dtype() == dtype_,
+        errors::InvalidArgument("Backing tensor type ", backing_tensor.dtype(),
+                                " does not match expected type ", dtype_));
+    // Check that backing tensor is at least as large as the shape of the
+    // output.
+    OP_REQUIRES(context, backing_tensor.NumElements() >= shape_.num_elements(),
+                errors::InvalidArgument("Backing tensor num elements ",
+                                        backing_tensor.NumElements(),
+                                        " is not equal to expected ",
+                                        shape_.num_elements()));
+    VLOG(1) << "_ScopedAllocatorConcatOp outputting backing tensor at "
+            << DMAHelper::base(&backing_tensor);
+    Tensor backing_copy(backing_tensor);
+    context->set_output(0, backing_copy);
+    const TensorBuffer* backing_buf = DMAHelper::buffer(&backing_copy);
+    const void* backing_tensor_lb = backing_buf->data();
+    const void* backing_tensor_ub = static_cast<const void*>(
+        static_cast<const char*>(backing_tensor_lb) + backing_buf->size());
+    // Check that all inputs lie entirely within the backing tensor.
+    for (int i = 1; i < context->num_inputs(); ++i) {
+      const TensorBuffer* input_buf = DMAHelper::buffer(&context->input(i));
+      const void* input_lb = input_buf->data();
+      OP_REQUIRES(
+          context, input_lb >= backing_tensor_lb,
+          errors::InvalidArgument("Lower bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+      const void* input_ub = static_cast<const void*>(
+          static_cast<const char*>(input_lb) + input_buf->size());
+      OP_REQUIRES(
+          context, input_ub <= backing_tensor_ub,
+          errors::InvalidArgument("Upper bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+    }
+  }
+
+ private:
+  TensorShape shape_;
+  DataType dtype_;
+  string name_;
+  int32 id_;
+  DeviceBase* device_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorConcat").Device(DEVICE_CPU),
+                        ScopedAllocatorConcatOp);
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorConcat").Device(DEVICE_GPU),
+                        ScopedAllocatorConcatOp);
+
+class ScopedAllocatorSplitOp : public OpKernel {
+ public:
+  explicit ScopedAllocatorSplitOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype_));
+    // This stuff is just for debugging
+    OP_REQUIRES_OK(context, context->GetAttr("sa_name", &name_));
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+    device_ = context->device();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    Tensor backing_copy(context->input(0));
+    // Check that type matches.
+    OP_REQUIRES(
+        context, backing_copy.dtype() == dtype_,
+        errors::InvalidArgument("Backing tensor type ", backing_copy.dtype(),
+                                " does not match expected type ", dtype_));
+    const TensorBuffer* backing_buf = DMAHelper::buffer(&backing_copy);
+    const void* backing_tensor_lb = backing_buf->data();
+    const void* backing_tensor_ub = static_cast<const void*>(
+        static_cast<const char*>(backing_tensor_lb) + backing_buf->size());
+    for (int i = 1; i < context->num_inputs(); ++i) {
+      VLOG(1) << "_ScopedAllocatorSplitOp assigning input " << i
+              << " to output " << i - 1 << " buf addr "
+              << DMAHelper::base(&context->input(i));
+      Tensor copy(context->input(i));
+      OP_REQUIRES(
+          context, copy.dtype() == dtype_,
+          errors::InvalidArgument("Input ", i, " tensor type ", copy.dtype(),
+                                  " does not match expected type ", dtype_));
+      context->set_output(i - 1, copy);
+      const TensorBuffer* input_buf = DMAHelper::buffer(&copy);
+      const void* input_lb = input_buf->data();
+      OP_REQUIRES(
+          context, input_lb >= backing_tensor_lb,
+          errors::InvalidArgument("Lower bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+      const void* input_ub = static_cast<const void*>(
+          static_cast<const char*>(input_lb) + input_buf->size());
+      OP_REQUIRES(
+          context, input_ub <= backing_tensor_ub,
+          errors::InvalidArgument("Upper bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+    }
+  }
+
+ private:
+  DataType dtype_;
+  string name_;
+  int32 id_;
+  DeviceBase* device_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorSplit").Device(DEVICE_CPU),
+                        ScopedAllocatorSplitOp);
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorSplit").Device(DEVICE_GPU),
+                        ScopedAllocatorSplitOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d36c8b7d43748abb91a5ecd2edd22dada7ae9c6
--- /dev/null
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -0,0 +1,296 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class ScopedAllocatorOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(const gtl::ArraySlice<TensorShape>& shapes, DataType dtype,
+              const string& name, int32 id, int32 expected_call_count) {
+    TF_EXPECT_OK(NodeDefBuilder("scoped_allocator_op", "_ScopedAllocator")
+                     .Attr("T", dtype)
+                     .Attr("shapes", shapes)
+                     .Attr("sa_name", name)
+                     .Attr("id", id)
+                     .Attr("expected_call_count", expected_call_count)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Allocate and Deallocate the tensors so that memory is not leaked
+    AllocatorAttributes attr;
+    Allocator* allocator;
+    for (size_t i = 0; i < shapes.size(); i++) {
+      attr.scope_id = id + i + 1;
+      allocator = device_->GetScopedAllocator(attr, context_->step_id());
+      Tensor temp(allocator, dtype, shapes[i]);
+    }
+  }
+};
+
+TEST_F(ScopedAllocatorOpTest, Simple) {
+  MakeOp({TensorShape({8})}, DT_FLOAT, "test", 120, 1);
+  MakeOp({TensorShape({32, 32})}, DT_DOUBLE, "test1", 130, 1);
+  MakeOp({TensorShape({64}), TensorShape({3, 3}), TensorShape({5, 5, 5})},
+         DT_HALF, "test2", 140, 3);
+  MakeOp({TensorShape({512}), TensorShape({64, 8})}, DT_UINT32, "test3", 150,
+         2);
+}
+
+// PrepOp is common to ConcatOp tests and SplitOpTests.
+// It allocates a backing tensor that is large enough to hold all slices defined
+// by fields, creates ScopedAllocatorInstances for each field, allocates the
+// tensors, and assigns them as inputs to the op.
+// We won't use the AddInput* suite of functions from ops_testutil.h because
+// they allocate new tensors for each input.  We need to mimic what a
+// ScopedAllocator would do.
+void PrepOp(DataType dtype, int32 id,
+            const std::vector<TensorShape>& fields_shapes,
+            std::vector<ScopedAllocator::Field>* fields,
+            Tensor** backing_tensor, Allocator* allocator,
+            ScopedAllocatorMgr* sam, const string& op_name,
+            std::vector<Tensor>* tensors,
+            gtl::InlinedVector<TensorValue, 4>* inputs,
+            const DataTypeVector& input_types) {
+  ScopedAllocatorMgr::PopulateFields(id, fields_shapes, dtype, fields);
+  // We don't simply allocate a tensor with shape as backing_tensor_shape,
+  // because we need to account for padding in the fields.  We actually need a
+  // tensor of size at least (fields[-1].offset + fields[-1].bytes).
+  size_t num_bytes = fields->back().offset + fields->back().bytes;
+  int32_t num_elements = num_bytes / DataTypeSize(dtype);
+  CHECK_EQ(num_bytes % DataTypeSize(dtype), 0);
+
+  *backing_tensor = new Tensor(allocator, dtype, {num_elements});
+  int64 step_id = 10;
+  Status s = sam->AddScopedAllocator(**backing_tensor, step_id, id,
+                                     "sa_" + op_name + "_test", *fields,
+                                     fields_shapes.size());
+  TF_ASSERT_OK(s);
+
+  ScopedAllocatorContainer* sac = sam->GetContainer(step_id);
+  std::vector<ScopedAllocatorInstance*> sa_instances(fields_shapes.size(),
+                                                     nullptr);
+  for (size_t i = 0; i < fields_shapes.size(); i++) {
+    sa_instances[i] = sac->GetInstance(id + i + 1);
+    tensors->push_back(Tensor(sa_instances[i], dtype, fields_shapes[i]));
+  }
+  // Now add the tensor as an input to ScopedAllocator<op_name>Op.
+  // Order matters here, so first add the backing tensor, then the slices.
+  inputs->reserve(1 + tensors->size());
+  CHECK_GT(input_types.size(), inputs->size());
+  CHECK_EQ(input_types[inputs->size()], dtype);
+  inputs->push_back({nullptr, *backing_tensor});
+  for (size_t i = 0; i < tensors->size(); i++) {
+    CHECK_EQ(input_types[inputs->size()], dtype);
+    inputs->push_back({nullptr, &((*tensors)[i])});
+  }
+}
+
+class ScopedAllocatorConcatOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(const TensorShape& shape, DataType dtype, const string& name,
+              int32 id, int32 num_tensors) {
+    TF_EXPECT_OK(
+        NodeDefBuilder("scoped_allocator_concat_op", "_ScopedAllocatorConcat")
+            .Attr("shape", shape)
+            .Attr("T", dtype)
+            .Attr("N", num_tensors)
+            .Attr("sa_name", name)
+            .Attr("id", id)
+            .Input(FakeInput(dtype))               // backing tensor
+            .Input(FakeInput(num_tensors, dtype))  // list of tensors
+            .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+
+  void ExecOp(DataType dtype, int32 id,
+              const std::vector<TensorShape>& fields_shapes) {
+    Tensor* backing_tensor = nullptr;
+    std::vector<Tensor> tensors;
+    std::vector<ScopedAllocator::Field> fields;
+    PrepOp(dtype, id, fields_shapes, &fields, &backing_tensor, allocator(),
+           device_->GetScopedAllocatorMgr(), "split", &tensors, &inputs_,
+           input_types_);
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Check input and output are same tensor.
+    const Tensor& input = context_->input(0);
+    OpOutputList output_list;
+    Status s = context_->output_list("output", &output_list);
+    TF_ASSERT_OK(s);
+    const Tensor& output = *(output_list[0]);
+    CHECK_EQ(DMAHelper::base(&input), DMAHelper::base(&output));
+    CHECK_EQ(input.dtype(), output.dtype());
+    CHECK_EQ(input.NumElements(), output.NumElements());
+
+    // Free the backing tensor which was allocated in PrepOp.
+    delete backing_tensor;
+  }
+};
+
+TEST_F(ScopedAllocatorConcatOpTest, Success1) {
+  MakeOp({32}, DT_FLOAT, "test", 120, 2);
+  ExecOp(DT_FLOAT, 120, {{16}, {16}});
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, Success2) {
+  MakeOp({2, 2, 2}, DT_DOUBLE, "test", 120, 2);
+  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, Success3) {
+  MakeOp({3, 3, 3}, DT_HALF, "test", 120, 3);
+  ExecOp(DT_HALF, 120, {{3, 3}, {3, 3}, {3, 3}});
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, FailDtypeCheck) {
+  MakeOp({8}, DT_FLOAT, "test", 120, 2);
+  EXPECT_DEATH(ExecOp(DT_DOUBLE, 120, {{4}, {4}}), "");
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, FailNumElementsCheck) {
+  MakeOp({32}, DT_FLOAT, "test", 120, 2);
+  AddInputFromArray<float>({8}, {0, 1, 2, 3, 4, 5, 6, 7});
+  AddInputFromArray<float>({4}, {0, 1, 2, 3});
+  AddInputFromArray<float>({4}, {4, 5, 6, 7});
+  Status s = RunOpKernel();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+// This test should fail because the backing tensor and the input tensors are
+// unrelated, i.e. the inputs are not slices of the backing tensor.
+TEST_F(ScopedAllocatorConcatOpTest, FailBounds) {
+  MakeOp({8}, DT_DOUBLE, "test", 120, 2);
+  AddInputFromArray<double>({8}, {0, 1, 2, 3, 4, 5, 6, 7});
+  AddInputFromArray<double>({4}, {0, 1, 2, 3});
+  AddInputFromArray<double>({4}, {4, 5, 6, 7});
+  Status s = RunOpKernel();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+class ScopedAllocatorSplitOpTest : public OpsTestBase {
+ protected:
+  void BuildNodeDef(const TensorShape& shape, DataType dtype,
+                    const string& name, int32 id, int32 num_tensors) {
+    TF_EXPECT_OK(
+        NodeDefBuilder("scoped_allocator_split_op", "_ScopedAllocatorSplit")
+            .Attr("T", dtype)
+            .Attr("N", num_tensors)
+            .Attr("sa_name", name)
+            .Attr("id", id)
+            .Input(FakeInput(dtype))  // backing tensor and input
+            .Input(
+                FakeInput(num_tensors, dtype))  // list of subtensors to forward
+            .Finalize(node_def()));
+  }
+
+  void MakeOp(const TensorShape& shape, DataType dtype, const string& name,
+              int32 id, int32 num_tensors) {
+    BuildNodeDef(shape, dtype, name, id, num_tensors);
+    TF_EXPECT_OK(InitOp());
+  }
+
+  // Similar to ConcatOpTest, we add inputs that are allocated from
+  // ScopedAllocator so that the memory lines up nicely.
+  void ExecOp(DataType dtype, int32 id,
+              const std::vector<TensorShape>& fields_shapes) {
+    Tensor* backing_tensor = nullptr;
+    std::vector<Tensor> tensors;
+    std::vector<ScopedAllocator::Field> fields;
+    PrepOp(dtype, id, fields_shapes, &fields, &backing_tensor, allocator(),
+           device_->GetScopedAllocatorMgr(), "split", &tensors, &inputs_,
+           input_types_);
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Check that outputs are slices of backing tensor.
+    const Tensor& input = context_->input(0);
+    const void* lower_limit = DMAHelper::base(&input);
+    const char* lower_limit_c =
+        static_cast<const char*>(lower_limit);  // for pointer arithmetic
+    OpOutputList output_list;
+    Status s = context_->output_list("output", &output_list);
+    TF_ASSERT_OK(s);
+    for (int i = 0; i < output_list.size(); i++) {
+      const Tensor& output = *(output_list[i]);
+      const void* expected_base =
+          static_cast<const void*>(lower_limit_c + fields[i].offset);
+      CHECK_EQ(output.dtype(), input.dtype());
+      CHECK_EQ(expected_base, DMAHelper::base(&output));
+      CHECK_EQ(output.NumElements(), fields_shapes[i].num_elements());
+    }
+
+    // Free the backing tensor which was allocated in PrepOp.
+    delete backing_tensor;
+  }
+};
+
+TEST_F(ScopedAllocatorSplitOpTest, Success1) {
+  MakeOp({32}, DT_FLOAT, "test", 120, 2);
+  ExecOp(DT_FLOAT, 120, {{16}, {16}});
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, Success2) {
+  MakeOp({2, 2, 2}, DT_DOUBLE, "test", 120, 2);
+  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, Success3) {
+  MakeOp({3, 3, 3}, DT_HALF, "test", 120, 3);
+  ExecOp(DT_HALF, 120, {{3, 3}, {3, 3}, {3, 3}});
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, FailNLessThan2) {
+  BuildNodeDef({4, 4}, DT_FLOAT, "test", 120, 1);
+  Status s = InitOp();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, FailDtypeCheck) {
+  MakeOp({8}, DT_FLOAT, "test", 120, 2);
+  EXPECT_DEATH(ExecOp(DT_HALF, 120, {{4}, {4}}), "");
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, FailBounds) {
+  MakeOp({8}, DT_DOUBLE, "test", 120, 2);
+  AddInputFromArray<double>({8}, {0, 1, 2, 3, 4, 5, 6, 7});
+  AddInputFromArray<double>({4}, {0, 1, 2, 3});
+  AddInputFromArray<double>({4}, {4, 5, 6, 7});
+  Status s = RunOpKernel();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 066a4b80a2bc6976a6c95ced2c5efecbef13eeba..3e16ba8d042d85e62ecd1a7c1443e6a3f968c4cb 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <random>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
@@ -226,7 +227,7 @@ const ExampleStatistics Example::ComputeWxAndWeightedExampleNorm(
 }
 
 // Examples contains all the training examples that SDCA uses for a mini-batch.
-Status Examples::SampleAdaptativeProbabilities(
+Status Examples::SampleAdaptiveProbabilities(
     const int num_loss_partitions, const Regularizations& regularization,
     const ModelWeights& model_weights,
     const TTypes<float>::Matrix example_state_data,
@@ -302,6 +303,11 @@ Status Examples::SampleAdaptativeProbabilities(
   return Status::OK();
 }
 
+void Examples::RandomShuffle() {
+  std::iota(sampled_index_.begin(), sampled_index_.end(), 0);
+  std::random_shuffle(sampled_index_.begin(), sampled_index_.end());
+}
+
 // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
 Status Examples::Initialize(OpKernelContext* const context,
                             const ModelWeights& weights,
@@ -363,9 +369,9 @@ Status Examples::Initialize(OpKernelContext* const context,
   TF_RETURN_IF_ERROR(CreateDenseFeatureRepresentation(
       worker_threads, num_examples, num_dense_features, weights,
       dense_features_inputs, &examples_));
-  ComputeSquaredNormPerExample(worker_threads, num_examples,
-                               num_sparse_features, num_dense_features,
-                               &examples_);
+  TF_RETURN_IF_ERROR(ComputeSquaredNormPerExample(
+      worker_threads, num_examples, num_sparse_features, num_dense_features,
+      &examples_));
   return Status::OK();
 }
 
@@ -377,7 +383,7 @@ Status Examples::CreateSparseFeatureRepresentation(
     const OpInputList& sparse_feature_values_inputs,
     std::vector<Example>* const examples) {
   mutex mu;
-  Status result GUARDED_BY(mu);
+  Status result;  // Guarded by mu
   auto parse_partition = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
@@ -455,7 +461,7 @@ Status Examples::CreateDenseFeatureRepresentation(
     const OpInputList& dense_features_inputs,
     std::vector<Example>* const examples) {
   mutex mu;
-  Status result GUARDED_BY(mu);
+  Status result;  // Guarded by mu
   auto parse_partition = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
@@ -481,14 +487,17 @@ Status Examples::CreateDenseFeatureRepresentation(
   return result;
 }
 
-void Examples::ComputeSquaredNormPerExample(
+Status Examples::ComputeSquaredNormPerExample(
     const DeviceBase::CpuWorkerThreads& worker_threads, const int num_examples,
     const int num_sparse_features, const int num_dense_features,
     std::vector<Example>* const examples) {
+  mutex mu;
+  Status result;  // Guarded by mu
   // Compute norm of examples.
   auto compute_example_norm = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
+    gtl::FlatSet<int64> previous_indices;
     for (int example_id = static_cast<int>(begin); example_id < end;
          ++example_id) {
       double squared_norm = 0;
@@ -496,12 +505,19 @@ void Examples::ComputeSquaredNormPerExample(
       for (int j = 0; j < num_sparse_features; ++j) {
         const Example::SparseFeatures& sparse_features =
             example->sparse_features_[j];
-        if (sparse_features.values) {
-          const Eigen::Tensor<float, 0, Eigen::RowMajor> sn =
-              sparse_features.values->square().sum();
-          squared_norm += sn();
-        } else {
-          squared_norm += sparse_features.indices->size();
+        previous_indices.clear();
+        for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
+          const int64 feature_index = (*sparse_features.indices)(k);
+          if (previous_indices.insert(feature_index).second == false) {
+            mutex_lock l(mu);
+            result =
+                errors::InvalidArgument("Duplicate index in sparse vector.");
+            return;
+          }
+          const double feature_value = sparse_features.values == nullptr
+                                           ? 1.0
+                                           : (*sparse_features.values)(k);
+          squared_norm += feature_value * feature_value;
         }
       }
       for (int j = 0; j < num_dense_features; ++j) {
@@ -516,6 +532,7 @@ void Examples::ComputeSquaredNormPerExample(
   const int64 kCostPerUnit = num_dense_features + num_sparse_features;
   Shard(worker_threads.num_threads, worker_threads.workers, num_examples,
         kCostPerUnit, compute_example_norm);
+  return result;
 }
 
 }  // namespace sdca
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 45915693ac6f0b4ad2d5f2aacebcd4aa34c03439..897c48870263b6e4b0e4a04c3b4526a66f6f9af5 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
-#define TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_
+#define TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -75,7 +75,7 @@ struct ExampleStatistics {
 
 class Regularizations {
  public:
-  Regularizations(){};
+  Regularizations() {}
 
   // Initialize() must be called immediately after construction.
   Status Initialize(OpKernelConstruction* const context) {
@@ -199,7 +199,7 @@ class FeatureWeightsDenseStorage {
   FeatureWeightsDenseStorage(const TTypes<const float>::Matrix nominals,
                              TTypes<float>::Matrix deltas)
       : nominals_(nominals), deltas_(deltas) {
-    CHECK(deltas.rank() > 1);
+    CHECK_GT(deltas.rank(), 1);
   }
 
   // Check if a feature index is with-in the bounds.
@@ -322,21 +322,20 @@ class Examples {
     return examples_.at(example_index);
   }
 
-  int sampled_index(const int id, const bool adaptative) const {
-    if (adaptative) return sampled_index_[id];
-    return id;
-  }
+  int sampled_index(const int id) const { return sampled_index_[id]; }
 
   // Adaptive SDCA in the current implementation only works for
   // binary classification, where the input argument for num_weight_vectors
   // is 1.
-  Status SampleAdaptativeProbabilities(
+  Status SampleAdaptiveProbabilities(
       const int num_loss_partitions, const Regularizations& regularization,
       const ModelWeights& model_weights,
       const TTypes<float>::Matrix example_state_data,
       const std::unique_ptr<DualLossUpdater>& loss_updater,
       const int num_weight_vectors);
 
+  void RandomShuffle();
+
   int num_examples() const { return examples_.size(); }
 
   int num_features() const { return num_features_; }
@@ -370,7 +369,7 @@ class Examples {
 
   // Computes squared example norm per example i.e |x|^2. This function modifies
   // the |examples| passed in and adds the squared norm per example.
-  static void ComputeSquaredNormPerExample(
+  static Status ComputeSquaredNormPerExample(
       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
       int num_sparse_features, int num_dense_features,
       std::vector<Example>* const examples);
@@ -378,7 +377,7 @@ class Examples {
   // All examples in the batch.
   std::vector<Example> examples_;
 
-  // Adaptative sampling variables
+  // Adaptive sampling variables.
   std::vector<float> probabilities_;
   std::vector<int> sampled_index_;
   std::vector<int> sampled_count_;
@@ -391,4 +390,4 @@ class Examples {
 }  // namespace sdca
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index dbe0177dda337a271433cd3bb4257026dc702364..05c835ebc467f7f666765e3f21f30a996a9ef7ed 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -80,7 +80,7 @@ struct ComputeOptions {
           context, false,
           errors::InvalidArgument("Unsupported loss type: ", loss_type));
     }
-    OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative));
+    OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptive));
     OP_REQUIRES_OK(
         context, context->GetAttr("num_sparse_features", &num_sparse_features));
     OP_REQUIRES_OK(context, context->GetAttr("num_sparse_features_with_values",
@@ -113,7 +113,7 @@ struct ComputeOptions {
   int num_dense_features = 0;
   int num_inner_iterations = 0;
   int num_loss_partitions = 0;
-  bool adaptative = false;
+  bool adaptive = true;
   Regularizations regularizations;
 };
 
@@ -147,23 +147,25 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
   OP_REQUIRES_OK(context, context->set_output("out_example_state_data",
                                               mutable_example_state_data_t));
 
-  if (options.adaptative) {
+  if (options.adaptive) {
     OP_REQUIRES_OK(context,
-                   examples.SampleAdaptativeProbabilities(
+                   examples.SampleAdaptiveProbabilities(
                        options.num_loss_partitions, options.regularizations,
                        model_weights, example_state_data, options.loss_updater,
                        /*num_weight_vectors =*/1));
+  } else {
+    examples.RandomShuffle();
   }
-
-  mutex mu;
-  Status train_step_status GUARDED_BY(mu);
+  struct {
+    mutex mu;
+    Status value GUARDED_BY(mu);
+  } train_step_status;
   std::atomic<std::int64_t> atomic_index(-1);
   auto train_step = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
     for (int id = static_cast<int>(begin); id < end; ++id) {
-      const int64 example_index =
-          examples.sampled_index(++atomic_index, options.adaptative);
+      const int64 example_index = examples.sampled_index(++atomic_index);
       const Example& example = examples.example(example_index);
       const float dual = example_state_data(example_index, 0);
       const float example_weight = example.example_weight();
@@ -171,8 +173,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
       const Status conversion_status =
           options.loss_updater->ConvertLabel(&example_label);
       if (!conversion_status.ok()) {
-        mutex_lock l(mu);
-        train_step_status = conversion_status;
+        mutex_lock l(train_step_status.mu);
+        train_step_status.value = conversion_status;
         // Return from this worker thread - the calling thread is
         // responsible for checking context status and returning on error.
         return;
@@ -217,7 +219,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
 
   Shard(worker_threads.num_threads, worker_threads.workers,
         examples.num_examples(), kCostPerUnit, train_step);
-  OP_REQUIRES_OK(context, train_step_status);
+  mutex_lock l(train_step_status.mu);
+  OP_REQUIRES_OK(context, train_step_status.value);
 }
 
 }  // namespace
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 6c4685a50a4139b9f33d22b409059f7c03fa2812..c87ce78e051a4552e4c1e44c7ab5767bb7c72310 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -40,7 +40,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/platform/cuda.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -242,7 +242,7 @@ class SegmentSumGPUOp : public AsyncOpKernel {
       return;
     }
 
-    perftools::gputools::DeviceMemoryBase output_rows_device(
+    se::DeviceMemoryBase output_rows_device(
         const_cast<Tensor&>(segment_ids).template flat<Index>().data() +
         (num_indices - 1));
     ScratchSpace<Index> output_rows_host(context, 1, /* on_host */ true);
@@ -679,7 +679,12 @@ class SparseSegmentReductionOpBase : public OpKernel {
     // we need to explicitly set missing indices to the default value.
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-    if (num_indices == 0) return;
+    if (num_indices == 0) {
+      if (output_rows > 0) {
+        output->flat_outer_dims<T>().setConstant(default_value_);
+      }
+      return;
+    }
     OP_REQUIRES(context, output_rows > 0,
                 errors::InvalidArgument("segment ids must be >= 0"));
     auto output_flat = output->flat_outer_dims<T>();
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 51814273b305bfa35bca0ddce0376658064ea56a..2ad9fa265e89a0bdebcb9994fed3333e930344c4 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,6 +16,50 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index ba979e6bb216b649ff4fc3cefa7099ac9cbc1b91..3511c85f7174f8dab47ca3ba05f01d7c4f5110b8 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -17,10 +17,13 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+// We need to include cuda_kernel_helper.h before segment_reduction_ops.h
+// See comment in segment_reduction_ops.h for more details.
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/util/cuda_device_functions.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index bcd88773902824c6e88db4226af43993d5649007..cea5883db7bd5d07fd594628268c2f79cfd2b5fc 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -55,6 +56,9 @@ class SelfAdjointEigOp : public LinearAlgebraOp<Scalar> {
       return;
     }
 
+    // This algorithm relies on denormals, so switch them back on locally.
+    port::ScopedDontFlushDenormal dont_flush_denormals;
+
     Eigen::SelfAdjointEigenSolver<
         Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
         es(inputs[0]);
diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
index 8c0633f4226005b401ed0f0fcd1f56bbba772701..271dd2c4858aef6d9970b907f2a8d205178a978f 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
+++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -61,6 +62,9 @@ class SelfAdjointEigV2Op : public LinearAlgebraOp<Scalar> {
       return;
     }
 
+    // This algorithm relies on denormals, so switch them back on locally.
+    port::ScopedDontFlushDenormal dont_flush_denormals;
+
     Eigen::SelfAdjointEigenSolver<Matrix> eig(
         inputs[0],
         compute_v_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 688e61fcadc3ad01b579f8dfc712af2d8032ee35..2f87057f4ef431a0ed4cac928ff21575d7af34a9 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -169,9 +169,10 @@ Rendezvous::DoneCallback make_recv_callback(OpKernelContext* ctx,
 }  // namespace
 
 void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
-  OP_REQUIRES(
+  OP_REQUIRES_ASYNC(
       ctx, ctx->rendezvous() != nullptr,
-      errors::Internal("Op kernel context needs to provide a rendezvous."));
+      errors::Internal("Op kernel context needs to provide a rendezvous."),
+      done);
 
   Rendezvous::Args args;
   args.device_context = ctx->op_device_context();
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index 799c574d1542c345c606c276b0cc24fe61a47bba..9e041d98f7f1b116a85de25ef93fa19c112af4b6 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -44,6 +44,8 @@ class SerializeSparseOp : public OpKernel {
   explicit SerializeSparseOp(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  bool IsExpensive() override;
+
   Status Initialize(Tensor* result);
   Status Serialize(const Tensor& input, T* result);
 
@@ -82,6 +84,21 @@ class SerializeSparseOp : public OpKernel {
   }
 };
 
+// NOTE(mrry): We specialize the IsExpensive() method differently for
+// the string and variant cases, because (i) the string version
+// actually performs memory copies as part of its serialization (and
+// is hence potentially expensive), and (ii) the variant version
+// performs O(1) shallow copies (and hence is much cheaper than
+// dispatching to another thread would be).
+template <>
+bool SerializeSparseOp<string>::IsExpensive() {
+  return true;
+}
+template <>
+bool SerializeSparseOp<Variant>::IsExpensive() {
+  return false;
+}
+
 template <>
 Status SerializeSparseOp<string>::Initialize(Tensor* result) {
   *result = Tensor(DT_STRING, TensorShape({3}));
@@ -323,7 +340,7 @@ class DeserializeSparseOp : public OpKernel {
             "but has a zero dimension ",
             serialized_sparse.shape().DebugString()));
 
-    if (num_sparse_tensors == 0 && serialized_sparse.shape().dims() == 1) {
+    if (num_sparse_tensors == 1 && serialized_sparse.shape().dims() == 0) {
       // Special case with a single sparse tensor. We can avoid data
       // motion in the Concat and Reshape.
       const auto& serialized_sparse_t = serialized_sparse.vec<T>();
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
index a545fb146c94342869441dee3770e7b677fa5142..9cd590ae615a67de2d683a77bb2bc562b46b362f 100644
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ b/tensorflow/core/kernels/shape_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -62,8 +63,8 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
 REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE",
                                       GetShapeFromKnownVecSize);
 
-static void ExpectHasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+static void ExpectHasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/kernels/snapshot_op.cc b/tensorflow/core/kernels/snapshot_op.cc
index 50157d5d48f93bfe61cbac95246123ef0a7d446e..fe04dcf72e2aa73a0140338a8f207177048c7d0a 100644
--- a/tensorflow/core/kernels/snapshot_op.cc
+++ b/tensorflow/core/kernels/snapshot_op.cc
@@ -22,6 +22,26 @@ limitations under the License.
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename Scalar>
+class SnapshotOp : public OpKernel {
+ public:
+  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor* output = nullptr;
+    // Try to use buffer forwarding to avoid an explicit copy.
+    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                {0}, 0, input.shape(), &output));
+    if (!output->SharesBufferWith(input)) {
+      functor::Snapshot<Device, Scalar> functor;
+      functor(context->eigen_device<Device>(), input.flat<Scalar>(),
+              output->flat<Scalar>());
+    }
+  }
+};
 
 #define REGISTER_KERNEL(TYPE)                                        \
   REGISTER_KERNEL_BUILDER(                                           \
@@ -31,6 +51,16 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
+#if GOOGLE_CUDA
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<GPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+#endif
+
 #if TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SyclDevice;
 #define REGISTER_SYCL_KERNEL(TYPE)                                    \
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
index b94834f15988a21ad41eefc8030b3da1a58875f8..a18065d42ba832d5b34f2dd534bc103c907310fe 100644
--- a/tensorflow/core/kernels/snapshot_op.h
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -26,29 +26,19 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
+namespace functor {
 
+// Functor used by SnapshotOp.
 template <typename Device, typename Scalar>
-class SnapshotOp : public OpKernel {
- public:
-  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(0);
-    Tensor* output = nullptr;
-    // Try to use buffer forwarding to avoid an explicit copy.
-    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {0}, 0, input.shape(), &output));
-    if (!output->SharesBufferWith(input)) {
-      // We had to allocate a new buffer since the refcount on the input was
-      // greater than 1. Copy the input to the new buffer.
-      const Device& device = context->eigen_device<Device>();
-      device.memcpy(output->template flat<Scalar>().data(),
-                    input.template flat<Scalar>().data(),
-                    input.NumElements() * sizeof(Scalar));
-    }
+struct Snapshot {
+  void operator()(const Device& device,
+                  typename TTypes<Scalar>::ConstTensor input,
+                  typename TTypes<Scalar>::Tensor output) {
+    device.memcpy(output.data(), input.data(), input.size() * sizeof(Scalar));
   }
 };
 
+}  // namespace functor
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
diff --git a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
index 52070be838d65d21813dfe097db9c395ef5a8448..e4e3bd5220382b50eca263d50d91d503b3a1c526 100644
--- a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -24,13 +24,10 @@ limitations under the License.
 namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_KERNEL(TYPE)                                        \
-  REGISTER_KERNEL_BUILDER(                                           \
-      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
-      SnapshotOp<GPUDevice, TYPE>);
+// Definition of the GPU implementations declared in softsign_op.cc.
+#define DEFINE_GPU_KERNELS(T) template struct functor::Snapshot<GPUDevice, T>;
 
-TF_CALL_POD_TYPES(REGISTER_KERNEL);
-#undef REGISTER_KERNEL
+TF_CALL_POD_TYPES(DEFINE_GPU_KERNELS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index e1712ac239d6be2d51b0c0598a799959a8b53a94..e72608945b0b4494123afb5763fe882f54717a00 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -55,7 +56,7 @@ template <typename Device, typename T>
 class SoftmaxOp : public OpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
index 1f4a82a7334e35c48af09c897895f79ee30e1ebd..b63dcbb163b1b7c1bee68571e2b43bb0a6f358a8 100644
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -33,8 +34,42 @@ namespace tensorflow {
 
 namespace {
 
+template <typename U, typename T>
+__device__ __host__ EIGEN_STRONG_INLINE
+    typename std::enable_if<!std::is_same<T, U>::value, U>::type
+    strict_cast(T t);
+
+template <typename U, typename T>
+__device__ __host__ EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<T, U>::value, U>::type
+    strict_cast(T t) {
+  return t;
+}
+
+template <>
+__device__ __host__ EIGEN_STRONG_INLINE float strict_cast<float, Eigen::half>(
+    Eigen::half t) {
+  return functor::HalfToFloat()(t);
+}
+
+template <>
+__device__ __host__ EIGEN_STRONG_INLINE Eigen::half
+strict_cast<Eigen::half, float>(float t) {
+  return functor::FloatToHalf()(t);
+}
+
 template <typename T>
-__global__ void GenerateNormalizedProb(const T* logits, const T* sum_probs,
+struct softmax_traits {
+  using accumulator_type = T;
+};
+
+template <>
+struct softmax_traits<Eigen::half> {
+  using accumulator_type = float;
+};
+
+template <typename T, typename U>
+__global__ void GenerateNormalizedProb(const T* logits, const U* sum_probs,
                                        const T* max_logits, T* output,
                                        const int num_rows, const int num_cols,
                                        const bool in_log_space) {
@@ -43,25 +78,33 @@ __global__ void GenerateNormalizedProb(const T* logits, const T* sum_probs,
   const int row = tid / num_cols;
   const int col = tid % num_cols;
 
+  // TODO(jamesqin): change to half2 load when inputs are Eigen::half.
+  U input = strict_cast<U>(logits[tid]);
+  U max_val = strict_cast<U>(ldg(max_logits + row));
+  U result;
+
   if (row < num_rows && col < num_cols) {
-    if (in_log_space)
-      output[tid] =
-          logits[tid] - ldg(max_logits + row) - log(ldg(sum_probs + row));
-    else
-      output[tid] =
-          exp(logits[tid] - ldg(max_logits + row)) / ldg(sum_probs + row);
+    if (in_log_space) {
+      result = input - max_val - log(ldg(sum_probs + row));
+    } else {
+      result = exp(input - max_val) / ldg(sum_probs + row);
+    }
+    output[tid] = strict_cast<T>(result);
   }
 }
 
-template <typename T>
+template <typename T, typename U>
 struct SubtractAndExpFunctor {
   __host__ __device__ SubtractAndExpFunctor(const T* logits,
                                             const T* max_logits,
                                             const int num_cols)
       : logits_(logits), max_logits_(max_logits), num_cols_(num_cols) {}
 
-  __host__ __device__ T operator()(const int gid) const {
-    return exp(logits_[gid] - ldg(max_logits_ + gid / num_cols_));
+  __host__ __device__ U operator()(const int gid) const {
+    // TODO(jamesqin): change to half2 load when inputs are Eigen::half.
+    const U diff =
+        strict_cast<U>(logits_[gid] - ldg(max_logits_ + gid / num_cols_));
+    return exp(diff);
   }
 
   const T* logits_;
@@ -80,14 +123,13 @@ void DoRowReduction(OpKernelContext* context, T* output, InputIter input,
   functor::ReduceImpl<T, Op, T*, InputIter, ReductionAxes>(
       context, output, input, 2, rows, cols, 1, 1, constants.kOne, op);
 }
-
 }  // namespace
 
 template <typename T>
 class SoftmaxOpGPU : public OpKernel {
  public:
   explicit SoftmaxOpGPU(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
@@ -108,8 +150,10 @@ class SoftmaxOpGPU : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_temp(DataTypeToEnum<T>::value,
                                             softmax_out->shape(), &max_logits));
+
+      typedef typename softmax_traits<T>::accumulator_type acc_type;
       OP_REQUIRES_OK(context,
-                     context->allocate_temp(DataTypeToEnum<T>::value,
+                     context->allocate_temp(DataTypeToEnum<acc_type>::value,
                                             softmax_out->shape(), &sum_probs));
 
       DoRowReduction<T, cub::Max, const T*>(
@@ -120,25 +164,28 @@ class SoftmaxOpGPU : public OpKernel {
       const int numBlocks = Eigen::divup(rows * cols, numThreads);
 
       cub::CountingInputIterator<int> counting_iterator(0);
-      typedef cub::TransformInputIterator<T, SubtractAndExpFunctor<T>,
+      typedef cub::TransformInputIterator<acc_type,
+                                          SubtractAndExpFunctor<T, acc_type>,
                                           cub::CountingInputIterator<int>>
           InputIterType;
 
       InputIterType input_itr(
           counting_iterator,
-          SubtractAndExpFunctor<T>(
+          SubtractAndExpFunctor<T, acc_type>(
               reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
               reinterpret_cast<const T*>(max_logits.flat<T>().data()), cols));
 
-      DoRowReduction<T, cub::Sum, InputIterType>(
-          context, const_cast<T*>(sum_probs.flat<T>().data()), input_itr, rows,
-          cols);
+      DoRowReduction<acc_type, cub::Sum, InputIterType>(
+          context, const_cast<acc_type*>(sum_probs.flat<acc_type>().data()),
+          input_itr, rows, cols);
 
-      GenerateNormalizedProb<<<numBlocks, numThreads, 0, cu_stream>>>(
-          reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
-          reinterpret_cast<const T*>(sum_probs.flat<T>().data()),
-          reinterpret_cast<const T*>(max_logits.flat<T>().data()),
-          const_cast<T*>(softmax_out->flat<T>().data()), rows, cols, log_);
+      GenerateNormalizedProb<T, acc_type>
+          <<<numBlocks, numThreads, 0, cu_stream>>>(
+              reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
+              reinterpret_cast<const acc_type*>(
+                  sum_probs.flat<acc_type>().data()),
+              reinterpret_cast<const T*>(max_logits.flat<T>().data()),
+              const_cast<T*>(softmax_out->flat<T>().data()), rows, cols, log_);
     }
   }
 
diff --git a/tensorflow/core/kernels/spacetodepth_op.cc b/tensorflow/core/kernels/spacetodepth_op.cc
index 23df1c35e5205ff3045643efee203aa501d4959d..e59adfc6acbeef3e2d309629121d308c6e228703 100644
--- a/tensorflow/core/kernels/spacetodepth_op.cc
+++ b/tensorflow/core/kernels/spacetodepth_op.cc
@@ -187,6 +187,9 @@ TF_CALL_ALL_TYPES(REGISTER);
 REGISTER_KERNEL_BUILDER(
     Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     SpaceToDepthOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    SpaceToDepthOp<GPUDevice, Eigen::half>);
 REGISTER_KERNEL_BUILDER(
     Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
     SpaceToDepthOp<GPUDevice, qint8>);
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index a1a01e8813261592a0d9ea97d6f76a163d070ee4..f38459724abcb544252885b89dede635960b24b9 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -154,6 +154,9 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NHWC> {
 
     const int total_count =
         batch_size * input_height * input_width * input_depth;
+    if (total_count == 0) {
+      return;
+    }
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
     S2D_NHWC<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
         config.virtual_thread_count, input.data(), block_size, batch_size,
@@ -184,6 +187,9 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
       const int input_width = input.dimension(3);
       const int input_depth_by_output_area = input_depth * output_area;
       const int total_count = batch_size * input_depth_by_output_area;
+      if (total_count == 0) {
+        return;
+      }
       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
       switch (block_size) {
         case 2:
@@ -209,6 +215,9 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
 
     // Other block sizes are processed by the generic kernel.
     const int total_count = batch_size * output_depth_by_output_area;
+    if (total_count == 0) {
+      return;
+    }
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
     S2D_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
         config.virtual_thread_count, input.data(), block_size, output_width,
@@ -225,6 +234,12 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, float, FORMAT_NCHW>;
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, float, FORMAT_NHWC>;
 
+// Instantiate the GPU implementations for Eigen::half.
+template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::half,
+                                               FORMAT_NCHW>;
+template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::half,
+                                               FORMAT_NHWC>;
+
 // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, int32, FORMAT_NCHW>;
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 7cd4532ad63812d905ceb6b96291aa50293070ef..4ebb7fbcc70e10a0d1b7a5dd063c2524b01b6dfc 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -327,7 +327,7 @@ class SparseCrossOp : public OpKernel {
 
     typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(
         output_start_indices, indices_out, values_out);
-    auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
+    auto do_work = [&columns, crosser, updater](int64 begin, int64 end) {
       for (int b = begin; b < end; b++) {
         ProductIterator<InternalType> product_iterator(columns, b);
         int64 cross_count = 0;
@@ -419,7 +419,7 @@ class SparseCrossOp : public OpKernel {
           context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()),
           errors::InvalidArgument(
               "Dense inputs should be a matrix but received shape ",
-              indices_list_in[i].shape().DebugString(), " at position ", i));
+              dense_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size,
                   errors::InvalidArgument("Expected batch size ", batch_size,
                                           " got ", dense_list_in[i].dim_size(0),
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index fe198af7e6c131ab19daf877063a2a6838d1f2c7..29577ebb4ed1f678768cf8af92dff48a5a901e67 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -32,7 +33,7 @@ namespace tensorflow {
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index 14ef2ed7044a796dff67e287230d955e32ca62cd..e89280724ee38f5b15d8113ea665dc4fa4651b0e 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h
index a08949e626cc8e5d4c3707b75a902d82b46c3376..bc1fa28f8f8f23085d89e5b98d57914de778ea0b 100644
--- a/tensorflow/core/kernels/split_lib.h
+++ b/tensorflow/core/kernels/split_lib.h
@@ -31,31 +31,31 @@ struct SplitCustom {
                   const Eigen::DSizes<Eigen::DenseIndex, 2>& slice_sizes);
 };
 
-template <typename Device, typename T>
+template <typename Device, typename T, int NDims>
 struct Split {
-  void operator()(const Device& d, typename TTypes<T, 3>::Tensor output,
-                  typename TTypes<T, 3>::ConstTensor input,
-                  const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
-                  const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
+  void operator()(const Device& d, typename TTypes<T, NDims>::Tensor output,
+                  typename TTypes<T, NDims>::ConstTensor input,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes);
 };
 
-template <typename T>
-struct Split<Eigen::ThreadPoolDevice, T> {
+template <typename T, int NDims>
+struct Split<Eigen::ThreadPoolDevice, T, NDims> {
   void operator()(const Eigen::ThreadPoolDevice& d,
-                  typename TTypes<T, 3>::Tensor output,
-                  typename TTypes<T, 3>::ConstTensor input,
-                  const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
-                  const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
+                  typename TTypes<T, NDims>::Tensor output,
+                  typename TTypes<T, NDims>::ConstTensor input,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes);
 };
 
 #ifdef TENSORFLOW_USE_SYCL
-template <typename T>
+template <typename T, int NDims>
 struct Split<Eigen::SyclDevice, T> {
   void operator()(const Eigen::SyclDevice& d,
-                  typename TTypes<T, 3>::Tensor output,
-                  typename TTypes<T, 3>::ConstTensor input,
-                  const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
-                  const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
+                  typename TTypes<T, NDims>::Tensor output,
+                  typename TTypes<T, NDims>::ConstTensor input,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes);
 };
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 771c633b156edf7c7d9944fe95703a0e0cd9e981..a3060e4e90d8db6866bd0c56570beeef65ab58ce 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -24,12 +24,12 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-template <typename T>
-void Split<Eigen::ThreadPoolDevice, T>::operator()(
-    const Eigen::ThreadPoolDevice& d, typename TTypes<T, 3>::Tensor output,
-    typename TTypes<T, 3>::ConstTensor input,
-    const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
-    const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
+template <typename T, int NDims>
+void Split<Eigen::ThreadPoolDevice, T, NDims>::operator()(
+    const Eigen::ThreadPoolDevice& d, typename TTypes<T, NDims>::Tensor output,
+    typename TTypes<T, NDims>::ConstTensor input,
+    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
+    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
   if (output.size() < 131072) {
     output = input.slice(slice_indices, slice_sizes);
   } else {
@@ -37,22 +37,26 @@ void Split<Eigen::ThreadPoolDevice, T>::operator()(
   }
 }
 
-#define DEFINE_CPU_KERNELS(T) template struct Split<Eigen::ThreadPoolDevice, T>;
+#define DEFINE_CPU_KERNELS(T)                           \
+  template struct Split<Eigen::ThreadPoolDevice, T, 2>; \
+  template struct Split<Eigen::ThreadPoolDevice, T, 3>;
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
 DEFINE_CPU_KERNELS(quint8)
 
 #ifdef TENSORFLOW_USE_SYCL
-template <typename T>
-void Split<Eigen::SyclDevice, T>::operator()(
-    const Eigen::SyclDevice& d, typename TTypes<T, 3>::Tensor output,
-    typename TTypes<T, 3>::ConstTensor input,
-    const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
-    const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
+template <typename T, int NDims>
+void Split<Eigen::SyclDevice, T, NDims>::operator()(
+    const Eigen::SyclDevice& d, typename TTypes<T, NDims>::Tensor output,
+    typename TTypes<T, NDims>::ConstTensor input,
+    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
+    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
   output.device(d) = input.slice(slice_indices, slice_sizes);
 }
 
-#define DEFINE_SYCL_KERNELS(T) template struct Split<Eigen::SyclDevice, T>;
+#define DEFINE_SYCL_KERNELS(T)                    \
+  template struct Split<Eigen::SyclDevice, T, 2>; \
+  template struct Split<Eigen::SyclDevice, T, 3>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS);
 #endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 9f234fc0935be0662b0d8df1a6bd1c109ab24fd9..393818730bb4fe7fc6bba7f66b2cc96b12cab390 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -29,12 +29,12 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-template <typename Device, typename T>
-void Split<Device, T>::operator()(
-    const Device& d, typename TTypes<T, 3>::Tensor output,
-    typename TTypes<T, 3>::ConstTensor input,
-    const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
-    const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
+template <typename Device, typename T, int NDims>
+void Split<Device, T, NDims>::operator()(
+    const Device& d, typename TTypes<T, NDims>::Tensor output,
+    typename TTypes<T, NDims>::ConstTensor input,
+    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_indices,
+    const Eigen::DSizes<Eigen::DenseIndex, NDims>& slice_sizes) {
   To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
 }
 
@@ -47,7 +47,9 @@ void SplitCustom<Device, T>::operator()(
   To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes);
 }
 
-#define DEFINE_GPU_KERNELS(T) template struct Split<Eigen::GpuDevice, T>;
+#define DEFINE_GPU_KERNELS(T)                    \
+  template struct Split<Eigen::GpuDevice, T, 2>; \
+  template struct Split<Eigen::GpuDevice, T, 3>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 85f529326dbf5d9d5ae72156da05f08f805d1271..7cc3c532c95584a66cfab3f184ffef4028ee4bdb 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -121,6 +121,77 @@ class SplitOpBase : public OpKernel {
   }
 };
 
+template <typename T, typename InputReshapedType, int NDims>
+class SplitOpCPUImpl {
+ public:
+  template <typename MakeSizesType, typename ReshapeResultType>
+  void operator()(OpKernelContext* context,
+                  const InputReshapedType& input_reshaped,
+                  const TensorShape& input_shape, int32 split_dim,
+                  Eigen::DenseIndex prefix_dim_size,
+                  Eigen::DenseIndex split_dim_size,
+                  Eigen::DenseIndex suffix_dim_size,
+                  const MakeSizesType& make_sizes,
+                  const ReshapeResultType& reshape_result, int32 num_split,
+                  int64 split_dim_output_size) const {
+    const auto num_threads =
+        context->device()->tensorflow_cpu_worker_threads()->num_threads;
+    // TODO(jewillco): Tune heuristic further.
+    const auto input_element_count = input_shape.num_elements();
+    const bool use_parallelism_between_outputs =
+        (num_split >= 4 &&
+         input_element_count >= std::max(num_threads, num_split) * 4096 &&
+         input_element_count < num_split * 180 * 1024);
+    Eigen::DSizes<Eigen::DenseIndex, NDims> indices;
+    for (int i = 0; i < NDims; ++i) {
+      indices[i] = 0;
+    }
+    auto sizes = make_sizes(split_dim_output_size);
+    TensorShape output_shape(input_shape);
+    output_shape.set_dim(split_dim, split_dim_output_size);
+
+    auto range_output_func = [&indices, context, &output_shape, prefix_dim_size,
+                              split_dim_output_size, suffix_dim_size, &sizes,
+                              use_parallelism_between_outputs, &input_reshaped,
+                              &reshape_result](int64 start, int64 limit) {
+      for (int64 i = start; i < limit; ++i) {
+        Tensor* result = nullptr;
+        OP_REQUIRES_OK(context,
+                       context->allocate_output(i, output_shape, &result));
+        if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
+          Eigen::DSizes<Eigen::DenseIndex, NDims> slice_indices;
+          Eigen::DSizes<Eigen::DenseIndex, NDims> slice_sizes;
+          for (int j = 0; j < NDims; ++j) {
+            slice_indices[j] =
+                (j == NDims - 2 ? i * split_dim_output_size : indices[j]);
+            slice_sizes[j] = sizes[j];
+          }
+
+          auto result_shaped = reshape_result(result, split_dim_output_size);
+
+          if (use_parallelism_between_outputs) {
+            // Use sequential implementation for single output.
+            result_shaped = input_reshaped.slice(slice_indices, slice_sizes);
+          } else {
+            // This implementation may be parallel internally.
+            functor::Split<CPUDevice, T, NDims>()(
+                context->eigen_device<CPUDevice>(), result_shaped,
+                input_reshaped, slice_indices, slice_sizes);
+          }
+        }
+      }
+    };
+    if (use_parallelism_between_outputs) {
+      // Run in parallel, disabling parallelism in functor.
+      context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+          num_split, input_element_count / num_split, range_output_func);
+    } else {
+      // Run sequentially, but allow internal parallelism in functor.
+      range_output_func(0, num_split);
+    }
+  }
+};
+
 template <typename T>
 class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
  public:
@@ -154,66 +225,37 @@ class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 
     std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
         Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
-    auto input_reshaped =
-        input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size});
 
     const int64 split_dim_output_size = split_dim_size / num_split;
-    TensorShape output_shape(input_shape);
-    output_shape.set_dim(split_dim, split_dim_output_size);
-
-    Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
-    const Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
-        prefix_dim_size, split_dim_output_size, suffix_dim_size};
-
-    const auto num_threads =
-        context->device()->tensorflow_cpu_worker_threads()->num_threads;
-    // TODO(jewillco): Tune heuristic further.
-    const auto input_element_count = input_shape.num_elements();
-    const bool use_parallelism_between_outputs =
-        (num_split >= 4 &&
-         input_element_count >= std::max(num_threads, num_split) * 4096 &&
-         input_element_count < num_split * 180 * 1024);
-
-    auto range_output_func = [&indices, context, &output_shape, prefix_dim_size,
-                              split_dim_output_size, suffix_dim_size, &sizes,
-                              use_parallelism_between_outputs,
-                              &input_reshaped](int64 start, int64 limit) {
-      for (int64 i = start; i < limit; ++i) {
-        Tensor* result = nullptr;
-        OP_REQUIRES_OK(context,
-                       context->allocate_output(i, output_shape, &result));
-        if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) {
-          Eigen::DSizes<Eigen::DenseIndex, 3> slice_indices;
-          Eigen::DSizes<Eigen::DenseIndex, 3> slice_sizes;
-          for (int j = 0; j < 3; ++j) {
-            slice_indices[j] =
-                (j == 1 ? i * split_dim_output_size : indices[j]);
-            slice_sizes[j] = sizes[j];
-          }
-
-          auto result_shaped = result->shaped<T, 3>(
-              {prefix_dim_size, split_dim_output_size, suffix_dim_size});
 
-          if (use_parallelism_between_outputs) {
-            // Use sequential implementation for single output.
-            result_shaped = input_reshaped.slice(slice_indices, slice_sizes);
-          } else {
-            // This implementation may be parallel internally.
-            functor::Split<CPUDevice, T>()(context->eigen_device<CPUDevice>(),
-                                           result_shaped, input_reshaped,
-                                           slice_indices, slice_sizes);
-          }
-        }
-      }
-    };
-    if (use_parallelism_between_outputs) {
-      // Run in parallel, disabling parallelism in functor.
-      Shard(num_split,
-            context->device()->tensorflow_cpu_worker_threads()->workers,
-            num_split, input_element_count / num_split, range_output_func);
+    if (prefix_dim_size == 1) {
+      auto input_reshaped =
+          input.shaped<T, 2>({split_dim_size, suffix_dim_size});
+      auto make_sizes = [&](Eigen::DenseIndex split_size) {
+        return Eigen::DSizes<Eigen::DenseIndex, 2>{split_size, suffix_dim_size};
+      };
+      auto reshape_result = [&](Tensor* result, Eigen::DenseIndex split_size) {
+        return result->shaped<T, 2>({split_size, suffix_dim_size});
+      };
+      SplitOpCPUImpl<T, decltype(input_reshaped), 2>{}(
+          context, input_reshaped, input_shape, split_dim, prefix_dim_size,
+          split_dim_size, suffix_dim_size, make_sizes, reshape_result,
+          num_split, split_dim_output_size);
     } else {
-      // Run sequentially, but allow internal parallelism in functor.
-      range_output_func(0, num_split);
+      auto input_reshaped = input.shaped<T, 3>(
+          {prefix_dim_size, split_dim_size, suffix_dim_size});
+      auto make_sizes = [&](Eigen::DenseIndex split_size) {
+        return Eigen::DSizes<Eigen::DenseIndex, 3>{prefix_dim_size, split_size,
+                                                   suffix_dim_size};
+      };
+      auto reshape_result = [&](Tensor* result, Eigen::DenseIndex split_size) {
+        return result->shaped<T, 3>(
+            {prefix_dim_size, split_size, suffix_dim_size});
+      };
+      SplitOpCPUImpl<T, decltype(input_reshaped), 3>{}(
+          context, input_reshaped, input_shape, split_dim, prefix_dim_size,
+          split_dim_size, suffix_dim_size, make_sizes, reshape_result,
+          num_split, split_dim_output_size);
     }
   }
 };
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 7ff5df47d70fa8e47aabfb24e82874c146708ef1..5c19a45fb18abdacb5f89f623f9690b43bdfa1e5 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -55,8 +55,13 @@ class SplitVOpBase : public OpKernel {
     const Tensor& input = context->input(0);
     const TensorShape& input_shape = input.shape();
     const Tensor& split_tensor = context->input(1);
+    const Tensor& split_dim_tensor = context->input(2);
 
-    const int32 split_dim_orig = context->input(2).flat<int32>()(0);
+    OP_REQUIRES(context, split_dim_tensor.NumElements() == 1,
+                errors::InvalidArgument("split_dim_tensor must have "
+                                        "exactly one element."));
+
+    const int32 split_dim_orig = split_dim_tensor.flat<int32>()(0);
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
 
@@ -175,6 +180,77 @@ class SplitVOpBase : public OpKernel {
   }
 };
 
+template <typename T, typename Tlen, typename InputReshapedType, int NDims>
+class SplitVOpCPUImpl {
+ public:
+  template <typename MakeSizesType, typename ReshapeResultType>
+  void operator()(OpKernelContext* context,
+                  const InputReshapedType& input_reshaped,
+                  const std::vector<int64>& split_start_points,
+                  const TensorShape& input_shape, int32 split_dim,
+                  Eigen::DenseIndex prefix_dim_size,
+                  Eigen::DenseIndex split_dim_size,
+                  Eigen::DenseIndex suffix_dim_size,
+                  std::vector<Tlen>& split_sizes_vec,
+                  const MakeSizesType& make_sizes,
+                  const ReshapeResultType& reshape_result) const {
+    Eigen::DSizes<Eigen::DenseIndex, NDims> indices;
+    for (int i = 0; i < NDims; ++i) {
+      indices[i] = 0;
+    }
+    const auto num_threads =
+        context->device()->tensorflow_cpu_worker_threads()->num_threads;
+    // TODO(jewillco): Tune heuristic further.
+    const auto input_element_count = input_shape.num_elements();
+    const int num_split = split_start_points.size();
+    const bool use_parallelism_between_outputs =
+        (num_split >= 4 &&
+         input_element_count >= std::max(num_threads, num_split) * 4096 &&
+         input_element_count < num_split * 180 * 1024);
+
+    auto range_output_func = [&indices, context, &input_shape, split_dim,
+                              &split_sizes_vec, &split_start_points,
+                              use_parallelism_between_outputs, &input_reshaped,
+                              &make_sizes,
+                              &reshape_result](int64 start, int64 limit) {
+      for (int64 i = start; i < limit; ++i) {
+        TensorShape output_shape(input_shape);
+        output_shape.set_dim(split_dim, split_sizes_vec[i]);
+        Tensor* result = nullptr;
+        OP_REQUIRES_OK(context,
+                       context->allocate_output(i, output_shape, &result));
+
+        const auto sizes = make_sizes(split_sizes_vec[i]);
+
+        if (sizes.TotalSize() > 0) {
+          auto result_shaped = reshape_result(result, split_sizes_vec[i]);
+
+          auto current_indices = indices;
+          current_indices[NDims - 2] = split_start_points[i];
+          if (use_parallelism_between_outputs) {
+            // Use sequential implementation for single output.
+            result_shaped = input_reshaped.slice(current_indices, sizes);
+          } else {
+            // This implementation may be parallel internally.
+            functor::Split<CPUDevice, T, NDims>()(
+                context->eigen_device<CPUDevice>(), result_shaped,
+                input_reshaped, current_indices, sizes);
+          }
+        }
+      }
+    };
+    if (use_parallelism_between_outputs) {
+      // Run in parallel, disabling parallelism in functor.
+      Shard(num_split,
+            context->device()->tensorflow_cpu_worker_threads()->workers,
+            num_split, input_element_count / num_split, range_output_func);
+    } else {
+      // Run sequentially, but allow internal parallelism in functor.
+      range_output_func(0, num_split);
+    }
+  }
+};
+
 template <typename T, typename Tlen>
 class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
  public:
@@ -209,10 +285,6 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 
     std::tie(prefix_dim_size, split_dim_size, suffix_dim_size) =
         Base::template SetDims<Eigen::DenseIndex>(input_shape, split_dim);
-    auto input_reshaped =
-        input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size});
-
-    Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
     std::vector<int64> split_start_points(num_split);
     for (int i = 0; i < num_split; ++i) {
       if (i == 0) {
@@ -223,55 +295,34 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
       }
     }
 
-    const auto num_threads =
-        context->device()->tensorflow_cpu_worker_threads()->num_threads;
-    // TODO(jewillco): Tune heuristic further.
-    const auto input_element_count = input_shape.num_elements();
-    const bool use_parallelism_between_outputs =
-        (num_split >= 4 &&
-         input_element_count >= std::max(num_threads, num_split) * 4096 &&
-         input_element_count < num_split * 180 * 1024);
-
-    auto range_output_func = [&indices, context, &input_shape, prefix_dim_size,
-                              split_dim, &split_sizes_vec, &split_start_points,
-                              suffix_dim_size, use_parallelism_between_outputs,
-                              &input_reshaped](int64 start, int64 limit) {
-      for (int64 i = start; i < limit; ++i) {
-        TensorShape output_shape(input_shape);
-        output_shape.set_dim(split_dim, split_sizes_vec[i]);
-        Tensor* result = nullptr;
-        OP_REQUIRES_OK(context,
-                       context->allocate_output(i, output_shape, &result));
-
-        Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
-            prefix_dim_size, split_sizes_vec[i], suffix_dim_size};
-
-        if (sizes.TotalSize() > 0) {
-          auto result_shaped = result->shaped<T, 3>(
-              {prefix_dim_size, split_sizes_vec[i], suffix_dim_size});
-
-          auto current_indices = indices;
-          current_indices[1] = split_start_points[i];
-          if (use_parallelism_between_outputs) {
-            // Use sequential implementation for single output.
-            result_shaped = input_reshaped.slice(current_indices, sizes);
-          } else {
-            // This implementation may be parallel internally.
-            functor::Split<CPUDevice, T>()(context->eigen_device<CPUDevice>(),
-                                           result_shaped, input_reshaped,
-                                           current_indices, sizes);
-          }
-        }
-      }
-    };
-    if (use_parallelism_between_outputs) {
-      // Run in parallel, disabling parallelism in functor.
-      Shard(num_split,
-            context->device()->tensorflow_cpu_worker_threads()->workers,
-            num_split, input_element_count / num_split, range_output_func);
+    if (prefix_dim_size == 1) {
+      auto input_reshaped =
+          input.shaped<T, 2>({split_dim_size, suffix_dim_size});
+      auto make_sizes = [&](Eigen::DenseIndex split_size) {
+        return Eigen::DSizes<Eigen::DenseIndex, 2>{split_size, suffix_dim_size};
+      };
+      auto reshape_result = [&](Tensor* result, Tlen split_size) {
+        return result->shaped<T, 2>({split_size, suffix_dim_size});
+      };
+      SplitVOpCPUImpl<T, Tlen, decltype(input_reshaped), 2>{}(
+          context, input_reshaped, split_start_points, input_shape, split_dim,
+          prefix_dim_size, split_dim_size, suffix_dim_size, split_sizes_vec,
+          make_sizes, reshape_result);
     } else {
-      // Run sequentially, but allow internal parallelism in functor.
-      range_output_func(0, num_split);
+      auto input_reshaped = input.shaped<T, 3>(
+          {prefix_dim_size, split_dim_size, suffix_dim_size});
+      auto make_sizes = [&](Eigen::DenseIndex split_size) {
+        return Eigen::DSizes<Eigen::DenseIndex, 3>{prefix_dim_size, split_size,
+                                                   suffix_dim_size};
+      };
+      auto reshape_result = [&](Tensor* result, Tlen split_size) {
+        return result->shaped<T, 3>(
+            {prefix_dim_size, split_size, suffix_dim_size});
+      };
+      SplitVOpCPUImpl<T, Tlen, decltype(input_reshaped), 3>{}(
+          context, input_reshaped, split_start_points, input_shape, split_dim,
+          prefix_dim_size, split_dim_size, suffix_dim_size, split_sizes_vec,
+          make_sizes, reshape_result);
     }
   }
 };
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 03fc4467a1dcf9d70c90c19809690934b0a7c2f4..73a02a34cf231799e6a813f042757d70b4e9414a 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -32,53 +32,8 @@ namespace {
 
 class Buffer : public ResourceBase {
  public:
-  // public types
   using Tuple = std::vector<Tensor>;
 
- private:
-  // private variables
-  std::size_t capacity_;
-  std::size_t memory_limit_;
-  std::size_t current_bytes_;
-  std::mutex mu_;
-  std::condition_variable non_empty_cond_var_;
-  std::condition_variable full_cond_var_;
-  std::deque<Tuple> buf_;
-
- private:
-  // private methods
-
-  // If the buffer is configured for bounded capacity, notify
-  // waiting inserters that space is now available
-  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
-    if (IsBounded()) {
-      lock->unlock();
-      // Notify all inserters. The removal of an element
-      // may make memory available for many inserters
-      // to insert new elements
-      full_cond_var_.notify_all();
-    }
-  }
-
-  // Are there a limit number of elements or a memory limit
-  // configued on this buffer?
-  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
-
-  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
-
-  bool WouldExceedMemoryLimit(std::size_t bytes) const {
-    return bytes + current_bytes_ > memory_limit_;
-  }
-
-  std::size_t GetTupleBytes(const Tuple& tuple) {
-    return std::accumulate(tuple.begin(), tuple.end(), 0,
-                           [](const std::size_t& lhs, const Tensor& rhs) {
-                             return lhs + rhs.TotalBytes();
-                           });
-  }
-
- public:
-  // public methods
   explicit Buffer(std::size_t capacity, std::size_t memory_limit)
       : capacity_(capacity), memory_limit_(memory_limit), current_bytes_(0) {}
 
@@ -181,6 +136,44 @@ class Buffer : public ResourceBase {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
+
+ private:
+  // If the buffer is configured for bounded capacity, notify
+  // waiting inserters that space is now available
+  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
+    if (IsBounded()) {
+      lock->unlock();
+      // Notify all inserters. The removal of an element
+      // may make memory available for many inserters
+      // to insert new elements
+      full_cond_var_.notify_all();
+    }
+  }
+
+  // Are there a limit number of elements or a memory limit
+  // configued on this buffer?
+  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
+
+  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
+
+  bool WouldExceedMemoryLimit(std::size_t bytes) const {
+    return bytes + current_bytes_ > memory_limit_;
+  }
+
+  std::size_t GetTupleBytes(const Tuple& tuple) {
+    return std::accumulate(tuple.begin(), tuple.end(), 0,
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
+  }
+
+  std::size_t capacity_;
+  std::size_t memory_limit_;
+  std::size_t current_bytes_;
+  std::mutex mu_;
+  std::condition_variable non_empty_cond_var_;
+  std::condition_variable full_cond_var_;
+  std::deque<Tuple> buf_;
 };
 
 Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 88fcf542fb0cc726b228be34d0fe7b92663ce95d..eab176c7fb78c1a2f0a48b907c3b01bc640758d0 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -27,6 +27,41 @@ namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
 
+Status GenerateKey(Tensor seed, random::PhiloxRandom::Key* out_key,
+                   random::PhiloxRandom::ResultType* out_counter) {
+  // Grab the two seeds
+  uint64 seed0;
+  uint64 seed1;
+  if (seed.dtype() == DT_INT32) {
+    const auto seed_vals = seed.flat<int32>();
+    seed0 = internal::SubtleMustCopy(seed_vals(0));
+    seed1 = internal::SubtleMustCopy(seed_vals(1));
+  } else if (seed.dtype() == DT_INT64) {
+    const auto seed_vals = seed.flat<int64>();
+    seed0 = internal::SubtleMustCopy(seed_vals(0));
+    seed1 = internal::SubtleMustCopy(seed_vals(1));
+  } else {
+    return errors::InvalidArgument("Invalid seed type: ",
+                                   DataTypeString(seed.dtype()));
+  }
+
+  // Scramble the seeds so that the user doesn't need to worry about which
+  // part of the seed needs to be strong.
+  (*out_key)[0] = 0x3ec8f720;
+  (*out_key)[1] = 0x02461e29;
+  (*out_counter)[0] = static_cast<uint32>(seed0);
+  (*out_counter)[1] = static_cast<uint32>(seed0 >> 32);
+  (*out_counter)[2] = static_cast<uint32>(seed1);
+  (*out_counter)[3] = static_cast<uint32>(seed1 >> 32);
+  const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
+  (*out_key)[0] = mix[0];
+  (*out_key)[1] = mix[1];
+  (*out_counter)[0] = (*out_counter)[1] = 0;
+  (*out_counter)[2] = mix[2];
+  (*out_counter)[3] = mix[3];
+  return Status::OK();
+}
+
 namespace {
 
 class StatelessRandomOpBase : public OpKernel {
@@ -49,36 +84,9 @@ class StatelessRandomOpBase : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));
     if (shape.num_elements() == 0) return;
 
-    // Grab the two seeds
-    uint64 seed0;
-    uint64 seed1;
-    if (context->input_dtype(1) == DT_INT32) {
-      const auto seed = seed_t.flat<int32>();
-      seed0 = internal::SubtleMustCopy(seed(0));
-      seed1 = internal::SubtleMustCopy(seed(1));
-    } else {
-      CHECK_EQ(DT_INT64, context->input_dtype(1));
-      const auto seed = seed_t.flat<int64>();
-      seed0 = internal::SubtleMustCopy(seed(0));
-      seed1 = internal::SubtleMustCopy(seed(1));
-    }
-
-    // Scramble the seeds so that the user doesn't need to worry about which
-    // part of the seed needs to be strong.
     random::PhiloxRandom::Key key;
     random::PhiloxRandom::ResultType counter;
-    key[0] = 0x3ec8f720;
-    key[1] = 0x02461e29;
-    counter[0] = static_cast<uint32>(seed0);
-    counter[1] = static_cast<uint32>(seed0 >> 32);
-    counter[2] = static_cast<uint32>(seed1);
-    counter[3] = static_cast<uint32>(seed1 >> 32);
-    const auto mix = random::PhiloxRandom(counter, key)();
-    key[0] = mix[0];
-    key[1] = mix[1];
-    counter[0] = counter[1] = 0;
-    counter[2] = mix[2];
-    counter[3] = mix[3];
+    OP_REQUIRES_OK(context, GenerateKey(seed_t, &key, &counter));
 
     // Fill in the random numbers
     Fill(context, random::PhiloxRandom(counter, key), output);
@@ -105,8 +113,6 @@ class StatelessRandomOp : public StatelessRandomOpBase {
   }
 };
 
-}  // namespace
-
 #define REGISTER(TYPE)                                                 \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("StatelessRandomUniform")                                   \
@@ -176,4 +182,6 @@ TF_CALL_double(REGISTER);
 
 #endif  // GOOGLE_CUDA
 
+}  // namespace
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stateless_random_ops.h b/tensorflow/core/kernels/stateless_random_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcd29c487397726627fd23b7538d46f1f89a573f
--- /dev/null
+++ b/tensorflow/core/kernels/stateless_random_ops.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+// Generates a key and counter that can be used to seed a PhiloxRandom,
+// generator, based on the seed value in `seed_t`.
+//
+// REQUIRES: `seed_t` must be a length-2 vector of type DT_INT{32,64}.
+// `out_key` and `out_counter` must be non-null.
+Status GenerateKey(Tensor seed_t, random::PhiloxRandom::Key* out_key,
+                   random::PhiloxRandom::ResultType* out_counter);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 7745effe2abe94ba73a2f0d761210e07c62e499c..1e3e92a68a05123bafad77348e6811a14c303301 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -109,17 +109,27 @@ class StridedSliceOp : public OpKernel {
     if (is_identity) {
       VLOG(1) << "Strided slice identity ";
       Tensor tmp;
-      CHECK(tmp.CopyFrom(input, final_shape));
+      OP_REQUIRES(context, tmp.CopyFrom(input, final_shape),
+                  errors::Internal("Copy failed"));
       context->set_output(0, tmp);
       return;
     }
 
     // Optimization #2, slice is memory contiguous (only occurs in dim 0)
     if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), begin[0], end[0])) {
-      CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
+      OP_REQUIRES(context, input.dims() >= 1,
+                  errors::InvalidArgument(
+                      "Input must have rank at least 1, got: ", input.dims()));
+      // Otherwise, is_identity should be true.
       VLOG(1) << "Strided slice dim 0: " << input.shape().DebugString();
+      OP_REQUIRES(
+          context, begin[0] <= end[0],
+          errors::InvalidArgument("begin[0] (", begin[0],
+                                  ") must less or equal to end[0] (", end[0]));
+      Tensor slice = input.Slice(begin[0], end[0]);
       Tensor tmp;
-      CHECK(tmp.CopyFrom(input.Slice(begin[0], end[0]), final_shape));
+      OP_REQUIRES(context, tmp.CopyFrom(slice, final_shape),
+                  errors::Internal("Copy failed"));
       context->set_output(0, tmp);
       return;
     }
@@ -238,7 +248,8 @@ class StridedSliceGradOp : public OpKernel {
 
     if (processing_shape.dims() == 0) {
       auto in = context->input(4);
-      CHECK(result->CopyFrom(in, processing_shape));
+      OP_REQUIRES(context, result->CopyFrom(in, processing_shape),
+                  errors::Internal("Copy failed"));
       return;
     }
 
diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index 9efbd66ef7542a7f6ee0226362347cac7b30e8f5..4c2b312c3454e658ac4e288d06580d0ab5c04d52 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -71,7 +71,7 @@ class StringSplitOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("delimiter", &delimiter_tensor));
     OP_REQUIRES(
         ctx, TensorShapeUtils::IsScalar(delimiter_tensor->shape()),
-        errors::InvalidArgument("delimiter must scalar, got shape: ",
+        errors::InvalidArgument("delimiter must be a scalar, got shape: ",
                                 delimiter_tensor->shape().DebugString()));
     const auto delimiter_vec = delimiter_tensor->flat<string>();
     const string& delimiter = delimiter_vec(0);
diff --git a/tensorflow/core/kernels/string_strip_op.cc b/tensorflow/core/kernels/string_strip_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae700f42942da0c5f937e5ad2859d39f04f32a49
--- /dev/null
+++ b/tensorflow/core/kernels/string_strip_op.cc
@@ -0,0 +1,53 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+class StringStripOp : public OpKernel {
+ public:
+  explicit StringStripOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+
+    for (int64 i = 0; i < input.size(); ++i) {
+      StringPiece entry(input(i));
+      str_util::RemoveWhitespaceContext(&entry);
+      output(i) = entry.ToString();
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringStrip").Device(DEVICE_CPU), StringStripOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.h b/tensorflow/core/kernels/string_to_hash_bucket_op.h
index 2fd22c3f4ed2bbc70eff08ba9b21c5811c0596d8..62ef35bbba401965882cc5cd146c69b41a106d1c 100644
--- a/tensorflow/core/kernels/string_to_hash_bucket_op.h
+++ b/tensorflow/core/kernels/string_to_hash_bucket_op.h
@@ -26,7 +26,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-template <uint64 hash(const string&)>
+template <uint64 hash(StringPiece)>
 class StringToHashBucketOp : public OpKernel {
  public:
   explicit StringToHashBucketOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h
index 02391e967a84b2d2ff015d541969163807b9adc2..1854fe55268e993411ba42fff8b046d30d9d7f5e 100644
--- a/tensorflow/core/kernels/summary_interface.h
+++ b/tensorflow/core/kernels/summary_interface.h
@@ -17,14 +17,15 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
 
+class Event;
+class GraphDef;
+
 // Main interface for the summary writer resource.
 class SummaryWriterInterface : public ResourceBase {
  public:
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index d317a8d33db5b69a84a0d193cb6322afaa53dff6..b287f0cc2f1337cff5705b5a40ba455b837307f9 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
index 3c46abb8ab1ab2aafeff140ae07f987a3eba7db5..9dcabcc5843a9db13d93723e4364959cc74ea820 100644
--- a/tensorflow/core/kernels/summary_op_test.cc
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -122,7 +123,7 @@ TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({3}), {1.0f, -0.73f, 10000.0f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not the same shape")) << s;
 }
 
 TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
@@ -133,7 +134,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -145,7 +146,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -256,7 +257,7 @@ TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
   AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
@@ -266,7 +267,7 @@ TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 // --------------------------------------------------------------------------
@@ -365,7 +366,7 @@ TEST_F(SummaryMergeOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}),
                             {s1.SerializeAsString(), s2.SerializeAsString()});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "Duplicate tag")) << s;
 }
 
 }  // namespace
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index af93d814ec06ff86c6c7eb3312d97224dee485f2..7ec26d95e6886d639d2dde5a61456898529be524 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -1104,9 +1104,9 @@ class TensorArrayUnpackOrScatterOp : public OpKernel {
       indices[1] = i;
 
       if (element_shape.num_elements() > 0) {
-        functor::Split<Device, T>()(ctx->eigen_device<Device>(),
-                                    tensor_value_i_t, tensor_value_t, indices,
-                                    sizes);
+        functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
+                                       tensor_value_i_t, tensor_value_t,
+                                       indices, sizes);
       }
 
       write_values.push_back(persistent_tensor);
@@ -1295,9 +1295,9 @@ class TensorArraySplitOp : public OpKernel {
         auto tensor_value_i_t = tensor_value_i->shaped<T, 3>(
             {1, tensor_lengths_t(i), elements_per_row});
 
-        functor::Split<Device, T>()(ctx->eigen_device<Device>(),
-                                    tensor_value_i_t, tensor_value_t, indices,
-                                    sizes);
+        functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
+                                       tensor_value_i_t, tensor_value_t,
+                                       indices, sizes);
       }
 
       write_values.push_back(persistent_tensor);
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index f6e2a5ae251f57eed99b5c968d806310be11440e..7e56e15450aba23e6625b27da34a29b1ad2ecce2 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
@@ -40,14 +41,27 @@ Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
     // updating.
     PersistentTensor unused;
     Tensor* tmp;
-    AllocatorAttributes attr;
-    attr.set_gpu_compatible(true);
-    attr.set_nic_compatible(true);
-    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-        tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
-    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
-    copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
-                 const_cast<const Tensor*>(tensor)->flat<T>());
+    if (std::is_same<T, Variant>::value) {
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+
+      const auto elements_in = tensor->flat<Variant>();
+      auto elements_out = tmp->flat<Variant>();
+      for (int64 i = 0; i < elements_in.size(); ++i) {
+        elements_out(i) = elements_in(i);
+      }
+    } else {
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+      functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+      copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
+                   const_cast<const Tensor*>(tensor)->flat<T>());
+    }
     *tensor = *tmp;
   }
   return Status::OK();
@@ -64,24 +78,21 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
                                   bool lock_held, bool sparse, Tensor* out) {
   if (ctx->input_dtype(input) == DT_RESOURCE) {
     Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      core::ScopedUnref unref_var(var);
-      if (lock_held) {
+    TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
+    core::ScopedUnref unref_var(var);
+    if (lock_held) {
+      TF_RETURN_IF_ERROR(
+          PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+      *out = *var->tensor();
+    } else {
+      mutex_lock ml(*var->mu());
+      if (!sparse) {
         TF_RETURN_IF_ERROR(
             PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        *out = *var->tensor();
-      } else {
-        mutex_lock ml(*var->mu());
-        if (!sparse) {
-          TF_RETURN_IF_ERROR(
-              PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        }
-        *out = *var->tensor();
       }
-      return Status::OK();
-    } else {
-      return errors::Internal("Invalid variable reference.");
+      *out = *var->tensor();
     }
+    return Status::OK();
   }
   *out = ctx->mutable_input(input, lock_held);
   return Status::OK();
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 233aa03c32333e62281cb8ab71828649b4fabe7e..271329599fa97a9799c10977bf8cf6629fa8afb3 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
 #include <algorithm>
 
 #include "tensorflow/core/framework/op_kernel.h"
@@ -151,8 +153,10 @@ struct ApplyAdagrad<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad) {
-    accum.device(d) += grad.square();
+                  typename TTypes<T>::ConstFlat grad, bool update_slots) {
+    if (update_slots) {
+      accum.device(d) += grad.square();
+    }
     var.device(d) -= grad * lr() * accum.rsqrt();
   }
 };
@@ -328,6 +332,27 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
+template <typename Device, typename T>
+struct ApplyAdaMaxNonCuda {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    // Here v is u in section 7.1
+    v.device(d) = (beta2() * v).cwiseMax(grad.abs());
+    // var is θ in section 7.1
+    var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon()));
+  }
+};
+
+template <typename T>
+struct ApplyAdaMax<CPUDevice, T> : ApplyAdaMaxNonCuda<CPUDevice, T> {};
+
 template <typename T>
 struct ApplyRMSProp<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -494,6 +519,7 @@ class ApplyGradientDescentOp<SYCLDevice, T> : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -647,6 +673,7 @@ class ApplyAdadeltaOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -822,6 +849,7 @@ class SparseApplyAdadeltaOp : public OpKernel {
   REGISTER_KERNELS(T, int64);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -1048,6 +1076,7 @@ class ApplyAdagradOp : public OpKernel {
  public:
   explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -1085,13 +1114,15 @@ class ApplyAdagradOp : public OpKernel {
 
     const Device& device = ctx->template eigen_device<Device>();
     functor::ApplyAdagrad<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
-                                       lr.scalar<T>(), grad.flat<T>());
+                                       lr.scalar<T>(), grad.flat<T>(),
+                                       update_slots_);
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
 
  private:
   bool use_exclusive_lock_;
+  bool update_slots_;
 };
 
 #define REGISTER_KERNELS(D, T)                                        \
@@ -1107,6 +1138,7 @@ class ApplyAdagradOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -1118,7 +1150,7 @@ namespace functor {
   void ApplyAdagrad<GPUDevice, T>::operator()(                            \
       const GPUDevice& d, typename TTypes<T>::Flat var,                   \
       typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
-      typename TTypes<T>::ConstFlat grad);                                \
+      typename TTypes<T>::ConstFlat grad, bool update_slots);             \
   extern template struct ApplyAdagrad<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -1239,6 +1271,7 @@ class SparseApplyAdagradOp : public OpKernel {
  public:
   explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_));
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
@@ -1312,7 +1345,9 @@ class SparseApplyAdagradOp : public OpKernel {
           auto a = accum_flat.template chip<0>(index);
           auto g = grad_flat.template chip<0>(i);
           auto v = var_flat.template chip<0>(index);
-          a += g.square();
+          if (update_slots_) {
+            a += g.square();
+          }
           v -= g.constant(lr_scalar) * g * a.rsqrt();
         }
       } else {
@@ -1331,7 +1366,9 @@ class SparseApplyAdagradOp : public OpKernel {
                                           " in indices is out of range")));
           T& a = accum_flat(index);
           const T& g = grad_flat(i);
-          a += g * g;
+          if (update_slots_) {
+            a += g * g;
+          }
           var_flat(index) -= lr_scalar * g / Eigen::numext::sqrt(a);
         }
       }
@@ -1342,6 +1379,7 @@ class SparseApplyAdagradOp : public OpKernel {
 
  private:
   bool use_exclusive_lock_;
+  bool update_slots_;
 };
 
 #define REGISTER_KERNELS(T, Tindices)                                \
@@ -1360,6 +1398,7 @@ class SparseApplyAdagradOp : public OpKernel {
   REGISTER_KERNELS(T, int64);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -1961,6 +2000,7 @@ class ApplyFtrlOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -1982,6 +2022,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -2230,6 +2271,7 @@ class SparseApplyFtrlOp : public OpKernel {
   REGISTER_KERNELS(T, int64);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -2254,6 +2296,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
   REGISTER_KERNELS(T, int64);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -2332,6 +2375,7 @@ class ApplyMomentumOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -2471,6 +2515,7 @@ class SparseApplyMomentumOp : public OpKernel {
   REGISTER_KERNELS(T, int64);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -2698,6 +2743,7 @@ class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -2737,6 +2783,135 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyAdaMaxOp : public OpKernel {
+ public:
+  explicit ApplyAdaMaxOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 2, use_exclusive_lock_, false, &v));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+
+    const Tensor& beta1_power = ctx->input(3);
+    const Tensor& lr = ctx->input(4);
+    const Tensor& beta1 = ctx->input(5);
+    const Tensor& beta2 = ctx->input(6);
+    const Tensor& epsilon = ctx->input(7);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(8);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdaMax<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+        beta1_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                     \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAdaMaxOp<D##Device, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax")                \
+                              .HostMemory("var")                   \
+                              .HostMemory("m")                     \
+                              .HostMemory("v")                     \
+                              .Device(DEVICE_##D)                  \
+                              .TypeConstraint<T>("T"),             \
+                          ApplyAdaMaxOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdaMax<GPUDevice, T>::operator()(                   \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad); \
+  extern template struct ApplyAdaMax<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyRMSPropOp : public OpKernel {
  public:
@@ -2937,6 +3112,7 @@ class ApplyCenteredRMSPropOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -3352,6 +3528,7 @@ class ApplyAddSignOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
@@ -3457,6 +3634,7 @@ class ApplyPowerSignOp : public OpKernel {
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
 
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 7ee956053abd320058963a8cc0bffa1fdc2e085c..495a94f1a1beaf1bfc79fee74063d4fb6e743705 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -68,7 +68,7 @@ struct ApplyAdagrad {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad);
+                  typename TTypes<T>::ConstFlat grad, bool update_slots);
 };
 
 template <typename Device, typename T>
@@ -139,6 +139,18 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyAdaMax {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyRMSProp {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 0376a3b2c602c13b3082b7762cf61a2b30552199..4bd32592db16b70b2731a6cf775dbf774263d283 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -42,8 +42,10 @@ struct ApplyAdagrad<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad) {
-    accum.device(d) += grad.square();
+                  typename TTypes<T>::ConstFlat grad, bool update_slots) {
+    if (update_slots) {
+      accum.device(d) += grad.square();
+    }
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
@@ -142,6 +144,32 @@ struct ApplyAdam<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAdaMax<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        (beta2.reshape(single).broadcast(bcast) * v).cwiseMax(grad.abs());
+    var.device(d) -=
+        lr / (beta1_power.constant(one) -
+                 beta1_power).reshape(single).broadcast(bcast) *
+                     (m / (v + epsilon));
+  }
+};
+
 template <typename T>
 struct ApplyRMSProp<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -278,6 +306,10 @@ template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;
 
+template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdaMax<GPUDevice, float>;
+template struct functor::ApplyAdaMax<GPUDevice, double>;
+
 template struct functor::ApplyRMSProp<GPUDevice, Eigen::half>;
 template struct functor::ApplyRMSProp<GPUDevice, float>;
 template struct functor::ApplyRMSProp<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 0ef8724b10e492373c7663a58420bfe236be7df7..31388e42904608f20edd48152330f9ad2fb7d0ca 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -223,6 +223,16 @@ class UniqueOp : public OpKernel {
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
+                          UniqueOp<type, int64>);                \
+  REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2")             \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("out_idx"), \
+                          UniqueOp<type, int32>)                 \
+  REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2")             \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("out_idx"), \
                           UniqueOp<type, int64>)
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
 REGISTER_UNIQUE(string)
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 764b6a252adf09c13511a01f95332857f46eee96..1e1647db5c1c41d6242cab87b0d8a8cf66d32a28 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -90,21 +90,21 @@ class UnpackOp : public OpKernel {
     }
 #endif  // TENSORFLOW_USE_SYCL
 
-    int64 before_dim = 1;
+    Eigen::DenseIndex before_dim = 1;
     for (int i = 0; i < axis; ++i) {
       before_dim *= input_shape.dim_size(i);
     }
 
-    int64 after_dim = 1;
+    Eigen::DenseIndex after_dim = 1;
     for (int i = axis + 1; i < input_shape.dims(); ++i) {
       after_dim *= input_shape.dim_size(i);
     }
-    const int64 axis_dim = input_shape.dim_size(axis);
+    const Eigen::DenseIndex axis_dim = input_shape.dim_size(axis);
 
     // Except for shape, unpack is a special case of split, so we reuse the
     // same computational kernels.
     auto input_reshaped =
-        input.shaped<T, 3>({1, before_dim, axis_dim * after_dim});
+        input.shaped<T, 2>({before_dim, axis_dim * after_dim});
 
     for (int i = 0; i < num; ++i) {
       Tensor* output;
@@ -112,12 +112,12 @@ class UnpackOp : public OpKernel {
                      context->allocate_output(i, output_shape, &output));
 
       if (output_shape.num_elements() > 0) {
-        auto output_shaped = output->shaped<T, 3>({1, before_dim, after_dim});
-        Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, i * after_dim};
-        Eigen::DSizes<Eigen::DenseIndex, 3> sizes{1, before_dim, after_dim};
-        functor::Split<Device, T>()(context->eigen_device<Device>(),
-                                    output_shaped, input_reshaped, indices,
-                                    sizes);
+        auto output_shaped = output->shaped<T, 2>({before_dim, after_dim});
+        Eigen::DSizes<Eigen::DenseIndex, 2> indices{0, i * after_dim};
+        Eigen::DSizes<Eigen::DenseIndex, 2> sizes{before_dim, after_dim};
+        functor::Split<Device, T, 2>()(context->eigen_device<Device>(),
+                                       output_shaped, input_reshaped, indices,
+                                       sizes);
       }
     }
   }
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 83134bad378bfef18c3e93be5cc3c6b70ab4f523..f27dab4dddab8776f3043f21cc67c5db89209d5a 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -27,31 +28,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager
-// (new, resource-style version).
-class Var : public ResourceBase {
- public:
-  explicit Var(DataType dtype) : tensor_(dtype) {}
-  // Not copyable or movable.
-  Var(const Var&) = delete;
-  Var& operator=(const Var&) = delete;
-
-  // TODO(ebrevdo): Use LockSet instead of exposing mu.
-  mutex* mu() { return &mu_; }
-  Tensor* tensor() { return &tensor_; }
-
-  string DebugString() override {
-    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
-                           tensor_.shape().DebugString());
-  }
-
- private:
-  mutex mu_;
-  Tensor tensor_;
-
-  ~Var() override {}
-};
-
 class VariableOp : public OpKernel {
  public:
   explicit VariableOp(OpKernelConstruction* context);
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index f92c4ed17af501eaf79523bc6977e614b8168720..3330442ffd602c7293a4ddc3c675524698364c4e 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -42,7 +42,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/platform/cuda.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -278,8 +278,7 @@ class WhereGPUOp : public AsyncOpKernel {
 
     auto num_true_t = num_true.scalar<Tindex>();
 
-    perftools::gputools::DeviceMemoryBase num_true_ptr(
-        static_cast<void*>(num_true_t.data()));
+    se::DeviceMemoryBase num_true_ptr(static_cast<void*>(num_true_t.data()));
     // Push kernel to stream to get number of true elements.
     const GPUDevice& d = context->eigen_device<GPUDevice>();
     Status s = functor::NumTrue<GPUDevice, T, Tindex>::Compute(
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index a6a71fdfaf126410b26766954c0c2fc5b86d003a..9a3612bd72cdc2bc1c3c471beed6616816072a71 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -17,12 +17,14 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/xent_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/xent_op.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
@@ -41,37 +43,56 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& logits_in = context->input(0);
     const Tensor& labels_in = context->input(1);
-    OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
-                errors::InvalidArgument(
-                    "logits and labels must be same size: logits_size=",
-                    logits_in.shape().DebugString(),
-                    " labels_size=", labels_in.shape().DebugString()));
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
-                errors::InvalidArgument("logits must be 2-dimensional"));
-    // As we already tested that both inputs have the same shape no need to
-    // check that "labels" is a matrix too.
+
+    TensorShape shape_in = logits_in.shape();
+
+    BCast bcast(BCast::FromShape(logits_in.shape()),
+                BCast::FromShape(labels_in.shape()));
+    if (!logits_in.IsSameSize(labels_in)) {
+      OP_REQUIRES(context, bcast.IsValid(),
+                  errors::InvalidArgument(
+                      "logits and labels must be broadcastable: logits_size=",
+                      logits_in.shape().DebugString(),
+                      " labels_size=", labels_in.shape().DebugString()));
+      shape_in = BCast::ToShape(bcast.output_shape());
+    }
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(shape_in),
+                errors::InvalidArgument("logits and labels must be beither "
+                                        "2-dimensional, or roadcasted to "
+                                        "2-dimensional"));
 
     // loss is 1-D (one per example), and size is batch_size.
 
     Tensor scratch;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DataTypeToEnum<T>::value,
-                                        TensorShape({logits_in.dim_size(0), 1}),
+                                        TensorShape({shape_in.dim_size(0), 1}),
                                         &scratch));
 
     Tensor* loss_out = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(
-                       0, TensorShape({logits_in.dim_size(0)}), &loss_out));
+                       0, TensorShape({shape_in.dim_size(0)}), &loss_out));
     Tensor* back_out = nullptr;
     // Try to reuse the logits_in buffer for the backprop output.
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {0}, 1, logits_in.shape(), &back_out));
-    if (logits_in.dim_size(0) > 0) {
+                                {0}, 1, shape_in, &back_out));
+    if (shape_in.dim_size(0) > 0) {
       functor::XentFunctor<Device, T> functor;
-      functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
-              labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
-              back_out->matrix<T>());
+      if (logits_in.IsSameSize(labels_in)) {
+        functor(context->eigen_device<Device>(), shape_in.AsEigenDSizes<2>(),
+                Eigen::array<Eigen::DenseIndex, 2>{1, 1},
+                Eigen::array<Eigen::DenseIndex, 2>{1, 1}, logits_in.matrix<T>(),
+                labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
+                back_out->matrix<T>());
+      } else {
+        functor(context->eigen_device<Device>(), shape_in.AsEigenDSizes<2>(),
+                BCast::ToIndexArray<2>(bcast.x_bcast()),
+                BCast::ToIndexArray<2>(bcast.y_bcast()),
+                logits_in.template shaped<T, 2>(bcast.x_reshape()),
+                labels_in.template shaped<T, 2>(bcast.y_reshape()),
+                scratch.matrix<T>(), loss_out->vec<T>(), back_out->matrix<T>());
+      }
     }
   }
 };
@@ -81,13 +102,17 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
 namespace functor {
 template <typename Device, typename T>
 struct XentFunctorBase {
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(const Device& d,
+                  const Eigen::DSizes<Eigen::DenseIndex, 2>& shape,
+                  const Eigen::array<Eigen::DenseIndex, 2>& logits_bcast,
+                  const Eigen::array<Eigen::DenseIndex, 2>& labels_bcast,
+                  typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
-                                      backprop);
+    XentEigenImpl<Device, T>::Compute(d, shape, logits_bcast, labels_bcast,
+                                      logits, labels, scratch, loss, backprop);
   }
 };
 
diff --git a/tensorflow/core/kernels/xent_op.h b/tensorflow/core/kernels/xent_op.h
index e689fca7ff822cdebe68fa0d7f197a03f74104ce..87be17fca98d756a179a74552518a13484d03850 100644
--- a/tensorflow/core/kernels/xent_op.h
+++ b/tensorflow/core/kernels/xent_op.h
@@ -18,6 +18,7 @@ limitations under the License.
 // Functor definition for XentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
@@ -33,7 +34,11 @@ struct XentFunctor {
   // scratch: temporary tensor, dims: batch_size, 1
   // loss: output tensor for the loss, dims: batch_size.
   // backprop: output tensor for the backprop, dims: batch_size, num_classes.
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(const Device &d,
+                  const Eigen::DSizes<Eigen::DenseIndex, 2> &shape,
+                  const Eigen::array<Eigen::DenseIndex, 2> &logits_bcast,
+                  const Eigen::array<Eigen::DenseIndex, 2> &labels_bcast,
+                  typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
@@ -45,7 +50,11 @@ struct XentFunctor {
 // specializations for both device types.
 template <typename Device, typename T>
 struct XentEigenImpl {
-  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  static void Compute(const Device &d,
+                      const Eigen::DSizes<Eigen::DenseIndex, 2> &shape,
+                      const Eigen::array<Eigen::DenseIndex, 2> &logits_bcast,
+                      const Eigen::array<Eigen::DenseIndex, 2> &labels_bcast,
+                      typename TTypes<T>::ConstMatrix logits,
                       typename TTypes<T>::ConstMatrix labels,
                       typename TTypes<T>::Matrix scratch,
                       typename TTypes<T>::Vec loss,
@@ -57,8 +66,8 @@ struct XentEigenImpl {
     const int kBatchDim = 0;
     const int kClassDim = 1;
 
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
+    const int batch_size = shape[kBatchDim];
+    const int num_classes = shape[kClassDim];
 
 // These arrays are used to reduce along the class dimension, and broadcast
 // the resulting value to all classes.
@@ -84,10 +93,12 @@ struct XentEigenImpl {
 #endif
 
     // max_logits along classes.
-    scratch.reshape(batch_only).device(d) = logits.maximum(along_class);
+    scratch.reshape(batch_only).device(d) =
+        logits.broadcast(logits_bcast).maximum(along_class);
 
     // logits - max_logits.
-    backprop.device(d) = logits - scratch.broadcast(one_by_class);
+    backprop.device(d) =
+        logits.broadcast(logits_bcast) - scratch.broadcast(one_by_class);
 
     // sum(exp(logits - max_logits)) along classes.
     scratch.reshape(batch_only).device(d) = backprop.exp().sum(along_class);
@@ -99,15 +110,15 @@ struct XentEigenImpl {
     //  sum(-labels *
     //     ((logits - max_logits) - log(sum(exp(logits - max_logits)))))
     //  along classes
-    loss.device(d) =
-        (labels * (scratch.log().eval().broadcast(one_by_class) - backprop))
-            .eval()
-            .sum(along_class);
+    loss.device(d) = (labels.broadcast(labels_bcast) *
+                      (scratch.log().eval().broadcast(one_by_class) - backprop))
+                         .eval()
+                         .sum(along_class);
 
     // backprop: prob - labels, where
     //   prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
-    backprop.device(d) =
-        (backprop.exp() / scratch.broadcast(one_by_class)) - labels;
+    backprop.device(d) = (backprop.exp() / scratch.broadcast(one_by_class)) -
+                         labels.broadcast(labels_bcast);
   }
 };
 
diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc
index 05ee7da490e34d427cc4023bb33322c35006acf5..2c0c0b3a027e28c9502b162aa491dac83fea5fdd 100644
--- a/tensorflow/core/kernels/xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc
@@ -31,12 +31,17 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 template <typename T>
 struct XentFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(const GPUDevice &d,
+                  const Eigen::DSizes<Eigen::DenseIndex, 2> &shape,
+                  const Eigen::array<Eigen::DenseIndex, 2> &logits_bcast,
+                  const Eigen::array<Eigen::DenseIndex, 2> &labels_bcast,
+                  typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    XentEigenImpl<GPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+    XentEigenImpl<GPUDevice, T>::Compute(d, shape, logits_bcast, labels_bcast,
+                                         logits, labels, scratch, loss,
                                          backprop);
   }
 };
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index ba03357cc6ac22d42f9f1cceab6875ef7e49b4c2..f8c06988cbac021d1f0924ca274c8bee5e9272a5 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -16,7 +16,7 @@ limitations under the License.
 // Make this file empty (or nearly empty) so that it can be compiled even when
 // libxsmm is not available.
 
-#ifndef TENSORFLOW_USE_LIBXSMM
+#ifndef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 void dummy_xsmm_conv2d_ensure_file_is_not_empty();
 #else
 
@@ -32,9 +32,9 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty();
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
-#include "libxsmm_main.h"  // TODO(bsteiner): API to avoid incl. header from src/
 #include "include/libxsmm_cpuid.h"
 #include "include/libxsmm_malloc.h"
+#include "third_party/libxsmm/src/libxsmm_main.h"  // TODO(bsteiner): API to avoid incl. header from src/
 
 namespace tensorflow {
 
@@ -173,8 +173,16 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
                                    InputPtr input, FilterPtr filter,
                                    OutputPtr output) {
 #if defined(LIBXSMM_DETAILED_TIMING)
-  unsigned long long l_tick1, l_tick2, l_tick3, l_tick4, l_tick5, l_tick6,
-      l_tick7, l_tick8, l_tick9, l_tick10;
+  uint64 l_tick1;
+  uint64 l_tick2;
+  uint64 l_tick3;
+  uint64 l_tick4;
+  uint64 l_tick5;
+  uint64 l_tick6;
+  uint64 l_tick7;
+  uint64 l_tick8;
+  uint64 l_tick9;
+  uint64 l_tick10;
   l_tick1 = libxsmm_timer_tick();
 #endif
   // setup scoped allocator, which adopts the allocator from the context
@@ -453,6 +461,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   return true;  // Succeeded
 }
 
+#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
 template <typename T>
 struct XsmmFwdConv2D<CPUDevice, T> {
   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
@@ -461,7 +470,9 @@ struct XsmmFwdConv2D<CPUDevice, T> {
                                   input, filter, output);
   }
 };
+#endif
 
+#ifdef TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
 template <typename T>
 struct XsmmBkwInputConv2D<CPUDevice, T> {
   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
@@ -479,6 +490,7 @@ struct XsmmBkwFilterConv2D<CPUDevice, T> {
                                   input, filter, output);
   }
 };
+#endif
 
 }  // namespace functor
 
@@ -488,4 +500,4 @@ template struct functor::XsmmBkwFilterConv2D<CPUDevice, float>;
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_USE_LIBXSMM
+#endif  // TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index f9cca0ef2ab90c677e47d979a4636b3fc25ec919..2c0576ff10e7c7cee7a6a64d7d346a7f4240057c 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -16,8 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
 #define TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
 
+#include <cmath>
 #include <complex>
 
+#include "tensorflow/core/platform/byte_order.h"
+
 #ifdef __CUDACC__
 // All functions callable from CUDA code must be qualified with __device__
 #define B16_DEVICE_FUNC __host__ __device__
@@ -85,15 +88,13 @@ struct bfloat16 {
       : bfloat16(static_cast<float>(val)) {}
 
   B16_DEVICE_FUNC explicit operator float() const {
-    float result;
+    float result = 0;
 
     uint16_t* q = reinterpret_cast<uint16_t*>(&result);
 
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     q[0] = value;
-    q[1] = 0;
 #else
-    q[0] = 0;
     q[1] = value;
 #endif
     return result;
@@ -161,6 +162,192 @@ struct bfloat16 {
     return complex128(double(*this), double(0.0));
   }
 
+  union FP32 {
+    unsigned int u;
+    float f;
+  };
+
+  // Converts a float point to bfloat16, with round-nearest-to-even as rounding
+  // method.
+  // TODO(b/69266521): Add a truncate_to_bfloat16 function and make this
+  // function as default behavior.
+  // TODO: There is a slightly faster implementation (8% faster on CPU)
+  // than this (documented in cl/175987786), that is exponentially harder to
+  // understand and document. Switch to the faster version when converting to
+  // BF16 becomes compute-bound.
+  B16_DEVICE_FUNC static bfloat16 round_to_bfloat16(float v) {
+    uint32_t input;
+    FP32 f;
+    f.f = v;
+    input = f.u;
+    bfloat16 output;
+
+    if (float_isnan(v)) {
+      // If the value is a NaN, squash it to a qNaN with msb of fraction set,
+      // this makes sure after truncation we don't end up with an inf.
+      //
+      // qNaN magic: All exponent bits set + most significant bit of fraction
+      // set.
+      output.value = 0x7fc0;
+    } else {
+      // Fast rounding algorithm that rounds a half value to nearest even. This
+      // reduces expected error when we convert a large number of floats. Here
+      // is how it works:
+      //
+      // Definitions:
+      // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
+      // with the following tags:
+      //
+      // Sign |  Exp (8 bits) | Frac (23 bits)
+      //  S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
+      //
+      //  S: Sign bit.
+      //  E: Exponent bits.
+      //  F: First 6 bits of fraction.
+      //  L: Least significant bit of resulting bfloat16 if we truncate away the
+      //  rest of the float32. This is also the 7th bit of fraction
+      //  R: Rounding bit, 8th bit of fraction.
+      //  T: Sticky bits, rest of fraction, 15 bits.
+      //
+      // To round half to nearest even, there are 3 cases where we want to round
+      // down (simply truncate the result of the bits away, which consists of
+      // rounding bit and sticky bits) and two cases where we want to round up
+      // (truncate then add one to the result).
+      //
+      // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
+      // 1s) as the rounding bias, adds the rounding bias to the input, then
+      // truncates the last 16 bits away.
+      //
+      // To understand how it works, we can analyze this algorithm case by case:
+      //
+      // 1. L = 0, R = 0:
+      //   Expect: round down, this is less than half value.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 0 = 0x7fff
+      //   - Adding rounding bias to input may create any carry, depending on
+      //   whether there is any value set to 1 in T bits.
+      //   - R may be set to 1 if there is a carry.
+      //   - L remains 0.
+      //   - Note that this case also handles Inf and -Inf, where all fraction
+      //   bits, including L, R and Ts are all 0. The output remains Inf after
+      //   this algorithm.
+      //
+      // 2. L = 1, R = 0:
+      //   Expect: round down, this is less than half value.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 1 = 0x8000
+      //   - Adding rounding bias to input doesn't change sticky bits but
+      //   adds 1 to rounding bit.
+      //   - L remains 1.
+      //
+      // 3. L = 0, R = 1, all of T are 0:
+      //   Expect: round down, this is exactly at half, the result is already
+      //   even (L=0).
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 0 = 0x7fff
+      //   - Adding rounding bias to input sets all sticky bits to 1, but
+      //   doesn't create a carry.
+      //   - R remains 1.
+      //   - L remains 0.
+      //
+      // 4. L = 1, R = 1:
+      //   Expect: round up, this is exactly at half, the result needs to be
+      //   round to the next even number.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 1 = 0x8000
+      //   - Adding rounding bias to input doesn't change sticky bits, but
+      //   creates a carry from rounding bit.
+      //   - The carry sets L to 0, creates another carry bit and propagate
+      //   forward to F bits.
+      //   - If all the F bits are 1, a carry then propagates to the exponent
+      //   bits, which then creates the minimum value with the next exponent
+      //   value. Note that we won't have the case where exponents are all 1,
+      //   since that's either a NaN (handled in the other if condition) or inf
+      //   (handled in case 1).
+      //
+      // 5. L = 0, R = 1, any of T is 1:
+      //   Expect: round up, this is greater than half.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 0 = 0x7fff
+      //   - Adding rounding bias to input creates a carry from sticky bits,
+      //   sets rounding bit to 0, then create another carry.
+      //   - The second carry sets L to 1.
+      //
+      // Examples:
+      //
+      //  Exact half value that is already even:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0     1000000000000000
+      //
+      //     This falls into case 3. We truncate the rest of 16 bits and no
+      //     carry is created into F and L:
+      //
+      //    Output:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+      //
+      //  Exact half value, round to next even number:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 0 1     1000000000000000
+      //
+      //     This falls into case 4. We create a carry from R and T,
+      //     which then propagates into L and F:
+      //
+      //    Output:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+      //
+      //
+      //  Max denormal value round to min normal value:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     0 0 0 0 0 0 0 0      1 1 1 1 1 1 1     1111111111111111
+      //
+      //     This falls into case 4. We create a carry from R and T,
+      //     propagate into L and F, which then propagates into exponent
+      //     bits:
+      //
+      //    Output:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     0 0 0 0 0 0 0 1      0 0 0 0 0 0 0
+      //
+      //  Max normal value round to Inf:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     1 1 1 1 1 1 1 0      1 1 1 1 1 1 1     1111111111111111
+      //
+      //     This falls into case 4. We create a carry from R and T,
+      //     propagate into L and F, which then propagates into exponent
+      //     bits:
+      //
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     1 1 1 1 1 1 1 1      0 0 0 0 0 0 0
+      //
+      //
+      // Least significant bit of resulting bfloat.
+      uint32_t lsb = (input >> 16) & 1;
+      uint32_t rounding_bias = 0x7fff + lsb;
+      input += rounding_bias;
+      output.value = static_cast<uint16_t>(input >> 16);
+    }
+    return output;
+  }
+
   static bfloat16 epsilon() {
     bfloat16 x;
     x.value = 0x3c00;  // 0x1.0p-7
@@ -173,7 +360,7 @@ struct bfloat16 {
   static const uint16_t NAN_VALUE = 0x7FC0;
 
  private:
-  B16_DEVICE_FUNC bool float_isnan(const float& x) {
+  B16_DEVICE_FUNC static bool float_isnan(const float& x) {
 #ifdef __CUDA_ARCH__
     return ::isnan(x);
 #else
@@ -271,6 +458,35 @@ struct hash<tensorflow::bfloat16> {
     return hash<float>()(static_cast<float>(v));
   }
 };
+
+using tensorflow::bfloat16;
+inline bool isinf(const bfloat16& a) { return std::isinf(float(a)); }
+inline bool isnan(const bfloat16& a) { return std::isnan(float(a)); }
+inline bool isfinite(const bfloat16& a) { return std::isfinite(float(a)); }
+inline bfloat16 abs(const bfloat16& a) { return bfloat16(std::abs(float(a))); }
+inline bfloat16 exp(const bfloat16& a) { return bfloat16(std::exp(float(a))); }
+inline bfloat16 log(const bfloat16& a) { return bfloat16(std::log(float(a))); }
+inline bfloat16 log10(const bfloat16& a) {
+  return bfloat16(std::log10(float(a)));
+}
+inline bfloat16 sqrt(const bfloat16& a) {
+  return bfloat16(std::sqrt(float(a)));
+}
+inline bfloat16 pow(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(std::pow(float(a), float(b)));
+}
+inline bfloat16 sin(const bfloat16& a) { return bfloat16(std::sin(float(a))); }
+inline bfloat16 cos(const bfloat16& a) { return bfloat16(std::cos(float(a))); }
+inline bfloat16 tan(const bfloat16& a) { return bfloat16(std::tan(float(a))); }
+inline bfloat16 tanh(const bfloat16& a) {
+  return bfloat16(std::tanh(float(a)));
+}
+inline bfloat16 floor(const bfloat16& a) {
+  return bfloat16(std::floor(float(a)));
+}
+inline bfloat16 ceil(const bfloat16& a) {
+  return bfloat16(std::ceil(float(a)));
+}
 }  // namespace std
 
 #endif  // TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
diff --git a/tensorflow/core/lib/core/coding.cc b/tensorflow/core/lib/core/coding.cc
index bb95c27410419cba557ea5e47d69786f969bfa57..50872eef83a591718468173b5940d46079924638 100644
--- a/tensorflow/core/lib/core/coding.cc
+++ b/tensorflow/core/lib/core/coding.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 namespace core {
diff --git a/tensorflow/core/lib/core/error_codes.proto b/tensorflow/core/lib/core/error_codes.proto
index a7306c8cc1212118a22b25efc67b4589316b207d..b82d3891460cb416aecb9058f9bd9c2d2693c197 100644
--- a/tensorflow/core/lib/core/error_codes.proto
+++ b/tensorflow/core/lib/core/error_codes.proto
@@ -119,7 +119,7 @@ enum Code {
   // Operation is not implemented or not supported/enabled in this service.
   UNIMPLEMENTED = 12;
 
-  // Internal errors.  Means some invariants expected by underlying
+  // Internal errors.  Means some invariant expected by the underlying
   // system has been broken.  If you see one of these errors,
   // something is very broken.
   INTERNAL = 13;
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index 1fd62755d835bb282d79babb96700b5abcb0086b..51c09032dfb47fd699f142c98a091b1fab617782 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_CORE_ERRORS_H_
 #define TENSORFLOW_LIB_CORE_ERRORS_H_
 
+#include <sstream>
+
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -26,6 +28,33 @@ namespace errors {
 
 typedef ::tensorflow::error::Code Code;
 
+namespace internal {
+
+// The DECLARE_ERROR macro below only supports types that can be converted
+// into StrCat's AlphaNum. For the other types we rely on a slower path
+// through std::stringstream. To add support of a new type, it is enough to
+// make sure there is an operator<<() for it:
+//
+//   std::ostream& operator<<(std::ostream& os, const MyType& foo) {
+//     os << foo.ToString();
+//     return os;
+//   }
+// Eventually absl::strings will have native support for this and we will be
+// able to completely remove PrepareForStrCat().
+template <typename T>
+typename std::enable_if<!std::is_constructible<strings::AlphaNum, T>::value,
+                        string>::type
+PrepareForStrCat(const T& t) {
+  std::stringstream ss;
+  ss << t;
+  return ss.str();
+}
+inline const strings::AlphaNum& PrepareForStrCat(const strings::AlphaNum& a) {
+  return a;
+}
+
+}  // namespace internal
+
 // Append some context to an error message.  Each time we append
 // context put it on a new line, since it is possible for there
 // to be several layers of additional context.
@@ -61,8 +90,10 @@ void AppendToMessage(::tensorflow::Status* status, Args... args) {
 #define DECLARE_ERROR(FUNC, CONST)                                       \
   template <typename... Args>                                            \
   ::tensorflow::Status FUNC(Args... args) {                              \
-    return ::tensorflow::Status(::tensorflow::error::CONST,              \
-                                ::tensorflow::strings::StrCat(args...)); \
+    return ::tensorflow::Status(                                         \
+        ::tensorflow::error::CONST,                                      \
+        ::tensorflow::strings::StrCat(                                   \
+            ::tensorflow::errors::internal::PrepareForStrCat(args)...)); \
   }                                                                      \
   inline bool Is##FUNC(const ::tensorflow::Status& status) {             \
     return status.code() == ::tensorflow::error::CONST;                  \
diff --git a/tensorflow/core/lib/core/raw_coding.h b/tensorflow/core/lib/core/raw_coding.h
index bbfd33d3037456e88d43fdae6e8570155f7ae90b..37201b755d5a37fd63b20c34fdbcb1f8c23e15a1 100644
--- a/tensorflow/core/lib/core/raw_coding.h
+++ b/tensorflow/core/lib/core/raw_coding.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LIB_CORE_RAW_CODING_H_
 
 #include <string.h>
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 29b727fc4463d933ceeb402c5dd92f3ea5b8a62a..4c488066e4b44bd7f38735ebcc944586c1f2af36 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -17,23 +17,14 @@ limitations under the License.
 
 #include <algorithm>
 #include <iostream>
-#include "tensorflow/core/lib/hash/hash.h"
 
 namespace tensorflow {
 
-size_t StringPieceHasher::operator()(StringPiece s) const {
-  return Hash64(s.data(), s.size());
-}
-
 std::ostream& operator<<(std::ostream& o, StringPiece piece) {
   o.write(piece.data(), piece.size());
   return o;
 }
 
-bool StringPiece::contains(StringPiece s) const {
-  return std::search(begin(), end(), s.begin(), s.end()) != end();
-}
-
 size_t StringPiece::find(char c, size_t pos) const {
   if (pos >= size_) {
     return npos;
@@ -60,6 +51,4 @@ StringPiece StringPiece::substr(size_t pos, size_t n) const {
   return StringPiece(data_ + pos, n);
 }
 
-const StringPiece::size_type StringPiece::npos = size_type(-1);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index caa9642774bebec05a28b7a0c2ea71d18d6ebd1a..d7ecc44e507e25f4536acc8895ce219d37fb1f8e 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -35,8 +35,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-struct StringPieceHasher;
-
 class StringPiece {
  public:
   typedef size_t size_type;
@@ -67,7 +65,7 @@ class StringPiece {
   iterator begin() const { return data_; }
   iterator end() const { return data_ + size_; }
 
-  static const size_t npos;
+  static const size_t npos = size_type(-1);
 
   // Return the ith byte in the referenced data.
   // REQUIRES: n < size()
@@ -90,22 +88,11 @@ class StringPiece {
 
   size_t find(char c, size_t pos = 0) const;
   size_t rfind(char c, size_t pos = npos) const;
-  bool contains(StringPiece s) const;
-
-  // Checks whether StringPiece starts with x and if so advances the beginning
-  // of it to past the match.  It's basically a shortcut for starts_with
-  // followed by remove_prefix.
-  bool Consume(StringPiece x) {
-    if (starts_with(x)) {
-      remove_prefix(x.size_);
-      return true;
-    }
-    return false;
-  }
 
   StringPiece substr(size_t pos, size_t n = npos) const;
 
   // Return a string that contains the copy of the referenced data.
+  // DEPRECATED: use std::string(sv) instead.
   std::string ToString() const { return std::string(data_, size_); }
 
   // Three-way comparison.  Returns value:
@@ -114,14 +101,11 @@ class StringPiece {
   //   >  0 iff "*this" >  "b"
   int compare(StringPiece b) const;
 
-  // Return true iff "x" is a prefix of "*this"
-  bool starts_with(StringPiece x) const {
-    return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
-  }
-  // Return true iff "x" is a suffix of "*this"
-  bool ends_with(StringPiece x) const {
-    return ((size_ >= x.size_) &&
-            (memcmp(data_ + (size_ - x.size_), x.data_, x.size_) == 0));
+  // Converts to `std::basic_string`.
+  template <typename A>
+  explicit operator std::basic_string<char, std::char_traits<char>, A>() const {
+    if (!data()) return {};
+    return std::basic_string<char, std::char_traits<char>, A>(data(), size());
   }
 
  private:
@@ -131,10 +115,6 @@ class StringPiece {
   // Intentionally copyable
 };
 
-struct StringPieceHasher {
-  size_t operator()(StringPiece s) const;
-};
-
 inline bool operator==(StringPiece x, StringPiece y) {
   return ((x.size() == y.size()) &&
           (memcmp(x.data(), y.data(), x.size()) == 0));
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index 8f17b85b6d7941d7084ce4e142de4ad33f1e8202..952b9eaaaae43a502f06816d7536f3af57266b43 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -55,84 +55,9 @@ TEST(StringPiece, Ctor) {
   }
 }
 
-TEST(StringPiece, Contains) {
-  StringPiece a("abcdefg");
-  StringPiece b("abcd");
-  StringPiece c("efg");
-  StringPiece d("gh");
-  EXPECT_TRUE(a.contains(b));
-  EXPECT_TRUE(a.contains(c));
-  EXPECT_TRUE(!a.contains(d));
+TEST(StringPiece, ConversionToString) {
+  EXPECT_EQ("", std::string(StringPiece("")));
+  EXPECT_EQ("foo", std::string(StringPiece("foo")));
 }
 
-TEST(StringPieceHasher, Equality) {
-  StringPieceHasher hasher;
-
-  StringPiece s1("foo");
-  StringPiece s2("bar");
-  StringPiece s3("baz");
-  StringPiece s4("zot");
-
-  EXPECT_TRUE(hasher(s1) != hasher(s2));
-  EXPECT_TRUE(hasher(s1) != hasher(s3));
-  EXPECT_TRUE(hasher(s1) != hasher(s4));
-  EXPECT_TRUE(hasher(s2) != hasher(s3));
-  EXPECT_TRUE(hasher(s2) != hasher(s4));
-  EXPECT_TRUE(hasher(s3) != hasher(s4));
-
-  EXPECT_TRUE(hasher(s1) == hasher(s1));
-  EXPECT_TRUE(hasher(s2) == hasher(s2));
-  EXPECT_TRUE(hasher(s3) == hasher(s3));
-  EXPECT_TRUE(hasher(s4) == hasher(s4));
-}
-
-TEST(StringPieceHasher, HashMap) {
-  string s1("foo");
-  string s2("bar");
-  string s3("baz");
-
-  StringPiece p1(s1);
-  StringPiece p2(s2);
-  StringPiece p3(s3);
-
-  std::unordered_map<StringPiece, int, StringPieceHasher> map;
-
-  map.insert(std::make_pair(p1, 0));
-  map.insert(std::make_pair(p2, 1));
-  map.insert(std::make_pair(p3, 2));
-  EXPECT_EQ(map.size(), 3);
-
-  bool found[3] = {false, false, false};
-  for (auto const& val : map) {
-    int x = val.second;
-    EXPECT_TRUE(x >= 0 && x < 3);
-    EXPECT_TRUE(!found[x]);
-    found[x] = true;
-  }
-  EXPECT_EQ(found[0], true);
-  EXPECT_EQ(found[1], true);
-  EXPECT_EQ(found[2], true);
-
-  auto new_iter = map.find("zot");
-  EXPECT_TRUE(new_iter == map.end());
-
-  new_iter = map.find("bar");
-  EXPECT_TRUE(new_iter != map.end());
-
-  map.erase(new_iter);
-  EXPECT_EQ(map.size(), 2);
-
-  found[0] = false;
-  found[1] = false;
-  found[2] = false;
-  for (const auto& iter : map) {
-    int x = iter.second;
-    EXPECT_TRUE(x >= 0 && x < 3);
-    EXPECT_TRUE(!found[x]);
-    found[x] = true;
-  }
-  EXPECT_EQ(found[0], true);
-  EXPECT_EQ(found[1], false);
-  EXPECT_EQ(found[2], true);
-}
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index e55ed79d36cd2db7a6f6b19f3579f47e73b4b2d9..99684ae47b547dea63a65701e3c317eb1ad5766c 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -59,10 +59,9 @@ struct EigenEnvironment {
 
   Task CreateTask(std::function<void()> f) {
     uint64 id = 0;
-    if (port::Tracing::IsActive()) {
-      id = port::Tracing::UniqueId();
-      port::Tracing::RecordEvent(port::Tracing::EventCategory::kScheduleClosure,
-                                 id);
+    if (tracing::EventCollector::IsEnabled()) {
+      id = tracing::GetUniqueArg();
+      tracing::RecordEvent(tracing::EventCategory::kScheduleClosure, id);
     }
     return Task{
         std::unique_ptr<TaskImpl>(new TaskImpl{
@@ -75,13 +74,9 @@ struct EigenEnvironment {
 
   void ExecuteTask(const Task& t) {
     WithContext wc(t.f->context);
-    if (t.f->trace_id != 0) {
-      port::Tracing::ScopedActivity region(
-          port::Tracing::EventCategory::kRunClosure, t.f->trace_id);
-      t.f->f();
-    } else {
-      t.f->f();
-    }
+    tracing::ScopedRegion region(tracing::EventCategory::kRunClosure,
+                                 t.f->trace_id);
+    t.f->f();
   }
 };
 
diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc
index 627ef5a892a35ec43d0c31220dcf046b4b8eda55..320f3ebb8328b23c5e0b10ae2effe1de2528246b 100644
--- a/tensorflow/core/lib/core/threadpool_test.cc
+++ b/tensorflow/core/lib/core/threadpool_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/core/platform/context.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/test.h"
@@ -35,6 +36,7 @@ TEST(ThreadPool, Empty) {
 }
 
 TEST(ThreadPool, DoWork) {
+  Context outer_context(ContextKind::kThread);
   for (int num_threads = 1; num_threads < kNumThreads; num_threads++) {
     fprintf(stderr, "Testing with %d threads\n", num_threads);
     const int kWorkItems = 15;
@@ -45,7 +47,9 @@ TEST(ThreadPool, DoWork) {
     {
       ThreadPool pool(Env::Default(), "test", num_threads);
       for (int i = 0; i < kWorkItems; i++) {
-        pool.Schedule([&work, i]() {
+        pool.Schedule([&outer_context, &work, i]() {
+          Context inner_context(ContextKind::kThread);
+          ASSERT_EQ(outer_context, inner_context);
           ASSERT_FALSE(work[i]);
           work[i] = true;
         });
@@ -58,6 +62,7 @@ TEST(ThreadPool, DoWork) {
 }
 
 TEST(ThreadPool, ParallelFor) {
+  Context outer_context(ContextKind::kThread);
   // Make ParallelFor use as many threads as possible.
   int64 kHugeCost = 1 << 30;
   for (int num_threads = 1; num_threads < kNumThreads; num_threads++) {
@@ -68,12 +73,15 @@ TEST(ThreadPool, ParallelFor) {
     for (int i = 0; i < kWorkItems; i++) {
       work[i] = false;
     }
-    pool.ParallelFor(kWorkItems, kHugeCost, [&work](int64 begin, int64 end) {
-      for (int64 i = begin; i < end; ++i) {
-        ASSERT_FALSE(work[i]);
-        work[i] = true;
-      }
-    });
+    pool.ParallelFor(kWorkItems, kHugeCost,
+                     [&outer_context, &work](int64 begin, int64 end) {
+                       Context inner_context(ContextKind::kThread);
+                       ASSERT_EQ(outer_context, inner_context);
+                       for (int64 i = begin; i < end; ++i) {
+                         ASSERT_FALSE(work[i]);
+                         work[i] = true;
+                       }
+                     });
     for (int i = 0; i < kWorkItems; i++) {
       ASSERT_TRUE(work[i]);
     }
@@ -167,5 +175,40 @@ static void BM_Parallel(int iters) {
 }
 BENCHMARK(BM_Parallel);
 
+static void BM_ParallelFor(int iters, int total, int cost_per_unit) {
+  ThreadPool pool(Env::Default(), "test", kNumThreads);
+  // Decrement count concurrently until 0.
+  std::atomic_int_fast32_t count(iters);
+  mutex done_lock;
+  condition_variable done;
+  bool done_flag = false;
+  for (int i = 0; i < iters; ++i) {
+    pool.ParallelFor(
+        total, cost_per_unit,
+        [&count, &done_lock, &done, &done_flag](int64 begin, int64 end) {
+          for (int64 i = begin; i < end; ++i) {
+            if (count.fetch_sub(1) == 1) {
+              mutex_lock l(done_lock);
+              done_flag = true;
+              done.notify_all();
+            }
+          }
+        });
+  }
+  mutex_lock l(done_lock);
+  if (!done_flag) {
+    done.wait(l);
+  }
+}
+BENCHMARK(BM_ParallelFor)
+    ->ArgPair(1 << 10, 1)
+    ->ArgPair(1 << 20, 1)
+    ->ArgPair(1 << 10, 1 << 10)
+    ->ArgPair(1 << 20, 1 << 10)
+    ->ArgPair(1 << 10, 1 << 20)
+    ->ArgPair(1 << 20, 1 << 20)
+    ->ArgPair(1 << 10, 1 << 30)
+    ->ArgPair(1 << 20, 1 << 30);
+
 }  // namespace thread
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/db/BUILD b/tensorflow/core/lib/db/BUILD
index 9ff87e8d66d2575966c703a896ac9ff0bc51661a..ce09c2009ac81b5cd2736800852a148bfefff6a9 100644
--- a/tensorflow/core/lib/db/BUILD
+++ b/tensorflow/core/lib/db/BUILD
@@ -42,9 +42,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc
index bb65e5357a845ebc132a8518fc28fec94b669bde..0901eba9265a48351d108a73a620dd753f4ec92f 100644
--- a/tensorflow/core/lib/gtl/flatmap_test.cc
+++ b/tensorflow/core/lib/gtl/flatmap_test.cc
@@ -321,7 +321,7 @@ TEST(FlatMap, Copy) {
     NumMap copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc
index 09fbbb1fb6c6670d24345c0043c56df0ed2c7bb0..010b4bb5df3337ad814caa3a8767796074be1d18 100644
--- a/tensorflow/core/lib/gtl/flatset_test.cc
+++ b/tensorflow/core/lib/gtl/flatset_test.cc
@@ -252,7 +252,7 @@ TEST(FlatSet, Copy) {
     NumSet copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index 6e3cb2206d9658a3b0bc24b506049f503ae304ed..2011f7d4a1192cbd845f1ea74f8ef52856320b43 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/gtl/manual_constructor.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h
index 0a76e0962e6b0939763dfddaeeb95e399670571a..0176cdc94d8b543d33ac27a651ef2ac9da241719 100644
--- a/tensorflow/core/lib/gtl/manual_constructor.h
+++ b/tensorflow/core/lib/gtl/manual_constructor.h
@@ -53,7 +53,7 @@ template <int size>
 struct AlignType<0, size> {
   typedef char result[size];
 };
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 #define TF_LIB_GTL_ALIGN_ATTRIBUTE(X) __declspec(align(X))
 #define TF_LIB_GTL_ALIGN_OF(T) __alignof(T)
 #elif defined(COMPILER_GCC3) || __GNUC__ >= 3 || defined(__APPLE__) || \
diff --git a/tensorflow/core/lib/hash/hash.cc b/tensorflow/core/lib/hash/hash.cc
index ed9b4df37a043ec0c1241be088c93e031ed1f42e..dc9d300d00e596478553f0f296b0c4fcf8c85068 100644
--- a/tensorflow/core/lib/hash/hash.cc
+++ b/tensorflow/core/lib/hash/hash.cc
@@ -126,15 +126,4 @@ uint64 Hash64(const char* data, size_t n, uint64 seed) {
   return h;
 }
 
-bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
-                                    string* result) {
-  const size_t size = msg.ByteSizeLong();
-  *result = string(size, '\0');
-  protobuf::io::ArrayOutputStream array_stream(&(*result)[0], size);
-  protobuf::io::CodedOutputStream output_stream(&array_stream);
-  output_stream.SetSerializationDeterministic(true);
-  msg.SerializeWithCachedSizes(&output_stream);
-  return !output_stream.HadError() && size == output_stream.ByteCount();
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h
index 4d312ab7e830963671a8be9d4622a5b83488d295..ca05e6346e2045da52ac048f80716cf046b2fd26 100644
--- a/tensorflow/core/lib/hash/hash.h
+++ b/tensorflow/core/lib/hash/hash.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -64,13 +63,6 @@ struct hash<T*> {
   }
 };
 
-template <>
-struct hash<bfloat16> {
-  size_t operator()(const bfloat16& t) const {
-    return std::hash<float>()(static_cast<float>(t));
-  }
-};
-
 template <>
 struct hash<string> {
   size_t operator()(const string& s) const {
@@ -84,6 +76,7 @@ struct hash<StringPiece> {
     return static_cast<size_t>(Hash64(sp.data(), sp.size()));
   }
 };
+using StringPieceHasher = ::tensorflow::hash<StringPiece>;
 
 template <typename T, typename U>
 struct hash<std::pair<T, U>> {
@@ -92,15 +85,6 @@ struct hash<std::pair<T, U>> {
   }
 };
 
-// Wrapper around protocol buffer serialization that requests deterministic
-// serialization, in particular for Map fields, which serialize in a random
-// order by default. Returns true on success.
-// Serialization is guaranteed to be deterministic for a given binary only.
-// See the following for more details:
-// https://github.com/google/protobuf/blob/a1bb147e96b6f74db6cdf3c3fcb00492472dbbfa/src/google/protobuf/io/coded_stream.h#L834
-bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
-                                    string* result);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_LIB_HASH_HASH_H_
diff --git a/tensorflow/core/lib/hash/hash_test.cc b/tensorflow/core/lib/hash/hash_test.cc
index 0e5f6c6803389d78d648142992e9ad5b0d487d26..7d583131322dec509f06d2f4569c2cbc41e19e9b 100644
--- a/tensorflow/core/lib/hash/hash_test.cc
+++ b/tensorflow/core/lib/hash/hash_test.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <map>
+#include <unordered_map>
 #include <vector>
 
 #include "tensorflow/core/lib/hash/hash.h"
@@ -81,4 +83,75 @@ static void BM_Hash32(int iters, int len) {
 }
 BENCHMARK(BM_Hash32)->Range(1, 1024);
 
+TEST(StringPieceHasher, Equality) {
+  StringPieceHasher hasher;
+
+  StringPiece s1("foo");
+  StringPiece s2("bar");
+  StringPiece s3("baz");
+  StringPiece s4("zot");
+
+  EXPECT_TRUE(hasher(s1) != hasher(s2));
+  EXPECT_TRUE(hasher(s1) != hasher(s3));
+  EXPECT_TRUE(hasher(s1) != hasher(s4));
+  EXPECT_TRUE(hasher(s2) != hasher(s3));
+  EXPECT_TRUE(hasher(s2) != hasher(s4));
+  EXPECT_TRUE(hasher(s3) != hasher(s4));
+
+  EXPECT_TRUE(hasher(s1) == hasher(s1));
+  EXPECT_TRUE(hasher(s2) == hasher(s2));
+  EXPECT_TRUE(hasher(s3) == hasher(s3));
+  EXPECT_TRUE(hasher(s4) == hasher(s4));
+}
+
+TEST(StringPieceHasher, HashMap) {
+  string s1("foo");
+  string s2("bar");
+  string s3("baz");
+
+  StringPiece p1(s1);
+  StringPiece p2(s2);
+  StringPiece p3(s3);
+
+  std::unordered_map<StringPiece, int, StringPieceHasher> map;
+
+  map.insert(std::make_pair(p1, 0));
+  map.insert(std::make_pair(p2, 1));
+  map.insert(std::make_pair(p3, 2));
+  EXPECT_EQ(map.size(), 3);
+
+  bool found[3] = {false, false, false};
+  for (auto const& val : map) {
+    int x = val.second;
+    EXPECT_TRUE(x >= 0 && x < 3);
+    EXPECT_TRUE(!found[x]);
+    found[x] = true;
+  }
+  EXPECT_EQ(found[0], true);
+  EXPECT_EQ(found[1], true);
+  EXPECT_EQ(found[2], true);
+
+  auto new_iter = map.find("zot");
+  EXPECT_TRUE(new_iter == map.end());
+
+  new_iter = map.find("bar");
+  EXPECT_TRUE(new_iter != map.end());
+
+  map.erase(new_iter);
+  EXPECT_EQ(map.size(), 2);
+
+  found[0] = false;
+  found[1] = false;
+  found[2] = false;
+  for (const auto& iter : map) {
+    int x = iter.second;
+    EXPECT_TRUE(x >= 0 && x < 3);
+    EXPECT_TRUE(!found[x]);
+    found[x] = true;
+  }
+  EXPECT_EQ(found[0], true);
+  EXPECT_EQ(found[1], false);
+  EXPECT_EQ(found[2], true);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/format.cc b/tensorflow/core/lib/io/format.cc
index 64852943ad560edeede640fbd882e2984a4afee5..0c24c660a246eacde9fe0a0368a66eb511b1786d 100644
--- a/tensorflow/core/lib/io/format.cc
+++ b/tensorflow/core/lib/io/format.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <limits>
+
 #include "tensorflow/core/lib/io/format.h"
 
 #include "tensorflow/core/lib/core/coding.h"
@@ -84,6 +86,11 @@ Status ReadBlock(RandomAccessFile* file, const BlockHandle& handle,
   // Read the block contents as well as the type/crc footer.
   // See table_builder.cc for the code that built this structure.
   size_t n = static_cast<size_t>(handle.size());
+
+  if (kBlockTrailerSize > std::numeric_limits<size_t>::max() - n) {
+    return errors::DataLoss("handle.size() too big");
+  }
+
   char* buf = new char[n + kBlockTrailerSize];
   StringPiece contents;
   Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc
index 6be1f819c2081dd4cc73853276d1cd94399614ff..3608008b30181ca5025644437740f1cd0fe1a156 100644
--- a/tensorflow/core/lib/io/inputbuffer_test.cc
+++ b/tensorflow/core/lib/io/inputbuffer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -287,7 +288,7 @@ TEST(InputBuffer, Seek) {
     EXPECT_TRUE(errors::IsOutOfRange(in.ReadNBytes(1, &read)));
 
     EXPECT_TRUE(
-        StringPiece(in.Seek(-1).ToString()).contains("negative position"));
+        str_util::StrContains(in.Seek(-1).ToString(), "negative position"));
   }
 }
 
diff --git a/tensorflow/core/lib/io/path.cc b/tensorflow/core/lib/io/path.cc
index 83f15e134d6f60c65a7523458353ffd62345b7cc..996fbf62e5c1736f9922fcb652f65259e985a7f1 100644
--- a/tensorflow/core/lib/io/path.cc
+++ b/tensorflow/core/lib/io/path.cc
@@ -27,9 +27,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/strings/scanner.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 namespace io {
diff --git a/tensorflow/core/lib/io/path.h b/tensorflow/core/lib/io/path.h
index 47bb2b998d637099b3ab788f7ce274f83e4fc646..818ba99888d041f016210292a7c0cf18ef7d0e41 100644
--- a/tensorflow/core/lib/io/path.h
+++ b/tensorflow/core/lib/io/path.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_IO_PATH_H_
 #define TENSORFLOW_LIB_IO_PATH_H_
 
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 254fdf115da132343b8e6f176e67672a11281cd0..c24628be570a6138d745c2acbde4e376521c9860 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : src_(file), options_(options) {
+    : options_(options),
+      input_stream_(new RandomAccessInputStream(file)),
+      last_read_failed_(false) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
-  } else {
-    input_stream_.reset(new RandomAccessInputStream(file));
+    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
+                                                options.buffer_size, true));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    zlib_input_stream_.reset(new ZlibInputStream(
-        input_stream_.get(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options));
+    input_stream_.reset(new ZlibInputStream(
+        input_stream_.release(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options, true));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
+    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-// May use *storage as backing store.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
-                                     StringPiece* result, string* storage) {
+//
+// offset corresponds to the user-provided value to ReadRecord()
+// and is used only in error messages.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  storage->resize(expected);
-
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    // If we have a zlib compressed buffer, we assume that the
-    // file is being read sequentially, and we use the underlying
-    // implementation to read the data.
-    //
-    // No checks are done to validate that the file is being read
-    // sequentially.  At some point the zlib input buffer may support
-    // seeking, possibly inefficiently.
-    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
-
-    if (storage->size() != expected) {
-      if (storage->empty()) {
-        return errors::OutOfRange("eof");
-      } else {
-        return errors::DataLoss("truncated record at ", offset);
-      }
-    }
+  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
 
-    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-      return errors::DataLoss("corrupted record at ", offset);
-    }
-    *result = StringPiece(storage->data(), n);
-  } else {
-#endif  // IS_SLIM_BUILD
-    if (options_.buffer_size > 0) {
-      // If we have a buffer, we assume that the file is being read
-      // sequentially, and we use the underlying implementation to read the
-      // data.
-      //
-      // No checks are done to validate that the file is being read
-      // sequentially.
-      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
-
-      if (storage->size() != expected) {
-        if (storage->empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-
-      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(storage->data(), n);
+  if (result->size() != expected) {
+    if (result->empty()) {
+      return errors::OutOfRange("eof");
     } else {
-      // This version supports reading from arbitrary offsets
-      // since we are accessing the random access file directly.
-      StringPiece data;
-      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
-      if (data.size() != expected) {
-        if (data.empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(data.data(), n);
+      return errors::DataLoss("truncated record at ", offset);
     }
-#if !defined(IS_SLIM_BUILD)
   }
-#endif  // IS_SLIM_BUILD
 
+  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
+  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
+    return errors::DataLoss("corrupted record at ", offset);
+  }
+  result->resize(n);
   return Status::OK();
 }
 
@@ -167,48 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
+  // Position the input stream.
+  int64 curr_pos = input_stream_->Tell();
+  int64 desired_pos = static_cast<int64>(*offset);
+  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
+      (curr_pos == desired_pos && last_read_failed_)) {
+    last_read_failed_ = false;
+    TF_RETURN_IF_ERROR(input_stream_->Reset());
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
+  } else if (curr_pos < desired_pos) {
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
+  }
+  DCHECK_EQ(desired_pos, input_stream_->Tell());
+
   // Read header data.
-  StringPiece lbuf;
-  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
+  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(lbuf.data());
+  const uint64 length = core::DecodeFixed64(record->data());
 
   // Read data
-  StringPiece data;
-  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
+  s = ReadChecksummed(*offset + kHeaderSize, length, record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
-  if (record->data() != data.data()) {
-    // RandomAccessFile placed the data in some other location.
-    memmove(&(*record)[0], data.data(), data.size());
-  }
-
-  record->resize(data.size());
-
   *offset += kHeaderSize + length + kFooterSize;
+  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
-Status RecordReader::SkipNBytes(uint64 offset) {
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
-  } else {
-#endif
-    if (options_.buffer_size > 0) {
-      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
-    }
-  }
-  return Status::OK();
-}  // namespace io
-
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index 62dd2efb792988c4197cf7172b25ac34cdd77ed9..f6d587dfa0e9596b9d46a28a903255e81f070145 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_IO_RECORD_READER_H_
 #define TENSORFLOW_LIB_IO_RECORD_READER_H_
 
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
-#if !defined(IS_SLIM_BUILD)
 #include "tensorflow/core/lib/io/inputstream_interface.h"
+#if !defined(IS_SLIM_BUILD)
 #include "tensorflow/core/lib/io/zlib_compression_options.h"
 #include "tensorflow/core/lib/io/zlib_inputstream.h"
 #endif  // IS_SLIM_BUILD
@@ -69,25 +69,14 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
-  //
-  // Note: if buffering is used (with or without compression), access must be
-  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
-  // Skip the records till "offset". Returns OK on success,
-  // OUT_OF_RANGE for end of file, or something else for an error.
-  Status SkipNBytes(uint64 offset);
-
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
-                         string* storage);
+  Status ReadChecksummed(uint64 offset, size_t n, string* result);
 
-  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-#if !defined(IS_SLIM_BUILD)
-  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
-#endif  // IS_SLIM_BUILD
+  bool last_read_failed_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -121,7 +110,6 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
-    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index b7e51256a22b0d84e734e2a036a184b3adc3e547..da514bd21c7dbfc6b2ac3770af5a7cdb95d5149d 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -20,15 +20,17 @@ limitations under the License.
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/io/record_writer.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace io {
+namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-static string BigString(const string& partial_string, size_t n) {
+string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -38,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-static string NumberString(int n) {
+string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class RecordioTest : public ::testing::Test {
+class StringDest : public WritableFile {
+ public:
+  explicit StringDest(string* contents) : contents_(contents) {}
+
+  Status Close() override { return Status::OK(); }
+  Status Flush() override { return Status::OK(); }
+  Status Sync() override { return Status::OK(); }
+  Status Append(const StringPiece& slice) override {
+    contents_->append(slice.data(), slice.size());
+    return Status::OK();
+  }
+
  private:
-  class StringDest : public WritableFile {
-   public:
-    string contents_;
-
-    Status Close() override { return Status::OK(); }
-    Status Flush() override { return Status::OK(); }
-    Status Sync() override { return Status::OK(); }
-    Status Append(const StringPiece& slice) override {
-      contents_.append(slice.data(), slice.size());
-      return Status::OK();
+  string* contents_;
+};
+
+class StringSource : public RandomAccessFile {
+ public:
+  explicit StringSource(string* contents)
+      : contents_(contents), force_error_(false) {}
+
+  Status Read(uint64 offset, size_t n, StringPiece* result,
+              char* scratch) const override {
+    if (force_error_) {
+      force_error_ = false;
+      return errors::DataLoss("read error");
     }
-  };
-
-  class StringSource : public RandomAccessFile {
-   public:
-    StringPiece contents_;
-    mutable bool force_error_;
-    mutable bool returned_partial_;
-    StringSource() : force_error_(false), returned_partial_(false) {}
-
-    Status Read(uint64 offset, size_t n, StringPiece* result,
-                char* scratch) const override {
-      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
-
-      if (force_error_) {
-        force_error_ = false;
-        returned_partial_ = true;
-        return errors::DataLoss("read error");
-      }
-
-      if (offset >= contents_.size()) {
-        return errors::OutOfRange("end of file");
-      }
-
-      if (contents_.size() < offset + n) {
-        n = contents_.size() - offset;
-        returned_partial_ = true;
-      }
-      *result = StringPiece(contents_.data() + offset, n);
-      return Status::OK();
+
+    if (offset >= contents_->size()) {
+      return errors::OutOfRange("end of file");
+    }
+
+    if (contents_->size() < offset + n) {
+      n = contents_->size() - offset;
     }
-  };
+    *result = StringPiece(contents_->data() + offset, n);
+    return Status::OK();
+  }
+
+  void force_error() { force_error_ = true; }
+
+ private:
+  string* contents_;
+  mutable bool force_error_;
+};
 
+class RecordioTest : public ::testing::Test {
+ private:
+  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -103,7 +109,9 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : reading_(false),
+      : dest_(&contents_),
+        source_(&contents_),
+        reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -118,12 +126,11 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return dest_.contents_.size(); }
+  size_t WrittenBytes() const { return contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
-      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -136,26 +143,20 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) {
-    dest_.contents_[offset] += delta;
-  }
+  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
 
-  void SetByte(int offset, char new_byte) {
-    dest_.contents_[offset] = new_byte;
-  }
+  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
 
-  void ShrinkSize(int bytes) {
-    dest_.contents_.resize(dest_.contents_.size() - bytes);
-  }
+  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
+    core::EncodeFixed32(&contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error_ = true; }
+  void ForceError() { source_.force_error(); }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -164,7 +165,6 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
-    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -216,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
+void TestNonSequentialReads(const RecordWriterOptions& writer_options,
+                            const RecordReaderOptions& reader_options) {
+  string contents;
+  StringDest dst(&contents);
+  RecordWriter writer(&dst, writer_options);
+  for (int i = 0; i < 10; ++i) {
+    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
+  }
+  TF_ASSERT_OK(writer.Close());
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  string record;
+  // First read sequentially to fill in the offsets table.
+  uint64 offsets[10] = {0};
+  uint64 offset = 0;
+  for (int i = 0; i < 10; ++i) {
+    offsets[i] = offset;
+    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
+  }
+
+  // Read randomly: First go back to record #3 then forward to #8.
+  offset = offsets[3];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("3.", record);
+  EXPECT_EQ(offsets[4], offset);
+
+  offset = offsets[8];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("8.", record);
+  EXPECT_EQ(offsets[9], offset);
+}
+
+TEST_F(RecordioTest, NonSequentialReads) {
+  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 10;
+  TestNonSequentialReads(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
+  TestNonSequentialReads(
+      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+}
+
 // Tests of all the error paths in log_reader.cc follow:
-static void AssertHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+void AssertHasSubstr(StringPiece s, StringPiece expected) {
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
+void TestReadError(const RecordWriterOptions& writer_options,
+                   const RecordReaderOptions& reader_options) {
+  const string wrote = BigString("well hello there!", 100);
+  string contents;
+  StringDest dst(&contents);
+  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  uint64 offset = 0;
+  string read;
+  file.force_error();
+  Status status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(errors::IsDataLoss(status));
+  ASSERT_EQ(0, offset);
+
+  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
+  // lose the record.
+  status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(status.ok()) << status;
+  EXPECT_GT(offset, 0);
+  EXPECT_EQ(wrote, read);
+}
+
 TEST_F(RecordioTest, ReadError) {
-  Write("foo");
-  ForceError();
-  AssertHasSubstr(Read(), "Data loss");
+  TestReadError(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, ReadErrorWithBuffering) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 20;
+  TestReadError(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, ReadErrorWithCompression) {
+  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -256,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
+}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 984fbc2810c28ac818ab8f5a86451bed4e101605..47de36bf6c677b38d693fa166e3a696ec1b5ff33 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options)
-    : input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
+    : owns_input_stream_(owns_input_stream),
+      input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -37,14 +38,25 @@ ZlibInputStream::ZlibInputStream(
   InitZlibBuffer();
 }
 
+ZlibInputStream::ZlibInputStream(InputStreamInterface* input_stream,
+                                 size_t input_buffer_bytes,
+                                 size_t output_buffer_bytes,
+                                 const ZlibCompressionOptions& zlib_options)
+    : ZlibInputStream(input_stream, input_buffer_bytes, output_buffer_bytes,
+                      zlib_options, false) {}
+
 ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
+  if (owns_input_stream_) {
+    delete input_stream_;
+  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
+  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 9c7e14441ce92756f2bc0716210dc41652c9e105..37339163ee0fafa92b2ad3bc1e16fa04c3c6fae5 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,7 +40,15 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents. Does *not* take ownership of "input_stream".
+  // contents.
+  //
+  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
+  ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
+                  size_t output_buffer_bytes,
+                  const ZlibCompressionOptions& zlib_options,
+                  bool owns_input_stream);
+
+  // Equivalent to the previous constructor with owns_input_stream=false.
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
                   const ZlibCompressionOptions& zlib_options);
@@ -65,10 +73,11 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  InputStreamInterface* input_stream_;  // Not owned
-  size_t input_buffer_capacity_;        // Size of z_stream_input_
-  size_t output_buffer_capacity_;       // Size of z_stream_output_
-  char* next_unread_byte_;              // Next unread byte in z_stream_output_
+  const bool owns_input_stream_;
+  InputStreamInterface* input_stream_;
+  size_t input_buffer_capacity_;   // Size of z_stream_input_
+  size_t output_buffer_capacity_;  // Size of z_stream_output_
+  char* next_unread_byte_;         // Next unread byte in z_stream_output_
 
   // Buffer for storing contents read from compressed stream.
   // TODO(srbs): Consider using circular buffers. That would greatly simplify
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index cba473927dd1fce30bbe690b4bfda1e382ca12c0..62c803afb24fe09293f1b5b8b5fcaa88ddcf5ead 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/platform/cpu_info.h"  // endian
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/png.h"
 
diff --git a/tensorflow/core/lib/psnr/testdata/cat_q20.jpg b/tensorflow/core/lib/psnr/testdata/cat_q20.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7b882a7a7b17ca6f77876d6f534c41c3c62a11a
Binary files /dev/null and b/tensorflow/core/lib/psnr/testdata/cat_q20.jpg differ
diff --git a/tensorflow/core/lib/psnr/testdata/cat_q72.jpg b/tensorflow/core/lib/psnr/testdata/cat_q72.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2b5dd75ac9e391a92f29aebfcad0fd2079bc6029
Binary files /dev/null and b/tensorflow/core/lib/psnr/testdata/cat_q72.jpg differ
diff --git a/tensorflow/core/lib/psnr/testdata/cat_q95.jpg b/tensorflow/core/lib/psnr/testdata/cat_q95.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7fa3c3157fbfa4f02bc5feb726e46b9a33cc2f2f
Binary files /dev/null and b/tensorflow/core/lib/psnr/testdata/cat_q95.jpg differ
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index 3fe1f9bc6cf06158df4811eaa177988b60890006..e963511f5cfe64fb74101cfdd3724843453b0959 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -23,8 +23,10 @@ limitations under the License.
 
 #include <string.h>
 #include <algorithm>
+#include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 
 namespace tensorflow {
@@ -32,11 +34,27 @@ namespace random {
 
 // Helper function to convert a 16-bit integer to a half between [0..1).
 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
+// Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
+PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
 // Helper function to convert a 32-bit integer to a float between [0..1).
 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
 // Helper function to convert two 32-bit integers to a double between [0..1).
 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
 
+// Computes a + b. Requires that the result is representable in the destination
+// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
+// need *not* be representable in that type. (The condition on b excludes the
+// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
+// compute.)
+template <typename Int>
+PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
+                                   typename std::make_unsigned<Int>::type b) {
+  // Implementation note: both b_div_2 and b - b_div_2 are positive and
+  // representatble as Int.
+  auto b_div_2 = b >> 1;
+  return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
+}
+
 // A class that generates uniform distribution random numbers from the
 // underlying random integer generator.
 // Arguments:
@@ -75,6 +93,30 @@ class UniformDistribution<Generator, Eigen::half> {
   }
 };
 
+template <class Generator>
+class UniformDistribution<Generator, bfloat16> {
+ public:
+  // The number of elements that will be returned.
+  static const int kResultElementCount = Generator::kResultElementCount;
+  // Cost of generation of a single element (in cycles).
+  static const int kElementCost = 3;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static const bool kVariableSamplesPerOutput = false;
+  typedef Array<bfloat16, kResultElementCount> ResultType;
+  typedef bfloat16 ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator* gen) {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; ++i) {
+      result[i] = Uint16ToGfloat16(sample[i]);
+    }
+    return result;
+  }
+};
+
 template <class Generator>
 class UniformDistribution<Generator, float> {
  public:
@@ -137,14 +179,15 @@ class UniformDistribution<Generator, int32> {
   typedef int32 ResultElementType;
 
   // Must have lo < hi
-  UniformDistribution(int32 lo, int32 hi) : lo_(lo), range_(hi - lo) {}
+  UniformDistribution(int32 lo, int32 hi)
+      : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
     typename Generator::ResultType sample = (*gen)();
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
-      result[i] = lo_ + static_cast<int32>(sample[i] % range_);
+      result[i] = SignedAdd(lo_, sample[i] % range_);
     }
     return result;
   }
@@ -171,7 +214,8 @@ class UniformDistribution<Generator, int64> {
   typedef int64 ResultElementType;
 
   // Must have lo < hi
-  UniformDistribution(int64 lo, int64 hi) : lo_(lo), range_(hi - lo) {}
+  UniformDistribution(int64 lo, int64 hi)
+      : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
@@ -179,7 +223,7 @@ class UniformDistribution<Generator, int64> {
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
-      result[i] = lo_ + static_cast<int64>(bits % range_);
+      result[i] = SignedAdd(lo_, bits % range_);
     }
     return result;
   }
@@ -305,6 +349,36 @@ class NormalDistribution<Generator, Eigen::half> {
   }
 };
 
+template <class Generator>
+class NormalDistribution<Generator, bfloat16> {
+ public:
+  // The number of elements that will be returned.
+  static const int kResultElementCount = Generator::kResultElementCount;
+  // Cost of generation of a single element (in cycles).
+  static const int kElementCost = 70;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static const bool kVariableSamplesPerOutput = false;
+  typedef Array<bfloat16, kResultElementCount> ResultType;
+  typedef bfloat16 ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator* gen) {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    static_assert(kResultElementCount % 2 == 0,
+                  "kResultElementCount should be an even number");
+    for (int i = 0; i < kResultElementCount; i += 2) {
+      float f[2];
+      // Box-Muller transform requires processing 2 elements at a time.
+      BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
+      result[i] = bfloat16(f[0]);
+      result[i + 1] = bfloat16(f[1]);
+    }
+    return result;
+  }
+};
+
 template <class Generator>
 class NormalDistribution<Generator, float> {
  public:
@@ -414,6 +488,48 @@ class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
   }
 };
 
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
+ public:
+  // The number of elements that will be returned.
+  static const int kResultElementCount =
+      SingleSampleGenerator::kNativeElementCount;
+  // Cost of generation of a single element (in cycles).
+  static const int kElementCost = 90;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static const bool kVariableSamplesPerOutput = true;
+  // The threshold where the normal distribution is truncated.
+  const float kTruncateValue = 2.0f;
+
+  typedef Array<bfloat16, kResultElementCount> ResultType;
+  typedef bfloat16 ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(SingleSampleGenerator* gen) {
+    ResultType results;
+    int index = 0;
+    while (true) {
+      // Repeatedly take samples from the normal distribution, until we have
+      // the desired number of elements that fall within the pre-defined cutoff
+      // threshold.
+      const uint32 x0 = (*gen)();
+      const uint32 x1 = (*gen)();
+      float f[2];
+      BoxMullerFloat(x0, x1, &f[0], &f[1]);
+
+      for (int i = 0; i < 2; ++i) {
+        if (Eigen::numext::abs(f[i]) < kTruncateValue) {
+          results[index++] = bfloat16(f[i]);
+          if (index >= kResultElementCount) {
+            return results;
+          }
+        }
+      }
+    }
+  }
+};
+
 // Partial specialization for float.
 template <class SingleSampleGenerator>
 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
@@ -567,6 +683,27 @@ PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
   return result - Eigen::half(1.0);
 }
 
+// Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
+// This can create a uniform distribution of values between [0..1).
+PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
+  // bfloat are formatted as follows (MSB first):
+  //    sign(1) exponent(8) mantissa(7)
+  // Conceptually construct the following:
+  //    sign == 0
+  //    exponent == 127  -- an excess 127 representation of a zero exponent
+  //    mantissa == 7 random bits
+  const uint16 man = x & 0x7fu;  // 7 bit mantissa
+  const uint16 exp = static_cast<uint16>(127);
+  const uint16 val = (exp << 7) | man;
+
+  bfloat16 result;
+  memcpy(&result, &val, sizeof(val));
+  // The mantissa has an implicit leading 1, so the above code creates a value
+  // in [1, 2). The minus will not cause a rounding that makes the result 1.
+  // Instead it will just be close to 1.
+  return result - bfloat16(1.0);
+}
+
 // Helper function to convert an 32-bit integer to a float between [0..1).
 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
   // IEEE754 floats are formatted as follows (MSB first):
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index 85d68f456e1e27b7a62315f2b0a962843da87d52..8868672a10ae027415d81f76ef146d1a5f28bddd 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -37,6 +37,10 @@ namespace {
 // unit normal distribution, it should almost definitely never exceed 6.
 static constexpr float kZLimit = 6.0;
 
+// As bfloat16 has much less precision, the largest z-value will should be
+// larger than float32.
+static constexpr float kZLimitBfloat16 = 20.0;
+
 // A utility function to fill the given array with samples from the given
 // distribution, using the single adapter of the underlying generator
 template <class Distribution>
@@ -93,7 +97,7 @@ bool CheckSamplesMoments(const std::vector<T>& samples,
       // mode, given the large number of samples.
       moments_data[i] += moment;
       ++moments_sample_count_data[i];
-      moment *= samples_data[index];
+      moment *= static_cast<double>(samples_data[index]);
     }
   }
 
@@ -125,7 +129,7 @@ bool CheckSamplesMoments(const std::vector<T>& samples,
     const double z_test =
         fabs((moments[i] - moments_i_mean) / sqrt(total_variance));
 
-    if (z_test > z_limit) {
+    if (z_test > static_cast<double>(z_limit)) {
       LOG(ERROR) << "failing z_test:"
                  << " moment: " << i << " stride: " << stride
                  << " z_test: " << z_test << " z_limit: " << z_limit
@@ -252,6 +256,22 @@ void RandomParametersMomentsTest(int count, int max_moments,
   }
 }
 
+TEST(PhiloxRandomTest, UniformBfloat16MomentsTest) {
+  const std::vector<int> strides = {0, 1, 4, 17};
+  UniformMomentsTest<bfloat16>(1 << 20, 40, strides, bfloat16(kZLimitBfloat16));
+}
+
+TEST(PhiloxRandomTest, NormalBfloat16MomentsTest) {
+  const std::vector<int> strides = {0, 1, 4, 17};
+  NormalMomentsTest<bfloat16>(8 << 20, 25, strides, bfloat16(kZLimitBfloat16));
+}
+
+TEST(PhiloxRandomTest, RandomParametersBfloat16MomentsTest) {
+  const std::vector<int> strides = {0, 1, 4, 17};
+  RandomParametersMomentsTest<bfloat16>(1 << 20, 40, strides,
+                                        bfloat16(kZLimitBfloat16));
+}
+
 TEST(PhiloxRandomTest, UniformFloatMomentsTest) {
   const std::vector<int> strides = {0, 1, 4, 17};
   UniformMomentsTest<float>(1 << 20, 40, strides, kZLimit);
diff --git a/tensorflow/core/lib/ssim/testdata/checkerboard1.png b/tensorflow/core/lib/ssim/testdata/checkerboard1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ede4395e2c8d3eb530c074ff3ac1c1974a441ef
Binary files /dev/null and b/tensorflow/core/lib/ssim/testdata/checkerboard1.png differ
diff --git a/tensorflow/core/lib/ssim/testdata/checkerboard2.png b/tensorflow/core/lib/ssim/testdata/checkerboard2.png
new file mode 100644
index 0000000000000000000000000000000000000000..baaa43229074f737dcf5040d209e9fb9ecad989c
Binary files /dev/null and b/tensorflow/core/lib/ssim/testdata/checkerboard2.png differ
diff --git a/tensorflow/core/lib/ssim/testdata/checkerboard3.png b/tensorflow/core/lib/ssim/testdata/checkerboard3.png
new file mode 100644
index 0000000000000000000000000000000000000000..95fa3bbb3ee42673b2b7e52cf84e704f249fa26b
Binary files /dev/null and b/tensorflow/core/lib/ssim/testdata/checkerboard3.png differ
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index f5822fad8e3d3b8559d19c79ee2885e580ea3e11..e4b909296e86089c0a78707eda2ce36f28f9b1cf 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,6 +23,9 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
+#include "double-conversion/double-conversion.h"
+
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -32,19 +35,26 @@ namespace tensorflow {
 
 namespace {
 
+template <typename T>
+const std::unordered_map<string, T>* GetSpecialNumsSingleton() {
+  static const std::unordered_map<string, T>* special_nums =
+      CHECK_NOTNULL((new const std::unordered_map<string, T>{
+          {"inf", std::numeric_limits<T>::infinity()},
+          {"+inf", std::numeric_limits<T>::infinity()},
+          {"-inf", -std::numeric_limits<T>::infinity()},
+          {"infinity", std::numeric_limits<T>::infinity()},
+          {"+infinity", std::numeric_limits<T>::infinity()},
+          {"-infinity", -std::numeric_limits<T>::infinity()},
+          {"nan", std::numeric_limits<T>::quiet_NaN()},
+          {"+nan", std::numeric_limits<T>::quiet_NaN()},
+          {"-nan", -std::numeric_limits<T>::quiet_NaN()},
+      }));
+  return special_nums;
+}
+
 template <typename T>
 T locale_independent_strtonum(const char* str, const char** endptr) {
-  static const std::unordered_map<string, T> special_nums = {
-      {"inf", std::numeric_limits<T>::infinity()},
-      {"+inf", std::numeric_limits<T>::infinity()},
-      {"-inf", -std::numeric_limits<T>::infinity()},
-      {"infinity", std::numeric_limits<T>::infinity()},
-      {"+infinity", std::numeric_limits<T>::infinity()},
-      {"-infinity", -std::numeric_limits<T>::infinity()},
-      {"nan", std::numeric_limits<T>::quiet_NaN()},
-      {"+nan", std::numeric_limits<T>::quiet_NaN()},
-      {"-nan", -std::numeric_limits<T>::quiet_NaN()},
-  };
+  auto special_nums = GetSpecialNumsSingleton<T>();
   std::stringstream s(str);
 
   // Check if str is one of the special numbers.
@@ -56,8 +66,8 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
         std::tolower(special_num_str[i], std::locale::classic());
   }
 
-  auto entry = special_nums.find(special_num_str);
-  if (entry != special_nums.end()) {
+  auto entry = special_nums->find(special_num_str);
+  if (entry != special_nums->end()) {
     *endptr = str + (s.eof() ? static_cast<std::iostream::pos_type>(strlen(str))
                              : s.tellg());
     return entry->second;
@@ -102,23 +112,37 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
   return result;
 }
 
+static inline const double_conversion::StringToDoubleConverter&
+StringToFloatConverter() {
+  static const double_conversion::StringToDoubleConverter converter(
+      double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES |
+          double_conversion::StringToDoubleConverter::ALLOW_HEX |
+          double_conversion::StringToDoubleConverter::ALLOW_TRAILING_SPACES |
+          double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY,
+      0., 0., "inf", "nan");
+  return converter;
+}
+
 }  // namespace
 
 namespace strings {
 
-char* FastInt32ToBufferLeft(int32 i, char* buffer) {
+size_t FastInt32ToBufferLeft(int32 i, char* buffer) {
   uint32 u = i;
+  size_t length = 0;
   if (i < 0) {
     *buffer++ = '-';
+    ++length;
     // We need to do the negation in modular (i.e., "unsigned")
     // arithmetic; MSVC++ apparently warns for plain "-u", so
     // we write the equivalent expression "0 - u" instead.
     u = 0 - u;
   }
-  return FastUInt32ToBufferLeft(u, buffer);
+  length += FastUInt32ToBufferLeft(u, buffer);
+  return length;
 }
 
-char* FastUInt32ToBufferLeft(uint32 i, char* buffer) {
+size_t FastUInt32ToBufferLeft(uint32 i, char* buffer) {
   char* start = buffer;
   do {
     *buffer++ = ((i % 10) + '0');
@@ -126,19 +150,22 @@ char* FastUInt32ToBufferLeft(uint32 i, char* buffer) {
   } while (i > 0);
   *buffer = 0;
   std::reverse(start, buffer);
-  return buffer;
+  return buffer - start;
 }
 
-char* FastInt64ToBufferLeft(int64 i, char* buffer) {
+size_t FastInt64ToBufferLeft(int64 i, char* buffer) {
   uint64 u = i;
+  size_t length = 0;
   if (i < 0) {
     *buffer++ = '-';
+    ++length;
     u = 0 - u;
   }
-  return FastUInt64ToBufferLeft(u, buffer);
+  length += FastUInt64ToBufferLeft(u, buffer);
+  return length;
 }
 
-char* FastUInt64ToBufferLeft(uint64 i, char* buffer) {
+size_t FastUInt64ToBufferLeft(uint64 i, char* buffer) {
   char* start = buffer;
   do {
     *buffer++ = ((i % 10) + '0');
@@ -146,19 +173,18 @@ char* FastUInt64ToBufferLeft(uint64 i, char* buffer) {
   } while (i > 0);
   *buffer = 0;
   std::reverse(start, buffer);
-  return buffer;
+  return buffer - start;
 }
 
 static const double kDoublePrecisionCheckMax = DBL_MAX / 1.000000000000001;
 
-char* DoubleToBuffer(double value, char* buffer) {
+size_t DoubleToBuffer(double value, char* buffer) {
   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
   // platforms these days.  Just in case some system exists where DBL_DIG
   // is significantly larger -- and risks overflowing our buffer -- we have
   // this assert.
   static_assert(DBL_DIG < 20, "DBL_DIG is too big");
 
-  bool full_precision_needed = true;
   if (std::abs(value) <= kDoublePrecisionCheckMax) {
     int snprintf_result =
         snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG, value);
@@ -167,18 +193,20 @@ char* DoubleToBuffer(double value, char* buffer) {
     // larger than the precision we asked for.
     DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
 
-    full_precision_needed =
-        locale_independent_strtonum<double>(buffer, nullptr) != value;
+    if (locale_independent_strtonum<double>(buffer, nullptr) == value) {
+      // Round-tripping the string to double works; we're done.
+      return snprintf_result;
+    }
+    // else: full precision formatting needed. Fall through.
   }
 
-  if (full_precision_needed) {
-    int snprintf_result =
-        snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG + 2, value);
+  int snprintf_result =
+      snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG + 2, value);
 
-    // Should never overflow; see above.
-    DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
-  }
-  return buffer;
+  // Should never overflow; see above.
+  DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
+
+  return snprintf_result;
 }
 
 namespace {
@@ -196,7 +224,7 @@ bool safe_strto64(StringPiece str, int64* value) {
 
   int64 vlimit = kint64max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different limit for positive and negative integers.
     vlimit = kint64min;
@@ -258,7 +286,7 @@ bool safe_strto32(StringPiece str, int32* value) {
 
   int64 vmax = kint32max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different max for positive and negative integers.
     ++vmax;
@@ -304,28 +332,34 @@ bool safe_strtou32(StringPiece str, uint32* value) {
 }
 
 bool safe_strtof(const char* str, float* value) {
-  const char* endptr;
-  *value = locale_independent_strtonum<float>(str, &endptr);
-  while (isspace(*endptr)) ++endptr;
-  // Ignore range errors from strtod/strtof.
-  // The values it returns on underflow and
-  // overflow are the right fallback in a
-  // robust setting.
-  return *str != '\0' && *endptr == '\0';
+  int processed_characters_count = -1;
+  auto len = str_util::Strnlen(str, kFastToBufferSize);
+
+  // If there is no zero-termination in str, fail.
+  if (len == kFastToBufferSize) return false;
+  // If string length exceeds int max, fail.
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToFloat(str, static_cast<int>(len),
+                                                  &processed_characters_count);
+  return processed_characters_count > 0;
 }
 
 bool safe_strtod(const char* str, double* value) {
-  const char* endptr;
-  *value = locale_independent_strtonum<double>(str, &endptr);
-  while (isspace(*endptr)) ++endptr;
-  // Ignore range errors from strtod/strtof.
-  // The values it returns on underflow and
-  // overflow are the right fallback in a
-  // robust setting.
-  return *str != '\0' && *endptr == '\0';
+  int processed_characters_count = -1;
+  auto len = str_util::Strnlen(str, kFastToBufferSize);
+
+  // If there is no zero-termination in str, fail.
+  if (len == kFastToBufferSize) return false;
+  // If string length exceeds int max, fail.
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToDouble(str, static_cast<int>(len),
+                                                   &processed_characters_count);
+  return processed_characters_count > 0;
 }
 
-char* FloatToBuffer(float value, char* buffer) {
+size_t FloatToBuffer(float value, char* buffer) {
   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
   // platforms these days.  Just in case some system exists where FLT_DIG
   // is significantly larger -- and risks overflowing our buffer -- we have
@@ -347,7 +381,7 @@ char* FloatToBuffer(float value, char* buffer) {
     // Should never overflow; see above.
     DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
   }
-  return buffer;
+  return snprintf_result;
 }
 
 string FpToString(Fprint fp) {
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 3c45b9027401999ba4e6c32005456312970cccba..e9add428492f3e408764e44988293b783cfb0272 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -60,19 +60,18 @@ static const int kFastToBufferSize = 32;
 // the output.  The buffer should typically be at least kFastToBufferSize
 // bytes.
 //
-// Returns a pointer to the end of the string (i.e. the null character
-// terminating the string).
+// Returns the number of characters written.
 // ----------------------------------------------------------------------
 
-char* FastInt32ToBufferLeft(int32 i, char* buffer);    // at least 12 bytes
-char* FastUInt32ToBufferLeft(uint32 i, char* buffer);  // at least 12 bytes
-char* FastInt64ToBufferLeft(int64 i, char* buffer);    // at least 22 bytes
-char* FastUInt64ToBufferLeft(uint64 i, char* buffer);  // at least 22 bytes
+size_t FastInt32ToBufferLeft(int32 i, char* buffer);    // at least 12 bytes
+size_t FastUInt32ToBufferLeft(uint32 i, char* buffer);  // at least 12 bytes
+size_t FastInt64ToBufferLeft(int64 i, char* buffer);    // at least 22 bytes
+size_t FastUInt64ToBufferLeft(uint64 i, char* buffer);  // at least 22 bytes
 
 // Required buffer size for DoubleToBuffer is kFastToBufferSize.
 // Required buffer size for FloatToBuffer is kFastToBufferSize.
-char* DoubleToBuffer(double i, char* buffer);
-char* FloatToBuffer(float i, char* buffer);
+size_t DoubleToBuffer(double value, char* buffer);
+size_t FloatToBuffer(float value, char* buffer);
 
 // Convert a 64-bit fingerprint value to an ASCII representation.
 string FpToString(Fprint fp);
@@ -115,11 +114,13 @@ bool safe_strtou64(StringPiece str, uint64* value);
 // Convert strings to floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
+// Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtof(const char* str, float* value);
 
 // Convert strings to double precision floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
+// Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtod(const char* str, double* value);
 
 inline bool ProtoParseNumeric(StringPiece s, int32* value) {
diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc
index e15161de66c75ced0c9cbc9ccb2a6900dc8c7d02..0f22dac262bf555cb0ba85ca7102615dc6d7cc7d 100644
--- a/tensorflow/core/lib/strings/numbers_test.cc
+++ b/tensorflow/core/lib/strings/numbers_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/numbers.h"
 
+#include <cmath>
 #include <string>
 #include "tensorflow/core/platform/test.h"
 
@@ -277,7 +278,49 @@ TEST(safe_strtof, Float) {
   EXPECT_TRUE(safe_strtof("-0x2A", &result));
   EXPECT_EQ(-42.0f, result);
 
+  EXPECT_TRUE(safe_strtof(" -0x2", &result));
+  EXPECT_EQ(-2.0f, result);
+
+  EXPECT_TRUE(safe_strtof("8 \t", &result));
+  EXPECT_EQ(8.0f, result);
+
+  EXPECT_TRUE(safe_strtof("\t20.0\t ", &result));
+  EXPECT_EQ(20.0f, result);
+
   EXPECT_FALSE(safe_strtof("-infinity is awesome", &result));
+
+  // Make sure we exit cleanly if the string is not terminated
+  char test_str[2 * kFastToBufferSize];
+  for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
+  EXPECT_FALSE(safe_strtof(test_str, &result));
+
+  // Make sure we exit cleanly if the string is too long
+  test_str[kFastToBufferSize + 1] = '\0';
+  EXPECT_FALSE(safe_strtof(test_str, &result));
+
+  EXPECT_TRUE(safe_strtof("-inf", &result));
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("+inf", &result));
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("InF", &result));
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("-INF", &result));
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("-nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("-NaN", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("+NAN", &result));
+  EXPECT_TRUE(std::isnan(result));
 }
 
 TEST(safe_strtod, Double) {
@@ -287,6 +330,15 @@ TEST(safe_strtod, Double) {
   EXPECT_EQ(0.1234567890123, result);
   EXPECT_FALSE(safe_strtod("0.1234567890123abc", &result));
 
+  // Make sure we exit cleanly if the string is not terminated
+  char test_str[2 * kFastToBufferSize];
+  for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
+  EXPECT_FALSE(safe_strtod(test_str, &result));
+
+  // Make sure we exit cleanly if the string is too long
+  test_str[kFastToBufferSize + 1] = '\0';
+  EXPECT_FALSE(safe_strtod(test_str, &result));
+
   // Overflow to infinity, underflow to 0.
   EXPECT_TRUE(safe_strtod("1e310", &result));
   EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
@@ -296,6 +348,41 @@ TEST(safe_strtod, Double) {
 
   EXPECT_TRUE(safe_strtod("1e-325", &result));
   EXPECT_EQ(0, result);
+
+  EXPECT_TRUE(safe_strtod(" -0x1c", &result));
+  EXPECT_EQ(-28.0, result);
+
+  EXPECT_TRUE(safe_strtod("50 \t", &result));
+  EXPECT_EQ(50.0, result);
+
+  EXPECT_TRUE(safe_strtod("\t82.0\t ", &result));
+  EXPECT_EQ(82.0, result);
+
+  EXPECT_FALSE(safe_strtod("infinity", &result));
+
+  EXPECT_TRUE(safe_strtod("-inf", &result));
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("+inf", &result));
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("InF", &result));
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("-INF", &result));
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("-nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("-NaN", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("+NAN", &result));
+  EXPECT_TRUE(std::isnan(result));
 }
 
 }  // namespace strings
diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc
index fee8a6f93e9a56c1d3a152683a27ad4fec8950ef..ede9f4d390180501bd65c3cbfe301da86d7530a6 100644
--- a/tensorflow/core/lib/strings/ordered_code_test.cc
+++ b/tensorflow/core/lib/strings/ordered_code_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -128,7 +129,7 @@ void TestWriteAppends(T first, U second) {
   string encoded_first_only = encoded;
   OCWriteToString<U>(&encoded, second);
   EXPECT_NE(encoded, encoded_first_only);
-  EXPECT_TRUE(StringPiece(encoded).starts_with(encoded_first_only));
+  EXPECT_TRUE(str_util::StartsWith(encoded, encoded_first_only));
 }
 
 template <typename T>
diff --git a/tensorflow/core/lib/strings/proto_serialization.cc b/tensorflow/core/lib/strings/proto_serialization.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c1fbda2155492c00049f52ce12ae8da665cbda0
--- /dev/null
+++ b/tensorflow/core/lib/strings/proto_serialization.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
+                                    string* result) {
+  DCHECK_LE(msg.ByteSizeLong(), static_cast<size_t>(INT_MAX));
+  const int size = static_cast<int>(msg.ByteSizeLong());
+  *result = string(size, '\0');
+  protobuf::io::ArrayOutputStream array_stream(&(*result)[0], size);
+  protobuf::io::CodedOutputStream output_stream(&array_stream);
+  output_stream.SetSerializationDeterministic(true);
+  msg.SerializeWithCachedSizes(&output_stream);
+  return !output_stream.HadError() && size == output_stream.ByteCount();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/proto_serialization.h b/tensorflow/core/lib/strings/proto_serialization.h
new file mode 100644
index 0000000000000000000000000000000000000000..6664928e2818c747268ec1c361acce6bcf6c862e
--- /dev/null
+++ b/tensorflow/core/lib/strings/proto_serialization.h
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_
+
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Wrapper around protocol buffer serialization that requests deterministic
+// serialization, in particular for Map fields, which serialize in a random
+// order by default. Returns true on success.
+// Serialization is guaranteed to be deterministic for a given binary only.
+// See the following for more details:
+// https://github.com/google/protobuf/blob/a1bb147e96b6f74db6cdf3c3fcb00492472dbbfa/src/google/protobuf/io/coded_stream.h#L834
+bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
+                                    string* result);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_
diff --git a/tensorflow/core/lib/strings/scanner.h b/tensorflow/core/lib/strings/scanner.h
index d3b63357ee71394250a2e13f7895363b8d82de29..c82e771368c1c2b8a945e070cfcd45f72f91f0a3 100644
--- a/tensorflow/core/lib/strings/scanner.h
+++ b/tensorflow/core/lib/strings/scanner.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <string>
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
@@ -75,14 +76,14 @@ class Scanner {
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, this is a no-op.
   Scanner& ZeroOrOneLiteral(StringPiece s) {
-    cur_.Consume(s);
+    str_util::ConsumePrefix(&cur_, s);
     return *this;
   }
 
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, then GetResult will ultimately return false.
   Scanner& OneLiteral(StringPiece s) {
-    if (!cur_.Consume(s)) {
+    if (!str_util::ConsumePrefix(&cur_, s)) {
       error_ = true;
     }
     return *this;
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index d28857803d7ef1edd66ae6c1a6b81a7ed1dbce85..4598b8ccc79fcd7bad612974947bce97d6a7b120 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 
 #include <ctype.h>
+#include <algorithm>
 #include <vector>
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace str_util {
@@ -373,7 +375,7 @@ size_t RemoveWhitespaceContext(StringPiece* text) {
 }
 
 bool ConsumePrefix(StringPiece* s, StringPiece expected) {
-  if (s->starts_with(expected)) {
+  if (StartsWith(*s, expected)) {
     s->remove_prefix(expected.size());
     return true;
   }
@@ -381,7 +383,7 @@ bool ConsumePrefix(StringPiece* s, StringPiece expected) {
 }
 
 bool ConsumeSuffix(StringPiece* s, StringPiece expected) {
-  if (s->ends_with(expected)) {
+  if (EndsWith(*s, expected)) {
     s->remove_suffix(expected.size());
     return true;
   }
@@ -452,5 +454,30 @@ bool SplitAndParseAsFloats(StringPiece text, char delim,
                                     result);
 }
 
+size_t Strnlen(const char* str, const size_t string_max_len) {
+  size_t len = 0;
+  while (len < string_max_len && str[len] != '\0') {
+    ++len;
+  }
+  return len;
+}
+
+bool StrContains(StringPiece haystack, StringPiece needle) {
+  return std::search(haystack.begin(), haystack.end(), needle.begin(),
+                     needle.end()) != haystack.end();
+}
+
+bool StartsWith(StringPiece text, StringPiece prefix) {
+  return prefix.empty() ||
+         (text.size() >= prefix.size() &&
+          memcmp(text.data(), prefix.data(), prefix.size()) == 0);
+}
+
+bool EndsWith(StringPiece text, StringPiece suffix) {
+  return suffix.empty() || (text.size() >= suffix.size() &&
+                            memcmp(text.data() + (text.size() - suffix.size()),
+                                   suffix.data(), suffix.size()) == 0);
+}
+
 }  // namespace str_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index 44c52850fa99f7688fb496784a18b651c147bb8b..e97d00b975e2f6d53ab530656cd5acb68edfbde4 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 #include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -141,6 +140,21 @@ bool SplitAndParseAsInts(StringPiece text, char delim,
 bool SplitAndParseAsFloats(StringPiece text, char delim,
                            std::vector<float>* result);
 
+// StartsWith()
+//
+// Returns whether a given string `text` begins with `prefix`.
+bool StartsWith(StringPiece text, StringPiece prefix);
+
+// EndsWith()
+//
+// Returns whether a given string `text` ends with `suffix`.
+bool EndsWith(StringPiece text, StringPiece suffix);
+
+// StrContains()
+//
+// Returns whether a given string `haystack` contains the substring `needle`.
+bool StrContains(StringPiece haystack, StringPiece needle);
+
 // ------------------------------------------------------------------
 // Implementation details below
 template <typename T>
@@ -209,6 +223,11 @@ std::vector<string> Split(StringPiece text, char delims, Predicate p) {
   return Split(text, StringPiece(&delims, 1), p);
 }
 
+// Returns the length of the given null-terminated byte string 'str'.
+// Returns 'string_max_len' if the null character was not found in the first
+// 'string_max_len' bytes of 'str'.
+size_t Strnlen(const char* str, const size_t string_max_len);
+
 }  // namespace str_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 6d461241f7e9c5a29064c015991039d5bf95a80f..3bf3e99825f9a5e5b5c5af7c5613f7d8a1ebb161 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -430,4 +430,12 @@ TEST(StringReplace, EmptyStringReplaceAll) {
   EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
 }
 
+TEST(Strnlen, Basic) {
+  EXPECT_EQ(0, str_util::Strnlen("ab", 0));
+  EXPECT_EQ(1, str_util::Strnlen("a", 1));
+  EXPECT_EQ(2, str_util::Strnlen("abcd", 2));
+  EXPECT_EQ(3, str_util::Strnlen("abc", 10));
+  EXPECT_EQ(4, str_util::Strnlen("a \t\n", 10));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc
index 5b1cff486dba46ab761762b3076610e60d636711..f140ec3d260efdb7b82234706c5a33584c38cbb2 100644
--- a/tensorflow/core/lib/strings/strcat.cc
+++ b/tensorflow/core/lib/strings/strcat.cc
@@ -20,16 +20,12 @@ limitations under the License.
 #include <stdio.h>
 #include <string.h>
 
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace strings {
 
-AlphaNum::AlphaNum(const Eigen::half &f)
-    : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {}
-
 AlphaNum::AlphaNum(Hex hex) {
   char *const end = &digits_[kFastToBufferSize];
   char *writer = end;
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 2bc14945cd0413751003c03c7f5255c300790321..fb2cd5bc7e5fb69650dfc2758b132d73e88375a9 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -27,10 +27,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace Eigen {
-struct half;
-}
-
 // The AlphaNum type was designed to be used as the parameter type for StrCat().
 // Any routine accepting either a string or a number may accept it.
 // The basic idea is that by accepting a "const AlphaNum &" as an argument
@@ -105,27 +101,23 @@ class AlphaNum {
   // A bool ctor would also convert incoming pointers (bletch).
 
   AlphaNum(int i32)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastInt32ToBufferLeft(i32, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastInt32ToBufferLeft(i32, digits_)) {}
   AlphaNum(unsigned int u32)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastUInt32ToBufferLeft(u32, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastUInt32ToBufferLeft(u32, digits_)) {}
   AlphaNum(long x)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastInt64ToBufferLeft(x, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastInt64ToBufferLeft(x, digits_)) {}
   AlphaNum(unsigned long x)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastUInt64ToBufferLeft(x, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastUInt64ToBufferLeft(x, digits_)) {}
   AlphaNum(long long int i64)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastInt64ToBufferLeft(i64, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastInt64ToBufferLeft(i64, digits_)) {}
   AlphaNum(unsigned long long int u64)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastUInt64ToBufferLeft(u64, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastUInt64ToBufferLeft(u64, digits_)) {}
 
   AlphaNum(float f)  // NOLINT(runtime/explicit)
-      : piece_(digits_, strlen(FloatToBuffer(f, digits_))) {}
-  AlphaNum(bfloat16 f)  // NOLINT(runtime/explicit)
-      : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {
-  }
+      : piece_(digits_, FloatToBuffer(f, digits_)) {}
   AlphaNum(double f)  // NOLINT(runtime/explicit)
-      : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {}
+      : piece_(digits_, DoubleToBuffer(f, digits_)) {}
 
-  AlphaNum(const Eigen::half &f);  // NOLINT(runtime/explicit)
   AlphaNum(Hex hex);               // NOLINT(runtime/explicit)
 
   AlphaNum(const char *c_str) : piece_(c_str) {}   // NOLINT(runtime/explicit)
diff --git a/tensorflow/core/lib/strings/strcat_test.cc b/tensorflow/core/lib/strings/strcat_test.cc
index 7cb186e6375fae4d8a7140dd2f9ee6e7e64ddd1a..8cc64a6f0aecfd3dcce772b9a6c5c30ced86ba12 100644
--- a/tensorflow/core/lib/strings/strcat_test.cc
+++ b/tensorflow/core/lib/strings/strcat_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <string>
 
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -131,11 +130,6 @@ TEST(StrCat, Basics) {
   result = tensorflow::strings::StrCat("A hundred K and a half squared is ", d);
   EXPECT_EQ(result, "A hundred K and a half squared is 10000100000.25");
 
-  Eigen::half h(10007.0f);
-  result =
-      tensorflow::strings::StrCat("Ten thousand seven is approximately ", h);
-  EXPECT_EQ(result, "Ten thousand seven is approximately 10008");
-
   result = tensorflow::strings::StrCat(1, 2, 333, 4444, 55555, 666666, 7777777,
                                        88888888, 999999999);
   EXPECT_EQ(result, "12333444455555666666777777788888888999999999");
diff --git a/tensorflow/core/lib/strings/stringprintf.cc b/tensorflow/core/lib/strings/stringprintf.cc
index 03eba4c851f8a9c7c65ddf9396d87800e0ac57df..bbffa062a93e554d9258ad29316c5e8ee2cf428e 100644
--- a/tensorflow/core/lib/strings/stringprintf.cc
+++ b/tensorflow/core/lib/strings/stringprintf.cc
@@ -22,12 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-#ifdef COMPILER_MSVC
-enum { IS_COMPILER_MSVC = 1 };
-#else
-enum { IS_COMPILER_MSVC = 0 };
-#endif
-
 void Appendv(string* dst, const char* format, va_list ap) {
   // First try with a small fixed size buffer
   static const int kSpaceLength = 1024;
@@ -48,13 +42,13 @@ void Appendv(string* dst, const char* format, va_list ap) {
       return;
     }
 
-    if (IS_COMPILER_MSVC) {
+#ifdef _MSC_VER
       // Error or MSVC running out of space.  MSVC 8.0 and higher
       // can be asked about space needed with the special idiom below:
       va_copy(backup_ap, ap);
       result = vsnprintf(nullptr, 0, format, backup_ap);
       va_end(backup_ap);
-    }
+#endif
 
     if (result < 0) {
       // Just an error.
diff --git a/tensorflow/core/lib/strings/stringprintf_test.cc b/tensorflow/core/lib/strings/stringprintf_test.cc
index d61a1a945ae0b6cab5e472b41402772f04875077..02cf4cbcadc9f64cc2249f6d1b104c1a6fab103f 100644
--- a/tensorflow/core/lib/strings/stringprintf_test.cc
+++ b/tensorflow/core/lib/strings/stringprintf_test.cc
@@ -30,9 +30,9 @@ TEST(PrintfTest, Empty) {
 
 TEST(PrintfTest, Misc) {
 // MSVC does not support $ format specifier.
-#if !defined(COMPILER_MSVC)
+#if !defined(_MSC_VER)
   EXPECT_EQ("123hello w", Printf("%3$d%2$s %1$c", 'w', "hello", 123));
-#endif  // !COMPILER_MSVC
+#endif  // !_MSC_VER
 }
 
 TEST(AppendfTest, Empty) {
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 77d3c88998e655ec429469a8e4bc5535b95bf1b9..36d939e0611a21a6753d2e3af6d3c1316aacdd89 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -81,13 +81,38 @@ inline float Int16SampleToFloat(int16 data) {
   return data * kMultiplier;
 }
 
+}  // namespace
+
+// Handles moving the data index forward, validating the arguments, and avoiding
+// overflow or underflow.
+Status IncrementOffset(int old_offset, size_t increment, size_t max_size,
+                       int* new_offset) {
+  if (old_offset < 0) {
+    return errors::InvalidArgument("Negative offsets are not allowed: ",
+                                   old_offset);
+  }
+  if (old_offset > max_size) {
+    return errors::InvalidArgument("Initial offset is outside data range: ",
+                                   old_offset);
+  }
+  *new_offset = old_offset + increment;
+  if (*new_offset > max_size) {
+    return errors::InvalidArgument("Data too short when trying to read string");
+  }
+  // See above for the check that the input offset is positive. If it's negative
+  // here then it means that there's been an overflow in the arithmetic.
+  if (*new_offset < 0) {
+    return errors::InvalidArgument("Offset too large, overflowed: ",
+                                   *new_offset);
+  }
+  return Status::OK();
+}
+
 Status ExpectText(const string& data, const string& expected_text,
                   int* offset) {
-  const int new_offset = *offset + expected_text.size();
-  if (new_offset > data.size()) {
-    return errors::InvalidArgument("Data too short when trying to read ",
-                                   expected_text);
-  }
+  int new_offset;
+  TF_RETURN_IF_ERROR(
+      IncrementOffset(*offset, expected_text.size(), data.size(), &new_offset));
   const string found_text(data.begin() + *offset, data.begin() + new_offset);
   if (found_text != expected_text) {
     return errors::InvalidArgument("Header mismatch: Expected ", expected_text,
@@ -97,40 +122,16 @@ Status ExpectText(const string& data, const string& expected_text,
   return Status::OK();
 }
 
-template <class T>
-Status ReadValue(const string& data, T* value, int* offset) {
-  const int new_offset = *offset + sizeof(T);
-  if (new_offset > data.size()) {
-    return errors::InvalidArgument("Data too short when trying to read value");
-  }
-  if (port::kLittleEndian) {
-    memcpy(value, data.data() + *offset, sizeof(T));
-  } else {
-    *value = 0;
-    const uint8* data_buf =
-        reinterpret_cast<const uint8*>(data.data() + *offset);
-    int shift = 0;
-    for (int i = 0; i < sizeof(T); ++i, shift += 8) {
-      *value = *value | (data_buf[i] << shift);
-    }
-  }
-  *offset = new_offset;
-  return Status::OK();
-}
-
 Status ReadString(const string& data, int expected_length, string* value,
                   int* offset) {
-  const int new_offset = *offset + expected_length;
-  if (new_offset > data.size()) {
-    return errors::InvalidArgument("Data too short when trying to read string");
-  }
+  int new_offset;
+  TF_RETURN_IF_ERROR(
+      IncrementOffset(*offset, expected_length, data.size(), &new_offset));
   *value = string(data.begin() + *offset, data.begin() + new_offset);
   *offset = new_offset;
   return Status::OK();
 }
 
-}  // namespace
-
 Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate,
                              size_t num_channels, size_t num_frames,
                              string* wav_string) {
@@ -272,6 +273,11 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
     TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &chunk_id, &offset));
     uint32 chunk_size;
     TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &chunk_size, &offset));
+    if (chunk_size > std::numeric_limits<int32>::max()) {
+      return errors::InvalidArgument(
+          "WAV data chunk '", chunk_id, "' is too large: ", chunk_size,
+          " bytes, but the limit is ", std::numeric_limits<int32>::max());
+    }
     if (chunk_id == kDataChunkId) {
       if (was_data_found) {
         return errors::InvalidArgument("More than one data chunk found in WAV");
@@ -279,6 +285,12 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
       was_data_found = true;
       *sample_count = chunk_size / bytes_per_sample;
       const uint32 data_count = *sample_count * *channel_count;
+      int unused_new_offset = 0;
+      // Validate that the data exists before allocating space for it
+      // (prevent easy OOM errors).
+      TF_RETURN_IF_ERROR(IncrementOffset(offset, sizeof(int16) * data_count,
+                                         wav_string.size(),
+                                         &unused_new_offset));
       float_values->resize(data_count);
       for (int i = 0; i < data_count; ++i) {
         int16 single_channel_value = 0;
diff --git a/tensorflow/core/lib/wav/wav_io.h b/tensorflow/core/lib/wav/wav_io.h
index adca0ee3034a1a850707be1b64917931be321f97..f004524177eef56a876f7aa7cfe6bf80559f4335 100644
--- a/tensorflow/core/lib/wav/wav_io.h
+++ b/tensorflow/core/lib/wav/wav_io.h
@@ -21,6 +21,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -55,6 +58,36 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
                                     uint32* sample_count, uint16* channel_count,
                                     uint32* sample_rate);
 
+// Everything below here is only exposed publicly for testing purposes.
+
+// Handles moving the data index forward, validating the arguments, and avoiding
+// overflow or underflow.
+Status IncrementOffset(int old_offset, size_t increment, size_t max_size,
+                       int* new_offset);
+
+// This function is only exposed in the header for testing purposes, as a
+// template that needs to be instantiated. Reads a typed numeric value from a
+// stream of data.
+template <class T>
+Status ReadValue(const string& data, T* value, int* offset) {
+  int new_offset;
+  TF_RETURN_IF_ERROR(
+      IncrementOffset(*offset, sizeof(T), data.size(), &new_offset));
+  if (port::kLittleEndian) {
+    memcpy(value, data.data() + *offset, sizeof(T));
+  } else {
+    *value = 0;
+    const uint8* data_buf =
+        reinterpret_cast<const uint8*>(data.data() + *offset);
+    int shift = 0;
+    for (int i = 0; i < sizeof(T); ++i, shift += 8) {
+      *value = *value | (data_buf[i] << shift);
+    }
+  }
+  *offset = new_offset;
+  return Status::OK();
+}
+
 }  // namespace wav
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc
index 40ddd94abef649285528b6ee177620eedb07df20..9e41da6a20dc5c1491786a5391e12ca7aa2e722c 100644
--- a/tensorflow/core/lib/wav/wav_io_test.cc
+++ b/tensorflow/core/lib/wav/wav_io_test.cc
@@ -19,12 +19,19 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace wav {
 
+// These are defined in wav_io.cc, and the signatures are here so we don't have
+// to expose them in the public header.
+Status ExpectText(const string& data, const string& expected_text, int* offset);
+Status ReadString(const string& data, int expected_length, string* value,
+                  int* offset);
+
 TEST(WavIO, BadArguments) {
   float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
   string result;
@@ -155,5 +162,318 @@ TEST(WavIO, BasicStereo) {
   EXPECT_EQ(expected, result);
 }
 
+// Test how chunk sizes larger than 2GB are handled, since they're stored as
+// unsigned int32s, so there are lots of ways for conversions to confuse the
+// decoding logic. The expected behavior is to fail with an error, since such
+// large WAV files are not common, and are unsupported by many readers.
+// See b/72655902.
+TEST(WavIO, ChunkSizeOverflow) {
+  std::vector<uint8> wav_data = {
+      'R', 'I', 'F', 'F',      // ChunkID
+      60, 0, 0, 0,             // ChunkSize: 36 + SubChunk2Size
+      'W', 'A', 'V', 'E',      // Format
+      'f', 'm', 't', ' ',      // Subchunk1ID
+      16, 0, 0, 0,             // Subchunk1Size
+      1, 0,                    // AudioFormat: 1=PCM
+      1, 0,                    // NumChannels
+      0x44, 0xac, 0, 0,        // SampleRate: 44100
+      0x88, 0x58, 0x1, 0,      // BytesPerSecond: SampleRate * NumChannels *
+                               //                 BitsPerSample/8
+      2, 0,                    // BytesPerSample: NumChannels * BitsPerSample/8
+      16, 0,                   // BitsPerSample
+      'd', 'a', 't', 'a',      // Subchunk2ID
+      8, 0, 0, 0,              // Subchunk2Size: NumSamples * NumChannels *
+                               //                BitsPerSample/8
+      0, 0,                    // Sample 1: 0
+      0xff, 0x7f,              // Sample 2: 32767 (saturated)
+      0, 0,                    // Sample 3: 0
+      0x00, 0x80,              // Sample 4: -32768 (saturated)
+      'f', 'o', 'o', 'o',      // Subchunk2ID
+      0xff, 0xff, 0xff, 0xf8,  // Chunk size that could cause an infinite loop.
+      0, 0,                    // Sample 1: 0
+      0xff, 0x7f,              // Sample 2: 32767 (saturated)
+      0, 0,                    // Sample 3: 0
+      0x00, 0x80,              // Sample 4: -32768 (saturated)
+  };
+  string wav_data_string(wav_data.begin(), wav_data.end());
+  std::vector<float> decoded_audio;
+  uint32 decoded_sample_count;
+  uint16 decoded_channel_count;
+  uint32 decoded_sample_rate;
+  Status decode_status = DecodeLin16WaveAsFloatVector(
+      wav_data_string, &decoded_audio, &decoded_sample_count,
+      &decoded_channel_count, &decoded_sample_rate);
+  EXPECT_FALSE(decode_status.ok());
+  EXPECT_TRUE(str_util::StrContains(decode_status.error_message(), "too large"))
+      << decode_status.error_message();
+}
+
+TEST(WavIO, IncrementOffset) {
+  int new_offset = -1;
+  TF_EXPECT_OK(IncrementOffset(0, 10, 20, &new_offset));
+  EXPECT_EQ(10, new_offset);
+
+  new_offset = -1;
+  TF_EXPECT_OK(IncrementOffset(10, 4, 20, &new_offset));
+  EXPECT_EQ(14, new_offset);
+
+  new_offset = -1;
+  TF_EXPECT_OK(IncrementOffset(99, 1, 100, &new_offset));
+  EXPECT_EQ(100, new_offset);
+
+  new_offset = -1;
+  EXPECT_FALSE(IncrementOffset(-1, 1, 100, &new_offset).ok());
+
+  new_offset = -1;
+  EXPECT_FALSE(IncrementOffset(0, -1, 100, &new_offset).ok());
+
+  new_offset = -1;
+  EXPECT_FALSE(IncrementOffset(std::numeric_limits<int>::max(), 1,
+                               std::numeric_limits<int>::max(), &new_offset)
+                   .ok());
+
+  new_offset = -1;
+  EXPECT_FALSE(IncrementOffset(101, 1, 100, &new_offset).ok());
+}
+
+TEST(WavIO, ExpectText) {
+  std::vector<uint8> test_data = {
+      'E', 'x', 'p', 'e', 'c', 't', 'e', 'd',
+  };
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  TF_EXPECT_OK(ExpectText(test_string, "Expected", &offset));
+  EXPECT_EQ(8, offset);
+
+  offset = 0;
+  Status expect_status = ExpectText(test_string, "Unexpected", &offset);
+  EXPECT_FALSE(expect_status.ok());
+
+  offset = 0;
+  TF_EXPECT_OK(ExpectText(test_string, "Exp", &offset));
+  EXPECT_EQ(3, offset);
+  TF_EXPECT_OK(ExpectText(test_string, "ected", &offset));
+  EXPECT_EQ(8, offset);
+  expect_status = ExpectText(test_string, "foo", &offset);
+  EXPECT_FALSE(expect_status.ok());
+}
+
+TEST(WavIO, ReadString) {
+  std::vector<uint8> test_data = {
+      'E', 'x', 'p', 'e', 'c', 't', 'e', 'd',
+  };
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  string read_value;
+  TF_EXPECT_OK(ReadString(test_string, 2, &read_value, &offset));
+  EXPECT_EQ("Ex", read_value);
+  EXPECT_EQ(2, offset);
+
+  TF_EXPECT_OK(ReadString(test_string, 6, &read_value, &offset));
+  EXPECT_EQ("pected", read_value);
+  EXPECT_EQ(8, offset);
+
+  Status read_status = ReadString(test_string, 3, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
+TEST(WavIO, ReadValueInt8) {
+  std::vector<uint8> test_data = {0x00, 0x05, 0xff, 0x80};
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  int8 read_value;
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(0, read_value);
+  EXPECT_EQ(1, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(5, read_value);
+  EXPECT_EQ(2, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(-1, read_value);
+  EXPECT_EQ(3, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(-128, read_value);
+  EXPECT_EQ(4, offset);
+
+  Status read_status = ReadValue(test_string, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
+TEST(WavIO, ReadValueUInt8) {
+  std::vector<uint8> test_data = {0x00, 0x05, 0xff, 0x80};
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  uint8 read_value;
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(0, read_value);
+  EXPECT_EQ(1, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(5, read_value);
+  EXPECT_EQ(2, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(255, read_value);
+  EXPECT_EQ(3, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(128, read_value);
+  EXPECT_EQ(4, offset);
+
+  Status read_status = ReadValue(test_string, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
+TEST(WavIO, ReadValueInt16) {
+  std::vector<uint8> test_data = {
+      0x00, 0x00,  // 0
+      0xff, 0x00,  // 255
+      0x00, 0x01,  // 256
+      0xff, 0xff,  // -1
+      0x00, 0x80,  // -32768
+  };
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  int16 read_value;
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(0, read_value);
+  EXPECT_EQ(2, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(255, read_value);
+  EXPECT_EQ(4, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(256, read_value);
+  EXPECT_EQ(6, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(-1, read_value);
+  EXPECT_EQ(8, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(-32768, read_value);
+  EXPECT_EQ(10, offset);
+
+  Status read_status = ReadValue(test_string, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
+TEST(WavIO, ReadValueUInt16) {
+  std::vector<uint8> test_data = {
+      0x00, 0x00,  // 0
+      0xff, 0x00,  // 255
+      0x00, 0x01,  // 256
+      0xff, 0xff,  // 65535
+      0x00, 0x80,  // 32768
+  };
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  uint16 read_value;
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(0, read_value);
+  EXPECT_EQ(2, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(255, read_value);
+  EXPECT_EQ(4, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(256, read_value);
+  EXPECT_EQ(6, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(65535, read_value);
+  EXPECT_EQ(8, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(32768, read_value);
+  EXPECT_EQ(10, offset);
+
+  Status read_status = ReadValue(test_string, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
+TEST(WavIO, ReadValueInt32) {
+  std::vector<uint8> test_data = {
+      0x00, 0x00, 0x00, 0x00,  // 0
+      0xff, 0x00, 0x00, 0x00,  // 255
+      0x00, 0xff, 0x00, 0x00,  // 65280
+      0x00, 0x00, 0xff, 0x00,  // 16,711,680
+      0xff, 0xff, 0xff, 0xff,  // -1
+  };
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  int32 read_value;
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(0, read_value);
+  EXPECT_EQ(4, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(255, read_value);
+  EXPECT_EQ(8, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(65280, read_value);
+  EXPECT_EQ(12, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(16711680, read_value);
+  EXPECT_EQ(16, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(-1, read_value);
+  EXPECT_EQ(20, offset);
+
+  Status read_status = ReadValue(test_string, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
+TEST(WavIO, ReadValueUInt32) {
+  std::vector<uint8> test_data = {
+      0x00, 0x00, 0x00, 0x00,  // 0
+      0xff, 0x00, 0x00, 0x00,  // 255
+      0x00, 0xff, 0x00, 0x00,  // 65280
+      0x00, 0x00, 0xff, 0x00,  // 16,711,680
+      0xff, 0xff, 0xff, 0xff,  // 4,294,967,295
+  };
+  string test_string(test_data.begin(), test_data.end());
+
+  int offset = 0;
+  uint32 read_value;
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(0, read_value);
+  EXPECT_EQ(4, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(255, read_value);
+  EXPECT_EQ(8, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(65280, read_value);
+  EXPECT_EQ(12, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(16711680, read_value);
+  EXPECT_EQ(16, offset);
+
+  TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
+  EXPECT_EQ(4294967295, read_value);
+  EXPECT_EQ(20, offset);
+
+  Status read_status = ReadValue(test_string, &read_value, &offset);
+  EXPECT_FALSE(read_status.ok());
+}
+
 }  // namespace wav
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 267ce88440080399aae783903503f0bbd025d8b4..88fc03826a8dcc99c875e186dacbc31456d6fbcf 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -27,6 +27,7 @@ namespace tensorflow {
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
 
 namespace {
 
@@ -178,46 +179,88 @@ Status SetOutputShapeForReshape(InferenceContext* c) {
     c->set_output(0, out);
     return Status::OK();
   }
-  DimensionHandle num_in_elems = c->NumElements(in);
-  if (c->FullyDefined(out)) {
-    DimensionHandle num_out_elems = c->NumElements(out);
-    if (c->ValueKnown(num_in_elems) &&
-        c->Value(num_in_elems) != c->Value(num_out_elems)) {
-      return errors::InvalidArgument(
-          "Cannot reshape a tensor with ", c->DebugString(num_in_elems),
-          " elements to shape ", c->DebugString(out), " (",
-          c->DebugString(num_out_elems), " elements)");
-    }
-    c->set_output(0, out);
-    return Status::OK();
-  }
 
-  if (c->ValueKnown(num_in_elems)) {
+  if (c->RankKnown(out) && c->RankKnown(in)) {
     // We don't know the number of output elements, but we can try to infer
     // the missing dimension.
-    int32 unknown_idx = -1;
     bool too_many_unknown = false;
-    DimensionHandle known_elems = c->MakeDim(1);
-    for (int32 i = 0; i < c->Rank(out); ++i) {
-      DimensionHandle dim = c->Dim(out, i);
-      if (!c->ValueKnown(dim)) {
-        if (unknown_idx >= 0) {
-          too_many_unknown = true;
-          break;
+    int32 out_unknown_idx = -1;
+
+    DimensionHandle known_out_elems = c->NumElements(out);
+    if (!c->ValueKnown(known_out_elems)) {
+      known_out_elems = c->MakeDim(1);
+      for (int32 i = 0; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        if (!c->ValueKnown(dim)) {
+          if (out_unknown_idx >= 0) {
+            too_many_unknown = true;
+            break;
+          }
+          out_unknown_idx = i;
+        } else {
+          TF_RETURN_IF_ERROR(
+              c->Multiply(known_out_elems, dim, &known_out_elems));
+        }
+      }
+    }
+    int32 in_unknown_idx = -1;
+    DimensionHandle known_in_elems = c->NumElements(in);
+    if (!c->ValueKnown(known_in_elems)) {
+      known_in_elems = c->MakeDim(1);
+      for (int32 i = 0; i < c->Rank(in); ++i) {
+        DimensionHandle dim = c->Dim(in, i);
+        if (!c->ValueKnown(dim)) {
+          if (in_unknown_idx >= 0) {
+            too_many_unknown = true;
+            break;
+          }
+          in_unknown_idx = i;
+        } else {
+          TF_RETURN_IF_ERROR(c->Multiply(known_in_elems, dim, &known_in_elems));
         }
-        unknown_idx = i;
-      } else {
-        TF_RETURN_IF_ERROR(c->Multiply(known_elems, dim, &known_elems));
       }
     }
-    if (!too_many_unknown && c->Value(known_elems) != 0) {
-      DimensionHandle inferred_dim;
-      TF_RETURN_IF_ERROR(c->Divide(num_in_elems, c->Value(known_elems),
-                                   true /* evenly_divisible */, &inferred_dim));
-      TF_RETURN_IF_ERROR(c->ReplaceDim(out, unknown_idx, inferred_dim, &out));
+
+    if (!too_many_unknown) {
+      if (in_unknown_idx < 0 && out_unknown_idx < 0) {
+        // Just check that the dimensions match.
+        if (c->Value(known_in_elems) != c->Value(known_out_elems)) {
+          return errors::InvalidArgument(
+              "Cannot reshape a tensor with ", c->DebugString(known_in_elems),
+              " elements to shape ", c->DebugString(out), " (",
+              c->DebugString(known_out_elems), " elements)");
+        }
+      } else if (in_unknown_idx < 0 && out_unknown_idx >= 0 &&
+                 c->Value(known_out_elems) > 0) {
+        // Input fully known, infer the one missing output dim
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(known_in_elems, c->Value(known_out_elems),
+                                     true /* evenly_divisible */,
+                                     &inferred_dim));
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(out, out_unknown_idx, inferred_dim, &out));
+
+      } else if (in_unknown_idx >= 0 && out_unknown_idx < 0 &&
+                 c->Value(known_in_elems) != 0) {
+        // Output fully known, infer the one missing input dim
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(known_out_elems, c->Value(known_in_elems),
+                                     true /* evenly_divisible */,
+                                     &inferred_dim));
+        DimensionHandle unknown_in_dim = c->Dim(in, in_unknown_idx);
+        TF_RETURN_IF_ERROR(
+            c->Merge(unknown_in_dim, inferred_dim, &unknown_in_dim));
+      } else if (in_unknown_idx >= 0 && out_unknown_idx >= 0) {
+        // Exactly one unknown dimension in both input and output. These 2 are
+        // equal iff the known elements are equal.
+        if (c->Value(known_in_elems) == c->Value(known_out_elems)) {
+          DimensionHandle unknown_in_dim = c->Dim(in, in_unknown_idx);
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(out, out_unknown_idx, unknown_in_dim, &out));
+        }
+      }
     }
   }
-
   c->set_output(0, out);
   return Status::OK();
 }
@@ -299,6 +342,50 @@ REGISTER_OP("Pack")
       return Status::OK();
     });
 
+REGISTER_OP("DeepCopy")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetIsStateful()
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceUpdate")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceAdd")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceSub")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("Empty")
+    .Input("shape: int32")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("init: bool = false")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 REGISTER_OP("Unpack")
     .Input("value: T")
@@ -342,6 +429,58 @@ REGISTER_OP("UnravelIndex")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
 
+REGISTER_OP("BroadcastTo")
+    .Input("input: T")
+    .Input("shape: Tidx")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle in = c->input(0);
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+
+      if (!c->RankKnown(out)) {
+        // We have no information about the shape of the output.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+
+      if (!c->RankKnown(in)) {
+        // We have no information about the shape of the input,
+        // nothing to do here.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+      if (c->Rank(out) < c->Rank(in)) {
+        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
+                                       c->DebugString(in), " shape ",
+                                       c->DebugString(out));
+      }
+
+      int32 in_offset = c->Rank(out) - c->Rank(in);
+      for (int32 i = 0; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        if (c->ValueKnown(dim)) {
+          // The first in_offset dimensions for input will be expanded with 1,
+          // so no check needed.
+          if (i >= in_offset) {
+            DimensionHandle in_dim = c->Dim(in, i - in_offset);
+            if (c->ValueKnown(in_dim)) {
+              if (c->Value(dim) % c->Value(in_dim) != 0) {
+                return errors::InvalidArgument(
+                    "Cannot broadcast a tensor with shape ", c->DebugString(in),
+                    " shape ", c->DebugString(out));
+              }
+            }
+          }
+        }
+      }
+
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
@@ -452,9 +591,9 @@ REGISTER_OP("SplitV")
       const Tensor* size_splits = c->input_tensor(1);
       if (rank == InferenceContext::kUnknownRank) {
         // If the rank of input tensor is unknown, then return unknown shapes.
-        output_shape = c->UnknownShape();
+        // Note that the shape of each output can be different.
         for (int i = 0; i < num_outputs; ++i) {
-          c->set_output(i, output_shape);
+          c->set_output(i, c->UnknownShape());
         }
       } else if (rank == 0) {
         // Throw error if input is a scalar.
@@ -463,18 +602,19 @@ REGISTER_OP("SplitV")
         // If split dimension is known, but the sizes are unknown, then
         // only the split dimension is unknown
         output_shape = input;
-        TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
-                                         c->Value(split_dimension),
-                                         c->UnknownDim(), &output_shape));
         for (int i = 0; i < num_outputs; ++i) {
+          TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
+                                           c->Value(split_dimension),
+                                           c->UnknownDim(), &output_shape));
           c->set_output(i, output_shape);
         }
       } else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
         // If split dimension or tensor containing the split sizes is unknown,
-        // then return unknown shapes of same rank as input.
-        output_shape = c->UnknownShapeOfRank(rank);
+        // then return unknown shapes of same rank as input. Note that each
+        // output shape can be different since splitv doesn't always split
+        // tensors evenly.
         for (int i = 0; i < num_outputs; ++i) {
-          c->set_output(i, output_shape);
+          c->set_output(i, c->UnknownShapeOfRank(rank));
         }
       } else {
         // Determine the output shape if split dimension and split sizes are
@@ -579,7 +719,7 @@ REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "T: {bfloat16, half, float, double, int8, uint8, int16, uint16, int32, "
         "int64, complex64, complex128, bool}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -587,7 +727,9 @@ REGISTER_OP("OnesLike")
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -602,7 +744,9 @@ REGISTER_OP("Diag")
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -746,17 +890,41 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, bfloat16, half, "
         "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &axis));
-      // TODO(aselle): if input(0)'s dimension is known we could validate axis
       if (c->Rank(input) > 8) {
         return errors::InvalidArgument(
             "reverse does not work on tensors with more than 8 dimensions");
       }
+      const Tensor* axis_tensor = c->input_tensor(1);
+      if (axis_tensor != nullptr && c->RankKnown(input)) {
+        int32 rank = c->Rank(input);
+        std::vector<int64> axis_value;
+        if (axis_tensor->dtype() == DT_INT32) {
+          axis_value = AsInt64<int32>(axis_tensor, axis_tensor->NumElements());
+        } else {
+          axis_value = AsInt64<int64>(axis_tensor, axis_tensor->NumElements());
+        }
+        std::vector<bool> axes_dense(c->Rank(input), false);
+        for (int i = 0; i < axis_value.size(); i++) {
+          int64 canonical_axis =
+              axis_value[i] < 0 ? rank + axis_value[i] : axis_value[i];
+          if (canonical_axis < 0 || canonical_axis >= rank) {
+            return errors::InvalidArgument("'axis'[", i, "] = ", axis_value[i],
+                                           " is out of valid range [", 0, ", ",
+                                           rank - 1);
+          }
+          if (axes_dense[canonical_axis]) {
+            return errors::InvalidArgument("axis ", canonical_axis,
+                                           " specified more than once.");
+          }
+          axes_dense[canonical_axis] = true;
+        }
+      }
       c->set_output(0, input);
       return Status::OK();
     });
@@ -1098,7 +1266,7 @@ REGISTER_OP("PreventGradient")
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .Attr("message: string")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -1168,7 +1336,9 @@ REGISTER_OP("Unique")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(1, c->input(0));
-      return Status::OK();
+      // Assert that the input rank is 1.
+      ShapeHandle dummy;
+      return c->WithRank(c->input(0), 1, &dummy);
     });
 
 REGISTER_OP("UniqueV2")
@@ -1201,6 +1371,23 @@ REGISTER_OP("UniqueWithCounts")
       return Status::OK();
     });
 
+REGISTER_OP("UniqueWithCountsV2")
+    .Input("x: T")
+    .Input("axis: Taxis")
+    .Output("y: T")
+    .Output("idx: out_idx")
+    .Output("count: out_idx")
+    .Attr("T: type")
+    .Attr("Taxis: {int32,int64} = DT_INT64")
+    .Attr("out_idx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      auto uniq = c->Vector(InferenceContext::kUnknownDim);
+      c->set_output(0, uniq);
+      c->set_output(1, c->input(0));
+      c->set_output(2, uniq);
+      return Status::OK();
+    });
+
 namespace {
 
 Status ShapeShapeFn(InferenceContext* c) {
@@ -1547,6 +1734,9 @@ REGISTER_OP("Tile")
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &multiples));
       if (c->RankKnown(input)) {
         TF_RETURN_IF_ERROR(c->WithRank(multiples, c->Rank(input), &multiples));
+        ShapeHandle dummy;
+        TF_RETURN_IF_ERROR(
+            c->Merge(c->input(1), c->Vector(c->Rank(input)), &dummy));
       }
 
       if (!c->RankKnown(multiples)) {
@@ -2361,13 +2551,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "T: {bfloat16, half, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32}")
     .Attr(
-        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
-        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "type: {bfloat16, half, float, double, int64, int32, uint8, uint16, "
+        "int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, "
+        "qint32}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       if (!c->RankKnown(input)) {
@@ -2463,7 +2652,7 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Deprecated(22, "Replaced by QuantizeAndDequantizeV2");
 
@@ -2476,7 +2665,7 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -2493,7 +2682,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 86d64635f4c1bc1c34407a517267758ce5cf60fc..b1463338fbe726e10a3fb0a2cdc69521ab021ce6 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -368,7 +368,11 @@ TEST(ArrayOpsTest, ShapeN_ShapeFn) {
 TEST(ArrayOpsTest, Unique_ShapeFn) {
   ShapeInferenceTestOp op("Unique");
   INFER_OK(op, "?", "[?];in0");
-  INFER_OK(op, "[1,2,3,?,5]", "[?];in0");
+  INFER_OK(op, "[5]", "[?];in0");
+  INFER_ERROR(
+      "Shape must be rank 1 but is rank 5 for '' (op: '') with input shapes: "
+      "[1,2,3,?,5].",
+      op, "[1,2,3,?,5]");
 }
 
 TEST(ArrayOpsTest, UniqueWithCounts_ShapeFn) {
@@ -834,7 +838,7 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) {
   // Unknown dimensions.
   // Flatten:
   new_shape = test::AsTensor<int32>({-1});
-  INFER_OK(op, "[?];[1]", "[?]");
+  INFER_OK(op, "[?];[1]", "[d0_0]");
   INFER_OK(op, "[2,2];[1]", "[4]");
   // The first dimension is inferred:
   new_shape = test::AsTensor<int32>({2, -1});
@@ -847,6 +851,10 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) {
   new_shape = test::AsTensor<int32>({-1, -1, 2});
   INFER_OK(op, "[8];[3]", "[?,?,2]");
 
+  // Symbolic shape propagation
+  new_shape = test::AsTensor<int32>({-1, 2, 3});
+  INFER_OK(op, "[?,2,3];[3]", "[d0_0,2,3]");
+
   // Reshaping to a scalar.
   new_shape = test::AsTensor<int32>({});
   INFER_OK(op, "[1];[0]", "[]");
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88d6eaf819e1166c9fbcecfe1689eb52d90954d7
--- /dev/null
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -0,0 +1,310 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_OP(BoostedTreesEnsembleResource);
+
+REGISTER_OP("IsBoostedTreesEnsembleInitialized")
+    .Input("tree_ensemble_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
+    .Input("node_id_range: int32")
+    .Input("stats_summary_list: num_features * float32")
+    .Input("l1: float")
+    .Input("l2: float")
+    .Input("tree_complexity: float")
+    .Input("min_node_weight: float")
+    .Attr("max_splits: int >= 1")
+    .Attr("num_features: int >= 1")  // not passed but populated automatically.
+    .Output("node_ids_list: num_features * int32")
+    .Output("gains_list: num_features * float32")
+    .Output("thresholds_list: num_features * int32")
+    .Output("left_node_contribs_list: num_features * float32")
+    .Output("right_node_contribs_list: num_features * float32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Confirms the rank of the inputs and sets the shape of the outputs.
+      int max_splits;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+      shape_inference::ShapeHandle node_id_range_shape;
+      shape_inference::ShapeHandle unused_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &node_id_range_shape));
+      TF_RETURN_IF_ERROR(
+          c->Merge(node_id_range_shape, c->MakeShape({2}), &unused_shape));
+      // Checks that all stats summary entries are of the same shape.
+      shape_inference::ShapeHandle summary_shape_base;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &summary_shape_base));
+      TF_RETURN_IF_ERROR(c->Merge(summary_shape_base,
+                                  c->MakeShape({max_splits, -1, 2}),
+                                  &unused_shape));
+      for (int i = 1; i < num_features; ++i) {
+        shape_inference::ShapeHandle summary_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(1 + i), 3, &summary_shape));
+        TF_RETURN_IF_ERROR(
+            c->Merge(summary_shape_base, summary_shape, &unused_shape));
+      }
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 1), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 2), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 3), 0, &unused_shape));
+      // Sets the output lists.
+      std::vector<shape_inference::ShapeHandle> output_shapes_vec(
+          num_features, c->MakeShape({-1}));
+      TF_RETURN_IF_ERROR(c->set_output("node_ids_list", output_shapes_vec));
+      TF_RETURN_IF_ERROR(c->set_output("gains_list", output_shapes_vec));
+      TF_RETURN_IF_ERROR(c->set_output("thresholds_list", output_shapes_vec));
+      std::vector<shape_inference::ShapeHandle> output_shapes_contribs(
+          num_features, c->MakeShape({-1, 1}));
+      TF_RETURN_IF_ERROR(
+          c->set_output("left_node_contribs_list", output_shapes_contribs));
+      TF_RETURN_IF_ERROR(
+          c->set_output("right_node_contribs_list", output_shapes_contribs));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesCreateEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesDeserializeEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesGetEnsembleStates")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("num_trees: int32")
+    .Output("num_finalized_trees: int32")
+    .Output("num_attempted_layers: int32")
+    .Output("last_layer_nodes_range: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      c->set_output(3, c->Scalar());
+      c->set_output(4, c->Vector(2));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesMakeStatsSummary")
+    .Input("node_ids: int32")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .Input("bucketized_features_list: num_features * int32")
+    .Attr("max_splits: int >= 1")
+    .Attr("num_buckets: int >= 1")
+    .Attr("num_features: int >= 1")
+    .Output("stats_summary: float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Sets the shape of the output as a Rank 4 Tensor.
+      int max_splits;
+      int num_buckets;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_buckets", &num_buckets));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+      shape_inference::ShapeHandle node_ids_shape;
+      shape_inference::ShapeHandle gradients_shape;
+      shape_inference::ShapeHandle hessians_shape;
+      shape_inference::ShapeHandle bucketized_feature_shape;
+      shape_inference::ShapeHandle unused_shape;
+      shape_inference::DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &node_ids_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(node_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(
+          c->Merge(gradients_shape, hessians_shape, &unused_shape));
+      for (int f = 0; f < num_features; ++f) {
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(3 + f), 1, &bucketized_feature_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(node_ids_shape, 0),
+                                    c->Dim(bucketized_feature_shape, 0),
+                                    &unused_dim));
+      }
+      c->set_output(0,
+                    c->MakeShape({num_features, max_splits, num_buckets, 2}));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesPredict")
+    .Input("tree_ensemble_handle: resource")
+    .Input("bucketized_features: num_bucketized_features * int32")
+    .Attr("num_bucketized_features: int >= 1")  // Inferred.
+    .Attr("logits_dimension: int")
+    .Output("logits: float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle feature_shape;
+      int num_bucketized_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_bucketized_features", &num_bucketized_features));
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < num_bucketized_features; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 1), 1, &feature_shape));
+        // Check that the shapes of all bucketized features are the same.
+        TF_RETURN_IF_ERROR(c->Merge(c->input(1), feature_shape, &unused_input));
+      }
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      auto logits_shape =
+          c->MakeShape({c->Dim(feature_shape, 0), logits_dimension});
+      // Logits.
+      c->set_output(0, logits_shape);
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesSerializeEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesTrainingPredict")
+    .Input("tree_ensemble_handle: resource")
+    .Input("cached_tree_ids: int32")
+    .Input("cached_node_ids: int32")
+    .Input("bucketized_features: num_bucketized_features * int32")
+    .Attr("num_bucketized_features: int >= 1")
+    .Attr("logits_dimension: int")
+    .Output("partial_logits: float")
+    .Output("tree_ids: int32")
+    .Output("node_ids: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle feature_shape;
+      int num_bucketized_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_bucketized_features", &num_bucketized_features));
+
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < num_bucketized_features; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 3), 1, &feature_shape));
+        TF_RETURN_IF_ERROR(
+            c->Merge(c->input(i + 3), feature_shape, &unused_input));
+      }
+      // all inputs/outputs except logits should have same shape.
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), feature_shape, &unused_input));
+      TF_RETURN_IF_ERROR(c->Merge(c->input(2), feature_shape, &unused_input));
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      auto logits_shape =
+          c->MakeShape({c->Dim(feature_shape, 0), logits_dimension});
+      // Partial logits.
+      c->set_output(0, logits_shape);
+      // Tree ids.
+      c->set_output(1, c->MakeShape({c->Dim(feature_shape, 0)}));
+      // Node ids.
+      c->set_output(2, c->MakeShape({c->Dim(feature_shape, 0)}));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesUpdateEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("feature_ids: int32")
+    .Input("node_ids: num_features * int32")
+    .Input("gains: num_features * float")
+    .Input("thresholds: num_features * int32")
+    .Input("left_node_contribs: num_features * float")
+    .Input("right_node_contribs: num_features * float")
+    .Input("max_depth: int32")
+    .Input("learning_rate: float")
+    .Attr("pruning_mode: int >=0")
+    .Attr("num_features: int >= 0")  // Inferred.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle shape_handle;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+
+      // Feature_ids, should be one for each feature.
+      shape_inference::ShapeHandle feature_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &feature_ids_shape));
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->input(1), c->Vector(num_features), &shape_handle));
+
+      for (int i = 0; i < num_features; ++i) {
+        // Node ids.
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 2), 1, &shape_handle));
+        auto shape_rank_1 = c->MakeShape({c->Dim(shape_handle, 0)});
+        auto shape_rank_2 = c->MakeShape({c->Dim(shape_handle, 0), 1});
+
+        // Gains.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features + 2), 1, &shape_handle));
+        // TODO(nponomareva): replace this with input("name",vector of shapes).
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features + 2),
+                                    shape_rank_1, &shape_handle));
+        // Thresholds.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 2 + 2), 1, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 2 + 2),
+                                    shape_rank_1, &shape_handle));
+        // Left and right node contribs.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 3 + 2), 2, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 3 + 2),
+                                    shape_rank_2, &shape_handle));
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 4 + 2), 2, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 4 + 2),
+                                    shape_rank_2, &shape_handle));
+      }
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6157a69df5cf535a0957df8b7ed6d4f597acd1d
--- /dev/null
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("CollectiveReduce")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("merge_op: {'Min', 'Max', 'Mul', 'Add'}")
+    .Attr("final_op: {'Id', 'Div'}")
+    .Attr("subdiv_offsets: list(int)")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("CollectiveBcastSend")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+REGISTER_OP("CollectiveBcastRecv")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD
index 6cdb1586bc826f5de4c926c0b5f7bf0f9285cd93..c613ab144f8824586121200b3f89c87b25cc7522 100644
--- a/tensorflow/core/ops/compat/BUILD
+++ b/tensorflow/core/ops/compat/BUILD
@@ -57,18 +57,3 @@ tf_cc_binary(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index fc9e5b02a2253621203a47c5f7d1b7d311c82a97..71ba5f016a76e22e97804c733f11d38df0afc6a6 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -64,6 +64,31 @@ op {
     }
   }
 }
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "AccumulateNV2"
   input_arg {
@@ -607,6 +632,33 @@ op {
     }
   }
 }
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Acosh"
   input_arg {
@@ -656,6 +708,31 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -725,6 +802,41 @@ op {
     }
   }
 }
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "AddManySparseToTensorsMap"
   input_arg {
@@ -1094,6 +1206,42 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AdjustContrast"
   input_arg {
@@ -1386,6 +1534,85 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAdaMax"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyAdadelta"
   input_arg {
@@ -1895,42 +2122,25 @@ op {
   }
 }
 op {
-  name: "ApplyAdagradDA"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -1943,14 +2153,96 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
@@ -6166,6 +6458,33 @@ op {
     }
   }
 }
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Asinh"
   input_arg {
@@ -6215,6 +6534,31 @@ op {
     }
   }
 }
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Assert"
   input_arg {
@@ -6761,6 +7105,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Atan2"
   input_arg {
@@ -6812,6 +7183,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Atanh"
   input_arg {
@@ -6861,6 +7259,31 @@ op {
     }
   }
 }
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -7332,60 +7755,9 @@ op {
   }
 }
 op {
-  name: "AvgPool3DGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-}
-op {
-  name: "AvgPool3DGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
-  }
+  name: "AvgPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -7432,6 +7804,176 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7492,6 +8034,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -8328,6 +8871,50 @@ op {
     }
   }
 }
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "BatchMatrixBandPart"
   input_arg {
@@ -10155,34 +10742,95 @@ op {
   }
 }
 op {
-  name: "BitwiseAnd"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "Bitcast"
   input_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "output"
+    type_attr: "type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT8
         type: DT_INT16
-        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
+}
+op {
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
 }
 op {
   name: "BitwiseAnd"
@@ -10340,6 +10988,336 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "BoostedTreesCalculateBestGainsPerFeature"
+  input_arg {
+    name: "node_id_range"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "tree_complexity"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "thresholds_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesCreateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesGetEnsembleStates"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_finalized_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_attempted_layers"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesMakeStatsSummary"
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "bucketized_features_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "stats_summary"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesSerializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesTrainingPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "cached_tree_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "cached_node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "partial_logits"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesUpdateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "feature_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "max_depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "learning_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
 op {
   name: "BroadcastArgs"
   input_arg {
@@ -10400,6 +11378,38 @@ op {
     }
   }
 }
+op {
+  name: "BroadcastTo"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Bucketize"
   input_arg {
@@ -10745,6 +11755,29 @@ op {
     }
   }
 }
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "CheckNumerics"
   input_arg {
@@ -10798,6 +11831,33 @@ op {
     type: "string"
   }
 }
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
 op {
   name: "Cholesky"
   input_arg {
@@ -10868,160 +11928,353 @@ op {
   }
 }
 op {
-  name: "CompareAndBitpack"
+  name: "ClipByValue"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "threshold"
+    name: "clip_value_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_max"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_UINT8
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BOOL
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Complex"
-  input_arg {
-    name: "real"
-    type_attr: "T"
-  }
+  name: "CloseSummaryWriter"
   input_arg {
-    name: "imag"
-    type_attr: "T"
+    name: "writer"
+    type: DT_RESOURCE
   }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastRecv"
   output_arg {
-    name: "out"
-    type_attr: "Tout"
+    name: "data"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tout"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
   }
+  is_stateful: true
 }
 op {
-  name: "ComplexAbs"
+  name: "CollectiveBcastSend"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type_attr: "Tout"
+    name: "data"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-  attr {
-    name: "Tout"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "ComputeAccidentalHits"
-  input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "ids"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "weights"
-    type: DT_FLOAT
-  }
   attr {
-    name: "num_true"
+    name: "group_size"
     type: "int"
   }
   attr {
-    name: "seed"
+    name: "group_key"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "seed2"
+    name: "instance_key"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
 }
 op {
-  name: "Concat"
+  name: "CollectiveReduce"
   input_arg {
-    name: "concat_dim"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
+op {
+  name: "CompareAndBitpack"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "threshold"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Complex"
+  input_arg {
+    name: "real"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "imag"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ComplexAbs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ComputeAccidentalHits"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "ids"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Concat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
   }
   input_arg {
     name: "values"
@@ -11461,66 +12714,15 @@ op {
   }
 }
 op {
-  name: "ControlTrigger"
-}
-op {
-  name: "Conv2D"
+  name: "ConsumeMutexLock"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "mutex_lock"
+    type: DT_VARIANT
   }
+  is_stateful: true
+}
+op {
+  name: "ControlTrigger"
 }
 op {
   name: "Conv2D"
@@ -11542,7 +12744,6 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -11581,31 +12782,15 @@ op {
       }
     }
   }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
 }
 op {
-  name: "Conv2DBackpropFilter"
+  name: "Conv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -11618,6 +12803,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -11656,19 +12842,27 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "Conv2DBackpropFilter"
+  name: "Conv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -11683,6 +12877,7 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -11734,14 +12929,14 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
@@ -11797,14 +12992,14 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
@@ -11873,13 +13068,17 @@ op {
   }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -11891,6 +13090,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11899,8 +13100,13 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -11912,17 +13118,46 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
     type_attr: "T"
   }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -11932,16 +13167,21 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -11957,26 +13197,30 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
     type_attr: "T"
   }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -11989,15 +13233,19 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -12013,12 +13261,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -12031,16 +13279,15 @@ op {
         i: 1
         i: 1
         i: 1
-        i: 1
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
@@ -12059,6 +13306,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12067,8 +13316,13 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -12080,71 +13334,40 @@ op {
       }
     }
   }
-  deprecation {
-    version: 10
-  }
-}
-op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "strides"
+    name: "dilations"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
+    default_value {
       list {
-        s: "SAME"
-        s: "VALID"
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
-  deprecation {
-    version: 10
-  }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -12179,17 +13402,13 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -12237,17 +13456,13 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -12310,7 +13525,7 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -12358,7 +13573,7 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -12406,6 +13621,403 @@ op {
     version: 10
   }
 }
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
 op {
   name: "Conv3DBackpropInputV2"
   input_arg {
@@ -12739,6 +14351,80 @@ op {
     }
   }
 }
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Cosh"
   input_arg {
@@ -12754,6 +14440,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -12762,15 +14449,802 @@ op {
       }
     }
   }
-}
-op {
-  name: "Cosh"
-  input_arg {
-    name: "x"
+}
+op {
+  name: "CountUpTo"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "limit"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "CreateSummaryDbWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "db_uri"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "experiment_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "run_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "user_name"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "CreateSummaryFileWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "logdir"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "max_queue"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flush_millis"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filename_suffix"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradBoxes"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradBoxes"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradImage"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "CudnnRNN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackprop"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackpropV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "params_backprop"
     type_attr: "T"
   }
   attr {
@@ -12779,95 +15253,109 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "CountUpTo"
-  input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
   }
   attr {
-    name: "limit"
-    type: "int"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "CriticalSectionOp"
-  output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   is_stateful: true
 }
 op {
-  name: "CropAndResize"
+  name: "CudnnRNNCanonicalToParams"
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
+    name: "input_size"
     type: DT_INT32
   }
   input_arg {
-    name: "crop_size"
-    type: DT_INT32
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  input_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
   }
   output_arg {
-    name: "crops"
-    type: DT_FLOAT
+    name: "params"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -12875,116 +15363,98 @@ op {
     }
   }
   attr {
-    name: "method"
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "extrapolation_value"
-    type: "float"
+    name: "input_mode"
+    type: "string"
     default_value {
-      f: 0
+      s: "linear_input"
     }
-  }
-}
-op {
-  name: "CropAndResize"
-  input_arg {
-    name: "image"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "crop_size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "crops"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "method"
+    name: "direction"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "unidirectional"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
   attr {
-    name: "extrapolation_value"
+    name: "dropout"
     type: "float"
     default_value {
       f: 0
     }
   }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
 }
 op {
-  name: "CropAndResizeGradBoxes"
-  input_arg {
-    name: "grads"
-    type: DT_FLOAT
-  }
+  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
+    name: "input_size"
     type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "params_size"
+    type_attr: "S"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -12992,251 +15462,308 @@ op {
     }
   }
   attr {
-    name: "method"
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
-}
-op {
-  name: "CropAndResizeGradBoxes"
-  input_arg {
-    name: "grads"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "image"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "method"
+    name: "direction"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "unidirectional"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
 }
 op {
-  name: "CropAndResizeGradImage"
+  name: "CudnnRNNParamsToCanonical"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
+    name: "input_size"
     type: DT_INT32
   }
   input_arg {
-    name: "image_size"
-    type: DT_INT32
+    name: "params"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "weights"
     type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "method"
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
-}
-op {
-  name: "Cross"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
 }
 op {
-  name: "Cross"
+  name: "CudnnRNNV2"
   input_arg {
-    name: "a"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "b"
+    name: "input_h"
     type_attr: "T"
   }
-  output_arg {
-    name: "product"
+  input_arg {
+    name: "input_c"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-}
-op {
-  name: "Cross"
   input_arg {
-    name: "a"
+    name: "params"
     type_attr: "T"
   }
-  input_arg {
-    name: "b"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
     type_attr: "T"
   }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Cross"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
   }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
 }
 op {
   name: "Cumprod"
@@ -13865,6 +16392,21 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -14601,6 +17143,65 @@ op {
     }
   }
 }
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
 op {
   name: "DecodeCompressed"
   input_arg {
@@ -14725,6 +17326,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -14823,6 +17473,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -15375,61 +18041,187 @@ op {
       }
     }
   }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-}
-op {
-  name: "DepthwiseConv2dNativeBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
 }
 op {
   name: "DepthwiseConv2dNativeBackpropInput"
@@ -15454,6 +18246,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -15486,6 +18279,18 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "DepthwiseConv2dNativeBackpropInput"
@@ -15510,6 +18315,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -15873,6 +18679,33 @@ op {
     }
   }
 }
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DiagPart"
   input_arg {
@@ -15924,6 +18757,33 @@ op {
     }
   }
 }
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Digamma"
   input_arg {
@@ -15969,6 +18829,29 @@ op {
     }
   }
 }
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Dilation2D"
   input_arg {
@@ -16742,6 +19625,41 @@ op {
     }
   }
 }
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DrawBoundingBoxes"
   input_arg {
@@ -17037,6 +19955,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -17198,6 +20139,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -17343,6 +20320,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Erf"
   input_arg {
@@ -17389,7 +20406,7 @@ op {
   }
 }
 op {
-  name: "Erfc"
+  name: "Erf"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -17403,6 +20420,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -17426,7 +20444,6 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17434,76 +20451,50 @@ op {
   }
 }
 op {
-  name: "ExecuteInCriticalSection"
-  input_arg {
-    name: "critical_section"
-    type: DT_RESOURCE
-  }
+  name: "Erfc"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "ExecuteInCriticalSection"
-  input_arg {
-    name: "critical_section"
-    type: DT_RESOURCE
-  }
+  name: "Erfc"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
   name: "Exit"
@@ -17569,6 +20560,31 @@ op {
     }
   }
 }
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExpandDims"
   input_arg {
@@ -17650,6 +20666,31 @@ op {
     }
   }
 }
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExtractGlimpse"
   input_arg {
@@ -19430,6 +22471,29 @@ op {
     }
   }
 }
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FloorDiv"
   input_arg {
@@ -19499,6 +22563,68 @@ op {
     }
   }
 }
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FloorMod"
   input_arg {
@@ -19520,6 +22646,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -19548,12 +22675,53 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
 }
+op {
+  name: "FlushSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -20556,6 +23724,65 @@ op {
     minimum: -1
   }
 }
+op {
+  name: "GeneratorDataset"
+  input_arg {
+    name: "init_func_other_args"
+    type_list_attr: "Tinit_func_args"
+  }
+  input_arg {
+    name: "next_func_other_args"
+    type_list_attr: "Tnext_func_args"
+  }
+  input_arg {
+    name: "finalize_func_other_args"
+    type_list_attr: "Tfinalize_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "next_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tinit_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tnext_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "GetSessionHandle"
   input_arg {
@@ -21578,6 +24805,45 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
@@ -21797,6 +25063,18 @@ op {
     type: "string"
   }
 }
+op {
+  name: "ImportEvent"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "event"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "InTopK"
   input_arg {
@@ -21989,6 +25267,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "InterleaveDataset"
   input_arg {
@@ -22221,6 +25568,33 @@ op {
     }
   }
 }
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "InvGrad"
   input_arg {
@@ -22311,9 +25685,99 @@ op {
       }
     }
   }
-  deprecation {
-    version: 17
-  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
 }
 op {
   name: "InvGrad"
@@ -22334,8 +25798,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -22345,17 +25809,13 @@ op {
   }
 }
 op {
-  name: "InvGrad"
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
+  name: "Invert"
   input_arg {
-    name: "dy"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -22363,31 +25823,24 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
-  deprecation {
-    version: 17
-  }
 }
 op {
-  name: "InvGrad"
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
+  name: "Invert"
   input_arg {
-    name: "dy"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -22395,18 +25848,20 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Invert"
+  name: "InvertPermutation"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -22418,65 +25873,70 @@ op {
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
       }
     }
   }
 }
 op {
-  name: "Invert"
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "IsFinite"
   input_arg {
     name: "x"
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    type_attr: "T"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "InvertPermutation"
+  name: "IsFinite"
   input_arg {
     name: "x"
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    type_attr: "T"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -22496,6 +25956,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -22504,7 +25965,7 @@ op {
   }
 }
 op {
-  name: "IsFinite"
+  name: "IsInf"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -22519,7 +25980,6 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -22542,6 +26002,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -22563,8 +26024,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -22616,6 +26077,29 @@ op {
     }
   }
 }
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsVariableInitialized"
   input_arg {
@@ -22751,18 +26235,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
 op {
   name: "IteratorToStringHandle"
   input_arg {
@@ -23578,6 +27050,29 @@ op {
     }
   }
 }
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "LinSpace"
   input_arg {
@@ -23794,6 +27289,31 @@ op {
     }
   }
 }
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Log1p"
   input_arg {
@@ -23843,6 +27363,31 @@ op {
     }
   }
 }
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "LogMatrixDeterminant"
   input_arg {
@@ -24334,6 +27879,10 @@ op {
     name: "num_parallel_batches"
     type: DT_INT64
   }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -24843,8 +28392,52 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -25958,6 +29551,63 @@ op {
     }
   }
 }
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "MaxPool3DGrad"
   input_arg {
@@ -26229,6 +29879,88 @@ op {
     }
   }
 }
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "MaxPool3DGradGrad"
   input_arg {
@@ -28732,6 +32464,36 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Mean"
   input_arg {
@@ -29387,6 +33149,36 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "MirrorPad"
   input_arg {
@@ -29499,7 +33291,65 @@ op {
   }
 }
 op {
-  name: "Mod"
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mul"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -29517,14 +33367,21 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
 }
 op {
   name: "Mul"
@@ -29546,6 +33403,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -29580,8 +33438,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -30112,6 +33970,40 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "MutexLock"
+  input_arg {
+    name: "mutex"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "MutexV2"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -30165,6 +34057,33 @@ op {
     }
   }
 }
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "NegTrain"
   input_arg {
@@ -30348,6 +34267,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "NthElement"
   input_arg {
@@ -30615,6 +34574,38 @@ op {
     }
   }
 }
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
 op {
   name: "OrderedMapClear"
   attr {
@@ -31896,6 +35887,31 @@ op {
     type: "type"
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
@@ -32125,6 +36141,37 @@ op {
     }
   }
 }
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "PrefetchDataset"
   input_arg {
@@ -32857,7 +36904,66 @@ op {
     }
   }
   deprecation {
-    version: 21
+    version: 21
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
   }
 }
 op {
@@ -32910,6 +37016,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -32970,6 +37077,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33080,6 +37188,58 @@ op {
     }
   }
 }
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "QuantizeAndDequantizeV3"
   input_arg {
@@ -33175,6 +37335,55 @@ op {
     }
   }
 }
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "QuantizeDownAndShrinkRange"
   input_arg {
@@ -37221,6 +41430,41 @@ op {
     }
   }
 }
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Reciprocal"
   input_arg {
@@ -37274,6 +41518,33 @@ op {
     }
   }
 }
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ReciprocalGrad"
   input_arg {
@@ -37359,6 +41630,35 @@ op {
     }
   }
 }
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "RecordInput"
   output_arg {
@@ -37659,6 +41959,32 @@ op {
   }
   allows_uninitialized_input: true
 }
+op {
+  name: "RegexReplace"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "rewrite"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "replace_global"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "Relu"
   input_arg {
@@ -37987,21 +42313,56 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Relu6Grad"
+  name: "ReluGrad"
   input_arg {
     name: "gradients"
     type_attr: "T"
@@ -38022,15 +42383,12 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -38063,6 +42421,8 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -38097,6 +42457,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -38123,52 +42484,48 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "ReluGrad"
+  name: "RemoteCall"
   input_arg {
-    name: "gradients"
-    type_attr: "T"
+    name: "target"
+    type: DT_STRING
   }
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "args"
+    type_list_attr: "Tin"
   }
   output_arg {
-    name: "backprops"
-    type_attr: "T"
+    name: "output"
+    type_list_attr: "Tout"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
   }
 }
 op {
@@ -38201,6 +42558,7 @@ op {
     name: "f"
     type: "func"
   }
+  is_stateful: true
 }
 op {
   name: "RemoteFusedGraphExecute"
@@ -38991,6 +43349,202 @@ op {
     }
   }
 }
+op {
+  name: "ResourceApplyAdaMax"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAdadelta"
   input_arg {
@@ -39040,6 +43594,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -39089,17 +43646,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -39116,7 +43674,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdadelta"
+  name: "ResourceApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -39125,22 +43683,10 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -39164,73 +43710,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -39280,6 +43759,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -39331,6 +43812,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -39368,21 +43850,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -39445,6 +43927,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -42578,18 +47067,170 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -42606,7 +47247,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterDiv"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -42626,17 +47267,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -42656,7 +47298,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterMax"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -42676,21 +47318,72 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMin"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -42707,7 +47400,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterMul"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -42794,6 +47487,57 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterSub"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterUpdate"
   input_arg {
@@ -43441,21 +48185,153 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -43476,70 +48352,11 @@ op {
       b: false
     }
   }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
+    name: "update_slots"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   is_stateful: true
@@ -46644,6 +51461,56 @@ op {
     }
   }
 }
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "RightShift"
   input_arg {
@@ -46719,6 +51586,29 @@ op {
     }
   }
 }
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Roll"
   input_arg {
@@ -46815,6 +51705,74 @@ op {
     }
   }
 }
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -46864,6 +51822,31 @@ op {
     }
   }
 }
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "RsqrtGrad"
   input_arg {
@@ -46949,6 +51932,35 @@ op {
     }
   }
 }
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SampleDistortedBoundingBox"
   input_arg {
@@ -47667,148 +52679,326 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMax"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ScatterDiv"
-  input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ScatterDiv"
-  input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
   attr {
     name: "Tindices"
     type: "type"
@@ -47828,7 +53018,7 @@ op {
   }
 }
 op {
-  name: "ScatterDiv"
+  name: "ScatterMin"
   input_arg {
     name: "ref"
     type_attr: "T"
@@ -47852,86 +53042,12 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
         type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ScatterDiv"
-  input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -50742,6 +55858,34 @@ op {
     }
   }
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Shape"
   input_arg {
@@ -51041,6 +56185,31 @@ op {
     }
   }
 }
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SigmoidGrad"
   input_arg {
@@ -51126,6 +56295,35 @@ op {
     }
   }
 }
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sign"
   input_arg {
@@ -51179,6 +56377,33 @@ op {
     }
   }
 }
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sin"
   input_arg {
@@ -51228,6 +56453,31 @@ op {
     }
   }
 }
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sinh"
   input_arg {
@@ -51277,6 +56527,31 @@ op {
     }
   }
 }
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Size"
   input_arg {
@@ -51457,6 +56732,37 @@ op {
     }
   }
 }
+op {
+  name: "SlideDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Snapshot"
   input_arg {
@@ -53693,21 +58999,165 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -53728,76 +59178,11 @@ op {
       b: false
     }
   }
-}
-op {
-  name: "SparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "out"
-    type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
   attr {
-    name: "use_locking"
+    name: "update_slots"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
 }
@@ -60404,6 +65789,31 @@ op {
     }
   }
 }
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SqrtGrad"
   input_arg {
@@ -60489,6 +65899,35 @@ op {
     }
   }
 }
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Square"
   input_arg {
@@ -60542,6 +65981,33 @@ op {
     }
   }
 }
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SquaredDifference"
   input_arg {
@@ -60605,6 +66071,38 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Squeeze"
   input_arg {
@@ -60974,6 +66472,71 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomNormal"
   input_arg {
@@ -61589,34 +67152,45 @@ op {
   }
 }
 op {
-  name: "StringSplit"
+  name: "StringSplit"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "delimiter"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "skip_empty"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "StringStrip"
   input_arg {
     name: "input"
     type: DT_STRING
   }
-  input_arg {
-    name: "delimiter"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT64
-  }
   output_arg {
-    name: "values"
+    name: "output"
     type: DT_STRING
   }
-  output_arg {
-    name: "shape"
-    type: DT_INT64
-  }
-  attr {
-    name: "skip_empty"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
   name: "StringToHashBucket"
@@ -61822,6 +67396,41 @@ op {
     }
   }
 }
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Substr"
   input_arg {
@@ -62087,6 +67696,28 @@ op {
     }
   }
 }
+op {
+  name: "SummaryWriter"
+  output_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Svd"
   input_arg {
@@ -62440,6 +68071,33 @@ op {
     }
   }
 }
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Tanh"
   input_arg {
@@ -62489,6 +68147,31 @@ op {
     }
   }
 }
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TanhGrad"
   input_arg {
@@ -62574,6 +68257,35 @@ op {
     }
   }
 }
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TemporaryVariable"
   output_arg {
@@ -63813,6 +69525,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
@@ -63931,6 +69662,25 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListReserve"
   input_arg {
@@ -64366,6 +70116,14 @@ op {
     version: 3
   }
 }
+op {
+  name: "Timestamp"
+  output_arg {
+    name: "ts"
+    type: DT_DOUBLE
+  }
+  is_stateful: true
+}
 op {
   name: "TopK"
   input_arg {
@@ -64809,7 +70567,77 @@ op {
   }
 }
 op {
-  name: "TruncateDiv"
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateMod"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -64827,18 +70655,10 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -64864,6 +70684,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -64892,6 +70713,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -64993,6 +70815,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
@@ -65034,6 +70905,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -65218,29 +71112,6 @@ op {
     }
   }
 }
-op {
-  name: "UniqueDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UniqueV2"
   input_arg {
@@ -65362,6 +71233,59 @@ op {
     }
   }
 }
+op {
+  name: "UniqueWithCountsV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  output_arg {
+    name: "count"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Unpack"
   input_arg {
@@ -65430,7 +71354,163 @@ op {
   }
   input_arg {
     name: "num_segments"
-    type: DT_INT32
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
@@ -65444,12 +71524,15 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -65463,47 +71546,12 @@ op {
       }
     }
   }
-}
-op {
-  name: "UnsortedSegmentMax"
-  input_arg {
-    name: "data"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "num_segments"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tnumsegments"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    default_value {
+      type: DT_INT32
     }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
     allowed_values {
       list {
         type: DT_INT32
@@ -65513,7 +71561,7 @@ op {
   }
 }
 op {
-  name: "UnsortedSegmentMax"
+  name: "UnsortedSegmentMin"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -65538,15 +71586,15 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -65575,7 +71623,7 @@ op {
   }
 }
 op {
-  name: "UnsortedSegmentMax"
+  name: "UnsortedSegmentProd"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -66205,6 +72253,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
@@ -66250,6 +72323,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "WriteAudioSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "WriteFile"
   input_arg {
@@ -66261,6 +72367,180 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "WriteGraphSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteHistogramSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteImageSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bad_color"
+    type: DT_UINT8
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteScalarSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "summary_metadata"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "ZerosLike"
   input_arg {
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/core/ops/cudnn_rnn_ops.cc
similarity index 53%
rename from tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
rename to tensorflow/core/ops/cudnn_rnn_ops.cc
index 1a79bf066c3a27e040099729fb079ee963f59270..f78f7a897a12349f7bedcb3c8abbaffe4fe250ad 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops.cc
@@ -21,31 +21,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr auto kCudnnRNNCommonInputs = R"doc(
-num_layers: Specifies the number of layers in the RNN model.
-num_units: Specifies the size of the hidden state.
-input_size: Specifies the size of the input state.
-)doc";
-
-constexpr auto kCudnnRNNCommonAttrs = R"doc(
-rnn_mode: Indicates the type of the RNN model.
-input_mode: Indicate whether there is a linear projection between the input and
-    The actual computation before the first layer. 'skip_input' is only allowed
-    when input_size == num_units; 'auto_select' implies 'skip_input' when
-    input_size == num_units; otherwise, it implies 'linear_input'.
-direction: Indicates whether a bidirectional model will be used.
-    dir = (direction == bidirectional) ? 2 : 1
-dropout: dropout probability. When set to 0., dropout is disabled.
-seed: the 1st part of a seed to initialize dropout.
-seed2: the 2nd part of a seed to initialize dropout.
-)doc";
-
-constexpr auto kCudnnRNNParamsBuffer = R"doc(
-Note that the params buffer may not be compatible across different GPUs. So any
-save and restoration should be converted to and from the canonical weights and
-biases.
-)doc";
-
 constexpr auto kRNNModeAttrs =
     "rnn_mode: {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'} = 'lstm'";
 
@@ -56,21 +31,13 @@ constexpr auto kRNNInputModeAttrs =
 constexpr auto kRNNDirectionAttrs =
     "direction: {'unidirectional', 'bidirectional'} = 'unidirectional'";
 
-constexpr auto kCudnnRNNParamsCanonical = R"doc(
-weights: the canonical form of weights that can be used for saving
-    and restoration. They are more likely to be compatible across different
-    generations.
-biases: the canonical form of biases that can be used for saving
-    and restoration. They are more likely to be compatible across different
-    generations.
-)doc";
-
 }  // namespace
 
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+
 REGISTER_OP("CudnnRNNParamsSize")
     .Input("num_layers: int32")
     .Input("num_units: int32")
@@ -87,38 +54,8 @@ REGISTER_OP("CudnnRNNParamsSize")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(1));
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Return the params size that can be used by the Cudnn RNN model. Subsequent
-weight allocation and initialization should use this size.
-)doc",
-                         kCudnnRNNCommonInputs, kCudnnRNNCommonAttrs,
-                         R"doc(
-params_size: The size of the params buffer that should be allocated and
-    initialized for this RNN model. Note that this params buffer may not be
-    compatible across GPUs. Please use CudnnRNNParamsWeights and
-    CudnnRNNParamsBiases to save and restore them in a way that is compatible
-    across different runs.
-)doc",
-                         kCudnnRNNParamsBuffer));
+    });
 
-static string CudnnRNNForwardTensors() {
-  return R"doc(
-input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
-input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
-    num_units].
-input_c: For LSTM, a 3-D tensor with the shape of
-    [num_layer * dir, batch, num_units]. For other models, it is ignored.
-params: a 1-D tensor that contains the weights and biases in an opaque layout.
-    The size must be created through CudnnRNNParamsSize, and initialized
-    separately. Note that they might not be compatible across different
-    generations. So it is a good idea to save and restore
-output: a 3-D tensor with the shape of [seq_length, batch_size,
-    dir * num_units].
-output_h: the same shape has input_h.
-output_c: the same shape as input_c for LSTM. An empty tensor for other models.
-)doc";
-}
 
 REGISTER_OP("CudnnRNN")
     .Input("input: T")
@@ -160,18 +97,51 @@ REGISTER_OP("CudnnRNN")
       c->set_output(2, output_c_shape);
       c->set_output(3, c->UnknownShape());
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Computes the RNN from the input and initial states, with respect to the params
-buffer.
-)doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
-                         R"doc(
-is_training: Indicates whether this operation is used for inferenece or
-    training.
-reserve_space: an opaque tensor that can be used in backprop calculation. It
-    is only produced if is_training is false.
-)doc"));
+    });
+
+REGISTER_OP("CudnnRNNV2")
+    .Input("input: T")
+    .Input("input_h: T")
+    .Input("input_c: T")
+    .Input("params: T")
+    .SetIsStateful()
+    .Output("output: T")
+    .Output("output_h: T")
+    .Output("output_c: T")
+    .Output("reserve_space: T")
+    .Output("host_reserved: int8")
+    .Attr("T: {float16, float32, float64}")
+    .Attr(kRNNModeAttrs)
+    .Attr(kRNNInputModeAttrs)
+    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .Attr("is_training: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      auto input_shape = c->input(0);
+      auto input_h_shape = c->input(1);
+      auto seq_length = c->Dim(input_shape, 0);
+      auto batch_size = c->Dim(input_shape, 1);
+      auto num_units = c->Dim(input_h_shape, 2);
+      string direction;
+      TF_RETURN_IF_ERROR(c->GetAttr("direction", &direction));
+      string rnn_mode;
+      TF_RETURN_IF_ERROR(c->GetAttr("rnn_mode", &rnn_mode));
+      int dir_count = (direction == "bidirectional") ? 2 : 1;
+      DimensionHandle output_size;
+      TF_RETURN_IF_ERROR(c->Multiply(num_units, dir_count, &output_size));
+      auto output_shape = c->MakeShape({seq_length, batch_size, output_size});
+      auto output_h_shape = input_h_shape;
+      auto output_c_shape TF_ATTRIBUTE_UNUSED =
+          (rnn_mode == "lstm") ? output_h_shape : c->MakeShape({});
+      c->set_output(0, output_shape);
+      c->set_output(1, output_h_shape);
+      c->set_output(2, output_c_shape);
+      c->set_output(3, c->UnknownShape());
+      c->set_output(4, c->UnknownShape());
+      return Status::OK();
+    });
 
 REGISTER_OP("CudnnRNNBackprop")
     .Input("input: T")
@@ -207,27 +177,44 @@ REGISTER_OP("CudnnRNNBackprop")
       c->set_output(2, input_c_shape);
       c->set_output(3, params_shape);
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Compute the backprop of both data and weights in a RNN.
-)doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
-                         R"doc(
-output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-    pass.
-output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-    pass.
-reserve_space: The same reserve_space produced in for forward operation.
-input_backprop: The backprop to input in the forward pass. Has the same shape
-    as input.
-input_h_backprop: The backprop to input_h in the forward pass. Has the same
-    shape as input_h.
-input_c_backprop: The backprop to input_c in the forward pass. Has the same
-    shape as input_c.
-params_backprop: The backprop to the params buffer in the forward pass. Has the
-    same shape as params.
-)doc"));
+    });
+
+REGISTER_OP("CudnnRNNBackpropV2")
+    .Input("input: T")
+    .Input("input_h: T")
+    .Input("input_c: T")
+    .Input("params: T")
+    .Input("output: T")
+    .Input("output_h: T")
+    .Input("output_c: T")
+    .Input("output_backprop: T")
+    .Input("output_h_backprop: T")
+    .Input("output_c_backprop: T")
+    .Input("reserve_space: T")
+    .Input("host_reserved: int8")
+    .SetIsStateful()
+    .Output("input_backprop: T")
+    .Output("input_h_backprop: T")
+    .Output("input_c_backprop: T")
+    .Output("params_backprop: T")
+    .Attr("T: {float16, float32, float64}")
+    .Attr(kRNNModeAttrs)
+    .Attr(kRNNInputModeAttrs)
+    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .SetShapeFn([](InferenceContext* c) {
+      auto input_shape = c->input(0);
+      auto input_h_shape = c->input(1);
+      auto input_c_shape = c->input(2);
+      auto params_shape = c->input(3);
+      c->set_output(0, input_shape);
+      c->set_output(1, input_h_shape);
+      c->set_output(2, input_c_shape);
+      c->set_output(3, params_shape);
+      return Status::OK();
+    });
 
 REGISTER_OP("CudnnRNNParamsToCanonical")
     .Input("num_layers: int32")
@@ -259,17 +246,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
         c->set_output(num_params + i, c->Vector(InferenceContext::kUnknownDim));
       }
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Retrieves a set of weights from the opaque params buffer that can be saved and
-restored in a way compatible with future runs.
-)doc",
-                         kCudnnRNNCommonInputs, kCudnnRNNParamsBuffer, R"doc(
-num_params: number of parameter sets for all layers.
-    Each layer may contain multiple parameter sets, with each set consisting of
-    a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNParamsCanonical, kCudnnRNNCommonAttrs));
+    });
+
 
 REGISTER_OP("CudnnRNNCanonicalToParams")
     .Input("num_layers: int32")
@@ -289,17 +267,6 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Writes a set of weights into the opaque params buffer so they can be used in
-upcoming training or inferences.
-)doc",
-                         kCudnnRNNCommonInputs, kCudnnRNNParamsCanonical,
-                         kCudnnRNNParamsBuffer, R"doc(
-num_params: number of parameter sets for all layers.
-    Each layer may contain multiple parameter sets, with each set consisting of
-    a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNCommonAttrs));
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops_test.cc b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
similarity index 65%
rename from tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops_test.cc
rename to tensorflow/core/ops/cudnn_rnn_ops_test.cc
index 95d45c0bb8078308fd3f55367003157bc5cc3892..2dd867561b80f4d2c0b55def3c9667f376782484 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops_test.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
@@ -30,6 +30,24 @@ TEST(CudnnRNNOpsTest, ParamsSize_ShapeFn) {
 }
 
 TEST(CudnnRNNOpsTest, ForwardLstm_ShapeFn) {
+  int seq_length = 2;
+  int batch_size = 3;
+  int num_units = 4;
+  int num_layers = 5;
+  int dir_count = 1;
+  std::vector<int> input_shape = {seq_length, batch_size, num_units};
+  std::vector<int> input_h_shape = {num_layers * dir_count, batch_size,
+                                    num_units};
+  std::vector<int> output_shape = {seq_length, batch_size,
+                                   num_units * dir_count};
+  auto shape_to_str = [](const std::vector<int>& v) {
+    return strings::StrCat("[", str_util::Join(v, ","), "]");
+  };
+  string input_shapes_desc = strings::StrCat(
+      shape_to_str(input_shape), ";", shape_to_str(input_h_shape), ";",
+      shape_to_str(input_h_shape), ";", "[?]");
+  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?";
+
   ShapeInferenceTestOp op("CudnnRNN");
   TF_ASSERT_OK(NodeDefBuilder("test", "CudnnRNN")
                    .Input({"input", 0, DT_FLOAT})
@@ -40,6 +58,10 @@ TEST(CudnnRNNOpsTest, ForwardLstm_ShapeFn) {
                    .Attr("input_mode", "auto_select")
                    .Attr("direction", "unidirectional")
                    .Finalize(&op.node_def));
+  INFER_OK(op, input_shapes_desc, output_shapes_desc);
+}
+
+TEST(CudnnRNNOpsTest, ForwardV2Lstm_ShapeFn) {
   int seq_length = 2;
   int batch_size = 3;
   int num_units = 4;
@@ -56,7 +78,18 @@ TEST(CudnnRNNOpsTest, ForwardLstm_ShapeFn) {
   string input_shapes_desc = strings::StrCat(
       shape_to_str(input_shape), ";", shape_to_str(input_h_shape), ";",
       shape_to_str(input_h_shape), ";", "[?]");
-  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?";
+  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?;?";
+
+  ShapeInferenceTestOp op("CudnnRNNV2");
+  TF_ASSERT_OK(NodeDefBuilder("test", "CudnnRNNV2")
+                   .Input({"input", 0, DT_FLOAT})
+                   .Input({"input_h", 0, DT_FLOAT})
+                   .Input({"input_c", 0, DT_FLOAT})
+                   .Input({"params", 0, DT_FLOAT})
+                   .Attr("rnn_mode", "lstm")
+                   .Attr("input_mode", "auto_select")
+                   .Attr("direction", "unidirectional")
+                   .Finalize(&op.node_def));
   INFER_OK(op, input_shapes_desc, output_shapes_desc);
 }
 
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 4f946fb3ca7608816180351b7753d01f13d469f2..3112f35da43d16d7a4cd4c1c8e017cab3366e070 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -668,13 +668,31 @@ REGISTER_OP("TensorArrayGatherV3")
     .Attr("dtype: type")
     .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle indices;
       ShapeHandle unused;
       DimensionHandle unused_dim;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      return shape_inference::UnknownShape(c);
+      auto shapes = c->input_handle_shapes_and_types(0);
+      if (shapes != nullptr && !shapes->empty()) {
+        ShapeHandle tensor_shape = shapes->at(0).shape;
+        ShapeHandle output_shape;
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(indices, tensor_shape, &output_shape));
+        c->set_output(0, output_shape);
+        return Status::OK();
+      } else {
+        PartialTensorShape p;
+        TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &p));
+        ShapeHandle s;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
+        ShapeHandle output_shape;
+        TF_RETURN_IF_ERROR(c->Concatenate(indices, s, &output_shape));
+        c->set_output(0, output_shape);
+        return Status::OK();
+      }
     });
 
 REGISTER_OP("TensorArrayScatterV3")
@@ -685,12 +703,25 @@ REGISTER_OP("TensorArrayScatterV3")
     .Output("flow_out: float")
     .Attr("T: type")
     .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle indices;
       ShapeHandle unused;
       DimensionHandle unused_dim;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      ShapeHandle value_shape;
+      // Assert that the length of the indices tensor is equal to the first
+      // dimension of the value tensor.
+      TF_RETURN_IF_ERROR(
+          c->MergePrefix(c->input(2), indices, &value_shape, &indices));
+      auto shapes = c->input_handle_shapes_and_types(0);
+      if (shapes != nullptr && !shapes->empty()) {
+        ShapeHandle tensor_shape = shapes->at(0).shape;
+        ShapeHandle fed_shape;
+        TF_RETURN_IF_ERROR(c->Subshape(value_shape, 1, &fed_shape));
+        TF_RETURN_IF_ERROR(c->Merge(tensor_shape, fed_shape, &fed_shape));
+      }
       return shape_inference::ScalarShape(c);
     });
 
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 9e98f56c745a2b0b16531e2785e43ba8464d42b8..4ba3f15ef03eca4e37f77513dd3555fe57109017 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -66,6 +66,30 @@ REGISTER_OP("SparseTensorSliceDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("GeneratorDataset")
+    .Input("init_func_other_args: Tinit_func_args")
+    .Input("next_func_other_args: Tnext_func_args")
+    .Input("finalize_func_other_args: Tfinalize_func_args")
+    .Output("handle: variant")
+    .Attr("init_func: func")
+    .Attr("next_func: func")
+    .Attr("finalize_func: func")
+    .Attr("Tinit_func_args: list(type) >= 0")
+    .Attr("Tnext_func_args: list(type) >= 0")
+    .Attr("Tfinalize_func_args: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("UnbatchDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
     .Output("handle: variant")
@@ -88,8 +112,11 @@ REGISTER_OP("RepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate the
-                                                // shape of `count`.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TakeDataset")
     .Input("input_dataset: variant")
@@ -97,7 +124,11 @@ REGISTER_OP("TakeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SkipDataset")
     .Input("input_dataset: variant")
@@ -105,7 +136,11 @@ REGISTER_OP("SkipDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
@@ -113,7 +148,11 @@ REGISTER_OP("BytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("LatencyStatsDataset")
     .Input("input_dataset: variant")
@@ -121,6 +160,18 @@ REGISTER_OP("LatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("SetStatsAggregatorDataset")
+    .Input("input_dataset: variant")
+    .Input("stats_aggregator: resource")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MapDataset")
@@ -149,6 +200,7 @@ REGISTER_OP("MapAndBatchDataset")
     .Input("other_arguments: Targuments")
     .Input("batch_size: int64")
     .Input("num_parallel_batches: int64")
+    .Input("drop_remainder: bool")
     .Output("handle: variant")
     .Attr("f: func")
     .Attr("Targuments: list(type) >= 0")
@@ -162,7 +214,12 @@ REGISTER_OP("PrefetchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ScanDataset")
     .Input("input_dataset: variant")
@@ -246,7 +303,28 @@ REGISTER_OP("BatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+// TODO(mrry): move SlideDataset to contrib in the future.
+REGISTER_OP("SlideDataset")
+    .Input("input_dataset: variant")
+    .Input("window_size: int64")
+    .Input("stride: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // window_size and stride should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("PaddedBatchDataset")
     .Input("input_dataset: variant")
@@ -276,7 +354,14 @@ REGISTER_OP("DenseToSparseBatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // row_shape should be a 1-D vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
@@ -287,7 +372,14 @@ REGISTER_OP("RangeDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // start, stop, and step should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RandomDataset")
     .Input("seed: int64")
@@ -297,7 +389,13 @@ REGISTER_OP("RandomDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
@@ -308,7 +406,14 @@ REGISTER_OP("ShuffleDataset")
     .Attr("reshuffle_each_iteration: bool = true")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleAndRepeatDataset")
     .Input("input_dataset: variant")
@@ -319,7 +424,15 @@ REGISTER_OP("ShuffleAndRepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, seed2, and count should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("CacheDataset")
     .Input("input_dataset: variant")
@@ -327,14 +440,12 @@ REGISTER_OP("CacheDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("UniqueDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // filename should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TextLineDataset")
     .Input("filenames: string")
@@ -343,10 +454,16 @@ REGISTER_OP("TextLineDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): validate
-                                                // that `filenames` is
-                                                // a scalar or a
-                                                // vector.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      return shape_inference::ScalarShape(c);
+      // `compression_type` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // `buffer_size` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+    });
 
 REGISTER_OP("SqlDataset")
     .Input("driver_name: string")
@@ -357,7 +474,14 @@ REGISTER_OP("SqlDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
@@ -368,7 +492,18 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // header_bytes, record_bytes, footer_bytes, buffer_size should be
+      // scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TFRecordDataset")
     .Input("filenames: string")
@@ -377,7 +512,16 @@ REGISTER_OP("TFRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // `compression_type` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // `buffer_size` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("Iterator")
     .Output("handle: resource")
@@ -474,11 +618,6 @@ REGISTER_OP("StatsAggregatorHandle")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''");
 
-REGISTER_OP("IteratorSetStatsAggregator")
-    .Input("iterator_handle: resource")
-    .Input("stats_aggregator_handle: resource")
-    .SetShapeFn(shape_inference::NoOutputs);
-
 REGISTER_OP("StatsAggregatorSummary")
     .Input("iterator: resource")
     .Output("summary: string")
@@ -498,7 +637,12 @@ REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
     // length of `output_types` is `N`, the `output_shapes` are
     // (as far as possible to tell statically) compatible with `padded_shapes`,
     // and that `padding_values` are all scalars.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("EnqueueInQueueDataset")
     .Input("queue: variant")
@@ -509,4 +653,10 @@ REGISTER_OP("EnqueueInQueueDataset")
     // reading from queue handle (is that even possible?).
     .SetShapeFn(shape_inference::NoOutputs);
 
+REGISTER_OP("DatasetToTFRecord")
+    .Input("input_dataset: variant")
+    .Input("filename: string")
+    .Input("compression_type: string")
+    .SetShapeFn(shape_inference::NoOutputs);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/decode_proto_ops.cc b/tensorflow/core/ops/decode_proto_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f6fb2f58234c564960ad378867a6af27d1b5d2e
--- /dev/null
+++ b/tensorflow/core/ops/decode_proto_ops.cc
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("DecodeProtoV2")
+    .Input("bytes: string")
+    .Attr("message_type: string")
+    .Attr("field_names: list(string)")
+    .Attr("output_types: list(type) >= 0")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("message_format: string = 'binary'")
+    .Attr("sanitize: bool = false")
+    .Output("sizes: int32")
+    .Output("values: output_types")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input = c->input(0);
+
+      std::vector<tensorflow::DataType> output_types;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_types", &output_types));
+
+      ShapeHandle sizes;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(input, c->Vector(output_types.size()), &sizes));
+      c->set_output(0, sizes);
+
+      // TODO(nix): to do the best possible job of shape inference, we
+      // should examine the proto descriptors here in order to set shape
+      // indices to 1 instead of unknown for optional or required fields.
+      // Any general-purpose code will have to handle the unknown case,
+      // but there might be XLA code that could be sped up with the additional
+      // knowledge.
+      for (int i = 0; i < output_types.size(); ++i) {
+        ShapeHandle values;
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(input, c->Vector(c->UnknownDim()), &values));
+        c->set_output(i + 1, values);
+      }
+
+      return Status::OK();
+    });
+
+// TODO(nix): Consider adding an additional input argument that truncates
+// repeated fields to a maximum count. For now this could be done by passing
+// the output through tf.slice.
+
+// TODO(nix): define missing value behavior.
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/encode_proto_ops.cc b/tensorflow/core/ops/encode_proto_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5ec3056e35c8be82302f6cb32174661c9979225
--- /dev/null
+++ b/tensorflow/core/ops/encode_proto_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("EncodeProto")
+    .Input("sizes: int32")
+    .Input("values: Tinput_types")
+    .Attr("field_names: list(string)")
+    .Attr("message_type: string")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("Tinput_types: list(type)")
+    .Output("bytes: string")
+    .SetShapeFn([](InferenceContext* c) {
+      int first_field_index = 1;
+      int num_fields = c->num_inputs() - 1;
+
+      ShapeHandle output;
+      for (int i = num_fields - 1; i >= 0; --i) {
+        ShapeHandle input = c->input(first_field_index + i);
+        TF_RETURN_IF_ERROR(c->WithRankAtLeast(input, 2, &input));
+        ShapeHandle inner;
+        TF_RETURN_IF_ERROR(c->Subshape(input, 0, -1, &inner));
+        TF_RETURN_IF_ERROR(c->Merge(inner, output, &output));
+      }
+
+      c->set_output(0, output);
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/function_ops.cc b/tensorflow/core/ops/function_ops.cc
index ada96fa1d2ddf79b2669fa3fc437ce7b872a2eb1..a6914d9383d2f5c623b17fb0b918c4907ed84175 100644
--- a/tensorflow/core/ops/function_ops.cc
+++ b/tensorflow/core/ops/function_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 
@@ -55,6 +56,7 @@ REGISTER_OP("_ListToArray")
     .Attr("Tin: list(type)")
     .Attr("T: type")
     .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Converts a list of tensors to an array of tensors.
 )doc");
@@ -65,6 +67,7 @@ REGISTER_OP("_ArrayToList")
     .Attr("T: type")
     .Attr("N: int >= 1")
     .Attr("out_types: list(type)")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Converts an array of tensors to a list of tensors.
 )doc");
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 9e18d20db65075e471862def3924811b260f8a08..4d4a370478e0512bfb0c4a3ee146bd39d4934e76 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -47,8 +47,10 @@ REGISTER_OP("RemoteCall")
     .Attr("Tin: list(type)")
     .Attr("Tout: list(type)")
     .Attr("f: func")
+    .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
 
+// TODO(drpng): remove this.
 REGISTER_OP("_If")
     .Input("cond: Tcond")
     .Input("input: Tin")
@@ -75,8 +77,18 @@ else_branch: A function that takes 'inputs' and returns a list of
     tensors.  whose types are the same as what then_branch returns.
 )doc");
 
-// TODO(b/37549631) setting the While Op to always be stateful is too
-// conservative.
+REGISTER_OP("If")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type)")
+    .Attr("Tout: list(type)")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+// TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
     .Output("output: T")
@@ -107,4 +119,39 @@ body: A function that takes a list of tensors and returns another
       by T.
 )doc");
 
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("While")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    });
+
+REGISTER_OP("For")
+    .Input("start: int32")
+    .Input("limit: int32")
+    .Input("delta: int32")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("body: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+// TODO(b/73826847, b/37549631) Mark as stateful.
+REGISTER_OP("PartitionedCall")
+    .Input("args: Tin")
+    .Output("output: Tout")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("f: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 3487c955cbb2b06bdb33000da549c0fc6e7f86e8..b9f94ba1c5a62ec3463208ba4946f0370cfb8f0b 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -30,7 +30,8 @@ REGISTER_OP("EmptyTensorList")
       DataType t;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
       shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(
+          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
       c->set_output_handle_shapes_and_types(
           0, std::vector<shape_inference::ShapeAndType>{{s, t}});
       return Status::OK();
@@ -70,6 +71,50 @@ REGISTER_OP("TensorListPushBack")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListPushBackBatch")
+    .Input("input_handles: variant")
+    .Input("tensor: element_dtype")
+    .Output("output_handles: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle input_handles;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input_handles));
+
+      shape_inference::ShapeHandle tensor;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &tensor));
+
+      TF_RETURN_IF_ERROR(
+          c->MergePrefix(tensor, input_handles, &tensor, &input_handles));
+
+      c->set_output(0, input_handles);
+
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to push to list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument(
+              "Trying to push to list with wrong element dtype. List has type ",
+              DataTypeString(list_shape_type.dtype),
+              " but trying to push element with type ", DataTypeString(t));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        s = list_shape_type.shape;
+      }
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListLength")
     .Input("input_handle: variant")
     .Output("length: int32")
@@ -135,10 +180,6 @@ REGISTER_OP("TensorListStack")
         }
         shape_inference::ShapeHandle ignored;
         TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        if (!c->FullyDefined(s) || !c->FullyDefined(list_shape_type.shape)) {
-          return errors::InvalidArgument(
-              "Can only gather from a list with fully defined shapes.");
-        }
         s = list_shape_type.shape;
       }
       int expected_num_elements = -1;
@@ -197,6 +238,7 @@ REGISTER_OP("TensorListReserve")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
       shape_inference::ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       DataType t;
@@ -253,5 +295,46 @@ REGISTER_OP("TensorListSetItem")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListConcatLists")
+    .Input("input_a: variant")
+    .Input("input_b: variant")
+    .Attr("element_dtype: type")
+    .Output("output: variant")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      auto input_a = c->input(0);
+      auto input_b = c->input(1);
+      TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input_a));
+      c->set_output(0, input_a);
+
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+
+      auto* handle_data_a = c->input_handle_shapes_and_types(0);
+      auto* handle_data_b = c->input_handle_shapes_and_types(1);
+      if (handle_data_a == nullptr && handle_data_b == nullptr) {
+        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        return Status::OK();
+      }
+      shape_inference::ShapeAndType list_shape_type_a =
+          (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
+      const shape_inference::ShapeAndType& list_shape_type_b =
+          (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
+      if (list_shape_type_a.dtype != t) {
+        return errors::InvalidArgument("input_a.type != element_dtype: ",
+                                       DataTypeString(list_shape_type_a.dtype),
+                                       " vs. ", DataTypeString(t));
+      }
+      if (list_shape_type_b.dtype != t) {
+        return errors::InvalidArgument("input_b.type != element_dtype: ",
+                                       DataTypeString(list_shape_type_b.dtype),
+                                       " vs. ", DataTypeString(t));
+      }
+      TF_RETURN_IF_ERROR(c->Merge(list_shape_type_a.shape,
+                                  list_shape_type_b.shape,
+                                  &list_shape_type_a.shape));
+      c->set_output_handle_shapes_and_types(0, {list_shape_type_a});
+      return Status::OK();
+    });
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index d263dc25b29d5c867a10ef20ea1b39fa9b9662f1..fbde692e959769fca53c91fef649b18c248526a6 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -111,4 +111,9 @@ REGISTER_OP("MergeSummary")
     .Attr("N : int >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("Timestamp")
+    .Output("ts: float64")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index 95b4774fe6e230800e71d237c2cd027acf6e054b..e180f3d5f6958b372c31a98e17c66f88ef188f8e 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -28,6 +28,17 @@ REGISTER_OP("Roll")
     .Attr("T: type")
     .Attr("Tshift: {int32,int64}")
     .Attr("Taxis: {int32,int64}")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // The `input` must be 1-D or higher
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &unused));
+      // The `shift` must be scalar or 1-D.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
+      // The `axis` must be scalar or 1-D.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      // Validate 'shift' is the same shape as axis'.
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &unused));
+      return shape_inference::UnchangedShape(c);
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 8dcd3e815f3c19b41b1ef02a23e1f5ce36697a23..da38a6bc2497aca1623faed40c41386a4daff113 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -362,7 +363,7 @@ class MathGradTest : public ::testing::Test {
 };
 
 void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f33d51d5a20fc207102e4bf79e7605d9817eb9f..8f8443a46cfa68e9879825d36b305b4f7774bd66 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -65,7 +65,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -133,7 +133,7 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ComplexAbs")
@@ -148,27 +148,27 @@ REGISTER_OP("ComplexAbs")
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
       .Attr(                                                             \
-          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "T: {bfloat16, half, float, double, int32, int64, complex64, " \
           "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_REAL()                              \
   Input("x: T")                                   \
       .Output("y: T")                             \
-      .Attr("T: {half, bfloat16, float, double}") \
+      .Attr("T: {bfloat16, half, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_COMPLEX()                                                  \
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_GRADIENT_COMPLEX()                                         \
   Input("y: T")                                                          \
       .Input("dy: T")                                                    \
       .Output("z: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 REGISTER_OP("Neg").UNARY();
@@ -246,57 +246,57 @@ REGISTER_OP("Atan").UNARY();
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
 #define BINARY_MORE()                                                          \
   Input("x: T").Input("y: T").Output("z: T").Attr(                             \
-      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "T: {bfloat16, half, float, double, uint8, int8, uint16, int16, int32, " \
       "int64, complex64, complex128}")
 
 #define BINARY_FEWER()                                               \
   Input("x: T").Input("y: T").Output("z: T").Attr(                   \
-      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "T: {bfloat16, half, float, double, int32, int64, complex64, " \
       "complex128}")
 
 REGISTER_OP("Add")
@@ -304,7 +304,7 @@ REGISTER_OP("Add")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -315,7 +315,7 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
@@ -412,7 +412,7 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -437,7 +437,7 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -445,21 +445,21 @@ REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, float16, half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Pow")
@@ -467,7 +467,7 @@ REGISTER_OP("Pow")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, float, half, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -503,7 +503,7 @@ REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Betainc")
@@ -574,7 +574,7 @@ REGISTER_OP("GreaterEqual").COMPARISON();
       .Output("z: bool")                                                   \
       .SetIsCommutative()                                                  \
       .Attr(                                                               \
-          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "T: {bfloat16, half, float, double, uint8, int8, int16, int32, " \
           "int64, complex64, quint8, qint8, qint32, string, bool, "        \
           "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -713,7 +713,7 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .SetShapeFn(shape_inference::MatMulShape);
 
 REGISTER_OP("SparseMatMul")
@@ -1558,6 +1558,14 @@ REGISTER_OP("Bucketize")
     .Attr("boundaries: list(float)")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("ClipByValue")
+    .Input("t: T")
+    .Input("clip_value_min: T")
+    .Input("clip_value_max: T")
+    .Output("output: T")
+    .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
     .Input("inputs: N * T")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index ca3772e6f89805b70f05f1c9fd5e36ee99f2d510..8f974d5367a486dca39cddfd3fbdca4d4a3bf6eb 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -239,20 +240,21 @@ TEST(MathOpsTest, Select_ShapeFn) {
 
   // Expect an error when the shapes can't be merged.
   handle_data[2]->at(0).first = shape_proto({2, 2});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "must be equal, but are 1 and 2"));
   handle_data[2]->at(0).first = i1;  // restore to valid
 
   // Expect an error when the types can't be merged.
   handle_data[2]->at(1).second = DT_INT64;
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different dtypes"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "pointing to different dtypes"));
   handle_data[2]->at(1).second = DT_INT32;  // restore to valid
 
   // Expect an error when different numbers of tensors are merged.
   handle_data[2]->push_back({i1, DT_FLOAT});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different numbers of tensors"));
+  EXPECT_TRUE(
+      str_util::StrContains(run_inference_for_handles().error_message(),
+                            "pointing to different numbers of tensors"));
   handle_data[2]->pop_back();  // restore to valid.
 }
 
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 67481fd202b3c3b35033b72e4c1c5fd294d98696..bb46dafd424fe6a4bc1cf27a80b0adc94f573926 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -266,7 +266,7 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -279,7 +279,7 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -301,7 +301,7 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -472,7 +472,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -490,7 +490,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -524,6 +524,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropInputV2")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -537,6 +538,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropFilterV2")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &out));
@@ -589,7 +591,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("AvgPool3DGrad")
@@ -600,7 +602,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -618,7 +620,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {half, bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("MaxPool3DGrad")
@@ -630,8 +632,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float} = DT_FLOAT")
-    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -1062,12 +1064,27 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
     .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
-      TF_RETURN_IF_ERROR(c->Merge(input, c->input(1), &input));
+      if (c->WithRank(c->input(0), 2, &input) == Status::OK() &&
+          c->Merge(input, c->input(1), &input) == Status::OK()) {
+        DimensionHandle batch_size = c->Dim(input, 0);
+        c->set_output(0, c->Vector(batch_size));
+        c->set_output(1, input);
+        return Status::OK();
+      }
+      TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFn(c, 1));
 
-      DimensionHandle batch_size = c->Dim(input, 0);
+      if (!c->RankKnown(c->output(1))) {
+        return errors::InvalidArgument(
+            "Shape must be broadcasted with rank 2, but is rank is unknown.");
+      }
+
+      if (c->Rank(c->output(1)) != 2) {
+        return errors::InvalidArgument(
+            "Shape must be broadcasted with rank 2, but is rank ",
+            c->Rank(c->output(1)));
+      }
+      DimensionHandle batch_size = c->Dim(c->output(1), 0);
       c->set_output(0, c->Vector(batch_size));
-      c->set_output(1, input);
       return Status::OK();
     });
 
@@ -1155,9 +1172,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument(
-        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
-        c->Value(last_dim));
+    return errors::InvalidArgument("input must have last dimension >= k = ",
+                                   c->Value(k_dim), " but is ",
+                                   c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1211,9 +1228,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument(
-            "Input must have last dimension > n = ", c->Value(n_dim),
-            " but is ", c->Value(last_dim));
+        return errors::InvalidArgument("Input must have last dimension > n = ",
+                                       c->Value(n_dim), " but is ",
+                                       c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
@@ -1498,6 +1515,7 @@ REGISTER_OP("_MklConv2D")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 MKL version of Conv2D operator. Uses MKL DNN APIs to perform 2D convolution.
@@ -1516,6 +1534,8 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
 does not perform anything. It is just created as an intermediate output of
@@ -1541,6 +1561,8 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
 2D convolution and add Bias to the output of convolution.
@@ -1563,6 +1585,7 @@ REGISTER_OP("_MklConv2DBackpropFilter")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -1589,6 +1612,7 @@ REGISTER_OP("__MklDummyConv2DBackpropFilterWithBias")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input_shape;
       // Fetch the data_format attribute, which may not exist.
@@ -1633,6 +1657,7 @@ REGISTER_OP("_MklConv2DBackpropFilterWithBias")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input_shape;
       // Fetch the data_format attribute, which may not exist.
@@ -1660,6 +1685,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#ifdef INTEL_MKL_ML
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -1668,6 +1694,7 @@ REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .Doc(R"doc(
 MKL version of Conv2DBackpropBias. Uses MKL DNN APIs to compute the
 gradients of convolution with respect to the bias.
@@ -1675,6 +1702,7 @@ gradients of convolution with respect to the bias.
 NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
+#endif
 
 REGISTER_OP("_MklConv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -1690,6 +1718,7 @@ REGISTER_OP("_MklConv2DBackpropInput")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -2007,10 +2036,10 @@ REGISTER_OP("_MklFusedBatchNorm")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &x));
 
       bool is_training;
-      c->GetAttr("is_training", &is_training);
+      TF_RETURN_IF_ERROR(c->GetAttr("is_training", &is_training));
       int number_inputs = (is_training) ? 3 : 5;
       string data_format;
-      c->GetAttr("data_format", &data_format);
+      TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format));
       DimensionHandle channel_dim =
           (data_format == "NHWC") ? c->Dim(x, 3) : c->Dim(x, 1);
 
@@ -2076,8 +2105,8 @@ REGISTER_OP("_MklFusedBatchNormGrad")
 
       bool is_training;
       string data_format;
-      c->GetAttr("is_training", &is_training);
-      c->GetAttr("data_format", &data_format);
+      TF_RETURN_IF_ERROR(c->GetAttr("is_training", &is_training));
+      TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format));
       DimensionHandle channel_dim = (data_format == "NHWC")
                                         ? c->Dim(y_backprop, 3)
                                         : c->Dim(y_backprop, 1);
@@ -2131,6 +2160,7 @@ REGISTER_OP("_MklToTf")
     .Output("output: T")
     .Attr("T: {half, float, double}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
 
@@ -2152,6 +2182,7 @@ REGISTER_OP("_MklInputConversion")
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
 need to be either in TF or in MKL format. This op is added before every
diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc
index 1b17a7cda65f210e1981e0f46f47691f0faba465..289b95305561dc60e52951b950dc4f6ade179fe2 100644
--- a/tensorflow/core/ops/nn_ops_test.cc
+++ b/tensorflow/core/ops/nn_ops_test.cc
@@ -410,10 +410,18 @@ TEST(NNOpsTest, SoftmaxCrossEntropyWithLogits_ShapeFn) {
   INFER_OK(op, "[1,?];[?,2]", "[d0_0];[d0_0,d0_1|d1_1]");
   INFER_OK(op, "[?,2];[1,2]", "[d1_0];in1");
 
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1,?];[2,?]");
-  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3];?");
-  INFER_ERROR("Shapes must be equal rank, but are 2 and 3", op, "?;[1,2,3]");
+  INFER_ERROR("Shape must be broadcasted with rank 2", op, "[1,2,3];?");
+  INFER_ERROR("Shape must be broadcasted with rank 2", op, "?;[1,2,3]");
+
+  // Broadcast example
+  // [1,4] and [2,4] are broadcasted to [2,4]
+  INFER_OK(op, "[1,4];[2,4]", "[d1_0];[d1_0,d0_1|d1_1]");
+  // [2,4] and [2,1] are broadcasted to [2,4]
+  INFER_OK(op, "[2,4];[2,1]", "[d0_0];[d0_0|d1_0,d0_1]");
+  // [1,?] and [2,4] are broadcasted to [2,4]
+  INFER_OK(op, "[1,?];[2,4]", "[d1_0];[d1_0,d0_1|d1_1]");
+  // [2,4] and [?,1] are broadcasted to [2,4]
+  INFER_OK(op, "[2,4];[?,1]", "[d0_0];[d0_0|d1_0,d0_1]");
 }
 
 TEST(NNOpsTest, SparseSoftmaxCrossEntropyWithLogits_ShapeFn) {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 45ff08f38b134f963460d15f949411a7f1619d0c..90368fe614eb2d839b5dedc23d034fc80b4baa98 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -30,8 +30,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -210,8 +210,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -237,8 +237,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -266,8 +266,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -423,8 +423,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -684,6 +684,85 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAdaMax"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyAdadelta"
   input_arg {
@@ -812,6 +891,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ApplyAdagradDA"
@@ -1932,8 +2018,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -1959,8 +2045,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2191,8 +2277,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2223,6 +2309,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2244,8 +2331,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2448,6 +2535,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -2509,6 +2597,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -3004,8 +3093,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -3854,6 +3943,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3869,7 +3959,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -3879,6 +3968,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3894,7 +3984,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -3996,352 +4085,405 @@ op {
   is_commutative: true
 }
 op {
-  name: "BroadcastArgs"
+  name: "BoostedTreesCalculateBestGainsPerFeature"
   input_arg {
-    name: "s0"
-    type_attr: "T"
+    name: "node_id_range"
+    type: DT_INT32
   }
   input_arg {
-    name: "s1"
-    type_attr: "T"
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
-  output_arg {
-    name: "r0"
-    type_attr: "T"
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "BroadcastGradientArgs"
   input_arg {
-    name: "s0"
-    type_attr: "T"
+    name: "tree_complexity"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "s1"
-    type_attr: "T"
+    name: "min_node_weight"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "r0"
-    type_attr: "T"
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
   }
   output_arg {
-    name: "r1"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "Bucketize"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
   output_arg {
-    name: "output"
+    name: "thresholds_list"
     type: DT_INT32
+    number_attr: "num_features"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "boundaries"
-    type: "list(float)"
-  }
-}
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "max_splits"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "num_features"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
 }
 op {
-  name: "CTCBeamSearchDecoder"
+  name: "BoostedTreesCreateEnsemble"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "decoded_indices"
+    name: "stamp_token"
     type: DT_INT64
-    number_attr: "top_paths"
   }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
-    number_attr: "top_paths"
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
   }
-  output_arg {
-    name: "decoded_shape"
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
     type: DT_INT64
-    number_attr: "top_paths"
   }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
   output_arg {
-    name: "log_probability"
-    type: DT_FLOAT
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
-    name: "beam_width"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "top_paths"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "merge_repeated"
-    type: "bool"
+    name: "shared_name"
+    type: "string"
     default_value {
-      b: true
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "CTCGreedyDecoder"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "BoostedTreesGetEnsembleStates"
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "decoded_indices"
+    name: "stamp_token"
     type: DT_INT64
   }
   output_arg {
-    name: "decoded_values"
-    type: DT_INT64
+    name: "num_trees"
+    type: DT_INT32
   }
   output_arg {
-    name: "decoded_shape"
-    type: DT_INT64
+    name: "num_finalized_trees"
+    type: DT_INT32
   }
   output_arg {
-    name: "log_probability"
-    type: DT_FLOAT
+    name: "num_attempted_layers"
+    type: DT_INT32
   }
-  attr {
-    name: "merge_repeated"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
   }
+  is_stateful: true
 }
 op {
-  name: "CTCLoss"
+  name: "BoostedTreesMakeStatsSummary"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "node_ids"
+    type: DT_INT32
   }
   input_arg {
-    name: "labels_indices"
-    type: DT_INT64
+    name: "gradients"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "labels_values"
-    type: DT_INT32
+    name: "hessians"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "sequence_length"
+    name: "bucketized_features_list"
     type: DT_INT32
+    number_attr: "num_features"
   }
   output_arg {
-    name: "loss"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "gradient"
+    name: "stats_summary"
     type: DT_FLOAT
   }
   attr {
-    name: "preprocess_collapse_repeated"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "ctc_merge_repeated"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "ignore_longer_outputs_than_inputs"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "CacheDataset"
+  name: "BoostedTreesPredict"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "logits"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_bucketized_features"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesSerializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesTrainingPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "cached_tree_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "cached_node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "partial_logits"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "Cast"
+  name: "BoostedTreesUpdateEnsemble"
   input_arg {
-    name: "x"
-    type_attr: "SrcT"
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "y"
-    type_attr: "DstT"
+  input_arg {
+    name: "feature_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "max_depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "learning_rate"
+    type: DT_FLOAT
   }
   attr {
-    name: "SrcT"
-    type: "type"
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
   }
   attr {
-    name: "DstT"
-    type: "type"
+    name: "num_features"
+    type: "int"
+    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "Ceil"
+  name: "BroadcastArgs"
   input_arg {
-    name: "x"
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "r0"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "CheckNumerics"
+  name: "BroadcastGradientArgs"
   input_arg {
-    name: "tensor"
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "r0"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r1"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  attr {
-    name: "message"
-    type: "string"
-  }
 }
 op {
-  name: "Cholesky"
+  name: "BroadcastTo"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "shape"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -4349,315 +4491,370 @@ op {
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "CholeskyGrad"
-  input_arg {
-    name: "l"
-    type_attr: "T"
-  }
+  name: "Bucketize"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_INT32
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+  }
 }
 op {
-  name: "CompareAndBitpack"
+  name: "BytesProducedStatsDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "threshold"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type: DT_UINT8
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "Complex"
+  name: "CTCBeamSearchDecoder"
   input_arg {
-    name: "real"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "imag"
-    type_attr: "T"
+    name: "sequence_length"
+    type: DT_INT32
   }
   output_arg {
-    name: "out"
-    type_attr: "Tout"
+    name: "decoded_indices"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "beam_width"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "top_paths"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "merge_repeated"
+    type: "bool"
     default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+      b: true
     }
   }
 }
 op {
-  name: "ComplexAbs"
+  name: "CTCGreedyDecoder"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
   }
   output_arg {
-    name: "y"
-    type_attr: "Tout"
+    name: "decoded_indices"
+    type: DT_INT64
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "merge_repeated"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ComputeAccidentalHits"
+  name: "CTCLoss"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "sampled_candidates"
+    name: "labels_indices"
     type: DT_INT64
   }
-  output_arg {
-    name: "indices"
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
     type: DT_INT32
   }
   output_arg {
-    name: "ids"
-    type: DT_INT64
+    name: "loss"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "weights"
+    name: "gradient"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
-    type: "int"
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "ctc_merge_repeated"
+    type: "bool"
     default_value {
-      i: 0
+      b: true
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
 }
 op {
-  name: "Concat"
+  name: "CacheDataset"
   input_arg {
-    name: "concat_dim"
-    type: DT_INT32
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "filename"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: 2
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "ConcatOffset"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
+  name: "Cast"
   input_arg {
-    name: "shape"
-    type: DT_INT32
-    number_attr: "N"
+    name: "x"
+    type_attr: "SrcT"
   }
   output_arg {
-    name: "offset"
-    type: DT_INT32
-    number_attr: "N"
+    name: "y"
+    type_attr: "DstT"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
+    type: "type"
   }
 }
 op {
-  name: "ConcatV2"
+  name: "Ceil"
   input_arg {
-    name: "values"
+    name: "x"
     type_attr: "T"
-    number_attr: "N"
   }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "CheckNumerics"
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "tensor"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "Tidx"
+    name: "message"
+    type: "string"
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "ConcatenateDataset"
+  name: "CholeskyGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "l"
+    type_attr: "T"
   }
   input_arg {
-    name: "another_dataset"
-    type: DT_VARIANT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "ConditionalAccumulator"
+  name: "ClipByValue"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_max"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -4681,75 +4878,1266 @@ op {
       }
     }
   }
+}
+op {
+  name: "CloseSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
   attr {
     name: "shape"
     type: "shape"
   }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
   attr {
-    name: "container"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "final_op"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
     }
   }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
   is_stateful: true
 }
 op {
-  name: "Conj"
+  name: "CompareAndBitpack"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "threshold"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Complex"
+  input_arg {
+    name: "real"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "imag"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ComplexAbs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ComputeAccidentalHits"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "ids"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Concat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ConcatOffset"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  output_arg {
+    name: "offset"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "ConcatV2"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Conj"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "ConjugateTranspose"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "perm"
+    type_attr: "Tperm"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tperm"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Const"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "ConsumeMutexLock"
+  input_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ControlTrigger"
+}
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+    explanation: "Use Conv3DBackpropFilterV2"
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+    explanation: "Use Conv3DBackpropInputV2"
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Copy"
+  input_arg {
+    name: "input"
+    description: "Input tensor."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Output tensor, deep-copied from input."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "The name of the input tensor."
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
+  }
+  summary: "Copy Op."
+  description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
+  allows_uninitialized_input: true
+}
+op {
+  name: "CopyHost"
+  input_arg {
+    name: "input"
+    description: "Input tensor."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Output tensor, deep-copied from input."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "The name of the input tensor."
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
+  }
+  summary: "Copy Host Op."
+  description: "Performs CPU-to-CPU deep-copying of tensor.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
+  allows_uninitialized_input: true
+}
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cosh"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_VARIANT
       }
     }
   }
 }
 op {
-  name: "ConjugateTranspose"
+  name: "CountUpTo"
   input_arg {
-    name: "x"
+    name: "ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "perm"
-    type_attr: "Tperm"
+    is_ref: true
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "limit"
+    type: "int"
   }
   attr {
-    name: "Tperm"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -4759,382 +6147,422 @@ op {
   }
 }
 op {
-  name: "Const"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  name: "CreateSummaryDbWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "value"
-    type: "tensor"
+  input_arg {
+    name: "db_uri"
+    type: DT_STRING
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "experiment_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "run_name"
+    type: DT_STRING
   }
+  input_arg {
+    name: "user_name"
+    type: DT_STRING
+  }
+  is_stateful: true
 }
 op {
-  name: "ControlTrigger"
+  name: "CreateSummaryFileWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "logdir"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "max_queue"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flush_millis"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filename_suffix"
+    type: DT_STRING
+  }
+  is_stateful: true
 }
 op {
-  name: "Conv2D"
+  name: "CropAndResize"
   input_arg {
-    name: "input"
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "crops"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
+    name: "method"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "bilinear"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "bilinear"
       }
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "extrapolation_value"
+    type: "float"
     default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
+      f: 0
     }
   }
 }
 op {
-  name: "Conv2DBackpropFilter"
+  name: "CropAndResizeGradBoxes"
   input_arg {
-    name: "input"
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "box_ind"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
+    name: "method"
+    type: "string"
     default_value {
-      b: true
+      s: "bilinear"
     }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradImage"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "method"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "bilinear"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "bilinear"
       }
     }
   }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
   attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "CudnnRNN"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "input_h"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
     type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
+    name: "rnn_mode"
+    type: "string"
     default_value {
-      b: true
+      s: "lstm"
     }
-  }
-  attr {
-    name: "padding"
-    type: "string"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "input_mode"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "linear_input"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "direction"
+    type: "string"
     default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
+      s: "unidirectional"
     }
-  }
-}
-op {
-  name: "Conv3D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      i: 0
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "is_training"
+    type: "bool"
     default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
+      b: true
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "CudnnRNNBackprop"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "input_h"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "input_c"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "params"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
   }
-  deprecation {
-    version: 10
-    explanation: "Use Conv3DBackpropFilterV2"
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
   }
-}
-op {
-  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "input"
+    name: "output_h_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
+    name: "output_c_backprop"
+    type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "reserve_space"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
     type_attr: "T"
   }
   attr {
@@ -5143,71 +6571,140 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
+    name: "rnn_mode"
     type: "string"
+    default_value {
+      s: "lstm"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "input_mode"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "linear_input"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "direction"
+    type: "string"
     default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
       list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "Conv3DBackpropInput"
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackpropV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
   input_arg {
-    name: "input"
+    name: "output_c_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "reserve_space"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  output_arg {
+    name: "input_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
     type_attr: "T"
   }
   attr {
@@ -5222,42 +6719,96 @@ op {
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
   }
   attr {
-    name: "padding"
+    name: "input_mode"
     type: "string"
+    default_value {
+      s: "linear_input"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
-  deprecation {
-    version: 10
-    explanation: "Use Conv3DBackpropInputV2"
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Conv3DBackpropInputV2"
+  name: "CudnnRNNCanonicalToParams"
   input_arg {
-    name: "input_sizes"
+    name: "num_layers"
     type: DT_INT32
   }
   input_arg {
-    name: "filter"
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "weights"
     type_attr: "T"
+    number_attr: "num_params"
   }
   input_arg {
-    name: "out_backprop"
+    name: "biases"
     type_attr: "T"
+    number_attr: "num_params"
   }
   output_arg {
-    name: "output"
+    name: "params"
     type_attr: "T"
   }
   attr {
@@ -5266,138 +6817,98 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "num_params"
+    type: "int"
     has_minimum: true
-    minimum: 5
+    minimum: 1
   }
   attr {
-    name: "padding"
+    name: "rnn_mode"
     type: "string"
+    default_value {
+      s: "lstm"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "input_mode"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "linear_input"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "direction"
+    type: "string"
     default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
       list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "Copy"
-  input_arg {
-    name: "input"
-    description: "Input tensor."
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    description: "Output tensor, deep-copied from input."
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "tensor_name"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
-    description: "The name of the input tensor."
   }
   attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
+    name: "seed2"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
-    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
   }
-  summary: "Copy Op."
-  description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
-  allows_uninitialized_input: true
 }
 op {
-  name: "CopyHost"
+  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "input"
-    description: "Input tensor."
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    description: "Output tensor, deep-copied from input."
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-    description: "The name of the input tensor."
+    name: "num_layers"
+    type: DT_INT32
   }
-  attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
   }
-  summary: "Copy Host Op."
-  description: "Performs CPU-to-CPU deep-copying of tensor.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
-  allows_uninitialized_input: true
-}
-op {
-  name: "Cos"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "params_size"
+    type_attr: "S"
   }
   attr {
     name: "T"
@@ -5405,121 +6916,118 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "Cosh"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "S"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "CountUpTo"
-  input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
   }
   attr {
-    name: "limit"
-    type: "int"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "CriticalSectionOp"
-  output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
-  is_stateful: true
 }
 op {
-  name: "CropAndResize"
+  name: "CudnnRNNParamsToCanonical"
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
+    name: "input_size"
     type: DT_INT32
   }
   input_arg {
-    name: "crop_size"
-    type: DT_INT32
+    name: "params"
+    type_attr: "T"
   }
   output_arg {
-    name: "crops"
-    type: DT_FLOAT
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -5527,58 +7035,118 @@ op {
     }
   }
   attr {
-    name: "method"
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "extrapolation_value"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
     type: "float"
     default_value {
       f: 0
     }
   }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
 }
 op {
-  name: "CropAndResizeGradBoxes"
+  name: "CudnnRNNV2"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "image"
+    name: "input_h"
     type_attr: "T"
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "input_c"
+    type_attr: "T"
   }
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "params"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -5586,98 +7154,76 @@ op {
     }
   }
   attr {
-    name: "method"
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
-}
-op {
-  name: "CropAndResizeGradImage"
-  input_arg {
-    name: "grads"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "image_size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "method"
+    name: "direction"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "unidirectional"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "Cross"
-  input_arg {
-    name: "a"
-    type_attr: "T"
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
-  input_arg {
-    name: "b"
-    type_attr: "T"
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
+  is_stateful: true
 }
 op {
   name: "Cumprod"
@@ -5912,6 +7458,21 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -6262,6 +7823,14 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
   name: "DecodeCompressed"
@@ -6367,23 +7936,72 @@ op {
     type_attr: "dtype"
   }
   attr {
-    name: "channels"
-    type: "int"
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
     default_value {
-      i: 0
+      s: "local://"
     }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "message_format"
+    type: "string"
     default_value {
-      type: DT_UINT8
+      s: "binary"
     }
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_UINT16
-      }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
@@ -6451,6 +8069,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -6720,6 +8354,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -6789,6 +8424,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7003,6 +8639,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7029,6 +8666,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7054,8 +8692,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7260,8 +8898,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -7475,6 +9113,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -7636,6 +9297,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -7721,8 +9418,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -7757,8 +9454,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7780,49 +9477,14 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
 }
-op {
-  name: "ExecuteInCriticalSection"
-  input_arg {
-    name: "critical_section"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "outputs"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  is_stateful: true
-}
 op {
   name: "Exit"
   input_arg {
@@ -7853,8 +9515,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -7910,8 +9572,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8778,8 +10440,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8805,8 +10467,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8843,12 +10505,53 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
 }
+op {
+  name: "FlushSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -9656,6 +11359,65 @@ op {
     minimum: -1
   }
 }
+op {
+  name: "GeneratorDataset"
+  input_arg {
+    name: "init_func_other_args"
+    type_list_attr: "Tinit_func_args"
+  }
+  input_arg {
+    name: "next_func_other_args"
+    type_list_attr: "Tnext_func_args"
+  }
+  input_arg {
+    name: "finalize_func_other_args"
+    type_list_attr: "Tfinalize_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "next_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tinit_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tnext_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "GetSessionHandle"
   input_arg {
@@ -10192,7 +11954,46 @@ op {
       s: ""
     }
   }
-  is_stateful: true
+  is_stateful: true
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
 }
 op {
   name: "Igamma"
@@ -10357,6 +12158,18 @@ op {
     type: "string"
   }
 }
+op {
+  name: "ImportEvent"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "event"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "InTopK"
   input_arg {
@@ -10549,6 +12362,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "InterleaveDataset"
   input_arg {
@@ -10608,8 +12490,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -10639,8 +12521,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -10700,6 +12582,18 @@ op {
     }
   }
 }
+op {
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
 op {
   name: "IsFinite"
   input_arg {
@@ -10715,8 +12609,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -10738,8 +12632,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -10761,8 +12655,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -10892,18 +12786,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
 op {
   name: "IteratorToStringHandle"
   input_arg {
@@ -11276,8 +13158,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11424,8 +13306,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11449,8 +13331,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11873,6 +13755,10 @@ op {
     name: "num_parallel_batches"
     type: DT_INT64
   }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -12302,8 +14188,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -12798,6 +14684,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -12865,6 +14752,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -12878,6 +14766,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -13537,8 +15426,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13793,8 +15682,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13909,6 +15798,8 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -13935,8 +15826,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -14308,6 +16199,40 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "MutexLock"
+  input_arg {
+    name: "mutex"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "MutexV2"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -14323,8 +16248,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14459,8 +16384,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -14624,6 +16549,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT8
@@ -15773,6 +17699,31 @@ op {
     type: "type"
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
@@ -15902,9 +17853,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
@@ -16334,6 +18285,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -16389,6 +18341,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -16437,6 +18390,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -18984,8 +20938,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -19015,8 +20969,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -19046,8 +21000,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -19309,6 +21263,32 @@ op {
   }
   allows_uninitialized_input: true
 }
+op {
+  name: "RegexReplace"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "rewrite"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "replace_global"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "Relu"
   input_arg {
@@ -19471,6 +21451,7 @@ op {
     name: "f"
     type: "func"
   }
+  is_stateful: true
 }
 op {
   name: "RemoteFusedGraphExecute"
@@ -19838,9 +21819,48 @@ op {
   }
 }
 op {
-  name: "ResizeNearestNeighbor"
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighborGrad"
   input_arg {
-    name: "images"
+    name: "grads"
     type_attr: "T"
   }
   input_arg {
@@ -19848,7 +21868,7 @@ op {
     type: DT_INT32
   }
   output_arg {
-    name: "resized_images"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -19856,12 +21876,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
         type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
+        type: DT_INT8
         type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -19877,17 +21894,41 @@ op {
   }
 }
 op {
-  name: "ResizeNearestNeighborGrad"
+  name: "ResourceApplyAdaMax"
   input_arg {
-    name: "grads"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
@@ -19895,22 +21936,34 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_INT16
         type: DT_INT8
-        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
   name: "ResourceApplyAdadelta"
@@ -20026,6 +22079,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -20829,7 +22889,197 @@ op {
     type: "int"
   }
   attr {
-    name: "T"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceGather"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterDiv"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMax"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -20841,7 +23091,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceGather"
+  name: "ResourceScatterMin"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -20850,20 +23100,34 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "updates"
     type_attr: "dtype"
   }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
     name: "dtype"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
     name: "Tindices"
@@ -20878,7 +23142,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterMul"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -20965,6 +23229,57 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterSub"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterUpdate"
   input_arg {
@@ -21137,6 +23452,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -22053,8 +24375,8 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BOOL
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -22112,6 +24434,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -22176,8 +24499,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -22188,6 +24511,47 @@ op {
     }
   }
 }
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -22203,8 +24567,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -22232,8 +24596,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -22566,20 +24930,146 @@ op {
     has_minimum: true
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ScatterAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "ScatterAdd"
+  name: "ScatterMax"
   input_arg {
     name: "ref"
     type_attr: "T"
@@ -22603,23 +25093,12 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -22642,7 +25121,7 @@ op {
   }
 }
 op {
-  name: "ScatterDiv"
+  name: "ScatterMin"
   input_arg {
     name: "ref"
     type_attr: "T"
@@ -22666,23 +25145,12 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -23785,6 +26253,34 @@ op {
     }
   }
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Shape"
   input_arg {
@@ -23979,8 +26475,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24008,8 +26504,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24033,8 +26529,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -24060,8 +26556,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24085,8 +26581,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24248,6 +26744,37 @@ op {
     }
   }
 }
+op {
+  name: "SlideDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Snapshot"
   input_arg {
@@ -24969,6 +27496,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "SparseApplyAdagradDA"
@@ -27449,8 +29983,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -27478,8 +30012,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -27503,8 +30037,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -27534,8 +30068,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -27888,6 +30422,71 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomNormal"
   input_arg {
@@ -28383,6 +30982,17 @@ op {
     }
   }
 }
+op {
+  name: "StringStrip"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
 op {
   name: "StringToHashBucket"
   input_arg {
@@ -28483,8 +31093,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -28588,6 +31198,28 @@ op {
     }
   }
 }
+op {
+  name: "SummaryWriter"
+  output_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Svd"
   input_arg {
@@ -28846,8 +31478,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -28873,8 +31505,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -28902,8 +31534,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29896,7 +32528,26 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
+  is_stateful: true
+}
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
 }
 op {
   name: "TensorListElementShape"
@@ -30016,6 +32667,25 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListReserve"
   input_arg {
@@ -30368,6 +33038,14 @@ op {
     explanation: "TileGrad has been replaced with reduce_sum"
   }
 }
+op {
+  name: "Timestamp"
+  output_arg {
+    name: "ts"
+    type: DT_DOUBLE
+  }
+  is_stateful: true
+}
 op {
   name: "TopK"
   input_arg {
@@ -30516,8 +33194,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -30554,6 +33232,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -30608,6 +33287,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
@@ -30649,6 +33377,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -30778,29 +33529,6 @@ op {
     }
   }
 }
-op {
-  name: "UniqueDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UniqueV2"
   input_arg {
@@ -30886,6 +33614,59 @@ op {
     }
   }
 }
+op {
+  name: "UniqueWithCountsV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  output_arg {
+    name: "count"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Unpack"
   input_arg {
@@ -30921,15 +33702,139 @@ op {
     type_attr: "Tidx"
   }
   input_arg {
-    name: "dims"
-    type_attr: "Tidx"
+    name: "dims"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMin"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
-    type_attr: "Tidx"
+    type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
     type: "type"
     default_value {
       type: DT_INT32
@@ -30943,7 +33848,7 @@ op {
   }
 }
 op {
-  name: "UnsortedSegmentMax"
+  name: "UnsortedSegmentProd"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -31284,6 +34189,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
@@ -31329,6 +34259,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "WriteAudioSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "WriteFile"
   input_arg {
@@ -31340,6 +34303,180 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "WriteGraphSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteHistogramSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteImageSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bad_color"
+    type: DT_UINT8
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteScalarSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "summary_metadata"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "ZerosLike"
   input_arg {
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index ddd2aa92748f244c2d132f00780a0d6424f1e595..ddb714b4e951aa485d087daa31368bad9f1261e4 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -245,6 +245,7 @@ REGISTER_OP("DecodeCSV")
     .Attr("field_delim: string = ','")
     .Attr("use_quote_delim: bool = true")
     .Attr("na_value: string = ''")
+    .Attr("select_cols: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       // Validate the record_defaults inputs.
       for (int i = 1; i < c->num_inputs(); ++i) {
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index f6c668f5c98efff07a49be15b1187f1858800110..416ce9c0d82ca0bfba730d3d7f4513260876e9ad 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -43,7 +43,12 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("RandomStandardNormal")
     .Input("shape: T")
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index 8dae7e1ff5f872c33dd56509c0349180cec78593..3d0a6c2157d050869d5758128e9467e0ecdc7203 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -167,27 +167,75 @@ REGISTER_OP("ResourceGather")
       return Status::OK();
     });
 
+namespace {
+
+Status ResourceScatterUpdateShape(InferenceContext* c) {
+  ShapeAndType handle_shape_and_type;
+  TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
+  ShapeHandle var_shape = handle_shape_and_type.shape;
+  ShapeHandle indices_shape = c->input(1);
+
+  ShapeHandle unused_updates_shape;
+  ShapeHandle concat;
+  ShapeHandle var_subshape;
+  TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
+  TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
+  TF_RETURN_IF_ERROR(
+      InferenceContext::Rank(c->input(2)) == 0
+          ? Status::OK()
+          : c->Merge(c->input(2), concat, &unused_updates_shape));
+  return Status::OK();
+}
+
+}  // namespace
+
 REGISTER_OP("ResourceScatterAdd")
     .Input("resource: resource")
     .Input("indices: Tindices")
     .Input("updates: dtype")
     .Attr("dtype: numbertype")
     .Attr("Tindices: {int32, int64}")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
-      TF_RETURN_IF_ERROR(
-          ValidateVariableResourceHandle(c, &handle_shape_and_type));
-      ShapeHandle var_shape = handle_shape_and_type.shape;
-      ShapeHandle indices_shape = c->input(1);
+    .SetShapeFn(ResourceScatterUpdateShape);
 
-      ShapeHandle unused_updates_shape;
-      ShapeHandle concat;
-      ShapeHandle var_subshape;
-      TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
-      TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
-      TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
-      return Status::OK();
-    });
+REGISTER_OP("ResourceScatterSub")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterMul")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterDiv")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterMin")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterMax")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
 
 REGISTER_OP("ResourceScatterUpdate")
     .Input("resource: resource")
@@ -195,23 +243,9 @@ REGISTER_OP("ResourceScatterUpdate")
     .Input("updates: dtype")
     .Attr("dtype: type")
     .Attr("Tindices: {int32, int64}")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
-      TF_RETURN_IF_ERROR(
-          ValidateVariableResourceHandle(c, &handle_shape_and_type));
-      ShapeHandle var_shape = handle_shape_and_type.shape;
-      ShapeHandle indices_shape = c->input(1);
+    .SetShapeFn(ResourceScatterUpdateShape);
 
-      ShapeHandle unused_updates_shape;
-      ShapeHandle concat;
-      ShapeHandle var_subshape;
-      TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
-      TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
-      TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
-      return Status::OK();
-    });
-
-REGISTER_OP("CriticalSectionOp")
+REGISTER_OP("MutexV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .Output("resource: resource")
@@ -221,24 +255,18 @@ REGISTER_OP("CriticalSectionOp")
       return Status::OK();
     });
 
-REGISTER_OP("ExecuteInCriticalSection")
-    .Input("critical_section: resource")
-    .Input("arguments: Targuments")
-    .Output("outputs: output_types")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 0")
-    .Attr("output_shapes: list(shape) >= 0")
+REGISTER_OP("MutexLock")
+    .Input("mutex: resource")
+    .Output("mutex_lock: variant")
+    .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
-      std::vector<PartialTensorShape> output_shapes;
-      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
-      for (int i = 0; i < output_shapes.size(); ++i) {
-        ShapeHandle s;
-        TF_RETURN_IF_ERROR(
-            c->MakeShapeFromPartialTensorShape(output_shapes[i], &s));
-        c->set_output(i, s);
-      }
+      c->set_output(0, c->Scalar());
       return Status::OK();
     });
 
+REGISTER_OP("ConsumeMutexLock")
+    .Input("mutex_lock: variant")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/rpc_ops.cc b/tensorflow/core/ops/rpc_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72fda5e6eba3fd4cf3e53a26e0b4d9f5d6b19100
--- /dev/null
+++ b/tensorflow/core/ops/rpc_ops.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::DimensionHandle;
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+Status RpcShapeOp(InferenceContext* c, bool try_rpc) {
+  ShapeHandle address;
+  ShapeHandle method;
+  ShapeHandle request;
+  ShapeHandle output;
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &address));
+  if (c->Rank(address) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, address, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &method));
+  if (c->Rank(method) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, method, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &request));
+  if (c->Rank(request) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, request, &output));
+  }
+  if (!c->RankKnown(output)) {
+    output = request;
+  }
+  c->set_output(0, output);  // response
+  if (try_rpc) {
+    c->set_output(1, output);  // status_code
+    c->set_output(2, output);  // status_message
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("Rpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/false);
+    });
+
+REGISTER_OP("TryRpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .Output("status_code: int32")
+    .Output("status_message: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/true);
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/scoped_allocator_ops.cc b/tensorflow/core/ops/scoped_allocator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f053a53f4cf02e89fce53461bfda0cb23756287b
--- /dev/null
+++ b/tensorflow/core/ops/scoped_allocator_ops.cc
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("_ScopedAllocator")
+    .Output("output: T")
+    .Attr("shapes: list(shape)")
+    .Attr("T: type")
+    .Attr("sa_name: string")
+    .Attr("id: int")
+    .Attr("expected_call_count: int")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Allocates a mutable tensor that becomes available to appropriately annotated
+downstream Ops as backing store for their output tensor allocations via the
+ScopedAllocatorMgr.
+Returns a reference to this value.
+
+This is an experimental op for internal use only.  It is possible to use this
+op in unsafe ways.
+)doc");
+
+REGISTER_OP("_ScopedAllocatorConcat")
+    .Output("output: T")
+    .Input("backing: T")
+    .Input("inputs: N * T")
+    .Attr("shape: shape")
+    .Attr("T: type")
+    .Attr("sa_name: string")
+    .Attr("id: int")
+    .Attr("N: int >= 2")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Acts like a Concat Op that merges multple tensors into one, however it must
+only be used in conjunction with a ScopedAllocator which is backing the memory
+of all of its input tensors so that actually it just outputs a read-only
+reference to that ScopedAllocator's backing tensor.
+
+This is an experimental op for internal use only.  It is possible to use this
+op in unsafe ways.
+)doc");
+
+REGISTER_OP("_ScopedAllocatorSplit")
+    .Output("output: N * T")
+    .Input("concat: T")
+    .Input("split: N * T")
+    .Attr("T: type")
+    .Attr("sa_name: string")
+    .Attr("id: int")
+    .Attr("N: int >= 2")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Acts like a Concat Op that merges multple tensors into one, however it must
+only be used in conjunction with a ScopedAllocator which is backing the memory
+of all of its input tensors so that actually it just outputs a read-only
+reference to that ScopedAllocator's backing tensor.
+
+This is an experimental op for internal use only.  It is possible to use this
+op in unsafe ways.
+)doc");
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/ops/shape_function_test.cc b/tensorflow/core/ops/shape_function_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..120995f3aac7da4111d0404a64f322a50d30a491
--- /dev/null
+++ b/tensorflow/core/ops/shape_function_test.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/platform/test.h"
+
+// Test to ensure that all core ops have shape functions defined. This is done
+// by looking at all ops registered in the test binary.
+
+namespace tensorflow {
+
+TEST(ShapeFunctionTest, RegisteredOpsHaveShapeFns) {
+  OpRegistry* op_registry = OpRegistry::Global();
+  std::vector<OpRegistrationData> op_data;
+  op_registry->GetOpRegistrationData(&op_data);
+  for (const OpRegistrationData& op_reg_data : op_data) {
+    EXPECT_TRUE(op_reg_data.shape_inference_fn != nullptr)
+        << op_reg_data.op_def.name();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/spectral_ops.cc b/tensorflow/core/ops/spectral_ops.cc
index 508cea3495a9e811d4d12bf022b0ddfdcb33d718..2790aee37e93d3915ff9cba80af2e7ddccf4774e 100644
--- a/tensorflow/core/ops/spectral_ops.cc
+++ b/tensorflow/core/ops/spectral_ops.cc
@@ -142,26 +142,32 @@ REGISTER_OP("IRFFT3D")
 REGISTER_OP("BatchFFT")
     .Input("input: complex64")
     .Output("output: complex64")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(15, "Use FFT");
 REGISTER_OP("BatchIFFT")
     .Input("input: complex64")
     .Output("output: complex64")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(15, "Use IFFT");
 REGISTER_OP("BatchFFT2D")
     .Input("input: complex64")
     .Output("output: complex64")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(15, "Use FFT2D");
 REGISTER_OP("BatchIFFT2D")
     .Input("input: complex64")
     .Output("output: complex64")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(15, "Use IFFT2D");
 REGISTER_OP("BatchFFT3D")
     .Input("input: complex64")
     .Output("output: complex64")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(15, "Use FFT3D");
 REGISTER_OP("BatchIFFT3D")
     .Input("input: complex64")
     .Output("output: complex64")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(15, "Use IFFT3D");
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index 7a524b60c0aa711f36158b73b93fa91606266592..664f52452e3339e895f568f83e1fbf80cdd8f035 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -122,7 +122,10 @@ Status ScatterUpdateShape(InferenceContext* c) {
   ShapeHandle var_subshape;
   TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
   TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
-  TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
+  TF_RETURN_IF_ERROR(
+      InferenceContext::Rank(c->input(2)) == 0
+          ? Status::OK()
+          : c->Merge(c->input(2), concat, &unused_updates_shape));
 
   c->set_output(0, var_shape);
   return Status::OK();
@@ -180,6 +183,26 @@ REGISTER_OP("ScatterDiv")
     .Attr("use_locking: bool = false")
     .SetShapeFn(ScatterUpdateShape);
 
+REGISTER_OP("ScatterMin")
+    .Input("ref: Ref(T)")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output_ref: Ref(T)")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn(ScatterUpdateShape);
+
+REGISTER_OP("ScatterMax")
+    .Input("ref: Ref(T)")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output_ref: Ref(T)")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn(ScatterUpdateShape);
+
 REGISTER_OP("ScatterNdUpdate")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 553850610a3c51986664fee52e04809626de22c1..742709fb1836a0f7e3f0bd94f3dbc3e15423a271 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -29,7 +29,7 @@ static Status StatelessShape(shape_inference::InferenceContext* context) {
   TF_RETURN_IF_ERROR(context->WithValue(context->Dim(seed, 0), 2, &unused));
 
   // Set output shape
-  shape_inference::ShapeHandle out;
+  ShapeHandle out;
   TF_RETURN_IF_ERROR(context->MakeShapeFromShapeTensor(0, &out));
   context->set_output(0, out);
   return Status::OK();
@@ -54,6 +54,32 @@ REGISTER_STATELESS_OP("StatelessRandomNormal");
 // This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessTruncatedNormal");
 
+// This op is exposed through contrib/stateless only.  The interface may change.
+REGISTER_OP("StatelessMultinomial")
+    .Input("logits: T")
+    .Input("num_samples: int32")
+    .Input("seed: Tseed")
+    .Output("output: output_dtype")
+    .Attr("T: realnumbertype")
+    .Attr("Tseed: {int32, int64} = DT_INT64")
+    .Attr("output_dtype: {int32, int64} = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Check seed shape
+      ShapeHandle seed;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &seed));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(seed, 0), 2, &unused_dim));
+
+      ShapeHandle logits_shape;
+      ShapeHandle unused;
+      DimensionHandle num_samples;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &logits_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &num_samples));
+      c->set_output(0, c->Matrix(c->Dim(logits_shape, 0), num_samples));
+      return Status::OK();
+    });
+
 #undef REGISTER_STATELESS_OP
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index e4c5bcfb540660a609aca013b795d566e69f54a8..469f193cf417758a9b21b72c09c899493d54299a 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -23,6 +23,20 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+REGISTER_OP("RegexReplace")
+    .Input("input: string")
+    .Input("pattern: string")
+    .Input("rewrite: string")
+    .Output("output: string")
+    .Attr("replace_global: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    });
+
 REGISTER_OP("StringToHashBucketFast")
     .Input("input: string")
     .Output("output: int64")
@@ -109,6 +123,11 @@ REGISTER_OP("StringSplit")
       return Status::OK();
     });
 
+REGISTER_OP("StringStrip")
+    .Input("input: string")
+    .Output("output: string")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("EncodeBase64")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc
index aa7458f903cf76af660c04149ff50ac899987eac..742a221adcb101e3d2d152d60e343b666d3fb96b 100644
--- a/tensorflow/core/ops/summary_ops.cc
+++ b/tensorflow/core/ops/summary_ops.cc
@@ -22,15 +22,7 @@ REGISTER_OP("SummaryWriter")
     .Output("writer: resource")
     .Attr("shared_name: string = ''")
     .Attr("container: string = ''")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns a handle to be used to access a summary writer.
-
-The summary writer is an in-graph resource which can be used by ops to write
-summaries to event files.
-
-writer: the summary writer resource. Scalar handle.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("CreateSummaryFileWriter")
     .Input("writer: resource")
@@ -38,17 +30,7 @@ REGISTER_OP("CreateSummaryFileWriter")
     .Input("max_queue: int32")
     .Input("flush_millis: int32")
     .Input("filename_suffix: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Creates a summary file writer accessible by the given resource handle.
-
-writer: A handle to the summary writer resource
-logdir: Directory where the event file will be written.
-max_queue: Size of the queue of pending events and summaries.
-flush_millis: How often, in milliseconds, to flush the pending events and
-  summaries to disk.
-filename_suffix: Every event file's name is suffixed with this suffix.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("CreateSummaryDbWriter")
     .Input("writer: resource")
@@ -56,47 +38,15 @@ REGISTER_OP("CreateSummaryDbWriter")
     .Input("experiment_name: string")
     .Input("run_name: string")
     .Input("user_name: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Creates summary database writer accessible by given resource handle.
-
-This can be used to write tensors from the execution graph directly
-to a database. Only SQLite is supported right now. This function
-will create the schema if it doesn't exist. Entries in the Users,
-Experiments, and Runs tables will be created automatically if they
-don't already exist.
-
-writer: Handle to SummaryWriter resource to overwrite.
-db_uri: For example "file:/tmp/foo.sqlite".
-experiment_name: Can't contain ASCII control characters or <>. Case
-  sensitive. If empty, then the Run will not be associated with any
-  Experiment.
-run_name: Can't contain ASCII control characters or <>. Case sensitive.
-  If empty, then each Tag will not be associated with any Run.
-user_name: Must be valid as both a DNS label and Linux username. If
-  empty, then the Experiment will not be associated with any User.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("FlushSummaryWriter")
     .Input("writer: resource")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"(
-Flushes the writer's unwritten events.
-
-writer: A handle to the summary writer resource.
-)");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("CloseSummaryWriter")
     .Input("writer: resource")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"(
-Flushes and closes the summary writer.
-
-Also removes it from the resource manager. To reopen, use another
-CreateSummaryFileWriter op.
-
-writer: A handle to the summary writer resource.
-)");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteSummary")
     .Input("writer: resource")
@@ -105,31 +55,12 @@ REGISTER_OP("WriteSummary")
     .Input("tag: string")
     .Input("summary_metadata: string")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with a tensor.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tensor: A tensor to serialize.
-tag: The summary's tag.
-summary_metadata: Serialized SummaryMetadata protocol buffer containing
- plugin-related metadata for this summary.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("ImportEvent")
     .Input("writer: resource")
     .Input("event: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Outputs a `tf.Event` protocol buffer.
-
-When CreateSummaryDbWriter is being used, this op can be useful for
-importing data from event logs.
-
-writer: A handle to a summary writer.
-event: A string containing a binary-encoded tf.Event proto.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteScalarSummary")
     .Input("writer: resource")
@@ -137,17 +68,7 @@ REGISTER_OP("WriteScalarSummary")
     .Input("tag: string")
     .Input("value: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with scalar values.
-
-The input `tag` and `value` must have the scalars.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Tag for the summary.
-value: Value for the summary.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteHistogramSummary")
     .Input("writer: resource")
@@ -155,21 +76,7 @@ REGISTER_OP("WriteHistogramSummary")
     .Input("tag: string")
     .Input("values: T")
     .Attr("T: realnumbertype = DT_FLOAT")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with a histogram.
-
-The generated
-[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-has one summary value containing a histogram for `values`.
-
-This op reports an `InvalidArgument` error if any value is not finite.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Scalar.  Tag to use for the `Summary.Value`.
-values: Any shape. Values to use to build the histogram.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteImageSummary")
     .Input("writer: resource")
@@ -179,52 +86,7 @@ REGISTER_OP("WriteImageSummary")
     .Input("bad_color: uint8")
     .Attr("max_images: int >= 1 = 3")
     .Attr("T: {uint8, float, half} = DT_FLOAT")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with images.
-
-The summary has up to `max_images` summary values containing images. The
-images are built from `tensor` which must be 4-D with shape `[batch_size,
-height, width, channels]` and where `channels` can be:
-
-*  1: `tensor` is interpreted as Grayscale.
-*  3: `tensor` is interpreted as RGB.
-*  4: `tensor` is interpreted as RGBA.
-
-The images have the same number of channels as the input tensor. For float
-input, the values are normalized one image at a time to fit in the range
-`[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-normalization algorithms:
-
-*  If the input values are all positive, they are rescaled so the largest one
-   is 255.
-
-*  If any input value is negative, the values are shifted so input value 0.0
-   is at 127.  They are then rescaled so that either the smallest value is 0,
-   or the largest one is 255.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_images` is 1, the summary value tag is '*tag*/image'.
-*  If `max_images` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-
-The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-Each element must be in the range `[0, 255]` (It represents the value of a
-pixel in the output image).  Non-finite values in the input tensor are
-replaced by this tensor in the output image.  The default value is the color
-red.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 4-D of shape `[batch_size, height, width, channels]` where
-  `channels` is 1, 3, or 4.
-max_images: Max number of batch elements to generate images for.
-bad_color: Color to use for pixels with non-finite values.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteAudioSummary")
     .Input("writer: resource")
@@ -233,41 +95,12 @@ REGISTER_OP("WriteAudioSummary")
     .Input("tensor: float")
     .Input("sample_rate: float")
     .Attr("max_outputs: int >= 1 = 3")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with audio.
-
-The summary has up to `max_outputs` summary values containing audio. The
-audio is built from `tensor` which must be 3-D with shape `[batch_size,
-frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-*  If `max_outputs` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 2-D of shape `[batch_size, frames]`.
-sample_rate: The sample rate of the signal in hertz.
-max_outputs: Max number of batch elements to generate audio for.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteGraphSummary")
     .Input("writer: resource")
     .Input("step: int64")
     .Input("tensor: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
-
-writer: Handle of `SummaryWriter`.
-step: The step to write the summary for.
-tensor: A scalar string of the serialized tf.GraphDef proto.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 6ce9595fb60b78525bde19515077f7245a219d39..94ff092a85d512e602da5e97fc3007d4c68c5937 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -253,6 +253,7 @@ REGISTER_OP("ApplyAdagrad")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, false /* sparse */);
     });
@@ -264,6 +265,7 @@ REGISTER_OP("ResourceApplyAdagrad")
     .Input("grad: T")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, false /* sparse */);
     });
@@ -320,6 +322,7 @@ REGISTER_OP("SparseApplyAdagrad")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, true /* sparse */);
     });
@@ -333,6 +336,7 @@ REGISTER_OP("ResourceSparseApplyAdagrad")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, true /* sparse */);
     });
@@ -737,6 +741,57 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s));  // v
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // beta1_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta1
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));       // beta2
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));       // epsilon
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ApplyAdaMax")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("v: Ref(T)")
+    .Input("beta1_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdaMaxShapeFn(c, false /* sparse */);
+    });
+
+REGISTER_OP("ResourceApplyAdaMax")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("beta1_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdaMaxShapeFn(c, false /* sparse */);
+    });
+
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
diff --git a/tensorflow/core/ops/word2vec_ops.cc b/tensorflow/core/ops/word2vec_ops.cc
index ed685dcf0ae9a3c61a1db491751f7de4e981300d..e469771103925e107d2f8aeced6df9dfb56cbe24 100644
--- a/tensorflow/core/ops/word2vec_ops.cc
+++ b/tensorflow/core/ops/word2vec_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 
 namespace tensorflow {
@@ -33,7 +34,8 @@ REGISTER_OP("Skipgram")
     .Attr("batch_size: int")
     .Attr("window_size: int = 5")
     .Attr("min_count: int = 5")
-    .Attr("subsample: float = 1e-3");
+    .Attr("subsample: float = 1e-3")
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("NegTrain")
     .Deprecated(19,
@@ -46,6 +48,7 @@ REGISTER_OP("NegTrain")
     .Input("lr: float")
     .SetIsStateful()
     .Attr("vocab_count: list(int)")
-    .Attr("num_negative_samples: int");
+    .Attr("num_negative_samples: int")
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/platform/abi.cc b/tensorflow/core/platform/abi.cc
index 4df62734e96c86fdbdae4dcf34f7a1f2a6583d5c..e597a490d619e55ad491d108c4a536727431b92b 100644
--- a/tensorflow/core/platform/abi.cc
+++ b/tensorflow/core/platform/abi.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/abi.h"
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include <windows.h>
 #include <cstring>
 #else
@@ -26,19 +26,19 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 
 extern "C" char* __unDName(char* output_string, const char* name,
                            int max_string_length, void* (*p_alloc)(std::size_t),
                            void (*p_free)(void*), unsigned short disable_flags);
 
-#endif  // defined(PLATFORM_WINDOWS)
+#endif  // defined(_MSC_VER)
 
 namespace tensorflow {
 namespace port {
 
 std::string MaybeAbiDemangle(const char* name) {
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
   std::unique_ptr<char> demangled{__unDName(nullptr, name, 0, std::malloc,
                                             std::free,
                                             static_cast<unsigned short>(0))};
diff --git a/tensorflow/core/platform/byte_order.h b/tensorflow/core/platform/byte_order.h
new file mode 100644
index 0000000000000000000000000000000000000000..aab6535e4b0a692254064e7c34932beddd5b635a
--- /dev/null
+++ b/tensorflow/core/platform/byte_order.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
+#define TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
+
+// Byte order defines provided by gcc. MSVC doesn't define those so
+// we define them here.
+// We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
+#define __ORDER_LITTLE_ENDIAN__ 0x4d2
+#define __ORDER_BIG_ENDIAN__ 0x10e1
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
+
+namespace tensorflow {
+namespace port {
+
+// TODO(jeff,sanjay): Make portable
+constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
+
+}  // namespace port
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 9ba25dea4fb278cbfaf4080e21beef8a3e9de769..0fc1e4ae45c4163604a7cf9d596006c8deeea147 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -14,20 +14,6 @@ load(
     "if_windows",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "expiring_lru_cache",
     hdrs = ["expiring_lru_cache.h"],
@@ -38,13 +24,24 @@ cc_library(
 
 cc_library(
     name = "file_block_cache",
-    srcs = ["file_block_cache.cc"],
     hdrs = ["file_block_cache.h"],
     copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
 
+cc_library(
+    name = "ram_file_block_cache",
+    srcs = ["ram_file_block_cache.cc"],
+    hdrs = ["ram_file_block_cache.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":file_block_cache",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "gcs_dns_cache",
     srcs = ["gcs_dns_cache.cc"],
@@ -83,10 +80,12 @@ cc_library(
         ":gcs_throttle",
         ":google_auth_provider",
         ":http_request",
+        ":ram_file_block_cache",
         ":retrying_file_system",
         ":retrying_utils",
         ":time_util",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@jsoncpp_git//:jsoncpp",
     ],
@@ -202,9 +201,6 @@ cc_library(
 
 cc_library(
     name = "retrying_file_system",
-    srcs = [
-        "retrying_file_system.cc",
-    ],
     hdrs = [
         "retrying_file_system.h",
     ],
@@ -245,12 +241,12 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "file_block_cache_test",
+    name = "ram_file_block_cache_test",
     size = "small",
-    srcs = ["file_block_cache_test.cc"],
+    srcs = ["ram_file_block_cache_test.cc"],
     deps = [
-        ":file_block_cache",
         ":now_seconds_env",
+        ":ram_file_block_cache",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:test",
@@ -265,6 +261,7 @@ tf_cc_test(
     deps = [
         ":gcs_file_system",
         ":http_request_fake",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/core/platform/cloud/curl_http_request.cc b/tensorflow/core/platform/cloud/curl_http_request.cc
index 88a5d1e96dc2fcb7d12e2c0891d2f04d64bac594..1ac6a7531b0c0b6f26f2636032388dcaefe894ee 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request.cc
@@ -21,9 +21,12 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/version.h"
 
+#define CHECK_CURL_OK(expr) CHECK_EQ(expr, CURLE_OK)
+
 namespace tensorflow {
 
 namespace {
@@ -129,20 +132,21 @@ CurlHttpRequest::CurlHttpRequest(LibCurl* libcurl, Env* env)
   //       default in //third_party:curl.BUILD and can be customized via an
   //       environment variable.
 
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_VERBOSE, kVerboseOutput);
-  libcurl_->curl_easy_setopt(
+  CHECK_CURL_OK(
+      libcurl_->curl_easy_setopt(curl_, CURLOPT_VERBOSE, kVerboseOutput));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(
       curl_, CURLOPT_USERAGENT,
-      strings::StrCat("TensorFlow/", TF_VERSION_STRING).c_str());
+      strings::StrCat("TensorFlow/", TF_VERSION_STRING).c_str()));
   // Do not use signals for timeouts - does not work in multi-threaded programs.
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_NOSIGNAL, 1L);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION,
-                             CURL_HTTP_VERSION_2_0);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_NOSIGNAL, 1L));
+
+  // TODO(b/74351157): Enable HTTP/2.
 
   // Set up the progress meter.
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_NOPROGRESS, 0ULL);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFODATA, this);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFOFUNCTION,
-                             &CurlHttpRequest::ProgressCallback);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_NOPROGRESS, 0ULL));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFODATA, this));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFOFUNCTION,
+                                           &CurlHttpRequest::ProgressCallback));
 
   // If response buffer is not set, libcurl will print results to stdout,
   // so we always set it.
@@ -175,13 +179,13 @@ void CurlHttpRequest::SetUri(const string& uri) {
   CheckNotSent();
   is_uri_set_ = true;
   uri_ = uri;
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_URL, uri.c_str());
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_URL, uri.c_str()));
 }
 
 void CurlHttpRequest::SetRange(uint64 start, uint64 end) {
   CheckNotSent();
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_RANGE,
-                             strings::StrCat(start, "-", end).c_str());
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(
+      curl_, CURLOPT_RANGE, strings::StrCat(start, "-", end).c_str()));
 }
 
 void CurlHttpRequest::AddHeader(const string& name, const string& value) {
@@ -206,11 +210,19 @@ void CurlHttpRequest::AddAuthBearerHeader(const string& auth_token) {
   }
 }
 
+void CurlHttpRequest::SetRequestStats(RequestStats* stats) {
+  CheckNotSent();
+  CHECK(stats_ == nullptr) << "SetRequestStats already called";
+  stats_ = stats;
+}
+
 void CurlHttpRequest::SetDeleteRequest() {
   CheckNotSent();
   CheckMethodNotSet();
   is_method_set_ = true;
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_CUSTOMREQUEST, "DELETE");
+  method_ = RequestMethod::kDelete;
+  CHECK_CURL_OK(
+      libcurl_->curl_easy_setopt(curl_, CURLOPT_CUSTOMREQUEST, "DELETE"));
 }
 
 Status CurlHttpRequest::SetPutFromFile(const string& body_filepath,
@@ -218,6 +230,7 @@ Status CurlHttpRequest::SetPutFromFile(const string& body_filepath,
   CheckNotSent();
   CheckMethodNotSet();
   is_method_set_ = true;
+  method_ = RequestMethod::kPut;
   if (put_body_) {
     fclose(put_body_);
   }
@@ -232,9 +245,9 @@ Status CurlHttpRequest::SetPutFromFile(const string& body_filepath,
 
   curl_headers_ = libcurl_->curl_slist_append(
       curl_headers_, strings::StrCat("Content-Length: ", size).c_str());
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
-                             reinterpret_cast<void*>(put_body_));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
+                                           reinterpret_cast<void*>(put_body_)));
   // Using the default CURLOPT_READFUNCTION, which is doing an fread() on the
   // FILE * userdata set with CURLOPT_READDATA.
   return Status::OK();
@@ -244,26 +257,28 @@ void CurlHttpRequest::SetPutEmptyBody() {
   CheckNotSent();
   CheckMethodNotSet();
   is_method_set_ = true;
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1);
-  curl_headers_ =
-      libcurl_->curl_slist_append(curl_headers_, "Content-Length: 0");
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
-                             reinterpret_cast<void*>(this));
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
-                             &CurlHttpRequest::ReadCallback);
+  method_ = RequestMethod::kPut;
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1));
+  AddHeader("Content-Length", "0");
+  AddHeader("Transfer-Encoding", "identity");
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
+                                           reinterpret_cast<void*>(this)));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
+                                           &CurlHttpRequest::ReadCallback));
 }
 
 void CurlHttpRequest::SetPostFromBuffer(const char* buffer, size_t size) {
   CheckNotSent();
   CheckMethodNotSet();
   is_method_set_ = true;
+  method_ = RequestMethod::kPost;
   curl_headers_ = libcurl_->curl_slist_append(
       curl_headers_, strings::StrCat("Content-Length: ", size).c_str());
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_POST, 1);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
-                             reinterpret_cast<void*>(this));
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
-                             &CurlHttpRequest::ReadCallback);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_POST, 1));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
+                                           reinterpret_cast<void*>(this)));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
+                                           &CurlHttpRequest::ReadCallback));
   post_body_buffer_ = StringPiece(buffer, size);
 }
 
@@ -271,13 +286,14 @@ void CurlHttpRequest::SetPostEmptyBody() {
   CheckNotSent();
   CheckMethodNotSet();
   is_method_set_ = true;
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_POST, 1);
-  curl_headers_ =
-      libcurl_->curl_slist_append(curl_headers_, "Content-Length: 0");
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
-                             reinterpret_cast<void*>(this));
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
-                             &CurlHttpRequest::ReadCallback);
+  method_ = RequestMethod::kPost;
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_POST, 1));
+  AddHeader("Content-Length", "0");
+  AddHeader("Transfer-Encoding", "identity");
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
+                                           reinterpret_cast<void*>(this)));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
+                                           &CurlHttpRequest::ReadCallback));
 }
 
 void CurlHttpRequest::SetResultBuffer(std::vector<char>* out_buffer) {
@@ -287,10 +303,10 @@ void CurlHttpRequest::SetResultBuffer(std::vector<char>* out_buffer) {
   out_buffer->clear();
   response_buffer_ = out_buffer;
 
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEDATA,
-                             reinterpret_cast<void*>(this));
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION,
-                             &CurlHttpRequest::WriteCallback);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEDATA,
+                                           reinterpret_cast<void*>(this)));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION,
+                                           &CurlHttpRequest::WriteCallback));
 }
 
 void CurlHttpRequest::SetResultBufferDirect(char* buffer, size_t size) {
@@ -298,11 +314,10 @@ void CurlHttpRequest::SetResultBufferDirect(char* buffer, size_t size) {
   CheckNotSent();
 
   direct_response_ = DirectResponseState{buffer, size, 0};
-
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEDATA,
-                             reinterpret_cast<void*>(this));
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION,
-                             &CurlHttpRequest::WriteCallbackDirect);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEDATA,
+                                           reinterpret_cast<void*>(this)));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(
+      curl_, CURLOPT_WRITEFUNCTION, &CurlHttpRequest::WriteCallbackDirect));
 }
 
 bool CurlHttpRequest::IsDirectResponse() const {
@@ -406,37 +421,58 @@ Status CurlHttpRequest::Send() {
   is_sent_ = true;
 
   if (curl_headers_) {
-    libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTPHEADER, curl_headers_);
+    CHECK_CURL_OK(
+        libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTPHEADER, curl_headers_));
   }
   if (resolve_list_) {
-    libcurl_->curl_easy_setopt(curl_, CURLOPT_RESOLVE, resolve_list_);
+    CHECK_CURL_OK(
+        libcurl_->curl_easy_setopt(curl_, CURLOPT_RESOLVE, resolve_list_));
   }
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERDATA,
-                             reinterpret_cast<void*>(this));
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERFUNCTION,
-                             &CurlHttpRequest::HeaderCallback);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERDATA,
+                                           reinterpret_cast<void*>(this)));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERFUNCTION,
+                                           &CurlHttpRequest::HeaderCallback));
 
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT, request_timeout_secs_);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
-                             connect_timeout_secs_);
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT,
+                                           request_timeout_secs_));
+  CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
+                                           connect_timeout_secs_));
 
   char error_buffer[CURL_ERROR_SIZE] = {0};
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_ERRORBUFFER, error_buffer);
+  CHECK_CURL_OK(
+      libcurl_->curl_easy_setopt(curl_, CURLOPT_ERRORBUFFER, error_buffer));
+
+  if (stats_ != nullptr) {
+    stats_->RecordRequest(this, uri_, method_);
+  }
 
-  const auto curl_result = libcurl_->curl_easy_perform(curl_);
+  const CURLcode curl_result = libcurl_->curl_easy_perform(curl_);
+  TF_CURL_RETURN_WITH_CONTEXT_IF_ERROR(
+      curl_result, "Performing request. Detailed error: ", error_buffer);
+
+  auto get_error_message = [this, curl_result, &error_buffer]() -> string {
+    StringPiece response = GetResponse();
+    string error_message = strings::StrCat(
+        "Error executing an HTTP request (HTTP response code ", response_code_,
+        ", error code ", curl_result, ", error message '", error_buffer, "')");
+    if (!response.empty()) {
+      return strings::StrCat(
+          error_message, ", response '",
+          response.substr(0,
+                          std::min(response.size(), response_to_error_limit_)),
+          "'");
+    }
+    return error_message;
+  };
 
   double written_size = 0;
-  libcurl_->curl_easy_getinfo(curl_, CURLINFO_SIZE_DOWNLOAD, &written_size);
-
-  libcurl_->curl_easy_getinfo(curl_, CURLINFO_RESPONSE_CODE, &response_code_);
+  CHECK_CURL_OK(libcurl_->curl_easy_getinfo(curl_, CURLINFO_SIZE_DOWNLOAD,
+                                            &written_size));
 
-  const auto& error_message = strings::StrCat(
-      "Error executing an HTTP request (HTTP response code ", response_code_,
-      ", error code ", curl_result, ", error message '", error_buffer, "')");
+  CHECK_CURL_OK(libcurl_->curl_easy_getinfo(curl_, CURLINFO_RESPONSE_CODE,
+                                            &response_code_));
 
   Status result;
-  StringPiece response = GetResponse();
-  string extended_error_message;
   switch (response_code_) {
     // The group of response codes indicating that the request achieved
     // the expected goal.
@@ -444,14 +480,9 @@ Status CurlHttpRequest::Send() {
     case 201:  // Created
     case 204:  // No Content
     case 206:  // Partial Content
-      if (curl_result != CURLE_OK) {
-        // This means the server executed the request successfully, but then
-        // something went wrong during the transmission of the response.
-        result = errors::Unavailable(error_message);
-      } else {
-        result = Status::OK();
-      }
+      result = Status::OK();
       break;
+
     case 416:  // Requested Range Not Satisfiable
       // The requested range had no overlap with the available range.
       // This doesn't indicate an error, but this does mean an empty response
@@ -463,27 +494,19 @@ Status CurlHttpRequest::Send() {
     // INVALID_ARGUMENT indicates a problem with how the request is constructed.
     case 400:  // Bad Request
     case 411:  // Length Required
-      result = errors::InvalidArgument(error_message);
+      result = errors::InvalidArgument(get_error_message());
       break;
 
     // PERMISSION_DENIED indicates an authentication or an authorization issue.
     case 401:  // Unauthorized
     case 403:  // Forbidden
-      if (!response.empty()) {
-        extended_error_message = strings::StrCat(
-            error_message, ", response ",
-            response.substr(
-                0, std::min(response.size(), response_to_error_limit_)));
-        result = errors::PermissionDenied(extended_error_message);
-      } else {
-        result = errors::PermissionDenied(error_message);
-      }
+      result = errors::PermissionDenied(get_error_message());
       break;
 
     // NOT_FOUND indicates that the requested resource does not exist.
     case 404:  // Not found
     case 410:  // Gone
-      result = errors::NotFound(error_message);
+      result = errors::NotFound(get_error_message());
       break;
 
     // FAILED_PRECONDITION indicates that the request failed because some
@@ -493,26 +516,35 @@ Status CurlHttpRequest::Send() {
     case 303:  // See Other
     case 304:  // Not Modified
     case 307:  // Temporary Redirect
-    case 308:  // Resume Incomplete
     case 412:  // Precondition Failed
     case 413:  // Payload Too Large
-      result = errors::FailedPrecondition(error_message);
+      result = errors::FailedPrecondition(get_error_message());
       break;
 
     // UNAVAILABLE indicates a problem that can go away if the request
-    // is just retried without any modification.
+    // is just retried without any modification. 308 return codes are intended
+    // for write requests that can be retried. See the documentation and the
+    // official library:
+    // https://cloud.google.com/storage/docs/json_api/v1/how-tos/resumable-upload
+    // https://github.com/google/apitools/blob/master/apitools/base/py/transfer.py
+    case 308:  // Resume Incomplete
     case 409:  // Conflict
     case 429:  // Too Many Requests
     case 500:  // Internal Server Error
     case 502:  // Bad Gateway
     case 503:  // Service Unavailable
     default:   // All other HTTP response codes also should be retried.
-      result = errors::Unavailable(error_message);
+      result = errors::Unavailable(get_error_message());
       break;
   }
   if (!result.ok()) {
     response_buffer_->clear();
   }
+
+  if (stats_ != nullptr) {
+    stats_->RecordResponse(this, uri_, method_, result);
+  }
+
   return result;
 }
 
@@ -596,4 +628,12 @@ int CurlHttpRequest::ProgressCallback(void* this_object, curl_off_t dltotal,
   return 0;
 }
 
+Status CURLcodeToStatus(CURLcode code) {
+  // Return Unavailable to retry by default. We probably should distinguish
+  // between permanent or temporary failures.
+  return errors::Unavailable("Error executing an HTTP request (error code ",
+                             code, ", error message '",
+                             curl_easy_strerror(code), "')");
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/curl_http_request.h b/tensorflow/core/platform/cloud/curl_http_request.h
index cfa26f2b795a6cc33aba308597c77088362f1e1b..e658948ab9b9dd95eee1889b390a48beb2f74f9e 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.h
+++ b/tensorflow/core/platform/cloud/curl_http_request.h
@@ -75,6 +75,8 @@ class CurlHttpRequest : public HttpRequest {
   /// Sets the 'Authorization' header to the value of 'Bearer ' + auth_token.
   void AddAuthBearerHeader(const string& auth_token) override;
 
+  void SetRequestStats(RequestStats* stats) override;
+
   /// Makes the request a DELETE request.
   void SetDeleteRequest() override;
 
@@ -186,6 +188,8 @@ class CurlHttpRequest : public HttpRequest {
   curl_slist* curl_headers_ = nullptr;
   curl_slist* resolve_list_ = nullptr;
 
+  RequestStats* stats_ = nullptr;
+
   std::vector<char> default_response_buffer_;
 
   std::unordered_map<string, string> response_headers_;
@@ -213,6 +217,7 @@ class CurlHttpRequest : public HttpRequest {
 
   // Store the URI to help disambiguate requests when errors occur.
   string uri_;
+  RequestMethod method_ = RequestMethod::kGet;
 
   // Limit the size of a http response that is copied into an error message.
   const size_t response_to_error_limit_ = 500;
@@ -229,26 +234,28 @@ class LibCurl {
 
   virtual CURL* curl_easy_init() = 0;
   virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
-                                    uint64 param) = 0;
+                                    uint64 param) TF_MUST_USE_RESULT = 0;
   virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
-                                    const char* param) = 0;
+                                    const char* param) TF_MUST_USE_RESULT = 0;
   virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
-                                    void* param) = 0;
-  virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
-                                    size_t (*param)(void*, size_t, size_t,
-                                                    FILE*)) = 0;
+                                    void* param) TF_MUST_USE_RESULT = 0;
+  virtual CURLcode curl_easy_setopt(
+      CURL* curl, CURLoption option,
+      size_t (*param)(void*, size_t, size_t, FILE*)) TF_MUST_USE_RESULT = 0;
   virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
                                     size_t (*param)(const void*, size_t, size_t,
-                                                    void*)) = 0;
+                                                    void*))
+      TF_MUST_USE_RESULT = 0;
   virtual CURLcode curl_easy_setopt(
       CURL* curl, CURLoption option,
       int (*param)(void* clientp, curl_off_t dltotal, curl_off_t dlnow,
-                   curl_off_t ultotal, curl_off_t ulnow)) = 0;
-  virtual CURLcode curl_easy_perform(CURL* curl) = 0;
+                   curl_off_t ultotal,
+                   curl_off_t ulnow)) TF_MUST_USE_RESULT = 0;
+  virtual CURLcode curl_easy_perform(CURL* curl) TF_MUST_USE_RESULT = 0;
   virtual CURLcode curl_easy_getinfo(CURL* curl, CURLINFO info,
-                                     uint64* value) = 0;
+                                     uint64* value) TF_MUST_USE_RESULT = 0;
   virtual CURLcode curl_easy_getinfo(CURL* curl, CURLINFO info,
-                                     double* value) = 0;
+                                     double* value) TF_MUST_USE_RESULT = 0;
   virtual void curl_easy_cleanup(CURL* curl) = 0;
   virtual curl_slist* curl_slist_append(curl_slist* list, const char* str) = 0;
   virtual void curl_slist_free_all(curl_slist* list) = 0;
@@ -258,6 +265,17 @@ class LibCurl {
   virtual const char* curl_easy_strerror(CURLcode errornum) = 0;
 };
 
+Status CURLcodeToStatus(CURLcode code);
+
+#define TF_CURL_RETURN_WITH_CONTEXT_IF_ERROR(_code, ...)                    \
+  do {                                                                      \
+    if (_code != CURLE_OK) {                                                \
+      ::tensorflow::Status _status = ::tensorflow::CURLcodeToStatus(_code); \
+      ::tensorflow::errors::AppendToMessage(&_status, __VA_ARGS__);         \
+      return _status;                                                       \
+    }                                                                       \
+  } while (0)
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_CURL_HTTP_REQUEST_H_
diff --git a/tensorflow/core/platform/cloud/curl_http_request_test.cc b/tensorflow/core/platform/cloud/curl_http_request_test.cc
index 86d26a028733c303b85390b0be8fb8808c6e082a..522b717568777b22865d38a214086b12e3525e48 100644
--- a/tensorflow/core/platform/cloud/curl_http_request_test.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request_test.cc
@@ -346,7 +346,6 @@ TEST(CurlHttpRequestTest, GetRequest_Empty) {
 
 TEST(CurlHttpRequestTest, GetRequest_RangeOutOfBound) {
   FakeLibCurl libcurl("get response", 416);
-  libcurl.curl_easy_perform_result_ = CURLE_WRITE_ERROR;
   CurlHttpRequest http_request(&libcurl);
 
   std::vector<char> scratch;
@@ -377,10 +376,10 @@ TEST(CurlHttpRequestTest, GetRequest_503) {
   const auto& status = http_request.Send();
   EXPECT_EQ(error::UNAVAILABLE, status.code());
   EXPECT_EQ(
-      "Error executing an HTTP request (HTTP response code 503, "
-      "error code 23, error message '')",
+      "Error executing an HTTP request (error code 23, error message 'Failed "
+      "writing received data to disk/application')\n\tPerforming request. "
+      "Detailed error: ",
       status.error_message());
-  EXPECT_EQ(503, http_request.GetResponseCode());
 }
 
 TEST(CurlHttpRequestTest, GetRequest_HttpCode0) {
@@ -396,8 +395,9 @@ TEST(CurlHttpRequestTest, GetRequest_HttpCode0) {
   const auto& status = http_request.Send();
   EXPECT_EQ(error::UNAVAILABLE, status.code());
   EXPECT_EQ(
-      "Error executing an HTTP request (HTTP response code 0, "
-      "error code 28, error message 'Operation timed out')",
+      "Error executing an HTTP request (error code 28, error message 'Timeout "
+      "was reached')\n\tPerforming request. Detailed error: Operation timed "
+      "out",
       status.error_message());
   EXPECT_EQ(0, http_request.GetResponseCode());
 }
@@ -476,9 +476,10 @@ TEST(CurlHttpRequestTest, PutRequest_WithoutBody) {
   EXPECT_TRUE(libcurl.is_initialized_);
   EXPECT_EQ("http://www.testuri.com", libcurl.url_);
   EXPECT_EQ("", libcurl.custom_request_);
-  EXPECT_EQ(2, libcurl.headers_->size());
+  EXPECT_EQ(3, libcurl.headers_->size());
   EXPECT_EQ("Authorization: Bearer fake-bearer", (*libcurl.headers_)[0]);
   EXPECT_EQ("Content-Length: 0", (*libcurl.headers_)[1]);
+  EXPECT_EQ("Transfer-Encoding: identity", (*libcurl.headers_)[2]);
   EXPECT_TRUE(libcurl.is_put_);
   EXPECT_EQ("", libcurl.posted_content_);
 }
@@ -517,9 +518,10 @@ TEST(CurlHttpRequestTest, PostRequest_WithoutBody) {
   EXPECT_TRUE(libcurl.is_initialized_);
   EXPECT_EQ("http://www.testuri.com", libcurl.url_);
   EXPECT_EQ("", libcurl.custom_request_);
-  EXPECT_EQ(2, libcurl.headers_->size());
+  EXPECT_EQ(3, libcurl.headers_->size());
   EXPECT_EQ("Authorization: Bearer fake-bearer", (*libcurl.headers_)[0]);
   EXPECT_EQ("Content-Length: 0", (*libcurl.headers_)[1]);
+  EXPECT_EQ("Transfer-Encoding: identity", (*libcurl.headers_)[2]);
   EXPECT_TRUE(libcurl.is_post_);
   EXPECT_EQ("", libcurl.posted_content_);
 }
@@ -628,10 +630,214 @@ TEST(CurlHttpRequestTest, ProgressIsStuck) {
   auto status = http_request.Send();
   EXPECT_EQ(error::UNAVAILABLE, status.code());
   EXPECT_EQ(
-      "Error executing an HTTP request (HTTP response code 200, "
-      "error code 42, error message '')",
+      "Error executing an HTTP request (error code 42, error message "
+      "'Operation was aborted by an application callback')\n\tPerforming "
+      "request. Detailed error: ",
       status.error_message());
 }
 
+class TestStats : public HttpRequest::RequestStats {
+ public:
+  ~TestStats() override = default;
+
+  void RecordRequest(const HttpRequest* request, const string& uri,
+                     HttpRequest::RequestMethod method) override {
+    has_recorded_request_ = true;
+    record_request_request_ = request;
+    record_request_uri_ = uri;
+    record_request_method_ = method;
+  }
+
+  void RecordResponse(const HttpRequest* request, const string& uri,
+                      HttpRequest::RequestMethod method,
+                      const Status& result) override {
+    has_recorded_response_ = true;
+    record_response_request_ = request;
+    record_response_uri_ = uri;
+    record_response_method_ = method;
+    record_response_result_ = result;
+  }
+
+  const HttpRequest* record_request_request_ = nullptr;
+  string record_request_uri_ = "http://www.testuri.com";
+  HttpRequest::RequestMethod record_request_method_ =
+      HttpRequest::RequestMethod::kGet;
+
+  const HttpRequest* record_response_request_ = nullptr;
+  string record_response_uri_ = "http://www.testuri.com";
+  HttpRequest::RequestMethod record_response_method_ =
+      HttpRequest::RequestMethod::kGet;
+  Status record_response_result_;
+
+  bool has_recorded_request_ = false;
+  bool has_recorded_response_ = false;
+};
+
+class StatsTestFakeLibCurl : public FakeLibCurl {
+ public:
+  StatsTestFakeLibCurl(TestStats* stats, const string& response_content,
+                       uint64 response_code)
+      : FakeLibCurl(response_content, response_code), stats_(stats) {}
+  CURLcode curl_easy_perform(CURL* curl) override {
+    CHECK(!performed_request_);
+    performed_request_ = true;
+    stats_had_recorded_request_ = stats_->has_recorded_request_;
+    stats_had_recorded_response_ = stats_->has_recorded_response_;
+    return FakeLibCurl::curl_easy_perform(curl);
+  };
+
+  TestStats* stats_;
+  bool performed_request_ = false;
+  bool stats_had_recorded_request_;
+  bool stats_had_recorded_response_;
+};
+
+TEST(CurlHttpRequestTest, StatsGetSuccessful) {
+  TestStats stats;
+  StatsTestFakeLibCurl libcurl(&stats, "get response", 200);
+  CurlHttpRequest http_request(&libcurl);
+
+  std::vector<char> scratch;
+  scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
+  scratch.reserve(100);
+
+  http_request.SetRequestStats(&stats);
+
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
+  TF_EXPECT_OK(http_request.Send());
+
+  EXPECT_EQ("get response", string(scratch.begin(), scratch.end()));
+
+  // Check interaction with stats.
+  ASSERT_TRUE(stats.has_recorded_request_);
+  EXPECT_EQ(&http_request, stats.record_request_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_request_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kGet, stats.record_request_method_);
+
+  ASSERT_TRUE(stats.has_recorded_response_);
+  EXPECT_EQ(&http_request, stats.record_response_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_response_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kGet, stats.record_response_method_);
+  TF_EXPECT_OK(stats.record_response_result_);
+
+  // Check interaction with libcurl.
+  EXPECT_TRUE(libcurl.performed_request_);
+  EXPECT_TRUE(libcurl.stats_had_recorded_request_);
+  EXPECT_FALSE(libcurl.stats_had_recorded_response_);
+}
+
+TEST(CurlHttpRequestTest, StatsGetNotFound) {
+  TestStats stats;
+  StatsTestFakeLibCurl libcurl(&stats, "get other response", 404);
+  CurlHttpRequest http_request(&libcurl);
+
+  std::vector<char> scratch;
+  scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
+  scratch.reserve(100);
+
+  http_request.SetRequestStats(&stats);
+
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
+  Status s = http_request.Send();
+
+  // Check interaction with stats.
+  ASSERT_TRUE(stats.has_recorded_request_);
+  EXPECT_EQ(&http_request, stats.record_request_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_request_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kGet, stats.record_request_method_);
+
+  ASSERT_TRUE(stats.has_recorded_response_);
+  EXPECT_EQ(&http_request, stats.record_response_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_response_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kGet, stats.record_response_method_);
+  EXPECT_TRUE(errors::IsNotFound(stats.record_response_result_));
+  EXPECT_EQ(s, stats.record_response_result_);
+
+  // Check interaction with libcurl.
+  EXPECT_TRUE(libcurl.performed_request_);
+  EXPECT_TRUE(libcurl.stats_had_recorded_request_);
+  EXPECT_FALSE(libcurl.stats_had_recorded_response_);
+}
+
+TEST(CurlHttpRequestTest, StatsPost) {
+  TestStats stats;
+
+  FakeLibCurl libcurl("", 200);
+  CurlHttpRequest http_request(&libcurl);
+
+  http_request.SetRequestStats(&stats);
+
+  string content = "post body content";
+
+  http_request.SetUri("http://www.testuri.com");
+  http_request.SetPostFromBuffer(content.c_str(), content.size());
+  TF_EXPECT_OK(http_request.Send());
+
+  // Check interaction with stats.
+  ASSERT_TRUE(stats.has_recorded_request_);
+  EXPECT_EQ(&http_request, stats.record_request_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_request_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kPost, stats.record_request_method_);
+
+  ASSERT_TRUE(stats.has_recorded_response_);
+  EXPECT_EQ(&http_request, stats.record_response_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_response_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kPost, stats.record_response_method_);
+  TF_EXPECT_OK(stats.record_response_result_);
+}
+
+TEST(CurlHttpRequestTest, StatsDelete) {
+  TestStats stats;
+
+  FakeLibCurl libcurl("", 200);
+  CurlHttpRequest http_request(&libcurl);
+  http_request.SetRequestStats(&stats);
+  http_request.SetUri("http://www.testuri.com");
+  http_request.SetDeleteRequest();
+  TF_EXPECT_OK(http_request.Send());
+
+  // Check interaction with stats.
+  ASSERT_TRUE(stats.has_recorded_request_);
+  EXPECT_EQ(&http_request, stats.record_request_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_request_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kDelete, stats.record_request_method_);
+
+  ASSERT_TRUE(stats.has_recorded_response_);
+  EXPECT_EQ(&http_request, stats.record_response_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_response_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kDelete, stats.record_response_method_);
+  TF_EXPECT_OK(stats.record_response_result_);
+}
+
+TEST(CurlHttpRequestTest, StatsPut) {
+  TestStats stats;
+
+  FakeLibCurl libcurl("", 200);
+  CurlHttpRequest http_request(&libcurl);
+  http_request.SetRequestStats(&stats);
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetPutEmptyBody();
+  TF_EXPECT_OK(http_request.Send());
+
+  // Check interaction with stats.
+  ASSERT_TRUE(stats.has_recorded_request_);
+  EXPECT_EQ(&http_request, stats.record_request_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_request_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kPut, stats.record_request_method_);
+
+  ASSERT_TRUE(stats.has_recorded_response_);
+  EXPECT_EQ(&http_request, stats.record_response_request_);
+  EXPECT_EQ("http://www.testuri.com", stats.record_response_uri_);
+  EXPECT_EQ(HttpRequest::RequestMethod::kPut, stats.record_response_method_);
+  TF_EXPECT_OK(stats.record_response_result_);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache.h b/tensorflow/core/platform/cloud/expiring_lru_cache.h
index c738497ddd533b5b9a8339e51a21ac204acf68b5..e2d048f141c8e711ca9ce1eb37d1fff671a3192d 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache.h
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache.h
@@ -51,6 +51,14 @@ class ExpiringLRUCache {
     InsertLocked(key, value);
   }
 
+  // Delete the entry with key `key`. Return true if the entry was found for
+  // `key`, false if the entry was not found. In both cases, there is no entry
+  // with key `key` existed after the call.
+  bool Delete(const string& key) {
+    mutex_lock lock(mu_);
+    return DeleteLocked(key);
+  }
+
   /// Look up the entry with key `key` and copy it to `value` if found. Returns
   /// true if an entry was found for `key`, and its timestamp is not more than
   /// max_age_ seconds in the past.
@@ -141,6 +149,16 @@ class ExpiringLRUCache {
     }
   }
 
+  bool DeleteLocked(const string& key) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return false;
+    }
+    lru_list_.erase(it->second.lru_iterator);
+    cache_.erase(it);
+    return true;
+  }
+
   /// The maximum age of entries in the cache, in seconds. A value of 0 means
   /// that no entry is ever placed in the cache.
   const uint64 max_age_;
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
index 3bc6db38429155ca61732b44da3815422b480c92..42879e80a9e68383719c0ccb34a0d3101141f94a 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
@@ -174,5 +174,22 @@ TEST(ExpiringLRUCacheTest, Clear) {
   EXPECT_FALSE(cache.Lookup("d", &value));
 }
 
+TEST(ExpiringLRUCacheTest, Delete) {
+  // Insert an entry.
+  ExpiringLRUCache<int> cache(1, 4);
+  cache.Insert("a", 1);
+  int value = 0;
+  EXPECT_TRUE(cache.Lookup("a", &value));
+  EXPECT_EQ(value, 1);
+
+  // Delete the entry.
+  EXPECT_TRUE(cache.Delete("a"));
+  EXPECT_FALSE(cache.Lookup("a", &value));
+
+  // Try deleting the entry again.
+  EXPECT_FALSE(cache.Delete("a"));
+  EXPECT_FALSE(cache.Lookup("a", &value));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/file_block_cache.h b/tensorflow/core/platform/cloud/file_block_cache.h
index 5c180e2332042af3ae938c2685ac416952b00187..da167882470bfa3d833faeb7f031fdd7064aba35 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.h
+++ b/tensorflow/core/platform/cloud/file_block_cache.h
@@ -32,7 +32,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-/// \brief An LRU block cache of file contents, keyed by {filename, offset}.
+/// \brief A block cache of file contents, keyed by {filename, offset}.
 ///
 /// This class should be shared by read-only random access files on a remote
 /// filesystem (e.g. GCS).
@@ -48,27 +48,7 @@ class FileBlockCache {
                                size_t* bytes_transferred)>
       BlockFetcher;
 
-  FileBlockCache(size_t block_size, size_t max_bytes, uint64 max_staleness,
-                 BlockFetcher block_fetcher, Env* env = Env::Default())
-      : block_size_(block_size),
-        max_bytes_(max_bytes),
-        max_staleness_(max_staleness),
-        block_fetcher_(block_fetcher),
-        env_(env) {
-    if (max_staleness_ > 0) {
-      pruning_thread_.reset(env_->StartThread(ThreadOptions(), "TF_prune_FBC",
-                                              [this] { Prune(); }));
-    }
-  }
-
-  ~FileBlockCache() {
-    if (pruning_thread_) {
-      stop_pruning_thread_.Notify();
-      // Destroying pruning_thread_ will block until Prune() receives the above
-      // notification and returns.
-      pruning_thread_.reset();
-    }
-  }
+  virtual ~FileBlockCache() {}
 
   /// Read `n` bytes from `filename` starting at `offset` into `out`. This
   /// method will return:
@@ -84,143 +64,22 @@ class FileBlockCache {
   ///    placed in `out`.
   /// 4) OK otherwise (i.e. the read succeeded, and at least one byte was placed
   ///    in `out`).
-  Status Read(const string& filename, size_t offset, size_t n, char* buffer,
-              size_t* bytes_transferred);
+  virtual Status Read(const string& filename, size_t offset, size_t n,
+                      char* buffer, size_t* bytes_transferred) = 0;
 
   /// Remove all cached blocks for `filename`.
-  void RemoveFile(const string& filename) LOCKS_EXCLUDED(mu_);
+  virtual void RemoveFile(const string& filename) = 0;
 
   /// Remove all cached data.
-  void Flush() LOCKS_EXCLUDED(mu_);
+  virtual void Flush() = 0;
 
   /// Accessors for cache parameters.
-  size_t block_size() const { return block_size_; }
-  size_t max_bytes() const { return max_bytes_; }
-  uint64 max_staleness() const { return max_staleness_; }
+  virtual size_t block_size() const = 0;
+  virtual size_t max_bytes() const = 0;
+  virtual uint64 max_staleness() const = 0;
 
   /// The current size (in bytes) of the cache.
-  size_t CacheSize() const LOCKS_EXCLUDED(mu_);
-
- private:
-  /// The size of the blocks stored in the LRU cache, as well as the size of the
-  /// reads from the underlying filesystem.
-  const size_t block_size_;
-  /// The maximum number of bytes (sum of block sizes) allowed in the LRU cache.
-  const size_t max_bytes_;
-  /// The maximum staleness of any block in the LRU cache, in seconds.
-  const uint64 max_staleness_;
-  /// The callback to read a block from the underlying filesystem.
-  const BlockFetcher block_fetcher_;
-  /// The Env from which we read timestamps.
-  Env* const env_;  // not owned
-
-  /// \brief The key type for the file block cache.
-  ///
-  /// The file block cache key is a {filename, offset} pair.
-  typedef std::pair<string, size_t> Key;
-
-  /// \brief The state of a block.
-  ///
-  /// A block begins in the CREATED stage. The first thread will attempt to read
-  /// the block from the filesystem, transitioning the state of the block to
-  /// FETCHING. After completing, if the read was successful the state should
-  /// be FINISHED. Otherwise the state should be ERROR. A subsequent read can
-  /// re-fetch the block if the state is ERROR.
-  enum class FetchState {
-    CREATED,
-    FETCHING,
-    FINISHED,
-    ERROR,
-  };
-
-  /// \brief A block of a file.
-  ///
-  /// A file block consists of the block data, the block's current position in
-  /// the LRU cache, the timestamp (seconds since epoch) at which the block
-  /// was cached, a coordination lock, and state & condition variables.
-  ///
-  /// Thread safety:
-  /// The iterator and timestamp fields should only be accessed while holding
-  /// the block-cache-wide mu_ instance variable. The state variable should only
-  /// be accessed while holding the Block's mu lock. The data vector should only
-  /// be accessed after state == FINISHED, and it should never be modified.
-  ///
-  /// In order to prevent deadlocks, never grab the block-cache-wide mu_ lock
-  /// AFTER grabbing any block's mu lock. It is safe to grab mu without locking
-  /// mu_.
-  struct Block {
-    /// The block data.
-    std::vector<char> data;
-    /// A list iterator pointing to the block's position in the LRU list.
-    std::list<Key>::iterator lru_iterator;
-    /// A list iterator pointing to the block's position in the LRA list.
-    std::list<Key>::iterator lra_iterator;
-    /// The timestamp (seconds since epoch) at which the block was cached.
-    uint64 timestamp;
-    /// Mutex to guard state variable
-    mutex mu;
-    /// The state of the block.
-    FetchState state GUARDED_BY(mu) = FetchState::CREATED;
-    /// Wait on cond_var if state is FETCHING.
-    condition_variable cond_var;
-  };
-
-  /// \brief The block map type for the file block cache.
-  ///
-  /// The block map is an ordered map from Key to Block.
-  typedef std::map<Key, std::shared_ptr<Block>> BlockMap;
-
-  /// Prune the cache by removing files with expired blocks.
-  void Prune() LOCKS_EXCLUDED(mu_);
-
-  bool BlockNotStale(const std::shared_ptr<Block>& block)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  /// Look up a Key in the block cache.
-  std::shared_ptr<Block> Lookup(const Key& key) LOCKS_EXCLUDED(mu_);
-
-  Status MaybeFetch(const Key& key, const std::shared_ptr<Block>& block)
-      LOCKS_EXCLUDED(mu_);
-
-  /// Trim the block cache to make room for another entry.
-  void Trim() EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  /// Update the LRU iterator for the block at `key`.
-  Status UpdateLRU(const Key& key, const std::shared_ptr<Block>& block)
-      LOCKS_EXCLUDED(mu_);
-
-  /// Remove all blocks of a file, with mu_ already held.
-  void RemoveFile_Locked(const string& filename) EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  /// Remove the block `entry` from the block map and LRU list, and update the
-  /// cache size accordingly.
-  void RemoveBlock(BlockMap::iterator entry) EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  /// The cache pruning thread that removes files with expired blocks.
-  std::unique_ptr<Thread> pruning_thread_;
-
-  /// Notification for stopping the cache pruning thread.
-  Notification stop_pruning_thread_;
-
-  /// Guards access to the block map, LRU list, and cached byte count.
-  mutable mutex mu_;
-
-  /// The block map (map from Key to Block).
-  BlockMap block_map_ GUARDED_BY(mu_);
-
-  /// The LRU list of block keys. The front of the list identifies the most
-  /// recently accessed block.
-  std::list<Key> lru_list_ GUARDED_BY(mu_);
-
-  /// The LRA (least recently added) list of block keys. The front of the list
-  /// identifies the most recently added block.
-  ///
-  /// Note: blocks are added to lra_list_ only after they have successfully been
-  /// fetched from the underlying block store.
-  std::list<Key> lra_list_ GUARDED_BY(mu_);
-
-  /// The combined number of bytes in all of the cached blocks.
-  size_t cache_size_ GUARDED_BY(mu_) = 0;
+  virtual size_t CacheSize() const = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
index 8be452ff44d03bf3a8a66b99b0e65f98da537d5f..237ce6b5e5afb16628c6570c0c60e81b6cc9b0be 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
@@ -36,7 +36,7 @@ class TestHttpRequest : public HttpRequest {
   }
 
   void AddAuthBearerHeader(const string& auth_token) override {}
-
+  void SetRequestStats(HttpRequest::RequestStats* stats) override {}
   void SetDeleteRequest() override {}
 
   Status SetPutFromFile(const string& body_filepath, size_t offset) override {
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 01ca0d76bab2720513775ef33ff8670bd148c241..2d9c99c124a3f224a1cad72dc7a88966c00ba2a2 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
+#include "tensorflow/core/platform/cloud/ram_file_block_cache.h"
 #include "tensorflow/core/platform/cloud/retrying_utils.h"
 #include "tensorflow/core/platform/cloud/time_util.h"
 #include "tensorflow/core/platform/env.h"
@@ -171,7 +172,7 @@ Status ParseGcsPath(StringPiece fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("GCS path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("GCS path doesn't contain an object name: ",
@@ -300,6 +301,14 @@ class GcsRandomAccessFile : public RandomAccessFile {
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
                                                &bytes_transferred));
     *result = StringPiece(scratch, bytes_transferred);
+    string checkpoint_ending = "/checkpoint";
+    // Check if the file is the checkpoint file as we should not be caching
+    // that. As it's contents are updated and used for iterating checkpoints.
+    if (std::equal(checkpoint_ending.rbegin(), checkpoint_ending.rend(),
+                   filename_.rbegin())) {
+      // Remove the checkpoint file from the cache
+      file_block_cache_->RemoveFile(filename_);
+    }
     if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
@@ -534,7 +543,8 @@ class GcsWritableFile : public WritableFile {
       *uploaded = 0;
     } else {
       StringPiece range_piece(received_range);
-      range_piece.Consume("bytes=");  // May or may not be present.
+      str_util::ConsumePrefix(&range_piece,
+                              "bytes=");  // May or may not be present.
       std::vector<int64> range_parts;
       if (!str_util::SplitAndParseAsInts(range_piece, '-', &range_parts) ||
           range_parts.size() != 2) {
@@ -783,13 +793,13 @@ Status GcsFileSystem::NewRandomAccessFile(
 // A helper function to build a FileBlockCache for GcsFileSystem.
 std::unique_ptr<FileBlockCache> GcsFileSystem::MakeFileBlockCache(
     size_t block_size, size_t max_bytes, uint64 max_staleness) {
-  std::unique_ptr<FileBlockCache> file_block_cache(
-      new FileBlockCache(block_size, max_bytes, max_staleness,
-                         [this](const string& filename, size_t offset, size_t n,
-                                char* buffer, size_t* bytes_transferred) {
-                           return LoadBufferFromGCS(filename, offset, n, buffer,
-                                                    bytes_transferred);
-                         }));
+  std::unique_ptr<FileBlockCache> file_block_cache(new RamFileBlockCache(
+      block_size, max_bytes, max_staleness,
+      [this](const string& filename, size_t offset, size_t n, char* buffer,
+             size_t* bytes_transferred) {
+        return LoadBufferFromGCS(filename, offset, n, buffer,
+                                 bytes_transferred);
+      }));
   return file_block_cache;
 }
 
@@ -812,6 +822,10 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   request->SetResultBufferDirect(buffer, n);
   request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.read);
 
+  if (stats_ != nullptr) {
+    stats_->RecordBlockLoadRequest(filename, offset);
+  }
+
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading gs://",
                                   bucket, "/", object);
 
@@ -820,9 +834,13 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ "
           << offset << " of size: " << bytes_read;
 
+  if (stats_ != nullptr) {
+    stats_->RecordBlockRetrieved(filename, offset, bytes_read);
+  }
+
   throttle_.RecordResponse(bytes_read);
 
-  if (bytes_read < block_size()) {
+  if (bytes_read < n) {
     // Check stat cache to see if we encountered an interrupted read.
     FileStatistics stat;
     if (stat_cache_->Lookup(filename, &stat)) {
@@ -839,14 +857,20 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   return Status::OK();
 }
 
+void GcsFileSystem::ClearFileCaches(const string& fname) {
+  file_block_cache_->RemoveFile(fname);
+  stat_cache_->Delete(fname);
+  // TODO(rxsang): Remove the patterns that matche the file in
+  // MatchingPathsCache as well.
+}
+
 Status GcsFileSystem::NewWritableFile(const string& fname,
                                       std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsWritableFile(
-      bucket, object, this, &timeouts_,
-      [this, fname]() { file_block_cache_->RemoveFile(fname); },
-      initial_retry_delay_usec_));
+  result->reset(new GcsWritableFile(bucket, object, this, &timeouts_,
+                                    [this, fname]() { ClearFileCaches(fname); },
+                                    initial_retry_delay_usec_));
   return Status::OK();
 }
 
@@ -886,8 +910,7 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
       bucket, object, this, old_content_filename, &timeouts_,
-      [this, fname]() { file_block_cache_->RemoveFile(fname); },
-      initial_retry_delay_usec_));
+      [this, fname]() { ClearFileCaches(fname); }, initial_retry_delay_usec_));
   return Status::OK();
 }
 
@@ -1163,7 +1186,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         // 'object_prefix', which is part of 'dirname', should be removed from
         // the beginning of 'name'.
         StringPiece relative_path(name);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(strings::StrCat(
               "Unexpected response: the returned file name ", name,
               " doesn't match the prefix ", object_prefix));
@@ -1192,7 +1215,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         }
         const string& prefix_str = prefix.asString();
         StringPiece relative_path(prefix_str);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(
               "Unexpected response: the returned folder name ", prefix_str,
               " doesn't match the prefix ", object_prefix);
@@ -1259,7 +1282,7 @@ Status GcsFileSystem::DeleteFile(const string& fname) {
   request->SetDeleteRequest();
 
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when deleting ", fname);
-  file_block_cache_->RemoveFile(fname);
+  ClearFileCaches(fname);
   return Status::OK();
 }
 
@@ -1454,6 +1477,13 @@ void GcsFileSystem::FlushCaches() {
   matching_paths_cache_->Clear();
 }
 
+void GcsFileSystem::SetStats(GcsStatsInterface* stats) {
+  CHECK(stats_ == nullptr) << "SetStats() has already been called.";
+  CHECK(stats != nullptr);
+  stats_ = stats;
+  stats_->Init(this, &throttle_, file_block_cache_.get());
+}
+
 // Creates an HttpRequest and sets several parameters that are common to all
 // requests.  All code (in GcsFileSystem) that creates an HttpRequest should
 // go through this method, rather than directly using http_request_factory_.
@@ -1473,6 +1503,10 @@ Status GcsFileSystem::CreateHttpRequest(std::unique_ptr<HttpRequest>* request) {
                            additional_header_->second);
   }
 
+  if (stats_ != nullptr) {
+    new_request->SetRequestStats(stats_->HttpStats());
+  }
+
   if (!throttle_.AdmitRequest()) {
     return errors::Unavailable("Request throttled");
   }
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index e8edde8a445aad4c0310394d89480dc6ae445dfa..6250aa75948d22c71ba625300e51fac1ca2d9914 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -32,6 +32,39 @@ limitations under the License.
 
 namespace tensorflow {
 
+class GcsFileSystem;
+
+/// GcsStatsInterface allows for instrumentation of the GCS file system.
+///
+/// GcsStatsInterface and its subclasses must be safe to use from multiple
+/// threads concurrently.
+///
+/// WARNING! This is an experimental interface that may change or go away at any
+/// time.
+class GcsStatsInterface {
+ public:
+  /// Init is called by the GcsFileSystem immediately after being registered.
+  virtual void Init(GcsFileSystem* fs, GcsThrottle* throttle,
+                    const FileBlockCache* block_cache) = 0;
+
+  /// RecordBlockLoadRequest is called to record a block load request is about
+  /// to be made.
+  virtual void RecordBlockLoadRequest(const string& file, size_t offset) = 0;
+
+  /// RecordBlockRetrieved is called once a block within the file has been
+  /// retrieved.
+  virtual void RecordBlockRetrieved(const string& file, size_t offset,
+                                    size_t bytes_transferred) = 0;
+
+  /// HttpStats is called to optionally provide a RequestStats listener
+  /// to be annotated on every HTTP request made to the GCS API.
+  ///
+  /// HttpStats() may return nullptr.
+  virtual HttpRequest::RequestStats* HttpStats() = 0;
+
+  virtual ~GcsStatsInterface() = default;
+};
+
 /// Google Cloud Storage implementation of a file system.
 ///
 /// The clients should use RetryingGcsFileSystem defined below,
@@ -90,6 +123,9 @@ class GcsFileSystem : public FileSystem {
 
   void FlushCaches() override;
 
+  /// Set an object to collect runtime statistics from the GcsFilesystem.
+  void SetStats(GcsStatsInterface* stats);
+
   /// These accessors are mainly for testing purposes, to verify that the
   /// environment variables that control these parameters are handled correctly.
   size_t block_size() const { return file_block_cache_->block_size(); }
@@ -191,6 +227,9 @@ class GcsFileSystem : public FileSystem {
   Status LoadBufferFromGCS(const string& filename, size_t offset, size_t n,
                            char* buffer, size_t* bytes_transferred);
 
+  // Clear all the caches related to the file with name `filename`.
+  void ClearFileCaches(const string& fname);
+
   std::unique_ptr<AuthProvider> auth_provider_;
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   std::unique_ptr<FileBlockCache> file_block_cache_;
@@ -205,6 +244,8 @@ class GcsFileSystem : public FileSystem {
 
   TimeoutConfig timeouts_;
 
+  GcsStatsInterface* stats_ = nullptr;  // Not owned.
+
   /// The initial delay for exponential backoffs when retrying failed calls.
   const int64 initial_retry_delay_usec_ = 1000000L;
 
@@ -215,10 +256,10 @@ class GcsFileSystem : public FileSystem {
 };
 
 /// Google Cloud Storage implementation of a file system with retry on failures.
-class RetryingGcsFileSystem : public RetryingFileSystem {
+class RetryingGcsFileSystem : public RetryingFileSystem<GcsFileSystem> {
  public:
   RetryingGcsFileSystem()
-      : RetryingFileSystem(std::unique_ptr<FileSystem>(new GcsFileSystem)) {}
+      : RetryingFileSystem(std::unique_ptr<GcsFileSystem>(new GcsFileSystem)) {}
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index d452074ce312f98abe6b058ea56d2e0ce4cf047a..c639299954396020a001f82294e87a72d482ce77 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -197,6 +198,54 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_CheckpointFile_WithBlockCache) {
+  // Our underlying file in this test changes as new data comes in
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "abcdefghi")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  char scratch[100];
+  StringPiece result;
+  {
+    // We are instantiating this in an enclosed scope to make sure after the
+    // unique ptr goes out of scope, we can still access result.
+    std::unique_ptr<RandomAccessFile> file;
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/checkpoint", &file));
+
+    // Read the first chunk. The cache will be populated with the first block of
+    // 9 bytes.
+    scratch[5] = 'x';
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("0123", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+
+    // The second chunk should not be in cache so we make a new request
+    // As the checkpoint file should not be cached
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("abcd", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+  }
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
   // Our underlying file in this test is a 15 byte file with contents
   // "0123456789abcde".
@@ -311,6 +360,47 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
             fs.NewRandomAccessFile("gs://bucket/", &file).code());
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"6\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-5\n"
+           "Timeouts: 5 1 20\n",
+           "012")});
+
+  // Set stat_cache_max_age to 1000s so that StatCache could work.
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   1e3 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  // Stat the file first so that the file stats are cached.
+  FileStatistics stat;
+  TF_ASSERT_OK(fs.Stat("gs://bucket/random_access.txt", &stat));
+
+  std::unique_ptr<RandomAccessFile> file;
+  TF_ASSERT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
+
+  char scratch[6];
+  StringPiece result;
+
+  EXPECT_EQ(errors::Code::INTERNAL,
+            file->Read(0, sizeof(scratch), &result, scratch).code());
+}
+
 TEST(GcsFileSystemTest, NewWritableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
@@ -393,7 +483,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
                            "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
-                           "", errors::FailedPrecondition("308"), nullptr,
+                           "", errors::Unavailable("308"), nullptr,
                            {{"Range", "0-10"}}, 308),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
@@ -406,13 +496,26 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
                            "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
-                           "", errors::FailedPrecondition("308"), nullptr,
+                           "", errors::Unavailable("308"), nullptr,
                            {{"Range", "bytes=0-12"}}, 308),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 13-16/17\n"
                            "Timeouts: 5 1 30\n"
                            "Put body: ent2\n",
+                           "", errors::Unavailable("308"), 308),
+       new FakeHttpRequest("Uri: https://custom/upload/location\n"
+                           "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
+                           "Header Content-Range: bytes */17\n"
+                           "Put: yes\n",
+                           "", errors::Unavailable("308"), nullptr,
+                           {{"Range", "bytes=0-14"}}, 308),
+       new FakeHttpRequest("Uri: https://custom/upload/location\n"
+                           "Auth Token: fake_token\n"
+                           "Header Content-Range: bytes 15-16/17\n"
+                           "Timeouts: 5 1 30\n"
+                           "Put body: t2\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -521,14 +624,14 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503)});
   for (int i = 0; i < 10; i++) {
-    requests.emplace_back(new FakeHttpRequest(
-        "Uri: https://custom/upload/location\n"
-        "Auth Token: fake_token\n"
-        "Timeouts: 5 1 10\n"
-        "Header Content-Range: bytes */17\n"
-        "Put: yes\n",
-        "", errors::FailedPrecondition("important HTTP error 308"), nullptr,
-        {{"Range", "0-10"}}, 308));
+    requests.emplace_back(
+        new FakeHttpRequest("Uri: https://custom/upload/location\n"
+                            "Auth Token: fake_token\n"
+                            "Timeouts: 5 1 10\n"
+                            "Header Content-Range: bytes */17\n"
+                            "Put: yes\n",
+                            "", errors::Unavailable("important HTTP error 308"),
+                            nullptr, {{"Range", "0-10"}}, 308));
     requests.emplace_back(new FakeHttpRequest(
         "Uri: https://custom/upload/location\n"
         "Auth Token: fake_token\n"
@@ -571,8 +674,9 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
   TF_EXPECT_OK(file->Append("content2"));
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts failed. The last failure: "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "All 10 retry attempts failed. The last failure: "
                             "Unavailable: important HTTP error 503"))
       << status;
 }
@@ -628,13 +732,12 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::UNAVAILABLE, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains(
-              "Upload to gs://bucket/path/writeable.txt failed, caused by: "
-              "Not found: important HTTP error 410"))
+      str_util::StrContains(status.error_message(),
+                            "Upload to gs://bucket/path/writeable.txt failed, "
+                            "caused by: Not found: important HTTP error 410"))
       << status;
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("when uploading gs://bucket/path/writeable.txt"))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "when uploading gs://bucket/path/writeable.txt"))
       << status;
 }
 
@@ -1448,6 +1551,56 @@ TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
             fs.DeleteFile("gs://bucket/").code());
 }
 
+TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "file.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"1010\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
+                           "/bucket/o/file.txt\n"
+                           "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
+                           "Delete: yes\n",
+                           ""),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "file.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "", errors::NotFound("404"), 404),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
+           "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F"
+           "&maxResults=1\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "{}")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      16 /* block size */, 16 /* max bytes */, 0 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  // Stats the file first so the stat is cached.
+  FileStatistics stat_before_deletion;
+  TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat_before_deletion));
+  EXPECT_EQ(1010, stat_before_deletion.length);
+
+  TF_EXPECT_OK(fs.DeleteFile("gs://bucket/file.txt"));
+
+  FileStatistics stat_after_deletion;
+  EXPECT_EQ(error::Code::NOT_FOUND,
+            fs.Stat("gs://bucket/file.txt", &stat_after_deletion).code());
+}
+
 TEST(GcsFileSystemTest, DeleteDir_Empty) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
@@ -2608,5 +2761,74 @@ TEST(GcsFileSystemTest, CreateHttpRequest) {
   TF_EXPECT_OK(request->Send());
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_StatsRecording) {
+  class TestGcsStats : public GcsStatsInterface {
+   public:
+    void Init(GcsFileSystem* fs, GcsThrottle* throttle,
+              const FileBlockCache* block_cache) override {
+      CHECK(fs_ == nullptr);
+      CHECK(throttle_ == nullptr);
+      CHECK(block_cache_ == nullptr);
+
+      fs_ = fs;
+      throttle_ = throttle;
+      block_cache_ = block_cache;
+    }
+
+    void RecordBlockLoadRequest(const string& file, size_t offset) override {
+      block_load_request_file_ = file;
+    }
+
+    void RecordBlockRetrieved(const string& file, size_t offset,
+                              size_t bytes_transferred) override {
+      block_retrieved_file_ = file;
+      block_retrieved_bytes_transferred_ = bytes_transferred;
+    }
+
+    HttpRequest::RequestStats* HttpStats() override { return nullptr; }
+
+    GcsFileSystem* fs_ = nullptr;
+    GcsThrottle* throttle_ = nullptr;
+    const FileBlockCache* block_cache_ = nullptr;
+
+    string block_load_request_file_;
+    string block_retrieved_file_;
+    size_t block_retrieved_bytes_transferred_ = 0;
+  };
+
+  std::vector<HttpRequest*> requests({new FakeHttpRequest(
+      "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+      "Auth Token: fake_token\n"
+      "Range: 0-5\n"
+      "Timeouts: 5 1 20\n",
+      "012345")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  TestGcsStats stats;
+  fs.SetStats(&stats);
+  EXPECT_EQ(stats.fs_, &fs);
+
+  std::unique_ptr<RandomAccessFile> file;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
+
+  char scratch[6];
+  StringPiece result;
+
+  TF_EXPECT_OK(file->Read(0, sizeof(scratch), &result, scratch));
+  EXPECT_EQ("012345", result);
+
+  EXPECT_EQ("gs://bucket/random_access.txt", stats.block_load_request_file_);
+  EXPECT_EQ("gs://bucket/random_access.txt", stats.block_retrieved_file_);
+  EXPECT_EQ(6, stats.block_retrieved_bytes_transferred_);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc
index eb5f8958a37f45aeac1a836ca037f91931bb34a6..27dd06a6250ad457d0ec142c07d29a2358dddaee 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle.cc
+++ b/tensorflow/core/platform/cloud/gcs_throttle.cc
@@ -26,10 +26,9 @@ GcsThrottle::GcsThrottle(EnvTime* env_time)
 
 bool GcsThrottle::AdmitRequest() {
   mutex_lock l(mu_);
-  if (!config_.enabled) return true;
   UpdateState();
   if (available_tokens_ < config_.tokens_per_request) {
-    return false;
+    return false || !config_.enabled;
   }
   available_tokens_ -= config_.tokens_per_request;
   return true;
@@ -37,7 +36,6 @@ bool GcsThrottle::AdmitRequest() {
 
 void GcsThrottle::RecordResponse(size_t num_bytes) {
   mutex_lock l(mu_);
-  if (!config_.enabled) return;
   UpdateState();
   available_tokens_ -= request_bytes_to_tokens(num_bytes);
 }
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h
index 1a89daef084e921f1ad8bd856cefcc62d0d7aa1c..97a858e3fecfbbd5175db2da2057836c7dd9f482 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle.h
+++ b/tensorflow/core/platform/cloud/gcs_throttle.h
@@ -109,13 +109,24 @@ class GcsThrottle {
    * purpose of this function is to make available to monitoring or other
    * instrumentation the number of available tokens in the pool.
    */
-  inline int64 available_tokens() {
+  inline int64 available_tokens() LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
-    if (!config_.enabled) return 0;
     UpdateState();
     return available_tokens_;
   }
 
+  /**
+   * is_enabled determines if the throttle is enabled.
+   *
+   * If !is_enabled(), AdmitRequest() will always return true. To enable the
+   * throttle, call SetConfig passing in a configuration that has enabled set to
+   * true.
+   */
+  bool is_enabled() LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    return config_.enabled;
+  }
+
  private:
   /**
    * UpdateState updates the available_tokens_ and last_updated_secs_ variables.
diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
index 694756022e37263a07f8215bf7496c9ca130fd58..57193ac4057550463b6bea29089bdd545f2f0a33 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
@@ -96,6 +96,24 @@ TEST_F(GcsThrottleTest, ReverseTime) {
   EXPECT_EQ(200000, throttle_.available_tokens());
 }
 
+TEST(GcsThrottleDisabledTest, Disabled) {
+  TestTime time;
+  GcsThrottle throttle(&time);
+  ASSERT_FALSE(throttle.is_enabled());  // Verify throttle is disabled.
+
+  EXPECT_EQ(0, throttle.available_tokens());
+  time.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle.available_tokens());
+  EXPECT_TRUE(throttle.AdmitRequest());
+  EXPECT_EQ(99900, throttle.available_tokens());
+  time.AdvanceSeconds(1);
+  EXPECT_EQ(199900, throttle.available_tokens());
+  throttle.RecordResponse(128000000);  // 128 MB response.
+  EXPECT_LT(0, throttle.available_tokens());
+  // Admit request even without available tokens
+  EXPECT_TRUE(throttle.AdmitRequest());
+}
+
 }  // namespace
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index df8a5b86a0b9b3354514be69cb03dd6472e51e86..2343bca608a6bd812354d0e243429c67c261b3ed 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -47,6 +47,46 @@ class HttpRequest {
     virtual HttpRequest* Create() = 0;
   };
 
+  /// RequestMethod is used to capture what type of HTTP request is made and
+  /// is used in conjunction with RequestStats for instrumentation and
+  /// monitoring of HTTP requests and their responses.
+  enum class RequestMethod : char {
+    kGet,
+    kPost,
+    kPut,
+    kDelete,
+  };
+
+  /// RequestMethodName converts a RequestMethod to the canonical method string.
+  inline static const char* RequestMethodName(RequestMethod m) {
+    switch (m) {
+      case RequestMethod::kGet:
+        return "GET";
+      case RequestMethod::kPost:
+        return "POST";
+      case RequestMethod::kPut:
+        return "PUT";
+      case RequestMethod::kDelete:
+        return "DELETE";
+      default:
+        return "???";
+    }
+  }
+
+  /// RequestStats is a class that can be used to instrument an Http Request.
+  class RequestStats {
+   public:
+    virtual ~RequestStats() = default;
+
+    /// RecordRequest is called right before a request is sent on the wire.
+    virtual void RecordRequest(const HttpRequest* request, const string& uri,
+                               RequestMethod method) = 0;
+
+    /// RecordResponse is called after the response has been received.
+    virtual void RecordResponse(const HttpRequest* request, const string& uri,
+                                RequestMethod method, const Status& result) = 0;
+  };
+
   HttpRequest() {}
   virtual ~HttpRequest() {}
 
@@ -73,6 +113,9 @@ class HttpRequest {
   /// Sets the 'Authorization' header to the value of 'Bearer ' + auth_token.
   virtual void AddAuthBearerHeader(const string& auth_token) = 0;
 
+  /// Sets the RequestStats object to use to record the request and response.
+  virtual void SetRequestStats(RequestStats* stats) = 0;
+
   /// Makes the request a DELETE request.
   virtual void SetDeleteRequest() = 0;
 
diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/ram_file_block_cache.cc
similarity index 89%
rename from tensorflow/core/platform/cloud/file_block_cache.cc
rename to tensorflow/core/platform/cloud/ram_file_block_cache.cc
index 6add1142a15fb69044828bd82a6d6e838959de08..55a5657a503a334866cad737bb11fe505e59699a 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/cloud/file_block_cache.h"
+#include "tensorflow/core/platform/cloud/ram_file_block_cache.h"
 #include <cstring>
 #include <memory>
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-bool FileBlockCache::BlockNotStale(const std::shared_ptr<Block>& block) {
+bool RamFileBlockCache::BlockNotStale(const std::shared_ptr<Block>& block) {
   mutex_lock l(block->mu);
   if (block->state != FetchState::FINISHED) {
     return true;  // No need to check for staleness.
@@ -30,7 +30,8 @@ bool FileBlockCache::BlockNotStale(const std::shared_ptr<Block>& block) {
   return env_->NowSeconds() - block->timestamp <= max_staleness_;
 }
 
-std::shared_ptr<FileBlockCache::Block> FileBlockCache::Lookup(const Key& key) {
+std::shared_ptr<RamFileBlockCache::Block> RamFileBlockCache::Lookup(
+    const Key& key) {
   mutex_lock lock(mu_);
   auto entry = block_map_.find(key);
   if (entry != block_map_.end()) {
@@ -55,15 +56,15 @@ std::shared_ptr<FileBlockCache::Block> FileBlockCache::Lookup(const Key& key) {
 }
 
 // Remove blocks from the cache until we do not exceed our maximum size.
-void FileBlockCache::Trim() {
+void RamFileBlockCache::Trim() {
   while (!lru_list_.empty() && cache_size_ > max_bytes_) {
     RemoveBlock(block_map_.find(lru_list_.back()));
   }
 }
 
 /// Move the block to the front of the LRU list if it isn't already there.
-Status FileBlockCache::UpdateLRU(const Key& key,
-                                 const std::shared_ptr<Block>& block) {
+Status RamFileBlockCache::UpdateLRU(const Key& key,
+                                    const std::shared_ptr<Block>& block) {
   mutex_lock lock(mu_);
   if (block->timestamp == 0) {
     // The block was evicted from another thread. Allow it to remain evicted.
@@ -92,8 +93,8 @@ Status FileBlockCache::UpdateLRU(const Key& key,
   return Status::OK();
 }
 
-Status FileBlockCache::MaybeFetch(const Key& key,
-                                  const std::shared_ptr<Block>& block) {
+Status RamFileBlockCache::MaybeFetch(const Key& key,
+                                     const std::shared_ptr<Block>& block) {
   bool downloaded_block = false;
   auto reconcile_state =
       gtl::MakeCleanup([this, &downloaded_block, &key, &block] {
@@ -151,11 +152,11 @@ Status FileBlockCache::MaybeFetch(const Key& key,
     }
   }
   return errors::Internal(
-      "Control flow should never reach the end of FileBlockCache::Fetch.");
+      "Control flow should never reach the end of RamFileBlockCache::Fetch.");
 }
 
-Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
-                            char* buffer, size_t* bytes_transferred) {
+Status RamFileBlockCache::Read(const string& filename, size_t offset, size_t n,
+                               char* buffer, size_t* bytes_transferred) {
   *bytes_transferred = 0;
   if (n == 0) {
     return Status::OK();
@@ -216,12 +217,12 @@ Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
   return Status::OK();
 }
 
-size_t FileBlockCache::CacheSize() const {
+size_t RamFileBlockCache::CacheSize() const {
   mutex_lock lock(mu_);
   return cache_size_;
 }
 
-void FileBlockCache::Prune() {
+void RamFileBlockCache::Prune() {
   while (!WaitForNotificationWithTimeout(&stop_pruning_thread_, 1000000)) {
     mutex_lock lock(mu_);
     uint64 now = env_->NowSeconds();
@@ -238,7 +239,7 @@ void FileBlockCache::Prune() {
   }
 }
 
-void FileBlockCache::Flush() {
+void RamFileBlockCache::Flush() {
   mutex_lock lock(mu_);
   block_map_.clear();
   lru_list_.clear();
@@ -246,12 +247,12 @@ void FileBlockCache::Flush() {
   cache_size_ = 0;
 }
 
-void FileBlockCache::RemoveFile(const string& filename) {
+void RamFileBlockCache::RemoveFile(const string& filename) {
   mutex_lock lock(mu_);
   RemoveFile_Locked(filename);
 }
 
-void FileBlockCache::RemoveFile_Locked(const string& filename) {
+void RamFileBlockCache::RemoveFile_Locked(const string& filename) {
   Key begin = std::make_pair(filename, 0);
   auto it = block_map_.lower_bound(begin);
   while (it != block_map_.end() && it->first.first == filename) {
@@ -261,7 +262,7 @@ void FileBlockCache::RemoveFile_Locked(const string& filename) {
   }
 }
 
-void FileBlockCache::RemoveBlock(BlockMap::iterator entry) {
+void RamFileBlockCache::RemoveBlock(BlockMap::iterator entry) {
   // This signals that the block is removed, and should not be inadvertently
   // reinserted into the cache in UpdateLRU.
   entry->second->timestamp = 0;
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache.h b/tensorflow/core/platform/cloud/ram_file_block_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fdd7b2e0294e1cf289a77464fb60e08bdb28da7
--- /dev/null
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache.h
@@ -0,0 +1,229 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_RAM_FILE_BLOCK_CACHE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_RAM_FILE_BLOCK_CACHE_H_
+
+#include <functional>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/cloud/file_block_cache.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+/// \brief An LRU block cache of file contents, keyed by {filename, offset}.
+///
+/// This class should be shared by read-only random access files on a remote
+/// filesystem (e.g. GCS).
+class RamFileBlockCache : public FileBlockCache {
+ public:
+  /// The callback executed when a block is not found in the cache, and needs to
+  /// be fetched from the backing filesystem. This callback is provided when the
+  /// cache is constructed. The returned Status should be OK as long as the
+  /// read from the remote filesystem succeeded (similar to the semantics of the
+  /// read(2) system call).
+  typedef std::function<Status(const string& filename, size_t offset,
+                               size_t buffer_size, char* buffer,
+                               size_t* bytes_transferred)>
+      BlockFetcher;
+
+  RamFileBlockCache(size_t block_size, size_t max_bytes, uint64 max_staleness,
+                    BlockFetcher block_fetcher, Env* env = Env::Default())
+      : block_size_(block_size),
+        max_bytes_(max_bytes),
+        max_staleness_(max_staleness),
+        block_fetcher_(block_fetcher),
+        env_(env) {
+    if (max_staleness_ > 0) {
+      pruning_thread_.reset(env_->StartThread(ThreadOptions(), "TF_prune_FBC",
+                                              [this] { Prune(); }));
+    }
+  }
+
+  ~RamFileBlockCache() override {
+    if (pruning_thread_) {
+      stop_pruning_thread_.Notify();
+      // Destroying pruning_thread_ will block until Prune() receives the above
+      // notification and returns.
+      pruning_thread_.reset();
+    }
+  }
+
+  /// Read `n` bytes from `filename` starting at `offset` into `out`. This
+  /// method will return:
+  ///
+  /// 1) The error from the remote filesystem, if the read from the remote
+  ///    filesystem failed.
+  /// 2) PRECONDITION_FAILED if the read from the remote filesystem succeeded,
+  ///    but the read returned a partial block, and the LRU cache contained a
+  ///    block at a higher offset (indicating that the partial block should have
+  ///    been a full block).
+  /// 3) OUT_OF_RANGE if the read from the remote filesystem succeeded, but
+  ///    the file contents do not extend past `offset` and thus nothing was
+  ///    placed in `out`.
+  /// 4) OK otherwise (i.e. the read succeeded, and at least one byte was placed
+  ///    in `out`).
+  Status Read(const string& filename, size_t offset, size_t n, char* buffer,
+              size_t* bytes_transferred) override;
+
+  /// Remove all cached blocks for `filename`.
+  void RemoveFile(const string& filename) override LOCKS_EXCLUDED(mu_);
+
+  /// Remove all cached data.
+  void Flush() LOCKS_EXCLUDED(mu_) override;
+
+  /// Accessors for cache parameters.
+  size_t block_size() const override { return block_size_; }
+  size_t max_bytes() const override { return max_bytes_; }
+  uint64 max_staleness() const override { return max_staleness_; }
+
+  /// The current size (in bytes) of the cache.
+  size_t CacheSize() const override LOCKS_EXCLUDED(mu_);
+
+ private:
+  /// The size of the blocks stored in the LRU cache, as well as the size of the
+  /// reads from the underlying filesystem.
+  const size_t block_size_;
+  /// The maximum number of bytes (sum of block sizes) allowed in the LRU cache.
+  const size_t max_bytes_;
+  /// The maximum staleness of any block in the LRU cache, in seconds.
+  const uint64 max_staleness_;
+  /// The callback to read a block from the underlying filesystem.
+  const BlockFetcher block_fetcher_;
+  /// The Env from which we read timestamps.
+  Env* const env_;  // not owned
+
+  /// \brief The key type for the file block cache.
+  ///
+  /// The file block cache key is a {filename, offset} pair.
+  typedef std::pair<string, size_t> Key;
+
+  /// \brief The state of a block.
+  ///
+  /// A block begins in the CREATED stage. The first thread will attempt to read
+  /// the block from the filesystem, transitioning the state of the block to
+  /// FETCHING. After completing, if the read was successful the state should
+  /// be FINISHED. Otherwise the state should be ERROR. A subsequent read can
+  /// re-fetch the block if the state is ERROR.
+  enum class FetchState {
+    CREATED,
+    FETCHING,
+    FINISHED,
+    ERROR,
+  };
+
+  /// \brief A block of a file.
+  ///
+  /// A file block consists of the block data, the block's current position in
+  /// the LRU cache, the timestamp (seconds since epoch) at which the block
+  /// was cached, a coordination lock, and state & condition variables.
+  ///
+  /// Thread safety:
+  /// The iterator and timestamp fields should only be accessed while holding
+  /// the block-cache-wide mu_ instance variable. The state variable should only
+  /// be accessed while holding the Block's mu lock. The data vector should only
+  /// be accessed after state == FINISHED, and it should never be modified.
+  ///
+  /// In order to prevent deadlocks, never grab the block-cache-wide mu_ lock
+  /// AFTER grabbing any block's mu lock. It is safe to grab mu without locking
+  /// mu_.
+  struct Block {
+    /// The block data.
+    std::vector<char> data;
+    /// A list iterator pointing to the block's position in the LRU list.
+    std::list<Key>::iterator lru_iterator;
+    /// A list iterator pointing to the block's position in the LRA list.
+    std::list<Key>::iterator lra_iterator;
+    /// The timestamp (seconds since epoch) at which the block was cached.
+    uint64 timestamp;
+    /// Mutex to guard state variable
+    mutex mu;
+    /// The state of the block.
+    FetchState state GUARDED_BY(mu) = FetchState::CREATED;
+    /// Wait on cond_var if state is FETCHING.
+    condition_variable cond_var;
+  };
+
+  /// \brief The block map type for the file block cache.
+  ///
+  /// The block map is an ordered map from Key to Block.
+  typedef std::map<Key, std::shared_ptr<Block>> BlockMap;
+
+  /// Prune the cache by removing files with expired blocks.
+  void Prune() LOCKS_EXCLUDED(mu_);
+
+  bool BlockNotStale(const std::shared_ptr<Block>& block)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  /// Look up a Key in the block cache.
+  std::shared_ptr<Block> Lookup(const Key& key) LOCKS_EXCLUDED(mu_);
+
+  Status MaybeFetch(const Key& key, const std::shared_ptr<Block>& block)
+      LOCKS_EXCLUDED(mu_);
+
+  /// Trim the block cache to make room for another entry.
+  void Trim() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  /// Update the LRU iterator for the block at `key`.
+  Status UpdateLRU(const Key& key, const std::shared_ptr<Block>& block)
+      LOCKS_EXCLUDED(mu_);
+
+  /// Remove all blocks of a file, with mu_ already held.
+  void RemoveFile_Locked(const string& filename) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  /// Remove the block `entry` from the block map and LRU list, and update the
+  /// cache size accordingly.
+  void RemoveBlock(BlockMap::iterator entry) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  /// The cache pruning thread that removes files with expired blocks.
+  std::unique_ptr<Thread> pruning_thread_;
+
+  /// Notification for stopping the cache pruning thread.
+  Notification stop_pruning_thread_;
+
+  /// Guards access to the block map, LRU list, and cached byte count.
+  mutable mutex mu_;
+
+  /// The block map (map from Key to Block).
+  BlockMap block_map_ GUARDED_BY(mu_);
+
+  /// The LRU list of block keys. The front of the list identifies the most
+  /// recently accessed block.
+  std::list<Key> lru_list_ GUARDED_BY(mu_);
+
+  /// The LRA (least recently added) list of block keys. The front of the list
+  /// identifies the most recently added block.
+  ///
+  /// Note: blocks are added to lra_list_ only after they have successfully been
+  /// fetched from the underlying block store.
+  std::list<Key> lra_list_ GUARDED_BY(mu_);
+
+  /// The combined number of bytes in all of the cached blocks.
+  size_t cache_size_ GUARDED_BY(mu_) = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_RAM_FILE_BLOCK_CACHE_H_
diff --git a/tensorflow/core/platform/cloud/file_block_cache_test.cc b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
similarity index 91%
rename from tensorflow/core/platform/cloud/file_block_cache_test.cc
rename to tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
index 596fdbf19eb03a70c5659d392db368b3cdb791fe..10203783fcbe06fefeb1c7451ffbb20045cc421a 100644
--- a/tensorflow/core/platform/cloud/file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/cloud/file_block_cache.h"
+#include "tensorflow/core/platform/cloud/ram_file_block_cache.h"
 #include <cstring>
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,8 +25,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-Status ReadCache(FileBlockCache* cache, const string& filename, size_t offset,
-                 size_t n, std::vector<char>* out) {
+Status ReadCache(RamFileBlockCache* cache, const string& filename,
+                 size_t offset, size_t n, std::vector<char>* out) {
   out->clear();
   out->resize(n, 0);
   size_t bytes_transferred = 0;
@@ -37,7 +37,7 @@ Status ReadCache(FileBlockCache* cache, const string& filename, size_t offset,
   return status;
 }
 
-TEST(FileBlockCacheTest, PassThrough) {
+TEST(RamFileBlockCacheTest, PassThrough) {
   const string want_filename = "foo/bar";
   const size_t want_offset = 42;
   const size_t want_n = 1024;
@@ -54,9 +54,9 @@ TEST(FileBlockCacheTest, PassThrough) {
     return Status::OK();
   };
   // If block_size, max_bytes, or both are zero, the cache is a pass-through.
-  FileBlockCache cache1(1, 0, 0, fetcher);
-  FileBlockCache cache2(0, 1, 0, fetcher);
-  FileBlockCache cache3(0, 0, 0, fetcher);
+  RamFileBlockCache cache1(1, 0, 0, fetcher);
+  RamFileBlockCache cache2(0, 1, 0, fetcher);
+  RamFileBlockCache cache3(0, 0, 0, fetcher);
   std::vector<char> out;
   TF_EXPECT_OK(ReadCache(&cache1, want_filename, want_offset, want_n, &out));
   EXPECT_EQ(calls, 1);
@@ -66,7 +66,7 @@ TEST(FileBlockCacheTest, PassThrough) {
   EXPECT_EQ(calls, 3);
 }
 
-TEST(FileBlockCacheTest, BlockAlignment) {
+TEST(RamFileBlockCacheTest, BlockAlignment) {
   // Initialize a 256-byte buffer.  This is the file underlying the reads we'll
   // do in this test.
   const size_t size = 256;
@@ -89,7 +89,7 @@ TEST(FileBlockCacheTest, BlockAlignment) {
   for (size_t block_size = 2; block_size <= 4; block_size++) {
     // Make a cache of N-byte block size (1 block) and verify that reads of
     // varying offsets and lengths return correct data.
-    FileBlockCache cache(block_size, block_size, 0, fetcher);
+    RamFileBlockCache cache(block_size, block_size, 0, fetcher);
     for (size_t offset = 0; offset < 10; offset++) {
       for (size_t n = block_size - 2; n <= block_size + 2; n++) {
         std::vector<char> got;
@@ -117,7 +117,7 @@ TEST(FileBlockCacheTest, BlockAlignment) {
   }
 }
 
-TEST(FileBlockCacheTest, CacheHits) {
+TEST(RamFileBlockCacheTest, CacheHits) {
   const size_t block_size = 16;
   std::set<size_t> calls;
   auto fetcher = [&calls, block_size](const string& filename, size_t offset,
@@ -132,7 +132,7 @@ TEST(FileBlockCacheTest, CacheHits) {
     return Status::OK();
   };
   const uint32 block_count = 256;
-  FileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
+  RamFileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
   std::vector<char> out;
   out.resize(block_count, 0);
   // The cache has space for `block_count` blocks. The loop with i = 0 should
@@ -146,7 +146,7 @@ TEST(FileBlockCacheTest, CacheHits) {
   }
 }
 
-TEST(FileBlockCacheTest, OutOfRange) {
+TEST(RamFileBlockCacheTest, OutOfRange) {
   // Tests reads of a 24-byte file with block size 16.
   const size_t block_size = 16;
   const size_t file_size = 24;
@@ -172,7 +172,7 @@ TEST(FileBlockCacheTest, OutOfRange) {
     *bytes_transferred = bytes_to_copy;
     return Status::OK();
   };
-  FileBlockCache cache(block_size, block_size, 0, fetcher);
+  RamFileBlockCache cache(block_size, block_size, 0, fetcher);
   std::vector<char> out;
   // Reading the first 16 bytes should be fine.
   TF_EXPECT_OK(ReadCache(&cache, "", 0, block_size, &out));
@@ -191,7 +191,7 @@ TEST(FileBlockCacheTest, OutOfRange) {
   EXPECT_EQ(out.size(), file_size - block_size);
 }
 
-TEST(FileBlockCacheTest, Inconsistent) {
+TEST(RamFileBlockCacheTest, Inconsistent) {
   // Tests the detection of interrupted reads leading to partially filled blocks
   // where we expected complete blocks.
   const size_t block_size = 16;
@@ -205,7 +205,7 @@ TEST(FileBlockCacheTest, Inconsistent) {
     *bytes_transferred = 1;
     return Status::OK();
   };
-  FileBlockCache cache(block_size, 2 * block_size, 0, fetcher);
+  RamFileBlockCache cache(block_size, 2 * block_size, 0, fetcher);
   std::vector<char> out;
   // Read the second block; this should yield an OK status and a single byte.
   TF_EXPECT_OK(ReadCache(&cache, "", block_size, block_size, &out));
@@ -216,7 +216,7 @@ TEST(FileBlockCacheTest, Inconsistent) {
   EXPECT_EQ(status.code(), error::INTERNAL);
 }
 
-TEST(FileBlockCacheTest, LRU) {
+TEST(RamFileBlockCacheTest, LRU) {
   const size_t block_size = 16;
   std::list<size_t> calls;
   auto fetcher = [&calls, block_size](const string& filename, size_t offset,
@@ -233,7 +233,7 @@ TEST(FileBlockCacheTest, LRU) {
     return Status::OK();
   };
   const uint32 block_count = 2;
-  FileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
+  RamFileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
   std::vector<char> out;
   // Read blocks from the cache, and verify the LRU behavior based on the
   // fetcher calls that the cache makes.
@@ -265,7 +265,7 @@ TEST(FileBlockCacheTest, LRU) {
   TF_EXPECT_OK(ReadCache(&cache, "", 0, 1, &out));
 }
 
-TEST(FileBlockCacheTest, MaxStaleness) {
+TEST(RamFileBlockCacheTest, MaxStaleness) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
                           char* buffer, size_t* bytes_transferred) {
@@ -278,7 +278,7 @@ TEST(FileBlockCacheTest, MaxStaleness) {
   std::unique_ptr<NowSecondsEnv> env(new NowSecondsEnv);
   // Create a cache with max staleness of 2 seconds, and verify that it works as
   // expected.
-  FileBlockCache cache1(8, 16, 2 /* max staleness */, fetcher, env.get());
+  RamFileBlockCache cache1(8, 16, 2 /* max staleness */, fetcher, env.get());
   // Execute the first read to load the block.
   TF_EXPECT_OK(ReadCache(&cache1, "", 0, 1, &out));
   EXPECT_EQ(calls, 1);
@@ -294,7 +294,7 @@ TEST(FileBlockCacheTest, MaxStaleness) {
   // as expected.
   calls = 0;
   env->SetNowSeconds(0);
-  FileBlockCache cache2(8, 16, 0 /* max staleness */, fetcher, env.get());
+  RamFileBlockCache cache2(8, 16, 0 /* max staleness */, fetcher, env.get());
   // Execute the first read to load the block.
   TF_EXPECT_OK(ReadCache(&cache2, "", 0, 1, &out));
   EXPECT_EQ(calls, 1);
@@ -305,7 +305,7 @@ TEST(FileBlockCacheTest, MaxStaleness) {
   EXPECT_EQ(calls, 1);
 }
 
-TEST(FileBlockCacheTest, RemoveFile) {
+TEST(RamFileBlockCacheTest, RemoveFile) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
                           char* buffer, size_t* bytes_transferred) {
@@ -321,7 +321,7 @@ TEST(FileBlockCacheTest, RemoveFile) {
   };
   // This cache has space for 4 blocks; we'll read from two files.
   const size_t n = 3;
-  FileBlockCache cache(8, 32, 0, fetcher);
+  RamFileBlockCache cache(8, 32, 0, fetcher);
   std::vector<char> out;
   std::vector<char> a(n, 'a');
   std::vector<char> b(n, 'b');
@@ -367,7 +367,7 @@ TEST(FileBlockCacheTest, RemoveFile) {
   EXPECT_EQ(calls, 6);
 }
 
-TEST(FileBlockCacheTest, Prune) {
+TEST(RamFileBlockCacheTest, Prune) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
                           char* buffer, size_t* bytes_transferred) {
@@ -381,7 +381,7 @@ TEST(FileBlockCacheTest, Prune) {
   std::unique_ptr<NowSecondsEnv> env(new NowSecondsEnv);
   uint64 now = Env::Default()->NowSeconds();
   env->SetNowSeconds(now);
-  FileBlockCache cache(8, 32, 1 /* max staleness */, fetcher, env.get());
+  RamFileBlockCache cache(8, 32, 1 /* max staleness */, fetcher, env.get());
   // Read three blocks into the cache, and advance the timestamp by one second
   // with each read. Start with a block of "a" at the current timestamp `now`.
   TF_EXPECT_OK(ReadCache(&cache, "a", 0, 1, &out));
@@ -426,7 +426,7 @@ TEST(FileBlockCacheTest, Prune) {
   EXPECT_EQ(cache.CacheSize(), 0);
 }
 
-TEST(FileBlockCacheTest, ParallelReads) {
+TEST(RamFileBlockCacheTest, ParallelReads) {
   // This fetcher won't respond until either `callers` threads are calling it
   // concurrently (at which point it will respond with success to all callers),
   // or 10 seconds have elapsed (at which point it will respond with an error).
@@ -444,7 +444,7 @@ TEST(FileBlockCacheTest, ParallelReads) {
     return Status::OK();
   };
   const int block_size = 8;
-  FileBlockCache cache(block_size, 2 * callers * block_size, 0, fetcher);
+  RamFileBlockCache cache(block_size, 2 * callers * block_size, 0, fetcher);
   std::vector<std::unique_ptr<Thread>> threads;
   for (int i = 0; i < callers; i++) {
     threads.emplace_back(
@@ -461,7 +461,7 @@ TEST(FileBlockCacheTest, ParallelReads) {
   // executed, or 10 seconds have passed).
 }
 
-TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
+TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) {
   // Concurrent reads to the same file blocks should be de-duplicated.
   const size_t block_size = 16;
   int num_requests = 0;
@@ -479,7 +479,7 @@ TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
     Env::Default()->SleepForMicroseconds(100000);  // 0.1 secs
     return Status::OK();
   };
-  FileBlockCache cache(block_size, block_size, 0, fetcher);
+  RamFileBlockCache cache(block_size, block_size, 0, fetcher);
   // Fork off thread for parallel read.
   std::unique_ptr<Thread> concurrent(
       Env::Default()->StartThread({}, "concurrent", [&cache, block_size] {
@@ -487,8 +487,7 @@ TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
         TF_EXPECT_OK(ReadCache(&cache, "", 0, block_size / 2, &out));
         EXPECT_EQ(out.size(), block_size / 2);
       }));
-  EXPECT_TRUE(WaitForNotificationWithTimeout(&notification, 10000))
-      << "Timeout waiting for concurrent thread to start.";
+  notification.WaitForNotification();
   std::vector<char> out;
   TF_EXPECT_OK(ReadCache(&cache, "", block_size / 2, block_size / 2, &out));
   EXPECT_EQ(out.size(), block_size / 2);
@@ -496,7 +495,7 @@ TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
   EXPECT_EQ(1, num_requests);
 }
 
-TEST(FileBlockCacheTest, Flush) {
+TEST(RamFileBlockCacheTest, Flush) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
                           char* buffer, size_t* bytes_transferred) {
@@ -505,7 +504,7 @@ TEST(FileBlockCacheTest, Flush) {
     *bytes_transferred = n;
     return Status::OK();
   };
-  FileBlockCache cache(16, 32, 0, fetcher);
+  RamFileBlockCache cache(16, 32, 0, fetcher);
   std::vector<char> out;
   TF_EXPECT_OK(ReadCache(&cache, "", 0, 16, &out));
   TF_EXPECT_OK(ReadCache(&cache, "", 0, 16, &out));
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc
deleted file mode 100644
index be9ebe67b18e7be76e95149258cb1fcce6047d85..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/cloud/retrying_file_system.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/cloud/retrying_file_system.h"
-#include <functional>
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/platform/cloud/retrying_utils.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/file_system.h"
-
-namespace tensorflow {
-
-namespace {
-
-class RetryingRandomAccessFile : public RandomAccessFile {
- public:
-  RetryingRandomAccessFile(std::unique_ptr<RandomAccessFile> base_file,
-                           int64 delay_microseconds)
-      : base_file_(std::move(base_file)),
-        initial_delay_microseconds_(delay_microseconds) {}
-
-  Status Read(uint64 offset, size_t n, StringPiece* result,
-              char* scratch) const override {
-    return RetryingUtils::CallWithRetries(
-        std::bind(&RandomAccessFile::Read, base_file_.get(), offset, n, result,
-                  scratch),
-        initial_delay_microseconds_);
-  }
-
- private:
-  std::unique_ptr<RandomAccessFile> base_file_;
-  const int64 initial_delay_microseconds_;
-};
-
-class RetryingWritableFile : public WritableFile {
- public:
-  RetryingWritableFile(std::unique_ptr<WritableFile> base_file,
-                       int64 delay_microseconds)
-      : base_file_(std::move(base_file)),
-        initial_delay_microseconds_(delay_microseconds) {}
-
-  ~RetryingWritableFile() override {
-    // Makes sure the retrying version of Close() is called in the destructor.
-    Close().IgnoreError();
-  }
-
-  Status Append(const StringPiece& data) override {
-    return RetryingUtils::CallWithRetries(
-        std::bind(&WritableFile::Append, base_file_.get(), data),
-        initial_delay_microseconds_);
-  }
-  Status Close() override {
-    return RetryingUtils::CallWithRetries(
-        std::bind(&WritableFile::Close, base_file_.get()),
-        initial_delay_microseconds_);
-  }
-  Status Flush() override {
-    return RetryingUtils::CallWithRetries(
-        std::bind(&WritableFile::Flush, base_file_.get()),
-        initial_delay_microseconds_);
-  }
-  Status Sync() override {
-    return RetryingUtils::CallWithRetries(
-        std::bind(&WritableFile::Sync, base_file_.get()),
-        initial_delay_microseconds_);
-  }
-
- private:
-  std::unique_ptr<WritableFile> base_file_;
-  const int64 initial_delay_microseconds_;
-};
-
-}  // namespace
-
-Status RetryingFileSystem::NewRandomAccessFile(
-    const string& filename, std::unique_ptr<RandomAccessFile>* result) {
-  std::unique_ptr<RandomAccessFile> base_file;
-  TF_RETURN_IF_ERROR(RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::NewRandomAccessFile, base_file_system_.get(),
-                filename, &base_file),
-      initial_delay_microseconds_));
-  result->reset(new RetryingRandomAccessFile(std::move(base_file),
-                                             initial_delay_microseconds_));
-  return Status::OK();
-}
-
-Status RetryingFileSystem::NewWritableFile(
-    const string& filename, std::unique_ptr<WritableFile>* result) {
-  std::unique_ptr<WritableFile> base_file;
-  TF_RETURN_IF_ERROR(RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::NewWritableFile, base_file_system_.get(), filename,
-                &base_file),
-      initial_delay_microseconds_));
-  result->reset(new RetryingWritableFile(std::move(base_file),
-                                         initial_delay_microseconds_));
-  return Status::OK();
-}
-
-Status RetryingFileSystem::NewAppendableFile(
-    const string& filename, std::unique_ptr<WritableFile>* result) {
-  std::unique_ptr<WritableFile> base_file;
-  TF_RETURN_IF_ERROR(RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::NewAppendableFile, base_file_system_.get(),
-                filename, &base_file),
-      initial_delay_microseconds_));
-  result->reset(new RetryingWritableFile(std::move(base_file),
-                                         initial_delay_microseconds_));
-  return Status::OK();
-}
-
-Status RetryingFileSystem::NewReadOnlyMemoryRegionFromFile(
-    const string& filename, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::NewReadOnlyMemoryRegionFromFile,
-                base_file_system_.get(), filename, result),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::FileExists(const string& fname) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::FileExists, base_file_system_.get(), fname),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::Stat(const string& fname, FileStatistics* stat) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::Stat, base_file_system_.get(), fname, stat),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::GetChildren(const string& dir,
-                                       std::vector<string>* result) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::GetChildren, base_file_system_.get(), dir, result),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::GetMatchingPaths(const string& pattern,
-                                            std::vector<string>* result) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::GetMatchingPaths, base_file_system_.get(), pattern,
-                result),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::DeleteFile(const string& fname) {
-  return RetryingUtils::DeleteWithRetries(
-      std::bind(&FileSystem::DeleteFile, base_file_system_.get(), fname),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::CreateDir(const string& dirname) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::CreateDir, base_file_system_.get(), dirname),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::DeleteDir(const string& dirname) {
-  return RetryingUtils::DeleteWithRetries(
-      std::bind(&FileSystem::DeleteDir, base_file_system_.get(), dirname),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::GetFileSize(const string& fname, uint64* file_size) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::GetFileSize, base_file_system_.get(), fname,
-                file_size),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::RenameFile(const string& src, const string& target) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::RenameFile, base_file_system_.get(), src, target),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::IsDirectory(const string& dirname) {
-  return RetryingUtils::CallWithRetries(
-      std::bind(&FileSystem::IsDirectory, base_file_system_.get(), dirname),
-      initial_delay_microseconds_);
-}
-
-Status RetryingFileSystem::DeleteRecursively(const string& dirname,
-                                             int64* undeleted_files,
-                                             int64* undeleted_dirs) {
-  return RetryingUtils::DeleteWithRetries(
-      std::bind(&FileSystem::DeleteRecursively, base_file_system_.get(),
-                dirname, undeleted_files, undeleted_dirs),
-      initial_delay_microseconds_);
-}
-
-void RetryingFileSystem::FlushCaches() { base_file_system_->FlushCaches(); }
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index a262a5fd940f9b269721790c80caaef38d79d690..399a21617eedf2625c284f41b92deba2efa4ca18 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -16,17 +16,24 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_RETRYING_FILE_SYSTEM_H_
 #define TENSORFLOW_CORE_PLATFORM_RETRYING_FILE_SYSTEM_H_
 
+#include <functional>
 #include <string>
 #include <vector>
+
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/cloud/retrying_utils.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
 
 /// A wrapper to add retry logic to another file system.
+template <typename Underlying>
 class RetryingFileSystem : public FileSystem {
  public:
-  RetryingFileSystem(std::unique_ptr<FileSystem> base_file_system,
+  RetryingFileSystem(std::unique_ptr<Underlying> base_file_system,
                      int64 delay_microseconds = 1000000)
       : base_file_system_(std::move(base_file_system)),
         initial_delay_microseconds_(delay_microseconds) {}
@@ -45,39 +52,200 @@ class RetryingFileSystem : public FileSystem {
       const string& filename,
       std::unique_ptr<ReadOnlyMemoryRegion>* result) override;
 
-  Status FileExists(const string& fname) override;
+  Status FileExists(const string& fname) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::FileExists, base_file_system_.get(), fname),
+        initial_delay_microseconds_);
+  }
+
+  Status GetChildren(const string& dir, std::vector<string>* result) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::GetChildren, base_file_system_.get(), dir,
+                  result),
+        initial_delay_microseconds_);
+  }
+
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* result) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::GetMatchingPaths, base_file_system_.get(),
+                  pattern, result),
+        initial_delay_microseconds_);
+  }
+
+  Status Stat(const string& fname, FileStatistics* stat) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::Stat, base_file_system_.get(), fname, stat),
+        initial_delay_microseconds_);
+  }
+
+  Status DeleteFile(const string& fname) override {
+    return RetryingUtils::DeleteWithRetries(
+        std::bind(&FileSystem::DeleteFile, base_file_system_.get(), fname),
+        initial_delay_microseconds_);
+  }
+
+  Status CreateDir(const string& dirname) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::CreateDir, base_file_system_.get(), dirname),
+        initial_delay_microseconds_);
+  }
+
+  Status DeleteDir(const string& dirname) override {
+    return RetryingUtils::DeleteWithRetries(
+        std::bind(&FileSystem::DeleteDir, base_file_system_.get(), dirname),
+        initial_delay_microseconds_);
+  }
+
+  Status GetFileSize(const string& fname, uint64* file_size) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::GetFileSize, base_file_system_.get(), fname,
+                  file_size),
+        initial_delay_microseconds_);
+  }
+
+  Status RenameFile(const string& src, const string& target) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::RenameFile, base_file_system_.get(), src,
+                  target),
+        initial_delay_microseconds_);
+  }
+
+  Status IsDirectory(const string& dirname) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&FileSystem::IsDirectory, base_file_system_.get(), dirname),
+        initial_delay_microseconds_);
+  }
 
-  Status GetChildren(const string& dir, std::vector<string>* result) override;
+  Status DeleteRecursively(const string& dirname, int64* undeleted_files,
+                           int64* undeleted_dirs) override {
+    return RetryingUtils::DeleteWithRetries(
+        std::bind(&FileSystem::DeleteRecursively, base_file_system_.get(),
+                  dirname, undeleted_files, undeleted_dirs),
+        initial_delay_microseconds_);
+  }
 
-  Status GetMatchingPaths(const string& dir,
-                          std::vector<string>* result) override;
+  void FlushCaches() override { base_file_system_->FlushCaches(); }
 
-  Status Stat(const string& fname, FileStatistics* stat) override;
+  Underlying* underlying() const { return base_file_system_.get(); }
 
-  Status DeleteFile(const string& fname) override;
+ private:
+  std::unique_ptr<Underlying> base_file_system_;
+  const int64 initial_delay_microseconds_;
 
-  Status CreateDir(const string& dirname) override;
+  TF_DISALLOW_COPY_AND_ASSIGN(RetryingFileSystem);
+};
 
-  Status DeleteDir(const string& dirname) override;
+namespace retrying_internals {
 
-  Status GetFileSize(const string& fname, uint64* file_size) override;
+class RetryingRandomAccessFile : public RandomAccessFile {
+ public:
+  RetryingRandomAccessFile(std::unique_ptr<RandomAccessFile> base_file,
+                           int64 delay_microseconds)
+      : base_file_(std::move(base_file)),
+        initial_delay_microseconds_(delay_microseconds) {}
 
-  Status RenameFile(const string& src, const string& target) override;
+  Status Read(uint64 offset, size_t n, StringPiece* result,
+              char* scratch) const override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&RandomAccessFile::Read, base_file_.get(), offset, n, result,
+                  scratch),
+        initial_delay_microseconds_);
+  }
 
-  Status IsDirectory(const string& dir) override;
+ private:
+  std::unique_ptr<RandomAccessFile> base_file_;
+  const int64 initial_delay_microseconds_;
+};
 
-  Status DeleteRecursively(const string& dirname, int64* undeleted_files,
-                           int64* undeleted_dirs) override;
+class RetryingWritableFile : public WritableFile {
+ public:
+  RetryingWritableFile(std::unique_ptr<WritableFile> base_file,
+                       int64 delay_microseconds)
+      : base_file_(std::move(base_file)),
+        initial_delay_microseconds_(delay_microseconds) {}
 
-  void FlushCaches() override;
+  ~RetryingWritableFile() override {
+    // Makes sure the retrying version of Close() is called in the destructor.
+    Close().IgnoreError();
+  }
+
+  Status Append(const StringPiece& data) override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&WritableFile::Append, base_file_.get(), data),
+        initial_delay_microseconds_);
+  }
+  Status Close() override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&WritableFile::Close, base_file_.get()),
+        initial_delay_microseconds_);
+  }
+  Status Flush() override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&WritableFile::Flush, base_file_.get()),
+        initial_delay_microseconds_);
+  }
+  Status Sync() override {
+    return RetryingUtils::CallWithRetries(
+        std::bind(&WritableFile::Sync, base_file_.get()),
+        initial_delay_microseconds_);
+  }
 
  private:
-  std::unique_ptr<FileSystem> base_file_system_;
+  std::unique_ptr<WritableFile> base_file_;
   const int64 initial_delay_microseconds_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(RetryingFileSystem);
 };
 
+}  // namespace retrying_internals
+
+template <typename Underlying>
+Status RetryingFileSystem<Underlying>::NewRandomAccessFile(
+    const string& filename, std::unique_ptr<RandomAccessFile>* result) {
+  std::unique_ptr<RandomAccessFile> base_file;
+  TF_RETURN_IF_ERROR(RetryingUtils::CallWithRetries(
+      std::bind(&FileSystem::NewRandomAccessFile, base_file_system_.get(),
+                filename, &base_file),
+      initial_delay_microseconds_));
+  result->reset(new retrying_internals::RetryingRandomAccessFile(
+      std::move(base_file), initial_delay_microseconds_));
+  return Status::OK();
+}
+
+template <typename Underlying>
+Status RetryingFileSystem<Underlying>::NewWritableFile(
+    const string& filename, std::unique_ptr<WritableFile>* result) {
+  std::unique_ptr<WritableFile> base_file;
+  TF_RETURN_IF_ERROR(RetryingUtils::CallWithRetries(
+      std::bind(&FileSystem::NewWritableFile, base_file_system_.get(), filename,
+                &base_file),
+      initial_delay_microseconds_));
+  result->reset(new retrying_internals::RetryingWritableFile(
+      std::move(base_file), initial_delay_microseconds_));
+  return Status::OK();
+}
+
+template <typename Underlying>
+Status RetryingFileSystem<Underlying>::NewAppendableFile(
+    const string& filename, std::unique_ptr<WritableFile>* result) {
+  std::unique_ptr<WritableFile> base_file;
+  TF_RETURN_IF_ERROR(RetryingUtils::CallWithRetries(
+      std::bind(&FileSystem::NewAppendableFile, base_file_system_.get(),
+                filename, &base_file),
+      initial_delay_microseconds_));
+  result->reset(new retrying_internals::RetryingWritableFile(
+      std::move(base_file), initial_delay_microseconds_));
+  return Status::OK();
+}
+
+template <typename Underlying>
+Status RetryingFileSystem<Underlying>::NewReadOnlyMemoryRegionFromFile(
+    const string& filename, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
+  return RetryingUtils::CallWithRetries(
+      std::bind(&FileSystem::NewReadOnlyMemoryRegionFromFile,
+                base_file_system_.get(), filename, result),
+      initial_delay_microseconds_);
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PLATFORM_RETRYING_FILE_SYSTEM_H_
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index d3f763bb3c845436e8458135a0a754d8cb002957..ec2c470db797a137438be98493edfec08f212373 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -183,7 +184,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -210,7 +211,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_SuccessWith3rdTry) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -234,7 +235,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -245,7 +246,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
   char scratch[10];
   const auto& status = random_access_file->Read(0, 10, &result, scratch);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -264,7 +265,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_NoRetriesForSomeErrors) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->random_access_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped random access file.
   std::unique_ptr<RandomAccessFile> random_access_file;
@@ -290,7 +291,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_ImmediateSuccess) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -316,7 +317,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_SuccessWith3rdTry) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -342,7 +343,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_SuccessWith3rdTry_ViaDestructor) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -367,7 +368,7 @@ TEST(RetryingFileSystemTest, NewAppendableFile_SuccessWith3rdTry) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped appendable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -390,7 +391,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
   base_fs->writable_file_to_return = std::move(base_file);
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   // Retrieve the wrapped writable file.
   std::unique_ptr<WritableFile> writable_file;
@@ -399,7 +400,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
   // Use it and check the results.
   const auto& status = writable_file->Sync();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -411,7 +412,7 @@ TEST(RetryingFileSystemTest,
        std::make_tuple("NewReadOnlyMemoryRegionFromFile", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::unique_ptr<ReadOnlyMemoryRegion> result;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result));
@@ -422,13 +423,13 @@ TEST(RetryingFileSystemTest, NewReadOnlyMemoryRegionFromFile_AllRetriesFailed) {
       CreateRetriableErrors("NewReadOnlyMemoryRegionFromFile", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::unique_ptr<ReadOnlyMemoryRegion> result;
   const auto& status =
       fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -439,7 +440,7 @@ TEST(RetryingFileSystemTest, GetChildren_SuccessWith2ndTry) {
        std::make_tuple("GetChildren", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetChildren("gs://path", &result));
@@ -449,12 +450,12 @@ TEST(RetryingFileSystemTest, GetChildren_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("GetChildren", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   const auto& status = fs.GetChildren("gs://path", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -465,7 +466,7 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_SuccessWith2ndTry) {
        std::make_tuple("GetMatchingPaths", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://path/dir", &result));
@@ -476,12 +477,12 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_AllRetriesFailed) {
       CreateRetriableErrors("GetMatchingPaths", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   const auto& status = fs.GetMatchingPaths("gs://path/dir", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -491,7 +492,7 @@ TEST(RetryingFileSystemTest, DeleteFile_SuccessWith2ndTry) {
        std::make_tuple("DeleteFile", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.DeleteFile("gs://path/file.txt"));
@@ -501,12 +502,12 @@ TEST(RetryingFileSystemTest, DeleteFile_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("DeleteFile", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   const auto& status = fs.DeleteFile("gs://path/file.txt");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -516,7 +517,7 @@ TEST(RetryingFileSystemTest, CreateDir_SuccessWith2ndTry) {
        std::make_tuple("CreateDir", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.CreateDir("gs://path/newdir"));
@@ -526,12 +527,12 @@ TEST(RetryingFileSystemTest, CreateDir_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("CreateDir", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   const auto& status = fs.CreateDir("gs://path/newdir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -541,7 +542,7 @@ TEST(RetryingFileSystemTest, DeleteDir_SuccessWith2ndTry) {
        std::make_tuple("DeleteDir", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.DeleteDir("gs://path/dir"));
@@ -551,12 +552,12 @@ TEST(RetryingFileSystemTest, DeleteDir_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("DeleteDir", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   std::vector<string> result;
   const auto& status = fs.DeleteDir("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -567,7 +568,7 @@ TEST(RetryingFileSystemTest, GetFileSize_SuccessWith2ndTry) {
        std::make_tuple("GetFileSize", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://path/file.txt", &size));
@@ -577,12 +578,12 @@ TEST(RetryingFileSystemTest, GetFileSize_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("GetFileSize", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   uint64 size;
   const auto& status = fs.GetFileSize("gs://path/file.txt", &size);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -592,7 +593,7 @@ TEST(RetryingFileSystemTest, RenameFile_SuccessWith2ndTry) {
        std::make_tuple("RenameFile", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   TF_EXPECT_OK(fs.RenameFile("old_name", "new_name"));
 }
@@ -601,11 +602,11 @@ TEST(RetryingFileSystemTest, RenameFile_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("RenameFile", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   const auto& status = fs.RenameFile("old_name", "new_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -615,7 +616,7 @@ TEST(RetryingFileSystemTest, Stat_SuccessWith2ndTry) {
        std::make_tuple("Stat", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("file_name", &stat));
@@ -625,12 +626,12 @@ TEST(RetryingFileSystemTest, Stat_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("Stat", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   FileStatistics stat;
   const auto& status = fs.Stat("file_name", &stat);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -638,11 +639,11 @@ TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("FileExists", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   const auto& status = fs.FileExists("file_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -652,7 +653,7 @@ TEST(RetryingFileSystemTest, FileExists_SuccessWith2ndTry) {
        std::make_tuple("FileExists", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   TF_EXPECT_OK(fs.FileExists("gs://path/dir"));
 }
@@ -664,7 +665,7 @@ TEST(RetryingFileSystemTest, IsDirectory_SuccessWith2ndTry) {
        std::make_tuple("IsDirectory", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://path/dir"));
 }
@@ -673,11 +674,11 @@ TEST(RetryingFileSystemTest, IsDirectory_AllRetriesFailed) {
   ExpectedCalls expected_fs_calls = CreateRetriableErrors("IsDirectory", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
 
   const auto& status = fs.IsDirectory("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -688,7 +689,7 @@ TEST(RetryingFileSystemTest, DeleteRecursively_SuccessWith2ndTry) {
        std::make_tuple("DeleteRecursively", Status::OK())});
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
   int64 undeleted_files, undeleted_dirs;
 
   TF_EXPECT_OK(
@@ -700,13 +701,13 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
       CreateRetriableErrors("DeleteRecursively", 11);
   std::unique_ptr<MockFileSystem> base_fs(
       new MockFileSystem(expected_fs_calls));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
   int64 undeleted_files, undeleted_dirs;
 
   const auto& status =
       fs.DeleteRecursively("gs://path/dir", &undeleted_files, &undeleted_dirs);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -714,7 +715,7 @@ TEST(RetryingFileSystemTest, FlushCaches) {
   ExpectedCalls none;
   bool flushed = false;
   std::unique_ptr<MockFileSystem> base_fs(new MockFileSystem(none, &flushed));
-  RetryingFileSystem fs(std::move(base_fs), 0);
+  RetryingFileSystem<MockFileSystem> fs(std::move(base_fs), 0);
   fs.FlushCaches();
   EXPECT_TRUE(flushed);
 }
diff --git a/tensorflow/core/platform/cloud/retrying_utils_test.cc b/tensorflow/core/platform/cloud/retrying_utils_test.cc
index 6eb340e09438eafbe59844a378aa06801ed3b4bf..1b6527618a8e0fa1261b96bd79bdd8e5e2e6f8d1 100644
--- a/tensorflow/core/platform/cloud/retrying_utils_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_utils.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -31,10 +32,9 @@ TEST(RetryingUtilsTest, CallWithRetries_RetryDelays) {
 
   const auto& status = RetryingUtils::CallWithRetries(f, 500000L, sleep);
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts "
-                            "failed. The last failure: "
-                            "Unavailable: Failed."))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "All 10 retry attempts failed. The last failure: Unavailable: Failed."))
       << status;
 
   EXPECT_EQ(10, requested_delays.size());
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b5706581580ea00865b45cf50a4d92d22c647e53..9d00aa7b7feadec3a2a2861247c5d536a4237c3e 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <mutex>
 #include <string>
 
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 331f3e525169a93fa01739eefdf2dc6c588980a0..b5be7e8b54543daddcc07cc8b31a228d4edf14f4 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -18,16 +18,17 @@ limitations under the License.
 
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+// TODO(ahentz): This is not strictly required here but, for historical
+// reasons, many people depend on cpu_info.h in order to use kLittleEndian.
+#include "tensorflow/core/platform/byte_order.h"
+
+#if defined(_MSC_VER)
 #include "tensorflow/core/platform/windows/cpu_info.h"
 #endif
 
 namespace tensorflow {
 namespace port {
 
-// TODO(jeff,sanjay): Make portable
-constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
-
 // Returns an estimate of the number of schedulable CPUs for this
 // process.  Usually, it's constant throughout the lifetime of a
 // process, but it might change if the underlying cluster management
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 2102c5cca383b553c56fb3704596e3d1335c55c2..107c38114b55734b197df1202e9c3fc4dc3dae20 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,7 +1,6 @@
 # Platform-specific build configurations.
 
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
-load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
@@ -110,6 +109,12 @@ def _proto_cc_srcs(srcs, use_grpc_plugin=False):
     ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
   return ret
 
+def _proto_py_outs(srcs, use_grpc_plugin=False):
+  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+  if use_grpc_plugin:
+    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+  return ret
+
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
@@ -122,6 +127,7 @@ def cc_proto_library(
     protoc="@protobuf_archive//:protoc",
     internal_bootstrap_hack=False,
     use_grpc_plugin=False,
+    use_grpc_namespace=False,
     default_header=False,
     **kargs):
   """Bazel rule to create a C++ protobuf library from proto source files.
@@ -169,8 +175,11 @@ def cc_proto_library(
     return
 
   grpc_cpp_plugin = None
+  plugin_options = []
   if use_grpc_plugin:
     grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+    if use_grpc_namespace:
+      plugin_options = ["services_namespace=grpc"]
 
   gen_srcs = _proto_cc_srcs(srcs, use_grpc_plugin)
   gen_hdrs = _proto_cc_hdrs(srcs, use_grpc_plugin)
@@ -184,6 +193,7 @@ def cc_proto_library(
       protoc=protoc,
       plugin=grpc_cpp_plugin,
       plugin_language="grpc",
+      plugin_options=plugin_options,
       gen_cc=1,
       outs=outs,
       visibility=["//visibility:public"],
@@ -212,6 +222,80 @@ def cc_proto_library(
       hdrs=gen_hdrs,
       **kargs)
 
+# Re-defined protocol buffer rule to bring in the change introduced in commit
+# https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
+# which was not part of a stable protobuf release in 04/2018.
+# TODO(jsimsa): Remove this once the protobuf dependency version is updated
+# to include the above commit.
+def py_proto_library(
+        name,
+        srcs=[],
+        deps=[],
+        py_libs=[],
+        py_extra_srcs=[],
+        include=None,
+        default_runtime="@protobuf_archive//:protobuf_python",
+        protoc="@protobuf_archive//:protoc",
+        use_grpc_plugin=False,
+        **kargs):
+  """Bazel rule to create a Python protobuf library from proto source files
+
+  NOTE: the rule is only an internal workaround to generate protos. The
+  interface may change and the rule may be removed when bazel has introduced
+  the native rule.
+
+  Args:
+    name: the name of the py_proto_library.
+    srcs: the .proto files of the py_proto_library.
+    deps: a list of dependency labels; must be py_proto_library.
+    py_libs: a list of other py_library targets depended by the generated
+        py_library.
+    py_extra_srcs: extra source files that will be added to the output
+        py_library. This attribute is used for internal bootstrapping.
+    include: a string indicating the include path of the .proto files.
+    default_runtime: the implicitly default runtime which will be depended on by
+        the generated py_library target.
+    protoc: the label of the protocol compiler to generate the sources.
+    use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+        when processing the proto files.
+    **kargs: other keyword arguments that are passed to cc_library.
+  """
+  outs = _proto_py_outs(srcs, use_grpc_plugin)
+
+  includes = []
+  if include != None:
+    includes = [include]
+
+  grpc_python_plugin = None
+  if use_grpc_plugin:
+    grpc_python_plugin = "//external:grpc_python_plugin"
+    # Note: Generated grpc code depends on Python grpc module. This dependency
+    # is not explicitly listed in py_libs. Instead, host system is assumed to
+    # have grpc installed.
+
+  proto_gen(
+      name=name + "_genproto",
+      srcs=srcs,
+      deps=[s + "_genproto" for s in deps],
+      includes=includes,
+      protoc=protoc,
+      gen_py=1,
+      outs=outs,
+      visibility=["//visibility:public"],
+      plugin=grpc_python_plugin,
+      plugin_language="grpc"
+  )
+
+  if default_runtime and not default_runtime in py_libs + deps:
+    py_libs = py_libs + [default_runtime]
+
+  native.py_library(
+      name=name,
+      srcs=outs+py_extra_srcs,
+      deps=py_libs+deps,
+      imports=includes,
+      **kargs)
+
 def tf_proto_library_cc(name, srcs = [], has_services = None,
                         protodeps = [],
                         visibility = [], testonly = 0,
@@ -219,7 +303,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
                         cc_stubby_versions = None,
                         cc_grpc_version = None,
                         j2objc_api_version = 1,
-                        cc_api_version = 2, go_api_version = 2,
+                        cc_api_version = 2,
                         java_api_version = 2, py_api_version = 2,
                         js_api_version = 2, js_codegen = "jspb",
                         default_header = False):
@@ -235,10 +319,34 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   use_grpc_plugin = None
   if cc_grpc_version:
     use_grpc_plugin = True
+
+  cc_deps = tf_deps(protodeps, "_cc")
+  cc_name = name + "_cc"
+  if not srcs:
+    # This is a collection of sub-libraries. Build header-only and impl
+    # libraries containing all the sources.
+    proto_gen(
+        name = cc_name + "_genproto",
+        deps = [s + "_genproto" for s in cc_deps],
+        protoc = "@protobuf_archive//:protoc",
+        visibility=["//visibility:public"],
+    )
+    native.cc_library(
+        name = cc_name,
+        deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] +
+               if_static([name + "_cc_impl"]),
+    )
+    native.cc_library(
+        name = cc_name + "_impl",
+        deps = [s + "_impl" for s in cc_deps] + ["@protobuf_archive//:cc_wkt_protos"],
+    )
+
+    return
+
   cc_proto_library(
-      name = name + "_cc",
+      name = cc_name,
       srcs = srcs,
-      deps = tf_deps(protodeps, "_cc") + ["@protobuf_archive//:cc_wkt_protos"],
+      deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"],
       cc_libs = cc_libs + if_static(
           ["@protobuf_archive//:protobuf"],
           ["@protobuf_archive//:protobuf_headers"]
@@ -256,17 +364,34 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   )
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0,
-                        srcs_version="PY2AND3"):
+                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
+  py_deps = tf_deps(protodeps, "_py")
+  py_name = name + "_py"
+  if not srcs:
+    # This is a collection of sub-libraries. Build header-only and impl
+    # libraries containing all the sources.
+    proto_gen(
+        name = py_name + "_genproto",
+        deps = [s + "_genproto" for s in py_deps],
+        protoc = "@protobuf_archive//:protoc",
+        visibility=["//visibility:public"],
+    )
+    native.py_library(
+        name = py_name,
+        deps = py_deps + ["@protobuf_archive//:protobuf_python"])
+
+    return
+
   py_proto_library(
-      name = name + "_py",
+      name = py_name,
       srcs = srcs,
       srcs_version = srcs_version,
-      deps = deps + tf_deps(protodeps, "_py") + ["@protobuf_archive//:protobuf_python"],
+      deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"],
       protoc = "@protobuf_archive//:protoc",
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
+      use_grpc_plugin = use_grpc_plugin,
   )
 
 def tf_jspb_proto_library(**kwargs):
@@ -280,7 +405,6 @@ def tf_proto_library(name, srcs = [], has_services = None,
                      visibility = [], testonly = 0,
                      cc_libs = [],
                      cc_api_version = 2, cc_grpc_version = None,
-                     go_api_version = 2,
                      j2objc_api_version = 1,
                      java_api_version = 2, py_api_version = 2,
                      js_api_version = 2, js_codegen = "jspb",
@@ -306,8 +430,26 @@ def tf_proto_library(name, srcs = [], has_services = None,
       srcs_version = "PY2AND3",
       testonly = testonly,
       visibility = visibility,
+      use_grpc_plugin = has_services,
   )
 
+# A list of all files under platform matching the pattern in 'files'. In
+# contrast with 'tf_platform_srcs' below, which seletive collects files that
+# must be compiled in the 'default' platform, this is a list of all headers
+# mentioned in the platform/* files.
+def tf_platform_hdrs(files):
+  return native.glob(["platform/*/" + f for f in files])
+
+def tf_platform_srcs(files):
+  base_set = ["platform/default/" + f for f in files]
+  windows_set = base_set + ["platform/windows/" + f for f in files]
+  posix_set = base_set + ["platform/posix/" + f for f in files]
+  return select({
+    "//tensorflow:windows" : native.glob(windows_set),
+    "//tensorflow:windows_msvc" : native.glob(windows_set),
+    "//conditions:default" : native.glob(posix_set),
+  })
+
 def tf_additional_lib_hdrs(exclude = []):
   windows_hdrs = native.glob([
       "platform/default/*.h",
@@ -363,7 +505,6 @@ def tf_additional_proto_hdrs():
 
 def tf_additional_proto_srcs():
   return [
-      "platform/default/logging.cc",
       "platform/default/protobuf.cc",
   ]
 
@@ -386,25 +527,6 @@ def tf_protos_grappler():
       extra_deps=tf_protos_grappler_impl(),
       otherwise=["//tensorflow/core/grappler/costs:op_performance_data_cc"])
 
-def tf_env_time_hdrs():
-  return [
-      "platform/env_time.h",
-  ]
-
-def tf_env_time_srcs():
-  win_env_time = native.glob([
-    "platform/windows/env_time.cc",
-    "platform/env_time.cc",
-  ], exclude = [])
-  return select({
-    "//tensorflow:windows" : win_env_time,
-    "//tensorflow:windows_msvc" : win_env_time,
-    "//conditions:default" : native.glob([
-        "platform/posix/env_time.cc",
-        "platform/env_time.cc",
-      ], exclude = []),
-  })
-
 def tf_additional_cupti_wrapper_deps():
   return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
 
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 2cd607edbe554cd18d21626e258176e8570282ed..44a89c3a96ad293be76d709ac21ac6bafe6afcbd 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -113,6 +113,12 @@ cc_library(
     copts = tf_copts(),
 )
 
+cc_library(
+    name = "base",
+    srcs = [],
+    copts = tf_copts(),
+)
+
 cc_library(
     name = "platformlib",
     copts = tf_copts(),
@@ -128,6 +134,11 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "stacktrace",
+    srcs = [],
+)
+
 cc_library(
     name = "gif",
     copts = tf_copts(),
@@ -160,6 +171,13 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    linkstatic = 1,
+    deps = [],
+)
+
 cc_library(
     name = "test_main",
     testonly = 1,
@@ -218,15 +236,3 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/platform/default/context.h b/tensorflow/core/platform/default/context.h
index d8afeb47a9ca06e61a8c02962bc98d7797a279f7..682f64c26d7e3c4306df4139f0f48297e5c01a03 100644
--- a/tensorflow/core/platform/default/context.h
+++ b/tensorflow/core/platform/default/context.h
@@ -22,6 +22,8 @@ class Context {
  public:
   Context() {}
   Context(const ContextKind kind) {}
+
+  bool operator==(const Context& other) const { return true; }
 };
 
 class WithContext {
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index 8e60a7f0910ff9cf77a33f9d72d680ec42847777..ccddf1eafc0c92b12267788b588f25223dc0b413 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cupti_wrapper.h"
 #include "tensorflow/core/platform/env.h"
@@ -288,7 +289,7 @@ TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
 class DeviceTracerImpl : public DeviceTracer,
                          public CUPTIClient,
-                         public port::Tracing::Engine {
+                         public tracing::TraceCollector {
  public:
   DeviceTracerImpl();
   ~DeviceTracerImpl() override;
@@ -298,25 +299,25 @@ class DeviceTracerImpl : public DeviceTracer,
   Status Stop() override;
   Status Collect(StepStatsCollector *collector) override;
 
-  // port::Tracing::Engine interface:
-  bool IsEnabled() const override {
-    // We only register the Engine while tracing is enabled.
-    return true;
-  }
-  Annotation *PushAnnotation(StringPiece name) override {
-    VLOG(2) << "PushAnnotation " << name;
-    struct Impl : public port::Tracing::Engine::Annotation {
+  // tracing::TraceCollector interface:
+  virtual std::unique_ptr<Handle> CreateAnnotationHandle(
+      StringPiece name_part1, StringPiece name_part2) const {
+    struct Impl : public tracing::TraceCollector::Handle {
       string annotation;
-      explicit Impl(StringPiece n) : annotation(n.ToString()) {
+      explicit Impl(string &&name_scope) : annotation(name_scope) {
+        VLOG(2) << "CreateAnnotationHandle " << annotation;
         // Remember the most recent ScopedAnnotation for each thread.
         tls_current_annotation.get() = annotation.c_str();
       }
       ~Impl() override { tls_current_annotation.get() = nullptr; }
     };
-    return new Impl(name);
+    return std::unique_ptr<Handle>(
+        new Impl{ConcatenateNames(name_part1, name_part2)});
   }
-  Tracer *StartTracing(StringPiece label, bool is_expensive) override {
-    // We don't do anything with 'TraceMe' regions yet.
+
+  virtual std::unique_ptr<Handle> CreateActivityHandle(StringPiece, StringPiece,
+                                                       bool) const {
+    // We don't do anything with 'Activities' yet.
     return nullptr;
   }
 
@@ -410,7 +411,7 @@ Status DeviceTracerImpl::Start() {
   }
 
   // Register as a TraceEngine to receive ScopedAnnotations.
-  port::Tracing::RegisterEngine(this);
+  tracing::SetTraceCollector(this);
 
   // Intercept launch and memcpy calls to capture the Op name annotation.
   // TODO(pbar) Add callbacks for memcpy variants.
@@ -458,7 +459,7 @@ Status DeviceTracerImpl::Stop() {
     return Status::OK();
   }
   CUPTI_CALL(Unsubscribe(subscriber_));
-  port::Tracing::RegisterEngine(nullptr);
+  tracing::SetTraceCollector(nullptr);
   TF_RETURN_IF_ERROR(cupti_manager_->DisableTrace());
   end_walltime_us_ = NowInUsec();
   CUPTI_CALL(GetTimestamp(&end_timestamp_));
diff --git a/tensorflow/core/platform/default/fingerprint.h b/tensorflow/core/platform/default/fingerprint.h
index 71f9951e53edc16ed07550a08e3a3b34747b480c..f901befc16bafd4b014f61a87cc43fb9587eb1ad 100644
--- a/tensorflow/core/platform/default/fingerprint.h
+++ b/tensorflow/core/platform/default/fingerprint.h
@@ -18,14 +18,16 @@ limitations under the License.
 
 #include <farmhash.h>
 
+#include "tensorflow/core/lib/core/stringpiece.h"
+
 namespace tensorflow {
 
-inline uint64 Fingerprint64(const string& s) {
-  return ::util::Fingerprint64(s);
+inline uint64 Fingerprint64(StringPiece s) {
+  return ::util::Fingerprint64(s.data(), s.size());
 }
 
-inline Fprint128 Fingerprint128(const string& s) {
-  const auto fingerprint = ::util::Fingerprint128(s);
+inline Fprint128 Fingerprint128(StringPiece s) {
+  const auto fingerprint = ::util::Fingerprint128(s.data(), s.size());
   return {::util::Uint128Low64(fingerprint),
           ::util::Uint128High64(fingerprint)};
 }
diff --git a/tensorflow/core/platform/default/from_stream_executor_status.h b/tensorflow/core/platform/default/from_stream_executor_status.h
deleted file mode 100644
index 2a2297a657773613a1f4d63a2c967e302eb7bb2f..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/from_stream_executor_status.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_FROM_STREAM_EXECUTOR_STATUS_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_FROM_STREAM_EXECUTOR_STATUS_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/from_stream_executor_status.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/from_stream_executor_status.h
-
-#include "tensorflow/stream_executor/lib/status.h"
-
-namespace tensorflow {
-
-namespace gpu = ::perftools::gputools;
-
-// On the open-source platform, stream_executor currently uses
-// tensorflow::Status
-inline Status FromStreamExecutorStatus(
-    const perftools::gputools::port::Status& s) {
-  return s;
-}
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_FROM_STREAM_EXECUTOR_STATUS_H_
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
index 580db4844f29729726b179770e4b89581db873cc..7ac5e5c4450708a486be956a5806e31b8dd36fa3 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
@@ -28,27 +28,27 @@ namespace profiler {
 
 namespace dynload {
 
-#define LIBCUPTI_WRAP(__name)                                               \
-  struct DynLoadShim__##__name {                                            \
-    static const char* kName;                                               \
-    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;        \
-    static void* GetDsoHandle() {                                           \
-      static auto status = perftools::gputools::internal::CachedDsoLoader:: \
-          GetLibcuptiDsoHandle();                                           \
-      return status.ValueOrDie();                                           \
-    }                                                                       \
-    static FuncPointerT DynLoad() {                                         \
-      static void* f;                                                       \
-      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(       \
-          GetDsoHandle(), kName, &f))                                       \
-          << "could not find " << kName << "in libcupti DSO";               \
-      return reinterpret_cast<FuncPointerT>(f);                             \
-    }                                                                       \
-    template <typename... Args>                                             \
-    CUptiResult operator()(Args... args) {                                  \
-      return DynLoad()(args...);                                            \
-    }                                                                       \
-  } __name;                                                                 \
+#define LIBCUPTI_WRAP(__name)                                                 \
+  struct DynLoadShim__##__name {                                              \
+    static const char* kName;                                                 \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                             \
+      static auto status =                                                    \
+          stream_executor::internal::CachedDsoLoader::GetLibcuptiDsoHandle(); \
+      return status.ValueOrDie();                                             \
+    }                                                                         \
+    static FuncPointerT DynLoad() {                                           \
+      static void* f;                                                         \
+      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(         \
+          GetDsoHandle(), kName, &f))                                         \
+          << "could not find " << kName << "in libcupti DSO";                 \
+      return reinterpret_cast<FuncPointerT>(f);                               \
+    }                                                                         \
+    template <typename... Args>                                               \
+    CUptiResult operator()(Args... args) {                                    \
+      return DynLoad()(args...);                                              \
+    }                                                                         \
+  } __name;                                                                   \
   const char* DynLoadShim__##__name::kName = #__name;
 
 LIBCUPTI_WRAP(cuptiActivityDisable);
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
index acd889e47496f8bc1cc9f89c3848848d47c4e91f..e3ebe6ca1d025b047abfe91d8f5ab2e1fedd5a1b 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
@@ -23,7 +23,7 @@ limitations under the License.
 #if defined(WIN32)
 #include "extras/CUPTI/include/cupti.h"
 #else
-#include "cuda/extras/CUPTI/include/cupti.h"
+#include "cupti.h"
 #endif
 namespace perftools {
 namespace gputools {
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 2b874da1981bed396330ca3c526d82779046bdf2..c6e5777c265137ca1b215e14a7be0c6422804b4b 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <android/log.h>
 #include <iostream>
 #include <sstream>
+#include <cstring>
 #endif
 
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index f0efa31d5576393e9d9bba6e39a454b2a33cddc3..2c134f1be931982930047850736d1d3a33fdffcc 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -64,11 +64,11 @@ class LogMessageFatal : public LogMessage {
 };
 
 #define _TF_LOG_INFO \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::INFO)
+  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, ::tensorflow::INFO)
 #define _TF_LOG_WARNING \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::WARNING)
+  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, ::tensorflow::WARNING)
 #define _TF_LOG_ERROR \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::ERROR)
+  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, ::tensorflow::ERROR)
 #define _TF_LOG_FATAL \
   ::tensorflow::internal::LogMessageFatal(__FILE__, __LINE__)
 
diff --git a/tensorflow/core/platform/default/mutex.cc b/tensorflow/core/platform/default/mutex.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79830a4738afeb2a19e3e3811be4c88e13f06090
--- /dev/null
+++ b/tensorflow/core/platform/default/mutex.cc
@@ -0,0 +1,89 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/mutex.h"
+#include <chrono>
+#include <condition_variable>
+#include "nsync_cv.h"
+#include "nsync_mu.h"
+
+namespace tensorflow {
+
+// Check that the external_mu_space struct used to reserve space for the mutex
+// in tensorflow::mutex is big enough.
+static_assert(sizeof(nsync::nsync_mu) <= sizeof(mutex::external_mu_space),
+              "tensorflow::mutex::external_mu_space needs to be bigger");
+
+// Cast a pointer to mutex::external_mu_space to a pointer to the mutex mutex
+// representation.  This is done so that the header files for nsync_mu do not
+// need to be included in every file that uses tensorflow's mutex.
+static inline nsync::nsync_mu *mu_cast(mutex::external_mu_space *mu) {
+  return reinterpret_cast<nsync::nsync_mu *>(mu);
+}
+
+mutex::mutex() { nsync::nsync_mu_init(mu_cast(&mu_)); }
+
+void mutex::lock() { nsync::nsync_mu_lock(mu_cast(&mu_)); }
+
+bool mutex::try_lock() { return nsync::nsync_mu_trylock(mu_cast(&mu_)) != 0; };
+
+void mutex::unlock() { nsync::nsync_mu_unlock(mu_cast(&mu_)); }
+
+void mutex::lock_shared() { nsync::nsync_mu_rlock(mu_cast(&mu_)); }
+
+bool mutex::try_lock_shared() {
+  return nsync::nsync_mu_rtrylock(mu_cast(&mu_)) != 0;
+};
+
+void mutex::unlock_shared() { nsync::nsync_mu_runlock(mu_cast(&mu_)); }
+
+// Check that the external_cv_space struct used to reserve space for the
+// condition variable in tensorflow::condition_variable is big enough.
+static_assert(
+    sizeof(nsync::nsync_cv) <= sizeof(condition_variable::external_cv_space),
+    "tensorflow::condition_variable::external_cv_space needs to be bigger");
+
+// Cast a pointer to mutex::external_cv_space to a pointer to the condition
+// variable representation.  This is done so that the header files for nsync_mu
+// do not need to be included in every file that uses tensorflow's
+// condition_variable.
+static inline nsync::nsync_cv *cv_cast(
+    condition_variable::external_cv_space *cv) {
+  return reinterpret_cast<nsync::nsync_cv *>(cv);
+}
+
+condition_variable::condition_variable() {
+  nsync::nsync_cv_init(cv_cast(&cv_));
+}
+
+void condition_variable::wait(mutex_lock &lock) {
+  nsync::nsync_cv_wait(cv_cast(&cv_), mu_cast(&lock.mutex()->mu_));
+}
+
+std::cv_status condition_variable::wait_until_system_clock(
+    mutex_lock &lock,
+    const std::chrono::system_clock::time_point timeout_time) {
+  int r = nsync::nsync_cv_wait_with_deadline(
+      cv_cast(&cv_), mu_cast(&lock.mutex()->mu_), timeout_time, nullptr);
+  return r ? std::cv_status::timeout : std::cv_status::no_timeout;
+}
+
+void condition_variable::notify_one() { nsync::nsync_cv_signal(cv_cast(&cv_)); }
+
+void condition_variable::notify_all() {
+  nsync::nsync_cv_broadcast(cv_cast(&cv_));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/mutex.h b/tensorflow/core/platform/default/mutex.h
index 044c754e80bd0dee04c73e969c325a2aa4a89c31..a12d92795e14665f62222879488ab1cb89da8cfc 100644
--- a/tensorflow/core/platform/default/mutex.h
+++ b/tensorflow/core/platform/default/mutex.h
@@ -22,9 +22,8 @@ limitations under the License.
 #include <chrono>
 #include <condition_variable>
 #include <mutex>
-#include "nsync_cv.h"
-#include "nsync_mu.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+
 namespace tensorflow {
 
 #undef mutex_lock
@@ -38,26 +37,26 @@ class condition_variable;
 // lock.
 class LOCKABLE mutex {
  public:
-  mutex() { nsync::nsync_mu_init(&mu_); }
-  // The default implementation of nsync_mutex is safe to use after the linker
-  // initializations
+  mutex();
+  // The default implementation of the underlying mutex is safe to use after
+  // the linker initialization to zero.
   explicit mutex(LinkerInitialized x) {}
 
-  void lock() EXCLUSIVE_LOCK_FUNCTION() { nsync::nsync_mu_lock(&mu_); }
-  bool try_lock() EXCLUSIVE_TRYLOCK_FUNCTION(true) {
-    return nsync::nsync_mu_trylock(&mu_) != 0;
-  };
-  void unlock() UNLOCK_FUNCTION() { nsync::nsync_mu_unlock(&mu_); }
+  void lock() EXCLUSIVE_LOCK_FUNCTION();
+  bool try_lock() EXCLUSIVE_TRYLOCK_FUNCTION(true);
+  void unlock() UNLOCK_FUNCTION();
+
+  void lock_shared() SHARED_LOCK_FUNCTION();
+  bool try_lock_shared() SHARED_TRYLOCK_FUNCTION(true);
+  void unlock_shared() UNLOCK_FUNCTION();
 
-  void lock_shared() SHARED_LOCK_FUNCTION() { nsync::nsync_mu_rlock(&mu_); }
-  bool try_lock_shared() SHARED_TRYLOCK_FUNCTION(true) {
-    return nsync::nsync_mu_rtrylock(&mu_) != 0;
+  struct external_mu_space {
+    void* space[2];
   };
-  void unlock_shared() UNLOCK_FUNCTION() { nsync::nsync_mu_runlock(&mu_); }
 
  private:
   friend class condition_variable;
-  nsync::nsync_mu mu_;
+  external_mu_space mu_;
 };
 
 // Mimic a subset of the std::unique_lock<tensorflow::mutex> functionality.
@@ -139,26 +138,29 @@ class SCOPED_LOCKABLE tf_shared_lock {
 // Mimic std::condition_variable.
 class condition_variable {
  public:
-  condition_variable() { nsync::nsync_cv_init(&cv_); }
+  condition_variable();
 
-  void wait(mutex_lock& lock) {
-    nsync::nsync_cv_wait(&cv_, &lock.mutex()->mu_);
-  }
+  void wait(mutex_lock& lock);
   template <class Rep, class Period>
   std::cv_status wait_for(mutex_lock& lock,
                           std::chrono::duration<Rep, Period> dur) {
-    int r = nsync::nsync_cv_wait_with_deadline(
-        &cv_, &lock.mutex()->mu_, std::chrono::system_clock::now() + dur,
-        nullptr);
-    return r ? std::cv_status::timeout : std::cv_status::no_timeout;
+    return wait_until_system_clock(lock,
+                                   std::chrono::system_clock::now() + dur);
   }
-  void notify_one() { nsync::nsync_cv_signal(&cv_); }
-  void notify_all() { nsync::nsync_cv_broadcast(&cv_); }
+  void notify_one();
+  void notify_all();
+
+  struct external_cv_space {
+    void* space[2];
+  };
 
  private:
   friend ConditionResult WaitForMilliseconds(mutex_lock* mu,
                                              condition_variable* cv, int64 ms);
-  nsync::nsync_cv cv_;
+  std::cv_status wait_until_system_clock(
+      mutex_lock& lock,
+      const std::chrono::system_clock::time_point timeout_time);
+  external_cv_space cv_;
 };
 
 inline ConditionResult WaitForMilliseconds(mutex_lock* mu,
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
index 03d8b6c2380151f6148a560648317aa8a98a5e2e..c732c76ff79412cc2c676757343bb5d669c84634 100644
--- a/tensorflow/core/platform/default/protobuf.h
+++ b/tensorflow/core/platform/default/protobuf.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "google/protobuf/arena.h"
 #include "google/protobuf/compiler/importer.h"
 #include "google/protobuf/descriptor.h"
+#include "google/protobuf/dynamic_message.h"
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
diff --git a/tensorflow/core/platform/default/tracing.cc b/tensorflow/core/platform/default/tracing.cc
index 422564fb3e4df85215539a21df069ba930c35194..3efcef09b8da27b5b404451aca68f946abc85cae 100644
--- a/tensorflow/core/platform/default/tracing.cc
+++ b/tensorflow/core/platform/default/tracing.cc
@@ -15,21 +15,33 @@ limitations under the License.
 
 #include "tensorflow/core/platform/tracing.h"
 
-namespace tensorflow {
-namespace port {
-
-void Tracing::RegisterEvent(EventCategory id, const char* name) {
-  // TODO(opensource): implement
-}
+#include <cstdlib>
 
-void Tracing::Initialize() {}
+#ifndef PLATFORM_WINDOWS
+#include <unistd.h>
+#endif
 
-static bool DoInit() {
-  Tracing::Initialize();
-  return true;
+namespace tensorflow {
+namespace tracing {
+namespace {
+bool TryGetEnv(const char* name, const char** value) {
+  *value = getenv(name);
+  return *value != nullptr && (*value)[0] != '\0';
 }
-
-static const bool dummy = DoInit();
-
-}  // namespace port
+}  // namespace
+
+void EventCollector::SetCurrentThreadName(const char*) {}
+
+const char* GetLogDir() {
+  const char* dir;
+  if (TryGetEnv("TEST_TMPDIR", &dir)) return dir;
+  if (TryGetEnv("TMP", &dir)) return dir;
+  if (TryGetEnv("TMPDIR", &dir)) return dir;
+#ifndef PLATFORM_WINDOWS
+  dir = "/tmp";
+  if (access(dir, R_OK | W_OK | X_OK) == 0) return dir;
+#endif
+  return ".";  // Default to current directory.
+}
+}  // namespace tracing
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/tracing_impl.h b/tensorflow/core/platform/default/tracing_impl.h
index e813e4a17aca918582e8346d4bf2655724a143b3..b1613784053ba25763ce49914fa14e3f82f1419c 100644
--- a/tensorflow/core/platform/default/tracing_impl.h
+++ b/tensorflow/core/platform/default/tracing_impl.h
@@ -21,14 +21,8 @@ limitations under the License.
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/tracing.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/tracing.h
 
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/tracing.h"
 
-namespace tensorflow {
-namespace port {
-
 // Definitions that do nothing for platforms that don't have underlying thread
 // tracing support.
 #define TRACELITERAL(a) \
@@ -41,21 +35,12 @@ namespace port {
   do {                           \
   } while (0)
 
-inline uint64 Tracing::UniqueId() { return random::New64(); }
-inline bool Tracing::IsActive() { return false; }
-inline void Tracing::RegisterCurrentThread(const char* name) {}
-
-// Posts an atomic threadscape event with the supplied category and arg.
-inline void Tracing::RecordEvent(EventCategory category, uint64 arg) {
-  // TODO(opensource): Implement
-}
-
-inline Tracing::ScopedActivity::ScopedActivity(EventCategory category,
-                                               uint64 arg) {}
+namespace tensorflow {
+namespace tracing {
 
-inline Tracing::ScopedActivity::~ScopedActivity() {}
+inline bool EventCollector::IsEnabled() { return false; }
 
-}  // namespace port
+}  // namespace tracing
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_DEFAULT_TRACING_IMPL_H_
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index 3631d9ddf99430372c11403dba56c14331a3db24..c510dc204f71990e782d5bf086dea631a7e8bb4d 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/denormal.h"
-#include "third_party/eigen3/Eigen/Core"
+#include <tuple>
+
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
 // If we're on gcc 4.8 or older, there's a known bug that prevents the use of
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 12509c250eab9047b869694e930bf523a975a4f8..b9a9ef85eb16e62bf9fecf01910fb98673e8cf7b 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #endif
 
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 4ce4e0b4e024d50ae2bd081ec7b8b155060d2a4a..9192f7ba10d466aa8bcfc2b2536d5d42a9263533 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -291,10 +291,10 @@ class Env {
   virtual string FormatLibraryFileName(const string& name,
                                        const string& version) = 0;
 
- private:
   // Returns a possible list of local temporary directories.
-  void GetLocalTempDirectories(std::vector<string>* list);
+  virtual void GetLocalTempDirectories(std::vector<string>* list) = 0;
 
+ private:
   std::unique_ptr<FileSystemRegistry> file_system_registry_;
   TF_DISALLOW_COPY_AND_ASSIGN(Env);
   EnvTime* envTime = EnvTime::Default();
@@ -358,6 +358,10 @@ class EnvWrapper : public Env {
   }
 
  private:
+  void GetLocalTempDirectories(std::vector<string>* list) override {
+    target_->GetLocalTempDirectories(list);
+  }
+
   Env* target_;
 };
 
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 47ddf0ccb93e827d410e87050d6802747fb84fbf..a70a417e6a2f3ade644f5d7377adf5ebc52d77e5 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -372,9 +373,8 @@ TEST_F(DefaultEnvTest, CreateUniqueFileName) {
 
   EXPECT_TRUE(env->CreateUniqueFileName(&filename, suffix));
 
-  StringPiece str(filename);
-  EXPECT_TRUE(str.starts_with(prefix));
-  EXPECT_TRUE(str.ends_with(suffix));
+  EXPECT_TRUE(str_util::StartsWith(filename, prefix));
+  EXPECT_TRUE(str_util::EndsWith(filename, suffix));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 271d73f5f1a7bd3e1301520aed09cbafd89c8ebc..b55e94d552ed3a66bd05930702acd9633cd02f81 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -18,41 +18,15 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
-namespace {
-
-constexpr int kNumThreads = 8;
-
-// Run a function in parallel using a ThreadPool, but skip the ThreadPool
-// on the iOS platform due to its problems with more than a few threads.
-void ForEach(int first, int last, const std::function<void(int)>& f) {
-#if TARGET_OS_IPHONE
-  for (int i = first; i < last; i++) {
-    f(i);
-  }
-#else
-  int num_threads = std::min(kNumThreads, last - first);
-  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
-  for (int i = first; i < last; i++) {
-    threads.Schedule([f, i] { f(i); });
-  }
-#endif
-}
-
-}  // anonymous namespace
-
 FileSystem::~FileSystem() {}
 
 string FileSystem::TranslateName(const string& name) const {
@@ -97,76 +71,6 @@ bool FileSystem::FilesExist(const std::vector<string>& files,
   return result;
 }
 
-Status FileSystem::GetMatchingPaths(const string& pattern,
-                                    std::vector<string>* results) {
-  results->clear();
-  // Find the fixed prefix by looking for the first wildcard.
-  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
-  string eval_pattern = pattern;
-  std::vector<string> all_files;
-  string dir = io::Dirname(fixed_prefix).ToString();
-  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
-  // include . as the top level directory.
-  if (dir.empty()) {
-    dir = ".";
-    fixed_prefix = io::JoinPath(dir, fixed_prefix);
-    eval_pattern = io::JoinPath(dir, pattern);
-  }
-
-  // Setup a BFS to explore everything under dir.
-  std::deque<string> dir_q;
-  dir_q.push_back(dir);
-  Status ret;  // Status to return.
-  // children_dir_status holds is_dir status for children. It can have three
-  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
-  // if we don't calculate IsDirectory (we might do that because there isn't
-  // any point in exploring that child path).
-  std::vector<Status> children_dir_status;
-  while (!dir_q.empty()) {
-    string current_dir = dir_q.front();
-    dir_q.pop_front();
-    std::vector<string> children;
-    Status s = GetChildren(current_dir, &children);
-    ret.Update(s);
-    if (children.empty()) continue;
-    // This IsDirectory call can be expensive for some FS. Parallelizing it.
-    children_dir_status.resize(children.size());
-    ForEach(0, children.size(),
-            [this, &current_dir, &children, &fixed_prefix,
-             &children_dir_status](int i) {
-              const string child_path = io::JoinPath(current_dir, children[i]);
-              // In case the child_path doesn't start with the fixed_prefix then
-              // we don't need to explore this path.
-              if (!StringPiece(child_path).starts_with(fixed_prefix)) {
-                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
-                                                "Operation not needed");
-              } else {
-                children_dir_status[i] = IsDirectory(child_path);
-              }
-            });
-    for (int i = 0; i < children.size(); ++i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // If the IsDirectory call was cancelled we bail.
-      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
-        continue;
-      }
-      // If the child is a directory add it to the queue.
-      if (children_dir_status[i].ok()) {
-        dir_q.push_back(child_path);
-      }
-      all_files.push_back(child_path);
-    }
-  }
-
-  // Match all obtained files to the input pattern.
-  for (const auto& f : all_files) {
-    if (Env::Default()->MatchPath(f, eval_pattern)) {
-      results->push_back(f);
-    }
-  }
-  return ret;
-}
-
 Status FileSystem::DeleteRecursively(const string& dirname,
                                      int64* undeleted_files,
                                      int64* undeleted_dirs) {
@@ -244,7 +148,7 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) {
       return status;
     }
     // Basename returns "" for / ending dirs.
-    if (!remaining_dir.ends_with("/")) {
+    if (!str_util::EndsWith(remaining_dir, "/")) {
       sub_dirs.push_back(io::Basename(remaining_dir));
     }
     remaining_dir = io::Dirname(remaining_dir);
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 3085b6958fd921ae124b885107e807f0a02e1d9d..077b1d79cfb259c6d497f37e7f06d0da189f3ff5 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/platform/file_statistics.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 #ifdef PLATFORM_WINDOWS
@@ -139,10 +138,8 @@ class FileSystem {
   ///  * OK - no errors
   ///  * UNIMPLEMENTED - Some underlying functions (like GetChildren) are not
   ///                    implemented
-  /// The default implementation uses a combination of GetChildren, MatchPath
-  /// and IsDirectory.
   virtual Status GetMatchingPaths(const string& pattern,
-                                  std::vector<string>* results);
+                                  std::vector<string>* results) = 0;
 
   /// \brief Obtains statistics for the given path.
   virtual Status Stat(const string& fname, FileStatistics* stat) = 0;
@@ -306,74 +303,6 @@ class ReadOnlyMemoryRegion {
   virtual uint64 length() = 0;
 };
 
-// START_SKIP_DOXYGEN
-
-#ifndef SWIG
-// Degenerate file system that provides no implementations.
-class NullFileSystem : public FileSystem {
- public:
-  NullFileSystem() {}
-
-  ~NullFileSystem() override = default;
-
-  Status NewRandomAccessFile(
-      const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
-    return errors::Unimplemented("NewRandomAccessFile unimplemented");
-  }
-
-  Status NewWritableFile(const string& fname,
-                         std::unique_ptr<WritableFile>* result) override {
-    return errors::Unimplemented("NewWritableFile unimplemented");
-  }
-
-  Status NewAppendableFile(const string& fname,
-                           std::unique_ptr<WritableFile>* result) override {
-    return errors::Unimplemented("NewAppendableFile unimplemented");
-  }
-
-  Status NewReadOnlyMemoryRegionFromFile(
-      const string& fname,
-      std::unique_ptr<ReadOnlyMemoryRegion>* result) override {
-    return errors::Unimplemented(
-        "NewReadOnlyMemoryRegionFromFile unimplemented");
-  }
-
-  Status FileExists(const string& fname) override {
-    return errors::Unimplemented("FileExists unimplemented");
-  }
-
-  Status GetChildren(const string& dir, std::vector<string>* result) override {
-    return errors::Unimplemented("GetChildren unimplemented");
-  }
-
-  Status DeleteFile(const string& fname) override {
-    return errors::Unimplemented("DeleteFile unimplemented");
-  }
-
-  Status CreateDir(const string& dirname) override {
-    return errors::Unimplemented("CreateDir unimplemented");
-  }
-
-  Status DeleteDir(const string& dirname) override {
-    return errors::Unimplemented("DeleteDir unimplemented");
-  }
-
-  Status GetFileSize(const string& fname, uint64* file_size) override {
-    return errors::Unimplemented("GetFileSize unimplemented");
-  }
-
-  Status RenameFile(const string& src, const string& target) override {
-    return errors::Unimplemented("RenameFile unimplemented");
-  }
-
-  Status Stat(const string& fname, FileStatistics* stat) override {
-    return errors::Unimplemented("Stat unimplemented");
-  }
-};
-#endif
-
-// END_SKIP_DOXYGEN
-
 /// \brief A registry for file system implementations.
 ///
 /// Filenames are specified as an URI, which is of the form
diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22c5057281959fa1584828d927387e8094bfa50a
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.cc
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/platform.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+
+constexpr int kNumThreads = 8;
+
+// Run a function in parallel using a ThreadPool, but skip the ThreadPool
+// on the iOS platform due to its problems with more than a few threads.
+void ForEach(int first, int last, const std::function<void(int)>& f) {
+#if TARGET_OS_IPHONE
+  for (int i = first; i < last; i++) {
+    f(i);
+  }
+#else
+  int num_threads = std::min(kNumThreads, last - first);
+  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
+  for (int i = first; i < last; i++) {
+    threads.Schedule([f, i] { f(i); });
+  }
+#endif
+}
+
+}  // namespace
+
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results) {
+  results->clear();
+  // Find the fixed prefix by looking for the first wildcard.
+  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
+  string eval_pattern = pattern;
+  std::vector<string> all_files;
+  string dir = io::Dirname(fixed_prefix).ToString();
+  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
+  // include . as the top level directory.
+  if (dir.empty()) {
+    dir = ".";
+    fixed_prefix = io::JoinPath(dir, fixed_prefix);
+    eval_pattern = io::JoinPath(dir, pattern);
+  }
+
+  // Setup a BFS to explore everything under dir.
+  std::deque<string> dir_q;
+  dir_q.push_back(dir);
+  Status ret;  // Status to return.
+  // children_dir_status holds is_dir status for children. It can have three
+  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
+  // if we don't calculate IsDirectory (we might do that because there isn't
+  // any point in exploring that child path).
+  std::vector<Status> children_dir_status;
+  while (!dir_q.empty()) {
+    string current_dir = dir_q.front();
+    dir_q.pop_front();
+    std::vector<string> children;
+    Status s = fs->GetChildren(current_dir, &children);
+    ret.Update(s);
+    if (children.empty()) continue;
+    // This IsDirectory call can be expensive for some FS. Parallelizing it.
+    children_dir_status.resize(children.size());
+    ForEach(0, children.size(),
+            [fs, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!str_util::StartsWith(child_path, fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = fs->IsDirectory(child_path);
+              }
+            });
+    for (int i = 0; i < children.size(); ++i) {
+      const string child_path = io::JoinPath(current_dir, children[i]);
+      // If the IsDirectory call was cancelled we bail.
+      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
+        continue;
+      }
+      // If the child is a directory add it to the queue.
+      if (children_dir_status[i].ok()) {
+        dir_q.push_back(child_path);
+      }
+      all_files.push_back(child_path);
+    }
+  }
+
+  // Match all obtained files to the input pattern.
+  for (const auto& f : all_files) {
+    if (env->MatchPath(f, eval_pattern)) {
+      results->push_back(f);
+    }
+  }
+  return ret;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system_helper.h b/tensorflow/core/platform/file_system_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d812b0e38150f9190f69fd279561944d42174c6
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+#define TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class FileSystem;
+class Env;
+
+namespace internal {
+
+// Given a pattern, stores in 'results' the set of paths (in the given file
+// system) that match that pattern.
+//
+// This helper may be used by implementations of FileSystem::GetMatchingPaths()
+// in order to provide parallel scanning of subdirectories (except on iOS).
+//
+// Arguments:
+//   fs: may not be null and will be used to identify directories and list
+//       their contents.
+//   env: may not be null and will be used to check if a match has been found.
+//   pattern: see FileSystem::GetMatchingPaths() for details.
+//   results: will be cleared and may not be null.
+//
+// Returns an error status if any call to 'fs' failed.
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results);
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/core/platform/file_system_test.cc
index abe88ab6c7e876046db28cd15e804d756fee7066..f261b8f5761506fc5d706c9646c36eef912fc18f 100644
--- a/tensorflow/core/platform/file_system_test.cc
+++ b/tensorflow/core/platform/file_system_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -123,7 +124,7 @@ class InterPlanetaryFileSystem : public NullFileSystem {
     io::ParseURI(name, &scheme, &host, &path);
     ASSERT_EQ(scheme, "ipfs");
     ASSERT_EQ(host, "solarsystem");
-    path.Consume("/");
+    str_util::ConsumePrefix(&path, "/");
     *parsed_path = path.ToString();
   }
 
@@ -159,7 +160,8 @@ string Match(InterPlanetaryFileSystem* ipfs, const string& suffix_pattern) {
     std::sort(results.begin(), results.end());
     for (const string& result : results) {
       StringPiece trimmed_result(result);
-      EXPECT_TRUE(trimmed_result.Consume(strings::StrCat(kPrefix, "/")));
+      EXPECT_TRUE(str_util::ConsumePrefix(&trimmed_result,
+                                          strings::StrCat(kPrefix, "/")));
       trimmed_results.push_back(trimmed_result);
     }
     return str_util::Join(trimmed_results, ",");
diff --git a/tensorflow/core/platform/fingerprint.h b/tensorflow/core/platform/fingerprint.h
index fd0347a10be917bf368e3c83793bd111b74039ed..b47dcdedd74de1bd6f6b86d09701ef83a6e86a04 100644
--- a/tensorflow/core/platform/fingerprint.h
+++ b/tensorflow/core/platform/fingerprint.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_
 #define TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_
 
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -36,15 +37,12 @@ struct Fprint128Hasher {
   }
 };
 
-// TODO(sibyl-Mooth6ku): Change these to accept StringPiece (or make them templated
-// on any kind of byte array?).
-
 // This is a portable fingerprint interface for strings that will never change.
 // However, it is not suitable for cryptography.
-uint64 Fingerprint64(const string& s);
+uint64 Fingerprint64(StringPiece s);
 
 // 128-bit variant of Fingerprint64 above (same properties and caveats apply).
-Fprint128 Fingerprint128(const string& s);
+Fprint128 Fingerprint128(StringPiece s);
 
 namespace internal {
 // Mixes some of the bits that got propagated to the high bits back into the
diff --git a/tensorflow/core/platform/hadoop/BUILD b/tensorflow/core/platform/hadoop/BUILD
index 774a439855e49904b29f1e0c3d82196b1b9afb5d..7c38c399bd7a4645b3556e653110c19b8b9ab9ff 100644
--- a/tensorflow/core/platform/hadoop/BUILD
+++ b/tensorflow/core/platform/hadoop/BUILD
@@ -12,18 +12,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "hadoop_file_system",
     srcs = ["hadoop_file_system.cc"],
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 74863293a32451e8881c93de468539b913169aaa..a8cb40502c11851cb8f18e4194131482616f34c8 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/posix/error.h"
@@ -108,6 +109,8 @@ class LibHDFS {
 // in the libhdfs documentation.
 #if defined(PLATFORM_WINDOWS)
     const char* kLibHdfsDso = "hdfs.dll";
+#elif defined(MACOS) || defined(TARGET_OS_MAC)
+    const char* kLibHdfsDso = "libhdfs.dylib";
 #else
     const char* kLibHdfsDso = "libhdfs.so";
 #endif
@@ -396,6 +399,11 @@ Status HadoopFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status HadoopFileSystem::GetMatchingPaths(const string& pattern,
+                                          std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status HadoopFileSystem::DeleteFile(const string& fname) {
   hdfsFS fs = nullptr;
   TF_RETURN_IF_ERROR(Connect(fname, &fs));
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.h b/tensorflow/core/platform/hadoop/hadoop_file_system.h
index 5f2b222622cf01033af117f92d49458eeae00e6f..6af7a698ffe91d79d1460a4e335ddd7bf8727a3c 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.h
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.h
@@ -49,6 +49,9 @@ class HadoopFileSystem : public FileSystem {
 
   Status GetChildren(const string& dir, std::vector<string>* result) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index 6ba2f04d0f839cedee9d75d8ed960a50668e541c..b207d3474977361777383299a2a603a9f21481d4 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -197,7 +198,7 @@ TEST_F(HadoopFileSystemTest, WriteWhileReading) {
   // Skip the test if we're not testing on HDFS. Hadoop's local filesystem
   // implementation makes no guarantees that writable files are readable while
   // being written.
-  if (!StringPiece(fname).starts_with("hdfs://")) {
+  if (!str_util::StartsWith(fname, "hdfs://")) {
     return;
   }
 
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 6119edfd5a63d1aa4e81bb91d95736ed2835c478..37239681755c04152d3ae4a91ab7ec73a89f522b 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -31,13 +31,14 @@ limitations under the License.
   __attribute__((__format__(__printf__, string_index, first_to_check)))
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) \
   __attribute__((__format__(__scanf__, string_index, first_to_check)))
-#elif defined(COMPILER_MSVC)
+#elif defined(_MSC_VER)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
-#define TF_ATTRIBUTE_ALWAYS_INLINE
+#define TF_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
+#define TF_ATTRIBUTE_WEAK
 #define TF_MUST_USE_RESULT
 #define TF_PACKED
 #define TF_PRINTF_ATTRIBUTE(string_index, first_to_check)
@@ -57,7 +58,7 @@ limitations under the License.
 #endif
 
 // Control visiblity outside .so
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_EXPORT __declspec(dllexport)
 #else
@@ -65,13 +66,20 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
+
+#ifdef __has_builtin
+#define TF_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TF_HAS_BUILTIN(x) 0
+#endif
 
-// GCC can be told that a certain branch is not likely to be taken (for
-// instance, a CHECK failure), and use that information in static analysis.
-// Giving it this information can help it optimize for the common case in
-// the absence of better information (ie. -fprofile-arcs).
-#if defined(COMPILER_GCC3)
+// Compilers can be told that a certain branch is not likely to be taken
+// (for instance, a CHECK failure), and use that information in static
+// analysis. Giving it this information can help it optimize for the
+// common case in the absence of better information (ie.
+// -fprofile-arcs).
+#if TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3)
 #define TF_PREDICT_FALSE(x) (__builtin_expect(x, 0))
 #define TF_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
 #else
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index 7bb9fc264fbf6ee3f20e9b2687c9ba52b6171ec4..fca3a2332d15f986d637f7d3a5eb91069dfce1a0 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -59,7 +59,7 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
-// Returns the amount of RAM available in kB, or INT64_MAX if unknown.
+// Returns the amount of RAM available in bytes, or INT64_MAX if unknown.
 int64 AvailableRam();
 
 }  // namespace port
diff --git a/tensorflow/core/platform/null_file_system.h b/tensorflow/core/platform/null_file_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..420abc1ada81456e8883e48dd693614f75b81116
--- /dev/null
+++ b/tensorflow/core/platform/null_file_system.h
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_
+#define TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
+
+namespace tensorflow {
+
+// START_SKIP_DOXYGEN
+
+#ifndef SWIG
+// Degenerate file system that provides no implementations.
+class NullFileSystem : public FileSystem {
+ public:
+  NullFileSystem() {}
+
+  ~NullFileSystem() override = default;
+
+  Status NewRandomAccessFile(
+      const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
+    return errors::Unimplemented("NewRandomAccessFile unimplemented");
+  }
+
+  Status NewWritableFile(const string& fname,
+                         std::unique_ptr<WritableFile>* result) override {
+    return errors::Unimplemented("NewWritableFile unimplemented");
+  }
+
+  Status NewAppendableFile(const string& fname,
+                           std::unique_ptr<WritableFile>* result) override {
+    return errors::Unimplemented("NewAppendableFile unimplemented");
+  }
+
+  Status NewReadOnlyMemoryRegionFromFile(
+      const string& fname,
+      std::unique_ptr<ReadOnlyMemoryRegion>* result) override {
+    return errors::Unimplemented(
+        "NewReadOnlyMemoryRegionFromFile unimplemented");
+  }
+
+  Status FileExists(const string& fname) override {
+    return errors::Unimplemented("FileExists unimplemented");
+  }
+
+  Status GetChildren(const string& dir, std::vector<string>* result) override {
+    return errors::Unimplemented("GetChildren unimplemented");
+  }
+
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override {
+    return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+  }
+
+  Status DeleteFile(const string& fname) override {
+    return errors::Unimplemented("DeleteFile unimplemented");
+  }
+
+  Status CreateDir(const string& dirname) override {
+    return errors::Unimplemented("CreateDir unimplemented");
+  }
+
+  Status DeleteDir(const string& dirname) override {
+    return errors::Unimplemented("DeleteDir unimplemented");
+  }
+
+  Status GetFileSize(const string& fname, uint64* file_size) override {
+    return errors::Unimplemented("GetFileSize unimplemented");
+  }
+
+  Status RenameFile(const string& src, const string& target) override {
+    return errors::Unimplemented("RenameFile unimplemented");
+  }
+
+  Status Stat(const string& fname, FileStatistics* stat) override {
+    return errors::Unimplemented("Stat unimplemented");
+  }
+};
+#endif
+
+// END_SKIP_DOXYGEN
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 8097624e09f81364071895ad114f26f93f4aab14..418874d3406200566cdd9a4c6141852b948413ff 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -118,6 +118,9 @@ class PosixEnv : public Env {
                                const string& version) override {
     return tensorflow::internal::FormatLibraryFileName(name, version);
   }
+
+ private:
+  void GetLocalTempDirectories(std::vector<string>* list) override;
 };
 
 }  // namespace
@@ -131,7 +134,7 @@ Env* Env::Default() {
 }
 #endif
 
-void Env::GetLocalTempDirectories(std::vector<string>* list) {
+void PosixEnv::GetLocalTempDirectories(std::vector<string>* list) {
   list->clear();
   // Directories, in order of preference. If we find a dir that
   // exists, we stop adding other less-preferred dirs
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 494acde803a778fb839a7444e4d5ac2fd094eb09..8e316472fe2ea6f7c3187f0a5f98052c20f5ce6b 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -177,7 +177,7 @@ int64 AvailableRam() {
   struct sysinfo info;
   int err = sysinfo(&info);
   if (err == 0) {
-    return info.freeram / 1024;
+    return info.freeram;
   }
 #endif
   return INT64_MAX;
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 9a8021565cbcc2a172a23439d2a7139108c0df39..47bfa020cef991e6a2e9e9de283318b287788454 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/posix/posix_file_system.h"
@@ -225,6 +226,11 @@ Status PosixFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status PosixFileSystem::GetMatchingPaths(const string& pattern,
+                                         std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status PosixFileSystem::DeleteFile(const string& fname) {
   Status result;
   if (unlink(TranslateName(fname).c_str()) != 0) {
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index 98ffa43b8acf8a10a4ace1bf11cc7d6f5e8a95a7..e8898d0a97f50e29d1216bf2d9d340711cb29754 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -47,6 +47,9 @@ class PosixFileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stats) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/posix/subprocess.cc b/tensorflow/core/platform/posix/subprocess.cc
index cefc66831a9b9fe11a170013e64ff7a1cd2e2bcd..a661c34ef01b1afba05dedf3d7c6c2aef86245fa 100644
--- a/tensorflow/core/platform/posix/subprocess.cc
+++ b/tensorflow/core/platform/posix/subprocess.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <string.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <memory>
+#include <vector>
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/subprocess.h"
@@ -461,4 +463,12 @@ int SubProcess::Communicate(const string* stdin_input, string* stdout_output,
   return WaitInternal(&status) ? status : -1;
 }
 
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
+  std::unique_ptr<SubProcess> proc(new SubProcess());
+  proc->SetProgram(argv[0], argv);
+  proc->SetChannelAction(CHAN_STDERR, ACTION_DUPPARENT);
+  proc->SetChannelAction(CHAN_STDOUT, ACTION_DUPPARENT);
+  return proc;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/test.cc b/tensorflow/core/platform/posix/test.cc
index a69127b3e88834458503012ad4d8d9334bba247a..28f7478a6d5a371079acef135eaab69ead1ebf4b 100644
--- a/tensorflow/core/platform/posix/test.cc
+++ b/tensorflow/core/platform/posix/test.cc
@@ -20,19 +20,10 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/subprocess.h"
 
 namespace tensorflow {
 namespace testing {
 
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
-  std::unique_ptr<SubProcess> proc(new SubProcess());
-  proc->SetProgram(argv[0], argv);
-  proc->SetChannelAction(CHAN_STDERR, ACTION_DUPPARENT);
-  proc->SetChannelAction(CHAN_STDOUT, ACTION_DUPPARENT);
-  return proc;
-}
-
 int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); }
 
 string TensorFlowSrcRoot() {
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index 3a0ad2e9bd09211aa452f8b39b621343a113785d..21038cfeb15be052f7460151bacaa15544c8d77c 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -13,18 +13,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_binary(
     name = "s3_file_system.so",
     srcs = [
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 301fcb9dbf653d29f6ac5321332c8764adaad681..6da679dc7523f52724cf992e7ba70351de3cf393 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/s3/aws_logging.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
@@ -155,7 +156,7 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("S3 path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("S3 path doesn't contain an object name: ",
@@ -497,6 +498,11 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   return Status::OK();
 }
 
+Status S3FileSystem::GetMatchingPaths(const string& pattern,
+                                      std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status S3FileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31264be621d93c1efb68f7b0b49e28cb65b05de1..5d0565b378198a39f80940c0627f7638e92691fa 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -46,6 +46,9 @@ class S3FileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stat) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index f31e556a7085f57553022aeb8b117d6bf271a76d..0a590b3d40c0dbf007feee07fc93be4838924679 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -19,10 +19,8 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
-#include "tensorflow/core/platform/google/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/platform/google/dso_loader.h"
 #else
-#include "tensorflow/core/platform/default/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/dso_loader.h"
 #endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
@@ -37,4 +35,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(jlebar): Remove this once we've completed
+// the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
 #endif  // TENSORFLOW_PLATFORM_STREAM_EXECUTOR_H_
diff --git a/tensorflow/core/platform/stream_executor_no_cuda.h b/tensorflow/core/platform/stream_executor_no_cuda.h
index 4a41d7adf517cf00f45843b57162d15f8827812f..50a5e732c0ec222d3ee2329a57fc6ea9ac4b233c 100644
--- a/tensorflow/core/platform/stream_executor_no_cuda.h
+++ b/tensorflow/core/platform/stream_executor_no_cuda.h
@@ -19,10 +19,8 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
-#include "tensorflow/core/platform/google/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/platform/google/dso_loader.h"
 #else
-#include "tensorflow/core/platform/default/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/dso_loader.h"
 #endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
diff --git a/tensorflow/core/platform/subprocess.h b/tensorflow/core/platform/subprocess.h
index dfdcf82173b774dee119f1fe9e84818e45d7b50c..dcc0c1a4ee33ff47beefa6c3f82c6954770e7036 100644
--- a/tensorflow/core/platform/subprocess.h
+++ b/tensorflow/core/platform/subprocess.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_SUBPROCESS_H_
 #define TENSORFLOW_PLATFORM_SUBPROCESS_H_
 
+#include <memory>
+#include <vector>
+
 namespace tensorflow {
 
 // Channel identifiers.
@@ -43,6 +46,12 @@ enum ChannelAction {
 // Supports spawning and killing child processes.
 class SubProcess;
 
+// Returns an object that represents a child process that will be
+// launched with the given command-line arguments `argv`. The process
+// must be explicitly started by calling the Start() method on the
+// returned object.
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv);
+
 }  // namespace tensorflow
 
 #include "tensorflow/core/platform/platform.h"
diff --git a/tensorflow/core/platform/test.h b/tensorflow/core/platform/test.h
index 295957c3d801cc959eeb3e60dd2a74587ee14197..99bae63edf8ae26fb51acde12dc1a4f8bcaf778c 100644
--- a/tensorflow/core/platform/test.h
+++ b/tensorflow/core/platform/test.h
@@ -21,7 +21,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/types.h"
 
 // As of September 2016, we continue to attempt to avoid the use of gmock aka
@@ -49,12 +48,6 @@ string TensorFlowSrcRoot();
 // Returns the same value for the lifetime of the process.
 int RandomSeed();
 
-// Returns an object that represents a child process that will be
-// launched with the given command-line arguments `argv`. The process
-// must be explicitly started by calling the Start() method on the
-// returned object.
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv);
-
 // Returns an unused port number, for use in multi-process testing.
 // NOTE: This function is not thread-safe.
 int PickUnusedPortOrDie();
diff --git a/tensorflow/core/platform/test_main.cc b/tensorflow/core/platform/test_main.cc
index 677114f5f22b4fe70c6f006e536a2da5f17977d6..e57bbd80af4b84a84a99bbb749004d66982731d1 100644
--- a/tensorflow/core/platform/test_main.cc
+++ b/tensorflow/core/platform/test_main.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include <iostream>
 
-#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/stacktrace_handler.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -37,7 +37,7 @@ GTEST_API_ int main(int argc, char** argv) {
   tensorflow::testing::InstallStacktraceHandler();
   testing::InitGoogleTest(&argc, argv);
   for (int i = 1; i < argc; i++) {
-    if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
+    if (tensorflow::str_util::StartsWith(argv[i], "--benchmarks=")) {
       const char* pattern = argv[i] + strlen("--benchmarks=");
       tensorflow::testing::Benchmark::Run(pattern);
       return 0;
diff --git a/tensorflow/core/platform/tracing.cc b/tensorflow/core/platform/tracing.cc
index f7d2a8e282de51b30394400aef3519e933dafb12..c0386c0a3fc052e501360449c6518e7c0534220f 100644
--- a/tensorflow/core/platform/tracing.cc
+++ b/tensorflow/core/platform/tracing.cc
@@ -15,24 +15,24 @@ limitations under the License.
 
 #include "tensorflow/core/platform/tracing.h"
 
+#include <array>
 #include <atomic>
 #include <map>
 #include <string>
 #include <vector>
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace tracing {
+namespace {
+std::atomic<uint64> unique_arg{1};
+std::atomic<const TraceCollector*> trace_collector;
+}  // namespace
 
-namespace port {
-
-int32 Tracing::category_id_[kEventCategoryMax];
-uint64 Tracing::event_mask_ = 0;
-std::map<string, int32>* Tracing::name_map_ = new std::map<string, int32>;
-
-// This needs to be kept in sync with the EventCategory enumeration.
-const char* Tracing::EventCategoryString(EventCategory category) {
+const char* GetEventCategoryName(EventCategory category) {
   switch (category) {
     case EventCategory::kScheduleClosure:
       return "ScheduleClosure";
@@ -40,63 +40,45 @@ const char* Tracing::EventCategoryString(EventCategory category) {
       return "RunClosure";
     case EventCategory::kCompute:
       return "Compute";
-    case EventCategory::kEventCategoryMax:
-      return "EventCategoryMax";
+    default:
+      return "Unknown";
   }
-  return "Unknown";
 }
 
-// This function allows the user to specify arbitrary subsets of the
-// supported Threadscape events and activities.
-bool Tracing::ParseEventMask(const char* flagname, const string& value) {
-  VLOG(1) << flagname << " set to " << value;
-  int64 new_mask = 0;
-  std::vector<string> events =
-      str_util::Split(value, ',', str_util::SkipEmpty());
-  for (string name : events) {
-    bool clear = false;
-    int64 mask = 0;
-    if (name[0] == '!') {
-      // invert the sense of the flag
-      clear = true;
-      name = name.substr(1);
-    }
-    if (name == "ALL") {
-      mask = ~0;
-    } else {
-      auto it = name_map_->find(name);
-      int32 id;
-      if (it == name_map_->end()) {
-        id = -1;
-      } else {
-        id = it->second;
-      }
-      if (id < 0) {
-        LOG(ERROR) << "Can't parse event mask name " << name;
-        return false;
-      }
-      mask = 1 << id;
-    }
-    if (clear) {
-      new_mask &= ~mask;
-    } else {
-      new_mask |= mask;
-    }
-  }
-  // parsing was successful; set the permanent event mask
-  event_mask_ = new_mask;
-  return true;
+std::array<const EventCollector*, GetNumEventCategories()>
+    EventCollector::instances_;
+
+void SetEventCollector(EventCategory category,
+                       const EventCollector* collector) {
+  EventCollector::instances_[static_cast<unsigned>(category)] = collector;
+}
+
+uint64 GetUniqueArg() {
+  return unique_arg.fetch_add(1, std::memory_order_relaxed);
 }
 
-/*static*/ std::atomic<Tracing::Engine*> Tracing::tracing_engine_;
+uint64 GetArgForName(StringPiece name) {
+  return Hash64(name.data(), name.size());
+}
 
-void Tracing::RegisterEngine(Engine* e) {
-  tracing_engine_.store(e, std::memory_order_release);
+string TraceCollector::ConcatenateNames(StringPiece first, StringPiece second) {
+  std::string result;
+  bool has_two_parts = !first.empty() && !second.empty();
+  result.reserve(first.size() + second.size() +
+                 static_cast<int>(has_two_parts));
+  result.append(first.data(), first.size());
+  if (has_two_parts) result.append({':'});
+  result.append(second.data(), second.size());
+  return result;
 }
 
-Tracing::Engine::~Engine() {}
-Tracing::Engine::Annotation::~Annotation() {}
-Tracing::Engine::Tracer::~Tracer() {}
+void SetTraceCollector(const TraceCollector* collector) {
+  return trace_collector.store(collector, std::memory_order_release);
+}
+
+const TraceCollector* GetTraceCollector() {
+  return trace_collector.load(std::memory_order_acquire);
+}
 
-}  // namespace port
+}  // namespace tracing
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index 8f7bff1bb020ee501c982d5d0761d36537993e63..c322777705a7fc57cb3dabbaa4fb66379071f548 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 // Tracing interface
 
+#include <array>
 #include <atomic>
 #include <map>
 #include <memory>
@@ -30,253 +31,205 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace tracing {
+
+// This enumeration contains the identifiers of all TensorFlow CPU profiler
+// events. It must be kept in sync with the code in GetEventCategoryName().
+enum struct EventCategory : unsigned {
+  kScheduleClosure = 0,
+  kRunClosure = 1,
+  kCompute = 2,
+  kNumCategories = 3  // sentinel - keep last
+};
+constexpr unsigned GetNumEventCategories() {
+  return static_cast<unsigned>(EventCategory::kNumCategories);
+}
+const char* GetEventCategoryName(EventCategory);
 
-namespace port {
-
-class Tracing {
+// Interface for CPU profiler events.
+class EventCollector {
  public:
-  // This enumeration contains the identifiers of all TensorFlow
-  // threadscape events and code regions.  Threadscape assigns its
-  // own identifiers at runtime when we register our events and we
-  // cannot know in advance what IDs it will choose.  The "RecordEvent"
-  // method and "ScopedActivity" use these event IDs for consistency
-  // and remap them to threadscape IDs at runtime.  This enum is limited
-  // to 64 values since we use a bitmask to configure which events are
-  // enabled.  It must also be kept in step with the code in
-  // "Tracing::EventCategoryString".
-  enum EventCategory {
-    kScheduleClosure = 0,
-    kRunClosure = 1,
-    kCompute = 2,
-    kEventCategoryMax = 3  // sentinel - keep last
-  };
-  // Note: We currently only support up to 64 categories.
-  static_assert(kEventCategoryMax <= 64, "only support up to 64 events");
+  virtual ~EventCollector() {}
+  virtual void RecordEvent(uint64 arg) const = 0;
+  virtual void StartRegion(uint64 arg) const = 0;
+  virtual void StopRegion() const = 0;
 
-  // Called by main programs to initialize tracing facilities
-  static void Initialize();
+  // Annotates the current thread with a name.
+  static void SetCurrentThreadName(const char* name);
+  // Returns whether event collection is enabled.
+  static bool IsEnabled();
 
-  // Return the pathname of the directory where we are writing log files.
-  static const char* LogDir();
+ private:
+  friend void SetEventCollector(EventCategory, const EventCollector*);
+  friend const EventCollector* GetEventCollector(EventCategory);
 
-  // Returns a non-zero identifier which can be used to correlate
-  // related events.
-  static inline uint64 UniqueId();
+  static std::array<const EventCollector*, GetNumEventCategories()> instances_;
+};
+// Set the callback for RecordEvent and ScopedRegion of category.
+// Not thread safe. Only call while EventCollector::IsEnabled returns false.
+void SetEventCollector(EventCategory category, const EventCollector* collector);
+
+// Returns the callback for RecordEvent and ScopedRegion of category if
+// EventCollector::IsEnabled(), otherwise returns null.
+inline const EventCollector* GetEventCollector(EventCategory category) {
+  if (EventCollector::IsEnabled()) {
+    return EventCollector::instances_[static_cast<unsigned>(category)];
+  }
+  return nullptr;
+}
 
-  // Returns true if a trace is in progress.  Can be used to reduce tracing
-  // overheads in fast-path code.
-  static inline bool IsActive();
+// Returns a unique id to pass to RecordEvent/ScopedRegion. Never returns zero.
+uint64 GetUniqueArg();
 
-  // Associate name with the current thread.
-  static void RegisterCurrentThread(const char* name);
+// Returns an id for name to pass to RecordEvent/ScopedRegion.
+uint64 GetArgForName(StringPiece name);
 
-  // Posts an event with the supplied category and arg.
-  static void RecordEvent(EventCategory category, uint64 arg);
+// Records an atomic event through the currently registered EventCollector.
+inline void RecordEvent(EventCategory category, uint64 arg) {
+  if (auto collector = GetEventCollector(category)) {
+    collector->RecordEvent(arg);
+  }
+}
 
-  // Traces a region of code.  Posts a tracing "EnterCodeRegion" event
-  // when created and an "ExitCodeRegion" event when destroyed.
-  class ScopedActivity {
-   public:
-    explicit ScopedActivity(EventCategory category, uint64 arg);
-    ~ScopedActivity();
+// Records an event for the duration of the instance lifetime through the
+// currently registered EventCollector.
+class ScopedRegion {
+  ScopedRegion(ScopedRegion&) = delete;             // Not copy-constructible.
+  ScopedRegion& operator=(ScopedRegion&) = delete;  // Not assignable.
 
-   private:
-#if defined(PLATFORM_GOOGLE)
-    const bool enabled_;
-    const int32 region_id_;
-#endif
-
-    TF_DISALLOW_COPY_AND_ASSIGN(ScopedActivity);
-  };
+ public:
+  ScopedRegion(ScopedRegion&& other) noexcept  // Move-constructible.
+      : collector_(other.collector_) {
+    other.collector_ = nullptr;
+  }
 
-  // Trace collection engine can be registered with this module.
-  // If no engine is registered, ScopedAnnotation and TraceMe are no-ops.
-  class Engine;
-  static void RegisterEngine(Engine*);
+  ScopedRegion(EventCategory category, uint64 arg)
+      : collector_(GetEventCollector(category)) {
+    if (collector_) {
+      collector_->StartRegion(arg);
+    }
+  }
 
-  // Forward declaration of the GPU utility classes.
-  class ScopedAnnotation;
-  class TraceMe;
+  // Same as ScopedRegion(category, GetUniqueArg()), but faster if
+  // EventCollector::IsEnaled() returns false.
+  ScopedRegion(EventCategory category)
+      : collector_(GetEventCollector(category)) {
+    if (collector_) {
+      collector_->StartRegion(GetUniqueArg());
+    }
+  }
 
- private:
-  friend class TracingTest;
-  friend class ScopedAnnotation;
-  friend class TraceMe;
+  // Same as ScopedRegion(category, GetArgForName(name)), but faster if
+  // EventCollector::IsEnaled() returns false.
+  ScopedRegion(EventCategory category, StringPiece name)
+      : collector_(GetEventCollector(category)) {
+    if (collector_) {
+      collector_->StartRegion(GetArgForName(name));
+    }
+  }
 
-  static std::atomic<Tracing::Engine*> tracing_engine_;
-  static Tracing::Engine* engine() {
-    return tracing_engine_.load(std::memory_order_acquire);
+  ~ScopedRegion() {
+    if (collector_) {
+      collector_->StopRegion();
+    }
   }
 
-  static void RegisterEvent(EventCategory id, const char* name);
-  static const char* EventCategoryString(EventCategory category);
-
-  //
-  // Parses event mask expressions in 'value' of the form:
-  //   expr ::= <term> (,<term>)*
-  //   term ::= <event> | "!" <event>
-  //   event ::= "ALL" | <wait_event> | <other_event>
-  //   wait_event ::= "ENewSession" | "ECloseSession" | ...
-  //   other_event ::= "Send" | "Wait" | ...
-  // ALL denotes all events, <event> turns on tracing for this event, and
-  // !<event> turns off tracing for this event.
-  // If the expression can be parsed correctly it returns true and sets
-  // the event_mask_. Otherwise it returns false and the event_mask_ is left
-  // unchanged.
-  static bool ParseEventMask(const char* flagname, const string& value);
-
-  // Bit mask of enabled trace categories.
-  static uint64 event_mask_;
-
-  // Records the mappings between Threadscape IDs and the "EventCategory" enum.
-  static int32 category_id_[kEventCategoryMax];
-  static std::map<string, int32>* name_map_;
+  bool IsEnabled() const { return collector_ != nullptr; }
+
+ private:
+  const EventCollector* collector_;
 };
 
-// Trace collection engine that actually implements collection.
-class Tracing::Engine {
+// Interface for accelerator profiler annotations.
+class TraceCollector {
  public:
-  Engine() {}
-  virtual ~Engine();
-
-  // Returns true if Tracing is currently enabled.
-  virtual bool IsEnabled() const = 0;
-
-  // Represents an active annotation.
-  class Annotation {
+  class Handle {
    public:
-    Annotation() {}
-    virtual ~Annotation();
+    virtual ~Handle() {}
   };
 
-  // Represents an active trace.
-  class Tracer {
-   public:
-    Tracer() {}
-    virtual ~Tracer();
-  };
+  virtual ~TraceCollector() {}
+  virtual std::unique_ptr<Handle> CreateAnnotationHandle(
+      StringPiece name_part1, StringPiece name_part2) const = 0;
+  virtual std::unique_ptr<Handle> CreateActivityHandle(
+      StringPiece name_part1, StringPiece name_part2,
+      bool is_expensive) const = 0;
 
- private:
-  friend class ScopedAnnotation;
-  friend class TraceMe;
-
-  // Register the specified name as an annotation on the current thread.
-  // Caller should delete the result to remove the annotation.
-  // Annotations from the same thread are destroyed in a LIFO manner.
-  // May return nullptr if annotations are not supported.
-  virtual Annotation* PushAnnotation(StringPiece name) = 0;
-
-  // Start tracing under the specified label. Caller should delete the result
-  // to stop tracing.
-  // May return nullptr if tracing is not supported.
-  virtual Tracer* StartTracing(StringPiece label, bool is_expensive) = 0;
-  // Same as above, but implementations can avoid copying the string.
-  virtual Tracer* StartTracing(string&& label, bool is_expensive) {
-    return StartTracing(StringPiece(label), is_expensive);
-  }
+ protected:
+  static string ConcatenateNames(StringPiece first, StringPiece second);
 
-  // Backwards compatibility one arg variants (assume is_expensive=true).
-  Tracer* StartTracing(StringPiece label) {
-    return StartTracing(label, /*is_expensive=*/true);
-  }
-  Tracer* StartTracing(string&& label) {
-    return StartTracing(StringPiece(label), /*is_expensive=*/true);
-  }
+ private:
+  friend void SetTraceCollector(const TraceCollector*);
+  friend const TraceCollector* GetTraceCollector();
 };
+// Set the callback for ScopedAnnotation and ScopedActivity.
+void SetTraceCollector(const TraceCollector* collector);
+// Returns the callback for ScopedAnnotation and ScopedActivity.
+const TraceCollector* GetTraceCollector();
 
-// This class permits a user to apply annotation on kernels and memcpys
-// when launching them. While an annotation is in scope, all activities
-// within that scope get their names replaced by the annotation. The kernel
-// name replacement is done when constructing the protobuf for sending out to
-// a client (e.g., the stubby requestor) for both API and Activity records.
-//
-// Ownership: The creator of ScopedAnnotation assumes ownership of the object.
+// Adds an annotation to all activities for the duration of the instance
+// lifetime through the currently registered TraceCollector.
 //
 // Usage: {
-//          ScopedAnnotation annotation("first set of kernels");
+//          ScopedAnnotation annotation("my kernels");
 //          Kernel1<<<x,y>>>;
-//          LaunchKernel2(); // Which eventually launches a cuda kernel.
+//          LaunchKernel2(); // Launches a CUDA kernel.
 //        }
-// In the above scenario, the GPUProf UI would show 2 kernels with the name
-// "first set of kernels" executing -- they will appear as the same kernel.
-class Tracing::ScopedAnnotation {
+// This will add 'my kernels' to both kernels in the profiler UI
+class ScopedAnnotation {
  public:
-  explicit ScopedAnnotation(StringPiece name);
+  explicit ScopedAnnotation(StringPiece name)
+      : ScopedAnnotation(name, StringPiece()) {}
 
-  // If tracing is enabled, set up an annotation with a label of
-  // "<name_part1>:<name_part2>".  Can be cheaper than the
+  // If tracing is enabled, add a name scope of
+  // "<name_part1>:<name_part2>".  This can be cheaper than the
   // single-argument constructor because the concatenation of the
   // label string is only done if tracing is enabled.
-  ScopedAnnotation(StringPiece name_part1, StringPiece name_part2);
+  ScopedAnnotation(StringPiece name_part1, StringPiece name_part2)
+      : handle_([&] {
+          auto trace_collector = GetTraceCollector();
+          return trace_collector ? trace_collector->CreateAnnotationHandle(
+                                       name_part1, name_part2)
+                                 : nullptr;
+        }()) {}
 
-  // Returns true iff scoped annotations are active.
-  static bool Enabled() {
-    auto e = Tracing::engine();
-    return e && e->IsEnabled();
-  }
+  bool IsEnabled() const { return static_cast<bool>(handle_); }
 
  private:
-  std::unique_ptr<Engine::Annotation> annotation_;
+  std::unique_ptr<TraceCollector::Handle> handle_;
 };
 
-// TODO(opensource): clean up the scoped classes for GPU tracing.
-// This class permits user-specified (CPU) tracing activities. A trace
-// activity is started when an object of this class is created and stopped
-// when the object is destroyed.
-class Tracing::TraceMe {
+// Adds an activity through the currently registered TraceCollector.
+// The activity starts when an object of this class is created and stops when
+// the object is destroyed.
+class ScopedActivity {
  public:
-  explicit TraceMe(StringPiece name);
-  TraceMe(StringPiece name, bool is_expensive);
+  explicit ScopedActivity(StringPiece name, bool is_expensive = true)
+      : ScopedActivity(name, StringPiece(), is_expensive) {}
 
-  // If tracing is enabled, set up a traceMe with a label of
+  // If tracing is enabled, set up an activity with a label of
   // "<name_part1>:<name_part2>".  This can be cheaper than the
   // single-argument constructor because the concatenation of the
   // label string is only done if tracing is enabled.
-  TraceMe(StringPiece name_part1, StringPiece name_part2);
-  TraceMe(StringPiece name_part1, StringPiece name_part2, bool is_expensive);
+  ScopedActivity(StringPiece name_part1, StringPiece name_part2,
+                 bool is_expensive = true)
+      : handle_([&] {
+          auto trace_collector = GetTraceCollector();
+          return trace_collector ? trace_collector->CreateActivityHandle(
+                                       name_part1, name_part2, is_expensive)
+                                 : nullptr;
+        }()) {}
+
+  bool IsEnabled() const { return static_cast<bool>(handle_); }
 
  private:
-  std::unique_ptr<Engine::Tracer> tracer_;
+  std::unique_ptr<TraceCollector::Handle> handle_;
 };
 
-inline Tracing::ScopedAnnotation::ScopedAnnotation(StringPiece name) {
-  auto e = Tracing::engine();
-  if (e && e->IsEnabled()) {
-    annotation_.reset(e->PushAnnotation(name));
-  }
-}
-
-inline Tracing::ScopedAnnotation::ScopedAnnotation(StringPiece name_part1,
-                                                   StringPiece name_part2) {
-  auto e = Tracing::engine();
-  if (e && e->IsEnabled()) {
-    annotation_.reset(
-        e->PushAnnotation(strings::StrCat(name_part1, ":", name_part2)));
-  }
-}
-
-inline Tracing::TraceMe::TraceMe(StringPiece name) : TraceMe(name, true) {}
-
-inline Tracing::TraceMe::TraceMe(StringPiece name, bool is_expensive) {
-  auto e = Tracing::engine();
-  if (e && e->IsEnabled()) {
-    tracer_.reset(e->StartTracing(name, is_expensive));
-  }
-}
-
-inline Tracing::TraceMe::TraceMe(StringPiece name_part1, StringPiece name_part2)
-    : TraceMe(name_part1, name_part2, true) {}
-
-inline Tracing::TraceMe::TraceMe(StringPiece name_part1, StringPiece name_part2,
-                                 bool is_expensive) {
-  auto e = Tracing::engine();
-  if (e && e->IsEnabled()) {
-    tracer_.reset(e->StartTracing(strings::StrCat(name_part1, ":", name_part2),
-                                  is_expensive));
-  }
-}
+// Return the pathname of the directory where we are writing log files.
+const char* GetLogDir();
 
-}  // namespace port
+}  // namespace tracing
 }  // namespace tensorflow
 
 #if defined(PLATFORM_GOOGLE)
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index e2dd5b003f291b6ce88ebabe2d66114762bd2c57..68897ac423f1caf41007c950452f2a00241c7611 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -31,12 +31,6 @@ limitations under the License.
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
-#if defined(PLATFORM_WINDOWS)
-#include "tensorflow/core/platform/windows/cpu_info.h"
-#endif
-
-#include "tensorflow/core/lib/bfloat16/bfloat16.h"
-
 namespace tensorflow {
 
 // Define tensorflow::string to refer to appropriate platform specific type.
@@ -66,4 +60,10 @@ typedef uint64 Fprint;
 
 }  // namespace tensorflow
 
+// Alias namespace ::stream_executor as ::tensorflow::se.
+namespace stream_executor {}
+namespace tensorflow {
+namespace se = ::stream_executor;
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_PLATFORM_TYPES_H_
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index f20939d3c0ff02be30f19be170644fab44b6f45e..ba2126abcfcf9cc274a16485bbe404a90f37250b 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -19,13 +19,4 @@ limitations under the License.
 // included so __cpuidex function is available for GETCPUID on Windows
 #include <intrin.h>
 
-// Byte order defines provided by gcc. MSVC doesn't define those so
-// we define them here.
-// We assume that all windows platform out there are little endian.
-#if defined(_MSC_VER) && !defined(__clang__)
-#define __ORDER_LITTLE_ENDIAN__ 0x4d2
-#define __ORDER_BIG_ENDIAN__ 0x10e1
-#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-#endif
-
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 41b264417071cadb5f70806b458ee2b46ebb2feb..2f54f423b2ee7d8842f2edab7f6bf29877fac173 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -160,6 +160,8 @@ class WindowsEnv : public Env {
   }
 
  private:
+  void GetLocalTempDirectories(std::vector<string>* list) override;
+
   typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
   FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
 };
@@ -174,7 +176,7 @@ Env* Env::Default() {
   return default_env;
 }
 
-void Env::GetLocalTempDirectories(std::vector<string>* list) {
+void WindowsEnv::GetLocalTempDirectories(std::vector<string>* list) {
   list->clear();
   // On windows we'll try to find a directory in this order:
   //   C:/Documents & Settings/whomever/TEMP (or whatever GetTempPath() is)
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 582b232054b850a2ef5ab8f47c089eb35a7bb3cf..174f41a993f8010112f316dc9ba220f6ecc2804e 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #endif
 
 #include <Windows.h>
+#include <shlwapi.h>
 
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/demangle.h"
@@ -149,18 +150,23 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 string Demangle(const char* mangled) { return mangled; }
 
 double NominalCPUFrequency() {
-#ifdef TENSORFLOW_USE_ABSL
-  return absl::base_internal::NominalCPUFrequency();
-#else
+  DWORD data;
+  DWORD data_size = sizeof(data);
+  #pragma comment(lib, "shlwapi.lib")  // For SHGetValue().
+  if (SUCCEEDED(
+          SHGetValueA(HKEY_LOCAL_MACHINE,
+                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                      "~MHz", nullptr, &data, &data_size))) {
+    return data * 1e6;  // Value is MHz.
+  }
   return 1.0;
-#endif
 }
 
 int64 AvailableRam() {
   MEMORYSTATUSEX statex;
   statex.dwLength = sizeof(statex);
   if (GlobalMemoryStatusEx(&statex)) {
-    return statex.ullAvailPhys / 1024;
+    return statex.ullAvailPhys;
   }
   return INT64_MAX;
 }
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
index 66ec44885d52195b807f4957aec6d590324b2975..f00471d484014d431665dbf0cb0d38ea82a14435 100644
--- a/tensorflow/core/platform/windows/subprocess.h
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -16,11 +16,21 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
 #define TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
 
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/platform/logging.h"
+
 namespace tensorflow {
 
 // SubProcess is not yet implemented for Windows.
 class SubProcess {};
 
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
+  LOG(FATAL) << "CreateSubProcess NOT IMPLEMENTED for Windows yet ! ";
+  return nullptr;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc
index 584acad91b24fc6be9b93f71b7d44b0fba3cb2e8..ad2b7bc6ff6e6037a922352d34b628b2138d0712 100644
--- a/tensorflow/core/platform/windows/test.cc
+++ b/tensorflow/core/platform/windows/test.cc
@@ -22,11 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace testing {
 
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
-  LOG(FATAL) << "CreateSubProcess NOT IMPLEMENTED for Windows yet ! ";
-  return nullptr;
-}
-
 int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); }
 
 string TensorFlowSrcRoot() {
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index b6b3722caae4dc0cdc0ddff91be479ab91a744b2..dc2efbeaf5e3eabc6077df4c1c126762d36ba8a6 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/windows/error.h"
@@ -382,7 +383,8 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
 
 Status WindowsFileSystem::FileExists(const string& fname) {
   constexpr int kOk = 0;
-  if (_access(TranslateName(fname).c_str(), kOk) == 0) {
+  std::wstring ws_translated_fname = Utf8ToWideChar(TranslateName(fname));
+  if (_waccess(ws_translated_fname.c_str(), kOk) == 0) {
     return Status::OK();
   }
   return errors::NotFound(fname, " not found");
@@ -493,7 +495,8 @@ Status WindowsFileSystem::GetMatchingPaths(const string& pattern,
   // but no code appears to rely on this behavior.
   string converted_pattern(pattern);
   std::replace(converted_pattern.begin(), converted_pattern.end(), '\\', '/');
-  TF_RETURN_IF_ERROR(FileSystem::GetMatchingPaths(converted_pattern, results));
+  TF_RETURN_IF_ERROR(internal::GetMatchingPaths(this, Env::Default(),
+                                                converted_pattern, results));
   for (string& result : *results) {
     std::replace(result.begin(), result.end(), '/', '\\');
   }
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 35d99930186381edbb80aa6485856e288f1dd568..af034bdd7d2ce4c25357e33ee18dd968c9e572a0 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -1,24 +1,7 @@
-package(
-    default_visibility = ["//visibility:public"],
-)
+package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
@@ -57,7 +40,6 @@ tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
     cc_api_version = 2,
-    go_api_version = 2,
     java_api_version = 2,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
index 61bb66bd21b336074475142ee564414ee154cafc..4ccd43ce68379db40b5bba0c3f2969d899b39d08 100644
--- a/tensorflow/core/profiler/g3doc/profile_model_architecture.md
+++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
@@ -45,22 +45,22 @@ sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 
 For an operation to have float operation statistics:
 
-* It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
-use the definition to calculate float operations. Contributes are welcome.
-
-* It must have known "shape" information for RegisterStatistics('flops')
-to calculate the statistics. It is suggested to pass in `-run_meta_path` if
-shape is only known during runtime. tfprof can fill in the missing shape with
-the runtime shape information from RunMetadata.
-Hence, it is suggested to use `-account_displayed_op_only`
-option so that you know the statistics are only for the operations printed out.
-
-* If no RunMetadata provided, tfprof count float_ops of each graph node once,
-even if it is defined in tf.while_loop. This is because tfprof doesn't know
-how many times are run statically. If RunMetadata provided, tfprof calculate
-float_ops as float_ops * run_count.
-
-
+*   It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
+    uses the definition to calculate float operations. Contributions are
+    welcomed.
+
+*   It must have known "shape" information for RegisterStatistics('flops') to
+    calculate the statistics. It is suggested to pass in `-run_meta_path` if
+    shape is only known during runtime. tfprof can fill in the missing shape
+    with the runtime shape information from RunMetadata. Hence, it is suggested
+    to use `-account_displayed_op_only` option so that you know the statistics
+    are only for the operations printed out.
+
+*   If no RunMetadata is provided, tfprof counts float_ops of each graph node
+    once, even if it is defined in a tf.while_loop. This is because tfprof
+    doesn't know statically how many times each graph node is run. If
+    RunMetadata is provided, tfprof calculates float_ops as float_ops *
+    run_count.
 
 ```python
 # To profile float opertions in commandline, you need to pass --graph_path
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 05a798bff80a0775e5170bf8f428d9e88d8060b3..8dcfde9a2adbd3a1774bce8506a84f80ca099c34 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -365,17 +365,3 @@ cc_library(
         "//tensorflow/core:regexp_internal",
     ],
 )
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/profiler/internal/advisor/BUILD b/tensorflow/core/profiler/internal/advisor/BUILD
index 40cfd1e12e609de0f70b12b2cf98ef1086b4d024..1fedb05ae319176886cb0ff0409ea6685df76a4c 100644
--- a/tensorflow/core/profiler/internal/advisor/BUILD
+++ b/tensorflow/core/profiler/internal/advisor/BUILD
@@ -73,18 +73,3 @@ tf_cc_test(
         "//tensorflow/core/profiler/internal:tfprof_tf_testlib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index e968b9c97e28eeae22954102d5f0e07e09d75f7f..96b6cc30bd9b3d603eb585a05023f36fe7b816b7 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
 
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,8 +83,8 @@ TEST_F(TFProfAdvisorTest, OperationChecker) {
   (*options.mutable_checkers())[kCheckers[1]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[1]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[1]).reports(0))
-                  .contains("NCHW"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[1]).reports(0), "NCHW"));
 }
 
 TEST_F(TFProfAdvisorTest, UtilizationChecker) {
@@ -91,16 +92,17 @@ TEST_F(TFProfAdvisorTest, UtilizationChecker) {
   (*options.mutable_checkers())[kCheckers[0]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[0]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[0]).reports(0))
-                  .contains("low utilization"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[0]).reports(0), "low utilization"));
 }
 
 TEST_F(TFProfAdvisorTest, ExpensiveOperationChecker) {
   AdvisorOptionsProto options;
   (*options.mutable_checkers())[kCheckers[2]];
   AdviceProto advice = advisor_->Advise(options);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[2]).reports(0))
-                  .contains("top 1 operation type: Conv2D"));
+  EXPECT_TRUE(
+      str_util::StrContains(advice.checkers().at(kCheckers[2]).reports(0),
+                            "top 1 operation type: Conv2D"));
 }
 
 }  // namespace tfprof
diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/core/protobuf/checkpointable_object_graph.proto
similarity index 85%
rename from tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
rename to tensorflow/core/protobuf/checkpointable_object_graph.proto
index 024765acb28726fd102dfbf167f4e780072ce6e7..651f692f6d7b6d677b480a007f9ffe5c814beec3 100644
--- a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/checkpointable_object_graph.proto
@@ -2,14 +2,14 @@ syntax = "proto3";
 
 option cc_enable_arenas = true;
 
-package tensorflow.contrib.eager;
+package tensorflow;
 
-// Prototype format which saves extra information about the objects which own
-// variables, allowing for more robust checkpoint loading into modified
-// programs. Currently stored in its own entry in a TensorBundle.
+// A TensorBundle addition which saves extra information about the objects which
+// own variables, allowing for more robust checkpoint loading into modified
+// programs.
 
 message CheckpointableObjectGraph {
-  message Object {
+  message CheckpointableObject {
     message ObjectReference {
       // An index into `CheckpointableObjectGraph.nodes`, indicating the object
       // being referenced.
@@ -51,5 +51,5 @@ message CheckpointableObjectGraph {
     repeated SlotVariableReference slot_variables = 3;
   }
 
-  repeated Object nodes = 1;
+  repeated CheckpointableObject nodes = 1;
 }
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 3606c5f127ce1f533d018e645b0a48c20e79cd8d..c1a0075b6468cded7e5378ddcca1a7bcff914e98 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -67,9 +67,7 @@ message GPUOptions {
   // set or set to 0, gets set to a non-zero default.
   int32 polling_active_delay_usecs = 6;
 
-  // In the event polling loop sleep this many millisconds between
-  // PollEvents calls, when the queue is empty.  If value is not
-  // set or set to 0, gets set to a non-zero default.
+  // This field is deprecated and ignored.
   int32 polling_inactive_delay_msecs = 7;
 
   // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
@@ -410,3 +408,42 @@ message RunMetadata {
   // Graphs of the partitions executed by executors.
   repeated GraphDef partition_graphs = 3;
 }
+
+// Defines a connection between two tensors in a `GraphDef`.
+message TensorConnection {
+  // A tensor name. The value of this tensor will be substituted for
+  // the tensor named in `to_tensor`.
+  string from_tensor = 1;
+
+  // A tensor name. The value of this tensor will be bound to the
+  // value of the tensor named in `from_tensor`.
+  string to_tensor = 2;
+}
+
+// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
+// to be fetched or executed.
+//
+// Compare with the arguments to `Session::Run()`.
+message CallableOptions {
+  // Tensors to be fed in the callable. Each feed is the name of a tensor.
+  repeated string feed = 1;
+
+  // Fetches. A list of tensor names. The caller of the callable expects a
+  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
+  // order of specified fetches does not change the execution order.
+  repeated string fetch = 2;
+
+  // Target Nodes. A list of node names. The named nodes will be run by the
+  // callable but their outputs will not be returned.
+  repeated string target = 3;
+
+  // Options that will be applied to each run.
+  RunOptions run_options = 4;
+
+  // Tensors to be connected in the callable. Each TensorConnection denotes
+  // a pair of tensors in the graph, between which an edge will be created
+  // in the callable.
+  repeated TensorConnection tensor_connection = 5;
+
+  // Next: 6
+}
diff --git a/tensorflow/core/protobuf/control_flow.proto b/tensorflow/core/protobuf/control_flow.proto
index 2c9476a08ad946e7f019475055397fcd6cfbbc5a..3c05b4f0e22e5ce2104980ad4fa52c8d8ad57070 100644
--- a/tensorflow/core/protobuf/control_flow.proto
+++ b/tensorflow/core/protobuf/control_flow.proto
@@ -17,6 +17,15 @@ message ValuesDef {
   map<string, string> external_values = 2;
 }
 
+// Container for any kind of control flow context. Any other control flow
+// contexts that are added below should also be added here.
+message ControlFlowContextDef {
+  oneof ctxt {
+    CondContextDef cond_ctxt = 1;
+    WhileContextDef while_ctxt = 2;
+  }
+}
+
 // Protocol buffer representing a CondContext object.
 message CondContextDef {
   // Name of the context.
@@ -33,6 +42,9 @@ message CondContextDef {
 
   // Values and external values in control flow context.
   ValuesDef values_def = 5;
+
+  // Contexts contained inside this context (e.g. nested conds).
+  repeated ControlFlowContextDef nested_contexts = 6;
 }
 
 // Protocol buffer representing a WhileContext object.
@@ -70,5 +82,8 @@ message WhileContextDef {
   // Optional name of the maximum_iterations tensor.
   string maximum_iterations_name = 11;
 
-  // Next available id: 12.
+  // Contexts contained inside this context (e.g. nested whiles).
+  repeated ControlFlowContextDef nested_contexts = 12;
+
+  // Next available id: 13.
 }
diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9a7d0edb35ecfdfa470623134325b94cba209ae1
--- /dev/null
+++ b/tensorflow/core/protobuf/eager_service.proto
@@ -0,0 +1,165 @@
+syntax = "proto3";
+
+package tensorflow.eager;
+
+import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/framework/device_attributes.proto";
+import "tensorflow/core/framework/function.proto";
+import "tensorflow/core/framework/versions.proto";
+import "tensorflow/core/protobuf/tensorflow_server.proto";
+
+message RemoteTensorHandle {
+  // The ID of the operation that produced this tensor.
+  int64 op_id = 1;
+  // The index into the outputs of the operation that produced this tensor.
+  int32 output_num = 2;
+}
+
+// A proto representation of an eager operation.
+message Operation {
+  // A unique identifier for the operation. Set by the client so that the client
+  // can uniquely identify the outputs of the scheduled operation.
+  //
+  // In the initial implementation, sending duplicate IDs has undefined
+  // behaviour, but additional constraints may be placed upon this in the
+  // future.
+  int64 id = 1;
+  string name = 2;
+  repeated RemoteTensorHandle inputs = 3;
+
+  // Control Operation IDs that will be respected when ops are re-ordered by
+  // async execution. If async execution (+ op re-ordering) is not enabled, this
+  // should have no effect.
+  repeated int64 control_op_ids = 4;
+  map<string, AttrValue> attrs = 5;
+  string device = 6;
+}
+
+message QueueItem {
+  // The remote executor should be able to handle either executing ops directly,
+  // or releasing any unused tensor handles, since the tensor lifetime is
+  // maintained by the client.
+  oneof item {
+    RemoteTensorHandle handle_to_decref = 1;
+    Operation operation = 2;
+  }
+}
+
+message CreateContextRequest {
+  // Identifies the full cluster, and this particular worker's position within.
+  ServerDef server_def = 1;
+
+  // Whether the ops on the worker should be executed synchronously or
+  // asynchronously. By default, ops are executed synchronously.
+  bool async = 2;
+
+  // Number of seconds to keep the context alive. If more than keep_alive_secs
+  // has passed since a particular context has been communicated with, it will
+  // be garbage collected.
+  int64 keep_alive_secs = 3;
+
+  // This is the version for all the ops that will be enqueued by the client.
+  VersionDef version_def = 4;
+}
+
+message CreateContextResponse {
+  // The ID of the created context. This is usually a randomly generated number,
+  // that will be used to identify the context in future requests to the
+  // service. Contexts are not persisted through server restarts.
+  fixed64 context_id = 1;
+
+  // List of devices that are locally accessible to the worker.
+  repeated DeviceAttributes device_attributes = 2;
+}
+
+message EnqueueRequest {
+  fixed64 context_id = 1;
+
+  repeated QueueItem queue = 3;
+}
+
+message EnqueueResponse {
+}
+
+message WaitQueueDoneRequest {
+  fixed64 context_id = 1;
+
+  // Ids to wait on. If empty, wait on everything currently pending.
+  repeated int64 op_id = 2;
+}
+
+message WaitQueueDoneResponse {
+  // TODO(nareshmodi): Consider adding NodeExecStats here to be able to
+  // propagate some stats.
+}
+
+message KeepAliveRequest {
+  fixed64 context_id = 1;
+}
+
+message KeepAliveResponse {
+}
+
+message CloseContextRequest {
+  fixed64 context_id = 1;
+}
+
+message CloseContextResponse {
+}
+
+message RegisterFunctionRequest {
+  fixed64 context_id = 1;
+
+  FunctionDef function_def = 2;
+}
+
+message RegisterFunctionResponse {
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Eager Service defines a TensorFlow service that executes operations eagerly
+// on a set of local devices, on behalf of a remote Eager executor.
+//
+// The service impl will keep track of the various clients and devices it has
+// access to and allows the client to enqueue ops on any devices that it is able
+// to access and schedule data transfers from/to any of the peers.
+//
+// A client can generate multiple contexts to be able to independently execute
+// operations, but cannot share data between the two contexts.
+//
+// NOTE: Even though contexts generated by clients should be independent, the
+// lower level tensorflow execution engine is not, so they might share some data
+// (e.g. a Device's ResourceMgr).
+//
+////////////////////////////////////////////////////////////////////////////////
+service EagerService {
+  // This initializes the worker, informing it about the other workers in the
+  // cluster and exchanging authentication tokens which will be used in all
+  // other RPCs to detect whether the worker has restarted.
+  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);
+
+  // This takes a list of Execute and DeleteTensorHandle operations and enqueues
+  // (in async mode) or executes (in sync mode) them on the remote server.
+  // All outputs of ops which were not explicitly deleted with
+  // DeleteTensorHandle entries will be assumed to be alive and are usable by
+  // future calls to Enqueue.
+  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);
+
+  // Takes a set of op IDs and waits until those ops are done. Returns any error
+  // in the stream so far.
+  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);
+
+  // Contexts are always created with a deadline and no RPCs within a deadline
+  // will trigger a context garbage collection. KeepAlive calls can be used to
+  // delay this.
+  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);
+
+  // Closes the context. No calls to other methods using the existing context ID
+  // are valid after this.
+  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);
+
+  // Takes a FunctionDef and makes it enqueable on the remote worker.
+  rpc RegisterFunction(RegisterFunctionRequest)
+      returns (RegisterFunctionResponse);
+}
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 0437cb1b83e12d83bf3b8713e2940a6d45173fb5..96c91536f7386556c4c75ef463c4f781edd0aebb 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -23,6 +23,7 @@ option java_package = "org.tensorflow.distruntime";
 
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
+import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
@@ -264,3 +265,70 @@ message ListDevicesResponse {
   repeated DeviceAttributes local_device = 1;
   repeated DeviceAttributes remote_device = 2;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// MakeCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message MakeCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // Options that define the behavior of the created callable.
+  CallableOptions options = 2;
+}
+
+message MakeCallableResponse {
+  // A handle to the created callable.
+  int64 handle = 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// RunCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message RunCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+
+  // Values of the tensors passed as arguments to the callable, in the order
+  // defined in the CallableOptions.feed field passed to MakeCallable.
+  repeated TensorProto feed = 3;
+}
+
+message RunCallableResponse {
+  // Values of the tensors returned by the callable, in the order defined in the
+  // CallableOptions.fetch field passed to MakeCallable.
+  repeated TensorProto fetch = 1;
+
+  // Returned metadata if requested in the options.
+  RunMetadata metadata = 2;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ReleaseCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message ReleaseCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+}
+
+message ReleaseCallableResponse {
+}
diff --git a/tensorflow/core/protobuf/master_service.proto b/tensorflow/core/protobuf/master_service.proto
index 771c80562a7885e983e7e25f167e0ca56bba6cc8..1170611f37232704f7702185a3009bd1fa1e3f64 100644
--- a/tensorflow/core/protobuf/master_service.proto
+++ b/tensorflow/core/protobuf/master_service.proto
@@ -107,4 +107,13 @@ service MasterService {
   // will no longer affect fresh ones via the resources in containers listed in
   // the ResetRequest.  See ResetRequest for more details.
   rpc Reset(ResetRequest) returns (ResetResponse);
+
+  // Registers a callable for execution with RunCallable.
+  rpc MakeCallable(MakeCallableRequest) returns (MakeCallableResponse);
+
+  // Executes a callable registered with MakeCallable.
+  rpc RunCallable(RunCallableRequest) returns (RunCallableResponse);
+
+  // Frees resources associated with a callable registered with MakeCallable.
+  rpc ReleaseCallable(ReleaseCallableRequest) returns (ReleaseCallableResponse);
 }
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 0e9e202bc9a2d2368772c7fede9eb877d9d99023..029b27cd04370516404b00b7583914057087fe05 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -6,6 +6,8 @@ option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
+import "tensorflow/core/framework/attr_value.proto";
+
 message AutoParallelOptions {
   bool enable = 1;
   int32 num_replicas = 2;
@@ -29,32 +31,61 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
+  // Enum controling the number of times to run optimizers. The default is to
+  // run them once.
+  enum NumIterationsType {
+    DEFAULT_NUM_ITERS = 0;
+    ONE = 1;
+    TWO = 2;
+  }
+
   // Optimize tensor layouts (default is ON)
+  // e.g. This will try to use NCHW layout on GPU which is faster.
   Toggle layout_optimizer = 1;
   // Fold constants (default is ON)
+  // Statically infer the value of tensors when possible, and materialize the
+  // result using constants.
   Toggle constant_folding = 3;
   // Arithmetic optimizations (default is ON)
+  // e.g. Simplify arithmetic ops; merge ops with same value (like constants).
   Toggle arithmetic_optimization = 7;
   // Control dependency optimizations (default is ON).
+  // Remove redundant control dependencies, which may enable other optimization.
   Toggle dependency_optimization = 8;
-  // Loop optimizations (default is OFF).
+  // Loop optimizations (default is ON).
   Toggle loop_optimization = 9;
+  // Function optimizations (default is ON).
+  Toggle function_optimization = 10;
+  // Strips debug-related nodes from the graph (off by default).
+  Toggle debug_stripper = 11;
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
+  // Controls how many times we run the optimizers in meta optimizer (default
+  // is once).
+  NumIterationsType meta_optimizer_iterations = 12;
+
   enum MemOptType {
-    // The default setting (SCHEDULING_HEURISTICS only)
+    // The default setting (SCHEDULING and SWAPPING HEURISTICS only)
     DEFAULT_MEM_OPT = 0;
     // Disabled in the meta-optimizer.
     NO_MEM_OPT = 1;
     // Driven by manual op-level annotations.
     MANUAL = 2;
+
     // Driven by heuristics. The behavior of these heuristics is subject to
     // change. Currently includes an experimental recomputation and swapping
     // heuristics. Manual annotations are respected, but additional nodes are
     // selected automatically.
+
+    // Swapping heuristic will move a tensor from the GPU to the CPU and move
+    // it back when needed to reduce peak memory usage.
     SWAPPING_HEURISTICS = 4;
+    // Recomputation heuristics will recompute ops (such as Relu activation)
+    // during backprop instead of storing them, reducing peak memory usage.
     RECOMPUTATION_HEURISTICS = 5;
+    // Scheduling will split big ops such as AddN and try to enforce a schedule
+    // of the new computations that decreases peak memory usage.
     SCHEDULING_HEURISTICS = 6;
     // Use any combination of swapping and recomputation heuristics.
     HEURISTICS = 3;
@@ -63,16 +94,15 @@ message RewriterConfig {
   // effect on manually requested memory optimization passes in the optimizers
   // field.
   MemOptType memory_optimization = 4;
-  // The prefix for nodes which are valid outputs of recomputations. Inputs to
-  // nodes with this name prefix may be recomputed (subject either to manual
-  // annotation of those input nodes or to manual annotation and heuristics
-  // depending on memory_optimization), but the prefixed nodes themselves will
-  // not be recomputed. Typically this will be "gradients/", indicating that
-  // activations from the forward pass of a graph may be recomputed as inputs to
-  // gradients, but may be adjusted if gradients are inside a name scope or if
-  // inputs to non-gradients should be recomputed. Defaults to "gradients/" if
-  // empty or not set.
-  string memory_optimizer_target_node_name_prefix = 6;
+  // A node name scope for node names which are valid outputs of recompuations.
+  // Inputs to nodes that match this scope may be recomputed (subject either to
+  // manual annotation of those input nodes or to manual annotation and
+  // heuristics depending on memory_optimization), but the nodes themselves will
+  // not be recomputed. This matches any sub-scopes as well, meaning the scope
+  // can appear not just as a top-level scope. For example, if the value is
+  // "gradients/", the default, it will match node name "gradients/foo",
+  // "foo/gradients/bar", but not "foo_gradients/"
+  string memory_optimizer_target_node_name_scope = 6;
 
   // Configures AutoParallel optimization passes either through the
   // meta-optimizer or when manually specified through the optimizers field.
@@ -87,5 +117,17 @@ message RewriterConfig {
   // ("autoparallel"). Memory optimization passes ("memory") invoked here are
   // not configurable (in contrast to memory optimization passes through the
   // meta-optimizer) and act only on manual op annotations.
+  //
+  // Custom registered optimizers will be run after the base optimizers, in
+  // the order that they are specified.
   repeated string optimizers = 100;
+
+  // Message to describe custom graph optimizer and its parameters
+  message CustomGraphOptimizer {
+    string name = 1;
+    map<string, AttrValue> parameter_map = 2;
+  }
+
+  // list of CustomGraphOptimizers to apply.
+  repeated CustomGraphOptimizer custom_optimizers = 200;
 }
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 3e7289bd919015dcb6712ee89ccee3605dc6d907..1819a352481e3d4be29b2f4931c81244742dc872 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -103,6 +103,9 @@ message RegisterGraphRequest {
   // Subgraphs are scoped within one session.
   string session_handle = 1;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 6;
+
   // "graph_def" has the subgraph of nodes for this worker, with each node
   // having its device_name filled in.
   GraphDef graph_def = 2;
@@ -144,6 +147,9 @@ message DeregisterGraphRequest {
   // empty, a single global namespace is used.
   string session_handle = 2;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 3;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -200,6 +206,9 @@ message RunGraphRequest {
   // search for the graph_handle.
   string session_handle = 8;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 10;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -234,7 +243,7 @@ message RunGraphRequest {
   // truncate long metadata messages.
   bool store_errors_in_response_body = 9;
 
-  // Next: 10
+  // Next: 11
 }
 
 message RunGraphResponse {
diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h
index 75ad50f6f2d59a8f4b8282d8e7b395e2323d62e1..d58c877cfd3a820ba6671433defe36693df539c7 100644
--- a/tensorflow/core/public/session.h
+++ b/tensorflow/core/public/session.h
@@ -195,6 +195,41 @@ class Session {
     return errors::Unimplemented(
         "LocalDeviceManager is not supported for this session.");
   }
+
+  /// \brief A handle to a subgraph, created with `Session::MakeCallable()`.
+  typedef int64 CallableHandle;
+
+  /// \brief Creates a `handle` for invoking the subgraph defined by
+  /// `callable_options`.
+  /// NOTE: This API is still experimental and may change.
+  virtual Status MakeCallable(const CallableOptions& callable_options,
+                              CallableHandle* out_handle) {
+    return errors::Unimplemented(
+        "MakeCallable is not supported for this session.");
+  }
+
+  /// \brief Invokes the subgraph named by `handle` with the given options and
+  /// input tensors.
+  ///
+  /// The order of tensors in `feed_tensors` must and `fetch_tensors` will
+  /// match the order of names in `CallableOptions::feed()` and
+  /// `CallableOptions::fetch()` when this subgraph was created.
+  /// NOTE: This API is still experimental and may change.
+  virtual Status RunCallable(CallableHandle handle,
+                             const std::vector<Tensor>& feed_tensors,
+                             std::vector<Tensor>* fetch_tensors,
+                             RunMetadata* run_metadata) {
+    return errors::Unimplemented(
+        "RunCallable is not supported for this session.");
+  }
+
+  /// \brief Releases resources associated with the given `handle` in this
+  /// session.
+  /// NOTE: This API is still experimental and may change.
+  virtual Status ReleaseCallable(CallableHandle handle) {
+    return errors::Unimplemented(
+        "ReleaseCallable is not supported for this session.");
+  }
 };
 
 /// \brief Create a new session with the given options.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 7405e01e14494fb6e4e241f1a2b8bc33a4200fa7..ba69efb289a42a5ee2b8dbcba29603dc915671d8 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,7 +19,7 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 6
+#define TF_MINOR_VERSION 8
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
diff --git a/tensorflow/core/user_ops/fact.cc b/tensorflow/core/user_ops/fact.cc
index 3a4fc8115a7f91badfeda369a599b3dba3057c63..2e8b22a49b620d08aa4f13da35e847b362dd2b3a 100644
--- a/tensorflow/core/user_ops/fact.cc
+++ b/tensorflow/core/user_ops/fact.cc
@@ -15,10 +15,13 @@ limitations under the License.
 
 // An example Op.
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
-REGISTER_OP("Fact").Output("fact: string");
+REGISTER_OP("Fact")
+    .Output("fact: string")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
 
 class FactOp : public tensorflow::OpKernel {
  public:
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 3efc703faf7b23958eb49d59fd0dd4565f090bbe..480ce94fcaeddd62c30089d09752cc4d965ebf01 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -28,7 +29,9 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                      const std::function<bool(string)>& hook,
                      bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     *value_parsing_ok = hook(arg.ToString());
     return true;
   }
@@ -40,7 +43,9 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int32)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int32 parsed_int32;
     if (sscanf(arg.data(), "%d%c", &parsed_int32, &extra) != 1) {
@@ -60,7 +65,9 @@ bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int64)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int64 parsed_int64;
     if (sscanf(arg.data(), "%lld%c", &parsed_int64, &extra) != 1) {
@@ -80,7 +87,8 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    const std::function<bool(bool)>& hook,
                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag)) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *value_parsing_ok = hook(true);
       return true;
@@ -107,7 +115,9 @@ bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(float)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     float parsed_float;
     if (sscanf(arg.data(), "%f%c", &parsed_float, &extra) != 1) {
diff --git a/tensorflow/core/util/ctc/BUILD b/tensorflow/core/util/ctc/BUILD
index 1521349e4ddf064ce55726a9c6ca400ae6342c15..317420204e20ab2994ca9b7b7f4cc39e688e728f 100644
--- a/tensorflow/core/util/ctc/BUILD
+++ b/tensorflow/core/util/ctc/BUILD
@@ -26,18 +26,6 @@ alias(
     actual = ":mobile_srcs",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "ctc",
     deps = [
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 3c59524cb6f85911544b8f2d7d3339e19af7f5b4..0ab875625ff617028c4bc53fa8ccba0488c3d0d1 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -21,6 +21,11 @@ limitations under the License.
 #include "tensorflow/core/util/cuda_device_functions.h"
 #include "tensorflow/core/util/cuda_launch_config.h"
 
+#if CUDA_VERSION >= 7050
+#include "cuda/include/cuda_fp16.h"
+#define TF_HAS_CUDA_FP16
+#endif
+
 // Deprecated, use 'for(int i : CudaGridRangeX(n))' instead.
 #define CUDA_1D_KERNEL_LOOP(i, n) \
   for (int i : ::tensorflow::CudaGridRangeX<int>(n))
diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc
index c1bc0f33785ef1576bf0f0d3db71e8daa1a51801..ff9c108f10cdbfa6f1ca3bb966d42e32fb223c74 100644
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@@ -408,7 +408,7 @@ static void MergeDevNamesError(const string& name_a, const string& name_b,
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   Status s = DeviceNameUtils::MergeDevNames(&target_a, Name(name_b));
   EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(s.error_message()).contains(expected_error_substr))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), expected_error_substr))
       << s;
 }
 
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index f1ec497a6772c84d599a76169515ef417c11f430..b87dce0dff536733397bff946c12e992a8097666 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -144,7 +145,7 @@ bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff,
 
   int first_control_input = actual.input_size();
   for (int i = 0; i < actual.input_size(); ++i) {
-    if (StringPiece(actual.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(actual.input(i), "^")) {
       first_control_input = i;
       break;
     }
@@ -240,7 +241,7 @@ uint64 NodeDefHash(const NodeDef& ndef, const EqualGraphDefOptions& options) {
   // Normal inputs. Order important.
   int first_control_input = ndef.input_size();
   for (int i = 0; i < ndef.input_size(); ++i) {
-    if (StringPiece(ndef.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(ndef.input(i), "^")) {
       first_control_input = i;
       break;
     }
diff --git a/tensorflow/core/util/event.proto b/tensorflow/core/util/event.proto
index 65d2c5a09c5c98a70e834e182d5751350506a1a1..9ce85be551191dee754f34ec531e65f3eac056b7 100644
--- a/tensorflow/core/util/event.proto
+++ b/tensorflow/core/util/event.proto
@@ -81,7 +81,35 @@ message TaggedRunMetadata {
   bytes run_metadata = 2;
 }
 
-// For communicating live events back to a coordinator
-message SessionStatus {
-  repeated Event event = 1;
+// Worker heartbeat messages.  Support for these operations is currently
+// internal and expected to change.
+
+// Current health status of a worker.
+enum WorkerHealth {
+  OK = 0;  // By default a worker is healthy.
+  RECEIVED_SHUTDOWN_SIGNAL = 1;
+  INTERNAL_ERROR = 2;
+}
+
+// Indicates the behavior of the worker when an internal error or shutdown
+// signal is received.
+enum WorkerShutdownMode {
+  DEFAULT = 0;
+  SHUTDOWN_IMMEDIATELY = 1;
+  WAIT_FOR_COORDINATOR = 2;
+}
+
+message WatchdogConfig {
+  int64 timeout_ms = 1;
+}
+
+message WorkerHeartbeatRequest {
+  WorkerShutdownMode shutdown_mode = 1;
+  WatchdogConfig watchdog_config = 2;
+}
+
+message WorkerHeartbeatResponse {
+  WorkerHealth health_status = 1;
+  repeated Event worker_log = 2;
+  string hostname = 3;
 }
diff --git a/tensorflow/core/util/events_writer.cc b/tensorflow/core/util/events_writer.cc
index 49507616ed8c6461f8d59d8899d93abb4ba58cd2..c50e329bda4b44cb5390081d889d81f231b031a5 100644
--- a/tensorflow/core/util/events_writer.cc
+++ b/tensorflow/core/util/events_writer.cc
@@ -122,9 +122,11 @@ Status EventsWriter::Flush() {
   CHECK(recordio_file_ != nullptr) << "Unexpected NULL file";
 
   TF_RETURN_WITH_CONTEXT_IF_ERROR(recordio_writer_->Flush(), "Failed to flush ",
-                                  num_outstanding_events_, " to ", filename_);
+                                  num_outstanding_events_, " events to ",
+                                  filename_);
   TF_RETURN_WITH_CONTEXT_IF_ERROR(recordio_file_->Sync(), "Failed to sync ",
-                                  num_outstanding_events_, " to ", filename_);
+                                  num_outstanding_events_, " events to ",
+                                  filename_);
 
   // The FileStillExists() condition is necessary because
   // recordio_writer_->Sync() can return OK even if the underlying
@@ -135,7 +137,8 @@ Status EventsWriter::Flush() {
   // disappearing file, in case for some file system File::Exists() is
   // false after File::Open() but before File::Sync().
   TF_RETURN_WITH_CONTEXT_IF_ERROR(FileStillExists(), "Failed to flush ",
-                                  num_outstanding_events_, " to ", filename_);
+                                  num_outstanding_events_, " events to ",
+                                  filename_);
   VLOG(1) << "Wrote " << num_outstanding_events_ << " events to disk.";
   num_outstanding_events_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/util/guarded_philox_random.cc b/tensorflow/core/util/guarded_philox_random.cc
index 2d1e9a293e7b979076aa4d4f951b984491904c72..7c7ba4cef6bf1d20b1a9487bb9883c2a4e82bb9c 100644
--- a/tensorflow/core/util/guarded_philox_random.cc
+++ b/tensorflow/core/util/guarded_philox_random.cc
@@ -43,6 +43,14 @@ void GuardedPhiloxRandom::Init(int64 seed, int64 seed2) {
   initialized_ = true;
 }
 
+void GuardedPhiloxRandom::Init(random::PhiloxRandom::ResultType counter,
+                               random::PhiloxRandom::Key key) {
+  CHECK(!initialized_);
+  mutex_lock lock(mu_);
+  generator_ = random::PhiloxRandom(counter, key);
+  initialized_ = true;
+}
+
 random::PhiloxRandom GuardedPhiloxRandom::ReserveSamples128(int64 samples) {
   CHECK(initialized_);
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/util/guarded_philox_random.h b/tensorflow/core/util/guarded_philox_random.h
index 5b94a76777b25e0aea29ae0898e1a5e8d7fab80e..44970eb9499be37a6bdf7ad61256c72aac3bccda 100644
--- a/tensorflow/core/util/guarded_philox_random.h
+++ b/tensorflow/core/util/guarded_philox_random.h
@@ -49,6 +49,8 @@ class GuardedPhiloxRandom {
 
   // Initialize with given seeds.
   void Init(int64 seed, int64 seed2);
+  void Init(random::PhiloxRandom::ResultType counter,
+            random::PhiloxRandom::Key key);
 
   // Reserve a certain number of 128-bit samples.
   // This function is thread safe.  The returned generator is valid for the
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index a0f43d2d4a745722d2095b6817c9156415c78127..d3439cbc9385184da830f70e53acb27eff570ba1 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/util/memmapped_file_system.h"
 
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/memmapped_file_system.pb.h"
 
@@ -157,6 +158,12 @@ Status MemmappedFileSystem::GetChildren(const string& filename,
   return errors::Unimplemented("memmapped format doesn't support GetChildren");
 }
 
+Status MemmappedFileSystem::GetMatchingPaths(const string& pattern,
+                                             std::vector<string>* results) {
+  return errors::Unimplemented(
+      "memmapped format doesn't support GetMatchingPaths");
+}
+
 Status MemmappedFileSystem::DeleteFile(const string& filename) {
   return errors::Unimplemented("memmapped format doesn't support DeleteFile");
 }
@@ -178,7 +185,7 @@ const void* MemmappedFileSystem::GetMemoryWithOffset(uint64 offset) const {
   return reinterpret_cast<const uint8*>(mapped_memory_->data()) + offset;
 }
 
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 constexpr char* MemmappedFileSystem::kMemmappedPackagePrefix;
 constexpr char* MemmappedFileSystem::kMemmappedPackageDefaultGraphDef;
 #else
@@ -236,7 +243,7 @@ Status MemmappedFileSystem::InitializeFromFile(Env* env,
 }
 
 bool MemmappedFileSystem::IsMemmappedPackageFilename(const string& filename) {
-  return StringPiece(filename).starts_with(kMemmappedPackagePrefix);
+  return str_util::StartsWith(filename, kMemmappedPackagePrefix);
 }
 
 namespace {
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 541587aeab05242f0c71beb139fe74c768b810b0..958e23d28e5ee8cf0052f84064f73315bc1d117b 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -53,7 +53,7 @@ class MemmappedFileSystem : public FileSystem {
  public:
   // Memmapped regions use this prefix to distinguish from
   // the filesystem.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackagePrefix =
 #else
   static constexpr char kMemmappedPackagePrefix[] =
@@ -61,7 +61,7 @@ class MemmappedFileSystem : public FileSystem {
       "memmapped_package://";
 
 // The default graphdef in the package.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackageDefaultGraphDef =
 #else
   static constexpr char kMemmappedPackageDefaultGraphDef[] =
@@ -85,6 +85,8 @@ class MemmappedFileSystem : public FileSystem {
   Status NewAppendableFile(const string& fname,
                            std::unique_ptr<WritableFile>* result) override;
   Status GetChildren(const string& dir, std::vector<string>* r) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
   Status DeleteFile(const string& f) override;
   Status CreateDir(const string& d) override;
   Status DeleteDir(const string& d) override;
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index eda966bc3342912334f90a7beddf8ccd3aefa68d..50a8e305749eec2bc59156399f9a9506a527305d 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -19,6 +19,8 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include <unordered_map>
+#include <utility>
 
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -45,6 +47,10 @@ using mkldnn::primitive;
 using mkldnn::reorder;
 #endif
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 // The file contains a number of utility classes and functions used by MKL
 // enabled kernels
 
@@ -1112,8 +1118,10 @@ inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
 // Forward the MKL shape ONLY (used in elementwise and other ops where
 // we call the eigen implementation and MKL shape is not used)
 inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
-                                      uint32 idx_data_in, uint32_t idx_data_out) {
-  uint32 idx_meta_in = GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
+                                      uint32 idx_data_in,
+                                      uint32_t idx_data_out) {
+  uint32 idx_meta_in =
+      GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
   uint32 idx_meta_out =
       GetTensorMetaDataIndex(idx_data_out, context->num_outputs());
 
@@ -1577,10 +1585,10 @@ class MklDnnData {
   }
 
   /// Set function for data buffer of user memory primitive.
-  inline void* SetUsrMemDataHandle(void* data_buffer) {
+  inline void SetUsrMemDataHandle(void* data_buffer) {
     CHECK_NOTNULL(user_memory_);
     CHECK_NOTNULL(data_buffer);
-    return user_memory_->set_data_handle(data_buffer);
+    user_memory_->set_data_handle(data_buffer);
   }
 
   /// Set function for data buffer of user memory primitive.
@@ -1753,7 +1761,90 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_ML
+/// Base class for operations with reuse of DNN primitives
+///
+class DnnOp {
+ public:
+  virtual ~DnnOp() {}
+
+  // Dummy data. Its size, hard-coded as 256 here, does
+  // not matter since MKL should never operate on this buffer.
+  unsigned char DummyData[256];
+};
+
+const mkldnn::memory::dims NONE_DIMS = {};
+// This constant is used to declare dummy buffer (size), for MKL primitives
+template <typename T>
+class DnnOpFactory {
+ public:
+  DnnOpFactory() {}
+  ~DnnOpFactory() {}
+
+  DnnOp* GetOp(const std::string& key) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+    if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) {
+      return nullptr;
+    } else {
+      return stream_iter->second;
+    }
+  }
+
+  void SetOp(const std::string& key, DnnOp* op) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+
+    CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end());
+
+    DnnOpFactory<T>::GetHashMap()[key] = op;
+  }
+
+ private:
+  static inline std::unordered_map<std::string, DnnOp*> &GetHashMap() {
+    static thread_local std::unordered_map<std::string, DnnOp*> map_;
+    return map_;
+  }
+};
+
+// utility class for creating keys of MKL primitive pool.
+class FactoryKeyCreator {
+ public:
+  FactoryKeyCreator() {
+    key_.reserve(kMaxKeyLength);
+  }
+
+  ~FactoryKeyCreator() {}
+
+  void AddAsKey(const string &str) {
+    auto buffer = reinterpret_cast<const char *>(str.c_str());
+    Append(buffer, str.length());
+  }
+
+  void AddAsKey(const mkldnn::memory::dims &dims) {
+    for (unsigned int i = 0; i < dims.size(); i++) {
+      AddAsKey<int>(dims[i]);
+    }
+  }
+
+  template <typename T>
+  void AddAsKey(const T data) {
+    auto buffer = reinterpret_cast<const char *>(&data);
+    Append(buffer, sizeof(T));
+  }
+
+  std::string GetKey() {
+    return key_;
+  }
+
+ private:
+  string key_;
+  const char delimiter = 'x';
+  const int kMaxKeyLength = 256;
+  void Append(const char* data, int len) {
+    key_.append(data, len);
+    key_.append(1, delimiter);
+  }
+};
+
+#endif  // INTEL_MKL_DNN
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/util/port.cc b/tensorflow/core/util/port.cc
index d93b971f8567059e129d3349d44c50ebad5f607c..490c584dc5cf67cb765cdec583a078e7cf6686c1 100644
--- a/tensorflow/core/util/port.cc
+++ b/tensorflow/core/util/port.cc
@@ -39,4 +39,11 @@ bool CudaSupportsHalfMatMulAndConv() {
 #endif
 }
 
+bool IsMklEnabled() {
+#ifdef INTEL_MKL
+  return true;
+#else
+  return false;
+#endif
+}
 }  // end namespace tensorflow
diff --git a/tensorflow/core/util/port.h b/tensorflow/core/util/port.h
index ed65341711fff08db01315a662aa6e140e8451b2..981def9d22a029731366d6de0e3d2f5eefa0d8e1 100644
--- a/tensorflow/core/util/port.h
+++ b/tensorflow/core/util/port.h
@@ -25,6 +25,9 @@ bool IsGoogleCudaEnabled();
 // half-precision matrix multiplications and convolution operations.
 bool CudaSupportsHalfMatMulAndConv();
 
+// Returns true if INTEL_MKL is defined
+bool IsMklEnabled();
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_UTIL_PORT_H_
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ade14ed1620e92a2246963eaa0b317275dd4ad3d
--- /dev/null
+++ b/tensorflow/core/util/proto/BUILD
@@ -0,0 +1,62 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "decode",
+    hdrs = ["decode.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptors",
+    srcs = ["descriptors.cc"],
+    hdrs = ["descriptors.h"],
+    deps = [
+        ":descriptor_pool_registry",
+        ":local_descriptor_pool_registration",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptor_pool_registry",
+    srcs = ["descriptor_pool_registry.cc"],
+    hdrs = ["descriptor_pool_registry.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "descriptor_pool_registry_test",
+    srcs = ["descriptor_pool_registry_test.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Depending on this target adds support for using the special
+# value "local://" (or "") for descriptor source, in which case
+# descriptors linked into the code will be searched.
+cc_library(
+    name = "local_descriptor_pool_registration",
+    srcs = ["local_descriptor_pool_registration.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
new file mode 100644
index 0000000000000000000000000000000000000000..74634a356a84db0fb72a15e223f373598c668eee
--- /dev/null
+++ b/tensorflow/core/util/proto/decode.h
@@ -0,0 +1,592 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Inline functions for parsing the protocol buffers wire format.
+//
+// These functions have been optimized at the expense of safety.
+// They are broken out into a separate file for readability but are
+// not intended for use by clients other than the decode_proto op.
+//
+// The calling code in the decode_proto op does some fairly
+// complicated things to ensure that this code is called
+// safely. Changes to this code should be thoroughly fuzz tested.
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace internal {
+
+using tensorflow::protobuf::internal::WireFormatLite;
+using tensorflow::protobuf::io::CodedInputStream;
+using tensorflow::protobuf::io::CodedOutputStream;
+using tensorflow::protobuf::io::StringOutputStream;
+
+// Converts an uint64 to an int64 without loss of information.
+// Unsigned values greater than INT64_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int64 WrapUnsignedAsSigned64(uint64 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT64_MAX) {
+    return static_cast<int64>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT64_MIN) {
+    return static_cast<int64>(unsigned_value - INT64_MIN) + INT64_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Converts an uint32 to an int32 without loss of information.
+// Unsigned values greater than INT_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT_MAX) {
+    return static_cast<int32>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT_MIN) {
+    return static_cast<int32>(unsigned_value - INT_MIN) + INT_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value);
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+// This is slightly less efficient than the private version in
+// coded_stream.cc but we duplicate less code by calling
+// the 64 bit version instead of copying the code.
+inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
+                                          uint32* value) {
+  uint64 tmp;
+  const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
+  *value = tmp & 0xffffffff;
+  return buf;
+}
+
+// Reads a single proto field value from a byte array into an array.
+// The array is part of a Tensor that was allocated by the caller
+// with type TensorType, while DeclaredType is the proto field type.
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+const uint8* ReadFromArray(const uint8* buf, TensorType* value);
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int32>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int64>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_uint64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_uint64,
+                                               WireFormatLite::TYPE_FIXED64>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SFIXED32>(
+    const uint8* buf, int32* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<int32,
+                                                WireFormatLite::TYPE_SFIXED32>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SFIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_int64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_int64,
+                                               WireFormatLite::TYPE_SFIXED64>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<float, WireFormatLite::TYPE_FLOAT>(
+    const uint8* buf, float* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<float,
+                                                WireFormatLite::TYPE_FLOAT>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
+    const uint8* buf, double* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<double,
+                                                WireFormatLite::TYPE_DOUBLE>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
+    const uint8* buf, bool* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = temp != 0;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
+    const uint8* buf, int* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int>(temp);
+  return buf;
+}
+
+// Reads packed values from an array.
+// Stride is set to 1 for repeated fields, and 0 for non-repeated fields
+// (where any value overwrites previous values).
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+inline int ReadPackedPrimitives(const void* bufp, const size_t len,
+                                const int index, const int stride,
+                                void* datap) {
+  const uint8* buf = reinterpret_cast<const uint8*>(bufp);
+  const uint8* bound = buf + len;
+  TensorType* data = reinterpret_cast<TensorType*>(datap) + index;
+  int count;
+
+  // This could overrun the bound by stride-1. This is defended
+  // against in the caller, where it ensures that the input buffer
+  // contains complete values.
+  for (count = 0; buf < bound; count += stride) {
+    buf = ReadFromArray<TensorType, DeclaredType>(buf, data + count);
+  }
+  return count;
+}
+
+// Reads a primitive value field from a serialized proto.
+// The value is parsed from the serialized format, then static_cast
+// to the desired type for TensorFlow and stored.
+template <class ValueType, class TensorType,
+          enum WireFormatLite::FieldType DeclaredType>
+inline Status ReadPrimitive(CodedInputStream* input, int index, void* data) {
+  ValueType v;
+  if (!WireFormatLite::ReadPrimitive<ValueType, DeclaredType>(input, &v)) {
+    return errors::DataLoss("Failed reading primitive");
+  }
+
+  reinterpret_cast<TensorType*>(data)[index] = v;
+  return Status::OK();
+}
+
+// Reads a string, submessage, or other variable-length field from a
+// serialized proto.
+// May read all or part of a repeated field.
+inline Status ReadBytes(CodedInputStream* input, int index, void* datap) {
+  string* data = reinterpret_cast<string*>(datap) + index;
+  if (!WireFormatLite::ReadBytes(input, data)) {
+    return errors::DataLoss("Failed reading bytes");
+  }
+  return Status::OK();
+}
+
+// Reads a tag-delimited field (TYPE_GROUP) from a serialized proto,
+// as a bytestring.
+inline Status ReadGroupBytes(CodedInputStream* input, int field_number,
+                             int index, void* datap) {
+  // WireFormatLite::SkipField has an option to emit the
+  // skipped bytes to an output stream. We could do better by implementing our
+  // own scanner but this is simpler for now.
+  // TODO(nix): there is a faster way to grab TYPE_GROUP bytes by relying
+  // on input->IsFlat() == true and using input->GetDirectBufferPointer()
+  // with input->CurrentPosition().
+  string* data = reinterpret_cast<string*>(datap) + index;
+  StringOutputStream string_stream(data);
+  CodedOutputStream out(&string_stream);
+  if (!WireFormatLite::SkipField(
+          input,
+          WireFormatLite::MakeTag(field_number,
+                                  WireFormatLite::WIRETYPE_START_GROUP),
+          &out)) {
+    return errors::DataLoss("Failed reading group");
+  }
+  return Status::OK();
+}
+
+// Reads a single field value from a CodedInputStream into a tensor.
+inline Status ReadValue(CodedInputStream* input,
+                        WireFormatLite::FieldType field_type, int field_number,
+                        DataType dtype, int index, void* datap) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return ReadPrimitive<double, double, WireFormatLite::TYPE_DOUBLE>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FLOAT:
+      if (dtype == DataType::DT_FLOAT) {
+        return ReadPrimitive<float, float, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_DOUBLE) {
+        return ReadPrimitive<float, double, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FLOAT");
+    case WireFormatLite::TYPE_INT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_INT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_UINT64:
+      return ReadPrimitive<protobuf_uint64, int64, WireFormatLite::TYPE_UINT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_INT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_INT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FIXED64:
+      return ReadPrimitive<protobuf_uint64, int64,
+                           WireFormatLite::TYPE_FIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      return ReadPrimitive<bool, bool, WireFormatLite::TYPE_BOOL>(input, index,
+                                                                  datap);
+    case WireFormatLite::TYPE_STRING:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_GROUP:
+      return ReadGroupBytes(input, field_number, index, datap);
+    case WireFormatLite::TYPE_MESSAGE:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_BYTES:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_ENUM>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SFIXED32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED64:
+      return ReadPrimitive<protobuf_int64, int64,
+                           WireFormatLite::TYPE_SFIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_SINT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SINT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SINT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_SINT64>(
+          input, index, datap);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads and stores a length-delimited list of values.
+inline Status ReadPackedFromArray(const void* buf, size_t buf_size,
+                                  const WireFormatLite::FieldType field_type,
+                                  const int field_number, const DataType dtype,
+                                  const int stride, int* index, void* data) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      *index += ReadPackedPrimitives<double, WireFormatLite::TYPE_DOUBLE>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FLOAT:
+      *index += ReadPackedPrimitives<float, WireFormatLite::TYPE_FLOAT>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_INT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_UINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_INT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      *index += ReadPackedPrimitives<bool, WireFormatLite::TYPE_BOOL>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return errors::DataLoss("Non-primitive type encountered as packed");
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_ENUM>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_SFIXED32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SFIXED32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SFIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SFIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SINT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads a varint from the given buffer, write it to *value, and return the
+// new buffer pointer.
+// This was copied from coded_stream.cc where it is private.
+// Important: This routine may read as much as kMaxVarintBytes from
+// the buffer. It is the caller's responsibility to make sure that there is
+// enough space in the buffer.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value) {
+  const uint8* ptr = buffer;
+  uint32 b;
+
+  // Splitting into 32-bit pieces gives better performance on 32-bit
+  // processors.
+  uint32 part0 = 0, part1 = 0, part2 = 0;
+
+  b = *(ptr++);
+  part0 = b;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80;
+  b = *(ptr++);
+  part0 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 7;
+  b = *(ptr++);
+  part0 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 14;
+  b = *(ptr++);
+  part0 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 21;
+  b = *(ptr++);
+  part1 = b;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80;
+  b = *(ptr++);
+  part1 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 7;
+  b = *(ptr++);
+  part1 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 14;
+  b = *(ptr++);
+  part1 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 21;
+  b = *(ptr++);
+  part2 = b;
+  if (!(b & 0x80)) goto done;
+  part2 -= 0x80;
+  b = *(ptr++);
+  part2 += b << 7;
+  if (!(b & 0x80)) goto done;
+  // "part2 -= 0x80 << 7" is irrelevant because (0x80 << 7) << 56 is 0.
+
+  // We have overrun the maximum size of a varint (10 bytes).  Assume
+  // the data is corrupt.
+  *ok = false;
+  return ptr;
+
+done:
+  *ok = true;
+  *value = (static_cast<uint64>(part0)) | (static_cast<uint64>(part1) << 28) |
+           (static_cast<uint64>(part2) << 56);
+  return ptr;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.cc b/tensorflow/core/util/proto/descriptor_pool_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f0423f76b74c2b24555e6908a2b61b3ba28598f
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+
+DescriptorPoolRegistry* DescriptorPoolRegistry::Global() {
+  static DescriptorPoolRegistry* registry = new DescriptorPoolRegistry;
+  return registry;
+}
+
+DescriptorPoolRegistry::DescriptorPoolFn* DescriptorPoolRegistry::Get(
+    const string& source) {
+  auto found = fns_.find(source);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void DescriptorPoolRegistry::Register(
+    const string& source,
+    const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+  auto existing = Get(source);
+  CHECK_EQ(existing, nullptr)
+      << "descriptor pool for source: " << source << " already registered";
+  fns_.insert(std::pair<const string&, DescriptorPoolFn>(source, pool_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.h b/tensorflow/core/util/proto/descriptor_pool_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..66c20e9e41337292bccf0c11c6c0b94a05e5df54
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.h
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+class DescriptorPoolRegistry {
+ public:
+  typedef std::function<Status(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool)>
+      DescriptorPoolFn;
+
+  // Returns a pointer to a global DescriptorPoolRegistry object.
+  static DescriptorPoolRegistry* Global();
+
+  // Returns a pointer to a descriptor pool function for the given source.
+  DescriptorPoolFn* Get(const string& source);
+
+  // Registers a descriptor pool factory.
+  void Register(const string& source, const DescriptorPoolFn& pool_fn);
+
+ private:
+  std::map<string, DescriptorPoolFn> fns_;
+};
+
+namespace descriptor_pool_registration {
+
+class DescriptorPoolRegistration {
+ public:
+  DescriptorPoolRegistration(
+      const string& source,
+      const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+    DescriptorPoolRegistry::Global()->Register(source, pool_fn);
+  }
+};
+
+}  // namespace descriptor_pool_registration
+
+#define REGISTER_DESCRIPTOR_POOL(source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(__COUNTER__, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(ctr, source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)       \
+  static descriptor_pool_registration::DescriptorPoolRegistration \
+      descriptor_pool_registration_fn_##ctr(source, pool_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry_test.cc b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6899998ab57a2c90458db31596dc6bf00e8adc0
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("TEST POOL 1", Value::Function);
+REGISTER_DESCRIPTOR_POOL("TEST POOL 2", Value::Function);
+}  // namespace
+
+TEST(DescriptorPoolRegistryTest, TestBasic) {
+  EXPECT_EQ(DescriptorPoolRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto pool1 = DescriptorPoolRegistry::Global()->Get("TEST POOL 1");
+  EXPECT_NE(pool1, nullptr);
+  auto pool2 = DescriptorPoolRegistry::Global()->Get("TEST POOL 2");
+  EXPECT_NE(pool2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..271c85efd88de0f8acbedb3d2254af3397601c6b
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+// Build a `DescriptorPool` from the named file or URI. The file or URI
+// must be available to the current TensorFlow environment.
+//
+// The file must contiain a serialized `FileDescriptorSet`. See
+// `GetDescriptorPool()` for more information.
+Status GetDescriptorPoolFromFile(
+    tensorflow::Env* env, const string& filename,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  Status st = env->FileExists(filename);
+  if (!st.ok()) {
+    return st;
+  }
+
+  // Read and parse the FileDescriptorSet.
+  tensorflow::protobuf::FileDescriptorSet descs;
+  std::unique_ptr<tensorflow::ReadOnlyMemoryRegion> buf;
+  st = env->NewReadOnlyMemoryRegionFromFile(filename, &buf);
+  if (!st.ok()) {
+    return st;
+  }
+  if (!descs.ParseFromArray(buf->data(), buf->length())) {
+    return errors::InvalidArgument(
+        "descriptor_source contains invalid FileDescriptorSet: ", filename);
+  }
+
+  // Build a DescriptorPool from the FileDescriptorSet.
+  owned_desc_pool->reset(new tensorflow::protobuf::DescriptorPool());
+  for (const auto& filedesc : descs.file()) {
+    if ((*owned_desc_pool)->BuildFile(filedesc) == nullptr) {
+      return errors::InvalidArgument(
+          "Problem loading FileDescriptorProto (missing dependencies?): ",
+          filename);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  // Attempt to lookup the pool in the registry.
+  auto pool_fn = DescriptorPoolRegistry::Global()->Get(descriptor_source);
+  if (pool_fn != nullptr) {
+    return (*pool_fn)(desc_pool, owned_desc_pool);
+  }
+
+  // If there is no pool function registered for the given source, let the
+  // runtime find the file or URL.
+  Status status =
+      GetDescriptorPoolFromFile(env, descriptor_source, owned_desc_pool);
+  if (status.ok()) {
+    *desc_pool = owned_desc_pool->get();
+  }
+  *desc_pool = owned_desc_pool->get();
+  return status;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.h b/tensorflow/core/util/proto/descriptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..92ee8997ab28f151a7b15b0d81628988e98159f4
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+class Env;
+class Status;
+
+// Get a `DescriptorPool` object from the named `descriptor_source`.
+// `descriptor_source` may be a path to a file accessible to TensorFlow, in
+// which case it is parsed as a `FileDescriptorSet` and used to build the
+// `DescriptorPool`.
+//
+// `owned_desc_pool` will be filled in with the same pointer as `desc_pool` if
+// the caller should take ownership.
+extern tensorflow::Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
diff --git a/tensorflow/core/util/proto/local_descriptor_pool_registration.cc b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48fe0102d011724a91004ff3297e07259df87c27
--- /dev/null
+++ b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+namespace {
+
+struct LocalDescriptorPool {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    *desc_pool = ::tensorflow::protobuf::DescriptorPool::generated_pool();
+    if (*desc_pool == nullptr) {
+      return errors::InvalidArgument("Problem loading protobuf generated_pool");
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("", LocalDescriptorPool::Function);
+REGISTER_DESCRIPTOR_POOL("local://", LocalDescriptorPool::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/reporter.cc b/tensorflow/core/util/reporter.cc
index ee38f81f3e1b370b3f10d3998dee5b3a6d916e7d..a595c9509e66c7bb3ac7fadbd0e87cfd81d1d611 100644
--- a/tensorflow/core/util/reporter.cc
+++ b/tensorflow/core/util/reporter.cc
@@ -47,6 +47,18 @@ Status TestReporter::Benchmark(int64 iters, double cpu_time, double wall_time,
   return Status::OK();
 }
 
+Status TestReporter::SetProperty(const string& name, const string& value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_string_value(value);
+  return Status::OK();
+}
+
+Status TestReporter::SetProperty(const string& name, double value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_double_value(value);
+  return Status::OK();
+}
+
 Status TestReporter::Initialize() {
   if (fname_.empty()) {
     return Status::OK();
diff --git a/tensorflow/core/util/reporter.h b/tensorflow/core/util/reporter.h
index bcae12204ec369af74d748e5e73d06616309f289..e551e2e4f57decff586fc0bd4a8514ca7af8e0ec 100644
--- a/tensorflow/core/util/reporter.h
+++ b/tensorflow/core/util/reporter.h
@@ -34,11 +34,13 @@ namespace tensorflow {
 //
 // If this environment variable is not defined, no logging is performed.
 //
-// The intended use is via the following 4 lines:
+// The intended use is via the following lines:
 //
 //  TestReporter reporter(test_name);
 //  TF_CHECK_OK(reporter.Initialize()));
 //  TF_CHECK_OK(reporter.Benchmark(iters, cpu_time, wall_time, throughput));
+//  TF_CHECK_OK(reporter.SetProperty("some_string_property", "some_value");
+//  TF_CHECK_OK(reporter.SetProperty("some_double_property", double_value);
 //  TF_CHECK_OK(reporter.Close());
 //
 // For example, if the environment variable
@@ -75,6 +77,12 @@ class TestReporter {
   Status Benchmark(int64 iters, double cpu_time, double wall_time,
                    double throughput);
 
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, double value);
+
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, const string& value);
+
   // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
   ~TestReporter() { Close().IgnoreError(); }  // Autoclose in destructor.
 
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 575c27d4ef72ec33c4b9352de59fc806b12d6385..0972b86ea5fefa4b490ee61eeb1937b136783801 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,7 +29,7 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
@@ -115,5 +115,28 @@ TEST(TestReporter, Benchmark) {
   EXPECT_EQ(benchmark_entry.throughput(), 3.0);
 }
 
+TEST(TestReporter, SetProperties) {
+  string fname =
+      strings::StrCat(testing::TmpDir(), "/test_reporter_benchmarks_");
+  TestReporter test_reporter(fname, "b2/3/4");
+  TF_EXPECT_OK(test_reporter.Initialize());
+  TF_EXPECT_OK(test_reporter.SetProperty("string_prop", "abc"));
+  TF_EXPECT_OK(test_reporter.SetProperty("double_prop", 4.0));
+
+  TF_EXPECT_OK(test_reporter.Close());
+  string expected_fname = strings::StrCat(fname, "b2__3__4");
+  string read;
+  TF_EXPECT_OK(ReadFileToString(Env::Default(), expected_fname, &read));
+
+  BenchmarkEntries benchmark_entries;
+  ASSERT_TRUE(benchmark_entries.ParseFromString(read));
+  ASSERT_EQ(1, benchmark_entries.entry_size());
+  const BenchmarkEntry& benchmark_entry = benchmark_entries.entry(0);
+  const auto& extras = benchmark_entry.extras();
+  ASSERT_EQ(2, extras.size());
+  EXPECT_EQ("abc", extras.at("string_prop").string_value());
+  EXPECT_EQ(4.0, extras.at("double_prop").double_value());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/BUILD b/tensorflow/core/util/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f0f161ecc017966cef1a59e6870b016bdfb8d3ec
--- /dev/null
+++ b/tensorflow/core/util/rpc/BUILD
@@ -0,0 +1,48 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "call_container",
+    hdrs = ["call_container.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory",
+    srcs = ["rpc_factory.cc"],
+    hdrs = ["rpc_factory.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory_registry",
+    srcs = ["rpc_factory_registry.cc"],
+    hdrs = ["rpc_factory_registry.h"],
+    deps = [
+        ":rpc_factory",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "rpc_factory_registry_test",
+    srcs = ["rpc_factory_registry_test.cc"],
+    deps = [
+        ":rpc_factory_registry",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1226a7f162cafceb73dad441fbc97fbc6f501fe
--- /dev/null
+++ b/tensorflow/core/util/rpc/call_container.h
@@ -0,0 +1,175 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+#define TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+
+#include <list>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+namespace tensorflow {
+
+namespace internal {
+// The following class is used for coordination between a `CallContainer`
+// instance and a cancellation callback to make sure that the `CallContainer`
+// instance waits for the cancellation callback to be destroyed (either because
+// a cancellation occurred or because the callback was deregistered) before
+// deleting itself. Without this coordination the cancellation callback could
+// attempt to access a `CallContainer` instance that is no longer valid.
+class NotifyWhenDestroyed {
+ public:
+  explicit NotifyWhenDestroyed(std::shared_ptr<Notification> notification)
+      : notification_(std::move(notification)) {}
+
+  ~NotifyWhenDestroyed() { notification_->Notify(); }
+
+ private:
+  std::shared_ptr<Notification> notification_;
+};
+}  // namespace internal
+
+// The following class is responsible for the life cycle management of a set of
+// RPC calls. The calls are started when an instance of the class is created and
+// the class contract guarantees to invoke a "done" callback provided by the
+// caller when all RPC calls have either completed or been cancelled.
+//
+// The caller should not make any assumptions about the validity of an instance
+// of this class after the provided callback has been invoked, which may be
+// immediately after the instance was created.
+template <class Call>
+class CallContainer {
+ public:
+  typedef std::function<void(CallContainer<Call>*, int)> CreateCallFn;
+  typedef std::function<void(Call*)> StartCallFn;
+
+  // Uses the provided `create_call_fn` and `start_call_fn` functions to create
+  // and start a set of RPC calls. When all RPC calls have either completed or
+  // been cancelled, the `done` callback is invoked. The caller should not make
+  // any assumptions about the validity of the created instance as the instance
+  // will delete itself after invoking the `done` callback.
+  explicit CallContainer(OpKernelContext* ctx, int num_calls, bool fail_fast,
+                         bool try_rpc, AsyncOpKernel::DoneCallback done,
+                         CreateCallFn create_call_fn,
+                         StartCallFn start_call_fn);
+
+  // Registers a call with this container. This method expects its arguments to
+  // match those of a `Call` constructor as it forwards them to an underlying
+  // collection, which creates a `Call` instance in place.
+  template <class... Args>
+  void RegisterCall(Args&&... args);
+
+  // Starts the cancellation of all RPC calls managed by this container.
+  void StartCancel();
+
+  // Indicates that the `index`-th RPC call has finished.
+  void Done(const Status& s, int index);
+
+ private:
+  OpKernelContext* ctx_;
+  std::list<Call> calls_;
+  const AsyncOpKernel::DoneCallback done_;
+  const CancellationToken token_;
+  const bool fail_fast_;
+  const bool try_rpc_;
+  std::shared_ptr<Notification> callback_destroyed_;
+
+  // Performs its own reference counting.
+  ReffedStatusCallback* reffed_status_callback_;
+};
+
+template <class Call>
+CallContainer<Call>::CallContainer(
+    OpKernelContext* ctx, int num_calls, bool fail_fast, bool try_rpc,
+    AsyncOpKernel::DoneCallback done,
+    typename CallContainer<Call>::CreateCallFn create_call_fn,
+    typename CallContainer<Call>::StartCallFn start_call_fn)
+    : ctx_(ctx),
+      done_(std::move(done)),
+      token_(ctx->cancellation_manager()->get_cancellation_token()),
+      fail_fast_(fail_fast),
+      try_rpc_(try_rpc),
+      callback_destroyed_(new Notification) {
+  CHECK_GT(num_calls, 0);
+
+  // This will run when all RPCs are finished.
+  reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
+    ctx_->cancellation_manager()->DeregisterCallback(token_);
+    ctx_->SetStatus(s);
+    done_();
+    callback_destroyed_->WaitForNotification();
+    delete this;
+  });
+
+  // The cancellation callback needs to be registered before the RPC calls are
+  // started to make sure that the callback is properly cleaned up by the
+  // `reffed_status_callback` when all calls complete. At the same time, the
+  // cancellation callback should wait for the RPC calls to be started for the
+  // cancellation to take effect.
+  std::shared_ptr<internal::NotifyWhenDestroyed> notify_when_destroyed(
+      new internal::NotifyWhenDestroyed(callback_destroyed_));
+  std::shared_ptr<Notification> calls_started(new Notification);
+  bool is_cancelled = !ctx_->cancellation_manager()->RegisterCallback(
+      token_, [this, calls_started, notify_when_destroyed]() {
+        calls_started->WaitForNotification();
+        StartCancel();
+      });
+
+  for (int i = 0; i < num_calls; ++i) {
+    create_call_fn(this, i);
+    // Increase the reference on the callback for each new RPC.
+    reffed_status_callback_->Ref();
+  }
+  for (Call& call : calls_) {
+    start_call_fn(&call);
+  }
+  calls_started->Notify();
+
+  if (is_cancelled) {
+    ctx_->SetStatus(errors::Cancelled("Operation has been cancelled."));
+    StartCancel();
+  }
+
+  // Subtract reference count from the initial creation.
+  reffed_status_callback_->Unref();
+}
+
+template <class Call>
+template <class... Args>
+void CallContainer<Call>::RegisterCall(Args&&... args) {
+  calls_.emplace_back(std::forward<Args>(args)...);
+}
+
+template <class Call>
+void CallContainer<Call>::StartCancel() {
+  for (auto& call : calls_) {
+    call.StartCancel();
+  }
+}
+
+template <class Call>
+void CallContainer<Call>::Done(const Status& s, int index) {
+  if (!try_rpc_) {
+    reffed_status_callback_->UpdateStatus(s);
+  }
+  reffed_status_callback_->Unref();
+}
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory.cc b/tensorflow/core/util/rpc/rpc_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8530f02b6e2e021ed1c01db9a5bf25f5789a1142
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/numbers.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+template <>
+bool GetEnvVar(const char* key, const string& default_value, string* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+  } else {
+    *value = env_value;
+  }
+  return true;
+}
+
+template <>
+bool GetEnvVar(const char* key, const int64& default_value, int64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strto64(env_value, value);
+}
+
+template <>
+bool GetEnvVar(const char* key, const uint64& default_value, uint64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strtou64(env_value, value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory.h b/tensorflow/core/util/rpc/rpc_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4eaaf44570a49728bd979c8a1d05d4e6e002676
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.h
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+// Return the environment variable `key`.  If the variable is not set,
+// use the default value.  If it is set but could not be parsed,
+// return `false`.  Otherwise set `value` and return `true`.
+template <typename T>
+bool GetEnvVar(const char* key, const T& default_value, T* value);
+
+class RPCFactory {
+ public:
+  RPCFactory() {}
+  virtual ~RPCFactory() {}
+
+  // Asynchronously invokes methods `method_t` at addresses `address_t` with
+  // request strings from `request_t`.  Any of these may be scalar
+  // Tensors, in which case the operands are broadcasted.
+  // Upon completion of all requests, `response_t` will be populated and the
+  // `done` callback will be invoked.
+  //
+  // If `try_rpc` is `true`, then `status_message_t` and
+  // `status_code_t` will be populated as well.
+  //
+  // If `try_rpc` is `false`, then `status_message_t` and
+  // `status_code_t` are ignored (and may be nullptr).  Instead, the
+  // status of any failed call will be propagated to the op.
+  //
+  // REQUIRES:
+  //   - `response_t` is not null, and is a string Tensor with the same shape as
+  //     `request_t`.
+  //
+  //   If `try_rpc` is `true`:
+  //      - `status_code_t` and `status_message_t` are not null.
+  //      - `status_code_t` is an int32 Tensor with the same shape as
+  //        `request_t`.
+  //      - `status_message_t` is a string Tensor with the same shape as
+  //        `request_t`.
+  virtual void Call(OpKernelContext* ctx, int64 num_elements,
+                    const Tensor& address_t, const Tensor& method_t,
+                    const Tensor& request_t, const bool try_rpc,
+                    Tensor* response_t, Tensor* status_code_t,
+                    Tensor* status_message_t,
+                    AsyncOpKernel::DoneCallback done) = 0;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(RPCFactory);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.cc b/tensorflow/core/util/rpc/rpc_factory_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a148b5c04d0dbe551dd11d001f6434b23e99714f
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+RPCFactoryRegistry* RPCFactoryRegistry::Global() {
+  static RPCFactoryRegistry* registry = new RPCFactoryRegistry;
+  return registry;
+}
+
+RPCFactoryRegistry::RPCFactoryFn* RPCFactoryRegistry::Get(
+    const string& protocol) {
+  auto found = fns_.find(protocol);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void RPCFactoryRegistry::Register(const string& protocol,
+                                  const RPCFactoryFn& factory_fn) {
+  auto existing = Get(protocol);
+  CHECK_EQ(existing, nullptr)
+      << "RPC factory for protocol: " << protocol << " already registered";
+  fns_.insert(std::pair<const string&, RPCFactoryFn>(protocol, factory_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.h b/tensorflow/core/util/rpc/rpc_factory_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2635a4012e8f243c8d4334ad3477e184e8cd53a2
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+
+#include <map>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class RPCFactoryRegistry {
+ public:
+  typedef std::function<RPCFactory*(OpKernelConstruction* ctx, bool fail_fast,
+                                    int64 timeout_in_ms)>
+      RPCFactoryFn;
+
+  // Returns a pointer to a global RPCFactoryRegistry object.
+  static RPCFactoryRegistry* Global();
+
+  // Returns a pointer to an function that creates an RPC factory for the given
+  // protocol.
+  RPCFactoryFn* Get(const string& protocol);
+
+  // Registers a function that creates and RPC factory for the given protocol.
+  // The function should transfer the ownership of the factory to its caller.
+  void Register(const string& protocol, const RPCFactoryFn& factory_fn);
+
+ private:
+  std::map<string, RPCFactoryFn> fns_;
+};
+
+namespace rpc_factory_registration {
+
+class RPCFactoryRegistration {
+ public:
+  RPCFactoryRegistration(const string& protocol,
+                         const RPCFactoryRegistry::RPCFactoryFn& factory_fn) {
+    RPCFactoryRegistry::Global()->Register(protocol, factory_fn);
+  }
+};
+
+}  // namespace rpc_factory_registration
+
+#define REGISTER_RPC_FACTORY(protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ_HELPER(__COUNTER__, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ_HELPER(ctr, protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn) \
+  static rpc_factory_registration::RPCFactoryRegistration    \
+      rpc_factory_registration_fn_##ctr(protocol, factory_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry_test.cc b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfd0f95016ed344924c9366bf43ff0ccb47e548c
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return nullptr;
+  }
+};
+
+REGISTER_RPC_FACTORY("TEST FACTORY 1", Value::Function);
+REGISTER_RPC_FACTORY("TEST FACTORY 2", Value::Function);
+}  // namespace
+
+TEST(RPCFactoryRegistryTest, TestBasic) {
+  EXPECT_EQ(RPCFactoryRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto factory1 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 1");
+  EXPECT_NE(factory1, nullptr);
+  auto factory2 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 2");
+  EXPECT_NE(factory2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/session_message.cc b/tensorflow/core/util/session_message.cc
deleted file mode 100644
index 28a6517a1a3c584b896c0b51f9937bc786283b16..0000000000000000000000000000000000000000
--- a/tensorflow/core/util/session_message.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/util/session_message.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/util/event.pb.h"
-
-static const int kMaxLogEvents = 1000;
-
-namespace tensorflow {
-
-SessionLogger::SessionLogger() : status_(new SessionStatus) {}
-
-SessionLogger::~SessionLogger() {}
-
-string SessionLogger::DebugString() { return "SessionLogger"; }
-
-void SessionLogger::Log(StringPiece message) {
-  mutex_lock lock(mu_);
-
-  Event* event = status_->add_event();
-  event->set_wall_time(Env::Default()->NowMicros());
-  event->set_step(0);
-  LogMessage* log = event->mutable_log_message();
-  log->set_message(message.ToString());
-  log->set_level(LogMessage::INFO);
-
-  // Clip log events by 10% if we overflow
-  if (status_->event_size() > kMaxLogEvents) {
-    auto events = status_->mutable_event();
-    events->DeleteSubrange(0, kMaxLogEvents / 10);
-  }
-}
-
-SessionLogger* GetSessionLogger(ResourceMgr* rm) {
-  SessionLogger* logger;
-
-  std::function<Status(SessionLogger**)> status_creator =
-      [](SessionLogger** result) {
-        *result = new SessionLogger();
-        return Status::OK();
-      };
-
-  if (!rm->LookupOrCreate<SessionLogger>("session", "status", &logger,
-                                         status_creator)
-           .ok()) {
-    return nullptr;
-  }
-
-  return logger;
-}
-
-void LogSessionMessage(ResourceMgr* rm, StringPiece message) {
-  return GetSessionLogger(rm)->Log(message);
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/util/session_message.h b/tensorflow/core/util/session_message.h
deleted file mode 100644
index c0f3d78b46a50386403c453fcc92d56a456206de..0000000000000000000000000000000000000000
--- a/tensorflow/core/util/session_message.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H_
-#define TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H_
-
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-
-class ResourceMgr;
-class SessionStatus;
-
-class SessionLogger : public ResourceBase {
- public:
-  SessionLogger();
-  ~SessionLogger();
-
-  void Log(StringPiece message);
-  string DebugString() override;
-
-  const SessionStatus& status() { return *status_; }
-
- private:
-  std::unique_ptr<SessionStatus> status_;
-  mutex mu_;
-};
-
-// Return a SessionLogger instance for the current session.  If the logger
-// will be used across multiple computations, you must explicitly acquire
-// and release references using Ref()/Unref().
-//
-// Returns nullptr if a logger cannot be created.
-SessionLogger* GetSessionLogger(ResourceMgr* rm);
-
-// Attach `message` to the logger for the current session.
-void LogSessionMessage(ResourceMgr* rm, StringPiece message);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
index f7b63e86869c2713e08439ed1b0c0d343ad07451..79fa63723ec843e72364f09ad82617788ae9a901 100644
--- a/tensorflow/core/util/stat_summarizer.h
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -186,7 +186,7 @@ class StatSummarizer {
   void Reset();
 
   // Returns number of runs.
-  int num_runs() const { return run_total_us_.count(); }
+  int num_runs() const { return static_cast<int>(run_total_us_.count()); }
 
   // Returns stats of total microseconds spent by all nodes in each run.
   const Stat<int64>& run_total_us() const { return run_total_us_; }
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index f7767ace716782e53a2023bea7acc7b2f3c6604c..4787bcf6ded5d2ffd7395cb26cfe612e5cbb67a7 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -30,21 +30,9 @@ class StreamExecutorUtil {
   // Map a Tensor as a DeviceMemory object wrapping the given typed
   // buffer.
   template <typename T>
-  static perftools::gputools::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
+  static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<char*>(t.tensor_data().data()));
-    return perftools::gputools::DeviceMemory<T>(
-        perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
-  }
-
-  // Converts from a StreamExecutor Status to a TensorFlow Status.
-  //
-  // This assumes that the error codes between the two implementations
-  // match.
-  static Status ConvertStatus(const perftools::gputools::port::Status& s) {
-    return s.ok() ? Status::OK()
-                  : Status(static_cast<tensorflow::error::Code>(
-                               static_cast<int>(s.code())),
-                           s.error_message());
+    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
   }
 };
 
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index 166bd0f659dae3124faac6d71d69cbcd41c15b48..648358606c130b7ed64a56739be0b37884f585d5 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -75,18 +75,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index 08f1aa7125bc47421e0db24a9db6f6e2b2f1e365..7f166f0ec0aeee78738648060ef7782827918cd8 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -293,7 +294,7 @@ void VersionTest(const VersionDef& version, StringPiece expected_error) {
   BundleReader reader(Env::Default(), path);
   EXPECT_TRUE(errors::IsInvalidArgument(reader.status()));
   EXPECT_TRUE(
-      StringPiece(reader.status().error_message()).starts_with(expected_error));
+      str_util::StartsWith(reader.status().error_message(), expected_error));
 }
 
 }  // namespace
@@ -588,7 +589,7 @@ TEST(TensorBundleTest, Error) {
     TF_EXPECT_OK(writer.Add("foo", Constant_2x3(1.f)));
     EXPECT_FALSE(writer.Add("foo", Constant_2x3(2.f)).ok());
     EXPECT_TRUE(
-        StringPiece(writer.status().ToString()).contains("duplicate key"));
+        str_util::StrContains(writer.status().ToString(), "duplicate key"));
     EXPECT_FALSE(writer.Finish().ok());
   }
   {  // Double finish
@@ -598,7 +599,7 @@ TEST(TensorBundleTest, Error) {
   }
   {  // Not found.
     BundleReader reader(Env::Default(), Prefix("nonexist"));
-    EXPECT_TRUE(StringPiece(reader.status().ToString()).contains("Not found"));
+    EXPECT_TRUE(str_util::StrContains(reader.status().ToString(), "Not found"));
   }
 }
 
@@ -629,7 +630,7 @@ TEST(TensorBundleTest, Checksum) {
     BundleReader reader(Env::Default(), Prefix(prefix));
     Status status = reader.Lookup(key, &val);
     EXPECT_TRUE(errors::IsDataLoss(status));
-    EXPECT_TRUE(StringPiece(status.ToString()).contains(expected_msg));
+    EXPECT_TRUE(str_util::StrContains(status.ToString(), expected_msg));
   };
 
   // Corrupts a float tensor.
@@ -680,8 +681,8 @@ TEST(TensorBundleTest, Endianness) {
 
   BundleReader reader(Env::Default(), Prefix("end"));
   EXPECT_TRUE(errors::IsUnimplemented(reader.status()));
-  EXPECT_TRUE(StringPiece(reader.status().ToString())
-                  .contains("different endianness from the reader"));
+  EXPECT_TRUE(str_util::StrContains(reader.status().ToString(),
+                                    "different endianness from the reader"));
 }
 
 TEST(TensorBundleTest, TruncatedTensorContents) {
diff --git a/tensorflow/core/util/tensor_slice_reader_test.cc b/tensorflow/core/util/tensor_slice_reader_test.cc
index 010cc36823b739a6209b9f56fd883cf6d6abc6d7..3c9590e488d1895fcc5c630f846c0fb63aea12f5 100644
--- a/tensorflow/core/util/tensor_slice_reader_test.cc
+++ b/tensorflow/core/util/tensor_slice_reader_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -422,7 +423,7 @@ static void VersionTest(const VersionDef& versions, const string& error) {
   // Read it back in and verify that we get the expected error
   TensorSliceReader reader(path, OpenTableTensorSliceReader);
   EXPECT_TRUE(reader.status().code() == error::INVALID_ARGUMENT &&
-              StringPiece(reader.status().error_message()).starts_with(error))
+              str_util::StartsWith(reader.status().error_message(), error))
       << "Expected error starting with '" << errors::InvalidArgument(error)
       << "', got '" << reader.status() << "'";
 }
diff --git a/tensorflow/core/util/tensor_slice_writer_test.cc b/tensorflow/core/util/tensor_slice_writer_test.cc
index ff5bfd65aef360cd89908a94bee7d8bb052f1905..31397f11b66ce7b2a64fa7f5e508a801a1d47969 100644
--- a/tensorflow/core/util/tensor_slice_writer_test.cc
+++ b/tensorflow/core/util/tensor_slice_writer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -333,8 +334,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<int8> data(300000000, -1);
     Status s = writer.Add("test1", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 
   // Add a large string tensor slice, which will fail.
@@ -344,8 +345,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<string> data(256 * 1024, std::string(8192, 'f'));
     Status s = writer.Add("test2", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 }
 
diff --git a/tensorflow/core/util/use_cudnn.cc b/tensorflow/core/util/use_cudnn.cc
index d7d03f151e2228f4dd09f0385d4a3d33a1452290..c119df6419e39d66ced20653f77dc1f431d67b18 100644
--- a/tensorflow/core/util/use_cudnn.cc
+++ b/tensorflow/core/util/use_cudnn.cc
@@ -22,9 +22,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-#define ADD_CUDNN_FLAG(func_name, flag_name, default_value)                \
+#define ADD_BOOL_CUDNN_FLAG(func_name, flag_name, default_value)           \
   bool func_name() {                                                       \
-    bool value;                                                            \
+    bool value = default_value;                                            \
     Status status = ReadBoolFromEnvVar(#flag_name, default_value, &value); \
     if (!status.ok()) {                                                    \
       LOG(ERROR) << status;                                                \
@@ -32,12 +32,44 @@ namespace tensorflow {
     return value;                                                          \
   }
 
-ADD_CUDNN_FLAG(CanUseCudnn, TF_USE_CUDNN, true);
-ADD_CUDNN_FLAG(CudnnUseAutotune, TF_CUDNN_USE_AUTOTUNE, true);
-ADD_CUDNN_FLAG(CudnnDisableConv1x1Optimization,
-               TF_CUDNN_DISABLE_CONV_1X1_OPTIMIZATION, false);
+ADD_BOOL_CUDNN_FLAG(CanUseCudnn, TF_USE_CUDNN, true);
+ADD_BOOL_CUDNN_FLAG(CudnnUseAutotune, TF_CUDNN_USE_AUTOTUNE, true);
+// Whether to auto-tuning Cudnn RNN forward and backward pass to pick
+// statistically the best cudnnRNNAlgo_t and cudnnMathType_t.
+// The flag is disabled when TF_DEBUG_CUDNN_RNN is turned on.
+ADD_BOOL_CUDNN_FLAG(CudnnRnnUseAutotune, TF_CUDNN_RNN_USE_AUTOTUNE, true);
+ADD_BOOL_CUDNN_FLAG(CudnnDisableConv1x1Optimization,
+                    TF_CUDNN_DISABLE_CONV_1X1_OPTIMIZATION, false);
 
-#undef ADD_CUDNN_FLAG
+// Whether to run Cudnn RNN forward and backward in debug mode, where users can
+// force a specified cudnnRNNAlgo_t and cudnnMathType_t, when used together with
+// the following two env vars:
+// TF_DEBUG_CUDNN_RNN_USE_TENSOR_OPS
+// TF_DEBUG_CUDNN_RNN_ALGO
+// By default it is disabled and only intended for testing and profiling.
+ADD_BOOL_CUDNN_FLAG(DebugCudnnRnn, TF_DEBUG_CUDNN_RNN, false);
+// If using TENSOR_OP_MATH in Cudnn RNN for both forward and backward pass. Only
+// effective when TF_DEBUG_CUDNN_RNN is true.
+// Note none of the persistent RNN algorithm support TENSOR_OP_MATH before
+// Cudnn 7.1. See Nvidia Cudnn manual for more details.
+ADD_BOOL_CUDNN_FLAG(DebugCudnnRnnUseTensorOps,
+                    TF_DEBUG_CUDNN_RNN_USE_TENSOR_OPS, false);
+#undef ADD_BOOL_CUDNN_FLAG
+
+#define ADD_INT64_CUDNN_FLAG(func_name, flag_name, default_value)           \
+  int64 func_name() {                                                       \
+    int64 value = default_value;                                            \
+    Status status = ReadInt64FromEnvVar(#flag_name, default_value, &value); \
+    if (!status.ok()) {                                                     \
+      LOG(ERROR) << status;                                                 \
+    }                                                                       \
+    return value;                                                           \
+  }
+// Cudnn RNN algorithm to use for both forward and backward pass. Only effective
+// when TF_DEBUG_CUDNN_RNN is true. See Nvidia Cudnn manual for allowed
+// cudnnRNNAlgo_t.
+ADD_INT64_CUDNN_FLAG(DebugCudnnRnnAlgo, TF_DEBUG_CUDNN_RNN_ALGO, -1);
+#undef ADD_INT64_CUDNN_FLAG
 
 FP16ConvMode CudnnConvComputeMode() {
   string value;
diff --git a/tensorflow/core/util/use_cudnn.h b/tensorflow/core/util/use_cudnn.h
index a39a032e3f4de0985f446a8c0ae6e00fc56c7249..f8cc5944d71f7d169b1fd7e6a38d8c0e6fa0a1fd 100644
--- a/tensorflow/core/util/use_cudnn.h
+++ b/tensorflow/core/util/use_cudnn.h
@@ -15,8 +15,10 @@ limitations under the License.
 
 // The utility to check Cudnn dependency and set Cudnn-related flags.
 
-#ifndef TENSORFLOW_UTIL_USE_CUDNN_H_
-#define TENSORFLOW_UTIL_USE_CUDNN_H_
+#ifndef TENSORFLOW_CORE_UTIL_USE_CUDNN_H_
+#define TENSORFLOW_CORE_UTIL_USE_CUDNN_H_
+
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
@@ -31,9 +33,12 @@ enum class FP16ConvMode {
 
 bool CanUseCudnn();
 bool CudnnUseAutotune();
+bool CudnnRnnUseAutotune();
 bool CudnnDisableConv1x1Optimization();
 FP16ConvMode CudnnConvComputeMode();
-
+bool DebugCudnnRnn();
+bool DebugCudnnRnnUseTensorOps();
+int64 DebugCudnnRnnAlgo();
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_USE_CUDNN_H_
+#endif  // TENSORFLOW_CORE_UTIL_USE_CUDNN_H_
diff --git a/tensorflow/docs_src/about/index.md b/tensorflow/docs_src/about/index.md
index 5326b1e11012618261b85d13770c793bc05736bf..dc1e9af8763e0b55bbee936ec491fba75c6507fd 100644
--- a/tensorflow/docs_src/about/index.md
+++ b/tensorflow/docs_src/about/index.md
@@ -3,7 +3,6 @@
 This section provides a few documents about TensorFlow itself,
 including the following:
 
-  * @{$roadmap$Roadmap}, which summarizes upcoming additions to TensorFlow.
   * @{$uses$TensorFlow in Use}, which provides a link to our model zoo and
     lists some popular ways that TensorFlow is being used.
   * @{$bib$TensorFlow White Papers}, which provides abstracts of white papers
diff --git a/tensorflow/docs_src/about/leftnav_files b/tensorflow/docs_src/about/leftnav_files
index 28f039e9b5f8c948dae16f6bcb74d03b3a7804e7..63763b9d9c9d5d1c604035678e855f29925b408e 100644
--- a/tensorflow/docs_src/about/leftnav_files
+++ b/tensorflow/docs_src/about/leftnav_files
@@ -1,5 +1,4 @@
 index.md
-roadmap.md
 uses.md
 bib.md
 attribution.md
diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md
index d646880bd350c42e463680a5c7eb0903f2c0a497..d3db98203e8746b8d824d3ac853dcfbc35ab9d25 100644
--- a/tensorflow/docs_src/about/uses.md
+++ b/tensorflow/docs_src/about/uses.md
@@ -18,9 +18,9 @@ This section describes some of the current uses of the TensorFlow system.
 
 > If you are using TensorFlow for research, for education, or for production
 > usage in some product, we would love to add something about your usage here.
-> Please feel free to email us a brief description of how you're using
-> TensorFlow, or even better, send us a pull request to add an entry to this
-> file.
+> Please feel free to [email us](mailto:usecases@tensorflow.org) a brief
+> description of how you're using TensorFlow, or even better, send us a
+> pull request to add an entry to this file.
 
 * **Deep Speech**
 <ul>
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
index 956dccb64f971f8f2f5b97422583ea5913da1ff5..74fe4a323aafbed986a094daa8a6d5f010521ba4 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
@@ -6,43 +6,39 @@ Monte Carlo integration and helpers.
 ## Background
 
 Monte Carlo integration refers to the practice of estimating an expectation with
-a sample mean.  For example, given random variable `Z in R^k` with density `p`,
+a sample mean.  For example, given random variable Z in \\(R^k\\) with density `p`,
 the expectation of function `f` can be approximated like:
 
-```
-E_p[f(Z)] = \int f(z) p(z) dz
-          ~ S_n
-          := n^{-1} \sum_{i=1}^n f(z_i),  z_i iid samples from p.
-```
+$$E_p[f(Z)] = \int f(z) p(z) dz$$
+$$          ~ S_n
+          := n^{-1} \sum_{i=1}^n f(z_i),  z_i\ iid\ samples\ from\ p.$$
 
-If `E_p[|f(Z)|] < infinity`, then `S_n --> E_p[f(Z)]` by the strong law of large
-numbers.  If `E_p[f(Z)^2] < infinity`, then `S_n` is asymptotically normal with
-variance `Var[f(Z)] / n`.
+If \\(E_p[|f(Z)|] < infinity\\), then \\(S_n\\) --> \\(E_p[f(Z)]\\) by the strong law of large
+numbers.  If \\(E_p[f(Z)^2] < infinity\\), then \\(S_n\\) is asymptotically normal with
+variance \\(Var[f(Z)] / n\\).
 
 Practitioners of Bayesian statistics often find themselves wanting to estimate
-`E_p[f(Z)]` when the distribution `p` is known only up to a constant.  For
+\\(E_p[f(Z)]\\) when the distribution `p` is known only up to a constant.  For
 example, the joint distribution `p(z, x)` may be known, but the evidence
-`p(x) = \int p(z, x) dz` may be intractable.  In that case, a parameterized
-distribution family `q_lambda(z)` may be chosen, and the optimal `lambda` is the
-one minimizing the KL divergence between `q_lambda(z)` and
-`p(z | x)`.  We only know `p(z, x)`, but that is sufficient to find `lambda`.
+\\(p(x) = \int p(z, x) dz\\) may be intractable.  In that case, a parameterized
+distribution family \\(q_\lambda(z)\\) may be chosen, and the optimal \\(\lambda\\) is the
+one minimizing the KL divergence between \\(q_\lambda(z)\\) and
+\\(p(z | x)\\).  We only know `p(z, x)`, but that is sufficient to find \\(\lambda\\).
 
 
 ## Log-space evaluation and subtracting the maximum
 
 Care must be taken when the random variable lives in a high dimensional space.
-For example, the naive importance sample estimate `E_q[f(Z) p(Z) / q(Z)]`
-involves the ratio of two terms `p(Z) / q(Z)`, each of which must have tails
-dropping off faster than `O(|z|^{-(k + 1)})` in order to have finite integral.
+For example, the naive importance sample estimate \\(E_q[f(Z) p(Z) / q(Z)]\\)
+involves the ratio of two terms \\(p(Z) / q(Z)\\), each of which must have tails
+dropping off faster than \\(O(|z|^{-(k + 1)})\\) in order to have finite integral.
 This ratio would often be zero or infinity up to numerical precision.
 
 For that reason, we write
 
-```
-Log E_q[ f(Z) p(Z) / q(Z) ]
-   = Log E_q[ exp{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C} ] + C,  where
-C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].
-```
+$$Log E_q[ f(Z) p(Z) / q(Z) ]$$
+$$   = Log E_q[ \exp\{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C\} ] + C,$$  where
+$$C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].$$
 
 The maximum value of the exponentiated term will be 0.0, and the expectation
 can be evaluated in a stable manner.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
index 0ce187b329bce38fe096f2640a09cc93c71f9543..e169897f31717d994a0229f1e1b485874d2b0572 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
@@ -28,6 +28,5 @@ To apply a `Bijector`, use `distributions.TransformedDistribution`.
 *   @{tf.contrib.distributions.bijectors.Inline}
 *   @{tf.contrib.distributions.bijectors.Invert}
 *   @{tf.contrib.distributions.bijectors.PowerTransform}
-*   @{tf.contrib.distributions.bijectors.SigmoidCentered}
 *   @{tf.contrib.distributions.bijectors.SoftmaxCentered}
 *   @{tf.contrib.distributions.bijectors.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index de4f126507930331d348cc795bd03b9971778d07..20fe88a799b3e0f6767207eb36d132d4c9e4b220 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -61,21 +61,21 @@ A subgraph can be created in several ways:
 
 * using a list of ops:
 
-```python
-my_sgv = ge.sgv(ops)
-```
+  ```python
+  my_sgv = ge.sgv(ops)
+  ```
 
 * from a name scope:
 
-```python
-my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
+  ```
 
 * using regular expression:
 
-```python
-my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
+  ```
 
 Note that the Graph Editor is meant to manipulate several graphs at the same
 time, typically during transform or copy operation. For that reason,
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index d7f862625e02a50cd716118f882344c1d16ffe1c..8b7442216c05ccb0df6be540edb15165ff4752c1 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -107,19 +107,19 @@ weighted average over the individual prediction errors:
   loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
-@{tf.contrib.losses.absolute_difference}
-@{tf.contrib.losses.add_loss}
-@{tf.contrib.losses.hinge_loss}
-@{tf.contrib.losses.compute_weighted_loss}
-@{tf.contrib.losses.cosine_distance}
-@{tf.contrib.losses.get_losses}
-@{tf.contrib.losses.get_regularization_losses}
-@{tf.contrib.losses.get_total_loss}
-@{tf.contrib.losses.log_loss}
-@{tf.contrib.losses.mean_pairwise_squared_error}
-@{tf.contrib.losses.mean_squared_error}
-@{tf.contrib.losses.sigmoid_cross_entropy}
-@{tf.contrib.losses.softmax_cross_entropy}
-@{tf.contrib.losses.sparse_softmax_cross_entropy}
+* @{tf.contrib.losses.absolute_difference}
+* @{tf.contrib.losses.add_loss}
+* @{tf.contrib.losses.hinge_loss}
+* @{tf.contrib.losses.compute_weighted_loss}
+* @{tf.contrib.losses.cosine_distance}
+* @{tf.contrib.losses.get_losses}
+* @{tf.contrib.losses.get_regularization_losses}
+* @{tf.contrib.losses.get_total_loss}
+* @{tf.contrib.losses.log_loss}
+* @{tf.contrib.losses.mean_pairwise_squared_error}
+* @{tf.contrib.losses.mean_squared_error}
+* @{tf.contrib.losses.sigmoid_cross_entropy}
+* @{tf.contrib.losses.softmax_cross_entropy}
+* @{tf.contrib.losses.sparse_softmax_cross_entropy}
 
 
diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md
index 94cf0de32a2d2ea16d1581e7c42a08b59aa52888..86b4b39409863f09c3669dc6971901f6350377ca 100644
--- a/tensorflow/docs_src/api_guides/python/io_ops.md
+++ b/tensorflow/docs_src/api_guides/python/io_ops.md
@@ -8,7 +8,7 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 ## Placeholders
 
 TensorFlow provides a placeholder operation that must be fed with data
-on execution.  For more info, see the section on @{$reading_data#feeding$Feeding data}.
+on execution.  For more info, see the section on @{$reading_data#Feeding$Feeding data}.
 
 *   @{tf.placeholder}
 *   @{tf.placeholder_with_default}
@@ -42,7 +42,7 @@ formats into tensors.
 
 ### Example protocol buffer
 
-TensorFlow's @{$reading_data#standard-tensorflow-format$recommended format for training examples}
+TensorFlow's @{$reading_data#standard_tensorflow_format$recommended format for training examples}
 is serialized `Example` protocol buffers, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 They contain `Features`, [described
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 8e6fd1cff93332b84f552c18f627ba05dc67103e..8d8daaae19fa3e7863f9fa88393c35a3d95edf87 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -89,7 +89,7 @@ bottom. Note that this is different from existing libraries such as cuDNN and
 Caffe, which explicitly specify the number of padded pixels and always pad the
 same number of pixels on both sides.
 
-For the `'VALID`' scheme, the output height and width are computed as:
+For the `'VALID'` scheme, the output height and width are computed as:
 
     out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
     out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))
@@ -98,10 +98,10 @@ and no padding is used.
 
 Given the output size and the padding, the output can be computed as
 
-    output[b, i, j, :] =
-        sum_{di, dj} input[b, strides[1] * i + di - pad_top,
-                           strides[2] * j + dj - pad_left, ...] *
-                     filter[di, dj, ...]
+$$    output[b, i, j, :] =
+        sum_{d_i, d_j} input[b, strides[1] * i + d_i - pad_{top},\
+                           strides[2] * j + d_j - pad_{left}, ...] *
+                     filter[d_i, d_j,\ ...]$$
 
 where any value outside the original input image region are considered zero (
 i.e. we pad zero values around the border of the image).
@@ -161,12 +161,12 @@ Morphological operators are non-linear filters used in image processing.
 ](https://en.wikipedia.org/wiki/Dilation_(morphology))
 is the max-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         max_{dy, dx} input[b,
                            strides[1] * y + rates[1] * dy,
                            strides[2] * x + rates[2] * dx,
                            c] +
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 The `filter` is usually called structuring function. Max-pooling is a special
 case of greyscale morphological dilation when the filter assumes all-zero
@@ -176,12 +176,12 @@ values (a.k.a. flat structuring function).
 ](https://en.wikipedia.org/wiki/Erosion_(morphology))
 is the min-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         min_{dy, dx} input[b,
                            strides[1] * y - rates[1] * dy,
                            strides[2] * x - rates[2] * dx,
                            c] -
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 Dilation and erosion are dual to each other. The dilation of the input signal
 `f` by the structuring signal `g` is equal to the negation of the erosion of
diff --git a/tensorflow/docs_src/api_guides/python/state_ops.md b/tensorflow/docs_src/api_guides/python/state_ops.md
index 0d612ee0c7e5e3693cf8a46813633dcc22229355..ec2d8773860f0595cabe91d591a5fdc025e99b83 100644
--- a/tensorflow/docs_src/api_guides/python/state_ops.md
+++ b/tensorflow/docs_src/api_guides/python/state_ops.md
@@ -83,6 +83,8 @@ automatically by the optimizers in most cases.
 *   @{tf.scatter_sub}
 *   @{tf.scatter_mul}
 *   @{tf.scatter_div}
+*   @{tf.scatter_min}
+*   @{tf.scatter_max}
 *   @{tf.scatter_nd_update}
 *   @{tf.scatter_nd_add}
 *   @{tf.scatter_nd_sub}
diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
new file mode 100644
index 0000000000000000000000000000000000000000..afbb8bbdd0fd25f1e4fa607ac6b4f74e4cc37c0c
--- /dev/null
+++ b/tensorflow/docs_src/community/contributing.md
@@ -0,0 +1,49 @@
+# Contributing to TensorFlow
+
+TensorFlow is an open-source project, and we welcome your participation
+and contribution. This page describes how to get involved.
+
+## Repositories
+
+The code for TensorFlow is hosted in the [TensorFlow GitHub
+organization](https://github.com/tensorflow). Multiple projects are located
+inside the organization, including:
+
+* [TensorFlow](https://github.com/tensorflow/tensorflow)
+* [Models](https://github.com/tensorflow/models)
+* [TensorBoard](https://github.com/tensorflow/tensorboard)
+* [TensorFlow.js](https://github.com/tensorflow/tfjs)
+* [TensorFlow Serving](https://github.com/tensorflow/serving)
+* [TensorFlow Documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src)
+
+## Contributor checklist
+
+* Before contributing to TensorFlow source code, please review the [contribution
+guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
+
+* Join the
+[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers)
+mailing list, to coordinate and discuss with others contributing to TensorFlow.
+
+* For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
+
+* Finally, review @{$documentation$Writing TensorFlow Documentation}, which
+  explains documentation conventions.
+
+You may also wish to review our guide to @{$benchmarks$defining and running benchmarks}.
+
+## Special Interest Groups
+
+To enable focused collaboration on particular areas of TensorFlow, we host
+Special Interest Groups (SIGs). SIGs do their work in public: if you want to
+join and contribute, review the work of the group, and get in touch with the
+relevant SIG leader.  Membership policies vary on a per-SIG basis.
+
+* **SIG Build** focuses on issues surrounding building, packaging, and
+  distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/build).
+
+* **SIG TensorBoard** furthers the development and direction of TensorBoard and its plugins.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard).
+
+* **SIG Rust** collaborates on the development of TensorFlow's Rust bindings.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/rust).
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 003e0a25ecd7c6afcc42aed08bd5d91f7c85a9bb..8639656d07228540b72d0eca3ab5f67d6b9753a7 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -148,19 +148,7 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
 
-1. If pip isn't installed on your machine, install it now by issuing the
-following command:
-
-        $ sudo easy_install pip
-
-2. Use pip to install codegen, mock, and pandas by issuing the following
-   command (Note: If you are using
-   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
-   dependencies, you may not want to use sudo for these installations):
-
-        $ sudo pip install codegen mock pandas
-
-3. If bazel is not installed on your machine, install it now. If you are on
+1. If bazel is not installed on your machine, install it now. If you are on
    Linux, install bazel by issuing the following command:
 
         $ sudo apt-get install bazel  # Linux
@@ -168,10 +156,10 @@ following command:
     If you are on Mac OS, find bazel installation instructions on
     [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
 
-4. Change directory to the top-level `tensorflow` directory of the TensorFlow
+2. Change directory to the top-level `tensorflow` directory of the TensorFlow
    source code.
 
-5. Run the `configure` script and answer its prompts appropriately for your
+3. Run the `configure` script and answer its prompts appropriately for your
    system.
 
         $ ./configure
@@ -414,24 +402,24 @@ types and default values.
 
 For example:
 
-    ```c++
-    REGISTER_OP("PngDecode")
-      .Input("contents: string")
-      .Attr("channels: int = 0")
-      .Output("image: uint8")
-      .Doc(R"doc(
-    Decodes the contents of a PNG file into a uint8 tensor.
-
-    contents: PNG file contents.
-    channels: Number of color channels, or 0 to autodetect based on the input.
-      Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
-      If the input has a different number of channels, it will be transformed
-      accordingly.
-    image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
-      If `channels` is 0, the last dimension is determined
-      from the png contents.
-    )doc");
-    ```
+```c++
+REGISTER_OP("PngDecode")
+  .Input("contents: string")
+  .Attr("channels: int = 0")
+  .Output("image: uint8")
+  .Doc(R"doc(
+Decodes the contents of a PNG file into a uint8 tensor.
+
+contents: PNG file contents.
+channels: Number of color channels, or 0 to autodetect based on the input.
+  Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
+  If the input has a different number of channels, it will be transformed
+  accordingly.
+image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
+  If `channels` is 0, the last dimension is determined
+  from the png contents.
+)doc");
+```
 
 Results in this piece of Markdown:
 
@@ -441,12 +429,12 @@ Results in this piece of Markdown:
 
     #### Args:
 
-    *  <b>contents</b>: A string Tensor. PNG file contents.
-    *  <b>channels</b>: An optional int. Defaults to 0.
+    *  **contents**: A string Tensor. PNG file contents.
+    *  **channels**: An optional int. Defaults to 0.
        Number of color channels, or 0 to autodetect based on the input.
        Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.  If the
        input has a different number of channels, it will be transformed accordingly.
-    *  <b>name</b>: A name for the operation (optional).
+    *  **name**: A name for the operation (optional).
 
     #### Returns:
     A 3-D uint8 tensor of shape `[height, width, channels]`.  If `channels` is
@@ -454,7 +442,7 @@ Results in this piece of Markdown:
 
 Much of the argument description is added automatically. In particular, the doc
 generator automatically adds the name and type of all inputs, attrs, and
-outputs. In the above example, `<b>contents</b>: A string Tensor.` was added
+outputs. In the above example, `contents: A string Tensor.` was added
 automatically. You should write your additional text to flow naturally after
 that description.
 
@@ -477,31 +465,29 @@ should use Markdown in the docstring.
 
 Here's a simple example:
 
-```python
-def foo(x, y, name="bar"):
-  """Computes foo.
+    def foo(x, y, name="bar"):
+      """Computes foo.
 
-  Given two 1-D tensors `x` and `y`, this operation computes the foo.
+      Given two 1-D tensors `x` and `y`, this operation computes the foo.
 
-  Example:
+      Example:
 
-  ```
-  # x is [1, 1]
-  # y is [2, 2]
-  tf.foo(x, y) ==> [3, 3]
-  ```
-  Args:
-    x: A `Tensor` of type `int32`.
-    y: A `Tensor` of type `int32`.
-    name: A name for the operation (optional).
+      ```
+      # x is [1, 1]
+      # y is [2, 2]
+      tf.foo(x, y) ==> [3, 3]
+      ```
+      Args:
+        x: A `Tensor` of type `int32`.
+        y: A `Tensor` of type `int32`.
+        name: A name for the operation (optional).
 
-  Returns:
-    A `Tensor` of type `int32` that is the foo of `x` and `y`.
+      Returns:
+        A `Tensor` of type `int32` that is the foo of `x` and `y`.
 
-  Raises:
-    ValueError: If `x` or `y` are not of type `int32`.
-  """
-```
+      Raises:
+        ValueError: If `x` or `y` are not of type `int32`.
+      """
 
 ## Description of the docstring sections
 
@@ -678,10 +664,10 @@ This generates the following Args section in
 
     #### Args:
 
-    * <b>`contents`</b>: A `Tensor` of type `string`. 0-D.  The PNG-encoded
+    * **`contents`**: A `Tensor` of type `string`. 0-D.  The PNG-encoded
       image.
-    * <b>`channels`</b>: An optional `int`. Defaults to `0`. Number of color
+    * **`channels`**: An optional `int`. Defaults to `0`. Number of color
       channels for the decoded image.
-    * <b>`dtype`</b>: An optional `tf.DType` from: `tf.uint8,
+    * **`dtype`**: An optional `tf.DType` from: `tf.uint8,
       tf.uint16`. Defaults to `tf.uint 8`.
-    * <b>`name`</b>: A name for the operation (optional).
+    * **`name`**: A name for the operation (optional).
diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md
new file mode 100644
index 0000000000000000000000000000000000000000..d92f5775fafa394d795dd451077a721a2ecbb259
--- /dev/null
+++ b/tensorflow/docs_src/community/groups.md
@@ -0,0 +1,17 @@
+# User Groups
+
+TensorFlow has communities around the world.
+
+## Asia
+
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
+* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
+
+
+## Europe
+
+* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
+* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
+
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index 8e67022648d4c7161b02072446371e6d7e7168e2..eec2e51a8706b73abcedb8329df3ad03e3b349c3 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -1,14 +1,85 @@
 # Community
 
-This section contains the following documents:
-
-  * @{$welcome$Welcome to the TensorFlow Community}, which explains how
-    you can get involved, where to report issues, and where to join
-    like-minded TensorFlow enthusiasts online.
-  * @{$documentation$Writing TensorFlow Documentation}, which explains
-    TensorFlow's documentation conventions.  If you are modifying
-    TensorFlow source code or documentation, please read this guide.
-  * @{$style_guide$TensorFlow Style Guide}, which identifies coding style
-    conventions that TensorFlow developers and users should follow.
-  * @{$community/benchmarks$Benchmarks}, Benchmarks, a guide for defining and
-    running a TensorFlow benchmark.
+Welcome to the TensorFlow community! This page explains where to get help, and
+different ways to be part of the community. We are committed to fostering an
+open and welcoming environment, and request that you review our [code of
+conduct](https://github.com/tensorflow/tensorflow/blob/master/CODE_OF_CONDUCT.md).
+
+## Get Help
+
+### Technical Questions
+
+To ask or answer technical questions about TensorFlow, use [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow). For example,
+ask or search about a particular error message you encountered during
+installation.
+
+### Bugs and Feature Requests
+
+To report bugs or make feature requests, file an issue on GitHub. Please choose
+the appropriate repository for the project. Major repositories include:
+
+  * [TensorFlow](https://github.com/tensorflow/tensorflow/issues)
+  * [TensorBoard](https://github.com/tensorflow/tensorboard/issues)
+  * [TensorFlow models](https://github.com/tensorflow/models/issues)
+  
+### Security
+
+Before using TensorFlow, please take a look at our security model, list of
+recent security announcements, and ways you can report security issues to the
+TensorFlow team at the
+[Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
+
+## Stay Informed
+
+### Announcements Mailing List
+
+All major releases and important announcements are sent to
+[announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
+We recommend that you join this list if you depend on TensorFlow in any way.
+
+### Development Roadmap
+
+The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
+
+### Social Media
+
+For news and updates from around the universe of TensorFlow projects, follow
+[@tensorflow](https://twitter.com/tensorflow) on Twitter.
+
+### Blog
+
+We post regularly to the [TensorFlow Blog](http://blog.tensorflow.org/),
+with content from the TensorFlow team and the best articles from the community.
+
+### YouTube
+
+Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learing
+and AI with TensorFlow. On it we have a number of new shows, including:
+
+- TensorFlow Meets: meet with community contributors to learn and share what they're doing
+- Ask TensorFlow: the team answers the best questions tagged #AskTensorFlow from social media 
+- Coding TensorFlow: short bites with tips for success with TensorFlow
+
+## Community Support
+
+### Mailing Lists
+
+For general discussion about TensorFlow development and direction, please join
+the [TensorFlow discuss mailing
+list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
+
+A number of other mailing lists exist, focused on different project areas, which
+can be found at @{$lists$TensorFlow Mailing Lists}.
+
+### User Groups
+
+To meet with like-minded people local to you, check out the many
+@{$groups$TensorFlow user groups} around the world.
+
+
+## Contributing To TensorFlow
+
+We welcome contributions and collaboration on TensorFlow. For more information,
+please read [Contributing to TensorFlow](contributing.md).
+
diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files
index c1595d3c955bb87120fe6a6c9185c58e9db1097e..2bae60d9ddc5c18f67b1611054ac58b072e9674a 100644
--- a/tensorflow/docs_src/community/leftnav_files
+++ b/tensorflow/docs_src/community/leftnav_files
@@ -1,5 +1,9 @@
 index.md
-welcome.md
+roadmap.md
+contributing.md
+lists.md
+groups.md
 documentation.md
 style_guide.md
 benchmarks.md
+swift.md
diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md
new file mode 100644
index 0000000000000000000000000000000000000000..7450ab36c436538dd584541fb0dafb5a2c6067b3
--- /dev/null
+++ b/tensorflow/docs_src/community/lists.md
@@ -0,0 +1,51 @@
+# Mailing Lists
+
+As a community, we do much of our collaboration on public mailing lists.
+Please note that if you're looking for help using TensorFlow, [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow) and
+[GitHub issues](https://github.com/tensorflow/tensorflow/issues)
+are the best initial places to look. For more information,
+see [how to get help](/community/#get_help).
+
+## General TensorFlow lists
+
+* [announce](https://groups.google.com/a/tensorflow.org/d/forum/announce) - Low-volume announcements of new releases.
+* [discuss](https://groups.google.com/a/tensorflow.org/d/forum/discuss) - General community discussion around TensorFlow.
+* [developers](https://groups.google.com/a/tensorflow.org/d/forum/developers) - Discussion for developers contributing to TensorFlow.
+
+## Project-specific lists
+
+These projects inside the TensorFlow GitHub organization have lists dedicated to their communities:
+
+* [hub](https://groups.google.com/a/tensorflow.org/d/forum/hub) -
+  Discussion and collaboration around [TensorFlow Hub](https://github.com/tensorflow/hub).
+* [magenta-discuss](https://groups.google.com/a/tensorflow.org/d/forum/magenta-discuss) -
+  General discussion about [Magenta](https://magenta.tensorflow.org/)
+  development and directions.
+* [swift](https://groups.google.com/a/tensorflow.org/d/forum/swift) -
+  Community and collaboration around Swift for TensorFlow.
+* [tensor2tensor](https://groups.google.com/d/forum/tensor2tensor) - Discussion
+  and peer support for Tensor2Tensor.
+* [tfjs-announce](https://groups.google.com/a/tensorflow.org/d/forum/tfjs-announce) -
+  Announcements of new TensorFlow.js releases.
+* [tfjs](https://groups.google.com/a/tensorflow.org/d/forum/tfjs) - Discussion
+  and peer support for TensorFlow.js.
+* [tflite](https://groups.google.com/a/tensorflow.org/d/forum/tflite) - Discussion and
+  peer support for TensorFlow Lite.
+* [tpu-users](https://groups.google.com/a/tensorflow.org/d/forum/tpu-users) - Community discussion
+  and support for TPU users.
+
+## Special Interest Groups
+
+TensorFlow's [Special Interest
+Groups](/community/contributing#special_interest_groups) (SIGs) support
+community collaboration on particular project focuses. Members of these groups
+work together to build and support TensorFlow related projects. While their
+archives are public, different SIGs have their own membership policies.
+
+* [build](https://groups.google.com/a/tensorflow.org/d/forum/build) -
+  Supporting SIG Build, for build, distribution and packaging of TensorFlow.
+* [sig-tensorboard](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard) -
+  Supporting SIG TensorBoard, for plugin development and other contribution.
+* [rust](https://groups.google.com/a/tensorflow.org/d/forum/rust) -
+  Supporting SIG Rust, for the Rust language bindings.
diff --git a/tensorflow/docs_src/about/roadmap.md b/tensorflow/docs_src/community/roadmap.md
similarity index 50%
rename from tensorflow/docs_src/about/roadmap.md
rename to tensorflow/docs_src/community/roadmap.md
index 1f934acab69276d4c32393bb73632d978e0d15c3..0463ca05fe5353944acef004f3a5582c5caaa3b3 100644
--- a/tensorflow/docs_src/about/roadmap.md
+++ b/tensorflow/docs_src/community/roadmap.md
@@ -1,5 +1,5 @@
 # Roadmap
-**Last updated: Feb 15, 2018**
+**Last updated: Apr 27, 2018**
 
 TensorFlow is a rapidly moving, community supported project. This document is intended 
 to provide guidance about priorities and focus areas of the core set of TensorFlow 
@@ -14,12 +14,12 @@ expected in the next one to two releases.
 
 ### APIs
 #### High Level APIs:
-* Easy multi-GPU utilization with Estimators
+* Easy multi-GPU and TPU utilization with Estimators
 * Easy-to-use high-level pre-made estimators for Gradient Boosted Trees, Time Series, and other models
 
 #### Eager Execution:
 * Efficient utilization of multiple GPUs
-* Distributed training (multi-machine)
+* Distributed training support (multi-machine)
 * Performance improvements
 * Simpler export to a GraphDef/SavedModel 
 
@@ -31,14 +31,14 @@ to create Keras models Eager- style via Model subclassing)
 
 #### Official Models:
 * A set of 
-[reference models](https://github.com/tensorflow/models/tree/master/official) 
+[models](https://github.com/tensorflow/models/tree/master/official) 
 across image recognition, speech, object detection, and 
   translation that demonstrate best practices and serve as a starting point for 
   high-performance model development.
 
 #### Contrib:
-* Deprecation notices added to parts of tf.contrib where preferred implementations exist outside of tf.contrib.
-* As much as possible, large projects inside tf.contrib moved to separate repositories.
+* Deprecate parts of tf.contrib where preferred implementations exist outside of tf.contrib.
+* As much as possible, move large projects inside tf.contrib to separate repositories.
 * The tf.contrib module will eventually be discontinued in its current form, experimental development will in future happen in other repositories.
 
 
@@ -50,37 +50,72 @@ across image recognition, speech, object detection, and
 
 ### Platforms
 #### TensorFlow Lite:
-* Increased coverage of supported ops in TensorFlow Lite
+* Increase coverage of supported ops in TensorFlow Lite
 * Easier conversion of a trained TensorFlow graph for use on TensorFlow Lite
 * Support for GPU acceleration in TensorFlow Lite (iOS and Android)
 * Support for hardware accelerators via Android NeuralNets API 
-* Improved CPU performance by quantization and other network optimizations (eg. pruning, distillation)
-* Increased support for devices beyond Android and iOS (eg. RPi, Cortex-M)
+* Improve CPU performance by quantization and other network optimizations (eg. pruning, distillation)
+* Increase support for devices beyond Android and iOS (eg. RPi, Cortex-M)
+
+#### TensorFlow.js:
+* Release package for Node.js bindings to the TensorFlow C API through the TensorFlow.js backend interface
+* Expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser
+* Improve Layers API and allow model exporting/saving
+* Release tfjs-data API for efficient data input pipelines
+
+#### TensorFlow with Swift:
+* Establish open source project including documentation, open design, and code availability.
+* Continue implementing and refining implementation and design through 2018.
+* Aim for implementation to be solid enough for general use later in 2018.
 
 ### Performance
 #### Distributed TensorFlow:
-* Multi-GPU support optimized for a variety of GPU topologies
-* Improved mechanisms for distributing computations on several machines
+* Optimize Multi-GPU support for a variety of GPU topologies
+* Improve mechanisms for distributing computations on several machines
+
+#### GPU Optimizations:
+* Simplify mixed precision API with initial example model and guide.
+* Finalize TensorRT API and move to core.
+* CUDA 9.2 and NCCL 2.x default in TensorFlow builds.
+* Optimizations for DGX-2.
+* Remove support for CUDA less than 8.x and cuDNN less than 6.x.
 
-#### Optimizations:
-* Mixed precision training support with initial example model and guide
-* Native TensorRT support
+
+#### CPU Optimizations
 * Int8 support for SkyLake via MKL
 * Dynamic loading of SIMD-optimized kernels
+* MKL for Linux and Windows
+
+### End-to-end ML systems:
+#### TensorFlow Hub:
+* Expand support for module-types in TF Hub with TF Eager integration, Keras layers integration, and TensorFlow.js integration
+* Accept variable-sized image input
+* Improve multi-GPU estimator support
+* Document and improve TPU integration
+
+#### TensorFlow Extended:
+* Open source more of the TensorFlow Extended platform to facilitate adoption of TensorFlow in production settings.
+* Release TFX libraries for Data Validation
+
+### Documentation and Resources:
+* Update documentation, tutorials and Getting Started guides on all features and APIs
+* Update [Youtube Tensorflow channel](https://youtube.com/tensorflow) weekly with new content:
+Coding TensorFlow - where we teach folks coding with tensorflow
+TensorFlow Meets - where we highlight community contributions
+Ask TensorFlow - where we answer community questions
+Guest and Showcase videos
+* Update [Official TensorFlow blog](https://blog.tensorflow.org) with regular articles from Google team and the Community
 
-### Documentation and Usability:
-* Updated documentation, tutorials and Getting Started guides
-* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications
 
 ### Community and Partner Engagement
 #### Special Interest Groups: 
-* Mobilizing the community to work together in focused domains
-* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute)
-: build and packaging of TensorFlow
-* More to be identified and launched
+* Mobilize the community to work together in focused domains
+* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute): build and packaging of TensorFlow
+* SIG TensorBoard, SIG Rust, and more to be identified and launched
 
 #### Community:
 * Incorporate public feedback on significant design decisions via a Request-for-Comment (RFC) process
 * Formalize process for external contributions to land in TensorFlow and associated projects 
 * Grow global TensorFlow communities and user groups
 * Collaborate with partners to co-develop and publish research papers
+* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications
diff --git a/tensorflow/docs_src/community/security.md b/tensorflow/docs_src/community/security.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d13c7a1eaf336193b39ea3a9ee8e316a04fcb63
--- /dev/null
+++ b/tensorflow/docs_src/community/security.md
@@ -0,0 +1,7 @@
+# Using TensorFlow Securely
+
+Before using TensorFlow, please take a look at our security model, list of
+recent security announcements, and ways you can report security issues to the
+TensorFlow team at the
+[https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md](Using
+TensorFlow Securely) page on GitHub.
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
new file mode 100644
index 0000000000000000000000000000000000000000..a7da189a5c2f97e478822207d3b6ad72a2db6509
--- /dev/null
+++ b/tensorflow/docs_src/community/swift.md
@@ -0,0 +1,60 @@
+<p align="center">
+  <img src="../images/swift_tensorflow_logo.png">
+</p>
+
+# Swift for TensorFlow
+
+Welcome to the Swift for TensorFlow development community!
+
+Swift for TensorFlow is a new way to develop machine learning models. It
+gives you the power of
+[TensorFlow](https://www.tensorflow.org/programmers_guide/eager) directly
+integrated into the [Swift programming language](https://swift.org/about).
+With Swift, you can write the following imperative code, and Swift
+automatically turns it into **a single TensorFlow Graph** and runs it
+with the full performance of TensorFlow Sessions on CPU, GPU and
+[TPU](https://cloud.google.com/tpu/docs/tpus).
+
+```swift
+import TensorFlow
+
+var x = Tensor([[1, 2], [3, 4]])
+
+for i in 1...5 {
+  x += x ⊗ x
+}
+
+print(x)
+```
+
+Swift combines the flexibility of
+[Eager Execution](https://www.tensorflow.org/programmers_guide/eager) with the
+high performance of [Graphs and Sessions](https://www.tensorflow.org/programmers_guide/graphs).
+Behind the scenes, Swift analyzes your Tensor code and automatically builds
+graphs for you. Swift also catches type errors and shape mismatches before
+running your code, and has [Automatic Differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
+built right in. We believe that machine learning tools are so important that
+they deserve **a first-class language and a compiler**.
+
+**Note:** Swift for TensorFlow is an early stage research project. It has been
+released to enable open source development and is not yet ready for general use
+by machine learning developers.
+
+## Open Source
+
+We have released Swift for TensorFlow as an open-source project on GitHub!
+
+Our [documentation repository](https://github.com/tensorflow/swift) contains a
+[project overview](https://github.com/tensorflow/swift/blob/master/docs/DesignOverview.md)
+and [technical papers](https://github.com/tensorflow/swift/tree/master/docs)
+explaining specific areas in depth. There are also instructions for [installing
+pre-built packages](https://github.com/tensorflow/swift/blob/master/Installation.md)
+(for macOS and Ubuntu) as well as a simple
+[usage tutorial](https://github.com/tensorflow/swift/blob/master/Usage.md).
+
+Moving forward, we will use an open design model and all discussions will be
+public.
+
+[Sign up here to join the community Google
+group](https://groups.google.com/a/tensorflow.org/d/forum/swift), which we will
+use for announcements and general discussion.
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
deleted file mode 100644
index 9f6fe91b1490ef4ffe43acc877ecb83cc9121118..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/community/welcome.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# Welcome to the TensorFlow Community
-
-TensorFlow is an open-source project.  This page explains how to contribute,
-where to ask questions, and how to help each other.
-
-
-## Development
-
-The source code for TensorFlow is on
-[GitHub](https://github.com/tensorflow/tensorflow).
-
-Before contributing to TensorFlow source code, please review the
-[Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
-
-### Projects developed by the TensorFlow community
-
-The TensorFlow community has created many great projects around TensorFlow, including:
-
-* [Machine Learning with TensorFlow (Book & Code)](http://tensorflowbook.com)
-* [@jtoy's awesome "Awesome TensorFlow" list of awesome things](https://github.com/jtoy/awesome-tensorflow)
-* [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
-* [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
-* [Bitfusion's` GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
-* [Rust language bindings](https://github.com/google/tensorflow-rust)
-* [Operator Vectorization Library](https://github.com/opveclib/opveclib)
-* [Swift language bindings](https://github.com/PerfectlySoft/Perfect-TensorFlow)
-* [Sublime Tensorflow - A plugin for Sublime Text](https://github.com/baptisteArnaud/Sublime-Tensorflow)
-* [Edward - A library for probabilistic modeling, inference, and criticism](http://edwardlib.org) ([Github](https://github.com/blei-lab/edward), [Forum](https://discourse.edwardlib.org))
-* [GPflow - Gaussian processes in TensorFlow](https://github.com/GPflow/GPflow)
-* [CS 20SI: Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/) - Please note, this course was designed with TensorFlow v0.12, so some of the notes may be out of date - but it's still a great resource.
-
-## TensorFlow Communities Around the World
-
-Asia:
-
-* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
-* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
-* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
-* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
-
-
-Europe:
-
-* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
-* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
-
-
-
-## Support
-
-TensorFlow provides multiple communication paths.  To pick the right path,
-please read the following list carefully:
-
-  * To ask or answer technical questions about TensorFlow, use
-    [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).
-    For example, ask or search Stack Overflow about a particular error message
-    you encountered during installation.
-  * To join general discussions about TensorFlow development and directions,
-    please join the
-    [TensorFlow discuss mailing list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
-    For example, use this mailing list to learn about new features in
-    upcoming releases of TensorFlow.
-  * To report bugs or make feature requests, use the
-    [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
-    on GitHub.  For example, use the issue tracker to request a
-    new operation in TensorFlow.
-  * To report vulnerabilities, please follow our
-    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/SECURITY.md).
-
diff --git a/tensorflow/docs_src/deploy/deploy_to_js.md b/tensorflow/docs_src/deploy/deploy_to_js.md
new file mode 100644
index 0000000000000000000000000000000000000000..d7ce3ea90bda25a84c6dc8ca52e97b1613043c0b
--- /dev/null
+++ b/tensorflow/docs_src/deploy/deploy_to_js.md
@@ -0,0 +1,4 @@
+# Deploy to JavaScript
+
+You can find details about deploying JavaScript TensorFlow programs
+in the separate [js.tensorflow.org site](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index c682e7add16c741279aedb40c1b12f4ca8f0286a..93f5bd1ed20d34eaf7c9ef64ea89e5632331d5c1 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -2,3 +2,4 @@ index.md
 distributed.md
 hadoop.md
 s3.md
+deploy_to_js.md
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index 38f84286347622d1de0646cdc621d5fb1447e588..ef3b030e3277c1ff82b15949f0733ea78d3d0f10 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -1,22 +1,13 @@
 # How to run TensorFlow on S3
 
-This document describes how to run TensorFlow on S3 file system.
+Tensorflow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitious, and can help in situations where data must accessed by multiple actors, such as in distributed training.
 
-## S3
+This document guides you through the required setup, and provides examples on usage.
 
-We assume that you are familiar with @{$reading_data$reading data}.
-
-To use S3 with TensorFlow, change the file paths you use to read and write
-data to an S3 path. For example:
-
-```python
-filenames = ["s3://bucketname/path/to/file1.tfrecord",
-             "s3://bucketname/path/to/file2.tfrecord"]
-dataset = tf.data.TFRecordDataset(filenames)
-```
+## Configuration
 
 When reading or writing data on S3 with your TensorFlow program, the behavior
-could be controlled by various environmental variables:
+can be controlled by various environmental variables:
 
 *   **AWS_REGION**: By default, regional endpoint is used for S3, with region
     controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
@@ -28,7 +19,7 @@ could be controlled by various environmental variables:
 *   **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled
     with `S3_VERIFY_SSL=0`.
 
-To read or write objects in a bucket that is no publicly accessible,
+To read or write objects in a bucket that is not publicly accessible,
 AWS credentials must be provided through one of the following methods:
 
 *   Set credentials in the AWS credentials profile file on the local system,
@@ -38,3 +29,65 @@ AWS credentials must be provided through one of the following methods:
     variables.
 *   If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
     give the EC2 instance access to that role.
+
+## Example Setup
+
+Using the above information, we can configure Tensorflow to communicate to an S3 endpoint by setting the following environment variables:
+
+```bash
+AWS_ACCESS_KEY_ID=XXXXX                 # Credentials only needed if connecting to a private endpoint
+AWS_SECRET_ACCESS_KEY=XXXXX
+AWS_REGION=us-east-1                    # Region for the S3 bucket, this is not always needed. Default is us-east-1.
+S3_ENDPOINT=s3.us-east-1.amazonaws.com  # The S3 API Endpoint to connect to. This is specified in a HOST:PORT format.
+S3_USE_HTTPS=1                          # Whether or not to use HTTPS. Disable with 0.
+S3_VERIFY_SSL=1                         # If HTTPS is used, conterols if SSL should be enabled. Disable with 0.
+```
+
+## Usage
+
+Once setup is completed, Tensorflow can interact with S3 in a variety of ways. Anywhere there is a Tensorflow IO function, an S3 URL can be used.
+
+### Smoke Test
+
+To test your setup, stat a file:
+
+```python
+from tensorflow.python.lib.io import file_io
+print file_io.stat('s3://bucketname/path/')
+```
+
+You should see output similar to this:
+
+```console
+<tensorflow.python.pywrap_tensorflow_internal.FileStatistics; proxy of <Swig Object of type 'tensorflow::FileStatistics *' at 0x10c2171b0> >
+```
+
+### Reading Data
+
+When @{$reading_data$reading data}, change the file paths you use to read and write
+data to an S3 path. For example:
+
+```python
+filenames = ["s3://bucketname/path/to/file1.tfrecord",
+             "s3://bucketname/path/to/file2.tfrecord"]
+dataset = tf.data.TFRecordDataset(filenames)
+```
+
+### Tensorflow Tools
+
+Many Tensorflow tools, such as Tensorboard or model serving, can also take S3 URLS as arguments:
+
+```bash
+tensorboard --logdir s3://bucketname/path/to/model/
+tensorflow_model_server --port=9000 --model_name=model --model_base_path=s3://bucketname/path/to/model/export/
+```
+
+This enables an end to end workflow using S3 for all data needs.
+
+## S3 Endpoint Implementations
+
+S3 was invented by Amazon, but the S3 API has spread in popularity and has several implementations. The following implementations have passed basic compatibility tests:
+
+* [Amazon S3](https://aws.amazon.com/s3/)
+* [Google Storage](https://cloud.google.com/storage/docs/interoperability)
+* [Minio](https://www.minio.io/kubernetes.html)(Standalone mode only)
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index 06f11de4eb0ea7878b01cd37d994c5a40ec400be..bc0f662f0cf8054add41c4c677e369a9e1582343 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -225,7 +225,7 @@ it will use the `FooBarFileSystem` implementation.
 Next, you must build a shared object containing this implementation. An example
 of doing so using bazel's `cc_binary` rule can be found
 [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/BUILD#L244),
-but you may use any build system to do so. See the section on @{$adding_an_op#build-the-op-library$building the op library} for similar
+but you may use any build system to do so. See the section on @{$adding_an_op#build_the_op_library$building the op library} for similar
 instructions.
 
 The result of building this target is a `.so` shared object file.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 15075e1df8e703415b4acb8e53f76dc9a4a41b50..c3795492cef7d6f26de157ce40544280c8194e59 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -267,7 +267,7 @@ REGISTER_CPU(int32);
 #ifdef GOOGLE_CUDA
 #define REGISTER_GPU(T)                                          \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
-  extern template ExampleFunctor<GPUDevice, float>;              \
+  extern template ExampleFunctor<GPUDevice, T>;                  \
   REGISTER_KERNEL_BUILDER(                                       \
       Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       ExampleOp<GPUDevice, T>);
@@ -530,56 +530,58 @@ form [described below](#attr_types).
 
 For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
 instead of only the 0th element, you can register the op like so:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("preserve\_index: int")</b>
-    .Input("to\_zero: int32")
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("preserve_index: int")
+    .Input("to_zero: int32")
     .Output("zeroed: int32");
-</code></pre>
+```
 
 (Note that the set of [attribute types](#attr_types) is different from the
 @{tf.DType$tensor types} used for inputs and outputs.)
 
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {<b>
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {
     // Get the index of the value to preserve
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;GetAttr("preserve\_index", &preserve\_index\_));
-    // Check that preserve\_index is positive
-    OP\_REQUIRES(context, preserve\_index_ &gt;= 0,
-                errors::InvalidArgument("Need preserve\_index &gt;= 0, got ",
-                                        preserve\_index_));
-  </b>}
-  void Compute(OpKernelContext\* context) override {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("preserve_index", &preserve_index_));
+    // Check that preserve_index is positive
+    OP_REQUIRES(context, preserve_index_ >= 0,
+                errors::InvalidArgument("Need preserve_index >= 0, got ",
+                                        preserve_index_));
+  }
+  void Compute(OpKernelContext* context) override {
     // ...
   }
- <b>private:
-  int preserve\_index\_;</b>
+ private:
+  int preserve_index_;
 };
-</code></pre>
+```
 
 which can then be used in the `Compute` method:
-<pre class="prettyprint"><code class="lang-cpp">
-  void Compute(OpKernelContext\* context) override {
+```c++
+  void Compute(OpKernelContext* context) override {
     // ...
-<br/>
-    <b>// We're using saved attr to validate potentially dynamic input
-    // So we check that preserve\_index is in range
-    OP\_REQUIRES(context, preserve\_index_ &lt; input.dimension(0),
-                errors::InvalidArgument("preserve\_index out of range"));<br/>
-    </b>// Set all the elements of the output tensor to 0
+
+    // We're using saved attr to validate potentially dynamic input
+    // So we check that preserve_index is in range
+    OP_REQUIRES(context, preserve_index_ < input.dimension(0),
+                errors::InvalidArgument("preserve_index out of range"));
+
+    // Set all the elements of the output tensor to 0
     const int N = input.size();
     for (int i = 0; i < N; i++) {
       output\_flat(i) = 0;
-    }<br/>
-    <b>// Preserve the requested input value
-    output\_flat(preserve\_index\_) = input(preserve\_index\_);</b>
+    }
+
+    // Preserve the requested input value
+    output_flat(preserve_index_) = input(preserve_index_);
   }
-</code></pre>
+```
 
 #### Attr types
 
@@ -725,12 +727,12 @@ you would then register an `OpKernel` for each supported type.
 
 For instance, if you'd like the `ZeroOut` op to work on `float`s
 in addition to `int32`s, your op registration might look like:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Your op registration now specifies that the input's type must be `float`, or
 `int32`, and that its output will be the same type, since both have type `T`.
@@ -790,66 +792,73 @@ Your op registration now specifies that the input's type must be `float`, or
 >   """
 > ```
 
-<pre class="prettyprint"><code class="lang-cpp">
-\#include "tensorflow/core/framework/op_kernel.h"<br/>
-class ZeroOut<b>Int32</b>Op : public OpKernel {
+```c++
+#include "tensorflow/core/framework/op_kernel.h"
+
+class ZeroOutInt32Op : public OpKernel {
   // as before
-};<br/>
-class ZeroOut<b>Float</b>Op : public OpKernel {
+};
+
+class ZeroOutFloatOp : public OpKernel {
  public:
-  explicit ZeroOut<b>Float</b>Op(OpKernelConstruction\* context)
-      : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutFloatOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat&lt;<b>float</b>&gt;();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<float>();
+
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat&lt;<b>float</b>&gt;();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<float>();
+
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/><b>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    <b>.TypeConstraint&lt;int32&gt;("T"),</b>
-    ZeroOutOp<b>Int32</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOpInt32);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
     ZeroOutFloatOp);
-</b></code></pre>
+```
 
 > To preserve [backwards compatibility](#backwards-compatibility), you should
 > specify a [default value](#default-values-constraints) when adding an attr to
 > an existing op:
 >
-> <pre class="prettyprint"><code class="lang-cpp">
-> REGISTER\_OP("ZeroOut")
->   <b>.Attr("T: {float, int32} = DT_INT32")</b>
->   .Input("to\_zero: T")
+> ```c++
+> REGISTER_OP("ZeroOut")
+>   .Attr("T: {float, int32} = DT_INT32")
+>   .Input("to_zero: T")
 >   .Output("zeroed: T")
-> </code></pre>
+> ```
 
 Let's say you wanted to add more types, say `double`:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, <b>double,</b> int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, double, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index bdff60b39ec6fe939273a529ec4e46407cface8a..1ab0340ad983de891ef5e18a729c1e4fb3c4e0d9 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -16,9 +16,10 @@ TensorFlow:
     for your own file and record formats.
 
 Python is currently the only language supported by TensorFlow's API stability
-promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
+promises. However, TensorFlow also provides functionality in C++, Go, Java and
+[JavaScript](https://js.tensorflow.org),
 plus community support for [Haskell](https://github.com/tensorflow/haskell) and
-[Rust](https://github.com/tensorflow/rust).  If you'd like to create or
+[Rust](https://github.com/tensorflow/rust). If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index b9fd72978dd11046e5347b9bce2bddd345ca426b..9a968d365be15e087482c9dcf555b8c128a3e21d 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -112,11 +112,11 @@ There are a few ways to get a list of the `OpDef`s for the registered ops:
     to interpret the `OpDef` messages.
 -   The C++ function `OpRegistry::Global()->GetRegisteredOps()` returns the same
     list of all registered `OpDef`s (defined in
-    [`tensorflow/core/framework/op.h`]). This can be used to write the generator
+    [`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h)). This can be used to write the generator
     in C++ (particularly useful for languages that do not have protocol buffer
     support).
 -   The ASCII-serialized version of that list is periodically checked in to
-    [`tensorflow/core/ops/ops.pbtxt`] by an automated process.
+    [`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt) by an automated process.
 
 The `OpDef` specifies the following:
 
@@ -159,7 +159,7 @@ between the generated code and the `OpDef`s checked into the repository, but is
 useful for languages where code is expected to be generated ahead of time like
 `go get` for Go and `cargo ops` for Rust. At the other end of the spectrum, for
 some languages the code could be generated dynamically from
-[`tensorflow/core/ops/ops.pbtxt`].
+[`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt).
 
 #### Handling Constants
 
@@ -229,6 +229,3 @@ and "while") is not available in languages other than Python. This will be
 updated when the [C API] provides necessary support.
 
 [C API]: https://www.tensorflow.org/code/tensorflow/c/c_api.h
-[`tensorflow/core/ops/ops.pbtxt`]: https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt
-[`tensorflow/python/BUILD`]: https://www.tensorflow.org/code/tensorflow/python/BUILD
-[`tensorflow/core/framework/op.h`]: https://www.tensorflow.org/code/tensorflow/core/framework/op.h
diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index b3cc96804740991ada56a9b7f60439a63e9eb895..2c33a6b6f7e5f1faf04d38e95b74d184134a1edf 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -1,4 +1,4 @@
-# Custom Data Readers
+# Reading custom file and record formats
 
 PREREQUISITES:
 
@@ -9,187 +9,273 @@ PREREQUISITES:
 
 We divide the task of supporting a file format into two pieces:
 
-*   File formats: We use a *Reader* Op to read a *record* (which can be any
-    string) from a file.
-*   Record formats: We use decoder or parsing Ops to turn a string record
+*   File formats: We use a reader `tf.data.Dataset` to read raw *records* (which
+    are typically represented by scalar string tensors, but can have more
+    structure) from a file.
+*   Record formats: We use decoder or parsing ops to turn a string record
     into tensors usable by TensorFlow.
 
 For example, to read a
 [CSV file](https://en.wikipedia.org/wiki/Comma-separated_values), we use
-@{tf.TextLineReader$a Reader for text files}
-followed by
-@{tf.decode_csv$an Op that parses CSV data from a line of text}.
+@{tf.data.TextLineDataset$a dataset for reading text files line-by-line}
+and then @{tf.data.Dataset.map$map} an
+@{tf.decode_csv$op} that parses CSV data from each line of text in the dataset.
 
 [TOC]
 
-## Writing a Reader for a file format
+## Writing a `Dataset` for a file format
 
-A `Reader` is something that reads records from a file.  There are some examples
-of Reader Ops already built into TensorFlow:
+A @{tf.data.Dataset} represents a sequence of *elements*, which can be the
+individual records in a file. There are several examples of "reader" datasets
+that are already built into TensorFlow:
 
-*   @{tf.TFRecordReader}
-    ([source in `kernels/tf_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/tf_record_reader_op.cc))
-*   @{tf.FixedLengthRecordReader}
-    ([source in `kernels/fixed_length_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/fixed_length_record_reader_op.cc))
-*   @{tf.TextLineReader}
-    ([source in `kernels/text_line_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/text_line_reader_op.cc))
+*   @{tf.data.TFRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.FixedLengthRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.TextLineDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
 
-You can see these all expose the same interface, the only differences
-are in their constructors.  The most important method is `read`.
-It takes a queue argument, which is where it gets filenames to
-read from whenever it needs one (e.g. when the `read` op first runs, or
-the previous `read` reads the last record from a file).  It produces
-two scalar tensors: a string key and a string value.
+Each of these implementations comprises three related classes:
 
-To create a new reader called `SomeReader`, you will need to:
+* A `tensorflow::DatasetOpKernel` subclass (e.g. `TextLineDatasetOp`), which
+  tells TensorFlow how to construct a dataset object from the inputs to and
+  attrs of an op, in its `MakeDataset()` method.
 
-1.  In C++, define a subclass of
-    [`tensorflow::ReaderBase`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h)
-    called `SomeReader`.
-2.  In C++, register a new reader op and kernel with the name `"SomeReader"`.
-3.  In Python, define a subclass of @{tf.ReaderBase} called `SomeReader`.
+* A `tensorflow::GraphDatasetBase` subclass (e.g. `TextLineDatasetOp::Dataset`),
+  which represents the *immutable* definition of the dataset itself, and tells
+  TensorFlow how to construct an iterator object over that dataset, in its
+  `MakeIterator()` method.
 
-You can put all the C++ code in a file in
-`tensorflow/core/user_ops/some_reader_op.cc`. The code to read a file will live
-in a descendant of the C++ `ReaderBase` class, which is defined in
-[`tensorflow/core/kernels/reader_base.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h).
-You will need to implement the following methods:
+* A `tensorflow::DatasetIterator<Dataset>` subclass (e.g.
+  `TextLineDatasetOp::Dataset::Iterator`), which represents the *mutable* state
+  of an iterator over a particular dataset, and tells TensorFlow how to get the
+  next element from the iterator, in its `GetNextInternal()` method.
 
-*   `OnWorkStartedLocked`: open the next file
-*   `ReadLocked`: read a record or report EOF/error
-*   `OnWorkFinishedLocked`: close the current file, and
-*   `ResetLocked`: get a clean slate after, e.g., an error
+The most important method is the `GetNextInternal()` method, since it defines
+how to actually read records from the file and represent them as one or more
+`Tensor` objects.
 
-These methods have names ending in "Locked" since `ReaderBase` makes sure
-to acquire a mutex before calling any of these methods, so you generally don't
-have to worry about thread safety (though that only protects the members of the
-class, not global state).
+To create a new reader dataset called (for example) `MyReaderDataset`, you will
+need to:
 
-For `OnWorkStartedLocked`, the name of the file to open is the value returned by
-the `current_work()` method.  `ReadLocked` has this signature:
+1. In C++, define subclasses of `tensorflow::DatasetOpKernel`,
+   `tensorflow::GraphDatasetBase`, and `tensorflow::DatasetIterator<Dataset>`
+   that implement the reading logic.
+2. In C++, register a new reader op and kernel with the name
+   `"MyReaderDataset"`.
+3. In Python, define a subclass of @{tf.data.Dataset} called `MyReaderDataset`.
 
-```c++
-Status ReadLocked(string* key, string* value, bool* produced, bool* at_end)
-```
-
-If `ReadLocked` successfully reads a record from the file, it should fill in:
-
-*   `*key`: with an identifier for the record, that a human could use to find
-    this record again.  You can include the filename from `current_work()`,
-    and append a record number or whatever.
-*   `*value`: with the contents of the record.
-*   `*produced`: set to `true`.
-
-If you hit the end of a file (EOF), set `*at_end` to `true`.  In either case,
-return `Status::OK()`.  If there is an error, simply return it using one of the
-helper functions from
-[`tensorflow/core/lib/core/errors.h`](https://www.tensorflow.org/code/tensorflow/core/lib/core/errors.h)
-without modifying any arguments.
-
-Next you will create the actual Reader op.  It will help if you are familiar
-with @{$adding_an_op$the adding an op how-to}.  The main steps
-are:
-
-*   Registering the op.
-*   Define and register an `OpKernel`.
-
-To register the op, you will use a `REGISTER_OP` call defined in
-[`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h).
-Reader ops never take any input and always have a single output with type
-`resource`.  They should have string `container` and `shared_name` attrs.
-You may optionally define additional attrs
-for configuration or include documentation in a `Doc`.  For examples, see
-[`tensorflow/core/ops/io_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/ops/io_ops.cc),
-e.g.:
+You can put all the C++ code in a single file, such as
+`my_reader_dataset_op.cc`. It will help if you are
+familiar with @{$adding_an_op$the adding an op how-to}. The following skeleton
+can be used as a starting point for your implementation:
 
 ```c++
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
-REGISTER_OP("TextLineReader")
-    .Output("reader_handle: resource")
-    .Attr("skip_header_lines: int = 0")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the lines of a file delimited by '\n'.
-)doc");
-```
-
-To define an `OpKernel`, Readers can use the shortcut of descending from
-`ReaderOpKernel`, defined in
-[`tensorflow/core/framework/reader_op_kernel.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_op_kernel.h),
-and implement a constructor that calls `SetReaderFactory`.  After defining
-your class, you will need to register it using `REGISTER_KERNEL_BUILDER(...)`.
-An example with no attrs:
+namespace tensorflow {
+namespace {
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TFRecordReaderOp : public ReaderOpKernel {
+class MyReaderDatasetOp : public DatasetOpKernel {
  public:
-  explicit TFRecordReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    Env* env = context->env();
-    SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); });
-  }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU),
-                        TFRecordReaderOp);
-```
+  MyReaderDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    // Parse and validate any attrs that define the dataset using
+    // `ctx->GetAttr()`, and store them in member variables.
+  }
 
-An example with attrs:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    // Parse and validate any input tensors 0that define the dataset using
+    // `ctx->input()` or the utility function
+    // `ParseScalarArgument<T>(ctx, &arg)`.
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TextLineReaderOp : public ReaderOpKernel {
- public:
-  explicit TextLineReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    int skip_header_lines = -1;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("skip_header_lines", &skip_header_lines));
-    OP_REQUIRES(context, skip_header_lines >= 0,
-                errors::InvalidArgument("skip_header_lines must be >= 0 not ",
-                                        skip_header_lines));
-    Env* env = context->env();
-    SetReaderFactory([this, skip_header_lines, env]() {
-      return new TextLineReader(name(), skip_header_lines, env);
-    });
+    // Create the dataset object, passing any (already-validated) arguments from
+    // attrs or input tensors.
+    *output = new Dataset(ctx);
   }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
-                        TextLineReaderOp);
-```
-
-The last step is to add the Python wrapper.  You can either do this by
-@{$adding_an_op#building_the_op_library$compiling a dynamic library}
-or, if you are building TensorFlow from source, adding to `user_ops.py`.
-For the latter, you will import `tensorflow.python.ops.io_ops` in
-[`tensorflow/python/user_ops/user_ops.py`](https://www.tensorflow.org/code/tensorflow/python/user_ops/user_ops.py)
-and add a descendant of [`io_ops.ReaderBase`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx) : GraphDatasetBase(ctx) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::MyReader")}));
+    }
+
+    // Record structure: Each record is represented by a scalar string tensor.
+    //
+    // Dataset elements can have a fixed number of components of different
+    // types and shapes; replace the following two methods to customize this
+    // aspect of the dataset.
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "MyReaderDatasetOp::Dataset"; }
+
+   protected:
+    // Optional: Implementation of `GraphDef` serialization for this dataset.
+    //
+    // Implement this method if you want to be able to save and restore
+    // instances of this dataset (and any iterators over it).
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      // Construct nodes to represent any of the input tensors from this
+      // object's member variables using `b->AddScalar()` and `b->AddVector()`.
+      std::vector<Node*> input_tensors;
+      TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      // Implementation of the reading logic.
+      //
+      // The example implementation in this file yields the string "MyReader!"
+      // ten times. In general there are three cases:
+      //
+      // 1. If an element is successfully read, store it as one or more tensors
+      //    in `*out_tensors`, set `*end_of_sequence = false` and return
+      //    `Status::OK()`.
+      // 2. If the end of input is reached, set `*end_of_sequence = true` and
+      //    return `Status::OK()`.
+      // 3. If an error occurs, return an error status using one of the helper
+      //    functions from "tensorflow/core/lib/core/errors.h".
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        // NOTE: `GetNextInternal()` may be called concurrently, so it is
+        // recommended that you protect the iterator state with a mutex.
+        mutex_lock l(mu_);
+        if (i_ < 10) {
+          // Create a scalar string tensor and add it to the output.
+          Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+          record_tensor.scalar<string>()() = "MyReader!";
+          out_tensors->emplace_back(std::move(record_tensor));
+          ++i_;
+          *end_of_sequence = false;
+        } else {
+          *end_of_sequence = true;
+        }
+        return Status::OK();
+      }
+
+     protected:
+      // Optional: Implementation of iterator state serialization for this
+      // iterator.
+      //
+      // Implement these two methods if you want to be able to save and restore
+      // instances of this iterator.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+        return Status::OK();
+      }
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+    };
+  };
+};
 
-```python
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import common_shapes
-from tensorflow.python.ops import io_ops
+// Register the op definition for MyReaderDataset.
+//
+// Dataset ops always have a single output, of type `variant`, which represents
+// the constructed `Dataset` object.
+//
+// Add any attrs and input tensors that define the dataset here.
+REGISTER_OP("MyReaderDataset")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
 
-class SomeReader(io_ops.ReaderBase):
+// Register the kernel implementation for MyReaderDataset.
+REGISTER_KERNEL_BUILDER(Name("MyReaderDataset").Device(DEVICE_CPU),
+                        MyReaderDatasetOp);
 
-    def __init__(self, name=None):
-        rr = gen_user_ops.some_reader(name=name)
-        super(SomeReader, self).__init__(rr)
+}  // namespace
+}  // namespace tensorflow
+```
 
+The last step is to build the C++ code and add a Python wrapper. The easiest way
+to do this is by @{$adding_an_op#build_the_op_library$compiling a dynamic
+library} (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
+that subclasses @{tf.data.Dataset} to wrap it. An example Python program is
+given here:
 
-ops.NotDifferentiable("SomeReader")
+```python
+import tensorflow as tf
+
+# Assumes the file is in the current working directory.
+my_reader_dataset_module = tf.load_op_library("./my_reader_dataset_op.so")
+
+class MyReaderDataset(tf.data.Dataset):
+
+  def __init__(self):
+    super(MyReaderDataset, self).__init__()
+    # Create any input attrs or tensors as members of this class.
+
+  def _as_variant_tensor(self):
+    # Actually construct the graph node for the dataset op.
+    #
+    # This method will be invoked when you create an iterator on this dataset
+    # or a dataset derived from it.
+    return my_reader_dataset_module.my_reader_dataset()
+
+  # The following properties define the structure of each element: a scalar
+  # `tf.string` tensor. Change these properties to match the `output_dtypes()`
+  # and `output_shapes()` methods of `MyReaderDataset::Dataset` if you modify
+  # the structure of each element.
+  @property
+  def output_types(self):
+    return tf.string
+
+  @property
+  def output_shapes(self):
+    return tf.TensorShape([])
+
+  @property
+  def output_classes(self):
+    return tf.Tensor
+
+if __name__ == "__main__":
+  # Create a MyReaderDataset and print its elements.
+  with tf.Session() as sess:
+    iterator = MyReaderDataset().make_one_shot_iterator()
+    next_element = iterator.get_next()
+    try:
+      while True:
+        print(sess.run(next_element))  # Prints "MyReader!" ten times.
+    except tf.errors.OutOfRangeError:
+      pass
 ```
 
-You can see some examples in
-[`tensorflow/python/ops/io_ops.py`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+You can see some examples of `Dataset` wrapper classes in
+[`tensorflow/python/data/ops/dataset_ops.py`](https://www.tensorflow.org/code/tensorflow/python/data/ops/dataset_ops.py).
 
 ## Writing an Op for a record format
 
@@ -201,9 +287,7 @@ track down where the bad data came from.
 
 Examples of Ops useful for decoding records:
 
-*   @{tf.parse_single_example}
-    (and
-    @{tf.parse_example})
+*   @{tf.parse_single_example} (and @{tf.parse_example})
 *   @{tf.decode_csv}
 *   @{tf.decode_raw}
 
@@ -211,11 +295,6 @@ Note that it can be useful to use multiple Ops to decode a particular record
 format.  For example, you may have an image saved as a string in
 [a `tf.train.Example` protocol buffer](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 Depending on the format of that image, you might take the corresponding output
-from a
-@{tf.parse_single_example}
-op and call @{tf.image.decode_jpeg},
-@{tf.image.decode_png}, or
-@{tf.decode_raw}.  It is common to
-take the output of `tf.decode_raw` and use
-@{tf.slice} and
-@{tf.reshape} to extract pieces.
+from a @{tf.parse_single_example} op and call @{tf.image.decode_jpeg},
+@{tf.image.decode_png}, or @{tf.decode_raw}.  It is common to take the output
+of `tf.decode_raw` and use @{tf.slice} and @{tf.reshape} to extract pieces.
diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md
index dfa2110e691167f54e6ea8b7a832f0a88d0ec41a..8dfd91e3c8368f4a649c5b5fa3947e97441ef390 100644
--- a/tensorflow/docs_src/get_started/checkpoints.md
+++ b/tensorflow/docs_src/get_started/checkpoints.md
@@ -38,8 +38,10 @@ Estimators automatically write the following to disk:
     uses to create visualizations.
 
 To specify the top-level directory in which the Estimator stores its
-information, assign a value to the optional `model_dir` argument of any
-Estimator's constructor.  For example, the following code sets the `model_dir`
+information, assign a value to the optional `model_dir` argument of *any*
+`Estimator`'s constructor.
+Taking `DNNClassifier` as an example,
+the following code sets the `model_dir`
 argument to the `models/iris` directory:
 
 ```python
@@ -154,7 +156,7 @@ classifier = tf.estimator.DNNClassifier(
 
 The first time you call an Estimator's `train` method, TensorFlow saves a
 checkpoint to the `model_dir`. Each subsequent call to the Estimator's
-`train`, `eval`, or `predict` method causes the following:
+`train`, `evaluate`, or `predict` method causes the following:
 
 1.  The Estimator builds the model's
     [graph](https://developers.google.com/machine-learning/glossary/#graph)
@@ -222,7 +224,7 @@ does not match the shape stored in checkpoint: [20]
 
 To run experiments in which you train and compare slightly different
 versions of a model, save a copy of the code that created each
-`model-dir`, possibly by creating a separate git branch for each version.
+`model_dir`, possibly by creating a separate git branch for each version.
 This separation will keep your checkpoints recoverable.
 
 ## Summary
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 42a246678a054d637fea5a82a03ecb84ff412bd9..275cda12bc397e1a8a980f6c97e6b2d97c5e64e8 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -164,9 +164,9 @@ To implement a typical model function, you must do the following:
 * [Define the model](#define_the_model).
 * Specify additional calculations for each of
   the [three different modes](#modes):
-  * [Predict](#predict)
-  * [Evaluate](#evaluate)
-  * [Train](#train)
+    * [Predict](#predict)
+    * [Evaluate](#evaluate)
+    * [Train](#train)
 
 ## Define the model
 
@@ -213,7 +213,7 @@ is connected to every node in the preceding layer.  Here's the relevant code:
 ```
 
 * The `units` parameter defines the number of output neurons in a given layer.
-* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#a) —
+* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#activation_function) —
   [Relu](https://developers.google.com/machine-learning/glossary/#ReLU) in this
   case.
 
@@ -546,8 +546,8 @@ In brief, here's what the three graphs tell you:
 
 * accuracy: The accuracy is recorded by the following two lines:
 
-  * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
-  * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
+    * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation.
+    * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
 
 These tensorboard graphs are one of the main reasons it's important to pass a
 `global_step` to your optimizer's `minimize` method. The model can't record
diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md
index bc69773d2138f5bf280015b61f1f243fd874bdac..c972e5e555eea1fab5a67fdecf13264897785519 100644
--- a/tensorflow/docs_src/get_started/datasets_quickstart.md
+++ b/tensorflow/docs_src/get_started/datasets_quickstart.md
@@ -265,9 +265,6 @@ ds = tf.data.TextLineDataset(train_path).skip(1)
 
 ### Build a csv line parser
 
-Ultimately we will need to parse each of the lines in the dataset, to
-produce the necessary `(features, label)` pairs.
-
 We will start by building a function to parse a single line.
 
 The following `iris_data.parse_line` function accomplishes this task using the
diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad89f0154c06d97673bdb0d598ca5387c61bc6ac
--- /dev/null
+++ b/tensorflow/docs_src/get_started/eager.md
@@ -0,0 +1,3 @@
+# Get Started with Eager Execution
+
+[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/get_started/eager.ipynb)
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
index ad3e1fe3e3a4e3f5278e76bcaa0fc8eee2faf374..79c26679793f2f2830ce772708bb72c143b1ec9a 100644
--- a/tensorflow/docs_src/get_started/feature_columns.md
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -138,7 +138,7 @@ The model will represent the buckets as follows:
 |< 1960               | [1, 0, 0, 0] |
 |>= 1960 but < 1980   | [0, 1, 0, 0] |
 |>= 1980 but < 2000   | [0, 0, 1, 0] |
-|> 2000               | [0, 0, 0, 1] |
+|>= 2000              | [0, 0, 0, 1] |
 
 Why would you want to split a number—a perfectly valid input to your
 model—into a categorical value? Well, notice that the categorization splits a
@@ -146,10 +146,10 @@ single input number into a four-element vector. Therefore, the model now can
 learn _four individual weights_ rather than just one; four weights creates a
 richer model than one weight. More importantly, bucketizing enables the model
 to clearly distinguish between different year categories since only one of the
-elements is set (1) and the other three elements are cleared (0). When we just
-use a single number (a year) as input, the model can only learn a linear
-relationship. So, bucketing provides the model with additional flexibility that
-the model can use to learn.
+elements is set (1) and the other three elements are cleared (0). For example,
+when we just use a single number (a year) as input, a linear model can only
+learn a linear relationship. So, bucketing provides the model with additional
+flexibility that the model can use to learn.
 
 The following code demonstrates how to create a bucketized feature:
 
@@ -242,7 +242,7 @@ on an explicit vocabulary list. For example:
 # the elements in the vocabulary list.
 vocabulary_feature_column =
     tf.feature_column.categorical_column_with_vocabulary_list(
-        key="a feature returned by input_fn()",
+        key=feature_name_from_input_fn,
         vocabulary_list=["kitchenware", "electronics", "sports"])
 ```
 
@@ -259,7 +259,7 @@ you place the vocabulary words in a separate file. For example:
 # the elements in the vocabulary file
 vocabulary_feature_column =
     tf.feature_column.categorical_column_with_vocabulary_file(
-        key="a feature returned by input_fn()",
+        key=feature_name_from_input_fn,
         vocabulary_file="product_class.txt",
         vocabulary_size=3)
 ```
@@ -364,7 +364,7 @@ def make_dataset(latitude, longitude, labels):
     return tf.data.Dataset.from_tensor_slices((features, labels))
 
 
-# Bucketize the latitude and longitude usig the `edges`
+# Bucketize the latitude and longitude using the `edges`
 latitude_bucket_fc = tf.feature_column.bucketized_column(
     tf.feature_column.numeric_column('latitude'),
     list(atlanta.latitude.edges))
diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
index b88483be699630d2275850cbc7c461eeb90f5943..fbe0ed74f82bb34bc55dd7bab5819c0d9fdc54e9 100644
--- a/tensorflow/docs_src/get_started/get_started_for_beginners.md
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -1,4 +1,4 @@
-# Getting Started for ML Beginners
+# Get Started with Graph Execution
 
 This document explains how to use machine learning to classify (categorize)
 Iris flowers by species.  This document dives deeply into the TensorFlow
@@ -14,6 +14,11 @@ If you are already familiar with basic machine learning concepts
 but are new to TensorFlow, read
 @{$premade_estimators$Getting Started with TensorFlow: for ML Experts}.
 
+If you'd like to learn a lot about the basics of Machine Learning,
+consider taking
+[Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/).
+
+
 ## The Iris classification problem
 
 Imagine you are a botanist seeking an automated way to classify each
@@ -86,6 +91,9 @@ a number.  Here's the representation scheme:
 * 1 represents versicolor
 * 2 represents virginica
 
+For a look at other examples of labels and examples, see the
+[ML Terminology section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/framing/ml-terminology).
+
 
 ## Models and training
 
@@ -371,7 +379,7 @@ There are several categories of neural networks.
 We'll be using a [**fully connected neural
 network**](https://developers.google.com/machine-learning/glossary/#fully_connected_layer),
 which means that the neurons in one layer take inputs from *every* neuron in
-the previous layer.  For example, the following figure illustrates a 
+the previous layer.  For example, the following figure illustrates a
 fully connected neural network consisting of three hidden layers:
 
 *   The first hidden layer contains four neurons.
@@ -385,6 +393,9 @@ fully connected neural network consisting of three hidden layers:
 **A neural network with three hidden layers.**
 <p>&nbsp;</p>
 
+For a more detailed introduction to neural networks, see the
+[Introduction to Neural Nets section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/introduction-to-neural-networks/anatomy).
+
 To specify a model type, instantiate an
 [**Estimator**](https://developers.google.com/machine-learning/glossary/#Estimators)
 class.  TensorFlow provides two categories of Estimators:
@@ -448,9 +459,9 @@ will become very important.
 
 ### Train the model
 
-Instantiating a `tf.Estimator.DNNClassifier` creates a framework for learning 
-the model. Basically, we've wired a network but haven't yet let data flow 
-through it. To train the neural network, call the Estimator object's `train` 
+Instantiating a `tf.Estimator.DNNClassifier` creates a framework for learning
+the model. Basically, we've wired a network but haven't yet let data flow
+through it. To train the neural network, call the Estimator object's `train`
 method. For example:
 
 ```python
@@ -559,15 +570,15 @@ of 0.5.  The following suggests a more effective model:
     <th colspan="1">Label</th>
     <th colspan="1">Prediction</th>
   </tr>
-  <tr> <td>5.9</td> <td>3.0</td> <td>4.3</td> <td>1.5</td> <td>1</td> 
+  <tr> <td>5.9</td> <td>3.0</td> <td>4.3</td> <td>1.5</td> <td>1</td>
           <td style="background-color:green">1</td></tr>
-  <tr> <td>6.9</td> <td>3.1</td> <td>5.4</td> <td>2.1</td> <td>2</td> 
+  <tr> <td>6.9</td> <td>3.1</td> <td>5.4</td> <td>2.1</td> <td>2</td>
           <td style="background-color:green">2</td></tr>
-  <tr> <td>5.1</td> <td>3.3</td> <td>1.7</td> <td>0.5</td> <td>0</td> 
+  <tr> <td>5.1</td> <td>3.3</td> <td>1.7</td> <td>0.5</td> <td>0</td>
           <td style="background-color:green">0</td></tr>
-  <tr> <td>6.0</td> <td>3.4</td> <td>4.5</td> <td>1.6</td> <td>1</td> 
+  <tr> <td>6.0</td> <td>3.4</td> <td>4.5</td> <td>1.6</td> <td>1</td>
           <td style="background-color:red">2</td></tr>
-  <tr> <td>5.5</td> <td>2.5</td> <td>4.0</td> <td>1.3</td> <td>1</td> 
+  <tr> <td>5.5</td> <td>2.5</td> <td>4.0</td> <td>1.3</td> <td>1</td>
           <td style="background-color:green">1</td></tr>
 </table>
 
@@ -631,6 +642,10 @@ Test set accuracy: 0.967
 An accuracy of 0.967 implies that our trained model correctly classified 29
 out of the 30 Iris species in the test set.
 
+To get a deeper understanding of different metrics for evaluating
+models, see the
+[Classification section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/classification).
+
 
 ### Predicting
 
@@ -723,7 +738,6 @@ Prediction is "Virginica" (97.9%), expected "Virginica"
 
 ## Summary
 
-<!--TODO(barryr): When MLCC is released, add pointers to relevant sections.-->
 This document provides a short introduction to machine learning.
 
 Because `premade_estimators.py` relies on high-level APIs, much of the
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index b7bd1286e3ce9026df49718d94cf53cf784a3be8..578080bb592d93902e7fc05cb3f74754ed188d4b 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -1,10 +1,22 @@
-# Getting Started
+# Get Started
+
+If you are new to machine learning, we recommend taking the following online
+course prior to diving into TensorFlow documentation:
+
+  * [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/),
+    which introduces machine learning concepts and encourages experimentation
+    with existing TensorFlow code.
 
 TensorFlow is a tool for machine learning. While it contains a wide range of
 functionality, TensorFlow is mainly designed for deep neural network models.
 
-TensorFlow provides many APIs. This section focuses on the high-level APIs.
-If you are new to TensorFlow, begin by reading one of the following documents:
+The easiest way to get started with TensorFlow is by using Eager Execution.
+
+  * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow.
+
+TensorFlow provides many APIs. The remainder of this section focuses on the
+Estimator API which provide scalable, high-performance models.
+To get started with Estimators, begin by reading one of the following documents:
 
   * @{$get_started/get_started_for_beginners}, which is aimed at readers
     new to machine learning.
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 437791d6a32db3e43415e381a034424ae8225f6f..4c12f0d84b3d13e4d9ececcb4559e806486b4120 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -1,10 +1,14 @@
 index.md
 
-### Getting Started
+### Beginners
+eager.md
 get_started_for_beginners.md
 premade_estimators.md
 
-### Details
+### Estimators
+get_started_for_beginners.md: For Beginners
+premade_estimators.md: Premade Estimators
+>>>
 checkpoints.md
 feature_columns.md
 datasets_quickstart.md
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index 6bffd2e065548a42eb726df34542ecc7480ad38d..4be7e508f94074f20d07e271259bf77074dd19e3 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -1,5 +1,4 @@
-
-# Getting Started with TensorFlow
+# Premade Estimators
 
 This document introduces the TensorFlow programming environment and shows you
 how to solve the Iris classification problem in TensorFlow.
@@ -397,9 +396,9 @@ predictions and their probabilities:
 
 
 ``` python
-for pred_dict, expec in zip(predictions, expected):
-    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
+template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
 
+for pred_dict, expec in zip(predictions, expected):
     class_id = pred_dict['class_ids'][0]
     probability = pred_dict['probabilities'][class_id]
 
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md
index 3c8488643f071c147dfbc4e0b4b4760b0a817718..4f85383925bbb8a03372b020e448a0e604f3b999 100644
--- a/tensorflow/docs_src/install/index.md
+++ b/tensorflow/docs_src/install/index.md
@@ -3,7 +3,7 @@
 We've built and tested TensorFlow on the following 64-bit laptop/desktop
 operating systems:
 
-  * MacOS X 10.11 (El Capitan) or later.
+  * macOS 10.12.6 (Sierra) or later.
   * Ubuntu 16.04 or later
   * Windows 7 or later.
 
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index f3620cf687359ebc4abfc3365beb3da694ec7baf..8c165aad52499acdbdbcb1de81dc86a9b15ffbc1 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -15,7 +15,7 @@ instructions might also work on other variants, we have only tested
 following requirements:
 
   * Linux, 64-bit, x86
-  * macOS X, Version 10.11 (El Capitan) or higher
+  * macOS X, Version 10.12.6 (Sierra) or higher
 
 
 ## Installation
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
@@ -113,6 +113,6 @@ If executing `a.out` fails, ask yourself the following questions:
   * Did you export those environment variables?
 
 If you are still seeing build or execution error messages, search (or post to)
-[StackOverflow](www.stackoverflow.com/questions/tagged/tensorflow) for
+[StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow) for
 possible solutions.
 
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 4bf4bacaecb88c9335cbe5ccbd7e6557cd21aca6..26cbcc9a9b0a99c8037faf109a8be295b43423c2 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -17,7 +17,7 @@ instructions might also work on other variants, we have only tested
 following requirements:
 
   * Linux, 64-bit, x86
-  * macOS X, 10.11 (El Capitan) or higher
+  * macOS X, 10.12.6 (Sierra) or higher
 
 
 ## Installation
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc1.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index a63a6c7ebe29c70c547336831ee12e51d49f851e..05b287870174870b6b808880df266f8453f5efa3 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -18,7 +18,7 @@ instructions might also work on other variants, we have only tested
 following requirements:
 
   * Ubuntu 16.04 or higher; 64-bit, x86
-  * macOS X 10.11 (El Capitan) or higher
+  * macOS 10.12.6 (Sierra) or higher
   * Windows 7 or higher; 64-bit, x86
 
 The installation instructions for Android are in a separate
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.6.0-rc1</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 ```
 
@@ -65,11 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-<<<<<<< HEAD
-                 <version>1.6.0-rc1</version>
-=======
-                 <version>1.6.0-rc0</version>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+                 <version>1.8.0-rc1</version>
                </dependency>
              </dependencies>
          </project>
@@ -97,6 +93,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 
               // Execute the "MyConst" operation in a Session.
               try (Session s = new Session(g);
+                   // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
                    Tensor output = s.runner().fetch("MyConst").run().get(0)) {
                 System.out.println(new String(output.bytesValue(), "UTF-8"));
               }
@@ -127,20 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-<<<<<<< HEAD
-  <version>1.6.0-rc1</version>
-=======
-  <version>1.6.0-rc0</version>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+  <version>1.8.0-rc1</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-<<<<<<< HEAD
-  <version>1.6.0-rc1</version>
-=======
-  <version>1.6.0-rc0</version>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+  <version>1.8.0-rc1</version>
 </dependency>
 ```
 
@@ -159,11 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-<<<<<<< HEAD
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc1.jar),
-=======
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -182,11 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-<<<<<<< HEAD
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc1.tar.gz" |
-=======
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -194,17 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-<<<<<<< HEAD
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc1.zip).
-=======
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
-     which is the TensorFlow Java Archive (JAR).
-  2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc1.zip).
   3. Extract this .zip file.
 
 
@@ -234,6 +208,7 @@ public class HelloTF {
 
       // Execute the "MyConst" operation in a Session.
       try (Session s = new Session(g);
+           // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
            Tensor output = s.runner().fetch("MyConst").run().get(0)) {
         System.out.println(new String(output.bytesValue(), "UTF-8"));
       }
@@ -252,11 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<<<<<<< HEAD
-<pre><b>javac -cp libtensorflow-1.6.0-rc1.jar HelloTF.java</b></pre>
-=======
-<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+<pre><b>javac -cp libtensorflow-1.8.0-rc1.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -270,19 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<<<<<<< HEAD
-<pre><b>java -cp libtensorflow-1.6.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
-
-And the following command line executes the `HelloTF` program on Windows:
-
-<pre><b>java -cp libtensorflow-1.6.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
-=======
-<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+<pre><b>java -cp libtensorflow-1.8.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 681b45423f29f9f43da69e2d4d516a11f66636a3..c66d50c3cb1b98fea67766976f9b16e1ad9f241f 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -1,326 +1,273 @@
 # Installing TensorFlow on Ubuntu
 
-This guide explains how to install TensorFlow on Ubuntu. Although these
-instructions might also work on other Linux variants, we have only
-tested (and we only support) these instructions on machines meeting the
-following requirements:
+This guide explains how to install TensorFlow on Ubuntu Linux. While these
+instructions may work on other Linux variants, they are tested and supported with
+the following system requirements:
 
-  * 64-bit desktops or laptops
-  * Ubuntu 16.04 or higher
+* 64-bit desktops or laptops
+* Ubuntu 16.04 or higher
 
 
-## Determine which TensorFlow to install
+## Choose which TensorFlow to install
 
-You must choose one of the following types of TensorFlow to install:
+The following TensorFlow variants are available for installation:
 
-  * **TensorFlow with CPU support only**. If your system does not have a
-    NVIDIA® GPU, you must install this version. Note that this version of
-    TensorFlow is typically much easier to install (typically,
-    in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend
-    installing this version first.
-  * **TensorFlow with GPU support**. TensorFlow programs typically run
-    significantly faster on a GPU than on a CPU. Therefore, if your
-    system has a NVIDIA® GPU meeting the prerequisites shown below and you
-    need to run performance-critical applications, you should ultimately
-    install this version.
+* __TensorFlow with CPU support only__. If your system does not have a
+  NVIDIA®&nbsp;GPU, you must install this version. This version of TensorFlow is
+  usually easier to install, so even if you have an NVIDIA GPU, we recommend
+  installing this version first.
+* __TensorFlow with GPU support__. TensorFlow programs usually run much faster on
+  a GPU instead of a CPU. If you run performance-critical applications and your
+  system has an NVIDIA®&nbsp;GPU that meets the prerequisites, you should install
+  this version. See [TensorFlow GPU support](#NVIDIARequirements) for details.
 
-<a name="NVIDIARequirements"></a>
-### NVIDIA requirements to run TensorFlow with GPU support
-
-If you are installing TensorFlow with GPU support using one of the
-mechanisms described in this guide, then the following NVIDIA software
-must be installed on your system:
-
-  * CUDA® Toolkit 9.0. For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
-    Ensure that you append the relevant Cuda pathnames to the
-    `LD_LIBRARY_PATH` environment variable as described in the
-    NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
-  * cuDNN v7.0. For details, see
-    [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
-    Ensure that you create the `CUDA_HOME` environment variable as
-    described in the NVIDIA documentation.
-  * GPU card with CUDA Compute Capability 3.0 or higher.  See
-    [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for
-    a list of supported GPU cards.
-  * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
-    This library provides advanced profiling support. To install this library,
-    issue the following command for CUDA Toolkit >= 8.0:
-
-    <pre>
-    $ <b>sudo apt-get install cuda-command-line-tools</b>
-    </pre>
-
-    and add its path to your `LD_LIBRARY_PATH` environment variable:
-
-    <pre>
-    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b>
-    </pre>
-
-    For CUDA Toolkit <= 7.5 do:
-
-    <pre>
-    $ <b>sudo apt-get install libcupti-dev</b>
-    </pre>
-
-If you have an earlier version of the preceding packages, please upgrade to
-the specified versions. If upgrading is not possible, then you may still run
-TensorFlow with GPU support, but only if you do the following:
-
-  * Install TensorFlow from sources as documented in
-    @{$install_sources$Installing TensorFlow from Sources}.
-  * Install or upgrade to at least the following NVIDIA versions:
-    * CUDA toolkit 7.0 or greater
-    * cuDNN v3 or greater
-    * GPU card with CUDA Compute Capability 3.0 or higher.
-
-
-## Determine how to install TensorFlow
-
-You must pick the mechanism by which you install TensorFlow. The
-supported choices are as follows:
-
-  * [Virtualenv](#InstallingVirtualenv)
-  * ["native" pip](#InstallingNativePip)
-  * [Docker](#InstallingDocker)
-  * [Anaconda](#InstallingAnaconda)
-  * installing from sources, which is documented in
-    [a separate guide](https://www.tensorflow.org/install/install_sources).
-
-**We recommend the Virtualenv installation.**
-[Virtualenv](https://virtualenv.pypa.io/en/stable/)
-is a virtual Python environment isolated from other Python development,
-incapable of interfering with or being affected by other Python programs
-on the same machine.  During the Virtualenv installation process,
-you will install not only TensorFlow but also all the packages that
-TensorFlow requires.  (This is actually pretty easy.)
-To start working with TensorFlow, you simply need to "activate" the
-virtual environment.  All in all, Virtualenv provides a safe and
-reliable mechanism for installing and running TensorFlow.
-
-Native pip installs TensorFlow directly on your system without going
-through any container system. **We recommend the native pip install for
-system administrators aiming to make TensorFlow available to everyone on a
-multi-user system.** Since a native pip installation is not walled-off in
-a separate container, the pip installation might interfere with other
-Python-based installations on your system. However, if you understand pip
-and your Python environment, a native pip installation often entails only
-a single command.
 
-Docker completely isolates the TensorFlow installation
-from pre-existing packages on your machine. The Docker container contains
-TensorFlow and all its dependencies. Note that the Docker image can be quite
-large (hundreds of MBs). You might choose the Docker installation if you are
-incorporating TensorFlow into a larger application architecture that already
-uses Docker.
+## How to install TensorFlow
 
-In Anaconda, you may use conda to create a virtual environment.
-However, within Anaconda, we recommend installing TensorFlow with the
-`pip install` command, not with the `conda install` command.
-
-**NOTE:** The conda package is community supported, not officially supported.
-That is, the TensorFlow team neither tests nor maintains the conda package.
-Use that package at your own risk.
+There are a few options to install TensorFlow on your machine:
 
+* [Use pip in a virtual environment](#InstallingVirtualenv) *(recommended)*
+* [Use pip in your system environment](#InstallingNativePip)
+* [Configure a Docker container](#InstallingDocker)
+* [Use pip in Anaconda](#InstallingAnaconda)
+* [Install TensorFlow from source](/install/install_sources)
 
 <a name="InstallingVirtualenv"></a>
-## Installing with Virtualenv
+### Use `pip` in a virtual environment
 
-Take the following steps to install TensorFlow with Virtualenv:
+Key Point: Using a virtual environment is the recommended install method.
 
-  1. Install pip and Virtualenv by issuing one of the following commands:
+The [Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual
+Python environments that are isolated from other Python development on the same
+machine. In this scenario, you install TensorFlow and its dependencies within a
+virtual environment that is available when *activated*. Virtualenv provides a
+reliable way to install and run TensorFlow while avoiding conflicts with the rest
+of the system.
 
-     <pre>$ <b>sudo apt-get install python-pip python-dev python-virtualenv</b> # for Python 2.7
-    $ <b>sudo apt-get install python3-pip python3-dev python-virtualenv</b> # for Python 3.n</pre>
+##### 1. Install Python, `pip`, and `virtualenv`.
 
-  2. Create a Virtualenv environment by issuing one of the following commands:
+On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
+Confirm the `python` and `pip` versions:
 
-     <pre>$ <b>virtualenv --system-site-packages</b> <i>targetDirectory</i> # for Python 2.7
-    $ <b>virtualenv --system-site-packages -p python3</b> <i>targetDirectory</i> # for Python 3.n</pre>
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">python -V  # or: python3 -V</code>
+  <code class="devsite-terminal">pip -V     # or: pip3 -V</code>
+</pre>
 
-     where <code><em>targetDirectory</em></code> specifies the top of the
-     Virtualenv tree.  Our instructions assume that
-     <code><em>targetDirectory</em></code> is `~/tensorflow`, but you may
-     choose any directory.
+To install these packages on Ubuntu:
 
-  3. Activate the Virtualenv environment by issuing one of the following
-     commands:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7</code>
+  <code class="devsite-terminal">sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n</code>
+</pre>
 
-     <pre>$ <b>source ~/tensorflow/bin/activate</b> # bash, sh, ksh, or zsh
-    $ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh</pre>
+We *recommend* using `pip` version 8.1 or higher. If using a release before
+version 8.1,  upgrade `pip`:
 
-     The preceding <tt>source</tt> command should change your prompt
-     to the following:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install -U pip</code>
+</pre>
 
-     <pre>(tensorflow)$ </pre>
+If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
+installed, use `easy_install` to install `pip`:
 
-  4. Ensure pip ≥8.1 is installed:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">easy_install -U pip</code>
+</pre>
 
-     <pre>(tensorflow)$ <b>easy_install -U pip</b></pre>
+##### 2. Create a directory for the virtual environment and choose a Python interpreter.
 
-  5. Issue one of the following commands to install TensorFlow in the active
-     Virtualenv environment:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">mkdir ~/tensorflow  # somewhere to work out of</code>
+  <code class="devsite-terminal">cd ~/tensorflow</code>
+  <code># Choose one of the following Python environments for the ./venv directory:</code>
+  <code class="devsite-terminal">virtualenv --system-site-packages <var>venv</var>            # Use python default (Python 2.7)</code>
+  <code class="devsite-terminal">virtualenv --system-site-packages -p python3 <var>venv</var> # Use Python 3.n</code>
+</pre>
 
-     <pre>(tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
-    (tensorflow)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
-    (tensorflow)$ <b>pip install --upgrade tensorflow-gpu</b>  # for Python 2.7 and GPU
-    (tensorflow)$ <b>pip3 install --upgrade tensorflow-gpu</b> # for Python 3.n and GPU</pre>
+##### 3. Activate the Virtualenv environment.
 
-     If the above command succeeds, skip Step 6. If the preceding
-     command fails, perform Step 6.
+Use one of these shell-specific commands to activate the virtual environment:
 
-  6. (Optional) If Step 5 failed (typically because you invoked a pip version
-     lower than 8.1), install TensorFlow in the active Virtualenv environment
-     by issuing a command of the following format:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate      # bash, sh, ksh, or zsh</code>
+  <code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate.csh  # csh or tcsh</code>
+  <code class="devsite-terminal">. ~/tensorflow/<var>venv</var>/bin/activate.fish      # fish</code>
+</pre>
 
-     <pre>(tensorflow)$ <b>pip install --upgrade</b> <i>tfBinaryURL</i>   # Python 2.7
-    (tensorflow)$ <b>pip3 install --upgrade</b> <i>tfBinaryURL</i>  # Python 3.n </pre>
+When the Virtualenv is activated, the shell prompt displays as `(venv) $`.
 
-     where <code><em>tfBinaryURL</em></code> identifies the URL of the
-     TensorFlow Python package. The appropriate value of
-     <code><em>tfBinaryURL</em></code>depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
-     <code><em>tfBinaryURL</em></code> for your system
-     [here](#the_url_of_the_tensorflow_python_package).  For example, if you
-     are installing TensorFlow for Linux, Python 3.4, and CPU-only support,
-     issue the following command to install TensorFlow in the active
-     Virtualenv environment:
+##### 4. Upgrade `pip` in the virtual environment.
 
-     <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+Within the active virtual environment, upgrade `pip`:
 
-If you encounter installation problems, see
-[Common Installation Problems](#common_installation_problems).
+<pre class="prettyprint lang-bsh">
+(venv)$ pip install -U pip
+</pre>
 
+You can install other Python packages within the virtual environment without
+affecting packages outside the `virtualenv`.
 
-### Next Steps
+##### 5. Install TensorFlow in the virtual environment.
 
-After installing TensorFlow,
-[validate the installation](#ValidateYourInstallation).
+Choose one of the available TensorFlow packages for installation:
 
-Note that you must activate the Virtualenv environment each time you
-use TensorFlow. If the Virtualenv environment is not currently active,
-invoke one of the following commands:
+* `tensorflow` —Current release for CPU
+* `tensorflow-gpu` —Current release with GPU support
+* `tf-nightly` —Nightly build for CPU
+* `tf-nightly-gpu` —Nightly build with GPU support
 
-<pre> $ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
-$ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh</pre>
+Within an active Virtualenv environment, use `pip` to install the package:
 
-When the Virtualenv environment is active, you may run
-TensorFlow programs from this shell.  Your prompt will become
-the following to indicate that your tensorflow environment is active:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">pip install -U tensorflow</code>
+</pre>
 
-<pre>(tensorflow)$ </pre>
+Use `pip list` to show the packages installed in the virtual environment.
+[Validate the install](#ValidateYourInstallation) and test the version:
 
-When you are done using TensorFlow, you may deactivate the
-environment by invoking the `deactivate` function as follows:
+<pre class="prettyprint lang-bsh">
+(venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
+</pre>
 
-<pre>(tensorflow)$ <b>deactivate</b> </pre>
+Success: TensorFlow is now installed.
 
-The prompt will revert back to your default prompt (as defined by the
-`PS1` environment variable).
+Use the `deactivate` command to stop the Python virtual environment.
 
+#### Problems
 
-### Uninstalling TensorFlow
+If the above steps failed, try installing the TensorFlow binary using the remote
+URL of the `pip` package:
 
-To uninstall TensorFlow, simply remove the tree you created.
-For example:
+<pre class="prettyprint lang-bsh">
+(venv)$ pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7
+(venv)$ pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n
+</pre>
 
-<pre>$ <b>rm -r</b> <i>targetDirectory</i> </pre>
+The <var>remote-pkg-URL</var> depends on the operating system, Python version,
+and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
+URL naming scheme and location.
+
+See [Common Installation Problems](#common_installation_problems) if you
+encounter problems.
+
+#### Uninstall TensorFlow
+
+To uninstall TensorFlow, remove the Virtualenv directory you created in step 2:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">deactivate  # stop the virtualenv</code>
+  <code class="devsite-terminal">rm -r ~/tensorflow/<var>venv</var></code>
+</pre>
 
 
 <a name="InstallingNativePip"></a>
-## Installing with native pip
+### Use `pip` in your system environment
 
-You may install TensorFlow through pip, choosing between a simple
-installation procedure or a more complex one.
+Use `pip` to install the TensorFlow package directly on your system without
+using a container or virtual environment for isolation. This method is
+recommended for system administrators that want a TensorFlow installation that is
+available to everyone on a multi-user system.
 
-**Note:** The
+Since a system install is not isolated, it could interfere with other
+Python-based installations. But if you understand `pip` and your Python
+environment, a system `pip` install is straightforward.
+
+See the
 [REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
-lists the TensorFlow packages that pip will install or upgrade.
+for a list of packages that TensorFlow installs.
+
+##### 1. Install Python, `pip`, and `virtualenv`.
 
+On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
+Confirm the `python` and `pip` versions:
 
-### Prerequisite: Python and Pip
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">python -V  # or: python3 -V</code>
+  <code class="devsite-terminal">pip -V     # or: pip3 -V</code>
+</pre>
 
-Python is automatically installed on Ubuntu.  Take a moment to confirm
-(by issuing a `python -V` command) that one of the following Python
-versions is already installed on your system:
+To install these packages on Ubuntu:
 
-  * Python 2.7
-  * Python 3.4+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install python-pip python-dev   # for Python 2.7</code>
+  <code class="devsite-terminal">sudo apt-get install python3-pip python3-dev # for Python 3.n</code>
+</pre>
 
-The pip or pip3 package manager is *usually* installed on Ubuntu.  Take a
-moment to confirm (by issuing a `pip -V` or `pip3 -V` command)
-that pip or pip3 is installed.  We strongly recommend version 8.1 or higher
-of pip or pip3.  If Version 8.1 or later is not installed, issue the
-following command, which will either install or upgrade to the latest
-pip version:
+We *recommend* using `pip` version 8.1 or higher. If using a release before
+version 8.1,  upgrade `pip`:
 
-<pre>$ <b>sudo apt-get install python-pip python-dev</b>   # for Python 2.7
-$ <b>sudo apt-get install python3-pip python3-dev</b> # for Python 3.n
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install -U pip</code>
 </pre>
 
+If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
+installed, use `easy_install` to install `pip`:
 
-### Install TensorFlow
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">easy_install -U pip</code>
+</pre>
 
-Assuming the prerequisite software is installed on your Linux host,
-take the following steps:
+##### 2. Install TensorFlow on system.
 
-  1. Install TensorFlow by invoking **one** of the following commands:
+Choose one of the available TensorFlow packages for installation:
 
-     <pre>$ <b>pip install tensorflow</b>      # Python 2.7; CPU support (no GPU support)
-    $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support (no GPU support)
-    $ <b>pip install tensorflow-gpu</b>  # Python 2.7;  GPU support
-    $ <b>pip3 install tensorflow-gpu</b> # Python 3.n; GPU support </pre>
+* `tensorflow` —Current release for CPU
+* `tensorflow-gpu` —Current release with GPU support
+* `tf-nightly` —Nightly build for CPU
+* `tf-nightly-gpu` —Nightly build with GPU support
 
-     If the preceding command runs to completion, you should now
-     [validate your installation](#ValidateYourInstallation).
+And use `pip` to install the package for Python 2 or 3:
 
-  2. (Optional.) If Step 1 failed, install the latest version of TensorFlow
-     by issuing a command of the following format:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install -U tensorflow   # Python 2.7</code>
+  <code class="devsite-terminal">sudo pip3 install -U tensorflow  # Python 3.n</code>
+</pre>
 
-     <pre>$ <b>sudo pip  install --upgrade</b> <i>tfBinaryURL</i>   # Python 2.7
-    $ <b>sudo pip3 install --upgrade</b> <i>tfBinaryURL</i>   # Python 3.n </pre>
+Use `pip list` to show the packages installed on the system.
+[Validate the install](#ValidateYourInstallation) and test the version:
 
-     where <code><em>tfBinaryURL</em></code> identifies the URL of the
-     TensorFlow Python package. The appropriate value of
-     <code><em>tfBinaryURL</em></code> depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
-     <code><em>tfBinaryURL</em></code>
-     [here](#the_url_of_the_tensorflow_python_package).  For example, to
-     install TensorFlow for Linux, Python 3.4, and CPU-only support, issue
-     the following command:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">python -c "import tensorflow as tf; print(tf.__version__)"</code>
+</pre>
 
-     <pre>
-     $ <b>sudo pip3 install --upgrade \
-<<<<<<< HEAD
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b>
-=======
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
-     </pre>
+Success: TensorFlow is now installed.
 
-     If this step fails, see
-     [Common Installation Problems](#common_installation_problems).
+#### Problems
 
+If the above steps failed, try installing the TensorFlow binary using the remote
+URL of the `pip` package:
 
-### Next Steps
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7</code>
+  <code class="devsite-terminal">sudo pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n</code>
+</pre>
 
-After installing TensorFlow, [validate your installation](#ValidateYourInstallation).
+The <var>remote-pkg-URL</var> depends on the operating system, Python version,
+and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
+URL naming scheme and location.
 
+See [Common Installation Problems](#common_installation_problems) if you
+encounter problems.
 
-### Uninstalling TensorFlow
+#### Uninstall TensorFlow
 
-To uninstall TensorFlow, issue one of following commands:
+To uninstall TensorFlow on your system, use one of following commands:
 
-<pre>
-$ <b>sudo pip uninstall tensorflow</b>  # for Python 2.7
-$ <b>sudo pip3 uninstall tensorflow</b> # for Python 3.n
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip uninstall tensorflow   # for Python 2.7</code>
+  <code class="devsite-terminal">sudo pip3 uninstall tensorflow  # for Python 3.n</code>
 </pre>
 
-
 <a name="InstallingDocker"></a>
-## Installing with Docker
+### Configure a Docker container
+
+Docker completely isolates the TensorFlow installation
+from pre-existing packages on your machine. The Docker container contains
+TensorFlow and all its dependencies. Note that the Docker image can be quite
+large (hundreds of MBs). You might choose the Docker installation if you are
+incorporating TensorFlow into a larger application architecture that already
+uses Docker.
 
 Take the following steps to install TensorFlow through Docker:
 
@@ -340,7 +287,7 @@ Take the following steps to install TensorFlow through Docker:
 The remainder of this section explains how to launch a Docker container.
 
 
-### CPU-only
+#### CPU-only
 
 To launch a Docker container with CPU-only support (that is, without
 GPU support), enter a command of the following format:
@@ -360,24 +307,23 @@ where:
     to 6006.
   * <tt><i>TensorFlowCPUImage</i></tt> is required. It identifies the Docker
     container. Specify one of the following values:
-    * <tt>gcr.io/tensorflow/tensorflow</tt>, which is the TensorFlow CPU binary image.
-    * <tt>gcr.io/tensorflow/tensorflow:latest-devel</tt>, which is the latest
+    * <tt>tensorflow/tensorflow</tt>, which is the TensorFlow CPU binary image.
+    * <tt>tensorflow/tensorflow:latest-devel</tt>, which is the latest
       TensorFlow CPU Binary image plus source code.
-    * <tt>gcr.io/tensorflow/tensorflow:<i>version</i></tt>, which is the
+    * <tt>tensorflow/tensorflow:<i>version</i></tt>, which is the
       specified version (for example, 1.1.0rc1) of TensorFlow CPU binary image.
-    * <tt>gcr.io/tensorflow/tensorflow:<i>version</i>-devel</tt>, which is
+    * <tt>tensorflow/tensorflow:<i>version</i>-devel</tt>, which is
       the specified version (for example, 1.1.0rc1) of the TensorFlow GPU
       binary image plus source code.
 
-    <tt>gcr.io</tt> is the Google Container Registry. Note that some
-    TensorFlow images are also available at
+    TensorFlow images are available at
     [dockerhub](https://hub.docker.com/r/tensorflow/tensorflow/).
 
 For example, the following command launches the latest TensorFlow CPU binary image
 in a Docker container from which you can run TensorFlow programs in a shell:
 
 <pre>
-$ <b>docker run -it gcr.io/tensorflow/tensorflow bash</b>
+$ <b>docker run -it tensorflow/tensorflow bash</b>
 </pre>
 
 The following command also launches the latest TensorFlow CPU binary image in a
@@ -385,13 +331,13 @@ Docker container. However, in this Docker container, you can run TensorFlow
 programs in a Jupyter notebook:
 
 <pre>
-$ <b>docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow</b>
+$ <b>docker run -it -p 8888:8888 tensorflow/tensorflow</b>
 </pre>
 
 Docker will download the TensorFlow binary image the first time you launch it.
 
 
-### GPU support
+#### GPU support
 
 Prior to installing TensorFlow with GPU support, ensure that your system meets all
 [NVIDIA software requirements](#NVIDIARequirements).  To launch a Docker container
@@ -409,14 +355,14 @@ where:
     <tt><i>hostPort</i></tt> and <code><em>containerPort</em></code> to `8888`.
   * <i>TensorFlowGPUImage</i> specifies the Docker container. You must
     specify one of the following values:
-    * <tt>gcr.io/tensorflow/tensorflow:latest-gpu</tt>, which is the latest
+    * <tt>tensorflow/tensorflow:latest-gpu</tt>, which is the latest
       TensorFlow GPU binary image.
-    * <tt>gcr.io/tensorflow/tensorflow:latest-devel-gpu</tt>, which is
+    * <tt>tensorflow/tensorflow:latest-devel-gpu</tt>, which is
       the latest TensorFlow GPU Binary image plus source code.
-    * <tt>gcr.io/tensorflow/tensorflow:<i>version</i>-gpu</tt>, which is the
+    * <tt>tensorflow/tensorflow:<i>version</i>-gpu</tt>, which is the
       specified version (for example, 0.12.1) of the TensorFlow GPU
       binary image.
-    * <tt>gcr.io/tensorflow/tensorflow:<i>version</i>-devel-gpu</tt>, which is
+    * <tt>tensorflow/tensorflow:<i>version</i>-devel-gpu</tt>, which is
       the specified version (for example, 0.12.1) of the TensorFlow GPU
       binary image plus source code.
 
@@ -425,7 +371,7 @@ following command launches the latest TensorFlow GPU binary image in a
 Docker container from which you can run TensorFlow programs in a shell:
 
 <pre>
-$ <b>nvidia-docker run -it gcr.io/tensorflow/tensorflow:latest-gpu bash</b>
+$ <b>nvidia-docker run -it tensorflow/tensorflow:latest-gpu bash</b>
 </pre>
 
 The following command also launches the latest TensorFlow GPU binary image
@@ -433,13 +379,13 @@ in a Docker container. In this Docker container, you can run TensorFlow
 programs in a Jupyter notebook:
 
 <pre>
-$ <b>nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu</b>
+$ <b>nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu</b>
 </pre>
 
 The following command installs an older TensorFlow version (0.12.1):
 
 <pre>
-$ <b>nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:0.12.1-gpu</b>
+$ <b>nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:0.12.1-gpu</b>
 </pre>
 
 Docker will download the TensorFlow binary image the first time you launch it.
@@ -447,14 +393,22 @@ For more details see the
 [TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker).
 
 
-### Next Steps
+#### Next Steps
 
 You should now
 [validate your installation](#ValidateYourInstallation).
 
 
 <a name="InstallingAnaconda"></a>
-## Installing with Anaconda
+### Use `pip` in Anaconda
+
+Anaconda provides the `conda` utility to create a virtual environment. However,
+within Anaconda, we recommend installing TensorFlow using the `pip install`
+command and *not* with the `conda install` command.
+
+Caution: `conda` is a community supported package this is not officially
+maintained by the TensorFlow team. Use this package at your own risk since it is
+not tested on new TensorFlow releases.
 
 Take the following steps to install TensorFlow in an Anaconda environment:
 
@@ -484,12 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-<<<<<<< HEAD
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
-=======
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
-
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -514,7 +463,7 @@ If you installed through Docker, start a Docker container
 from which you can run bash. For example:
 
 <pre>
-$ <b>docker run -it gcr.io/tensorflow/tensorflow bash</b>
+$ <b>docker run -it tensorflow/tensorflow bash</b>
 </pre>
 
 
@@ -539,11 +488,94 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with TensorFlow}.
-
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
 
+If you are new to machine learning, we recommend the following:
+
+*  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
+*  @{$get_started/eager}
+
+
+<a name="NVIDIARequirements"></a>
+## TensorFlow GPU support
+
+To install TensorFlow with GPU support, configure the following NVIDIA® software
+on your system:
+
+* [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see
+  [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+  Append the relevant CUDA pathnames to the `LD_LIBRARY_PATH` environmental
+  variable as described in the NVIDIA documentation.
+* [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see
+  [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
+  Create the `CUDA_HOME` environment variable as described in the NVIDIA
+  documentation.
+* A GPU card with CUDA Compute Capability 3.0 or higher for building TensorFlow
+  from source. To use the TensorFlow binaries, version 3.5 or higher is required.
+  See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
+  list of supported GPU cards.
+* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA
+  Toolkit.
+* The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This
+  library provides advanced profiling support. To install this library,
+  use the following command for CUDA Toolkit >= 8.0:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install cuda-command-line-tools</code>
+</pre>
+
+Add this path to the `LD_LIBRARY_PATH` environmental variable:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</code>
+</pre>
+
+For CUDA Toolkit <= 7.5 use:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install libcupti-dev</code>
+</pre>
+
+* *OPTIONAL*:  For optimized performance during inference, install
+  *NVIDIA&nbsp;TensorRT&nbsp;3.0*. To install the minimal amount of TensorRT
+  runtime components required to use with the pre-built `tensorflow-gpu` package:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</code>
+  <code class="devsite-terminal">sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</code>
+  <code class="devsite-terminal">sudo apt-get update</code>
+  <code class="devsite-terminal">sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</code>
+</pre>
+
+Note: For compatibility with the pre-built `tensorflow-gpu` package, use the
+Ubuntu *14.04* package of TensorRT (shown above). Use this even when installing
+on an Ubuntu 16.04 system.
+
+To build the TensorFlow-TensorRT integration module from source instead of using
+the pre-built binaries, see the
+[module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
+For detailed TensorRT installation instructions, see
+[NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).
+
+To avoid cuDNN version conflicts during later system upgrades, hold the cuDNN
+version at 7.0.5:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-mark hold libcudnn7 libcudnn7-dev</code>
+</pre>
+
+To allow upgrades, remove the this hold:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-mark unhold libcudnn7 libcudnn7-dev</code>
+</pre>
+
+If you have an earlier version of the preceding packages, upgrade to the
+specified versions. If upgrading is not possible, you can still run TensorFlow
+with GPU support by @{$install_sources}.
+
+
 ## Common installation problems
 
 We are relying on Stack Overflow to document TensorFlow installation problems
@@ -556,7 +588,7 @@ ask a new question about it on Stack Overflow and specify
 the `tensorflow` tag.
 
 <table>
-<tr> <th>Stack Overflow Link</th> <th>Error Message</th> </tr>
+<tr> <th>Link to GitHub or Stack&nbsp;Overflow</th> <th>Error Message</th> </tr>
 
 <tr>
   <td><a href="https://stackoverflow.com/q/36159194">36159194</a></td>
@@ -656,22 +688,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp27-none-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp27-none-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -683,22 +707,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp34-cp34m-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -710,22 +726,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp35-cp35m-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp35-cp35m-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -737,22 +745,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp36-cp36m-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp36-cp36m-linux_x86_64.whl
-=======
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index a2f484ebf8867620630fa815fee55c06585f4fa1..ff6c2f5e447873f479570429d1f19b859ab6c56d 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -5,7 +5,11 @@ instructions might also work on other macOS variants, we have only
 tested (and we only support) these instructions on machines meeting the
 following requirements:
 
-  * macOS X 10.11 (El Capitan) or higher
+  * macOS 10.12.6 (Sierra) or higher
+
+Note: There are known, accuracy-affecting numerical issues before macOS 10.12.6
+(Sierra) that are described in
+[GitHub#15933](https://github.com/tensorflow/tensorflow/issues/15933#issuecomment-366331383).
 
 Note: As of version 1.2, TensorFlow no longer provides GPU support on macOS.
 
@@ -115,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -234,15 +238,11 @@ take the following steps:
      operating system and Python version. Find the appropriate
      value for <i>tfBinaryURL</i>
      [here](#the_url_of_the_tensorflow_python_package).  For example, if
-     you are installing TensorFlow for Mac OS and Python 2.7
+     you are installing TensorFlow for macOS and Python 2.7
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-<<<<<<< HEAD
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl</b> </pre>
-=======
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -292,24 +292,23 @@ where:
     to 6006.
   * <i>TensorFlowImage</i> is required. It identifies the Docker container.
     You must specify one of the following values:
-    * <code>gcr.io/tensorflow/tensorflow</code>: TensorFlow binary image.
-    * <code>gcr.io/tensorflow/tensorflow:latest-devel</code>: TensorFlow
+    * <code>tensorflow/tensorflow</code>: TensorFlow binary image.
+    * <code>tensorflow/tensorflow:latest-devel</code>: TensorFlow
       Binary image plus source code.
 
-<code>gcr.io</code> is the Google Container Registry. Note that some
-TensorFlow images are also available at
+The TensorFlow images are available at
 [dockerhub](https://hub.docker.com/r/tensorflow/tensorflow/).
 
 For example, the following command launches a TensorFlow CPU binary image
 in a Docker container from which you can run TensorFlow programs in a shell:
 
-<pre>$ <b>docker run -it gcr.io/tensorflow/tensorflow bash</b></pre>
+<pre>$ <b>docker run -it tensorflow/tensorflow bash</b></pre>
 
 The following command also launches a TensorFlow CPU binary image in a
 Docker container. However, in this Docker container, you can run
 TensorFlow programs in a Jupyter notebook:
 
-<pre>$ <b>docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow</b></pre>
+<pre>$ <b>docker run -it -p 8888:8888 tensorflow/tensorflow</b></pre>
 
 Docker will download the TensorFlow binary image the first time you launch it.
 
@@ -351,11 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-<<<<<<< HEAD
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py2-none-any.whl</b></pre>
-=======
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -380,7 +375,7 @@ do the following:
 If you installed through Docker, start a Docker container that runs bash.
 For example:
 
-<pre>$ <b>docker run -it gcr.io/tensorflow/tensorflow bash</b></pre>
+<pre>$ <b>docker run -it tensorflow/tensorflow bash</b></pre>
 
 
 
@@ -405,12 +400,18 @@ writing TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see
-@{$get_started/premade_estimators$Getting Started with TensorFlow}.
-
 If the system outputs an error message instead of a greeting, see
 [Common installation problems](#common_installation_problems).
 
+If you are new to machine learning, we recommend the following:
+
+*  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
+*  @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
+
+If you are experienced with machine learning but new to TensorFlow, see
+@{$get_started/premade_estimators$Getting Started with TensorFlow}.
+
+
 ## Common installation problems
 
 We are relying on Stack Overflow to document TensorFlow installation problems
@@ -517,22 +518,13 @@ RuntimeError: Broken toolchain: cannot link a simple C program</pre>
 ## The URL of the TensorFlow Python package
 
 A few installation mechanisms require the URL of the TensorFlow Python package.
-The value you specify depends on three factors:
-
-  * operating system
-  * Python version
-
-This section documents the relevant values for Mac OS installations.
+The value you specify depends on your Python version.
 
 ### Python 2.7
 
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py2-none-any.whl
-=======
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl
 </pre>
 
 
@@ -540,9 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-a
 
 
 <pre>
-<<<<<<< HEAD
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl
-=======
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index d01be212601d5bd2941fe4852498a4fb4d4403d6..5c5c9e057b227380d8a0c98ccbb5d0600776ad03 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -133,30 +133,21 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0.
+  * [CUDA Toolkit](http://nvidia.com/cuda) (>= 7.0). We recommend version 9.0.
     For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
-    Ensure that you append the relevant Cuda pathnames to the
+    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+    Ensure that you append the relevant CUDA pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with NVIDIA's Cuda Toolkit.
-  * cuDNN (>= v3). We recommend version 6.0. For details, see
-    [NVIDIA's documentation](https://developer.nvidia.com/cudnn),
-    particularly the description of appending the appropriate pathname
-    to your `LD_LIBRARY_PATH` environment variable.
-
-Finally, you must also install `libcupti` which for Cuda Toolkit >= 8.0 you do via
-
-<pre> $ <b>sudo apt-get install cuda-command-line-tools</b> </pre>
-
-and add its path to your `LD_LIBRARY_PATH` environment variable:
-
-<pre> $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b> </pre>
-
-For Cuda Toolkit <= 7.5, you install `libcupti-dev` by invoking the following command:
-
-<pre> $ <b>sudo apt-get install libcupti-dev</b> </pre>
+  * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
+    Toolkit.
+  * [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= v3). We recommend version 7.0. For details, see
+    [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
+  * [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but
+    you also need to append its path to the `LD_LIBRARY_PATH` environment
+    variable:
 
+    <pre> $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b> </pre>
 
 ### Next
 
@@ -240,8 +231,8 @@ such as compiler flags. You must run this script *prior* to
 creating the pip package and installing TensorFlow.
 
 If you wish to build TensorFlow with GPU, `configure` will ask
-you to specify the version numbers of Cuda and cuDNN. If several
-versions of Cuda or cuDNN are installed on your system, explicitly select
+you to specify the version numbers of CUDA and cuDNN. If several
+versions of CUDA or cuDNN are installed on your system, explicitly select
 the desired version instead of relying on the default.
 
 One of the questions that `configure` will ask is as follows:
@@ -250,12 +241,12 @@ One of the questions that `configure` will ask is as follows:
 Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]
 </pre>
 
-This question refers to a later phase in which you'll use bazel to
-[build the pip package](#build-the-pip-package).  We recommend
-accepting the default (`-march=native`), which will
-optimize the generated code for your local machine's CPU type.  However,
-if you are building TensorFlow on one CPU type but will run TensorFlow on
-a different CPU type, then consider specifying a more specific optimization
+This question refers to a later phase in which you'll use bazel to [build the
+pip package](#build-the-pip-package) or the [C/Java libraries](#BuildCorJava).
+We recommend accepting the default (`-march=native`), which will optimize the
+generated code for your local machine's CPU type.  However, if you are building
+TensorFlow on one CPU type but will run TensorFlow on a different CPU type, then
+consider specifying a more specific optimization
 flag as described in [the gcc
 documentation](https://gcc.gnu.org/onlinedocs/gcc-4.5.3/gcc/i386-and-x86_002d64-Options.html).
 
@@ -289,12 +280,12 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
 Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
 Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
 Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-Please specify a list of comma-separated Cuda compute capabilities you want to build with.
+Please specify a list of comma-separated CUDA compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
 [Default is: "3.5,5.2"]: <b>3.0</b>
@@ -304,14 +295,14 @@ Configuration finished
 </pre>
 
 If you told `configure` to build for GPU support, then `configure`
-will create a canonical set of symbolic links to the Cuda libraries
-on your system.  Therefore, every time you change the Cuda library paths,
+will create a canonical set of symbolic links to the CUDA libraries
+on your system.  Therefore, every time you change the CUDA library paths,
 you must rerun the `configure` script before re-invoking
 the <code>bazel build</code> command.
 
 Note the following:
 
-  * Although it is possible to build both Cuda and non-Cuda configs
+  * Although it is possible to build both CUDA and non-CUDA configs
     under the same source tree, we recommend running `bazel clean` when
     switching between these two configurations in the same source tree.
   * If you don't run the `configure` script *before* running the
@@ -320,6 +311,10 @@ Note the following:
 
 ## Build the pip package
 
+Note: If you're only interested in building the libraries for the TensorFlow C
+or Java APIs, see [Build the C or Java libraries](#BuildCorJava), you do not
+need to build the pip package in that case.
+
 To build a pip package for TensorFlow with CPU-only support,
 you would typically invoke the following command:
 
@@ -359,10 +354,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.6.0rc1 on Linux:
+for TensorFlow 1.8.0rc1 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -393,15 +388,14 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
-TensorFlow}.
+If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
 
-## Common installation problems
+## Common build and installation problems
 
-The installation problems you encounter typically depend on the
+The build and installation problems you encounter typically depend on the
 operating system.  See the "Common installation problems" section
 of one of the following guides:
 
@@ -454,19 +448,23 @@ Stack Overflow and specify the `tensorflow` tag.
   </td>
 </tr>
 
+<tr>
+  <td><a href="https://stackoverflow.com/q/47080760">47080760</a></td>
+  <td><pre>undefined reference to `cublasGemmEx@libcublas.so.9.0'</pre></td>
+</tr>
+
 </table>
 
 ## Tested source configurations
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<<<<<<< HEAD
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
-=======
-<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -484,11 +482,9 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<<<<<<< HEAD
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
-=======
-<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -502,13 +498,12 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<<<<<<< HEAD
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
-=======
-<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
->>>>>>> 943a21fcdc1c48c8e95d872911ad52b13f0c037d
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
@@ -522,3 +517,20 @@ Stack Overflow and specify the `tensorflow` tag.
 <tr><td>tensorflow-1.0.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.0.0</td><td>GPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr>
 </table>
+
+<a name="BuildCorJava"></a>
+## Build the C or Java libraries
+
+The instructions above are tailored to building the TensorFlow Python packages.
+
+If you're interested in building the libraries for the TensorFlow C API, do the
+following:
+
+1.  Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2.  Build the C libraries following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
+
+If you're interested inv building the libraries for the TensorFlow Java API,
+do the following:
+
+1.  Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2.  Build the Java library following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index dedf485f93d6fd6a8ce7b4465548cc998d307daa..86add74da15005a56bf0fd88c775139cd030c243 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -17,7 +17,7 @@ You must choose one of the following types of TensorFlow to install:
     NVIDIA® GPU, you must install this version. Note that this version of
     TensorFlow is typically much easier to install (typically,
     in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend
-    installing this version first.
+    installing this version first. Prebuilt binaries will use AVX instructions.
   * **TensorFlow with GPU support**. TensorFlow programs typically run
     significantly faster on a GPU than on a CPU. Therefore, if your
     system has a NVIDIA® GPU meeting the prerequisites shown below
@@ -41,7 +41,8 @@ installed on your system:
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
     the cuDNN DLL to your `%PATH%` environment variable.
-  * GPU card with CUDA Compute Capability 3.0 or higher.  See
+  * GPU card with CUDA Compute Capability 3.0 or higher for building
+    from source and 3.5 or higher for our binaries. See
     [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
     list of supported GPU cards.
 
@@ -153,14 +154,17 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
-TensorFlow}.
-
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
 
-There is also a helpful [script](https://gist.github.com/mrry/ee5dbcfdd045fa48a27d56664411d41c)
-for Windows TensorFlow installation issues.
+If you are new to machine learning, we recommend the following:
+
+*  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
+*  @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
+
+If you are experienced with machine learning but new to TensorFlow, see
+@{$get_started/premade_estimators$Getting Started with TensorFlow}.
+
 
 ## Common installation problems
 
diff --git a/tensorflow/docs_src/javascript/index.md b/tensorflow/docs_src/javascript/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad63eeb255d870064567a0de8a28815ce2ae0172
--- /dev/null
+++ b/tensorflow/docs_src/javascript/index.md
@@ -0,0 +1,5 @@
+# JavaScript 
+
+You may develop TensorFlow programs in JavaScript, training and deploying
+models right in your browser.  For details, see
+[js.tensorflow.org](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/javascript/leftnav_files b/tensorflow/docs_src/javascript/leftnav_files
new file mode 100644
index 0000000000000000000000000000000000000000..fc0ab8a5435943f6442969ec5787305b98c7908b
--- /dev/null
+++ b/tensorflow/docs_src/javascript/leftnav_files
@@ -0,0 +1 @@
+index.md
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index b5a1d5d7d1bf3b456ab24165e273969bdbd7bfca..c35530061dcaf2a4a894dcdf54fd794907d98162 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,7 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
 
-4. Click the Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
+    top menu. You may need to rebuild the project using *Build > Rebuild Project*.
 
     If it asks you to use Instant Run, click **Proceed Without Instant Run**.
 
@@ -90,8 +91,8 @@ using [ADB](https://developer.android.com/studio/command-line/adb.html). This
 requires some knowledge of build systems and Android developer tools, but we'll
 guide you through the basics here.
 
-- First, follow our instructions for @{$install/install_sources$installing from
-  sources}. This will also guide you through installing Bazel and cloning the
+- First, follow our instructions for @{$install/install_sources$installing from sources}.
+  This will also guide you through installing Bazel and cloning the
   TensorFlow code.
 
 - Download the Android [SDK](https://developer.android.com/studio/index.html)
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
index ac50f528ba468d8a830c059539d3399f413f39c8..585470d5f0847716863ba6129bf75c26631fecbd 100644
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ b/tensorflow/docs_src/mobile/leftnav_files
@@ -1,7 +1,9 @@
 index.md
 ### TensorFlow Lite
 tflite/index.md
+tflite/devguide.md
 tflite/demo_android.md
+tflite/demo_ios.md
 >>>
 ### TensorFlow Mobile
 mobile_intro.md
diff --git a/tensorflow/docs_src/mobile/optimizing.md b/tensorflow/docs_src/mobile/optimizing.md
index 44cacff5dbbcb0685044c342184464b47a8ed090..778e4d3a6233c3bec70b830bc998013745a1f0ba 100644
--- a/tensorflow/docs_src/mobile/optimizing.md
+++ b/tensorflow/docs_src/mobile/optimizing.md
@@ -233,6 +233,8 @@ order by how long they took. From left to right, the columns are:
 - The cumulative total time of this and the previous ops in the table. This is
   handy for understanding what the distribution of work is across the layers, to
   see if just a few of the nodes are taking up most of the time.
+  
+- The amount of memory consumed by outputs of this type of op.
 
 - Name of the node.
 
@@ -290,8 +292,8 @@ run it on a 64-bit ARM device:
 
 You can interpret the results in exactly the same way as the desktop version
 above. If you have any trouble figuring out what the right input and output
-names and types are, take a look at the @{$mobile/prepare_models$Preparing
-models} page for details about detecting these for your model, and look at the
+names and types are, take a look at the @{$mobile/prepare_models$Preparing models}
+page for details about detecting these for your model, and look at the
 `summarize_graph` tool which may give you
 helpful information.
 
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index 360ee302aa96bc3a0b65eab7b39c3dacf56b42c0..8b22c04d872f18607c485775cb8f096f0a361995 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -60,7 +60,7 @@ and serialized as protocol buffers:
   the `NodeDef`, so if all the `Variable` weights are converted to `Const` nodes,
   then we only need a single `GraphDef` file to hold the model architecture and
   the weights. Freezing the graph handles the process of loading the
-  checkpoints, and then converts all Consts to Variables. You can then load the
+  checkpoints, and then converts all Variables to Consts. You can then load the
   resulting file in a single call, without having to restore variable values
   from checkpoints. One thing to watch out for with `GraphDef` files is that
   sometimes they’re stored in text format for easy inspection. These versions
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/docs_src/mobile/tflite/demo_android.md
index 79b567897cb8a38ed2e27e73aa7e8fee95f718b8..7f2f8882a24702d167599452e66afbe720026808 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_android.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_android.md
@@ -1,39 +1,144 @@
-# TensorFlow Lite Demo for Android
+# Android Demo App
 
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model.
+An example Android application using TensorFLow Lite is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+The demo is a sample camera app that classifies images continuously
+using either a quantized Mobilenet model or a floating point Inception-v3 model.
+To run the demo, a device running Android 5.0 ( API 21) or higher is required.
 
-You'll need an Android device running Android 5.0 or higher to run the demo.
+In the demo app, inference is done using the TensorFlow Lite Java API. The demo
+app classifies frames in real-time, displaying the top most probable
+classifications. It also displays the time taken to detect the object.
 
-To get you started working with TensorFlow Lite on Android, we'll walk you
-through building and deploying our TensorFlow demo app in Android Studio.
+There are three ways to get the demo app to your device:
 
-It's also possible to build the demo app with Bazel, but we only recommend
-this for advanced users who are very familiar with the Bazel build
-environment. For more information on that, see our page [on Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite#building-tensorflow-lite-and-the-demo-app-from-source).
+* Download the [prebuilt binary APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+* Use Android Studio to build the application.
+* Download the source code for TensorFlow Lite and the demo and build it using
+  bazel.
 
-## Build and deploy with Android Studio
 
-1. Clone the TensorFlow repository from GitHub if you haven't already:
+## Download the pre-built binary
 
-        git clone https://github.com/tensorflow/tensorflow
+The easiest way to try the demo is to download the
+[pre-built binary APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
 
-2. Install the latest version of Android Studio from [here](https://developer.android.com/studio/index.html).
+Once the APK is installed, click the app icon to start the program. The first
+time the app is opened, it asks for runtime permissions to access the device
+camera. The demo app opens the back-camera of the device and recognizes objects
+in the camera's field of view. At the bottom of the image (or at the left
+of the image if the device is in landscape mode), it displays top three objects
+classified and the classification latency.
 
-3. From the **Welcome to Android Studio** screen, use the **Import Project
-   (Gradle, Eclipse ADT, etc)** option to import the
-   `tensorflow/contrib/lite/java/demo` directory as an existing Android Studio
-   Project.
 
-    Android Studio may prompt you to install Gradle upgrades and other tool
-    versions; you should accept these upgrades.
+## Build in Android Studio with TensorFlow Lite AAR from JCenter
 
-4. Download the TensorFlow Lite MobileNet model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
+Use Android Studio to try out changes in the project code and compile the demo
+app:
 
-    Unzip this and copy the `mobilenet_quant_v1_224.tflite` file to the assets
-    directory: `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
+* Install the latest version of
+  [Android Studio](https://developer.android.com/studio/index.html).
+* Make sure the Android SDK version is greater than 26 and NDK version is greater
+  than 14 (in the Android Studio settings).
+* Import the `tensorflow/contrib/lite/java/demo` directory as a new
+  Android Studio project.
+* Install all the Gradle extensions it requests.
 
-5. Build and run the app in Android Studio.
+To get a model, either:
 
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
+* Download the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
+  and unzip and copy `mobilenet_quant_v1_224.tflite` to the assets directory:
+  `tensorflow/contrib/lite/java/demo/app/src/main/assets/`.
+* Or, download the floating point [Inception-v3 model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+  and unzip and copy `inceptionv3_non_slim_2015.tflite` to the assets
+  directory. Change the chosen classifier in
+  [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
+  from: `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`<br>
+  to: `classifier = new ImageClassifierFloatInception(getActivity());`.
+
+Now you can build and run the demo app.
+
+
+## Build TensorFlow Lite and the demo app from source
+
+### Clone the TensorFlow repo
+
+```sh
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Install Bazel
+
+If `bazel` is not installed on your system, see
+[Installing Bazel](https://bazel.build/versions/master/docs/install.html).
+
+Note: Bazel does not currently support Android builds on Windows. Windows users
+should download the
+[prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+
+### Install Android NDK and SDK
+
+The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The
+current recommended version is *14b* and can be found on the
+[NDK Archives](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads)
+page.
+
+The Android SDK and build tools can be
+[downloaded separately](https://developer.android.com/tools/revisions/build-tools.html)
+or used as part of
+[Android Studio](https://developer.android.com/studio/index.html). To build the
+TensorFlow Lite Android demo, build tools require API >= 23 (but it will run on
+devices with API >= 21).
+
+In the root of the TensorFlow repository, update the `WORKSPACE` file with the
+`api_level` and location of the SDK and NDK. If you installed it with
+Android Studio, the SDK path can be found in the SDK manager. The default NDK
+path is:`{SDK path}/ndk-bundle.` For example:
+
+```
+android_sdk_repository (
+    name = "androidsdk",
+    api_level = 23,
+    build_tools_version = "23.0.2",
+    path = "/home/xxxx/android-sdk-linux/",
+)
+
+android_ndk_repository(
+    name = "androidndk",
+    path = "/home/xxxx/android-ndk-r10e/",
+    api_level = 19,
+)
+```
+
+Some additional details are available on the
+[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+
+### Build the source code
+
+To build the demo app, run `bazel`:
+
+```
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+```
+
+Caution: Because of an bazel bug, we only support building the Android demo app
+within a Python 2 environment.
+
+
+## About the demo
+
+The demo app is resizing each camera image frame (224 width * 224 height) to
+match the quantized MobileNets model (299 * 299 for Inception-v3). The resized
+image is converted—row by row—into a
+[ByteBuffer](https://developer.android.com/reference/java/nio/ByteBuffer.html).
+Its size is  1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch.
+224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents
+the 3 colors of a pixel.
+
+This demo uses the TensorFlow Lite Java inference API
+for models which take a single input and provide a single output. This outputs a
+two-dimensional array, with the first dimension being the category index and the
+second dimension being the confidence of classification. Both models have 1001
+unique categories and the app sorts the probabilities of all the categories and
+displays the top three. The model file must be downloaded and bundled within the
+assets directory of the app.
diff --git a/tensorflow/docs_src/mobile/tflite/demo_ios.md b/tensorflow/docs_src/mobile/tflite/demo_ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..3be21da89f9e53d324c2ade0cb937f4b5b30fad4
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/demo_ios.md
@@ -0,0 +1,68 @@
+# iOS Demo App
+
+The TensorFlow Lite demo is a camera app that continuously classifies whatever
+it sees from your device's back camera, using a quantized MobileNet model. These
+instructions walk you through building and running the demo on an iOS device.
+
+## Prerequisites
+
+* You must have [Xcode](https://developer.apple.com/xcode/) installed and have a
+  valid Apple Developer ID, and have an iOS device set up and linked to your
+  developer account with all of the appropriate certificates. For these
+  instructions, we assume that you have already been able to build and deploy an
+  app to an iOS device with your current developer environment.
+
+* The demo app requires a camera and must be executed on a real iOS device. You
+  can build it and run with the iPhone Simulator but it won't have any camera
+  information to classify.
+
+* You don't need to build the entire TensorFlow library to run the demo, but you
+  will need to clone the TensorFlow repository if you haven't already:
+
+        git clone https://github.com/tensorflow/tensorflow
+
+* You'll also need the Xcode command-line tools:
+
+        xcode-select --install
+
+    If this is a new install, you will need to run the Xcode application once to
+    agree to the license before continuing.
+
+## Building the iOS Demo App
+
+1. Install CocoaPods if you don't have it:
+
+        sudo gem install cocoapods
+
+2. Download the model files used by the demo app (this is done from inside the
+   cloned directory):
+
+        sh tensorflow/contrib/lite/examples/ios/download_models.sh
+
+3. Install the pod to generate the workspace file:
+
+        cd tensorflow/contrib/lite/examples/ios/camera
+        pod install
+
+    If you have installed this pod before and that command doesn't work, try
+
+        pod update
+
+    At the end of this step you should have a file called 
+    `tflite_camera_example.xcworkspace`.
+
+4. Open the project in Xcode by typing this on the command line:
+
+        open tflite_camera_example.xcworkspace
+
+    This launches Xcode if it isn't open already and opens the
+    `tflite_camera_example` project.
+
+5. Build and run the app in Xcode.
+
+    Note that as mentioned earlier, you must already have a device set up and
+    linked to your Apple Developer account in order to deploy the app on a
+    device.
+
+You'll have to grant permissions for the app to use the device's camera. Point
+the camera at various objects and enjoy seeing how the model classifies things!
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
new file mode 100644
index 0000000000000000000000000000000000000000..4133bc172a1924f0ce8bb515d66fc03d716923c8
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -0,0 +1,231 @@
+# Developer Guide
+
+Using a TensorFlow Lite model in your mobile app requires multiple
+considerations: you must choose a pre-trained or custom model, convert the model
+to a TensorFLow Lite format, and finally, integrate the model in your app.
+
+## 1. Choose a model
+
+Depending on the use case, you can choose one of the popular open-sourced models,
+such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
+data set or even build your own custom model.
+
+### Use a pre-trained model
+
+[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+is a family of mobile-first computer vision models for TensorFlow designed to
+effectively maximize accuracy, while taking into consideration the restricted
+resources for on-device or embedded applications. MobileNets are small,
+low-latency, low-power models parameterized to meet the resource constraints for
+a variety of uses. They can be used for classification, detection, embeddings, and
+segmentation—similar to other popular large scale models, such as
+[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
+[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
+for MobileNets that can be used in mobile projects of all sizes.
+
+[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
+that achieves fairly high accuracy recognizing general objects with 1000 classes,
+for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
+features from input images using a convolutional neural network and classifies
+them based on those features with fully-connected and softmax layers.
+
+[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+is an on-device model that provides one-touch replies for incoming text messages
+by suggesting contextually relevant messages. The model is built specifically for
+memory constrained devices, such as watches and phones, and has been successfully
+used in Smart Replies on Android Wear. Currently, this model is Android-specific.
+
+These pre-trained models are [available for download](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md)
+
+### Re-train Inception-V3 or MobileNet for a custom data set
+
+These pre-trained models were trained on the *ImageNet* data set which contains
+1000 predefined classes. If these classes are not sufficient for your use case,
+the model will need to be re-trained. This technique is called
+*transfer learning* and starts with a model that has been already trained on a
+problem, then retrains the model on a similar problem. Deep learning from
+scratch can take days, but transfer learning is fairly quick. In order to do
+this, you need to generate a custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through the re-training process step-by-step. The code supports
+both floating point and quantized inference.
+
+### Train a custom model
+
+A developer may choose to train a custom model using Tensorflow (see the
+@{$tutorials} for examples of building and training models). If you have already
+written a model, the first step is to export this to a @{tf.GraphDef} file. This
+is required because some formats do not store the model structure outside the
+code, and we must communicate with other parts of the framework. See
+[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
+to create .pb file for the custom model.
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to the
+[TensorFlow Lite & TensorFlow Compatibility Guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for supported operators and their usage. This set of operators will continue to
+grow in future Tensorflow Lite releases.
+
+
+## 2. Convert the model format
+
+The model generated (or downloaded) in the previous step is a *standard*
+Tensorflow model and you should now have a .pb or .pbtxt @{tf.GraphDef} file.
+Models generated with transfer learning (re-training) or custom models must be
+converted—but, we must first freeze the graph to convert the model to the
+Tensorflow Lite format. This process uses several model formats:
+
+* @{tf.GraphDef} (.pb) —A protobuf that represents the TensorFlow training or
+  computation graph. It contains operators, tensors, and variables definitions.
+* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
+  does not contain a graph structure, it cannot be interpreted by itself.
+* `FrozenGraphDef` —A subclass of `GraphDef` that does not contain
+  variables. A `GraphDef` can be converted to a `FrozenGraphDef` by taking a
+  CheckPoint and a `GraphDef`, and converting each variable into a constant
+  using the value retrieved from the CheckPoint.
+* `SavedModel` —A `GraphDef` and CheckPoint with a signature that labels
+  input and output arguments to a model. A `GraphDef` and CheckPoint can be
+  extracted from a `SavedModel`.
+* *TensorFlow Lite model* (.tflite) —A serialized
+  [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
+  Lite operators and tensors for the TensorFlow Lite interpreter, similar to a
+  `FrozenGraphDef`.
+
+### Freeze Graph
+
+To use the `GraphDef` .pb file with TensorFlow Lite, you must have checkpoints
+that contain trained weight parameters. The .pb file only contains the structure
+of the graph. The process of merging the checkpoint values with the graph
+structure is called *freezing the graph*.
+
+You should have a checkpoints folder or download them for a pre-trained model
+(for example,
+[MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
+
+To freeze the graph, use the following command (changing the arguments):
+
+```
+freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
+  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
+  --input_binary=true \
+  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
+  --output_node_names=MobileNetV1/Predictions/Reshape_1
+```
+
+The `input_binary` flag must be enabled so the protobuf is read and written in
+a binary format. Set the `input_graph` and `input_checkpoint` files.
+
+The `output_node_names` may not be obvious outside of the code that built the
+model. The easiest way to find them is to visualize the graph, either with
+[TensorBoard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3)
+or `graphviz`.
+
+The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
+(.tflite) for use on Android or iOS devices. For Android, the Tensorflow
+Optimizing Converter tool supports both float and quantized models. To convert
+the frozen `GraphDef` to the .tflite format:
+
+```
+toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+  --inference_type=FLOAT \
+  --input_type=FLOAT \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --input_shapes=1,224,224,3
+```
+
+The `input_file` argument should reference the frozen `GraphDef` file
+containing the model architecture. The [frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
+file used here is available for download. `output_file` is where the TensorFlow
+Lite model will get generated. The `input_type` and `inference_type`
+arguments should be set to `FLOAT`, unless converting a
+@{$performance/quantization$quantized model}. Setting the `input_array`,
+`output_array`, and `input_shape` arguments are not as straightforward. The
+easiest way to find these values is to explore the graph using Tensorboard. Reuse
+the arguments for specifying the output nodes for inference in the
+`freeze_graph` step.
+
+It is also possible to use the Tensorflow Optimizing Converter with protobufs
+from either Python or from the command line (see the 
+[toco_from_protos.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py)
+example). This allows you to integrate the conversion step into the model design
+workflow, ensuring the model is easily convertible to a mobile inference graph.
+For example:
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+out = tf.identity(val, name="out")
+
+with tf.Session() as sess:
+  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
+  open("converteds_model.tflite", "wb").write(tflite_model)
+```
+
+For usage, see the Tensorflow Optimizing Converter
+[command-line examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
+
+Refer to the
+[Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for troubleshooting help, and if that doesn't help, please
+[file an issue](https://github.com/tensorflow/tensorflow/issues).
+
+The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
+to visualize TensorFlow Lite models after conversion. To build the
+[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/visualize.py)
+tool:
+
+```sh
+bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
+```
+
+This generates an interactive HTML page listing subgraphs, operations, and a
+graph visualization.
+
+
+## 3. Use the TensorFlow Lite model for inference in a mobile app
+
+After completing the prior steps, you should now have a `.tflite` model file.
+
+### Android
+
+Since Android apps are written in Java and the core TensorFlow library is in C++,
+a JNI library is provided as an interface. This is only meant for inference—it
+provides the ability to load a graph, set up inputs, and run the model to
+calculate outputs.
+
+The open source Android demo app uses the JNI interface and is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+You can also download a
+[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+See the @{$tflite/demo_android} guide for details.
+
+The @{$mobile/android_build} guide has instructions for installing TensorFlow on
+Android and setting up `bazel` and Android Studio.
+
+### iOS
+
+To integrate a TensorFlow model in an iOS app, see the
+[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
+guide and @{$tflite/demo_ios} guide.
+
+#### Core ML support
+
+Core ML is a machine learning framework used in Apple products. In addition to
+using Tensorflow Lite models directly in your applications, you can convert
+trained Tensorflow models to the
+[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
+devices. To use the converter, refer to the
+[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+
+### Raspberry Pi
+
+Compile Tensorflow Lite for a Raspberry Pi by following the
+[RPi build instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/rpi.md)
+This compiles a static library file (`.a`) used to build your app. There are
+plans for Python bindings and a demo app.
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index beb24794fc98724e2423e02a71028f79be45cf75..11f11ea4dc54b9f152f2560384cb47cec6b308c0 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -155,7 +155,9 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-![tensorflow lite architecture](https://www.tensorflow.org/images/tflite-architecture.jpg)
+<img src="/images/tflite-architecture.jpg"
+     alt="TensorFlow Lite architecture diagram"
+     style="max-width:600px;">
 
 Starting with a trained TensorFlow model on disk, you'll convert that model to
 the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index 316f023f43dcfe781c7819d1681335267ddd5f76..1f894c39fe4554261cd37ebc8cd48af6b36eef43 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,7 +1,9 @@
+index.md
 performance_guide.md
 datasets_performance.md
 performance_models.md
 benchmarks.md
+quantization.md
 
 ### XLA
 xla/index.md
@@ -11,6 +13,3 @@ xla/jit.md
 xla/operation_semantics.md
 xla/shapes.md
 xla/tfcompile.md
-
-### Quantization
-quantization.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index cd47fc2803bc1429d28bd0ae4c2ad68e632a6f03..b1796cf9b2d0bf7459e70ab542b6e6fcb203667a 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -78,7 +78,7 @@ training CIFAR-10 illustrates the use of the `tf.data` API along with
 The `tf.data` API utilizes C++ multi-threading and has a much lower overhead
 than the Python-based `queue_runner` that is limited by Python's multi-threading
 performance. A detailed performance guide for the `tf.data` API can be found
-[here](#datasets_performance).
+[here](@{$datasets_performance}).
 
 While feeding data using a `feed_dict` offers a high level of flexibility, in
 general `feed_dict` does not provide a scalable solution. If only a single GPU
@@ -475,7 +475,7 @@ optimizations.
 ### TensorFlow with Intel® MKL DNN
 
 Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon
-Phi™ though the use of Intel® Math Kernel Library for Deep Neural Networks
+Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks
 (Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups
 for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel
 published paper
@@ -581,9 +581,9 @@ Each variable that impacts performance is discussed below.
     for optimal settings.
 
 *   **intra_op_parallelism_threads**: Setting this equal to the number of
-    physical cores is recommended. Setting the value to 0, which is the default
-    and will result in the value being set to the number of logical cores, is an
-    option to try for some architectures.  This value and `OMP_NUM_THREADS`
+    physical cores is recommended. Setting the value to 0, which is the default,
+    results in the value being set to the number of logical cores - this is an
+    alternate option to try for some architectures.  This value and `OMP_NUM_THREADS`
     should be equal.
 
 *   **inter_op_parallelism_threads**: Setting this equal to the number of
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 544274cab68934419e8601a4d9714d80335fca28..2fea02d861d314cc61f2ba20475bf08ebea8fb5f 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -1,226 +1,253 @@
-# How to Quantize Neural Networks with TensorFlow
-
-When modern neural networks were being developed, the biggest challenge was
-getting them to work at all! That meant that accuracy and speed during training
-were the top priorities. Using floating point arithmetic was the easiest way to
-preserve accuracy, and GPUs were well-equipped to accelerate those calculations,
-so it's natural that not much attention was paid to other numerical formats.
-
-These days, we actually have a lot of models being deployed in commercial
-applications. The computation demands of training grow with the number of
-researchers, but the cycles needed for inference expand in proportion to users.
-That means pure inference efficiency has become a burning issue for a lot of
-teams.
-
-That is where quantization comes in. It's an umbrella term that covers a lot of
-different techniques to store numbers and perform calculations on them in more
-compact formats than 32-bit floating point. I am going to focus on eight-bit
-fixed point, for reasons I'll go into more detail on later.
-
-[TOC]
-
-## Why does Quantization Work?
-
-Training neural networks is done by applying many tiny nudges to the weights,
-and these small increments typically need floating point precision to work
-(though there are research efforts to use quantized representations here too).
-
-Taking a pre-trained model and running inference is very different. One of the
-magical qualities of deep networks is that they tend to cope very well with high
-levels of noise in their inputs. If you think about recognizing an object in a
-photo you've just taken, the network has to ignore all the CCD noise, lighting
-changes, and other non-essential differences between it and the training
-examples it's seen before, and focus on the important similarities instead. This
-ability means that they seem to treat low-precision calculations as just another
-source of noise, and still produce accurate results even with numerical formats
-that hold less information.
-
-## Why Quantize?
-
-Neural network models can take up a lot of space on disk, with the original
-AlexNet being over 200 MB in float format for example. Almost all of that size
-is taken up with the weights for the neural connections, since there are often
-many millions of these in a single model. Because they're all slightly different
-floating point numbers, simple compression formats like zip don't compress them
-well. They are arranged in large layers though, and within each layer the
-weights tend to be normally distributed within a certain range, for example -3.0
-to 6.0.
-
-The simplest motivation for quantization is to shrink file sizes by storing the
-min and max for each layer, and then compressing each float value to an
-eight-bit integer representing the closest real number in a linear set of 256
-within the range. For example with the -3.0 to 6.0 range, a 0 byte would
-represent -3.0, a 255 would stand for 6.0, and 128 would represent about 1.5.
-I'll go into the exact calculations later, since there's some subtleties, but
-this means you can get the benefit of a file on disk that's shrunk by 75%, and
-then convert back to float after loading so that your existing floating-point
-code can work without any changes.
-
-Another reason to quantize is to reduce the computational resources you need to
-do the inference calculations, by running them entirely with eight-bit inputs
-and outputs. This is a lot more difficult since it requires changes everywhere
-you do calculations, but offers a lot of potential rewards. Fetching eight-bit
-values only requires 25% of the memory bandwidth of floats, so you'll make much
-better use of caches and avoid bottlenecking on RAM access. You can also
-typically use SIMD operations that do many more operations per clock cycle. In
-some case you'll have a DSP chip available that can accelerate eight-bit
-calculations too, which can offer a lot of advantages.
-
-Moving calculations over to eight bit will help you run your models faster, and
-use less power (which is especially important on mobile devices). It also opens
-the door to a lot of embedded systems that can't run floating point code
-efficiently, so it can enable a lot of applications in the IoT world.
-
-## Why Not Train in Lower Precision Directly?
-
-There have been some experiments training at lower bit depths, but the results
-seem to indicate that you need higher than eight bit to handle the back
-propagation and gradients. That makes implementing the training more
-complicated, and so starting with inference made sense. We also already have a
-lot of float models already that we use and know well, so being able to convert
-them directly is very convenient.
-
-## How Can You Quantize Your Models?
-
-TensorFlow has production-grade support for eight-bit calculations built in. It
-also has a process for converting many models trained in floating-point over to
-equivalent graphs using quantized calculations for inference. For example,
-here's how you can translate the latest GoogLeNet model into a version that uses
-eight-bit computations:
-
-```sh
-curl -L "https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz" |
-  tar -C tensorflow/examples/label_image/data -xz
-bazel build tensorflow/tools/graph_transforms:transform_graph
-bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
-  --in_graph=tensorflow/examples/label_image/data/inception_v3_2016_08_28_frozen.pb \
-  --out_graph=/tmp/quantized_graph.pb \
-  --inputs=input \
-  --outputs=InceptionV3/Predictions/Reshape_1 \
-  --transforms='add_default_attributes strip_unused_nodes(type=float, shape="1,299,299,3")
-    remove_nodes(op=Identity, op=CheckNumerics) fold_constants(ignore_errors=true)
-    fold_batch_norms fold_old_batch_norms quantize_weights quantize_nodes
-    strip_unused_nodes sort_by_execution_order'
+# Fixed Point Quantization
+
+Quantization techniques store and calculate numbers in more compact formats.
+[TensorFlow Lite](/mobile/tflite/) adds quantization that uses an 8-bit fixed
+point representation.
+
+Since a challenge for modern neural networks is optimizing for high accuracy, the
+priority has been improving accuracy and speed during training. Using floating
+point arithmetic is an easy way to preserve accuracy and GPUs are designed to
+accelerate these calculations.
+
+However, as more machine learning models are deployed to mobile devices,
+inference efficiency has become a critical issue. Where the computational demand
+for *training* grows with the amount of models trained on different
+architectures, the computational demand for *inference* grows in proportion to
+the amount of users.
+
+## Quantization benefits
+
+
+Using 8-bit calculations help your models run faster and use less power. This is
+especially important for mobile devices and embedded applications that can't run
+floating point code efficiently, for example, Internet of Things (IoT) and
+robotics devices. There are additional opportunities to extend this support to
+more backends and research lower precision networks.
+
+### Smaller file sizes {: .hide-from-toc}
+
+Neural network models require a lot of space on disk. For example, the original
+AlexNet requires over 200 MB for the float format—almost all of that for the
+model's millions of weights. Because the weights are slightly different
+floating point numbers, simple compression formats perform poorly (like zip).
+
+Weights fall in large layers of numerical values. For each layer, weights tend to
+be normally distributed within a range. Quantization can shrink file sizes by
+storing the minimum and maximum weight for each layer, then compress each
+weight's float value to an 8-bit integer representing the closest real number in
+a linear set of 256 within the range.
+
+### Faster inference {: .hide-from-toc}
+
+Since calculations are run entirely on 8-bit inputs and outputs, quantization
+reduces the computational resources needed for inference calculations. This is
+more involved, requiring changes to all floating point calculations, but results
+in a large speed-up for inference time.
+
+### Memory efficiency {: .hide-from-toc}
+
+Since fetching 8-bit values only requires 25% of the memory bandwidth of floats,
+more efficient caches avoid bottlenecks for RAM access. In many cases, the power
+consumption for running a neural network is dominated by memory access. The
+savings from using fixed-point 8-bit weights and activations are significant. 
+
+Typically, SIMD operations are available that run more operations per clock
+cycle. In some cases, a DSP chip is available that accelerates 8-bit calculations
+resulting in a massive speedup.
+
+## Fixed point quantization techniques
+
+The goal is to use the same precision for weights and activations during both
+training and inference. But an important difference is that training consists of
+a forward pass and a backward pass, while inference only uses a forward pass.
+When we train the model with quantization in the loop, we ensure that the forward
+pass matches precision for both training and inference.
+
+To minimize the loss in accuracy for fully fixed point models (weights and
+activations), train the model with quantization in the loop. This simulates
+quantization in the forward pass of a model so weights tend towards values that
+perform better during quantized inference. The backward pass uses quantized
+weights and activations and models quantization as a straight through estimator.
+(See Bengio et al., [2013](https://arxiv.org/abs/1308.3432))
+
+Additionally, the minimum and maximum values for activations are determined
+during training. This allows a model trained with quantization in the loop to be
+converted to a fixed point inference model with little effort, eliminating the
+need for a separate calibration step.
+
+## Quantization training with TensorFlow
+
+TensorFlow can train models with quantization in the loop. Because training
+requires small gradient adjustments, floating point values are still used. To
+keep models as floating point while adding the quantization error in the training
+loop, @{$array_ops#Fake_quantization$fake quantization} nodes simulate the
+effect of quantization in the forward and backward passes.
+
+Since it's difficult to add these fake quantization operations to all the
+required locations in the model, there's a function available that rewrites the
+training graph. To create a fake quantized training graph:
+
+```
+# Build forward pass of model.
+loss = tf.losses.get_total_loss()
+
+# Call the training rewrite which rewrites the graph in-place with
+# FakeQuantization nodes and folds batchnorm for training. It is
+# often needed to fine tune a floating point model for quantization
+# with this training tool. When training from scratch, quant_delay
+# can be used to activate quantization after training to converge
+# with the float graph, effectively fine-tuning the model.
+tf.contrib.quantize.create_training_graph(quant_delay=2000000)
+
+# Call backward pass optimizer as usual.
+optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+optimizer.minimize(loss)
 ```
 
-This will produce a new model that runs the same operations as the original, but
-with eight bit calculations internally, and all weights quantized as well. If
-you look at the file size, you'll see it's about a quarter of the original (23MB
-versus 91MB). You can still run this model using exactly the same inputs and
-outputs though, and you should get equivalent results. Here's an example:
+The rewritten *eval graph* is non-trivially different from the *training graph*
+since the quantization ops affect the batch normalization step. Because of this,
+we've added a separate rewrite for the *eval graph*:
 
-```sh
-bazel build tensorflow/examples/label_image:label_image
-bazel-bin/tensorflow/examples/label_image/label_image \
---graph=/tmp/quantized_graph.pb \
+```
+# Build eval model
+logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
+
+# Call the eval rewrite which rewrites the graph in-place with
+# FakeQuantization nodes and fold batchnorm for eval.
+tf.contrib.quantize.create_eval_graph()
+
+# Save the checkpoint and eval graph proto to disk for freezing
+# and providing to TFLite.
+with open(eval_graph_file, ‘w’) as f:
+  f.write(str(g.as_graph_def()))
+saver = tf.train.Saver()
+saver.save(sess, checkpoint_name)
+```
+
+Methods to rewrite the training and eval graphs are an active area of research
+and experimentation. Although rewrites and quantized training might not work or
+improve performance for all models, we are working to generalize these
+techniques.
+
+## Generating fully quantized models
+
+The previously demonstrated after-rewrite eval graph only *simulates*
+quantization. To generate real fixed point computations from a trained
+quantization model, convert it to a fixed point kernel. Tensorflow Lite supports
+this conversion from the graph resulting from `create_eval_graph`.
+
+First, create a frozen graph that will be the input for the TensorFlow Lite
+toolchain:
+
+```
+bazel build tensorflow/python/tools:freeze_graph && \
+  bazel-bin/tensorflow/python/tools/freeze_graph \
+  --input_graph=eval_graph_def.pb \
+  --input_checkpoint=checkpoint \
+  --output_graph=frozen_eval_graph.pb --output_node_names=outputs
 ```
 
-You'll see that this runs the newly-quantized graph, and outputs a very similar
-answer to the original.
-
-You can run the same process on your own models saved out as GraphDefs, with the
-input and output names adapted to those your network requires. I recommend that
-you run them through the freeze_graph script first, to convert checkpoints into
-constants stored in the file.
-
-## How Does the Quantization Process Work?
-
-We've implemented quantization by writing equivalent eight-bit versions of
-operations that are commonly used during inference. These include convolution,
-matrix multiplication, activation functions, pooling operations and
-concatenation. The conversion script first replaces all the individual ops it
-knows about with quantized equivalents. These are small sub-graphs that have
-conversion functions before and after to move the data between float and
-eight-bit. Below is an example of what they look like. First here's the original
-Relu operation, with float inputs and outputs:
-
-![Relu Diagram](https://www.tensorflow.org/images/quantization0.png)
-
-Then, this is the equivalent converted subgraph, still with float inputs and
-outputs, but with internal conversions so the calculations are done in eight
-bit.
-
-![Converted Diagram](https://www.tensorflow.org/images/quantization1.png)
-
-The min and max operations actually look at the values in the input float
-tensor, and then feeds them into the Dequantize operation that converts the
-tensor into eight-bits. There are more details on how the quantized representation
-works later on.
-
-Once the individual operations have been converted, the next stage is to remove
-unnecessary conversions to and from float. If there are consecutive sequences of
-operations that all have float equivalents, then there will be a lot of adjacent
-Dequantize/Quantize ops. This stage spots that pattern, recognizes that they
-cancel each other out, and removes them, like this:
-
-![Stripping Diagram](https://www.tensorflow.org/images/quantization2.png)
-
-Applied on a large scale to models where all of the operations have quantized
-equivalents, this gives a graph where all of the tensor calculations are done in
-eight bit, without having to convert to float.
-
-## What Representation is Used for Quantized Tensors?
-
-We approach converting floating-point arrays of numbers into eight-bit
-representations as a compression problem. We know that the weights and
-activation tensors in trained neural network models tend to have values that are
-distributed across comparatively small ranges (for example you might have -15 to
-+15 for weights, -500 to 1000 for activations on an image model, though the
-exact numbers will vary). We also know from experiment that neural nets tend to
-be very robust in the face of noise, and so the noise-like error produced by
-quantizing down to a small set of values will not hurt the precision of the
-overall results very much. We also want to pick a representation that's easy to
-perform calculations on, especially the large matrix multiplications that form
-the bulk of the work that's needed to run a model.
-
-These led us to pick a representation that has two floats to store the overall
-minimum and maximum values that are represented by the lowest and highest
-quantized value. Each entry in the quantized array represents a float value in
-that range, distributed linearly between the minimum and maximum. For example,
-if we have minimum = -10.0, and maximum = 30.0f, and an eight-bit array, here's
-what the quantized values represent:
+Provide this to the TensorFlow Lite Optimizing Converter (TOCO) to get a fully
+quantized TensorFLow Lite model:
 
 ```
-Quantized | Float
---------- | -----
-0         | -10.0
-255       | 30.0
-128       | 10.0
+bazel build tensorflow/contrib/lite/toco:toco && \
+  ./bazel-bin/third_party/tensorflow/contrib/lite/toco/toco \
+  --input_file=frozen_eval_graph.pb \
+  --output_file=tflite_model.tflite \
+  --input_format=TENSORFLOW_GRAPHDEF --output_format=TFLITE \
+  --inference_type=QUANTIZED_UINT8 \
+  --input_shape="1,224, 224,3" \
+  --input_array=input \
+  --output_array=outputs \
+  --std_value=127.5 --mean_value=127.5
 ```
 
-The advantages of this format are that it can represent arbitrary magnitudes of
-ranges, they don't have to be symmetrical, it can represent signed and unsigned
-values, and the linear spread makes doing multiplications straightforward. There
-are alternatives like [Song Han's code books](http://arxiv.org/pdf/1510.00149.pdf)
-that can use lower bit depths by non-linearly distributing the float values
-across the representation, but these tend to be more expensive to calculate on.
-
-The advantage of having a strong and clear definition of the quantized format is
-that it's always possible to convert back and forth from float for operations
-that aren't quantization-ready, or to inspect the tensors for debugging
-purposes. One implementation detail in TensorFlow that we're hoping to improve
-in the future is that the minimum and maximum float values need to be passed as
-separate tensors to the one holding the quantized values, so graphs can get a
-bit dense!
-
-The nice thing about the minimum and maximum ranges is that they can often be
-pre-calculated. Weight parameters are constants known at load time, so their
-ranges can also be stored as constants. We often know the ranges for inputs (for
-examples images are usually RGB values in the range 0.0 to 255.0), and many
-activation functions have known ranges too. This can avoid having to analyze the
-outputs of an operation to determine the range, which we need to do for math ops
-like convolution or matrix multiplication which produce 32-bit accumulated
-results from 8-bit inputs.
-
-## What's Next?
-
-We've found that we can get extremely good performance on mobile and embedded
-devices by using eight-bit arithmetic rather than floating-point. You can see
-the framework we use to optimize matrix multiplications at
-[gemmlowp](https://github.com/google/gemmlowp). We still need to apply all the
-lessons we've learned to the TensorFlow ops to get maximum performance on
-mobile, but we're actively working on that. Right now, this quantized
-implementation is a reasonably fast and accurate reference implementation that
-we're hoping will enable wider support for our eight-bit models on a wider
-variety of devices. We also hope that this demonstration will encourage the
-community to explore what's possible with low-precision neural networks.
+See the documentation for @{tf.contrib.quantize} and
+[TensorFlow Lite](/mobile/tflite/).
+
+## Quantized accuracy
+
+Fixed point [MobileNet](https://arxiv.org/abs/1704.0486) models are released with
+8-bit weights and activations. Using the rewriters, these models achieve the
+Top-1 accuracies listed in Table 1. For comparison, the floating point accuracies
+are listed for the same models. The code used to generate these models
+[is available](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)
+along with links to all of the pretrained mobilenet_v1 models.
+
+<figure>
+  <table>
+    <tr>
+      <th>Image Size</th>
+      <th>Depth</th>
+      <th>Top-1 Accuracy:<br>Floating point</th>
+      <th>Top-1 Accuracy:<br>Fixed point: 8 bit weights and activations</th>
+    </tr>
+    <tr><td>128</td><td>0.25</td><td>0.415</td><td>0.399</td></tr>
+    <tr><td>128</td><td>0.5</td><td>0.563</td><td>0.549</td></tr>
+    <tr><td>128</td><td>0.75</td><td>0.621</td><td>0.598</td></tr>
+    <tr><td>128</td><td>1</td><td>0.652</td><td>0.64</td></tr>
+    <tr><td>160</td><td>0.25</td><td>0.455</td><td>0.435</td></tr>
+    <tr><td>160</td><td>0.5</td><td>0.591</td><td>0.577</td></tr>
+    <tr><td>160</td><td>0.75</td><td>0.653</td><td>0.639</td></tr>
+    <tr><td>160</td><td>1</td><td>0.68</td><td>0.673</td></tr>
+    <tr><td>192</td><td>0.25</td><td>0.477</td><td>0.458</td></tr>
+    <tr><td>192</td><td>0.5</td><td>0.617</td><td>0.604</td></tr>
+    <tr><td>192</td><td>0.75</td><td>0.672</td><td>0.662</td></tr>
+    <tr><td>192</td><td>1</td><td>0.7</td><td>0.69</td></tr>
+    <tr><td>224</td><td>0.25</td><td>0.498</td><td>0.482</td></tr>
+    <tr><td>224</td><td>0.5</td><td>0.633</td><td>0.622</td></tr>
+    <tr><td>224</td><td>0.75</td><td>0.684</td><td>0.679</td></tr>
+    <tr><td>224</td><td>1</td><td>0.709</td><td>0.697</td></tr>
+  </table>
+  <figcaption>
+    <b>Table 1</b>: MobileNet Top-1 accuracy on Imagenet Validation dataset.
+  </figcaption>
+</figure>
+
+## Representation for quantized tensors
+
+TensorFlow approaches the conversion of floating-point arrays of numbers into
+8-bit representations as a compression problem. Since the weights and activation
+tensors in trained neural network models tend to have values that are distributed
+across comparatively small ranges (for example, -15 to +15 for weights or -500 to
+1000 for image model activations). And since neural nets tend to be robust
+handling noise, the error introduced by quantizing to a small set of values
+maintains the precision of the overall results within an acceptable threshold. A
+chosen representation must perform fast calculations, especially the large matrix
+multiplications that comprise the bulk of the computations while running a model.
+
+This is represented with two floats that store the overall minimum and maximum
+values corresponding to the lowest and highest quantized value. Each entry in the
+quantized array represents a float value in that range, distributed linearly
+between the minimum and maximum. For example, with a minimum of -10.0 and maximum
+of 30.0f, and an 8-bit array, the quantized values represent the following:
+
+<figure>
+  <table>
+    <tr><th>Quantized</th><th>Float</th></tr>
+    <tr><td>0</td><td>-10.0</td></tr>
+    <tr><td>255</td><td>30.0</td></tr>
+    <tr><td>128</td><td>10.0</td></tr>
+  </table>
+  <figcaption>
+    <b>Table 2</b>: Example quantized value range
+  </figcaption>
+</figure>
+
+The advantages of this representation format are:
+
+* It efficiently represents an arbitrary magnitude of ranges.
+* The values don't have to be symmetrical.
+* The format represents both signed and unsigned values.
+* The linear spread makes multiplications straightforward.
+
+Alternative techniques use lower bit depths by non-linearly distributing the
+float values across the representation, but currently are more expensive in terms
+of computation time. (See Han et al.,
+[2016](https://arxiv.org/abs/1510.00149).)
+
+The advantage of having a clear definition of the quantized format is that it's
+always possible to convert back and forth from fixed-point to floating-point for
+operations that aren't quantization-ready, or to inspect the tensors for
+debugging.
diff --git a/tensorflow/docs_src/performance/xla/jit.md b/tensorflow/docs_src/performance/xla/jit.md
index d4dc3e57c8fb5ec2a979b6ba7ebe2a3b6c3a5f94..d9a979ccbd31773b9d227ff946486706844a8f81 100644
--- a/tensorflow/docs_src/performance/xla/jit.md
+++ b/tensorflow/docs_src/performance/xla/jit.md
@@ -157,7 +157,7 @@ to fuse Ops is visible by starting at `hlo_graph_0.dot` and viewing each diagram
 in succession.
 
 To Render the .dot file into a png, install
-[GraphViz](http://www.graphviz.org/Download..php) and run:
+[GraphViz](https://www.graphviz.org/download/) and run:
 
 ```shell
 dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index daa2d4767c20d30594ecd90911bfd1771053c2e9..f530fe1206c0e9e67816d2c9a2ba276858314737 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -25,7 +25,7 @@ Calculates gradients of batch norm.
 <b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
 
 | Arguments       | Type                    | Semantics                        |
-| --------------  | ----------------------- | -------------------------------- |
+| --------------- | ----------------------- | -------------------------------- |
 | `operand`       | `ComputationDataHandle` | n dimensional array to be        |
 :                 :                         : normalized (x)                   :
 | `scale`         | `ComputationDataHandle` | 1 dimensional array              |
@@ -45,28 +45,37 @@ feature dimension in `operand`), the operation calculates the gradients with
 respect to `operand`, `offset` and `scale` across all the other dimensions. The
 `feature_index` must be a valid index for the feature dimension in `operand`.
 
-The three gradients are defined by the following formulas:
-
-\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
-
-\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
-
-\\( \nabla \beta = sum(\nabla y) \\)
-
-The inputs `mean` and `variance` represents moments value
+The three gradients are defined by the following formulas (assuming a
+4-dimensional tensor as `operand` and with feature dimension index \\(l\\),
+batch size `m` and spatial sizes `w` and `h`):
+
+\\[ \begin{split} c_l&=
+\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h
+\left( \nabla y_{ijkl} \frac{x_{ijkl} - \mu_l}{\sigma^2_l+\epsilon} \right)
+\\\\
+\nabla x_{ijkl} &= \frac{\gamma_{l}}{\sqrt{\sigma^2_{l}+\epsilon}}
+\left( \nabla y_{ijkl} - \mathrm{mean}(\nabla y) - c_l (x_{ijkl} - \mu_{l})
+\right)
+\\\\
+\nabla \gamma_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \left( \nabla y_{ijkl}
+\frac{x_{ijkl} - \mu_l}{\sqrt{\sigma^2_{l}+\epsilon}} \right)
+\\\\\
+\nabla \beta_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl}
+\end{split} \\]
+
+The inputs `mean` and `variance` represent moments value
 across batch and spatial dimensions.
 
 The output type is a tuple of three handles:
 
-|Outputs       | Type                    | Semantics                           |
-|------------- | ----------------------- | ------------------------------------|
-|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `operand`                           :
-|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `scale`                             :
-|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `offset`                            :
-
+| Outputs        | Type                    | Semantics                         |
+| -------------  | ----------------------- | --------------------------------- |
+| `grad_operand` | `ComputationDataHandle` | gradient with respect to input    |
+:                :                         : `operand` (\\( \nabla x\\))       :
+| `grad_scale`   | `ComputationDataHandle` | gradient with respect to input    |
+:                :                         : `scale` (\\( \nabla \gamma\\))    :
+| `grad_offset`  | `ComputationDataHandle` | gradient with respect to input    |
+:                :                         : `offset`(\\( \nabla \beta\\))     :
 
 ## BatchNormInference
 
@@ -119,11 +128,11 @@ Normalizes an array across batch and spatial dimensions.
 | Arguments       | Type                    | Semantics                        |
 | --------------- | ----------------------- | -------------------------------- |
 | `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized                       :
+:                 :                         : normalized (x)                   :
 | `scale`         | `ComputationDataHandle` | 1 dimensional array              |
 :                 :                         : (\\(\gamma\\))                   :
 | `offset`        | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\beta\\ )                    :
+:                 :                         : (\\(\beta\\))                    :
 | `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
 | `feature_index` | `int64`                 | Index to feature dimension       |
 :                 :                         : in `operand`                     :
@@ -135,8 +144,8 @@ element in `operand`. The `feature_index` must be a valid index for the feature
 dimension in `operand`.
 
 The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions (
-assuming `operand` is an 4 dimensional array):
+contains `m` elements with `w` and `h` as the size of spatial dimensions
+(assuming `operand` is an 4 dimensional array):
 
 - Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
 \\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
@@ -170,7 +179,7 @@ Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
 operation from a data shape to a target shape. The dimensions must match, and
 the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
 via bitcast routine. Bitcast is implemented as a low-level cast, so machines
-with different floating point representations will give different results.
+with different floating-point representations will give different results.
 
 <b> `BitcastConvertType(operand, new_element_type)` </b>
 
@@ -238,13 +247,10 @@ See also
 
 Clamps an operand to within the range between a minimum and maximum value.
 
-<b> `Clamp(computation, args...)` </b>
+<b> `Clamp(min, operand, max)` </b>
 
 | Arguments     | Type                    | Semantics                        |
 | ------------- | ----------------------- | -------------------------------- |
-| `computation` | `Computation`           | computation of type `T_0, T_1,   |
-:               :                         : ..., T_N -> S` with N parameters :
-:               :                         : of arbitrary type                :
 | `min`         | `ComputationDataHandle` | array of type T                  |
 | `operand`     | `ComputationDataHandle` | array of type T                  |
 | `max`         | `ComputationDataHandle` | array of type T                  |
@@ -254,7 +260,7 @@ the range between the minimum and maximum, else returns the minimum value if the
 operand is below this range or the maximum value if the operand is above this
 range.  That is, `clamp(a, x, b) =  min(max(a, x), b)`.
 
-All three arrays must be the same shape. Alternately, as a restricted form of
+All three arrays must be the same shape. Alternatively, as a restricted form of
 [broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
 
 Example with scalar `min` and `max`:
@@ -351,7 +357,7 @@ each other) and contains the arguments in the order that they were specified.
 :             :                         : concatenated between the `operands`. :
 
 With the exception of `dimension` all dimensions must be the same. This is
-because XLA does not support "ragged" arrays Also note that rank-0 values
+because XLA does not support "ragged" arrays. Also note that rank-0 values
 cannot be concatenated (as it's impossible to name the dimension along which the
 concatenation occurs).
 
@@ -468,7 +474,7 @@ filter/kernel/window. The dimensions are, in this order:
     window that moves across the base area.
 
 The `window_strides` argument specifies the stride of the convolutional window
-in the spatial dimensions. For example, if the stride in a the first spatial
+in the spatial dimensions. For example, if the stride in the first spatial
 dimension is 3, then the window can only be placed at coordinates where the
 first spatial index is divisible by 3.
 
@@ -786,9 +792,7 @@ DynamicSlice extracts a sub-array from the input array at dynamic
 dimension: [start, start + size). The shape of `start_indices` must be rank ==
 1, with dimension size equal to the rank of `operand`.
 Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo input dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
+calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
@@ -845,9 +849,7 @@ is updated.
 The shape of `start_indices` must be rank == 1, with dimension size equal to
 the rank of `operand`.
 Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo update dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
+calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
@@ -856,12 +858,13 @@ array accesses, but this behavior may change in future implementations.
 | `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
 | `update`        | `ComputationDataHandle` | N dimensional array of type T    |
 :                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape    :
+:                 :                         : Each dimension of update shape   :
 :                 :                         : must be strictly greater than    :
 :                 :                         : zero, and start + update must be :
-:                 :                         : less than operand size for each  :
-:                 :                         : dimension to avoid generating    :
-:                 :                         : out-of-bounds update indices.    :
+:                 :                         : less than or equal to the operand:
+:                 :                         : size for each dimension to avoid :
+:                 :                         : generating out-of-bounds update  :
+:                 :                         : indices.                         :
 | `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
 :                 :                         : containing the starting indices  :
 :                 :                         : of the slice for each dimension. :
@@ -942,7 +945,7 @@ expand the rank of the lower-rank operand up to the rank of the higher-rank
 operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
 the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
 shape are filled with dimensions of size one. Degenerate-dimension broadcasting
-then broadcasts the shapes along these degenerate dimension to equalize the
+then broadcasts the shapes along these degenerate dimensions to equalize the
 shapes of both operands. The semantics are described in detail on the
 @{$broadcasting$broadcasting page}.
 
@@ -1048,6 +1051,9 @@ For a more intuitive description, see the "Informal Description" section below.
 :                  :                         : indices of the slices we're     :
 :                  :                         : we're stitching together into   :
 :                  :                         : the output tensor.              :
+|`index_vector_dim`  | `int64`               | The dimension in                |
+:                  :                         : `gather_indices` that contains  :
+:                  :                         : the starting indices.           :
 |`output_window_dims` | `ArraySlice<int64>`  | The set of dimensions in the    |
 :                  :                         : output shape that are _window   :
 :                  :                         : dimensions_ (defined below).    :
@@ -1064,22 +1070,20 @@ For a more intuitive description, see the "Informal Description" section below.
 :                  :            : `output_window_dims`) and the window         :
 :                  :            : dimensions that are elided (via              :
 :                  :            : `elided_window_dims`).                       :
-|`gather_dims_to_operand_dims` | `ArraySlice<int64>` | A dimension map (the  |
+|`gather_dims_to_operand_dims` | `ArraySlice<int64>` | A dimension map (the    |
 :                  :            : array is interpreted as mapping `i` to       :
 :                  :            : `gather_dims_to_operand_dims[i]`)  from      :
 :                  :            : the gather indices in `gather_indices` to    :
 :                  :            : the operand index space.  It has to be       :
 :                  :            : one-to-one and total.                        :
 
-If `gather_indices` is a vector with `N` elements then we implicitly reshape it
-to a tensor of shape `[N,1]` before proceeding.
-
 For every index `Out` in the output tensor, we compute two things (more
 precisely described later):
 
-  - An index into the first `gather_indices.rank` - `1` dimensions of
-    `gather_indices`, which gives us a starting index of a slice, _operand
-    slice_, in the operand tensor.
+  - An index into `gather_indices.rank` - `1` dimensions of `gather_indices`,
+    which gives us a starting index of a slice, _operand slice_, in the operand
+    tensor.  These `gather_indices.rank` - `1` dimensions are all the dimensions
+    in `gather_indices` except `index_vector_dim`.
 
   - A _window index_ that has the same rank as the operand.  This index is
     composed of the values in `Out` at dimensions `output_window_dims`, embedded
@@ -1091,29 +1095,42 @@ should be present in the output at index `Out`.
 The output is a tensor of rank `output_window_dims.size` + `gather_indices.rank`
 - `1`.  Additionally, as a shorthand, we define `output_gather_dims` of type
 `ArraySlice<int64>` as the set of dimensions in the output shape but not in
-`output_window_dims`, in ascending order.  E.g. if the output tensor has rank 5,
-`output_window_dims` is {`2`, `4`} then `output_gather_dims` is {`0`, `1`, `3`}
+`output_window_dims`, in ascending order.  E.g. if the output tensor has rank
+`5`, `output_window_dims` is {`2`, `4`} then `output_gather_dims` is {`0`, `1`,
+`3`}
+
+If `index_vector_dim` is equal to `gather_indices.rank` we implicitly
+consider `gather_indices` to have a trailing `1` dimension (i.e. if
+`gather_indices` was of shape `[6,7]` and `index_vector_dim` is `2` then
+we implicitly consider the shape of `gather_indices` to be `[6,7,1]`).
 
 The bounds for the output tensor along dimension `i` is computed as follows:
 
   1. If `i` is present in `output_gather_dims` (i.e. is equal to
-    `output_gather_dims[k]` for some `k`) then we pick the corresponding
-    dimension bounds out of `gather_indices.shape` (i.e. pick
-    `gather_indices.shape.dims[k]`).
+     `output_gather_dims[k]` for some `k`) then we pick the corresponding
+     dimension bounds out of `gather_indices.shape`, skipping
+     `index_vector_dim` (i.e. pick `gather_indices.shape.dims`[`k`] if `k`
+     < `index_vector_dim` and `gather_indices.shape.dims`[`k`+`1`]
+     otherwise).
   2. If `i` is present in `output_window_dims` (i.e. equal to
-     `output_window_dims[k]` for some `k`) then we pick the corresponding bound
-     out of `window_bounds` after accounting for `elided_window_dims` (i.e. we
-     pick `adjusted_window_bounds[k]` where `adjusted_window_bounds` is
-     `window_bounds` with the bounds at indices `elided_window_dims` removed).
+     `output_window_dims`[`k`] for some `k`) then we pick the corresponding
+     bound out of `window_bounds` after accounting for `elided_window_dims`
+     (i.e. we pick `adjusted_window_bounds`[`k`] where `adjusted_window_bounds`
+     is `window_bounds` with the bounds at indices `elided_window_dims`
+     removed).
 
 The operand index `In` corresponding to an output index `Out` is computed as
 follows:
 
   1. Let `G` = { `Out`[`k`] for `k` in `output_gather_dims` }.  Use `G` to slice
-     out vector `S` such that `S`[`i`] = `gather_indices`[`G`, `i`].
-  2. Create an index, `S`<sub>`in`</sub>, into `operand` using `S` by scattering
-     `S` using the `gather_dims_to_operand_dims` map (`S`<sub>`in`</sub> is the
-     starting indices for _operand slice_ mentioned above.).  More precisely:
+     out vector `S` such that `S`[`i`] = `gather_indices`[Combine(`G`, `i`)]
+     where Combine(A, b) inserts b at position `index_vector_dim` into A.
+     Note that this is well defined even if `G` is empty -- if `G` is empty then
+     `S` = `gather_indices`.
+  2. Create an index, `S`<sub>`in`</sub>, into `operand` using `S` by
+     scattering `S` using the `gather_dims_to_operand_dims` map
+     (`S`<sub>`in`</sub> is the starting indices for _operand slice_ mentioned
+     above).  More precisely:
        1. `S`<sub>`in`</sub>[`gather_dims_to_operand_dims`[`k`]] = `S`[`k`] if `k` <
           `gather_dims_to_operand_dims.size`.
        2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
@@ -1134,7 +1151,12 @@ follows:
 `operand.rank` is `6` and `elided_window_dims` is {`0`, `2`} then
 `window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
 
-### Informal Description
+### Informal Description and Examples
+
+`index_vector_dim` is set to `gather_indices.rank` - `1` in all of the
+examples that follow.  More interesting values for `index_vector_dim`
+does not change the operation fundamentally, but makes the visual representation
+more cumbersome.
 
 To get an intuition on how all of the above fits together, let's look at an
 example that gathers 5 slices of shape `[8,6]` from a `[16,11]` tensor.  The
@@ -1269,7 +1291,7 @@ result2 = while (condition, init = result1) {
 ```
 
 Nested tuple shapes are not supported. For an empty tuple shape, the Infeed
-operation is effectively a nop and proceeds without reading any data from the
+operation is effectively a no-op and proceeds without reading any data from the
 Infeed of the device.
 
 > Note: We plan to allow multiple Infeed operations without a total order, in
@@ -1332,7 +1354,7 @@ dimension.
 
 `PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains
 three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and
-`interior_padding`. `edge_padding_low` and `edge_padding_high` specifies the
+`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the
 amount of padding added at the low-end (next to index 0) and the high-end (next
 to the highest index) of each dimension respectively. The amount of edge padding
 can be negative -- the absolute value of negative padding indicates the number
@@ -1341,8 +1363,8 @@ the amount of padding added between any two elements in each dimension. Interior
 padding occurs logically before edge padding, so in the case of negative edge
 padding elements are removed from the interior-padded operand. This operation is
 a no-op if the edge padding pairs are all (0, 0) and the interior padding values
-are all 0. Figure below shows examples of different `edge_padding` and
-`interior_padding` values for a two dimensional array.
+are all 0. The figure below shows examples of different `edge_padding` and
+`interior_padding` values for a two-dimensional array.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
@@ -1399,12 +1421,12 @@ Applies a reduction function to an array.
 | `dimensions`  | `int64` array           | unordered array of dimensions to |
 :               :                         : reduce                           :
 
-Conceptually, this operation reduces one or more dimensions in the input array
-into scalars. The rank of the result array is `rank(operand) - len(dimensions)`.
-`init_value` is the initial value used for every reduction and may also be
-inserted anywhere during computation if the back-end chooses to do so. So in
-most cases `init_value` should be an identity of the reduction function (for
-example, 0 for addition).
+This operation reduces one or more dimensions of the input array into scalars.
+The rank of the returned array is `rank(operand) - len(dimensions)`.
+`init_value` is the initial value used for every reduction and may be inserted
+anywhere during computation by the back-end. In most cases, `init_value` is an
+identity of the reduction function (for example, 0 for addition). The applied
+`computation` is always passed the `init_value` on the left-hand side.
 
 The evaluation order of the reduction function is arbitrary and may be
 non-deterministic. Therefore, the reduction function should not be overly
@@ -1424,8 +1446,7 @@ could be computed as
 
 but there are also many other possibilities, e.g.
 
-`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(13,
-init_value))))`
+`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(init_value, 13))))`
 
 The following is a rough pseudo-code example of how reduction could be
 implemented, using summation as the reduction computation with an initial value
@@ -1543,7 +1564,9 @@ See also
 Applies a reduction function to all elements in each window of the input
 multi-dimensional array, producing an output multi-dimensional array with the
 same number of elements as the number of valid positions of the window. A
-pooling layer can be expressed as a `ReduceWindow`.
+pooling layer can be expressed as a `ReduceWindow`. Similar to
+[`Reduce`](#reduce), the applied `computation` is always passed the `init_value`
+on the left-hand side.
 
 <b> `ReduceWindow(operand, init_value, computation, window_dimensions,
 window_strides, padding)` </b>
diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md
index f57ca3948dd52d6e300f5f498b0e047fa63a3aff..8521d7eacb4a7fec7d187bdd1c4f452b644dc8b2 100644
--- a/tensorflow/docs_src/performance/xla/tfcompile.md
+++ b/tensorflow/docs_src/performance/xla/tfcompile.md
@@ -86,7 +86,7 @@ code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into
 executable code.
 
 ```build
-load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 # Use the tf_library macro to compile your graph into executable code.
 tf_library(
@@ -258,8 +258,8 @@ file.
 
 ```build
 # Example of linking your binary
-# Also see //third_party/tensorflow/compiler/aot/tests/BUILD
-load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+# Also see //tensorflow/compiler/aot/tests/BUILD
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 # The same tf_library call from step 2 above.
 tf_library(
diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
index d19200e80cdfe6620789ddd273647660c10b2a60..67be41b1a68bc72913d859f83ccd74b529350a4c 100644
--- a/tensorflow/docs_src/programmers_guide/datasets.md
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -18,11 +18,11 @@ The `tf.data` API introduces two new abstractions to TensorFlow:
   tensors representing the image data and a label. There are two distinct
   ways to create a dataset:
 
-  * Creating a **source** (e.g. `Dataset.from_tensor_slices()`) constructs a
+    * Creating a **source** (e.g. `Dataset.from_tensor_slices()`) constructs a
     dataset from
     one or more `tf.Tensor` objects.
 
-  * Applying a **transformation** (e.g. `Dataset.batch()`) constructs a dataset
+    * Applying a **transformation** (e.g. `Dataset.batch()`) constructs a dataset
     from one or more `tf.data.Dataset` objects.
 
 * A `tf.data.Iterator` provides the main way to extract elements from a
@@ -327,6 +327,35 @@ same op/node (created by `Iterator.get_next()`). Therefore,  evaluating *any* of
 these tensors will advance the iterator for all components. A typical consumer
 of an iterator will include all components in a single expression.
 
+### Saving iterator state
+
+The @{tf.contrib.data.make_saveable_from_iterator} function creates a
+`SaveableObject` from an iterator, which can be used to save and
+restore the current state of the iterator (and, effectively, the whole input
+pipeline). A saveable object thus created can be added to @{tf.train.Saver}
+variables list or the `tf.GraphKeys.SAVEABLE_OBJECTS` collection for saving and
+restoring in the same manner as a @{tf.Variable}. Refer to
+@{$saved_model$Saving and Restoring} for details on how to save and restore
+variables.
+
+```python
+# Create saveable object from iterator.
+saveable = tf.contrib.data.make_saveable_from_iterator(iterator)
+
+# Save the iterator state by adding it to the saveable objects collection.
+tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)
+saver = tf.train.Saver()
+
+with tf.Session() as sess:
+
+  if should_checkpoint:
+    saver.save(path_to_checkpoint)
+
+# Restore the iterator state.
+with tf.Session() as sess:
+  saver.restore(sess, path_to_checkpoint)
+```
+
 ## Reading input data
 
 ### Consuming NumPy arrays
@@ -511,7 +540,7 @@ batched into a fixed size.
 # to a fixed shape.
 def _parse_function(filename, label):
   image_string = tf.read_file(filename)
-  image_decoded = tf.image.decode_image(image_string)
+  image_decoded = tf.image.decode_jpeg(image_string)
   image_resized = tf.image.resize_images(image_decoded, [28, 28])
   return image_resized, label
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index c8fdae6f60c33776b6d9a8c1a33666ce4ddb1cb2..f7817b06d4c8bd8fe6ff8c3ffae2db856611cb8e 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -4,27 +4,31 @@
 
 [TOC]
 
-TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It
-lets you view the internal structure and states of running TensorFlow graphs
-during training and inference, which is difficult to debug with general-purpose
-debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
-
-> NOTE: TensorFlow debugger uses a
-> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
-> text user interface. On Mac OS X, the `ncurses` library is required and can
-> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
-> isn't as well supported, so a
-> [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
-> be used with tfdbg by installing `pyreadline` with pip.
-> If you use Anaconda3, you can install it with a command
-> such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
-> Unofficial Windows curses packages can be downloaded
-> [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
-> installed using `pip install <your_version>.whl`, however curses on Windows
-> may not work as reliably as curses on Linux or Mac.
-
-This tutorial demonstrates how to use the **tfdbg** command-line interface
-(CLI) to debug the appearance of [`nan`s](https://en.wikipedia.org/wiki/NaN)
+`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal
+structure and states of running TensorFlow graphs during training and inference,
+which is difficult to debug with general-purpose debuggers such as Python's `pdb`
+due to TensorFlow's computation-graph paradigm.
+
+This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on
+how to use the graphical user interface (GUI) of tfdbg, i.e., the
+**TensorBoard Debugger Plugin**, please visit
+[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+
+Note: The TensorFlow debugger uses a
+[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
+user interface. On Mac OS X, the `ncurses` library is required and can be
+installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
+well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
+interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
+use Anaconda3, you can install it with a command such as
+`"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial
+Windows curses packages can be downloaded
+[here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
+installed using `pip install <your_version>.whl`, however curses on Windows may
+not work as reliably as curses on Linux or Mac.
+
+This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance
+of [`nan`s](https://en.wikipedia.org/wiki/NaN)
 and [`inf`s](https://en.wikipedia.org/wiki/Infinity), a frequently-encountered
 type of bug in TensorFlow model development.
 The following example is for users who use the low-level
@@ -150,6 +154,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-n <name_pattern>` | List dumped tensors with names matching given regular-expression pattern. | `lt -n Softmax.*` |
 | | `-t <op_pattern>` | List dumped tensors with op types matching given regular-expression pattern. | `lt -t MatMul` |
 | | `-f <filter_name>` | List only the tensors that pass a registered tensor filter. | `lt -f has_inf_or_nan` |
+| | `-f <filter_name> -fenn <regex>` | List only the tensors that pass a registered tensor filter, excluding nodes with names matching the regular expression. | `lt -f has_inf_or_nan` `-fenn .*Sqrt.*` |
 | | `-s <sort_key>` | Sort the output by given `sort_key`, whose possible values are `timestamp` (default), `dump_size`, `op_type` and `tensor_name`. | `lt -s dump_size` |
 | | `-r` | Sort in reverse order. | `lt -r -s dump_size` |
 | **`pt`** | | **Print value of a dumped tensor.** | |
@@ -195,6 +200,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-n` | Execute through the next `Session.run` without debugging, and drop to CLI right before the run after that. | `run -n` |
 | | `-t <T>` | Execute `Session.run` `T - 1` times without debugging, followed by a run with debugging. Then drop to CLI right after the debugged run. | `run -t 10` |
 | | `-f <filter_name>` | Continue executing `Session.run` until any intermediate tensor triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan` |
+| | `-f <filter_name> -fenn <regex>` | Continue executing `Session.run` until any intermediate tensor whose node names doesn't match the regular expression triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan -fenn .*Sqrt.*` |
 | | `--node_name_filter <pattern>` | Execute the next `Session.run`, watching only nodes with names matching the given regular-expression pattern. | `run --node_name_filter Softmax.*` |
 | | `--op_type_filter <pattern>` | Execute the next `Session.run`, watching only nodes with op types matching the given regular-expression pattern. | `run --op_type_filter Variable.*` |
 | | `--tensor_dtype_filter <pattern>` | Execute the next `Session.run`, dumping only Tensors with data types (`dtype`s) matching the given regular-expression pattern. | `run --tensor_dtype_filter int.*` |
@@ -394,7 +400,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
+diff = tf.losses.softmax_cross_entropy(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
@@ -454,7 +460,7 @@ accuracy_score = classifier.evaluate(x=test_set.data,
 
 
 [debug_tflearn_iris.py](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_tflearn_iris.py),
-based on {$tflearn$tf-learn's iris tutorial}, contains a full example of how to
+based on [tf-learn's iris tutorial](https://www.tensorflow.org/versions/r1.2/get_started/tflearn), contains a full example of how to
 use the tfdbg with `Estimator`s. To run this example, do:
 
 ```none
@@ -741,15 +747,16 @@ There are three possible workarounds or solutions:
    to which tfdbg dumps the debug data. You can use it to let tfdbg dump the
    debug data on a disk with larger free space. For example:
 
-   ``` python
-   # For LocalCLIDebugWrapperSession
-   sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
+```python
+# For LocalCLIDebugWrapperSession
+sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
 
-   # For LocalCLIDebugHook
-   hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
-   ```
+# For LocalCLIDebugHook
+hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
+```
    Make sure that the directory pointed to by dump_root is empty or nonexistent.
-   tfdbg cleans up the dump directories before exiting.
+   `tfdbg` cleans up the dump directories before exiting.
+
 *  Reduce the batch size used during the runs.
 *  Use the filtering options of tfdbg's `run` command to watch only specific
    nodes in the graph. For example:
@@ -806,3 +813,27 @@ sess.run(b)
 
 the constant-folding would not occur and `tfdbg` should show the intermediate
 tensor dumps.
+
+
+**Q**: I am debugging a model that generates unwanted infinities or NaNs. But
+       there are some nodes in my model that are known to generate infinities
+       or NaNs in their output tensors even under completely normal conditions.
+       How can I skip those nodes during my `run -f has_inf_or_nan` actions?
+
+**A**: Use the `--filter_exclude_node_names` (`-fenn` for short) flag. For
+       example, if you known you have a node with name matching the regular
+       expression `.*Sqrt.*` that generates infinities or NaNs regardless
+       of whether the model is behaving correctly, you can exclude the nodes
+       from the infinity/NaN-finding runs with the command
+       `run -f has_inf_or_nan -fenn .*Sqrt.*`.
+
+
+**Q**: Is there a GUI for tfdbg?
+
+**A**: Yes, the **TensorBoard Debugger Plugin** is the GUI of tfdbg.
+       It offers features such as inspection of the computation graph,
+       real-time visualization of tensor values, continuation to tensor
+       and conditional breakpoints, and tying tensors to their
+       graph-construction source code, all in the browser environment.
+       To get started, please visit
+       [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
new file mode 100644
index 0000000000000000000000000000000000000000..595e6be4af78d7d684ddeca0adea59e5a754134d
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -0,0 +1,848 @@
+# Eager Execution
+
+TensorFlow's eager execution is an imperative programming environment that
+evaluates operations immediately, without building graphs: operations return
+concrete values instead of constructing a computational graph to run later. This
+makes it easy to get started with TensorFlow and debug models, and it
+reduces boilerplate as well. To follow along with this guide, run the code
+samples below in an interactive `python` interpreter.
+
+Eager execution is a flexible machine learning platform for research and
+experimentation, providing:
+
+* *An intuitive interface*—Structure your code naturally and use Python data
+  structures. Quickly iterate on small models and small data.
+* *Easier debugging*—Call ops directly to inspect running models and test
+  changes. Use standard Python debugging tools for immediate error reporting.
+* *Natural control flow*—Use Python control flow instead of graph control
+  flow, simplifying the specification of dynamic models.
+
+Eager execution supports most TensorFlow operations and GPU acceleration. For a
+collection of examples running in eager execution, see:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+Note: Some models may experience increased overhead with eager execution
+enabled. Performance improvements are ongoing, but please
+[file a bug](https://github.com/tensorflow/tensorflow/issues) if you find a
+problem and share your benchmarks.
+
+## Setup and basic usage
+
+Upgrade to the latest version of TensorFlow:
+
+```
+$ pip install --upgrade tensorflow
+```
+
+To start eager execution, add `tf.enable_eager_execution()` to the beginning of
+the program or console session. Do not add this operation to other modules that
+the program calls.
+
+```py
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+tf.enable_eager_execution()
+```
+
+Now you can run TensorFlow operations and the results will return immediately:
+
+```py
+tf.executing_eagerly()        # => True
+
+x = [[2.]]
+m = tf.matmul(x, x)
+print("hello, {}".format(m))  # => "hello, [[4.]]"
+```
+
+Enabling eager execution changes how TensorFlow operations behave—now they
+immediately evaluate and return their values to Python. `tf.Tensor` objects
+reference concrete values instead of symbolic handles to nodes in a computational
+graph. Since there isn't a computational graph to build and run later in a
+session, it's easy to inspect results using `print()` or a debugger. Evaluating,
+printing, and checking tensor values does not break the flow for computing
+gradients.
+
+Eager execution works nicely with [NumPy](http://www.numpy.org/). NumPy
+operations accept `tf.Tensor` arguments. TensorFlow
+[math operations](https://www.tensorflow.org/api_guides/python/math_ops) convert
+Python objects and NumPy arrays to `tf.Tensor` objects. The
+`tf.Tensor.numpy` method returns the object's value as a NumPy `ndarray`.
+
+```py
+a = tf.constant([[1, 2],
+                 [3, 4]])
+print(a)
+# => tf.Tensor([[1 2]
+#               [3 4]], shape=(2, 2), dtype=int32)
+
+# Broadcasting support
+b = tf.add(a, 1)
+print(b)
+# => tf.Tensor([[2 3]
+#               [4 5]], shape=(2, 2), dtype=int32)
+
+# Operator overloading is supported
+print(a * b)
+# => tf.Tensor([[ 2  6]
+#               [12 20]], shape=(2, 2), dtype=int32)
+
+# Use NumPy values
+import numpy as np
+
+c = np.multiply(a, b)
+print(c)
+# => [[ 2  6]
+#     [12 20]]
+
+# Obtain numpy value from a tensor:
+print(a.numpy())
+# => [[1 2]
+#     [3 4]]
+```
+
+The `tf.contrib.eager` module contains symbols available to both eager and graph execution
+environments and is useful for writing code to [work with graphs](#work_with_graphs):
+
+```py
+tfe = tf.contrib.eager
+```
+
+## Dynamic control flow
+
+A major benefit of eager execution is that all the functionality of the host
+language is available while your model is executing. So, for example,
+it is easy to write [fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz):
+
+```py
+def fizzbuzz(max_num):
+  counter = tf.constant(0)
+  for num in range(max_num):
+    num = tf.constant(num)
+    if num % 3 == 0 and num % 5 == 0:
+      print('FizzBuzz')
+    elif num % 3 == 0:
+      print('Fizz')
+    elif num % 5 == 0:
+      print('Buzz')
+    else:
+      print(num)
+    counter += 1
+  return counter
+```
+
+This has conditionals that depend on tensor values and it prints these values
+at runtime.
+
+## Build a model
+
+Many machine learning models are represented by composing layers. When
+using TensorFlow with eager execution you can either write your own layers or
+use a layer provided in the `tf.keras.layers` package.
+
+While you can use any Python object to represent a layer,
+TensorFlow has `tf.keras.layers.Layer` as a convenient base class. Inherit from
+it to implement your own layer:
+
+```py
+class MySimpleLayer(tf.keras.layers.Layer):
+  def __init__(self, output_units):
+    self.output_units = output_units
+
+  def build(self, input):
+    # The build method gets called the first time your layer is used.
+    # Creating variables on build() allows you to make their shape depend
+    # on the input shape and hence remove the need for the user to specify
+    # full shapes. It is possible to create variables during __init__() if
+    # you already know their full shapes.
+    self.kernel = self.add_variable(
+      "kernel", [input.shape[-1], self.output_units])
+
+  def call(self, input):
+    # Override call() instead of __call__ so we can perform some bookkeeping.
+    return tf.matmul(input, self.kernel)
+```
+
+Use `tf.keras.layers.Dense` layer instead  of `MySimpleLayer` above as it has
+a superset of its functionality (it can also add a bias).
+
+When composing layers into models you can use `tf.keras.Sequential` to represent
+models which are a linear stack of layers. It is easy to use for basic models:
+
+```py
+model = tf.keras.Sequential([
+  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
+  tf.keras.layers.Dense(10)
+])
+```
+
+Alternatively, organize models in classes by inheriting from `tf.keras.Model`.
+This is a container for layers that is a layer itself, allowing `tf.keras.Model`
+objects to contain other `tf.keras.Model` objects.
+
+```py
+class MNISTModel(tf.keras.Model):
+  def __init__(self):
+    super(MNISTModel, self).__init__()
+    self.dense1 = tf.keras.layers.Dense(units=10)
+    self.dense2 = tf.keras.layers.Dense(units=10)
+
+  def call(self, input):
+    """Run the model."""
+    result = self.dense1(input)
+    result = self.dense2(result)
+    result = self.dense2(result)  # reuse variables from dense2 layer
+    return result
+
+model = MNISTModel()
+```
+
+It's not required to set an input shape for the `tf.keras.Model` class since
+the parameters are set the first time input is passed to the layer.
+
+`tf.keras.layers` classes create and contain their own model variables that
+are tied to the lifetime of their layer objects. To share layer variables, share
+their objects.
+
+
+## Eager training
+
+### Computing gradients
+
+[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
+is useful for implementing machine learning algorithms such as
+[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
+neural networks. During eager execution, use `tf.GradientTape` to trace
+operations for computing gradients later.
+
+`tf.GradientTape` is an opt-in feature to provide maximal performance when
+not tracing. Since different operations can occur during each call, all
+forward-pass operations get recorded to a "tape". To compute the gradient, play
+the tape backwards and then discard. A particular `tf.GradientTape` can only
+compute one gradient; subsequent calls throw a runtime error.
+
+```py
+w = tfe.Variable([[1.0]])
+with tf.GradientTape() as tape:
+  loss = w * w
+
+grad = tape.gradient(loss, [w])
+print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]
+```
+
+Here's an example of `tf.GradientTape` that records forward-pass operations
+to train a simple model:
+
+```py
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 1000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+def prediction(input, weight, bias):
+  return input * weight + bias
+
+# A loss function using mean-squared error
+def loss(weights, biases):
+  error = prediction(training_inputs, weights, biases) - training_outputs
+  return tf.reduce_mean(tf.square(error))
+
+# Return the derivative of loss with respect to weight and bias
+def grad(weights, biases):
+  with tf.GradientTape() as tape:
+    loss_value = loss(weights, biases)
+  return tape.gradient(loss_value, [weights, biases])
+
+train_steps = 200
+learning_rate = 0.01
+# Start with arbitrary values for W and B on the same batch of data
+W = tfe.Variable(5.)
+B = tfe.Variable(10.)
+
+print("Initial loss: {:.3f}".format(loss(W, B)))
+
+for i in range(train_steps):
+  dW, dB = grad(W, B)
+  W.assign_sub(dW * learning_rate)
+  B.assign_sub(dB * learning_rate)
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B)))
+
+print("Final loss: {:.3f}".format(loss(W, B)))
+print("W = {}, B = {}".format(W.numpy(), B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 71.204
+Loss at step 000: 68.333
+Loss at step 020: 30.222
+Loss at step 040: 13.691
+Loss at step 060: 6.508
+Loss at step 080: 3.382
+Loss at step 100: 2.018
+Loss at step 120: 1.422
+Loss at step 140: 1.161
+Loss at step 160: 1.046
+Loss at step 180: 0.996
+Final loss: 0.974
+W = 3.01582956314, B = 2.1191945076
+```
+
+Replay the `tf.GradientTape` to compute the gradients and apply them in a
+training loop. This is demonstrated in an excerpt from the
+[mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py)
+example:
+
+```py
+dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
+                                              data.train.labels))
+...
+for (batch, (images, labels)) in enumerate(dataset):
+  ...
+  with tf.GradientTape() as tape:
+    logits = model(images, training=True)
+    loss_value = loss(logits, labels)
+  ...
+  grads = tape.gradient(loss_value, model.variables)
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+```
+
+
+The following example creates a multi-layer model that classifies the standard
+[MNIST handwritten digits](https://www.tensorflow.org/tutorials/layers). It
+demonstrates the optimizer and layer APIs to build trainable graphs in an eager
+execution environment.
+
+### Train a model
+
+Even without training, call the model and inspect the output in eager execution:
+
+```py
+# Create a tensor representing a blank image
+batch = tf.zeros([1, 1, 784])
+print(batch.shape)  # => (1, 1, 784)
+
+result = model(batch)
+# => tf.Tensor([[[ 0.  0., ..., 0.]]], shape=(1, 1, 10), dtype=float32)
+```
+
+This example uses the
+[dataset.py module](https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py)
+from the
+[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist);
+download this file to your local directory. Run the following to download the
+MNIST data files to your working directory and prepare a `tf.data.Dataset`
+for training:
+
+```py
+import dataset  # download dataset.py file
+dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32)
+```
+
+To train a model, define a loss function to optimize and then calculate
+gradients. Use an optimizer to update the variables:
+
+```py
+def loss(model, x, y):
+  prediction = model(x)
+  return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction)
+
+def grad(model, inputs, targets):
+  with tf.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, model.variables)
+
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+
+x, y = iter(dataset_train).next()
+print("Initial loss: {:.3f}".format(loss(model, x, y)))
+
+# Training loop
+for (i, (x, y)) in enumerate(dataset_train):
+  # Calculate derivatives of the input function with respect to its parameters.
+  grads = grad(model, x, y)
+  # Apply the gradient to the model
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 200 == 0:
+    print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y)))
+
+print("Final loss: {:.3f}".format(loss(model, x, y)))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 2.674
+Loss at step 0000: 2.593
+Loss at step 0200: 2.143
+Loss at step 0400: 2.009
+Loss at step 0600: 2.103
+Loss at step 0800: 1.621
+Loss at step 1000: 1.695
+...
+Loss at step 6600: 0.602
+Loss at step 6800: 0.557
+Loss at step 7000: 0.499
+Loss at step 7200: 0.744
+Loss at step 7400: 0.681
+Final loss: 0.670
+```
+
+And for faster training, move the computation to a GPU:
+
+```py
+with tf.device("/gpu:0"):
+  for (i, (x, y)) in enumerate(dataset_train):
+    # minimize() is equivalent to the grad() and apply_gradients() calls.
+    optimizer.minimize(lambda: loss(model, x, y),
+                       global_step=tf.train.get_or_create_global_step())
+```
+
+### Variables and optimizers
+
+`tfe.Variable` objects store mutable `tf.Tensor` values accessed during
+training to make automatic differentiation easier. The parameters of a model can
+be encapsulated in classes as variables.
+
+Better encapsulate model parameters by using `tfe.Variable` with
+`tf.GradientTape`. For example, the automatic differentiation example above
+can be rewritten:
+
+```py
+class Model(tf.keras.Model):
+  def __init__(self):
+    super(Model, self).__init__()
+    self.W = tfe.Variable(5., name='weight')
+    self.B = tfe.Variable(10., name='bias')
+  def predict(self, inputs):
+    return inputs * self.W + self.B
+
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 2000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+# The loss function to be optimized
+def loss(model, inputs, targets):
+  error = model.predict(inputs) - targets
+  return tf.reduce_mean(tf.square(error))
+
+def grad(model, inputs, targets):
+  with tf.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, [model.W, model.B])
+
+# Define:
+# 1. A model.
+# 2. Derivatives of a loss function with respect to model parameters.
+# 3. A strategy for updating the variables based on the derivatives.
+model = Model()
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+
+print("Initial loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+
+# Training loop
+for i in range(300):
+  grads = grad(model, training_inputs, training_outputs)
+  optimizer.apply_gradients(zip(grads, [model.W, model.B]),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs)))
+
+print("Final loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 69.066
+Loss at step 000: 66.368
+Loss at step 020: 30.107
+Loss at step 040: 13.959
+Loss at step 060: 6.769
+Loss at step 080: 3.567
+Loss at step 100: 2.141
+Loss at step 120: 1.506
+Loss at step 140: 1.223
+Loss at step 160: 1.097
+Loss at step 180: 1.041
+Loss at step 200: 1.016
+Loss at step 220: 1.005
+Loss at step 240: 1.000
+Loss at step 260: 0.998
+Loss at step 280: 0.997
+Final loss: 0.996
+W = 2.99431324005, B = 2.02129220963
+```
+
+## Use objects for state during eager execution
+
+With graph execution, program state (such as the variables) is stored in global
+collections and their lifetime is managed by the `tf.Session` object. In
+contrast, during eager execution the lifetime of state objects is determined by
+the lifetime of their corresponding Python object.
+
+### Variables are objects
+
+During eager execution, variables persist until the last reference to the object
+is removed, and is then deleted.
+
+```py
+with tf.device("gpu:0"):
+  v = tfe.Variable(tf.random_normal([1000, 1000]))
+  v = None  # v no longer takes up GPU memory
+```
+
+### Object-based saving
+
+`tfe.Checkpoint` can save and restore `tfe.Variable`s to and from
+checkpoints:
+
+```py
+x = tfe.Variable(10.)
+
+checkpoint = tfe.Checkpoint(x=x)  # save as "x"
+
+x.assign(2.)   # Assign a new value to the variables and save.
+save_path = checkpoint.save('./ckpt/')
+
+x.assign(11.)  # Change the variable after saving.
+
+# Restore values from the checkpoint
+checkpoint.restore(save_path)
+
+print(x)  # => 2.0
+```
+
+To save and load models, `tfe.Checkpoint` stores the internal state of objects,
+without requiring hidden variables. To record the state of a `model`,
+an `optimizer`, and a global step, pass them to a `tfe.Checkpoint`:
+
+```py
+model = MyModel()
+optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+checkpoint_dir = ‘/path/to/model_dir’
+checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
+root = tfe.Checkpoint(optimizer=optimizer,
+                      model=model,
+                      optimizer_step=tf.train.get_or_create_global_step())
+
+root.save(file_prefix=checkpoint_prefix)
+# or
+root.restore(tf.train.latest_checkpoint(checkpoint_dir))
+```
+
+### Object-oriented metrics
+
+`tfe.metrics` are stored as objects. Update a metric by passing the new data to
+the callable, and retrieve the result using the `tfe.metrics.result` method,
+for example:
+
+```py
+m = tfe.metrics.Mean("loss")
+m(0)
+m(5)
+m.result()  # => 2.5
+m([8, 9])
+m.result()  # => 5.5
+```
+
+#### Summaries and TensorBoard
+
+@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+understanding, debugging and optimizing the model training process. It uses
+summary events that are written while executing the program.
+
+`tf.contrib.summary` is compatible with both eager and graph execution
+environments. Summary operations, such as `tf.contrib.summary.scalar`, are
+inserted during model construction. For example, to record summaries once every
+100 global steps:
+
+```py
+writer = tf.contrib.summary.create_file_writer(logdir)
+global_step=tf.train.get_or_create_global_step()  # return global step var
+
+writer.set_as_default()
+
+for _ in range(iterations):
+  global_step.assign_add(1)
+  # Must include a record_summaries method
+  with tf.contrib.summary.record_summaries_every_n_global_steps(100):
+    # your model code goes here
+    tf.contrib.summary.scalar('loss', loss)
+     ...
+```
+
+## Advanced automatic differentiation topics
+
+### Dynamic models
+
+`tf.GradientTape` can also be used in dynamic models. This example for a
+[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
+algorithm looks like normal NumPy code, except there are gradients and is
+differentiable, despite the complex control flow:
+
+```py
+def line_search_step(fn, init_x, rate=1.0):
+  with tf.GradientTape() as tape:
+    # Variables are automatically recorded, but manually watch a tensor
+    tape.watch(init_x)
+    value = fn(init_x)
+  grad, = tape.gradient(value, [init_x])
+  grad_norm = tf.reduce_sum(grad * grad)
+  init_value = value
+  while value > init_value - rate * grad_norm:
+    x = init_x - rate * grad
+    value = fn(x)
+    rate /= 2.0
+  return x, value
+```
+
+### Additional functions to compute gradients
+
+`tf.GradientTape` is a powerful interface for computing gradients, but there
+is another [Autograd](https://github.com/HIPS/autograd)-style API available for
+automatic differentiation. These functions are useful if writing math code with
+only tensors and gradient functions, and without `tfe.Variables`:
+
+* `tfe.gradients_function` —Returns a function that computes the derivatives
+  of its input function parameter with respect to its arguments. The input
+  function parameter must return a scalar value. When the returned function is
+  invoked, it returns a list of `tf.Tensor` objects: one element for each
+  argument of the input function. Since anything of interest must be passed as a
+  function parameter, this becomes unwieldy if there's a dependency on many
+  trainable parameters.
+* `tfe.value_and_gradients_function` —Similar to
+  `tfe.gradients_function`, but when the returned function is invoked, it
+  returns the value from the input function in addition to the list of
+  derivatives of the input function with respect to its arguments.
+
+In the following example, `tfe.gradients_function` takes the `square`
+function as an argument and returns a function that computes the partial
+derivatives of `square` with respect to its inputs. To calculate the derivative
+of `square` at `3`, `grad(3.0)` returns `6`.
+
+```py
+def square(x):
+  return tf.multiply(x, x)
+
+grad = tfe.gradients_function(square)
+
+square(3.)  # => 9.0
+grad(3.)    # => [6.0]
+
+# The second-order derivative of square:
+gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
+gradgrad(3.)  # => [2.0]
+
+# The third-order derivative is None:
+gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
+gradgradgrad(3.)  # => [None]
+
+
+# With flow control:
+def abs(x):
+  return x if x > 0. else -x
+
+grad = tfe.gradients_function(abs)
+
+grad(3.)   # => [1.0]
+grad(-3.)  # => [-1.0]
+```
+
+### Custom gradients
+
+Custom gradients are an easy way to override gradients in eager and graph
+execution. Within the forward function, define the gradient with respect to the
+inputs, outputs, or intermediate results. For example, here's an easy way to clip
+the norm of the gradients in the backward pass:
+
+```py
+@tf.custom_gradient
+def clip_gradient_by_norm(x, norm):
+  y = tf.identity(x)
+  def grad_fn(dresult):
+    return [tf.clip_by_norm(dresult, norm), None]
+  return y, grad_fn
+```
+
+Custom gradients are commonly used to provide a numerically stable gradient for a
+sequence of operations:
+
+```py
+def log1pexp(x):
+  return tf.log(1 + tf.exp(x))
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# The gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# However, x = 100 fails because of numerical instability.
+grad_log1pexp(100.)  # => [nan]
+```
+
+Here, the `log1pexp` function can be analytically simplified with a custom
+gradient. The implementation below reuses the value for `tf.exp(x)` that is
+computed during the forward pass—making it more efficient by eliminating
+redundant calculations:
+
+```py
+@tf.custom_gradient
+def log1pexp(x):
+  e = tf.exp(x)
+  def grad(dy):
+    return dy * (1 - 1 / (1 + e))
+  return tf.log(1 + e), grad
+
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# As before, the gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# And the gradient computation also works at x = 100.
+grad_log1pexp(100.)  # => [1.0]
+```
+
+## Performance
+
+Computation is automatically offloaded to GPUs during eager execution. If you
+want control over where a computation runs you can enclose it in a
+`tf.device('/gpu:0')` block (or the CPU equivalent):
+
+```py
+import time
+
+def measure(x, steps):
+  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
+  tf.matmul(x, x)
+  start = time.time()
+  for i in range(steps):
+    x = tf.matmul(x, x)
+    _ = x.numpy()  # Make sure to execute op and not just enqueue it
+  end = time.time()
+  return end - start
+
+shape = (1000, 1000)
+steps = 200
+print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))
+
+# Run on CPU:
+with tf.device("/cpu:0"):
+  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+
+# Run on GPU, if available:
+if tfe.num_gpus() > 0:
+  with tf.device("/gpu:0"):
+    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+else:
+  print("GPU: not found")
+```
+
+Output (exact numbers depend on hardware):
+
+```
+Time to multiply a (1000, 1000) matrix by itself 200 times:
+CPU: 4.614904403686523 secs
+GPU: 0.5581181049346924 secs
+```
+
+A `tf.Tensor` object can be copied to a different device to execute its
+operations:
+
+```py
+x = tf.random_normal([10, 10])
+
+x_gpu0 = x.gpu()
+x_cpu = x.cpu()
+
+_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
+_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
+
+if tfe.num_gpus() > 1:
+  x_gpu1 = x.gpu(1)
+  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
+```
+
+### Benchmarks
+
+For compute-heavy models, such as
+[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50)
+training on a GPU, eager execution performance is comparable to graph execution.
+But this gap grows larger for models with less computation and there is work to
+be done for optimizing hot code paths for models with lots of small operations.
+
+
+## Work with graphs
+
+While eager execution makes development and debugging more interactive,
+TensorFlow graph execution has advantages for distributed training, performance
+optimizations, and production deployment. However, writing graph code can feel
+different than writing regular Python code and more difficult to debug.
+
+For building and training graph-constructed models, the Python program first
+builds a graph representing the computation, then invokes `Session.run` to send
+the graph for execution on the C++-based runtime.  This provides:
+
+* Automatic differentiation using static autodiff.
+* Simple deployment to a platform independent server.
+* Graph-based optimizations (common subexpression elimination, constant-folding, etc.).
+* Compilation and kernel fusion.
+* Automatic distribution and replication (placing nodes on the distributed system).
+
+Deploying code written for eager execution is more difficult: either generate a
+graph from the model, or run the Python runtime and code directly on the server.
+
+### Write compatible code
+
+The same code written for eager execution will also build a graph during graph
+execution. Do this by simply running the same code in a new Python session where
+eager execution is not enabled.
+
+Most TensorFlow operations work during eager execution, but there are some things
+to keep in mind:
+
+* Use `tf.data` for input processing instead of queues. It's faster and easier.
+* Use object-oriented layer APIs—like `tf.keras.layers` and
+  `tf.keras.Model`—since they have explicit storage for variables.
+* Most model code works the same during eager and graph execution, but there are
+  exceptions. (For example, dynamic models using Python control flow to change the
+  computation based on inputs.)
+* Once eager execution is enabled with `tf.enable_eager_execution`, it
+  cannot be turned off. Start a new Python session to return to graph execution.
+
+It's best to write code for both eager execution *and* graph execution. This
+gives you eager's interactive experimentation and debuggability with the
+distributed performance benefits of graph execution.
+
+Write, debug, and iterate in eager execution, then import the model graph for
+production deployment. Use `tfe.Checkpoint` to save and restore model
+variables, this allows movement between eager and graph execution environments.
+See the examples in:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+### Use eager execution in a graph environment
+
+Selectively enable eager execution in a TensorFlow graph environment using
+`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not*
+been called.
+
+```py
+def my_py_func(x):
+  x = tf.matmul(x, x)  # You can use tf ops
+  print(x)  # but it's eager!
+  return x
+
+with tf.Session() as sess:
+  x = tf.placeholder(dtype=tf.float32)
+  # Call eager function in graph!
+  pf = tfe.py_func(my_py_func, [x], tf.float32)
+  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]
+```
diff --git a/tensorflow/docs_src/programmers_guide/embedding.md b/tensorflow/docs_src/programmers_guide/embedding.md
index e8027fc12b368ddfbc51cc47441478901d7caec7..d5703e07375b1f68f4e22476288f1ed57d340c5b 100644
--- a/tensorflow/docs_src/programmers_guide/embedding.md
+++ b/tensorflow/docs_src/programmers_guide/embedding.md
@@ -7,6 +7,9 @@ with the TensorBoard Embedding Projector
 newcomers to machine learning or TensorFlow, and the Embedding Projector how-to
 is for users at all levels.
 
+An alternative tutorial on these concepts is available in the
+[Embeddings section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture).
+
 [TOC]
 
 An **embedding** is a mapping from discrete objects, such as words, to vectors
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 70931f2862de98cb1e934f85919d558a3b36304a..51c1a1e032baae7eff334da785fc5ffa2438e0ca 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -121,7 +121,7 @@ dimensions:
   devices, which makes it possible to speed up
   @{$deep_cnn$CIFAR-10 training using multiple GPUs}.
 * The Session API allows multiple concurrent steps (i.e. calls to
-  @{tf.Session.run} in parallel. This
+  @{tf.Session.run} in parallel). This
   enables the runtime to get higher throughput, if a single step does not use
   all of the resources in your computer.
 
@@ -159,8 +159,7 @@ available. These operations allow you to build sophisticated
 @{$reading_data$input pipelines}, at the cost of making the
 TensorFlow computation somewhat more complicated. See the how-to documentation
 for
-@{$reading_data#creating-threads-to-prefetch-using-queuerunner-objects$using
-`QueueRunner` objects to drive queues and readers}
+@{$reading_data#creating_threads_to_prefetch_using_queuerunner_objects$using `QueueRunner` objects to drive queues and readers}
 for more information on how to use them.
 
 ## Variables
@@ -273,7 +272,7 @@ Prefer predefined TensorFlow operations such as @{tf.decode_raw},
 
 If your data is not easily parsable with the built-in TensorFlow operations,
 consider converting it, offline, to a format that is easily parsable, such
-as ${tf.python_io.TFRecordWriter$`TFRecord`} format.
+as @{tf.python_io.TFRecordWriter$`TFRecord`} format.
 
 The more efficient method to customize the parsing behavior is to
 @{$adding_an_op$add a new op written in C++} that parses your
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 9049a5a9f3d44e255188c6c41cdb12a619464379..f0dd8def17fd6dfed241167a5ebb5be678152c16 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -96,7 +96,7 @@ to all API functions in the same context.  For example:
   (See @{$programmers_guide/variables} for more information about variables.)
 
 * Calling @{tf.train.Optimizer.minimize} will add operations and tensors to the
-  default graph that calculate gradients, and return a @{tf.Operation} that,
+  default graph that calculates gradients, and return a @{tf.Operation} that,
   when run, will apply those gradients to a set of variables.
 
 Most programs rely solely on the default graph. However,
@@ -210,9 +210,8 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-
-If you are deploying TensorFlow in a @{$deploy/distributed$typical distributed
-configuration}, you might specify the job name and task ID to place variables on
+If you are deploying TensorFlow in a @{$distributed$typical distributed configuration},
+you might specify the job name and task ID to place variables on
 a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
 
@@ -336,20 +335,20 @@ described below.
   controls the behavior of the session. For example, some of the configuration
   options include:
 
-  * `allow_soft_placement`. Set this to `True` to enable a "soft" device
+    * `allow_soft_placement`. Set this to `True` to enable a "soft" device
     placement algorithm, which ignores @{tf.device} annotations that attempt
     to place CPU-only operations on a GPU device, and places them on the CPU
     instead.
 
-  * `cluster_def`. When using distributed TensorFlow, this option allows you
+    * `cluster_def`. When using distributed TensorFlow, this option allows you
     to specify what machines to use in the computation, and provide a mapping
     between job names, task indices, and network addresses. See
     @{tf.train.ClusterSpec.as_cluster_def} for details.
 
-  * `graph_options.optimizer_options`. Provides control over the optimizations
+    * `graph_options.optimizer_options`. Provides control over the optimizations
     that TensorFlow performs on your graph before executing it.
 
-  * `gpu_options.allow_growth`. Set this to `True` to change the GPU memory
+    * `gpu_options.allow_growth`. Set this to `True` to change the GPU memory
     allocator so that it gradually increases the amount of memory allocated,
     rather than allocating most of the memory at startup.
 
@@ -363,7 +362,7 @@ operations that are needed to compute the result.
 
 @{tf.Session.run} requires you to specify a list of **fetches**, which determine
 the return values, and may be a @{tf.Operation}, a @{tf.Tensor}, or
-a [tensor-like type](#tensor-like-objects) such as @{tf.Variable}. These fetches
+a [tensor-like type](#tensor-like_objects) such as @{tf.Variable}. These fetches
 determine what **subgraph** of the overall @{tf.Graph} must be executed to
 produce the result: this is the subgraph that contains all operations named in
 the fetch list, plus all operations whose outputs are used to compute the value
@@ -506,7 +505,7 @@ multiple graphs in the same process.
 As noted above, TensorFlow provides a "default graph" that is implicitly passed
 to all API functions in the same context. For many applications, a single graph
 is sufficient. However, TensorFlow also provides methods for manipulating
-the default graph, which can be useful in more advanced used cases. For example:
+the default graph, which can be useful in more advanced use cases. For example:
 
 * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
   operation in a single graph must have a unique name. TensorFlow will
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 7a5e90081d9145ca934929f0af11f2a40cb2dcae..648d001bd3535fe3dcc460c9ebdb6e6a997dc332 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,6 +5,7 @@ works. The units are as follows:
 
 ## High Level APIs
 
+  * @{$programmers_guide/eager}, which is the easiest way to use TensorFlow.
   * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
   * @{$programmers_guide/datasets}, which explains how to
@@ -30,8 +31,12 @@ works. The units are as follows:
     can still be helpful.
   * @{$programmers_guide/saved_model}, which
     explains how to save and restore variables and models.
+
+## Accelerators
+
   * @{$using_gpu} explains how TensorFlow assigns operations to
     devices and how you can change the arrangement manually.
+  * @{$using_tpu} explains how to modify `Estimator` programs to run on a TPU.
 
 
 ## ML Concepts
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 3fe4cb2ddaee40d9d6c6470bee171dedb27ad890..7ac63bf2e019fc3b6aa7ab1b3e6422a97858d8c6 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,8 +1,9 @@
 index.md
 
 ### High Level APIs
-estimators.md
+eager.md
 datasets.md
+estimators.md
 
 ### Low Level APIs
 low_level_intro.md
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index f27a658342b8d33407e1c6ed5799a10c2305a74c..c6ef87c54a3bc37dbfc0553232a8e3d30f8ee2f6 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -1,35 +1,33 @@
-# Saving and Restoring
+# Save and Restore
 
-This document explains how to save and restore
-@{$variables$variables} and models.
+The @{tf.train.Saver} class provides methods to save and restore models. The
+@{tf.saved_model.simple_save} function is an easy way to build a
+@{tf.saved_model$saved model} suitable for serving.
+[Estimators](@{$programmers_guide/estimators}) automatically save and restore
+variables in the `model_dir`.
 
+## Save and restore variables
 
-## Saving and restoring variables
+TensorFlow @{$variables} are the best way to represent shared, persistent state
+manipulated by your program. The `tf.train.Saver` constructor adds `save` and
+`restore` ops to the graph for all, or a specified list, of the variables in the
+graph.  The `Saver` object provides methods to run these ops, specifying paths
+for the checkpoint files to write to or read from.
 
-A TensorFlow variable provides the best way to represent shared, persistent
-state manipulated by your program. (See @{$variables$Variables} for details.)
-This section explains how to save and restore variables.
-Note that Estimators automatically saves and restores variables
-(in the `model_dir`).
-
-The `tf.train.Saver` class provides methods for saving and restoring models.
-The `tf.train.Saver` constructor adds `save` and `restore` ops to the graph
-for all, or a specified list, of the variables in the graph.  The `Saver`
-object provides methods to run these ops, specifying paths for the checkpoint
-files to write to or read from.
-
-The saver will restore all variables already defined in your model. If you're
+`Saver` restores all variables already defined in your model. If you're
 loading a model without knowing how to build its graph (for example, if you're
 writing a generic program to load models), then read the
 [Overview of saving and restoring models](#models) section
 later in this document.
 
-TensorFlow saves variables in binary **checkpoint files** that,
-roughly speaking, map variable names to tensor values.
+TensorFlow saves variables in binary *checkpoint files* that map variable
+names to tensor values.
 
+Caution: TensorFlow model files are code. Be careful with untrusted code.
+See [Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md)
+for details.
 
-
-### Saving variables
+### Save variables
 
 Create a `Saver` with `tf.train.Saver()` to manage all variables in the
 model. For example, the following snippet demonstrates how to call the
@@ -61,9 +59,7 @@ with tf.Session() as sess:
   print("Model saved in path: %s" % save_path)
 ```
 
-
-
-### Restoring variables
+### Restore variables
 
 The `tf.train.Saver` object not only saves variables to checkpoint files, it
 also restores variables. Note that when you restore variables you do not have
@@ -92,14 +88,11 @@ with tf.Session() as sess:
   print("v2 : %s" % v2.eval())
 ```
 
-Notes:
-
-*  There is not a physical file called "/tmp/model.ckpt". It is the **prefix**
-   of filenames created for the checkpoint. Users only interact with the
-   prefix instead of physical checkpoint files.
-
+Note: There is not a physical file called `/tmp/model.ckpt`. It is the *prefix* of
+filenames created for the checkpoint. Users only interact with the prefix
+instead of physical checkpoint files.
 
-### Choosing which variables to save and restore
+### Choose variables to save and restore
 
 If you do not pass any arguments to `tf.train.Saver()`, the saver handles all
 variables in the graph.  Each variable is saved under the name that was passed
@@ -198,29 +191,42 @@ chkp.print_tensors_in_checkpoint_file("/tmp/model.ckpt", tensor_name='v2', all_t
 
 
 <a name="models"></a>
-## Overview of saving and restoring models
+## Save and restore models
 
-When you want to save and load variables, the graph, and the
-graph's metadata--basically, when you want to save or restore
-your model--we recommend using SavedModel.
-**SavedModel** is a language-neutral, recoverable, hermetic
-serialization format.  SavedModel enables higher-level systems
-and tools to produce, consume, and transform TensorFlow models.
-TensorFlow provides several mechanisms for interacting with
-SavedModel, including tf.saved_model APIs, Estimator APIs and a CLI.
+Use `SavedModel` to save and load your model—variables, the graph, and the
+graph's metadata. This is a language-neutral, recoverable, hermetic
+serialization format that enables higher-level systems and tools to produce,
+consume, and transform TensorFlow models. TensorFlow provides several ways to
+interact with `SavedModel`, including the @{tf.saved_model} APIs,
+@{tf.estimator.Estimator}, and a command-line interface.
 
 
-## APIs to build and load a SavedModel
+## Build and load a SavedModel
 
-This section focuses on the APIs for building and loading a SavedModel,
-particularly when using lower-level TensorFlow APIs.
+### Simple save
 
+The easiest way to create a `SavedModel` is to use the @{tf.saved_model.simple_save}
+function:
 
-### Building a SavedModel
+```python
+simple_save(session,
+            export_dir,
+            inputs={"x": x, "y": y},
+            outputs={"z": z})
+```
+
+This configures the `SavedModel` so it can be loaded by
+[TensorFlow serving](/serving/serving_basic) and supports the
+[Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
+To access the classify, regress, or multi-inference APIs, use the manual
+`SavedModel` builder APIs or an @{tf.estimator.Estimator}.
+
+### Manually build a SavedModel
 
-We provide a Python implementation of the SavedModel
-@{tf.saved_model.builder$builder}.
-The `SavedModelBuilder` class provides functionality to
+If your use case isn't covered by @{tf.saved_model.simple_save}, use the manual
+@{tf.saved_model.builder$builder APIs} to create a `SavedModel`.
+
+The @{tf.saved_model.builder.SavedModelBuilder} class provides functionality to
 save multiple `MetaGraphDef`s.  A **MetaGraph** is a dataflow graph, plus
 its associated variables, assets, and signatures.  A **`MetaGraphDef`**
 is the protocol buffer representation of a MetaGraph.  A **signature** is
@@ -250,16 +256,51 @@ with tf.Session(graph=tf.Graph()) as sess:
   builder.add_meta_graph_and_variables(sess,
                                        [tag_constants.TRAINING],
                                        signature_def_map=foo_signatures,
-                                       assets_collection=foo_assets)
+                                       assets_collection=foo_assets,
+                                       strip_default_attrs=True)
 ...
 # Add a second MetaGraphDef for inference.
 with tf.Session(graph=tf.Graph()) as sess:
   ...
-  builder.add_meta_graph([tag_constants.SERVING])
+  builder.add_meta_graph([tag_constants.SERVING], strip_default_attrs=True)
 ...
 builder.save()
 ```
 
+<a name="forward_compatibility"></a>
+#### Forward compatibility via `strip_default_attrs=True`
+
+Following the guidance below gives you forward compatibility only if the set of
+Ops has not changed.
+
+The @{tf.saved_model.builder.SavedModelBuilder$`SavedModelBuilder`} class allows
+users to control whether default-valued attributes must be stripped from the
+@{$extend/tool_developers#nodes$`NodeDefs`}
+while adding a meta graph to the SavedModel bundle. Both
+@{tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables$`SavedModelBuilder.add_meta_graph_and_variables`}
+and @{tf.saved_model.builder.SavedModelBuilder.add_meta_graph$`SavedModelBuilder.add_meta_graph`}
+methods accept a Boolean flag `strip_default_attrs` that controls this behavior.
+
+If `strip_default_attrs` is `False`, the exported @{tf.MetaGraphDef} will have
+the default valued attributes in all its @{tf.NodeDef} instances.
+This can break forward compatibility with a sequence of events such as the
+following:
+
+*  An existing Op (`Foo`) is updated to include a new attribute (`T`) with a
+   default (`bool`) at version 101.
+*  A model producer such as a "trainer binary" picks up this change (version 101)
+   to the `OpDef` and re-exports an existing model that uses Op `Foo`.
+*  A model consumer (such as [Tensorflow Serving](/serving)) running an older
+   binary (version 100) doesn't have attribute `T` for Op `Foo`, but tries to
+   import this model. The model consumer doesn't recognize attribute `T` in a
+   `NodeDef` that uses Op `Foo` and therefore fails to load the model.
+*  By setting `strip_default_attrs` to True, the model producers can strip away
+   any default valued attributes in the `NodeDefs`. This helps ensure that newly
+   added attributes with defaults don't cause older model consumers to fail
+   loading models regenerated with newer training binaries.
+
+See [compatibility guidance](https://www.tensorflow.org/programmers_guide/version_compat)
+for more information.
 
 ### Loading a SavedModel in Python
 
@@ -285,7 +326,7 @@ with tf.Session(graph=tf.Graph()) as sess:
 ```
 
 
-### Loading a SavedModel in C++
+### Load a SavedModel in C++
 
 The C++ version of the SavedModel
 [loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h)
@@ -303,7 +344,7 @@ LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain},
                &bundle);
 ```
 
-### Loading and Serving a SavedModel in TensorFlow Serving
+### Load and serve a SavedModel in TensorFlow serving
 
 You can easily load and serve a SavedModel with the TensorFlow Serving Model
 Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get)
@@ -359,7 +400,7 @@ defined in:
 
 After training an `Estimator` model, you may want to create a service
 from that model that takes requests and returns a result.  You can run such a
-service locally on your machine or deploy it scalably in the cloud.
+service locally on your machine or deploy it in the cloud.
 
 To prepare a trained Estimator for serving, you must export it in the standard
 SavedModel format. This section explains how to:
@@ -371,7 +412,7 @@ SavedModel format. This section explains how to:
 * Serve the model from a local server and request predictions.
 
 
-### Preparing serving inputs
+### Prepare serving inputs
 
 During training, an @{$premade_estimators#input_fn$`input_fn()`} ingests data
 and prepares it for use by the model.  At serving time, similarly, a
@@ -444,31 +485,8 @@ portion of the signature.  That is, when writing a
 to expect and how to map them to your model's expected inputs.
 By contrast, the *output* portion of the signature is determined by the model.
 
-
-### Performing the export
-
-To export your trained Estimator, call
-@{tf.estimator.Estimator.export_savedmodel} with the export base path and
-the `serving_input_receiver_fn`.
-
-```py
-estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn)
-```
-
-This method builds a new graph by first calling the
-`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
-this `Estimator`'s `model_fn()` to generate the model graph based on those
-features. It starts a fresh `Session`, and, by default, restores the most recent
-checkpoint into it.  (A different checkpoint may be passed, if needed.)
-Finally it creates a time-stamped export directory below the given
-`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
-SavedModel into it containing a single `MetaGraphDef` saved from this
-Session.
-
-> Note: It is your responsibility to garbage-collect old exports.
-> Otherwise, successive exports will accumulate under `export_dir_base`.
-
-### Specifying the outputs of a custom model
+<a name="specify_outputs"></a>
+### Specify the outputs of a custom model
 
 When writing a custom `model_fn`, you must populate the `export_outputs` element
 of the @{tf.estimator.EstimatorSpec} return value. This is a dict of
@@ -499,8 +517,32 @@ using [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tens
 indicating which `SignatureDef` will be served when an inference request
 does not specify one.
 
+<a name="perform_export"></a>
+### Perform the export
+
+To export your trained Estimator, call
+@{tf.estimator.Estimator.export_savedmodel} with the export base path and
+the `serving_input_receiver_fn`.
 
-### Serving the exported model locally
+```py
+estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                            strip_default_attrs=True)
+```
+
+This method builds a new graph by first calling the
+`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
+this `Estimator`'s `model_fn()` to generate the model graph based on those
+features. It starts a fresh `Session`, and, by default, restores the most recent
+checkpoint into it.  (A different checkpoint may be passed, if needed.)
+Finally it creates a time-stamped export directory below the given
+`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
+SavedModel into it containing a single `MetaGraphDef` saved from this
+Session.
+
+> Note: It is your responsibility to garbage-collect old exports.
+> Otherwise, successive exports will accumulate under `export_dir_base`.
+
+### Serve the exported model locally
 
 For local deployment, you can serve your model using
 [TensorFlow Serving](https://github.com/tensorflow/serving), an open-source project that loads a
@@ -519,7 +561,7 @@ bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 -
 Now you have a server listening for inference requests via gRPC on port 9000!
 
 
-### Requesting predictions from a local server
+### Request predictions from a local server
 
 The server responds to gRPC requests according to the
 [PredictionService](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto#L15)
@@ -612,7 +654,7 @@ passing in sample inputs in various formats (for example, Python
 expressions) and then fetching the output.
 
 
-### Installing the SavedModel CLI
+### Install the SavedModel CLI
 
 Broadly speaking, you can install TensorFlow in either of the following
 two ways:
@@ -694,15 +736,15 @@ executing the computation graph later. For example:
 $ saved_model_cli show --dir \
 /tmp/saved_model_dir --tag_set serve --signature_def serving_default
 The given SavedModel SignatureDef contains the following input(s):
-inputs['x'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: x:0
+  inputs['x'] tensor_info:
+      dtype: DT_FLOAT
+      shape: (-1, 1)
+      name: x:0
 The given SavedModel SignatureDef contains the following output(s):
-outputs['y'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y:0
+  outputs['y'] tensor_info:
+      dtype: DT_FLOAT
+      shape: (-1, 1)
+      name: y:0
 Method name is: tensorflow/serving/predict
 ```
 
@@ -714,32 +756,32 @@ $ saved_model_cli show --dir /tmp/saved_model_dir --all
 MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
 
 signature_def['classify_x2_to_y3']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: x2:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['scores'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y3:0
-Method name is: tensorflow/serving/classify
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['inputs'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: x2:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['scores'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y3:0
+  Method name is: tensorflow/serving/classify
 
 ...
 
 signature_def['serving_default']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['x'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: x:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['y'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y:0
-Method name is: tensorflow/serving/predict
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['x'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: x:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['y'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y:0
+  Method name is: tensorflow/serving/predict
 ```
 
 
@@ -839,7 +881,7 @@ For example:
 `<input_key>=[{"age":[22,24],"education":["BS","MS"]}]`
 ```
 
-#### Save Output
+#### Save output
 
 By default, the SavedModel CLI writes output to stdout. If a directory is
 passed to `--outdir` option, the outputs will be saved as npy files named after
@@ -848,7 +890,7 @@ output tensor keys under the given directory.
 Use `--overwrite` to overwrite existing output files.
 
 
-#### TensorFlow Debugger (tfdbg) Integration
+#### TensorFlow debugger (tfdbg) integration
 
 If `--tf_debug` option is set, the SavedModel CLI will use the
 TensorFlow Debugger (tfdbg) to watch the intermediate Tensors and runtime
@@ -955,6 +997,3 @@ of checkpoints and assets:
 
 Each graph is associated with a specific set of tags, which enables
 identification during a load or restore operation.
-
-
-
diff --git a/tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md b/tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md
index 05dfdfdc4d2257fc680e7fa99b666ef86e3bef09..fadfa03e78349801d69e0045991a8fa9a0a59df9 100644
--- a/tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md
@@ -16,10 +16,17 @@ TensorBoard is fully configured, it looks like this:
   </iframe>
 </div>
 
-This tutorial is intended to get you started with simple TensorBoard usage.
-There are other resources available as well! The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard)
-has a lot more information on TensorBoard usage, including tips & tricks, and
-debugging information.
+This 30-minute tutorial is intended to get you started with simple TensorBoard
+usage. It assumes a basic understanding of TensorFlow.
+
+There are other resources available as well! The [TensorBoard GitHub](https://github.com/tensorflow/tensorboard)
+has a lot more information on using individual dashboards within TensorBoard
+including tips & tricks and debugging information.
+
+## Setup
+
+[Install TensorFlow](https://www.tensorflow.org/install/). Installing TensorFlow
+via pip should also automatically install TensorBoard.
 
 ## Serializing the data
 
@@ -76,7 +83,7 @@ data than you need, though. Instead, consider running the merged summary op
 every `n` steps.
 
 The code example below is a modification of the
-@{$layers$simple MNIST tutorial},
+[simple MNIST tutorial](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/mnist/mnist.py),
 in which we have added some summary ops, and run them every ten steps. If you
 run this and then launch `tensorboard --logdir=/tmp/tensorflow/mnist`, you'll be able
 to visualize statistics, such as how the weights or accuracy varied during
@@ -214,4 +221,5 @@ corner. Each tab represents a set of serialized data that can be visualized.
 For in depth information on how to use the *graph* tab to visualize your graph,
 see @{$graph_viz$TensorBoard: Graph Visualization}.
 
-For more usage information on TensorBoard in general, see the [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard).
+For more usage information on TensorBoard in general, see the
+[TensorBoard GitHub](https://github.com/tensorflow/tensorboard).
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index d74d7f3181c9cf44e6c97e13742db682858f4694..5e3e49d43402cd76f8b7062483259df4598bd8ff 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -11,7 +11,7 @@ This doc is aimed at users who:
   using an existing model.
 * Have, perhaps, skimmed the code of an example TPU model
   [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
-  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+  [[2]](https://github.com/tensorflow/tpu/tree/master/models).
 * Are interested in porting an existing `Estimator` model to
   run on Cloud TPUs
 
@@ -129,10 +129,9 @@ my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
 Typically the `FLAGS` would be set by command line arguments. To switch from
 training locally to training on a cloud TPU you would need to:
 
-  1) Set `FLAGS.use_tpu` to `True`
-  1) Set `FLAGS.tpu_name` so the
-     `tf.contrib.cluster_resolver.TPUClusterResolver` can find it
-  1) Set `FLAGS.model_dir` to a Google Cloud Storage bucket url (`gs://`).
+* Set `FLAGS.use_tpu` to `True`
+* Set `FLAGS.tpu_name` so the `tf.contrib.cluster_resolver.TPUClusterResolver` can find it
+* Set `FLAGS.model_dir` to a Google Cloud Storage bucket url (`gs://`).
 
 
 ## Optimizer
@@ -281,15 +280,15 @@ Where `params['batch-size']` will contain the batch size.
 ### Static shapes and batch size
 
 The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
-free strict static shape requirements imposed by the XLA/TPU environment. The
-one requirement is that the batches of data fed from your input pipeline to
+free from the strict static shape requirements imposed by the XLA/TPU environment.
+The one requirement is that the batches of data fed from your input pipeline to
 the TPU have a static shape, as determined by the standard TensorFlow shape
 inference algorithm. Intermediate tensors are free to have a dynamic shapes.
 If shape inference has failed, but the shape is known it is possible to
 impose the correct shape using `tf.set_shape()`. 
 
 In the example below the shape
-inference algorithm fails, but it is corrected using `set_shape`:
+inference algorithm fails, but it is correctly using `set_shape`:
 
 ```
 >>> x = tf.zeros(tf.constant([1,2,3])+1)
@@ -372,10 +371,10 @@ in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
 that data is available when needed.
 
 The TPU-demos repo includes
-[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+[a script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 for downloading the imagenet dataset and converting it to an appropriate format.
 This together with the imagenet
-[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+[models](https://github.com/tensorflow/tpu/tree/master/models)
 included in the repo demonstrate all of these best-practices.
 
 
@@ -388,7 +387,7 @@ For details on how to actually set up and run a Cloud TPU see:
 This document is by no means exhaustive. The best source of more detail on how
 to make a Cloud TPU compatible model are the example models published in:
 
- * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu)
 
 For more information about tuning TensorFlow code for performance see:
 
diff --git a/tensorflow/docs_src/programmers_guide/version_compat.md b/tensorflow/docs_src/programmers_guide/version_compat.md
index a28f1385c87c7a083ee96977c5ab268c6977e17e..72e427c5f8f0f6581d528f4ead18699736eafd04 100644
--- a/tensorflow/docs_src/programmers_guide/version_compat.md
+++ b/tensorflow/docs_src/programmers_guide/version_compat.md
@@ -60,7 +60,8 @@ patch versions.  The public APIs consist of
     * [`tensor_shape`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor_shape.proto)
     * [`types`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto)
 
-## What is *not* covered {not_covered}
+<a name="not_covered"></a>
+## What is *not* covered
 
 Some API functions are explicitly marked as "experimental" and can change in
 backward incompatible ways between minor releases. These include:
@@ -182,7 +183,7 @@ Our versioning scheme has three requirements:
 *   **Forward compatibility** to support scenarios where the producer of a
     graph or checkpoint is upgraded to a newer version of TensorFlow before
     the consumer.
-*   Enable evolving TensorFlow in incompatible ways. For example, removing Ops,
+*   Enable evolving TensorFlow in incompatible ways. For example, removing ops,
     adding attributes, and removing attributes.
 
 Note that while the `GraphDef` version mechanism is separate from the TensorFlow
@@ -244,32 +245,51 @@ contains a main data version which is treated as either `producer` or
     `TF_CHECKPOINT_VERSION_MIN_CONSUMER`, and
     `TF_CHECKPOINT_VERSION_MIN_PRODUCER`.
 
+### Add a new attribute with default to an existing op
+
+Following the guidance below gives you forward compatibility only if the set of
+ops has not changed:
+
+1. If forward compatibility is desired,  set `strip_default_attrs` to `True`
+   while exporting the model using either the
+   @{tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables$`add_meta_graph_and_variables`}
+   and @{tf.saved_model.builder.SavedModelBuilder.add_meta_graph$`add_meta_graph`}
+   methods of the `SavedModelBuilder` class, or
+   @{tf.estimator.Estimator.export_savedmodel$`Estimator.export_savedmodel`}
+2. This strips off the default valued attributes at the time of
+   producing/exporting the models. This makes sure that the exported
+   @{tf.MetaGraphDef} does not contain the new op-attribute when the default
+   value is used.
+3. Having this control could allow out-of-date consumers (for example, serving
+   binaries that lag behind training binaries) to continue loading the models
+   and prevent interruptions in model serving.
+
 ### Evolving GraphDef versions
 
 This section explains how to use this versioning mechanism to make different
 types of changes to the `GraphDef` format.
 
-#### Add an Op
+#### Add an op
 
-Add the new Op to both consumers and producers at the same time, and do not
+Add the new op to both consumers and producers at the same time, and do not
 change any `GraphDef` versions. This type of change is automatically
 backward compatible, and does not impact forward compatibility plan since
 existing producer scripts will not suddenly use the new functionality.
 
-#### Add an Op and switch existing Python wrappers to use it
+#### Add an op and switch existing Python wrappers to use it
 
 1.  Implement new consumer functionality and increment the `GraphDef` version.
 2.  If it is possible to make the wrappers use the new functionality only in
     cases that did not work before, the wrappers can be updated now.
 3.  Change Python wrappers to use the new functionality. Do not increment
-    `min_consumer`, since models that do not use this Op should not break.
+    `min_consumer`, since models that do not use this op should not break.
 
-#### Remove or restrict an Op's functionality
+#### Remove or restrict an op's functionality
 
-1.  Fix all producer scripts (not TensorFlow itself) to not use the banned Op or
+1.  Fix all producer scripts (not TensorFlow itself) to not use the banned op or
     functionality.
 2.  Increment the `GraphDef` version and implement new consumer functionality
-    that bans the removed Op or functionality for GraphDefs at the new version
+    that bans the removed op or functionality for GraphDefs at the new version
     and above. If possible, make TensorFlow stop producing `GraphDefs` with the
     banned functionality. To do so, add the
     [`REGISTER_OP(...).Deprecated(deprecated_at_version,
@@ -278,15 +298,15 @@ existing producer scripts will not suddenly use the new functionality.
 4.  Increase `min_producer` to the GraphDef version from (2) and remove the
     functionality entirely.
 
-#### Change an Op's functionality
+#### Change an op's functionality
 
-1.  Add a new similar Op named `SomethingV2` or similar and go through the
+1.  Add a new similar op named `SomethingV2` or similar and go through the
     process of adding it and switching existing Python wrappers to use it, which
     may take three weeks if forward compatibility is desired.
-2.  Remove the old Op (Can only take place with a major version change due to
+2.  Remove the old op (Can only take place with a major version change due to
     backward compatibility).
-3.  Increase `min_consumer` to rule out consumers with the old Op, add back the
-    old Op as an alias for `SomethingV2`, and go through the process to switch
+3.  Increase `min_consumer` to rule out consumers with the old op, add back the
+    old op as an alias for `SomethingV2`, and go through the process to switch
     existing Python wrappers to use it.
 4.  Go through the process to remove `SomethingV2`.
 
@@ -294,6 +314,6 @@ existing producer scripts will not suddenly use the new functionality.
 
 1.  Bump the `GraphDef` version and add the bad version to `bad_consumers` for
     all new GraphDefs. If possible, add to `bad_consumers` only for GraphDefs
-    which contain a certain Op or similar.
+    which contain a certain op or similar.
 2.  If existing consumers have the bad version, push them out as soon as
     possible.
diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 7d79f433c41b42a268816d8277ea69b0d62a04f3..372ab47df7df309ab926836ca19f34d2d0d38915 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -280,7 +280,7 @@ tool:
 ```
 bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram -- \
 --input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav \
---output_png=/tmp/spectrogram.png
+--output_image=/tmp/spectrogram.png
 ```
 
 If you open up `/tmp/spectrogram.png` you should see something like this:
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 679754020470dddfcffa76e62ca8f55a439ec4f5..6a4c9a9b0727208a158b1b57d13ca70290961ec2 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -268,7 +268,7 @@ in `cifar10_input.py`.
 
 `cifar10_train.py` periodically @{tf.train.Saver$saves}
 all model parameters in
-@{$variables#saving-and-restoring$checkpoint files}
+@{$programmers_guide/saved_model$checkpoint files}
 but it does *not* evaluate the model. The checkpoint file
 will be used by `cifar10_eval.py` to measure the predictive
 performance (see [Evaluating a Model](#evaluating-a-model) below).
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index df15bc0a9c3763aa51c2fc8cf36ce9fc3544ae68..27784eef9cdb5c6f8b9af44b3fc3f876cda39d13 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -1,397 +1,4 @@
 # How to Retrain Inception's Final Layer for New Categories
 
-Modern object recognition models have millions of parameters and can take weeks
-to fully train. Transfer learning is a technique that shortcuts a lot of this
-work by taking a fully-trained model for a set of categories like ImageNet, and
-retrains from the existing weights for new classes. In this example we'll be
-retraining the final layer from scratch, while leaving all the others untouched.
-For more information on the approach you can see
-[this paper on Decaf](https://arxiv.org/pdf/1310.1531v1.pdf).
-
-Though it's not as good as a full training run, this is surprisingly effective
-for many applications, and can be run in as little as thirty minutes on a
-laptop, without requiring a GPU. This tutorial will show you how to run the
-example script on your own images, and will explain some of the options you have
-to help control the training process.
-
-Note: A version of this tutorial is also available
-[as a codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0).
-
-Before you start, you must @{$install$install tensorflow}.
-
-[TOC]
-
-## Training on Flowers
-
-![Daisies by Kelly Sikkema](https://www.tensorflow.org/images/daisies.jpg)
-
-[Image by Kelly Sikkema](https://www.flickr.com/photos/95072945@N05/9922116524/)
-
-Before you start any training, you'll need a set of images to teach the network
-about the new classes you want to recognize. There's a later section that
-explains how to prepare your own images, but to make it easy we've created an
-archive of creative-commons licensed flower photos to use initially. To get the
-set of flower photos, run these commands:
-
-```sh
-cd ~
-curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
-tar xzf flower_photos.tgz
-```
-
-Once you have the images, you can clone the tensorflow repository using the
-following command (these examples are not included in the installation):
-
-```sh
-git clone https://github.com/tensorflow/tensorflow
-```
-
-Then checkout the version of the tensorflow repository matching your
-installation and this tutorial as follows:
-
-``` sh
-cd tensorflow
-git checkout {version}
-```
-
-In the simplest cases the retrainer can then be run like this:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py --image_dir ~/flower_photos
-```
-
-The script has many other options. You can get a full listing with:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py -h
-```
-
-This script loads the pre-trained Inception v3 model, removes the old top layer,
-and trains a new one on the flower photos you've downloaded. None of the flower
-species were in the original ImageNet classes the full network was trained on.
-The magic of transfer learning is that lower layers that have been trained to
-distinguish between some objects can be reused for many recognition tasks
-without any alteration.
-
-## Bottlenecks
-
-The script can take thirty minutes or more to complete, depending on the speed
-of your machine. The first phase analyzes all the images on disk and calculates
-the bottleneck values for each of them. 'Bottleneck' is an informal term we
-often use for the layer just before the final output layer that actually does
-the classification. This penultimate layer has been trained to output a set of
-values that's good enough for the classifier to use to distinguish between all
-the classes it's been asked to recognize. That means it has to be a meaningful
-and compact summary of the images, since it has to contain enough information
-for the classifier to make a good choice in a very small set of values. The
-reason our final layer retraining can work on new classes is that it turns out
-the kind of information needed to distinguish between all the 1,000 classes in
-ImageNet is often also useful to distinguish between new kinds of objects.
-
-Because every image is reused multiple times during training and calculating
-each bottleneck takes a significant amount of time, it speeds things up to
-cache these bottleneck values on disk so they don't have to be repeatedly
-recalculated. By default they're stored in the `/tmp/bottleneck` directory, and
-if you rerun the script they'll be reused so you don't have to wait for this
-part again.
-
-## Training
-
-Once the bottlenecks are complete, the actual training of the top layer of the
-network begins. You'll see a series of step outputs, each one showing training
-accuracy, validation accuracy, and the cross entropy. The training accuracy
-shows what percent of the images used in the current training batch were
-labeled with the correct class. The validation accuracy is the precision on a
-randomly-selected group of images from a different set. The key difference is
-that the training accuracy is based on images that the network has been able
-to learn from so the network can overfit to the noise in the training data. A
-true measure of the performance of the network is to measure its performance on
-a data set not contained in the training data -- this is measured by the
-validation accuracy. If the train accuracy is high but the validation accuracy
-remains low, that means the network is overfitting and memorizing particular
-features in the training images that aren't helpful more generally. Cross
-entropy is a loss function which gives a glimpse into how well the learning
-process is progressing. The training's objective is to make the loss as small as
-possible, so you can tell if the learning is working by keeping an eye on
-whether the loss keeps trending downwards, ignoring the short-term noise.
-
-By default this script will run 4,000 training steps. Each step chooses ten
-images at random from the training set, finds their bottlenecks from the cache,
-and feeds them into the final layer to get predictions. Those predictions are
-then compared against the actual labels to update the final layer's weights
-through the back-propagation process. As the process continues you should see
-the reported accuracy improve, and after all the steps are done, a final test
-accuracy evaluation is run on a set of images kept separate from the training
-and validation pictures. This test evaluation is the best estimate of how the
-trained model will perform on the classification task. You should see an
-accuracy value of between 90% and 95%, though the exact value will vary from run
-to run since there's randomness in the training process. This number is based on
-the percent of the images in the test set that are given the correct label
-after the model is fully trained.
-
-## Visualizing the Retraining with TensorBoard
-
-The script includes TensorBoard summaries that make it easier to understand, debug, and optimize the retraining. For example, you can visualize the graph and statistics, such as how the weights or accuracy varied during training.
-
-To launch TensorBoard, run this command during or after retraining:
-
-```sh
-tensorboard --logdir /tmp/retrain_logs
-```
-
-Once TensorBoard is running, navigate your web browser to `localhost:6006` to view the TensorBoard.
-
-The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
-
-The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
-
-## Using the Retrained Model
-
-The script will write out a version of the Inception v3 network with a final
-layer retrained to your categories to /tmp/output_graph.pb, and a text file
-containing the labels to /tmp/output_labels.txt. These are both in a format that
-the @{$image_recognition$C++ and Python image classification examples}
-can read in, so you can start using your new model immediately. Since you've
-replaced the top layer, you will need to specify the new name in the script, for
-example with the flag `--output_layer=final_result` if you're using label_image.
-
-Here's an example of how to run the label_image example with your
-retrained graphs:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=Mul \
---output_layer=final_result \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-You should see a list of flower labels, in most cases with daisy on top
-(though each retrained model may be slightly different). You can replace the
-`--image` parameter with your own images to try those out.
-
-If you'd like to use the retrained model in your own Python program, then the
-above
-[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/label_image/label_image.py)
-is a reasonable starting point. The `label_image`
-directory also contains C++ code which you can use as a template to integrate
-tensorflow with your own applications.
-
-If you find the default Inception v3 model is too large or slow for your
-application, take a look at the [Other Model Architectures section](/tutorials/image_retraining#other_model_architectures)
-below for options to speed up and slim down your network.
-
-## Training on Your Own Categories
-
-If you've managed to get the script working on the flower example images, you
-can start looking at teaching it to recognize categories you care about instead.
-In theory all you'll need to do is point it at a set of sub-folders, each named
-after one of your categories and containing only images from that category. If
-you do that and pass the root folder of the subdirectories as the argument to
-`--image_dir`, the script should train just like it did for the flowers.
-
-Here's what the folder structure of the flowers archive looks like, to give you
-and example of the kind of layout the script is looking for:
-
-![Folder Structure](https://www.tensorflow.org/images/folder_structure.png)
-
-In practice it may take some work to get the accuracy you want. I'll try to
-guide you through some of the common problems you might encounter below.
-
-## Creating a Set of Training Images
-
-The first place to start is by looking at the images you've gathered, since the
-most common issues we see with training come from the data that's being fed in.
-
-For training to work well, you should gather at least a hundred photos of each
-kind of object you want to recognize. The more you can gather, the better the
-accuracy of your trained model is likely to be. You also need to make sure that
-the photos are a good representation of what your application will actually
-encounter. For example, if you take all your photos indoors against a blank wall
-and your users are trying to recognize objects outdoors, you probably won't see
-good results when you deploy.
-
-Another pitfall to avoid is that the learning process will pick up on anything
-that the labeled images have in common with each other, and if you're not
-careful that might be something that's not useful. For example if you photograph
-one kind of object in a blue room, and another in a green one, then the model
-will end up basing its prediction on the background color, not the features of
-the object you actually care about. To avoid this, try to take pictures in as
-wide a variety of situations as you can, at different times, and with different
-devices. If you want to know more about this problem, you can read about the
-classic (and possibly apocryphal)
-[tank recognition problem](https://www.jefftk.com/p/detecting-tanks).
-
-You may also want to think about the categories you use. It might be worth
-splitting big categories that cover a lot of different physical forms into
-smaller ones that are more visually distinct. For example instead of 'vehicle'
-you might use 'car', 'motorbike', and 'truck'. It's also worth thinking about
-whether you have a 'closed world' or an 'open world' problem. In a closed world,
-the only things you'll ever be asked to categorize are the classes of object you
-know about. This might apply to a plant recognition app where you know the user
-is likely to be taking a picture of a flower, so all you have to do is decide
-which species. By contrast a roaming robot might see all sorts of different
-things through its camera as it wanders around the world. In that case you'd
-want the classifier to report if it wasn't sure what it was seeing. This can be
-hard to do well, but often if you collect a large number of typical 'background'
-photos with no relevant objects in them, you can add them to an extra 'unknown'
-class in your image folders.
-
-It's also worth checking to make sure that all of your images are labeled
-correctly. Often user-generated tags are unreliable for our purposes, for
-example using #daisy for pictures of a person named Daisy. If you go through
-your images and weed out any mistakes it can do wonders for your overall
-accuracy.
-
-## Training Steps
-
-If you're happy with your images, you can take a look at improving your results
-by altering the details of the learning process. The simplest one to try is
-`--how_many_training_steps`. This defaults to 4,000, but if you increase it to
-8,000 it will train for twice as long. The rate of improvement in the accuracy
-slows the longer you train for, and at some point will stop altogether, but you
-can experiment to see when you hit that limit for your model.
-
-## Distortions
-
-A common way of improving the results of image training is by deforming,
-cropping, or brightening the training inputs in random ways. This has the
-advantage of expanding the effective size of the training data thanks to all the
-possible variations of the same images, and tends to help the network learn to
-cope with all the distortions that will occur in real-life uses of the
-classifier. The biggest disadvantage of enabling these distortions in our script
-is that the bottleneck caching is no longer useful, since input images are never
-reused exactly. This means the training process takes a lot longer, so I
-recommend trying this as a way of fine-tuning your model once you've got one
-that you're reasonably happy with.
-
-You enable these distortions by passing `--random_crop`, `--random_scale` and
-`--random_brightness` to the script. These are all percentage values that
-control how much of each of the distortions is applied to each image. It's
-reasonable to start with values of 5 or 10 for each of them and then experiment
-to see which of them help with your application. `--flip_left_right` will
-randomly mirror half of the images horizontally, which makes sense as long as
-those inversions are likely to happen in your application. For example it
-wouldn't be a good idea if you were trying to recognize letters, since flipping
-them destroys their meaning.
-
-## Hyper-parameters
-
-There are several other parameters you can try adjusting to see if they help
-your results. The `--learning_rate` controls the magnitude of the updates to the
-final layer during training. Intuitively if this is smaller then the learning
-will take longer, but it can end up helping the overall precision. That's not
-always the case though, so you need to experiment carefully to see what works
-for your case. The `--train_batch_size` controls how many images are examined
-during one training step, and because the learning rate is applied per batch
-you'll need to reduce it if you have larger batches to get the same overall
-effect.
-
-## Training, Validation, and Testing Sets
-
-One of the things the script does under the hood when you point it at a folder
-of images is divide them up into three different sets. The largest is usually
-the training set, which are all the images fed into the network during training,
-with the results used to update the model's weights. You might wonder why we
-don't use all the images for training? A big potential problem when we're doing
-machine learning is that our model may just be memorizing irrelevant details of
-the training images to come up with the right answers. For example, you could
-imagine a network remembering a pattern in the background of each photo it was
-shown, and using that to match labels with objects. It could produce good
-results on all the images it's seen before during training, but then fail on new
-images because it's not learned general characteristics of the objects, just
-memorized unimportant details of the training images.
-
-This problem is known as overfitting, and to avoid it we keep some of our data
-out of the training process, so that the model can't memorize them. We then use
-those images as a check to make sure that overfitting isn't occurring, since if
-we see good accuracy on them it's a good sign the network isn't overfitting. The
-usual split is to put 80% of the images into the main training set, keep 10%
-aside to run as validation frequently during training, and then have a final 10%
-that are used less often as a testing set to predict the real-world performance
-of the classifier. These ratios can be controlled using the
-`--testing_percentage` and `--validation_percentage` flags. In general
-you should be able to leave these values at their defaults, since you won't
-usually find any advantage to training to adjusting them.
-
-Note that the script uses the image filenames (rather than a completely random
-function) to divide the images among the training, validation, and test sets.
-This is done to ensure that images don't get moved between training and testing
-sets on different runs, since that could be a problem if images that had been
-used for training a model were subsequently used in a validation set.
-
-You might notice that the validation accuracy fluctuates among iterations. Much
-of this fluctuation arises from the fact that a random subset of the validation
-set is chosen for each validation accuracy measurement. The fluctuations can be
-greatly reduced, at the cost of some increase in training time, by choosing
-`--validation_batch_size=-1`, which uses the entire validation set for each
-accuracy computation.
-
-Once training is complete, you may find it insightful to examine misclassified
-images in the test set. This can be done by adding the flag
-`--print_misclassified_test_images`. This may help you get a feeling for which
-types of images were most confusing for the model, and which categories were
-most difficult to distinguish. For instance, you might discover that some
-subtype of a particular category, or some unusual photo angle, is particularly
-difficult to identify, which may encourage you to add more training images of
-that subtype. Oftentimes, examining misclassified images can also point to
-errors in the input data set, such as mislabeled, low-quality, or ambiguous
-images. However, one should generally avoid point-fixing individual errors in
-the test set, since they are likely to merely reflect more general problems in
-the (much larger) training set.
-
-## Other Model Architectures
-
-By default the script uses a pretrained version of the Inception v3 model
-architecture. This is a good place to start because it provides high accuracy
-results, but if you intend to deploy your model on mobile devices or other
-resource-constrained environments you may want to trade off a little accuracy
-for much smaller file sizes or faster speeds. To help with that, the
-[retrain.py script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py)
-supports 32 different variations on the [Mobilenet architecture](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html).
-
-These are a little less precise than Inception v3, but can result in far
-smaller file sizes (down to less than a megabyte) and can be many times faster
-to run. To train with one of these models, pass in the `--architecture` flag,
-for example:
-
-```
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_0.25_128_quantized
-```
-
-This will create a 941KB model file in `/tmp/output_graph.pb`, with 25% of the
-parameters of the full Mobilenet, taking 128x128 sized input images, and with
-its weights quantized down to eight bits on disk. You can choose '1.0', '0.75',
-'0.50', or '0.25' to control the number of weight parameters, and so the file
-size (and to some extent the speed), '224', '192', '160', or '128' for the input
-image size, with smaller sizes giving faster speeds, and an optional
-'_quantized' at the end to indicate whether the file should contain 8-bit or
-32-bit float weights.
-
-The speed and size advantages come at a loss to accuracy of course, but for many
-purposes this isn't critical. They can also be somewhat offset with improved
-training data. For example, training with distortions allows me to get above 80%
-accuracy on the flower data set even with the 0.25/128/quantized graph above.
-
-If you're going to be using the Mobilenet models in label_image or your own
-programs, you'll need to feed in an image of the specified size converted to a
-float range into the 'input' tensor. Typically 24-bit images are in the range
-[0,255], and you must convert them to the [-1,1] float range expected by the
-model with the formula  `(image - 128.)/128.`.
-
-The default arguments for the `label_image` script are set for Inception V3.
-To use it with a MobileNet, specify the above normalization parameters as
-`input_mean` and `input_std` on the command line. You also must specify the
-image size that your model expects, as follows:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=input \
---output_layer=final_result \
---input_height=224 --input_width=224 \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
+**NOTE: This tutorial has moved to**
+https://github.com/tensorflow/hub/tree/master/docs/tutorials/image_retraining.md
diff --git a/tensorflow/docs_src/tutorials/index.md b/tensorflow/docs_src/tutorials/index.md
index 8c697e48e550c4e425db33bab7257532d209ac7a..af01d3eaa12157f82c981de005708509f6652cca 100644
--- a/tensorflow/docs_src/tutorials/index.md
+++ b/tensorflow/docs_src/tutorials/index.md
@@ -10,7 +10,7 @@ these tutorials.
 
 These tutorials cover different aspects of image recognition:
 
-  * @{$layers}, which introduces convolutional neural networks (CNNs) and
+  * @{$layers$MNIST}, which introduces convolutional neural networks (CNNs) and
     demonstrates how to build a CNN in TensorFlow.
   * @{$image_recognition}, which introduces the field of image recognition and
     uses a pre-trained model (Inception) for recognizing images.
diff --git a/tensorflow/docs_src/tutorials/kernel_methods.md b/tensorflow/docs_src/tutorials/kernel_methods.md
index 63f408c2ca304d6345ffff459b799b011f8d8035..73e5c5105784ddc9729b8cea6cd31921572837e1 100644
--- a/tensorflow/docs_src/tutorials/kernel_methods.md
+++ b/tensorflow/docs_src/tutorials/kernel_methods.md
@@ -1,9 +1,9 @@
 # Improving Linear Models Using Explicit Kernel Methods
 
-Note: This document uses a deprecated version of ${tf.estimator},
-which has a ${tf.contrib.learn.estimator$different interface}.
+Note: This document uses a deprecated version of @{tf.estimator},
+which has a @{tf.contrib.learn.Estimator$different interface}.
 It also uses other `contrib` methods whose
-${$version_compat#not_covered$API may not be stable}.
+@{$version_compat#not_covered$API may not be stable}.
 
 In this tutorial, we demonstrate how combining (explicit) kernel methods with
 linear models can drastically increase the latters' quality of predictions
@@ -53,7 +53,7 @@ In order to feed data to a `tf.contrib.learn Estimator`, it is helpful to conver
 it to Tensors. For this, we will use an `input function` which adds Ops to the
 TensorFlow graph that, when executed, create mini-batches of Tensors to be used
 downstream. For more background on input functions, check
-@{$get_started/premade_estimators#input_fn$this section on input functions}.
+@{$get_started/premade_estimators#create_input_functions$this section on input functions}.
 In this example, we will use the `tf.train.shuffle_batch` Op which, besides
 converting numpy arrays to Tensors, allows us to specify the batch_size and
 whether to randomize the input every time the input_fn Ops are executed
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 5111b16247e2b5c3410e69dcdf08318a35b18c2f..37cd2bb1397deaddec9f7194d8c5093145e08644 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -192,23 +192,28 @@ dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
 you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
-skip ahead to ["Training and Evaluating the CNN MNIST
-Classifier"](#training-and-evaluating-the-cnn-mnist-classifier).
+skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist).
 
 ### Input Layer
 
 The methods in the `layers` module for creating convolutional and pooling layers
 for two-dimensional image data expect input tensors to have a shape of
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
-<em>channels</em>]</code>, defined as follows:
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
+<em>channels</em>]</code> by default. This behavior can be changed using the <code><em>data_format</em></code> parameter; defined as follows:
+
 
 *   _`batch_size`_. Size of the subset of examples to use when performing
     gradient descent during training.
-*   _`image_width`_. Width of the example images.
 *   _`image_height`_. Height of the example images.
+*   _`image_width`_. Width of the example images.
 *   _`channels`_. Number of color channels in the example images. For color
     images, the number of channels is 3 (red, green, blue). For monochrome
     images, there is just 1 channel (black).
+*   _`image_height`_. Height of the example images.
+*   _`data_format`_. A string, one of `channels_last` (default) or `channels_first`.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
 
 Here, our MNIST dataset is composed of monochrome 28x28 pixel images, so the
 desired shape for our input layer is <code>[<em>batch_size</em>, 28, 28,
@@ -247,28 +252,27 @@ conv1 = tf.layers.conv2d(
 ```
 
 The `inputs` argument specifies our input tensor, which must have the shape
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
 <em>channels</em>]</code>. Here, we're connecting our first convolutional layer
 to `input_layer`, which has the shape <code>[<em>batch_size</em>, 28, 28,
 1]</code>.
 
 > Note: <code>conv2d()</code> will instead accept a shape of
-> <code>[<em>channels</em>, <em>batch_size</em>, <em>image_width</em>,
-> <em>image_height</em>]</code> when passed the argument
+> <code>[<em>batch_size</em>, <em>channels</em>, <em>image_height</em>, <em>image_width</em>]</code> when passed the argument
 > <code>data_format=channels_first</code>.
 
 The `filters` argument specifies the number of filters to apply (here, 32), and
-`kernel_size` specifies the dimensions of the filters as <code>[<em>width</em>,
-<em>height</em>]</code> (here, <code>[5, 5]</code>).
+`kernel_size` specifies the dimensions of the filters as <code>[<em>height</em>,
+<em>width</em>]</code> (here, <code>[5, 5]</code>).
 
-<p class="tip"><b>TIP:</b> If filter width and height have the same value, you can instead specify a
+<p class="tip"><b>TIP:</b> If filter height and width have the same value, you can instead specify a
 single integer for <code>kernel_size</code>—e.g., <code>kernel_size=5</code>.</p>
 
 The `padding` argument specifies one of two enumerated values
 (case-insensitive): `valid` (default value) or `same`. To specify that the
-output tensor should have the same width and height values as the input tensor,
+output tensor should have the same height and width values as the input tensor,
 we set `padding=same` here, which instructs TensorFlow to add 0 values to the
-edges of the input tensor to preserve width and height of 28. (Without padding,
+edges of the input tensor to preserve height and width of 28. (Without padding,
 a 5x5 convolution over a 28x28 tensor will produce a 24x24 tensor, as there are
 24x24 locations to extract a 5x5 tile from a 28x28 grid.)
 
@@ -277,7 +281,7 @@ output of the convolution. Here, we specify ReLU activation with
 @{tf.nn.relu}.
 
 Our output tensor produced by `conv2d()` has a shape of
-<code>[<em>batch_size</em>, 28, 28, 32]</code>: the same width and height
+<code>[<em>batch_size</em>, 28, 28, 32]</code>: the same height and width
 dimensions as the input, but now with 32 channels holding the output from each
 of the filters.
 
@@ -292,31 +296,30 @@ pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
 ```
 
 Again, `inputs` specifies the input tensor, with a shape of
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
 <em>channels</em>]</code>. Here, our input tensor is `conv1`, the output from
 the first convolutional layer, which has a shape of <code>[<em>batch_size</em>,
 28, 28, 32]</code>.
 
 > Note: As with <code>conv2d()</code>, <code>max_pooling2d()</code> will instead
-> accept a shape of <code>[<em>channels</em>, <em>batch_size</em>,
-> <em>image_width</em>, <em>image_height</em>]</code> when passed the argument
+> accept a shape of <code>[<em>batch_size</em>, <em>channels</em>, 
+> <em>image_height</em>, <em>image_width</em>]</code> when passed the argument
 > <code>data_format=channels_first</code>.
 
 The `pool_size` argument specifies the size of the max pooling filter as
-<code>[<em>width</em>, <em>height</em>]</code> (here, `[2, 2]`). If both
+<code>[<em>height</em>, <em>width</em>]</code> (here, `[2, 2]`). If both
 dimensions have the same value, you can instead specify a single integer (e.g.,
 `pool_size=2`).
 
 The `strides` argument specifies the size of the stride. Here, we set a stride
 of 2, which indicates that the subregions extracted by the filter should be
-separated by 2 pixels in both the width and height dimensions (for a 2x2 filter,
+separated by 2 pixels in both the height and width dimensions (for a 2x2 filter,
 this means that none of the regions extracted will overlap). If you want to set
-different stride values for width and height, you can instead specify a tuple or
+different stride values for height and width, you can instead specify a tuple or
 list (e.g., `stride=[3, 6]`).
 
 Our output tensor produced by `max_pooling2d()` (`pool1`) has a shape of
-<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces width and
-height by 50% each.
+<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces height and width by 50% each.
 
 ### Convolutional Layer #2 and Pooling Layer #2
 
@@ -338,13 +341,11 @@ pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
 
 Note that convolutional layer #2 takes the output tensor of our first pooling
 layer (`pool1`) as input, and produces the tensor `conv2` as output. `conv2`
-has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same width
-and height as `pool1` (due to `padding="same"`), and 64 channels for the 64
+has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same height and width as `pool1` (due to `padding="same"`), and 64 channels for the 64
 filters applied.
 
 Pooling layer #2 takes `conv2` as input, producing `pool2` as output. `pool2`
-has shape <code>[<em>batch_size</em>, 7, 7, 64]</code> (50% reduction of width
-and height from `conv2`).
+has shape <code>[<em>batch_size</em>, 7, 7, 64]</code> (50% reduction of height and width from `conv2`).
 
 ### Dense Layer
 
@@ -360,7 +361,7 @@ pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
 
 In the `reshape()` operation above, the `-1` signifies that the *`batch_size`*
 dimension will be dynamically calculated based on the number of examples in our
-input data. Each example has 7 (`pool2` width) * 7 (`pool2` height) * 64
+input data. Each example has 7 (`pool2` height) * 7 (`pool2` width) * 64
 (`pool2` channels) features, so we want the `features` dimension to have a value
 of 7 * 7 * 64 (3136 in total). The output tensor, `pool2_flat`, has shape
 <code>[<em>batch_size</em>, 3136]</code>.
@@ -446,7 +447,7 @@ tf.nn.softmax(logits, name="softmax_tensor")
 
 > Note: We use the `name` argument to explicitly name this operation
 > `softmax_tensor`, so we can reference it later. (We'll set up logging for the
-> softmax values in ["Set Up a Logging Hook"](#set-up-a-logging-hook).
+> softmax values in ["Set Up a Logging Hook"](#set-up-a-logging-hook)).
 
 We compile our predictions in a dict, and return an `EstimatorSpec` object:
 
@@ -534,9 +535,9 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining
-> the training op for the model"} in the @{$get_started/custom_estimators$"Creating Estimations in
-> tf.estimator"} tutorial.
+> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
+> in the @{$get_started/custom_estimators$"Creating Estimations in tf.estimator"} tutorial.
+
 
 ### Add evaluation metrics
 
@@ -551,7 +552,8 @@ return tf.estimator.EstimatorSpec(
     mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 ```
 
-## Training and Evaluating the CNN MNIST Classifier {#training_and_evaluating_the_cnn_mnist_classifier}
+<a id="train_eval_mnist"></a>
+## Training and Evaluating the CNN MNIST Classifier
 
 We've coded our MNIST CNN model function; now we're ready to train and evaluate
 it.
@@ -611,9 +613,9 @@ following to `main()`:
 
 ```python
 # Set up logging for predictions
-  tensors_to_log = {"probabilities": "softmax_tensor"}
-  logging_hook = tf.train.LoggingTensorHook(
-      tensors=tensors_to_log, every_n_iter=50)
+tensors_to_log = {"probabilities": "softmax_tensor"}
+logging_hook = tf.train.LoggingTensorHook(
+    tensors=tensors_to_log, every_n_iter=50)
 ```
 
 We store a dict of the tensors we want to log in `tensors_to_log`. Each key is a
@@ -625,8 +627,8 @@ operation earlier when we generated the probabilities in `cnn_model_fn`.
 > Note: If you don't explicitly assign a name to an operation via the `name`
 > argument, TensorFlow will assign a default name. A couple easy ways to
 > discover the names applied to operations are to visualize your graph on
-> @{$graph_viz$TensorBoard}) or to enable the @{$debugger$TensorFlow Debugger
-> (tfdbg)}.
+> @{$graph_viz$TensorBoard}) or to enable the
+> @{$programmers_guide/debugger$TensorFlow Debugger (tfdbg)}.
 
 Next, we create the `LoggingTensorHook`, passing `tensors_to_log` to the
 `tensors` argument. We set `every_n_iter=50`, which specifies that probabilities
diff --git a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
index e22536adb6f0b893602ff79612cfb01e10586a18..5d83fbe2a3709c0834f448cbc316453f80428dd1 100644
--- a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
+++ b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
@@ -38,8 +38,8 @@ To try the code for this tutorial:
 1.  [Download the data](#download-the-data) in `TFRecord` format from
     [here](http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz) and unzip it. More details about [how to
     obtain the original Quick, Draw!
-    data](#optional-download-the-full-quick-draw-data) and [how to convert that
-    to `TFRecord` files](#optional-converting-the-data) is available below.
+    data](#optional_download_the_full_quick_draw_data) and [how to convert that
+    to `TFRecord` files](#optional_converting_the_data) is available below.
 
 1.  Execute the tutorial code with the following command to train the RNN-based
     model described in this tutorial. Make sure to adjust the paths to point to
@@ -108,8 +108,9 @@ This download will take a while and download a bit more than 23GB of data.
 ### Optional: Converting the data
 
 To convert the `ndjson` files to
-@{$python/python_io#tfrecords_format_details$TFRecord} files containing
-${tf.train.Example} protos run the following command.
+@{$python/python_io#TFRecords_Format_Details$TFRecord} files containing
+[`tf.train.Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+protos run the following command.
 
 ```shell
    python create_dataset.py --ndjson_path rnn_tutorial_data \
@@ -117,7 +118,7 @@ ${tf.train.Example} protos run the following command.
 ```
 
 This will store the data in 10 shards of
-@{$python/python_io#tfrecords_format_details$TFRecord} files with 10000 items
+@{$python/python_io#TFRecords_Format_Details$TFRecord} files with 10000 items
 per class for the training data and 1000 items per class as eval data.
 
 This conversion process is described in more detail in the following.
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index 005dc020f94f666da295f4ff0342fae858121012..27ce75a30dd2acd5925702611042270e767b0c73 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -74,8 +74,8 @@ Here's a list of columns available in the Census Income dataset:
 | relationship   | Categorical | Wife, Own-child, Husband,         |
 :                :             : Not-in-family, Other-relative,    :
 :                :             : Unmarried.                        :
-| race           | Categorical | White, Asian-Pac-Islander,        |
-:                :             : Amer-Indian-Eskimo, Other, Black. :
+| race           | Categorical | Amer-Indian-Eskimo, Asian-Pac-    |
+:                :             : Islander, Black, White, Other.    :
 | gender         | Categorical | Female, Male.                     |
 | capital_gain   | Continuous  | Capital gains recorded.           |
 | capital_loss   | Continuous  | Capital Losses recorded.          |
@@ -247,7 +247,7 @@ hours_per_week = tf.feature_column.numeric_column('hours_per_week')
 ### Making Continuous Features Categorical through Bucketization
 
 Sometimes the relationship between a continuous feature and the label is not
-linear. As an hypothetical example, a person's income may grow with age in the
+linear. As a hypothetical example, a person's income may grow with age in the
 early stage of one's career, then the growth may slow at some point, and finally
 the income decreases after retirement. In this scenario, using the raw `age` as
 a real-valued feature column might not be a good choice because the model can
@@ -361,6 +361,16 @@ The first line of the final output should be something like
 `accuracy: 0.83557522`, which means the accuracy is 83.6%. Feel free to try more
 features and transformations and see if you can do even better!
 
+After the model is evaluated, we can use the model to predict whether an individual has an annual income of over
+50,000 dollars given an individual's information input.
+```python
+  pred_iter = model.predict(input_fn=lambda: input_fn(FLAGS.test_data, 1, False, 1))
+  for pred in pred_iter:
+    print(pred['classes'])
+```
+
+The model prediction output would be like `[b'1']` or `[b'0']` which means whether corresponding individual has an annual income of over 50,000 dollars or not.
+
 If you'd like to see a working end-to-end example, you can download our
 [example code](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
 and set the `model_type` flag to `wide`.
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index b3ed6589ed062dc2331b7dc64184a2b39062271e..cf8054be6a3e89a307a10fdb711a62ac3a46d410 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -139,15 +139,3 @@ tf_cc_binary(
         "//tensorflow/core:framework",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/android/AndroidManifest.xml b/tensorflow/examples/android/AndroidManifest.xml
index bb75431a1f8bab2951299520903aa6e043f8415e..5c47ce6b673e4c9d635b867c1ccdc679f67c6ae5 100644
--- a/tensorflow/examples/android/AndroidManifest.xml
+++ b/tensorflow/examples/android/AndroidManifest.xml
@@ -40,6 +40,7 @@
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
                 <category android:name="android.intent.category.LAUNCHER" />
+                <category android:name="android.intent.category.LEANBACK_LAUNCHER" />
             </intent-filter>
         </activity>
 
@@ -49,6 +50,7 @@
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
                 <category android:name="android.intent.category.LAUNCHER" />
+                <category android:name="android.intent.category.LEANBACK_LAUNCHER" />
             </intent-filter>
         </activity>
 
@@ -58,6 +60,7 @@
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
                 <category android:name="android.intent.category.LAUNCHER" />
+                <category android:name="android.intent.category.LEANBACK_LAUNCHER" />
             </intent-filter>
         </activity>
 
@@ -67,6 +70,7 @@
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
                 <category android:name="android.intent.category.LAUNCHER" />
+                <category android:name="android.intent.category.LEANBACK_LAUNCHER" />
             </intent-filter>
         </activity>
     </application>
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 12146477972a116903f731a03b9755aafd92acc1..aa594a63c6ad5ab7129e452e7a6345114b994231 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -76,7 +76,6 @@ android_binary(
     custom_package = "org.tensorflow.demo",
     inline_constants = 1,
     manifest = "AndroidManifest.xml",
-    manifest_merger = "legacy",
     resource_files = glob(["res/**"]),
     tags = [
         "manual",
@@ -100,22 +99,6 @@ filegroup(
 )
 # LINT.ThenChange(//tensorflow/examples/android/download-models.gradle)
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-            "gradleBuild/**",
-            "libs/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "java_files",
     srcs = glob(["src/**/*.java"]),
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 8bd4abb154a8f8c74f2195d4acbb99d3d5d498ea..429138abe5338e63d602ef6005e7607d21e1e357 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -351,6 +351,10 @@ public abstract class CameraActivity extends Activity
 
   protected void setFragment() {
     String cameraId = chooseCamera();
+    if (cameraId == null) {
+      Toast.makeText(this, "No Camera Detected", Toast.LENGTH_SHORT).show();
+      finish();
+    }
 
     Fragment fragment;
     if (useCamera2API) {
@@ -416,7 +420,8 @@ public abstract class CameraActivity extends Activity
 
   @Override
   public boolean onKeyDown(final int keyCode, final KeyEvent event) {
-    if (keyCode == KeyEvent.KEYCODE_VOLUME_DOWN || keyCode == KeyEvent.KEYCODE_VOLUME_UP) {
+    if (keyCode == KeyEvent.KEYCODE_VOLUME_DOWN || keyCode == KeyEvent.KEYCODE_VOLUME_UP
+            || keyCode == KeyEvent.KEYCODE_BUTTON_L1 || keyCode == KeyEvent.KEYCODE_DPAD_CENTER) {
       debug = !debug;
       requestRender();
       onSetDebug(debug);
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java
index 8a1d86d9eedf3a1e1aa80e998ff150ad0c2447a1..1cddf3dc5568babb8c08c690fad143299f5ccca5 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java
@@ -332,8 +332,10 @@ public class SpeechActivity extends Activity {
                 }
                 final View labelView = labelsListView.getChildAt(labelIndex - 2);
 
-                AnimatorSet colorAnimation = (AnimatorSet) AnimatorInflater.loadAnimator(
-                    SpeechActivity.this, R.animator.color_animation);
+                AnimatorSet colorAnimation =
+                    (AnimatorSet)
+                        AnimatorInflater.loadAnimator(
+                            SpeechActivity.this, R.animator.color_animation);
                 colorAnimation.setTarget(labelView);
                 colorAnimation.start();
               }
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
index 6a66ec3927be62f1f996eb18bb6c04ea66f43152..33ec65e9f73a1d04bcafdc09d1618b32e03b1dc0 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
@@ -16,8 +16,10 @@
 
 package org.tensorflow.demo;
 
+import android.app.UiModeManager;
 import android.content.Context;
 import android.content.res.AssetManager;
+import android.content.res.Configuration;
 import android.graphics.Bitmap;
 import android.graphics.Bitmap.Config;
 import android.graphics.BitmapFactory;
@@ -31,9 +33,11 @@ import android.graphics.Typeface;
 import android.media.ImageReader.OnImageAvailableListener;
 import android.os.Bundle;
 import android.os.SystemClock;
+import android.util.DisplayMetrics;
 import android.util.Size;
 import android.util.TypedValue;
 import android.view.Display;
+import android.view.KeyEvent;
 import android.view.MotionEvent;
 import android.view.View;
 import android.view.View.OnClickListener;
@@ -43,6 +47,7 @@ import android.widget.BaseAdapter;
 import android.widget.Button;
 import android.widget.GridView;
 import android.widget.ImageView;
+import android.widget.RelativeLayout;
 import android.widget.Toast;
 import java.io.IOException;
 import java.io.InputStream;
@@ -381,6 +386,27 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
     grid = (GridView) findViewById(R.id.grid_layout);
     grid.setAdapter(adapter);
     grid.setOnTouchListener(gridTouchAdapter);
+
+    // Change UI on Android TV
+    UiModeManager uiModeManager = (UiModeManager) getSystemService(UI_MODE_SERVICE);
+    if (uiModeManager.getCurrentModeType() == Configuration.UI_MODE_TYPE_TELEVISION) {
+      DisplayMetrics displayMetrics = new DisplayMetrics();
+      getWindowManager().getDefaultDisplay().getMetrics(displayMetrics);
+      int styleSelectorHeight = displayMetrics.heightPixels;
+      int styleSelectorWidth = displayMetrics.widthPixels - styleSelectorHeight;
+      RelativeLayout.LayoutParams layoutParams = new RelativeLayout.LayoutParams(styleSelectorWidth, ViewGroup.LayoutParams.MATCH_PARENT);
+
+      // Calculate number of style in a row, so all the style can show up without scrolling
+      int numOfStylePerRow = 3;
+      while (styleSelectorWidth / numOfStylePerRow * Math.ceil((float) (adapter.getCount() - 2) / numOfStylePerRow) > styleSelectorHeight) {
+        numOfStylePerRow++;
+      }
+      grid.setNumColumns(numOfStylePerRow);
+      layoutParams.addRule(RelativeLayout.ALIGN_PARENT_RIGHT);
+      grid.setLayoutParams(layoutParams);
+      adapter.buttons.clear();
+    }
+
     setStyle(adapter.items[0], 1.0f);
   }
 
@@ -602,4 +628,38 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
 
     borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
   }
+
+  @Override
+  public boolean onKeyDown(int keyCode, KeyEvent event) {
+    int moveOffset = 0;
+    switch (keyCode) {
+      case KeyEvent.KEYCODE_DPAD_LEFT:
+        moveOffset = -1;
+        break;
+      case KeyEvent.KEYCODE_DPAD_RIGHT:
+        moveOffset = 1;
+        break;
+      case KeyEvent.KEYCODE_DPAD_UP:
+        moveOffset = -1 * grid.getNumColumns();
+        break;
+      case KeyEvent.KEYCODE_DPAD_DOWN:
+        moveOffset = grid.getNumColumns();
+        break;
+      default:
+        return super.onKeyDown(keyCode, event);
+    }
+
+    // get the highest selected style
+    int currentSelect = 0;
+    float highestValue = 0;
+    for (int i = 0; i < adapter.getCount(); i++) {
+      if (adapter.items[i].value > highestValue) {
+        currentSelect = i;
+        highestValue = adapter.items[i].value;
+      }
+    }
+    setStyle(adapter.items[(currentSelect + moveOffset + adapter.getCount()) % adapter.getCount()], 1);
+
+    return true;
+  }
 }
diff --git a/tensorflow/examples/benchmark/BUILD b/tensorflow/examples/benchmark/BUILD
index c4bb0a5bd952ea175a4fd2444a3d632dc13445de..98611a9aadf6f456dd4f9fe4f423e3e2ce9722ec 100644
--- a/tensorflow/examples/benchmark/BUILD
+++ b/tensorflow/examples/benchmark/BUILD
@@ -23,9 +23,3 @@ tf_py_logged_benchmark(
     name = "sample_logged_benchmark",
     target = "//tensorflow/examples/benchmark:sample_benchmark",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**/*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/get_started/regression/BUILD b/tensorflow/examples/get_started/regression/BUILD
index 577b970c9063dfa9a2acdb7d18362aa8adba827f..bee94d7d90fb3f70107a5dd9e9223f3013402073 100644
--- a/tensorflow/examples/get_started/regression/BUILD
+++ b/tensorflow/examples/get_started/regression/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_test(
     name = "test",
     size = "medium",
diff --git a/tensorflow/examples/get_started/regression/imports85.py b/tensorflow/examples/get_started/regression/imports85.py
index a8e4c782b3f7b5d01a91f38a48e5edb2202108de..4fdaceea9afee74550196031fe590c3a2abd20ed 100644
--- a/tensorflow/examples/get_started/regression/imports85.py
+++ b/tensorflow/examples/get_started/regression/imports85.py
@@ -131,11 +131,12 @@ def dataset(y_name="price", train_fraction=0.7):
     # booleans but we are dealing with symbolic tensors.
     return ~in_training_set(line)
 
-  base_dataset = (tf.data
-                  # Get the lines from the file.
-                  .TextLineDataset(path)
-                  # drop lines with question marks.
-                  .filter(has_no_question_marks))
+  base_dataset = (
+      tf.data
+      # Get the lines from the file.
+      .TextLineDataset(path)
+      # drop lines with question marks.
+      .filter(has_no_question_marks))
 
   train = (base_dataset
            # Take only the training-set lines.
diff --git a/tensorflow/examples/how_tos/reading_data/BUILD b/tensorflow/examples/how_tos/reading_data/BUILD
index 4a43585d5395b1df94dd8a8767f92f131cfcaea4..64a054d3712035252666ca84e676add3d079e52a 100644
--- a/tensorflow/examples/how_tos/reading_data/BUILD
+++ b/tensorflow/examples/how_tos/reading_data/BUILD
@@ -54,15 +54,3 @@ py_binary(
         "//tensorflow/examples/tutorials/mnist:input_data",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index 461fb1c5173f66278eb585d30bd8749a58fb6245..307eede5c03780e9244b035f020fc7846290d4d9 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -45,6 +45,7 @@ VALIDATION_FILE = 'validation.tfrecords'
 
 
 def decode(serialized_example):
+  """Parses an image and label from the given `serialized_example`."""
   features = tf.parse_single_example(
       serialized_example,
       # Defaults are not specified since both keys are required.
@@ -66,6 +67,7 @@ def decode(serialized_example):
 
 
 def augment(image, label):
+  """Placeholder for data augmentation."""
   # OPTIONAL: Could reshape into a 28x28 image and apply distortions
   # here.  Since we are not applying any distortions in this
   # example, and the next step expects the image to be flattened
@@ -74,9 +76,8 @@ def augment(image, label):
 
 
 def normalize(image, label):
-  # Convert from [0, 255] -> [-0.5, 0.5] floats.
+  """Convert `image` from [0, 255] -> [-0.5, 0.5] floats."""
   image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
-
   return image, label
 
 
@@ -106,18 +107,23 @@ def inputs(train, batch_size, num_epochs):
                           if train else VALIDATION_FILE)
 
   with tf.name_scope('input'):
-    # TFRecordDataset opens a protobuf and reads entries line by line
-    # could also be [list, of, filenames]
+    # TFRecordDataset opens a binary file and reads one record at a time.
+    # `filename` could also be a list of filenames, which will be read in order.
     dataset = tf.data.TFRecordDataset(filename)
-    dataset = dataset.repeat(num_epochs)
 
-    # map takes a python function and applies it to every sample
+    # The map transformation takes a function and applies it to every element
+    # of the dataset.
     dataset = dataset.map(decode)
     dataset = dataset.map(augment)
     dataset = dataset.map(normalize)
 
-    #the parameter is the queue size
+    # The shuffle transformation uses a finite-sized buffer to shuffle elements
+    # in memory. The parameter is the number of elements in the buffer. For
+    # completely uniform shuffling, set the parameter to be the same as the
+    # number of elements in the dataset.
     dataset = dataset.shuffle(1000 + 3 * batch_size)
+
+    dataset = dataset.repeat(num_epochs)
     dataset = dataset.batch(batch_size)
 
     iterator = dataset.make_one_shot_iterator()
@@ -153,7 +159,7 @@ def run_training():
       sess.run(init_op)
       try:
         step = 0
-        while True:  #train until OutOfRangeError
+        while True:  # Train until OutOfRangeError
           start_time = time.time()
 
           # Run one step of the model.  The return values are
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
deleted file mode 100644
index 9f9244a74c4d073cc67b7c8252b0bcff86e9400f..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/BUILD
+++ /dev/null
@@ -1,63 +0,0 @@
-# Description:
-# Transfer learning example for TensorFlow.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-py_binary(
-    name = "retrain",
-    srcs = [
-        "retrain.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "retrain_test",
-    size = "small",
-    srcs = [
-        "retrain.py",
-        "retrain_test.py",
-    ],
-    data = [
-        ":data/labels.txt",
-        "//tensorflow/examples/label_image:data/grace_hopper.jpg",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":retrain",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/image_retraining/README.md b/tensorflow/examples/image_retraining/README.md
index 8a49525c6eff003f2c7acb592f213285e627eb51..3f0b3d12682b81e7c13ed6c2d32149746d506cd3 100644
--- a/tensorflow/examples/image_retraining/README.md
+++ b/tensorflow/examples/image_retraining/README.md
@@ -1,12 +1,15 @@
-retrain.py is an example script that shows how one can adapt a pretrained
-network for other classification problems. A detailed overview of this script
-can be found at:
-https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0
+**NOTE: This code has moved to**
+https://github.com/tensorflow/hub/tree/master/examples/image_retraining
 
-The script also shows how one can train layers
-with quantized weights and activations instead of taking a pre-trained floating
-point model and then quantizing weights and activations.
-The output graphdef produced by this script is compatible with the TensorFlow
-Lite Optimizing Converter and can be converted to TFLite format.
+retrain.py is an example script that shows how one can adapt a pretrained
+network for other classification problems (including use with TFLite and
+quantization).
 
+As of TensorFlow 1.7, it is recommended to use a pretrained network from
+TensorFlow Hub, using the new version of this example found in the location
+above, as explained in TensorFlow's revised [image retraining
+tutorial](https://www.tensorflow.org/tutorials/image_retraining).
 
+Older versions of this example (using frozen GraphDefs instead of
+TensorFlow Hub modules) are available in the release branches of
+TensorFlow versions up to and including 1.7.
diff --git a/tensorflow/examples/image_retraining/data/labels.txt b/tensorflow/examples/image_retraining/data/labels.txt
deleted file mode 100644
index bc1131ac4591ca1bdb840695b55f79a6feb95db3..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/data/labels.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Runner-up
-Winner
-Loser
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
deleted file mode 100644
index c49e7e7ee2e397e353b468c727263ff3eb931401..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/retrain.py
+++ /dev/null
@@ -1,1421 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Simple transfer learning with Inception v3 or Mobilenet models.
-
-With support for TensorBoard.
-
-This example shows how to take a Inception v3 or Mobilenet model trained on
-ImageNet images, and train a new top layer that can recognize other classes of
-images.
-
-The top layer receives as input a 2048-dimensional vector (1001-dimensional for
-Mobilenet) for each image. We train a softmax layer on top of this
-representation. Assuming the softmax layer contains N labels, this corresponds
-to learning N + 2048*N (or 1001*N)  model parameters corresponding to the
-learned biases and weights.
-
-Here's an example, which assumes you have a folder containing class-named
-subfolders, each full of images for each label. The example folder flower_photos
-should have a structure like this:
-
-~/flower_photos/daisy/photo1.jpg
-~/flower_photos/daisy/photo2.jpg
-...
-~/flower_photos/rose/anotherphoto77.jpg
-...
-~/flower_photos/sunflower/somepicture.jpg
-
-The subfolder names are important, since they define what label is applied to
-each image, but the filenames themselves don't matter. Once your images are
-prepared, you can run the training with a command like this:
-
-```bash
-bazel build tensorflow/examples/image_retraining:retrain && \
-bazel-bin/tensorflow/examples/image_retraining/retrain \
-    --image_dir ~/flower_photos
-```
-
-Or, if you have a pip installation of tensorflow, `retrain.py` can be run
-without bazel:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos
-```
-
-You can replace the image_dir argument with any folder containing subfolders of
-images. The label for each image is taken from the name of the subfolder it's
-in.
-
-This produces a new model file that can be loaded and run by any TensorFlow
-program, for example the label_image sample code.
-
-By default this script will use the high accuracy, but comparatively large and
-slow Inception v3 model architecture. It's recommended that you start with this
-to validate that you have gathered good training data, but if you want to deploy
-on resource-limited platforms, you can try the `--architecture` flag with a
-Mobilenet model. For example:
-
-Run floating-point version of mobilenet:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_1.0_224
-```
-
-Run quantized version of mobilenet:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos/   --architecture mobilenet_1.0_224_quantized
-```
-
-There are 32 different Mobilenet models to choose from, with a variety of file
-size and latency options. The first number can be '1.0', '0.75', '0.50', or
-'0.25' to control the size, and the second controls the input image size, either
-'224', '192', '160', or '128', with smaller sizes running faster. See
-https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
-for more information on Mobilenet.
-
-To use with TensorBoard:
-
-By default, this script will log summaries to /tmp/retrain_logs directory
-
-Visualize the summaries with this command:
-
-tensorboard --logdir /tmp/retrain_logs
-
-To use with Tensorflow Serving:
-
-```bash
-tensorflow_model_server --port=9000 --model_name=inception \
-    --model_base_path=/tmp/saved_models/
-```
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-from datetime import datetime
-import hashlib
-import os.path
-import random
-import re
-import sys
-import tarfile
-
-import numpy as np
-from six.moves import urllib
-import tensorflow as tf
-
-from tensorflow.contrib.quantize.python import quant_ops
-from tensorflow.python.framework import graph_util
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.platform import gfile
-from tensorflow.python.util import compat
-
-FLAGS = None
-
-# These are all parameters that are tied to the particular model architecture
-# we're using for Inception v3. These include things like tensor names and their
-# sizes. If you want to adapt this script to work with another model, you will
-# need to update these to reflect the values in the network you're using.
-MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1  # ~134M
-
-
-def create_image_lists(image_dir, testing_percentage, validation_percentage):
-  """Builds a list of training images from the file system.
-
-  Analyzes the sub folders in the image directory, splits them into stable
-  training, testing, and validation sets, and returns a data structure
-  describing the lists of images for each label and their paths.
-
-  Args:
-    image_dir: String path to a folder containing subfolders of images.
-    testing_percentage: Integer percentage of the images to reserve for tests.
-    validation_percentage: Integer percentage of images reserved for validation.
-
-  Returns:
-    A dictionary containing an entry for each label subfolder, with images split
-    into training, testing, and validation sets within each label.
-  """
-  if not gfile.Exists(image_dir):
-    tf.logging.error("Image directory '" + image_dir + "' not found.")
-    return None
-  result = {}
-  sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
-  # The root directory comes first, so skip it.
-  is_root_dir = True
-  for sub_dir in sub_dirs:
-    if is_root_dir:
-      is_root_dir = False
-      continue
-    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
-    file_list = []
-    dir_name = os.path.basename(sub_dir)
-    if dir_name == image_dir:
-      continue
-    tf.logging.info("Looking for images in '" + dir_name + "'")
-    for extension in extensions:
-      file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
-      file_list.extend(gfile.Glob(file_glob))
-    if not file_list:
-      tf.logging.warning('No files found')
-      continue
-    if len(file_list) < 20:
-      tf.logging.warning(
-          'WARNING: Folder has less than 20 images, which may cause issues.')
-    elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
-      tf.logging.warning(
-          'WARNING: Folder {} has more than {} images. Some images will '
-          'never be selected.'.format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
-    label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
-    training_images = []
-    testing_images = []
-    validation_images = []
-    for file_name in file_list:
-      base_name = os.path.basename(file_name)
-      # We want to ignore anything after '_nohash_' in the file name when
-      # deciding which set to put an image in, the data set creator has a way of
-      # grouping photos that are close variations of each other. For example
-      # this is used in the plant disease data set to group multiple pictures of
-      # the same leaf.
-      hash_name = re.sub(r'_nohash_.*$', '', file_name)
-      # This looks a bit magical, but we need to decide whether this file should
-      # go into the training, testing, or validation sets, and we want to keep
-      # existing files in the same set even if more files are subsequently
-      # added.
-      # To do that, we need a stable way of deciding based on just the file name
-      # itself, so we do a hash of that and then use that to generate a
-      # probability value that we use to assign it.
-      hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
-      percentage_hash = ((int(hash_name_hashed, 16) %
-                          (MAX_NUM_IMAGES_PER_CLASS + 1)) *
-                         (100.0 / MAX_NUM_IMAGES_PER_CLASS))
-      if percentage_hash < validation_percentage:
-        validation_images.append(base_name)
-      elif percentage_hash < (testing_percentage + validation_percentage):
-        testing_images.append(base_name)
-      else:
-        training_images.append(base_name)
-    result[label_name] = {
-        'dir': dir_name,
-        'training': training_images,
-        'testing': testing_images,
-        'validation': validation_images,
-    }
-  return result
-
-
-def get_image_path(image_lists, label_name, index, image_dir, category):
-  """"Returns a path to an image for a label at the given index.
-
-  Args:
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Int offset of the image we want. This will be moduloed by the
-    available number of images for the label, so it can be arbitrarily large.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    category: Name string of set to pull images from - training, testing, or
-    validation.
-
-  Returns:
-    File system path string to an image that meets the requested parameters.
-
-  """
-  if label_name not in image_lists:
-    tf.logging.fatal('Label does not exist %s.', label_name)
-  label_lists = image_lists[label_name]
-  if category not in label_lists:
-    tf.logging.fatal('Category does not exist %s.', category)
-  category_list = label_lists[category]
-  if not category_list:
-    tf.logging.fatal('Label %s has no images in the category %s.',
-                     label_name, category)
-  mod_index = index % len(category_list)
-  base_name = category_list[mod_index]
-  sub_dir = label_lists['dir']
-  full_path = os.path.join(image_dir, sub_dir, base_name)
-  return full_path
-
-
-def get_bottleneck_path(image_lists, label_name, index, bottleneck_dir,
-                        category, architecture):
-  """"Returns a path to a bottleneck file for a label at the given index.
-
-  Args:
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Integer offset of the image we want. This will be moduloed by the
-    available number of images for the label, so it can be arbitrarily large.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    category: Name string of set to pull images from - training, testing, or
-    validation.
-    architecture: The name of the model architecture.
-
-  Returns:
-    File system path string to an image that meets the requested parameters.
-  """
-  return get_image_path(image_lists, label_name, index, bottleneck_dir,
-                        category) + '_' + architecture + '.txt'
-
-
-def create_model_graph(model_info):
-  """"Creates a graph from saved GraphDef file and returns a Graph object.
-
-  Args:
-    model_info: Dictionary containing information about the model architecture.
-
-  Returns:
-    Graph holding the trained Inception network, and various tensors we'll be
-    manipulating.
-  """
-  with tf.Graph().as_default() as graph:
-    model_path = os.path.join(FLAGS.model_dir, model_info['model_file_name'])
-    print('Model path: ', model_path)
-    with gfile.FastGFile(model_path, 'rb') as f:
-      graph_def = tf.GraphDef()
-      graph_def.ParseFromString(f.read())
-      bottleneck_tensor, resized_input_tensor = (tf.import_graph_def(
-          graph_def,
-          name='',
-          return_elements=[
-              model_info['bottleneck_tensor_name'],
-              model_info['resized_input_tensor_name'],
-          ]))
-  return graph, bottleneck_tensor, resized_input_tensor
-
-
-def run_bottleneck_on_image(sess, image_data, image_data_tensor,
-                            decoded_image_tensor, resized_input_tensor,
-                            bottleneck_tensor):
-  """Runs inference on an image to extract the 'bottleneck' summary layer.
-
-  Args:
-    sess: Current active TensorFlow Session.
-    image_data: String of raw JPEG data.
-    image_data_tensor: Input data layer in the graph.
-    decoded_image_tensor: Output of initial image resizing and preprocessing.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: Layer before the final softmax.
-
-  Returns:
-    Numpy array of bottleneck values.
-  """
-  # First decode the JPEG image, resize it, and rescale the pixel values.
-  resized_input_values = sess.run(decoded_image_tensor,
-                                  {image_data_tensor: image_data})
-  # Then run it through the recognition network.
-  bottleneck_values = sess.run(bottleneck_tensor,
-                               {resized_input_tensor: resized_input_values})
-  bottleneck_values = np.squeeze(bottleneck_values)
-  return bottleneck_values
-
-
-def maybe_download_and_extract(data_url):
-  """Download and extract model tar file.
-
-  If the pretrained model we're using doesn't already exist, this function
-  downloads it from the TensorFlow.org website and unpacks it into a directory.
-
-  Args:
-    data_url: Web location of the tar file containing the pretrained model.
-  """
-  dest_directory = FLAGS.model_dir
-  if not os.path.exists(dest_directory):
-    os.makedirs(dest_directory)
-  filename = data_url.split('/')[-1]
-  filepath = os.path.join(dest_directory, filename)
-  if not os.path.exists(filepath):
-
-    def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' %
-                       (filename,
-                        float(count * block_size) / float(total_size) * 100.0))
-      sys.stdout.flush()
-
-    filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
-    print()
-    statinfo = os.stat(filepath)
-    tf.logging.info('Successfully downloaded %s %d bytes.',
-                    filename, statinfo.st_size)
-    print('Extracting file from ', filepath)
-    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
-  else:
-    print('Not extracting or downloading files, model already present in disk')
-
-
-def ensure_dir_exists(dir_name):
-  """Makes sure the folder exists on disk.
-
-  Args:
-    dir_name: Path string to the folder we want to create.
-  """
-  if not os.path.exists(dir_name):
-    os.makedirs(dir_name)
-
-
-bottleneck_path_2_bottleneck_values = {}
-
-
-def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor):
-  """Create a single bottleneck file."""
-  tf.logging.info('Creating bottleneck at ' + bottleneck_path)
-  image_path = get_image_path(image_lists, label_name, index,
-                              image_dir, category)
-  if not gfile.Exists(image_path):
-    tf.logging.fatal('File does not exist %s', image_path)
-  image_data = gfile.FastGFile(image_path, 'rb').read()
-  try:
-    bottleneck_values = run_bottleneck_on_image(
-        sess, image_data, jpeg_data_tensor, decoded_image_tensor,
-        resized_input_tensor, bottleneck_tensor)
-  except Exception as e:
-    raise RuntimeError('Error during processing file %s (%s)' % (image_path,
-                                                                 str(e)))
-  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
-  with open(bottleneck_path, 'w') as bottleneck_file:
-    bottleneck_file.write(bottleneck_string)
-
-
-def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
-                             category, bottleneck_dir, jpeg_data_tensor,
-                             decoded_image_tensor, resized_input_tensor,
-                             bottleneck_tensor, architecture):
-  """Retrieves or calculates bottleneck values for an image.
-
-  If a cached version of the bottleneck data exists on-disk, return that,
-  otherwise calculate the data and save it to disk for future use.
-
-  Args:
-    sess: The current active TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Integer offset of the image we want. This will be modulo-ed by the
-    available number of images for the label, so it can be arbitrarily large.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    category: Name string of which set to pull images from - training, testing,
-    or validation.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    jpeg_data_tensor: The tensor to feed loaded jpeg data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The output tensor for the bottleneck values.
-    architecture: The name of the model architecture.
-
-  Returns:
-    Numpy array of values produced by the bottleneck layer for the image.
-  """
-  label_lists = image_lists[label_name]
-  sub_dir = label_lists['dir']
-  sub_dir_path = os.path.join(bottleneck_dir, sub_dir)
-  ensure_dir_exists(sub_dir_path)
-  bottleneck_path = get_bottleneck_path(image_lists, label_name, index,
-                                        bottleneck_dir, category, architecture)
-  if not os.path.exists(bottleneck_path):
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor)
-  with open(bottleneck_path, 'r') as bottleneck_file:
-    bottleneck_string = bottleneck_file.read()
-  did_hit_error = False
-  try:
-    bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  except ValueError:
-    tf.logging.warning('Invalid float found, recreating bottleneck')
-    did_hit_error = True
-  if did_hit_error:
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor)
-    with open(bottleneck_path, 'r') as bottleneck_file:
-      bottleneck_string = bottleneck_file.read()
-    # Allow exceptions to propagate here, since they shouldn't happen after a
-    # fresh creation
-    bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  return bottleneck_values
-
-
-def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
-                      jpeg_data_tensor, decoded_image_tensor,
-                      resized_input_tensor, bottleneck_tensor, architecture):
-  """Ensures all the training, testing, and validation bottlenecks are cached.
-
-  Because we're likely to read the same image multiple times (if there are no
-  distortions applied during training) it can speed things up a lot if we
-  calculate the bottleneck layer values once for each image during
-  preprocessing, and then just read those cached values repeatedly during
-  training. Here we go through all the images we've found, calculate those
-  values, and save them off.
-
-  Args:
-    sess: The current active TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    jpeg_data_tensor: Input tensor for jpeg data from file.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The penultimate output layer of the graph.
-    architecture: The name of the model architecture.
-
-  Returns:
-    Nothing.
-  """
-  how_many_bottlenecks = 0
-  ensure_dir_exists(bottleneck_dir)
-  for label_name, label_lists in image_lists.items():
-    for category in ['training', 'testing', 'validation']:
-      category_list = label_lists[category]
-      for index, unused_base_name in enumerate(category_list):
-        get_or_create_bottleneck(
-            sess, image_lists, label_name, index, image_dir, category,
-            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-            resized_input_tensor, bottleneck_tensor, architecture)
-
-        how_many_bottlenecks += 1
-        if how_many_bottlenecks % 100 == 0:
-          tf.logging.info(
-              str(how_many_bottlenecks) + ' bottleneck files created.')
-
-
-def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
-                                  bottleneck_dir, image_dir, jpeg_data_tensor,
-                                  decoded_image_tensor, resized_input_tensor,
-                                  bottleneck_tensor, architecture):
-  """Retrieves bottleneck values for cached images.
-
-  If no distortions are being applied, this function can retrieve the cached
-  bottleneck values directly from disk for images. It picks a random set of
-  images from the specified category.
-
-  Args:
-    sess: Current TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    how_many: If positive, a random sample of this size will be chosen.
-    If negative, all bottlenecks will be retrieved.
-    category: Name string of which set to pull from - training, testing, or
-    validation.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    jpeg_data_tensor: The layer to feed jpeg image data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-    architecture: The name of the model architecture.
-
-  Returns:
-    List of bottleneck arrays, their corresponding ground truths, and the
-    relevant filenames.
-  """
-  class_count = len(image_lists.keys())
-  bottlenecks = []
-  ground_truths = []
-  filenames = []
-  if how_many >= 0:
-    # Retrieve a random sample of bottlenecks.
-    for unused_i in range(how_many):
-      label_index = random.randrange(class_count)
-      label_name = list(image_lists.keys())[label_index]
-      image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
-      image_name = get_image_path(image_lists, label_name, image_index,
-                                  image_dir, category)
-      bottleneck = get_or_create_bottleneck(
-          sess, image_lists, label_name, image_index, image_dir, category,
-          bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-          resized_input_tensor, bottleneck_tensor, architecture)
-      bottlenecks.append(bottleneck)
-      ground_truths.append(label_index)
-      filenames.append(image_name)
-  else:
-    # Retrieve all bottlenecks.
-    for label_index, label_name in enumerate(image_lists.keys()):
-      for image_index, image_name in enumerate(
-          image_lists[label_name][category]):
-        image_name = get_image_path(image_lists, label_name, image_index,
-                                    image_dir, category)
-        bottleneck = get_or_create_bottleneck(
-            sess, image_lists, label_name, image_index, image_dir, category,
-            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-            resized_input_tensor, bottleneck_tensor, architecture)
-        bottlenecks.append(bottleneck)
-        ground_truths.append(label_index)
-        filenames.append(image_name)
-  return bottlenecks, ground_truths, filenames
-
-
-def get_random_distorted_bottlenecks(
-    sess, image_lists, how_many, category, image_dir, input_jpeg_tensor,
-    distorted_image, resized_input_tensor, bottleneck_tensor):
-  """Retrieves bottleneck values for training images, after distortions.
-
-  If we're training with distortions like crops, scales, or flips, we have to
-  recalculate the full model for every image, and so we can't use cached
-  bottleneck values. Instead we find random images for the requested category,
-  run them through the distortion graph, and then the full graph to get the
-  bottleneck results for each.
-
-  Args:
-    sess: Current TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    how_many: The integer number of bottleneck values to return.
-    category: Name string of which set of images to fetch - training, testing,
-    or validation.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    input_jpeg_tensor: The input layer we feed the image data to.
-    distorted_image: The output node of the distortion graph.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-
-  Returns:
-    List of bottleneck arrays and their corresponding ground truths.
-  """
-  class_count = len(image_lists.keys())
-  bottlenecks = []
-  ground_truths = []
-  for unused_i in range(how_many):
-    label_index = random.randrange(class_count)
-    label_name = list(image_lists.keys())[label_index]
-    image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
-    image_path = get_image_path(image_lists, label_name, image_index, image_dir,
-                                category)
-    if not gfile.Exists(image_path):
-      tf.logging.fatal('File does not exist %s', image_path)
-    jpeg_data = gfile.FastGFile(image_path, 'rb').read()
-    # Note that we materialize the distorted_image_data as a numpy array before
-    # sending running inference on the image. This involves 2 memory copies and
-    # might be optimized in other implementations.
-    distorted_image_data = sess.run(distorted_image,
-                                    {input_jpeg_tensor: jpeg_data})
-    bottleneck_values = sess.run(bottleneck_tensor,
-                                 {resized_input_tensor: distorted_image_data})
-    bottleneck_values = np.squeeze(bottleneck_values)
-    bottlenecks.append(bottleneck_values)
-    ground_truths.append(label_index)
-  return bottlenecks, ground_truths
-
-
-def should_distort_images(flip_left_right, random_crop, random_scale,
-                          random_brightness):
-  """Whether any distortions are enabled, from the input flags.
-
-  Args:
-    flip_left_right: Boolean whether to randomly mirror images horizontally.
-    random_crop: Integer percentage setting the total margin used around the
-    crop box.
-    random_scale: Integer percentage of how much to vary the scale by.
-    random_brightness: Integer range to randomly multiply the pixel values by.
-
-  Returns:
-    Boolean value indicating whether any distortions should be applied.
-  """
-  return (flip_left_right or (random_crop != 0) or (random_scale != 0) or
-          (random_brightness != 0))
-
-
-def add_input_distortions(flip_left_right, random_crop, random_scale,
-                          random_brightness, input_width, input_height,
-                          input_depth, input_mean, input_std):
-  """Creates the operations to apply the specified distortions.
-
-  During training it can help to improve the results if we run the images
-  through simple distortions like crops, scales, and flips. These reflect the
-  kind of variations we expect in the real world, and so can help train the
-  model to cope with natural data more effectively. Here we take the supplied
-  parameters and construct a network of operations to apply them to an image.
-
-  Cropping
-  ~~~~~~~~
-
-  Cropping is done by placing a bounding box at a random position in the full
-  image. The cropping parameter controls the size of that box relative to the
-  input image. If it's zero, then the box is the same size as the input and no
-  cropping is performed. If the value is 50%, then the crop box will be half the
-  width and height of the input. In a diagram it looks like this:
-
-  <       width         >
-  +---------------------+
-  |                     |
-  |   width - crop%     |
-  |    <      >         |
-  |    +------+         |
-  |    |      |         |
-  |    |      |         |
-  |    |      |         |
-  |    +------+         |
-  |                     |
-  |                     |
-  +---------------------+
-
-  Scaling
-  ~~~~~~~
-
-  Scaling is a lot like cropping, except that the bounding box is always
-  centered and its size varies randomly within the given range. For example if
-  the scale percentage is zero, then the bounding box is the same size as the
-  input and no scaling is applied. If it's 50%, then the bounding box will be in
-  a random range between half the width and height and full size.
-
-  Args:
-    flip_left_right: Boolean whether to randomly mirror images horizontally.
-    random_crop: Integer percentage setting the total margin used around the
-    crop box.
-    random_scale: Integer percentage of how much to vary the scale by.
-    random_brightness: Integer range to randomly multiply the pixel values by.
-    graph.
-    input_width: Horizontal size of expected input image to model.
-    input_height: Vertical size of expected input image to model.
-    input_depth: How many channels the expected input image should have.
-    input_mean: Pixel value that should be zero in the image for the graph.
-    input_std: How much to divide the pixel values by before recognition.
-
-  Returns:
-    The jpeg input layer and the distorted result tensor.
-  """
-
-  jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
-  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
-  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
-  margin_scale = 1.0 + (random_crop / 100.0)
-  resize_scale = 1.0 + (random_scale / 100.0)
-  margin_scale_value = tf.constant(margin_scale)
-  resize_scale_value = tf.random_uniform(tensor_shape.scalar(),
-                                         minval=1.0,
-                                         maxval=resize_scale)
-  scale_value = tf.multiply(margin_scale_value, resize_scale_value)
-  precrop_width = tf.multiply(scale_value, input_width)
-  precrop_height = tf.multiply(scale_value, input_height)
-  precrop_shape = tf.stack([precrop_height, precrop_width])
-  precrop_shape_as_int = tf.cast(precrop_shape, dtype=tf.int32)
-  precropped_image = tf.image.resize_bilinear(decoded_image_4d,
-                                              precrop_shape_as_int)
-  precropped_image_3d = tf.squeeze(precropped_image, squeeze_dims=[0])
-  cropped_image = tf.random_crop(precropped_image_3d,
-                                 [input_height, input_width, input_depth])
-  if flip_left_right:
-    flipped_image = tf.image.random_flip_left_right(cropped_image)
-  else:
-    flipped_image = cropped_image
-  brightness_min = 1.0 - (random_brightness / 100.0)
-  brightness_max = 1.0 + (random_brightness / 100.0)
-  brightness_value = tf.random_uniform(tensor_shape.scalar(),
-                                       minval=brightness_min,
-                                       maxval=brightness_max)
-  brightened_image = tf.multiply(flipped_image, brightness_value)
-  offset_image = tf.subtract(brightened_image, input_mean)
-  mul_image = tf.multiply(offset_image, 1.0 / input_std)
-  distort_result = tf.expand_dims(mul_image, 0, name='DistortResult')
-  return jpeg_data, distort_result
-
-
-def variable_summaries(var):
-  """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
-  with tf.name_scope('summaries'):
-    mean = tf.reduce_mean(var)
-    tf.summary.scalar('mean', mean)
-    with tf.name_scope('stddev'):
-      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
-    tf.summary.scalar('stddev', stddev)
-    tf.summary.scalar('max', tf.reduce_max(var))
-    tf.summary.scalar('min', tf.reduce_min(var))
-    tf.summary.histogram('histogram', var)
-
-
-def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
-                           bottleneck_tensor_size, quantize_layer):
-  """Adds a new softmax and fully-connected layer for training.
-
-  We need to retrain the top layer to identify our new classes, so this function
-  adds the right operations to the graph, along with some variables to hold the
-  weights, and then sets up all the gradients for the backward pass.
-
-  The set up for the softmax and fully-connected layers is based on:
-  https://www.tensorflow.org/versions/master/tutorials/mnist/beginners/index.html
-
-  Args:
-    class_count: Integer of how many categories of things we're trying to
-        recognize.
-    final_tensor_name: Name string for the new final node that produces results.
-    bottleneck_tensor: The output of the main CNN graph.
-    bottleneck_tensor_size: How many entries in the bottleneck vector.
-    quantize_layer: Boolean, specifying whether the newly added layer should be
-        quantized.
-
-  Returns:
-    The tensors for the training and cross entropy results, and tensors for the
-    bottleneck input and ground truth input.
-  """
-  with tf.name_scope('input'):
-    bottleneck_input = tf.placeholder_with_default(
-        bottleneck_tensor,
-        shape=[None, bottleneck_tensor_size],
-        name='BottleneckInputPlaceholder')
-
-    ground_truth_input = tf.placeholder(
-        tf.int64, [None], name='GroundTruthInput')
-
-  # Organizing the following ops as `final_training_ops` so they're easier
-  # to see in TensorBoard
-  layer_name = 'final_training_ops'
-  with tf.name_scope(layer_name):
-    with tf.name_scope('weights'):
-      initial_value = tf.truncated_normal(
-          [bottleneck_tensor_size, class_count], stddev=0.001)
-      layer_weights = tf.Variable(initial_value, name='final_weights')
-      if quantize_layer:
-        quantized_layer_weights = quant_ops.MovingAvgQuantize(
-            layer_weights, is_training=True)
-        variable_summaries(quantized_layer_weights)
-
-      variable_summaries(layer_weights)
-    with tf.name_scope('biases'):
-      layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
-      if quantize_layer:
-        quantized_layer_biases = quant_ops.MovingAvgQuantize(
-            layer_biases, is_training=True)
-        variable_summaries(quantized_layer_biases)
-
-      variable_summaries(layer_biases)
-
-    with tf.name_scope('Wx_plus_b'):
-      if quantize_layer:
-        logits = tf.matmul(bottleneck_input,
-                           quantized_layer_weights) + quantized_layer_biases
-        logits = quant_ops.MovingAvgQuantize(
-            logits,
-            init_min=-32.0,
-            init_max=32.0,
-            is_training=True,
-            num_bits=8,
-            narrow_range=False,
-            ema_decay=0.5)
-        tf.summary.histogram('pre_activations', logits)
-      else:
-        logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
-        tf.summary.histogram('pre_activations', logits)
-
-  final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
-
-  tf.summary.histogram('activations', final_tensor)
-
-  with tf.name_scope('cross_entropy'):
-    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
-        labels=ground_truth_input, logits=logits)
-
-  tf.summary.scalar('cross_entropy', cross_entropy_mean)
-
-  with tf.name_scope('train'):
-    optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
-    train_step = optimizer.minimize(cross_entropy_mean)
-
-  return (train_step, cross_entropy_mean, bottleneck_input, ground_truth_input,
-          final_tensor)
-
-
-def add_evaluation_step(result_tensor, ground_truth_tensor):
-  """Inserts the operations we need to evaluate the accuracy of our results.
-
-  Args:
-    result_tensor: The new final node that produces results.
-    ground_truth_tensor: The node we feed ground truth data
-    into.
-
-  Returns:
-    Tuple of (evaluation step, prediction).
-  """
-  with tf.name_scope('accuracy'):
-    with tf.name_scope('correct_prediction'):
-      prediction = tf.argmax(result_tensor, 1)
-      correct_prediction = tf.equal(prediction, ground_truth_tensor)
-    with tf.name_scope('accuracy'):
-      evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  tf.summary.scalar('accuracy', evaluation_step)
-  return evaluation_step, prediction
-
-
-def save_graph_to_file(sess, graph, graph_file_name):
-  output_graph_def = graph_util.convert_variables_to_constants(
-      sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-
-  with gfile.FastGFile(graph_file_name, 'wb') as f:
-    f.write(output_graph_def.SerializeToString())
-  return
-
-
-def prepare_file_system():
-  # Setup the directory we'll write summaries to for TensorBoard
-  if tf.gfile.Exists(FLAGS.summaries_dir):
-    tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
-  tf.gfile.MakeDirs(FLAGS.summaries_dir)
-  if FLAGS.intermediate_store_frequency > 0:
-    ensure_dir_exists(FLAGS.intermediate_output_graphs_dir)
-  return
-
-
-def create_model_info(architecture):
-  """Given the name of a model architecture, returns information about it.
-
-  There are different base image recognition pretrained models that can be
-  retrained using transfer learning, and this function translates from the name
-  of a model to the attributes that are needed to download and train with it.
-
-  Args:
-    architecture: Name of a model architecture.
-
-  Returns:
-    Dictionary of information about the model, or None if the name isn't
-    recognized
-
-  Raises:
-    ValueError: If architecture name is unknown.
-  """
-  architecture = architecture.lower()
-  is_quantized = False
-  if architecture == 'inception_v3':
-    # pylint: disable=line-too-long
-    data_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
-    # pylint: enable=line-too-long
-    bottleneck_tensor_name = 'pool_3/_reshape:0'
-    bottleneck_tensor_size = 2048
-    input_width = 299
-    input_height = 299
-    input_depth = 3
-    resized_input_tensor_name = 'Mul:0'
-    model_file_name = 'classify_image_graph_def.pb'
-    input_mean = 128
-    input_std = 128
-  elif architecture.startswith('mobilenet_'):
-    parts = architecture.split('_')
-    if len(parts) != 3 and len(parts) != 4:
-      tf.logging.error("Couldn't understand architecture name '%s'",
-                       architecture)
-      return None
-    version_string = parts[1]
-    if (version_string != '1.0' and version_string != '0.75' and
-        version_string != '0.50' and version_string != '0.25'):
-      tf.logging.error(
-          """"The Mobilenet version should be '1.0', '0.75', '0.50', or '0.25',
-  but found '%s' for architecture '%s'""",
-          version_string, architecture)
-      return None
-    size_string = parts[2]
-    if (size_string != '224' and size_string != '192' and
-        size_string != '160' and size_string != '128'):
-      tf.logging.error(
-          """The Mobilenet input size should be '224', '192', '160', or '128',
- but found '%s' for architecture '%s'""",
-          size_string, architecture)
-      return None
-    if len(parts) == 3:
-      is_quantized = False
-    else:
-      if parts[3] != 'quantized':
-        tf.logging.error(
-            "Couldn't understand architecture suffix '%s' for '%s'", parts[3],
-            architecture)
-        return None
-      is_quantized = True
-
-    if is_quantized:
-      data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
-      data_url += version_string + '_' + size_string + '_quantized_frozen.tgz'
-      bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
-      resized_input_tensor_name = 'Placeholder:0'
-      model_dir_name = ('mobilenet_v1_' + version_string + '_' + size_string +
-                        '_quantized_frozen')
-      model_base_name = 'quantized_frozen_graph.pb'
-
-    else:
-      data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
-      data_url += version_string + '_' + size_string + '_frozen.tgz'
-      bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
-      resized_input_tensor_name = 'input:0'
-      model_dir_name = 'mobilenet_v1_' + version_string + '_' + size_string
-      model_base_name = 'frozen_graph.pb'
-
-    bottleneck_tensor_size = 1001
-    input_width = int(size_string)
-    input_height = int(size_string)
-    input_depth = 3
-    model_file_name = os.path.join(model_dir_name, model_base_name)
-    input_mean = 127.5
-    input_std = 127.5
-  else:
-    tf.logging.error("Couldn't understand architecture name '%s'", architecture)
-    raise ValueError('Unknown architecture', architecture)
-
-  return {
-      'data_url': data_url,
-      'bottleneck_tensor_name': bottleneck_tensor_name,
-      'bottleneck_tensor_size': bottleneck_tensor_size,
-      'input_width': input_width,
-      'input_height': input_height,
-      'input_depth': input_depth,
-      'resized_input_tensor_name': resized_input_tensor_name,
-      'model_file_name': model_file_name,
-      'input_mean': input_mean,
-      'input_std': input_std,
-      'quantize_layer': is_quantized,
-  }
-
-
-def add_jpeg_decoding(input_width, input_height, input_depth, input_mean,
-                      input_std):
-  """Adds operations that perform JPEG decoding and resizing to the graph..
-
-  Args:
-    input_width: Desired width of the image fed into the recognizer graph.
-    input_height: Desired width of the image fed into the recognizer graph.
-    input_depth: Desired channels of the image fed into the recognizer graph.
-    input_mean: Pixel value that should be zero in the image for the graph.
-    input_std: How much to divide the pixel values by before recognition.
-
-  Returns:
-    Tensors for the node to feed JPEG data into, and the output of the
-      preprocessing steps.
-  """
-  jpeg_data = tf.placeholder(tf.string, name='DecodeJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
-  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
-  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
-  resize_shape = tf.stack([input_height, input_width])
-  resize_shape_as_int = tf.cast(resize_shape, dtype=tf.int32)
-  resized_image = tf.image.resize_bilinear(decoded_image_4d,
-                                           resize_shape_as_int)
-  offset_image = tf.subtract(resized_image, input_mean)
-  mul_image = tf.multiply(offset_image, 1.0 / input_std)
-  return jpeg_data, mul_image
-
-
-def export_model(sess, architecture, saved_model_dir):
-  """Exports model for serving.
-
-  Args:
-    sess: Current active TensorFlow Session.
-    architecture: Model architecture.
-    saved_model_dir: Directory in which to save exported model and variables.
-  """
-  if architecture == 'inception_v3':
-    input_tensor = 'DecodeJpeg/contents:0'
-  elif architecture.startswith('mobilenet_'):
-    input_tensor = 'input:0'
-  else:
-    raise ValueError('Unknown architecture', architecture)
-  in_image = sess.graph.get_tensor_by_name(input_tensor)
-  inputs = {'image': tf.saved_model.utils.build_tensor_info(in_image)}
-
-  out_classes = sess.graph.get_tensor_by_name('final_result:0')
-  outputs = {'prediction': 
-             tf.saved_model.utils.build_tensor_info(out_classes)}
-
-  signature = tf.saved_model.signature_def_utils.build_signature_def(
-      inputs=inputs,
-      outputs=outputs,
-      method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
-
-  legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
-
-  # Save out the SavedModel.
-  builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir)
-  builder.add_meta_graph_and_variables(
-      sess, [tf.saved_model.tag_constants.SERVING],
-      signature_def_map={
-          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 
-          signature
-      },
-      legacy_init_op=legacy_init_op)
-  builder.save()
-
-
-def main(_):
-  # Needed to make sure the logging output is visible.
-  # See https://github.com/tensorflow/tensorflow/issues/3047
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  # Prepare necessary directories that can be used during training
-  prepare_file_system()
-
-  # Gather information about the model architecture we'll be using.
-  model_info = create_model_info(FLAGS.architecture)
-  if not model_info:
-    tf.logging.error('Did not recognize architecture flag')
-    return -1
-
-  # Set up the pre-trained graph.
-  maybe_download_and_extract(model_info['data_url'])
-  graph, bottleneck_tensor, resized_image_tensor = (
-      create_model_graph(model_info))
-
-  # Look at the folder structure, and create lists of all the images.
-  image_lists = create_image_lists(FLAGS.image_dir, FLAGS.testing_percentage,
-                                   FLAGS.validation_percentage)
-  class_count = len(image_lists.keys())
-  if class_count == 0:
-    tf.logging.error('No valid folders of images found at ' + FLAGS.image_dir)
-    return -1
-  if class_count == 1:
-    tf.logging.error('Only one valid folder of images found at ' +
-                     FLAGS.image_dir +
-                     ' - multiple classes are needed for classification.')
-    return -1
-
-  # See if the command-line flags mean we're applying any distortions.
-  do_distort_images = should_distort_images(
-      FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-      FLAGS.random_brightness)
-
-  with tf.Session(graph=graph) as sess:
-    # Set up the image decoding sub-graph.
-    jpeg_data_tensor, decoded_image_tensor = add_jpeg_decoding(
-        model_info['input_width'], model_info['input_height'],
-        model_info['input_depth'], model_info['input_mean'],
-        model_info['input_std'])
-
-    if do_distort_images:
-      # We will be applying distortions, so setup the operations we'll need.
-      (distorted_jpeg_data_tensor,
-       distorted_image_tensor) = add_input_distortions(
-           FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-           FLAGS.random_brightness, model_info['input_width'],
-           model_info['input_height'], model_info['input_depth'],
-           model_info['input_mean'], model_info['input_std'])
-    else:
-      # We'll make sure we've calculated the 'bottleneck' image summaries and
-      # cached them on disk.
-      cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
-                        FLAGS.bottleneck_dir, jpeg_data_tensor,
-                        decoded_image_tensor, resized_image_tensor,
-                        bottleneck_tensor, FLAGS.architecture)
-
-    # Add the new layer that we'll be training.
-    (train_step, cross_entropy, bottleneck_input, ground_truth_input,
-     final_tensor) = add_final_training_ops(
-         len(image_lists.keys()), FLAGS.final_tensor_name, bottleneck_tensor,
-         model_info['bottleneck_tensor_size'], model_info['quantize_layer'])
-
-    # Create the operations we need to evaluate the accuracy of our new layer.
-    evaluation_step, prediction = add_evaluation_step(
-        final_tensor, ground_truth_input)
-
-    # Merge all the summaries and write them out to the summaries_dir
-    merged = tf.summary.merge_all()
-    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
-                                         sess.graph)
-
-    validation_writer = tf.summary.FileWriter(
-        FLAGS.summaries_dir + '/validation')
-
-    # Set up all our weights to their initial default values.
-    init = tf.global_variables_initializer()
-    sess.run(init)
-
-    # Run the training for as many cycles as requested on the command line.
-    for i in range(FLAGS.how_many_training_steps):
-      # Get a batch of input bottleneck values, either calculated fresh every
-      # time with distortions applied, or from the cache stored on disk.
-      if do_distort_images:
-        (train_bottlenecks,
-         train_ground_truth) = get_random_distorted_bottlenecks(
-             sess, image_lists, FLAGS.train_batch_size, 'training',
-             FLAGS.image_dir, distorted_jpeg_data_tensor,
-             distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
-      else:
-        (train_bottlenecks,
-         train_ground_truth, _) = get_random_cached_bottlenecks(
-             sess, image_lists, FLAGS.train_batch_size, 'training',
-             FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-             decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-             FLAGS.architecture)
-      # Feed the bottlenecks and ground truth into the graph, and run a training
-      # step. Capture training summaries for TensorBoard with the `merged` op.
-      train_summary, _ = sess.run(
-          [merged, train_step],
-          feed_dict={bottleneck_input: train_bottlenecks,
-                     ground_truth_input: train_ground_truth})
-      train_writer.add_summary(train_summary, i)
-
-      # Every so often, print out how well the graph is training.
-      is_last_step = (i + 1 == FLAGS.how_many_training_steps)
-      if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
-        train_accuracy, cross_entropy_value = sess.run(
-            [evaluation_step, cross_entropy],
-            feed_dict={bottleneck_input: train_bottlenecks,
-                       ground_truth_input: train_ground_truth})
-        tf.logging.info('%s: Step %d: Train accuracy = %.1f%%' %
-                        (datetime.now(), i, train_accuracy * 100))
-        tf.logging.info('%s: Step %d: Cross entropy = %f' %
-                        (datetime.now(), i, cross_entropy_value))
-        validation_bottlenecks, validation_ground_truth, _ = (
-            get_random_cached_bottlenecks(
-                sess, image_lists, FLAGS.validation_batch_size, 'validation',
-                FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-                decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-                FLAGS.architecture))
-        # Run a validation step and capture training summaries for TensorBoard
-        # with the `merged` op.
-        validation_summary, validation_accuracy = sess.run(
-            [merged, evaluation_step],
-            feed_dict={bottleneck_input: validation_bottlenecks,
-                       ground_truth_input: validation_ground_truth})
-        validation_writer.add_summary(validation_summary, i)
-        tf.logging.info('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
-                        (datetime.now(), i, validation_accuracy * 100,
-                         len(validation_bottlenecks)))
-
-      # Store intermediate results
-      intermediate_frequency = FLAGS.intermediate_store_frequency
-
-      if (intermediate_frequency > 0 and (i % intermediate_frequency == 0)
-          and i > 0):
-        intermediate_file_name = (FLAGS.intermediate_output_graphs_dir +
-                                  'intermediate_' + str(i) + '.pb')
-        tf.logging.info('Save intermediate result to : ' +
-                        intermediate_file_name)
-        save_graph_to_file(sess, graph, intermediate_file_name)
-
-    # We've completed all our training, so run a final test evaluation on
-    # some new images we haven't used before.
-    test_bottlenecks, test_ground_truth, test_filenames = (
-        get_random_cached_bottlenecks(
-            sess, image_lists, FLAGS.test_batch_size, 'testing',
-            FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-            decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-            FLAGS.architecture))
-    test_accuracy, predictions = sess.run(
-        [evaluation_step, prediction],
-        feed_dict={bottleneck_input: test_bottlenecks,
-                   ground_truth_input: test_ground_truth})
-    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
-                    (test_accuracy * 100, len(test_bottlenecks)))
-
-    if FLAGS.print_misclassified_test_images:
-      tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
-      for i, test_filename in enumerate(test_filenames):
-        if predictions[i] != test_ground_truth[i]:
-          tf.logging.info('%70s  %s' %
-                          (test_filename,
-                           list(image_lists.keys())[predictions[i]]))
-
-    # Write out the trained graph and labels with the weights stored as
-    # constants.
-    save_graph_to_file(sess, graph, FLAGS.output_graph)
-    with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
-      f.write('\n'.join(image_lists.keys()) + '\n')
-
-    export_model(sess, FLAGS.architecture, FLAGS.saved_model_dir)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--image_dir',
-      type=str,
-      default='',
-      help='Path to folders of labeled images.'
-  )
-  parser.add_argument(
-      '--output_graph',
-      type=str,
-      default='/tmp/output_graph.pb',
-      help='Where to save the trained graph.'
-  )
-  parser.add_argument(
-      '--intermediate_output_graphs_dir',
-      type=str,
-      default='/tmp/intermediate_graph/',
-      help='Where to save the intermediate graphs.'
-  )
-  parser.add_argument(
-      '--intermediate_store_frequency',
-      type=int,
-      default=0,
-      help="""\
-         How many steps to store intermediate graph. If "0" then will not
-         store.\
-      """
-  )
-  parser.add_argument(
-      '--output_labels',
-      type=str,
-      default='/tmp/output_labels.txt',
-      help='Where to save the trained graph\'s labels.'
-  )
-  parser.add_argument(
-      '--summaries_dir',
-      type=str,
-      default='/tmp/retrain_logs',
-      help='Where to save summary logs for TensorBoard.'
-  )
-  parser.add_argument(
-      '--how_many_training_steps',
-      type=int,
-      default=4000,
-      help='How many training steps to run before ending.'
-  )
-  parser.add_argument(
-      '--learning_rate',
-      type=float,
-      default=0.01,
-      help='How large a learning rate to use when training.'
-  )
-  parser.add_argument(
-      '--testing_percentage',
-      type=int,
-      default=10,
-      help='What percentage of images to use as a test set.'
-  )
-  parser.add_argument(
-      '--validation_percentage',
-      type=int,
-      default=10,
-      help='What percentage of images to use as a validation set.'
-  )
-  parser.add_argument(
-      '--eval_step_interval',
-      type=int,
-      default=10,
-      help='How often to evaluate the training results.'
-  )
-  parser.add_argument(
-      '--train_batch_size',
-      type=int,
-      default=100,
-      help='How many images to train on at a time.'
-  )
-  parser.add_argument(
-      '--test_batch_size',
-      type=int,
-      default=-1,
-      help="""\
-      How many images to test on. This test set is only used once, to evaluate
-      the final accuracy of the model after training completes.
-      A value of -1 causes the entire test set to be used, which leads to more
-      stable results across runs.\
-      """
-  )
-  parser.add_argument(
-      '--validation_batch_size',
-      type=int,
-      default=100,
-      help="""\
-      How many images to use in an evaluation batch. This validation set is
-      used much more often than the test set, and is an early indicator of how
-      accurate the model is during training.
-      A value of -1 causes the entire validation set to be used, which leads to
-      more stable results across training iterations, but may be slower on large
-      training sets.\
-      """
-  )
-  parser.add_argument(
-      '--print_misclassified_test_images',
-      default=False,
-      help="""\
-      Whether to print out a list of all misclassified test images.\
-      """,
-      action='store_true'
-  )
-  parser.add_argument(
-      '--model_dir',
-      type=str,
-      default='/tmp/imagenet',
-      help="""\
-      Path to classify_image_graph_def.pb,
-      imagenet_synset_to_human_label_map.txt, and
-      imagenet_2012_challenge_label_map_proto.pbtxt.\
-      """
-  )
-  parser.add_argument(
-      '--bottleneck_dir',
-      type=str,
-      default='/tmp/bottleneck',
-      help='Path to cache bottleneck layer values as files.'
-  )
-  parser.add_argument(
-      '--final_tensor_name',
-      type=str,
-      default='final_result',
-      help="""\
-      The name of the output classification layer in the retrained graph.\
-      """
-  )
-  parser.add_argument(
-      '--flip_left_right',
-      default=False,
-      help="""\
-      Whether to randomly flip half of the training images horizontally.\
-      """,
-      action='store_true'
-  )
-  parser.add_argument(
-      '--random_crop',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much of a margin to randomly crop off the
-      training images.\
-      """
-  )
-  parser.add_argument(
-      '--random_scale',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much to randomly scale up the size of the
-      training images by.\
-      """
-  )
-  parser.add_argument(
-      '--random_brightness',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much to randomly multiply the training image
-      input pixels up or down by.\
-      """
-  )
-  parser.add_argument(
-      '--architecture',
-      type=str,
-      default='inception_v3',
-      help="""\
-      Which model architecture to use. 'inception_v3' is the most accurate, but
-      also the slowest. For faster or smaller models, chose a MobileNet with the
-      form 'mobilenet_<parameter size>_<input_size>[_quantized]'. For example,
-      'mobilenet_1.0_224' will pick a model that is 17 MB in size and takes 224
-      pixel input images, while 'mobilenet_0.25_128_quantized' will choose a much
-      less accurate, but smaller and faster network that's 920 KB on disk and
-      takes 128x128 images. See https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
-      for more information on Mobilenet.\
-      """)
-  parser.add_argument(
-      '--saved_model_dir',
-      type=str,
-      default='/tmp/saved_models/1/',
-      help='Where to save the exported graph.'
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
deleted file mode 100644
index 8b8dd45fd72e3d29bdb7f6291cc53b912adf3644..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=g-bad-import-order,unused-import
-"""Tests the graph freezing tool."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import os
-
-from tensorflow.examples.image_retraining import retrain
-from tensorflow.python.framework import test_util
-
-
-class ImageRetrainingTest(test_util.TensorFlowTestCase):
-
-  def dummyImageLists(self):
-    return {'label_one': {'dir': 'somedir', 'training': ['image_one.jpg',
-                                                         'image_two.jpg'],
-                          'testing': ['image_three.jpg', 'image_four.jpg'],
-                          'validation': ['image_five.jpg', 'image_six.jpg']},
-            'label_two': {'dir': 'otherdir', 'training': ['image_one.jpg',
-                                                          'image_two.jpg'],
-                          'testing': ['image_three.jpg', 'image_four.jpg'],
-                          'validation': ['image_five.jpg', 'image_six.jpg']}}
-
-  def testGetImagePath(self):
-    image_lists = self.dummyImageLists()
-    self.assertEqual('image_dir/somedir/image_one.jpg', retrain.get_image_path(
-        image_lists, 'label_one', 0, 'image_dir', 'training'))
-    self.assertEqual('image_dir/otherdir/image_four.jpg',
-                     retrain.get_image_path(image_lists, 'label_two', 1,
-                                            'image_dir', 'testing'))
-
-  def testGetBottleneckPath(self):
-    image_lists = self.dummyImageLists()
-    self.assertEqual('bottleneck_dir/somedir/image_five.jpg_imagenet_v3.txt',
-                     retrain.get_bottleneck_path(
-                         image_lists, 'label_one', 0, 'bottleneck_dir',
-                         'validation', 'imagenet_v3'))
-
-  def testShouldDistortImage(self):
-    self.assertEqual(False, retrain.should_distort_images(False, 0, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(True, 0, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 10, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 0, 1, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 0, 0, 50))
-
-  def testAddInputDistortions(self):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        retrain.add_input_distortions(True, 10, 10, 10, 299, 299, 3, 128, 128)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortJPGInput:0'))
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortResult:0'))
-
-  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
-  def testAddFinalTrainingOps(self, flags_mock):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization
-        retrain.add_final_training_ops(5, 'final', bottleneck, 1024, False)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-
-  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
-  def testAddFinalTrainingOpsQuantized(self, flags_mock):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization
-        retrain.add_final_training_ops(5, 'final', bottleneck, 1024, True)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-
-  def testAddEvaluationStep(self):
-    with tf.Graph().as_default():
-      final = tf.placeholder(tf.float32, [1], name='final')
-      gt = tf.placeholder(tf.int64, [1], name='gt')
-      self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
-
-  def testAddJpegDecoding(self):
-    with tf.Graph().as_default():
-      jpeg_data, mul_image = retrain.add_jpeg_decoding(10, 10, 3, 0, 255)
-      self.assertIsNotNone(jpeg_data)
-      self.assertIsNotNone(mul_image)
-
-  def testCreateModelInfo(self):
-    did_raise_value_error = False
-    try:
-      retrain.create_model_info('no_such_model_name')
-    except ValueError:
-      did_raise_value_error = True
-    self.assertTrue(did_raise_value_error)
-    model_info = retrain.create_model_info('inception_v3')
-    self.assertIsNotNone(model_info)
-    self.assertEqual(299, model_info['input_width'])
-
-  def testCreateModelInfoQuantized(self):
-    # Test for mobilenet_quantized
-    model_info = retrain.create_model_info('mobilenet_1.0_224')
-    self.assertIsNotNone(model_info)
-    self.assertEqual(224, model_info['input_width'])
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 5bdaeb43ce143e36e78cfe301fd9b59e8b85b034..5d7bd36837b2a2c33ab4bc311a582c174666dcd5 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -119,11 +119,13 @@ rundown:
    `tensorflow/contrib/makefile/gen/lib` to the Library Search Paths setting.
 
  - You'll also need to add `libprotobuf.a` and `libprotobuf-lite.a` from
-   `tensorflow/contrib/makefile/gen/protobuf_ios/lib` to your _Build Stages_ and
-   _Library Search Paths_.
+   `tensorflow/contrib/makefile/gen/protobuf_ios/lib`
+   and `nsync.a` from `tensorflow/contrib/makefile/downloads/nsync/builds/lipo.ios.c++11` 
+   to your _Build Stages_ and _Library Search Paths_.
 
  - The _Header Search_ paths needs to contain:
    - the root folder of tensorflow,
+   - `tensorflow/contrib/makefile/downloads/nsync/public`
    - `tensorflow/contrib/makefile/downloads/protobuf/src`
    - `tensorflow/contrib/makefile/downloads`,
    - `tensorflow/contrib/makefile/downloads/eigen`, and
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index 2abbe9dacca79b8d6e516550e28a9b203b18f123..c50fd93d03953b12113c17d420c4c5306a02ebe9 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -9,6 +9,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+exports_files(["data/grace_hopper.jpg"])
+
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 tf_cc_binary(
@@ -60,17 +62,3 @@ py_binary(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 63bc39de6c0a420e03adada56cbc8b0f895b6155..baa65d3243ffbebdf3ccf8a786a2434dfb7cfdad 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -137,15 +138,15 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     // gif decoder returns 4-D tensor, remove the first dim
     image_reader =
         Squeeze(root.WithOpName("squeeze_first_dim"),
                 DecodeGif(root.WithOpName("gif_reader"), file_reader));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".bmp")) {
     image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index aba7f600b53cf8286d46ee70823a0a425944076f..bdbcb0b1638a400f12f66bb3c4ee9d852fe145d2 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -152,15 +152,3 @@ sh_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 98819b20bfea5021d52e2c50b004bccdaf1f25e7..3ead8614b68959b95ccad43623d4df4a5c4665bd 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -61,8 +61,10 @@ def conv_model(features, labels, mode):
 
   # Densely connected layer with 1024 neurons.
   h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    h_fc1 = tf.layers.dropout(h_fc1, rate=0.5)
+  h_fc1 = tf.layers.dropout(
+      h_fc1, 
+      rate=0.5, 
+      training=(mode == tf.estimator.ModeKeys.TRAIN))
 
   # Compute logits (1 per class) and compute loss.
   logits = tf.layers.dense(h_fc1, N_DIGITS, activation=None)
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
index 9542e552504580a6614f8bd2f43c38dfa795750f..c00de932a8707ad5717aaf1251cf5c88464a28b0 100755
--- a/tensorflow/examples/learn/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -53,6 +53,8 @@ def res_net_model(features, labels, mode):
     ndim = int(sqrt(input_shape[1]))
     x = tf.reshape(x, [-1, ndim, ndim, 1])
 
+  training = (mode == tf.estimator.ModeKeys.TRAIN)
+  
   # First convolution expands to 64 channels
   with tf.variable_scope('conv_layer1'):
     net = tf.layers.conv2d(
@@ -60,7 +62,7 @@ def res_net_model(features, labels, mode):
         filters=64,
         kernel_size=7,
         activation=tf.nn.relu)
-    net = tf.layers.batch_normalization(net)
+    net = tf.layers.batch_normalization(net, training=training)
 
   # Max pool
   net = tf.layers.max_pooling2d(
@@ -88,7 +90,7 @@ def res_net_model(features, labels, mode):
             kernel_size=1,
             padding='valid',
             activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv)
+        conv = tf.layers.batch_normalization(conv, training=training)
 
       with tf.variable_scope(name + '/conv_bottleneck'):
         conv = tf.layers.conv2d(
@@ -97,7 +99,7 @@ def res_net_model(features, labels, mode):
             kernel_size=3,
             padding='same',
             activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv)
+        conv = tf.layers.batch_normalization(conv, training=training)
 
       # 1x1 convolution responsible for restoring dimension
       with tf.variable_scope(name + '/conv_out'):
@@ -108,7 +110,7 @@ def res_net_model(features, labels, mode):
             kernel_size=1,
             padding='valid',
             activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv)
+        conv = tf.layers.batch_normalization(conv, training=training)
 
       # shortcut connections that turn the network into its counterpart
       # residual function (identity shortcut)
@@ -154,7 +156,7 @@ def res_net_model(features, labels, mode):
   loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
-  if mode == tf.estimator.ModeKeys.TRAIN:
+  if training:
     optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
diff --git a/tensorflow/examples/multibox_detector/BUILD b/tensorflow/examples/multibox_detector/BUILD
index 91a5bfa51cda71ed2bca37869c7305d752e1e035..4f9908cd52d98acc20b9238d9a0fdff39284ea32 100644
--- a/tensorflow/examples/multibox_detector/BUILD
+++ b/tensorflow/examples/multibox_detector/BUILD
@@ -27,17 +27,3 @@ tf_cc_binary(
         "//tensorflow/core:tensorflow",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index e38704fd98cea6928231f2fc2bc989705ae46bb4..96ea525a4e74c68da17d0310f0ad475789314215 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -84,10 +85,10 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
@@ -131,7 +132,7 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
 
 Status SaveImage(const Tensor& tensor, const string& file_path) {
   LOG(INFO) << "Saving image to " << file_path;
-  CHECK(tensorflow::StringPiece(file_path).ends_with(".png"))
+  CHECK(tensorflow::str_util::EndsWith(file_path, ".png"))
       << "Only saving of png files is supported.";
 
   auto root = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/examples/saved_model/BUILD b/tensorflow/examples/saved_model/BUILD
index 1cdf5ec6e1d80c8337d7929159860e093ad07364..ebefc6576d646467426a784d03f4be206aeaba38 100644
--- a/tensorflow/examples/saved_model/BUILD
+++ b/tensorflow/examples/saved_model/BUILD
@@ -8,19 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-)
-
 py_binary(
     name = "saved_model_half_plus_two",
     srcs = [
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index 12479211c32a965642d23226406617df6ff5a29c..13bca34a86b0c2fba7e5e8e3527d13587feacaae 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -245,15 +245,3 @@ tf_cc_binary(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index e7db9cddf02daf9a32d3ed859ee9bd35b2cae838..63dd18457fea42acb09058b9ddd4623d72d1fd04 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -457,7 +457,7 @@ class AudioProcessor(object):
           self.time_shift_offset_placeholder_: time_shift_offset,
       }
       # Choose a section of background noise to mix in.
-      if use_background:
+      if use_background or sample['label'] == SILENCE_LABEL:
         background_index = np.random.randint(len(self.background_data))
         background_samples = self.background_data[background_index]
         background_offset = np.random.randint(
@@ -465,7 +465,9 @@ class AudioProcessor(object):
         background_clipped = background_samples[background_offset:(
             background_offset + desired_samples)]
         background_reshaped = background_clipped.reshape([desired_samples, 1])
-        if np.random.uniform(0, 1) < background_frequency:
+        if sample['label'] == SILENCE_LABEL:
+          background_volume = np.random.uniform(0, 1)
+        elif np.random.uniform(0, 1) < background_frequency:
           background_volume = np.random.uniform(0, background_volume_range)
         else:
           background_volume = 0
diff --git a/tensorflow/examples/speech_commands/label_wav_dir.py b/tensorflow/examples/speech_commands/label_wav_dir.py
index 2f305359e380e7192795851112c8261ea896c290..a34db512dda86be138e07a4ffaa1963fe00a5cea 100644
--- a/tensorflow/examples/speech_commands/label_wav_dir.py
+++ b/tensorflow/examples/speech_commands/label_wav_dir.py
@@ -32,8 +32,8 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import sys
 import glob
+import sys
 
 import tensorflow as tf
 
@@ -65,7 +65,7 @@ def run_graph(wav_dir, labels, input_layer_name, output_layer_name,
     #   predictions  will contain a two-dimensional array, where one
     #   dimension represents the input image count, and the other has
     #   predictions per class
-    for wav_path in glob.glob(wav_dir + "/*.wav"):
+    for wav_path in glob.glob(wav_dir + '/*.wav'):
       if not wav_path or not tf.gfile.Exists(wav_path):
         tf.logging.fatal('Audio file does not exist %s', wav_path)
 
diff --git a/tensorflow/examples/tutorials/estimators/BUILD b/tensorflow/examples/tutorials/estimators/BUILD
index ecbc1a431d9a2173e80434b6f9350c225fc9bfb4..bab609f208b6ca3dd6daa8ecfd0c0c762ef87a22 100644
--- a/tensorflow/examples/tutorials/estimators/BUILD
+++ b/tensorflow/examples/tutorials/estimators/BUILD
@@ -20,15 +20,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/image_retraining/__init__.py b/tensorflow/examples/tutorials/estimators/__init__.py
similarity index 100%
rename from tensorflow/examples/image_retraining/__init__.py
rename to tensorflow/examples/tutorials/estimators/__init__.py
diff --git a/tensorflow/examples/tutorials/input_fn/__init__.py b/tensorflow/examples/tutorials/input_fn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD
index f8a29c79c63cb77d15ff03d0cf5c98ae36ccc3f8..aad78b18409bab1fe6924849ec5b61c6f3a052f7 100644
--- a/tensorflow/examples/tutorials/layers/BUILD
+++ b/tensorflow/examples/tutorials/layers/BUILD
@@ -19,15 +19,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/layers/__init__.py b/tensorflow/examples/tutorials/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index 6d4e67063d8470788a74e0083b62a2db12dd7c64..d7bc6a5a7d1e4cd3927c7c5067ccc22993885994 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -51,6 +51,7 @@ py_binary(
         "fully_connected_feed.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["optonly"],
     deps = [
         ":input_data",
         ":mnist",
@@ -132,15 +133,3 @@ py_test(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/monitors/BUILD b/tensorflow/examples/tutorials/monitors/BUILD
index 4220e8144de1259dc5bd873ddb5810bf95dcafae..1c49e3fe5390ad48a3dea7cd5688996270b1dc9d 100644
--- a/tensorflow/examples/tutorials/monitors/BUILD
+++ b/tensorflow/examples/tutorials/monitors/BUILD
@@ -23,15 +23,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/monitors/__init__.py b/tensorflow/examples/tutorials/monitors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
index 850d105f7b1b33fadd40bc6a6cab3d08c0da3734..a2b7fe60237da0604f74f31c0a09951f708e908b 100644
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py
@@ -32,9 +32,9 @@ IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
 def main(unused_argv):
   # Load datasets.
   training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
   test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
 
   validation_metrics = {
       "accuracy":
@@ -83,7 +83,7 @@ def main(unused_argv):
 
   # Classify two new flower samples.
   new_samples = np.array(
-      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
+      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
   y = list(classifier.predict(new_samples))
   print("Predictions: {}".format(str(y)))
 
diff --git a/tensorflow/examples/tutorials/word2vec/BUILD b/tensorflow/examples/tutorials/word2vec/BUILD
index 42d6355b4f06258a3c22d0ef324bb31880f2d9a3..2e19c038bdf04235ccd2f4fdbfeff250ca72a07e 100644
--- a/tensorflow/examples/tutorials/word2vec/BUILD
+++ b/tensorflow/examples/tutorials/word2vec/BUILD
@@ -13,19 +13,11 @@ py_binary(
         "word2vec_basic.py",
     ],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
     deps = [
         "//tensorflow:tensorflow_py",
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 14ae7fbf35836ad7f5d56101ae0fc33a3f3fb9ba..b09ee9976897fcab2e90fdc17e8030532080aca8 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -224,7 +224,7 @@ with graph.as_default():
     optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
   # Compute the cosine similarity between minibatch examples and all embeddings.
-  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
+  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
   normalized_embeddings = embeddings / norm
   valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                             valid_dataset)
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
index c99870c686c18c1b201ec44c8335b3d9ba24f5a1..cc8835728d59b6a57d46167686987aa34ab9d0a0 100644
--- a/tensorflow/examples/wav_to_spectrogram/BUILD
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -49,17 +49,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index b1bd87eb0c3b3a498a1db45f11d9a48552e08079..e251356ec8e97311affaf752c0a515be97013fa8 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -5,7 +5,7 @@ Construct and execute TensorFlow graphs in Go.
 [![GoDoc](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go?status.svg)](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
 
 > *WARNING*: The API defined in this package is not stable and can change
-> without notice. The same goes for the awkward package path
+> without notice. The same goes for the package path:
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
 
 ## Quickstart
diff --git a/tensorflow/go/genop/internal/api_def_map.go b/tensorflow/go/genop/internal/api_def_map.go
index 07b689dbba23a3aa991983f3b373fa8445c673e1..8600452b476dee49292cbffe630026cf6077e22b 100644
--- a/tensorflow/go/genop/internal/api_def_map.go
+++ b/tensorflow/go/genop/internal/api_def_map.go
@@ -31,7 +31,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
 )
 
 // Encapsulates a collection of API definitions.
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index 82f7510f2ed947e0a87e4d88cfce1ecaaa6362f8..fb8163121850cee36e1fcc652ca258b1fe2d42ff 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -47,7 +47,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
 )
 
 // GenerateFunctionsForRegisteredOps writes a Go source code file to w
@@ -359,13 +359,13 @@ type attrWrapper struct {
 	api *pb.ApiDef_Attr
 }
 
-func (a *attrWrapper) Name() string             { return a.api.Name }
-func (a *attrWrapper) RenameTo() string         { return a.api.RenameTo }
-func (a *attrWrapper) Description() string      { return a.api.Description }
-func (a *attrWrapper) Type() string             { return a.op.Type }
-func (a *attrWrapper) IsListAttr() bool         { return isListAttr(a.op) }
-func (a *attrWrapper) HasMinimum() bool         { return a.op.HasMinimum }
-func (a *attrWrapper) Minimum() int64           { return a.op.Minimum }
+func (a *attrWrapper) Name() string              { return a.api.Name }
+func (a *attrWrapper) RenameTo() string          { return a.api.RenameTo }
+func (a *attrWrapper) Description() string       { return a.api.Description }
+func (a *attrWrapper) Type() string              { return a.op.Type }
+func (a *attrWrapper) IsListAttr() bool          { return isListAttr(a.op) }
+func (a *attrWrapper) HasMinimum() bool          { return a.op.HasMinimum }
+func (a *attrWrapper) Minimum() int64            { return a.op.Minimum }
 func (a *attrWrapper) DefaultValue() interface{} { return a.api.DefaultValue }
 
 type argWrapper struct {
diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go
index b3a23dff102a690b1f7f08b675219929355f139f..d20d22e0c1502f92ade7ef5aa40985dce73b7552 100644
--- a/tensorflow/go/genop/internal/genop_test.go
+++ b/tensorflow/go/genop/internal/genop_test.go
@@ -22,7 +22,7 @@ import (
 	"testing"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
 )
 
 // Creates an ApiDef based on opdef and applies overrides
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 13f38dfb32a476477d306093bad6b56e1744a640..9e87995441620636c71e87f069002ff0ef7838fd 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -38,67 +38,46 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 	return list, start + size, nil
 }
 
-// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
-type WriteImageSummaryAttr func(optionalAttr)
+// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
+type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
 
-// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
+// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
 //
-// REQUIRES: value >= 1
-func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
+// value: The bitwidth of the quantization; between 2 and 16, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
 	return func(m optionalAttr) {
-		m["max_images"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Writes a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
+// FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange sets the optional narrow_range attribute to value.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
+// shape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
+//   same as `gradients`.
+// min, max: Quantization interval, floats of shape `[d]`.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
 //
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//	bad_color: Color to use for pixels with non-finite values.
 //
-// Returns the created operation.
-func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
+// Returns Backpropagated gradients w.r.t. inputs, shape same as
+// `inputs`:
+//   `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`:
+// `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`:
+// `sum_per_d(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -107,159 +86,166 @@ func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteImageSummary",
+		Type: "FakeQuantWithMinMaxVarsPerChannelGradient",
 		Input: []tf.Input{
-			writer, step, tag, tensor, bad_color,
+			gradients, inputs, min, max,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Outputs a `tf.Event` protocol buffer.
-//
-// When CreateSummaryDbWriter is being used, this op can be useful for
-// importing data from event logs.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	event: A string containing a binary-encoded tf.Event proto.
-//
-// Returns the created operation.
-func ImportEvent(scope *Scope, writer tf.Output, event tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
+type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ImportEvent",
-		Input: []tf.Input{
-			writer, event,
-		},
+}
+
+// FakeQuantWithMinMaxVarsPerChannelNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsPerChannelNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
+// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
 //
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tensor: A tensor to serialize.
-//	tag: The summary's tag.
-//	summary_metadata: Serialized SummaryMetadata protocol buffer containing
-// plugin-related metadata for this summary.
+// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
+// to 'outputs' tensor of same shape as `inputs`.
 //
-// Returns the created operation.
-func WriteSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) {
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
+//
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "WriteSummary",
+		Type: "FakeQuantWithMinMaxVarsPerChannel",
 		Input: []tf.Input{
-			writer, step, tensor, tag, summary_metadata,
+			inputs, min, max,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates summary database writer accessible by given resource handle.
-//
-// This can be used to write tensors from the execution graph directly
-// to a database. Only SQLite is supported right now. This function
-// will create the schema if it doesn't exist. Entries in the Users,
-// Experiments, and Runs tables will be created automatically if they
-// don't already exist.
-//
-// Arguments:
-//	writer: Handle to SummaryWriter resource to overwrite.
-//	db_uri: For example "file:/tmp/foo.sqlite".
-//	experiment_name: Can't contain ASCII control characters or <>. Case
-// sensitive. If empty, then the Run will not be associated with any
-// Experiment.
-//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
-// If empty, then each Tag will not be associated with any Run.
-//	user_name: Must be valid as both a DNS label and Linux username. If
-// empty, then the Experiment will not be associated with any User.
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
 //
-// Returns the created operation.
-func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "CreateSummaryDbWriter",
-		Input: []tf.Input{
-			writer, db_uri, experiment_name, run_name, user_name,
-		},
+}
+
+// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+//
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Creates a summary file writer accessible by the given resource handle.
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
 //
 // Arguments:
-//	writer: A handle to the summary writer resource
-//	logdir: Directory where the event file will be written.
-//	max_queue: Size of the queue of pending events and summaries.
-//	flush_millis: How often, in milliseconds, to flush the pending events and
-// summaries to disk.
-//	filename_suffix: Every event file's name is suffixed with this suffix.
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
 //
-// Returns the created operation.
-func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) {
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CreateSummaryFileWriter",
+		Type: "FakeQuantWithMinMaxVarsGradient",
 		Input: []tf.Input{
-			writer, logdir, max_queue, flush_millis, filename_suffix,
+			gradients, inputs, min, max,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
-type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
 // If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
 		m["num_bits"] = value
 	}
 }
 
-// FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange sets the optional narrow_range attribute to value.
-//
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
 // If not specified, defaults to false
-func FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
 		m["narrow_range"] = value
 	}
 }
 
-// Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
-// shape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
-//   same as `gradients`.
-// min, max: Quantization interval, floats of shape `[d]`.
-//
-//
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
 //
-// Returns Backpropagated gradients w.r.t. inputs, shape same as
-// `inputs`:
-//   `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`:
-// `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`:
-// `sum_per_d(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -268,640 +254,771 @@ func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsPerChannelGradient",
+		Type: "FakeQuantWithMinMaxArgsGradient",
 		Input: []tf.Input{
-			gradients, inputs, min, max,
+			gradients, inputs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
-	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
-		Input: []tf.Input{
-			data, partitions,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
-}
-
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs.
+type FakeQuantWithMinMaxArgsAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["min"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["max"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["num_bits"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// Attributes `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+// Quantization is called fake since the output is still in floating point.
+func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
+		Type: "FakeQuantWithMinMaxArgs",
+		Input: []tf.Input{
+			inputs,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// Scatter `updates` into a new (initially zero) tensor according to `indices`.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// Creates a new tensor by applying sparse `updates` to individual
+// values or slices within a zero tensor of the given `shape` according to
+// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
+// extracts values or slices from a given tensor.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     shape = tf.constant([8])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [0, 11, 0, 10, 9, 0, 0, 12]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     shape = tf.constant([4, 4, 4])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "ScatterNd",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			indices, updates, shape,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
+// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
+type QuantizeAndDequantizeV2Attr func(optionalAttr)
 
-// MutableHashTableV2Container sets the optional container attribute to value.
+// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+// value: If the quantization is signed or unsigned.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["signed_input"] = value
 	}
 }
 
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+// value: The bitwidth of the quantization.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["num_bits"] = value
 	}
 }
 
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
+// value: If the range is given or should be computed from the tensor.
 // If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["range_given"] = value
 	}
 }
 
-// Creates an empty hash table.
+// Quantizes then dequantizes a tensor.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// This op simulates the precision loss from the quantized forward pass by:
+// 1. Quantizing the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantizing it back to floating point numbers for the following ops, most
+//    likely matmul.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// There are different ways to quantize. This version does not use the full range
+// of the output type, choosing to elide the lowest possible value for symmetry
+// (e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
+// quantization), so that 0.0 maps to 0.
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// To perform this op, we first find the range of values in our tensor. The range
+// we use is always centered on 0, so we find m such that
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// 1. m = max(abs(input_min), abs(input_max)) if range_given is true,
+// 2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns a random (key, value)
+// Our input tensor range is then [-m, m].
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
+// If signed_input is true, this is
+//
+//   [min_fixed, max_fixed ] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+//
+// Otherwise, if signed_input is false, the fixed-point range is
+//
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+//
+// From this we compute our scaling factor, s:
+//
+//   s = (max_fixed - min_fixed) / (2 * m).
+//
+// Now we can quantize and dequantize the elements of our tensor.  An element e
+// is transformed into e':
+//
+//   e' = (e * s).round_to_nearest() / s.
+//
+// Note that we have a different number of buckets in the signed vs. unsigned
+// cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
+// vs. 255 in the unsigned case.
+//
+// For example, suppose num_bits = 8 and m = 1.  Then
+//
+//   [min_fixed, max_fixed] = [-127, 127], and
+//   s = (127 + 127) / 2 = 127.
+//
+// Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
+// {-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
+//
+// Arguments:
+//	input: Tensor to quantize and then dequantize.
+//	input_min: If range_given, this is the min of the range, otherwise this input
+// will be ignored.
+//	input_max: If range_given, this is the max of the range, otherwise this input
+// will be ignored.
+func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "QuantizeAndDequantizeV2",
 		Input: []tf.Input{
-			indices,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// HashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
+	return op.Output(0)
 }
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// Bitcasts a tensor from one type to another without copying data.
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
+// Given a tensor `input`, this operation returns a tensor that has the same buffer
+// data as `input` with datatype `type`.
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// If the input datatype `T` is larger than the output datatype `type` then the
+// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// If `T` is smaller than `type`, the operator requires that the rightmost
+// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
+// [..., sizeof(`type`)/sizeof(`T`)] to [...].
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
+// endian orderings will give different results.
+func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"type": type_}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
+		Type: "Bitcast",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Extract `patches` from `images` and put them in the "depth" output dimension.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// We specify the size-related attributes as:
+//
+// ```python
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
+// ```
+//
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "ExtractImagePatches",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			images,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
+// SpaceToDepthAttr is an optional argument to SpaceToDepth.
+type SpaceToDepthAttr func(optionalAttr)
 
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
+// SpaceToDepthDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["data_format"] = value
 	}
 }
 
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// SpaceToDepth for tensors of type T.
 //
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
+// Rearranges blocks of spatial data, into depth. More specifically,
+// this op outputs a copy of the input tensor where values from the `height`
+// and `width` dimensions are moved to the `depth` dimension.
+// The attr `block_size` indicates the input block size.
 //
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+//   * Non-overlapping blocks of size `block_size x block size` are rearranged
+//     into depth at each location.
+//   * The depth of the output tensor is `block_size * block_size * input_depth`.
+//   * The Y, X coordinates within each block of the input become the high order
+//     component of the output channel index.
+//   * The input tensor's height and width must be divisible by block_size.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
+//                         within the output image, bX, bY means coordinates
+//                         within the input block, iC means input channels).
+//      The output would be a transpose to the following layout:
+//      n,oY,oX,bY,bX,iC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1], [2]],
+//       [[3], [4]]]]
+// ```
+//
+// This operation will output a tensor of shape `[1, 1, 1, 4]`:
+//
+// ```
+// [[[[1, 2, 3, 4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
+// the corresponding output will have a single element (i.e. width and height are
+// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
+// The output element shape is `[1, 1, 4]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// This operation, for block_size of 2, will return the following tensor of shape
+// `[1, 1, 1, 12]`
+//
+// ```
+// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [5],  [6]],
+//       [[3],   [4],  [7],  [8]],
+//       [[9],  [10], [13],  [14]],
+//       [[11], [12], [15],  [16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 2 2 4]`:
+//
+// ```
+// x = [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// Arguments:
+//
+//	block_size: The size of the spatial block.
+func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"block_size": block_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapPeek",
+		Type: "SpaceToDepth",
 		Input: []tf.Input{
-			key, indices,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// SpaceToBatch for 4-D tensors of type T.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This is a legacy version of the more general SpaceToBatchND.
+//
+// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
+// More specifically, this op outputs a copy of the input tensor where values from
+// the `height` and `width` dimensions are moved to the `batch` dimension. After
+// the zero-padding, both `height` and `width` of the input must be divisible by the
+// block size.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, depth]`.
+//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+//   the padding of the input with zeros across the spatial dimensions as follows:
+//
+//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
+//
+//   The effective spatial dimensions of the zero-padded input tensor will be:
+//
+//       height_pad = pad_top + height + pad_bottom
+//       width_pad = pad_left + width + pad_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+//   * Non-overlapping blocks of size `block_size x block size` in the height and
+//     width dimensions are rearranged into the batch dimension at each location.
+//   * The batch of the output tensor is `batch * block_size * block_size`.
+//   * Both height_pad and width_pad must be divisible by block_size.
+//
+// The shape of the output will be:
+//
+//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//      depth]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 1]` and value:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+//
+func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "SpaceToBatch",
 		Input: []tf.Input{
-			x, y,
+			input, paddings,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Forwards the input to the output.
+// SpaceToBatch for N-D tensors of type T.
 //
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+// grid of blocks of shape `block_shape`, and interleaves these blocks with the
+// "batch" dimension (0) such that in the output, the spatial dimensions
+// `[1, ..., M]` correspond to the position within the grid, and the batch
+// dimension combines both the position within a spatial block and the original
+// batch position.  Prior to division into blocks, the spatial dimensions of the
+// input are optionally zero padded according to `paddings`.  See below for a
+// precise description.
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has `M` dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
 //
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LoopCond",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x * y element-wise, working on quantized buffers.
+// This operation is equivalent to the following steps:
 //
-// Arguments:
+// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+//    input according to `paddings` to produce `padded` of shape `padded_shape`.
 //
+// 2. Reshape `padded` to `reshaped_padded` of shape:
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//        block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1],
+//       block_shape[M-1]] +
+//      remaining_shape
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// 3. Permute dimensions of `reshaped_padded` to produce
+//    `permuted_reshaped_padded` of shape:
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+//      block_shape +
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+//    dimension, producing an output tensor of shape:
+//
+//      [batch * prod(block_shape)] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 1]` and value:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+//     paddings = `[[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 3, 1]` and value:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "SpaceToBatchND",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input, block_shape, paddings,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
 
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["out_idx"] = value
 	}
 }
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// Computes the difference between two lists of numbers or strings.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// For example, given this input:
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// ```
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
+// ```
+//
+// This operation would return:
+//
+// ```
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
+// ```
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -910,339 +1027,419 @@ func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, ma
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			x, y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// A placeholder op that passes through `input` when its output is not fed.
+// Inserts a dimension of 1 into a tensor's shape.
 //
-// Arguments:
-//	input: The default value to produce when `output` is not fed.
-//	shape: The (possibly partial) shape of the tensor.
+// Given a tensor `input`, this operation inserts a dimension of 1 at the
+// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
+// zero; if you specify a negative number for `axis` it is counted backward from
+// the end.
 //
-// Returns A placeholder tensor that defaults to `input` if it is not fed.
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
+// This operation is useful if you want to add a batch dimension to a single
+// element. For example, if you have a single image of shape `[height, width,
+// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+// which will make the shape `[1, height, width, channels]`.
+//
+// Other examples:
+//
+// ```
+// # 't' is a tensor of shape [2]
+// shape(expand_dims(t, 0)) ==> [1, 2]
+// shape(expand_dims(t, 1)) ==> [2, 1]
+// shape(expand_dims(t, -1)) ==> [2, 1]
+//
+// # 't2' is a tensor of shape [2, 3, 5]
+// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+// ```
+//
+// This operation requires that:
+//
+// `-1-input.dims() <= dim <= input.dims()`
+//
+// This operation is related to `squeeze()`, which removes dimensions of
+// size 1.
+//
+// Arguments:
+//
+//	axis: 0-D (scalar). Specifies the dimension index at which to
+// expand the shape of `input`. Must be in the range
+// `[-rank(input) - 1, rank(input)]`.
+//
+// Returns Contains the same data as `input`, but its shape has an additional
+// dimension of size 1 added.
+func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "PlaceholderWithDefault",
+		Type: "ExpandDims",
 		Input: []tf.Input{
-			input,
+			input, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
-//
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// A placeholder op that passes through `input` when its output is not fed.
 //
-// For example:
+// Arguments:
+//	input: The default value to produce when `output` is not fed.
+//	shape: The (possibly partial) shape of the tensor.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A placeholder tensor that defaults to `input` if it is not fed.
+func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "PlaceholderWithDefault",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// A placeholder op for a value that will be fed into the computation.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
+//
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
+//
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor. The shape can be any partially-specified
+// shape.  To be unconstrained, pass in a shape with unknown rank.
+//
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "PlaceholderV2",
+
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// PlaceholderAttr is an optional argument to Placeholder.
+type PlaceholderAttr func(optionalAttr)
+
+// PlaceholderShape sets the optional shape attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
+// shape is unconstrained.
+// If not specified, defaults to <unknown_rank:true >
+func PlaceholderShape(value tf.Shape) PlaceholderAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["shape"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// A placeholder op for a value that will be fed into the computation.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+//	dtype: The type of elements in the tensor.
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
-		},
+		Type: "Placeholder",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
+//
+// The folded size of each dimension D of the output is:
+//
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
 //
 // For example:
 //
 // ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1]], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1,  5]
+//                       [11, 28]]
 // ```
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//	input: The input tensor to be folded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: The mode used in the `MirrorPad` op.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "MirrorPadGrad",
 		Input: []tf.Input{
-			start, limit, delta,
+			input, paddings,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
+// Pads a tensor with mirrored values.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// This operation pads a `input` with mirrored values according to the `paddings`
+// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many values to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many values to add after the contents of `input`
+// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
+// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
+// (if false, respectively).
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6]].
+// # 'paddings' is [[1, 1]], [2, 2]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
+//                       [2, 1, 1, 2, 3, 3, 2]
+//                       [5, 4, 4, 5, 6, 6, 5]
+//                       [5, 4, 4, 5, 6, 6, 5]]
+// ```
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	input: The input tensor to be padded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
+// do not include the borders, while in symmetric mode the padded regions
+// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
+// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
+// it is `[1, 2, 3, 3, 2]` in symmetric mode.
+//
+// Returns The padded tensor.
+func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "MirrorPad",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			input, paddings,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// Pads a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// This operation pads `input` according to the `paddings` and `constant_values`
+// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many padding values to add before the contents of `input` in that dimension,
+// and `paddings[D, 1]` indicates how many padding values to add after the contents
+// of `input` in that dimension. `constant_values` is a scalar tensor of the same
+// type as `input` that indicates the value to use for padding `input`.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// The padded size of each dimension D of the output is:
 //
-// Arguments:
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// For example:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # 'constant_values' is 0
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "PadV2",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input, paddings, constant_values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// This is typically used by gradient computations for a broadcasting operation.
+func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "BroadcastGradientArgs",
 		Input: []tf.Input{
-			handle,
+			s0, s1,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// For example:
+// Returns the gradient of `Tile`.
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
 //
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
+// Since `Tile` takes an input and repeats the input `multiples` times
+// along each dimension, `TileGrad` takes in `multiples` and aggregates
+// each repeated tile of `input` into `output`.
+func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TileGrad",
+		Input: []tf.Input{
+			input, multiples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Constructs a tensor by tiling a given tensor.
 //
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
 //
 // Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
+		Type: "Tile",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			input, multiples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
 //
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1251,9 +1448,9 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "StridedSliceGrad",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			shape, begin, end, strides, dy,
 		},
 		Attrs: attrs,
 	}
@@ -1261,201 +1458,253 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+// StridedSliceAttr is an optional argument to StridedSlice.
+type StridedSliceAttr func(optionalAttr)
+
+// StridedSliceBeginMask sets the optional begin_mask attribute to value.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Bincount",
-		Input: []tf.Input{
-			arr, size, weights,
-		},
+// value: a bitmask where a bit i being 1 means to ignore the begin
+// value and instead use the largest interval possible. At runtime
+// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
+// `[-1, n-1]` if `stride[i] < 0`
+// If not specified, defaults to 0
+func StridedSliceBeginMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// For example:
+// StridedSliceEndMask sets the optional end_mask attribute to value.
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// value: analogous to `begin_mask`
+// If not specified, defaults to 0
+func StridedSliceEndMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
 //
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
+// value: a bitmask where bit `i` being 1 means the `i`th
+// position is actually an ellipsis. One bit at most can be 1.
+// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
+// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
+// implicitly creates as many range specifications as necessary to fully
+// specify the sliced range for every dimension. For example for a 4-dimensional
+// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
+// If not specified, defaults to 0
+func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+//
+// value: a bitmask where bit `i` being 1 means the `i`th
+// specification creates a new shape 1 dimension. For example
+// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+// If not specified, defaults to 0
+func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Sinh",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+//
+// value: a bitmask where bit `i` implies that the `i`th
+// specification should shrink the dimensionality. begin and end
+// must imply a slice of size 1 in the dimension. For example in
+// python one might do `foo[:, 3, :]` which would result in
+// `shrink_axis_mask` being 2.
+// If not specified, defaults to 0
+func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
+// Return a strided slice from `input`.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Note, most python users will want to use the Python `Tensor.__getitem__`
+// or `Variable.__getitem__` rather than this op directly.
 //
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// The goal of this op is to produce a new tensor with a subset of
+// the elements from the `n` dimensional `input` tensor. The subset is chosen using
+// a sequence of `m` sparse range specifications encoded into the arguments
+// of this function. Note, in some cases
+// `m` could be equal to `n`, but this need not be the case. Each
+// range specification entry can be one of the following:
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
+// - An ellipsis (...). Ellipses are used to imply zero or more
+//   dimensions of full-dimension selection and are produced using
+//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// - A new axis. This is used to insert a new shape=1 dimension and is
+//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
+//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
 //
-// Arguments:
+// - A range `begin:end:stride`. This is used to specify how much to choose from
+//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
+//   which represents the index of the first value to select while `end` represents
+//   the index of the last value to select. The number of values selected in each
+//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
+//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
+//   the second to last. `begin_mask` controls whether to replace the explicitly
+//   given `begin` with an implicit effective value of `0` if `stride > 0` and
+//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
+//   required to create the largest open interval. For example, given a shape
+//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
+//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
+//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
+//   first dimension of a tensor while dropping the last two (in the original
+//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// - A single index. This is used to keep only elements that have a given
+//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
+//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
+//   `shrink_axis_mask`.
 //
+// Each conceptual range specification is encoded in the op's argument. This
+// encoding is best understand by considering a non-trivial example. In
+// particular,
+// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// ```
+// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
+// end = [2, 4, x, x, -3, x]
+// strides = [1, 1, x, x, -1, 1]
+// begin_mask = 1<<4 | 1 << 5 = 48
+// end_mask = 1<<5 = 32
+// ellipsis_mask = 1<<3 = 8
+// new_axis_mask = 1<<2 4
+// shrink_axis_mask = 1<<0
+// ```
+//
+// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
+// the slice becomes (2, 1, 5, 5, 2, 5).
+// Let us walk step by step through each argument specification.
+//
+// 1.  The first argument in the example slice is turned into `begin = 1` and
+// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
+// also set the appropriate bit in `shrink_axis_mask`.
+//
+// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
+// zero bits contributed.
+//
+// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
+// dimension in the final shape. Dummy values are contributed to begin,
+// end and stride, while the new_axis_mask bit is set.
+//
+// 4. `...` grab the full ranges from as many dimensions as needed to
+// fully specify a slice for every dimension of the input shape.
+//
+// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
+// with a dimension that has shape `s` is converted to a positive index
+// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
+// is done internally so begin, end and strides receive x, -3, and -1.
+// The appropriate begin_mask bit is set to indicate the start range is the
+// full range (ignoring the x).
+//
+// 6. `:` indicates that the entire contents of the corresponding dimension
+// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
+// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
+// `end_mask` are also set.
+//
+// *Requirements*:
+//   `0 != strides[i] for i in [0, m)`
+//   `ellipsis_mask must be a power of two (only one ellipsis)`
+//
+// Arguments:
+//
+//	begin: `begin[k]` specifies the offset into the `k`th range specification.
+// The exact dimension this corresponds to will be determined by context.
+// Out-of-bounds values will be silently clamped. If the `k`th bit of
+// `begin_mask` then `begin[k]` is ignored and the full range of the
+// appropriate dimension is used instead. Negative values causes indexing
+// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
+//	end: `end[i]` is like `begin` with the exception that `end_mask` is
+// used to determine full ranges.
+//	strides: `strides[i]` specifies the increment in the `i`th specification
+// after extracting a given element. Negative indices will reverse
+// the original order. Out or range values are
+// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
+func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "StridedSlice",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input, begin, end, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Return a slice from 'input'.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
+//
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+//
+// Arguments:
+//
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "Slice",
 		Input: []tf.Input{
-			x,
+			input, begin, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["out_type"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Returns the size of a tensor.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// This operation returns an integer representing the number of elements in
+// `input`.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// For example:
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// ```
+// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1464,9 +1713,9 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "Size",
 		Input: []tf.Input{
-			a, b,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -1474,213 +1723,254 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
+// Returns the rank of a tensor.
 //
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
+// This operation returns an integer representing the rank of `input`.
 //
 // For example:
 //
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
-//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
 // ```
 //
-// Arguments:
-//
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
-//
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "Rank",
 		Input: []tf.Input{
-			condition, x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalOr",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+// Reverses variable length slices.
 //
-// The regularized incomplete beta integral is defined as:
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
 //
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
 //
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
 //
-// where
+// For example:
 //
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
 //
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Betainc",
-		Input: []tf.Input{
-			a, b, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// In contrast, if:
 //
-// N is the size of the segment being reduced.
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
 //
 // Arguments:
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "ReverseSequence",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			input, seq_lengths,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+// UniqueWithCountsV2Attr is an optional argument to UniqueWithCountsV2.
+type UniqueWithCountsV2Attr func(optionalAttr)
+
+// UniqueWithCountsV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsV2OutIdx(value tf.DataType) UniqueWithCountsV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements along an axis of a tensor.
 //
-// The upper regularized incomplete Gamma function is defined as:
+// This operation either returns a tensor `y` containing unique elements
+// along the `axis` of a tensor. The returned unique elements is sorted
+// in the same order as they occur along `axis` in `x`.
+// This operation also returns a tensor `idx` and a tensor `count`
+// that are the same size as the number of the elements in `x` along the
+// `axis` dimension. The `idx` contains the index in the unique output `y`
+// and the `count` contains the count in the unique output `y`.
+// In other words, for an `1-D` tensor `x` with `axis = None:
 //
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// where
+// For example:
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
 //
-// is the upper incomplete Gama function.
+// For an `2-D` tensor `x` with `axis = 0`:
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx, count = unique_with_counts(x, axis=0)
+// y ==> [[1, 0, 0],
+//        [2, 0, 0]]
+// idx ==> [0, 0, 1]
+// count ==> [2, 1]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 1`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx, count = unique_with_counts(x, axis=1)
+// y ==> [[1, 0],
+//        [1, 0],
+//        [2, 0]]
+// idx ==> [0, 1, 1]
+// count ==> [1, 2]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int32` (default: None). The axis of the Tensor to
+// find the unique elements.
+//
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.A 1-D Tensor. The count of each value of x in the output y.
+func UniqueWithCountsV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueWithCountsV2Attr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "UniqueWithCountsV2",
 		Input: []tf.Input{
-			a, x,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
-type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["out_idx"] = value
 	}
 }
 
-// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+// Finds unique elements in a 1-D tensor.
 //
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
 //
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
+// For example:
 //
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
 //
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1689,9 +1979,9 @@ func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
+		Type: "UniqueWithCounts",
 		Input: []tf.Input{
-			gradients, inputs, min, max,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -1699,92 +1989,123 @@ func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs t
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["out_idx"] = value
 	}
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Finds unique elements along an axis of a tensor.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a log-uniform distribution.
+// This operation either returns a tensor `y` containing unique elements
+// along the `axis` of a tensor. The returned unique elements is sorted
+// in the same order as they occur along `axis` in `x`.
+// This operation also returns a tensor `idx` that is the same size as
+// the number of the elements in `x` along the `axis` dimension. It
+// contains the index in the unique output `y`.
+// In other words, for an `1-D` tensor `x` with `axis = None:
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// For example:
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 0`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx = unique(x, axis=0)
+// y ==> [[1, 0, 0],
+//        [2, 0, 0]]
+// idx ==> [0, 0, 1]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 1`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx = unique(x, axis=1)
+// y ==> [[1, 0],
+//        [1, 0],
+//        [2, 0]]
+// idx ==> [0, 1, 1]
+// ```
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int32` (default: None). The axis of the Tensor to
+// find the unique elements.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "UniqueV2",
 		Input: []tf.Input{
-			true_classes,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
+// UniqueAttr is an optional argument to Unique.
+type UniqueAttr func(optionalAttr)
 
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// UniqueOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueOutIdx(value tf.DataType) UniqueAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["out_idx"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.
+func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1793,183 +2114,299 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "Unique",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns x / y element-wise.
+// Shuffle dimensions of x according to a permutation and conjugate the result.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "ConjugateTranspose",
 		Input: []tf.Input{
-			x, y,
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// Reshapes a tensor.
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
+//
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+//
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
+//
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
+//
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+//
+// # -1 can also be used to infer the shape
+//
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "Reshape",
 		Input: []tf.Input{
-			x, y,
+			tensor, shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
-
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// Checks a tensor for NaN and Inf values.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// Gather slices from `params` into a Tensor with shape specified by `indices`.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Adds `bias` to `value`.
+// `indices` is an K-dimensional integer tensor, best thought of as a
+// (K-1)-dimensional tensor of indices into `params`, where each element defines a
+// slice of `params`:
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+//
+// Whereas in @{tf.gather} `indices` defines slices into the first
+// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+//
+// The last dimension of `indices` can be at most the rank of
+// `params`:
+//
+//     indices.shape[-1] <= params.rank
+//
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] == params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
+//
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// Some examples below.
+//
+// Simple indexing into a matrix:
+//
+// ```python
+//     indices = [[0, 0], [1, 1]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = ['a', 'd']
+// ```
+//
+// Slice indexing into a matrix:
+//
+// ```python
+//     indices = [[1], [0]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['c', 'd'], ['a', 'b']]
+// ```
+//
+// Indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['a1', 'b1'], ['c1', 'd1']]]
+//
+//
+//     indices = [[0, 1], [1, 0]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['c0', 'd0'], ['a1', 'b1']]
+//
+//
+//     indices = [[0, 0, 1], [1, 0, 1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = ['b0', 'b1']
+// ```
+//
+// Batched indexing into a matrix:
+//
+// ```python
+//     indices = [[[0, 0]], [[0, 1]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['a'], ['b']]
+// ```
+//
+// Batched slice indexing into a matrix:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [[['c', 'd']], [['a', 'b']]]
+// ```
+//
+// Batched indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
+//               [[['a0', 'b0'], ['c0', 'd0']]]]
+//
+//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['c0', 'd0'], ['a1', 'b1']],
+//               [['a0', 'b0'], ['c1', 'd1']]]
+//
+//
+//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['b0', 'b1'], ['d0', 'c1']]
+// ```
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
+func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "GatherNd",
 		Input: []tf.Input{
-			value, bias,
+			params, indices,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
+// Gather slices from `params` according to `indices`.
 //
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Arguments:
-//	out_backprop: Any number of dimensions.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+//
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
+//
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1978,9 +2415,9 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "Gather",
 		Input: []tf.Input{
-			out_backprop,
+			params, indices,
 		},
 		Attrs: attrs,
 	}
@@ -1988,73 +2425,107 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Creates a tensor filled with a scalar value.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
+// This operation creates a tensor of shape `dims` and fills it with `value`.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// For example:
+//
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
+//
+// Arguments:
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "Fill",
 		Input: []tf.Input{
-			x, y,
+			dims, value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// NthElementReverse sets the optional reverse attribute to value.
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["normalize"] = value
 	}
 }
 
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
 //
-//     values.shape = input.shape[:-1]
+// The inputs are:
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2063,9 +2534,9 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			input, n,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
@@ -2073,157 +2544,138 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
-//
-// \\(output_i = \max_j data_j\\) where max is over `j` such
-// that `segment_ids[j] == i`.
+// Clips tensor values to a specified min and max.
 //
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
 //
 // Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.
-//
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "ClipByValue",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			t, clip_value_min, clip_value_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a sequence of numbers.
+//
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
+//
+// For example:
+//
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
+//
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exp",
+		Type: "Range",
 		Input: []tf.Input{
-			x,
+			start, limit, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			x,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
-
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x + y element-wise, working on quantized buffers.
+// Computes the mean along sparse segments of a tensor.
 //
-// Arguments:
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Arguments:
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
-
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Pop the element at the top of the stack.
 //
 // Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			input, dimension,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -2231,103 +2683,111 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// Computes the sum along sparse segments of a tensor.
 //
-// output range specified with 'requested_output_min' and 'requested_output_max'.
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// Arguments:
+// For example:
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "Requantize",
-		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the determinant of one or more square matrices.
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
+//
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+// An identity op that triggers an error if a gradient is requested.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Erfc",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
-//
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "Asin",
 		Input: []tf.Input{
 			x,
 		},
@@ -2336,74 +2796,64 @@ func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
 // If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// Converts a sparse representation into a dense tensor.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// Builds an array `dense` with shape `output_shape` such that
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the filter.
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
@@ -2411,243 +2861,290 @@ func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output,
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// Computes the sum along sparse segments of a tensor.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			reader_handle,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
-//
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
+		Type: "Sinh",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// Computes the minimum along segments of a tensor.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the minimum such that:
+//
+// \\(output_i = \min_j data_j\\) where min is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the minimum is empty for a given segment ID `i`, it outputs the largest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::max()`.
 //
 // Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "UnsortedSegmentMin",
 		Input: []tf.Input{
-			l, grad,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acosh",
+		Type: "Relu6",
 		Input: []tf.Input{
-			x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
-
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
 
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["end_mask"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["ellipsis_mask"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["new_axis_mask"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
+//
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			size,
+			ref, begin, end, strides, value,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			input, dimension,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "IsFinite",
 		Input: []tf.Input{
 			x,
 		},
@@ -2656,337 +3153,302 @@ func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Multiply the matrix "a" by the matrix "b".
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
+//
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "MatMul",
 		Input: []tf.Input{
-			x,
+			a, b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// Selects elements from `x` or `y`, depending on `condition`.
+//
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "Select",
 		Input: []tf.Input{
-			gradients, features,
+			condition, x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
-
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// Returns the truth value of x OR y element-wise.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// Input images can be of different types but output images are always float.
+// The regularized incomplete beta integral is defined as:
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "Betainc",
 		Input: []tf.Input{
-			images, size,
+			a, b, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
-//
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log",
+		Type: "Identity",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "Atan2",
 		Input: []tf.Input{
-			x,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
-
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
+// Creates a dataset that passes a sliding window over `input_dataset`.
 //
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_random_seed"] = value
-	}
-}
-
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
+// Arguments:
 //
-// value: Shifts the list of files after the list is randomly
-// shuffled.
-// If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
-	}
-}
-
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	stride: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be in `[1, window_size)`.
 //
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
-	}
-}
-
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_parallelism"] = value
+func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RecordInputBatchSize sets the optional batch_size attribute to value.
-//
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["batch_size"] = value
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SlideDataset",
+		Input: []tf.Input{
+			input_dataset, window_size, stride,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RecordInputCompressionType sets the optional compression_type attribute to value.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// value: The type of compression for the file. Currently ZLIB and
-// GZIP are supported. Defaults to none.
-// If not specified, defaults to ""
-func RecordInputCompressionType(value string) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// Emits randomized records.
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	file_pattern: Glob pattern for the data files.
 //
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
-		Attrs: attrs,
+		Type: "SparseSegmentSqrtNWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rsqrt",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inserts a dimension of 1 into a tensor's shape.
-//
-// Given a tensor `input`, this operation inserts a dimension of 1 at the
-// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
-// zero; if you specify a negative number for `axis` it is counted backward from
-// the end.
-//
-// This operation is useful if you want to add a batch dimension to a single
-// element. For example, if you have a single image of shape `[height, width,
-// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-// which will make the shape `[1, height, width, channels]`.
-//
-// Other examples:
-//
-// ```
-// # 't' is a tensor of shape [2]
-// shape(expand_dims(t, 0)) ==> [1, 2]
-// shape(expand_dims(t, 1)) ==> [2, 1]
-// shape(expand_dims(t, -1)) ==> [2, 1]
-//
-// # 't2' is a tensor of shape [2, 3, 5]
-// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-// ```
-//
-// This operation requires that:
+// The upper regularized incomplete Gamma function is defined as:
 //
-// `-1-input.dims() <= dim <= input.dims()`
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
 //
-// This operation is related to `squeeze()`, which removes dimensions of
-// size 1.
+// where
 //
-// Arguments:
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
 //
-//	axis: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`. Must be in the range
-// `[-rank(input) - 1, rank(input)]`.
+// is the upper incomplete Gama function.
 //
-// Returns Contains the same data as `input`, but its shape has an additional
-// dimension of size 1 added.
-func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExpandDims",
+		Type: "Igammac",
 		Input: []tf.Input{
-			input, axis,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["tolerance"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
-//
-// The op uses LU decomposition with partial pivoting to compute the inverses.
-//
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2995,9 +3457,9 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -3005,335 +3467,325 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Returns x / y element-wise.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "Div",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// Returns x * y element-wise.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "Mul",
 		Input: []tf.Input{
-			features,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Adds `bias` to `value`.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			x,
+			value, bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapClearAttr is an optional argument to OrderedMapClear.
-type OrderedMapClearAttr func(optionalAttr)
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
 
-// OrderedMapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearContainer(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
 //
-// Returns the created operation.
-func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapClear",
-
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the reciprocal of x element-wise.
+// Returns x + y element-wise.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "AddV2",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
+// Returns x + y element-wise.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "Add",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Finds values of the `n`-th order statistic for the last dimension.
+//
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
+//
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+//
+//     values.shape = input.shape[:-1]
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "NthElement",
 		Input: []tf.Input{
-			x, y,
+			input, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
+//
+// \\(output_i = \max_j data_j\\) where max is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			x,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "Exp",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Quantized Batch normalization.
-//
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
+// Returns an element-wise indication of the sign of a number.
 //
-// Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
 //
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "Sign",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["output_type"] = value
 	}
 }
 
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3342,9 +3794,9 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -3352,29 +3804,33 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 	return op.Output(0)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// output range specified with 'requested_output_min' and 'requested_output_max'.
 //
-// Arguments:
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
+// Arguments:
 //
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "Requantize",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
 		Attrs: attrs,
 	}
@@ -3382,216 +3838,208 @@ func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input t
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Produces the average pool of the input tensor for quantized types.
+// Computes the determinant of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
-//
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
+		Type: "Sin",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
-
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+	opspec := tf.OpSpec{
+		Type: "Erfc",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Digamma",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
+// Shuffle dimensions of x according to a permutation.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Transpose",
+		Input: []tf.Input{
+			x, perm,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
+
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Performs fractional average pooling on the input.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "Min",
 		Input: []tf.Input{
-			value,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
 
-// RandomCropSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Randomly crop `image`.
-//
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
-//
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			image, size,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -3599,257 +4047,195 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
-
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
+// Returns the number of work units this Reader has finished processing.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "ReaderNumWorkUnitsCompletedV2",
 		Input: []tf.Input{
-			input, k,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns x // y element-wise.
-//
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-//
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// and diagonal.shape = (2, 4)
-//
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// which has shape (2, 4, 4)
-// ```
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			diagonal,
+			l, grad,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
+// Computes the mean along sparse segments of a tensor.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			predictions, targets,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
-//
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "Cosh",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
+// Computes natural logarithm of (1 + x) element-wise.
 //
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "Relu6Grad",
 		Input: []tf.Input{
-			logits,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
 
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Resize `images` to `size` using bicubic interpolation.
 //
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	contents: 0-D.  The BMP-encoded image.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3858,9 +4244,9 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "ResizeBicubic",
 		Input: []tf.Input{
-			contents,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -3868,146 +4254,138 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+// Computes natural logarithm of x element-wise.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "Log",
 		Input: []tf.Input{
-			gradients, features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Round",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
+
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
 //
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["file_random_seed"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
 //
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
+// value: Shifts the list of files after the list is randomly
+// shuffled.
+// If not specified, defaults to 0
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_shuffle_shift_ratio"] = value
+	}
+}
+
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
 //
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
+	}
+}
+
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
 //
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
+	}
+}
+
+// RecordInputBatchSize sets the optional batch_size attribute to value.
 //
-// It is computed as:
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["batch_size"] = value
+	}
+}
+
+// RecordInputCompressionType sets the optional compression_type attribute to value.
 //
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// value: The type of compression for the file. Currently ZLIB and
+// GZIP are supported. Defaults to none.
+// If not specified, defaults to ""
+func RecordInputCompressionType(value string) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Emits randomized records.
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+		Type: "RecordInput",
 
-// Pads a tensor.
-//
-// This operation pads `input` according to the `paddings` and `constant_values`
-// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many padding values to add before the contents of `input` in that dimension,
-// and `paddings[D, 1]` indicates how many padding values to add after the contents
-// of `input` in that dimension. `constant_values` is a scalar tensor of the same
-// type as `input` that indicates the value to use for padding `input`.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # 'constant_values' is 0
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "PadV2",
-		Input: []tf.Input{
-			input, paddings, constant_values,
-		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Computes reciprocal of square root of x element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "Rsqrt",
 		Input: []tf.Input{
 			x,
 		},
@@ -4016,46 +4394,40 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
 // If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// Computes the inverse of one or more square invertible matrices or their
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// adjoints (conjugate transposes).
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
+//
+// The op uses LU decomposition with partial pivoting to compute the inverses.
+//
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4064,9 +4436,9 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -4074,333 +4446,217 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// Computes square of x element-wise.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "Square",
 		Input: []tf.Input{
-			gradients, outputs,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
-//
-//	num_buckets: The number of buckets.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "Elu",
 		Input: []tf.Input{
-			string_tensor,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// Computes the reciprocal of x element-wise.
 //
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
-//
-//
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			input_dataset, count,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6",
-		Input: []tf.Input{
-			features,
-		},
+// OrderedMapClearAttr is an optional argument to OrderedMapClear.
+type OrderedMapClearAttr func(optionalAttr)
+
+// OrderedMapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
+// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
+// REQUIRES: value >= 0
+func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearContainer(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns the created operation.
+func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReluGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+		Type: "OrderedMapClear",
+
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// Computes the reciprocal of x element-wise.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "Inv",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["merge_repeated"] = value
+		m["Tout"] = value
 	}
 }
 
-// Performs beam search decoding on the logits given in input.
-//
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
-//
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+// Computes the complex absolute value of a tensor.
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
-}
-
-// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
-type AudioSpectrogramAttr func(optionalAttr)
-
-// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
-//
-// value: Whether to return the squared magnitude or just the
-// magnitude. Using squared magnitude can avoid extra calculations.
-// If not specified, defaults to false
-func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
-	return func(m optionalAttr) {
-		m["magnitude_squared"] = value
-	}
+	return op.Output(0)
 }
 
-// Produces a visualization of audio data over time.
-//
-// Spectrograms are a standard way of representing audio information as a series of
-// slices of frequency information, one slice for each window of time. By joining
-// these together into a sequence, they form a distinctive fingerprint of the sound
-// over time.
-//
-// This op expects to receive audio data as an input, stored as floats in the range
-// -1 to 1, together with a window width in samples, and a stride specifying how
-// far to move the window between slices. From this it generates a three
-// dimensional output. The lowest dimension has an amplitude value for each
-// frequency during that time slice. The next dimension is time, with successive
-// frequency slices. The final dimension is for the channels in the input, so a
-// stereo audio input would have two here for example.
-//
-// This means the layout when converted and saved as an image is rotated 90 degrees
-// clockwise from a typical spectrogram. Time is descending down the Y axis, and
-// the frequency decreases from left to right.
-//
-// Each value in the result represents the square root of the sum of the real and
-// imaginary parts of an FFT on the current window of samples. In this way, the
-// lowest dimension represents the power of each frequency in the current window,
-// and adjacent windows are concatenated in the next dimension.
-//
-// To get a more intuitive and visual look at what this operation does, you can run
-// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-// resulting spectrogram as a PNG image.
-//
-// Arguments:
-//	input: Float representation of audio data.
-//	window_size: How wide the input window is in samples. For the highest efficiency
-// this should be a power of two, but other values are accepted.
-//	stride: How widely apart the center of adjacent sample windows should be.
+// Returns the truth value of x AND y element-wise.
 //
-// Returns 3D representation of the audio frequencies as an image.
-func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSpectrogram",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
+// Checks whether a tree ensemble has been initialized.
 //
-// The polygamma function is defined as:
-//
-//
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resouce.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns output boolean on whether it is initialized or not.
+func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "IsBoostedTreesEnsembleInitialized",
 		Input: []tf.Input{
-			a, x,
+			tree_ensemble_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "Cast",
 		Input: []tf.Input{
-			input, grad, argmax,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -4408,47 +4664,44 @@ func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, ar
 	return op.Output(0)
 }
 
-// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
-type MaxPoolGradGradV2Attr func(optionalAttr)
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes the maximum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradV2",
+		Type: "Max",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -4456,232 +4709,259 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// Quantized Batch normalization.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			input,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
 // Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			input,
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes gradients of the maxpooling function.
+// Produces the average pool of the input tensor for quantized types.
 //
 // Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
 //	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
 //	padding: The type of padding algorithm to use.
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "QuantizedAvgPool",
 		Input: []tf.Input{
-			input, grad, argmax,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// CriticalSectionOpAttr is an optional argument to CriticalSectionOp.
-type CriticalSectionOpAttr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// CriticalSectionOpContainer sets the optional container attribute to value.
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// value: the container this critical section is placed in.
-// If not specified, defaults to ""
-func CriticalSectionOpContainer(value string) CriticalSectionOpAttr {
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// CriticalSectionOpSharedName sets the optional shared_name attribute to value.
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
-// value: the name by which this critical section is referred to.
-// If not specified, defaults to ""
-func CriticalSectionOpSharedName(value string) CriticalSectionOpAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Creates a handle to a CriticalSection resource.
-func CriticalSectionOp(scope *Scope, optional ...CriticalSectionOpAttr) (resource tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "CriticalSectionOp",
+}
 
-		Attrs: attrs,
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["min"] = value
+		m["seed2"] = value
 	}
 }
 
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
+// Performs fractional average pooling on the input.
+//
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalAvgPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
+
+// RandomCropSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["seed"] = value
 	}
 }
 
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+// RandomCropSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["seed2"] = value
 	}
 }
 
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+// Randomly crop `image`.
+//
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
 //
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4690,9 +4970,9 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			gradients, inputs,
+			image, size,
 		},
 		Attrs: attrs,
 	}
@@ -4700,66 +4980,68 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t
 	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
 
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// TopKV2Sorted sets the optional sorted attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["sorted"] = value
 	}
 }
 
-// Performs 3D average pooling on the input.
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			input,
+			input, k,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+// Returns x // y element-wise.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mod",
+		Type: "FloorDiv",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -4768,282 +5050,193 @@ func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
+// Returns a batched diagonal tensor with a given batched diagonal values.
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
+//
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+//
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// and diagonal.shape = (2, 4)
+//
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
+//
+// which has shape (2, 4, 4)
+// ```
+//
+// Arguments:
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sqrt",
+		Type: "MatrixDiag",
 		Input: []tf.Input{
-			x,
+			diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Computes the inverse permutation of a tensor.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
+//
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+//
+// The values must include 0. There can be no duplicate values or negative values.
+//
+// For example:
+//
+// ```
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
+// ```
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Computes log softmax activations.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "LogSoftmax",
 		Input: []tf.Input{
-			y, dy,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
-
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
-//
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
-	return func(m optionalAttr) {
-		m["batch_dim"] = value
-	}
-}
-
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
-//
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
-//
-// Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			input, seq_lengths,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
+// Computes softmax activations.
 //
-// ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-// ```
+// For each batch `i` and class `j` we have
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
 //
 // Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "Softmax",
 		Input: []tf.Input{
-			input, filter,
+			logits,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
-//
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["channels"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-// All elements selected by `indices` must have the same shape.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -5051,187 +5244,178 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			input,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// Provides the time since epoch in seconds.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "Timestamp",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs all keys and values in the table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
 //
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			table_handle,
+			x, y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+// Returns which elements of x are NaN.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.rfft
+// Equivalent to np.isnan
 // @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "IsNan",
 		Input: []tf.Input{
-			input, fft_length,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
-//
-// For example:
+// Identity op for gradient debugging.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "DebugGradientIdentity",
 		Input: []tf.Input{
-			real, imag,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
 
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
+// var: Should be from a Variable().
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Arguments:
 //
-// For example:
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5240,99 +5424,59 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			input,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
+// Computes rectified linear gradients for a Relu operation.
 //
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "ReluGrad",
 		Input: []tf.Input{
-			x, q,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
-//
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNGradAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -5340,150 +5484,190 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
+// Performs beam search decoding on the logits given in input.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "CTCBeamSearchDecoder",
 		Input: []tf.Input{
-			input, axis,
+			inputs, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	log_probability = op.Output(idx)
+	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
+type AudioSpectrogramAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: Whether to return the squared magnitude or just the
+// magnitude. Using squared magnitude can avoid extra calculations.
 // If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["magnitude_squared"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Produces a visualization of audio data over time.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Spectrograms are a standard way of representing audio information as a series of
+// slices of frequency information, one slice for each window of time. By joining
+// these together into a sequence, they form a distinctive fingerprint of the sound
+// over time.
+//
+// This op expects to receive audio data as an input, stored as floats in the range
+// -1 to 1, together with a window width in samples, and a stride specifying how
+// far to move the window between slices. From this it generates a three
+// dimensional output. The lowest dimension has an amplitude value for each
+// frequency during that time slice. The next dimension is time, with successive
+// frequency slices. The final dimension is for the channels in the input, so a
+// stereo audio input would have two here for example.
+//
+// This means the layout when converted and saved as an image is rotated 90 degrees
+// clockwise from a typical spectrogram. Time is descending down the Y axis, and
+// the frequency decreases from left to right.
+//
+// Each value in the result represents the square root of the sum of the real and
+// imaginary parts of an FFT on the current window of samples. In this way, the
+// lowest dimension represents the power of each frequency in the current window,
+// and adjacent windows are concatenated in the next dimension.
+//
+// To get a more intuitive and visual look at what this operation does, you can run
+// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+// resulting spectrogram as a PNG image.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: Float representation of audio data.
+//	window_size: How wide the input window is in samples. For the highest efficiency
+// this should be a power of two, but other values are accepted.
+//	stride: How widely apart the center of adjacent sample windows should be.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+// Returns 3D representation of the audio frequencies as an image.
+func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "AudioSpectrogram",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// The polygamma function is defined as:
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+//
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Polygamma",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			shape,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
@@ -5491,218 +5675,112 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
+type MaxPoolGradGradV2Attr func(optionalAttr)
 
-// AssertSummarize sets the optional summarize attribute to value.
+// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["data_format"] = value
 	}
 }
 
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "MaxPoolGradGradV2",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
-//
-// For each entry in `x`, calculates the number of `1` (on) bits in the binary
-// representation of that entry.
-//
-// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-// `int32` or `int64` and perform the bitcount on the result, than to feed in
-// 8- or 16-bit inputs and then aggregate the resulting counts.
-func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "PopulationCount",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
-}
-
-// Returns the truth value of (x < y) element-wise.
-//
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Less",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
+// MutexV2Attr is an optional argument to MutexV2.
+type MutexV2Attr func(optionalAttr)
 
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
+// MutexV2Container sets the optional container attribute to value.
 //
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
-		Input: []tf.Input{
-			features, max_value, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// SummaryWriterAttr is an optional argument to SummaryWriter.
-type SummaryWriterAttr func(optionalAttr)
-
-// SummaryWriterSharedName sets the optional shared_name attribute to value.
+// value: If non-empty, this variable is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func SummaryWriterSharedName(value string) SummaryWriterAttr {
+func MutexV2Container(value string) MutexV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["container"] = value
 	}
 }
 
-// SummaryWriterContainer sets the optional container attribute to value.
+// MutexV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this variable is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func SummaryWriterContainer(value string) SummaryWriterAttr {
+func MutexV2SharedName(value string) MutexV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Returns a handle to be used to access a summary writer.
+// Creates a Mutex resource that can be locked by `MutexLock`.
 //
-// The summary writer is an in-graph resource which can be used by ops to write
-// summaries to event files.
-//
-// Returns the summary writer resource. Scalar handle.
-func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) {
+// Returns The mutex resource.
+func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5711,7 +5789,7 @@ func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SummaryWriter",
+		Type: "MutexV2",
 
 		Attrs: attrs,
 	}
@@ -5719,103 +5797,192 @@ func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Outpu
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
+
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
-//
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "Mod",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// DepthToSpaceAttr is an optional argument to DepthToSpace.
+type DepthToSpaceAttr func(optionalAttr)
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// DepthToSpaceDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// DepthToSpace for tensors of type T.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
-		Input: []tf.Input{
-			shape, rate,
+// Rearranges data from depth into blocks of spatial data.
+// This is the reverse transformation of SpaceToDepth. More specifically,
+// this op outputs a copy of the input tensor where values from the `depth`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions.
+// The attr `block_size` indicates the input block size and how the data is moved.
+//
+//   * Chunks of data of size `block_size * block_size` from depth are rearranged
+//     into non-overlapping blocks of size `block_size x block_size`
+//   * The width the output tensor is `input_depth * block_size`, whereas the
+//     height is `input_height * block_size`.
+//   * The Y, X coordinates within each block of the output image are determined
+//     by the high order component of the input channel index.
+//   * The depth of the input tensor must be divisible by
+//     `block_size * block_size`.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+//                         within the input image, bX, bY means coordinates
+//                         within the output block, oC means output channels).
+//      The output would be the input transposed to the following layout:
+//      n,iY,bY,iX,bX,oC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1, 2, 3, 4]]]]
+//
+// ```
+//
+// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+//
+// ```
+//    [[[[1], [2]],
+//      [[3], [4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+// the corresponding output will have 2x2 elements and will have a depth of
+// 1 channel (1 = `4 / (block_size * block_size)`).
+// The output element shape is `[2, 2, 1]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// This operation, for block size of 2, will return the following tensor of shape
+// `[1, 2, 2, 3]`
+//
+// ```
+//    [[[[1, 2, 3], [4, 5, 6]],
+//      [[7, 8, 9], [10, 11, 12]]]]
+//
+// ```
+//
+// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+//
+// ```
+// x =  [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 4 4 1]`:
+//
+// ```
+// x = [[[ [1],   [2],  [5],  [6]],
+//       [ [3],   [4],  [7],  [8]],
+//       [ [9],  [10], [13],  [14]],
+//       [ [11], [12], [15],  [16]]]]
+//
+// ```
+//
+// Arguments:
+//
+//	block_size: The size of the spatial block, same as in Space2Depth.
+func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"block_size": block_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthToSpace",
+		Input: []tf.Input{
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -5823,47 +5990,62 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
 
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -5871,175 +6053,202 @@ func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, gr
 	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
+// Computes square root of x element-wise.
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
+		Type: "Sqrt",
 		Input: []tf.Input{
-			reader_handle, state,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dilations"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Associates the given iterator with the given statistics aggregator.
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// Returns the created operation.
-func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorSetStatsAggregator",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			iterator_handle, stats_aggregator_handle,
+			y, dy,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise smallest integer in not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Ceil",
-		Input: []tf.Input{
-			x,
-		},
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
+// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
+//
+// ```
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+// ```
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
 // Arguments:
-//	table_handle: Handle to the table.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "DepthwiseConv2dNative",
 		Input: []tf.Input{
-			table_handle,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			grads, original_image,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -6047,72 +6256,71 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
+// Restore a reader to a previously saved state.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			reader_handle, state,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
 
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			shape, seed,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -6120,94 +6328,66 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["method"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
-//
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
-//
-// Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
-		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
-
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["extrapolation_value"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
 //
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
 //
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
+// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
+// method will give identical results to using `tf.image.resize_bilinear()`
+// with `align_corners=True`.
 //
 // Arguments:
-//	x: 1-D.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6216,123 +6396,175 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			x,
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a normal distribution.
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
+//
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "SparseFillEmptyRows",
 		Input: []tf.Input{
-			shape, seed,
+			indices, values, dense_shape, default_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// Reverses specific dimensions of a tensor.
+//
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
+//
+// `rank(tensor) = size(dims)`
+//
+// For example:
 //
 // ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
 //
-// Arguments:
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
 //
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
+//
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "Reverse",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			tensor, dims,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["data_format"] = value
 	}
 }
 
-// Gather slices from `params` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
+// Arguments:
+//	out_backprop: Any number of dimensions.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6341,9 +6573,9 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			params, indices,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6351,110 +6583,60 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
+
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NotEqual",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
-//
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
-
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
 // If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["is_training"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
-//
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// Batch normalization.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6463,143 +6645,65 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			input, delimiter,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// WriteAudioSummaryAttr is an optional argument to WriteAudioSummary.
-type WriteAudioSummaryAttr func(optionalAttr)
-
-// WriteAudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Writes a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns the created operation.
-func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteAudioSummary",
-		Input: []tf.Input{
-			writer, step, tag, tensor, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes the product of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	tensor: A Tensor of type `T`.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "SerializeTensor",
 		Input: []tf.Input{
-			input, axis,
+			tensor,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
 // If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Solves systems of linear equations.
 //
-// Input images can be of different types but output images are always float.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6608,9 +6712,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			images, size,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -6618,440 +6722,493 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "Acos",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
+		m["signed_input"] = value
 	}
 }
 
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
+// Returns locations of nonzero / true values in a tensor.
 //
-// This operation computes
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
+// For example:
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "Where",
 		Input: []tf.Input{
-			resource, indices, updates,
+			condition,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// CumsumReverse sets the optional reverse attribute to value.
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			x, axis,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
+	}
+	return components
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Erf",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "Floor",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+// OneHotAxis sets the optional axis attribute to value.
 //
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["header_bytes"] = value
+		m["axis"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// Returns a one-hot tensor.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
 //
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
 //
-// Arguments:
-//	record_bytes: Number of bytes in the record.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// The gradient operator for the SparseAdd op.
+// Examples
+// =========
 //
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+//
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
 //
 // Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
-		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "OneHot",
 		Input: []tf.Input{
-			x,
+			indices, depth, on_value, off_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
-//
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
-//
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
-//
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			audio, sample_rate,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+//	bytes: All the elements must have the same length.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			input,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -7059,140 +7216,192 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
-// Generates values in an interval.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
 // For example:
 //
 // ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
 // ```
 //
 // Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			start, stop, num,
+			input, num_lower, num_upper,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["data_format"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	resource: handle to the resource to delete.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			resource,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// CumprodExclusive sets the optional exclusive attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["bias"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7201,9 +7410,9 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			x, axis,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
@@ -7211,91 +7420,86 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "Any",
 		Input: []tf.Input{
-			data, segment_ids,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7304,127 +7508,93 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
+
+// RandomUniformSeed sets the optional seed attribute to value.
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size,
-		},
-		Attrs: attrs,
+}
+
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
+// Outputs random values from a uniform distribution.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			input,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["summarize"] = value
 	}
 }
 
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// Asserts that the given condition is true.
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
+//
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7433,173 +7603,164 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "Assert",
 		Input: []tf.Input{
-			input,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
+//
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
+//
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "PopulationCount",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
-//
-// Arguments:
-//	writer: Handle of `SummaryWriter`.
-//	step: The step to write the summary for.
-//	tensor: A scalar string of the serialized tf.GraphDef proto.
-//
-// Returns the created operation.
-func WriteGraphSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output) (o *tf.Operation) {
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "WriteGraphSummary",
+		Type: "CollectiveBcastSend",
 		Input: []tf.Input{
-			writer, step, tensor,
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Makes a copy of `x`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	x: The source tensor of type `T`.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+// Returns     y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+//       is not an alias of `x`.
+func DeepCopy(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "DeepCopy",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Graphically the output tensors are:
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			input, fft_length,
+			split_dim, indices, values, shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["seed"] = value
 	}
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7608,9 +7769,9 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			images, size,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -7618,32 +7779,46 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7652,47 +7827,47 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			shape, seed,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["src_format"] = value
 	}
 }
 
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// For example:
+// one in the source data format.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
-// ```
+// Arguments:
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7701,9 +7876,9 @@ func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -7711,174 +7886,126 @@ func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
-
-// VarHandleOpContainer sets the optional container attribute to value.
+// Reads the value of a variable.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// The tensor returned by this operation is immutable.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "ReadVariableOp",
+		Input: []tf.Input{
+			resource,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise XOR of `x` and `y`.
-//
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
+		Type: "Tan",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
+// Updates the tree ensemble by either adding a layer to the last tree being grown
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// or by starting a new tree.
 //
 // Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "BoostedTreesUpdateEnsemble",
 		Input: []tf.Input{
-			serialized_sparse,
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
+// value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7887,246 +8014,175 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
-
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the size of a tensor.
-//
-// This operation returns an integer representing the number of elements in
-// `input`.
-//
-// For example:
+// Returns which elements of x are Inf.
 //
-// ```
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Size",
+		Type: "IsInf",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
-//
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+// N is the size of the segment being reduced.
 //
-// See @{tf.scatter_nd} for more details about how to make updates to
-// slices.
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
 //
-// Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			ref, indices, updates,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
-
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
+
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dtype"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageSize",
-
+		Type: "StatelessTruncatedNormal",
+		Input: []tf.Input{
+			shape, seed,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Restores a tensor from checkpoint files.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
+//
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
@@ -8134,91 +8190,77 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// Returns the imaginary part of a complex number.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// For example:
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "Imag",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["Tout"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Converts two real numbers to a complex number.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+// The input tensors `real` and `imag` must have the same shape.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8227,75 +8269,82 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "Complex",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are Inf.
+// Divides sparse updates into the variable referenced by `resource`.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "ResourceScatterDiv",
 		Input: []tf.Input{
-			x,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dtype"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// The generated values will have mean 0 and standard deviation 1.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8304,25 +8353,92 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMin",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedReshape",
+		Input: []tf.Input{
+			tensor, shape, input_min, input_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the truth value of (x != y) element-wise.
+//
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "NotEqual",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -8331,141 +8447,156 @@ func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
+
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
 	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+		m["skip_empty"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
+//
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringSplit",
+		Input: []tf.Input{
+			input, delimiter,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["use_locking"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
 // If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8474,235 +8605,247 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise for integer types.
+// Returns the complex conjugate of a complex number.
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "Conj",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
-//
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
-	}
-	return tensors
+	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
-//
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
-//
-// Arguments:
-//	input: Base64 strings to decode.
-//
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "Softsign",
 		Input: []tf.Input{
-			input,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Creates a TensorList which, when stacked, has the value of `tensor`.
 //
-// Arguments:
-//	value: The tensor to be stored.
+// Each tensor in the result list corresponds to one row of the input tensor.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "TensorListFromTensor",
 		Input: []tf.Input{
-			value,
+			tensor, element_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["old_vocab_size"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
+//
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
+//
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
+//
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
+//
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Assigns sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "ResourceScatterUpdate",
 		Input: []tf.Input{
-			x,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
+// Creates and returns an empty tensor list.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "EmptyTensorList",
 		Input: []tf.Input{
-			resource,
+			element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -8710,35 +8853,35 @@ func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value
 	return op.Output(0)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
 //	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8747,9 +8890,9 @@ func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, gr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -8757,469 +8900,367 @@ func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, gr
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
-
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
+		Type: "StageClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes the ids of the positions in sampled_candidates that match true_labels.
+//
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "ComputeAccidentalHits",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			true_classes, sampled_candidates,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// Arguments:
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			x, y,
+			features, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
+
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["header_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+}
+
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseExample",
-		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
-
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["encoding"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// A Reader that outputs fixed-length records from a file.
 //
-// For example:
+// Arguments:
+//	record_bytes: Number of bytes in the record.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
-//
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-//
-//     empty_row_indicator[i] = True iff row i was an empty row.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
+// The hash function is deterministic on the content of the string within the
+// process.
 //
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
 // Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
 //
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
-//
-// `rank(tensor) = size(dims)`
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// Computes gradients for the exponential linear (Elu) operation.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			tensor, dims,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-// For each batch `i` and class `j` we have
+// Arguments:
 //
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			logits,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
+// The gradient operator for the SparseAdd op.
 //
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	x: 1-D.
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
 //
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan",
 		Input: []tf.Input{
 			x,
 		},
@@ -9228,108 +9269,96 @@ func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-//
-// This operation folds the padded areas of `input` by `MirrorPad` according to the
-// `paddings` you specify. `paddings` must be the same as `paddings` argument
-// given to the corresponding `MirrorPad` op.
-//
-// The folded size of each dimension D of the output is:
-//
-// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+// Encode audio data using the WAV file format.
 //
-// For example:
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-// # 'paddings' is [[0, 1]], [0, 1]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[ 1,  5]
-//                       [11, 28]]
-// ```
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	input: The input tensor to be folded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: The mode used in the `MirrorPad` op.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns The folded tensor.
-func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "MirrorPadGrad",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			input, paddings,
+			audio, sample_rate,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// Inputs are the logits, not probabilities.
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
-			features, labels,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// RegexReplaceAttr is an optional argument to RegexReplace.
+type RegexReplaceAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["replace_global"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Replaces the match of pattern in input with rewrite.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expresion.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns The text after applying pattern and rewrite.
+func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9338,22 +9367,25 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "RegexReplace",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			input, pattern, rewrite,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes numerical negative value element-wise.
+//
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "Neg",
 		Input: []tf.Input{
 			x,
 		},
@@ -9362,143 +9394,101 @@ func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Execute a sub graph on a remote processor.
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//	inputs: Arbitrary number of tensors with arbitrary data types
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "RemoteFusedGraphExecute",
 		Input: []tf.Input{
-			input, fft_length,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
-//
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
-//
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
 	}
+	return outputs
 }
 
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["data_format"] = value
 	}
 }
 
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
-//
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
-	}
-}
-
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
-}
-
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			size,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
 // value: The data format of the input and output data. With the
 // default format "NDHWC", the data is stored in the order of:
@@ -9506,64 +9496,51 @@ type MaxPool3DAttr func(optionalAttr)
 // Alternatively, the format could be "NCDHW", the data storage order is:
 //     [batch, in_channels, in_depth, in_height, in_width].
 // If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
 //	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
 //	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
 // out_channels]`.
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -9571,123 +9548,200 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// Arguments:
-//	input: A complex64 tensor.
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			input,
+			inputs, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// This operation creates a tensor of shape `dims` and fills it with `value`.
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-// For example:
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-// ```
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fill",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			dims, value,
+			sp_indices, sp_values, sp_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 2D fast Fourier transform.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// Arguments:
-//	input: A complex64 tensor.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// `data.shape` must start with `partitions.shape`.
+//
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			input,
+			data, partitions,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
+}
 
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
+
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update '*var' according to the adagrad scheme.
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9696,244 +9750,279 @@ func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// Return the shape of s0 op s1 with broadcast.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			y, dy,
+			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
-//
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
-		Input: []tf.Input{
-			handle,
-		},
-		Attrs: attrs,
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sub",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Returns the dimension index in the destination data format given the one in
 //
-// Inputs are the logits, not probabilities.
+// the source data format.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			features, labels,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, retain reduced dimensions with length `1`.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// ReduceJoinSeparator sets the optional separator attribute to value.
+// Update '*var' according to the AddSign update.
 //
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyPowerSign",
+		Input: []tf.Input{
+			var_, m, lr, logbase, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Joins a string Tensor across the given dimensions.
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
 //
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
+// is alive, any other request to use `MutexLock` with this mutex will wait.
 //
-// For example:
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
 //
 // ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
 // ```
 //
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	mutex: The mutex resource to lock.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "MutexLock",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			mutex,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the mean along segments of a tensor.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			x,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["use_locking"] = value
 	}
 }
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9942,121 +10031,137 @@ func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
-
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
-// If two elements are equal, the lower-index element appears first.
+// Arguments:
 //
-// If `k` varies dynamically, use `TopKV2` below.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
 //
-// Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, batch_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Transforms a Tensor into a serialized TensorProto proto.
-//
-// Arguments:
-//	tensor: A Tensor of type `T`.
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
-		Input: []tf.Input{
-			tensor,
-		},
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
+// value: If true try to recover an image from truncated input.
 // If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["try_recover_truncated"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10065,9 +10170,9 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			matrix, rhs,
+			contents, crop_window,
 		},
 		Attrs: attrs,
 	}
@@ -10075,293 +10180,273 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
-//
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
-//
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
+
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
-		Input: []tf.Input{
-			table_handle, keys, default_value,
-		},
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT3D",
-		Input: []tf.Input{
-			input,
-		},
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// This is a deprecated version of BiasAdd and will be soon removed.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			value, bias,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// # 'dims' is [3] or 'dims' is [-1]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
 //
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-// `[-rank(tensor), rank(tensor))`.
-//
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseV2",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			tensor, axis,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["capacity"] = value
 	}
 }
 
-// Returns the real part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// For example:
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			input,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
+	}
+	return values
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// Inverse fast Fourier transform.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+// Generates values in an interval.
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// For example:
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			tag, tensor,
+			start, stop, num,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
 
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// Deletes the resource specified by the handle.
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	resource: handle to the resource to delete.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10370,63 +10455,56 @@ func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			input_dataset, tag,
+			resource,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// Update '*var' according to the RMSProp algorithm.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10435,144 +10513,168 @@ func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, al
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// Returns the truth value of (x > y) element-wise.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "Greater",
 		Input: []tf.Input{
-			input, axis,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
 
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["vocab_size"] = value
+		m["seed"] = value
 	}
 }
 
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["delimiter"] = value
+		m["seed2"] = value
 	}
 }
 
-// Initializes a table from a text file.
-//
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
-//
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
-		Input: []tf.Input{
-			table_handle, filename,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Generate a single randomly distorted bounding box for an image.
 //
-// Arguments:
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10581,9 +10683,9 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
@@ -10591,106 +10693,139 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
 //
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "LRN",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			tf.OutputList(input_datasets),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PackAttr is an optional argument to Pack.
-type PackAttr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// PackAxis sets the optional axis attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: Dimension along which to pack.  Negative values wrap around, so the
-// valid range is `[-(R+1), R+1)`.
-// If not specified, defaults to 0
-func PackAxis(value int64) PackAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
-//
-// Packs the `N` tensors in `values` into a tensor with rank one higher than each
-// tensor in `values`, by packing them along the `axis` dimension.
-// Given a list of tensors of shape `(A, B, C)`;
-//
-// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-// Etc.
-//
-// For example:
-//
-// ```
-// # 'x' is [1, 4]
-// # 'y' is [2, 5]
-// # 'z' is [3, 6]
-// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-// ```
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// This is the opposite of `unpack`.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	values: Must be of same shape and type.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns The packed tensor.
-func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10699,96 +10834,122 @@ func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Pack",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// Reordering does not affect the shape of the SparseTensor.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Makes its input available to the next iteration.
+//
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			features,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fact",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
+
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["Tout"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Returns the argument of a complex number.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10797,148 +10958,184 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "Angle",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// `value  20 5  16 3  7`
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
-		},
+		Type: "VarHandleOp",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// Elementwise computes the bitwise XOR of `x` and `y`.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseXor",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Deserialize `SparseTensor` objects.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "DeserializeSparse",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
+// Update '*var' according to the RMSProp algorithm.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10947,99 +11144,77 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
-//
-//
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			input_dataset, count,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_locking"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// Applies sparse `updates` to individual values or slices within a given
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+// variable according to `indices`.
 //
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11048,43 +11223,59 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(m optionalAttr) {
+		m["squeeze_dims"] = value
+	}
+}
+
+// Removes dimensions of size 1 from the shape of a tensor.
+//
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
+//
+// For example:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	input: The `input` to squeeze.
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11093,9 +11284,9 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			input, axis,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -11103,87 +11294,126 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the adadelta scheme.
+//
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Transpose",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			x, perm,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			boxes, scores, max_output_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -11191,328 +11421,259 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// Flushes the writer's unwritten events.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
+//
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	writer: A handle to the summary writer resource.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns the created operation.
-func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FlushSummaryWriter",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			writer,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
-	}
-}
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
+// Resize `images` to `size` using area interpolation.
 //
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+// Input images can be of different types but output images are always float.
 //
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
 //
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
-//
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
-// ```
-//
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
+// 2D real-valued fast Fourier transform.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// Pads a tensor with zeros.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
-//
-// accum = accum * momentum + grad
-// var -= lr * accum
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	resource: the input resource handle.
 //
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			resource,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// Our Conv3D implements a form of cross-correlation.
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			input, filter,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -11520,157 +11681,141 @@ func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, pa
 	return op.Output(0)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Mutually reduces multiple tensors of identical type and shape.
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "CollectiveReduce",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
+// This op consumes a lock created by `MutexLock`.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
 //
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "ConsumeMutexLock",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			mutex_lock,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
-
-// EncodePngCompression sets the optional compression attribute to value.
+// Returns x / y element-wise for integer types.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
-	return func(m optionalAttr) {
-		m["compression"] = value
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
+//
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateDiv",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PNG-encode an image.
+// Restores tensors from a V2 checkpoint.
 //
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			image,
+			prefix, tensor_names, shape_and_slices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
 	}
+	return tensors
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
-//
-// one in the source data format.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -11678,107 +11823,170 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 	return op.Output(0)
 }
 
-// Returns element-wise integer closest to x.
+// Computes the maximum along segments of a tensor.
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			x,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
-
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Decode web-safe base64-encoded strings.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
+//
+// Arguments:
+//	input: Base64 strings to decode.
+//
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			indices,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
 // value: The data format of the input and output data. With the
 // default format "NDHWC", the data is stored in the order of:
@@ -11786,13 +11994,13 @@ type MaxPool3DGradGradAttr func(optionalAttr)
 // Alternatively, the format could be "NCDHW", the data storage order is:
 //     [batch, in_channels, in_depth, in_height, in_width].
 // If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes gradients of max pooling function.
 //
 // Arguments:
 //	orig_input: The original input tensor.
@@ -11803,9 +12011,7 @@ func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11814,7 +12020,7 @@ func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
 			orig_input, orig_output, grad,
 		},
@@ -11824,62 +12030,54 @@ func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
 
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -11887,114 +12085,115 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of the variable pointed to by `resource`.
 //
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+// For example:
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
+	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			true_classes,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
@@ -12002,216 +12201,311 @@ func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, n
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input, ksize, strides,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Deprecated. Use TensorArrayReadV3
+// Fast Fourier transform.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "FFT",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
-// Only useful as a placeholder for control edges.
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["Targmax"] = value
+	}
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_locking"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
 // If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+func EncodeJpegFormat(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["format"] = value
 	}
 }
 
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
 // If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["progressive"] = value
 	}
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["optimize_size"] = value
 	}
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
+		m["chroma_downsampling"] = value
 	}
 }
 
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+		m["density_unit"] = value
 	}
 }
 
-// Creates an empty hash table that uses tensors as the backing store.
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//	value_dtype: Type of the table values.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			empty_key,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -12219,59 +12513,73 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			features,
+			logits, num_samples,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
-//
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "LogicalNot",
 		Input: []tf.Input{
 			x,
 		},
@@ -12280,198 +12588,303 @@ func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// 3D real-valued fast Fourier transform.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			reader_handle,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
+
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
 //
-// Arguments:
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// An array of Tensors of given size.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Write data via Write and read via Read or Pack.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
+//
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			data, segment_ids,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
 //
 // Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
 	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
+		Type: "BoostedTreesPredict",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+	return func(m optionalAttr) {
+		m["fast"] = value
+	}
+}
+
+// Solves one or more linear least-squares problems.
+//
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
+//
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	resource: the input resource handle.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			resource,
+			matrix, rhs, l2_regularizer,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
-			input, paddings,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+		m["data_format"] = value
 	}
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -12479,92 +12892,79 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
+
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
-			serialized_sparse,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// StringJoinSeparator sets the optional separator attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// with the given separator (default is an empty separator).
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12573,260 +12973,209 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns immutable tensor from memory region.
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// The current implementation memmaps the tensor from a file.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
+}
 
-		Attrs: attrs,
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
 //
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
+// Subtracts sparse updates from the variable referenced by `resource`.
 //
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
+// This operation computes
 //
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
 //
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
 //
-// then the output will be
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
 //
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
 //
-// Graphically this is equivalent to doing
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "ResourceScatterSub",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
+// Inverse 2D fast Fourier transform.
 //
-//     inputs[2]: Tensor [["f"], ["g"]]
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
-// then the output will be
+// Arguments:
+//	input: A complex64 tensor.
 //
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
-// if hashed_output=true then the output will be
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT2D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 2D fast Fourier transform.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
+//	input: A complex64 tensor.
 //
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// ```
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12835,108 +13184,140 @@ func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ListDiff",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			x, y,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
+// Get the value of the tensor specified by its handle.
+//
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
+//
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "GetSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Returns x - y element-wise.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
@@ -12944,260 +13325,194 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes numerical negative value element-wise.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "Maximum",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes a `Summary` protocol buffer with a histogram.
-//
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
-//
-// Returns the created operation.
-func WriteHistogramSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteHistogramSummary",
-		Input: []tf.Input{
-			writer, step, tag, values,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			features, labels,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// value: The separator to use when joining.
 // If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["separator"] = value
 	}
 }
 
-// Op peeks at the values at the specified key.  If the
+// Joins a string Tensor across the given dimensions.
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
+//
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
+//
+// Arguments:
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			key, indices,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+	opspec := tf.OpSpec{
+		Type: "Cos",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
 
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+		m["epsilon"] = value
 	}
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		m["data_format"] = value
 	}
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["is_training"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
+// Gradient for batch normalization.
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13206,460 +13521,488 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			contents, crop_window,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
+// TopKSorted sets the optional sorted attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["sorted"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "TopK",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// The Hurwitz zeta function is defined as:
 //
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "Zeta",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Constructs a tensor by tiling a given tensor.
-//
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the product of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Tile",
+		Type: "Prod",
 		Input: []tf.Input{
-			input, multiples,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves the input tensors to disk.
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
+
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+	return func(m optionalAttr) {
+		m["resize_align_corners"] = value
+	}
+}
+
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// See also `SaveSlices`.
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			input, size, paddings, filter,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// tensors.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(input),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
-
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
 	}
+	return output
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// and
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-// ```
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
 //
-// then the final `SparseTensor` will be:
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
 //
-// ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-// ```
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			sparse_handles,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
+// Adds `bias` to `value`.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// This is a deprecated version of BiasAdd and will be soon removed.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
-//
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "BiasAddV1",
 		Input: []tf.Input{
-			predictions, targets, k,
+			value, bias,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Assigns a new value to a variable.
+// Reverses specific dimensions of a tensor.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
 //
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns a tensor of ones with the same shape and type as x.
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
 //
-// Arguments:
-//	x: a tensor of type T.
+// For example:
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OnesLike",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// The gradient of SparseFillEmptyRows.
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
 //
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
+// # 'dims' is [3] or 'dims' is [-1]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
 //
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "ReverseV2",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			tensor, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
+
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the real part of a complex number.
 //
-// if < 0, `scale * features` otherwise.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Selu",
+		Type: "Real",
 		Input: []tf.Input{
-			features,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -13667,51 +14010,138 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// one or more square matrices.
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
 //
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+//
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	input: Shape is `[N, M, M]`.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
+		Type: "Qr",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
-
-// SumKeepDims sets the optional keep_dims attribute to value.
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalGradientDescent",
+		Input: []tf.Input{
+			var_, alpha, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
+
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// Computes the mean of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -13724,7 +14154,7 @@ func SumKeepDims(value bool) SumAttr {
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13733,7 +14163,7 @@ func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "Mean",
 		Input: []tf.Input{
 			input, axis,
 		},
@@ -13743,105 +14173,139 @@ func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (ou
 	return op.Output(0)
 }
 
-// Delete the tensor specified by its handle in the session.
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
+//
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
+	}
+}
+
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
+	}
+}
+
+// Initializes a table from a text file.
+//
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
+//
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
 // Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			handle,
+			table_handle, filename,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// L2 Loss.
+// Real-valued fast Fourier transform.
 //
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-//     output = sum(t ** 2) / 2
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "RFFT",
 		Input: []tf.Input{
-			t,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
 
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["out_type"] = value
 	}
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
 //
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
@@ -13849,144 +14313,117 @@ func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Out
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
-
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
-	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
-	}
-}
-
-// Performs a resize and padding as a preprocess during a convolution.
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
 //
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// Reshaping does not affect the order of values in the SparseTensor.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			input_indices, input_shape, new_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// Deprecated. Use TensorArraySplitV3
 //
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "TensorArraySplitV2",
 		Input: []tf.Input{
-			resource, value,
+			handle, value, lengths, flow_in,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// PackAttr is an optional argument to Pack.
+type PackAttr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// PackAxis sets the optional axis attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+// value: Dimension along which to pack.  Negative values wrap around, so the
+// valid range is `[-(R+1), R+1)`.
+// If not specified, defaults to 0
+func PackAxis(value int64) PackAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["axis"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
 //
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
+// Packs the `N` tensors in `values` into a tensor with rank one higher than each
+// tensor in `values`, by packing them along the `axis` dimension.
+// Given a list of tensors of shape `(A, B, C)`;
 //
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
+// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
+// Etc.
 //
-// See also `RestoreSlice`.
+// For example:
+//
+// ```
+// # 'x' is [1, 4]
+// # 'y' is [2, 5]
+// # 'z' is [3, 6]
+// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+// ```
+//
+// This is the opposite of `unpack`.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	values: Must be of same shape and type.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns The packed tensor.
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "Pack",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -13994,229 +14431,196 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// Input images and output images must be quantized types.
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// Reordering does not affect the shape of the SparseTensor.
 //
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			images, size, min, max,
+			input_indices, input_values, input_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "Relu",
 		Input: []tf.Input{
-			data, segment_ids,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-//
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// Update '*var' according to the AddSign update.
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
+	return scope.AddOperation(opspec)
+}
+
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
+
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
 
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["direction"] = value
 	}
 }
 
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["dropout"] = value
 	}
 }
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+		m["seed"] = value
 	}
 }
 
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+		m["seed2"] = value
 	}
 }
 
-// Multiply matrix "a" by matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". This op is optimized for the case where at
-// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-// matrix multiply on one platform was 30% zero values in the sparse matrix.
-//
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// Backprop step of CudnnRNN.
+//
+// Compute the backprop of both data and weights in a RNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: a 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: a 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: the same shape has input_h.
+// output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14225,62 +14629,97 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "CudnnRNNBackprop",
 		Input: []tf.Input{
-			a, b,
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Computes the power of one value to another.
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
+//
+// Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			x, y,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the shape of a tensor.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Update '*var' according to the proximal adagrad scheme.
 //
-// For example:
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14289,92 +14728,98 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			input,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes fingerprints of the input strings.
-//
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
-//
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
+
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
 
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
 // If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["dropout"] = value
 	}
 }
 
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["seed"] = value
 	}
 }
 
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["seed2"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Converts CudnnRNN params from canonical form to usable form.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
 //
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14383,9 +14828,9 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "CudnnRNNCanonicalToParams",
 		Input: []tf.Input{
-			shape, rate,
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
 		},
 		Attrs: attrs,
 	}
@@ -14393,59 +14838,41 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 	return op.Output(0)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations with upper or lower triangular matrices by
-//
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
-//
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14454,48 +14881,33 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			matrix, rhs,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asinh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
 //
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			start, stop, step,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -14503,69 +14915,76 @@ func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output,
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+//
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
+//
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -14573,23 +14992,63 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
+// Concatenates tensors along one dimension.
+//
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Multiplies sparse updates into the variable referenced by `resource`.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] += updates[...]
+//     ref[indices, ...] *= updates[...]
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
+//     ref[indices[i], ...] *= updates[i, ...]
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
 //
 // Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
+// the same location, their contributions multiply.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 // <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
@@ -14601,12 +15060,12 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "ResourceScatterMul",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -14614,101 +15073,101 @@ func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Computes sigmoid of `x` element-wise.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			y, dy,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//     Updates specified rows with values in `v`.
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//     Computes `x[i, :] = v; return x`.
+//
+// Arguments:
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "InplaceUpdate",
 		Input: []tf.Input{
-			x, y,
+			x, i, v,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["epsilon"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
-	}
-}
-
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
-//
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["data_format"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
+		m["is_training"] = value
 	}
 }
 
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// Batch normalization.
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14717,39 +15176,62 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Returns the element-wise sum of a list of tensors.
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -14757,95 +15239,73 @@ func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Out
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Computes gradient of the FractionalAvgPool function.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14854,9 +15314,9 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			shape, alpha,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -14864,132 +15324,124 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// BoostedTreesEnsembleResourceHandleOpAttr is an optional argument to BoostedTreesEnsembleResourceHandleOp.
+type BoostedTreesEnsembleResourceHandleOpAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// BoostedTreesEnsembleResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpContainer(value string) BoostedTreesEnsembleResourceHandleOpAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// BoostedTreesEnsembleResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpSharedName(value string) BoostedTreesEnsembleResourceHandleOpAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// Creates a handle to a BoostedTreesEnsembleResource
+func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTreesEnsembleResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesEnsembleResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates tensors along one dimension.
 //
 // Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "Concat",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			concat_dim, tf.OutputList(values),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceGather",
-		Input: []tf.Input{
-			resource, indices,
-		},
-		Attrs: attrs,
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			handle,
+			var_, accum, lr, grad, momentum,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
@@ -15042,147 +15494,156 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
+// Returns the last element of the input list as well as a list with all but that element.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// Fails if the list is empty.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListPopBack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// Returns element-wise integer closest to x.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "Rint",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["window_size"] = value
+		m["capacity"] = value
 	}
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["min_count"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["container"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the (key, value) element with the smallest
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
+		Type: "OrderedMapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15191,9 +15652,9 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			string_tensor,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -15201,99 +15662,60 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "Acosh",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
 
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["element_shape"] = value
 	}
 }
 
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15302,9 +15724,9 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			shape,
+			size,
 		},
 		Attrs: attrs,
 	}
@@ -15312,39 +15734,63 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
-type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsPerChannelNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsPerChannelNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelAttr {
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["field_delim"] = value
 	}
 }
 
-// FakeQuantWithMinMaxVarsPerChannelNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsPerChannelNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelAttr {
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["use_quote_delim"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
-// to 'outputs' tensor of same shape as `inputs`.
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelAttr) (outputs tf.Output) {
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15353,118 +15799,68 @@ func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsPerChannel",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			inputs, min, max,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// MapClearCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+//
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["capacity"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+//
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
-//
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
-//
-// Arguments:
-//	value: The tensor to be shuffled.
-//
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
-
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// MapClearContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+func MapClearContainer(value string) MapClearAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// MapClearSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+func MapClearSharedName(value string) MapClearAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15473,161 +15869,125 @@ func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...Or
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
+		Type: "MapClear",
 
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["seed"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
-//
-// Arguments:
-//	bytes: All the elements must have the same length.
-//
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Useful special cases:
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
 
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Decompress strings.
-//
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
-//
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	bytes: A Tensor of string which is compressed.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			bytes,
+			input, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -15635,205 +15995,209 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 	return op.Output(0)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// SkipgramWindowSize sets the optional window_size attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["window_size"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["min_count"] = value
 	}
 }
 
-// A Reader that outputs the entire contents of a file as a value.
+// SkipgramSubsample sets the optional subsample attribute to value.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
+		Type: "Skipgram",
 
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
-		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			x,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			input,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			serialized,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -15841,366 +16205,344 @@ func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (outp
 	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapClear",
-
-		Attrs: attrs,
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
 //
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["field_delim"] = value
+		m["value_shape"] = value
 	}
 }
 
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
+		m["initial_num_buckets"] = value
 	}
 }
 
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["na_value"] = value
+		m["max_load_factor"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			empty_key,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
 //
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "TruncateMod",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Makes its input available to the next iteration.
+// Inverse 2D real-valued fast Fourier transform.
 //
-// Arguments:
-//	data: The tensor to be made available to the next iteration.
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NextIteration",
-		Input: []tf.Input{
-			data,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// Arguments:
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			input_dataset, count,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
 //
-// Arguments:
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Accepted values are:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			data, segment_ids,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
+// Serializes the tree ensemble to a proto.
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	tree_ensemble_handle: Handle to the tree ensemble.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Returns Stamp token of the tree ensemble resource.Serialized proto of the ensemble.
+func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "BoostedTreesSerializeEnsemble",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			tree_ensemble_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
 
-// StageClearCapacity sets the optional capacity attribute to value.
+// StageSizeCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
+func StageSizeCapacity(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// StageClearContainer sets the optional container attribute to value.
+// StageSizeContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
+func StageSizeContainer(value string) StageSizeAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StageClearSharedName sets the optional shared_name attribute to value.
+// StageSizeSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
+func StageSizeSharedName(value string) StageSizeAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16209,65 +16551,36 @@ func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageClear",
+		Type: "StageSize",
 
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
-//
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
@@ -16275,288 +16588,203 @@ func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candida
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes sigmoid of `x` element-wise.
-//
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "Softplus",
 		Input: []tf.Input{
-			x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
+// Computes exponential of x - 1 element-wise.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution.
+// Returns the number of records this Reader has produced.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			shape,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that emits the lines of one or more text files.
+//
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			x,
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
 
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["input_mode"] = value
 	}
 }
 
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["direction"] = value
 	}
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T, "S": S}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			logits, num_samples,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
@@ -16564,135 +16792,186 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
+// Computes gradients for SparseSegmentMean.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// Returns the set of files matching one or more glob patterns.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
+//
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatchingFiles",
+		Input: []tf.Input{
+			pattern,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
-//
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
+
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
-		m["progressive"] = value
+		m["dtype"] = value
 	}
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// Return histogram of values.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
+//
+// Arguments:
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramFixedWidth",
+		Input: []tf.Input{
+			values, value_range, nbins,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// Returns the truth value of (x >= y) element-wise.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "GreaterEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["y_density"] = value
+		m["data_format"] = value
 	}
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["dilations"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
-//
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			image,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -16700,47 +16979,57 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
-
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseAdd",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
+// Read an element from the TensorArray into output `value`.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	handle: The handle to a TensorArray.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -16748,116 +17037,185 @@ func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["mode"] = value
 	}
 }
 
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
-//
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
+		m["round_mode"] = value
 	}
 }
 
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-// method will give identical results to using `tf.image.resize_bilinear()`
-// with `align_corners=True`.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the truth value of (x < y) element-wise.
+//
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Less",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16866,96 +17224,101 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
-//
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
-		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
-		},
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
 //
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
-
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16964,54 +17327,9 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
-		Input: []tf.Input{
-			input, tf.OutputList(data),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
-//
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
-//
-// Arguments:
-//
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
-//
-//
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "StatelessMultinomial",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			logits, num_samples, seed,
 		},
 		Attrs: attrs,
 	}
@@ -17019,59 +17337,44 @@ func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Outpu
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
 
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["description"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// TensorSummaryLabels sets the optional labels attribute to value.
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
-
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["display_name"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			tensor,
+			resource, indices,
 		},
 		Attrs: attrs,
 	}
@@ -17079,163 +17382,119 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs a `Summary` protocol buffer with scalar values.
+// Delete the TensorArray from its resource container.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			tags, values,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// Saves the input tensors to disk.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// See also `SaveSlices`.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Save",
 		Input: []tf.Input{
-			tag, values,
+			filename, tensor_names, tf.OutputList(data),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the number of elements in the given queue.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// Arguments:
-//	handle: The handle to a queue.
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			handle,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
 
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["max_images"] = value
+		m["adjoint_a"] = value
 	}
 }
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["adjoint_b"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17244,9 +17503,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			tag, tensor,
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
@@ -17254,101 +17513,103 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
-
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 //
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+// and
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["separator"] = value
 	}
 }
 
-// Performs average pooling on the input.
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			value,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -17356,353 +17617,449 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// Returns immutable tensor from memory region.
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// The current implementation memmaps the tensor from a file.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
-
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
 //
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Writes a `Summary` protocol buffer with scalar values.
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
 //
-// The input `tag` and `value` must have the scalars.
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Tag for the summary.
-//	value: Value for the summary.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
-// Returns the created operation.
-func WriteScalarSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "WriteScalarSummary",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			writer, step, tag, value,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the matrix exponential of one or more square matrices:
-//
-// exp(A) = \sum_{n=0}^\infty A^n/n!
-//
-// The exponential is computed using a combination of the scaling and squaring
-// method and the Pade approximation. Details can be founds in:
-// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
-// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// Concatenates quantized tensors along one dimension.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.expm
-// @end_compatibility
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			input,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
-
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues `n` tuples of one or more tensors from the given queue.
+// For example, if the input is
 //
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
+// Graphically the output tensors are:
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			handle, n,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
-	}
-	return components
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
-//
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// Returns the element-wise min of two SparseTensors.
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			input,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// creates directory if not existing.
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+//
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
 // Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			filename, contents,
+			sparse_handles,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
+// MaxPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "MaxPool",
 		Input: []tf.Input{
-			input, axis,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -17710,228 +18067,164 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
+// Says whether the targets are in the top `K` predictions.
 //
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// More formally, let
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "InTopKV2",
 		Input: []tf.Input{
-			input,
+			predictions, targets, k,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Assigns a new value to a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "AssignVariableOp",
 		Input: []tf.Input{
-			gradients, features,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that contains the unique elements of `input_dataset`.
-func UniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns a tensor of ones with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "UniqueDataset",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			input_dataset,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
-
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
-	return func(m optionalAttr) {
-		m["compute_v"] = value
-	}
-}
-
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
-//
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+// The gradient of SparseFillEmptyRows.
 //
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
+//
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			input,
+			reverse_index_map, grad_values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
-		Input: []tf.Input{
-			images, scale,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise OR of `x` and `y`.
+// if < 0, `scale * features` otherwise.
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "Selu",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
 
-// MatrixSolveLsFast sets the optional fast attribute to value.
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
 // If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+func SetSizeValidateIndices(value bool) SetSizeAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
-//
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// Number of unique elements along last dimension of input `set`.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17940,9 +18233,9 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "SetSize",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			set_indices, set_values, set_shape,
 		},
 		Attrs: attrs,
 	}
@@ -17950,109 +18243,64 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
+// Computes the sign and the log of the absolute value of the determinant of
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// one or more square matrices.
 //
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	input: Shape is `[N, M, M]`.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "LogMatrixDeterminant",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
-//
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18061,490 +18309,467 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "Sum",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			input, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// Delete the tensor specified by its handle in the session.
 //
 // Arguments:
+//	handle: The handle for a tensor stored in the session state.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			data, segment_ids,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Converts one or more images from RGB to HSV.
+// L2 Loss.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+//     output = sum(t ** 2) / 2
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "L2Loss",
 		Input: []tf.Input{
-			images,
+			t,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
 
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
-//
-// value: see above.
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
 // If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
 //
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
+// Subtracts a value from the current value of a variable.
 //
-// See also `Save`.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
 // Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			resource, value,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
+//
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// Restores a tensor from checkpoint files.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
 //
-// Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+// See also `RestoreSlice`.
 //
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "Restore",
 		Input: []tf.Input{
-			set1, set2,
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
+
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
+//
+// Input images and output images must be quantized types.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+//
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
-		Input: []tf.Input{
-			basename, num_shards,
+			images, size, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// Computes the minimum along segments of a tensor.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
-	}
-}
-
-// TextLineReaderV2Container sets the optional container attribute to value.
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the lines of a file delimited by '\n'.
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
-		Attrs: attrs,
+		Type: "SegmentMin",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
 
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+		m["adaptative"] = value
 	}
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["a_is_sparse"] = value
 	}
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
-//
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". This op is optimized for the case where at
+// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+// matrix multiply on one platform was 30% zero values in the sparse matrix.
+//
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
 
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["out_type"] = value
 	}
 }
 
-// Quantizes then dequantizes a tensor.
+// Returns the shape of a tensor.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// This operation returns a 1-D integer tensor representing the shape of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18553,9 +18778,9 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "Shape",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -18563,38 +18788,106 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// IdentityReaderV2Container sets the optional container attribute to value.
+// Computes fingerprints of the input strings.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
+
+// RandomPoissonV2Seed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
+//
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18603,37 +18896,69 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
+		Type: "RandomPoissonV2",
+		Input: []tf.Input{
+			shape, rate,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.triangular_solve
+// @end_compatibility
 // If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Solves systems of linear equations with upper or lower triangular matrices by
+//
+// backsubstitution.
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18642,273 +18967,262 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
-//
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "Asinh",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
-//
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Restore a Reader to its initial clean state.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			reader_handle,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["dilations"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
-//
-// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Stops gradient computation.
 //
-// Arguments:
-//	value: The tensor to be stored.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "StopGradient",
 		Input: []tf.Input{
-			value,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
-//
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-//
-// Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+// Eagerly executes a python function to compute func(input)->output. The
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "EagerPyFunc",
 		Input: []tf.Input{
-			pattern,
+			tf.OutputList(input),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
-
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+	if scope.Err() != nil {
+		return
 	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
 }
 
-// Computes the gradient of bicubic interpolation.
+// Adds sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
+		Type: "ResourceScatterAdd",
 		Input: []tf.Input{
-			grads, original_image,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
-
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using nearest neighbor interpolation.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// More formally, let
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "InTopK",
 		Input: []tf.Input{
-			images, size,
+			predictions, targets,
 		},
 		Attrs: attrs,
 	}
@@ -18916,157 +19230,106 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// Returns (x - y)(x - y) element-wise.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// Forwards the input to the output.
+//
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "LoopCond",
 		Input: []tf.Input{
-			grads, size,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// Returns the element-wise sum of a list of tensors.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
 //
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			contents,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -19074,173 +19337,192 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Extract the shape information of a JPEG-encoded image.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	contents: 0-D. The JPEG-encoded image.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			contents,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["seed"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["seed2"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
+//
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomGamma",
+		Input: []tf.Input{
+			shape, alpha,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A queue that produces elements in first-in first-out order.
+// Computes the product along segments of a tensor.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
-		Attrs: attrs,
+		Type: "UnsortedSegmentProd",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// DecodePngChannels sets the optional channels attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// value: Number of color channels for the decoded image.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["seed"] = value
 	}
 }
 
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["seed2"] = value
 	}
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// Outputs random integers from a uniform distribution.
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19249,9 +19531,9 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			contents,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
@@ -19259,310 +19541,297 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-//     convert $src.gif -coalesce $dst.gif
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly shuffles a tensor along its first dimension.
 //
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	value: The tensor to be shuffled.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			contents,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["capacity"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
-//
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityN",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
+}
+
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	return output
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
-//
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+		Type: "OrderedMapIncompleteSize",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// Counts the number of occurrences of each value in an integer array.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "Bincount",
 		Input: []tf.Input{
-			images,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["exclusive"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["reverse"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumsum",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["exclusive"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
+// value: A `bool` (default: False).
 // If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["reverse"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
 //
-// For example,
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
 //
 // ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
 // ```
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// This is more efficient than using separate `tf.reverse` ops.
 //
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+// The `reverse` and `exclusive` kwargs can also be combined:
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19571,88 +19840,75 @@ func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
 
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["centered"] = value
+		m["Toutput"] = value
 	}
 }
 
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["normalized"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["uniform_noise"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Extracts a glimpse from the input tensor.
-//
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
-//
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19661,76 +19917,62 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			input, size, offsets,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// A container for an iterator resource.
+// Does nothing. Serves as a control trigger for scheduling.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
-
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
-//
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
-	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+		Type: "ControlTrigger",
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
-//
-// Arguments:
+// Batch normalization.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
@@ -19738,69 +19980,51 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output
 	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+// Deprecated. Use TensorArrayReadV3
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			input,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
 
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// Returns x * y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19809,65 +20033,119 @@ func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Saves tensors in V2 checkpoint format.
-//
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
+
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x + y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			x, y, min_x, max_x, min_y, max_y,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
-type StatsAggregatorHandleAttr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// StatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+//
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
+	}
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+//
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
+//
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
+//
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19876,204 +20154,224 @@ func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorHandle",
-
+		Type: "Mfcc",
+		Input: []tf.Input{
+			spectrogram, sample_rate,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+			input, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Reshapes a tensor.
+// Rolls the elements of a tensor along an axis.
 //
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
-//
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-//
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
 //
 // For example:
 //
 // ```
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
 //
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
 //
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
 //
-// # -1 can also be used to infer the shape
+// Arguments:
 //
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
-//
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
-//
-// Arguments:
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
 //
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reshape",
+		Type: "Roll",
 		Input: []tf.Input{
-			tensor, shape,
+			input, shift, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
+
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
-		Input: []tf.Input{
-			indices, values, dense_shape,
-		},
+}
+
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
+//
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
+		Type: "MapPeek",
 		Input: []tf.Input{
-			input_dataset, another_dataset,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
 }
 
-// Adds a value to the current value of a variable.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			resource, value,
+			table_handle, keys, default_value,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
+//
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
+//
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			input_dataset, tag,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -20081,80 +20379,111 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
+// Calculates gains for each feature and returns the best possible split information for the feature.
 //
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+//
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+//
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 //
 // Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
 //
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
 		Input: []tf.Input{
-			json_examples,
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
+	return func(m optionalAttr) {
+		m["compression"] = value
+	}
+}
+
+// PNG-encode an image.
 //
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			input, filter,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -20162,80 +20491,101 @@ func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64
 	return op.Output(0)
 }
 
-// Converts the given variant tensor to an iterator and stores it in the given resource.
+// Updates the table to associates keys with values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
 // Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
-			resource_handle, serialized,
+			table_handle, keys, values,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
-
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+// Returns element-wise smallest integer in not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+// Computes the number of elements in the given table.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			table_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
+
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// Arguments:
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of bilinear interpolation.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
@@ -20243,333 +20593,247 @@ func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Out
 	return op.Output(0)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
+// Outputs all keys and values in the table.
 //
 // Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
+//	table_handle: Handle to the table.
 //
 //
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseBatchDataset",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
+			table_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// Replaces the contents of the table with the specified keys and values.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
+		Type: "LookupTableImportV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			table_handle, keys, values,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
 
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["capacity"] = value
 	}
 }
 
-// var: Should be from a Variable().
-//
-// Arguments:
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
 //
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+			indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return key, values
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
+
+// HashTableV2Container sets the optional container attribute to value.
 //
-// If `pos` is negative or specifies a character index larger than any of the input
-// strings, then an `InvalidArgumentError` is thrown.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
-//
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
-//
-// Broadcasting `input` onto `pos` and `len`:
-//
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
-//
-// output = [b'hir', b'ee', b'n']
-// ```
-//
-// Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
-//
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Substr",
-		Input: []tf.Input{
-			input, pos, len,
-		},
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a Dataset that returns pseudorandom numbers.
+// Creates a non-initialized hash table.
 //
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RandomDataset",
-		Input: []tf.Input{
-			seed, seed2,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that shuffles and repeats elements from `input_dataset`
-//
-// pseudorandomly.
-//
-// Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//	count: A scalar representing the number of times the underlying dataset
-// should be repeated. The default is `-1`, which results in infinite repetition.
-//
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
 //
-func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ShuffleAndRepeatDataset",
-		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2, count,
-		},
-		Attrs: attrs,
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
-//
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
-//
-// Arguments:
-//
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
-//
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "CacheDataset",
-		Input: []tf.Input{
-			input_dataset, filename,
-		},
-		Attrs: attrs,
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// PlaceholderAttr is an optional argument to Placeholder.
-type PlaceholderAttr func(optionalAttr)
-
-// PlaceholderShape sets the optional shape attribute to value.
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
-// shape is unconstrained.
-// If not specified, defaults to <unknown_rank:true >
-func PlaceholderShape(value tf.Shape) PlaceholderAttr {
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["shape"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// A placeholder op for a value that will be fed into the computation.
+// Creates an empty hash table.
 //
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	dtype: The type of elements in the tensor.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Placeholder",
+		Type: "MutableHashTableV2",
 
 		Attrs: attrs,
 	}
@@ -20577,286 +20841,254 @@ func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (
 	return op.Output(0)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
-//
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
-//
-//
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SqlDataset",
-		Input: []tf.Input{
-			driver_name, data_source_name, query,
-		},
-		Attrs: attrs,
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
+
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more binary files.
+// Dequantize the 'input' tensor into a float Tensor.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
-		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Slice a `SparseTensor` based on the `start` and `size`.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// For example, if the input is
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+// ```
+// if T == qint8, in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// Graphically the output tensors are:
+// *MIN_COMBINED Mode Example*
 //
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			input, min_range, max_range,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Concatenates quantized tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+// Flips all bits elementwise.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
+		Type: "Invert",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// Inverse 3D fast Fourier transform.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+//	input: A complex64 tensor.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more TFRecord files.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-// Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
+		Type: "AdjustContrast",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			images, contrast_factor, min_value, max_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
+// Table initializer that takes two tensors for keys and values respectively.
 //
 // Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InitializeTableV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
 //
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// PrintFirstN sets the optional first_n attribute to value.
 //
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["first_n"] = value
+	}
+}
+
+// PrintSummarize sets the optional summarize attribute to value.
 //
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
 //
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "Print",
 		Input: []tf.Input{
-			input, crops,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
@@ -20864,482 +21096,480 @@ func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int
 	return op.Output(0)
 }
 
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
-//
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
-// Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+// Arguments:
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MakeIterator",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			dataset, iterator,
+			tag, tensor, serialized_summary_metadata,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
-// Contrast is adjusted independently for each channel of each image.
+// Arguments:
 //
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
+		Type: "PrefetchDataset",
 		Input: []tf.Input{
-			images, contrast_factor,
+			input_dataset, buffer_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
-		Input: []tf.Input{
-			iterator,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
+
+// TensorSummaryDescription sets the optional description attribute to value.
+//
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["description"] = value
 	}
-	return components
 }
 
-// Outputs the single element from the given dataset.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["labels"] = value
+	}
+}
+
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			dataset,
+			tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the tanh of `x` wrt its input.
+//
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "ResourceScatterMax",
 		Input: []tf.Input{
-			resource_handle,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
-
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns shape of tensors.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
+//
+// Arguments:
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
+//
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ShapeN",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			tags, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
-		return
-	}
-	return output
-}
-
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
-
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
-	}
+	return op.Output(0)
 }
 
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
-	}
-}
-
-// Converts the given string representing a handle to an iterator to a resource.
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			string_handle,
+			tag, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// Computes the number of elements in the given queue.
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan2",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			y, x,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
+
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
+//
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_images"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
+//
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["bad_color"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Gather slices from `params` axis `axis` according to `indices`.
+// Outputs a `Summary` protocol buffer with images.
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
 //
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
 //
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GatherV2",
+		Type: "ImageSummary",
 		Input: []tf.Input{
-			params, indices, axis,
+			tag, tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
+
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			resource_handle,
+			tag, tensor, sample_rate,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
-
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// FIFOQueueV2Container sets the optional container attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["data_format"] = value
 	}
 }
 
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// Performs average pooling on the input.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements in first-in first-out order.
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Produces a summary of any statistics recorded by the given statistics manager.
-func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatsAggregatorSummary",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			iterator,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
+// Merges summaries.
 //
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			a, b,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
-//
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
 //	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+//
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			input, paddings, filter,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -21347,73 +21577,66 @@ func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf
 	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the input.
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+//
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -21421,128 +21644,81 @@ func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output,
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
 //
-// For example, if each `indices[m]` is scalar or vector, we have
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListPushBack",
+		Input: []tf.Input{
+			input_handle, tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of tensors in the input tensor list.
 //
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
-//
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
-//
-// For example:
-//
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
-//
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
-//
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "TensorListLength",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// The shape of the elements of the given list, as a tensor.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "Equal",
+		Type: "TensorListElementShape",
 		Input: []tf.Input{
-			x, y,
+			input_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
-
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// Deprecated. Use TensorArrayGatherV3
+// Returns the item in the list with the given index.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
+//
+//
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
+		Type: "TensorListGetItem",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			input_handle, index,
 		},
 		Attrs: attrs,
 	}
@@ -21550,464 +21726,335 @@ func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
+// Computes the matrix exponential of one or more square matrices:
 //
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
+// exp(A) = \sum_{n=0}^\infty A^n/n!
 //
-// For example:
+// The exponential is computed using a combination of the scaling and squaring
+// method and the Pade approximation. Details can be founds in:
+// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// Returns Shape is `[..., M, M]`.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.expm
+// @end_compatibility
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Computes the matrix logarithm of one or more square matrices:
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+//
+// log(exp(A)) = A
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InvGrad",
+		Type: "MatrixLogarithm",
 		Input: []tf.Input{
-			y, dy,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StridedSliceAttr is an optional argument to StridedSlice.
-type StridedSliceAttr func(optionalAttr)
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
 
-// StridedSliceBeginMask sets the optional begin_mask attribute to value.
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: a bitmask where a bit i being 1 means to ignore the begin
-// value and instead use the largest interval possible. At runtime
-// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
-// `[-1, n-1]` if `stride[i] < 0`
-// If not specified, defaults to 0
-func StridedSliceBeginMask(value int64) StridedSliceAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	return func(m optionalAttr) {
-		m["begin_mask"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// StridedSliceEndMask sets the optional end_mask attribute to value.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// value: analogous to `begin_mask`
-// If not specified, defaults to 0
-func StridedSliceEndMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// position is actually an ellipsis. One bit at most can be 1.
-// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
-// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
-// implicitly creates as many range specifications as necessary to fully
-// specify the sliced range for every dimension. For example for a 4-dimensional
-// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
-// If not specified, defaults to 0
-func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// specification creates a new shape 1 dimension. For example
-// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
-// If not specified, defaults to 0
-func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
 //
-// value: a bitmask where bit `i` implies that the `i`th
-// specification should shrink the dimensionality. begin and end
-// must imply a slice of size 1 in the dimension. For example in
-// python one might do `foo[:, 3, :]` which would result in
-// `shrink_axis_mask` being 2.
-// If not specified, defaults to 0
-func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueDequeueUpToV2",
+		Input: []tf.Input{
+			handle, n,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
 	}
+	return components
 }
 
-// Return a strided slice from `input`.
-//
-// Note, most python users will want to use the Python `Tensor.__getitem__`
-// or `Variable.__getitem__` rather than this op directly.
-//
-// The goal of this op is to produce a new tensor with a subset of
-// the elements from the `n` dimensional `input` tensor. The subset is chosen using
-// a sequence of `m` sparse range specifications encoded into the arguments
-// of this function. Note, in some cases
-// `m` could be equal to `n`, but this need not be the case. Each
-// range specification entry can be one of the following:
-//
-// - An ellipsis (...). Ellipses are used to imply zero or more
-//   dimensions of full-dimension selection and are produced using
-//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
-//
-// - A new axis. This is used to insert a new shape=1 dimension and is
-//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
-//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
-//
-//
-// - A range `begin:end:stride`. This is used to specify how much to choose from
-//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-//   which represents the index of the first value to select while `end` represents
-//   the index of the last value to select. The number of values selected in each
-//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
-//   the second to last. `begin_mask` controls whether to replace the explicitly
-//   given `begin` with an implicit effective value of `0` if `stride > 0` and
-//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-//   required to create the largest open interval. For example, given a shape
-//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-//   first dimension of a tensor while dropping the last two (in the original
-//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-//
-// - A single index. This is used to keep only elements that have a given
-//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
-//   `shrink_axis_mask`.
-//
-// Each conceptual range specification is encoded in the op's argument. This
-// encoding is best understand by considering a non-trivial example. In
-// particular,
-// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
-//
-// ```
-// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-// end = [2, 4, x, x, -3, x]
-// strides = [1, 1, x, x, -1, 1]
-// begin_mask = 1<<4 | 1 << 5 = 48
-// end_mask = 1<<5 = 32
-// ellipsis_mask = 1<<3 = 8
-// new_axis_mask = 1<<2 4
-// shrink_axis_mask = 1<<0
-// ```
-//
-// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-// the slice becomes (2, 1, 5, 5, 2, 5).
-// Let us walk step by step through each argument specification.
-//
-// 1.  The first argument in the example slice is turned into `begin = 1` and
-// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-// also set the appropriate bit in `shrink_axis_mask`.
-//
-// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-// zero bits contributed.
-//
-// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-// dimension in the final shape. Dummy values are contributed to begin,
-// end and stride, while the new_axis_mask bit is set.
+// Computes the Cholesky decomposition of one or more square matrices.
 //
-// 4. `...` grab the full ranges from as many dimensions as needed to
-// fully specify a slice for every dimension of the input shape.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-// with a dimension that has shape `s` is converted to a positive index
-// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-// is done internally so begin, end and strides receive x, -3, and -1.
-// The appropriate begin_mask bit is set to indicate the start range is the
-// full range (ignoring the x).
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
 //
-// 6. `:` indicates that the entire contents of the corresponding dimension
-// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-// `end_mask` are also set.
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
 //
-// *Requirements*:
-//   `0 != strides[i] for i in [0, m)`
-//   `ellipsis_mask must be a power of two (only one ellipsis)`
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-//	begin: `begin[k]` specifies the offset into the `k`th range specification.
-// The exact dimension this corresponds to will be determined by context.
-// Out-of-bounds values will be silently clamped. If the `k`th bit of
-// `begin_mask` then `begin[k]` is ignored and the full range of the
-// appropriate dimension is used instead. Negative values causes indexing
-// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
-//	end: `end[i]` is like `begin` with the exception that `end_mask` is
-// used to determine full ranges.
-//	strides: `strides[i]` specifies the increment in the `i`th specification
-// after extracting a given element. Negative indices will reverse
-// the original order. Out or range values are
-// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
-func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StridedSlice",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			input, begin, end, strides,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
-
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// value: The type of each component in a value.
-// If not specified, defaults to <>
+// creates directory if not existing.
 //
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["component_types"] = value
-	}
-}
-
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+// Arguments:
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// PriorityQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "WriteFile",
+		Input: []tf.Input{
+			filename, contents,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
+
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// A queue that produces elements sorted by the first component value.
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
+		Type: "All",
+		Input: []tf.Input{
+			input, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
-
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// UnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// UnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op is similar to a lightweight Dequeue.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
 //
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unstage",
-
-		Attrs: attrs,
+		Type: "SelfAdjointEig",
+		Input: []tf.Input{
+			input,
+		},
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softplus gradients for a softplus operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
+//
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "SoftplusGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+//
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
+//
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
 // Arguments:
+//	input: `Tensor` input of shape `[N, N]`.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22016,70 +22063,95 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
-			input, dimension,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
+// Adjust the saturation of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
+//
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
+	opspec := tf.OpSpec{
+		Type: "AdjustSaturation",
+		Input: []tf.Input{
+			images, scale,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// SvdComputeUv sets the optional compute_uv attribute to value.
+//
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Computes the singular value decompositions of one or more matrices.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22088,44 +22160,50 @@ func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, en
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "Svd",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
 
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
 // Note: This option is not supported yet.
 // If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
 	return func(m optionalAttr) {
 		m["timeout_ms"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
+// Enqueues zero or more tuples of one or more tensors in the given queue.
+//
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
 // The components input has k elements, which correspond to the components of
 // tuples stored in the given queue.
 //
 // N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
 //	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
 //
 // Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22134,7 +22212,7 @@ func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, opti
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
+		Type: "QueueEnqueueManyV2",
 		Input: []tf.Input{
 			handle, tf.OutputList(components),
 		},
@@ -22143,98 +22221,114 @@ func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, opti
 	return scope.AddOperation(opspec)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
-
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Computes the product along segments of a tensor.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentProd",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
+// Converts one or more images from RGB to HSV.
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			handle, n,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "NoOp",
 	}
-	return components
+	return scope.AddOperation(opspec)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22243,360 +22337,524 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			input,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArrayCloseV3
+// Saves input tensors slices to disk.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
 //
 // Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			handle,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// Applies set operation along last dimension of 2 `Tensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Generate a sharded filename. The filename is printf formatted as
+//
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			filename,
+			basename, shard, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// BatchToSpace for N-D tensors of type T.
+//
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
 // Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Forwards the value of an available tensor from `inputs` to `output`.
+// This operation is equivalent to the following steps:
 //
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
 //
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
 //
-// Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input, block_shape, crops,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+// UnpackAxis sets the optional axis attribute to value.
 //
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
+		m["axis"] = value
 	}
 }
 
-// Closes the given queue.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
+//
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "Unpack",
 		Input: []tf.Input{
-			handle,
+			value,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
+	}
+	return output
 }
 
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
+//
+// Arguments:
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
+//
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "Atanh",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			x,
+			resource,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns true if queue is closed.
-//
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+// Delete the stack from its resource container.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	handle: The handle to a stack.
+//
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "StackCloseV2",
 		Input: []tf.Input{
 			handle,
 		},
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilespec",
+		Input: []tf.Input{
+			basename, num_shards,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the batched diagonal part of a batched tensor.
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
+
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// This operation returns a tensor with the `diagonal` part
-// of the batched `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
-//
-// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
-//
-// The input must be at least a matrix.
-//
-// For example:
-//
-// ```
-// # 'input' is [[[1, 0, 0, 0]
-//                [0, 2, 0, 0]
-//                [0, 0, 3, 0]
-//                [0, 0, 0, 4]],
-//               [[5, 0, 0, 0]
-//                [0, 6, 0, 0]
-//                [0, 0, 7, 0]
-//                [0, 0, 0, 8]]]
-//
-// and input.shape = (2, 4, 4)
-//
-// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// which has shape (2, 4)
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor where `k >= 2`.
-//
-// Returns The extracted diagonal(s) having shape
-// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
-func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDiagPart",
-		Input: []tf.Input{
-			input,
-		},
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["skip_header_lines"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Flushes and closes the summary writer.
-//
-// Also removes it from the resource manager. To reopen, use another
-// CreateSummaryFileWriter op.
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	writer: A handle to the summary writer resource.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns the created operation.
-func CloseSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CloseSummaryWriter",
-		Input: []tf.Input{
-			writer,
-		},
+		Type: "TextLineReaderV2",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
 
-// StackV2StackName sets the optional stack_name attribute to value.
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
 	return func(m optionalAttr) {
-		m["stack_name"] = value
+		m["max_rows_in_memory"] = value
 	}
 }
 
-// A stack that produces elements in first-in last-out order.
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+//
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
+//
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
+//
+// The remappings are 1-D tensors with the following properties:
+//
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
+//
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
+//
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			max_size,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
 		Attrs: attrs,
 	}
@@ -22604,104 +22862,83 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
 
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["container"] = value
 	}
 }
 
-// OrderedMapStageContainer sets the optional container attribute to value.
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["shared_name"] = value
 	}
 }
 
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
 // If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["compression_type"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
+		Type: "TFRecordReaderV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["swap_memory"] = value
+		m["signed_input"] = value
 	}
 }
 
-// Push an element onto the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22710,9 +22947,9 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			handle, elem,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -22720,63 +22957,38 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
-
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
 
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// A Reader that outputs the queued work as both the key and value.
 //
-// Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22785,268 +22997,215 @@ func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
-		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
-//
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
-//
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
+
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			handle, flow_in,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-//
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
-//
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
-//
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
-// ```
-//
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
 // Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			input, threshold,
+			reader_handle, queue_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Push an element onto the tensor_array.
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			reader_handle, queue_handle, num_records,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
-//
-// `indices` must be a vector, its length must match the first dim of `value`.
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
-		Input: []tf.Input{
-			handle, indices, value, flow_in,
-		},
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
-
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Concat the elements from the TensorArray into value `value`.
-//
-// Takes `T` elements of shapes
-//
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
-//
-// and concatenates them into a Tensor of shape:
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+// Update '*var' according to the Adam algorithm.
 //
-// All elements must have the same shape (excepting the first dimension).
+// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
+		Type: "ResourceApplyAdam",
 		Input: []tf.Input{
-			handle, flow_in,
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// Store the input tensor in the state of the current session.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
+
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
-//
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// Computes the gradient of bicubic interpolation.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23055,9 +23214,9 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "ResizeBicubicGrad",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
@@ -23065,105 +23224,113 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-//
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-//
-// For example:
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			diagonal,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Split the data from the input value into TensorArray elements.
-//
-// Assuming that `lengths` takes on values
-//
-//   ```(n0, n1, ..., n(T-1))```
-//
-// and that `value` has shape
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
-//
-// this splits values into a TensorArray with T tensors.
-//
-// TensorArray index t will be the subtensor of values with starting position
-//
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
-//
-// and having size
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-//   ```nt x d0 x d1 x ...```
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			grads, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
 
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["output_type"] = value
 	}
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// Extract the shape information of a JPEG-encoded image.
+//
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	contents: 0-D. The JPEG-encoded image.
+//
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23172,9 +23339,9 @@ func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -23182,97 +23349,72 @@ func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Ou
 	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
 
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
 // value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shapes"] = value
 	}
 }
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
 // value: The upper bound on the number of elements in this queue.
 // Negative numbers mean no limit.
 // If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this queue is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
 // value: If non-empty, this queue will be shared under the given name
 // across multiple sessions.
 // If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A queue that randomizes the order of elements.
+// A queue that produces elements in first-in first-out order.
+//
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
 //
 // Arguments:
 //	component_types: The type of each component in a value.
 //
 // Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23281,7 +23423,7 @@ func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
+		Type: "PaddingFIFOQueueV2",
 
 		Attrs: attrs,
 	}
@@ -23289,291 +23431,318 @@ func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
-//
-// Parts of the bounding box may fall outside the image.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
-//
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
-		Input: []tf.Input{
-			images, boxes,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
 
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// value: Number of color channels for the decoded image.
 // If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+func DecodePngChannels(value int64) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["channels"] = value
 	}
 }
 
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dtype"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Accepted values are:
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
+//
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			true_classes,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes gradients for the scaled exponential linear (Selu) operation.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+//
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
+//	contents: 0-D.  The GIF-encoded image.
 //
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SeluGrad",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			gradients, outputs,
+			contents,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the current size of the TensorArray.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
+		Type: "SigmoidGrad",
 		Input: []tf.Input{
-			handle, flow_in,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// Convert one or more images from HSV to RGB.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
+//
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HSVToRGB",
+		Input: []tf.Input{
+			images,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+// Gets the next output from the given iterator.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
+		Type: "IteratorGetNextSync",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			iterator,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
-// AsStringPrecision sets the optional precision attribute to value.
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
 //
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["precision"] = value
+		m["seed"] = value
 	}
 }
 
-// AsStringScientific sets the optional scientific attribute to value.
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["scientific"] = value
+		m["seed2"] = value
 	}
 }
 
-// AsStringShortest sets the optional shortest attribute to value.
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["shortest"] = value
+		m["aspect_ratio_range"] = value
 	}
 }
 
-// AsStringWidth sets the optional width attribute to value.
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["width"] = value
+		m["area_range"] = value
 	}
 }
 
-// AsStringFill sets the optional fill attribute to value.
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["fill"] = value
+		m["max_attempts"] = value
 	}
 }
 
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23582,326 +23751,302 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			input,
+			image_size, bounding_boxes, min_object_covered,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deprecated. Use TensorArrayScatterV3
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["centered"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
-		Input: []tf.Input{
-			handle, indices, value, flow_in,
-		},
+}
+
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
+//
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Applies sparse addition to `input` using individual values or slices
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
 //
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
-// ```
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
+	}
+}
+
+// Extracts a glimpse from the input tensor.
 //
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
 //
-// The resulting value `output` would look like this:
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// The argument `normalized` and `centered` controls how the windows are built:
 //
-// See @{tf.scatter_nd} for more details about how to make updates to slices.
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
 // Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			input, indices, updates,
+			input, size, offsets,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// A container for an iterator resource.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "Iterator",
 
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
+
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["deterministic"] = value
+		m["method"] = value
 	}
 }
 
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
+//
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "CropAndResizeGradImage",
+		Input: []tf.Input{
+			grads, boxes, box_ind, image_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
+
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["reshuffle_each_iteration"] = value
 	}
 }
 
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// Arguments:
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			value,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySizeV3
+// 3D fast Fourier transform.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
+		Type: "FFT3D",
 		Input: []tf.Input{
-			handle, flow_in,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
-
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// Conv2DDilations sets the optional dilations attribute to value.
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["method"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
-//
-// In detail, with the default NHWC format,
-//
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			input, filter,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
@@ -23909,130 +24054,55 @@ func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, pa
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs.
-type FakeQuantWithMinMaxArgsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
+// Saves tensors in V2 checkpoint format.
 //
-// Attributes `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
-// Quantization is called fake since the output is still in floating point.
-func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgs",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			inputs,
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StageAttr is an optional argument to Stage.
-type StageAttr func(optionalAttr)
-
-// StageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageCapacity(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
 	}
+	return scope.AddOperation(opspec)
 }
 
-// StageMemoryLimit sets the optional memory_limit attribute to value.
-//
-// value: The maximum number of bytes allowed for Tensors in the Staging Area.
-// If > 0, inserts will block until sufficient space is available.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageMemoryLimit(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
+// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
+type StatsAggregatorHandleAttr func(optionalAttr)
 
-// StageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
+// StatsAggregatorHandleContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func StageContainer(value string) StageAttr {
+func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
+// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func StageSharedName(value string) StageAttr {
+func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Stage values similar to a lightweight Enqueue.
-//
-// The basic functionality of this Op is similar to a queue with many
-// fewer capabilities and options.  This Op is optimized for performance.
-//
-// Arguments:
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-// Returns the created operation.
-func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
+// Creates a statistics manager resource.
+func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24041,268 +24111,513 @@ func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Opera
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Stage",
-		Input: []tf.Input{
-			tf.OutputList(values),
-		},
+		Type: "StatsAggregatorHandle",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StagePeekAttr is an optional argument to StagePeek.
-type StagePeekAttr func(optionalAttr)
-
-// StagePeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// REQUIRES: value >= 0
-func StagePeekCapacity(value int64) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// REQUIRES: value >= 0
-func StagePeekMemoryLimit(value int64) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StagePeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StagePeekContainer(value string) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppressionV2",
+		Input: []tf.Input{
+			boxes, scores, max_output_size, iou_threshold,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StagePeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StagePeekSharedName(value string) StagePeekAttr {
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Op peeks at the values at the specified index.  If the
+// The op serializes protobuf messages provided in the input tensors.
 //
-// underlying container does not contain sufficient elements
-// this op will block until it does.   This Op is optimized for
-// performance.
-func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
+//
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
+//
+// Arguments:
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
+//
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StagePeek",
+		Type: "EncodeProto",
 		Input: []tf.Input{
-			index,
+			sizes, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a TensorArray for storing the gradients of values in the given handle.
+//
+// If the given TensorArray gradient already exists, returns a reference to it.
+//
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
+//
+// **A note about the input flow_in:**
+//
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("StagePeek", err)
-		return
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV3",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
 
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["message_format"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+//
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
+	}
+}
+
+// The op extracts fields from a serialized protocol buffers message into tensors.
+//
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `format` attribute.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names.
+//	output_types: List of TF types to use for the respective field in field_names.
+//
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "DecodeProtoV2",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			bytes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	sizes = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
+		return
+	}
+	return sizes, values
 }
 
-// DepthToSpaceAttr is an optional argument to DepthToSpace.
-type DepthToSpaceAttr func(optionalAttr)
-
-// DepthToSpaceDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorSliceDataset",
+		Input: []tf.Input{
+			indices, values, dense_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DepthToSpace for tensors of type T.
-//
-// Rearranges data from depth into blocks of spatial data.
-// This is the reverse transformation of SpaceToDepth. More specifically,
-// this op outputs a copy of the input tensor where values from the `depth`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions.
-// The attr `block_size` indicates the input block size and how the data is moved.
+// Returns x / y element-wise for real types.
 //
-//   * Chunks of data of size `block_size * block_size` from depth are rearranged
-//     into non-overlapping blocks of size `block_size x block_size`
-//   * The width the output tensor is `input_depth * block_size`, whereas the
-//     height is `input_height * block_size`.
-//   * The Y, X coordinates within each block of the output image are determined
-//     by the high order component of the input channel index.
-//   * The depth of the input tensor must be divisible by
-//     `block_size * block_size`.
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Adds v into specified rows of x.
 //
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-//                         within the input image, bX, bY means coordinates
-//                         within the output block, oC means output channels).
-//      The output would be the input transposed to the following layout:
-//      n,iY,bY,iX,bX,oC
+//     Computes y = x; y[i, :] += v; return y.
 //
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-// block_size = 2:
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceAdd",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a Reader to its initial clean state.
 //
-// ```
-// x = [[[[1, 2, 3, 4]]]]
+// Arguments:
+//	reader_handle: Handle to a Reader.
 //
-// ```
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
 //
-// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
 //
-// ```
-//    [[[[1], [2]],
-//      [[3], [4]]]]
-// ```
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
 //
-// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-// the corresponding output will have 2x2 elements and will have a depth of
-// 1 channel (1 = `4 / (block_size * block_size)`).
-// The output element shape is `[2, 2, 1]`.
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
 //
-// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
 //
-// ```
-// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
 //
-// This operation, for block size of 2, will return the following tensor of shape
-// `[1, 2, 2, 3]`
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
 //
 // ```
-//    [[[[1, 2, 3], [4, 5, 6]],
-//      [[7, 8, 9], [10, 11, 12]]]]
-//
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
 // ```
 //
-// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+// then call this op with arguments:
 //
 // ```
-// x =  [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
 // ```
 //
-// the operator will return the following tensor of shape `[1 4 4 1]`:
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
 //
-// ```
-// x = [[[ [1],   [2],  [5],  [6]],
-//       [ [3],   [4],  [7],  [8]],
-//       [ [9],  [10], [13],  [14]],
-//       [ [11], [12], [15],  [16]]]]
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
 //
-// ```
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
 //
 // Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
 //
-//	block_size: The size of the spatial block, same as in Space2Depth.
-func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthToSpace",
+		Type: "Rpc",
 		Input: []tf.Input{
-			input,
+			address, method, request,
 		},
 		Attrs: attrs,
 	}
@@ -24310,54 +24625,56 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...D
 	return op.Output(0)
 }
 
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
 
-// MapStageCapacity sets the optional capacity attribute to value.
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
 //
 // value: Maximum number of elements in the Staging Area. If > 0, inserts
 // on the container will block when the capacity is reached.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// MapStageContainer sets the optional container attribute to value.
+// OrderedMapStageContainer sets the optional container attribute to value.
 //
 // value: If non-empty, this queue is placed in the given container. Otherwise,
 // a default container is used.
 // If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapStageSharedName sets the optional shared_name attribute to value.
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
 //
 // value: It is necessary to match this name to the matching Unstage Op.
 // If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a hashtable.
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
 //
 // Arguments:
 //	key: int64
@@ -24367,7 +24684,7 @@ func MapStageSharedName(value string) MapStageAttr {
 //
 //
 // Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24376,7 +24693,7 @@ func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapStage",
+		Type: "OrderedMapStage",
 		Input: []tf.Input{
 			key, indices, tf.OutputList(values),
 		},
@@ -24385,75 +24702,100 @@ func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output
 	return scope.AddOperation(opspec)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
 
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["swap_memory"] = value
 	}
 }
 
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Push an element onto the stack.
 //
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MapUnstage",
+		Type: "ConcatenateDataset",
 		Input: []tf.Input{
-			key, indices,
+			input_dataset, another_dataset,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds a value to the current value of a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
+	opspec := tf.OpSpec{
+		Type: "AssignAddVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	return values
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "LatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // MapSizeAttr is an optional argument to MapSize.
@@ -24513,408 +24855,414 @@ func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size
 	return op.Output(0)
 }
 
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
-
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
 //
-// REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
+//
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
+//
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
-
+		Type: "Dilation2D",
+		Input: []tf.Input{
+			input, filter,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
-type OrderedMapUnstageAttr func(optionalAttr)
-
-// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Converts the given variant tensor to an iterator and stores it in the given resource.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Returns the created operation.
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "DeserializeIterator",
+		Input: []tf.Input{
+			resource_handle, serialized,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
+
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstage",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			key, indices,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstage", err)
-		return
-	}
-	return values
+	return op.Output(0), op.Output(1)
 }
 
-// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
-type OrderedMapSizeAttr func(optionalAttr)
-
-// OrderedMapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
-// REQUIRES: value >= 0
-func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Arguments:
 //
-// REQUIRES: value >= 0
-func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapSize",
-
+		Type: "PaddedBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
-
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+// Creates a dataset that batches input elements into a SparseTensor.
 //
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
-	}
-}
-
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
+//
+func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "DenseToSparseBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, row_shape,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+// Deprecated. Use TensorArrayGradV3
 //
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
-// If not specified, defaults to false
-func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ignore_longer_outputs_than_inputs"] = value
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV2",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+// Return substrings from `Tensor` of strings.
 //
-// the gradient.  This class performs the softmax operation for you, so inputs
-// should be e.g. linear projections of outputs by an LSTM.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// If `pos` is negative or specifies a character index larger than any of the input
+// strings, then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
-// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-// `(batch b, time t)`.
-//	labels_values: The values (labels) associated with the given batch and time.
-//	sequence_length: A vector containing sequence lengths (batch).
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
-// `(max_time x batch_size x num_classes)`.
-func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CTCLoss",
+		Type: "Substr",
 		Input: []tf.Input{
-			inputs, labels_indices, labels_values, sequence_length,
+			input, pos, len,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
-type CTCGreedyDecoderAttr func(optionalAttr)
-
-// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If True, merge repeated classes in output.
-// If not specified, defaults to false
-func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
+	return op.Output(0)
 }
 
-// Performs greedy decoding on the logits given in inputs.
-//
-// A note about the attribute merge_repeated: if enabled, when
-// consecutive logits' maximum indices are the same, only the first of
-// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-// becomes "A B B" if merge_repeated = True and "A B B B B" if
-// merge_repeated = False.
-//
-// Regardless of the value of merge_repeated, if the maximum index of a given
-// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-// element is emitted.
+// Creates a Dataset that returns pseudorandom numbers.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
-// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
-// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
-// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
-// log-probabilities.
-func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
+//
+func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "CTCGreedyDecoder",
+		Type: "RandomDataset",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			seed, seed2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Forwards `data` to the output port determined by `pred`.
-//
-// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-// the data goes to `output_false`.
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
-// See also `RefSwitch` and `Merge`.
+// pseudorandomly.
 //
 // Arguments:
-//	data: The tensor to be forwarded to the appropriate output.
-//	pred: A scalar that specifies which output port will receive data.
 //
-// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
-func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Switch",
+		Type: "ShuffleAndRepeatDataset",
 		Input: []tf.Input{
-			data, pred,
+			input_dataset, buffer_size, seed, seed2, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Add all input tensors element wise.
+// Creates a dataset that caches elements from `input_dataset`.
+//
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
 //
 // Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+//
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input_dataset, filename,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
-
-// EnterIsConstant sets the optional is_constant attribute to value.
-//
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
-	return func(m optionalAttr) {
-		m["is_constant"] = value
-	}
-}
-
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
-//
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
-	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
-	}
-}
-
-// Creates or finds a child frame, and makes `data` available to the child frame.
-//
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+// Creates a dataset that executes a SQL query and emits rows of the result set.
 //
 // Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+//
+func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "SqlDataset",
 		Input: []tf.Input{
-			data,
+			driver_name, data_source_name, query,
 		},
 		Attrs: attrs,
 	}
@@ -24922,308 +25270,392 @@ func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAtt
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// Creates a dataset that emits the records from one or more binary files.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			reader_handle,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Exits the current frame to its parent frame.
+// Gradients for batch normalization.
 //
-// Exit makes its input `data` available to the parent frame.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			data,
+			t, m, v, gamma, backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Returns a copy of the input tensor.
-func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
+// Creates a dataset that emits the records from one or more TFRecord files.
+//
+// Arguments:
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Snapshot",
+		Type: "TFRecordDataset",
 		Input: []tf.Input{
-			input,
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Scatter `updates` into a new (initially zero) tensor according to `indices`.
+// BatchToSpace for 4-D tensors of type T.
 //
-// Creates a new tensor by applying sparse `updates` to individual
-// values or slices within a zero tensor of the given `shape` according to
-// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-// extracts values or slices from a given tensor.
+// This is a legacy version of the more general BatchToSpaceND.
 //
-// **WARNING**: The order in which updates are applied is nondeterministic, so the
-// output will be nondeterministic if `indices` contains duplicates.
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-//     indices.shape[-1] <= shape.rank
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
 //
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
 //
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
 //
-// The simplest form of scatter is to insert individual elements in a tensor by
-// index. For example, say we want to insert 4 scattered elements in a rank-1
-// tensor with 8 elements.
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
-// </div>
+// The attr `block_size` must be greater than one. It indicates the block size.
 //
-// In Python, this scatter operation would look like this:
+// Some examples:
 //
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     shape = tf.constant([8])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
-// The resulting tensor would look like this:
+// The output tensor has shape `[1, 2, 2, 1]` and value:
 //
-//     [0, 11, 0, 10, 9, 0, 0, 12]
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
 //
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-// </div>
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
 //
-// In Python, this scatter operation would look like this:
+// The output tensor has shape `[1, 2, 2, 3]` and value:
 //
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     shape = tf.constant([4, 4, 4])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
 // ```
 //
-// The resulting tensor would look like this:
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
 //
-//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
 //
-// Arguments:
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//	shape: 1-D. The shape of the resulting tensor.
+// The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// Returns A new tensor with the given shape and updates applied according
-// to the indices.
-func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "ScatterNd",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			indices, updates, shape,
+			input, crops,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SpaceToDepthAttr is an optional argument to SpaceToDepth.
-type SpaceToDepthAttr func(optionalAttr)
-
-// SpaceToDepthDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
+//
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
+//
+// Returns the created operation.
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MakeIterator",
+		Input: []tf.Input{
+			dataset, iterator,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// SpaceToDepth for tensors of type T.
-//
-// Rearranges blocks of spatial data, into depth. More specifically,
-// this op outputs a copy of the input tensor where values from the `height`
-// and `width` dimensions are moved to the `depth` dimension.
-// The attr `block_size` indicates the input block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` are rearranged
-//     into depth at each location.
-//   * The depth of the output tensor is `block_size * block_size * input_depth`.
-//   * The Y, X coordinates within each block of the input become the high order
-//     component of the output channel index.
-//   * The input tensor's height and width must be divisible by block_size.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
-//                         within the output image, bX, bY means coordinates
-//                         within the input block, iC means input channels).
-//      The output would be a transpose to the following layout:
-//      n,oY,oX,bY,bX,iC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
-// block_size = 2:
+// Makes the summary of accumulated stats for the batch.
 //
-// ```
-// x = [[[[1], [2]],
-//       [[3], [4]]]]
-// ```
+// The summary stats contains gradients and hessians accumulated into the corresponding node and bucket for each example.
 //
-// This operation will output a tensor of shape `[1, 1, 1, 4]`:
+// Arguments:
+//	node_ids: int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
+//	gradients: float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+//	hessians: float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+//	bucketized_features_list: int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+//	max_splits: int; the maximum number of splits possible in the whole tree.
+//	num_buckets: int; equals to the maximum possible value of bucketized feature.
 //
-// ```
-// [[[[1, 2, 3, 4]]]]
-// ```
+// Returns output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
+func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, bucketized_features_list []tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesMakeStatsSummary",
+		Input: []tf.Input{
+			node_ids, gradients, hessians, tf.OutputList(bucketized_features_list),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adjust the contrast of one or more images.
 //
-// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
-// the corresponding output will have a single element (i.e. width and height are
-// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
-// The output element shape is `[1, 1, 4]`.
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
 //
-// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
+// Contrast is adjusted independently for each channel of each image.
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
-// This operation, for block_size of 2, will return the following tensor of shape
-// `[1, 1, 1, 12]`
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
 //
-// ```
-// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustContrastv2",
+		Input: []tf.Input{
+			images, contrast_factor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gets the next output from the given iterator.
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNext",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
+	}
+	return components
+}
+
+// Outputs the single element from the given dataset.
 //
-// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
+// Arguments:
+//	dataset: A handle to a dataset that contains a single element.
 //
-// ```
-// x = [[[[1],   [2],  [5],  [6]],
-//       [[3],   [4],  [7],  [8]],
-//       [[9],  [10], [13],  [14]],
-//       [[11], [12], [15],  [16]]]]
-// ```
 //
-// the operator will return the following tensor of shape `[1 2 2 4]`:
 //
-// ```
-// x = [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "DatasetToSingleElement",
+		Input: []tf.Input{
+			dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
+	}
+	return components
+}
+
+// Converts the given `resource_handle` representing an iterator to a string.
 //
 // Arguments:
+//	resource_handle: A handle to an iterator resource.
 //
-//	block_size: The size of the spatial block.
-func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SpaceToDepth",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
-			input,
+			resource_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AbortAttr is an optional argument to Abort.
-type AbortAttr func(optionalAttr)
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
 
-// AbortErrorMsg sets the optional error_msg attribute to value.
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: A string which is the message associated with the exception.
-// If not specified, defaults to ""
-func AbortErrorMsg(value string) AbortAttr {
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["error_msg"] = value
+		m["output_types"] = value
 	}
 }
 
-// AbortExitWithoutError sets the optional exit_without_error attribute to value.
-// If not specified, defaults to false
-func AbortExitWithoutError(value bool) AbortAttr {
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+//
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["exit_without_error"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// Raise a exception to abort the process when called.
-//
-// If exit_without_error is true, the process will exit normally,
-// otherwise it will exit with a SIGABORT signal.
+// Converts the given string representing a handle to an iterator to a resource.
 //
-// Returns nothing but an exception.
+// Arguments:
+//	string_handle: A string representation of the given handle.
 //
-// Returns the created operation.
-func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25232,371 +25664,309 @@ func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Abort",
-
+		Type: "IteratorFromStringHandle",
+		Input: []tf.Input{
+			string_handle,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
-type UniformCandidateSamplerAttr func(optionalAttr)
-
-// UniformCandidateSamplerSeed sets the optional seed attribute to value.
+// Gather slices from `params` axis `axis` according to `indices`.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a uniform distribution.
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UniformCandidateSampler",
+		Type: "GatherV2",
 		Input: []tf.Input{
-			true_classes,
+			params, indices, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
-type FixedUnigramCandidateSamplerAttr func(optionalAttr)
-
-// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value.
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
-// value: Each valid line in this file (which should have a CSV-like format)
-// corresponds to a valid word ID. IDs are in sequential order, starting from
-// num_reserved_ids. The last entry in each line is expected to be a value
-// corresponding to the count or relative probability. Exactly one of vocab_file
-// and unigrams needs to be passed to this op.
-// If not specified, defaults to ""
-func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["vocab_file"] = value
-	}
-}
-
-// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value.
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
 //
-// value: The distortion is used to skew the unigram probability distribution.
-// Each weight is first raised to the distortion's power before adding to the
-// internal unigram distribution. As a result, distortion = 1.0 gives regular
-// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
-// a uniform distribution.
-// If not specified, defaults to 1
-func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["distortion"] = value
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value.
-//
-// value: Optionally some reserved IDs can be added in the range [0,
-// ..., num_reserved_ids) by the users. One use case is that a special unknown
-// word token is used as ID 0. These IDs will have a sampling probability of 0.
-// If not specified, defaults to 0
-func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["num_reserved_ids"] = value
+	opspec := tf.OpSpec{
+		Type: "SerializeIterator",
+		Input: []tf.Input{
+			resource_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value.
-//
-// value: A sampler can be used to sample from a subset of the original range
-// in order to speed up the whole computation through parallelism. This parameter
-// (together with 'shard') indicates the number of partitions that are being
-// used in the overall computation.
-// If not specified, defaults to 1
-//
-// REQUIRES: value >= 1
-func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["num_shards"] = value
-	}
-}
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
 
-// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value.
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: A sampler can be used to sample from a subset of the original range
-// in order to speed up the whole computation through parallelism. This parameter
-// (together with 'num_shards') indicates the particular partition number of a
-// sampler op, when partitioning is being used.
-// If not specified, defaults to 0
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-// REQUIRES: value >= 0
-func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr {
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["shard"] = value
+		m["shapes"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value.
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: A list of unigram counts or probabilities, one per ID in sequential
-// order. Exactly one of vocab_file and unigrams should be passed to this op.
-// If not specified, defaults to <>
-func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["unigrams"] = value
+		m["capacity"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// A unigram sampler could use a fixed unigram distribution read from a
-// file or passed in as an in-memory array instead of building up the distribution
-// from data on the fly. There is also an option to skew the distribution by
-// applying a distortion power to the weights.
-//
-// The vocabulary file should be in CSV-like format, with the last field
-// being the weight associated with the word.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// A queue that produces elements in first-in first-out order.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	component_types: The type of each component in a value.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedUnigramCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
+		Type: "FIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise left-shift of `x` and `y`.
-//
-// If `y` is negative, or greater than or equal to the width of `x` in bits the
-// result is implementation defined.
-func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Produces a summary of any statistics recorded by the given statistics manager.
+func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LeftShift",
+		Type: "StatsAggregatorSummary",
 		Input: []tf.Input{
-			x, y,
+			iterator,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// Compute the pairwise cross product.
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "Cross",
 		Input: []tf.Input{
-			x, y,
+			a, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Performs a padding as a preprocess during a convolution.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			images, delta,
+			input, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -25604,409 +25974,387 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi
 	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
-
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// Builds a merged tensor such that
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// For example, if each `indices[m]` is scalar or vector, we have
 //
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
 //
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//     merged.shape = [max(indices)] + constant
 //
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
-	}
-}
-
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+// For example:
 //
-// Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x == y) element-wise.
+//
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Equal",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeWavAttr is an optional argument to DecodeWav.
-type DecodeWavAttr func(optionalAttr)
-
-// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
-//
-// value: Number of sample channels wanted.
-// If not specified, defaults to -1
-func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_channels"] = value
-	}
-}
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
 
-// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
-//
-// value: Length of audio requested.
-// If not specified, defaults to -1
-func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
 	return func(m optionalAttr) {
-		m["desired_samples"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Decode a 16-bit PCM WAV file to a float tensor.
-//
-// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
-//
-// When desired_channels is set, if the input contains fewer channels than this
-// then the last channel will be duplicated to give the requested number, else if
-// the input has more channels than requested then the additional channels will be
-// ignored.
-//
-// If desired_samples is set, then the audio will be cropped or padded with zeroes
-// to the requested length.
-//
-// The first output contains a Tensor with the content of the audio samples. The
-// lowest dimension will be the number of channels, and the second will be the
-// number of samples. For example, a ten-sample-long stereo WAV file should give an
-// output shape of [10, 2].
-//
-// Arguments:
-//	contents: The WAV-encoded audio, usually from a file.
+// Deprecated. Use TensorArrayGatherV3
 //
-// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
-func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeWav",
+		Type: "TensorArrayGatherV2",
 		Input: []tf.Input{
-			contents,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// UniqueAttr is an optional argument to Unique.
-type UniqueAttr func(optionalAttr)
-
-// UniqueOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueOutIdx(value tf.DataType) UniqueAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
+	return op.Output(0)
 }
 
-// Finds unique elements in a 1-D tensor.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
+// Builds a merged tensor such that
 //
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// For example:
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
 // ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
 // ```
 //
-// Arguments:
-//	x: 1-D.
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
 //
-// Returns 1-D.1-D.
-func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unique",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Concatenates a list of `N` tensors along the first dimension.
-//
-// The input tensors are all required to have size 1 in the first dimension.
-//
-// For example:
-//
-// ```
-// # 'x' is [[1, 4]]
-// # 'y' is [[2, 5]]
-// # 'z' is [[3, 6]]
-// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// ```
-//
-// The difference between concat and parallel_concat is that concat requires all
-// of the inputs be computed before the operation will begin but doesn't require
-// that the input shapes be known during graph construction.  Parallel concat
-// will copy pieces of the input into the output as they become available, in
-// some situations this can provide a performance benefit.
-//
-// Arguments:
-//	values: Tensors to be concatenated. All must have size 1 in the first dimension
-// and same shape.
-//	shape: the final shape of the result; should be equal to the shapes of any input
-// but with the number of input values in the first dimension.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Returns The concatenated tensor.
-func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "ParallelConcat",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
+// List of the given size with empty elements.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "Concat",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values),
+			element_shape, num_elements,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
+
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
 //
-// The lower regularized incomplete Gamma function is defined as:
+// value: The type of each component in a value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["component_types"] = value
+	}
+}
+
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
 //
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PriorityQueueV2Container sets the optional container attribute to value.
 //
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// where
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements sorted by the first component value.
 //
-// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
 //
-// is the lower incomplete Gamma function.
+// Arguments:
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
-		Input: []tf.Input{
-			a, x,
-		},
+		Type: "PriorityQueueV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
-//
-// For example:
-//
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// UnstageAttr is an optional argument to Unstage.
+type UnstageAttr func(optionalAttr)
+
+// UnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// This is typically used by gradient computations for a concat operation.
+// REQUIRES: value >= 0
+func UnstageCapacity(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
+// REQUIRES: value >= 0
+func UnstageMemoryLimit(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// UnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnstageContainer(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnstageSharedName(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op is similar to a lightweight Dequeue.
 //
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+// The basic functionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
+		Type: "Unstage",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
@@ -26014,73 +26362,108 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset
 	}
 	var idx int
 	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("Unstage", err)
 		return
 	}
-	return offset
+	return values
 }
 
-// Splits a tensor into `num_split` tensors along one dimension.
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
+
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues a tuple of one or more tensors in the given queue.
+//
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//	value: The tensor to split.
-//	num_split: The number of ways to split.  Must evenly divide
-// `value.shape[split_dim]`.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
 //
-// Returns They are identically shaped tensors, whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `values.shape[split_dim] / num_split`.
-func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Split",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			axis, value,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Split", err)
-		return
+	return scope.AddOperation(opspec)
+}
+
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
+
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
 	}
-	return output
 }
 
-// Splits a tensor into `num_split` tensors along one dimension.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// Arguments:
-//	value: The tensor to split.
-//	size_splits: list containing the sizes of each output tensor along the split
-// dimension. Must sum to the dimension of value along split_dim.
-// Can contain one -1 indicating that dimension is to be inferred.
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
 //
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
 //
-// Returns Tensors whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `size_splits[i]`.
-func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SplitV",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			value, size_splits, axis,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
@@ -26090,165 +26473,129 @@ func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output
 	}
 	var idx int
 	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("SplitV", err)
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
 		return
 	}
-	return output
+	return components
 }
 
-// Gives a guarantee to the TF runtime that the input tensor is a constant.
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
+
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// The runtime is then free to make optimizations based on this.
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
+	return func(m optionalAttr) {
+		m["pad"] = value
+	}
+}
+
+// Encode strings into web-safe base64 format.
 //
-// Only accepts value typed tensors as inputs and rejects resource variable handles
-// as input.
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
 //
-// Returns the input tensor without modification.
-func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GuaranteeConst",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a tensor of zeros with the same shape and type as x.
+// Deprecated. Use TensorArrayCloseV3
 //
-// Arguments:
-//	x: a tensor of type T.
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
 //
-// Returns a tensor of the same shape and type as x but filled with zeros.
-func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ZerosLike",
+		Type: "TensorArrayCloseV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Flips all bits elementwise.
+// Forwards the value of an available tensor from `inputs` to `output`.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
+//
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Invert",
+		Type: "Merge",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+//
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8, in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// Closes the given queue.
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
 // Arguments:
+//	handle: The handle to a queue.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26257,393 +26604,218 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "QueueCloseV2",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			handle,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "Atanh",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
-//
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
-//
-// The output is computed as follows:
-//
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+// Returns true if queue is closed.
 //
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+// This operation returns true if the queue is closed and false if the queue
+// is open.
 //
 // Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			input, diagonal,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
-
-// EditDistanceNormalize sets the optional normalize attribute to value.
+// Returns the batched diagonal part of a batched tensor.
 //
-// value: boolean (if true, edit distances are normalized by length of truth).
+// This operation returns a tensor with the `diagonal` part
+// of the batched `input`. The `diagonal` part is computed as follows:
 //
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
-	return func(m optionalAttr) {
-		m["normalize"] = value
-	}
-}
-
-// Computes the (possibly normalized) Levenshtein Edit Distance.
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
 //
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
+// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
 //
-// The inputs are:
+// The input must be at least a matrix.
 //
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
+// For example:
 //
-// Returns A dense float tensor with rank R - 1.
+// ```
+// # 'input' is [[[1, 0, 0, 0]
+//                [0, 2, 0, 0]
+//                [0, 0, 3, 0]
+//                [0, 0, 0, 4]],
+//               [[5, 0, 0, 0]
+//                [0, 6, 0, 0]
+//                [0, 0, 7, 0]
+//                [0, 0, 0, 8]]]
 //
-// For the example input:
+// and input.shape = (2, 4, 4)
 //
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
+// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
 //
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
+// which has shape (2, 4)
+// ```
 //
-// The output will be:
+// Arguments:
+//	input: Rank `k` tensor where `k >= 2`.
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// Returns The extracted diagonal(s) having shape
+// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
+func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "MatrixDiagPart",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gather slices from `params` into a Tensor with shape specified by `indices`.
-//
-// `indices` is an K-dimensional integer tensor, best thought of as a
-// (K-1)-dimensional tensor of indices into `params`, where each element defines a
-// slice of `params`:
-//
-//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
-//
-// Whereas in @{tf.gather} `indices` defines slices into the first
-// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
-//
-// The last dimension of `indices` can be at most the rank of
-// `params`:
-//
-//     indices.shape[-1] <= params.rank
-//
-// The last dimension of `indices` corresponds to elements
-// (if `indices.shape[-1] == params.rank`) or slices
-// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-// of `params`.  The output tensor has shape
-//
-//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
-//
-// Some examples below.
-//
-// Simple indexing into a matrix:
-//
-// ```python
-//     indices = [[0, 0], [1, 1]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = ['a', 'd']
-// ```
-//
-// Slice indexing into a matrix:
-//
-// ```python
-//     indices = [[1], [0]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['c', 'd'], ['a', 'b']]
-// ```
-//
-// Indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['a1', 'b1'], ['c1', 'd1']]]
-//
-//
-//     indices = [[0, 1], [1, 0]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['c0', 'd0'], ['a1', 'b1']]
-//
-//
-//     indices = [[0, 0, 1], [1, 0, 1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = ['b0', 'b1']
-// ```
-//
-// Batched indexing into a matrix:
-//
-// ```python
-//     indices = [[[0, 0]], [[0, 1]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['a'], ['b']]
-// ```
-//
-// Batched slice indexing into a matrix:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [[['c', 'd']], [['a', 'b']]]
-// ```
-//
-// Batched indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
-//               [[['a0', 'b0'], ['c0', 'd0']]]]
-//
-//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['c0', 'd0'], ['a1', 'b1']],
-//               [['a0', 'b0'], ['c1', 'd1']]]
-//
-//
-//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['b0', 'b1'], ['d0', 'c1']]
-// ```
-//
-// Arguments:
-//	params: The tensor from which to gather values.
-//	indices: Index tensor.
+// Computes the absolute value of a tensor.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
-func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GatherNd",
+		Type: "Abs",
 		Input: []tf.Input{
-			params, indices,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
-	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
+	return func(m optionalAttr) {
+		m["stack_name"] = value
 	}
-	return output
 }
 
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// A stack that produces elements in first-in last-out order.
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// Arguments:
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "StackV2",
 		Input: []tf.Input{
-			input,
+			max_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
+
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
-
-// PreventGradientMessage sets the optional message attribute to value.
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["data_format"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	input: any tensor.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26652,33 +26824,56 @@ func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientA
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "FusedBatchNormGradV2",
 		Input: []tf.Input{
-			input,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Checks a tensor for NaN and Inf values.
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
+
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Decompress strings.
+//
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
+//
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
+//	bytes: A Tensor of string which is compressed.
 //
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message": message}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			tensor,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -26686,62 +26881,98 @@ func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Ou
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation and conjugate the result.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
+
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ConjugateTranspose",
-		Input: []tf.Input{
-			x, perm,
-		},
+}
+
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// UniqueV2Attr is an optional argument to UniqueV2.
-type UniqueV2Attr func(optionalAttr)
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
 
-// UniqueV2OutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["dropout"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
-// find the unique elements.
-//
-// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
-// value of x in the output y.
-func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: a 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: a 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: the same shape has input_h.
+// output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: an opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26750,112 +26981,143 @@ func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Att
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueV2",
+		Type: "CudnnRNN",
 		Input: []tf.Input{
-			x, axis,
+			input, input_h, input_c, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Return a slice from 'input'.
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
 //
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
 //
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Slice",
+		Type: "CompareAndBitpack",
 		Input: []tf.Input{
-			input, begin, size,
+			input, threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
+// Push an element onto the tensor_array.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
+	opspec := tf.OpSpec{
+		Type: "TensorArrayWriteV3",
+		Input: []tf.Input{
+			handle, index, value, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+// Scatter the data from the input value into specific TensorArray elements.
+//
+// `indices` must be a vector, its length must match the first dim of `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV3",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
+
+// EmptyInit sets the optional init attribute to value.
+//
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["init"] = value
 	}
 }
 
-// Returns the gradient of `StridedSlice`.
+// Creates a tensor with the given shape.
 //
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
+// This operation creates a tensor of `shape` and `dtype`.
 //
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+// Arguments:
+//	shape: 1-D. Represents the shape of the output tensor.
+//
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
+		Type: "Empty",
 		Input: []tf.Input{
-			shape, begin, end, strides, dy,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -26863,60 +27125,105 @@ func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Out
 	return op.Output(0)
 }
 
-// Returns the gradient of `Tile`.
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
+
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape_except0"] = value
+	}
+}
+
+// Concat the elements from the TensorArray into value `value`.
 //
-// Since `Tile` takes an input and repeats the input `multiples` times
-// along each dimension, `TileGrad` takes in `multiples` and aggregates
-// each repeated tile of `input` into `output`.
-func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+// Takes `T` elements of shapes
+//
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
+//
+// and concatenates them into a Tensor of shape:
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+//
+// All elements must have the same shape (excepting the first dimension).
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TileGrad",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			input, multiples,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
 
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["seed"] = value
 	}
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// Returns the dimension index in the destination data format given the one in
+// Outputs random values from a normal distribution. The parameters may each be a
 //
-// the source data format.
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26925,9 +27232,9 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			x,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
@@ -26935,156 +27242,126 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 	return op.Output(0)
 }
 
-// Return the shape of s0 op s1 with broadcast.
+// Sets the index-th position of the list to contain the given tensor.
 //
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
+//
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
+		Type: "TensorListSetItem",
 		Input: []tf.Input{
-			s0, s1,
+			input_handle, index, item,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// This is typically used by gradient computations for a broadcasting operation.
-func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastGradientArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Pads a tensor with mirrored values.
-//
-// This operation pads a `input` with mirrored values according to the `paddings`
-// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many values to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many values to add after the contents of `input`
-// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
-// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
-// (if false, respectively).
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// The padded size of each dimension D of the output is:
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
 //
 // For example:
 //
 // ```
-// # 't' is [[1, 2, 3], [4, 5, 6]].
-// # 'paddings' is [[1, 1]], [2, 2]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
-//                       [2, 1, 1, 2, 3, 3, 2]
-//                       [5, 4, 4, 5, 6, 6, 5]
-//                       [5, 4, 4, 5, 6, 6, 5]]
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
 // ```
 //
 // Arguments:
-//	input: The input tensor to be padded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
-// do not include the borders, while in symmetric mode the padded regions
-// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
-// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
-// it is `[1, 2, 3, 3, 2]` in symmetric mode.
-//
-// Returns The padded tensor.
-func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "MirrorPad",
+		Type: "Diag",
 		Input: []tf.Input{
-			input, paddings,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A placeholder op for a value that will be fed into the computation.
+// Split the data from the input value into TensorArray elements.
 //
-// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
+// Assuming that `lengths` takes on values
 //
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
+//   ```(n0, n1, ..., n(T-1))```
+//
+// and that `value` has shape
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+//
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
 //
 // Arguments:
-//	dtype: The type of elements in the tensor.
-//	shape: The shape of the tensor. The shape can be any partially-specified
-// shape.  To be unconstrained, pass in a shape with unknown rank.
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "PlaceholderV2",
-
-		Attrs: attrs,
+		Type: "TensorArraySplitV3",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27093,866 +27370,2798 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// SqueezeAxis sets the optional axis attribute to value.
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["shapes"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// For example:
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
 //
-// Or, to remove specific size 1 dimensions:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	input: The `input` to squeeze.
+//	component_types: The type of each component in a value.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SpaceToBatch for N-D tensors of type T.
+// Draw bounding boxes on a batch of images.
 //
-// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
-// grid of blocks of shape `block_shape`, and interleaves these blocks with the
-// "batch" dimension (0) such that in the output, the spatial dimensions
-// `[1, ..., M]` correspond to the position within the grid, and the batch
-// dimension combines both the position within a spatial block and the original
-// batch position.  Prior to division into blocks, the spatial dimensions of the
-// input are optionally zero padded according to `paddings`.  See below for a
-// precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has `M` dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
-//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
-//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
-//    input according to `paddings` to produce `padded` of shape `padded_shape`.
-//
-// 2. Reshape `padded` to `reshaped_padded` of shape:
-//
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//        block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1],
-//       block_shape[M-1]] +
-//      remaining_shape
-//
-// 3. Permute dimensions of `reshaped_padded` to produce
-//    `permuted_reshaped_padded` of shape:
-//
-//      block_shape +
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
-//
-// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
-//    dimension, producing an output tensor of shape:
-//
-//      [batch * prod(block_shape)] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
-//     paddings = `[[0, 0], [2, 0]]`:
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
 //
-// The output tensor has shape `[8, 1, 3, 1]` and value:
+// Parts of the bounding box may fall outside the image.
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SpaceToBatchND",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			input, block_shape, paddings,
+			images, boxes,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
-type QuantizeAndDequantizeV2Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
-//
-// value: If the quantization is signed or unsigned.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
 
-// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: The bitwidth of the quantization.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["seed"] = value
 	}
 }
 
-// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: If the range is given or should be computed from the tensor.
-// If not specified, defaults to false
-func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["seed2"] = value
 	}
 }
 
-// Quantizes then dequantizes a tensor.
-//
-// This op simulates the precision loss from the quantized forward pass by:
-// 1. Quantizing the tensor to fixed point numbers, which should match the target
-//    quantization method when it is used in inference.
-// 2. Dequantizing it back to floating point numbers for the following ops, most
-//    likely matmul.
-//
-// There are different ways to quantize. This version does not use the full range
-// of the output type, choosing to elide the lowest possible value for symmetry
-// (e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-// quantization), so that 0.0 maps to 0.
-//
-// To perform this op, we first find the range of values in our tensor. The range
-// we use is always centered on 0, so we find m such that
-//
-// 1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-// 2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
-//
-// Our input tensor range is then [-m, m].
-//
-// Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-// If signed_input is true, this is
-//
-//   [min_fixed, max_fixed ] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
-//
-// Otherwise, if signed_input is false, the fixed-point range is
-//
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
-//
-// From this we compute our scaling factor, s:
-//
-//   s = (max_fixed - min_fixed) / (2 * m).
-//
-// Now we can quantize and dequantize the elements of our tensor.  An element e
-// is transformed into e':
-//
-//   e' = (e * s).round_to_nearest() / s.
-//
-// Note that we have a different number of buckets in the signed vs. unsigned
-// cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
-// vs. 255 in the unsigned case.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// For example, suppose num_bits = 8 and m = 1.  Then
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-//   [min_fixed, max_fixed] = [-127, 127], and
-//   s = (127 + 127) / 2 = 127.
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-// {-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: Tensor to quantize and then dequantize.
-//	input_min: If range_given, this is the min of the range, otherwise this input
-// will be ignored.
-//	input_max: If range_given, this is the max of the range, otherwise this input
-// will be ignored.
-func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV2",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SpaceToBatch for 4-D tensors of type T.
-//
-// This is a legacy version of the more general SpaceToBatchND.
-//
-// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-// More specifically, this op outputs a copy of the input tensor where values from
-// the `height` and `width` dimensions are moved to the `batch` dimension. After
-// the zero-padding, both `height` and `width` of the input must be divisible by the
-// block size.
+// Computes gradients for the scaled exponential linear (Selu) operation.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, depth]`.
-//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-//   the padding of the input with zeros across the spatial dimensions as follows:
-//
-//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
-//
-//   The effective spatial dimensions of the zero-padded input tensor will be:
-//
-//       height_pad = pad_top + height + pad_bottom
-//       width_pad = pad_left + width + pad_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` in the height and
-//     width dimensions are rearranged into the batch dimension at each location.
-//   * The batch of the output tensor is `batch * block_size * block_size`.
-//   * Both height_pad and width_pad must be divisible by block_size.
-//
-// The shape of the output will be:
-//
-//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//      depth]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[8, 1, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
 //
-func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "SpaceToBatch",
+		Type: "SeluGrad",
 		Input: []tf.Input{
-			input, paddings,
+			gradients, outputs,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
-
-// UnpackAxis sets the optional axis attribute to value.
-//
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
-//
-// This is the opposite of `pack`.
+// Get the current size of the TensorArray.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "TensorArraySizeV3",
 		Input: []tf.Input{
-			value,
+			handle, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
-//
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
-//
+// Deprecated. Use TensorArrayGradV3
 //
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			resource,
+			handle, index, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the stack from its resource container.
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
+
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	handle: The handle to a stack.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			handle,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
+
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
+	}
+}
+
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["scientific"] = value
+	}
+}
+
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["shortest"] = value
+	}
+}
+
+// AsStringWidth sets the optional width attribute to value.
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["width"] = value
+	}
+}
+
+// AsStringFill sets the optional fill attribute to value.
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
+	return func(m optionalAttr) {
+		m["fill"] = value
+	}
+}
+
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "AsString",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Extract `patches` from `images` and put them in the "depth" output dimension.
-//
-// Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
+// Deprecated. Use TensorArrayScatterV3
 //
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
+		Type: "TensorArrayScatterV2",
 		Input: []tf.Input{
-			images,
+			handle, indices, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Bitcasts a tensor from one type to another without copying data.
-//
-// Given a tensor `input`, this operation returns a tensor that has the same buffer
-// data as `input` with datatype `type`.
-//
-// If the input datatype `T` is larger than the output datatype `type` then the
-// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
+// Creates a tree ensemble model and returns a handle to it.
 //
-// If `T` is smaller than `type`, the operator requires that the rightmost
-// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-// [..., sizeof(`type`)/sizeof(`T`)] to [...].
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resource to be created.
+//	stamp_token: Token to use as the initial value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the tree ensemble.
 //
-// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-// endian orderings will give different results.
-func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
+// Returns the created operation.
+func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"type": type_}
 	opspec := tf.OpSpec{
-		Type: "Bitcast",
+		Type: "BoostedTreesCreateEnsemble",
 		Input: []tf.Input{
-			input,
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
-
-// OneHotAxis sets the optional axis attribute to value.
+// Applies sparse addition to `input` using individual values or slices
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Returns a one-hot tensor.
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
 //
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// If `indices` is a vector of length `features`, the output shape will be:
 // ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
+// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
 // ```
 //
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
 //
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
 //
-// Examples
-// =========
+// The resulting value `output` would look like this:
 //
-// Suppose that
+//     [1, 13, 3, 14, 14, 6, 7, 20]
 //
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// See @{tf.scatter_nd} for more details about how to make updates to slices.
 //
-// Then output is `[4 x 3]`:
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
 //
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScatterNdNonAliasingAdd",
+		Input: []tf.Input{
+			input, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// Suppose that
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
 //
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// Then output is `[3 x 4]`:
+// `index  0  1  2  3  4`
 //
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
+// `value  20 5  16 3  7`
 //
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
 //
-// Then output is `[2 x 2 x 3]`:
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
 //
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
+//
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
+//
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
 //
 // Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Deprecated. Use TensorArraySizeV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySizeV2",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
+//
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
+//
+// In detail, with the default NHWC format,
+//
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StageAttr is an optional argument to Stage.
+type StageAttr func(optionalAttr)
+
+// StageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageCapacity(value int64) StageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StageMemoryLimit sets the optional memory_limit attribute to value.
+//
+// value: The maximum number of bytes allowed for Tensors in the Staging Area.
+// If > 0, inserts will block until sufficient space is available.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageMemoryLimit(value int64) StageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func StageContainer(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func StageSharedName(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage values similar to a lightweight Enqueue.
+//
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
+//
+// Arguments:
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+// Returns the created operation.
+func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Stage",
+		Input: []tf.Input{
+			tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StagePeekAttr is an optional argument to StagePeek.
+type StagePeekAttr func(optionalAttr)
+
+// StagePeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StagePeekCapacity(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StagePeekMemoryLimit(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StagePeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StagePeekContainer(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StagePeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StagePeekSharedName(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified index.  If the
+//
+// underlying container does not contain sufficient elements
+// this op will block until it does.   This Op is optimized for
+// performance.
+func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StagePeek",
+		Input: []tf.Input{
+			index,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("StagePeek", err)
+		return
+	}
+	return values
+}
+
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
+
+// MapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageCapacity(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageMemoryLimit(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func MapStageContainer(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func MapStageSharedName(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a hashtable.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
+	}
+	return values
+}
+
+// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapIncompleteSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
+type OrderedMapUnstageAttr func(optionalAttr)
+
+// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstage", err)
+		return
+	}
+	return values
+}
+
+// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
+type OrderedMapSizeAttr func(optionalAttr)
+
+// OrderedMapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
+
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns shape of tensors.
+//
+// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ShapeN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
+	}
+	return output
+}
+
+// CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical.
+type CudnnRNNParamsToCanonicalAttr func(optionalAttr)
+
+// CudnnRNNParamsToCanonicalRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsToCanonicalRnnMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsToCanonicalInputMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsToCanonicalDirection(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalDropout(value float32) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed2(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Retrieves CudnnRNN params in canonical form.
+//
+// Retrieves a set of weights from the opaque params buffer that can be saved and
+// restored in a way compatible with future runs.
+//
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_params": num_params}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsToCanonical",
+		Input: []tf.Input{
+			num_layers, num_units, input_size, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	return weights, biases
+}
+
+// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
+type UniformCandidateSamplerAttr func(optionalAttr)
+
+// UniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
+
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+//
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
+// If not specified, defaults to false
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["preprocess_collapse_repeated"] = value
+	}
+}
+
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+//
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ctc_merge_repeated"] = value
+	}
+}
+
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+//
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
+// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+//
+// the gradient.  This class performs the softmax operation for you, so inputs
+// should be e.g. linear projections of outputs by an LSTM.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
+// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+// `(batch b, time t)`.
+//	labels_values: The values (labels) associated with the given batch and time.
+//	sequence_length: A vector containing sequence lengths (batch).
+//
+// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
+// `(max_time x batch_size x num_classes)`.
+func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCLoss",
+		Input: []tf.Input{
+			inputs, labels_indices, labels_values, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
+type CTCGreedyDecoderAttr func(optionalAttr)
+
+// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+//
+// value: If True, merge repeated classes in output.
+// If not specified, defaults to false
+func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
+	return func(m optionalAttr) {
+		m["merge_repeated"] = value
+	}
+}
+
+// Performs greedy decoding on the logits given in inputs.
+//
+// A note about the attribute merge_repeated: if enabled, when
+// consecutive logits' maximum indices are the same, only the first of
+// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
+// becomes "A B B" if merge_repeated = True and "A B B B B" if
+// merge_repeated = False.
+//
+// Regardless of the value of merge_repeated, if the maximum index of a given
+// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
+// element is emitted.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+//
+// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
+// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
+// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
+// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
+// log-probabilities.
+func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCGreedyDecoder",
+		Input: []tf.Input{
+			inputs, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Forwards `data` to the output port determined by `pred`.
+//
+// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+// the data goes to `output_false`.
+//
+// See also `RefSwitch` and `Merge`.
+//
+// Arguments:
+//	data: The tensor to be forwarded to the appropriate output.
+//	pred: A scalar that specifies which output port will receive data.
+//
+// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
+func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Switch",
+		Input: []tf.Input{
+			data, pred,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Add all input tensors element wise.
+//
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AddN",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TryRpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
+
+// EnterIsConstant sets the optional is_constant attribute to value.
+//
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
+	return func(m optionalAttr) {
+		m["is_constant"] = value
+	}
+}
+
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+//
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
+//
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
+//
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"frame_name": frame_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Enter",
+		Input: []tf.Input{
+			data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produce a string tensor that encodes the state of a Reader.
+//
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Exits the current frame to its parent frame.
+//
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exit",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns a copy of the input tensor.
+func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Snapshot",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AbortAttr is an optional argument to Abort.
+type AbortAttr func(optionalAttr)
+
+// AbortErrorMsg sets the optional error_msg attribute to value.
+//
+// value: A string which is the message associated with the exception.
+// If not specified, defaults to ""
+func AbortErrorMsg(value string) AbortAttr {
+	return func(m optionalAttr) {
+		m["error_msg"] = value
+	}
+}
+
+// AbortExitWithoutError sets the optional exit_without_error attribute to value.
+// If not specified, defaults to false
+func AbortExitWithoutError(value bool) AbortAttr {
+	return func(m optionalAttr) {
+		m["exit_without_error"] = value
+	}
+}
+
+// Raise a exception to abort the process when called.
+//
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
+//
+// Returns nothing but an exception.
+//
+// Returns the created operation.
+func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Abort",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
+type FixedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value.
+//
+// value: Each valid line in this file (which should have a CSV-like format)
+// corresponds to a valid word ID. IDs are in sequential order, starting from
+// num_reserved_ids. The last entry in each line is expected to be a value
+// corresponding to the count or relative probability. Exactly one of vocab_file
+// and unigrams needs to be passed to this op.
+// If not specified, defaults to ""
+func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["vocab_file"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value.
+//
+// value: The distortion is used to skew the unigram probability distribution.
+// Each weight is first raised to the distortion's power before adding to the
+// internal unigram distribution. As a result, distortion = 1.0 gives regular
+// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
+// a uniform distribution.
+// If not specified, defaults to 1
+func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["distortion"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value.
+//
+// value: Optionally some reserved IDs can be added in the range [0,
+// ..., num_reserved_ids) by the users. One use case is that a special unknown
+// word token is used as ID 0. These IDs will have a sampling probability of 0.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["num_reserved_ids"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value.
+//
+// value: A sampler can be used to sample from a subset of the original range
+// in order to speed up the whole computation through parallelism. This parameter
+// (together with 'shard') indicates the number of partitions that are being
+// used in the overall computation.
+// If not specified, defaults to 1
+//
+// REQUIRES: value >= 1
+func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["num_shards"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value.
+//
+// value: A sampler can be used to sample from a subset of the original range
+// in order to speed up the whole computation through parallelism. This parameter
+// (together with 'num_shards') indicates the particular partition number of a
+// sampler op, when partitioning is being used.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["shard"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value.
+//
+// value: A list of unigram counts or probabilities, one per ID in sequential
+// order. Exactly one of vocab_file and unigrams should be passed to this op.
+// If not specified, defaults to <>
+func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["unigrams"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// A unigram sampler could use a fixed unigram distribution read from a
+// file or passed in as an in-memory array instead of building up the distribution
+// from data on the fly. There is also an option to skew the distribution by
+// applying a distortion power to the weights.
+//
+// The vocabulary file should be in CSV-like format, with the last field
+// being the weight associated with the word.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FixedUnigramCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
+//
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
+//
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Deserializes a serialized tree ensemble config and replaces current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesDeserializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the update to cached logits. It is designed to be used during training.
+// It traverses the trees starting from cached tree id and cached node id and
+// calculates the updates to be pushed to the cache.
+//
+// Arguments:
+//
+//	cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting
+// tree of prediction.
+//	cached_node_ids: Rank 1 Tensor containing cached node id which is the starting
+// node of prediction.
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Rank 2 Tensor containing logits update (with respect to cached
+// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids.
+func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesTrainingPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise left-shift of `x` and `y`.
+//
+// If `y` is negative, or greater than or equal to the width of `x` in bits the
+// result is implementation defined.
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LeftShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
+	return func(m optionalAttr) {
+		m["num_elements"] = value
+	}
+}
+
+// Stacks all tensors in the list.
+//
+// Requires that all tensors have the same shape.
+//
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListStack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise right-shift of `x` and `y`.
+//
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
+//
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RightShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adjust the hue of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustHue",
+		Input: []tf.Input{
+			images, delta,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
+
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["max_enqueued_batches"] = value
+	}
+}
+
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["allowed_batch_sizes"] = value
+	}
+}
+
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
+//
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
+//
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
+//
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
+//
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Batch",
+		Input: []tf.Input{
+			tf.OutputList(in_tensors),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
+}
+
+// UnbatchAttr is an optional argument to Unbatch.
+type UnbatchAttr func(optionalAttr)
+
+// UnbatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchContainer(value string) UnbatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchSharedName(value string) UnbatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Reverses the operation of Batch for a single output Tensor.
+//
+// An instance of Unbatch either receives an empty batched_tensor, in which case it
+// asynchronously waits until the values become available from a concurrently
+// running instance of Unbatch with the same container and shared_name, or receives
+// a non-empty batched_tensor in which case it finalizes all other concurrently
+// running instances and outputs its own element from the batch.
+//
+// batched_tensor: The possibly transformed output of Batch. The size of the first
+//  dimension should remain unchanged by the transformations for the operation to
+//  work.
+// batch_index: The matching batch_index obtained from Batch.
+// id: The id scalar emitted by Batch.
+// unbatched_tensor: The Tensor corresponding to this execution.
+// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+//  batched input tensor associated with a given invocation of the op.
+// container: Container to control resource sharing.
+// shared_name: Instances of Unbatch with the same container and shared_name are
+//  assumed to possibly belong to the same batch. If left empty, the op name will
+//  be used as the shared name.
+func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"timeout_micros": timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unbatch",
+		Input: []tf.Input{
+			batched_tensor, batch_index, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
+//
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool3DGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
+
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+//
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleSequenceExample",
+		Input: []tf.Input{
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+}
+
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
+
+// UnbatchGradContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradContainer(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Gradient of Unbatch.
+//
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
 //
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27961,9 +30170,9 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
+		Type: "UnbatchGrad",
 		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			original_input, batch_index, grad, id,
 		},
 		Attrs: attrs,
 	}
@@ -27971,236 +30180,348 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// DecodeWavAttr is an optional argument to DecodeWav.
+type DecodeWavAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
+// value: Number of sample channels wanted.
 // If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["desired_channels"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// value: Length of audio requested.
+// If not specified, defaults to -1
+func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_samples"] = value
+	}
+}
+
+// Decode a 16-bit PCM WAV file to a float tensor.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+//
+// When desired_channels is set, if the input contains fewer channels than this
+// then the last channel will be duplicated to give the requested number, else if
+// the input has more channels than requested then the additional channels will be
+// ignored.
+//
+// If desired_samples is set, then the audio will be cropped or padded with zeroes
+// to the requested length.
+//
+// The first output contains a Tensor with the content of the audio samples. The
+// lowest dimension will be the number of channels, and the second will be the
+// number of samples. For example, a ten-sample-long stereo WAV file should give an
+// output shape of [10, 2].
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	contents: The WAV-encoded audio, usually from a file.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
+func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "DecodeWav",
 		Input: []tf.Input{
-			handle,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Concatenates a list of `N` tensors along the first dimension.
+//
+// The input tensors are all required to have size 1 in the first dimension.
+//
+// For example:
+//
+// ```
+// # 'x' is [[1, 4]]
+// # 'y' is [[2, 5]]
+// # 'z' is [[3, 6]]
+// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// ```
+//
+// The difference between concat and parallel_concat is that concat requires all
+// of the inputs be computed before the operation will begin but doesn't require
+// that the input shapes be known during graph construction.  Parallel concat
+// will copy pieces of the input into the output as they become available, in
+// some situations this can provide a performance benefit.
+//
+// Arguments:
+//	values: Tensors to be concatenated. All must have size 1 in the first dimension
+// and same shape.
+//	shape: the final shape of the result; should be equal to the shapes of any input
+// but with the number of input values in the first dimension.
+//
+// Returns The concatenated tensor.
+func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "ParallelConcat",
+		Input: []tf.Input{
+			tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Subtracts `v` into specified rows of `x`.
+//
+//     Computes y = x; y[i, :] -= v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	return components
+	opspec := tf.OpSpec{
+		Type: "InplaceSub",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns locations of nonzero / true values in a tensor.
+// Converts a flat index or array of flat indices into a tuple of
 //
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
+// coordinate arrays.
+//
+// @compatibility(numpy)
+// Equivalent to np.unravel_index
+// @end_compatibility
+//
+// Arguments:
+//	indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
+// flattened version of an array of dimensions dims.
+//	dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
+// indices.
+//
+// Returns An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+// same shape as the indices array.
+func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnravelIndex",
+		Input: []tf.Input{
+			indices, dims,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
+//
+// The lower regularized incomplete Gamma function is defined as:
+//
+//
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igamma",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes offsets of concat inputs within its output.
 //
 // For example:
 //
 // ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
 //
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// This is typically used by gradient computations for a concat operation.
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Arguments:
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatOffset",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(shape),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
+}
+
+// Splits a tensor into `num_split` tensors along one dimension.
+//
+// Arguments:
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//	value: The tensor to split.
+//	num_split: The number of ways to split.  Must evenly divide
+// `value.shape[split_dim]`.
+//
+// Returns They are identically shaped tensors, whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `values.shape[split_dim] / num_split`.
+func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "Split",
 		Input: []tf.Input{
-			condition,
+			axis, value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Split", err)
+		return
 	}
+	return output
 }
 
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+// Splits a tensor into `num_split` tensors along one dimension.
+//
+// Arguments:
+//	value: The tensor to split.
+//	size_splits: list containing the sizes of each output tensor along the split
+// dimension. Must sum to the dimension of value along split_dim.
+// Can contain one -1 indicating that dimension is to be inferred.
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//
+//
+// Returns Tensors whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `size_splits[i]`.
+func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "SplitV",
+		Input: []tf.Input{
+			value, size_splits, axis,
+		},
+		Attrs: attrs,
 	}
-}
-
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_max"] = value
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("SplitV", err)
+		return
 	}
+	return output
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// Gives a guarantee to the TF runtime that the input tensor is a constant.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// The runtime is then free to make optimizations based on this.
+//
+// Only accepts value typed tensors as inputs and rejects resource variable handles
+// as input.
+//
+// Returns the input tensor without modification.
+func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "GuaranteeConst",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the diagonal part of the tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-//
-// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-//
-// For example:
-//
-// ```
-// # 'input' is [[1, 0, 0, 0]
-//               [0, 2, 0, 0]
-//               [0, 0, 3, 0]
-//               [0, 0, 0, 4]]
-//
-// tf.diag_part(input) ==> [1, 2, 3, 4]
-// ```
+// Returns a tensor of zeros with the same shape and type as x.
 //
 // Arguments:
-//	input: Rank k tensor where k is even and not zero.
+//	x: a tensor of type T.
 //
-// Returns The extracted diagonal.
-func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+// Returns a tensor of the same shape and type as x but filled with zeros.
+func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DiagPart",
+		Type: "ZerosLike",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -28289,51 +30610,102 @@ func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+// Returns the diagonal part of the tensor.
+//
+// This operation returns a tensor with the `diagonal` part
+// of the `input`. The `diagonal` part is computed as follows:
+//
+// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
+// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
+//
+// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
+//
+// For example:
+//
+// ```
+// # 'input' is [[1, 0, 0, 0]
+//               [0, 2, 0, 0]
+//               [0, 0, 3, 0]
+//               [0, 0, 0, 4]]
+//
+// tf.diag_part(input) ==> [1, 2, 3, 4]
+// ```
+//
+// Arguments:
+//	input: Rank k tensor where k is even and not zero.
+//
+// Returns The extracted diagonal.
+func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DiagPart",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMaximum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// The output is computed as follows:
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+//
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+//
+// Arguments:
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			inputs, min, max,
+			input, diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 9dee1aa72bf0d76ee35931f1e852bfd22556a540..ab7d698a45b7fc0cd498f8367fc1cecf07e4ba3c 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -311,9 +311,11 @@ tf_cc_test(
     srcs = [
         "src/gen/cc/source_writer_test.cc",
     ],
+    data = [
+        "src/gen/resources/test.java.snippet",
+    ],
     deps = [
         ":java_op_gen_lib",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -385,15 +387,3 @@ genrule(
     cmd = "cp $< $@",
     output_to_bindir = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index d35bb4111271c11839a160517dc9695ead5b46e9..08cc860f5795a4cf20f4ab2d09d2c2d37a52faf6 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.6.0-rc1</version>
+    <version>1.8.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index d9ba1bbbfb91170257f64a56f47c6c980e8a9570..fcc7eacc33b7bab366159425405b4bf5b0216cf1 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.6.0-rc1</version>
+    <version>1.8.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index f6f532c2c10d0a4dad9fc2d7750ea708652000b1..3d22d86a4970def52bf9a4a452a8131e1357341a 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.6.0-rc1</version>
+    <version>1.8.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 0a6b3d23d7d37515cf275e6a46842e32ada4fee1..0a09a5ea7cb96776b8296f68f599c333559a0729 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.6.0-rc1</version>
+  <version>1.8.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 1d8e8723731f959c8142f0648fc805593d7beac8..77ec6a0ddbab2749119a015094e47a9570e110e2 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.6.0-rc1</version>
+    <version>1.8.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow-android/pom-android.xml.template b/tensorflow/java/maven/tensorflow-android/pom-android.xml.template
index 5cbd0c898dc52ec5dfb72f0a2ac893d492a7d4be..37d2372d7b09f6f144e7abb145cb75bf98356615 100644
--- a/tensorflow/java/maven/tensorflow-android/pom-android.xml.template
+++ b/tensorflow/java/maven/tensorflow-android/pom-android.xml.template
@@ -20,10 +20,8 @@
 
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <project.build.number>${build_number}</project.build.number>
     <project.build.commitid>${build_commit_id}</project.build.commitid>
     <project.build.type>${build_type}</project.build.type>
-    <project.build.url>${build_url}</project.build.url>
   </properties>
 
 </project>
diff --git a/tensorflow/java/maven/tensorflow-android/update.py b/tensorflow/java/maven/tensorflow-android/update.py
index 4ae666e4e5351f1bdaf79d1b5cfdb63b0f811e2b..2206d800ca1fe82c5596ff39e56518bc5aea6211 100644
--- a/tensorflow/java/maven/tensorflow-android/update.py
+++ b/tensorflow/java/maven/tensorflow-android/update.py
@@ -45,6 +45,9 @@ def get_json(url):
 
 def get_commit_id(build_info):
   """Fetch the git commit id from the build info json object."""
+  release_commit_id = build_info.get('build_commit_id')
+  if release_commit_id:
+    return release_commit_id
   actions = build_info.get('actions')
   build_data = next(
       a for a in actions
@@ -95,20 +98,12 @@ def main():
     release_prefix = 'https://storage.googleapis.com/tensorflow/libtensorflow'
     info_url = '%s/android_buildinfo-%s.json' % (release_prefix, args.version)
     aar_url = '%s/tensorflow-%s.aar' % (release_prefix, args.version)
-    build_type = 'release-matrix-android2'
+    build_type = 'release-android'
 
   # Retrieve build information
   build_info = get_json(info_url)
 
   # Check all required build info is present
-  if build_info.get('result') != 'SUCCESS':
-    raise ValueError('Invalid json: %s' % build_info)
-  build_url = build_info.get('url')
-  if not build_url:
-    raise ValueError('Missing url: %s' % build_info)
-  build_number = build_info.get('number')
-  if not build_number:
-    raise ValueError('Missing build number: %s' % build_info)
   build_commit_id = get_commit_id(build_info)
   if not build_commit_id:
     raise ValueError('Missing commit id: %s' % build_info)
@@ -119,9 +114,7 @@ def main():
     f.write(
         template.substitute({
             'build_commit_id': build_commit_id,
-            'build_number': build_number,
             'build_type': build_type,
-            'build_url': build_url,
             'version': args.version
         }))
 
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 5c1b55085c5df1ec473a3f4e0bf750b236cfc264..0df1f2814906e548855522335f710e9702f8bb2a 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.6.0-rc1</version>
+    <version>1.8.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/cc/java_defs.h b/tensorflow/java/src/gen/cc/java_defs.h
index 615cdc165b36abdc3cf5e717ddb8b385367c067f..59f8beaee78a2f40f6743ca10f72435e757db090 100644
--- a/tensorflow/java/src/gen/cc/java_defs.h
+++ b/tensorflow/java/src/gen/cc/java_defs.h
@@ -17,10 +17,7 @@ limitations under the License.
 #define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
 
 #include <string>
-#include <vector>
-#include <deque>
-
-#include "tensorflow/core/platform/env.h"
+#include <list>
 
 namespace tensorflow {
 namespace java {
@@ -104,17 +101,17 @@ class Type {
     description_ = description;
     return *this;
   }
-  const std::vector<Type>& parameters() const { return parameters_; }
+  const std::list<Type>& parameters() const { return parameters_; }
   Type& add_parameter(const Type& parameter) {
     parameters_.push_back(parameter);
     return *this;
   }
-  const std::vector<Annotation>& annotations() const { return annotations_; }
+  const std::list<Annotation>& annotations() const { return annotations_; }
   Type& add_annotation(const Annotation& annotation) {
     annotations_.push_back(annotation);
     return *this;
   }
-  const std::deque<Type>& supertypes() const { return supertypes_; }
+  const std::list<Type>& supertypes() const { return supertypes_; }
   Type& add_supertype(const Type& type) {
     if (type.kind_ == CLASS) {
       supertypes_.push_front(type);  // keep superclass at the front of the list
@@ -141,9 +138,9 @@ class Type {
   string name_;
   string package_;
   string description_;
-  std::vector<Type> parameters_;
-  std::vector<Annotation> annotations_;
-  std::deque<Type> supertypes_;
+  std::list<Type> parameters_;
+  std::list<Annotation> annotations_;
+  std::list<Type> supertypes_;
 };
 
 // Definition of a Java annotation
@@ -223,16 +220,12 @@ class Method {
     return_description_ = description;
     return *this;
   }
-  const std::vector<Variable>& arguments() const { return arguments_; }
-  Method& add_arguments(const std::vector<Variable>& args) {
-    arguments_.insert(arguments_.cend(), args.cbegin(), args.cend());
-    return *this;
-  }
+  const std::list<Variable>& arguments() const { return arguments_; }
   Method& add_argument(const Variable& var) {
     arguments_.push_back(var);
     return *this;
   }
-  const std::vector<Annotation>& annotations() const { return annotations_; }
+  const std::list<Annotation>& annotations() const { return annotations_; }
   Method& add_annotation(const Annotation& annotation) {
     annotations_.push_back(annotation);
     return *this;
@@ -244,29 +237,13 @@ class Method {
   bool constructor_;
   string description_;
   string return_description_;
-  std::vector<Variable> arguments_;
-  std::vector<Annotation> annotations_;
+  std::list<Variable> arguments_;
+  std::list<Annotation> annotations_;
 
   Method(const string& name, const Type& return_type, bool constructor)
     : name_(name), return_type_(return_type), constructor_(constructor) {}
 };
 
-// A piece of code to read from a file.
-class Snippet {
- public:
-  static Snippet Create(const string& fname, Env* env = Env::Default()) {
-    return Snippet(fname, env);
-  }
-  const string& data() const { return data_; }
-
- private:
-  string data_;
-
-  Snippet(const string& fname, Env* env) {
-    TF_CHECK_OK(ReadFileToString(env, fname, &data_));
-  }
-};
-
 }  // namespace java
 }  // namespace tensorflow
 
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index 2da81f2911e60be6a47ac13fe8be6142fa283780..a02f75ad6e7f5f1a9f22ad976e488ae5bf02a731 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -14,49 +14,328 @@ limitations under the License.
 ==============================================================================*/
 
 #include <string>
+#include <algorithm>
+#include <deque>
 
 #include "tensorflow/java/src/gen/cc/source_writer.h"
 
 namespace tensorflow {
+namespace java {
 
-SourceWriter& SourceWriter::Append(const StringPiece& str) {
-  if (!str.empty()) {
-    if (newline_) {
-      DoAppend(left_margin_ + line_prefix_);
-      newline_ = false;
-    }
-    DoAppend(str);
+SourceWriter::SourceWriter() {
+  // Push an empty generic namespace at start, for simplification.
+  generic_namespaces_.push(new GenericNamespace());
+}
+
+SourceWriter::~SourceWriter() {
+  // Remove empty generic namespace added at start as well as any other
+  // namespace objects that haven't been removed.
+  while (!generic_namespaces_.empty()) {
+    GenericNamespace* generic_namespace = generic_namespaces_.top();
+    generic_namespaces_.pop();
+    delete generic_namespace;
   }
+}
+
+SourceWriter& SourceWriter::Indent(int tab) {
+  left_margin_.resize(
+      std::max(static_cast<int>(left_margin_.size() + tab), 0), ' ');
+  return *this;
+}
+
+SourceWriter& SourceWriter::Prefix(const char* line_prefix) {
+  line_prefix_ = line_prefix;
   return *this;
 }
 
-SourceWriter& SourceWriter::Write(const string& str) {
+SourceWriter& SourceWriter::Write(const StringPiece& str) {
   size_t line_pos = 0;
   do {
     size_t start_pos = line_pos;
     line_pos = str.find('\n', start_pos);
     if (line_pos != string::npos) {
       ++line_pos;
-      Append(StringPiece(str.data() + start_pos, line_pos - start_pos));
+      Append(str.substr(start_pos, line_pos - start_pos));
       newline_ = true;
     } else {
-      Append(StringPiece(str.data() + start_pos, str.size() - start_pos));
+      Append(str.substr(start_pos, str.size() - start_pos));
     }
   } while (line_pos != string::npos && line_pos < str.size());
 
   return *this;
 }
 
+SourceWriter& SourceWriter::WriteFromFile(const string& fname, Env* env) {
+  string data_;
+  TF_CHECK_OK(ReadFileToString(env, fname, &data_));
+  return Write(data_);
+}
+
+SourceWriter& SourceWriter::Append(const StringPiece& str) {
+  if (!str.empty()) {
+    if (newline_) {
+      DoAppend(left_margin_ + line_prefix_);
+      newline_ = false;
+    }
+    DoAppend(str);
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::AppendType(const Type& type) {
+  if (type.kind() == Type::Kind::GENERIC && type.name().empty()) {
+    Append("?");
+  } else {
+    Append(type.name());
+  }
+  if (!type.parameters().empty()) {
+    Append("<");
+    for (const Type& t : type.parameters()) {
+      if (&t != &type.parameters().front()) {
+        Append(", ");
+      }
+      AppendType(t);
+    }
+    Append(">");
+  }
+  return *this;
+}
+
 SourceWriter& SourceWriter::EndLine() {
   Append("\n");
   newline_ = true;
   return *this;
 }
 
-SourceWriter& SourceWriter::Indent(int tab) {
-  left_margin_.resize(std::max(static_cast<int>(left_margin_.size() + tab), 0),
-                      ' ');
+SourceWriter& SourceWriter::BeginMethod(const Method& method, int modifiers) {
+  GenericNamespace* generic_namespace = PushGenericNamespace(modifiers);
+  if (!method.constructor()) {
+    generic_namespace->Visit(method.return_type());
+  }
+  for (const Variable& v : method.arguments()) {
+    generic_namespace->Visit(v.type());
+  }
+  EndLine();
+  WriteDoc(method.description(), method.return_description(),
+      &method.arguments());
+  if (!method.annotations().empty()) {
+    WriteAnnotations(method.annotations());
+  }
+  WriteModifiers(modifiers);
+  if (!generic_namespace->declared_types().empty()) {
+    WriteGenerics(generic_namespace->declared_types());
+    Append(" ");
+  }
+  if (!method.constructor()) {
+    AppendType(method.return_type()).Append(" ");
+  }
+  Append(method.name()).Append("(");
+  for (const Variable& v : method.arguments()) {
+    if (&v != &method.arguments().front()) {
+      Append(", ");
+    }
+    AppendType(v.type()).Append(v.variadic() ? "... " : " ").Append(v.name());
+  }
+  return Append(")").BeginBlock();
+}
+
+SourceWriter& SourceWriter::EndMethod() {
+  EndBlock();
+  PopGenericNamespace();
+  return *this;
+}
+
+SourceWriter& SourceWriter::BeginType(const Type& type,
+    const std::list<Type>* dependencies, int modifiers) {
+  if (!type.package().empty()) {
+    Append("package ").Append(type.package()).Append(";").EndLine();
+  }
+  if (dependencies != nullptr && !dependencies->empty()) {
+    TypeImporter type_importer(type.package());
+    for (const Type& t : *dependencies) {
+      type_importer.Visit(t);
+    }
+    EndLine();
+    for (const string& s : type_importer.imports()) {
+      Append("import ").Append(s).Append(";").EndLine();
+    }
+  }
+  return BeginInnerType(type, modifiers);
+}
+
+SourceWriter& SourceWriter::BeginInnerType(const Type& type, int modifiers) {
+  GenericNamespace* generic_namespace = PushGenericNamespace(modifiers);
+  generic_namespace->Visit(type);
+  EndLine();
+  WriteDoc(type.description());
+  if (!type.annotations().empty()) {
+    WriteAnnotations(type.annotations());
+  }
+  WriteModifiers(modifiers);
+  CHECK_EQ(Type::Kind::CLASS, type.kind()) << ": Not supported yet";
+  Append("class ").Append(type.name());
+  if (!generic_namespace->declared_types().empty()) {
+    WriteGenerics(generic_namespace->declared_types());
+  }
+  if (!type.supertypes().empty()) {
+    bool first_interface = true;
+    for (const Type& t : type.supertypes()) {
+      if (t.kind() == Type::CLASS) {  // superclass is always first in list
+        Append(" extends ");
+      } else if (first_interface) {
+        Append(" implements ");
+        first_interface = false;
+      } else {
+        Append(", ");
+      }
+      AppendType(t);
+    }
+  }
+  return BeginBlock();
+}
+
+SourceWriter& SourceWriter::EndType() {
+  EndBlock();
+  PopGenericNamespace();
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteFields(const std::list<Variable>& fields,
+    int modifiers) {
+  EndLine();
+  for (const Variable& v : fields) {
+    WriteModifiers(modifiers);
+    AppendType(v.type()).Append(" ").Append(v.name()).Append(";");
+    EndLine();
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteModifiers(int modifiers) {
+  if (modifiers & PUBLIC) {
+    Append("public ");
+  } else if (modifiers & PROTECTED) {
+    Append("protected ");
+  } else if (modifiers & PRIVATE) {
+    Append("private ");
+  }
+  if (modifiers & STATIC) {
+    Append("static ");
+  }
+  if (modifiers & FINAL) {
+    Append("final ");
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteDoc(const string& description,
+    const string& return_description, const std::list<Variable>* parameters) {
+  if (description.empty() && return_description.empty()
+      && (parameters == nullptr || parameters->empty())) {
+    return *this;  // no doc to write
+  }
+  bool do_line_break = false;
+  Append("/**").EndLine().Prefix(" * ");
+  if (!description.empty()) {
+    Write(description).EndLine();
+    do_line_break = true;
+  }
+  if (parameters != nullptr && !parameters->empty()) {
+    if (do_line_break) {
+      EndLine();
+      do_line_break = false;
+    }
+    for (const Variable& v : *parameters) {
+      Append("@param ").Append(v.name());
+      if (!v.description().empty()) {
+        Append(" ").Write(v.description());
+      }
+      EndLine();
+    }
+  }
+  if (!return_description.empty()) {
+    if (do_line_break) {
+      EndLine();
+      do_line_break = false;
+    }
+    Append("@return ").Write(return_description).EndLine();
+  }
+  return Prefix("").Append(" **/").EndLine();
+}
+
+SourceWriter& SourceWriter::WriteAnnotations(
+    const std::list<Annotation>& annotations) {
+  for (const Annotation& a : annotations) {
+    Append("@" + a.name());
+    if (!a.attributes().empty()) {
+      Append("(").Append(a.attributes()).Append(")");
+    }
+    EndLine();
+  }
   return *this;
 }
 
+SourceWriter& SourceWriter::WriteGenerics(
+    const std::list<const Type*>& generics) {
+  Append("<");
+  for (const Type* pt : generics) {
+    if (pt != generics.front()) {
+      Append(", ");
+    }
+    Append(pt->name());
+    if (!pt->supertypes().empty()) {
+      Append(" extends ").AppendType(pt->supertypes().front());
+    }
+  }
+  return Append(">");
+}
+
+SourceWriter::GenericNamespace* SourceWriter::PushGenericNamespace(
+    int modifiers) {
+  GenericNamespace* generic_namespace;
+  if (modifiers & STATIC) {
+    generic_namespace = new GenericNamespace();
+  } else {
+    generic_namespace = new GenericNamespace(generic_namespaces_.top());
+  }
+  generic_namespaces_.push(generic_namespace);
+  return generic_namespace;
+}
+
+void SourceWriter::PopGenericNamespace() {
+  GenericNamespace* generic_namespace = generic_namespaces_.top();
+  generic_namespaces_.pop();
+  delete generic_namespace;
+}
+
+void SourceWriter::TypeVisitor::Visit(const Type& type) {
+  DoVisit(type);
+  for (const Type& t : type.parameters()) {
+    DoVisit(t);
+  }
+  for (const Annotation& t : type.annotations()) {
+    DoVisit(t);
+  }
+  for (const Type& t : type.supertypes()) {
+    DoVisit(t);
+  }
+}
+
+void SourceWriter::GenericNamespace::DoVisit(const Type& type) {
+  // ignore non-generic parameters, wildcards and generics already declared
+  if (type.kind() == Type::GENERIC
+      && !type.IsWildcard()
+      && generic_names_.find(type.name()) == generic_names_.end()) {
+    declared_types_.push_back(&type);
+    generic_names_.insert(type.name());
+  }
+}
+
+void SourceWriter::TypeImporter::DoVisit(const Type& type) {
+  if (!type.package().empty() && type.package() != current_package_) {
+    imports_.insert(type.package() + '.' + type.name());
+  }
+}
+
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index bff26eb185db0cf933632f33f916b87d8a757edd..f011acd30aae3963c7f6ac995838c401f54fde54 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -17,44 +17,23 @@ limitations under the License.
 #define TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
 
 #include <string>
+#include <stack>
+#include <list>
+#include <set>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/java/src/gen/cc/java_defs.h"
 
 namespace tensorflow {
+namespace java {
 
-// A utility class for writing source code, normally generated at
-// compile-time.
-//
-// Source writers are language-agnostic and therefore only expose generic
-// methods common to most languages. Extend or wrap this class to implement
-// language-specific features.
-//
-// Note: if you are looking to reuse this class for generating code in another
-// language than Java, please do by moving it at the '//tensorflow/core/lib/io'
-// level.
+// A class for writing Java source code.
 class SourceWriter {
  public:
-  virtual ~SourceWriter() = default;
-
-  // Returns true if the writer is at the beginnig of a new line
-  bool newline() const { return newline_; }
-
-  // Appends a piece of code or text.
-  //
-  // It is expected that no newline character is present in the data provided,
-  // otherwise Write() must be used.
-  SourceWriter& Append(const StringPiece& str);
+  SourceWriter();
 
-  // Writes a block of code or text.
-  //
-  // The data might potentially contain newline characters, therefore it will
-  // be scanned to ensure that each line is indented and prefixed properly,
-  // making it a bit slower than Append().
-  SourceWriter& Write(const string& text);
-
-  // Appends a newline character and start writing on a new line.
-  SourceWriter& EndLine();
+  virtual ~SourceWriter();
 
   // Indents following lines with white spaces.
   //
@@ -75,18 +54,166 @@ class SourceWriter {
   // Indent(2)->Prefix("//") will result in prefixing lines with "  //".
   //
   // An empty value ("") will remove any line prefix that was previously set.
-  SourceWriter& Prefix(const char* line_prefix) {
-    line_prefix_ = line_prefix;
-    return *this;
+  SourceWriter& Prefix(const char* line_prefix);
+
+  // Writes a source code snippet.
+  //
+  // The data might potentially contain newline characters, therefore it will
+  // be scanned to ensure that each line is indented and prefixed properly,
+  // making it a bit slower than Append().
+  SourceWriter& Write(const StringPiece& str);
+
+  // Writes a source code snippet read from a file.
+  //
+  // All lines of the file at the provided path will be read and written back
+  // to the output of this writer in regard of its current attributes (e.g.
+  // the indentation, prefix, etc.)
+  SourceWriter& WriteFromFile(const string& fname, Env* env = Env::Default());
+
+  // Appends a piece of source code.
+  //
+  // It is expected that no newline character is present in the data provided,
+  // otherwise Write() must be used.
+  SourceWriter& Append(const StringPiece& str);
+
+  // Appends a type to the current line.
+  //
+  // The type is written in its simple form (i.e. not prefixed by its package)
+  // and followed by any parameter types it has enclosed in brackets (<>).
+  SourceWriter& AppendType(const Type& type);
+
+  // Appends a newline character.
+  //
+  // Data written after calling this method will start on a new line, in respect
+  // of the current indentation.
+  SourceWriter& EndLine();
+
+  // Begins a block of source code.
+  //
+  // This method appends a new opening brace to the current data and indent the
+  // next lines according to Google Java Style Guide. The block can optionally
+  // be preceded by an expression (e.g. Append("if(true)").BeginBlock();)
+  SourceWriter& BeginBlock() {
+    return Append(newline_ ? "{" : " {").EndLine().Indent(2);
+  }
+
+  // Ends the current block of source code.
+  //
+  // This method appends a new closing brace to the current data and outdent the
+  // next lines back to the margin used before BeginBlock() was invoked.
+  SourceWriter& EndBlock() {
+    return Indent(-2).Append("}").EndLine();
   }
 
+  // Begins to write a method.
+  //
+  // This method outputs the signature of the Java method from the data passed
+  // in the 'method' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // method.
+  SourceWriter& BeginMethod(const Method& method, int modifiers = 0);
+
+  // Ends the current method.
+  //
+  // This method ends the block of code that has begun when invoking
+  // BeginMethod() prior to this.
+  SourceWriter& EndMethod();
+
+  // Begins to write the main type of a source file.
+  //
+  // This method outputs the declaration of the Java type from the data passed
+  // in the 'type' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // type.
+  //
+  // If not null, all types found in the 'dependencies' list will be imported
+  // before declaring the new type.
+  SourceWriter& BeginType(const Type& clazz,
+      const std::list<Type>* dependencies, int modifiers = 0);
+
+  // Begins to write a new inner type.
+  //
+  // This method outputs the declaration of the Java type from the data passed
+  // in the 'type' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // type.
+  SourceWriter& BeginInnerType(const Type& type, int modifiers = 0);
+
+  // Ends the current type.
+  //
+  // This method ends the block of code that has begun when invoking
+  // BeginType() or BeginInnerType() prior to this.
+  SourceWriter& EndType();
+
+  // Writes a list of variables as fields of a type.
+  //
+  // This method must be called within the definition of a type (see BeginType()
+  // or BeginInnerType()). Additional modifiers can also be passed in parameter
+  // to define the accesses and the scope of those fields.
+  SourceWriter& WriteFields(const std::list<Variable>& fields,
+      int modifiers = 0);
+
  protected:
   virtual void DoAppend(const StringPiece& str) = 0;
 
  private:
+  // A utility base class for visiting elements of a type.
+  class TypeVisitor {
+   public:
+    virtual ~TypeVisitor() = default;
+    void Visit(const Type& type);
+
+   protected:
+    virtual void DoVisit(const Type& type) = 0;
+  };
+
+  // A utility class for keeping track of declared generics in a given scope.
+  class GenericNamespace : public TypeVisitor {
+   public:
+    GenericNamespace() = default;
+    explicit GenericNamespace(const GenericNamespace* parent)
+      : generic_names_(parent->generic_names_) {}
+    std::list<const Type*> declared_types() {
+      return declared_types_;
+    }
+   protected:
+    virtual void DoVisit(const Type& type);
+
+   private:
+    std::list<const Type*> declared_types_;
+    std::set<string> generic_names_;
+  };
+
+  // A utility class for collecting a list of import statements to declare.
+  class TypeImporter : public TypeVisitor {
+   public:
+    explicit TypeImporter(const string& current_package)
+      : current_package_(current_package) {}
+    virtual ~TypeImporter() = default;
+    const std::set<string> imports() {
+      return imports_;
+    }
+   protected:
+    virtual void DoVisit(const Type& type);
+
+   private:
+    string current_package_;
+    std::set<string> imports_;
+  };
+
   string left_margin_;
   string line_prefix_;
   bool newline_ = true;
+  std::stack<GenericNamespace*> generic_namespaces_;
+
+  SourceWriter& WriteModifiers(int modifiers);
+  SourceWriter& WriteDoc(const string& description,
+    const string& return_description = "",
+    const std::list<Variable>* parameters = nullptr);
+  SourceWriter& WriteAnnotations(const std::list<Annotation>& annotations);
+  SourceWriter& WriteGenerics(const std::list<const Type*>& generics);
+  GenericNamespace* PushGenericNamespace(int modifiers);
+  void PopGenericNamespace();
 };
 
 // A writer that outputs source code into a file.
@@ -128,6 +255,7 @@ class SourceBufferWriter : public SourceWriter {
   string* buffer_;
 };
 
+}  // namespace java
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index e9738957548184726395c4e6634ba12a5a9a0109..4bce2fea7040a0e5cb9256dc2672399c3af8a03d 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/java/src/gen/cc/source_writer.h"
+#include <list>
+
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/java/src/gen/cc/java_defs.h"
+#include "tensorflow/java/src/gen/cc/source_writer.h"
 
 namespace tensorflow {
+namespace java {
 namespace {
 
 TEST(AppendTest, SingleLineText) {
@@ -211,5 +215,368 @@ TEST(MarginTest, EmptyPrefix) {
   ASSERT_STREQ(expected, writer.str().data());
 }
 
+TEST(StreamTest, BlocksAndLines) {
+  SourceBufferWriter writer;
+
+  writer.Append("int i = 0;").EndLine()
+        .Append("int j = 10;").EndLine()
+        .Append("if (true)")
+        .BeginBlock()
+          .Append("int aLongWayToTen = 0;").EndLine()
+          .Append("while (++i <= j)")
+          .BeginBlock()
+            .Append("++aLongWayToTen;").EndLine()
+          .EndBlock()
+        .EndBlock();
+
+  const char* expected =
+      "int i = 0;\n"
+      "int j = 10;\n"
+      "if (true) {\n"
+      "  int aLongWayToTen = 0;\n"
+      "  while (++i <= j) {\n"
+      "    ++aLongWayToTen;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(StreamTest, Types) {
+  SourceBufferWriter writer;
+  Type generic = Type::Generic("T").add_supertype(Type::Class("Number"));
+
+  writer.AppendType(Type::Int()).Append(", ")
+        .AppendType(Type::Class("String")).Append(", ")
+        .AppendType(generic).Append(", ")
+        .AppendType(Type::ListOf(generic)).Append(", ")
+        .AppendType(Type::ListOf(Type::IterableOf(generic))).Append(", ")
+        .AppendType(Type::ListOf(Type::Generic()));
+
+  const char* expected =
+      "int, String, T, List<T>, List<Iterable<T>>, List<?>";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(StreamTest, FileSnippet) {
+  SourceBufferWriter writer;
+  const string fname = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(),
+      "java/src/gen/resources/test.java.snippet");
+
+  writer.WriteFromFile(fname)
+        .BeginBlock()
+        .WriteFromFile(fname)
+        .EndBlock();
+
+  const char* expected =
+      "// Here is a little snippet\n"
+      "System.out.println(\"Hello!\");\n"
+      "{\n"
+      "  // Here is a little snippet\n"
+      "  System.out.println(\"Hello!\");\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleClassWithDependencies) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  std::list<Type> deps;
+  deps.push_back(Type::Class("TypeA", "org.test.sub"));
+  deps.push_back(Type::Class("TypeA", "org.test.sub"));  // a second time
+  deps.push_back(Type::Class("TypeB", "org.other"));
+  deps.push_back(Type::Class("SamePackageType", "org.tensorflow"));
+  deps.push_back(Type::Class("NoPackageType"));
+
+  writer.BeginType(clazz, &deps, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "import org.other.TypeB;\n"
+      "import org.test.sub.TypeA;\n\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, AnnotatedAndDocumentedClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  clazz.description("This class has a\n<p>\nmultiline description.");
+  clazz.add_annotation(Annotation::Create("Bean"));
+  clazz.add_annotation(Annotation::Create("SuppressWarnings")
+      .attributes("\"rawtypes\""));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "/**\n"
+      " * This class has a\n"
+      " * <p>\n"
+      " * multiline description.\n"
+      " **/\n"
+      "@Bean\n"
+      "@SuppressWarnings(\"rawtypes\")\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  clazz.add_parameter(Type::Generic("T"));
+  clazz.add_parameter(Type::Generic("U").add_supertype(Type::Class("Number")));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T, U extends Number> {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClassAndSupertypes) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T");
+  clazz.add_parameter(type_t);
+  Type type_u = Type::Generic("U").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_u);
+  clazz.add_supertype(Type::Interface("Parametrizable").add_parameter(type_u));
+  clazz.add_supertype(Type::Interface("Runnable"));
+  clazz.add_supertype(Type::Class("SuperTest").add_parameter(type_t));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T, U extends Number>"
+      " extends SuperTest<T> implements Parametrizable<U>, Runnable {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClassFields) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  std::list<Variable> static_fields;
+  static_fields.push_back(Variable::Create("field1", Type::Class("String")));
+  std::list<Variable> member_fields;
+  member_fields.push_back(Variable::Create("field2", Type::Class("String")));
+  member_fields.push_back(Variable::Create("field3", type_t));
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .WriteFields(static_fields, STATIC | PUBLIC | FINAL)
+          .WriteFields(member_fields, PRIVATE)
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static final String field1;\n"
+      "  \n"
+      "  private String field2;\n"
+      "  private T field3;\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleInnerClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type inner_class = Type::Class("InnerTest");
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginInnerType(inner_class, PUBLIC)
+          .EndType()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  public class InnerTest {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, StaticParameterizedInnerClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Type inner_class = Type::Class("InnerTest");
+  inner_class.add_parameter(type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginInnerType(inner_class, PUBLIC | STATIC)
+          .EndType()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static class InnerTest<T extends Number> {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, SimpleMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("doNothing", Type::Void());
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC).EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  public void doNothing() {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, AnnotatedAndDocumentedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("doNothing", Type::Void());
+  method.description("This method has a\n<p>\nmultiline description.");
+  method.add_annotation(Annotation::Create("Override"));
+  method.add_annotation(Annotation::Create("SuppressWarnings")
+      .attributes("\"rawtypes\""));
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC).EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  /**\n"
+      "   * This method has a\n"
+      "   * <p>\n"
+      "   * multiline description.\n"
+      "   **/\n"
+      "  @Override\n"
+      "  @SuppressWarnings(\"rawtypes\")\n"
+      "  public void doNothing() {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, DocumentedMethodWithArguments) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("boolToInt", Type::Int());
+  method.description("Converts a boolean to an int");
+  method.return_description("int value for this boolean");
+  method.add_argument(Variable::Create("b", Type::Boolean()));
+  Variable reverse = Variable::Create("reverse", Type::Boolean());
+  reverse.description("if true, value is reversed");
+  method.add_argument(reverse);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC)
+            .Append("if (b && !reverse)")
+            .BeginBlock()
+              .Append("return 1;").EndLine()
+            .EndBlock()
+          .Append("return 0;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  /**\n"
+      "   * Converts a boolean to an int\n"
+      "   * \n"
+      "   * @param b\n"
+      "   * @param reverse if true, value is reversed\n"
+      "   * @return int value for this boolean\n"
+      "   **/\n"
+      "  public int boolToInt(boolean b, boolean reverse) {\n"
+      "    if (b && !reverse) {\n"
+      "      return 1;\n"
+      "    }\n"
+      "    return 0;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, ParameterizedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Method method = Method::Create("doNothing", type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC)
+            .Append("return null;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public T doNothing() {\n"
+      "    return null;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, StaticParameterizedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Method method = Method::Create("doNothing", type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC | STATIC)
+            .Append("return null;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static <T extends Number> T doNothing() {\n"
+      "    return null;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
 }  // namespace
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/resources/test.java.snippet b/tensorflow/java/src/gen/resources/test.java.snippet
new file mode 100644
index 0000000000000000000000000000000000000000..5e412a9aef436bb73a4d013d1b698b75ad9fbab4
--- /dev/null
+++ b/tensorflow/java/src/gen/resources/test.java.snippet
@@ -0,0 +1,2 @@
+// Here is a little snippet
+System.out.println("Hello!");
diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
index 489e95c3102557d7a75d83789c46106aa5aa3ed4..3948991c84d35009217f7c05844551fdcc49fb22 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
@@ -101,6 +101,7 @@ public class LabelImage {
                   b.constant("mean", mean)),
               b.constant("scale", scale));
       try (Session s = new Session(g)) {
+        // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
         return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class);
       }
     }
@@ -110,6 +111,7 @@ public class LabelImage {
     try (Graph g = new Graph()) {
       g.importGraphDef(graphDef);
       try (Session s = new Session(g);
+          // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
           Tensor<Float> result =
               s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) {
         final long[] rshape = result.shape();
diff --git a/tensorflow/java/src/main/native/tensor_jni.cc b/tensorflow/java/src/main/native/tensor_jni.cc
index 745abec244d1528e918464473e5d3fb19ad5082c..7e3cf4a88aac5acd4721a07c8316d8d124dce001 100644
--- a/tensorflow/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/java/src/main/native/tensor_jni.cc
@@ -400,7 +400,13 @@ size_t nonScalarTF_STRINGTensorSize(JNIEnv* env, jarray value, int num_dims) {
   for (jsize i = 0; i < len; ++i) {
     jarray elem = static_cast<jarray>(
         env->GetObjectArrayElement(static_cast<jobjectArray>(value), i));
+    if (elem == nullptr) {
+      throwException(env, kNullPointerException,
+                     "null entries in provided array");
+      return ret;
+    }
     ret += nonScalarTF_STRINGTensorSize(env, elem, num_dims - 1);
+    if (env->ExceptionCheck()) return ret;
   }
   return ret;
 }
@@ -421,8 +427,8 @@ void fillNonScalarTF_STRINGTensorData(JNIEnv* env, jarray value, int num_dims,
   for (jsize i = 0; i < len; ++i) {
     jarray elem = static_cast<jarray>(
         env->GetObjectArrayElement(static_cast<jobjectArray>(value), i));
-    if (TF_GetCode(status) != TF_OK) return;
     fillNonScalarTF_STRINGTensorData(env, elem, num_dims - 1, writer, status);
+    if (TF_GetCode(status) != TF_OK) return;
   }
 }
 }  // namespace
@@ -444,6 +450,7 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_allocateNonScalarBytes(
   }
   const size_t encoded_size =
       nonScalarTF_STRINGTensorSize(env, value, num_dims);
+  if (env->ExceptionCheck()) return 0;
   TF_Tensor* t = TF_AllocateTensor(TF_STRING, dims, num_dims,
                                    8 * num_elements + encoded_size);
   if (t == nullptr) {
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index 6538359d11a95eae698cc5aac8430e74ab1ed74c..1bd00a763ddff2f067183f57cfa80fdcbed84fd2 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -432,7 +432,7 @@ public class TensorTest {
     try (Tensor<Integer> t = Tensor.create(vector, Integer.class)) {
       fail("Tensor.create() should fail because it was given an array of boxed values");
     } catch (IllegalArgumentException e) {
-        // The expected exception
+      // The expected exception
     }
   }
 
@@ -536,4 +536,15 @@ public class TensorTest {
       assertArrayEquals(matrix, cpy.copyTo(new float[2][3]));
     }
   }
+
+  @Test
+  public void gracefullyFailCreationFromNullArrayForStringTensor() {
+    // Motivated by: https://github.com/tensorflow/tensorflow/issues/17130
+    byte[][] array = new byte[1][];
+    try {
+      Tensors.create(array);
+    } catch (NullPointerException e) {
+      // expected.
+    }
+  }
 }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index cee7c47e00d5673cd2abf2b1e526523ad61bbafd..44d9147bb63598cafa7977cf47720ed338169147 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4,16 +4,14 @@
 # Public targets:
 #  ":platform" - Low-level and platform-specific Python code.
 
-package(
-    default_visibility = [
-        "//engedu/ml/tf_from_scratch:__pkg__",
-        "//tensorflow:internal",
-        "//tensorflow/contrib/lite/toco/python:__pkg__",
-        "//tensorflow_models:__subpackages__",
-        # TODO(aselle): to pass open source test.
-        "//bazel_pip/tensorflow/contrib/lite/toco/python:__pkg__",
-    ],
-)
+package(default_visibility = [
+    "//engedu/ml/tf_from_scratch:__pkg__",
+    "//tensorflow:internal",
+    "//tensorflow/contrib/lite/toco/python:__pkg__",
+    "//tensorflow_models:__subpackages__",
+    # TODO(aselle): to pass open source test.
+    "//bazel_pip/tensorflow/contrib/lite/toco/python:__pkg__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -28,6 +26,8 @@ load("//tensorflow:tensorflow.bzl", "py_tests")
 load("//tensorflow:tensorflow.bzl", "tf_py_build_info_genrule")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
 load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
@@ -58,14 +58,30 @@ py_library(
         "//tensorflow/tools/api/generator:__pkg__",
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
+    deps = [
+        ":no_contrib",
+        "//tensorflow/contrib:contrib_py",
+    ],
+)
+
+py_library(
+    name = "no_contrib",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow:__pkg__",
+        "//tensorflow/python/tools:__pkg__",
+    ],
     deps = [
         ":array_ops",
         ":bitwise_ops",
+        ":boosted_trees_ops",
         ":check_ops",
         ":client",
         ":client_testlib",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":cudnn_rnn_ops_gen",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -86,39 +102,38 @@ py_library(
         ":ops",
         ":platform",
         ":pywrap_tensorflow",
+        ":saver_test_utils",
         ":script_ops",
         ":session_ops",
         ":sets",
         ":sparse_ops",
         ":spectral_ops",
+        ":spectral_ops_test_util",
         ":standard_ops",
         ":state_ops",
         ":string_ops",
+        ":subscribe",
         ":summary",
         ":tensor_array_ops",
-        ":training",
-        ":saver_test_utils",
-        ":subscribe",
         ":test_ops",  # TODO: Break testing code out into separate rule.
-        ":tf_item",
         ":tf_cluster",
+        ":tf_item",
         ":tf_optimizer",
+        ":training",
         ":util",
         ":weights_broadcast_ops",
-        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras",
-        "//tensorflow/python/ops/losses",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/profiler",
         "//tensorflow/python/saved_model",
-    ] + if_not_windows([
-        "//tensorflow/contrib:contrib_py",
-    ]),
+        "//third_party/py/numpy",
+    ],
 )
 
 tf_py_build_info_genrule()
@@ -270,6 +285,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_exception_registry",
+    srcs = ["lib/core/py_exception_registry.cc"],
+    hdrs = ["lib/core/py_exception_registry.h"],
+    deps = [
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "kernel_registry",
     srcs = ["util/kernel_registry.cc"],
@@ -286,6 +312,7 @@ cc_library(
     srcs = ["util/util.cc"],
     hdrs = ["util/util.h"],
     deps = [
+        ":safe_ptr",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//util/python:python_headers",
@@ -399,6 +426,7 @@ tf_cc_shared_object(
             "-lm",
         ],
         "//tensorflow:darwin": [],
+        "//tensorflow:windows": [],
     }),
     deps = [
         "//tensorflow/core:framework_headers_lib",
@@ -765,6 +793,31 @@ py_library(
     ],
 )
 
+py_library(
+    name = "smart_cond",
+    srcs = ["framework/smart_cond.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":tensor_util",
+    ],
+)
+
+py_test(
+    name = "smart_cond_test",
+    size = "small",
+    srcs = ["framework/smart_cond_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":constant_op",
+        ":framework_ops",
+        ":math_ops",
+        ":session",
+        ":smart_cond",
+    ],
+)
+
 py_library(
     name = "sparse_tensor",
     srcs = ["framework/sparse_tensor.py"],
@@ -781,6 +834,7 @@ py_library(
     srcs = ["framework/tensor_shape.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dtypes",
         ":util",
         "//tensorflow/core:protos_all_py",
     ],
@@ -921,7 +975,6 @@ py_test(
     srcs = ["framework/contrib_test.py"],
     main = "framework/contrib_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -1007,6 +1060,11 @@ cuda_py_tests(
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
     ],
+    shard_count = 10,
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
 
 py_test(
@@ -1023,7 +1081,7 @@ py_test(
 
 py_test(
     name = "framework_importer_test",
-    size = "medium",
+    size = "large",
     srcs = ["framework/importer_test.py"],
     main = "framework/importer_test.py",
     srcs_version = "PY2AND3",
@@ -1286,7 +1344,6 @@ py_test(
     srcs = ["framework/dtypes_test.py"],
     main = "framework/dtypes_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1331,6 +1388,20 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "boosted_trees_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "summary_ops_gen",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = ["//tensorflow/core:summary_ops_op_lib"],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "audio_ops_gen",
     require_shape_functions = True,
@@ -1340,6 +1411,13 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "cudnn_rnn_ops_gen",
+    visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "candidate_sampling_ops_gen",
     visibility = ["//learning/brain/python/ops:__pkg__"],
@@ -1538,7 +1616,10 @@ py_library(
 
 py_library(
     name = "array_ops",
-    srcs = ["ops/array_ops.py"],
+    srcs = [
+        "ops/array_ops.py",
+        "ops/inplace_ops.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops_gen",
@@ -1567,6 +1648,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "boosted_trees_ops",
+    srcs = ["ops/boosted_trees_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees_ops_gen",
+        ":framework",
+        ":ops",
+        ":training",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -1628,7 +1722,6 @@ py_test(
     size = "small",
     srcs = ["ops/clip_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":clip_ops",
@@ -1700,6 +1793,16 @@ py_library(
     ],
 )
 
+py_library(
+    name = "cudnn_rnn_grad",
+    srcs = ["ops/cudnn_rnn_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_for_generated_wrappers",
+        "//tensorflow/python:cudnn_rnn_ops_gen",
+    ],
+)
+
 py_library(
     name = "data_flow_grad",
     srcs = ["ops/data_flow_grad.py"],
@@ -1750,6 +1853,7 @@ py_library(
 py_library(
     name = "gradients",
     srcs = [
+        "ops/custom_gradient.py",
         "ops/gradients.py",
         "ops/gradients_impl.py",
     ],
@@ -1763,6 +1867,7 @@ py_library(
         ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
+        ":framework_ops",
         ":functional_ops",
         ":image_grad",
         ":linalg_grad",
@@ -1773,8 +1878,13 @@ py_library(
         ":math_grad",
         ":math_ops",
         ":platform",
+        ":resource_variable_ops",
         ":spectral_grad",
         ":util",
+        ":variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1817,13 +1927,16 @@ py_library(
         ":control_flow_ops",
         ":framework",
         ":framework_for_generated_wrappers",
+        ":gradients",
         ":image_ops_gen",
         ":math_ops",
+        ":nn",
         ":nn_ops_gen",
         ":random_ops",
         ":string_ops",
         ":util",
         ":variables",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -1835,7 +1948,8 @@ py_library(
         ":array_ops",
         ":constant_op",
         ":dtypes",
-        ":linalg_ops",
+        ":linalg_ops_gen",
+        ":linalg_ops_impl",
         ":math_ops",
         ":nn_ops",
         ":random_ops",
@@ -1886,7 +2000,22 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
+        ":functional_ops",
         ":linalg_ops_gen",
+        ":linalg_ops_impl",
+        ":math_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "linalg_ops_impl",
+    srcs = ["ops/linalg_ops_impl.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":dtypes",
+        ":framework_ops",
         ":math_ops",
         "//third_party/py/numpy",
     ],
@@ -2172,7 +2301,6 @@ py_library(
         ":clip_ops",
         ":framework_for_generated_wrappers",
         ":init_ops",
-        ":layers_base",
         ":math_ops",
         ":nn_ops",
         ":partitioned_variables",
@@ -2366,6 +2494,7 @@ py_library(
         ":clip_ops",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":cudnn_rnn_grad",
         ":data_flow_grad",
         ":data_flow_ops",
         ":framework_for_generated_wrappers",
@@ -2449,6 +2578,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "summary_ops_v2",
+    srcs = ["ops/summary_ops_v2.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":resource_variable_ops",
+        ":smart_cond",
+        ":summary_op_util",
+        ":summary_ops_gen",
+        ":training_util",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "template",
     srcs = ["ops/template.py"],
@@ -2552,6 +2705,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":user_ops_gen",
+        ":util",
         "@six_archive//:six",
     ],
 )
@@ -2624,6 +2778,7 @@ cuda_py_test(
         ":framework_test_lib",
         ":functional_ops",
         ":gradients",
+        ":layers",
         ":math_grad",
         ":math_ops",
         ":nn_grad",
@@ -2633,6 +2788,7 @@ cuda_py_test(
         ":tensor_array_grad",
         ":tensor_array_ops",
         ":test_ops",
+        ":variable_scope",
         "//third_party/py/numpy",
     ],
 )
@@ -2667,7 +2823,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "image_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["ops/image_ops_test.py"],
     additional_deps = [
         ":array_ops",
@@ -2688,7 +2844,6 @@ cuda_py_test(
     ],
     data = ["//tensorflow/core:image_testdata"],
     shard_count = 5,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2811,20 +2966,32 @@ py_library(
     name = "training",
     srcs = glob(
         ["training/**/*.py"],
-        exclude = ["**/*test*"],
+        exclude = [
+            "**/*test*",
+            # The following targets have their own build rules (same name as the
+            # file):
+            "training/checkpointable.py",
+            "training/saveable_object.py",
+            "training/training_util.py",
+        ],
     ),
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
+        ":array_ops_gen",
         ":checkpoint_ops_gen",
+        ":checkpointable",
         ":client",
         ":control_flow_ops",
         ":data_flow_ops",
+        ":device",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
+        ":framework_ops",
         ":gradients",
         ":init_ops",
+        ":distribute",
         ":io_ops",
         ":io_ops_gen",
         ":layers_base",
@@ -2837,18 +3004,25 @@ py_library(
         ":random_ops",
         ":resource_variable_ops",
         ":resources",
+        ":saveable_object",
         ":sdca_ops",
         ":sparse_ops",
         ":state_ops",
         ":string_ops",
         ":summary",
         ":training_ops_gen",
+        ":training_util",
         ":util",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/eager:backprop",
         "//third_party/py/numpy",
         "@six_archive//:six",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/ops/losses",
+        # `layers` dependency only exists due to the use of a small utility.
+        "//tensorflow/python/keras:layers",
     ],
 )
 
@@ -2857,10 +3031,10 @@ py_library(
     srcs = ["training/checkpointable.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":array_ops",
         ":dtypes",
         ":io_ops_gen",
         ":ops",
-        ":pywrap_tensorflow",
         ":util",
         "//tensorflow/python/eager:context",
     ],
@@ -2876,6 +3050,87 @@ py_test(
     ],
 )
 
+py_library(
+    name = "saveable_object",
+    srcs = ["training/saveable_object.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "device_util",
+    srcs = ["training/device_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":device",
+        ":framework_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "distribute",
+    srcs = ["training/distribute.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":device_util",
+        ":framework_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        "//tensorflow/python/data",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["training/checkpointable_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/74395663
+    ],
+    deps = [
+        ":checkpointable",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":framework_test_lib",
+        ":init_ops",
+        ":resource_variable_ops",
+        ":session",
+        ":state_ops",
+        ":template",
+        ":training",
+        ":training_util",
+        ":variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "distribute_test",
+    size = "small",
+    srcs = ["training/distribute_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":distribute",
+        ":variable_scope",
+    ],
+)
+
 py_test(
     name = "evaluation_test",
     size = "small",
@@ -3051,7 +3306,6 @@ tf_proto_library(
             "framework/cpp_shape_inference.proto",
         ],
     ),
-    go_api_version = 2,
 )
 
 tf_proto_library_py(
@@ -3065,6 +3319,8 @@ tf_proto_library(
     srcs = ["framework/cpp_shape_inference.proto"],
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
+    # TODO(b/74620627): remove when _USE_C_SHAPES is removed
+    visibility = ["//tensorflow:internal"],
 )
 
 py_test(
@@ -3149,6 +3405,7 @@ cuda_py_tests(
         ":client_testlib",
         ":framework_test_lib",
         ":platform_test",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
@@ -3208,6 +3465,7 @@ tf_py_wrap_cc(
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
         "lib/core/bfloat16.i",
+        "lib/core/py_exception_registry.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -3226,6 +3484,10 @@ tf_py_wrap_cc(
         "util/transform_graph.i",
         "util/util.i",
     ],
+    win_def_file = select({
+        "//tensorflow:windows": ":pywrap_tensorflow_filtered_def_file",
+        "//conditions:default": None,
+    }),
     deps = [
         ":bfloat16_lib",
         ":cost_analyzer_lib",
@@ -3235,6 +3497,7 @@ tf_py_wrap_cc(
         ":kernel_registry",
         ":numpy_lib",
         ":safe_ptr",
+        ":py_exception_registry",
         ":py_func_lib",
         ":py_record_reader_lib",
         ":py_record_writer_lib",
@@ -3245,6 +3508,7 @@ tf_py_wrap_cc(
         "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_rpc_factory_registration",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
         "//tensorflow/core/grappler:grappler_item",
@@ -3261,6 +3525,7 @@ tf_py_wrap_cc(
         "//tensorflow/core/profiler/internal:print_model_analysis",
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
         "//tensorflow/python/eager:pywrap_tfe_lib",
+        "//tensorflow/python/eager:python_eager_op_gen",
         "//util/python:python_headers",
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
@@ -3269,6 +3534,65 @@ tf_py_wrap_cc(
          tf_additional_gdr_deps()),
 )
 
+# ** Targets for Windows build (start) **
+# We need the following targets to expose symbols from _pywrap_tensorflow.dll
+
+# Build a cc_binary from tf_custom_op_library_additional_deps_impl,
+# it contains all object code from its dependencies.
+tf_native_cc_binary(
+    name = "tf_custom_op_library_additional_deps.so",
+    linkshared = 1,
+    linkstatic = 1,
+    deps = tf_custom_op_library_additional_deps_impl(),
+)
+
+# Get a DEF file generated by parsing all object files
+# of tf_custom_op_library_additional_deps.so
+filegroup(
+    name = "pywrap_tensorflow_def_file",
+    srcs = [":tf_custom_op_library_additional_deps.so"],
+    output_group = "def_file",
+)
+
+# Filter the DEF file to reduce the number of symbols to 64K or less.
+# Note that we also write the name of the pyd file into DEF file so that
+# the dynamic libraries of custom ops can find it at runtime.
+genrule(
+    name = "pywrap_tensorflow_filtered_def_file",
+    srcs = [":pywrap_tensorflow_def_file"],
+    outs = ["pywrap_tensorflow_filtered_def_file.def"],
+    cmd = select({
+        "//tensorflow:windows": """
+              $(location @local_config_def_file_filter//:def_file_filter) \\
+              --input $(location :pywrap_tensorflow_def_file) \\
+              --output $@ \\
+              --target _pywrap_tensorflow_internal.pyd
+          """,
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    tools = ["@local_config_def_file_filter//:def_file_filter"],
+)
+
+# Get the import library of  _pywrap_tensorflow_internal.dll
+filegroup(
+    name = "pywrap_tensorflow_import_lib_file",
+    srcs = [":_pywrap_tensorflow_internal.so"],
+    output_group = "interface_library",
+)
+
+# Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll
+# so that custom ops' dynamic libraries can link against it.
+cc_import(
+    name = "pywrap_tensorflow_import_lib",
+    interface_library = select({
+        "//tensorflow:windows": ":pywrap_tensorflow_import_lib_file",
+        "//conditions:default": "not_exsiting_on_unix.lib",  # Just a placeholder for Unix platforms
+    }),
+    system_provided = 1,
+)
+
+# ** Targets for Windows build (end) **
+
 py_library(
     name = "lib",
     srcs = [
@@ -3602,6 +3926,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -3630,6 +3955,7 @@ py_test(
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":state_ops_gen",
+        ":variable_scope",
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
@@ -3640,7 +3966,6 @@ py_test(
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":lib",
@@ -3833,6 +4158,7 @@ py_test(
     srcs = ["training/saver_large_partitioned_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "noasan",  # http://b/30782289
         "notsan",  # http://b/30782289
     ],
@@ -3920,7 +4246,13 @@ py_test(
     size = "small",
     srcs = ["training/checkpoint_utils_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
+    tags = [
+        "manual",
+        "no_cuda_on_cpu_tap",
+        "no_oss",
+        "no_windows",
+        "notap",
+    ],
     deps = [
         ":client",
         ":client_testlib",
@@ -3929,6 +4261,7 @@ py_test(
         ":partitioned_variables",
         ":platform",
         ":pywrap_tensorflow",
+        ":resource_variable_ops",
         ":state_ops",
         ":training",
         ":variable_scope",
@@ -3941,7 +4274,6 @@ py_test(
     size = "small",
     srcs = ["training/checkpoint_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":checkpoint_ops_gen",
         ":client",
@@ -3958,15 +4290,31 @@ py_test(
     ],
 )
 
+py_test(
+    name = "warm_starting_util_test",
+    size = "small",
+    srcs = ["training/warm_starting_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":dtypes",
+        ":framework_ops",
+        ":init_ops",
+        ":training",
+        ":variable_scope",
+        ":variables",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "monitored_session_test",
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # b/67945581
-    ],
+    tags = ["notsan"],  # b/67945581
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -3984,6 +4332,25 @@ py_test(
     ],
 )
 
+py_library(
+    name = "training_util",
+    srcs = ["training/training_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework",
+        ":framework_ops",
+        ":init_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        ":variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_test(
     name = "training_util_test",
     size = "small",
@@ -3994,13 +4361,14 @@ py_test(
         ":framework",
         ":platform",
         ":training",
+        ":training_util",
         ":variables",
     ],
 )
 
 tf_py_test(
     name = "input_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/input_test.py"],
     additional_deps = [
         ":array_ops",
@@ -4023,6 +4391,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":distribute",
         ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
@@ -4038,6 +4407,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":client",
+        ":constant_op",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -4048,6 +4419,8 @@ py_library(
         ":pywrap_tensorflow",
         ":summary_op_util",
         ":summary_ops",
+        ":summary_ops_gen",
+        ":summary_ops_v2",
         ":util",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -4074,7 +4447,7 @@ py_tests(
         ":platform",
         ":platform_test",
         ":summary",
-        ":training",
+        ":summary_ops_v2",
         "//tensorflow/core:protos_all_py",
     ],
 )
@@ -4092,12 +4465,14 @@ py_library(
         ":control_flow_ops",
         ":framework_for_generated_wrappers",
         ":platform",
+        ":smart_cond",
         ":tensor_util",
         ":util",
         ":variable_scope",
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:engine",
         "//third_party/py/numpy",
     ],
 )
@@ -4108,8 +4483,6 @@ py_library(
         "layers/convolutional.py",
         "layers/core.py",
         "layers/layers.py",
-        "layers/maxout.py",
-        "layers/network.py",
         "layers/normalization.py",
         "layers/pooling.py",
     ],
@@ -4136,6 +4509,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:layers",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -4162,25 +4536,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "layers_network_test",
-    size = "small",
-    srcs = ["layers/network_test.py"],
-    main = "layers/network_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":array_ops",
-        ":client_testlib",
-        ":framework_for_generated_wrappers",
-        ":framework_test_lib",
-        ":layers",
-        ":layers_base",
-        ":sparse_ops",
-        "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "layers_core_test",
     size = "small",
@@ -4219,22 +4574,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "layers_maxout_test",
-    size = "small",
-    srcs = ["layers/maxout_test.py"],
-    main = "layers/maxout_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":client_testlib",
-        ":framework_for_generated_wrappers",
-        ":layers",
-        ":math_ops",
-        ":nn_ops",
-        ":random_ops",
-    ],
-)
-
 py_test(
     name = "layers_utils_test",
     size = "small",
@@ -4311,18 +4650,6 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cuda_py_test(
     name = "accumulate_n_benchmark",
     size = "large",
@@ -4624,12 +4951,41 @@ py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        ":tf_item",
         ":tf_optimizer",
         "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
     ],
 )
 
+py_library(
+    name = "graph_placer",
+    srcs = [
+        "grappler/controller.py",
+        "grappler/graph_placer.py",
+        "grappler/hierarchical_controller.py",
+    ],
+    deps = [
+        ":python",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "graph_placer_test",
+    size = "large",
+    srcs = ["grappler/graph_placer_test.py"],
+    tags = [
+        "grappler",
+        "no_pip",  # graph_placer is not available in pip.
+    ],
+    deps = [
+        ":client_testlib",
+        ":graph_placer",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_test(
     name = "memory_optimizer_test",
     size = "medium",
@@ -4656,6 +5012,29 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "constant_folding_test",
+    size = "medium",
+    srcs = [
+        "grappler/constant_folding_test.py",
+    ],
+    additional_deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":functional_ops",
+        ":math_ops",
+        ":ops",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+    ],
+)
+
 cuda_py_test(
     name = "layout_optimizer_test",
     size = "medium",
@@ -4721,6 +5100,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "grappler",
+        "no_cuda_on_cpu_tap",
         "no_pip",
     ],
     deps = [
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 02ed5517ca895ab070a89f8810f77dadcff9212b..cf707fb2c731c0db57c2335d3ffd49b292c811cc 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -98,6 +98,12 @@ from tensorflow.python.summary import summary
 from tensorflow.python.user_ops import user_ops
 from tensorflow.python.util import compat
 
+# Import boosted trees ops to make sure the ops are registered (but unused).
+from tensorflow.python.ops import gen_boosted_trees_ops as _gen_boosted_trees_ops
+
+# Import cudnn rnn ops to make sure their ops are registered.
+from tensorflow.python.ops import gen_cudnn_rnn_ops as _
+
 
 # Import the names from python/training.py as train.Name.
 from tensorflow.python.training import training as train
@@ -114,59 +120,26 @@ from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import sysconfig
 from tensorflow.python.platform import test
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.all_util import make_all
 from tensorflow.python.util.tf_export import tf_export
 
-# Import modules whose docstrings contribute, for use by remove_undocumented
-# below.
-from tensorflow.python.client import client_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import framework_lib
-from tensorflow.python.framework import subscribe
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import confusion_matrix as confusion_matrix_m
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import histogram_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import session_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import tensor_array_ops
+# Eager execution
+from tensorflow.python.eager.context import executing_eagerly
+from tensorflow.python.framework.ops import enable_eager_execution
 
-# Symbols whitelisted for export without documentation.
-# TODO(cwhipkey): review these and move to contrib, expose through
-# documentation, or remove.
-_allowed_symbols = [
-    'AttrValue',
-    'ConfigProto',
-    'ClusterDef',
-    'DeviceSpec',
-    'Event',
-    'GPUOptions',
-    'GRAPH_DEF_VERSION',
-    'GRAPH_DEF_VERSION_MIN_CONSUMER',
-    'GRAPH_DEF_VERSION_MIN_PRODUCER',
-    'GraphDef',
-    'GraphOptions',
-    'HistogramProto',
-    'LogMessage',
-    'MetaGraphDef',
-    'NameAttrList',
-    'NodeDef',
-    'OptimizerOptions',
-    'RunOptions',
-    'RunMetadata',
-    'SessionLog',
-    'Summary',
-    'SummaryMetadata',
-    'TensorInfo',  # Used for tf.saved_model functionality.
-]
+# Necessary for the symbols in this module to be taken into account by
+# the namespace management system (API decorators).
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+
+# Required due to `rnn` and `rnn_cell` not being imported in `nn` directly
+# (due to a circular dependency issue: rnn depends on layers).
+nn.dynamic_rnn = rnn.dynamic_rnn
+nn.static_rnn = rnn.static_rnn
+nn.raw_rnn = rnn.raw_rnn
+nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
+nn.static_state_saving_rnn = rnn.static_state_saving_rnn
+nn.rnn_cell = rnn_cell
 
 # Export protos
 # pylint: disable=undefined-variable
@@ -192,119 +165,6 @@ tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
 tf_export('TensorInfo')(TensorInfo)
 # pylint: enable=undefined-variable
 
-
-# The following symbols are kept for compatibility. It is our plan
-# to remove them in the future.
-_allowed_symbols.extend([
-    'arg_max',
-    'arg_min',
-    'mul',  # use tf.multiply instead.
-    'neg',  # use tf.negative instead.
-    'sub',  # use tf.subtract instead.
-    'create_partitioned_variables',
-    'deserialize_many_sparse',
-    'lin_space',
-    'list_diff',  # Use tf.listdiff instead.
-    'listdiff',  # Use tf.listdiff instead.
-    'parse_single_sequence_example',
-    'serialize_many_sparse',
-    'serialize_sparse',
-    'sparse_matmul',  ## use tf.matmul instead.
-])
-
-# This is needed temporarily because we import it explicitly.
-_allowed_symbols.extend([
-    'pywrap_tensorflow',
-])
-
-# Dtypes exported by framework/dtypes.py.
-# TODO(cwhipkey): expose these through documentation.
-_allowed_symbols.extend([
-    'QUANTIZED_DTYPES',
-    'bfloat16',
-    'bool',
-    'complex64',
-    'complex128',
-    'double',
-    'half',
-    'float16',
-    'float32',
-    'float64',
-    'int16',
-    'int32',
-    'int64',
-    'int8',
-    'qint16',
-    'qint32',
-    'qint8',
-    'quint16',
-    'quint8',
-    'string',
-    'uint64',
-    'uint32',
-    'uint16',
-    'uint8',
-    'resource',
-    'variant',
-])
-
-# Export modules and constants.
-_allowed_symbols.extend([
-    'app',
-    'bitwise',
-    'compat',
-    'data',
-    'distributions',
-    'errors',
-    'estimator',
-    'feature_column',
-    'flags',
-    'gfile',
-    'graph_util',
-    'image',
-    'initializers',
-    'keras',
-    'layers',
-    'linalg',
-    'logging',
-    'losses',
-    'manip',
-    'metrics',
-    'newaxis',
-    'nn',
-    'profiler',
-    'python_io',
-    'resource_loader',
-    'saved_model',
-    'sets',
-    'spectral',
-    'summary',
-    'sysconfig',
-    'test',
-    'train',
-    'user_ops',
-])
-
-# Variables framework.versions:
-_allowed_symbols.extend([
-    'VERSION',
-    'GIT_VERSION',
-    'COMPILER_VERSION',
-    'CXX11_ABI_FLAG',
-    'MONOLITHIC_BUILD',
-])
-
-# Remove all extra symbols that don't have a docstring or are not explicitly
-# referenced in the whitelist.
-remove_undocumented(__name__, _allowed_symbols, [
-    framework_lib, array_ops, check_ops, client_lib, compat, constant_op,
-    control_flow_ops, confusion_matrix_m, data, distributions,
-    functional_ops, histogram_ops, io_ops, keras, layers,
-    losses, math_ops, metrics, nn, profiler, resource_loader, sets, script_ops,
-    session_ops, sparse_ops, state_ops, string_ops, summary, tensor_array_ops,
-    train
-])
-
 # Special dunders that we choose to export:
 _exported_dunders = set([
     '__version__',
diff --git a/tensorflow/python/client/client_lib.py b/tensorflow/python/client/client_lib.py
index b9ecaa4c851c080696b89c859c1ab5d17f3e0075..c94767a03c28cd90d2085a8a5db33d8e1237f2ed 100644
--- a/tensorflow/python/client/client_lib.py
+++ b/tensorflow/python/client/client_lib.py
@@ -16,30 +16,6 @@
 """Support for launching graphs and executing operations.
 
 See the @{$python/client} guide.
-
-@@Session
-@@InteractiveSession
-@@get_default_session
-@@OpError
-@@CancelledError
-@@UnknownError
-@@InvalidArgumentError
-@@DeadlineExceededError
-@@NotFoundError
-@@AlreadyExistsError
-@@PermissionDeniedError
-@@UnauthenticatedError
-@@ResourceExhaustedError
-@@FailedPreconditionError
-@@AbortedError
-@@OutOfRangeError
-@@UnimplementedError
-@@InternalError
-@@UnavailableError
-@@DataLossError
-@@exception_type_from_error_code
-@@error_code_from_exception_type
-@@raise_exception_on_not_ok_status
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/client/device_lib.i b/tensorflow/python/client/device_lib.i
index 51c04584a5492e13f5fead627685954d4f810dfa..944e855cee2ab9da7a4a801d1b993bec4d8ebc55 100644
--- a/tensorflow/python/client/device_lib.i
+++ b/tensorflow/python/client/device_lib.i
@@ -15,19 +15,39 @@ limitations under the License.
 
 %include "tensorflow/python/platform/base.i"
 
+%typemap(in) const tensorflow::ConfigProto& (tensorflow::ConfigProto temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The ConfigProto could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
 %{
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace swig {
 
-static std::vector<string> ListDevices(TF_Status* out_status) {
+static std::vector<string> ListDevicesWithSessionConfig(
+    const tensorflow::ConfigProto& config, TF_Status* out_status) {
   std::vector<string> output;
   SessionOptions options;
+  options.config = config;
   std::vector<Device*> devices;
   Status status = DeviceFactory::AddDevices(
       options, "" /* name_prefix */, &devices);
@@ -35,7 +55,8 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
     Set_TF_Status_from_Status(out_status, status);
   }
 
-  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(), devices.end());
+  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(),
+                                                     devices.end());
 
   for (const Device* device : devices) {
     const DeviceAttributes& attr = device->attributes();
@@ -53,6 +74,11 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
   return output;
 }
 
+std::vector<string> ListDevices(TF_Status* out_status) {
+  tensorflow::ConfigProto session_config;
+  return ListDevicesWithSessionConfig(session_config, out_status);
+}
+
 }  // namespace swig
 }  // namespace tensorflow
 
@@ -62,21 +88,28 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
 
 %unignore tensorflow;
 %unignore tensorflow::swig;
+%unignore tensorflow::swig::ListDevicesWithSessionConfig;
 %unignore tensorflow::swig::ListDevices;
 
 // Wrap this function
 namespace tensorflow {
 namespace swig {
 std::vector<string> ListDevices(TF_Status* out_status);
+static std::vector<string> ListDevicesWithSessionConfig(
+    const tensorflow::ConfigProto& config, TF_Status* out_status);
 }  // namespace swig
 }  // namespace tensorflow
 
 %insert("python") %{
-def list_devices():
+def list_devices(session_config=None):
   from tensorflow.python.framework import errors
 
   with errors.raise_exception_on_not_ok_status() as status:
-    return ListDevices(status)
+    if session_config:
+      return ListDevicesWithSessionConfig(session_config.SerializeToString(),
+                                          status)
+    else:
+      return ListDevices(status)
 %}
 
 %unignoreall
diff --git a/tensorflow/python/client/device_lib.py b/tensorflow/python/client/device_lib.py
index ad430cbae5a42a388cc8c41bf8be9db253aa92f2..9d90d5395e288e5988c60df64b9d962f5cccc22a 100644
--- a/tensorflow/python/client/device_lib.py
+++ b/tensorflow/python/client/device_lib.py
@@ -22,9 +22,12 @@ from tensorflow.core.framework import device_attributes_pb2
 from tensorflow.python import pywrap_tensorflow
 
 
-def list_local_devices():
+def list_local_devices(session_config=None):
   """List the available devices available in the local process.
 
+  Args:
+    session_config: a session config proto or None to use the default config.
+
   Returns:
     A list of `DeviceAttribute` protocol buffers.
   """
@@ -33,4 +36,7 @@ def list_local_devices():
     m.ParseFromString(pb_str)
     return m
 
-  return [_convert(s) for s in pywrap_tensorflow.list_devices()]
+  return [
+      _convert(s)
+      for s in pywrap_tensorflow.list_devices(session_config=session_config)
+  ]
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index aaf41626ab0078489026036d2b838f33a893a540..fec41f50b6c130704d587d6c7b80297c95183005 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -31,6 +32,10 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     self.assertGreater(len(devices), 0)
     self.assertEqual(devices[0].device_type, "CPU")
 
+    devices = device_lib.list_local_devices(config_pb2.ConfigProto())
+    self.assertGreater(len(devices), 0)
+    self.assertEqual(devices[0].device_type, "CPU")
+
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index f3c4fecdc0fde0436bea76cc774edaabe1bc07dd..5507d011bb0746c84b868ca7efcc3e4f8d2e146a 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -21,12 +21,12 @@ from __future__ import print_function
 import functools
 import re
 import threading
+import warnings
 
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -628,14 +628,12 @@ class BaseSession(SessionInterface):
     self._session = None
     opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          # pylint: disable=protected-access
-          self._session = tf_session.TF_NewSession(self._graph._c_graph, opts,
-                                                   status)
-          # pylint: enable=protected-access
-        else:
-          self._session = tf_session.TF_NewDeprecatedSession(opts, status)
+      if self._created_with_new_api:
+        # pylint: disable=protected-access
+        self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+        # pylint: enable=protected-access
+      else:
+        self._session = tf_session.TF_NewDeprecatedSession(opts)
     finally:
       tf_session.TF_DeleteSessionOptions(opts)
 
@@ -662,22 +660,20 @@ class BaseSession(SessionInterface):
     Returns:
       A list of devices in the session.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        raw_device_list = tf_session.TF_SessionListDevices(
-            self._session, status)
-      else:
-        raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
-            self._session, status)
-      device_list = []
-      size = tf_session.TF_DeviceListCount(raw_device_list)
-      for i in range(size):
-        name = tf_session.TF_DeviceListName(raw_device_list, i, status)
-        device_type = tf_session.TF_DeviceListType(raw_device_list, i, status)
-        memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i, status)
-        device_list.append(_DeviceAttributes(name, device_type, memory))
-      tf_session.TF_DeleteDeviceList(raw_device_list)
-      return device_list
+    if self._created_with_new_api:
+      raw_device_list = tf_session.TF_SessionListDevices(self._session)
+    else:
+      raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
+          self._session)
+    device_list = []
+    size = tf_session.TF_DeviceListCount(raw_device_list)
+    for i in range(size):
+      name = tf_session.TF_DeviceListName(raw_device_list, i)
+      device_type = tf_session.TF_DeviceListType(raw_device_list, i)
+      memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i)
+      device_list.append(_DeviceAttributes(name, device_type, memory))
+    tf_session.TF_DeleteDeviceList(raw_device_list)
+    return device_list
 
   def close(self):
     """Closes this session.
@@ -691,15 +687,13 @@ class BaseSession(SessionInterface):
     if self._created_with_new_api:
       if self._session and not self._closed:
         self._closed = True
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_CloseSession(self._session, status)
+        tf_session.TF_CloseSession(self._session)
 
     else:
       with self._extend_lock:
         if self._opened and not self._closed:
           self._closed = True
-          with errors.raise_exception_on_not_ok_status() as status:
-            tf_session.TF_CloseDeprecatedSession(self._session, status)
+          tf_session.TF_CloseDeprecatedSession(self._session)
 
   def __del__(self):
     # cleanly ignore all exceptions
@@ -709,11 +703,10 @@ class BaseSession(SessionInterface):
       pass
     if self._session is not None:
       try:
-        status = c_api_util.ScopedTFStatus()
         if self._created_with_new_api:
-          tf_session.TF_DeleteSession(self._session, status)
+          tf_session.TF_DeleteSession(self._session)
         else:
-          tf_session.TF_DeleteDeprecatedSession(self._session, status)
+          tf_session.TF_DeleteDeprecatedSession(self._session)
       except AttributeError:
         # At shutdown, `c_api_util` or `tf_session` may have been garbage
         # collected, causing the above method calls to fail. In this case,
@@ -888,6 +881,8 @@ class BaseSession(SessionInterface):
       Either a single value if `fetches` is a single graph element, or
       a list of values if `fetches` is a list, or a dictionary with the
       same keys as `fetches` if that is a dictionary (described above).
+      Order in which `fetches` operations are evaluated inside the call
+      is undefined.
 
     Raises:
       RuntimeError: If this `Session` is in an invalid state (e.g. has been
@@ -1028,11 +1023,11 @@ class BaseSession(SessionInterface):
     # Set up a graph with feeds and fetches for partial run.
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          return tf_session.TF_SessionPRunSetup_wrapper(
-              session, feed_list, fetch_list, target_list, status)
-        else:
+      if self._created_with_new_api:
+        return tf_session.TF_SessionPRunSetup_wrapper(
+            session, feed_list, fetch_list, target_list)
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
           return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
                                          target_list, status)
 
@@ -1085,7 +1080,10 @@ class BaseSession(SessionInterface):
           if isinstance(subfeed_val, ops.Tensor):
             raise TypeError('The value of a feed cannot be a tf.Tensor object. '
                             'Acceptable feed values include Python scalars, '
-                            'strings, lists, numpy ndarrays, or TensorHandles.')
+                            'strings, lists, numpy ndarrays, or TensorHandles.'
+                            'For reference, the tensor object was ' +
+                            str(feed_val) + ' which was passed to the '
+                            'feed with key ' + str(feed) + '.')
 
           subfeed_dtype = subfeed_t.dtype.as_numpy_dtype
           if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
@@ -1217,19 +1215,12 @@ class BaseSession(SessionInterface):
           compat.as_bytes(options.SerializeToString())) if options else None
       run_metadata_ptr = tf_session.TF_NewBuffer() if run_metadata else None
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._created_with_new_api:
-            results = tf_session.TF_SessionRun_wrapper(
-                self._session, options_ptr, {}, fetch_list, target_list,
-                run_metadata_ptr, status)
-          else:
-            results = tf_session.TF_Run(self._session, options_ptr, {},
-                                        fetch_list, target_list, status,
-                                        run_metadata_ptr)
-          if fetch_handler:
-            results = fetch_handler.build_results(self, results)
-          else:
-            results = results[0] if results else None
+        results = self._call_tf_sessionrun(
+            options_ptr, {}, fetch_list, target_list, run_metadata_ptr)
+        if fetch_handler:
+          results = fetch_handler.build_results(self, results)
+        else:
+          results = results[0] if results else None
         if run_metadata:
           proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
           run_metadata.ParseFromString(compat.as_bytes(proto_data))
@@ -1250,13 +1241,7 @@ class BaseSession(SessionInterface):
       assert len(target_list) == 1
 
       def _single_operation_run():
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._created_with_new_api:
-            tf_session.TF_SessionRun_wrapper(self._session, None, {}, [],
-                                             target_list, None, status)
-          else:
-            tf_session.TF_Run(self._session, None, {}, [], target_list, status,
-                              None)
+        self._call_tf_sessionrun(None, {}, [], target_list, None)
 
       return _single_operation_run
     elif isinstance(fetches, ops.Tensor):
@@ -1266,13 +1251,7 @@ class BaseSession(SessionInterface):
       assert not target_list
 
       def _single_tensor_run():
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._created_with_new_api:
-            results = tf_session.TF_SessionRun_wrapper(
-                self._session, None, {}, fetch_list, [], None, status)
-          else:
-            results = tf_session.TF_Run(self._session, None, {}, fetch_list, [],
-                                        status, None)
+        results = self._call_tf_sessionrun(None, {}, fetch_list, [], None)
         return results[0]
 
       return _single_tensor_run
@@ -1280,13 +1259,8 @@ class BaseSession(SessionInterface):
       # In all other cases, we must use `fetch_handler` to build the
       # results for us.
       def _fetch_handler_run():
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._created_with_new_api:
-            results = tf_session.TF_SessionRun_wrapper(
-                self._session, None, {}, fetch_list, target_list, None, status)
-          else:
-            results = tf_session.TF_Run(self._session, None, {}, fetch_list,
-                                        target_list, status, None)
+        results = self._call_tf_sessionrun(
+            None, {}, fetch_list, target_list, None)
         return fetch_handler.build_results(self, results)
 
       return _fetch_handler_run
@@ -1326,35 +1300,22 @@ class BaseSession(SessionInterface):
       fetches = _name_list(fetch_list)
       targets = _name_list(target_list)
 
-    def _run_fn(session, feed_dict, fetch_list, target_list, options,
-                run_metadata):
+    def _run_fn(feed_dict, fetch_list, target_list, options, run_metadata):
       # Ensure any changes to the graph are reflected in the runtime.
       self._extend_graph()
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          return tf_session.TF_SessionRun_wrapper(session, options, feed_dict,
-                                                  fetch_list, target_list,
-                                                  run_metadata, status)
-        else:
-          return tf_session.TF_Run(session, options, feed_dict, fetch_list,
-                                   target_list, status, run_metadata)
+      return self._call_tf_sessionrun(
+          options, feed_dict, fetch_list, target_list, run_metadata)
 
-    def _prun_fn(session, handle, feed_dict, fetch_list):
+    def _prun_fn(handle, feed_dict, fetch_list):
       if target_list:
         raise RuntimeError('partial_run() requires empty target_list.')
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          return tf_session.TF_SessionPRun_wrapper(session, handle, feed_dict,
-                                                   fetch_list, status)
-        else:
-          return tf_session.TF_PRun(session, handle, feed_dict, fetch_list,
-                                    status)
+      return self._call_tf_sessionprun(handle, feed_dict, fetch_list)
 
     if handle is None:
-      return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-                           options, run_metadata)
+      return self._do_call(_run_fn, feeds, fetches, targets, options,
+                           run_metadata)
     else:
-      return self._do_call(_prun_fn, self._session, handle, feeds, fetches)
+      return self._do_call(_prun_fn, handle, feeds, fetches)
 
   def _do_call(self, fn, *args):
     try:
@@ -1374,23 +1335,22 @@ class BaseSession(SessionInterface):
       raise type(e)(node_def, op, message)
 
   def _extend_graph(self):
-    # Nothing to do if we're using the new session interface
-    # TODO(skyewm): remove this function altogether eventually
     if self._created_with_new_api:
-      return
-
-    # Ensure any changes to the graph are reflected in the runtime.
-    with self._extend_lock:
-      if self._graph.version > self._current_version:
-        # pylint: disable=protected-access
-        graph_def, self._current_version = self._graph._as_graph_def(
-            from_version=self._current_version, add_shapes=self._add_shapes)
-        # pylint: enable=protected-access
+      with self._graph._lock:  # pylint: disable=protected-access
+        tf_session.ExtendSession(self._session)
+    else:
+      # Ensure any changes to the graph are reflected in the runtime.
+      with self._extend_lock:
+        if self._graph.version > self._current_version:
+          # pylint: disable=protected-access
+          graph_def, self._current_version = self._graph._as_graph_def(
+              from_version=self._current_version, add_shapes=self._add_shapes)
+          # pylint: enable=protected-access
 
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_ExtendGraph(self._session,
-                                    graph_def.SerializeToString(), status)
-        self._opened = True
+          with errors.raise_exception_on_not_ok_status() as status:
+            tf_session.TF_ExtendGraph(self._session,
+                                      graph_def.SerializeToString(), status)
+          self._opened = True
 
   # The threshold to run garbage collection to delete dead tensors.
   _DEAD_HANDLES_THRESHOLD = 10
@@ -1441,6 +1401,87 @@ class BaseSession(SessionInterface):
         feed_dict[feed_tensor] = np_val
       return handles
 
+  def _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list,
+                          run_metadata):
+    if self._created_with_new_api:
+      return tf_session.TF_SessionRun_wrapper(
+          self._session, options, feed_dict, fetch_list, target_list,
+          run_metadata)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
+        return tf_session.TF_Run(
+            self._session, options, feed_dict, fetch_list, target_list,
+            status, run_metadata)
+
+  def _call_tf_sessionprun(self, handle, feed_dict, fetch_list):
+    if self._created_with_new_api:
+      return tf_session.TF_SessionPRun_wrapper(
+          self._session, handle, feed_dict, fetch_list)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
+        return tf_session.TF_PRun(
+            self._session, handle, feed_dict, fetch_list, status)
+
+  # pylint: disable=protected-access
+  class _Callable(object):
+    """Experimental wrapper for the C++ `Session::MakeCallable()` API."""
+
+    def __init__(self, session, callable_options):
+      self._session = session
+      self._handle = None
+      options_ptr = tf_session.TF_NewBufferFromString(
+          compat.as_bytes(callable_options.SerializeToString()))
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          if session._created_with_new_api:
+            self._handle = tf_session.TF_SessionMakeCallable(
+                session._session, options_ptr, status)
+          else:
+            self._handle = tf_session.TF_DeprecatedSessionMakeCallable(
+                session._session, options_ptr, status)
+      finally:
+        tf_session.TF_DeleteBuffer(options_ptr)
+
+    def __call__(self, *args):
+      # TODO(b/74355905): Support argument and return value nested structures,
+      # and tensor-like objects such as SparseTensors.
+      with errors.raise_exception_on_not_ok_status() as status:
+        if self._session._created_with_new_api:
+          return tf_session.TF_SessionRunCallable(
+              self._session._session, self._handle, args, status, None)
+        else:
+          return tf_session.TF_DeprecatedSessionRunCallable(
+              self._session._session, self._handle, args, status, None)
+
+    def __del__(self):
+      # NOTE(mrry): It is possible that `self._session.__del__()` could be
+      # called before this destructor, in which case `self._session._session`
+      # will be `None`.
+      if self._handle is not None and self._session._session is not None:
+        with errors.raise_exception_on_not_ok_status() as status:
+          if self._session._created_with_new_api:
+            tf_session.TF_SessionReleaseCallable(
+                self._session._session, self._handle, status)
+          else:
+            tf_session.TF_DeprecatedSessionReleaseCallable(
+                self._session._session, self._handle, status)
+  # pylint: enable=protected-access
+
+  # TODO(b/74355905): Reimplement `Session.make_callable()` using this method
+  # where possible.
+  def _make_callable_from_options(self, callable_options):
+    """Returns a handle to a "callable" with the given options.
+
+    Args:
+      callable_options: A `CallableOptions` protocol buffer message describing
+        the computation that will be performed by the callable.
+
+    Returns:
+      A handle to the new callable.
+    """
+    self._extend_graph()
+    return BaseSession._Callable(self, callable_options)
+
 
 @tf_export('Session')
 class Session(BaseSession):
@@ -1637,6 +1678,9 @@ class InteractiveSession(BaseSession):
   ```
   """
 
+  _count_lock = threading.Lock()
+  _active_session_count = 0  # GUARDED_BY(_count_lock)
+
   def __init__(self, target='', graph=None, config=None):
     """Creates a new interactive TensorFlow session.
 
@@ -1665,6 +1709,19 @@ class InteractiveSession(BaseSession):
     config.graph_options.place_pruned_graph = True
 
     super(InteractiveSession, self).__init__(target, graph, config)
+    with InteractiveSession._count_lock:
+      if InteractiveSession._active_session_count > 0:
+        warnings.warn('An interactive session is already active. This can '
+                      'cause out-of-memory errors in some cases. You must '
+                      'explicitly call `InteractiveSession.close()` to release '
+                      'resources held by the other session(s).')
+      InteractiveSession._active_session_count += 1
+    # NOTE(mrry): We do not use `Session._closed` here because it has unhelpful
+    # semantics (in particular, it is not set to true if `Session.close()` is
+    # called on a session that has not been "opened" by running a step) and we
+    # cannot change those semantics without breaking existing code.
+    self._explicitly_closed = False
+
     self._default_session = self.as_default()
     self._default_session.enforce_nesting = False
     self._default_session.__enter__()
@@ -1677,6 +1734,14 @@ class InteractiveSession(BaseSession):
   def close(self):
     """Closes an `InteractiveSession`."""
     super(InteractiveSession, self).close()
+    with InteractiveSession._count_lock:
+      if not self._explicitly_closed:
+        InteractiveSession._active_session_count -= 1
+        self._explicitly_closed = True
+      else:
+        return
     if self._explicit_graph is not None:
       self._default_graph.__exit__(None, None, None)
+      self._default_graph = None
     self._default_session.__exit__(None, None, None)
+    self._default_session = None
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 5a7413c12e9db92cb85d54a69602753ff6476425..38a3acb2dc304968915e84c8054621e441294e61 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -23,7 +23,6 @@ from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
 from tensorflow.python.client import session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -42,21 +41,13 @@ class SessionListDevicesTestMethods(object):
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_session = tf_session.TF_NewSession(
-          ops.get_default_graph()._c_graph, opts, status)
-      raw_device_list = tf_session.TF_SessionListDevices(
-          c_session, status)
+    c_session = tf_session.TF_NewSession(ops.get_default_graph()._c_graph, opts)
+    raw_device_list = tf_session.TF_SessionListDevices(c_session)
     size = tf_session.TF_DeviceListCount(raw_device_list)
-    # Test that invalid device numbers return -1 rather than a Swig-wrapped
-    # pointer.
-    status_no_exception = c_api_util.ScopedTFStatus()
-    memory = tf_session.TF_DeviceListMemoryBytes(
-        raw_device_list, size, status_no_exception)
-    self.assertEqual(memory, -1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      tf_session.TF_DeviceListMemoryBytes(raw_device_list, size)
     tf_session.TF_DeleteDeviceList(raw_device_list)
-    with errors.raise_exception_on_not_ok_status() as status:
-      tf_session.TF_CloseSession(c_session, status)
+    tf_session.TF_CloseSession(c_session)
 
   def testListDevicesGrpcSession(self):
     server = server_lib.Server.create_local_server()
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 490572254b0be6a110ef06cea15d20d780f732cf..92497272c66b5c3be36aba75b9e3b7f3d99b062d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -22,13 +22,13 @@ import os
 import sys
 import threading
 import time
+import warnings
 
 import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import attr_value_pb2
-from tensorflow.core.framework import types_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -37,6 +37,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
@@ -46,6 +47,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gen_control_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
@@ -63,6 +65,10 @@ ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 @test_util.with_c_api
 class SessionTest(test_util.TensorFlowTestCase):
 
+  def setUp(self):
+    super(SessionTest, self).setUp()
+    warnings.simplefilter('always')
+
   def testUseExistingGraph(self):
     with ops.Graph().as_default() as g, ops.device('/cpu:0'):
       a = constant_op.constant(6.0, shape=[1, 1])
@@ -187,12 +193,10 @@ class SessionTest(test_util.TensorFlowTestCase):
       a = constant_op.constant(0.0, shape=[2, 3])
       # NOTE(mrry): The original_op is nonsense, but used here to test that the
       #   errors are reported correctly.
-      # pylint: disable=protected-access
       with sess.graph._original_op(a.op):
         b = array_ops.identity(a, name='id')
       with sess.graph._original_op(b.op):
         c = array_ops.placeholder(dtypes.float32)
-      # pylint: enable=protected-access
 
       def exc_predicate(e):
         return (e.op == c.op and e.op._original_op == b.op and
@@ -1052,6 +1056,43 @@ class SessionTest(test_util.TensorFlowTestCase):
       for t in threads:
         t.join()
 
+  def testParallelRunAndBuild(self):
+    with session.Session() as sess:
+      c = constant_op.constant(5.0)
+      stop = threading.Event()
+
+      def run_loop():
+        while not stop.is_set():
+          self.assertEqual(sess.run(c), 5.0)
+
+      threads = [self.checkedThread(target=run_loop) for _ in range(100)]
+      for t in threads:
+        t.start()
+
+      # Do some graph construction. Try to exercise non-trivial paths.
+      graph = ops.get_default_graph()
+      gdef = None
+      for _ in range(10):
+        x = array_ops.placeholder(dtype=dtypes.float32)
+        with ops.colocate_with(x):
+          y = array_ops.placeholder(dtype=dtypes.float32)
+        with ops.device('/cpu:0'):
+          z = control_flow_ops.while_loop(
+              lambda x, y: x < 10, lambda x, y: (x + 1, x * y), [x, y])
+        with graph._attr_scope({'_a': attr_value_pb2.AttrValue(b=False)}):
+          gradients_impl.gradients(z, [x, y])
+          if gdef is None:
+            gdef = graph.as_graph_def()
+          else:
+            # NOTE(skyewm): import_graph_def breaks the running threads without
+            # the C API enabled. This is not a regression so I didn't fix it.
+            if ops._USE_C_API:
+              importer.import_graph_def(gdef, name='import')
+
+      stop.set()
+      for t in threads:
+        t.join()
+
   def testRunFeedDict(self):
     with session.Session() as s:
       x = array_ops.zeros([2])
@@ -1153,6 +1194,33 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[24.0]], e.eval())
       sess.close()
 
+  def testMultipleInteractiveSessionsWarning(self):
+    # Reinitialize the global state to ensure that the expected warnings will
+    # be emitted.
+    session.InteractiveSession._active_session_count = 0  # pylint: disable=protected-access
+
+    sess = session.InteractiveSession()
+    sess.run(constant_op.constant(4.0))  # Run so that the session is "opened".
+    sess.close()
+    # Opening and closing interactive sessions serially should not warn.
+    with warnings.catch_warnings(record=True) as w:
+      sess = session.InteractiveSession()
+      sess.close()
+    self.assertEqual(0, len(w))
+
+    with warnings.catch_warnings(record=True) as w:
+      sess = session.InteractiveSession()
+    self.assertEqual(0, len(w))
+    with warnings.catch_warnings(record=True) as w:
+      sess2 = session.InteractiveSession()
+    self.assertEqual(1, len(w))
+    self.assertTrue('An interactive session is already active. This can cause '
+                    'out-of-memory errors in some cases. You must explicitly '
+                    'call `InteractiveSession.close()` to release resources '
+                    'held by the other session(s).' in str(w[0].message))
+    sess2.close()
+    sess.close()
+
   def testInteractivePlacePrunedGraph(self):
     sess = session.InteractiveSession()
 
@@ -1303,6 +1371,18 @@ class SessionTest(test_util.TensorFlowTestCase):
                               run_metadata=run_metadata))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
+  def testOptimizedMakeCallable(self):
+    with session.Session() as sess:
+      ph = array_ops.placeholder(dtypes.float32)
+      a = math_ops.add(ph, 1.0)
+      callable_opts = config_pb2.CallableOptions()
+      callable_opts.feed.append(ph.name)
+      callable_opts.fetch.append(a.name)
+      for _ in range(3):
+        callable_fn = sess._make_callable_from_options(callable_opts)
+        for _ in range(5):
+          self.assertEqual([2.0], callable_fn(np.array(1.0, dtype=np.float32)))
+
   def testFeedError(self):
     with session.Session() as sess:
       feed_t = array_ops.placeholder(dtype=dtypes.float32)
@@ -1745,8 +1825,8 @@ class SessionTest(test_util.TensorFlowTestCase):
     # Ensure that errors from building the graph get propagated.
     data = array_ops.placeholder(dtypes.float32, shape=[])
     # pylint: disable=protected-access
-    enter_1 = gen_control_flow_ops._enter(data, 'foo_1', False)
-    enter_2 = gen_control_flow_ops._enter(data, 'foo_2', False)
+    enter_1 = gen_control_flow_ops.enter(data, 'foo_1', False)
+    enter_2 = gen_control_flow_ops.enter(data, 'foo_2', False)
     # pylint: enable=protected-access
     res = math_ops.add(enter_1, enter_2)
     with self.assertRaisesOpError('has inputs from different frames'):
@@ -1815,144 +1895,5 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(a, feed_dict={a: 1})
 
 
-class GraphMutationTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._original_use_c_api_value = ops._USE_C_API
-    ops._USE_C_API = True
-    super(GraphMutationTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = self._original_use_c_api_value
-    super(GraphMutationTest, self).tearDown()
-
-  def testUpdateInputAfterRunning(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess:
-      self.assertAllEqual(3.0, sess.run(c))
-      c.op._update_input(1, a)  # pylint: disable=protected-access
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          'add.*was changed by updating input tensor after it was run'):
-        sess.run(c)
-
-      # Check that running the graph with a new session is fine
-      with session.Session(graph=g) as sess2:
-        self.assertAllEqual(2.0, sess2.run(c))
-
-  def testSetDeviceAfterRunning(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess:
-      self.assertAllEqual(3.0, sess.run(c))
-      c.op._set_device('/cpu:0')  # pylint: disable=protected-access
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          'add.*was changed by setting device after it was run'):
-        sess.run(c)
-
-  def testSetAttrAfterRunning(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0, dtype=dtypes.float32)
-      b = math_ops.cast(a, dtypes.float64)
-
-    with session.Session(graph=g) as sess:
-      self.assertAllEqual(1.0, sess.run(b))
-      b.op._set_attr('DstT', attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          'Cast.*was changed by setting attribute after it was run'):
-        sess.run(b)
-
-  def testRunModifyRun(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-      with session.Session(graph=g) as sess:
-        self.assertAllEqual(3.0, sess.run(c))
-
-        d = b + c
-        d.op._update_input(0, a)  # pylint: disable=protected-access
-        self.assertAllEqual(3.0, sess.run(c))
-        self.assertAllEqual(4.0, sess.run(d))
-
-  def testRunModifyRunTwoSessions(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-      with session.Session(graph=g) as sess1:
-        with session.Session(graph=g) as sess2:
-          self.assertAllEqual(3.0, sess1.run(c))
-          self.assertAllEqual(3.0, sess2.run(c))
-
-          d = b + c
-          d.op._update_input(0, a)  # pylint: disable=protected-access
-          self.assertAllEqual(3.0, sess2.run(c))
-          self.assertAllEqual(4.0, sess2.run(d))
-
-          d.op._update_input(0, b)  # pylint: disable=protected-access
-          self.assertAllEqual(3.0, sess1.run(c))
-          self.assertAllEqual(5.0, sess1.run(d))
-
-          with self.assertRaisesRegexp(
-              errors.FailedPreconditionError,
-              'add.*was changed by updating input tensor after it was run'):
-            sess2.run(c)
-
-  def testTwoSessionsOneRunBeforeModification(self):
-    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess1:
-      with session.Session(graph=g) as sess2:
-        sess1.run(c)
-
-        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
-
-        with self.assertRaisesRegexp(
-            errors.FailedPreconditionError,
-            'add.*was changed by setting device after it was run'):
-          sess1.run(c)
-
-        # sess2 was not run before modification
-        self.assertAllEqual(3.0, sess2.run(c))
-
-  def testTwoSessionsBothRunBeforeModification(self):
-    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess1:
-      with session.Session(graph=g) as sess2:
-        sess1.run(c)
-        sess2.run(c)
-
-        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
-
-        with self.assertRaisesRegexp(
-            errors.FailedPreconditionError,
-            'add.*was changed by setting device after it was run'):
-          sess1.run(c)
-
-        with self.assertRaisesRegexp(
-            errors.FailedPreconditionError,
-            'add.*was changed by setting device after it was run'):
-          sess2.run(c)
-
-
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 1fd488e7b6388f7953a279dca8f93ab57a85f63d..1db1432d6521bb5f48558081916158792010b1c5 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -18,11 +18,11 @@ limitations under the License.
 %{
 
 #include "tensorflow/c/python_api.h"
-#include "tensorflow/python/client/tf_session_helper.h"
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/python/client/tf_session_helper.h"
 
 // Helper function to convert a Python list of Tensors to a C++ vector of
 // TF_Outputs.
@@ -72,7 +72,7 @@ void PyInt64ListToVector(PyObject* py_int_seq, std::vector<int64_t>* vec) {
   int size = PySequence_Fast_GET_SIZE(py_int_seq);
   for (int i = 0; i < size; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(py_int_seq, i);
-    vec->push_back(PyInt_AsLong(item));
+    vec->push_back(PyLong_AsLongLong(item));
   }
 }
 
@@ -157,6 +157,25 @@ tensorflow::ImportNumpy();
   }
 }
 
+// We use TF_OperationGetControlOutputs_wrapper instead of
+// TF_OperationGetControlOutputs
+%ignore TF_OperationGetControlOutputs;
+%unignore TF_OperationGetControlOutputs_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetControlOutputs_wrapper;
+
+// Build a Python list of TF_Operation* and return it.
+%typemap(out) std::vector<TF_Operation*> tensorflow::TF_OperationGetControlOutputs_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation($1[i]));
+  }
+}
+
 %ignore TF_OperationOutputConsumers;
 %unignore TF_OperationOutputConsumers_wrapper;
 // See comment for "%noexception TF_SessionRun_wrapper;"
@@ -419,6 +438,30 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
   $result = new_result;
 }
 
+%typemap(in, numinputs=0) int64_t* out_handle (int64_t out_handle) {
+  $1 = &out_handle;
+}
+
+%typemap(argout) int64_t* out_handle {
+  $result = PyLong_FromLongLong(*$1);
+}
+
+%typemap(in) int64_t handle {
+  if (!PyLong_Check($input)) {
+    SWIG_exception_fail(
+        SWIG_TypeError,
+        tensorflow::strings::Printf(
+            "Expected a python long for conversion to callable handle but got %s",
+            Py_TYPE($input)->tp_name).c_str());
+  }
+  $1 = PyLong_AsLongLong($input);
+}
+
+// Override default py3 behavior of attempting to encode into Unicode.
+%typemap(out) std::string tensorflow::GetResourceHandleShapeAndType {
+  $result = PyBytes_FromStringAndSize($1.data(), $1.size());
+}
+
 // TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
 // skip for now
 %ignore TF_WhileParams;
@@ -452,6 +495,17 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
 // See comment for "%noexception TF_SessionRun_wrapper;"
 %noexception TF_SessionPRun_wrapper;
 
+%unignore TF_DeprecatedSessionMakeCallable;
+%unignore TF_SessionMakeCallable;
+%unignore TF_DeprecatedSessionRunCallable;
+%unignore TF_SessionRunCallable;
+%unignore TF_DeprecatedSessionReleaseCallable;
+%unignore TF_SessionReleaseCallable;
+
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_DeprecatedSessionRunCallable;
+%noexception TF_SessionRunCallable;
+
 %rename("_TF_SetTarget") TF_SetTarget;
 %rename("_TF_SetConfig") TF_SetConfig;
 %rename("_TF_NewSessionOptions") TF_NewSessionOptions;
@@ -469,9 +523,8 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
       _TF_SetTarget(opts, target)
     if config is not None:
       from tensorflow.python.framework import errors
-      with errors.raise_exception_on_not_ok_status() as status:
-        config_str = config.SerializeToString()
-        _TF_SetConfig(opts, config_str, status)
+      config_str = config.SerializeToString()
+      _TF_SetConfig(opts, config_str)
     return opts
 %}
 
@@ -719,6 +772,12 @@ def TF_Reset(target, containers=None, config=None):
   $1 = &types_local;
 }
 
+%unignore SetRequireShapeInferenceFns;
+%unignore TF_TryEvaluateConstant_wrapper;
+%noexception TF_TryEvaluateConstant_wrapper;
+%unignore ExtendSession;
+%unignore ResourceHandleShapeAndType;
+
 %include "tensorflow/python/client/tf_session_helper.h"
 
 %unignoreall
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index 361dbc22b097a9bc82f656d7416b88c4a3a1ec2d..b6481e7e29e4057f08e1c78b310bf5581afc5411 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -155,6 +155,156 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
   ClearDecrefCache();
 }
 
+namespace {
+void MakeCallableHelper(tensorflow::Session* session,
+                        const TF_Buffer* callable_options, int64_t* out_handle,
+                        TF_Status* out_status) {
+  tensorflow::CallableOptions callable_options_proto;
+  if (callable_options != nullptr &&
+      !callable_options_proto.ParseFromArray(callable_options->data,
+                                             callable_options->length)) {
+    Set_TF_Status_from_Status(
+        out_status,
+        errors::InvalidArgument("Unparseable CallableOptions proto"));
+    return;
+  }
+  tensorflow::Session::CallableHandle handle;
+  Status s = session->MakeCallable(callable_options_proto, &handle);
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+  *out_handle = handle;
+}
+}  // namespace
+
+void TF_DeprecatedSessionMakeCallable(TF_DeprecatedSession* session,
+                                      const TF_Buffer* callable_options,
+                                      int64_t* out_handle,
+                                      TF_Status* out_status) {
+  MakeCallableHelper(session->session, callable_options, out_handle,
+                     out_status);
+}
+void TF_SessionMakeCallable(TF_Session* session,
+                            const TF_Buffer* callable_options,
+                            int64_t* out_handle, TF_Status* out_status) {
+  MakeCallableHelper(session->session, callable_options, out_handle,
+                     out_status);
+}
+
+namespace {
+void RunCallableHelper(tensorflow::Session* session, int64_t handle,
+                       PyObject* feed_values, TF_Status* out_status,
+                       PyObjectVector* out_values, TF_Buffer* run_metadata) {
+  // Convert feed values to a vector of tensorflow::Tensor objects.
+  std::vector<Tensor> input_tensors;
+  Status s;
+  {
+    feed_values =
+        PySequence_Fast(feed_values, "feed_values must be a sequence");
+    if (feed_values == nullptr) return;
+    Safe_PyObjectPtr feed_values_holder(make_safe(feed_values));
+    Py_ssize_t len = PySequence_Fast_GET_SIZE(feed_values);
+    input_tensors.reserve(len);
+    for (Py_ssize_t i = 0; i < len; ++i) {
+      PyObject* elem = PySequence_Fast_GET_ITEM(feed_values, i);
+      if (!elem) {
+        Set_TF_Status_from_Status(
+            out_status, errors::Internal("Could not get feed value ", i));
+        return;
+      }
+      Tensor t;
+      s = NdarrayToTensor(elem, &t);
+      if (!s.ok()) {
+        Set_TF_Status_from_Status(out_status, s);
+        return;
+      }
+      input_tensors.push_back(std::move(t));
+    }
+  }
+
+  // Allocate a RunMetadata protobuf object to receive the metadata,
+  // if the caller is expecting any.
+  std::unique_ptr<RunMetadata> run_metadata_proto;
+  if (run_metadata != nullptr) {
+    run_metadata_proto.reset(new RunMetadata);
+  }
+
+  // Run the callable.
+  std::vector<Tensor> output_tensors;
+  Py_BEGIN_ALLOW_THREADS;
+  s = session->RunCallable(handle, input_tensors, &output_tensors,
+                           run_metadata_proto.get());
+  Py_END_ALLOW_THREADS;
+
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+
+  // If requested, serialize the RunMetadata to pass it back to the caller.
+  if (run_metadata != nullptr) {
+    s = MessageToBuffer(*run_metadata_proto, run_metadata);
+    if (!s.ok()) {
+      Set_TF_Status_from_Status(out_status, s);
+      return;
+    }
+  }
+
+  // Convert results to NumPy arrays. Since this can fail, stage the
+  // results via a safe container that takes care of decreasing the
+  // reference count on failure.
+  std::vector<Safe_PyObjectPtr> py_outputs_safe;
+  py_outputs_safe.reserve(output_tensors.size());
+  for (const Tensor& output : output_tensors) {
+    PyObject* py_array;
+    s = TensorToNdarray(output, &py_array);
+    if (!s.ok()) {
+      Set_TF_Status_from_Status(out_status, s);
+      return;
+    }
+    py_outputs_safe.push_back(make_safe(py_array));
+  }
+
+  // If we reach this point, we have successfully built a list of objects
+  // so we can release them from the safe container.
+  out_values->reserve(py_outputs_safe.size());
+  for (auto& output : py_outputs_safe) {
+    out_values->push_back(output.release());
+  }
+}
+}  // namespace
+
+void TF_DeprecatedSessionRunCallable(TF_DeprecatedSession* session,
+                                     int64_t handle, PyObject* feed_values,
+                                     TF_Status* out_status,
+                                     PyObjectVector* out_values,
+                                     TF_Buffer* run_metadata) {
+  RunCallableHelper(session->session, handle, feed_values, out_status,
+                    out_values, run_metadata);
+  ClearDecrefCache();
+}
+void TF_SessionRunCallable(TF_Session* session, int64_t handle,
+                           PyObject* feed_values, TF_Status* out_status,
+                           PyObjectVector* out_values,
+                           TF_Buffer* run_metadata) {
+  RunCallableHelper(session->session, handle, feed_values, out_status,
+                    out_values, run_metadata);
+  ClearDecrefCache();
+}
+
+void TF_DeprecatedSessionReleaseCallable(TF_DeprecatedSession* session,
+                                         int64_t handle,
+                                         TF_Status* out_status) {
+  Set_TF_Status_from_Status(out_status,
+                            session->session->ReleaseCallable(handle));
+}
+void TF_SessionReleaseCallable(TF_Session* session, int64_t handle,
+                               TF_Status* out_status) {
+  Set_TF_Status_from_Status(out_status,
+                            session->session->ReleaseCallable(handle));
+}
+
 // Wrapper for TF_PRunSetup that converts the arguments to appropriate types.
 // If *out_status is OK, the caller becomes the owner of *out_handle.
 void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
@@ -400,6 +550,15 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper) {
+  std::vector<TF_Operation*> control_outputs(
+      TF_OperationNumControlOutputs(oper));
+  TF_OperationGetControlOutputs(oper, control_outputs.data(),
+                                control_outputs.size());
+  return control_outputs;
+}
+
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
     TF_Output oper_out) {
   int num_consumers = TF_OperationOutputNumConsumers(oper_out);
@@ -470,15 +629,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
   TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
 }
 
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status) {
-  std::vector<int64_t> dims(num_dims);
-  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
-  return dims;
-}
-
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results) {
   int num_missing_unused_input_mappings;
@@ -493,4 +643,19 @@ std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
   return input_strs;
 }
 
+PyObject* TF_TryEvaluateConstant_wrapper(TF_Graph* graph, TF_Output output,
+                                         TF_Status* status) {
+  TF_Tensor* result_tensor;
+  bool evaluated =
+      TF_TryEvaluateConstant(graph, output, &result_tensor, status);
+  if (!evaluated || TF_GetCode(status) != TF_OK) Py_RETURN_NONE;
+
+  Safe_TF_TensorPtr safe_result_tensor(result_tensor);
+  PyObject* out;
+  Status s = TF_TensorToPyArray(std::move(safe_result_tensor), &out);
+  Set_TF_Status_from_Status(status, s);
+  if (!s.ok()) Py_RETURN_NONE;
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 29d5b28f40a7c07c199eec8c8cd85de626f6b068..cfd27c2bee990ab4e2829652a532761e674ed8e0 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -59,6 +59,31 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
                     const NameVector& target_nodes, TF_Status* out_status,
                     PyObjectVector* out_values, TF_Buffer* run_outputs);
 
+// Python wrappers for the `Session::MakeCallable()` API.
+void TF_DeprecatedSessionMakeCallable(TF_DeprecatedSession* session,
+                                      const TF_Buffer* callable_options,
+                                      int64_t* out_handle,
+                                      TF_Status* out_status);
+void TF_SessionMakeCallable(TF_Session* session,
+                            const TF_Buffer* callable_options,
+                            int64_t* out_handle, TF_Status* out_status);
+
+// Python wrappers for the `Session::RunCallable()` API.
+void TF_DeprecatedSessionRunCallable(TF_DeprecatedSession* session,
+                                     int64_t handle, PyObject* feed_values,
+                                     TF_Status* out_status,
+                                     PyObjectVector* out_values,
+                                     TF_Buffer* run_metadata);
+void TF_SessionRunCallable(TF_Session* session, int64_t handle,
+                           PyObject* feed_values, TF_Status* out_status,
+                           PyObjectVector* out_values, TF_Buffer* run_metadata);
+
+// Python wrappers for the `Session::ReleaseCallable()` API.
+void TF_DeprecatedSessionReleaseCallable(TF_DeprecatedSession* session,
+                                         int64_t handle, TF_Status* out_status);
+void TF_SessionReleaseCallable(TF_Session* session, int64_t handle,
+                               TF_Status* out_status);
+
 // Set up the graph with the intended feeds and fetches for partial run.
 // *out_handle is owned by the caller.
 //
@@ -111,8 +136,7 @@ string EqualAttrValueWrapper(const string& actual, const string& expected);
 //
 // If shape is unknown, sets unknown_shape to true.
 tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
-    TF_Graph* graph, TF_Output output, TF_Status* out_status,
-    bool* unknown_shape);
+    TF_Graph* graph, TF_Output output, TF_Status* status, bool* unknown_shape);
 
 // Runs the graph associated with the session starting with the supplied inputs.
 // On success, `py_outputs` is populated with a numpy ndarray for each output
@@ -124,7 +148,7 @@ void TF_SessionRun_wrapper(TF_Session* session, const TF_Buffer* run_options,
                            const std::vector<PyObject*>& input_ndarrays,
                            const std::vector<TF_Output>& outputs,
                            const std::vector<TF_Operation*>& targets,
-                           TF_Buffer* run_metadata, TF_Status* out_status,
+                           TF_Buffer* run_metadata, TF_Status* status,
                            std::vector<PyObject*>* py_outputs);
 
 // Set up the graph with the intended feeds (inputs) and fetches (output) for
@@ -140,8 +164,7 @@ void TF_SessionPRunSetup_wrapper(TF_Session* session,
                                  const std::vector<TF_Output>& inputs,
                                  const std::vector<TF_Output>& outputs,
                                  const std::vector<TF_Operation*>& targets,
-                                 const char** out_handle,
-                                 TF_Status* out_status);
+                                 const char** out_handle, TF_Status* status);
 
 // Continue to run the graph with additional feeds and fetches. The
 // execution state is uniquely identified by the handle.
@@ -157,7 +180,7 @@ void TF_SessionPRun_wrapper(TF_Session* session, const char* handle,
                             const std::vector<TF_Output>& inputs,
                             const std::vector<PyObject*>& input_ndarrays,
                             const std::vector<TF_Output>& outputs,
-                            TF_Status* out_status,
+                            TF_Status* status,
                             std::vector<PyObject*>* py_outputs);
 
 // Retrieves the inputs of this operation.
@@ -167,6 +190,10 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the control outputs of this operation.
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper);
+
 // Retrieves the op names of the consumers of `oper_out`. The returned strings
 // have the lifetime of the underlying TF_Graph.
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
@@ -179,7 +206,7 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
-    const char* description, TF_Status* out_status);
+    const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
 //
@@ -202,17 +229,15 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
                                     const std::vector<int64_t>& dims,
                                     bool unknown_shape, TF_Status* status);
 
-// Return the shape of output. `num_dims` should be the output of
-// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status);
-
 // Returns the string representations of the missing unused input mappings.
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results);
 
+// If evaluation was possible, returns the numpy ndarray of the evaluated
+// result. Otherwise returns None.
+PyObject* TF_TryEvaluateConstant_wrapper(TF_Graph* graph, TF_Output output,
+                                         TF_Status* status);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index 9641b8b7f2735e2e0477aec59edd539e999fa969..c046e9cfd45d7d7677a1dbab0a7168e526c89bca 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.client import timeline
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -155,9 +156,10 @@ class TimelineTest(test.TestCase):
     ctf = step_analysis.chrome_trace.format_to_string()
     self._validateTrace(ctf)
     maximums = step_analysis.allocator_maximums
-    self.assertTrue('cpu' in maximums)
+    cpuname = 'mklcpu' if test_util.IsMklEnabled() else 'cpu'
+    self.assertTrue(cpuname in maximums)
     cpu_max = maximums[
-        'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums['cpu']
+        'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums[cpuname]
     # At least num1 + num2, both float32s (4 bytes each)
     self.assertGreater(cpu_max.num_bytes, 8)
     self.assertGreater(cpu_max.timestamp, 0)
diff --git a/tensorflow/python/data/BUILD b/tensorflow/python/data/BUILD
index b5bee36dcdfd463056d0e883acb3c701509b1eee..3e08c1587e3e0df70e3cd5be58d24103c4a78339 100644
--- a/tensorflow/python/data/BUILD
+++ b/tensorflow/python/data/BUILD
@@ -15,15 +15,3 @@ py_library(
         "//tensorflow/python/data/ops:readers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 239f9b0d5923451f3967eca572b1db099d463466..7efe0948e7729c398f972977b51426d80b8cd83e 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -15,12 +15,6 @@
 """`tf.data.Dataset` API for input pipelines.
 
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
-
-@@Dataset
-@@Iterator
-@@FixedLengthRecordDataset
-@@TextLineDataset
-@@TFRecordDataset
 """
 
 from __future__ import absolute_import
@@ -34,6 +28,3 @@ from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
 from tensorflow.python.data.ops.readers import TextLineDataset
 from tensorflow.python.data.ops.readers import TFRecordDataset
 # pylint: enable=unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-remove_undocumented(__name__)
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 8b8adefa65a5c54d40bc28d8f50953513cfd3605..ed0c11e6c117dcbb810fd3acfc484128ed3519fa 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -367,15 +367,3 @@ tf_py_test(
         "no_windows",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
index 02720a2e985914d3a6774dc6f64d1316890c46bf..25269dc810ae2e3107f8b5317496a35a8ff59d0c 100644
--- a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
@@ -297,6 +297,21 @@ class MemoryCacheDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(i2.get_next())
 
+  def testCacheTakeRepeat(self):
+    dataset = dataset_ops.Dataset.range(10).cache().take(5).repeat(2)
+    itr = dataset.make_one_shot_iterator()
+    n = itr.get_next()
+
+    expected_values = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
+
+    with self.test_session() as sess:
+      for i, expected in enumerate(expected_values):
+        self.assertEqual(expected, sess.run(n),
+                         "Unexpected value at index %s" % i)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
index 14627810b57f68fd96e3e3cc7b51b4fbf7365299..ea5b41e5d819743ad03f3148d654329aea51dab7 100644
--- a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
@@ -263,7 +263,7 @@ class DatasetConstructorTest(test.TestCase):
       for i in range(3):
         results = sess.run(get_next)
         for component, result_component in zip(
-            (zip(*components[:3])[i] + expected[i]), results):
+            (list(zip(*components[:3]))[i] + expected[i]), results):
           if sparse_tensor.is_sparse(component):
             self.assertSparseValuesEqual(component, result_component)
           else:
diff --git a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
index f129d07b57b96b7869c84467aeb2276c93531ef8..6aabad2f574551cbdc152fe378eb9dc0f5f71995 100644
--- a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
@@ -21,9 +21,12 @@ import threading
 
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
@@ -302,6 +305,89 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testFromGeneratorStopShort(self):
+
+    def generator():
+      yield 0
+      yield 1
+      yield 2
+
+    iterator = (
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(0, sess.run(get_next))
+      self.assertAllEqual(1, sess.run(get_next))
+
+  def testFromGeneratorDestructorCalled(self):
+    # Use an `Event` to signal that the generator has been deleted.
+    event = threading.Event()
+
+    class GeneratorWrapper(object):
+
+      def __iter__(self):
+        return self
+
+      def next(self):
+        return self.__next__()
+
+      def __next__(self):
+        return 42
+
+      def __del__(self):
+        event.set()
+
+    iterator = dataset_ops.Dataset.from_generator(
+        GeneratorWrapper,
+        output_types=dtypes.int64).take(2).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(42, sess.run(get_next))
+      self.assertAllEqual(42, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      # Test that `GeneratorWrapper` object is destroyed when the
+      # iterator terminates (and the generator iterator is deleted).
+      self.assertTrue(event.is_set())
+
+  def testGeneratorDatasetFinalizeFunctionCalled(self):
+    # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
+    # which affords more control over what the finalize function can do than
+    # the `Dataset.from_generator()` wrapper.
+
+    # Use an `Event` to signal that the generator has been deleted.
+    event = threading.Event()
+
+    def finalize_fn(_):
+      def finalize_py_func():
+        event.set()
+        return 0
+      return script_ops.py_func(finalize_py_func, [], [dtypes.int64],
+                                stateful=True)
+
+    dummy = constant_op.constant(37)
+    iterator = (dataset_ops._GeneratorDataset(dummy, lambda x: x,
+                                              lambda x: x, finalize_fn)
+                .take(2)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(37, sess.run(get_next))
+      self.assertAllEqual(37, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+        self.assertTrue(event.is_set())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index b9258b720edd4ecd620c61eed18f6f975cb7f439..4f2216f0a340acb582c2d09523b0c78af99bdd90 100644
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -17,11 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
+
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
@@ -156,6 +160,65 @@ class FilterDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testReturnComponent(self):
+    iterator = (
+        dataset_ops.Dataset.zip(
+            (dataset_ops.Dataset.range(10),
+             dataset_ops.Dataset.from_tensors(True).repeat(None)))
+        .filter(lambda x, y: y).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, True), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testParallelFilters(self):
+    dataset = dataset_ops.Dataset.range(10).filter(
+        lambda x: math_ops.equal(x % 2, 0))
+    iterators = [dataset.make_one_shot_iterator() for _ in range(10)]
+    next_elements = [iterator.get_next() for iterator in iterators]
+    with self.test_session() as sess:
+      self.assertEqual([0 for _ in range(10)], sess.run(next_elements))
+
+
+class FilterDatasetBenchmark(test.Benchmark):
+
+  def _benchmark(self, predicate, name):
+    with ops.Graph().as_default():
+      dataset = (
+          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(5):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        print("Filter dataset using %s. Median wall time: %f" %
+              (name, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name="benchmark_filter_dataset_%s" % name)
+
+  def benchmarkSimpleFunction(self):
+    self._benchmark(array_ops.identity, "simple_function")
+
+  def benchmarkReturnComponentOptimization(self):
+    self._benchmark(lambda x: x, "return_component")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 23c6d7385f8d4a12019fa514f349f2598d9629de..820c167b6bb9dc3b1c25d9c6156cef17ad20eb1b 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -22,11 +22,13 @@ import warnings
 
 import numpy as np
 
+from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -44,27 +46,26 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
 
 
 class IteratorTest(test.TestCase):
 
-  def testAttemptingGradientsRaiseExceptions(self):
-    component = constant_op.constant([1])
-    side = constant_op.constant(0)
+  def testNoGradients(self):
+    component = constant_op.constant([1.])
+    side = constant_op.constant(0.)
     add = lambda x: x + side
     dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
     value = dataset.make_one_shot_iterator().get_next()
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, component)
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, side)
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, [component, side])
+    self.assertIsNone(gradients_impl.gradients(value, component)[0])
+    self.assertIsNone(gradients_impl.gradients(value, side)[0])
+    self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
 
   def testCapturingStateInOneShotRaisesException(self):
     var = variables.Variable(37.0, name="myvar")
-    dataset = (dataset_ops.Dataset.from_tensor_slices([0.0, 1.0, 2.0])
-               .map(lambda x: x + var))
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices([0.0, 1.0, 2.0])
+        .map(lambda x: x + var))
     with self.assertRaisesRegexp(
         ValueError, r"`Dataset.make_one_shot_iterator\(\)` does not support "
         "datasets that capture stateful objects.+myvar"):
@@ -78,8 +79,9 @@ class IteratorTest(test.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-                .repeat(14).make_one_shot_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(14).make_one_shot_iterator())
     get_next = iterator.get_next()
 
     self.assertEqual([c.shape[1:] for c in components],
@@ -103,8 +105,9 @@ class IteratorTest(test.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(tensor_components)
-                .map(_map_fn).repeat(14).make_one_shot_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(tensor_components)
+        .map(_map_fn).repeat(14).make_one_shot_iterator())
     get_next = iterator.get_next()
 
     self.assertEqual([c.shape[1:] for c in components],
@@ -125,10 +128,13 @@ class IteratorTest(test.TestCase):
                   np.array(37.0) * np.arange(7))
 
     def within_container():
+
       def _map_fn(x, y, z):
         return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-      iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                  .map(_map_fn).repeat(14).make_one_shot_iterator())
+
+      iterator = (
+          dataset_ops.Dataset.from_tensor_slices(components)
+          .map(_map_fn).repeat(14).make_one_shot_iterator())
       return iterator.get_next()
 
     server = server_lib.Server.create_local_server()
@@ -159,8 +165,8 @@ class IteratorTest(test.TestCase):
 
     # Create a session with a single thread to ensure that the
     # one-shot iterator initializer does not deadlock.
-    config = config_pb2.ConfigProto(inter_op_parallelism_threads=1,
-                                    use_per_session_threads=True)
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, use_per_session_threads=True)
     with session.Session(config=config) as sess:
       self.assertAllEqual([1, 4, 9], sess.run(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -169,6 +175,7 @@ class IteratorTest(test.TestCase):
     # Test with multiple threads invoking the one-shot iterator concurrently.
     with session.Session(config=config) as sess:
       results = []
+
       def consumer_thread():
         try:
           results.append(sess.run(next_element))
@@ -177,7 +184,8 @@ class IteratorTest(test.TestCase):
 
       num_threads = 8
       threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)]
+          self.checkedThread(consumer_thread) for _ in range(num_threads)
+      ]
       for t in threads:
         t.start()
       for t in threads:
@@ -205,24 +213,24 @@ class IteratorTest(test.TestCase):
         sess.run(next_element)
 
     with self.test_session() as sess:
+
       def consumer_thread():
         with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
           sess.run(next_element)
 
       num_threads = 8
       threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)]
+          self.checkedThread(consumer_thread) for _ in range(num_threads)
+      ]
       for t in threads:
         t.start()
       for t in threads:
         t.join()
 
   def testSimpleSharedResource(self):
-    components = (
-        np.array(1, dtype=np.int64),
-        np.array([1, 2, 3], dtype=np.int64),
-        np.array(37.0, dtype=np.float64)
-    )
+    components = (np.array(1, dtype=np.int64),
+                  np.array([1, 2, 3], dtype=np.int64),
+                  np.array(37.0, dtype=np.float64))
 
     server = server_lib.Server.create_local_server()
 
@@ -231,9 +239,10 @@ class IteratorTest(test.TestCase):
     # first session (initializing the iterator) is visible in the
     # second session.
     with ops.Graph().as_default():
-      iterator = (dataset_ops.Dataset.from_tensors(components)
-                  .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
-                      shared_name="shared_iterator"))
+      iterator = (
+          dataset_ops.Dataset.from_tensors(components)
+          .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
+              shared_name="shared_iterator"))
       init_op = iterator.initializer
       get_next = iterator.get_next()
 
@@ -269,8 +278,9 @@ class IteratorTest(test.TestCase):
 
   def testNotInitializedError(self):
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
     get_next = iterator.get_next()
 
     with self.test_session() as sess:
@@ -320,8 +330,8 @@ class IteratorTest(test.TestCase):
   def testReinitializableIteratorStaticErrors(self):
     # Non-matching structure for types and shapes.
     with self.assertRaises(TypeError):
-      iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
-                                                       dtypes.float64), [None])
+      iterator = iterator_ops.Iterator.from_structure(
+          (dtypes.int64, dtypes.float64), [None])
 
     # Test validation of dataset argument.
     iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
@@ -337,18 +347,18 @@ class IteratorTest(test.TestCase):
     # Incompatible types.
     with self.assertRaises(TypeError):
       iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int32), constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float32))))
+          dataset_ops.Dataset.from_tensors(
+              (constant_op.constant([1, 2, 3], dtype=dtypes.int32),
+               constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float32))))
 
     # Incompatible shapes.
     iterator = iterator_ops.Iterator.from_structure(
         (dtypes.int64, dtypes.float64), ([None], []))
     with self.assertRaises(TypeError):
       iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int64), constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float64))))
+          dataset_ops.Dataset.from_tensors(
+              (constant_op.constant([1, 2, 3], dtype=dtypes.int64),
+               constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float64))))
 
   def testIteratorStringHandle(self):
     dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
@@ -370,33 +380,40 @@ class IteratorTest(test.TestCase):
       iterator_3_handle = sess.run(iterator_3.string_handle())
       iterator_4_handle = sess.run(iterator_4.string_handle())
 
-      self.assertEqual(
-          10, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          1, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          20, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          2, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          30, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          3, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          40, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(10,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(1,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(20,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(2,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(30,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(3,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(40,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element,
-                 feed_dict={handle_placeholder: iterator_3_handle})
+        sess.run(
+            next_element, feed_dict={handle_placeholder: iterator_3_handle})
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element,
-                 feed_dict={handle_placeholder: iterator_4_handle})
+        sess.run(
+            next_element, feed_dict={handle_placeholder: iterator_4_handle})
 
   def testIteratorStringHandleReuseTensorObject(self):
     dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
@@ -427,8 +444,8 @@ class IteratorTest(test.TestCase):
     self.assertIsNot(handle_with_name, handle_with_same_name)
 
   def testIteratorStringHandleError(self):
-    dataset_int_scalar = (dataset_ops.Dataset.from_tensor_slices([1, 2,
-                                                                  3]).repeat())
+    dataset_int_scalar = (
+        dataset_ops.Dataset.from_tensor_slices([1, 2, 3]).repeat())
     dataset_float_vector = (dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]))
 
     handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
@@ -522,6 +539,58 @@ class IteratorTest(test.TestCase):
                 target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
             })
 
+  def testRemoteIteratorUsingRemoteCallOpMultiWorkers(self):
+    s1 = server_lib.Server.create_local_server()
+    s2 = server_lib.Server.create_local_server()
+    s3 = server_lib.Server.create_local_server()
+
+    cluster_def = cluster_pb2.ClusterDef()
+    workers = cluster_def.job.add()
+    workers.name = "worker"
+    workers.tasks[0] = s1.target[len("grpc://"):]
+    workers.tasks[1] = s2.target[len("grpc://"):]
+    client = cluster_def.job.add()
+    client.name = "client"
+    client.tasks[0] = s3.target[len("grpc://"):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    worker_devices = [
+        "/job:worker/replica:0/task:%d/cpu:0" % i for i in range(2)
+    ]
+    itr_handles = []
+    for device in worker_devices:
+      with ops.device(device):
+        src = dataset_ops.Dataset.from_tensor_slices([device])
+        itr = src.make_one_shot_iterator()
+        itr_handles.append(itr.string_handle())
+
+    targets = dataset_ops.Dataset.from_tensor_slices(worker_devices)
+    handles = dataset_ops.Dataset.from_tensor_slices(itr_handles)
+
+    @function.Defun(dtypes.string)
+    def loading_func(h):
+      remote_itr = iterator_ops.Iterator.from_string_handle(
+          h, itr.output_types, itr.output_shapes)
+      return remote_itr.get_next()
+
+    def map_fn(target, handle):
+      return functional_ops.remote_call(
+          args=[handle], Tout=[dtypes.string], f=loading_func, target=target)
+
+    with ops.device("/job:client"):
+      client_dataset = dataset_ops.Dataset.zip((targets, handles)).map(map_fn)
+      itr = client_dataset.make_initializable_iterator()
+      n = itr.get_next()
+
+    with session.Session(s3.target, config=config) as sess:
+      sess.run(itr.initializer)
+      expected_values = worker_devices
+      for expected in expected_values:
+        self.assertEqual((compat.as_bytes(expected),), sess.run(n))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(n)
+
   def testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -641,12 +710,19 @@ class IteratorTest(test.TestCase):
     with warnings.catch_warnings(record=True) as w:
       for _ in range(100):
         iterator.get_next()
-    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD,
-                     len(w))
+    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD, len(w))
     for warning in w:
       self.assertTrue(
           iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
 
+  def testEagerIteratorAsync(self):
+    with context.eager_mode(), context.execution_mode(context.ASYNC):
+      val = 0
+      dataset = dataset_ops.Dataset.range(10)
+      for foo in dataset:
+        self.assertEqual(val, foo.numpy())
+        val += 1
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
index 4e7691ee8144a19a62476281d86fb5df46dd3e4b..f7d7d085c974fa217ed30708723cb1b887034ca0 100644
--- a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
@@ -46,8 +46,9 @@ class ListFilesDatasetOpTest(test.TestCase):
     dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
     with self.test_session() as sess:
       itr = dataset.make_one_shot_iterator()
+      next_element = itr.get_next()
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
+        sess.run(next_element)
 
   def testSimpleDirectory(self):
     filenames = ['a', 'b', 'c']
@@ -56,29 +57,79 @@ class ListFilesDatasetOpTest(test.TestCase):
     dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
     with self.test_session() as sess:
       itr = dataset.make_one_shot_iterator()
+      next_element = itr.get_next()
 
       full_filenames = []
       produced_filenames = []
       for filename in filenames:
         full_filenames.append(
             compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
       self.assertItemsEqual(full_filenames, produced_filenames)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(itr.get_next())
 
+  def testSimpleDirectoryNotShuffled(self):
+    filenames = ['b', 'c', 'a']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False)
+    with self.test_session() as sess:
+      itr = dataset.make_one_shot_iterator()
+      next_element = itr.get_next()
+
+      for filename in sorted(filenames):
+        self.assertEqual(compat.as_bytes(path.join(self.tmp_dir, filename)),
+                         sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testFixedSeedResultsInRepeatableOrder(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
+    with self.test_session() as sess:
+      itr = dataset.make_initializable_iterator()
+      next_element = itr.get_next()
+
+      full_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename))
+                        for filename in filenames]
+
+      all_produced_filenames = []
+      for _ in range(3):
+        produced_filenames = []
+        sess.run(itr.initializer)
+        try:
+          while True:
+            produced_filenames.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+        all_produced_filenames.append(produced_filenames)
+
+      # Each run should produce the same set of filenames, which may be
+      # different from the order of `full_filenames`.
+      self.assertItemsEqual(full_filenames, all_produced_filenames[0])
+      # However, the different runs should produce filenames in the same order
+      # as each other.
+      self.assertEqual(all_produced_filenames[0], all_produced_filenames[1])
+      self.assertEqual(all_produced_filenames[0], all_produced_filenames[2])
+
   def testEmptyDirectoryInitializer(self):
     filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
     dataset = dataset_ops.Dataset.list_files(filename_placeholder)
 
     with self.test_session() as sess:
       itr = dataset.make_initializable_iterator()
+      next_element = itr.get_next()
       sess.run(
           itr.initializer,
           feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
+        sess.run(next_element)
 
   def testSimpleDirectoryInitializer(self):
     filenames = ['a', 'b', 'c']
@@ -89,6 +140,7 @@ class ListFilesDatasetOpTest(test.TestCase):
 
     with self.test_session() as sess:
       itr = dataset.make_initializable_iterator()
+      next_element = itr.get_next()
       sess.run(
           itr.initializer,
           feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
@@ -98,7 +150,7 @@ class ListFilesDatasetOpTest(test.TestCase):
       for filename in filenames:
         full_filenames.append(
             compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
 
       self.assertItemsEqual(full_filenames, produced_filenames)
 
@@ -114,6 +166,7 @@ class ListFilesDatasetOpTest(test.TestCase):
 
     with self.test_session() as sess:
       itr = dataset.make_initializable_iterator()
+      next_element = itr.get_next()
       sess.run(
           itr.initializer,
           feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py')})
@@ -123,7 +176,7 @@ class ListFilesDatasetOpTest(test.TestCase):
       for filename in filenames[1:-1]:
         full_filenames.append(
             compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
       self.assertItemsEqual(full_filenames, produced_filenames)
 
       with self.assertRaises(errors.OutOfRangeError):
@@ -138,6 +191,7 @@ class ListFilesDatasetOpTest(test.TestCase):
 
     with self.test_session() as sess:
       itr = dataset.make_initializable_iterator()
+      next_element = itr.get_next()
       sess.run(
           itr.initializer,
           feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py*')})
@@ -147,13 +201,44 @@ class ListFilesDatasetOpTest(test.TestCase):
       for filename in filenames[1:]:
         full_filenames.append(
             compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
 
       self.assertItemsEqual(full_filenames, produced_filenames)
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(itr.get_next())
 
+  def testNoShuffle(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    # Repeat the list twice and ensure that the order is the same each time.
+    # NOTE(mrry): This depends on an implementation detail of `list_files()`,
+    # which is that the list of files is captured when the iterator is
+    # initialized. Otherwise, or if e.g. the iterator were initialized more than
+    # once, it's possible that the non-determinism of `tf.matching_files()`
+    # would cause this test to fail. However, it serves as a useful confirmation
+    # that the `shuffle=False` argument is working as intended.
+    # TODO(b/73959787): Provide some ordering guarantees so that this test is
+    # more meaningful.
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False).repeat(2)
+    with self.test_session() as sess:
+      itr = dataset.make_one_shot_iterator()
+      next_element = itr.get_next()
+
+      full_filenames = []
+      produced_filenames = []
+      for filename in filenames * 2:
+        full_filenames.append(
+            compat.as_bytes(path.join(self.tmp_dir, filename)))
+        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+      self.assertItemsEqual(full_filenames, produced_filenames)
+      self.assertEqual(produced_filenames[:len(filenames)],
+                       produced_filenames[len(filenames):])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 04d1abdb254feea1df6f1b8cfc5a512802107224..1ad0b9de5e76e3edd66303ab4666108f43a27428 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -602,6 +602,42 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testParallelMapOutOfRangeError(self):
+    def raising_py_func(i):
+      if i == 100:
+        raise StopIteration()
+      else:
+        return i
+
+    iterator = (
+        dataset_ops.Dataset.range(105)
+        .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
+             num_parallel_calls=2)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(100):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testConstantOutput(self):
+    iterator = (
+        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, b"hello", 10), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
index d7140088c310767d40bd2cf3413c899375acab15..1ddedfda4e1c9d6b6949f796be1870f167435763 100644
--- a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
@@ -21,6 +21,7 @@ import gzip
 import os
 import zlib
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.framework import constant_op
@@ -736,12 +737,43 @@ class TFRecordDatasetTest(test.TestCase):
     one_mebibyte = 2**20
     d = readers.TFRecordDataset(self.test_filenames, buffer_size=one_mebibyte)
     iterator = d.make_one_shot_iterator()
+    next_element = iterator.get_next()
     with self.test_session() as sess:
       for j in range(self._num_files):
         for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(iterator.get_next()))
+          self.assertAllEqual(self._record(j, i), sess.run(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+        sess.run(next_element)
+
+  def testReadFromDatasetOfFiles(self):
+    files = dataset_ops.Dataset.from_tensor_slices(self.test_filenames)
+    d = readers.TFRecordDataset(files)
+    iterator = d.make_one_shot_iterator()
+    next_element = iterator.get_next()
+    with self.test_session() as sess:
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertAllEqual(self._record(j, i), sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testReadTenEpochsFromDatasetOfFilesInParallel(self):
+    files = dataset_ops.Dataset.from_tensor_slices(
+        self.test_filenames).repeat(10)
+    d = readers.TFRecordDataset(files, num_parallel_reads=4)
+    iterator = d.make_one_shot_iterator()
+    next_element = iterator.get_next()
+    expected = []
+    actual = []
+    with self.test_session() as sess:
+      for _ in range(10):
+        for j in range(self._num_files):
+          for i in range(self._num_records):
+            expected.append(self._record(j, i))
+            actual.append(sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self.assertEqual(sorted(expected), sorted(actual))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
index c089fb08c1082c1cf74d492796550980d6755591..5fcc48831f3ca744e015c92760f12ea4dbef2ff7 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
@@ -132,6 +132,33 @@ class ShuffleDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testSeedZero(self):
+    """Test for same behavior when the seed is a Python or Tensor zero."""
+    iterator = (
+        dataset_ops.Dataset.range(10).shuffle(10, seed=0)
+        .make_one_shot_iterator())
+    get_next = iterator.get_next()
+
+    elems = []
+    with self.test_session() as sess:
+      for _ in range(10):
+        elems.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = (
+        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder)
+        .make_initializable_iterator())
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer, feed_dict={seed_placeholder: 0})
+      for elem in elems:
+        self.assertEqual(elem, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testDefaultArguments(self):
     components = [0, 1, 2, 3, 4]
     iterator = (dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index f12b358a7dc35c18338171e489fa88ba1a82d11b..fa2e86eab18b0b97ea01a96e309b0ea82d91b267 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -23,6 +23,7 @@ py_library(
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/data/util:sparse",
         "//third_party/py/numpy",
     ],
@@ -34,6 +35,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_ops",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -50,20 +52,10 @@ py_library(
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/eager:context",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index b665443b7acb9eb266b6fcf36a002cfce54875f1..bd9686f692102df4ef64f0e81c33d2dec1b2222c 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -26,16 +26,17 @@ import six
 
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
@@ -90,7 +91,7 @@ class Dataset(object):
     Raises:
       RuntimeError: If eager execution is enabled.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError(
           "dataset.make_initializable_iterator is not supported when eager "
           "execution is enabled.")
@@ -110,6 +111,24 @@ class Dataset(object):
                                  self.output_types, self.output_shapes,
                                  self.output_classes)
 
+  def __iter__(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    The returned iterator implements the Python iterator protocol and therefore
+    can only be used in eager mode.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is not enabled.
+    """
+    if context.executing_eagerly():
+      return iterator_ops.EagerIterator(self)
+    else:
+      raise RuntimeError("dataset.__iter__() is only supported when eager "
+                         "execution is enabled.")
+
   def make_one_shot_iterator(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
 
@@ -118,14 +137,9 @@ class Dataset(object):
 
     Returns:
       An `Iterator` over the elements of this dataset.
-
-    Raises:
-      RuntimeError: If eager execution is enabled.
     """
-    if context.in_eager_mode():
-      raise RuntimeError(
-          "dataset.make_one_shot_iterator is not supported when eager "
-          "execution is enabled.")
+    if context.executing_eagerly():
+      return iterator_ops.EagerIterator(self)
     # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
     # a 0-argument function.
     @function.Defun(capture_by_value=True)
@@ -331,10 +345,10 @@ class Dataset(object):
 
     generator_state = Dataset._GeneratorState(generator)
 
-    def get_iterator_id_map_fn(unused_dummy):
+    def get_iterator_id_fn(unused_dummy):
       """Creates a unique `iterator_id` for each pass over the dataset.
 
-      The "iterator_id" disambiguates between multiple concurrently
+      The returned `iterator_id` disambiguates between multiple concurrently
       existing iterators.
 
       Args:
@@ -347,7 +361,7 @@ class Dataset(object):
       return script_ops.py_func(
           generator_state.get_next_id, [], dtypes.int64, stateful=True)
 
-    def generator_map_fn(iterator_id_t):
+    def generator_next_fn(iterator_id_t):
       """Generates the next element from iterator with ID `iterator_id_t`.
 
       We map this function across an infinite repetition of the
@@ -363,11 +377,9 @@ class Dataset(object):
 
       def generator_py_func(iterator_id):
         """A `py_func` that will be called to invoke the iterator."""
-        try:
-          values = next(generator_state.get_iterator(iterator_id))
-        except StopIteration:
-          generator_state.iterator_completed(iterator_id)
-          raise StopIteration("Iteration finished.")
+        # `next()` raises `StopIteration` when there are no more
+        # elements remaining to be generated.
+        values = next(generator_state.get_iterator(iterator_id))
 
         # Use the same _convert function from the py_func() implementation to
         # convert the returned values to arrays early, so that we can inspect
@@ -408,17 +420,31 @@ class Dataset(object):
 
       return nest.pack_sequence_as(output_types, flat_values)
 
+    def finalize_fn(iterator_id_t):
+      """Releases host-side state for the iterator with ID `iterator_id_t`."""
+
+      def finalize_py_func(iterator_id):
+        generator_state.iterator_completed(iterator_id)
+        # We return a dummy value so that the `finalize_fn` has a valid
+        # signature.
+        # NOTE(mrry): Explicitly create an array of `np.int64` because implicit
+        # casting in `py_func()` will create an array of `np.int32` on Windows,
+        # leading to a runtime error.
+        return np.array(0, dtype=np.int64)
+
+      return script_ops.py_func(
+          finalize_py_func, [iterator_id_t], dtypes.int64, stateful=True)
+
     # This function associates each traversal of `generator` with a unique
     # iterator ID.
-    def flat_map_fn(iterator_id_t):
-      # First, generate an infinite dataset containing the iterator ID repeated
-      # forever.
-      repeated_id = Dataset.from_tensors(iterator_id_t).repeat(None)
-
-      # The `generator_map_fn` gets the next element from the iterator with the
-      # relevant ID, and raises StopIteration when that iterator contains no
+    def flat_map_fn(dummy_arg):
+      # The `get_iterator_id_fn` gets a unique ID for the current instance of
+      # of the generator.
+      # The `generator_next_fn` gets the next element from the iterator with the
+      # given ID, and raises StopIteration when that iterator contains no
       # more elements.
-      return repeated_id.map(generator_map_fn)
+      return _GeneratorDataset(dummy_arg, get_iterator_id_fn, generator_next_fn,
+                               finalize_fn)
 
     # A single-element dataset that, each time it is evaluated, contains a
     # freshly-generated and unique (for the returned dataset) int64
@@ -426,7 +452,7 @@ class Dataset(object):
     # is encapsulated in `generator_state`, and captured in
     # `get_iterator_id_map_fn`.
     dummy = 0
-    id_dataset = Dataset.from_tensors(dummy).map(get_iterator_id_map_fn)
+    id_dataset = Dataset.from_tensors(dummy)
 
     # A dataset that contains all of the elements generated by a
     # single iterator created from `generator`, identified by the
@@ -537,7 +563,7 @@ class Dataset(object):
 
     Args:
       buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        maximum number elements that will be buffered when prefetching.
+        maximum number of elements that will be buffered when prefetching.
 
     Returns:
       Dataset: A `Dataset`.
@@ -545,9 +571,13 @@ class Dataset(object):
     return PrefetchDataset(self, buffer_size)
 
   @staticmethod
-  def list_files(file_pattern):
+  def list_files(file_pattern, shuffle=None, seed=None):
     """A dataset of all files matching a pattern.
 
+    NOTE: The default behavior of this method is to return filenames in
+    a non-deterministic random shuffled order. Pass a `seed` or `shuffle=False`
+    to get results in a deterministic order.
+
     Example:
       If we had the following files on our filesystem:
         - /path/to/dir/a.txt
@@ -558,16 +588,29 @@ class Dataset(object):
         - /path/to/dir/b.py
         - /path/to/dir/c.py
 
-    NOTE: The order of the file names returned can be non-deterministic.
-
     Args:
       file_pattern: A string or scalar string `tf.Tensor`, representing
         the filename pattern that will be matched.
+      shuffle: (Optional.) If `True`, the file names will be shuffled randomly.
+        Defaults to `True`.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        random seed that will be used to create the distribution. See
+        @{tf.set_random_seed} for behavior.
 
     Returns:
      Dataset: A `Dataset` of strings corresponding to file names.
     """
-    return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
+    if shuffle is None:
+      shuffle = True
+    matching_files = gen_io_ops.matching_files(file_pattern)
+    dataset = Dataset.from_tensor_slices(matching_files)
+    if shuffle:
+      # NOTE(mrry): The shuffle buffer size must be greater than zero, but the
+      # list of files might be empty.
+      buffer_size = math_ops.maximum(
+          array_ops.shape(matching_files, out_type=dtypes.int64)[0], 1)
+      dataset = dataset.shuffle(buffer_size, seed=seed)
+    return dataset
 
   def repeat(self, count=None):
     """Repeats this dataset `count` times.
@@ -746,11 +789,31 @@ class Dataset(object):
   def padded_batch(self, batch_size, padded_shapes, padding_values=None):
     """Combines consecutive elements of this dataset into padded batches.
 
-    Like `Dataset.dense_to_sparse_batch()`, this method combines
-    multiple consecutive elements of this dataset, which might have
-    different shapes, into a single element. The tensors in the
-    resulting element have an additional outer dimension, and are
-    padded to the respective shape in `padded_shapes`.
+    This transformation combines multiple consecutive elements of the input
+    dataset into a single element. Like @{tf.data.Dataset.batch}, the tensors
+    in the resulting element have an additional outer dimension, which will be
+    `batch_size` for all but the last element, and `N % batch_size` for the
+    last element (where `N` is the number of elements in this dataset). Unlike
+    @{tf.data.Dataset.batch}, the elements may have different shapes for some
+    of their components, and this transformation will pad each component to
+    the respective shape in `padding_shapes`. The `padding_shapes` argument
+    determines the resulting shape for each dimension of each component in an
+    output element:
+
+    * If the dimension is a constant (e.g. `tf.Dimension(37)`), the component
+      will be padded out to that length in that dimension.
+    * If the dimension is unknown (e.g. `tf.Dimension(None)`), the component
+      will be padded out to the maximum length of all elements in that
+      dimension.
+
+    NOTE: If the number of elements (`N`) in this dataset is not an exact
+    multiple of `batch_size`, the final batch contain smaller tensors with
+    shape `N % batch_size` in the batch dimension. If your program depends on
+    the batches having the same shape, consider using the
+    @{tf.contrib.data.padded_batch_and_drop_remainder} transformation instead.
+
+    See also @{tf.contrib.data.dense_to_sparse_batch}, which combines elements
+    that may have different shapes into a @{tf.SparseTensor}.
 
     Args:
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
@@ -1033,6 +1096,196 @@ class SparseTensorSliceDataset(Dataset):
     return (dtypes.int64, self._sparse_tensor.dtype, dtypes.int64)
 
 
+class _GeneratorDataset(Dataset):
+  """A `Dataset` that generates elements by invoking a function."""
+
+  def __init__(self, init_args, init_func, next_func, finalize_func):
+    """Constructs a `_GeneratorDataset`.
+
+    Args:
+      init_args: A nested structure representing the arguments to `init_func`.
+      init_func: A TensorFlow function that will be called on `init_args` each
+        time a C++ iterator over this dataset is constructed. Returns a nested
+        structure representing the "state" of the dataset.
+      next_func: A TensorFlow function that will be called on the result of
+        `init_func` to produce each element, and that raises `OutOfRangeError`
+        to terminate iteration.
+      finalize_func: A TensorFlow function that will be called on the result of
+        `init_func` immediately before a C++ iterator over this dataset is
+        destroyed. The return value is ignored.
+    """
+    super(_GeneratorDataset, self).__init__()
+    # These members will be initialized by `tf_init_func`.
+    self._state_classes = None
+    self._state_shapes = None
+    self._state_types = None
+
+    self._init_args = init_args
+
+    init_args_classes = sparse.get_classes(init_args)
+    init_args_shapes = nest.pack_sequence_as(
+        init_args, [t.get_shape() for t in nest.flatten(init_args)])
+    init_args_types = nest.pack_sequence_as(
+        init_args, [t.dtype for t in nest.flatten(init_args)])
+
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(init_args_types, init_args_classes)))
+    def tf_init_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      dense_shapes = sparse.as_dense_shapes(init_args_shapes, init_args_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(init_args_classes, args)
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, init_args_types, init_args_shapes, init_args_classes)
+      if _should_unpack_args(nested_args):
+        ret = init_func(*nested_args)
+      else:
+        ret = init_func(nested_args)
+
+      # If `init_func` returns a list of tensors, `nest.flatten()` and
+      # `ops.convert_to_tensor()` would conspire to attempt to stack
+      # those tensors into a single tensor, because the customized
+      # version of `nest.flatten()` does not recurse into lists. Since
+      # it is more likely that the list arose from returning the
+      # result of an operation (such as `tf.py_func()`) that returns a
+      # list of not-necessarily-stackable tensors, we treat the
+      # returned value is a `tuple` instead. A user wishing to pack
+      # the return value into a single tensor can use an explicit
+      # `tf.stack()` before returning.
+      if isinstance(ret, list):
+        ret = tuple(ret)
+
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
+      ret = nest.pack_sequence_as(ret, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
+      ])
+
+      self._state_classes = sparse.get_classes(ret)
+      self._state_shapes = nest.pack_sequence_as(
+          ret, [t.get_shape() for t in nest.flatten(ret)])
+      self._state_types = nest.pack_sequence_as(
+          ret, [t.dtype for t in nest.flatten(ret)])
+
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
+      return nest.flatten(ret)
+
+    self._init_func = tf_init_func
+    self._init_func.add_to_graph(ops.get_default_graph())
+
+    # These members will be initialized by `tf_next_func`.
+    self._output_classes = None
+    self._output_shapes = None
+    self._output_types = None
+
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(self._state_types, self._state_classes)))
+    def tf_next_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      dense_shapes = sparse.as_dense_shapes(self._state_shapes,
+                                            self._state_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(self._state_classes, args)
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, self._state_types, self._state_shapes,
+          self._state_classes)
+      if _should_unpack_args(nested_args):
+        ret = next_func(*nested_args)
+      else:
+        ret = next_func(nested_args)
+
+      # If `next_func` returns a list of tensors, `nest.flatten()` and
+      # `ops.convert_to_tensor()` would conspire to attempt to stack
+      # those tensors into a single tensor, because the customized
+      # version of `nest.flatten()` does not recurse into lists. Since
+      # it is more likely that the list arose from returning the
+      # result of an operation (such as `tf.py_func()`) that returns a
+      # list of not-necessarily-stackable tensors, we treat the
+      # returned value is a `tuple` instead. A user wishing to pack
+      # the return value into a single tensor can use an explicit
+      # `tf.stack()` before returning.
+      if isinstance(ret, list):
+        ret = tuple(ret)
+
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
+      ret = nest.pack_sequence_as(ret, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
+      ])
+
+      self._output_classes = sparse.get_classes(ret)
+      self._output_shapes = nest.pack_sequence_as(
+          ret, [t.get_shape() for t in nest.flatten(ret)])
+      self._output_types = nest.pack_sequence_as(
+          ret, [t.dtype for t in nest.flatten(ret)])
+
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
+      return nest.flatten(ret)
+
+    self._next_func = tf_next_func
+    self._next_func.add_to_graph(ops.get_default_graph())
+
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(self._state_types, self._state_classes)))
+    def tf_finalize_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the state.
+      dense_shapes = sparse.as_dense_shapes(self._state_shapes,
+                                            self._state_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(self._state_classes, args)
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, self._state_types, self._state_shapes,
+          self._state_classes)
+      if _should_unpack_args(nested_args):
+        return finalize_func(*nested_args)
+      else:
+        return finalize_func(nested_args)
+
+    self._finalize_func = tf_finalize_func
+    self._finalize_func.add_to_graph(ops.get_default_graph())
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.generator_dataset(
+        nest.flatten(self._init_args) + self._init_func.captured_inputs,
+        self._next_func.captured_inputs,
+        self._finalize_func.captured_inputs,
+        init_func=self._init_func,
+        next_func=self._next_func,
+        finalize_func=self._finalize_func,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
 class ZipDataset(Dataset):
   """A `Dataset` that zips its inputs together."""
 
@@ -1282,16 +1535,7 @@ class ShuffleDataset(Dataset):
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
-    seed, seed2 = random_seed.get_seed(seed)
-    if seed is None:
-      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
-    else:
-      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
-    if seed2 is None:
-      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
-    else:
-      self._seed2 = ops.convert_to_tensor(
-          seed2, dtype=dtypes.int64, name="seed2")
+    self._seed, self._seed2 = random_seed.get_seed(seed)
     if reshuffle_each_iteration is None:
       self._reshuffle_each_iteration = True
     else:
@@ -1574,10 +1818,12 @@ class MapDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._output_classes = sparse.get_classes(ret)
@@ -1586,11 +1832,9 @@ class MapDataset(Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._map_func = tf_map_func
@@ -1708,47 +1952,13 @@ class FlatMapDataset(Dataset):
     return self._output_types
 
 
-class InterleaveDataset(Dataset):
+class InterleaveDataset(FlatMapDataset):
   """A `Dataset` that maps a function over its input and interleaves the result.
   """
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length):
     """See `Dataset.interleave()` for details."""
-    super(InterleaveDataset, self).__init__()
-    self._input_dataset = input_dataset
-
-    @function.Defun(*nest.flatten(
-        sparse.as_dense_types(input_dataset.output_types,
-                              input_dataset.output_classes)))
-    def tf_map_func(*args):
-      """A wrapper for Defun that facilitates shape inference."""
-      # Pass in shape information from the input_dataset.
-      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
-                                            input_dataset.output_classes)
-      for arg, shape in zip(args, nest.flatten(dense_shapes)):
-        arg.set_shape(shape)
-
-      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-      nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types, input_dataset.output_shapes,
-          input_dataset.output_classes)
-      if _should_unpack_args(nested_args):
-        dataset = map_func(*nested_args)
-      else:
-        dataset = map_func(nested_args)
-
-      if not isinstance(dataset, Dataset):
-        raise TypeError("`map_func` must return a `Dataset` object.")
-
-      self._output_classes = dataset.output_classes
-      self._output_types = dataset.output_types
-      self._output_shapes = dataset.output_shapes
-
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    self._map_func = tf_map_func
-    self._map_func.add_to_graph(ops.get_default_graph())
-
+    super(InterleaveDataset, self).__init__(input_dataset, map_func)
     self._cycle_length = ops.convert_to_tensor(
         cycle_length, dtype=dtypes.int64, name="cycle_length")
     self._block_length = ops.convert_to_tensor(
@@ -1757,27 +1967,15 @@ class InterleaveDataset(Dataset):
   def _as_variant_tensor(self):
     return gen_dataset_ops.interleave_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,
+        self._map_func.captured_inputs,  # pylint: disable=protected-access
         self._cycle_length,
         self._block_length,
-        f=self._map_func,
+        f=self._map_func,  # pylint: disable=protected-access
         output_types=nest.flatten(
             sparse.as_dense_types(self.output_types, self.output_classes)),
         output_shapes=nest.flatten(
             sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
 
 class FilterDataset(Dataset):
   """A `Dataset` that filters its input according to a predicate function."""
@@ -1847,6 +2045,8 @@ class PrefetchDataset(Dataset):
     """See `Dataset.prefetch()` for details."""
     super(PrefetchDataset, self).__init__()
     self._input_dataset = input_dataset
+    if buffer_size is None:
+      buffer_size = -1  # This is the sentinel for auto-tuning.
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
 
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 4756ec74820bace5bea4e1f41ebe214420fe5c3d..0c76afd29d4626be9120c059d60218daab5cc0ac 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -17,14 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
 import warnings
 
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -412,3 +416,151 @@ class Iterator(object):
       of an element of this dataset.
     """
     return self._output_types
+
+
+_uid_counter = 0
+_uid_lock = threading.Lock()
+
+
+def _generate_shared_name(prefix):
+  with _uid_lock:
+    global _uid_counter
+    uid = _uid_counter
+    _uid_counter += 1
+  return "{}{}".format(prefix, uid)
+
+
+class EagerIterator(object):
+  """An iterator producing tf.Tensor objects from a tf.data.Dataset."""
+
+  def __init__(self, dataset):
+    """Creates a new iterator over the given dataset.
+
+    For example:
+    ```python
+    dataset = tf.data.Dataset.range(4)
+    for x in Iterator(dataset):
+      print(x)
+    ```
+
+    Tensors produced will be placed on the device on which this iterator object
+    was created.
+
+    Args:
+      dataset: A `tf.data.Dataset` object.
+
+    Raises:
+      RuntimeError: When invoked without eager execution enabled.
+    """
+
+    if not context.executing_eagerly():
+      raise RuntimeError(
+          "{} objects can only be used when eager execution is enabled, use "
+          "tf.data.Dataset.make_initializable_iterator or "
+          "tf.data.Dataset.make_one_shot_iterator for graph construction".
+          format(type(self)))
+    with ops.device("/device:CPU:0"):
+      ds_variant = dataset._as_variant_tensor()  # pylint: disable=protected-access
+      self._output_classes = dataset.output_classes
+      self._output_types = dataset.output_types
+      self._output_shapes = dataset.output_shapes
+      self._flat_output_types = nest.flatten(
+          sparse.as_dense_types(self._output_types, self._output_classes))
+      self._flat_output_shapes = nest.flatten(
+          sparse.as_dense_shapes(self._output_shapes, self._output_classes))
+      self._resource = gen_dataset_ops.iterator(
+          shared_name="",
+          container=_generate_shared_name("eageriterator"),
+          output_types=self._flat_output_types,
+          output_shapes=self._flat_output_shapes)
+      gen_dataset_ops.make_iterator(ds_variant, self._resource)
+      # Delete the resource when this object is deleted
+      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+          handle=self._resource, handle_device="/device:CPU:0")
+    self._device = context.context().device_name
+
+  def __iter__(self):
+    return self
+
+  def __next__(self):  # For Python 3 compatibility
+    return self.next()
+
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        # TODO(ashankar): Consider removing this ops.device() contextmanager
+        # and instead mimic ops placement in graphs: Operations on resource
+        # handles execute on the same device as where the resource is placed.
+        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
+        # because in eager mode this code will run synchronously on the calling
+        # thread. Therefore we do not need to make a defensive context switch
+        # to a background thread, and can achieve a small constant performance
+        # boost by invoking the iterator synchronously.
+        ret = gen_dataset_ops.iterator_get_next_sync(
+            self._resource,
+            output_types=self._flat_output_types,
+            output_shapes=self._flat_output_shapes)
+
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
+
+  def next(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    try:
+      return self._next_internal()
+    except errors.OutOfRangeError:
+      raise StopIteration
+
+  @property
+  def output_classes(self):
+    """Returns the class of each component of an element of this iterator.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    """Returns the shape of each component of an element of this iterator.
+
+    Returns:
+      A nested structure of `tf.TensorShape` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    """Returns the type of each component of an element of this iterator.
+
+    Returns:
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this dataset.
+    """
+    return self._output_types
+
+  def get_next(self, name=None):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+
+    Args:
+      name: (Optional.) A name for the created operation. Currently unused.
+
+    Returns:
+      A nested structure of `tf.Tensor` objects.
+
+    Raises:
+      `tf.errors.OutOfRangeError`: If the end of the dataset has been reached.
+    """
+    del name
+    return self._next_internal()
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index fa7601741b11f018e9b53ed3b77a7561be50d3f4..fe033f5546498d57dd98289d2cda1a8bbb1c7822 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -17,11 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -31,7 +34,7 @@ _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
 @tf_export("data.TextLineDataset")
-class TextLineDataset(Dataset):
+class TextLineDataset(dataset_ops.Dataset):
   """A `Dataset` comprising lines from one or more text files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -73,8 +76,7 @@ class TextLineDataset(Dataset):
     return dtypes.string
 
 
-@tf_export("data.TFRecordDataset")
-class TFRecordDataset(Dataset):
+class _TFRecordDataset(dataset_ops.Dataset):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -87,7 +89,7 @@ class TFRecordDataset(Dataset):
       buffer_size: (Optional.) A `tf.int64` scalar representing the number of
         bytes in the read buffer. 0 means no buffering.
     """
-    super(TFRecordDataset, self).__init__()
+    super(_TFRecordDataset, self).__init__()
     # Force the type to string even if filenames is an empty list.
     self._filenames = ops.convert_to_tensor(
         filenames, dtypes.string, name="filenames")
@@ -118,8 +120,112 @@ class TFRecordDataset(Dataset):
     return dtypes.string
 
 
+class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
+  """A `Dataset` that maps a function over its input and flattens the result."""
+
+  def __init__(self, input_dataset, map_func, cycle_length, block_length,
+               sloppy, buffer_output_elements, prefetch_input_elements):
+    """See `tf.contrib.data.parallel_interleave()` for details."""
+    super(ParallelInterleaveDataset, self).__init__(input_dataset, map_func,
+                                                    cycle_length, block_length)
+    self._sloppy = ops.convert_to_tensor(
+        sloppy, dtype=dtypes.bool, name="sloppy")
+    self._buffer_output_elements = convert.optional_param_to_tensor(
+        "buffer_output_elements",
+        buffer_output_elements,
+        argument_default=2 * block_length)
+    self._prefetch_input_elements = convert.optional_param_to_tensor(
+        "prefetch_input_elements",
+        prefetch_input_elements,
+        argument_default=2 * cycle_length)
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return gen_dataset_ops.parallel_interleave_dataset(
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.captured_inputs,
+        self._cycle_length,
+        self._block_length,
+        self._sloppy,
+        self._buffer_output_elements,
+        self._prefetch_input_elements,
+        f=self._map_func,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # pylint: enable=protected-access
+
+
+@tf_export("data.TFRecordDataset")
+class TFRecordDataset(dataset_ops.Dataset):
+  """A `Dataset` comprising records from one or more TFRecord files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None,
+               num_parallel_reads=None):
+    """Creates a `TFRecordDataset` to read for one or more TFRecord files.
+
+    NOTE: The `num_parallel_reads` argument can be used to improve performance
+    when reading from a remote filesystem.
+
+    Args:
+      filenames: A `tf.string` tensor or `tf.data.Dataset` containing one or
+        more filenames.
+      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
+        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
+      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
+        bytes in the read buffer. 0 means no buffering.
+      num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
+        number of files to read in parallel. Defaults to reading files
+        sequentially.
+
+    Raises:
+      TypeError: If any argument does not have the expected type.
+      ValueError: If any argument does not have the expected shape.
+    """
+    super(TFRecordDataset, self).__init__()
+    if isinstance(filenames, dataset_ops.Dataset):
+      if filenames.output_types != dtypes.string:
+        raise TypeError(
+            "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
+      if not filenames.output_shapes.is_compatible_with(tensor_shape.scalar()):
+        raise ValueError(
+            "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
+            "elements.")
+    else:
+      filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
+      filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
+      filenames = dataset_ops.Dataset.from_tensor_slices(filenames)
+
+    def read_one_file(filename):
+      return _TFRecordDataset(filename, compression_type, buffer_size)
+
+    if num_parallel_reads is None:
+      self._impl = filenames.flat_map(read_one_file)
+    else:
+      self._impl = ParallelInterleaveDataset(
+          filenames, read_one_file, cycle_length=num_parallel_reads,
+          block_length=1, sloppy=False, buffer_output_elements=None,
+          prefetch_input_elements=None)
+
+  def _as_variant_tensor(self):
+    return self._impl._as_variant_tensor()  # pylint: disable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._impl.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._impl.output_shapes
+
+  @property
+  def output_types(self):
+    return self._impl.output_types
+
+
 @tf_export("data.FixedLengthRecordDataset")
-class FixedLengthRecordDataset(Dataset):
+class FixedLengthRecordDataset(dataset_ops.Dataset):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
   def __init__(self,
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index e32c7b54a48dd887c2748897c3ce3661aab9f497..0fc32d51b9fe581a54519139f3bf12118f8f4028 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -86,14 +86,26 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+py_library(
+    name = "random_seed",
+    srcs = ["random_seed.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+    ],
+)
+
+py_test(
+    name = "random_seed_test",
+    size = "small",
+    srcs = ["random_seed_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_seed",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:util",
+    ],
 )
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index e90ce3fb40af68fb68d6ee8bac6892848d8c5a79..eff6e02c1484b2ab2e67fd8c0e7ba7c027c0b571 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -44,7 +44,6 @@ import collections as _collections
 import six as _six
 
 from tensorflow.python.framework import sparse_tensor as _sparse_tensor
-from tensorflow.python.util.all_util import remove_undocumented
 
 
 def _sorted(dict_):
@@ -538,16 +537,3 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
 
-
-_allowed_symbols = [
-    "assert_same_structure",
-    "is_sequence",
-    "flatten",
-    "pack_sequence_as",
-    "map_structure",
-    "assert_shallow_structure",
-    "flatten_up_to",
-    "map_structure_up_to",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2c9d8672f94587fd3164f25f97b44a97526be07
--- /dev/null
+++ b/tensorflow/python/data/util/random_seed.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for generating Tensor-valued random seeds."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def get_seed(seed):
+  """Returns the local seeds an operation should use given an op-specific seed.
+
+  See @{tf.get_seed} for more details. This wrapper adds support for the case
+  where `seed` may be a tensor.
+
+  Args:
+    seed: An integer or a @{tf.int64} scalar tensor.
+
+  Returns:
+    A tuple of two @{tf.int64} scalar tensors that should be used for the local
+    seed of the calling dataset.
+  """
+  seed, seed2 = random_seed.get_seed(seed)
+  if seed is None:
+    seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+  else:
+    seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+  if seed2 is None:
+    seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+  else:
+    with ops.name_scope("seed2") as scope:
+      seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64)
+      seed2 = array_ops.where(
+          math_ops.logical_and(
+              math_ops.equal(seed, 0), math_ops.equal(seed2, 0)),
+          constant_op.constant(2**31 - 1, dtype=dtypes.int64),
+          seed2,
+          name=scope)
+  return seed, seed2
diff --git a/tensorflow/python/data/util/random_seed_test.py b/tensorflow/python/data/util/random_seed_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..33227e82afe6fe1c748693d107d4e9844abb8e09
--- /dev/null
+++ b/tensorflow/python/data/util/random_seed_test.py
@@ -0,0 +1,83 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utilities working with arbitrarily nested structures."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import random_seed as data_random_seed
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class RandomSeedTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRandomSeed(self):
+    zero_t = constant_op.constant(0, dtype=dtypes.int64, name='zero')
+    one_t = constant_op.constant(1, dtype=dtypes.int64, name='one')
+    intmax_t = constant_op.constant(
+        2**31 - 1, dtype=dtypes.int64, name='intmax')
+    test_cases = [
+        # Each test case is a tuple with input to get_seed:
+        # (input_graph_seed, input_op_seed)
+        # and output from get_seed:
+        # (output_graph_seed, output_op_seed)
+        ((None, None), (0, 0)),
+        ((None, 1), (random_seed.DEFAULT_GRAPH_SEED, 1)),
+        ((1, 1), (1, 1)),
+        ((0, 0), (0, 2**31 - 1)),  # Avoid nondeterministic (0, 0) output
+        ((2**31 - 1, 0), (0, 2**31 - 1)),  # Don't wrap to (0, 0) either
+        ((0, 2**31 - 1), (0, 2**31 - 1)),  # Wrapping for the other argument
+        # Once more, with tensor-valued arguments
+        ((None, one_t), (random_seed.DEFAULT_GRAPH_SEED, 1)),
+        ((1, one_t), (1, 1)),
+        ((0, zero_t), (0, 2**31 - 1)),  # Avoid nondeterministic (0, 0) output
+        ((2**31 - 1, zero_t), (0, 2**31 - 1)),  # Don't wrap to (0, 0) either
+        ((0, intmax_t), (0, 2**31 - 1)),  # Wrapping for the other argument
+    ]
+    for tc in test_cases:
+      tinput, toutput = tc[0], tc[1]
+      random_seed.set_random_seed(tinput[0])
+      g_seed, op_seed = data_random_seed.get_seed(tinput[1])
+      g_seed = self.evaluate(g_seed)
+      op_seed = self.evaluate(op_seed)
+      msg = 'test_case = {0}, got {1}, want {2}'.format(
+          tinput, (g_seed, op_seed), toutput)
+      self.assertEqual((g_seed, op_seed), toutput, msg=msg)
+      random_seed.set_random_seed(None)
+
+    if not context.executing_eagerly():
+      random_seed.set_random_seed(1)
+      tinput = (1, None)
+      toutput = (1, ops.get_default_graph()._last_id)  # pylint: disable=protected-access
+      random_seed.set_random_seed(tinput[0])
+      g_seed, op_seed = data_random_seed.get_seed(tinput[1])
+      g_seed = self.evaluate(g_seed)
+      op_seed = self.evaluate(op_seed)
+      msg = 'test_case = {0}, got {1}, want {2}'.format(1, (g_seed, op_seed),
+                                                        toutput)
+      self.assertEqual((g_seed, op_seed), toutput, msg=msg)
+      random_seed.set_random_seed(None)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index f0e90f67772d114142ccc218ed9f42b723a1b556..b5760df1ed47be9a25b8ece3c7850984b6eac1d3 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -913,6 +913,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -920,6 +921,7 @@ py_test(
     size = "small",
     srcs = ["cli/profile_analyzer_cli_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":debugger_cli_common",
         ":profile_analyzer_cli",
@@ -967,7 +969,6 @@ cuda_py_test(
         ":grpc_wrapper",
         ":hooks",
         ":session_debug_testlib",
-        "//third_party/py/numpy",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -983,6 +984,30 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "grpc_large_data_test",
+    size = "medium",
+    srcs = ["lib/grpc_large_data_test.py"],
+    additional_deps = [
+        ":dumping_wrapper",
+        ":grpc_debug_test_server",
+        ":grpc_wrapper",
+        ":session_debug_testlib",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+    tags = [
+        "no_oss",  # Test flaky due to port collisions.
+        "no_windows",
+        "noasan",  # Times out due to size of test (b/73731462).
+        "oss_serial",
+    ],
+)
+
 # TODO(cais): Run the test in OSS, perhaps through a sh_test.
 cuda_py_test(
     name = "dist_session_debug_grpc_test",
@@ -1073,15 +1098,3 @@ sh_test(
         ":offline_analyzer",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/debug/README.md b/tensorflow/python/debug/README.md
index a2273b050bb1ecd5a35938c3de57fb8562f1d26d..269bbb19bdb898d1d81d0b9c618a284a437e68b9 100644
--- a/tensorflow/python/debug/README.md
+++ b/tensorflow/python/debug/README.md
@@ -37,12 +37,18 @@ models:
 * Association of nodes and tensors in graphs with Python source lines
 * Profiling of models at the level of graph nodes and Python source lines.
 (Omitted internal-only feature)
+* A [gRPC](https://grpc.io/)-based remote debugging protocol, which allows us to
+  build a browser-based graphical user interface (GUI) for TFDBG: the
+  [TensorBoard Debugger Plugin](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
 
 ## How to use TFDBG?
 
 * For a walkthrough of TFDBG command-line interface, see https://www.tensorflow.org/programmers_guide/debugger.
+* For information on the web GUI of TFDBG (TensorBoard Debugger Plugin), see
+  [this README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
 * For programmatic use of the API of TFDBG, see https://www.tensorflow.org/api_docs/python/tfdbg.
 
+
 ## Related Publications
 
 * Cai, S., Breck E., Nielsen E., Salib M., Sculley D. (2016) TensorFlow Debugger:
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index 156afdfd4c44f2f1a07ffdd1e68ad48bbbe31cba..9a47cd12b47b35d0a85cfc1a211fdfee7cfa25bc 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -185,6 +185,15 @@ class DebugAnalyzer(object):
         type=str,
         default="",
         help="List only Tensors passing the filter of the specified name")
+    ap.add_argument(
+        "-fenn",
+        "--filter_exclude_node_names",
+        dest="filter_exclude_node_names",
+        type=str,
+        default="",
+        help="When applying the tensor filter, exclude node with names "
+        "matching the regular expression. Applicable only if --tensor_filter "
+        "or -f is used.")
     ap.add_argument(
         "-n",
         "--node_name_filter",
@@ -484,6 +493,10 @@ class DebugAnalyzer(object):
 
     Returns:
       Output text lines as a RichTextLines object.
+
+    Raises:
+      ValueError: If `--filter_exclude_node_names` is used without `-f` or
+        `--tensor_filter` being used.
     """
 
     # TODO(cais): Add annotations of substrings for dumped tensor names, to
@@ -520,8 +533,15 @@ class DebugAnalyzer(object):
         _add_main_menu(output, node_name=None, enable_list_tensors=False)
         return output
 
-      data_to_show = self._debug_dump.find(filter_callable)
+      data_to_show = self._debug_dump.find(
+          filter_callable,
+          exclude_node_names=parsed.filter_exclude_node_names)
     else:
+      if parsed.filter_exclude_node_names:
+        raise ValueError(
+            "The flag --filter_exclude_node_names is valid only when "
+            "the flag -f or --tensor_filter is used.")
+
       data_to_show = self._debug_dump.dumped_tensor_data
 
     # TODO(cais): Implement filter by lambda on tensor value.
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index 6b110fda9eba301f298e84b63d091bb300549bee..55231954d1c8ea987bbf87755dfde83d5efd03f0 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -820,6 +820,32 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         op_type_regex="(Add|MatMul)")
     check_main_menu(self, out, list_tensors_enabled=False)
 
+  def testListTensorWithFilterAndNodeNameExclusionWorks(self):
+    # First, create and register the filter.
+    def is_2x1_vector(datum, tensor):
+      del datum  # Unused.
+      return list(tensor.shape) == [2, 1]
+    self._analyzer.add_tensor_filter("is_2x1_vector", is_2x1_vector)
+
+    # Use shorthand alias for the command prefix.
+    out = self._registry.dispatch_command(
+        "lt", ["-f", "is_2x1_vector", "--filter_exclude_node_names", ".*v.*"])
+
+    # If the --filter_exclude_node_names were not used, then the matching
+    # tensors would be:
+    #   - simple_mul_add/v:0
+    #   - simple_mul_add/v/read:0
+    #   - simple_mul_add/matmul:0
+    #   - simple_mul_add/add:0
+    #
+    # With the --filter_exclude_node_names option, only the last two should
+    # show up in the result.
+    assert_listed_tensors(
+        self,
+        out, ["simple_mul_add/matmul:0", "simple_mul_add/add:0"],
+        ["MatMul", "Add"], tensor_filter_name="is_2x1_vector")
+    check_main_menu(self, out, list_tensors_enabled=False)
+
   def testListTensorsFilterNanOrInf(self):
     """Test register and invoke a tensor filter."""
 
diff --git a/tensorflow/python/debug/cli/curses_ui.py b/tensorflow/python/debug/cli/curses_ui.py
index bb52f9051250625836b0d7a0f8e30265d9b34e92..f66cefb427c9ccfa0769655415193e8d2535e53c 100644
--- a/tensorflow/python/debug/cli/curses_ui.py
+++ b/tensorflow/python/debug/cli/curses_ui.py
@@ -1185,6 +1185,22 @@ class CursesUI(base_ui.BaseUI):
       self._main_menu = None
       self._main_menu_pad = None
 
+  def _pad_line_end_with_whitespace(self, pad, row, line_end_x):
+    """Pad the whitespace at the end of a line with the default color pair.
+
+    Prevents spurious color pairs from appearing at the end of the lines in
+    certain text terimnals.
+
+    Args:
+      pad: The curses pad object to operate on.
+      row: (`int`) row index.
+      line_end_x: (`int`) column index of the end of the line (beginning of
+        the whitespace).
+    """
+    if line_end_x < self._max_x - 2:
+      pad.addstr(row, line_end_x, " " * (self._max_x - 3 - line_end_x),
+                 self._default_color_pair)
+
   def _screen_add_line_to_output_pad(self, pad, row, txt, color_segments=None):
     """Render a line in a text pad.
 
@@ -1208,6 +1224,7 @@ class CursesUI(base_ui.BaseUI):
 
     if not color_segments:
       pad.addstr(row, 0, txt, self._default_color_pair)
+      self._pad_line_end_with_whitespace(pad, row, len(txt))
       return
 
     if not isinstance(color_segments, list):
@@ -1248,6 +1265,8 @@ class CursesUI(base_ui.BaseUI):
     for segment, color_pair in zip(all_segments, all_color_pairs):
       if segment[1] < self._max_x:
         pad.addstr(row, segment[0], txt[segment[0]:segment[1]], color_pair)
+    if all_segments:
+      self._pad_line_end_with_whitespace(pad, row, all_segments[-1][1])
 
   def _screen_scroll_output_pad(self, pad, viewport_top, viewport_left,
                                 screen_location_top, screen_location_left,
diff --git a/tensorflow/python/debug/cli/readline_ui.py b/tensorflow/python/debug/cli/readline_ui.py
index 151638789f7eb509a39a6f066e9ad9a0c92c1fe3..3296e45d07e9f385fdcda3f34804547b049dcf02 100644
--- a/tensorflow/python/debug/cli/readline_ui.py
+++ b/tensorflow/python/debug/cli/readline_ui.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import readline
 
+import six
+
 from tensorflow.python.debug.cli import base_ui
 from tensorflow.python.debug.cli import debugger_cli_common
 
@@ -39,11 +41,7 @@ class ReadlineUI(base_ui.BaseUI):
     readline.set_completer(self._readline_complete)
     readline.parse_and_bind("tab: complete")
 
-    # For Python 2-3 compatibility.
-    try:
-      self._input = raw_input
-    except NameError:
-      self._input = input
+    self._input = six.moves.input
 
   def _readline_complete(self, text, state):
     context, prefix, except_last_word = self._analyze_tab_complete_input(text)
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 8d355aa27f6fa10a1889420a9087800be12a81ce..8a65ad087b3002d8ad93f3a64f48715d26ff62d8 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -23,6 +23,7 @@ import glob
 import json
 import os
 import platform
+import re
 
 import numpy as np
 import six
@@ -1411,7 +1412,11 @@ class DebugDumpDir(object):
 
     return self._watch_key_to_datum[device_name].get(debug_watch_key, [])
 
-  def find(self, predicate, first_n=0, device_name=None):
+  def find(self,
+           predicate,
+           first_n=0,
+           device_name=None,
+           exclude_node_names=None):
     """Find dumped tensor data by a certain predicate.
 
     Args:
@@ -1430,17 +1435,24 @@ class DebugDumpDir(object):
         time order) for which the predicate returns True. To return all the
         `DebugTensotDatum` instances, let first_n be <= 0.
       device_name: optional device name.
+      exclude_node_names: Optional regular expression to exclude nodes with
+        names matching the regular expression.
 
     Returns:
       A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object
        for which predicate returns True, sorted in ascending order of the
        timestamp.
     """
+    if exclude_node_names:
+      exclude_node_names = re.compile(exclude_node_names)
 
     matched_data = []
     for device in (self._dump_tensor_data if device_name is None
                    else (self._dump_tensor_data[device_name],)):
       for datum in self._dump_tensor_data[device]:
+        if exclude_node_names and exclude_node_names.match(datum.node_name):
+          continue
+
         if predicate(datum, datum.get_tensor()):
           matched_data.append(datum)
 
diff --git a/tensorflow/python/debug/lib/debug_gradients.py b/tensorflow/python/debug/lib/debug_gradients.py
index 16f51a4b32f711b97077643cec669bb8970e0b21..589a13db7f798aef3bb82dfbd442deabfbcf2a41 100644
--- a/tensorflow/python/debug/lib/debug_gradients.py
+++ b/tensorflow/python/debug/lib/debug_gradients.py
@@ -156,11 +156,12 @@ class GradientsDebugger(object):
     # TODO(cais): Implement value_stack.
     grad_debug_op_name = _tensor_to_grad_debug_op_name(input_tensor, self._uuid)
     # pylint: disable=protected-access
-    identity_op = (gen_array_ops._debug_gradient_ref_identity
-                   if input_tensor.dtype._is_ref_dtype
-                   else gen_array_ops._debug_gradient_identity)
-    debug_grad_identity = identity_op(input_tensor, name=grad_debug_op_name)
+    identity_op = (
+        gen_array_ops.debug_gradient_ref_identity
+        if input_tensor.dtype._is_ref_dtype else
+        gen_array_ops.debug_gradient_identity)
     # pylint: enable=protected-access
+    debug_grad_identity = identity_op(input_tensor, name=grad_debug_op_name)
     assert debug_grad_identity.dtype == input_tensor.dtype
     if debug_grad_identity.op.name != grad_debug_op_name:
       raise ValueError(
diff --git a/tensorflow/python/debug/lib/grpc_large_data_test.py b/tensorflow/python/debug/lib/grpc_large_data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bc477a9baeb7116530fc9122b926458c1a6c08e
--- /dev/null
+++ b/tensorflow/python/debug/lib/grpc_large_data_test.py
@@ -0,0 +1,210 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sending large-size data through tfdbg grpc channels.
+
+"Large-size data" includes large GraphDef protos and large Tensor protos.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.debug.lib import grpc_debug_test_server
+from tensorflow.python.debug.lib import session_debug_testlib
+from tensorflow.python.debug.wrappers import framework
+from tensorflow.python.debug.wrappers import grpc_wrapper
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
+     cls.debug_server
+    ) = grpc_debug_test_server.start_server_on_separate_thread(
+        dump_to_filesystem=False)
+    tf_logging.info("debug server url: %s", cls.debug_server_url)
+
+  @classmethod
+  def tearDownClass(cls):
+    cls.debug_server.stop_server().wait()
+    cls.debug_server_thread.join()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    self.debug_server.clear_data()
+
+  def testSendingLargeGraphDefsWorks(self):
+    with self.test_session(
+        use_gpu=True,
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
+      u = variables.Variable(42.0, name="original_u")
+      for _ in xrange(50 * 1000):
+        u = array_ops.identity(u)
+      sess.run(variables.global_variables_initializer())
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"original_u")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      self.assertAllClose(42.0, sess.run(u))
+
+      self.assertAllClose(
+          [42.0],
+          self.debug_server.debug_tensor_values["original_u:0:DebugIdentity"])
+      self.assertEqual(2 if test.is_gpu_available() else 1,
+                       len(self.debug_server.partition_graph_defs))
+      max_graph_def_size = max([
+          len(graph_def.SerializeToString())
+          for graph_def in self.debug_server.partition_graph_defs])
+      self.assertGreater(max_graph_def_size, 4 * 1024 * 1024)
+
+  def testSendingLargeFloatTensorWorks(self):
+    with self.test_session(
+        use_gpu=True,
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
+      u_init_val_array = list(xrange(1200 * 1024))
+      # Size: 4 * 1200 * 1024 = 4800k > 4M
+
+      u_init = constant_op.constant(
+          u_init_val_array, dtype=dtypes.float32, name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds  # Unused by this watch_fn.
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      self.assertAllEqual(
+          u_init_val_array,
+          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
+
+  def testSendingStringTensorWithAlmostTooLargeStringsWorks(self):
+    with self.test_session(
+        use_gpu=True,
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
+      u_init_val = [
+          b"", b"spam", b"A" * 2500 * 1024, b"B" * 2500 * 1024, b"egg", b""]
+      u_init = constant_op.constant(
+          u_init_val, dtype=dtypes.string, name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      self.assertAllEqual(
+          u_init_val,
+          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
+
+  def testSendingLargeStringTensorWorks(self):
+    with self.test_session(
+        use_gpu=True,
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
+      strs_total_size_threshold = 5000 * 1024
+      cum_size = 0
+      u_init_val_array = []
+      while cum_size < strs_total_size_threshold:
+        strlen = np.random.randint(200)
+        u_init_val_array.append(b"A" * strlen)
+        cum_size += strlen
+
+      u_init = constant_op.constant(
+          u_init_val_array, dtype=dtypes.string, name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      self.assertAllEqual(
+          u_init_val_array,
+          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
+
+  def testSendingEmptyFloatTensorWorks(self):
+    with self.test_session(
+        use_gpu=True,
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
+      u_init = constant_op.constant(
+          [], dtype=dtypes.float32, shape=[0], name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      u_init_value = self.debug_server.debug_tensor_values[
+          "u_init:0:DebugIdentity"][0]
+      self.assertEqual(np.float32, u_init_value.dtype)
+      self.assertEqual(0, len(u_init_value))
+
+  def testSendingEmptyStringTensorWorks(self):
+    with self.test_session(
+        use_gpu=True,
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
+      u_init = constant_op.constant(
+          [], dtype=dtypes.string, shape=[0], name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      u_init_value = self.debug_server.debug_tensor_values[
+          "u_init:0:DebugIdentity"][0]
+      self.assertEqual(np.object, u_init_value.dtype)
+      self.assertEqual(0, len(u_init_value))
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index 1a6bedbbcbf94eb95e49d43e2d03c85b53bebb7b..ba0f15b4e2ff23295eae764088144a3d1b533f01 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -22,7 +22,6 @@ import shutil
 import tempfile
 
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_utils
@@ -36,13 +35,6 @@ from tensorflow.python.platform import googletest
 
 class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
 
-  def _no_rewrite_session_config(self):
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
-    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
-    return config_pb2.ConfigProto(graph_options=graph_options)
-
   def _debug_urls(self, run_number=None):
     return ["file://%s" % self._debug_dump_dir(run_number=run_number)]
 
@@ -55,7 +47,8 @@ class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
   def testAllowsDifferentWatchesOnDifferentRuns(self):
     """Test watching different tensors on different runs of the same graph."""
 
-    with session.Session(config=self._no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init_val = [[5.0, 3.0], [-1.0, 0.0]]
       v_init_val = [[2.0], [-1.0]]
 
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index b623ee31c5dc59894373ec7952e53acd0f6e1126..ff49b6954776264ccb2eceeceab7da5a881081f0 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -24,11 +24,9 @@ from __future__ import print_function
 import os
 import shutil
 
-import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_utils
@@ -38,28 +36,15 @@ from tensorflow.python.debug.wrappers import framework
 from tensorflow.python.debug.wrappers import grpc_wrapper
 from tensorflow.python.debug.wrappers import hooks
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import monitored_session
 
 
-def no_rewrite_session_config():
-  rewriter_config = rewriter_config_pb2.RewriterConfig(
-      disable_model_pruning=True,
-      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
-  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
-  return config_pb2.ConfigProto(graph_options=graph_options)
-
-
 class GrpcDebugServerTest(test_util.TensorFlowTestCase):
 
   def testRepeatedRunServerRaisesException(self):
@@ -142,19 +127,22 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
       return os.path.join(self._dump_root, "run_%d" % run_number)
 
   def testConstructGrpcDebugWrapperSessionWithInvalidTypeRaisesException(self):
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     with self.assertRaisesRegexp(
         TypeError, "Expected type str or list in grpc_debug_server_addresses"):
       grpc_wrapper.GrpcDebugWrapperSession(sess, 1337)
 
   def testConstructGrpcDebugWrapperSessionWithInvalidTypeRaisesException2(self):
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     with self.assertRaisesRegexp(
         TypeError, "Expected type str in list grpc_debug_server_addresses"):
       grpc_wrapper.GrpcDebugWrapperSession(sess, ["localhost:1337", 1338])
 
   def testUseInvalidWatchFnTypeWithGrpcDebugWrapperSessionRaisesException(self):
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     with self.assertRaises(TypeError):
       grpc_wrapper.GrpcDebugWrapperSession(
           sess, "localhost:%d" % self._server_port, watch_fn="foo")
@@ -164,7 +152,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     v = variables.Variable(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     sess.run(u.initializer)
     sess.run(v.initializer)
 
@@ -190,7 +179,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     v = variables.Variable(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     sess.run(u.initializer)
     sess.run(v.initializer)
 
@@ -223,7 +213,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     v = variables.Variable(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     sess.run(u.initializer)
     sess.run(v.initializer)
 
@@ -254,7 +245,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     v = variables.Variable(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     sess.run(u.initializer)
     sess.run(v.initializer)
 
@@ -298,7 +290,8 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     v = variables.Variable(20.0, name="v")
     w = math_ops.multiply(u, v, name="w")
 
-    sess = session.Session(config=no_rewrite_session_config())
+    sess = session.Session(
+        config=session_debug_testlib.no_rewrite_session_config())
     sess.run(variables.global_variables_initializer())
 
     grpc_debug_hook = hooks.TensorBoardDebugHook(
@@ -324,168 +317,6 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     hooks.GrpcDebugHook(["foo:42424"])
 
 
-class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
-
-  @classmethod
-  def setUpClass(cls):
-    (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
-     cls.debug_server
-    ) = grpc_debug_test_server.start_server_on_separate_thread(
-        dump_to_filesystem=False)
-    tf_logging.info("debug server url: %s", cls.debug_server_url)
-
-  @classmethod
-  def tearDownClass(cls):
-    cls.debug_server.stop_server().wait()
-    cls.debug_server_thread.join()
-
-  def tearDown(self):
-    ops.reset_default_graph()
-    self.debug_server.clear_data()
-
-  def testSendingLargeGraphDefsWorks(self):
-    with self.test_session(
-        use_gpu=True, config=no_rewrite_session_config()) as sess:
-      u = variables.Variable(42.0, name="original_u")
-      for _ in xrange(50 * 1000):
-        u = array_ops.identity(u)
-      sess.run(variables.global_variables_initializer())
-
-      def watch_fn(fetches, feeds):
-        del fetches, feeds
-        return framework.WatchOptions(
-            debug_ops=["DebugIdentity"],
-            node_name_regex_whitelist=r"original_u")
-      sess = grpc_wrapper.GrpcDebugWrapperSession(
-          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
-      self.assertAllClose(42.0, sess.run(u))
-
-      self.assertAllClose(
-          [42.0],
-          self.debug_server.debug_tensor_values["original_u:0:DebugIdentity"])
-      self.assertEqual(2 if test.is_gpu_available() else 1,
-                       len(self.debug_server.partition_graph_defs))
-      max_graph_def_size = max([
-          len(graph_def.SerializeToString())
-          for graph_def in self.debug_server.partition_graph_defs])
-      self.assertGreater(max_graph_def_size, 4 * 1024 * 1024)
-
-  def testSendingLargeFloatTensorWorks(self):
-    with self.test_session(
-        use_gpu=True, config=no_rewrite_session_config()) as sess:
-      u_init_val_array = list(xrange(1200 * 1024))
-      # Size: 4 * 1200 * 1024 = 4800k > 4M
-
-      u_init = constant_op.constant(
-          u_init_val_array, dtype=dtypes.float32, name="u_init")
-      u = variables.Variable(u_init, name="u")
-
-      def watch_fn(fetches, feeds):
-        del fetches, feeds  # Unused by this watch_fn.
-        return framework.WatchOptions(
-            debug_ops=["DebugIdentity"],
-            node_name_regex_whitelist=r"u_init")
-      sess = grpc_wrapper.GrpcDebugWrapperSession(
-          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
-      sess.run(u.initializer)
-
-      self.assertAllEqual(
-          u_init_val_array,
-          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
-
-  def testSendingStringTensorWithAlmostTooLargeStringsWorks(self):
-    with self.test_session(
-        use_gpu=True, config=no_rewrite_session_config()) as sess:
-      u_init_val = [
-          b"", b"spam", b"A" * 2500 * 1024, b"B" * 2500 * 1024, b"egg", b""]
-      u_init = constant_op.constant(
-          u_init_val, dtype=dtypes.string, name="u_init")
-      u = variables.Variable(u_init, name="u")
-
-      def watch_fn(fetches, feeds):
-        del fetches, feeds
-        return framework.WatchOptions(
-            debug_ops=["DebugIdentity"],
-            node_name_regex_whitelist=r"u_init")
-      sess = grpc_wrapper.GrpcDebugWrapperSession(
-          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
-      sess.run(u.initializer)
-
-      self.assertAllEqual(
-          u_init_val,
-          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
-
-  def testSendingLargeStringTensorWorks(self):
-    with self.test_session(
-        use_gpu=True, config=no_rewrite_session_config()) as sess:
-      strs_total_size_threshold = 5000 * 1024
-      cum_size = 0
-      u_init_val_array = []
-      while cum_size < strs_total_size_threshold:
-        strlen = np.random.randint(200)
-        u_init_val_array.append(b"A" * strlen)
-        cum_size += strlen
-
-      u_init = constant_op.constant(
-          u_init_val_array, dtype=dtypes.string, name="u_init")
-      u = variables.Variable(u_init, name="u")
-
-      def watch_fn(fetches, feeds):
-        del fetches, feeds
-        return framework.WatchOptions(
-            debug_ops=["DebugIdentity"],
-            node_name_regex_whitelist=r"u_init")
-      sess = grpc_wrapper.GrpcDebugWrapperSession(
-          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
-      sess.run(u.initializer)
-
-      self.assertAllEqual(
-          u_init_val_array,
-          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
-
-  def testSendingEmptyFloatTensorWorks(self):
-    with self.test_session(
-        use_gpu=True, config=no_rewrite_session_config()) as sess:
-      u_init = constant_op.constant(
-          [], dtype=dtypes.float32, shape=[0], name="u_init")
-      u = variables.Variable(u_init, name="u")
-
-      def watch_fn(fetches, feeds):
-        del fetches, feeds
-        return framework.WatchOptions(
-            debug_ops=["DebugIdentity"],
-            node_name_regex_whitelist=r"u_init")
-      sess = grpc_wrapper.GrpcDebugWrapperSession(
-          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
-      sess.run(u.initializer)
-
-      u_init_value = self.debug_server.debug_tensor_values[
-          "u_init:0:DebugIdentity"][0]
-      self.assertEqual(np.float32, u_init_value.dtype)
-      self.assertEqual(0, len(u_init_value))
-
-  def testSendingEmptyStringTensorWorks(self):
-    with self.test_session(
-        use_gpu=True, config=no_rewrite_session_config()) as sess:
-      u_init = constant_op.constant(
-          [], dtype=dtypes.string, shape=[0], name="u_init")
-      u = variables.Variable(u_init, name="u")
-
-      def watch_fn(fetches, feeds):
-        del fetches, feeds
-        return framework.WatchOptions(
-            debug_ops=["DebugIdentity"],
-            node_name_regex_whitelist=r"u_init")
-      sess = grpc_wrapper.GrpcDebugWrapperSession(
-          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
-      sess.run(u.initializer)
-
-      u_init_value = self.debug_server.debug_tensor_values[
-          "u_init:0:DebugIdentity"][0]
-      self.assertEqual(np.object, u_init_value.dtype)
-      self.assertEqual(0, len(u_init_value))
-
-
 class SessionDebugConcurrentTest(
     session_debug_testlib.DebugConcurrentRunCallsTest):
 
@@ -548,7 +379,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
     self._server_2.clear_data()
 
   def testToggleEnableTwoDebugWatchesNoCrosstalkBetweenDebugNodes(self):
-    with session.Session(config=no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       v_1 = variables.Variable(50.0, name="v_1")
       v_2 = variables.Variable(-50.0, name="v_1")
       delta_1 = constant_op.constant(5.0, name="delta_1")
@@ -617,7 +449,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
                                         ("toggled_2", 0, "DebugIdentity")])
     self._servers_and_threads.append((server, server_thread))
 
-    with session.Session(config=no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       v_1 = variables.Variable(50.0, name="v_1")
       v_2 = variables.Variable(-50.0, name="v_1")
       # These two nodes have names that match those in the
@@ -656,7 +489,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
           self.assertEqual(0, len(server.debug_tensor_values))
 
   def testToggleEnableTwoDebugWatchesNoCrosstalkBetweenServers(self):
-    with session.Session(config=no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       v = variables.Variable(50.0, name="v")
       delta = constant_op.constant(5.0, name="delta")
       inc_v = state_ops.assign_add(v, delta, name="inc_v")
@@ -698,7 +532,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
           self.assertEqual(0, len(self._server_2.debug_tensor_values))
 
   def testToggleBreakpointsWorks(self):
-    with session.Session(config=no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       v_1 = variables.Variable(50.0, name="v_1")
       v_2 = variables.Variable(-50.0, name="v_2")
       delta_1 = constant_op.constant(5.0, name="delta_1")
@@ -755,7 +590,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
           self.assertSetEqual(set(), self._server_1.breakpoints)
 
   def testTensorBoardDebuggerWrapperToggleBreakpointsWorks(self):
-    with session.Session(config=no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       v_1 = variables.Variable(50.0, name="v_1")
       v_2 = variables.Variable(-50.0, name="v_2")
       delta_1 = constant_op.constant(5.0, name="delta_1")
@@ -827,7 +663,8 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
             self._server_1.query_source_file_line(__file__, 1)
 
   def testTensorBoardDebuggerWrapperDisablingTracebackSourceSendingWorks(self):
-    with session.Session(config=no_rewrite_session_config()) as sess:
+    with session.Session(
+        config=session_debug_testlib.no_rewrite_session_config()) as sess:
       v_1 = variables.Variable(50.0, name="v_1")
       v_2 = variables.Variable(-50.0, name="v_2")
       delta_1 = constant_op.constant(5.0, name="delta_1")
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index f4fac1401918ccacd38aae5ad2ef8d686c9204b9..070d9c4cd7094c81b18192e75885ae6dd6729cbf 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -669,6 +669,55 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertEqual(1, len(first_bad_datum))
       self.assertEqual(x_name, first_bad_datum[0].node_name)
 
+  def testFindInfOrNanWithOpNameExclusion(self):
+    with session.Session() as sess:
+      u_name = "testFindInfOrNanWithOpNameExclusion/u"
+      v_name = "testFindInfOrNanWithOpNameExclusion/v"
+      w_name = "testFindInfOrNanWithOpNameExclusion/w"
+      x_name = "testFindInfOrNanWithOpNameExclusion/x"
+      y_name = "testFindInfOrNanWithOpNameExclusion/y"
+      z_name = "testFindInfOrNanWithOpNameExclusion/z"
+
+      u_init = constant_op.constant([2.0, 4.0])
+      u = variables.Variable(u_init, name=u_name)
+      v_init = constant_op.constant([2.0, 1.0])
+      v = variables.Variable(v_init, name=v_name)
+
+      # Expected output: [0.0, 3.0]
+      w = math_ops.subtract(u, v, name=w_name)
+
+      # Expected output: [inf, 1.3333]
+      x = math_ops.div(u, w, name=x_name)
+
+      # Expected output: [nan, 4.0]
+      y = math_ops.multiply(w, x, name=y_name)
+
+      z = math_ops.multiply(y, y, name=z_name)
+
+      u.initializer.run()
+      v.initializer.run()
+
+      _, dump = self._debug_run_and_get_dump(
+          sess, z,
+          expected_partition_graph_count=self._expected_partition_graph_count)
+
+      # Find all "offending tensors".
+      bad_data = dump.find(debug_data.has_inf_or_nan,
+                           exclude_node_names=".*/x$")
+
+      # Verify that the nodes with bad values are caught through running find
+      # on the debug dump.
+      self.assertEqual(2, len(bad_data))
+      # Assert that the node `x` should have been excluded.
+      self.assertEqual(y_name, bad_data[0].node_name)
+      self.assertEqual(z_name, bad_data[1].node_name)
+
+      first_bad_datum = dump.find(
+          debug_data.has_inf_or_nan, first_n=1, exclude_node_names=".*/x$")
+
+      self.assertEqual(1, len(first_bad_datum))
+      self.assertEqual(y_name, first_bad_datum[0].node_name)
+
   def _session_run_for_graph_structure_lookup(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
       u_name = "testDumpGraphStructureLookup/u"
diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index fb9494f57636e46e54ef230cf4803dbb6ccad0c7..1f9c8fa5a96b4d6826fae0870608e0e737c7cd88 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -21,6 +21,8 @@ import signal
 import sys
 import traceback
 
+import six
+
 # Google-internal import(s).
 from tensorflow.python.debug.lib import common
 from tensorflow.python.debug.wrappers import framework
@@ -140,14 +142,9 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
 
 def _signal_handler(unused_signal, unused_frame):
-  try:
-    input_func = raw_input
-  except NameError:
-    # Python 3 does not have raw_input.
-    input_func = input
-
   while True:
-    response = input_func("\nSIGINT received. Quit program? (Y/n): ").strip()
+    response = six.moves.input(
+        "\nSIGINT received. Quit program? (Y/n): ").strip()
     if response in ("", "Y", "y"):
       sys.exit(0)
     elif response in ("N", "n"):
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 6705cd31e291d2eab7aa8179e9b2b829f8970c18..5e4604fda4d7249a1244f12a533e1cb09e16782f 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -31,15 +31,18 @@ from tensorflow.python.training import session_run_hook
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s. Provides a substitute for
+  `tfdbg.LocalCLIDebugWrapperSession` in cases where the session is not directly
+  available.
   """
 
   def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
-      ui_type: (str) user-interface type.
+      ui_type: (`str`) requested user-interface type. Currently supported:
+        (curses | readline).
       dump_root: (`str`) optional path to the dump root directory. Must be a
         directory that does not exist or an empty directory. If the directory
         does not exist, it will be created by the debugger core during debug
@@ -153,8 +156,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
 class DumpingDebugHook(session_run_hook.SessionRunHook):
   """A debugger hook that dumps debug data to filesystem.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
@@ -229,8 +232,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
   When the arguments of debug_utils.watch_graph changes, strongly consider
   changing arguments here too so that features are available to tflearn users.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index 1465cb72950c8fa6a453ebd4290bbf6382173ff8..c8625655e51a43a222addedd4beecdd3515d7fb6 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -115,6 +115,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     #   unavailable (i.e., is None), the run-start CLI will be launched to ask
     #   the user. This is the case, e.g., right before the first run starts.
     self._active_tensor_filter = None
+    self._active_filter_exclude_node_names = None
     self._active_tensor_filter_run_start_response = None
     self._run_through_times = 1
     self._skip_debug = False
@@ -148,6 +149,15 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         type=str,
         default="",
         help="Run until a tensor in the graph passes the specified filter.")
+    ap.add_argument(
+        "-fenn",
+        "--filter_exclude_node_names",
+        dest="filter_exclude_node_names",
+        type=str,
+        default="",
+        help="When applying the tensor filter, exclude node with names "
+        "matching the regular expression. Applicable only if --tensor_filter "
+        "or -f is used.")
     ap.add_argument(
         "--node_name_filter",
         dest="node_name_filter",
@@ -324,9 +334,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       debug_dump.set_python_graph(self._sess.graph)
 
       passed_filter = None
+      passed_filter_exclude_node_names = None
       if self._active_tensor_filter:
         if not debug_dump.find(
-            self._tensor_filters[self._active_tensor_filter], first_n=1):
+            self._tensor_filters[self._active_tensor_filter], first_n=1,
+            exclude_node_names=self._active_filter_exclude_node_names):
           # No dumped tensor passes the filter in this run. Clean up the dump
           # directory and move on.
           self._remove_dump_root()
@@ -334,10 +346,14 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         else:
           # Some dumped tensor(s) from this run passed the filter.
           passed_filter = self._active_tensor_filter
+          passed_filter_exclude_node_names = (
+              self._active_filter_exclude_node_names)
           self._active_tensor_filter = None
+          self._active_filter_exclude_node_names = None
 
       self._prep_debug_cli_for_run_end(
-          debug_dump, request.tf_error, passed_filter)
+          debug_dump, request.tf_error, passed_filter,
+          passed_filter_exclude_node_names)
 
       self._run_start_response = self._launch_cli()
 
@@ -358,7 +374,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if os.path.isdir(self._dump_root):
       shutil.rmtree(self._dump_root)
 
-  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self,
+                                  debug_dump,
+                                  tf_error,
+                                  passed_filter,
+                                  passed_filter_exclude_node_names):
     """Prepare (but not launch) CLI for run-end, with debug dump from the run.
 
     Args:
@@ -368,6 +388,9 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         (if any).
       passed_filter: (None or str) Name of the tensor filter that just passed
         and caused the preparation of this run-end CLI (if any).
+      passed_filter_exclude_node_names: (None or str) Regular expression used
+        with the tensor filter to exclude ops with names matching the regular
+        expresssion.
     """
 
     if tf_error:
@@ -383,6 +406,9 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       if passed_filter is not None:
         # Some dumped tensor(s) from this run passed the filter.
         self._init_command = "lt -f %s" % passed_filter
+        if passed_filter_exclude_node_names:
+          self._init_command += (" --filter_exclude_node_names %s" %
+                                 passed_filter_exclude_node_names)
         self._title_color = "red_on_white"
 
     self._run_cli = analyzer_cli.create_analyzer_ui(
@@ -496,6 +522,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     parsed.op_type_filter = parsed.op_type_filter or None
     parsed.tensor_dtype_filter = parsed.tensor_dtype_filter or None
 
+    if parsed.filter_exclude_node_names and not parsed.till_filter_pass:
+      raise ValueError(
+          "The --filter_exclude_node_names (or -feon) flag is valid only if "
+          "the --till_filter_pass (or -f) flag is used.")
+
     if parsed.profile:
       raise debugger_cli_common.CommandLineExit(
           exit_token=framework.OnRunStartResponse(
@@ -525,6 +556,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       if parsed.till_filter_pass in self._tensor_filters:
         action = framework.OnRunStartAction.DEBUG_RUN
         self._active_tensor_filter = parsed.till_filter_pass
+        self._active_filter_exclude_node_names = (
+            parsed.filter_exclude_node_names)
         self._active_tensor_filter_run_start_response = run_start_response
       else:
         # Handle invalid filter name.
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 490812c96d83791cdc20c56f16c968f1a1851af8..b06fa26a935b42709575f8e400e0bda951ffbbc7 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -87,7 +87,11 @@ class LocalCLIDebuggerWrapperSessionForTest(
   def _prep_cli_for_run_start(self):
     pass
 
-  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self,
+                                  debug_dump,
+                                  tf_error,
+                                  passed_filter,
+                                  passed_filter_exclude_op_names):
     self.observers["debug_dumps"].append(debug_dump)
     self.observers["tf_errors"].append(tf_error)
 
@@ -451,6 +455,36 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"]))
     self.assertEqual([None, None], wrapped_sess.observers["tf_errors"])
 
+  def testRunTillFilterPassesWithExcludeOpNames(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run", "-f", "greater_than_twelve",
+          "--filter_exclude_node_names", "inc_v.*"],
+         ["run"], ["run"]],
+        self.sess,
+        dump_root=self._tmp_dir)
+
+    def greater_than_twelve(datum, tensor):
+      del datum  # Unused.
+      return tensor > 12.0
+
+    # Verify that adding the same tensor filter more than once is tolerated
+    # (i.e., as if it were added only once).
+    wrapped_sess.add_tensor_filter("greater_than_twelve", greater_than_twelve)
+
+    # run five times.
+    wrapped_sess.run(self.inc_v)
+    wrapped_sess.run(self.inc_v)
+    wrapped_sess.run(self.inc_v)
+    wrapped_sess.run(self.inc_v)
+
+    self.assertAllClose(14.0, self.sess.run(self.v))
+
+    self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"])
+
+    # Due to the --filter_exclude_op_names flag, the run-end CLI should show up
+    # not after run 3, but after run 4.
+    self.assertEqual([4], wrapped_sess.observers["run_end_cli_run_numbers"])
+
   def testRunTillFilterPassesWorksInConjunctionWithOtherNodeNameFilter(self):
     """Test that --.*_filter flags work in conjunction with -f.
 
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index ab81d40148476735492890f608315b19eaa0a33f..b3268c9047e264b8264ae37b404b51be6a88962f 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -42,7 +42,6 @@ py_library(
         ":backprop",
         ":context",
         ":core",
-        ":custom_gradient",
         ":execute",
         ":function",
         ":graph_callable",
@@ -103,10 +102,10 @@ cuda_py_test(
     additional_deps = [
         ":backprop",
         ":context",
-        ":custom_gradient",
         ":test",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:resource_variable_ops",
@@ -143,6 +142,8 @@ cuda_py_test(
         ":tape",
         ":test",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
@@ -206,21 +207,6 @@ cc_library(
     ],
 )
 
-py_library(
-    name = "custom_gradient",
-    srcs = ["custom_gradient.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":context",
-        ":tape",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:util",
-    ],
-)
-
 py_library(
     name = "graph_only_ops",
     srcs = ["graph_only_ops.py"],
@@ -364,7 +350,6 @@ py_test(
     deps = [
         ":backprop",
         ":context",
-        ":custom_gradient",
         ":test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -415,21 +400,6 @@ py_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "imperative_grad",
     srcs = ["imperative_grad.py"],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 5c235382652811ff83ec800c0a28a3beccd45f0f..92774d4d50e00c85599ceaef1cc99bb062bd3ce3 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import functools
 import operator
 import threading
@@ -32,7 +31,6 @@ from tensorflow.python.eager import imperative_grad
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -41,26 +39,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
-
-
-class _TensorCache(object):
-  """Simple cache which evicts items based on length in a FIFO manner."""
-
-  def __init__(self, max_items=256):
-    self._data = collections.OrderedDict()
-    self._max_items = max_items if max_items else 256
-
-  def put(self, key, value):
-    self._data[key] = value
-
-    if len(self._data) > self._max_items:
-      self._data.popitem(last=False)
-
-  def get(self, key):
-    return self._data.get(key, None)
-
-  def flush(self):
-    self._data = {}
+from tensorflow.python.util.tf_export import tf_export
 
 
 _op_attr_type_cache = {}
@@ -70,12 +49,10 @@ def op_attr_type(op_type, attr_name):
   try:
     return _op_attr_type_cache[(op_type, attr_name)]
   except KeyError:
-    with errors.raise_exception_on_not_ok_status() as status:
-      h = context.context()._handle  # pylint: disable=protected-access
-      attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(
-          h, op_type, attr_name, status)
-    _op_attr_type_cache[(op_type, attr_name)] = attr_type
-    return attr_type
+    h = context.context()._handle  # pylint: disable=protected-access
+    attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(h, op_type, attr_name)
+  _op_attr_type_cache[(op_type, attr_name)] = attr_type
+  return attr_type
 
 
 def make_attr(attr_type, value):
@@ -106,6 +83,14 @@ class _MockOp(object):
         return make_attr(typ, self.attrs[i + 1])
     raise KeyError(attr)
 
+  def _get_control_flow_context(self):
+    raise NotImplementedError(
+        "tf.GradientTape.gradients() does not support graph control flow "
+        "operations like tf.cond or tf.while at this time. Use tf.gradients() "
+        "instead. If you need this feature, please file a feature request at "
+        "https://github.com/tensorflow/tensorflow/issues/new"
+    )
+
 
 def _magic_gradient_function(op_name, attr_tuple, num_inputs,
                              inputs, outputs, out_grads):
@@ -137,112 +122,6 @@ _gradient_functions_lock = threading.Lock()
 _tracing = False
 
 
-# TODO(apassos) replace this with a mechanism which can happen at the op
-# gradient function registration site, to be less error-prone
-# TODO(apassos) add ops other than those in nn_grad and math_grad
-_ops_which_dont_need_outputs = set([
-    "Identity",
-    "MatMul",
-    "Conv2DBackpropInput",
-    "Conv2DBackpropFilter",
-    "Conv3D",
-    "Conv3DBackpropInputV2",
-    "AvgPool3D",
-    "AvgPool3DGrad",
-    "MaxPool3D",
-    "MaxPool3DGrad",
-    "MaxPool3DGradGrad",
-    "BiasAdd",
-    "BiasAddV1",
-    "BiasAddGrad",
-    "Relu6",
-    "Softplus",
-    "SoftplusGrad",
-    "Softsign",
-    "ReluGrad",
-    "Conv2D",
-    "DepthwiseConv2dNative",
-    "Dilation2D",
-    "AvgPool",
-    "AvgPoolGrad",
-    "BatchNormWithGlobalNormalization",
-    "L2Loss",
-    "Sum",
-    "Prod",
-    "SegmentSum",
-    "SegmentMean",
-    "SparseSegmentSum",
-    "SparseSegmentMean",
-    "SparseSegmentSqrtN",
-    "SegmentMin",
-    "SegmentMax",
-    "UnsortedSegmentSum",
-    "UnsortedSegmentMax",
-    "UnsortedSegmentMin",
-    "UnsortedSegmentProd",
-    "Abs",
-    "Neg",
-    "ReciprocalGrad",
-    "Square",
-    "Expm1",
-    "Log",
-    "Log1p",
-    "TanhGrad",
-    "SigmoidGrad",
-    "Sign",
-    "Sin",
-    "Cos",
-    "Tan",
-    "Add",
-    "Sub",
-    "Mul",
-    "Div",
-    "RealDiv",
-    "Maximum",
-    "Minimum",
-    "SquaredDifference",
-    "Select",
-    "SparseMatMul",
-    "BatchMatMul",
-    "Complex",
-    "Real",
-    "Imag",
-    "Angle",
-    "Conj",
-    "Cast",
-    "Cross",
-    "Cumsum",
-    "Cumprod",
-    "ReadVariableOp",
-    "VarHandleOp",
-    "Shape",
-])
-
-_ops_which_dont_need_inputs = set([
-    "Identity",
-    "Softmax",
-    "LogSoftmax",
-    "BiasAdd",
-    "Relu",
-    "Elu",
-    "Selu",
-    "SparseSoftmaxCrossEntropyWithLogits",
-    "Neg",
-    "Inv",
-    "Reciprocal",
-    "Sqrt",
-    "Exp",
-    "Tanh",
-    "Sigmoid",
-    "Real",
-    "Imag",
-    "Conj",
-    "ReadVariableOp",
-    "VarHandleOp",
-    "Shape",
-])
-
-
 # TODO(agarwal): use an automatic mechanism for handling None arguments to
 # gradient functions.
 # Some gradient functions can accept None arguments for gradients. The following
@@ -261,57 +140,25 @@ _grad_fn_accepts_none_for_indices = {
 }
 
 
-def _record_gradient(op_name, inputs, attrs, results, name):
-  """Records gradients for a TensorFlow operation.
-
-  Args:
-    op_name: Name of the TensorFlow operation (see REGISTER_OP in C++ code) to
-      execute.
-    inputs: A flat list of Tensor object inputs to the operation.
-    attrs: A tuple with alternating string attr names and attr values for this
-      operation.
-    results: The results of the operation (as a flat list).
-    name: Customized name for the operation.
-
-  Returns:
-    A list of maybe-wrapped results. Either Tensors or TensorNodes.
-
-  Raises:
-    An exception on error.
-  """
-  if not tape.could_possibly_record():
-    return
-
-  if op_name in _ops_which_dont_need_outputs:
-    op_outputs = None
-  else:
-    # TODO(apassos) this line creates a weak circular reference where the
-    # backprop function keeps an output alive which in turn keeps the tape entry
-    # alive which keeps the backprop function alive. Figure out how to break
-    # this up without breaking second derivatives of ops like Exp whose
-    # gradients depend only on the outputs.
-    op_outputs = results
-
-  if op_name in _ops_which_dont_need_inputs:
-    op_inputs = None
-  else:
-    op_inputs = inputs
-
-  num_inputs = len(inputs)
+def _get_backward_fn(op_name, attrs, num_inputs, op_inputs, op_outputs):
 
   def grad_fn(*orig_outputs):
-    """Generated gradient function."""
     result = _magic_gradient_function(op_name, attrs, num_inputs,
                                       op_inputs, op_outputs, orig_outputs)
     if _tracing:
-      print("Gradient for", (name if name else op_name), "inputs", op_inputs,
-            "output_grads", orig_outputs, "gradients", result)
+      print("Gradient for", op_name, "inputs", op_inputs, "output_grads",
+            orig_outputs, "gradients", result)
     return nest.flatten(result)
 
-  tape.record_operation(op_name, results, inputs, grad_fn)
-  if _tracing:
-    print("Computed op", (name if name else op_name), "inputs", inputs,
-          "outputs", results)
+  return grad_fn
+
+
+pywrap_tensorflow.TFE_Py_RegisterBackwardFunctionGetter(_get_backward_fn)
+
+
+def _record_gradient(op_name, inputs, attrs, results, name):
+  return pywrap_tensorflow.TFE_Py_RecordGradient(op_name, inputs, attrs,
+                                                 results, name)
 
 
 execute.record_gradient = _record_gradient
@@ -321,8 +168,8 @@ def implicit_val_and_grad(f):
   """Returns a function which differentiates f with respect to variables.
 
   The wrapped function returns the value and the gradient of f when called with
-  the same arguments. The gradient is with respect to all TFE variables which
-  have `variable.watch()` called on them by f.
+  the same arguments. The gradient is with respect to all trainable TFE
+  variables accessed by `f`.
 
   This function is useful when the exact set of variables to differentiate with
   is not known ahead of time.
@@ -378,6 +225,7 @@ def implicit_val_and_grad(f):
       tape.pop_tape(this_tape)
     # Sorting variables by id, which is monotonically increasing in construction
     # order. This ensures unique order across executions.
+    # TODO(josh11b): Move the sort to the C++ implementation in pywrap_tfe_src.cc.
     variables = list(sorted(this_tape.watched_variables(),
                             key=lambda v: v.handle._id))  # pylint: disable=protected-access
     sources = [x.handle for x in variables]
@@ -398,8 +246,8 @@ def implicit_grad(f):
   """Returns a function which differentiates f with respect to variables.
 
   The wrapped function returns the gradient of f when called with the same
-  arguments. The gradient is with respect to all TFE variables which have
-  `variable.watch()` called on them by f.
+  arguments. The gradient is with respect to all trainable TFE variables
+  accessed by `f`.
 
   This function is useful when the exact set of variables to differentiate with
   is not known ahead of time.
@@ -639,7 +487,7 @@ def val_and_grad_function(f, params=None):
   return decorated
 
 
-def make_vjp(f, params=None):
+def make_vjp(f, params=None, persistent=True):
   """Returns a function that computes f and is vjp w.r.t. params.
 
   The term "vjp" here is an abbreviation for vector-jacobian product.
@@ -648,6 +496,8 @@ def make_vjp(f, params=None):
     f: the function to be differentiated.
     params: the parameters (numbers or names) to differentiate with respect to.
        A value of None will differentiate with respect to all parameters.
+    persistent: Boolean controlling whether the VJP function can be re-used.
+      Must be True or False.
 
   Returns:
     A function, which when called, returns a tuple (value, vjp), where:
@@ -675,7 +525,7 @@ def make_vjp(f, params=None):
     """Computes the value and gradient of the decorated function."""
     parameter_positions = _get_arg_spec(f, params, args)
     assert not kwds, "The gradient function can't take keyword arguments."
-    this_tape = tape.push_new_tape()
+    this_tape = tape.push_new_tape(persistent=persistent)
     try:
       sources = []
       args = [
@@ -757,7 +607,7 @@ def _num_elements(grad):
   raise ValueError("`grad` not a Tensor or IndexedSlices.")
 
 
-_zeros_cache = _TensorCache()
+_zeros_cache = context._TensorCache()  # pylint: disable=protected-access
 
 
 def _fast_fill(value, shape, dtype):
@@ -793,64 +643,62 @@ _default_vspace = imperative_grad.VSpace(
     ones=_ones)
 
 
+def _handle_or_self(x):
+  """If x is ResourceVariable, return its handle, else x."""
+  if isinstance(x, resource_variable_ops.ResourceVariable):
+    x = x.handle
+  return x
+
+
+@tf_export("GradientTape")
 class GradientTape(object):
-  """Records operations to use to compute gradients.
+  """Record operations for automatic differentiation.
 
-  Operations are recorded if:
-    - they happen in code marked by this context manager
-    - at least one of their inputs is being watched
+  Operations are recorded if they are executed within this context manager and
+  at least one of their inputs is being "watched".
 
-  Outputs of recorded operations are watched. Variables are automatically
-  watched and tensors can be manually watched by calling the watch method on the
-  context manager.
+  Trainable variables (created by `tf.contrib.eager.Variable` or
+  @{tf.get_variable}, trainable=True is default in both cases) are automatically
+  watched. Tensors can be manually watched by invoking the `watch` method on
+  this context manager.
 
-  Example usage:
+  For example, consider the function `y = x * x`. The gradient at `x = 3.0` can
+  be computed as:
 
   ```python
+  x = tf.constant(3.)
   with tfe.GradientTape() as g:
-    x = tf.constant(3.0)
     g.watch(x)
     y = x * x
-  grad = g.gradient(y, [x])[0]
-  assert grad.numpy() == 6.0
+  grad = g.gradient(y, [x])[0] # Will compute to 6.0
   ```
 
-  It is possible to use GradientTapes to compute higher-order derivatives as
-  follows:
+  GradientTapes can be nested to compute higher-order derivatives. For example,
 
   ```python
+  x = tf.constant(3.0)
   with tfe.GradientTape() as g:
-    x = tf.constant(3.0)
-    g.watch(x)
-    y = x * x
     with tfe.GradientTape() as gg:
-      gg.watch(y)
-      z = 2 * y
-    inner_grad = gg.gradient(z, [y])[0]
-    assert inner_grad.numpy() == 2
-    y = y + inner_grad
-  grad = g.gradient(y, [x])[0]
-  assert grad.numpy() == 6.0
+      gg.watch(x)
+      y = x * x
+    dy_dx = gg.gradient(y, [x])[0]     # Will compute to 6.0
+  d2y_dx2 = g.gradient(dy_dx, [x])[0]  # Will compute to 2.0
   ```
 
   By default, the resources held by a GradientTape are released as soon as
-  GradientTape.gradient() method is called. However, if one need to compute
-  multiple gradients over the same computation, she can create a persistent
-  GradientTape. Persistent tapes allow multiple calls to the gradient() method
-  and release resources when the tape object is destructed.
-
-  Example usage:
+  GradientTape.gradient() method is called. To compute multiple gradients over
+  the same computation, create a persistent gradient tape. This allows multiple
+  calls to the gradient() method as resources are released when the tape object
+  is garbage collected. For example:
 
   ```python
+  x = tf.constant(3.0)
   with tfe.GradientTape(persistent=True) as g:
-    x = tf.constant(3.0)
     g.watch(x)
     y = x * x
     z = y * y
-  dz_dx = g.gradient(z, [x])[0]
-  assert dz_dx.numpy() == 108.0   # 4*x^3 at x = 3
-  dy_dx = g.gradient(y, [x])[0]
-  assert dy_dx.numpy() == 6.0
+  dy_dx = g.gradient(z, [x])[0]  # 6.0
+  dz_dx = g.gradient(y, [x])[0]  # 108.0 (4*x^3 at x = 3)
   del g  # Drop the reference to the tape
   """
 
@@ -859,8 +707,8 @@ class GradientTape(object):
 
     Args:
       persistent: Boolean controlling whether a persistent gradient tape
-        is created. Must be True or False.
-
+        is created. False by default, which means at most one call can
+        be made to the gradient() method on this object.
     """
     self._tape = None
     self._persistent = persistent
@@ -876,44 +724,50 @@ class GradientTape(object):
     """Ensures that `tensor` is being traced by this tape.
 
     Args:
-      tensor: a Tensor or Variable a list of Tensors or Variables.
+      tensor: a Tensor or list of Tensors.
     """
     for t in nest.flatten(tensor):
-      if isinstance(t, resource_variable_ops.ResourceVariable):
-        t = t.handle
-      tape.watch(t)
+      tape.watch(_handle_or_self(t))
 
   def watched_variables(self):
-    return self._tape.watched_variables()
+    # Sorting variables by id, which is monotonically increasing in construction
+    # order. This ensures unique order across executions.
+    # TODO(josh11b): Move the sort to the C++ implementation in pywrap_tfe_src.cc.
+    return list(sorted(self._tape.watched_variables(),
+                       key=lambda v: v.handle._id))  # pylint: disable=protected-access
 
   def gradient(self, target, sources, output_gradients=None):
-    """Computes the gradient using information traced by the tape.
+    """Computes the gradient using operations recorded in context of this tape.
 
     Args:
-      target: the tensor to be differentiated.
-      sources: a list of Tensors or Variables, the target will be
-       differentiated with respect to the sources.
+      target: Tensor to be differentiated.
+      sources: a list or nested structure of Tensors or Variables. `target`
+        will be differentiated against elements in `sources`.
       output_gradients: a list of gradients, one for each element of
-       target. Defaults to None.
+        target. Defaults to None.
 
     Returns:
-      a list of Tensors (or IndexedSlices, or None), one for each element in
-      `sources`.
+      a list or nested structure of Tensors (or IndexedSlices, or None),
+      one for each element in `sources`. Returned structure is the same as
+      the structure of `sources`.
 
     Raises:
       RuntimeError: if called inside the context of the tape, or if called more
-       than once.
+       than once on a non-persistent tape.
     """
     if self._tape is None:
       raise RuntimeError("GradientTape.gradient can only be called once "
                          "on non-persistent tapes, and "
                          "only when the context manager has exited.")
-    sources = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
-               else x
-               for x in sources]
-    grad = imperative_grad.imperative_grad(
-        _default_vspace, self._tape, [target], sources,
+    flat_sources = nest.flatten(sources)
+    flat_sources = [_handle_or_self(x) for x in flat_sources]
+
+    flat_grad = imperative_grad.imperative_grad(
+        _default_vspace, self._tape, [target], flat_sources,
         output_gradients=output_gradients)
+
     if not self._persistent:
       self._tape = None
+
+    grad = nest.pack_sequence_as(sources, flat_grad)
     return grad
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index a12113893ab3eac671e8138472bc95e9d8b89499..991b4dbe7a688c8f6dc6420b6d6b7f7158d6bf86 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -23,7 +23,6 @@ import numpy as np
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import custom_gradient
 from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
@@ -32,6 +31,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
@@ -115,6 +116,19 @@ class BackpropTest(test.TestCase):
     with self.assertRaises(RuntimeError):
       backprop.gradients_function(f)(constant_op.constant(1.0))
 
+  def testGradientsFunctionInCustomGradient(self):
+
+    @custom_gradient.custom_gradient
+    def f(x):
+      (y,) = backprop.gradients_function(lambda x: x * x)(x)
+
+      def grad(dy):
+        return [2 * dy]
+
+      return y, grad
+
+    self.assertAllEqual(f(1.0), 2.0)
+
   def testImplicitGradOverEmbeddingLookup(self):
     batch_size = 8
     embedding_size = 512
@@ -182,6 +196,19 @@ class BackpropTest(test.TestCase):
     g, = backprop.gradients_function(loss, [0])(logits, labels)
     self.assertAllEqual(g.numpy(), [[-0.5, 0.5]])
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientWithinTapeBlock(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    self.evaluate(v1.initializer)
+    with backprop.GradientTape() as t:
+      loss = 2 * v1
+      with self.assertRaises(RuntimeError):
+        t.gradient(loss, [v1])
+    with backprop.GradientTape(persistent=True) as t:
+      loss = 2 * v1
+      grad = t.gradient(loss, [v1])
+    self.assertAllEqual(self.evaluate(grad[0]), 2.0)
+
   @test_util.assert_no_new_tensors
   def testSecondGrad(self):
 
@@ -205,11 +232,22 @@ class BackpropTest(test.TestCase):
     def f(x):
       return x * x
 
-    wrapped_fn = backprop.make_vjp(f)
+    wrapped_fn = backprop.make_vjp(f, persistent=False)
     result, vjp = wrapped_fn(constant_op.constant(3.0))
     self.assertAllEqual(result, 9.0)
     self.assertAllEqual(vjp(2.0)[0], 12.0)
 
+  def testPersistentMakeVJP(self):
+
+    def f(x):
+      return x * x
+
+    wrapped_fn = backprop.make_vjp(f, persistent=True)
+    _, vjp = wrapped_fn(constant_op.constant(3.0))
+    vjp_result1 = vjp(2.0)[0]
+    vjp_result2 = vjp(2.0)[0]
+    self.assertAllEqual(vjp_result1, vjp_result2, 12.0)
+
   @test_util.assert_no_new_tensors
   def testGradGrad(self):
 
@@ -332,6 +370,54 @@ class BackpropTest(test.TestCase):
     self.assertEqual(backprop.implicit_grad(f)()[0][0], None)
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeRepeatedSource(self):
+    with backprop.GradientTape(persistent=False) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = 2 * x
+    grad = g.gradient(target=y, sources=[x, x])
+    self.assertEqual(self.evaluate(grad), [2.0, 2.0])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testPersistentGradientTapeRepeatedSource(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      y = constant_op.constant(5.0)
+      g.watch(x)
+      g.watch(y)
+      z = x * x + x * y
+    grad = g.gradient(target=z, sources=[x, x])
+    self.assertEqual(self.evaluate(grad), [11.0, 11.0])
+    grad = g.gradient(target=z, sources=[y, x])
+    self.assertEqual(self.evaluate(grad), [3.0, 11.0])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeStructure(self):
+    with backprop.GradientTape(persistent=True) as g:
+      # Using different constant values because constant tensors are
+      # cached, leading to a different gradient then what one might expect.
+      x1 = constant_op.constant(3.0)
+      x2 = constant_op.constant(3.1)
+      x3 = constant_op.constant(3.2)
+      g.watch(x1)
+      g.watch(x2)
+      g.watch(x3)
+      y = x1  + 2 * x2  + 3 * x3
+    self.assertEqual(self.evaluate(g.gradient(y, x1)), [1.0])
+    self.assertEqual(self.evaluate(g.gradient(y, (x1,))), (1.0,))
+    self.assertEqual(self.evaluate(g.gradient(y, (x1, x2))), (1.0, 2.0))
+    self.assertEqual(self.evaluate(g.gradient(y, [(x1, x2), (x2, x3)])),
+                     [(1.0, 2.0), (2.0, 3.0)])
+    self.assertEqual(self.evaluate(g.gradient(y, (x1, x2, [x1, x3]))),
+                     (1.0, 2.0, [1.0, 3.0]))
+    self.assertEqual(self.evaluate(g.gradient(y, [x1, {'x2': x2, 'x3': x3}])),
+                     [1.0, {'x2': 2.0, 'x3': 3.0}])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testGradientTape(self):
     with backprop.GradientTape() as g:
       x = constant_op.constant(3.0)
@@ -341,10 +427,53 @@ class BackpropTest(test.TestCase):
         gg.watch(y)
         z = 2 * y
       inner_grad = gg.gradient(z, [y])[0]
-      self.assertEqual(inner_grad.numpy(), 2.0)
+      self.assertEqual(self.evaluate(inner_grad), 2.0)
       y += inner_grad
     grad = g.gradient(y, [x])[0]
-    self.assertEqual(grad.numpy(), 6.0)
+    self.assertEqual(self.evaluate(grad), 6.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeWithCond(self):
+    x = constant_op.constant(3.0)
+
+    def true_fn():
+      return x
+
+    def false_fn():
+      return x * x
+
+    with backprop.GradientTape() as g:
+      g.watch(x)
+      y = control_flow_ops.cond(x < x, true_fn, false_fn)
+
+    if not context.executing_eagerly():
+      with self.assertRaisesRegexp(NotImplementedError, 'tf.gradients'):
+        dy = g.gradient(y, [x])[0]
+    else:
+      dy = g.gradient(y, [x])[0]
+      self.assertEqual(self.evaluate(dy), 6.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeWithWhileLoop(self):
+    i = constant_op.constant(1)
+    x = constant_op.constant(2.)
+
+    def cond(i, _):
+      return i < 3
+
+    def body(i, x):
+      return i + 1, x * 2
+
+    with backprop.GradientTape() as g:
+      g.watch([x])
+      _, y = control_flow_ops.while_loop(cond, body, [i, x])
+
+    if not context.executing_eagerly():
+      with self.assertRaisesRegexp(NotImplementedError, 'tf.gradients'):
+        dy = g.gradient(y, [x])[0]
+    else:
+      dy = g.gradient(y, [x])[0]
+      self.assertEqual(self.evaluate(dy), 4.0)
 
   @test_util.assert_no_new_tensors
   def testGradientTapeGradientCalledMultipleTimes(self):
@@ -359,6 +488,7 @@ class BackpropTest(test.TestCase):
       g.gradient(y, [x])
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testPersistentTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -366,12 +496,13 @@ class BackpropTest(test.TestCase):
       y = x * x
       z = y * y
     dz_dx = g.gradient(z, [x])[0]
-    self.assertEqual(dz_dx.numpy(), 4*3*3*3)
+    self.assertEqual(self.evaluate(dz_dx), 4 * 3 * 3 * 3)
     dy_dx = g.gradient(y, [x])[0]
-    self.assertEqual(dy_dx.numpy(), 2*3)
+    self.assertEqual(self.evaluate(dy_dx), 2 * 3)
     del g
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testPersistentNestedTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -382,22 +513,24 @@ class BackpropTest(test.TestCase):
         z = 2 * y
       for _ in range(2):
         inner_grad = gg.gradient(z, [y])[0]
-        self.assertEqual(inner_grad.numpy(), 2.0)
+        self.assertEqual(self.evaluate(inner_grad), 2.0)
       y += inner_grad
       del gg
     grad = g.gradient(y, [x])[0]
-    self.assertEqual(grad.numpy(), 6.0)
+    self.assertEqual(self.evaluate(grad), 6.0)
     grad = g.gradient(z, [x])[0]
-    self.assertEqual(grad.numpy(), 12.0)
+    self.assertEqual(self.evaluate(grad), 12.0)
     del g
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testGradientTapeVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
+    self.evaluate(v.initializer)
     with backprop.GradientTape() as g:
       y = v * v
     grad = g.gradient(y, [v])[0]
-    self.assertAllEqual(grad, 2.0)
+    self.assertAllEqual(self.evaluate(grad), 2.0)
 
   @test_util.assert_no_new_tensors
   def testEmptyParamsForValueAndGradFunction(self):
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index b56cbe80a7ab6b90d715187b0f0a44847038fc37..3aad4a114a710280b5046666256b6b43dc0d5523 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -35,7 +35,6 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
-from tensorflow.python.eager import execute
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
@@ -56,11 +55,11 @@ def c_tfe_py_fastpath_execute(a,
                               transpose_b=False,
                               name=None):
   ctx = context.context()
-  assert not ctx.in_graph_mode(
+  assert ctx.executing_eagerly(
   ), "The prototype doesn't contain C code for graph construction"
   try:
     return pywrap_tensorflow.TFE_Py_FastPathExecute(
-        ctx._handle, ctx.device_name, "MatMul", execute.record_gradient, name,
+        ctx._handle, ctx.device_name, "MatMul", name,
         ctx._post_execution_callbacks, a, b, "transpose_a", transpose_a,
         "transpose_b", transpose_b)
   except core._NotOkStatusException as e:
@@ -83,16 +82,24 @@ class MicroBenchmarks(test.Benchmark):
     self._num_iters_2_by_2 = 30000
     self._num_iters_100_by_784 = 1000
 
-  def _run(self, func, num_iters):
+  def _run(self, func, num_iters, execution_mode=None):
     # call func to maybe warm up the GPU
-    func()
-    start = time.time()
-    for _ in xrange(num_iters):
+    ctx = context.context()
+    with ctx.execution_mode(execution_mode):
       func()
-    end = time.time()
-    mean_us = (end - start) * 1e6 / num_iters
-    self.report_benchmark(iters=num_iters, wall_time=mean_us,
-                          extras={"examples_per_sec": num_iters/(end-start)})
+      if execution_mode == context.ASYNC:
+        ctx.async_wait()
+      start = time.time()
+      for _ in xrange(num_iters):
+        func()
+      if execution_mode == context.ASYNC:
+        ctx.async_wait()
+      end = time.time()
+      mean_us = (end - start) * 1e6 / num_iters
+      self.report_benchmark(
+          iters=num_iters,
+          wall_time=mean_us,
+          extras={"examples_per_sec": num_iters / (end - start)})
 
   def benchmark_create_np_array(self):
     func = lambda: np.array([3.0])
@@ -194,6 +201,9 @@ class MicroBenchmarks(test.Benchmark):
     m = self._m_2
     self._run(lambda: gen_array_ops.identity(m), 30000)
 
+  def benchmark_slowpath_tf_identity(self):
+    self._run(lambda: gen_array_ops.identity(1), 30000)
+
   def benchmark_tfe_py_execute_identity(self):
     m = self._m_2
     ctx_handle = context.context()._handle
@@ -207,10 +217,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_identity(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
+          30000)
 
   def benchmark_tf_gradient_forward_identity(self):
     with backprop.GradientTape() as tape:
@@ -226,10 +237,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_no_op(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(lambda x: x, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(lambda x: x, [0])(m),
+          30000)
 
   def _benchmark_np_matmul(self, m, transpose_b, num_iters):
     a = m.cpu().numpy()
@@ -237,13 +249,15 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: np.dot(a, b)
     self._run(func, num_iters)
 
-  def _benchmark_tf_matmul(self, m, transpose_b, num_iters):
+  def _benchmark_tf_matmul(self, m, transpose_b, num_iters,
+                           execution_mode=None):
     func = lambda: math_ops.matmul(m, m, transpose_b=transpose_b)
-    self._run(func, num_iters)
+    self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_gen_math_ops_matmul(self, m, transpose_b, num_iters):
     def func():
-      gen_math_ops._mat_mul(m, m, transpose_b=transpose_b)
+      gen_math_ops.mat_mul(m, m, transpose_b=transpose_b)
+
     self._run(func, num_iters)
 
   def _benchmark_tfe_py_fastpath_execute_matmul(self, m, transpose_b,
@@ -259,22 +273,37 @@ class MicroBenchmarks(test.Benchmark):
     # pylint: disable=protected-access
     ctx_handle = context.context()._handle
     # pylint: enable=protected-access
+    device = context.context().device_name
     attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
              m.dtype.as_datatype_enum)
     def func():
-      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "MatMul", inputs,
-                                       attrs, 1)
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul",
+                                       inputs, attrs, 1)
 
     self._run(func, num_iters)
 
-  def _benchmark_defun_matmul(self, m, transpose_b, num_iters):
+  def _benchmark_defun_matmul(self,
+                              m,
+                              transpose_b,
+                              num_iters,
+                              execution_mode=None):
     f = function.defun(math_ops.matmul)
     func = lambda: f(m, m, transpose_b)
-    self._run(func, num_iters)
+    self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_read_variable(self, m, num_iters):
     self._run(m.value, num_iters)
 
+  def _benchmark_matmul_read_variable(self, m, num_iters):
+    self._benchmark_gen_math_ops_matmul(
+        m, transpose_b=False, num_iters=num_iters)
+
+  def _benchmark_matmul_read_variable_with_tape(self, m, num_iters):
+    with backprop.GradientTape() as tape:
+      tape.watch(m)
+      self._benchmark_gen_math_ops_matmul(
+          m, transpose_b=False, num_iters=num_iters)
+
   def _benchmark_read_variable_with_tape(self, m, num_iters):
     with backprop.GradientTape() as tape:
       tape.watch(m)
@@ -291,6 +320,15 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_tf_matmul_2_by_2_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_2_by_2_CPU(self):
     with context.device(CPU):
       m = self._m_2_by_2.cpu()
@@ -315,6 +353,15 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_defun_matmul_2_by_2_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_defun_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_tf_matmul_2_by_2_GPU(self):
     if not context.num_gpus():
       return
@@ -323,6 +370,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_tf_matmul_2_by_2_GPU_async(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_2_by_2_GPU(self):
     if not context.num_gpus():
       return
@@ -347,6 +405,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_defun_matmul_2_by_2_GPU_async(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_defun_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   # Benchmarks for AA.T, A of dimension 100 by 784.
   def benchmark_np_matmul_100_by_784(self):
     self._benchmark_np_matmul(
@@ -360,6 +429,15 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_tf_matmul_100_by_784_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=True,
+          num_iters=self._num_iters_100_by_784,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_100_by_784_CPU(self):
     with context.device(CPU):
       m = self._m_100_by_784.cpu()
@@ -392,6 +470,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_tf_matmul_100_by_784_GPU_async(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_100_by_784.gpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=True,
+          num_iters=self._num_iters_100_by_784,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_100_by_784_GPU(self):
     if not context.num_gpus():
       return
@@ -416,6 +505,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_matmul_read_variable_op_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_matmul_read_variable(m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_matmul_read_variable_op_with_tape_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_matmul_read_variable_with_tape(
+          m, num_iters=self._num_iters_2_by_2)
+
   def benchmark_read_variable_op_2_by_2_CPU(self):
     with context.device(CPU):
       m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 07652d3e02b6364e23b6579a64dcadf02dc5eb99..9e146f021e813886b42ca72b07122b485901a24b 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -28,10 +28,10 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
 
 GRAPH_MODE = 0
 EAGER_MODE = 1
@@ -52,6 +52,28 @@ DEVICE_PLACEMENT_WARN = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_WARN
 DEVICE_PLACEMENT_SILENT = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT
 DEVICE_PLACEMENT_SILENT_FOR_INT32 = (
     pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32)
+SYNC = 0
+ASYNC = 1
+
+
+class _TensorCache(object):
+  """Simple cache which evicts items based on length in a FIFO manner."""
+
+  def __init__(self, max_items=256):
+    self._data = collections.OrderedDict()
+    self._max_items = max_items if max_items else 256
+
+  def put(self, key, value):
+    self._data[key] = value
+
+    if len(self._data) > self._max_items:
+      self._data.popitem(last=False)
+
+  def get(self, key):
+    return self._data.get(key, None)
+
+  def flush(self):
+    self._data = {}
 
 
 # TODO(agarwal): better name ?
@@ -60,32 +82,44 @@ class _EagerContext(threading.local):
 
   def __init__(self):
     super(_EagerContext, self).__init__()
-    self.device_spec = pydev.DeviceSpec.from_string(
-        "/job:localhost/replica:0/task:0/device:CPU:0")
+    self.device_spec = pydev.DeviceSpec.from_string("")
     self.device_name = self.device_spec.to_string()
     self.mode = _default_mode
+    self.is_eager = _default_mode == EAGER_MODE
     self.scope_name = ""
     self.recording_summaries = False
     self.summary_writer_resource = None
     self.scalar_cache = {}
+    self.ones_rank_cache = _TensorCache()
+    self.execution_mode = None
 
 
-ContextStackEntry = collections.namedtuple(
-    "ContextStackEntry", ["is_building_function", "enter_context_fn"])
+ContextSwitch = collections.namedtuple(
+    "ContextSwitch", ["is_building_function", "enter_context_fn"])
 
 
-class ContextStack(threading.local):
+# `_ContextSwitchStack` is a `threading.local` to match the semantics of
+# ``DefaultGraphStack`, which is also a `threading.local`.
+class _ContextSwitchStack(threading.local):
   """A thread-local stack of context switches."""
 
-  def __init__(self):
-    super(ContextStack, self).__init__()
+  def __init__(self, eager):
+    super(_ContextSwitchStack, self).__init__()
     self.stack = []
+    if eager:
+      # Initialize the stack with a pointer to enter the eager context; this
+      # ensures that the fact that eager execution was enabled is propagated
+      # across threads, since (1) `enable_eager_execution` modifies a
+      # process-level flag (`_default_mode`) and (2) `__init__` is called each
+      # time a threading.local object is used in a separate thread.
+      self.push(is_building_function=False, enter_context_fn=eager_mode)
 
   def push(self, is_building_function, enter_context_fn):
     """Push metadata about a context switch onto the stack.
 
     A context switch can take one of two forms: installing a graph as the
-    default graph, or entering the eager context.
+    default graph, or entering the eager context. For each context switch,
+    we record whether or not the entered context is building a function.
 
     Args:
       is_building_function: (bool.) Whether the context is building a function.
@@ -94,7 +128,7 @@ class ContextStack(threading.local):
     """
 
     self.stack.append(
-        ContextStackEntry(is_building_function, enter_context_fn))
+        ContextSwitch(is_building_function, enter_context_fn))
 
   def pop(self):
     """Pop the stack."""
@@ -102,34 +136,49 @@ class ContextStack(threading.local):
     self.stack.pop()
 
 
-context_stack = ContextStack()
-
-
 # TODO(agarwal): rename to EagerContext / EagerRuntime ?
 # TODO(agarwal): consider keeping the corresponding Graph here.
 class Context(object):
   """Environment in which eager operations execute."""
 
-  def __init__(self, config=None, device_policy=None):
+  # TODO(agarwal): create and link in some documentation for `execution_mode`.
+  # pylint: disable=redefined-outer-name
+  def __init__(self, config=None, device_policy=None, execution_mode=None):
     """Creates a new Context.
 
     Args:
       config: (Optional.) A `ConfigProto` protocol buffer with configuration
-       options for the Context. Note that a lot of these options may be
-       currently unimplemented or irrelevant when eager execution is enabled.
+        options for the Context. Note that a lot of these options may be
+        currently unimplemented or irrelevant when eager execution is enabled.
       device_policy: (Optional.) What policy to use when trying to run an
-       operation on a device with inputs which are not on that device.
-       Valid values:
-         tfe.DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is not
-           correct.
-         tfe.DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
+         operation on a device with inputs which are not on that device.
+         When set to None, an appropriate value will be picked automatically.
+         The value picked may change between TensorFlow releases.
+
+         Defaults to tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32.
+         Valid values:
+         - tfe.DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is
+           not correct.
+         - tfe.DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
            right device but raises a warning.
-         tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
+         - tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
            hide performance problems.
-         tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+         - tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
            raising errors on the other ones.
+      execution_mode: (Optional.) Policy controlling how operations dispatched
+        are actually executed. When set to None, an appropriate value will be
+        picked automatically. The value picked may change between TensorFlow
+        releases.
+        Valid values:
+        - tf.contrib.eager.SYNC: executes each operation synchronously.
+        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+          operations may return "non-ready" handles.
+
+    Raises:
+     ValueError: If execution_mode is not valid.
     """
     self._eager_context = _EagerContext()
+    self._context_switches = _ContextSwitchStack(self.executing_eagerly())
     self._context_handle = None
     self._context_devices = None
     self._post_execution_callbacks = []
@@ -137,6 +186,14 @@ class Context(object):
     self._seed = None
     self._initialize_lock = threading.Lock()
     self._device_policy = device_policy
+    if execution_mode not in (None, SYNC, ASYNC):
+      raise ValueError(
+          "execution_mode should be None/SYNC/ASYNC. Got %s" % execution_mode)
+    if execution_mode is None:
+      execution_mode = SYNC
+    self._execution_mode = execution_mode
+
+  # pylint: enable=redefined-outer-name
 
   def _set_global_seed(self, seed):
     """Set a global eager mode seed for random ops."""
@@ -166,32 +223,27 @@ class Context(object):
       assert self._context_devices is None
       opts = pywrap_tensorflow.TFE_NewContextOptions()
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._config is not None:
-            config_str = self._config.SerializeToString()
-            pywrap_tensorflow.TFE_ContextOptionsSetConfig(
-                opts, config_str, len(config_str), status)
-          if self._device_policy is not None:
-            pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
-                opts, self._device_policy)
-          self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
+        if self._config is not None:
+          config_str = self._config.SerializeToString()
+          pywrap_tensorflow.TFE_ContextOptionsSetConfig(opts, config_str)
+        if self._device_policy is not None:
+          pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
+              opts, self._device_policy)
+        if self._execution_mode == ASYNC:
+          pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
+        self._context_handle = pywrap_tensorflow.TFE_NewContext(opts)
       finally:
         pywrap_tensorflow.TFE_DeleteContextOptions(opts)
       # Store list of devices
       self._context_devices = []
-      with errors.raise_exception_on_not_ok_status() as status:
-        device_list = pywrap_tensorflow.TFE_ContextListDevices(
-            self._context_handle, status)
+      device_list = pywrap_tensorflow.TFE_ContextListDevices(
+          self._context_handle)
       try:
         self._num_gpus = 0
         for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_name = pywrap_tensorflow.TF_DeviceListName(
-                device_list, i, status)
+          dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
           self._context_devices.append(pydev.canonical_name(dev_name))
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_type = pywrap_tensorflow.TF_DeviceListType(
-                device_list, i, status)
+          dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
           if dev_type == "GPU":
             self._num_gpus += 1
 
@@ -228,30 +280,37 @@ class Context(object):
 
   @tf_contextlib.contextmanager
   def _mode(self, mode):
+    """A context manager to allow setting the mode to EAGER/GRAPH."""
     ctx = self._eager_context
     old_mode = ctx.mode
+    old_is_eager = ctx.is_eager
     ctx.mode = mode
+    ctx.is_eager = mode == EAGER_MODE
     if mode == EAGER_MODE:
-      context_stack.push(False, eager_mode)
+      # Entering graph mode does not provide us with sufficient information to
+      # record a context switch; graph-based context switches are only logged
+      # when a graph is registered as the default graph.
+      self.context_switches.push(False, eager_mode)
     try:
       yield
     finally:
+      ctx.is_eager = old_is_eager
       ctx.mode = old_mode
       if mode == EAGER_MODE:
-        context_stack.pop()
-
-  def in_graph_mode(self):
-    """Returns True if current thread is in GRAPH mode."""
-    return self._eager_context.mode == GRAPH_MODE
+        self.context_switches.pop()
 
-  def in_eager_mode(self):
-    """Returns True if current thread is in EAGER mode."""
-    return self._eager_context.mode == EAGER_MODE
+  def executing_eagerly(self):
+    """Returns True if current thread has eager executing enabled."""
+    return self._eager_context.is_eager
 
   def scalar_cache(self):
     """Per-device cache for scalars."""
     return self._eager_context.scalar_cache
 
+  def ones_rank_cache(self):
+    """Per-device cache for scalars."""
+    return self._eager_context.ones_rank_cache
+
   @property
   def scope_name(self):
     """Returns scope name for the current thread."""
@@ -335,6 +394,40 @@ class Context(object):
     """List of the names of devices available to execute operations."""
     return self._devices
 
+  def get_execution_mode(self):
+    mode = self._eager_context.execution_mode
+    if mode is None:
+      mode = self._execution_mode
+    return mode
+
+  def set_execution_mode(self, mode):
+    """Sets execution mode for current thread."""
+    if mode not in (None, SYNC, ASYNC):
+      raise ValueError(
+          "Execution mode should be None/SYNC/ASYNC. Got %s" % mode)
+    if mode is None:
+      mode = SYNC
+    self._eager_context.execution_mode = mode
+    pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle, mode == ASYNC)
+
+  @tf_contextlib.contextmanager
+  def execution_mode(self, mode):
+    """Context manager for setting execution mode for current thread."""
+    old_mode = self.get_execution_mode()
+    try:
+      self.set_execution_mode(mode)
+      yield
+    finally:
+      self.set_execution_mode(old_mode)
+
+  def async_wait(self):
+    """Waits for ops dispatched in ASYNC mode to finish."""
+    pywrap_tensorflow.TFE_ContextAsyncWait(self._handle)
+
+  def async_clear_error(self):
+    """Clears errors raised during ASYNC execution."""
+    pywrap_tensorflow.TFE_ContextAsyncClearError(self._handle)
+
   def num_gpus(self):
     """The number of GPUs available to execute operations."""
     self._initialize_handle_and_devices()
@@ -349,11 +442,9 @@ class Context(object):
     Args:
       fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunction(
-          self._handle,  # pylint: disable=protected-access
-          fn,
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunction(
+        self._handle,  # pylint: disable=protected-access
+        fn)
 
   def add_function_def(self, fdef):
     """Add a function definition to the context.
@@ -365,12 +456,10 @@ class Context(object):
       fdef: A FunctionDef protocol buffer message.
     """
     fdef_string = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunctionDef(
-          self._handle,  # pylint: disable=protected-access
-          fdef_string,
-          len(fdef_string),
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunctionDef(
+        self._handle,  # pylint: disable=protected-access
+        fdef_string,
+        len(fdef_string))
 
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
@@ -413,23 +502,19 @@ class Context(object):
     To retrieve the accumulated metadata call context.export_run_metadata()
     and to stop tracing call context.disable_run_metadata().
     """
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
 
   @tf_contextlib.contextmanager
   def device_policy(self, policy):
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-        self._context_handle)
+    handle = self._handle
+    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(handle)
     pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-        self._handle, policy)
+        handle, policy)
     try:
       yield
     finally:
       pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-          self._handle, old)
+          handle, old)
 
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
@@ -449,14 +534,18 @@ class Context(object):
     if not self._context_handle:
       return None
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TFE_ContextExportRunMetadata(
-            self._context_handle, buffer_, status)
+      pywrap_tensorflow.TFE_ContextExportRunMetadata(
+          self._context_handle, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     run_metadata = config_pb2.RunMetadata()
     run_metadata.ParseFromString(compat.as_bytes(proto_data))
     return run_metadata
 
+  @property
+  def context_switches(self):
+    """Returns a stack of context switches."""
+    return self._context_switches
+
 _context = None
 _context_lock = threading.Lock()
 
@@ -475,6 +564,10 @@ def context():
   return _context
 
 
+def context_safe():
+  return _context
+
+
 # TODO(agarwal): remove this.
 def get_default_context():
   """Same as context."""
@@ -498,23 +591,29 @@ def internal_operation_seed():
   return context()._internal_operation_seed()  # pylint: disable=protected-access
 
 
-def in_graph_mode():
-  """Returns True if current thread is in GRAPH mode for default context."""
-  return context().in_graph_mode()
+@tf_export("executing_eagerly")
+def executing_eagerly():
+  """Returns True if the current thread has eager execution enabled.
+
+  Eager execution is typically enabled via @{tf.enable_eager_execution},
+  but may also be enabled within the context of a Python function via
+  tf.contrib.eager.py_func.
+  """
+  return context().executing_eagerly()
 
 
 def in_eager_mode():
-  """Returns True if current thread is in EAGER mode for default context."""
-  return context().in_eager_mode()
+  """Use executing_eagerly() instead. This function will be removed."""
+  return executing_eagerly()
 
 
 def graph_mode():
-  """Context-manager to enable GRAPH mode for current thread."""
+  """Context-manager to disable eager execution for the current thread."""
   return context()._mode(GRAPH_MODE)  # pylint: disable=protected-access
 
 
 def eager_mode():
-  """Context-manager to enable EAGER mode for current thread."""
+  """Context-manager to enable eager execution for the current thread."""
   return context()._mode(EAGER_MODE)  # pylint: disable=protected-access
 
 
@@ -568,6 +667,26 @@ def list_devices():
   return context().devices()
 
 
+def set_execution_mode(mode):
+  """Sets execution mode for the current thread."""
+  context().set_execution_mode(mode)
+
+
+def execution_mode(mode):
+  """Context manager for setting execution mode for current thread."""
+  return context().execution_mode(mode)
+
+
+def async_wait():
+  """Waits for ops dispatched in ASYNC mode to finish."""
+  return context().async_wait()
+
+
+def async_clear_error():
+  """Clears errors raised during ASYNC execution mode."""
+  return context().async_clear_error()
+
+
 def num_gpus():
   """Get the number of available GPU devices.
 
@@ -607,4 +726,8 @@ def export_run_metadata():
 # (for example, enable_eager_execution in python/framework/ops.py),
 # but they do all import this file.  Note that IS_IN_GRAPH_MODE and
 # in_graph_mode are both parameterless functions.
-is_in_graph_mode.IS_IN_GRAPH_MODE = in_graph_mode
+def _tmp_in_graph_mode():
+  return not executing_eagerly()
+
+
+is_in_graph_mode.IS_IN_GRAPH_MODE = _tmp_in_graph_mode
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index c68e2f422eb81a915d4f941ffb920f221d9be250..3fabe7060e980423268eb6f52ab4043cc4a4847c 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -33,7 +33,10 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 def execute(op_name, num_outputs, inputs, attrs=None):
@@ -54,19 +57,27 @@ class TFETest(test_util.TensorFlowTestCase):
 
   def testContext(self):
     ctx = context.Context()
-    self.assertFalse(ctx.in_graph_mode())
-    self.assertTrue(ctx.in_eager_mode())
+    self.assertTrue(ctx.executing_eagerly())
 
     self.assertEqual('', ctx.scope_name)
     ctx.scope_name = 'foo'
     self.assertEqual('foo', ctx.scope_name)
 
+    self.assertEqual(context.SYNC, ctx.get_execution_mode())
+    ctx.set_execution_mode(context.ASYNC)
+    self.assertEqual(context.ASYNC, ctx.get_execution_mode())
+    ctx.set_execution_mode(context.SYNC)
+    self.assertEqual(context.SYNC, ctx.get_execution_mode())
+    with ctx.execution_mode(context.ASYNC):
+      self.assertEqual(context.ASYNC, ctx.get_execution_mode())
+    ctx.set_execution_mode(context.SYNC)
+    self.assertEqual(context.SYNC, ctx.get_execution_mode())
+
     self.assertIsNone(ctx.summary_writer_resource)
     ctx.summary_writer_resource = 'mock'
     self.assertEqual('mock', ctx.summary_writer_resource)
 
-    self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
-                     ctx.device_name)
+    self.assertEqual('', ctx.device_name)
     self.assertEqual(ctx.device_name, ctx.device_spec.to_string())
     with ctx.device('GPU:0'):
       self.assertEqual('/job:localhost/replica:0/task:0/device:GPU:0',
@@ -86,6 +97,14 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertTrue(has_cpu_device)
     del ctx
 
+  def testAsyncBasic(self):
+    ctx = context.Context(execution_mode=context.ASYNC)
+    has_cpu_device = False
+    for x in ctx.devices():
+      has_cpu_device = has_cpu_device or 'CPU' in x
+    self.assertTrue(has_cpu_device)
+    del ctx
+
   def testRunMetadata(self):
     context.enable_run_metadata()
     t = constant_op.constant(1.0)
@@ -97,22 +116,32 @@ class TFETest(test_util.TensorFlowTestCase):
     cpu_stats = step_stats.dev_stats[0]
     self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
                      cpu_stats.device)
-    self.assertEqual(len(cpu_stats.node_stats), 1)
-    self.assertEqual(cpu_stats.node_stats[0].node_name, 'Add')
+    self.assertGreaterEqual(len(cpu_stats.node_stats), 1)
 
-  def testContextStackContainsEagerMode(self):
-    # Eager execution has been enabled, and no other context
-    # switch has occurred, so `context_stack` should contain
-    # exactly one entry.
-    self.assertEqual(len(context.context_stack.stack), 1)
-    stack_entry = context.context_stack.stack[0]
+  def testShouldCopy(self):
+    if not context.context().num_gpus():
+      self.skipTest('No devices other than CPUs found')
+    with ops.device('gpu:0'):
+      x = constant_op.constant(1.0)
+    y = array_ops.identity(x)
+    # The value we're testing y.device against will depend on what the behavior
+    # of not explicitly specifying a device in the context is.  This behavior is
+    # subject to change (for example, in the future we may want to use GPUs, if
+    # available, when no device is explicitly provided)
+    self.assertEqual(y.device, '/job:localhost/replica:0/task:0/device:CPU:0')
+
+  def testContextSwitchStackContainsEagerMode(self):
+    # Eager execution has been enabled, and no other context switch has
+    # occurred, so `context_switches` should contain exactly one entry.
+    self.assertEqual(len(context.context().context_switches.stack), 1)
+    switch = context.context().context_switches.stack[0]
 
     # The entry should log that eager mode was entered.
-    self.assertIs(stack_entry.enter_context_fn, context.eager_mode)
+    self.assertIs(switch.enter_context_fn, context.eager_mode)
 
     # It is not possible to build a graph function when eager execution
     # is enabled; the stack entry should reflect this fact.
-    self.assertFalse(stack_entry.is_building_function)
+    self.assertFalse(switch.is_building_function)
 
   def testInt32GPU(self):
     if not context.context().num_gpus():
@@ -136,9 +165,9 @@ class TFETest(test_util.TensorFlowTestCase):
 
     def get_context_values(ctx):
       return [
-          ctx.in_graph_mode(),
-          ctx.in_eager_mode(), ctx.scope_name, ctx.summary_writer_resource,
-          ctx.device_name, ctx.num_gpus()
+          ctx.executing_eagerly(), ctx.scope_name, ctx.summary_writer_resource,
+          ctx.device_name,
+          ctx.num_gpus()
       ]
 
     def get_values(ctx, values):
@@ -169,6 +198,18 @@ class TFETest(test_util.TensorFlowTestCase):
         attrs=('T', x.dtype.as_datatype_enum))[0].cpu().numpy()
     self.assertEqual(3, result)
 
+  def testResourceTensorPlacement(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+
+    with context.device('gpu:0'):
+      v = resource_variable_ops.ResourceVariable(1.0)
+    with context.device('cpu:0'):
+      # Check that even though we specified the cpu device we'll run the read op
+      # in the device where the handle is.
+      self.assertAllEqual(
+          gen_resource_variable_ops.read_variable_op(v.handle, v.dtype), 1.0)
+
   def testCopyBetweenDevices(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -183,6 +224,23 @@ class TFETest(test_util.TensorFlowTestCase):
     with self.assertRaises(RuntimeError):
       x.gpu(context.context().num_gpus() + 1)
 
+  def testCopyBetweenDevicesAsync(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with context.execution_mode(context.ASYNC):
+      x = constant_op.constant([[1., 2.], [3., 4.]])
+      x = x.cpu()
+      x = x.gpu()
+      x = x.gpu()
+      x = x.cpu()
+      context.async_wait()
+
+    # Invalid device
+    with self.assertRaises(RuntimeError):
+      x.gpu(context.context().num_gpus() + 1)
+      context.async_wait()
+    context.async_clear_error()
+
   def testCopyScope(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -223,16 +281,49 @@ class TFETest(test_util.TensorFlowTestCase):
         attrs=('T', three.dtype.as_datatype_enum))[0]
     self.assertAllEqual(15, product)
 
+  def testExecuteBasicAsync(self):
+    with context.execution_mode(context.ASYNC):
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = execute(
+          b'Mul',
+          num_outputs=1,
+          inputs=[three, five],
+          attrs=('T', three.dtype.as_datatype_enum))[0]
+      self.assertAllEqual(15, product)
+    # Error: Invalid arguments
+    context.set_execution_mode(context.ASYNC)
+    with self.assertRaises(errors.InvalidArgumentError):
+      execute(
+          b'MatMul',
+          num_outputs=1,
+          inputs=[three, five],
+          attrs=('transpose_a', False, 'transpose_b', False, 'T',
+                 three.dtype.as_datatype_enum))
+      context.async_wait()
+    context.async_clear_error()
+    context.set_execution_mode(context.SYNC)
+
   def testExecuteTooManyNumOutputs(self):
     # num_outputs provided is 50, but only one output is produced.
-    # That should be okay.
     product = execute(
         b'Mul',
         num_outputs=50,
-        inputs=[constant_op.constant(3), constant_op.constant(5)],
+        inputs=[constant_op.constant(3),
+                constant_op.constant(5)],
         attrs=('T', dtypes.int32.as_datatype_enum))[0]
     self.assertAllEqual(15, product)
 
+  def testExecuteTooFewNumOutputs(self):
+    # num_outputs provided is 0, but one output is produced.
+    with self.assertRaises(errors.InvalidArgumentError):
+      _ = execute(
+          b'Mul',
+          num_outputs=0,
+          inputs=[constant_op.constant(3),
+                  constant_op.constant(5)],
+          attrs=('T', dtypes.int32.as_datatype_enum))[0]
+
   def testMatMulGPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -520,5 +611,62 @@ class TFETest(test_util.TensorFlowTestCase):
       self.assertIsInstance(t, ops.EagerTensor)
 
 
+class SendRecvTest(test_util.TensorFlowTestCase):
+
+  cpu_device = '/job:localhost/replica:0/task:0/device:CPU:0'
+
+  def _send(self, tensor, tensor_name, to_device):
+    return execute(
+        b'_Send', num_outputs=0, inputs=[tensor],
+        attrs=('T', tensor.dtype.as_datatype_enum,
+               'tensor_name', tensor_name,
+               'send_device', tensor.device,
+               'send_device_incarnation', 0,
+               'recv_device', to_device,
+               'client_terminated', True))
+
+  def _recv(self, dtype, tensor_name, from_device):
+    device_name = context.context().device_name
+    if not device_name:
+      device_name = self.cpu_device
+    return execute(
+        b'_Recv', num_outputs=1, inputs=[],
+        attrs=('tensor_type', dtype.as_datatype_enum,
+               'tensor_name', tensor_name,
+               'send_device', from_device,
+               'send_device_incarnation', 0,
+               'recv_device', device_name,
+               'client_terminated', False))[0]
+
+  def testBasic(self):
+    t0 = constant_op.constant(1.0)
+    t1 = constant_op.constant(2.0)
+    self._send(t0, 't0', self.cpu_device)
+    self._send(t1, 't1', self.cpu_device)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't0', self.cpu_device),
+        1.0)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't1', self.cpu_device),
+        2.0)
+
+  def testLocalCrossDevice(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    gpu_device_name = '/job:localhost/replica:0/task:0/device:GPU:0'
+    with ops.device('GPU:0'):
+      t0 = constant_op.constant(1.0)
+      self._send(t0, 't0', self.cpu_device)
+    with ops.device('cpu:0'):
+      self.assertAllEqual(
+          self._recv(dtypes.float32, 't0', gpu_device_name),
+          1.0)
+      self._send(constant_op.constant(2.0), 't1', gpu_device_name)
+    with ops.device('GPU:0'):
+      self.assertAllEqual(
+          self._recv(dtypes.float32, 't1', self.cpu_device),
+          2.0)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/custom_gradient.py b/tensorflow/python/eager/custom_gradient.py
deleted file mode 100644
index 05460ff9968312528d87f5fc2ad0495b4da2ad1a..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/custom_gradient.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Decorator to overrides the gradient for a function."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import ops as tf_ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.util import nest
-from tensorflow.python.util import tf_decorator
-
-
-def custom_gradient(f):
-  """Decorator to define a function with a custom gradient.
-
-  The input function is expected to return the tuple
-    (results, gradient_function).
-
-  The output function will return results while possibly recording the
-  gradient_function and inputs in the tape.
-
-  Args:
-    f: function to be decorated.
-
-  Returns:
-    decorated function.
-  """
-
-  def decorated(*args, **kwargs):
-    """Decorated function with custom gradient."""
-    if context.in_graph_mode():
-      if kwargs:
-        raise ValueError(
-            "custom_gradient in graph mode doesn't support keyword arguments.")
-      name = "CustomGradient-%s" % tf_ops.uid()
-      args = [tf_ops.convert_to_tensor(x) for x in args]
-      result, grad_fn = f(*args)
-      flat_result = nest.flatten(result)
-      all_tensors = flat_result + args
-
-      @tf_ops.RegisterGradient(name)
-      def internal_grad_fn(unused_op, *result_grads):  # pylint: disable=unused-variable
-        gradients = nest.flatten(grad_fn(*result_grads[:len(flat_result)]))
-        # Need to return one value per input to the IdentityN, so pad the
-        # gradients of the inputs of the custom_gradient function with the
-        # gradients of the outputs as well.
-        return ([None] * len(flat_result)) + gradients
-
-      with tf_ops.get_default_graph().gradient_override_map(
-          {"IdentityN": name}):
-        all_tensors = array_ops.identity_n(all_tensors)
-      return nest.pack_sequence_as(
-          structure=result, flat_sequence=all_tensors[:len(flat_result)])
-
-    input_tensors = [tf_ops.convert_to_tensor(x) for x in args]
-
-    with tape.stop_recording():
-      result, grad_fn = f(*args, **kwargs)
-      flat_result = nest.flatten(result)
-      # TODO(apassos) consider removing the identity below.
-      flat_result = [gen_array_ops.identity(x) for x in flat_result]
-
-    def actual_grad_fn(*outputs):
-      return nest.flatten(grad_fn(*outputs))
-
-    tape.record_operation(
-        f.__name__,
-        flat_result,
-        input_tensors,
-        actual_grad_fn)
-    flat_result = list(flat_result)
-    return nest.pack_sequence_as(result, flat_result)
-
-  return tf_decorator.make_decorator(f, decorated)
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 535361498a9dd33003d0479051e97d7ff2553067..9a082596535f51e5a4fb6cc2a11a4dd8a422ed44 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -253,7 +253,7 @@ def add_execution_callback(callback):
       `f(op_type, op_name, attrs, inputs, outputs)`.
       `op_type` is the type of the operation that was just executed (e.g.,
         `MatMul`).
-      `op_name` is the name of the operation that has was just executed. This
+      `op_name` is the name of the operation that was just executed. This
         name is set by the client who created the operation and can be `None` if
         it is unset.
       `attrs` contains the attributes of the operation as a `tuple` of
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 28f5289ffc0ace6f9b6cad7cdd1160a184f882c7..426ee4c215a899a98ce6f737bd21f041765665d6 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -20,8 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import contextlib
-import threading
 
 import numpy as np
 
@@ -32,35 +30,16 @@ from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import c_api_util
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
-# Thread-local storage for tfe Tensors which are referenced while evaluating a
-# graph-mode function.
-_scoped_captures = threading.local()
-# _scoped_captures.tensors is either None or a map from Tensor id to a pair
-# of a tfe tensor and its corresponding placeholder to pass as a function
-# argument. The value should be None unless we're in function definition
-# context.
-_scoped_captures.tensors = None
-
-
-@contextlib.contextmanager
-def capture_tensors(captures):
-  old = _scoped_captures.__dict__.get("tensors", None)
-  try:
-    _scoped_captures.tensors = captures
-    yield
-  finally:
-    _scoped_captures.tensors = old
-
 
 def capture_value(tensor_map, value, dtype, name):
   """Capture a value from outside the function, to pass in as an extra arg."""
@@ -69,23 +48,32 @@ def capture_value(tensor_map, value, dtype, name):
     captured_value = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
     if captured_value.dtype == dtypes_module.resource:
-      handle_data = value._handle_data  # pylint: disable=protected-access
-      captured_value._handle_data = handle_data  # pylint: disable=protected-access
+      if ops._USE_C_SHAPES:  # pylint: disable=protected-access
+        if isinstance(value, ops.EagerTensor):
+          handle_data = value._handle_data  # pylint: disable=protected-access
+        else:
+          handle_data = resource_variable_ops.get_resource_handle_data(value)
+      else:
+        handle_data = value._handle_data  # pylint: disable=protected-access
       if handle_data is not None and handle_data.is_set:
+        # pylint: disable=protected-access
+        if ops._USE_C_SHAPES:
+          pywrap_tensorflow.SetResourceHandleShapeAndType(
+              captured_value.graph._c_graph, captured_value._as_tf_output(),
+              handle_data.SerializeToString())
+        else:
+          captured_value._handle_data = handle_data
+        # pylint: enable=protected-access
         # Ensure that shapes and dtypes are propagated.
         shapes, types = zip(*[(pair.shape, pair.dtype)
                               for pair in handle_data.shape_and_type])
         ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
         shapes = [[d.size for d in s.dim]
                   if not s.unknown_rank else None for s in shapes]
-        with errors.raise_exception_on_not_ok_status() as status:
-          pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
-              captured_value._op._graph._c_graph,  # pylint: disable=protected-access
-              captured_value._as_tf_output(),  # pylint: disable=protected-access
-              shapes,
-              ranks,
-              types,
-              status)
+        pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+            captured_value._op._graph._c_graph,  # pylint: disable=protected-access
+            captured_value._as_tf_output(),  # pylint: disable=protected-access
+            shapes, ranks, types)
 
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
@@ -95,43 +83,6 @@ def capture_value(tensor_map, value, dtype, name):
   return captured_value
 
 
-def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
-  """Captures a Tensor while building a graph mode function.
-
-  Arguments:
-    value: A Tensor object.
-    dtype: The datatype of the value produced by the node in the graph.
-    name:  str, Name of the node in the graph.
-    as_ref: Ignored (required by register_tensor_conversion_function).
-
-  Returns:
-    Returns a constant (the current value of the tensor) if capturing
-    is not enabled. A placeholder which will have the value of the
-    tensor at runtime otherwise.
-  """
-  del as_ref  # Unused.
-
-  if context.in_eager_mode():
-    return value
-
-  default_graph = ops.get_default_graph()
-  if not default_graph.building_function:
-    return value
-
-  tensor_map = _scoped_captures.tensors
-  if tensor_map is None:
-    # Capturing is not enabled.
-    if value.dtype == dtypes_module.resource:
-      return value
-    return constant_op.constant(value.numpy())
-  if type(value) == ops.Tensor and value.graph is default_graph:
-    # The tensor has already been converted and captured. The type check
-    # is intentional: we are checking that value is a Tensor and not an
-    # EagerTensor.
-    return value
-  return capture_value(tensor_map, value, dtype, name)
-
-
 class CapturingGraph(ops.Graph):
   """Graph used when constructing eager functions."""
 
@@ -151,6 +102,15 @@ class CapturingGraph(ops.Graph):
   def clear_resource_control_flow_state(self):
     self._last_op_using_resource_tensor = {}
 
+  def maybe_capture_tensor(self, tensor):
+    if isinstance(tensor, ops.EagerTensor):
+      return capture_value(
+          self.captures, tensor, tensor.dtype, str(ops.uid()))
+    if tensor.graph is not self:
+      return capture_value(
+          self.captures, tensor, tensor.dtype, tensor.op.name)
+    return tensor
+
   def create_op(
       self,
       op_type,
@@ -162,67 +122,86 @@ class CapturingGraph(ops.Graph):
       op_def=None,
       compute_shapes=True,
       compute_device=True):
-    # TODO(apassos) probably control flow has to be handled delicately here as
-    # in if a resource is accessed inside a control flow context we need the
-    # control dependency to point to something outside the context which is
-    # guaranteed to happen after the access.
-    #
     # TODO(apassos) this should do some form of alias analysis as ops which
     # forward the resources such as Identity and Switch can cause serialization
     # to fail.
-    resource_inputs = set()
-    control_inputs = set()
     for i, inp in enumerate(inputs):
-      if inp.graph is not self:
-        inputs[i] = capture_value(self.captures, inp, inp.dtype, inp.op.name)
-      inp = inputs[i]
-      if inp.dtype == dtypes_module.resource:
-        if inp.name in self._last_op_using_resource_tensor:
-          control_inputs.add(self._last_op_using_resource_tensor[inp.name])
-        resource_inputs.add(inp.name)
-    with self.control_dependencies(list(control_inputs)):
-      op = super(CapturingGraph, self).create_op(
-          op_type, inputs, dtypes, input_types, name, attrs, op_def,
-          compute_shapes, compute_device)
-    for name in resource_inputs:
-      self._last_op_using_resource_tensor[name] = op
-    return op
-
-
-# TODO(apassos): it'd be really nice if we could scope this registration.
-# Note that we register this at a higher priority than ops.Tensor since we want
-# to handle subclass specific conversion before a superclass conversion.
-ops.register_tensor_conversion_function(
-    ops.EagerTensor, _convert_to_graph_tensor, priority=-1)
-
-
-class _CapturingContext(object):
-  """Tracks references to Tensors outside this context while it is active."""
+      inputs[i] = self.maybe_capture_tensor(inp)
+    return super(CapturingGraph, self).create_op(
+        op_type, inputs, dtypes, input_types, name, attrs, op_def,
+        compute_shapes, compute_device)
 
-  def __init__(self):
-    # known_ops are ops which are created while this context is active
-    self.known_ops = set()
 
-    # captured_tensors are all tensors referenced to by ops in this context but
-    # not produced in it
-    self.captured_tensors = set()
+# pylint: disable=invalid-name
+class HelperContext(object):
+  """ControlFlowContext with a customizable AddOp method."""
+
+  def __init__(self, add_op_internal):
+    self._add_op_internal = add_op_internal
+    self._values = set()  # control flow code sometimes updates this.
+
+  def _AddOpInternal(self, op):
+    self._add_op_internal(op)
+
+  @property
+  def outer_context(self):
+    return self._outer_context
+
+  def GetWhileContext(self):
+    if self._outer_context:
+      return self._outer_context.GetWhileContext()
+
+  def IsWhileContext(self):
+    return False
+
+  def IsCondContext(self):
+    return False
+
+  def IsXLAContext(self):
+    return False
 
   def AddOp(self, op):  # pylint: disable=invalid-name
-    if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
-      raise ValueError("tfe.defun cannot capture variables created without "
-                       "using tf.get_variable. Op: %s" % op)
-    self.known_ops.add(op)
-    for i in op.inputs:
-      if i.op not in self.known_ops:
-        self.captured_tensors.add(i)
+    self._AddOpInternal(op)
+    if self._outer_context:
+      self._outer_context.AddOp(op)
+
+  def AddName(self, _):
+    pass
+
+  def AddInnerOp(self, op):
+    self._AddOpInternal(op)
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    if self._outer_context:
+      return self._outer_context.AddValue(val)
+    else:
+      return val
+
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
 
   def __enter__(self):
+    # pylint: disable=protected-access
     self._g = ops.get_default_graph()
-    self._old = self._g._get_control_flow_context()  # pylint: disable=protected-access
-    self._g._set_control_flow_context(self)  # pylint: disable=protected-access
+    self._outer_context = self._g._get_control_flow_context()
+    self._g._set_control_flow_context(self)
+    self._nested_contexts = (
+        self._outer_context._nested_contexts
+        if self._outer_context is not None else None)
+    # pylint: enable=protected-access
 
-  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
-    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+  def __exit__(self, *_):
+    self._g._set_control_flow_context(self._outer_context)  # pylint: disable=protected-access
+# pylint: enable=invalid-name
 
 
 def _forward_name(n):
@@ -257,34 +236,31 @@ class _EagerDefinedFunction(object):
       inputs: the tensors in the graph to be used as inputs to the function
       outputs: the tensors in the graph which will be outputs to the function
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
-          graph._c_graph,  # pylint: disable=protected-access
-          compat.as_str(name),
-          False,
-          [o._c_op for o in operations],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
-          [],
-          None,
-          compat.as_str(""),
-          status)
+    fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+        graph._c_graph,  # pylint: disable=protected-access
+        compat.as_str(name),
+        False,
+        [o._c_op for o in operations],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+        [],
+        None,
+        compat.as_str(""))
     # TODO(apassos) avoid creating a FunctionDef (specially to grab the
     # signature, but also in general it's nice not to depend on it.
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+      pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     function_def = function_pb2.FunctionDef()
     function_def.ParseFromString(compat.as_bytes(proto_data))
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       _register(fn)
     self.definition = function_def
     self.name = function_def.signature.name
     self.signature = function_def.signature
     self.grad_func_name = None
     self.python_grad_func = None
-    self._c_func = fn
+    self._c_func = c_api_util.ScopedTFFunction(fn)
     self._grad_func = None
 
 
@@ -368,21 +344,31 @@ class GraphModeFunction(object):
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
     with self._graph.as_default(), context.graph_mode():
-      c = _CapturingContext()
-      with c:
-        filtered_outputs = [x for x in self._returns if x is not None]
-        self._out_grad_placeholders = [
-            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
-        in_gradients = gradients_impl.gradients(
-            filtered_outputs,
-            self._input_placeholders,
-            grad_ys=self._out_grad_placeholders)
+      c_known_ops = set()
+      c_captured_tensors = set()
+
+      existing_op_len = len(self._graph.get_operations())
+      filtered_outputs = [x for x in self._returns if x is not None]
+      self._out_grad_placeholders = [
+          graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
+      in_gradients = gradients_impl.gradients(
+          filtered_outputs,
+          self._input_placeholders,
+          grad_ys=self._out_grad_placeholders)
+      for op in self._graph.get_operations()[existing_op_len:]:
+        if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
+          raise ValueError("tfe.defun cannot capture variables created without "
+                           "using tf.get_variable. Op: %s" % op)
+        c_known_ops.add(op)
+        for i in op.inputs:
+          if i.op not in c_known_ops:
+            c_captured_tensors.add(i)
 
     backward_outputs = tuple(
         grad for grad in _flatten(in_gradients) if grad is not None)
     output_shapes = tuple(grad.shape for grad in backward_outputs)
 
-    captures = list(sorted(c.captured_tensors, key=lambda x: x.name))
+    captures = list(sorted(c_captured_tensors, key=lambda x: x.name))
     forward_name = _forward_name(self._func_name)
     self._forward_fdef = _EagerDefinedFunction(
         forward_name, self._graph, self._ops, self._input_placeholders,
@@ -395,7 +381,7 @@ class GraphModeFunction(object):
     # means rerunning the function-defining code will always define the same
     # function, which is useful if we serialize this etc.
     function_def_ops = tuple(x
-                             for x in sorted(c.known_ops, key=lambda x: x.name)
+                             for x in sorted(c_known_ops, key=lambda x: x.name)
                              if x not in all_ignored_ops)
     bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
@@ -407,7 +393,14 @@ class GraphModeFunction(object):
     all_args = args + self._extra_inputs
     signature = self._forward_fdef.signature
     ctx = context.context()
-    if ctx.in_graph_mode():
+    if ctx.executing_eagerly():
+      outputs = execute.execute(
+          str(signature.name),
+          num_outputs=len(signature.output_arg),
+          inputs=all_args,
+          attrs=None,
+          ctx=ctx)
+    else:
       g = ops.get_default_graph()
       g._add_function(self._forward_fdef)  # pylint: disable=protected-access
       op = g.create_op(
@@ -422,13 +415,6 @@ class GraphModeFunction(object):
           outputs, (ops.Tensor, type(None))) else list(outputs)
       for i, s in enumerate(self._output_shapes):
         outputs[i].set_shape(s)
-    else:
-      outputs = execute.execute(
-          str(signature.name),
-          num_outputs=len(signature.output_arg),
-          inputs=all_args,
-          attrs=None,
-          ctx=ctx)
     real_outputs = outputs[:len(self._returns)]
     side_outputs = outputs[len(self._returns):]
 
@@ -499,7 +485,14 @@ class GraphModeFunction(object):
       return self._backprop_call(tensor_inputs)
 
     ctx = context.context()
-    if ctx.in_graph_mode():
+    if ctx.executing_eagerly():
+      result = execute.execute(
+          str(self._func_name),
+          num_outputs=self._num_outputs,
+          inputs=tensor_inputs + self._extra_inputs,
+          attrs=None,
+          ctx=ctx)
+    else:
       g = ops.get_default_graph()
       self.add_to_graph(g)
       signature = self._function_def.definition.signature
@@ -516,13 +509,6 @@ class GraphModeFunction(object):
         return op
       for i, s in enumerate(self._output_shapes):
         result[i].set_shape(s)
-    else:
-      result = execute.execute(
-          str(self._func_name),
-          num_outputs=self._num_outputs,
-          inputs=tensor_inputs + self._extra_inputs,
-          attrs=None,
-          ctx=ctx)
 
     return self._build_call_outputs(result)
 
@@ -590,29 +576,31 @@ def _defun_internal(name, func, args, kwds):
     for collection in curr_graph.collections:
       tmp_graph.get_collection_ref(collection)[:] = curr_graph.get_collection(
           collection)
-    with tmp_graph.as_default():
+    with tmp_graph.as_default(), AutomaticControlDependencies() as a:
       func_inputs = _get_defun_inputs(args)
 
       def convert(x):
         if x is None:
           return None
-        return ops.convert_to_tensor_or_indexed_slices(x)
-
-      with capture_tensors(captures):
-        this_tape = tape.push_new_tape()
-        try:
-          func_outputs = func(*func_inputs, **kwds)
-          func_outputs = nest.map_structure(convert, func_outputs)
-        finally:
-          tape.pop_tape(this_tape)
-        variables = this_tape.watched_variables()
-
-        # Returning a closed-over tensor as an output does not trigger a
-        # call to convert_to_tensor, so we manually capture all such tensors.
-        outputs_list = _flatten(func_outputs)
-        func_def_outputs = [
-            _convert_to_graph_tensor(x) for x in outputs_list if x is not None
-        ]
+        x = ops.convert_to_tensor_or_indexed_slices(x)
+        x = a.mark_as_return(x)
+        return x
+
+      this_tape = tape.push_new_tape()
+      try:
+        func_outputs = func(*func_inputs, **kwds)
+        func_outputs = nest.map_structure(convert, func_outputs)
+      finally:
+        tape.pop_tape(this_tape)
+      variables = this_tape.watched_variables()
+
+      # Returning a closed-over tensor as an output does not trigger a
+      # call to convert_to_tensor, so we manually capture all such tensors.
+      outputs_list = _flatten(func_outputs)
+      func_def_outputs = [
+          tmp_graph.maybe_capture_tensor(x) for x in outputs_list
+          if x is not None
+      ]
 
       ids = list(sorted(captures.keys()))
       if ids:
@@ -633,10 +621,10 @@ def _defun_internal(name, func, args, kwds):
                      if x not in all_ignored_ops)
   # Register any other functions defined in the graph
   # TODO(ashankar): Oh lord, forgive me for this lint travesty.
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     for f in tmp_graph._functions.values():  # pylint: disable=protected-access
       # TODO(ashankar): What about the gradient registry?
-      _register(f._c_func)  # pylint: disable=protected-access
+      _register(f._c_func.func)  # pylint: disable=protected-access
   return GraphModeFunction(
       fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
       func_outputs, output_shapes, variables)
@@ -841,10 +829,39 @@ class AutomaticControlDependencies(object):
     self._returned_tensors = set()
 
   def mark_as_return(self, tensor):
+    """Acts like identity but marks the `Tensor` as a return value.
+
+    This will possibly return a copy of the `Tensor`. Usage:
+
+    ```
+      with AutomaticControlDependencies() as a:
+       ...
+       t = a.mark_as_return(t)
+      _ = ...(t...)  # i.e. it's safe to use t here
+    ```
+
+    Args:
+      tensor: the `Tensor` to be marked
+
+    Returns:
+      a copy of the `Tensor`.
+    """
+    if isinstance(tensor, ops.IndexedSlices):
+      values = array_ops.identity(tensor.values)
+      indices = array_ops.identity(tensor.indices)
+      self._returned_tensors.add(indices)
+      self._returned_tensors.add(values)
+      return ops.IndexedSlices(values, indices, dense_shape=tensor.dense_shape)
+    # We want to make the return values depend on the stateful operations, but
+    # we don't want to introduce a cycle, so we make the return value the result
+    # of a new identity operation that the stateful operations definitely don't
+    # depend on.
+    tensor = array_ops.identity(tensor)
     self._returned_tensors.add(tensor)
+    return tensor
 
   def __enter__(self):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       return self
     # This code assumes no other thread is adding ops to the graph while
     # we're adding ops to the graph.
@@ -915,7 +932,7 @@ class AutomaticControlDependencies(object):
       merge_for_resource[o] = new_merge[0].op
 
   def __exit__(self, unused_type, unused_value, unused_traceback):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       return
 
     if self._graph is not ops.get_default_graph():
@@ -962,7 +979,8 @@ class AutomaticControlDependencies(object):
     for op in new_operations:
       control_inputs = set()
       # Ensure stateful ops run
-      if self._graph._registered_ops[op.type].is_stateful:  # pylint: disable=protected-access
+      if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
+          or self._graph._registered_ops[op.type].is_stateful):  # pylint: disable=protected-access
         ops_which_must_run.add(op)
       # Ignore switches (they're handled separately)
       if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
@@ -998,9 +1016,10 @@ class AutomaticControlDependencies(object):
 
     # Ensure all ops which must run do run
     for r in self._returned_tensors:
-      r.op._add_control_inputs(  # pylint: disable=protected-access
-          [o for o in ops_which_must_run
-           if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
+      if ops_which_must_run:
+        r.op._add_control_inputs(  # pylint: disable=protected-access
+            [o for o in ops_which_must_run
+             if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
 
 
 def automatic_control_dependencies(f):
@@ -1020,8 +1039,7 @@ def automatic_control_dependencies(f):
   def wrapper(*args, **kwds):
     with AutomaticControlDependencies() as a:
       result = f(*args, **kwds)
-      for t in nest.flatten(result):
-        a.mark_as_return(t)
-      return result
+      result_flat = [a.mark_as_return(t) for t in nest.flatten(result)]
+      return nest.pack_sequence_as(result, result_flat)
 
   return tf_decorator.make_decorator(f, wrapper)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 431d9388c0ee97eda197142ec97b9448d985b04b..185f6d981cb36a277c9e63f1195501c1f86409d0 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -26,19 +26,23 @@ from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.training import gradient_descent
 
 
+@test_util.with_c_shapes
 class FunctionTest(test.TestCase):
 
   def testBasic(self):
@@ -104,6 +108,7 @@ class FunctionTest(test.TestCase):
     matmul = function.defun(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
+
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
@@ -304,7 +309,7 @@ class FunctionTest(test.TestCase):
     def g(x):
       return backprop.gradients_function(f, [0])(x)[0]
 
-    self.assertAllEqual(2, g(constant_op.constant(2)))
+    self.assertAllEqual(2, g(constant_op.constant(2.)))
 
   def testGraphModeEagerGradError(self):
     with context.graph_mode():
@@ -312,6 +317,7 @@ class FunctionTest(test.TestCase):
         x = variable_scope.get_variable(
             'v', initializer=constant_op.constant(1.0))
         return x * constant_op.constant(2.0)
+
       with self.assertRaisesRegexp(ValueError,
                                    'No trainable variables were accessed'):
         backprop.implicit_val_and_grad(f)()
@@ -376,23 +382,23 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
 
   def testGradientOfGatherWithDefun(self):
+    with ops.device('cpu:0'):
+      v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
 
-    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
-
-    def sum_gather():
-      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+      def sum_gather():
+        return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
 
-    grad_fn = backprop.implicit_grad(sum_gather)
-    gradient = grad_fn()
-    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
-    defun_gradient = defun_grad_fn()
-    self.assertEqual(len(gradient), len(defun_gradient))
+      grad_fn = backprop.implicit_grad(sum_gather)
+      gradient = grad_fn()
+      defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
+      defun_gradient = defun_grad_fn()
+      self.assertEqual(len(gradient), len(defun_gradient))
 
-    gradient = gradient[0][0]
-    defun_gradient = defun_gradient[0][0]
-    self.assertAllEqual(gradient.values, defun_gradient.values)
-    self.assertAllEqual(gradient.indices, defun_gradient.indices)
-    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+      gradient = gradient[0][0]
+      defun_gradient = defun_gradient[0][0]
+      self.assertAllEqual(gradient.values, defun_gradient.values)
+      self.assertAllEqual(gradient.indices, defun_gradient.indices)
+      self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
 
   def testReturningIndexedSlicesWithDefun(self):
 
@@ -475,9 +481,7 @@ class FunctionTest(test.TestCase):
     reshape = function.defun(array_ops.reshape)
     value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
-    with self.assertRaises(errors.InvalidArgumentError):
-      with ops.device('gpu:0'):
-        reshape(value, shape)
+    reshape(value, shape)  # No error is raised
 
   def testDifferentiableFunctionNoneOutputs(self):
 
@@ -583,6 +587,7 @@ class FunctionTest(test.TestCase):
       with ops.name_scope('foo'):
         v = resource_variable_ops.ResourceVariable(0.0, name='bar')
       self.assertEqual(v.name, 'foo/bar:0')
+
     create_variable()
 
   def testVariableNamesRespectNameScopesWithDefunInGraph(self):
@@ -592,10 +597,27 @@ class FunctionTest(test.TestCase):
         with ops.name_scope('foo'):
           v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
         self.assertEqual(v.name, 'foo/bar:0')
+
       with ops.get_default_graph().as_default():
         create_variable()
 
+  def testLayerInDefun(self):
+    conv = convolutional.Conv2D(
+        filters=1,
+        kernel_size=2,
+        kernel_initializer=init_ops.ones_initializer(),
+        bias_initializer=init_ops.zeros_initializer())
 
+    @function.defun
+    def model(x):
+      return conv(x)
+
+    x = array_ops.ones([1, 2, 2, 1])
+    y = model(x)
+    self.assertAllEqual([[[[4.0]]]], y.numpy())
+
+
+@test_util.with_c_shapes
 class AutomaticControlDependenciesTest(test.TestCase):
 
   def testBasic(self):
@@ -606,7 +628,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
         v.assign(v + 1)
         v.assign(2 * v)
         val = v.read_value()
-        c.mark_as_return(val)
+        val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(), 4.0)
 
   def testCondMustRun(self):
@@ -626,7 +648,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
         control_flow_ops.cond(p, true_fn, false_fn)
         val = v.read_value()
-        c.mark_as_return(val)
+        val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 6.0)
 
@@ -647,7 +669,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
         control_flow_ops.cond(p, true_fn, false_fn)
         one = constant_op.constant(1.0)
-        c.mark_as_return(one)
+        one = c.mark_as_return(one)
       one.eval(feed_dict={p: False})
       self.assertAllEqual(v.read_value().eval(), 5.0)
       one.eval(feed_dict={p: True})
@@ -681,7 +703,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
         control_flow_ops.cond(p, true_fn, false_fn)
         with ops.name_scope('final'):
           val = v.read_value()
-        c.mark_as_return(val)
+        val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(feed_dict={p: False, q: False}), 3.0)
       self.assertAllEqual(val.eval(feed_dict={p: False, q: True}), 6.0)
       self.assertAllEqual(val.eval(feed_dict={p: True, q: True}), 7.0)
@@ -703,7 +725,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
         control_flow_ops.cond(p, true_fn, false_fn)
         val = v.read_value()
-        c.mark_as_return(val)
+        val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 5.0)
 
@@ -724,7 +746,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
         control_flow_ops.cond(p, true_fn, false_fn)
         val = v.read_value()
-        c.mark_as_return(val)
+        val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(feed_dict={p: False}), 6.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 12.0)
 
@@ -745,7 +767,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
         control_flow_ops.cond(p, true_fn, false_fn)
         v.assign(v * 2)
         val = v.read_value()
-        c.mark_as_return(val)
+        val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(feed_dict={p: False}), 10.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 20.0)
 
@@ -762,6 +784,37 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
       self.assertAllEqual(f().eval(), 4.0)
 
+  def testOptimizerInDefun(self):
+    def loss(v):
+      return v**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
+
+    @function.defun
+    def train():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      grad = backprop.implicit_grad(loss)(v)
+      optimizer.apply_gradients(grad)
+      return v.read_value()
+
+    value = train()
+    self.assertEqual(value.numpy(), -1.0)
+
+  def testOptimizerInDefunWithCapturedVariable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+    def loss():
+      return v**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
+
+    @function.defun
+    def train():
+      grad = backprop.implicit_grad(loss)()
+      optimizer.apply_gradients(grad)
+
+    train()
+    self.assertEqual(v.numpy(), -1.0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index 62106bf0e2809e3c056e4a357f3d05251b7dca68..d9ffcbd2036b9e312967012597ceea22e607d2a7 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -278,10 +278,13 @@ def _graph_callable_internal(func, shape_and_dtypes):
       # variables. As a side-effect this will populate the variable capturing
       # scope's view of which variables exist.
       variable_captures = _VariableCapturingScope()
-      with variable_captures.initializing_scope(), function.capture_tensors(
-          captures):
+      with variable_captures.initializing_scope(
+          ), function.AutomaticControlDependencies() as a:
         func_outputs = func(*func_inputs)
-      outputs_list = nest.flatten(func_outputs)
+        outputs_list = nest.flatten(func_outputs)
+        for i, x in enumerate(outputs_list):
+          if x is not None:
+            outputs_list[i] = a.mark_as_return(x)
       if len(outputs_list) == 1 and outputs_list[0] is None:
         outputs_list = []
       output_shapes = [x.shape for x in outputs_list]
@@ -293,10 +296,13 @@ def _graph_callable_internal(func, shape_and_dtypes):
       # placeholders. This assumes the variable capturing scope created above
       # knows about all variables.
       tmp_graph.clear_resource_control_flow_state()
-      with variable_captures.capturing_scope(), function.capture_tensors(
-          captures):
+      with variable_captures.capturing_scope(
+          ), function.AutomaticControlDependencies() as a:
         captured_outputs = func(*func_inputs)
       captured_outlist = nest.flatten(captured_outputs)
+      for i, x in enumerate(captured_outlist):
+        if x is not None:
+          captured_outlist[i] = a.mark_as_return(x)
       capturing_operations = tmp_graph.get_operations()[
           len(initializing_operations):]
 
@@ -319,7 +325,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register(f._c_func)  # pylint: disable=protected-access
+    function._register(f._c_func.func)  # pylint: disable=protected-access
   initializer_function = function.GraphModeFunction(
       initialization_name,
       placeholder_inputs,
@@ -400,7 +406,7 @@ def graph_callable(shape_and_dtypes):
     A callable graph object.
   """
   # TODO(alive,apassos): support initialized_value and friends from tf.Variable.
-  assert context.in_eager_mode(), (
+  assert context.executing_eagerly(), (
       "graph_callable can only be used when Eager execution is enabled.")
   def decorator(func):
     return tf_decorator.make_decorator(func,
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 837cad974ac6555ef2b13d1a1a5e0e5f5166b01d..000152855d1a90f32936cca40c10f00c2df863a5 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import errors
 
 
 VSpace = collections.namedtuple(
@@ -60,6 +59,5 @@ def imperative_grad(
      or if only non-differentiable functions of the source were used in the
      computation of target.
   """
-  with errors.raise_exception_on_not_ok_status() as status:
-    return pywrap_tensorflow.TFE_Py_TapeGradient(
-        tape._tape, vspace, target, sources, output_gradients, status)  # pylint: disable=protected-access
+  return pywrap_tensorflow.TFE_Py_TapeGradient(
+      tape._tape, vspace, target, sources, output_gradients)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index f2e70341d975fb06bce7f2ce6cba7d8c3bc9826c..fc76ede4c502ae8b554c925a921e419bf003c40c 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
 import numpy as np
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import test
@@ -130,8 +132,12 @@ class OpsTest(test_util.TensorFlowTestCase):
                                    dtype=dtypes.int64)
     values = constant_op.constant([2, 3, 5, 7, 11])
     shape = constant_op.constant([2, 7], dtype=dtypes.int64)
-    result = sparse_ops.gen_sparse_ops._sparse_split(  # pylint: disable=protected-access
-        split_dim, indices, values, shape, num_split=2)
+    result = sparse_ops.gen_sparse_ops.sparse_split(
+        split_dim,
+        indices,
+        values,
+        shape,
+        num_split=2)
     output_indices, output_values, output_shape = result
     self.assertEqual(2, len(output_indices))
     self.assertEqual(2, len(output_values))
@@ -277,6 +283,25 @@ class OpsTest(test_util.TensorFlowTestCase):
       context._context = context.Context()
     # pylint: enable=protected-access
 
+  def testSoftPlacement(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    # Temporarily replace the context
+    # pylint: disable=protected-access
+    del context._context
+    try:
+      context._context = context.Context(
+          device_policy=context.DEVICE_PLACEMENT_SILENT,
+          config=config_pb2.ConfigProto(allow_soft_placement=True))
+      cpu_tensor = constant_op.constant(1.0)
+      result = cpu_tensor + cpu_tensor
+      self.assertEqual(result.device,
+                       '/job:localhost/replica:0/task:0/device:GPU:0')
+    finally:
+      del context._context
+      context._context = context.Context()
+    # pylint: enable=protected-access
+
   def testRandomUniform(self):
     scalar_shape = constant_op.constant([], dtype=dtypes.int32)
 
@@ -352,6 +377,22 @@ class OpsTest(test_util.TensorFlowTestCase):
   def testNoOpIsNone(self):
     self.assertTrue(control_flow_ops.no_op() is None)
 
+  def testEagerContextPreservedAcrossThreads(self):
+    def init_fn():
+      self.assertTrue(context.executing_eagerly())
+      with ops.init_scope():
+        self.assertTrue(context.executing_eagerly())
+        context_switches = context.context().context_switches
+        self.assertEqual(len(context_switches.stack), 1)
+        self.assertFalse(context_switches.stack[0].is_building_function)
+        self.assertEqual(context_switches.stack[0].enter_context_fn,
+                         context.eager_mode)
+
+    self.assertTrue(context.executing_eagerly())
+    t1 = threading.Thread(target=init_fn)
+    t1.start()
+    t1.join()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index e6d03297e0b85856ff165af310149c79e494ab36..9afab0077b666b36d77abea5a7d8c444b6400812 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -117,7 +117,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
                    const string& function_name)
       : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
     op_name_ = function_name_;
-    op_name_.Consume("_");
+    str_util::ConsumePrefix(&op_name_, "_");
   }
   ~GenEagerPythonOp() override {}
 
@@ -366,8 +366,8 @@ string GenEagerPythonOp::Code() {
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   // Handle graph-mode case
   strings::StrAppend(&result_,
-                     "  _ctx = _context.context()\n"
-                     "  if _ctx.in_graph_mode():\n",
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
                      function_setup,
                      "    _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
@@ -492,7 +492,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
                          " = ", default_value, "\n");
     }
-    if (attr_type.starts_with("list(")) {
+    if (str_util::StartsWith(attr_type, "list(")) {
       ExpectListArg(indentation, attr_api_name, function_setup);
     }
 
@@ -683,13 +683,14 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
     return true;
   }
 
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix), parameters);
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
   strings::StrAppend(
       &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
   strings::StrAppend(&result_, "  This is for function ", function_name_,
                      "\n  \"\"\"\n");
 
-  strings::StrAppend(&result_, "  _ctx = _context.context()\n");
+  strings::StrAppend(&result_, "  _ctx = ctx if ctx else _context.context()\n");
 
   string function_setup;
   if (!GetEagerFunctionSetup("  ", &function_setup)) {
@@ -713,8 +714,8 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
 
 void GenEagerPythonOp::AddEagerFastPathExecute() {
   string fastpath_execute_params = strings::StrCat(
-      "_ctx._handle, _ctx.device_name, \"", op_def_.name(), "\", ",
-      "_execute.record_gradient, name, _ctx._post_execution_callbacks");
+      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
+      op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
   string fallback_params;
 
   for (int i = 0; i < api_def_.in_arg_size(); i++) {
@@ -755,6 +756,8 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   strings::StrAppend(&result_, "      ", "return _result\n");
 
   // Handle fallback.
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "ctx=_ctx");
   strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
   strings::StrAppend(
       &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
@@ -955,10 +958,10 @@ from tensorflow.python.util.tf_export import tf_export
     if (api_def->visibility() == ApiDef::SKIP) {
       continue;
     }
-
     // An op is hidden if either its ApiDef visibility is HIDDEN
     // or it is in the hidden_ops list.
     bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
+    bool hidden_by_api_def = is_hidden;
     if (!is_hidden) {
       for (const string& hidden : hidden_ops) {
         if (op_def.name() == hidden) {
@@ -971,13 +974,22 @@ from tensorflow.python.util.tf_export import tf_export
     string function_name;
     python_op_gen_internal::GenerateLowerCaseOpName(op_def.name(),
                                                     &function_name);
-    if (is_hidden) function_name = strings::StrCat("_", function_name);
-
-    // When users create custom python wrappers, they may link in the
-    // default op registry by accident, and because they can't
-    // enumerate all 'hidden' symbols, this guard is to prevent
-    // instantiating a python reserved word in their wrapper.
-    if (python_op_gen_internal::IsPythonReserved(function_name)) {
+    bool is_reserved = python_op_gen_internal::IsPythonReserved(function_name);
+
+    // Prefix an op with underscore if the op is listed in hidden_ops or
+    // name is reserved or it is of the exceptions in IsOpWithUnderscorePrefix.
+    // Do not add underscores to ops set to HIDDEN in ApiDef otherwise.
+    // TODO(annarev): don't prefix with underscores even if op is in hidden_ops.
+    if (is_hidden) {
+      if (!hidden_by_api_def || is_reserved ||
+          python_op_gen_internal::IsOpWithUnderscorePrefix(function_name)) {
+        function_name = strings::StrCat("_", function_name);
+      }
+    } else if (is_reserved) {
+      // When users create custom python wrappers, they may link in the
+      // default op registry by accident, and because they can't
+      // enumerate all 'hidden' symbols, this guard is to prevent
+      // instantiating a python reserved word in their wrapper.
       continue;
     }
 
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 6fa076507d11ab9c88891cbeb0a4fb3959e4e99d..b5b4e394e33bd3adcf8e90fa4c35f87fbbb5a155 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -60,42 +60,6 @@ TFE_TensorHandle* NumpyToTensorHandle(PyObject* obj) {
   }
 }
 
-// Casts data referred to by `handle` from type `src_type_enum` to type
-// `dst_type_enum`.
-TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
-                            TF_DataType src_type_enum,
-                            TF_DataType dst_type_enum, TF_Status* out_status) {
-  if (ctx == nullptr) return nullptr;
-  const char* op_name = "Cast";
-  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
-  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
-#define RETURN_ERROR  \
-  {                   \
-    TFE_DeleteOp(op); \
-    return nullptr;   \
-  }
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetDevice(op, device_name, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpAddInput(op, handle, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
-  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
-  TFE_TensorHandle* output = nullptr;
-  int num_outputs = 1;
-  TFE_Execute(op, &output, &num_outputs, out_status);
-  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
-      output == nullptr) {
-    if (output != nullptr) {
-      TFE_DeleteTensorHandle(output);
-    }
-    RETURN_ERROR
-  }
-  TFE_DeleteOp(op);
-  return output;
-#undef RETURN_ERROR
-}
-
 TFE_TensorHandle* CopyToDevice(TFE_TensorHandle* handle, PyObject* ctx,
                                PyObject* dev) {
   const char* device = "";
@@ -161,9 +125,103 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 }  // namespace
 
+namespace tensorflow {
+// Casts data referred to by `handle` from type `src_type_enum` to type
+// `dst_type_enum`.
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status) {
+  if (ctx == nullptr) return nullptr;
+  const char* op_name = "Cast";
+  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
+#define RETURN_ERROR  \
+  {                   \
+    TFE_DeleteOp(op); \
+    return nullptr;   \
+  }
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetDevice(op, device_name, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpAddInput(op, handle, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
+  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
+  TFE_TensorHandle* output = nullptr;
+  int num_outputs = 1;
+  TFE_Execute(op, &output, &num_outputs, out_status);
+  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
+      output == nullptr) {
+    if (output != nullptr) {
+      TFE_DeleteTensorHandle(output);
+    }
+    RETURN_ERROR
+  }
+  TFE_DeleteOp(op);
+  return output;
+#undef RETURN_ERROR
+}
+
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
+  int desired_dtype = -1;
+  if (dtype != Py_None) {
+    if (!PyIntToDataType(dtype, &desired_dtype)) {
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expecting a DataType value for dtype. Got ",
+                          Py_TYPE(dtype)->tp_name)
+                          .c_str());
+      return nullptr;
+    }
+  }
+  if (PyArray_Check(value)) {
+    int desired_np_dtype = -1;
+    if (desired_dtype >= 0) {
+      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
+               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
+               .ok()) {
+        PyErr_SetString(PyExc_TypeError,
+                        tensorflow::strings::StrCat(
+                            "Invalid dtype argument value ", desired_dtype)
+                            .c_str());
+        return nullptr;
+      }
+    }
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+    int current_np_dtype = PyArray_TYPE(array);
+    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
+    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
+        !PyArray_ISCARRAY(array)) {
+      int new_dtype =
+          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
+      safe_value = tensorflow::make_safe(
+          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
+                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
+      if (PyErr_Occurred()) return nullptr;
+      if (safe_value == nullptr) {
+        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
+        return nullptr;
+      }
+      value = safe_value.get();
+    }
+    return NumpyToTensorHandle(value);
+  } else {
+    tensorflow::Tensor t;
+    // TODO(josh11b): Have PySeqToTensor set python errors instead of
+    // returning Status.
+    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
+    if (!cppstatus.ok()) {
+      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
+      return nullptr;
+    }
+    return TFE_NewTensorHandle(t);
+  }
+}
+}  // namespace tensorflow
+
 extern "C" {
 
-static const int kMaxEagerTensorParentSize = 32;
+static const int kMaxEagerTensorParentSize = 64;
 
 // TODO(agarwal): store context handle in EagerTensor.
 typedef struct EagerTensor {
@@ -185,6 +243,16 @@ typedef struct EagerTensor {
 
   // This stores `_keras_mask` object and is set by Tensorflow layers.
   PyObject* keras_mask;
+
+  // This stores `_tensor_shape`, a cached `TensorShape` object, and is set the
+  // first time that `_EagerTensorBase`'s `shape` property is called.
+  PyObject* tensor_shape;
+
+  // We store a status object here as an optimization to avoid allocating a new
+  // Status objects on different functions that operate on EagerTensor and need
+  // to use a TF_Status object. However note that accesses to `status` are not
+  // thread-safe.
+  TF_Status* status;
 } EagerTensor;
 
 // tp_init for EagerTensor.
@@ -195,6 +263,9 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
   self->handle_data = Py_None;
   Py_INCREF(Py_None);
   self->keras_mask = Py_None;
+  Py_INCREF(Py_None);
+  self->tensor_shape = Py_None;
+  self->status = TF_NewStatus();
   PyObject* value;
   PyObject* context = nullptr;
   PyObject* device = nullptr;
@@ -217,69 +288,24 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       return -1;
     }
   }
-  tensorflow::Safe_TFE_TensorHandlePtr handle =
-      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(nullptr));
   PyErr_Clear();
-  if (PyArray_Check(value)) {
-    int desired_np_dtype = -1;
-    if (desired_dtype >= 0) {
-      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
-               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
-               .ok()) {
-        PyErr_SetString(PyExc_TypeError,
-                        tensorflow::strings::StrCat(
-                            "Invalid dtype argument value ", desired_dtype)
-                            .c_str());
-        return -1;
-      }
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
-    int current_np_dtype = PyArray_TYPE(array);
-    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
-    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
-        !PyArray_ISCARRAY(array)) {
-      int new_dtype =
-          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
-      safe_value = tensorflow::make_safe(
-          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
-                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
-      if (PyErr_Occurred()) return -1;
-      if (safe_value == nullptr) {
-        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
-        return -1;
-      }
-      value = safe_value.get();
-    }
-    handle = tensorflow::make_safe(NumpyToTensorHandle(value));
-  } else {
-    tensorflow::Tensor t;
-    // TODO(josh11b): Have PySeqToTensor set python errors instead of
-    // returning Status.
-    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
-    if (!cppstatus.ok()) {
-      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
-      return -1;
-    }
-    handle = tensorflow::make_safe(TFE_NewTensorHandle(t));
-  }
-  if (PyErr_Occurred()) return -1;
-  if (handle == nullptr) {
-    PyErr_SetString(PyExc_ValueError, "Error while creating an EagerTensor");
-    return -1;
-  }
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(value, dtype)));
+  if (handle == nullptr) return -1;
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
-    auto out_status = tensorflow::make_safe(TF_NewStatus());
-    handle = tensorflow::make_safe(
-        EagerCast(GetContext(context), handle.get(), handle_dtype,
-                  static_cast<TF_DataType>(desired_dtype), out_status.get()));
-    if (TF_GetCode(out_status.get()) != TF_OK) {
-      PyErr_SetString(
-          PyExc_ValueError,
-          tensorflow::strings::StrCat("Error while casting from DataType ",
-                                      handle_dtype, " to ", desired_dtype, ". ",
-                                      TF_Message(out_status.get()))
-              .c_str());
+    handle = tensorflow::make_safe(tensorflow::EagerCast(
+        GetContext(context), handle.get(), handle_dtype,
+        static_cast<TF_DataType>(desired_dtype), self->status));
+    if (TF_GetCode(self->status) != TF_OK) {
+      PyErr_SetString(PyExc_ValueError,
+                      tensorflow::strings::StrCat(
+                          "Error while casting from DataType ", handle_dtype,
+                          " to ", desired_dtype, ". ", TF_Message(self->status))
+                          .c_str());
+      // Cleanup self->status before returning.
+      TF_SetStatus(self->status, TF_OK, "");
       return -1;
     }
     handle_dtype = TFE_TensorHandleDataType(handle.get());
@@ -323,10 +349,14 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
 
 // tp_dealloc for EagerTensor.
 void EagerTensor_dealloc(EagerTensor* self) {
+  TF_DeleteStatus(self->status);
   Py_DECREF(self->handle_data);
   Py_DECREF(self->keras_mask);
-  TFE_DeleteTensorHandle(self->handle);
-  self->handle = nullptr;
+  Py_DECREF(self->tensor_shape);
+  if (self->handle != nullptr) {
+    TFE_DeleteTensorHandle(self->handle);
+    self->handle = nullptr;
+  }
   // We have the global interpreter lock, so use this chance to perform delayed
   // refcount decrements.
   tensorflow::ClearDecrefCache();
@@ -348,12 +378,21 @@ static PyObject* EagerTensor_datatype_enum(EagerTensor* self) {
 // Getter for `_shape_tuple`.
 static PyObject* EagerTensor_shape_tuple(EagerTensor* self) {
   auto handle = self->handle;
-  int n = TFE_TensorHandleNumDims(handle);
+  int n = TFE_TensorHandleNumDims(handle, self->status);
+  if (MaybeRaiseExceptionFromTFStatus(self->status, PyExc_ValueError)) {
+    // Cleanup self->status before returning.
+    TF_SetStatus(self->status, TF_OK, "");
+    return nullptr;
+  }
   PyObject* shape = PyTuple_New(n);
   if (PyErr_Occurred()) return nullptr;
   for (int i = 0; i < n; ++i) {
-    PyObject* dim = PyLong_FromLongLong(TFE_TensorHandleDim(handle, i));
-    if (dim == nullptr || PyTuple_SetItem(shape, i, dim) != 0) {
+    PyObject* dim =
+        PyLong_FromLongLong(TFE_TensorHandleDim(handle, i, self->status));
+    if (MaybeRaiseExceptionFromTFStatus(self->status, PyExc_ValueError) ||
+        dim == nullptr || PyTuple_SetItem(shape, i, dim) != 0) {
+      // Cleanup self->status before returning.
+      TF_SetStatus(self->status, TF_OK, "");
       Py_DECREF(shape);
       if (dim != nullptr) Py_DECREF(dim);
       PyErr_SetString(PyExc_RuntimeError, "Error while creating shape");
@@ -365,10 +404,16 @@ static PyObject* EagerTensor_shape_tuple(EagerTensor* self) {
 
 // Getter for `_rank`.
 static PyObject* EagerTensor_rank(EagerTensor* self) {
+  int num_dims = TFE_TensorHandleNumDims(self->handle, self->status);
+  if (MaybeRaiseExceptionFromTFStatus(self->status, PyExc_ValueError)) {
+    // Cleanup self->status before returning.
+    TF_SetStatus(self->status, TF_OK, "");
+    return nullptr;
+  }
 #if PY_MAJOR_VERSION < 3
-  return PyInt_FromLong(TFE_TensorHandleNumDims(self->handle));
+  return PyInt_FromLong(num_dims);
 #else
-  return PyLong_FromLong(TFE_TensorHandleNumDims(self->handle));
+  return PyLong_FromLong(num_dims);
 #endif
 }
 
@@ -397,6 +442,19 @@ static int EagerTensor_setkeras_mask(EagerTensor* self, PyObject* value,
   self->keras_mask = value;
   return 0;
 }
+
+static PyObject* EagerTensor_tensor_shape(EagerTensor* self, void* unused) {
+  Py_INCREF(self->tensor_shape);
+  return self->tensor_shape;
+}
+
+static int EagerTensor_settensor_shape(EagerTensor* self, PyObject* value,
+                                       void* unused) {
+  Py_DECREF(self->tensor_shape);
+  Py_INCREF(value);
+  self->tensor_shape = value;
+  return 0;
+}
 // Function `_copy_to_device`.
 static PyObject* EagerTensor_copy_to_device(EagerTensor* self, PyObject* args,
                                             PyObject* kwds) {
@@ -437,10 +495,16 @@ static PyObject* EagerTensor_numpy(EagerTensor* self) {
 
 // Getter `device`.
 static PyObject* EagerTensor_device(EagerTensor* self) {
+  const char* device = TFE_TensorHandleDeviceName(self->handle, self->status);
+  if (MaybeRaiseExceptionFromTFStatus(self->status, PyExc_ValueError)) {
+    // Cleanup self->status before returning.
+    TF_SetStatus(self->status, TF_OK, "");
+    return nullptr;
+  }
 #if PY_MAJOR_VERSION >= 3
-  return PyUnicode_FromString(TFE_TensorHandleDeviceName(self->handle));
+  return PyUnicode_FromString(device);
 #else
-  return PyBytes_FromString(TFE_TensorHandleDeviceName(self->handle));
+  return PyBytes_FromString(device);
 #endif
 }
 
@@ -455,6 +519,9 @@ static PyGetSetDef EagerTensor_getseters[] = {
     {const_cast<char*>("_keras_mask"), (getter)EagerTensor_keras_mask,
      (setter)EagerTensor_setkeras_mask, const_cast<char*>("_keras_mask"),
      nullptr},
+    {const_cast<char*>("_tensor_shape"), (getter)EagerTensor_tensor_shape,
+     (setter)EagerTensor_settensor_shape, const_cast<char*>("_tensor_shape"),
+     nullptr},
     {nullptr} /* Sentinel */
 };
 
@@ -491,16 +558,11 @@ PyTypeObject* EagerTensorType = nullptr;
 
 #if PY_MAJOR_VERSION >= 3
 static PyType_Slot EagerTensor_Type_slots[] = {
-    Py_tp_dealloc,
-    reinterpret_cast<void*>(EagerTensor_dealloc),
-    Py_tp_methods,
-    reinterpret_cast<void*>(EagerTensor_methods),
-    Py_tp_getset,
-    reinterpret_cast<void*>(EagerTensor_getseters),
-    Py_tp_init,
-    reinterpret_cast<void*>(EagerTensor_init),
-    0,
-    nullptr,
+    {Py_tp_dealloc, reinterpret_cast<void*>(EagerTensor_dealloc)},
+    {Py_tp_methods, reinterpret_cast<void*>(EagerTensor_methods)},
+    {Py_tp_getset, reinterpret_cast<void*>(EagerTensor_getseters)},
+    {Py_tp_init, reinterpret_cast<void*>(EagerTensor_init)},
+    {0, nullptr},
 };
 
 PyType_Spec EagerTensor_Type_spec = {"EagerTensor", sizeof(EagerTensor), 0,
@@ -575,7 +637,10 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) {
     t->handle_data = Py_None;
     Py_INCREF(Py_None);
     t->keras_mask = Py_None;
+    Py_INCREF(Py_None);
+    t->tensor_shape = Py_None;
     t->handle = handle;
+    t->status = TF_NewStatus();
   }
   return reinterpret_cast<PyObject*>(t);
 }
@@ -649,12 +714,12 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
-  if (!PyList_Check(tensor_list)) {
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
+  if (!PyList_Check(tensors) && !PyTuple_Check(tensors)) {
     PyErr_SetString(PyExc_TypeError,
                     tensorflow::strings::StrCat(
-                        "tensor_list argument must be a list. Got \"",
-                        Py_TYPE(tensor_list)->tp_name, "\"")
+                        "tensors argument must be a list or a tuple. Got \"",
+                        Py_TYPE(tensors)->tp_name, "\"")
                         .c_str());
     return nullptr;
   }
@@ -668,13 +733,14 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
     return nullptr;
   }
 
-  Py_ssize_t num_tensors = PyList_Size(tensor_list);
+  Py_ssize_t num_tensors = PySequence_Fast_GET_SIZE(tensors);
   int64_t num_tensors_int = static_cast<int64_t>(num_tensors);
   auto tensor = tensorflow::make_safe(TF_AllocateTensor(
       TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int));
   int32_t* data = reinterpret_cast<int32_t*>(TF_TensorData(tensor.get()));
+  auto status = tensorflow::make_safe(TF_NewStatus());
   for (Py_ssize_t i = 0; i < num_tensors; ++i) {
-    PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i);
+    PyObject* tensor_obj = PySequence_Fast_GET_ITEM(tensors, i);
     if (!EagerTensor_CheckExact(tensor_obj)) {
       PyErr_SetString(PyExc_TypeError,
                       tensorflow::strings::StrCat(
@@ -687,21 +753,27 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
 
     EagerTensor* t = reinterpret_cast<EagerTensor*>(tensor_obj);
     TFE_TensorHandle* handle = t->handle;
-    if (slice_dim >= TFE_TensorHandleNumDims(handle)) {
-      PyErr_SetString(PyExc_IndexError,
-                      tensorflow::strings::StrCat(
-                          "Slice dimension (", slice_dim,
-                          ") must be smaller than rank of all "
-                          "tensors, but tensor at index ",
-                          i, " has rank ", TFE_TensorHandleNumDims(handle))
-                          .c_str());
+    int num_dims = TFE_TensorHandleNumDims(handle, status.get());
+    if (MaybeRaiseExceptionFromTFStatus(status.get(), PyExc_ValueError)) {
+      return nullptr;
+    }
+    if (slice_dim >= num_dims) {
+      PyErr_SetString(
+          PyExc_IndexError,
+          tensorflow::strings::StrCat("Slice dimension (", slice_dim,
+                                      ") must be smaller than rank of all "
+                                      "tensors, but tensor at index ",
+                                      i, " has rank ", num_dims)
+              .c_str());
+      return nullptr;
+    }
+    int64_t dim = TFE_TensorHandleDim(handle, slice_dim, status.get());
+    if (MaybeRaiseExceptionFromTFStatus(status.get(), PyExc_ValueError)) {
       return nullptr;
     }
-    int64_t dim = TFE_TensorHandleDim(handle, slice_dim);
     data[i] = dim;
   }
 
-  auto status = tensorflow::make_safe(TF_NewStatus());
   TFE_TensorHandle* handle = TFE_NewTensorHandle(tensor.get(), status.get());
   if (TF_GetCode(status.get()) != TF_OK) {
     PyErr_SetString(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index aa1efdd1b81cca9df0088c4cecedfe52f258d2bc..63ab1ed84d5ba3f280904be8dd202912e42241d0 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -22,4 +22,14 @@ limitations under the License.
 bool EagerTensor_CheckExact(const PyObject* o);
 tensorflow::int64 EagerTensor_id(const PyObject* tensor);
 
+namespace tensorflow {
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
+
+// TODO(nareshmodi): Move EagerCast and ReadVariableOp (which use the C API to
+// execute TFE Ops) to a separate common library.
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status);
+}
+
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 16b7d1a119a409d1d0a77b220d5d0945b280b638..691b613e48b217c595fe0f3249c493facf756d47 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -51,6 +51,13 @@ void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
 // This function is not thread-safe.
 PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
 
+// Registers e as the type of the ResourceVariable class.
+// Returns Py_None if registration succeeds, else throws a TypeError and returns
+// NULL.
+//
+// This function is not thread-safe.
+PyObject* TFE_Py_RegisterResourceVariableType(PyObject* e);
+
 // Registers e as the Exception to be raised when the conditions of
 // TFE_Py_FastPathExecute_C have not been met. When this exception is set, it
 // is a signal to the calling code that it should fall back to the safer (and
@@ -59,6 +66,15 @@ PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
 // This function is not thread-safe.
 PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e);
 
+// Registers e as the backward_function_getter.
+// The registered function creates a backward function (a function that can
+// return the gradient of the inputs an op given the gradient of it's outputs).
+// The registered function will be passed the following arguments:
+//    op_name, attrs, num_inputs, op_inputs, op_outputs
+//
+// This function is not thread-safe.
+PyObject* TFE_Py_RegisterBackwardFunctionGetter(PyObject* e);
+
 // Returns 0 if 'status' is TF_OK. Otherwise, raises an exception (using
 // `exception` if not nullptr, else using the class registered via
 // TFE_Py_RegisterExceptionClass), and returns -1.
@@ -151,13 +167,10 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
 //  Item 2: device_name: Name of the device on which to execute the operation,
 //          or NULL for automatic selection.
 //  Item 3: op_name: Name of the TensorFlow op to execute.
-//  Item 4: record_gradient_callback: Callback that records the gradient of the
-//          result. The callback takes (op_name, inputs, attrs, result, name)
-//          - all sequences and records the gradient.
-//  Item 5: name: An optional name for the operation.
-//  Item 6: List representing all callbacks to execute after successful
+//  Item 4: name: An optional name for the operation.
+//  Item 5: List representing all callbacks to execute after successful
 //  op execute.
-//  Item 7 onwards: inputs - This is a list of inputs followed by a list of
+//  Item 6 onwards: inputs - This is a list of inputs followed by a list of
 //        attrs. It is not necessary for type attrs to be present.
 //
 // This is named _C since there doesn't seem to be any way to make it visible
@@ -165,19 +178,24 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
 // directive.
 PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args);
 
+// Record the gradient for a given op.
+PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
+                                PyObject* attrs, PyObject* results,
+                                PyObject* name);
+
 // Returns the set of variables watched by the given tape.
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
 
-// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
-// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
+// Returns an EagerTensor of dimension [len(`tensors`)] containing
+// the `slice_dim`'th dimension of each tensor in `tensors`. In other words,
 // TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in
-// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes
+// `tensors`. For example, if `tensors` contains tensors of with shapes
 // [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with
 // `slice_dim` equal to 1 will return [2, 5, 7].
 // On error, returns nullptr and sets python exception.
-// REQUIRES: `tensor_list` is a python list of EagerTensors
+// REQUIRES: `tensors` is a python list/tuple of EagerTensors
 // REQUIRES: `slice_dim` is non-negative and smaller than the rank of all
-//   tensors in `tensor_list`.
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim);
+//   tensors in `tensors`.
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index cabbcc48fd56563a50591cc6adabc3af75918401..4ecba1a46be8fffa413a3d7fb2e643dc00f1b0ef 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -24,18 +24,93 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/eager/pywrap_tensor.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
 
 using tensorflow::string;
 using tensorflow::strings::Printf;
 
 namespace {
 
+struct InputInfo {
+  InputInfo(int i, bool is_list) : i(i), is_list(is_list) {}
+
+  int i;
+  bool is_list = false;
+};
+
+using AttrToInputsMap =
+    tensorflow::gtl::FlatMap<string,
+                             tensorflow::gtl::InlinedVector<InputInfo, 4>>;
+
+tensorflow::mutex all_attr_to_input_maps_lock(
+    tensorflow::LINKER_INITIALIZED);
+tensorflow::gtl::FlatMap<string, AttrToInputsMap*>* GetAllAttrToInputsMaps() {
+  static auto* all_attr_to_input_maps =
+      new tensorflow::gtl::FlatMap<string, AttrToInputsMap*>;
+  return all_attr_to_input_maps;
+}
+
+AttrToInputsMap* GetAttrToInputsMap(const tensorflow::OpDef& op_def) {
+  tensorflow::mutex_lock l(all_attr_to_input_maps_lock);
+  auto* all_attr_to_input_maps = GetAllAttrToInputsMaps();
+
+  auto* output =
+      tensorflow::gtl::FindPtrOrNull(*all_attr_to_input_maps, op_def.name());
+  if (output != nullptr) {
+    return output;
+  }
+
+  std::unique_ptr<AttrToInputsMap> m(new AttrToInputsMap);
+
+  // Store a list of InputIndex -> List of corresponding inputs.
+  for (int i = 0; i < op_def.input_arg_size(); i++) {
+    if (!op_def.input_arg(i).type_attr().empty()) {
+      auto it = m->find(op_def.input_arg(i).type_attr());
+      if (it == m->end()) {
+        it = m->insert({op_def.input_arg(i).type_attr(), {}}).first;
+      }
+      it->second.emplace_back(i, !op_def.input_arg(i).number_attr().empty());
+    }
+  }
+
+  auto* retval = m.get();
+  (*all_attr_to_input_maps)[op_def.name()] = m.release();
+
+  return retval;
+}
+
+struct FastPathOpExecInfo {
+  TFE_Context* ctx;
+  const char* device_name;
+  // The op def of the main op being executed.
+  const tensorflow::OpDef* op_def;
+
+  bool run_callbacks;
+  bool run_post_exec_callbacks;
+  bool run_gradient_callback;
+
+  // The op name of the main op being executed.
+  PyObject* name;
+  // The op type name of the main op being executed.
+  PyObject* op_name;
+  PyObject* callbacks;
+
+  // All the args passed into the FastPathOpExecInfo.
+  PyObject* args;
+
+  // DTypes can come from another input that has the same attr. So build that
+  // map.
+  const AttrToInputsMap* attr_to_inputs_map;
+  tensorflow::gtl::FlatMap<string, tensorflow::DataType> cached_dtypes;
+};
+
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)                       \
   bool fn_name(const string& key, PyObject* py_value, TF_Status* status,     \
                type* value) {                                                \
@@ -57,12 +132,29 @@ PARSE_VALUE(ParseIntValue, int, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+#if PY_MAJOR_VERSION < 3
+bool ParseInt64Value(const string& key, PyObject* py_value, TF_Status* status,
+                     int64_t* value) {
+  if (PyInt_Check(py_value)) {
+    *value = static_cast<int64_t>(PyInt_AsLong(py_value));
+    return true;
+  } else if (PyLong_Check(py_value)) {
+    *value = static_cast<int64_t>(PyLong_AsLong(py_value));
+    return true;
+  }
+  TF_SetStatus(
+      status, TF_INVALID_ARGUMENT,
+      tensorflow::strings::StrCat("Expecting int or long value for attr ", key,
+                                  ", got ", py_value->ob_type->tp_name)
+          .c_str());
+  return false;
+}
+#endif
+
 Py_ssize_t TensorShapeNumDims(PyObject* value) {
   const auto size = PySequence_Size(value);
   if (size == -1) {
@@ -74,6 +166,34 @@ Py_ssize_t TensorShapeNumDims(PyObject* value) {
   return size;
 }
 
+bool IsInteger(PyObject* py_value) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_Check(py_value);
+#else
+  return PyInt_Check(py_value);
+#endif
+}
+
+bool ParseDimensionValue(const string& key, PyObject* py_value,
+                         TF_Status* status, int64_t* value) {
+  if (IsInteger(py_value)) {
+    return ParseInt64Value(key, py_value, status, value);
+  }
+
+  tensorflow::Safe_PyObjectPtr dimension_value(
+      PyObject_GetAttrString(py_value, "_value"));
+  if (dimension_value == nullptr) {
+    TF_SetStatus(
+        status, TF_INVALID_ARGUMENT,
+        tensorflow::strings::StrCat("Expecting a Dimension for attr ", key,
+                                    ", got ", py_value->ob_type->tp_name)
+            .c_str());
+    return false;
+  }
+
+  return ParseInt64Value(key, dimension_value.get(), status, value);
+}
+
 bool ParseStringValue(const string& key, PyObject* py_value, TF_Status* status,
                       const char** value) {
   if (PyBytes_Check(py_value)) {
@@ -100,14 +220,6 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status,
   return true;
 }
 
-bool IsInteger(PyObject* py_value) {
-#if PY_MAJOR_VERSION >= 3
-  return PyLong_Check(py_value);
-#else
-  return PyInt_Check(py_value);
-#endif
-}
-
 // The passed in py_value is expected to be an object of the python type
 // dtypes.DType or an int.
 bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
@@ -116,18 +228,18 @@ bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
     return ParseIntValue(key, py_value, status, value);
   }
 
-  PyObject* py_type_enum = PyObject_GetAttrString(py_value, "_type_enum");
+  tensorflow::Safe_PyObjectPtr py_type_enum(
+      PyObject_GetAttrString(py_value, "_type_enum"));
   if (py_type_enum == nullptr) {
+    TF_SetStatus(
+        status, TF_INVALID_ARGUMENT,
+        tensorflow::strings::StrCat("Expecting a DType.dtype for attr ", key,
+                                    ", got ", py_value->ob_type->tp_name)
+            .c_str());
     return false;
   }
 
-  if (!ParseIntValue(key, py_type_enum, status, value)) {
-    Py_DECREF(py_type_enum);
-    return false;
-  }
-
-  Py_DECREF(py_type_enum);
-  return true;
+  return ParseIntValue(key, py_type_enum.get(), status, value);
 }
 
 bool SetOpAttrList(
@@ -145,11 +257,11 @@ bool SetOpAttrList(
   const int num_values = PySequence_Size(py_list);
   if (attr_list_sizes != nullptr) (*attr_list_sizes)[key] = num_values;
 
-#define PARSE_LIST(c_type, parse_fn)                                \
-  std::unique_ptr<c_type[]> values(new c_type[num_values]);         \
-  for (int i = 0; i < num_values; ++i) {                            \
-    auto py_value = PySequence_ITEM(py_list, i);                    \
-    if (!parse_fn(key, py_value, status, &values[i])) return false; \
+#define PARSE_LIST(c_type, parse_fn)                                      \
+  std::unique_ptr<c_type[]> values(new c_type[num_values]);               \
+  for (int i = 0; i < num_values; ++i) {                                  \
+    tensorflow::Safe_PyObjectPtr py_value(PySequence_ITEM(py_list, i));   \
+    if (!parse_fn(key, py_value.get(), status, &values[i])) return false; \
   }
 
   if (type == TF_ATTR_STRING) {
@@ -174,9 +286,9 @@ bool SetOpAttrList(
     // dims across all the input lists.
     int total_dims = 0;
     for (int i = 0; i < num_values; ++i) {
-      auto py_value = PySequence_ITEM(py_list, i);
-      if (py_value != Py_None) {
-        if (!PySequence_Check(py_value)) {
+      tensorflow::Safe_PyObjectPtr py_value(PySequence_ITEM(py_list, i));
+      if (py_value.get() != Py_None) {
+        if (!PySequence_Check(py_value.get())) {
           TF_SetStatus(
               status, TF_INVALID_ARGUMENT,
               tensorflow::strings::StrCat(
@@ -185,7 +297,7 @@ bool SetOpAttrList(
                   .c_str());
           return false;
         }
-        const auto size = TensorShapeNumDims(py_value);
+        const auto size = TensorShapeNumDims(py_value.get());
         if (size >= 0) {
           total_dims += size;
         }
@@ -195,16 +307,16 @@ bool SetOpAttrList(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
-      auto py_value = PySequence_ITEM(py_list, i);
-      if (py_value == Py_None) {
+      tensorflow::Safe_PyObjectPtr py_value(PySequence_ITEM(py_list, i));
+      if (py_value.get() == Py_None) {
         dims[i] = nullptr;
         num_dims[i] = -1;
       } else {
-        const auto size = TensorShapeNumDims(py_value);
+        const auto size = TensorShapeNumDims(py_value.get());
         if (size == -1) {
           dims[i] = nullptr;
           num_dims[i] = -1;
@@ -213,10 +325,12 @@ bool SetOpAttrList(
         dims[i] = offset;
         num_dims[i] = size;
         for (int j = 0; j < size; ++j) {
-          auto inner_py_value = PySequence_ITEM(py_value, j);
-          if (inner_py_value == Py_None) {
+          tensorflow::Safe_PyObjectPtr inner_py_value(
+              PySequence_ITEM(py_value.get(), j));
+          if (inner_py_value.get() == Py_None) {
             *offset = -1;
-          } else if (!ParseInt64Value(key, inner_py_value, status, offset)) {
+          } else if (!ParseDimensionValue(key, inner_py_value.get(), status,
+                                          offset)) {
             return false;
           }
           ++offset;
@@ -237,21 +351,12 @@ bool SetOpAttrList(
   return true;
 }
 
-// This is only declared here since GetFunc makes a recursive call to
-// SetOpAttrScalarDefault.
-void SetOpAttrScalarDefault(
-    TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value,
-    const char* attr_name,
-    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
-    TF_Status* status);
-
 TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
                 TF_Status* status) {
   TFE_Op* func_op = TFE_NewOp(ctx, func.name().data(), status);
   for (const auto& attr : func.attr()) {
     if (TF_GetCode(status) != TF_OK) return nullptr;
-    SetOpAttrScalarDefault(ctx, func_op, attr.second, attr.first.data(),
-                           nullptr, status);
+    SetOpAttrValueScalar(ctx, func_op, attr.second, attr.first.data(), status);
     if (TF_GetCode(status) != TF_OK) return nullptr;
   }
   return func_op;
@@ -264,7 +369,7 @@ void SetOpAttrListDefault(
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     int num_values = attr.default_value().list().s_size();
-    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    std::unique_ptr<const char*[]> values(new const char*[num_values]);
     (*attr_list_sizes)[key] = num_values;
     for (int i = 0; i < num_values; i++) {
       values[i] = attr.default_value().list().s(i).data();
@@ -317,7 +422,7 @@ void SetOpAttrListDefault(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -337,7 +442,7 @@ void SetOpAttrListDefault(
   } else if (type == TF_ATTR_FUNC) {
     int num_values = attr.default_value().list().func_size();
     (*attr_list_sizes)[key] = num_values;
-    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
     for (int i = 0; i < num_values; i++) {
       funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
     }
@@ -397,10 +502,12 @@ bool SetOpAttrScalar(
       }
       std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
       for (int i = 0; i < num_dims; ++i) {
-        auto inner_py_value = PySequence_ITEM(py_value, i);
-        if (inner_py_value == Py_None) {
+        tensorflow::Safe_PyObjectPtr inner_py_value(
+            PySequence_ITEM(py_value, i));
+        if (inner_py_value.get() == Py_None) {
           dims[i] = -1;
-        } else if (!ParseInt64Value(key, inner_py_value, status, &dims[i])) {
+        } else if (!ParseDimensionValue(key, inner_py_value.get(), status,
+                                        &dims[i])) {
           return false;
         }
       }
@@ -451,57 +558,9 @@ void SetOpAttrScalarDefault(
     const char* attr_name,
     tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
     TF_Status* status) {
-  switch (default_value.value_case()) {
-    case tensorflow::AttrValue::kS:
-      TFE_OpSetAttrString(op, attr_name, default_value.s().data());
-      break;
-    case tensorflow::AttrValue::kI:
-      TFE_OpSetAttrInt(op, attr_name, static_cast<int64_t>(default_value.i()));
-      (*attr_list_sizes)[attr_name] = default_value.i();
-      break;
-    case tensorflow::AttrValue::kF:
-      TFE_OpSetAttrFloat(op, attr_name, default_value.f());
-      break;
-    case tensorflow::AttrValue::kB:
-      TFE_OpSetAttrBool(op, attr_name, default_value.b());
-      break;
-    case tensorflow::AttrValue::kType:
-      TFE_OpSetAttrType(op, attr_name,
-                        static_cast<TF_DataType>(default_value.type()));
-      break;
-    case tensorflow::AttrValue::kShape: {
-      const auto& tensor_shape = default_value.shape();
-      if (tensor_shape.unknown_rank()) {
-        TFE_OpSetAttrShape(op, attr_name, nullptr, -1, status);
-      } else {
-        const auto num_dims = tensor_shape.dim_size();
-        std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
-        for (int i = 0; i < num_dims; ++i) {
-          dims[i] = tensor_shape.dim(i).size();
-        }
-        TFE_OpSetAttrShape(op, attr_name, dims.get(), num_dims, status);
-      }
-    } break;
-    case tensorflow::AttrValue::kFunc: {
-      const auto func_op = GetFunc(ctx, default_value.func(), status);
-      if (TF_GetCode(status) != TF_OK) return;
-      // TODO(nareshmodi): TFE_OpSetAttrFunction and TFE_OpSetAttrFunctionList
-      // require TFE_Op* and just convert it internally a NameAttrValue, so
-      // consider adding an overload to the C API to make this case easier.
-      TFE_OpSetAttrFunction(op, attr_name, func_op);
-    } break;
-    case tensorflow::AttrValue::kList:
-      TF_FALLTHROUGH_INTENDED;
-    case tensorflow::AttrValue::kTensor:
-      TF_FALLTHROUGH_INTENDED;
-    case tensorflow::AttrValue::kPlaceholder:
-      TF_FALLTHROUGH_INTENDED;
-    case tensorflow::AttrValue::VALUE_NOT_SET:
-      TF_SetStatus(
-          status, TF_UNIMPLEMENTED,
-          tensorflow::strings::StrCat("Unable to get setfor default value: ",
-                                      default_value.DebugString())
-              .data());
+  SetOpAttrValueScalar(ctx, op, default_value, attr_name, status);
+  if (default_value.value_case() == tensorflow::AttrValue::kI) {
+    (*attr_list_sizes)[attr_name] = default_value.i();
   }
 }
 
@@ -575,6 +634,11 @@ PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr;
 // Python subclass of Exception that is created to signal fallback.
 PyObject* fallback_exception_class = nullptr;
 
+// Python function that returns a backward_function.
+PyObject* backward_function_getter = nullptr;
+
+PyTypeObject* resource_variable_type = nullptr;
+
 tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
 tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
 
@@ -623,11 +687,28 @@ PyObject* TFE_Py_RegisterExceptionClass(PyObject* e) {
                     "TFE_Py_RegisterExceptionClass: "
                     "Registered class should be subclass of Exception.");
     return nullptr;
-  } else {
-    Py_INCREF(e);
-    exception_class = e;
-    Py_RETURN_NONE;
   }
+
+  Py_INCREF(e);
+  exception_class = e;
+  Py_RETURN_NONE;
+}
+
+PyObject* TFE_Py_RegisterResourceVariableType(PyObject* e) {
+  if (!PyType_Check(e)) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "TFE_Py_RegisterResourceVariableType: Need to register a type.");
+    return nullptr;
+  }
+
+  if (resource_variable_type != nullptr) {
+    Py_DECREF(resource_variable_type);
+  }
+
+  Py_INCREF(e);
+  resource_variable_type = reinterpret_cast<PyTypeObject*>(e);
+  Py_RETURN_NONE;
 }
 
 PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e) {
@@ -647,6 +728,23 @@ PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e) {
   }
 }
 
+PyObject* TFE_Py_RegisterBackwardFunctionGetter(PyObject* e) {
+  if (backward_function_getter != nullptr) {
+    Py_DECREF(backward_function_getter);
+  }
+  if (!PyCallable_Check(e)) {
+    backward_function_getter = nullptr;
+    PyErr_SetString(PyExc_TypeError,
+                    "TFE_Py_RegisterBackwardFunctionGetter: "
+                    "Registered object should be function.");
+    return nullptr;
+  } else {
+    Py_INCREF(e);
+    backward_function_getter = e;
+    Py_RETURN_NONE;
+  }
+}
+
 void RaiseFallbackException(const char* message) {
   if (fallback_exception_class != nullptr) {
     PyErr_SetObject(fallback_exception_class, Py_BuildValue("s", message));
@@ -987,7 +1085,15 @@ static tensorflow::eager::TapeTensor TapeTensorFromTensor(PyObject* tensor) {
   if (EagerTensor_CheckExact(tensor)) {
     TFE_TensorHandle* t = EagerTensor_Handle(tensor);
     tensorflow::int64 id = EagerTensor_id(tensor);
-    return tensorflow::eager::TapeTensor{id, t->t.dtype(), t->t.shape()};
+    const tensorflow::Tensor* tensor = nullptr;
+    const tensorflow::Status status = t->handle->Tensor(&tensor);
+    if (MaybeRaiseExceptionFromStatus(status, nullptr)) {
+      return tensorflow::eager::TapeTensor{id, t->handle->dtype,
+                                           tensorflow::TensorShape({})};
+    } else {
+      return tensorflow::eager::TapeTensor{id, t->handle->dtype,
+                                           tensor->shape()};
+    }
   }
   tensorflow::int64 id = FastTensorId(tensor);
   if (PyErr_Occurred()) {
@@ -1062,16 +1168,10 @@ PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape) {
   return result;
 }
 
-void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
-                                   PyObject* input_tensors,
-                                   PyObject* backward_function) {
-  if (GetTapeSet()->empty() || *ThreadTapeIsStopped()) {
-    return;
-  }
-  std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
-  if (PyErr_Occurred()) {
-    return;
-  }
+namespace {
+void TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
+                            const std::vector<tensorflow::int64>& input_ids,
+                            PyObject* backward_function) {
   std::vector<tensorflow::eager::TapeTensor> output_info;
   PyObject* seq = PySequence_Fast(output_tensors,
                                   "expected a sequence of integer tensor ids");
@@ -1110,6 +1210,19 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
         [backward_function]() { Py_DECREF(backward_function); });
   }
 }
+}  // namespace
+
+void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
+                                   PyObject* input_tensors,
+                                   PyObject* backward_function) {
+  if (GetTapeSet()->empty() || *ThreadTapeIsStopped()) {
+    return;
+  }
+  std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
+  if (PyErr_Occurred()) return;
+
+  TapeSetRecordOperation(op_type, output_tensors, input_ids, backward_function);
+}
 
 void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) {
   for (TFE_Py_Tape* tape : SafeTapeSet()) {
@@ -1284,6 +1397,20 @@ std::vector<PyObject*> MakeTensorList(PyObject* tensors) {
 PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
                               PyObject* target, PyObject* sources,
                               PyObject* output_gradients, TF_Status* status) {
+  TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
+  if (!tape_obj->tape->IsPersistent()) {
+    auto* tape_set = GetTapeSet();
+    if (tape_set->find(tape_obj) != tape_set->end()) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "gradient() cannot be invoked within the "
+                      "GradientTape context (i.e., while operations are being "
+                      "recorded). Either move the call to gradient() to be "
+                      "outside the 'with tf.GradientTape' block, or "
+                      "use a persistent tape: "
+                      "'with tf.GradientTape(persistent=true)'");
+      return nullptr;
+    }
+  }
   PyVSpace c_vspace(vspace);
   if (!c_vspace.Initialize().ok()) {
     return nullptr;
@@ -1309,7 +1436,6 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
       Py_INCREF(tensor);
     }
   }
-  TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
   std::vector<PyObject*> result;
   status->status = tape_obj->tape->ComputeGradient(
       c_vspace, target_vec, sources_vec, outgrad_vec, &result);
@@ -1323,11 +1449,15 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
   }
   if (!result.empty()) {
     PyObject* py_result = PyList_New(result.size());
+    tensorflow::gtl::FlatSet<PyObject*> seen_results(result.size());
     for (int i = 0; i < result.size(); ++i) {
       if (result[i] == nullptr) {
         Py_INCREF(Py_None);
         result[i] = Py_None;
+      } else if (seen_results.find(result[i]) != seen_results.end()) {
+        Py_INCREF(result[i]);
       }
+      seen_results.insert(result[i]);
       PyList_SET_ITEM(py_result, i, reinterpret_cast<PyObject*>(result[i]));
     }
     return py_result;
@@ -1336,7 +1466,7 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
 }
 
 namespace {
-static const int kFastPathExecuteInputStartIndex = 6;
+static const int kFastPathExecuteInputStartIndex = 5;
 
 PyObject* GetPythonObjectFromString(const char* s) {
 #if PY_MAJOR_VERSION >= 3
@@ -1346,18 +1476,338 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
-bool CheckEagerTensors(PyObject* seq, int start_index,
-                       const tensorflow::OpDef& op_def) {
+PyObject* GetPythonObjectFromInt(int num) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_FromLong(num);
+#else
+  return PyInt_FromLong(num);
+#endif
+}
+
+bool CheckResourceVariable(PyObject* item) {
+  return PyObject_TypeCheck(item, resource_variable_type);
+}
+
+bool IsNumberType(PyObject* item) {
+#if PY_MAJOR_VERSION >= 3
+  return PyFloat_Check(item) || PyLong_Check(item);
+#else
+  return PyFloat_Check(item) || PyInt_Check(item) || PyLong_Check(item);
+#endif
+}
+
+bool CheckOneInput(PyObject* item) {
+  if (EagerTensor_CheckExact(item) || CheckResourceVariable(item) ||
+      PyArray_Check(item) || IsNumberType(item)) {
+    return true;
+  }
+
+  // Sequences are not properly handled. Sequences with purely python numeric
+  // types work, but sequences with mixes of EagerTensors and python numeric
+  // types don't work.
+  // TODO(nareshmodi): fix
+  return false;
+}
+
+bool CheckInputsOk(PyObject* seq, int start_index,
+                   const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
     PyObject* item = PyTuple_GET_ITEM(seq, i + start_index);
     if (!op_def.input_arg(i).number_attr().empty() ||
         !op_def.input_arg(i).type_list_attr().empty()) {
-      // This item should be a list input.
-      if (!PyList_Check(item)) return false;
-      for (Py_ssize_t j = 0; j < PyList_Size(item); j++) {
-        if (!EagerTensor_CheckExact(PyList_GET_ITEM(item, j))) return false;
+      // This item should be a seq input.
+      if (!PySequence_Check(item)) {
+        VLOG(1) << "Falling back to slow path for Op \"" << op_def.name()
+                << "\", Input \"" << op_def.input_arg(i).name()
+                << "\" since we expected a sequence, but got "
+                << item->ob_type->tp_name;
+        return false;
+      }
+      for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
+        PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
+        if (!CheckOneInput(inner_item)) {
+          VLOG(1)
+              << "Falling back to slow path for Op \"" << op_def.name()
+              << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
+              << j
+              << " since we expected an EagerTensor/ResourceVariable, but got "
+              << inner_item->ob_type->tp_name;
+          return false;
+        }
+      }
+    } else if (!CheckOneInput(item)) {
+      VLOG(1)
+          << "Falling back to slow path for Op \"" << op_def.name()
+          << "\", Input \"" << op_def.input_arg(i).name()
+          << "\" since we expected an EagerTensor/ResourceVariable, but got "
+          << item->ob_type->tp_name;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+PyObject* MaybeGetDType(PyObject* item) {
+  if (EagerTensor_CheckExact(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  if (CheckResourceVariable(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "_dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  return nullptr;
+}
+
+PyObject* MaybeGetDTypeForAttr(const string& attr,
+                               FastPathOpExecInfo* op_exec_info) {
+  auto cached_it = op_exec_info->cached_dtypes.find(attr);
+  if (cached_it != op_exec_info->cached_dtypes.end()) {
+    return GetPythonObjectFromInt(cached_it->second);
+  }
+
+  auto it = op_exec_info->attr_to_inputs_map->find(attr);
+  if (it == op_exec_info->attr_to_inputs_map->end()) {
+    // No other inputs - this should never happen.
+    Py_RETURN_NONE;
+  }
+
+  for (const auto& input_info : it->second) {
+    PyObject* item = PyTuple_GET_ITEM(
+        op_exec_info->args, kFastPathExecuteInputStartIndex + input_info.i);
+    if (input_info.is_list) {
+      for (int i = 0; i < PySequence_Fast_GET_SIZE(item); i++) {
+        auto* dtype = MaybeGetDType(PySequence_Fast_GET_ITEM(item, i));
+        if (dtype != nullptr) return dtype;
       }
-    } else if (!EagerTensor_CheckExact(item)) {
+    } else {
+      auto* dtype = MaybeGetDType(item);
+      if (dtype != nullptr) return dtype;
+    }
+  }
+
+  Py_RETURN_NONE;
+}
+
+bool OpDoesntRequireOutput(const string& op_name) {
+  static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
+      new tensorflow::gtl::FlatSet<string>({
+          "Identity",
+          "MatMul",
+          "Conv2DBackpropInput",
+          "Conv2DBackpropFilter",
+          "Conv3D",
+          "Conv3DBackpropInputV2",
+          "AvgPool3D",
+          "AvgPool3DGrad",
+          "MaxPool3D",
+          "MaxPool3DGrad",
+          "MaxPool3DGradGrad",
+          "BiasAdd",
+          "BiasAddV1",
+          "BiasAddGrad",
+          "Relu6",
+          "Softplus",
+          "SoftplusGrad",
+          "Softsign",
+          "ReluGrad",
+          "Conv2D",
+          "DepthwiseConv2dNative",
+          "Dilation2D",
+          "AvgPool",
+          "AvgPoolGrad",
+          "BatchNormWithGlobalNormalization",
+          "L2Loss",
+          "Sum",
+          "Prod",
+          "SegmentSum",
+          "SegmentMean",
+          "SparseSegmentSum",
+          "SparseSegmentMean",
+          "SparseSegmentSqrtN",
+          "SegmentMin",
+          "SegmentMax",
+          "UnsortedSegmentSum",
+          "UnsortedSegmentMax",
+          "Abs",
+          "Neg",
+          "ReciprocalGrad",
+          "Square",
+          "Expm1",
+          "Log",
+          "Log1p",
+          "TanhGrad",
+          "SigmoidGrad",
+          "Sign",
+          "Sin",
+          "Cos",
+          "Tan",
+          "Add",
+          "Sub",
+          "Mul",
+          "Div",
+          "RealDiv",
+          "Maximum",
+          "Minimum",
+          "SquaredDifference",
+          "Select",
+          "SparseMatMul",
+          "BatchMatMul",
+          "Complex",
+          "Real",
+          "Imag",
+          "Angle",
+          "Conj",
+          "Cast",
+          "Cross",
+          "Cumsum",
+          "Cumprod",
+          "ReadVariableOp",
+          "VarHandleOp",
+          "Shape",
+      });
+
+  return ops_that_dont_require_outputs->find(op_name) !=
+         ops_that_dont_require_outputs->end();
+}
+
+bool OpDoesntRequireInput(const string& op_name) {
+  static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_inputs =
+      new tensorflow::gtl::FlatSet<string>({
+          "Identity",
+          "Softmax",
+          "LogSoftmax",
+          "BiasAdd",
+          "Relu",
+          "Elu",
+          "Selu",
+          "SparseSoftmaxCrossEntropyWithLogits",
+          "Neg",
+          "Inv",
+          "Reciprocal",
+          "Sqrt",
+          "Exp",
+          "Tanh",
+          "Sigmoid",
+          "Real",
+          "Imag",
+          "Conj",
+          "ReadVariableOp",
+          "VarHandleOp",
+          "Shape",
+      });
+
+  return ops_that_dont_require_inputs->find(op_name) !=
+         ops_that_dont_require_inputs->end();
+}
+
+PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
+                         PyObject* results, PyObject* name) {
+  std::vector<tensorflow::int64> input_ids = MakeTensorIDList(inputs);
+  if (PyErr_Occurred()) return nullptr;
+
+  bool should_record = false;
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
+    if (tape->tape->ShouldRecord(input_ids)) {
+      should_record = true;
+      break;
+    }
+  }
+  if (!should_record) Py_RETURN_NONE;
+
+  string c_op_name = TFE_GetPythonString(op_name);
+  PyObject* op_outputs;
+  if (OpDoesntRequireOutput(c_op_name)) {
+    op_outputs = Py_None;
+  } else {
+    op_outputs = results;
+  }
+
+  PyObject* op_inputs;
+  if (OpDoesntRequireInput(c_op_name)) {
+    op_inputs = Py_None;
+  } else {
+    op_inputs = inputs;
+  }
+
+  PyObject* num_inputs = PyLong_FromLong(PySequence_Size(inputs));
+  PyObject* callback_args =
+      Py_BuildValue("OOOOO", op_name, attrs, num_inputs, op_inputs, op_outputs);
+
+  PyObject* backward_function =
+      PyObject_CallObject(backward_function_getter, callback_args);
+  Py_DECREF(callback_args);
+  if (backward_function == nullptr) return nullptr;
+
+  TapeSetRecordOperation(op_name, results, input_ids, backward_function);
+
+  Py_DECREF(backward_function);
+
+  Py_RETURN_NONE;
+}
+
+void MaybeWatchVariable(PyObject* input) {
+  DCHECK(CheckResourceVariable(input));
+  DCHECK(PyObject_HasAttrString(input, "_trainable"));
+
+  tensorflow::Safe_PyObjectPtr trainable(
+      PyObject_GetAttrString(input, "_trainable"));
+  if (trainable.get() == Py_False) return;
+  TFE_Py_TapeSetWatchVariable(input);
+}
+
+bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
+                    PyObject* input, tensorflow::Safe_PyObjectPtr* output,
+                    TF_Status* status) {
+  MaybeWatchVariable(input);
+
+  TFE_Op* op = TFE_NewOp(parent_op_exec_info.ctx, "ReadVariableOp", status);
+  auto cleaner = tensorflow::gtl::MakeCleanup([op] { TFE_DeleteOp(op); });
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  // Set dtype
+  DCHECK(PyObject_HasAttrString(input, "_dtype"));
+  tensorflow::Safe_PyObjectPtr dtype(PyObject_GetAttrString(input, "_dtype"));
+  int value;
+  if (!ParseTypeValue("_dtype", dtype.get(), status, &value)) {
+    return false;
+  }
+  TFE_OpSetAttrType(op, "dtype", static_cast<TF_DataType>(value));
+
+  TFE_OpSetDevice(op, parent_op_exec_info.device_name, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  // Get handle
+  tensorflow::Safe_PyObjectPtr handle(PyObject_GetAttrString(input, "_handle"));
+  if (!EagerTensor_CheckExact(handle.get())) return false;
+  TFE_OpAddInput(op, EagerTensor_Handle(handle.get()), status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  int num_retvals = 1;
+  TFE_TensorHandle* output_handle;
+  TFE_Execute(op, &output_handle, &num_retvals, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  // Always create the py object (and correctly DECREF it) from the returned
+  // value, else the data will leak.
+  output->reset(EagerTensorFromHandle(output_handle));
+
+  // TODO(nareshmodi): Should we run post exec callbacks here?
+  if (parent_op_exec_info.run_gradient_callback) {
+    tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(1));
+    PyTuple_SET_ITEM(inputs.get(), 0, handle.release());
+
+    tensorflow::Safe_PyObjectPtr outputs(PyTuple_New(1));
+    Py_INCREF(output->get());  // stay alive after since tuple steals.
+    PyTuple_SET_ITEM(outputs.get(), 0, output->get());
+
+    if (!RecordGradient(GetPythonObjectFromString("ReadVariableOp"),
+                        inputs.get(), Py_None, outputs.get(), Py_None)) {
       return false;
     }
   }
@@ -1365,30 +1815,129 @@ bool CheckEagerTensors(PyObject* seq, int start_index,
   return true;
 }
 
+// Supports only 2 cases at the moment:
+//  i) input is an EagerTensor
+//  ii) input is a ResourceVariable - in this case, the is_variable param is set
+//  to true.
+//
+//  NOTE: dtype_hint_getter must *always* return a PyObject that can be
+//  decref'd. So if no hint is found, Py_RETURN_NONE (which correctly
+//  increfs Py_None).
+bool ConvertToTensor(
+    const FastPathOpExecInfo& op_exec_info, PyObject* input,
+    tensorflow::Safe_PyObjectPtr* output_handle,
+    // This gets a hint for this particular input.
+    const std::function<PyObject*()>& dtype_hint_getter,
+    // This sets the dtype after conversion is complete.
+    const std::function<void(const TF_DataType& dtype)>& dtype_setter,
+    TF_Status* status) {
+  if (EagerTensor_CheckExact(input)) {
+    Py_INCREF(input);
+    output_handle->reset(input);
+    return true;
+  } else if (CheckResourceVariable(input)) {
+    return ReadVariableOp(op_exec_info, input, output_handle, status);
+  }
+
+  // The hint comes from a supposedly similarly typed tensor.
+  tensorflow::Safe_PyObjectPtr dtype_hint(dtype_hint_getter());
+  if (PyErr_Occurred()) {
+    return false;
+  }
+
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(input, dtype_hint.get())));
+  if (handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Unable to convert value to tensor");
+    return false;
+  }
+
+  int desired_dtype = -1;
+  if (dtype_hint.get() != Py_None) {
+    if (!ParseTypeValue("", dtype_hint.get(), status, &desired_dtype)) {
+      status->status = tensorflow::errors::InvalidArgument(
+          "Expecting a DataType value for dtype. Got ",
+          Py_TYPE(dtype_hint.get())->tp_name);
+    }
+  }
+
+  TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
+  if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
+    handle = tensorflow::make_safe(
+        tensorflow::EagerCast(op_exec_info.ctx, handle.get(), handle_dtype,
+                              static_cast<TF_DataType>(desired_dtype), status));
+    if (!status->status.ok()) return false;
+
+    handle_dtype = TFE_TensorHandleDataType(handle.get());
+  }
+
+  if (handle_dtype != TF_INT32) {
+    // Note that this is a shallow copy and will share the underlying buffer
+    // if copying to the same device.
+    handle = tensorflow::make_safe(TFE_TensorHandleCopyToDevice(
+        handle.get(), op_exec_info.ctx, op_exec_info.device_name, status));
+    if (!status->status.ok()) return false;
+  }
+
+  output_handle->reset(EagerTensorFromHandle(handle.release()));
+
+  dtype_setter(handle_dtype);
+
+  return true;
+}
+
 // Adds input and type attr to the op, and to the list of flattened
 // inputs/attrs.
-bool AddInputToOp(PyObject* input, const tensorflow::OpDef::ArgDef* input_arg,
-                  std::vector<PyObject*>* flattened_attrs,
-                  std::vector<PyObject*>* flattened_inputs, TFE_Op* op,
-                  TF_Status* status) {
-  TFE_TensorHandle* input_handle = EagerTensor_Handle(input);
-  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
+                  const bool add_type_attr,
+                  const tensorflow::OpDef::ArgDef& input_arg,
+                  std::vector<tensorflow::Safe_PyObjectPtr>* flattened_attrs,
+                  std::vector<tensorflow::Safe_PyObjectPtr>* flattened_inputs,
+                  TFE_Op* op, TF_Status* status) {
+  // py_eager_tensor's ownership is transferred to flattened_inputs if it is
+  // required, else the object is destroyed and DECREF'd when the object goes
+  // out of scope in this function.
+  tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
+
+  if (!ConvertToTensor(
+          *op_exec_info, input, &py_eager_tensor,
+          [&]() {
+            if (input_arg.type() != tensorflow::DataType::DT_INVALID) {
+              return GetPythonObjectFromInt(input_arg.type());
+            }
+            return MaybeGetDTypeForAttr(input_arg.type_attr(), op_exec_info);
+          },
+          [&](const TF_DataType dtype) {
+            op_exec_info->cached_dtypes[input_arg.type_attr()] =
+                static_cast<tensorflow::DataType>(dtype);
+          },
+          status)) {
+    return false;
+  }
+
+  TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
+
+  if (add_type_attr && !input_arg.type_attr().empty()) {
     auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    TFE_OpSetAttrType(op, input_arg.type_attr().data(), dtype);
     if (flattened_attrs != nullptr) {
-      flattened_attrs->push_back(
-          GetPythonObjectFromString(input_arg->type_attr().data()));
-      flattened_attrs->push_back(PyLong_FromLong(dtype));
+      flattened_attrs->emplace_back(
+          GetPythonObjectFromString(input_arg.type_attr().data()));
+      flattened_attrs->emplace_back(PyLong_FromLong(dtype));
     }
   }
 
   if (flattened_inputs != nullptr) {
-    flattened_inputs->push_back(input);
+    flattened_inputs->emplace_back(std::move(py_eager_tensor));
   }
+
   TFE_OpAddInput(op, input_handle, status);
   if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
     return false;
   }
+
   return true;
 }
 
@@ -1418,11 +1967,11 @@ const char* GetDeviceName(PyObject* py_device_name) {
   return nullptr;
 }
 
-bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
-  if (!PyList_Check(list)) {
+bool RaiseIfNotPySequence(PyObject* seq, const string& attr_name) {
+  if (!PySequence_Check(seq)) {
     PyErr_SetString(PyExc_TypeError,
-                    Printf("expected a list for attr %s, got %s instead",
-                           attr_name.data(), list->ob_type->tp_name)
+                    Printf("expected a sequence for attr %s, got %s instead",
+                           attr_name.data(), seq->ob_type->tp_name)
                         .data());
 
     return false;
@@ -1430,67 +1979,53 @@ bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
   return true;
 }
 
-bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
-                  const tensorflow::OpDef* op_def, PyObject* args,
-                  const std::vector<PyObject*>& flattened_inputs,
-                  const std::vector<PyObject*>& flattened_attrs,
-                  PyObject* flattened_result, PyObject* op_name, PyObject* name,
-                  PyObject* record_gradient_callback, PyObject* callbacks) {
-  PyObject* inputs = PyTuple_New(flattened_inputs.size());
+bool RunCallbacks(
+    const FastPathOpExecInfo& op_exec_info, PyObject* args,
+    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_inputs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_attrs,
+    PyObject* flattened_result) {
+  if (!op_exec_info.run_callbacks) return true;
+
+  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs.size()));
   for (int i = 0; i < flattened_inputs.size(); i++) {
-    PyObject* input = flattened_inputs[i];
+    PyObject* input = flattened_inputs[i].get();
     Py_INCREF(input);
-    PyTuple_SET_ITEM(inputs, i, input);
+    PyTuple_SET_ITEM(inputs.get(), i, input);
   }
 
   int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
-                               op_def->input_arg_size() -
+                               op_exec_info.op_def->input_arg_size() -
                                kFastPathExecuteInputStartIndex;
   int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
-  PyObject* attrs = PyTuple_New(num_attrs);
+  tensorflow::Safe_PyObjectPtr attrs(PyTuple_New(num_attrs));
 
   for (int i = 0; i < num_non_inferred_attrs; i++) {
-    auto* attr = PyTuple_GET_ITEM(
-        args, kFastPathExecuteInputStartIndex + op_def->input_arg_size() + i);
+    auto* attr =
+        PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex +
+                                   op_exec_info.op_def->input_arg_size() + i);
     Py_INCREF(attr);
-    PyTuple_SET_ITEM(attrs, i, attr);
+    PyTuple_SET_ITEM(attrs.get(), i, attr);
   }
   for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
-    // Not INCREFing anything in flattened_attrs as each of those is a new
-    // reference, so allow the attrs tuple to steal the reference.
-    PyTuple_SET_ITEM(attrs, i, flattened_attrs.at(i - num_non_inferred_attrs));
+    PyObject* attr_or_name =
+        flattened_attrs.at(i - num_non_inferred_attrs).get();
+    Py_INCREF(attr_or_name);
+    PyTuple_SET_ITEM(attrs.get(), i, attr_or_name);
   }
 
-  PyObject* callback_args =
-      Py_BuildValue("OOOOO", op_name, inputs, attrs, flattened_result, name);
-
-  auto cleaner = tensorflow::gtl::MakeCleanup([inputs, attrs, callback_args] {
-    Py_DECREF(inputs);
-    Py_DECREF(attrs);
-    Py_DECREF(callback_args);
-  });
-
-  if (run_gradient_callback) {
-    if (!PyCallable_Check(record_gradient_callback)) {
-      PyErr_SetString(PyExc_TypeError,
-                      Printf("expected a function for "
-                             "record_gradient_callback, got %s instead",
-                             record_gradient_callback->ob_type->tp_name)
-                          .c_str());
+  if (op_exec_info.run_gradient_callback) {
+    if (!RecordGradient(op_exec_info.op_name, inputs.get(), attrs.get(),
+                        flattened_result, op_exec_info.name)) {
       return false;
     }
-
-    PyObject* callback_result =
-        PyObject_CallObject(record_gradient_callback, callback_args);
-    if (!callback_result) {
-      return false;
-    }
-    Py_DECREF(callback_result);
   }
 
-  if (run_post_exec_callbacks) {
-    for (Py_ssize_t i = 0; i < PyList_Size(callbacks); i++) {
-      PyObject* callback_fn = PyList_GET_ITEM(callbacks, i);
+  if (op_exec_info.run_post_exec_callbacks) {
+    tensorflow::Safe_PyObjectPtr callback_args(
+        Py_BuildValue("OOOOO", op_exec_info.op_name, inputs.get(), attrs.get(),
+                      flattened_result, op_exec_info.name));
+    for (Py_ssize_t i = 0; i < PyList_Size(op_exec_info.callbacks); i++) {
+      PyObject* callback_fn = PyList_GET_ITEM(op_exec_info.callbacks, i);
       if (!PyCallable_Check(callback_fn)) {
         PyErr_SetString(
             PyExc_TypeError,
@@ -1501,7 +2036,7 @@ bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
         return false;
       }
       PyObject* callback_result =
-          PyObject_CallObject(callback_fn, callback_args);
+          PyObject_CallObject(callback_fn, callback_args.get());
       if (!callback_result) {
         return false;
       }
@@ -1525,15 +2060,40 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  TFE_Context* ctx = reinterpret_cast<TFE_Context*>(
+  FastPathOpExecInfo op_exec_info;
+
+  op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
-  const char* device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
-  PyObject* op_name = PyTuple_GET_ITEM(args, 2);
-  const tensorflow::OpDef* op_def = GetOpDef(op_name);
-  if (op_def == nullptr) return nullptr;
-  PyObject* record_gradient_callback = PyTuple_GET_ITEM(args, 3);
-  PyObject* name = PyTuple_GET_ITEM(args, 4);
-  PyObject* callbacks = PyTuple_GET_ITEM(args, 5);
+  op_exec_info.args = args;
+
+  if (op_exec_info.ctx == nullptr) {
+    // The context hasn't been initialized. It will be in the slow path.
+    RaiseFallbackException(
+        "This function does not handle the case of the path where "
+        "all inputs are not already EagerTensors.");
+    return nullptr;
+  }
+
+  op_exec_info.device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
+  op_exec_info.op_name = PyTuple_GET_ITEM(args, 2);
+  op_exec_info.op_def = GetOpDef(op_exec_info.op_name);
+  if (op_exec_info.op_def == nullptr) return nullptr;
+  op_exec_info.name = PyTuple_GET_ITEM(args, 3);
+  op_exec_info.callbacks = PyTuple_GET_ITEM(args, 4);
+
+  const tensorflow::OpDef* op_def = op_exec_info.op_def;
+
+  // TODO(nareshmodi): Add a benchmark for the fast-path with gradient callbacks
+  // (similar to benchmark_tf_gradient_function_*). Also consider using an
+  // InlinedVector for flattened_attrs and flattened_inputs if the benchmarks
+  // point out problems with heap allocs.
+  op_exec_info.run_gradient_callback =
+      !*ThreadTapeIsStopped() && !GetTapeSet()->empty();
+  op_exec_info.run_post_exec_callbacks =
+      op_exec_info.callbacks != Py_None &&
+      PyList_Size(op_exec_info.callbacks) > 0;
+  op_exec_info.run_callbacks = op_exec_info.run_gradient_callback ||
+                               op_exec_info.run_post_exec_callbacks;
 
   if (args_size < kFastPathExecuteInputStartIndex + op_def->input_arg_size()) {
     PyErr_SetString(
@@ -1546,15 +2106,17 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  if (!CheckEagerTensors(args, kFastPathExecuteInputStartIndex, *op_def)) {
+  if (!CheckInputsOk(args, kFastPathExecuteInputStartIndex, *op_def)) {
     RaiseFallbackException(
         "This function does not handle the case of the path where "
         "all inputs are not already EagerTensors.");
     return nullptr;
   }
 
+  op_exec_info.attr_to_inputs_map = GetAttrToInputsMap(*op_def);
+
   TF_Status* status = TF_NewStatus();
-  TFE_Op* op = TFE_NewOp(ctx, op_def->name().c_str(), status);
+  TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
     TF_DeleteStatus(status);
     TFE_DeleteOp(op);
@@ -1581,10 +2143,13 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     // OpRegistrationData.
     for (const auto& attr : op_def->attr()) {
       if (attr_name == attr.name()) {
-        SetOpAttrWithDefaults(ctx, op, attr, attr_name.data(), py_attr_value,
-                              &attr_list_sizes, status);
+        SetOpAttrWithDefaults(op_exec_info.ctx, op, attr, attr_name.data(),
+                              py_attr_value, &attr_list_sizes, status);
 
         if (TF_GetCode(status) != TF_OK) {
+          VLOG(1) << "Falling back to slow path for Op \"" << op_def->name()
+                  << "\" since we are unable to set the value for attr \""
+                  << attr.name() << "\" due to: " << TF_Message(status);
           RaiseFallbackException(TF_Message(status));
           return nullptr;
         }
@@ -1594,34 +2159,28 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     }
   }
 
-  TFE_OpSetDevice(op, device_name, status);
+  TFE_OpSetDevice(op, op_exec_info.device_name, status);
   if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
     return nullptr;
   }
 
-  // TODO(nareshmodi): Add a benchmark for the fast-path with gradient callbacks
-  // (similar to benchmark_tf_gradient_function_*). Also consider using an
-  // InlinedVector for flattened_attrs and flattened_inputs if the benchmarks
-  // point out problems with heap allocs.
-  bool run_gradient_callback = !*ThreadTapeIsStopped() &&
-                               !GetTapeSet()->empty() &&
-                               record_gradient_callback != Py_None;
-  bool run_post_exec_callbacks =
-      callbacks != Py_None && PyList_Size(callbacks) > 0;
-  bool run_callbacks = run_gradient_callback || run_post_exec_callbacks;
   // Flat attrs and inputs as required by the record_gradient call. The attrs
   // here only contain inferred attrs (non-inferred attrs are added directly
   // from the input args).
-  // All items in flattened_attrs contain new references.
-  // All items in flattened_inputs contain borrowed references.
+  // All items in flattened_attrs and flattened_inputs contain
+  // Safe_PyObjectPtr - any time something steals a reference to this, it must
+  // INCREF.
   // TODO(nareshmodi): figure out why PyList_New/PyList_Append don't work
   // directly.
-  std::unique_ptr<std::vector<PyObject*>> flattened_attrs = nullptr;
-  std::unique_ptr<std::vector<PyObject*>> flattened_inputs = nullptr;
+  std::unique_ptr<std::vector<tensorflow::Safe_PyObjectPtr>> flattened_attrs =
+      nullptr;
+  std::unique_ptr<std::vector<tensorflow::Safe_PyObjectPtr>> flattened_inputs =
+      nullptr;
 
-  if (run_callbacks) {
-    flattened_attrs.reset(new std::vector<PyObject*>);
-    flattened_inputs.reset(new std::vector<PyObject*>);
+  // TODO(nareshmodi): Encapsulate callbacks information into a struct.
+  if (op_exec_info.run_callbacks) {
+    flattened_attrs.reset(new std::vector<tensorflow::Safe_PyObjectPtr>);
+    flattened_inputs.reset(new std::vector<tensorflow::Safe_PyObjectPtr>);
   }
 
   // Add inferred attrs and inputs.
@@ -1637,29 +2196,29 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
     if (!input_arg.number_attr().empty()) {
       // The item is a homogeneous list.
-      if (!RaiseIfNotPyList(input, input_arg.number_attr())) return nullptr;
-      Py_ssize_t len = PyList_Size(input);
+      if (!RaiseIfNotPySequence(input, input_arg.number_attr())) return nullptr;
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
 
       TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
-      if (run_callbacks) {
-        flattened_attrs->push_back(
+      if (op_exec_info.run_callbacks) {
+        flattened_attrs->emplace_back(
             GetPythonObjectFromString(input_arg.number_attr().data()));
-        flattened_attrs->push_back(PyLong_FromLong(len));
+        flattened_attrs->emplace_back(PyLong_FromLong(len));
       }
       attr_list_sizes[input_arg.number_attr()] = len;
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(PyList_GET_ITEM(input, 0), &input_arg,
-                          flattened_attrs.get(), flattened_inputs.get(), op,
-                          status)) {
+        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          true, input_arg, flattened_attrs.get(),
+                          flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(PyList_GET_ITEM(input, j), nullptr /* input_arg */,
-                            nullptr /* flattened_attrs */,
+          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+                            false, input_arg, nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
           }
@@ -1667,17 +2226,28 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       }
     } else if (!input_arg.type_list_attr().empty()) {
       // The item is a heterogeneous list.
-      if (!RaiseIfNotPyList(input, input_arg.type_list_attr())) return nullptr;
+      if (!RaiseIfNotPySequence(input, input_arg.type_list_attr())) {
+        return nullptr;
+      }
       const string& attr_name = input_arg.type_list_attr();
-      Py_ssize_t len = PyList_Size(input);
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
       tensorflow::gtl::InlinedVector<TF_DataType, 4> attr_value(len);
       PyObject* py_attr_value = nullptr;
-      if (run_callbacks) {
+      if (op_exec_info.run_callbacks) {
         py_attr_value = PyTuple_New(len);
       }
       for (Py_ssize_t j = 0; j < len; j++) {
-        PyObject* py_input = PyList_GET_ITEM(input, j);
-        TFE_TensorHandle* input_handle = EagerTensor_Handle(py_input);
+        PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
+        tensorflow::Safe_PyObjectPtr py_eager_tensor;
+        if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
+                             []() { Py_RETURN_NONE; },
+                             [](const TF_DataType& dtype) {}, status)) {
+          return nullptr;
+        }
+
+        TFE_TensorHandle* input_handle =
+            EagerTensor_Handle(py_eager_tensor.get());
+
         attr_value[j] = TFE_TensorHandleDataType(input_handle);
 
         TFE_OpAddInput(op, input_handle, status);
@@ -1685,23 +2255,25 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
           return nullptr;
         }
 
-        if (run_callbacks) {
-          flattened_inputs->push_back(py_input);
+        if (op_exec_info.run_callbacks) {
+          flattened_inputs->emplace_back(std::move(py_eager_tensor));
 
           PyTuple_SET_ITEM(py_attr_value, j, PyLong_FromLong(attr_value[j]));
         }
       }
-      if (run_callbacks) {
-        flattened_attrs->push_back(GetPythonObjectFromString(attr_name.data()));
-        flattened_attrs->push_back(py_attr_value);
+      if (op_exec_info.run_callbacks) {
+        flattened_attrs->emplace_back(
+            GetPythonObjectFromString(attr_name.data()));
+        flattened_attrs->emplace_back(py_attr_value);
       }
       TFE_OpSetAttrTypeList(op, attr_name.data(), attr_value.data(),
                             attr_value.size());
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
-      if (!AddInputToOp(input, &input_arg, flattened_attrs.get(),
-                        flattened_inputs.get(), op, status)) {
+      if (!AddInputToOp(&op_exec_info, input, true, input_arg,
+                        flattened_attrs.get(), flattened_inputs.get(), op,
+                        status)) {
         return nullptr;
       }
     }
@@ -1724,27 +2296,27 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
   Py_BEGIN_ALLOW_THREADS;
   TFE_Execute(op, retvals.data(), &num_retvals, status);
   Py_END_ALLOW_THREADS;
+
   if (TF_GetCode(status) != TF_OK) {
     // Augment the status with the op_name for easier debugging similar to
     // TFE_Py_Execute.
     TF_SetStatus(status, TF_GetCode(status),
-                 tensorflow::strings::StrCat(TF_Message(status), " [Op:",
-                                             TFE_GetPythonString(op_name), "]")
+                 tensorflow::strings::StrCat(
+                     TF_Message(status),
+                     " [Op:", TFE_GetPythonString(op_exec_info.op_name), "]")
                      .c_str());
 
     MaybeRaiseExceptionFromTFStatus(status, nullptr);
     return nullptr;
   }
 
-  PyObject* flat_result = PyList_New(num_retvals);
+  tensorflow::Safe_PyObjectPtr flat_result(PyList_New(num_retvals));
   for (int i = 0; i < num_retvals; ++i) {
-    PyList_SET_ITEM(flat_result, i, EagerTensorFromHandle(retvals[i]));
+    PyList_SET_ITEM(flat_result.get(), i, EagerTensorFromHandle(retvals[i]));
   }
 
-  if (run_callbacks &&
-      !RunCallbacks(run_gradient_callback, run_post_exec_callbacks, op_def,
-                    args, *flattened_inputs, *flattened_attrs, flat_result,
-                    op_name, name, record_gradient_callback, callbacks)) {
+  if (!RunCallbacks(op_exec_info, args, *flattened_inputs, *flattened_attrs,
+                    flat_result.get())) {
     return nullptr;
   }
 
@@ -1756,11 +2328,10 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
   if (op_def->output_arg_size() == 1) {
     if (!op_def->output_arg(0).number_attr().empty() ||
         !op_def->output_arg(0).type_list_attr().empty()) {
-      return flat_result;
+      return flat_result.release();
     } else {
-      auto* result = PyList_GET_ITEM(flat_result, 0);
+      auto* result = PyList_GET_ITEM(flat_result.get(), 0);
       Py_INCREF(result);
-      Py_DECREF(flat_result);
       return result;
     }
   }
@@ -1773,7 +2344,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       int list_length = attr_list_sizes[op_def->output_arg(i).number_attr()];
       PyObject* inner_list = PyList_New(list_length);
       for (int j = 0; j < list_length; j++) {
-        PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+        PyObject* obj = PyList_GET_ITEM(flat_result.get(), flat_result_index++);
         Py_INCREF(obj);
         PyList_SET_ITEM(inner_list, j, obj);
       }
@@ -1782,17 +2353,26 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       int list_length = attr_list_sizes[op_def->output_arg(i).type_list_attr()];
       PyObject* inner_list = PyList_New(list_length);
       for (int j = 0; j < list_length; j++) {
-        PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+        PyObject* obj = PyList_GET_ITEM(flat_result.get(), flat_result_index++);
         Py_INCREF(obj);
         PyList_SET_ITEM(inner_list, j, obj);
       }
       PyList_SET_ITEM(result, i, inner_list);
     } else {
-      PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+      PyObject* obj = PyList_GET_ITEM(flat_result.get(), flat_result_index++);
       Py_INCREF(obj);
       PyList_SET_ITEM(result, i, obj);
     }
   }
-  Py_DECREF(flat_result);
   return result;
 }
+
+PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
+                                PyObject* attrs, PyObject* results,
+                                PyObject* name) {
+  if (*ThreadTapeIsStopped() || GetTapeSet()->empty()) {
+    Py_RETURN_NONE;
+  }
+
+  return RecordGradient(op_name, inputs, attrs, results, name);
+}
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 49323e6640e664ef5f98b227964f9dd4e248ca39..faaae40b3f1ef02984a7a75c23ae4acae65ac335 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import execute
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 class Tests(test.TestCase):
@@ -46,15 +46,28 @@ class Tests(test.TestCase):
     self.assertAllClose(
         math_ops.matmul(a_2_by_2, b_2_by_2),
         pywrap_tensorflow.TFE_Py_FastPathExecute(
-            ctx._handle, ctx.device_name, "MatMul", execute.record_gradient,
-            None, None, a_2_by_2, b_2_by_2, "transpose_a", False, "transpose_b",
-            False))
+            ctx._handle, ctx.device_name, "MatMul", None, None, a_2_by_2,
+            b_2_by_2, "transpose_a", False, "transpose_b", False))
     self.assertAllClose(
         math_ops.matmul(a_100_by_784, b_100_by_784, transpose_b=True),
         pywrap_tensorflow.TFE_Py_FastPathExecute(
-            ctx._handle, ctx.device_name, "MatMul", execute.record_gradient,
-            None, None, a_100_by_784, b_100_by_784, "transpose_a", False,
-            "transpose_b", True))
+            ctx._handle, ctx.device_name, "MatMul", None, None, a_100_by_784,
+            b_100_by_784, "transpose_a", False, "transpose_b", True))
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_ResourceVariableMatMulCorrectResponse(self):
+    ctx = context.context()
+    a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
+    m = resource_variable_ops.ResourceVariable(a_2_by_2)
+    x = pywrap_tensorflow.TFE_Py_FastPathExecute(
+        ctx._handle, ctx.device_name, "MatMul", None, None, m, m, "transpose_a",
+        False, "transpose_b", False)
+    y = pywrap_tensorflow.TFE_Py_FastPathExecute(
+        ctx._handle, ctx.device_name, "MatMul", None, None, a_2_by_2, a_2_by_2,
+        "transpose_a", False, "transpose_b", False)
+
+    self.assertAllEqual(x, y)
 
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
@@ -64,12 +77,27 @@ class Tests(test.TestCase):
       a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
       tape.watch(a_2_by_2)
       z = pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx._handle, ctx.device_name, "MatMul", execute.record_gradient, None,
-          None, a_2_by_2, a_2_by_2, "transpose_a", False, "transpose_b", False)
+          ctx._handle, ctx.device_name, "MatMul", None, None, a_2_by_2,
+          a_2_by_2, "transpose_a", False, "transpose_b", False)
     dz_dy = tape.gradient(z, [a_2_by_2])[0]
     self.assertAllEqual(dz_dy.numpy(),
                         constant_op.constant(4.0, shape=[2, 2]).numpy())
 
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_ResourceVariableTapeWrite(self):
+    ctx = context.context()
+    with backprop.GradientTape(persistent=True) as tape:
+      a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
+      m = resource_variable_ops.ResourceVariable(a_2_by_2)
+      tape.watch(m)
+      z = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "MatMul", None, None, m, m,
+          "transpose_a", False, "transpose_b", False)
+    dz_dy = tape.gradient(z, [m])[0]
+    self.assertAllEqual(dz_dy.numpy(),
+                        constant_op.constant(4.0, shape=[2, 2]).numpy())
+
   # Tests homogeneous list op
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
@@ -80,9 +108,9 @@ class Tests(test.TestCase):
 
     self.assertAllClose(
         math_ops.add_n([a_2_by_2, b_2_by_2]),
-        pywrap_tensorflow.TFE_Py_FastPathExecute(
-            ctx._handle, ctx.device_name, "AddN", execute.record_gradient, None,
-            None, [a_2_by_2, b_2_by_2]))
+        pywrap_tensorflow.TFE_Py_FastPathExecute(ctx._handle, ctx.device_name,
+                                                 "AddN", None, None,
+                                                 [a_2_by_2, b_2_by_2]))
 
   # Tests homogeneous list op
   @test_util.assert_no_new_tensors
@@ -96,8 +124,8 @@ class Tests(test.TestCase):
       tape.watch(a_2_by_2)
       tape.watch(b_2_by_2)
       z1 = pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx._handle, ctx.device_name, "AddN", execute.record_gradient, None,
-          None, [a_2_by_2, b_2_by_2])
+          ctx._handle, ctx.device_name, "AddN", None, None,
+          [a_2_by_2, b_2_by_2])
       z2 = math_ops.add_n([a_2_by_2, b_2_by_2])
     dz1_dy = tape.gradient(z1, [a_2_by_2])[0]
     dz2_dy = tape.gradient(z2, [a_2_by_2])[0]
@@ -113,9 +141,9 @@ class Tests(test.TestCase):
 
     self.assertAllClose(
         array_ops.identity_n([a_2_by_2, b_2_by_2]),
-        pywrap_tensorflow.TFE_Py_FastPathExecute(
-            ctx._handle, ctx.device_name, "IdentityN", execute.record_gradient,
-            None, None, [a_2_by_2, b_2_by_2]))
+        pywrap_tensorflow.TFE_Py_FastPathExecute(ctx._handle, ctx.device_name,
+                                                 "IdentityN", None, None,
+                                                 [a_2_by_2, b_2_by_2]))
 
   # Tests heterogeneous list op
   @test_util.assert_no_new_tensors
@@ -129,8 +157,8 @@ class Tests(test.TestCase):
       tape.watch(a_2_by_2)
       tape.watch(b_2_by_2)
       z1 = pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx._handle, ctx.device_name, "IdentityN", execute.record_gradient,
-          None, None, [a_2_by_2, b_2_by_2])
+          ctx._handle, ctx.device_name, "IdentityN", None, None,
+          [a_2_by_2, b_2_by_2])
       z2 = array_ops.identity_n([a_2_by_2, b_2_by_2])
     dz1_dy = tape.gradient(z1[0], [a_2_by_2])[0]
     dz2_dy = tape.gradient(z2[0], [a_2_by_2])[0]
@@ -141,28 +169,26 @@ class Tests(test.TestCase):
   def testFastpathExecute_InvalidInputs(self):
     a_2_by_2 = random_ops.random_uniform((2, 2))
     ctx = context.context()
-    assert not ctx.in_graph_mode(
+    assert ctx.executing_eagerly(
     ), "The prototype doesn't contain C code for graph construction"
     ctx_handle = ctx._handle  # pylint: disable=protected-access
 
     # Not enough base params
     with self.assertRaisesRegexp(ValueError,
-                                 "at least 6 items in the input tuple"):
+                                 "at least 5 items in the input tuple"):
       pywrap_tensorflow.TFE_Py_FastPathExecute(ctx_handle, ctx.device_name,
                                                "Identity")
 
     # Not enough inputs
     with self.assertRaisesRegexp(ValueError,
-                                 "Expected to be at least 7, was 6"):
-      pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx_handle, ctx_handle, "Identity", backprop._record_gradient, None,
-          [])
+                                 "Expected to be at least 6, was 5"):
+      pywrap_tensorflow.TFE_Py_FastPathExecute(ctx_handle, ctx_handle,
+                                               "Identity", None, [])
 
     # Bad type
     with self.assertRaisesRegexp(TypeError, "expected a string for op_name"):
-      pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx_handle, ctx.device_name, ctx_handle, backprop._record_gradient,
-          None, [], a_2_by_2)
+      pywrap_tensorflow.TFE_Py_FastPathExecute(ctx_handle, ctx.device_name,
+                                               ctx_handle, None, [], a_2_by_2)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/eager/tape_test.py b/tensorflow/python/eager/tape_test.py
index b490bac66db03b0a61a8852f45f1f558cccaf121..4326d5efa3d362e883815eb2d3dafb27df25afd4 100644
--- a/tensorflow/python/eager/tape_test.py
+++ b/tensorflow/python/eager/tape_test.py
@@ -21,11 +21,11 @@ from __future__ import print_function
 
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import custom_gradient
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 # Importing nn_grad for the registration functions.
@@ -165,21 +165,6 @@ class TapeTest(test.TestCase):
     g, = backprop.gradients_function(fn, [0])(t)
     self.assertAllEqual(g, 1.0)
 
-  def testCustomGradientGraphMode(self):
-    with context.graph_mode(), self.test_session():
-
-      @custom_gradient.custom_gradient
-      def f(x):
-
-        def grad(dresult):
-          return dresult * 10.0
-
-        return x, grad
-
-      inp = constant_op.constant(1.0)
-      grad = gradients_impl.gradients(f(inp), inp)
-      self.assertAllEqual(grad[0].eval(), 10.0)
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0bd5a5dbafd5ea8da21d4fb8a7dcae9fe23dd3d2..b044b30231603b0265aa1ef0320e9f1cfb303724 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -278,14 +278,9 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
     with self.assertRaisesRegexp(
         TypeError,
-        r"tensor_list argument must be a list. Got \"EagerTensor\""):
+        r"tensors argument must be a list or a tuple. Got \"EagerTensor\""):
       pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2)
 
-    with self.assertRaisesRegexp(
-        TypeError,
-        r"tensor_list argument must be a list. Got \"tuple\""):
-      pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2)
-
   def testNegativeSliceDim(self):
     t1 = _create_tensor([1, 2], dtype=dtypes.int32)
 
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index c519fd557a9319d6ef5522b26198e5b4202917fc..c6bb9b9be7cb8049acb2d6c3fe0f50a720e71f2e 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -8,24 +8,13 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "estimator_py",
     srcs = ["estimator_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":baseline",
+        ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
         ":estimator",
@@ -37,7 +26,6 @@ py_library(
         ":parsing_utils",
         ":run_config",
         ":training",
-        ":warm_starting_util",
         "//tensorflow/python:util",
     ],
 )
@@ -217,6 +205,7 @@ py_test(
         "no_pip",
         "noasan",  # test flakily times out in asan mode.
         "notsan",  # b/67510291
+        "optonly",  # flakily times out in fastbuild
     ],
     deps = [
         ":baseline",
@@ -250,6 +239,54 @@ py_test(
     ],
 )
 
+py_library(
+    name = "boosted_trees",
+    srcs = ["canned/boosted_trees.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":head",
+        ":model_fn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "boosted_trees_test",
+    size = "medium",
+    srcs = ["canned/boosted_trees_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["canned/dnn.py"],
@@ -264,7 +301,6 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:summary",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
@@ -278,12 +314,12 @@ py_library(
     srcs = ["canned/dnn_testing_utils.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator",
         ":head",
         ":metric_keys",
         ":model_fn",
         ":numpy_io",
         ":prediction_keys",
-        ":warm_starting_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -291,6 +327,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -347,6 +384,7 @@ py_library(
         ":model_fn",
         ":optimizers",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
@@ -427,10 +465,10 @@ py_library(
         ":model_fn",
         ":run_config",
         ":util",
-        ":warm_starting_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:metrics",
         "//tensorflow/python:platform",
@@ -617,6 +655,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python:weights_broadcast_ops",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
@@ -707,6 +746,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -868,39 +908,3 @@ py_test(
         "//tensorflow/python:training",
     ],
 )
-
-py_library(
-    name = "warm_starting_util",
-    srcs = ["warm_starting_util.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
-    ],
-)
-
-py_test(
-    name = "warm_starting_util_test",
-    size = "small",
-    srcs = ["warm_starting_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":warm_starting_util",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/python/estimator/canned/baseline_test.py b/tensorflow/python/estimator/canned/baseline_test.py
index 18c955f5a0e998de983b31fc4cc595895e6bbcbd..7bf2e62da9c4598c28ad38825aac2031c9d51905 100644
--- a/tensorflow/python/estimator/canned/baseline_test.py
+++ b/tensorflow/python/estimator/canned/baseline_test.py
@@ -42,13 +42,13 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
@@ -482,7 +482,7 @@ class BaselineRegressorTrainingTest(test.TestCase):
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
@@ -490,7 +490,7 @@ class BaselineRegressorTrainingTest(test.TestCase):
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
@@ -685,13 +685,13 @@ class BaselineClassifierTrainingTest(test.TestCase):
       # Verify loss. We can't check the value directly, so we add an assert op.
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
           loss,
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
 
     mock_optimizer = test.mock.NonCallableMock(
         spec=optimizer.Optimizer,
@@ -1071,11 +1071,13 @@ class BaselineClassifierEvaluationTest(test.TestCase):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: 1.3133,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
           metric_keys.MetricKeys.LABEL_MEAN: 1.,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
           metric_keys.MetricKeys.AUC: 0.,
-          metric_keys.MetricKeys.AUC_PR: 0.5,
+          metric_keys.MetricKeys.AUC_PR: 1.,
       }
     else:
       # Multi classes: loss = 1 * -log ( softmax(logits)[label] )
@@ -1132,11 +1134,13 @@ class BaselineClassifierEvaluationTest(test.TestCase):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
           metric_keys.MetricKeys.ACCURACY: 0.5,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
           metric_keys.MetricKeys.LABEL_MEAN: 0.5,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
           metric_keys.MetricKeys.AUC: 0.5,
-          metric_keys.MetricKeys.AUC_PR: 0.25,
+          metric_keys.MetricKeys.AUC_PR: 0.75,
       }
     else:
       # Expand logits since batch_size=2
@@ -1207,12 +1211,14 @@ class BaselineClassifierEvaluationTest(test.TestCase):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
           metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
           metric_keys.MetricKeys.LABEL_MEAN: label_mean,
           metric_keys.MetricKeys.ACCURACY_BASELINE: (
               max(label_mean, 1-label_mean)),
           metric_keys.MetricKeys.AUC: 0.5,
-          metric_keys.MetricKeys.AUC_PR: 0.16666645,
+          metric_keys.MetricKeys.AUC_PR: 2. / (1. + 2.),
       }
     else:
       # Multi classes: unweighted_loss = 1 * -log ( soft_max(logits)[label] )
@@ -1542,4 +1548,3 @@ class BaselineLogitFnTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..085dace1b3eb1b75b1b2d688e7cdffec10c2a878
--- /dev/null
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -0,0 +1,803 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimator classes for BoostedTrees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.summary import summary
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
+
+# TODO(nponomareva): Reveal pruning params here.
+_TreeHParams = collections.namedtuple('TreeHParams', [
+    'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
+    'min_node_weight'
+])
+
+_HOLD_FOR_MULTI_CLASS_SUPPORT = object()
+_HOLD_FOR_MULTI_DIM_SUPPORT = object()
+
+
+def _get_max_buckets(feature_columns):
+  """Gets the maximum number of buckets from feature_columns.
+
+  Args:
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    max_buckets: the maximum number of buckets among bucketized_columns.
+
+  Raises:
+    ValueError: when unsupported feature_columns are given.
+  """
+  if not feature_columns:
+    raise ValueError('feature_columns must be a non-empty list/set of '
+                     'tf.feature_column.')
+  max_buckets = 1
+  for fc in feature_columns:
+    if isinstance(fc, feature_column_lib._BucketizedColumn):  # pylint:disable=protected-access
+      # N boundaries creates (N+1) buckets.
+      max_buckets = max(max_buckets, len(fc.boundaries) + 1)
+    else:
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  return max_buckets
+
+
+def _get_transformed_features(features, feature_columns):
+  """Gets the transformed features from features/feature_columns pair.
+
+  Args:
+    features: a dicionary of name to Tensor.
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    result_features: a list of the transformed features, sorted by the name.
+
+  Raises:
+    ValueError: when unsupported features/columns are tried.
+  """
+  # pylint:disable=protected-access
+  for fc in feature_columns:
+    if not isinstance(fc, feature_column_lib._BucketizedColumn):
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  transformed_features = feature_column_lib._transform_features(
+      features, feature_columns)
+  # pylint:enable=protected-access
+  result_features = []
+  for column in sorted(transformed_features, key=lambda tc: tc.name):
+    source_name = column.source_column.name
+    squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
+    if len(squeezed_tensor.shape) > 1:
+      raise ValueError('For now, only supports features equivalent to rank 1 '
+                       'but column `{}` got: {}'.format(
+                           source_name, features[source_name].shape))
+    result_features.append(squeezed_tensor)
+  return result_features
+
+
+def _local_variable(tensor, name=None):
+  """Stores a tensor as a local Variable for faster read."""
+  return variable_scope.variable(
+      initial_value=tensor,
+      trainable=False,
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      validate_shape=False,
+      name=name)
+
+
+def _cache_transformed_features(features, feature_columns, batch_size):
+  """Transform features and cache, then returns (cached_features, cache_op)."""
+  num_features = len(feature_columns)
+  cached_features = [
+      _local_variable(
+          array_ops.zeros([batch_size], dtype=dtypes.int32),
+          name='cached_feature_{}'.format(i))
+      for i in range(num_features)
+  ]
+  are_features_cached = _local_variable(False, name='are_features_cached')
+
+  def cache_features_and_return():
+    """Caches transoformed features.
+
+    The intention is to hide get_transformed_features() from the graph by
+    caching the result except the first step, since bucketize operation
+    (inside get_transformed_features) is expensive.
+
+    Returns:
+      input_feature_list: a list of input features.
+      cache_flip_op: op to add to graph to make sure cache update is included to
+          the graph.
+    """
+
+    transformed_features = _get_transformed_features(features, feature_columns)
+    cached = [
+        state_ops.assign(cached_features[i], transformed_features[i])
+        for i in range(num_features)
+    ]
+    # TODO(youngheek): Try other combination of dependencies so that the
+    # function returns a single result, not a tuple.
+    with ops.control_dependencies(cached):
+      cache_flip_op = are_features_cached.assign(True)
+    return cached, cache_flip_op
+
+  input_feature_list, cache_flip_op = control_flow_ops.cond(
+      are_features_cached,
+      lambda: (cached_features, control_flow_ops.no_op()),
+      cache_features_and_return)
+  return input_feature_list, cache_flip_op
+
+
+class _CacheTrainingStatesUsingHashTable(object):
+  """Caching logits, etc. using MutableHashTable."""
+
+  def __init__(self, example_ids, logits_dimension):
+    """Creates a cache with the given configuration.
+
+    It maintains a MutableDenseHashTable for all values.
+    The API lookup() and insert() would have those specs,
+      tree_ids: shape=[batch_size], dtype=int32
+      node_ids: shape=[batch_size], dtype=int32
+      logits: shape=[batch_size, logits_dimension], dtype=float32
+    However in the MutableDenseHashTable, ids are bitcasted into float32 and
+    all values are concatenated as a single tensor (of float32).
+
+    Hence conversion happens internally before inserting to the HashTable and
+    after lookup from it.
+
+    Args:
+      example_ids: a Rank 1 tensor to be used as a key of the cache.
+      logits_dimension: a constant (int) for the dimension of logits.
+
+    Raises:
+      ValueError: if example_ids is other than int64 or string.
+    """
+    if dtypes.as_dtype(dtypes.int64).is_compatible_with(example_ids.dtype):
+      empty_key = -1 << 62
+    elif dtypes.as_dtype(dtypes.string).is_compatible_with(example_ids.dtype):
+      empty_key = ''
+    else:
+      raise ValueError('Unsupported example_id_feature dtype %s.',
+                       example_ids.dtype)
+    # Cache holds latest <tree_id, node_id, logits> for each example.
+    # tree_id and node_id are both int32 but logits is a float32.
+    # To reduce the overhead, we store all of them together as float32 and
+    # bitcast the ids to int32.
+    self._table_ref = lookup_ops.mutable_dense_hash_table_v2(
+        empty_key=empty_key, value_dtype=dtypes.float32, value_shape=[3])
+    self._example_ids = example_ids
+    self._logits_dimension = logits_dimension
+
+  def lookup(self):
+    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
+    cached_tree_ids, cached_node_ids, cached_logits = array_ops.split(
+        lookup_ops.lookup_table_find_v2(
+            self._table_ref, self._example_ids, default_value=[0.0, 0.0, 0.0]),
+        [1, 1, self._logits_dimension],
+        axis=1)
+    cached_tree_ids = array_ops.squeeze(
+        array_ops.bitcast(cached_tree_ids, dtypes.int32))
+    cached_node_ids = array_ops.squeeze(
+        array_ops.bitcast(cached_node_ids, dtypes.int32))
+    return (cached_tree_ids, cached_node_ids, cached_logits)
+
+  def insert(self, tree_ids, node_ids, logits):
+    """Inserts values and returns the op."""
+    insert_op = lookup_ops.lookup_table_insert_v2(
+        self._table_ref, self._example_ids,
+        array_ops.concat(
+            [
+                array_ops.expand_dims(
+                    array_ops.bitcast(tree_ids, dtypes.float32), 1),
+                array_ops.expand_dims(
+                    array_ops.bitcast(node_ids, dtypes.float32), 1),
+                logits,
+            ],
+            axis=1,
+            name='value_concat_for_cache_insert'))
+    return insert_op
+
+
+class _CacheTrainingStatesUsingVariables(object):
+  """Caching logits, etc. using Variables."""
+
+  def __init__(self, batch_size, logits_dimension):
+    """Creates a cache with the given configuration.
+
+    It maintains three variables, tree_ids, node_ids, logits, for caching.
+      tree_ids: shape=[batch_size], dtype=int32
+      node_ids: shape=[batch_size], dtype=int32
+      logits: shape=[batch_size, logits_dimension], dtype=float32
+
+    Note, this can be used only with in-memory data setting.
+
+    Args:
+      batch_size: `int`, the size of the cache.
+      logits_dimension: a constant (int) for the dimension of logits.
+    """
+    self._logits_dimension = logits_dimension
+    self._tree_ids = _local_variable(
+        array_ops.zeros([batch_size], dtype=dtypes.int32),
+        name='tree_ids_cache')
+    self._node_ids = _local_variable(
+        array_ops.zeros([batch_size], dtype=dtypes.int32),
+        name='node_ids_cache')
+    self._logits = _local_variable(
+        array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
+        name='logits_cache')
+
+  def lookup(self):
+    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
+    return (self._tree_ids, self._node_ids, self._logits)
+
+  def insert(self, tree_ids, node_ids, logits):
+    """Inserts values and returns the op."""
+    return control_flow_ops.group(
+        [
+            self._tree_ids.assign(tree_ids),
+            self._node_ids.assign(node_ids),
+            self._logits.assign(logits)
+        ],
+        name='cache_insert')
+
+
+class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of attempts."""
+
+  def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
+               max_trees, max_depth):
+    self._num_finalized_trees_tensor = num_finalized_trees_tensor
+    self._num_attempted_layers_tensor = num_attempted_layers_tensor
+    self._max_trees = max_trees
+    self._max_depth = max_depth
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(
+        [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
+
+  def after_run(self, run_context, run_values):
+    # num_* tensors should be retrieved by a separate session than the training
+    # one, in order to read the values after growing.
+    # So, if it's approaching to the limit, get the actual value by additional
+    # session.
+    num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees - 1 or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
+      num_finalized_trees, num_attempted_layers = run_context.session.run(
+          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
+    if (num_finalized_trees >= self._max_trees or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth):
+      run_context.request_stop()
+
+
+def _bt_model_fn(
+    features,
+    labels,
+    mode,
+    head,
+    feature_columns,
+    tree_hparams,
+    n_batches_per_layer,
+    config,
+    closed_form_grad_and_hess_fn=None,
+    example_id_column_name=None,
+    # TODO(youngheek): replace this later using other options.
+    train_in_memory=False,
+    name='boosted_trees'):
+  """Gradient Boosted Trees model_fn.
+
+  Args:
+    features: dict of `Tensor`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
+      dtype `int32` or `int64` in the range `[0, n_classes)`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
+    tree_hparams: TODO. collections.namedtuple for hyper parameters.
+    n_batches_per_layer: A `Tensor` of `int64`. Each layer is built after at
+      least n_batches_per_layer accumulations.
+    config: `RunConfig` object to configure the runtime settings.
+    closed_form_grad_and_hess_fn: a function that accepts logits and labels
+      and returns gradients and hessians. By default, they are created by
+      tf.gradients() from the loss.
+    example_id_column_name: Name of the feature for a unique ID per example.
+      Currently experimental -- not exposed to public API.
+    train_in_memory: `bool`, when true, it assumes the dataset is in memory,
+      i.e., input_fn should return the entire dataset as a single batch, and
+      also n_batches_per_layer should be set as 1.
+    name: Name to use for the model.
+
+  Returns:
+      An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: mode or params are invalid, or features has the wrong type.
+  """
+  is_single_machine = (config.num_worker_replicas <= 1)
+  if train_in_memory:
+    assert n_batches_per_layer == 1, (
+        'When train_in_memory is enabled, input_fn should return the entire '
+        'dataset as a single batch, and n_batches_per_layer should be set as '
+        '1.')
+    if (not config.is_chief or config.num_worker_replicas > 1 or
+        config.num_ps_replicas > 0):
+      raise ValueError('train_in_memory is supported only for '
+                       'non-distributed training.')
+  worker_device = control_flow_ops.no_op().device
+  # maximum number of splits possible in the whole tree =2^(D-1)-1
+  # TODO(youngheek): perhaps storage could be optimized by storing stats with
+  # the dimension max_splits_per_layer, instead of max_splits (for the entire
+  # tree).
+  max_splits = (1 << tree_hparams.max_depth) - 1
+  max_buckets = _get_max_buckets(feature_columns)
+  train_op = []
+  with ops.name_scope(name) as name:
+    # Prepare.
+    global_step = training_util.get_or_create_global_step()
+    num_features = len(feature_columns)
+    # Extract input features and set up cache for training.
+    training_state_cache = None
+    if mode == model_fn.ModeKeys.TRAIN and train_in_memory:
+      # cache transformed features as well for in-memory training.
+      batch_size = array_ops.shape(labels)[0]
+      input_feature_list, input_cache_op = _cache_transformed_features(
+          features, feature_columns, batch_size)
+      train_op.append(input_cache_op)
+      training_state_cache = _CacheTrainingStatesUsingVariables(
+          batch_size, head.logits_dimension)
+    else:
+      input_feature_list = _get_transformed_features(features, feature_columns)
+      if mode == model_fn.ModeKeys.TRAIN and example_id_column_name:
+        example_ids = features[example_id_column_name]
+        training_state_cache = _CacheTrainingStatesUsingHashTable(
+            example_ids, head.logits_dimension)
+
+    # Create Ensemble resources.
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+    # Create logits.
+    if mode != model_fn.ModeKeys.TRAIN:
+      logits = boosted_trees_ops.predict(
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
+          bucketized_features=input_feature_list,
+          logits_dimension=head.logits_dimension)
+    else:
+      if is_single_machine:
+        local_tree_ensemble = tree_ensemble
+        ensemble_reload = control_flow_ops.no_op()
+      else:
+        # Have a local copy of ensemble for the distributed setting.
+        with ops.device(worker_device):
+          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+              name=name + '_local', is_local=True)
+        # TODO(soroush): Do partial updates if this becomes a bottleneck.
+        ensemble_reload = local_tree_ensemble.deserialize(
+            *tree_ensemble.serialize())
+      if training_state_cache:
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            training_state_cache.lookup())
+      else:
+        # Always start from the beginning when no cache is set up.
+        batch_size = array_ops.shape(labels)[0]
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            array_ops.zeros([batch_size], dtype=dtypes.int32),
+            array_ops.zeros([batch_size], dtype=dtypes.int32),
+            array_ops.zeros(
+                [batch_size, head.logits_dimension], dtype=dtypes.float32))
+      with ops.control_dependencies([ensemble_reload]):
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         last_layer_nodes_range) = local_tree_ensemble.get_states()
+        summary.scalar('ensemble/num_trees', num_trees)
+        summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
+        summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
+
+        partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
+            tree_ensemble_handle=local_tree_ensemble.resource_handle,
+            cached_tree_ids=cached_tree_ids,
+            cached_node_ids=cached_node_ids,
+            bucketized_features=input_feature_list,
+            logits_dimension=head.logits_dimension)
+      logits = cached_logits + partial_logits
+
+    # Create training graph.
+    def _train_op_fn(loss):
+      """Run one training iteration."""
+      if training_state_cache:
+        train_op.append(training_state_cache.insert(tree_ids, node_ids, logits))
+      if closed_form_grad_and_hess_fn:
+        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
+      else:
+        gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
+        hessians = gradients_impl.gradients(
+            gradients, logits, name='Hessians')[0]
+      stats_summary_list = [
+          array_ops.squeeze(
+              boosted_trees_ops.make_stats_summary(
+                  node_ids=node_ids,
+                  gradients=gradients,
+                  hessians=hessians,
+                  bucketized_features_list=[input_feature_list[f]],
+                  max_splits=max_splits,
+                  num_buckets=max_buckets),
+              axis=0) for f in range(num_features)
+      ]
+
+      def grow_tree_from_stats_summaries(stats_summary_list):
+        """Updates ensemble based on the best gains from stats summaries."""
+        (node_ids_per_feature, gains_list, thresholds_list,
+         left_node_contribs_list, right_node_contribs_list) = (
+             boosted_trees_ops.calculate_best_gains_per_feature(
+                 node_id_range=last_layer_nodes_range,
+                 stats_summary_list=stats_summary_list,
+                 l1=tree_hparams.l1,
+                 l2=tree_hparams.l2,
+                 tree_complexity=tree_hparams.tree_complexity,
+                 min_node_weight=tree_hparams.min_node_weight,
+                 max_splits=max_splits))
+        grow_op = boosted_trees_ops.update_ensemble(
+            # Confirm if local_tree_ensemble or tree_ensemble should be used.
+            tree_ensemble.resource_handle,
+            feature_ids=math_ops.range(0, num_features, dtype=dtypes.int32),
+            node_ids=node_ids_per_feature,
+            gains=gains_list,
+            thresholds=thresholds_list,
+            left_node_contribs=left_node_contribs_list,
+            right_node_contribs=right_node_contribs_list,
+            learning_rate=tree_hparams.learning_rate,
+            max_depth=tree_hparams.max_depth,
+            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
+        return grow_op
+
+      if train_in_memory and is_single_machine:
+        train_op.append(distribute_lib.increment_var(global_step))
+        train_op.append(grow_tree_from_stats_summaries(stats_summary_list))
+      else:
+        summary_accumulator = data_flow_ops.ConditionalAccumulator(
+            dtype=dtypes.float32,
+            # The stats consist of gradients and hessians (the last dimension).
+            shape=[num_features, max_splits, max_buckets, 2],
+            shared_name='stats_summary_accumulator')
+        apply_grad = summary_accumulator.apply_grad(
+            array_ops.stack(stats_summary_list, axis=0), stamp_token)
+
+        def grow_tree_from_accumulated_summaries_fn():
+          """Updates the tree with the best layer from accumulated summaries."""
+          # Take out the accumulated summaries from the accumulator and grow.
+          stats_summary_list = array_ops.unstack(
+              summary_accumulator.take_grad(1), axis=0)
+          grow_op = grow_tree_from_stats_summaries(stats_summary_list)
+          return grow_op
+
+        with ops.control_dependencies([apply_grad]):
+          train_op.append(distribute_lib.increment_var(global_step))
+          if config.is_chief:
+            train_op.append(
+                control_flow_ops.cond(
+                    math_ops.greater_equal(
+                        summary_accumulator.num_accumulated(),
+                        n_batches_per_layer),
+                    grow_tree_from_accumulated_summaries_fn,
+                    control_flow_ops.no_op,
+                    name='wait_until_n_batches_accumulated'))
+
+      return control_flow_ops.group(train_op, name='train_op')
+
+  estimator_spec = head.create_estimator_spec(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_train_op_fn,
+      logits=logits)
+  if mode == model_fn.ModeKeys.TRAIN:
+    # Add an early stop hook.
+    estimator_spec = estimator_spec._replace(
+        training_hooks=estimator_spec.training_hooks +
+        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                             tree_hparams.n_trees, tree_hparams.max_depth),))
+  return estimator_spec
+
+
+def _create_classification_head(n_classes,
+                                weight_column=None,
+                                label_vocabulary=None):
+  """Creates a classification head. Refer to canned.head for details on args."""
+  # TODO(nponomareva): Support multi-class cases.
+  if n_classes == 2:
+    # pylint: disable=protected-access
+    return head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column=weight_column,
+        label_vocabulary=label_vocabulary,
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    # pylint: enable=protected-access
+  else:
+    raise ValueError('For now only binary classification is supported.'
+                     'n_classes given as {}'.format(n_classes))
+
+
+def _create_classification_head_and_closed_form(n_classes, weight_column,
+                                                label_vocabulary):
+  """Creates a head for classifier and the closed form gradients/hessians."""
+  head = _create_classification_head(n_classes, weight_column, label_vocabulary)
+  if n_classes == 2 and weight_column is None and label_vocabulary is None:
+    # Use the closed-form gradients/hessians for 2 class.
+    def _grad_and_hess_for_logloss(logits, labels):
+      # TODO(youngheek): add weights handling.
+      predictions = math_ops.reciprocal(math_ops.exp(-logits) + 1.0)
+      normalizer = math_ops.reciprocal(
+          math_ops.cast(array_ops.size(predictions), dtypes.float32))
+      gradients = (predictions - labels) * normalizer
+      hessians = predictions * (1.0 - predictions) * normalizer
+      return gradients, hessians
+
+    closed_form = _grad_and_hess_for_logloss
+  else:
+    closed_form = None
+  return (head, closed_form)
+
+
+def _create_regression_head(label_dimension, weight_column=None):
+  if label_dimension != 1:
+    raise ValueError('For now only 1 dimension regression is supported.'
+                     'label_dimension given as {}'.format(label_dimension))
+  # pylint: disable=protected-access
+  return head_lib._regression_head_with_mean_squared_error_loss(
+      label_dimension=label_dimension,
+      weight_column=weight_column,
+      loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+  # pylint: enable=protected-access
+
+
+@tf_export('estimator.BoostedTreesClassifier')
+class BoostedTreesClassifier(estimator.Estimator):
+  """A Classifier for Tensorflow Boosted Trees models."""
+
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
+               weight_column=None,
+               label_vocabulary=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
+    """Initializes a `BoostedTreesClassifier` instance.
+
+    Example:
+
+    ```python
+    bucketized_feature_1 = bucketized_column(
+      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+    bucketized_feature_2 = bucketized_column(
+      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+    classifier = estimator.BoostedTreesClassifier(
+        feature_columns=[bucketized_feature_1, bucketized_feature_2],
+        n_trees=100,
+        ... <some other params>
+    )
+
+    def input_fn_train():
+      ...
+      return dataset
+
+    classifier.train(input_fn=input_fn_train)
+
+    def input_fn_eval():
+      ...
+      return dataset
+
+    metrics = classifier.evaluate(input_fn=input_fn_eval)
+    ```
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      n_classes: number of label classes. Default is binary classification.
+        Multiclass support is not yet implemented.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: when wrong arguments are given or unsupported functionalities
+         are requested.
+    """
+    # TODO(nponomareva): Support multi-class cases.
+    if n_classes == _HOLD_FOR_MULTI_CLASS_SUPPORT:
+      n_classes = 2
+    head, closed_form = _create_classification_head_and_closed_form(
+        n_classes, weight_column, label_vocabulary=label_vocabulary)
+
+    # HParams for the model.
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
+
+    def _model_fn(features, labels, mode, config):
+      return _bt_model_fn(  # pylint: disable=protected-access
+          features,
+          labels,
+          mode,
+          head,
+          feature_columns,
+          tree_hparams,
+          n_batches_per_layer,
+          config,
+          closed_form_grad_and_hess_fn=closed_form)
+
+    super(BoostedTreesClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+@tf_export('estimator.BoostedTreesRegressor')
+class BoostedTreesRegressor(estimator.Estimator):
+  """A Regressor for Tensorflow Boosted Trees models."""
+
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
+    """Initializes a `BoostedTreesRegressor` instance.
+
+    Example:
+
+    ```python
+    bucketized_feature_1 = bucketized_column(
+      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+    bucketized_feature_2 = bucketized_column(
+      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+    regressor = estimator.BoostedTreesRegressor(
+        feature_columns=[bucketized_feature_1, bucketized_feature_2],
+        n_trees=100,
+        ... <some other params>
+    )
+
+    def input_fn_train():
+      ...
+      return dataset
+
+    regressor.train(input_fn=input_fn_train)
+
+    def input_fn_eval():
+      ...
+      return dataset
+
+    metrics = regressor.evaluate(input_fn=input_fn_eval)
+    ```
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      label_dimension: Number of regression targets per example.
+        Multi-dimensional support is not yet implemented.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: when wrong arguments are given or unsupported functionalities
+         are requested.
+    """
+    # TODO(nponomareva): Extend it to multi-dimension cases.
+    if label_dimension == _HOLD_FOR_MULTI_DIM_SUPPORT:
+      label_dimension = 1
+    head = _create_regression_head(label_dimension, weight_column)
+
+    # HParams for the model.
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
+
+    def _model_fn(features, labels, mode, config):
+      return _bt_model_fn(  # pylint: disable=protected-access
+          features, labels, mode, head, feature_columns, tree_hparams,
+          n_batches_per_layer, config)
+
+    super(BoostedTreesRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8c52d3bc649c939b7f7531f52880580289a83d9
--- /dev/null
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -0,0 +1,952 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees estimators and model_fn."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import boosted_trees
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_utils
+
+NUM_FEATURES = 3
+
+BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
+INPUT_FEATURES = np.array(
+    [
+        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
+        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
+        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
+    ],
+    dtype=np.float32)
+CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
+REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
+FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
+
+# EXAMPLE_ID is not exposed to Estimator yet, but supported at model_fn level.
+EXAMPLE_IDS = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+EXAMPLE_ID_COLUMN = '__example_id__'
+
+
+def _make_train_input_fn(is_classification):
+  """Makes train input_fn for classification/regression."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification, batch=None, repeat=None):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    if batch:
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensor_slices(features_dict),
+           dataset_ops.Dataset.from_tensor_slices(labels))).batch(batch)
+    else:
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensors(features_dict),
+           dataset_ops.Dataset.from_tensors(labels)))
+    # repeat indefinitely by default, or stop at the given step.
+    ds = ds.repeat(repeat)
+    return ds
+
+  return _input_fn
+
+
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
+
+  def testTrainAndEvaluateBinaryClassifier(self):
+    input_fn = _make_train_input_fn(is_classification=True)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+
+  def testInferBinaryClassifier(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testTrainClassifierWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testTrainAndEvaluateRegressor(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        max_depth=5)
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
+
+  def testInferRegressor(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetBatch(self):
+    # The batch_size as the entire data size should yield the same result as
+    # dataset without batching.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=5)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetLargerBatch(self):
+    # The batch_size as the multiple of the entire data size should still yield
+    # the same result.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=15)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetSmallerBatch(self):
+    # Even when using small batches, if (n_batches_per_layer * batch_size) makes
+    # the same entire data size, the result should be the same.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=1)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=5,
+        n_trees=1,
+        max_depth=5)
+    # Train stops after (n_batches_per_layer * n_trees * max_depth) steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=25, finalized_trees=1, attempted_layers=5)
+    # 5 batches = one epoch.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=5)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetWhenInputIsOverEarlier(self):
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, repeat=3)  # to stop input after 3 steps.
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    # Note that training will stop when input exhausts.
+    # This might not be a typical pattern, but dataset.repeat(3) causes
+    # the input stream to cease after 3 steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=3, finalized_trees=0, attempted_layers=3)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 3.777295)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.353850], [0.254100], [0.106850], [0.712100], [1.012100]],
+        [pred['predictions'] for pred in predictions])
+
+
+class ModelFnTests(test_util.TensorFlowTestCase):
+  """Tests bt_model_fn including unexposed internal functionalities."""
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
+    }
+    self._tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
+        n_trees=2,
+        max_depth=2,
+        learning_rate=0.1,
+        l1=0.,
+        l2=0.01,
+        tree_complexity=0.,
+        min_node_weight=0.)
+
+  def _get_expected_ensembles_for_classification(self):
+    first_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0625
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+        """
+    second_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.0
+              original_leaf {
+                scalar: -0.181818
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.105518
+              original_leaf {
+                scalar: 0.0625
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.348397
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.224091
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.056815
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 0
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
+        }
+        """
+    third_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.0
+              original_leaf {
+                scalar: -0.181818
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.105518
+              original_leaf {
+                scalar: 0.0625
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.348397
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.224091
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.056815
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 0
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.287131
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.162042
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.086986
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+        """
+    return (first_round, second_round, third_round)
+
+  def _get_expected_ensembles_for_regression(self):
+    first_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.241322
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.083951
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+        """
+    second_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 1
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.673407
+              original_leaf {
+                scalar: 0.241322
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.324102
+              original_leaf {
+                scalar: 0.083951
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.563167
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.247047
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.095273
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.222102
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 0
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
+        }
+        """
+    third_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 1
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.673407
+              original_leaf {
+                scalar: 0.241322
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.324102
+              original_leaf {
+                scalar: 0.083951
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.563167
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.247047
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.095273
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.222102
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 0
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.981026
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.005166
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.180281
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+        """
+    return (first_round, second_round, third_round)
+
+  def _get_train_op_and_ensemble(self, head, config, is_classification,
+                                 train_in_memory):
+    """Calls bt_model_fn() and returns the train_op and ensemble_serialzed."""
+    features, labels = _make_train_input_fn(is_classification)()
+    estimator_spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
+        features=features,
+        labels=labels,
+        mode=model_fn.ModeKeys.TRAIN,
+        head=head,
+        feature_columns=self._feature_columns,
+        tree_hparams=self._tree_hparams,
+        example_id_column_name=EXAMPLE_ID_COLUMN,
+        n_batches_per_layer=1,
+        config=config,
+        train_in_memory=train_in_memory)
+    resources.initialize_resources(resources.shared_resources()).run()
+    variables.global_variables_initializer().run()
+    variables.local_variables_initializer().run()
+
+    # Gets the train_op and serialized proto of the ensemble.
+    shared_resources = resources.shared_resources()
+    self.assertEqual(1, len(shared_resources))
+    train_op = estimator_spec.train_op
+    with ops.control_dependencies([train_op]):
+      _, ensemble_serialized = (
+          gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
+              shared_resources[0].handle))
+    return train_op, ensemble_serialized
+
+  def testTrainClassifierInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_classification())
+    with self.test_session() as sess:
+      # Train with train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_classification_head(n_classes=2),
+            run_config.RunConfig(),
+            is_classification=True,
+            train_in_memory=True)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainClassifierNonInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_classification())
+    with self.test_session() as sess:
+      # Train without train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_classification_head(n_classes=2),
+            run_config.RunConfig(),
+            is_classification=True,
+            train_in_memory=False)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainRegressorInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_regression())
+    with self.test_session() as sess:
+      # Train with train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_regression_head(label_dimension=1),
+            run_config.RunConfig(),
+            is_classification=False,
+            train_in_memory=True)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainRegressorNonInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_regression())
+    with self.test_session() as sess:
+      # Train without train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_regression_head(label_dimension=1),
+            run_config.RunConfig(),
+            is_classification=False,
+            train_in_memory=False)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 7043da8de036e5be27d223271c37e065d9ffbcdd..6382622e0b5c72e5d3fcd9b9c6863968a425b86f 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -32,7 +32,6 @@ from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
-from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rate of 0.05 is a historical artifact of the initial
@@ -183,17 +182,11 @@ def _dnn_model_fn(features,
         input_layer_partitioner=input_layer_partitioner)
     logits = logit_fn(features=features, mode=mode)
 
-    def _train_op_fn(loss):
-      """Returns the op to optimize the loss."""
-      return optimizer.minimize(
-          loss,
-          global_step=training_util.get_global_step())
-
     return head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
-        train_op_fn=_train_op_fn,
+        optimizer=optimizer,
         logits=logits)
 
 
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 6d0fb96057ee93964ee3571bae3b878faad88882..f47706db2fc5f9baa38a36790832a958d5098587 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -31,10 +31,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
@@ -215,8 +215,7 @@ def _dnn_linear_combined_model_fn(features,
 
     train_op = control_flow_ops.group(*train_ops)
     with ops.control_dependencies([train_op]):
-      with ops.colocate_with(global_step):
-        return state_ops.assign_add(global_step, 1)
+      return distribute_lib.increment_var(global_step)
 
   return head.create_estimator_spec(
       features=features,
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
index 84675bf2a4a1655026bbba37c5d7a63d2f788c46..d275695eb319117cf94aefd7038ab5ee685e05a9 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
@@ -26,7 +26,7 @@ import six
 
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
-from tensorflow.python.estimator import warm_starting_util
+from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import dnn_testing_utils
 from tensorflow.python.estimator.canned import linear_testing_utils
@@ -866,7 +866,7 @@ class DNNLinearCombinedWarmStartingTest(test.TestCase):
                 learning_rate=0.0),
             # The provided regular expression will only warm-start the deep
             # portion of the model.
-            warm_start_from=warm_starting_util.WarmStartSettings(
+            warm_start_from=estimator.WarmStartSettings(
                 ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
                 vars_to_warm_start='.*(dnn).*')))
 
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index cbae43e4f7fef0271de20a4ec54449989455d4bd..62b13c3200dd782c14dc427b62f9b03086c7174f 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -27,8 +27,8 @@ import six
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
@@ -44,16 +44,16 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary as summary_lib
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import monitored_session
-from tensorflow.python.training import optimizer
+from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
@@ -134,7 +134,8 @@ def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
       hidden_weights_names + hidden_biases_names +
       [LOGITS_WEIGHTS_NAME + '/part_0:0', LOGITS_BIASES_NAME + '/part_0:0'])
 
-  def _create_estimator_spec(features, mode, logits, labels, train_op_fn):
+  def _create_estimator_spec(
+      features, mode, logits, labels, train_op_fn=None, optimizer=None):
     del features, labels  # Not used.
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     testcase.assertItemsEqual(expected_var_names,
@@ -144,8 +145,12 @@ def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
         expected_logits, logits, message='Failed for mode={}. '.format(mode))
     with ops.control_dependencies([assert_logits]):
       if mode == model_fn.ModeKeys.TRAIN:
+        if train_op_fn is not None:
+          train_op = train_op_fn(loss)
+        elif optimizer is not None:
+          train_op = optimizer.minimize(loss, global_step=None)
         return model_fn.EstimatorSpec(
-            mode=mode, loss=loss, train_op=train_op_fn(loss))
+            mode=mode, loss=loss, train_op=train_op)
       elif mode == model_fn.ModeKeys.EVAL:
         return model_fn.EstimatorSpec(mode=mode, loss=array_ops.identity(loss))
       elif mode == model_fn.ModeKeys.PREDICT:
@@ -191,7 +196,7 @@ def mock_optimizer(testcase, hidden_units, expected_loss=None):
     testcase.assertEquals(0, loss.shape.ndims)
     if expected_loss is None:
       if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       return control_flow_ops.no_op()
     assert_loss = assert_close(
         math_ops.to_float(expected_loss, name='expected'),
@@ -199,12 +204,12 @@ def mock_optimizer(testcase, hidden_units, expected_loss=None):
         name='assert_loss')
     with ops.control_dependencies((assert_loss,)):
       if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       return control_flow_ops.no_op()
 
   optimizer_mock = test.mock.NonCallableMagicMock(
-      spec=optimizer.Optimizer,
-      wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+      spec=optimizer_lib.Optimizer,
+      wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
   optimizer_mock.minimize = test.mock.MagicMock(wraps=_minimize)
 
   return optimizer_mock
@@ -828,7 +833,7 @@ class BaseDNNWarmStartingTest(object):
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
         # The provided regular expression will only warm-start the city
         # embedding, not the kernels and biases of the hidden weights.
-        warm_start_from=warm_starting_util.WarmStartSettings(
+        warm_start_from=estimator.WarmStartSettings(
             ckpt_to_initialize_from=dnn_classifier.model_dir,
             vars_to_warm_start='.*(city).*'))
 
@@ -892,7 +897,7 @@ class BaseDNNWarmStartingTest(object):
         dimension=2)
     # We can create our VocabInfo object from the new and old occupation
     # FeatureColumn's.
-    occupation_vocab_info = warm_starting_util.VocabInfo(
+    occupation_vocab_info = estimator.VocabInfo(
         new_vocab=new_occupation.categorical_column.vocabulary_file,
         new_vocab_size=new_occupation.categorical_column.vocabulary_size,
         num_oov_buckets=new_occupation.categorical_column.num_oov_buckets,
@@ -907,7 +912,7 @@ class BaseDNNWarmStartingTest(object):
         feature_columns=[occupation],
         n_classes=4,
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=warm_starting_util.WarmStartSettings(
+        warm_start_from=estimator.WarmStartSettings(
             ckpt_to_initialize_from=dnn_classifier.model_dir,
             var_name_to_vocab_info={
                 OCCUPATION_EMBEDDING_NAME: occupation_vocab_info
@@ -978,7 +983,7 @@ class BaseDNNWarmStartingTest(object):
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
         # The 'city' variable correspond to the 'locality' variable in the
         # previous model.
-        warm_start_from=warm_starting_util.WarmStartSettings(
+        warm_start_from=estimator.WarmStartSettings(
             ckpt_to_initialize_from=dnn_classifier.model_dir,
             var_name_to_prev_var_name={
                 CITY_EMBEDDING_NAME:
@@ -1035,13 +1040,16 @@ class BaseDNNClassifierEvaluateTest(object):
         metric_keys.MetricKeys.LOSS: expected_loss,
         metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2.,
         metric_keys.MetricKeys.ACCURACY: 0.5,
+        metric_keys.MetricKeys.PRECISION: 0.0,
+        metric_keys.MetricKeys.RECALL: 0.0,
         metric_keys.MetricKeys.PREDICTION_MEAN: 0.11105597,
         metric_keys.MetricKeys.LABEL_MEAN: 0.5,
         metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
         # There is no good way to calculate AUC for only two data points. But
         # that is what the algorithm returns.
         metric_keys.MetricKeys.AUC: 0.5,
-        metric_keys.MetricKeys.AUC_PR: 0.25,
+        metric_keys.MetricKeys.AUC_PR: 0.75,
+
         ops.GraphKeys.GLOBAL_STEP: global_step
     }, dnn_classifier.evaluate(input_fn=_input_fn, steps=1))
 
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index cb9e3fc6ca116ac0f48a37cea92fa4119754f324..efa4bdf5980a34001e10bb7a1125e4434215f0ee 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
@@ -56,8 +57,8 @@ _PREDICT_SERVING_KEY = 'predict'
 
 # A LossSpec contains
 # * a scalar `Tensor` representing reduced weighted training loss
-# * a scalar `Tensor` representing the unreduced unweighted loss
-# * a scalar `Tensor` representing the example weights
+# * a `Tensor` representing the unreduced unweighted loss
+# * a `Tensor` representing the example weights
 # * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
 LossSpec = collections.namedtuple(
     'LossSpec', ['training_loss', 'unreduced_loss', 'weights',
@@ -85,40 +86,39 @@ class _Head(object):
     ```python
     def _my_dnn_model_fn(features, labels, mode, params, config=None):
       # Optionally your callers can pass head to model_fn as a param.
-      head = tf.contrib.learn.regression_head(...)
-      input = tf.contrib.layers.input_from_feature_columns(features, ...)
-      last_hidden_layer_out = tf.contrib.layers.stack(
-          input, tf.contrib.layers.fully_connected, [1000, 500])
-      logits = tf.contrib.layers.fully_connected(
-          last_hidden_layer_out, head.logits_dimension, activation_fn=None)
-
-      def _train_op_fn(loss):
-        return optimizer.minimize(loss)
+      head = tf.contrib.estimator.regression_head(...)
+      inputs = tf.feature_column.input_layer(features, ...)
+      hidden_layer0 = tf.layers.dense(
+          inputs, units=1000, activation=tf.nn.relu)
+      hidden_layer1 = tf.layers.dense(
+          hidden_layer0, units=500, activation=tf.nn.relu)
+      logits = tf.layers.dense(
+          hidden_layer1, units=head.logits_dimension, activation=None)
 
       return head.create_estimator_spec(
           features=features,
           labels=labels,
           mode=mode,
           logits=logits,
-          train_op_fn=_train_op_fn)
+          optimizer=optimizer)
     ```
 
   There are cases where computing and applying gradients can not be meaningfully
-  captured with train_op_fn we support (for example, with sync optimizer). In
-  such case, you can take the responsibility on your own. Here is a common
-  use case,
+  captured with optimizer or train_op_fn we support (for example, with sync
+  optimizer). In such case, you can take the responsibility on your own. Here is
+  a common use case,
     ```python
     estimator_spec = head.create_estimator_spec(
         features=features,
         labels=labels,
         mode=mode,
         logits=logits,
-        train_op_fn=tf.contrib.learn.no_op_train_fn)
+        train_op_fn=lambda _: tf.no_op())
     if mode == model_fn.ModeKeys.TRAIN:
       optimizer = ...
       sync = tf.train.SyncReplicasOptimizer(opt=optimizer, ...)
-      update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
-                                                  loss=estimator_spec.loss, ...)
+      update_op = sync.minimize(
+          estimator_spec.loss, global_step=tf.get_global_step())
       hooks = [sync.make_session_run_hook(is_chief)]
       ... update train_op and hooks in EstimatorSpec and return
     ```
@@ -163,8 +163,8 @@ class _Head(object):
     Returns:
       A LossSpec that contains
       * the scalar `Tensor` representing reduced weighted training loss
-      * the scalar `Tensor` representing the unreduced unweighted loss
-      * the scalar `Tensor` representing the example weights
+      * the `Tensor` representing the unreduced unweighted loss
+      * the `Tensor` representing the example weights
       * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
         etc.)
 
@@ -172,10 +172,12 @@ class _Head(object):
     """
     raise NotImplementedError('Calling an abstract method.')
 
+  # TODO(b/65403806): By default, collect regularization_losses from
+  # GraphKeys.REGULARIZATION_LOSSES collection.
   @abc.abstractmethod
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns `EstimatorSpec` that a model_fn can return.
 
     Please note that,
@@ -186,10 +188,14 @@ class _Head(object):
       mode: Estimator's `ModeKeys`.
       logits: logits `Tensor` to be used by the head.
       labels: Labels `Tensor`, or `dict` of same.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
-        to optimize the model with the loss. This is used in TRAIN mode and
-        must not be None. None is allowed in other modes. If you want to
-        optimize loss yourself you can pass `no_op_train_fn` and then use
+        to optimize the model with the loss in TRAIN mode. Used if `optimizer`
+        is `None`. Exactly one of `train_op_fn` and `optimizer` must be set in
+        TRAIN mode. None is allowed in other modes. If you want to optimize loss
+        yourself you can pass `lambda _: tf.no_op()` and then use
         EstimatorSpec.loss to compute and apply gradients.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses.
@@ -257,9 +263,12 @@ def _check_dense_labels_match_logits_and_reshape(
         if (dim1 is not None) and (dim1 != expected_labels_dimension):
           raise ValueError(
               'Mismatched label shape. '
-              'Classifier configured with n_classes=%s.  Received %s. '
-              'Suggested Fix: check your n_classes argument to the estimator '
-              'and/or the shape of your label.' %
+              'Expected labels dimension=%s.  Received %s. '
+              'Suggested Fix:'
+              'If your classifier expects one-hot encoding label,'
+              'check your n_classes argument to the estimator'
+              'and/or the shape of your label.'
+              'Otherwise, check the shape of your label.' %
               (expected_labels_dimension, dim1))
       expected_labels_shape = array_ops.concat(
           [logits_shape[:-1], [expected_labels_dimension]], axis=0)
@@ -694,8 +703,8 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         processed_labels=label_ids)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -706,8 +715,11 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
       labels: Labels integer or string `Tensor` with shape matching `logits`,
         namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is
         required argument when `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -717,7 +729,8 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     with ops.name_scope(self._name, 'head'):
       logits = _check_logits_final_dim(logits, self.logits_dimension)
@@ -780,8 +793,16 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                 regularization_loss=regularization_loss))
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn cannot be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -807,7 +828,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
@@ -869,11 +890,12 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   Raises:
     ValueError: If `thresholds` contains a value outside of `(0, 1)`.
     ValueError: If `loss_reduction` is invalid.
+    TypeError: if `label_vocabulary` has invalid type.
   """
   thresholds = tuple(thresholds) if thresholds else tuple()
   if label_vocabulary is not None and not isinstance(label_vocabulary,
                                                      (list, tuple)):
-    raise ValueError(
+    raise TypeError(
         'label_vocabulary should be a list or tuple. Given type: {}'.format(
             type(label_vocabulary)))
 
@@ -940,6 +962,18 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                   predictions=class_ids,
                   weights=weights,
                   name=keys.ACCURACY),
+          _summary_key(self._name, keys.PRECISION):
+              metrics_lib.precision(
+                  labels=labels,
+                  predictions=class_ids,
+                  weights=weights,
+                  name=keys.PRECISION),
+          _summary_key(self._name, keys.RECALL):
+              metrics_lib.recall(
+                  labels=labels,
+                  predictions=class_ids,
+                  weights=weights,
+                  name=keys.RECALL),
           _summary_key(self._name, keys.PREDICTION_MEAN):
               _predictions_mean(
                   predictions=logistic,
@@ -1008,7 +1042,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           vocabulary_list=tuple(self._label_vocabulary),
           name='class_id_lookup').lookup(labels)
     labels = math_ops.to_float(labels)
-    labels = _assert_range(labels, 2)
+    labels = _assert_range(labels, n_classes=2)
     if self._loss_fn:
       unweighted_loss = _call_loss_fn(
           loss_fn=self._loss_fn, labels=labels, logits=logits,
@@ -1027,8 +1061,8 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -1039,8 +1073,11 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
       labels: Labels integer or string `Tensor` with shape matching `logits`,
         namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is required
         argument when `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -1050,7 +1087,8 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     # Predict.
     with ops.name_scope(self._name, 'head'):
@@ -1122,8 +1160,16 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                 regularization_loss=regularization_loss))
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1148,7 +1194,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
 
 def _regression_head_with_mean_squared_error_loss(
@@ -1156,6 +1202,7 @@ def _regression_head_with_mean_squared_error_loss(
     label_dimension=1,
     loss_reduction=losses.Reduction.SUM,
     loss_fn=None,
+    inverse_link_fn=None,
     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -1174,10 +1221,16 @@ def _regression_head_with_mean_squared_error_loss(
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  Supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
   `(labels, logits, features)` as arguments and returns unreduced loss with
   shape `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `inverse_link_fn`, also known as 'mean function'.
+  `inverse_link_fn` takes `logits` as argument and returns predicted values.
+  This function is the inverse of the link function defined in
+  https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
+  Namely, for poisson regression, set `inverse_link_fn=tf.exp`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -1188,7 +1241,9 @@ def _regression_head_with_mean_squared_error_loss(
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
-    loss_fn: Optional loss function.
+    loss_fn: Optional loss function. Defaults to `mean_squared_error`.
+    inverse_link_fn: Optional inverse link function, also known as 'mean
+      function'. Defaults to identity.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -1208,6 +1263,7 @@ def _regression_head_with_mean_squared_error_loss(
       label_dimension=label_dimension,
       loss_reduction=loss_reduction,
       loss_fn=loss_fn,
+      inverse_link_fn=inverse_link_fn,
       name=name)
 
 
@@ -1220,6 +1276,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
       weight_column=None,
       loss_reduction=losses.Reduction.SUM,
       loss_fn=None,
+      inverse_link_fn=None,
       name=None):
     """`Head` for regression."""
     if label_dimension < 1:
@@ -1228,6 +1285,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
     self._weight_column = weight_column
     self._loss_reduction = loss_reduction
     self._loss_fn = loss_fn
+    self._inverse_link_fn = inverse_link_fn
     self._name = name
 
   @property
@@ -1265,8 +1323,8 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -1278,8 +1336,11 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         `[D0, D1, ... DN, logits_dimension]`. When `logits_dimension=1`, shape
         `[D0, D1, ... DN]` is also supported. `labels` is required argument when
         `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -1289,14 +1350,25 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     # Predict.
     with ops.name_scope(self._name, 'head'):
       logits = _check_logits_final_dim(logits, self._logits_dimension)
-      predictions = {prediction_keys.PredictionKeys.PREDICTIONS: logits}
+      if self._inverse_link_fn:
+        predicted_value = self._inverse_link_fn(logits)
+        predictions = {
+            prediction_keys.PredictionKeys.PREDICTIONS: predicted_value,
+            prediction_keys.PredictionKeys.LOGITS: logits,
+        }
+      else:
+        predicted_value = logits
+        predictions = {
+            prediction_keys.PredictionKeys.PREDICTIONS: predicted_value}
       if mode == model_fn.ModeKeys.PREDICT:
-        regression_output = export_output.RegressionOutput(value=logits)
+        regression_output = export_output.RegressionOutput(
+            value=predicted_value)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.PREDICT,
             predictions=predictions,
@@ -1339,8 +1411,16 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
             eval_metric_ops=eval_metric_ops)
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1365,17 +1445,17 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
 
 def _assert_range(labels, n_classes, message=None):
   with ops.name_scope(None, 'assert_range', (labels,)):
-    assert_less = check_ops.assert_less(
+    assert_less = check_ops.assert_less_equal(
         labels,
-        ops.convert_to_tensor(n_classes, dtype=labels.dtype),
-        message=message or 'Label IDs must < n_classes')
+        ops.convert_to_tensor(n_classes - 1, dtype=labels.dtype),
+        message=message or 'Labels must <= n_classes - 1')
     assert_greater = check_ops.assert_non_negative(
-        labels, message=message or 'Label IDs must >= 0')
+        labels, message=message or 'Labels must >= 0')
     with ops.control_dependencies((assert_less, assert_greater)):
       return array_ops.identity(labels)
 
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index c09f88262af3cdbb952a2ebadf2b2bdaf2a651cb..7da3df01dc48d9ed3aef8a030f7a516db3a5abeb 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -255,14 +255,14 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
         logits=logits_placeholder,
         labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesOpError('Label IDs must < n_classes'):
+      with self.assertRaisesOpError('Labels must <= n_classes - 1'):
         training_loss.eval({
             labels_placeholder: labels_2x1_with_large_id,
             logits_placeholder: logits_2x3
         })
 
     with self.test_session():
-      with self.assertRaisesOpError('Label IDs must >= 0'):
+      with self.assertRaisesOpError('Labels must >= 0'):
         training_loss.eval({
             labels_placeholder: labels_2x1_with_negative_id,
             logits_placeholder: logits_2x3
@@ -300,7 +300,12 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     features = {'x': values_2x3}
 
     # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'Dimensions must be equal'):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Shape mismatch: The shape of labels \(received \(3,\)\) should equal '
+        r'the shape of logits except for the last dimension '
+        r'\(received \(2, 3\)\)\.'
+    ):
       head.create_loss(
           features=features,
           mode=model_fn.ModeKeys.EVAL,
@@ -837,6 +842,41 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
       }, summary_str, tol)
 
+  def test_train_with_optimizer(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    expected_train_result = 'my_train_op'
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        return string_ops.string_join(
+            [constant_op.constant(expected_train_result),
+             string_ops.as_string(loss, precision=2)])
+
+    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
+    expected_loss = 10.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
   def test_train_summaries_with_head_name(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -1554,11 +1594,13 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         # loss_mean = loss/2 = 41./2 = 20.5
         keys.LOSS_MEAN: 20.5,
         keys.ACCURACY: 1./2,
+        keys.PRECISION: 1.,
+        keys.RECALL: 1./2,
         keys.PREDICTION_MEAN: 1./2,
         keys.LABEL_MEAN: 2./2,
         keys.ACCURACY_BASELINE: 2./2,
         keys.AUC: 0.,
-        keys.AUC_PR: 0.74999905,
+        keys.AUC_PR: 1.,
     }
 
     # Assert spec contains expected tensors.
@@ -1597,11 +1639,13 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     expected_metric_keys = [
         '{}/some_binary_head'.format(metric_keys.MetricKeys.LOSS_MEAN),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.ACCURACY),
+        '{}/some_binary_head'.format(metric_keys.MetricKeys.PRECISION),
+        '{}/some_binary_head'.format(metric_keys.MetricKeys.RECALL),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.PREDICTION_MEAN),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.LABEL_MEAN),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.ACCURACY_BASELINE),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC_PR)
+        '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC_PR),
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
@@ -1632,11 +1676,13 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         keys.LOSS_MEAN: expected_unregularized_loss,
         keys.LOSS_REGULARIZATION: expected_regularization_loss,
         keys.ACCURACY: 1./2,
+        keys.PRECISION: 1.,
+        keys.RECALL: 1./2,
         keys.PREDICTION_MEAN: 1./2,
         keys.LABEL_MEAN: 2./2,
         keys.ACCURACY_BASELINE: 2./2,
         keys.AUC: 0.,
-        keys.AUC_PR: 0.75,
+        keys.AUC_PR: 1.,
     }
 
     # Assert predictions, loss, and metrics.
@@ -1737,11 +1783,13 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     expected_metrics = {
         keys.LOSS_MEAN: 1.62652338 / 2.,
         keys.ACCURACY: 1./2,
+        keys.PRECISION: 1.,
+        keys.RECALL: .5,
         keys.PREDICTION_MEAN: 1./2,
         keys.LABEL_MEAN: 2./2,
         keys.ACCURACY_BASELINE: 2./2,
         keys.AUC: 0.,
-        keys.AUC_PR: 0.74999905,
+        keys.AUC_PR: 1.,
         keys.ACCURACY_AT_THRESHOLD % thresholds[0]: 1.,
         keys.PRECISION_AT_THRESHOLD % thresholds[0]: 1.,
         keys.RECALL_AT_THRESHOLD % thresholds[0]: 1.,
@@ -1929,6 +1977,39 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: 20.5,
       }, summary_str)
 
+  def test_train_with_optimizer(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
+    expected_loss = 41.
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        with ops.control_dependencies((check_ops.assert_equal(
+            math_ops.to_float(expected_loss), math_ops.to_float(loss),
+            name='assert_loss'),)):
+          return constant_op.constant(expected_train_result)
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+
   def test_train_summaries_with_head_name(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         name='some_binary_head')
@@ -2009,6 +2090,24 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
               expected_regularization_loss),
       }, summary_str)
 
+  def test_float_labels_invalid_values(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
+    labels = np.array([[1.2], [0.4]], dtype=np.float32)
+    features = {'x': np.array([[42]], dtype=np.float32)}
+    training_loss = head.create_loss(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)[0]
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r'Labels must <= n_classes - 1'):
+      with self.test_session():
+        _initialize_variables(self, monitored_session.Scaffold())
+        training_loss.eval()
+
   def test_float_labels_train_create_loss(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
 
@@ -2182,13 +2281,15 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         keys.LOSS_MEAN: 26.9615384615,
         # accuracy = (1*1 + .1*0 + 1.5*0)/(1 + .1 + 1.5) = 1/2.6 = .38461538461
         keys.ACCURACY: .38461538461,
+        keys.PRECISION: 1./2.5,
+        keys.RECALL: 1./1.1,
         # prediction_mean = (1*1 + .1*0 + 1.5*1)/(1 + .1 + 1.5) = 2.5/2.6
         #                 = .96153846153
         keys.PREDICTION_MEAN: .96153846153,
         keys.LABEL_MEAN: expected_label_mean,
         keys.ACCURACY_BASELINE: 1 - expected_label_mean,
         keys.AUC: .45454565,
-        keys.AUC_PR: .21923049,
+        keys.AUC_PR: .6737757325172424,
     }
 
     # Assert spec contains expected tensors.
@@ -2481,13 +2582,15 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     expected_metrics = {
         keys.LOSS_MEAN: expected_loss / np.sum(weights),
         keys.ACCURACY: (1.*0. + 1.5*1. + 2.*1. + 2.5*0.) / np.sum(weights),
+        keys.PRECISION: 2.0/3.0,
+        keys.RECALL: 2.0/4.5,
         keys.PREDICTION_MEAN: (1.*1 + 1.5*0 + 2.*1 + 2.5*0) / np.sum(weights),
         keys.LABEL_MEAN: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
         keys.ACCURACY_BASELINE: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
         # We cannot reliably calculate AUC with only 4 data points, but the
         # values should not change because of backwards-compatibility.
         keys.AUC: 0.5222,
-        keys.AUC_PR: 0.5119,
+        keys.AUC_PR: 0.7341,
     }
 
     tol = 1e-2
@@ -2703,10 +2806,9 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     self.assertIsNone(spec.loss)
     self.assertEqual({}, spec.eval_metric_ops)
     self.assertIsNone(spec.train_op)
+    default_serving_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
     self.assertItemsEqual(
-        (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-         'predict',
-         'regression'),
+        (default_serving_key, 'predict', 'regression'),
         spec.export_outputs.keys())
     _assert_no_hooks(self, spec)
 
@@ -2714,6 +2816,54 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, spec.scaffold)
       self.assertAllClose(logits, spec.predictions[prediction_key].eval())
+      self.assertAllClose(
+          logits, spec.export_outputs[default_serving_key].value.eval())
+      self.assertAllClose(
+          logits, spec.export_outputs['regression'].value.eval())
+      self.assertAllClose(
+          logits, spec.export_outputs['predict'].outputs['predictions'].eval())
+
+  def test_predict_with_inverse_link_fn(self):
+    def _inverse_link_fn(logits):
+      return logits - 10.
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        inverse_link_fn=_inverse_link_fn)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.int32)
+    expected_predictions = np.array(((35,), (31,),), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    # Assert spec contains expected tensors.
+    keys = prediction_keys.PredictionKeys
+    self.assertItemsEqual(
+        (keys.PREDICTIONS, keys.LOGITS), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[keys.PREDICTIONS].dtype)
+    self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
+    default_serving_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    self.assertItemsEqual(
+        (default_serving_key, 'predict', 'regression'),
+        spec.export_outputs.keys())
+
+    # Assert predictions.
+    with self.test_session():
+      _initialize_variables(self, spec.scaffold)
+      self.assertAllClose(
+          expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
+      self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval())
+      self.assertAllClose(
+          expected_predictions,
+          spec.export_outputs[default_serving_key].value.eval())
+      self.assertAllClose(
+          expected_predictions, spec.export_outputs['regression'].value.eval())
+      self.assertAllClose(
+          expected_predictions,
+          spec.export_outputs['predict'].outputs['predictions'].eval())
+      self.assertAllClose(
+          logits, spec.export_outputs['predict'].outputs['logits'].eval())
 
   def test_eval_create_loss(self):
     head = head_lib._regression_head_with_mean_squared_error_loss()
@@ -3012,6 +3162,40 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: 6.5,
       }, summary_str)
 
+  def test_train_with_optimizer(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43.,), (44.,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42.,),), dtype=np.float32)}
+    # loss = (43-45)^2 + (44-41)^2 = 4 + 9 = 13
+    expected_loss = 13
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        with ops.control_dependencies((check_ops.assert_equal(
+            math_ops.to_float(expected_loss), math_ops.to_float(loss),
+            name='assert_loss'),)):
+          return constant_op.constant(expected_train_result)
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+
   def test_train_summaries_with_head_name(self):
     head = head_lib._regression_head_with_mean_squared_error_loss(
         name='some_regression_head')
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index a2f24ef27044680fe93b176b5207593165d0d109..e7ec4179917a88703444f8aa835ed0359ff58a46 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
-from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -157,17 +156,11 @@ def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
         units=head.logits_dimension, feature_columns=feature_columns)
     logits = logit_fn(features=features)
 
-    def _train_op_fn(loss):
-      """Returns the op to optimize the loss."""
-      return optimizer.minimize(
-          loss,
-          global_step=training_util.get_global_step())
-
     return head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
-        train_op_fn=_train_op_fn,
+        optimizer=optimizer,
         logits=logits)
 
 
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index e88fcbbd2e0e3617dde428662e58b1d86c4eddd0..0e6436b42143f4b136165d47c41e143dacb4d476 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -31,7 +31,6 @@ from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.export import export
@@ -48,13 +47,13 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer as optimizer_lib
@@ -683,7 +682,7 @@ class BaseLinearRegressorTrainingTest(object):
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
@@ -691,7 +690,7 @@ class BaseLinearRegressorTrainingTest(object):
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
@@ -906,13 +905,13 @@ class BaseLinearClassifierTrainingTest(object):
       # Verify loss. We can't check the value directly, so we add an assert op.
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
           loss,
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
 
     mock_optimizer = test.mock.NonCallableMock(
         spec=optimizer_lib.Optimizer,
@@ -1338,11 +1337,13 @@ class BaseLinearClassifierEvaluationTest(object):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: 41.,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.,
           metric_keys.MetricKeys.LABEL_MEAN: 1.,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
           metric_keys.MetricKeys.AUC: 0.,
-          metric_keys.MetricKeys.AUC_PR: 0.5,
+          metric_keys.MetricKeys.AUC_PR: 1.,
       }
     else:
       # Multi classes: loss = 1 * -log ( soft_max(logits)[label] )
@@ -1407,6 +1408,8 @@ class BaseLinearClassifierEvaluationTest(object):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.5,
           metric_keys.MetricKeys.LABEL_MEAN: 0.5,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
@@ -1488,6 +1491,8 @@ class BaseLinearClassifierEvaluationTest(object):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
           metric_keys.MetricKeys.LABEL_MEAN: label_mean,
           metric_keys.MetricKeys.ACCURACY_BASELINE: (
@@ -1968,7 +1973,7 @@ class BaseLinearWarmStartingTest(object):
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
         # The provided regular expression will only warm-start the age variable
         # and not the bias.
-        warm_start_from=warm_starting_util.WarmStartSettings(
+        warm_start_from=estimator.WarmStartSettings(
             ckpt_to_initialize_from=linear_classifier.model_dir,
             vars_to_warm_start='.*(age).*'))
 
@@ -2016,7 +2021,7 @@ class BaseLinearWarmStartingTest(object):
         vocabulary_size=len(new_vocab_list))
     # We can create our VocabInfo object from the new and old occupation
     # FeatureColumn's.
-    occupation_vocab_info = warm_starting_util.VocabInfo(
+    occupation_vocab_info = estimator.VocabInfo(
         new_vocab=new_occupation.vocabulary_file,
         new_vocab_size=new_occupation.vocabulary_size,
         num_oov_buckets=new_occupation.num_oov_buckets,
@@ -2030,7 +2035,7 @@ class BaseLinearWarmStartingTest(object):
         feature_columns=[occupation],
         n_classes=4,
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=warm_starting_util.WarmStartSettings(
+        warm_start_from=estimator.WarmStartSettings(
             ckpt_to_initialize_from=linear_classifier.model_dir,
             var_name_to_vocab_info={
                 OCCUPATION_WEIGHT_NAME: occupation_vocab_info
@@ -2082,7 +2087,7 @@ class BaseLinearWarmStartingTest(object):
         optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
         # The 'age' variable correspond to the 'age_in_years' variable in the
         # previous model.
-        warm_start_from=warm_starting_util.WarmStartSettings(
+        warm_start_from=estimator.WarmStartSettings(
             ckpt_to_initialize_from=linear_classifier.model_dir,
             var_name_to_prev_var_name={
                 AGE_WEIGHT_NAME: AGE_WEIGHT_NAME.replace('age', 'age_in_years')
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index 44eb680939203fea67e3391326a6f1013f022ad5..f374d3154982e3b7cdc637e9e3606b3a2947cbf3 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -28,6 +28,8 @@ class MetricKeys(object):
   LOSS_REGULARIZATION = 'regularization_loss'
 
   ACCURACY = 'accuracy'
+  PRECISION = 'precision'
+  RECALL = 'recall'
   # This is the best the model could do by always predicting one class.
   # Should be < ACCURACY in a trained model.
   ACCURACY_BASELINE = 'accuracy_baseline'
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 1167b3834eb6a79abf670f629ec2cbc37957d191..e750e243bef4f054caa7eb2618efa79aaa705d89 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 import os
 import tempfile
@@ -29,31 +30,37 @@ import six
 from google.protobuf import message
 from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import util
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.export.export import build_all_signature_defs
 from tensorflow.python.estimator.export.export import get_temp_export_dir
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import device_setter
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import evaluation
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
+from tensorflow.python.training import warm_starting_util
 from tensorflow.python.util import compat
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util import nest
@@ -94,10 +101,6 @@ class Estimator(object):
   None of `Estimator`'s methods can be overridden in subclasses (its
   constructor enforces this). Subclasses should use `model_fn` to configure
   the base class, and may add methods implementing specialized functionality.
-
-  @compatibility(eager)
-  Estimators are not compatible with eager execution.
-  @end_compatibility
   """
 
   def __init__(self, model_fn, model_dir=None, config=None, params=None,
@@ -137,8 +140,8 @@ class Estimator(object):
                  to configure Estimators from hyper parameter tuning.
           * `config`: Optional configuration object. Will receive what is passed
                  to Estimator in `config` parameter, or the default `config`.
-                 Allows updating things in your model_fn based on configuration
-                 such as `num_ps_replicas`, or `model_dir`.
+                 Allows updating things in your `model_fn` based on
+                 configuration such as `num_ps_replicas`, or `model_dir`.
 
         * Returns:
           `EstimatorSpec`
@@ -160,15 +163,10 @@ class Estimator(object):
                        vocabularies and Tensor names are unchanged.
 
     Raises:
-      RuntimeError: If eager execution is enabled.
       ValueError: parameters of `model_fn` don't match `params`.
       ValueError: if this is called via a subclass and if that class overrides
         a member of `Estimator`.
     """
-    if context.in_eager_mode():
-      raise RuntimeError(
-          'Estimators are not supported when eager execution is enabled.')
-
     Estimator._assert_members_are_not_overridden(self)
 
     if config is None:
@@ -181,6 +179,9 @@ class Estimator(object):
             config)
       self._config = config
 
+    # The distribute field contains an instance of DistributionStrategy.
+    self._distribution = self._config.train_distribute
+
     # Model directory.
     model_dir = compat_internal.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
@@ -203,11 +204,16 @@ class Estimator(object):
     logging.info('Using config: %s', str(vars(self._config)))
 
     if self._config.session_config is None:
-      self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+      rewrite_opts = rewriter_config_pb2.RewriterConfig(
+          meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE)
+      graph_opts = config_pb2.GraphOptions(rewrite_options=rewrite_opts)
+      self._session_config = config_pb2.ConfigProto(
+          allow_soft_placement=True, graph_options=graph_opts)
     else:
       self._session_config = self._config.session_config
 
-    self._device_fn = _get_replica_device_setter(self._config)
+    self._device_fn = self._config.device_fn or \
+                      _get_replica_device_setter(self._config)
 
     if model_fn is None:
       raise ValueError('model_fn must be provided to Estimator.')
@@ -216,8 +222,8 @@ class Estimator(object):
     self._params = copy.deepcopy(params or {})
 
     # pylint: disable=protected-access
-    self._warm_start_settings = (
-        warm_starting_util._get_default_warm_start_settings(warm_start_from))
+    self._warm_start_settings = _get_default_warm_start_settings(
+        warm_start_from)
     # pylint: enable=protected-access
 
   @property
@@ -260,7 +266,8 @@ class Estimator(object):
       ValueError: If the Estimator has not produced a checkpoint yet.
     """
     _check_checkpoint_available(self.model_dir)
-    return training.load_variable(self.model_dir, name)
+    with context.graph_mode():
+      return training.load_variable(self.model_dir, name)
 
   def get_variable_names(self):
     """Returns list of all variable names in this model.
@@ -272,7 +279,8 @@ class Estimator(object):
       ValueError: If the Estimator has not produced a checkpoint yet.
     """
     _check_checkpoint_available(self.model_dir)
-    return [name for name, _ in training.list_variables(self.model_dir)]
+    with context.graph_mode():
+      return [name for name, _ in training.list_variables(self.model_dir)]
 
   def latest_checkpoint(self):
     """Finds the filename of latest saved checkpoint file in `model_dir`.
@@ -281,7 +289,8 @@ class Estimator(object):
       The full path to the latest checkpoint or `None` if no checkpoint was
       found.
     """
-    return saver.latest_checkpoint(self.model_dir)
+    with context.graph_mode():
+      return saver.latest_checkpoint(self.model_dir)
 
   def train(self,
             input_fn,
@@ -299,11 +308,11 @@ class Estimator(object):
 
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
             tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where features is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and labels is a
+          * A tuple (features, labels): Where `features` is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and `labels` is a
             `Tensor` or a dictionary of string label name to `Tensor`. Both
-            features and labels are consumed by `model_fn`. They should satisfy
-            the expectation of `model_fn` from inputs.
+            `features` and `labels` are consumed by `model_fn`. They should
+            satisfy the expectation of `model_fn` from inputs.
 
       hooks: List of `SessionRunHook` subclass instances. Used for callbacks
         inside the training loop.
@@ -333,27 +342,28 @@ class Estimator(object):
       ValueError: If both `steps` and `max_steps` are not `None`.
       ValueError: If either `steps` or `max_steps` is <= 0.
     """
-    if (steps is not None) and (max_steps is not None):
-      raise ValueError('Can not provide both steps and max_steps.')
-    if steps is not None and steps <= 0:
-      raise ValueError('Must specify steps > 0, given: {}'.format(steps))
-    if max_steps is not None and max_steps <= 0:
-      raise ValueError(
-          'Must specify max_steps > 0, given: {}'.format(max_steps))
+    with context.graph_mode():
+      if (steps is not None) and (max_steps is not None):
+        raise ValueError('Can not provide both steps and max_steps.')
+      if steps is not None and steps <= 0:
+        raise ValueError('Must specify steps > 0, given: {}'.format(steps))
+      if max_steps is not None and max_steps <= 0:
+        raise ValueError(
+            'Must specify max_steps > 0, given: {}'.format(max_steps))
 
-    if max_steps is not None:
-      start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
-      if max_steps <= start_step:
-        logging.info('Skipping training since max_steps has already saved.')
-        return self
+      if max_steps is not None:
+        start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
+        if max_steps <= start_step:
+          logging.info('Skipping training since max_steps has already saved.')
+          return self
 
-    hooks = _check_hooks_type(hooks)
-    hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))
+      hooks = _check_hooks_type(hooks)
+      hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))
 
-    saving_listeners = _check_listeners_type(saving_listeners)
-    loss = self._train_model(input_fn, hooks, saving_listeners)
-    logging.info('Loss for final step: %s.', loss)
-    return self
+      saving_listeners = _check_listeners_type(saving_listeners)
+      loss = self._train_model(input_fn, hooks, saving_listeners)
+      logging.info('Loss for final step: %s.', loss)
+      return self
 
   def _convert_train_steps_to_hooks(self, steps, max_steps):
     if steps is not None or max_steps is not None:
@@ -379,11 +389,11 @@ class Estimator(object):
 
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
             tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where features is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and labels is a
+          * A tuple (features, labels): Where `features` is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and `labels` is a
             `Tensor` or a dictionary of string label name to `Tensor`. Both
-            features and labels are consumed by `model_fn`. They should satisfy
-            the expectation of `model_fn` from inputs.
+            `features` and `labels` are consumed by `model_fn`. They should
+            satisfy the expectation of `model_fn` from inputs.
 
       steps: Number of steps for which to evaluate model. If `None`, evaluates
         until `input_fn` raises an end-of-input exception.
@@ -406,14 +416,15 @@ class Estimator(object):
       ValueError: If no model has been trained, namely `model_dir`, or the
         given `checkpoint_path` is empty.
     """
-    hooks = _check_hooks_type(hooks)
-    hooks.extend(self._convert_eval_steps_to_hooks(steps))
+    with context.graph_mode():
+      hooks = _check_hooks_type(hooks)
+      hooks.extend(self._convert_eval_steps_to_hooks(steps))
 
-    return self._evaluate_model(
-        input_fn=input_fn,
-        hooks=hooks,
-        checkpoint_path=checkpoint_path,
-        name=name)
+      return self._evaluate_model(
+          input_fn=input_fn,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          name=name)
 
   def _convert_eval_steps_to_hooks(self, steps):
     if steps is None:
@@ -455,67 +466,70 @@ class Estimator(object):
       checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
         latest checkpoint in `model_dir` is used.
       yield_single_examples: If False, yield the whole batch as returned by the
-        model_fn instead of decomposing the batch into individual elements. This
-        is useful if model_fn return some tensor with first dimension not
-        equal to the batch size
+        `model_fn` instead of decomposing the batch into individual elements.
+        This is useful if `model_fn` returns some tensors whose first dimension
+        is not equal to the batch size.
 
     Yields:
       Evaluated values of `predictions` tensors.
 
     Raises:
-      ValueError: Could not find a trained model in model_dir.
-      ValueError: if batch length of predictions are not same and
-        yield_single_examples is True.
+      ValueError: Could not find a trained model in `model_dir`.
+      ValueError: If batch length of predictions is not the same and
+        `yield_single_examples` is True.
       ValueError: If there is a conflict between `predict_keys` and
         `predictions`. For example if `predict_keys` is not `None` but
         `EstimatorSpec.predictions` is not a `dict`.
     """
-    hooks = _check_hooks_type(hooks)
-    # Check that model has been trained.
-    if not checkpoint_path:
-      checkpoint_path = saver.latest_checkpoint(self._model_dir)
-    if not checkpoint_path:
-      raise ValueError('Could not find trained model in model_dir: {}.'.format(
-          self._model_dir))
-
-    with ops.Graph().as_default() as g:
-      random_seed.set_random_seed(self._config.tf_random_seed)
-      self._create_and_assert_global_step(g)
-      features, input_hooks = self._get_features_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.PREDICT)
-      estimator_spec = self._call_model_fn(
-          features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
-      predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
-      all_hooks = list(input_hooks)
-      all_hooks.extend(hooks)
-      all_hooks.extend(list(estimator_spec.prediction_hooks or []))
-      with training.MonitoredSession(
-          session_creator=training.ChiefSessionCreator(
-              checkpoint_filename_with_path=checkpoint_path,
-              master=self._config.master,
-              scaffold=estimator_spec.scaffold,
-              config=self._session_config),
-          hooks=all_hooks) as mon_sess:
-        while not mon_sess.should_stop():
-          preds_evaluated = mon_sess.run(predictions)
-          if not yield_single_examples:
-            yield preds_evaluated
-          elif not isinstance(predictions, dict):
-            for pred in preds_evaluated:
-              yield pred
-          else:
-            for i in range(self._extract_batch_length(preds_evaluated)):
-              yield {
-                  key: value[i]
-                  for key, value in six.iteritems(preds_evaluated)
-              }
+    with context.graph_mode():
+      hooks = _check_hooks_type(hooks)
+      # Check that model has been trained.
+      if not checkpoint_path:
+        checkpoint_path = saver.latest_checkpoint(self._model_dir)
+      if not checkpoint_path:
+        raise ValueError(
+            'Could not find trained model in model_dir: {}.'.format(
+                self._model_dir))
+
+      with ops.Graph().as_default() as g:
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        self._create_and_assert_global_step(g)
+        features, input_hooks = self._get_features_from_input_fn(
+            input_fn, model_fn_lib.ModeKeys.PREDICT)
+        estimator_spec = self._call_model_fn(
+            features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
+        predictions = self._extract_keys(
+            estimator_spec.predictions, predict_keys)
+        all_hooks = list(input_hooks)
+        all_hooks.extend(hooks)
+        all_hooks.extend(list(estimator_spec.prediction_hooks or []))
+        with training.MonitoredSession(
+            session_creator=training.ChiefSessionCreator(
+                checkpoint_filename_with_path=checkpoint_path,
+                master=self._config.master,
+                scaffold=estimator_spec.scaffold,
+                config=self._session_config),
+            hooks=all_hooks) as mon_sess:
+          while not mon_sess.should_stop():
+            preds_evaluated = mon_sess.run(predictions)
+            if not yield_single_examples:
+              yield preds_evaluated
+            elif not isinstance(predictions, dict):
+              for pred in preds_evaluated:
+                yield pred
+            else:
+              for i in range(self._extract_batch_length(preds_evaluated)):
+                yield {
+                    key: value[i]
+                    for key, value in six.iteritems(preds_evaluated)
+                }
 
   def _assert_members_are_not_overridden(self):
     """Asserts members of `Estimator` are not overridden."""
     allowed_overrides = set([
         '_call_input_fn', '_create_global_step',
         '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
-        '_tf_api_names'
+        '_tf_api_names', '_validate_features_in_predict_input'
     ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
@@ -570,7 +584,7 @@ class Estimator(object):
       export_dir_base: A string containing a directory in which to create
         timestamped subdirectories containing exported SavedModels.
       serving_input_receiver_fn: A function that takes no argument and
-        returns a `ServingInputReceiver`.
+        returns a `ServingInputReceiver` or `TensorServingInputReceiver`.
       assets_extra: A dict specifying how to populate the assets.extra directory
         within the exported SavedModel, or `None` if no extra assets are needed.
       as_text: whether to write the SavedModel proto in text format.
@@ -588,73 +602,73 @@ class Estimator(object):
           are provided, or no checkpoint can be found.
     """
     # pylint: enable=line-too-long
-    if serving_input_receiver_fn is None:
-      raise ValueError('serving_input_receiver_fn must be defined.')
-
-    with ops.Graph().as_default() as g:
-      self._create_and_assert_global_step(g)
-      random_seed.set_random_seed(self._config.tf_random_seed)
-      serving_input_receiver = serving_input_receiver_fn()
-
-      # Call the model_fn and collect the export_outputs.
-      estimator_spec = self._call_model_fn(
-          features=serving_input_receiver.features,
-          labels=None,
-          mode=model_fn_lib.ModeKeys.PREDICT,
-          config=self.config)
-
-      # Build the SignatureDefs from receivers and all outputs
-      signature_def_map = build_all_signature_defs(
-          serving_input_receiver.receiver_tensors,
-          estimator_spec.export_outputs,
-          serving_input_receiver.receiver_tensors_alternatives)
-
-      if not checkpoint_path:
-        # Locate the latest checkpoint
-        checkpoint_path = saver.latest_checkpoint(self._model_dir)
-      if not checkpoint_path:
-        raise ValueError("Couldn't find trained model at %s." % self._model_dir)
-
-      export_dir = get_timestamped_export_dir(export_dir_base)
-      temp_export_dir = get_temp_export_dir(export_dir)
-
-      # TODO(soergel): Consider whether MonitoredSession makes sense here
-      with tf_session.Session(config=self._session_config) as session:
-
-        saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
-            sharded=True)
-        saver_for_restore.restore(session, checkpoint_path)
-
-        # pylint: disable=protected-access
-        local_init_op = (
-            estimator_spec.scaffold.local_init_op or
-            monitored_session.Scaffold._default_local_init_op())
-        # pylint: enable=protected-access
-
-        # Perform the export
-        builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
-        builder.add_meta_graph_and_variables(
-            session, [tag_constants.SERVING],
-            signature_def_map=signature_def_map,
-            assets_collection=ops.get_collection(
-                ops.GraphKeys.ASSET_FILEPATHS),
-            legacy_init_op=local_init_op,
-            strip_default_attrs=strip_default_attrs)
-        builder.save(as_text)
-
-      # Add the extra assets
-      if assets_extra:
-        assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                         compat.as_bytes('assets.extra'))
-        for dest_relative, source in assets_extra.items():
-          dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                       compat.as_bytes(dest_relative))
-          dest_path = os.path.dirname(dest_absolute)
-          gfile.MakeDirs(dest_path)
-          gfile.Copy(source, dest_absolute)
-
-      gfile.Rename(temp_export_dir, export_dir)
-      return export_dir
+    with context.graph_mode():
+      if serving_input_receiver_fn is None:
+        raise ValueError('serving_input_receiver_fn must be defined.')
+
+      with ops.Graph().as_default() as g:
+        self._create_and_assert_global_step(g)
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        serving_input_receiver = serving_input_receiver_fn()
+
+        # Call the model_fn and collect the export_outputs.
+        estimator_spec = self._call_model_fn(
+            features=serving_input_receiver.features,
+            labels=None,
+            mode=model_fn_lib.ModeKeys.PREDICT,
+            config=self.config)
+
+        # Build the SignatureDefs from receivers and all outputs
+        signature_def_map = build_all_signature_defs(
+            serving_input_receiver.receiver_tensors,
+            estimator_spec.export_outputs,
+            serving_input_receiver.receiver_tensors_alternatives)
+
+        if not checkpoint_path:
+          # Locate the latest checkpoint
+          checkpoint_path = saver.latest_checkpoint(self._model_dir)
+        if not checkpoint_path:
+          raise ValueError(
+              "Couldn't find trained model at %s." % self._model_dir)
+
+        export_dir = get_timestamped_export_dir(export_dir_base)
+        temp_export_dir = get_temp_export_dir(export_dir)
+
+        # TODO(soergel): Consider whether MonitoredSession makes sense here
+        with tf_session.Session(config=self._session_config) as session:
+
+          saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
+              sharded=True)
+          saver_for_restore.restore(session, checkpoint_path)
+
+          local_init_op = (
+              estimator_spec.scaffold.local_init_op or
+              monitored_session.Scaffold.default_local_init_op())
+
+          # Perform the export
+          builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
+          builder.add_meta_graph_and_variables(
+              session, [tag_constants.SERVING],
+              signature_def_map=signature_def_map,
+              assets_collection=ops.get_collection(
+                  ops.GraphKeys.ASSET_FILEPATHS),
+              legacy_init_op=local_init_op,
+              strip_default_attrs=strip_default_attrs)
+          builder.save(as_text)
+
+        # Add the extra assets
+        if assets_extra:
+          assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
+                                           compat.as_bytes('assets.extra'))
+          for dest_relative, source in assets_extra.items():
+            dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
+                                         compat.as_bytes(dest_relative))
+            dest_path = os.path.dirname(dest_absolute)
+            gfile.MakeDirs(dest_path)
+            gfile.Copy(source, dest_absolute)
+
+        gfile.Rename(temp_export_dir, export_dir)
+        return export_dir
 
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
@@ -668,20 +682,30 @@ class Estimator(object):
       # Unconditionally drop the label (the second element of result).
       result = result[0]
 
+    self._validate_features_in_predict_input(result)
+    return result, input_hooks
+
+  def _validate_features_in_predict_input(self, result):
     if not _has_dataset_or_queue_runner(result):
       logging.warning('Input graph does not use tf.data.Dataset or contain a '
                       'QueueRunner. That means predict yields forever. '
                       'This is probably a mistake.')
-    return result, input_hooks
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
     """Extracts the `features` and labels from return values of `input_fn`."""
-    result = self._call_input_fn(input_fn, mode)
     input_hooks = []
-    if isinstance(result, dataset_ops.Dataset):
+    if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
+      result = self._distribution.distribute_dataset(
+          lambda: self._call_input_fn(input_fn, mode))
       iterator = result.make_initializable_iterator()
       input_hooks.append(_DatasetInitializerHook(iterator))
       result = iterator.get_next()
+    else:
+      result = self._call_input_fn(input_fn, mode)
+      if isinstance(result, dataset_ops.Dataset):
+        iterator = result.make_initializable_iterator()
+        input_hooks.append(_DatasetInitializerHook(iterator))
+        result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
@@ -696,7 +720,7 @@ class Estimator(object):
       batch_length = batch_length or value.shape[0]
       if value.shape[0] != batch_length:
         raise ValueError('Batch length of predictions should be same. %s has '
-                         'different batch length then others.' % key)
+                         'different batch length than others.' % key)
     return batch_length
 
   def _extract_keys(self, predictions, predict_keys):
@@ -720,7 +744,7 @@ class Estimator(object):
     """Creates the global step tensor in graph.
 
     The global step tensor must be an integer type with name 'global_step' and
-    be added to the collection ${tf.GraphKeys.GLOBAL_STEP}.
+    be added to the collection @{tf.GraphKeys.GLOBAL_STEP}.
 
     Args:
       graph: The graph in which to create the global step tensor.
@@ -810,6 +834,12 @@ class Estimator(object):
     return model_fn_results
 
   def _train_model(self, input_fn, hooks, saving_listeners):
+    if self._distribution:
+      return self._train_model_distributed(input_fn, hooks, saving_listeners)
+    else:
+      return self._train_model_default(input_fn, hooks, saving_listeners)
+
+  def _train_model_default(self, input_fn, hooks, saving_listeners):
     worker_hooks = []
     with ops.Graph().as_default() as g, g.device(self._device_fn):
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -821,86 +851,210 @@ class Estimator(object):
       worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+      return self._train_with_estimator_spec(estimator_spec, worker_hooks,
+                                             hooks, global_step_tensor,
+                                             saving_listeners)
 
-      if self._warm_start_settings:
-        logging.info('Warm-starting with WarmStartSettings: %s' %
-                     (self._warm_start_settings,))
-        # pylint: disable=protected-access
-        warm_starting_util._warm_start(self._warm_start_settings)
-        # pylint: enable=protected-access
-      # Check if the user created a loss summary, and add one if they didn't.
-      # We assume here that the summary is called 'loss'. If it is not, we will
-      # make another one with the name 'loss' to ensure it shows up in the right
-      # graph in TensorBoard.
-      if not any([x.op.name == 'loss'
-                  for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
-        summary.scalar('loss', estimator_spec.loss)
-      ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
-      worker_hooks.extend(hooks)
-      worker_hooks.extend([
-          training.NanTensorHook(estimator_spec.loss),
-          training.LoggingTensorHook(
-              {
-                  'loss': estimator_spec.loss,
-                  'step': global_step_tensor
-              },
-              every_n_iter=100)
-      ])
-      worker_hooks.extend(estimator_spec.training_hooks)
-
-      if not (estimator_spec.scaffold.saver or
-              ops.get_collection(ops.GraphKeys.SAVERS)):
-        ops.add_to_collection(
-            ops.GraphKeys.SAVERS,
-            training.Saver(
-                sharded=True,
-                max_to_keep=self._config.keep_checkpoint_max,
-                keep_checkpoint_every_n_hours=(
-                    self._config.keep_checkpoint_every_n_hours),
-                defer_build=True,
-                save_relative_paths=True))
-
-      chief_hooks = []
-      all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
-      saver_hooks = [
-          h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
-      if (self._config.save_checkpoints_secs or
-          self._config.save_checkpoints_steps):
-        if not saver_hooks:
-          chief_hooks = [
-              training.CheckpointSaverHook(
-                  self._model_dir,
-                  save_secs=self._config.save_checkpoints_secs,
-                  save_steps=self._config.save_checkpoints_steps,
-                  scaffold=estimator_spec.scaffold)
-          ]
-          saver_hooks = [chief_hooks[0]]
-      if saving_listeners:
-        if not saver_hooks:
-          raise ValueError(
-              'There should be a CheckpointSaverHook to use saving_listeners. '
-              'Please set one of the RunConfig.save_checkpoints_steps or '
-              'RunConfig.save_checkpoints_secs.')
+  def _train_model_distributed(self, input_fn, hooks, saving_listeners):
+    self._distribution.configure(self._session_config)
+    worker_hooks = []
+    with ops.Graph().as_default() as g:
+      with self._distribution.scope():
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        features, labels, input_hooks = (
+            self._get_features_and_labels_from_input_fn(
+                input_fn, model_fn_lib.ModeKeys.TRAIN))
+        worker_hooks.extend(input_hooks)
+        global_step_tensor = self._create_and_assert_global_step(g)
+        # The default destination for the global_step_tensor fetch call is the
+        # CPU.
+        global_step_read_tensor = self._distribution.fetch(global_step_tensor)
+        # we want to add to the global collection in the main thread not the
+        # tower threads.
+        ops.add_to_collection(training_util.GLOBAL_STEP_READ_KEY,
+                              global_step_read_tensor)
+        grouped_estimator_spec = self._distribution.call_for_each_tower(
+            self._call_model_fn,
+            features,
+            labels,  # although this will be None it seems
+            model_fn_lib.ModeKeys.TRAIN,
+            self.config)
+
+        # TODO(anjalisridhar): Figure out how to resolve the folowing scaffold
+        # parameters: init_feed_dict, init_fn.
+        scaffold_list = self._distribution.unwrap(
+            grouped_estimator_spec.scaffold)
+        init_feed_dict = [
+            s.init_feed_dict
+            for s in scaffold_list
+            if s.init_feed_dict is not None
+        ]
+        if init_feed_dict:
+          init_feed_dict = self._distribution.group(init_feed_dict)
         else:
-          # It is expected to have one CheckpointSaverHook. If multiple, we pick
-          # up the first one to add listener.
-          saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
-      with training.MonitoredTrainingSession(
-          master=self._config.master,
-          is_chief=self._config.is_chief,
-          checkpoint_dir=self._model_dir,
-          scaffold=estimator_spec.scaffold,
-          hooks=worker_hooks,
-          chief_only_hooks=(
-              tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
-          save_checkpoint_secs=0,  # Saving is handled by a hook.
-          save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config,
-          log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
-        loss = None
-        while not mon_sess.should_stop():
-          _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
-      return loss
+          init_feed_dict = None
+
+        init_fn = [s.init_fn for s in scaffold_list if s.init_fn is not None]
+        if init_fn:
+          init_fn = self._distribution.group(init_fn)
+        else:
+          init_fn = None
+
+        init_op = [s.init_op for s in scaffold_list if s.init_op is not None]
+        if init_op:
+          init_op = self._distribution.group(init_op)
+        else:
+          init_op = None
+
+        ready_op = self._distribution.call_for_each_tower(
+            create_per_tower_ready_op, grouped_estimator_spec.scaffold)
+        if ready_op is not None:
+          ready_op = self._distribution.group(ready_op)
+        else:
+          ready_op = None
+
+        ready_for_local_init_op = self._distribution.call_for_each_tower(
+            create_per_tower_ready_for_local_init_op,
+            grouped_estimator_spec.scaffold)
+        if ready_for_local_init_op is not None:
+          ready_for_local_init_op = self._distribution.group(
+              ready_for_local_init_op)
+        else:
+          ready_for_local_init_op = None
+
+        local_init_op = [
+            s.local_init_op
+            for s in scaffold_list
+            if s.local_init_op is not None
+        ]
+        if local_init_op:
+          local_init_op = self._distribution.group(local_init_op)
+        else:
+          local_init_op = None
+
+        summary_op = [
+            s.summary_op for s in scaffold_list if s.summary_op is not None
+        ]
+        if summary_op:
+          summary_op = self._distribution.group(summary_op)
+        else:
+          summary_op = None
+
+        scaffold = monitored_session.Scaffold(
+            init_op=init_op,
+            ready_op=ready_op,
+            ready_for_local_init_op=ready_for_local_init_op,
+            local_init_op=local_init_op,
+            summary_op=summary_op,
+            init_feed_dict=init_feed_dict,
+            init_fn=init_fn)
+
+        def get_hooks_from_the_first_device(per_device_hooks):
+          hooks_list = self._distribution.unwrap(per_device_hooks)
+          assert hooks_list
+          return hooks_list[0]
+
+        training_hooks = get_hooks_from_the_first_device(
+            grouped_estimator_spec.training_hooks)
+        training_chief_hooks = get_hooks_from_the_first_device(
+            grouped_estimator_spec.training_chief_hooks)
+
+        estimator_spec = model_fn_lib.EstimatorSpec(
+            mode=grouped_estimator_spec.mode,
+            loss=self._distribution.unwrap(
+                self._distribution.reduce(distribute_lib.get_loss_reduction(),
+                                          grouped_estimator_spec.loss,
+                                          destinations='/device:CPU:0'))[0],
+            train_op=self._distribution.group(grouped_estimator_spec.train_op),
+            training_hooks=training_hooks,
+            training_chief_hooks=training_chief_hooks,
+            scaffold=scaffold)
+        return self._train_with_estimator_spec(estimator_spec, worker_hooks,
+                                               hooks, global_step_read_tensor,
+                                               saving_listeners)
+
+  def _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks,
+                                 global_step_tensor, saving_listeners):
+    """Train a model with the given Estimator Spec."""
+    if self._warm_start_settings:
+      logging.info('Warm-starting with WarmStartSettings: %s' %
+                   (self._warm_start_settings,))
+      # pylint: disable=protected-access
+      warm_starting_util.warm_start(*self._warm_start_settings)
+      # pylint: enable=protected-access
+    # Check if the user created a loss summary, and add one if they didn't.
+    # We assume here that the summary is called 'loss'. If it is not, we will
+    # make another one with the name 'loss' to ensure it shows up in the right
+    # graph in TensorBoard.
+    if not any([x.op.name == 'loss'
+                for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
+      summary.scalar('loss', estimator_spec.loss)
+    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
+    worker_hooks.extend(hooks)
+    worker_hooks.extend([
+        training.NanTensorHook(estimator_spec.loss),
+        training.LoggingTensorHook(
+            {
+                'loss': estimator_spec.loss,
+                'step': global_step_tensor
+            },
+            every_n_iter=self._config.log_step_count_steps)
+    ])
+    worker_hooks.extend(estimator_spec.training_hooks)
+
+    if not (estimator_spec.scaffold.saver or
+            ops.get_collection(ops.GraphKeys.SAVERS)):
+      ops.add_to_collection(
+          ops.GraphKeys.SAVERS,
+          training.Saver(
+              sharded=True,
+              max_to_keep=self._config.keep_checkpoint_max,
+              keep_checkpoint_every_n_hours=(
+                  self._config.keep_checkpoint_every_n_hours),
+              defer_build=True,
+              save_relative_paths=True))
+
+    chief_hooks = []
+    all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
+    saver_hooks = [
+        h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
+    if (self._config.save_checkpoints_secs or
+        self._config.save_checkpoints_steps):
+      if not saver_hooks:
+        chief_hooks = [
+            training.CheckpointSaverHook(
+                self._model_dir,
+                save_secs=self._config.save_checkpoints_secs,
+                save_steps=self._config.save_checkpoints_steps,
+                scaffold=estimator_spec.scaffold)
+        ]
+        saver_hooks = [chief_hooks[0]]
+    if saving_listeners:
+      if not saver_hooks:
+        raise ValueError(
+            'There should be a CheckpointSaverHook to use saving_listeners. '
+            'Please set one of the RunConfig.save_checkpoints_steps or '
+            'RunConfig.save_checkpoints_secs.')
+      else:
+        # It is expected to have one CheckpointSaverHook. If multiple, we pick
+        # up the first one to add listener.
+        saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
+    with training.MonitoredTrainingSession(
+        master=self._config.master,
+        is_chief=self._config.is_chief,
+        checkpoint_dir=self._model_dir,
+        scaffold=estimator_spec.scaffold,
+        hooks=worker_hooks,
+        chief_only_hooks=(
+            tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
+        save_checkpoint_secs=0,  # Saving is handled by a hook.
+        save_summaries_steps=self._config.save_summary_steps,
+        config=self._session_config,
+        log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
+      loss = None
+      while not mon_sess.should_stop():
+        _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
+    return loss
 
   def _evaluate_model(self,
                       input_fn,
@@ -967,6 +1121,35 @@ class Estimator(object):
     return eval_results
 
 
+def create_per_tower_ready_op(scaffold):
+  """Create a Scaffold.ready_op inside a tower."""
+  if scaffold.ready_op:
+    return scaffold.ready_op
+
+  def default_ready_op():
+    return array_ops.concat([
+        variables.report_uninitialized_variables(),
+        resources.report_uninitialized_resources()
+    ], 0)
+
+  return monitored_session.Scaffold.get_or_default(
+      'ready_op', ops.GraphKeys.READY_OP, default_ready_op)
+
+
+def create_per_tower_ready_for_local_init_op(scaffold):
+  """Create a Scaffold.ready_for_local_init_op inside a tower."""
+  if scaffold.ready_for_local_init_op:
+    return scaffold.ready_for_local_init_op
+
+  def default_ready_for_local_init_op():
+    return variables.report_uninitialized_variables(
+        variables.global_variables())
+
+  return monitored_session.Scaffold.get_or_default(
+      'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
+      default_ready_for_local_init_op)
+
+
 def _check_checkpoint_available(model_dir):
   latest_path = saver.latest_checkpoint(model_dir)
   if not latest_path:
@@ -1007,13 +1190,6 @@ def _get_replica_device_setter(config):
   Returns:
     A replica device setter, or None.
   """
-  ps_ops = [
-      'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable',
-      'MutableHashTableV2', 'MutableHashTableOfTensors',
-      'MutableHashTableOfTensorsV2', 'MutableDenseHashTable',
-      'MutableDenseHashTableV2', 'VarHandleOp'
-  ]
-
   if config.task_type:
     worker_device = '/job:%s/task:%d' % (config.task_type, config.task_id)
   else:
@@ -1024,7 +1200,7 @@ def _get_replica_device_setter(config):
         ps_tasks=config.num_ps_replicas,
         worker_device=worker_device,
         merge_devices=True,
-        ps_ops=ps_ops,
+        ps_ops=list(device_setter.STANDARD_PS_OPS),
         cluster=config.cluster_spec)
   else:
     return None
@@ -1085,7 +1261,8 @@ def _dict_to_str(dictionary):
     A `str` representing the `dictionary`.
   """
   return ', '.join('%s = %s' % (k, v)
-                   for k, v in sorted(six.iteritems(dictionary)))
+                   for k, v in sorted(six.iteritems(dictionary))
+                   if not isinstance(v, six.binary_type))
 
 
 def _write_dict_to_summary(output_dir,
@@ -1118,7 +1295,7 @@ def _write_dict_to_summary(output_dir,
       try:
         summ = summary_pb2.Summary.FromString(dictionary[key])
         for i, _ in enumerate(summ.value):
-          summ.value[i].tag = key
+          summ.value[i].tag = '%s/%d' % (key, i)
         summary_proto.value.extend(summ.value)
       except message.DecodeError:
         logging.warn('Skipping summary for %s, cannot parse string to Summary.',
@@ -1155,3 +1332,187 @@ class _DatasetInitializerHook(training.SessionRunHook):
   def after_create_session(self, session, coord):
     del coord
     session.run(self._initializer)
+
+VocabInfo = warm_starting_util.VocabInfo  # pylint: disable=invalid-name
+
+
+@tf_export('estimator.WarmStartSettings')
+class WarmStartSettings(
+    collections.namedtuple('WarmStartSettings', [
+        'ckpt_to_initialize_from',
+        'vars_to_warm_start',
+        'var_name_to_vocab_info',
+        'var_name_to_prev_var_name',
+    ])):
+  """Settings for warm-starting in Estimators.
+
+  Example Use with canned `DNNEstimator`:
+
+  ```
+  emb_vocab_file = tf.feature_column.embedding_column(
+      tf.feature_column.categorical_column_with_vocabulary_file(
+          "sc_vocab_file", "new_vocab.txt", vocab_size=100),
+      dimension=8)
+  emb_vocab_list = tf.feature_column.embedding_column(
+      tf.feature_column.categorical_column_with_vocabulary_list(
+          "sc_vocab_list", vocabulary_list=["a", "b"]),
+      dimension=8)
+  estimator = tf.estimator.DNNClassifier(
+    hidden_units=[128, 64], feature_columns=[emb_vocab_file, emb_vocab_list],
+    warm_start_from=ws)
+  ```
+
+  where `ws` could be defined as:
+
+  Warm-start all weights in the model (input layer and hidden weights).
+  Either the directory or a specific checkpoint can be provided (in the case
+  of the former, the latest checkpoint will be used):
+
+  ```
+  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp")
+  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp/model-1000")
+  ```
+
+  Warm-start only the embeddings (input layer):
+
+  ```
+  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp",
+                         vars_to_warm_start=".*input_layer.*")
+  ```
+
+  Warm-start all weights but the embedding parameters corresponding to
+  `sc_vocab_file` have a different vocab from the one used in the current
+  model:
+
+  ```
+  vocab_info = tf.estimator.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt"
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      })
+  ```
+
+  Warm-start only `sc_vocab_file` embeddings (and no other variables), which
+  have a different vocab from the one used in the current model:
+
+  ```
+  vocab_info = tf.estimator.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt"
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      vars_to_warm_start=None,
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      })
+  ```
+
+  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
+  have a different vocab from the one used in current checkpoint, and only
+  100 of those entries were used:
+
+  ```
+  vocab_info = tf.estimator.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt",
+      old_vocab_size=100
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      })
+  ```
+
+  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
+  have a different vocab from the one used in current checkpoint and the
+  parameters corresponding to `sc_vocab_list` have a different name from the
+  current checkpoint:
+
+  ```
+  vocab_info = tf.estimator.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt",
+      old_vocab_size=100
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      },
+      var_name_to_prev_var_name={
+          "input_layer/sc_vocab_list_embedding/embedding_weights":
+              "old_tensor_name"
+      })
+  ```
+
+  Attributes:
+    ckpt_to_initialize_from: [Required] A string specifying the directory with
+      checkpoint file(s) or path to checkpoint from which to warm-start the
+      model parameters.
+    vars_to_warm_start: [Optional] A regular expression that captures which
+      variables to warm-start (see tf.get_collection).  Defaults to `'.*'`,
+      which warm-starts all variables.  If `None` is explicitly given, only
+      variables specified in `var_name_to_vocab_info` will be warm-started.
+    var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
+      VocabInfo. The variable names should be "full" variables, not the names
+      of the partitions.  If not explicitly provided, the variable is assumed to
+      have no vocabulary.
+    var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
+      name of the previously-trained variable in `ckpt_to_initialize_from`. If
+      not explicitly provided, the name of the variable is assumed to be same
+      between previous checkpoint and current model.
+  """
+
+  def __new__(cls,
+              ckpt_to_initialize_from,
+              vars_to_warm_start='.*',
+              var_name_to_vocab_info=None,
+              var_name_to_prev_var_name=None):
+    if not ckpt_to_initialize_from:
+      raise ValueError(
+          '`ckpt_to_initialize_from` MUST be set in WarmStartSettings')
+    return super(WarmStartSettings, cls).__new__(
+        cls,
+        ckpt_to_initialize_from,
+        vars_to_warm_start,
+        var_name_to_vocab_info or {},
+        var_name_to_prev_var_name or {},
+    )
+
+
+def _get_default_warm_start_settings(warm_start_from):
+  """Returns default WarmStartSettings.
+
+  Args:
+    warm_start_from: Either a string representing the filepath of a checkpoint
+      to initialize from, or an instance of WarmStartSettings.
+
+  Returns:
+    Either None or an instance of WarmStartSettings.
+
+  Raises:
+    ValueError: If warm_start_from is not None but is neither a string nor an
+      instance of WarmStartSettings.
+  """
+  if warm_start_from is None:
+    return None
+  if isinstance(warm_start_from, six.string_types):
+    return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
+  elif isinstance(warm_start_from, WarmStartSettings):
+    return warm_start_from
+  else:
+    raise ValueError('warm_start_from must be a string or a WarmStartSettings')
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index 01699e7399c4089281e9ece76e534e1f82692257..3815f4247056444f85996c62a2a4e3fff03e3f2a 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.estimator.canned.baseline import BaselineClassifier
 from tensorflow.python.estimator.canned.baseline import BaselineRegressor
+from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesClassifier
+from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesRegressor
 from tensorflow.python.estimator.canned.dnn import DNNClassifier
 from tensorflow.python.estimator.canned.dnn import DNNRegressor
 from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
@@ -30,6 +32,8 @@ from tensorflow.python.estimator.canned.linear import LinearRegressor
 from tensorflow.python.estimator.canned.parsing_utils import classifier_parse_example_spec
 from tensorflow.python.estimator.canned.parsing_utils import regressor_parse_example_spec
 from tensorflow.python.estimator.estimator import Estimator
+from tensorflow.python.estimator.estimator import VocabInfo
+from tensorflow.python.estimator.estimator import WarmStartSettings
 from tensorflow.python.estimator.export import export_lib as export
 from tensorflow.python.estimator.exporter import Exporter
 from tensorflow.python.estimator.exporter import FinalExporter
@@ -41,47 +45,6 @@ from tensorflow.python.estimator.run_config import RunConfig
 from tensorflow.python.estimator.training import EvalSpec
 from tensorflow.python.estimator.training import train_and_evaluate
 from tensorflow.python.estimator.training import TrainSpec
-from tensorflow.python.estimator.warm_starting_util import VocabInfo
-from tensorflow.python.estimator.warm_starting_util import WarmStartSettings
 
 
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    # Canned Estimators
-    'BaselineClassifier',
-    'BaselineRegressor',
-    'DNNClassifier',
-    'DNNRegressor',
-    'DNNLinearCombinedClassifier',
-    'DNNLinearCombinedRegressor',
-    'LinearClassifier',
-    'LinearRegressor',
-
-    # I/O
-    'classifier_parse_example_spec',
-    'regressor_parse_example_spec',
-    'inputs',
-    'export',
-
-    # Estimator
-    'Estimator',
-    'EstimatorSpec',
-    'ModeKeys',
-    'RunConfig',
-
-    # Training utilities
-    'train_and_evaluate',
-    'EvalSpec',
-    'TrainSpec',
-    'Exporter',
-    'LatestExporter',
-    'FinalExporter',
-
-    # Warm-starting
-    'WarmStartSettings',
-    'VocabInfo',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index b0a7752ec74913c959fc176e3eb9001f7418b4a2..0fea86124cc58a339621d4c7b857f10a0a1d889a 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -48,6 +48,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
@@ -678,8 +679,10 @@ class EstimatorTrainTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
+    # TODO(b/78461127): Please modify tests to not directly rely on names of
+    # checkpoints.
     self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+        ['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
@@ -1268,10 +1271,10 @@ class EstimatorEvaluateTest(test.TestCase):
       _, _ = features, labels
       global_step = training.get_global_step()
 
-      image = array_ops.zeros([1, 3, 3, 1])
+      image = array_ops.zeros([5, 3, 3, 1])
       eval_metric_ops = {
-          'image': (summary.image('image', image, max_outputs=1),
-                    constant_op.constant(1))
+          'foo': (summary.image('image', image, max_outputs=3),
+                  constant_op.constant(1))
       }
       return model_fn_lib.EstimatorSpec(
           mode,
@@ -1291,10 +1294,10 @@ class EstimatorEvaluateTest(test.TestCase):
     writer_cache.FileWriterCache.clear()
 
     # Get last evaluation Event written.
-    if check_eventfile_for_keyword('image', 
-                                   os.path.join(est.model_dir, 'eval')):
-      return
-    self.fail('{} should be part of reported summaries.'.format('image'))
+    for key in ['foo/0', 'foo/1', 'foo/2']:
+      self.assertTrue(
+          check_eventfile_for_keyword(key, os.path.join(est.model_dir, 'eval')),
+          '{} should be part of reported summaries.'.format(key))
 
 
 class EstimatorPredictTest(test.TestCase):
@@ -1936,6 +1939,60 @@ class EstimatorExportTest(test.TestCase):
     # cleanup
     gfile.DeleteRecursively(tmpdir)
 
+  def test_export_savedmodel_tensor_features(self):
+    """Test that models accepting a single raw Tensor can be exported.
+
+    See https://github.com/tensorflow/tensorflow/issues/11674
+
+    If the model_fn and receiver_fn accept raw tensors rather than dictionaries
+    as input, export_savedmodel should be okay with that, too.
+
+    """
+
+    tmpdir = tempfile.mkdtemp()
+
+    def _input_fn_tensor_features():
+      t = array_ops.constant([1, 2, 3], dtype=dtypes.float32, shape=[1, 3])
+      return (t, None)
+
+    def _model_fn_tensor_features(features, labels, mode):
+      _ = labels
+      prediction = math_ops.matmul(features, features, transpose_b=True)
+
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=prediction,
+          loss=constant_op.constant(1.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          export_outputs={
+              'test': export_output.PredictOutput({'prediction': prediction})
+          })
+
+    def _serving_input_receiver_fn():
+      feat = array_ops.placeholder(dtype=dtypes.float32)
+      return export.TensorServingInputReceiver(
+          features=feat, receiver_tensors=feat)
+
+    est = estimator.Estimator(model_fn=_model_fn_tensor_features)
+    est.train(input_fn=_input_fn_tensor_features, steps=1)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = est.export_savedmodel(
+        export_dir_base, _serving_input_receiver_fn)
+
+    # Restore, to validate that the export was well-formed.
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name.lower() for x in graph.get_operations()]
+        self.assertTrue('const' in graph_ops)
+        self.assertTrue('matmul' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
   def test_scaffold_is_used_for_saver(self):
     tmpdir = tempfile.mkdtemp()
 
@@ -2232,6 +2289,7 @@ class EstimatorHookOrderingTest(test.TestCase):
 
 class EstimatorIntegrationTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_complete_flow_with_a_simple_linear_model(self):
 
     def _model_fn(features, labels, mode):
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 83251c79fc561e16ebddb638668b92b3c69b8af4..41c1f5a2e25cd6ba1e93744b5b82ecc34c4375d0 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -21,17 +21,16 @@ from __future__ import print_function
 
 import collections
 import os
-import time
 
 import six
 
+from tensorflow.python.estimator import util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
@@ -75,8 +74,20 @@ class ServingInputReceiver(collections.namedtuple(
         raise ValueError('feature keys must be strings: {}.'.format(name))
       if not (isinstance(tensor, ops.Tensor)
               or isinstance(tensor, sparse_tensor.SparseTensor)):
-        raise ValueError(
+        value_error = ValueError(
             'feature {} must be a Tensor or SparseTensor.'.format(name))
+        # NOTE(ericmc): This if-else block is a specific carve-out for
+        # LabeledTensor, which has a `.tensor` attribute and which is
+        # convertible to tf.Tensor via ops.convert_to_tensor.
+        # Allowing all types convertible to tf.Tensor is considered by soergel@
+        # to be too permissive.
+        if hasattr(tensor, 'tensor'):
+          try:
+            ops.convert_to_tensor(tensor)
+          except TypeError:
+            raise value_error
+        else:
+          raise value_error
 
     if receiver_tensors is None:
       raise ValueError('receiver_tensors must be defined.')
@@ -120,6 +131,62 @@ class ServingInputReceiver(collections.namedtuple(
         receiver_tensors_alternatives=receiver_tensors_alternatives)
 
 
+@tf_export('estimator.export.TensorServingInputReceiver')
+class TensorServingInputReceiver(collections.namedtuple(
+    'TensorServingInputReceiver',
+    ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
+  """A return type for a serving_input_receiver_fn.
+
+  This is for use with models that expect a single `Tensor` or `SparseTensor`
+  as an input feature, as opposed to a dict of features.
+
+  The normal `ServingInputReceiver` always returns a feature dict, even if it
+  contains only one entry, and so can be used only with models that accept such
+  a dict.  For models that accept only a single raw feature, the
+  `serving_input_receiver_fn` provided to `Estimator.export_savedmodel()` should
+  return this `TensorServingInputReceiver` instead.  See:
+  https://github.com/tensorflow/tensorflow/issues/11674
+
+  Note that the receiver_tensors and receiver_tensor_alternatives arguments
+  will be automatically converted to the dict representation in either case,
+  because the SavedModel format requires each input `Tensor` to have a name
+  (provided by the dict key).
+
+  The expected return values are:
+    features: A single `Tensor` or `SparseTensor`, representing the feature
+      to be passed to the model.
+    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
+      input nodes where this receiver expects to be fed by default.  Typically,
+      this is a single placeholder expecting serialized `tf.Example` protos.
+    receiver_tensors_alternatives: a dict of string to additional
+      groups of receiver tensors, each of which may be a `Tensor` or a dict of
+      string to `Tensor`.  These named receiver tensor alternatives generate
+      additional serving signatures, which may be used to feed inputs at
+      different points within the input receiver subgraph.  A typical usage is
+      to allow feeding raw feature `Tensor`s *downstream* of the
+      tf.parse_example() op.  Defaults to None.
+  """
+
+  def __new__(cls, features, receiver_tensors,
+              receiver_tensors_alternatives=None):
+    if features is None:
+      raise ValueError('features must be defined.')
+    if not (isinstance(features, ops.Tensor)
+            or isinstance(features, sparse_tensor.SparseTensor)):
+      raise ValueError('feature must be a Tensor or SparseTensor.')
+
+    receiver = ServingInputReceiver(
+        features=features,
+        receiver_tensors=receiver_tensors,
+        receiver_tensors_alternatives=receiver_tensors_alternatives)
+
+    return super(TensorServingInputReceiver, cls).__new__(
+        cls,
+        features=receiver.features[_SINGLE_FEATURE_DEFAULT_NAME],
+        receiver_tensors=receiver.receiver_tensors,
+        receiver_tensors_alternatives=receiver.receiver_tensors_alternatives)
+
+
 @tf_export('estimator.export.build_parsing_serving_input_receiver_fn')
 def build_parsing_serving_input_receiver_fn(feature_spec,
                                             default_batch_size=None):
@@ -273,13 +340,6 @@ def _log_signature_report(signature_def_map, excluded_signatures):
     logging.warn('Export includes no default signature!')
 
 
-# When we create a timestamped directory, there is a small chance that the
-# directory already exists because another worker is also writing exports.
-# In this case we just wait one second to get a new timestamp and try again.
-# If this fails several times in a row, then something is seriously wrong.
-MAX_DIRECTORY_CREATION_ATTEMPTS = 10
-
-
 def get_timestamped_export_dir(export_dir_base):
   """Builds a path to a new subdirectory within the base directory.
 
@@ -298,25 +358,7 @@ def get_timestamped_export_dir(export_dir_base):
     RuntimeError: if repeated attempts fail to obtain a unique timestamped
       directory name.
   """
-  attempts = 0
-  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
-    export_timestamp = int(time.time())
-
-    export_dir = os.path.join(
-        compat.as_bytes(export_dir_base),
-        compat.as_bytes(str(export_timestamp)))
-    if not gfile.Exists(export_dir):
-      # Collisions are still possible (though extremely unlikely): this
-      # directory is not actually created yet, but it will be almost
-      # instantly on return from this function.
-      return export_dir
-    time.sleep(1)
-    attempts += 1
-    logging.warn(
-        'Export directory {} already exists; retrying (attempt {}/{})'.format(
-            export_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS))
-  raise RuntimeError('Failed to obtain a unique export directory name after '
-                     '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
+  return util.get_timestamped_dir(export_dir_base)
 
 
 def get_temp_export_dir(timestamped_export_dir):
diff --git a/tensorflow/python/estimator/export/export_lib.py b/tensorflow/python/estimator/export/export_lib.py
index 99cd81d678bc04e7ed52de721a1fdf799c116795..f4ac8581ea555bfcdf4b714326cb23a16b1f83e5 100644
--- a/tensorflow/python/estimator/export/export_lib.py
+++ b/tensorflow/python/estimator/export/export_lib.py
@@ -22,22 +22,11 @@ from __future__ import print_function
 from tensorflow.python.estimator.export.export import build_parsing_serving_input_receiver_fn
 from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn
 from tensorflow.python.estimator.export.export import ServingInputReceiver
+from tensorflow.python.estimator.export.export import TensorServingInputReceiver
 from tensorflow.python.estimator.export.export_output import ClassificationOutput
 from tensorflow.python.estimator.export.export_output import ExportOutput
 from tensorflow.python.estimator.export.export_output import PredictOutput
 from tensorflow.python.estimator.export.export_output import RegressionOutput
 
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long
 
-_allowed_symbols = [
-    'build_parsing_serving_input_receiver_fn',
-    'build_raw_serving_input_receiver_fn',
-    'ServingInputReceiver',
-    'ClassificationOutput',
-    'ExportOutput',
-    'PredictOutput',
-    'RegressionOutput',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index 8442bf04accbd0bc15f5958069bf3060debd42bc..c203be7dacf80043079b35cd8e89fc5b69dcaaa0 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -39,6 +39,21 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 
 
+class LabeledTensorMock(object):
+  """Mock class emulating LabeledTensor."""
+
+  def __init__(self):
+    self.tensor = constant_op.constant([1])
+
+
+def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
+  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
+
+
+ops.register_tensor_conversion_function(LabeledTensorMock,
+                                        _convert_labeled_tensor_mock_to_tensor)
+
+
 class ExportTest(test_util.TensorFlowTestCase):
 
   def test_serving_input_receiver_constructor(self):
@@ -135,6 +150,11 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       _ = export.ServingInputReceiver(feature, receiver_tensor)
 
+  def test_feature_labeled_tensor(self):
+    feature = LabeledTensorMock()
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    _ = export.ServingInputReceiver(feature, receiver_tensor)
+
   def test_receiver_wrong_type(self):
     feature = constant_op.constant(5)
     receiver_tensor = "not a tensor"
@@ -385,5 +405,67 @@ class ExportTest(test_util.TensorFlowTestCase):
     self.assertTrue(int(time_2) < int(time_3))
 
 
+class TensorServingReceiverTest(test_util.TensorFlowTestCase):
+
+  def test_tensor_serving_input_receiver_constructor(self):
+    features = constant_op.constant([0])
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    r = export.TensorServingInputReceiver(features, receiver_tensors)
+    self.assertTrue(isinstance(r.features, ops.Tensor))
+    self.assertTrue(isinstance(r.receiver_tensors, dict))
+
+  def test_tensor_serving_input_receiver_sparse(self):
+    features = sparse_tensor.SparseTensor(
+        indices=[[0, 0]], values=[1], dense_shape=[1, 1])
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    r = export.TensorServingInputReceiver(features, receiver_tensors)
+    self.assertTrue(isinstance(r.features, sparse_tensor.SparseTensor))
+    self.assertTrue(isinstance(r.receiver_tensors, dict))
+
+  def test_serving_input_receiver_features_invalid(self):
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+
+    with self.assertRaisesRegexp(ValueError, "features must be defined"):
+      export.TensorServingInputReceiver(
+          features=None,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(ValueError, "feature must be a Tensor"):
+      export.TensorServingInputReceiver(
+          features={"1": constant_op.constant([1])},
+          receiver_tensors=receiver_tensors)
+
+  def test_serving_input_receiver_receiver_tensors_invalid(self):
+    features = constant_op.constant([0])
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors must be defined"):
+      export.TensorServingInputReceiver(
+          features=features,
+          receiver_tensors=None)
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors keys must be strings"):
+      export.TensorServingInputReceiver(
+          features=features,
+          receiver_tensors={
+              1: array_ops.placeholder(dtypes.string, name="example0")})
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensor example1 must be a Tensor"):
+      export.TensorServingInputReceiver(
+          features=features,
+          receiver_tensors={"example1": [1]})
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/inputs/inputs.py b/tensorflow/python/estimator/inputs/inputs.py
index 1a1c9a6c3fb11ea83d317234ea79cb39aac76388..6be168ee08ddf7e4a4a03c3fa75e3de927d2a3a3 100644
--- a/tensorflow/python/estimator/inputs/inputs.py
+++ b/tensorflow/python/estimator/inputs/inputs.py
@@ -22,12 +22,4 @@ from __future__ import print_function
 from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn
 from tensorflow.python.estimator.inputs.pandas_io import pandas_input_fn
 
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long
-
-_allowed_symbols = [
-    'numpy_input_fn',
-    'pandas_input_fn'
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 3e021242c4cc914990c6b38736b8f725213b5b7e..8162b249f1f0be6c901f09ff21f9a67f7c5e492f 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,11 +27,13 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.estimator import util
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util.tf_export import tf_export
 
 
 _USE_DEFAULT = object()
+_VALID_DEVICE_FN_ARGS = set(['op'])
 
 # A list of the property names in RunConfig that the user is allowed to change.
 _DEFAULT_REPLACEABLE_LIST = [
@@ -43,7 +45,9 @@ _DEFAULT_REPLACEABLE_LIST = [
     'session_config',
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
-    'log_step_count_steps'
+    'log_step_count_steps',
+    'train_distribute',
+    'device_fn'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -278,6 +282,11 @@ def _validate_properties(run_config):
   _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
             message='tf_random_seed must be integer.')
 
+  _validate('device_fn', lambda device_fn: six.callable(device_fn) and
+            set(util.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
+            message='device_fn must be callable with exactly'
+                    ' one argument "op".')
+
 
 class TaskType(object):
   MASTER = 'master'
@@ -300,7 +309,9 @@ class RunConfig(object):
                session_config=None,
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
-               log_step_count_steps=100):
+               log_step_count_steps=100,
+               train_distribute=None,
+               device_fn=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -314,7 +325,7 @@ class RunConfig(object):
     a list of task addresses.
 
     `task` has two attributes: `type` and `index`, where `type` can be any of
-    the task types in `cluster`. ` When `TF_CONFIG` contains said information,
+    the task types in `cluster`. When `TF_CONFIG` contains said information,
     the following properties are set on this class:
 
     * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
@@ -345,7 +356,7 @@ class RunConfig(object):
       os.environ['TF_CONFIG'] = json.dumps(
           {'cluster': cluster,
            'task': {'type': 'worker', 'index': 1}})
-      config = ClusterConfig()
+      config = RunConfig()
       assert config.master == 'host4:2222'
       assert config.task_id == 1
       assert config.num_ps_replicas == 2
@@ -363,7 +374,7 @@ class RunConfig(object):
       os.environ['TF_CONFIG'] = json.dumps(
           {'cluster': cluster,
            'task': {'type': 'chief', 'index': 0}})
-      config = ClusterConfig()
+      config = RunConfig()
       assert config.master == 'host0:2222'
       assert config.task_id == 0
       assert config.num_ps_replicas == 2
@@ -381,7 +392,7 @@ class RunConfig(object):
       os.environ['TF_CONFIG'] = json.dumps(
           {'cluster': cluster,
            'task': {'type': 'evaluator', 'index': 0}})
-      config = ClusterConfig()
+      config = RunConfig()
       assert config.master == ''
       assert config.evaluator_master == ''
       assert config.task_id == 0
@@ -423,8 +434,15 @@ class RunConfig(object):
         to be saved. The default value of 10,000 hours effectively disables
         the feature.
       log_step_count_steps: The frequency, in number of global steps, that the
-        global step/sec will be logged during training.
-
+        global step/sec and the loss will be logged during training.
+      train_distribute: an optional instance of
+        `tf.contrib.distribute.DistributionStrategy`. If specified,
+        then Estimator will distribute the user's model during training,
+        according to the policy specified by that strategy.
+      device_fn: A callable invoked for every `Operation` that takes the
+        `Operation` and returns the device string. If `None`, defaults to
+        the device function returned by `tf.train.replica_device_setter`
+        with round-robin strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -460,7 +478,9 @@ class RunConfig(object):
         session_config=session_config,
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
-        log_step_count_steps=log_step_count_steps)
+        log_step_count_steps=log_step_count_steps,
+        train_distribute=train_distribute,
+        device_fn=device_fn)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -562,6 +582,16 @@ class RunConfig(object):
   def cluster_spec(self):
     return self._cluster_spec
 
+  @property
+  def device_fn(self):
+    """Returns the device_fn.
+
+    If device_fn is not `None`, it overrides the default
+    device function used in `Estimator`.
+    Otherwise the default one is used.
+    """
+    return self._device_fn
+
   @property
   def evaluation_master(self):
     return self._evaluation_master
@@ -671,12 +701,18 @@ class RunConfig(object):
     """Returns the platform defined (in TF_CONFIG) service dict."""
     return self._service
 
+  @property
+  def train_distribute(self):
+    """Returns the optional `tf.contrib.distribute.DistributionStrategy` object.
+    """
+    return self._train_distribute
+
   def replace(self, **kwargs):
     """Returns a new instance of `RunConfig` replacing specified properties.
 
     Only the properties in the following list are allowed to be replaced:
 
-      - `model_dir`.
+      - `model_dir`,
       - `tf_random_seed`,
       - `save_summary_steps`,
       - `save_checkpoints_steps`,
@@ -685,6 +721,8 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
+      - `train_distribute`,
+      - `device_fn`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index a3eef4c53fd90a1ce69f3067d0b5c15909f43cec..c8b12605e1aaad11e114e4ace63697b93f3b2b92 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -42,6 +42,7 @@ _SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
 _KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
 _KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
 _TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
+_DEVICE_FN_ERR = 'device_fn must be callable with exactly one argument "op".'
 _ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.'
 _ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.'
 _INVALID_TASK_TYPE_FOR_EVAL_MASTER = (
@@ -83,6 +84,7 @@ class RunConfigTest(test.TestCase):
     self.assertEqual(5, config.keep_checkpoint_max)
     self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
     self.assertIsNone(config.service)
+    self.assertIsNone(config.device_fn)
 
   def test_model_dir(self):
     empty_config = run_config_lib.RunConfig()
@@ -93,6 +95,7 @@ class RunConfigTest(test.TestCase):
 
   def test_replace_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig().replace(
         tf_random_seed=11,
@@ -100,13 +103,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_replace_none_value(self):
     config = run_config_lib.RunConfig().replace(
@@ -117,7 +122,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -126,6 +132,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_replace_with_disallowallowed_properties(self):
     config = run_config_lib.RunConfig()
@@ -166,9 +173,12 @@ class RunConfigTest(test.TestCase):
       config.replace(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       config.replace(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      config.replace(device_fn=lambda x, y: 0)
 
   def test_init_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig(
         tf_random_seed=11,
@@ -176,13 +186,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_init_none_value(self):
     config = run_config_lib.RunConfig(
@@ -193,7 +205,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -202,6 +215,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_init_invalid_values(self):
     with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
@@ -220,6 +234,8 @@ class RunConfigTest(test.TestCase):
       run_config_lib.RunConfig(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       run_config_lib.RunConfig(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      run_config_lib.RunConfig(device_fn=lambda x: "/cpu:0")
 
 
 class RunConfigDistributedSettingTest(test.TestCase):
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 63328dcfb55646ce2aaf8929d5517c8522c418f2..534c357067770b7ca7b1c234d32ccb999a6f1992 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -128,9 +128,16 @@ class TrainSpec(
     """Creates a validated `TrainSpec` instance.
 
     Args:
-      input_fn: Training input function returning a tuple of:
-          features - `Tensor` or dictionary of string feature name to `Tensor`.
-          labels - `Tensor` or dictionary of `Tensor` with labels.
+      input_fn: A function that provides input data for training as minibatches.
+        See @{$get_started/premade_estimators#create_input_functions} for more
+        information. The function should construct and return one of
+        the following:
+          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
+            tuple (features, labels) with same constraints as below.
+          * A tuple (features, labels): Where features is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and labels is a
+            `Tensor` or a dictionary of string label name to `Tensor`.
+
       max_steps: Int. Positive number of total steps for which to train model.
         If `None`, train forever. The training `input_fn` is not expected to
         generate `OutOfRangeError` or `StopIteration` exceptions. See the
@@ -185,9 +192,16 @@ class EvalSpec(
     """Creates a validated `EvalSpec` instance.
 
     Args:
-      input_fn: Evaluation input function returning a tuple of:
-          features - `Tensor` or dictionary of string feature name to `Tensor`.
-          labels - `Tensor` or dictionary of `Tensor` with labels.
+      input_fn: A function that constructs the input data for evaluation.
+        See @{$get_started/premade_estimators#create_input_functions} for more
+        information. The function should construct and return one of
+        the following:
+          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
+            tuple (features, labels) with same constraints as below.
+          * A tuple (features, labels): Where features is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and labels is a
+            `Tensor` or a dictionary of string label name to `Tensor`.
+
       steps: Int. Positive number of steps for which to evaluate model. If
         `None`, evaluates until `input_fn` raises an end-of-input exception.
         See `Estimator.evaluate` for details.
@@ -320,7 +334,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   can read and write). The only extra work to do is setting the environment
   variable `TF_CONFIG` properly for each worker correspondingly.
 
-  Also see: https://www.tensorflow.org/deploy/distributed
+  Also see
+  [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
 
   Setting environment variable depends on the platform. For example, on Linux,
   it can be done as follows (`$` is the shell prompt):
@@ -412,6 +427,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   Raises:
     ValueError: if environment variable `TF_CONFIG` is incorrectly set.
   """
+  _assert_eval_spec(eval_spec)  # fail fast if eval_spec is invalid.
+
   executor = _TrainingExecutor(
       estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
 
@@ -455,15 +472,21 @@ class _TrainingExecutor(object):
                train_hooks=None,
                continuous_eval_listener=None):
     if not isinstance(estimator, estimator_lib.Estimator):
-      raise TypeError('`estimator` must have type `tf.estimator.Estimator`.')
+      raise TypeError(
+          '`estimator` must have type `tf.estimator.Estimator`. '
+          'Got: {}'.format(type(estimator)))
     self._estimator = estimator
 
     if not isinstance(train_spec, TrainSpec):
-      raise TypeError('`train_spec` must have type `tf.estimator.TrainSpec`.')
+      raise TypeError(
+          '`train_spec` must have type `tf.estimator.TrainSpec`. '
+          'Got: {}'.format(type(train_spec)))
     self._train_spec = train_spec
 
-    if not isinstance(eval_spec, EvalSpec):
-      raise TypeError('`eval_spec` must have type `tf.estimator.EvalSpec`.')
+    if eval_spec and not isinstance(eval_spec, EvalSpec):
+      raise TypeError('`eval_spec` must be either `None` or have type '
+                      '`tf.estimator.EvalSpec`. Got: {}'.format(
+                          type(eval_spec)))
     self._eval_spec = eval_spec
 
     self._train_hooks = _validate_hooks(train_hooks)
@@ -559,6 +582,8 @@ class _TrainingExecutor(object):
           logging.info('Skip the current checkpoint eval due to throttle secs '
                        '({} secs).'.format(self._eval_throttle_secs))
 
+    _assert_eval_spec(self._eval_spec)
+
     # Final export signal: For any eval result with global_step >= train
     # max_steps, the evaluator will send the final export signal. There is a
     # small chance that the Estimator.train stopping logic sees a different
@@ -607,6 +632,8 @@ class _TrainingExecutor(object):
         return True
       return False
 
+    _assert_eval_spec(self._eval_spec)
+
     if self._eval_spec.throttle_secs <= 0:
       raise ValueError('eval_spec.throttle_secs should be positive, given: {}.'
                        'It is used do determine how long each training '
@@ -720,6 +747,9 @@ class _TrainingExecutor(object):
 
   def _start_continuous_evaluation(self):
     """Repeatedly calls `Estimator` evaluate and export until training ends."""
+
+    _assert_eval_spec(self._eval_spec)
+
     start_delay_secs = self._eval_spec.start_delay_secs
     if start_delay_secs:
       logging.info('Waiting %f secs before starting eval.', start_delay_secs)
@@ -748,6 +778,9 @@ class _TrainingExecutor(object):
   def _execute_evaluator_once(self, evaluator, continuous_eval_listener,
                               throttle_secs):
     """Executes the `evaluator`."""
+
+    _assert_eval_spec(self._eval_spec)
+
     start = time.time()
 
     eval_result = None
@@ -786,7 +819,10 @@ class _TrainingExecutor(object):
 
     def __init__(self, estimator, eval_spec, max_training_steps):
       self._estimator = estimator
+
+      _assert_eval_spec(eval_spec)
       self._eval_spec = eval_spec
+
       self._is_final_export_triggered = False
       self._previous_ckpt_path = None
       self._last_warning_time = 0
@@ -975,3 +1011,10 @@ class _ContinuousEvalListener(object):
     """
     del eval_result
     return True
+
+
+def _assert_eval_spec(eval_spec):
+  """Raise error if `eval_spec` is not of the right type."""
+  if not isinstance(eval_spec, EvalSpec):
+    raise TypeError('`eval_spec` must have type `tf.estimator.EvalSpec`. '
+                    'Got: {}'.format(type(eval_spec)))
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index 4f7da848086514b6241799645997c8c6a246631f..c04905ae65d0ef54af2272a2a8ae26bbc9ce1cd5 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -72,6 +72,8 @@ _NONE_EXPORTER_NAME_MSG = (
     'An Exporter cannot have a name that is `None` or empty.')
 _INVALID_TRAIN_SPEC_MSG = '`train_spec` must have type `tf.estimator.TrainSpec`'
 _INVALID_EVAL_SPEC_MSG = '`eval_spec` must have type `tf.estimator.EvalSpec`'
+_EVAL_SPEC_OR_NONE_MSG = (
+    '`eval_spec` must be either `None` or have type `tf.estimator.EvalSpec`')
 _INVALID_EVAL_LISTENER_MSG = 'must have type `_ContinuousEvalListener`'
 _INVALID_CONFIG_FOR_STD_SERVER_MSG = 'Could not start server; .*TF_CONFIG'
 _INVALID_LOCAL_TASK_WITH_CLUSTER = '`task.type` in TF_CONFIG cannot be `local`'
@@ -356,11 +358,23 @@ class TrainAndEvaluateTest(test.TestCase):
       training.train_and_evaluate(invalid_estimator, mock_train_spec,
                                   mock_eval_spec)
 
+  def test_fail_fast_if_invalid_eval_spec(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    invalid_eval_spec = object()
+
+    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
+      with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
+        training.train_and_evaluate(mock_est, mock_train_spec,
+                                    invalid_eval_spec)
+
+      mock_executor.assert_not_called()
+
 
 class TrainingExecutorConstructorTest(test.TestCase):
   """Tests constructor of _TrainingExecutor."""
 
-  def testRequiredArgumentsSet(self):
+  def test_required_arguments_set(self):
     estimator = estimator_lib.Estimator(model_fn=lambda features: features)
     train_spec = training.TrainSpec(input_fn=lambda: 1)
     eval_spec = training.EvalSpec(input_fn=lambda: 1)
@@ -389,9 +403,17 @@ class TrainingExecutorConstructorTest(test.TestCase):
     train_spec = training.TrainSpec(input_fn=lambda: 1)
     invalid_eval_spec = object()
 
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
+    with self.assertRaisesRegexp(TypeError, _EVAL_SPEC_OR_NONE_MSG):
       training._TrainingExecutor(estimator, train_spec, invalid_eval_spec)
 
+  def test_eval_spec_none(self):
+    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
+    train_spec = training.TrainSpec(input_fn=lambda: 1)
+    eval_spec = None
+
+    # Tests that no error is raised.
+    training._TrainingExecutor(estimator, train_spec, eval_spec)
+
   def test_invalid_train_hooks(self):
     estimator = estimator_lib.Estimator(model_fn=lambda features: features)
     train_spec = training.TrainSpec(input_fn=lambda: 1)
@@ -457,6 +479,36 @@ class _TrainingExecutorTrainingTest(object):
     mock_est.evaluate.assert_not_called()
     mock_est.export_savedmodel.assert_not_called()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_train_with_no_eval_spec(self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = self._run_config
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
+    eval_spec = None
+    mock_server_instance = mock_server.return_value
+
+    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
+    self._run_task(executor)
+
+    mock_server.assert_called_with(
+        mock_est.config.cluster_spec,
+        job_name=mock_est.config.task_type,
+        task_index=mock_est.config.task_id,
+        config=test.mock.ANY,
+        start=False)
+
+    self.assertTrue(mock_server_instance.start.called)
+
+    mock_est.train.assert_called_with(
+        input_fn=train_spec.input_fn,
+        max_steps=train_spec.max_steps,
+        hooks=list(train_spec.hooks),
+        saving_listeners=test.mock.ANY)
+    mock_est.evaluate.assert_not_called()
+    mock_est.export_savedmodel.assert_not_called()
+
   @test.mock.patch.object(time, 'sleep')
   @test.mock.patch.object(server_lib, 'Server')
   def test_train_with_train_hooks(self, unused_mock_server, unused_mock_sleep):
@@ -683,6 +735,20 @@ class TrainingExecutorRunMasterTest(test.TestCase):
         saving_listeners=test.mock.ANY)
     mock_est.export_savedmodel.assert_not_called()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_train_with_no_eval_spec_fails(self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
+    mock_est.config = self._run_config
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
+    eval_spec = None
+
+    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
+    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
+      executor.run_master()
+
   @test.mock.patch.object(time, 'sleep')
   @test.mock.patch.object(server_lib, 'Server')
   def test_train_with_train_hooks(self, mock_server, unused_mock_sleep):
@@ -980,6 +1046,19 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
         hooks=eval_spec.hooks)
     self.assertFalse(mock_est.train.called)
 
+  def test_evaluate_with_no_eval_spec_fails(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.latest_checkpoint.return_value = 'latest_it_is'
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
+
+    eval_spec = None
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
+
+    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
+      executor.run_evaluator()
+
   def test_evaluate_with_train_hooks(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.latest_checkpoint.return_value = 'latest_it_is'
@@ -1635,6 +1714,17 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     self.assertEqual(train_spec.input_fn, train_args['input_fn'])
     self.assertEqual(train_spec.max_steps, train_args['max_steps'])
 
+  def test_train_with_no_eval_spec_fails(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
+    eval_spec = None
+
+    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
+
+    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
+      executor.run_local()
+
   def test_train_hooks(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator, model_dir='path/')
     mock_est.latest_checkpoint.return_value = 'checkpoint_path/'
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index 3ce8eea84b6bf601ce89dfaa7d8e3a5d193468b3..bb4bdd3fdfb2e19dbc1c581d7771f2e1ac4442ba 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -20,7 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
+import time
 
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -56,3 +61,48 @@ def fn_args(fn):
     if _is_bounded_method(fn):
       args.remove('self')
   return tuple(args)
+
+
+# When we create a timestamped directory, there is a small chance that the
+# directory already exists because another process is also creating these
+# directories. In this case we just wait one second to get a new timestamp and
+# try again. If this fails several times in a row, then something is seriously
+# wrong.
+MAX_DIRECTORY_CREATION_ATTEMPTS = 10
+
+
+def get_timestamped_dir(dir_base):
+  """Builds a path to a new subdirectory within the base directory.
+
+  The subdirectory will be named using the current time.
+  This guarantees monotonically increasing directory numbers even across
+  multiple runs of the pipeline.
+  The timestamp used is the number of seconds since epoch UTC.
+
+  Args:
+    dir_base: A string containing a directory to create the subdirectory under.
+
+  Returns:
+    The full path of the new subdirectory (which is not actually created yet).
+
+  Raises:
+    RuntimeError: if repeated attempts fail to obtain a unique timestamped
+      directory name.
+  """
+  attempts = 0
+  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
+    timestamp = int(time.time())
+
+    result_dir = os.path.join(
+        compat.as_bytes(dir_base), compat.as_bytes(str(timestamp)))
+    if not gfile.Exists(result_dir):
+      # Collisions are still possible (though extremely unlikely): this
+      # directory is not actually created yet, but it will be almost
+      # instantly on return from this function.
+      return result_dir
+    time.sleep(1)
+    attempts += 1
+    logging.warn('Directory {} already exists; retrying (attempt {}/{})'.format(
+        result_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS))
+  raise RuntimeError('Failed to obtain a unique export directory name after '
+                     '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index a758f8a4fc4898713772c4e919acda48b0f6ad0b..295d4ca094cc8cb85c0f1f7fd47c20b910c270df 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -6,18 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "feature_column_py",
     srcs = ["feature_column_lib.py"],
@@ -45,6 +33,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
@@ -54,6 +43,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/keras",
         "@six_archive//:six",
     ],
 )
@@ -74,7 +64,10 @@ py_test(
     srcs = ["feature_column_test.py"],
     data = [":vocabulary_testdata"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+    ],
     deps = [
         ":feature_column",
         ":feature_column_py",
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index c416881c3119c160d28f4b8e37cd2aeb22f239a6..c16c3cda4892b8017571c2b37736b85c80f3d8a4 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -16,7 +16,7 @@
 
 FeatureColumns provide a high level abstraction for ingesting and representing
 features. FeatureColumns are also the primary way of encoding features for
-canned ${tf.estimator.Estimator}s.
+canned @{tf.estimator.Estimator}s.
 
 When using FeatureColumns with `Estimators`, the type of feature column you
 should choose depends on (1) the feature type and (2) the model type.
@@ -135,10 +135,13 @@ import numpy as np
 import six
 
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -148,6 +151,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import template
@@ -158,7 +162,6 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 def _internal_input_layer(features,
@@ -405,58 +408,207 @@ def linear_model(features,
     ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
       nor `_CategoricalColumn`.
   """
-  feature_columns = _clean_feature_columns(feature_columns)
-  for column in feature_columns:
-    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
-      raise ValueError('Items of feature_columns must be either a _DenseColumn '
-                       'or _CategoricalColumn. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-  with variable_scope.variable_scope(
-      None, default_name='linear_model', values=features.values()):
-    weighted_sums = []
-    ordered_columns = []
-    builder = _LazyBuilder(features)
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
-        ordered_columns.append(column)
-        weighted_sum = _create_weighted_sum(
-            column=column,
-            builder=builder,
-            units=units,
-            sparse_combiner=sparse_combiner,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        weighted_sums.append(weighted_sum)
-        if cols_to_vars is not None:
-          # Retrieve the variables created.
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
-    predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
-    bias = variable_scope.get_variable(
+  linear_model_layer = _LinearModel(
+      feature_columns=feature_columns,
+      units=units,
+      sparse_combiner=sparse_combiner,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      name='linear_model')
+  retval = linear_model_layer(features)  # pylint: disable=not-callable
+  if cols_to_vars is not None:
+    cols_to_vars.update(linear_model_layer.cols_to_vars())
+  return retval
+
+
+def _add_to_collections(var, weight_collections):
+  # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
+  # so that we don't have to do this check.
+  if isinstance(var, variables.PartitionedVariable):
+    for constituent_var in list(var):
+      ops.add_to_collections(weight_collections, constituent_var)
+  else:
+    ops.add_to_collections(weight_collections, var)
+
+
+class _FCLinearWrapper(base.Layer):
+  """Wraps a _FeatureColumn in a layer for use in a linear model.
+
+  See `linear_model` above.
+  """
+
+  def __init__(self,
+               feature_column,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_FCLinearWrapper, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._feature_column = feature_column
+    self._units = units
+    self._sparse_combiner = sparse_combiner
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    if isinstance(self._feature_column, _CategoricalColumn):
+      weight = self.add_variable(
+          name='weights',
+          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    else:
+      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
+      weight = self.add_variable(
+          name='weights',
+          shape=[num_elements, self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    _add_to_collections(weight, self._weight_collections)
+    self._weight_var = weight
+    self.built = True
+
+  def call(self, builder):
+    weighted_sum = _create_weighted_sum(
+        column=self._feature_column,
+        builder=builder,
+        units=self._units,
+        sparse_combiner=self._sparse_combiner,
+        weight_collections=self._weight_collections,
+        trainable=self.trainable,
+        weight_var=self._weight_var)
+    return weighted_sum
+
+
+class _BiasLayer(base.Layer):
+  """A layer for the bias term.
+  """
+
+  def __init__(self,
+               units=1,
+               trainable=True,
+               weight_collections=None,
+               name=None,
+               **kwargs):
+    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
+    self._units = units
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._bias_variable = self.add_variable(
         'bias_weights',
-        shape=[units],
+        shape=[self._units],
         initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
+    _add_to_collections(self._bias_variable, self._weight_collections)
+    self.built = True
+
+  def call(self, _):
+    return self._bias_variable
+
+
+def _get_expanded_variable_list(variable):
+  if (isinstance(variable, variables.Variable) or
+      resource_variable_ops.is_resource_variable(variable)):
+    return [variable]  # Single variable case.
+  else:  # Must be a PartitionedVariable, so convert into a list.
+    return list(variable)
+
+
+def _strip_leading_slashes(name):
+  return name.rsplit('/', 1)[-1]
+
+
+class _LinearModel(training.Model):
+  """Creates a linear model using feature columns.
+
+  See `linear_model` for details.
+  """
+
+  def __init__(self,
+               feature_columns,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_LinearModel, self).__init__(name=name, **kwargs)
+    self._feature_columns = _clean_feature_columns(feature_columns)
+    self._weight_collections = list(weight_collections or [])
+    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
+      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+    column_layers = {}
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
+        # Having the fully expressed variable scope name ends up doubly
+        # expressing the outer scope (scope with which this method was called)
+        # in the name of the variable that would get created.
+        column_name = _strip_leading_slashes(vs.name)
+      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
+                                      self._weight_collections, trainable,
+                                      column_name, **kwargs)
+      column_layers[column_name] = column_layer
+    self._column_layers = self._add_layers(column_layers)
+    self._bias_layer = _BiasLayer(
+        units=units,
         trainable=trainable,
-        collections=weight_collections)
-    predictions = nn_ops.bias_add(
-        predictions_no_bias, bias, name='weighted_sum')
-    if cols_to_vars is not None:
-      # Add the bias to cols_to_vars as well, converting the Variable or
-      # PartitionedVariable to a list of Variable's.
-      if isinstance(bias, variables.Variable):
-        cols_to_vars['bias'] = [bias]
-      else:  # Must be a PartitionedVariable.
-        cols_to_vars['bias'] = list(bias)
+        weight_collections=self._weight_collections,
+        name='bias_layer',
+        **kwargs)
+    self._cols_to_vars = {}
+
+  def cols_to_vars(self):
+    """Returns a dict mapping _FeatureColumns to variables.
+
+    See `linear_model` for more information.
+    This is not populated till `call` is called i.e. layer is built.
+    """
+    return self._cols_to_vars
+
+  def call(self, features):
+    with variable_scope.variable_scope(self.name):
+      for column in self._feature_columns:
+        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+          raise ValueError(
+              'Items of feature_columns must be either a '
+              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+      weighted_sums = []
+      ordered_columns = []
+      builder = _LazyBuilder(features)
+      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
+        column = layer._feature_column  # pylint: disable=protected-access
+        ordered_columns.append(column)
+        weighted_sum = layer(builder)
+        weighted_sums.append(weighted_sum)
+        self._cols_to_vars[column] = ops.get_collection(
+            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
+
+      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias,
+          self._bias_layer(  # pylint: disable=not-callable
+              builder,
+              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
+          name='weighted_sum')
+      bias = self._bias_layer.variables[0]
+      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
     return predictions
 
+  def _add_layers(self, layers):
+    # "Magic" required for keras.Model classes to track all the variables in
+    # a list of layers.Layer objects.
+    # TODO(ashankar): Figure out API so user code doesn't have to do this.
+    for name, layer in layers.items():
+      setattr(self, 'layer-%s' % name, layer)
+    return layers
+
 
 def _transform_features(features, feature_columns):
   """Returns transformed features based on features columns passed in.
@@ -653,11 +805,22 @@ def embedding_column(
     initializer = init_ops.truncated_normal_initializer(
         mean=0.0, stddev=1 / math.sqrt(dimension))
 
+  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
+
+  def _creator(weight_collections, scope):
+    embedding_column_layer = _EmbeddingColumnLayer(
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        name='embedding_column_layer')
+    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
+
   return _EmbeddingColumn(
       categorical_column=categorical_column,
       dimension=dimension,
       combiner=combiner,
-      initializer=initializer,
+      layer_creator=_creator,
       ckpt_to_load_from=ckpt_to_load_from,
       tensor_name_in_ckpt=tensor_name_in_ckpt,
       max_norm=max_norm,
@@ -780,6 +943,7 @@ def shared_embedding_columns(
   sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
 
   c0 = sorted_columns[0]
+  num_buckets = c0._num_buckets  # pylint: disable=protected-access
   if not isinstance(c0, _CategoricalColumn):
     raise ValueError(
         'All categorical_columns must be subclasses of _CategoricalColumn. '
@@ -795,23 +959,45 @@ def shared_embedding_columns(
           'the same type, or be weighted_categorical_column of the same type. '
           'Given column: {} of type: {} does not match given column: {} of '
           'type: {}'.format(c0, type(c0), c, type(c)))
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
 
   if not shared_embedding_collection_name:
     shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
     shared_embedding_collection_name += '_shared_embedding'
 
+  # Create the state (_SharedEmbeddingColumnLayer) here.
+  embedding_shape = num_buckets, dimension
+
+  shared_embedding_column_layer = _EmbeddingColumnLayer(
+      embedding_shape=embedding_shape,
+      initializer=initializer,
+      weight_collections=[],
+      trainable=trainable,
+      name=shared_embedding_collection_name)
+
   result = []
   for column in categorical_columns:
-    result.append(_SharedEmbeddingColumn(
-        categorical_column=column,
-        dimension=dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable))
+    result.append(
+        _SharedEmbeddingColumn(
+            categorical_column=column,
+            initializer=initializer,
+            dimension=dimension,
+            combiner=combiner,
+            var_scope_name=shared_embedding_collection_name,
+            ckpt_to_load_from=ckpt_to_load_from,
+            tensor_name_in_ckpt=tensor_name_in_ckpt,
+            max_norm=max_norm,
+            trainable=trainable))
+
+  for single_result in result:
+    single_result._set_layer(shared_embedding_column_layer)  # pylint: disable=protected-access
+    single_result._set_all_columns(result)  # pylint: disable=protected-access
+
   return result
 
 
@@ -1568,6 +1754,57 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
       hash_key=hash_key)
 
 
+# TODO(rohanj): Clearly define semantics of this layer.
+class _EmbeddingColumnLayer(base.Layer):
+  """A layer that stores all the state required for a embedding column."""
+
+  def __init__(self,
+               embedding_shape,
+               initializer,
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    """Constructor.
+
+    Args:
+      embedding_shape: Shape of the embedding variable used for lookup.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. If not specified, defaults to
+        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+        `1/sqrt(dimension)`.
+      weight_collections: A list of collection names to which the Variable will
+        be added. Note that, variables will also be added to collections
+        `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+      trainable: If `True` also add the variable to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      name: Name of the layer
+      **kwargs: keyword named properties.
+    """
+    super(_EmbeddingColumnLayer, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._embedding_shape = embedding_shape
+    self._initializer = initializer
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._embedding_weight_var = self.add_variable(
+        name='embedding_weights',
+        shape=self._embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self._initializer,
+        trainable=self.trainable)
+    # self.add_variable already appends to GLOBAL_VARIABLES collection.
+    if self._weight_collections and not context.executing_eagerly():
+      for weight_collection in self._weight_collections:
+        if weight_collection != ops.GraphKeys.GLOBAL_VARIABLES:
+          _add_to_collections(self._embedding_weight_var, [weight_collection])
+    self.built = True
+
+  def call(self, _):
+    return self._embedding_weight_var
+
+
 class _FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -1626,7 +1863,7 @@ class _FeatureColumn(object):
 
     It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a
     dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
-    supported objects. Please check documentation of ${tf.parse_example} for all
+    supported objects. Please check documentation of @{tf.parse_example} for all
     supported spec objects.
 
     Let's say a Feature column depends on raw feature ('raw') and another
@@ -1641,6 +1878,14 @@ class _FeatureColumn(object):
     """
     pass
 
+  def _reset_config(self):
+    """Resets the configuration in the column.
+
+    Some feature columns e.g. embedding or shared embedding columns might
+    have some state that is needed to be reset sometimes. Use this method
+    in that scenario.
+    """
+
 
 class _DenseColumn(_FeatureColumn):
   """Represents a column which can be represented as `Tensor`.
@@ -1677,7 +1922,7 @@ class _DenseColumn(_FeatureColumn):
       weight_collections: List of graph collections to which Variables (if any
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.Variable}).
+        `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -1685,13 +1930,13 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
-def _create_weighted_sum(
-    column,
-    builder,
-    units,
-    sparse_combiner,
-    weight_collections,
-    trainable):
+def _create_weighted_sum(column,
+                         builder,
+                         units,
+                         sparse_combiner,
+                         weight_collections,
+                         trainable,
+                         weight_var=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
@@ -1700,18 +1945,24 @@ def _create_weighted_sum(
         units=units,
         sparse_combiner=sparse_combiner,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
         column=column,
         builder=builder,
         units=units,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var)
 
 
-def _create_dense_column_weighted_sum(
-    column, builder, units, weight_collections, trainable):
+def _create_dense_column_weighted_sum(column,
+                                      builder,
+                                      units,
+                                      weight_collections,
+                                      trainable,
+                                      weight_var=None):
   """Create a weighted sum of a dense column for linear_model."""
   tensor = column._get_dense_tensor(  # pylint: disable=protected-access
       builder,
@@ -1720,12 +1971,15 @@ def _create_dense_column_weighted_sum(
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=[num_elements, units],
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=[num_elements, units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return math_ops.matmul(tensor, weight, name='weighted_sum')
 
 
@@ -1735,7 +1989,7 @@ class _CategoricalColumn(_FeatureColumn):
   WARNING: Do not subclass this layer unless you know what you are doing:
   the API is subject to future changes.
 
-  A categorical feature typically handled with a ${tf.SparseTensor} of IDs.
+  A categorical feature typically handled with a @{tf.SparseTensor} of IDs.
   """
   __metaclass__ = abc.ABCMeta
 
@@ -1770,13 +2024,18 @@ class _CategoricalColumn(_FeatureColumn):
       weight_collections: List of graph collections to which variables (if any
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.get_variable}).
+        `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.get_variable}).
     """
     pass
 
 
-def _create_categorical_column_weighted_sum(
-    column, builder, units, sparse_combiner, weight_collections, trainable):
+def _create_categorical_column_weighted_sum(column,
+                                            builder,
+                                            units,
+                                            sparse_combiner,
+                                            weight_collections,
+                                            trainable,
+                                            weight_var=None):
   """Create a weighted sum of a categorical column for linear_model."""
   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
       builder,
@@ -1790,12 +2049,15 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=(column._num_buckets, units),  # pylint: disable=protected-access
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=(column._num_buckets, units),  # pylint: disable=protected-access
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return _safe_embedding_lookup_sparse(
       weight,
       id_tensor,
@@ -1804,6 +2066,21 @@ def _create_categorical_column_weighted_sum(
       name='weighted_sum')
 
 
+class _SequenceDenseColumn(_FeatureColumn):
+  """Represents dense sequence data."""
+
+  __metaclass__ = abc.ABCMeta
+
+  TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
+      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
+
+  @abc.abstractmethod
+  def _get_sequence_dense_tensor(
+      self, inputs, weight_collections=None, trainable=None):
+    """Returns a `TensorSequenceLengthPair`."""
+    pass
+
+
 class _LazyBuilder(object):
   """Handles caching of transformations while building the model.
 
@@ -1874,12 +2151,12 @@ class _LazyBuilder(object):
       self._feature_tensors[key] = feature_tensor
       return feature_tensor
 
-    if not isinstance(key, (str, _FeatureColumn)):
-      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
-                      'Provided: {}'.format(key))
+    if isinstance(key, str):
+      raise ValueError('Feature {} is not in features dictionary.'.format(key))
 
     if not isinstance(key, _FeatureColumn):
-      raise ValueError('Feature {} is not in features dictionary.'.format(key))
+      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
+                      'Provided: {}'.format(key))
 
     column = key
     logging.debug('Transforming feature_column %s.', column)
@@ -2152,11 +2429,11 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
 
 
 class _EmbeddingColumn(
-    _DenseColumn,
-    collections.namedtuple('_EmbeddingColumn', (
-        'categorical_column', 'dimension', 'combiner', 'initializer',
-        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
-    ))):
+    _DenseColumn, _SequenceDenseColumn,
+    collections.namedtuple(
+        '_EmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
+         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2178,21 +2455,21 @@ class _EmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor_internal(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None):
+    """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
         inputs, weight_collections=weight_collections, trainable=trainable)
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    embedding_weights = variable_scope.get_variable(
-        name='embedding_weights',
-        shape=embedding_shape,
-        dtype=dtypes.float32,
-        initializer=self.initializer,
-        trainable=self.trainable and trainable,
-        collections=weight_collections)
+    embedding_weights = self.layer_creator(
+        weight_collections=weight_collections,
+        scope=variable_scope.get_variable_scope())
+
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
@@ -2210,14 +2487,59 @@ class _EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
+      raise ValueError(
+          'In embedding_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(
+              self.name, type(self.categorical_column),
+              self.categorical_column))
+    return self._get_dense_tensor_internal(
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
+
+  def _get_sequence_dense_tensor(
+      self, inputs, weight_collections=None, trainable=None):
+    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
+      raise ValueError(
+          'In embedding_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(
+              self.name, type(self.categorical_column),
+              self.categorical_column))
+    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable)
+    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
+        sparse_tensors.id_tensor)
+    return _SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
+
+
+def _get_graph_for_variable(var):
+  if isinstance(var, variables.PartitionedVariable):
+    return list(var)[0].graph
+  else:
+    return var.graph
+
 
 class _SharedEmbeddingColumn(
     _DenseColumn,
-    collections.namedtuple('_SharedEmbeddingColumn', (
-        'categorical_column', 'dimension', 'combiner', 'initializer',
-        'shared_embedding_collection_name', 'ckpt_to_load_from',
-        'tensor_name_in_ckpt', 'max_norm', 'trainable'
-    ))):
+    collections.namedtuple(
+        '_SharedEmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'initializer',
+         'var_scope_name', 'ckpt_to_load_from', 'tensor_name_in_ckpt',
+         'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2228,7 +2550,7 @@ class _SharedEmbeddingColumn(
 
   @property
   def _var_scope_name(self):
-    return self.shared_embedding_collection_name
+    return self.var_scope_name
 
   @property
   def _parse_example_spec(self):
@@ -2237,6 +2559,22 @@ class _SharedEmbeddingColumn(
   def _transform_feature(self, inputs):
     return inputs.get(self.categorical_column)
 
+  def _set_layer(self, layer):
+    self._layer = layer
+
+  def _set_all_columns(self, all_columns):
+    self._all_columns = all_columns
+
+  def _reset_config(self):
+    config = self._layer.get_config()
+    config['embedding_shape'] = (
+        self.categorical_column._num_buckets,  # pylint: disable=protected-access
+        self.dimension)
+    config['initializer'] = self.initializer
+    self._layer = self._layer.__class__.from_config(config)
+    for column in self._all_columns:
+      column._set_layer(self._layer)  # pylint: disable=protected-access
+
   @property
   def _variable_shape(self):
     if not hasattr(self, '_shape'):
@@ -2254,38 +2592,17 @@ class _SharedEmbeddingColumn(
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
-      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-      shared_embedding_collection = ops.get_collection(
-          self.shared_embedding_collection_name)
-      if shared_embedding_collection:
-        if len(shared_embedding_collection) > 1:
-          raise ValueError(
-              'Collection {} can only contain one variable. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(shared_embedding_collection))
-        embedding_weights = shared_embedding_collection[0]
-        if embedding_weights.get_shape() != embedding_shape:
-          raise ValueError(
-              'Shared embedding collection {} contains variable {} of '
-              'unexpected shape {}. Expected shape is {}. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(
-                  self.shared_embedding_collection_name, embedding_weights.name,
-                  embedding_weights.get_shape(), embedding_shape))
-      else:
-        embedding_weights = variable_scope.get_variable(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable and trainable,
-            collections=weight_collections)
-        ops.add_to_collection(
-            self.shared_embedding_collection_name, embedding_weights)
+      embedding_weights = self._layer(
+          None, scope=variable_scope.get_variable_scope())
+      # If we're in graph mode and this is called with a different graph,
+      # then we should reset.
+      if not context.executing_eagerly() and (
+          ops.get_default_graph() !=
+          _get_graph_for_variable(embedding_weights)):
+        self._reset_config()
+        embedding_weights = self._layer(
+            None, scope=variable_scope.get_variable_scope())
+
       if self.ckpt_to_load_from is not None:
         to_restore = embedding_weights
         if isinstance(to_restore, variables.PartitionedVariable):
@@ -2835,6 +3152,9 @@ def _safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != 'sum':
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -2883,14 +3203,24 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
-class _IndicatorColumn(_DenseColumn,
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
+class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                        collections.namedtuple('_IndicatorColumn',
                                               ['categorical_column'])):
   """Represents a one-hot column for use in deep networks.
@@ -2966,15 +3296,53 @@ class _IndicatorColumn(_DenseColumn,
 
     Returns:
       Dense `Tensor` created within `_transform_feature`.
+
+    Raises:
+      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
     """
     # Do nothing with weight_collections and trainable since no variables are
     # created in this function.
     del weight_collections
     del trainable
+    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
+      raise ValueError(
+          'In indicator_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(
+              self.name, type(self.categorical_column),
+              self.categorical_column))
     # Feature has been already transformed. Return the intermediate
     # representation created by _transform_feature.
     return inputs.get(self)
 
+  def _get_sequence_dense_tensor(
+      self, inputs, weight_collections=None, trainable=None):
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
+      raise ValueError(
+          'In indicator_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(
+              self.name, type(self.categorical_column),
+              self.categorical_column))
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    dense_tensor = inputs.get(self)
+    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
+        sparse_tensors.id_tensor)
+    return _SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
+
 
 def _verify_static_batch_size_equality(tensors, columns):
   # bath_size is a tf.Dimension object.
@@ -2990,3 +3358,68 @@ def _verify_static_batch_size_equality(tensors, columns):
             'Batch size of columns ({}, {}): ({}, {})'.format(
                 columns[bath_size_column_index].name, columns[i].name,
                 expected_batch_size, tensors[i].shape[0]))
+
+
+def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
+  """Returns a [batch_size] Tensor with per-example sequence length."""
+  with ops.name_scope(None, 'sequence_length') as name_scope:
+    row_ids = sp_tensor.indices[:, 0]
+    column_ids = sp_tensor.indices[:, 1]
+    column_ids += array_ops.ones_like(column_ids)
+    seq_length = math_ops.to_int64(
+        math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements)
+    # If the last n rows do not have ids, seq_length will have shape
+    # [batch_size - n]. Pad the remaining values with zeros.
+    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
+    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
+    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
+
+
+class _SequenceCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple(
+        '_SequenceCategoricalColumn', ['categorical_column'])):
+  """Represents sequences of categorical data."""
+
+  @property
+  def name(self):
+    return self.categorical_column.name
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access
+
+  @property
+  def _num_buckets(self):
+    return self.categorical_column._num_buckets  # pylint: disable=protected-access
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    id_tensor = sparse_tensors.id_tensor
+    weight_tensor = sparse_tensors.weight_tensor
+    # Expands final dimension, so that embeddings are not combined during
+    # embedding lookup.
+    check_id_rank = check_ops.assert_equal(
+        array_ops.rank(id_tensor), 2,
+        data=[
+            'Column {} expected ID tensor of rank 2. '.format(self.name),
+            'id_tensor shape: ', array_ops.shape(id_tensor)])
+    with ops.control_dependencies([check_id_rank]):
+      id_tensor = sparse_ops.sparse_reshape(
+          id_tensor,
+          shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0))
+    if weight_tensor is not None:
+      check_weight_rank = check_ops.assert_equal(
+          array_ops.rank(weight_tensor), 2,
+          data=[
+              'Column {} expected weight tensor of rank 2.'.format(self.name),
+              'weight_tensor shape:', array_ops.shape(weight_tensor)])
+      with ops.control_dependencies([check_weight_rank]):
+        weight_tensor = sparse_ops.sparse_reshape(
+            weight_tensor,
+            shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0))
+    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 505a1408d271e9262226b2ea4cff234345e2f3b6..3b818f18b5b0fce99b81e51ce89e58c72cab0b91 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -20,25 +20,4 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
-
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long
-
-_allowed_symbols = [
-    'input_layer',
-    'linear_model',
-    'make_parse_example_spec',
-    'embedding_column',
-    'shared_embedding_columns',
-    'crossed_column',
-    'numeric_column',
-    'bucketized_column',
-    'categorical_column_with_hash_bucket',
-    'categorical_column_with_vocabulary_file',
-    'categorical_column_with_vocabulary_list',
-    'categorical_column_with_identity',
-    'weighted_categorical_column',
-    'indicator_column',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 6f366e77229577b1a6a5363f882daa07203f525c..d963dd9b551c0ebefbbf2677af75114abc59c084 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column import _LinearModel
 from tensorflow.python.feature_column.feature_column import _transform_features
 from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
@@ -339,6 +340,20 @@ class NumericColumnTest(test.TestCase):
         sess.run(price_var.assign([[10.]]))
         self.assertAllClose([[10.], [50.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], predictions.eval())
+
 
 class BucketizedColumnTest(test.TestCase):
 
@@ -561,6 +576,62 @@ class BucketizedColumnTest(test.TestCase):
         sess.run(bias.assign([1.]))
         self.assertAllClose([[81.], [141.]], predictions.eval())
 
+  def test_keras_linear_model_one_input_value(self):
+    """Tests _LinearModel for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+
+  def test_keras_linear_model_two_input_values(self):
+    """Tests _LinearModel for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], predictions.eval())
+
 
 class HashedCategoricalColumnTest(test.TestCase):
 
@@ -767,6 +838,28 @@ class HashedCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
         self.assertAllClose(((4.,), (6.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 3: wire_var[3] = 4
+        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
 
 class CrossedColumnTest(test.TestCase):
 
@@ -1039,49 +1132,809 @@ class CrossedColumnTest(test.TestCase):
         return _CategoricalColumn.IdWeightPair(
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
-    t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError,
-          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
-        fc.linear_model({
-            t.name: sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=[0, 1, 2],
-                dense_shape=(2, 2)),
-            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=[1., 10., 2.],
-                dense_shape=(2, 2)),
-            'c': sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=['cA', 'cB', 'cC'],
-                dense_shape=(2, 2)),
-        }, (crossed,))
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        fc.linear_model({
+            t.name: sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[0, 1, 2],
+                dense_shape=(2, 2)),
+            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[1., 10., 2.],
+                dense_shape=(2, 2)),
+            'c': sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=['cA', 'cB', 'cC'],
+                dense_shape=(2, 2)),
+        }, (crossed,))
+
+  def test_keras_linear_model(self):
+    """Tests _LinearModel.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+  def test_keras_linear_model_with_weights(self):
+
+    class _TestColumnWithWeights(_CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
+
+      @property
+      def name(self):
+        return 'test_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
+
+      @property
+      def _num_buckets(self):
+        return 5
+
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        get_keras_linear_model_predictions({
+            t.name:
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[0, 1, 2],
+                    dense_shape=(2, 2)),
+            '{}_weights'.format(t.name):
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[1., 10., 2.],
+                    dense_shape=(2, 2)),
+            'c':
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=['cA', 'cB', 'cC'],
+                    dense_shape=(2, 2)),
+        }, (crossed,))
+
+
+def get_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    return variable_scope.get_variable('bias_weights')
+
+
+def get_linear_model_column_var(column):
+  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                            'linear_model/' + column.name)[0]
+
+
+def get_keras_linear_model_predictions(features,
+                                       feature_columns,
+                                       units=1,
+                                       sparse_combiner='sum',
+                                       weight_collections=None,
+                                       trainable=True,
+                                       cols_to_vars=None):
+  keras_linear_model = _LinearModel(
+      feature_columns,
+      units,
+      sparse_combiner,
+      weight_collections,
+      trainable,
+      name='linear_model')
+  retval = keras_linear_model(features)  # pylint: disable=not-callable
+  if cols_to_vars is not None:
+    cols_to_vars.update(keras_linear_model.cols_to_vars())
+  return retval
+
+
+@test_util.with_c_api
+class LinearModelTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.linear_model(features={}, feature_columns=[])
+
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
+
+  def test_should_be_dense_or_categorical_column(self):
+
+    class NotSupportedColumn(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
+
+      def _transform_feature(self, cache):
+        pass
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.linear_model(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions.eval())
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [wire_cast, price])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(_DenseColumn, _CategoricalColumn):
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def _variable_shape(self):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self, inputs, weight_collections=None,
+                            trainable=None):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def _num_buckets(self):
+        return 4
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return _CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = fc.linear_model(features, [dense_and_sparse_column])
+      bias = get_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(dense_and_sparse_column_var.assign(
+            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            predictions.eval())
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast], units=3)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
+                1000., 1100., 1200.
+            ], [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            predictions.eval())
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+
+  def test_sparse_combiner_with_negative_weights(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {
+          'wire_cast': wire_tensor,
+          'weights': constant_op.constant([[1., 1., -1.0]])
+      }
+      predictions = fc.linear_model(
+          features, [wire_cast_weights], sparse_combiner='sum')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            predictions.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          predictions = fc.linear_model(features, [price])
+      else:
+        predictions = fc.linear_model(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            predictions.eval()
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price1_var.eval())
+        self.assertAllClose([[0.]], price2_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+
+  def test_fills_cols_to_vars(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      cols_to_vars = {}
+      fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      self.assertAllEqual(cols_to_vars['bias'], [bias])
+      self.assertAllEqual(cols_to_vars[price1], [price1_var])
+      self.assertAllEqual(cols_to_vars[price2], [price2_var])
+
+  def test_fills_cols_to_vars_partitioned_variables(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2', shape=3)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [6., 7.]],
+          'price2': [[3., 4., 5.], [8., 9., 10.]]
+      }
+      cols_to_vars = {}
+      with variable_scope.variable_scope(
+          'linear',
+          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
+        fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+      with _initialized_session():
+        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
+        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
+        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
+        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
+        # a [1, 1] Variable.
+        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(
+          features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      fc.linear_model(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.linear_model(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  def test_with_numpy_input_fn(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([-1., 2., 13., 104.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.linear_model(features, [price_buckets, body_style])
+    # self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      bias = get_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_with_1d_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': constant_op.constant([-1., 12.,]),
+        'body-style': sparse_tensor.SparseTensor(
+            indices=((0,), (1,)),
+            values=('sedan', 'hardtop'),
+            dense_shape=(2,)),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+
+    net = fc.linear_model(features, [price_buckets, body_style])
+    with _initialized_session() as sess:
+      bias = get_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+
+    price_data = np.array([-1., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)),
+        values=('sedan', 'hardtop'),
+        dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
+
+    net = fc.linear_model(features, [price_buckets, body_style, country])
+    bias = get_linear_model_bias()
+    price_buckets_var = get_linear_model_column_var(price_buckets)
+    body_style_var = get_linear_model_column_var(body_style)
+    with _initialized_session() as sess:
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
 
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
 
-def get_linear_model_bias():
-  with variable_scope.variable_scope('linear_model', reuse=True):
-    return variable_scope.get_variable('bias_weights')
+  def test_with_rank_0_feature(self):
+    price = fc.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
 
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      fc.linear_model(features, [price])
 
-def get_linear_model_column_var(column):
-  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                            'linear_model/' + column.name)[0]
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = fc.linear_model(features, [price])
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
 
 
 @test_util.with_c_api
-class LinearModelTest(test.TestCase):
+class _LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
-      fc.linear_model(features={}, feature_columns=[])
+      get_keras_linear_model_predictions(features={}, feature_columns=[])
 
   def test_should_be_feature_column(self):
     with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
-      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]}, feature_columns='NotSupported')
 
   def test_should_be_dense_or_categorical_column(self):
 
@@ -1100,7 +1953,7 @@ class LinearModelTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
 
   def test_does_not_support_dict_columns(self):
@@ -1112,7 +1965,7 @@ class LinearModelTest(test.TestCase):
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features={'a': [[0]]},
           feature_columns=[fc.numeric_column('a'),
                            fc.numeric_column('a')])
@@ -1121,7 +1974,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price])
+      predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -1138,7 +1991,7 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast])
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
@@ -1157,7 +2010,8 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [wire_cast, price])
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [wire_cast, price])
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       price_var = get_linear_model_column_var(price)
@@ -1187,7 +2041,9 @@ class LinearModelTest(test.TestCase):
       def _variable_shape(self):
         raise ValueError('Should not use this method.')
 
-      def _get_dense_tensor(self, inputs, weight_collections=None,
+      def _get_dense_tensor(self,
+                            inputs,
+                            weight_collections=None,
                             trainable=None):
         raise ValueError('Should not use this method.')
 
@@ -1195,7 +2051,9 @@ class LinearModelTest(test.TestCase):
       def _num_buckets(self):
         return 4
 
-      def _get_sparse_tensors(self, inputs, weight_collections=None,
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
                               trainable=None):
         sp_tensor = sparse_tensor.SparseTensor(
             indices=[[0, 0], [1, 0], [1, 1]],
@@ -1210,13 +2068,15 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {dense_and_sparse_column.name: sp_tensor}
-      predictions = fc.linear_model(features, [dense_and_sparse_column])
+      predictions = get_keras_linear_model_predictions(
+          features, [dense_and_sparse_column])
       bias = get_linear_model_bias()
       dense_and_sparse_column_var = get_linear_model_column_var(
           dense_and_sparse_column)
       with _initialized_session() as sess:
-        sess.run(dense_and_sparse_column_var.assign(
-            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [10015.]], predictions.eval())
 
@@ -1224,7 +2084,8 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price], units=3)
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -1243,16 +2104,17 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast], units=3)
+      predictions = get_keras_linear_model_predictions(
+          features, [wire_cast], units=3)
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
         self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
         sess.run(
-            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
-                1000., 1100., 1200.
-            ], [10000., 11000., 12000.]]))
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100.,
+                                   1200.], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
                             predictions.eval())
@@ -1261,7 +2123,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = fc.linear_model(features, [price])
+      predictions = get_keras_linear_model_predictions(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([[0.], [0.]], price_var.eval())
@@ -1277,7 +2139,7 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
           dense_shape=[2, 2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast])
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
@@ -1297,7 +2159,7 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(
+      predictions = get_keras_linear_model_predictions(
           features, [wire_cast], sparse_combiner='mean')
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
@@ -1310,7 +2172,8 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = fc.linear_model(features, [price], units=3)
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -1329,9 +2192,9 @@ class LinearModelTest(test.TestCase):
         with self.assertRaisesRegexp(
             Exception,
             r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-          predictions = fc.linear_model(features, [price])
+          predictions = get_keras_linear_model_predictions(features, [price])
       else:
-        predictions = fc.linear_model(features, [price])
+        predictions = get_keras_linear_model_predictions(features, [price])
         with _initialized_session():
           with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
             predictions.eval()
@@ -1340,7 +2203,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      predictions = fc.linear_model(features, [price])
+      predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -1354,11 +2217,9 @@ class LinearModelTest(test.TestCase):
     price1 = fc.numeric_column('price1', shape=2)
     price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
-      }
-      predictions = fc.linear_model(features, [price1, price2])
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
       bias = get_linear_model_bias()
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
@@ -1378,7 +2239,8 @@ class LinearModelTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
-      fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+      get_keras_linear_model_predictions(
+          features, [price1, price2], cols_to_vars=cols_to_vars)
       bias = get_linear_model_bias()
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
@@ -1398,7 +2260,8 @@ class LinearModelTest(test.TestCase):
       with variable_scope.variable_scope(
           'linear',
           partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
-        fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+        get_keras_linear_model_predictions(
+            features, [price1, price2], cols_to_vars=cols_to_vars)
       with _initialized_session():
         self.assertEqual([0.], cols_to_vars['bias'][0].eval())
         # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
@@ -1413,7 +2276,8 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price], weight_collections=['my-vars'])
+      get_keras_linear_model_predictions(
+          features, [price], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -1426,7 +2290,7 @@ class LinearModelTest(test.TestCase):
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features, [wire_cast], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
       bias = get_linear_model_bias()
@@ -1438,7 +2302,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price])
+      get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -1451,7 +2315,7 @@ class LinearModelTest(test.TestCase):
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      fc.linear_model(features, [wire_cast])
+      get_keras_linear_model_predictions(features, [wire_cast])
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
@@ -1462,7 +2326,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price], trainable=False)
+      get_keras_linear_model_predictions(features, [price], trainable=False)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
 
@@ -1472,7 +2336,7 @@ class LinearModelTest(test.TestCase):
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      fc.linear_model(features, [wire_cast], trainable=False)
+      get_keras_linear_model_predictions(features, [wire_cast], trainable=False)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
 
@@ -1488,7 +2352,7 @@ class LinearModelTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       }
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features, [price_a, wire_cast, price_b],
           weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
@@ -1504,7 +2368,7 @@ class LinearModelTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       }
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features, [wire_cast, price_b, price_a],
           weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
@@ -1523,7 +2387,7 @@ class LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-      fc.linear_model(features, [price1, price2])
+      get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -1538,7 +2402,7 @@ class LinearModelTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.linear_model(features, [price1, price2, price3])
+        get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -1548,7 +2412,8 @@ class LinearModelTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      predictions = fc.linear_model(features, [price1, price2])
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'must have the same size and shape'):
@@ -1563,7 +2428,8 @@ class LinearModelTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
           'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
       }
-      predictions = fc.linear_model(features, [price1, price2])
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
       with _initialized_session() as sess:
         sess.run(
             predictions,
@@ -1574,7 +2440,12 @@ class LinearModelTest(test.TestCase):
 
   def test_with_numpy_input_fn(self):
     price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
@@ -1586,7 +2457,8 @@ class LinearModelTest(test.TestCase):
         batch_size=2,
         shuffle=False)
     features = input_fn()
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
     # self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
       coord = coordinator.Coordinator()
@@ -1607,22 +2479,33 @@ class LinearModelTest(test.TestCase):
 
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price': constant_op.constant([-1., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
 
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
     with _initialized_session() as sess:
       bias = get_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
@@ -1636,7 +2519,12 @@ class LinearModelTest(test.TestCase):
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     country = fc.categorical_column_with_vocabulary_list(
@@ -1653,12 +2541,11 @@ class LinearModelTest(test.TestCase):
 
     price_data = np.array([-1., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
     country_data = np.array(['US', 'CA'])
 
-    net = fc.linear_model(features, [price_buckets, body_style, country])
+    net = get_keras_linear_model_predictions(
+        features, [price_buckets, body_style, country])
     bias = get_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
@@ -1685,13 +2572,13 @@ class LinearModelTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      fc.linear_model(features, [price])
+      get_keras_linear_model_predictions(features, [price])
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = fc.linear_model(features, [price])
+    net = get_keras_linear_model_predictions(features, [price])
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
@@ -2035,6 +2922,114 @@ class FunctionalInputLayerTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  def test_multiple_layers_with_same_embedding_column(self):
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+
+    with ops.Graph().as_default():
+      features = {
+          'sparse_feature': [['a'], ['x']],
+      }
+      all_cols = [some_embedding_column]
+      fc.input_layer(features, all_cols)
+      fc.input_layer(features, all_cols)
+      # Make sure that 2 variables get created in this case.
+      self.assertEqual(2, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      expected_var_names = [
+          'input_layer/sparse_feature_embedding/embedding_weights:0',
+          'input_layer_1/sparse_feature_embedding/embedding_weights:0'
+      ]
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_multiple_layers_with_same_shared_embedding_column(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      all_cols = [embedding_column_a, embedding_column_b]
+      fc.input_layer(features, all_cols)
+      fc.input_layer(features, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+    all_cols = [embedding_column_a, embedding_column_b]
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc.input_layer(features, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+
+    with ops.Graph().as_default():
+      features1 = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+
+      fc.input_layer(features1, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
   def test_with_numpy_input_fn(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
@@ -2715,6 +3710,32 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
@@ -3082,6 +4103,31 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
@@ -3285,15 +4331,37 @@ class IdentityCategoricalColumnTest(test.TestCase):
               input_shape: (2, 2),
           }))
 
-  def test_linear_model(self):
+  def test_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+  def test_keras_linear_model(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          column.name: sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2))
+      predictions = get_keras_linear_model_predictions({
+          column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2))
       }, (column,))
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
@@ -3537,6 +4605,25 @@ class IndicatorColumnTest(test.TestCase):
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
         self.assertAllClose([[2. + 3.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = get_keras_linear_model_predictions(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+      with _initialized_session():
+        # All should be zero-initialized.
+        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
+        self.assertAllClose([[0.]], predictions.eval())
+        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
+        self.assertAllClose([[2. + 3.]], predictions.eval())
+
   def test_input_layer(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -3562,7 +4649,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('mean', embedding_column.combiner)
-    self.assertIsNotNone(embedding_column.initializer)
     self.assertIsNone(embedding_column.ckpt_to_load_from)
     self.assertIsNone(embedding_column.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column.max_norm)
@@ -3587,7 +4673,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
-    self.assertEqual('my_initializer', embedding_column.initializer())
     self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column.max_norm)
@@ -3618,7 +4703,6 @@ class EmbeddingColumnTest(test.TestCase):
 
       self.assertEqual(embedding_dimension, embedding_column.dimension)
       self.assertEqual('my_combiner', embedding_column.combiner)
-      self.assertEqual('my_initializer', embedding_column.initializer())
       self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column.max_norm)
@@ -3727,8 +4811,8 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
@@ -3787,8 +4871,8 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
@@ -3815,8 +4899,9 @@ class EmbeddingColumnTest(test.TestCase):
         }), weight_collections=('my_vars',))
 
     # Assert expected embedding variable and lookups.
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     my_vars = ops.get_collection('my_vars')
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in my_vars]))
@@ -4023,6 +5108,82 @@ class EmbeddingColumnTest(test.TestCase):
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
         self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # example 2, ids [], embedding[2] = [0, 0]
+        # example 3, ids [1], embedding[3] = [3, 5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+
   def test_input_layer(self):
     # Inputs.
     vocabulary_size = 3
@@ -4159,14 +5320,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column_b.dimension)
     self.assertEqual('mean', embedding_column_a.combiner)
     self.assertEqual('mean', embedding_column_b.combiner)
-    self.assertIsNotNone(embedding_column_a.initializer)
-    self.assertIsNotNone(embedding_column_b.initializer)
     self.assertIsNone(embedding_column_a.ckpt_to_load_from)
     self.assertIsNone(embedding_column_b.ckpt_to_load_from)
     self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_a.shared_embedding_collection_name)
+                     embedding_column_a.var_scope_name)
     self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_b.shared_embedding_collection_name)
+                     embedding_column_b.var_scope_name)
     self.assertIsNone(embedding_column_a.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_b.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_a.max_norm)
@@ -4212,12 +5371,10 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column_b.dimension)
     self.assertEqual('my_combiner', embedding_column_a.combiner)
     self.assertEqual('my_combiner', embedding_column_b.combiner)
-    self.assertEqual('my_initializer', embedding_column_a.initializer())
-    self.assertEqual('my_initializer', embedding_column_b.initializer())
     self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_a.shared_embedding_collection_name)
+                     embedding_column_a.var_scope_name)
     self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_b.shared_embedding_collection_name)
+                     embedding_column_b.var_scope_name)
     self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
     self.assertEqual('my_ckpt', embedding_column_b.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
@@ -4267,9 +5424,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
 
       self.assertEqual(embedding_dimension, embedding_column_a.dimension)
       self.assertEqual('my_combiner', embedding_column_a.combiner)
-      self.assertEqual('my_initializer', embedding_column_a.initializer())
       self.assertEqual('shared_embedding_collection_name',
-                       embedding_column_a.shared_embedding_collection_name)
+                       embedding_column_a.var_scope_name)
       self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column_a.max_norm)
@@ -4445,8 +5601,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
       self.assertAllEqual(embedding_values, embedding_var.eval())
@@ -4595,6 +5751,97 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
         self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights do not follow the column name. But this is a rare use
+      # case, and fixing it would add too much complexity to the code.
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
   def _test_input_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
@@ -4663,10 +5910,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           tuple([v.name for v in trainable_vars]))
     else:
       self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
-    shared_embedding_vars = ops.get_collection('aaa_bbb_shared_embedding')
-    self.assertItemsEqual(
-        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-        tuple([v.name for v in shared_embedding_vars]))
+    shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
       self.assertAllEqual(expected_lookups, input_layer.eval())
@@ -4880,6 +6124,103 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           weight_tensor.eval())
 
+  def test_keras_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(.5, 1., .1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  def test_keras_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError,
+                                   r'Dimensions.*are not compatible'):
+        get_keras_linear_model_predictions({
+            'ids':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=(0, 2, 1),
+                    dense_shape=(2, 2)),
+            'values':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                    values=(.5, 11., 1., .1),
+                    dense_shape=(2, 2))
+        }, (column,))
+
+  def test_keras_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
+      with _initialized_session():
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          predictions.eval()
+
+  def test_keras_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -4933,13 +6274,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'ids': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2)),
-          'values': ((.5,), (1.,))
-      }, (column,))
+      predictions = fc.linear_model(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
       with _initialized_session():
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
           predictions.eval()
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 6c522de452b59ea9a200ccf89cfb428a26970db1..7bbe3183dfa376776dde8412a3270b1684a02391 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -33,7 +33,7 @@ class ScopedTFStatus(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteStatus is not None:
+    if c_api is not None and c_api.TF_DeleteStatus is not None:
       c_api.TF_DeleteStatus(self.status)
 
 
@@ -46,7 +46,7 @@ class ScopedTFGraph(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteGraph is not None:
+    if c_api is not None and c_api.TF_DeleteGraph is not None:
       c_api.TF_DeleteGraph(self.graph)
 
 
@@ -59,10 +59,36 @@ class ScopedTFImportGraphDefOptions(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteImportGraphDefOptions is not None:
+    if c_api is not None and c_api.TF_DeleteImportGraphDefOptions is not None:
       c_api.TF_DeleteImportGraphDefOptions(self.options)
 
 
+class ScopedTFImportGraphDefResults(object):
+  """Wrapper around TF_ImportGraphDefOptions that handles deletion."""
+
+  def __init__(self, results):
+    self.results = results
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteImportGraphDefResults is not None:
+      c_api.TF_DeleteImportGraphDefResults(self.results)
+
+
+class ScopedTFFunction(object):
+  """Wrapper around TF_Function that handles deletion."""
+
+  def __init__(self, func):
+    self.func = func
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteFunction is not None:
+      c_api.TF_DeleteFunction(self.func)
+
+
 @tf_contextlib.contextmanager
 def tf_buffer(data=None):
   """Context manager that creates and deletes TF_Buffer.
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index d3d8c9c154fbfcc9613acce4e1bdab7df2e7d56d..b3eb57d067ba291b1941604b31744bbff0ff782b 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -15,24 +15,6 @@
 """Operations that generate constants.
 
 See the @{$python/constant_op$constants guide}.
-
-@@zeros
-@@zeros_like
-@@ones
-@@ones_like
-@@fill
-@@constant
-@@linspace
-@@range
-@@random_normal
-@@truncated_normal
-@@random_uniform
-@@random_shuffle
-@@random_crop
-@@multinomial
-@@random_gamma
-@@random_poisson
-@@set_random_seed
 """
 
 # Must be separate from array_ops to avoid a cyclic dependency.
@@ -181,7 +163,7 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
     TypeError: if shape is incorrectly specified or unsupported.
   """
   ctx = context.context()
-  if not ctx.in_graph_mode():
+  if ctx.executing_eagerly():
     t = convert_to_eager_tensor(value, ctx, dtype)
     if shape is None:
       return t
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 99ae8b24f11c4955379ae532ba7b921ebec63385..7f9ef53457ae060600067b946e686487f55adda1 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -297,6 +297,9 @@ class DType(object):
   def __hash__(self):
     return self._type_enum
 
+  def __reduce__(self):
+    return as_dtype, (self.name,)
+
   @property
   def size(self):
     if (self._type_enum == types_pb2.DT_VARIANT or
@@ -343,7 +346,9 @@ tf_export("uint8").export_constant(__name__, "uint8")
 uint16 = DType(types_pb2.DT_UINT16)
 tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
+tf_export("uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
+tf_export("uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
 tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
@@ -646,6 +651,11 @@ QUANTIZED_DTYPES = frozenset([
 ])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
+_PYTHON_TO_TF = {
+    float: float32,
+    bool: bool,
+}
+
 
 @tf_export("as_dtype")
 def as_dtype(type_value):
@@ -677,6 +687,11 @@ def as_dtype(type_value):
   except KeyError:
     pass
 
+  try:
+    return _PYTHON_TO_TF[type_value]
+  except KeyError:
+    pass
+
   if isinstance(type_value, np.dtype):
     # The numpy dtype for strings is variable length. We can not compare
     # dtype with a single constant (np.string does not exist) to decide
@@ -685,11 +700,13 @@ def as_dtype(type_value):
     if type_value.type == np.string_ or type_value.type == np.unicode_:
       return string
 
-  for key, val in _NP_TO_TF:
-    try:
-      if key == type_value:
-        return val
-    except TypeError as e:
-      raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
+  if isinstance(type_value, (type, np.dtype)):
+    for key, val in _NP_TO_TF:
+      try:
+        if key == type_value:
+          return val
+      except TypeError as e:
+        raise TypeError("Cannot convert {} to a dtype. {}".format(
+            type_value, e))
 
   raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index e49e2fda5d84da4f8f87fae73874351afe0a20f2..a873670e0461884d06cde1db4db2cf2db98fde3c 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,20 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonTypesConversion(self):
+    self.assertIs(dtypes.float32, dtypes.as_dtype(float))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
+
+  def testReduce(self):
+    for enum in dtypes._TYPE_TO_STRING:
+      dtype = dtypes.DType(enum)
+      ctor, args = dtype.__reduce__()
+      self.assertEquals(ctor, dtypes.as_dtype)
+      self.assertEquals(args, (dtype.name,))
+      reconstructed = ctor(*args)
+      self.assertEquals(reconstructed, dtype)
+
 
 if __name__ == "__main__":
   googletest.main()
+
diff --git a/tensorflow/python/framework/errors.py b/tensorflow/python/framework/errors.py
index c8cf9ae39b6db925be89b72c8c2a48c8fe5fd8f9..be0187c2ef8bf2c07e7f598a9ead0a6e1af8bd57 100644
--- a/tensorflow/python/framework/errors.py
+++ b/tensorflow/python/framework/errors.py
@@ -25,50 +25,4 @@ from tensorflow.python.framework import errors_impl as _impl
 # pylint: disable=wildcard-import
 from tensorflow.python.framework.errors_impl import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-# These are referenced in client/client_lib.py.
-# Unfortunately, we can't import client_lib to examine
-# the references, since it would create a dependency cycle.
-_allowed_symbols = [
-    "AbortedError",
-    "AlreadyExistsError",
-    "CancelledError",
-    "DataLossError",
-    "DeadlineExceededError",
-    "FailedPreconditionError",
-    "InternalError",
-    "InvalidArgumentError",
-    "NotFoundError",
-    "OpError",
-    "OutOfRangeError",
-    "PermissionDeniedError",
-    "ResourceExhaustedError",
-    "UnauthenticatedError",
-    "UnavailableError",
-    "UnimplementedError",
-    "UnknownError",
-    "error_code_from_exception_type",
-    "exception_type_from_error_code",
-    "raise_exception_on_not_ok_status",
-    # Scalars that have no docstrings:
-    "OK",
-    "CANCELLED",
-    "UNKNOWN",
-    "INVALID_ARGUMENT",
-    "DEADLINE_EXCEEDED",
-    "NOT_FOUND",
-    "ALREADY_EXISTS",
-    "PERMISSION_DENIED",
-    "UNAUTHENTICATED",
-    "RESOURCE_EXHAUSTED",
-    "FAILED_PRECONDITION",
-    "ABORTED",
-    "OUT_OF_RANGE",
-    "UNIMPLEMENTED",
-    "INTERNAL",
-    "UNAVAILABLE",
-    "DATA_LOSS",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 2a40316d51c023df9c664d0dd79a0df3b2ac5041..84106c32c673e15832ff747a7fededdfbfb94ed8 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -473,6 +473,8 @@ _CODE_TO_EXCEPTION_CLASS = {
     DATA_LOSS: DataLossError,
 }
 
+c_api.PyExceptionRegistry_Init(_CODE_TO_EXCEPTION_CLASS)
+
 _EXCEPTION_CLASS_TO_CODE = dict((
     (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
 
@@ -499,6 +501,7 @@ def _make_specific_exception(node_def, op, message, error_code):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
+# TODO(b/77295559): expand use of TF_Status* SWIG typemap and deprecate this.
 @tf_export("errors.raise_exception_on_not_ok_status")  # pylint: disable=invalid-name
 class raise_exception_on_not_ok_status(object):
   """Context manager to check for C API status."""
diff --git a/tensorflow/python/framework/framework_lib.py b/tensorflow/python/framework/framework_lib.py
index 3172f3c2c3d259d2c3f2b340b101aef043d0fc33..fffb6488425524b85363c5365bd080ce688e6263 100644
--- a/tensorflow/python/framework/framework_lib.py
+++ b/tensorflow/python/framework/framework_lib.py
@@ -14,58 +14,7 @@
 # ==============================================================================
 
 # pylint: disable=unused-import,g-bad-import-order
-"""Classes and functions for building TensorFlow graphs.
-
-## Core graph data structures
-
-@@Graph
-@@Operation
-@@Tensor
-
-## Tensor types
-
-@@DType
-@@as_dtype
-
-## Utility functions
-
-@@device
-@@container
-@@name_scope
-@@colocate_with
-@@control_dependencies
-@@convert_to_tensor
-@@convert_to_tensor_or_indexed_slices
-@@convert_to_tensor_or_sparse_tensor
-@@get_default_graph
-@@reset_default_graph
-@@import_graph_def
-@@load_file_system_library
-@@load_op_library
-@@make_tensor_proto
-@@make_ndarray
-
-## Graph collections
-
-@@add_to_collection
-@@get_collection
-@@get_collection_ref
-@@GraphKeys
-
-## Defining new operations
-
-@@RegisterGradient
-@@NotDifferentiable
-@@NoGradient
-@@TensorShape
-@@Dimension
-@@op_scope
-@@get_seed
-
-## For libraries building on TensorFlow
-
-@@register_tensor_conversion_function
-"""
+"""Classes and functions for building TensorFlow graphs."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -92,6 +41,7 @@ from tensorflow.python.framework.ops import get_default_graph
 from tensorflow.python.framework.ops import reset_default_graph
 from tensorflow.python.framework.ops import GraphKeys
 from tensorflow.python.framework.ops import add_to_collection
+from tensorflow.python.framework.ops import add_to_collections
 from tensorflow.python.framework.ops import get_collection
 from tensorflow.python.framework.ops import get_collection_ref
 from tensorflow.python.framework.ops import convert_to_tensor
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index cba225e749d88a45c43266e45172a7335a8e0b71..e7f9e590af8421a5dc43bbf68060752461cef195 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -30,7 +30,6 @@ from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -275,8 +274,7 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     if self._c_func:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_FunctionToFunctionDef(self._c_func, buf, status)
+        c_api.TF_FunctionToFunctionDef(self._c_func.func, buf)
         fdef = function_pb2.FunctionDef()
         proto_data = c_api.TF_GetBuffer(buf)
         fdef.ParseFromString(compat.as_bytes(proto_data))
@@ -353,8 +351,12 @@ class _DefinedFunction(object):
           outputs = (outputs,)
         if any([_ is None for _ in outputs]):
           raise ValueError("Function can not return None.")
-      # Ensures each output is a Tensor.
-      outputs = [ops.convert_to_tensor(_) for _ in outputs]
+      # Ensures each output is a Tensor in the function graph.
+      outputs = [ops.convert_to_tensor(t) for t in outputs]
+      outputs = [
+          temp_graph.capture(t) if t.graph is not temp_graph else t
+          for t in outputs
+      ]
     self._extra_inputs = temp_graph.extra_inputs
     inputs.extend(temp_graph.extra_args)
     # pylint: disable=protected-access
@@ -362,9 +364,13 @@ class _DefinedFunction(object):
     # pylint: enable=protected-access
 
     # Extra kwargs are treated as attrs on the function def.
-    base_func_name = self._func_name or _get_func_name(self._func)
-    kwargs_attr = _parse_kwargs_as_attrs(base_func_name,
-                                         **self._extra_kwargs)
+    if self._func_name:
+      base_func_name = self._func_name
+    else:
+      base_func_name = _get_func_name(self._func)
+      if self._grad_func:
+        base_func_name += ("_%s" % self._grad_func.name)
+    kwargs_attr = _parse_kwargs_as_attrs(base_func_name, **self._extra_kwargs)
 
     if not temp_graph._c_graph:  # pylint: disable=protected-access
       # Build the FunctionDef
@@ -397,18 +403,17 @@ class _DefinedFunction(object):
                       if self._out_names else [])
       description = self._func.__doc__ or None
       # pylint: disable=protected-access
-      with errors.raise_exception_on_not_ok_status() as status:
-        self._c_func = c_api.TF_GraphToFunction_wrapper(
-            temp_graph._c_graph,
-            base_func_name,
-            self._func_name is None,  # append_hash_to_fn_name
-            None,  # opers
-            [t._as_tf_output() for t in inputs],
-            [t._as_tf_output() for t in outputs],
-            output_names,
-            None,  # opts
-            description,
-            status)
+      c_func = c_api.TF_GraphToFunction_wrapper(
+          temp_graph._c_graph,
+          base_func_name,
+          self._func_name is None,  # append_hash_to_fn_name
+          None,  # opers
+          [t._as_tf_output() for t in inputs],
+          [t._as_tf_output() for t in outputs],
+          output_names,
+          None,  # opts
+          description)
+      self._c_func = c_api_util.ScopedTFFunction(c_func)
       # pylint: enable=protected-access
       self._set_c_attrs(kwargs_attr)
 
@@ -431,9 +436,8 @@ class _DefinedFunction(object):
       serialized = attr_value.SerializeToString()
       # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
       # It might be worth creating a convenient way to re-use the same status.
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
-                                           serialized, status)
+      c_api.TF_FunctionSetAttrValueProto(self._c_func.func, compat.as_str(name),
+                                         serialized)
 
   def _create_hash_str(self, input_arg, output_arg, node_def):
     """Creates an 8-character string unique to this input.
@@ -487,10 +491,10 @@ class _DefinedFunction(object):
 
     # Adds this function into 'g'.
     # pylint: disable=protected-access
-    if context.in_graph_mode():
-      g._add_function(self)
-    else:
+    if context.executing_eagerly():
       context.context().add_function_def(self.definition)
+    else:
+      g._add_function(self)
     # pylint: enable=protected-access
 
     # Ensures related sub-routines are defined in 'g', too.
@@ -505,6 +509,12 @@ class _DefinedFunction(object):
     self.add_to_graph(ops.get_default_graph())
     args = [ops.convert_to_tensor(_) for _ in args] + self._extra_inputs
     ret, op = _call(self._signature, *args, **kwargs)
+
+    # Set a hidden attr in 'op' so that gradients_impl can refer back
+    # to this _DefinedFunction instance to access python_grad_func.
+    assert isinstance(op, ops.Operation)
+    setattr(op, "__defun", self)
+
     if self._shape_func is not None:
       shapes = self._shape_func(op)
       if len(shapes) != len(op.outputs):
@@ -593,12 +603,11 @@ class _OverloadedFunction(object):
         # _OverloadedFunction. We need to instantiate it with the
         # right input types.
         output_types = [
-            dtypes.DType(_.type)
-            for _ in defined._signature.output_arg  # pylint: disable=protected-access
+            dtypes.DType(_.type) for _ in defined._signature.output_arg  # pylint: disable=protected-access
         ]
         # pylint: disable=protected-access
-        defined._grad_func = self._grad_func.instantiate(
-            input_types + output_types)
+        defined._grad_func = self._grad_func.instantiate(input_types +
+                                                         output_types)
         # pylint: enable=protected-access
       self._overload[key] = defined
     return defined
@@ -683,28 +692,46 @@ class _FuncGraph(ops.Graph):
   def create_op(self, op_type, inputs, data_types, **kwargs):
     for i, x in enumerate(inputs):
       if isinstance(x, ops.EagerTensor) or x.graph is not self:
-        # Referring to a tensor from other graph.
-        if x in self._captured:
-          # Captured already.
-          inputs[i] = self._captured[x]
-        elif self._capture_by_value:
-          inputs[i] = self._add_tensor_and_parents(x)
-        else:
-          # Substitute with a placeholder.
-          self.extra_inputs.append(x)
-          # Hoist the new input placeholder out of any control flow context
-          # we're currently in.
-          with ops.control_dependencies(None):
-            ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
-          # pylint: disable=protected-access
-          ph._handle_data = x._handle_data
-          # pylint: enable=protected-access
-          inputs[i] = ph
-          self._captured[x] = ph
-          self.extra_args.append(ph)
+        inputs[i] = self.capture(x)
     return super(_FuncGraph, self).create_op(op_type, inputs, data_types,
                                              **kwargs)
 
+  def capture(self, tensor):
+    """Adds the given tensor to this graph and returns the captured tensor."""
+    if tensor in self._captured:
+      # Captured already.
+      return self._captured[tensor]
+    elif self._capture_by_value:
+      return self._add_tensor_and_parents(tensor)
+    else:
+      return self._capture_tensor_as_extra_input(tensor)
+
+  def _capture_tensor_as_extra_input(self, tensor):
+    # Substitute with a placeholder.
+    self.extra_inputs.append(tensor)
+    # Hoist the new input placeholder out of any control flow context
+    # we're currently in.
+    with ops.control_dependencies(None):
+      ph = array_ops.placeholder(tensor.dtype, shape=tensor.get_shape())
+    # pylint: disable=protected-access
+    if ops._USE_C_SHAPES:
+      handle_data = c_api.GetResourceHandleShapeAndType(tensor.graph._c_graph,
+                                                        tensor._as_tf_output())
+      if handle_data:
+        c_api.SetResourceHandleShapeAndType(ph.graph._c_graph,
+                                            ph._as_tf_output(),
+                                            compat.as_bytes(handle_data))
+    else:
+      ph._handle_data = tensor._handle_data
+    # pylint: enable=protected-access
+    self._captured[tensor] = ph
+    self.extra_args.append(ph)
+    if _is_guaranteed_const(tensor):
+      with ops.control_dependencies(None):
+        return array_ops.guarantee_const(ph)
+    else:
+      return ph
+
   def _add_tensor_and_parents(self, tensor):
     op = self._add_op_and_parents(tensor.op)
     return op.outputs[tensor.value_index]
@@ -735,6 +762,57 @@ class _FuncGraph(ops.Graph):
     return captured_op
 
 
+def _is_guaranteed_const(tensor):
+  """Determines whether `tensor` is guaranteed to be a constant.
+
+  A tensor is guaranteed to be a constant if either it was produced by
+  a `GuaranteeConst` op or if all of its children are guaranteed to be
+  constants.
+
+  Args:
+    tensor: The tensor for which to determine const-ness.
+
+  Returns:
+    True if `tensor` is guaranteed to be a constant, False otherwise.
+  """
+
+  if isinstance(tensor, ops.EagerTensor):
+    return False
+
+  class Work(object):
+
+    def __init__(self, op, leaving):
+      self.op = op
+      self.leaving = leaving
+
+  is_guaranteed_const = lambda op: op.node_def.op == "GuaranteeConst"
+  constants = set([])
+  def all_inputs_const(op):
+    # If all inputs of an op are guaranteed constants, then we can infer that
+    # the op produces a constant as well.
+    return op.inputs and all(inp.op in constants for inp in op.inputs)
+
+  visited = set([])
+  stack = [Work(tensor.op, leaving=False)]
+  while stack:
+    work = stack.pop()
+    if work.leaving:
+      if all_inputs_const(work.op):
+        constants.add(work.op)
+      continue
+    visited.add(work.op)
+    if is_guaranteed_const(work.op):
+      constants.add(work.op)
+      continue
+
+    # This op will be revisited after all its inputs are checked for const-ness.
+    stack.append(Work(work.op, leaving=True))
+    for inp in work.op.inputs:
+      if inp.op not in visited:
+        stack.append(Work(inp.op, leaving=False))
+  return tensor.op in constants
+
+
 def _call(sig, *inputs, **kwargs):
   """Adds a node calling a function.
 
@@ -766,8 +844,8 @@ def _call(sig, *inputs, **kwargs):
     ValueError: if the arguments are invalid.
   """
   if len(inputs) != len(sig.input_arg):
-    raise ValueError("Expected number of arguments: %d, received: %d" %
-                     (len(sig.input_arg), len(inputs)))
+    raise ValueError("Expected number of arguments: %d, received: %d" % (len(
+        sig.input_arg), len(inputs)))
   name = kwargs.pop("name", None)
   g = ops.get_default_graph()
   func_name = sig.name
@@ -822,8 +900,8 @@ def _from_definition(fdef, grad_func=None):
   # pylint: disable=protected-access
   if ops._USE_C_API:
     serialized = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      result._c_func = c_api.TF_FunctionImportFunctionDef(serialized, status)
+    c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+    result._c_func = c_api_util.ScopedTFFunction(c_func)
     result._extra_inputs = []
   else:
     result._definition = fdef
@@ -883,8 +961,8 @@ def _from_library(lib):
       fdef for fdef in lib.function if func_to_grad[fdef.signature.name] is None
   ]
   if not ready:
-    raise ValueError("FunctionDefLibrary contains cyclic gradient functions!\n"
-                     + str(lib))
+    raise ValueError(
+        "FunctionDefLibrary contains cyclic gradient functions!\n" + str(lib))
   # function name -> _DefinedFunction
   initialized = {}
 
@@ -926,6 +1004,12 @@ def _parse_kwargs_as_attrs(func_name, **kwargs):
           s=("function_%s" % func_name).encode())
     # pylint: enable=protected-access
 
+  kwargs_keys = list(kwargs.keys())
+  for key in kwargs_keys:
+    if key.startswith("experimental_"):
+      attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(kwargs[key]))
+      del kwargs[key]
+
   if kwargs:
     raise ValueError("Unknown keyword arguments: %s" % kwargs.keys())
   return attrs
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 301a7f682dde8dbeccd1e81675b0059433990a09..a5c19f189ea5c48a6a80f3bcf0069aba2f2f4125 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_logging_ops
@@ -58,15 +57,35 @@ def _OptimizerOptions():
   for cse in [False, True]:
     for inline in [False, True]:
       for cfold in [False, True]:
-        yield config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-            optimizer_options=config_pb2.OptimizerOptions(
-                opt_level=config_pb2.OptimizerOptions.L0,
-                do_common_subexpression_elimination=cse,
-                do_function_inlining=inline,
-                do_constant_folding=cfold)))
-
-
-@test_util.with_c_api
+        cfg = config_pb2.ConfigProto(
+            graph_options=config_pb2.GraphOptions(
+                optimizer_options=config_pb2.OptimizerOptions(
+                    opt_level=config_pb2.OptimizerOptions.L0,
+                    do_common_subexpression_elimination=cse,
+                    do_function_inlining=inline,
+                    do_constant_folding=cfold)))
+        if cse:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if inline:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if cfold:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        yield cfg
+
+
+@test_util.with_c_shapes
 class FunctionTest(test.TestCase):
   """Test methods for verifying Function support.
 
@@ -117,7 +136,8 @@ class FunctionTest(test.TestCase):
   def testTooManyOutputNames(self):
 
     @function.Defun(
-        dtypes.float32, func_name="MyIdentity",
+        dtypes.float32,
+        func_name="MyIdentity",
         out_names=["my_result1", "my_result2"])
     def MyIdentityFunc(a):
       return a
@@ -193,7 +213,7 @@ class FunctionTest(test.TestCase):
 
     @function.Defun(dtypes.float32, dtypes.float32)
     def XSquarePlusOneGrad(x, dy):
-      dx = functional_ops._symbolic_gradient(
+      dx = functional_ops.symbolic_gradient(
           input=[x, dy], Tout=[dtypes.float32], f="XSquarePlusOneFn", name="dx")
       return dx
 
@@ -220,10 +240,11 @@ class FunctionTest(test.TestCase):
 
     inp = np.array([-1, 1, 2, -2], dtype=np.float32)
     feed = {x: inp}
-    cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-        optimizer_options=config_pb2.OptimizerOptions(
-            opt_level=config_pb2.OptimizerOptions.L1,
-            do_function_inlining=True)))
+    cfg = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                opt_level=config_pb2.OptimizerOptions.L1,
+                do_function_inlining=True)))
     with session.Session(graph=g, config=cfg) as sess:
       out, = sess.run(dx, feed)
     self.assertAllClose(1 - np.square(np.tanh(inp)), out)
@@ -295,7 +316,7 @@ class FunctionTest(test.TestCase):
       # gradient function is (x, y, dz) -> (dx, dy).  dx's shape
       # should be the same as x's; and dy's shape should be the same
       # as y's.
-      dx, dy = functional_ops._symbolic_gradient(
+      dx, dy = functional_ops.symbolic_gradient(
           input=[x, y, dz], Tout=[dtypes.float32] * 2, f="Foo")
       self.assertEqual(x.get_shape(), dx.get_shape())
       self.assertEqual(y.get_shape(), dy.get_shape())
@@ -315,18 +336,20 @@ class FunctionTest(test.TestCase):
       y = Foo(x)
       dx, = gradients_impl.gradients(y, [x])
 
-    cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-        optimizer_options=config_pb2.OptimizerOptions(
-            opt_level=config_pb2.OptimizerOptions.L0,
-            do_common_subexpression_elimination=True,
-            do_function_inlining=True,
-            do_constant_folding=True)))
+    cfg = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                opt_level=config_pb2.OptimizerOptions.L0,
+                do_common_subexpression_elimination=True,
+                do_function_inlining=True,
+                do_constant_folding=True)))
 
     with self.test_session(graph=g, config=cfg):
       self.assertAllClose(y.eval(), 6.)
       self.assertAllClose(dx.eval(), 2.)
 
   def _testZNoDepOnY(self, use_const_grad_ys):
+
     @function.Defun(dtypes.float32, dtypes.float32)
     def Foo(x, y):  # pylint: disable=unused-argument
       return x * 2
@@ -412,7 +435,6 @@ class FunctionTest(test.TestCase):
                                    "assertion failed.*-3"):
         self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0)
 
-  @test_util.disable_c_api   # Op._add_control_inputs doesn't work with C API
   def testAssertWrapper(self):
 
     @function.Defun(dtypes.float32)
@@ -427,7 +449,6 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
-  @test_util.disable_c_api   # Op._add_control_inputs doesn't work with C API
   def testWhileLoopCallsFunc(self):
     with self.test_session(use_gpu=True) as sess:
 
@@ -447,7 +468,6 @@ class FunctionTest(test.TestCase):
       ans = sess.run(loop)
       self.assertAllClose(ans, 131072.)
 
-  @test_util.disable_c_api   # Op._add_control_inputs doesn't work with C API
   def testControlFlowStrictness(self):
     """Inlined functions must not execute in a untaken control flow branch."""
 
@@ -725,9 +745,16 @@ class FunctionTest(test.TestCase):
 
       y = Foo(constant_op.constant([[10.]]))
 
+      @function.Defun()
+      def Bar():
+        return w
+
+      z = Bar()
+
     with self.test_session(graph=g):
       variables.global_variables_initializer().run()
       self.assertAllEqual(y.eval(), [[12.0]])
+      self.assertAllEqual(z.eval(), [[1.0]])
 
   def testCaptureControls(self):
     g = ops.Graph()
@@ -752,9 +779,9 @@ class FunctionTest(test.TestCase):
 
       @function.Defun()
       def Foo():
-        return control_flow_ops.while_loop(lambda i: i < 10,
-                                           lambda i: i + x,
+        return control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + x,
                                            [0])
+
       y = Foo()
 
     with self.test_session(graph=g) as sess:
@@ -767,9 +794,8 @@ class FunctionTest(test.TestCase):
 
       @function.Defun(dtypes.bool)
       def Foo(pred):
-        return control_flow_ops.cond(pred,
-                                     lambda: x,
-                                     lambda: x + 1)
+        return control_flow_ops.cond(pred, lambda: x, lambda: x + 1)
+
       y = Foo(True)
       z = Foo(False)
 
@@ -922,6 +948,7 @@ class FunctionTest(test.TestCase):
     self.assertEqual(len(f.signature.input_arg), 3)
 
   def testGradientWithIntegerFunctionArgument(self):
+
     @function.Defun(dtypes.int32, dtypes.float32)
     def Foo(t, x):
       return x[t]
@@ -936,8 +963,7 @@ class FunctionTest(test.TestCase):
     x = np.zeros((2,)).astype(np.float32)
     with session.Session(graph=g) as sess:
       self.assertAllClose(
-          np.array([1.0, 0.0]).astype(np.float32),
-          sess.run(dinp, {inp: x}))
+          np.array([1.0, 0.0]).astype(np.float32), sess.run(dinp, {inp: x}))
 
   def testFunctionMarkedStateful(self):
 
@@ -1027,8 +1053,85 @@ class FunctionTest(test.TestCase):
         self.assertEqual(44.0, sess.run(f_1))
         self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
 
+  def testGuaranteedConstsAreCaptured(self):
+    var = variables.Variable(1.0)
+    const = array_ops.guarantee_const(var)
+    also_const = array_ops.identity(const)
+    still_const = array_ops.identity(also_const)
+    not_const = still_const + var
+    also_not_const = array_ops.placeholder(dtypes.float32)
+
+    @function.Defun()
+    def CapturesGuaranteedConst():
+      output = const + also_const + still_const + not_const + also_not_const
+      first, second, third, fourth, fifth = function.get_extra_args()
+      self.assertEqual("GuaranteeConst", first.consumers()[0].node_def.op)
+      self.assertEqual("GuaranteeConst", second.consumers()[0].node_def.op)
+      self.assertEqual("GuaranteeConst", third.consumers()[0].node_def.op)
+      self.assertNotEqual("GuaranteeConst", fourth.consumers()[0].node_def.op)
+      self.assertNotEqual("GuaranteeConst", fifth.consumers()[0].node_def.op)
+      return output
+
+    with self.test_session(use_gpu=False) as sess:
+      sess.run(var.initializer)
+      _ = sess.run(CapturesGuaranteedConst(), {also_not_const: 1.0})
+
+  def testSameFunctionDifferentGrads(self):
+
+    def PartOne(x):
+
+      # Default grad is dx = dy * 2
+      @function.Defun(dtypes.float32)
+      def Foo(x):
+        return x * 2
+
+      return Foo(x)
+
+    def PartTwo(x):
+
+      @function.Defun(dtypes.float32, dtypes.float32)
+      def Bar(x, dy):
+        return x + dy  # crazy backprop
+
+      @function.Defun(dtypes.float32, grad_func=Bar)
+      def Foo(x):
+        return x * 2
 
-@test_util.with_c_api
+      return Foo(x)
+
+    def PartThree(x):
+
+      def Bar(op, dy):
+        return op.inputs[0] * dy / 2  # crazy backprop
+
+      @function.Defun(dtypes.float32, python_grad_func=Bar)
+      def Foo(x):
+        return x * 2
+
+      return Foo(x)
+
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(100.)
+      x0 = x
+      y0 = PartOne(x0)
+      dx0, = gradients_impl.gradients(ys=[y0], xs=[x0])
+      x1 = x
+      y1 = PartTwo(x1)
+      dx1, = gradients_impl.gradients(ys=[y1], xs=[x1])
+      x2 = x
+      y2 = PartThree(x2)
+      dx2, = gradients_impl.gradients(ys=[y2], xs=[x2])
+
+    with self.test_session(graph=g) as sess:
+      v0, v1, v2 = sess.run([dx0, dx1, dx2])
+
+    self.assertAllEqual(v0, 2.)
+    self.assertAllEqual(v1, 101.)
+    self.assertAllEqual(v2, 50.)
+
+
+@test_util.with_c_shapes
 class FunctionsFromProtos(test.TestCase):
 
   def expectFunctionsEqual(self, func, grad_func=None, new_func=None):
@@ -1220,8 +1323,18 @@ class FunctionsFromProtos(test.TestCase):
         ValueError, "FunctionDefLibrary contains cyclic gradient functions!"):
       function._from_library(library)
 
+  def testExperimentalAttrs(self):
+
+    @function.Defun(dtypes.int32, experimental_tag="tag_value")
+    def FunctionWithAttr(i):
+      return array_ops.identity(i)
+
+    self.assertTrue("experimental_tag" in FunctionWithAttr.definition.attr)
+    self.assertEqual(FunctionWithAttr.definition.attr["experimental_tag"].s,
+                     b"tag_value")
+
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionOverloadTest(test.TestCase):
 
   def testBasic(self):
@@ -1274,7 +1387,7 @@ class FunctionOverloadTest(test.TestCase):
                      "Successor of x.")
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionCaptureByValueTest(test.TestCase):
 
   def testCaptureByValue(self):
@@ -1304,7 +1417,7 @@ class FunctionCaptureByValueTest(test.TestCase):
       self.assertAllEqual(y.eval(), [[12.0]])
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class UnrollLSTMTest(test.TestCase):
   BATCH_SIZE = 16
   LSTM_DIMS = 32
@@ -1326,7 +1439,7 @@ class UnrollLSTMTest(test.TestCase):
         value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
     new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
         i_g) * math_ops.tanh(i_i)
-    new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
+    new_c = math_ops.maximum(math_ops.minimum(new_c, 50.0), -50.0)
     new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
     return new_m, new_c
 
@@ -1346,7 +1459,8 @@ class UnrollLSTMTest(test.TestCase):
       return Loop(cell, weights, inp)
 
     cell = function.Defun(dtypes.float32, dtypes.float32, dtypes.float32,
-                          dtypes.float32)(cell)
+                          dtypes.float32)(
+                              cell)
     if mode == "cell":
       # Just represent the LSTM as a function.
       return Loop(cell, weights, inp)
@@ -1440,17 +1554,18 @@ class UnrollLSTMTest(test.TestCase):
       self.assertAllClose(d0, d3, rtol=1e-4, atol=1e-4)
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionInlineControlTest(test.TestCase):
 
   def testFoo(self):
     dtype = dtypes.float32
-    cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-        optimizer_options=config_pb2.OptimizerOptions(
-            opt_level=config_pb2.OptimizerOptions.L0,
-            do_common_subexpression_elimination=True,
-            do_function_inlining=True,
-            do_constant_folding=True)))
+    cfg = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                opt_level=config_pb2.OptimizerOptions.L0,
+                do_common_subexpression_elimination=True,
+                do_function_inlining=True,
+                do_constant_folding=True)))
     cell_func_call_pattern = re.compile(r"Cell[^/]*\(")
     for noinline in [False, True]:
 
@@ -1508,10 +1623,6 @@ def Linear2(w1, b1, w2, b2, x):
   return Linear(w2, b2, Linear(w1, b1, x))
 
 
-# Set C API before defining module level functions
-ops._USE_C_API = True
-
-
 @function.Defun(*[dtypes.float32] * 3)
 def LinearWithCApi(w, b, x):
   return nn_ops.relu(math_ops.matmul(x, w) + b)
@@ -1522,25 +1633,9 @@ def Linear2WithCApi(w1, b1, w2, b2, x):
   return LinearWithCApi(w2, b2, LinearWithCApi(w1, b1, x))
 
 
-# Unset C API after defining module level functions
-ops._USE_C_API = False
-
-
 class ModuleFunctionTest(test.TestCase):
 
   def testBasic(self):
-    with ops.Graph().as_default():
-      a, b, c, d, e = [
-          constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5)
-      ]
-      y = Linear(a, b, c)
-      z = Linear2(a, b, c, d, e)
-      with session.Session() as sess:
-        self.assertAllEqual([[1]], sess.run(y))
-        self.assertAllEqual([[5]], sess.run(z))
-
-  @test_util.enable_c_api
-  def testBasicWithCApi(self):
     with ops.Graph().as_default():
       a, b, c, d, e = [
           constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5)
@@ -1552,7 +1647,7 @@ class ModuleFunctionTest(test.TestCase):
         self.assertAllEqual([[5]], sess.run(z))
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class VariableHoistingTest(test.TestCase):
 
   def _testSimpleModel(self, use_forward_func, use_resource=False):
diff --git a/tensorflow/python/framework/graph_util.py b/tensorflow/python/framework/graph_util.py
index a666630e44b0ecceef0ae3a720544b74c67090cc..c5cc1107343a7a23d177bbe9b8de3d9e2921c60b 100644
--- a/tensorflow/python/framework/graph_util.py
+++ b/tensorflow/python/framework/graph_util.py
@@ -28,14 +28,3 @@ from tensorflow.python.framework.graph_util_impl import must_run_on_cpu
 from tensorflow.python.framework.graph_util_impl import remove_training_nodes
 from tensorflow.python.framework.graph_util_impl import tensor_shape_from_node_def_name
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    # TODO(drpng): find a good place to reference this.
-    "convert_variables_to_constants",
-    "extract_sub_graph",
-    "must_run_on_cpu",
-    "tensor_shape_from_node_def_name",
-    "remove_training_nodes",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 5a543317e665a940841714fd72d834a430f8406a..394fac6c856197030f85aab5b11fa881eddf670d 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -235,7 +235,7 @@ def convert_variables_to_constants(sess,
   variable_names = []
   variable_dict_names = []
   for node in inference_graph.node:
-    if node.op in ["Variable", "VariableV2"]:
+    if node.op in ["Variable", "VariableV2", "VarHandleOp"]:
       variable_name = node.name
       if ((variable_names_whitelist is not None and
            variable_name not in variable_names_whitelist) or
@@ -243,7 +243,10 @@ def convert_variables_to_constants(sess,
            variable_name in variable_names_blacklist)):
         continue
       variable_dict_names.append(variable_name)
-      variable_names.append(variable_name + ":0")
+      if node.op == "VarHandleOp":
+        variable_names.append(variable_name + "/Read/ReadVariableOp:0")
+      else:
+        variable_names.append(variable_name + ":0")
   if variable_names:
     returned_variables = sess.run(variable_names)
   else:
@@ -266,12 +269,23 @@ def convert_variables_to_constants(sess,
               tensor=tensor_util.make_tensor_proto(
                   data, dtype=dtype.type, shape=data.shape)))
       how_many_converted += 1
+    elif input_node.op == "ReadVariableOp" and (
+        input_node.input[0] in found_variables):
+      # The preceding branch converts all VarHandleOps of ResourceVariables to
+      # constants, so we need to convert the associated ReadVariableOps to
+      # Identity ops.
+      output_node.op = "Identity"
+      output_node.name = input_node.name
+      output_node.input.extend([input_node.input[0]])
+      output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
+      if "_class" in input_node.attr:
+        output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
     else:
       output_node.CopyFrom(input_node)
     output_graph_def.node.extend([output_node])
 
   output_graph_def.library.CopyFrom(inference_graph.library)
-  print("Converted %d variables to const ops." % how_many_converted)
+  logging.info("Converted %d variables to const ops.", how_many_converted)
   return output_graph_def
 
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 0421837d49de753d642aed59d1524619a243dcb8..2dafb94ba7e4e779c8703096795f4e49139100ab 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops as math_ops_lib
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -47,46 +48,46 @@ class DeviceFunctionsTest(test.TestCase):
 
   def testTwoDeviceFunctions(self):
     with ops.Graph().as_default() as g:
-      var_0 = gen_state_ops._variable(
+      var_0 = gen_state_ops.variable(
           shape=[1],
           dtype=dtypes.float32,
           name="var_0",
           container="",
           shared_name="")
       with g.device(test_device_func_pin_variable_to_cpu):
-        var_1 = gen_state_ops._variable(
+        var_1 = gen_state_ops.variable(
             shape=[1],
             dtype=dtypes.float32,
             name="var_1",
             container="",
             shared_name="")
-      var_2 = gen_state_ops._variable(
+      var_2 = gen_state_ops.variable(
           shape=[1],
           dtype=dtypes.float32,
           name="var_2",
           container="",
           shared_name="")
-      var_3 = gen_state_ops._variable(
+      var_3 = gen_state_ops.variable(
           shape=[1],
           dtype=dtypes.float32,
           name="var_3",
           container="",
           shared_name="")
       with g.device(test_device_func_pin_variable_to_cpu):
-        var_4 = gen_state_ops._variable(
+        var_4 = gen_state_ops.variable(
             shape=[1],
             dtype=dtypes.float32,
             name="var_4",
             container="",
             shared_name="")
         with g.device("/device:GPU:0"):
-          var_5 = gen_state_ops._variable(
+          var_5 = gen_state_ops.variable(
               shape=[1],
               dtype=dtypes.float32,
               name="var_5",
               container="",
               shared_name="")
-        var_6 = gen_state_ops._variable(
+        var_6 = gen_state_ops.variable(
             shape=[1],
             dtype=dtypes.float32,
             name="var_6",
@@ -208,7 +209,7 @@ class DeviceFunctionsTest(test.TestCase):
           defun_node, 2.0, name="output_node")
 
       with session.Session() as sess:
-        init = variables.initialize_variables([variable_node])
+        init = variables.variables_initializer([variable_node])
         sess.run(init)
         output = sess.run(output_node)
         self.assertNear(4.0, output, 0.00001)
@@ -226,52 +227,62 @@ class DeviceFunctionsTest(test.TestCase):
                          constant_graph_def.library)
 
   def testConvertVariablesToConsts(self):
-    with ops.Graph().as_default():
-      variable_node = variables.Variable(1.0, name="variable_node")
-      _ = variables.Variable(1.0, name="unused_variable_node")
-      output_node = math_ops_lib.multiply(
-          variable_node, 2.0, name="output_node")
-      with session.Session() as sess:
-        init = variables.initialize_variables([variable_node])
-        sess.run(init)
-        output = sess.run(output_node)
-        self.assertNear(2.0, output, 0.00001)
-        variable_graph_def = sess.graph.as_graph_def()
-        # First get the constant_graph_def when variable_names_whitelist is set,
-        # note that if variable_names_whitelist is not set an error will be
-        # thrown because unused_variable_node is not initialized.
-        constant_graph_def = graph_util.convert_variables_to_constants(
-            sess,
-            variable_graph_def, ["output_node"],
-            variable_names_whitelist=set(["variable_node"]))
+    self._test_variable_to_const_conversion(use_resource=False)
 
-        # Then initialize the unused variable, and get another
-        # constant_graph_def when variable_names_whitelist is not set.
-        sess.run(variables.global_variables_initializer())
-        constant_graph_def_without_variable_whitelist = (
-            graph_util.convert_variables_to_constants(sess, variable_graph_def,
-                                                      ["output_node"]))
-
-        # The unused variable should be cleared so the two graphs should be
-        # equivalent.
-        self.assertEqual(
-            str(constant_graph_def),
-            str(constant_graph_def_without_variable_whitelist))
-
-        # Test variable name black list. This should result in the variable not
-        # being a const.
-        sess.run(variables.global_variables_initializer())
-        constant_graph_def_with_blacklist = (
-            graph_util.convert_variables_to_constants(
-                sess,
-                variable_graph_def, ["output_node"],
-                variable_names_blacklist=set(["variable_node"])))
-        variable_node = None
-        for node in constant_graph_def_with_blacklist.node:
-          if node.name == "variable_node":
-            variable_node = node
-        self.assertIsNotNone(variable_node)
-        self.assertEqual(variable_node.op, "VariableV2")
+  def testConvertResourceVariablesToConsts(self):
+    self._test_variable_to_const_conversion(use_resource=True)
+
+  def _test_variable_to_const_conversion(self, use_resource):
+    with ops.Graph().as_default():
+      with variable_scope.variable_scope("", use_resource=use_resource):
+        variable_node = variable_scope.get_variable(
+            "variable_node", initializer=1.0)
+        another_variable = variable_scope.get_variable(
+            "unused_variable_node", initializer=1.0)
+        output_node = math_ops_lib.multiply(
+            variable_node, 2.0, name="output_node")
+        with session.Session() as sess:
+          sess.run(variable_node.initializer)
+          output = sess.run(output_node)
+          self.assertNear(2.0, output, 0.00001)
+          variable_graph_def = sess.graph.as_graph_def()
+          # First get the constant_graph_def when variable_names_whitelist is
+          # set, note that if variable_names_whitelist is not set an error will
+          # be thrown because unused_variable_node is not initialized.
+          constant_graph_def = graph_util.convert_variables_to_constants(
+              sess,
+              variable_graph_def, ["output_node"],
+              variable_names_whitelist=set(["variable_node"]))
+
+          # Then initialize the unused variable, and get another
+          # constant_graph_def when variable_names_whitelist is not set.
+          sess.run(another_variable.initializer)
+          constant_graph_def_without_variable_whitelist = (
+              graph_util.convert_variables_to_constants(
+                  sess, variable_graph_def, ["output_node"]))
+
+          # The unused variable should be cleared so the two graphs should be
+          # equivalent.
+          self.assertEqual(
+              str(constant_graph_def),
+              str(constant_graph_def_without_variable_whitelist))
+
+          # Test variable name black list. This should result in the variable
+          # not being a const.
+          constant_graph_def_with_blacklist = (
+              graph_util.convert_variables_to_constants(
+                  sess,
+                  variable_graph_def, ["output_node"],
+                  variable_names_blacklist=set(["variable_node"])))
+          variable_node = None
+          for node in constant_graph_def_with_blacklist.node:
+            if node.name == "variable_node":
+              variable_node = node
+          self.assertIsNotNone(variable_node)
+          if use_resource:
+            self.assertEqual(variable_node.op, "VarHandleOp")
+          else:
+            self.assertEqual(variable_node.op, "VariableV2")
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
@@ -279,8 +290,9 @@ class DeviceFunctionsTest(test.TestCase):
       _ = importer.import_graph_def(constant_graph_def, name="")
       self.assertEqual(4, len(constant_graph_def.node))
       for node in constant_graph_def.node:
-        self.assertNotEqual("Variable", node.op)
-        self.assertNotEqual("VariableV2", node.op)
+        self.assertNotIn(
+            node.op,
+            ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"])
       with session.Session() as sess:
         output_node = sess.graph.get_tensor_by_name("output_node:0")
         output = sess.run(output_node)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 6ecc1a40ae14760dd39242aaf595b32a9decdc9f..5112bea48b5033e2cd16a555d65993b575f475eb 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -301,14 +301,17 @@ def _ProcessNewOps(graph):
   colocation_pairs = {}
 
   for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
+    original_device = new_op.device
+    new_op._set_device('')  # pylint: disable=protected-access
     colocation_names = _GetColocationNames(new_op)
     if colocation_names:
       colocation_pairs[new_op] = colocation_names
-      # Don't apply this op's device function, since colocation constraints
-      # override device functions. Note that this op's device may still be set
-      # by the loop below.
+      # Don't set a device for this op, since colocation constraints override
+      # device functions and the original device. Note that this op's device may
+      # still be set by the loop below.
+      # TODO(skyewm): why does it override the original device?
     else:
-      with _MaybeDevice(new_op.device):
+      with _MaybeDevice(original_device):
         graph._apply_device_functions(new_op)  # pylint: disable=protected-access
 
   # The following loop populates the device field of ops that are colocated
@@ -475,38 +478,45 @@ def import_graph_def(graph_def,
     _PopulateTFImportGraphDefOptions(options, prefix, input_map,
                                      return_elements)
 
-    with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
-      try:
-        with errors.raise_exception_on_not_ok_status() as status:
+    # _ProcessNewOps mutates the new operations. _lock ensures a Session.run
+    # call cannot occur between creating the TF_Operations in the
+    # TF_GraphImportGraphDefWithResults call and mutating the them in
+    # _ProcessNewOps.
+    with graph._lock:  # pylint: disable=protected-access
+      with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
+        try:
           results = c_api.TF_GraphImportGraphDefWithResults(
-              graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
-      except errors.InvalidArgumentError as e:
-        # Convert to ValueError for backwards compatibility.
-        raise ValueError(str(e))
+              graph._c_graph, serialized, options)  # pylint: disable=protected-access
+          results = c_api_util.ScopedTFImportGraphDefResults(results)
+        except errors.InvalidArgumentError as e:
+          # Convert to ValueError for backwards compatibility.
+          raise ValueError(str(e))
+
+      # Create _DefinedFunctions for any imported functions.
+      #
+      # We do this by creating _DefinedFunctions directly from `graph_def`, and
+      # adding them to `graph`. Adding an existing function to a TF_Graph is a
+      # no-op, so this only has the effect of updating the Python state (usually
+      # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
+      #
+      # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
+      # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
+      # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
+      # _USE_C_SHAPES is removed.
+      if graph_def.library and graph_def.library.function:
+        # pylint: disable=protected-access
+        functions = function._from_library(graph_def.library)
+        for f in functions:
+          f.add_to_graph(graph)
+        # pylint: enable=protected-access
 
-    _ProcessNewOps(graph)
-
-    # Create _DefinedFunctions for any imported functions.
-    #
-    # We do this by creating _DefinedFunctions directly from `graph_def`, and
-    # adding them to `graph`. Adding an existing function to a TF_Graph is a
-    # no-op, so this only has the effect of updating the Python state (usually
-    # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
-    #
-    # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
-    # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
-    if graph_def.library and graph_def.library.function:
-      # pylint: disable=protected-access
-      functions = function._from_library(graph_def.library)
-      for f in functions:
-        f.add_to_graph(graph)
-      # pylint: enable=protected-access
+      _ProcessNewOps(graph)
 
     # Treat input mappings that don't appear in the graph as an error, because
     # they are likely to be due to a typo.
     missing_unused_input_keys = (
         c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
-            results))
+            results.results))
     if missing_unused_input_keys:
       missing_unused_input_keys = [
           compat.as_str(s) for s in missing_unused_input_keys
@@ -518,7 +528,7 @@ def import_graph_def(graph_def,
     if return_elements is None:
       return None
     else:
-      return _GatherReturnElements(return_elements, graph, results)
+      return _GatherReturnElements(return_elements, graph, results.results)
 
   else:
     g = graph
@@ -562,7 +572,14 @@ def import_graph_def(graph_def,
         if node.name in name_to_op:
           raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
         if node.op not in op_dict:
-          raise ValueError('No op named %s in defined operations.' % node.op)
+          raise ValueError(
+              'No op named %s in defined operations. If the Graph you are '
+              'importing uses custom ops or any parts of tf.contrib, you '
+              'should explicitly import the libraries defining those ops '
+              'before loading the Graph. Note that tf.contrib is lazily loaded '
+              'when accessed, so simply referencing (e.g.) '
+              '`tf.contrib.resampler` will cause those ops to be made '
+              'available.' % node.op)
         op_def = op_dict[node.op]
 
         output_types = _OutputTypes(node, op_dict)
@@ -675,11 +692,10 @@ def import_graph_def(graph_def,
                      ', '.join(x.name for x in op._input_types))))
         # pylint: enable=protected-access
 
-        if not g._is_function(op.type):  # pylint: disable=protected-access
-          # Execute shape inference for this op.
-          # NOTE(mrry): If the graph contains a cycle, the full shape
-          # information may not be available for this op's inputs.
-          ops.set_shapes_for_outputs(op)
+        # Execute shape inference for this op.
+        # NOTE(mrry): If the graph contains a cycle, the full shape
+        # information may not be available for this op's inputs.
+        ops.set_shape_and_handle_data_for_outputs(op)
         # For nodes with _output_shapes set, set the output shapes.
         if '_output_shapes' in op.node_def.attr:
           for i, output in enumerate(op.outputs):
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index bf5d9fe0936882c242198bdc7118f9f3a4e79260..2c913d1e028e15e293158fe180e263a78c514ee4 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -218,6 +219,23 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(outer_inner.name, "outer/inner_1")
       self.assertEqual(outer_inner_c.name, "outer/inner/c_1")
 
+  def testEmptyNameScope(self):
+    with ops.Graph().as_default():
+      # Create name scope but don't create any ops with it
+      with ops.name_scope("foo"):
+        pass
+
+      # Import graph def that uses name scope name
+      op, = importer.import_graph_def(
+          self._MakeGraphDef("node { name: 'foo' op: 'IntOutput' }"),
+          return_elements=["foo"],
+          name="")
+
+      if ops._USE_C_API:
+        self.assertEqual(op.name, "foo")
+      else:
+        self.assertEqual(op.name, "foo_1")
+
   def testInputMap(self):
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
@@ -356,6 +374,39 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d._input_types, [dtypes.int32_ref, dtypes.int32])
       self.assertEqual(d.outputs, [])
 
+  def testResources(self):
+    # Produce GraphDef containing a ops producing and consuming resources.
+    graph = ops.Graph()
+    with graph.as_default():
+      var = resource_variable_ops.ResourceVariable(1.0)
+      var_assign = var.assign(2.0)
+      # Use an op that requires handle shape to be set.
+      var_shape = resource_variable_ops.variable_shape(var.handle)
+      init = variables.global_variables_initializer()
+    graph_def = graph.as_graph_def()
+
+    # Import the GraphDef.
+    with ops.Graph().as_default():
+      # pylint: disable=unused-variable
+      imported_var, imported_assign, imported_shape, imported_init = (
+          importer.import_graph_def(
+              graph_def,
+              return_elements=[var.name, var_assign.name, var_shape.name,
+                               init.name]))
+
+      # Make sure the handle shape is set on the imported variable.
+      new_var_shape = resource_variable_ops.variable_shape(imported_var)
+      # pylint: enable=unused-variable
+
+      # Run the imported graph.
+      # TODO(b/76173421): make this work (currently DCHECKS)
+      # with self.test_session() as sess:
+      #   sess.run(imported_init)
+      #   self.assertEqual(sess.run(imported_var), 1.0)
+      #   self.assertEqual(sess.run(imported_assign), 2.0)
+      #   self.assertEqual(list(sess.run(imported_shape)), [])
+      #   self.assertEqual(list(sess.run(new_var_shape)), [])
+
   def testWhileLoop(self):
     # Produce GraphDef containing while loop.
     graph = ops.Graph()
@@ -680,6 +731,49 @@ class ImportGraphDefTest(test.TestCase):
           "list { s: 'loc:@imported_graph/A' }",
           b.node_def.attr["_class"])
 
+  def testColocationAndDevice(self):
+    # A and B are colocated, device set on A.
+    original_graph_def = self._MakeGraphDef("""
+          node { name: 'A' op: 'None' device: '/device:CPU:0' attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }
+          node { name: 'B' op: 'None'  attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }""")
+
+    with ops.Graph().as_default():
+      a, b = importer.import_graph_def(original_graph_def,
+                                       return_elements=["A", "B"],
+                                       name="")
+      self.assertEqual(a.device, "/device:CPU:0")
+      self.assertEqual(b.device, "/device:CPU:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
+
+    # A and B are colocated, device set on B.
+    original_graph_def = self._MakeGraphDef("""
+          node { name: 'A' op: 'None' attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }
+          node { name: 'B' op: 'None' device: '/device:CPU:0' attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }""")
+
+    with ops.Graph().as_default():
+      a, b = importer.import_graph_def(original_graph_def,
+                                       return_elements=["A", "B"],
+                                       name="")
+      # TODO(skyewm): this behavior seems inconsistent with the above. Why is
+      # B's device ignored?
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "")
+      self.assertEqual(a.colocation_groups(), [b"loc:@A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
+
   def testColocationWithDeviceFn(self):
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' attr {
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 1f2aa264c110930b318f30e3a24010a96ebce47e..9a8477debb05fdf11d7d8c39adf9e5e55cc2caeb 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -26,7 +26,6 @@ import threading  # pylint: disable=unused-import
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-import
 from tensorflow.python import pywrap_tensorflow as py_tf
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,13 +53,12 @@ def load_op_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library or get the python wrappers.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  lib_handle = py_tf.TF_LoadLibrary(library_filename)
 
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
   op_list.ParseFromString(compat.as_bytes(op_list_str))
-  wrappers = py_tf.GetPythonWrappers(op_list_str)
+  wrappers = py_tf.GetEagerPythonWrappers(op_list_str)
 
   # Delete the library handle to release any memory held in C
   # that are no longer needed.
@@ -99,5 +97,4 @@ def load_file_system_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  py_tf.TF_LoadLibrary(library_filename)
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 4c1bd736d727e974375ad9008a579361137fb9d6..923e76fc9c8f231cc9a43bc05280dac1ea458d3c 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -439,9 +439,10 @@ def add_collection_def(meta_graph_def, key, graph=None,
       else:
         getattr(col_def, kind).value.extend([x for x in collection_list])
   except Exception as e:  # pylint: disable=broad-except
-    logging.warning("Error encountered when serializing %s.\n"
+    logging.warning("Issue encountered when serializing %s.\n"
                     "Type is unsupported, or the types of the items don't "
-                    "match field type in CollectionDef.\n%s", key, str(e))
+                    "match field type in CollectionDef. Note this is a warning "
+                    "and probably safe to ignore.\n%s", key, str(e))
     if key in meta_graph_def.collection_def:
       del meta_graph_def.collection_def[key]
     return
@@ -695,7 +696,7 @@ def import_scoped_meta_graph(meta_graph_or_file,
   Raises:
     ValueError: If the graph_def contains unbound inputs.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError("Exporting/importing meta graphs is not supported when "
                      "eager execution is enabled.")
   if isinstance(meta_graph_or_file, meta_graph_pb2.MetaGraphDef):
@@ -737,7 +738,9 @@ def import_scoped_meta_graph(meta_graph_or_file,
         import_scope or "", mark_as_used=False)
 
     importer.import_graph_def(
-        input_graph_def, name=(import_scope or ""), input_map=input_map,
+        input_graph_def,
+        name=(import_scope or scope_to_prepend_to_names),
+        input_map=input_map,
         producer_op_list=producer_op_list)
 
     # Restores all the other collections.
@@ -856,7 +859,7 @@ def export_scoped_meta_graph(filename=None,
   Raises:
     ValueError: When the `GraphDef` is larger than 2GB.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError("Exporting/importing meta graphs is not supported when "
                      "Eager Execution is enabled.")
   graph = graph or ops.get_default_graph()
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 19dcd6a1b34741290b2578d93b79883c103fdb1b..0532ed464cc7a2d7e1eb2908c360aed1316648ea 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -476,11 +476,12 @@ class ScopedMetaGraphTest(test.TestCase):
     # Create a simple while loop.
     with ops.Graph().as_default():
       with ops.name_scope("export"):
-        var = variables.Variable(0)
+        var = variables.Variable(0.)
         var_name = var.name
-        _, output = control_flow_ops.while_loop(lambda i, x: i < 5,
-                                                lambda i, x: (i + 1, x + i),
-                                                [0, var])
+        _, output = control_flow_ops.while_loop(
+            lambda i, x: i < 5,
+            lambda i, x: (i + 1, x + math_ops.cast(i, dtypes.float32)),
+            [0, var])
         output_name = output.name
 
       # Generate a MetaGraphDef containing the while loop with an export scope.
@@ -522,6 +523,31 @@ class ScopedMetaGraphTest(test.TestCase):
         actual_grad_value = sess.run(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
+  def testImportWhileLoopInWhileLoop(self):
+    # Create a simple while loop.
+    with ops.Graph().as_default():
+      var = variables.Variable(0.0)
+      _, output = control_flow_ops.while_loop(lambda i, x: i < 5,
+                                              lambda i, x: (i + 1, x * 2.0),
+                                              [0, var])
+      output_name = output.name
+
+      # Generate a MetaGraphDef containing the while loop with an export scope.
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph()
+
+    # Restore the MetaGraphDef in a while loop in a new graph.
+    with ops.Graph().as_default():
+
+      def body(i, _):
+        meta_graph.import_scoped_meta_graph(meta_graph_def)
+        return i + 1, ops.get_default_graph().get_tensor_by_name(output_name)
+
+      _, x = control_flow_ops.while_loop(lambda i, x: i < 2, body, [0, 0.0],
+                                         name="")
+      with session.Session() as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(x)
+
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -537,6 +563,21 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "foo/bar/myvar:0")
 
+  def testScopedImportUnderNameScopeNoVarScope(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      variables.Variable(initial_value=1.0, trainable=True, name="myvar")
+    meta_graph_def, _ = meta_graph.export_scoped_meta_graph(graph=graph)
+
+    graph = ops.Graph()
+    with graph.as_default():
+      with ops.name_scope("foo"):
+        imported_variables = meta_graph.import_scoped_meta_graph(
+            meta_graph_def)
+        self.assertEqual(len(imported_variables), 1)
+        self.assertEqual(list(imported_variables.values())[0].name,
+                         "foo/myvar:0")
+
   def testImportsUsingSameScopeName(self):
     with ops.Graph().as_default():
       variables.Variable(0, name="v")
@@ -905,20 +946,6 @@ class ExportImportAcrossScopesTest(test.TestCase):
       with variable_scope.variable_scope("importA/keepA"):
         graph_fn(use_resource=use_resource)
 
-      if use_resource:
-        # Bringing in collections that contain ResourceVariables will adds ops
-        # to the graph the first time a variable is encountered, so mimic the
-        # same behavior.
-        seen_variables = set()
-        for collection_key in sorted([
-            ops.GraphKeys.GLOBAL_VARIABLES,
-            ops.GraphKeys.TRAINABLE_VARIABLES,
-        ]):
-          for var in expected_graph.get_collection(collection_key):
-            if var not in seen_variables:
-              var._read_variable_op()
-              seen_variables.add(var)
-
     result = meta_graph.export_scoped_meta_graph(graph=imported_graph)[0]
     expected = meta_graph.export_scoped_meta_graph(graph=expected_graph)[0]
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 398b3f67e20660dc23f8fb339774ad0e3b2eff9d..dd9acdd9ebb817261b1ac77c9434afaf0caf0fd3 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import copy
+import functools
 import linecache
 import os
 import re
@@ -42,6 +43,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -62,7 +64,8 @@ from tensorflow.python.util.tf_export import tf_export
 # calls to the C API. Currently disabled by default but can be manually enabled
 # in code or via the environment variable. This will be removed once all
 # functionality is supported and there's no performance penalty with it enabled.
-_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "0") is not "0"
+_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "1") is not "0"
+_USE_C_SHAPES = os.getenv("TF_C_API_GRAPH_CONSTRUCTION_SHAPES", "0") is not "0"
 
 
 def tensor_id(tensor):
@@ -287,14 +290,26 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-    self._shape_val = tensor_shape.unknown_shape()
+
+    if _USE_C_API:
+      # This will be set by set_shape_and_handle_data_for_outputs.
+      self._shape_val = None
+    else:
+      # The Python code requires all tensors start with a shape to support shape
+      # inference on imported while loops. This isn't necessary with the C API
+      # enabled because the C API provides the shapes for imported nodes.
+      # TODO(skyewm): remove when _USE_C_API is removed.
+      self._shape_val = tensor_shape.unknown_shape()
+
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
 
-    # Attributes used for C++ shape inference. Not inspected, only forwarded.
-    # If set, will be a HandleData object from cpp_shape_inference.proto.
-    self._handle_data = None
+    if not _USE_C_SHAPES:
+      # Attributes used for C++ shape inference. Not inspected, only forwarded.
+      # If set, will be a HandleData object from cpp_shape_inference.proto.
+      self._handle_data = None
+
     self._id = uid()
 
   @property
@@ -368,21 +383,45 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
-    if _USE_C_API:
-      graph = self._op._graph._c_graph  # pylint: disable=protected-access
-      with errors.raise_exception_on_not_ok_status() as status:
-        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
-                                                  status)
-      if num_dims == -1:
-        dim_list = None
+    if self._shape_val is None:
+      if _USE_C_SHAPES:
+        self._shape_val = self._c_api_shape()
       else:
-        with errors.raise_exception_on_not_ok_status() as status:
-          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
-              graph, self._as_tf_output(), num_dims, status)
-        dim_list = [None if i == -1 else i for i in dim_list]
-      return tensor_shape.TensorShape(dim_list)
+        assert _USE_C_API
+        # Call set_shape_and_handle_data_for_outputs in topological order on all
+        # ops that are needed to compute self.op's shape. We do this instead of
+        # having set_shape_and_handle_data_for_outputs recursively call
+        # Operation.shape on self.op.inputs to overflowing the call stack.
+        need_shapes = self._get_input_ops_without_shapes(self.op)
+        need_shapes.sort(key=lambda op: op._id)
+        for op in need_shapes:
+          set_shape_and_handle_data_for_outputs(op)
     return self._shape_val
 
+  def _get_input_ops_without_shapes(self, target_op):
+    """Returns ops needing shape inference to compute target_op's shape."""
+    result = []
+    stack = [self._op]
+    visited = set()
+    while stack:
+      op = stack.pop()
+      if op in visited: continue
+      result.append(op)
+      stack.extend(t.op for t in op.inputs if t._shape_val is None)
+      visited.add(op)
+    return result
+
+  def _c_api_shape(self):
+    """Returns the TensorShape of this tensor according to the C API."""
+    c_graph = self._op._graph._c_graph  # pylint: disable=protected-access
+    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+        c_graph, self._as_tf_output())
+    if unknown_shape:
+      return tensor_shape.unknown_shape()
+    else:
+      shape_vector = [None if d == -1 else d for d in shape_vector]
+      return tensor_shape.TensorShape(shape_vector)
+
   @property
   def _shape(self):
     logging.warning("Tensor._shape is private, use Tensor.shape "
@@ -395,10 +434,10 @@ class Tensor(_TensorLike):
         "Tensor._shape cannot be assigned, use Tensor.set_shape instead.")
 
   def __iter__(self):
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       raise TypeError(
-          "`Tensor` objects are not iterable when eager execution is not "
-          "enabled. To iterate over this tensor use `tf.map_fn`.")
+          "Tensor objects are not iterable when eager execution is not "
+          "enabled. To iterate over this tensor use tf.map_fn.")
     shape = self._shape_tuple()
     if shape is None:
       raise TypeError("Cannot iterate over a tensor with unknown shape.")
@@ -466,9 +505,16 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if not _USE_C_API:
-      self._shape_val = self._shape_val.merge_with(shape)
-      return
+    if _USE_C_SHAPES:  # pylint: disable=protected-access
+      # Reset cached shape.
+      self._shape_val = None
+    else:
+      self._shape_val = self.shape.merge_with(shape)
+
+    if not self._op._graph._c_graph: return
+
+    # Update C shape even if _USE_C_SHAPES = False, since we still want
+    # set_shape to be reflected in the C API graph for when we run it.
     if not isinstance(shape, tensor_shape.TensorShape):
       shape = tensor_shape.TensorShape(shape)
     dim_list = []
@@ -482,13 +528,11 @@ class Tensor(_TensorLike):
         else:
           dim_list.append(dim.value)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_GraphSetTensorShape_wrapper(
-            self._op._graph._c_graph,  # pylint: disable=protected-access
-            self._as_tf_output(),
-            dim_list,
-            unknown_shape,
-            status)
+      c_api.TF_GraphSetTensorShape_wrapper(
+          self._op._graph._c_graph,  # pylint: disable=protected-access
+          self._as_tf_output(),
+          dim_list,
+          unknown_shape)
     except errors.InvalidArgumentError as e:
       # Convert to ValueError for backwards compatibility.
       raise ValueError(str(e))
@@ -577,6 +621,16 @@ class Tensor(_TensorLike):
     # Necessary to support Python's collection membership operators
     return id(self) == id(other)
 
+  def __copy__(self):
+    # Make sure _shape_val is computed before we copy.
+    # TODO(b/77597810): get rid of Tensor copies.
+    if self._shape_val is None:
+      set_shape_and_handle_data_for_outputs(self.op)
+    cls = self.__class__
+    result = cls.__new__(cls)
+    result.__dict__.update(self.__dict__)
+    return result
+
   # NOTE(mrry): This enables the Tensor's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Tensor class higher priority than an ndarray, or a
@@ -772,7 +826,7 @@ class _EagerTensorBase(Tensor):
       six.raise_from(core._status_to_exception(e.code, e.message), None)
 
     # Record the copy on tape and define backprop copy as well.
-    if not context.in_graph_mode():
+    if context.executing_eagerly():
       self_device = self.device
       def grad_fun(dresult):
         return [dresult._copy(device_name=self_device)]
@@ -782,7 +836,11 @@ class _EagerTensorBase(Tensor):
 
   @property
   def shape(self):
-    return tensor_shape.TensorShape(self._shape_tuple())
+    if self._tensor_shape is None:  # pylint: disable=access-member-before-definition
+      # `_tensor_shape` is declared and defined in the definition of
+      # `EagerTensor`, in C.
+      self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
+    return self._tensor_shape
 
   def get_shape(self):
     """Alias of Tensor.shape."""
@@ -829,41 +887,51 @@ class _EagerTensorBase(Tensor):
   def set_shape(self, shape):
     if not self.shape.is_compatible_with(shape):
       raise ValueError(
-          "EagerTensor's shape %s is not compatible with supplied shape %s" %
+          "Tensor's shape %s is not compatible with supplied shape %s" %
           (self.shape, shape))
 
   # Methods not supported / implemented for Eager Tensors.
   @property
   def op(self):
-    raise AttributeError("op not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.op is meaningless when eager execution is enabled.")
 
   @property
   def graph(self):
-    raise AttributeError("graph not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.graph is meaningless when eager execution is enabled.")
 
   @property
   def name(self):
-    raise AttributeError("name not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.name is meaningless when eager execution is enabled.")
 
   @property
   def value_index(self):
-    raise AttributeError("value_index not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.value_index is meaningless when eager execution is enabled.")
 
   def consumers(self):
-    raise NotImplementedError("consumers not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "Tensor.consumers is meaningless when eager execution is enabled.")
 
   def _add_consumer(self, consumer):
-    raise NotImplementedError("_add_consumer not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "_add_consumer not supported when eager execution is enabled.")
 
   def _as_node_def_input(self):
     raise NotImplementedError(
-        "_as_node_def_input not supported for Eager Tensors.")
+        "_as_node_def_input not supported when eager execution is enabled.")
 
   def _as_tf_output(self):
-    raise NotImplementedError("_as_tf_output not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "_as_tf_output not supported when eager execution is enabled.")
 
   def eval(self, feed_dict=None, session=None):
-    raise NotImplementedError("eval not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "eval is not supported when eager execution is enabled, "
+        "is .numpy() what you're looking for?"
+    )
 
 
 # This call creates an EagerTensor class, as a subclass of _EagerTensorBase, and
@@ -989,7 +1057,7 @@ def internal_convert_to_tensor(value,
 
   """
   if ctx is None: ctx = context.context()
-  if ctx.in_eager_mode():
+  if ctx.executing_eagerly():
     # Fast path for EagerTensors that don't need any conversion.
     if isinstance(value, EagerTensor):
       # Note that we don't check that value's dtype matches the dtype
@@ -1317,6 +1385,22 @@ def register_tensor_conversion_function(base_type,
     if not callable(conversion_func):
       raise TypeError("conversion_func must be callable.")
 
+    # context._context is checked so that we don't inadvertently create it.
+    # This is because enable_eager_execution will fail when called from the main
+    # function if the context._context is already created, and the
+    # register_tensor_conversion_function calls happen when the module is
+    # imported.
+    if context._context is not None and context.executing_eagerly(
+    ) and isinstance(base_type, six.integer_types + (
+        float,
+        np.ndarray,
+    )):
+      # TODO(nareshmodi): consider setting a context variable which disables the
+      # fastpath instead.
+      raise TypeError(
+          "Cannot register conversions for numpy arrays, python number types "
+          "when executing eagerly.")
+
     try:
       funcs_at_priority = _tensor_conversion_func_registry[priority]
     except KeyError:
@@ -1493,13 +1577,10 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
     serialized = attr_value.SerializeToString()
     # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
     # It might be worth creating a convenient way to re-use the same status.
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_api.TF_SetAttrValueProto(op_desc,
-                                 compat.as_str(name), serialized, status)
+    c_api.TF_SetAttrValueProto(op_desc, compat.as_str(name), serialized)
 
   try:
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_op = c_api.TF_FinishOperation(op_desc, status)
+    c_op = c_api.TF_FinishOperation(op_desc)
   except errors.InvalidArgumentError as e:
     # Convert to ValueError for backwards compatibility.
     raise ValueError(str(e))
@@ -1644,6 +1725,9 @@ class Operation(object):
       self._control_inputs_val = control_input_ops
       self._node_def_val = copy.deepcopy(node_def)
       self._op_def_val = op_def
+    else:
+      # This will be set by self.inputs.
+      self._inputs_val = None
 
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
@@ -1897,7 +1981,8 @@ class Operation(object):
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
 
-  def _update_input(self, index, tensor):
+  # TODO(skyewm): Remove `update_dtype` when we enable the C API.
+  def _update_input(self, index, tensor, update_dtype=True):
     """Update the input to this operation at the given index.
 
     NOTE: This is for TF internal use only. Please don't use it.
@@ -1905,6 +1990,7 @@ class Operation(object):
     Args:
       index: the index of the input to update.
       tensor: the Tensor to be used as the input at the given index.
+      update_dtype: If `False`, the type for this input is not updated.
 
     Raises:
       TypeError: if tensor is not a Tensor,
@@ -1914,17 +2000,25 @@ class Operation(object):
     if not isinstance(tensor, Tensor):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
+
+    # Make sure output shapes are already computed for this op in case we create
+    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
+    # lazily upon request.
+    if not _USE_C_SHAPES:
+      set_shape_and_handle_data_for_outputs(self)
+
     if self._c_op:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.UpdateEdge(
-            self._graph._c_graph,  # pylint: disable=protected-access
-            tensor._as_tf_output(),  # pylint: disable=protected-access
-            self._tf_input(index),
-            status)
+      # Reset cached inputs.
+      self._inputs_val = None
+      c_api.UpdateEdge(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._tf_input(index))
     else:
       self._inputs_val[index].consumers().remove(self)
       self._inputs_val[index] = tensor
-      self._input_types_val[index] = tensor.dtype
+      if update_dtype:
+        self._input_types_val[index] = tensor.dtype
       tensor._add_consumer(self)  # pylint: disable=protected-access
       self._recompute_node_def()
 
@@ -2030,15 +2124,18 @@ class Operation(object):
   def inputs(self):
     """The list of `Tensor` objects representing the data inputs of this op."""
     if self._c_op:
-      tf_outputs = c_api.GetOperationInputs(self._c_op)
-      # pylint: disable=protected-access
-      retval = [
-          self.graph._get_tensor_by_tf_output(tf_output)
-          for tf_output in tf_outputs
-      ]
-      # pylint: enable=protected-access
-      return Operation._InputList(retval)
-    return Operation._InputList(self._inputs_val)
+      if self._inputs_val is None:
+        tf_outputs = c_api.GetOperationInputs(self._c_op)
+        # pylint: disable=protected-access
+        retval = [
+            self.graph._get_tensor_by_tf_output(tf_output)
+            for tf_output in tf_outputs
+        ]
+        # pylint: enable=protected-access
+        self._inputs_val = Operation._InputList(retval)
+      return self._inputs_val
+    else:
+      return Operation._InputList(self._inputs_val)
 
   @property
   def _inputs(self):
@@ -2091,6 +2188,30 @@ class Operation(object):
     else:
       return self._control_inputs_val
 
+  @property
+  def _control_outputs(self):
+    """The `Operation` objects which have a control dependency on this op.
+
+    Before any of the ops in self._control_outputs can execute tensorflow will
+    ensure self has finished executing.
+
+    Returns:
+      A list of `Operation` objects.
+
+    """
+    if self._c_op:
+      control_c_ops = c_api.TF_OperationGetControlOutputs_wrapper(self._c_op)
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(
+              c_api.TF_OperationName(c_op)) for c_op in control_c_ops
+      ]
+      # pylint: enable=protected-access
+    else:
+      # TODO(apassos) this should be less inefficient.
+      return [o for o in self._graph.get_operations()
+              if self in o.control_inputs]
+
   @property
   def _control_inputs(self):
     logging.warning("Operation._control_inputs is private, use "
@@ -2137,8 +2258,7 @@ class Operation(object):
     # pylint: enable=line-too-long
     if self._c_op:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_OperationToNodeDef(self._c_op, buf, status)
+        c_api.TF_OperationToNodeDef(self._c_op, buf)
         data = c_api.TF_GetBuffer(buf)
       node_def = node_def_pb2.NodeDef()
       node_def.ParseFromString(compat.as_bytes(data))
@@ -2196,11 +2316,9 @@ class Operation(object):
       buf = c_api.TF_NewBufferFromString(
           compat.as_bytes(attr_value.SerializeToString()))
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf,
-                        status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf)
+        # pylint: enable=protected-access
       finally:
         c_api.TF_DeleteBuffer(buf)
     else:
@@ -2222,8 +2340,7 @@ class Operation(object):
     if self._c_op:
       try:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf, status)
+          c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
           data = c_api.TF_GetBuffer(buf)
       except errors.InvalidArgumentError as e:
         # Convert to ValueError for backwards compatibility.
@@ -2432,28 +2549,41 @@ class RegisterShape(object):
     return f
 
 
-def _set_shapes_for_outputs_c_api(op):
-  """set_shapes_for_outputs implementation when C API is enabled."""
-  # The C API computes the shapes when the TF_Operation is created. Fetch the
-  # output shapes from the C object.
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def _set_shape_and_handle_data_for_outputs_c_api(op):
+  """Set shapes and resource handle data using info from the C API."""
+  assert not _USE_C_SHAPES
   for output in op.outputs:
-    with errors.raise_exception_on_not_ok_status() as status:
-      # pylint: disable=protected-access
-      shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
-          op._graph._c_graph, output._as_tf_output(), status)
-      # pylint: enable=protected-access
-    if unknown_shape:
-      output.set_shape(tensor_shape.unknown_shape())
-    elif not shape_vector:
-      output.set_shape(tensor_shape.scalar())
+    output._shape_val = output._c_api_shape()
+    # Set the resource handle data for compatibility with the Python shape
+    # inference code.
+    serialized = c_api.GetResourceHandleShapeAndType(op._graph._c_graph,
+                                                     output._as_tf_output())
+    if serialized:
+      output._handle_data = (
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
+          .FromString(compat.as_bytes(serialized)))
     else:
-      shape_vector = [None if d == -1 else d for d in shape_vector]
-      output.set_shape(tensor_shape.TensorShape(shape_vector))
+      output._handle_data = None
+
 
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def set_shape_and_handle_data_for_outputs(op):
+  """Set the shapes and resource handle data for op's outputs.
+
+  When _USE_C_API = True, this is lazily called when a tensor's shape is first
+  requested. Usually this should work automatically, but some edge cases may
+  require manaully calling this first to make sure Tensor._shape_val and
+  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
+  Tensor).
+  """
+  if _USE_C_SHAPES: return
+
+  if op.graph._is_function(op.type):
+    for output in op.outputs:
+      output._shape_val = tensor_shape.unknown_shape()
+    return
 
-# TODO(skyewm): remove this when _USE_C_API flag is removed.
-def _set_shapes_for_outputs(op):
-  """set_shapes_for_outputs implementation when C API is disabled."""
   try:
     shape_func = _shape_registry.lookup(op.type)
   except LookupError:
@@ -2472,8 +2602,10 @@ def _set_shapes_for_outputs(op):
     shapes = shapes_dict["shapes"]
     handle_datas = shapes_dict["handle_data"]
     for output, handle_data in zip(op.outputs, handle_datas):
+      # Don't override any existing handle data that may have been manually set.
       # pylint: disable=protected-access
-      output._handle_data = handle_data
+      if output._handle_data is None:
+        output._handle_data = handle_data
       # pylint: enable=protected-access
 
   if len(op.outputs) != len(shapes):
@@ -2481,15 +2613,8 @@ def _set_shapes_for_outputs(op):
         "Shape function for op %s returned %d shapes but expected %d %s %s" %
         (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
   for output, s in zip(op.outputs, shapes):
-    output.set_shape(s)
-
-
-def set_shapes_for_outputs(op):
-  """Set the shapes for op's outputs."""
-  if op._c_op:  # pylint: disable=protected-access
-    return _set_shapes_for_outputs_c_api(op)
-  else:
-    return _set_shapes_for_outputs(op)
+    output._shape_val = tensor_shape.unknown_shape()
+    output._shape_val = output._shape_val.merge_with(s)
 
 
 class OpStats(object):
@@ -2690,32 +2815,41 @@ class Graph(object):
 
   def __init__(self):
     """Creates a new, empty Graph."""
-    # Protects the core state that may be accessed by multiple readers.
-    # Only state that can be returned via public accessors (`as_graph_def()`,
-    # `get_operations()`, `as_graph_element()`, `get_collection()`, and
-    # `get_collection_ref()`) is by the lock. Thread-safety is provided on a
-    # best-effort basis to support buggy programs, and is not guaranteed by the
-    # public `tf.Graph` API.
+    # Protects core state that can be returned via public accessors, as well as
+    # synchronizes Session.run calls with methods that create and mutate ops
+    # (e.g. Graph.create_op()). This synchronization is necessary because it's
+    # illegal to modify an operation after it's been run. Thread-safety is
+    # provided on a best-effort basis to support buggy programs, and is not
+    # guaranteed by the public `tf.Graph` API.
+    #
+    # The lock must be reentrant because create_op can be called recursively due
+    # to control flow. Without a reentrant lock, many methods would also need a
+    # "locked" version or parameter (including generated code).
+    #
     # NOTE(mrry): This does not protect the various stacks. A warning will
     # be reported if these are used from multiple threads
-    self._lock = threading.Lock()
+    self._lock = threading.RLock()
     self._nodes_by_id = dict()  # GUARDED_BY(self._lock)
     self._next_id_counter = 0  # GUARDED_BY(self._lock)
     self._nodes_by_name = dict()  # GUARDED_BY(self._lock)
     self._version = 0  # GUARDED_BY(self._lock)
-    # Current name stack: uniquified names
-    self._name_stack = ""
     # Maps a name used in the graph to the next id to use for that name.
     self._names_in_use = {}
+    self._stack_state_is_thread_local = False
+    self._thread_local = threading.local()
     # Functions that will be applied to choose a device if none is specified.
-    self._device_function_stack = []
+    # After switch_to_thread_local(), self._thread_local._device_function_stack
+    # is used instead.
+    self._graph_device_function_stack = []
     # Default original_op applied to new ops.
     self._default_original_op = None
     # Current control flow context. It could be either CondContext or
     # WhileContext defined in ops/control_flow_ops.py
     self._control_flow_context = None
     # A new node will depend of the union of all of the nodes in the stack.
-    self._control_dependencies_stack = []
+    # After switch_to_thread_local(),
+    # self._thread_local._control_dependencies_stack is used instead.
+    self._graph_control_dependencies_stack = []
     # Arbitrary collections of objects.
     self._collections = {}
     # The graph-level random seed
@@ -2737,8 +2871,9 @@ class Graph(object):
         producer=versions.GRAPH_DEF_VERSION,
         min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER)
     self._building_function = False
-    # Stack of colocate_with ops
-    self._colocation_stack = []
+    # Stack of colocate_with ops. After switch_to_thread_local(),
+    # self._thread_local._colocation_stack is used instead.
+    self._graph_colocation_stack = []
     # Set of tensors that are dangerous to feed!
     self._unfeedable_tensors = set()
     # Set of operations that are dangerous to fetch!
@@ -2756,21 +2891,27 @@ class Graph(object):
     # being called inside function definitions behave as if they were seeing the
     # actual outside graph).
     self._graph_key = "grap-key-%d/" % (uid(),)
+    # A string with the last reduction method passed to
+    # losses.compute_weighted_loss(), or None.
+    self._last_loss_reduction = None
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
-    if _USE_C_API or self._use_c_api_hack():
+    if self._use_c_api_hack():
       self._scoped_c_graph = c_api_util.ScopedTFGraph()
+      # The C API requires all ops to have shape functions. Disable this
+      # requirement (many custom ops do not have shape functions, and we don't
+      # want to break these existing cases).
+      c_api.SetRequireShapeInferenceFns(self._c_graph, False)
     else:
       self._scoped_c_graph = None
-    self._variable_creator_stack = []
 
   # TODO(apassos) remove once the C API is used by default.
   def _use_c_api_hack(self):
     """Temporary hack; can be overridden to force C API usage."""
-    return False
+    return _USE_C_API
 
   def _convert_stack(self, stack, include_func_start_lineno=False):
     """Converts a stack extracted using _extract_stack() to a traceback stack.
@@ -2806,17 +2947,26 @@ class Graph(object):
   # frozen, and this functionality is still not ready for public visibility.
   @tf_contextlib.contextmanager
   def _variable_creator_scope(self, creator):
+    # This step makes a copy of the existing stack, and it also initializes
+    # self._thread_local._variable_creator_stack if it doesn't exist yet.
     old = list(self._variable_creator_stack)
-    self._variable_creator_stack.append(creator)
+    self._thread_local._variable_creator_stack.append(creator)
     try:
       yield
     finally:
-      self._variable_creator_stack = old
+      self._thread_local._variable_creator_stack = old
 
   # Note: this method is private because the API of tf.Graph() is public and
   # frozen, and this functionality is still not ready for public visibility.
-  def _get_variable_creator_stack(self):
-    return list(self._variable_creator_stack)
+  @property
+  def _variable_creator_stack(self):
+    if not hasattr(self._thread_local, "_variable_creator_stack"):
+      self._thread_local._variable_creator_stack = []
+    return list(self._thread_local._variable_creator_stack)
+
+  @_variable_creator_stack.setter
+  def _variable_creator_stack(self, variable_creator_stack):
+    self._thread_local._variable_creator_stack = variable_creator_stack
 
   def _extract_stack(self):
     """A lightweight, extensible re-implementation of traceback.extract_stack.
@@ -2929,8 +3079,7 @@ class Graph(object):
     # pylint: enable=line-too-long
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_GraphVersions(self._c_graph, buf, status)
+        c_api.TF_GraphVersions(self._c_graph, buf)
         data = c_api.TF_GetBuffer(buf)
       version_def = versions_pb2.VersionDef()
       version_def.ParseFromString(compat.as_bytes(data))
@@ -3030,11 +3179,10 @@ class Graph(object):
 
     """
     # pylint: enable=line-too-long
-    if _USE_C_API:
+    if self._c_graph:
       with self._lock:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_GraphToGraphDef(self._c_graph, buf, status)
+          c_api.TF_GraphToGraphDef(self._c_graph, buf)
           data = c_api.TF_GetBuffer(buf)
         graph = graph_pb2.GraphDef()
         graph.ParseFromString(compat.as_bytes(data))
@@ -3143,14 +3291,12 @@ class Graph(object):
       # remove this when all functions are generated using the C API by default
       # as this will be unnecessary.
       if not function._c_func:
-        with errors.raise_exception_on_not_ok_status() as status:
-          serialized = function.definition.SerializeToString()
-          function._c_func = c_api.TF_FunctionImportFunctionDef(
-              serialized, status)
-      with errors.raise_exception_on_not_ok_status() as status:
-        gradient = function._grad_func._c_func if function._grad_func else None
-        c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
-                                   status)
+        serialized = function.definition.SerializeToString()
+        c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+        function._c_func = c_api_util.ScopedTFFunction(c_func)
+      gradient = (function._grad_func._c_func.func if function._grad_func
+                  else None)
+      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
     else:
       # If there is already a function with the same name, raise an error
       # if bodies are different. Else, do nothing. The C API version above
@@ -3248,17 +3394,30 @@ class Graph(object):
 
     input_ops = set([t.op for t in inputs])
     control_inputs = self._control_dependencies_for_inputs(input_ops)
-    ret = Operation(
-        node_def,
-        self,
-        inputs=inputs,
-        output_types=dtypes,
-        control_inputs=control_inputs,
-        input_types=input_types,
-        original_op=self._default_original_op,
-        op_def=op_def)
-    self._create_op_helper(ret, compute_shapes=compute_shapes,
-                           compute_device=compute_device)
+    # _create_op_helper mutates the new Operation. _lock ensures a Session.run
+    # call cannot occur between creating and mutating the op.
+    with self._lock:
+      ret = Operation(
+          node_def,
+          self,
+          inputs=inputs,
+          output_types=dtypes,
+          control_inputs=control_inputs,
+          input_types=input_types,
+          original_op=self._default_original_op,
+          op_def=op_def)
+
+      # Note: shapes are lazily computed with the C API enabled.
+      #
+      # TODO(skyewm): unlike in the original Python implementation, the C API
+      # always computes shape information (even for function calls, which the
+      # original Python shape inference code doesn't handle). Deprecate the
+      # compute_shapes argument.
+      if not _USE_C_API and compute_shapes:
+        set_shape_and_handle_data_for_outputs(ret)
+
+      self._create_op_helper(ret, compute_shapes=compute_shapes,
+                             compute_device=compute_device)
     return ret
 
   def _create_op_from_tf_operation(self, c_op, compute_device=True):
@@ -3283,22 +3442,17 @@ class Graph(object):
     """
     self._check_not_finalized()
     ret = Operation(c_op, self)
-    assert ret.name not in self._names_in_use
-    self._names_in_use[ret.name] = 1
+    # If a name_scope was created with ret.name but no nodes were created in it,
+    # the name will still appear in _names_in_use even though the name hasn't
+    # been used. This is ok, just leave _names_in_use as-is in this case.
+    # TODO(skyewm): make the C API guarantee no name conflicts.
+    if ret.name not in self._names_in_use:
+      self._names_in_use[ret.name] = 1
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
   def _create_op_helper(self, op, compute_shapes=True, compute_device=True):
     """Common logic for creating an op in this graph."""
-    # TODO(vrv): Instead of eagerly filling in shape property for every op, only
-    # populate the shape when requested.
-    #
-    # TODO(skyewm): unlike in the original Python implementation, the C API
-    # always computes shape information (even for function calls, which the
-    # original Python shape inference code doesn't handle). Deprecate the
-    # compute_shapes argument.
-    if op._c_op or compute_shapes:  # pylint: disable=protected-access
-      set_shapes_for_outputs(op)
     # TODO(b/XXXX): move to Operation.__init__ once _USE_C_API flag is removed.
     self._add_op(op)
 
@@ -3350,9 +3504,9 @@ class Graph(object):
           if (op.device and pydev.canonical_name(op.device) !=
               pydev.canonical_name(colocation_op.device)):
             logging.warning("Tried to colocate %s with an op %s that had "
-                            "a different device: %s vs %s. "
-                            "Ignoring colocation property.", op.name,
-                            colocation_op.name, op.device,
+                            "a different device: %s vs %s. Postponing "
+                            "error-checking until all devices are assigned.",
+                            op.name, colocation_op.name, op.device,
                             colocation_op.device)
           else:
             op._set_device(colocation_op.device)  # pylint: disable=protected-access
@@ -3402,12 +3556,17 @@ class Graph(object):
         for c_op in c_api_util.new_tf_operations(self)
     ]
 
+    # pylint: disable=protected-access
     for op in new_ops:
+      # Operations created by the C API always retrieve shapes from the C API so
+      # we preserve the shapes of ops created in import_graph_def (from the
+      # "_output_shapes" attr of the imported NodeDef).
+      if not _USE_C_SHAPES:
+        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
-      # pylint: disable=protected-access
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
-      # pylint: enable=protected-access
+    # pylint: enable=protected-access
 
     return new_ops
 
@@ -3653,11 +3812,9 @@ class Graph(object):
     """Returns the `OpDef` proto for `type`. `type` is a string."""
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.TF_GraphGetOpDef(self._c_graph,
-                                 compat.as_bytes(type), buf, status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf)
+        # pylint: enable=protected-access
         data = c_api.TF_GetBuffer(buf)
       op_def = op_def_pb2.OpDef()
       op_def.ParseFromString(compat.as_bytes(data))
@@ -3850,6 +4007,17 @@ class Graph(object):
     finally:
       self._default_original_op = old_original_op
 
+  @property
+  def _name_stack(self):
+    # This may be called from a thread where name_stack doesn't yet exist.
+    if not hasattr(self._thread_local, "_name_stack"):
+      self._thread_local._name_stack = ""
+    return self._thread_local._name_stack
+
+  @_name_stack.setter
+  def _name_stack(self, name_stack):
+    self._thread_local._name_stack = name_stack
+
   # pylint: disable=g-doc-return-or-yield,line-too-long
   @tf_contextlib.contextmanager
   def name_scope(self, name):
@@ -4028,6 +4196,19 @@ class Graph(object):
     """
     return self._name_stack
 
+  @tf_contextlib.contextmanager
+  def _colocate_with_for_gradient(self, op, gradient_uid,
+                                  ignore_existing=False):
+    with self.colocate_with(op, ignore_existing):
+      if gradient_uid is not None and self._control_flow_context is not None:
+        try:
+          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
+          yield
+        finally:
+          self._control_flow_context.ExitGradientColocation(op, gradient_uid)
+      else:
+        yield
+
   @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
@@ -4422,6 +4603,22 @@ class Graph(object):
         return tf.matmul(tensor, tensor)
     ```
 
+    Also note that though execution of ops created under this scope will trigger
+    execution of the dependencies, the ops created under this scope might still
+    be pruned from a normal tensorflow graph. For example, in the following
+    snippet of code the dependencies are never executed:
+
+    ```python
+      loss = model.loss()
+      with tf.control_dependencies(dependencies):
+        loss = loss + tf.constant(1)  # note: dependencies ignored in the
+                                      # backward pass
+      return tf.gradients(loss, model.variables)
+    ```
+
+    This is because evaluating the gradient graph does not require evaluating
+    the constant(1) op created in the forward pass.
+
     Args:
       control_inputs: A list of `Operation` or `Tensor` objects which
         must be executed or computed before running the operations
@@ -4669,6 +4866,79 @@ class Graph(object):
     else:
       return tensor_or_op not in self._unfetchable_ops
 
+  def switch_to_thread_local(self):
+    """Make device, colocation and dependencies stacks thread-local.
+
+    Device, colocation and dependencies stacks are not thread-local be default.
+    If multiple threads access them, then the state is shared.  This means that
+    one thread may affect the behavior of another thread.
+
+    After this method is called, the stacks become thread-local.  If multiple
+    threads access them, then the state is not shared.  Each thread uses its own
+    value; a thread doesn't affect other threads by mutating such a stack.
+
+    The initial value for every thread's stack is set to the current value
+    of the stack when `switch_to_thread_local()` was first called.
+    """
+    if not self._stack_state_is_thread_local:
+      self._stack_state_is_thread_local = True
+
+  @property
+  def _device_function_stack(self):
+    if self._stack_state_is_thread_local:
+      # This may be called from a thread where device_function_stack doesn't yet
+      # exist.
+      if not hasattr(self._thread_local, "_device_function_stack"):
+        self._thread_local._device_function_stack = (
+            self._graph_device_function_stack[:])
+      return self._thread_local._device_function_stack
+    else:
+      return self._graph_device_function_stack
+
+  @_device_function_stack.setter
+  def _device_function_stack(self, device_function_stack):
+    if self._stack_state_is_thread_local:
+      self._thread_local._device_function_stack = device_function_stack
+    else:
+      self._graph_device_function_stack = device_function_stack
+
+  @property
+  def _colocation_stack(self):
+    if self._stack_state_is_thread_local:
+      # This may be called from a thread where colocation_stack doesn't yet
+      # exist.
+      if not hasattr(self._thread_local, "_colocation_stack"):
+        self._thread_local._colocation_stack = self._graph_colocation_stack[:]
+      return self._thread_local._colocation_stack
+    else:
+      return self._graph_colocation_stack
+
+  @_colocation_stack.setter
+  def _colocation_stack(self, colocation_stack):
+    if self._stack_state_is_thread_local:
+      self._thread_local._colocation_stack = colocation_stack
+    else:
+      self._graph_colocation_stack = colocation_stack
+
+  @property
+  def _control_dependencies_stack(self):
+    if self._stack_state_is_thread_local:
+      # This may be called from a thread where control_dependencies_stack
+      # doesn't yet exist.
+      if not hasattr(self._thread_local, "_control_dependencies_stack"):
+        self._thread_local._control_dependencies_stack = (
+            self._graph_control_dependencies_stack[:])
+      return self._thread_local._control_dependencies_stack
+    else:
+      return self._graph_control_dependencies_stack
+
+  @_control_dependencies_stack.setter
+  def _control_dependencies_stack(self, control_dependencies):
+    if self._stack_state_is_thread_local:
+      self._thread_local._control_dependencies_stack = control_dependencies
+    else:
+      self._graph_control_dependencies_stack = control_dependencies
+
 
 # TODO(agarwal): currently device directives in an outer eager scope will not
 # apply to inner graph mode code. Fix that.
@@ -4693,15 +4963,15 @@ def device(device_name_or_function):
   Raises:
     RuntimeError: If eager execution is enabled and a function is passed in.
   """
-  if context.in_graph_mode():
-    return get_default_graph().device(device_name_or_function)
-  else:
+  if context.executing_eagerly():
     # TODO(agarwal): support device functions in EAGER mode.
     if callable(device_name_or_function):
       raise RuntimeError(
           "tf.device does not support functions when eager execution "
           "is enabled.")
     return context.device(device_name_or_function)
+  else:
+    return get_default_graph().device(device_name_or_function)
 
 
 @tf_export("container")
@@ -4718,15 +4988,27 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
-@tf_export("colocate_with")
-def colocate_with(op, ignore_existing=False):
-  if context.in_graph_mode():
-    return get_default_graph().colocate_with(op, ignore_existing)
-  else:
+def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
+  if context.executing_eagerly():
     if op is not None:
       return device(op.device)
     else:
       return _NullContextmanager()
+  else:
+    default_graph = get_default_graph()
+    if isinstance(op, EagerTensor):
+      if default_graph.building_function:
+        return default_graph.device(op.device)
+      else:
+        raise ValueError("Encountered an Eager-defined Tensor during graph "
+                         "construction, but a function was not being built.")
+    return default_graph._colocate_with_for_gradient(
+        op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
+
+
+@tf_export("colocate_with")
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
 
 
 @tf_export("control_dependencies")
@@ -4736,20 +5018,29 @@ def control_dependencies(control_inputs):
   See @{tf.Graph.control_dependencies}
   for more details.
 
+  When eager execution is enabled, any callable object in the `control_inputs`
+  list will be called.
+
   Args:
     control_inputs: A list of `Operation` or `Tensor` objects which
       must be executed or computed before running the operations
       defined in the context.  Can also be `None` to clear the control
-      dependencies.
+      dependencies. If eager execution is enabled, any callable object in the
+      `control_inputs` list will be called.
 
   Returns:
    A context manager that specifies control dependencies for all
    operations constructed within the context.
   """
-  if context.in_graph_mode():
-    return get_default_graph().control_dependencies(control_inputs)
-  else:
+  if context.executing_eagerly():
+    if control_inputs:
+      # Excute any pending callables.
+      for control in control_inputs:
+        if callable(control):
+          control()
     return _NullContextmanager()
+  else:
+    return get_default_graph().control_dependencies(control_inputs)
 
 
 class _DefaultStack(threading.local):
@@ -4970,11 +5261,33 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     try:
-      context.context_stack.push(default.building_function, default.as_default)
+      if context.executing_eagerly():
+        # A Graph alone on the context stack would keep init_scope-wrapped
+        # operations graph building when entered (assuming init_scope is called
+        # in a graph building context). Instead, we push a context which first
+        # enables eager execution and then re-enters the Graph.
+        context.context().context_switches.push(
+            default.building_function,
+            functools.partial(
+                _enter_context_and_graph,
+                context.eager_mode,
+                default.as_default))
+      else:
+        # This Graph is being used from a graph building context. A lack of
+        # context switch implies that the context is graph building.
+        context.context().context_switches.push(default.building_function,
+                                                default.as_default)
       with super(_DefaultGraphStack, self).get_controller(default) as g:
         yield g
     finally:
-      context.context_stack.pop()
+      context.context().context_switches.pop()
+
+
+@tf_contextlib.contextmanager
+def _enter_context_and_graph(context_fn, graph_fn):
+  """Combines two context managers."""
+  with context_fn(), graph_fn():
+    yield
 
 
 _default_graph_stack = _DefaultGraphStack()
@@ -5000,87 +5313,130 @@ def init_scope():
         graph function. Here, a context is defined as either a graph or an eager
         context. Every context switch, i.e., every installation of a graph as
         the default graph and every switch into eager mode, is logged in a
-        thread-local stack called the `context_stack`; the log entry for a
+        thread-local stack called `context_switches`; the log entry for a
         context switch is popped from the stack when the context is exited.
-        Entering an `init_scope` is equivalent to crawling up the
-        `context_stack`, finding the first context that is not building a graph
-        function, and entering it. A caveat is that if graph mode is enabled
-        but the default graph stack is empty, then entering an `init_scope`
-        will simply install a fresh graph as the default one.
+        Entering an `init_scope` is equivalent to crawling up
+        `context_switches`, finding the first context that is not building a
+        graph function, and entering it. A caveat is that if graph mode is
+        enabled but the default graph stack is empty, then entering an
+        `init_scope` will simply install a fresh graph as the default one.
 
     (3) The gradient tape is paused while the scope is active.
   """
   # pylint: enable=g-doc-return-or-yield,line-too-long
 
-  in_graph_mode = context.in_graph_mode()
-  # Retrieve the active name scope: entering an `init_scope` preserves
-  # the name scope of the current context.
-  if in_graph_mode:
+  if context.executing_eagerly():
+    # Fastpath.
+    with tape.stop_recording():
+      yield
+  else:
+    # Retrieve the active name scope: entering an `init_scope` preserves
+    # the name scope of the current context.
     default_graph = get_default_graph()
     scope = default_graph.get_name_scope()
-  else:
-    scope = context.context().scope_name
-  if scope and scope[-1] != '/':
-    # Names that end with trailing slashes are treated by `name_scope` as
-    # absolute.
-    scope = scope + '/'
-
-  outer_context = None
-  if in_graph_mode and not _default_graph_stack.stack:
-    outer_context = default_graph.as_default
-  else:
-    for stack_entry in reversed(context.context_stack.stack):
-      if not stack_entry.is_building_function:
-        outer_context = stack_entry.enter_context_fn
-        break
+    if scope and scope[-1] != '/':
+      # Names that end with trailing slashes are treated by `name_scope` as
+      # absolute.
+      scope = scope + '/'
+
+    outer_context = None
+    if not _default_graph_stack.stack:
+      # If the default graph stack is empty, then we cannot be building a
+      # function. Install the global graph (which, in this case, is also the
+      # default graph) as the outer context.
+      if default_graph.building_function:
+        raise RuntimeError("The global graph is building a function.")
+      outer_context = default_graph.as_default
+    else:
+      # Find a context that is not building a function.
+      for stack_entry in reversed(context.context().context_switches.stack):
+        if not stack_entry.is_building_function:
+          outer_context = stack_entry.enter_context_fn
+          break
+
+      if outer_context is None:
+        # As a last resort, obtain the global default graph; this graph doesn't
+        # necessarily live on the graph stack (and hence it doesn't necessarily
+        # live on the context stack), but it is stored in the graph stack's
+        # encapsulating object.
+        outer_context = _default_graph_stack._GetGlobalDefaultGraph().as_default  # pylint: disable=protected-access
 
-  if outer_context is None:
-    raise AssertionError("All graphs are building functions, and no "
+    if outer_context is None:
+      # Sanity check; this shouldn't be triggered.
+      raise RuntimeError("All graphs are building functions, and no "
                          "eager context was previously active.")
 
-  try:
     with outer_context(), name_scope(scope), control_dependencies(
         None), tape.stop_recording():
       yield
-  finally:
-    pass
 
 
-def enable_eager_execution(config=None, device_policy=None):
-  """Enables, for the rest of the lifetime of this program, eager execution.
+@tf_export("enable_eager_execution")
+def enable_eager_execution(config=None, device_policy=None,
+                           execution_mode=None):
+  """Enables eager execution for the lifetime of this program.
 
-  If not called immediately on startup risks creating breakage and bugs.
+  Eager execution provides an imperative interface to TensorFlow. With eager
+  execution enabled, TensorFlow functions execute operations immediately (as
+  opposed to adding to a graph to be executed later in a @{tf.Session}) and
+  return concrete values (as opposed to symbolic references to a node in a
+  computational graph).
 
-  Example:
+  For example:
   ```python
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
 
   # After eager execution is enabled, operations are executed as they are
-  # defined and `Tensor`s hold concrete values, which can be accessed as
-  # `numpy.ndarray`s through the `numpy()` method.
+  # defined and Tensor objects hold concrete values, which can be accessed as
+  # numpy.ndarray`s through the numpy() method.
   assert tf.multiply(6, 7).numpy() == 42
   ```
 
+  Eager execution cannot be enabled after TensorFlow APIs have been used to
+  create or execute graphs. It is typically recommended to invoke this function
+  at program startup and not in a library (as most libraries should be usable
+  both with and without eager execution).
+
   Args:
-    config: (Optional.) A `ConfigProto` protocol buffer with configuration
-     options for the Context. Note that a lot of these options may be
-     currently unimplemented or irrelevant when eager execution is enabled.
-    device_policy: (Optional.) What policy to use when trying to run an
-     operation on a device with inputs which are not on that device.
+    config: (Optional.) A @{tf.ConfigProto} to use to configure the environment
+      in which operations are executed. Note that @{tf.ConfigProto} is also
+      used to configure graph execution (via @{tf.Session}) and many options
+      within `tf.ConfigProto` are not implemented (or are irrelevant) when
+     eager execution is enabled.
+    device_policy: (Optional.) Policy controlling how operations requiring
+     inputs on a specific device (e.g., a GPU 0) handle inputs on a different
+     device  (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
+     picked automatically. The value picked may change between TensorFlow
+     releases.
      Valid values:
-       tfe.DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is not
-         correct.
-       tfe.DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
-         right device but raises a warning.
-       tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
-         hide performance problems.
-       tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
-         raising errors on the other ones.
+
+      - tf.contrib.eager.DEVICE_PLACEMENT_EXPLICIT: raises an error if the
+        placement is not correct.
+
+      - tf.contrib.eager.DEVICE_PLACEMENT_WARN: copies the tensors which are not
+        on the right device but logs a warning.
+
+      - tf.contrib.eager.DEVICE_PLACEMENT_SILENT: silently copies the tensors.
+        Note that this may hide performance problems as there is no notification
+        provided when operations are blocked on the tensor being copied between
+        devices.
+
+      - tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies
+        int32 tensors, raising errors on the other ones.
+    execution_mode: (Optional.) Policy controlling how operations dispatched are
+      actually executed. When set to None, an appropriate value will be picked
+      automatically. The value picked may change between TensorFlow releases.
+      Valid values:
+
+        - tf.contrib.eager.SYNC: executes each operation synchronously.
+
+        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+          operations may return "non-ready" handles.
 
   Raises:
-    ValueError: If trying to create a context after using graph operations
-     or if trying to create a context with nontrivial options which differ
-     from those of the existing context.
+    ValueError: If eager execution is enabled after creating/executing a
+     TensorFlow graph, or if options provided conflict with a previous call
+     to this function.
   """
   if config is not None and not isinstance(config, config_pb2.ConfigProto):
     raise TypeError(
@@ -5090,8 +5446,12 @@ def enable_eager_execution(config=None, device_policy=None):
                            context.DEVICE_PLACEMENT_SILENT,
                            context.DEVICE_PLACEMENT_SILENT_FOR_INT32):
     raise ValueError(
-        "device_policy must be one of None, tfe.DEVICE_PLACEMENT_*"
+        "device_policy must be one of None, tf.contrib.eager.DEVICE_PLACEMENT_*"
     )
+  if execution_mode not in (None, context.SYNC, context.ASYNC):
+    raise ValueError(
+        "execution_mode must be one of None, tf.contrib.eager.SYNC, "
+        "tf.contrib.eager.ASYNC")
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
@@ -5099,30 +5459,33 @@ def enable_eager_execution(config=None, device_policy=None):
         _default_graph_stack._global_default_graph is not None)
     if graph_mode_has_been_used:
       raise ValueError(
-          "tfe.enable_eager_execution has to be called at program startup.")
+          "tf.enable_eager_execution must be called at program startup.")
   context._default_mode = context.EAGER_MODE
   if context._context is None:
-    context._context = context.Context(config=config,
-                                       device_policy=device_policy)
-    if context.context_stack.stack:
-      raise AssertionError("Invariant violated: The context stack must "
-                           "be empty when eager execution is enabled.")
-    # Log that eager execution has been enabled by pushing an entry onto the
-    # context stack; this entry won't ever be popped, as it's impossible to
-    # disable eager execution
-    context.context_stack.push(False, context.eager_mode)
-  elif ((config is not None and config is not context._context._config)
-        or (device_policy is not None
-            and device_policy is not context._context._device_policy)):
+    context._context = context.Context(
+        config=config,
+        device_policy=device_policy,
+        execution_mode=execution_mode)
+  elif ((config is not None and config is not context._context._config) or
+        (device_policy is not None and
+         device_policy is not context._context._device_policy) or
+        (execution_mode is not None and
+         execution_mode is not context._context._execution_mode)):
     raise ValueError("Trying to change the options of an active eager"
                      " execution. Context config: %s, specified config:"
-                     " %s. Context device policy: %s; specified device"
-                     " policy: %s." % (config, context._context._config,
-                                       device_policy,
-                                       context._context._device_policy))
+                     " %s. Context device policy: %s, specified device"
+                     " policy: %s. Context execution mode: %s, "
+                     " specified execution mode %s." %
+                     (context._context._config, config,
+                      context._context._device_policy, device_policy,
+                      context._context._execution_mode, execution_mode))
   else:
     raise ValueError(
-        "tfe.enable_eager_execution has to be called at program startup.")
+        "tf.enable_eager_execution must be called at program startup.")
+
+  # Monkey patch to get rid of an unnecessary conditional since the context is
+  # now initialized.
+  context.context = context.context_safe
 
 
 def eager_run(main=None, argv=None):
@@ -5206,6 +5569,8 @@ def get_name_scope():
   Returns:
     A string representing the current name scope.
   """
+  if context.executing_eagerly():
+    return context.context().scope_name.rstrip("/")
   return get_default_graph().get_name_scope()
 
 
@@ -5460,7 +5825,7 @@ def add_to_collection(name, value):
   """
   get_default_graph().add_to_collection(name, value)
 
-
+@tf_export("add_to_collections")
 def add_to_collections(names, value):
   """Wrapper for `Graph.add_to_collections()` using the default graph.
 
@@ -5582,7 +5947,7 @@ class name_scope(object):  # pylint: disable=invalid-name
     self._default_name = default_name
     self._values = values
     self._ctx = context.context()
-    self._in_eager_mode = self._ctx.in_eager_mode()
+    self._in_eager_mode = self._ctx.executing_eagerly()
 
   def __enter__(self):
     """Start the scope block.
@@ -5656,6 +6021,9 @@ def strip_name_scope(name, export_scope):
     is None.
   """
   if export_scope:
+    if export_scope[-1] == "/":
+      export_scope = export_scope[:-1]
+
     try:
       # Strips export_scope/, export_scope///,
       # ^export_scope/, loc:@export_scope/.
@@ -5681,6 +6049,9 @@ def prepend_name_scope(name, import_scope):
     is None.
   """
   if import_scope:
+    if import_scope[-1] == "/":
+      import_scope = import_scope[:-1]
+
     try:
       str_to_replace = r"([\^]|loc:@|^)(.*)"
       return re.sub(str_to_replace, r"\1" + import_scope + r"/\2",
@@ -5761,10 +6132,11 @@ def get_from_proto_function(collection_name):
 
 
 def _assert_collection_is_ok(collection_name):
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     if collection_name in GraphKeys._VARIABLE_COLLECTIONS:  # pylint: disable=protected-access
-      raise ValueError("When Eager Execution is enabled, variable "
-                       "collections are not supported.")
+      raise ValueError(
+          "variable collections are not supported when eager execution is enabled."
+      )
 
 
 def _operation_conversion_error(op, dtype=None, name=None, as_ref=False):
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index c6deafd89eb1bdc4892a65ba3ab8c7900915390f..c9c1a3d66be1051859b3dc4eef67803881efcd55 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import threading
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
@@ -472,6 +473,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x, x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, x, x, y, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testAddControlInputC(self):
     # The C API dedups redundant control edges, pure Python does not
@@ -486,6 +488,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testRemoveAllControlInputs(self):
     a = constant_op.constant(1)
@@ -762,6 +765,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(g.get_operation_by_name("myop"), op)
     self.assertEqual(g.get_tensor_by_name("myop:0"), op.outputs[0])
 
+  @test_util.enable_c_shapes
   def testShape(self):
     g = ops.Graph()
     with g.as_default():
@@ -1381,6 +1385,209 @@ class DeviceTest(test_util.TensorFlowTestCase):
     """, gd)
 
 
+@test_util.with_c_api
+class MultithreadedGraphStateTest(test_util.TensorFlowTestCase):
+
+  class TestThread(threading.Thread):
+
+    def __init__(self, graph, replica_id):
+      super(MultithreadedGraphStateTest.TestThread, self).__init__()
+      self._graph = graph
+      self._replica_id = replica_id
+      # This thread sets this event when it mutated the graph.  The caller can
+      # wait for that.
+      self.has_mutated_graph = threading.Event()
+      # This thread waits for when it should continue.  The caller can set this
+      # event.
+      self.should_continue = threading.Event()
+
+    def run(self):
+      # Mutate a graph's stack, then set `has_mutated_graph`, then wait for
+      # `should_continue`, then add an op to the graph affected by the graph's
+      # stack.
+      raise NotImplementedError("must be implemented in descendants")
+
+  def testDeviceFunctionStack(self):
+
+    class DeviceSettingThread(self.TestThread):
+
+      def run(self):
+        with g.device("/job:worker/replica:{}".format(self._replica_id)):
+          self.has_mutated_graph.set()
+          self.should_continue.wait()
+          self.should_continue.clear()
+          g.create_op(
+              "FloatOutput", [], [dtypes.float32],
+              name="FloatOutput_{}".format(self._replica_id))
+
+    g = ops.Graph()
+    # If `switch_to_thread` isn't called, then device placement of the ops
+    # below is not deterministic.
+    g.switch_to_thread_local()
+    threads = [DeviceSettingThread(g, i) for i in range(3)]
+    for t in threads:
+      t.start()
+      t.has_mutated_graph.wait()
+      t.has_mutated_graph.clear()
+    for t in threads:
+      t.should_continue.set()
+      t.join()
+
+    gd = g.as_graph_def()
+    self.assertProtoEqualsVersion("""
+      node { name: "FloatOutput_0" op: "FloatOutput"
+             device: "/job:worker/replica:0" }
+      node { name: "FloatOutput_1" op: "FloatOutput"
+             device: "/job:worker/replica:1" }
+      node { name: "FloatOutput_2" op: "FloatOutput"
+             device: "/job:worker/replica:2" }
+    """, gd)
+
+  def testColocateWith(self):
+
+    class ColocatingThread(self.TestThread):
+
+      def __init__(self, graph, replica_id, op_to_colocate_with):
+        super(ColocatingThread, self).__init__(graph, replica_id)
+        self._op_to_colocate_with = op_to_colocate_with
+
+      def run(self):
+        with g.colocate_with(self._op_to_colocate_with):
+          self.has_mutated_graph.set()
+          self.should_continue.wait()
+          self.should_continue.clear()
+          g.create_op(
+              "FloatOutput", [], [dtypes.float32],
+              name="FloatOutput_{}".format(self._replica_id))
+
+    g = ops.Graph()
+    ops_to_colocate_with = []
+    for i in range(3):
+      with g.device("/job:worker/replica:{}".format(i)):
+        ops_to_colocate_with.append(
+            g.create_op(
+                "FloatOutput", [], [dtypes.float32],
+                name="ColocateWithMe_{}".format(i)))
+
+    # If `switch_to_thread` isn't called, then `device` and `attr` values for
+    # the ops below are not deterministic.
+    g.switch_to_thread_local()
+    threads = [
+        ColocatingThread(g, i, ops_to_colocate_with[i]) for i in range(3)
+    ]
+    for t in threads:
+      t.start()
+      t.has_mutated_graph.wait()
+      t.has_mutated_graph.clear()
+    for t in threads:
+      t.should_continue.set()
+      t.join()
+
+    gd = g.as_graph_def()
+    self.assertProtoEqualsVersion("""
+      node { name: "ColocateWithMe_0" op: "FloatOutput"
+             device: "/job:worker/replica:0" }
+      node { name: "ColocateWithMe_1" op: "FloatOutput"
+             device: "/job:worker/replica:1" }
+      node { name: "ColocateWithMe_2" op: "FloatOutput"
+             device: "/job:worker/replica:2" }
+      node { name: "FloatOutput_0" op: "FloatOutput"
+             device: "/job:worker/replica:0"
+             attr { key: "_class"
+               value { list {
+                 s: "loc:@ColocateWithMe_0"}}}}
+      node { name: "FloatOutput_1" op: "FloatOutput"
+             device: "/job:worker/replica:1"
+             attr { key: "_class"
+               value { list {
+                 s: "loc:@ColocateWithMe_1"}}}}
+      node { name: "FloatOutput_2" op: "FloatOutput"
+             device: "/job:worker/replica:2"
+             attr { key: "_class"
+               value { list {
+                 s: "loc:@ColocateWithMe_2"}}}}
+    """, gd)
+
+  def testControlDependencies(self):
+
+    class DependingThread(self.TestThread):
+
+      def __init__(self, graph, replica_id, dependency_op):
+        super(DependingThread, self).__init__(graph, replica_id)
+        self._dependency_op = dependency_op
+
+      def run(self):
+        with g.control_dependencies([self._dependency_op]):
+          self.has_mutated_graph.set()
+          self.should_continue.wait()
+          self.should_continue.clear()
+          g.create_op(
+              "FloatOutput", [], [dtypes.float32],
+              name="FloatOutput_{}".format(self._replica_id))
+
+    g = ops.Graph()
+    dependency_ops = []
+    for i in range(3):
+      dependency_ops.append(
+          g.create_op(
+              "FloatOutput", [], [dtypes.float32],
+              name="ColocateWithMe_{}".format(i)))
+
+    # If `switch_to_thread` isn't called, then `input` values for the ops below
+    # are not deterministic.
+    g.switch_to_thread_local()
+    threads = [DependingThread(g, i, dependency_ops[i]) for i in range(3)]
+    for t in threads:
+      t.start()
+      t.has_mutated_graph.wait()
+      t.has_mutated_graph.clear()
+    for t in threads:
+      t.should_continue.set()
+      t.join()
+
+    gd = g.as_graph_def()
+    self.assertProtoEqualsVersion("""
+      node { name: "ColocateWithMe_0" op: "FloatOutput" }
+      node { name: "ColocateWithMe_1" op: "FloatOutput" }
+      node { name: "ColocateWithMe_2" op: "FloatOutput" }
+      node { name: "FloatOutput_0" op: "FloatOutput"
+             input: "^ColocateWithMe_0" }
+      node { name: "FloatOutput_1" op: "FloatOutput"
+             input: "^ColocateWithMe_1" }
+      node { name: "FloatOutput_2" op: "FloatOutput"
+             input: "^ColocateWithMe_2" }
+    """, gd)
+
+  def testNameStack(self):
+
+    class NameSettingThread(self.TestThread):
+
+      def run(self):
+        with g.name_scope("foo"):
+          op1 = g.create_op("FloatOutput", [], [dtypes.float32])
+          self.has_mutated_graph.set()
+          self.should_continue.wait()
+          self.should_continue.clear()
+          op2 = g.create_op("FloatOutput", [], [dtypes.float32])
+          self.result = (op1, op2)
+
+    g = ops.Graph()
+    threads = [NameSettingThread(g, i) for i in range(3)]
+    for t in threads:
+      t.start()
+      t.has_mutated_graph.wait()
+      t.has_mutated_graph.clear()
+
+    for t in threads:
+      t.should_continue.set()
+      t.join()
+
+    suffixes = ["", "_1", "_2"]
+    for t, s in zip(threads, suffixes):
+      self.assertEquals("foo" + s + "/FloatOutput", t.result[0].name)
+      self.assertEquals("foo" + s + "/FloatOutput_1", t.result[1].name)
+
+
 @test_util.with_c_api
 class ObjectWithName(object):
 
@@ -1588,7 +1795,13 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
       return constant_op.constant(2.0)
     future.calls = 0
 
-    if context.in_graph_mode():
+    if context.executing_eagerly():
+      a = constant_op.constant(1.0)
+      b = future
+      with ops.control_dependencies([a, b]):
+        c = constant_op.constant(3.0)
+      self.assertEqual(future.calls, 1)
+    else:
       g = ops.Graph()
       with g.as_default():
         a = constant_op.constant(1.0)
@@ -1597,12 +1810,6 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
           c = constant_op.constant(3.0)
       self.assertEqual(c.op.control_inputs, [a.op, b.op])
       self.assertEqual(future.calls, 1)
-    else:
-      a = constant_op.constant(1.0)
-      b = future()
-      with ops.control_dependencies([a, b]):
-        c = constant_op.constant(3.0)
-      self.assertEqual(future.calls, 1)
 
   def testBasicWithConversion(self):
     g = ops.Graph()
@@ -1975,19 +2182,11 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           with ops.init_scope():
             # Because g is building a function, init_scope should
             # escape out to the eager context.
-            self.assertTrue(context.in_eager_mode())
+            self.assertTrue(context.executing_eagerly())
           # g should be reinstated as the default graph, and the
           # graph context should be re-entered.
           self.assertIs(g, ops.get_default_graph())
-          self.assertTrue(context.in_graph_mode())
-
-  def testAllGraphsBuildingFunctionsRaisesError(self):
-    g = ops.Graph()
-    g._building_function = True  # pylint: disable=protected-access
-    with g.as_default():
-      with self.assertRaises(AssertionError):
-        with ops.init_scope():
-          pass
+          self.assertFalse(context.executing_eagerly())
 
   def testStaysInEagerWhenOnlyEagerContextActive(self):
     with context.eager_mode():
@@ -2066,6 +2265,29 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual(4, int(compiled_outer(inner=compiled_inner)))
       self.assertEqual(7, int(compiled_outer(inner=compiled_inner)))
 
+  def testFallsBackToGlobalGraphWhenAllGraphsAreBuildingFunctions(self):
+    with context.graph_mode():
+      ops.reset_default_graph()
+      # This doesn't push anything onto the graph stack, but it does
+      # set the stack's global graph.
+      global_graph = ops.get_default_graph()
+      fn_graph = ops.Graph()
+
+      # pylint: disable=protected-access
+      fn_graph._building_function = True
+      self.assertEqual(len(ops._default_graph_stack.stack), 0)
+      with fn_graph.as_default():
+        self.assertEqual(len(ops._default_graph_stack.stack), 1)
+        with ops.init_scope():
+          self.assertGreater(len(ops._default_graph_stack.stack), 1)
+          dummy = constant_op.constant(1.0)
+        self.assertEqual(len(ops._default_graph_stack.stack), 1)
+      # Note that the global graph is _not_ on the graph stack.
+      self.assertEqual(len(ops._default_graph_stack.stack), 0)
+      # Ensure that `dummy` was added to the global graph.
+      self.assertEqual(global_graph, dummy.graph)
+      # pylint: enable=protected-access
+
   def testInstallsDefaultGraphWhenGraphStackIsEmptyInGraphMode(self):
     with context.graph_mode():
       # pylint: disable=protected-access
@@ -2083,16 +2305,24 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           self.assertEqual(ops.get_name_scope(), "inner")
       self.assertEqual(ops.get_name_scope(), "")
 
+  def testEagerGraphContextsExecuteEagerly(self):
+    with context.eager_mode():
+      with ops.Graph().as_default():
+        with context.graph_mode():
+          with ops.init_scope():
+            self.assertTrue(context.executing_eagerly())
+
   def testPreservesNameScopeInEagerExecution(self):
     with context.eager_mode():
       def foo():
         with ops.name_scope("inner"), ops.init_scope():
-          if context.in_graph_mode():
-            self.assertEqual(ops.get_name_scope(), "inner")
-          else:
+          if context.executing_eagerly():
             # A trailing slash is always appended when eager execution is
             # enabled.
             self.assertEqual(context.context().scope_name, "inner/")
+          else:
+            self.assertEqual(ops.get_name_scope(), "inner")
+
       foo()
       self.assertEqual(ops.get_name_scope(), "")
       foo_compiled = eager_function.defun(foo)
@@ -2702,7 +2932,7 @@ class OutputTypesTest(test_util.TensorFlowTestCase):
     with g.as_default():
       x = constant_op.constant([1, 1, 2, 4, 4, 4, 7, 8, 8],
                                dtype=dtypes.double)
-      y, _ = gen_array_ops._unique(x)
+      y, _ = gen_array_ops.unique(x)
       self.assertEqual([types_pb2.DT_DOUBLE, types_pb2.DT_INT32],
                        y.op._output_types)  # pylint: disable=protected-access
 
@@ -2727,6 +2957,9 @@ class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "device_policy must be one of"):
       c = config_pb2.ConfigProto()
       ops.enable_eager_execution(c, c)
+    with self.assertRaisesRegexp(ValueError, "execution_mode must be one of"):
+      c = config_pb2.ConfigProto()
+      ops.enable_eager_execution(c, execution_mode=c)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index c95149d177990e364c3d6b9daeae5dc535cf0070..ad6c36b4b1773ecbbb5be56f7eaf2d78af3d010c 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -75,6 +75,33 @@ bool IsPythonReserved(const string& s) {
   return kPythonReserved->count(s) > 0;
 }
 
+bool IsOpWithUnderscorePrefix(const string& s) {
+  static const std::set<string>* const kUnderscoreOps = new std::set<string>(
+      {// Lowercase built-in functions and types in Python, from:
+       // [x for x in dir(__builtins__) if x[0].islower()] except "round".
+       // These need to be excluded so they don't conflict with actual built-in
+       // functions since we use '*' imports.
+       "abs", "all", "any", "apply", "bin", "bool", "buffer", "bytearray",
+       "bytes", "callable", "chr", "classmethod", "cmp", "coerce", "compile",
+       "complex", "copyright", "credits", "delattr", "dict", "dir", "divmod",
+       "enumerate", "eval", "execfile", "exit", "file", "filter", "float",
+       "format", "frozenset", "getattr", "globals", "hasattr", "hash", "help",
+       "hex", "id", "input", "int", "intern", "isinstance", "issubclass",
+       "iter", "len", "license", "list", "locals", "long", "map", "max",
+       "memoryview", "min", "next", "object", "oct", "open", "ord", "pow",
+       "print", "property", "quit", "range", "raw_input", "reduce", "reload",
+       "repr", "reversed", "set", "setattr", "slice", "sorted", "staticmethod",
+       "str", "sum", "super", "tuple", "type", "unichr", "unicode", "vars",
+       "xrange", "zip",
+       // These have the same name as ops defined in Python and might be used
+       // incorrectly depending on order of '*' imports.
+       // TODO(annarev): reduce usage of '*' imports and remove these from the
+       // list.
+       "fused_batch_norm", "histogram_fixed_width", "stack",
+       "batch_norm_with_global_normalization", "clip_by_value"});
+  return kUnderscoreOps->count(s) > 0;
+}
+
 string AvoidPythonReserved(const string& s) {
   if (IsPythonReserved(s)) return strings::StrCat(s, "_");
   return s;
@@ -421,7 +448,7 @@ string AttrValueToPython(const string& type, const AttrValue& value,
     return TensorToPython(value.tensor());
   } else if (type == "func") {
     return StringToPython(value.func().name());
-  } else if (StringPiece(type).starts_with("list(")) {
+  } else if (str_util::StartsWith(type, "list(")) {
     return strings::StrCat("[", AttrListToPython(value, dtype_module), "]");
   } else {
     return "?";
@@ -816,6 +843,7 @@ from tensorflow.python.util.tf_export import tf_export
     // An op is hidden if either its ApiDef visibility is HIDDEN
     // or it is in the hidden_ops list.
     bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
+    bool hidden_by_api_def = is_hidden;
     if (!is_hidden) {
       for (const string& hidden : hidden_ops) {
         if (op_def.name() == hidden) {
@@ -828,13 +856,22 @@ from tensorflow.python.util.tf_export import tf_export
     string function_name;
     python_op_gen_internal::GenerateLowerCaseOpName(op_def.name(),
                                                     &function_name);
-    if (is_hidden) function_name = strings::StrCat("_", function_name);
-
-    // When users create custom python wrappers, they may link in the
-    // default op registry by accident, and because they can't
-    // enumerate all 'hidden' symbols, this guard is to prevent
-    // instantiating a python reserved word in their wrapper.
-    if (python_op_gen_internal::IsPythonReserved(function_name)) {
+    bool is_reserved = python_op_gen_internal::IsPythonReserved(function_name);
+
+    // Prefix an op with underscore if the op is listed in hidden_ops or
+    // name is reserved or it is of the exceptions in IsOpWithUnderscorePrefix.
+    // Do not add underscores to ops set to HIDDEN in ApiDef otherwise.
+    // TODO(annarev): don't prefix with underscores even if op is in hidden_ops.
+    if (is_hidden) {
+      if (!hidden_by_api_def || is_reserved ||
+          python_op_gen_internal::IsOpWithUnderscorePrefix(function_name)) {
+        function_name = strings::StrCat("_", function_name);
+      }
+    } else if (is_reserved) {
+      // When users create custom python wrappers, they may link in the
+      // default op registry by accident, and because they can't
+      // enumerate all 'hidden' symbols, this guard is to prevent
+      // instantiating a python reserved word in their wrapper.
       continue;
     }
 
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index 26ec4e8e66b5d4e3be433c9e59f9b6034109d153..efcce2f20941790571199d358dfcfcc3c0283d08 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -16,10 +16,10 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-#include "tensorflow/python/framework/python_op_gen.h"
+#include "tensorflow/python/eager/python_eager_op_gen.h"
 %}
 
-// Input typemap for GetPythonWrappers.
+// Input typemap for GetEagerPythonWrappers.
 // Accepts a python object of 'bytes' type, and converts it to
 // a const char* pointer and size_t length. The default typemap
 // going from python bytes to const char* tries to decode the
@@ -37,5 +37,5 @@ limitations under the License.
 
 
 %ignoreall;
-%unignore tensorflow::GetPythonWrappers;
-%include "tensorflow/python/framework/python_op_gen.h"
+%unignore tensorflow::GetEagerPythonWrappers;
+%include "tensorflow/python/eager/python_eager_op_gen.h"
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index 4319e5a7820b33283df8153fdc76e0e567813a17..e0cfb05f4bdf8afd09957c62a9ba3af1fd0882a6 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -29,6 +29,9 @@ namespace python_op_gen_internal {
 // Returns true if s is a Python keyword or built-in.
 bool IsPythonReserved(const string& s);
 
+// Whether the op should be prefixed with underscore.
+bool IsOpWithUnderscorePrefix(const string& s);
+
 // Add a _ to the end of s if necessary to avoid a Python keyword or built-in.
 string AvoidPythonReserved(const string& s);
 
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index bc5ca195da50499c6fbab822a9a093be3f0277e0..ca6ed42beec4a3d5ff70d5a605ab006265d1cce9 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -95,7 +96,8 @@ string InferSourceFileName(const char* argv_zero) {
   // operators defined in <op type>_ops.cc
   const char* kExecPrefix = "gen_";
   const char* kExecSuffix = "_py_wrappers_cc";
-  if (command_str.Consume(kExecPrefix) && command_str.ends_with(kExecSuffix)) {
+  if (str_util::ConsumePrefix(&command_str, kExecPrefix) &&
+      str_util::EndsWith(command_str, kExecSuffix)) {
     command_str.remove_suffix(strlen(kExecSuffix));
     return strings::StrCat(command_str, ".cc");
   } else {
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 1e74a790a3fb0c72b7c0fb1127ffac95f386d85e..b724432e00b0d11de86a0fff9ff31758ad36479f 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -52,20 +52,20 @@ def get_seed(op_seed):
     A tuple of two integers that should be used for the local seed of this
     operation.
   """
-  is_graph_mode = context.in_graph_mode()
+  eager = context.executing_eagerly()
 
-  if is_graph_mode:
-    global_seed = ops.get_default_graph().seed
-  else:
+  if eager:
     global_seed = context.global_seed()
+  else:
+    global_seed = ops.get_default_graph().seed
 
   if global_seed is not None:
     if op_seed is None:
       # pylint: disable=protected-access
-      if is_graph_mode:
-        op_seed = ops.get_default_graph()._last_id
-      else:
+      if eager:
         op_seed = context.internal_operation_seed()
+      else:
+        op_seed = ops.get_default_graph()._last_id
 
     seeds = _truncate_seed(global_seed), _truncate_seed(op_seed)
   else:
@@ -176,7 +176,7 @@ def set_random_seed(seed):
   Args:
     seed: integer.
   """
-  if context.in_graph_mode():
-    ops.get_default_graph().seed = seed
-  else:
+  if context.executing_eagerly():
     context.set_global_seed(seed)
+  else:
+    ops.get_default_graph().seed = seed
diff --git a/tensorflow/python/framework/random_seed_test.py b/tensorflow/python/framework/random_seed_test.py
index b4c98ab8b289c850c6171425167bb17606a4162d..194492268631abfa911bd45f13a302c09a2c8bda 100644
--- a/tensorflow/python/framework/random_seed_test.py
+++ b/tensorflow/python/framework/random_seed_test.py
@@ -40,13 +40,13 @@ class RandomSeedTest(test.TestCase):
         ((2**31 - 1, 0), (0, 2**31 - 1)),  # Don't wrap to (0, 0) either
         ((0, 2**31 - 1), (0, 2**31 - 1)),  # Wrapping for the other argument
     ]
-    if context.in_graph_mode():
-      # 0 will be the default_graph._lastid.
-      test_cases.append(((1, None), (1, 0)))
-    else:
+    if context.executing_eagerly():
       # operation seed is random number generated based on global seed.
       # it's not tested due to possibility of platform or version difference.
       pass
+    else:
+      # 0 will be the default_graph._lastid.
+      test_cases.append(((1, None), (1, 0)))
     for tc in test_cases:
       tinput, toutput = tc[0], tc[1]
       random_seed.set_random_seed(tinput[0])
diff --git a/tensorflow/python/framework/smart_cond.py b/tensorflow/python/framework/smart_cond.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a834392b47b4cdcc82381153852584052a5aad
--- /dev/null
+++ b/tensorflow/python/framework/smart_cond.py
@@ -0,0 +1,121 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""smart_cond and related utilties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+
+
+def smart_cond(pred, true_fn=None, false_fn=None, name=None):
+  """Return either `true_fn()` if predicate `pred` is true else `false_fn()`.
+
+  If `pred` is a bool or has a constant value, we return either `true_fn()`
+  or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both.
+
+  Arguments:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix when using `tf.cond`.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`.
+
+  Raises:
+    TypeError: If `true_fn` or `false_fn` is not callable.
+  """
+  if not callable(true_fn):
+    raise TypeError("`true_fn` must be callable.")
+  if not callable(false_fn):
+    raise TypeError("`false_fn` must be callable.")
+
+  pred_value = smart_constant_value(pred)
+  if pred_value is not None:
+    if pred_value:
+      return true_fn()
+    else:
+      return false_fn()
+  else:
+    return control_flow_ops.cond(pred, true_fn=true_fn, false_fn=false_fn,
+                                 name=name)
+
+
+def smart_constant_value(pred):
+  """Return the bool value for `pred`, or None if `pred` had a dynamic value.
+
+  Arguments:
+    pred: A scalar, either a Python bool or tensor.
+
+  Returns:
+    True or False if `pred` has a constant boolean value, None otherwise.
+
+  Raises:
+    TypeError: If `pred` is not a Tensor or bool.
+  """
+  if pred in {0, 1}:  # Accept 1/0 as valid boolean values
+    pred_value = bool(pred)
+  elif isinstance(pred, bool):
+    pred_value = pred
+  elif isinstance(pred, ops.Tensor):
+    pred_value = tensor_util.constant_value(pred)
+    # TODO(skyewm): consider folding this into tensor_util.constant_value when
+    # _USE_C_API is removed (there may be performance and correctness bugs, so I
+    # wanted to limit the change hidden behind _USE_C_API).
+    # pylint: disable=protected-access
+    if pred_value is None and ops._USE_C_API:
+      pred_value = c_api.TF_TryEvaluateConstant_wrapper(pred.graph._c_graph,
+                                                        pred._as_tf_output())
+    # pylint: enable=protected-access
+
+  else:
+    raise TypeError("`pred` must be a Tensor, or a Python bool, or 1 or 0. "
+                    "Found instead: %s" % pred)
+  return pred_value
+
+
+def smart_case(pred_fn_pairs, default=None, exclusive=False, name="smart_case"):
+  """Like tf.case, except attempts to statically evaluate predicates.
+
+  If any predicate in `pred_fn_pairs` is a bool or has a constant value, the
+  associated callable will be called or omitted depending on its value.
+  Otherwise this functions like tf.case.
+
+  Args:
+    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor and a
+                   callable which returns a list of tensors.
+    default: Optional callable that returns a list of tensors.
+    exclusive: True iff at most one predicate is allowed to evaluate to `True`.
+    name: A name for this operation (optional).
+
+  Returns:
+    The tensors returned by the first pair whose predicate evaluated to True, or
+    those returned by `default` if none does.
+
+  Raises:
+    TypeError: If `pred_fn_pairs` is not a list/dictionary.
+    TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
+    TypeError: If `fns[i]` is not callable for any i, or `default` is not
+               callable.
+  """
+  return control_flow_ops._case_helper(  # pylint: disable=protected-access
+      smart_cond, pred_fn_pairs, default, exclusive, name,
+      allow_python_preds=True)
diff --git a/tensorflow/python/framework/smart_cond_test.py b/tensorflow/python/framework/smart_cond_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1170a41c99995ae875e58a2d5491e05bc1e40df6
--- /dev/null
+++ b/tensorflow/python/framework/smart_cond_test.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+
+
+def raise_exception():
+  raise RuntimeError("did not expect to be called")
+
+
+@test_util.with_c_api
+class SmartCondTest(test_util.TensorFlowTestCase):
+
+  def testTrue(self):
+    with ops.Graph().as_default():
+      with session.Session():
+        x = constant_op.constant(2)
+        y = constant_op.constant(5)
+        z = smart_cond.smart_cond(True, lambda: math_ops.multiply(x, 16),
+                                  lambda: math_ops.multiply(y, 5))
+        self.assertEqual(z.eval(), 32)
+
+  def testFalse(self):
+    with ops.Graph().as_default():
+      with session.Session():
+        x = constant_op.constant(4)
+        y = constant_op.constant(3)
+        z = smart_cond.smart_cond(False, lambda: math_ops.multiply(x, 16),
+                                  lambda: math_ops.multiply(y, 3))
+        self.assertEqual(z.eval(), 9)
+
+  def testUnknown(self):
+    with ops.Graph().as_default():
+      with session.Session():
+        x = array_ops.placeholder(dtype=dtypes.int32)
+        y = smart_cond.smart_cond(x > 0, lambda: constant_op.constant(1),
+                                  lambda: constant_op.constant(2))
+        self.assertEqual(y.eval(feed_dict={x: 1}), 1)
+        self.assertEqual(y.eval(feed_dict={x: -1}), 2)
+
+  def testEval(self):
+    # Constant expression evaluation only works with the C API enabled.
+    if not ops._USE_C_API: return
+
+    with ops.Graph().as_default():
+      with session.Session():
+        x = constant_op.constant(1)
+        y = constant_op.constant(2)
+        # x * y > 0 can be evaluated at graph construction time, so the false
+        # branch shouldn't be evaluated at all.
+        z = smart_cond.smart_cond(x * y > 0, lambda: constant_op.constant(1),
+                                  raise_exception)
+        self.assertEqual(z.eval(feed_dict={x: 1}), 1)
+
+  def testPlaceholderWithDefault(self):
+    with ops.Graph().as_default():
+      with session.Session():
+        x = array_ops.placeholder_with_default(1, shape=())
+        y = smart_cond.smart_cond(x > 0, lambda: constant_op.constant(1),
+                                  lambda: constant_op.constant(2))
+        self.assertEqual(y.eval(), 1)
+        self.assertEqual(y.eval(feed_dict={x: -1}), 2)
+
+  def testMissingArg1(self):
+    with ops.Graph().as_default():
+      with session.Session():
+        x = constant_op.constant(1)
+        with self.assertRaises(TypeError):
+          smart_cond.smart_cond(True, false_fn=lambda: x)
+
+  def testMissingArg2(self):
+    with ops.Graph().as_default():
+      with session.Session():
+        x = constant_op.constant(1)
+        with self.assertRaises(TypeError):
+          smart_cond.smart_cond(True, lambda: x)
+
+
+@test_util.with_c_api
+class SmartCaseTest(test_util.TensorFlowTestCase):
+
+  def testTrue(self):
+    x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    conditions = [(True, lambda: constant_op.constant(1)),
+                  (x == 0, raise_exception)]
+    y = smart_cond.smart_case(conditions, default=raise_exception,
+                              exclusive=False)
+    z = smart_cond.smart_case(conditions, default=raise_exception,
+                              exclusive=True)
+    with session.Session() as sess:
+      # No feed_dict necessary
+      self.assertEqual(sess.run(y), 1)
+      self.assertEqual(sess.run(z), 1)
+
+  def testFalse(self):
+    conditions = [(False, raise_exception)]
+    y = smart_cond.smart_case(conditions,
+                              default=lambda: constant_op.constant(1),
+                              exclusive=False)
+    z = smart_cond.smart_case(conditions,
+                              default=lambda: constant_op.constant(1),
+                              exclusive=True)
+    with session.Session() as sess:
+      self.assertEqual(sess.run(y), 1)
+      self.assertEqual(sess.run(z), 1)
+
+  def testMix(self):
+    # Constant expression evaluation only works with the C API enabled.
+    if not ops._USE_C_API: return
+
+    x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    y = constant_op.constant(10)
+    conditions = [(x > 1, lambda: constant_op.constant(1)),
+                  (y < 1, raise_exception),
+                  (False, raise_exception),
+                  (True, lambda: constant_op.constant(3))]
+    z = smart_cond.smart_case(conditions, default=raise_exception)
+    with session.Session() as sess:
+      self.assertEqual(sess.run(z, feed_dict={x: 2}), 1)
+      self.assertEqual(sess.run(z, feed_dict={x: 0}), 3)
+
+
+@test_util.with_c_api
+class SmartConstantValueTest(test_util.TensorFlowTestCase):
+
+  # TODO(skyewm): this is essentially a regression test for
+  # TF_TryEvaluateConstant, and is not really a valid smart_constant_value test
+  # (smart_constant_value is only supposed to return bools). Move the
+  # TF_TryEvaluateConstant call to tensor_util.constant_value and make this a
+  # constant_value test instead.
+  def testCond(self):
+    with ops.Graph().as_default():
+      pred = array_ops.placeholder_with_default(True, shape=())
+      x = control_flow_ops.cond(pred,
+                                lambda: constant_op.constant(1),
+                                lambda: constant_op.constant(2))
+      self.assertIsNone(smart_cond.smart_constant_value(x))
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 222071cb9e87aa0fdd9788d1c72df4c66ea61547..0dd29460ed93aadf61ef1f1b2dbf1d7802ca4877 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -30,6 +31,8 @@ class Dimension(object):
     """Creates a new Dimension with the given value."""
     if value is None:
       self._value = None
+    elif isinstance(value, dtypes.DType):
+      raise TypeError("Cannot convert %s to Dimension" % value)
     else:
       self._value = int(value)
       if (not isinstance(value, compat.bytes_or_text_types) and
@@ -156,7 +159,7 @@ class Dimension(object):
     ```
 
     Args:
-      other: Another Dimension.
+      other: Another Dimension, or a value accepted by `as_dimension`.
 
     Returns:
       A Dimension whose value is the sum of `self` and `other`.
@@ -167,6 +170,17 @@ class Dimension(object):
     else:
       return Dimension(self._value + other.value)
 
+  def __radd__(self, other):
+    """Returns the sum of `other` and `self`.
+
+    Args:
+      other: Another Dimension, or a value accepted by `as_dimension`.
+
+    Returns:
+      A Dimension whose value is the sum of `self` and `other`.
+    """
+    return self + other
+
   def __sub__(self, other):
     """Returns the subtraction of `other` from `self`.
 
@@ -180,10 +194,10 @@ class Dimension(object):
     ```
 
     Args:
-      other: Another Dimension.
+      other: Another Dimension, or a value accepted by `as_dimension`.
 
     Returns:
-      A Dimension whose value is the subtraction of sum of `other` from `self`.
+      A Dimension whose value is the subtraction of `other` from `self`.
     """
     other = as_dimension(other)
     if self._value is None or other.value is None:
@@ -191,6 +205,21 @@ class Dimension(object):
     else:
       return Dimension(self._value - other.value)
 
+  def __rsub__(self, other):
+    """Returns the subtraction of `self` from `other`.
+
+    Args:
+      other: Another Dimension, or a value accepted by `as_dimension`.
+
+    Returns:
+      A Dimension whose value is the subtraction of `self` from `other`.
+    """
+    other = as_dimension(other)
+    if self._value is None or other.value is None:
+      return Dimension(None)
+    else:
+      return Dimension(other.value - self._value)
+
   def __mul__(self, other):
     """Returns the product of `self` and `other`.
 
@@ -204,17 +233,32 @@ class Dimension(object):
     ```
 
     Args:
-      other: Another Dimension.
+      other: Another Dimension, or a value accepted by `as_dimension`.
 
     Returns:
       A Dimension whose value is the product of `self` and `other`.
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
+
     if self._value is None or other.value is None:
       return Dimension(None)
     else:
       return Dimension(self._value * other.value)
 
+  def __rmul__(self, other):
+    """Returns the product of `self` and `other`.
+
+    Args:
+      other: Another Dimension, or a value accepted by `as_dimension`.
+
+    Returns:
+      A Dimension whose value is the product of `self` and `other`.
+    """
+    return self * other
+
   def __floordiv__(self, other):
     """Returns the quotient of `self` and `other` rounded down.
 
@@ -228,17 +272,35 @@ class Dimension(object):
     ```
 
     Args:
-      other: Another `Dimension`.
+      other: Another Dimension, or a value accepted by `as_dimension`.
 
     Returns:
       A `Dimension` whose value is the integer quotient of `self` and `other`.
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
     if self._value is None or other.value is None:
       return Dimension(None)
     else:
       return Dimension(self._value // other.value)
 
+  def __rfloordiv__(self, other):
+    """Returns the quotient of `other` and `self` rounded down.
+
+    Args:
+      other: Another Dimension, or a value accepted by `as_dimension`.
+
+    Returns:
+      A `Dimension` whose value is the integer quotient of `self` and `other`.
+    """
+    other = as_dimension(other)
+    if self._value is None or other.value is None:
+      return Dimension(None)
+    else:
+      return Dimension(other.value // self._value)
+
   def __div__(self, other):
     """DEPRECATED: Use `__floordiv__` via `x // y` instead.
 
@@ -256,7 +318,7 @@ class Dimension(object):
     return self // other
 
   def __mod__(self, other):
-    """Returns `self` modulo `other.
+    """Returns `self` modulo `other`.
 
     Dimension moduli are computed as follows:
 
@@ -268,17 +330,35 @@ class Dimension(object):
     ```
 
     Args:
-      other: Another Dimension.
+      other: Another Dimension, or a value accepted by `as_dimension`.
 
     Returns:
       A Dimension whose value is `self` modulo `other`.
     """
-    other = as_dimension(other)
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
     if self._value is None or other.value is None:
       return Dimension(None)
     else:
       return Dimension(self._value % other.value)
 
+  def __rmod__(self, other):
+    """Returns `other` modulo `self`.
+
+    Args:
+      other: Another Dimension, or a value accepted by `as_dimension`.
+
+    Returns:
+      A Dimension whose value is `other` modulo `self`.
+    """
+    try:
+      other = as_dimension(other)
+    except (TypeError, ValueError):
+      return NotImplemented
+    return other % self
+
   def __lt__(self, other):
     """Returns True if `self` is known to be less than `other`.
 
@@ -379,6 +459,9 @@ class Dimension(object):
     else:
       return self._value >= other.value
 
+  def __reduce__(self):
+    return Dimension, (self._value,)
+
 
 def as_dimension(value):
   """Converts the given value to a Dimension.
@@ -456,6 +539,7 @@ class TensorShape(object):
       else:
         # Got a list of dimensions
         self._dims = [as_dimension(d) for d in dims_iter]
+    self._ndims = None
 
   def __repr__(self):
     return "TensorShape(%r)" % self._dims
@@ -473,19 +557,26 @@ class TensorShape(object):
     """Returns a list of Dimensions, or None if the shape is unspecified."""
     return self._dims
 
+  @dims.setter
+  def dims(self, dims):
+    self._dims = dims
+    self._ndims = None
+
   @property
   def ndims(self):
     """Returns the rank of this shape, or None if it is unspecified."""
     if self._dims is None:
       return None
     else:
-      return len(self._dims)
+      if self._ndims is None:
+        self._ndims = len(self._dims)
+      return self._ndims
 
   def __len__(self):
     """Returns the rank of this shape, or raises ValueError if unspecified."""
     if self._dims is None:
       raise ValueError("Cannot take the length of Shape with unknown rank.")
-    return len(self._dims)
+    return self.ndims
 
   def __bool__(self):
     """Returns True if this shape contains non-zero information."""
@@ -843,6 +934,9 @@ class TensorShape(object):
       return True
     return self._dims != other.dims
 
+  def __reduce__(self):
+    return TensorShape, (self._dims,)
+
 
 def as_shape(shape):
   """Converts the given object to a TensorShape."""
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index fffd86c7a6241b8be92ad33852da244ab9b5284d..9232d99a1f932b9e48cd7ddc125e353390064873 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -34,12 +35,20 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(tensor_shape.Dimension(15),
                      dim + tensor_shape.Dimension(3))
     self.assertEqual(tensor_shape.Dimension(15), dim + 3)
+    self.assertEqual(tensor_shape.Dimension(15), 3 + dim)
+    self.assertEqual(tensor_shape.Dimension(9), dim - 3)
+    self.assertEqual(tensor_shape.Dimension(1), 13 - dim)
     self.assertEqual(tensor_shape.Dimension(24),
                      dim * tensor_shape.Dimension(2))
     self.assertEqual(tensor_shape.Dimension(24), dim * 2)
+    self.assertEqual(tensor_shape.Dimension(24), 2 * dim)
+    self.assertEqual([4] * 12, [4] * dim)
+    self.assertEqual(12 * [4], dim * [4])
+    self.assertEqual(tensor_shape.Dimension(24), 2 * dim)
     self.assertEqual(
         tensor_shape.Dimension(6), dim // tensor_shape.Dimension(2))
     self.assertEqual(tensor_shape.Dimension(6), dim // 2)
+    self.assertEqual(tensor_shape.Dimension(0), 2 // dim)
     self.assertEqual(tensor_shape.Dimension(12),
                      dim.merge_with(tensor_shape.Dimension(12)))
     self.assertEqual(tensor_shape.Dimension(12), dim.merge_with(12))
@@ -176,6 +185,26 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(str(tensor_shape.Dimension(7)), "7")
     self.assertEqual(str(tensor_shape.Dimension(None)), "?")
 
+  def testUnsupportedType(self):
+    with self.assertRaises(TypeError):
+      tensor_shape.Dimension(dtypes.string)
+
+  def testMod(self):
+    four = tensor_shape.Dimension(4)
+    nine = tensor_shape.Dimension(9)
+    self.assertEqual(nine % four, 1)
+    # test both __mod__ and __rmod__.
+    self.assertEqual(nine % 4, 1)
+    self.assertEqual(4 % nine, 4)
+
+  def testReduce(self):
+    dim = tensor_shape.Dimension(5)
+    ctor, args = dim.__reduce__()
+    self.assertEquals(ctor, tensor_shape.Dimension)
+    self.assertEquals(args, (5,))
+    reconstructed = ctor(*args)
+    self.assertEquals(reconstructed, dim)
+
 
 class ShapeTest(test_util.TensorFlowTestCase):
 
@@ -401,5 +430,15 @@ class ShapeTest(test_util.TensorFlowTestCase):
     self.assertAllEqual([2, None, 4], tensor_shape.TensorShape(
         (2, None, 4)).as_list())
 
+  def testReduce(self):
+    shape = tensor_shape.TensorShape([2, 3])
+    ctor, args = shape.__reduce__()
+    self.assertEquals(ctor, tensor_shape.TensorShape)
+    self.assertEquals(args, ([tensor_shape.Dimension(2),
+                              tensor_shape.Dimension(3)],))
+    reconstructed = ctor(*args)
+    self.assertEquals(reconstructed, shape)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index a0411bc3d9b4b2b87e5a31e9f201154f28ccf1cc..6676cfcaa334e02208d9ec346de7d266c4700f24 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -65,6 +65,11 @@ class TensorSpec(object):
     else:
       raise ValueError("`tensor` should be a tf.Tensor")
 
+  @classmethod
+  def is_bounded(cls):
+    del cls
+    return False
+
   @property
   def shape(self):
     """Returns the `TensorShape` that represents the shape of the tensor."""
@@ -80,6 +85,16 @@ class TensorSpec(object):
     """Returns the name of the described tensor."""
     return self._name
 
+  @property
+  def is_discrete(self):
+    """Whether spec is discrete."""
+    return self.dtype.is_integer
+
+  @property
+  def is_continuous(self):
+    """Whether spec is continuous."""
+    return self.dtype.is_floating
+
   def is_compatible_with(self, spec_or_tensor):
     """True if the shape and dtype of `spec_or_tensor` are compatible."""
     return (self._dtype.is_compatible_with(spec_or_tensor.dtype) and
@@ -95,6 +110,9 @@ class TensorSpec(object):
   def __ne__(self, other):
     return not self == other
 
+  def __reduce__(self):
+    return TensorSpec, (self._shape, self._dtype, self._name)
+
 
 class BoundedTensorSpec(TensorSpec):
   """A `TensorSpec` that specifies minimum and maximum values.
@@ -163,19 +181,16 @@ class BoundedTensorSpec(TensorSpec):
     self._maximum = np.array(maximum, dtype=self.dtype.as_numpy_dtype())
     self._maximum.setflags(write=False)
 
+  @classmethod
+  def is_bounded(cls):
+    del cls
+    return True
+
   @classmethod
   def from_spec(cls, spec):
     dtype = dtypes.as_dtype(spec.dtype)
-    if dtype in [dtypes.float64, dtypes.float32]:
-      # Avoid under/over-flow for `dtype.maximum - dtype.minimum`.
-      low = dtype.min / 2
-      high = dtype.max / 2
-    else:
-      low = dtype.min
-      high = dtype.max
-
-    minimum = getattr(spec, "minimum", low)
-    maximum = getattr(spec, "maximum", high)
+    minimum = getattr(spec, "minimum", dtype.min)
+    maximum = getattr(spec, "maximum", dtype.max)
     return BoundedTensorSpec(spec.shape, dtype, minimum, maximum, spec.name)
 
   @property
@@ -198,4 +213,7 @@ class BoundedTensorSpec(TensorSpec):
     return (tensor_spec_eq and np.allclose(self.minimum, other.minimum) and
             np.allclose(self.maximum, other.maximum))
 
+  def __reduce__(self):
+    return BoundedTensorSpec, (self._shape, self._dtype, self._minimum,
+                               self._maximum, self._name)
 
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
index 54ca4d9a19c2e1c879c05cfb828085951bdd8444..2e9e43e12279fe833d640d4163c5474c398e70cd 100644
--- a/tensorflow/python/framework/tensor_spec_test.py
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import pickle
+
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -127,6 +129,26 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(bounded_spec.dtype, spec.dtype)
     self.assertEqual(bounded_spec.name, spec.name)
 
+  def testIsDiscrete(self):
+    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
+    self.assertTrue(discrete_spec.is_discrete)
+    self.assertFalse(continuous_spec.is_discrete)
+
+  def testIsContinuous(self):
+    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
+    self.assertFalse(discrete_spec.is_continuous)
+    self.assertTrue(continuous_spec.is_continuous)
+
+  def testIsBounded(self):
+    unbounded_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    self.assertFalse(unbounded_spec.is_bounded())
+
+  def testSerialization(self):
+    desc = tensor_spec.TensorSpec([1, 5], dtypes.float32, "test")
+    self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
+
 
 class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
 
@@ -138,6 +160,11 @@ class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "not compatible"):
       tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, 0, (1, 1, 1))
 
+  def testIsBounded(self):
+    bounded_spec = tensor_spec.BoundedTensorSpec(
+        (1, 2), dtypes.int32, minimum=0, maximum=1)
+    self.assertTrue(bounded_spec.is_bounded())
+
   def testMinimumMaximumAttributes(self):
     spec = tensor_spec.BoundedTensorSpec(
         (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
@@ -222,6 +249,10 @@ class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(spec.dtype.max, bounded_spec.maximum)
     self.assertEqual(spec.name, bounded_spec.name)
 
+  def testSerialization(self):
+    desc = tensor_spec.BoundedTensorSpec([1, 5], dtypes.float32, -1, 1, "test")
+    self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 27afaa074a6becd5c8b7db94be59e8da1611c13a..8cf24206edab8be807eca1d067662a57585e2bda 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,7 +22,6 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -559,16 +558,16 @@ def MakeNdarray(tensor):
   if tensor.tensor_content:
     return (np.frombuffer(tensor.tensor_content, dtype=dtype).copy()
             .reshape(shape))
-  elif tensor_dtype == dtypes.float16:
+  elif tensor_dtype == dtypes.float16 or tensor_dtype == dtypes.bfloat16:
     # the half_val field of the TensorProto stores the binary representation
     # of the fp16: we need to reinterpret this as a proper float16
     if len(tensor.half_val) == 1:
       tmp = np.array(tensor.half_val[0], dtype=np.uint16)
-      tmp.dtype = np.float16
+      tmp.dtype = tensor_dtype.as_numpy_dtype
       return np.repeat(tmp, num_elements).reshape(shape)
     else:
       tmp = np.fromiter(tensor.half_val, dtype=np.uint16)
-      tmp.dtype = np.float16
+      tmp.dtype = tensor_dtype.as_numpy_dtype
       return tmp.reshape(shape)
   elif tensor_dtype == dtypes.float32:
     if len(tensor.float_val) == 1:
@@ -586,8 +585,7 @@ def MakeNdarray(tensor):
       return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape)
   elif tensor_dtype in [
       dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8,
-      dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16,
-      dtypes.bfloat16
+      dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16
   ]:
     if len(tensor.int_val) == 1:
       return np.repeat(np.array(tensor.int_val[0], dtype=dtype),
@@ -824,17 +822,32 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   all-or-nothing.
 
   Args:
-    tensor: The rank-1 Tensor to be evaluated.
+    tensor: The rank-0 or rank-1 Tensor to be evaluated.
 
   Returns:
     A `TensorShape` based on the constant value of the given `tensor`.
+
+  Raises:
+    ValueError: If the shape is rank-0 and is not statically known to be -1.
   """
-  if context.in_eager_mode():
+  if isinstance(tensor, ops.EagerTensor):
     return tensor_shape.as_shape(
         [dim if dim != -1 else None for dim in tensor.numpy()])
 
+  if tensor.get_shape().ndims == 0:
+    value = constant_value(tensor)
+    if value is None:
+      raise ValueError(
+          "Received a scalar with unknown value as shape; require a statically "
+          "known scalar with value '-1' to describe an unknown shape.")
+    if value != -1:
+      raise ValueError(
+          "Received a scalar value '%s' as shape; require a statically known "
+          "scalar with value '-1' to describe an unknown shape." % value)
+    return tensor_shape.unknown_shape()
+
   shape = tensor.get_shape().with_rank(1)
-  if tensor.get_shape() == [0]:
+  if shape == [0]:
     return tensor_shape.scalar()
   elif tensor.op.type == "Shape":
     return tensor.op.inputs[0].get_shape()
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index bea0ee34fd4900cc9d4d5d52348ba4512368e81f..35fff80c61b98e7603d3b7b5df3cabdb59059a72 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -235,6 +235,26 @@ class TensorUtilTest(test.TestCase):
     self.assertEquals(np.float16, a.dtype)
     self.assertAllClose(np.array([10.0, 20.0], dtype=np.float16), a)
 
+  def testBfloat16(self):
+    test_type = dtypes.bfloat16.as_numpy_dtype
+    t = tensor_util.make_tensor_proto(np.array([10.0, 20.0], dtype=test_type))
+    # 10.0: 16672 = 010000010(130) 0100000: (1+0/2+1/4) * 2^(130-127)
+    # 20.0: 16800 = 010000011(131) 0100000: (1+0/2+1/4) * 2^(131-127)
+    self.assertProtoEquals("""
+      dtype: DT_BFLOAT16
+      tensor_shape {
+        dim {
+          size: 2
+        }
+      }
+      half_val: 16672
+      half_val: 16800
+      """, t)
+
+    a = tensor_util.MakeNdarray(t)
+    self.assertEquals(test_type, a.dtype)
+    self.assertAllClose(np.array([10.0, 20.0], dtype=test_type), a)
+
   def testInt(self):
     t = tensor_util.make_tensor_proto(10)
     self.assertProtoEquals("""
@@ -768,7 +788,7 @@ class ConstantValueTest(test.TestCase):
     self.assertAllClose(np_val, tensor_util.constant_value(tf_val))
 
   def testUnknown(self):
-    tf_val = gen_state_ops._variable(
+    tf_val = gen_state_ops.variable(
         shape=[3, 4, 7],
         dtype=dtypes.float32,
         name="tf_val",
diff --git a/tensorflow/python/framework/test_file_system.cc b/tensorflow/python/framework/test_file_system.cc
index 094ea6f658ab800736eebce2db7ee80da151a033..6e9915adbb619c5c4891742ddda700da47ed590f 100644
--- a/tensorflow/python/framework/test_file_system.cc
+++ b/tensorflow/python/framework/test_file_system.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/null_file_system.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 682b2b366724294de3cf222633c95f69600823ba..dc56d88066cbe6b76cdacaa7cb799462c2c670c0 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import contextlib
 import gc
+import itertools
 import math
 import random
 import re
@@ -53,9 +54,11 @@ from tensorflow.python.eager import tape  # pylint: disable=unused-import
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -205,6 +208,10 @@ def CudaSupportsHalfMatMulAndConv():
   return pywrap_tensorflow.CudaSupportsHalfMatMulAndConv()
 
 
+def IsMklEnabled():
+  return pywrap_tensorflow.IsMklEnabled()
+
+
 def InstallStackTraceHandler():
   pywrap_tensorflow.InstallStacktraceHandler()
 
@@ -331,6 +338,8 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
     # Make sure default graph reflects prev_value in case next test doesn't call
     # reset_default_graph().
     ops.reset_default_graph()
+
+
 # pylint: disable=protected-access
 
 
@@ -403,6 +412,31 @@ def enable_c_api(fn):
   return wrapper
 
 
+def enable_c_shapes(fn):
+  """Decorator for enabling C shapes on a test.
+
+  Note this enables the C shapes after running the test class's setup/teardown
+  methods.
+
+  Args:
+    fn: the function to be wrapped
+
+  Returns:
+    The wrapped function
+  """
+
+  def wrapper(*args, **kwargs):
+    prev_value = ops._USE_C_SHAPES
+    # Only use C shapes if the C API is already enabled.
+    ops._USE_C_SHAPES = ops._USE_C_API
+    try:
+      fn(*args, **kwargs)
+    finally:
+      ops._USE_C_SHAPES = prev_value
+
+  return wrapper
+
+
 # This decorator is a hacky way to run all the test methods in a decorated
 # class with and without C API enabled.
 # TODO(iga): Remove this and its uses once we switch to using C API by default.
@@ -419,12 +453,77 @@ def with_c_api(cls):
   Returns:
     cls with new test methods added
   """
+  # If the C API is already enabled, don't do anything. Some tests break if the
+  # same test is run twice, so this allows us to turn on the C API by default
+  # without breaking these tests.
+  if ops._USE_C_API:
+    return cls
+
   for name, value in cls.__dict__.copy().items():
     if callable(value) and name.startswith("test"):
       setattr(cls, name + "WithCApi", enable_c_api(value))
   return cls
 
 
+def with_c_shapes(cls):
+  """Adds methods that call original methods but with C API shapes enabled.
+
+  Note this enables C shapes in new methods after running the test class's
+  setup method.
+
+  Args:
+    cls: class to decorate
+
+  Returns:
+    cls with new test methods added
+  """
+  # If C shapes are already enabled, don't do anything. Some tests break if the
+  # same test is run twice, so this allows us to turn on the C shapes by default
+  # without breaking these tests.
+  if ops._USE_C_SHAPES:
+    return cls
+
+  for name, value in cls.__dict__.copy().items():
+    if callable(value) and name.startswith("test"):
+      setattr(cls, name + "WithCShapes", enable_c_shapes(value))
+  return cls
+
+
+def assert_no_new_pyobjects_executing_eagerly(f):
+  """Decorator for asserting that no new Python objects persist after a test.
+
+  Runs the test multiple times executing eagerly, first as a warmup and then
+  several times to let objects accumulate. The warmup helps ignore caches which
+  do not grow as the test is run repeatedly.
+
+  Useful for checking that there are no missing Py_DECREFs in the C exercised by
+  a bit of Python.
+  """
+
+  def decorator(self, **kwargs):
+    """Warms up, gets an object count, runs the test, checks for new objects."""
+    with context.eager_mode():
+      gc.disable()
+      f(self, **kwargs)
+      gc.collect()
+      previous_count = len(gc.get_objects())
+      for _ in range(3):
+        f(self, **kwargs)
+      gc.collect()
+      # There should be no new Python objects hanging around.
+      new_count = len(gc.get_objects())
+      # In some cases (specifacally on MacOS), new_count is somehow
+      # smaller than previous_count.
+      # Using plain assert because not all classes using this decorator
+      # have assertLessEqual
+      assert new_count <= previous_count, (
+          "new_count(%d) is not less than or equal to previous_count(%d)" % (
+              new_count, previous_count))
+      gc.enable()
+
+  return decorator
+
+
 def assert_no_new_tensors(f):
   """Decorator for asserting that no new Tensors persist after a test.
 
@@ -446,15 +545,17 @@ def assert_no_new_tensors(f):
   def decorator(self, **kwargs):
     """Finds existing Tensors, runs the test, checks for new Tensors."""
 
-    def _is_tensor(obj):
+    def _is_tensorflow_object(obj):
       try:
-        return (isinstance(obj, ops.Tensor) or
-                isinstance(obj, variables.Variable))
+        return isinstance(obj,
+                          (ops.Tensor, variables.Variable,
+                           tensor_shape.Dimension, tensor_shape.TensorShape))
       except ReferenceError:
         # If the object no longer exists, we don't care about it.
         return False
 
-    tensors_before = set(id(obj) for obj in gc.get_objects() if _is_tensor(obj))
+    tensors_before = set(
+        id(obj) for obj in gc.get_objects() if _is_tensorflow_object(obj))
     outside_graph_key = ops.get_default_graph()._graph_key
     with ops.Graph().as_default():
       # Run the test in a new graph so that collections get cleared when it's
@@ -464,11 +565,12 @@ def assert_no_new_tensors(f):
     # Make an effort to clear caches, which would otherwise look like leaked
     # Tensors.
     backprop._zeros_cache.flush()
+    context.get_default_context().ones_rank_cache().flush()
     context.get_default_context().scalar_cache().clear()
     gc.collect()
     tensors_after = [
         obj for obj in gc.get_objects()
-        if _is_tensor(obj) and id(obj) not in tensors_before
+        if _is_tensorflow_object(obj) and id(obj) not in tensors_before
     ]
     if tensors_after:
       raise AssertionError(("%d Tensors not deallocated after test: %s" % (
@@ -501,6 +603,30 @@ def assert_no_garbage_created(f):
     previous_garbage = len(gc.garbage)
     f(self, **kwargs)
     gc.collect()
+    if len(gc.garbage) > previous_garbage:
+      logging.error(
+          "The decorated test created work for Python's garbage collector, "
+          "likely due to a reference cycle. New objects in cycle(s):")
+      for i, obj in enumerate(gc.garbage[previous_garbage:]):
+        try:
+          logging.error("Object %d of %d", i,
+                        len(gc.garbage) - previous_garbage)
+
+          def _safe_object_str(obj):
+            return "<%s %d>" % (obj.__class__.__name__, id(obj))
+
+          logging.error("  Object type: %s", _safe_object_str(obj))
+          logging.error("  Referrer types: %s", ", ".join(
+              [_safe_object_str(ref) for ref in gc.get_referrers(obj)]))
+          logging.error("  Referent types: %s", ", ".join(
+              [_safe_object_str(ref) for ref in gc.get_referents(obj)]))
+          logging.error("  Object attribute names: %s", dir(obj))
+          logging.error("  Object __str__:")
+          logging.error(obj)
+          logging.error("  Object __repr__:")
+          logging.error(repr(obj))
+        except Exception:
+          logging.error("(Exception while printing object)")
     # This will fail if any garbage has been created, typically because of a
     # reference cycle.
     self.assertEqual(previous_garbage, len(gc.garbage))
@@ -514,74 +640,91 @@ def assert_no_garbage_created(f):
 
 
 def run_in_graph_and_eager_modes(__unused__=None,
-                                 graph=None,
                                  config=None,
-                                 use_gpu=False,
-                                 force_gpu=False,
+                                 use_gpu=True,
                                  reset_test=True,
                                  assert_no_eager_garbage=False):
-  """Runs the test in both graph and eager modes.
+  """Execute the decorated test with and without enabling eager execution.
+
+  This function returns a decorator intended to be applied to test methods in
+  a @{tf.test.TestCase} class. Doing so will cause the contents of the test
+  method to be executed twice - once normally, and once with eager execution
+  enabled. This allows unittests to confirm the equivalence between eager
+  and graph execution (see @{tf.enable_eager_execution}).
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(tf.test.TestCase):
+
+    @run_in_graph_and_eager_modes()
+    def test_foo(self):
+      x = tf.constant([1, 2])
+      y = tf.constant([3, 4])
+      z = tf.add(x, y)
+      self.assertAllEqual([4, 6], self.evaluate(z))
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test validates that `tf.add()` has the same behavior when computed with
+  eager execution enabled as it does when constructing a TensorFlow graph and
+  executing the `z` tensor in a session.
+
 
   Args:
     __unused__: Prevents sliently skipping tests.
-    graph: Optional graph to use during the returned session.
     config: An optional config_pb2.ConfigProto to use to configure the
-      session.
-    use_gpu: If True, attempt to run as many ops as possible on GPU.
-    force_gpu: If True, pin all ops to `/device:GPU:0`.
-    reset_test: If True, tearDown and SetUp the test case again.
+      session when executing graphs.
+    use_gpu: If True, attempt to run as many operations as possible on GPU.
+    reset_test: If True, tearDown and SetUp the test case between the two
+      executions of the test (once with and once without eager execution).
     assert_no_eager_garbage: If True, sets DEBUG_SAVEALL on the garbage
       collector and asserts that no extra garbage has been created when running
-      the test in eager mode. This will fail if there are reference cycles
-      (e.g. a = []; a.append(a)). Off by default because some tests may create
-      garbage for legitimate reasons (e.g. they define a class which inherits
-      from `object`), and because DEBUG_SAVEALL is sticky in some Python
-      interpreters (meaning that tests which rely on objects being collected
-      elsewhere in the unit test file will not work). Additionally, checks that
-      nothing still has a reference to Tensors that the test allocated.
+      the test with eager execution enabled. This will fail if there are
+      reference cycles (e.g. a = []; a.append(a)). Off by default because some
+      tests may create garbage for legitimate reasons (e.g. they define a class
+      which inherits from `object`), and because DEBUG_SAVEALL is sticky in some
+      Python interpreters (meaning that tests which rely on objects being
+      collected elsewhere in the unit test file will not work). Additionally,
+      checks that nothing still has a reference to Tensors that the test
+      allocated.
   Returns:
-    Returns a decorator that will run the decorated test function
-        using both a graph and using eager execution.
+    Returns a decorator that will run the decorated test method twice:
+    once by constructing and executing a graph in a session and once with
+    eager execution enabled.
   """
 
   assert not __unused__, "Add () after run_in_graph_and_eager_modes."
 
   def decorator(f):
-    """Test method decorator."""
-
     def decorated(self, **kwargs):
-      """Decorated the test method."""
       with context.graph_mode():
-        with self.test_session(graph, config, use_gpu, force_gpu):
+        with self.test_session(use_gpu=use_gpu):
           f(self, **kwargs)
 
       if reset_test:
         # This decorator runs the wrapped test twice.
         # Reset the test environment between runs.
         self.tearDown()
+        self._tempdir = None
         self.setUp()
 
-      def run_eager_mode(self, **kwargs):
-        if force_gpu:
-          gpu_name = gpu_device_name()
-          if not gpu_name:
-            gpu_name = "/device:GPU:0"
-          with context.device(gpu_name):
-            f(self)
-        elif use_gpu:
-          # TODO(xpan): Support softplacement and gpu by default when available.
-          f(self, **kwargs)
-        else:
-          with context.device("/device:CPU:0"):
+      def run_eagerly(self, **kwargs):
+        if not use_gpu:
+          with ops.device("/cpu:0"):
             f(self, **kwargs)
+        else:
+          f(self, **kwargs)
 
       if assert_no_eager_garbage:
-        run_eager_mode = assert_no_new_tensors(
-            assert_no_garbage_created(run_eager_mode))
+        run_eagerly = assert_no_new_tensors(
+            assert_no_garbage_created(run_eagerly))
 
       with context.eager_mode():
         with ops.Graph().as_default():
-          run_eager_mode(self, **kwargs)
+          run_eagerly(self, **kwargs)
 
     return decorated
 
@@ -615,15 +758,23 @@ def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
       return 0, 0
     return int(match.group(1)), int(match.group(2))
 
-  for local_device in device_lib.list_local_devices():
-    if local_device.device_type == "GPU":
-      if (min_cuda_compute_capability is None or
-          compute_capability_from_device_desc(local_device.physical_device_desc)
-          >= min_cuda_compute_capability):
+  try:
+    for local_device in device_lib.list_local_devices():
+      if local_device.device_type == "GPU":
+        if (min_cuda_compute_capability is None or
+            compute_capability_from_device_desc(
+                local_device.physical_device_desc) >=
+            min_cuda_compute_capability):
+          return True
+      if local_device.device_type == "SYCL" and not cuda_only:
         return True
-    if local_device.device_type == "SYCL" and not cuda_only:
-      return True
-  return False
+    return False
+  except errors_impl.NotFoundError as e:
+    if not all([x in str(e) for x in ["CUDA", "not find"]]):
+      raise e
+    else:
+      logging.error(str(e))
+      return False
 
 
 @contextlib.contextmanager
@@ -782,7 +933,7 @@ class TensorFlowTestCase(googletest.TestCase):
     Returns:
       tensors numpy values.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       return self._eval_helper(tensors)
     else:
       sess = ops.get_default_session()
@@ -812,9 +963,9 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
     `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
-    `use_gpu`
-    is True, TensorFlow tries to run as many ops on the GPU as possible. If both
-    `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
+    `use_gpu` is True, TensorFlow tries to run as many ops on the GPU as
+    possible. If both `force_gpu and `use_gpu` are False, all ops are pinned to
+    the CPU.
 
     Example:
     ```python
@@ -1062,8 +1213,14 @@ class TensorFlowTestCase(googletest.TestCase):
     self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err), msg=msg)
 
   def _GetNdArray(self, a):
+    # If a is a tensor then convert it to ndarray
+    if isinstance(a, ops.Tensor):
+      if isinstance(a, ops._EagerTensorBase):
+        return a.numpy()
+      else:
+        a = self.evaluate(a)
     if not isinstance(a, np.ndarray):
-      a = np.array(a)
+      return np.array(a)
     return a
 
   def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
@@ -1136,8 +1293,8 @@ class TensorFlowTestCase(googletest.TestCase):
       # Try to directly compare a, b as ndarrays; if not work, then traverse
       # through the sequence, which is more expensive.
       try:
-        a_as_ndarray = np.array(a)
-        b_as_ndarray = np.array(b)
+        a_as_ndarray = self._GetNdArray(a)
+        b_as_ndarray = self._GetNdArray(b)
         self._assertArrayLikeAllClose(
             a_as_ndarray,
             b_as_ndarray,
@@ -1166,22 +1323,24 @@ class TensorFlowTestCase(googletest.TestCase):
             msg="Mismatched value: a%s is different from b%s." % (path_str,
                                                                   path_str))
       except TypeError as e:
-        msg = "Error: a%s has %s, but b%s has %s" % (
-            path_str, type(a), path_str, type(b))
-        e.args = ((e.args[0] + ' : ' + msg,) + e.args[1:])
+        msg = "Error: a%s has %s, but b%s has %s" % (path_str, type(a),
+                                                     path_str, type(b))
+        e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
         raise
 
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
-    """Asserts that two structures of numpy arrays, have near values.
+    """Asserts that two structures of numpy arrays or Tensors, have near values.
 
     `a` and `b` can be arbitrarily nested structures. A layer of a nested
     structure can be a `dict`, `namedtuple`, `tuple` or `list`.
 
     Args:
       a: The expected numpy `ndarray`, or anything that can be converted into a
-          numpy `ndarray`, or any arbitrarily nested of structure of these.
+         numpy `ndarray` (including Tensor), or any arbitrarily nested of
+         structure of these.
       b: The actual numpy `ndarray`, or anything that can be converted into a
-          numpy `ndarray`, or any arbitrarily nested of structure of these.
+         numpy `ndarray` (including Tensor), or any arbitrarily nested of
+         structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
       msg: Optional message to report on failure.
@@ -1241,8 +1400,26 @@ class TensorFlowTestCase(googletest.TestCase):
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  def assertNotAllClose(self, a, b, **kwargs):
+    """Assert that two numpy arrays, or or Tensors, do not have near values.
+
+    Args:
+      a: the first value to compare.
+      b: the second value to compare.
+      **kwargs: additional keyword arguments to be passed to the underlying
+        `assertAllClose` call.
+
+    Raises:
+      AssertionError: If `a` and `b` are unexpectedly close at all elements.
+    """
+    try:
+      self.assertAllClose(a, b, **kwargs)
+    except AssertionError:
+      return
+    raise AssertionError("The two values are close at all elements")
+
   def assertAllEqual(self, a, b, msg=None):
-    """Asserts that two numpy arrays have the same values.
+    """Asserts that two numpy arrays or Tensors have the same values.
 
     Args:
       a: the expected numpy ndarray or anything can be converted to one.
@@ -1256,7 +1433,9 @@ class TensorFlowTestCase(googletest.TestCase):
                      " %s" % (a.shape, b.shape, msg))
     same = (a == b)
 
-    if a.dtype == np.float32 or a.dtype == np.float64:
+    if (a.dtype in [
+        np.float16, np.float32, np.float64, dtypes.bfloat16.as_numpy_dtype
+    ]):
       same = np.logical_or(same, np.logical_and(np.isnan(a), np.isnan(b)))
     if not np.all(same):
       # Prints more details than np.testing.assert_array_equal.
@@ -1272,6 +1451,174 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not equal rhs = ", y)
       np.testing.assert_array_equal(a, b, err_msg=msg)
 
+  def assertAllGreater(self, a, comparison_target):
+    """Assert element values are all greater than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertGreater(np.min(a), comparison_target)
+
+  def assertAllLess(self, a, comparison_target):
+    """Assert element values are all greater than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertLess(np.max(a), comparison_target)
+
+  def assertAllGreaterEqual(self, a, comparison_target):
+    """Assert element values are all greater than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertGreaterEqual(np.min(a), comparison_target)
+
+  def assertAllLessEqual(self, a, comparison_target):
+    """Assert element values are all greater than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertLessEqual(np.max(a), comparison_target)
+
+  def _format_subscripts(self, subscripts, value, limit=10, indent=2):
+    """Generate a summary of ndarray subscripts as a list of str.
+
+    If limit == N, this method will print up to the first N subscripts on
+    separate
+    lines. A line of ellipses (...) will be appended at the end if the number of
+    subscripts exceeds N.
+
+    Args:
+      subscripts: The tensor (np.ndarray) subscripts, of the same format as
+        np.where()'s return value, i.e., a tuple of arrays with each array
+        corresponding to a dimension. E.g., (array([1, 1]), array([0, 1])).
+      value: (np.ndarray) value of the tensor.
+      limit: (int) The maximum number of indices to print.
+      indent: (int) Number of characters to indent at the beginning of each
+        line.
+
+    Returns:
+      (list of str) the multi-line representation of the subscripts and values,
+        potentially with omission at the end.
+    """
+    lines = []
+    subscripts = np.transpose(subscripts)
+    prefix = " " * indent
+    for subscript in itertools.islice(subscripts, limit):
+      lines.append(prefix + str(subscript) + " : " +
+                   str(value[tuple(subscript)]))
+    if len(subscripts) > limit:
+      lines.append(prefix + "...")
+    return lines
+
+  def assertAllInRange(self,
+                       target,
+                       lower_bound,
+                       upper_bound,
+                       open_lower_bound=False,
+                       open_upper_bound=False):
+    """Assert that elements in a Tensor are all in a given range.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      lower_bound: lower bound of the range
+      upper_bound: upper bound of the range
+      open_lower_bound: (`bool`) whether the lower bound is open (i.e., > rather
+        than the default >=)
+      open_upper_bound: (`bool`) whether the upper bound is open (i.e., < rather
+        than the default <=)
+
+    Raises:
+      AssertionError:
+        if the value tensor does not have an ordered numeric type (float* or
+          int*), or
+        if there are nan values, or
+        if any of the elements do not fall in the specified range.
+    """
+    target = self._GetNdArray(target)
+    if not (np.issubdtype(target.dtype, np.float) or
+            np.issubdtype(target.dtype, np.integer)):
+      raise AssertionError(
+          "The value of %s does not have an ordered numeric type, instead it "
+          "has type: %s" % (target, target.dtype))
+
+    nan_subscripts = np.where(np.isnan(target))
+    if np.size(nan_subscripts):
+      raise AssertionError(
+          "%d of the %d element(s) are NaN. "
+          "Subscripts(s) and value(s) of the NaN element(s):\n" %
+          (len(nan_subscripts[0]), np.size(target)) +
+          "\n".join(self._format_subscripts(nan_subscripts, target)))
+
+    range_str = (("(" if open_lower_bound else "[") + str(lower_bound) + ", " +
+                 str(upper_bound) + (")" if open_upper_bound else "]"))
+
+    violations = (
+        np.less_equal(target, lower_bound)
+        if open_lower_bound else np.less(target, lower_bound))
+    violations = np.logical_or(
+        violations,
+        np.greater_equal(target, upper_bound)
+        if open_upper_bound else np.greater(target, upper_bound))
+    violation_subscripts = np.where(violations)
+    if np.size(violation_subscripts):
+      raise AssertionError(
+          "%d of the %d element(s) are outside the range %s. " %
+          (len(violation_subscripts[0]), np.size(target), range_str) +
+          "Subscript(s) and value(s) of the offending elements:\n" +
+          "\n".join(self._format_subscripts(violation_subscripts, target)))
+
+  def assertAllInSet(self, target, expected_set):
+    """Assert that elements of a Tensor are all in a given closed set.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      expected_set: (`list`, `tuple` or `set`) The closed set that the elements
+        of the value of `target` are expected to fall into.
+
+    Raises:
+      AssertionError:
+        if any of the elements do not fall into `expected_set`.
+    """
+    target = self._GetNdArray(target)
+
+    # Elements in target that are not in expected_set.
+    diff = np.setdiff1d(target.flatten(), list(expected_set))
+    if np.size(diff):
+      raise AssertionError("%d unique element(s) are not in the set %s: %s" %
+                           (np.size(diff), expected_set, diff))
+
+  def assertDTypeEqual(self, target, expected_dtype):
+    """Assert ndarray data type is equal to expected.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      expected_dtype: Expected data type.
+    """
+    target = self._GetNdArray(target)
+    if not isinstance(target, list):
+      arrays = [target]
+    for arr in arrays:
+      self.assertEqual(arr.dtype, expected_dtype)
+
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
   def assertRaisesWithPredicateMatch(self, exception_type,
@@ -1348,8 +1695,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     device1 = pydev.canonical_name(device1)
     device2 = pydev.canonical_name(device2)
-    self.assertEqual(device1, device2,
-                     "Devices %s and %s are not equal. %s" % 
+    self.assertEqual(device1, device2, "Devices %s and %s are not equal. %s" %
                      (device1, device2, msg))
 
   # Fix Python 3 compatibility issues
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index a717eb39513ac3369ae133b6090ff82597f12eb7..8d492256aac17d48dc5ac801a4eaf019fbe277de 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -31,13 +31,16 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -82,6 +85,14 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     else:
       print("GoogleCuda is disabled")
 
+  def testIsMklEnabled(self):
+    # This test doesn't assert anything.
+    # It ensures the py wrapper function is generated correctly.
+    if test_util.IsMklEnabled():
+      print("MKL is enabled")
+    else:
+      print("MKL is disabled")
+
   def testAssertProtoEqualsStr(self):
 
     graph_str = "node { name: 'w1' op: 'params' }"
@@ -201,6 +212,21 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self._WeMustGoDeeper("name")
     self._WeMustGoDeeper("orig")
 
+  def testAllCloseTensors(self):
+    a_raw_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+    a = constant_op.constant(a_raw_data)
+    b = math_ops.add(1, constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]]))
+    self.assertAllClose(a, b)
+    self.assertAllClose(a, a_raw_data)
+
+    a_dict = {"key": a}
+    b_dict = {"key": b}
+    self.assertAllClose(a_dict, b_dict)
+
+    x_list = [a, b]
+    y_list = [a_raw_data, b]
+    self.assertAllClose(x_list, y_list)
+
   def testAllCloseScalars(self):
     self.assertAllClose(7, 7 + 1e-8)
     with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
@@ -309,6 +335,12 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         rtol=1e-8, atol=1e-8
     )
 
+    self.assertAllCloseAccordingToType(
+        constant_op.constant([1e-8], dtype=dtypes.float64),
+        constant_op.constant([2e-8], dtype=dtypes.float64),
+        rtol=1e-8,
+        atol=1e-8)
+
     with (self.assertRaises(AssertionError)):
       self.assertAllCloseAccordingToType(
           np.asarray([1e-7], dtype=np.float64),
@@ -324,6 +356,14 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         float_rtol=1e-7, float_atol=1e-7
     )
 
+    self.assertAllCloseAccordingToType(
+        constant_op.constant([1e-7], dtype=dtypes.float32),
+        constant_op.constant([2e-7], dtype=dtypes.float32),
+        rtol=1e-8,
+        atol=1e-8,
+        float_rtol=1e-7,
+        float_atol=1e-7)
+
     with (self.assertRaises(AssertionError)):
       self.assertAllCloseAccordingToType(
           np.asarray([1e-6], dtype=np.float32),
@@ -341,6 +381,16 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         half_rtol=1e-4, half_atol=1e-4
     )
 
+    self.assertAllCloseAccordingToType(
+        constant_op.constant([1e-4], dtype=dtypes.float16),
+        constant_op.constant([2e-4], dtype=dtypes.float16),
+        rtol=1e-8,
+        atol=1e-8,
+        float_rtol=1e-7,
+        float_atol=1e-7,
+        half_rtol=1e-4,
+        half_atol=1e-4)
+
     with (self.assertRaises(AssertionError)):
       self.assertAllCloseAccordingToType(
           np.asarray([1e-3], dtype=np.float16),
@@ -350,6 +400,157 @@ class TestUtilTest(test_util.TensorFlowTestCase):
           half_rtol=1e-4, half_atol=1e-4
       )
 
+  def testAssertAllEqual(self):
+    i = variables.Variable([100] * 3, dtype=dtypes.int32, name="i")
+    j = constant_op.constant([20] * 3, dtype=dtypes.int32, name="j")
+    k = math_ops.add(i, j, name="k")
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([120] * 3, k)
+    self.assertAllEqual([20] * 3, j)
+
+  def testAssertNotAllClose(self):
+    # Test with arrays
+    self.assertNotAllClose([0.1], [0.2])
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([-1.0, 2.0], [-1.0, 2.0])
+
+    # Test with tensors
+    x = constant_op.constant([1.0, 1.0], name="x")
+    y = math_ops.add(x, x)
+
+    self.assertAllClose([2.0, 2.0], y)
+    self.assertNotAllClose([0.9, 1.0], x)
+
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([1.0, 1.0], x)
+
+  def testAssertNotAllCloseRTol(self):
+    # Test with arrays
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([1.1, 2.1], [1.0, 2.0], rtol=0.2)
+
+    # Test with tensors
+    x = constant_op.constant([1.0, 1.0], name="x")
+    y = math_ops.add(x, x)
+
+    self.assertAllClose([2.0, 2.0], y)
+
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([0.9, 1.0], x, rtol=0.2)
+
+  def testAssertNotAllCloseATol(self):
+    # Test with arrays
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([1.1, 2.1], [1.0, 2.0], atol=0.2)
+
+    # Test with tensors
+    x = constant_op.constant([1.0, 1.0], name="x")
+    y = math_ops.add(x, x)
+
+    self.assertAllClose([2.0, 2.0], y)
+
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([0.9, 1.0], x, atol=0.2)
+
+  def testAssertAllGreaterLess(self):
+    x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32)
+    y = constant_op.constant([10.0] * 3, dtype=dtypes.float32)
+    z = math_ops.add(x, y)
+
+    self.assertAllClose([110.0, 120.0, 130.0], z)
+
+    self.assertAllGreater(x, 95.0)
+    self.assertAllLess(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllGreater(x, 105.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllGreater(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllLess(x, 115.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllLess(x, 95.0)
+
+  def testAssertAllGreaterLessEqual(self):
+    x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32)
+    y = constant_op.constant([10.0] * 3, dtype=dtypes.float32)
+    z = math_ops.add(x, y)
+
+    self.assertAllEqual([110.0, 120.0, 130.0], z)
+
+    self.assertAllGreaterEqual(x, 95.0)
+    self.assertAllLessEqual(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllGreaterEqual(x, 105.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllGreaterEqual(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllLessEqual(x, 115.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllLessEqual(x, 95.0)
+
+  def testAssertAllInRangeWithNonNumericValuesFails(self):
+    s1 = constant_op.constant("Hello, ", name="s1")
+    c = constant_op.constant([1 + 2j, -3 + 5j], name="c")
+    b = constant_op.constant([False, True], name="b")
+
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(s1, 0.0, 1.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(c, 0.0, 1.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(b, 0, 1)
+
+  def testAssertAllInRange(self):
+    x = constant_op.constant([10.0, 15.0], name="x")
+    self.assertAllInRange(x, 10, 15)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 10, 15, open_lower_bound=True)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 10, 15, open_upper_bound=True)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(
+          x, 10, 15, open_lower_bound=True, open_upper_bound=True)
+
+  def testAssertAllInRangeErrorMessageEllipses(self):
+    x_init = np.array([[10.0, 15.0]] * 12)
+    x = constant_op.constant(x_init, name="x")
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 5, 10)
+
+  def testAssertAllInRangeDetectsNaNs(self):
+    x = constant_op.constant(
+        [[np.nan, 0.0], [np.nan, np.inf], [np.inf, np.nan]], name="x")
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 0.0, 2.0)
+
+  def testAssertAllInRangeWithInfinities(self):
+    x = constant_op.constant([10.0, np.inf], name="x")
+    self.assertAllInRange(x, 10, np.inf)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 10, np.inf, open_upper_bound=True)
+
+  def testAssertAllInSet(self):
+    b = constant_op.constant([True, False], name="b")
+    x = constant_op.constant([13, 37], name="x")
+
+    self.assertAllInSet(b, [False, True])
+    self.assertAllInSet(b, (False, True))
+    self.assertAllInSet(b, {False, True})
+    self.assertAllInSet(x, [0, 13, 37, 42])
+    self.assertAllInSet(x, (0, 13, 37, 42))
+    self.assertAllInSet(x, {0, 13, 37, 42})
+
+    with self.assertRaises(AssertionError):
+      self.assertAllInSet(b, [False])
+    with self.assertRaises(AssertionError):
+      self.assertAllInSet(x, (42,))
+
   def testRandomSeed(self):
     # Call setUp again for WithCApi case (since it makes a new defeault graph
     # after setup).
@@ -440,6 +641,26 @@ class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
     LeakedTensorTest().test_has_no_leak()
 
+  def test_no_new_objects_decorator(self):
+
+    class LeakedObjectTest(object):
+
+      def __init__(inner_self):  # pylint: disable=no-self-argument
+        inner_self.assertEqual = self.assertEqual  # pylint: disable=invalid-name
+        inner_self.accumulation = []
+
+      @test_util.assert_no_new_pyobjects_executing_eagerly
+      def test_has_leak(self):
+        self.accumulation.append([1.])
+
+      @test_util.assert_no_new_pyobjects_executing_eagerly
+      def test_has_no_leak(self):
+        self.not_accumulating = [1.]
+
+    with self.assertRaises(AssertionError):
+      LeakedObjectTest().test_has_leak()
+
+    LeakedObjectTest().test_has_no_leak()
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 06955b885852a641bc814f88c99838effe03bfd4..472ccbcac7a447926989cfbef27ec1ea9d71e91c 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -29,15 +29,19 @@ __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
 __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
-tf_export("VERSION").export_constant(__name__, "VERSION")
+tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
-tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
+tf_export("GIT_VERSION", "__git_version__").export_constant(
+    __name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
-tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
+tf_export("COMPILER_VERSION", "__compiler_version__").export_constant(
+    __name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
-tf_export("CXX11_ABI_FLAG").export_constant(__name__, "CXX11_ABI_FLAG")
+tf_export("CXX11_ABI_FLAG", "__cxx11_abi_flag__").export_constant(
+    __name__, "CXX11_ABI_FLAG")
 MONOLITHIC_BUILD = __monolithic_build__
-tf_export("MONOLITHIC_BUILD").export_constant(__name__, "MONOLITHIC_BUILD")
+tf_export("MONOLITHIC_BUILD", "__monolithic_build__").export_constant(
+    __name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 8079cb307bb1f5904b71bae891d5ef5f1e749e66..6816e204075bc37c6958efa5b028417078c36b2b 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -206,7 +206,7 @@ static PyObject* TF_ListDevices(GCluster cluster) {
   return result;
 }
 
-static std::vector<string> TF_ListAvailableOps() {
+static PyObject* TF_ListAvailableOps() {
   tensorflow::OpRegistry* registry = tensorflow::OpRegistry::Global();
   std::vector<tensorflow::OpDef> ops;
   registry->GetRegisteredOps(&ops);
@@ -215,7 +215,14 @@ static std::vector<string> TF_ListAvailableOps() {
     op_names.push_back(op.name());
   }
   std::sort(op_names.begin(), op_names.end());
-  return op_names;
+
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* result = PyList_New(op_names.size());
+  for (int i = 0; i < op_names.size(); ++i) {
+    PyList_SetItem(result, i, PyString_FromString(op_names[i].c_str()));
+  }
+  PyGILState_Release(gstate);
+  return result;
 }
 
 static PyObject* TF_GetSupportedDevices(GCluster cluster, GItem item) {
@@ -313,7 +320,8 @@ static PyObject* TF_MeasureCosts(
   tensorflow::OpPerformanceList op_performance_data;
   tensorflow::StepStats step_stats;
 
-  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), 10, 0);
+  const int num_measurements = cluster->type() == "virtual" ? 1 : 10;
+  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), num_measurements, 0);
 
   tensorflow::grappler::Costs costs;
   tensorflow::Status status = _GetOpPerformanceDataAndRunTime(
@@ -432,7 +440,7 @@ static GCluster TF_NewVirtualCluster(
     TF_Status* out_status);
 static void TF_ShutdownCluster(GCluster cluster);
 static PyObject* TF_ListDevices(GCluster cluster);
-static std::vector<string> TF_ListAvailableOps();
+static PyObject* TF_ListAvailableOps();
 static PyObject* TF_GetSupportedDevices(GCluster cluster, GItem item);
 static float TF_EstimatePerformance(const tensorflow::NamedDevice& device);
 static PyObject* TF_MeasureCosts(
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index caae5b114e1f896fc758613380a6f702853d98a5..26c6f22d34b27c8b866c0b23a36fdef5164348a4 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -87,9 +87,10 @@ class ClusterTest(test.TestCase):
 
   def testVirtualCluster(self):
     with ops.Graph().as_default() as g:
-      a = random_ops.random_uniform(shape=())
-      b = random_ops.random_uniform(shape=())
-      c = a + b
+      with ops.device('/device:GPU:0'):
+        a = random_ops.random_uniform(shape=[1024, 1024])
+        b = random_ops.random_uniform(shape=[1024, 1024])
+        c = a + b
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(c)
       mg = meta_graph.create_meta_graph_def(graph=g)
@@ -102,10 +103,13 @@ class ClusterTest(test.TestCase):
               'architecture': '7'
           })
       named_device = device_properties_pb2.NamedDevice(
-          properties=device_properties, name='/GPU:0')
-      grappler_cluster = cluster.Cluster(devices=[named_device])
+          properties=device_properties, name='/device:GPU:0')
+      grappler_cluster = cluster.Cluster(
+          disable_detailed_stats=False,
+          disable_timeline=False,
+          devices=[named_device])
       op_perfs, run_time, _ = grappler_cluster.MeasureCosts(grappler_item)
-      self.assertGreater(run_time, 0)
+      self.assertEqual(run_time, 0.000545)
       self.assertEqual(len(op_perfs), 15)
 
       estimated_perf = grappler_cluster.EstimatePerformance(named_device)
@@ -131,8 +135,8 @@ class ClusterTest(test.TestCase):
   def testAvailableOps(self):
     with cluster.Provision() as gcluster:
       op_names = gcluster.ListAvailableOps()
-      self.assertTrue(b'Add' in op_names)
-      self.assertTrue(b'MatMul' in op_names)
+      self.assertTrue('Add' in op_names)
+      self.assertTrue('MatMul' in op_names)
       self.assertEqual(op_names, sorted(op_names))
 
   def testSupportDevices(self):
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab1d0ed25b9130fabcffbb8da2265c046206da46
--- /dev/null
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Grappler Constant Folding."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ConstantFoldingTest(test.TestCase):
+
+  # See b/76008022.
+  def testScanInsideWhile(self):
+
+    def loop_cond(idx_step, *unused_args):
+      return idx_step < 1
+
+    def loop_body(idx_step, y):
+      x = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      x = functional_ops.scan(
+          math_ops.add,
+          x,
+          initializer=array_ops.zeros([20, 30], dtype=dtypes.float32),
+          back_prop=False,
+          parallel_iterations=1)
+
+      with ops.device('/cpu:0'):
+        y = array_ops.identity(x)
+
+        return idx_step + 1, y
+
+    if test.is_gpu_available(cuda_only=True):
+      init_y = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      _, y = control_flow_ops.while_loop(
+          loop_cond,
+          loop_body,
+          loop_vars=[0, init_y],
+          back_prop=False,
+          parallel_iterations=1)
+      with session.Session() as sess:
+        y_v = sess.run(y)
+        self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/controller.py b/tensorflow/python/grappler/controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..5677f4f52310dd68dc80c87275b50be95ba86b60
--- /dev/null
+++ b/tensorflow/python/grappler/controller.py
@@ -0,0 +1,142 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Controller Class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+
+
+class Controller(object):
+  """Controller class."""
+
+  def __init__(self, item, cluster):
+    """Controller class initializer.
+
+    Args:
+      item: The metagraph to place wrapped in a cluster.
+      cluster: A cluster of devices on which to place the item.
+    """
+    self.item = item
+
+    self._node = {}
+    for node in item.metagraph.graph_def.node:
+      self._node[node.name] = node
+
+    self._fanout = defaultdict(lambda: [])
+    for node in item.metagraph.graph_def.node:
+      for fanin in self._get_node_fanin(node):
+        self._fanout[fanin.name].append(node)
+
+    important_op_names = item.IdentifyImportantOps(sort_topologically=True)
+
+    # List of important ops (these are the ops to place) sorted in topological
+    # order. The order of this collection is deterministic.
+    self.important_ops = []
+    for name in important_op_names:
+      self.important_ops.append(self._node[name])
+
+    self.node_properties = item.GetOpProperties()
+
+    self.cluster = cluster
+    self.devices = cluster.ListDevices()
+
+    self.colocation_constraints = item.GetColocationGroups()
+
+    self.placement_constraints = cluster.GetSupportedDevices(item)
+    for node_name, dev in self.placement_constraints.items():
+      if len(dev) == 1:
+        # Place the node on the supported device
+        node = self._node[node_name]
+        node.device = dev[0]
+        fanout = self.get_node_fanout(node)
+        # Update the fanout of the fanin to bypass the node
+        for fanin in self._get_node_fanin(node):
+          fanout_of_fanin = self.get_node_fanout(fanin)
+          fanout_of_fanin += fanout
+          fanout_of_fanin.remove(node)
+        # Remove node from the list of important ops since we don't need to
+        # place the node.
+        if node in self.important_ops:
+          self.important_ops.remove(node)
+          important_op_names.remove(node.name)
+
+    # List of important op names, in non deterministic order.
+    self.important_op_names = frozenset(important_op_names)
+
+  @property
+  def input_graph_def(self):
+    return self.item.metagraph.graph_def
+
+  @property
+  def num_devices(self):
+    return len(self.devices)
+
+  def get_node_by_name(self, node_name):
+    return self._node[node_name]
+
+  def get_node_fanout(self, node):
+    return self._fanout[node.name]
+
+  def get_placements(self, *args, **kwargs):
+    """Returns: Two TF ops.
+
+    Args:
+      *args: "".
+      **kwargs: "".
+
+    Returns:
+      y_preds: tensor of size [batch_size, num_ops]
+      log_probs: python dict of at least two fields: "sample", "target" each
+      containing a tensor of size [batch_size], corresponding to the log_probs.
+    """
+    raise NotImplementedError
+
+  def eval_placement(self, sess, *args, **kwargs):
+    """At this time, this method evaluates ONLY ONE placement.
+
+    Args:
+      sess: a tf.Session() object used to retrieve cached assignment info.
+      *args: "".
+      **kwargs: "".
+
+    Returns:
+      run_time: scalar
+    """
+    raise NotImplementedError
+
+  def export_placement(self, metagraph):
+    """Annotate the placement onto the specified metagraph.
+
+    Args:
+      metagraph: the metagraph to annotate with the placement.
+    """
+    for node in metagraph.graph_def.node:
+      if node.name in self.important_op_names:
+        node.device = self.get_node_by_name(node.name).device
+
+  # Get the nodes in the immediate fanin of node.
+  # Beware: this doesn't take into account the nodes that may be skipped
+  # since placement constraints force their placement.
+  def _get_node_fanin(self, node):
+    input_ops = []
+    for fanin_name in node.input:
+      if fanin_name[0] == "^":
+        fanin_name = fanin_name[1:]
+      fanin_name = fanin_name.split(":")[0]
+      input_ops.append(self.get_node_by_name(fanin_name))
+    return input_ops
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index 88bf900dca6d97773959eb309a4a3c5931fdcb88..b474e19894957d01c7c8978282c547df81a9b2b3 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -30,11 +30,12 @@ CostAnalyzer::CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
       analytical_estimator_(cluster, false),
       suffix_(suffix) {}
 
-Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report) {
+Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report,
+                                    bool verbose) {
   GatherCosts();
   PreprocessCosts();
   AnalyzeCosts();
-  PrintAnalysis(os, per_node_report);
+  PrintAnalysis(os, per_node_report, verbose);
   return Status::OK();
 }
 
@@ -158,7 +159,8 @@ void CostAnalyzer::AnalyzeCosts() {
   }
 }
 
-void CostAnalyzer::PrintAnalysis(std::ostream& os, bool per_node_report) const {
+void CostAnalyzer::PrintAnalysis(std::ostream& os, bool per_node_report,
+                                 bool verbose) const {
   os << std::endl;
   os << std::left << std::setw(50)
      << "Total time measured in ns (serialized): " << std::right
@@ -227,10 +229,55 @@ void CostAnalyzer::PrintAnalysis(std::ostream& os, bool per_node_report) const {
   os << std::endl;
 
   if (per_node_report) {
-    os << "Below is the per-node report:" << std::endl;
-    os << op_perf_.DebugString();
+    if (verbose) {
+      os << "Below is the full per-node report:" << std::endl;
+      os << op_perf_.DebugString();
+    } else {
+      os << "Below is the per-node report summary:" << std::endl;
+      int width = 35;
+      int width_narrow = 15;
+      int width_wide = 20;
+      os << std::setw(width + 1) << "Op,";
+      os << std::setw(width_wide + 1) << "Measured time (ns),";
+      os << std::setw(width_wide + 1) << "Compute time (ns),";
+      os << std::setw(width_wide + 1) << "Memory time (ns),";
+      os << std::setw(width_narrow + 2) << "Compute eff,";
+      os << std::setw(width_narrow + 2) << "Memory eff,";
+      os << "    Inputs" << std::endl;
+      for (int i = 0; i < op_perf_.op_performance_size(); i++) {
+        const auto& perf = op_perf_.op_performance(i);
+        string op_name = perf.op().op();
+        os << std::setw(width) << op_name << ",";
+        os << std::setw(width_wide) << perf.compute_cost() << ",";
+        os << std::setw(width_wide) << perf.compute_time() << ",";
+        os << std::setw(width_wide) << perf.memory_time() << ",";
+        os << std::setw(width_narrow) << std::setprecision(2)
+           << perf.compute_efficiency() * 100 << "%,";
+        os << std::setw(width_narrow) << std::setprecision(2)
+           << perf.memory_efficiency() * 100 << "%,";
+        os << "    [";
+        for (int j = 0; j < perf.op().inputs_size(); j++) {
+          const auto& shape = perf.op().inputs(j).shape();
+          if (shape.dim_size() > 0) {
+            os << "(";
+            std::vector<int> dims;
+            for (int k = 0; k < shape.dim_size(); k++) {
+              os << shape.dim(k).size();
+              if (k < shape.dim_size() - 1) {
+                os << ", ";
+              }
+            }
+            os << ")";
+            if (j < perf.op().inputs_size() - 1) {
+              os << ", ";
+            }
+          }
+        }
+        os << "]" << std::endl;
+      }
+      os << std::endl;
+    }
   }
 }
-
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/python/grappler/cost_analyzer.h b/tensorflow/python/grappler/cost_analyzer.h
index 0e860e0fee9923510292d3cf1a8069435787476f..b5364aa37ab2fbbeb0a33e6764539cca795f2fa6 100644
--- a/tensorflow/python/grappler/cost_analyzer.h
+++ b/tensorflow/python/grappler/cost_analyzer.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <iostream>
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
@@ -50,7 +51,7 @@ class CostAnalyzer {
  public:
   explicit CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
                         const string& suffix);
-  Status GenerateReport(std::ostream& os, bool per_node_report);
+  Status GenerateReport(std::ostream& os, bool per_node_report, bool verbose);
 
  private:
   void PredictCosts(CostEstimator* cost_estimator, CostGraphDef* cost_graph,
@@ -59,7 +60,8 @@ class CostAnalyzer {
   void PreprocessCosts();
   void AnalyzeCosts();
   void SortOpsByTime(std::map<string, OpPerfSummary> ops);
-  void PrintAnalysis(std::ostream& os, bool per_node_report) const;
+  void PrintAnalysis(std::ostream& os, bool per_node_report,
+                     bool verbose) const;
 
   const GrapplerItem* item_;
   MeasuringCostEstimator measure_estimator_;
diff --git a/tensorflow/python/grappler/cost_analyzer.i b/tensorflow/python/grappler/cost_analyzer.i
index 4c0953435ba3fa6423bbc869fcca909d0c2ccb25..8f7fdb47f267bea582e371eb9ea6982b6e9341ad 100644
--- a/tensorflow/python/grappler/cost_analyzer.i
+++ b/tensorflow/python/grappler/cost_analyzer.i
@@ -44,7 +44,7 @@ limitations under the License.
 
 %{
 string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_node_report,
-                          GCluster cluster) {
+                          bool verbose, GCluster cluster) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -57,11 +57,11 @@ string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_no
   tensorflow::grappler::CostAnalyzer analyzer(*item, cluster.get(), suffix);
 
   std::stringstream os;
-  analyzer.GenerateReport(os, per_node_report);
+  analyzer.GenerateReport(os, per_node_report, verbose);
   return os.str();
 }
 
 %}
 
 string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_node_report,
-                          GCluster cluster);
+                          bool verbose, GCluster cluster);
diff --git a/tensorflow/python/grappler/cost_analyzer.py b/tensorflow/python/grappler/cost_analyzer.py
index a1ff915c61ba14d9a899d7f6c9a2c49855969b00..6a4690e91ba981706eed0d9fdfae2e64359d0416 100644
--- a/tensorflow/python/grappler/cost_analyzer.py
+++ b/tensorflow/python/grappler/cost_analyzer.py
@@ -24,7 +24,10 @@ from tensorflow.python.grappler import cluster as gcluster
 from tensorflow.python.grappler import item as gitem
 
 
-def GenerateCostReport(metagraph, per_node_report=False, cluster=None):
+def GenerateCostReport(metagraph,
+                       per_node_report=False,
+                       verbose=False,
+                       cluster=None):
   """Analyze the cost of each TensorFlow op and node in the provided metagraph.
 
   Args:
@@ -32,6 +35,7 @@ def GenerateCostReport(metagraph, per_node_report=False, cluster=None):
     per_node_report: by default the report contains stats aggregated on a per op
       type basis, setting per_node_report to True adds results for each
       individual node to the report.
+    verbose: Prints out the entire operation proto instead of a summary table.
     cluster: Analyze the costs using the specified cluster, or the local machine
       if no cluster was specified.
 
@@ -42,8 +46,9 @@ def GenerateCostReport(metagraph, per_node_report=False, cluster=None):
     cluster = gcluster.Cluster(disable_detailed_stats=False)
 
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateCostReport(
-        metagraph.SerializeToString(), per_node_report, cluster.tf_cluster)
+    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString(),
+                                               per_node_report, verbose,
+                                               cluster.tf_cluster)
   return ret_from_swig
 
 
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index 511908c79ce47d6849bf97d11bc42f2f1bb13f18..b8225b81a52f1a2ee10663544d54f1c9bd7ee785 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -48,7 +48,7 @@ class CostAnalysisTest(test.TestCase):
     train_op.append(d)
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
 
-    report = cost_analyzer.GenerateCostReport(mg)
+    report = cost_analyzer.GenerateCostReport(mg, per_node_report=True)
 
     # Check the report headers
     self.assertTrue(b"Total time measured in ns (serialized):" in report)
@@ -57,6 +57,26 @@ class CostAnalysisTest(test.TestCase):
     self.assertTrue(b"Total time analytical in ns (lower bound):" in report)
     self.assertTrue(b"Overall efficiency (analytical upper/actual):" in report)
     self.assertTrue(b"Overall efficiency (analytical lower/actual):" in report)
+    self.assertTrue(b"Below is the per-node report summary:" in report)
+
+    # Also print the report to make it easier to debug
+    print("{}".format(report))
+
+  def testVerbose(self):
+    """Make sure the full report is generated with verbose=True."""
+    a = constant_op.constant(10, name="a")
+    b = constant_op.constant(20, name="b")
+    c = math_ops.add_n([a, b], name="c")
+    d = math_ops.add_n([b, c], name="d")
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    report = cost_analyzer.GenerateCostReport(
+        mg, per_node_report=True, verbose=True)
+
+    # Check the report headers
+    self.assertTrue(b"Below is the full per-node report:" in report)
 
     # Also print the report to make it easier to debug
     print("{}".format(report))
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index 86db87d51530621d71b9b90f145e0f10d0b72443..0853db252406966cec36b63efafec9ec755c7e87 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -35,7 +35,8 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.training import saver
 
 
-def main(_):
+def get_metagraph():
+  """Constructs and returns a MetaGraphDef from the input file."""
   if FLAGS.metagraphdef:
     with gfile.GFile(FLAGS.metagraphdef) as meta_file:
       metagraph = meta_graph_pb2.MetaGraphDef()
@@ -45,7 +46,8 @@ def main(_):
         metagraph.ParseFromString(meta_file.read())
     if FLAGS.fetch is not None:
       fetch_collection = meta_graph_pb2.CollectionDef()
-      fetch_collection.node_list.value.append(FLAGS.fetch)
+      for fetch in FLAGS.fetch.split(","):
+        fetch_collection.node_list.value.append(fetch)
       metagraph.collection_def["train_op"].CopyFrom(fetch_collection)
   else:
     with gfile.GFile(FLAGS.graphdef) as graph_file:
@@ -56,21 +58,28 @@ def main(_):
         graph_def.ParseFromString(graph_file.read())
       importer.import_graph_def(graph_def, name="")
       graph = ops.get_default_graph()
-      fetch = graph.get_operation_by_name(FLAGS.fetch)
-      graph.add_to_collection("train_op", fetch)
+      for fetch in FLAGS.fetch.split(","):
+        fetch_op = graph.get_operation_by_name(fetch)
+        graph.add_to_collection("train_op", fetch_op)
       metagraph = saver.export_meta_graph(
           graph_def=graph.as_graph_def(), graph=graph)
+  return metagraph
+
 
+def main(_):
+  metagraph = get_metagraph()
   rewriter_config = rewriter_config_pb2.RewriterConfig()
   if FLAGS.rewriter_config is not None:
     text_format.Merge(FLAGS.rewriter_config, rewriter_config)
   optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
   metagraph.graph_def.CopyFrom(optimized_graph)
 
-  report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report)
-  print(report)
-  report = cost_analyzer.GenerateMemoryReport(metagraph)
+  report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report,
+                                            FLAGS.verbose)
   print(report)
+  if FLAGS.memory_report:
+    report = cost_analyzer.GenerateMemoryReport(metagraph)
+    print(report)
 
 
 if __name__ == "__main__":
@@ -89,9 +98,7 @@ if __name__ == "__main__":
       "--fetch",
       type=str,
       default=None,
-      help=
-      "The name of the fetch node."
-  )
+      help="The names of the fetch node delimited by comma.")
   parser.add_argument(
       "--rewriter_config",
       type=str,
@@ -107,5 +114,13 @@ if __name__ == "__main__":
       help="Generate per-node report. By default the report contains stats "
       "aggregated on a per op type basis, per_node_report adds results "
       "for each individual node to the report.")
+  parser.add_argument(
+      "--memory_report",
+      action="store_true",
+      help="Generate memory usage report.")
+  parser.add_argument(
+      "--verbose",
+      action="store_true",
+      help="Generate verbose reports. By default, succinct reports are used.")
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/grappler/graph_placer.py b/tensorflow/python/grappler/graph_placer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cd51df4d962583555e08ae973ab43d15ba01997
--- /dev/null
+++ b/tensorflow/python/grappler/graph_placer.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Graph Placer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.grappler import cluster as gcluster
+from tensorflow.python.grappler import hierarchical_controller
+from tensorflow.python.grappler import item as gitem
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.training import training
+
+
+def PlaceGraph(metagraph,
+               cluster=None,
+               allotted_time=3600,
+               hparams=None,
+               verbose=False):
+  """Place the provided metagraph.
+
+  Args:
+    metagraph: the metagraph to place.
+    cluster: an optional set of hardware resource to optimize the placement for.
+      If none is specified, we'll optimize the placement for the hardware
+      available on the local machine.
+    allotted_time: the maximum amount to time in seconds to spend optimizing
+      the placement.
+    hparams: hyperparameters used to fine tune the placer.
+    verbose: prints debug information if True.
+
+  Returns:
+    The placed metagraph.
+  """
+  if cluster is None:
+    cluster = gcluster.Cluster()
+
+  # Optimize the metagraph to speedup the placement
+  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  rewriter_config.optimizers.append("pruning")
+  rewriter_config.optimizers.append("constfold")
+  rewriter_config.optimizers.append("arithmetic")
+  rewriter_config.optimizers.append("dependency")
+  rewriter_config.optimizers.append("pruning")
+  optimized_graph = tf_optimizer.OptimizeGraph(
+      rewriter_config, metagraph, verbose=verbose, cluster=cluster)
+  optimized_metagraph = meta_graph_pb2.MetaGraphDef()
+  optimized_metagraph.CopyFrom(metagraph)
+  optimized_metagraph.graph_def.CopyFrom(optimized_graph)
+
+  item = gitem.Item(optimized_metagraph)
+
+  # Measure the runtime achievable with the original placement.
+  try:
+    _, original_run_time, _ = cluster.MeasureCosts(item)
+    if verbose:
+      print("Runtime for original placement: " + str(original_run_time))
+  except errors.OpError as e:
+    if verbose:
+      print("Original placement isn't feasible: " + str(e))
+    original_run_time = hparams.failing_signal
+
+  if hparams is None:
+    hparams = hierarchical_controller.hierarchical_controller_hparams()
+  # We run with a single child
+  hparams.num_children = 1
+
+  with tf_ops.Graph().as_default():
+    # Place all the nodes of the controller on the CPU. We don't want them to
+    # fight for accelerator memory with the model to optimize.
+    with tf_ops.device("/device:CPU:0"):
+      model = hierarchical_controller.HierarchicalController(
+          hparams, item, cluster)
+      ops = model.build_controller()
+      session_creator = training.ChiefSessionCreator()
+      with training.MonitoredSession(session_creator=session_creator) as sess:
+        start_time = time.time()
+        current_time = start_time
+        while current_time - start_time < allotted_time:
+          grouping_actions = model.generate_grouping(sess)
+          input_to_seq2seq = model.create_group_embeddings(
+              grouping_actions, verbose=verbose)
+          model.generate_placement(input_to_seq2seq, sess)
+          try:
+            run_time = model.eval_placement(
+                sess,
+                verbose=verbose)
+          except errors.OpError as e:
+            if verbose:
+              print("Failed to run graph:" + str(e))
+            run_time = hparams.failing_signal
+          updated = model.update_reward(sess, run_time, verbose=verbose)
+          if updated and run_time < original_run_time:
+            if verbose:
+              print("Found better placement, with runtime " + str(run_time))
+            model.export_placement(metagraph)
+
+          model.process_reward(sess)
+
+          current_time = time.time()
+
+  return metagraph
diff --git a/tensorflow/python/grappler/graph_placer_test.py b/tensorflow/python/grappler/graph_placer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eabe3cd5437022eb3b98010d0f384cc9f6bac2a
--- /dev/null
+++ b/tensorflow/python/grappler/graph_placer_test.py
@@ -0,0 +1,140 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests the graph placer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.core.protobuf import device_properties_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.grappler import cluster
+from tensorflow.python.grappler import graph_placer
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+
+
+class GraphPlacerTest(test.TestCase):
+
+  @staticmethod
+  def _buildMnist(batch_size=128,
+                  input_size=256,
+                  num_classes=1024,
+                  num_layers=10,
+                  hidden_size=256,
+                  name='mnist'):
+    g = tf_ops.get_default_graph()
+    with g.as_default():
+      ops = {}
+      x = random_ops.random_uniform(
+          [batch_size, input_size], -0.1, 0.1, dtype=dtypes.float32)
+      for layer_id in range(num_layers):
+        with variable_scope.variable_scope('layer_{}'.format(layer_id)):
+          a = input_size if layer_id == 0 else hidden_size
+          b = hidden_size if layer_id < num_layers - 1 else num_classes
+          w = variable_scope.get_variable('w', [a, b])
+          x = math_ops.matmul(x, w)
+          x = nn_ops.relu(x)
+      ops['y_preds'] = math_ops.argmax(x, axis=1)
+
+    train_op = g.get_collection_ref(tf_ops.GraphKeys.TRAIN_OP)
+    train_op.append(ops['y_preds'])
+    return g
+
+  @staticmethod
+  def _buildCluster(num_cpus=1, num_gpus=1):
+    devices = []
+    if num_gpus > 0:
+      device_properties = device_properties_pb2.DeviceProperties(
+          type='GPU',
+          vendor='NVidia',
+          model='GeForce GTX TITAN X',
+          frequency=1076,
+          num_cores=24,
+          environment={'architecture': '5.2',
+                       'cuda': '8000',
+                       'cudnn': '6021'},
+          num_registers=65536,
+          l1_cache_size=24576,
+          l2_cache_size=3145728,
+          shared_memory_size_per_multiprocessor=98304,
+          memory_size=12783648768,
+          bandwidth=336480000)
+      for i in range(num_gpus):
+        devices.append(
+            device_properties_pb2.NamedDevice(
+                properties=device_properties, name='/GPU:' + str(i)))
+
+    assert num_cpus > 0
+    device_properties = device_properties_pb2.DeviceProperties(
+        type='CPU',
+        frequency=2000,
+        num_cores=4,
+        l1_cache_size=32768,
+        l2_cache_size=262144,
+        l3_cache_size=12582912)
+    for i in range(num_cpus):
+      devices.append(
+          device_properties_pb2.NamedDevice(
+              properties=device_properties, name='/CPU:' + str(i)))
+
+    return cluster.Cluster(devices=devices)
+
+  def testBasic(self):
+    """Place a trivial graph."""
+    a = constant_op.constant(10, name='a')
+    b = constant_op.constant(20, name='b')
+    c = math_ops.add_n([a, b], name='c')
+    d = math_ops.add_n([b, c], name='d')
+    train_op = tf_ops.get_collection_ref(tf_ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+    mg = meta_graph.create_meta_graph_def(graph=tf_ops.get_default_graph())
+
+    gcluster = cluster.Cluster()
+    placed_mg = graph_placer.PlaceGraph(mg, allotted_time=15, cluster=gcluster)
+
+    self.assertEqual(4, len(placed_mg.graph_def.node))
+    self.assertItemsEqual([node.name for node in placed_mg.graph_def.node],
+                          [node.name for node in mg.graph_def.node])
+
+    available_devices = [device.name for device in gcluster.ListDevices()]
+    for node in placed_mg.graph_def.node:
+      # The constant nodes are optimized away before the placer is run, and
+      # therefore won't be placed.
+      self.assertTrue(not node.device or node.device in available_devices)
+
+  def testMNIST(self):
+    graph = GraphPlacerTest._buildMnist()
+    mg = meta_graph.create_meta_graph_def(graph=graph)
+    gcluster = GraphPlacerTest._buildCluster(num_gpus=1)
+    # Spend 15 seconds trying to optimize the placement of the model. This
+    # should give us enough time to exercise the code, but not enough to find
+    # a good placement, so we'll just check for legality.
+    placed_mg = graph_placer.PlaceGraph(mg, allotted_time=15, cluster=gcluster)
+    self.assertEqual(len(placed_mg.graph_def.node), len(mg.graph_def.node))
+    self.assertItemsEqual([node.name for node in placed_mg.graph_def.node],
+                          [node.name for node in mg.graph_def.node])
+    available_devices = [device.name for device in gcluster.ListDevices()]
+    for node in placed_mg.graph_def.node:
+      self.assertTrue(not node.device or node.device in available_devices)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/hierarchical_controller.py b/tensorflow/python/grappler/hierarchical_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0866c1069ac7f7e25cbd12cb5a490e2ed5e4bec
--- /dev/null
+++ b/tensorflow/python/grappler/hierarchical_controller.py
@@ -0,0 +1,1117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""HierarchicalController Class.
+
+The HierarchicalController encompasses the entire lifecycle of training the
+device placement policy, including generating op embeddings, getting groups for
+each op, placing those groups and running the predicted placements.
+
+Different assignment models can inherit from this class.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+import six
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.grappler.controller import Controller
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import adam
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import learning_rate_decay
+from tensorflow.python.training import training_util
+
+
+class PlacerParams(object):
+  """Class to hold a set of placement parameters as name-value pairs.
+
+  A typical usage is as follows:
+
+  ```python
+  # Create a PlacerParams object specifying names and values of the model
+  # parameters:
+  params = PlacerParams(hidden_size=128, decay_steps=50)
+
+  # The parameters are available as attributes of the PlacerParams object:
+  hparams.hidden_size ==> 128
+  hparams.decay_steps ==> 50
+  ```
+
+  """
+
+  def __init__(self, **kwargs):
+    """Create an instance of `PlacerParams` from keyword arguments.
+
+    The keyword arguments specify name-values pairs for the parameters.
+    The parameter types are inferred from the type of the values passed.
+
+    The parameter names are added as attributes of `PlacerParams` object,
+    and they can be accessed directly with the dot notation `params._name_`.
+
+    Example:
+
+    ```python
+    # Define 1 parameter: 'hidden_size'
+    params = PlacerParams(hidden_size=128)
+    params.hidden_size ==> 128
+    ```
+
+    Args:
+      **kwargs: Key-value pairs where the key is the parameter name and
+        the value is the value for the parameter.
+    """
+    for name, value in six.iteritems(kwargs):
+      self.add_param(name, value)
+
+  def add_param(self, name, value):
+    """Adds {name, value} pair to hyperparameters.
+
+    Args:
+      name: Name of the hyperparameter.
+      value: Value of the hyperparameter. Can be one of the following types:
+        int, float, string, int list, float list, or string list.
+
+    Raises:
+      ValueError: if one of the arguments is invalid.
+    """
+    # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
+    # attribute of this object.  In that case we refuse to use it as a
+    # parameter name.
+    if getattr(self, name, None) is not None:
+      raise ValueError("Parameter name is reserved: %s" % name)
+    setattr(self, name, value)
+
+
+def hierarchical_controller_hparams():
+  """Hyperparameters for hierarchical planner."""
+  return PlacerParams(
+      hidden_size=512,
+      forget_bias_init=1.0,
+      temperature=1.0,
+      logits_std_noise=0.5,
+      stop_noise_step=750,
+      decay_steps=50,
+      max_num_outputs=5,
+      max_output_size=5,
+      tanh_constant=1.0,
+      adj_embed_dim=20,
+      grouping_hidden_size=64,
+      num_groups=None,
+      bi_lstm=True,
+      failing_signal=100,
+      stop_sampling=500,
+      start_with_failing_signal=True,
+      always_update_baseline=False,
+      bl_dec=0.9,
+      grad_bound=1.0,
+      lr=0.1,
+      lr_dec=0.95,
+      start_decay_step=400,
+      optimizer_type="adam",
+      stop_updating_after_steps=1000,
+      name="hierarchical_controller",
+      keep_prob=1.0,
+      reward_function="sqrt",
+      seed=1234,
+      # distributed training params
+      num_children=1)
+
+
+class HierarchicalController(Controller):
+  """HierarchicalController class."""
+
+  def __init__(self, hparams, item, cluster, controller_id=0):
+    """HierarchicalController class initializer.
+
+    Args:
+      hparams: All hyper-parameters.
+      item: The metagraph to place.
+      cluster: The cluster of hardware devices to optimize for.
+      controller_id: the id of the controller in a multi-controller setup.
+    """
+    super(HierarchicalController, self).__init__(item, cluster)
+    self.ctrl_id = controller_id
+    self.hparams = hparams
+
+    if self.hparams.num_groups is None:
+      self.num_groups = min(256, 20 * self.num_devices)
+    else:
+      self.num_groups = self.hparams.num_groups
+
+    # creates self.op_embeddings and self.type_dict
+    self.create_op_embeddings(verbose=False)
+    # TODO(azalia) clean up embedding/group_embedding_size names
+    self.group_emb_size = (
+        2 * self.num_groups + len(self.type_dict) +
+        self.hparams.max_num_outputs * self.hparams.max_output_size)
+    self.embedding_size = self.group_emb_size
+    self.initializer = init_ops.glorot_uniform_initializer(
+        seed=self.hparams.seed)
+
+    with variable_scope.variable_scope(
+        self.hparams.name,
+        initializer=self.initializer,
+        reuse=variable_scope.AUTO_REUSE):
+      # define parameters of feedforward
+      variable_scope.get_variable("w_grouping_ff", [
+          1 + self.hparams.max_num_outputs * self.hparams.max_output_size +
+          self.hparams.adj_embed_dim, self.hparams.grouping_hidden_size
+      ])
+      variable_scope.get_variable(
+          "w_grouping_softmax",
+          [self.hparams.grouping_hidden_size, self.num_groups])
+      if self.hparams.bi_lstm:
+        variable_scope.get_variable("encoder_lstm_forward", [
+            self.embedding_size + self.hparams.hidden_size / 2,
+            2 * self.hparams.hidden_size
+        ])
+        variable_scope.get_variable("encoder_lstm_backward", [
+            self.embedding_size + self.hparams.hidden_size / 2,
+            2 * self.hparams.hidden_size
+        ])
+        variable_scope.get_variable(
+            "device_embeddings", [self.num_devices, self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "decoder_lstm",
+            [2 * self.hparams.hidden_size, 4 * self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "device_softmax", [2 * self.hparams.hidden_size, self.num_devices])
+        variable_scope.get_variable("device_go_embedding",
+                                    [1, self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "encoder_forget_bias",
+            shape=1,
+            dtype=dtypes.float32,
+            initializer=init_ops.constant_initializer(
+                self.hparams.forget_bias_init))
+        variable_scope.get_variable(
+            "decoder_forget_bias",
+            shape=1,
+            dtype=dtypes.float32,
+            initializer=init_ops.constant_initializer(
+                self.hparams.forget_bias_init))
+        variable_scope.get_variable(
+            "attn_w_1", [self.hparams.hidden_size, self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "attn_w_2", [self.hparams.hidden_size, self.hparams.hidden_size])
+        variable_scope.get_variable("attn_v", [self.hparams.hidden_size, 1])
+
+      else:
+        variable_scope.get_variable("encoder_lstm", [
+            self.embedding_size + self.hparams.hidden_size,
+            4 * self.hparams.hidden_size
+        ])
+        variable_scope.get_variable(
+            "device_embeddings", [self.num_devices, self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "decoder_lstm",
+            [2 * self.hparams.hidden_size, 4 * self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "device_softmax", [2 * self.hparams.hidden_size, self.num_devices])
+        variable_scope.get_variable("device_go_embedding",
+                                    [1, self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "encoder_forget_bias",
+            shape=1,
+            dtype=dtypes.float32,
+            initializer=init_ops.constant_initializer(
+                self.hparams.forget_bias_init))
+        variable_scope.get_variable(
+            "decoder_forget_bias",
+            shape=1,
+            dtype=dtypes.float32,
+            initializer=init_ops.constant_initializer(
+                self.hparams.forget_bias_init))
+        variable_scope.get_variable(
+            "attn_w_1", [self.hparams.hidden_size, self.hparams.hidden_size])
+        variable_scope.get_variable(
+            "attn_w_2", [self.hparams.hidden_size, self.hparams.hidden_size])
+        variable_scope.get_variable("attn_v", [self.hparams.hidden_size, 1])
+    seq2seq_input_layer = array_ops.placeholder_with_default(
+        array_ops.zeros([self.hparams.num_children,
+                         self.num_groups,
+                         self.group_emb_size],
+                        dtypes.float32),
+        shape=(self.hparams.num_children, self.num_groups, self.group_emb_size))
+    self.seq2seq_input_layer = seq2seq_input_layer
+
+  def compute_reward(self, run_time):
+    if self.hparams.reward_function == "id":
+      reward = run_time
+    elif self.hparams.reward_function == "sqrt":
+      reward = math.sqrt(run_time)
+    elif self.hparams.reward_function == "log":
+      reward = math.log1p(run_time)
+    else:
+      raise NotImplementedError(
+          "Unrecognized reward function '%s', consider your "
+          "--reward_function flag value." % self.hparams.reward_function)
+    return reward
+
+  def build_controller(self):
+    """RL optimization interface.
+
+    Returns:
+      ops: A dictionary holding handles of the model used for training.
+    """
+
+    self._global_step = training_util.get_or_create_global_step()
+    ops = {}
+    ops["loss"] = 0
+
+    failing_signal = self.compute_reward(self.hparams.failing_signal)
+
+    ctr = {}
+
+    with tf_ops.name_scope("controller_{}".format(self.ctrl_id)):
+      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
+        ctr["reward"] = {"value": [], "ph": [], "update": []}
+        ctr["ready"] = {"value": [], "ph": [], "update": []}
+        ctr["best_reward"] = {"value": [], "update": []}
+        for i in range(self.hparams.num_children):
+          reward_value = variable_scope.get_local_variable(
+              "reward_{}".format(i),
+              initializer=0.0,
+              dtype=dtypes.float32,
+              trainable=False)
+          reward_ph = array_ops.placeholder(
+              dtypes.float32, shape=(), name="reward_ph_{}".format(i))
+          reward_update = state_ops.assign(
+              reward_value, reward_ph, use_locking=True)
+          ctr["reward"]["value"].append(reward_value)
+          ctr["reward"]["ph"].append(reward_ph)
+          ctr["reward"]["update"].append(reward_update)
+          best_reward = variable_scope.get_local_variable(
+              "best_reward_{}".format(i),
+              initializer=failing_signal,
+              dtype=dtypes.float32,
+              trainable=False)
+          ctr["best_reward"]["value"].append(best_reward)
+          ctr["best_reward"]["update"].append(
+              state_ops.assign(best_reward,
+                               math_ops.minimum(best_reward, reward_update)))
+
+          ready_value = variable_scope.get_local_variable(
+              "ready_{}".format(i),
+              initializer=True,
+              dtype=dtypes.bool,
+              trainable=False)
+          ready_ph = array_ops.placeholder(
+              dtypes.bool, shape=(), name="ready_ph_{}".format(i))
+          ready_update = state_ops.assign(
+              ready_value, ready_ph, use_locking=True)
+          ctr["ready"]["value"].append(ready_value)
+          ctr["ready"]["ph"].append(ready_ph)
+          ctr["ready"]["update"].append(ready_update)
+
+      ctr["grouping_y_preds"], ctr["grouping_log_probs"] = self.get_groupings()
+      summary.histogram(
+          "grouping_actions",
+          array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0],
+                          [1, array_ops.shape(self.op_embeddings)[0]]))
+
+      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
+        ctr["baseline"] = variable_scope.get_local_variable(
+            "baseline",
+            initializer=failing_signal
+            if self.hparams.start_with_failing_signal else 0.0,
+            dtype=dtypes.float32,
+            trainable=False)
+
+      new_baseline = self.hparams.bl_dec * ctr["baseline"] + (
+          1 - self.hparams.bl_dec) * math_ops.reduce_mean(
+              ctr["reward"]["value"])
+      if not self.hparams.always_update_baseline:
+        baseline_mask = math_ops.less(ctr["reward"]["value"], failing_signal)
+        selected_reward = array_ops.boolean_mask(ctr["reward"]["value"],
+                                                 baseline_mask)
+        selected_baseline = control_flow_ops.cond(
+            math_ops.reduce_any(baseline_mask),
+            lambda: math_ops.reduce_mean(selected_reward),
+            lambda: constant_op.constant(0, dtype=dtypes.float32))
+        ctr["pos_reward"] = selected_baseline
+        pos_ = math_ops.less(
+            constant_op.constant(0, dtype=dtypes.float32), selected_baseline)
+        selected_baseline = self.hparams.bl_dec * ctr["baseline"] + (
+            1 - self.hparams.bl_dec) * selected_baseline
+        selected_baseline = control_flow_ops.cond(
+            pos_, lambda: selected_baseline, lambda: ctr["baseline"])
+        new_baseline = control_flow_ops.cond(
+            math_ops.less(self.global_step,
+                          self.hparams.stop_updating_after_steps),
+            lambda: new_baseline, lambda: selected_baseline)
+      ctr["baseline_update"] = state_ops.assign(
+          ctr["baseline"], new_baseline, use_locking=True)
+
+      ctr["y_preds"], ctr["log_probs"] = self.get_placements()
+      summary.histogram("actions", ctr["y_preds"]["sample"])
+      mask = math_ops.less(ctr["reward"]["value"], failing_signal)
+      ctr["loss"] = ctr["reward"]["value"] - ctr["baseline"]
+      ctr["loss"] *= (
+          ctr["log_probs"]["sample"] + ctr["grouping_log_probs"]["sample"])
+
+      selected_loss = array_ops.boolean_mask(ctr["loss"], mask)
+      selected_loss = control_flow_ops.cond(
+          math_ops.reduce_any(mask),
+          lambda: math_ops.reduce_mean(-selected_loss),
+          lambda: constant_op.constant(0, dtype=dtypes.float32))
+
+      ctr["loss"] = control_flow_ops.cond(
+          math_ops.less(self.global_step,
+                        self.hparams.stop_updating_after_steps),
+          lambda: math_ops.reduce_mean(-ctr["loss"]), lambda: selected_loss)
+
+      ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"])
+      summary.scalar("loss", ctr["loss"])
+      summary.scalar("avg_reward", ctr["reward_s"])
+      summary.scalar("best_reward_so_far", best_reward)
+      summary.scalar(
+          "advantage",
+          math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"]))
+
+    with variable_scope.variable_scope(
+        "optimizer", reuse=variable_scope.AUTO_REUSE):
+      (ctr["train_op"], ctr["lr"], ctr["grad_norm"],
+       ctr["grad_norms"]) = self._get_train_ops(
+           ctr["loss"],
+           tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES),
+           self.global_step,
+           grad_bound=self.hparams.grad_bound,
+           lr_init=self.hparams.lr,
+           lr_dec=self.hparams.lr_dec,
+           start_decay_step=self.hparams.start_decay_step,
+           decay_steps=self.hparams.decay_steps,
+           optimizer_type=self.hparams.optimizer_type)
+
+    summary.scalar("gradnorm", ctr["grad_norm"])
+    summary.scalar("lr", ctr["lr"])
+    ctr["summary"] = summary.merge_all()
+    ops["controller"] = ctr
+
+    self.ops = ops
+    return ops
+
+  @property
+  def global_step(self):
+    return self._global_step
+
+  def create_op_embeddings(self, verbose=False):
+    if verbose:
+      print("process input graph for op embeddings")
+    self.num_ops = len(self.important_ops)
+    # topological sort of important nodes
+    topo_order = [op.name for op in self.important_ops]
+
+    # create index to name for topologicaly sorted important nodes
+    name_to_topo_order_index = {}
+    for idx, x in enumerate(topo_order):
+      name_to_topo_order_index[x] = idx
+    self.name_to_topo_order_index = name_to_topo_order_index
+
+    # create adj matrix
+    adj_dict = {}
+    for idx, op in enumerate(self.important_ops):
+      for output_op in self.get_node_fanout(op):
+        output_op_name = output_op.name
+        if output_op_name in self.important_op_names:
+          if name_to_topo_order_index[op.name] not in adj_dict:
+            adj_dict[name_to_topo_order_index[op.name]] = []
+          adj_dict[name_to_topo_order_index[op.name]].extend(
+              [name_to_topo_order_index[output_op_name], 1])
+          if output_op_name not in adj_dict:
+            adj_dict[name_to_topo_order_index[output_op_name]] = []
+          adj_dict[name_to_topo_order_index[output_op_name]].extend(
+              [name_to_topo_order_index[op.name], -1])
+
+    # get op_type op_output_shape, and adj info
+    output_embed_dim = (self.hparams.max_num_outputs *
+                        self.hparams.max_output_size)
+
+    # TODO(bsteiner): don't filter based on used ops so that we can generalize
+    # to models that use other types of ops.
+    used_ops = set()
+    for node in self.important_ops:
+      op_type = str(node.op)
+      used_ops.add(op_type)
+
+    self.type_dict = {}
+    for op_type in self.cluster.ListAvailableOps():
+      if op_type in used_ops:
+        self.type_dict[op_type] = len(self.type_dict)
+
+    op_types = np.zeros([self.num_ops], dtype=np.int32)
+    op_output_shapes = np.full(
+        [self.num_ops, output_embed_dim], -1.0, dtype=np.float32)
+    for idx, node in enumerate(self.important_ops):
+      op_types[idx] = self.type_dict[node.op]
+      # output shape
+      op_name = node.name
+      for i, output_prop in enumerate(self.node_properties[op_name]):
+        if output_prop.shape.__str__() == "<unknown>":
+          continue
+        shape = output_prop.shape
+        for j, dim in enumerate(shape.dim):
+          if dim.size >= 0:
+            if i * self.hparams.max_output_size + j >= output_embed_dim:
+              break
+            op_output_shapes[idx,
+                             i * self.hparams.max_output_size + j] = dim.size
+    # adj for padding
+    op_adj = np.full(
+        [self.num_ops, self.hparams.adj_embed_dim], 0, dtype=np.float32)
+    for idx in adj_dict:
+      neighbors = adj_dict[int(idx)]
+      min_dim = min(self.hparams.adj_embed_dim, len(neighbors))
+      padding_size = self.hparams.adj_embed_dim - min_dim
+      neighbors = neighbors[:min_dim] + [0] * padding_size
+      op_adj[int(idx)] = neighbors
+
+    # op_embedding   starts here
+    op_embeddings = np.zeros(
+        [
+            self.num_ops,
+            1 + self.hparams.max_num_outputs * self.hparams.max_output_size +
+            self.hparams.adj_embed_dim
+        ],
+        dtype=np.float32)
+    for idx, op_name in enumerate(topo_order):
+      op_embeddings[idx] = np.concatenate(
+          (np.array([op_types[idx]]), op_output_shapes[idx], op_adj[int(idx)]))
+    self.op_embeddings = constant_op.constant(
+        op_embeddings, dtype=dtypes.float32)
+    if verbose:
+      print("num_ops = {}".format(self.num_ops))
+      print("num_types = {}".format(len(self.type_dict)))
+
+  def get_groupings(self, *args, **kwargs):
+    num_children = self.hparams.num_children
+    with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
+      grouping_actions_cache = variable_scope.get_local_variable(
+          "grouping_actions_cache",
+          initializer=init_ops.zeros_initializer,
+          dtype=dtypes.int32,
+          shape=[num_children, self.num_ops],
+          trainable=False)
+    input_layer = self.op_embeddings
+    input_layer = array_ops.expand_dims(input_layer, 0)
+    feed_ff_input_layer = array_ops.tile(input_layer, [num_children, 1, 1])
+    grouping_actions, grouping_log_probs = {}, {}
+    grouping_actions["sample"], grouping_log_probs[
+        "sample"] = self.make_grouping_predictions(feed_ff_input_layer)
+
+    grouping_actions["sample"] = state_ops.assign(grouping_actions_cache,
+                                                  grouping_actions["sample"])
+    self.grouping_actions_cache = grouping_actions_cache
+
+    return grouping_actions, grouping_log_probs
+
+  def make_grouping_predictions(self, input_layer, reuse=None):
+    """model that predicts grouping (grouping_actions).
+
+    Args:
+      input_layer: group_input_layer
+      reuse: reuse
+
+    Returns:
+       grouping_actions: actions
+       grouping_log_probs: log probabilities corresponding to actions
+    """
+    with variable_scope.variable_scope(self.hparams.name, reuse=True):
+      # input_layer: tensor of size [1, num_ops, hidden_size]
+      w_grouping_ff = variable_scope.get_variable("w_grouping_ff")
+      w_grouping_softmax = variable_scope.get_variable("w_grouping_softmax")
+
+    batch_size = array_ops.shape(input_layer)[0]
+    embedding_dim = array_ops.shape(input_layer)[2]
+
+    reshaped = array_ops.reshape(input_layer,
+                                 [batch_size * self.num_ops, embedding_dim])
+    ff_output = math_ops.matmul(reshaped, w_grouping_ff)
+    logits = math_ops.matmul(ff_output, w_grouping_softmax)
+    if self.hparams.logits_std_noise > 0:
+      num_in_logits = math_ops.cast(
+          array_ops.size(logits), dtype=dtypes.float32)
+      avg_norm = math_ops.divide(
+          linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
+      logits_noise = random_ops.random_normal(
+          array_ops.shape(logits),
+          stddev=self.hparams.logits_std_noise * avg_norm)
+      logits = control_flow_ops.cond(
+          self.global_step > self.hparams.stop_noise_step, lambda: logits,
+          lambda: logits + logits_noise)
+    logits = array_ops.reshape(logits,
+                               [batch_size * self.num_ops, self.num_groups])
+    actions = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
+    actions = math_ops.to_int32(actions)
+    actions = array_ops.reshape(actions, [batch_size, self.num_ops])
+    action_label = array_ops.reshape(actions, [-1])
+    log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
+        logits=logits, labels=action_label)
+    log_probs = array_ops.reshape(log_probs, [batch_size, -1])
+    log_probs = math_ops.reduce_sum(log_probs, 1)
+    grouping_actions = actions
+    grouping_log_probs = log_probs
+    return grouping_actions, grouping_log_probs
+
+  def create_group_embeddings(self, grouping_actions, verbose=False):
+    """Approximating the blocks of a TF graph from a graph_def.
+
+    Args:
+      grouping_actions: grouping predictions.
+      verbose: print stuffs.
+
+    Returns:
+      groups: list of groups.
+    """
+    groups = [
+        self._create_group_embeddings(grouping_actions, i, verbose) for
+        i in range(self.hparams.num_children)
+    ]
+    return np.stack(groups, axis=0)
+
+  def _create_group_embeddings(self, grouping_actions, child_id, verbose=False):
+    """Approximating the blocks of a TF graph from a graph_def for each child.
+
+    Args:
+      grouping_actions: grouping predictions.
+      child_id: child_id for the group.
+      verbose: print stuffs.
+
+    Returns:
+      groups: group embedding for the child_id.
+    """
+    if verbose:
+      print("Processing input_graph")
+
+    # TODO(azalia): Build inter-adjacencies dag matrix.
+    # record dag_matrix
+    dag_matrix = np.zeros([self.num_groups, self.num_groups], dtype=np.float32)
+    for op in self.important_ops:
+      topo_op_index = self.name_to_topo_order_index[op.name]
+      group_index = grouping_actions[child_id][topo_op_index]
+      for output_op in self.get_node_fanout(op):
+        if output_op.name not in self.important_op_names:
+          continue
+        output_group_index = (
+            grouping_actions[child_id][self.name_to_topo_order_index[
+                output_op.name]])
+        dag_matrix[group_index, output_group_index] += 1.0
+    num_connections = np.sum(dag_matrix)
+    num_intra_group_connections = dag_matrix.trace()
+    num_inter_group_connections = num_connections - num_intra_group_connections
+    if verbose:
+      print("grouping evaluation metric")
+      print(("num_connections={} num_intra_group_connections={} "
+             "num_inter_group_connections={}").format(
+                 num_connections, num_intra_group_connections,
+                 num_inter_group_connections))
+    self.dag_matrix = dag_matrix
+
+    # output_shape
+    op_output_shapes = np.zeros(
+        [
+            len(self.important_ops),
+            self.hparams.max_num_outputs * self.hparams.max_output_size
+        ],
+        dtype=np.float32)
+
+    for idx, op in enumerate(self.important_ops):
+      for i, output_properties in enumerate(self.node_properties[op.name]):
+        if output_properties.shape.__str__() == "<unknown>":
+          continue
+        if i > self.hparams.max_num_outputs:
+          break
+        shape = output_properties.shape
+        for j, dim in enumerate(shape.dim):
+          if dim.size > 0:
+            k = i * self.hparams.max_output_size + j
+            if k >= self.hparams.max_num_outputs * self.hparams.max_output_size:
+              break
+            op_output_shapes[idx, k] = dim.size
+
+    # group_embedding
+    group_embedding = np.zeros(
+        [
+            self.num_groups, len(self.type_dict) +
+            self.hparams.max_num_outputs * self.hparams.max_output_size
+        ],
+        dtype=np.float32)
+    for op_index, op in enumerate(self.important_ops):
+      group_index = grouping_actions[child_id][
+          self.name_to_topo_order_index[op.name]]
+      type_name = str(op.op)
+      type_index = self.type_dict[type_name]
+      group_embedding[group_index, type_index] += 1
+      group_embedding[group_index, :self.hparams.max_num_outputs * self.hparams.
+                      max_output_size] += (
+                          op_output_shapes[op_index])
+    grouping_adjacencies = np.concatenate(
+        [dag_matrix, np.transpose(dag_matrix)], axis=1)
+    group_embedding = np.concatenate(
+        [grouping_adjacencies, group_embedding], axis=1)
+    group_normalizer = np.amax(group_embedding, axis=1, keepdims=True)
+    group_embedding /= (group_normalizer + 1.0)
+    if verbose:
+      print("Finished Processing Input Graph")
+    return group_embedding
+
+  def get_placements(self, *args, **kwargs):
+    num_children = self.hparams.num_children
+    with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
+      actions_cache = variable_scope.get_local_variable(
+          "actions_cache",
+          initializer=init_ops.zeros_initializer,
+          dtype=dtypes.int32,
+          shape=[num_children, self.num_groups],
+          trainable=False)
+
+    x = self.seq2seq_input_layer
+    last_c, last_h, attn_mem = self.encode(x)
+    actions, log_probs = {}, {}
+    actions["sample"], log_probs["sample"] = (
+        self.decode(
+            x, last_c, last_h, attn_mem, mode="sample"))
+    actions["target"], log_probs["target"] = (
+        self.decode(
+            x,
+            last_c,
+            last_h,
+            attn_mem,
+            mode="target",
+            y=actions_cache))
+    actions["greedy"], log_probs["greedy"] = (
+        self.decode(
+            x, last_c, last_h, attn_mem, mode="greedy"))
+    actions["sample"] = control_flow_ops.cond(
+        self.global_step < self.hparams.stop_sampling,
+        lambda: state_ops.assign(actions_cache, actions["sample"]),
+        lambda: state_ops.assign(actions_cache, actions["target"]))
+    self.actions_cache = actions_cache
+
+    return actions, log_probs
+
+  def encode(self, x):
+    """Encoder using LSTM.
+
+    Args:
+      x: tensor of size [num_children, num_groups, embedding_size]
+
+    Returns:
+      last_c, last_h: tensors of size [num_children, hidden_size], the final
+        LSTM states
+      attn_mem: tensor of size [num_children, num_groups, hidden_size], the
+      attention
+        memory, i.e. concatenation of all hidden states, linearly transformed by
+        an attention matrix attn_w_1
+    """
+    if self.hparams.bi_lstm:
+      with variable_scope.variable_scope(self.hparams.name, reuse=True):
+        w_lstm_forward = variable_scope.get_variable("encoder_lstm_forward")
+        w_lstm_backward = variable_scope.get_variable("encoder_lstm_backward")
+        forget_bias = variable_scope.get_variable("encoder_forget_bias")
+        attn_w_1 = variable_scope.get_variable("attn_w_1")
+    else:
+      with variable_scope.variable_scope(self.hparams.name, reuse=True):
+        w_lstm = variable_scope.get_variable("encoder_lstm")
+        forget_bias = variable_scope.get_variable("encoder_forget_bias")
+        attn_w_1 = variable_scope.get_variable("attn_w_1")
+
+    embedding_size = array_ops.shape(x)[2]
+
+    signals = array_ops.split(x, self.num_groups, axis=1)
+    for i in range(len(signals)):
+      signals[i] = array_ops.reshape(
+          signals[i], [self.hparams.num_children, embedding_size])
+
+    if self.hparams.bi_lstm:
+
+      def body(i, prev_c_forward, prev_h_forward, prev_c_backward,
+               prev_h_backward):
+        """while loop for LSTM."""
+        signal_forward = signals[i]
+        next_c_forward, next_h_forward = lstm(signal_forward, prev_c_forward,
+                                              prev_h_forward, w_lstm_forward,
+                                              forget_bias)
+
+        signal_backward = signals[self.num_groups - 1 - i]
+        next_c_backward, next_h_backward = lstm(
+            signal_backward, prev_c_backward, prev_h_backward, w_lstm_backward,
+            forget_bias)
+
+        next_h = array_ops.concat([next_h_forward, next_h_backward], axis=1)
+        all_h.append(next_h)
+
+        return (next_c_forward, next_h_forward, next_c_backward,
+                next_h_backward)
+
+      c_forward = array_ops.zeros(
+          [self.hparams.num_children, self.hparams.hidden_size / 2],
+          dtype=dtypes.float32)
+      h_forward = array_ops.zeros(
+          [self.hparams.num_children, self.hparams.hidden_size / 2],
+          dtype=dtypes.float32)
+
+      c_backward = array_ops.zeros(
+          [self.hparams.num_children, self.hparams.hidden_size / 2],
+          dtype=dtypes.float32)
+      h_backward = array_ops.zeros(
+          [self.hparams.num_children, self.hparams.hidden_size / 2],
+          dtype=dtypes.float32)
+      all_h = []
+
+      for i in range(0, self.num_groups):
+        c_forward, h_forward, c_backward, h_backward = body(
+            i, c_forward, h_forward, c_backward, h_backward)
+
+      last_c = array_ops.concat([c_forward, c_backward], axis=1)
+      last_h = array_ops.concat([h_forward, h_backward], axis=1)
+      attn_mem = array_ops.stack(all_h)
+
+    else:
+
+      def body(i, prev_c, prev_h):
+        signal = signals[i]
+        next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
+        all_h.append(next_h)
+        return next_c, next_h
+
+      c = array_ops.zeros(
+          [self.hparams.num_children, self.hparams.hidden_size],
+          dtype=dtypes.float32)
+      h = array_ops.zeros(
+          [self.hparams.num_children, self.hparams.hidden_size],
+          dtype=dtypes.float32)
+      all_h = []
+
+      for i in range(0, self.num_groups):
+        c, h = body(i, c, h)
+
+      last_c = c
+      last_h = h
+      attn_mem = array_ops.stack(all_h)
+
+    attn_mem = array_ops.transpose(attn_mem, [1, 0, 2])
+    attn_mem = array_ops.reshape(
+        attn_mem,
+        [self.hparams.num_children * self.num_groups, self.hparams.hidden_size])
+    attn_mem = math_ops.matmul(attn_mem, attn_w_1)
+    attn_mem = array_ops.reshape(
+        attn_mem,
+        [self.hparams.num_children, self.num_groups, self.hparams.hidden_size])
+
+    return last_c, last_h, attn_mem
+
+  def decode(self,
+             x,
+             last_c,
+             last_h,
+             attn_mem,
+             mode="target",
+             y=None):
+    """Decoder using LSTM.
+
+    Args:
+      x: tensor of size [num_children, num_groups, embedding_size].
+      last_c: tensor of size [num_children, hidden_size], the final LSTM states
+          computed by self.encoder.
+      last_h: same as last_c.
+      attn_mem: tensor of size [num_children, num_groups, hidden_size].
+      mode: "target" or "sample".
+      y: tensor of size [num_children, num_groups], the device placements.
+
+    Returns:
+      actions: tensor of size [num_children, num_groups], the placements of
+          devices
+    """
+    with variable_scope.variable_scope(self.hparams.name, reuse=True):
+      w_lstm = variable_scope.get_variable("decoder_lstm")
+      forget_bias = variable_scope.get_variable("decoder_forget_bias")
+      device_embeddings = variable_scope.get_variable("device_embeddings")
+      device_softmax = variable_scope.get_variable("device_softmax")
+      device_go_embedding = variable_scope.get_variable("device_go_embedding")
+      attn_w_2 = variable_scope.get_variable("attn_w_2")
+      attn_v = variable_scope.get_variable("attn_v")
+
+    actions = tensor_array_ops.TensorArray(
+        dtypes.int32,
+        size=self.num_groups,
+        infer_shape=False,
+        clear_after_read=False)
+
+    # pylint: disable=unused-argument
+    def condition(i, *args):
+      return math_ops.less(i, self.num_groups)
+
+    # pylint: disable=missing-docstring
+    def body(i, prev_c, prev_h, actions, log_probs):
+      # pylint: disable=g-long-lambda
+      signal = control_flow_ops.cond(
+          math_ops.equal(i, 0),
+          lambda: array_ops.tile(device_go_embedding,
+                                 [self.hparams.num_children, 1]),
+          lambda: embedding_ops.embedding_lookup(device_embeddings,
+                                                 actions.read(i - 1))
+      )
+      if self.hparams.keep_prob is not None:
+        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
+      next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
+      query = math_ops.matmul(next_h, attn_w_2)
+      query = array_ops.reshape(
+          query, [self.hparams.num_children, 1, self.hparams.hidden_size])
+      query = math_ops.tanh(query + attn_mem)
+      query = array_ops.reshape(query, [
+          self.hparams.num_children * self.num_groups, self.hparams.hidden_size
+      ])
+      query = math_ops.matmul(query, attn_v)
+      query = array_ops.reshape(query,
+                                [self.hparams.num_children, self.num_groups])
+      query = nn_ops.softmax(query)
+      query = array_ops.reshape(query,
+                                [self.hparams.num_children, self.num_groups, 1])
+      query = math_ops.reduce_sum(attn_mem * query, axis=1)
+      query = array_ops.concat([next_h, query], axis=1)
+      logits = math_ops.matmul(query, device_softmax)
+      logits /= self.hparams.temperature
+      if self.hparams.tanh_constant > 0:
+        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
+      if self.hparams.logits_std_noise > 0:
+        num_in_logits = math_ops.cast(
+            array_ops.size(logits), dtype=dtypes.float32)
+        avg_norm = math_ops.divide(
+            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
+        logits_noise = random_ops.random_normal(
+            array_ops.shape(logits),
+            stddev=self.hparams.logits_std_noise * avg_norm)
+        logits = control_flow_ops.cond(
+            self.global_step > self.hparams.stop_noise_step, lambda: logits,
+            lambda: logits + logits_noise)
+
+      if mode == "sample":
+        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
+      elif mode == "greedy":
+        next_y = math_ops.argmax(logits, 1)
+      elif mode == "target":
+        next_y = array_ops.slice(y, [0, i], [-1, 1])
+      else:
+        raise NotImplementedError
+      next_y = math_ops.to_int32(next_y)
+      next_y = array_ops.reshape(next_y, [self.hparams.num_children])
+      actions = actions.write(i, next_y)
+      log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=next_y)
+      return i + 1, next_c, next_h, actions, log_probs
+
+    loop_vars = [
+        constant_op.constant(0, dtype=dtypes.int32), last_c, last_h, actions,
+        array_ops.zeros([self.hparams.num_children], dtype=dtypes.float32)
+    ]
+    loop_outputs = control_flow_ops.while_loop(condition, body, loop_vars)
+
+    last_c = loop_outputs[-4]
+    last_h = loop_outputs[-3]
+    actions = loop_outputs[-2].stack()
+    actions = array_ops.transpose(actions, [1, 0])
+    log_probs = loop_outputs[-1]
+    return actions, log_probs
+
+  def eval_placement(self,
+                     sess,
+                     child_id=0,
+                     verbose=False):
+    grouping_actions, actions = sess.run([
+        self.grouping_actions_cache,
+        self.actions_cache
+    ])
+    grouping_actions = grouping_actions[child_id]
+    actions = actions[child_id]
+    if verbose:
+      global_step = sess.run(self.global_step)
+      if global_step % 100 == 0:
+        log_string = "op group assignments: "
+        for a in grouping_actions:
+          log_string += "{} ".format(a)
+        print(log_string[:-1])
+        log_string = "group device assignments: "
+        for a in actions:
+          log_string += "{} ".format(a)
+        print(log_string[:-1])
+
+    for op in self.important_ops:
+      topo_order_index = self.name_to_topo_order_index[op.name]
+      group_index = grouping_actions[topo_order_index]
+      op.device = self.devices[actions[group_index]].name
+    try:
+      _, run_time, _ = self.cluster.MeasureCosts(self.item)
+    except errors.ResourceExhaustedError:
+      run_time = self.hparams.failing_signal
+    return run_time
+
+  def update_reward(self,
+                    sess,
+                    run_time,
+                    child_id=0,
+                    verbose=False):
+    reward = self.compute_reward(run_time)
+    controller_ops = self.ops["controller"]
+    _, best_reward = sess.run(
+        [
+            controller_ops["reward"]["update"][child_id],
+            controller_ops["best_reward"]["update"][child_id]
+        ],
+        feed_dict={
+            controller_ops["reward"]["ph"][child_id]: reward,
+        })
+    if verbose:
+      print(("run_time={:<.5f} reward={:<.5f} "
+             "best_reward={:<.5f}").format(run_time, reward, best_reward))
+
+    # Reward is a double, best_reward a float: allow for some slack in the
+    # comparison.
+    updated = abs(best_reward - reward) < 1e-6
+    return updated
+
+  def generate_grouping(self, sess):
+    controller_ops = self.ops["controller"]
+    grouping_actions = sess.run(controller_ops["grouping_y_preds"]["sample"])
+    return grouping_actions
+
+  def generate_placement(self, grouping, sess):
+    controller_ops = self.ops["controller"]
+    feed_seq2seq_input_dict = {}
+    feed_seq2seq_input_dict[self.seq2seq_input_layer] = grouping
+    sess.run(
+        controller_ops["y_preds"]["sample"], feed_dict=feed_seq2seq_input_dict)
+
+  def process_reward(self, sess):
+    controller_ops = self.ops["controller"]
+    run_ops = [
+        controller_ops["loss"], controller_ops["lr"],
+        controller_ops["grad_norm"], controller_ops["grad_norms"],
+        controller_ops["train_op"]
+    ]
+    sess.run(run_ops)
+    sess.run(controller_ops["baseline_update"])
+
+  def _get_train_ops(self,
+                     loss,
+                     tf_variables,
+                     global_step,
+                     grad_bound=1.25,
+                     lr_init=1e-3,
+                     lr_dec=0.9,
+                     start_decay_step=10000,
+                     decay_steps=100,
+                     optimizer_type="adam"):
+    """Loss optimizer.
+
+    Args:
+      loss: scalar tf tensor
+      tf_variables: list of training variables, typically
+        tf.trainable_variables()
+      global_step: global_step
+      grad_bound: max gradient norm
+      lr_init: initial learning rate
+      lr_dec: leaning rate decay coefficient
+      start_decay_step: start decaying learning rate after this many steps
+      decay_steps: apply decay rate factor at this step intervals
+      optimizer_type: optimizer type should be either adam or sgd
+
+    Returns:
+      train_op: training op
+      learning_rate: scalar learning rate tensor
+      grad_norm: l2 norm of the gradient vector
+      all_grad_norms: l2 norm of each component
+    """
+    lr_gstep = global_step - start_decay_step
+
+    def f1():
+      return constant_op.constant(lr_init)
+
+    def f2():
+      return learning_rate_decay.exponential_decay(lr_init, lr_gstep,
+                                                   decay_steps, lr_dec, True)
+
+    learning_rate = control_flow_ops.cond(
+        math_ops.less(global_step, start_decay_step),
+        f1,
+        f2,
+        name="learning_rate")
+
+    if optimizer_type == "adam":
+      opt = adam.AdamOptimizer(learning_rate)
+    elif optimizer_type == "sgd":
+      opt = gradient_descent.GradientDescentOptimizer(learning_rate)
+    grads_and_vars = opt.compute_gradients(loss, tf_variables)
+    grad_norm = clip_ops.global_norm([g for g, v in grads_and_vars])
+    all_grad_norms = {}
+    clipped_grads = []
+    clipped_rate = math_ops.maximum(grad_norm / grad_bound, 1.0)
+    for g, v in grads_and_vars:
+      if g is not None:
+        if isinstance(g, tf_ops.IndexedSlices):
+          clipped = g.values / clipped_rate
+          norm_square = math_ops.reduce_sum(clipped * clipped)
+          clipped = tf_ops.IndexedSlices(clipped, g.indices)
+        else:
+          clipped = g / clipped_rate
+          norm_square = math_ops.reduce_sum(clipped * clipped)
+        all_grad_norms[v.name] = math_ops.sqrt(norm_square)
+        clipped_grads.append((clipped, v))
+
+    train_op = opt.apply_gradients(clipped_grads, global_step)
+    return train_op, learning_rate, grad_norm, all_grad_norms
+
+
+def lstm(x, prev_c, prev_h, w_lstm, forget_bias):
+  """LSTM cell.
+
+  Args:
+    x: tensors of size [num_children, hidden_size].
+    prev_c: tensors of size [num_children, hidden_size].
+    prev_h: same as prev_c.
+    w_lstm: .
+    forget_bias: .
+
+  Returns:
+    next_c:
+    next_h:
+  """
+  ifog = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w_lstm)
+  i, f, o, g = array_ops.split(ifog, 4, axis=1)
+  i = math_ops.sigmoid(i)
+  f = math_ops.sigmoid(f + forget_bias)
+  o = math_ops.sigmoid(o)
+  g = math_ops.tanh(g)
+  next_c = i * g + f * prev_c
+  next_h = o * math_ops.tanh(next_c)
+  return next_c, next_h
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index d0fc1a04f220e0a053257e0206bb07b25f3767c6..593d38206d127978f1982a0f2cc22e17daee1a3d 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -83,7 +83,6 @@ static GItem TF_NewItem(
   tensorflow::grappler::ItemConfig cfg;
   cfg.ignore_user_placement = ignore_user_placement;
   cfg.ignore_colocation = ignore_colocation;
-  cfg.inline_functions = true;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("item", meta_graph, cfg);
   if (!item) {
@@ -96,10 +95,10 @@ static GItem TF_NewItem(
   return GItem(item.release());
 }
 
-static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topologically,
+static PyObject* TF_IdentifyImportantOps(GItem item, bool sort_topologically,
                                                    TF_Status* status) {
   if (item.is_none()) {
-    return {};
+    Py_RETURN_NONE;
   }
 
   std::vector<const tensorflow::NodeDef*> main_ops = item->MainOpsFanin();
@@ -132,7 +131,13 @@ static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topolog
     }
   }
 
-  return ops;
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* result = PyList_New(ops.size());
+  for (int i = 0; i < ops.size(); ++i) {
+    PyList_SetItem(result, i, PyString_FromString(ops[i].c_str()));
+  }
+  PyGILState_Release(gstate);
+  return result;
 }
 
 static PyObject* TF_GetOpProperties(GItem item) {
@@ -305,7 +310,7 @@ static PyObject* TF_GetColocationGroups(GItem item) {
 static GItem TF_NewItem(
     const tensorflow::MetaGraphDef& meta_graph, bool ignore_colocation,
     bool ignore_user_placement, TF_Status* out_status);
-static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topologically,
-                                                   TF_Status* status);
+static PyObject* TF_IdentifyImportantOps(GItem item, bool sort_topologically,
+                                         TF_Status* status);
 static PyObject* TF_GetOpProperties(GItem item);
 static PyObject* TF_GetColocationGroups(GItem item);
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index 4a083849bd39f606877069419396d8c42ef077eb..1748efdd130268f2668cd8cb1b5c2da18bafd549 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -51,9 +51,7 @@ class Item(object):
     self._BuildTFItem()
 
   def IdentifyImportantOps(self, sort_topologically=False):
-    with errors.raise_exception_on_not_ok_status() as status:
-      return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically,
-                                             status)
+    return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically)
 
   def GetOpProperties(self):
     ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index cd70e2fdecc74f9d99240ac566f3c28e900a06c2..c40de9da0abca3bb99a82a1456261f45b1c45c99 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -56,7 +56,7 @@ class ItemTest(test.TestCase):
       mg = meta_graph.create_meta_graph_def(graph=g)
       grappler_item = item.Item(mg)
       op_list = grappler_item.IdentifyImportantOps()
-      self.assertItemsEqual([b'Const', b'Const_1', b'add'], op_list)
+      self.assertItemsEqual(['Const', 'Const_1', 'add'], op_list)
 
   def testOpProperties(self):
     with ops.Graph().as_default() as g:
@@ -111,7 +111,7 @@ class ItemTest(test.TestCase):
     with ops.Graph().as_default() as g:
       c = constant_op.constant([10])
       v = variables.Variable([3], dtype=dtypes.int32)
-      i = gen_array_ops._ref_identity(v)
+      i = gen_array_ops.ref_identity(v)
       a = state_ops.assign(i, c)
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(a)
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 0f5150174049250e86bbac0a49eb998339058326..e3dd4b0bdfbb28480c78b0add5947c79852fb44d 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -321,7 +321,7 @@ class LayoutOptimizerTest(test.TestCase):
       conv = _two_layer_model(x)
       dim = array_ops.placeholder(dtype='int32')
       sizes = constant_op.constant([50, 10, 4], shape=[3])
-      split = gen_array_ops._split_v(
+      split = gen_array_ops.split_v(
           value=conv, size_splits=sizes, axis=dim, num_split=3)
       output = math_ops.reduce_sum(split[0])
 
@@ -476,7 +476,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keepdims=True)
       squeeze = array_ops.squeeze(reduce_sum, axis=[1, 2])
       output = array_ops.identity(squeeze)
 
@@ -506,7 +506,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keepdims=True)
       squeeze = array_ops.squeeze(reduce_sum, axis=[0, 1, 2])
       output = array_ops.identity(squeeze)
 
@@ -623,7 +623,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -653,7 +653,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[2], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -682,7 +682,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -896,7 +896,7 @@ class LayoutOptimizerTest(test.TestCase):
       add = math_ops.add(conv, conv)
       mean = math_ops.reduce_mean(conv)
       condition = math_ops.less(conv, mean)
-      select = gen_math_ops._select(condition, conv, add)
+      select = gen_math_ops.select(condition, conv, add)
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -926,7 +926,7 @@ class LayoutOptimizerTest(test.TestCase):
       conv = _two_layer_model(x)
       add = math_ops.add(conv, conv)
       condition = array_ops.placeholder(dtype='bool')
-      select = gen_math_ops._select(condition, conv, add)
+      select = gen_math_ops.select(condition, conv, add)
       output = array_ops.identity(select)
 
       condition_val = np.zeros((1, 7, 7, 64))
@@ -957,7 +957,7 @@ class LayoutOptimizerTest(test.TestCase):
       conv = _two_layer_model(x)
       add = math_ops.add(conv, conv)
       condition = constant_op.constant(True)
-      select = gen_math_ops._select(condition, conv, add)
+      select = gen_math_ops.select(condition, conv, add)
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -1023,7 +1023,7 @@ class LayoutOptimizerTest(test.TestCase):
       conv = _two_layer_model(x)
       ksize = constant_op.constant([1, 2, 3, 1], shape=[4])
       strides = array_ops.placeholder(dtype='int32', shape=[4])
-      max_pool = gen_nn_ops._max_pool_v2(conv, ksize, strides, 'VALID')
+      max_pool = gen_nn_ops.max_pool_v2(conv, ksize, strides, 'VALID')
       output = array_ops.identity(max_pool)
 
       strides_val = [1, 3, 2, 1]
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 948911f099674af4c6dd19bfdac75e5fc1f75c78..3f9d8864a2b4b750a33d14161fd18d764f68d7bf 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -76,6 +76,7 @@ class MemoryOptimizerSwapTest(test.TestCase):
 
     rewriter_config = rewriter_config_pb2.RewriterConfig(
         disable_model_pruning=True,
+        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
         constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
         memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
     graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
@@ -162,7 +163,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             memory_optimization=rewriter_config_pb2.RewriterConfig.
             RECOMPUTATION_HEURISTICS,
-            memory_optimizer_target_node_name_prefix='optimizer/gradients/'),
+            # Checks that name scope "gradients/" also match sub-scope.
+            memory_optimizer_target_node_name_scope='gradients/'),
         original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
@@ -176,6 +178,35 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         len([node for node in rewritten_graph_def.node
              if 'Recomputed/' in node.name]))
 
+  def testRewritingNameScopedGradientNamesScope(self):
+    """Tests that rewriting occurs with non-standard gradient names."""
+    (original_metagraph, _, _,
+     _) = self._GetMetaGraph(optimizer_scope_name='foo/bar')
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
+            arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.
+            RECOMPUTATION_HEURISTICS,
+            # This should not match anything.
+            memory_optimizer_target_node_name_scope='r/gradients/'),
+        original_metagraph)
+    self.assertEqual(
+        len(rewritten_graph_def.node), len(original_metagraph.graph_def.node))
+    self.assertEqual(0,
+                     len([
+                         node for node in original_metagraph.graph_def.node
+                         if 'Recomputed/' in node.name
+                     ]))
+    self.assertEqual(0,
+                     len([
+                         node for node in rewritten_graph_def.node
+                         if 'Recomputed/' in node.name
+                     ]))
+
   def _GetMemoryOptimizerSessionConfig(self):
     rewrite_options = rewriter_config_pb2.RewriterConfig(
         disable_model_pruning=True,
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
index d23eb811ac2b0a6a8802979b4d966b5617c8a8d9..5a76cdd8fb29361cd800dea60cb9ebc0e39f6487 100644
--- a/tensorflow/python/grappler/model_analyzer.cc
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -26,9 +26,10 @@ namespace grappler {
 
 ModelAnalyzer::ModelAnalyzer(const GrapplerItem& item) : item_(item) {}
 
-Status ModelAnalyzer::GenerateReport(bool debug, std::ostream& os) {
+Status ModelAnalyzer::GenerateReport(bool debug, bool assume_valid_feeds,
+                                     std::ostream& os) {
   GraphProperties properties(item_);
-  TF_RETURN_IF_ERROR(properties.InferStatically(false));
+  TF_RETURN_IF_ERROR(properties.InferStatically(assume_valid_feeds));
 
   for (const auto& node : item_.MainOpsFanin()) {
     PrintNodeInfo(node, properties, debug, os);
diff --git a/tensorflow/python/grappler/model_analyzer.h b/tensorflow/python/grappler/model_analyzer.h
index 5bc551927d88db723e21b29903d6f5b941048139..97ffafabe1f785e3b2c3044143b8fb8006b59225 100644
--- a/tensorflow/python/grappler/model_analyzer.h
+++ b/tensorflow/python/grappler/model_analyzer.h
@@ -31,7 +31,7 @@ class GraphProperties;
 class ModelAnalyzer {
  public:
   explicit ModelAnalyzer(const GrapplerItem& item);
-  Status GenerateReport(bool debug, std::ostream& os);
+  Status GenerateReport(bool debug, bool assume_valid_feeds, std::ostream& os);
 
  private:
   void PrintNodeInfo(const NodeDef* node, const GraphProperties& properties,
diff --git a/tensorflow/python/grappler/model_analyzer.i b/tensorflow/python/grappler/model_analyzer.i
index 7c3a692d0efc501341ff1dff3cf24b8a4830ec84..4955780764be802b9e4be3598bf114b227757194 100644
--- a/tensorflow/python/grappler/model_analyzer.i
+++ b/tensorflow/python/grappler/model_analyzer.i
@@ -40,7 +40,8 @@ limitations under the License.
 %}
 
 %{
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug) {
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph,
+                           bool assume_valid_feeds, bool debug) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -53,10 +54,11 @@ string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug
   tensorflow::grappler::ModelAnalyzer analyzer(*item);
 
   std::stringstream os;
-  analyzer.GenerateReport(debug, os);
+  analyzer.GenerateReport(debug, assume_valid_feeds, os);
   return os.str();
 }
 
 %}
 
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug);
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph,
+                           bool assume_valid_feeds, bool debug);
diff --git a/tensorflow/python/grappler/model_analyzer.py b/tensorflow/python/grappler/model_analyzer.py
index 535889e1c4034952562a05e4d044fcafeddbc0ca..98cdc5785011dcebbaaf43704772b3de00c9d6ca 100644
--- a/tensorflow/python/grappler/model_analyzer.py
+++ b/tensorflow/python/grappler/model_analyzer.py
@@ -22,11 +22,12 @@ from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 
 
-def GenerateModelReport(metagraph, debug=False):
+def GenerateModelReport(metagraph, assume_valid_feeds=True, debug=False):
   """Report what's known statically about each node in the provided metagraph.
 
   Args:
     metagraph: A TensorFlow MetaGraphDef.
+    assume_valid_feeds: If True, assume that the shape of the fed nodes is valid
     debug: Add some information useful for debugging.
 
   Returns:
@@ -34,6 +35,6 @@ def GenerateModelReport(metagraph, debug=False):
   """
   with errors.raise_exception_on_not_ok_status():
     ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString(),
-                                                debug)
+                                                assume_valid_feeds, debug)
 
   return ret_from_swig
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index 1b657983a4690dd0ddb7f569ce514b08cb10400a..39ca71e99af06c19fb7fe5bf185c29106729f5e9 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -98,8 +98,8 @@ PyObject* TF_OptimizeGraph(
       const tensorflow::MetaGraphDef& metagraph,
       bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
-    item_config.inline_functions = false;
     item_config.apply_optimizations = false;
+    item_config.ignore_user_placement = false;
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
 
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index 55dcbe2071f74204e0bbdd141560f33cefdf174d..1c0f072dd32d38f048cfa48d38b45264951d095e 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,13 +17,18 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.grappler import item as gitem
 from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -48,6 +53,72 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertEqual(len(graph.node), 1)
     self.assertItemsEqual([node.name for node in graph.node], ['d'])
 
+  def testKeepNodes(self):
+    g = ops.Graph()
+    with g.as_default():
+      a1 = variables.Variable(
+          1.0)  # Must be preserved since it's in the collection 'variables'.
+      a2 = constant_op.constant(0, shape=[50, 50], name='keep')
+      ops.add_to_collection('a2', a2)  # Explicitly add to collection.
+      b = constant_op.constant(1, shape=[100, 10])
+      c = constant_op.constant(0, shape=[10, 30])
+      d = math_ops.matmul(b, c)
+      ops.add_to_collection('train_op', d)  # d is the fetch node.
+
+    # Optimize the graph.
+    mg = meta_graph.create_meta_graph_def(graph=g)
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+
+    # Check that the nodes referenced in various collections have been preserved
+    self.assertEqual(len(optimized_graph.node), 5)
+    self.assertEqual(d.op.name, optimized_graph.node[0].name)
+    self.assertEqual(a1.op.name, optimized_graph.node[1].name)
+    self.assertEqual('Variable/initial_value', optimized_graph.node[2].name)
+    self.assertEqual(a2.op.name, optimized_graph.node[3].name)
+    self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
+
+  def testLoops(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      def _Cond(_, counter):
+        return counter < end
+
+      def _Body(buf, counter):
+        buf = array_ops.concat([buf, [counter]], 0)
+        counter += 1
+        return [buf, counter]
+
+      start = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      end = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      init_buf = array_ops.zeros(shape=[0], dtype=dtypes.int32)
+      loop_vars = [init_buf, start]
+      shape_inv = [
+          tensor_shape.TensorShape([None]),
+          tensor_shape.TensorShape([])
+      ]
+      buf, _ = control_flow_ops.while_loop(_Cond, _Body, loop_vars, shape_inv)
+
+      f = -array_ops.ones_like(buf, optimize=False)
+      buf_shape = array_ops.shape(buf)
+      f_shape = array_ops.shape(f)
+      ops.add_to_collection('train_op', buf_shape)
+      ops.add_to_collection('train_op', f_shape)
+
+    # Optimize the graph.
+    mg = meta_graph.create_meta_graph_def(graph=g)
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    mg.graph_def.CopyFrom(optimized_graph)
+
+    # Check that the nodes referenced in various collections have been preserved
+    item = gitem.Item(mg)
+    props = item.GetOpProperties()
+    buf_prop = props[buf.op.name]
+    f_prop = props[f.op.name]
+    self.assertEqual(buf_prop, f_prop)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 1956478f39a6b5d4dea720436d9b87f66ca20426..1b66f589397527d62155c456b22f4c67ff470158 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -8,13 +8,18 @@ exports_files(["LICENSE"])
 package(default_visibility = ["//visibility:public"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+config_setting(
+    name = "empty_condition",
+    values = {"define": "UNUSED=unused"},
+)
 
 py_library(
     name = "keras",
     srcs = [
         "__init__.py",
         "_impl/keras/__init__.py",
-        "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
         "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
@@ -26,9 +31,6 @@ py_library(
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
         "_impl/keras/applications/xception.py",
-        "_impl/keras/backend.py",
-        "_impl/keras/callbacks.py",
-        "_impl/keras/constraints.py",
         "_impl/keras/datasets/__init__.py",
         "_impl/keras/datasets/boston_housing.py",
         "_impl/keras/datasets/cifar.py",
@@ -38,44 +40,15 @@ py_library(
         "_impl/keras/datasets/imdb.py",
         "_impl/keras/datasets/mnist.py",
         "_impl/keras/datasets/reuters.py",
-        "_impl/keras/engine/__init__.py",
-        "_impl/keras/engine/topology.py",
-        "_impl/keras/engine/training.py",
-        "_impl/keras/engine/training_eager.py",
         "_impl/keras/estimator.py",
-        "_impl/keras/initializers.py",
-        "_impl/keras/layers/__init__.py",
-        "_impl/keras/layers/advanced_activations.py",
-        "_impl/keras/layers/convolutional.py",
-        "_impl/keras/layers/convolutional_recurrent.py",
-        "_impl/keras/layers/core.py",
-        "_impl/keras/layers/embeddings.py",
-        "_impl/keras/layers/local.py",
-        "_impl/keras/layers/merge.py",
-        "_impl/keras/layers/noise.py",
-        "_impl/keras/layers/normalization.py",
-        "_impl/keras/layers/pooling.py",
-        "_impl/keras/layers/recurrent.py",
-        "_impl/keras/layers/serialization.py",
-        "_impl/keras/layers/wrappers.py",
-        "_impl/keras/losses.py",
-        "_impl/keras/metrics.py",
-        "_impl/keras/models.py",
-        "_impl/keras/optimizers.py",
         "_impl/keras/preprocessing/__init__.py",
         "_impl/keras/preprocessing/image.py",
         "_impl/keras/preprocessing/sequence.py",
         "_impl/keras/preprocessing/text.py",
-        "_impl/keras/regularizers.py",
         "_impl/keras/testing_utils.py",
         "_impl/keras/utils/__init__.py",
-        "_impl/keras/utils/conv_utils.py",
-        "_impl/keras/utils/data_utils.py",
-        "_impl/keras/utils/generic_utils.py",
-        "_impl/keras/utils/io_utils.py",
-        "_impl/keras/utils/layer_utils.py",
+        "_impl/keras/utils/multi_gpu_utils.py",
         "_impl/keras/utils/np_utils.py",
-        "_impl/keras/utils/training_utils.py",
         "_impl/keras/utils/vis_utils.py",
         "_impl/keras/wrappers/__init__.py",
         "_impl/keras/wrappers/scikit_learn.py",
@@ -119,6 +92,24 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
+        ":backend",
+        ":engine",
+        ":layers",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "backend",
+    srcs = ["_impl/keras/backend.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -135,8 +126,6 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
@@ -151,17 +140,91 @@ py_library(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_library(
+    name = "engine",
+    srcs = [
+        "_impl/keras/activations.py",
+        "_impl/keras/callbacks.py",
+        "_impl/keras/constraints.py",
+        "_impl/keras/engine/__init__.py",
+        "_impl/keras/engine/base_layer.py",
+        "_impl/keras/engine/input_layer.py",
+        "_impl/keras/engine/network.py",
+        "_impl/keras/engine/saving.py",
+        "_impl/keras/engine/sequential.py",
+        "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_arrays.py",
+        "_impl/keras/engine/training_eager.py",
+        "_impl/keras/engine/training_generator.py",
+        "_impl/keras/engine/training_utils.py",
+        "_impl/keras/initializers.py",
+        "_impl/keras/losses.py",
+        "_impl/keras/metrics.py",
+        "_impl/keras/models.py",
+        "_impl/keras/optimizers.py",
+        "_impl/keras/regularizers.py",
+        "_impl/keras/utils/data_utils.py",
+        "_impl/keras/utils/io_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        "//tensorflow/python/data",
         "@six_archive//:six",
     ],
 )
 
+py_library(
+    name = "layers",
+    srcs = [
+        "_impl/keras/layers/__init__.py",
+        "_impl/keras/layers/advanced_activations.py",
+        "_impl/keras/layers/convolutional.py",
+        "_impl/keras/layers/convolutional_recurrent.py",
+        "_impl/keras/layers/core.py",
+        "_impl/keras/layers/cudnn_recurrent.py",
+        "_impl/keras/layers/embeddings.py",
+        "_impl/keras/layers/local.py",
+        "_impl/keras/layers/merge.py",
+        "_impl/keras/layers/noise.py",
+        "_impl/keras/layers/normalization.py",
+        "_impl/keras/layers/pooling.py",
+        "_impl/keras/layers/recurrent.py",
+        "_impl/keras/layers/serialization.py",
+        "_impl/keras/layers/wrappers.py",
+        "_impl/keras/utils/conv_utils.py",
+        "_impl/keras/utils/generic_utils.py",
+        "_impl/keras/utils/layer_utils.py",
+        "_impl/keras/utils/tf_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":engine",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:cudnn_rnn_ops_gen",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_test(
     name = "integration_test",
     size = "medium",
@@ -256,6 +319,11 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/metrics_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "no_oss",
+        "notap",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -268,6 +336,7 @@ py_test(
     size = "large",
     srcs = ["_impl/keras/applications/densenet_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["nomsan"],  # times out, http://b/78650237
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -316,6 +385,7 @@ py_test(
     size = "large",
     srcs = ["_impl/keras/applications/nasnet_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["nomsan"],  # times out, http://b/78573625
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -381,11 +451,10 @@ py_test(
 
 py_test(
     name = "convolutional_recurrent_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/layers/convolutional_recurrent_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
-    tags = ["noasan"],  # times out b/63678675
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -410,6 +479,19 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "cudnn_recurrent_test",
+    size = "large",
+    srcs = ["_impl/keras/layers/cudnn_recurrent_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 2,
+)
+
 py_test(
     name = "pooling_test",
     size = "small",
@@ -562,7 +644,10 @@ py_test(
     size = "medium",
     srcs = ["_impl/keras/layers/wrappers_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "noasan",  # http://b/78599823
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -585,13 +670,15 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_windows",
         "noasan",  # times out
         "notsan",
+        "optonly",  # times out
     ],
     deps = [
         ":keras",
@@ -616,7 +703,10 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/utils/io_utils_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -636,16 +726,17 @@ py_test(
     ],
 )
 
-py_test(
-    name = "training_utils_test",
-    size = "medium",
-    srcs = ["_impl/keras/utils/training_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["multi_gpu"],
-    deps = [
+cuda_py_test(
+    name = "multi_gpu_utils_test",
+    srcs = ["_impl/keras/utils/multi_gpu_utils_test.py"],
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "guitar",
+        "multi_gpu",
     ],
 )
 
@@ -754,11 +845,37 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/engine/topology_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "saving_test",
+    size = "medium",
+    srcs = ["_impl/keras/engine/saving_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "sequential_test",
+    size = "small",
+    srcs = ["_impl/keras/engine/sequential_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
         "//third_party/py/numpy",
     ],
 )
@@ -779,7 +896,7 @@ py_test(
 
 py_test(
     name = "estimator_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/estimator_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -821,15 +938,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
index b63907b2e60acfc80ee411b9193b2829f0224c3e..53f5d31e9c5b861c551a7a9ca3700c383ea679d7 100644
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/__init__.py
@@ -40,4 +40,4 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.models import Sequential
 
-__version__ = '2.1.4-tf'
+__version__ = '2.1.5-tf'
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 236e17653e1b762e1e6962f453b714d1bf7bcbf7..8def7ec49375c7ce23e8f2a24a4c3615d05ca9bb 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -22,8 +22,8 @@ import six
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.layers.base import Layer
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -43,10 +43,10 @@ def softmax(x, axis=-1):
   """
   ndim = K.ndim(x)
   if ndim == 2:
-    return K.softmax(x)
+    return nn.softmax(x)
   elif ndim > 2:
-    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
-    s = K.sum(e, axis=axis, keepdims=True)
+    e = math_ops.exp(x - math_ops.reduce_max(x, axis=axis, keepdims=True))
+    s = math_ops.reduce_sum(e, axis=axis, keepdims=True)
     return e / s
   else:
     raise ValueError('Cannot apply softmax to a tensor that is 1D')
@@ -79,12 +79,12 @@ def selu(x):
 
 @tf_export('keras.activations.softplus')
 def softplus(x):
-  return K.softplus(x)
+  return nn.softplus(x)
 
 
 @tf_export('keras.activations.softsign')
 def softsign(x):
-  return K.softsign(x)
+  return nn.softsign(x)
 
 
 @tf_export('keras.activations.relu')
@@ -94,12 +94,12 @@ def relu(x, alpha=0., max_value=None):
 
 @tf_export('keras.activations.tanh')
 def tanh(x):
-  return K.tanh(x)
+  return nn.tanh(x)
 
 
 @tf_export('keras.activations.sigmoid')
 def sigmoid(x):
-  return K.sigmoid(x)
+  return nn.sigmoid(x)
 
 
 @tf_export('keras.activations.hard_sigmoid')
@@ -134,12 +134,6 @@ def get(identifier):
     identifier = str(identifier)
     return deserialize(identifier)
   elif callable(identifier):
-    if isinstance(identifier, Layer):
-      logging.warning(
-          'Do not pass a layer instance (such as {identifier}) as the '
-          'activation argument of another layer. Instead, advanced '
-          'activation layers should be used just like any other '
-          'layer in a model.'.format(identifier=identifier.__class__.__name__))
     return identifier
   else:
     raise ValueError('Could not interpret '
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet.py b/tensorflow/python/keras/_impl/keras/applications/densenet.py
index 6521f8410435fd13393b9991d3ee9a6342a912d0..ca83e8691237216e799f2ca738dcb6822506e2cb 100644
--- a/tensorflow/python/keras/_impl/keras/applications/densenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet.py
@@ -31,7 +31,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index c26a28ed4087e30968585ec8ac0b64b51513bcae..d928a7afdc639485d443be382420cac09ba9abd6 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -22,8 +22,10 @@ import json
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -151,11 +153,11 @@ def _preprocess_symbolic_input(x, data_format, mode):
     std = None
 
   if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = K.constant(-np.array(mean))
+    _IMAGENET_MEAN = constant_op.constant(-np.array(mean), dtype=K.floatx())
 
   # Zero-center by mean pixel
   if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
-    x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
+    x = K.bias_add(x, math_ops.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
     x = K.bias_add(x, _IMAGENET_MEAN, data_format)
   if std is not None:
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index bf3901fc54419c2b401bf9c4d6311b39a18f1aba..17e407dd58460e6d6802a3e137a96faf38a6f576 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -31,7 +31,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index e268e97bc663773a218f01b958b08f8e43c74ee2..2897c6058eb445ceacc34084b53dc89f556e3e9c 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -37,7 +37,7 @@ from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 1bbbedb85e47902b9e6d3dd741e9d52ab9209080..7b7288793de954d05d8405515037d4b197ed4df2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -79,16 +79,17 @@ from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import DepthwiseConv2D
 from tensorflow.python.keras._impl.keras.layers import Dropout
 from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import Reshape
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -116,195 +117,6 @@ def preprocess_input(x):
   return imagenet_utils.preprocess_input(x, mode='tf')
 
 
-class DepthwiseConv2D(Conv2D):
-  """Depthwise separable 2D convolution.
-
-  Depthwise Separable convolutions consists in performing
-  just the first step in a depthwise spatial convolution
-  (which acts on each input channel separately).
-  The `depth_multiplier` argument controls how many
-  output channels are generated per input channel in the depthwise step.
-
-  Arguments:
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          width and height of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the width and height.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `'valid'` or `'same'` (case-insensitive).
-      depth_multiplier: The number of depthwise convolution output channels
-          for each input channel.
-          The total number of depthwise convolution output
-          channels will be equal to `filters_in * depth_multiplier`.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be 'channels_last'.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. 'linear' activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix.
-      bias_initializer: Initializer for the bias vector.
-      depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its 'activation')..
-      depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-
-  Input shape:
-      4D tensor with shape:
-      `[batch, channels, rows, cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, rows, cols, channels]` if data_format='channels_last'.
-
-  Output shape:
-      4D tensor with shape:
-      `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
-  """
-
-  def __init__(self,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               depth_multiplier=1,
-               data_format=None,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               depthwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super(DepthwiseConv2D, self).__init__(
-        filters=None,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = initializers.get(depthwise_initializer)
-    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
-    self.depthwise_constraint = constraints.get(depthwise_constraint)
-    self.bias_initializer = initializers.get(bias_initializer)
-
-  @shape_type_conversion
-  def build(self, input_shape):
-    if len(input_shape) < 4:
-      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
-                       'Received input shape:', str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = 3
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs to '
-                       '`DepthwiseConv2D` '
-                       'should be defined. Found `None`.')
-    input_dim = int(input_shape[channel_axis])
-    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
-                              input_dim, self.depth_multiplier)
-
-    self.depthwise_kernel = self.add_weight(
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        name='depthwise_kernel',
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint)
-
-    if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(input_dim * self.depth_multiplier,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs, training=None):
-    outputs = K.depthwise_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        strides=self.strides,
-        padding=self.padding,
-        dilation_rate=self.dilation_rate,
-        data_format=self.data_format)
-
-    if self.bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-
-    return outputs
-
-  @shape_type_conversion
-  def compute_output_shape(self, input_shape):
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-      out_filters = input_shape[1] * self.depth_multiplier
-    elif self.data_format == 'channels_last':
-      rows = input_shape[1]
-      cols = input_shape[2]
-      out_filters = input_shape[3] * self.depth_multiplier
-
-    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
-                                         self.padding, self.strides[0])
-    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
-                                         self.padding, self.strides[1])
-
-    if self.data_format == 'channels_first':
-      return (input_shape[0], out_filters, rows, cols)
-    elif self.data_format == 'channels_last':
-      return (input_shape[0], rows, cols, out_filters)
-
-  def get_config(self):
-    config = super(DepthwiseConv2D, self).get_config()
-    config.pop('filters')
-    config.pop('kernel_initializer')
-    config.pop('kernel_regularizer')
-    config.pop('kernel_constraint')
-    config['depth_multiplier'] = self.depth_multiplier
-    config['depthwise_initializer'] = initializers.serialize(
-        self.depthwise_initializer)
-    config['depthwise_regularizer'] = regularizers.serialize(
-        self.depthwise_regularizer)
-    config['depthwise_constraint'] = constraints.serialize(
-        self.depthwise_constraint)
-    return config
-
-
 @tf_export('keras.applications.MobileNet',
            'keras.applications.mobilenet.MobileNet')
 def MobileNet(input_shape=None,
@@ -318,18 +130,11 @@ def MobileNet(input_shape=None,
               classes=1000):
   """Instantiates the MobileNet architecture.
 
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
   To load a MobileNet model via `load_model`, import the custom
-  objects `relu6` and `DepthwiseConv2D` and pass them to the
-  `custom_objects` parameter.
+  objects `relu6` and pass them to the `custom_objects` parameter.
   E.g.
   model = load_model('mobilenet.h5', custom_objects={
-                     'relu6': mobilenet.relu6,
-                     'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+                     'relu6': mobilenet.relu6})
 
   Arguments:
       input_shape: optional shape tuple, only to be specified
@@ -383,11 +188,6 @@ def MobileNet(input_shape=None,
           backend that does not support separable convolutions.
   """
 
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('Only TensorFlow backend is currently supported, '
-                       'as other backends do not support '
-                       'depthwise convolution.')
-
   if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
                      '`None` (random initialization), `imagenet` '
@@ -522,7 +322,7 @@ def MobileNet(input_shape=None,
   # load weights
   if weights == 'imagenet':
     if K.image_data_format() == 'channels_first':
-      raise ValueError('Weights for "channels_last" format '
+      raise ValueError('Weights for "channels_first" format '
                        'are not available.')
     if alpha == 1.0:
       alpha_text = '1_0'
@@ -598,14 +398,14 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   filters = int(filters * alpha)
+  x = ZeroPadding2D(padding=(1, 1), name='conv1_pad')(inputs)
   x = Conv2D(
       filters,
       kernel,
-      padding='same',
+      padding='valid',
       use_bias=False,
       strides=strides,
-      name='conv1')(
-          inputs)
+      name='conv1')(x)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -665,15 +465,14 @@ def _depthwise_conv_block(inputs,
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   pointwise_conv_filters = int(pointwise_conv_filters * alpha)
-
+  x = ZeroPadding2D(padding=(1, 1), name='conv_pad_%d' % block_id)(inputs)
   x = DepthwiseConv2D(  # pylint: disable=not-callable
       (3, 3),
-      padding='same',
+      padding='valid',
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(
-          inputs)
+      name='conv_dw_%d' % block_id)(x)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet.py b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
index 08dae57f006c64021cbca26404770cd89b1ce176..dd33230a7eb9272f8fc60daee63e1f92574cf5e3 100644
--- a/tensorflow/python/keras/_impl/keras/applications/nasnet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
@@ -49,7 +49,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import add
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index a47dd657bb9ea0627d82831b7ee5d0b33788b5b7..c3a92bea8920cad3297fee3efc50158813e72361 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -34,7 +34,7 @@ from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
@@ -45,6 +45,7 @@ from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -237,8 +238,7 @@ def ResNet50(include_top=True,
     bn_axis = 1
 
   x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
-          img_input)
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index 9da74253abc2124844ab89b7727ddda4f754d8e2..cefb25063e30505c9c34b49fd2df6eb7210d7ca8 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -32,7 +32,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
 from tensorflow.python.keras._impl.keras.layers import Flatten
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index 961c1f991893dbc0df858e9f72b61202c9fee500..dadaf4fdf0cc5922752c6867720c5d8cdbcab19a 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -32,7 +32,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
 from tensorflow.python.keras._impl.keras.layers import Flatten
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 7e7ca5a18a31622ac79d61ab01ce65341a4a46c5..971063a16d1f5ba0e25189f1ef2f6c24eb5f8d61 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -44,7 +44,7 @@ from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index a238a3f7483685ca0b08c746b55bbc9e868cb2d3..b1f1270623ddccc2030bd5a37df4ce43158c387a 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import collections
 import json
 import os
+import weakref
 
 import numpy as np
 
@@ -34,7 +35,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -54,11 +55,11 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
-from tensorflow.python.training import moving_averages
+
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
-
 py_all = all
 py_sum = sum
 
@@ -262,6 +263,12 @@ def set_image_data_format(data_format):
   _IMAGE_DATA_FORMAT = str(data_format)
 
 
+# A global dictionary mapping graph objects to an index of counters used
+# for various layer names in each graph.
+# Allows to give unique autogenerated names to layers, in a graph-specific way.
+PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
+
+
 @tf_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
@@ -282,17 +289,16 @@ def get_uid(prefix=''):
   ```
   """
   graph = ops.get_default_graph()
-  if graph not in tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS:
-    tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(
-        int)
-  layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
+  if graph not in PER_GRAPH_LAYER_NAME_UIDS:
+    PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(int)
+  layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
   layer_name_uids[prefix] += 1
   return layer_name_uids[prefix]
 
 
 @tf_export('keras.backend.reset_uids')
 def reset_uids():
-  per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
+  per_graph_layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
   for key in keys:
     del per_graph_layer_name_uids[key]
@@ -343,7 +349,7 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     if 'eager' not in _GRAPH_LEARNING_PHASES:
       # Fallback to inference mode as default.
       return 0
@@ -369,13 +375,42 @@ def set_learning_phase(value):
   """
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
-    raise ValueError('Expected learning phase to be ' '0 or 1.')
-  if context.in_eager_mode():
+    raise ValueError('Expected learning phase to be 0 or 1.')
+  if context.executing_eagerly():
     _GRAPH_LEARNING_PHASES['eager'] = value
   else:
     _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
 
 
+@tf_contextlib.contextmanager
+def learning_phase_scope(value):
+  """Provides a scope within which the learning phase is equal to `value`.
+
+  The learning phase gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: Learning phase value, either 0 or 1 (integers).
+
+  Yields:
+    The provided value.
+
+  Raises:
+     ValueError: if `value` is neither `0` nor `1`.
+  """
+  if value not in {0, 1}:
+    raise ValueError('Expected learning phase to be 0 or 1.')
+  previous_value = learning_phase()
+  try:
+    set_learning_phase(value)
+    yield value
+  finally:
+    # Restore learning phase to initial value.
+    if context.executing_eagerly():
+      _GRAPH_LEARNING_PHASES['eager'] = previous_value
+    else:
+      _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = previous_value
+
+
 @tf_export('keras.backend.get_session')
 def get_session():
   """Returns the TF session to be used by the backend.
@@ -394,8 +429,9 @@ def get_session():
       A TensorFlow session.
   """
   global _SESSION
-  if ops.get_default_session() is not None:
-    session = ops.get_default_session()
+  default_session = ops.get_default_session()
+  if default_session is not None:
+    session = default_session
   else:
     if _SESSION is None:
       if not os.environ.get('OMP_NUM_THREADS'):
@@ -466,7 +502,7 @@ def _is_current_explicit_device(device_type):
   """
   device_type = device_type.upper()
   if device_type not in ['CPU', 'GPU']:
-    raise ValueError('device_type should be either "CPU" or "GPU".')
+    raise ValueError('`device_type` should be either "CPU" or "GPU".')
   device = _get_current_tf_device()
   return device is not None and device.device_type == device_type.upper()
 
@@ -1245,6 +1281,11 @@ def moving_average_update(x, value, momentum):
   Returns:
       An Operation to update the variable.
   """
+  # `training` is higher-up than the Keras backend in the abstraction hierarchy.
+  # In particular, `training` depends on layers, and thus on Keras.
+  # moving_averages, being low-level ops, should not be part of the training
+  # module.
+  from tensorflow.python.training import moving_averages  # pylint: disable=g-import-not-at-top
   return moving_averages.assign_moving_average(
       x, value, momentum, zero_debias=True)
 
@@ -2596,7 +2637,7 @@ def get_value(x):
   Returns:
       A Numpy array.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return x.numpy()
   return x.eval(session=get_session())
 
@@ -2611,7 +2652,7 @@ def batch_get_value(tensors):
   Returns:
       A list of Numpy arrays.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return [x.numpy() for x in tensors]
   if tensors:
     return get_session().run(tensors)
@@ -2629,7 +2670,7 @@ def set_value(x, value):
           (of the same shape).
   """
   value = np.asarray(value, dtype=dtype(x))
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     x.assign(value)
   else:
     tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
@@ -2652,7 +2693,7 @@ def batch_set_value(tuples):
       tuples: a list of tuples `(tensor, value)`.
           `value` should be a Numpy array.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     for x, value in tuples:
       x.assign(np.asarray(value, dtype=dtype(x)))
   else:
@@ -2719,8 +2760,7 @@ class Function(object):
       outputs: Output tensors to fetch.
       updates: Additional update ops to be run at function call.
       name: A name to help users identify what this function does.
-      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`,
-        `options`, `run_metadata`
+      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`.
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None,
@@ -2749,29 +2789,114 @@ class Function(object):
       self.updates_op = control_flow_ops.group(*updates_ops)
     self.name = name
     # additional tensor substitutions
-    self.feed_dict = session_kwargs.pop('feed_dict', {})
+    self.feed_dict = session_kwargs.pop('feed_dict', None)
     # additional operations
     self.fetches = session_kwargs.pop('fetches', [])
     if not isinstance(self.fetches, list):
       self.fetches = [self.fetches]
+    # The main use case of `fetches` being passed to a model is the ability
+    # to run custom updates (since the outputs of fetches are never returned).
+    # This requires us to wrap fetches in `identity` ops.
+    self.fetches = [array_ops.identity(x) for x in self.fetches]
     self.session_kwargs = session_kwargs
 
+    if session_kwargs:
+      raise ValueError('Some keys in session_kwargs are not supported at this '
+                       'time: %s', session_kwargs.keys())
+
+    self._callable_fn = None
+    self._feed_arrays = None
+    self._feed_symbols = None
+    self._symbol_vals = None
+    self._session = None
+
+  def _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session):
+    """Generates a callable that runs the graph.
+
+    Arguments:
+      feed_arrays: List of input tensors to be fed Numpy arrays at runtime.
+      feed_symbols: List of input tensors to be fed symbolic tensors at runtime.
+      symbol_vals: List of symbolic tensors to be fed to `feed_symbols`.
+      session: Session to use to generate the callable.
+
+    Returns:
+      Function that runs the graph according to the above options.
+    """
+    # Prepare callable options.
+    callable_opts = config_pb2.CallableOptions()
+    # Handle external-data feed.
+    for x in feed_arrays:
+      callable_opts.feed.append(x.name)
+    if self.feed_dict:
+      for key in sorted(self.feed_dict.keys()):
+        callable_opts.feed.append(key.name)
+    # Handle symbolic feed.
+    for x, y in zip(feed_symbols, symbol_vals):
+      connection = callable_opts.tensor_connection.add()
+      if x.dtype != y.dtype:
+        y = math_ops.cast(y, dtype=x.dtype)
+      from_tensor = ops._as_graph_element(y)
+      if from_tensor is None:
+        from_tensor = y
+      connection.from_tensor = from_tensor.name  # Data tensor
+      connection.to_tensor = x.name  # Placeholder
+    # Handle fetches.
+    for x in self.outputs + self.fetches:
+      callable_opts.fetch.append(x.name)
+    # Handle updates.
+    callable_opts.target.append(self.updates_op.name)
+    # Create callable.
+    callable_fn = session._make_callable_from_options(callable_opts)
+    # Cache parameters corresponding to the generated callable, so that
+    # we can detect future mismatches and refresh the callable.
+    self._callable_fn = callable_fn
+    self._feed_arrays = feed_arrays
+    self._feed_symbols = feed_symbols
+    self._symbol_vals = symbol_vals
+    self._session = session
+
   def __call__(self, inputs):
     if not isinstance(inputs, (list, tuple)):
       raise TypeError('`inputs` should be a list or tuple.')
-    feed_dict = self.feed_dict.copy()
+
+    session = get_session()
+    feed_arrays = []
+    array_vals = []
+    feed_symbols = []
+    symbol_vals = []
     for tensor, value in zip(self.inputs, inputs):
+      if value is None:
+        continue
       if is_sparse(tensor):
         sparse_coo = value.tocoo()
         indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
-      feed_dict[tensor] = value
-    fetches = self.outputs + [self.updates_op] + self.fetches
-    session = get_session()
-    updated = session.run(
-        fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
-    return updated[:len(self.outputs)]
+      if tensor_util.is_tensor(value):
+        # Case: feeding symbolic tensor.
+        feed_symbols.append(tensor)
+        symbol_vals.append(value)
+      else:
+        # Case: feeding Numpy array.
+        feed_arrays.append(tensor)
+        # We need to do array conversion and type casting at this level, since
+        # `callable_fn` only supports exact matches.
+        array_vals.append(np.asarray(value, dtype=tensor.dtype.base_dtype.name))
+    if self.feed_dict:
+      for key in sorted(self.feed_dict.keys()):
+        array_vals.append(
+            np.asarray(self.feed_dict[key], dtype=key.dtype.base_dtype.name))
+
+    # Refresh callable if anything has changed.
+    if (self._callable_fn is None or
+        feed_arrays != self._feed_arrays or
+        symbol_vals != self._symbol_vals or
+        feed_symbols != self._feed_symbols or
+        session != self._session):
+      self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
+
+    fetched = self._callable_fn(*array_vals)
+    return fetched[:len(self.outputs)]
 
 
 @tf_export('keras.backend.function')
@@ -2873,7 +2998,7 @@ def rnn(step_function,
       constants: a list of constant values passed at each step.
       unroll: whether to unroll the RNN or to use a symbolic loop
           (`while_loop` or `scan` depending on backend).
-      input_length: Unused; exists for API compatibility.
+      input_length: If specified, assume time dimension is of this length.
 
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
@@ -2891,7 +3016,6 @@ def rnn(step_function,
       ValueError: if `mask` is provided (not `None`) but states is not provided
           (`len(states)` == 0).
   """
-  del input_length
   ndim = len(inputs.get_shape())
   if ndim < 3:
     raise ValueError('Input should be at least 3D.')
@@ -3069,6 +3193,7 @@ def rnn(step_function,
         cond=lambda time, *_: time < time_steps,
         body=_step,
         loop_vars=(time, output_ta) + states,
+        maximum_iterations=input_length,
         parallel_iterations=32,
         swap_memory=True)
     last_time = final_outputs[0]
@@ -3087,7 +3212,8 @@ def rnn(step_function,
   outputs_shape[1] = inputs_shape[1]
   outputs.set_shape(outputs_shape)
 
-  last_output._uses_learning_phase = uses_learning_phase
+  if not context.executing_eagerly():
+    last_output._uses_learning_phase = uses_learning_phase
   return last_output, outputs, new_states
 
 
@@ -3322,7 +3448,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   Returns:
       Output tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.softmax_cross_entropy_with_logits_v2
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
@@ -3335,7 +3461,7 @@ def categorical_crossentropy(target, output, from_logits=False):
         target * math_ops.log(output),
         axis=len(output.get_shape()) - 1)
   else:
-    return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
+    return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
 
 
 @tf_export('keras.backend.sparse_categorical_crossentropy')
@@ -3386,7 +3512,7 @@ def binary_crossentropy(target, output, from_logits=False):
   Returns:
       A tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.sigmoid_cross_entropy_with_logits
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # transform back to logits
@@ -3477,7 +3603,7 @@ def l2_normalize(x, axis=None):
   Returns:
       A tensor.
   """
-  return nn.l2_normalize(x, dim=axis)
+  return nn.l2_normalize(x, axis=axis)
 
 
 @tf_export('keras.backend.in_top_k')
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index f29ca49378bc43385b9e90d3f1cefb7937df64cd..de1ed467a2764a2b08269181bb8bcd615868d3b1 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -128,6 +128,22 @@ class BackendUtilsTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       sess.run(y, feed_dict={x: np.random.random((2, 3))})
 
+  def test_learning_phase_scope(self):
+    with self.test_session():
+      initial_learning_phase = keras.backend.learning_phase()
+      with keras.backend.learning_phase_scope(1) as lp:
+        self.assertEqual(lp, 1)
+        self.assertEqual(keras.backend.learning_phase(), 1)
+      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+      with keras.backend.learning_phase_scope(0) as lp:
+        self.assertEqual(lp, 0)
+        self.assertEqual(keras.backend.learning_phase(), 0)
+      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+      with self.assertRaises(ValueError):
+        with keras.backend.learning_phase_scope(None):
+          pass
+      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+
   def test_int_shape(self):
     x = keras.backend.placeholder(shape=(3, 4))
     self.assertEqual(keras.backend.int_shape(x), (3, 4))
@@ -173,6 +189,39 @@ class BackendUtilsTest(test.TestCase):
     for y in ys:
       self.assertEqual(y.op.name[:12], 'StopGradient')
 
+  def test_function_tf_feed_symbols(self):
+    with self.test_session():
+      # Test feeding a resource variable to `function`.
+      x1 = keras.backend.placeholder(shape=())
+      x2 = keras.backend.placeholder(shape=())
+      lr = keras.backend.learning_phase()  # Include a placeholder_with_default.
+
+      y1 = keras.backend.variable(10.)
+      y2 = 3
+
+      f = keras.backend.function(
+          inputs=[x1, x2, lr],
+          outputs=[x1 + 1,
+                   keras.backend.in_train_phase(x2 + 2, x2 - 1)])
+      outs = f([y1, y2, None])  # Use default learning_phase value.
+      self.assertEqual(outs, [11., 2.])
+      outs = f([y1, y2, 1])  # Set learning phase value.
+      self.assertEqual(outs, [11., 5.])
+
+      # Test triggering a callable refresh by changing the input.
+      y3 = keras.backend.constant(20.)  # Test with tensor
+      outs = f([y3, y2, None])
+      self.assertEqual(outs, [21., 2.])
+
+      y4 = 4  # Test with non-symbol
+      outs = f([y4, y2, None])
+      self.assertEqual(outs, [5., 2.])
+
+      # Test with a different dtype
+      y5 = keras.backend.constant(10., dtype='float64')
+      outs = f([y5, y2, None])
+      self.assertEqual(outs, [11., 2.])
+
   def test_function_tf_fetches(self):
     # Additional operations can be passed to tf.Session().run() via its
     # `fetches` arguments. In contrast to `updates` argument of
@@ -190,8 +239,9 @@ class BackendUtilsTest(test.TestCase):
                                  updates=[(x, x_placeholder + 1.)],
                                  fetches=[keras.backend.update(y, 5.)])
       output = f([10., 20.])
-      assert output == [30.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [11., 5.]
+      self.assertEqual(output, [30.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [11., 5.])
 
   def test_function_tf_feed_dict(self):
     # Additional substitutions can be passed to `tf.Session().run()` via its
@@ -213,14 +263,16 @@ class BackendUtilsTest(test.TestCase):
                                  feed_dict=feed_dict,
                                  fetches=fetches)
       output = f([10.])
-      assert output == [11.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [20., 30.]
+      self.assertEqual(output, [11.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [20., 30.])
 
       # updated value in feed_dict will be modified within the K.function()
       feed_dict[y_placeholder] = 4.
       output = f([20.])
-      assert output == [21.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [30., 40.]
+      self.assertEqual(output, [21.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [30., 40.])
 
 
 class BackendVariableTest(test.TestCase):
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index f6c466142522927135d66f73f9f5c697671649ec..deb1e8867dba3d52816ebda02bd9a3bf2ec7bc09 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -778,16 +778,24 @@ class TensorBoard(Callback):
         while i < val_size:
           step = min(self.batch_size, val_size - i)
           batch_val = []
-          batch_val.append(val_data[0][i:i + step])
-          batch_val.append(val_data[1][i:i + step])
-          batch_val.append(val_data[2][i:i + step])
+          batch_val.append(val_data[0][i:i + step]
+                           if val_data[0] is not None else None)
+          batch_val.append(val_data[1][i:i + step]
+                           if val_data[1] is not None else None)
+          batch_val.append(val_data[2][i:i + step]
+                           if val_data[2] is not None else None)
           if self.model.uses_learning_phase:
             # do not slice the learning phase
-            batch_val = [x[i:i + step] for x in val_data[:-1]]
+            batch_val = [x[i:i + step] if x is not None else None
+                         for x in val_data[:-1]]
             batch_val.append(val_data[-1])
           else:
-            batch_val = [x[i:i + step] for x in val_data]
-          feed_dict = dict(zip(tensors, batch_val))
+            batch_val = [x[i:i + step] if x is not None else None
+                         for x in val_data]
+          feed_dict = {}
+          for key, val in zip(tensors, batch_val):
+            if val is not None:
+              feed_dict[key] = val
           result = self.sess.run([self.merged], feed_dict=feed_dict)
           summary_str = result[0]
           self.writer.add_summary(summary_str, epoch)
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index 271fbbb63d3dfd50507837e190860d48315a14f2..abe95d8e0ca68b2e62f9574fba9ae912a9179fff 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -24,6 +24,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -65,7 +66,8 @@ class MaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
     return w * (desired / (K.epsilon() + norms))
 
@@ -79,7 +81,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    return w * K.cast(K.greater_equal(w, 0.), K.floatx())
+    return w * math_ops.cast(math_ops.greater_equal(w, 0.), K.floatx())
 
 
 @tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
@@ -105,7 +107,9 @@ class UnitNorm(Constraint):
 
   def __call__(self, w):
     return w / (
-        K.epsilon() + K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True)))
+        K.epsilon() + K.sqrt(
+            math_ops.reduce_sum(
+                math_ops.square(w), axis=self.axis, keepdims=True)))
 
   def get_config(self):
     return {'axis': self.axis}
@@ -148,7 +152,8 @@ class MinMaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = (
         self.rate * K.clip(norms, self.min_value, self.max_value) +
         (1 - self.rate) * norms)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
index b9ae41a0d4d0e8d9df70e3fc1952e81c5f57e8d9..508e95f719a02977960b80c283495ced642293c5 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
@@ -24,8 +24,10 @@ import os
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.fashion_mnist.load_data')
 def load_data():
   """Loads the Fashion-MNIST dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/__init__.py b/tensorflow/python/keras/_impl/keras/engine/__init__.py
index 31f624f9af65cac60b6466d4eb5753cbdee984c6..1bc533ab8f7ba37948d82bc69fe1c9bfe00d6834 100644
--- a/tensorflow/python/keras/_impl/keras/engine/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/engine/__init__.py
@@ -18,13 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
-from tensorflow.python.keras._impl.keras.engine.topology import Input
-from tensorflow.python.keras._impl.keras.engine.topology import InputLayer
-from tensorflow.python.keras._impl.keras.engine.topology import InputSpec
-from tensorflow.python.keras._impl.keras.engine.topology import Layer
+from tensorflow.python.keras._impl.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
+from tensorflow.python.keras._impl.keras.engine.input_layer import Input
+from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.engine.training import Model
-
-
-# Note: topology.Node is an internal class,
-# it isn't meant to be used by Keras users.
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3e78c95dc995709c170368c37a5d450192400b0
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -0,0 +1,1843 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Contains the base Layer class, from which all layers inherit."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.eager import context
+from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import backend
+from tensorflow.python.keras._impl.keras import constraints
+from tensorflow.python.keras._impl.keras import initializers
+from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+# A module that only depends on `keras.layers` import these from here.
+from tensorflow.python.keras._impl.keras.utils.generic_utils import to_snake_case  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.layers.Layer')
+class Layer(checkpointable.CheckpointableBase):
+  """Base layer class.
+
+  This is the class from which all layers inherit.
+
+  A layer is a class implementing common neural networks operations, such
+  as convolution, batch norm, etc. These operations require managing weights,
+  losses, updates, and inter-layer connectivity.
+
+  Users will just instantiate a layer and then treat it as a callable.
+
+  We recommend that descendants of `Layer` implement the following methods:
+  * `__init__()`: Save configuration in member variables
+  * `build()`: Called once from `__call__`, when we know the shapes of inputs
+    and `dtype`. Should have the calls to `add_weight()`, and then
+    call the super's `build()` (which sets `self.built = True`, which is
+    nice in case the user wants to call `build()` manually before the
+    first `__call__`).
+  * `call()`: Called in `__call__` after making sure `build()` has been called
+    once. Should actually perform the logic of applying the layer to the
+    input tensors (which should be passed in as the first argument).
+
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+
+  Read-only properties:
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
+      non-trainable.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
+
+  Mutable properties:
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
+      constraints on inputs that can be accepted by the layer.
+  """
+
+  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
+    # These properties should be set by the user via keyword arguments.
+    # note that 'dtype', 'input_shape' and 'batch_input_shape'
+    # are only applicable to input layers: do not pass these keywords
+    # to non-input layers.
+    allowed_kwargs = {
+        'input_shape',
+        'batch_input_shape',
+        'batch_size',
+        'weights',
+        'activity_regularizer',
+    }
+    # Validate optional keyword arguments.
+    for kwarg in kwargs:
+      if kwarg not in allowed_kwargs:
+        raise TypeError('Keyword argument not understood:', kwarg)
+
+    # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
+    self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
+    self.built = False
+    # Provides information about which inputs are compatible with the layer.
+    self.input_spec = None
+
+    self._init_set_name(name)
+
+    activity_regularizer = kwargs.pop('activity_regularizer', None)
+    if activity_regularizer and context.executing_eagerly():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
+    self._activity_regularizer = activity_regularizer
+    self._trainable_weights = []
+    self._non_trainable_weights = []
+    self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
+    self._losses = []
+    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
+    self._call_fn_args = estimator_util.fn_args(self.call)
+    self._compute_previous_mask = ('mask' in self._call_fn_args or
+                                   hasattr(self, 'compute_mask'))
+    self._uses_inputs_arg = True
+
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self._inbound_nodes = []
+    self._outbound_nodes = []
+
+    self.supports_masking = False
+
+    # Manage input shape information if passed.
+    if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
+      # In this case we will later create an input layer
+      # to insert before the current layer
+      if 'batch_input_shape' in kwargs:
+        batch_input_shape = tuple(kwargs['batch_input_shape'])
+      elif 'input_shape' in kwargs:
+        if 'batch_size' in kwargs:
+          batch_size = kwargs['batch_size']
+        else:
+          batch_size = None
+        batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
+      self._batch_input_shape = batch_input_shape
+
+    # Manage initial weight values if passed.
+    if 'weights' in kwargs:
+      self._initial_weights = kwargs['weights']
+    else:
+      self._initial_weights = None
+
+  def _init_set_name(self, name, zero_based=True):
+    if not name:
+      self._name = unique_layer_name(
+          generic_utils.to_snake_case(self.__class__.__name__),
+          zero_based=zero_based)
+    else:
+      self._name = name
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def activity_regularizer(self):
+    """Optional regularizer function for the output of this layer."""
+    return self._activity_regularizer
+
+  @activity_regularizer.setter
+  def activity_regularizer(self, regularizer):
+    """Optional regularizer function for the output of this layer."""
+    self._activity_regularizer = regularizer
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
+
+  @property
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
+  @property
+  def weights(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
+
+  @property
+  def variables(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.weights
+
+  @property
+  def updates(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
+    return self._updates
+
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing the same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    This call is ignored when eager execution is enabled (in that case, variable
+    updates are run on the fly and thus do not need to be tracked for later
+    execution).
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
+    """
+    if context.executing_eagerly():
+      return  # Updates already applied when in eager mode.
+
+    def process_update(x):
+      if isinstance(x, ops.Operation):
+        return x
+      elif hasattr(x, 'op'):
+        return x.op
+      else:
+        return ops.convert_to_tensor(x)
+
+    updates = generic_utils.to_list(updates)
+    updates = [process_update(x) for x in updates]
+    self._updates += updates
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
+    else:
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
+
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
+
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
+
+  @property
+  def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
+    if context.executing_eagerly():
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
+
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
+
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing the same
+    layer on different inputs `a` and `b`, some entries in `layer.losses` may
+    be dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
+
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors.
+      inputs: If anything other than None is passed, it signals the losses
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for activity regularization losses, for instance.
+        If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      # TODO(fchollet): it should be possible (and highly desirable) to support
+      # `add_loss` in eager mode. This allows great convenience and flexibility
+      # in defining custom losses on the fly (e.g. in VAEs).
+      # Simply appending the loss value to `self._losses`
+      # is the correct behavior.
+      # The only caveat is that we need to force the user to only call
+      # `add_loss` from inside a model or Layer's `call` method
+      # (otherwise the loss computation cannot be backproped through).
+      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
+
+    losses = generic_utils.to_list(losses)
+    self._losses += losses
+    if inputs is None:
+      for loss in losses:
+        loss._unconditional_loss = True  # pylint: disable=protected-access
+    else:
+      for loss in losses:
+        loss._unconditional_loss = False  # pylint: disable=protected-access
+
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of loss tensors of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
+
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
+
+  def _name_scope(self):
+    return self.name
+
+  def build(self, _):
+    """Creates the variables of the layer."""
+    self.built = True
+
+  def add_variable(self, *args, **kwargs):
+    """Alias for `add_weight`."""
+    return self.add_weight(*args, **kwargs)
+
+  def add_weight(self, name, shape,
+                 dtype=None,
+                 initializer=None,
+                 regularizer=None,
+                 trainable=True,
+                 constraint=None,
+                 partitioner=None,
+                 use_resource=None,
+                 getter=None):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
+
+    Arguments:
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      initializer: initializer instance (callable).
+      regularizer: regularizer instance (callable).
+      trainable: whether the variable should be part of the layer's
+        "trainable_variables" (e.g. variables, biases)
+        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable.
+      constraint: constraint instance (callable).
+      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      use_resource: Whether to use `ResourceVariable`.
+      getter: Variable getter argument to be passed to the `Checkpointable` API.
+
+    Returns:
+      The created variable.  Usually either a `Variable` or `ResourceVariable`
+      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
+      instance is returned.
+
+    Raises:
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
+      ValueError: When giving unsupported dtype and no initializer.
+    """
+    if dtype is None:
+      dtype = self.dtype or backend.floatx()
+    else:
+      dtype = dtypes.as_dtype(dtype)
+    initializer = initializers.get(initializer)
+    regularizer = regularizers.get(regularizer)
+    constraint = constraints.get(constraint)
+
+    # Initialize variable when no initializer provided
+    if initializer is None:
+      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+      if dtype.is_floating:
+        initializer = initializers.glorot_uniform()
+      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+      # If dtype is DT_BOOL, provide a default value `FALSE`
+      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+        initializer = initializers.zeros()
+      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
+      else:
+        raise ValueError('An initializer for variable %s of type %s is required'
+                         ' for layer %s' % (name, dtype.base_dtype, self.name))
+
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        # TODO(allenl): a `make_variable` equivalent should be added as a
+        # `Checkpointable` method.
+        getter=getter or make_variable,
+        # Manage errors in Layer rather than Checkpointable.
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtypes.as_dtype(dtype),
+        constraint=constraint,
+        trainable=trainable and self.trainable,
+        partitioner=partitioner,
+        use_resource=use_resource)
+
+    if regularizer is not None:
+      # TODO(fchollet): in the future, this should be handled at the
+      # level of variable creation, and weight regularization losses
+      # should be variable attributes.
+      self._handle_weight_regularization(name, variable, regularizer)
+
+    if trainable:
+      self._trainable_weights.append(variable)
+    else:
+      self._non_trainable_weights.append(variable)
+    return variable
+
+  def _handle_weight_regularization(self, name, variable, regularizer):
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
+    if not context.executing_eagerly():
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if not context.executing_eagerly():
+            init_graph = ops.get_default_graph()
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+
+    if init_graph is not None:  # pylint: disable=protected-access
+      # The variable was created and initialized in a graph.
+      if regularizer:
+        if isinstance(variable, tf_variables.PartitionedVariable):
+          for v in variable:
+            with ops.colocate_with(v.op):
+              with ops.name_scope(name + '/Regularizer'):
+                regularization = regularizer(v)
+            if regularization is not None:
+              self.add_loss(regularization)
+        else:
+          with ops.colocate_with(variable.op):
+            with ops.name_scope(name + '/Regularizer'):
+              regularization = regularizer(variable)
+          if regularization is not None:
+            self.add_loss(regularization)
+    elif regularizer:  # initialization took place in an eager context
+      if isinstance(variable, tf_variables.PartitionedVariable):
+        raise RuntimeError(
+            'Partitioned variable regularization is not yet '
+            'supported when executing eagerly. File a feature request'
+            'if this is important to you.')
+      # Save a zero-argument lambda which runs the regularizer on the
+      # variable, to be executed when `Layer.losses` is requested.
+      # This makes losses responsive to variable updates when executing
+      # eagerly.
+      #
+      # TODO(akshayka): Do the same for graphs as well, so that losses
+      # collected in a while_loop can be run outside its control flow
+      # context and so that losses won't be swallowed up by graph functions
+      # (i.e., `.losses()` should always create regularizers).
+      self._losses.append(lambda: regularizer(variable))
+
+  def _handle_activity_regularization(self, inputs, outputs):
+    # Apply activity regularization.
+    # Note that it should be applied every time the layer creates a new
+    # output, since it is output-specific.
+    if self._activity_regularizer:
+      output_list = nest.flatten(outputs)
+      for output in output_list:
+        with ops.name_scope('ActivityRegularizer'):
+          activity_regularization = self._activity_regularizer(output)
+        self.add_loss(activity_regularization, inputs=inputs)
+
+  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
+    """This is where the layer's logic lives.
+
+    Arguments:
+        inputs: Input tensor, or list/tuple of input tensors.
+        **kwargs: Additional keyword arguments.
+
+    Returns:
+        A tensor or list/tuple of tensors.
+    """
+    return inputs
+
+  def __call__(self, inputs, *args, **kwargs):
+    """Wraps `call`, applying pre- and post-processing steps.
+
+    Arguments:
+      inputs: input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+
+    Note:
+      - The following optional keyword arguments are reserved for specific uses:
+        * `training`: Boolean scalar tensor of Python boolean indicating
+          whether the `call` is meant for training or inference.
+        * `mask`: Boolean input mask.
+      - If the layer's `call` method takes a `mask` argument (as some Keras
+        layers do), its default value will be set to the mask generated
+        for `inputs` by the previous layer (if `input` did come from
+        a layer that generated a corresponding mask, i.e. if it came from
+        a Keras layer with masking support.
+
+    Raises:
+      ValueError: if the layer's `call` method returns None (an invalid value).
+    """
+    input_list = nest.flatten(inputs)
+
+    build_graph = not context.executing_eagerly()
+    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
+    # which don't use an "inputs" argument.
+    in_deferred_mode = isinstance(input_list[0], DeferredTensor)
+
+    # Handle Keras mask propagation from previous layer to current layer.
+    previous_mask = None
+    if (not hasattr(self, '_compute_previous_mask') or
+        self._compute_previous_mask):
+      previous_mask = collect_previous_mask(inputs)
+      if not hasattr(self, '_call_fn_args'):
+        self._call_fn_args = estimator_util.fn_args(self.call)
+      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
+          not generic_utils.is_all_none(previous_mask)):
+        # The previous layer generated a mask, and mask was not explicitly pass
+        # to __call__, hence we set previous_mask as the default value.
+        kwargs['mask'] = previous_mask
+
+    input_shapes = None
+
+    with ops.name_scope(self._name_scope()):
+      if not self.built:
+        if not build_graph:
+          # Activity regularization is currently unsupported in Eager mode.
+          if self._activity_regularizer:
+            raise ValueError(
+                'activity_regularizer currently unsupported with '
+                'eager execution enabled. Found an activity_regularizer in '
+                '%s(%s).' % (self.__class__.__name__, self))
+        if not build_graph and not in_deferred_mode:
+          for x in input_list:
+            if hasattr(x, '_keras_history'):
+              raise ValueError('_keras_history currently unsupported in '
+                               'Eager mode. Found _keras_history in %s while '
+                               'executing __call__ for %s(%s)' %
+                               (x, self.__class_.__name__, self))
+
+        # Check input assumptions set before layer building, e.g. input rank.
+        self._assert_input_compatibility(inputs)
+        if input_list and self._dtype is None:
+          try:
+            self._dtype = input_list[0].dtype.base_dtype.name
+          except AttributeError:
+            pass
+        if all(hasattr(x, 'get_shape') for x in input_list):
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+        self.build(input_shapes)
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if build_graph or in_deferred_mode:
+        self._assert_input_compatibility(inputs)
+
+      if not in_deferred_mode:
+        outputs = self.call(inputs, *args, **kwargs)
+        if outputs is None:
+          raise ValueError('A layer\'s `call` method should return a Tensor '
+                           'or a list of Tensors, not None (layer: ' +
+                           self.name + ').')
+      else:
+        # Deferred mode behavior: use `compute_output_shape` to
+        # infer the number of outputs of the layer and their shapes.
+        if input_shapes is None:
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
+        output_shapes = self.compute_output_shape(input_shapes)
+        output_shapes = nest.flatten(output_shapes)
+        outputs = [
+            # TODO(fchollet): name the deferred tensors?
+            DeferredTensor(shape=shape, dtype=self._dtype)
+            for shape in output_shapes
+        ]
+        if len(outputs) == 1:
+          outputs = outputs[0]
+
+      if build_graph:
+        self._handle_activity_regularization(inputs, outputs)
+        # TODO(fchollet): consider enabling masking for Eager mode.
+        self._set_mask_metadata(inputs, outputs, previous_mask)
+
+      if in_deferred_mode or build_graph and have_all_keras_metadata(inputs):
+        inputs, outputs = self._set_connectivity_metadata_(
+            inputs, outputs, args, kwargs)
+
+      self.built = True
+      if context.executing_eagerly():
+        return outputs
+
+      if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
+        # Subclassed network: explicitly set metadata normally set by a call to
+        # self._set_inputs(). This is not relevant in eager execution.
+        self._symbolic_set_inputs(inputs, outputs)
+
+      if in_deferred_mode or build_graph:
+        self._set_learning_phase_metadata(inputs, outputs)
+
+    # Optionally load weight values that were specified at layer instantiation.
+    # TODO(fchollet): consider enabling this with eager execution too.
+    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
+      self.set_weights(self._initial_weights)
+      del self._initial_weights
+    self._post_build_cleanup()
+    return outputs
+
+  def _post_build_cleanup(self):
+    """Hooks to run after all sub-Layers are built."""
+    # Note that in addition to Layer.__call__, this method is called by Model
+    # after building a graph network (which skips __call__). It should be called
+    # when possible if self.built may have switched from False to True, and is
+    # idempotent.
+    pass  # No-op for Layers which don't override this method.
+
+  def apply(self, inputs, *args, **kwargs):
+    """Apply the layer on a input.
+
+    This simply wraps `self.__call__`.
+
+    Arguments:
+      inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+    """
+    return self.__call__(inputs, *args, **kwargs)
+
+  def _set_learning_phase_metadata(self, inputs, outputs):
+    # Update learning phase info. To work with subclassed models,
+    # this should be done even if Keras metadata is absent.
+    output_tensors = generic_utils.to_list(outputs)
+    uses_lp = any(
+        [getattr(x, '_uses_learning_phase', False)
+         for x in generic_utils.to_list(inputs)])
+    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
+    for i in range(len(output_tensors)):
+      try:
+        output_tensors[i]._uses_learning_phase = getattr(
+            output_tensors[i], '_uses_learning_phase', False) or uses_lp
+      except AttributeError:
+        # An output element happens to be a C type (such as tuple or dict).
+        # We don't track learning phase info in such edge cases.
+        pass
+
+  def _set_mask_metadata(self, inputs, outputs, previous_mask):
+    if hasattr(self, 'compute_mask'):
+      output_mask = self.compute_mask(inputs, previous_mask)
+      if isinstance(outputs, (list, tuple)):
+        if output_mask is None:
+          output_mask = [None for _ in range(len(outputs))]
+        for x, m in zip(outputs, output_mask):
+          try:
+            x._keras_mask = m  # pylint: disable=protected-access
+          except AttributeError:
+            pass  # C type such as dict. Masking not supported in this case.
+      else:
+        try:
+          outputs._keras_mask = output_mask  # pylint: disable=protected-access
+        except AttributeError:
+          pass  # C type such as dict. Masking not supported in this case.
+
+  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
+    if args and getattr(self, '_uses_inputs_arg', True):
+      raise TypeError(
+          'This Layer takes an `inputs` argument to call(), and only the '
+          '`inputs` argument may be specified as a positional argument. '
+          'Pass everything else as a keyword argument (those arguments will'
+          ' not be tracked as inputs to the Layer).')
+
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = nest.flatten(outputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in nest.flatten(inputs):
+        with ops.name_scope(self.name):
+          x = array_ops.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      outputs = output_ls_copy[0]
+    else:
+      outputs = output_ls_copy
+
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+    # Add an inbound node to the layer, so it can keep track of this call.
+    # This updates the layer history of the output tensor(s).
+    kwargs.pop('mask', None)  # `mask` should not be serialized.
+    self._add_inbound_node(
+        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
+    return inputs, outputs
+
+  def _inputs_from_call_args(self, call_args, call_kwargs):
+    """Get Layer inputs from __call__ *args and **kwargs.
+
+    Args:
+      call_args: The positional arguments passed to __call__.
+      call_kwargs: The keyword argument dict passed to __call__.
+
+    Returns:
+      A tuple of (inputs, non_input_kwargs). These may be the same objects as
+      were passed in (call_args and call_kwargs).
+    """
+    if getattr(self, '_uses_inputs_arg', True):
+      assert len(call_args) == 1  # TypeError raised earlier in __call__.
+      return call_args[0], call_kwargs
+    else:
+      call_arg_spec = tf_inspect.getargspec(self.call)
+      # There is no explicit "inputs" argument expected or provided to
+      # call(). Arguments which have default values are considered non-inputs,
+      # and arguments without are considered inputs.
+      if call_arg_spec.defaults:
+        if call_arg_spec.varargs is not None:
+          raise TypeError(
+              'Layer.call() may not accept both *args and arguments with '
+              'default values (unable to determine which are inputs to the '
+              'Layer).')
+        keyword_arg_names = set(
+            call_arg_spec.args[-len(call_arg_spec.defaults):])
+      else:
+        keyword_arg_names = set()
+        # Training is never an input argument name, to allow signatures like
+        # call(x, training).
+      keyword_arg_names.add('training')
+      _, unwrapped_call = tf_decorator.unwrap(self.call)
+      bound_args = inspect.getcallargs(
+          unwrapped_call, *call_args, **call_kwargs)
+      if call_arg_spec.keywords is not None:
+        var_kwargs = bound_args.pop(call_arg_spec.keywords)
+        bound_args.update(var_kwargs)
+        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
+      all_args = call_arg_spec.args
+      if all_args and bound_args[all_args[0]] is self:
+        # Ignore the 'self' argument of methods
+        bound_args.pop(call_arg_spec.args[0])
+        all_args = all_args[1:]
+      non_input_arg_values = {}
+      input_arg_values = []
+      remaining_args_are_keyword = False
+      for argument_name in all_args:
+        if argument_name in keyword_arg_names:
+          remaining_args_are_keyword = True
+        else:
+          if remaining_args_are_keyword:
+            raise TypeError(
+                'Found a positional argument to call() after a non-input '
+                'argument. All arguments after "training" must be keyword '
+                'arguments, and are not tracked as inputs to the Layer.')
+        if remaining_args_are_keyword:
+          non_input_arg_values[argument_name] = bound_args[argument_name]
+        else:
+          input_arg_values.append(bound_args[argument_name])
+      if call_arg_spec.varargs is not None:
+        input_arg_values.extend(bound_args[call_arg_spec.varargs])
+      return input_arg_values, non_input_arg_values
+
+  def compute_output_shape(self, input_shape):
+    """Computes the output shape of the layer.
+
+    Assumes that the layer will be built
+    to match that input shape provided.
+
+    Arguments:
+        input_shape: Shape tuple (tuple of integers)
+            or list of shape tuples (one per output tensor of the layer).
+            Shape tuples can include None for free dimensions,
+            instead of an integer.
+
+    Returns:
+        An input shape tuple.
+    """
+    raise NotImplementedError
+
+  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
+    """Computes an output mask tensor.
+
+    Arguments:
+        inputs: Tensor or list of tensors.
+        mask: Tensor or list of tensors.
+
+    Returns:
+        None or a tensor (or list of tensors,
+            one per output tensor of the layer).
+    """
+    if not self.supports_masking:
+      if mask is not None:
+        if isinstance(mask, list):
+          if any(m is not None for m in mask):
+            raise TypeError('Layer ' + self.name + ' does not support masking, '
+                            'but was passed an input_mask: ' + str(mask))
+        else:
+          raise TypeError('Layer ' + self.name + ' does not support masking, '
+                          'but was passed an input_mask: ' + str(mask))
+      # masking not explicitly supported: return None as mask
+      return None
+    # if masking is explicitly supported, by default
+    # carry over the input mask
+    return mask
+
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
+
+    Arguments:
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
+    """
+    input_tensors = nest.flatten(input_tensors)
+    output_tensors = nest.flatten(output_tensors)
+
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      assert hasattr(x, '_keras_history')
+      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      inbound_layers.append(inbound_layer)
+      node_indices.append(node_index)
+      tensor_indices.append(tensor_index)
+
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        arguments=arguments)
+
+    # Update tensor history metadata.
+    for i in range(len(output_tensors)):
+      # The metadata attribute consists of 1) a layer instance
+      # 2) a node index for the layer, 3) a tensor index for the node.
+      # The allows layer reuse (multiple nodes per layer) and multi-output
+      # or multi-input layers (e.g. a layer can return multiple tensors,
+      # and each can be sent to a different layer).
+      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Private utility to retrieves an attribute (e.g. inputs) from a node.
+
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
+
+    Arguments:
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
+
+    Returns:
+        The layer's attribute `attr` at the node of index `node_index`.
+
+    Raises:
+        RuntimeError: If the layer has no inbound nodes, or if called in Eager
+        mode.
+        ValueError: If the index provided does not match any node.
+    """
+    if not self._inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self._inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self._inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self._inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
+
+  def get_input_mask_at(self, node_index):
+    """Retrieves the input mask tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A mask tensor
+        (or list of tensors if the layer has multiple inputs).
+    """
+    inputs = self.get_input_at(node_index)
+    if isinstance(inputs, list):
+      return [getattr(x, '_keras_mask', None) for x in inputs]
+    else:
+      return getattr(inputs, '_keras_mask', None)
+
+  def get_output_mask_at(self, node_index):
+    """Retrieves the output mask tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A mask tensor
+        (or list of tensors if the layer has multiple outputs).
+    """
+    output = self.get_output_at(node_index)
+    if isinstance(output, list):
+      return [getattr(x, '_keras_mask', None) for x in output]
+    else:
+      return getattr(output, '_keras_mask', None)
+
+  @property
+  def input_mask(self):
+    """Retrieves the input mask tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input mask tensor (potentially None) or list of input
+        mask tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    inputs = self.input
+    if isinstance(inputs, list):
+      return [getattr(x, '_keras_mask', None) for x in inputs]
+    else:
+      return getattr(inputs, '_keras_mask', None)
+
+  @property
+  def output_mask(self):
+    """Retrieves the output mask tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Output mask tensor (potentially None) or list of output
+        mask tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    output = self.output
+    if isinstance(output, list):
+      return [getattr(x, '_keras_mask', None) for x in output]
+    else:
+      return getattr(output, '_keras_mask', None)
+
+  def get_input_shape_at(self, node_index):
+    """Retrieves the input shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_shapes',
+                                             'input shape')
+
+  def get_output_shape_at(self, node_index):
+    """Retrieves the output shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_shapes',
+                                             'output shape')
+
+  def get_input_at(self, node_index):
+    """Retrieves the input tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_tensors',
+                                             'input')
+
+  def get_output_at(self, node_index):
+    """Retrieves the output tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_tensors',
+                                             'output')
+
+  @property
+  def input(self):
+    """Retrieves the input tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input tensor or list of input tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+      AttributeError: If no inbound nodes are found.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name +
+                           ' is not connected, no input to return.')
+    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
+
+  @property
+  def output(self):
+    """Retrieves the output tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one output,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+      Output tensor or list of output tensors.
+
+    Raises:
+      AttributeError: if the layer is connected to more than one incoming
+        layers.
+      RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
+    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
+
+  @property
+  def input_shape(self):
+    """Retrieves the input shape(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer, or if all inputs
+    have the same shape.
+
+    Returns:
+        Input shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per input tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined input_shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined input shape.')
+    all_input_shapes = set(
+        [str(node.input_shapes) for node in self._inbound_nodes])
+    if len(all_input_shapes) == 1:
+      input_shapes = self._inbound_nodes[0].input_shapes
+      if len(input_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in input_shapes
+        ]
+    else:
+      raise AttributeError('The layer "' + str(self.name) +
+                           ' has multiple inbound nodes, '
+                           'with different input shapes. Hence '
+                           'the notion of "input shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_input_shape_at(node_index)` '
+                           'instead.')
+
+  def count_params(self):
+    """Count the total number of scalars composing the weights.
+
+    Returns:
+        An integer count.
+
+    Raises:
+        ValueError: if the layer isn't yet built
+          (in which case its weights aren't yet defined).
+    """
+    if not self.built:
+      if self.__class__.__name__ == 'Sequential':
+        self.build()  # pylint: disable=no-value-for-parameter
+      else:
+        raise ValueError('You tried to call `count_params` on ' + self.name +
+                         ', but the layer isn\'t built. '
+                         'You can build it manually via: `' + self.name +
+                         '.build(batch_input_shape)`.')
+    weight_shapes = [w.get_shape().as_list() for w in self.weights]
+    return int(sum([np.prod(w) for w in weight_shapes]))
+
+  @property
+  def output_shape(self):
+    """Retrieves the output shape(s) of a layer.
+
+    Only applicable if the layer has one output,
+    or if all outputs have the same shape.
+
+    Returns:
+        Output shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per output tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined output shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined output shape.')
+    all_output_shapes = set(
+        [str(node.output_shapes) for node in self._inbound_nodes])
+    if len(all_output_shapes) == 1:
+      output_shapes = self._inbound_nodes[0].output_shapes
+      if len(output_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in output_shapes
+        ]
+    else:
+      raise AttributeError('The layer "%s"'
+                           ' has multiple inbound nodes, '
+                           'with different output shapes. Hence '
+                           'the notion of "output shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_output_shape_at(node_index)` '
+                           'instead.' % self.name)
+
+  @property
+  def inbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._inbound_nodes
+
+  @property
+  def outbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._outbound_nodes
+
+  def _assert_input_compatibility(self, inputs):
+    """Checks compatibility between the layer and provided inputs.
+
+    This checks that the tensor(s) `inputs` verify the input assumptions
+    of the layer (if any). If not, a clear and actional exception gets raised.
+
+    Arguments:
+        inputs: input tensor or list of input tensors.
+
+    Raises:
+        ValueError: in case of mismatch between
+            the provided inputs and the expectations of the layer.
+    """
+    if not self.input_spec:
+      return
+    if not isinstance(self.input_spec, (list, tuple)):
+      input_spec = nest.flatten(self.input_spec)
+    else:
+      input_spec = self.input_spec
+    inputs = nest.flatten(inputs)
+    if len(inputs) != len(input_spec):
+      raise ValueError('Layer ' + self.name + ' expects ' +
+                       str(len(input_spec)) + ' inputs, '
+                       'but it received ' + str(len(inputs)) +
+                       ' input tensors. Inputs received: ' + str(inputs))
+    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+      if spec is None:
+        continue
+
+      if (spec.ndim is not None or
+          spec.min_ndim is not None or
+          spec.max_ndim is not None):
+        if x.get_shape().ndims is None:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'its rank is undefined, but the layer requires a '
+                           'defined rank.')
+
+      # Check ndim.
+      if spec.ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim != spec.ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                           str(ndim) + '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      if spec.max_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim > spec.max_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected max_ndim=' + str(spec.max_ndim) +
+                           ', found ndim=' + str(ndim))
+      if spec.min_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim < spec.min_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           ': expected min_ndim=' + str(spec.min_ndim) +
+                           ', found ndim=' + str(ndim) +
+                           '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      # Check dtype.
+      if spec.dtype is not None:
+        if x.dtype != spec.dtype:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected dtype=' + str(spec.dtype) +
+                           ', found dtype=' + str(x.dtype))
+      # Check specific shape axes.
+      if spec.axes:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for axis, value in spec.axes.items():
+            if hasattr(value, 'value'):
+              value = value.value
+            if value is not None and shape[int(axis)] not in {value, None}:
+              raise ValueError(
+                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
+                  ' incompatible with the layer: expected axis ' + str(axis) +
+                  ' of input shape to have value ' + str(value) +
+                  ' but received input with shape ' + str(shape))
+      # Check shape.
+      if spec.shape is not None:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for spec_dim, dim in zip(spec.shape, shape):
+            if spec_dim is not None and dim is not None:
+              if spec_dim != dim:
+                raise ValueError('Input ' + str(input_index) +
+                                 ' is incompatible with layer ' + self.name +
+                                 ': expected shape=' + str(spec.shape) +
+                                 ', found shape=' + str(shape))
+
+  def set_weights(self, weights):
+    """Sets the weights of the layer, from Numpy arrays.
+
+    Arguments:
+        weights: a list of Numpy arrays. The number
+            of arrays and their shape must match
+            number of the dimensions of the weights
+            of the layer (i.e. it should match the
+            output of `get_weights`).
+
+    Raises:
+        ValueError: If the provided weights list does not match the
+            layer's specifications.
+    """
+    params = self.weights
+    if len(params) != len(weights):
+      raise ValueError('You called `set_weights(weights)` on layer "' +
+                       self.name + '" with a  weight list of length ' +
+                       str(len(weights)) + ', but the layer was expecting ' +
+                       str(len(params)) + ' weights. Provided weights: ' +
+                       str(weights)[:50] + '...')
+    if not params:
+      return
+    weight_value_tuples = []
+    param_values = backend.batch_get_value(params)
+    for pv, p, w in zip(param_values, params, weights):
+      if pv.shape != w.shape:
+        raise ValueError('Layer weight shape ' + str(pv.shape) +
+                         ' not compatible with '
+                         'provided weight shape ' + str(w.shape))
+      weight_value_tuples.append((p, w))
+    backend.batch_set_value(weight_value_tuples)
+
+  def get_weights(self):
+    """Returns the current weights of the layer.
+
+    Returns:
+        Weights values as a list of numpy arrays.
+    """
+    params = self.weights
+    return backend.batch_get_value(params)
+
+  def get_config(self):
+    """Returns the config of the layer.
+
+    A layer config is a Python dictionary (serializable)
+    containing the configuration of a layer.
+    The same layer can be reinstantiated later
+    (without its trained weights) from this configuration.
+
+    The config of a layer does not include connectivity
+    information, nor the layer class name. These are handled
+    by `Network` (one layer of abstraction above).
+
+    Returns:
+        Python dictionary.
+    """
+    config = {'name': self.name, 'trainable': self.trainable}
+    if hasattr(self, '_batch_input_shape'):
+      config['batch_input_shape'] = self._batch_input_shape
+    if hasattr(self, 'dtype'):
+      config['dtype'] = self.dtype
+    return config
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a layer from its config.
+
+    This method is the reverse of `get_config`,
+    capable of instantiating the same layer from the config
+    dictionary. It does not handle layer connectivity
+    (handled by Network), nor weights (handled by `set_weights`).
+
+    Arguments:
+        config: A Python dictionary, typically the
+            output of get_config.
+
+    Returns:
+        A layer instance.
+    """
+    return cls(**config)
+
+
+@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+class Node(object):
+  """A `Node` describes the connectivity between two layers.
+
+  Each time a layer is connected to some new input,
+  a node is added to `layer._inbound_nodes`.
+  Each time the output of a layer is used by another layer,
+  a node is added to `layer._outbound_nodes`.
+
+  Arguments:
+      outbound_layer: the layer that takes
+          `input_tensors` and turns them into `output_tensors`
+          (the node gets created when the `call`
+          method of the layer was called).
+      inbound_layers: a list of layers, the same length as `input_tensors`,
+          the layers from where `input_tensors` originate.
+      node_indices: a list of integers, the same length as `inbound_layers`.
+          `node_indices[i]` is the origin node of `input_tensors[i]`
+          (necessary since each inbound layer might have several nodes,
+          e.g. if the layer is being shared with a different data stream).
+      tensor_indices: a list of integers,
+          the same length as `inbound_layers`.
+          `tensor_indices[i]` is the index of `input_tensors[i]` within the
+          output of the inbound layer
+          (necessary since each inbound layer might
+          have multiple tensor outputs, with each one being
+          independently manipulable).
+      input_tensors: list of input tensors.
+      output_tensors: list of output tensors.
+      arguments: dictionary of keyword arguments that were passed to the
+          `call` method of the layer at the call that created the node.
+
+  `node_indices` and `tensor_indices` are basically fine-grained coordinates
+  describing the origin of the `input_tensors`.
+
+  A node from layer A to layer B is added to:
+    - A._outbound_nodes
+    - B._inbound_nodes
+  """
+
+  def __init__(self,
+               outbound_layer,
+               inbound_layers,
+               node_indices,
+               tensor_indices,
+               input_tensors,
+               output_tensors,
+               arguments=None):
+    # Layer instance (NOT a list).
+    if isinstance(outbound_layer, list):
+      raise ValueError(
+          '`outbound_layer` should be a layer instance, not a list.')
+    # this is the layer that takes a list of input tensors
+    # and turns them into a list of output tensors.
+    # the current node will be added to
+    # the inbound_nodes of outbound_layer.
+    self.outbound_layer = outbound_layer
+
+    # The following 3 properties describe where
+    # the input tensors come from: which layers,
+    # and for each layer, which node and which
+    # tensor output of each node.
+
+    # List of layer instances.
+    self.inbound_layers = inbound_layers
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.node_indices = node_indices
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.tensor_indices = tensor_indices
+
+    # Following 2 properties:
+    # tensor inputs and outputs of outbound_layer.
+
+    # List of tensors. 1:1 mapping with inbound_layers.
+    self.input_tensors = input_tensors
+    # List of tensors, created by outbound_layer.call().
+    self.output_tensors = output_tensors
+
+    # Following 2 properties: input and output shapes.
+
+    # List of shape tuples, shapes of input_tensors.
+    self.input_shapes = [backend.int_shape(x) for x in input_tensors]
+    # List of shape tuples, shapes of output_tensors.
+    self.output_shapes = [backend.int_shape(x) for x in output_tensors]
+
+    # Optional keyword arguments to layer's `call`.
+    self.arguments = arguments
+
+    # Add nodes to all layers involved.
+    for layer in inbound_layers:
+      if layer is not None:
+        # For compatibility with external Keras, we use the deprecated
+        # accessor here.
+        layer.outbound_nodes.append(self)
+    # For compatibility with external Keras, we use the deprecated
+    # accessor here.
+    outbound_layer.inbound_nodes.append(self)
+
+  def get_config(self):
+    inbound_names = []
+    for layer in self.inbound_layers:
+      if layer:
+        inbound_names.append(layer.name)
+      else:
+        inbound_names.append(None)
+    return {
+        'outbound_layer': self.outbound_layer.name,
+        'inbound_layers': inbound_names,
+        'node_indices': self.node_indices,
+        'tensor_indices': self.tensor_indices
+    }
+
+
+class DeferredTensor(object):
+  """Tensor-like object used to build graphs of layers in Eager mode.
+
+  When calling a layer on a DeferredTensor, the layer will not perform any
+  computation and will simply perfom shape inference to return new
+  DeferredTensors with appropriate shape information. Thus DeferredTensor
+  behaves like a graph-mode Tensor when manipulated by layers.
+  """
+
+  def __init__(self, shape, dtype, name=None):
+    self.shape = tensor_shape.TensorShape(shape)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
+    self.name = name
+
+  def get_shape(self):
+    return self.shape
+
+  def __str__(self):
+    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
+                                                         self.get_shape(),
+                                                         self.dtype.name)
+
+  def __repr__(self):
+    return "<DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
+                                                        self.get_shape(),
+                                                        self.dtype.name)
+
+
+def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                      zero_based=False):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+    name_uid_map: An optional defaultdict(int) to use when creating unique
+      names. If None (default), uses a per-Graph dictionary.
+    avoid_names: An optional set or dict with names which should not be used. If
+      None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
+  ```
+  """
+  if name_uid_map is None:
+    name_uid_map = get_default_graph_uid_map()
+  if avoid_names is None:
+    avoid_names = set()
+  proposed_name = None
+  while proposed_name is None or proposed_name in avoid_names:
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
+  return proposed_name
+
+
+def have_all_keras_metadata(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  return all([hasattr(x, '_keras_history') for x in iterable])
+
+
+def collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = nest.flatten(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_mask'):
+      mask = x._keras_mask  # pylint: disable=protected-access
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def get_default_graph_uid_map():
+  # TODO(fchollet): refactor this into backend.
+  graph = ops.get_default_graph()
+  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
+  if name_uid_map is None:
+    name_uid_map = collections.defaultdict(int)
+    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
+  return name_uid_map
+
+
+def make_variable(name,
+                  shape=None,
+                  dtype=dtypes.float32,
+                  initializer=None,
+                  partition_info=None,
+                  trainable=True,
+                  caching_device=None,
+                  validate_shape=True,
+                  constraint=None,
+                  use_resource=None,
+                  partitioner=None):  # pylint: disable=unused-argument
+  """Temporary util to create a variable (relies on `variable_scope.variable`).
+
+  Some reuse-related technicalities prevent us from using
+  `variable_scope.get_variable()` directly, so we use a subcomponent
+  that has fewer constraints (`variable_scope.variable()`).
+
+  In the longer term, it seems like a similar "default variable creator" method
+  should exist in `CheckpointableBase` instead. When this happens, we can get
+  rid of this temporary solution.
+
+  TODO(fchollet): remove this method when no longer needed.
+  TODO(fchollet): handle `partitioner` argument.
+
+  Arguments:
+    name: Variable name.
+    shape: Variable shape.
+    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+    initializer: Initializer instance (callable).
+    partition_info: Not handled at this time.
+    trainable: Whether the variable should be part of the layer's
+      "trainable_variables" (e.g. variables, biases)
+      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+      Note, if the current variable scope is marked as non-trainable
+      then this parameter is ignored and any added variables are also
+      marked as non-trainable.
+    caching_device: Passed to `vs.variable`.
+    validate_shape: Passed to `vs.variable`.
+    constraint: Constraint instance (callable).
+    use_resource: Whether to use a `ResourceVariable`.
+    partitioner: Not handled at this time.
+
+  Returns:
+    Variable instance.
+  """
+  initializing_from_value = False
+  if initializer is not None and not callable(initializer):
+    initializing_from_value = True
+
+  with ops.init_scope():
+    if initializing_from_value:
+      init_val = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+          shape, dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+  if use_resource is None:
+    use_resource = True
+
+  v = vs.variable(
+      initial_value=init_val,
+      name=name,
+      trainable=trainable,
+      caching_device=caching_device,
+      dtype=variable_dtype,
+      validate_shape=validate_shape,
+      constraint=constraint,
+      use_resource=use_resource)
+  return v
diff --git a/tensorflow/python/keras/_impl/keras/engine/input_layer.py b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd9dcbe3c576851123dfcabe3e36379019627ac5
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
@@ -0,0 +1,230 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Input layer code (`Input` and `InputLayer`).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.layers.InputLayer')
+class InputLayer(base_layer.Layer):
+  """Layer to be used as an entry point into a Network (a graph of layers).
+
+  It can either wrap an existing tensor (pass an `input_tensor` argument)
+  or create its a placeholder tensor (pass arguments `input_shape`, and
+  optionally, `dtype`).
+
+  It is generally recommend to use the functional layer API via `Input`,
+  (which creates an `InputLayer`) without directly using `InputLayer`.
+
+  Arguments:
+      input_shape: Shape tuple (not including the batch axis), or `TensorShape`
+        instance (not including the batch axis).
+      batch_size: Optional input batch size (integer or None).
+      dtype: Datatype of the input.
+      input_tensor: Optional tensor to use as layer input
+          instead of creating a placeholder.
+      sparse: Boolean, whether the placeholder created
+          is meant to be sparse.
+      name: Name of the layer (string).
+  """
+
+  def __init__(self,
+               input_shape=None,
+               batch_size=None,
+               dtype=None,
+               input_tensor=None,
+               sparse=False,
+               name=None,
+               **kwargs):
+    if 'batch_input_shape' in kwargs:
+      batch_input_shape = kwargs.pop('batch_input_shape')
+      if input_shape and batch_input_shape:
+        raise ValueError('Only provide the input_shape OR '
+                         'batch_input_shape argument to '
+                         'InputLayer, not both at the same time.')
+      batch_size = batch_input_shape[0]
+      input_shape = batch_input_shape[1:]
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments:', kwargs.keys())
+
+    if not name:
+      prefix = 'input'
+      name = prefix + '_' + str(K.get_uid(prefix))
+
+    if not dtype:
+      if input_tensor is None:
+        dtype = K.floatx()
+      else:
+        dtype = K.dtype(input_tensor)
+    super(InputLayer, self).__init__(dtype=dtype, name=name)
+    self.built = True
+    self.sparse = sparse
+    self.batch_size = batch_size
+
+    if isinstance(input_shape, tensor_shape.TensorShape):
+      input_shape = tuple(input_shape.as_list())
+
+    if input_tensor is None:
+      if input_shape is not None:
+        batch_input_shape = (batch_size,) + tuple(input_shape)
+      else:
+        batch_input_shape = None
+
+      if context.executing_eagerly():
+        # In eager mode, create a temporary placeholder to call the layer on.
+        input_tensor = base_layer.DeferredTensor(  # pylint: disable=protected-access
+            shape=batch_input_shape,
+            dtype=dtype,
+            name=self.name)
+      else:
+        # In graph mode, create a graph placeholder to call the layer on.
+        if sparse:
+          input_tensor = array_ops.sparse_placeholder(
+              shape=batch_input_shape,
+              dtype=dtype,
+              name=self.name)
+        else:
+          input_tensor = array_ops.placeholder(
+              shape=batch_input_shape,
+              dtype=dtype,
+              name=self.name)
+
+      # For compatibility with Keras API.
+      self.is_placeholder = True
+      self._batch_input_shape = batch_input_shape
+    else:
+      # For compatibility with Keras API.
+      self.is_placeholder = False
+      self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
+
+    # Create an input node to add to self.outbound_node
+    # and set output_tensors' _keras_history.
+    input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
+    base_layer.Node(
+        self,
+        inbound_layers=[],
+        node_indices=[],
+        tensor_indices=[],
+        input_tensors=[input_tensor],
+        output_tensors=[input_tensor])
+
+  def get_config(self):
+    config = {
+        'batch_input_shape': self._batch_input_shape,
+        'dtype': self.dtype,
+        'sparse': self.sparse,
+        'name': self.name
+    }
+    return config
+
+
+@tf_export('keras.layers.Input', 'keras.Input')
+def Input(  # pylint: disable=invalid-name
+    shape=None,
+    batch_size=None,
+    name=None,
+    dtype=None,
+    sparse=False,
+    tensor=None,
+    **kwargs):
+  """`Input()` is used to instantiate a Keras tensor.
+
+  A Keras tensor is a tensor object from the underlying backend
+  (Theano or TensorFlow), which we augment with certain
+  attributes that allow us to build a Keras model
+  just by knowing the inputs and outputs of the model.
+
+  For instance, if a, b and c are Keras tensors,
+  it becomes possible to do:
+  `model = Model(input=[a, b], output=c)`
+
+  The added Keras attribute is:
+      `_keras_history`: Last layer applied to the tensor.
+          the entire layer graph is retrievable from that layer,
+          recursively.
+
+  Arguments:
+      shape: A shape tuple (integers), not including the batch size.
+          For instance, `shape=(32,)` indicates that the expected input
+          will be batches of 32-dimensional vectors.
+      batch_size: optional static batch size (integer).
+      name: An optional name string for the layer.
+          Should be unique in a model (do not reuse the same name twice).
+          It will be autogenerated if it isn't provided.
+      dtype: The data type expected by the input, as a string
+          (`float32`, `float64`, `int32`...)
+      sparse: A boolean specifying whether the placeholder
+          to be created is sparse.
+      tensor: Optional existing tensor to wrap into the `Input` layer.
+          If set, the layer will not create a placeholder tensor.
+      **kwargs: deprecated arguments support.
+
+  Returns:
+      A tensor.
+
+  Example:
+
+      ```python
+      # this is a logistic regression in Keras
+      x = Input(shape=(32,))
+      y = Dense(16, activation='softmax')(x)
+      model = Model(x, y)
+      ```
+
+  Raises:
+    ValueError: in case of invalid arguments.
+  """
+  if 'batch_shape' in kwargs:
+    batch_shape = kwargs.pop('batch_shape')
+    if shape and batch_shape:
+      raise ValueError('Only provide the shape OR '
+                       'batch_shape argument to '
+                       'Input, not both at the same time.')
+    batch_size = batch_shape[0]
+    shape = batch_shape[1:]
+  if kwargs:
+    raise ValueError('Unrecognized keyword arguments:', kwargs.keys())
+
+  if dtype is None:
+    dtype = K.floatx()
+  if not shape and tensor is None:
+    raise ValueError('Please provide to Input either a `shape`'
+                     ' or a `tensor` argument. Note that '
+                     '`shape` does not include the batch '
+                     'dimension.')
+  input_layer = InputLayer(
+      input_shape=shape,
+      batch_size=batch_size,
+      name=name,
+      dtype=dtype,
+      sparse=sparse,
+      input_tensor=tensor)
+  # Return tensor including `_keras_history`.
+  # Note that in this case train_output and test_output are the same pointer.
+  outputs = input_layer._inbound_nodes[0].output_tensors
+  if len(outputs) == 1:
+    return outputs[0]
+  else:
+    return outputs
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0229be346fc6927755703448d40146ed0620a1d
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -0,0 +1,1643 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""A `Network` is way to compose layers: the topological form of a `Model`.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import json
+import os
+import weakref
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import backend
+from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.keras._impl.keras.engine import saving
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
+
+
+# pylint: disable=g-import-not-at-top
+try:
+  import h5py
+except ImportError:
+  h5py = None
+
+try:
+  import yaml
+except ImportError:
+  yaml = None
+# pylint: enable=g-import-not-at-top
+
+
+class Network(base_layer.Layer):
+  """A `Network` is a composition of layers.
+
+  It is the topological form of a "model". A `Model`
+  is simply a `Network` with added training routines.
+  """
+
+  def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
+    # Signature detection
+    if (len(args) == 2 or
+        len(args) == 1 and 'outputs' in kwargs or
+        'inputs' in kwargs and 'outputs' in kwargs):
+      # Graph network
+      self._init_graph_network(*args, **kwargs)
+    else:
+      # Subclassed network
+      self._init_subclassed_network(**kwargs)
+
+  def _base_init(self, name=None):
+    # The following are implemented as property functions:
+    # self.trainable_weights
+    # self.non_trainable_weights
+    # self.input_spec
+    # self.losses
+    # self.updates
+
+    self._init_set_name(name, zero_based=True)
+    self._activity_regularizer = None
+    # This acts just like the `trainable` attribute of any layer instance.
+    # It does not affect users of the underlying layers, only users of the
+    # Network instance.
+    self.trainable = True
+    self._is_compiled = False
+    self._expects_training_arg = False
+
+    self.supports_masking = False
+    if not hasattr(self, 'optimizer'):
+      # Don't reset optimizer if already set.
+      self.optimizer = None
+
+    # Private attributes to implement compatibility with Layer.
+    self._updates = []  # Used in symbolic mode only.
+    self._losses = []   # Used in symbolic mode only.
+    self._scope = None  # Never used.
+    self._reuse = None  # Never used.
+    if context.executing_eagerly():
+      self._graph = None
+    else:
+      self._graph = ops.get_default_graph()  # Used in symbolic mode only.
+      # A Network does not create weights of its own, thus has no dtype.
+    self._dtype = None
+
+    # All layers in order of horizontal graph traversal.
+    # Entries are unique. Includes input and output layers.
+    self._layers = []
+
+    # Used in symbolic mode only, only in conjonction with graph-networks
+    self._outbound_nodes = []
+    self._inbound_nodes = []
+
+    self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
+        weakref.ref(self))
+    # A zero-argument function which should be called and set back to None as
+    # soon as the network is built (only applicable to subclassed Models). Runs
+    # restore operations when graph building.
+    self._in_progress_restore_finalizer = None
+
+  def _init_graph_network(self, inputs, outputs, name=None):
+    self._uses_inputs_arg = True
+    # Normalize and set self.inputs, self.outputs.
+    if isinstance(inputs, (list, tuple)):
+      self.inputs = list(inputs)  # Tensor or list of tensors.
+    else:
+      self.inputs = [inputs]
+    if isinstance(outputs, (list, tuple)):
+      self.outputs = list(outputs)
+    else:
+      self.outputs = [outputs]
+
+    # User-provided argument validation.
+    if context.executing_eagerly():
+      # Check that all inputs/outputs are DeferredTensors.
+      for tensor in self.inputs:
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
+          raise TypeError('When eager execution is enabled, '
+                          'inputs must come from a call to '
+                          '`tf.keras.Input` (called after '
+                          'tfe.enable_eager_execution()). '
+                          'Received invalid input: ' + str(tensor))
+      for tensor in self.outputs:
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
+          raise TypeError('When eager execution is enabled, '
+                          'outputs must come from a call to '
+                          'a layer (called after '
+                          'tfe.enable_eager_execution()). '
+                          'Received invalid output: ' + str(tensor))
+    # Check for redundancy in inputs.
+    if len(set(self.inputs)) != len(self.inputs):
+      raise ValueError('The list of inputs passed to the model '
+                       'is redundant. '
+                       'All inputs should only appear once.'
+                       ' Found: ' + str(self.inputs))
+    for x in self.inputs:
+      # Check that x has appropriate `_keras_history` metadata.
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Input tensors to a ' + cls_name + ' ' +
+                         'must come from `tf.layers.Input`. '
+                         'Received: ' + str(x) +
+                         ' (missing previous layer metadata).')
+      # Check that x is an input tensor.
+      # pylint: disable=protected-access
+      layer, node_index, tensor_index = x._keras_history
+      if len(layer._inbound_nodes) > 1 or (
+          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
+        cls_name = self.__class__.__name__
+        logging.warning(cls_name + ' inputs must come from '
+                        '`tf.layers.Input` (thus holding past layer metadata), '
+                        'they cannot be the output of '
+                        'a previous non-Input layer. '
+                        'Here, a tensor specified as '
+                        'input to "' + self.name + '" was not an Input tensor, '
+                        'it was generated by layer ' + layer.name + '.\n'
+                        'Note that input tensors are '
+                        'instantiated via `tensor = tf.layers.Input(shape)`.\n'
+                        'The tensor that caused the issue was: ' + str(x.name))
+    for x in self.outputs:
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Output tensors to a ' + cls_name + ' must be '
+                         'the output of a TensorFlow `Layer` '
+                         '(thus holding past layer metadata). Found: ' + str(x))
+
+    self._base_init(name=name)
+    self._compute_previous_mask = (
+        'mask' in tf_inspect.getargspec(self.call).args or
+        hasattr(self, 'compute_mask'))
+    # A Network does not create weights of its own, thus it is already
+    # built.
+    self.built = True
+    self._is_graph_network = True
+
+    self._input_layers = []
+    self._output_layers = []
+    self._input_coordinates = []
+    self._output_coordinates = []
+
+    # This is for performance optimization when calling the Network on new
+    # inputs. Every time the Network is called on a set on input tensors,
+    # we compute the output tensors, output masks and output shapes in one pass,
+    # then cache them here. When any of these outputs is queried later, we
+    # retrieve it from there instead of recomputing it.
+    self._output_mask_cache = {}
+    self._output_tensor_cache = {}
+    self._output_shape_cache = {}
+
+    # Build self._output_layers:
+    for x in self.outputs:
+      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      self._output_layers.append(layer)
+      self._output_coordinates.append((layer, node_index, tensor_index))
+
+    # Build self._input_layers:
+    for x in self.inputs:
+      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      # It's supposed to be an input layer, so only one node
+      # and one tensor output.
+      assert node_index == 0
+      assert tensor_index == 0
+      self._input_layers.append(layer)
+      self._input_coordinates.append((layer, node_index, tensor_index))
+
+    # Keep track of the network's nodes and layers.
+    nodes, nodes_by_depth, layers, layers_by_depth = _map_graph_network(
+        self.inputs, self.outputs)
+    self._network_nodes = nodes
+    self._nodes_by_depth = nodes_by_depth
+    self._layers = layers
+    self._layers_by_depth = layers_by_depth
+
+    self._track_layers(layers)
+
+    # Create the node linking internal inputs to internal outputs.
+    base_layer.Node(
+        outbound_layer=self,
+        inbound_layers=[],
+        node_indices=[],
+        tensor_indices=[],
+        input_tensors=self.inputs,
+        output_tensors=self.outputs)
+
+    # Fill in the output mask cache.
+    masks = []
+    for x in self.inputs:
+      mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
+      masks.append(mask)
+    mask_cache_key = (generic_utils.object_list_uid(self.inputs) + '_' +
+                      generic_utils.object_list_uid(masks))
+    masks = []
+    for x in self.outputs:
+      mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
+      masks.append(mask)
+    if len(masks) == 1:
+      mask = masks[0]
+    else:
+      mask = masks
+    self._output_mask_cache[mask_cache_key] = mask
+
+    # Build self.input_names and self.output_names.
+    self.input_names = []
+    self.output_names = []
+    self._feed_input_names = []
+    self._feed_inputs = []
+    self._feed_input_shapes = []
+    for i, layer in enumerate(self._input_layers):
+      self.input_names.append(layer.name)
+      if layer.is_placeholder:
+        self._feed_input_names.append(layer.name)
+        self._feed_input_shapes.append(backend.int_shape(self.inputs[i]))
+        # layer.input gives an error in eager mode
+        if not context.executing_eagerly():
+          self._feed_inputs.append(layer.input)
+    for layer in self._output_layers:
+      self.output_names.append(layer.name)
+
+  def _init_subclassed_network(self, name=None):
+    self._base_init(name=name)
+    self._is_graph_network = False
+    call_args = tf_inspect.getargspec(self.call).args
+    if 'training' in call_args:
+      self._expects_training_arg = True
+    else:
+      self._expects_training_arg = False
+    if 'inputs' in call_args:
+      self._uses_inputs_arg = True
+    else:
+      self._uses_inputs_arg = False
+    self.outputs = None
+    self.inputs = None
+    self.built = False
+
+  def _track_layers(self, layers):
+    """Add Checkpointable dependencies on a list of Layers."""
+    weight_layer_index = 0
+    for layer_index, layer in enumerate(layers):
+      if layer.weights:
+        # Keep a separate index for layers which have weights. This allows users
+        # to insert Layers without weights anywhere in the network without
+        # breaking checkpoints.
+        self._track_checkpointable(
+            layer, name='layer_with_weights-%d' % weight_layer_index,
+            overwrite=True)
+        weight_layer_index += 1
+      # Even if it doesn't have weights, we should still track everything in
+      # case it has/will have Checkpointable dependencies.
+      self._track_checkpointable(
+          layer, name='layer-%d' % layer_index, overwrite=True)
+
+  def __setattr__(self, name, value):
+    if isinstance(value, (base_layer.Layer, Network)):
+      try:
+        is_graph_network = self._is_graph_network
+      except AttributeError:
+        raise RuntimeError('It looks like you are subclassing `Model` and you '
+                           'forgot to call `super(YourClass, self).__init__()`.'
+                           ' Always start with this line.')
+      if not is_graph_network:
+        if value not in self._layers:
+          self._layers.append(value)
+          if hasattr(value, '_use_resource_variables'):
+            # In subclassed models, legacy layers (tf.layers) must always use
+            # resource variables.
+            value._use_resource_variables = True
+    if isinstance(value, checkpointable.CheckpointableBase):
+      # Layer (and therefore Network/Model) inherit from CheckpointableBase
+      # rather than Checkpointable, which means there is no Checkpointable
+      # __setattr__ override (it would be a performance issue for functional
+      # layers). Therefore Model tracks Checkpointable objects itself.
+      self._track_checkpointable(
+          checkpointable=value, name=name, overwrite=True)
+    super(Network, self).__setattr__(name, value)
+
+  def add_variable(self, name, shape, dtype=None, initializer=None,
+                   regularizer=None, trainable=True, constraint=None):
+    raise NotImplementedError('`add_variable` is not supported on Networks.')
+
+  def add_loss(self, *args, **kwargs):
+    if context.executing_eagerly():
+      raise NotImplementedError('`add_loss` is not supported on Networks '
+                                'when eager execution is enabled.')
+    super(Network, self).add_loss(*args, **kwargs)
+
+  @property
+  def uses_learning_phase(self):
+    return any(
+        [getattr(x, '_uses_learning_phase', False) for x in self.outputs])
+
+  @property
+  def stateful(self):
+    return any([(hasattr(layer, 'stateful') and layer.stateful)
+                for layer in self.layers])
+
+  def reset_states(self):
+    for layer in self.layers:
+      if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False):
+        layer.reset_states()
+
+  @property
+  def state_updates(self):
+    """Returns the `updates` from all layers that are stateful.
+
+    This is useful for separating training updates and
+    state updates, e.g. when we need to update a layer's internal state
+    during prediction.
+
+    Returns:
+        A list of update ops.
+    """
+    state_updates = []
+    for layer in self.layers:
+      if getattr(layer, 'stateful', False):
+        if hasattr(layer, 'updates'):
+          state_updates += layer.updates
+    return state_updates
+
+  def get_weights(self):
+    """Retrieves the weights of the model.
+
+    Returns:
+        A flat list of Numpy arrays.
+    """
+    weights = []
+    for layer in self.layers:
+      weights += layer.weights
+    return backend.batch_get_value(weights)
+
+  def set_weights(self, weights):
+    """Sets the weights of the model.
+
+    Arguments:
+        weights: A list of Numpy arrays with shapes and types matching
+            the output of `model.get_weights()`.
+    """
+    tuples = []
+    for layer in self.layers:
+      num_param = len(layer.weights)
+      layer_weights = weights[:num_param]
+      for sw, w in zip(layer.weights, layer_weights):
+        tuples.append((sw, w))
+      weights = weights[num_param:]
+    backend.batch_set_value(tuples)
+
+  def compute_mask(self, inputs, mask):
+    if not self._is_graph_network:
+      return None
+
+    inputs = generic_utils.to_list(inputs)
+    if mask is None:
+      masks = [None for _ in range(len(inputs))]
+    else:
+      masks = generic_utils.to_list(mask)
+    cache_key = (generic_utils.object_list_uid(inputs)
+                 + '_' + generic_utils.object_list_uid(masks))
+    if cache_key in self._output_mask_cache:
+      return self._output_mask_cache[cache_key]
+    else:
+      _, output_masks = self._run_internal_graph(inputs, mask=masks)
+      return output_masks
+
+  @property
+  def layers(self):
+    return self._layers
+
+  def get_layer(self, name=None, index=None):
+    """Retrieves a layer based on either its name (unique) or index.
+
+    If `name` and `index` are both provided, `index` will take precedence.
+    Indices are based on order of horizontal graph traversal (bottom-up).
+
+    Arguments:
+        name: String, name of layer.
+        index: Integer, index of layer.
+
+    Returns:
+        A layer instance.
+
+    Raises:
+        ValueError: In case of invalid layer name or index.
+    """
+    # TODO(fchollet): We could build a dictionary based on layer names
+    # since they are constant, but we have not done that yet.
+    if index is not None:
+      if len(self.layers) <= index:
+        raise ValueError('Was asked to retrieve layer at index ' + str(index) +
+                         ' but model only has ' + str(len(self.layers)) +
+                         ' layers.')
+      else:
+        return self.layers[index]
+    else:
+      if not name:
+        raise ValueError('Provide either a layer name or layer index.')
+    for layer in self.layers:
+      if layer.name == name:
+        return layer
+    raise ValueError('No such layer: ' + name)
+
+  @property
+  def updates(self):
+    """Retrieves the network's updates.
+
+    Will only include updates that are either
+    unconditional, or conditional on inputs to this model
+    (e.g. will not include updates that were created by layers of this model
+    outside of the model).
+
+    Effectively, `network.updates` behaves like `layer.updates`.
+
+    Concrete example:
+
+    ```python
+      bn = keras.layers.BatchNormalization()
+      x1 = keras.layers.Input(shape=(10,))
+      _ = bn(x1)  # This creates 2 updates.
+
+      x2 = keras.layers.Input(shape=(10,))
+      y2 = bn(x2)  # This creates 2 more updates.
+
+      # The BN layer has now 4 updates.
+      self.assertEqual(len(bn.updates), 4)
+
+      # Let's create a model from x2 to y2.
+      model = keras.models.Model(x2, y2)
+
+      # The model does not list all updates from its underlying layers,
+      # but only the updates that are relevant to it. Updates created by layers
+      # outside of the model are discarded.
+      self.assertEqual(len(model.updates), 2)
+
+      # If you keep calling the model, you append to its updates, just like
+      # what happens for a layer.
+      x3 = keras.layers.Input(shape=(10,))
+      y3 = model(x3)
+      self.assertEqual(len(model.updates), 4)
+
+      # But if you call the inner BN layer independently, you don't affect
+      # the model's updates.
+      x4 = keras.layers.Input(shape=(10,))
+      _ = bn(x4)
+      self.assertEqual(len(model.updates), 4)
+    ```
+
+    Returns:
+        A list of update ops.
+    """
+    if context.executing_eagerly():
+      return []
+
+    if not self.trainable and not self.stateful:
+      return []
+
+    updates = []
+    for layer in self.layers:
+      updates += layer.updates
+
+    # `updates` might contain irrelevant updates, so it needs to be filtered
+    # with respect to inputs the model has been called on.
+    if self.inputs:
+      relevant_inputs = self.inputs[:]
+    else:
+      relevant_inputs = []
+    for i in range(1, len(self._inbound_nodes)):
+      inputs = self.get_input_at(i)
+      if isinstance(inputs, list):
+        relevant_inputs += inputs
+      else:
+        relevant_inputs.append(inputs)
+    reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates)
+    relevant_conditional_updates = [x for x in updates if x in reachable]
+    unconditional_updates = [
+        x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
+    # A layer could be used multiple times in a nested structure,
+    # so the updates list must be de-duped.
+    return list(set(
+        relevant_conditional_updates + unconditional_updates + self._updates))
+
+  @property
+  def losses(self):
+    """Retrieves the network's losses.
+
+    Will only include losses that are either
+    unconditional, or conditional on inputs to this model
+    (e.g. will not include losses that depend on tensors
+    that aren't inputs to this model).
+
+    Returns:
+        A list of loss tensors.
+    """
+    losses = []
+    for layer in self.layers:
+      losses += layer.losses
+    if context.executing_eagerly():
+      return losses
+
+    if self.inputs:
+      relevant_inputs = self.inputs[:]
+    else:
+      relevant_inputs = []
+    for i in range(1, len(self._inbound_nodes)):
+      inputs = self.get_input_at(i)
+      if isinstance(inputs, list):
+        relevant_inputs += inputs
+      else:
+        relevant_inputs.append(inputs)
+    reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, losses)
+    relevant_conditional_losses = [x for x in losses if x in reachable]
+    unconditional_losses = [
+        x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
+    return list(set(
+        relevant_conditional_losses + unconditional_losses + self._losses))
+
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    weights = []
+    for layer in self.layers:
+      weights += layer.trainable_weights
+    return weights
+
+  @property
+  def non_trainable_weights(self):
+    weights = []
+    for layer in self.layers:
+      weights += layer.non_trainable_weights
+    if not self.trainable:
+      trainable_weights = []
+      for layer in self.layers:
+        trainable_weights += layer.trainable_weights
+      return trainable_weights + weights
+    return weights
+
+  @property
+  def input_spec(self):
+    """Gets the network's input specs.
+
+    Returns:
+        A list of `InputSpec` instances (one per input to the model)
+            or a single instance if the model has only one input.
+    """
+    # If not a graph network, can't assume anything.
+    if not self._is_graph_network:
+      return None
+
+    specs = []
+    for layer in self._input_layers:
+      if layer.input_spec is None:
+        specs.append(None)
+      else:
+        if not isinstance(layer.input_spec, list):
+          raise TypeError('Layer ' + layer.name +
+                          ' has an input_spec attribute that '
+                          'is not a list. We expect a list. '
+                          'Found input_spec = ' + str(layer.input_spec))
+        specs += layer.input_spec
+    if len(specs) == 1:
+      return specs[0]
+    return specs
+
+  def call(self, inputs, training=None, mask=None):
+    """Calls the model on new inputs.
+
+    In this case `call` just reapplies
+    all ops in the graph to the new inputs
+    (e.g. build a new computational graph from the provided inputs).
+
+    Arguments:
+        inputs: A tensor or list of tensors.
+        training: Boolean or boolean scalar tensor, indicating whether to run
+          the `Network` in training mode or inference mode.
+        mask: A mask or list of masks. A mask can be
+            either a tensor or None (no mask).
+
+    Returns:
+        A tensor if there is a single output, or
+        a list of tensors if there are more than one outputs.
+    """
+    inputs = nest.flatten(inputs)
+    if mask is None:
+      masks = [None for _ in range(len(inputs))]
+    else:
+      masks = nest.flatten(mask)
+
+    if not context.executing_eagerly():
+      # Try to retrieve cached outputs if the layer has already been called
+      # on these exact inputs.
+      cache_key = (generic_utils.object_list_uid(inputs)
+                   + '_' + generic_utils.object_list_uid(masks))
+      if cache_key in self._output_tensor_cache:
+        # Cache hit.
+        return self._output_tensor_cache[cache_key]
+    # Actually apply the network graph to the new inputs.
+    outputs, _ = self._run_internal_graph(inputs,
+                                          training=training,
+                                          mask=masks)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    if not self._is_graph_network:
+      raise NotImplementedError
+
+    if isinstance(input_shape, list):
+      input_shapes = []
+      for shape in input_shape:
+        if shape is not None:
+          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
+        else:
+          input_shapes.append(None)
+    else:
+      if input_shape is not None:
+        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
+      else:
+        input_shapes = [None]
+
+    if len(input_shapes) != len(self._input_layers):
+      raise ValueError('Invalid input_shape argument ' + str(input_shape) +
+                       ': model has ' + str(len(self._input_layers)) +
+                       ' tensor inputs.')
+
+    cache_key = generic_utils.object_list_uid(input_shapes)
+    if cache_key not in self._output_shape_cache:
+      # Cache miss. We have to run the network graph manually (recursive calls
+      # to `compute_output_shape`).
+      layers_to_output_shapes = {}
+      for i in range(len(input_shapes)):
+        layer = self._input_layers[i]
+        input_shape = input_shapes[i]
+        # It's an input layer: then `compute_output_shape` is identity,
+        # and there is only one node and one tensor output.
+        shape_key = layer.name + '_0_0'
+        layers_to_output_shapes[shape_key] = input_shape
+
+      depth_keys = list(self._nodes_by_depth.keys())
+      depth_keys.sort(reverse=True)
+      # Iterate over nodes, by depth level.
+      if len(depth_keys) > 1:
+        for depth in depth_keys:
+          nodes = self._nodes_by_depth[depth]
+          for node in nodes:
+            # This is always a single layer, never a list.
+            layer = node.outbound_layer
+            if layer in self._input_layers:
+              # We've already covered the input layers
+              # a few lines above.
+              continue
+            # Potentially redundant list,
+            # same size as node.input_tensors.
+            input_shapes = []
+            for j in range(len(node.inbound_layers)):
+              inbound_layer = node.inbound_layers[j]
+              node_index = node.node_indices[j]
+              tensor_index = node.tensor_indices[j]
+              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
+                                                           tensor_index)
+              input_shape = layers_to_output_shapes[shape_key]
+              input_shapes.append(input_shape)
+
+            if len(input_shapes) == 1:
+              output_shape = layer.compute_output_shape(input_shapes[0])
+            else:
+              output_shape = layer.compute_output_shape(input_shapes)
+            if isinstance(output_shape, list):
+              output_shapes = [
+                  tuple(tensor_shape.TensorShape(shape).as_list())
+                  for shape in output_shape
+              ]
+            else:
+              output_shapes = [
+                  tuple(tensor_shape.TensorShape(output_shape).as_list())
+              ]
+
+            node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
+            for j in range(len(output_shapes)):
+              shape_key = layer.name + '_%s_%s' % (node_index, j)
+              layers_to_output_shapes[shape_key] = output_shapes[j]
+
+        # Read final output shapes from layers_to_output_shapes.
+        output_shapes = []
+        for i in range(len(self._output_layers)):
+          layer, node_index, tensor_index = self._output_coordinates[i]
+          shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
+          output_shapes.append(layers_to_output_shapes[shape_key])
+        # Store in cache.
+        self._output_shape_cache[cache_key] = output_shapes
+    else:
+      # Cache hit.
+      output_shapes = self._output_shape_cache[cache_key]
+
+    if isinstance(output_shapes, list):
+      if len(output_shapes) == 1:
+        return tensor_shape.TensorShape(output_shapes[0])
+      else:
+        return [tensor_shape.TensorShape(shape) for shape in output_shapes]
+    else:
+      return tensor_shape.TensorShape(output_shapes)
+
+  def _run_internal_graph(self, inputs, training=None, mask=None):
+    """Computes output tensors for new inputs.
+
+    # Note:
+        - Expects `inputs` to be a list (potentially with 1 element).
+        - Can be run on non-Keras tensors.
+
+    Arguments:
+        inputs: List of tensors
+        training: Boolean learning phase.
+        mask: List of masks (tensors or None).
+
+    Returns:
+        Three lists: output_tensors, output_masks, output_shapes
+    """
+    # Note: masking support is relevant mainly for Keras.
+    # It cannot be factored out without having the fully reimplement the network
+    # calling logic on the Keras side. We choose to incorporate it in
+    # Network because 1) it may be useful to fully support in tf.layers in
+    # the future and 2) Keras is a major user of Network.  If you don't
+    # use masking, it does not interfere with regular behavior at all and you
+    # can ignore it.
+    if mask is None:
+      masks = [None for _ in range(len(inputs))]
+    else:
+      masks = mask
+
+    # Dictionary mapping reference tensors to tuples
+    # (computed tensor, compute mask)
+    # we assume a 1:1 mapping from tensor to mask
+    # TODO(fchollet): raise exception when a `.compute_mask()` call
+    # does not return a list the same size as `call`
+    tensor_map = {}
+    for x, y, mask in zip(self.inputs, inputs, masks):
+      tensor_map[str(id(x))] = (y, mask)
+
+    depth_keys = list(self._nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+    for depth in depth_keys:
+      nodes = self._nodes_by_depth[depth]
+      for node in nodes:
+        # This is always a single layer, never a list.
+        layer = node.outbound_layer
+        reference_input_tensors = node.input_tensors
+        reference_output_tensors = node.output_tensors
+
+        # If all previous input tensors are available in tensor_map,
+        # then call node.inbound_layer on them.
+        computed_data = []  # List of tuples (input, mask).
+        for x in reference_input_tensors:
+          if str(id(x)) in tensor_map:
+            computed_data.append(tensor_map[str(id(x))])
+
+        if len(computed_data) == len(reference_input_tensors):
+          # Call layer (reapplying ops to new inputs).
+          with ops.name_scope(layer.name):
+            if node.arguments:
+              kwargs = node.arguments
+            else:
+              kwargs = {}
+            if len(computed_data) == 1:
+              computed_tensor, computed_mask = computed_data[0]
+              # Ensure mask propagation if applicable.
+              if 'mask' in tf_inspect.getargspec(layer.call).args:
+                kwargs.setdefault('mask', computed_mask)
+              if 'training' in tf_inspect.getargspec(layer.call).args:
+                kwargs.setdefault('training', training)
+
+              output_tensors = nest.flatten(
+                  layer.call(computed_tensor, **kwargs))
+              if hasattr(layer, 'compute_mask'):
+                output_masks = nest.flatten(
+                    layer.compute_mask(computed_tensor, computed_mask))
+              else:
+                output_masks = [None for _ in range(len(output_tensors))]
+              computed_tensors = [computed_tensor]
+              computed_masks = [computed_mask]
+            else:
+              computed_tensors = [x[0] for x in computed_data]
+              computed_masks = [x[1] for x in computed_data]
+              if 'mask' in tf_inspect.getargspec(layer.call).args:
+                kwargs.setdefault('mask', computed_masks)
+              if 'training' in tf_inspect.getargspec(layer.call).args:
+                kwargs.setdefault('training', training)
+
+              output_tensors = nest.flatten(
+                  layer.call(computed_tensors, **kwargs))
+              if hasattr(layer, 'compute_mask'):
+                output_masks = nest.flatten(
+                    layer.compute_mask(computed_tensors, computed_masks))
+              else:
+                output_masks = [None for _ in range(len(output_tensors))]
+
+            if not context.executing_eagerly():
+              if layer.activity_regularizer is not None:
+                regularization_losses = [
+                    layer.activity_regularizer(x) for x in output_tensors
+                ]
+                # Apply activity regularizer if any:
+                layer.add_loss(regularization_losses, computed_tensors)
+
+          # Update tensor_map.
+          for x, y, mask in zip(reference_output_tensors, output_tensors,
+                                output_masks):
+            tensor_map[str(id(x))] = (y, mask)
+
+    output_tensors = []
+    output_masks = []
+    output_shapes = []
+    for x in self.outputs:
+      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
+      tensor, mask = tensor_map[str(id(x))]
+      output_shapes.append(backend.int_shape(x))
+      output_tensors.append(tensor)
+      output_masks.append(mask)
+
+    if len(output_tensors) == 1:
+      output_tensors = output_tensors[0]
+      if output_shapes is not None:
+        output_shapes = output_shapes[0]
+      if output_masks is not None:
+        output_masks = output_masks[0]
+
+    if not context.executing_eagerly():
+      # Update cache;
+      # keys are based on ids on input tensors and inputs masks.
+      cache_key = (generic_utils.object_list_uid(inputs)
+                   + '_' + generic_utils.object_list_uid(masks))
+      self._output_tensor_cache[cache_key] = output_tensors
+      self._output_mask_cache[cache_key] = output_masks
+
+      if output_shapes is not None:
+        input_shapes = [backend.int_shape(x) for x in inputs]
+        cache_key = generic_utils.object_list_uid(input_shapes)
+        self._output_shape_cache[cache_key] = output_shapes
+
+    return output_tensors, output_masks
+
+  def get_config(self):
+    if not self._is_graph_network:
+      raise NotImplementedError
+
+    config = {
+        'name': self.name,
+    }
+    node_conversion_map = {}
+    for layer in self.layers:
+      if issubclass(layer.__class__, Network):
+        # Networks start with a pre-existing node
+        # linking their input to output.
+        kept_nodes = 1
+      else:
+        kept_nodes = 0
+      for original_node_index, node in enumerate(layer._inbound_nodes):
+        node_key = _make_node_key(layer.name, original_node_index)
+        if node_key in self._network_nodes:
+          node_conversion_map[node_key] = kept_nodes
+          kept_nodes += 1
+    layer_configs = []
+    for layer in self.layers:  # From the earliest layers on.
+      layer_class_name = layer.__class__.__name__
+      layer_config = layer.get_config()
+      filtered_inbound_nodes = []
+      for original_node_index, node in enumerate(layer._inbound_nodes):
+        node_key = _make_node_key(layer.name, original_node_index)
+        if node_key in self._network_nodes:
+          # The node is relevant to the model:
+          # add to filtered_inbound_nodes.
+          if node.arguments:
+            try:
+              json.dumps(node.arguments)
+              kwargs = node.arguments
+            except TypeError:
+              logging.warning(
+                  'Layer ' + layer.name +
+                  ' was passed non-serializable keyword arguments: ' +
+                  str(node.arguments) + '. They will not be included '
+                  'in the serialized model (and thus will be missing '
+                  'at deserialization time).')
+              kwargs = {}
+          else:
+            kwargs = {}
+          if node.inbound_layers:
+            node_data = []
+            for i in range(len(node.inbound_layers)):
+              inbound_layer = node.inbound_layers[i]
+              node_index = node.node_indices[i]
+              tensor_index = node.tensor_indices[i]
+              node_key = _make_node_key(inbound_layer.name, node_index)
+              new_node_index = node_conversion_map.get(node_key, 0)
+              node_data.append(
+                  [inbound_layer.name, new_node_index, tensor_index, kwargs])
+            filtered_inbound_nodes.append(node_data)
+      layer_configs.append({
+          'name': layer.name,
+          'class_name': layer_class_name,
+          'config': layer_config,
+          'inbound_nodes': filtered_inbound_nodes,
+      })
+    config['layers'] = layer_configs
+
+    # Gather info about inputs and outputs.
+    model_inputs = []
+    for i in range(len(self._input_layers)):
+      layer, node_index, tensor_index = self._input_coordinates[i]
+      node_key = _make_node_key(layer.name, node_index)
+      if node_key not in self._network_nodes:
+        continue
+      new_node_index = node_conversion_map[node_key]
+      model_inputs.append([layer.name, new_node_index, tensor_index])
+    config['input_layers'] = model_inputs
+    model_outputs = []
+    for i in range(len(self._output_layers)):
+      layer, node_index, tensor_index = self._output_coordinates[i]
+      node_key = _make_node_key(layer.name, node_index)
+      if node_key not in self._network_nodes:
+        continue
+      new_node_index = node_conversion_map[node_key]
+      model_outputs.append([layer.name, new_node_index, tensor_index])
+    config['output_layers'] = model_outputs
+    return copy.deepcopy(config)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Instantiates a Model from its config (output of `get_config()`).
+
+    Arguments:
+        config: Model config dictionary.
+        custom_objects: Optional dictionary mapping names
+            (strings) to custom classes or functions to be
+            considered during deserialization.
+
+    Returns:
+        A model instance.
+
+    Raises:
+        ValueError: In case of improperly formatted config dict.
+    """
+    # Layer instances created during
+    # the graph reconstruction process
+    created_layers = {}
+
+    # Dictionary mapping layer instances to
+    # node data that specifies a layer call.
+    # It acts as a queue that maintains any unprocessed
+    # layer call until it becomes possible to process it
+    # (i.e. until the input tensors to the call all exist).
+    unprocessed_nodes = {}
+
+    def add_unprocessed_node(layer, node_data):
+      if layer not in unprocessed_nodes:
+        unprocessed_nodes[layer] = [node_data]
+      else:
+        unprocessed_nodes[layer].append(node_data)
+
+    def process_node(layer, node_data):
+      """Deserialize a node.
+
+      Arguments:
+          layer: layer instance.
+          node_data: node config dict.
+
+      Raises:
+          ValueError: In case of improperly formatted `node_data` dict.
+      """
+      input_tensors = []
+      for input_data in node_data:
+        inbound_layer_name = input_data[0]
+        inbound_node_index = input_data[1]
+        inbound_tensor_index = input_data[2]
+        if len(input_data) == 3:
+          kwargs = {}
+        elif len(input_data) == 4:
+          kwargs = input_data[3]
+        else:
+          raise ValueError('Improperly formatted model config.')
+        if inbound_layer_name not in created_layers:
+          add_unprocessed_node(layer, node_data)
+          return
+        inbound_layer = created_layers[inbound_layer_name]
+        if len(inbound_layer._inbound_nodes) <= inbound_node_index:
+          add_unprocessed_node(layer, node_data)
+          return
+        inbound_node = inbound_layer._inbound_nodes[inbound_node_index]
+        input_tensors.append(inbound_node.output_tensors[inbound_tensor_index])
+      # Call layer on its inputs, thus creating the node
+      # and building the layer if needed.
+      if input_tensors:
+        if len(input_tensors) == 1:
+          layer(input_tensors[0], **kwargs)
+        else:
+          layer(input_tensors, **kwargs)
+
+    def process_layer(layer_data):
+      """Deserializes a layer, then call it on appropriate inputs.
+
+      Arguments:
+          layer_data: layer config dict.
+
+      Raises:
+          ValueError: In case of improperly formatted `layer_data` dict.
+      """
+      layer_name = layer_data['name']
+
+      # Instantiate layer.
+      from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+
+      layer = deserialize_layer(layer_data, custom_objects=custom_objects)
+      created_layers[layer_name] = layer
+
+      # Gather layer inputs.
+      inbound_nodes_data = layer_data['inbound_nodes']
+      for node_data in inbound_nodes_data:
+        # We don't process nodes (i.e. make layer calls)
+        # on the fly because the inbound node may not yet exist,
+        # in case of layer shared at different topological depths
+        # (e.g. a model such as A(B(A(B(x)))))
+        add_unprocessed_node(layer, node_data)
+
+    # First, we create all layers and enqueue nodes to be processed
+    for layer_data in config['layers']:
+      process_layer(layer_data)
+    # Then we process nodes in order of layer depth.
+    # Nodes that cannot yet be processed (if the inbound node
+    # does not yet exist) are re-enqueued, and the process
+    # is repeated until all nodes are processed.
+    while unprocessed_nodes:
+      for layer_data in config['layers']:
+        layer = created_layers[layer_data['name']]
+        if layer in unprocessed_nodes:
+          for node_data in unprocessed_nodes.pop(layer):
+            process_node(layer, node_data)
+
+    name = config.get('name')
+    input_tensors = []
+    output_tensors = []
+    for layer_data in config['input_layers']:
+      layer_name, node_index, tensor_index = layer_data
+      assert layer_name in created_layers
+      layer = created_layers[layer_name]
+      layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
+      input_tensors.append(layer_output_tensors[tensor_index])
+    for layer_data in config['output_layers']:
+      layer_name, node_index, tensor_index = layer_data
+      assert layer_name in created_layers
+      layer = created_layers[layer_name]
+      layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
+      output_tensors.append(layer_output_tensors[tensor_index])
+    return cls(inputs=input_tensors, outputs=output_tensors, name=name)
+
+  def save(self, filepath, overwrite=True, include_optimizer=True):
+    """Saves the model to a single HDF5 file.
+
+    The savefile includes:
+        - The model architecture, allowing to re-instantiate the model.
+        - The model weights.
+        - The state of the optimizer, allowing to resume training
+            exactly where you left off.
+
+    This allows you to save the entirety of the state of a model
+    in a single file.
+
+    Saved models can be reinstantiated via `keras.models.load_model`.
+    The model returned by `load_model`
+    is a compiled model ready to be used (unless the saved model
+    was never compiled in the first place).
+
+    Arguments:
+        filepath: String, path to the file to save the weights to.
+        overwrite: Whether to silently overwrite any existing file at the
+            target location, or provide the user with a manual prompt.
+        include_optimizer: If True, save optimizer's state together.
+
+    Example:
+
+    ```python
+    from keras.models import load_model
+
+    model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
+    del model  # deletes the existing model
+
+    # returns a compiled model
+    # identical to the previous one
+    model = load_model('my_model.h5')
+    ```
+    """
+    if not self._is_graph_network:
+      raise NotImplementedError
+
+    from tensorflow.python.keras._impl.keras.models import save_model  # pylint: disable=g-import-not-at-top
+    save_model(self, filepath, overwrite, include_optimizer)
+
+  def save_weights(self, filepath, overwrite=True, save_format=None):
+    """Saves all layer weights.
+
+    Either saves in HDF5 or in TensorFlow format based on the `save_format`
+    argument.
+
+    When saving in HDF5 format, the weight file has:
+      - `layer_names` (attribute), a list of strings
+          (ordered names of model layers).
+      - For every layer, a `group` named `layer.name`
+          - For every such layer group, a group attribute `weight_names`,
+              a list of strings
+              (ordered names of weights tensor of the layer).
+          - For every weight in the layer, a dataset
+              storing the weight value, named after the weight tensor.
+
+    When saving in TensorFlow format, all objects referenced by the network are
+    saved in the same format as `tf.train.Checkpoint`, including any `Layer`
+    instances or `Optimizer` instances assigned to object attributes. For
+    networks constructed from inputs and outputs using `tf.keras.Model(inputs,
+    outputs)`, `Layer` instances used by the network are tracked/saved
+    automatically. For user-defined classes which inherit from `tf.keras.Model`,
+    `Layer` instances must be assigned to object attributes, typically in the
+    constructor. See the documentation of `tf.train.Checkpoint` and
+    `tf.keras.Model` for details.
+
+    Arguments:
+        filepath: String, path to the file to save the weights to. When saving
+            in TensorFlow format, this is the prefix used for checkpoint files
+            (multiple files are generated). Note that the '.h5' suffix causes
+            weights to be saved in HDF5 format.
+        overwrite: Whether to silently overwrite any existing file at the
+            target location, or provide the user with a manual prompt.
+        save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
+            '.keras' will default to HDF5 if `save_format` is `None`. Otherwise
+            `None` defaults to 'tf'.
+
+    Raises:
+        ImportError: If h5py is not available when attempting to save in HDF5
+            format.
+        ValueError: For invalid/unknown format arguments.
+    """
+    filepath_is_h5 = filepath.endswith('.h5') or filepath.endswith('.keras')
+    if save_format is None:
+      if filepath_is_h5:
+        save_format = 'h5'
+      else:
+        save_format = 'tf'
+    else:
+      user_format = save_format.lower().strip()
+      if user_format in ('tensorflow', 'tf'):
+        save_format = 'tf'
+      elif user_format in ('hdf5', 'h5', 'keras'):
+        save_format = 'h5'
+      else:
+        raise ValueError(
+            'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % (
+                save_format,))
+    if save_format == 'tf' and filepath_is_h5:
+      raise ValueError(
+          ('save_weights got save_format="tf"/"tensorflow", but the '
+           'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" '
+           'when saving in TensorFlow format.')
+          % filepath)
+
+    if save_format == 'h5' and h5py is None:
+      raise ImportError(
+          '`save_weights` requires h5py when saving in hdf5.')
+    if save_format == 'tf':
+      check_filepath = filepath + '.index'
+    else:
+      check_filepath = filepath
+    # If file exists and should not be overwritten:
+    if not overwrite and os.path.isfile(check_filepath):
+      proceed = ask_to_proceed_with_overwrite(check_filepath)
+      if not proceed:
+        return
+    if save_format == 'h5':
+      with h5py.File(filepath, 'w') as f:
+        saving.save_weights_to_hdf5_group(f, self.layers)
+    else:
+      self._checkpointable_saver.save(filepath)
+
+  def load_weights(self, filepath, by_name=False):
+    """Loads all layer weights, either from a TensorFlow or an HDF5 weight file.
+
+    If `by_name` is False weights are loaded based on the network's
+    topology. This means the architecture should be the same as when the weights
+    were saved.  Note that layers that don't have weights are not taken into
+    account in the topological ordering, so adding or removing layers is fine as
+    long as they don't have weights.
+
+    If `by_name` is True, weights are loaded into layers only if they share the
+    same name. This is useful for fine-tuning or transfer-learning models where
+    some of the layers have changed.
+
+    Only topological loading (`by_name=False`) is supported when loading weights
+    from the TensorFlow format. Note that topological loading differs slightly
+    between TensorFlow and HDF5 formats for user-defined classes inheriting from
+    `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the
+    TensorFlow format loads based on the object-local names of attributes to
+    which layers are assigned in the `Model`'s constructor.
+
+    Arguments:
+        filepath: String, path to the weights file to load. For weight files in
+            TensorFlow format, this is the file prefix (the same as was passed
+            to `save_weights`).
+        by_name: Boolean, whether to load weights by name or by topological
+            order. Only topological loading is supported for weight files in
+            TensorFlow format.
+
+    Returns:
+        When loading a weight file in TensorFlow format, returns the same status
+        object as `tf.train.Checkpoint.restore`. When graph building, restore
+        ops are run automatically as soon as the network is built (on first call
+        for user-defined classes inheriting from `Model`, immediately if it is
+        already built).
+
+        When loading weights in HDF5 format, returns `None`.
+
+    Raises:
+        ImportError: If h5py is not available and the weight file is in HDF5
+            format.
+    """
+    try:
+      pywrap_tensorflow.NewCheckpointReader(filepath)
+      save_format = 'tf'
+    except errors_impl.DataLossError:
+      # The checkpoint is not readable in TensorFlow format. Try HDF5.
+      save_format = 'h5'
+    if save_format == 'tf':
+      status = self._checkpointable_saver.restore(filepath)
+      if by_name:
+        raise NotImplementedError(
+            'Weights may only be loaded based on topology into Models when '
+            'loading TensorFlow-formatted weights (got by_name=True to '
+            'load_weights).')
+      if not context.executing_eagerly():
+        finalizer = status.run_restore_ops
+        if self.built:
+          finalizer()
+        else:
+          # Hold on to this status object until the network is built (for
+          # subclassed Models). Then we'll run restore ops if necessary.
+          self._in_progress_restore_finalizer = finalizer
+      return status
+    if h5py is None:
+      raise ImportError(
+          '`load_weights` requires h5py when loading weights from HDF5.')
+    if self._is_graph_network and not self.built:
+      raise NotImplementedError(
+          'Unable to load weights saved in HDF5 format into a subclassed '
+          'Model which has not created its variables yet. Call the Model '
+          'first, then load the weights.')
+    with h5py.File(filepath, 'r') as f:
+      if 'layer_names' not in f.attrs and 'model_weights' in f:
+        f = f['model_weights']
+      if by_name:
+        saving.load_weights_from_hdf5_group_by_name(f, self.layers)
+      else:
+        saving.load_weights_from_hdf5_group(f, self.layers)
+
+  def _post_build_cleanup(self):
+    super(Network, self)._post_build_cleanup()
+    if self._in_progress_restore_finalizer is not None:
+      # Runs queued restore operations left over from load_weights when graph
+      # building.
+      self._in_progress_restore_finalizer()
+      self._in_progress_restore_finalizer = None
+
+  def _updated_config(self):
+    """Util shared between different serialization methods.
+
+    Returns:
+        Model config with Keras version information added.
+    """
+    from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+    config = self.get_config()
+    model_config = {
+        'class_name': self.__class__.__name__,
+        'config': config,
+        'keras_version': keras_version,
+        'backend': backend.backend()
+    }
+    return model_config
+
+  def to_json(self, **kwargs):
+    """Returns a JSON string containing the network configuration.
+
+    To load a network from a JSON save file, use
+    `keras.models.model_from_json(json_string, custom_objects={})`.
+
+    Arguments:
+        **kwargs: Additional keyword arguments
+            to be passed to `json.dumps()`.
+
+    Returns:
+        A JSON string.
+    """
+    def get_json_type(obj):
+      # If obj is any numpy type
+      if type(obj).__module__ == np.__name__:
+        return obj.item()
+
+      # If obj is a python 'type'
+      if type(obj).__name__ == type.__name__:
+        return obj.__name__
+
+      raise TypeError('Not JSON Serializable:', obj)
+
+    model_config = self._updated_config()
+    return json.dumps(model_config, default=get_json_type, **kwargs)
+
+  def to_yaml(self, **kwargs):
+    """Returns a yaml string containing the network configuration.
+
+    To load a network from a yaml save file, use
+    `keras.models.model_from_yaml(yaml_string, custom_objects={})`.
+
+    `custom_objects` should be a dictionary mapping
+    the names of custom losses / layers / etc to the corresponding
+    functions / classes.
+
+    Arguments:
+        **kwargs: Additional keyword arguments
+            to be passed to `yaml.dump()`.
+
+    Returns:
+        A YAML string.
+
+    Raises:
+        ImportError: if yaml module is not found.
+    """
+    if yaml is None:
+      raise ImportError('Requires yaml module installed.')
+    return yaml.dump(self._updated_config(), **kwargs)
+
+  def summary(self, line_length=None, positions=None, print_fn=None):
+    """Prints a string summary of the network.
+
+    Arguments:
+        line_length: Total length of printed lines
+            (e.g. set this to adapt the display to different
+            terminal window sizes).
+        positions: Relative or absolute positions of log elements
+            in each line. If not provided,
+            defaults to `[.33, .55, .67, 1.]`.
+        print_fn: Print function to use. Defaults to `print`.
+            It will be called on each line of the summary.
+            You can set it to a custom function
+            in order to capture the string summary.
+    """
+    print_layer_summary(self,
+                        line_length=line_length,
+                        positions=positions,
+                        print_fn=print_fn)
+
+
+def get_source_inputs(tensor, layer=None, node_index=None):
+  """Returns the list of input tensors necessary to compute `tensor`.
+
+  Output will always be a list of tensors
+  (potentially with 1 element).
+
+  Arguments:
+      tensor: The tensor to start from.
+      layer: Origin layer of the tensor. Will be
+          determined via tensor._keras_history if not provided.
+      node_index: Origin node index of the tensor.
+
+  Returns:
+      List of input tensors.
+  """
+  if not hasattr(tensor, '_keras_history'):
+    return tensor
+
+  if layer is None or node_index:
+    layer, node_index, _ = tensor._keras_history
+  if not layer._inbound_nodes:
+    return [tensor]
+  else:
+    node = layer._inbound_nodes[node_index]
+    if not node.inbound_layers:
+      # Reached an Input layer, stop recursion.
+      return node.input_tensors
+    else:
+      source_tensors = []
+      for i in range(len(node.inbound_layers)):
+        x = node.input_tensors[i]
+        layer = node.inbound_layers[i]
+        node_index = node.node_indices[i]
+        previous_sources = get_source_inputs(x, layer, node_index)
+        # Avoid input redundancy.
+        for x in previous_sources:
+          if x not in source_tensors:
+            source_tensors.append(x)
+      return source_tensors
+
+
+def _make_node_key(layer_name, node_index):
+  return layer_name + '_ib-' + str(node_index)
+
+
+def _map_graph_network(inputs, outputs):
+  """Validates a network's topology and gather its layers and nodes.
+
+  Arguments:
+    inputs: List of input tensors.
+    outputs: List of outputs tensors.
+
+  Returns:
+    A tuple `(nodes, nodes_by_depth, layers, layers_by_depth)`.
+    - nodes: list of Node instances.
+    - nodes_by_depth: dict mapping ints (depth) to lists of node instances.
+    - layers: list of Layer instances.
+    - layers_by_depth: dict mapping ints (depth) to lists of layer instances.
+
+  Raises:
+    ValueError: In case the network is not valid (e.g. disconnected graph).
+  """
+  # Network_nodes: set of nodes included in the graph of layers
+  # (not all nodes included in the layers are relevant to the current graph).
+  network_nodes = set()  # ids of all nodes relevant to the Network
+  nodes_depths = {}  # dict {node: depth value}
+  layers_depths = {}  # dict {layer: depth value}
+  layer_indices = {}  # dict {layer: index in traversal}
+  nodes_in_decreasing_depth = []
+
+  def build_map(tensor,
+                finished_nodes,
+                nodes_in_progress,
+                layer,
+                node_index,
+                tensor_index):
+    """Builds a map of the graph of layers.
+
+    This recursively updates the map `layer_indices`,
+    the list `nodes_in_decreasing_depth` and the set `network_nodes`.
+
+    Arguments:
+        tensor: Some tensor in a graph.
+        finished_nodes: Set of nodes whose subgraphs have been traversed
+            completely. Useful to prevent duplicated work.
+        nodes_in_progress: Set of nodes that are currently active on the
+            recursion stack. Useful to detect cycles.
+        layer: Layer from which `tensor` comes from. If not provided,
+            will be obtained from `tensor._keras_history`.
+        node_index: Node index from which `tensor` comes from.
+        tensor_index: Tensor_index from which `tensor` comes from.
+
+    Raises:
+        ValueError: if a cycle is detected.
+    """
+    node = layer._inbound_nodes[node_index]  # pylint: disable=protected-access
+
+    # Prevent cycles.
+    if node in nodes_in_progress:
+      raise ValueError('The tensor ' + str(tensor) + ' at layer "' +
+                       layer.name + '" is part of a cycle.')
+
+    # Don't repeat work for shared subgraphs
+    if node in finished_nodes:
+      return
+
+    node_key = _make_node_key(layer.name, node_index)
+    # Update network_nodes.
+    network_nodes.add(node_key)
+
+    # Store the traversal order for layer sorting.
+    if layer not in layer_indices:
+      layer_indices[layer] = len(layer_indices)
+
+    nodes_in_progress.add(node)
+
+    # Propagate to all previous tensors connected to this node.
+    for i in range(len(node.inbound_layers)):
+      x = node.input_tensors[i]
+      layer = node.inbound_layers[i]
+      node_index = node.node_indices[i]
+      tensor_index = node.tensor_indices[i]
+      build_map(x, finished_nodes, nodes_in_progress, layer,
+                node_index, tensor_index)
+
+    finished_nodes.add(node)
+    nodes_in_progress.remove(node)
+    nodes_in_decreasing_depth.append(node)
+
+  finished_nodes = set()
+  nodes_in_progress = set()
+  for x in outputs:
+    layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+    build_map(x, finished_nodes, nodes_in_progress,
+              layer=layer,
+              node_index=node_index,
+              tensor_index=tensor_index)
+
+  for node in reversed(nodes_in_decreasing_depth):
+    # If the depth is not set, the node has no outbound nodes (depth 0).
+    depth = nodes_depths.setdefault(node, 0)
+
+    # Update the depth of the corresponding layer
+    previous_depth = layers_depths.get(node.outbound_layer, 0)
+    # If we've seen this layer before at a higher depth,
+    # we should use that depth instead of the node depth.
+    # This is necessary for shared layers that have inputs at different
+    # depth levels in the graph.
+    depth = max(depth, previous_depth)
+    layers_depths[node.outbound_layer] = depth
+    nodes_depths[node] = depth
+
+    # Update the depth of inbound nodes.
+    # The "depth" of a node is the max of the depths
+    # of all layers it is connected to.
+    for i in range(len(node.inbound_layers)):
+      inbound_layer = node.inbound_layers[i]
+      node_index = node.node_indices[i]
+      inbound_node = inbound_layer._inbound_nodes[node_index]  # pylint: disable=protected-access
+      previous_depth = nodes_depths.get(inbound_node, 0)
+      nodes_depths[inbound_node] = max(depth + 1, previous_depth)
+
+  # Build a dict {depth: list of nodes with this depth}
+  nodes_by_depth = {}
+  for node, depth in nodes_depths.items():
+    if depth not in nodes_by_depth:
+      nodes_by_depth[depth] = []
+    nodes_by_depth[depth].append(node)
+
+  # Build a dict {depth: list of layers with this depth}
+  layers_by_depth = {}
+  for layer, depth in layers_depths.items():
+    if depth not in layers_by_depth:
+      layers_by_depth[depth] = []
+    layers_by_depth[depth].append(layer)
+
+  # Get sorted list of layer depths.
+  depth_keys = list(layers_by_depth.keys())
+  depth_keys.sort(reverse=True)
+
+  # Set self.layers and self._layers_by_depth.
+  layers = []
+  for depth in depth_keys:
+    layers_for_depth = layers_by_depth[depth]
+    # Network.layers needs to have a deterministic order:
+    # here we order them by traversal order.
+    layers_for_depth.sort(key=lambda x: layer_indices[x])
+    layers.extend(layers_for_depth)
+
+  # Get sorted list of node depths.
+  depth_keys = list(nodes_by_depth.keys())
+  depth_keys.sort(reverse=True)
+
+  # Check that all tensors required are computable.
+  # computable_tensors: all tensors in the graph
+  # that can be computed from the inputs provided.
+  computable_tensors = []
+  for x in inputs:
+    computable_tensors.append(x)
+
+  layers_with_complete_input = []  # To provide a better error msg.
+  for depth in depth_keys:
+    for node in nodes_by_depth[depth]:
+      layer = node.outbound_layer
+      if layer:
+        for x in node.input_tensors:
+          if x not in computable_tensors:
+            raise ValueError('Graph disconnected: '
+                             'cannot obtain value for tensor ' + str(x) +
+                             ' at layer "' + layer.name + '". '
+                             'The following previous layers '
+                             'were accessed without issue: ' +
+                             str(layers_with_complete_input))
+        for x in node.output_tensors:
+          computable_tensors.append(x)
+        layers_with_complete_input.append(layer.name)
+
+  # Ensure name unicity, which will be crucial for serialization
+  # (since serialized nodes refer to layers by their name).
+  all_names = [layer.name for layer in layers]
+  for name in all_names:
+    if all_names.count(name) != 1:
+      raise ValueError('The name "' + name + '" is used ' +
+                       str(all_names.count(name)) + ' times in the model. '
+                       'All layer names should be unique.')
+  return network_nodes, nodes_by_depth, layers, layers_by_depth
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving.py b/tensorflow/python/keras/_impl/keras/engine/saving.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b709a1a58436b2d8cb33398c013f26f023dd70
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/saving.py
@@ -0,0 +1,885 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Model saving utilities.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import optimizers
+from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+# pylint: disable=g-import-not-at-top
+try:
+  import h5py
+  HDF5_OBJECT_HEADER_LIMIT = 64512
+except ImportError:
+  h5py = None
+
+try:
+  import yaml
+except ImportError:
+  yaml = None
+# pylint: enable=g-import-not-at-top
+
+
+@tf_export('keras.models.save_model')
+def save_model(model, filepath, overwrite=True, include_optimizer=True):
+  """Saves a model to a HDF5 file.
+
+  The saved model contains:
+      - the model's configuration (topology)
+      - the model's weights
+      - the model's optimizer's state (if any)
+
+  Thus the saved model can be reinstantiated in
+  the exact same state, without any of the code
+  used for model definition or training.
+
+  Arguments:
+      model: Keras model instance to be saved.
+      filepath: String, path where to save the model.
+      overwrite: Whether we should overwrite any existing
+          model at the target location, or instead
+          ask the user with a manual prompt.
+      include_optimizer: If True, save optimizer's state together.
+
+  Raises:
+      ImportError: if h5py is not available.
+  """
+
+  if h5py is None:
+    raise ImportError('`save_model` requires h5py.')
+
+  def get_json_type(obj):
+    """Serializes any object to a JSON-serializable structure.
+
+    Arguments:
+        obj: the object to serialize
+
+    Returns:
+        JSON-serializable structure representing `obj`.
+
+    Raises:
+        TypeError: if `obj` cannot be serialized.
+    """
+    # if obj is a serializable Keras class instance
+    # e.g. optimizer, layer
+    if hasattr(obj, 'get_config'):
+      return {'class_name': obj.__class__.__name__, 'config': obj.get_config()}
+
+    # if obj is any numpy type
+    if type(obj).__module__ == np.__name__:
+      if isinstance(obj, np.ndarray):
+        return {'type': type(obj), 'value': obj.tolist()}
+      else:
+        return obj.item()
+
+    # misc functions (e.g. loss function)
+    if callable(obj):
+      return obj.__name__
+
+    # if obj is a python 'type'
+    if type(obj).__name__ == type.__name__:
+      return obj.__name__
+
+    raise TypeError('Not JSON Serializable:', obj)
+
+  from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  # If file exists and should not be overwritten.
+  if not overwrite and os.path.isfile(filepath):
+    proceed = ask_to_proceed_with_overwrite(filepath)
+    if not proceed:
+      return
+
+  with h5py.File(filepath, mode='w') as f:
+    f.attrs['keras_version'] = str(keras_version).encode('utf8')
+    f.attrs['backend'] = K.backend().encode('utf8')
+    f.attrs['model_config'] = json.dumps(
+        {
+            'class_name': model.__class__.__name__,
+            'config': model.get_config()
+        },
+        default=get_json_type).encode('utf8')
+
+    model_weights_group = f.create_group('model_weights')
+    model_layers = model.layers
+    save_weights_to_hdf5_group(model_weights_group, model_layers)
+
+    if include_optimizer and hasattr(model, 'optimizer'):
+      if isinstance(model.optimizer, optimizers.TFOptimizer):
+        logging.warning(
+            'TensorFlow optimizers do not '
+            'make it possible to access '
+            'optimizer attributes or optimizer state '
+            'after instantiation. '
+            'As a result, we cannot save the optimizer '
+            'as part of the model save file.'
+            'You will have to compile your model again after loading it. '
+            'Prefer using a Keras optimizer instead '
+            '(see keras.io/optimizers).')
+      else:
+        f.attrs['training_config'] = json.dumps(
+            {
+                'optimizer_config': {
+                    'class_name': model.optimizer.__class__.__name__,
+                    'config': model.optimizer.get_config()
+                },
+                'loss': model.loss,
+                'metrics': model.metrics,
+                'sample_weight_mode': model.sample_weight_mode,
+                'loss_weights': model.loss_weights,
+            },
+            default=get_json_type).encode('utf8')
+
+        # Save optimizer weights.
+        symbolic_weights = getattr(model.optimizer, 'weights')
+        if symbolic_weights:
+          optimizer_weights_group = f.create_group('optimizer_weights')
+          weight_values = K.batch_get_value(symbolic_weights)
+          weight_names = []
+          for w, val in zip(symbolic_weights, weight_values):
+            name = str(w.name)
+            weight_names.append(name.encode('utf8'))
+          optimizer_weights_group.attrs['weight_names'] = weight_names
+          for name, val in zip(weight_names, weight_values):
+            param_dset = optimizer_weights_group.create_dataset(
+                name, val.shape, dtype=val.dtype)
+            if not val.shape:
+              # scalar
+              param_dset[()] = val
+            else:
+              param_dset[:] = val
+    f.flush()
+
+
+@tf_export('keras.models.load_model')
+def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
+  """Loads a model saved via `save_model`.
+
+  Arguments:
+      filepath: String, path to the saved model.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+      compile: Boolean, whether to compile the model
+          after loading.
+
+  Returns:
+      A Keras model instance. If an optimizer was found
+      as part of the saved model, the model is already
+      compiled. Otherwise, the model is uncompiled and
+      a warning will be displayed. When `compile` is set
+      to False, the compilation is omitted without any
+      warning.
+
+  Raises:
+      ImportError: if h5py is not available.
+      ValueError: In case of an invalid savefile.
+  """
+  if h5py is None:
+    raise ImportError('`load_model` requires h5py.')
+
+  if not custom_objects:
+    custom_objects = {}
+
+  def convert_custom_objects(obj):
+    """Handles custom object lookup.
+
+    Arguments:
+        obj: object, dict, or list.
+
+    Returns:
+        The same structure, where occurrences
+            of a custom object name have been replaced
+            with the custom object.
+    """
+    if isinstance(obj, list):
+      deserialized = []
+      for value in obj:
+        deserialized.append(convert_custom_objects(value))
+      return deserialized
+    if isinstance(obj, dict):
+      deserialized = {}
+      for key, value in obj.items():
+        deserialized[key] = convert_custom_objects(value)
+      return deserialized
+    if obj in custom_objects:
+      return custom_objects[obj]
+    return obj
+
+  with h5py.File(filepath, mode='r') as f:
+    # instantiate model
+    model_config = f.attrs.get('model_config')
+    if model_config is None:
+      raise ValueError('No model found in config file.')
+    model_config = json.loads(model_config.decode('utf-8'))
+    model = model_from_config(model_config, custom_objects=custom_objects)
+
+    # set weights
+    load_weights_from_hdf5_group(f['model_weights'], model.layers)
+
+    # Early return if compilation is not required.
+    if not compile:
+      return model
+
+    # instantiate optimizer
+    training_config = f.attrs.get('training_config')
+    if training_config is None:
+      logging.warning('No training configuration found in save file: '
+                      'the model was *not* compiled. Compile it manually.')
+      return model
+    training_config = json.loads(training_config.decode('utf-8'))
+    optimizer_config = training_config['optimizer_config']
+    optimizer = optimizers.deserialize(
+        optimizer_config, custom_objects=custom_objects)
+
+    # Recover loss functions and metrics.
+    loss = convert_custom_objects(training_config['loss'])
+    metrics = convert_custom_objects(training_config['metrics'])
+    sample_weight_mode = training_config['sample_weight_mode']
+    loss_weights = training_config['loss_weights']
+
+    # Compile model.
+    model.compile(
+        optimizer=optimizer,
+        loss=loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=sample_weight_mode)
+
+    # Set optimizer weights.
+    if 'optimizer_weights' in f:
+      # Build train function (to get weight updates).
+      model._make_train_function()
+      optimizer_weights_group = f['optimizer_weights']
+      optimizer_weight_names = [
+          n.decode('utf8')
+          for n in optimizer_weights_group.attrs['weight_names']
+      ]
+      optimizer_weight_values = [
+          optimizer_weights_group[n] for n in optimizer_weight_names
+      ]
+      try:
+        model.optimizer.set_weights(optimizer_weight_values)
+      except ValueError:
+        logging.warning('Error in loading the saved optimizer '
+                        'state. As a result, your model is '
+                        'starting with a freshly initialized '
+                        'optimizer.')
+  return model
+
+
+@tf_export('keras.models.model_from_config')
+def model_from_config(config, custom_objects=None):
+  """Instantiates a Keras model from its config.
+
+  Arguments:
+      config: Configuration dictionary.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      TypeError: if `config` is not a dictionary.
+  """
+  if isinstance(config, list):
+    raise TypeError('`model_from_config` expects a dictionary, not a list. '
+                    'Maybe you meant to use '
+                    '`Sequential.from_config(config)`?')
+  from tensorflow.python.keras._impl.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+@tf_export('keras.models.model_from_yaml')
+def model_from_yaml(yaml_string, custom_objects=None):
+  """Parses a yaml model configuration file and returns a model instance.
+
+  Arguments:
+      yaml_string: YAML string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      ImportError: if yaml module is not found.
+  """
+  if yaml is None:
+    raise ImportError('Requires yaml module installed.')
+  config = yaml.load(yaml_string)
+  from tensorflow.python.keras._impl.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+@tf_export('keras.models.model_from_json')
+def model_from_json(json_string, custom_objects=None):
+  """Parses a JSON model configuration file and returns a model instance.
+
+  Arguments:
+      json_string: JSON string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+  """
+  config = json.loads(json_string)
+  from tensorflow.python.keras._impl.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+def preprocess_weights_for_loading(layer,
+                                   weights,
+                                   original_keras_version=None,
+                                   original_backend=None):
+  """Converts layers weights from Keras 1 format to Keras 2.
+
+  Arguments:
+      layer: Layer instance.
+      weights: List of weights values (Numpy arrays).
+      original_keras_version: Keras version for the weights, as a string.
+      original_backend: Keras backend the weights were trained with,
+          as a string.
+
+  Returns:
+      A list of weights values (Numpy arrays).
+  """
+  if layer.__class__.__name__ == 'Bidirectional':
+    num_weights_per_layer = len(weights) // 2
+    forward_weights = preprocess_weights_for_loading(
+        layer.forward_layer, weights[:num_weights_per_layer],
+        original_keras_version, original_backend)
+    backward_weights = preprocess_weights_for_loading(
+        layer.backward_layer, weights[num_weights_per_layer:],
+        original_keras_version, original_backend)
+    weights = forward_weights + backward_weights
+
+  if original_keras_version == '1':
+    if layer.__class__.__name__ == 'TimeDistributed':
+      weights = preprocess_weights_for_loading(
+          layer.layer, weights, original_keras_version, original_backend)
+
+    if layer.__class__.__name__ == 'Conv1D':
+      shape = weights[0].shape
+      # Handle Keras 1.1 format
+      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
+        # Legacy shape:
+        # (filters, input_dim, filter_length, 1)
+        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
+                                                           1)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+      weights[0] = weights[0][:, 0, :, :]
+
+    if layer.__class__.__name__ == 'Conv2D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+
+    if layer.__class__.__name__ == 'Conv2DTranspose':
+      if layer.data_format == 'channels_last':
+        # old: (kernel_rows, kernel_cols, stack_size, filters)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
+
+    if layer.__class__.__name__ == 'Conv3D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, ...)
+        # new: (..., stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
+
+    if layer.__class__.__name__ == 'GRU':
+      if len(weights) == 9:
+        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[4], weights[7]], axis=-1)
+        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'LSTM':
+      if len(weights) == 12:
+        # old: i, c, f, o
+        # new: i, f, c, o
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'ConvLSTM2D':
+      if len(weights) == 12:
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        if layer.data_format == 'channels_first':
+          # old: (filters, stack_size, kernel_rows, kernel_cols)
+          # new: (kernel_rows, kernel_cols, stack_size, filters)
+          kernel = np.transpose(kernel, (2, 3, 1, 0))
+          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ in ['Model', 'Sequential']:
+      new_weights = []
+      # trainable weights
+      for sublayer in layer.layers:
+        num_weights = len(sublayer.trainable_weights)
+        if num_weights > 0:
+          new_weights.extend(
+              preprocess_weights_for_loading(
+                  layer=sublayer,
+                  weights=weights[:num_weights],
+                  original_keras_version=original_keras_version,
+                  original_backend=original_backend))
+          weights = weights[num_weights:]
+
+      # non-trainable weights
+      for sublayer in layer.layers:
+        num_weights = len([
+            l for l in sublayer.weights if l not in sublayer.trainable_weights
+        ])
+        if num_weights > 0:
+          new_weights.extend(
+              preprocess_weights_for_loading(
+                  layer=sublayer,
+                  weights=weights[:num_weights],
+                  original_keras_version=original_keras_version,
+                  original_backend=original_backend))
+          weights = weights[num_weights:]
+      weights = new_weights
+
+  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
+  if layer.__class__.__name__ in conv_layers:
+    if original_backend == 'theano':
+      weights[0] = conv_utils.convert_kernel(weights[0])
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = conv_utils.convert_kernel(weights[1])
+    if K.int_shape(layer.weights[0]) != weights[0].shape:
+      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
+
+  return _convert_rnn_weights(layer, weights)
+
+
+def _convert_rnn_weights(layer, weights):
+  """Converts weights for RNN layers between native and CuDNN format.
+
+  Input kernels for each gate are transposed and converted between Fortran
+  and C layout, recurrent kernels are transposed. For LSTM biases are summed/
+  split in half, for GRU biases are reshaped.
+
+  Weights can be converted in both directions between `LSTM` and`CuDNNSLTM`
+  and between `CuDNNGRU` and `GRU(reset_after=True)`. Default `GRU` is not
+  compatible with `CuDNNGRU`.
+
+  For missing biases in `LSTM`/`GRU` (`use_bias=False`) no conversion is made.
+
+  Arguments:
+      layer: Target layer instance.
+      weights: List of source weights values (input kernels, recurrent
+          kernels, [biases]) (Numpy arrays).
+
+  Returns:
+      A list of converted weights values (Numpy arrays).
+
+  Raises:
+      ValueError: for incompatible GRU layer/weights or incompatible biases
+  """
+
+  def transform_kernels(kernels, func, n_gates):
+    """Transforms kernel for each gate separately using given function.
+
+    Arguments:
+        kernels: Stacked array of kernels for individual gates.
+        func: Function applied to kernel of each gate.
+        n_gates: Number of gates (4 for LSTM, 3 for GRU).
+
+    Returns:
+        Stacked array of transformed kernels.
+    """
+    return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)])
+
+  def transpose_input(from_cudnn):
+    """Makes a function that transforms input kernels from/to CuDNN format.
+
+    It keeps the shape, but changes between the layout (Fortran/C). Eg.:
+
+    ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+    ```
+
+    It can be passed to `transform_kernels()`.
+
+    Arguments:
+        from_cudnn: `True` if source weights are in CuDNN format, `False`
+            if they're in plain Keras format.
+
+    Returns:
+        Function that converts input kernel to the other format.
+    """
+    order = 'F' if from_cudnn else 'C'
+
+    def transform(kernel):
+      return kernel.T.reshape(kernel.shape, order=order)
+
+    return transform
+
+  target_class = layer.__class__.__name__
+
+  # convert the weights between CuDNNLSTM and LSTM
+  if target_class in ['LSTM', 'CuDNNLSTM'] and len(weights) == 3:
+    # determine if we're loading a CuDNNLSTM layer
+    # from the number of bias weights:
+    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
+    # if there's no bias weight in the file, skip this conversion
+    units = weights[1].shape[0]
+    bias_shape = weights[2].shape
+    n_gates = 4
+
+    if bias_shape == (2 * units * n_gates,):
+      source = 'CuDNNLSTM'
+    elif bias_shape == (units * n_gates,):
+      source = 'LSTM'
+    else:
+      raise ValueError('Invalid bias shape: ' + str(bias_shape))
+
+    def convert_lstm_weights(weights, from_cudnn=True):
+      """Converts the weights between CuDNNLSTM and LSTM.
+
+      Arguments:
+        weights: Original weights.
+        from_cudnn: Indicates whether original weights are from CuDNN layer.
+
+      Returns:
+        Updated weights compatible with LSTM.
+      """
+
+      # Transpose (and reshape) input and recurrent kernels
+      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
+                                  n_gates)
+      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
+      if from_cudnn:
+        # merge input and recurrent biases into a single set
+        biases = np.sum(np.split(weights[2], 2, axis=0), axis=0)
+      else:
+        # Split single set of biases evenly to two sets. The way of
+        # splitting doesn't matter as long as the two sets sum is kept.
+        biases = np.tile(0.5 * weights[2], 2)
+      return [kernels, recurrent_kernels, biases]
+
+    if source != target_class:
+      weights = convert_lstm_weights(weights, from_cudnn=source == 'CuDNNLSTM')
+
+  # convert the weights between CuDNNGRU and GRU(reset_after=True)
+  if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3:
+    # We can determine the source of the weights from the shape of the bias.
+    # If there is no bias we skip the conversion since
+    # CuDNNGRU always has biases.
+
+    units = weights[1].shape[0]
+    bias_shape = weights[2].shape
+    n_gates = 3
+
+    def convert_gru_weights(weights, from_cudnn=True):
+      """Converts the weights between CuDNNGRU and GRU.
+
+      Arguments:
+        weights: Original weights.
+        from_cudnn: Indicates whether original weights are from CuDNN layer.
+
+      Returns:
+        Updated weights compatible with GRU.
+      """
+
+      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
+                                  n_gates)
+      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
+      biases = weights[2].reshape((2, -1) if from_cudnn else -1)
+      return [kernels, recurrent_kernels, biases]
+
+    if bias_shape == (2 * units * n_gates,):
+      source = 'CuDNNGRU'
+    elif bias_shape == (2, units * n_gates):
+      source = 'GRU(reset_after=True)'
+    elif bias_shape == (units * n_gates,):
+      source = 'GRU(reset_after=False)'
+    else:
+      raise ValueError('Invalid bias shape: ' + str(bias_shape))
+
+    if target_class == 'CuDNNGRU':
+      target = 'CuDNNGRU'
+    elif layer.reset_after:
+      target = 'GRU(reset_after=True)'
+    else:
+      target = 'GRU(reset_after=False)'
+
+    # only convert between different types
+    if source != target:
+      types = (source, target)
+      if 'GRU(reset_after=False)' in types:
+        raise ValueError('%s is not compatible with %s' % types)
+      if source == 'CuDNNGRU':
+        weights = convert_gru_weights(weights, from_cudnn=True)
+      elif source == 'GRU(reset_after=True)':
+        weights = convert_gru_weights(weights, from_cudnn=False)
+
+  return weights
+
+
+def save_weights_to_hdf5_group(f, layers):
+  from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  save_attributes_to_hdf5_group(
+      f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
+  f.attrs['backend'] = K.backend().encode('utf8')
+  f.attrs['keras_version'] = str(keras_version).encode('utf8')
+
+  for layer in layers:
+    g = f.create_group(layer.name)
+    symbolic_weights = layer.weights
+    weight_values = K.batch_get_value(symbolic_weights)
+    weight_names = []
+    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
+      if hasattr(w, 'name') and w.name:
+        name = str(w.name)
+      else:
+        name = 'param_' + str(i)
+      weight_names.append(name.encode('utf8'))
+    save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
+    for name, val in zip(weight_names, weight_values):
+      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def load_weights_from_hdf5_group(f, layers):
+  """Implements topological (order-based) weight loading.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  filtered_layers = []
+  for layer in layers:
+    weights = layer.weights
+    if weights:
+      filtered_layers.append(layer)
+
+  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
+  filtered_layer_names = []
+  for name in layer_names:
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    if weight_names:
+      filtered_layer_names.append(name)
+  layer_names = filtered_layer_names
+  if len(layer_names) != len(filtered_layers):
+    raise ValueError('You are trying to load a weight file '
+                     'containing ' + str(len(layer_names)) +
+                     ' layers into a model with ' + str(len(filtered_layers)) +
+                     ' layers.')
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    weight_values = [g[weight_name] for weight_name in weight_names]
+    layer = filtered_layers[k]
+    symbolic_weights = layer.weights
+    weight_values = preprocess_weights_for_loading(
+        layer, weight_values, original_keras_version, original_backend)
+    if len(weight_values) != len(symbolic_weights):
+      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                       '" in the current model) was found to '
+                       'correspond to layer ' + name + ' in the save file. '
+                       'However the new layer ' + layer.name + ' expects ' +
+                       str(len(symbolic_weights)) +
+                       ' weights, but the saved weights have ' +
+                       str(len(weight_values)) + ' elements.')
+    weight_value_tuples += zip(symbolic_weights, weight_values)
+  K.batch_set_value(weight_value_tuples)
+
+
+def load_weights_from_hdf5_group_by_name(f, layers):
+  """Implements name-based weight loading.
+
+  (instead of topological weight loading).
+
+  Layers that have no matching name are skipped.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  # New file format.
+  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
+
+  # Reverse index of layer name to list of layers with name.
+  index = {}
+  for layer in layers:
+    if layer.name:
+      index.setdefault(layer.name, []).append(layer)
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    weight_values = [g[weight_name] for weight_name in weight_names]
+
+    for layer in index.get(name, []):
+      symbolic_weights = layer.weights
+      weight_values = preprocess_weights_for_loading(
+          layer, weight_values, original_keras_version, original_backend)
+      if len(weight_values) != len(symbolic_weights):
+        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                         '") expects ' + str(len(symbolic_weights)) +
+                         ' weight(s), but the saved weights' + ' have ' +
+                         str(len(weight_values)) + ' element(s).')
+      # Set values.
+      for i in range(len(weight_values)):
+        weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
+  K.batch_set_value(weight_value_tuples)
+
+
+def save_attributes_to_hdf5_group(group, name, data):
+  """Saves attributes (data) of the specified name into the HDF5 group.
+
+  This method deals with an inherent problem of HDF5 file which is not
+  able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+  Arguments:
+      group: A pointer to a HDF5 group.
+      name: A name of the attributes to save.
+      data: Attributes data to store.
+
+  Raises:
+    RuntimeError: If any single attribute is too large to be saved.
+  """
+  # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
+  # because in that case even chunking the array would not make the saving
+  # possible.
+  bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
+
+  # Expecting this to never be true.
+  if bad_attributes:
+    raise RuntimeError('The following attributes cannot be saved to HDF5 '
+                       'file because they are larger than %d bytes: %s' %
+                       (HDF5_OBJECT_HEADER_LIMIT,
+                        ', '.join([x for x in bad_attributes])))
+
+  data_npy = np.asarray(data)
+
+  num_chunks = 1
+  chunked_data = np.array_split(data_npy, num_chunks)
+
+  # This will never loop forever thanks to the test above.
+  while any([x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data]):
+    num_chunks += 1
+    chunked_data = np.array_split(data_npy, num_chunks)
+
+  if num_chunks > 1:
+    for chunk_id, chunk_data in enumerate(chunked_data):
+      group.attrs['%s%d' % (name, chunk_id)] = chunk_data
+  else:
+    group.attrs[name] = data
+
+
+def load_attributes_from_hdf5_group(group, name):
+  """Loads attributes of the specified name from the HDF5 group.
+
+  This method deals with an inherent problem
+  of HDF5 file which is not able to store
+  data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+  Arguments:
+      group: A pointer to a HDF5 group.
+      name: A name of the attributes to load.
+
+  Returns:
+      data: Attributes data.
+  """
+  if name in group.attrs:
+    data = [n.decode('utf8') for n in group.attrs[name]]
+  else:
+    data = []
+    chunk_id = 0
+    while '%s%d' % (name, chunk_id) in group.attrs:
+      data.extend(
+          [n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
+      chunk_id += 1
+  return data
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..709a8e9fb1e1baef91ee9319ce9dba681d707e06
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -0,0 +1,705 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#,============================================================================
+"""Tests for model saving."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import training as training_module
+
+try:
+  import h5py  # pylint:disable=g-import-not-at-top
+except ImportError:
+  h5py = None
+
+
+class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
+
+  def test_weight_loading(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3)(a)
+      b = keras.layers.Dense(1)(x)
+      model = keras.models.Model(a, b)
+
+      x = np.random.random((3, 2))
+      ref_y = model.predict(x)
+      weights = model.get_weights()
+      model.set_weights(weights)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
+      with self.assertRaises(ValueError):
+        model.set_weights(weights[1:])
+      with self.assertRaises(ValueError):
+        model.set_weights(weights[::-1])
+
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      no_extension_path = os.path.join(temp_dir, 'test')
+      model.save_weights(no_extension_path, save_format='tf')
+      model.load_weights(no_extension_path)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
+      if h5py is None:
+        return  # Skip rest of test if H5py isn't available.
+
+      h5_path = os.path.join(temp_dir, 'test.h5')
+      model.save_weights(h5_path)
+      model.load_weights(h5_path)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
+      model.load_weights(h5_path, by_name=True)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
+      model.save_weights(no_extension_path, save_format='hdf5')
+      model.load_weights(no_extension_path)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
+  def test_weight_preprocessing(self):
+    input_dim = 3
+    output_dim = 3
+    size = 2
+    cases = [
+        [
+            (keras.layers.Bidirectional(keras.layers.SimpleRNN(2))),
+            [np.random.random((2, 1)), np.random.random((2, 1))],
+            (None, 3, 2),
+        ],
+        [
+            (keras.layers.TimeDistributed(keras.layers.Dense(1))),
+            [np.random.random((2, 1)), np.random.random((1,))],
+            (None, 3, 2),
+        ],
+        [
+            (keras.layers.Conv1D(output_dim, size, use_bias=False)),
+            [np.random.random((output_dim, input_dim, size, 1))],
+            (None, 4, input_dim),
+        ],
+        [
+            (keras.layers.Conv2D(output_dim, size,
+                                 use_bias=False, data_format='channels_first')),
+            [np.random.random((output_dim, input_dim, size, size))],
+            (None, input_dim, 4, 4),
+        ],
+        [
+            (keras.layers.Conv2DTranspose(output_dim, size,
+                                          use_bias=False,
+                                          data_format='channels_first')),
+            [np.random.random((output_dim, input_dim, size, size))],
+            (None, input_dim, 4, 4),
+        ],
+        [
+            (keras.layers.Conv2DTranspose(output_dim, size,
+                                          use_bias=False,
+                                          data_format='channels_last')),
+            [np.random.random((size, size, input_dim, output_dim))],
+            (None, 4, 4, input_dim),
+        ],
+        [
+            (keras.layers.Conv3D(output_dim, size,
+                                 use_bias=False, data_format='channels_first')),
+            [np.random.random((output_dim, input_dim, size, size, size))],
+            (None, input_dim, 4, 4, 4),
+        ],
+        [
+            (keras.layers.GRU(output_dim)),
+            [np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,)),
+             np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,)),
+             np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,))],
+            (None, 4, input_dim),
+        ],
+        [
+            (keras.layers.LSTM(output_dim)),
+            [np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,)),
+             np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,)),
+             np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,)),
+             np.random.random((input_dim, output_dim)),
+             np.random.random((output_dim, output_dim)),
+             np.random.random((output_dim,))],
+            (None, 4, input_dim),
+        ],
+    ]
+    for layer, weights, input_shape in cases:
+      layer.build(input_shape)
+      _ = keras.engine.saving.preprocess_weights_for_loading(
+          layer, weights, original_keras_version='1')
+
+    model = keras.models.Sequential([keras.layers.Dense(2, input_dim=2)])
+    _ = keras.engine.saving.preprocess_weights_for_loading(
+        model, model.weights, original_keras_version='1')
+
+    x = keras.Input((2,))
+    y = keras.layers.Dense(2)(x)
+    model = keras.models.Model(x, y)
+    _ = keras.engine.saving.preprocess_weights_for_loading(
+        model, model.weights, original_keras_version='1')
+
+  @parameterized.named_parameters(
+      ('gru', keras.layers.GRU, {
+          'units': 2,
+          'input_shape': (3, 5)
+      }),
+      ('gru_with_reset_after', keras.layers.GRU, {
+          'units': 2,
+          'input_shape': (3, 5),
+          'reset_after': True
+      }),
+      ('lstm', keras.layers.LSTM, {
+          'units': 2,
+          'input_shape': (3, 5)
+      }),
+      ('cudnngru', keras.layers.CuDNNGRU, {
+          'units': 2,
+          'input_shape': (3, 5)
+      }),
+      ('cudnnlstm', keras.layers.CuDNNLSTM, {
+          'units': 2,
+          'input_shape': (3, 5)
+      }))
+  def test_preprocess_weights_for_loading_rnn_should_be_idempotent(
+      self, layer_class, layer_args):
+    with self.test_session():
+      layer = layer_class(**layer_args)
+      layer.build(input_shape=layer_args.get('input_shape'))
+      weights1 = layer.get_weights()
+      weights2 = keras.engine.saving.preprocess_weights_for_loading(
+          layer, weights1)
+      _ = [
+          self.assertAllClose(x, y, rtol=1e-05)
+          for (x, y) in zip(weights1, weights2)
+      ]
+
+  def test_sequential_weight_loading(self):
+    if h5py is None:
+      return
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
+
+    num_hidden = 5
+    input_dim = 3
+    batch_size = 5
+    num_classes = 2
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+      model.add(keras.layers.Dense(num_classes))
+
+      x = np.random.random((batch_size, input_dim))
+      ref_y = model.predict(x)
+
+      model.save_weights(h5_path)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+      model.add(keras.layers.Dense(num_classes))
+      model.load_weights(h5_path)
+      y = model.predict(x)
+
+      self.assertAllClose(y, ref_y)
+
+
+class TestWholeModelSaving(test.TestCase):
+
+  def test_sequential_model_saving(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model.compile(loss=keras.losses.MSE,
+                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
+                    metrics=[keras.metrics.categorical_accuracy],
+                    sample_weight_mode='temporal')
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+
+      new_model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+      out2 = new_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+      # test that new updates are the same with both models
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+      new_model.train_on_batch(x, y)
+      out = model.predict(x)
+      out2 = new_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+  def test_sequential_model_saving_2(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      # test with custom optimizer, loss
+
+      class CustomOp(keras.optimizers.RMSprop):
+        pass
+
+      def custom_loss(y_true, y_pred):
+        return keras.losses.mse(y_true, y_pred)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss=custom_loss, optimizer=CustomOp(), metrics=['acc'])
+
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+
+      model = keras.models.load_model(
+          fname,
+          custom_objects={'CustomOp': CustomOp,
+                          'custom_loss': custom_loss})
+      os.close(fd)
+      os.remove(fname)
+
+      out2 = model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+  def test_functional_model_saving(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      output = keras.layers.Dense(3)(x)
+
+      model = keras.models.Model(inputs, output)
+      model.compile(loss=keras.losses.MSE,
+                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
+                    metrics=[keras.metrics.categorical_accuracy])
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+
+      model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+      out2 = model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+  def test_saving_without_compilation(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+  def test_saving_with_tf_optimizer(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss='mse',
+                    optimizer=training_module.AdadeltaOptimizer(0.1),
+                    metrics=['acc'])
+
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+  def test_saving_right_after_compilation(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+      model._make_train_function()
+
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+  def test_saving_lambda_numpy_array_arguments(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    mean = np.random.random((4, 2, 3))
+    std = np.abs(np.random.random((4, 2, 3))) + 1e-5
+    inputs = keras.layers.Input(shape=(4, 2, 3))
+    output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std,
+                                 arguments={'mu': mean, 'std': std})(inputs)
+    model = keras.models.Model(inputs, output)
+    model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+
+    fd, fname = tempfile.mkstemp('.h5')
+    keras.models.save_model(model, fname)
+
+    model = keras.models.load_model(fname)
+    os.close(fd)
+    os.remove(fname)
+
+    self.assertAllClose(mean, model.layers[1].arguments['mu'])
+    self.assertAllClose(std, model.layers[1].arguments['std'])
+
+  def test_saving_model_with_long_layer_names(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      # This layer name will make the `layers_name` HDF5 attribute blow
+      # out of proportion. Note that it fits into the internal HDF5
+      # attribute memory limit on its own but because h5py converts
+      # the list of layer names into numpy array, which uses the same
+      # amout of memory for every item, it increases the memory
+      # requirements substantially.
+      x = keras.Input(shape=(2,), name='input_' + ('x' * (2**15)))
+      f = x
+      for i in range(4):
+        f = keras.layers.Dense(2, name='dense_%d' % (i,))(f)
+      model = keras.Model(inputs=[x], outputs=[f])
+      model.compile(loss='mse', optimizer='adam', metrics=['acc'])
+
+      x = np.random.random((1, 2))
+      y = np.random.random((1, 2))
+      model.train_on_batch(x, y)
+      out = model.predict(x)
+
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+
+      # Check that the HDF5 files contains chunked array
+      # of layer names.
+      with h5py.File(fname, 'r') as h5file:
+        num_names_arrays = len([attr for attr in h5file['model_weights'].attrs
+                                if attr.startswith('layer_names')])
+      # The chunking of layer names array should have happend.
+      self.assertGreater(num_names_arrays, 0)
+      out2 = model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+      # Cleanup
+      os.close(fd)
+      os.remove(fname)
+
+  def test_saving_model_with_long_weights_names(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      x = keras.Input(shape=(2,), name='nested_model_input')
+      f = x
+      for i in range(4):
+        f = keras.layers.Dense(2, name='nested_model_dense_%d' % (i,))(f)
+      # This layer name will make the `weights_name`
+      # HDF5 attribute blow out of proportion.
+      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**14)))(f)
+      nested_model = keras.Model(inputs=[x], outputs=[f], name='nested_model')
+
+      x = keras.Input(shape=(2,), name='outer_model_input')
+      f = nested_model(x)
+      f = keras.layers.Dense(2, name='outer_model_output')(f)
+
+      model = keras.Model(inputs=[x], outputs=[f])
+      model.compile(loss='mse', optimizer='adam', metrics=['acc'])
+
+      x = np.random.random((1, 2))
+      y = np.random.random((1, 2))
+      model.train_on_batch(x, y)
+      out = model.predict(x)
+
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+
+      # Check that the HDF5 files contains chunked array
+      # of weight names.
+      with h5py.File(fname, 'r') as h5file:
+        num_weight_arrays = len(
+            [attr for attr in h5file['model_weights']['nested_model'].attrs
+             if attr.startswith('weight_names')])
+      # The chunking of layer names array should have happend.
+      self.assertGreater(num_weight_arrays, 0)
+      out2 = model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+      # Cleanup
+      os.close(fd)
+      os.remove(fname)
+
+
+class SubclassedModel(training.Model):
+
+  def __init__(self):
+    super(SubclassedModel, self).__init__()
+    self.x_layer = keras.layers.Dense(3)
+    self.b_layer = keras.layers.Dense(1)
+
+  def call(self, a):
+    return self.b_layer(self.x_layer(a))
+
+
+class TestWeightSavingAndLoadingTFFormat(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_tensorflow_format_overwrite(self):
+    with self.test_session() as session:
+      model = SubclassedModel()
+      temp_dir = self.get_temp_dir()
+      prefix = os.path.join(temp_dir, 'ckpt')
+
+      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+      executing_eagerly = context.executing_eagerly()
+      model(x)  # pylint: disable=not-callable
+      if not executing_eagerly:
+        session.run([v.initializer for v in model.variables])
+      model.save_weights(prefix, save_format='tensorflow')
+      model.save_weights(prefix, save_format='tensorflow', overwrite=True)
+      with self.assertRaises(EOFError):
+        # Indirectly tests that the user is prompted
+        model.save_weights(prefix, save_format='tensorflow', overwrite=False)
+
+  def test_no_graph_pollution(self):
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph) as session:
+        model = SubclassedModel()
+        temp_dir = self.get_temp_dir()
+        prefix = os.path.join(temp_dir, 'ckpt')
+
+        x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+        model(x)  # pylint: disable=not-callable
+        session.run([v.initializer for v in model.variables])
+        model.save_weights(prefix, save_format='tensorflow')
+        op_count = len(graph.get_operations())
+        model.save_weights(prefix, save_format='tensorflow')
+        self.assertEqual(len(graph.get_operations()), op_count)
+
+        model.load_weights(prefix)
+        op_count = len(graph.get_operations())
+        model.load_weights(prefix)
+        self.assertEqual(len(graph.get_operations()), op_count)
+
+  def _weight_loading_test_template(self, make_model_fn):
+    with self.test_session() as session:
+      model = make_model_fn()
+      temp_dir = self.get_temp_dir()
+      prefix = os.path.join(temp_dir, 'ckpt')
+
+      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+      executing_eagerly = context.executing_eagerly()
+      ref_y_tensor = model(x)
+      if not executing_eagerly:
+        session.run([v.initializer for v in model.variables])
+      ref_y = self.evaluate(ref_y_tensor)
+      model.save_weights(prefix, save_format='tf')
+      for v in model.variables:
+        self.evaluate(
+            v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
+
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      model.load_weights(prefix)
+      y = self.evaluate(model(x))
+      self.assertAllClose(ref_y, y)
+
+      # Test restore-on-create if this is a subclassed Model (graph Networks
+      # will have already created their variables).
+      load_model = make_model_fn()
+      load_model.load_weights(prefix)
+      restore_on_create_y_tensor = load_model(x)
+      restore_on_create_y = self.evaluate(restore_on_create_y_tensor)
+      self.assertAllClose(ref_y, restore_on_create_y)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_graph_model(self):
+    def _make_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3)(a)
+      b = keras.layers.Dense(1)(x)
+      return keras.models.Model(a, b)
+
+    self._weight_loading_test_template(_make_graph_model)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_subclassed_model(self):
+    self._weight_loading_test_template(SubclassedModel)
+
+  def _new_layer_weight_loading_test_template(
+      self, first_model_fn, second_model_fn, restore_init_fn):
+    with self.test_session() as session:
+      model = first_model_fn()
+      temp_dir = self.get_temp_dir()
+      prefix = os.path.join(temp_dir, 'ckpt')
+
+      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+      executing_eagerly = context.executing_eagerly()
+      ref_y_tensor = model(x)
+      if not executing_eagerly:
+        session.run([v.initializer for v in model.variables])
+      ref_y = self.evaluate(ref_y_tensor)
+      model.save_weights(prefix)
+      for v in model.variables:
+        self.evaluate(
+            v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
+
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      second_model = second_model_fn()
+      second_model.load_weights(prefix)
+      second_model(x)
+      self.evaluate(restore_init_fn(second_model))
+      second_model.save_weights(prefix)
+      # Check that the second model's checkpoint loads into the original model
+      model.load_weights(prefix)
+      y = self.evaluate(model(x))
+      self.assertAllClose(ref_y, y)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_graph_model_added_layer(self):
+    def _save_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      b = keras.layers.Dense(1, name='second')(x)
+      return keras.models.Model(a, b)
+    def _restore_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      y = keras.layers.Dense(1, name='second')(x)
+      b = keras.layers.Dense(3, name='secondjr')(y)
+      return keras.models.Model(a, b)
+    def _restore_init_fn(restore_model):
+      return [v.initializer for v in restore_model.layers[-1].variables]
+
+    self._new_layer_weight_loading_test_template(
+        _save_graph_model, _restore_graph_model,
+        _restore_init_fn)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_graph_model_added_no_weight_layer(self):
+    def _save_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      b = keras.layers.Dense(1, name='second')(x)
+      return keras.models.Model(a, b)
+    def _restore_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      y = keras.layers.Dropout(rate=0.1)(x)
+      b = keras.layers.Dense(1, name='second')(y)
+      return keras.models.Model(a, b)
+    def _restore_init_fn(restore_model):
+      del restore_model  # unused
+      return []
+
+    self._new_layer_weight_loading_test_template(
+        _save_graph_model, _restore_graph_model,
+        _restore_init_fn)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_subclassed_model_added_layer(self):
+
+    class SubclassedModelRestore(training.Model):
+
+      def __init__(self):
+        super(SubclassedModelRestore, self).__init__()
+        self.x_layer = keras.layers.Dense(3)
+        self.y_layer = keras.layers.Dense(3)
+        self.b_layer = keras.layers.Dense(1)
+
+      def call(self, a):
+        return self.b_layer(self.y_layer(self.x_layer(a)))
+
+    def _restore_init_fn(restore_model):
+      return [v.initializer for v in restore_model.y_layer.variables]
+
+    self._new_layer_weight_loading_test_template(
+        SubclassedModel, SubclassedModelRestore,
+        _restore_init_fn)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/_impl/keras/engine/sequential.py
new file mode 100644
index 0000000000000000000000000000000000000000..8626626ca1a232de175af355e317f7df704fe148
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential.py
@@ -0,0 +1,288 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Home of the `Sequential` model.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import layers as layer_module
+from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.keras._impl.keras.engine import network
+from tensorflow.python.keras._impl.keras.engine.input_layer import Input
+from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras._impl.keras.engine.training import Model
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.models.Sequential', 'keras.Sequential')
+class Sequential(Model):
+  """Linear stack of layers.
+
+  Arguments:
+      layers: list of layers to add to the model.
+
+  Example:
+
+  ```python
+  # Optionally, the first layer can receive an `input_shape` argument:
+  model = Sequential()
+  model.add(Dense(32, input_shape=(500,)))
+  # Afterwards, we do automatic shape inference:
+  model.add(Dense(32))
+
+  # This is identical to the following:
+  model = Sequential()
+  model.add(Dense(32, input_dim=500))
+
+  # And to the following:
+  model = Sequential()
+  model.add(Dense(32, batch_input_shape=(None, 500)))
+
+  # Note that you can also omit the `input_shape` argument:
+  # In that case the model gets built the first time you call `fit` (or other
+  # training and evaluation methods).
+  model = Sequential()
+  model.add(Dense(32))
+  model.add(Dense(32))
+  model.compile(optimizer=optimizer, loss=loss)
+  # This builds the model for the first time:
+  model.fit(x, y, batch_size=32, epochs=10)
+
+  # Note that when using this delayed-build pattern (no input shape specified),
+  # the model doesn't have any weights until the first call
+  # to a training/evaluation method (since it isn't yet built):
+  model = Sequential()
+  model.add(Dense(32))
+  model.add(Dense(32))
+  model.weights  # returns []
+
+  # Whereas if you specify the input shape, the model gets built continuously
+  # as you are adding layers:
+  model = Sequential()
+  model.add(Dense(32, input_shape=(500,)))
+  model.add(Dense(32))
+  model.weights  # returns list of length 4
+
+  When using the delayed-build pattern (no input shape specified), you can
+  choose to manually build your model by calling `build(batch_input_shape)`:
+  model = Sequential()
+  model.add(Dense(32))
+  model.add(Dense(32))
+  model.build((None, 500))
+  model.weights  # returns list of length 4
+  ```
+  """
+
+  def __init__(self, layers=None, name=None):
+    super(Sequential, self).__init__(name=name)
+
+    # Add to the model any layers passed to the constructor.
+    if layers:
+      for layer in layers:
+        self.add(layer)
+
+  @property
+  def layers(self):
+    # Historically, `sequential.layers` only returns layers that were added
+    # via `add`, and omits the auto-generated `InputLayer` that comes at the
+    # bottom of the stack.
+    if self._layers and isinstance(self._layers[0], InputLayer):
+      return self._layers[1:]
+    return self._layers
+
+  def add(self, layer):
+    """Adds a layer instance on top of the layer stack.
+
+    Arguments:
+        layer: layer instance.
+
+    Raises:
+        TypeError: If `layer` is not a layer instance.
+        ValueError: In case the `layer` argument does not
+            know its input shape.
+        ValueError: In case the `layer` argument has
+            multiple output tensors, or is already connected
+            somewhere else (forbidden in `Sequential` models).
+    """
+    if not isinstance(layer, base_layer.Layer):
+      raise TypeError('The added layer must be '
+                      'an instance of class Layer. '
+                      'Found: ' + str(layer))
+    self.built = False
+    if not self._layers:
+      set_inputs = False
+      # First layer in model: check that it is an input layer.
+      if not isinstance(layer, InputLayer):
+        # Create an input tensor and call `layer` on the input tensor.
+        # First, we need to infer the expected input shape and dtype.
+        first_layer = layer
+        if isinstance(layer, (Model, Sequential)):
+          # We were passed a model as first layer.
+          # This requires a specific way to figure out the
+          # input shape and dtype.
+          if not layer.layers:
+            raise ValueError('Cannot add an empty model '
+                             'to a `Sequential` model.')
+          # In case of nested models: recover the first layer
+          # of the deepest model to infer input shape and dtype.
+          first_layer = layer.layers[0]
+          while isinstance(first_layer, (Model, Sequential)):
+            first_layer = first_layer.layers[0]
+          batch_shape = first_layer._batch_input_shape
+          dtype = first_layer.dtype
+
+        if hasattr(first_layer, '_batch_input_shape'):
+          batch_shape = first_layer._batch_input_shape
+          dtype = first_layer.dtype
+          # Instantiate the input layer.
+          x = Input(
+              batch_shape=batch_shape,
+              dtype=dtype,
+              name=layer.name + '_input')
+          # This will build the current layer
+          # and create the node connecting the current layer
+          # to the input layer we just created.
+          layer(x)
+          set_inputs = True
+        else:
+          # The layer doesn't know about its expected shape. We will have to
+          # build the model lazily on `fit`/etc.
+          batch_shape = None
+      else:
+        # Corner case where the user passes an InputLayer layer via `add`.
+        assert len(layer._inbound_nodes[-1].output_tensors) == 1
+        set_inputs = True
+
+      if set_inputs:
+        if len(layer._inbound_nodes[-1].output_tensors) != 1:
+          raise ValueError('All layers in a Sequential model '
+                           'should have a single output tensor. '
+                           'For multi-output layers, '
+                           'use the functional API.')
+
+        self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
+        self.inputs = network.get_source_inputs(self.outputs[0])
+    elif self.outputs:
+      output_tensor = layer(self.outputs[0])
+      if isinstance(output_tensor, list):
+        raise TypeError('All layers in a Sequential model '
+                        'should have a single output tensor. '
+                        'For multi-output layers, '
+                        'use the functional API.')
+      self.outputs = [output_tensor]
+    if self.inputs:
+      self.build()
+    else:
+      self._layers.append(layer)
+
+  def pop(self):
+    """Removes the last layer in the model.
+
+    Raises:
+        TypeError: if there are no layers in the model.
+    """
+    if not self.layers:
+      raise TypeError('There are no layers in the model.')
+
+    self._layers.pop()
+    self.built = False
+    if not self.layers:
+      self.outputs = None
+      self.inputs = None
+    elif self.outputs:
+      self.layers[-1]._outbound_nodes = []
+      self.outputs = [self.layers[-1].output]
+      self.build()
+
+  def build(self, input_shape=None):
+    if input_shape and not self.inputs:
+      batch_shape = tuple(input_shape)
+      dtype = K.floatx()
+      x = Input(
+          batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
+      self.inputs = [x]
+      for layer in self._layers:
+        x = layer(x)
+      self.outputs = [x]
+
+    if self.inputs:
+      self._init_graph_network(self.inputs, self.outputs, name=self.name)
+      self.built = True
+    self._track_layers(self._layers)
+
+  def predict_proba(self, x, batch_size=32, verbose=0):
+    """Generates class probability predictions for the input samples.
+
+    The input samples are processed batch by batch.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        batch_size: integer.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        A Numpy array of probability predictions.
+    """
+    preds = self.predict(x, batch_size, verbose)
+    if preds.min() < 0. or preds.max() > 1.:
+      logging.warning('Network returning invalid probability values. '
+                      'The last layer might not normalize predictions '
+                      'into probabilities '
+                      '(like softmax or sigmoid would).')
+    return preds
+
+  def predict_classes(self, x, batch_size=32, verbose=0):
+    """Generate class predictions for the input samples.
+
+    The input samples are processed batch by batch.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        batch_size: integer.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        A numpy array of class predictions.
+    """
+    proba = self.predict(x, batch_size=batch_size, verbose=verbose)
+    if proba.shape[-1] > 1:
+      return proba.argmax(axis=-1)
+    else:
+      return (proba > 0.5).astype('int32')
+
+  def get_config(self):
+    config = []
+    for layer in self.layers:
+      config.append({
+          'class_name': layer.__class__.__name__,
+          'config': layer.get_config()
+      })
+    return copy.deepcopy(config)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    model = cls()
+    for conf in config:
+      layer = layer_module.deserialize(conf, custom_objects=custom_objects)
+      model.add(layer)
+    return model
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aba16aef3e187e9e33bdb65c7d44b0e622730ef
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
@@ -0,0 +1,177 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests specific to `Sequential` model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
+
+
+class TestSequential(test.TestCase):
+  """Most Sequential model API tests are covered in `training_test.py`.
+  """
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_basic_methods(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_dim=2))
+    model.add(keras.layers.Dropout(0.3, name='dp'))
+    model.add(keras.layers.Dense(2, kernel_regularizer='l2',
+                                 kernel_constraint='max_norm'))
+    self.assertEqual(len(model.layers), 3)
+    self.assertEqual(len(model.weights), 2 * 2)
+    self.assertEqual(model.get_layer(name='dp').name, 'dp')
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_sequential_pop(self):
+    num_hidden = 5
+    input_dim = 3
+    batch_size = 5
+    num_classes = 2
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+    model.add(keras.layers.Dense(num_classes))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    x = np.random.random((batch_size, input_dim))
+    y = np.random.random((batch_size, num_classes))
+    model.fit(x, y, epochs=1)
+    model.pop()
+    self.assertEqual(len(model.layers), 1)
+    self.assertEqual(model.output_shape, (None, num_hidden))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    y = np.random.random((batch_size, num_hidden))
+    model.fit(x, y, epochs=1)
+
+    # Test popping single-layer model
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+    model.pop()
+    self.assertEqual(model.layers, [])
+    self.assertEqual(model.outputs, None)
+
+    # Invalid use case
+    model = keras.models.Sequential()
+    with self.assertRaises(TypeError):
+      model.pop()
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_sequential_deferred_build(self):
+    num_hidden = 5
+    input_dim = 3
+    batch_size = 5
+    num_classes = 2
+
+    model = keras.models.Sequential()
+    # We don't specify the input shape.
+    model.add(keras.layers.Dense(num_hidden))
+    model.add(keras.layers.Dense(num_classes))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    self.assertEqual(len(model.layers), 2)
+    self.assertEqual(len(model.weights), 0)
+    self.assertFalse(model.built)
+
+    x = np.random.random((batch_size, input_dim))
+    y = np.random.random((batch_size, num_classes))
+    model.fit(x, y, epochs=1)
+    self.assertTrue(model.built)
+    self.assertEqual(model.inputs[0].get_shape().as_list(), [None, input_dim])
+    self.assertEqual(model.outputs[0].get_shape().as_list(),
+                     [None, num_classes])
+    self.assertEqual(len(model.weights), 2 * 2)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_invalid_use_cases(self):
+    # Added objects must be layer instances
+    with self.assertRaises(TypeError):
+      model = keras.models.Sequential()
+      model.add(None)
+
+    # Added layers cannot have multiple outputs
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return [3 * inputs, 2 * inputs]
+
+      def compute_output_shape(self, input_shape):
+        return [input_shape, input_shape]
+
+    with self.assertRaises(ValueError):
+      model = keras.models.Sequential()
+      model.add(MyLayer(input_shape=(3,)))
+    with self.assertRaises(TypeError):
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(1, input_dim=1))
+      model.add(MyLayer())
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_nested_sequential_trainability(self):
+    input_dim = 20
+    num_units = 10
+    num_classes = 2
+
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(num_units, input_shape=(input_dim,)))
+
+    model = keras.models.Sequential()
+    model.add(inner_model)
+    model.add(keras.layers.Dense(num_classes))
+
+    self.assertEqual(len(model.layers), 2)
+
+    self.assertEqual(len(model.trainable_weights), 4)
+    inner_model.trainable = False
+    self.assertEqual(len(model.trainable_weights), 2)
+    inner_model.trainable = True
+    self.assertEqual(len(model.trainable_weights), 4)
+
+  def test_sequential_update_disabling(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+      assert model.updates
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
deleted file mode 100644
index 7de5af41c5e04e046e7d6798706f630374d5640f..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ /dev/null
@@ -1,1692 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=protected-access
-"""Base layer code and base model (Network) code.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import copy
-import json
-import os
-
-import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.layers import network as tf_network
-from tensorflow.python.layers import utils as tf_layers_util
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
-
-
-# pylint: disable=g-import-not-at-top
-try:
-  import h5py
-except ImportError:
-  h5py = None
-
-try:
-  import yaml
-except ImportError:
-  yaml = None
-# pylint: enable=g-import-not-at-top
-
-# pylint: disable=invalid-name
-InputSpec = tf_base_layers.InputSpec
-Node = tf_base_layers.Node
-TFBaseLayer = tf_base_layers.Layer
-# pylint: enable=invalid-name
-
-
-@tf_export('keras.layers.Layer')
-class Layer(tf_base_layers.Layer):
-  """Abstract base layer class.
-
-  # Properties
-      name: String, must be unique within a model.
-      input_spec: List of InputSpec class instances
-          each entry describes one required input:
-              - ndim
-              - dtype
-          A layer with `n` input tensors must have
-          an `input_spec` of length `n`.
-      trainable: Boolean, whether the layer weights
-          will be updated during training.
-      uses_learning_phase: Whether any operation
-          of the layer uses `K.in_training_phase()`
-          or `K.in_test_phase()`.
-      input_shape: Shape tuple. Provided for convenience,
-          but note that there may be cases in which this
-          attribute is ill-defined (e.g. a shared layer
-          with multiple input shapes), in which case
-          requesting `input_shape` will raise an Exception.
-          Prefer using `layer.get_input_shape_for(input_shape)`,
-          or `layer.get_input_shape_at(node_index)`.
-      output_shape: Shape tuple. See above.
-      inbound_nodes: List of nodes.
-      outbound_nodes: List of nodes.
-      input, output: Input/output tensor(s). Note that if the layer is used
-          more than once (shared layer), this is ill-defined
-          and will raise an exception. In such cases, use
-          `layer.get_input_at(node_index)`.
-      input_mask, output_mask: Same as above, for masks.
-      trainable_weights: List of variables.
-      non_trainable_weights: List of variables.
-      weights: The concatenation of the lists trainable_weights and
-          non_trainable_weights (in this order).
-
-  # Methods
-      call(x, mask=None): Where the layer's logic lives.
-      __call__(x, mask=None): Wrapper around the layer logic (`call`).
-          If x is a Keras tensor:
-              - Connect current layer with last layer from tensor:
-                  `self._add_inbound_node(last_layer)`
-              - Add layer to tensor history
-          If layer is not built:
-              - Build from inputs shape
-      get_weights()
-      set_weights(weights)
-      get_config()
-      count_params()
-      compute_output_shape(input_shape)
-      compute_mask(x, mask)
-      get_input_at(node_index)
-      get_output_at(node_index)
-      get_input_shape_at(node_index)
-      get_output_shape_at(node_index)
-      get_input_mask_at(node_index)
-      get_output_mask_at(node_index)
-
-  # Class Methods
-      from_config(config)
-
-  # Internal methods:
-      build(input_shape)
-      _add_inbound_node(layer, index=0)
-  """
-
-  def __init__(self, **kwargs):
-    # These properties should be set by the user via keyword arguments.
-    # note that 'dtype', 'input_shape' and 'batch_input_shape'
-    # are only applicable to input layers: do not pass these keywords
-    # to non-input layers.
-    allowed_kwargs = {
-        'activity_regularizer',
-        'input_shape',
-        'batch_input_shape',
-        'batch_size',
-        'dtype',
-        'name',
-        'trainable',
-        'weights',
-    }
-    # Validate optional keyword arguments.
-    for kwarg in kwargs:
-      if kwarg not in allowed_kwargs:
-        raise TypeError('Keyword argument not understood:', kwarg)
-
-    # Get layer name.
-    name = kwargs.get('name')
-
-    # Get `trainable` status.
-    trainable = kwargs.get('trainable', True)
-
-    # Get `dtype`.
-    dtype = kwargs.get('dtype')
-    if dtype is None:
-      dtype = K.floatx()
-
-    # Call super, which will set all properties common to Keras layers
-    # and core TF layers.
-    super(Layer, self).__init__(
-        name=name, dtype=dtype, trainable=trainable,
-        activity_regularizer=kwargs.get('activity_regularizer'))
-
-    # Add properties that are Keras-only for now.
-    self.supports_masking = False
-
-    # Manage input shape information if passed.
-    if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
-      # In this case we will later create an input layer
-      # to insert before the current layer
-      if 'batch_input_shape' in kwargs:
-        batch_input_shape = tuple(kwargs['batch_input_shape'])
-      elif 'input_shape' in kwargs:
-        if 'batch_size' in kwargs:
-          batch_size = kwargs['batch_size']
-        else:
-          batch_size = None
-        batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
-      self._batch_input_shape = batch_input_shape
-
-    # Manage initial weight values if passed.
-    if 'weights' in kwargs:
-      self._initial_weights = kwargs['weights']
-    else:
-      self._initial_weights = None
-
-  def add_weight(self,
-                 name,
-                 shape,
-                 dtype=None,
-                 initializer=None,
-                 regularizer=None,
-                 trainable=True,
-                 constraint=None):
-    """Adds a weight variable to the layer.
-
-    Arguments:
-        name: String, the name for the weight variable.
-        shape: The shape tuple of the weight.
-        dtype: The dtype of the weight.
-        initializer: An Initializer instance (callable).
-        regularizer: An optional Regularizer instance.
-        trainable: A boolean, whether the weight should
-            be trained via backprop or not (assuming
-            that the layer itself is also trainable).
-        constraint: An optional Constraint instance.
-
-    Returns:
-        The created weight variable.
-    """
-    if dtype is None:
-      dtype = K.floatx()
-    weight = self.add_variable(name, shape,
-                               dtype=dtype,
-                               initializer=initializers.get(initializer),
-                               regularizer=regularizers.get(regularizer),
-                               constraint=constraints.get(constraint),
-                               trainable=trainable)
-    return weight
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """This is where the layer's logic lives.
-
-    Arguments:
-        inputs: Input tensor, or list/tuple of input tensors.
-        **kwargs: Additional keyword arguments.
-
-    Returns:
-        A tensor or list/tuple of tensors.
-    """
-    return inputs
-
-  def __call__(self, inputs, **kwargs):
-    """Wrapper around self.call(), for handling internal references.
-
-    If a Keras tensor is passed:
-        - We call self._add_inbound_node().
-        - If necessary, we `build` the layer to match
-            the shape of the input(s).
-        - We update the _keras_history of the output tensor(s)
-            with the current layer.
-            This is done as part of _add_inbound_node().
-
-    Arguments:
-        inputs: Can be a tensor or list/tuple of tensors.
-        **kwargs: Additional keyword arguments to be passed to `call()`.
-
-    Returns:
-        Output of the layer's `call` method.
-
-    Raises:
-        ValueError: in case the layer is missing shape information
-            for its `build` call.
-    """
-    # Actually call the layer (optionally building it).
-    output = super(Layer, self).__call__(inputs, **kwargs)
-    if context.in_eager_mode():
-      return output
-
-    # Un-built subclassed network: build it
-    if isinstance(self, Network) and not self.inputs:
-      self._set_inputs(inputs, training=kwargs.get('training'))
-
-    # Update learning phase info.
-    output_tensors = _to_list(output)
-    uses_lp = any(
-        [getattr(x, '_uses_learning_phase', False) for x in _to_list(inputs)])
-    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
-    for i in range(len(output_tensors)):
-      output_tensors[i]._uses_learning_phase = getattr(
-          output_tensors[i], '_uses_learning_phase', False) or uses_lp
-
-    # Optionally load weight values that were specified at layer instantiation.
-    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
-      self.set_weights(self._initial_weights)
-      del self._initial_weights
-    return output
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer.
-
-    Assumes that the layer will be built
-    to match that input shape provided.
-
-    Arguments:
-        input_shape: Shape tuple (tuple of integers)
-            or list of shape tuples (one per output tensor of the layer).
-            Shape tuples can include None for free dimensions,
-            instead of an integer.
-
-    Returns:
-        An input shape tuple.
-    """
-    logging.warning(
-        'All custom layers should implement the '
-        '`compute_output_shape` method. This layer (' + self.name + ') '
-        'is relying on the base `Layer.compute_output_shape` implementation, '
-        'which will start raising a `NotImplementedError` '
-        'as of July 1st, 2018.')
-    return input_shape
-
-  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
-    """Computes an output mask tensor.
-
-    Arguments:
-        inputs: Tensor or list of tensors.
-        mask: Tensor or list of tensors.
-
-    Returns:
-        None or a tensor (or list of tensors,
-            one per output tensor of the layer).
-    """
-    if not self.supports_masking:
-      if mask is not None:
-        if isinstance(mask, list):
-          if any(m is not None for m in mask):
-            raise TypeError('Layer ' + self.name + ' does not support masking, '
-                            'but was passed an input_mask: ' + str(mask))
-        else:
-          raise TypeError('Layer ' + self.name + ' does not support masking, '
-                          'but was passed an input_mask: ' + str(mask))
-      # masking not explicitly supported: return None as mask
-      return None
-    # if masking is explicitly supported, by default
-    # carry over the input mask
-    return mask
-
-  def get_input_mask_at(self, node_index):
-    """Retrieves the input mask tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A mask tensor
-        (or list of tensors if the layer has multiple inputs).
-    """
-    inputs = self.get_input_at(node_index)
-    if isinstance(inputs, list):
-      return [getattr(x, '_keras_mask', None) for x in inputs]
-    else:
-      return getattr(inputs, '_keras_mask', None)
-
-  def get_output_mask_at(self, node_index):
-    """Retrieves the output mask tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A mask tensor
-        (or list of tensors if the layer has multiple outputs).
-    """
-    output = self.get_output_at(node_index)
-    if isinstance(output, list):
-      return [getattr(x, '_keras_mask', None) for x in output]
-    else:
-      return getattr(output, '_keras_mask', None)
-
-  @property
-  def input_mask(self):
-    """Retrieves the input mask tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one inbound node,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-        Input mask tensor (potentially None) or list of input
-        mask tensors.
-
-    Raises:
-        AttributeError: if the layer is connected to
-        more than one incoming layers.
-    """
-    inputs = self.input
-    if isinstance(inputs, list):
-      return [getattr(x, '_keras_mask', None) for x in inputs]
-    else:
-      return getattr(inputs, '_keras_mask', None)
-
-  @property
-  def output_mask(self):
-    """Retrieves the output mask tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one inbound node,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-        Output mask tensor (potentially None) or list of output
-        mask tensors.
-
-    Raises:
-        AttributeError: if the layer is connected to
-        more than one incoming layers.
-    """
-    output = self.output
-    if isinstance(output, list):
-      return [getattr(x, '_keras_mask', None) for x in output]
-    else:
-      return getattr(output, '_keras_mask', None)
-
-  def set_weights(self, weights):
-    """Sets the weights of the layer, from Numpy arrays.
-
-    Arguments:
-        weights: a list of Numpy arrays. The number
-            of arrays and their shape must match
-            number of the dimensions of the weights
-            of the layer (i.e. it should match the
-            output of `get_weights`).
-
-    Raises:
-        ValueError: If the provided weights list does not match the
-            layer's specifications.
-    """
-    params = self.weights
-    if len(params) != len(weights):
-      raise ValueError('You called `set_weights(weights)` on layer "' +
-                       self.name + '" with a  weight list of length ' +
-                       str(len(weights)) + ', but the layer was expecting ' +
-                       str(len(params)) + ' weights. Provided weights: ' +
-                       str(weights)[:50] + '...')
-    if not params:
-      return
-    weight_value_tuples = []
-    param_values = K.batch_get_value(params)
-    for pv, p, w in zip(param_values, params, weights):
-      if pv.shape != w.shape:
-        raise ValueError('Layer weight shape ' + str(pv.shape) +
-                         ' not compatible with '
-                         'provided weight shape ' + str(w.shape))
-      weight_value_tuples.append((p, w))
-    K.batch_set_value(weight_value_tuples)
-
-  def get_weights(self):
-    """Returns the current weights of the layer.
-
-    Returns:
-        Weights values as a list of numpy arrays.
-    """
-    params = self.weights
-    return K.batch_get_value(params)
-
-  def get_config(self):
-    """Returns the config of the layer.
-
-    A layer config is a Python dictionary (serializable)
-    containing the configuration of a layer.
-    The same layer can be reinstantiated later
-    (without its trained weights) from this configuration.
-
-    The config of a layer does not include connectivity
-    information, nor the layer class name. These are handled
-    by `Network` (one layer of abstraction above).
-
-    Returns:
-        Python dictionary.
-    """
-    config = {'name': self.name, 'trainable': self.trainable}
-    if hasattr(self, '_batch_input_shape'):
-      config['batch_input_shape'] = self._batch_input_shape
-    if hasattr(self, 'dtype'):
-      config['dtype'] = self.dtype
-    return config
-
-  @classmethod
-  def from_config(cls, config):
-    """Creates a layer from its config.
-
-    This method is the reverse of `get_config`,
-    capable of instantiating the same layer from the config
-    dictionary. It does not handle layer connectivity
-    (handled by Network), nor weights (handled by `set_weights`).
-
-    Arguments:
-        config: A Python dictionary, typically the
-            output of get_config.
-
-    Returns:
-        A layer instance.
-    """
-    return cls(**config)
-
-  @tf_base_layers.Layer.activity_regularizer.setter
-  def activity_regularizer(self, activity_regularizer):
-    self._activity_regularizer = activity_regularizer
-
-
-@tf_export('keras.layers.InputLayer')
-class InputLayer(tf_network.InputLayer, Layer):
-  """Layer to be used as an entry point into a graph.
-
-  It can either wrap an existing tensor (pass an `input_tensor` argument)
-  or create its a placeholder tensor (pass argument `input_shape`.
-
-  Arguments:
-      input_shape: Shape tuple, not including the batch axis.
-      batch_size: Optional input batch size (integer or None).
-      dtype: Datatype of the input.
-      input_tensor: Optional tensor to use as layer input
-          instead of creating a placeholder.
-      sparse: Boolean, whether the placeholder created
-          is meant to be sparse.
-      name: Name of the layer (string).
-  """
-
-  def __init__(self,
-               input_shape=None,
-               batch_size=None,
-               dtype=None,
-               input_tensor=None,
-               sparse=False,
-               name=None,
-               **kwargs):
-    if 'batch_input_shape' in kwargs:
-      batch_input_shape = kwargs.pop('batch_input_shape')
-      if input_shape and batch_input_shape:
-        raise ValueError('Only provide the input_shape OR '
-                         'batch_input_shape argument to '
-                         'InputLayer, not both at the same time.')
-      batch_size = batch_input_shape[0]
-      input_shape = batch_input_shape[1:]
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments:', kwargs.keys())
-
-    if not name:
-      prefix = 'input'
-      name = prefix + '_' + str(K.get_uid(prefix))
-
-    if not dtype:
-      if input_tensor is None:
-        dtype = K.floatx()
-      else:
-        dtype = K.dtype(input_tensor)
-    super(InputLayer, self).__init__(input_shape=input_shape,
-                                     batch_size=batch_size,
-                                     dtype=dtype,
-                                     input_tensor=input_tensor,
-                                     sparse=sparse,
-                                     name=name)
-
-  def get_config(self):
-    config = {
-        'batch_input_shape': self._batch_input_shape,
-        'dtype': self.dtype,
-        'sparse': self.sparse,
-        'name': self.name
-    }
-    return config
-
-
-@tf_export('keras.layers.Input', 'keras.Input')
-def Input(  # pylint: disable=invalid-name
-    shape=None,
-    batch_size=None,
-    name=None,
-    dtype=None,
-    sparse=False,
-    tensor=None,
-    **kwargs):
-  """`Input()` is used to instantiate a Keras tensor.
-
-  A Keras tensor is a tensor object from the underlying backend
-  (Theano or TensorFlow), which we augment with certain
-  attributes that allow us to build a Keras model
-  just by knowing the inputs and outputs of the model.
-
-  For instance, if a, b and c are Keras tensors,
-  it becomes possible to do:
-  `model = Model(input=[a, b], output=c)`
-
-  The added Keras attribute is:
-      `_keras_history`: Last layer applied to the tensor.
-          the entire layer graph is retrievable from that layer,
-          recursively.
-
-  Arguments:
-      shape: A shape tuple (integers), not including the batch size.
-          For instance, `shape=(32,)` indicates that the expected input
-          will be batches of 32-dimensional vectors.
-      batch_size: optional static batch size (integer).
-      name: An optional name string for the layer.
-          Should be unique in a model (do not reuse the same name twice).
-          It will be autogenerated if it isn't provided.
-      dtype: The data type expected by the input, as a string
-          (`float32`, `float64`, `int32`...)
-      sparse: A boolean specifying whether the placeholder
-          to be created is sparse.
-      tensor: Optional existing tensor to wrap into the `Input` layer.
-          If set, the layer will not create a placeholder tensor.
-      **kwargs: deprecated arguments support.
-
-  Returns:
-      A tensor.
-
-  Example:
-
-      ```python
-      # this is a logistic regression in Keras
-      x = Input(shape=(32,))
-      y = Dense(16, activation='softmax')(x)
-      model = Model(x, y)
-      ```
-
-  Raises:
-    ValueError: in case of invalid arguments.
-  """
-  if 'batch_shape' in kwargs:
-    batch_shape = kwargs.pop('batch_shape')
-    if shape and batch_shape:
-      raise ValueError('Only provide the shape OR '
-                       'batch_shape argument to '
-                       'Input, not both at the same time.')
-    batch_size = batch_shape[0]
-    shape = batch_shape[1:]
-  if kwargs:
-    raise ValueError('Unrecognized keyword arguments:', kwargs.keys())
-
-  if dtype is None:
-    dtype = K.floatx()
-  if not shape and tensor is None:
-    raise ValueError('Please provide to Input either a `shape`'
-                     ' or a `tensor` argument. Note that '
-                     '`shape` does not include the batch '
-                     'dimension.')
-  input_layer = InputLayer(
-      input_shape=shape,
-      batch_size=batch_size,
-      name=name,
-      dtype=dtype,
-      sparse=sparse,
-      input_tensor=tensor)
-  # Return tensor including `_keras_history`.
-  # Note that in this case train_output and test_output are the same pointer.
-  outputs = input_layer._inbound_nodes[0].output_tensors
-  if len(outputs) == 1:
-    return outputs[0]
-  else:
-    return outputs
-
-
-class Network(tf_network.GraphNetwork, Layer):
-  """A Network is a directed acyclic graph of layers.
-
-  It is the topological form of a "model". A Model
-  is simply a Network with added training routines.
-
-  # Properties
-      name
-      inputs
-      outputs
-      input_layers
-      output_layers
-      input_spec (list of class instances)
-          each entry describes one required input:
-              - ndim
-              - dtype
-      trainable (boolean)
-      input_shape
-      output_shape
-      inbound_nodes: list of nodes
-      outbound_nodes: list of nodes
-      trainable_weights (list of variables)
-      non_trainable_weights (list of variables)
-
-  # Methods
-      summary
-      get_layer
-      get_weights
-      set_weights
-      get_config
-      compute_output_shape
-
-  # Class Methods
-      from_config
-  """
-
-  def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
-    # Signature detection
-    if (len(args) == 2 or
-        len(args) == 1 and 'outputs' in kwargs or
-        'inputs' in kwargs and 'outputs' in kwargs):
-      # Graph network
-      self._init_graph_network(*args, **kwargs)
-    else:
-      # Subclassed network
-      self._init_subclassed_network(**kwargs)
-
-  def _init_graph_network(self, inputs, outputs, name=None):
-    # TODO(fchollet): merge back tf.layers.Network and tf.keras.Network
-    # into a single class tf.keras.Network
-    super(Network, self).__init__(inputs, outputs, name=name)
-
-    self._is_compiled = False
-    self._expects_training_arg = False
-
-    self.supports_masking = False
-    self.optimizer = None
-
-    # Fill in the output mask cache.
-    masks = []
-    for x in self.inputs:
-      mask = x._keras_mask if hasattr(x, '_keras_mask') else None
-      masks.append(mask)
-    mask_cache_key = (tf_layers_util.object_list_uid(self.inputs) + '_' +
-                      tf_layers_util.object_list_uid(masks))
-    masks = []
-    for x in self.outputs:
-      mask = x._keras_mask if hasattr(x, '_keras_mask') else None
-      masks.append(mask)
-    if len(masks) == 1:
-      mask = masks[0]
-    else:
-      mask = masks
-    self._output_mask_cache[mask_cache_key] = mask
-
-    # Build self.input_names and self.output_names.
-    self.input_names = []
-    self.output_names = []
-    self._feed_input_names = []
-    self._feed_inputs = []
-    self._feed_input_shapes = []
-    for i, layer in enumerate(self._input_layers):
-      self.input_names.append(layer.name)
-      if layer.is_placeholder:
-        self._feed_input_names.append(layer.name)
-        self._feed_input_shapes.append(K.int_shape(self.inputs[i]))
-        # layer.input gives an error in eager mode
-        if context.in_graph_mode():
-          self._feed_inputs.append(layer.input)
-    for layer in self._output_layers:
-      self.output_names.append(layer.name)
-
-  def _init_subclassed_network(self, name=None):
-    self._init_set_name(name)
-    self._layers = []
-    self._is_graph_network = False
-    self._is_compiled = False
-    if 'training' in tf_inspect.getargspec(self.call).args:
-      self._expects_training_arg = True
-    else:
-      self._expects_training_arg = False
-
-    self.outputs = None
-    self.inputs = None
-    self.trainable = True
-    self.supports_masking = False
-    self.built = False
-    self.optimizer = None
-
-    # Not used, exists for compatibility purposes due to implementation of
-    # the base layer tf.layers.Layer - TODO(fchollet): clean up when refactoring
-    self._scope = None
-    self._reuse = None
-    self._dtype = None
-    self._graph = None
-    self._activity_regularizer = None
-
-    # Used in symbolic mode only
-    self._updates = []
-    self._losses = []
-
-    # Used in symbolic mode only, only in conjonction with graph-networks
-    self._outbound_nodes = []
-    self._inbound_nodes = []
-
-  def __setattr__(self, name, value):
-    if isinstance(value, (tf_base_layers.Layer, Network)):
-      try:
-        is_graph_network = self._is_graph_network
-      except AttributeError:
-        raise RuntimeError('It looks like you are subclassing `Model` and you '
-                           'forgot to call `super(YourClass, self).__init__()`.'
-                           ' Always start with this line.')
-      if not is_graph_network:
-        if value not in self._layers:
-          self._layers.append(value)
-    super(Network, self).__setattr__(name, value)
-
-  def add_variable(self, name, shape, dtype=None, initializer=None,
-                   regularizer=None, trainable=True, constraint=None):
-    raise NotImplementedError('`add_variable` is not supported on Networks')
-
-  def add_loss(self, *args, **kwargs):
-    if context.in_eager_mode():
-      raise NotImplementedError('`add_loss` is not supported in eager-mode '
-                                'on Networks')
-    super(Network, self).add_loss(*args, **kwargs)
-
-  @property
-  def uses_learning_phase(self):
-    return any(
-        [getattr(x, '_uses_learning_phase', False) for x in self.outputs])
-
-  @property
-  def stateful(self):
-    return any([(hasattr(layer, 'stateful') and layer.stateful)
-                for layer in self.layers])
-
-  def reset_states(self):
-    for layer in self.layers:
-      if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False):
-        layer.reset_states()
-
-  @property
-  def state_updates(self):
-    """Returns the `updates` from all layers that are stateful.
-
-    This is useful for separating training updates and
-    state updates, e.g. when we need to update a layer's internal state
-    during prediction.
-
-    Returns:
-        A list of update ops.
-    """
-    state_updates = []
-    for layer in self.layers:
-      if getattr(layer, 'stateful', False):
-        if hasattr(layer, 'updates'):
-          state_updates += layer.updates
-    return state_updates
-
-  def get_weights(self):
-    """Retrieves the weights of the model.
-
-    Returns:
-        A flat list of Numpy arrays.
-    """
-    weights = []
-    for layer in self.layers:
-      weights += layer.weights
-    return K.batch_get_value(weights)
-
-  def set_weights(self, weights):
-    """Sets the weights of the model.
-
-    Arguments:
-        weights: A list of Numpy arrays with shapes and types matching
-            the output of `model.get_weights()`.
-    """
-    tuples = []
-    for layer in self.layers:
-      num_param = len(layer.weights)
-      layer_weights = weights[:num_param]
-      for sw, w in zip(layer.weights, layer_weights):
-        tuples.append((sw, w))
-      weights = weights[num_param:]
-    K.batch_set_value(tuples)
-
-  def compute_mask(self, inputs, mask):
-    if not self._is_graph_network:
-      return None
-
-    inputs = _to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = _to_list(mask)
-    cache_key = (tf_layers_util.object_list_uid(inputs)
-                 + '_' + tf_layers_util.object_list_uid(masks))
-    if cache_key in self._output_mask_cache:
-      return self._output_mask_cache[cache_key]
-    else:
-      _, output_masks = self._run_internal_graph(inputs, masks)
-      return output_masks
-
-  def get_config(self):
-    if not self._is_graph_network:
-      raise NotImplementedError
-
-    config = {
-        'name': self.name,
-    }
-    node_conversion_map = {}
-    for layer in self.layers:
-      if issubclass(layer.__class__, Network):
-        # Networks start with a pre-existing node
-        # linking their input to output.
-        kept_nodes = 1
-      else:
-        kept_nodes = 0
-      for original_node_index, node in enumerate(layer._inbound_nodes):
-        node_key = tf_network._make_node_key(layer.name,
-                                             original_node_index)
-        if node_key in self._network_nodes:
-          node_conversion_map[node_key] = kept_nodes
-          kept_nodes += 1
-    layer_configs = []
-    for layer in self.layers:  # From the earliest layers on.
-      layer_class_name = layer.__class__.__name__
-      layer_config = layer.get_config()
-      filtered_inbound_nodes = []
-      for original_node_index, node in enumerate(layer._inbound_nodes):
-        node_key = tf_network._make_node_key(layer.name,
-                                             original_node_index)
-        if node_key in self._network_nodes:
-          # The node is relevant to the model:
-          # add to filtered_inbound_nodes.
-          if node.arguments:
-            try:
-              json.dumps(node.arguments)
-              kwargs = node.arguments
-            except TypeError:
-              logging.warning(
-                  'Layer ' + layer.name +
-                  ' was passed non-serializable keyword arguments: ' +
-                  str(node.arguments) + '. They will not be included '
-                  'in the serialized model (and thus will be missing '
-                  'at deserialization time).')
-              kwargs = {}
-          else:
-            kwargs = {}
-          if node.inbound_layers:
-            node_data = []
-            for i in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[i]
-              node_index = node.node_indices[i]
-              tensor_index = node.tensor_indices[i]
-              node_key = tf_network._make_node_key(inbound_layer.name,
-                                                   node_index)
-              new_node_index = node_conversion_map.get(node_key, 0)
-              node_data.append(
-                  [inbound_layer.name, new_node_index, tensor_index, kwargs])
-            filtered_inbound_nodes.append(node_data)
-      layer_configs.append({
-          'name': layer.name,
-          'class_name': layer_class_name,
-          'config': layer_config,
-          'inbound_nodes': filtered_inbound_nodes,
-      })
-    config['layers'] = layer_configs
-
-    # Gather info about inputs and outputs.
-    model_inputs = []
-    for i in range(len(self._input_layers)):
-      layer, node_index, tensor_index = self._input_coordinates[i]
-      node_key = tf_network._make_node_key(layer.name,
-                                           node_index)
-      if node_key not in self._network_nodes:
-        continue
-      new_node_index = node_conversion_map[node_key]
-      model_inputs.append([layer.name, new_node_index, tensor_index])
-    config['input_layers'] = model_inputs
-    model_outputs = []
-    for i in range(len(self._output_layers)):
-      layer, node_index, tensor_index = self._output_coordinates[i]
-      node_key = tf_network._make_node_key(layer.name,
-                                           node_index)
-      if node_key not in self._network_nodes:
-        continue
-      new_node_index = node_conversion_map[node_key]
-      model_outputs.append([layer.name, new_node_index, tensor_index])
-    config['output_layers'] = model_outputs
-    return copy.deepcopy(config)
-
-  @classmethod
-  def from_config(cls, config, custom_objects=None):
-    """Instantiates a Model from its config (output of `get_config()`).
-
-    Arguments:
-        config: Model config dictionary.
-        custom_objects: Optional dictionary mapping names
-            (strings) to custom classes or functions to be
-            considered during deserialization.
-
-    Returns:
-        A model instance.
-
-    Raises:
-        ValueError: In case of improperly formatted config dict.
-    """
-    # Layer instances created during
-    # the graph reconstruction process
-    created_layers = {}
-
-    # Dictionary mapping layer instances to
-    # node data that specifies a layer call.
-    # It acts as a queue that maintains any unprocessed
-    # layer call until it becomes possible to process it
-    # (i.e. until the input tensors to the call all exist).
-    unprocessed_nodes = {}
-
-    def add_unprocessed_node(layer, node_data):
-      if layer not in unprocessed_nodes:
-        unprocessed_nodes[layer] = [node_data]
-      else:
-        unprocessed_nodes[layer].append(node_data)
-
-    def process_node(layer, node_data):
-      """Deserialize a node.
-
-      Arguments:
-          layer: layer instance.
-          node_data: node config dict.
-
-      Raises:
-          ValueError: In case of improperly formatted `node_data` dict.
-      """
-      input_tensors = []
-      for input_data in node_data:
-        inbound_layer_name = input_data[0]
-        inbound_node_index = input_data[1]
-        inbound_tensor_index = input_data[2]
-        if len(input_data) == 3:
-          kwargs = {}
-        elif len(input_data) == 4:
-          kwargs = input_data[3]
-        else:
-          raise ValueError('Improperly formatted model config.')
-        if inbound_layer_name not in created_layers:
-          add_unprocessed_node(layer, node_data)
-          return
-        inbound_layer = created_layers[inbound_layer_name]
-        if len(inbound_layer._inbound_nodes) <= inbound_node_index:
-          add_unprocessed_node(layer, node_data)
-          return
-        inbound_node = inbound_layer._inbound_nodes[inbound_node_index]
-        input_tensors.append(inbound_node.output_tensors[inbound_tensor_index])
-      # Call layer on its inputs, thus creating the node
-      # and building the layer if needed.
-      if input_tensors:
-        if len(input_tensors) == 1:
-          layer(input_tensors[0], **kwargs)
-        else:
-          layer(input_tensors, **kwargs)
-
-    def process_layer(layer_data):
-      """Deserialize a layer, then call it on appropriate inputs.
-
-      Arguments:
-          layer_data: layer config dict.
-
-      Raises:
-          ValueError: In case of improperly formatted `layer_data` dict.
-      """
-      layer_name = layer_data['name']
-
-      # Instantiate layer.
-      from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
-
-      layer = deserialize_layer(layer_data, custom_objects=custom_objects)
-      created_layers[layer_name] = layer
-
-      # Gather layer inputs.
-      inbound_nodes_data = layer_data['inbound_nodes']
-      for node_data in inbound_nodes_data:
-        # We don't process nodes (i.e. make layer calls)
-        # on the fly because the inbound node may not yet exist,
-        # in case of layer shared at different topological depths
-        # (e.g. a model such as A(B(A(B(x)))))
-        add_unprocessed_node(layer, node_data)
-
-    # First, we create all layers and enqueue nodes to be processed
-    for layer_data in config['layers']:
-      process_layer(layer_data)
-    # Then we process nodes in order of layer depth.
-    # Nodes that cannot yet be processed (if the inbound node
-    # does not yet exist) are re-enqueued, and the process
-    # is repeated until all nodes are processed.
-    while unprocessed_nodes:
-      for layer_data in config['layers']:
-        layer = created_layers[layer_data['name']]
-        if layer in unprocessed_nodes:
-          for node_data in unprocessed_nodes.pop(layer):
-            process_node(layer, node_data)
-
-    name = config.get('name')
-    input_tensors = []
-    output_tensors = []
-    for layer_data in config['input_layers']:
-      layer_name, node_index, tensor_index = layer_data
-      assert layer_name in created_layers
-      layer = created_layers[layer_name]
-      layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
-      input_tensors.append(layer_output_tensors[tensor_index])
-    for layer_data in config['output_layers']:
-      layer_name, node_index, tensor_index = layer_data
-      assert layer_name in created_layers
-      layer = created_layers[layer_name]
-      layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
-      output_tensors.append(layer_output_tensors[tensor_index])
-    return cls(inputs=input_tensors, outputs=output_tensors, name=name)
-
-  def save(self, filepath, overwrite=True, include_optimizer=True):
-    """Save the model to a single HDF5 file.
-
-    The savefile includes:
-        - The model architecture, allowing to re-instantiate the model.
-        - The model weights.
-        - The state of the optimizer, allowing to resume training
-            exactly where you left off.
-
-    This allows you to save the entirety of the state of a model
-    in a single file.
-
-    Saved models can be reinstantiated via `keras.models.load_model`.
-    The model returned by `load_model`
-    is a compiled model ready to be used (unless the saved model
-    was never compiled in the first place).
-
-    Arguments:
-        filepath: String, path to the file to save the weights to.
-        overwrite: Whether to silently overwrite any existing file at the
-            target location, or provide the user with a manual prompt.
-        include_optimizer: If True, save optimizer's state together.
-
-    Example:
-
-    ```python
-    from keras.models import load_model
-
-    model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
-    del model  # deletes the existing model
-
-    # returns a compiled model
-    # identical to the previous one
-    model = load_model('my_model.h5')
-    ```
-    """
-    if not self._is_graph_network:
-      raise NotImplementedError
-
-    from tensorflow.python.keras._impl.keras.models import save_model  # pylint: disable=g-import-not-at-top
-    save_model(self, filepath, overwrite, include_optimizer)
-
-  def save_weights(self, filepath, overwrite=True):
-    """Dumps all layer weights to a HDF5 file.
-
-    The weight file has:
-        - `layer_names` (attribute), a list of strings
-            (ordered names of model layers).
-        - For every layer, a `group` named `layer.name`
-            - For every such layer group, a group attribute `weight_names`,
-                a list of strings
-                (ordered names of weights tensor of the layer).
-            - For every weight in the layer, a dataset
-                storing the weight value, named after the weight tensor.
-
-    Arguments:
-        filepath: String, path to the file to save the weights to.
-        overwrite: Whether to silently overwrite any existing file at the
-            target location, or provide the user with a manual prompt.
-
-    Raises:
-        ImportError: If h5py is not available.
-    """
-    if h5py is None:
-      raise ImportError('`save_weights` requires h5py.')
-    # If file exists and should not be overwritten:
-    if not overwrite and os.path.isfile(filepath):
-      proceed = ask_to_proceed_with_overwrite(filepath)
-      if not proceed:
-        return
-    with h5py.File(filepath, 'w') as f:
-      save_weights_to_hdf5_group(f, self.layers)
-
-  def load_weights(self, filepath, by_name=False):
-    """Loads all layer weights from a HDF5 save file.
-
-    If `by_name` is False (default) weights are loaded
-    based on the network's topology, meaning the architecture
-    should be the same as when the weights were saved.
-    Note that layers that don't have weights are not taken
-    into account in the topological ordering, so adding or
-    removing layers is fine as long as they don't have weights.
-
-    If `by_name` is True, weights are loaded into layers
-    only if they share the same name. This is useful
-    for fine-tuning or transfer-learning models where
-    some of the layers have changed.
-
-    Arguments:
-        filepath: String, path to the weights file to load.
-        by_name: Boolean, whether to load weights by name
-            or by topological order.
-
-    Raises:
-        ImportError: If h5py is not available.
-    """
-    if h5py is None:
-      raise ImportError('`load_weights` requires h5py.')
-    with h5py.File(filepath, 'r') as f:
-      if 'layer_names' not in f.attrs and 'model_weights' in f:
-        f = f['model_weights']
-      if by_name:
-        load_weights_from_hdf5_group_by_name(f, self.layers)
-      else:
-        load_weights_from_hdf5_group(f, self.layers)
-
-  def _updated_config(self):
-    """Util hared between different serialization methods.
-
-    Returns:
-        Model config with Keras version information added.
-    """
-    from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-    config = self.get_config()
-    model_config = {
-        'class_name': self.__class__.__name__,
-        'config': config,
-        'keras_version': keras_version,
-        'backend': K.backend()
-    }
-    return model_config
-
-  def to_json(self, **kwargs):
-    """Returns a JSON string containing the network configuration.
-
-    To load a network from a JSON save file, use
-    `keras.models.model_from_json(json_string, custom_objects={})`.
-
-    Arguments:
-        **kwargs: Additional keyword arguments
-            to be passed to `json.dumps()`.
-
-    Returns:
-        A JSON string.
-    """
-    if not self._is_graph_network:
-      raise NotImplementedError
-
-    def get_json_type(obj):
-      # If obj is any numpy type
-      if type(obj).__module__ == np.__name__:
-        return obj.item()
-
-      # If obj is a python 'type'
-      if type(obj).__name__ == type.__name__:
-        return obj.__name__
-
-      raise TypeError('Not JSON Serializable:', obj)
-
-    model_config = self._updated_config()
-    return json.dumps(model_config, default=get_json_type, **kwargs)
-
-  def to_yaml(self, **kwargs):
-    """Returns a yaml string containing the network configuration.
-
-    To load a network from a yaml save file, use
-    `keras.models.model_from_yaml(yaml_string, custom_objects={})`.
-
-    `custom_objects` should be a dictionary mapping
-    the names of custom losses / layers / etc to the corresponding
-    functions / classes.
-
-    Arguments:
-        **kwargs: Additional keyword arguments
-            to be passed to `yaml.dump()`.
-
-    Returns:
-        A YAML string.
-
-    Raises:
-        ImportError: if yaml module is not found.
-    """
-    if not self._is_graph_network:
-      raise NotImplementedError
-
-    if yaml is None:
-      raise ImportError('Requires yaml module installed.')
-    return yaml.dump(self._updated_config(), **kwargs)
-
-  def summary(self, line_length=None, positions=None, print_fn=None):
-    """Prints a string summary of the network.
-
-    Arguments:
-        line_length: Total length of printed lines
-            (e.g. set this to adapt the display to different
-            terminal window sizes).
-        positions: Relative or absolute positions of log elements
-            in each line. If not provided,
-            defaults to `[.33, .55, .67, 1.]`.
-        print_fn: Print function to use. Defaults to `print`.
-            It will be called on each line of the summary.
-            You can set it to a custom function
-            in order to capture the string summary.
-    """
-    print_layer_summary(self,
-                        line_length=line_length,
-                        positions=positions,
-                        print_fn=print_fn)
-
-
-def get_source_inputs(tensor, layer=None, node_index=None):
-  """Returns the list of input tensors necessary to compute `tensor`.
-
-  Output will always be a list of tensors
-  (potentially with 1 element).
-
-  Arguments:
-      tensor: The tensor to start from.
-      layer: Origin layer of the tensor. Will be
-          determined via tensor._keras_history if not provided.
-      node_index: Origin node index of the tensor.
-
-  Returns:
-      List of input tensors.
-  """
-  if not hasattr(tensor, '_keras_history'):
-    return tensor
-
-  if layer is None or node_index:
-    layer, node_index, _ = tensor._keras_history
-  if not layer._inbound_nodes:
-    return [tensor]
-  else:
-    node = layer._inbound_nodes[node_index]
-    if not node.inbound_layers:
-      # Reached an Input layer, stop recursion.
-      return node.input_tensors
-    else:
-      source_tensors = []
-      for i in range(len(node.inbound_layers)):
-        x = node.input_tensors[i]
-        layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        previous_sources = get_source_inputs(x, layer, node_index)
-        # Avoid input redundancy.
-        for x in previous_sources:
-          if x not in source_tensors:
-            source_tensors.append(x)
-      return source_tensors
-
-
-def _to_list(x):
-  """Normalizes a list/tensor into a list.
-
-  If a tensor is passed, we return
-  a list of size 1 containing the tensor.
-
-  Arguments:
-      x: target object to be normalized.
-
-  Returns:
-      A list.
-  """
-  if isinstance(x, list):
-    return x
-  return [x]
-
-
-def save_weights_to_hdf5_group(f, layers):
-  from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]
-  f.attrs['backend'] = K.backend().encode('utf8')
-  f.attrs['keras_version'] = str(keras_version).encode('utf8')
-
-  for layer in layers:
-    g = f.create_group(layer.name)
-    symbolic_weights = layer.weights
-    weight_values = K.batch_get_value(symbolic_weights)
-    weight_names = []
-    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
-      if hasattr(w, 'name') and w.name:
-        name = str(w.name)
-      else:
-        name = 'param_' + str(i)
-      weight_names.append(name.encode('utf8'))
-    g.attrs['weight_names'] = weight_names
-    for name, val in zip(weight_names, weight_values):
-      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
-      if not val.shape:
-        # scalar
-        param_dset[()] = val
-      else:
-        param_dset[:] = val
-
-
-def preprocess_weights_for_loading(layer,
-                                   weights,
-                                   original_keras_version=None,
-                                   original_backend=None):
-  """Converts layers weights from Keras 1 format to Keras 2.
-
-  Arguments:
-      layer: Layer instance.
-      weights: List of weights values (Numpy arrays).
-      original_keras_version: Keras version for the weights, as a string.
-      original_backend: Keras backend the weights were trained with,
-          as a string.
-
-  Returns:
-      A list of weights values (Numpy arrays).
-  """
-  if layer.__class__.__name__ == 'Bidirectional':
-    num_weights_per_layer = len(weights) // 2
-    forward_weights = preprocess_weights_for_loading(
-        layer.forward_layer, weights[:num_weights_per_layer],
-        original_keras_version, original_backend)
-    backward_weights = preprocess_weights_for_loading(
-        layer.backward_layer, weights[num_weights_per_layer:],
-        original_keras_version, original_backend)
-    weights = forward_weights + backward_weights
-
-  if original_keras_version == '1':
-    if layer.__class__.__name__ == 'TimeDistributed':
-      weights = preprocess_weights_for_loading(
-          layer.layer, weights, original_keras_version, original_backend)
-
-    if layer.__class__.__name__ == 'Conv1D':
-      shape = weights[0].shape
-      # Handle Keras 1.1 format
-      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
-        # Legacy shape:
-        # (filters, input_dim, filter_length, 1)
-        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
-                                                           1)
-        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
-      weights[0] = weights[0][:, 0, :, :]
-
-    if layer.__class__.__name__ == 'Conv2D':
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, kernel_rows, kernel_cols)
-        # new: (kernel_rows, kernel_cols, stack_size, filters)
-        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
-
-    if layer.__class__.__name__ == 'Conv2DTranspose':
-      if layer.data_format == 'channels_last':
-        # old: (kernel_rows, kernel_cols, stack_size, filters)
-        # new: (kernel_rows, kernel_cols, filters, stack_size)
-        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, kernel_rows, kernel_cols)
-        # new: (kernel_rows, kernel_cols, filters, stack_size)
-        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
-
-    if layer.__class__.__name__ == 'Conv3D':
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, ...)
-        # new: (..., stack_size, filters)
-        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
-
-    if layer.__class__.__name__ == 'GRU':
-      if len(weights) == 9:
-        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[4], weights[7]], axis=-1)
-        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ == 'LSTM':
-      if len(weights) == 12:
-        # old: i, c, f, o
-        # new: i, f, c, o
-        kernel = np.concatenate(
-            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
-        bias = np.concatenate(
-            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ == 'ConvLSTM2D':
-      if len(weights) == 12:
-        kernel = np.concatenate(
-            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
-        bias = np.concatenate(
-            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
-        if layer.data_format == 'channels_first':
-          # old: (filters, stack_size, kernel_rows, kernel_cols)
-          # new: (kernel_rows, kernel_cols, stack_size, filters)
-          kernel = np.transpose(kernel, (2, 3, 1, 0))
-          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ in ['Model', 'Sequential']:
-      new_weights = []
-      # trainable weights
-      for sublayer in layer.layers:
-        num_weights = len(sublayer.trainable_weights)
-        if num_weights > 0:
-          new_weights.extend(
-              preprocess_weights_for_loading(
-                  layer=sublayer,
-                  weights=weights[:num_weights],
-                  original_keras_version=original_keras_version,
-                  original_backend=original_backend))
-          weights = weights[num_weights:]
-
-      # non-trainable weights
-      for sublayer in layer.layers:
-        num_weights = len([
-            l for l in sublayer.weights if l not in sublayer.trainable_weights
-        ])
-        if num_weights > 0:
-          new_weights.extend(
-              preprocess_weights_for_loading(
-                  layer=sublayer,
-                  weights=weights[:num_weights],
-                  original_keras_version=original_keras_version,
-                  original_backend=original_backend))
-          weights = weights[num_weights:]
-      weights = new_weights
-
-  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
-  if layer.__class__.__name__ in conv_layers:
-    if original_backend == 'theano':
-      weights[0] = conv_utils.convert_kernel(weights[0])
-      if layer.__class__.__name__ == 'ConvLSTM2D':
-        weights[1] = conv_utils.convert_kernel(weights[1])
-    if K.int_shape(layer.weights[0]) != weights[0].shape:
-      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
-      if layer.__class__.__name__ == 'ConvLSTM2D':
-        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
-
-  # Convert the weights of CuDNNLSTM so that they could be loaded into LSTM
-  if layer.__class__.__name__ == 'LSTM' and len(weights) == 3:
-    # Determine if loading a CuDNNLSTM layer from the number of bias weights:
-    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
-    # if there's no bias weight in the file, skip this conversion
-    units = weights[1].shape[0]
-    bias = weights[2]
-    if len(bias) == units * 8:
-      # reshape the kernels
-      kernels = np.split(weights[0], 4, axis=1)
-      kernels = [
-          kernel.reshape(-1).reshape(kernel.shape, order='F')
-          for kernel in kernels
-      ]
-      weights[0] = np.concatenate(kernels, axis=1)
-
-      # transpose the recurrent kernels
-      recurrent_kernels = np.split(weights[1], 4, axis=1)
-      recurrent_kernels = [kernel.T for kernel in recurrent_kernels]
-      weights[1] = np.concatenate(recurrent_kernels, axis=1)
-
-      # split the bias into half and merge
-      weights[2] = bias[:units * 4] + bias[units * 4:]
-
-  return weights
-
-
-def load_weights_from_hdf5_group(f, layers):
-  """Implements topological (order-based) weight loading.
-
-  Arguments:
-      f: A pointer to a HDF5 group.
-      layers: a list of target layers.
-
-  Raises:
-      ValueError: in case of mismatch between provided layers
-          and weights file.
-  """
-  if 'keras_version' in f.attrs:
-    original_keras_version = f.attrs['keras_version'].decode('utf8')
-  else:
-    original_keras_version = '1'
-  if 'backend' in f.attrs:
-    original_backend = f.attrs['backend'].decode('utf8')
-  else:
-    original_backend = None
-
-  filtered_layers = []
-  for layer in layers:
-    weights = layer.weights
-    if weights:
-      filtered_layers.append(layer)
-
-  layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
-  filtered_layer_names = []
-  for name in layer_names:
-    g = f[name]
-    weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
-    if weight_names:
-      filtered_layer_names.append(name)
-  layer_names = filtered_layer_names
-  if len(layer_names) != len(filtered_layers):
-    raise ValueError('You are trying to load a weight file '
-                     'containing ' + str(len(layer_names)) +
-                     ' layers into a model with ' + str(len(filtered_layers)) +
-                     ' layers.')
-
-  # We batch weight value assignments in a single backend call
-  # which provides a speedup in TensorFlow.
-  weight_value_tuples = []
-  for k, name in enumerate(layer_names):
-    g = f[name]
-    weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
-    weight_values = [g[weight_name] for weight_name in weight_names]
-    layer = filtered_layers[k]
-    symbolic_weights = layer.weights
-    weight_values = preprocess_weights_for_loading(
-        layer, weight_values, original_keras_version, original_backend)
-    if len(weight_values) != len(symbolic_weights):
-      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
-                       '" in the current model) was found to '
-                       'correspond to layer ' + name + ' in the save file. '
-                       'However the new layer ' + layer.name + ' expects ' +
-                       str(len(symbolic_weights)) +
-                       ' weights, but the saved weights have ' +
-                       str(len(weight_values)) + ' elements.')
-    weight_value_tuples += zip(symbolic_weights, weight_values)
-  K.batch_set_value(weight_value_tuples)
-
-
-def load_weights_from_hdf5_group_by_name(f, layers):
-  """Implements name-based weight loading.
-
-  (instead of topological weight loading).
-
-  Layers that have no matching name are skipped.
-
-  Arguments:
-      f: A pointer to a HDF5 group.
-      layers: a list of target layers.
-
-  Raises:
-      ValueError: in case of mismatch between provided layers
-          and weights file.
-  """
-  if 'keras_version' in f.attrs:
-    original_keras_version = f.attrs['keras_version'].decode('utf8')
-  else:
-    original_keras_version = '1'
-  if 'backend' in f.attrs:
-    original_backend = f.attrs['backend'].decode('utf8')
-  else:
-    original_backend = None
-
-  # New file format.
-  layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
-
-  # Reverse index of layer name to list of layers with name.
-  index = {}
-  for layer in layers:
-    if layer.name:
-      index.setdefault(layer.name, []).append(layer)
-
-  # We batch weight value assignments in a single backend call
-  # which provides a speedup in TensorFlow.
-  weight_value_tuples = []
-  for k, name in enumerate(layer_names):
-    g = f[name]
-    weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
-    weight_values = [g[weight_name] for weight_name in weight_names]
-
-    for layer in index.get(name, []):
-      symbolic_weights = layer.weights
-      weight_values = preprocess_weights_for_loading(
-          layer, weight_values, original_keras_version, original_backend)
-      if len(weight_values) != len(symbolic_weights):
-        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
-                         '") expects ' + str(len(symbolic_weights)) +
-                         ' weight(s), but the saved weights' + ' have ' +
-                         str(len(weight_values)) + ' element(s).')
-      # Set values.
-      for i in range(len(weight_values)):
-        weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
-  K.batch_set_value(weight_value_tuples)
-
-
-def shape_type_conversion(fn):
-  """Decorator that handles tuple/TensorShape conversion.
-
-  Used in `compute_output_shape` and `build`.
-
-  Arguments:
-    fn: function to wrap.
-
-  Returns:
-    Wrapped function.
-  """
-
-  def wrapper(instance, input_shape):
-    if input_shape is not None:
-      if isinstance(input_shape, list):
-        input_shape = [
-            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
-      else:
-        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
-    output_shape = fn(instance, input_shape)
-    if output_shape is not None:
-      if isinstance(output_shape, list):
-        return [tensor_shape.TensorShape(x) for x in output_shape]
-      return tensor_shape.TensorShape(output_shape)
-
-  return wrapper
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 28ddc094ee585ca4011d0cdaf190cfe826a2f0ce..6993a042890088b21a3ab56bca75f5a0b3b2242a 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -18,13 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import shutil
-
 import numpy as np
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -35,37 +36,267 @@ try:
 except ImportError:
   yaml = None
 
-try:
-  import h5py  # pylint:disable=g-import-not-at-top
-except ImportError:
-  h5py = None
-
 
 class TopologyConstructionTest(test.TestCase):
 
-  def test_get_updates_for(self):
-    a = keras.layers.Input(shape=(1,))
-    dense_layer = keras.layers.Dense(1)
-    dense_layer.build((None, 1))
-    update_1 = state_ops.assign_add(dense_layer.kernel, a)
-    update_2 = state_ops.assign_add(dense_layer.kernel, [[1.]])
-    dense_layer.add_update(update_1, inputs=a)
-    dense_layer.add_update(update_2, inputs=None)
-
-    self.assertListEqual(dense_layer.get_updates_for(a), [update_1])
-    self.assertListEqual(dense_layer.get_updates_for(None), [update_2])
-
-  def test_get_losses_for(self):
-    a = keras.layers.Input(shape=(1,))
-    dense_layer = keras.layers.Dense(1)
-    dense_layer.build((None, 1))
-    loss_1 = math_ops.reduce_sum(a)
-    loss_2 = math_ops.reduce_sum(dense_layer.kernel)
-    dense_layer.add_loss(loss_1, inputs=a)
-    dense_layer.add_loss(loss_2, inputs=None)
-
-    self.assertListEqual(dense_layer.get_losses_for(a), [loss_1])
-    self.assertListEqual(dense_layer.get_losses_for(None), [loss_2])
+  def test_get_updates(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.add_update(state_ops.assign_add(self.a, [[1.]],
+                                             name='unconditional_update'))
+        self.built = True
+
+      def call(self, inputs):
+        self.add_update(state_ops.assign_add(self.b, inputs,
+                                             name='conditional_update'),
+                        inputs=True)
+        return inputs + 1
+
+    x1 = keras.Input(shape=(1,))
+    layer = MyLayer()
+    _ = layer.apply(x1)
+
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 1)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+
+    x2 = keras.Input(shape=(1,))
+    y2 = layer.apply(x2)
+
+    self.assertEqual(len(layer.updates), 3)
+    self.assertEqual(len(layer.get_updates_for(x1)), 1)
+    self.assertEqual(len(layer.get_updates_for(x2)), 1)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+
+    network = keras.engine.Network(x2, y2)
+    self.assertEqual(len(network.updates), 2)
+    self.assertEqual(len(network.get_updates_for(x1)), 0)
+    self.assertEqual(len(network.get_updates_for(x2)), 1)
+    self.assertEqual(len(network.get_updates_for(None)), 1)
+
+    x3 = keras.Input(shape=(1,))
+    _ = layer.apply(x3)
+    self.assertEqual(len(network.updates), 2)
+
+    x4 = keras.Input(shape=(1,))
+    _ = network(x4)
+    self.assertEqual(len(network.updates), 3)
+    self.assertEqual(len(network.get_updates_for(x2)), 1)
+    self.assertEqual(len(network.get_updates_for(x4)), 1)
+    self.assertEqual(len(network.get_updates_for(None)), 1)
+
+    network.add_update(state_ops.assign_add(layer.a, [[1]]))
+    self.assertEqual(len(network.updates), 4)
+    self.assertEqual(len(network.get_updates_for(None)), 2)
+
+    network.add_update(state_ops.assign_add(layer.b, x4), inputs=True)
+    self.assertEqual(len(network.updates), 5)
+    self.assertEqual(len(network.get_updates_for(x4)), 2)
+
+  def test_get_updates_bn(self):
+    x1 = keras.Input(shape=(1,))
+    layer = keras.layers.BatchNormalization()
+    _ = layer.apply(x1)
+
+    print('BN updates', layer._updates)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 2)
+    self.assertEqual(len(layer.get_updates_for(None)), 0)
+
+  def test_get_losses(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.add_loss(math_ops.reduce_sum(self.a))
+        self.built = True
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs),
+                      inputs=True)
+        return inputs + 1
+
+    x1 = keras.Input(shape=(1,))
+    layer = MyLayer()
+    _ = layer.apply(x1)
+
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(len(layer.get_losses_for(x1)), 1)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+
+    x2 = keras.Input(shape=(1,))
+    y2 = layer.apply(x2)
+
+    self.assertEqual(len(layer.losses), 3)
+    self.assertEqual(len(layer.get_losses_for(x1)), 1)
+    self.assertEqual(len(layer.get_losses_for(x2)), 1)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+
+    network = keras.engine.Network(x2, y2)
+    self.assertEqual(len(network.losses), 2)
+    self.assertEqual(len(network.get_losses_for(x1)), 0)
+    self.assertEqual(len(network.get_losses_for(x2)), 1)
+    self.assertEqual(len(network.get_losses_for(None)), 1)
+
+    x3 = keras.Input(shape=(1,))
+    _ = layer.apply(x3)
+    self.assertEqual(len(network.losses), 2)
+
+    x4 = keras.Input(shape=(1,))
+    _ = network(x4)
+    self.assertEqual(len(network.losses), 3)
+    self.assertEqual(len(network.get_losses_for(x2)), 1)
+    self.assertEqual(len(network.get_losses_for(x4)), 1)
+    self.assertEqual(len(network.get_losses_for(None)), 1)
+
+    network.add_loss(math_ops.reduce_sum(layer.a))
+    self.assertEqual(len(network.losses), 4)
+    self.assertEqual(len(network.get_losses_for(None)), 2)
+
+    network.add_loss(math_ops.reduce_sum(x4), inputs=True)
+    self.assertEqual(len(network.losses), 5)
+    self.assertEqual(len(network.get_losses_for(x4)), 2)
+
+  def testTopologicalAttributes(self):
+    # test layer attributes / methods related to cross-layer connectivity.
+    a = keras.Input(shape=(32,), name='input_a')
+    b = keras.Input(shape=(32,), name='input_b')
+
+    # test input, output, input_shape, output_shape
+    test_layer = keras.layers.Dense(16, name='test_layer')
+    a_test = test_layer(a)
+    self.assertEqual(test_layer.input, a)
+    self.assertEqual(test_layer.output, a_test)
+    self.assertEqual(test_layer.input_shape, (None, 32))
+    self.assertEqual(test_layer.output_shape, (None, 16))
+
+    # test `get_*_at` methods
+    dense = keras.layers.Dense(16, name='dense_1')
+    a_2 = dense(a)
+    b_2 = dense(b)
+
+    self.assertEqual(dense.get_input_at(0), a)
+    self.assertEqual(dense.get_input_at(1), b)
+    self.assertEqual(dense.get_output_at(0), a_2)
+    self.assertEqual(dense.get_output_at(1), b_2)
+    self.assertEqual(dense.get_input_shape_at(0), (None, 32))
+    self.assertEqual(dense.get_input_shape_at(1), (None, 32))
+    self.assertEqual(dense.get_output_shape_at(0), (None, 16))
+    self.assertEqual(dense.get_output_shape_at(1), (None, 16))
+
+    # Test invalid value for attribute retrieval.
+    with self.assertRaises(ValueError):
+      dense.get_input_at(2)
+    with self.assertRaises(AttributeError):
+      new_dense = keras.layers.Dense(16)
+      _ = new_dense.input
+    with self.assertRaises(AttributeError):
+      new_dense = keras.layers.Dense(16)
+      _ = new_dense.output
+    with self.assertRaises(AttributeError):
+      new_dense = keras.layers.Dense(16)
+      _ = new_dense.output_shape
+    with self.assertRaises(AttributeError):
+      new_dense = keras.layers.Dense(16)
+      _ = new_dense.input_shape
+    with self.assertRaises(AttributeError):
+      new_dense = keras.layers.Dense(16)
+      a = keras.Input(shape=(3, 32))
+      a = keras.Input(shape=(5, 32))
+      a_2 = dense(a)
+      b_2 = dense(b)
+      _ = new_dense.input_shape
+    with self.assertRaises(AttributeError):
+      new_dense = keras.layers.Dense(16)
+      a = keras.Input(shape=(3, 32))
+      a = keras.Input(shape=(5, 32))
+      a_2 = dense(a)
+      b_2 = dense(b)
+      _ = new_dense.output_shape
+
+  def testTopologicalAttributesMultiOutputLayer(self):
+
+    class PowersLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return [inputs**2, inputs**3]
+
+    x = keras.Input(shape=(32,))
+    test_layer = PowersLayer()
+    p1, p2 = test_layer(x)  # pylint: disable=not-callable
+
+    self.assertEqual(test_layer.input, x)
+    self.assertEqual(test_layer.output, [p1, p2])
+    self.assertEqual(test_layer.input_shape, (None, 32))
+    self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)])
+
+  def testTopologicalAttributesMultiInputLayer(self):
+
+    class AddLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        assert len(inputs) == 2
+        return inputs[0] + inputs[1]
+
+    a = keras.Input(shape=(32,))
+    b = keras.Input(shape=(32,))
+    test_layer = AddLayer()
+    y = test_layer([a, b])  # pylint: disable=not-callable
+
+    self.assertEqual(test_layer.input, [a, b])
+    self.assertEqual(test_layer.output, y)
+    self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)])
+    self.assertEqual(test_layer.output_shape, (None, 32))
+
+  def testBasicNetwork(self):
+    # minimum viable network
+    x = keras.Input(shape=(32,))
+    dense = keras.layers.Dense(2)
+    y = dense(x)
+    network = keras.engine.Network(x, y, name='dense_network')
+
+    # test basic attributes
+    self.assertEqual(network.name, 'dense_network')
+    self.assertEqual(len(network.layers), 2)  # InputLayer + Dense
+    self.assertEqual(network.layers[1], dense)
+    self.assertEqual(network.weights, dense.weights)
+    self.assertEqual(network.trainable_weights, dense.trainable_weights)
+    self.assertEqual(network.non_trainable_weights, dense.non_trainable_weights)
+
+    # test callability on Input
+    x_2 = keras.Input(shape=(32,))
+    y_2 = network(x_2)
+    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
+
+    # test callability on regular tensor
+    x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
+    y_2 = network(x_2)
+    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
+
+    # test network `trainable` attribute
+    network.trainable = False
+    self.assertEqual(network.weights, dense.weights)
+    self.assertEqual(network.trainable_weights, [])
+    self.assertEqual(network.non_trainable_weights,
+                     dense.trainable_weights + dense.non_trainable_weights)
 
   def test_trainable_weights(self):
     a = keras.layers.Input(shape=(2,))
@@ -108,41 +339,6 @@ class TopologyConstructionTest(test.TestCase):
     self.assertListEqual(model.trainable_weights, [])
     self.assertListEqual(model.non_trainable_weights, weights)
 
-  def test_weight_loading(self):
-    with self.test_session():
-      a = keras.layers.Input(shape=(2,))
-      x = keras.layers.Dense(3)(a)
-      b = keras.layers.Dense(1)(x)
-      model = keras.models.Model(a, b)
-
-      x = np.random.random((3, 2))
-      ref_y = model.predict(x)
-      weights = model.get_weights()
-      model.set_weights(weights)
-      y = model.predict(x)
-      self.assertAllClose(ref_y, y)
-
-      with self.assertRaises(ValueError):
-        model.set_weights(weights[1:])
-      with self.assertRaises(ValueError):
-        model.set_weights(weights[::-1])
-
-      if h5py is None:
-        return  # Skip rest of test if H5py isn't available.
-
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir)
-
-      h5_path = os.path.join(temp_dir, 'test.h5')
-      model.save_weights(h5_path)
-      model.load_weights(h5_path)
-      y = model.predict(x)
-      self.assertAllClose(ref_y, y)
-
-      model.load_weights(h5_path, by_name=True)
-      y = model.predict(x)
-      self.assertAllClose(ref_y, y)
-
   def test_learning_phase(self):
     with self.test_session():
       a = keras.layers.Input(shape=(32,), name='input_a')
@@ -310,7 +506,7 @@ class TopologyConstructionTest(test.TestCase):
       self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)])
 
       # test get_source_inputs
-      self.assertListEqual(keras.engine.topology.get_source_inputs(c), [a, b])
+      self.assertListEqual(keras.engine.network.get_source_inputs(c), [a, b])
 
       # serialization / deserialization
       json_config = model.to_json()
@@ -347,8 +543,10 @@ class TopologyConstructionTest(test.TestCase):
 
       e = keras.layers.Input(shape=(32,), name='input_e')
       f = keras.layers.Input(shape=(32,), name='input_f')
+      self.assertEqual(len(model.inputs), 2)
       g, h = model([e, f])
-      self.assertEqual(g.name, 'model_1/dense_2/BiasAdd:0')
+      self.assertEqual(len(model.inputs), 2)
+      self.assertEqual(g.name, 'model/dense_2/BiasAdd:0')
 
       self.assertListEqual(g.get_shape().as_list(), c.get_shape().as_list())
       self.assertListEqual(h.get_shape().as_list(), d.get_shape().as_list())
@@ -529,7 +727,9 @@ class TopologyConstructionTest(test.TestCase):
 
     j = keras.layers.Input(shape=(32,), name='input_j')
     k = keras.layers.Input(shape=(32,), name='input_k')
+    self.assertEqual(len(model.inputs), 2)
     m, n = model([j, k])
+    self.assertEqual(len(model.inputs), 2)
     tf_model = keras.models.Model([j, k], [m, n])
 
     j_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
@@ -555,10 +755,47 @@ class TopologyConstructionTest(test.TestCase):
     model = keras.models.Model(a, b)
     self.assertEqual(model.output_mask.get_shape().as_list(), [None, 10])
 
+  def testMaskingSingleInput(self):
+
+    class MaskedLayer(keras.layers.Layer):
+
+      def call(self, inputs, mask=None):
+        if mask is not None:
+          return inputs * mask
+        return inputs
+
+      def compute_mask(self, inputs, mask=None):
+        return array_ops.ones_like(inputs)
+
+    if context.executing_eagerly():
+      a = constant_op.constant([2] * 32)
+      mask = constant_op.constant([0, 1] * 16)
+      a._keras_mask = mask
+      b = MaskedLayer().apply(a)
+      self.assertTrue(hasattr(b, '_keras_mask'))
+      self.assertAllEqual(
+          self.evaluate(array_ops.ones_like(mask)),
+          self.evaluate(getattr(b, '_keras_mask')))
+      self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b))
+    else:
+      x = keras.Input(shape=(32,))
+      y = MaskedLayer()(x)  # pylint: disable=not-callable
+      network = keras.engine.Network(x, y)
+
+      # test callability on Input
+      x_2 = keras.Input(shape=(32,))
+      y_2 = network(x_2)
+      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
+
+      # test callability on regular tensor
+      x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
+      y_2 = network(x_2)
+      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
+
   def test_activity_regularization_with_model_composition(self):
 
     def reg(x):
-      return keras.backend.sum(x)
+      return math_ops.reduce_sum(x)
 
     net_a_input = keras.Input((2,))
     net_a = net_a_input
@@ -576,97 +813,6 @@ class TopologyConstructionTest(test.TestCase):
     loss = model_b.evaluate(x)
     self.assertEqual(loss, 4.)
 
-  def test_weight_preprocessing(self):
-    input_dim = 3
-    output_dim = 3
-    size = 2
-    cases = [
-        [
-            (keras.layers.Bidirectional(keras.layers.SimpleRNN(2))),
-            [np.random.random((2, 1)), np.random.random((2, 1))],
-            (None, 3, 2),
-        ],
-        [
-            (keras.layers.TimeDistributed(keras.layers.Dense(1))),
-            [np.random.random((2, 1)), np.random.random((1,))],
-            (None, 3, 2),
-        ],
-        [
-            (keras.layers.Conv1D(output_dim, size, use_bias=False)),
-            [np.random.random((output_dim, input_dim, size, 1))],
-            (None, 4, input_dim),
-        ],
-        [
-            (keras.layers.Conv2D(output_dim, size,
-                                 use_bias=False, data_format='channels_first')),
-            [np.random.random((output_dim, input_dim, size, size))],
-            (None, input_dim, 4, 4),
-        ],
-        [
-            (keras.layers.Conv2DTranspose(output_dim, size,
-                                          use_bias=False,
-                                          data_format='channels_first')),
-            [np.random.random((output_dim, input_dim, size, size))],
-            (None, input_dim, 4, 4),
-        ],
-        [
-            (keras.layers.Conv2DTranspose(output_dim, size,
-                                          use_bias=False,
-                                          data_format='channels_last')),
-            [np.random.random((size, size, input_dim, output_dim))],
-            (None, 4, 4, input_dim),
-        ],
-        [
-            (keras.layers.Conv3D(output_dim, size,
-                                 use_bias=False, data_format='channels_first')),
-            [np.random.random((output_dim, input_dim, size, size, size))],
-            (None, input_dim, 4, 4, 4),
-        ],
-        [
-            (keras.layers.GRU(output_dim)),
-            [np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,)),
-             np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,)),
-             np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,))],
-            (None, 4, input_dim),
-        ],
-        [
-            (keras.layers.LSTM(output_dim)),
-            [np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,)),
-             np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,)),
-             np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,)),
-             np.random.random((input_dim, output_dim)),
-             np.random.random((output_dim, output_dim)),
-             np.random.random((output_dim,))],
-            (None, 4, input_dim),
-        ],
-    ]
-    for layer, weights, input_shape in cases:
-      layer.build(input_shape)
-      _ = keras.engine.topology.preprocess_weights_for_loading(
-          layer, weights, original_keras_version='1')
-
-    model = keras.models.Sequential([keras.layers.Dense(2, input_dim=2)])
-    _ = keras.engine.topology.preprocess_weights_for_loading(
-        model, model.weights, original_keras_version='1')
-
-    x = keras.Input((2,))
-    y = keras.layers.Dense(2)(x)
-    model = keras.models.Model(x, y)
-    _ = keras.engine.topology.preprocess_weights_for_loading(
-        model, model.weights, original_keras_version='1')
-
   def test_layer_sharing_at_heterogenous_depth(self):
     with self.test_session():
       x_val = np.random.random((10, 5))
@@ -715,5 +861,121 @@ class TopologyConstructionTest(test.TestCase):
       output_val_2 = m2.predict(x_val)
       self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
+  def test_explicit_training_argument(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(2,))
+      b = keras.layers.Dropout(0.5)(a)
+      base_model = keras.models.Model(a, b)
+
+      a = keras.layers.Input(shape=(2,))
+      b = base_model(a, training=False)
+      model = keras.models.Model(a, b)
+
+      x = np.ones((100, 2))
+      y = np.ones((100, 2))
+      model.compile(optimizer='sgd', loss='mse')
+      loss = model.train_on_batch(x, y)
+      self.assertEqual(loss, 0)  # In inference mode, output is equal to input.
+
+      a = keras.layers.Input(shape=(2,))
+      b = base_model(a, training=True)
+      model = keras.models.Model(a, b)
+      preds = model.predict(x)
+      self.assertEqual(np.min(preds), 0.)  # At least one unit was dropped.
+
+
+class DeferredModeTest(test.TestCase):
+
+  def testDeferredTensorAttributes(self):
+    x = base_layer.DeferredTensor(shape=(None, 2),
+                                  dtype='float32',
+                                  name='x')
+    self.assertEqual(str(x),
+                     'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
+    self.assertEqual(repr(x),
+                     '<DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSimpleNetworkBuilding(self):
+    inputs = keras.engine.Input(shape=(32,))
+    if context.executing_eagerly():
+      self.assertIsInstance(inputs, base_layer.DeferredTensor)
+      self.assertEqual(inputs.dtype.name, 'float32')
+      self.assertEqual(inputs.shape.as_list(), [None, 32])
+
+    x = keras.layers.Dense(2)(inputs)
+    if context.executing_eagerly():
+      self.assertIsInstance(x, base_layer.DeferredTensor)
+      self.assertEqual(x.dtype.name, 'float32')
+      self.assertEqual(x.shape.as_list(), [None, 2])
+
+    outputs = keras.layers.Dense(4)(x)
+    network = keras.engine.Network(inputs, outputs)
+    self.assertIsInstance(network, keras.engine.Network)
+
+    if context.executing_eagerly():
+      # It should be possible to call such a network on EagerTensors.
+      inputs = constant_op.constant(
+          np.random.random((10, 32)).astype('float32'))
+      outputs = network(inputs)
+      self.assertEqual(outputs.shape.as_list(), [10, 4])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMultiIONetworkbuilding(self):
+    input_a = keras.engine.Input(shape=(32,))
+    input_b = keras.engine.Input(shape=(16,))
+    a = keras.layers.Dense(16)(input_a)
+
+    class AddLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return inputs[0] + inputs[1]
+
+      def compute_output_shape(self, input_shape):
+        return input_shape[0]
+
+    c = AddLayer()([a, input_b])  # pylint: disable=not-callable
+    c = keras.layers.Dense(2)(c)
+
+    network = keras.engine.Network([input_a, input_b], [a, c])
+    if context.executing_eagerly():
+      a_val = constant_op.constant(
+          np.random.random((10, 32)).astype('float32'))
+      b_val = constant_op.constant(
+          np.random.random((10, 16)).astype('float32'))
+      outputs = network([a_val, b_val])
+      self.assertEqual(len(outputs), 2)
+      self.assertEqual(outputs[0].shape.as_list(), [10, 16])
+      self.assertEqual(outputs[1].shape.as_list(), [10, 2])
+
+
+class GraphUtilsTest(test.TestCase):
+
+  def testGetReachableFromInputs(self):
+
+    with self.test_session():
+      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+      x_1 = pl_1 + pl_2
+      x_2 = pl_2 * 2
+      x_3 = pl_3 + 1
+      x_4 = x_1 + x_2
+      x_5 = x_3 * pl_1
+
+      self.assertEqual(
+          keras.utils.tf_utils.get_reachable_from_inputs([pl_1]),
+          {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.utils.tf_utils.get_reachable_from_inputs([pl_1, pl_2]),
+          {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.utils.tf_utils.get_reachable_from_inputs([pl_3]),
+          {pl_3, x_3, x_5, x_3.op, x_5.op})
+      self.assertEqual(
+          keras.utils.tf_utils.get_reachable_from_inputs([x_3]),
+          {x_3, x_5, x_5.op})
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index d8ea2fe3db500d3b52d80e46b0cff22a3d1c5915..5f9b3e8c7d7a93a1fb0b7bc83d6378417391e393 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -18,500 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras import optimizers
+from tensorflow.python.keras._impl.keras.engine import training_arrays
 from tensorflow.python.keras._impl.keras.engine import training_eager
-from tensorflow.python.keras._impl.keras.engine.topology import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import Network
-from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
-from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras._impl.keras.engine import training_generator
+from tensorflow.python.keras._impl.keras.engine import training_utils
+from tensorflow.python.keras._impl.keras.engine.base_layer import DeferredTensor
+from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
+from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.layers.base import _DeferredTensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.util.tf_export import tf_export
 
-try:
-  from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
-except ImportError:
-  issparse = None
-
-
-def _standardize_input_data(data,
-                            names,
-                            shapes=None,
-                            check_batch_axis=True,
-                            exception_prefix=''):
-  """Normalizes inputs and targets provided by users.
-
-  Users may pass data as a list of arrays, dictionary of arrays,
-  or as a single array. We normalize this to an ordered list of
-  arrays (same order as `names`), while checking that the provided
-  arrays have shapes that match the network's expectations.
-
-  Arguments:
-      data: User-provided input data (polymorphic).
-      names: List of expected array names.
-      shapes: Optional list of expected array shapes.
-      check_batch_axis: Boolean; whether to check that
-          the batch axis of the arrays matches the expected
-          value found in `shapes`.
-      exception_prefix: String prefix used for exception formatting.
-
-  Returns:
-      List of standardized input arrays (one array per model input).
-
-  Raises:
-      ValueError: in case of improperly formatted user-provided data.
-  """
-  if not names:
-    if data is not None and hasattr(data, '__len__') and len(data):
-      raise ValueError('Error when checking model ' + exception_prefix + ': '
-                       'expected no data, but got:', data)
-    return []
-  if data is None:
-    return [None for _ in range(len(names))]
-
-  if isinstance(data, dict):
-    try:
-      data = [
-          data[x].values
-          if data[x].__class__.__name__ == 'DataFrame' else data[x]
-          for x in names
-      ]
-    except KeyError as e:
-      raise ValueError('No data provided for "' + e.args[0] + '". Need data '
-                       'for each key in: ' + str(names))
-  elif isinstance(data, list):
-    if isinstance(data[0], list):
-      data = [np.asarray(d) for d in data]
-    elif len(names) == 1 and isinstance(data[0], (float, int)):
-      data = [np.asarray(data)]
-    else:
-      data = [
-          x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
-      ]
-  else:
-    data = data.values if data.__class__.__name__ == 'DataFrame' else data
-    data = [data]
-  data = [
-      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
-  ]
-
-  if len(data) != len(names):
-    if data and hasattr(data[0], 'shape'):
-      raise ValueError('Error when checking model ' + exception_prefix +
-                       ': the list of Numpy arrays that you are passing to '
-                       'your model is not the size the model expected. '
-                       'Expected to see ' + str(len(names)) + ' array(s), '
-                       'but instead got the following list of ' +
-                       str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
-    elif len(names) > 1:
-      raise ValueError(
-          'Error when checking model ' + exception_prefix +
-          ': you are passing a list as input to your model, '
-          'but the model expects a list of ' + str(len(names)) +
-          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
-    elif len(data) == 1 and not hasattr(data[0], 'shape'):
-      raise TypeError('Error when checking model ' + exception_prefix +
-                      ': data should be a Numpy array, or list/dict of '
-                      'Numpy arrays. Found: ' + str(data)[:200] + '...')
-    elif len(names) == 1:
-      data = [np.asarray(data)]
-
-  # Check shapes compatibility.
-  if shapes:
-    for i in range(len(names)):
-      if shapes[i] is not None:
-        data_shape = data[i].shape
-        shape = shapes[i]
-        if data[i].ndim != len(shape):
-          raise ValueError('Error when checking ' + exception_prefix +
-                           ': expected ' + names[i] + ' to have ' +
-                           str(len(shape)) + ' dimensions, but got array '
-                           'with shape ' + str(data_shape))
-        if not check_batch_axis:
-          data_shape = data_shape[1:]
-          shape = shape[1:]
-        for dim, ref_dim in zip(data_shape, shape):
-          if ref_dim != dim and ref_dim:
-            raise ValueError(
-                'Error when checking ' + exception_prefix + ': expected ' +
-                names[i] + ' to have shape ' + str(shape) +
-                ' but got array with shape ' + str(data_shape))
-  return data
-
-
-def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
-  """Maps `sample_weight` or `class_weight` to model outputs.
-
-  Arguments:
-      x_weight: User-provided `sample_weight` or `class_weight` argument.
-      output_names: List of output names (strings) in the model.
-      weight_type: A string used purely for exception printing.
-
-  Returns:
-      A list of `sample_weight` or `class_weight` where there are exactly
-          one element per model output.
-
-  Raises:
-      ValueError: In case of invalid user-provided argument.
-  """
-  if x_weight is None or len(x_weight) == 0:  # pylint: disable=g-explicit-length-test
-    return [None for _ in output_names]
-  if len(output_names) == 1:
-    if isinstance(x_weight, list) and len(x_weight) == 1:
-      return x_weight
-    if isinstance(x_weight, dict) and output_names[0] in x_weight:
-      return [x_weight[output_names[0]]]
-    else:
-      return [x_weight]
-  if isinstance(x_weight, list):
-    if len(x_weight) != len(output_names):
-      raise ValueError('Provided `' + weight_type + '` was a list of ' +
-                       str(len(x_weight)) + ' elements, but the model has ' +
-                       str(len(output_names)) + ' outputs. '
-                       'You should provide one `' + weight_type + '`'
-                       'array per model output.')
-    return x_weight
-  if isinstance(x_weight, dict):
-    x_weights = []
-    for name in output_names:
-      x_weights.append(x_weight.get(name))
-    return x_weights
-  else:
-    raise TypeError(
-        'The model has multiple outputs, so `' + weight_type + '` '
-        'should be either a list or a dict. '
-        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
-
-
-def _standardize_class_weights(class_weight, output_names):
-  return _standardize_sample_or_class_weights(class_weight, output_names,
-                                              'class_weight')
-
-
-def _standardize_sample_weights(sample_weight, output_names):
-  return _standardize_sample_or_class_weights(sample_weight, output_names,
-                                              'sample_weight')
-
-
-def _check_array_lengths(inputs, targets, weights=None):
-  """Does user input validation for numpy arrays.
-
-  Arguments:
-      inputs: list of Numpy arrays of inputs.
-      targets: list of Numpy arrays of targets.
-      weights: list of Numpy arrays of sample weights.
-
-  Raises:
-      ValueError: in case of incorrectly formatted data.
-  """
-
-  def set_of_lengths(x):
-    # return a set with the variation between
-    # different shapes, with None => 0
-    if x is None:
-      return {0}
-    else:
-      return set([0 if y is None else y.shape[0] for y in x])
-
-  set_x = set_of_lengths(inputs)
-  set_y = set_of_lengths(targets)
-  set_w = set_of_lengths(weights)
-  if len(set_x) > 1:
-    raise ValueError('All input arrays (x) should have '
-                     'the same number of samples. Got array shapes: ' +
-                     str([x.shape for x in inputs]))
-  if len(set_y) > 1:
-    raise ValueError('All target arrays (y) should have '
-                     'the same number of samples. Got array shapes: ' +
-                     str([y.shape for y in targets]))
-  if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
-    raise ValueError('Input arrays should have '
-                     'the same number of samples as target arrays. '
-                     'Found ' + str(list(set_x)[0]) + ' input samples '
-                     'and ' + str(list(set_y)[0]) + ' target samples.')
-  if len(set_w) > 1:
-    raise ValueError('All sample_weight arrays should have '
-                     'the same number of samples. Got array shapes: ' +
-                     str([w.shape for w in weights]))
-  if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
-    raise ValueError('Sample_weight arrays should have '
-                     'the same number of samples as target arrays. Got ' +
-                     str(list(set_y)[0]) + ' input samples and ' +
-                     str(list(set_w)[0]) + ' target samples.')
-
-
-def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
-  """Does validation on the compatibility of targets and loss functions.
-
-  This helps prevent users from using loss functions incorrectly.
-
-  Arguments:
-      targets: list of Numpy arrays of targets.
-      loss_fns: list of loss functions.
-      output_shapes: list of shapes of model outputs.
-
-  Raises:
-      ValueError: if a loss function or target array
-          is incompatible with an output.
-  """
-  key_losses = {
-      losses.mean_squared_error, losses.binary_crossentropy,
-      losses.categorical_crossentropy
-  }
-  for y, loss, shape in zip(targets, loss_fns, output_shapes):
-    if y is None or loss is None:
-      continue
-    if loss is losses.categorical_crossentropy:
-      if y.shape[-1] == 1:
-        raise ValueError('You are passing a target array of shape ' + str(
-            y.shape) + ' while using as loss `categorical_crossentropy`. '
-                         '`categorical_crossentropy` expects '
-                         'targets to be binary matrices (1s and 0s) '
-                         'of shape (samples, classes). '
-                         'If your targets are integer classes, '
-                         'you can convert them to the expected format via:\n'
-                         '```\n'
-                         'from keras.utils import to_categorical\n'
-                         'y_binary = to_categorical(y_int)\n'
-                         '```\n'
-                         '\n'
-                         'Alternatively, you can use the loss function '
-                         '`sparse_categorical_crossentropy` instead, '
-                         'which does expect integer targets.')
-    if loss in key_losses:
-      for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
-        if out_dim is not None and target_dim != out_dim:
-          raise ValueError('A target array with shape ' + str(y.shape) +
-                           ' was passed for an output of shape ' + str(shape) +
-                           ' while using as loss `' + loss.__name__ + '`. '
-                           'This loss expects '
-                           'targets to have the same shape '
-                           'as the output.')
-
-
-def _collect_metrics(metrics, output_names):
-  """Maps metric functions to model outputs.
-
-  Arguments:
-      metrics: a list or dict of metric functions.
-      output_names: a list of the names (strings) of model outputs.
-
-  Returns:
-      A list (one entry per model output) of lists of metric functions.
-      For instance, if the model has 2 outputs, and for the first output
-      we want to compute "binary_accuracy" and "binary_crossentropy",
-      and just "binary_accuracy" for the second output,
-      the list would look like:
-          `[[binary_accuracy, binary_crossentropy], [binary_accuracy]]`
-
-  Raises:
-      TypeError: if an incorrect type is passed for the `metrics` argument.
-  """
-  if not metrics:
-    return [[] for _ in output_names]
-  if isinstance(metrics, list):
-    # we then apply all metrics to all outputs.
-    return [copy.copy(metrics) for _ in output_names]
-  elif isinstance(metrics, dict):
-    nested_metrics = []
-    for name in output_names:
-      output_metrics = metrics.get(name, [])
-      if not isinstance(output_metrics, list):
-        output_metrics = [output_metrics]
-      nested_metrics.append(output_metrics)
-    return nested_metrics
-  else:
-    raise TypeError('Type of `metrics` argument not understood. '
-                    'Expected a list or dictionary, found: ' + str(metrics))
-
-
-def _batch_shuffle(index_array, batch_size):
-  """Shuffles an array in a batch-wise fashion.
-
-  Useful for shuffling HDF5 arrays
-  (where one cannot access arbitrary indices).
-
-  Arguments:
-      index_array: array of indices to be shuffled.
-      batch_size: integer.
-
-  Returns:
-      The `index_array` array, shuffled in a batch-wise fashion.
-  """
-  batch_count = int(len(index_array) / batch_size)
-  # to reshape we need to be cleanly divisible by batch size
-  # we stash extra items and reappend them after shuffling
-  last_batch = index_array[batch_count * batch_size:]
-  index_array = index_array[:batch_count * batch_size]
-  index_array = index_array.reshape((batch_count, batch_size))
-  np.random.shuffle(index_array)
-  index_array = index_array.flatten()
-  return np.append(index_array, last_batch)
-
-
-def _weighted_masked_objective(fn):
-  """Adds support for masking and sample-weighting to an objective function.
-
-  It transforms an objective function `fn(y_true, y_pred)`
-  into a sample-weighted, cost-masked objective function
-  `fn(y_true, y_pred, weights, mask)`.
-
-  Arguments:
-      fn: The objective function to wrap,
-          with signature `fn(y_true, y_pred)`.
-
-  Returns:
-      A function with signature `fn(y_true, y_pred, weights, mask)`.
-  """
-  if fn is None:
-    return None
-
-  def weighted(y_true, y_pred, weights, mask=None):
-    """Wrapper function.
-
-    Arguments:
-        y_true: `y_true` argument of `fn`.
-        y_pred: `y_pred` argument of `fn`.
-        weights: Weights tensor.
-        mask: Mask tensor.
-
-    Returns:
-        Scalar tensor.
-    """
-    # score_array has ndim >= 2
-    score_array = fn(y_true, y_pred)
-    if mask is not None:
-      # Cast the mask to floatX to avoid float64 upcasting in theano
-      mask = K.cast(mask, K.floatx())
-      # mask should have the same shape as score_array
-      score_array *= mask
-      #  the loss per batch should be proportional
-      #  to the number of unmasked samples.
-      score_array /= K.mean(mask)
-
-    # apply sample weighting
-    if weights is not None:
-      # reduce score_array to same ndim as weight array
-      ndim = K.ndim(score_array)
-      weight_ndim = K.ndim(weights)
-      score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
-      score_array *= weights
-      score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
-    return K.mean(score_array)
-
-  return weighted
-
-
-def _standardize_weights(y,
-                         sample_weight=None,
-                         class_weight=None,
-                         sample_weight_mode=None):
-  """Performs sample weight validation and standardization.
-
-  Everything gets normalized to a single sample-wise (or timestep-wise)
-  weight array.
-
-  Arguments:
-      y: Numpy array of model targets to be weighted.
-      sample_weight: User-provided `sample_weight` argument.
-      class_weight: User-provided `class_weight` argument.
-      sample_weight_mode: One of `None` or `"temporal"`.
-          `"temporal"` indicated that we expect 2D weight data
-          that will be applied to the last 2 dimensions of
-          the targets (i.e. we are weighting timesteps, not samples).
-
-  Returns:
-      A numpy array of target weights, one entry per sample to weight.
-
-  Raises:
-      ValueError: In case of invalid user-provided arguments.
-  """
-  if sample_weight_mode is not None:
-    if sample_weight_mode != 'temporal':
-      raise ValueError('"sample_weight_mode '
-                       'should be None or "temporal". '
-                       'Found: ' + str(sample_weight_mode))
-    if len(y.shape) < 3:
-      raise ValueError('Found a sample_weight array for '
-                       'an input with shape ' + str(y.shape) + '. '
-                       'Timestep-wise sample weighting (use of '
-                       'sample_weight_mode="temporal") is restricted to '
-                       'outputs that are at least 3D, i.e. that have '
-                       'a time dimension.')
-    if sample_weight is not None and len(sample_weight.shape) != 2:
-      raise ValueError('Found a sample_weight array with shape ' +
-                       str(sample_weight.shape) + '. '
-                       'In order to use timestep-wise sample weighting, '
-                       'you should pass a 2D sample_weight array.')
-  else:
-    if sample_weight is not None and len(sample_weight.shape) != 1:
-      raise ValueError('Found a sample_weight array with shape ' +
-                       str(sample_weight.shape) + '. '
-                       'In order to use timestep-wise sample weights, '
-                       'you should specify '
-                       'sample_weight_mode="temporal" '
-                       'in compile(). If you just mean to use '
-                       'sample-wise weights, make sure your '
-                       'sample_weight array is 1D.')
-
-  if sample_weight is not None:
-    if len(sample_weight.shape) > len(y.shape):
-      raise ValueError(
-          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
-          'Expected sample_weight with rank '
-          'less than or equal to ' + str(len(y.shape)))
-
-    if y.shape[:sample_weight.ndim] != sample_weight.shape:
-      raise ValueError(
-          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
-          ' for an input with shape ' + str(y.shape) + '. '
-          'sample_weight cannot be broadcast.')
-    return sample_weight
-  elif isinstance(class_weight, dict):
-    if len(y.shape) > 2:
-      raise ValueError('`class_weight` not supported for '
-                       '3+ dimensional targets.')
-    if y.shape[1] > 1:
-      y_classes = np.argmax(y, axis=1)
-    elif y.shape[1] == 1:
-      y_classes = np.reshape(y, y.shape[0])
-    else:
-      y_classes = y
-
-    weights = np.asarray(
-        [class_weight[cls] for cls in y_classes if cls in class_weight])
-
-    if len(weights) != len(y_classes):
-      # subtract the sets to pick all missing classes
-      existing_classes = set(y_classes)
-      existing_class_weight = set(class_weight.keys())
-      raise ValueError('`class_weight` must contain all classes in the data.'
-                       ' The classes %s exist in the data but not in '
-                       '`class_weight`.' %
-                       (existing_classes - existing_class_weight))
-    return weights
-  else:
-    if sample_weight_mode is None:
-      return np.ones((y.shape[0],), dtype=K.floatx())
-    else:
-      return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
-
 
 @tf_export('keras.models.Model', 'keras.Model')
 class Model(Network):
@@ -634,7 +164,7 @@ class Model(Network):
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
     loss = loss or {}
-    if context.in_eager_mode() and  not isinstance(
+    if context.executing_eagerly() and not isinstance(
         optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
       raise ValueError('Only TF native optimizers are supported in Eager mode.')
 
@@ -642,13 +172,13 @@ class Model(Network):
     self.loss = loss
     self.metrics = metrics or []
     self.loss_weights = loss_weights
-    if context.in_eager_mode() and sample_weight_mode is not None:
+    if context.executing_eagerly() and sample_weight_mode is not None:
       raise ValueError('sample_weight_mode is not supported in Eager mode.')
     self.sample_weight_mode = sample_weight_mode
-    if context.in_eager_mode() and weighted_metrics is not None:
+    if context.executing_eagerly() and weighted_metrics is not None:
       raise ValueError('weighted_metrics is not supported in Eager mode.')
     self.weighted_metrics = weighted_metrics
-    if context.in_eager_mode() and target_tensors is not None:
+    if context.executing_eagerly() and target_tensors is not None:
       raise ValueError('target_tensors is not supported in Eager mode.')
     self.target_tensors = target_tensors
 
@@ -688,7 +218,8 @@ class Model(Network):
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
 
-    weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions]
+    weighted_losses = [training_utils.weighted_masked_objective(fn)
+                       for fn in loss_functions]
     skip_target_indices = []
     skip_target_weighing_indices = []
     self._feed_outputs = []
@@ -701,7 +232,7 @@ class Model(Network):
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       masks = self.compute_mask(self.inputs, mask=None)
       if masks is None:
         masks = [None for _ in self.outputs]
@@ -735,9 +266,9 @@ class Model(Network):
     self.loss_weights_list = loss_weights_list
 
     # initialization for Eager mode execution
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       if target_tensors is not None:
-        raise ValueError('target_tensors are not currently supported in Eager'
+        raise ValueError('target_tensors are not currently supported in Eager '
                          'mode.')
       self.total_loss = None
       self.metrics_tensors = []
@@ -745,7 +276,10 @@ class Model(Network):
       for i in range(len(self.outputs)):
         if len(self.outputs) > 1:
           self.metrics_names.append(self.output_names[i] + '_loss')
-      self.nested_metrics = _collect_metrics(metrics, self.output_names)
+      self.nested_metrics = training_utils.collect_metrics(metrics,
+                                                           self.output_names)
+      with K.name_scope('metrics'):
+        training_utils.populate_metric_names(self)
       self._feed_sample_weight_modes = []
       for i in range(len(self.outputs)):
         self._feed_sample_weight_modes.append(None)
@@ -862,12 +396,12 @@ class Model(Network):
           sample_weights.append(None)
         else:
           if sample_weight_mode == 'temporal':
-            sample_weights.append(
-                K.placeholder(ndim=2, name=name + '_sample_weights'))
+            sample_weights.append(array_ops.placeholder_with_default(
+                [[1.]], shape=[None, None], name=name + '_sample_weights'))
             sample_weight_modes.append('temporal')
           else:
-            sample_weights.append(
-                K.placeholder(ndim=1, name=name + '_sample_weights'))
+            sample_weights.append(array_ops.placeholder_with_default(
+                [1.], shape=[None], name=name + '_sample_weights'))
             sample_weight_modes.append(None)
     self.sample_weight_modes = sample_weight_modes
     self._feed_sample_weight_modes = []
@@ -915,9 +449,9 @@ class Model(Network):
 
     # List of same size as output_names.
     # contains tuples (metrics for output, names of metrics).
-    nested_metrics = _collect_metrics(metrics, self.output_names)
-    nested_weighted_metrics = _collect_metrics(weighted_metrics,
-                                               self.output_names)
+    nested_metrics = training_utils.collect_metrics(metrics, self.output_names)
+    nested_weighted_metrics = training_utils.collect_metrics(weighted_metrics,
+                                                             self.output_names)
     self.metrics_updates = []
     self.stateful_metric_names = []
     with K.name_scope('metrics'):
@@ -932,7 +466,6 @@ class Model(Network):
         output_weighted_metrics = nested_weighted_metrics[i]
 
         def handle_metrics(metrics, weights=None):
-          metric_name_prefix = 'weighted_' if weights is not None else ''
 
           for metric in metrics:
             if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
@@ -959,37 +492,19 @@ class Model(Network):
                   metric_fn = metrics_module.categorical_accuracy
                 elif metric in ('crossentropy', 'ce'):
                   metric_fn = metrics_module.categorical_crossentropy
-              if metric in ('accuracy', 'acc'):
-                suffix = 'acc'
-              elif metric in ('crossentropy', 'ce'):
-                suffix = 'ce'
-              weighted_metric_fn = _weighted_masked_objective(metric_fn)
-              metric_name = metric_name_prefix + suffix
+              weighted_metric_fn = training_utils.weighted_masked_objective(
+                  metric_fn)
             else:
               metric_fn = metrics_module.get(metric)
-              weighted_metric_fn = _weighted_masked_objective(metric_fn)
-              # Get metric name as string
-              if hasattr(metric_fn, 'name'):
-                metric_name = metric_fn.name
-              else:
-                metric_name = metric_fn.__name__
-              metric_name = metric_name_prefix + metric_name
-
+              weighted_metric_fn = training_utils.weighted_masked_objective(
+                  metric_fn)
+            metric_name = training_utils.get_base_metric_name(
+                metric, weighted=weights is not None)
             with K.name_scope(metric_name):
               metric_result = weighted_metric_fn(
                   y_true, y_pred, weights=weights, mask=masks[i])
 
-            # Append to self.metrics_names, self.metric_tensors,
-            # self.stateful_metric_names
-            if len(self.output_names) > 1:
-              metric_name = '%s_%s' % (self.output_names[i], metric_name)
-            # Dedupe name
-            j = 1
-            base_metric_name = metric_name
-            while metric_name in self.metrics_names:
-              metric_name = '%s_%d' % (base_metric_name, j)
-              j += 1
-            self.metrics_names.append(metric_name)
+            training_utils.add_metric_name(self, metric_name, i)
             self.metrics_tensors.append(metric_result)
 
             # Keep track of state updates created by
@@ -1069,6 +584,7 @@ class Model(Network):
             updates=updates,
             name='train_function',
             **self._function_kwargs)
+    self._post_build_cleanup()
 
   def _make_test_function(self):
     if not hasattr(self, 'test_function'):
@@ -1086,6 +602,7 @@ class Model(Network):
           updates=self.state_updates + self.metrics_updates,
           name='test_function',
           **self._function_kwargs)
+    self._post_build_cleanup()
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
@@ -1104,451 +621,7 @@ class Model(Network):
           updates=self.state_updates,
           name='predict_function',
           **kwargs)
-
-  def _check_num_samples(self,
-                         ins,
-                         batch_size=None,
-                         steps=None,
-                         steps_name='steps'):
-    """Determine the number of samples provided for training and evaluation.
-
-    The number of samples is not defined when running with `steps`,
-    in which case the number of samples is set to `None`.
-
-    Arguments:
-        ins: List of tensors to be fed to the Keras function.
-        batch_size: Integer batch size or `None` if not defined.
-        steps: Total number of steps (batches of samples)
-            before declaring `_predict_loop` finished.
-            Ignored with the default value of `None`.
-        steps_name: The public API's parameter name for `steps`.
-
-    Raises:
-        ValueError: when `steps` is `None` and the attribute `ins.shape`
-        does not exist. Also raises ValueError when `steps` is not `None`
-        and `batch_size` is not `None` because they are mutually
-        exclusive.
-
-    Returns:
-        When steps is `None`, returns the number of samples to be
-        processed based on the size of the first dimension of the
-        first input numpy array. When steps is not `None` and
-        `batch_size` is `None`, returns `None`.
-
-    Raises:
-        ValueError: In case of invalid arguments.
-    """
-    if steps is not None:
-      num_samples = None
-      if batch_size is not None:
-        raise ValueError(
-            'If ' + steps_name + ' is set, the `batch_size` must be None.')
-    elif ins and hasattr(ins[0], 'shape'):
-      num_samples = ins[0].shape[0]
-    else:
-      raise ValueError(
-          'Either the input data should have '
-          'a defined shape, or ' + steps_name + ' should be specified.')
-    return num_samples
-
-  def _fit_loop(self,
-                f,
-                ins,
-                out_labels=None,
-                batch_size=None,
-                epochs=100,
-                verbose=1,
-                callbacks=None,
-                val_f=None,
-                val_ins=None,
-                shuffle=True,
-                callback_metrics=None,
-                initial_epoch=0,
-                steps_per_epoch=None,
-                validation_steps=None):
-    """Abstract fit function for `f(ins)`.
-
-    Assume that f returns a list, labeled by out_labels.
-
-    Arguments:
-        f: Keras function returning a list of tensors
-        ins: List of tensors to be fed to `f`
-        out_labels: List of strings, display names of
-            the outputs of `f`
-        batch_size: Integer batch size or None if unknown.
-        epochs: Number of times to iterate over the data
-        verbose: Verbosity mode, 0, 1 or 2
-        callbacks: List of callbacks to be called during training
-        val_f: Keras function to call for validation
-        val_ins: List of tensors to be fed to `val_f`
-        shuffle: Whether to shuffle the data at the beginning of each epoch
-        callback_metrics: List of strings, the display names of the metrics
-            passed to the callbacks. They should be the
-            concatenation of list the display names of the outputs of
-             `f` and the list of display names of the outputs of `f_val`.
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run)
-        steps_per_epoch: Total number of steps (batches of samples)
-            before declaring one epoch finished and starting the
-            next epoch. Ignored with the default value of `None`.
-        validation_steps: Number of steps to run validation for
-            (only if doing validation from data tensors).
-            Ignored with the default value of `None`.
-
-    Returns:
-        `History` object.
-
-    Raises:
-        ValueError: in case of invalid arguments.
-    """
-    do_validation = False
-    if val_f and val_ins:
-      do_validation = True
-      if verbose and ins and hasattr(ins[0], 'shape') and hasattr(
-          val_ins[0], 'shape'):
-        print('Train on %d samples, validate on %d samples' %
-              (ins[0].shape[0], val_ins[0].shape[0]))
-    if validation_steps:
-      do_validation = True
-      if steps_per_epoch is None:
-        raise ValueError('Can only use `validation_steps` '
-                         'when doing step-wise '
-                         'training, i.e. `steps_per_epoch` '
-                         'must be set.')
-
-    num_train_samples = self._check_num_samples(
-        ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-    if num_train_samples is not None:
-      index_array = np.arange(num_train_samples)
-
-    self.history = cbks.History()
-    all_callbacks = [cbks.BaseLogger(
-        stateful_metrics=self.stateful_metric_names)]
-    if verbose:
-      if steps_per_epoch is not None:
-        count_mode = 'steps'
-      else:
-        count_mode = 'samples'
-      all_callbacks.append(
-          cbks.ProgbarLogger(
-              count_mode, stateful_metrics=self.stateful_metric_names))
-    all_callbacks += (callbacks or []) + [self.history]
-    callbacks = cbks.CallbackList(all_callbacks)
-    out_labels = out_labels or []
-
-    # it's possible to callback a different model than self
-    # (used by Sequential models)
-    if hasattr(self, 'callback_model') and self.callback_model:
-      callback_model = self.callback_model
-    else:
-      callback_model = self
-
-    callbacks.set_model(callback_model)
-
-    callbacks.set_params({
-        'batch_size': batch_size,
-        'epochs': epochs,
-        'steps': steps_per_epoch,
-        'samples': num_train_samples,
-        'verbose': verbose,
-        'do_validation': do_validation,
-        'metrics': callback_metrics or [],
-    })
-    callbacks.on_train_begin()
-    callback_model.stop_training = False
-    for cbk in callbacks:
-      cbk.validation_data = val_ins
-
-    # To prevent a slowdown, we find beforehand the arrays that need conversion.
-    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
-    indices_for_conversion_to_dense = []
-    for i in range(len(feed)):
-      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
-        indices_for_conversion_to_dense.append(i)
-
-    for epoch in range(initial_epoch, epochs):
-      # Reset stateful metrics
-      for m in self.metrics:
-        if isinstance(m, Layer):
-          m.reset_states()
-      # Update callbacks
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      if steps_per_epoch is not None:
-        for step_index in range(steps_per_epoch):
-          batch_logs = {}
-          batch_logs['batch'] = step_index
-          batch_logs['size'] = 1
-          callbacks.on_batch_begin(step_index, batch_logs)
-          outs = f(ins)
-
-          if not isinstance(outs, list):
-            outs = [outs]
-          for l, o in zip(out_labels, outs):
-            batch_logs[l] = o
-
-          callbacks.on_batch_end(step_index, batch_logs)
-          if callback_model.stop_training:
-            break
-
-        if do_validation:
-          val_outs = self._test_loop(
-              val_f,
-              val_ins,
-              batch_size=batch_size,
-              steps=validation_steps,
-              verbose=0)
-          if not isinstance(val_outs, list):
-            val_outs = [val_outs]
-          # Same labels assumed.
-          for l, o in zip(out_labels, val_outs):
-            epoch_logs['val_' + l] = o
-      else:
-        if shuffle == 'batch':
-          index_array = _batch_shuffle(index_array, batch_size)
-        elif shuffle:
-          np.random.shuffle(index_array)
-
-        batches = make_batches(num_train_samples, batch_size)
-
-        for batch_index, (batch_start, batch_end) in enumerate(batches):
-          batch_ids = index_array[batch_start:batch_end]
-          try:
-            if isinstance(ins[-1], float):
-              # Do not slice the training phase flag.
-              ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-            else:
-              ins_batch = slice_arrays(ins, batch_ids)
-          except TypeError:
-            raise TypeError('TypeError while preparing batch. '
-                            'If using HDF5 input data, '
-                            'pass shuffle="batch".')
-          batch_logs = {}
-          batch_logs['batch'] = batch_index
-          batch_logs['size'] = len(batch_ids)
-          callbacks.on_batch_begin(batch_index, batch_logs)
-          for i in indices_for_conversion_to_dense:
-            ins_batch[i] = ins_batch[i].toarray()
-
-          outs = f(ins_batch)
-          if not isinstance(outs, list):
-            outs = [outs]
-          for l, o in zip(out_labels, outs):
-            batch_logs[l] = o
-
-          callbacks.on_batch_end(batch_index, batch_logs)
-          if callback_model.stop_training:
-            break
-
-          if batch_index == len(batches) - 1:  # Last batch.
-            if do_validation:
-              val_outs = self._test_loop(
-                  val_f, val_ins, batch_size=batch_size, verbose=0)
-              if not isinstance(val_outs, list):
-                val_outs = [val_outs]
-              # Same labels assumed.
-              for l, o in zip(out_labels, val_outs):
-                epoch_logs['val_' + l] = o
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callback_model.stop_training:
-        break
-    callbacks.on_train_end()
-    return self.history
-
-  def _predict_loop(self, f, ins, batch_size=32, verbose=0, steps=None):
-    """Abstract method to loop over some data in batches.
-
-    Arguments:
-        f: Keras function returning a list of tensors.
-        ins: list of tensors to be fed to `f`.
-        batch_size: integer batch size.
-        verbose: verbosity mode.
-        steps: Total number of steps (batches of samples)
-            before declaring `_predict_loop` finished.
-            Ignored with the default value of `None`.
-
-    Returns:
-        Array of predictions (if the model has a single output)
-        or list of arrays of predictions
-        (if the model has multiple outputs).
-    """
-    if hasattr(self, 'metrics'):
-      for m in self.metrics:
-        if isinstance(m, Layer):
-          m.reset_states()
-
-    num_samples = self._check_num_samples(ins, batch_size, steps, 'steps')
-    if verbose == 1:
-      if steps is not None:
-        progbar = Progbar(target=steps,
-                          stateful_metrics=self.stateful_metric_names)
-      else:
-        progbar = Progbar(target=num_samples,
-                          stateful_metrics=self.stateful_metric_names)
-
-    indices_for_conversion_to_dense = []
-    for i in range(len(self._feed_inputs)):
-      if (issparse is not None and issparse(ins[i]) and
-          not K.is_sparse(self._feed_inputs[i])):
-        indices_for_conversion_to_dense.append(i)
-
-    if steps is not None:
-      # Step-based predictions.
-      # Since we do not know how many samples
-      # we will see, we cannot pre-allocate
-      # the returned Numpy arrays.
-      # Instead, we store one array per batch seen
-      # and concatenate them upon returning.
-      unconcatenated_outs = []
-      for step in range(steps):
-        batch_outs = f(ins)
-        if not isinstance(batch_outs, list):
-          batch_outs = [batch_outs]
-        if step == 0:
-          for batch_out in batch_outs:
-            unconcatenated_outs.append([])
-        for i, batch_out in enumerate(batch_outs):
-          unconcatenated_outs[i].append(batch_out)
-        if verbose == 1:
-          progbar.update(step + 1)
-      if len(unconcatenated_outs) == 1:
-        return np.concatenate(unconcatenated_outs[0], axis=0)
-      return [
-          np.concatenate(unconcatenated_outs[i], axis=0)
-          for i in range(len(unconcatenated_outs))
-      ]
-    else:
-      # Sample-based predictions.
-      outs = []
-      batches = make_batches(num_samples, batch_size)
-      index_array = np.arange(num_samples)
-      for batch_index, (batch_start, batch_end) in enumerate(batches):
-        batch_ids = index_array[batch_start:batch_end]
-        if ins and isinstance(ins[-1], float):
-          # Do not slice the training phase flag.
-          ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-        else:
-          ins_batch = slice_arrays(ins, batch_ids)
-        for i in indices_for_conversion_to_dense:
-          ins_batch[i] = ins_batch[i].toarray()
-
-        batch_outs = f(ins_batch)
-        if not isinstance(batch_outs, list):
-          batch_outs = [batch_outs]
-        if batch_index == 0:
-          # Pre-allocate the results arrays.
-          for batch_out in batch_outs:
-            shape = (num_samples,) + batch_out.shape[1:]
-            outs.append(np.zeros(shape, dtype=batch_out.dtype))
-        for i, batch_out in enumerate(batch_outs):
-          outs[i][batch_start:batch_end] = batch_out
-        if verbose == 1:
-          progbar.update(batch_end)
-      if len(outs) == 1:
-        return outs[0]
-      return outs
-
-  def _test_loop(self, f, ins, batch_size=None, verbose=0, steps=None):
-    """Abstract method to loop over some data in batches.
-
-    Arguments:
-        f: Keras function returning a list of tensors.
-        ins: list of tensors to be fed to `f`.
-        batch_size: integer batch size or `None`.
-        verbose: verbosity mode.
-        steps: Total number of steps (batches of samples)
-            before declaring predictions finished.
-            Ignored with the default value of `None`.
-
-    Returns:
-        Scalar loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-    """
-    if hasattr(self, 'metrics'):
-      for m in self.metrics:
-        if isinstance(m, Layer):
-          m.reset_states()
-      stateful_metric_indices = [
-          i for i, name in enumerate(self.metrics_names)
-          if str(name) in self.stateful_metric_names
-      ]
-    else:
-      stateful_metric_indices = []
-
-    num_samples = self._check_num_samples(ins, batch_size, steps, 'steps')
-    outs = []
-    if verbose == 1:
-      if steps is not None:
-        progbar = Progbar(target=steps)
-      else:
-        progbar = Progbar(target=num_samples)
-
-    # To prevent a slowdown, we find beforehand the arrays that need conversion.
-    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
-    indices_for_conversion_to_dense = []
-    for i in range(len(feed)):
-      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
-        indices_for_conversion_to_dense.append(i)
-
-    if steps is not None:
-      for step in range(steps):
-        batch_outs = f(ins)
-        if isinstance(batch_outs, list):
-          if step == 0:
-            for _ in enumerate(batch_outs):
-              outs.append(0.)
-          for i, batch_out in enumerate(batch_outs):
-            if i in stateful_metric_indices:
-              outs[i] = batch_out
-            else:
-              outs[i] += batch_out
-        else:
-          if step == 0:
-            outs.append(0.)
-          outs[0] += batch_outs
-        if verbose == 1:
-          progbar.update(step + 1)
-      for i in range(len(outs)):
-        if i not in stateful_metric_indices:
-          outs[i] /= steps
-    else:
-      batches = make_batches(num_samples, batch_size)
-      index_array = np.arange(num_samples)
-      for batch_index, (batch_start, batch_end) in enumerate(batches):
-        batch_ids = index_array[batch_start:batch_end]
-        if isinstance(ins[-1], float):
-          # Do not slice the training phase flag.
-          ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-        else:
-          ins_batch = slice_arrays(ins, batch_ids)
-        for i in indices_for_conversion_to_dense:
-          ins_batch[i] = ins_batch[i].toarray()
-
-        batch_outs = f(ins_batch)
-
-        if isinstance(batch_outs, list):
-          if batch_index == 0:
-            for batch_out in enumerate(batch_outs):
-              outs.append(0.)
-          for i, batch_out in enumerate(batch_outs):
-            if i in stateful_metric_indices:
-              outs[i] = batch_out
-            else:
-              outs[i] += batch_out * len(batch_ids)
-        else:
-          if batch_index == 0:
-            outs.append(0.)
-          outs[0] += batch_outs * len(batch_ids)
-        if verbose == 1:
-          progbar.update(batch_end)
-      for i in range(len(outs)):
-        if i not in stateful_metric_indices:
-          outs[i] /= num_samples
-    if len(outs) == 1:
-      return outs[0]
-    return outs
+    self._post_build_cleanup()
 
   def _standardize_user_data(self,
                              x,
@@ -1566,12 +639,20 @@ class Model(Network):
     This is a purely internal method, subject to refactoring at any time.
 
     Args:
-      x: An array or list of arrays, to be used as input data. If the model
-       has known, named inputs, this could also be a dict mapping input names
-       to the corresponding array.
-      y: An array or list of arrays, to be used as target data. If the model
-       has known, named outputs, this could also be a dict mapping output names
-       to the corresponding array.
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        - A `tf.data` dataset iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely). If `x` is a dataset iterator,
+        `y` should not be specified
+        (since targets will be obtained from the iterator).
       sample_weight: An optional sample-weight array passed by the user to
         weight the importance of each sample in `x`.
       class_weight: An optional class-weight array by the user to
@@ -1591,6 +672,31 @@ class Model(Network):
       RuntimeError: If the model was never compiled.
     """
     # First, we build/compile the model on the fly if necessary.
+    if isinstance(x, dataset_ops.Dataset):
+      raise ValueError('You passed a `Dataset` instance to your model (%s), '
+                       'which is not supported. Instead, pass an `Iterator`, '
+                       'which you can obtain e.g. via '
+                       '`dataset.make_one_shot_iterator()` (the exact method '
+                       'to use will depend on your specific dataset).' % x)
+    if isinstance(x, iterator_ops.Iterator):
+      if y is not None:
+        raise ValueError('You passed a dataset iterator (%s) as input `x` to '
+                         'your model. In that case, you should not specify '
+                         'a target (`y`) argument, since the dataset iterator '
+                         'generates both input data and target data. '
+                         'Received: %s' % (x, y))
+      if not context.executing_eagerly():
+        x, y = x.get_next()
+        # TODO(fchollet): handle case of `get_next` not returning 2 tensors?
+      else:
+        # TODO(psv): implement this. The way to support it will be to typecheck
+        # for `iterator` before `_standardize_user_data` is called and redirect
+        # to new training/eval functions in `training_eager.py`. The model
+        # may need to get built using the specs of the data from the first batch
+        # drawn from the iterator.
+        raise ValueError('Dataset iterators are not supported '
+                         'with eager execution yet.')
+
     all_inputs = []
     if not self.built:
       # We need to use `x` to set the model inputs.
@@ -1651,13 +757,13 @@ class Model(Network):
                              'TensorFlow tensors. '
                              'You passed: x=' + str(x) + '; y=' + str(y))
 
-        if context.in_graph_mode():
+        if context.executing_eagerly():
+          target_tensors = None
+        else:
           # Handle target tensors if any passed.
           if not isinstance(y, (list, tuple)):
             y = [y]
           target_tensors = [v for v in y if tensor_util.is_tensor(v)]
-        else:
-          target_tensors = None
         self.compile(optimizer=self.optimizer,
                      loss=self.loss,
                      metrics=self.metrics,
@@ -1674,7 +780,7 @@ class Model(Network):
     # What follows is input validation and standardization to list format,
     # in the case where all inputs are value arrays.
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       # In eager mode, do not do shape validation.
       feed_input_names = self.input_names
       feed_input_shapes = None
@@ -1689,7 +795,7 @@ class Model(Network):
       feed_input_shapes = self._feed_input_shapes
 
     # Standardize the inputs.
-    x = _standardize_input_data(
+    x = training_utils.standardize_input_data(
         x,
         feed_input_names,
         feed_input_shapes,
@@ -1697,7 +803,7 @@ class Model(Network):
         exception_prefix='input')
 
     if y is not None:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         feed_output_names = self.output_names
         feed_output_shapes = None
         # Sample weighting not supported in this case.
@@ -1728,7 +834,7 @@ class Model(Network):
             feed_output_shapes.append(output_shape)
 
       # Standardize the outputs.
-      y = _standardize_input_data(
+      y = training_utils.standardize_input_data(
           y,
           feed_output_names,
           feed_output_shapes,
@@ -1737,21 +843,21 @@ class Model(Network):
 
       # Generate sample-wise weight values given the `sample_weight` and
       # `class_weight` arguments.
-      sample_weights = _standardize_sample_weights(sample_weight,
-                                                   feed_output_names)
-      class_weights = _standardize_class_weights(class_weight,
-                                                 feed_output_names)
+      sample_weights = training_utils.standardize_sample_weights(
+          sample_weight, feed_output_names)
+      class_weights = training_utils.standardize_class_weights(
+          class_weight, feed_output_names)
       sample_weights = [
-          _standardize_weights(ref, sw, cw, mode)
+          training_utils.standardize_weights(ref, sw, cw, mode)
           for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
                                          feed_sample_weight_modes)
       ]
       # Check that all arrays have the same length.
-      _check_array_lengths(x, y, sample_weights)
-      if self._is_graph_network and not context.in_eager_mode():
+      training_utils.check_array_lengths(x, y, sample_weights)
+      if self._is_graph_network and not context.executing_eagerly():
         # Additional checks to avoid users mistakenly using improper loss fns.
-        _check_loss_and_target_compatibility(y, self._feed_loss_fns,
-                                             feed_output_shapes)
+        training_utils.check_loss_and_target_compatibility(
+            y, self._feed_loss_fns, feed_output_shapes)
     else:
       y = []
       sample_weights = []
@@ -1787,7 +893,19 @@ class Model(Network):
         whether to build the model's graph in inference mode (False), training
         mode (True), or using the Keras learning phase (None).
     """
-    if context.in_eager_mode():
+    if not getattr(self, '_uses_inputs_arg', True):
+      raise NotImplementedError(
+          'Subclassed Models without "inputs" in their call() signatures do '
+          'not yet support shape inference. File a feature request if this '
+          'limitation bothers you.')
+    if self.__class__.__name__ == 'Sequential':
+      # Note: we can't test whether the model is `Sequential` via `isinstance`
+      # since `Sequential` depends on `Model`.
+      if isinstance(inputs, list):
+        assert len(inputs) == 1
+        inputs = inputs[0]
+      self.build(input_shape=(None,) + inputs.shape[1:])
+    elif context.executing_eagerly():
       self._eager_set_inputs(inputs)
     else:
       self._symbolic_set_inputs(inputs, training=training)
@@ -1807,7 +925,7 @@ class Model(Network):
     Raises:
       ValueError: If the model's inputs are already set.
     """
-    assert context.in_eager_mode()
+    assert context.executing_eagerly()
     if self.inputs:
       raise ValueError('Model inputs are already set.')
     # On-the-fly setting of model inputs/outputs as DeferredTensors,
@@ -1825,25 +943,28 @@ class Model(Network):
     else:
       dummy_output_values = [dummy_output_values]
     self.outputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_output_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_output_values]
     self.inputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_input_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_input_values]
     self.input_names = [
         'input_%d' % (i + 1) for i in range(len(dummy_input_values))]
     self.output_names = [
         'output_%d' % (i + 1) for i in range(len(dummy_output_values))]
     self.built = True
 
-  def _symbolic_set_inputs(self, inputs, training=None):
-    """Set model's inputs based on the input data received from the user.
+  def _symbolic_set_inputs(self, inputs, outputs=None, training=None):
+    """Set model's inputs and output specs based.
 
     This is to be used for Model subclasses, which do not know at instantiation
     time what their inputs look like.
 
     Args:
       inputs: Argument `x` (input data) passed by the user upon first model use.
+      outputs: None, a data tensor, or a list of data tensors. If None, the
+        outputs will be determined by invoking self.call(), otherwise the
+        provided value will be used.
       training: Boolean or None. Only relevant in symbolic mode. Specifies
         whether to build the model's graph in inference mode (False), training
         mode (True), or using the Keras learning phase (None).
@@ -1851,7 +972,7 @@ class Model(Network):
     Raises:
       ValueError: If the model's inputs are already set.
     """
-    assert context.in_graph_mode()
+    assert not context.executing_eagerly()
     if self.inputs:
       raise ValueError('Model inputs are already set.')
 
@@ -1893,17 +1014,18 @@ class Model(Network):
           self._feed_input_names.append(name)
           self._feed_input_shapes.append(K.int_shape(v))
 
-    # Obtain symbolic outputs by calling the model.
-    if len(self.inputs) == 1:
-      if self._expects_training_arg:
-        outputs = self.call(self.inputs[0], training=training)
-      else:
-        outputs = self.call(self.inputs[0])
-    else:
-      if self._expects_training_arg:
-        outputs = self.call(self.inputs, training=training)
+    if outputs is None:
+      # Obtain symbolic outputs by calling the model.
+      if len(self.inputs) == 1:
+        if self._expects_training_arg:
+          outputs = self.call(self.inputs[0], training=training)
+        else:
+          outputs = self.call(self.inputs[0])
       else:
-        outputs = self.call(self.inputs)
+        if self._expects_training_arg:
+          outputs = self.call(self.inputs, training=training)
+        else:
+          outputs = self.call(self.inputs)
     if isinstance(outputs, (list, tuple)):
       outputs = list(outputs)
     else:
@@ -1932,22 +1054,26 @@ class Model(Network):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
-        x: Numpy array of training data (if the model has a single input),
-            or list of Numpy arrays (if the model has multiple inputs).
-            If input layers in the model are named, you can also pass a
-            dictionary mapping input names to Numpy arrays.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data
-            (if the model has a single output),
-            or list of Numpy arrays (if the model has multiple outputs).
-            If output layers in the model are named, you can also pass a
-            dictionary mapping output names to Numpy arrays.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         epochs: Integer. Number of epochs to train the model.
             An epoch is an iteration over the entire `x` and `y`
             data provided.
@@ -1969,11 +1095,14 @@ class Model(Network):
             on this data at the end of each epoch.
             The validation data is selected from the last samples
             in the `x` and `y` data provided, before shuffling.
-        validation_data: tuple `(x_val, y_val)` or tuple
-            `(x_val, y_val, val_sample_weights)` on which to evaluate
+        validation_data: Data on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
             `validation_data` will override `validation_split`.
+            `validation_data` could be:
+              - tuple `(x_val, y_val)` of Numpy arrays or tensors
+              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
+              - dataset iterator
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -2049,34 +1178,34 @@ class Model(Network):
         class_weight=class_weight,
         batch_size=batch_size)
     # Prepare validation data.
-    do_validation = False
-    val_ins = []
     if validation_data:
-      do_validation = True
-      if len(validation_data) == 2:
+      if isinstance(validation_data, iterator_ops.Iterator):
+        val_x = validation_data
+        val_y = None
+        val_sample_weight = None
+      elif len(validation_data) == 2:
         val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
         val_sample_weight = None
       elif len(validation_data) == 3:
         val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
       else:
         raise ValueError(
-            'When passing validation_data, '
-            'it must contain 2 (x_val, y_val) '
-            'or 3 (x_val, y_val, val_sample_weights) '
-            'items, however it contains %d items' % len(validation_data))
+            'When passing a `validation_data` argument, '
+            'it must contain either 2 items (x_val, y_val), '
+            'or 3 items (x_val, y_val, val_sample_weights), '
+            'or alternatively it could be a dataset iterator. However we '
+            'received `validation_data=%s`' % validation_data)
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
           val_y,
           sample_weight=val_sample_weight,
           batch_size=batch_size)
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_ins = val_x + val_y + val_sample_weights + [0.]
-      else:
-        val_ins = val_x + val_y + val_sample_weights
 
     elif validation_split and 0. < validation_split < 1.:
-      do_validation = True
+      if training_utils.has_symbolic_tensors(x):
+        raise ValueError('If your data is in the form of symbolic tensors, '
+                         'you cannot use `validation_split`.')
       if hasattr(x[0], 'shape'):
         split_at = int(x[0].shape[0] * (1. - validation_split))
       else:
@@ -2085,77 +1214,44 @@ class Model(Network):
       y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
       sample_weights, val_sample_weights = (slice_arrays(
           sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_ins = val_x + val_y + val_sample_weights + [0.]
-      else:
-        val_ins = val_x + val_y + val_sample_weights
-
     elif validation_steps:
-      do_validation = True
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_ins = [0.]
-
-    # Prepare input arrays and training function.
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = x + y + sample_weights + [1.]
+      val_x = []
+      val_y = []
+      val_sample_weights = []
     else:
-      ins = x + y + sample_weights
-
-    # Prepare display labels.
-    out_labels = self.metrics_names
-
-    if context.in_eager_mode():
-      if do_validation:
-        callback_metrics = copy.copy(out_labels) + [
-            'val_' + n for n in out_labels
-        ]
-      else:
-        callback_metrics = copy.copy(out_labels)
+      val_x = None
+      val_y = None
+      val_sample_weights = None
 
+    if context.executing_eagerly():
       return training_eager.fit_loop(
           self,
-          ins,
-          out_labels=out_labels,
+          inputs=x,
+          targets=y,
+          sample_weights=sample_weights,
           batch_size=batch_size,
           epochs=epochs,
           verbose=verbose,
           callbacks=callbacks,
-          val_ins=val_ins,
+          val_inputs=val_x,
+          val_targets=val_y,
+          val_sample_weights=val_sample_weights,
           shuffle=shuffle,
-          callback_metrics=callback_metrics,
           initial_epoch=initial_epoch,
           steps_per_epoch=steps_per_epoch,
           validation_steps=validation_steps)
     else:
-      self._make_train_function()
-      f = self.train_function
-
-      if do_validation:
-        if context.in_graph_mode():
-          self._make_test_function()
-          val_f = self.test_function
-        else:
-          val_f = None
-        callback_metrics = copy.copy(out_labels) + [
-            'val_' + n for n in out_labels
-        ]
-      else:
-        val_f = None
-        callback_metrics = copy.copy(out_labels)
-
-      # Delegate logic to `_fit_loop`.
-      return self._fit_loop(
-          f,
-          ins,
-          out_labels=out_labels,
+      return training_arrays.fit_loop(
+          self, x, y,
+          sample_weights=sample_weights,
           batch_size=batch_size,
           epochs=epochs,
           verbose=verbose,
           callbacks=callbacks,
-          val_f=val_f,
-          val_ins=val_ins,
+          val_inputs=val_x,
+          val_targets=val_y,
+          val_sample_weights=val_sample_weights,
           shuffle=shuffle,
-          callback_metrics=callback_metrics,
           initial_epoch=initial_epoch,
           steps_per_epoch=steps_per_epoch,
           validation_steps=validation_steps)
@@ -2172,22 +1268,26 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: Numpy array of test data (if the model has a single input),
-            or list of Numpy arrays (if the model has multiple inputs).
-            If input layers in the model are named, you can also pass a
-            dictionary mapping input names to Numpy arrays.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data
-            (if the model has a single output),
-            or list of Numpy arrays (if the model has multiple outputs).
-            If output layers in the model are named, you can also pass a
-            dictionary mapping output names to Numpy arrays.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         batch_size: Integer or `None`.
-            Number of samples per evaluation step.
+            Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         verbose: 0 or 1. Verbosity mode.
             0 = silent, 1 = progress bar.
         sample_weight: Optional Numpy array of weights for
@@ -2229,20 +1329,15 @@ class Model(Network):
         y,
         sample_weight=sample_weight,
         batch_size=batch_size)
-    # Prepare inputs, delegate logic to `_test_loop`.
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = x + y + sample_weights + [0.]
-    else:
-      ins = x + y + sample_weights
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       return training_eager.test_loop(
-          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+          self, inputs=x, targets=y, sample_weights=sample_weights,
+          batch_size=batch_size, verbose=verbose, steps=steps)
     else:
-      self._make_test_function()
-      f = self.test_function
-      return self._test_loop(
-          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+      return training_arrays.test_loop(
+          self, inputs=x, targets=y, sample_weights=sample_weights,
+          batch_size=batch_size, verbose=verbose, steps=steps)
 
   def predict(self, x, batch_size=None, verbose=0, steps=None):
     """Generates output predictions for the input samples.
@@ -2250,9 +1345,13 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: The input data, as a Numpy array
-            (or list of Numpy arrays if the model has multiple outputs).
-        batch_size: Integer. If unspecified, it will default to 32.
+        x: Input samples, as Numpy array(s) or tensor(s).
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         verbose: Verbosity mode, 0 or 1.
         steps: Total number of steps (batches of samples)
             before declaring the prediction round finished.
@@ -2276,36 +1375,31 @@ class Model(Network):
                        'argument.')
     x, _, _ = self._standardize_user_data(x)
 
-    # Prepare inputs, delegate logic to `_predict_loop`.
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = x + [0.]
-    else:
-      ins = x
-
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       return training_eager.predict_loop(
-          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
     else:
-      self._make_predict_function()
-      f = self.predict_function
-
-      return self._predict_loop(
-          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+      return training_arrays.predict_loop(
+          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
 
-  def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
+  def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None):
     """Runs a single gradient update on a single batch of data.
 
     Arguments:
-        x: Numpy array of training data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
             weights to apply to the model's loss for each sample.
             In the case of temporal data, you can pass a 2D array
@@ -2327,20 +1421,24 @@ class Model(Network):
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
 
+    Raises:
+      ValueError: In case of invalid user-provided arguments.
     """
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
         class_weight=class_weight)
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = x + y + sample_weights + [1.]
-    else:
-      ins = x + y + sample_weights
 
-    if context.in_eager_mode():
-      outputs = training_eager.train_on_batch(self, ins)
+    if context.executing_eagerly():
+      outputs = training_eager.train_on_batch(
+          self, x, y, sample_weights=sample_weights)
     else:
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        ins = x + y + sample_weights + [1]
+      else:
+        ins = x + y + sample_weights
+
       self._make_train_function()
       outputs = self.train_function(ins)
 
@@ -2348,20 +1446,24 @@ class Model(Network):
       return outputs[0]
     return outputs
 
-  def test_on_batch(self, x, y, sample_weight=None):
+  def test_on_batch(self, x, y=None, sample_weight=None):
     """Test the model on a single batch of samples.
 
     Arguments:
-        x: Numpy array of test data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
             weights to apply to the model's loss for each sample.
             In the case of temporal data, you can pass a 2D array
@@ -2377,18 +1479,19 @@ class Model(Network):
         the display labels for the scalar outputs.
 
     Raises:
-        ValueError: in case of invalid arguments.
+        ValueError: In case of invalid user-provided arguments.
     """
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight)
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = x + y + sample_weights + [0.]
-    else:
-      ins = x + y + sample_weights
 
-    if context.in_eager_mode():
-      outputs = training_eager.test_on_batch(self, ins)
+    if context.executing_eagerly():
+      outputs = training_eager.test_on_batch(
+          self, x, y, sample_weights=sample_weights)
     else:
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        ins = x + y + sample_weights + [0]
+      else:
+        ins = x + y + sample_weights
       self._make_test_function()
       outputs = self.test_function(ins)
 
@@ -2400,7 +1503,7 @@ class Model(Network):
     """Returns predictions for a single batch of samples.
 
     Arguments:
-        x: Input samples, as a Numpy array.
+        x: Input samples, as Numpy array(s) or tensor(s).
 
     Returns:
         Numpy array(s) of predictions.
@@ -2408,26 +1511,19 @@ class Model(Network):
     """
     x, _, _ = self._standardize_user_data(x)
 
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = x + [0.]
-    else:
-      ins = x
-
-    if context.in_eager_mode():
-      ins_batch_converted = []
-      for ib in ins:
-        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
-
-      eager_model_inputs = []
-      for i in range(len(self.inputs)):
-        eager_model_inputs.append(ins_batch_converted[i])
+    if context.executing_eagerly():
+      inputs = [ops.convert_to_tensor(val, dtype=K.floatx()) for val in x]
+      return self(inputs)  # pylint: disable=not-callable
 
-      outs = self(eager_model_inputs)  # pylint: disable=not-callable
-      return outs
+    if not context.executing_eagerly():
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        ins = x + [0]
+      else:
+        ins = x
 
-    if context.in_graph_mode():
       self._make_predict_function()
       outputs = self.predict_function(ins)
+
       if len(outputs) == 1:
         return outputs[0]
       return outputs
@@ -2499,20 +1595,19 @@ class Model(Network):
         max_queue_size: Integer. Maximum size for the generator queue.
             If unspecified, `max_queue_size` will default to 10.
         workers: Integer. Maximum number of processes to spin up
-            when using process based threading.
+            when using process-based threading.
             If unspecified, `workers` will default to 1. If 0, will
             execute the generator on the main thread.
-        use_multiprocessing: Boolean. If True, use process based threading.
-            If unspecified, `workers` will default to False.
-            Note that because
-            this implementation relies on multiprocessing,
-            you should not pass
-            non picklable arguments to the generator
-            as they can't be passed
-            easily to children processes.
-        shuffle: Whether to shuffle the order of the batches at
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        shuffle: Boolean. Whether to shuffle the order of the batches at
             the beginning of each epoch. Only used with instances
-            of `Sequence` (keras.utils.Sequence).
+            of `Sequence` (`keras.utils.Sequence`).
+            Has no effect when `steps_per_epoch` is not `None`.
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
 
@@ -2539,217 +1634,25 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    if not self._is_graph_network:
+    if not self.built and not self._is_graph_network:
       raise NotImplementedError(
-          '`fit_generator` is not yet enabled for Model subclasses')
-
-    wait_time = 0.01  # in seconds
-    epoch = initial_epoch
-
-    do_validation = bool(validation_data)
-    self._make_train_function()
-    if do_validation:
-      self._make_test_function()
-
-    is_sequence = isinstance(generator, Sequence)
-    if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          UserWarning('Using a generator with `use_multiprocessing=True`'
-                      ' and multiple workers may duplicate your data.'
-                      ' Please consider using the`keras.utils.Sequence'
-                      ' class.'))
-    if steps_per_epoch is None:
-      if is_sequence:
-        steps_per_epoch = len(generator)
-      else:
-        raise ValueError('`steps_per_epoch=None` is only valid for a'
-                         ' generator based on the `keras.utils.Sequence`'
-                         ' class. Please specify `steps_per_epoch` or use'
-                         ' the `keras.utils.Sequence` class.')
-
-    # python 2 has 'next', 3 has '__next__'
-    # avoid any explicit version checks
-    val_gen = (
-        hasattr(validation_data, 'next') or
-        hasattr(validation_data, '__next__') or
-        isinstance(validation_data, Sequence))
-    if (val_gen and not isinstance(validation_data, Sequence) and
-        not validation_steps):
-      raise ValueError('`validation_steps=None` is only valid for a'
-                       ' generator based on the `keras.utils.Sequence`'
-                       ' class. Please specify `validation_steps` or use'
-                       ' the `keras.utils.Sequence` class.')
-
-    # Prepare display labels.
-    out_labels = self.metrics_names
-    callback_metrics = out_labels + ['val_%s' % n for n in out_labels]
-
-    # prepare callbacks
-    self.history = cbks.History()
-    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [self.history]
-    if verbose:
-      callbacks += [cbks.ProgbarLogger(count_mode='steps')]
-    callbacks = cbks.CallbackList(callbacks)
-
-    # it's possible to callback a different model than self:
-    if hasattr(self, 'callback_model') and self.callback_model:
-      callback_model = self.callback_model
-    else:
-      callback_model = self
-    callbacks.set_model(callback_model)
-    callbacks.set_params({
-        'epochs': epochs,
-        'steps': steps_per_epoch,
-        'verbose': verbose,
-        'do_validation': do_validation,
-        'metrics': callback_metrics,
-    })
-    callbacks.on_train_begin()
-
-    enqueuer = None
-    val_enqueuer = None
-
-    try:
-      if do_validation:
-        if val_gen:
-          if workers > 0:
-            if isinstance(validation_data, Sequence):
-              val_enqueuer = OrderedEnqueuer(
-                  validation_data, use_multiprocessing=use_multiprocessing)
-              if validation_steps is None:
-                validation_steps = len(validation_data)
-            else:
-              val_enqueuer = GeneratorEnqueuer(
-                  validation_data,
-                  use_multiprocessing=use_multiprocessing,
-                  wait_time=wait_time)
-            val_enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-            validation_generator = val_enqueuer.get()
-          else:
-            validation_generator = validation_data
-        else:
-          if len(validation_data) == 2:
-            val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-            val_sample_weight = None
-          elif len(validation_data) == 3:
-            val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-          else:
-            raise ValueError(
-                '`validation_data` should be a tuple '
-                '`(val_x, val_y, val_sample_weight)` '
-                'or `(val_x, val_y)`. Found: ' + str(validation_data))
-          val_x, val_y, val_sample_weights = self._standardize_user_data(
-              val_x, val_y, val_sample_weight)
-          val_data = val_x + val_y + val_sample_weights
-          if self.uses_learning_phase and not isinstance(
-              K.learning_phase(), int):
-            val_data += [0.]
-          for cbk in callbacks:
-            cbk.validation_data = val_data
-
-      if workers > 0:
-        if is_sequence:
-          enqueuer = OrderedEnqueuer(
-              generator,
-              use_multiprocessing=use_multiprocessing,
-              shuffle=shuffle)
-        else:
-          enqueuer = GeneratorEnqueuer(
-              generator,
-              use_multiprocessing=use_multiprocessing,
-              wait_time=wait_time)
-        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-        output_generator = enqueuer.get()
-      else:
-        output_generator = generator
-
-      callback_model.stop_training = False
-      # Construct epoch logs.
-      epoch_logs = {}
-      while epoch < epochs:
-        callbacks.on_epoch_begin(epoch)
-        steps_done = 0
-        batch_index = 0
-        while steps_done < steps_per_epoch:
-          generator_output = next(output_generator)
-
-          if not hasattr(generator_output, '__len__'):
-            raise ValueError('Output of generator should be '
-                             'a tuple `(x, y, sample_weight)` '
-                             'or `(x, y)`. Found: ' + str(generator_output))
-
-          if len(generator_output) == 2:
-            x, y = generator_output
-            sample_weight = None
-          elif len(generator_output) == 3:
-            x, y, sample_weight = generator_output
-          else:
-            raise ValueError('Output of generator should be '
-                             'a tuple `(x, y, sample_weight)` '
-                             'or `(x, y)`. Found: ' + str(generator_output))
-          # build batch logs
-          batch_logs = {}
-          if isinstance(x, list):
-            batch_size = x[0].shape[0]
-          elif isinstance(x, dict):
-            batch_size = list(x.values())[0].shape[0]
-          else:
-            batch_size = x.shape[0]
-          batch_logs['batch'] = batch_index
-          batch_logs['size'] = batch_size
-          callbacks.on_batch_begin(batch_index, batch_logs)
-
-          outs = self.train_on_batch(
-              x, y, sample_weight=sample_weight, class_weight=class_weight)
-
-          if not isinstance(outs, list):
-            outs = [outs]
-          for l, o in zip(out_labels, outs):
-            batch_logs[l] = o
-
-          callbacks.on_batch_end(batch_index, batch_logs)
-
-          batch_index += 1
-          steps_done += 1
-
-          # Epoch finished.
-          if steps_done >= steps_per_epoch and do_validation:
-            if val_gen:
-              val_outs = self.evaluate_generator(
-                  validation_generator, validation_steps, workers=0)
-            else:
-              # No need for try/except because
-              # data has already been validated.
-              val_outs = self.evaluate(
-                  val_x,
-                  val_y,
-                  batch_size=batch_size,
-                  sample_weight=val_sample_weights,
-                  verbose=0)
-            if not isinstance(val_outs, list):
-              val_outs = [val_outs]
-            # Same labels assumed.
-            for l, o in zip(out_labels, val_outs):
-              epoch_logs['val_' + l] = o
-
-          if callback_model.stop_training:
-            break
-
-        callbacks.on_epoch_end(epoch, epoch_logs)
-        epoch += 1
-        if callback_model.stop_training:
-          break
-
-    finally:
-      try:
-        if enqueuer is not None:
-          enqueuer.stop()
-      finally:
-        if val_enqueuer is not None:
-          val_enqueuer.stop()
-
-    callbacks.on_train_end()
-    return self.history
+          '`fit_generator` is not yet enabled for unbuilt Model subclasses')
+
+    return training_generator.fit_generator(
+        self,
+        generator,
+        steps_per_epoch=steps_per_epoch,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        validation_data=validation_data,
+        validation_steps=validation_steps,
+        class_weight=class_weight,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        shuffle=shuffle,
+        initial_epoch=initial_epoch)
 
   def evaluate_generator(self,
                          generator,
@@ -2774,16 +1677,15 @@ class Model(Network):
             the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
         workers: Integer. Maximum number of processes to spin up
-            when using process based threading.
+            when using process-based threading.
             If unspecified, `workers` will default to 1. If 0, will
             execute the generator on the main thread.
-        use_multiprocessing: if True, use process based threading.
-            Note that because
-            this implementation relies on multiprocessing,
-            you should not pass
-            non picklable arguments to the generator
-            as they can't be passed
-            easily to children processes.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -2798,91 +1700,18 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    if not self._is_graph_network:
+    if not self.built and not self._is_graph_network:
       raise NotImplementedError(
-          '`evaluate_generator` is not yet enabled for Model subclasses')
-
-    self._make_test_function()
-
-    steps_done = 0
-    wait_time = 0.01
-    all_outs = []
-    batch_sizes = []
-    is_sequence = isinstance(generator, Sequence)
-    if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          UserWarning('Using a generator with `use_multiprocessing=True`'
-                      ' and multiple workers may duplicate your data.'
-                      ' Please consider using the`keras.utils.Sequence'
-                      ' class.'))
-    if steps is None:
-      if is_sequence:
-        steps = len(generator)
-      else:
-        raise ValueError('`steps=None` is only valid for a generator'
-                         ' based on the `keras.utils.Sequence` class.'
-                         ' Please specify `steps` or use the'
-                         ' `keras.utils.Sequence` class.')
-    enqueuer = None
-
-    try:
-      if workers > 0:
-        if is_sequence:
-          enqueuer = OrderedEnqueuer(
-              generator, use_multiprocessing=use_multiprocessing)
-        else:
-          enqueuer = GeneratorEnqueuer(
-              generator,
-              use_multiprocessing=use_multiprocessing,
-              wait_time=wait_time)
-        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-        output_generator = enqueuer.get()
-      else:
-        output_generator = generator
-
-      while steps_done < steps:
-        generator_output = next(output_generator)
-        if not hasattr(generator_output, '__len__'):
-          raise ValueError('Output of generator should be a tuple '
-                           '(x, y, sample_weight) '
-                           'or (x, y). Found: ' + str(generator_output))
-        if len(generator_output) == 2:
-          x, y = generator_output
-          sample_weight = None
-        elif len(generator_output) == 3:
-          x, y, sample_weight = generator_output
-        else:
-          raise ValueError('Output of generator should be a tuple '
-                           '(x, y, sample_weight) '
-                           'or (x, y). Found: ' + str(generator_output))
-        outs = self.test_on_batch(x, y, sample_weight=sample_weight)
-
-        if isinstance(x, list):
-          batch_size = x[0].shape[0]
-        elif isinstance(x, dict):
-          batch_size = list(x.values())[0].shape[0]
-        else:
-          batch_size = x.shape[0]
-        if batch_size == 0:
-          raise ValueError('Received an empty batch. '
-                           'Batches should at least contain one item.')
-        all_outs.append(outs)
-
-        steps_done += 1
-        batch_sizes.append(batch_size)
+          '`evaluate_generator` is not yet enabled for '
+          'unbuilt Model subclasses')
 
-    finally:
-      if enqueuer is not None:
-        enqueuer.stop()
-
-    if not isinstance(outs, list):
-      return np.average(np.asarray(all_outs), weights=batch_sizes)
-    else:
-      averages = []
-      for i in range(len(outs)):
-        averages.append(
-            np.average([out[i] for out in all_outs], weights=batch_sizes))
-      return averages
+    return training_generator.evaluate_generator(
+        self,
+        generator,
+        steps=steps,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing)
 
   def predict_generator(self,
                         generator,
@@ -2907,16 +1736,15 @@ class Model(Network):
             the `len(generator)` as a number of steps.
         max_queue_size: Maximum size for the generator queue.
         workers: Integer. Maximum number of processes to spin up
-            when using process based threading.
+            when using process-based threading.
             If unspecified, `workers` will default to 1. If 0, will
             execute the generator on the main thread.
-        use_multiprocessing: If `True`, use process based threading.
-            Note that because
-            this implementation relies on multiprocessing,
-            you should not pass
-            non picklable arguments to the generator
-            as they can't be passed
-            easily to children processes.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
         verbose: verbosity mode, 0 or 1.
 
     Returns:
@@ -2926,92 +1754,15 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    if not self._is_graph_network:
+    if not self.built and not self._is_graph_network:
       raise NotImplementedError(
-          '`predict_generator` is not yet enabled for Model subclasses')
-
-    self._make_predict_function()
-
-    steps_done = 0
-    wait_time = 0.01
-    all_outs = []
-    is_sequence = isinstance(generator, Sequence)
-    if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          UserWarning('Using a generator with `use_multiprocessing=True`'
-                      ' and multiple workers may duplicate your data.'
-                      ' Please consider using the`keras.utils.Sequence'
-                      ' class.'))
-    if steps is None:
-      if is_sequence:
-        steps = len(generator)
-      else:
-        raise ValueError('`steps=None` is only valid for a generator'
-                         ' based on the `keras.utils.Sequence` class.'
-                         ' Please specify `steps` or use the'
-                         ' `keras.utils.Sequence` class.')
-    enqueuer = None
-
-    try:
-      if workers > 0:
-        if is_sequence:
-          enqueuer = OrderedEnqueuer(
-              generator, use_multiprocessing=use_multiprocessing)
-        else:
-          enqueuer = GeneratorEnqueuer(
-              generator,
-              use_multiprocessing=use_multiprocessing,
-              wait_time=wait_time)
-        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-        output_generator = enqueuer.get()
-      else:
-        output_generator = generator
-
-      if verbose == 1:
-        progbar = Progbar(target=steps)
-
-      while steps_done < steps:
-        generator_output = next(output_generator)
-        if isinstance(generator_output, tuple):
-          # Compatibility with the generators
-          # used for training.
-          if len(generator_output) == 2:
-            x, _ = generator_output
-          elif len(generator_output) == 3:
-            x, _, _ = generator_output
-          else:
-            raise ValueError('Output of generator should be '
-                             'a tuple `(x, y, sample_weight)` '
-                             'or `(x, y)`. Found: ' + str(generator_output))
-        else:
-          # Assumes a generator that only
-          # yields inputs (not targets and sample weights).
-          x = generator_output
-
-        outs = self.predict_on_batch(x)
-        if not isinstance(outs, list):
-          outs = [outs]
-
-        if not all_outs:
-          for out in outs:
-            all_outs.append([])
-
-        for i, out in enumerate(outs):
-          all_outs[i].append(out)
-        steps_done += 1
-        if verbose == 1:
-          progbar.update(steps_done)
-
-    finally:
-      if enqueuer is not None:
-        enqueuer.stop()
-
-    if len(all_outs) == 1:
-      if steps_done == 1:
-        return all_outs[0][0]
-      else:
-        return np.concatenate(all_outs[0])
-    if steps_done == 1:
-      return [out[0] for out in all_outs]
-    else:
-      return [np.concatenate(out) for out in all_outs]
+          '`predict_generator` is not yet enabled for unbuilt Model subclasses')
+
+    return training_generator.predict_generator(
+        self,
+        generator,
+        steps=steps,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        verbose=verbose)
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
new file mode 100644
index 0000000000000000000000000000000000000000..4164cae864c7f8aa8f59bb66d585aaa682fc8ff8
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
@@ -0,0 +1,498 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Part of the Keras training engine related to plain array data.
+"""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.python.framework import errors
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import callbacks as cbks
+from tensorflow.python.keras._impl.keras.engine import training_utils
+from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
+from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
+from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.platform import tf_logging as logging
+
+try:
+  from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  issparse = None
+
+
+def fit_loop(model,
+             inputs,
+             targets,
+             sample_weights=None,
+             batch_size=None,
+             epochs=100,
+             verbose=1,
+             callbacks=None,
+             val_inputs=None,
+             val_targets=None,
+             val_sample_weights=None,
+             shuffle=True,
+             callback_metrics=None,
+             initial_epoch=0,
+             steps_per_epoch=None,
+             validation_steps=None):
+  """Abstract fit function for arrays of data.
+
+  Arguments:
+      model: Keras Model instance.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      batch_size: Integer batch size or None if unknown.
+      epochs: Number of times to iterate over the data
+      verbose: Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      val_inputs: List of input arrays.
+      val_targets: List of target arrays.
+      val_sample_weights: Optional list of sample weight arrays.
+      shuffle: Whether to shuffle the data at the beginning of each epoch
+      callback_metrics: List of strings, the display names of the metrics
+          passed to the callbacks. They should be the
+          concatenation of list the display names of the outputs of
+           `f` and the list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training
+          (useful for resuming a previous training run)
+      steps_per_epoch: Total number of steps (batches of samples)
+          before declaring one epoch finished and starting the
+          next epoch. Ignored with the default value of `None`.
+      validation_steps: Number of steps to run validation for
+          (only if doing validation from data tensors).
+          Ignored with the default value of `None`.
+
+  Returns:
+      `History` object.
+
+  Raises:
+      ValueError: in case of invalid arguments.
+  """
+  model._make_train_function()
+  f = model.train_function
+
+  sample_weights = sample_weights or []
+  val_sample_weights = val_sample_weights or []
+  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+    ins = inputs + targets + sample_weights + [1]
+    if val_inputs:
+      val_ins = val_inputs + val_targets + val_sample_weights + [1]
+  else:
+    ins = inputs + targets + sample_weights
+    if val_inputs:
+      val_ins = val_inputs + val_targets + val_sample_weights
+  if not val_inputs:
+    val_ins = []
+
+  do_validation = False
+  if val_inputs:
+    do_validation = True
+    if verbose and inputs and hasattr(inputs[0], 'shape') and hasattr(
+        val_inputs[0], 'shape'):
+      print('Train on %d samples, validate on %d samples' %
+            (inputs[0].shape[0], val_inputs[0].shape[0]))
+  if validation_steps:
+    do_validation = True
+    if steps_per_epoch is None:
+      raise ValueError('Can only use `validation_steps` '
+                       'when doing step-wise '
+                       'training, i.e. `steps_per_epoch` '
+                       'must be set.')
+
+  out_labels = model.metrics_names
+  if do_validation:
+    callback_metrics = copy.copy(out_labels) + [
+        'val_' + n for n in out_labels
+    ]
+  else:
+    callback_metrics = copy.copy(out_labels)
+
+  num_train_samples = training_utils.check_num_samples(
+      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
+  if num_train_samples is not None:
+    index_array = np.arange(num_train_samples)
+
+  model.history = cbks.History()
+  all_callbacks = [cbks.BaseLogger(
+      stateful_metrics=model.stateful_metric_names)]
+  if verbose:
+    if steps_per_epoch is not None:
+      count_mode = 'steps'
+    else:
+      count_mode = 'samples'
+    all_callbacks.append(
+        cbks.ProgbarLogger(
+            count_mode, stateful_metrics=model.stateful_metric_names))
+  all_callbacks += (callbacks or []) + [model.history]
+  callbacks = cbks.CallbackList(all_callbacks)
+  out_labels = out_labels or []
+
+  # it's possible to callback a different model than self
+  # (used by Sequential models)
+  if hasattr(model, 'callback_model') and model.callback_model:
+    callback_model = model.callback_model
+  else:
+    callback_model = model
+
+  callbacks.set_model(callback_model)
+
+  callbacks.set_params({
+      'batch_size': batch_size,
+      'epochs': epochs,
+      'steps': steps_per_epoch,
+      'samples': num_train_samples,
+      'verbose': verbose,
+      'do_validation': do_validation,
+      'metrics': callback_metrics or [],
+  })
+  callbacks.on_train_begin()
+  callback_model.stop_training = False
+  for cbk in callbacks:
+    cbk.validation_data = val_ins
+
+  # To prevent a slowdown, we find beforehand the arrays that need conversion.
+  feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
+  indices_for_conversion_to_dense = []
+  for i in range(len(feed)):
+    if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+      indices_for_conversion_to_dense.append(i)
+
+  for epoch in range(initial_epoch, epochs):
+    # Reset stateful metrics
+    for m in model.metrics:
+      if isinstance(m, Layer):
+        m.reset_states()
+    # Update callbacks
+    callbacks.on_epoch_begin(epoch)
+    epoch_logs = {}
+    if steps_per_epoch is not None:
+      for step_index in range(steps_per_epoch):
+        batch_logs = {}
+        batch_logs['batch'] = step_index
+        batch_logs['size'] = 1
+        callbacks.on_batch_begin(step_index, batch_logs)
+        try:
+          outs = f(ins)
+        except errors.OutOfRangeError:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          steps_per_epoch * epochs)
+          break
+
+        if not isinstance(outs, list):
+          outs = [outs]
+        for l, o in zip(out_labels, outs):
+          batch_logs[l] = o
+
+        callbacks.on_batch_end(step_index, batch_logs)
+        if callback_model.stop_training:
+          break
+
+      if do_validation:
+        val_outs = test_loop(
+            model,
+            val_inputs,
+            val_targets,
+            sample_weights=val_sample_weights,
+            batch_size=batch_size,
+            steps=validation_steps,
+            verbose=0)
+        if not isinstance(val_outs, list):
+          val_outs = [val_outs]
+        # Same labels assumed.
+        for l, o in zip(out_labels, val_outs):
+          epoch_logs['val_' + l] = o
+    else:
+      if shuffle == 'batch':
+        index_array = training_utils.batch_shuffle(index_array, batch_size)
+      elif shuffle:
+        np.random.shuffle(index_array)
+
+      batches = make_batches(num_train_samples, batch_size)
+
+      for batch_index, (batch_start, batch_end) in enumerate(batches):
+        batch_ids = index_array[batch_start:batch_end]
+        try:
+          if isinstance(ins[-1], int):
+            # Do not slice the training phase flag.
+            ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+          else:
+            ins_batch = slice_arrays(ins, batch_ids)
+        except TypeError:
+          raise TypeError('TypeError while preparing batch. '
+                          'If using HDF5 input data, '
+                          'pass shuffle="batch".')
+        batch_logs = {}
+        batch_logs['batch'] = batch_index
+        batch_logs['size'] = len(batch_ids)
+        callbacks.on_batch_begin(batch_index, batch_logs)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
+
+        outs = f(ins_batch)
+        if not isinstance(outs, list):
+          outs = [outs]
+        for l, o in zip(out_labels, outs):
+          batch_logs[l] = o
+
+        callbacks.on_batch_end(batch_index, batch_logs)
+        if callback_model.stop_training:
+          break
+
+        if batch_index == len(batches) - 1:  # Last batch.
+          if do_validation:
+            val_outs = test_loop(
+                model,
+                val_inputs,
+                val_targets,
+                sample_weights=val_sample_weights,
+                batch_size=batch_size,
+                verbose=0)
+            if not isinstance(val_outs, list):
+              val_outs = [val_outs]
+            # Same labels assumed.
+            for l, o in zip(out_labels, val_outs):
+              epoch_logs['val_' + l] = o
+    callbacks.on_epoch_end(epoch, epoch_logs)
+    if callback_model.stop_training:
+      break
+  callbacks.on_train_end()
+  return model.history
+
+
+def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model: Keras Model instance.
+      inputs: list of tensors to be fed to `f`.
+      batch_size: integer batch size.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring `_predict_loop` finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions
+      (if the model has multiple outputs).
+  """
+  model._make_predict_function()
+  f = model.predict_function
+
+  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+    ins = inputs + [0]
+  else:
+    ins = inputs
+
+  num_samples = training_utils.check_num_samples(
+      inputs, batch_size, steps, 'steps')
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  indices_for_conversion_to_dense = []
+  for i in range(len(model._feed_inputs)):
+    if (issparse is not None and issparse(inputs[i]) and
+        not K.is_sparse(model._feed_inputs[i])):
+      indices_for_conversion_to_dense.append(i)
+
+  if steps is not None:
+    # Step-based predictions.
+    # Since we do not know how many samples
+    # we will see, we cannot pre-allocate
+    # the returned Numpy arrays.
+    # Instead, we store one array per batch seen
+    # and concatenate them upon returning.
+    unconcatenated_outs = []
+    for step in range(steps):
+      batch_outs = f(ins)
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
+      if step == 0:
+        for batch_out in batch_outs:
+          unconcatenated_outs.append([])
+      for i, batch_out in enumerate(batch_outs):
+        unconcatenated_outs[i].append(batch_out)
+      if verbose == 1:
+        progbar.update(step + 1)
+    if len(unconcatenated_outs) == 1:
+      return np.concatenate(unconcatenated_outs[0], axis=0)
+    return [
+        np.concatenate(unconcatenated_outs[i], axis=0)
+        for i in range(len(unconcatenated_outs))
+    ]
+  else:
+    # Sample-based predictions.
+    outs = []
+    batches = make_batches(num_samples, batch_size)
+    index_array = np.arange(num_samples)
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      if ins and isinstance(ins[-1], int):
+        # Do not slice the training phase flag.
+        ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+      else:
+        ins_batch = slice_arrays(ins, batch_ids)
+      for i in indices_for_conversion_to_dense:
+        ins_batch[i] = ins_batch[i].toarray()
+
+      batch_outs = f(ins_batch)
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
+      if batch_index == 0:
+        # Pre-allocate the results arrays.
+        for batch_out in batch_outs:
+          shape = (num_samples,) + batch_out.shape[1:]
+          outs.append(np.zeros(shape, dtype=batch_out.dtype))
+      for i, batch_out in enumerate(batch_outs):
+        outs[i][batch_start:batch_end] = batch_out
+      if verbose == 1:
+        progbar.update(batch_end)
+    if len(outs) == 1:
+      return outs[0]
+    return outs
+
+
+def test_loop(model, inputs, targets,
+              sample_weights=None,
+              batch_size=None,
+              verbose=0,
+              steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model: Keras Model instance.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      batch_size: integer batch size or `None`.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring predictions finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+  """
+  model._make_test_function()
+  f = model.test_function
+
+  sample_weights = sample_weights or []
+  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+    ins = inputs + targets + sample_weights + [0]
+  else:
+    ins = inputs + targets + sample_weights
+
+  if hasattr(model, 'metrics'):
+    for m in model.metrics:
+      if isinstance(m, Layer):
+        m.reset_states()
+    stateful_metric_indices = [
+        i for i, name in enumerate(model.metrics_names)
+        if str(name) in model.stateful_metric_names
+    ]
+  else:
+    stateful_metric_indices = []
+
+  num_samples = training_utils.check_num_samples(
+      ins, batch_size, steps, 'steps')
+  outs = []
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  # To prevent a slowdown, we find beforehand the arrays that need conversion.
+  feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
+  indices_for_conversion_to_dense = []
+  for i in range(len(feed)):
+    if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+      indices_for_conversion_to_dense.append(i)
+
+  if steps is not None:
+    for step in range(steps):
+      batch_outs = f(ins)
+      if isinstance(batch_outs, list):
+        if step == 0:
+          for _ in enumerate(batch_outs):
+            outs.append(0.)
+        for i, batch_out in enumerate(batch_outs):
+          if i in stateful_metric_indices:
+            outs[i] = batch_out
+          else:
+            outs[i] += batch_out
+      else:
+        if step == 0:
+          outs.append(0.)
+        outs[0] += batch_outs
+      if verbose == 1:
+        progbar.update(step + 1)
+    for i in range(len(outs)):
+      if i not in stateful_metric_indices:
+        outs[i] /= steps
+  else:
+    batches = make_batches(num_samples, batch_size)
+    index_array = np.arange(num_samples)
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      if isinstance(ins[-1], int):
+        # Do not slice the training phase flag.
+        ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+      else:
+        ins_batch = slice_arrays(ins, batch_ids)
+      for i in indices_for_conversion_to_dense:
+        ins_batch[i] = ins_batch[i].toarray()
+
+      batch_outs = f(ins_batch)
+
+      if isinstance(batch_outs, list):
+        if batch_index == 0:
+          for batch_out in enumerate(batch_outs):
+            outs.append(0.)
+        for i, batch_out in enumerate(batch_outs):
+          if i in stateful_metric_indices:
+            outs[i] = batch_out
+          else:
+            outs[i] += batch_out * len(batch_ids)
+      else:
+        if batch_index == 0:
+          outs.append(0.)
+        outs[0] += batch_outs * len(batch_ids)
+      if verbose == 1:
+        progbar.update(batch_end)
+    for i in range(len(outs)):
+      if i not in stateful_metric_indices:
+        outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 3507f36e14de28e1049895da5cbfd036dbb414f7..34adeb7599d657b337cc8499355d1f6834cbaa8d 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -12,23 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras training and evaluation routines.
+"""Keras training and evaluation routines for eager execution.
 """
 # pylint: disable=protected-access
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import copy
+
 import numpy as np
+
 from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
-from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras._impl.keras.engine import training_utils
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
 
 
 def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
@@ -54,7 +59,7 @@ def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
 
 
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
-  with K.name_scope(output_name + '_loss'):
+  with backend.name_scope(output_name + '_loss'):
     loss = loss_fn(targets, outputs)
   return loss
 
@@ -82,7 +87,7 @@ def _eager_metrics_fn(model, outputs, targets):
     output_metrics = model.nested_metrics[i]
     for nested_output_metric in output_metrics:
       metric_name, metric_fn = _get_metrics_info(
-          nested_output_metric, K.int_shape(model.outputs[i]),
+          nested_output_metric, backend.int_shape(model.outputs[i]),
           model.loss_functions[i])
 
       if len(model.output_names) > 1:
@@ -90,23 +95,23 @@ def _eager_metrics_fn(model, outputs, targets):
         if metric_name not in model.metrics_names:
           model.metrics_names.append(metric_name)
 
-      with K.name_scope(metric_name):
-        metric_result = metric_fn(outputs[i], targets[i])
+      with backend.name_scope(metric_name):
+        metric_result = metric_fn(targets[i], outputs[i])
         metric_names.append(metric_name)
-        metric_results.append(K.mean(metric_result))
+        metric_results.append(backend.mean(metric_result))
 
-  return metric_names, metric_results
+  return metric_results
 
 
-def _model_loss(model, inputs, targets, training=False):
+def _model_loss(model, inputs, targets, sample_weights=None, training=False):
   """Calculates the loss for a given model.
 
   Arguments:
-     model: The model on which metrics are being calculated.
-     inputs: The inputs of the given model. This is typically the mini batch of
-              data that is fed to the model.
-     targets: The predictions or targets of the given model.
-     training: Whether the model should be run in inference or training mode.
+      model: The model on which metrics are being calculated.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      training: Whether the model should be run in inference or training mode.
 
   Returns:
      Returns the model output, total loss and loss value calculated using the
@@ -131,33 +136,27 @@ def _model_loss(model, inputs, targets, training=False):
     targets = [targets]
 
   loss_metrics = []
-  with K.name_scope('loss'):
+  with backend.name_scope('loss'):
     for i, loss_fn in enumerate(model.loss_functions):
-      # compute the loss
-      output_loss = _eager_loss_fn(outs[i], targets[i], loss_fn,
-                                   model.output_names[i])
-      loss_metrics.append(K.mean(output_loss))
+      if sample_weights:
+        weights = sample_weights[i]
+      else:
+        weights = None
 
+      # TODO(fchollet): support masking; in practice `_keras_mask` is never
+      # set in this context currently.
       mask = outs[i]._keras_mask
-      # adapted from weighted_loss_fn
-      if mask is not None:
-        # mask should have the same shape as output_loss
-        output_loss *= mask
-        #  the loss per batch should be proportional
-        #  to the number of unmasked samples.
-        output_loss /= K.mean(mask)
-
-      # adapted from weighted_loss_fn
-      # apply sample weighting
-      if model.sample_weights:
-        # reduce score_array to same ndim as weight array
-        ndim = K.ndim(output_loss)
-        weight_ndim = K.ndim(model.sample_weights)
-        output_loss = K.mean(output_loss, axis=list(range(weight_ndim, ndim)))
-        output_loss *= model.sample_weights
-        output_loss /= K.mean(K.cast(K.not_equal(model.sample_weights, 0),
-                                     K.floatx()))
-        output_loss = K.mean(output_loss)
+
+      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
+      with backend.name_scope(model.output_names[i] + '_loss'):
+        output_loss = weighted_masked_fn(
+            targets[i], outs[i], weights, mask=mask)
+      # If the number of outputs is 1 then we don't append the loss metric
+      # associated with each model output. When there are multiple outputs
+      # associated with a model, each output's loss is calculated and returned
+      # as part of the loss_metrics.
+      if len(model.outputs) > 1:
+        loss_metrics.append(backend.mean(output_loss))
 
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
@@ -165,7 +164,7 @@ def _model_loss(model, inputs, targets, training=False):
       else:
         total_loss += loss_weight * output_loss
 
-    total_loss = K.mean(total_loss)
+    total_loss = backend.mean(total_loss)
     # Add regularization losses
     custom_losses = []
     for layer in model.layers:
@@ -178,16 +177,55 @@ def _model_loss(model, inputs, targets, training=False):
   return outs, total_loss, loss_metrics
 
 
-def _process_single_batch(eager_model_inputs, eager_model_outputs, model,
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they folow  the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    converted_to_list = False
+    if not isinstance(arrays, list):
+      converted_to_list = True
+      arrays = [arrays]
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+    if converted_to_list:
+      slices = slices[0]
+    return slices
+  else:
+    return generic_utils.slice_arrays(arrays, indices)
+
+
+def _process_single_batch(model,
+                          inputs,
+                          targets,
+                          sample_weights=None,
                           training=False):
   """Calculate the loss and gradient for one input batch.
 
      The model weights are updated if training is set to True.
 
   Arguments:
-      eager_model_inputs: Input batch data.
-      eager_model_outputs: Output batch data.
       model: Model whose loss has to be calculated.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
       training: The boolean represents if the weights of the model are updated.
               'fit' methods will set this to True while 'evaluate' methods will
               set this to False.
@@ -196,84 +234,83 @@ def _process_single_batch(eager_model_inputs, eager_model_outputs, model,
       output of the model, total loss and the loss associated with each output.
 
   Raises:
-      ValueError: If the model loss is 0 or if the trainable weights list is
-                  empty when the trainable parameter is set to True.
+      ValueError: If the model has no loss to optimize.
   """
-  K.set_learning_phase(training)
-  with GradientTape() as tape:
-    outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
-                                           eager_model_outputs,
-                                           training=training)
-    if loss is None:
-      raise ValueError('The model cannot be run '
-                       'because it has no loss to optimize.')
-  if training:
-    if not model._collected_trainable_weights:
-      raise ValueError('The list of trainable weights is empty. Make sure that '
-                       'you are not setting model.trainable to False before '
-                       'compiling the model.')
-    grads = tape.gradient(loss, model._collected_trainable_weights)
-    model.optimizer.apply_gradients(zip(grads,
-                                        model._collected_trainable_weights))
-  return outs, loss, loss_metrics
-
-
-def train_on_batch(model, ins):
+  with backend.learning_phase_scope(1 if training else 0):
+    with GradientTape() as tape:
+      outs, loss, loss_metrics = _model_loss(model, inputs, targets,
+                                             sample_weights=sample_weights,
+                                             training=training)
+      if loss is None:
+        raise ValueError('The model cannot be run '
+                         'because it has no loss to optimize.')
+    if training:
+      if not model._collected_trainable_weights:
+        logging.warning('The list of trainable weights is empty. Make sure that'
+                        ' you are not setting model.trainable to False before '
+                        'compiling the model.')
+      else:
+        grads = tape.gradient(loss, model._collected_trainable_weights)
+        model.optimizer.apply_gradients(zip(grads,
+                                            model._collected_trainable_weights))
+    return outs, loss, loss_metrics
+
+
+def train_on_batch(model, inputs, targets, sample_weights=None):
   """Calculates the loss and gradient updates for one input batch.
 
   Arguments:
-      model: Given model on which loss and gradients are calculated.
-      ins: Input and output batch numpy arrays.
+      model: Model whose loss has to be calculated.
+      inputs: Input batch data.
+      targets: Target batch data.
+      sample_weights: Sample weight batch data.
 
   Returns:
       total loss and the loss associated with each output.
   """
-  ins_batch_converted = []
-  for ib in ins:
-    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
-  eager_model_inputs = []
-  eager_model_outputs = []
-  for i in range(len(model.inputs)):
-    eager_model_inputs.append(ins_batch_converted[i])
-  for i in range(len(model.inputs), len(ins_batch_converted)):
-    eager_model_outputs.append(ins_batch_converted[i])
+  inputs = [
+      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs]
+  targets = [
+      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets]
+  sample_weights = [
+      ops.convert_to_tensor(val, dtype=backend.floatx())
+      if val is not None else None for val in sample_weights]
   outs, loss, _ = _process_single_batch(
-      eager_model_inputs, eager_model_outputs, model, training=True)
+      model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
-  _, metrics_results = _eager_metrics_fn(
-      model, outs, eager_model_outputs)
+  metrics_results = _eager_metrics_fn(
+      model, outs, targets)
   if not isinstance(loss, list):
     loss = [loss]
   return loss + metrics_results
 
 
-def test_on_batch(model, ins):
+def test_on_batch(model, inputs, targets, sample_weights=None):
   """Calculates the loss for one input batch.
 
   Arguments:
-      model: Given model on which loss is calculated.
-      ins: Input and output batch numpy arrays.
+      model: Model whose loss has to be calculated.
+      inputs: Input batch data.
+      targets: Target batch data.
+      sample_weights: Sample weight batch data.
 
   Returns:
       total loss, loss and metrics associated with each output.
   """
-  ins_batch_converted = []
-  for ib in ins:
-    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
-  eager_model_inputs = []
-  eager_model_outputs = []
-  for i in range(len(model.inputs)):
-    eager_model_inputs.append(ins_batch_converted[i])
-  for i in range(len(model.inputs), len(ins_batch_converted)):
-    eager_model_outputs.append(ins_batch_converted[i])
+  inputs = [
+      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs]
+  targets = [
+      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets]
+  sample_weights = [
+      ops.convert_to_tensor(val, dtype=backend.floatx())
+      if val is not None else None for val in sample_weights]
   outs, loss, loss_metrics = _process_single_batch(
-      eager_model_inputs, eager_model_outputs, model, training=False)
+      model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  metric_names, metrics_results = _eager_metrics_fn(
-      model, outs, eager_model_outputs)
-  model.metrics_names.append(metric_names)
+  metrics_results = _eager_metrics_fn(
+      model, outs, targets)
   if not isinstance(loss, list):
     loss = [loss]
   return loss + loss_metrics + metrics_results
@@ -281,32 +318,35 @@ def test_on_batch(model, ins):
 
 def fit_loop(
     model,
-    ins,
-    out_labels=None,
+    inputs,
+    targets,
+    sample_weights=None,
+    val_inputs=None,
+    val_targets=None,
+    val_sample_weights=None,
     batch_size=None,
     epochs=100,
     verbose=1,
     callbacks=None,
-    val_ins=None,
     shuffle=True,
     callback_metrics=None,
     initial_epoch=0,
     steps_per_epoch=None,
     validation_steps=None):
-  """Abstract fit function for `f(ins)`.
-
-  Assume that f returns a list, labeled by out_labels.
+  """Abstract fit function for eager execution.
 
   Arguments:
       model: Instance of the model that is being executed in Eager mode.
-      ins: List of tensors to be fed to `f`
-      out_labels: List of strings, display names of
-          the outputs of `f`
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      val_inputs: Input data for validation.
+      val_targets: Target data for validation.
+      val_sample_weights: Sample weight data for validation.
       batch_size: Integer batch size or None if unknown.
       epochs: Number of times to iterate over the data
       verbose: Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
-      val_ins: List of tensors to be fed to `val_f`
       shuffle: Whether to shuffle the data at the beginning of each epoch
       callback_metrics: List of strings, the display names of the metrics
           passed to the callbacks. They should be the
@@ -326,166 +366,184 @@ def fit_loop(
   Raises:
     ValueError: In case of invalid argument values.
   """
+  if not batch_size:
+    raise ValueError('With eager execution, `batch_size` should be specified.')
+  if steps_per_epoch or validation_steps:
+    raise ValueError('With eager execution, `steps_per_epoch` and '
+                     '`validation_steps` are not valid arguments '
+                     '(set `batch_size` instead).')
   # Required for Eager mode
-  K.set_learning_phase(True)
-
-  do_validation = False
-  if val_ins:
-    do_validation = True
-    if (verbose and ins and hasattr(ins[0], 'shape') and
-        hasattr(val_ins[0], 'shape')):
-      print('Train on %d samples, validate on %d samples' %
-            (ins[0].shape[0], val_ins[0].shape[0]))
-  if validation_steps:
-    if steps_per_epoch is None:
-      raise ValueError('Can only use `validation_steps` when doing step-wise '
-                       'training, i.e. `steps_per_epoch` must be set.')
-    do_validation = True
-
-  num_train_samples = model._check_num_samples(
-      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-
-  if num_train_samples is not None:
-    index_array = np.arange(num_train_samples)
-
-  model.history = cbks.History()
-  callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
-  if verbose:
-    if steps_per_epoch is not None:
-      count_mode = 'steps'
+  with backend.learning_phase_scope(1):
+    do_validation = False
+    if val_inputs:
+      do_validation = True
+      if (verbose and inputs and hasattr(inputs[0], 'shape') and
+          hasattr(val_inputs[0], 'shape')):
+        print('Train on %d samples, validate on %d samples' %
+              (inputs[0].shape[0], val_inputs[0].shape[0]))
+    if validation_steps:
+      if steps_per_epoch is None:
+        raise ValueError('Can only use `validation_steps` when doing step-wise '
+                         'training, i.e. `steps_per_epoch` must be set.')
+      do_validation = True
+
+    out_labels = model.metrics_names
+    if do_validation:
+      callback_metrics = copy.copy(out_labels) + [
+          'val_' + n for n in out_labels
+      ]
     else:
-      count_mode = 'samples'
-    callbacks += [cbks.ProgbarLogger(count_mode)]
-  callbacks = cbks.CallbackList(callbacks)
-  out_labels = out_labels or []
-
-  # it's possible to callback a different model than self
-  # (used by Sequential models)
-  if hasattr(model, 'callback_model') and model.callback_model:
-    callback_model = model.callback_model
-  else:
-    callback_model = model
-
-  callbacks.set_model(callback_model)
-
-  callbacks.set_params({
-      'batch_size': batch_size,
-      'epochs': epochs,
-      'steps': steps_per_epoch,
-      'samples': num_train_samples,
-      'verbose': verbose,
-      'do_validation': do_validation,
-      'metrics': callback_metrics or [],
-  })
-  callbacks.on_train_begin()
-  callback_model.stop_training = False
-  for cbk in callbacks:
-    cbk.validation_data = val_ins
-
-  for epoch in range(initial_epoch, epochs):
-    callbacks.on_epoch_begin(epoch)
-    epoch_logs = {}
-    if shuffle == 'batch':
-      index_array = model._batch_shuffle(index_array, batch_size)
-    elif shuffle:
-      np.random.shuffle(index_array)
-
-    batches = make_batches(num_train_samples, batch_size)
+      callback_metrics = copy.copy(out_labels)
 
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      try:
-        if isinstance(ins[-1], float):
-          # Do not slice the training phase flag.
-          ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-        else:
-          ins_batch = slice_arrays(ins, batch_ids)
-      except TypeError:
-        raise TypeError('TypeError while preparing batch. '
-                        'If using HDF5 input data, '
-                        'pass shuffle="batch".')
-      batch_logs = {}
-      batch_logs['batch'] = batch_index
-      batch_logs['size'] = len(batch_ids)
-
-      callbacks.on_batch_begin(batch_index, batch_logs)
-
-      ins_batch_converted = []
-      for ib in ins_batch:
-        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
-      eager_model_inputs = []
-      eager_model_outputs = []
-      for i in range(len(model.inputs)):
-        eager_model_inputs.append(ins_batch_converted[i])
-
-      for i in range(len(model.inputs), len(ins_batch_converted)):
-        eager_model_outputs.append(ins_batch_converted[i])
-
-      outs, loss, loss_metrics = _process_single_batch(eager_model_inputs,
-                                                       eager_model_outputs,
-                                                       model,
-                                                       training=True)
-
-      if not isinstance(outs, list):
-        outs = [outs]
-
-      for l, o in zip(out_labels, outs):
-        batch_logs[l] = o
-      # Required for Eager mode
-      metrics_names, metrics_results = _eager_metrics_fn(model, outs,
-                                                         eager_model_outputs)
-      batch_logs['loss'] = tensor_util.constant_value(K.mean(loss))
-
-      # TODO(anjalisridhar): Move this to compile to avoid duplicate code.
-      # In graph mode we set the metric names in compile. However in
-      # Eager mode we calculate the metrics for each batch in fit_loop.
-      # We could calculate the metric names and functions in compile.
-      # This would avoid setting the callback parameters separately.
-      # We need to do this for the first iteration alone
-      for m in metrics_names:
-        if m not in callback_metrics:
-          callback_metrics.append(m)
-
-      callbacks.set_params({
-          'batch_size': batch_size,
-          'epochs': epochs,
-          'steps': steps_per_epoch,
-          'samples': num_train_samples,
-          'verbose': verbose,
-          'do_validation': do_validation,
-          'metrics': callback_metrics or [],
-      })
-
-      for k, v in zip(model.metrics_names,
-                      [K.mean(loss)] + loss_metrics + metrics_results):
-        batch_logs[k] = tensor_util.constant_value(v)
-
-      callbacks.on_batch_end(batch_index, batch_logs)
+    if sample_weights:
+      feed_data = inputs + targets + sample_weights
+    else:
+      feed_data = inputs + targets
+    num_train_samples = training_utils.check_num_samples(
+        feed_data,
+        batch_size=batch_size,
+        steps=steps_per_epoch,
+        steps_name='steps_per_epoch')
+
+    if num_train_samples is not None:
+      index_array = np.arange(num_train_samples)
+
+    model.history = cbks.History()
+    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
+    if verbose:
+      if steps_per_epoch is not None:
+        count_mode = 'steps'
+      else:
+        count_mode = 'samples'
+      callbacks += [cbks.ProgbarLogger(count_mode)]
+    callbacks = cbks.CallbackList(callbacks)
+
+    # it's possible to callback a different model than self
+    # (used by Sequential models)
+    if hasattr(model, 'callback_model') and model.callback_model:
+      callback_model = model.callback_model
+    else:
+      callback_model = model
+
+    callbacks.set_model(callback_model)
+
+    callbacks.set_params({
+        'batch_size': batch_size,
+        'epochs': epochs,
+        'steps': steps_per_epoch,
+        'samples': num_train_samples,
+        'verbose': verbose,
+        'do_validation': do_validation,
+        'metrics': callback_metrics or [],
+    })
+    callbacks.on_train_begin()
+    callback_model.stop_training = False
+    for cbk in callbacks:
+      if not val_inputs:
+        cbk.validation_data = []
+      elif val_sample_weights:
+        cbk.validation_data = val_inputs + val_targets + val_sample_weights
+      else:
+        cbk.validation_data = val_inputs + val_targets
+
+    for epoch in range(initial_epoch, epochs):
+      callbacks.on_epoch_begin(epoch)
+      epoch_logs = {}
+      if shuffle == 'batch':
+        index_array = model._batch_shuffle(index_array, batch_size)
+      elif shuffle:
+        np.random.shuffle(index_array)
+
+      batches = generic_utils.make_batches(num_train_samples, batch_size)
+
+      for batch_index, (batch_start, batch_end) in enumerate(batches):
+        batch_ids = index_array[batch_start:batch_end]
+        try:
+          inputs_batch = slice_arrays(inputs, batch_ids,
+                                      contiguous=not shuffle)
+          targets_batch = slice_arrays(targets, batch_ids,
+                                       contiguous=not shuffle)
+          if sample_weights:
+            sample_weights_batch = slice_arrays(sample_weights, batch_ids,
+                                                contiguous=not shuffle)
+          else:
+            sample_weights_batch = None
+        except TypeError:
+          raise TypeError('TypeError while preparing batch. '
+                          'If using HDF5 input data, '
+                          'pass shuffle="batch".')
+        batch_logs = {}
+        batch_logs['batch'] = batch_index
+        batch_logs['size'] = len(batch_ids)
+
+        callbacks.on_batch_begin(batch_index, batch_logs)
+
+        inputs_batch = [
+            ops.convert_to_tensor(val, dtype=backend.floatx())
+            for val in inputs_batch]
+        targets_batch = [
+            ops.convert_to_tensor(val, dtype=backend.floatx())
+            for val in targets_batch]
+        if sample_weights:
+          sample_weights_batch = [
+              ops.convert_to_tensor(val, dtype=backend.floatx())
+              if val is not None else None
+              for val in sample_weights_batch]
+
+        outs, loss, loss_metrics = _process_single_batch(
+            model,
+            inputs_batch,
+            targets_batch,
+            sample_weights=sample_weights_batch,
+            training=True)
+
+        if not isinstance(outs, list):
+          outs = [outs]
+
+        for l, o in zip(out_labels, outs):
+          batch_logs[l] = o
+        # Required for Eager mode
+        metrics_results = _eager_metrics_fn(model, outs, targets_batch)
+        batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
+
+        for k, v in zip(model.metrics_names,
+                        [backend.mean(loss)] + loss_metrics + metrics_results):
+          batch_logs[k] = tensor_util.constant_value(v)
+        callbacks.on_batch_end(batch_index, batch_logs)
+        if callback_model.stop_training:
+          break
+
+        if batch_index == len(batches) - 1:  # Last batch.
+          if do_validation:
+            val_outs = test_loop(
+                model, val_inputs, val_targets,
+                sample_weights=val_sample_weights,
+                batch_size=batch_size,
+                verbose=0)
+            if not isinstance(val_outs, list):
+              val_outs = [val_outs]
+            # Same labels assumed.
+            for l, o in zip(out_labels, val_outs):
+              epoch_logs['val_' + l] = o
+      callbacks.on_epoch_end(epoch, epoch_logs)
       if callback_model.stop_training:
         break
+    callbacks.on_train_end()
+    return model.history
 
-      if batch_index == len(batches) - 1:  # Last batch.
-        if do_validation:
-          val_outs = test_loop(
-              model, val_ins, batch_size=batch_size, verbose=0)
-          if not isinstance(val_outs, list):
-            val_outs = [val_outs]
-          # Same labels assumed.
-          for l, o in zip(out_labels, val_outs):
-            epoch_logs['val_' + l] = o
-    callbacks.on_epoch_end(epoch, epoch_logs)
-    if callback_model.stop_training:
-      break
-  callbacks.on_train_end()
-  return model.history
-
-
-def test_loop(model, ins, batch_size=None, verbose=0, steps=None):
+
+def test_loop(model, inputs, targets,
+              sample_weights=None,
+              batch_size=None,
+              verbose=0,
+              steps=None):
   """Abstract method to loop over some data in batches.
 
   Arguments:
       model: Model instance that is being evaluated in Eager mode.
-      ins: list of tensors to be fed to `f`.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
       batch_size: integer batch size or `None`.
       verbose: verbosity mode.
       steps: Total number of steps (batches of samples)
@@ -498,69 +556,79 @@ def test_loop(model, ins, batch_size=None, verbose=0, steps=None):
       and/or metrics). The attribute `model.metrics_names` will give you
       the display labels for the scalar outputs.
   """
-  K.set_learning_phase(False)
-  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
-  outs = []
-  if verbose == 1:
-    progbar = Progbar(target=num_samples)
-  batches = make_batches(num_samples, batch_size)
-  index_array = np.arange(num_samples)
-  for batch_index, (batch_start, batch_end) in enumerate(batches):
-    batch_ids = index_array[batch_start:batch_end]
-    if isinstance(ins[-1], float):
-      # Do not slice the training phase flag.
-      ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-    else:
-      ins_batch = slice_arrays(ins, batch_ids)
-
-    ins_batch_converted = []
-    for ib in ins_batch:
-      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
-
-    eager_model_inputs = []
-    eager_model_outputs = []
-    for i in range(len(model.inputs)):
-      eager_model_inputs.append(ins_batch_converted[i])
-
-    for i in range(len(model.inputs), len(ins_batch_converted)):
-      eager_model_outputs.append(ins_batch_converted[i])
-
-    loss_outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
-                                                eager_model_outputs,
-                                                training=False)
-    _, metrics_results = _eager_metrics_fn(model, loss_outs,
-                                           eager_model_outputs)
-    batch_outs = []
-    for _, v in zip(model.metrics_names,
-                    [K.mean(loss)] + loss_metrics + metrics_results):
-      batch_outs.append(tensor_util.constant_value(v))
-
-    if isinstance(batch_outs, list):
-      if batch_index == 0:
-        for batch_out in enumerate(batch_outs):
+  with backend.learning_phase_scope(0):
+    feed_data = inputs + targets
+    if sample_weights:
+      feed_data += sample_weights
+    num_samples = training_utils.check_num_samples(
+        feed_data, batch_size=batch_size, steps=steps, steps_name='steps')
+    outs = []
+    if verbose == 1:
+      progbar = generic_utils.Progbar(target=num_samples)
+    batches = generic_utils.make_batches(num_samples, batch_size)
+    index_array = np.arange(num_samples)
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      inputs_batch = slice_arrays(inputs, batch_ids)
+      targets_batch = slice_arrays(targets, batch_ids)
+      if sample_weights:
+        sample_weights_batch = slice_arrays(sample_weights, batch_ids)
+      else:
+        sample_weights_batch = None
+
+      inputs_batch = [
+          ops.convert_to_tensor(val, dtype=backend.floatx())
+          for val in inputs_batch]
+      targets_batch = [
+          ops.convert_to_tensor(val, dtype=backend.floatx())
+          for val in targets_batch]
+      if sample_weights:
+        sample_weights_batch = [
+            ops.convert_to_tensor(val, dtype=backend.floatx())
+            if val is not None else None
+            for val in sample_weights_batch]
+
+      loss_outs, loss, loss_metrics = _model_loss(
+          model,
+          inputs_batch,
+          targets_batch,
+          sample_weights=sample_weights_batch,
+          training=False)
+      metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
+      batch_outs = []
+      for _, v in zip(model.metrics_names,
+                      [backend.mean(loss)] + loss_metrics + metrics_results):
+        batch_outs.append(tensor_util.constant_value(v))
+
+      if isinstance(batch_outs, list):
+        if batch_index == 0:
+          for batch_out in enumerate(batch_outs):
+            outs.append(0.)
+        for i, batch_out in enumerate(batch_outs):
+          outs[i] += batch_out * len(batch_ids)
+      else:
+        if batch_index == 0:
           outs.append(0.)
-      for i, batch_out in enumerate(batch_outs):
-        outs[i] += batch_out * len(batch_ids)
-    else:
-      if batch_index == 0:
-        outs.append(0.)
-      outs[0] += batch_outs * len(batch_ids)
+        outs[0] += batch_outs * len(batch_ids)
 
-    if verbose == 1:
-      progbar.update(batch_end)
-  for i in range(len(outs)):
-    outs[i] /= num_samples
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+      if verbose == 1:
+        progbar.update(batch_end)
+    for i in range(len(outs)):
+      outs[i] /= num_samples
+    if len(outs) == 1:
+      return outs[0]
+    return outs
 
 
-def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
+def predict_loop(model, inputs,
+                 batch_size=32,
+                 verbose=0,
+                 steps=None):
   """Abstract method to loop over some data in batches.
 
   Arguments:
       model:
-      ins: list of tensors to be fed to `f`.
+      inputs: List of input arrays.
       batch_size: integer batch size.
       verbose: verbosity mode.
       steps: Total number of steps (batches of samples)
@@ -572,57 +640,50 @@ def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
       or list of arrays of predictions
       (if the model has multiple outputs).
   """
-  K.set_learning_phase(False)
-  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
-  if verbose == 1:
-    if steps is not None:
-      progbar = Progbar(target=steps)
-    else:
-      progbar = Progbar(target=num_samples)
-
-  outs = []
-  batches = make_batches(num_samples, batch_size)
-  index_array = np.arange(num_samples)
-  for batch_index, (batch_start, batch_end) in enumerate(batches):
-    batch_ids = index_array[batch_start:batch_end]
-    if ins and isinstance(ins[-1], float):
-      # Do not slice the training phase flag.
-      ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-    else:
-      ins_batch = slice_arrays(ins, batch_ids)
+  with backend.learning_phase_scope(0):
+    num_samples = training_utils.check_num_samples(
+        inputs, batch_size, steps, 'steps')
+    if verbose == 1:
+      if steps is not None:
+        progbar = generic_utils.Progbar(target=steps)
+      else:
+        progbar = generic_utils.Progbar(target=num_samples)
 
-    ins_batch_converted = []
-    for ib in ins_batch:
-      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+    outs = []
+    batches = generic_utils.make_batches(num_samples, batch_size)
+    index_array = np.arange(num_samples)
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      inputs_batch = slice_arrays(inputs, batch_ids)
 
-    eager_model_inputs = []
-    for i in range(len(model.inputs)):
-      eager_model_inputs.append(ins_batch_converted[i])
+      inputs_batch = [
+          ops.convert_to_tensor(val, dtype=backend.floatx())
+          for val in inputs_batch]
 
-    if len(eager_model_inputs) == 1:
-      if model._expects_training_arg:
-        batch_outs = model.call(eager_model_inputs[0], training=False)
-      else:
-        batch_outs = model.call(eager_model_inputs[0])
-    else:
-      if model._expects_training_arg:
-        batch_outs = model.call(eager_model_inputs, training=False)
+      if len(inputs_batch) == 1:
+        if model._expects_training_arg:
+          batch_outs = model.call(inputs_batch[0], training=False)
+        else:
+          batch_outs = model.call(inputs_batch[0])
       else:
-        batch_outs = model.call(eager_model_inputs)
-
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-    if batch_index == 0:
-      # Pre-allocate the results arrays.
-      for batch_out in batch_outs:
-        dims = batch_out.shape[1:].dims
-        dims_list = [d.value for d in dims]
-        shape = (num_samples,) + tuple(dims_list)
-        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
-    for i, batch_out in enumerate(batch_outs):
-      outs[i][batch_start:batch_end] = batch_out
-    if verbose == 1:
-      progbar.update(batch_end)
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+        if model._expects_training_arg:
+          batch_outs = model.call(inputs_batch, training=False)
+        else:
+          batch_outs = model.call(inputs_batch)
+
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
+      if batch_index == 0:
+        # Pre-allocate the results arrays.
+        for batch_out in batch_outs:
+          dims = batch_out.shape[1:].dims
+          dims_list = [d.value for d in dims]
+          shape = (num_samples,) + tuple(dims_list)
+          outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+      for i, batch_out in enumerate(batch_outs):
+        outs[i][batch_start:batch_end] = batch_out
+      if verbose == 1:
+        progbar.update(batch_end)
+    if len(outs) == 1:
+      return outs[0]
+    return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index 45601f964a090fd927a22eb525d3c1c154fd71db..5adb3ef94086f6aa3ff299c20c5b924b8438916a 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -18,13 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
@@ -213,7 +212,7 @@ class TrainingTest(test.TestCase):
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     loss_weights = [1., 0.5]
-    metrics = ['mae']
+    metrics = ['acc', 'mae']
     model.compile(
         optimizer,
         loss,
@@ -232,20 +231,20 @@ class TrainingTest(test.TestCase):
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=0)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.evaluate(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=1)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.evaluate(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=2)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.test_on_batch([input_a_np, input_b_np],
                               [output_d_np, output_e_np])
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
 
     # Test evaluate with dictionary inputs
     model.evaluate(
@@ -309,16 +308,109 @@ class TrainingTest(test.TestCase):
       model.compile(loss=None,
                     optimizer='rms')
 
+  def test_model_methods_with_eager_tensors_multi_io(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a = keras.backend.zeros(shape=(10, 3))
+    input_b = keras.backend.zeros(shape=(10, 3))
+    target_d = keras.backend.zeros(shape=(10, 4))
+    target_e = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    # Test: no shuffle.
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    # Test: validation data.
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+    # Test: mix np and tensors.
+    input_b = np.zeros(shape=(10, 3)).astype('float32')
+    target_e = np.zeros(shape=(10, 4)).astype('float32')
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+  def test_model_methods_with_eager_tensors_single_io(self):
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
+    model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
+    model.fit(inputs, targets, epochs=1, batch_size=4, verbose=0,
+              validation_data=(inputs, targets))
+    model.evaluate(inputs, targets, batch_size=2, verbose=0)
+    model.predict(inputs, batch_size=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+
 
 class LossWeightingTest(test.TestCase):
 
   def test_class_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
     weighted_class = 3
-    train_samples = 3000
-    test_samples = 3000
+    train_samples = 300
+    test_samples = 300
     input_dim = 5
 
     model = keras.models.Sequential()
@@ -343,16 +435,16 @@ class LossWeightingTest(test.TestCase):
     test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
     class_weight = dict([(i, 1.) for i in range(num_classes)])
-    class_weight[weighted_class] = 2.
+    class_weight[weighted_class] = 4.
 
     sample_weight = np.ones((y_train.shape[0]))
-    sample_weight[int_y_train == weighted_class] = 2.
+    sample_weight[int_y_train == weighted_class] = 4.
 
     model.fit(
         x_train,
         y_train,
         batch_size=batch_size,
-        epochs=epochs // 3,
+        epochs=2,
         verbose=0,
         class_weight=class_weight,
         validation_data=(x_train, y_train, sample_weight))
@@ -360,14 +452,14 @@ class LossWeightingTest(test.TestCase):
         x_train,
         y_train,
         batch_size=batch_size,
-        epochs=epochs // 2,
+        epochs=2,
         verbose=0,
         class_weight=class_weight)
     model.fit(
         x_train,
         y_train,
         batch_size=batch_size,
-        epochs=epochs // 2,
+        epochs=2,
         verbose=0,
         class_weight=class_weight,
         validation_split=0.1)
@@ -382,10 +474,9 @@ class LossWeightingTest(test.TestCase):
   def test_sample_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
     weighted_class = 3
-    train_samples = 3000
-    test_samples = 3000
+    train_samples = 300
+    test_samples = 300
     input_dim = 5
 
     model = keras.models.Sequential()
@@ -397,36 +488,32 @@ class LossWeightingTest(test.TestCase):
                   optimizer=RMSPropOptimizer(learning_rate=0.001))
 
     np.random.seed(43)
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+    (x_train, y_train), _ = testing_utils.get_test_data(
         train_samples=train_samples,
         test_samples=test_samples,
         input_shape=(input_dim,),
         num_classes=num_classes)
-    int_y_test = y_test.copy()
     int_y_train = y_train.copy()
-    # convert class vectors to binary class matrices
     y_train = keras.utils.to_categorical(y_train, num_classes)
-    y_test = keras.utils.to_categorical(y_test, num_classes)
-    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
     class_weight = dict([(i, 1.) for i in range(num_classes)])
-    class_weight[weighted_class] = 2.
+    class_weight[weighted_class] = 4.
 
     sample_weight = np.ones((y_train.shape[0]))
-    sample_weight[int_y_train == weighted_class] = 2.
+    sample_weight[int_y_train == weighted_class] = 4.
 
     model.fit(
         x_train,
         y_train,
         batch_size=batch_size,
-        epochs=epochs // 3,
+        epochs=2,
         verbose=0,
         sample_weight=sample_weight)
     model.fit(
         x_train,
         y_train,
         batch_size=batch_size,
-        epochs=epochs // 3,
+        epochs=2,
         verbose=0,
         sample_weight=sample_weight,
         validation_split=0.1)
@@ -539,218 +626,50 @@ class LossWeightingTest(test.TestCase):
       model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
 
 
-class TestDynamicTrainability(test.TestCase):
-
-  def test_trainable_warning(self):
-    x = np.random.random((5, 3))
-    y = np.random.random((5, 2))
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(2, input_dim=3))
-    model.trainable = False
-    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
-    model.trainable = True
-    with self.assertRaises(ValueError):
-      model.train_on_batch(x, y)
-
-  def test_trainable_argument(self):
-    x = np.random.random((5, 3))
-    y = np.random.random((5, 2))
-
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(2, input_dim=3, trainable=False))
-    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
-    out = model.predict(x)
-    with self.assertRaises(ValueError):
-      model.train_on_batch(x, y)
-    out_2 = model.predict(x)
-    self.assertAllClose(out, out_2)
-
-    # test with nesting
-    inputs = keras.layers.Input(shape=(3,))
-    output = model(inputs)
-    model = keras.models.Model(inputs, output)
-    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
-    out = model.predict(x)
-    with self.assertRaises(ValueError):
-      model.train_on_batch(x, y)
-    out_2 = model.predict(x)
-    self.assertAllClose(out, out_2)
-
-  def test_layer_trainability_switch(self):
-    # with constructor argument, in Sequential
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(2, trainable=False, input_dim=1))
-    self.assertListEqual(model.trainable_weights, [])
-
-    # by setting the `trainable` argument, in Sequential
-    model = keras.models.Sequential()
-    layer = keras.layers.Dense(2, input_dim=1)
-    model.add(layer)
-    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
-    layer.trainable = False
-    self.assertListEqual(model.trainable_weights, [])
-
-    # with constructor argument, in Model
-    x = keras.layers.Input(shape=(1,))
-    y = keras.layers.Dense(2, trainable=False)(x)
-    model = keras.models.Model(x, y)
-    self.assertListEqual(model.trainable_weights, [])
-
-    # by setting the `trainable` argument, in Model
-    x = keras.layers.Input(shape=(1,))
-    layer = keras.layers.Dense(2)
-    y = layer(x)
-    model = keras.models.Model(x, y)
-    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
-    layer.trainable = False
-    self.assertListEqual(model.trainable_weights, [])
-
-  def test_model_trainability_switch(self):
-    # a non-trainable model has no trainable weights
-    x = keras.layers.Input(shape=(1,))
-    y = keras.layers.Dense(2)(x)
-    model = keras.models.Model(x, y)
-    model.trainable = False
-    self.assertListEqual(model.trainable_weights, [])
-
-    # same for Sequential
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(2, input_dim=1))
-    model.trainable = False
-    self.assertListEqual(model.trainable_weights, [])
-
-  def test_nested_model_trainability(self):
-
-    # a Sequential inside a Model
-    inner_model = keras.models.Sequential()
-    inner_model.add(keras.layers.Dense(2, input_dim=1))
-
-    x = keras.layers.Input(shape=(1,))
-    y = inner_model(x)
-    outer_model = keras.models.Model(x, y)
-    self.assertListEqual(outer_model.trainable_weights,
-                         inner_model.trainable_weights)
-    inner_model.trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-    inner_model.trainable = True
-    inner_model.layers[-1].trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-
-    # a Sequential inside a Sequential
-    inner_model = keras.models.Sequential()
-    inner_model.add(keras.layers.Dense(2, input_dim=1))
-    outer_model = keras.models.Sequential()
-    outer_model.add(inner_model)
-    self.assertListEqual(outer_model.trainable_weights,
-                         inner_model.trainable_weights)
-    inner_model.trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-    inner_model.trainable = True
-    inner_model.layers[-1].trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-
-    # a Model inside a Model
-    x = keras.layers.Input(shape=(1,))
-    y = keras.layers.Dense(2)(x)
-    inner_model = keras.models.Model(x, y)
-    x = keras.layers.Input(shape=(1,))
-    y = inner_model(x)
-    outer_model = keras.models.Model(x, y)
-    self.assertListEqual(outer_model.trainable_weights,
-                         inner_model.trainable_weights)
-    inner_model.trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-    inner_model.trainable = True
-    inner_model.layers[-1].trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-
-    # a Model inside a Sequential
-    x = keras.layers.Input(shape=(1,))
-    y = keras.layers.Dense(2)(x)
-    inner_model = keras.models.Model(x, y)
-    outer_model = keras.models.Sequential()
-    outer_model.add(inner_model)
-    self.assertListEqual(outer_model.trainable_weights,
-                         inner_model.trainable_weights)
-    inner_model.trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-    inner_model.trainable = True
-    inner_model.layers[-1].trainable = False
-    self.assertListEqual(outer_model.trainable_weights, [])
-
-
-class TestTrainingUtils(test.TestCase):
-
-  def test_check_array_lengths(self):
-    keras.engine.training._check_array_lengths(None, None, None)
-    a_np = np.random.random((4, 3, 3))
-    keras.engine.training._check_array_lengths(a_np, a_np, a_np)
-    keras.engine.training._check_array_lengths(
-        [a_np, a_np], [a_np, a_np], [a_np, a_np])
-    keras.engine.training._check_array_lengths([None], [None], [None])
-
-    b_np = np.random.random((3, 4))
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths(a_np, None, None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths(a_np, a_np, None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths([a_np], [None], None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths([a_np], [b_np], None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths([a_np], None, [b_np])
-
-  def test_slice_arrays(self):
-    input_a = np.random.random((10, 3))
-    slice_arrays(None)
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = [None, [1, 1], None, [1, 1]]
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = [None]
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = None
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-
-  def test_fit_with_BatchNorm(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(10, input_dim=4))
-    model.add(keras.layers.BatchNormalization())
-    model.add(keras.layers.Activation('tanh'))
-    model.add(keras.layers.Dropout(0.2))
-
-    input_a_np = np.random.random((10, 4))
-    output_b_np = np.random.random((10, 10))
-
-    model.compile(loss='binary_crossentropy', optimizer=RMSPropOptimizer(0.001))
-    model.fit(input_a_np, output_b_np, epochs=1, batch_size=5, verbose=0)
-
-  def test_fit_with_regularization(self):
-    model = keras.models.Sequential()
-    with self.assertRaises(ValueError):
-      model.add(
-          keras.layers.Dense(4, input_dim=3,
-                             kernel_regularizer=keras.regularizers.l2(0.01),
-                             activity_regularizer=keras.regularizers.l1(0.01)))
-
+class CorrectnessTest(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_loss_correctness(self):
+    # Test that training loss is the same in eager and graph
+    # (by comparing it to a reference value in a deterministic case)
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(3,
+                                 activation='relu',
+                                 input_dim=4,
+                                 kernel_initializer='ones'))
+    model.add(keras.layers.Dense(2,
+                                 activation='softmax',
+                                 kernel_initializer='ones'))
+    model.compile(loss='sparse_categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    x = np.ones((100, 4))
+    np.random.seed(123)
+    y = np.random.randint(0, 1, size=(100, 1))
+    history = model.fit(x, y, epochs=1, batch_size=10)
+    self.assertEqual(
+        np.around(history.history['loss'][-1], decimals=4), 0.6173)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_metrics_correctness(self):
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(3,
+                                 activation='relu',
+                                 input_dim=4,
+                                 kernel_initializer='ones'))
+    model.add(keras.layers.Dense(1,
+                                 activation='sigmoid',
+                                 kernel_initializer='ones'))
+    model.compile(loss='mae',
+                  metrics=['acc'],
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs[1], 1.)
+    y = np.zeros((100, 1))
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs[1], 0.)
 
 if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
-
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_generator.py b/tensorflow/python/keras/_impl/keras/engine/training_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..58b5bc39c10ea06f680eb030e14ecd19a3888588
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_generator.py
@@ -0,0 +1,436 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Part of the Keras training engine related to Python generators of array data.
+"""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import callbacks as cbks
+from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
+from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
+from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.platform import tf_logging as logging
+
+
+def fit_generator(model,
+                  generator,
+                  steps_per_epoch=None,
+                  epochs=1,
+                  verbose=1,
+                  callbacks=None,
+                  validation_data=None,
+                  validation_steps=None,
+                  class_weight=None,
+                  max_queue_size=10,
+                  workers=1,
+                  use_multiprocessing=False,
+                  shuffle=True,
+                  initial_epoch=0):
+  """See docstring for `Model.fit_generator`."""
+  wait_time = 0.01  # in seconds
+  epoch = initial_epoch
+
+  do_validation = bool(validation_data)
+  model._make_train_function()
+  if do_validation:
+    model._make_test_function()
+
+  is_sequence = isinstance(generator, Sequence)
+  if not is_sequence and use_multiprocessing and workers > 1:
+    logging.warning(
+        UserWarning('Using a generator with `use_multiprocessing=True`'
+                    ' and multiple workers may duplicate your data.'
+                    ' Please consider using the`keras.utils.Sequence'
+                    ' class.'))
+  if steps_per_epoch is None:
+    if is_sequence:
+      steps_per_epoch = len(generator)
+    else:
+      raise ValueError('`steps_per_epoch=None` is only valid for a'
+                       ' generator based on the `keras.utils.Sequence`'
+                       ' class. Please specify `steps_per_epoch` or use'
+                       ' the `keras.utils.Sequence` class.')
+
+  # python 2 has 'next', 3 has '__next__'
+  # avoid any explicit version checks
+  val_gen = (
+      hasattr(validation_data, 'next') or
+      hasattr(validation_data, '__next__') or
+      isinstance(validation_data, Sequence))
+  if (val_gen and not isinstance(validation_data, Sequence) and
+      not validation_steps):
+    raise ValueError('`validation_steps=None` is only valid for a'
+                     ' generator based on the `keras.utils.Sequence`'
+                     ' class. Please specify `validation_steps` or use'
+                     ' the `keras.utils.Sequence` class.')
+
+  # Prepare display labels.
+  out_labels = model.metrics_names
+  callback_metrics = out_labels + ['val_%s' % n for n in out_labels]
+
+  # prepare callbacks
+  model.history = cbks.History()
+  callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
+  if verbose:
+    callbacks += [cbks.ProgbarLogger(count_mode='steps')]
+  callbacks = cbks.CallbackList(callbacks)
+
+  # it's possible to callback a different model than self:
+  if hasattr(model, 'callback_model') and model.callback_model:
+    callback_model = model.callback_model
+  else:
+    callback_model = model
+  callbacks.set_model(callback_model)
+  callbacks.set_params({
+      'epochs': epochs,
+      'steps': steps_per_epoch,
+      'verbose': verbose,
+      'do_validation': do_validation,
+      'metrics': callback_metrics,
+  })
+  callbacks.on_train_begin()
+
+  enqueuer = None
+  val_enqueuer = None
+
+  try:
+    if do_validation and not val_gen:
+      # Prepare data for validation
+      if len(validation_data) == 2:
+        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+        val_sample_weight = None
+      elif len(validation_data) == 3:
+        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+      else:
+        raise ValueError(
+            '`validation_data` should be a tuple '
+            '`(val_x, val_y, val_sample_weight)` '
+            'or `(val_x, val_y)`. Found: ' + str(validation_data))
+      val_x, val_y, val_sample_weights = model._standardize_user_data(
+          val_x, val_y, val_sample_weight)
+      val_data = val_x + val_y + val_sample_weights
+      if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        val_data += [0.]
+      for cbk in callbacks:
+        cbk.validation_data = val_data
+
+    if workers > 0:
+      if is_sequence:
+        enqueuer = OrderedEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            shuffle=shuffle)
+      else:
+        enqueuer = GeneratorEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            wait_time=wait_time)
+      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+      output_generator = enqueuer.get()
+    else:
+      if is_sequence:
+        output_generator = iter(generator)
+      else:
+        output_generator = generator
+
+    callback_model.stop_training = False
+    # Construct epoch logs.
+    epoch_logs = {}
+    while epoch < epochs:
+      callbacks.on_epoch_begin(epoch)
+      steps_done = 0
+      batch_index = 0
+      while steps_done < steps_per_epoch:
+        generator_output = next(output_generator)
+
+        if not hasattr(generator_output, '__len__'):
+          raise ValueError('Output of generator should be '
+                           'a tuple `(x, y, sample_weight)` '
+                           'or `(x, y)`. Found: ' + str(generator_output))
+
+        if len(generator_output) == 2:
+          x, y = generator_output
+          sample_weight = None
+        elif len(generator_output) == 3:
+          x, y, sample_weight = generator_output
+        else:
+          raise ValueError('Output of generator should be '
+                           'a tuple `(x, y, sample_weight)` '
+                           'or `(x, y)`. Found: ' + str(generator_output))
+        # build batch logs
+        batch_logs = {}
+        if isinstance(x, list):
+          batch_size = x[0].shape[0]
+        elif isinstance(x, dict):
+          batch_size = list(x.values())[0].shape[0]
+        else:
+          batch_size = x.shape[0]
+        batch_logs['batch'] = batch_index
+        batch_logs['size'] = batch_size
+        callbacks.on_batch_begin(batch_index, batch_logs)
+
+        outs = model.train_on_batch(
+            x, y, sample_weight=sample_weight, class_weight=class_weight)
+
+        if not isinstance(outs, list):
+          outs = [outs]
+        for l, o in zip(out_labels, outs):
+          batch_logs[l] = o
+
+        callbacks.on_batch_end(batch_index, batch_logs)
+
+        batch_index += 1
+        steps_done += 1
+
+        # Epoch finished.
+        if steps_done >= steps_per_epoch and do_validation:
+          if val_gen:
+            val_outs = evaluate_generator(
+                model,
+                validation_data,
+                validation_steps,
+                workers=workers,
+                use_multiprocessing=use_multiprocessing,
+                max_queue_size=max_queue_size)
+          else:
+            # No need for try/except because
+            # data has already been validated.
+            val_outs = model.evaluate(
+                val_x,
+                val_y,
+                batch_size=batch_size,
+                sample_weight=val_sample_weights,
+                verbose=0)
+          if not isinstance(val_outs, list):
+            val_outs = [val_outs]
+          # Same labels assumed.
+          for l, o in zip(out_labels, val_outs):
+            epoch_logs['val_' + l] = o
+
+        if callback_model.stop_training:
+          break
+
+      callbacks.on_epoch_end(epoch, epoch_logs)
+      epoch += 1
+      if callback_model.stop_training:
+        break
+
+  finally:
+    try:
+      if enqueuer is not None:
+        enqueuer.stop()
+    finally:
+      if val_enqueuer is not None:
+        val_enqueuer.stop()
+
+  callbacks.on_train_end()
+  return model.history
+
+
+def evaluate_generator(model,
+                       generator,
+                       steps=None,
+                       max_queue_size=10,
+                       workers=1,
+                       use_multiprocessing=False):
+  """See docstring for `Model.evaluate_generator`."""
+  model._make_test_function()
+
+  steps_done = 0
+  wait_time = 0.01
+  all_outs = []
+  batch_sizes = []
+  is_sequence = isinstance(generator, Sequence)
+  if not is_sequence and use_multiprocessing and workers > 1:
+    logging.warning(
+        UserWarning('Using a generator with `use_multiprocessing=True`'
+                    ' and multiple workers may duplicate your data.'
+                    ' Please consider using the`keras.utils.Sequence'
+                    ' class.'))
+  if steps is None:
+    if is_sequence:
+      steps = len(generator)
+    else:
+      raise ValueError('`steps=None` is only valid for a generator'
+                       ' based on the `keras.utils.Sequence` class.'
+                       ' Please specify `steps` or use the'
+                       ' `keras.utils.Sequence` class.')
+  enqueuer = None
+
+  try:
+    if workers > 0:
+      if is_sequence:
+        enqueuer = OrderedEnqueuer(
+            generator, use_multiprocessing=use_multiprocessing)
+      else:
+        enqueuer = GeneratorEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            wait_time=wait_time)
+      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+      output_generator = enqueuer.get()
+    else:
+      if is_sequence:
+        output_generator = iter(generator)
+      else:
+        output_generator = generator
+
+    while steps_done < steps:
+      generator_output = next(output_generator)
+      if not hasattr(generator_output, '__len__'):
+        raise ValueError('Output of generator should be a tuple '
+                         '(x, y, sample_weight) '
+                         'or (x, y). Found: ' + str(generator_output))
+      if len(generator_output) == 2:
+        x, y = generator_output
+        sample_weight = None
+      elif len(generator_output) == 3:
+        x, y, sample_weight = generator_output
+      else:
+        raise ValueError('Output of generator should be a tuple '
+                         '(x, y, sample_weight) '
+                         'or (x, y). Found: ' + str(generator_output))
+      outs = model.test_on_batch(x, y, sample_weight=sample_weight)
+
+      if isinstance(x, list):
+        batch_size = x[0].shape[0]
+      elif isinstance(x, dict):
+        batch_size = list(x.values())[0].shape[0]
+      else:
+        batch_size = x.shape[0]
+      if batch_size == 0:
+        raise ValueError('Received an empty batch. '
+                         'Batches should at least contain one item.')
+      all_outs.append(outs)
+
+      steps_done += 1
+      batch_sizes.append(batch_size)
+
+  finally:
+    if enqueuer is not None:
+      enqueuer.stop()
+
+  if not isinstance(outs, list):
+    return np.average(np.asarray(all_outs), weights=batch_sizes)
+  else:
+    averages = []
+    for i in range(len(outs)):
+      averages.append(
+          np.average([out[i] for out in all_outs], weights=batch_sizes))
+    return averages
+
+
+def predict_generator(model,
+                      generator,
+                      steps=None,
+                      max_queue_size=10,
+                      workers=1,
+                      use_multiprocessing=False,
+                      verbose=0):
+  """See docstring for `Model.predict_generator`."""
+  model._make_predict_function()
+
+  steps_done = 0
+  wait_time = 0.01
+  all_outs = []
+  is_sequence = isinstance(generator, Sequence)
+  if not is_sequence and use_multiprocessing and workers > 1:
+    logging.warning(
+        UserWarning('Using a generator with `use_multiprocessing=True`'
+                    ' and multiple workers may duplicate your data.'
+                    ' Please consider using the`keras.utils.Sequence'
+                    ' class.'))
+  if steps is None:
+    if is_sequence:
+      steps = len(generator)
+    else:
+      raise ValueError('`steps=None` is only valid for a generator'
+                       ' based on the `keras.utils.Sequence` class.'
+                       ' Please specify `steps` or use the'
+                       ' `keras.utils.Sequence` class.')
+  enqueuer = None
+
+  try:
+    if workers > 0:
+      if is_sequence:
+        enqueuer = OrderedEnqueuer(
+            generator, use_multiprocessing=use_multiprocessing)
+      else:
+        enqueuer = GeneratorEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            wait_time=wait_time)
+      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+      output_generator = enqueuer.get()
+    else:
+      if is_sequence:
+        output_generator = iter(generator)
+      else:
+        output_generator = generator
+
+    if verbose == 1:
+      progbar = Progbar(target=steps)
+
+    while steps_done < steps:
+      generator_output = next(output_generator)
+      if isinstance(generator_output, tuple):
+        # Compatibility with the generators
+        # used for training.
+        if len(generator_output) == 2:
+          x, _ = generator_output
+        elif len(generator_output) == 3:
+          x, _, _ = generator_output
+        else:
+          raise ValueError('Output of generator should be '
+                           'a tuple `(x, y, sample_weight)` '
+                           'or `(x, y)`. Found: ' + str(generator_output))
+      else:
+        # Assumes a generator that only
+        # yields inputs (not targets and sample weights).
+        x = generator_output
+
+      outs = model.predict_on_batch(x)
+      if not isinstance(outs, list):
+        outs = [outs]
+
+      if not all_outs:
+        for out in outs:
+          all_outs.append([])
+
+      for i, out in enumerate(outs):
+        all_outs[i].append(out)
+      steps_done += 1
+      if verbose == 1:
+        progbar.update(steps_done)
+
+  finally:
+    if enqueuer is not None:
+      enqueuer.stop()
+
+  if len(all_outs) == 1:
+    if steps_done == 1:
+      return all_outs[0][0]
+    else:
+      return np.concatenate(all_outs[0])
+  if steps_done == 1:
+    return [out[0] for out in all_outs]
+  else:
+    return [np.concatenate(out) for out in all_outs]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 9651eb9f14f1275dc79c8d3b1fb54690772086a1..58011a141268e588e6b746afca982f36f4d4d176 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -23,11 +23,17 @@ import unittest
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
-from tensorflow.python.keras._impl.keras.engine.training import _weighted_masked_objective
+from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 try:
   import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
@@ -340,20 +346,21 @@ class TrainingTest(test.TestCase):
     if scipy_sparse is None:
       return
 
-    test_inputs = [
-        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)]
-    test_outputs = [
-        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)]
-    in1 = keras.layers.Input(shape=(3,))
-    in2 = keras.layers.Input(shape=(3,))
-    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
-    out2 = keras.layers.Dense(4, name='dense_1')(in2)
-    model = keras.Model([in1, in2], [out1, out2])
-    model.predict(test_inputs, batch_size=2)
-    model.compile('rmsprop', 'mse')
-    model.fit(test_inputs, test_outputs,
-              epochs=1, batch_size=2, validation_split=0.5)
-    model.evaluate(test_inputs, test_outputs, batch_size=2)
+    with self.test_session():
+      test_inputs = [
+          scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)]
+      test_outputs = [
+          scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)]
+      in1 = keras.layers.Input(shape=(3,))
+      in2 = keras.layers.Input(shape=(3,))
+      out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+      out2 = keras.layers.Dense(4, name='dense_1')(in2)
+      model = keras.Model([in1, in2], [out1, out2])
+      model.predict(test_inputs, batch_size=2)
+      model.compile('rmsprop', 'mse')
+      model.fit(test_inputs, test_outputs,
+                epochs=1, batch_size=2, validation_split=0.5)
+      model.evaluate(test_inputs, test_outputs, batch_size=2)
 
   def test_that_trainable_disables_updates(self):
     val_a = np.random.random((10, 4))
@@ -705,7 +712,7 @@ class LossMaskingTest(test.TestCase):
 
   def test_loss_masking(self):
     with self.test_session():
-      weighted_loss = _weighted_masked_objective(keras.losses.get('mae'))
+      weighted_loss = weighted_masked_objective(keras.losses.get('mae'))
       shape = (3, 4, 2)
       x = np.arange(24).reshape(shape)
       y = 2 * x
@@ -876,9 +883,9 @@ class TestGeneratorMethods(test.TestCase):
 
     def custom_generator():
       batch_size = 10
-      n_samples = 50
+      num_samples = 50
       while True:
-        batch_index = np.random.randint(0, n_samples - batch_size)
+        batch_index = np.random.randint(0, num_samples - batch_size)
         start = batch_index
         end = start + batch_size
         x = arr_data[start: end]
@@ -957,9 +964,9 @@ class TestGeneratorMethods(test.TestCase):
 
     def custom_generator():
       batch_size = 10
-      n_samples = 50
+      num_samples = 50
       while True:
-        batch_index = np.random.randint(0, n_samples - batch_size)
+        batch_index = np.random.randint(0, num_samples - batch_size)
         start = batch_index
         end = start + batch_size
         x = arr_data[start: end]
@@ -1033,28 +1040,66 @@ class TestGeneratorMethods(test.TestCase):
                                  max_queue_size=10,
                                  use_multiprocessing=False)
 
+  def test_training_with_sequences(self):
+
+    class DummySequence(keras.utils.Sequence):
+
+      def __getitem__(self, idx):
+        return np.zeros([10, 2]), np.ones([10])
+
+      def __len__(self):
+        return 10
+
+    arr_data = np.random.random((50, 2))
+    arr_labels = np.random.random((50,))
+    arr_sample_weights = np.random.random((50,))
+
+    def custom_generator():
+      batch_size = 10
+      num_samples = 50
+      while True:
+        batch_index = np.random.randint(0, num_samples - batch_size)
+        start = batch_index
+        end = start + batch_size
+        x = arr_data[start: end]
+        y = arr_labels[start: end]
+        w = arr_sample_weights[start: end]
+        yield x, y, w
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(1, input_shape=(2,)))
+      model.compile(loss='mse', optimizer='sgd')
+
+    model.fit_generator(DummySequence(),
+                        steps_per_epoch=10,
+                        validation_data=custom_generator(),
+                        validation_steps=1,
+                        max_queue_size=10,
+                        workers=0,
+                        use_multiprocessing=True)
+    model.fit_generator(DummySequence(),
+                        steps_per_epoch=10,
+                        validation_data=custom_generator(),
+                        validation_steps=1,
+                        max_queue_size=10,
+                        workers=0,
+                        use_multiprocessing=False)
+
 
 class TestTrainingUtils(test.TestCase):
 
   def test_check_array_lengths(self):
-    keras.engine.training._check_array_lengths(None, None, None)
+    keras.engine.training_utils.check_array_lengths(None, None, None)
     a_np = np.random.random((4, 3, 3))
-    keras.engine.training._check_array_lengths(a_np, a_np, a_np)
-    keras.engine.training._check_array_lengths(
+    keras.engine.training_utils.check_array_lengths(a_np, a_np, a_np)
+    keras.engine.training_utils.check_array_lengths(
         [a_np, a_np], [a_np, a_np], [a_np, a_np])
-    keras.engine.training._check_array_lengths([None], [None], [None])
+    keras.engine.training_utils.check_array_lengths([None], [None], [None])
 
     b_np = np.random.random((3, 4))
     with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths(a_np, None, None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths(a_np, a_np, None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths([a_np], [None], None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths([a_np], [b_np], None)
-    with self.assertRaises(ValueError):
-      keras.engine.training._check_array_lengths([a_np], None, [b_np])
+      keras.engine.training_utils.check_array_lengths([a_np], [b_np], None)
 
   def test_slice_arrays(self):
     input_a = np.random.random((10, 3))
@@ -1078,6 +1123,136 @@ class TestTrainingUtils(test.TestCase):
 
 class TestTrainingWithDataTensors(test.TestCase):
 
+  def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = keras.backend.zeros(shape=(10, 3))
+      targets = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
+      # Test with dynamic shape
+      inputs = array_ops.placeholder_with_default(
+          np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
+      targets = array_ops.placeholder_with_default(
+          np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
+      self.assertEqual(inputs.shape[0].value, None)
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
+  def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      loss_weights = [1., 0.5]
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+      input_a_tf = keras.backend.zeros(shape=(10, 3))
+      input_b_tf = keras.backend.zeros(shape=(10, 3))
+
+      output_d_tf = keras.backend.zeros(shape=(10, 4))
+      output_e_tf = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'should specify the `steps_per_epoch`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=1,
+            batch_size=5,
+            verbose=0)
+      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
+      # Test with dictionary inputs
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          validation_data=({'input_a': input_a_tf,
+                            'input_b': input_b_tf},
+                           {'dense': output_d_tf,
+                            'dropout': output_e_tf}),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      model.train_on_batch(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf})
+
+      # Test with validation data
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          validation_data=([input_a_tf, input_b_tf],
+                           [output_d_tf, output_e_tf]),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=2,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.2,
+            validation_steps=2)
+
+      # Test evaluation / prediction methods
+      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                     steps=2, verbose=0)
+      model.predict([input_a_tf, input_b_tf], steps=2)
+      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
   def test_model_with_input_feed_tensor(self):
     """We test building a model with a TF variable as input.
 
@@ -1513,15 +1688,101 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.train_on_batch([input_a_np, input_b_np],
                            [output_a_np, output_b_np])
 
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_metric_names_are_identical_in_graph_and_eager(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
 
-if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae', 'acc']
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    reference_metric_names = ['loss', 'dense_loss', 'dropout_loss',
+                              'dense_mean_absolute_error',
+                              'dense_acc',
+                              'dropout_mean_absolute_error',
+                              'dropout_acc']
+    self.assertEqual(reference_metric_names, model.metrics_names)
+
+
+class TestTrainingWithDatasetIterators(test.TestCase):
+
+  def test_training_and_eval_methods_on_iterators_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
 
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(iterator, steps=2, verbose=0)
+      model.predict(iterator, steps=2)
+      model.train_on_batch(iterator)
+      model.test_on_batch(iterator)
+      # Test with validation data
+      model.fit(iterator,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=iterator, validation_steps=2)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(iterator,
+                  epochs=1, steps_per_epoch=2, verbose=0,
+                  validation_split=0.5, validation_steps=2)
+
+      # Test invalid usage
+      with self.assertRaisesRegexp(ValueError,
+                                   'Instead, pass an `Iterator`'):
+        model.fit(dataset,
+                  epochs=1, steps_per_epoch=2, verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should not specify a target'):
+        model.fit(iterator, iterator,
+                  epochs=1, steps_per_epoch=2, verbose=0)
+
+  def test_iterators_running_out_of_data(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(2)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      with test.mock.patch.object(logging, 'warning') as mock_log:
+        model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'dataset iterator ran out of data')
+
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..662938f421b3a338d6720b2911347664711c1ac1
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -0,0 +1,616 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Training-related utilities.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.keras._impl.keras import metrics as metrics_module
+from tensorflow.python.ops import math_ops
+
+
+def check_num_samples(ins,
+                      batch_size=None,
+                      steps=None,
+                      steps_name='steps'):
+  """Determine the number of samples provided for training and evaluation.
+
+  The number of samples is not defined when running with `steps`,
+  in which case the number of samples is set to `None`.
+
+  Arguments:
+      ins: List of tensors to be fed to the Keras function.
+      batch_size: Integer batch size or `None` if not defined.
+      steps: Total number of steps (batches of samples)
+          before declaring `_predict_loop` finished.
+          Ignored with the default value of `None`.
+      steps_name: The public API's parameter name for `steps`.
+
+  Raises:
+      ValueError: when `steps` is `None` and the attribute `ins.shape`
+      does not exist. Also raises ValueError when `steps` is not `None`
+      and `batch_size` is not `None` because they are mutually
+      exclusive.
+
+  Returns:
+      When steps is `None`, returns the number of samples to be
+      processed based on the size of the first dimension of the
+      first input numpy array. When steps is not `None` and
+      `batch_size` is `None`, returns `None`.
+
+  Raises:
+      ValueError: In case of invalid arguments.
+  """
+  if steps is not None and batch_size is not None:
+    raise ValueError(
+        'If ' + steps_name + ' is set, the `batch_size` must be None.')
+
+  if not ins or has_symbolic_tensors(ins):
+    if steps is None:
+      raise ValueError('If your data is in the form of symbolic tensors, '
+                       'you should specify the `' + steps_name + '` argument '
+                       '(instead of the `batch_size` argument, '
+                       'because symbolic tensors are expected to produce '
+                       'batches of input data).')
+    return None
+  if hasattr(ins[0], 'shape'):
+    return int(ins[0].shape[0])
+  return None  # Edge case where ins == [static_learning_phase]
+
+
+def standardize_single_array(x):
+  if x is None:
+    return None
+  elif tensor_util.is_tensor(x):
+    return x
+  elif x.ndim == 1:
+    x = np.expand_dims(x, 1)
+  return x
+
+
+def standardize_input_data(data,
+                           names,
+                           shapes=None,
+                           check_batch_axis=True,
+                           exception_prefix=''):
+  """Normalizes inputs and targets provided by users.
+
+  Users may pass data as a list of arrays, dictionary of arrays,
+  or as a single array. We normalize this to an ordered list of
+  arrays (same order as `names`), while checking that the provided
+  arrays have shapes that match the network's expectations.
+
+  Arguments:
+      data: User-provided input data (polymorphic).
+      names: List of expected array names.
+      shapes: Optional list of expected array shapes.
+      check_batch_axis: Boolean; whether to check that
+          the batch axis of the arrays matches the expected
+          value found in `shapes`.
+      exception_prefix: String prefix used for exception formatting.
+
+  Returns:
+      List of standardized input arrays (one array per model input).
+
+  Raises:
+      ValueError: in case of improperly formatted user-provided data.
+  """
+  if not names:
+    if data is not None and hasattr(data, '__len__') and len(data):
+      raise ValueError('Error when checking model ' + exception_prefix + ': '
+                       'expected no data, but got:', data)
+    return []
+  if data is None:
+    return [None for _ in range(len(names))]
+
+  if isinstance(data, dict):
+    try:
+      data = [
+          data[x].values
+          if data[x].__class__.__name__ == 'DataFrame' else data[x]
+          for x in names
+      ]
+    except KeyError as e:
+      raise ValueError('No data provided for "' + e.args[0] + '". Need data '
+                       'for each key in: ' + str(names))
+  elif isinstance(data, list):
+    if isinstance(data[0], list):
+      data = [np.asarray(d) for d in data]
+    elif len(names) == 1 and isinstance(data[0], (float, int)):
+      data = [np.asarray(data)]
+    else:
+      data = [
+          x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
+      ]
+  else:
+    data = data.values if data.__class__.__name__ == 'DataFrame' else data
+    data = [data]
+  data = [standardize_single_array(x) for x in data]
+
+  if len(data) != len(names):
+    if data and hasattr(data[0], 'shape'):
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': the list of Numpy arrays that you are passing to '
+                       'your model is not the size the model expected. '
+                       'Expected to see ' + str(len(names)) + ' array(s), '
+                       'but instead got the following list of ' +
+                       str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
+    elif len(names) > 1:
+      raise ValueError(
+          'Error when checking model ' + exception_prefix +
+          ': you are passing a list as input to your model, '
+          'but the model expects a list of ' + str(len(names)) +
+          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+    elif len(data) == 1 and not hasattr(data[0], 'shape'):
+      raise TypeError('Error when checking model ' + exception_prefix +
+                      ': data should be a Numpy array, or list/dict of '
+                      'Numpy arrays. Found: ' + str(data)[:200] + '...')
+    elif len(names) == 1:
+      data = [np.asarray(data)]
+
+  # Check shapes compatibility.
+  if shapes:
+    for i in range(len(names)):
+      if shapes[i] is not None and not tensor_util.is_tensor(data[i]):
+        data_shape = data[i].shape
+        shape = shapes[i]
+        if data[i].ndim != len(shape):
+          raise ValueError('Error when checking ' + exception_prefix +
+                           ': expected ' + names[i] + ' to have ' +
+                           str(len(shape)) + ' dimensions, but got array '
+                           'with shape ' + str(data_shape))
+        if not check_batch_axis:
+          data_shape = data_shape[1:]
+          shape = shape[1:]
+        for dim, ref_dim in zip(data_shape, shape):
+          if ref_dim != dim and ref_dim:
+            raise ValueError(
+                'Error when checking ' + exception_prefix + ': expected ' +
+                names[i] + ' to have shape ' + str(shape) +
+                ' but got array with shape ' + str(data_shape))
+  return data
+
+
+def standardize_sample_or_class_weights(x_weight, output_names, weight_type):
+  """Maps `sample_weight` or `class_weight` to model outputs.
+
+  Arguments:
+      x_weight: User-provided `sample_weight` or `class_weight` argument.
+      output_names: List of output names (strings) in the model.
+      weight_type: A string used purely for exception printing.
+
+  Returns:
+      A list of `sample_weight` or `class_weight` where there are exactly
+          one element per model output.
+
+  Raises:
+      ValueError: In case of invalid user-provided argument.
+  """
+  if x_weight is None or len(x_weight) == 0:  # pylint: disable=g-explicit-length-test
+    return [None for _ in output_names]
+  if len(output_names) == 1:
+    if isinstance(x_weight, list) and len(x_weight) == 1:
+      return x_weight
+    if isinstance(x_weight, dict) and output_names[0] in x_weight:
+      return [x_weight[output_names[0]]]
+    else:
+      return [x_weight]
+  if isinstance(x_weight, list):
+    if len(x_weight) != len(output_names):
+      raise ValueError('Provided `' + weight_type + '` was a list of ' +
+                       str(len(x_weight)) + ' elements, but the model has ' +
+                       str(len(output_names)) + ' outputs. '
+                       'You should provide one `' + weight_type + '`'
+                       'array per model output.')
+    return x_weight
+  if isinstance(x_weight, dict):
+    x_weights = []
+    for name in output_names:
+      x_weights.append(x_weight.get(name))
+    return x_weights
+  else:
+    raise TypeError(
+        'The model has multiple outputs, so `' + weight_type + '` '
+        'should be either a list or a dict. '
+        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
+
+
+def standardize_class_weights(class_weight, output_names):
+  return standardize_sample_or_class_weights(class_weight, output_names,
+                                             'class_weight')
+
+
+def standardize_sample_weights(sample_weight, output_names):
+  return standardize_sample_or_class_weights(sample_weight, output_names,
+                                             'sample_weight')
+
+
+def check_array_lengths(inputs, targets, weights=None):
+  """Does user input validation for numpy arrays.
+
+  Arguments:
+      inputs: list of Numpy arrays of inputs.
+      targets: list of Numpy arrays of targets.
+      weights: list of Numpy arrays of sample weights.
+
+  Raises:
+      ValueError: in case of incorrectly formatted data.
+  """
+
+  def set_of_lengths(x):
+    # Returns a set with the variation between
+    # different shapes, with None => 0
+    if x is None:
+      return {}
+    else:
+      return set([y.shape[0] for y in x
+                  if y is not None and not tensor_util.is_tensor(y)])
+
+  set_x = set_of_lengths(inputs)
+  set_y = set_of_lengths(targets)
+  set_w = set_of_lengths(weights)
+  if len(set_x) > 1:
+    raise ValueError('All input arrays (x) should have '
+                     'the same number of samples. Got array shapes: ' +
+                     str([x.shape for x in inputs]))
+  if len(set_y) > 1:
+    raise ValueError('All target arrays (y) should have '
+                     'the same number of samples. Got array shapes: ' +
+                     str([y.shape for y in targets]))
+  if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
+    raise ValueError('Input arrays should have '
+                     'the same number of samples as target arrays. '
+                     'Found ' + str(list(set_x)[0]) + ' input samples '
+                     'and ' + str(list(set_y)[0]) + ' target samples.')
+  if len(set_w) > 1:
+    raise ValueError('All sample_weight arrays should have '
+                     'the same number of samples. Got array shapes: ' +
+                     str([w.shape for w in weights]))
+  if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
+    raise ValueError('Sample_weight arrays should have '
+                     'the same number of samples as target arrays. Got ' +
+                     str(list(set_y)[0]) + ' input samples and ' +
+                     str(list(set_w)[0]) + ' target samples.')
+
+
+def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
+  """Does validation on the compatibility of targets and loss functions.
+
+  This helps prevent users from using loss functions incorrectly. This check
+  is purely for UX purposes.
+
+  Arguments:
+      targets: list of Numpy arrays of targets.
+      loss_fns: list of loss functions.
+      output_shapes: list of shapes of model outputs.
+
+  Raises:
+      ValueError: if a loss function or target array
+          is incompatible with an output.
+  """
+  key_losses = {
+      losses.mean_squared_error, losses.binary_crossentropy,
+      losses.categorical_crossentropy
+  }
+  for y, loss, shape in zip(targets, loss_fns, output_shapes):
+    if y is None or loss is None or tensor_util.is_tensor(y):
+      continue
+    if loss is losses.categorical_crossentropy:
+      if y.shape[-1] == 1:
+        raise ValueError('You are passing a target array of shape ' + str(
+            y.shape) + ' while using as loss `categorical_crossentropy`. '
+                         '`categorical_crossentropy` expects '
+                         'targets to be binary matrices (1s and 0s) '
+                         'of shape (samples, classes). '
+                         'If your targets are integer classes, '
+                         'you can convert them to the expected format via:\n'
+                         '```\n'
+                         'from keras.utils import to_categorical\n'
+                         'y_binary = to_categorical(y_int)\n'
+                         '```\n'
+                         '\n'
+                         'Alternatively, you can use the loss function '
+                         '`sparse_categorical_crossentropy` instead, '
+                         'which does expect integer targets.')
+    if loss in key_losses:
+      for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
+        if out_dim is not None and target_dim != out_dim:
+          raise ValueError('A target array with shape ' + str(y.shape) +
+                           ' was passed for an output of shape ' + str(shape) +
+                           ' while using as loss `' + loss.__name__ + '`. '
+                           'This loss expects '
+                           'targets to have the same shape '
+                           'as the output.')
+
+
+def collect_metrics(metrics, output_names):
+  """Maps metric functions to model outputs.
+
+  Arguments:
+      metrics: a list or dict of metric functions.
+      output_names: a list of the names (strings) of model outputs.
+
+  Returns:
+      A list (one entry per model output) of lists of metric functions.
+      For instance, if the model has 2 outputs, and for the first output
+      we want to compute "binary_accuracy" and "binary_crossentropy",
+      and just "binary_accuracy" for the second output,
+      the list would look like:
+          `[[binary_accuracy, binary_crossentropy], [binary_accuracy]]`
+
+  Raises:
+      TypeError: if an incorrect type is passed for the `metrics` argument.
+  """
+  if not metrics:
+    return [[] for _ in output_names]
+  if isinstance(metrics, list):
+    # we then apply all metrics to all outputs.
+    return [copy.copy(metrics) for _ in output_names]
+  elif isinstance(metrics, dict):
+    nested_metrics = []
+    for name in output_names:
+      output_metrics = metrics.get(name, [])
+      if not isinstance(output_metrics, list):
+        output_metrics = [output_metrics]
+      nested_metrics.append(output_metrics)
+    return nested_metrics
+  else:
+    raise TypeError('Type of `metrics` argument not understood. '
+                    'Expected a list or dictionary, found: ' + str(metrics))
+
+
+def batch_shuffle(index_array, batch_size):
+  """Shuffles an array in a batch-wise fashion.
+
+  Useful for shuffling HDF5 arrays
+  (where one cannot access arbitrary indices).
+
+  Arguments:
+      index_array: array of indices to be shuffled.
+      batch_size: integer.
+
+  Returns:
+      The `index_array` array, shuffled in a batch-wise fashion.
+  """
+  batch_count = int(len(index_array) / batch_size)
+  # to reshape we need to be cleanly divisible by batch size
+  # we stash extra items and reappend them after shuffling
+  last_batch = index_array[batch_count * batch_size:]
+  index_array = index_array[:batch_count * batch_size]
+  index_array = index_array.reshape((batch_count, batch_size))
+  np.random.shuffle(index_array)
+  index_array = index_array.flatten()
+  return np.append(index_array, last_batch)
+
+
+def weighted_masked_objective(fn):
+  """Adds support for masking and sample-weighting to an objective function.
+
+  It transforms an objective function `fn(y_true, y_pred)`
+  into a sample-weighted, cost-masked objective function
+  `fn(y_true, y_pred, weights, mask)`.
+
+  Arguments:
+      fn: The objective function to wrap,
+          with signature `fn(y_true, y_pred)`.
+
+  Returns:
+      A function with signature `fn(y_true, y_pred, weights, mask)`.
+  """
+  if fn is None:
+    return None
+
+  def weighted(y_true, y_pred, weights, mask=None):
+    """Wrapper function.
+
+    Arguments:
+        y_true: `y_true` argument of `fn`.
+        y_pred: `y_pred` argument of `fn`.
+        weights: Weights tensor.
+        mask: Mask tensor.
+
+    Returns:
+        Scalar tensor.
+    """
+    # score_array has ndim >= 2
+    score_array = fn(y_true, y_pred)
+    if mask is not None:
+      # Cast the mask to floatX to avoid float64 upcasting in theano
+      mask = math_ops.cast(mask, K.floatx())
+      # mask should have the same shape as score_array
+      score_array *= mask
+      #  the loss per batch should be proportional
+      #  to the number of unmasked samples.
+      score_array /= K.mean(mask)
+
+    # apply sample weighting
+    if weights is not None:
+      # reduce score_array to same ndim as weight array
+      ndim = K.ndim(score_array)
+      weight_ndim = K.ndim(weights)
+      score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
+      score_array *= weights
+      score_array /= K.mean(
+          math_ops.cast(math_ops.not_equal(weights, 0), K.floatx()))
+    return K.mean(score_array)
+
+  return weighted
+
+
+def standardize_weights(y,
+                        sample_weight=None,
+                        class_weight=None,
+                        sample_weight_mode=None):
+  """Performs sample weight validation and standardization.
+
+  Everything gets normalized to a single sample-wise (or timestep-wise)
+  weight array.
+
+  Arguments:
+      y: Numpy array of model targets to be weighted.
+      sample_weight: User-provided `sample_weight` argument.
+      class_weight: User-provided `class_weight` argument.
+      sample_weight_mode: One of `None` or `"temporal"`.
+          `"temporal"` indicated that we expect 2D weight data
+          that will be applied to the last 2 dimensions of
+          the targets (i.e. we are weighting timesteps, not samples).
+
+  Returns:
+      A numpy array of target weights, one entry per sample to weight.
+
+  Raises:
+      ValueError: In case of invalid user-provided arguments.
+  """
+  if sample_weight_mode is not None:
+    if sample_weight_mode != 'temporal':
+      raise ValueError('"sample_weight_mode '
+                       'should be None or "temporal". '
+                       'Found: ' + str(sample_weight_mode))
+    if len(y.shape) < 3:
+      raise ValueError('Found a sample_weight array for '
+                       'an input with shape ' + str(y.shape) + '. '
+                       'Timestep-wise sample weighting (use of '
+                       'sample_weight_mode="temporal") is restricted to '
+                       'outputs that are at least 3D, i.e. that have '
+                       'a time dimension.')
+    if sample_weight is not None and len(sample_weight.shape) != 2:
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + '. '
+                       'In order to use timestep-wise sample weighting, '
+                       'you should pass a 2D sample_weight array.')
+  else:
+    if sample_weight is not None and len(sample_weight.shape) != 1:
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + '. '
+                       'In order to use timestep-wise sample weights, '
+                       'you should specify '
+                       'sample_weight_mode="temporal" '
+                       'in compile(). If you just mean to use '
+                       'sample-wise weights, make sure your '
+                       'sample_weight array is 1D.')
+
+  if sample_weight is not None:
+    if len(sample_weight.shape) > len(y.shape):
+      raise ValueError(
+          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
+          'Expected sample_weight with rank '
+          'less than or equal to ' + str(len(y.shape)))
+
+    if y.shape[:sample_weight.ndim] != sample_weight.shape:
+      raise ValueError(
+          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
+          ' for an input with shape ' + str(y.shape) + '. '
+          'sample_weight cannot be broadcast.')
+    return sample_weight
+  elif isinstance(class_weight, dict):
+    if len(y.shape) > 2:
+      raise ValueError('`class_weight` not supported for '
+                       '3+ dimensional targets.')
+    if y.shape[1] > 1:
+      y_classes = np.argmax(y, axis=1)
+    elif y.shape[1] == 1:
+      y_classes = np.reshape(y, y.shape[0])
+    else:
+      y_classes = y
+
+    weights = np.asarray(
+        [class_weight[cls] for cls in y_classes if cls in class_weight])
+
+    if len(weights) != len(y_classes):
+      # subtract the sets to pick all missing classes
+      existing_classes = set(y_classes)
+      existing_class_weight = set(class_weight.keys())
+      raise ValueError('`class_weight` must contain all classes in the data.'
+                       ' The classes %s exist in the data but not in '
+                       '`class_weight`.' %
+                       (existing_classes - existing_class_weight))
+    return weights
+  else:
+    return None
+
+
+def has_symbolic_tensors(ls):
+  return (any(tensor_util.is_tensor(v) for v in ls)
+          and not context.executing_eagerly())
+
+
+def populate_metric_names(model):
+  for i in range(len(model.outputs)):
+    metrics = model.nested_metrics[i]
+    for metric in metrics:
+      base_metric_name = get_base_metric_name(metric)
+      add_metric_name(model, base_metric_name, i)
+
+
+def get_base_metric_name(metric, weighted=False):
+  """Returns the metric name given the metric function.
+
+  Arguments:
+      metric: Metric function name or reference.
+      weighted: Boolean indicating if the metric for which we are adding
+          names is weighted.
+
+  Returns:
+      a metric name.
+  """
+  metric_name_prefix = 'weighted_' if weighted else ''
+  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+    if metric in ('accuracy', 'acc'):
+      suffix = 'acc'
+    elif metric in ('crossentropy', 'ce'):
+      suffix = 'ce'
+    metric_name = metric_name_prefix + suffix
+  else:
+    metric_fn = metrics_module.get(metric)
+    # Get metric name as string
+    if hasattr(metric_fn, 'name'):
+      metric_name = metric_fn.name
+    else:
+      metric_name = metric_fn.__name__
+    metric_name = metric_name_prefix + metric_name
+
+  return metric_name
+
+
+def add_metric_name(model, metric_name, index):
+  """Makes the metric name unique and adds it to the model's metric name list.
+
+    If there are multiple outputs for which the metrics are calculated, the
+    metric names have to be made unique by appending an integer.
+
+  Arguments:
+    model: Model to which we are adding metric names.
+    metric_name: Metric name that corresponds to the metric specified by the
+        user. For example: 'acc'
+    index: The index of the model output for which the metric name is being
+        added.
+  """
+  if len(model.output_names) > 1:
+    metric_name = '%s_%s' % (model.output_names[index], metric_name)
+  j = 1
+  base_metric_name = metric_name
+  while metric_name in model.metrics_names:
+    metric_name = '%s_%d' % (base_metric_name, j)
+    j += 1
+  model.metrics_names.append(metric_name)
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index db0140c2df4d20f9e18e6c1401c6c6aa197bcf1f..c3c3fceb454773f18400bd31fe8b649c557fc013 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -25,14 +25,21 @@ from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import models
+from tensorflow.python.keras._impl.keras import optimizers
+from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
+from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
@@ -50,36 +57,198 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
-def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
+def _convert_tensor(x):
+  """Create or cast tensor if needed."""
+  if not tensor_util.is_tensor(x):
+    # x is a numpy array
+    x = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(x)
+  if check_ops.is_numeric_tensor(x):
+    # is_numeric_tensor returns False if provided with a numpy array
+    x = _cast_tensor_to_floatx(x)
+  return x
+
+
+def _any_variable_initalized():
+  """Check if any variable has been initialized in the Keras model.
+
+  Returns:
+    boolean, True if at least one variable has been initalized, else False.
+  """
+  variables = variables_module.global_variables()
+  for v in variables:
+    if getattr(v, '_keras_initialized', False):
+      return True
+  return False
+
+
+def _create_ordered_io(keras_model, estimator_io, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
 
   Args:
-    keras_model: an instance of compiled keras model.
-    estimator_io_dict: features or labels dictionary from model_fn.
+    keras_model: An instance of compiled keras model.
+    estimator_io: The features or labels (dict or plain array) from model_fn.
     is_input: True if dictionary is for inputs.
 
   Returns:
-    a list of tensors based on Keras IO order.
+    A list of tensors based on Keras IO order.
 
   Raises:
     ValueError: if dictionary keys cannot be found in Keras model input_names
       or output_names.
   """
-  if is_input:
-    keras_io_names = keras_model.input_names
+  if isinstance(estimator_io, (list, tuple)):
+    # Case currently not supported by most built-in input_fn,
+    # but it's good to have for sanity
+    return [_convert_tensor(x) for x in estimator_io]
+  elif isinstance(estimator_io, dict):
+    if is_input:
+      if keras_model._is_graph_network:
+        keras_io_names = keras_model.input_names
+      else:
+        keras_io_names = [
+            'input_%d' % i for i in range(1, len(estimator_io) + 1)]
+    else:
+      if keras_model._is_graph_network:
+        keras_io_names = keras_model.output_names
+      else:
+        keras_io_names = [
+            'output_%d' % i for i in range(1, len(estimator_io) + 1)]
+
+    for key in estimator_io:
+      if key not in keras_io_names:
+        raise ValueError(
+            'Cannot find %s with name "%s" in Keras Model. '
+            'It needs to match one '
+            'of the following: %s' % ('input' if is_input else 'output', key,
+                                      ', '.join(keras_io_names)))
+      tensors = [_convert_tensor(estimator_io[io_name])
+                 for io_name in keras_io_names]
+    return tensors
   else:
-    keras_io_names = keras_model.output_names
+    # Plain array.
+    return _convert_tensor(estimator_io)
+
+
+def _in_place_subclassed_model_reset(model):
+  """Substitute for model cloning that works for subclassed models.
+
+  Subclassed models cannot be cloned because their topology is not serializable.
+  To "instantiate" an identical model in a new TF graph, we reuse the original
+  model object, but we clear its state.
+
+  After calling this function on a model intance, you can use the model instance
+  as if it were a model clone (in particular you can use it in a new graph).
+
+  This method clears the state of the input model. It is thus destructive.
+  However the original state can be restored fully by calling
+  `_in_place_subclassed_model_state_restoration`.
+
+  Args:
+    model: Instance of a Keras model created via subclassing.
+
+  Raises:
+    ValueError: In case the model uses a subclassed model as inner layer.
+  """
+  assert not model._is_graph_network  # Only makes sense for subclassed networks
+  # Retrieve all layers tracked by the model as well as their attribute names
+  attributes_cache = {}
+  for name in dir(model):
+    try:
+      value = getattr(model, name)
+    except (AttributeError, ValueError, TypeError):
+      continue
+    if isinstance(value, Layer):
+      attributes_cache[name] = value
+      assert value in model._layers
+    elif isinstance(value, (list, tuple)) and name not in ('layers', '_layers'):
+      # Handle case: list/tuple of layers (also tracked by the Network API).
+      if value and all(isinstance(val, Layer) for val in value):
+        raise ValueError('We do not support the use of list-of-layers '
+                         'attributes in subclassed models used with '
+                         '`model_to_estimator` at this time. Found list '
+                         'model: %s' % name)
+
+  # Replace layers on the model with fresh layers
+  layers_to_names = {value: key for key, value in attributes_cache.items()}
+  original_layers = model._layers[:]
+  model._layers = []
+  for layer in original_layers:  # We preserve layer order.
+    config = layer.get_config()
+    # This will not work for nested subclassed models used as layers.
+    # This would be theoretically possible to support, but would add complexity.
+    # Only do it if users complain.
+    if isinstance(layer, Network) and not layer._is_graph_network:
+      raise ValueError('We do not support the use of nested subclassed models '
+                       'in `model_to_estimator` at this time. Found nested '
+                       'model: %s' % layer)
+    fresh_layer = layer.__class__.from_config(config)
+    name = layers_to_names[layer]
+    setattr(model, name, fresh_layer)
+
+  # Cache original model build attributes (in addition to layers)
+  if (not hasattr(model, '_original_attributes_cache') or
+      model._original_attributes_cache is None):
+    if model.built:
+      attributes_to_cache = [
+          'inputs',
+          'outputs',
+          '_feed_outputs',
+          '_feed_output_names',
+          '_feed_output_shapes',
+          '_feed_loss_fns',
+          'loss_weights_list',
+          'targets',
+          '_feed_targets',
+          'sample_weight_modes',
+          'weighted_metrics',
+          'metrics_names',
+          'metrics_tensors',
+          'metrics_updates',
+          'stateful_metric_names',
+          'total_loss',
+          'sample_weights',
+          '_feed_sample_weights',
+          'train_function',
+          'test_function',
+          'predict_function',
+          '_collected_trainable_weights',
+          '_feed_inputs',
+          '_feed_input_names',
+          '_feed_input_shapes',
+          'optimizer',
+      ]
+      for name in attributes_to_cache:
+        attributes_cache[name] = getattr(model, name)
+  model._original_attributes_cache = attributes_cache
+
+  # Reset built state
+  model.built = False
+  model.inputs = None
+  model.outputs = None
 
-  for key in estimator_io_dict:
-    if key not in keras_io_names:
-      raise ValueError(
-          'Cannot find %s with name "%s" in Keras Model. It needs to match '
-          'one of the following: %s' % ('input' if is_input else 'output', key,
-                                        ', '.join(keras_io_names)))
-  tensors = []
-  for io_name in keras_io_names:
-    tensors.append(_cast_tensor_to_floatx(estimator_io_dict[io_name]))
-  return tensors
+
+def _in_place_subclassed_model_state_restoration(model):
+  """Restores the original state of a model after it was "reset".
+
+  This undoes this action of `_in_place_subclassed_model_reset`.
+
+  Args:
+    model: Instance of a Keras model created via subclassing, on which
+      `_in_place_subclassed_model_reset` was previously called.
+  """
+  assert not model._is_graph_network
+  # Restore layers and build attributes
+  if (hasattr(model, '_original_attributes_cache') and
+      model._original_attributes_cache is not None):
+    model._layers = []
+    for name, value in model._original_attributes_cache.items():
+      setattr(model, name, value)
+    model._original_attributes_cache = None
+  else:
+    # Restore to the state of a never-called model.
+    model.built = False
+    model.inputs = None
+    model.outputs = None
 
 
 def _clone_and_build_model(mode,
@@ -93,8 +262,8 @@ def _clone_and_build_model(mode,
     mode: training mode.
     keras_model: an instance of compiled keras model.
     custom_objects: Dictionary for custom objects.
-    features:
-    labels:
+    features: Dict of tensors.
+    labels: Dict of tensors, or single tensor instance.
 
   Returns:
     The newly built model.
@@ -102,33 +271,48 @@ def _clone_and_build_model(mode,
   # Set to True during training, False for inference.
   K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
 
-  # Clone keras model.
-  input_tensors = None if features is None else _create_ordered_io(
-      keras_model, features)
-  if custom_objects:
-    with CustomObjectScope(custom_objects):
+  # Get list of inputs.
+  if features is None:
+    input_tensors = None
+  else:
+    input_tensors = _create_ordered_io(keras_model,
+                                       estimator_io=features,
+                                       is_input=True)
+  # Get list of outputs.
+  if labels is None:
+    target_tensors = None
+  elif isinstance(labels, dict):
+    target_tensors = _create_ordered_io(keras_model,
+                                        estimator_io=labels,
+                                        is_input=False)
+  else:
+    target_tensors = [
+        _convert_tensor(labels)
+    ]
+
+  if keras_model._is_graph_network:
+    if custom_objects:
+      with CustomObjectScope(custom_objects):
+        model = models.clone_model(keras_model, input_tensors=input_tensors)
+    else:
       model = models.clone_model(keras_model, input_tensors=input_tensors)
   else:
-    model = models.clone_model(keras_model, input_tensors=input_tensors)
+    model = keras_model
+    _in_place_subclassed_model_reset(model)
+    if input_tensors is not None:
+      model._set_inputs(input_tensors)
 
   # Compile/Build model
-  if mode is model_fn_lib.ModeKeys.PREDICT and not model.built:
-    model.build()
+  if mode is model_fn_lib.ModeKeys.PREDICT:
+    if isinstance(model, models.Sequential):
+      model.build()
   else:
-    optimizer_config = keras_model.optimizer.get_config()
-    optimizer = keras_model.optimizer.__class__.from_config(optimizer_config)
-    optimizer.iterations = training_util.get_or_create_global_step()
-
-    # Get list of outputs.
-    if labels is None:
-      target_tensors = None
-    elif isinstance(labels, dict):
-      target_tensors = _create_ordered_io(keras_model, labels, is_input=False)
+    if isinstance(keras_model.optimizer, optimizers.TFOptimizer):
+      optimizer = keras_model.optimizer
     else:
-      target_tensors = [
-          _cast_tensor_to_floatx(
-              sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels))
-      ]
+      optimizer_config = keras_model.optimizer.get_config()
+      optimizer = keras_model.optimizer.__class__.from_config(optimizer_config)
+    optimizer.iterations = training_util.get_or_create_global_step()
 
     model.compile(
         optimizer,
@@ -138,9 +322,6 @@ def _clone_and_build_model(mode,
         sample_weight_mode=keras_model.sample_weight_mode,
         weighted_metrics=keras_model.weighted_metrics,
         target_tensors=target_tensors)
-
-  if isinstance(model, models.Sequential):
-    model = model.model
   return model
 
 
@@ -168,10 +349,14 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
 
     # Set loss and metric only during train and evaluate.
     if mode is not model_fn_lib.ModeKeys.PREDICT:
-      model._make_train_function()  # pylint: disable=protected-access
+      if mode is model_fn_lib.ModeKeys.TRAIN:
+        model._make_train_function()  # pylint: disable=protected-access
+      else:
+        model._make_test_function()  # pylint: disable=protected-access
       loss = model.total_loss
 
       if model.metrics:
+        # TODO(fchollet): support stateful metrics
         eval_metric_ops = {}
         # When each metric maps to an output
         if isinstance(model.metrics, dict):
@@ -195,6 +380,10 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
     if mode is model_fn_lib.ModeKeys.TRAIN:
       train_op = model.train_function.updates_op
 
+    if not model._is_graph_network:
+      # Reset model state to original state,
+      # to avoid `model_fn` being destructive for the initial model argument.
+      _in_place_subclassed_model_state_restoration(keras_model)
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         predictions=predictions,
@@ -222,19 +411,18 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
   Returns:
     The model_fn for a keras Estimator.
   """
-  with ops.Graph().as_default() as g, g.device(estimator._device_fn):
-    random_seed.set_random_seed(estimator.config.tf_random_seed)
-    training_util.create_global_step()
-    model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
-                                   custom_objects)
-
-    if isinstance(model, models.Sequential):
-      model = model.model
-    # Load weights and save to checkpoint if there is no checkpoint
-    latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
-    if not latest_path:
-      with session.Session() as sess:
-        model.set_weights(keras_weights)
+  # Load weights and save to checkpoint if there is no checkpoint
+  latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
+  if not latest_path:
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(estimator.config.tf_random_seed)
+      training_util.create_global_step()
+      model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
+                                     custom_objects)
+      # save to checkpoint
+      with session.Session(config=estimator._session_config) as sess:
+        if keras_weights:
+          model.set_weights(keras_weights)
         # Make update ops and initialize all variables.
         if not model.train_function:
           # pylint: disable=protected-access
@@ -274,10 +462,11 @@ def model_to_estimator(keras_model=None,
   """
   if (not keras_model) and (not keras_model_path):
     raise ValueError(
-        'Either keras_model or keras_model_path needs to be provided.')
+        'Either `keras_model` or `keras_model_path` needs to be provided.')
   if keras_model and keras_model_path:
     raise ValueError(
-        'Please specity either keras_model or keras_model_path but not both.')
+        'Please specity either `keras_model` or `keras_model_path`, '
+        'but not both.')
 
   if not keras_model:
     if keras_model_path.startswith(
@@ -288,18 +477,53 @@ def model_to_estimator(keras_model=None,
     logging.info('Loading models from %s', keras_model_path)
     keras_model = models.load_model(keras_model_path)
   else:
-    logging.info('Using the Keras model from memory.')
+    logging.info('Using the Keras model provided.')
     keras_model = keras_model
 
-  if not hasattr(keras_model, 'optimizer'):
+  if not hasattr(keras_model, 'optimizer') or not keras_model.optimizer:
     raise ValueError(
-        'Given keras model has not been compiled yet. Please compile first '
-        'before creating the estimator.')
+        'The given keras model has not been compiled yet. Please compile first '
+        'before calling `model_to_estimator`.')
+
+  if isinstance(config, dict):
+    config = run_config_lib.RunConfig(**config)
 
-  keras_weights = keras_model.get_weights()
   keras_model_fn = _create_keras_model_fn(keras_model, custom_objects)
-  est = estimator_lib.Estimator(
+  estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
-  # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
-  _save_first_checkpoint(keras_model, est, custom_objects, keras_weights)
-  return est
+
+  # Check if we need to call get_weights:
+  if _any_variable_initalized():
+    keras_weights = keras_model.get_weights()
+    # Warn if config passed to estimator tries to update GPUOptions. If a
+    # session has already been created, the GPUOptions passed to the first
+    # session sticks.
+    if estimator._session_config.HasField('gpu_options'):
+      logging.warning(
+          'The Keras backend session has already been set. '
+          'The _session_config passed to model_to_estimator will not be used.')
+  else:
+    # Pass the config into keras backend's default session.
+    sess = session.Session(config=estimator._session_config)
+    K.set_session(sess)
+    keras_weights = None
+
+  if keras_model._is_graph_network:
+    # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
+    _save_first_checkpoint(keras_model,
+                           estimator,
+                           custom_objects,
+                           keras_weights)
+  elif keras_model.built:
+    logging.warning('You are creating an Estimator from a Keras model '
+                    'manually subclassed from `Model`, that was '
+                    'already called on some inputs (and thus already had '
+                    'weights). We are currently unable to preserve '
+                    'the model\'s state (its weights) '
+                    'as part of the estimator '
+                    'in this case. Be warned that the estimator '
+                    'has been created using '
+                    'a freshly initialized version of your model.\n'
+                    'Note that this doesn\'t affect the state of the '
+                    'model instance you passed as `keras_model` argument.')
+  return estimator
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 9fc48b4117e7ee2c717d5418754254aa02b82869..80fa87d0410871c30b8a7c46e7ff02bc81c96f3b 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -17,21 +17,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import json
 from math import log10
 import os
 import tempfile
 
 import numpy as np
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
+from tensorflow.python.keras._impl.keras.optimizers import SGD
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import rmsprop
 
 
 try:
@@ -62,12 +68,42 @@ def simple_functional_model():
   return model
 
 
-def get_resource_for_simple_model(is_sequential, is_evaluate):
-  model = simple_sequential_model(
-  ) if is_sequential else simple_functional_model()
-  if is_sequential:
+def simple_subclassed_model():
+
+  class SimpleModel(keras.Model):
+
+    def __init__(self):
+      super(SimpleModel, self).__init__()
+      self.dense1 = keras.layers.Dense(16, activation='relu')
+      self.dp = keras.layers.Dropout(0.1)
+      self.dense2 = keras.layers.Dense(_NUM_CLASS, activation='softmax')
+
+    def call(self, inputs):
+      x = self.dense1(inputs)
+      x = self.dp(x)
+      return self.dense2(x)
+
+  return SimpleModel()
+
+
+def get_resource_for_simple_model(model_type='sequential',
+                                  is_evaluate=False,):
+  if model_type == 'sequential':
+    model = simple_sequential_model()
     model.build()
-  input_name = model.input_names[0]
+  elif model_type == 'subclass':
+    model = simple_subclassed_model()
+  else:
+    assert model_type == 'functional'
+    model = simple_functional_model()
+
+  if model_type == 'subclass':
+    input_name = 'input_1'
+    output_name = 'output_1'
+  else:
+    input_name = model.input_names[0]
+    output_name = model.output_names[0]
+
   np.random.seed(_RANDOM_SEED)
   (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
       train_samples=_TRAIN_SIZE,
@@ -78,17 +114,19 @@ def get_resource_for_simple_model(is_sequential, is_evaluate):
   y_test = keras.utils.to_categorical(y_test)
 
   train_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: x_train},
-      y=y_train,
+      x=randomize_io_type(x_train, input_name),
+      y=randomize_io_type(y_train, output_name),
       shuffle=False,
       num_epochs=None,
       batch_size=16)
 
   evaluate_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: x_test}, y=y_test, num_epochs=1, shuffle=False)
+      x=randomize_io_type(x_test, input_name),
+      y=randomize_io_type(y_test, output_name),
+      num_epochs=1, shuffle=False)
 
   predict_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: x_test}, num_epochs=1, shuffle=False)
+      x=randomize_io_type(x_test, input_name), num_epochs=1, shuffle=False)
 
   inference_input_fn = evaluate_input_fn if is_evaluate else predict_input_fn
 
@@ -96,17 +134,29 @@ def get_resource_for_simple_model(is_sequential, is_evaluate):
                                      y_test), train_input_fn, inference_input_fn
 
 
+def randomize_io_type(array, name):
+  switch = np.random.random()
+  if switch > 0.5:
+    return array
+  else:
+    return {name: array}
+
+
 def multi_inputs_multi_outputs_model():
-  # test multi-input layer
   a = keras.layers.Input(shape=(16,), name='input_a')
   b = keras.layers.Input(shape=(16,), name='input_b')
+  m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m')
   dense = keras.layers.Dense(8, name='dense_1')
+
   a_2 = dense(a)
+  # Apply a mask
+  s_2 = keras.layers.Lambda(lambda k:
+                            K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2])
   b_2 = dense(b)
-  merged = keras.layers.concatenate([a_2, b_2], name='merge')
+  merged = keras.layers.concatenate([s_2, b_2], name='merge')
   c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
   d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
-  model = keras.models.Model(inputs=[a, b], outputs=[c, d])
+  model = keras.models.Model(inputs=[a, b, m], outputs=[c, d])
   model.compile(
       loss='categorical_crossentropy',
       optimizer='rmsprop',
@@ -132,10 +182,10 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       gfile.DeleteRecursively(self._base_dir)
 
   def test_train(self):
-    for is_sequential in [True, False]:
+    for model_type in ['sequential', 'functional']:
       keras_model, (_, _), (
           _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
-              is_sequential=is_sequential, is_evaluate=True)
+              model_type=model_type, is_evaluate=True)
       keras_model.compile(
           loss='categorical_crossentropy',
           optimizer='rmsprop',
@@ -153,10 +203,87 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       writer_cache.FileWriterCache.clear()
       gfile.DeleteRecursively(self._config.model_dir)
 
+  def test_train_with_tf_optimizer(self):
+    for model_type in ['sequential', 'functional']:
+      keras_model, (_, _), (
+          _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
+              model_type=model_type, is_evaluate=True)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=rmsprop.RMSPropOptimizer(1e-3),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+
+      with self.test_session():
+        est_keras = keras.estimator.model_to_estimator(
+            keras_model=keras_model,
+            # Also use dict config argument to get test coverage for that line.
+            config={
+                'tf_random_seed': _RANDOM_SEED,
+                'model_dir': self._base_dir,
+            })
+        before_eval_results = est_keras.evaluate(
+            input_fn=eval_input_fn, steps=1)
+        est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
+        after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
+        self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+      writer_cache.FileWriterCache.clear()
+      gfile.DeleteRecursively(self._config.model_dir)
+
+  def test_train_with_subclassed_model(self):
+    keras_model, (_, _), (
+        _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
+            model_type='subclass', is_evaluate=True)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    with self.test_session():
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
+      before_eval_results = est_keras.evaluate(
+          input_fn=eval_input_fn, steps=1)
+      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
+      after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
+      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+  def test_train_with_subclassed_model_with_existing_state(self):
+    keras_model, (_, _), (
+        _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
+            model_type='subclass', is_evaluate=True)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    with self.test_session():
+      # Create state
+      keras_model.train_on_batch(np.random.random((10,) + _INPUT_SIZE),
+                                 np.random.random((10, _NUM_CLASS)))
+      original_preds = keras_model.predict(np.ones((10,) + _INPUT_SIZE))
+
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
+      before_eval_results = est_keras.evaluate(
+          input_fn=eval_input_fn, steps=1)
+      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
+      after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
+      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+      # Check that original model state was not altered
+      preds = keras_model.predict(np.ones((10,) + _INPUT_SIZE))
+      self.assertAllClose(original_preds, preds, atol=1e-5)
+      # Check that the original model compilation did not break
+      keras_model.train_on_batch(np.random.random((10,) + _INPUT_SIZE),
+                                 np.random.random((10, _NUM_CLASS)))
+
   def test_evaluate(self):
     keras_model, (x_train, y_train), (
         x_test, y_test), _, eval_input_fn = get_resource_for_simple_model(
-            is_sequential=False, is_evaluate=True)
+            model_type='functional', is_evaluate=True)
 
     with self.test_session():
       metrics = [
@@ -198,7 +325,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     # Check that predict on a pretrained model yield the same result.
     keras_model, (x_train, y_train), (
         x_test, _), _, pred_input_fn = get_resource_for_simple_model(
-            is_sequential=True, is_evaluate=False)
+            model_type='sequential', is_evaluate=False)
 
     with self.test_session():
       keras_model.compile(
@@ -230,18 +357,27 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         test_samples=50,
         input_shape=(16,),
         num_classes=2)
+    np.random.seed(_RANDOM_SEED)
+    (input_m_train, _), (input_m_test, _) = testing_utils.get_test_data(
+        train_samples=_TRAIN_SIZE,
+        test_samples=50,
+        input_shape=(8,),
+        num_classes=2)
+
     c_train = keras.utils.to_categorical(c_train)
     c_test = keras.utils.to_categorical(c_test)
     d_train = keras.utils.to_categorical(d_train)
     d_test = keras.utils.to_categorical(d_test)
 
     def train_input_fn():
-      input_dict = {'input_a': a_train, 'input_b': b_train}
+      input_dict = {'input_a': a_train, 'input_b': b_train,
+                    'input_m': input_m_train > 0}
       output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
-      input_dict = {'input_a': a_test, 'input_b': b_test}
+      input_dict = {'input_a': a_test, 'input_b': b_test,
+                    'input_m': input_m_test > 0}
       output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
@@ -260,7 +396,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
 
     keras_model, (x_train, y_train), (
         x_test, _), _, pred_input_fn = get_resource_for_simple_model(
-            is_sequential=False, is_evaluate=False)
+            model_type='functional', is_evaluate=False)
 
     with self.test_session():
       keras_model.compile(
@@ -323,8 +459,9 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     model = simple_functional_model()
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
-    est_keras = keras.estimator.model_to_estimator(
-        keras_model=model, config=self._config)
+    with self.test_session():
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=model, config=self._config)
 
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -352,6 +489,68 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
           model_dir=tempfile.mkdtemp(dir=self._base_dir),
           custom_objects=custom_objects)
 
+  def test_tf_config(self):
+    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    tf_config = json.dumps({
+        'cluster': {
+            run_config_lib.TaskType.PS: ['localhost:1234'],
+            run_config_lib.TaskType.WORKER: ['localhost:1236'],
+            run_config_lib.TaskType.MASTER: ['localhost:1238']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.MASTER,
+            'index': 0
+        }
+    })
+    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
+      with self.test_session():
+        keras.estimator.model_to_estimator(
+            keras_model=keras_model,
+            model_dir=tempfile.mkdtemp(dir=self._base_dir))
+
+  def test_gpu_config(self):
+    with ops.Graph().as_default():
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['mse', keras.metrics.categorical_accuracy])
+
+      gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
+      sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
+      self._config._session_config = sess_config
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      self.assertEqual(
+          keras.backend.get_session()
+          ._config.gpu_options.per_process_gpu_memory_fraction,
+          gpu_options.per_process_gpu_memory_fraction)
+
+  def test_pretrained_weights(self):
+    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+    with self.test_session():
+      keras_model.train_on_batch(
+          np.random.random((10,) + _INPUT_SIZE),
+          np.random.random((10, _NUM_CLASS)))
+      weights = keras_model.get_weights()
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.set_weights(weights)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=SGD(lr=0.0001, momentum=0.9),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/_impl/keras/initializers.py
index 300bed5e1437074d010760c427c14f68e58ac363..ecb71d00e2c78ced6095aaa3a0180b454b04917a 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/_impl/keras/initializers.py
@@ -201,6 +201,8 @@ def deserialize(config, custom_objects=None):
 
 @tf_export('keras.initializers.get')
 def get(identifier):
+  if identifier is None:
+    return None
   if isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 15c3d14727a44c9726a1c2c86f47640bcc490e70..43aff67ef93c8ec495beafdd17c5557b6398671f 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -23,23 +23,21 @@ import numpy as np
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
-from tensorflow.python.layers import network as tf_network_layers
 from tensorflow.python.ops import nn
 from tensorflow.python.platform import test
 
 
 class KerasIntegrationTest(test.TestCase):
 
-  def test_vector_classification_declarative(self):
+  def test_vector_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -49,23 +47,22 @@ class KerasIntegrationTest(test.TestCase):
           keras.layers.Dense(y_train.shape[-1], activation='softmax')
       ])
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(10,),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(20,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.Dense(16, activation='relu')(inputs)
@@ -74,77 +71,78 @@ class KerasIntegrationTest(test.TestCase):
 
       model = keras.models.Model(inputs, outputs)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_temporal_classification_declarative(self):
+  def test_temporal_classification_sequential(self):
     with self.test_session():
-      np.random.seed(1336)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(4, 8),
+      np.random.seed(1337)
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(4, 10),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.LSTM(5, return_sequences=True,
                                   input_shape=x_train.shape[1:]))
       model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_image_classification_declarative(self):
+  def test_image_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(8, 8, 3),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(12, 12, 3),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.Conv2D(
-          8, 3,
+          4, 3,
+          padding='same',
           activation='relu',
           input_shape=x_train.shape[1:]))
-      model.add(keras.layers.BatchNormalization())
       model.add(keras.layers.Conv2D(
           8, 3,
           padding='same',
           activation='relu'))
-      model.add(keras.layers.GlobalMaxPooling2D())
+      model.add(keras.layers.Conv2D(
+          16, 3,
+          padding='same',
+          activation='relu'))
+      model.add(keras.layers.Flatten())
       model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_video_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(4, 8, 8, 3),
           num_classes=3)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.TimeDistributed(
@@ -160,22 +158,21 @@ class KerasIntegrationTest(test.TestCase):
                     optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.70)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       base_model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -190,27 +187,26 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.84)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(x_train.shape[1:])
       x = keras.layers.Dense(16,
@@ -226,12 +222,12 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_embedding_with_clipnorm(self):
     with self.test_session():
@@ -243,9 +239,9 @@ class KerasIntegrationTest(test.TestCase):
   def test_using_tf_layers_in_keras_sequential_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
 
@@ -255,39 +251,37 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_using_tf_layers_in_keras_functional_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
-      inputs = tf_network_layers.Input(shape=(10,))
+      inputs = keras.Input(shape=(10,))
       x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
       outputs = tf_core_layers.Dense(2, activation=nn.softmax)(x)
-      model = keras.models.Model(inputs, outputs)
+      model = keras.Model(inputs, outputs)
       model.summary()
 
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/__init__.py b/tensorflow/python/keras/_impl/keras/layers/__init__.py
index 81b2faf106925d974749af3149c5b40d10d49e99..d7bc859280eeedfb41d2c78d4042a181484f3d20 100644
--- a/tensorflow/python/keras/_impl/keras/layers/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/layers/__init__.py
@@ -27,6 +27,7 @@ from tensorflow.python.keras._impl.keras.layers.advanced_activations import *
 from tensorflow.python.keras._impl.keras.layers.convolutional import *
 from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import *
 from tensorflow.python.keras._impl.keras.layers.core import *
+from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import *
 from tensorflow.python.keras._impl.keras.layers.embeddings import *
 from tensorflow.python.keras._impl.keras.layers.local import *
 from tensorflow.python.keras._impl.keras.layers.merge import *
@@ -37,4 +38,3 @@ from tensorflow.python.keras._impl.keras.layers.recurrent import *
 from tensorflow.python.keras._impl.keras.layers.serialization import deserialize
 from tensorflow.python.keras._impl.keras.layers.serialization import serialize
 from tensorflow.python.keras._impl.keras.layers.wrappers import *
-
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index 7cac17c51a9adcf8fc62154b6633de60bab18387..89931db3c0786b6869379e0d140e8a19e5e46d5f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -25,7 +25,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -63,7 +64,7 @@ class LeakyReLU(Layer):
     base_config = super(LeakyReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -118,7 +119,7 @@ class PReLU(Layer):
     else:
       self.shared_axes = list(shared_axes)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     param_shape = list(input_shape[1:])
     self.param_broadcast = [False] * len(param_shape)
@@ -146,7 +147,7 @@ class PReLU(Layer):
     if K.backend() == 'theano':
       neg = (
           K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - K.abs(inputs)) * 0.5)
+          (inputs - math_ops.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
@@ -161,7 +162,7 @@ class PReLU(Layer):
     base_config = super(PReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -200,7 +201,7 @@ class ELU(Layer):
     base_config = super(ELU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -232,14 +233,15 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
+    return inputs * math_ops.cast(
+        math_ops.greater(inputs, self.theta), K.floatx())
 
   def get_config(self):
     config = {'theta': float(self.theta)}
     base_config = super(ThresholdedReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -273,6 +275,6 @@ class Softmax(Layer):
     base_config = super(Softmax, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 162ae6c28f1afae1dd8aaf70213b808d9ad9598f..9971f12773233cf85217913f490dbde836b0b3cb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -19,9 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
@@ -37,12 +38,233 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
 from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import convolutional as tf_convolutional_layers
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Conv(Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    kernel_constraint: Optional projection function to be applied to the
+        kernel after being updated by an `Optimizer` (e.g. used to implement
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(Conv, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(
+        kernel_size, rank, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, rank, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(
+        dilation_rate, rank, 'dilation_rate')
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=self.rank + 2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    self._convolution_op = nn_ops.Convolution(
+        input_shape,
+        filter_shape=self.kernel.get_shape(),
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format,
+                                                   self.rank + 2))
+    self.built = True
+
+  def call(self, inputs):
+    outputs = self._convolution_op(inputs, self.kernel)
+
+    if self.use_bias:
+      if self.data_format == 'channels_first':
+        if self.rank == 1:
+          # nn.bias_add does not accept a 1D input tensor.
+          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
+          outputs += bias
+        if self.rank == 2:
+          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
+        if self.rank == 3:
+          # As of Mar 2017, direct addition is significantly slower than
+          # bias_add when computing gradients. To use bias_add, we collapse Z
+          # and Y into a single dimension to obtain a 4D input tensor.
+          outputs_shape = outputs.shape.as_list()
+          if outputs_shape[0] is None:
+            outputs_shape[0] = -1
+          outputs_4d = array_ops.reshape(outputs,
+                                         [outputs_shape[0], outputs_shape[1],
+                                          outputs_shape[2] * outputs_shape[3],
+                                          outputs_shape[4]])
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
+          outputs = array_ops.reshape(outputs_4d, outputs_shape)
+      else:
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
-class Conv1D(tf_convolutional_layers.Conv1D, Layer):
+class Conv1D(Conv):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -73,6 +295,8 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
           where the model should not violate the temporal order.
           See [WaveNet: A Generative Model for Raw Audio, section
             2.1](https://arxiv.org/abs/1609.03499).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
       dilation_rate: an integer or tuple/list of a single integer, specifying
           the dilation rate to use for dilated convolution.
           Currently, specifying any `dilation_rate` value != 1 is
@@ -104,6 +328,7 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                kernel_size,
                strides=1,
                padding='valid',
+               data_format='channels_last',
                dilation_rate=1,
                activation=None,
                use_bias=True,
@@ -116,11 +341,12 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                bias_constraint=None,
                **kwargs):
     super(Conv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
-        data_format='channels_last',
+        data_format=data_format,
         dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
@@ -133,30 +359,9 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
-class Conv2D(tf_convolutional_layers.Conv2D, Layer):
+class Conv2D(Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -246,9 +451,8 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -266,31 +470,9 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
-class Conv3D(tf_convolutional_layers.Conv3D, Layer):
+class Conv3D(Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -387,9 +569,8 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3D, self).__init__(
+        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -407,32 +588,10 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2DTranspose',
            'keras.layers.Convolution2DTranspose')
-class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
+class Conv2DTranspose(Conv2D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -528,8 +687,6 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -547,31 +704,124 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2DTranspose, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if len(input_shape) != 4:
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_height, out_width)
+      strides = (1, 1, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_height, out_width, self.filters)
+      strides = (1, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv2d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.Conv3DTranspose',
            'keras.layers.Convolution3DTranspose')
-class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
+class Conv3DTranspose(Conv3D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -678,8 +928,6 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -697,6 +945,314 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if len(input_shape) != 5:
+      raise ValueError('Inputs should have rank 5, received input shape:',
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined, found None: ' + str(input_shape))
+    input_dim = int(input_shape[channel_axis])
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+    self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim})
+
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(
+          'bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    self.input_spec = InputSpec(ndim=5, axes={c_axis: inputs_shape[c_axis]})
+
+    depth = inputs_shape[d_axis]
+    height = inputs_shape[h_axis]
+    width = inputs_shape[w_axis]
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_depth = conv_utils.deconv_output_length(depth,
+                                                kernel_d,
+                                                self.padding,
+                                                stride_d)
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_depth, out_height,
+                      out_width)
+      strides = (1, 1, stride_d, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_depth, out_height, out_width,
+                      self.filters)
+      strides = (1, stride_d, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv3d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=5),
+        padding=self.padding.upper())
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[d_axis] = conv_utils.deconv_output_length(out_shape[d_axis],
+                                                          kernel_d,
+                                                          self.padding,
+                                                          stride_d)
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
+      if self.data_format == 'channels_first':
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1],
+            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+        ])
+      else:
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
+            outputs_shape[3], outputs_shape[4]
+        ])
+      outputs_4d = nn.bias_add(
+          outputs_4d,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+      outputs = array_ops.reshape(outputs_4d, outputs_shape)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[d_axis] = conv_utils.deconv_output_length(
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
+
+
+class SeparableConv(Conv):
+  """Abstract base layer for separable nD convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A tuple or list of integers specifying the spatial
+      dimensions of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: A tuple or list of integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(SeparableConv, self).__init__(
+        rank=rank,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        bias_constraint=bias_constraint,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.pointwise_initializer = initializers.get(pointwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.pointwise_regularizer = regularizers.get(pointwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.pointwise_constraint = constraints.get(pointwise_constraint)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    depthwise_kernel_shape = self.kernel_size + (input_dim,
+                                                 self.depth_multiplier)
+    pointwise_kernel_shape = (
+        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
+
+    self.depthwise_kernel = self.add_variable(
+        name='depthwise_kernel',
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    self.pointwise_kernel = self.add_variable(
+        name='pointwise_kernel',
+        shape=pointwise_kernel_shape,
+        initializer=self.pointwise_initializer,
+        regularizer=self.pointwise_regularizer,
+        constraint=self.pointwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    raise NotImplementedError
+
   def get_config(self):
     config = {
         'filters': self.filters,
@@ -704,24 +1260,34 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         'strides': self.strides,
         'padding': self.padding,
         'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
         'activation': activations.serialize(self.activation),
         'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
         'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
         'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
         'bias_constraint': constraints.serialize(self.bias_constraint)
     }
-    base_config = super(Conv3DTranspose, self).get_config()
+    base_config = super(SeparableConv, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.SeparableConv1D',
            'keras.layers.SeparableConvolution1D')
-class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+class SeparableConv1D(SeparableConv):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -801,15 +1367,15 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -824,44 +1390,46 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides * 2 + (1,)
+      spatial_start_dim = 1
+    else:
+      strides = (1, 1) + self.strides * 2
+      spatial_start_dim = 2
+
+    # Explicitly broadcast inputs and kernels to 4D.
+    # TODO(fchollet): refactor when a native separable_conv1d op is available.
+    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
+    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
+    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
+    dilation_rate = (1,) + self.dilation_rate
+
+    outputs = nn.separable_conv2d(
+        inputs,
+        depthwise_kernel,
+        pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
 
 
 @tf_export('keras.layers.SeparableConv2D',
            'keras.layers.SeparableConvolution2D')
-class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
+class SeparableConv2D(SeparableConv):
   """Depthwise separable 2D convolution.
 
   Separable convolutions consist in first performing
@@ -958,15 +1526,15 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -981,47 +1549,224 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+  def call(self, inputs):
+    # Apply the actual ops.
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides + (1,)
+    else:
+      strides = (1, 1) + self.strides
+    outputs = nn.separable_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        self.pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=self.dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+
+@tf_export('keras.layers.DepthwiseConv2D')
+class DepthwiseConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  Depthwise Separable convolutions consists in performing
+  just the first step in a depthwise spatial convolution
+  (which acts on each input channel separately).
+  The `depth_multiplier` argument controls how many
+  output channels are generated per input channel in the depthwise step.
+
+  Arguments:
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+        width and height of the 2D convolution window.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+        specifying the strides of the convolution along the width and height.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: one of `'valid'` or `'same'` (case-insensitive).
+    depth_multiplier: The number of depthwise convolution output channels
+        for each input channel.
+        The total number of depthwise convolution output
+        channels will be equal to `filters_in * depth_multiplier`.
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, height, width, channels)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, channels, height, width)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be 'channels_last'.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. 'linear' activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    depthwise_initializer: Initializer for the depthwise kernel matrix.
+    bias_initializer: Initializer for the bias vector.
+    depthwise_regularizer: Regularizer function applied to
+        the depthwise kernel matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+        the output of the layer (its 'activation').
+    depthwise_constraint: Constraint function applied to
+        the depthwise kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+    4D tensor with shape:
+    `[batch, channels, rows, cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+  Output shape:
+    4D tensor with shape:
+    `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               depth_multiplier=1,
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(DepthwiseConv2D, self).__init__(
+        filters=None,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  def build(self, input_shape):
+    if len(input_shape) < 4:
+      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+                       'Received input shape:', str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = 3
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs to '
+                       '`DepthwiseConv2D` '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    depthwise_kernel_shape = (self.kernel_size[0],
+                              self.kernel_size[1],
+                              input_dim,
+                              self.depth_multiplier)
+
+    self.depthwise_kernel = self.add_weight(
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        name='depthwise_kernel',
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs, training=None):
+    outputs = backend.depthwise_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        strides=self.strides,
+        padding=self.padding,
+        dilation_rate=self.dilation_rate,
+        data_format=self.data_format)
+
+    if self.bias:
+      outputs = backend.bias_add(
+          outputs,
+          self.bias,
+          data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+
+    return outputs
+
+  @tf_utils.shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+      out_filters = input_shape[1] * self.depth_multiplier
+    elif self.data_format == 'channels_last':
+      rows = input_shape[1]
+      cols = input_shape[2]
+      out_filters = input_shape[3] * self.depth_multiplier
+
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return (input_shape[0], out_filters, rows, cols)
+    elif self.data_format == 'channels_last':
+      return (input_shape[0], rows, cols, out_filters)
+
   def get_config(self):
-    config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'dilation_rate':
-            self.dilation_rate,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    config = super(DepthwiseConv2D, self).get_config()
+    config.pop('filters')
+    config.pop('kernel_initializer')
+    config.pop('kernel_regularizer')
+    config.pop('kernel_constraint')
+    config['depth_multiplier'] = self.depth_multiplier
+    config['depthwise_initializer'] = initializers.serialize(
+        self.depthwise_initializer)
+    config['depthwise_regularizer'] = regularizers.serialize(
+        self.depthwise_regularizer)
+    config['depthwise_constraint'] = constraints.serialize(
+        self.depthwise_constraint)
+    return config
 
 
 @tf_export('keras.layers.UpSampling1D')
@@ -1051,7 +1796,7 @@ class UpSampling1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], size, input_shape[2]])
 
   def call(self, inputs):
-    output = K.repeat_elements(inputs, self.size, axis=1)
+    output = backend.repeat_elements(inputs, self.size, axis=1)
     return output
 
   def get_config(self):
@@ -1120,7 +1865,8 @@ class UpSampling2D(Layer):
           [input_shape[0], height, width, input_shape[3]])
 
   def call(self, inputs):
-    return K.resize_images(inputs, self.size[0], self.size[1], self.data_format)
+    return backend.resize_images(
+        inputs, self.size[0], self.size[1], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1192,8 +1938,8 @@ class UpSampling3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.resize_volumes(inputs, self.size[0], self.size[1], self.size[2],
-                            self.data_format)
+    return backend.resize_volumes(
+        inputs, self.size[0], self.size[1], self.size[2], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1234,7 +1980,7 @@ class ZeroPadding1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
   def call(self, inputs):
-    return K.temporal_padding(inputs, padding=self.padding)
+    return backend.temporal_padding(inputs, padding=self.padding)
 
   def get_config(self):
     config = {'padding': self.padding}
@@ -1335,7 +2081,7 @@ class ZeroPadding2D(Layer):
           [input_shape[0], rows, cols, input_shape[3]])
 
   def call(self, inputs):
-    return K.spatial_2d_padding(
+    return backend.spatial_2d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
@@ -1453,7 +2199,7 @@ class ZeroPadding3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.spatial_3d_padding(
+    return backend.spatial_3d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index d2792b9636214d21e9658018f853fb6c0808abb4..be25bbc043a3beb14b3afbfdd342aace2403e5fd 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
 """Convolutional-recurrent layers.
 """
 from __future__ import absolute_import
@@ -26,179 +27,456 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
-from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
+from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask
+from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-class ConvRecurrent2D(Recurrent):
-  """Abstract base class for convolutional recurrent layers.
-
-  Do not use in a model -- it's not a functional layer!
+class ConvRNN2D(RNN):
+  """Base class for convolutional-recurrent layers.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of n integers, specifying the
-          dimensions of the convolution window.
-      strides: An integer or tuple/list of n integers,
-          specifying the strides of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of n integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
+    cell: A RNN cell instance. A RNN cell is a class that has:
+        - a `call(input_at_t, states_at_t)` method, returning
+            `(output_at_t, states_at_t_plus_1)`. The call method of the
+            cell can also take the optional argument `constants`, see
+            section "Note on passing external constants" below.
+        - a `state_size` attribute. This can be a single integer
+            (single state) in which case it is
+            the number of channels of the recurrent state
+            (which should be the same as the number of channels of the cell
+            output). This can also be a list/tuple of integers
+            (one size per state). In this case, the first entry
+            (`state_size[0]`) should be the same as
+            the size of the cell output.
+    return_sequences: Boolean. Whether to return the last output.
+        in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+        in addition to the output.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards and return the
+        reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    input_shape: Use this argument to specify the shape of the
+        input when this layer is the first one in a model.
 
   Input shape:
-      5D tensor with shape `(num_samples, timesteps, channels, rows, cols)`.
+    5D tensor with shape:
+    `(samples, timesteps, channels, rows, cols)`
+    if data_format='channels_first' or 5D tensor with shape:
+    `(samples, timesteps, rows, cols, channels)`
+    if data_format='channels_last'.
 
   Output shape:
-      - if `return_sequences`: 5D tensor with shape
-          `(num_samples, timesteps, channels, rows, cols)`.
-      - else, 4D tensor with shape `(num_samples, channels, rows, cols)`.
-
-  # Masking
-      This layer supports masking for input data with a variable number
-      of timesteps. To introduce masks to your data,
-      use an `Embedding` layer with the `mask_zero` parameter
-      set to `True`.
-      **Note:** for the time being, masking is only supported with Theano.
-
-  # Note on using statefulness in RNNs
-      You can set RNN layers to be 'stateful', which means that the states
-      computed for the samples in one batch will be reused as initial states
-      for the samples in the next batch.
-      This assumes a one-to-one mapping between
-      samples in different successive batches.
-
-      To enable statefulness:
-          - specify `stateful=True` in the layer constructor.
-          - specify a fixed batch size for your model, by passing
-              a `batch_input_size=(...)` to the first layer in your model.
-              This is the expected shape of your inputs *including the batch
-              size*.
-              It should be a tuple of integers, e.g. `(32, 10, 100)`.
-
-      To reset the states of your model, call `.reset_states()` on either
-      a specific layer, or on your entire model.
+    - if `return_state`: a list of tensors. The first tensor is
+        the output. The remaining tensors are the last states,
+        each 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to padding.
+    - if `return_sequences`: 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+    - else, 4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+
+  Masking:
+    This layer supports masking for input data with a variable number
+    of timesteps. To introduce masks to your data,
+    use an Embedding layer with the `mask_zero` parameter
+    set to `True`.
+
+  Note on using statefulness in RNNs:
+    You can set RNN layers to be 'stateful', which means that the states
+    computed for the samples in one batch will be reused as initial states
+    for the samples in the next batch. This assumes a one-to-one mapping
+    between samples in different successive batches.
+    To enable statefulness:
+        - specify `stateful=True` in the layer constructor.
+        - specify a fixed batch size for your model, by passing
+             - if sequential model:
+                `batch_input_shape=(...)` to the first layer in your model.
+             - if functional model with 1 or more Input layers:
+                `batch_shape=(...)` to all the first layers in your model.
+                This is the expected shape of your inputs
+                *including the batch size*.
+                It should be a tuple of integers,
+                e.g. `(32, 10, 100, 100, 32)`.
+                Note that the number of rows and columns should be specified
+                too.
+        - specify `shuffle=False` when calling fit().
+    To reset the states of your model, call `.reset_states()` on either
+    a specific layer, or on your entire model.
+
+  Note on specifying the initial state of RNNs:
+    You can specify the initial state of RNN layers symbolically by
+    calling them with the keyword argument `initial_state`. The value of
+    `initial_state` should be a tensor or list of tensors representing
+    the initial state of the RNN layer.
+    You can specify the initial state of RNN layers numerically by
+    calling `reset_states` with the keyword argument `states`. The value of
+    `states` should be a numpy array or list of numpy arrays representing
+    the initial state of the RNN layer.
+
+  Note on passing external constants to RNNs:
+    You can pass "external" constants to the cell using the `constants`
+    keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
+    requires that the `cell.call` method accepts the same keyword argument
+    `constants`. Such constants can be used to condition the cell
+    transformation on additional static inputs (not changing over time),
+    a.k.a. an attention mechanism.
   """
 
   def __init__(self,
-               filters,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               data_format=None,
-               dilation_rate=(1, 1),
+               cell,
                return_sequences=False,
+               return_state=False,
                go_backwards=False,
                stateful=False,
+               unroll=False,
                **kwargs):
-    super(ConvRecurrent2D, self).__init__(**kwargs)
-    self.filters = filters
-    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
-    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
-                                                    'dilation_rate')
-    self.return_sequences = return_sequences
-    self.go_backwards = go_backwards
-    self.stateful = stateful
+    if unroll:
+      raise TypeError('Unrolling isn\'t possible with '
+                      'convolutional RNNs.')
+    if isinstance(cell, (list, tuple)):
+      # The StackedConvRNN2DCells isn't implemented yet.
+      raise TypeError('It is not possible at the moment to'
+                      'stack convolutional cells.')
+    super(ConvRNN2D, self).__init__(cell,
+                                    return_sequences,
+                                    return_state,
+                                    go_backwards,
+                                    stateful,
+                                    unroll,
+                                    **kwargs)
     self.input_spec = [InputSpec(ndim=5)]
-    self.state_spec = None
+    self.states = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    if self.data_format == 'channels_first':
+
+    cell = self.cell
+    if cell.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
-    elif self.data_format == 'channels_last':
+    elif cell.data_format == 'channels_last':
       rows = input_shape[2]
       cols = input_shape[3]
-    rows = conv_utils.conv_output_length(
-        rows,
-        self.kernel_size[0],
-        padding=self.padding,
-        stride=self.strides[0],
-        dilation=self.dilation_rate[0])
-    cols = conv_utils.conv_output_length(
-        cols,
-        self.kernel_size[1],
-        padding=self.padding,
-        stride=self.strides[1],
-        dilation=self.dilation_rate[1])
+    rows = conv_utils.conv_output_length(rows,
+                                         cell.kernel_size[0],
+                                         padding=cell.padding,
+                                         stride=cell.strides[0],
+                                         dilation=cell.dilation_rate[0])
+    cols = conv_utils.conv_output_length(cols,
+                                         cell.kernel_size[1],
+                                         padding=cell.padding,
+                                         stride=cell.strides[1],
+                                         dilation=cell.dilation_rate[1])
+
+    if cell.data_format == 'channels_first':
+      output_shape = input_shape[:2] + (cell.filters, rows, cols)
+    elif cell.data_format == 'channels_last':
+      output_shape = input_shape[:2] + (rows, cols, cell.filters)
+
+    if not self.return_sequences:
+      output_shape = output_shape[:1] + output_shape[2:]
+
+    if self.return_state:
+      output_shape = [output_shape]
+      if cell.data_format == 'channels_first':
+        output_shape += [(input_shape[0], cell.filters, rows, cols)
+                         for _ in range(2)]
+      elif cell.data_format == 'channels_last':
+        output_shape += [(input_shape[0], rows, cols, cell.filters)
+                         for _ in range(2)]
+    return output_shape
+
+  @tf_utils.shape_type_conversion
+  def build(self, input_shape):
+    # Note input_shape will be list of shapes of initial states and
+    # constants if these are passed in __call__.
+    if self._num_constants is not None:
+      constants_shape = input_shape[-self._num_constants:]
+    else:
+      constants_shape = None
+
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:5])
+
+    # allow cell (if layer) to build before we set or validate state_spec
+    if isinstance(self.cell, Layer):
+      step_input_shape = (input_shape[0],) + input_shape[2:]
+      if constants_shape is not None:
+        self.cell.build([step_input_shape] + constants_shape)
+      else:
+        self.cell.build(step_input_shape)
+
+    # set or validate state_spec
+    if hasattr(self.cell.state_size, '__len__'):
+      state_size = list(self.cell.state_size)
+    else:
+      state_size = [self.cell.state_size]
+
+    if self.state_spec is not None:
+      # initial_state was passed in call, check compatibility
+      if self.cell.data_format == 'channels_first':
+        ch_dim = 1
+      elif self.cell.data_format == 'channels_last':
+        ch_dim = 3
+      if [spec.shape[ch_dim] for spec in self.state_spec] != state_size:
+        raise ValueError(
+            'An initial_state was passed that is not compatible with '
+            '`cell.state_size`. Received `state_spec`={}; '
+            'However `cell.state_size` is '
+            '{}'.format([spec.shape for spec in self.state_spec],
+                        self.cell.state_size))
+    else:
+      if self.cell.data_format == 'channels_first':
+        self.state_spec = [InputSpec(shape=(None, dim, None, None))
+                           for dim in state_size]
+      elif self.cell.data_format == 'channels_last':
+        self.state_spec = [InputSpec(shape=(None, None, None, dim))
+                           for dim in state_size]
+    if self.stateful:
+      self.reset_states()
+    self.built = True
+
+  def get_initial_state(self, inputs):
+    # (samples, timesteps, rows, cols, filters)
+    initial_state = K.zeros_like(inputs)
+    # (samples, rows, cols, filters)
+    initial_state = K.sum(initial_state, axis=1)
+    shape = list(self.cell.kernel_shape)
+    shape[-1] = self.cell.filters
+    initial_state = self.cell.input_conv(initial_state,
+                                         K.zeros(tuple(shape)),
+                                         padding=self.cell.padding)
+
+    if hasattr(self.cell.state_size, '__len__'):
+      return [initial_state for _ in self.cell.state_size]
+    else:
+      return [initial_state]
+
+  def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
+    inputs, initial_state, constants = self._standardize_args(
+        inputs, initial_state, constants)
+
+    if initial_state is None and constants is None:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+    # If any of `initial_state` or `constants` are specified and are Keras
+    # tensors, then add them to the inputs and temporarily modify the
+    # input_spec to include them.
+
+    additional_inputs = []
+    additional_specs = []
+    if initial_state is not None:
+      kwargs['initial_state'] = initial_state
+      additional_inputs += initial_state
+      self.state_spec = []
+      for state in initial_state:
+        shape = K.int_shape(state)
+        self.state_spec.append(InputSpec(shape=shape))
+
+      additional_specs += self.state_spec
+    if constants is not None:
+      kwargs['constants'] = constants
+      additional_inputs += constants
+      self.constants_spec = [InputSpec(shape=K.int_shape(constant))
+                             for constant in constants]
+      self._num_constants = len(constants)
+      additional_specs += self.constants_spec
+    # at this point additional_inputs cannot be empty
+    for tensor in additional_inputs:
+      if K.is_keras_tensor(tensor) != K.is_keras_tensor(additional_inputs[0]):
+        raise ValueError('The initial state or constants of an RNN'
+                         ' layer cannot be specified with a mix of'
+                         ' Keras tensors and non-Keras tensors')
+
+    if K.is_keras_tensor(additional_inputs[0]):
+      # Compute the full input spec, including state and constants
+      full_input = [inputs] + additional_inputs
+      full_input_spec = self.input_spec + additional_specs
+      # Perform the call with temporarily replaced input_spec
+      original_input_spec = self.input_spec
+      self.input_spec = full_input_spec
+      output = super(ConvRNN2D, self).__call__(full_input, **kwargs)
+      self.input_spec = original_input_spec
+      return output
+    else:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           initial_state=None,
+           constants=None):
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' +
+                       str(len(initial_state)) +
+                       ' initial states.')
+    timesteps = K.int_shape(inputs)[1]
+
+    kwargs = {}
+    if generic_utils.has_arg(self.cell.call, 'training'):
+      kwargs['training'] = training
+
+    if constants:
+      if not generic_utils.has_arg(self.cell.call, 'constants'):
+        raise ValueError('RNN cell does not support constants')
+
+      def step(inputs, states):
+        constants = states[-self._num_constants:]
+        states = states[:-self._num_constants]
+        return self.cell.call(inputs, states, constants=constants,
+                              **kwargs)
+    else:
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+    last_output, outputs, states = K.rnn(step,
+                                         inputs,
+                                         initial_state,
+                                         constants=constants,
+                                         go_backwards=self.go_backwards,
+                                         mask=mask,
+                                         input_length=timesteps)
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(K.update(self.states[i], states[i]))
+      self.add_update(updates, inputs=True)
+
     if self.return_sequences:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
-                        cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], input_shape[1], rows, cols,
-                        self.filters)
+      output = outputs
     else:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], self.filters, rows, cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], rows, cols, self.filters)
+      output = last_output
+
+    # Properly set learning phase
+    if getattr(last_output, '_uses_learning_phase', False):
+      output._uses_learning_phase = True
 
     if self.return_state:
-      if self.data_format == 'channels_first':
-        output_shape = [output_shape] + [
-            (input_shape[0], self.filters, rows, cols) for _ in range(2)
-        ]
-      elif self.data_format == 'channels_last':
-        output_shape = [output_shape] + [
-            (input_shape[0], rows, cols, self.filters) for _ in range(2)
-        ]
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      else:
+        states = list(states)
+      return [output] + states
+    else:
+      return output
 
-    return output_shape
+  def reset_states(self, states=None):
+    if not self.stateful:
+      raise AttributeError('Layer must be stateful.')
+    input_shape = self.input_spec[0].shape
+    state_shape = self.compute_output_shape(input_shape)
+    if self.return_state:
+      state_shape = state_shape[0]
+    if self.return_sequences:
+      state_shape = state_shape[:1].concatenate(state_shape[2:])
+    if None in state_shape:
+      raise ValueError('If a RNN is stateful, it needs to know '
+                       'its batch size. Specify the batch size '
+                       'of your input tensors: \n'
+                       '- If using a Sequential model, '
+                       'specify the batch size by passing '
+                       'a `batch_input_shape` '
+                       'argument to your first layer.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a '
+                       '`batch_shape` argument to your Input layer.\n'
+                       'The same thing goes for the number of rows and '
+                       'columns.')
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'return_sequences': self.return_sequences,
-        'go_backwards': self.go_backwards,
-        'stateful': self.stateful
-    }
-    base_config = super(ConvRecurrent2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    # helper function
+    def get_tuple_shape(nb_channels):
+      result = list(state_shape)
+      if self.cell.data_format == 'channels_first':
+        result[1] = nb_channels
+      elif self.cell.data_format == 'channels_last':
+        result[3] = nb_channels
+      else:
+        raise KeyError
+      return tuple(result)
 
+    # initialize state if None
+    if self.states[0] is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        self.states = [K.zeros(get_tuple_shape(dim))
+                       for dim in self.cell.state_size]
+      else:
+        self.states = [K.zeros(get_tuple_shape(self.cell.state_size))]
+    elif states is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        for state, dim in zip(self.states, self.cell.state_size):
+          K.set_value(state, np.zeros(get_tuple_shape(dim)))
+      else:
+        K.set_value(self.states[0],
+                    np.zeros(get_tuple_shape(self.cell.state_size)))
+    else:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      if len(states) != len(self.states):
+        raise ValueError('Layer ' + self.name + ' expects ' +
+                         str(len(self.states)) + ' states, ' +
+                         'but it received ' + str(len(states)) +
+                         ' state values. Input received: ' + str(states))
+      for index, (value, state) in enumerate(zip(states, self.states)):
+        if hasattr(self.cell.state_size, '__len__'):
+          dim = self.cell.state_size[index]
+        else:
+          dim = self.cell.state_size
+        if value.shape != get_tuple_shape(dim):
+          raise ValueError('State ' + str(index) +
+                           ' is incompatible with layer ' +
+                           self.name + ': expected shape=' +
+                           str(get_tuple_shape(dim)) +
+                           ', found shape=' + str(value.shape))
+        # TODO(anjalisridhar): consider batch calls to `set_value`.
+        K.set_value(state, value)
 
-@tf_export('keras.layers.ConvLSTM2D')
-class ConvLSTM2D(ConvRecurrent2D):
-  """Convolutional LSTM.
 
-  It is similar to an LSTM layer, but the input transformations
-  and recurrent transformations are both convolutional.
+class ConvLSTM2DCell(Layer):
+  """Cell class for the ConvLSTM2D layer.
 
-  Arguments:
+  # Arguments
       filters: Integer, the dimensionality of the output space
           (i.e. the number of output filters in the convolution).
       kernel_size: An integer or tuple/list of n integers, specifying the
@@ -210,11 +488,6 @@ class ConvLSTM2D(ConvRecurrent2D):
       padding: One of `"valid"` or `"same"` (case-insensitive).
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -229,71 +502,32 @@ class ConvLSTM2D(ConvRecurrent2D):
           for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
+          used for the linear transformation of the recurrent state.
       bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Use in combination with `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et
-            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+          This is recommended in [Jozefowicz et al.]
+          (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
           the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
           the `recurrent_kernel` weights matrix.
       bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
           the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
           the `recurrent_kernel` weights matrix.
       bias_constraint: Constraint function applied to the bias vector.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-
-  Input shape:
-      - if data_format='channels_first'
-          5D tensor with shape:
-          `(samples,time, channels, rows, cols)`
-      - if data_format='channels_last'
-          5D tensor with shape:
-          `(samples,time, rows, cols, channels)`
-
-   Output shape:
-      - if `return_sequences`
-           - if data_format='channels_first'
-              5D tensor with shape:
-              `(samples, time, filters, output_row, output_col)`
-           - if data_format='channels_last'
-              5D tensor with shape:
-              `(samples, time, output_row, output_col, filters)`
-      - else
-          - if data_format ='channels_first'
-              4D tensor with shape:
-              `(samples, filters, output_row, output_col)`
-          - if data_format='channels_last'
-              4D tensor with shape:
-              `(samples, output_row, output_col, filters)`
-          where o_row and o_col depend on the shape of the filter and
-          the padding
-
-  Raises:
-      ValueError: in case of invalid constructor arguments.
-
   """
 
   def __init__(self,
@@ -313,27 +547,20 @@ class ConvLSTM2D(ConvRecurrent2D):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
-               return_sequences=False,
-               go_backwards=False,
-               stateful=False,
                dropout=0.,
                recurrent_dropout=0.,
                **kwargs):
-    super(ConvLSTM2D, self).__init__(
-        filters,
-        kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        return_sequences=return_sequences,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        **kwargs)
+    super(ConvLSTM2DCell, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
+                                                    'dilation_rate')
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
     self.use_bias = use_bias
@@ -346,7 +573,6 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -354,45 +580,29 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
+    self.state_size = (self.filters, self.filters)
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-    if self.stateful:
-      self.reset_states()
-    else:
-      # initial states: 2 all-zero tensor of shape (filters)
-      self.states = [None, None]
 
     if self.data_format == 'channels_first':
-      channel_axis = 2
+      channel_axis = 1
     else:
       channel_axis = -1
     if input_shape[channel_axis] is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = input_shape[channel_axis]
-    state_shape = [None] * 4
-    state_shape[channel_axis] = input_dim
-    state_shape = tuple(state_shape)
-    self.state_spec = [
-        InputSpec(shape=state_shape),
-        InputSpec(shape=state_shape)
-    ]
     kernel_shape = self.kernel_size + (input_dim, self.filters * 4)
     self.kernel_shape = kernel_shape
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
 
-    self.kernel = self.add_weight(
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+    self.kernel = self.add_weight(shape=kernel_shape,
+                                  initializer=self.kernel_initializer,
+                                  name='kernel',
+                                  regularizer=self.kernel_regularizer,
+                                  constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
         shape=recurrent_kernel_shape,
         initializer=self.recurrent_initializer,
@@ -400,25 +610,24 @@ class ConvLSTM2D(ConvRecurrent2D):
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.filters * 4,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      self.bias = self.add_weight(shape=(self.filters * 4,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
       if self.unit_forget_bias:
         bias_value = np.zeros((self.filters * 4,))
-        bias_value[self.filters:self.filters * 2] = 1.
+        bias_value[self.filters: self.filters * 2] = 1.
         K.set_value(self.bias, bias_value)
     else:
       self.bias = None
 
     self.kernel_i = self.kernel[:, :, :, :self.filters]
     self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters]
-    self.kernel_f = self.kernel[:, :, :, self.filters:self.filters * 2]
+    self.kernel_f = self.kernel[:, :, :, self.filters: self.filters * 2]
     self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters:
                                                     self.filters * 2]
-    self.kernel_c = self.kernel[:, :, :, self.filters * 2:self.filters * 3]
+    self.kernel_c = self.kernel[:, :, :, self.filters * 2: self.filters * 3]
     self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2:
                                                     self.filters * 3]
     self.kernel_o = self.kernel[:, :, :, self.filters * 3:]
@@ -426,8 +635,8 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     if self.use_bias:
       self.bias_i = self.bias[:self.filters]
-      self.bias_f = self.bias[self.filters:self.filters * 2]
-      self.bias_c = self.bias[self.filters * 2:self.filters * 3]
+      self.bias_f = self.bias[self.filters: self.filters * 2]
+      self.bias_c = self.bias[self.filters * 2: self.filters * 3]
       self.bias_o = self.bias[self.filters * 3:]
     else:
       self.bias_i = None
@@ -436,166 +645,419 @@ class ConvLSTM2D(ConvRecurrent2D):
       self.bias_o = None
     self.built = True
 
-  def get_initial_state(self, inputs):
-    # (samples, timesteps, rows, cols, filters)
-    initial_state = K.zeros_like(inputs)
-    # (samples, rows, cols, filters)
-    initial_state = K.sum(initial_state, axis=1)
-    shape = list(self.kernel_shape)
-    shape[-1] = self.filters
-    initial_state = self.input_conv(
-        initial_state, K.zeros(tuple(shape)), padding=self.padding)
-
-    initial_states = [initial_state for _ in range(2)]
-    return initial_states
+  def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          K.ones_like(inputs),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          K.ones_like(states[1]),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
 
-  def reset_states(self):
-    if not self.stateful:
-      raise RuntimeError('Layer must be stateful.')
-    input_shape = self.input_spec[0].shape
+    # dropout matrices for input units
+    dp_mask = self._dropout_mask
+    # dropout matrices for recurrent units
+    rec_dp_mask = self._recurrent_dropout_mask
 
-    if not input_shape[0]:
-      raise ValueError('If a RNN is stateful, a complete '
-                       'input_shape must be provided '
-                       '(including batch size). '
-                       'Got input shape: ' + str(input_shape))
+    h_tm1 = states[0]  # previous memory state
+    c_tm1 = states[1]  # previous carry state
 
-    if self.return_state:
-      output_shape = tuple(self.compute_output_shape(input_shape)[0].as_list())
-    else:
-      output_shape = tuple(self.compute_output_shape(input_shape).as_list())
-    if self.return_sequences:
-      output_shape = (input_shape[0],) + output_shape[2:]
+    if 0 < self.dropout < 1.:
+      inputs_i = inputs * dp_mask[0]
+      inputs_f = inputs * dp_mask[1]
+      inputs_c = inputs * dp_mask[2]
+      inputs_o = inputs * dp_mask[3]
     else:
-      output_shape = (input_shape[0],) + output_shape[1:]
+      inputs_i = inputs
+      inputs_f = inputs
+      inputs_c = inputs
+      inputs_o = inputs
 
-    if hasattr(self, 'states'):
-      K.set_value(self.states[0],
-                  np.zeros(output_shape))
-      K.set_value(self.states[1],
-                  np.zeros(output_shape))
-    else:
-      self.states = [
-          K.zeros(output_shape),
-          K.zeros(output_shape)
-      ]
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
-      ones += 1
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(dp_mask)
+    if 0 < self.recurrent_dropout < 1.:
+      h_tm1_i = h_tm1 * rec_dp_mask[0]
+      h_tm1_f = h_tm1 * rec_dp_mask[1]
+      h_tm1_c = h_tm1 * rec_dp_mask[2]
+      h_tm1_o = h_tm1 * rec_dp_mask[3]
     else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-
-    if 0 < self.recurrent_dropout < 1:
-      shape = list(self.kernel_shape)
-      shape[-1] = self.filters
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
-      ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
-      ones += 1.
-
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
-
-      rec_dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(rec_dp_mask)
-    else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-    return constants
+      h_tm1_i = h_tm1
+      h_tm1_f = h_tm1
+      h_tm1_c = h_tm1
+      h_tm1_o = h_tm1
+
+    x_i = self.input_conv(inputs_i, self.kernel_i, self.bias_i,
+                          padding=self.padding)
+    x_f = self.input_conv(inputs_f, self.kernel_f, self.bias_f,
+                          padding=self.padding)
+    x_c = self.input_conv(inputs_c, self.kernel_c, self.bias_c,
+                          padding=self.padding)
+    x_o = self.input_conv(inputs_o, self.kernel_o, self.bias_o,
+                          padding=self.padding)
+    h_i = self.recurrent_conv(h_tm1_i,
+                              self.recurrent_kernel_i)
+    h_f = self.recurrent_conv(h_tm1_f,
+                              self.recurrent_kernel_f)
+    h_c = self.recurrent_conv(h_tm1_c,
+                              self.recurrent_kernel_c)
+    h_o = self.recurrent_conv(h_tm1_o,
+                              self.recurrent_kernel_o)
+
+    i = self.recurrent_activation(x_i + h_i)
+    f = self.recurrent_activation(x_f + h_f)
+    c = f * c_tm1 + i * self.activation(x_c + h_c)
+    o = self.recurrent_activation(x_o + h_o)
+    h = o * self.activation(c)
+
+    if 0 < self.dropout + self.recurrent_dropout:
+      if training is None:
+        h._uses_learning_phase = True
+
+    return h, [h, c]
 
   def input_conv(self, x, w, b=None, padding='valid'):
-    conv_out = K.conv2d(
-        x,
-        w,
-        strides=self.strides,
-        padding=padding,
-        data_format=self.data_format,
-        dilation_rate=self.dilation_rate)
+    conv_out = K.conv2d(x, w, strides=self.strides,
+                        padding=padding,
+                        data_format=self.data_format,
+                        dilation_rate=self.dilation_rate)
     if b is not None:
-      conv_out = K.bias_add(conv_out, b, data_format=self.data_format)
+      conv_out = K.bias_add(conv_out, b,
+                            data_format=self.data_format)
     return conv_out
 
   def recurrent_conv(self, x, w):
-    conv_out = K.conv2d(
-        x, w, strides=(1, 1), padding='same', data_format=self.data_format)
+    conv_out = K.conv2d(x, w, strides=(1, 1),
+                        padding='same',
+                        data_format=self.data_format)
     return conv_out
 
-  def step(self, inputs, states):
-    assert len(states) == 4
-    h_tm1 = states[0]
-    c_tm1 = states[1]
-    dp_mask = states[2]
-    rec_dp_mask = states[3]
-
-    x_i = self.input_conv(
-        inputs * dp_mask[0], self.kernel_i, self.bias_i, padding=self.padding)
-    x_f = self.input_conv(
-        inputs * dp_mask[1], self.kernel_f, self.bias_f, padding=self.padding)
-    x_c = self.input_conv(
-        inputs * dp_mask[2], self.kernel_c, self.bias_c, padding=self.padding)
-    x_o = self.input_conv(
-        inputs * dp_mask[3], self.kernel_o, self.bias_o, padding=self.padding)
-    h_i = self.recurrent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
-    h_f = self.recurrent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
-    h_c = self.recurrent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
-    h_o = self.recurrent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
+  def get_config(self):
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
+    base_config = super(ConvLSTM2DCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
-    i = self.recurrent_activation(x_i + h_i)
-    f = self.recurrent_activation(x_f + h_f)
-    c = f * c_tm1 + i * self.activation(x_c + h_c)
-    o = self.recurrent_activation(x_o + h_o)
-    h = o * self.activation(c)
-    return h, [h, c]
+
+@tf_export('keras.layers.ConvLSTM2D')
+class ConvLSTM2D(ConvRNN2D):
+  """Convolutional LSTM.
+
+  It is similar to an LSTM layer, but the input transformations
+  and recurrent transformations are both convolutional.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space
+        (i.e. the number output of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+        dimensions of the convolution window.
+    strides: An integer or tuple/list of n integers,
+        specifying the strides of the convolution.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, time, ..., channels)`
+        while `channels_first` corresponds to
+        inputs with shape `(batch, time, channels, ...)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be "channels_last".
+    dilation_rate: An integer or tuple/list of n integers, specifying
+        the dilation rate to use for dilated convolution.
+        Currently, specifying any `dilation_rate` value != 1 is
+        incompatible with specifying any `strides` value != 1.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+        for the recurrent step.
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+        used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+        weights matrix,
+        used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+        If True, add 1 to the bias of the forget gate at initialization.
+        Use in combination with `bias_initializer="zeros"`.
+        This is recommended in [Jozefowicz et al.]
+        (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to
+        the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to.
+    kernel_constraint: Constraint function applied to
+        the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    return_sequences: Boolean. Whether to return the last output
+        in the output sequence, or the full sequence.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the recurrent state.
+
+  Input shape:
+    - if data_format='channels_first'
+        5D tensor with shape:
+        `(samples,time, channels, rows, cols)`
+    - if data_format='channels_last'
+        5D tensor with shape:
+        `(samples,time, rows, cols, channels)`
+
+  Output shape:
+    - if `return_sequences`
+         - if data_format='channels_first'
+            5D tensor with shape:
+            `(samples, time, filters, output_row, output_col)`
+         - if data_format='channels_last'
+            5D tensor with shape:
+            `(samples, time, output_row, output_col, filters)`
+    - else
+        - if data_format ='channels_first'
+            4D tensor with shape:
+            `(samples, filters, output_row, output_col)`
+        - if data_format='channels_last'
+            4D tensor with shape:
+            `(samples, output_row, output_col, filters)`
+        where o_row and o_col depend on the shape of the filter and
+        the padding
+
+  Raises:
+    ValueError: in case of invalid constructor arguments.
+
+  References:
+    - [Convolutional LSTM Network: A Machine Learning Approach for
+    Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
+    The current implementation does not include the feedback loop on the
+    cells output.
+
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    cell = ConvLSTM2DCell(filters=filters,
+                          kernel_size=kernel_size,
+                          strides=strides,
+                          padding=padding,
+                          data_format=data_format,
+                          dilation_rate=dilation_rate,
+                          activation=activation,
+                          recurrent_activation=recurrent_activation,
+                          use_bias=use_bias,
+                          kernel_initializer=kernel_initializer,
+                          recurrent_initializer=recurrent_initializer,
+                          bias_initializer=bias_initializer,
+                          unit_forget_bias=unit_forget_bias,
+                          kernel_regularizer=kernel_regularizer,
+                          recurrent_regularizer=recurrent_regularizer,
+                          bias_regularizer=bias_regularizer,
+                          kernel_constraint=kernel_constraint,
+                          recurrent_constraint=recurrent_constraint,
+                          bias_constraint=bias_constraint,
+                          dropout=dropout,
+                          recurrent_dropout=recurrent_dropout)
+    super(ConvLSTM2D, self).__init__(cell,
+                                     return_sequences=return_sequences,
+                                     go_backwards=go_backwards,
+                                     stateful=stateful,
+                                     **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    return super(ConvLSTM2D, self).call(inputs,
+                                        mask=mask,
+                                        training=training,
+                                        initial_state=initial_state)
+
+  @property
+  def filters(self):
+    return self.cell.filters
+
+  @property
+  def kernel_size(self):
+    return self.cell.kernel_size
+
+  @property
+  def strides(self):
+    return self.cell.strides
+
+  @property
+  def padding(self):
+    return self.cell.padding
+
+  @property
+  def data_format(self):
+    return self.cell.data_format
+
+  @property
+  def dilation_rate(self):
+    return self.cell.dilation_rate
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def recurrent_activation(self):
+    return self.cell.recurrent_activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def unit_forget_bias(self):
+    return self.cell.unit_forget_bias
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
 
   def get_config(self):
-    config = {
-        'activation':
-            activations.serialize(self.activation),
-        'recurrent_activation':
-            activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'recurrent_initializer':
-            initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'unit_forget_bias':
-            self.unit_forget_bias,
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'recurrent_regularizer':
-            regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'recurrent_constraint':
-            constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
-    }
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'activity_regularizer': regularizers.serialize(
+                  self.activity_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
     base_config = super(ConvLSTM2D, self).get_config()
+    del base_config['cell']
     return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
index 60137bdd724676af2c89bb7531cf4ea4e529b2a1..9e768b4e9552d126eac9c586d19810634fac9013 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
@@ -64,6 +64,7 @@ class ConvLSTMTest(test.TestCase):
           self.assertEqual(len(states), 2)
           model = keras.models.Model(x, states[0])
           state = model.predict(inputs)
+
           self.assertAllClose(
               keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index 4a6228121b4f8839daa98e35748b2c5867ccca96..12b42676759d499c910707cb1b78e788e3c443fd 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -22,6 +22,8 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -43,6 +45,7 @@ class Convolution1DTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, length, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_conv1d(self):
     kwargs = {
         'filters': 2,
@@ -114,6 +117,7 @@ class Conv2DTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_conv2d(self):
     kwargs = {
         'filters': 2,
@@ -188,6 +192,7 @@ class Conv2DTransposeTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_conv2dtranspose(self):
     kwargs = {
         'filters': 2,
@@ -253,6 +258,7 @@ class Conv3DTransposeTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, depth, num_row, num_col, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_conv3dtranspose(self):
     kwargs = {
         'filters': 2,
@@ -316,6 +322,7 @@ class SeparableConv1DTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, length, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_separable_conv1d(self):
     kwargs = {
         'filters': 2,
@@ -391,6 +398,7 @@ class SeparableConv2DTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_separable_conv2d(self):
     kwargs = {
         'filters': 2,
@@ -469,6 +477,7 @@ class Conv3DTest(test.TestCase):
             kwargs=test_kwargs,
             input_shape=(num_samples, depth, num_row, num_col, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_conv3d(self):
     kwargs = {
         'filters': 2,
@@ -520,6 +529,7 @@ class Conv3DTest(test.TestCase):
 
 class ZeroPaddingTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_zero_padding_1d(self):
     num_samples = 2
     input_dim = 2
@@ -543,7 +553,10 @@ class ZeroPaddingTest(test.TestCase):
       layer = keras.layers.ZeroPadding1D(padding=2)
       layer.build(shape)
       output = layer(keras.backend.variable(inputs))
-      np_output = keras.backend.eval(output)
+      if context.executing_eagerly():
+        np_output = output.numpy()
+      else:
+        np_output = keras.backend.eval(output)
       for offset in [0, 1, -1, -2]:
         np.testing.assert_allclose(np_output[:, offset, :], 0.)
       np.testing.assert_allclose(np_output[:, 2:-2, :], 1.)
@@ -551,7 +564,10 @@ class ZeroPaddingTest(test.TestCase):
       layer = keras.layers.ZeroPadding1D(padding=(1, 2))
       layer.build(shape)
       output = layer(keras.backend.variable(inputs))
-      np_output = keras.backend.eval(output)
+      if context.executing_eagerly():
+        np_output = output.numpy()
+      else:
+        np_output = keras.backend.eval(output)
       for left_offset in [0]:
         np.testing.assert_allclose(np_output[:, left_offset, :], 0.)
       for right_offset in [-1, -2]:
@@ -565,6 +581,7 @@ class ZeroPaddingTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.ZeroPadding1D(padding=None)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_zero_padding_2d(self):
     num_samples = 2
     stack_size = 2
@@ -593,7 +610,10 @@ class ZeroPaddingTest(test.TestCase):
             padding=(2, 2), data_format=data_format)
         layer.build(inputs.shape)
         output = layer(keras.backend.variable(inputs))
-        np_output = keras.backend.eval(output)
+        if context.executing_eagerly():
+          np_output = output.numpy()
+        else:
+          np_output = keras.backend.eval(output)
         if data_format == 'channels_last':
           for offset in [0, 1, -1, -2]:
             np.testing.assert_allclose(np_output[:, offset, :, :], 0.)
@@ -609,7 +629,10 @@ class ZeroPaddingTest(test.TestCase):
             padding=((1, 2), (3, 4)), data_format=data_format)
         layer.build(inputs.shape)
         output = layer(keras.backend.variable(inputs))
-        np_output = keras.backend.eval(output)
+        if context.executing_eagerly():
+          np_output = output.numpy()
+        else:
+          np_output = keras.backend.eval(output)
         if data_format == 'channels_last':
           for top_offset in [0]:
             np.testing.assert_allclose(np_output[:, top_offset, :, :], 0.)
@@ -637,6 +660,7 @@ class ZeroPaddingTest(test.TestCase):
       with self.assertRaises(ValueError):
         keras.layers.ZeroPadding2D(padding=None)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_zero_padding_3d(self):
     num_samples = 2
     stack_size = 2
@@ -659,7 +683,10 @@ class ZeroPaddingTest(test.TestCase):
       layer = keras.layers.ZeroPadding3D(padding=(2, 2, 2))
       layer.build(inputs.shape)
       output = layer(keras.backend.variable(inputs))
-      np_output = keras.backend.eval(output)
+      if context.executing_eagerly():
+        np_output = output.numpy()
+      else:
+        np_output = keras.backend.eval(output)
       for offset in [0, 1, -1, -2]:
         np.testing.assert_allclose(np_output[:, offset, :, :, :], 0.)
         np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.)
@@ -675,11 +702,13 @@ class ZeroPaddingTest(test.TestCase):
 
 class UpSamplingTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_upsampling_1d(self):
     with self.test_session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_upsampling_2d(self):
     num_samples = 2
     stack_size = 2
@@ -708,7 +737,10 @@ class UpSamplingTest(test.TestCase):
                 size=(length_row, length_col), data_format=data_format)
             layer.build(inputs.shape)
             output = layer(keras.backend.variable(inputs))
-            np_output = keras.backend.eval(output)
+            if context.executing_eagerly():
+              np_output = output.numpy()
+            else:
+              np_output = keras.backend.eval(output)
             if data_format == 'channels_first':
               assert np_output.shape[2] == length_row * input_num_row
               assert np_output.shape[3] == length_col * input_num_col
@@ -726,6 +758,7 @@ class UpSamplingTest(test.TestCase):
 
             np.testing.assert_allclose(np_output, expected_out)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_upsampling_3d(self):
     num_samples = 2
     stack_size = 2
@@ -757,7 +790,10 @@ class UpSamplingTest(test.TestCase):
                   data_format=data_format)
               layer.build(inputs.shape)
               output = layer(keras.backend.variable(inputs))
-              np_output = keras.backend.eval(output)
+              if context.executing_eagerly():
+                np_output = output.numpy()
+              else:
+                np_output = keras.backend.eval(output)
               if data_format == 'channels_first':
                 assert np_output.shape[2] == length_dim1 * input_len_dim1
                 assert np_output.shape[3] == length_dim2 * input_len_dim2
@@ -782,6 +818,7 @@ class UpSamplingTest(test.TestCase):
 
 class CroppingTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_cropping_1d(self):
     num_samples = 2
     time_length = 4
@@ -800,6 +837,7 @@ class CroppingTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.Cropping1D(cropping=None)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_cropping_2d(self):
     num_samples = 2
     stack_size = 2
@@ -827,7 +865,10 @@ class CroppingTest(test.TestCase):
             cropping=cropping, data_format=data_format)
         layer.build(inputs.shape)
         output = layer(keras.backend.variable(inputs))
-        np_output = keras.backend.eval(output)
+        if context.executing_eagerly():
+          np_output = output.numpy()
+        else:
+          np_output = keras.backend.eval(output)
         # compare with numpy
         if data_format == 'channels_first':
           expected_out = inputs[:, :, cropping[0][0]:-cropping[0][1], cropping[
@@ -851,7 +892,10 @@ class CroppingTest(test.TestCase):
             cropping=cropping, data_format=data_format)
         layer.build(inputs.shape)
         output = layer(keras.backend.variable(inputs))
-        np_output = keras.backend.eval(output)
+        if context.executing_eagerly():
+          np_output = output.numpy()
+        else:
+          np_output = keras.backend.eval(output)
         # compare with input
         np.testing.assert_allclose(np_output, inputs)
 
@@ -861,6 +905,7 @@ class CroppingTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.Cropping2D(cropping=None)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_cropping_3d(self):
     num_samples = 2
     stack_size = 2
@@ -892,7 +937,10 @@ class CroppingTest(test.TestCase):
                 cropping=cropping, data_format=data_format)
             layer.build(inputs.shape)
             output = layer(keras.backend.variable(inputs))
-            np_output = keras.backend.eval(output)
+            if context.executing_eagerly():
+              np_output = output.numpy()
+            else:
+              np_output = keras.backend.eval(output)
             # compare with numpy
             if data_format == 'channels_first':
               expected_out = inputs[:, :,
@@ -906,12 +954,50 @@ class CroppingTest(test.TestCase):
                                     cropping[2][0]:-cropping[2][1], :]
             np.testing.assert_allclose(np_output, expected_out)
 
-     # test incorrect use
+    # test incorrect use
     with self.assertRaises(ValueError):
       keras.layers.Cropping3D(cropping=(1, 1))
     with self.assertRaises(ValueError):
       keras.layers.Cropping3D(cropping=None)
 
 
+class DepthwiseConv2DTest(test.TestCase):
+
+  def _run_test(self, kwargs, arg, values):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    test_kwargs = copy.copy(kwargs)
+    for value in values:
+      test_kwargs[arg] = value
+      with self.test_session(use_gpu=True):
+        testing_utils.layer_test(
+            keras.layers.DepthwiseConv2D,
+            kwargs=test_kwargs,
+            input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_depthwise_conv2d(self):
+    kwargs = {'kernel_size': (3, 3)}
+
+    self._run_test(kwargs, 'padding', ['valid', 'same'])
+    self._run_test(kwargs, 'strides', [(2, 2)])
+    if test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs, 'data_format', ['channels_first'])
+    self._run_test(kwargs, 'depth_multiplier', [1, 2])
+
+    kwargs = {'kernel_size': 3,
+              'padding': 'valid',
+              'data_format': 'channels_first',
+              'activation': None,
+              'depthwise_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'activity_regularizer': 'l2',
+              'depthwise_constraint': 'unit_norm',
+              'strides': (2, 2),
+             }
+    self._run_test(kwargs, 'depth_multiplier', [1])
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 50a197c80c3d97f47a071a24297301dddf78a27e..9c4cb0f4fda681ce3236222460cd87439ea67810 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -24,6 +24,7 @@ import types as python_types
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -32,11 +33,14 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -75,12 +79,12 @@ class Masking(Layer):
     self.mask_value = mask_value
 
   def compute_mask(self, inputs, mask=None):
-    return K.any(K.not_equal(inputs, self.mask_value), axis=-1)
+    return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
 
   def call(self, inputs):
     boolean_mask = K.any(
-        K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
-    return inputs * K.cast(boolean_mask, inputs.dtype)
+        math_ops.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
+    return inputs * math_ops.cast(boolean_mask, inputs.dtype)
 
   def compute_output_shape(self, input_shape):
     return input_shape
@@ -92,7 +96,7 @@ class Masking(Layer):
 
 
 @tf_export('keras.layers.Dropout')
-class Dropout(tf_core_layers.Dropout, Layer):
+class Dropout(Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting
@@ -111,23 +115,40 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    # Inheritance call order:
-    # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(rate=rate,
-                                  noise_shape=noise_shape,
-                                  seed=seed,
-                                  **kwargs)
+    super(Dropout, self).__init__(**kwargs)
+    self.rate = rate
+    self.noise_shape = noise_shape
+    self.seed = seed
     self.supports_masking = True
 
+  def _get_noise_shape(self, inputs):
+    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
+    # which will override `self.noise_shape`, and allows for custom noise
+    # shapes with dynamically sized inputs.
+    if self.noise_shape is None:
+      return self.noise_shape
+    return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
+
   def call(self, inputs, training=None):
+    original_training_value = training
     if training is None:
       training = K.learning_phase()
-    output = super(Dropout, self).call(inputs, training=training)
+
+    def dropped_inputs():
+      return nn.dropout(inputs, 1  - self.rate,
+                        noise_shape=self._get_noise_shape(inputs),
+                        seed=self.seed)
+    output = tf_utils.smart_cond(training,
+                                 dropped_inputs,
+                                 lambda: array_ops.identity(inputs))
     # EagerTensor object has no attribute _uses_learning_phase
-    if not context.in_eager_mode() and training is K.learning_phase():
+    if not context.executing_eagerly() and original_training_value is None:
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {
         'rate': self.rate,
@@ -170,7 +191,7 @@ class SpatialDropout1D(Dropout):
     self.input_spec = InputSpec(ndim=3)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     noise_shape = (input_shape[0], 1, input_shape[2])
     return noise_shape
 
@@ -222,7 +243,7 @@ class SpatialDropout2D(Dropout):
     self.input_spec = InputSpec(ndim=4)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1)
     elif self.data_format == 'channels_last':
@@ -275,7 +296,7 @@ class SpatialDropout3D(Dropout):
     self.input_spec = InputSpec(ndim=5)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1, 1)
     elif self.data_format == 'channels_last':
@@ -414,7 +435,8 @@ class Reshape(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.reshape(inputs, (K.shape(inputs)[0],) + self.target_shape)
+    return array_ops.reshape(inputs,
+                             (array_ops.shape(inputs)[0],) + self.target_shape)
 
   def get_config(self):
     config = {'target_shape': self.target_shape}
@@ -467,7 +489,7 @@ class Permute(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.permute_dimensions(inputs, (0,) + self.dims)
+    return array_ops.transpose(inputs, perm=(0,) + self.dims)
 
   def get_config(self):
     config = {'dims': self.dims}
@@ -476,7 +498,7 @@ class Permute(Layer):
 
 
 @tf_export('keras.layers.Flatten')
-class Flatten(tf_core_layers.Flatten, Layer):
+class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
   Example:
@@ -492,7 +514,25 @@ class Flatten(tf_core_layers.Flatten, Layer):
       # now: model.output_shape == (None, 65536)
   ```
   """
-  pass
+
+  def __init__(self, **kwargs):
+    super(Flatten, self).__init__(**kwargs)
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def call(self, inputs):
+    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    if not context.executing_eagerly():
+      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = [input_shape[0]]
+    if all(input_shape[1:]):
+      output_shape += [np.prod(input_shape[1:])]
+    else:
+      output_shape += [None]
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.RepeatVector')
@@ -608,10 +648,12 @@ class Lambda(Layer):
                         'must be a list, a tuple, or a function.')
       self._output_shape = output_shape
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     if self._output_shape is None:
+      if context.executing_eagerly():
+        raise NotImplementedError
       x = K.placeholder(shape=input_shape)
       x = self.call(x)
       if isinstance(x, list):
@@ -637,7 +679,7 @@ class Lambda(Layer):
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
-    if has_arg(self.function, 'mask'):
+    if generic_utils.has_arg(self.function, 'mask'):
       arguments['mask'] = mask
     return self.function(inputs, **arguments)
 
@@ -648,14 +690,14 @@ class Lambda(Layer):
 
   def get_config(self):
     if isinstance(self.function, python_types.LambdaType):
-      function = func_dump(self.function)
+      function = generic_utils.func_dump(self.function)
       function_type = 'lambda'
     else:
       function = self.function.__name__
       function_type = 'function'
 
     if isinstance(self._output_shape, python_types.LambdaType):
-      output_shape = func_dump(self._output_shape)
+      output_shape = generic_utils.func_dump(self._output_shape)
       output_shape_type = 'lambda'
     elif callable(self._output_shape):
       output_shape = self._output_shape.__name__
@@ -683,26 +725,27 @@ class Lambda(Layer):
     function_type = config.pop('function_type')
     if function_type == 'function':
       # Simple lookup in custom objects
-      function = deserialize_keras_object(
+      function = generic_utils.deserialize_keras_object(
           config['function'],
           custom_objects=custom_objects,
           printable_module_name='function in Lambda layer')
     elif function_type == 'lambda':
       # Unsafe deserialization from bytecode
-      function = func_load(config['function'], globs=globs)
+      function = generic_utils.func_load(config['function'], globs=globs)
     else:
       raise TypeError('Unknown function type:', function_type)
 
     output_shape_type = config.pop('output_shape_type')
     if output_shape_type == 'function':
       # Simple lookup in custom objects
-      output_shape = deserialize_keras_object(
+      output_shape = generic_utils.deserialize_keras_object(
           config['output_shape'],
           custom_objects=custom_objects,
           printable_module_name='output_shape function in Lambda layer')
     elif output_shape_type == 'lambda':
       # Unsafe deserialization from bytecode
-      output_shape = func_load(config['output_shape'], globs=globs)
+      output_shape = generic_utils.func_load(config['output_shape'],
+                                             globs=globs)
     else:
       output_shape = config['output_shape']
 
@@ -722,7 +765,7 @@ class Lambda(Layer):
 
 
 @tf_export('keras.layers.Dense')
-class Dense(tf_core_layers.Dense, Layer):
+class Dense(Layer):
   """Just your regular densely-connected NN layer.
 
   `Dense` implements the operation:
@@ -792,21 +835,74 @@ class Dense(tf_core_layers.Dense, Layer):
     if 'input_shape' not in kwargs and 'input_dim' in kwargs:
       kwargs['input_shape'] = (kwargs.pop('input_dim'),)
 
-    # Inheritance call order:
-    # 1) tf.layers.Dense, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dense, self).__init__(
-        units,
-        activation=activations.get(activation),
-        use_bias=use_bias,
-        kernel_initializer=initializers.get(kernel_initializer),
-        bias_initializer=initializers.get(bias_initializer),
-        kernel_regularizer=regularizers.get(kernel_regularizer),
-        bias_regularizer=regularizers.get(bias_regularizer),
-        activity_regularizer=regularizers.get(activity_regularizer),
-        kernel_constraint=constraints.get(kernel_constraint),
-        bias_constraint=constraints.get(bias_constraint),
-        **kwargs)
+        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    self.units = int(units)
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
     self.supports_masking = True
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if input_shape[-1].value is None:
+      raise ValueError('The last dimension of the inputs to `Dense` '
+                       'should be defined. Found `None`.')
+    self.input_spec = InputSpec(min_ndim=2,
+                                axes={-1: input_shape[-1].value})
+    self.kernel = self.add_variable('kernel',
+                                    shape=[input_shape[-1].value, self.units],
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    if self.use_bias:
+      self.bias = self.add_variable('bias',
+                                    shape=[self.units,],
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    shape = inputs.get_shape().as_list()
+    if len(shape) > 2:
+      # Broadcasting is required for the inputs.
+      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
+                                                             [0]])
+      # Reshape the output back to the original ndim of the input.
+      if not context.executing_eagerly():
+        output_shape = shape[:-1] + [self.units]
+        outputs.set_shape(output_shape)
+    else:
+      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    if self.use_bias:
+      outputs = nn.bias_add(outputs, self.bias)
+    if self.activation is not None:
+      return self.activation(outputs)  # pylint: disable=not-callable
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input_shape must be defined, but saw: %s'
+          % input_shape)
+    return input_shape[:-1].concatenate(self.units)
 
   def get_config(self):
     config = {
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index bdb99c91c289cf808fec7b891376dbfcf5504aca..d22d8d12dc4e76998c177dbe96fb87e3fffa5175 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -20,11 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
-from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -52,146 +51,133 @@ class CoreLayersTest(test.TestCase):
       dropout = keras.layers.Dropout(0.5)
       self.assertEqual(True, dropout.supports_masking)
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SpatialDropout1D,
-          kwargs={'rate': 0.5},
-          input_shape=(2, 3, 4))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SpatialDropout2D,
-          kwargs={'rate': 0.5},
-          input_shape=(2, 3, 4, 5))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SpatialDropout2D,
-          kwargs={'rate': 0.5, 'data_format': 'channels_first'},
-          input_shape=(2, 3, 4, 5))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SpatialDropout3D,
-          kwargs={'rate': 0.5},
-          input_shape=(2, 3, 4, 4, 5))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SpatialDropout3D,
-          kwargs={'rate': 0.5, 'data_format': 'channels_first'},
-          input_shape=(2, 3, 4, 4, 5))
-
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_spatial_dropout(self):
+    testing_utils.layer_test(
+        keras.layers.SpatialDropout1D,
+        kwargs={'rate': 0.5},
+        input_shape=(2, 3, 4))
+
+    testing_utils.layer_test(
+        keras.layers.SpatialDropout2D,
+        kwargs={'rate': 0.5},
+        input_shape=(2, 3, 4, 5))
+
+    testing_utils.layer_test(
+        keras.layers.SpatialDropout2D,
+        kwargs={'rate': 0.5, 'data_format': 'channels_first'},
+        input_shape=(2, 3, 4, 5))
+
+    testing_utils.layer_test(
+        keras.layers.SpatialDropout3D,
+        kwargs={'rate': 0.5},
+        input_shape=(2, 3, 4, 4, 5))
+
+    testing_utils.layer_test(
+        keras.layers.SpatialDropout3D,
+        kwargs={'rate': 0.5, 'data_format': 'channels_first'},
+        input_shape=(2, 3, 4, 4, 5))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_activation(self):
     # with string argument
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Activation,
-          kwargs={'activation': 'relu'},
-          input_shape=(3, 2))
+    testing_utils.layer_test(
+        keras.layers.Activation,
+        kwargs={'activation': 'relu'},
+        input_shape=(3, 2))
 
     # with function argument
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Activation,
-          kwargs={'activation': keras.backend.relu},
-          input_shape=(3, 2))
+    testing_utils.layer_test(
+        keras.layers.Activation,
+        kwargs={'activation': keras.backend.relu},
+        input_shape=(3, 2))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_reshape(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Reshape,
-          kwargs={'target_shape': (8, 1)},
-          input_shape=(3, 2, 4))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Reshape,
-          kwargs={'target_shape': (-1, 1)},
-          input_shape=(3, 2, 4))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Reshape,
-          kwargs={'target_shape': (1, -1)},
-          input_shape=(3, 2, 4))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Reshape,
-          kwargs={'target_shape': (-1, 1)},
-          input_shape=(None, None, 2))
-
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (8, 1)},
+        input_shape=(3, 2, 4))
+
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (-1, 1)},
+        input_shape=(3, 2, 4))
+
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (1, -1)},
+        input_shape=(3, 2, 4))
+
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (-1, 1)},
+        input_shape=(None, None, 2))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_permute(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4))
+    testing_utils.layer_test(
+        keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_flatten(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
+    testing_utils.layer_test(
+        keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_repeat_vector(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
+    testing_utils.layer_test(
+        keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
 
   def test_lambda(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Lambda,
-          kwargs={'function': lambda x: x + 1},
-          input_shape=(3, 2))
-
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Lambda,
-          kwargs={
-              'function': lambda x, a, b: x * a + b,
-              'arguments': {
-                  'a': 0.6,
-                  'b': 0.4
-              }
-          },
-          input_shape=(3, 2))
-
-    with self.test_session():
-      # test serialization with function
-      def f(x):
-        return x + 1
-
-      ld = keras.layers.Lambda(f)
-      config = ld.get_config()
-      ld = keras.layers.deserialize({
-          'class_name': 'Lambda',
-          'config': config
-      })
-
-      # test with lambda
-      ld = keras.layers.Lambda(
-          lambda x: keras.backend.concatenate([keras.backend.square(x), x]))
-      config = ld.get_config()
-      ld = keras.layers.Lambda.from_config(config)
-
+    testing_utils.layer_test(
+        keras.layers.Lambda,
+        kwargs={'function': lambda x: x + 1},
+        input_shape=(3, 2))
+
+    testing_utils.layer_test(
+        keras.layers.Lambda,
+        kwargs={
+            'function': lambda x, a, b: x * a + b,
+            'arguments': {
+                'a': 0.6,
+                'b': 0.4
+            }
+        },
+        input_shape=(3, 2))
+
+    # test serialization with function
+    def f(x):
+      return x + 1
+
+    ld = keras.layers.Lambda(f)
+    config = ld.get_config()
+    ld = keras.layers.deserialize({
+        'class_name': 'Lambda',
+        'config': config
+    })
+
+    # test with lambda
+    ld = keras.layers.Lambda(
+        lambda x: keras.backend.concatenate([math_ops.square(x), x]))
+    config = ld.get_config()
+    ld = keras.layers.Lambda.from_config(config)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dense(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 2))
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Dense, kwargs={'units': 3}, input_shape=(None, None, 2))
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(None, None, 2))
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
 
-    # Test regularization
+  def test_dense_regularization(self):
     with self.test_session():
       layer = keras.layers.Dense(
           3,
@@ -202,7 +188,7 @@ class CoreLayersTest(test.TestCase):
       layer(keras.backend.variable(np.ones((2, 4))))
       self.assertEqual(3, len(layer.losses))
 
-    # Test constraints
+  def test_dense_constraints(self):
     with self.test_session():
       k_constraint = keras.constraints.max_norm(0.01)
       b_constraint = keras.constraints.max_norm(0.01)
@@ -212,12 +198,6 @@ class CoreLayersTest(test.TestCase):
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
-  def test_eager_dense(self):
-    with context.eager_mode():
-      l = keras.layers.Dense(units=3,
-                             kernel_initializer=init_ops.zeros_initializer())
-      self.assertAllEqual(l(constant_op.constant([[1.0]])), [[0., 0., 0.]])
-
   def test_activity_regularization(self):
     with self.test_session():
       layer = keras.layers.ActivityRegularization(l1=0.1)
@@ -255,4 +235,3 @@ class CoreLayersTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffb90457a85bb801d766e144f45c044e0a7e3bb0
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent.py
@@ -0,0 +1,522 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent layers backed by cuDNN.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import constraints
+from tensorflow.python.keras._impl.keras import initializers
+from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class _CuDNNRNN(RNN):
+  """Private base class for CuDNNGRU and CuDNNLSTM layers.
+
+  Arguments:
+    return_sequences: Boolean. Whether to return the last output
+        in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+        in addition to the output.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards and return the
+        reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+  """
+
+  def __init__(self,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               **kwargs):
+    # We invoke the base layer's initializer directly here because we do not
+    # want to create RNN cell instance.
+    super(RNN, self).__init__(**kwargs)  # pylint: disable=bad-super-call
+    self.return_sequences = return_sequences
+    self.return_state = return_state
+    self.go_backwards = go_backwards
+    self.stateful = stateful
+    self.supports_masking = False
+    self.input_spec = [InputSpec(ndim=3)]
+    if hasattr(self.cell.state_size, '__len__'):
+      state_size = self.cell.state_size
+    else:
+      state_size = [self.cell.state_size]
+    self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
+    self.constants_spec = None
+    self._states = None
+    self._num_constants = None
+
+  def _canonical_to_params(self, weights, biases):
+    weights = [array_ops.reshape(x, (-1,)) for x in weights]
+    biases = [array_ops.reshape(x, (-1,)) for x in biases]
+    return array_ops.concat(weights + biases, axis=0)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    if isinstance(mask, list):
+      mask = mask[0]
+    if mask is not None:
+      raise ValueError('Masking is not supported for CuDNN RNNs.')
+
+    # input shape: `(samples, time (padded with zeros), input_dim)`
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      initial_state = inputs[1:]
+      inputs = inputs[0]
+    elif initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
+                       ' initial states.')
+
+    if self.go_backwards:
+      # Reverse time axis.
+      inputs = K.reverse(inputs, 1)
+    output, states = self._process_batch(inputs, initial_state)
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_state:
+      return [output] + states
+    else:
+      return output
+
+  def get_config(self):
+    config = {
+        'return_sequences': self.return_sequences,
+        'return_state': self.return_state,
+        'go_backwards': self.go_backwards,
+        'stateful': self.stateful
+    }
+    base_config = super(  # pylint: disable=bad-super-call
+        RNN, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
+
+  @property
+  def trainable_weights(self):
+    if self.trainable and self.built:
+      return [self.kernel, self.recurrent_kernel, self.bias]
+    return []
+
+  @property
+  def non_trainable_weights(self):
+    if not self.trainable and self.built:
+      return [self.kernel, self.recurrent_kernel, self.bias]
+    return []
+
+  @property
+  def losses(self):
+    return super(RNN, self).losses
+
+  def get_losses_for(self, inputs=None):
+    return super(  # pylint: disable=bad-super-call
+        RNN, self).get_losses_for(inputs=inputs)
+
+
+@tf_export('keras.layers.CuDNNGRU')
+class CuDNNGRU(_CuDNNRNN):
+  """Fast GRU implementation backed by cuDNN.
+
+  More information about cuDNN can be found on the [NVIDIA
+  developer website](https://developer.nvidia.com/cudnn).
+  Can only be run on GPU.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      kernel_initializer: Initializer for the `kernel` weights matrix, used for
+        the linear transformation of the inputs.
+      recurrent_initializer: Initializer for the `recurrent_kernel` weights
+        matrix, used for the linear transformation of the recurrent state.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to the `kernel` weights
+        matrix.
+      recurrent_regularizer: Regularizer function applied to the
+        `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to the output of the
+        layer (its "activation").
+      kernel_constraint: Constraint function applied to the `kernel` weights
+        matrix.
+      recurrent_constraint: Constraint function applied to the
+        `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      return_sequences: Boolean. Whether to return the last output in the output
+        sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state in addition to the
+        output.
+      go_backwards: Boolean (default False). If True, process the input sequence
+        backwards and return the reversed sequence.
+      stateful: Boolean (default False). If True, the last state for each sample
+        at index i in a batch will be used as initial state for the sample of
+        index i in the following batch.
+  """
+
+  def __init__(self,
+               units,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               **kwargs):
+    self.units = units
+    cell_spec = collections.namedtuple('cell', 'state_size')
+    self._cell = cell_spec(state_size=self.units)
+    super(CuDNNGRU, self).__init__(
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        **kwargs)
+
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+  @property
+  def cell(self):
+    return self._cell
+
+  def build(self, input_shape):
+    super(CuDNNGRU, self).build(input_shape)
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_dim = int(input_shape[-1])
+
+    self.kernel = self.add_weight(
+        shape=(input_dim, self.units * 3),
+        name='kernel',
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+
+    self.recurrent_kernel = self.add_weight(
+        shape=(self.units, self.units * 3),
+        name='recurrent_kernel',
+        initializer=self.recurrent_initializer,
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+
+    self.bias = self.add_weight(
+        shape=(self.units * 6,),
+        name='bias',
+        initializer=self.bias_initializer,
+        regularizer=self.bias_regularizer,
+        constraint=self.bias_constraint)
+
+    self.built = True
+
+  def _process_batch(self, inputs, initial_state):
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+    input_h = initial_state[0]
+    input_h = array_ops.expand_dims(input_h, axis=0)
+
+    params = self._canonical_to_params(
+        weights=[
+            self.kernel[:, self.units:self.units * 2],
+            self.kernel[:, :self.units],
+            self.kernel[:, self.units * 2:],
+            self.recurrent_kernel[:, self.units:self.units * 2],
+            self.recurrent_kernel[:, :self.units],
+            self.recurrent_kernel[:, self.units * 2:],
+        ],
+        biases=[
+            self.bias[self.units:self.units * 2],
+            self.bias[:self.units],
+            self.bias[self.units * 2:self.units * 3],
+            self.bias[self.units * 4:self.units * 5],
+            self.bias[self.units * 3:self.units * 4],
+            self.bias[self.units * 5:],
+        ],
+    )
+
+    outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+        inputs,
+        input_h=input_h,
+        input_c=0,
+        params=params,
+        is_training=True,
+        rnn_mode='gru')
+
+    if self.stateful or self.return_state:
+      h = h[0]
+    if self.return_sequences:
+      output = array_ops.transpose(outputs, perm=(1, 0, 2))
+    else:
+      output = outputs[-1]
+    return output, [h]
+
+  def get_config(self):
+    config = {
+        'units': self.units,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(CuDNNGRU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@tf_export('keras.layers.CuDNNLSTM')
+class CuDNNLSTM(_CuDNNRNN):
+  """Fast LSTM implementation backed by cuDNN.
+
+  More information about cuDNN can be found on the [NVIDIA
+  developer website](https://developer.nvidia.com/cudnn).
+  Can only be run on GPU.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      kernel_initializer: Initializer for the `kernel` weights matrix, used for
+        the linear transformation of the inputs.
+      unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate
+        at initialization. Setting it to true will also force
+        `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+        al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+      recurrent_initializer: Initializer for the `recurrent_kernel` weights
+        matrix, used for the linear transformation of the recurrent state.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to the `kernel` weights
+        matrix.
+      recurrent_regularizer: Regularizer function applied to the
+        `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to the output of the
+        layer (its "activation").
+      kernel_constraint: Constraint function applied to the `kernel` weights
+        matrix.
+      recurrent_constraint: Constraint function applied to the
+        `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      return_sequences: Boolean. Whether to return the last output. in the
+        output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state in addition to the
+        output.
+      go_backwards: Boolean (default False). If True, process the input sequence
+        backwards and return the reversed sequence.
+      stateful: Boolean (default False). If True, the last state for each sample
+        at index i in a batch will be used as initial state for the sample of
+        index i in the following batch.
+  """
+
+  def __init__(self,
+               units,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               **kwargs):
+    self.units = units
+    cell_spec = collections.namedtuple('cell', 'state_size')
+    self._cell = cell_spec(state_size=(self.units, self.units))
+    super(CuDNNLSTM, self).__init__(
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        **kwargs)
+
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.unit_forget_bias = unit_forget_bias
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+  @property
+  def cell(self):
+    return self._cell
+
+  def build(self, input_shape):
+    super(CuDNNLSTM, self).build(input_shape)
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_dim = int(input_shape[-1])
+
+    self.kernel = self.add_weight(
+        shape=(input_dim, self.units * 4),
+        name='kernel',
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+
+    self.recurrent_kernel = self.add_weight(
+        shape=(self.units, self.units * 4),
+        name='recurrent_kernel',
+        initializer=self.recurrent_initializer,
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+
+    if self.unit_forget_bias:
+
+      def bias_initializer(_, *args, **kwargs):
+        return array_ops.concat([
+            self.bias_initializer((self.units * 5,), *args, **kwargs),
+            initializers.Ones()((self.units,), *args, **kwargs),
+            self.bias_initializer((self.units * 2,), *args, **kwargs),
+        ], axis=0)
+    else:
+      bias_initializer = self.bias_initializer
+    self.bias = self.add_weight(
+        shape=(self.units * 8,),
+        name='bias',
+        initializer=bias_initializer,
+        regularizer=self.bias_regularizer,
+        constraint=self.bias_constraint)
+
+    self.built = True
+
+  def _process_batch(self, inputs, initial_state):
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+    input_h = initial_state[0]
+    input_c = initial_state[1]
+    input_h = array_ops.expand_dims(input_h, axis=0)
+    input_c = array_ops.expand_dims(input_c, axis=0)
+
+    params = self._canonical_to_params(
+        weights=[
+            self.kernel[:, :self.units],
+            self.kernel[:, self.units:self.units * 2],
+            self.kernel[:, self.units * 2:self.units * 3],
+            self.kernel[:, self.units * 3:],
+            self.recurrent_kernel[:, :self.units],
+            self.recurrent_kernel[:, self.units:self.units * 2],
+            self.recurrent_kernel[:, self.units * 2:self.units * 3],
+            self.recurrent_kernel[:, self.units * 3:],
+        ],
+        biases=[
+            self.bias[:self.units],
+            self.bias[self.units:self.units * 2],
+            self.bias[self.units * 2:self.units * 3],
+            self.bias[self.units * 3:self.units * 4],
+            self.bias[self.units * 4:self.units * 5],
+            self.bias[self.units * 5:self.units * 6],
+            self.bias[self.units * 6:self.units * 7],
+            self.bias[self.units * 7:],
+        ],
+    )
+
+    outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+        inputs,
+        input_h=input_h,
+        input_c=input_c,
+        params=params,
+        is_training=True)
+
+    if self.stateful or self.return_state:
+      h = h[0]
+      c = c[0]
+    if self.return_sequences:
+      output = array_ops.transpose(outputs, perm=(1, 0, 2))
+    else:
+      output = outputs[-1]
+    return output, [h, c]
+
+  def get_config(self):
+    config = {
+        'units': self.units,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'unit_forget_bias': self.unit_forget_bias,
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(CuDNNLSTM, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a06943b10830571a9aa7367dd23a59cb332f4d84
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent_test.py
@@ -0,0 +1,436 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for cudnn recurrent layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class CuDNNTest(test.TestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_cudnn_rnn_timing(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+
+        for rnn_type in ['lstm', 'gru']:
+          times = []
+          for use_cudnn in [True, False]:
+            start_time = time.time()
+            inputs = keras.layers.Input(shape=(None, input_size))
+            if use_cudnn:
+              if rnn_type == 'lstm':
+                layer = keras.layers.CuDNNLSTM(units)
+              else:
+                layer = keras.layers.CuDNNGRU(units)
+            else:
+              if rnn_type == 'lstm':
+                layer = keras.layers.LSTM(units)
+              else:
+                layer = keras.layers.GRU(units)
+            outputs = layer(inputs)
+
+            optimizer = RMSPropOptimizer(learning_rate=0.001)
+            model = keras.models.Model(inputs, outputs)
+            model.compile(optimizer, 'mse')
+
+            x = np.random.random((num_samples, timesteps, input_size))
+            y = np.random.random((num_samples, units))
+            model.fit(x, y, epochs=4, batch_size=32)
+
+            times.append(time.time() - start_time)
+          self.assertGreater(times[1], times[0])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_cudnn_rnn_basics(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+        for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
+          for return_sequences in [True, False]:
+            with keras.utils.CustomObjectScope(
+                {'keras.layers.CuDNNGRU': keras.layers.CuDNNGRU,
+                 'keras.layers.CuDNNLSTM': keras.layers.CuDNNLSTM}):
+              testing_utils.layer_test(
+                  layer_class,
+                  kwargs={'units': units,
+                          'return_sequences': return_sequences},
+                  input_shape=(num_samples, timesteps, input_size))
+          for go_backwards in [True, False]:
+            with keras.utils.CustomObjectScope(
+                {'keras.layers.CuDNNGRU': keras.layers.CuDNNGRU,
+                 'keras.layers.CuDNNLSTM': keras.layers.CuDNNLSTM}):
+              testing_utils.layer_test(
+                  layer_class,
+                  kwargs={'units': units,
+                          'go_backwards': go_backwards},
+                  input_shape=(num_samples, timesteps, input_size))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_trainability(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        units = 2
+        for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
+          layer = layer_class(units)
+          layer.build((None, None, input_size))
+          self.assertEqual(len(layer.weights), 3)
+          self.assertEqual(len(layer.trainable_weights), 3)
+          self.assertEqual(len(layer.non_trainable_weights), 0)
+          layer.trainable = False
+          self.assertEqual(len(layer.weights), 3)
+          self.assertEqual(len(layer.non_trainable_weights), 3)
+          self.assertEqual(len(layer.trainable_weights), 0)
+          layer.trainable = True
+          self.assertEqual(len(layer.weights), 3)
+          self.assertEqual(len(layer.trainable_weights), 3)
+          self.assertEqual(len(layer.non_trainable_weights), 0)
+
+  @parameterized.named_parameters(
+      ('cudnngru', keras.layers.CuDNNGRU),
+      ('cudnnlstm', keras.layers.CuDNNLSTM),
+  )
+  def test_regularizer(self, layer_class):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+        layer = layer_class(
+            units,
+            return_sequences=False,
+            input_shape=(timesteps, input_size),
+            kernel_regularizer=keras.regularizers.l1(0.01),
+            recurrent_regularizer=keras.regularizers.l1(0.01),
+            bias_regularizer='l2')
+        layer.build((None, None, input_size))
+        self.assertEqual(len(layer.losses), 3)
+
+        layer = layer_class(
+            units,
+            return_sequences=False,
+            input_shape=(timesteps, input_size),
+            activity_regularizer='l2')
+        self.assertTrue(layer.activity_regularizer)
+        x = keras.backend.variable(
+            np.ones((num_samples, timesteps, input_size)))
+        layer(x)
+        self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+  @parameterized.named_parameters(
+      ('cudnngru', keras.layers.CuDNNGRU),
+      ('cudnnlstm', keras.layers.CuDNNLSTM),
+  )
+  def test_return_state(self, layer_class):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+        num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1
+
+        inputs = keras.Input(batch_shape=(num_samples, timesteps, input_size))
+        layer = layer_class(units, return_state=True, stateful=True)
+        outputs = layer(inputs)
+        _, state = outputs[0], outputs[1:]
+        self.assertEqual(len(state), num_states)
+        model = keras.models.Model(inputs, state[0])
+
+        inputs = np.random.random((num_samples, timesteps, input_size))
+        state = model.predict(inputs)
+        np.testing.assert_allclose(
+            keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+  @parameterized.named_parameters(
+      ('cudnngru', keras.layers.CuDNNGRU),
+      ('cudnnlstm', keras.layers.CuDNNLSTM),
+  )
+  def test_specify_initial_state_keras_tensor(self, layer_class):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+        num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1
+
+        inputs = keras.Input((timesteps, input_size))
+        initial_state = [keras.Input((units,)) for _ in range(num_states)]
+        layer = layer_class(units)
+        if len(initial_state) == 1:
+          output = layer(inputs, initial_state=initial_state[0])
+        else:
+          output = layer(inputs, initial_state=initial_state)
+        self.assertIn(initial_state[0], layer._inbound_nodes[0].input_tensors)
+
+        model = keras.models.Model([inputs] + initial_state, output)
+        model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+        inputs = np.random.random((num_samples, timesteps, input_size))
+        initial_state = [
+            np.random.random((num_samples, units)) for _ in range(num_states)
+        ]
+        targets = np.random.random((num_samples, units))
+        model.fit([inputs] + initial_state, targets)
+
+  @parameterized.named_parameters(
+      ('cudnngru', keras.layers.CuDNNGRU),
+      ('cudnnlstm', keras.layers.CuDNNLSTM),
+  )
+  def test_statefulness(self, layer_class):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        units = 2
+        num_samples = 32
+
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Embedding(
+                10,
+                input_size,
+                input_length=timesteps,
+                batch_input_shape=(num_samples, timesteps)))
+        layer = layer_class(
+            units, return_sequences=False, stateful=True, weights=None)
+        model.add(layer)
+        model.compile(optimizer='sgd', loss='mse')
+        out1 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertEqual(out1.shape, (num_samples, units))
+
+        # train once so that the states change
+        model.train_on_batch(
+            np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+        out2 = model.predict(np.ones((num_samples, timesteps)))
+
+        # if the state is not reset, output should be different
+        self.assertNotEqual(out1.max(), out2.max())
+
+        # check that output changes after states are reset
+        # (even though the model itself didn't change)
+        layer.reset_states()
+        out3 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertNotEqual(out2.max(), out3.max())
+
+        # check that container-level reset_states() works
+        model.reset_states()
+        out4 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertAllClose(out3, out4, atol=1e-5)
+
+        # check that the call to `predict` updated the states
+        out5 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertNotEqual(out4.max(), out5.max())
+
+  # TODO(psv): Add generic cross product helper function for parametrized tests.
+  @parameterized.named_parameters(
+      ('cudnnlstm_to_lstm_unidirectional_impl_1', 'LSTM', False, False, 1),
+      ('cudnnlstm_to_lstm_bidirectional_impl_1', 'LSTM', False, True, 1),
+      ('lstm_to_cudnnlstm_unidirectional_impl_1', 'LSTM', True, False, 1),
+      ('lstm_to_cudnnlstm_bidirectional_impl_1', 'LSTM', True, True, 1),
+      ('cudnngru_to_gru_unidirectional_impl_1', 'GRU', False, False, 1),
+      ('cudnngru_to_gru_bidirectional_impl_1', 'GRU', False, True, 1),
+      ('gru_to_cudnngru_unidirectional_impl_1', 'GRU', True, False, 1),
+      ('gru_to_cudnngru_bidirectional_impl_1', 'GRU', True, True, 1),
+      ('cudnnlstm_to_lstm_unidirectional_impl_2', 'LSTM', False, False, 2),
+      ('cudnnlstm_to_lstm_bidirectional_impl_2', 'LSTM', False, True, 2),
+      ('lstm_to_cudnnlstm_unidirectional_impl_2', 'LSTM', True, False, 2),
+      ('lstm_to_cudnnlstm_bidirectional_impl_2', 'LSTM', True, True, 2),
+      ('cudnngru_to_gru_unidirectional_impl_2', 'GRU', False, False, 2),
+      ('cudnngru_to_gru_bidirectional_impl_2', 'GRU', False, True, 2),
+      ('gru_to_cudnngru_unidirectional_impl_2', 'GRU', True, False, 2),
+      ('gru_to_cudnngru_bidirectional_impl_2', 'GRU', True, True, 2),
+  )
+  def test_load_weights_between_noncudnn_rnn(self, rnn_type, to_cudnn,
+                                             bidirectional, implementation):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_size = 10
+        timesteps = 6
+        input_shape = (timesteps, input_size)
+        units = 2
+        num_samples = 32
+        inputs = np.random.random((num_samples, timesteps, input_size))
+
+        rnn_layer_kwargs = {
+            'recurrent_activation': 'sigmoid',
+            # ensure biases are non-zero and properly converted
+            'bias_initializer': 'random_uniform',
+            'implementation': implementation
+        }
+        if rnn_type == 'LSTM':
+          rnn_layer_class = keras.layers.LSTM
+          cudnn_rnn_layer_class = keras.layers.CuDNNLSTM
+        else:
+          rnn_layer_class = keras.layers.GRU
+          cudnn_rnn_layer_class = keras.layers.CuDNNGRU
+          rnn_layer_kwargs['reset_after'] = True
+
+        def convert_weights(source_layer, target_layer):
+          weights = source_layer.get_weights()
+          weights = keras.engine.saving.preprocess_weights_for_loading(
+              target_layer, weights)
+          target_layer.set_weights(weights)
+
+        input_layer = keras.layers.InputLayer(input_shape)
+
+        layer = rnn_layer_class(units, **rnn_layer_kwargs)
+        if bidirectional:
+          layer = keras.layers.Bidirectional(layer)
+
+        cudnn_layer = cudnn_rnn_layer_class(units)
+        if bidirectional:
+          cudnn_layer = keras.layers.Bidirectional(cudnn_layer)
+
+        model = keras.models.Sequential([input_layer, layer])
+        cudnn_model = keras.models.Sequential([input_layer, cudnn_layer])
+
+        if to_cudnn:
+          convert_weights(layer, cudnn_layer)
+        else:
+          convert_weights(cudnn_layer, layer)
+
+        self.assertAllClose(
+            model.predict(inputs), cudnn_model.predict(inputs), atol=1e-4)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_cudnnrnn_bidirectional(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        rnn = keras.layers.CuDNNGRU
+        samples = 2
+        dim = 2
+        timesteps = 2
+        output_dim = 2
+        mode = 'concat'
+
+        x = np.random.random((samples, timesteps, dim))
+        target_dim = 2 * output_dim if mode == 'concat' else output_dim
+        y = np.random.random((samples, target_dim))
+
+        # test with Sequential model
+        model = keras.Sequential()
+        model.add(
+            keras.layers.Bidirectional(
+                rnn(output_dim), merge_mode=mode, input_shape=(None, dim)))
+        model.compile(
+            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+        model.fit(x, y, epochs=1, batch_size=1)
+
+        # test config
+        model.get_config()
+        model = keras.models.model_from_json(model.to_json())
+        model.summary()
+
+        # test stacked bidirectional layers
+        model = keras.Sequential()
+        model.add(
+            keras.layers.Bidirectional(
+                rnn(output_dim, return_sequences=True),
+                merge_mode=mode,
+                input_shape=(None, dim)))
+        model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode))
+        model.compile(
+            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+        model.fit(x, y, epochs=1, batch_size=1)
+
+        # test with functional API
+        inputs = keras.Input((timesteps, dim))
+        outputs = keras.layers.Bidirectional(
+            rnn(output_dim), merge_mode=mode)(
+                inputs)
+        model = keras.Model(inputs, outputs)
+        model.compile(
+            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+        model.fit(x, y, epochs=1, batch_size=1)
+
+        # Bidirectional and stateful
+        inputs = keras.Input(batch_shape=(1, timesteps, dim))
+        outputs = keras.layers.Bidirectional(
+            rnn(output_dim, stateful=True), merge_mode=mode)(
+                inputs)
+        model = keras.Model(inputs, outputs)
+        model.compile(
+            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+        model.fit(x, y, epochs=1, batch_size=1)
+
+  def test_preprocess_weights_for_loading_gru_incompatible(self):
+    """Test loading weights between incompatible layers.
+
+    Should fail fast with an exception.
+    """
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        input_shape = (3, 5)
+
+        def gru(cudnn=False, **kwargs):
+          layer_class = keras.layers.CuDNNGRU if cudnn else keras.layers.GRU
+          return layer_class(2, input_shape=input_shape, **kwargs)
+
+        def get_layer_weights(layer):
+          layer.build(input_shape=input_shape)
+          return layer.get_weights()
+
+        def assert_not_compatible(src, dest, message):
+          with self.assertRaises(ValueError) as ex:
+            keras.engine.saving.preprocess_weights_for_loading(
+                dest,
+                get_layer_weights(src))
+          self.assertIn(message, str(ex.exception))
+
+        assert_not_compatible(
+            gru(),
+            gru(cudnn=True),
+            'GRU(reset_after=False) is not compatible with CuDNNGRU')
+        assert_not_compatible(
+            gru(cudnn=True),
+            gru(),
+            'CuDNNGRU is not compatible with GRU(reset_after=False)')
+        assert_not_compatible(
+            gru(),
+            gru(reset_after=True),
+            'GRU(reset_after=False) is not compatible with '
+            'GRU(reset_after=True)')
+        assert_not_compatible(
+            gru(reset_after=True),
+            gru(),
+            'GRU(reset_after=True) is not compatible with '
+            'GRU(reset_after=False)')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index ca92899a455cd28a756e9efff63655d7c43c9f45..2b353ac007a33d3dbc17750cdb9da4757aff35d3 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -23,7 +23,9 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -100,7 +102,8 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(**kwargs)
+    dtype = kwargs.pop('dtype', K.floatx())
+    super(Embedding, self).__init__(dtype=dtype, **kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
@@ -111,24 +114,23 @@ class Embedding(Layer):
     self.mask_zero = mask_zero
     self.input_length = input_length
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     self.embeddings = self.add_weight(
         shape=(self.input_dim, self.output_dim),
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
-        constraint=self.embeddings_constraint,
-        dtype=self.dtype)
+        constraint=self.embeddings_constraint)
     self.built = True
 
   def compute_mask(self, inputs, mask=None):
     if not self.mask_zero:
       return None
     else:
-      return K.not_equal(inputs, 0)
+      return math_ops.not_equal(inputs, 0)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.input_length is None:
       return input_shape + (self.output_dim,)
@@ -152,8 +154,8 @@ class Embedding(Layer):
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
-      inputs = K.cast(inputs, 'int32')
-    out = K.gather(self.embeddings, inputs)
+      inputs = math_ops.cast(inputs, 'int32')
+    out = embedding_ops.embedding_lookup(self.embeddings, inputs)
     return out
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 1712111b877cf1fee4353c5542f33a973a26de95..6ebf5dc94adb423abae7ec9e6910fb86439410f1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -18,6 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -25,47 +28,55 @@ from tensorflow.python.platform import test
 
 class EmbeddingTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=False)
   def test_embedding(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Embedding,
-          kwargs={'output_dim': 4,
-                  'input_dim': 10,
-                  'input_length': 2},
-          input_shape=(3, 2),
-          input_dtype='int32',
-          expected_output_dtype='float32')
+    testing_utils.layer_test(
+        keras.layers.Embedding,
+        kwargs={'output_dim': 4,
+                'input_dim': 10,
+                'input_length': 2},
+        input_shape=(3, 2),
+        input_dtype='int32',
+        expected_output_dtype='float32')
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Embedding,
-          kwargs={'output_dim': 4,
-                  'input_dim': 10,
-                  'mask_zero': True},
-          input_shape=(3, 2),
-          input_dtype='int32',
-          expected_output_dtype='float32')
+    testing_utils.layer_test(
+        keras.layers.Embedding,
+        kwargs={'output_dim': 4,
+                'input_dim': 10,
+                'mask_zero': True},
+        input_shape=(3, 2),
+        input_dtype='int32',
+        expected_output_dtype='float32')
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Embedding,
-          kwargs={'output_dim': 4,
-                  'input_dim': 10,
-                  'mask_zero': True},
-          input_shape=(3, 4, 2),
-          input_dtype='int32',
-          expected_output_dtype='float32')
+    testing_utils.layer_test(
+        keras.layers.Embedding,
+        kwargs={'output_dim': 4,
+                'input_dim': 10,
+                'mask_zero': True},
+        input_shape=(3, 4, 2),
+        input_dtype='int32',
+        expected_output_dtype='float32')
+
+    testing_utils.layer_test(
+        keras.layers.Embedding,
+        kwargs={'output_dim': 4,
+                'input_dim': 10,
+                'mask_zero': True,
+                'input_length': (None, 2)},
+        input_shape=(3, 4, 2),
+        input_dtype='int32',
+        expected_output_dtype='float32')
 
+  def test_embedding_correctness(self):
     with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Embedding,
-          kwargs={'output_dim': 4,
-                  'input_dim': 10,
-                  'mask_zero': True,
-                  'input_length': (None, 2)},
-          input_shape=(3, 4, 2),
-          input_dtype='int32',
-          expected_output_dtype='float32')
+      layer = keras.layers.Embedding(output_dim=2, input_dim=2)
+      layer.build((None, 2))
+      matrix = np.array([[1, 1], [2, 2]])
+      layer.set_weights([matrix])
+
+      inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
+      outputs = keras.backend.eval(layer(inputs))
+      self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/gru_test.py b/tensorflow/python/keras/_impl/keras/layers/gru_test.py
index c57fbac41cc43995ef3249414ed03928e7ffd044..48e7e14f5ab73b534ab0d1c765ad2572b2930b2b 100644
--- a/tensorflow/python/keras/_impl/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/gru_test.py
@@ -20,64 +20,66 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class GRULayerTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_return_sequences_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.GRU,
-          kwargs={'units': units,
-                  'return_sequences': True},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.GRU,
+        kwargs={'units': units,
+                'return_sequences': True},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dynamic_behavior_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      layer = keras.layers.GRU(units, input_shape=(None, embedding_dim))
-      model = keras.models.Sequential()
-      model.add(layer)
-      model.compile('sgd', 'mse')
-      x = np.random.random((num_samples, timesteps, embedding_dim))
-      y = np.random.random((num_samples, units))
-      model.train_on_batch(x, y)
-
+    layer = keras.layers.GRU(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(RMSPropOptimizer(0.01), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dropout_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.GRU,
-          kwargs={'units': units,
-                  'dropout': 0.1,
-                  'recurrent_dropout': 0.1},
-          input_shape=(num_samples, timesteps, embedding_dim))
-
+    testing_utils.layer_test(
+        keras.layers.GRU,
+        kwargs={'units': units,
+                'dropout': 0.1,
+                'recurrent_dropout': 0.1},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_implementation_mode_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      for mode in [0, 1, 2]:
-        testing_utils.layer_test(
-            keras.layers.GRU,
-            kwargs={'units': units,
-                    'implementation': mode},
-            input_shape=(num_samples, timesteps, embedding_dim))
+    for mode in [0, 1, 2]:
+      testing_utils.layer_test(
+          keras.layers.GRU,
+          kwargs={'units': units,
+                  'implementation': mode},
+          input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_statefulness_GRU(self):
     num_samples = 2
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py
index df0efe6b8b7eaa0259eb6f4e246269551b3e0c15..caae820fb3a8eba76c3fbbca734908514b076982 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local.py
@@ -25,8 +25,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -120,7 +120,7 @@ class LocallyConnected1D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=3)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[2]
     if input_dim is None:
@@ -148,7 +148,7 @@ class LocallyConnected1D(Layer):
     self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
     self.built = True
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
                                            self.padding, self.strides[0])
@@ -307,7 +307,7 @@ class LocallyConnected2D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=4)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     if self.data_format == 'channels_last':
       input_row, input_col = input_shape[1:-1]
@@ -350,7 +350,7 @@ class LocallyConnected2D(Layer):
       self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
     self.built = True
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
diff --git a/tensorflow/python/keras/_impl/keras/layers/local_test.py b/tensorflow/python/keras/_impl/keras/layers/local_test.py
index a815a0fadc8215c00f3db4749e323f96e44b66f3..93741d24b9a74cf9e8a83069f7c4235b1f489818 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -27,6 +28,7 @@ from tensorflow.python.platform import test
 
 class LocallyConnectedLayersTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_locallyconnected_1d(self):
     num_samples = 2
     num_steps = 8
@@ -39,16 +41,15 @@ class LocallyConnectedLayersTest(test.TestCase):
         if padding == 'same' and strides != 1:
           continue
 
-        with self.test_session():
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected1D,
-              kwargs={
-                  'filters': filters,
-                  'kernel_size': filter_length,
-                  'padding': padding,
-                  'strides': strides
-              },
-              input_shape=(num_samples, num_steps, input_dim))
+        testing_utils.layer_test(
+            keras.layers.LocallyConnected1D,
+            kwargs={
+                'filters': filters,
+                'kernel_size': filter_length,
+                'padding': padding,
+                'strides': strides
+            },
+            input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -86,6 +87,7 @@ class LocallyConnectedLayersTest(test.TestCase):
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_locallyconnected_2d(self):
     num_samples = 8
     filters = 3
@@ -98,20 +100,18 @@ class LocallyConnectedLayersTest(test.TestCase):
         if padding == 'same' and strides != (1, 1):
           continue
 
-        with self.test_session():
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected2D,
-              kwargs={
-                  'filters': filters,
-                  'kernel_size': 3,
-                  'padding': padding,
-                  'kernel_regularizer': 'l2',
-                  'bias_regularizer': 'l2',
-                  'activity_regularizer': 'l2',
-                  'strides': strides,
-                  'data_format': 'channels_last'
-              },
-              input_shape=(num_samples, num_row, num_col, stack_size))
+        testing_utils.layer_test(
+            keras.layers.LocallyConnected2D,
+            kwargs={
+                'filters': filters,
+                'kernel_size': 3,
+                'padding': padding,
+                'kernel_regularizer': 'l2',
+                'bias_regularizer': 'l2',
+                'strides': strides,
+                'data_format': 'channels_last'
+            },
+            input_shape=(num_samples, num_row, num_col, stack_size))
 
   def test_locallyconnected_2d_channels_first(self):
     num_samples = 8
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index b7af1e8cf03dccfa49ded0fe5b3724bba06e62ba..11a5e0aeaacfa7520361ae41ac3d40607e8a9050 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -20,28 +20,29 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class LSTMLayerTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_return_sequences_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.LSTM,
-          kwargs={'units': units,
-                  'return_sequences': True},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.LSTM,
+        kwargs={'units': units,
+                'return_sequences': True},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_static_shape_inference_LSTM(self):
     # Github issue: 15165
-    num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
@@ -55,45 +56,45 @@ class LSTMLayerTest(test.TestCase):
     outputs = model.layers[-1].output
     self.assertEquals(outputs.get_shape().as_list(), [None, timesteps, units])
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dynamic_behavior_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim))
-      model = keras.models.Sequential()
-      model.add(layer)
-      model.compile('sgd', 'mse')
-      x = np.random.random((num_samples, timesteps, embedding_dim))
-      y = np.random.random((num_samples, units))
-      model.train_on_batch(x, y)
+    layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(RMSPropOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dropout_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.LSTM,
-          kwargs={'units': units,
-                  'dropout': 0.1,
-                  'recurrent_dropout': 0.1},
-          input_shape=(num_samples, timesteps, embedding_dim))
-
+    testing_utils.layer_test(
+        keras.layers.LSTM,
+        kwargs={'units': units,
+                'dropout': 0.1,
+                'recurrent_dropout': 0.1},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_implementation_mode_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      for mode in [0, 1, 2]:
-        testing_utils.layer_test(
-            keras.layers.LSTM,
-            kwargs={'units': units,
-                    'implementation': mode},
-            input_shape=(num_samples, timesteps, embedding_dim))
+    for mode in [0, 1, 2]:
+      testing_utils.layer_test(
+          keras.layers.LSTM,
+          kwargs={'units': units,
+                  'implementation': mode},
+          input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_statefulness_LSTM(self):
     num_samples = 2
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index cdf2878e83e32147d30d6b29742b7e9013a1facb..2b6cf7c8a94ff40ea35e2bbfe13e6c26024857b4 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -21,8 +21,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine.topology import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -80,7 +83,7 @@ class _Merge(Layer):
         output_shape.append(i)
     return tuple(output_shape)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
@@ -127,7 +130,7 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           for _ in range(max_ndim - x_ndim):
-            x = K.expand_dims(x, 1)
+            x = array_ops.expand_dims(x, axis=1)
           reshaped_inputs.append(x)
         return self._merge_function(reshaped_inputs)
       else:
@@ -137,19 +140,22 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           if x_ndim is None:
-            x_shape = K.shape(x)
+            x_shape = array_ops.shape(x)
             batch_size = x_shape[0]
-            new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
-            x_transposed = K.reshape(x,
-                                     K.stack([batch_size,
-                                              K.prod(x_shape[1:])]))
-            x_transposed = K.permute_dimensions(x_transposed, (1, 0))
-            x_transposed = K.reshape(x_transposed, new_shape)
+            new_shape = K.concatenate(
+                [x_shape[1:],
+                 array_ops.expand_dims(batch_size, axis=-1)])
+            x_transposed = array_ops.reshape(
+                x,
+                array_ops.stack(
+                    [batch_size, math_ops.reduce_prod(x_shape[1:])], axis=0))
+            x_transposed = array_ops.transpose(x_transposed, perm=(1, 0))
+            x_transposed = array_ops.reshape(x_transposed, new_shape)
             reshaped_inputs.append(x_transposed)
             transposed = True
           elif x_ndim > 1:
             dims = list(range(1, x_ndim)) + [0]
-            reshaped_inputs.append(K.permute_dimensions(x, dims))
+            reshaped_inputs.append(array_ops.transpose(x, perm=dims))
             transposed = True
           else:
             # We don't transpose inputs if they are 1D vectors or scalars.
@@ -159,22 +165,23 @@ class _Merge(Layer):
         if transposed:
           # If inputs have been transposed, we have to transpose the output too.
           if y_ndim is None:
-            y_shape = K.shape(y)
-            y_ndim = K.shape(y_shape)[0]
+            y_shape = array_ops.shape(y)
+            y_ndim = array_ops.shape(y_shape)[0]
             batch_size = y_shape[y_ndim - 1]
-            new_shape = K.concatenate(
-                [K.expand_dims(batch_size), y_shape[:y_ndim - 1]])
-            y = K.reshape(y, (-1, batch_size))
-            y = K.permute_dimensions(y, (1, 0))
-            y = K.reshape(y, new_shape)
+            new_shape = K.concatenate([
+                array_ops.expand_dims(batch_size, axis=-1), y_shape[:y_ndim - 1]
+            ])
+            y = array_ops.reshape(y, (-1, batch_size))
+            y = array_ops.transpose(y, perm=(1, 0))
+            y = array_ops.reshape(y, new_shape)
           elif y_ndim > 1:
             dims = [y_ndim - 1] + list(range(y_ndim - 1))
-            y = K.permute_dimensions(y, dims)
+            y = array_ops.transpose(y, perm=dims)
         return y
     else:
       return self._merge_function(inputs)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if input_shape[0] is None:
       output_shape = None
@@ -207,7 +214,7 @@ class _Merge(Layer):
                        'should have the same length.')
     if all([m is None for m in mask]):
       return None
-    masks = [K.expand_dims(m, 0) for m in mask if m is not None]
+    masks = [array_ops.expand_dims(m, axis=0) for m in mask if m is not None]
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
@@ -267,7 +274,7 @@ class Subtract(_Merge):
   ```
   """
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     super(Subtract, self).build(input_shape)
     if len(input_shape) != 2:
@@ -325,7 +332,7 @@ class Maximum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.maximum(output, inputs[i])
+      output = math_ops.maximum(output, inputs[i])
     return output
 
 
@@ -340,7 +347,7 @@ class Minimum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.minimum(output, inputs[i])
+      output = math_ops.minimum(output, inputs[i])
     return output
 
 
@@ -363,7 +370,7 @@ class Concatenate(_Merge):
     self.supports_masking = True
     self._reshape_required = False
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) < 2:
@@ -385,7 +392,7 @@ class Concatenate(_Merge):
   def _merge_function(self, inputs):
     return K.concatenate(inputs, axis=self.axis)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list):
       raise ValueError('A `Concatenate` layer should be called '
@@ -418,10 +425,10 @@ class Concatenate(_Merge):
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
         # Input is unmasked. Append all 1s to masks,
-        masks.append(K.ones_like(input_i, dtype='bool'))
+        masks.append(array_ops.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
-        masks.append(K.expand_dims(mask_i))
+        masks.append(array_ops.expand_dims(mask_i, axis=-1))
       else:
         masks.append(mask_i)
     concatenated = K.concatenate(masks, axis=self.axis)
@@ -471,7 +478,7 @@ class Dot(_Merge):
     self.supports_masking = True
     self._reshape_required = False
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) != 2:
@@ -511,12 +518,12 @@ class Dot(_Merge):
         else:
           axes.append(self.axes[i])
     if self.normalize:
-      x1 = K.l2_normalize(x1, axis=axes[0])
-      x2 = K.l2_normalize(x2, axis=axes[1])
+      x1 = nn.l2_normalize(x1, axis=axes[0])
+      x2 = nn.l2_normalize(x2, axis=axes[1])
     output = K.batch_dot(x1, x2, axes)
     return output
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge_test.py b/tensorflow/python/keras/_impl/keras/layers/merge_test.py
index bb03dda1fc645222c1ced97cfce8d459586dd89d..b2fe06f93e33ed63d6a2aa29522ecb552f582440 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -27,24 +28,25 @@ from tensorflow.python.platform import test
 
 class MergeLayersTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_add(self):
-    with self.test_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      i3 = keras.layers.Input(shape=(4, 5))
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    i3 = keras.layers.Input(shape=(4, 5))
 
-      o = keras.layers.add([i1, i2, i3])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      model = keras.models.Model([i1, i2, i3], o)
+    o = keras.layers.add([i1, i2, i3])
+    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2, i3], o)
 
-      x1 = np.random.random((2, 4, 5))
-      x2 = np.random.random((2, 4, 5))
-      x3 = np.random.random((2, 4, 5))
-      out = model.predict([x1, x2, x3])
-      self.assertEqual(out.shape, (2, 4, 5))
-      self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    x3 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2, x3])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
-      # test masking
+  def test_merge_add_masking(self):
+    with self.test_session():
       i1 = keras.layers.Input(shape=(4, 5))
       i2 = keras.layers.Input(shape=(4, 5))
       m1 = keras.layers.Masking()(i1)
@@ -54,11 +56,13 @@ class MergeLayersTest(test.TestCase):
       mask = layer.output_mask
       self.assertListEqual(mask.get_shape().as_list(), [None, 4])
 
-      # test missing shape
+  def test_merge_add_dynamic_shape(self):
+    with self.test_session():
       i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
       i2 = array_ops.placeholder(shape=(4, 5), dtype='float32')
       layer = keras.layers.Add()
       o = layer([i1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [4, 5])
 
   def test_merge_elementwise_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
@@ -72,79 +76,82 @@ class MergeLayersTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.add([i1])
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_multiply(self):
-    with self.test_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      i3 = keras.layers.Input(shape=(4, 5))
-      o = keras.layers.multiply([i1, i2, i3])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      model = keras.models.Model([i1, i2, i3], o)
-
-      x1 = np.random.random((2, 4, 5))
-      x2 = np.random.random((2, 4, 5))
-      x3 = np.random.random((2, 4, 5))
-      out = model.predict([x1, x2, x3])
-      self.assertEqual(out.shape, (2, 4, 5))
-      self.assertAllClose(out, x1 * x2 * x3, atol=1e-4)
-
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    i3 = keras.layers.Input(shape=(4, 5))
+    o = keras.layers.multiply([i1, i2, i3])
+    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2, i3], o)
+
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    x3 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2, x3])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, x1 * x2 * x3, atol=1e-4)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_average(self):
-    with self.test_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      o = keras.layers.average([i1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      model = keras.models.Model([i1, i2], o)
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    o = keras.layers.average([i1, i2])
+    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2], o)
 
-      x1 = np.random.random((2, 4, 5))
-      x2 = np.random.random((2, 4, 5))
-      out = model.predict([x1, x2])
-      self.assertEqual(out.shape, (2, 4, 5))
-      self.assertAllClose(out, 0.5 * (x1 + x2), atol=1e-4)
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, 0.5 * (x1 + x2), atol=1e-4)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_maximum(self):
-    with self.test_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      o = keras.layers.maximum([i1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      model = keras.models.Model([i1, i2], o)
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    o = keras.layers.maximum([i1, i2])
+    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2], o)
 
-      x1 = np.random.random((2, 4, 5))
-      x2 = np.random.random((2, 4, 5))
-      out = model.predict([x1, x2])
-      self.assertEqual(out.shape, (2, 4, 5))
-      self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4)
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_minimum(self):
-    with self.test_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      o = keras.layers.minimum([i1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      model = keras.models.Model([i1, i2], o)
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    o = keras.layers.minimum([i1, i2])
+    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2], o)
 
-      x1 = np.random.random((2, 4, 5))
-      x2 = np.random.random((2, 4, 5))
-      out = model.predict([x1, x2])
-      self.assertEqual(out.shape, (2, 4, 5))
-      self.assertAllClose(out, np.minimum(x1, x2), atol=1e-4)
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, np.minimum(x1, x2), atol=1e-4)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_concatenate(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    o = keras.layers.concatenate([i1, i2], axis=1)
+    self.assertListEqual(o.get_shape().as_list(), [None, 8, 5])
+    model = keras.models.Model([i1, i2], o)
+
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 8, 5))
+    self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
+
+  def test_merge_concatenate_masking(self):
     with self.test_session():
       i1 = keras.layers.Input(shape=(4, 5))
       i2 = keras.layers.Input(shape=(4, 5))
-      o = keras.layers.concatenate([i1, i2], axis=1)
-      self.assertListEqual(o.get_shape().as_list(), [None, 8, 5])
-      model = keras.models.Model([i1, i2], o)
-
-      x1 = np.random.random((2, 4, 5))
-      x2 = np.random.random((2, 4, 5))
-      out = model.predict([x1, x2])
-      self.assertEqual(out.shape, (2, 8, 5))
-      self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
-
-      # test masking
       m1 = keras.layers.Masking()(i1)
       layer = keras.layers.Concatenate()
       o = layer([m1, i2])
@@ -162,35 +169,35 @@ class MergeLayersTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'called on a list'):
       keras.layers.concatenate([i1], axis=-1)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_dot(self):
-    with self.test_session():
-      i1 = keras.layers.Input(shape=(4,))
-      i2 = keras.layers.Input(shape=(4,))
-      o = keras.layers.dot([i1, i2], axes=1)
-      self.assertListEqual(o.get_shape().as_list(), [None, 1])
-      model = keras.models.Model([i1, i2], o)
-      _ = keras.layers.Dot(axes=1).get_config()
-
-      x1 = np.random.random((2, 4))
-      x2 = np.random.random((2, 4))
-      out = model.predict([x1, x2])
-      self.assertEqual(out.shape, (2, 1))
-      expected = np.zeros((2, 1))
-      expected[0, 0] = np.dot(x1[0], x2[0])
-      expected[1, 0] = np.dot(x1[1], x2[1])
-      self.assertAllClose(out, expected, atol=1e-4)
-
-      # Test with negative tuple of axes.
-      o = keras.layers.dot([i1, i2], axes=(-1, -1))
-      self.assertListEqual(o.get_shape().as_list(), [None, 1])
-      model = keras.models.Model([i1, i2], o)
-      out = model.predict([x1, x2])
-      self.assertEqual(out.shape, (2, 1))
-      self.assertAllClose(out, expected, atol=1e-4)
-
-      # test compute_output_shape
-      layer = keras.layers.Dot(axes=-1)
-      self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1))
+    i1 = keras.layers.Input(shape=(4,))
+    i2 = keras.layers.Input(shape=(4,))
+    o = keras.layers.dot([i1, i2], axes=1)
+    self.assertListEqual(o.get_shape().as_list(), [None, 1])
+    model = keras.models.Model([i1, i2], o)
+    _ = keras.layers.Dot(axes=1).get_config()
+
+    x1 = np.random.random((2, 4))
+    x2 = np.random.random((2, 4))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 1))
+    expected = np.zeros((2, 1))
+    expected[0, 0] = np.dot(x1[0], x2[0])
+    expected[1, 0] = np.dot(x1[1], x2[1])
+    self.assertAllClose(out, expected, atol=1e-4)
+
+    # Test with negative tuple of axes.
+    o = keras.layers.dot([i1, i2], axes=(-1, -1))
+    self.assertListEqual(o.get_shape().as_list(), [None, 1])
+    model = keras.models.Model([i1, i2], o)
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 1))
+    self.assertAllClose(out, expected, atol=1e-4)
+
+    # test compute_output_shape
+    layer = keras.layers.Dot(axes=-1)
+    self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1))
 
   def test_dot_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
@@ -208,6 +215,7 @@ class MergeLayersTest(test.TestCase):
       dot = keras.layers.Dot(1)
       dot.compute_output_shape(1)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_merge_subtract(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 9010f4961585af58b7eae43dcd224e0c39606239..addac5b137430d8f74efa126423cb39b15382502 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -22,7 +22,9 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -58,7 +60,7 @@ class GaussianNoise(Layer):
 
     def noised():
       return inputs + K.random_normal(
-          shape=K.shape(inputs), mean=0., stddev=self.stddev)
+          shape=array_ops.shape(inputs), mean=0., stddev=self.stddev)
 
     return K.in_train_phase(noised, inputs, training=training)
 
@@ -67,7 +69,7 @@ class GaussianNoise(Layer):
     base_config = super(GaussianNoise, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -104,7 +106,7 @@ class GaussianDropout(Layer):
       def noised():
         stddev = np.sqrt(self.rate / (1.0 - self.rate))
         return inputs * K.random_normal(
-            shape=K.shape(inputs), mean=1.0, stddev=stddev)
+            shape=array_ops.shape(inputs), mean=1.0, stddev=stddev)
 
       return K.in_train_phase(noised, inputs, training=training)
     return inputs
@@ -114,7 +116,7 @@ class GaussianDropout(Layer):
     base_config = super(GaussianDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -153,7 +155,7 @@ class AlphaDropout(Layer):
     self.supports_masking = True
 
   def _get_noise_shape(self, inputs):
-    return self.noise_shape if self.noise_shape else K.shape(inputs)
+    return self.noise_shape if self.noise_shape else array_ops.shape(inputs)
 
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
@@ -164,9 +166,9 @@ class AlphaDropout(Layer):
         scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
 
-        kept_idx = K.greater_equal(
+        kept_idx = math_ops.greater_equal(
             K.random_uniform(noise_shape, seed=seed), rate)
-        kept_idx = K.cast(kept_idx, K.floatx())
+        kept_idx = math_ops.cast(kept_idx, K.floatx())
 
         # Get affine transformation params
         a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
@@ -186,6 +188,6 @@ class AlphaDropout(Layer):
     base_config = super(AlphaDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise_test.py b/tensorflow/python/keras/_impl/keras/layers/noise_test.py
index f9b4d9cd090ffec1a5acd9118ea6a65798bd72a6..af4f031ec95bb56b72c1f1018e0e529d8ff55564 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -39,12 +40,12 @@ class NoiseLayersTest(test.TestCase):
           kwargs={'rate': 0.5},
           input_shape=(3, 2, 3))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_AlphaDropout(self):
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.AlphaDropout,
-          kwargs={'rate': 0.2},
-          input_shape=(3, 2, 3))
+    testing_utils.layer_test(
+        keras.layers.AlphaDropout,
+        kwargs={'rate': 0.2},
+        input_shape=(3, 2, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 0dedd5e8daa2974038c90ae2e8c68ca6516ba725..c16fc07fb4ecda66bd8bcc70dce5d753c73f5dd9 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -19,17 +19,28 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.layers import normalization as tf_normalization_layers
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.layers.BatchNormalization')
-class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
+class BatchNormalization(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -37,28 +48,63 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   close to 0 and the activation standard deviation close to 1.
 
   Arguments:
-      axis: Integer, the axis that should be normalized
-          (typically the features axis).
-          For instance, after a `Conv2D` layer with
-          `data_format="channels_first"`,
-          set `axis=1` in `BatchNormalization`.
-      momentum: Momentum for the moving average.
-      epsilon: Small float added to variance to avoid dividing by zero.
-      center: If True, add offset of `beta` to normalized tensor.
-          If False, `beta` is ignored.
-      scale: If True, multiply by `gamma`.
-          If False, `gamma` is not used.
-          When the next layer is linear (also e.g. `nn.relu`),
-          this can be disabled since the scaling
-          will be done by the next layer.
-      beta_initializer: Initializer for the beta weight.
-      gamma_initializer: Initializer for the gamma weight.
-      moving_mean_initializer: Initializer for the moving mean.
-      moving_variance_initializer: Initializer for the moving variance.
-      beta_regularizer: Optional regularizer for the beta weight.
-      gamma_regularizer: Optional regularizer for the gamma weight.
-      beta_constraint: Optional constraint for the beta weight.
-      gamma_constraint: Optional constraint for the gamma weight.
+    axis: Integer, the axis that should be normalized
+        (typically the features axis).
+        For instance, after a `Conv2D` layer with
+        `data_format="channels_first"`,
+        set `axis=1` in `BatchNormalization`.
+    momentum: Momentum for the moving average.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+        If False, `gamma` is not used.
+        When the next layer is linear (also e.g. `nn.relu`),
+        this can be disabled since the scaling
+        will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    moving_mean_initializer: Initializer for the moving mean.
+    moving_variance_initializer: Initializer for the moving variance.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_momentum: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `momentum` is still applied
+      to get the means and variances for inference.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
+      which means batch normalization is performed across the whole batch. When
+      `virtual_batch_size` is not `None`, instead perform "Ghost Batch
+      Normalization", which creates virtual sub-batches which are each
+      normalized separately (with shared gamma, beta, and moving statistics).
+      Must divide the actual batch size during execution.
+    adjustment: A function taking the `Tensor` containing the (dynamic) shape of
+      the input tensor and returning a pair (scale, bias) to apply to the
+      normalized values (before gamma and beta), only during training. For
+      example, if axis==-1,
+        `adjustment = lambda shape: (
+          tf.random_uniform(shape[-1:], 0.93, 1.07),
+          tf.random_uniform(shape[-1:], -0.1, 0.1))`
+      will scale the normalized value by up to 7% up or down, then shift the
+      result by up to 0.1 (with independent scaling and bias for each feature
+      but shared across all examples), and finally apply gamma and/or beta. If
+      `None`, no adjustment is applied. Cannot be specified if
+      virtual_batch_size is specified.
 
   Input shape:
       Arbitrary. Use the keyword argument `input_shape`
@@ -87,33 +133,508 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
                gamma_regularizer=None,
                beta_constraint=None,
                gamma_constraint=None,
+               renorm=False,
+               renorm_clipping=None,
+               renorm_momentum=0.99,
+               fused=None,
+               trainable=True,
+               virtual_batch_size=None,
+               adjustment=None,
+               name=None,
                **kwargs):
-    self.supports_masking = True
     super(BatchNormalization, self).__init__(
-        axis=axis,
-        momentum=momentum,
-        epsilon=epsilon,
-        center=center,
-        scale=scale,
-        beta_initializer=initializers.get(beta_initializer),
-        gamma_initializer=initializers.get(gamma_initializer),
-        moving_mean_initializer=initializers.get(moving_mean_initializer),
-        moving_variance_initializer=initializers.get(
-            moving_variance_initializer),
-        beta_regularizer=regularizers.get(beta_regularizer),
-        gamma_regularizer=regularizers.get(gamma_regularizer),
-        beta_constraint=constraints.get(beta_constraint),
-        gamma_constraint=constraints.get(gamma_constraint),
-        **kwargs
-    )
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(axis, list):
+      self.axis = axis[:]
+    else:
+      self.axis = axis
+    self.momentum = momentum
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.moving_mean_initializer = initializers.get(moving_mean_initializer)
+    self.moving_variance_initializer = initializers.get(
+        moving_variance_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+    self.renorm = renorm
+    self.virtual_batch_size = virtual_batch_size
+    self.adjustment = adjustment
+    if fused is None:
+      fused = True
+    self.supports_masking = True
+
+    self.fused = fused
+    self._bessels_correction_test_only = True
+
+    if renorm:
+      renorm_clipping = renorm_clipping or {}
+      keys = ['rmax', 'rmin', 'dmax']
+      if set(renorm_clipping) - set(keys):
+        raise ValueError('renorm_clipping %s contains keys not in %s' %
+                         (renorm_clipping, keys))
+      self.renorm_clipping = renorm_clipping
+      self.renorm_momentum = renorm_momentum
+
+  def _add_tower_local_variable(self, *args, **kwargs):
+    tower_context = distribute_lib.get_tower_context()
+    with tower_context.tower_local_var_scope('mean'):
+      return self.add_variable(*args, **kwargs)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if not input_shape.ndims:
+      raise ValueError('Input has undefined rank:', input_shape)
+    ndims = len(input_shape)
+
+    # Convert axis to list and resolve negatives
+    if isinstance(self.axis, int):
+      self.axis = [self.axis]
+
+    if not isinstance(self.axis, list):
+      raise TypeError('axis must be int or list, type given: %s'
+                      % type(self.axis))
+
+    for idx, x in enumerate(self.axis):
+      if x < 0:
+        self.axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.axis) != len(set(self.axis)):
+      raise ValueError('Duplicate axis: %s' % self.axis)
+
+    if self.virtual_batch_size is not None:
+      if self.virtual_batch_size <= 0:
+        raise ValueError('virtual_batch_size must be a positive integer that '
+                         'divides the true batch size of the input Tensor')
+      # If using virtual batches, the first dimension must be the batch
+      # dimension and cannot be the batch norm axis
+      if 0 in self.axis:
+        raise ValueError('When using virtual_batch_size, the batch dimension '
+                         'must be 0 and thus axis cannot include 0')
+      if self.adjustment is not None:
+        raise ValueError('When using virtual_batch_size, adjustment cannot '
+                         'be specified')
+
+    if self.fused:
+      # Currently fused batch norm doesn't support renorm. It also only supports
+      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
+      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
+      # output back to its original shape accordingly.
+      self.fused = (not self.renorm and
+                    ndims == 4 and
+                    self.axis in [[1], [3]] and
+                    self.virtual_batch_size is None and
+                    self.adjustment is None)
+      # TODO(chrisying): fused batch norm is currently not supported for
+      # multi-axis batch norm and by extension virtual batches. In some cases,
+      # it might be possible to use fused batch norm but would require reshaping
+      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
+      # particularly tricky. A compromise might be to just support the most
+      # common use case (turning 5D w/ virtual batch to NCHW)
+
+    if self.fused:
+      if self.axis == [1]:
+        self._data_format = 'NCHW'
+      elif self.axis == [3]:
+        self._data_format = 'NHWC'
+      else:
+        raise ValueError('Unsupported axis, fused batch norm only supports '
+                         'axis == [1] or axis == [3]')
+
+    # Raise parameters of fp16 batch norm to fp32
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
+      param_dtype = dtypes.float32
+    else:
+      param_dtype = self.dtype or dtypes.float32
+
+    axis_to_dim = {x: input_shape[x].value for x in self.axis}
+    for x in axis_to_dim:
+      if axis_to_dim[x] is None:
+        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
+                         input_shape)
+    self.input_spec = InputSpec(ndim=ndims, axes=axis_to_dim)
+
+    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
+      # Single axis batch norm (most common/default use-case)
+      param_shape = (list(axis_to_dim.values())[0],)
+    else:
+      # Parameter shape is the original shape but with 1 in all non-axis dims
+      param_shape = [axis_to_dim[i] if i in axis_to_dim
+                     else 1 for i in range(ndims)]
+      if self.virtual_batch_size is not None:
+        # When using virtual batches, add an extra dim at index 1
+        param_shape.insert(1, 1)
+        for idx, x in enumerate(self.axis):
+          self.axis[idx] = x + 1      # Account for added dimension
+
+    if self.scale:
+      self.gamma = self.add_variable(
+          name='gamma',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
+    else:
+      self.gamma = None
+      if self.fused:
+        self._gamma_const = array_ops.constant(
+            1.0, dtype=param_dtype, shape=param_shape)
+
+    if self.center:
+      self.beta = self.add_variable(
+          name='beta',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
+    else:
+      self.beta = None
+      if self.fused:
+        self._beta_const = array_ops.constant(
+            0.0, dtype=param_dtype, shape=param_shape)
+
+    try:
+      # Disable variable partitioning when creating the moving mean and variance
+      if hasattr(self, '_scope') and self._scope:
+        partitioner = self._scope.partitioner
+        self._scope.set_partitioner(None)
+      else:
+        partitioner = None
+      self.moving_mean = self._add_tower_local_variable(
+          name='moving_mean',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_mean_initializer,
+          trainable=False)
+
+      self.moving_variance = self._add_tower_local_variable(
+          name='moving_variance',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_variance_initializer,
+          trainable=False)
+
+      if self.renorm:
+        # Create variables to maintain the moving mean and standard deviation.
+        # These are used in training and thus are different from the moving
+        # averages above. The renorm variables are colocated with moving_mean
+        # and moving_variance.
+        # NOTE: below, the outer `with device` block causes the current device
+        # stack to be cleared. The nested ones use a `lambda` to set the desired
+        # device and ignore any devices that may be set by the custom getter.
+        def _renorm_variable(name, shape):
+          var = self._add_tower_local_variable(
+              name=name,
+              shape=shape,
+              dtype=param_dtype,
+              initializer=init_ops.zeros_initializer(),
+              trainable=False)
+          return var
+
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_mean):
+          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
+          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
+        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
+        # renorm_stddev_weight. This allows us to (1) mix the average
+        # stddev with the minibatch stddev early in training, and (2) compute
+        # the unbiased average stddev by dividing renorm_stddev by the weight.
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_variance):
+          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
+          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
+                                                       ())
+    finally:
+      if partitioner:
+        self._scope.set_partitioner(partitioner)
+    self.built = True
+
+  def _assign_moving_average(self, variable, value, momentum):
+    with ops.name_scope(None, 'AssignMovingAvg',
+                        [variable, value, momentum]) as scope:
+      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
+      if decay.dtype != variable.dtype.base_dtype:
+        decay = math_ops.cast(decay, variable.dtype.base_dtype)
+      update_delta = (variable - value) * decay
+      return state_ops.assign_sub(variable, update_delta, name=scope)
+
+  def _fused_batch_norm(self, inputs, training):
+    """Returns the output of fused batch norm."""
+    beta = self.beta if self.center else self._beta_const
+    gamma = self.gamma if self.scale else self._gamma_const
+
+    def _fused_batch_norm_training():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          epsilon=self.epsilon,
+          data_format=self._data_format)
+
+    def _fused_batch_norm_inference():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          mean=self.moving_mean,
+          variance=self.moving_variance,
+          epsilon=self.epsilon,
+          is_training=False,
+          data_format=self._data_format)
+
+    output, mean, variance = tf_utils.smart_cond(
+        training, _fused_batch_norm_training, _fused_batch_norm_inference)
+    if not self._bessels_correction_test_only:
+      # Remove Bessel's correction to be consistent with non-fused batch norm.
+      # Note that the variance computed by fused batch norm is
+      # with Bessel's correction.
+      sample_size = math_ops.cast(
+          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
+      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
+      variance *= factor
+
+    training_value = tf_utils.constant_value(training)
+    if training_value is None:
+      momentum = tf_utils.smart_cond(training,
+                                     lambda: self.momentum,
+                                     lambda: 1.0)
+    else:
+      momentum = ops.convert_to_tensor(self.momentum)
+    if training_value or training_value is None:
+      mean_update = self._assign_moving_average(self.moving_mean, mean,
+                                                momentum)
+      variance_update = self._assign_moving_average(self.moving_variance,
+                                                    variance, momentum)
+      self.add_update(mean_update, inputs=True)
+      self.add_update(variance_update, inputs=True)
+
+    return output
+
+  def _renorm_correction_and_moments(self, mean, variance, training):
+    """Returns the correction and update values for renorm."""
+    stddev = math_ops.sqrt(variance + self.epsilon)
+    # Compute the average mean and standard deviation, as if they were
+    # initialized with this batch's moments.
+    mixed_renorm_mean = (self.renorm_mean +
+                         (1. - self.renorm_mean_weight) * mean)
+    mixed_renorm_stddev = (self.renorm_stddev +
+                           (1. - self.renorm_stddev_weight) * stddev)
+    # Compute the corrections for batch renorm.
+    r = stddev / mixed_renorm_stddev
+    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
+    # Ensure the corrections use pre-update moving averages.
+    with ops.control_dependencies([r, d]):
+      mean = array_ops.identity(mean)
+      stddev = array_ops.identity(stddev)
+    rmin, rmax, dmax = [self.renorm_clipping.get(key)
+                        for key in ['rmin', 'rmax', 'dmax']]
+    if rmin is not None:
+      r = math_ops.maximum(r, rmin)
+    if rmax is not None:
+      r = math_ops.minimum(r, rmax)
+    if dmax is not None:
+      d = math_ops.maximum(d, -dmax)
+      d = math_ops.minimum(d, dmax)
+    # When not training, use r=1, d=0.
+    r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
+    d = tf_utils.smart_cond(training,
+                            lambda: d,
+                            lambda: array_ops.zeros_like(d))
+
+    def _update_renorm_variable(var, weight, value):
+      """Updates a moving average and weight, returns the unbiased value."""
+      value = array_ops.identity(value)
+      def _do_update():
+        """Updates the var and weight, returns their updated ratio."""
+        # Update the variables without zero debiasing. The debiasing will be
+        # accomplished by dividing the exponential moving average by the weight.
+        # For example, after a single update, the moving average would be
+        # (1-decay) * value. and the weight will be 1-decay, with their ratio
+        # giving the value.
+        # Make sure the weight is not updated until before r and d computation.
+        with ops.control_dependencies([value]):
+          weight_value = array_ops.constant(1., dtype=weight.dtype)
+        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
+        new_weight = self._assign_moving_average(weight, weight_value,
+                                                 self.renorm_momentum)
+        # TODO(yuefengz): the updates to var and weighted can not be batched
+        # together if we fetch their updated values here. Consider calculating
+        # new values and delaying the updates.
+        return new_var / new_weight
+
+      def _fake_update():
+        return array_ops.identity(var)
+      return tf_utils.smart_cond(training, _do_update, _fake_update)
+
+    # TODO(yuefengz): colocate the operations
+    new_mean = _update_renorm_variable(self.renorm_mean,
+                                       self.renorm_mean_weight, mean)
+    new_stddev = _update_renorm_variable(self.renorm_stddev,
+                                         self.renorm_stddev_weight, stddev)
+    # Make sqrt(moving_variance + epsilon) = new_stddev.
+    new_variance = math_ops.square(new_stddev) - self.epsilon
+
+    return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=None):
+    original_training_value = training
     if training is None:
       training = K.learning_phase()
-    output = super(BatchNormalization, self).call(inputs, training=training)
-    if context.in_graph_mode() and training is K.learning_phase():
-      output._uses_learning_phase = True  # pylint: disable=protected-access
-    return output
+
+    in_eager_mode = context.executing_eagerly()
+    if self.virtual_batch_size is not None:
+      # Virtual batches (aka ghost batches) can be simulated by reshaping the
+      # Tensor and reusing the existing batch norm implementation
+      original_shape = [-1] + inputs.shape.as_list()[1:]
+      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
+
+      # Will cause errors if virtual_batch_size does not divide the batch size
+      inputs = array_ops.reshape(inputs, expanded_shape)
+
+      def undo_virtual_batching(outputs):
+        outputs = array_ops.reshape(outputs, original_shape)
+        return outputs
+
+    if self.fused:
+      outputs = self._fused_batch_norm(inputs, training=training)
+      if self.virtual_batch_size is not None:
+        # Currently never reaches here since fused_batch_norm does not support
+        # virtual batching
+        outputs = undo_virtual_batching(outputs)
+      if not context.executing_eagerly() and original_training_value is None:
+        outputs._uses_learning_phase = True  # pylint: disable=protected-access
+      return outputs
+
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+    reduction_axes = [i for i in range(ndims) if i not in self.axis]
+    if self.virtual_batch_size is not None:
+      del reduction_axes[1]     # Do not reduce along virtual batch dim
+
+    # Broadcasting only necessary for single-axis batch norm where the axis is
+    # not the last dimension
+    broadcast_shape = [1] * ndims
+    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          reduction_axes != list(range(ndims - 1))):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
+
+    def _compose_transforms(scale, offset, then_scale, then_offset):
+      if then_scale is not None:
+        scale *= then_scale
+        offset *= then_scale
+      if then_offset is not None:
+        offset += then_offset
+      return (scale, offset)
+
+    # Determine a boolean value for `training`: could be True, False, or None.
+    training_value = tf_utils.constant_value(training)
+    if training_value is not False:
+      if self.adjustment:
+        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
+        # Adjust only during training.
+        adj_scale = tf_utils.smart_cond(training,
+                                        lambda: adj_scale,
+                                        lambda: array_ops.ones_like(adj_scale))
+        adj_bias = tf_utils.smart_cond(training,
+                                       lambda: adj_bias,
+                                       lambda: array_ops.zeros_like(adj_bias))
+        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
+
+      # Some of the computations here are not necessary when training==False
+      # but not a constant. However, this makes the code simpler.
+      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
+      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+
+      moving_mean = self.moving_mean
+      moving_variance = self.moving_variance
+
+      mean = tf_utils.smart_cond(training,
+                                 lambda: mean,
+                                 lambda: moving_mean)
+      variance = tf_utils.smart_cond(training,
+                                     lambda: variance,
+                                     lambda: moving_variance)
+
+      if self.renorm:
+        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
+            mean, variance, training)
+        # When training, the normalized values (say, x) will be transformed as
+        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
+        # = x * (r * gamma) + (d * gamma + beta) with renorm.
+        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
+        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
+        scale, offset = _compose_transforms(r, d, scale, offset)
+      else:
+        new_mean, new_variance = mean, variance
+
+      if self.virtual_batch_size is not None:
+        # This isn't strictly correct since in ghost batch norm, you are
+        # supposed to sequentially update the moving_mean and moving_variance
+        # with each sub-batch. However, since the moving statistics are only
+        # used during evaluation, it is more efficient to just update in one
+        # step and should not make a significant difference in the result.
+        new_mean = math_ops.reduce_mean(new_mean,
+                                        axis=1, keepdims=True)
+        new_variance = math_ops.reduce_mean(new_variance,
+                                            axis=1, keepdims=True)
+
+      def _do_update(var, value):
+        if in_eager_mode and not self.trainable:
+          return
+
+        return self._assign_moving_average(var, value, self.momentum)
+
+      mean_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_mean, new_mean),
+          lambda: self.moving_mean)
+      variance_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_variance, new_variance),
+          lambda: self.moving_variance)
+      if not context.executing_eagerly():
+        self.add_update(mean_update, inputs=True)
+        self.add_update(variance_update, inputs=True)
+
+    else:
+      mean, variance = self.moving_mean, self.moving_variance
+
+    outputs = nn.batch_normalization(inputs,
+                                     _broadcast(mean),
+                                     _broadcast(variance),
+                                     offset,
+                                     scale,
+                                     self.epsilon)
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
+
+    if self.virtual_batch_size is not None:
+      outputs = undo_virtual_batching(outputs)
+    if not context.executing_eagerly() and original_training_value is None:
+      outputs._uses_learning_phase = True  # pylint: disable=protected-access
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
   def get_config(self):
     config = {
@@ -133,5 +654,19 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
         'beta_constraint': constraints.serialize(self.beta_constraint),
         'gamma_constraint': constraints.serialize(self.gamma_constraint)
     }
+    # Only add TensorFlow-specific parameters if they are set, so as to preserve
+    # model compatibility with external Keras.
+    if self.renorm:
+      config['renorm'] = True
+      config['renorm_clipping'] = self.renorm_clipping
+      config['renorm_momentum'] = self.renorm_momentum
+    if self.virtual_batch_size is not None:
+      config['virtual_batch_size'] = self.virtual_batch_size
+    # Note: adjustment is not serializable.
+    if self.adjustment is not None:
+      logging.warning('The `adjustment` function of this `BatchNormalization` '
+                      'layer cannot be serialized and has been omitted from '
+                      'the layer config. It will not be included when '
+                      're-creating the layer from the saved config.')
     base_config = super(BatchNormalization, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
index 2b3628c3f1023612297465bdf3286246261992a2..fa9277e3d1e5bb0b9633abc46a96a11816dddb2d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
@@ -114,6 +114,26 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
+  def test_batchnorm_convnet_channel_last(self):
+    with self.test_session():
+      # keras.backend.set_learning_phase(True)
+
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(
+          axis=-1, input_shape=(4, 4, 3), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+      np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+      np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
     """
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index 15d53379769d8142f5b2755a07479f60751346d2..86bc8a680a529a9ea17592a42207fab58adeebce 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -19,16 +19,98 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import pooling as tf_pooling_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Pooling1D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 1D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of a single integer,
+      representing the size of the pooling window.
+    strides: An integer or tuple/list of a single integer, specifying the
+      strides of the pooling operation.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling1D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
+    if strides is None:
+      strides = pool_size
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=3)
+
+  def call(self, inputs):
+    # There is no TF op for 1D pooling, hence we make the inputs 4D.
+    if self.data_format == 'channels_last':
+      # input is NWC, make it NHWC
+      inputs = array_ops.expand_dims(inputs, 1)
+      # pool on the W dim
+      pool_shape = (1, 1) + self.pool_size + (1,)
+      strides = (1, 1) + self.strides + (1,)
+      data_format = 'NHWC'
+    else:
+      # input is NCW, make it NCHW
+      inputs = array_ops.expand_dims(inputs, 2)
+      # pool on the W dim
+      pool_shape = (1, 1, 1) + self.pool_size
+      strides = (1, 1, 1) + self.strides
+      data_format = 'NCHW'
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=data_format)
+
+    if self.data_format == 'channels_last':
+      return array_ops.squeeze(outputs, 1)
+    else:
+      return array_ops.squeeze(outputs, 2)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
+                                           self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def get_config(self):
+    config = {
+        'strides': self.strides,
+        'pool_size': self.pool_size,
+        'padding': self.padding
+    }
+    base_config = super(Pooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
-class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
+class MaxPooling1D(Pooling1D):
   """Max pooling operation for temporal data.
 
   Arguments:
@@ -45,23 +127,20 @@ class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling1D, self).__init__(pool_size, strides, padding, **kwargs)
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
 
-  def get_config(self):
-    config = {
-        'strides': self.strides,
-        'pool_size': self.pool_size,
-        'padding': self.padding
-    }
-    base_config = super(MaxPooling1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling1D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
-class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
+class AveragePooling1D(Pooling1D):
   """Average pooling for temporal data.
 
   Arguments:
@@ -78,24 +157,104 @@ class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
+    super(AveragePooling1D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
+
+
+class Pooling2D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling2D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
-                                           **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=4)
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      pool_shape = (1,) + self.pool_size + (1,)
+      strides = (1,) + self.strides + (1,)
+    else:
+      pool_shape = (1, 1) + self.pool_size
+      strides = (1, 1) + self.strides
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, 4))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
 
   def get_config(self):
     config = {
-        'strides': self.strides,
         'pool_size': self.pool_size,
-        'padding': self.padding
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
     }
-    base_config = super(AveragePooling1D, self).get_config()
+    base_config = super(Pooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
-class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
+class MaxPooling2D(Pooling2D):
   """Max pooling operation for spatial data.
 
   Arguments:
@@ -142,26 +301,14 @@ class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling2D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling2D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
-class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
+class AveragePooling2D(Pooling2D):
   """Average pooling operation for spatial data.
 
   Arguments:
@@ -208,12 +355,96 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
+    super(AveragePooling2D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
+
+
+class Pooling3D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 3D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 3 integers:
+      (pool_depth, pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)`
+      while `channels_first` corresponds to
+      inputs with shape `(batch, channels, depth, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format='channels_last',
+               name=None, **kwargs):
+    super(Pooling3D, self).__init__(name=name, **kwargs)
     if data_format is None:
-      data_format = K.image_data_format()
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 3, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=5)
+
+  def call(self, inputs):
+    pool_shape = (1,) + self.pool_size + (1,)
+    strides = (1,) + self.strides + (1,)
+
+    if self.data_format == 'channels_first':
+      # TF does not support `channels_first` with 3D pooling operations,
+      # so we must handle this case manually.
+      # TODO(fchollet): remove this when TF pooling is feature-complete.
+      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper())
+
+    if self.data_format == 'channels_first':
+      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      len_dim1 = input_shape[2]
+      len_dim2 = input_shape[3]
+      len_dim3 = input_shape[4]
+    else:
+      len_dim1 = input_shape[1]
+      len_dim2 = input_shape[2]
+      len_dim3 = input_shape[3]
+    len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0],
+                                             self.padding, self.strides[0])
+    len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1],
+                                             self.padding, self.strides[1])
+    len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2],
+                                             self.padding, self.strides[2])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
 
   def get_config(self):
     config = {
@@ -222,12 +453,12 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
         'strides': self.strides,
         'data_format': self.data_format
     }
-    base_config = super(AveragePooling2D, self).get_config()
+    base_config = super(Pooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
-class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
+class MaxPooling3D(Pooling3D):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -270,26 +501,14 @@ class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling3D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling3D, self).__init__(
+        nn.max_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
-class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
+class AveragePooling3D(Pooling3D):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -332,30 +551,18 @@ class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(AveragePooling3D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(AveragePooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(AveragePooling3D, self).__init__(
+        nn.avg_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
-class _GlobalPooling1D(Layer):
+class GlobalPooling1D(Layer):
   """Abstract class for different global pooling 1D layers.
   """
 
   def __init__(self, **kwargs):
-    super(_GlobalPooling1D, self).__init__(**kwargs)
+    super(GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
 
   def compute_output_shape(self, input_shape):
@@ -368,7 +575,7 @@ class _GlobalPooling1D(Layer):
 
 @tf_export('keras.layers.GlobalAveragePooling1D',
            'keras.layers.GlobalAvgPool1D')
-class GlobalAveragePooling1D(_GlobalPooling1D):
+class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
   Input shape:
@@ -380,11 +587,11 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.mean(inputs, axis=1)
+    return backend.mean(inputs, axis=1)
 
 
 @tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
-class GlobalMaxPooling1D(_GlobalPooling1D):
+class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
   Input shape:
@@ -396,15 +603,15 @@ class GlobalMaxPooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.max(inputs, axis=1)
+    return backend.max(inputs, axis=1)
 
 
-class _GlobalPooling2D(Layer):
+class GlobalPooling2D(Layer):
   """Abstract class for different global pooling 2D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling2D, self).__init__(**kwargs)
+    super(GlobalPooling2D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
@@ -420,13 +627,13 @@ class _GlobalPooling2D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling2D, self).get_config()
+    base_config = super(GlobalPooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling2D',
            'keras.layers.GlobalAvgPool2D')
-class GlobalAveragePooling2D(_GlobalPooling2D):
+class GlobalAveragePooling2D(GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
   Arguments:
@@ -456,13 +663,13 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2])
+      return backend.mean(inputs, axis=[1, 2])
     else:
-      return K.mean(inputs, axis=[2, 3])
+      return backend.mean(inputs, axis=[2, 3])
 
 
 @tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
-class GlobalMaxPooling2D(_GlobalPooling2D):
+class GlobalMaxPooling2D(GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
   Arguments:
@@ -492,17 +699,17 @@ class GlobalMaxPooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2])
+      return backend.max(inputs, axis=[1, 2])
     else:
-      return K.max(inputs, axis=[2, 3])
+      return backend.max(inputs, axis=[2, 3])
 
 
-class _GlobalPooling3D(Layer):
+class GlobalPooling3D(Layer):
   """Abstract class for different global pooling 3D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling3D, self).__init__(**kwargs)
+    super(GlobalPooling3D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
@@ -518,13 +725,13 @@ class _GlobalPooling3D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling3D, self).get_config()
+    base_config = super(GlobalPooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling3D',
            'keras.layers.GlobalAvgPool3D')
-class GlobalAveragePooling3D(_GlobalPooling3D):
+class GlobalAveragePooling3D(GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
   Arguments:
@@ -554,13 +761,13 @@ class GlobalAveragePooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2, 3])
+      return backend.mean(inputs, axis=[1, 2, 3])
     else:
-      return K.mean(inputs, axis=[2, 3, 4])
+      return backend.mean(inputs, axis=[2, 3, 4])
 
 
 @tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
-class GlobalMaxPooling3D(_GlobalPooling3D):
+class GlobalMaxPooling3D(GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
   Arguments:
@@ -590,9 +797,9 @@ class GlobalMaxPooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2, 3])
+      return backend.max(inputs, axis=[1, 2, 3])
     else:
-      return K.max(inputs, axis=[2, 3, 4])
+      return backend.max(inputs, axis=[2, 3, 4])
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
index ec0a5ae560f49ee39ecffb64f4ac65d3e800024c..2c08b647ea0fafb7519240b0c81e8fa77f034f7f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -25,81 +27,85 @@ from tensorflow.python.platform import test
 
 class GlobalPoolingTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_1d(self):
-    with self.test_session(use_gpu=True):
-      testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
-                               input_shape=(3, 4, 5))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
+    testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
+                             input_shape=(3, 4, 5))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_2d(self):
-    with self.test_session(use_gpu=True):
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalMaxPooling2D,
-          kwargs={'data_format': 'channels_first'},
-          input_shape=(3, 4, 5, 6))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalMaxPooling2D,
-          kwargs={'data_format': 'channels_last'},
-          input_shape=(3, 5, 6, 4))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalAveragePooling2D,
-          kwargs={'data_format': 'channels_first'},
-          input_shape=(3, 4, 5, 6))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalAveragePooling2D,
-          kwargs={'data_format': 'channels_last'},
-          input_shape=(3, 5, 6, 4))
-
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalMaxPooling2D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 4, 5, 6))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalMaxPooling2D,
+        kwargs={'data_format': 'channels_last'},
+        input_shape=(3, 5, 6, 4))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalAveragePooling2D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 4, 5, 6))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalAveragePooling2D,
+        kwargs={'data_format': 'channels_last'},
+        input_shape=(3, 5, 6, 4))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_3d(self):
-    with self.test_session(use_gpu=True):
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalMaxPooling3D,
-          kwargs={'data_format': 'channels_first'},
-          input_shape=(3, 4, 3, 4, 3))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalMaxPooling3D,
-          kwargs={'data_format': 'channels_last'},
-          input_shape=(3, 4, 3, 4, 3))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalAveragePooling3D,
-          kwargs={'data_format': 'channels_first'},
-          input_shape=(3, 4, 3, 4, 3))
-      testing_utils.layer_test(
-          keras.layers.pooling.GlobalAveragePooling3D,
-          kwargs={'data_format': 'channels_last'},
-          input_shape=(3, 4, 3, 4, 3))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalMaxPooling3D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 4, 3, 4, 3))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalMaxPooling3D,
+        kwargs={'data_format': 'channels_last'},
+        input_shape=(3, 4, 3, 4, 3))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalAveragePooling3D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 4, 3, 4, 3))
+    testing_utils.layer_test(
+        keras.layers.pooling.GlobalAveragePooling3D,
+        kwargs={'data_format': 'channels_last'},
+        input_shape=(3, 4, 3, 4, 3))
 
 
 class Pooling2DTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_2d(self):
     pool_size = (3, 3)
-    with self.test_session(use_gpu=True):
-      for strides in [(1, 1), (2, 2)]:
-        testing_utils.layer_test(
-            keras.layers.MaxPooling2D,
-            kwargs={
-                'strides': strides,
-                'padding': 'valid',
-                'pool_size': pool_size
-            },
-            input_shape=(3, 5, 6, 4))
-
-  def test_averagepooling_2d(self):
-    with self.test_session(use_gpu=True):
+    for strides in [(1, 1), (2, 2)]:
       testing_utils.layer_test(
-          keras.layers.AveragePooling2D,
-          kwargs={'strides': (2, 2),
-                  'padding': 'same',
-                  'pool_size': (2, 2)},
-          input_shape=(3, 5, 6, 4))
-      testing_utils.layer_test(
-          keras.layers.AveragePooling2D,
-          kwargs={'strides': (2, 2),
-                  'padding': 'valid',
-                  'pool_size': (3, 3)},
+          keras.layers.MaxPooling2D,
+          kwargs={
+              'strides': strides,
+              'padding': 'valid',
+              'pool_size': pool_size
+          },
           input_shape=(3, 5, 6, 4))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_averagepooling_2d(self):
+    testing_utils.layer_test(
+        keras.layers.AveragePooling2D,
+        kwargs={'strides': (2, 2),
+                'padding': 'same',
+                'pool_size': (2, 2)},
+        input_shape=(3, 5, 6, 4))
+    testing_utils.layer_test(
+        keras.layers.AveragePooling2D,
+        kwargs={'strides': (2, 2),
+                'padding': 'valid',
+                'pool_size': (3, 3)},
+        input_shape=(3, 5, 6, 4))
+
+    # This part of the test can only run on GPU but doesn't appear
+    # to be properly assigned to a GPU when running in eager mode.
+    if not context.executing_eagerly():
       # Only runs on GPU with CUDA, channels_first is not supported on CPU.
       # TODO(b/62340061): Support channels_first on CPU.
       if test.is_gpu_available(cuda_only=True):
@@ -116,66 +122,66 @@ class Pooling2DTest(test.TestCase):
 
 class Pooling3DTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_3d(self):
     pool_size = (3, 3, 3)
-    with self.test_session(use_gpu=True):
-      testing_utils.layer_test(
-          keras.layers.MaxPooling3D,
-          kwargs={'strides': 2,
-                  'padding': 'valid',
-                  'pool_size': pool_size},
-          input_shape=(3, 11, 12, 10, 4))
-      testing_utils.layer_test(
-          keras.layers.MaxPooling3D,
-          kwargs={
-              'strides': 3,
-              'padding': 'valid',
-              'data_format': 'channels_first',
-              'pool_size': pool_size
-          },
-          input_shape=(3, 4, 11, 12, 10))
-
+    testing_utils.layer_test(
+        keras.layers.MaxPooling3D,
+        kwargs={'strides': 2,
+                'padding': 'valid',
+                'pool_size': pool_size},
+        input_shape=(3, 11, 12, 10, 4))
+    testing_utils.layer_test(
+        keras.layers.MaxPooling3D,
+        kwargs={
+            'strides': 3,
+            'padding': 'valid',
+            'data_format': 'channels_first',
+            'pool_size': pool_size
+        },
+        input_shape=(3, 4, 11, 12, 10))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_3d(self):
     pool_size = (3, 3, 3)
-    with self.test_session(use_gpu=True):
-      testing_utils.layer_test(
-          keras.layers.AveragePooling3D,
-          kwargs={'strides': 2,
-                  'padding': 'valid',
-                  'pool_size': pool_size},
-          input_shape=(3, 11, 12, 10, 4))
-      testing_utils.layer_test(
-          keras.layers.AveragePooling3D,
-          kwargs={
-              'strides': 3,
-              'padding': 'valid',
-              'data_format': 'channels_first',
-              'pool_size': pool_size
-          },
-          input_shape=(3, 4, 11, 12, 10))
+    testing_utils.layer_test(
+        keras.layers.AveragePooling3D,
+        kwargs={'strides': 2,
+                'padding': 'valid',
+                'pool_size': pool_size},
+        input_shape=(3, 11, 12, 10, 4))
+    testing_utils.layer_test(
+        keras.layers.AveragePooling3D,
+        kwargs={
+            'strides': 3,
+            'padding': 'valid',
+            'data_format': 'channels_first',
+            'pool_size': pool_size
+        },
+        input_shape=(3, 4, 11, 12, 10))
 
 
 class Pooling1DTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_1d(self):
-    with self.test_session(use_gpu=True):
-      for padding in ['valid', 'same']:
-        for stride in [1, 2]:
-          testing_utils.layer_test(
-              keras.layers.MaxPooling1D,
-              kwargs={'strides': stride,
-                      'padding': padding},
-              input_shape=(3, 5, 4))
+    for padding in ['valid', 'same']:
+      for stride in [1, 2]:
+        testing_utils.layer_test(
+            keras.layers.MaxPooling1D,
+            kwargs={'strides': stride,
+                    'padding': padding},
+            input_shape=(3, 5, 4))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_1d(self):
-    with self.test_session(use_gpu=True):
-      for padding in ['valid', 'same']:
-        for stride in [1, 2]:
-          testing_utils.layer_test(
-              keras.layers.AveragePooling1D,
-              kwargs={'strides': stride,
-                      'padding': padding},
-              input_shape=(3, 5, 4))
+    for padding in ['valid', 'same']:
+      for stride in [1, 2]:
+        testing_utils.layer_test(
+            keras.layers.AveragePooling1D,
+            kwargs={'strides': stride,
+                    'padding': padding},
+            input_shape=(3, 5, 4))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 2e9003f52d7617e96950f76637759e577a8b5e4f..caf9e6f46f51c77bcd37953821c54d4f3145fb5b 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import numbers
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -30,8 +31,11 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -103,7 +107,7 @@ class StackedRNNCells(Layer):
     # Call the cells in order and store the returned states.
     new_nested_states = []
     for cell, states in zip(self.cells, nested_states):
-      if has_arg(cell.call, 'constants'):
+      if generic_utils.has_arg(cell.call, 'constants'):
         inputs, states = cell.call(inputs, states, constants=constants,
                                    **kwargs)
       else:
@@ -118,14 +122,14 @@ class StackedRNNCells(Layer):
       states += cell_states
     return inputs, states
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     if isinstance(input_shape, list):
       constants_shape = input_shape[1:]
       input_shape = input_shape[0]
     for cell in self.cells:
       if isinstance(cell, Layer):
-        if has_arg(cell.call, 'constants'):
+        if generic_utils.has_arg(cell.call, 'constants'):
           cell.build([input_shape] + constants_shape)
         else:
           cell.build(input_shape)
@@ -247,7 +251,7 @@ class RNN(Layer):
           It is also possible for `cell` to be a list of RNN cell instances,
           in which cases the cells get stacked on after the other in the RNN,
           implementing an efficient stacked RNN.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -425,7 +429,7 @@ class RNN(Layer):
   def states(self, states):
     self._states = states
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
@@ -457,7 +461,7 @@ class RNN(Layer):
     else:
       return output_mask
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
@@ -499,12 +503,16 @@ class RNN(Layer):
       self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
     if self.stateful:
       self.reset_states()
+    self.built = True
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     if hasattr(self.cell.state_size, '__len__'):
       return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
     else:
@@ -545,8 +553,8 @@ class RNN(Layer):
         raise ValueError('The initial state or constants of an RNN'
                          ' layer cannot be specified with a mix of'
                          ' Keras tensors and non-Keras tensors'
-                         '(a "Keras tensor" is a tensor that was'
-                         'returned by a Keras layer, or by `Input`)')
+                         ' (a "Keras tensor" is a tensor that was'
+                         ' returned by a Keras layer, or by `Input`)')
 
     if is_keras_tensor:
       # Compute the full input spec, including state and constants
@@ -602,11 +610,11 @@ class RNN(Layer):
                        'or `batch_shape` argument to your Input layer.')
 
     kwargs = {}
-    if has_arg(self.cell.call, 'training'):
+    if generic_utils.has_arg(self.cell.call, 'training'):
       kwargs['training'] = training
 
     if constants:
-      if not has_arg(self.cell.call, 'constants'):
+      if not generic_utils.has_arg(self.cell.call, 'constants'):
         raise ValueError('RNN cell does not support constants')
 
       def step(inputs, states):
@@ -630,7 +638,7 @@ class RNN(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     if self.return_sequences:
@@ -790,10 +798,10 @@ class RNN(Layer):
 
   @property
   def losses(self):
-    losses = []
+    layer_losses = super(RNN, self).losses
     if isinstance(self.cell, Layer):
-      losses += self.cell.losses
-    return losses + self._losses
+      return self.cell.losses + layer_losses
+    return layer_losses
 
   @property
   def updates(self):
@@ -877,7 +885,7 @@ class SimpleRNNCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     self.kernel = self.add_weight(
         shape=(input_shape[-1], self.units),
@@ -906,8 +914,7 @@ class SimpleRNNCell(Layer):
     prev_output = states[0]
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training)
     if (0 < self.recurrent_dropout < 1 and
@@ -935,7 +942,9 @@ class SimpleRNNCell(Layer):
 
     # Properly set learning phase on output tensor.
     if 0 < self.dropout + self.recurrent_dropout:
-      if training is None:
+      if training is None and not context.executing_eagerly():
+        # This would be harmless to set in eager mode, but eager tensors
+        # disallow setting arbitrary attributes.
         output._uses_learning_phase = True
     return output, [output]
 
@@ -1009,7 +1018,7 @@ class SimpleRNN(RNN):
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1229,6 +1238,9 @@ class GRUCell(Layer):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
   """
 
   def __init__(self,
@@ -1248,6 +1260,7 @@ class GRUCell(Layer):
                dropout=0.,
                recurrent_dropout=0.,
                implementation=1,
+               reset_after=False,
                **kwargs):
     super(GRUCell, self).__init__(**kwargs)
     self.units = units
@@ -1270,11 +1283,12 @@ class GRUCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
+    self.reset_after = reset_after
     self.state_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1291,31 +1305,27 @@ class GRUCell(Layer):
         constraint=self.recurrent_constraint)
 
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.units * 3,),
-          name='bias',
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-
-    self.kernel_z = self.kernel[:, :self.units]
-    self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
-    self.kernel_r = self.kernel[:, self.units:self.units * 2]
-    self.recurrent_kernel_r = self.recurrent_kernel[:, self.units:
-                                                    self.units * 2]
-    self.kernel_h = self.kernel[:, self.units * 2:]
-    self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]
+      if not self.reset_after:
+        bias_shape = (3 * self.units,)
+      else:
+        # separate biases for input and recurrent kernels
+        # Note: the shape is intentionally different from CuDNNGRU biases
+        # `(2 * 3 * self.units,)`, so that we can distinguish the classes
+        # when loading and converting saved weights.
+        bias_shape = (2, 3 * self.units)
+      self.bias = self.add_weight(shape=bias_shape,
+                                  name='bias',
+                                  initializer=self.bias_initializer,
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+      if not self.reset_after:
+        self.input_bias, self.recurrent_bias = self.bias, None
+      else:
+        self.input_bias = K.flatten(self.bias[0])
+        self.recurrent_bias = K.flatten(self.bias[1])
 
-    if self.use_bias:
-      self.bias_z = self.bias[:self.units]
-      self.bias_r = self.bias[self.units:self.units * 2]
-      self.bias_h = self.bias[self.units * 2:]
     else:
-      self.bias_z = None
-      self.bias_r = None
-      self.bias_h = None
+      self.bias = None
     self.built = True
 
   def call(self, inputs, states, training=None):
@@ -1323,8 +1333,7 @@ class GRUCell(Layer):
 
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=3)
@@ -1350,13 +1359,15 @@ class GRUCell(Layer):
         inputs_z = inputs
         inputs_r = inputs
         inputs_h = inputs
-      x_z = K.dot(inputs_z, self.kernel_z)
-      x_r = K.dot(inputs_r, self.kernel_r)
-      x_h = K.dot(inputs_h, self.kernel_h)
+
+      x_z = K.dot(inputs_z, self.kernel[:, :self.units])
+      x_r = K.dot(inputs_r, self.kernel[:, self.units:self.units * 2])
+      x_h = K.dot(inputs_h, self.kernel[:, self.units * 2:])
+
       if self.use_bias:
-        x_z = K.bias_add(x_z, self.bias_z)
-        x_r = K.bias_add(x_r, self.bias_r)
-        x_h = K.bias_add(x_h, self.bias_h)
+        x_z = K.bias_add(x_z, self.input_bias[:self.units])
+        x_r = K.bias_add(x_r, self.input_bias[self.units: self.units * 2])
+        x_h = K.bias_add(x_h, self.input_bias[self.units * 2:])
 
       if 0. < self.recurrent_dropout < 1.:
         h_tm1_z = h_tm1 * rec_dp_mask[0]
@@ -1366,73 +1377,103 @@ class GRUCell(Layer):
         h_tm1_z = h_tm1
         h_tm1_r = h_tm1
         h_tm1_h = h_tm1
-      z = self.recurrent_activation(
-          x_z + K.dot(h_tm1_z, self.recurrent_kernel_z))
-      r = self.recurrent_activation(
-          x_r + K.dot(h_tm1_r, self.recurrent_kernel_r))
 
-      hh = self.activation(x_h + K.dot(r * h_tm1_h, self.recurrent_kernel_h))
+      recurrent_z = K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units])
+      recurrent_r = K.dot(h_tm1_r,
+                          self.recurrent_kernel[:, self.units:self.units * 2])
+      if self.reset_after and self.use_bias:
+        recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias[:self.units])
+        recurrent_r = K.bias_add(recurrent_r,
+                                 self.recurrent_bias[self.units:
+                                                     self.units * 2])
+
+      z = self.recurrent_activation(x_z + recurrent_z)
+      r = self.recurrent_activation(x_r + recurrent_r)
+
+      # reset gate applied after/before matrix multiplication
+      if self.reset_after:
+        recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
+        if self.use_bias:
+          recurrent_h = K.bias_add(recurrent_h,
+                                   self.recurrent_bias[self.units * 2:])
+        recurrent_h = r * recurrent_h
+      else:
+        recurrent_h = K.dot(r * h_tm1_h,
+                            self.recurrent_kernel[:, self.units * 2:])
+
+      hh = self.activation(x_h + recurrent_h)
     else:
       if 0. < self.dropout < 1.:
         inputs *= dp_mask[0]
+
+      # inputs projected by all gate matrices at once
       matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
-        matrix_x = K.bias_add(matrix_x, self.bias)
+        # biases: bias_z_i, bias_r_i, bias_h_i
+        matrix_x = K.bias_add(matrix_x, self.input_bias)
+
+      x_z = matrix_x[:, :self.units]
+      x_r = matrix_x[:, self.units: 2 * self.units]
+      x_h = matrix_x[:, 2 * self.units:]
+
       if 0. < self.recurrent_dropout < 1.:
         h_tm1 *= rec_dp_mask[0]
-      matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
 
-      x_z = matrix_x[:, :self.units]
-      x_r = matrix_x[:, self.units:2 * self.units]
+      if self.reset_after:
+        # hidden state projected by all gate matrices at once
+        matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
+        if self.use_bias:
+          matrix_inner = K.bias_add(matrix_inner, self.recurrent_bias)
+      else:
+        # hidden state projected separately for update/reset and new
+        matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
+
       recurrent_z = matrix_inner[:, :self.units]
       recurrent_r = matrix_inner[:, self.units:2 * self.units]
 
       z = self.recurrent_activation(x_z + recurrent_z)
       r = self.recurrent_activation(x_r + recurrent_r)
 
-      x_h = matrix_x[:, 2 * self.units:]
-      recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
+      if self.reset_after:
+        recurrent_h = r * matrix_inner[:, 2 * self.units:]
+      else:
+        recurrent_h = K.dot(r * h_tm1,
+                            self.recurrent_kernel[:, 2 * self.units:])
+
       hh = self.activation(x_h + recurrent_h)
+    # previous and candidate state mixed by update gate
     h = z * h_tm1 + (1 - z) * hh
     if 0 < self.dropout + self.recurrent_dropout:
-      if training is None:
+      if training is None and not context.executing_eagerly():
+        # This would be harmless to set in eager mode, but eager tensors
+        # disallow setting arbitrary attributes.
         h._uses_learning_phase = True
+
     return h, [h]
 
   def get_config(self):
     config = {
-        'units':
-            self.units,
-        'activation':
-            activations.serialize(self.activation),
+        'units': self.units,
+        'activation': activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout,
-        'implementation':
-            self.implementation
+        'bias_constraint': constraints.serialize(self.bias_constraint),
+        'dropout': self.dropout,
+        'recurrent_dropout': self.recurrent_dropout,
+        'implementation': self.implementation,
+        'reset_after': self.reset_after
     }
     base_config = super(GRUCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1440,9 +1481,16 @@ class GRUCell(Layer):
 
 @tf_export('keras.layers.GRU')
 class GRU(RNN):
-  """Gated Recurrent Unit - Cho et al.
+  """Gated Recurrent Unit - Cho et al. 2014.
+
+  There are two variants. The default one is based on 1406.1078v3 and
+  has reset gate applied to hidden state before matrix multiplication. The
+  other one is based on original 1406.1078v1 and has the order reversed.
 
-  2014.
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. Use `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
@@ -1486,7 +1534,7 @@ class GRU(RNN):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1502,6 +1550,9 @@ class GRU(RNN):
           Unrolling can speed-up a RNN,
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
 
   """
 
@@ -1528,6 +1579,7 @@ class GRU(RNN):
                go_backwards=False,
                stateful=False,
                unroll=False,
+               reset_after=False,
                **kwargs):
     if implementation == 0:
       logging.warning('`implementation=0` has been deprecated, '
@@ -1549,7 +1601,8 @@ class GRU(RNN):
         bias_constraint=bias_constraint,
         dropout=dropout,
         recurrent_dropout=recurrent_dropout,
-        implementation=implementation)
+        implementation=implementation,
+        reset_after=reset_after)
     super(GRU, self).__init__(
         cell,
         return_sequences=return_sequences,
@@ -1630,6 +1683,10 @@ class GRU(RNN):
   def implementation(self):
     return self.cell.implementation
 
+  @property
+  def reset_after(self):
+    return self.cell.reset_after
+
   def get_config(self):
     config = {
         'units':
@@ -1665,7 +1722,9 @@ class GRU(RNN):
         'recurrent_dropout':
             self.recurrent_dropout,
         'implementation':
-            self.implementation
+            self.implementation,
+        'reset_after':
+            self.reset_after
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1774,7 +1833,7 @@ class LSTMCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1809,36 +1868,12 @@ class LSTMCell(Layer):
           constraint=self.bias_constraint)
     else:
       self.bias = None
-
-    self.kernel_i = self.kernel[:, :self.units]
-    self.kernel_f = self.kernel[:, self.units:self.units * 2]
-    self.kernel_c = self.kernel[:, self.units * 2:self.units * 3]
-    self.kernel_o = self.kernel[:, self.units * 3:]
-
-    self.recurrent_kernel_i = self.recurrent_kernel[:, :self.units]
-    self.recurrent_kernel_f = self.recurrent_kernel[:, self.units:
-                                                    self.units * 2]
-    self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2:
-                                                    self.units * 3]
-    self.recurrent_kernel_o = self.recurrent_kernel[:, self.units * 3:]
-
-    if self.use_bias:
-      self.bias_i = self.bias[:self.units]
-      self.bias_f = self.bias[self.units:self.units * 2]
-      self.bias_c = self.bias[self.units * 2:self.units * 3]
-      self.bias_o = self.bias[self.units * 3:]
-    else:
-      self.bias_i = None
-      self.bias_f = None
-      self.bias_c = None
-      self.bias_o = None
     self.built = True
 
   def call(self, inputs, states, training=None):
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=4)
@@ -1869,15 +1904,15 @@ class LSTMCell(Layer):
         inputs_f = inputs
         inputs_c = inputs
         inputs_o = inputs
-      x_i = K.dot(inputs_i, self.kernel_i)
-      x_f = K.dot(inputs_f, self.kernel_f)
-      x_c = K.dot(inputs_c, self.kernel_c)
-      x_o = K.dot(inputs_o, self.kernel_o)
+      x_i = K.dot(inputs_i, self.kernel[:, :self.units])
+      x_f = K.dot(inputs_f, self.kernel[:, self.units:self.units * 2])
+      x_c = K.dot(inputs_c, self.kernel[:, self.units * 2:self.units * 3])
+      x_o = K.dot(inputs_o, self.kernel[:, self.units * 3:])
       if self.use_bias:
-        x_i = K.bias_add(x_i, self.bias_i)
-        x_f = K.bias_add(x_f, self.bias_f)
-        x_c = K.bias_add(x_c, self.bias_c)
-        x_o = K.bias_add(x_o, self.bias_o)
+        x_i = K.bias_add(x_i, self.bias[:self.units])
+        x_f = K.bias_add(x_f, self.bias[self.units:self.units * 2])
+        x_c = K.bias_add(x_c, self.bias[self.units * 2:self.units * 3])
+        x_o = K.bias_add(x_o, self.bias[self.units * 3:])
 
       if 0 < self.recurrent_dropout < 1.:
         h_tm1_i = h_tm1 * rec_dp_mask[0]
@@ -1890,13 +1925,15 @@ class LSTMCell(Layer):
         h_tm1_c = h_tm1
         h_tm1_o = h_tm1
       i = self.recurrent_activation(
-          x_i + K.dot(h_tm1_i, self.recurrent_kernel_i))
+          x_i + K.dot(h_tm1_i, self.recurrent_kernel[:, :self.units]))
       f = self.recurrent_activation(
-          x_f + K.dot(h_tm1_f, self.recurrent_kernel_f))
+          x_f + K.dot(h_tm1_f,
+                      self.recurrent_kernel[:, self.units: self.units * 2]))
       c = f * c_tm1 + i * self.activation(
-          x_c + K.dot(h_tm1_c, self.recurrent_kernel_c))
+          x_c + K.dot(h_tm1_c,
+                      self.recurrent_kernel[:, self.units * 2: self.units * 3]))
       o = self.recurrent_activation(
-          x_o + K.dot(h_tm1_o, self.recurrent_kernel_o))
+          x_o + K.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3:]))
     else:
       if 0. < self.dropout < 1.:
         inputs *= dp_mask[0]
@@ -1919,7 +1956,9 @@ class LSTMCell(Layer):
 
     h = o * self.activation(c)
     if 0 < self.dropout + self.recurrent_dropout:
-      if training is None:
+      if training is None and not context.executing_eagerly():
+        # This would be harmless to set in eager mode, but eager tensors
+        # disallow setting arbitrary attributes.
         h._uses_learning_phase = True
     return h, [h, c]
 
@@ -1966,7 +2005,7 @@ class LSTMCell(Layer):
 
 @tf_export('keras.layers.LSTM')
 class LSTM(RNN):
-  """Long-Short Term Memory layer - Hochreiter 1997.
+  """Long Short-Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
@@ -2216,7 +2255,7 @@ class LSTM(RNN):
 
 
 def _generate_dropout_ones(inputs, dims):
-  return K.ones((K.shape(inputs)[0], dims))
+  return K.ones((array_ops.shape(inputs)[0], dims))
 
 
 def _generate_dropout_mask(ones, rate, training=None, count=1):
@@ -2358,7 +2397,7 @@ class Recurrent(Layer):
     self.dropout = 0
     self.recurrent_dropout = 0
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
@@ -2391,9 +2430,12 @@ class Recurrent(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     initial_state = K.tile(initial_state, [1,
                                            self.units])  # (samples, output_dim)
     initial_state = [initial_state for _ in range(len(self.states))]
@@ -2496,7 +2538,7 @@ class Recurrent(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     # Properly set learning phase
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index de022153f6f07240a0dff70e5faeed5b6d4a8c5f..4c68c18825a47d87806a7a09d4054f974d569e00 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -24,6 +24,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -229,6 +232,7 @@ class RNNTest(test.TestCase):
       cell = RNNCellWithConstants(32)
       layer = keras.layers.RNN(cell)
       y = layer(x, constants=c)
+
       model = keras.models.Model([x, c], y)
       model.compile(optimizer='rmsprop', loss='mse')
       model.train_on_batch(
@@ -276,6 +280,20 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
+    with self.test_session():
+      # Test GRUCell reset_after property.
+      x = keras.Input((None, 5))
+      c = keras.Input((3,))
+      cells = [keras.layers.recurrent.GRUCell(32, reset_after=True)]
+      layer = keras.layers.recurrent.RNN(cells)
+      y = layer(x, constants=c)
+      model = keras.models.Model([x, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+          np.zeros((6, 32))
+      )
+
     with self.test_session():
       # Test stacked RNN serialization
       x_np = np.random.random((6, 5, 5))
@@ -395,8 +413,8 @@ class RNNTest(test.TestCase):
 
     # Test `get_losses_for` and `losses`
     x = keras.Input((None, 1))
-    loss_1 = keras.backend.sum(x)
-    loss_2 = keras.backend.sum(cells[0].kernel)
+    loss_1 = math_ops.reduce_sum(x)
+    loss_2 = math_ops.reduce_sum(cells[0].kernel)
     cells[0].add_loss(loss_1, inputs=x)
     cells[0].add_loss(loss_2)
     self.assertEqual(len(layer.losses), 2)
@@ -410,15 +428,15 @@ class RNNTest(test.TestCase):
     layer.build((None, None, 1))
 
     x = keras.Input((None, 1))
-    update_1 = keras.backend.update_add(
-        cells[0].kernel, x[0, 0, 0] * cells[0].kernel)
-    update_2 = keras.backend.update_add(
-        cells[0].kernel, keras.backend.ones_like(cells[0].kernel))
+    update_1 = state_ops.assign_add(cells[0].kernel,
+                                    x[0, 0, 0] * cells[0].kernel)
+    update_2 = state_ops.assign_add(cells[0].kernel,
+                                    array_ops.ones_like(cells[0].kernel))
     cells[0].add_update(update_1, inputs=x)
     cells[0].add_update(update_2)
     self.assertEqual(len(layer.updates), 2)
-    self.assertEqual(layer.get_updates_for(None), [update_2])
-    self.assertEqual(layer.get_updates_for(x), [update_1])
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    self.assertEqual(len(layer.get_updates_for(x)), 1)
 
   def test_rnn_dynamic_trainability(self):
     layer_class = keras.layers.SimpleRNN
@@ -538,6 +556,5 @@ class RNNTest(test.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/serialization.py b/tensorflow/python/keras/_impl/keras/layers/serialization.py
index 928feaadbf3554fdeec61527d730c475f25c0e5a..8151ad7fdddefe08e7af0563bdf27ab335d7d1f8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/serialization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/serialization.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras._impl.keras.layers.advanced_activations import *
 from tensorflow.python.keras._impl.keras.layers.convolutional import *
 from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import *
 from tensorflow.python.keras._impl.keras.layers.core import *
+from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import *
 from tensorflow.python.keras._impl.keras.layers.embeddings import *
 from tensorflow.python.keras._impl.keras.layers.local import *
 from tensorflow.python.keras._impl.keras.layers.merge import *
diff --git a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py b/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
index 7edebdacd07d74fe6b5a982d12645fb5556bdf75..8c7189cd4718450a85c015e08ab3a58cc5d86531 100644
--- a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
@@ -20,64 +20,66 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class SimpleRNNLayerTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_return_sequences_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SimpleRNN,
-          kwargs={'units': units,
-                  'return_sequences': True},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.SimpleRNN,
+        kwargs={'units': units,
+                'return_sequences': True},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dynamic_behavior_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim))
-      model = keras.models.Sequential()
-      model.add(layer)
-      model.compile('sgd', 'mse')
-      x = np.random.random((num_samples, timesteps, embedding_dim))
-      y = np.random.random((num_samples, units))
-      model.train_on_batch(x, y)
-
+    layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(RMSPropOptimizer(0.01), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_dropout_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.SimpleRNN,
-          kwargs={'units': units,
-                  'dropout': 0.1,
-                  'recurrent_dropout': 0.1},
-          input_shape=(num_samples, timesteps, embedding_dim))
-
+    testing_utils.layer_test(
+        keras.layers.SimpleRNN,
+        kwargs={'units': units,
+                'dropout': 0.1,
+                'recurrent_dropout': 0.1},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_implementation_mode_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    with self.test_session():
-      for mode in [0, 1, 2]:
-        testing_utils.layer_test(
-            keras.layers.SimpleRNN,
-            kwargs={'units': units,
-                    'implementation': mode},
-            input_shape=(num_samples, timesteps, embedding_dim))
+    for mode in [0, 1, 2]:
+      testing_utils.layer_test(
+          keras.layers.SimpleRNN,
+          kwargs={'units': units,
+                  'implementation': mode},
+          input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 61f1a758e4701e6925af88b7fed9c48cf42ca735..91b8c1148bec568a3f26a4798ee120ed7a91faf1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -25,9 +25,9 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import utils as tf_layers_util
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -182,7 +182,7 @@ class TimeDistributed(Wrapper):
 
   def call(self, inputs, training=None, mask=None):
     kwargs = {}
-    if has_arg(self.layer.call, 'training'):
+    if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
     uses_learning_phase = False  # pylint: disable=redefined-outer-name
 
@@ -201,6 +201,7 @@ class TimeDistributed(Wrapper):
           step,
           inputs,
           initial_states=[],
+          input_length=input_shape[0],
           unroll=False)
       y = outputs
     else:
@@ -209,11 +210,11 @@ class TimeDistributed(Wrapper):
       # We can go with reshape-based implementation for performance.
       input_length = input_shape[1]
       if not input_length:
-        input_length = K.shape(inputs)[1]
+        input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
-      input_uid = tf_layers_util.object_list_uid(inputs)
-      inputs = K.reshape(inputs, (-1,) + input_shape[2:])
+      input_uid = generic_utils.object_list_uid(inputs)
+      inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
       y = self.layer.call(inputs, **kwargs)
@@ -221,7 +222,7 @@ class TimeDistributed(Wrapper):
         uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
       output_shape = self.compute_output_shape(input_shape).as_list()
-      y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
+      y = array_ops.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
 
     # Apply activity regularizer if any:
     if (hasattr(self.layer, 'activity_regularizer') and
@@ -304,7 +305,7 @@ class Bidirectional(Wrapper):
     self.forward_layer.set_weights(weights[:nw // 2])
     self.backward_layer.set_weights(weights[nw // 2:])
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     output_shape = tuple(self.forward_layer.compute_output_shape(
         input_shape).as_list())
@@ -382,12 +383,13 @@ class Bidirectional(Wrapper):
 
   def call(self, inputs, training=None, mask=None, initial_state=None):
     kwargs = {}
-    if has_arg(self.layer.call, 'training'):
+    if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
-    if has_arg(self.layer.call, 'mask'):
+    if generic_utils.has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
 
-    if initial_state is not None and has_arg(self.layer.call, 'initial_state'):
+    if initial_state is not None and generic_utils.has_arg(
+        self.layer.call, 'initial_state'):
       forward_state = initial_state[:len(initial_state) // 2]
       backward_state = initial_state[len(initial_state) // 2:]
       y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs)
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
index c81d6b883cb0aa2b30331e35b387457072dbf3c3..8fcf66e90ff1289a06a996768ae5de2f1548a27c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
@@ -20,44 +20,43 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class TimeDistributedTest(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_timedistributed_dense(self):
-    # first, test with Dense layer
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.TimeDistributed(
-              keras.layers.Dense(2), input_shape=(3, 4)))
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.fit(
-          np.random.random((10, 3, 4)),
-          np.random.random((10, 3, 2)),
-          epochs=1,
-          batch_size=10)
-
-      # test config
-      model.get_config()
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(2), input_shape=(3, 4)))
+    model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+    model.fit(
+        np.random.random((10, 3, 4)),
+        np.random.random((10, 3, 2)),
+        epochs=1,
+        batch_size=10)
+
+    # test config
+    model.get_config()
 
   def test_timedistributed_static_batch_size(self):
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.TimeDistributed(
-              keras.layers.Dense(2), input_shape=(3, 4), batch_size=10))
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.fit(
-          np.random.random((10, 3, 4)),
-          np.random.random((10, 3, 2)),
-          epochs=1,
-          batch_size=10)
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(2), input_shape=(3, 4), batch_size=10))
+    model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+    model.fit(
+        np.random.random((10, 3, 4)),
+        np.random.random((10, 3, 2)),
+        epochs=1,
+        batch_size=10)
 
   def test_timedistributed_conv2d(self):
-    # test with Conv2D
     with self.test_session():
       model = keras.models.Sequential()
       model.add(
@@ -73,7 +72,6 @@ class TimeDistributedTest(test.TestCase):
       model.summary()
 
   def test_timedistributed_stacked(self):
-    # test stacked layers
     with self.test_session():
       model = keras.models.Sequential()
       model.add(
@@ -167,7 +165,7 @@ class BidirectionalTest(test.TestCase):
         model.add(
             keras.layers.Bidirectional(
                 rnn(output_dim), merge_mode=mode, input_shape=(timesteps, dim)))
-        model.compile(loss='mse', optimizer='sgd')
+        model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
         model.fit(x, y, epochs=1, batch_size=1)
 
         # test compute output shape
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 1576ed7b999f65992f46b357c8ebeda8935c68d0..1d634d38013164659f7360fce45704c19083f475 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -24,51 +24,55 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.mean_squared_error',
            'keras.losses.mean_squared_error')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(K.square(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_error',
            'keras.losses.mean_absolute_error')
 def mean_absolute_error(y_true, y_pred):
-  return K.mean(K.abs(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_percentage_error',
            'keras.losses.mean_absolute_percentage_error')
 def mean_absolute_percentage_error(y_true, y_pred):
-  diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
+  diff = math_ops.abs(
+      (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
 @tf_export('keras.metrics.mean_squared_logarithmic_error',
            'keras.losses.mean_squared_logarithmic_error')
 def mean_squared_logarithmic_error(y_true, y_pred):
-  first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
-  second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(K.square(first_log - second_log), axis=-1)
+  first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
+  second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
+  return K.mean(math_ops.square(first_log - second_log), axis=-1)
 
 
 @tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
-  return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
+  return K.mean(
+      math_ops.square(math_ops.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
 @tf_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
-  return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
+  return K.mean(math_ops.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
 @tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
-  pos = K.sum(y_true * y_pred, axis=-1)
-  neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(0., neg - pos + 1.)
+  pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
+  neg = math_ops.reduce_max((1. - y_true) * y_pred, axis=-1)
+  return math_ops.maximum(0., neg - pos + 1.)
 
 
 @tf_export('keras.losses.logcosh')
@@ -89,7 +93,7 @@ def logcosh(y_true, y_pred):
   """
 
   def _logcosh(x):
-    return x + K.softplus(-2. * x) - K.log(2.)
+    return x + nn.softplus(-2. * x) - math_ops.log(2.)
 
   return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
@@ -117,19 +121,19 @@ def binary_crossentropy(y_true, y_pred):
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
-  return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
+  return math_ops.reduce_sum(y_true * math_ops.log(y_true / y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
-  return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
+  return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
 @tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
 def cosine_proximity(y_true, y_pred):
-  y_true = K.l2_normalize(y_true, axis=-1)
-  y_pred = K.l2_normalize(y_pred, axis=-1)
-  return -K.sum(y_true * y_pred, axis=-1)
+  y_true = nn.l2_normalize(y_true, axis=-1)
+  y_pred = nn.l2_normalize(y_pred, axis=-1)
+  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
 # Aliases.
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 82778a3dc4fbdc13bb6682d01e28ff68882b6dd9..747c3e65157ded6b0d227c6d6667b9092d0eed44 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -37,37 +37,45 @@ from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crosse
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred):
-  return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
+  return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
 
 
 @tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
-  return K.cast(
-      K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
+      K.floatx())
 
 
 def sparse_categorical_accuracy(y_true, y_pred):
-  return K.cast(
-      K.equal(
-          K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1),
-                                         K.floatx())), K.floatx())
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.reduce_max(y_true, axis=-1),
+          math_ops.cast(math_ops.argmax(y_pred, axis=-1), K.floatx())),
+      K.floatx())
 
 
 @tf_export('keras.metrics.top_k_categorical_accuracy')
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
+  return K.mean(
+      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
 
 
 @tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(
-      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
-
+      nn.in_top_k(y_pred,
+                  math_ops.cast(math_ops.reduce_max(y_true, axis=-1), 'int32'),
+                  k),
+      axis=-1)
 
 # Aliases
 
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py
index 44289ea02abf5ae5f8befbe515552aea3d4b231e..13cef9781278109a9e726409aae5c4e325b2a5f0 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/_impl/keras/metrics_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -73,75 +75,75 @@ class KerasMetricsTest(test.TestCase):
       self.assertEqual(result, 0.)
 
   def test_stateful_metrics(self):
-    np.random.seed(1334)
-
-    class BinaryTruePositives(keras.layers.Layer):
-      """Stateful Metric to count the total true positives over all batches.
-
-      Assumes predictions and targets of shape `(samples, 1)`.
-
-      Arguments:
-          threshold: Float, lower limit on prediction value that counts as a
-              positive class prediction.
-          name: String, name for the metric.
-      """
-
-      def __init__(self, name='true_positives', **kwargs):
-        super(BinaryTruePositives, self).__init__(name=name, **kwargs)
-        self.true_positives = keras.backend.variable(value=0, dtype='int32')
-
-      def reset_states(self):
-        keras.backend.set_value(self.true_positives, 0)
+    with self.test_session():
+      np.random.seed(1334)
 
-      def __call__(self, y_true, y_pred):
-        """Computes the number of true positives in a batch.
+      class BinaryTruePositives(keras.layers.Layer):
+        """Stateful Metric to count the total true positives over all batches.
 
-        Args:
-            y_true: Tensor, batch_wise labels
-            y_pred: Tensor, batch_wise predictions
+        Assumes predictions and targets of shape `(samples, 1)`.
 
-        Returns:
-            The total number of true positives seen this epoch at the
-                completion of the batch.
+        Arguments:
+            threshold: Float, lower limit on prediction value that counts as a
+                positive class prediction.
+            name: String, name for the metric.
         """
-        y_true = keras.backend.cast(y_true, 'int32')
-        y_pred = keras.backend.cast(keras.backend.round(y_pred), 'int32')
-        correct_preds = keras.backend.cast(
-            keras.backend.equal(y_pred, y_true), 'int32')
-        true_pos = keras.backend.cast(
-            keras.backend.sum(correct_preds * y_true), 'int32')
-        current_true_pos = self.true_positives * 1
-        self.add_update(keras.backend.update_add(self.true_positives,
-                                                 true_pos),
-                        inputs=[y_true, y_pred])
-        return current_true_pos + true_pos
-
-    metric_fn = BinaryTruePositives()
-    config = keras.metrics.serialize(metric_fn)
-    metric_fn = keras.metrics.deserialize(
-        config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
-
-    # Test on simple model
-    inputs = keras.Input(shape=(2,))
-    outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
-    model = keras.Model(inputs, outputs)
-    model.compile(optimizer='sgd',
-                  loss='binary_crossentropy',
-                  metrics=['acc', metric_fn])
-
-    # Test fit, evaluate
-    samples = 1000
-    x = np.random.random((samples, 2))
-    y = np.random.randint(2, size=(samples, 1))
-    model.fit(x, y, epochs=1, batch_size=10)
-    outs = model.evaluate(x, y, batch_size=10)
-    preds = model.predict(x)
-
-    def ref_true_pos(y_true, y_pred):
-      return np.sum(np.logical_and(y_pred > 0.5, y_true == 1))
-
-    # Test correctness (e.g. updates should have been run)
-    self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
+
+        def __init__(self, name='true_positives', **kwargs):
+          super(BinaryTruePositives, self).__init__(name=name, **kwargs)
+          self.true_positives = keras.backend.variable(value=0, dtype='int32')
+
+        def reset_states(self):
+          keras.backend.set_value(self.true_positives, 0)
+
+        def __call__(self, y_true, y_pred):
+          """Computes the number of true positives in a batch.
+
+          Args:
+              y_true: Tensor, batch_wise labels
+              y_pred: Tensor, batch_wise predictions
+
+          Returns:
+              The total number of true positives seen this epoch at the
+                  completion of the batch.
+          """
+          y_true = math_ops.cast(y_true, 'int32')
+          y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
+          correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
+          true_pos = math_ops.cast(
+              math_ops.reduce_sum(correct_preds * y_true), 'int32')
+          current_true_pos = self.true_positives * 1
+          self.add_update(
+              state_ops.assign_add(self.true_positives, true_pos),
+              inputs=[y_true, y_pred])
+          return current_true_pos + true_pos
+
+      metric_fn = BinaryTruePositives()
+      config = keras.metrics.serialize(metric_fn)
+      metric_fn = keras.metrics.deserialize(
+          config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
+
+      # Test on simple model
+      inputs = keras.Input(shape=(2,))
+      outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+      model = keras.Model(inputs, outputs)
+      model.compile(optimizer='sgd',
+                    loss='binary_crossentropy',
+                    metrics=['acc', metric_fn])
+
+      # Test fit, evaluate
+      samples = 1000
+      x = np.random.random((samples, 2))
+      y = np.random.randint(2, size=(samples, 1))
+      model.fit(x, y, epochs=1, batch_size=10)
+      outs = model.evaluate(x, y, batch_size=10)
+      preds = model.predict(x)
+
+      def ref_true_pos(y_true, y_pred):
+        return np.sum(np.logical_and(y_pred > 0.5, y_true == 1))
+
+      # Test correctness (e.g. updates should have been run)
+      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
index 3d71a620fcb34d21c41f920eed99b1fe22668899..295ad47f6be46443ce94f8c5c6228df50c5f693e 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -19,10 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import tempfile
 
 import numpy as np
+import six
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
@@ -36,6 +37,7 @@ except ImportError:
   h5py = None
 
 
+# pylint: disable=not-callable
 class SimpleTestModel(keras.Model):
 
   def __init__(self, use_bn=False, use_dp=False, num_classes=10):
@@ -104,7 +106,7 @@ class NestedTestModel1(keras.Model):
   def call(self, inputs):
     x = self.dense1(inputs)
     x = self.bn(x)
-    x = self.test_net(x)  # pylint: disable=not-callable
+    x = self.test_net(x)
     return self.dense2(x)
 
 
@@ -161,7 +163,7 @@ def get_nested_model_3(input_dim, num_classes):
       return tensor_shape.TensorShape((input_shape[0], 5))
 
   test_model = Inner()
-  x = test_model(x)  # pylint: disable=not-callable
+  x = test_model(x)
   outputs = keras.layers.Dense(num_classes)(x)
   return keras.Model(inputs, outputs, name='nested_model_3')
 
@@ -174,19 +176,18 @@ class ModelSubclassingTest(test.TestCase):
     num_samples = 100
     input_dim = 50
 
-    with self.test_session():
-      model = SimpleTestModel(num_classes=num_classes,
-                              use_dp=True,
-                              use_bn=True)
-      model.compile(loss='mse',
-                    optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    metrics=['acc'])
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+    x = np.ones((num_samples, input_dim))
+    y = np.zeros((num_samples, num_classes))
 
-      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
-      _ = model.evaluate(x, y, verbose=0)
+    model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate(x, y, verbose=0)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_multi_io_workflow_with_np_arrays(self):
@@ -194,21 +195,20 @@ class ModelSubclassingTest(test.TestCase):
     num_samples = 1000
     input_dim = 50
 
-    with self.test_session():
-      model = MultiIOTestModel(num_classes=num_classes,
-                               use_dp=True,
-                               use_bn=True)
-      model.compile(loss='mse',
-                    optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    metrics=['acc'])
+    model = MultiIOTestModel(num_classes=num_classes,
+                             use_dp=True,
+                             use_bn=True)
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
 
-      x1 = np.ones((num_samples, input_dim))
-      x2 = np.ones((num_samples, input_dim))
-      y1 = np.zeros((num_samples, num_classes[0]))
-      y2 = np.zeros((num_samples, num_classes[1]))
+    x1 = np.ones((num_samples, input_dim))
+    x2 = np.ones((num_samples, input_dim))
+    y1 = np.zeros((num_samples, num_classes[0]))
+    y2 = np.zeros((num_samples, num_classes[1]))
 
-      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
-      _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
+    model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
 
   def test_single_io_workflow_with_tensors(self):
 
@@ -321,14 +321,13 @@ class ModelSubclassingTest(test.TestCase):
     x = np.ones((num_samples, input_dim))
     y = np.ones((num_samples, input_dim))
 
-    with self.test_session():
-      model = BNNet()
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      y_ref = model.predict(x)
+    model = BNNet()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    y_ref = model.predict(x)
 
-      model.train_on_batch(x, y)
-      y_new = model.predict(x)
-      self.assertGreater(np.sum(np.abs(y_ref - y_new)), 0.1)
+    model.train_on_batch(x, y)
+    y_new = model.predict(x)
+    self.assertGreater(np.sum(np.abs(y_ref - y_new)), 0.1)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_training_and_inference_behavior(self):
@@ -350,14 +349,13 @@ class ModelSubclassingTest(test.TestCase):
         x = self.dp(inputs)
         return self.dense(x)
 
-    with self.test_session():
-      model = DPNet()
-      x = np.ones((num_samples, input_dim))
-      y = model.predict(x)
-      self.assertEqual(np.sum(y), np.sum(x))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      loss = model.train_on_batch(x, y)
-      self.assertGreater(loss, 0.1)
+    model = DPNet()
+    x = np.ones((num_samples, input_dim))
+    y = model.predict(x)
+    self.assertEqual(np.sum(y), np.sum(x))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    loss = model.train_on_batch(x, y)
+    self.assertGreater(loss, 0.1)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_training_methods(self):
@@ -373,21 +371,20 @@ class ModelSubclassingTest(test.TestCase):
     y1 = np.zeros((num_samples, num_classes[0]))
     y2 = np.zeros((num_samples, num_classes[1]))
 
-    with self.test_session():
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
-      model.fit({'input_1': x1, 'input_2': x2},
-                {'output_1': y1, 'output_2': y2},
-                epochs=2, batch_size=32)
-      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0,
-                validation_data=([x1, x2], [y1, y2]))
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
+    model.fit({'input_1': x1, 'input_2': x2},
+              {'output_1': y1, 'output_2': y2},
+              epochs=2, batch_size=32)
+    model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0,
+              validation_data=([x1, x2], [y1, y2]))
 
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      model.train_on_batch([x1, x2], [y1, y2])
-      model.train_on_batch({'input_1': x1, 'input_2': x2},
-                           {'output_1': y1, 'output_2': y2})
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.train_on_batch([x1, x2], [y1, y2])
+    model.train_on_batch({'input_1': x1, 'input_2': x2},
+                         {'output_1': y1, 'output_2': y2})
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_inference_methods(self):
@@ -402,17 +399,16 @@ class ModelSubclassingTest(test.TestCase):
     y1 = np.zeros((num_samples, num_classes[0]))
     y2 = np.zeros((num_samples, num_classes[1]))
 
-    with self.test_session():
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      model.evaluate([x1, x2], [y1, y2])
-      model.test_on_batch([x1, x2], [y1, y2])
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.evaluate([x1, x2], [y1, y2])
+    model.test_on_batch([x1, x2], [y1, y2])
 
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      model.predict([x1, x2])
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+    model.predict([x1, x2])
 
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      model.predict_on_batch([x1, x2])
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+    model.predict_on_batch([x1, x2])
 
   @test_util.run_in_graph_and_eager_modes()
   def test_trainable_mutation(self):
@@ -423,8 +419,6 @@ class ModelSubclassingTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def test_saving(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
 
     num_classes = (2, 3)
     num_samples = 100
@@ -435,26 +429,35 @@ class ModelSubclassingTest(test.TestCase):
     y1 = np.zeros((num_samples, num_classes[0]))
     y2 = np.zeros((num_samples, num_classes[1]))
 
-    with self.test_session():
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
-      y_ref_1, y_ref_2 = model.predict([x1, x2])
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
+    y_ref_1, y_ref_2 = model.predict([x1, x2])
+
+    tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt')
+    model.save_weights(tf_format_name)
+    if h5py is not None:
+      hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5')
+      model.save_weights(hdf5_format_name)
 
-      fd, fname = tempfile.mkstemp('.h5')
-      model.save_weights(fname)
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
 
-      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-      # need to build the model before loading weights
-      # (otherwise no weights to load)
-      model._set_inputs([x1, x2])
-      model.load_weights(fname)
+    if h5py is not None:
+      with self.assertRaises(ValueError):
+        model.load_weights(hdf5_format_name)
+
+    model.load_weights(tf_format_name)
+
+    y1, y2 = model.predict([x1, x2])
+    self.assertAllClose(y_ref_1, y1, atol=1e-5)
+    self.assertAllClose(y_ref_2, y2, atol=1e-5)
+
+    if h5py is not None:
+      model.load_weights(hdf5_format_name)
 
       y1, y2 = model.predict([x1, x2])
       self.assertAllClose(y_ref_1, y1, atol=1e-5)
       self.assertAllClose(y_ref_2, y2, atol=1e-5)
-      os.close(fd)
-      os.remove(fname)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_summary(self):
@@ -488,23 +491,22 @@ class ModelSubclassingTest(test.TestCase):
     num_samples = 100
     input_dim = 50
 
-    with self.test_session():
-      model = NestedTestModel1(num_classes=num_classes)
-      model.compile(loss='mse',
-                    optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    metrics=['acc'])
+    model = NestedTestModel1(num_classes=num_classes)
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+    x = np.ones((num_samples, input_dim))
+    y = np.zeros((num_samples, num_classes))
 
-      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
-      _ = model.evaluate(x, y, verbose=0)
+    model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate(x, y, verbose=0)
 
-      self.assertEqual(len(model.weights), 8 + len(model.test_net.weights))
-      self.assertEqual(len(model.non_trainable_weights),
-                       2 + len(model.test_net.non_trainable_weights))
-      self.assertEqual(len(model.trainable_weights),
-                       6 + len(model.test_net.trainable_weights))
+    self.assertEqual(len(model.weights), 8 + len(model.test_net.weights))
+    self.assertEqual(len(model.non_trainable_weights),
+                     2 + len(model.test_net.non_trainable_weights))
+    self.assertEqual(len(model.trainable_weights),
+                     6 + len(model.test_net.trainable_weights))
 
   @test_util.run_in_graph_and_eager_modes()
   def test_graph_nested_in_subclass(self):
@@ -512,23 +514,22 @@ class ModelSubclassingTest(test.TestCase):
     num_samples = 100
     input_dim = 50
 
-    with self.test_session():
-      model = NestedTestModel2(num_classes=num_classes)
-      model.compile(loss='mse',
-                    optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    metrics=['acc'])
+    model = NestedTestModel2(num_classes=num_classes)
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+    x = np.ones((num_samples, input_dim))
+    y = np.zeros((num_samples, num_classes))
 
-      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
-      _ = model.evaluate(x, y, verbose=0)
+    model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate(x, y, verbose=0)
 
-      self.assertEqual(len(model.weights), 8 + len(model.test_net.weights))
-      self.assertEqual(len(model.non_trainable_weights),
-                       2 + len(model.test_net.non_trainable_weights))
-      self.assertEqual(len(model.trainable_weights),
-                       6 + len(model.test_net.trainable_weights))
+    self.assertEqual(len(model.weights), 8 + len(model.test_net.weights))
+    self.assertEqual(len(model.non_trainable_weights),
+                     2 + len(model.test_net.non_trainable_weights))
+    self.assertEqual(len(model.trainable_weights),
+                     6 + len(model.test_net.trainable_weights))
 
   @test_util.run_in_graph_and_eager_modes()
   def test_subclass_nested_in_graph(self):
@@ -536,22 +537,21 @@ class ModelSubclassingTest(test.TestCase):
     num_samples = 100
     input_dim = 50
 
-    with self.test_session():
-      model = get_nested_model_3(input_dim=input_dim, num_classes=num_classes)
-      model.compile(loss='mse',
-                    optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    metrics=['acc'])
+    model = get_nested_model_3(input_dim=input_dim, num_classes=num_classes)
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+    x = np.ones((num_samples, input_dim))
+    y = np.zeros((num_samples, num_classes))
 
-      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
-      _ = model.evaluate(x, y, verbose=0)
+    model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate(x, y, verbose=0)
 
-      self.assertEqual(len(model.weights), 16)
-      self.assertEqual(
-          len(model.non_trainable_weights), 4)
-      self.assertEqual(len(model.trainable_weights), 12)
+    self.assertEqual(len(model.weights), 16)
+    self.assertEqual(
+        len(model.non_trainable_weights), 4)
+    self.assertEqual(len(model.trainable_weights), 12)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_support_for_manual_training_arg(self):
@@ -575,15 +575,132 @@ class ModelSubclassingTest(test.TestCase):
         x = self.dp(inputs, training=training)
         return self.dense(x)
 
-    with self.test_session():
-      model = DPNet()
-      x = np.ones((10, 10))
-      y = model.predict(x)
-      self.assertEqual(np.sum(y), np.sum(x))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-      loss = model.train_on_batch(x, y)
-      self.assertGreater(loss, 0.1)
+    model = DPNet()
+    x = np.ones((10, 10))
+    y = model.predict(x)
+    self.assertEqual(np.sum(y), np.sum(x))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    loss = model.train_on_batch(x, y)
+    self.assertGreater(loss, 0.1)
+
+
+class CustomCallModel(keras.Model):
+
+  def __init__(self):
+    super(CustomCallModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1, activation='relu')
+    self.dense2 = keras.layers.Dense(1, activation='softmax')
+
+  def call(self, first, second, fiddle_with_output='no', training=True):
+    combined = self.dense1(first) + self.dense2(second)
+    if fiddle_with_output == 'yes':
+      return 10. * combined
+    else:
+      return combined
+
+
+class CustomCallSignatureTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_no_inputs_in_signature(self):
+    model = CustomCallModel()
+    first = array_ops.ones([2, 3])
+    second = array_ops.ones([2, 5])
+    output = model(first, second)
+    self.evaluate([v.initializer for v in model.variables])
+    expected_output = self.evaluate(model.dense1(first) + model.dense2(second))
+    self.assertAllClose(expected_output, self.evaluate(output))
+    output = model(first, second, fiddle_with_output='yes')
+    self.assertAllClose(10. * expected_output, self.evaluate(output))
+    output = model(first, second=second, training=False)
+    self.assertAllClose(expected_output, self.evaluate(output))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_inputs_in_signature(self):
+
+    class HasInputsAndOtherPositional(keras.Model):
+
+      def call(self, inputs, some_other_arg, training=False):
+        return inputs
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    model = HasInputsAndOtherPositional()
+    with self.assertRaisesRegexp(
+        TypeError, 'everything else as a keyword argument'):
+      x1, x2 = keras.Input((1, 1)), keras.Input((1, 1))
+      model(x1, x2)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_kwargs_in_signature(self):
+
+    class HasKwargs(keras.Model):
+
+      def call(self, x, y=3, **key_words):
+        return x
+
+    model = HasKwargs()
+    arg = array_ops.ones([])
+    model(arg, a=3)
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [arg], model.inputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_args_in_signature(self):
+
+    class HasArgs(keras.Model):
+
+      def call(self, x, *args, **kwargs):
+        return [x] + list(args)
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    model = HasArgs()
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    model(x1, x2, x3, a=3)
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [x1, x2, x3], model.inputs)
+
+  def test_args_and_keywords_in_signature(self):
+
+    class HasArgs(keras.Model):
+
+      def call(self, x, training=True, *args, **kwargs):
+        return x
+
+    with context.graph_mode():
+      model = HasArgs()
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+      with self.assertRaisesRegexp(TypeError, 'args and arguments with'):
+        model(x1, x2, x3, a=3)
+
+  def test_training_no_default(self):
+
+    class TrainingNoDefault(keras.Model):
+
+      def call(self, x, training):
+        return x
+
+    with context.graph_mode():
+      model = TrainingNoDefault()
+      arg = array_ops.ones([])
+      model(arg, True)
+      six.assertCountEqual(self, [arg], model.inputs)
+
+  def test_training_no_default_with_positional(self):
+
+    class TrainingNoDefaultWithPositional(keras.Model):
+
+      def call(self, x, training, positional):
+        return x
 
+    with context.graph_mode():
+      model = TrainingNoDefaultWithPositional()
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+      with self.assertRaisesRegexp(TypeError, 'after a non-input'):
+        model(x1, x2, x3)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index 4c3ec7dbe458bfb78d38950b1bad7a474bb55ad3..9602e7ba39b290f33c7ca9d0d1b5b35838667531 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -13,1305 +13,30 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Home of the Sequential model, and the `save_model`/`load_model` functions.
+"""Code for model cloning, plus model-related API entries.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-import json
-import os
-
-import numpy as np
-
-from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import layers as layer_module
-from tensorflow.python.keras._impl.keras import optimizers
-from tensorflow.python.keras._impl.keras.engine import topology
-from tensorflow.python.keras._impl.keras.engine.topology import Input
-from tensorflow.python.keras._impl.keras.engine.topology import InputLayer
-from tensorflow.python.keras._impl.keras.engine.topology import Layer
-from tensorflow.python.keras._impl.keras.engine.topology import TFBaseLayer
-from tensorflow.python.keras._impl.keras.engine.training import Model
+from tensorflow.python.keras._impl.keras.engine import saving
+from tensorflow.python.keras._impl.keras.engine import sequential
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.engine.input_layer import Input
+from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util.tf_export import tf_export
-
-
-# pylint: disable=g-import-not-at-top
-try:
-  import h5py
-except ImportError:
-  h5py = None
-
-try:
-  import yaml
-except ImportError:
-  yaml = None
-# pylint: enable=g-import-not-at-top
-
-
-@tf_export('keras.models.save_model')
-def save_model(model, filepath, overwrite=True, include_optimizer=True):
-  """Save a model to a HDF5 file.
-
-  The saved model contains:
-      - the model's configuration (topology)
-      - the model's weights
-      - the model's optimizer's state (if any)
-
-  Thus the saved model can be reinstantiated in
-  the exact same state, without any of the code
-  used for model definition or training.
-
-  Arguments:
-      model: Keras model instance to be saved.
-      filepath: String, path where to save the model.
-      overwrite: Whether we should overwrite any existing
-          model at the target location, or instead
-          ask the user with a manual prompt.
-      include_optimizer: If True, save optimizer's state together.
-
-  Raises:
-      ImportError: if h5py is not available.
-  """
-
-  if h5py is None:
-    raise ImportError('`save_model` requires h5py.')
-
-  def get_json_type(obj):
-    """Serialize any object to a JSON-serializable structure.
-
-    Arguments:
-        obj: the object to serialize
-
-    Returns:
-        JSON-serializable structure representing `obj`.
-
-    Raises:
-        TypeError: if `obj` cannot be serialized.
-    """
-    # if obj is a serializable Keras class instance
-    # e.g. optimizer, layer
-    if hasattr(obj, 'get_config'):
-      return {'class_name': obj.__class__.__name__, 'config': obj.get_config()}
-
-    # if obj is any numpy type
-    if type(obj).__module__ == np.__name__:
-      if isinstance(obj, np.ndarray):
-        return {'type': type(obj), 'value': obj.tolist()}
-      else:
-        return obj.item()
-
-    # misc functions (e.g. loss function)
-    if callable(obj):
-      return obj.__name__
-
-    # if obj is a python 'type'
-    if type(obj).__name__ == type.__name__:
-      return obj.__name__
-
-    raise TypeError('Not JSON Serializable:', obj)
-
-  from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  # If file exists and should not be overwritten.
-  if not overwrite and os.path.isfile(filepath):
-    proceed = ask_to_proceed_with_overwrite(filepath)
-    if not proceed:
-      return
-
-  with h5py.File(filepath, mode='w') as f:
-    f.attrs['keras_version'] = str(keras_version).encode('utf8')
-    f.attrs['backend'] = K.backend().encode('utf8')
-    f.attrs['model_config'] = json.dumps(
-        {
-            'class_name': model.__class__.__name__,
-            'config': model.get_config()
-        },
-        default=get_json_type).encode('utf8')
-
-    model_weights_group = f.create_group('model_weights')
-    model_layers = model.layers
-    topology.save_weights_to_hdf5_group(model_weights_group, model_layers)
-
-    if include_optimizer and hasattr(model, 'optimizer'):
-      if isinstance(model.optimizer, optimizers.TFOptimizer):
-        logging.warning(
-            'TensorFlow optimizers do not '
-            'make it possible to access '
-            'optimizer attributes or optimizer state '
-            'after instantiation. '
-            'As a result, we cannot save the optimizer '
-            'as part of the model save file.'
-            'You will have to compile your model again after loading it. '
-            'Prefer using a Keras optimizer instead '
-            '(see keras.io/optimizers).')
-      else:
-        f.attrs['training_config'] = json.dumps(
-            {
-                'optimizer_config': {
-                    'class_name': model.optimizer.__class__.__name__,
-                    'config': model.optimizer.get_config()
-                },
-                'loss': model.loss,
-                'metrics': model.metrics,
-                'sample_weight_mode': model.sample_weight_mode,
-                'loss_weights': model.loss_weights,
-            },
-            default=get_json_type).encode('utf8')
-
-        # Save optimizer weights.
-        symbolic_weights = getattr(model.optimizer, 'weights')
-        if symbolic_weights:
-          optimizer_weights_group = f.create_group('optimizer_weights')
-          weight_values = K.batch_get_value(symbolic_weights)
-          weight_names = []
-          for w, val in zip(symbolic_weights, weight_values):
-            name = str(w.name)
-            weight_names.append(name.encode('utf8'))
-          optimizer_weights_group.attrs['weight_names'] = weight_names
-          for name, val in zip(weight_names, weight_values):
-            param_dset = optimizer_weights_group.create_dataset(
-                name, val.shape, dtype=val.dtype)
-            if not val.shape:
-              # scalar
-              param_dset[()] = val
-            else:
-              param_dset[:] = val
-    f.flush()
-
-
-@tf_export('keras.models.load_model')
-def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
-  """Loads a model saved via `save_model`.
-
-  Arguments:
-      filepath: String, path to the saved model.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-      compile: Boolean, whether to compile the model
-          after loading.
-
-  Returns:
-      A Keras model instance. If an optimizer was found
-      as part of the saved model, the model is already
-      compiled. Otherwise, the model is uncompiled and
-      a warning will be displayed. When `compile` is set
-      to False, the compilation is omitted without any
-      warning.
-
-  Raises:
-      ImportError: if h5py is not available.
-      ValueError: In case of an invalid savefile.
-  """
-  if h5py is None:
-    raise ImportError('`load_model` requires h5py.')
-
-  if not custom_objects:
-    custom_objects = {}
-
-  def convert_custom_objects(obj):
-    """Handles custom object lookup.
-
-    Arguments:
-        obj: object, dict, or list.
-
-    Returns:
-        The same structure, where occurrences
-            of a custom object name have been replaced
-            with the custom object.
-    """
-    if isinstance(obj, list):
-      deserialized = []
-      for value in obj:
-        deserialized.append(convert_custom_objects(value))
-      return deserialized
-    if isinstance(obj, dict):
-      deserialized = {}
-      for key, value in obj.items():
-        deserialized[key] = convert_custom_objects(value)
-      return deserialized
-    if obj in custom_objects:
-      return custom_objects[obj]
-    return obj
-
-  with h5py.File(filepath, mode='r') as f:
-    # instantiate model
-    model_config = f.attrs.get('model_config')
-    if model_config is None:
-      raise ValueError('No model found in config file.')
-    model_config = json.loads(model_config.decode('utf-8'))
-    model = model_from_config(model_config, custom_objects=custom_objects)
-
-    # set weights
-    topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
-
-    # Early return if compilation is not required.
-    if not compile:
-      return model
-
-    # instantiate optimizer
-    training_config = f.attrs.get('training_config')
-    if training_config is None:
-      logging.warning('No training configuration found in save file: '
-                      'the model was *not* compiled. Compile it manually.')
-      return model
-    training_config = json.loads(training_config.decode('utf-8'))
-    optimizer_config = training_config['optimizer_config']
-    optimizer = optimizers.deserialize(
-        optimizer_config, custom_objects=custom_objects)
-
-    # Recover loss functions and metrics.
-    loss = convert_custom_objects(training_config['loss'])
-    metrics = convert_custom_objects(training_config['metrics'])
-    sample_weight_mode = training_config['sample_weight_mode']
-    loss_weights = training_config['loss_weights']
-
-    # Compile model.
-    model.compile(
-        optimizer=optimizer,
-        loss=loss,
-        metrics=metrics,
-        loss_weights=loss_weights,
-        sample_weight_mode=sample_weight_mode)
-
-    # Set optimizer weights.
-    if 'optimizer_weights' in f:
-      # Build train function (to get weight updates).
-      if isinstance(model, Sequential):
-        model.model._make_train_function()
-      else:
-        model._make_train_function()
-      optimizer_weights_group = f['optimizer_weights']
-      optimizer_weight_names = [
-          n.decode('utf8')
-          for n in optimizer_weights_group.attrs['weight_names']
-      ]
-      optimizer_weight_values = [
-          optimizer_weights_group[n] for n in optimizer_weight_names
-      ]
-      try:
-        model.optimizer.set_weights(optimizer_weight_values)
-      except ValueError:
-        logging.warning('Error in loading the saved optimizer '
-                        'state. As a result, your model is '
-                        'starting with a freshly initialized '
-                        'optimizer.')
-  return model
-
-
-@tf_export('keras.models.model_from_config')
-def model_from_config(config, custom_objects=None):
-  """Instantiates a Keras model from its config.
-
-  Arguments:
-      config: Configuration dictionary.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-
-  Raises:
-      TypeError: if `config` is not a dictionary.
-  """
-  if isinstance(config, list):
-    raise TypeError('`model_from_config` expects a dictionary, not a list. '
-                    'Maybe you meant to use '
-                    '`Sequential.from_config(config)`?')
-  return layer_module.deserialize(config, custom_objects=custom_objects)
-
-
-@tf_export('keras.models.model_from_yaml')
-def model_from_yaml(yaml_string, custom_objects=None):
-  """Parses a yaml model configuration file and returns a model instance.
-
-  Arguments:
-      yaml_string: YAML string encoding a model configuration.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-
-  Raises:
-      ImportError: if yaml module is not found.
-  """
-  if yaml is None:
-    raise ImportError('Requires yaml module installed.')
-  config = yaml.load(yaml_string)
-  return layer_module.deserialize(config, custom_objects=custom_objects)
-
-
-@tf_export('keras.models.model_from_json')
-def model_from_json(json_string, custom_objects=None):
-  """Parses a JSON model configuration file and returns a model instance.
-
-  Arguments:
-      json_string: JSON string encoding a model configuration.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-  """
-  config = json.loads(json_string)
-  return layer_module.deserialize(config, custom_objects=custom_objects)
-
-
-@tf_export('keras.models.Sequential', 'keras.Sequential')
-class Sequential(Model):
-  """Linear stack of layers.
-
-  Arguments:
-      layers: list of layers to add to the model.
-
-  # Note
-      The first layer passed to a Sequential model
-      should have a defined input shape. What that
-      means is that it should have received an `input_shape`
-      or `batch_input_shape` argument,
-      or for some type of layers (recurrent, Dense...)
-      an `input_dim` argument.
-
-  Example:
-
-      ```python
-          model = Sequential()
-          # first layer must have a defined input shape
-          model.add(Dense(32, input_dim=500))
-          # afterwards, Keras does automatic shape inference
-          model.add(Dense(32))
-
-          # also possible (equivalent to the above):
-          model = Sequential()
-          model.add(Dense(32, input_shape=(500,)))
-          model.add(Dense(32))
-
-          # also possible (equivalent to the above):
-          model = Sequential()
-          # here the batch dimension is None,
-          # which means any batch size will be accepted by the model.
-          model.add(Dense(32, batch_input_shape=(None, 500)))
-          model.add(Dense(32))
-      ```
-  """
-
-  def __init__(self, layers=None, name=None):
-    self._is_graph_network = True
-    self._is_compiled = False
-    self._layers = []  # Stack of layers.
-    self.model = None  # Internal Model instance.
-    self.inputs = []  # List of input tensors
-    self.outputs = []  # List of length 1: the output tensor (unique).
-    self._trainable = True
-    self._initial_weights = None
-    self._input_layers = []
-
-    # Model attributes.
-    self._inbound_nodes = []
-    self._outbound_nodes = []
-    self.built = False
-
-    # Set model name.
-    if not name:
-      prefix = 'sequential_'
-      name = prefix + str(K.get_uid(prefix))
-    self._name = name
-
-    # Used by Layer base class.
-    self._dtype = None
-    self._activity_regularizer = None
-
-    # The following properties are not actually used by Keras;
-    # they exist for compatibility with TF's variable scoping mechanism.
-    self._updates = []
-    self._losses = []
-    self._scope = None
-    self._reuse = None
-    self._base_name = name
-    self._graph = ops.get_default_graph()
-
-    # Add to the model any layers passed to the constructor.
-    if layers:
-      for layer in layers:
-        self.add(layer)
-
-  def add(self, layer):
-    """Adds a layer instance on top of the layer stack.
-
-    Arguments:
-        layer: layer instance.
-
-    Raises:
-        TypeError: If `layer` is not a layer instance.
-        ValueError: In case the `layer` argument does not
-            know its input shape.
-        ValueError: In case the `layer` argument has
-            multiple output tensors, or is already connected
-            somewhere else (forbidden in `Sequential` models).
-    """
-    if not isinstance(layer, (Layer, TFBaseLayer)):
-      raise TypeError('The added layer must be '
-                      'an instance of class Layer. '
-                      'Found: ' + str(layer))
-    if not self.outputs:
-      # First layer in model: check that it is an input layer.
-      if not isinstance(layer, InputLayer):
-        # Create an input layer.
-        # First, we need to infer its expected input shape and dtype.
-        if isinstance(layer, (Model, Sequential)):
-          # We were passed a model as first layer.
-          # This requires a specific way to figure out the
-          # input shape and dtype.
-          if not layer.layers:
-            raise ValueError('Cannot add an empty model '
-                             'to a `Sequential` model.')
-          # In case of nested models: recover the first layer
-          # of the deepest model to infer input shape and dtype.
-          first_layer = layer.layers[0]
-          while isinstance(first_layer, (Model, Sequential)):
-            first_layer = first_layer.layers[0]
-          batch_shape = first_layer._batch_input_shape
-          dtype = first_layer.dtype
-        else:
-          # We were passed a regular layer, and it should
-          # know about its input shape. Otherwise, that's an error.
-          if not hasattr(layer, '_batch_input_shape'):
-            raise ValueError('The first layer in a '
-                             'Sequential model must '
-                             'get an `input_shape` argument.')
-          batch_shape = layer._batch_input_shape
-          dtype = layer.dtype
-        # Instantiate the input layer.
-        x = Input(
-            batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input')
-        # This will build the current layer
-        # and create the node connecting the current layer
-        # to the input layer we just created.
-        layer(x)
-
-      if len(layer._inbound_nodes[-1].output_tensors) != 1:
-        raise ValueError('All layers in a Sequential model '
-                         'should have a single output tensor. '
-                         'For multi-output layers, '
-                         'use the functional API.')
-
-      self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
-      self.inputs = topology.get_source_inputs(self.outputs[0])
-
-      # We create an input node, which we will keep updated
-      # as we add more layers
-      topology.Node(
-          outbound_layer=self,
-          inbound_layers=[],
-          node_indices=[],
-          tensor_indices=[],
-          input_tensors=self.inputs,
-          output_tensors=self.outputs)
-    else:
-      output_tensor = layer(self.outputs[0])
-      if isinstance(output_tensor, list):
-        raise TypeError('All layers in a Sequential model '
-                        'should have a single output tensor. '
-                        'For multi-output layers, '
-                        'use the functional API.')
-      self.outputs = [output_tensor]
-      # update self._inbound_nodes
-      self._inbound_nodes[0].output_tensors = self.outputs
-      self._inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
-
-    self._layers.append(layer)
-    self.built = False
-
-  def pop(self):
-    """Removes the last layer in the model.
-
-    Raises:
-        TypeError: if there are no layers in the model.
-    """
-    if not self.layers:
-      raise TypeError('There are no layers in the model.')
-
-    self.layers.pop()
-    if not self.layers:
-      self.outputs = []
-      self._inbound_nodes = []
-      self._outbound_nodes = []
-    else:
-      self.layers[-1]._outbound_nodes = []
-      self.outputs = [self.layers[-1].output]
-      # update self._inbound_nodes
-      self._inbound_nodes[0].output_tensors = self.outputs
-      self._inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
-    self.built = False
-
-  def get_layer(self, name=None, index=None):
-    """Retrieve a layer that is part of the model.
-
-    Returns a layer based on either its name (unique)
-    or its index in the graph. Indices are based on
-    order of horizontal graph traversal (bottom-up).
-
-    Arguments:
-        name: string, name of layer.
-        index: integer, index of layer.
-
-    Returns:
-        A layer instance.
-    """
-    if not self.built:
-      self.build()
-    return self.model.get_layer(name, index)
-
-  def call(self, inputs, mask=None):
-    if not self.built:
-      self.build()
-    return self.model.call(inputs, mask)
-
-  def build(self, input_shape=None):
-    if not self.inputs or not self.outputs:
-      raise TypeError('Sequential model cannot be built: model is empty.'
-                      ' Add some layers first.')
-    # actually create the model
-    self.model = Model(self.inputs, self.outputs[0], name=self.name + '_model')
-    self.model.trainable = self.trainable
-
-    # mirror model attributes
-    self.supports_masking = self.model.supports_masking
-    self._output_mask_cache = self.model._output_mask_cache
-    self._output_tensor_cache = self.model._output_tensor_cache
-    self._output_shape_cache = self.model._output_shape_cache
-    self._input_layers = self.model._input_layers
-    self._output_layers = self.model._output_layers
-    self._input_coordinates = self.model._input_coordinates
-    self._output_coordinates = self.model._output_coordinates
-    self._nodes_by_depth = self.model._nodes_by_depth
-    self._network_nodes = self.model._network_nodes
-    self.output_names = self.model.output_names
-    self.input_names = self.model.input_names
-    self._feed_input_names = self.model._feed_input_names
-    self._feed_inputs = self.model._feed_inputs
-
-    # Make sure child model callbacks
-    # will call the parent Sequential model.
-    self.model.callback_model = self
-
-    self.built = True
-
-  @property
-  def uses_learning_phase(self):
-    if not self.built:
-      self.build()
-    return self.model.uses_learning_phase
-
-  def _gather_list_attr(self, attr):
-    all_attrs = []
-    for layer in self.layers:
-      all_attrs += getattr(layer, attr, [])
-    return all_attrs
-
-  @property
-  def trainable(self):
-    return self._trainable
-
-  @trainable.setter
-  def trainable(self, value):
-    if self.model:
-      self.model.trainable = value
-    self._trainable = value
-
-  @property
-  def trainable_weights(self):
-    if not self.trainable:
-      return []
-    return self._gather_list_attr('trainable_weights')
-
-  @property
-  def non_trainable_weights(self):
-    weights = self._gather_list_attr('non_trainable_weights')
-    if not self.trainable:
-      trainable_weights = self._gather_list_attr('trainable_weights')
-      return trainable_weights + weights
-    return weights
-
-  @property
-  def regularizers(self):
-    if not self.built:
-      self.build()
-    return self.model.regularizers
-
-  def get_weights(self):
-    """Retrieves the weights of the model.
-
-    Returns:
-        A flat list of Numpy arrays
-        (one array per model weight).
-    """
-    if not self.built:
-      self.build()
-    return self.model.get_weights()
-
-  def set_weights(self, weights):
-    """Sets the weights of the model.
-
-    Arguments:
-        weights: Should be a list
-            of Numpy arrays with shapes and types matching
-            the output of `model.get_weights()`.
-    """
-    if not self.built:
-      self.build()
-    self.model.set_weights(weights)
-
-  def load_weights(self, filepath, by_name=False):
-    if h5py is None:
-      raise ImportError('`load_weights` requires h5py.')
-    f = h5py.File(filepath, mode='r')
-    if 'layer_names' not in f.attrs and 'model_weights' in f:
-      f = f['model_weights']
-    layers = self.layers
-    if by_name:
-      topology.load_weights_from_hdf5_group_by_name(f, layers)
-    else:
-      topology.load_weights_from_hdf5_group(f, layers)
-    if hasattr(f, 'close'):
-      f.close()
-
-  def save_weights(self, filepath, overwrite=True):
-    if h5py is None:
-      raise ImportError('`save_weights` requires h5py.')
-    # If file exists and should not be overwritten:
-    if not overwrite and os.path.isfile(filepath):
-      proceed = ask_to_proceed_with_overwrite(filepath)
-      if not proceed:
-        return
-    layers = self.layers
-    f = h5py.File(filepath, 'w')
-    topology.save_weights_to_hdf5_group(f, layers)
-    f.flush()
-    f.close()
-
-  def compile(self,
-              optimizer,
-              loss,
-              metrics=None,
-              sample_weight_mode=None,
-              weighted_metrics=None,
-              target_tensors=None,
-              **kwargs):
-    """Configures the model for training.
-
-    Arguments:
-        optimizer: String (name of optimizer) or optimizer object.
-            See [optimizers](/optimizers).
-        loss: String (name of objective function) or objective function.
-            See [losses](/losses).
-            If the model has multiple outputs, you can use a different loss
-            on each output by passing a dictionary or a list of losses.
-            The loss value that will be minimized by the model
-            will then be the sum of all individual losses.
-        metrics: List of metrics to be evaluated by the model
-            during training and testing.
-            Typically you will use `metrics=['accuracy']`.
-            To specify different metrics for different outputs of a
-            multi-output model, you could also pass a dictionary,
-            such as `metrics={'output_a': 'accuracy'}`.
-        sample_weight_mode: If you need to do timestep-wise
-            sample weighting (2D weights), set this to `"temporal"`.
-            `None` defaults to sample-wise weights (1D).
-            If the model has multiple outputs, you can use a different
-            `sample_weight_mode` on each output by passing a
-            dictionary or a list of modes.
-        weighted_metrics: list of metrics to be evaluated and weighted
-             by `sample_weight` or `class_weight` during training and testing.
-        target_tensors: By default, Keras will create a placeholder for the
-            model's target, which will be fed with the target data during
-            training. If instead you would like to use your own
-            target tensor (in turn, Keras will not expect external
-            Numpy data for these targets at training time), you
-            can specify them via the `target_tensors` argument.
-            It should be a single tensor
-            (for a single-output `Sequential` model).
-        **kwargs: These arguments are passed into `tf.Session.run`.
-
-    Example:
-        ```python
-            model = Sequential()
-            model.add(Dense(32, input_shape=(500,)))
-            model.add(Dense(10, activation='softmax'))
-            model.compile(optimizer='rmsprop',
-                          loss='categorical_crossentropy',
-                          metrics=['accuracy'])
-        ```
-    """
-    # create the underlying model
-    self.build()
-    # call compile method of Model class
-    self.model.compile(
-        optimizer,
-        loss,
-        metrics=metrics,
-        sample_weight_mode=sample_weight_mode,
-        weighted_metrics=weighted_metrics,
-        target_tensors=target_tensors,
-        **kwargs)
-    self.optimizer = self.model.optimizer
-    self.loss = self.model.loss
-    self.metrics = self.model.metrics
-    self.loss_weights = self.model.loss_weights
-    self.sample_weight_mode = self.model.sample_weight_mode
-    self.weighted_metrics = self.model.weighted_metrics
-    self.targets = self.model.targets
-    self.metrics_tensors = self.model.metrics_tensors
-    self.metrics_names = self.model.metrics_names
-    self.sample_weights = self.model.sample_weights
-    self.total_loss = self.model.total_loss
-
-  def fit(self,
-          x=None,
-          y=None,
-          batch_size=None,
-          epochs=1,
-          verbose=1,
-          callbacks=None,
-          validation_split=0.,
-          validation_data=None,
-          shuffle=True,
-          class_weight=None,
-          sample_weight=None,
-          initial_epoch=0,
-          steps_per_epoch=None,
-          validation_steps=None,
-          **kwargs):
-    """Trains the model for a fixed number of epochs.
-
-    Arguments:
-        x: Numpy array of training data.
-            If the input layer in the model is named, you can also pass a
-            dictionary mapping the input name to a Numpy array.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data.
-            If the output layer in the model is named, you can also pass a
-            dictionary mapping the output name to a Numpy array.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, it will default to 32.
-        epochs: Integer. Number of epochs to train the model.
-            An epoch is an iteration over the entire `x` and `y`
-            data provided.
-            Note that in conjunction with `initial_epoch`,
-            `epochs` is to be understood as "final epoch".
-            The model is not trained for a number of iterations
-            given by `epochs`, but merely until the epoch
-            of index `epochs` is reached.
-        verbose: 0, 1, or 2. Verbosity mode.
-            0 = silent, 1 = progress bar, 2 = one line per epoch.
-        callbacks: List of `keras.callbacks.Callback` instances.
-            List of callbacks to apply during training.
-            See [callbacks](/callbacks).
-        validation_split: Float between 0 and 1:
-            Fraction of the training data to be used as validation data.
-            The model will set apart this fraction of the training data,
-            will not train on it, and will evaluate
-            the loss and any model metrics
-            on this data at the end of each epoch.
-            The validation data is selected from the last samples
-            in the `x` and `y` data provided, before shuffling.
-        validation_data: tuple `(x_val, y_val)` or tuple
-            `(x_val, y_val, val_sample_weights)` on which to evaluate
-            the loss and any model metrics at the end of each epoch.
-            The model will not be trained on this data.
-            This will override `validation_split`.
-        shuffle: Boolean (whether to shuffle the training data
-            before each epoch) or str (for 'batch').
-            'batch' is a special option for dealing with the
-            limitations of HDF5 data; it shuffles in batch-sized chunks.
-            Has no effect when `steps_per_epoch` is not `None`.
-        class_weight: Optional dictionary mapping class indices (integers)
-            to a weight (float) value, used for weighting the loss function
-            (during training only).
-            This can be useful to tell the model to
-            "pay more attention" to samples from
-            an under-represented class.
-        sample_weight: Optional Numpy array of weights for
-            the training samples, used for weighting the loss function
-            (during training only). You can either pass a flat (1D)
-            Numpy array with the same length as the input samples
-            (1:1 mapping between weights and samples),
-            or in the case of temporal data,
-            you can pass a 2D array with shape
-            `(samples, sequence_length)`,
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`.
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run).
-        steps_per_epoch: Total number of steps (batches of samples)
-            before declaring one epoch finished and starting the
-            next epoch. When training with input tensors such as
-            TensorFlow data tensors, the default `None` is equal to
-            the number of unique samples in your dataset divided by
-            the batch size, or 1 if that cannot be determined.
-        validation_steps: Only relevant if `steps_per_epoch`
-            is specified. Total number of steps (batches of samples)
-            to validate before stopping.
-        **kwargs: Used for backwards compatibility support.
-
-    Returns:
-        A `History` object. Its `History.history` attribute is
-        a record of training loss values and metrics values
-        at successive epochs, as well as validation loss values
-        and validation metrics values (if applicable).
-
-    Raises:
-        RuntimeError: If the model was never compiled.
-        ValueError: In case of mismatch between the provided input data
-            and what the model expects.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.fit(
-        x,
-        y,
-        batch_size=batch_size,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        validation_split=validation_split,
-        validation_data=validation_data,
-        shuffle=shuffle,
-        class_weight=class_weight,
-        sample_weight=sample_weight,
-        initial_epoch=initial_epoch,
-        steps_per_epoch=steps_per_epoch,
-        validation_steps=validation_steps)
-
-  def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):
-    """Computes the loss on some input data, batch by batch.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        batch_size: integer. Number of samples per gradient update.
-        verbose: verbosity mode, 0 or 1.
-        sample_weight: sample weights, as a Numpy array.
-
-    Returns:
-        Scalar test loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.evaluate(
-        x,
-        y,
-        batch_size=batch_size,
-        verbose=verbose,
-        sample_weight=sample_weight)
-
-  def predict(self, x, batch_size=32, verbose=0):
-    """Generates output predictions for the input samples.
-
-    The input samples are processed batch by batch.
-
-    Arguments:
-        x: the input data, as a Numpy array.
-        batch_size: integer.
-        verbose: verbosity mode, 0 or 1.
-
-    Returns:
-        A Numpy array of predictions.
-    """
-    if not self.built:
-      self.build()
-    return self.model.predict(x, batch_size=batch_size, verbose=verbose)
-
-  def predict_on_batch(self, x):
-    """Returns predictions for a single batch of samples.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-
-    Returns:
-        A Numpy array of predictions.
-    """
-    if not self.built:
-      self.build()
-    return self.model.predict_on_batch(x)
-
-  def train_on_batch(self, x, y, class_weight=None, sample_weight=None):
-    """Single gradient update over one batch of samples.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        class_weight: dictionary mapping classes to a weight value,
-            used for scaling the loss function (during training only).
-        sample_weight: sample weights, as a Numpy array.
-
-    Returns:
-        Scalar training loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.train_on_batch(
-        x, y, sample_weight=sample_weight, class_weight=class_weight)
-
-  def test_on_batch(self, x, y, sample_weight=None):
-    """Evaluates the model over a single batch of samples.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        sample_weight: sample weights, as a Numpy array.
-
-    Returns:
-        Scalar test loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.test_on_batch(x, y, sample_weight=sample_weight)
-
-  def predict_proba(self, x, batch_size=32, verbose=0):
-    """Generates class probability predictions for the input samples.
-
-    The input samples are processed batch by batch.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        batch_size: integer.
-        verbose: verbosity mode, 0 or 1.
-
-    Returns:
-        A Numpy array of probability predictions.
-    """
-    preds = self.predict(x, batch_size, verbose)
-    if preds.min() < 0. or preds.max() > 1.:
-      logging.warning('Network returning invalid probability values. '
-                      'The last layer might not normalize predictions '
-                      'into probabilities '
-                      '(like softmax or sigmoid would).')
-    return preds
-
-  def predict_classes(self, x, batch_size=32, verbose=0):
-    """Generate class predictions for the input samples.
-
-    The input samples are processed batch by batch.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        batch_size: integer.
-        verbose: verbosity mode, 0 or 1.
-
-    Returns:
-        A numpy array of class predictions.
-    """
-    proba = self.predict(x, batch_size=batch_size, verbose=verbose)
-    if proba.shape[-1] > 1:
-      return proba.argmax(axis=-1)
-    else:
-      return (proba > 0.5).astype('int32')
-
-  def fit_generator(self,
-                    generator,
-                    steps_per_epoch=None,
-                    epochs=1,
-                    verbose=1,
-                    callbacks=None,
-                    validation_data=None,
-                    validation_steps=None,
-                    class_weight=None,
-                    max_queue_size=10,
-                    workers=1,
-                    use_multiprocessing=False,
-                    shuffle=True,
-                    initial_epoch=0,
-                    **kwargs):
-    """Fits the model on data generated batch-by-batch by a Python generator.
-
-    The generator is run in parallel to the model, for efficiency.
-    For instance, this allows you to do real-time data augmentation
-    on images on CPU in parallel to training your model on GPU.
-
-    Arguments:
-        generator: A generator.
-            The output of the generator must be either
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-            All arrays should contain the same number of samples.
-            The generator is expected to loop over its data
-            indefinitely. An epoch finishes when `steps_per_epoch`
-            batches have been seen by the model.
-        steps_per_epoch: Total number of steps (batches of samples)
-            to yield from `generator` before declaring one epoch
-            finished and starting the next epoch. It should typically
-            be equal to the number of samples of your dataset
-            divided by the batch size.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        epochs: Integer, total number of iterations on the data.
-            Note that in conjunction with initial_epoch, the parameter
-            epochs is to be understood as "final epoch". The model is
-            not trained for n steps given by epochs, but until the
-            epoch epochs is reached.
-        verbose: Verbosity mode, 0, 1, or 2.
-        callbacks: List of callbacks to be called during training.
-        validation_data: This can be either
-            - A generator for the validation data
-            - A tuple (inputs, targets)
-            - A tuple (inputs, targets, sample_weights).
-        validation_steps: Only relevant if `validation_data`
-            is a generator.
-            Number of steps to yield from validation generator
-            at the end of every epoch. It should typically
-            be equal to the number of samples of your
-            validation dataset divided by the batch size.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(validation_data)` as a number of steps.
-        class_weight: Dictionary mapping class indices to a weight
-            for the class.
-        max_queue_size: Maximum size for the generator queue
-        workers: Maximum number of processes to spin up
-        use_multiprocessing: If True, use process based threading.
-            Note that because
-            this implementation relies on multiprocessing,
-            you should not pass
-            non picklable arguments to the generator
-            as they can't be passed
-            easily to children processes.
-       shuffle: Whether to shuffle the order of the batches at
-              the beginning of each epoch. Only used with instances
-              of `Sequence` (keras.utils.Sequence).
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run)
-        **kwargs: support for legacy arguments.
-
-    Returns:
-        A `History` object.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-        ValueError: In case the generator yields
-            data in an invalid format.
-
-    Example:
-
-    ```python
-        def generate_arrays_from_file(path):
-            while 1:
-                f = open(path)
-                for line in f:
-                    # create Numpy arrays of input data
-                    # and labels, from each line in the file
-                    x, y = process_line(line)
-                    yield (x, y)
-                    f.close()
-
-        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
-                            steps_per_epoch=1000, epochs=10)
-    ```
-    """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.fit_generator(
-        generator,
-        steps_per_epoch,
-        epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        validation_data=validation_data,
-        validation_steps=validation_steps,
-        class_weight=class_weight,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        shuffle=shuffle,
-        initial_epoch=initial_epoch)
-
-  def evaluate_generator(self,
-                         generator,
-                         steps=None,
-                         max_queue_size=10,
-                         workers=1,
-                         use_multiprocessing=False,
-                         **kwargs):
-    """Evaluates the model on a data generator.
-
-    The generator should return the same kind of data
-    as accepted by `test_on_batch`.
-
-    Arguments:
-        generator: Generator yielding tuples (inputs, targets)
-            or (inputs, targets, sample_weights)
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: maximum size for the generator queue
-        workers: maximum number of processes to spin up
-        use_multiprocessing: if True, use process based threading.
-            Note that because this implementation
-            relies on multiprocessing, you should not pass
-            non picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        **kwargs: support for legacy arguments.
-
-    Returns:
-        Scalar test loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-        ValueError: In case the generator yields
-            data in an invalid format.
-    """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.evaluate_generator(
-        generator,
-        steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing)
-
-  def predict_generator(self,
-                        generator,
-                        steps=None,
-                        max_queue_size=10,
-                        workers=1,
-                        use_multiprocessing=False,
-                        verbose=0,
-                        **kwargs):
-    """Generates predictions for the input samples from a data generator.
-
-    The generator should return the same kind of data as accepted by
-    `predict_on_batch`.
-
-    Arguments:
-        generator: generator yielding batches of input samples.
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: maximum size for the generator queue
-        workers: maximum number of processes to spin up
-        use_multiprocessing: if True, use process based threading.
-            Note that because this implementation
-            relies on multiprocessing, you should not pass
-            non picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        verbose: verbosity mode, 0 or 1.
-        **kwargs: support for legacy arguments.
-
-    Returns:
-        A Numpy array of predictions.
-
-    Raises:
-        ValueError: In case the generator yields
-            data in an invalid format.
-    """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    if not self.built:
-      self.build()
-    return self.model.predict_generator(
-        generator,
-        steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        verbose=verbose)
 
-  def get_config(self):
-    config = []
-    for layer in self.layers:
-      config.append({
-          'class_name': layer.__class__.__name__,
-          'config': layer.get_config()
-      })
-    return copy.deepcopy(config)
 
-  @classmethod
-  def from_config(cls, config, custom_objects=None):
-    model = cls()
-    for conf in config:
-      layer = layer_module.deserialize(conf, custom_objects=custom_objects)
-      model.add(layer)
-    return model
+# API entries importable from `keras.models`:
+Model = training.Model  # pylint: disable=invalid-name
+Sequential = sequential.Sequential  # pylint: disable=invalid-name
+save_model = saving.save_model
+load_model = saving.load_model
+model_from_config = saving.model_from_config
+model_from_yaml = saving.model_from_yaml
+model_from_json = saving.model_from_json
 
 
 def _clone_functional_model(model, input_tensors=None):
@@ -1365,7 +90,7 @@ def _clone_functional_model(model, input_tensors=None):
   else:
     # Make sure that all input tensors come from a Keras layer.
     # If tensor comes from an input layer: cache the input layer.
-    input_tensors = topology._to_list(input_tensors)
+    input_tensors = generic_utils.to_list(input_tensors)
     input_tensors_ = []
     for i, x in enumerate(input_tensors):
       if not K.is_keras_tensor(x):
@@ -1402,7 +127,7 @@ def _clone_functional_model(model, input_tensors=None):
         # Reuse previously cloned layer.
         layer = layer_map[layer]
         # Don't call InputLayer multiple times.
-        if isinstance(layer, topology.InputLayer):
+        if isinstance(layer, InputLayer):
           continue
 
       # Gather inputs to call the new layer.
@@ -1427,8 +152,9 @@ def _clone_functional_model(model, input_tensors=None):
           if has_arg(layer.call, 'mask'):
             if 'mask' not in kwargs:
               kwargs['mask'] = computed_mask
-          output_tensors = topology._to_list(layer(computed_tensor, **kwargs))
-          output_masks = topology._to_list(
+          output_tensors = generic_utils.to_list(layer(computed_tensor,
+                                                       **kwargs))
+          output_masks = generic_utils.to_list(
               layer.compute_mask(computed_tensor, computed_mask))
           computed_tensors = [computed_tensor]
           computed_masks = [computed_mask]
@@ -1438,8 +164,9 @@ def _clone_functional_model(model, input_tensors=None):
           if has_arg(layer.call, 'mask'):
             if 'mask' not in kwargs:
               kwargs['mask'] = computed_masks
-          output_tensors = topology._to_list(layer(computed_tensors, **kwargs))
-          output_masks = topology._to_list(
+          output_tensors = generic_utils.to_list(layer(computed_tensors,
+                                                       **kwargs))
+          output_masks = generic_utils.to_list(
               layer.compute_mask(computed_tensors, computed_masks))
         # Update tensor_map.
         for x, y, mask in zip(reference_output_tensors, output_tensors,
@@ -1489,14 +216,14 @@ def _clone_sequential_model(model, input_tensors=None):
   if input_tensors is None:
     return Sequential(layers=layers, name=model.name)
   else:
-    if len(topology._to_list(input_tensors)) != 1:
+    if len(generic_utils.to_list(input_tensors)) != 1:
       raise ValueError('To clone a `Sequential` model, we expect '
                        ' at most one tensor '
                        'as part of `input_tensors`.')
-    x = topology._to_list(input_tensors)[0]
+    x = generic_utils.to_list(input_tensors)[0]
     if K.is_keras_tensor(x):
       origin_layer = x._keras_history[0]
-      if isinstance(origin_layer, topology.InputLayer):
+      if isinstance(origin_layer, InputLayer):
         return Sequential(layers=[origin_layer] + layers, name=model.name)
       else:
         raise ValueError('Cannot clone a `Sequential` model on top '
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/_impl/keras/models_test.py
index 04017e4b28b27e52f88a7746fc44510c29edffce..5978ddd987c63b9d87a31be6837172f08512ef73 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/_impl/keras/models_test.py
@@ -12,362 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for training routines."""
+"""Tests for `models.py` (model cloning, mainly)."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import shutil
-import tempfile
-
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.platform import test
-from tensorflow.python.training import training as training_module
-
-try:
-  import h5py  # pylint:disable=g-import-not-at-top
-except ImportError:
-  h5py = None
-
-
-class TestModelSaving(test.TestCase):
-
-  def test_sequential_model_saving(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.RepeatVector(3))
-      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
-      model.compile(loss=keras.losses.MSE,
-                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
-                    metrics=[keras.metrics.categorical_accuracy],
-                    sample_weight_mode='temporal')
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3, 3))
-      model.train_on_batch(x, y)
-
-      out = model.predict(x)
-      fd, fname = tempfile.mkstemp('.h5')
-      keras.models.save_model(model, fname)
-
-      new_model = keras.models.load_model(fname)
-      os.close(fd)
-      os.remove(fname)
-
-      out2 = new_model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
-
-      # test that new updates are the same with both models
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3, 3))
-      model.train_on_batch(x, y)
-      new_model.train_on_batch(x, y)
-      out = model.predict(x)
-      out2 = new_model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
-
-  def test_sequential_model_saving_2(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.test_session():
-      # test with custom optimizer, loss
-
-      class CustomOp(keras.optimizers.RMSprop):
-        pass
-
-      def custom_loss(y_true, y_pred):
-        return keras.losses.mse(y_true, y_pred)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(loss=custom_loss, optimizer=CustomOp(), metrics=['acc'])
-
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-
-      out = model.predict(x)
-      fd, fname = tempfile.mkstemp('.h5')
-      keras.models.save_model(model, fname)
-
-      model = keras.models.load_model(
-          fname,
-          custom_objects={'CustomOp': CustomOp,
-                          'custom_loss': custom_loss})
-      os.close(fd)
-      os.remove(fname)
-
-      out2 = model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
-
-  def test_functional_model_saving(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.test_session():
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      output = keras.layers.Dense(3)(x)
-
-      model = keras.models.Model(inputs, output)
-      model.compile(loss=keras.losses.MSE,
-                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
-                    metrics=[keras.metrics.categorical_accuracy])
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-
-      out = model.predict(x)
-      fd, fname = tempfile.mkstemp('.h5')
-      keras.models.save_model(model, fname)
-
-      model = keras.models.load_model(fname)
-      os.close(fd)
-      os.remove(fname)
-
-      out2 = model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
-
-  def test_saving_without_compilation(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
-
-      fd, fname = tempfile.mkstemp('.h5')
-      keras.models.save_model(model, fname)
-      model = keras.models.load_model(fname)
-      os.close(fd)
-      os.remove(fname)
-
-  def test_saving_with_tf_optimizer(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(loss='mse',
-                    optimizer=training_module.AdadeltaOptimizer(0.1),
-                    metrics=['acc'])
-
-      fd, fname = tempfile.mkstemp('.h5')
-      keras.models.save_model(model, fname)
-      model = keras.models.load_model(fname)
-      os.close(fd)
-      os.remove(fname)
-
-  def test_saving_right_after_compilation(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
-      model.model._make_train_function()
-
-      fd, fname = tempfile.mkstemp('.h5')
-      keras.models.save_model(model, fname)
-      model = keras.models.load_model(fname)
-      os.close(fd)
-      os.remove(fname)
-
-  def test_saving_lambda_numpy_array_arguments(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    mean = np.random.random((4, 2, 3))
-    std = np.abs(np.random.random((4, 2, 3))) + 1e-5
-    inputs = keras.layers.Input(shape=(4, 2, 3))
-    output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std,
-                                 arguments={'mu': mean, 'std': std})(inputs)
-    model = keras.models.Model(inputs, output)
-    model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
-
-    fd, fname = tempfile.mkstemp('.h5')
-    keras.models.save_model(model, fname)
-
-    model = keras.models.load_model(fname)
-    os.close(fd)
-    os.remove(fname)
-
-    self.assertAllClose(mean, model.layers[1].arguments['mu'])
-    self.assertAllClose(std, model.layers[1].arguments['std'])
-
-
-class TestSequential(test.TestCase):
-  """Most Sequential model API tests are covered in `training_test.py`.
-  """
-
-  def test_basic_methods(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(1, input_dim=2))
-    model.add(keras.layers.Dropout(0.3, name='dp'))
-    model.add(keras.layers.Dense(2, kernel_regularizer='l2',
-                                 kernel_constraint='max_norm'))
-    model.build()
-    self.assertEqual(model.state_updates, model.model.state_updates)
-    self.assertEqual(model.get_layer(name='dp').name, 'dp')
-
-  def test_sequential_pop(self):
-    num_hidden = 5
-    input_dim = 3
-    batch_size = 5
-    num_classes = 2
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
-      model.add(keras.layers.Dense(num_classes))
-      model.compile(loss='mse', optimizer='sgd')
-      x = np.random.random((batch_size, input_dim))
-      y = np.random.random((batch_size, num_classes))
-      model.fit(x, y, epochs=1)
-      model.pop()
-      self.assertEqual(len(model.layers), 1)
-      self.assertEqual(model.output_shape, (None, num_hidden))
-      model.compile(loss='mse', optimizer='sgd')
-      y = np.random.random((batch_size, num_hidden))
-      model.fit(x, y, epochs=1)
-
-      # Test popping single-layer model
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
-      model.pop()
-      self.assertEqual(len(model.layers), 0)
-      self.assertEqual(len(model.outputs), 0)
-
-      # Invalid use case
-      model = keras.models.Sequential()
-      with self.assertRaises(TypeError):
-        model.pop()
-
-  def test_sequential_weight_loading(self):
-    if h5py is None:
-      return
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir)
-    h5_path = os.path.join(temp_dir, 'test.h5')
-
-    num_hidden = 5
-    input_dim = 3
-    batch_size = 5
-    num_classes = 2
-
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
-      model.add(keras.layers.Dense(num_classes))
-
-      x = np.random.random((batch_size, input_dim))
-      ref_y = model.predict(x)
-
-      model.save_weights(h5_path)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
-      model.add(keras.layers.Dense(num_classes))
-      model.load_weights(h5_path)
-      y = model.predict(x)
-
-      self.assertAllClose(y, ref_y)
-
-  def test_invalid_use_cases(self):
-    with self.test_session():
-      # Added objects must be layer instances
-      with self.assertRaises(TypeError):
-        model = keras.models.Sequential()
-        model.add(None)
-
-      # Added layers must have an inputs shape
-      with self.assertRaises(ValueError):
-        model = keras.models.Sequential()
-        model.add(keras.layers.Dense(1))
-
-      # Added layers cannot have multiple outputs
-      class MyLayer(keras.layers.Layer):
-
-        def call(self, inputs):
-          return [3 * inputs, 2 * inputs]
-
-        def compute_output_shape(self, input_shape):
-          return [input_shape, input_shape]
-
-      with self.assertRaises(ValueError):
-        model = keras.models.Sequential()
-        model.add(MyLayer(input_shape=(3,)))
-      with self.assertRaises(TypeError):
-        model = keras.models.Sequential()
-        model.add(keras.layers.Dense(1, input_dim=1))
-        model.add(MyLayer())
-
-      # Building empty model
-      model = keras.models.Sequential()
-      with self.assertRaises(TypeError):
-        model.build()
-
-  def test_nested_sequential_trainability(self):
-    input_dim = 20
-    num_units = 10
-    num_classes = 2
-
-    inner_model = keras.models.Sequential()
-    inner_model.add(keras.layers.Dense(num_units, input_shape=(input_dim,)))
-
-    model = keras.models.Sequential()
-    model.add(inner_model)
-    model.add(keras.layers.Dense(num_classes))
-
-    self.assertEqual(len(model.trainable_weights), 4)
-    inner_model.trainable = False
-    self.assertEqual(len(model.trainable_weights), 2)
-    inner_model.trainable = True
-    self.assertEqual(len(model.trainable_weights), 4)
-
-  def test_sequential_update_disabling(self):
-    val_a = np.random.random((10, 4))
-    val_out = np.random.random((10, 4))
-
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.BatchNormalization(input_shape=(4,)))
-
-      model.trainable = False
-      assert not model.updates
-
-      model.compile('sgd', 'mse')
-      assert not model.updates
-      assert not model.model.updates
-
-      x1 = model.predict(val_a)
-      model.train_on_batch(val_a, val_out)
-      x2 = model.predict(val_a)
-      self.assertAllClose(x1, x2, atol=1e-7)
-
-      model.trainable = True
-      model.compile('sgd', 'mse')
-      assert model.updates
-      assert model.model.updates
-
-      model.train_on_batch(val_a, val_out)
-      x2 = model.predict(val_a)
-      assert np.abs(np.sum(x1 - x2)) > 1e-5
 
 
 class TestModelCloning(test.TestCase):
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index 76a97156ed7d9ca89b0d94f31bed3a23eca9609d..9f383deb725ac69bf2f17f3627010c4e1f567ef0 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -31,7 +31,10 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -95,9 +98,29 @@ class Optimizer(object):
     raise NotImplementedError
 
   def get_gradients(self, loss, params):
+    """Returns gradients of `loss` with respect to `params`.
+
+    Arguments:
+        loss: Loss tensor.
+        params: List of variables.
+
+    Returns:
+        List of gradient tensors.
+
+    Raises:
+        ValueError: In case any gradient cannot be computed (e.g. if gradient
+          function not implemented).
+    """
     grads = K.gradients(loss, params)
+    if None in grads:
+      raise ValueError('An operation has `None` for gradient. '
+                       'Please make sure that all of your ops have a '
+                       'gradient defined (i.e. are differentiable). '
+                       'Common ops without gradient: '
+                       'K.argmax, K.round, K.eval.')
     if hasattr(self, 'clipnorm') and self.clipnorm > 0:
-      norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
+      norm = K.sqrt(
+          sum([math_ops.reduce_sum(math_ops.square(g)) for g in grads]))
       grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
     if hasattr(self, 'clipvalue') and self.clipvalue > 0:
       grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
@@ -120,6 +143,11 @@ class Optimizer(object):
         ValueError: in case of incompatible weight shapes.
     """
     params = self.weights
+    if len(params) != len(weights):
+      raise ValueError(
+          'Length of the specified weight list (' + str(len(weights)) +
+          ') does not match the number of weights '
+          'of the optimizer (' + str(len(params)) + ')')
     weight_value_tuples = []
     param_values = K.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
@@ -178,20 +206,20 @@ class SGD(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
     self.weights = [self.iterations] + moments
     for p, g, m in zip(params, grads, moments):
       v = self.momentum * m - lr * g  # velocity
-      self.updates.append(K.update(m, v))
+      self.updates.append(state_ops.assign(m, v))
 
       if self.nesterov:
         new_p = p + self.momentum * v - lr * g
@@ -202,7 +230,7 @@ class SGD(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -251,25 +279,25 @@ class RMSprop(Optimizer):
     grads = self.get_gradients(loss, params)
     accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -313,24 +341,24 @@ class Adagrad(Optimizer):
     shapes = [K.int_shape(p) for p in params]
     accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
-      new_a = a + K.square(g)  # update accumulator
-      self.updates.append(K.update(a, new_a))
+      new_a = a + math_ops.square(g)  # update accumulator
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -377,18 +405,18 @@ class Adadelta(Optimizer):
     accumulators = [K.zeros(shape) for shape in shapes]
     delta_accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators + delta_accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
+      self.updates.append(state_ops.assign(a, new_a))
 
       # use the new accumulator and the *old* delta_accumulator
       update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
@@ -398,11 +426,11 @@ class Adadelta(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
 
       # update delta_accumulator
-      new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
-      self.updates.append(K.update(d_a, new_d_a))
+      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
+      self.updates.append(state_ops.assign(d_a, new_d_a))
     return self.updates
 
   def get_config(self):
@@ -457,17 +485,18 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
     lr_t = lr * (
-        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
+        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
+        (1. - math_ops.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
@@ -479,23 +508,23 @@ class Adam(Optimizer):
 
     for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
       if self.amsgrad:
-        vhat_t = K.maximum(vhat, v_t)
+        vhat_t = math_ops.maximum(vhat, v_t)
         p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
-        self.updates.append(K.update(vhat, vhat_t))
+        self.updates.append(state_ops.assign(vhat, vhat_t))
       else:
         p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -547,16 +576,16 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr / (1. - K.pow(self.beta_1, t))
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
     # zero init of 1st moment
@@ -568,18 +597,18 @@ class Adamax(Optimizer):
     for p, g, m, u in zip(params, grads, ms, us):
 
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      u_t = K.maximum(self.beta_2 * u, K.abs(g))
+      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
       p_t = p - lr_t * m_t / (u_t + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(u, u_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(u, u_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -633,16 +662,17 @@ class Nadam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
-        1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
+        1. - 0.5 *
+        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
     momentum_cache_t_1 = self.beta_1 * (
         1. - 0.5 *
-        (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
+        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
     m_schedule_new = self.m_schedule * momentum_cache_t
     m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
     self.updates.append((self.m_schedule, m_schedule_new))
@@ -658,13 +688,13 @@ class Nadam(Optimizer):
       g_prime = g / (1. - m_schedule_new)
       m_t = self.beta_1 * m + (1. - self.beta_1) * g
       m_t_prime = m_t / (1. - m_schedule_next)
-      v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
-      v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
+      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
+      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
       m_t_bar = (
           1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
 
       p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
       new_p = p_t
@@ -673,7 +703,7 @@ class Nadam(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -704,10 +734,27 @@ class TFOptimizer(Optimizer):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    grads = self.optimizer.compute_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
-    opt_update = self.optimizer.apply_gradients(
-        grads, global_step=self.iterations)
+    if distribute_lib.has_distribution_strategy():
+      self.updates = []
+
+      if not params:
+        # After the model vars have been created, the second call to get_updates
+        # is called with params as an empty list. This ensures that we call
+        # compute_gradients with params=None.
+        grads = self.optimizer.compute_gradients(loss)
+      else:
+        grads = self.optimizer.compute_gradients(loss, params)
+      global_step = training_util.get_global_step()
+      opt_update = self.optimizer.apply_gradients(grads, global_step)
+    else:
+      self.updates = [state_ops.assign_add(self.iterations, 1)]
+      if not params:
+        return self.updates
+
+      grads = self.optimizer.compute_gradients(loss, params)
+      opt_update = self.optimizer.apply_gradients(
+          grads, global_step=self.iterations)
+
     self.updates.append(opt_update)
     return self.updates
 
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index d12f10863921ee7d635930f34e8bc701c89864e8..6299445c34b99f20d7ae5090fc979d0ac2611109 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -43,6 +43,7 @@ except ImportError:
 
 
 try:
+  from PIL import ImageEnhance
   from PIL import Image as pil_image
 except ImportError:
   pil_image = None
@@ -227,6 +228,32 @@ def random_channel_shift(x, intensity, channel_axis=0):
   return x
 
 
+@tf_export('keras.preprocessing.image.random_brightness')
+def random_brightness(x, brightness_range):
+  """Performs a random adjustment of brightness of a Numpy image tensor.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      brightness_range: Tuple of floats; range to pick a brightness value from.
+
+  Returns:
+      Brightness adjusted Numpy image tensor.
+
+  Raises:
+      ValueError: if `brightness_range` isn't a tuple.
+  """
+  if len(brightness_range) != 2:
+    raise ValueError('`brightness_range should be tuple or list of two floats. '
+                     'Received arg: ', brightness_range)
+
+  x = array_to_img(x)
+  x = ImageEnhance.Brightness(x)
+  u = np.random.uniform(brightness_range[0], brightness_range[1])
+  x = x.enhance(u)
+  x = img_to_array(x)
+  return x
+
+
 def transform_matrix_offset_center(matrix, x, y):
   o_x = float(x) / 2 + 0.5
   o_y = float(y) / 2 + 0.5
@@ -265,7 +292,7 @@ def apply_transform(x,
           x_channel,
           final_affine_matrix,
           final_offset,
-          order=0,
+          order=1,
           mode=fill_mode,
           cval=cval) for x_channel in x
   ]
@@ -436,6 +463,7 @@ class ImageDataGenerator(object):
       rotation_range: degrees (0 to 180).
       width_shift_range: fraction of total width, if < 1, or pixels if >= 1.
       height_shift_range: fraction of total height, if < 1, or pixels if >= 1.
+      brightness_range: the range of brightness to apply
       shear_range: shear intensity (shear angle in degrees).
       zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
           in the range [1-z, 1+z]. A sequence of two can be passed instead
@@ -469,6 +497,8 @@ class ImageDataGenerator(object):
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
+      validation_split: fraction of images reserved for validation (strictly
+        between 0 and 1).
   """
 
   def __init__(self,
@@ -481,6 +511,7 @@ class ImageDataGenerator(object):
                rotation_range=0.,
                width_shift_range=0.,
                height_shift_range=0.,
+               brightness_range=None,
                shear_range=0.,
                zoom_range=0.,
                channel_shift_range=0.,
@@ -490,7 +521,8 @@ class ImageDataGenerator(object):
                vertical_flip=False,
                rescale=None,
                preprocessing_function=None,
-               data_format=None):
+               data_format=None,
+               validation_split=0.0):
     if data_format is None:
       data_format = K.image_data_format()
     self.featurewise_center = featurewise_center
@@ -502,6 +534,7 @@ class ImageDataGenerator(object):
     self.rotation_range = rotation_range
     self.width_shift_range = width_shift_range
     self.height_shift_range = height_shift_range
+    self.brightness_range = brightness_range
     self.shear_range = shear_range
     self.zoom_range = zoom_range
     self.channel_shift_range = channel_shift_range
@@ -526,6 +559,10 @@ class ImageDataGenerator(object):
       self.channel_axis = 3
       self.row_axis = 1
       self.col_axis = 2
+    if validation_split and not 0 < validation_split < 1:
+      raise ValueError('`validation_split` must be strictly between 0 and 1. '
+                       'Received arg: ', validation_split)
+    self.validation_split = validation_split
 
     self.mean = None
     self.std = None
@@ -574,7 +611,8 @@ class ImageDataGenerator(object):
            seed=None,
            save_to_dir=None,
            save_prefix='',
-           save_format='png'):
+           save_format='png',
+           subset=None):
     return NumpyArrayIterator(
         x,
         y,
@@ -585,7 +623,8 @@ class ImageDataGenerator(object):
         data_format=self.data_format,
         save_to_dir=save_to_dir,
         save_prefix=save_prefix,
-        save_format=save_format)
+        save_format=save_format,
+        subset=subset)
 
   def flow_from_directory(self,
                           directory,
@@ -600,6 +639,7 @@ class ImageDataGenerator(object):
                           save_prefix='',
                           save_format='png',
                           follow_links=False,
+                          subset=None,
                           interpolation='nearest'):
     return DirectoryIterator(
         directory,
@@ -616,6 +656,7 @@ class ImageDataGenerator(object):
         save_prefix=save_prefix,
         save_format=save_format,
         follow_links=follow_links,
+        subset=subset,
         interpolation=interpolation)
 
   def standardize(self, x):
@@ -628,7 +669,7 @@ class ImageDataGenerator(object):
         The inputs, normalized.
     """
     if self.preprocessing_function:
-      x = self.preprocessing_function(x)
+      x = self.image_data_generator.preprocessing_function(x)
     if self.rescale:
       x *= self.rescale
     if self.samplewise_center:
@@ -762,6 +803,9 @@ class ImageDataGenerator(object):
       if np.random.random() < 0.5:
         x = flip_axis(x, img_row_axis)
 
+    if self.brightness_range is not None:
+      x = random_brightness(x, self.brightness_range)
+
     return x
 
   def fit(self, x, augment=False, rounds=1, seed=None):
@@ -828,12 +872,10 @@ class ImageDataGenerator(object):
         raise ImportError('Scipy is required for zca_whitening.')
 
       flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
-      num_examples = flat_x.shape[0]
-      _, s, vt = linalg.svd(flat_x / np.sqrt(num_examples))
-      s_expand = np.hstack(
-          (s, np.zeros(vt.shape[0] - num_examples, dtype=flat_x.dtype)))
-      self.principal_components = (
-          vt.T / np.sqrt(s_expand**2 + self.zca_epsilon)).dot(vt)
+      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
+      u, s, _ = linalg.svd(sigma)
+      s_inv = 1. / np.sqrt(s[np.newaxis] + self.zca_epsilon)
+      self.principal_components = (u * s_inv).dot(u.T)
 
 
 @tf_export('keras.preprocessing.image.Iterator')
@@ -947,6 +989,8 @@ class NumpyArrayIterator(Iterator):
           images (if `save_to_dir` is set).
       save_format: Format to use for saving sample images
           (if `save_to_dir` is set).
+      subset: Subset of data (`"training"` or `"validation"`) if
+          validation_split is set in ImageDataGenerator.
   """
 
   def __init__(self,
@@ -959,17 +1003,29 @@ class NumpyArrayIterator(Iterator):
                data_format=None,
                save_to_dir=None,
                save_prefix='',
-               save_format='png'):
+               save_format='png',
+               subset=None):
     if y is not None and len(x) != len(y):
-      raise ValueError('X (images tensor) and y (labels) '
+      raise ValueError('`x` (images tensor) and `y` (labels) '
                        'should have the same length. '
-                       'Found: X.shape = %s, y.shape = %s' %
+                       'Found: x.shape = %s, y.shape = %s' %
                        (np.asarray(x).shape, np.asarray(y).shape))
-
+    if subset is not None:
+      if subset not in {'training', 'validation'}:
+        raise ValueError('Invalid subset name:', subset,
+                         '; expected "training" or "validation".')
+      split_idx = int(len(x) * image_data_generator.validation_split)
+      if subset == 'validation':
+        x = x[:split_idx]
+        if y is not None:
+          y = y[:split_idx]
+      else:
+        x = x[split_idx:]
+        if y is not None:
+          y = y[split_idx:]
     if data_format is None:
       data_format = K.image_data_format()
     self.x = np.asarray(x, dtype=K.floatx())
-
     if self.x.ndim != 4:
       raise ValueError('Input data in `NumpyArrayIterator` '
                        'should have rank 4. You passed an array '
@@ -1032,8 +1088,7 @@ class NumpyArrayIterator(Iterator):
     return self._get_batches_of_transformed_samples(index_array)
 
 
-def _count_valid_files_in_directory(directory, white_list_formats,
-                                    follow_links):
+def _iter_valid_files(directory, white_list_formats, follow_links):
   """Count files with extension in `white_list_formats` contained in directory.
 
   Arguments:
@@ -1043,29 +1098,54 @@ def _count_valid_files_in_directory(directory, white_list_formats,
           the files to be counted.
       follow_links: boolean.
 
-  Returns:
-      the count of files with extension in `white_list_formats` contained in
-      the directory.
+  Yields:
+      tuple of (root, filename) with extension in `white_list_formats`.
   """
 
   def _recursive_list(subpath):
     return sorted(
-        os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+        os.walk(subpath, followlinks=follow_links), key=lambda x: x[0])
 
-  samples = 0
-  for _, _, files in _recursive_list(directory):
-    for fname in files:
-      is_valid = False
+  for root, _, files in _recursive_list(directory):
+    for fname in sorted(files):
       for extension in white_list_formats:
+        if fname.lower().endswith('.tiff'):
+          logging.warning(
+              'Using \'.tiff\' files with multiple bands will cause '
+              'distortion. Please verify your output.')
         if fname.lower().endswith('.' + extension):
-          is_valid = True
-          break
-      if is_valid:
-        samples += 1
-  return samples
+          yield root, fname
 
 
-def _list_valid_filenames_in_directory(directory, white_list_formats,
+def _count_valid_files_in_directory(directory, white_list_formats, split,
+                                    follow_links):
+  """Count files with extension in `white_list_formats` contained in directory.
+
+  Arguments:
+      directory: absolute path to the directory
+          containing files to be counted
+      white_list_formats: set of strings containing allowed extensions for
+          the files to be counted.
+      split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
+          account a certain fraction of files in each directory.
+          E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
+          of images in each directory.
+      follow_links: boolean.
+
+  Returns:
+      the count of files with extension in `white_list_formats` contained in
+      the directory.
+  """
+  num_files = len(
+      list(_iter_valid_files(directory, white_list_formats, follow_links)))
+  if split:
+    start, stop = int(split[0] * num_files), int(split[1] * num_files)
+  else:
+    start, stop = 0, num_files
+  return stop - start
+
+
+def _list_valid_filenames_in_directory(directory, white_list_formats, split,
                                        class_indices, follow_links):
   """List paths of files in `subdir` with extensions in `white_list_formats`.
 
@@ -1075,6 +1155,10 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
             `class_indices`.
       white_list_formats: set of strings containing allowed extensions for
           the files to be counted.
+      split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
+          account a certain fraction of files in each directory.
+          E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
+          of images in each directory.
       class_indices: dictionary mapping a class name to its index.
       follow_links: boolean.
 
@@ -1084,27 +1168,26 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
           `directory`'s parent (e.g., if `directory` is "dataset/class1",
           the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]).
   """
-
-  def _recursive_list(subpath):
-    return sorted(
-        os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+  dirname = os.path.basename(directory)
+  if split:
+    num_files = len(
+        list(_iter_valid_files(directory, white_list_formats, follow_links)))
+    start, stop = int(split[0] * num_files), int(split[1] * num_files)
+    valid_files = list(
+        _iter_valid_files(directory, white_list_formats,
+                          follow_links))[start:stop]
+  else:
+    valid_files = _iter_valid_files(directory, white_list_formats, follow_links)
 
   classes = []
   filenames = []
-  subdir = os.path.basename(directory)
-  basedir = os.path.dirname(directory)
-  for root, _, files in _recursive_list(directory):
-    for fname in sorted(files):
-      is_valid = False
-      for extension in white_list_formats:
-        if fname.lower().endswith('.' + extension):
-          is_valid = True
-          break
-      if is_valid:
-        classes.append(class_indices[subdir])
-        # add filename relative to directory
-        absolute_path = os.path.join(root, fname)
-        filenames.append(os.path.relpath(absolute_path, basedir))
+  for root, fname in valid_files:
+    classes.append(class_indices[dirname])
+    absolute_path = os.path.join(root, fname)
+    relative_path = os.path.join(dirname,
+                                 os.path.relpath(absolute_path, directory))
+    filenames.append(relative_path)
+
   return classes, filenames
 
 
@@ -1144,6 +1227,8 @@ class DirectoryIterator(Iterator):
           images (if `save_to_dir` is set).
       save_format: Format to use for saving sample images
           (if `save_to_dir` is set).
+      subset: Subset of data (`"training"` or `"validation"`) if
+          validation_split is set in ImageDataGenerator.
       interpolation: Interpolation method used to resample the image if the
           target size is different from that of the loaded image.
           Supported methods are "nearest", "bilinear", and "bicubic".
@@ -1167,6 +1252,7 @@ class DirectoryIterator(Iterator):
                save_prefix='',
                save_format='png',
                follow_links=False,
+               subset=None,
                interpolation='nearest'):
     if data_format is None:
       data_format = K.image_data_format()
@@ -1200,7 +1286,20 @@ class DirectoryIterator(Iterator):
     self.save_format = save_format
     self.interpolation = interpolation
 
-    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'}
+    if subset is not None:
+      validation_split = self.image_data_generator.validation_split
+      if subset == 'validation':
+        split = (0, validation_split)
+      elif subset == 'training':
+        split = (validation_split, 1)
+      else:
+        raise ValueError('Invalid subset name: ', subset,
+                         '; expected "training" or "validation"')
+    else:
+      split = None
+    self.subset = subset
+
+    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff'}
 
     # first, count the number of samples and classes
     self.samples = 0
@@ -1217,7 +1316,8 @@ class DirectoryIterator(Iterator):
     function_partial = partial(
         _count_valid_files_in_directory,
         white_list_formats=white_list_formats,
-        follow_links=follow_links)
+        follow_links=follow_links,
+        split=split)
     self.samples = sum(
         pool.map(function_partial,
                  (os.path.join(directory, subdir) for subdir in classes)))
@@ -1233,14 +1333,15 @@ class DirectoryIterator(Iterator):
     i = 0
     for dirpath in (os.path.join(directory, subdir) for subdir in classes):
       results.append(
-          pool.apply_async(
-              _list_valid_filenames_in_directory,
-              (dirpath, white_list_formats, self.class_indices, follow_links)))
+          pool.apply_async(_list_valid_filenames_in_directory,
+                           (dirpath, white_list_formats, split,
+                            self.class_indices, follow_links)))
     for res in results:
       classes, filenames = res.get()
       self.classes[i:i + len(classes)] = classes
       self.filenames += filenames
       i += len(classes)
+
     pool.close()
     pool.join()
     super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
index c0790b5a5140193b18907d9375530f4f06e137da..001fee91f9ed609c0b3cd88d4079e75c0e585b02 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import os
 import shutil
+import tempfile
 
 import numpy as np
 
@@ -74,6 +75,7 @@ class TestImage(test.TestCase):
           shear_range=0.5,
           zoom_range=0.2,
           channel_shift_range=0.,
+          brightness_range=(1, 5),
           fill_mode='nearest',
           cval=0.5,
           horizontal_flip=True,
@@ -92,6 +94,47 @@ class TestImage(test.TestCase):
         self.assertEqual(x.shape[1:], images.shape[1:])
         break
 
+  def test_image_data_generator_with_validation_split(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    for test_images in _generate_test_images():
+      img_list = []
+      for im in test_images:
+        img_list.append(keras.preprocessing.image.img_to_array(im)[None, ...])
+
+      images = np.vstack(img_list)
+      generator = keras.preprocessing.image.ImageDataGenerator(
+          validation_split=0.5)
+      seq = generator.flow(
+          images,
+          np.arange(images.shape[0]),
+          shuffle=False,
+          batch_size=3,
+          subset='validation')
+      _, y = seq[0]
+      self.assertEqual(list(y), [0, 1, 2])
+      seq = generator.flow(
+          images,
+          np.arange(images.shape[0]),
+          shuffle=False,
+          batch_size=3,
+          subset='training')
+      _, y2 = seq[0]
+      self.assertEqual(list(y2), [4, 5, 6])
+
+      with self.assertRaises(ValueError):
+        generator.flow(
+            images,
+            np.arange(images.shape[0]),
+            shuffle=False,
+            batch_size=3,
+            subset='foo')
+
+  def test_image_data_generator_with_split_value_error(self):
+    with self.assertRaises(ValueError):
+      keras.preprocessing.image.ImageDataGenerator(validation_split=5)
+
   def test_image_data_generator_invalid_data(self):
     generator = keras.preprocessing.image.ImageDataGenerator(
         featurewise_center=True,
@@ -202,9 +245,80 @@ class TestImage(test.TestCase):
     # check number of classes and images
     self.assertEqual(len(dir_iterator.class_indices), num_classes)
     self.assertEqual(len(dir_iterator.classes), count)
-    self.assertEqual(sorted(dir_iterator.filenames), sorted(filenames))
+    self.assertEqual(set(dir_iterator.filenames), set(filenames))
     _ = dir_iterator.next()
 
+  def directory_iterator_with_validation_split_test_helper(
+      self, validation_split):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    num_classes = 2
+    tmp_folder = tempfile.mkdtemp(prefix='test_images')
+
+    # create folders and subfolders
+    paths = []
+    for cl in range(num_classes):
+      class_directory = 'class-{}'.format(cl)
+      classpaths = [
+          class_directory,
+          os.path.join(class_directory, 'subfolder-1'),
+          os.path.join(class_directory, 'subfolder-2'),
+          os.path.join(class_directory, 'subfolder-1', 'sub-subfolder')
+      ]
+      for path in classpaths:
+        os.mkdir(os.path.join(tmp_folder, path))
+      paths.append(classpaths)
+
+    # save the images in the paths
+    count = 0
+    filenames = []
+    for test_images in _generate_test_images():
+      for im in test_images:
+        # rotate image class
+        im_class = count % num_classes
+        # rotate subfolders
+        classpaths = paths[im_class]
+        filename = os.path.join(classpaths[count % len(classpaths)],
+                                'image-{}.jpg'.format(count))
+        filenames.append(filename)
+        im.save(os.path.join(tmp_folder, filename))
+        count += 1
+
+    # create iterator
+    generator = keras.preprocessing.image.ImageDataGenerator(
+        validation_split=validation_split)
+
+    with self.assertRaises(ValueError):
+      generator.flow_from_directory(tmp_folder, subset='foo')
+
+    num_validation = int(count * validation_split)
+    num_training = count - num_validation
+    train_iterator = generator.flow_from_directory(
+        tmp_folder, subset='training')
+    self.assertEqual(train_iterator.samples, num_training)
+
+    valid_iterator = generator.flow_from_directory(
+        tmp_folder, subset='validation')
+    self.assertEqual(valid_iterator.samples, num_validation)
+
+    # check number of classes and images
+    self.assertEqual(len(train_iterator.class_indices), num_classes)
+    self.assertEqual(len(train_iterator.classes), num_training)
+    self.assertEqual(
+        len(set(train_iterator.filenames) & set(filenames)), num_training)
+
+    shutil.rmtree(tmp_folder)
+
+  def test_directory_iterator_with_validation_split_25_percent(self):
+    self.directory_iterator_with_validation_split_test_helper(0.25)
+
+  def test_directory_iterator_with_validation_split_40_percent(self):
+    self.directory_iterator_with_validation_split_test_helper(0.40)
+
+  def test_directory_iterator_with_validation_split_50_percent(self):
+    self.directory_iterator_with_validation_split_test_helper(0.50)
+
   def test_img_utils(self):
     if PIL is None:
       return  # Skip test if PIL is not available.
@@ -241,6 +355,41 @@ class TestImage(test.TestCase):
     x = keras.preprocessing.image.img_to_array(img, data_format='channels_last')
     self.assertEqual(x.shape, (height, width, 1))
 
+  def test_batch_standardize(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    # ImageDataGenerator.standardize should work on batches
+    for test_images in _generate_test_images():
+      img_list = []
+      for im in test_images:
+        img_list.append(keras.preprocessing.image.img_to_array(im)[None, ...])
+
+      images = np.vstack(img_list)
+      generator = keras.preprocessing.image.ImageDataGenerator(
+          featurewise_center=True,
+          samplewise_center=True,
+          featurewise_std_normalization=True,
+          samplewise_std_normalization=True,
+          zca_whitening=True,
+          rotation_range=90.,
+          width_shift_range=0.1,
+          height_shift_range=0.1,
+          shear_range=0.5,
+          zoom_range=0.2,
+          channel_shift_range=0.,
+          brightness_range=(1, 5),
+          fill_mode='nearest',
+          cval=0.5,
+          horizontal_flip=True,
+          vertical_flip=True)
+      generator.fit(images, augment=True)
+
+      transformed = np.copy(images)
+      for i, im in enumerate(transformed):
+        transformed[i] = generator.random_transform(im)
+      transformed = generator.standardize(transformed)
+
   def test_img_transforms(self):
     x = np.random.random((3, 200, 200))
     _ = keras.preprocessing.image.random_rotation(x, 20)
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index a423d96d3d8578df347b7ee36fb53dfd335e0d65..e68c171d9c7e33d7e932f5d5b7f15859faa2348b 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -22,6 +22,8 @@ import random
 
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
+
+from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -32,29 +34,40 @@ def pad_sequences(sequences,
                   padding='pre',
                   truncating='pre',
                   value=0.):
-  """Pads each sequence to the same length (length of the longest sequence).
+  """Pads sequences to the same length.
+
+  This function transforms a list of
+  `num_samples` sequences (lists of integers)
+  into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
+  `num_timesteps` is either the `maxlen` argument if provided,
+  or the length of the longest sequence otherwise.
+
+  Sequences that are shorter than `num_timesteps`
+  are padded with `value` at the end.
 
-  If maxlen is provided, any sequence longer
-  than maxlen is truncated to maxlen.
-  Truncation happens off either the beginning (default) or
-  the end of the sequence.
+  Sequences longer than `num_timesteps` are truncated
+  so that they fit the desired length.
+  The position where padding or truncation happens is determined by
+  the arguments `padding` and `truncating`, respectively.
 
-  Supports post-padding and pre-padding (default).
+  Pre-padding is the default.
 
   Arguments:
-      sequences: list of lists where each element is a sequence
-      maxlen: int, maximum length
-      dtype: type to cast the resulting sequence.
-      padding: 'pre' or 'post', pad either before or after each sequence.
-      truncating: 'pre' or 'post', remove values from sequences larger than
-          maxlen either in the beginning or in the end of the sequence
-      value: float, value to pad the sequences to the desired value.
+      sequences: List of lists, where each element is a sequence.
+      maxlen: Int, maximum length of all sequences.
+      dtype: Type of the output sequences.
+      padding: String, 'pre' or 'post':
+          pad either before or after each sequence.
+      truncating: String, 'pre' or 'post':
+          remove values from sequences larger than
+          `maxlen`, either at the beginning or at the end of the sequences.
+      value: Float, padding value.
 
   Returns:
-      x: numpy array with dimensions (number_of_sequences, maxlen)
+      x: Numpy array with shape `(len(sequences), maxlen)`
 
   Raises:
-      ValueError: in case of invalid values for `truncating` or `padding`,
+      ValueError: In case of invalid values for `truncating` or `padding`,
           or in case of invalid shape for a `sequences` entry.
   """
   if not hasattr(sequences, '__len__'):
@@ -92,10 +105,9 @@ def pad_sequences(sequences,
     # check `trunc` has expected shape
     trunc = np.asarray(trunc, dtype=dtype)
     if trunc.shape[1:] != sample_shape:
-      raise ValueError(
-          'Shape of sample %s of sequence at position %s is different from '
-          'expected shape %s'
-          % (trunc.shape[1:], idx, sample_shape))
+      raise ValueError('Shape of sample %s of sequence at position %s '
+                       'is different from expected shape %s' %
+                       (trunc.shape[1:], idx, sample_shape))
 
     if padding == 'post':
       x[idx, :len(trunc)] = trunc
@@ -110,22 +122,26 @@ def pad_sequences(sequences,
 def make_sampling_table(size, sampling_factor=1e-5):
   """Generates a word rank-based probabilistic sampling table.
 
-  This generates an array where the ith element
-  is the probability that a word of rank i would be sampled,
-  according to the sampling distribution used in word2vec.
+  Used for generating the `sampling_table` argument for `skipgrams`.
+  `sampling_table[i]` is the probability of sampling
+  the word i-th most common word in a dataset
+  (more common words should be sampled less frequently, for balance).
 
-  The word2vec formula is:
-      p(word) = min(1, sqrt(word.frequency/sampling_factor) /
-      (word.frequency/sampling_factor))
+  The sampling probabilities are generated according
+  to the sampling distribution used in word2vec:
+
+  `p(word) = min(1, sqrt(word_frequency / sampling_factor) / (word_frequency /
+  sampling_factor))`
 
   We assume that the word frequencies follow Zipf's law (s=1) to derive
   a numerical approximation of frequency(rank):
-     frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
-      where gamma is the Euler-Mascheroni constant.
+
+  `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
+  where `gamma` is the Euler-Mascheroni constant.
 
   Arguments:
-      size: int, number of possible words to sample.
-      sampling_factor: the sampling factor in the word2vec formula.
+      size: Int, number of possible words to sample.
+      sampling_factor: The sampling factor in the word2vec formula.
 
   Returns:
       A 1D Numpy array of length `size` where the ith entry
@@ -151,30 +167,37 @@ def skipgrams(sequence,
               seed=None):
   """Generates skipgram word pairs.
 
-  Takes a sequence (list of indexes of words),
-  returns couples of [word_index, other_word index] and labels (1s or 0s),
-  where label = 1 if 'other_word' belongs to the context of 'word',
-  and label=0 if 'other_word' is randomly sampled
+  This function transforms a sequence of word indexes (list of integers)
+  into tuples of words of the form:
+
+  - (word, word in the same window), with label 1 (positive samples).
+  - (word, random word from the vocabulary), with label 0 (negative samples).
+
+  Read more about Skipgram in this gnomic paper by Mikolov et al.:
+  [Efficient Estimation of Word Representations in
+  Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)
 
   Arguments:
-      sequence: a word sequence (sentence), encoded as a list
+      sequence: A word sequence (sentence), encoded as a list
           of word indices (integers). If using a `sampling_table`,
           word indices are expected to match the rank
           of the words in a reference dataset (e.g. 10 would encode
           the 10-th most frequently occurring token).
           Note that index 0 is expected to be a non-word and will be skipped.
-      vocabulary_size: int. maximum possible word index + 1
-      window_size: int. actually half-window.
-          The window of a word wi will be [i-window_size, i+window_size+1]
-      negative_samples: float >= 0. 0 for no negative (=random) samples.
-          1 for same number as positive samples. etc.
-      shuffle: whether to shuffle the word couples before returning them.
+      vocabulary_size: Int, maximum possible word index + 1
+      window_size: Int, size of sampling windows (technically half-window).
+          The window of a word `w_i` will be
+          `[i - window_size, i + window_size+1]`.
+      negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
+          1 for same number as positive samples.
+      shuffle: Whether to shuffle the word couples before returning them.
       categorical: bool. if False, labels will be
-          integers (eg. [0, 1, 1 .. ]),
-          if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
+          integers (eg. `[0, 1, 1 .. ]`),
+          if `True`, labels will be categorical, e.g.
+          `[[1,0],[0,1],[0,1] .. ]`.
       sampling_table: 1D array of size `vocabulary_size` where the entry i
           encodes the probability to sample a word of rank i.
-      seed: random seed.
+      seed: Random seed.
 
   Returns:
       couples, labels: where `couples` are int pairs and
@@ -234,9 +257,9 @@ def _remove_long_seq(maxlen, seq, label):
   """Removes sequences that exceed the maximum length.
 
   Arguments:
-      maxlen: int, maximum length
-      seq: list of lists where each sublist is a sequence
-      label: list where each element is an integer
+      maxlen: Int, maximum length of the output sequences.
+      seq: List of lists, where each sublist is a sequence.
+      label: List where each element is an integer.
 
   Returns:
       new_seq, new_label: shortened lists for `seq` and `label`.
@@ -247,3 +270,120 @@ def _remove_long_seq(maxlen, seq, label):
       new_seq.append(x)
       new_label.append(y)
   return new_seq, new_label
+
+
+@tf_export('keras.preprocessing.sequence.TimeseriesGenerator')
+class TimeseriesGenerator(Sequence):
+  """Utility class for generating batches of temporal data.
+
+  This class takes in a sequence of data-points gathered at
+  equal intervals, along with time series parameters such as
+  stride, length of history, etc., to produce batches for
+  training/validation.
+
+  Arguments:
+      data: Indexable generator (such as list or Numpy array)
+          containing consecutive data points (timesteps).
+          The data should be at 2D, and axis 0 is expected
+          to be the time dimension.
+      targets: Targets corresponding to timesteps in `data`.
+          It should have same length as `data`.
+      length: Length of the output sequences (in number of timesteps).
+      sampling_rate: Period between successive individual timesteps
+          within sequences. For rate `r`, timesteps
+          `data[i]`, `data[i-r]`, ... `data[i - length]`
+          are used for create a sample sequence.
+      stride: Period between successive output sequences.
+          For stride `s`, consecutive output samples would
+          be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
+      start_index, end_index: Data points earlier than `start_index`
+          or later than `end_index` will not be used in the output sequences.
+          This is useful to reserve part of the data for test or validation.
+      shuffle: Whether to shuffle output samples,
+          or instead draw them in chronological order.
+      reverse: Boolean: if `true`, timesteps in each output sample will be
+          in reverse chronological order.
+      batch_size: Number of timeseries samples in each batch
+          (except maybe the last one).
+
+  Returns:
+      A [Sequence](/utils/#sequence) instance.
+
+  Examples:
+
+  ```python
+  from keras.preprocessing.sequence import TimeseriesGenerator
+  import numpy as np
+
+  data = np.array([[i] for i in range(50)])
+  targets = np.array([[i] for i in range(50)])
+
+  data_gen = TimeseriesGenerator(data, targets,
+                                 length=10, sampling_rate=2,
+                                 batch_size=2)
+  assert len(data_gen) == 20
+
+  batch_0 = data_gen[0]
+  x, y = batch_0
+  assert np.array_equal(x,
+                        np.array([[[0], [2], [4], [6], [8]],
+                                  [[1], [3], [5], [7], [9]]]))
+  assert np.array_equal(y,
+                        np.array([[10], [11]]))
+  ```
+  """
+
+  def __init__(self,
+               data,
+               targets,
+               length,
+               sampling_rate=1,
+               stride=1,
+               start_index=0,
+               end_index=None,
+               shuffle=False,
+               reverse=False,
+               batch_size=128):
+    self.data = data
+    self.targets = targets
+    self.length = length
+    self.sampling_rate = sampling_rate
+    self.stride = stride
+    self.start_index = start_index + length
+    if end_index is None:
+      end_index = len(data) - 1
+    self.end_index = end_index
+    self.shuffle = shuffle
+    self.reverse = reverse
+    self.batch_size = batch_size
+
+  def __len__(self):
+    length = int(
+        np.ceil((self.end_index - self.start_index) /
+                (self.batch_size * self.stride)))
+    return length if length >= 0 else 0
+
+  def _empty_batch(self, num_rows):
+    samples_shape = [num_rows, self.length // self.sampling_rate]
+    samples_shape.extend(self.data.shape[1:])
+    targets_shape = [num_rows]
+    targets_shape.extend(self.targets.shape[1:])
+    return np.empty(samples_shape), np.empty(targets_shape)
+
+  def __getitem__(self, index):
+    if self.shuffle:
+      rows = np.random.randint(
+          self.start_index, self.end_index, size=self.batch_size)
+    else:
+      i = self.start_index + self.batch_size * self.stride * index
+      rows = np.arange(i, min(i + self.batch_size * self.stride,
+                              self.end_index), self.stride)
+
+    samples, targets = self._empty_batch(len(rows))
+    for j in range(len(rows)):
+      indices = range(rows[j] - self.length, rows[j], self.sampling_rate)
+      samples[j] = self.data[indices]
+      targets[j] = self.targets[rows[j]]
+    if self.reverse:
+      return samples[:, ::-1, ...], targets
+    return samples, targets
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
index 4529e6e94fc42661fb0474c1a827863ddb654776..b9bfdd000484665e8771f4bccef59738e5c26120 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
@@ -84,15 +84,91 @@ class TestSequence(test.TestCase):
     couples, labels = keras.preprocessing.sequence.skipgrams(
         np.arange(3), vocabulary_size=3)
     for couple in couples:
-      assert couple[0] in [0, 1, 2] and couple[1] in [0, 1, 2]
+      self.assertIn(couple[0], [0, 1, 2])
+      self.assertIn(couple[1], [0, 1, 2])
 
     # test window size and categorical labels
     couples, labels = keras.preprocessing.sequence.skipgrams(
         np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
     for couple in couples:
-      assert couple[0] - couple[1] <= 3
+      self.assertLessEqual(couple[0] - couple[1], 3)
     for l in labels:
-      assert len(l) == 2
+      self.assertEqual(len(l), 2)
+
+  def test_TimeseriesGenerator(self):
+    data = np.array([[i] for i in range(50)])
+    targets = np.array([[i] for i in range(50)])
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, batch_size=2)
+    self.assertEqual(len(data_gen), 20)
+    self.assertAllClose(data_gen[0][0],
+                        np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5],
+                                                              [7], [9]]]))
+    self.assertAllClose(data_gen[0][1], np.array([[10], [11]]))
+    self.assertAllClose(data_gen[1][0],
+                        np.array([[[2], [4], [6], [8], [10]], [[3], [5], [7],
+                                                               [9], [11]]]))
+    self.assertAllClose(data_gen[1][1], np.array([[12], [13]]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, reverse=True, batch_size=2)
+    self.assertEqual(len(data_gen), 20)
+    self.assertAllClose(data_gen[0][0],
+                        np.array([[[8], [6], [4], [2], [0]], [[9], [7], [5],
+                                                              [3], [1]]]))
+    self.assertAllClose(data_gen[0][1], np.array([[10], [11]]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, shuffle=True, batch_size=1)
+    batch = data_gen[0]
+    r = batch[1][0][0]
+    self.assertAllClose(batch[0],
+                        np.array([[[r - 10], [r - 8], [r - 6], [r - 4],
+                                   [r - 2]]]))
+    self.assertAllClose(batch[1], np.array([
+        [r],
+    ]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, stride=2, batch_size=2)
+    self.assertEqual(len(data_gen), 10)
+    self.assertAllClose(data_gen[1][0],
+                        np.array([[[4], [6], [8], [10], [12]], [[6], [8], [10],
+                                                                [12], [14]]]))
+    self.assertAllClose(data_gen[1][1], np.array([[14], [16]]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data,
+        targets,
+        length=10,
+        sampling_rate=2,
+        start_index=10,
+        end_index=30,
+        batch_size=2)
+    self.assertEqual(len(data_gen), 5)
+    self.assertAllClose(data_gen[0][0],
+                        np.array([[[10], [12], [14], [16], [18]],
+                                  [[11], [13], [15], [17], [19]]]))
+    self.assertAllClose(data_gen[0][1], np.array([[20], [21]]))
+
+    data = np.array([np.random.random_sample((1, 2, 3, 4)) for i in range(50)])
+    targets = np.array([np.random.random_sample((3, 2, 1)) for i in range(50)])
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data,
+        targets,
+        length=10,
+        sampling_rate=2,
+        start_index=10,
+        end_index=30,
+        batch_size=2)
+
+    self.assertEqual(len(data_gen), 5)
+    self.assertAllClose(data_gen[0][0],
+                        np.array(
+                            [np.array(data[10:19:2]),
+                             np.array(data[11:20:2])]))
+    self.assertAllClose(data_gen[0][1], np.array([targets[20], targets[21]]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
index 1e3828ccf1e3bf9c443691e1c1da5697bedb4653..f652f318f3d6dae20b1113a50cd02930abb851af 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
@@ -91,6 +91,7 @@ def one_hot(text,
       text, n, hash_function=hash, filters=filters, lower=lower, split=split)
 
 
+@tf_export('keras.preprocessing.text.hashing_trick')
 def hashing_trick(text,
                   n,
                   hash_function=None,
@@ -187,21 +188,27 @@ class Tokenizer(object):
     self.document_count = 0
     self.char_level = char_level
     self.oov_token = oov_token
+    self.index_docs = {}
 
   def fit_on_texts(self, texts):
     """Updates internal vocabulary based on a list of texts.
 
+    In the case where texts contains lists, we assume each entry of the lists
+    to be a token.
+
     Required before using `texts_to_sequences` or `texts_to_matrix`.
 
     Arguments:
         texts: can be a list of strings,
-            or a generator of strings (for memory-efficiency)
+            a generator of strings (for memory-efficiency),
+            or a list of list of strings.
     """
-    self.document_count = 0
     for text in texts:
       self.document_count += 1
-      seq = text if self.char_level else text_to_word_sequence(
-          text, self.filters, self.lower, self.split)
+      if self.char_level or isinstance(text, list):
+        seq = text
+      else:
+        seq = text_to_word_sequence(text, self.filters, self.lower, self.split)
       for w in seq:
         if w in self.word_counts:
           self.word_counts[w] += 1
@@ -226,7 +233,6 @@ class Tokenizer(object):
       if i is None:
         self.word_index[self.oov_token] = len(self.word_index) + 1
 
-    self.index_docs = {}
     for w, c in list(self.word_docs.items()):
       self.index_docs[self.word_index[w]] = c
 
@@ -240,8 +246,7 @@ class Tokenizer(object):
         sequences: A list of sequence.
             A "sequence" is a list of integer word indices.
     """
-    self.document_count = len(sequences)
-    self.index_docs = {}
+    self.document_count += len(sequences)
     for seq in sequences:
       seq = set(seq)
       for i in seq:
@@ -268,7 +273,11 @@ class Tokenizer(object):
     return res
 
   def texts_to_sequences_generator(self, texts):
-    """Transforms each text in texts in a sequence of integers.
+    """Transforms each text in `texts` in a sequence of integers.
+
+    Each item in texts can also be a list, in which case we assume each item of
+    that list
+    to be a token.
 
     Only top "num_words" most frequent words will be taken into account.
     Only words known by the tokenizer will be taken into account.
@@ -281,8 +290,10 @@ class Tokenizer(object):
     """
     num_words = self.num_words
     for text in texts:
-      seq = text if self.char_level else text_to_word_sequence(
-          text, self.filters, self.lower, self.split)
+      if self.char_level or isinstance(text, list):
+        seq = text
+      else:
+        seq = text_to_word_sequence(text, self.filters, self.lower, self.split)
       vect = []
       for w in seq:
         i = self.word_index.get(w)
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
index a934e331c4a14d9bd170258b6b6183e6a15bb561..c6a267e57e4e2dc04156483d1cf85a42a78eb395 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -80,17 +81,52 @@ class TestText(test.TestCase):
     x_train = ['This text has only known words']
     x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
 
-    # Defalut, without OOV flag
+    # Default, without OOV flag
     tokenizer = keras.preprocessing.text.Tokenizer()
     tokenizer.fit_on_texts(x_train)
     x_test_seq = tokenizer.texts_to_sequences(x_test)
-    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
+    self.assertEqual(len(x_test_seq[0]), 4)  # discards 2 OOVs
 
     # With OOV feature
     tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>')
     tokenizer.fit_on_texts(x_train)
     x_test_seq = tokenizer.texts_to_sequences(x_test)
-    assert len(x_test_seq[0]) == 6  # OOVs marked in place
+    self.assertEqual(len(x_test_seq[0]), 6)  # OOVs marked in place
+
+  def test_sequential_fit(self):
+    texts = [
+        'The cat sat on the mat.', 'The dog sat on the log.',
+        'Dogs and cats living together.'
+    ]
+    word_sequences = [['The', 'cat', 'is', 'sitting'],
+                      ['The', 'dog', 'is', 'standing']]
+    tokenizer = keras.preprocessing.text.Tokenizer()
+    tokenizer.fit_on_texts(texts)
+    tokenizer.fit_on_texts(word_sequences)
+
+    self.assertEqual(tokenizer.document_count, 5)
+
+    tokenizer.texts_to_matrix(texts)
+    tokenizer.texts_to_matrix(word_sequences)
+
+  def test_text_to_word_sequence(self):
+    text = 'hello! ? world!'
+    seq = keras.preprocessing.text.text_to_word_sequence(text)
+    self.assertEqual(seq, ['hello', 'world'])
+
+  def test_text_to_word_sequence_unicode(self):
+    text = u'ali! veli? kırk dokuz elli'
+    seq = keras.preprocessing.text.text_to_word_sequence(text)
+    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
+
+  def test_tokenizer_unicode(self):
+    texts = [
+        u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
+    ]
+    tokenizer = keras.preprocessing.text.Tokenizer(num_words=5)
+    tokenizer.fit_on_texts(texts)
+
+    self.assertEqual(len(tokenizer.word_counts), 5)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 2c30844647acdb78d1ca31d052ec7e5ecc6dcc2a..74c37d370ea630ca3c3e5e0945828f63928572e1 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -55,9 +56,9 @@ class L1L2(Regularizer):
   def __call__(self, x):
     regularization = 0.
     if self.l1:
-      regularization += K.sum(self.l1 * K.abs(x))
+      regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
     if self.l2:
-      regularization += K.sum(self.l2 * K.square(x))
+      regularization += math_ops.reduce_sum(self.l2 * math_ops.square(x))
     return regularization
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/testing_utils.py b/tensorflow/python/keras/_impl/keras/testing_utils.py
index fa1ee2fa3da3fbc7650ee80960b00907013cc37c..60799ee1e038b4466351248bb5de7c8fc0de02a2 100644
--- a/tensorflow/python/keras/_impl/keras/testing_utils.py
+++ b/tensorflow/python/keras/_impl/keras/testing_utils.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 from tensorflow.python.util import tf_inspect
 
 
@@ -145,7 +146,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     np.testing.assert_allclose(output, actual_output, rtol=1e-3)
 
   # test training mode (e.g. useful for dropout tests)
-  model.compile('rmsprop', 'mse')
+  model.compile(RMSPropOptimizer(0.01), 'mse')
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
@@ -181,9 +182,5 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     output = recovered_model.predict(input_data)
     np.testing.assert_allclose(output, actual_output, rtol=1e-3)
 
-  # test training mode (e.g. useful for dropout tests)
-  model.compile('rmsprop', 'mse')
-  model.train_on_batch(input_data, actual_output)
-
   # for further checks in the caller function
   return actual_output
diff --git a/tensorflow/python/keras/_impl/keras/utils/__init__.py b/tensorflow/python/keras/_impl/keras/utils/__init__.py
index 370ae0dd0f0d00059f1b0cc79459abe75c8ca494..0c9f19a0c8dcf3bf929e102b31679a03b27728f7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/utils/__init__.py
@@ -31,8 +31,8 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_ke
 from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary
+from tensorflow.python.keras._impl.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.keras._impl.keras.utils.training_utils import multi_gpu_model
 from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
index 583079d9626361eb594f16a57af86f103e5ee74d..8882a3a46bcb9de7283a67f001e67ed8644a0cf7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
@@ -21,17 +21,146 @@ from __future__ import print_function
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
-# pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.layers.utils import conv_input_length
-from tensorflow.python.layers.utils import conv_output_length
-from tensorflow.python.layers.utils import deconv_output_length as deconv_length
-from tensorflow.python.layers.utils import normalize_tuple
+from tensorflow.python.keras._impl.keras import backend
+
+
+def convert_data_format(data_format, ndim):
+  if data_format == 'channels_last':
+    if ndim == 3:
+      return 'NWC'
+    elif ndim == 4:
+      return 'NHWC'
+    elif ndim == 5:
+      return 'NDHWC'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  elif data_format == 'channels_first':
+    if ndim == 3:
+      return 'NCW'
+    elif ndim == 4:
+      return 'NCHW'
+    elif ndim == 5:
+      return 'NCDHW'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  else:
+    raise ValueError('Invalid data_format:', data_format)
+
+
+def normalize_tuple(value, n, name):
+  """Transforms a single integer or iterable of integers into an integer tuple.
+
+  Arguments:
+    value: The value to validate and convert. Could an int, or any iterable
+      of ints.
+    n: The size of the tuple to be returned.
+    name: The name of the argument being validated, e.g. "strides" or
+      "kernel_size". This is only used to format error messages.
+
+  Returns:
+    A tuple of n integers.
+
+  Raises:
+    ValueError: If something else than an int/long or iterable thereof was
+      passed.
+  """
+  if isinstance(value, int):
+    return (value,) * n
+  else:
+    try:
+      value_tuple = tuple(value)
+    except TypeError:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    if len(value_tuple) != n:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    for single_value in value_tuple:
+      try:
+        int(single_value)
+      except (ValueError, TypeError):
+        raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                         str(n) + ' integers. Received: ' + str(value) + ' '
+                         'including element ' + str(single_value) + ' of type' +
+                         ' ' + str(type(single_value)))
+    return value_tuple
+
+
+def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
+  """Determines output length of a convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+      dilation: dilation rate, integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+  if padding == 'same':
+    output_length = input_length
+  elif padding == 'valid':
+    output_length = input_length - dilated_filter_size + 1
+  elif padding == 'full':
+    output_length = input_length + dilated_filter_size - 1
+  return (output_length + stride - 1) // stride
+
+
+def conv_input_length(output_length, filter_size, padding, stride):
+  """Determines input length of a convolution given output length.
+
+  Arguments:
+      output_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The input length (integer).
+  """
+  if output_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  if padding == 'same':
+    pad = filter_size // 2
+  elif padding == 'valid':
+    pad = 0
+  elif padding == 'full':
+    pad = filter_size - 1
+  return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+def deconv_output_length(input_length, filter_size, padding, stride):
+  """Determines output length of a transposed convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  input_length *= stride
+  if padding == 'valid':
+    input_length += max(filter_size - stride, 0)
+  elif padding == 'full':
+    input_length -= (stride + filter_size - 2)
+  return input_length
 
 
 def normalize_data_format(value):
   if value is None:
-    value = K.image_data_format()
+    value = backend.image_data_format()
   data_format = value.lower()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('The `data_format` argument must be one of '
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index e87c8f48ef0967d561db1ab841a669d783f9b1ec..4c49544c6a63c4e5a0b79d31b074ad352c512bfa 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -393,6 +393,16 @@ class Sequence(object):
     """
     pass
 
+  def __iter__(self):
+    """Creates an infinite generator that iterate over the Sequence.
+
+    Yields:
+      Sequence items.
+    """
+    while True:
+      for item in (self[i] for i in range(len(self))):
+        yield item
+
 
 # Global variables to be shared across processes
 _SHARED_SEQUENCES = {}
@@ -400,6 +410,11 @@ _SHARED_SEQUENCES = {}
 _SEQUENCE_COUNTER = None
 
 
+def init_pool(seqs):
+  global _SHARED_SEQUENCES
+  _SHARED_SEQUENCES = seqs
+
+
 def get_index(uid, i):
   """Get the value from the Sequence `uid` at index `i`.
 
@@ -532,9 +547,11 @@ class OrderedEnqueuer(SequenceEnqueuer):
             (when full, workers could block on `put()`)
     """
     if self.use_multiprocessing:
-      self.executor_fn = lambda: multiprocessing.Pool(workers)
+      self.executor_fn = lambda seqs: multiprocessing.Pool(  # pylint: disable=g-long-lambda
+          workers, initializer=init_pool, initargs=(seqs,))
     else:
-      self.executor_fn = lambda: ThreadPool(workers)
+       # We do not need the init since it's threads.
+      self.executor_fn = lambda _: ThreadPool(workers)
     self.workers = workers
     self.queue = queue.Queue(max_queue_size)
     self.stop_signal = threading.Event()
@@ -557,7 +574,7 @@ class OrderedEnqueuer(SequenceEnqueuer):
       if self.shuffle:
         random.shuffle(sequence)
 
-      with closing(self.executor_fn()) as executor:
+      with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor:
         for i in sequence:
           if self.stop_signal.is_set():
             return
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index 462d600bf827768b0f2e6265aebdaad48e70fcd9..db184d278cfd1022fafcd55d7bb6a6544c82656d 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -21,6 +21,7 @@ import binascii
 import codecs
 import marshal
 import os
+import re
 import sys
 import time
 import types as python_types
@@ -28,6 +29,7 @@ import types as python_types
 import numpy as np
 import six
 
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -490,8 +492,8 @@ def slice_arrays(arrays, start=None, stop=None):
   if arrays is None:
     return [None]
   if isinstance(start, list) and stop is not None:
-    raise ValueError('The stop argument has to be None if the value of start is'
-                     'a list.')
+    raise ValueError('The stop argument has to be None if the value of start '
+                     'is a list.')
   elif isinstance(arrays, list):
     if hasattr(start, '__len__'):
       # hdf5 datasets only support list objects as indices
@@ -509,3 +511,48 @@ def slice_arrays(arrays, start=None, stop=None):
       return arrays[start:stop]
     else:
       return [None]
+
+
+def to_list(x):
+  """Normalizes a list/tensor into a list.
+
+  If a tensor is passed, we return
+  a list of size 1 containing the tensor.
+
+  Arguments:
+      x: target object to be normalized.
+
+  Returns:
+      A list.
+  """
+  if isinstance(x, list):
+    return x
+  return [x]
+
+
+def object_list_uid(object_list):
+  """Creates a single string from object ids."""
+  object_list = nest.flatten(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def to_snake_case(name):
+  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
+  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
+  # If the class is private the name starts with "_" which is not secure
+  # for creating scopes. We prefix the name with "private" in this case.
+  if insecure[0] != '_':
+    return insecure
+  return 'private' + insecure
+
+
+def is_all_none(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  # We cannot use Python's `any` because the iterable may return Tensors.
+  for element in iterable:
+    if element is not None:
+      return False
+  return True
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index bbf1d2a3d9c3948271780ec3fad3316b4e6d53c3..f82e3277de70a631c93f0ef3c240f41ddb3390a7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import defaultdict
-import sys
 
 import numpy as np
+import six
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -160,13 +160,11 @@ def ask_to_proceed_with_overwrite(filepath):
   Returns:
       True if we can proceed with overwrite, False otherwise.
   """
-  get_input = input
-  if sys.version_info[:2] <= (2, 7):
-    get_input = raw_input
-  overwrite = get_input('[WARNING] %s already exists - overwrite? '
-                        '[y/n]' % (filepath))
-  while overwrite not in ['y', 'n']:
-    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
+  overwrite = six.moves.input('[WARNING] %s already exists - overwrite? '
+                              '[y/n]' % (filepath)).strip().lower()
+  while overwrite not in ('y', 'n'):
+    overwrite = six.moves.input('Enter "y" (overwrite) or "n" '
+                                '(cancel).').strip().lower()
   if overwrite == 'n':
     return False
   print('[TIP] Next time specify overwrite=True!')
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 4c8009dfd80e1aec457fa03687f2840c7fe4607b..902972ecbb8fd69a9252b7e19e32bee5e33e4f97 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -35,7 +35,7 @@ def count_params(weights):
   Returns:
       The total number of scalars composing the weights
   """
-  return int(np.sum([K.count_params(p) for p in set(weights)]))
+  return int(np.sum([np.prod(p.get_shape().as_list()) for p in set(weights)]))
 
 
 def print_summary(model, line_length=None, positions=None, print_fn=None):
@@ -193,8 +193,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   else:
     trainable_count = count_params(model.trainable_weights)
 
-  non_trainable_count = int(
-      np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
+  non_trainable_count = count_params(model.non_trainable_weights)
 
   print_fn('Total params: {:,}'.format(trainable_count + non_trainable_count))
   print_fn('Trainable params: {:,}'.format(trainable_count))
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils.py b/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/utils/training_utils.py
rename to tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils.py
index ce7402e9d279278eaaf5aab58a3973eec6de8e99..231ace2a0b4a4f25cebf06a5216cf3d30aadc49b 100644
--- a/tensorflow/python/keras/_impl/keras/utils/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils.py
@@ -125,7 +125,7 @@ def multi_gpu_model(model, gpus):
     if gpus <= 1:
       raise ValueError('For multi-gpu usage to be effective, '
                        'call `multi_gpu_model` with `gpus >= 2`. '
-                       'Received: `gpus=%d`' % gpus)
+                       'Received: `gpus=%s`' % gpus)
     num_gpus = gpus
     target_gpu_ids = range(num_gpus)
 
@@ -136,7 +136,7 @@ def multi_gpu_model(model, gpus):
   ]
   for device in target_devices:
     if device not in available_devices:
-      raise ValueError('To call `multi_gpu_model` with `gpus=%d`, '
+      raise ValueError('To call `multi_gpu_model` with `gpus=%s`, '
                        'we expect the following devices to be available: %s. '
                        'However this machine only has: %s. '
                        'Try reducing `gpus`.' % (gpus, target_devices,
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils_test.py
similarity index 64%
rename from tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
rename to tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils_test.py
index 12354c49ca72cddc0f395bcfcfabab18c1189227..0a38d6b5228fe791ce14adc7e37e0b7a6926fadf 100644
--- a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils_test.py
@@ -19,21 +19,34 @@ from __future__ import print_function
 
 import numpy as np
 
-
+from tensorflow.python import data
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.platform import test
 
 
+def check_if_compatible_devices(gpus=2):
+  available_devices = [
+      keras.utils.multi_gpu_utils._normalize_device_name(name)
+      for name in keras.utils.multi_gpu_utils._get_available_devices()
+  ]
+  if '/gpu:%d' % (gpus - 1) not in available_devices:
+    return False
+  return True
+
+
 class TestMultiGPUModel(test.TestCase):
 
-  def multi_gpu_test_simple_model(self):
+  def test_multi_gpu_test_simple_model(self):
     gpus = 2
     num_samples = 1000
     input_dim = 10
     output_dim = 1
     hidden_dim = 10
     epochs = 2
-    target_gpu_id = [0, 2, 4]
+    target_gpu_id = [0, 1]
+
+    if not check_if_compatible_devices(gpus=gpus):
+      return
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -47,12 +60,11 @@ class TestMultiGPUModel(test.TestCase):
       parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus)
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit(x, y, epochs=epochs)
-
       parallel_model = keras.utils.multi_gpu_model(model, gpus=target_gpu_id)
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit(x, y, epochs=epochs)
 
-  def multi_gpu_test_multi_io_model(self):
+  def test_multi_gpu_test_multi_io_model(self):
     gpus = 2
     num_samples = 1000
     input_dim_a = 10
@@ -61,7 +73,10 @@ class TestMultiGPUModel(test.TestCase):
     output_dim_b = 2
     hidden_dim = 10
     epochs = 2
-    target_gpu_id = [0, 2, 4]
+    target_gpu_id = [0, 1]
+
+    if not check_if_compatible_devices(gpus=gpus):
+      return
 
     with self.test_session():
       input_a = keras.Input((input_dim_a,))
@@ -86,7 +101,10 @@ class TestMultiGPUModel(test.TestCase):
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs)
 
-  def multi_gpu_test_invalid_devices(self):
+  def test_multi_gpu_test_invalid_devices(self):
+    if not check_if_compatible_devices(gpus=2):
+      return
+
     with self.test_session():
       input_shape = (1000, 10)
       model = keras.models.Sequential()
@@ -115,3 +133,53 @@ class TestMultiGPUModel(test.TestCase):
       with self.assertRaises(ValueError):
         parallel_model = keras.utils.multi_gpu_model(model, gpus=[0])
         parallel_model.fit(x, y, epochs=2)
+
+  def test_nested_model_with_tensor_input(self):
+    gpus = 2
+    input_dim = 10
+    shape = (input_dim,)
+    num_samples = 16
+    num_classes = 10
+
+    if not check_if_compatible_devices(gpus=gpus):
+      return
+
+    with self.test_session():
+      input_shape = (num_samples,) + shape
+      x_train = np.random.randint(0, 255, input_shape)
+      y_train = np.random.randint(0, num_classes, (input_shape[0],))
+      keras.backend.set_learning_phase(True)
+
+      y_train = keras.utils.to_categorical(y_train, num_classes)
+
+      x_train = x_train.astype('float32')
+      y_train = y_train.astype('float32')
+
+      dataset = data.Dataset.from_tensor_slices((x_train, y_train))
+      dataset = dataset.repeat()
+      dataset = dataset.batch(4)
+      iterator = dataset.make_one_shot_iterator()
+
+      inputs, targets = iterator.get_next()
+
+      input_tensor = keras.layers.Input(tensor=inputs)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(3,
+                                   input_shape=(input_dim,)))
+      model.add(keras.layers.Dense(num_classes))
+
+      output = model(input_tensor)
+      outer_model = keras.Model(input_tensor, output)
+      parallel_model = keras.utils.multi_gpu_model(outer_model, gpus=gpus)
+
+      parallel_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=keras.optimizers.RMSprop(lr=0.0001, decay=1e-6),
+          metrics=['accuracy'],
+          target_tensors=[targets])
+      parallel_model.fit(epochs=1, steps_per_epoch=3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..162e5b2cd65b377d45e2ef922eee3fd0aaee81e1
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
@@ -0,0 +1,154 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow-related utilities."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond as smart_module
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.util import nest
+
+
+def smart_cond(pred, true_fn=None, false_fn=None, name=None):
+  """Return either `true_fn()` if predicate `pred` is true else `false_fn()`.
+
+  If `pred` is a bool or has a constant value, we return either `true_fn()`
+  or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both.
+
+  Arguments:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix when using `tf.cond`.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`.
+
+  Raises:
+    TypeError: If `true_fn` or `false_fn` is not callable.
+  """
+  if isinstance(pred, variables.Variable):
+    return control_flow_ops.cond(
+        pred, true_fn=true_fn, false_fn=false_fn, name=name)
+  return smart_module.smart_cond(
+      pred, true_fn=true_fn, false_fn=false_fn, name=name)
+
+
+def constant_value(pred):
+  """Return the bool value for `pred`, or None if `pred` had a dynamic value.
+
+  Arguments:
+    pred: A scalar, either a Python bool or a TensorFlow boolean variable
+      or tensor, or the Python integer 1 or 0.
+
+  Returns:
+    True or False if `pred` has a constant boolean value, None otherwise.
+
+  Raises:
+    TypeError: If `pred` is not a Variable, Tensor or bool, or Python
+      integer 1 or 0.
+  """
+  # Allow integer booleans.
+  if isinstance(pred, int):
+    if pred == 1:
+      pred = True
+    elif pred == 0:
+      pred = False
+
+  if isinstance(pred, variables.Variable):
+    return None
+  return smart_module.smart_constant_value(pred)
+
+
+def is_tensor_or_tensor_list(v):
+  v = nest.flatten(v)
+  if v and isinstance(v[0], ops.Tensor):
+    return True
+  else:
+    return False
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors/ops reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    if isinstance(x, ops.Operation):
+      outputs = x.outputs[:] or []
+      outputs += x._control_outputs  # pylint: disable=protected-access
+    elif isinstance(x, ops.Tensor):
+      outputs = x.consumers()
+    elif isinstance(x, variables.Variable):
+      outputs = [x.op]
+    else:
+      raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
+
+
+def shape_type_conversion(fn):
+  """Decorator that handles tuple/TensorShape conversion.
+
+  Used in `compute_output_shape` and `build`.
+
+  Arguments:
+    fn: function to wrap.
+
+  Returns:
+    Wrapped function.
+  """
+
+  def wrapper(instance, input_shape):
+    if input_shape is not None:
+      if isinstance(input_shape, list):
+        input_shape = [
+            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
+      else:
+        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+    output_shape = fn(instance, input_shape)
+    if output_shape is not None:
+      if isinstance(output_shape, list):
+        return [tensor_shape.TensorShape(x) for x in output_shape]
+      return tensor_shape.TensorShape(output_shape)
+
+  return wrapper
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index 45c1b92075c50956fee004409e98898411e83d27..4761cece82c727e4962d0374f8efb80dfaeac3c6 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -120,7 +120,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
     layer_id = str(id(layer))
     for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
-      if node_key in model._container_nodes:
+      if node_key in model._network_nodes:  # pylint: disable=protected-access
         for inbound_layer in node.inbound_layers:
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
diff --git a/tensorflow/python/keras/datasets/fashion_mnist/__init__.py b/tensorflow/python/keras/datasets/fashion_mnist/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7f5ddecc4707334d52ebf4966f2ec6141cce0d46 100644
--- a/tensorflow/python/keras/datasets/fashion_mnist/__init__.py
+++ b/tensorflow/python/keras/datasets/fashion_mnist/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fashion-MNIST dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.datasets.fashion_mnist import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 84ee5040dcd7b118a5c63b6532135913fe238797..c7be8b918c11235b7316e125cd7a9796851ad083 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -49,6 +49,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
+from tensorflow.python.keras._impl.keras.layers.convolutional import DepthwiseConv2D
 
 # Image processing layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D
@@ -61,9 +62,6 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping2D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping3D
 
-# Convolutional-recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import ConvLSTM2D
-
 # Core layers.
 from tensorflow.python.keras._impl.keras.layers.core import Masking
 from tensorflow.python.keras._impl.keras.layers.core import Dropout
@@ -146,6 +144,13 @@ from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNN
 from tensorflow.python.keras._impl.keras.layers.recurrent import GRU
 from tensorflow.python.keras._impl.keras.layers.recurrent import LSTM
 
+# Convolutional-recurrent layers.
+from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import ConvLSTM2D
+
+# CuDNN recurrent layers.
+from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import CuDNNLSTM
+from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import CuDNNGRU
+
 # Wrapper functions
 from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
 from tensorflow.python.keras._impl.keras.layers.wrappers import Bidirectional
diff --git a/tensorflow/python/keras/preprocessing/image/__init__.py b/tensorflow/python/keras/preprocessing/image/__init__.py
index b96e7675527041d3952b049f5f431d3df36eea4c..6aba5fc8252e1acf604a89a4e66c2a7db080aa73 100644
--- a/tensorflow/python/keras/preprocessing/image/__init__.py
+++ b/tensorflow/python/keras/preprocessing/image/__init__.py
@@ -27,6 +27,7 @@ from tensorflow.python.keras._impl.keras.preprocessing.image import img_to_array
 from tensorflow.python.keras._impl.keras.preprocessing.image import Iterator
 from tensorflow.python.keras._impl.keras.preprocessing.image import load_img
 from tensorflow.python.keras._impl.keras.preprocessing.image import NumpyArrayIterator
+from tensorflow.python.keras._impl.keras.preprocessing.image import random_brightness
 from tensorflow.python.keras._impl.keras.preprocessing.image import random_channel_shift
 from tensorflow.python.keras._impl.keras.preprocessing.image import random_rotation
 from tensorflow.python.keras._impl.keras.preprocessing.image import random_shear
diff --git a/tensorflow/python/keras/preprocessing/sequence/__init__.py b/tensorflow/python/keras/preprocessing/sequence/__init__.py
index 112f6af5e588bcb2e85fdbecea86f402742d44e7..b7a7149cc40654c878e3c0db1fc78d8912abf498 100644
--- a/tensorflow/python/keras/preprocessing/sequence/__init__.py
+++ b/tensorflow/python/keras/preprocessing/sequence/__init__.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import make_sampling_table
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import skipgrams
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import TimeseriesGenerator
 
 del absolute_import
 del division
diff --git a/tensorflow/python/keras/preprocessing/text/__init__.py b/tensorflow/python/keras/preprocessing/text/__init__.py
index 5bf1a2fb21dc27f7aa10cd08b1496e3991c61d2f..000ad68a0c01e9067f8852836ba5d502deb3fcd4 100644
--- a/tensorflow/python/keras/preprocessing/text/__init__.py
+++ b/tensorflow/python/keras/preprocessing/text/__init__.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras._impl.keras.preprocessing.text import hashing_trick
 from tensorflow.python.keras._impl.keras.preprocessing.text import one_hot
 from tensorflow.python.keras._impl.keras.preprocessing.text import text_to_word_sequence
 from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 91cc8607274a80a14dd27a64274da7f8f0aafab1..2f74cf031d0520c8d874b7269c52e3b9e1b9931b 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -30,9 +30,9 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras._impl.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.keras._impl.keras.utils.training_utils import multi_gpu_model
 from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
 
 del absolute_import
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index d4ceb2e489c8a20d26eaf9d89b12992d2b8673d7..b4ff094cdfab489eba98d3ff5d6c596522cd4083 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -295,7 +295,6 @@ tf_py_test(
         "//tensorflow/python:nn_grad",
     ],
     data = ["//tensorflow/core:image_testdata"],
-    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -393,6 +392,7 @@ tf_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    shard_count = 5,
 )
 
 tf_py_test(
@@ -408,6 +408,7 @@ tf_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    shard_count = 5,
 )
 
 tf_py_test(
@@ -591,7 +592,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "matrix_solve_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["matrix_solve_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -712,6 +713,18 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "regex_replace_op_test",
+    size = "small",
+    srcs = ["regex_replace_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "save_restore_ops_test",
     size = "small",
@@ -904,6 +917,20 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "string_strip_op_test",
+    size = "small",
+    srcs = ["string_strip_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "substr_op_test",
     size = "small",
@@ -1016,12 +1043,14 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
@@ -1075,6 +1104,8 @@ cuda_py_test(
     tags = [
         "no_windows",
         "noasan",
+        "noguitar",
+        "notap",
     ],
 )
 
@@ -1126,7 +1157,6 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     data = ["//tensorflow/core:lmdb_testdata"],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -1174,6 +1204,37 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
+    tags = [
+        "noasan",  # times out
+    ],
+)
+
+cuda_py_test(
+    name = "broadcast_to_ops_test",
+    size = "small",
+    srcs = ["broadcast_to_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+cuda_py_test(
+    name = "inplace_ops_test",
+    size = "small",
+    srcs = ["inplace_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+    ],
+    shard_count = 10,
 )
 
 cuda_py_test(
@@ -1505,6 +1566,7 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:iterator_ops",
     ],
     grpc_enabled = True,
     tags = ["no_windows"],
@@ -1553,26 +1615,34 @@ cuda_py_test(
 
 cuda_py_test(
     name = "init_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["init_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    shard_count = 4,
+    tags = [
+        "noasan",
+        "notap",
+    ],
 )
 
 cuda_py_test(
     name = "linalg_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["linalg_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1892,7 +1962,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "softmax_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["softmax_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2313,7 +2383,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 4,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2444,7 +2513,6 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2504,7 +2572,10 @@ cuda_py_test(
         "//tensorflow/python:sparse_ops",
     ],
     shard_count = 5,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",  # b/77589990
+    ],
 )
 
 cuda_py_test(
@@ -2633,10 +2704,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
-    tags = [
-        "manual",
-        "notap",  # b/30226163
-    ],
 )
 
 cuda_py_test(
@@ -2705,7 +2772,9 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
+    data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2784,7 +2853,7 @@ sycl_py_test(
 
 tf_py_test(
     name = "sets_test",
-    size = "small",
+    size = "medium",
     srcs = ["sets_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2835,7 +2904,7 @@ tf_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
-    shard_count = 10,
+    shard_count = 20,
     tags = ["no_windows_gpu"],
 )
 
@@ -2892,14 +2961,36 @@ tf_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+tf_py_test(
+    name = "accumulate_n_test",
+    size = "small",
+    srcs = ["accumulate_n_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
+tf_py_test(
+    name = "accumulate_n_eager_test",
+    size = "small",
+    srcs = ["accumulate_n_eager_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+    ],
 )
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py b/tensorflow/python/kernel_tests/accumulate_n_eager_test.py
similarity index 72%
rename from tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
rename to tensorflow/python/kernel_tests/accumulate_n_eager_test.py
index 35974b9e21d2d7423777a95a99f51c9cb4b453b2..dc11b7deceb9040584aca1f629f4d003aef39428 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
+++ b/tensorflow/python/kernel_tests/accumulate_n_eager_test.py
@@ -12,48 +12,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for new version of accumulate_n op that will eventually go into
-`ops.math_ops`.
-
-These test cases spefically exercise the `eager` APIs. They need to be in a
-separate file from the remaining tests because eager mode is currently something
-you can turn on but can't turn off for the lifetime of the current process."""
+"""Tests for new version of accumulate_n op."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import accumulate_n_v2 as av2
-
 from tensorflow.python.eager import backprop
 
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-
 class AccumulateNV2EagerTest(test_util.TensorFlowTestCase):
-  """Tests of the new, differentiable version of accumulate_n"""
+  """Tests of the new, differentiable version of accumulate_n."""
 
   def testMinimalEagerMode(self):
     forty = constant_op.constant(40)
     two = constant_op.constant(2)
-    answer = av2.accumulate_n_v2([forty, two])
+    answer = math_ops.accumulate_n([forty, two])
     self.assertEqual(42, answer.numpy())
 
-
   def testFloat(self):
     np.random.seed(12345)
     x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(5)]
     tf_x = ops.convert_n_to_tensor(x)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(sum(x), av2.accumulate_n_v2(tf_x).numpy())
-      self.assertAllClose(x[0] * 5, av2.accumulate_n_v2([tf_x[0]] * 5).numpy())
+      self.assertAllClose(sum(x), math_ops.accumulate_n(tf_x).numpy())
+      self.assertAllClose(x[0] * 5,
+                          math_ops.accumulate_n([tf_x[0]] * 5).numpy())
 
   def testGrad(self):
     np.random.seed(42)
@@ -65,16 +58,14 @@ class AccumulateNV2EagerTest(test_util.TensorFlowTestCase):
     ]
 
     def fn(first, second, third):
-      return av2.accumulate_n_v2([first, second, third])
+      return math_ops.accumulate_n([first, second, third])
 
     grad_fn = backprop.gradients_function(fn)
     grad = grad_fn(input_vars[0], input_vars[1], input_vars[2])
-    self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
+    self.assertAllEqual(np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
                         [elem.numpy() for elem in grad])
 
 
-
 if __name__ == "__main__":
   ops.enable_eager_execution()
   test.main()
-
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/python/kernel_tests/accumulate_n_test.py
similarity index 75%
rename from tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
rename to tensorflow/python/kernel_tests/accumulate_n_test.py
index 45962098e93acfac414396ddbeaa847701ff2b4b..b793906fac2cd12a5c0c663dd169000ad6067759 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/python/kernel_tests/accumulate_n_test.py
@@ -12,42 +12,49 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for new version of accumulate_n op that will eventually go into
-`ops.math_ops`."""
+"""Tests for new version of accumulate_n op."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import accumulate_n_v2 as av2
-
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
 class AccumulateNV2Test(test_util.TensorFlowTestCase):
-  """Tests of the new, differentiable version of accumulate_n"""
+  """Tests of the new, differentiable version of accumulate_n."""
 
   def testFloat(self):
     np.random.seed(12345)
     x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(5)]
     tf_x = ops.convert_n_to_tensor(x)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(sum(x), av2.accumulate_n_v2(tf_x).eval())
-      self.assertAllClose(x[0] * 5, av2.accumulate_n_v2([tf_x[0]] * 5).eval())
+      self.assertAllClose(sum(x), math_ops.accumulate_n(tf_x).eval())
+      self.assertAllClose(x[0] * 5,
+                          math_ops.accumulate_n([tf_x[0]] * 5).eval())
 
   def testInt(self):
     np.random.seed(54321)
     x = [np.random.randint(-128, 128, (5, 4, 3, 2, 1)) for _ in range(6)]
     tf_x = ops.convert_n_to_tensor(x)
     with self.test_session(use_gpu=True):
-      self.assertAllEqual(sum(x), av2.accumulate_n_v2(tf_x).eval())
-      self.assertAllEqual(x[0] * 6, av2.accumulate_n_v2([tf_x[0]] * 6).eval())
+      self.assertAllEqual(sum(x), math_ops.accumulate_n(tf_x).eval())
+      self.assertAllEqual(x[0] * 6,
+                          math_ops.accumulate_n([tf_x[0]] * 6).eval())
+
+  def testUnknownShape(self):
+    with self.test_session(use_gpu=True):
+      x0 = array_ops.placeholder(dtype=dtypes_lib.int32, shape=[None])
+      acc = math_ops.accumulate_n([x0, x0], shape=[None])
+      self.assertAllEqual([2, 4], acc.eval(feed_dict={x0: [1, 2]}))
 
   def testGrad(self):
     np.random.seed(42)
@@ -55,9 +62,9 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=True) as sess:
         input_vars = [
             variables.Variable(10.0 * np.random.random())
-            for i in range(0, num_inputs)
+            for _ in range(0, num_inputs)
         ]
-        accum_n = av2.accumulate_n_v2(input_vars)
+        accum_n = math_ops.accumulate_n(input_vars)
         sess.run(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
         self.assertAllEqual(
@@ -77,7 +84,7 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
           ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
           for x in random_arrays
       ]
-      tf_val = av2.accumulate_n_v2(random_tensors)
+      tf_val = math_ops.accumulate_n(random_tensors)
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
@@ -86,7 +93,7 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
   def testZeroArgs(self):
     with self.test_session():
       with self.assertRaises(ValueError):
-        tf_val = av2.accumulate_n_v2([])
+        tf_val = math_ops.accumulate_n([])
         tf_val.eval()
 
   def testWrongShape(self):
@@ -94,28 +101,28 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         a = variables.Variable(0.2)
         b = variables.Variable(0.1)
-        tf_val = av2.accumulate_n_v2([a, b], shape=[2, 2])  # Should be shape=[]
+        math_ops.accumulate_n([a, b], shape=[2, 2])  # Should be shape=[]
 
   def testIncompatibleShapes(self):
     with self.test_session():
       with self.assertRaises(ValueError):
         a = variables.Variable(np.array([0.1, 0.2]))
         b = variables.Variable(np.array([[0.3], [0.4]]))
-        tf_val = av2.accumulate_n_v2([a, b])
+        math_ops.accumulate_n([a, b])
 
   def testWrongType(self):
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
         b = variables.Variable(0.1, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a, b], tensor_dtype=np.int32)
+        math_ops.accumulate_n([a, b], tensor_dtype=np.int32)
 
   def testWrongTypeOneInput(self):
     # Scenario that used to trigger a bug, even when testWrongType() worked
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a], tensor_dtype=np.int32)
+        math_ops.accumulate_n([a], tensor_dtype=np.int32)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 365cf72108de5a1e5e1eb47891a6ad64151add22..7acca0a4a09deb89c9453aa1389c422a2ed43d81 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -315,21 +315,39 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
             self.assertAllEqual(x_tf_4, np.asarray(x_np)[:, ::-1])
             self.assertAllEqual(x_tf_5, np.asarray(x_np)[::-1, ::-1])
 
+  # This test covers the axis validation in the shape function
+  # (no eval())
+  def testInvalidAxis(self):
+    x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    with self.assertRaisesRegexp(ValueError,
+                                 "is out of valid range"):
+      array_ops.reverse_v2(x_np, [-30])
+    with self.assertRaisesRegexp(ValueError,
+                                 "is out of valid range"):
+      array_ops.reverse_v2(x_np, [2])
+    with self.assertRaisesRegexp(ValueError,
+                                 "axis 0 specified more than once"):
+      array_ops.reverse_v2(x_np, [0, -2])
+
   # This is the version of reverse that uses axis indices rather than
   # bool tensors
   # TODO(b/32254538): Change this test to use array_ops.reverse
+  #
+  # Note: this test passes placeholder as constant axis is validated
+  # in shape function (see testInvalidAxis)
   def testInvalid(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    axis = array_ops.placeholder(dtypes.int32)
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "is out of valid range"):
-        array_ops.reverse_v2(x_np, [-30]).eval()
+        array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [-30]})
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "is out of valid range"):
-        array_ops.reverse_v2(x_np, [2]).eval()
+        array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [2]})
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "axis 0 specified more than once"):
-        array_ops.reverse_v2(x_np, [0, -2]).eval()
+        array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [0, -2]})
 
   def testReverse1DimAuto(self):
     for dtype in [
@@ -712,7 +730,7 @@ class GradSliceChecker(object):
     analytic_grad2 = 2 * slice_val
 
     dy = variables.Variable(
-        array_ops.ones(shape=slice_var.get_shape(), dtype=dtypes.int32))
+        array_ops.ones(shape=slice_var.get_shape(), dtype=dtypes.float32))
     assign = dy.assign(slice_var)
     slice_val_grad, = gradients_impl.gradients(slice_val, self.var, grad_ys=dy)
     slice_val_grad2, = gradients_impl.gradients(
@@ -737,7 +755,8 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
   def testGradient(self):
     with self.test_session(use_gpu=True) as sess:
       var = variables.Variable(
-          array_ops.reshape(math_ops.range(1, 97, 1), shape=(6, 4, 4)))
+          array_ops.reshape(
+              math_ops.range(1, 97, 1, dtype=dtypes.float32), shape=(6, 4, 4)))
       init = variables.global_variables_initializer()
       sess.run(init)
 
@@ -756,12 +775,20 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
 
   def testGradientZero(self):
     with self.test_session(use_gpu=True) as sess:
-      var = variables.Variable(8)
+      var = variables.Variable(8.)
       init = variables.global_variables_initializer()
       sess.run(init)
       grad = GradSliceChecker(self, sess, var, np.array(8))
       _ = grad[tuple()]
 
+  def testInt64Indices(self):
+    with self.test_session(use_gpu=True) as sess:
+      a = math_ops.range(3, dtype=dtypes.float32)
+      index = constant_op.constant(1, dtype=dtypes.int64)
+      b = 2. * a[index]
+      grad, = gradients_impl.gradients(b, a)
+      self.assertAllEqual(sess.run(grad), [0., 2., 0.])
+
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
   """Test varied index types and host located memory."""
@@ -890,7 +917,7 @@ class StridedSliceAssignChecker(object):
         var = resource_variable_ops.ResourceVariable(self.x)
       else:
         var = variables.Variable(self.x)
-      sess.run(variables.initialize_variables([var]))
+      sess.run(variables.variables_initializer([var]))
       val = sess.run(var[index].assign(value))
       # val_copy is used to check that tf.assign works equivalently to the
       # assign method above.
@@ -981,30 +1008,38 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testDenseShape(self):
-    with self.test_session():
-      t_value = [[0, 42], [24, 0]]
-      self.assertAllEqual((2, 2), array_ops.shape(t_value).eval())
-      self.assertEqual(4, array_ops.size(t_value).eval())
-      self.assertEqual(2, array_ops.rank(t_value).eval())
+    t_value = [[0, 42], [24, 0]]
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t_value)))
 
-      t = constant_op.constant(t_value)
-      self.assertAllEqual((2, 2), array_ops.shape(t).eval())
-      self.assertEqual(4, array_ops.size(t).eval())
-      self.assertEqual(2, array_ops.rank(t).eval())
+    t = constant_op.constant(t_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t)))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSparseShape(self):
-    with self.test_session():
-      sp_value = sparse_tensor.SparseTensorValue(
-          indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
-      self.assertAllEqual((2, 2), array_ops.shape(sp_value).eval())
-      self.assertEqual(4, array_ops.size(sp_value).eval())
-      self.assertEqual(2, array_ops.rank(sp_value).eval())
+    sp_value = sparse_tensor.SparseTensorValue(
+        indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp_value)))
 
-      sp = sparse_tensor.SparseTensor.from_value(sp_value)
-      self.assertAllEqual((2, 2), array_ops.shape(sp).eval())
-      self.assertEqual(4, array_ops.size(sp).eval())
-      self.assertEqual(2, array_ops.rank(sp).eval())
+    sp = sparse_tensor.SparseTensor.from_value(sp_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSizeDtype(self):
+    tensor = [1]
+    self.assertEqual(dtypes.int32, self.evaluate(array_ops.size(tensor)).dtype)
+    self.assertEqual(
+        dtypes.int64,
+        self.evaluate(array_ops.size(tensor, out_type=dtypes.int64)).dtype)
 
 
 @test_util.with_c_api
@@ -1024,6 +1059,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[True, False, False, False, False], [True, True, True, False, False],
            [True, True, False, False, False]])
 
+  @test_util.enable_c_shapes
   def testOneDimensionalDtypeWithoutMaxlen(self):
     with self.test_session():
       # test dtype and default maxlen:
@@ -1037,6 +1073,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           res.eval(),
           [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
+  @test_util.enable_c_shapes
   def testOneDimensionalWithoutMaxlen(self):
     with self.test_session():
       res = array_ops.sequence_mask(
@@ -1051,6 +1088,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
            [True, False, False, False],
            [True, True, True, True]])
 
+  @test_util.enable_c_shapes
   def testTwoDimensional(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
@@ -1223,7 +1261,7 @@ class SnapshotOpTest(test_util.TensorFlowTestCase):
     for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
       with self.test_session(use_gpu=True):
         x = constant_op.constant([0, 1, 2, 3], dtype=dtype)
-        y = gen_array_ops._snapshot(x)
+        y = gen_array_ops.snapshot(x)
         self.assertAllEqual(y.eval(), [0, 1, 2, 3])
 
 
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 2d1b3d9b7e836591646a2d0e59742bf6139446d1..0ef08581c9f931b991ef0c1218dc503345e248c2 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -83,14 +83,14 @@ class AtrousConvolutionTest(test.TestCase):
     checks = []
 
     def add_check(check, *args, **kwargs):
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         args_val, kwargs_val = self.evaluate([args, kwargs])
         check(*args_val, **kwargs_val)
       else:
         checks.append((check, args, kwargs))
 
     yield add_check
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       all_values = self.evaluate([[args, kwargs] for _, args, kwargs in checks])
       for (check, _, _), (args, kwargs) in zip(checks, all_values):
         check(*args, **kwargs)
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 405651e8ae97fbc5eefd4aba0a95a99ff8fd8c26..987a6ffcd4b18eb5857ff9e82206de7f6ebe8a27 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
+from tensorflow.python.ops.gen_array_ops import broadcast_gradient_args
 from tensorflow.python.platform import test
 
 
@@ -157,7 +157,7 @@ class BroadcastSimpleTest(test.TestCase):
 
   def _GetGradientArgs(self, xs, ys):
     with self.test_session(use_gpu=True) as sess:
-      return sess.run(_broadcast_gradient_args(xs, ys))
+      return sess.run(broadcast_gradient_args(xs, ys))
 
   def testBroadcast(self):
     r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
diff --git a/tensorflow/python/kernel_tests/batchtospace_op_test.py b/tensorflow/python/kernel_tests/batchtospace_op_test.py
index 0c802476a0e788aff3de84ab736fa8f1de5daab4..6143cd3baa6317fc512d80f94b494710037d4082 100644
--- a/tensorflow/python/kernel_tests/batchtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/batchtospace_op_test.py
@@ -44,7 +44,7 @@ class CppOpImpl(object):
 
   @staticmethod
   def batch_to_space(*args, **kwargs):
-    return gen_array_ops._batch_to_space(*args, **kwargs)
+    return gen_array_ops.batch_to_space(*args, **kwargs)
 
 
 class BatchToSpaceDepthToSpace(test.TestCase, PythonOpImpl):
diff --git a/tensorflow/python/kernel_tests/bcast_ops_test.py b/tensorflow/python/kernel_tests/bcast_ops_test.py
index 9e512346053a4c3af089170f47313606c4a307c2..3305e55c05bd03d31c46fd333db09dbab9a5d09c 100644
--- a/tensorflow/python/kernel_tests/bcast_ops_test.py
+++ b/tensorflow/python/kernel_tests/bcast_ops_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.ops.gen_array_ops import _broadcast_args
-from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
+from tensorflow.python.ops.gen_array_ops import broadcast_args
+from tensorflow.python.ops.gen_array_ops import broadcast_gradient_args
 from tensorflow.python.platform import test
 
 
@@ -29,11 +29,11 @@ class BcastOpsTest(test.TestCase):
 
   def _GetBroadcastShape(self, xs, ys):
     with self.test_session() as sess:
-      return sess.run(_broadcast_args(xs, ys))
+      return sess.run(broadcast_args(xs, ys))
 
   def _GetGradientArgs(self, xs, ys):
     with self.test_session() as sess:
-      return sess.run(_broadcast_gradient_args(xs, ys))
+      return sess.run(broadcast_gradient_args(xs, ys))
 
   def testBasic(self):
     r = self._GetBroadcastShape([2, 3, 5], [1])
diff --git a/tensorflow/python/kernel_tests/boosted_trees/BUILD b/tensorflow/python/kernel_tests/boosted_trees/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..30e6289420b36a75589ef25150480e48f8245ec2
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/BUILD
@@ -0,0 +1,76 @@
+# Description:
+#   Kernel tests for Boosted Trees.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_py_test(
+    name = "resource_ops_test",
+    size = "small",
+    srcs = ["resource_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
+tf_py_test(
+    name = "prediction_ops_test",
+    size = "small",
+    srcs = ["prediction_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_py_test(
+    name = "stats_ops_test",
+    size = "small",
+    srcs = ["stats_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "training_ops_test",
+    size = "small",
+    srcs = ["training_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resources",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/__init__.py b/tensorflow/python/kernel_tests/boosted_trees/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54f33f336015cc9cb50658941b8e157cc1b94df9
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -0,0 +1,916 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees prediction kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
+  """Tests prediction ops for training."""
+
+  def testCachedPredictionOnEmptyEnsemble(self):
+    """Tests that prediction on a dummy ensemble does not fail."""
+    with self.test_session() as session:
+      # Create a dummy ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # No previous cached values.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [0, 0]
+
+      # We have two features: 0 and 1. Values don't matter here on a dummy
+      # ensemble.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # Nothing changed.
+      self.assertAllClose(cached_tree_ids, new_tree_ids)
+      self.assertAllClose(cached_node_ids, new_node_ids)
+      self.assertAllClose([[0], [0]], logits_updates)
+
+  def testNoCachedPredictionButTreeExists(self):
+    """Tests that predictions are updated once trees are added."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, none were cached before.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [0, 0]
+
+      feature_0_values = [67, 5]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the first tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      self.assertAllClose([2, 1], new_node_ids)
+      self.assertAllClose([[0.1 * 8.79], [0.1 * 1.14]], logits_updates)
+
+  def testCachedPredictionIsCurrent(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+              original_leaf {
+                scalar: -2
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 0.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 2]
+
+      # We have two features: 0 and 1. Values don't matter because trees didn't
+      # change.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # Nothing changed.
+      self.assertAllClose(cached_tree_ids, new_tree_ids)
+      self.assertAllClose(cached_node_ids, new_node_ids)
+      self.assertAllClose([[0], [0]], logits_updates)
+
+  def testCachedPredictionFromTheSameTree(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+              original_leaf {
+                scalar: -2
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 7.14
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -5.875
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -2.075
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 0.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 0]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are still in the same tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      # When using the full tree, the first example will end up in node 4,
+      # the second in node 5.
+      self.assertAllClose([4, 5], new_node_ids)
+      # Full predictions for each instance would be 8.79 and -5.875,
+      # so an update from the previous cached values lr*(7.14 and -2) would be
+      # 1.65 and -3.875, and then multiply them by 0.1 (lr)
+      self.assertAllClose([[0.1 * 1.65], [0.1 * -3.875]], logits_updates)
+
+  def testCachedPredictionFromPreviousTree(self):
+    """Tests the predictions work when we have cache from previous trees."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+        }
+        tree_metadata {
+          is_finalized: true
+        }
+        tree_metadata {
+          is_finalized: true
+        }
+        tree_metadata {
+          is_finalized: false
+        }
+        tree_weights: 0.1
+        tree_weights: 0.1
+        tree_weights: 0.1
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 2.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 0]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+      # Example 1 will get to node 3 in tree 1 and node 2 of tree 2
+      # Example 2 will get to node 2 in tree 1 and node 1 of tree 2
+
+      # We are in the last tree.
+      self.assertAllClose([2, 2], new_tree_ids)
+      # When using the full tree, the first example will end up in node 4,
+      # the second in node 5.
+      self.assertAllClose([2, 1], new_node_ids)
+      # Example 1: tree 0: 8.79, tree 1: 5.0, tree 2: 5.0 = >
+      #            change = 0.1*(5.0+5.0)
+      # Example 2: tree 0: 1.14, tree 1: 7.0, tree 2: -7 = >
+      #            change= 0.1(1.14+7.0-7.0)
+      self.assertAllClose([[1], [0.114]], logits_updates)
+
+  def testCachedPredictionFromTheSameTreeWithPostPrunedNodes(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:0
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [0, 0, 0, 0, 0, 0]
+      # Leaves 3,4, 7 and 8 got deleted during post-pruning, leaves 5 and 6
+      # changed the ids to 3 and 4 respectively.
+      cached_node_ids = [3, 4, 5, 6, 7, 8]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [12, 17, 35, 36, 23, 11]
+      feature_1_values = [12, 12, 17, 18, 123, 24]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are still in the same tree.
+      self.assertAllClose([0, 0, 0, 0, 0, 0], new_tree_ids)
+      # Examples from leaves 3,4,7,8 should be in leaf 1, examples from leaf 5
+      # and 6 in leaf 3 and 4.
+      self.assertAllClose([1, 1, 3, 4, 1, 1], new_node_ids)
+
+      cached_values = [[0.08], [0.093], [0.0553], [0.0783], [0.15 + 0.08],
+                       [0.5 + 0.08]]
+      self.assertAllClose([[0.01], [0.01], [0.0553], [0.0783], [0.01], [0.01]],
+                          logits_updates + cached_values)
+
+  def testCachedPredictionFromThePreviousTreeWithPostPrunedNodes(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:0
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.55
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 4
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [0, 0, 0, 0, 0, 0]
+      # Leaves 3,4, 7 and 8 got deleted during post-pruning, leaves 5 and 6
+      # changed the ids to 3 and 4 respectively.
+      cached_node_ids = [3, 4, 5, 6, 7, 8]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [12, 17, 35, 36, 23, 11]
+      feature_1_values = [12, 12, 17, 18, 123, 24]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the last tree.
+      self.assertAllClose([1, 1, 1, 1, 1, 1], new_tree_ids)
+      # Examples from leaves 3,4,7,8 should be in leaf 1, examples from leaf 5
+      # and 6 in leaf 3 and 4 in tree 0. For tree 1, all of the examples are in
+      # the root node.
+      self.assertAllClose([0, 0, 0, 0, 0, 0], new_node_ids)
+
+      cached_values = [[0.08], [0.093], [0.0553], [0.0783], [0.15 + 0.08],
+                       [0.5 + 0.08]]
+      root = 0.55
+      self.assertAllClose([[root + 0.01], [root + 0.01], [root + 0.0553],
+                           [root + 0.0783], [root + 0.01], [root + 0.01]],
+                          logits_updates + cached_values)
+
+  def testCachedPredictionTheWholeTreeWasPruned(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.00
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: -6.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 5.0
+          }
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [
+          0,
+          0,
+      ]
+      # The predictions were cached in 1 and 2, both were pruned to the root.
+      cached_node_ids = [1, 2]
+
+      # We have two features: 0 and 1.These are not going to be used anywhere.
+      feature_0_values = [12, 17]
+      feature_1_values = [12, 12]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the last tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      self.assertAllClose([0, 0], new_node_ids)
+
+      self.assertAllClose([[-6.0], [5.0]], logits_updates)
+
+
+class PredictionOpsTest(test_util.TensorFlowTestCase):
+  """Tests prediction ops for inference."""
+
+  def testPredictionMultipleTree(self):
+    """Tests the predictions work when we have multiple trees."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6.0
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_weights: 1.0
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Example 1: tree 0: 1.14, tree 1: 5.0, tree 2: 5.0 = >
+      #            logit = 0.1*5.0+0.2*5.0+1*5
+      # Example 2: tree 0: 1.14, tree 1: 7.0, tree 2: -7 = >
+      #            logit= 0.1*1.14+0.2*7.0-1*7.0
+      expected_logits = [[6.114], [-5.486]]
+
+      # Do with parallelization, e.g. EVAL
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
+      # Do without parallelization, e.g. INFER - the result is the same
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5f0c22d6e042a28f54fea2d4505208a3f7258c0
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -0,0 +1,239 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees resource kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class ResourceOpsTest(test_util.TensorFlowTestCase):
+  """Tests resource_ops."""
+
+  def testCreate(self):
+    with self.test_session():
+      ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
+      resources.initialize_resources(resources.shared_resources()).run()
+      stamp_token = ensemble.get_stamp_token()
+      self.assertEqual(0, stamp_token.eval())
+      (_, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
+      self.assertEqual(0, num_trees.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
+
+  def testCreateWithProto(self):
+    with self.test_session():
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            bucketized_split {
+              threshold: 21
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 7.14
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6.54
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.305
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.525
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.145
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 6
+          last_layer_node_start: 16
+          last_layer_node_end: 19
+        }
+      """, ensemble_proto)
+      ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble',
+          stamp_token=7,
+          serialized_proto=ensemble_proto.SerializeToString())
+      resources.initialize_resources(resources.shared_resources()).run()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
+      self.assertEqual(7, stamp_token.eval())
+      self.assertEqual(2, num_trees.eval())
+      self.assertEqual(1, num_finalized_trees.eval())
+      self.assertEqual(6, num_attempted_layers.eval())
+      self.assertAllEqual([16, 19], nodes_range.eval())
+
+  def testSerializeDeserialize(self):
+    with self.test_session():
+      # Initialize.
+      ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
+      resources.initialize_resources(resources.shared_resources()).run()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
+      self.assertEqual(5, stamp_token.eval())
+      self.assertEqual(0, num_trees.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
+
+      # Deserialize.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.5
+        tree_metadata {
+          num_layers_grown: 4  # it's fake intentionally.
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 5
+          last_layer_node_start: 3
+          last_layer_node_end: 7
+        }
+      """, ensemble_proto)
+      with ops.control_dependencies([
+          ensemble.deserialize(
+              stamp_token=3,
+              serialized_proto=ensemble_proto.SerializeToString())
+      ]):
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         nodes_range) = ensemble.get_states()
+      self.assertEqual(3, stamp_token.eval())
+      self.assertEqual(1, num_trees.eval())
+      # This reads from metadata, not really counting the layers.
+      self.assertEqual(5, num_attempted_layers.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertAllEqual([3, 7], nodes_range.eval())
+
+
+      # Serialize.
+      new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      new_stamp_token, new_serialized = ensemble.serialize()
+      self.assertEqual(3, new_stamp_token.eval())
+      new_ensemble_proto.ParseFromString(new_serialized.eval())
+      self.assertProtoEquals(ensemble_proto, new_ensemble_proto)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0bb84e69a5ae29843123718574f919a9a8553e0
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -0,0 +1,340 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees stats kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.platform import googletest
+
+
+class StatsOpsTest(test_util.TensorFlowTestCase):
+  """Tests stats_ops."""
+
+  def testCalculateBestGainsWithoutRegularization(self):
+    """Testing Gain calculation without any regularization."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          min_node_weight=0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[0.004775, 0.41184], [0.02823, 0.41184]],
+                          sess.run(gains_list))
+      self.assertAllEqual([[1, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[-.416667], [.568966]], [[-.6], [-.75]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.592593], [-.75]], [[-.076923], [.568966]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithL2(self):
+    """Testing Gain calculation with L2."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.1,
+          tree_complexity=0.0,
+          min_node_weight=0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[0., 0.33931375], [0.01879096, 0.33931375]],
+                          sess.run(gains_list))
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithL1(self):
+    """Testing Gain calculation with L1."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      l1 = 0.1
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=l1,
+          l2=0.0,
+          tree_complexity=0.0,
+          min_node_weight=0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[[0.0], [0.3965517]], [[-0.4], [-0.5]]],
+                          sess.run(left_node_contribs_list))
+
+      self.assertAllClose([[[-0.3333333], [-0.5]], [[0.0], [0.396552]]],
+                          sess.run(right_node_contribs_list))
+
+      # Gain should also include an adjustment of the gradient by l1.
+      self.assertAllClose([[0.0, 0.191207], [0.01, 0.191207]],
+                          sess.run(gains_list))
+
+  def testCalculateBestGainsWithTreeComplexity(self):
+    """Testing Gain calculation with L2."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      l2 = 0.1
+      tree_complexity = 3.
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=l2,
+          tree_complexity=tree_complexity,
+          min_node_weight=0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+
+      self.assertAllClose([[-3., -2.66068625], [-2.98120904, -2.66068625]],
+                          sess.run(gains_list))
+
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithMinNodeWEight(self):
+    """Testing Gain calculation without any regularization."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .036], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .68], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .6], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .03], [-.4, .05], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          min_node_weight=1,
+          max_splits=max_splits)
+
+      # We can't split node 1 on feature 1 and node 2 on feature 2 because of
+      # the min node weight.
+      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
+      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllClose([[[0.4852941]], [[-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-0.75]], [[-0.014925]]],
+                          sess.run(right_node_contribs_list))
+
+  def testMakeStatsSummarySimple(self):
+    """Simple test for MakeStatsSummary."""
+    with self.test_session():
+      self.assertAllClose([[[[1., 5.], [2., 6.]], [[3., 7.], [4., 8.]]]],
+                          boosted_trees_ops.make_stats_summary(
+                              node_ids=[0, 0, 1, 1],
+                              gradients=[[1.], [2.], [3.], [4.]],
+                              hessians=[[5.], [6.], [7.], [8.]],
+                              bucketized_features_list=[[0, 1, 0, 1]],
+                              max_splits=2,
+                              num_buckets=2).eval())
+
+  def testMakeStatsSummaryAccumulate(self):
+    """Tests that Summary actually accumulates."""
+    with self.test_session():
+      max_splits = 3
+      num_buckets = 4
+      node_ids = [1, 1, 2, 2, 1, 1, 2, 0]
+      gradients = [[.1], [.2], [.3], [-.4], [-.05], [.06], [.07], [.08]]
+      hessians = [[.2], [.3], [.4], [.5], [.06], [.07], [.08], [.09]]
+
+      # Tests a single feature.
+      bucketized_features = [[3, 1, 2, 0, 1, 2, 0, 1]]
+      result = boosted_trees_ops.make_stats_summary(
+          node_ids, gradients, hessians, bucketized_features, max_splits,
+          num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
+      self.assertAllClose(
+          [[
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
+          ]],
+          result.eval())
+
+  def testMakeStatsSummaryMultipleFeatures(self):
+    """Tests that MakeStatsSummary works for multiple features."""
+    with self.test_session():
+      max_splits = 3
+      num_buckets = 4
+      node_ids = [1, 1, 2, 2, 1, 1, 2, 0]
+      gradients = [[.1], [.2], [.3], [-.4], [-.05], [.06], [.07], [.08]]
+      hessians = [[.2], [.3], [.4], [.5], [.06], [.07], [.08], [.09]]
+
+      # Tests multiple features.
+      # The output from another feature will stored be in 3rd dimension.
+      bucketized_features = [[3, 1, 2, 0, 1, 2, 0, 1], [0, 0, 0, 2, 2, 3, 3, 2]]
+      result = boosted_trees_ops.make_stats_summary(
+          node_ids, gradients, hessians, bucketized_features, max_splits,
+          num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
+      self.assertAllClose(
+          [
+              [
+                  [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0
+                  [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+                  [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
+              ],  # feature 0
+              [
+                  [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0
+                  [[.3, .5], [0., 0.], [-.05, .06], [.06, .07]],  # node 1
+                  [[.3, .4], [0., 0.], [-.4, .5], [.07, .08]],  # node 2
+              ],  # feature 1
+          ],
+          result.eval())
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6c004774746dd28a7b376eb2e0564e5b71e5b40
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -0,0 +1,1497 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees training kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
+  """Tests for growing tree ensemble from split candidates."""
+
+  def testGrowWithEmptyEnsemble(self):
+    """Test growing an empty ensemble."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_ids = [0, 2, 6]
+
+      # Prepare feature inputs.
+      # Note that features 1 & 3 have the same gain but different splits.
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([7.62], dtype=np.float32)
+      feature1_thresholds = np.array([52], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-4.375]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[7.143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([0.63], dtype=np.float32)
+      feature2_thresholds = np.array([23], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24]], dtype=np.float32)
+
+      # Feature split with the highest gain.
+      feature3_nodes = np.array([0], dtype=np.int32)
+      feature3_gains = np.array([7.65], dtype=np.float32)
+      feature3_thresholds = np.array([7], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-4.89]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[5.3]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          # Tree will be finalized now, since we will reach depth 1.
+          max_depth=1,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      # Note that since the tree is finalized, we added a new dummy tree.
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 6
+              threshold: 7
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.65
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.489
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.53
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testGrowExistingEnsembleTreeNotFinalized(self):
+    """Test growing an existing ensemble with the last tree not finalized."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # feature 1 only has a candidate for node 1, feature 2 has candidates
+      # for both nodes and feature 3 only has a candidate for node 2.
+
+      feature_ids = [0, 1, 0]
+
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([0.63, 2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([1.7], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          # tree is going to be finalized now, since we reach depth 2.
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect the split for node 1 to be chosen from feature 1 and
+      # the split for node 2 to be chosen from feature 2.
+      # The grown tree should be finalized as max tree depth is 2 and we have
+      # grown 2 layers.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            bucketized_split {
+              threshold: 21
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 0.714
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -0.4375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.114
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.879
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.5875
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.2075
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testGrowExistingEnsembleTreeFinalized(self):
+    """Test growing an existing ensemble with the last tree finalized."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+
+      feature_ids = [75]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          learning_rate=0.1,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+      session.run(grow_op)
+
+      # Expect a new tree added, with a split on feature 75
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+       trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testPrePruning(self):
+    """Test growing an existing ensemble with pre-pruning."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # For node 1, the best split is on feature 2 (gain -0.63), but the gain
+      # is negative so node 1 will not be split.
+      # For node 2, the best split is on feature 3, gain is positive.
+
+      feature_ids = [0, 1, 0]
+
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([-0.63, 2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([2.8], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.PRE_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect the split for node 1 to be chosen from feature 1 and
+      # the split for node 2 to be chosen from feature 2.
+      # The grown tree should not be finalized as max tree depth is 3 and
+      # it's only grown 2 layers.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.8
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.45
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.182
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: false
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 5
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testMetadataWhenCantSplitDueToEmptySplits(self):
+    """Test that the metadata is updated even though we can't split."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # feature 1 only has a candidate for node 1, feature 2 has candidates
+      # for both nodes and feature 3 only has a candidate for node 2.
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          max_depth=2,
+          # No splits are available.
+          feature_ids=[],
+          node_ids=[],
+          gains=[],
+          thresholds=[],
+          left_node_contribs=[],
+          right_node_contribs=[])
+      session.run(grow_op)
+
+      # Expect no new splits created, but attempted (global) stats updated. Meta
+      # data for this tree should not be updated (we didn't succeed building a
+      # layer. Node ranges don't change.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testMetadataWhenCantSplitDuePrePruning(self):
+    """Test metadata is updated correctly when no split due to prepruning."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      feature_ids = [0, 1, 0]
+
+      # All the gains are negative.
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([-0.63, -2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([-2.8], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.PRE_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect that no new split was created because all the gains were negative
+      # Global metadata should be updated, tree metadata should not be updated.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testPostPruningOfSomeNodes(self):
+    """Test growing an ensemble with post-pruning."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs.
+      # Second feature has larger (but still negative gain).
+      feature_ids = [0, 1]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.3], dtype=np.float32)
+      feature1_thresholds = np.array([7], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.013]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([-0.2], dtype=np.float32)
+      feature2_thresholds = np.array([33], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[0.01]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from second features to be chosen despite the negative
+      # gain.
+      # No pruning happened just yet.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0143
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+      # Prepare the second layer.
+      # Note that node 1 gain is negative and node 2 gain is positive.
+      feature_ids = [3]
+      feature1_nodes = np.array([1, 2], dtype=np.int32)
+      feature1_gains = np.array([-0.2, 0.5], dtype=np.float32)
+      feature1_thresholds = np.array([7, 5], dtype=np.int32)
+      feature1_left_node_contribs = np.array(
+          [[0.07], [0.041]], dtype=np.float32)
+      feature1_right_node_contribs = np.array(
+          [[0.083], [0.064]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+
+      # After adding this layer, the tree will not be finalized
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 7
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: -0.2
+              original_leaf {
+                scalar: 0.01
+               }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 5
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.08
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.093
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+                scalar: 0.0783
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 7
+        }
+       """
+      self.assertEqual(new_stamp, 2)
+
+      self.assertProtoEquals(expected_result, res_ensemble)
+      # Now split the leaf 3, again with negative gain. After this layer, the
+      # tree will be finalized, and post-pruning happens. The leafs 3,4,7,8 will
+      # be pruned out.
+
+      # Prepare the third layer.
+      feature_ids = [92]
+      feature1_nodes = np.array([3], dtype=np.int32)
+      feature1_gains = np.array([-0.45], dtype=np.float32)
+      feature1_thresholds = np.array([11], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.15]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.5]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+      # After adding this layer, the tree will be finalized
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+      # Node that nodes 3, 4, 7 and 8 got deleted, so metadata stores has ids
+      # mapped to their parent node 1, with the respective change in logits.
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+          last_layer_node_start: 0
+          last_layer_node_end: 1
+        }
+       """
+      self.assertEqual(new_stamp, 3)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+  def testPostPruningOfAllNodes(self):
+    """Test growing an ensemble with post-pruning, with all nodes are pruned."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs. All have negative gains.
+      feature_ids = [0, 1]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.3], dtype=np.float32)
+      feature1_thresholds = np.array([7], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.013]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([-0.62], dtype=np.float32)
+      feature2_thresholds = np.array([33], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[0.01]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from feature 2 to be chosen despite the negative gain.
+      # The grown tree should not be finalized as max tree depth is 2 so no
+      # pruning occurs.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0143
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+      # Prepare inputs.
+      # All have negative gain.
+      feature_ids = [3]
+      feature1_nodes = np.array([1, 2], dtype=np.int32)
+      feature1_gains = np.array([-0.2, -0.5], dtype=np.float32)
+      feature1_thresholds = np.array([77, 79], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.023], [0.3]], dtype=np.float32)
+      feature1_right_node_contribs = np.array(
+          [[0.012343], [24]], dtype=np.float32)
+
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+
+      # Expect the split from feature 1 to be chosen despite the negative gain.
+      # The grown tree should be finalized. Since all nodes have negative gain,
+      # the whole tree is pruned.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      # Expect the ensemble to be empty as post-pruning will prune
+      # the entire finalized tree.
+      self.assertEqual(new_stamp, 2)
+      self.assertProtoEquals(
+          """
+      trees {
+        nodes {
+          leaf {
+          }
+        }
+      }
+      trees {
+        nodes {
+          leaf {
+          }
+        }
+      }
+      tree_weights: 1.0
+      tree_weights: 1.0
+      tree_metadata{
+        num_layers_grown: 2
+        is_finalized: true
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: 0.0
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.01
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.0143
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.033
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.022343
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.3143
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -24.0143
+        }
+      }
+      tree_metadata {
+      }
+      growing_metadata {
+        num_trees_attempted: 1
+        num_layers_attempted: 2
+        last_layer_node_start: 0
+        last_layer_node_end: 1
+      }
+      """, res_ensemble)
+
+  def testPostPruningChangesNothing(self):
+    """Test growing an ensemble with post-pruning with all gains >0."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs.
+      # Second feature has larger (but still negative gain).
+      feature_ids = [3, 4]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([7.62], dtype=np.float32)
+      feature1_thresholds = np.array([52], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-4.375]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[7.143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([0.63], dtype=np.float32)
+      feature2_thresholds = np.array([23], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=1,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from the first feature to be chosen.
+      # Pruning got triggered but changed nothing.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 52
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.143
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a1bd958ba89080ff38e461646b07edbc6daec21
--- /dev/null
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -0,0 +1,85 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for broadcast_to ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class BroadcastToTest(test_util.TensorFlowTestCase):
+
+  def testBroadcastToBasic(self):
+    for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3], dtype=dtype)
+        v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToString(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([b"1", b"2", b"3"])
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToBool(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([True, False, True], dtype=np.bool)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShape(self):
+    for input_dim in range(1, 6):
+      for output_dim in range(input_dim, 6):
+        with self.test_session(use_gpu=True):
+          input_shape = [2] * input_dim
+          output_shape = [2] * output_dim
+          x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+          v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+          v_np = np.broadcast_to(x, output_shape)
+          self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToScalar(self):
+    with self.test_session(use_gpu=True):
+      x = np.array(1, dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShapeTypeAndInference(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3])
+        v_tf = array_ops.broadcast_to(
+            constant_op.constant(x),
+            constant_op.constant([3, 3], dtype=dtype))
+        shape = v_tf.get_shape().as_list()
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+        # check shape inference when shape input is constant
+        self.assertAllEqual(shape, v_np.shape)
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 2e94603a3f3d4ca9074320cfb4e9bf06b6640e82..5a83ec8d302b4c26aef7abfa7465eb9fd0cca019 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -102,17 +102,15 @@ class AssertEqualTest(test.TestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
       check_ops.assert_equal(static_big, static_small, message="fail")
 
-    # Dynamic check
-    if context.in_graph_mode():
-      with self.test_session():
-        small = array_ops.placeholder(dtypes.int32, name="small")
-        big = array_ops.placeholder(dtypes.int32, name="big")
-        with ops.control_dependencies(
-            [check_ops.assert_equal(
-                big, small, message="fail")]):
-          out = array_ops.identity(small)
-        with self.assertRaisesOpError("fail.*big.*small"):
-          out.eval(feed_dict={small: [1, 2], big: [3, 4]})
+  def test_raises_when_greater_dynamic(self):
+    with self.test_session():
+      small = array_ops.placeholder(dtypes.int32, name="small")
+      big = array_ops.placeholder(dtypes.int32, name="big")
+      with ops.control_dependencies(
+          [check_ops.assert_equal(big, small, message="fail")]):
+        out = array_ops.identity(small)
+      with self.assertRaisesOpError("fail.*big.*small"):
+        out.eval(feed_dict={small: [1, 2], big: [3, 4]})
 
   def test_error_message_eager(self):
     expected_error_msg_full = r"""big does not equal small
@@ -182,15 +180,14 @@ First 2 elements of y:
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
       check_ops.assert_equal(static_big, static_small, message="fail")
 
-    # Dynamic check
-    if context.in_graph_mode():
-      with self.test_session():
-        small = array_ops.placeholder(dtypes.int32, name="small")
-        big = array_ops.placeholder(dtypes.int32, name="big")
-        with ops.control_dependencies([check_ops.assert_equal(small, big)]):
-          out = array_ops.identity(small)
-        with self.assertRaisesOpError("small.*big"):
-          out.eval(feed_dict={small: [3, 1], big: [4, 2]})
+  def test_raises_when_less_dynamic(self):
+    with self.test_session():
+      small = array_ops.placeholder(dtypes.int32, name="small")
+      big = array_ops.placeholder(dtypes.int32, name="big")
+      with ops.control_dependencies([check_ops.assert_equal(small, big)]):
+        out = array_ops.identity(small)
+      with self.assertRaisesOpError("small.*big"):
+        out.eval(feed_dict={small: [3, 1], big: [4, 2]})
 
   @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal_and_broadcastable_shapes(self):
@@ -215,6 +212,12 @@ First 2 elements of y:
         out = array_ops.identity(small)
       self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_not_equal_and_broadcastable_shapes(self):
+    cond = constant_op.constant([True, False], name="small")
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
+      check_ops.assert_equal(cond, False, message="fail")
+
   @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
     larry = constant_op.constant([])
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index a786d0a47e569f71812086fb93c21dc12660a2a5..7f147ba53a71539962f424158731e359724f664f 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -50,7 +50,7 @@ class GenerateVocabRemappingTest(test.TestCase):
 
   def test_generate_remapping_with_no_vocab_changes(self):
     """Tests where vocab does not change at all."""
-    remapping, num_present = gen_checkpoint_ops._generate_vocab_remapping(
+    remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping(
         new_vocab_file=self.old_vocab_file,
         old_vocab_file=self.old_vocab_file,
         num_new_vocab=3,
@@ -63,7 +63,7 @@ class GenerateVocabRemappingTest(test.TestCase):
 
   def test_generate_remapping_with_shifted_vocab(self):
     """Tests where vocab is the same, but shifted / ordered differently."""
-    remapping, num_present = gen_checkpoint_ops._generate_vocab_remapping(
+    remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping(
         new_vocab_file=self.new_vocab_file,
         old_vocab_file=self.old_vocab_file,
         num_new_vocab=3,
@@ -76,7 +76,7 @@ class GenerateVocabRemappingTest(test.TestCase):
 
   def test_generate_remapping_with_offset(self):
     """Tests offset and num_new_vocab logic."""
-    remapping, num_present = gen_checkpoint_ops._generate_vocab_remapping(
+    remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping(
         new_vocab_file=self.new_vocab_file,
         old_vocab_file=self.old_vocab_file,
         num_new_vocab=1,
@@ -89,7 +89,7 @@ class GenerateVocabRemappingTest(test.TestCase):
 
   def test_generate_remapping_with_old_vocab_size(self):
     """Tests where old_vocab_size is specified."""
-    remapping, num_present = gen_checkpoint_ops._generate_vocab_remapping(
+    remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping(
         new_vocab_file=self.new_vocab_file,
         old_vocab_file=self.old_vocab_file,
         num_new_vocab=3,
@@ -132,7 +132,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
 
     # No column remapping, new weight matrix has second row, then first row.
     row_remapping = [1, 0]
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=row_remapping,
@@ -147,7 +147,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     # No row remapping, new weight matrix has third col, then first col.
     row_remapping = list(range(self.old_num_rows))
     col_remapping = [2, 0]
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=row_remapping,
@@ -162,7 +162,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     # Both row and column remappings.
     row_remapping = [1, 0, 4]
     col_remapping = [1, 15]
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=row_remapping,
@@ -177,7 +177,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
   def test_load_and_remap_with_init(self):
     """Tests the op's load and remap where there are missing entries."""
     init_val = 42
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=[2, -1, 0],
@@ -196,7 +196,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     """Tests when all the rows are missing and need to be initialized."""
     num_rows = 7
     initializing_values = [42] * num_rows * self.old_num_cols
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=[-1] * num_rows,
@@ -214,7 +214,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     num_rows = 7
     num_cols = 4
     initializing_values = [42] * num_rows * num_cols
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=[-1] * num_rows,
@@ -235,7 +235,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     invalid_remapping = [1, 0, 0, 0, 1, 2]
 
     # Invalid row remapping.
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=invalid_remapping,
@@ -247,7 +247,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
       remapped_matrix.eval()
 
     # Invalid column remapping.
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=list(range(self.old_num_rows)),
@@ -260,7 +260,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
 
   def test_load_and_remap_incorrect_initializing_values(self):
     """Tests that errors are raised with incorrect number of init values."""
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=[2, -1, 0],
@@ -275,7 +275,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.test_session(), self.assertRaises(errors.InvalidArgumentError):
       remapped_matrix.eval()
 
-    remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+    remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
         old_tensor_name=self.old_tensor_name,
         row_remapping=[2, -1, 0],
@@ -314,7 +314,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
       num_rows, num_cols = np_value.shape
 
       # Tests loading the entire tensor (except reversed).
-      remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+      remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
           ckpt_path=ckpt_path,
           old_tensor_name=old_tensor_name,
           # Simply reverses the rows of the matrix.
@@ -332,7 +332,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
       self.assertGreater(num_rows, 2)
       prefix_rows = 2
       suffix_rows = 3
-      remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+      remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
           ckpt_path=ckpt_path,
           old_tensor_name=old_tensor_name,
           # Reverses the rows of the matrix, then prepends and appends
@@ -353,7 +353,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
       # Tests when everything is taken from initializing_values.
       new_rows = 7
       initializing_values = [42] * new_rows * num_cols
-      remapped_matrix = gen_checkpoint_ops._load_and_remap_matrix(
+      remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
           ckpt_path=ckpt_path,
           old_tensor_name=old_tensor_name,
           # Nothing is loaded from the old tensor.
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 5c8b71da174b8c38a797f8bf97c432d732d9978f..e08123b0417912c479476d8147d832d1715b8882 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -19,16 +19,33 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
 
 
 class ClipTest(test.TestCase):
 
+  def DISABLED_testClipByValueGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs_1 = clip_ops.clip_by_value(inputs, 0.5, 3.5)
+    min_val = constant_op.constant([0.5, 0.5, 0.5, 0.5], dtype=dtypes.float32)
+    max_val = constant_op.constant([3.5, 3.5, 3.5, 3.5], dtype=dtypes.float32)
+    outputs_2 = clip_ops.clip_by_value(inputs, min_val, max_val)
+    with self.test_session():
+      error_1 = gradient_checker.compute_gradient_error(inputs, [4], outputs_1,
+                                                        [4])
+      self.assertLess(error_1, 1e-4)
+
+      error_2 = gradient_checker.compute_gradient_error(inputs, [4], outputs_2,
+                                                        [4])
+      self.assertLess(error_2, 1e-4)
+
   # ClipByValue test
   def testClipByValue(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
@@ -37,8 +54,76 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
+  # [Tensor, Scalar, Scalar]
+  def DISABLED_testClipByValue0Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = 2
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Scalar]
+  def DISABLED_testClipByValue1Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = constant_op.constant(
+            [2, 2, 2, 3, 3, 3], shape=[2, 3], dtype=dtype)
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Scalar, Tensor]
+  def DISABLED_testClipByValue2Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[4, 4, 4], [4, 5, 6]]
+        clip_value_min = 4
+        clip_value_max = constant_op.constant(
+            [6, 6, 6, 6, 6, 6], shape=[2, 3], dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Tensor]
+  def DISABLED_testClipByValue3Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [5, 5, 6]]
+        clip_value_min = constant_op.constant(
+            [2, 2, 2, 5, 5, 5], shape=[2, 3], dtype=dtype)
+        clip_value_max = constant_op.constant(
+            [5, 5, 5, 7, 7, 7], shape=[2, 3], dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
   def testClipByValueBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -48,6 +133,7 @@ class ClipTest(test.TestCase):
         _ = clip_ops.clip_by_value(x, 1.0, clip)
 
   def testClipByValueNonFinite(self):
+    # TODO(b/78016351): Enable test on GPU once the bug is fixed.
     with self.test_session():
       x = constant_op.constant([float('NaN'), float('Inf'), -float('Inf')])
       np_ans = [float('NaN'), 4.0, -4.0]
@@ -60,7 +146,7 @@ class ClipTest(test.TestCase):
   # ClipByNorm tests
   def testClipByNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
@@ -76,7 +162,7 @@ class ClipTest(test.TestCase):
     self.assertAllClose(np_ans, tf_ans_tensor)
 
   def testClipByNormBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -85,7 +171,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -97,7 +183,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
@@ -109,7 +195,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim0(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
@@ -121,7 +207,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim1(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
@@ -133,7 +219,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClippedWithAxes(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
@@ -146,7 +232,7 @@ class ClipTest(test.TestCase):
   # ClipByGlobalNorm tests
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -167,7 +253,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -188,7 +274,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -211,7 +297,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = ops.IndexedSlices(
           constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@@ -244,7 +330,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -263,7 +349,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([0.0, 0.0])
       # Norm = 0, no changes
@@ -282,7 +368,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -294,7 +380,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClippedTensor(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -306,7 +392,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -318,7 +404,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormZero(self):
     # No norm clipping when average clip_norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Average norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 127bc6bb20ae6b415da94672de68cc4b8ceaa287..c22934ce47543ab11b6a5b9acde2e2ec3aec9da7 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -526,7 +526,7 @@ class ConcatOpTest(test.TestCase):
     with self.test_session(use_gpu=True):
       t1 = []
       t2 = []
-      output = gen_array_ops._concat_v2([t1, t2], 0).eval()
+      output = gen_array_ops.concat_v2([t1, t2], 0).eval()
       self.assertFalse(output)  # Checks that output is empty
 
   def testConcatInvalidAxis(self):
@@ -534,20 +534,20 @@ class ConcatOpTest(test.TestCase):
       with self.test_session(use_gpu=True):
         t1 = [1]
         t2 = [2]
-        gen_array_ops._concat_v2([t1, t2], 1).eval()
+        gen_array_ops.concat_v2([t1, t2], 1).eval()
 
   def testConcatNegativeAxis(self):
     with self.test_session(use_gpu=True):
       t1 = [[1, 2, 3], [4, 5, 6]]
       t2 = [[7, 8, 9], [10, 11, 12]]
 
-      c = gen_array_ops._concat_v2([t1, t2], -2)
+      c = gen_array_ops.concat_v2([t1, t2], -2)
       self.assertEqual([4, 3], c.get_shape().as_list())
       output = c.eval()
       self.assertAllEqual([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
                           output)
 
-      c = gen_array_ops._concat_v2([t1, t2], -1)
+      c = gen_array_ops.concat_v2([t1, t2], -1)
       self.assertEqual([2, 6], c.get_shape().as_list())
       output = c.eval()
       self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
@@ -606,6 +606,17 @@ class ConcatOpTest(test.TestCase):
           inp_tensors_placeholders, -2, output_shape=[2, 3],
           gather_indexes=[2, 0], feed_dict=feed_dict)
 
+  def testConcatAxisType(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        t1 = [[1, 2, 3], [4, 5, 6]]
+        t2 = [[7, 8, 9], [10, 11, 12]]
+
+        c = gen_array_ops.concat_v2([t1, t2],
+                                    constant_op.constant(1, dtype=dtype))
+        self.assertEqual([2, 6], c.get_shape().as_list())
+        output = c.eval()
+        self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 class ConcatOffsetTest(test.TestCase):
 
@@ -615,7 +626,7 @@ class ConcatOffsetTest(test.TestCase):
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1, s2])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
       ans = sess.run(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
@@ -624,7 +635,7 @@ class ConcatOffsetTest(test.TestCase):
       cdim = constant_op.constant(1, dtypes.int32)
       s0 = constant_op.constant([[2, 3, 5]], dtypes.int32)
       s1 = constant_op.constant([[2, 7, 5]], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1])
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    r"should be a vector"):
         sess.run(off)
@@ -634,7 +645,7 @@ class ConcatOffsetTest(test.TestCase):
       cdim = constant_op.constant(4, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1])
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    r"Concat dim is out of range: 4 vs. 3"):
         sess.run(off)
@@ -644,7 +655,7 @@ class ConcatOffsetTest(test.TestCase):
       cdim = constant_op.constant(1, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5, 10], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1])
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    r"should contain 3 elem"):
         sess.run(off)
@@ -654,7 +665,7 @@ class ConcatOffsetTest(test.TestCase):
       cdim = constant_op.constant(1, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 10], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1])
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
           r"All dimensions except 1 must match. Input 1 has shape \[2 7 10\] "
@@ -667,7 +678,7 @@ class ConcatOffsetTest(test.TestCase):
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1, s2])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
       ans = sess.run(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
@@ -675,7 +686,7 @@ class ConcatOffsetTest(test.TestCase):
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([1, 3, 5], dtypes.int32)
       s2 = constant_op.constant([3, 3, 5], dtypes.int32)
-      off = gen_array_ops._concat_offset(cdim, [s0, s1, s2])
+      off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
       ans = sess.run(off)
       self.assertAllEqual(ans, [[0, 0, 0], [2, 0, 0], [3, 0, 0]])
 
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index 670a625f0f1dd84c523de8acb17f9a410d184ad5..79e419867d70071280b7c88b6bfa820b935b24cd 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -104,11 +105,7 @@ class ConfusionMatrixTest(test.TestCase):
       d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, s: 1.0})
 
       truth = np.zeros([2, 2], dtype=np_dtype)
-      try:
-        range_builder = xrange
-      except NameError:  # In Python 3.
-        range_builder = range
-      for i in range_builder(len(d)):
+      for i in xrange(len(d)):
         truth[l[i], d[i]] += 1
 
       self.assertEqual(cm_out.dtype, np_dtype)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 16e56349c45dd56a335f6f881826d975e24bd110..107ee37fabbae56c5bf715e1e7953b62ac3c526b 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import logging_ops
@@ -64,6 +65,11 @@ class ConstantTest(test.TestCase):
     self._testCpu(x)
     self._testGpu(x)
 
+  def testInvalidDType(self):
+    # Test case for GitHub issue 18474
+    with self.assertRaises(TypeError):
+      constant_op.constant(dtypes_lib.string, "[,]")
+
   def testBFloat16(self):
     bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
@@ -180,6 +186,11 @@ class ConstantTest(test.TestCase):
           shape=[2, 3, 5])
     self.assertEqual(c.get_shape(), [2, 3, 5])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testEagerMemory(self):
+    """Tests PyObject refs are managed correctly when executing eagerly."""
+    constant_op.constant([[1.]])
+
   def testImplicitShapeNumPy(self):
     with ops.Graph().as_default():
       c = constant_op.constant(
@@ -647,12 +658,12 @@ class FillTest(test.TestCase):
     self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex64(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex64)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex64)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex128(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex128)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex128)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillString(self):
     np_ans = np.array([[b"yolo"] * 3] * 2)
@@ -875,7 +886,7 @@ versions {
 class PlaceholderWithDefaultTest(test.TestCase):
 
   def testFullShape(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([[2, 2], [2, 2]], shape=[2, 2])
       a = array_ops.identity(p)
       self.assertAllEqual([[2, 2], [2, 2]], a.eval())
@@ -886,7 +897,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
         a.eval(feed_dict={p: [[6, 6, 6], [6, 6, 6]]})
 
   def testPartialShape(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([1, 2, 3], shape=[None])
       a = array_ops.identity(p)
       self.assertAllEqual([1, 2, 3], a.eval())
@@ -896,7 +907,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
         a.eval(feed_dict={p: [[2, 2], [2, 2]]})
 
   def testNoShape(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([17], shape=None)
       a = array_ops.identity(p)
       self.assertAllEqual([17], a.eval())
@@ -905,11 +916,12 @@ class PlaceholderWithDefaultTest(test.TestCase):
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
   def testGradient(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       x = array_ops.placeholder(dtypes_lib.float32, [5, 7])
       y = array_ops.placeholder_with_default(x, None)
       err = gradient_checker.compute_gradient_error(x, [5, 7], y, [5, 7])
       self.assertLess(err, 1e-3)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 15ff0ec09b65a8ba242473fb7b25ee00424e0926..77e6f5f1a0d645101584f0eeec2ced80459197ba 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -144,7 +144,7 @@ class ControlFlowTest(test.TestCase):
 
       enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True)
       nine = constant_op.constant(9)
-      enter_nine = gen_control_flow_ops._enter(nine, "foo_1")
+      enter_nine = gen_control_flow_ops.enter(nine, "foo_1")
       op = state_ops.assign(enter_v, enter_nine)
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
@@ -164,9 +164,9 @@ class ControlFlowTest(test.TestCase):
   def testEnterMulExit(self):
     with self.test_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
-      enter_data = gen_control_flow_ops._enter(data, "foo_1", False)
+      enter_data = gen_control_flow_ops.enter(data, "foo_1", False)
       five = constant_op.constant(5)
-      enter_five = gen_control_flow_ops._enter(five, "foo_1", False)
+      enter_five = gen_control_flow_ops.enter(five, "foo_1", False)
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
@@ -178,12 +178,12 @@ class ControlFlowTest(test.TestCase):
       v = variables.Variable([0.0, 0.0], dtype=dtypes.float32)
 
       # If is_constant=True, the shape information should be propagated.
-      enter_v_constant = gen_control_flow_ops._enter(
+      enter_v_constant = gen_control_flow_ops.enter(
           v, "frame1", is_constant=True)
       self.assertEqual(enter_v_constant.shape, [2])
 
       # Otherwise, the shape should be unknown.
-      enter_v_non_constant = gen_control_flow_ops._enter(
+      enter_v_non_constant = gen_control_flow_ops.enter(
           v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
@@ -257,8 +257,8 @@ class ControlFlowTest(test.TestCase):
       false = ops.convert_to_tensor(False)
       n = constant_op.constant(10)
 
-      enter_false = gen_control_flow_ops._enter(false, "foo_1", False)
-      enter_n = gen_control_flow_ops._enter(n, "foo_1", False)
+      enter_false = gen_control_flow_ops.enter(false, "foo_1", False)
+      enter_n = gen_control_flow_ops.enter(n, "foo_1", False)
 
       merge_n = control_flow_ops.merge([enter_n, enter_n], name="merge_n")[0]
       switch_n = control_flow_ops.switch(merge_n, enter_false)
@@ -275,9 +275,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
-      enter_one = gen_control_flow_ops._enter(one, "foo", True)
-      enter_n = gen_control_flow_ops._enter(n, "foo", True)
+      enter_i = gen_control_flow_ops.enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops.enter(one, "foo", True)
+      enter_n = gen_control_flow_ops.enter(n, "foo", True)
 
       with ops.device(test.gpu_device_name()):
         merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
@@ -301,9 +301,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
-      enter_one = gen_control_flow_ops._enter(one, "foo", True)
-      enter_n = gen_control_flow_ops._enter(n, "foo", True)
+      enter_i = gen_control_flow_ops.enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops.enter(one, "foo", True)
+      enter_n = gen_control_flow_ops.enter(n, "foo", True)
 
       merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
 
@@ -324,8 +324,8 @@ class ControlFlowTest(test.TestCase):
   def testDifferentFrame(self):
     with self.test_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
-      enter_1 = gen_control_flow_ops._enter(data, "foo_1", False)
-      enter_2 = gen_control_flow_ops._enter(data, "foo_2", False)
+      enter_1 = gen_control_flow_ops.enter(data, "foo_1", False)
+      enter_2 = gen_control_flow_ops.enter(data, "foo_2", False)
       res = math_ops.add(enter_1, enter_2)
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
@@ -552,7 +552,7 @@ class ControlFlowTest(test.TestCase):
 
   def testCondRef(self):
     with self.test_session():
-      x = gen_state_ops._variable(
+      x = gen_state_ops.variable(
           shape=[1],
           dtype=dtypes.float32,
           name="x",
@@ -580,7 +580,7 @@ class ControlFlowTest(test.TestCase):
 
   def testUninitializedRefIdentity(self):
     with self.test_session() as sess:
-      v = gen_state_ops._variable(
+      v = gen_state_ops.variable(
           shape=[1],
           dtype=dtypes.float32,
           name="v",
@@ -591,10 +591,10 @@ class ControlFlowTest(test.TestCase):
       # Both v_f and v_t are uninitialized references. However, an actual use
       # of the reference in the 'true' branch in the 'tf.identity' op will
       # not 'fire' when v is uninitialized, so this is a valid construction.
-      # This test tests that _ref_identity allows uninitialized ref as input
+      # This test tests that ref_identity allows uninitialized ref as input
       # so that this construction is allowed.
-      v_f_op = gen_array_ops._ref_identity(v_f)
-      v_t_op = gen_array_ops._ref_identity(v_t)
+      v_f_op = gen_array_ops.ref_identity(v_f)
+      v_t_op = gen_array_ops.ref_identity(v_t)
       with ops.control_dependencies([v_f_op]):
         assign_v = state_ops.assign(v, [1.0])
       with ops.control_dependencies([v_t_op]):
@@ -633,7 +633,8 @@ class ControlFlowTest(test.TestCase):
       sess.run(r)
 
   def testCondGrad_1(self):
-    with self.test_session():
+    graph = ops.Graph()
+    with graph.as_default():
       x = constant_op.constant(10.0, name="x")
       pred = math_ops.less(1, 2)
       fn1 = lambda: array_ops.identity(x)
@@ -641,8 +642,14 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       grad = gradients_impl.gradients(r, [x])[0]
-      result = grad.eval()
-    self.assertAllEqual(1.0, result)
+      with self.test_session():
+        self.assertAllEqual(1.0, grad.eval())
+    # The gradients computation creates a tensor with zeros by broadcasting a
+    # zeros constant to the required shape. Verify that the zero constant
+    # feeding into the fill is dominated by a Switch.
+    zero = graph.get_operation_by_name("gradients/zeros/Const")
+    self.assertEqual(len(zero.control_inputs), 1)
+    self.assertEqual(zero.control_inputs[0].type, "Switch")
 
   def testCondGrad_2(self):
     with self.test_session():
@@ -657,6 +664,23 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(42.0, grad.eval(feed_dict={c: 1}))
       self.assertAllEqual(3.0, grad.eval(feed_dict={c: 3}))
 
+  def testCondGrad_3(self):
+    with self.test_session():
+      c = array_ops.placeholder(dtypes.int32, shape=[])
+      ox = constant_op.constant(10.0)
+      pred = math_ops.less(c, 2)
+
+      def fn1(x):
+        m = x * x
+        return gradients_impl.gradients(m, [ox])[0]
+
+      fn2 = lambda: math_ops.multiply(ox, 3.0)
+      y = math_ops.multiply(7.0, ox)
+      r = control_flow_ops.cond(pred, lambda: fn1(y), fn2)
+
+      self.assertAllEqual(980.0, r.eval(feed_dict={c: 1}))
+      self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
+
   def testNestedCond_Simple(self):
     with self.test_session():
       x = constant_op.constant(0., name="X")
@@ -744,7 +768,7 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         self.assertEqual(x.dtype, dtypes.int32_ref)
-        return (i + 1, gen_array_ops._ref_identity(x))
+        return (i + 1, gen_array_ops.ref_identity(x))
 
       r = control_flow_ops.while_loop(c, b, [i, x], parallel_iterations=5)
 
@@ -1111,11 +1135,10 @@ class ControlFlowTest(test.TestCase):
 
       with self.assertRaisesRegexp(
           ValueError,
-          r"The shape for while_1/Merge_1:0 is not an invariant for the loop. "
-          r"It enters the loop with shape \(2, 2\), but has shape \(4, 2\) "
-          r"after one iteration. Provide shape invariants using either the "
-          r"`shape_invariants` argument of tf.while_loop or set_shape\(\) on "
-          r"the loop variables."):
+          r"Input tensor 'ones:0' enters the loop with shape \(2, 2\), but has "
+          r"shape \(4, 2\) after one iteration. To allow the shape to vary "
+          r"across iterations, use the `shape_invariants` argument of "
+          r"tf.while_loop to specify a less-specific shape."):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   def testWhileShapeInferenceSparseTensor(self):
@@ -1620,7 +1643,7 @@ class ControlFlowTest(test.TestCase):
 
   def testWhileStack_1(self):
     with self.test_session():
-      s = gen_data_flow_ops._stack_v2(-1, dtypes.int32, stack_name="foo")
+      s = gen_data_flow_ops.stack_v2(-1, dtypes.int32, stack_name="foo")
       i = constant_op.constant(0)
 
       def c(i):
@@ -1629,7 +1652,7 @@ class ControlFlowTest(test.TestCase):
       def b(i):
         ni = math_ops.add(i, 1)
         ni = control_flow_ops.with_dependencies(
-            [gen_data_flow_ops._stack_push_v2(s, i)], ni)
+            [gen_data_flow_ops.stack_push_v2(s, i)], ni)
         return ni
 
       r = control_flow_ops.while_loop(c, b, [i], parallel_iterations=1)
@@ -1641,7 +1664,7 @@ class ControlFlowTest(test.TestCase):
 
       def b1(i, x):
         ni = math_ops.subtract(i, 1)
-        nx = x + gen_data_flow_ops._stack_pop_v2(s, dtypes.int32)
+        nx = x + gen_data_flow_ops.stack_pop_v2(s, dtypes.int32)
         return [ni, nx]
 
       _, rx = control_flow_ops.while_loop(
@@ -1840,6 +1863,23 @@ class ControlFlowTest(test.TestCase):
                                       [tensor_shape.unknown_shape()])
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
+  def testCondGradInNestedWhiles(self):
+    def outer_body(i, x):
+      _, x = control_flow_ops.while_loop(
+          lambda j, x: j < 3, inner_body, [0, 0.0])
+      return i + 1, x
+
+    def inner_body(j, x):
+      y = control_flow_ops.cond(math_ops.less(x, 1), lambda: 2 * x, lambda: x)
+      return j + 1, gradients_impl.gradients(y, x)[0]
+
+    i, x = control_flow_ops.while_loop(lambda i, x: i < 3, outer_body, [0, 0.0])
+
+    with self.test_session() as sess:
+      i_val, x_val = sess.run([i, x])
+      self.assertEqual(i_val, 3)
+      self.assertAllClose(x_val, 1.0)
+
   def testWhile_NestedInput(self):
     with self.test_session() as sess:
       named = collections.namedtuple("named", ("a", "b"))
@@ -2182,18 +2222,15 @@ class ControlFlowTest(test.TestCase):
 
   def testWhileWithRefsWithGradients_1(self):
     with self.test_session() as sess:
-      x = variables.Variable(0)._ref()  # pylint: disable=protected-access
+      x = variables.Variable(0.)._ref()  # pylint: disable=protected-access
       i = constant_op.constant(0)
       c = lambda i, x: math_ops.less(i, 10)
 
-      self.assertEqual(x.dtype, dtypes.int32_ref)
+      self.assertEqual(x.dtype, dtypes.float32_ref)
 
-      # pylint: disable=protected-access
       def body(i, x):
-        self.assertEqual(x.dtype, dtypes.int32_ref)
-        return [i + 1, gen_array_ops._ref_identity(x)]
-
-      # pylint: enable=protected-access
+        self.assertEqual(x.dtype, dtypes.float32_ref)
+        return [i + 1, gen_array_ops.ref_identity(x)]
 
       r = control_flow_ops.while_loop(c, body, [i, x], parallel_iterations=5)
 
@@ -2203,7 +2240,7 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       self.assertEqual(r[0].dtype, dtypes.int32)
-      self.assertEqual(r[1].dtype, dtypes.int32_ref)
+      self.assertEqual(r[1].dtype, dtypes.float32_ref)
 
       value_i, value_x, value_x_grad = sess.run(r + grad)
 
@@ -2406,6 +2443,63 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, y)[0]
       self.assertEqual(388.0, r.eval())
 
+  def testWhileGradientWithNontrainablePath1(self):
+    q = variables.Variable([7., 8.])
+
+    def cond(_, y):
+      del y
+      return False
+
+    def body(x, _):
+      return x, math_ops.cast(x, dtypes.float32) + math_ops.reduce_sum(q)
+
+    _, y = control_flow_ops.while_loop(cond, body, (math_ops.argmin(q), 0.))
+    dy_dq, = gradients_impl.gradients(y, q)
+    self.assertIsNotNone(dy_dq)
+    with self.test_session() as sess:
+      sess.run(q.initializer)
+      self.assertAllClose([0., 0.], sess.run(dy_dq))
+
+  def testWhileGradientWithNontrainablePath2(self):
+    q = variables.Variable([7., 8.])
+
+    def cond(_, y):
+      return math_ops.equal(y, 0.)
+
+    def body(x, _):
+      zero = constant_op.constant(0, dtype=dtypes.int64)
+      return zero, math_ops.cast(x, dtypes.float32) + math_ops.reduce_sum(q)
+
+    _, y = control_flow_ops.while_loop(cond, body, (math_ops.argmin(q), 0.))
+    dy_dq, = gradients_impl.gradients(y, q)
+    self.assertIsNotNone(dy_dq)
+    with self.test_session() as sess:
+      sess.run(q.initializer)
+      self.assertAllClose([1., 1.], sess.run(dy_dq))
+
+  def testIssue16504(self):
+    c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
+    w = variables.Variable(
+        initial_value=np.ones(100), dtype=dtypes.float32) / 100
+    k = variables.Variable(0, dtype=dtypes.int32)
+    chg_w = constant_op.constant(np.inf, dtype=dtypes.float32)
+
+    def cond(k, _, chg_w):
+      return math_ops.logical_and(k < 10, chg_w > 1e-3)
+
+    def body(k, w, chg_w):
+      grad, = gradients_impl.gradients(-math_ops.reduce_sum(w * c), w)
+      w_n = w * math_ops.exp(-0.1 * grad)
+      w_n /= math_ops.reduce_sum(w_n)
+      chg_w = (
+          math_ops.reduce_sum(math_ops.abs(w_n - w)) / math_ops.reduce_sum(
+              math_ops.abs(w)))
+      return k + 1, w_n, chg_w
+
+    _, w, _ = control_flow_ops.while_loop(cond, body, [k, w, chg_w])
+    grad, = gradients_impl.gradients(w, c)
+    self.assertIsNotNone(grad)
+
   def testStopGradMultiFlows(self):
     with self.test_session():
 
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 23185eaeece0d56fd83ecdf9e02c778712420465..39e96f74b0461da0cf499e303b30a4a41aae4899 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -41,17 +41,17 @@ class ControlFlowUtilTest(test.TestCase):
     self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
 
   def testIsLoopEnter(self):
-    enter = gen_control_flow_ops._enter(1, frame_name="name").op
+    enter = gen_control_flow_ops.enter(1, frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(enter))
     self.assertFalse(control_flow_util.IsLoopConstantEnter(enter))
 
-    ref_enter = gen_control_flow_ops._ref_enter(test_ops.ref_output(),
-                                                frame_name="name").op
+    ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(),
+                                               frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(ref_enter))
     self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter))
 
-    const_enter = gen_control_flow_ops._enter(1, frame_name="name",
-                                              is_constant=True).op
+    const_enter = gen_control_flow_ops.enter(1, frame_name="name",
+                                             is_constant=True).op
     self.assertTrue(control_flow_util.IsLoopEnter(const_enter))
     self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter))
 
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index a8b3af509622a871f0afe78c510f9ce994078866..8973a450fa246e3c924f4352d72a1bbd4f7851ea 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -119,6 +119,18 @@ class Conv3DTransposeTest(test.TestCase):
                   target = 3.0
                 self.assertAllClose(target, value[n, d, h, w, k])
 
+  def testConv3DTransposeShapeMismatch(self):
+    # Test case for GitHub issue 18460
+    x_shape = [2, 2, 3, 4, 3]
+    f_shape = [3, 3, 3, 2, 2]
+    y_shape = [2, 2, 6, 8, 6]
+    strides = [1, 1, 2, 2, 2]
+    np.random.seed(1)
+    x_value = np.random.random_sample(x_shape).astype(np.float64)
+    f_value = np.random.random_sample(f_shape).astype(np.float64)
+    nn_ops.conv3d_transpose(
+        x_value, f_value, y_shape, strides, data_format='NCDHW')
+
   def testConv3DTransposeValid(self):
     with self.test_session():
       strides = [1, 2, 2, 2, 1]
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index ec8ac74163d093c57e6e4ffbab6977ce732cc3ef..0b531125f36c6d01268081aefe3815d38720a23f 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -25,8 +25,10 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -60,18 +62,18 @@ class Conv3DTest(test.TestCase):
 
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, stride,
                             padding, data_format, dtype, use_gpu):
-    total_size_1 = 1
-    total_size_2 = 1
+    total_size_tensor = 1
+    total_size_filter = 1
     for s in tensor_in_sizes:
-      total_size_1 *= s
+      total_size_tensor *= s
     for s in filter_in_sizes:
-      total_size_2 *= s
+      total_size_filter *= s
 
     # Initializes the input tensor with array containing numbers from 0 to 1.
     # We keep the input tensor values fairly small to avoid overflowing float16
     # during the conv3d.
-    x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)]
+    x1 = [f * 1.0 / total_size_tensor for f in range(1, total_size_tensor + 1)]
+    x2 = [f * 1.0 / total_size_filter for f in range(1, total_size_filter + 1)]
     with self.test_session(use_gpu=use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
@@ -117,6 +119,79 @@ class Conv3DTest(test.TestCase):
 
           self.assertAllClose(expected, value.flatten(), atol=tol, rtol=tol)
 
+  def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
+                                   stride, dilation, padding, data_format,
+                                   use_gpu):
+    total_size_tensor = 1
+    total_size_filter = 1
+    for s in tensor_in_sizes:
+      total_size_tensor *= s
+    for s in filter_in_sizes:
+      total_size_filter *= s
+
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_tensor + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_filter + 1)]
+    with self.test_session(use_gpu=use_gpu):
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes)
+      if isinstance(stride, collections.Iterable):
+        strides = list(stride)
+      else:
+        strides = [stride, stride, stride]
+      if data_format == "NCDHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        full_strides = [1, 1] + strides
+        full_dilation = [1, 1] + dilation
+      else:
+        full_strides = [1] + strides + [1]
+        full_dilation = [1] + dilation + [1]
+      expected = nn_ops.convolution(
+          t1,
+          t2,
+          padding=padding,
+          strides=strides,
+          dilation_rate=dilation,
+          data_format=data_format)
+      computed = nn_ops.conv3d(
+          t1,
+          t2,
+          strides=full_strides,
+          dilations=full_dilation,
+          padding=padding,
+          data_format=data_format)
+      if data_format == "NCDHW":
+        expected = test_util.NCHWToNHWC(expected)
+        computed = test_util.NCHWToNHWC(computed)
+    return expected, computed
+
+  def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, stride,
+                               padding, dilations):
+    expected_results = []
+    computed_results = []
+    default_dilations = (
+        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
+    for data_format, use_gpu in GetTestConfigs():
+      # If any dilation rate is larger than 1, only do test on the GPU
+      # because we currently do not have a CPU implementation for arbitrary
+      # dilation rates.
+      if default_dilations or use_gpu:
+        expected, computed = self._ComputeReferenceDilatedConv(
+            tensor_in_sizes, filter_in_sizes, stride, dilations, padding,
+            data_format, use_gpu)
+        expected_results.append(expected)
+        computed_results.append(computed)
+        tolerance = 1e-2 if use_gpu else 1e-5
+        with self.test_session() as sess:
+          expected_values = sess.run(expected_results)
+          computed_values = sess.run(computed_results)
+          for e_value, c_value in zip(expected_values, computed_values):
+            print("expected = ", e_value)
+            print("actual = ", c_value)
+            self.assertAllClose(
+                e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-6)
+
   def testConv3D1x1x1Filter(self):
     expected_output = [
         0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5, 0.59259259,
@@ -144,6 +219,15 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  def testConv3D1x1x1Filter2x1x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 3, 6, 1, 1],
+          filter_in_sizes=[1, 1, 1, 1, 1],
+          stride=1,
+          padding="VALID",
+          dilations=[2, 1, 1])
+
   # Expected values computed using scipy's correlate function.
   def testConv3D2x2x2Filter(self):
     expected_output = [
@@ -160,6 +244,15 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  def testConv3D2x2x2Filter1x2x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 4, 6, 3, 1],
+          filter_in_sizes=[2, 2, 2, 1, 1],
+          stride=1,
+          padding="VALID",
+          dilations=[1, 2, 1])
+
   def testConv3DStrides(self):
     expected_output = [
         0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095, 0.13988095,
@@ -344,6 +437,8 @@ class Conv3DTest(test.TestCase):
         if data_format == "NCDHW":
           conv = test_util.NCHWToNHWC(conv)
 
+        self.assertEqual(conv.shape, tensor_shape.TensorShape(output_shape))
+
         if test_input:
           jacob_t, jacob_n = gradient_checker.compute_gradient(
               orig_input_tensor, input_shape, conv, output_shape)
@@ -543,6 +638,98 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  # Testing for backprops
+  def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
+                            strides, dilations, padding, data_format, use_gpu,
+                            err, mode):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (
+        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
+
+    # If any dilation rate is larger than 1, only do test on the GPU
+    # because we currently do not have a CPU implementation for arbitrary
+    # dilation rates.
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCDHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCDHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        actual = nn_ops.conv3d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        expected = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCDHW":
+          actual = test_util.NCHWToNHWC(actual)
+          expected = test_util.NCHWToNHWC(expected)
+        actual_grad = gradients_impl.gradients(actual, t1
+                                               if mode == "input" else t2)[0]
+        expected_grad = gradients_impl.gradients(expected, t1
+                                                 if mode == "input" else t2)[0]
+        # "values" consists of two tensors for two backprops
+        actual_value = sess.run(actual_grad)
+        expected_value = sess.run(expected_grad)
+        self.assertShapeEqual(actual_value, actual_grad)
+        self.assertShapeEqual(expected_value, expected_grad)
+      print("expected = ", expected_value)
+      print("actual = ", actual_value)
+      self.assertArrayNear(expected_value.flatten(), actual_value.flatten(),
+                           err)
+
+  def testConv3D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackprop(
+            input_sizes=[1, 3, 6, 1, 1],
+            filter_sizes=[2, 2, 1, 1, 1],
+            output_sizes=[1, 1, 5, 1, 1],
+            strides=[1, 1, 1],
+            dilations=[2, 1, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5,
+            mode="filter")
+
+  def testConv3D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackprop(
+            input_sizes=[1, 3, 6, 1, 1],
+            filter_sizes=[2, 2, 1, 1, 1],
+            output_sizes=[1, 1, 5, 1, 1],
+            strides=[1, 1, 1],
+            dilations=[2, 1, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5,
+            mode="input")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 27857989167ecd11c33286d7bb6cb068edd12831..a291bef0ad6f16184ff29f665457a53b77447d54 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -159,11 +159,11 @@ class Conv2DTest(test.TestCase):
 
   def _DtypesToTest(self, use_gpu):
     if use_gpu and not test_util.CudaSupportsHalfMatMulAndConv():
-      return [dtypes.float32]
+      return [dtypes.float32, dtypes.float64]
     else:
       # It is important that float32 comes before float16 here,
       # as we will be using its gradients as reference for fp16 gradients.
-      return [dtypes.float32, dtypes.float16]
+      return [dtypes.float32, dtypes.float16, dtypes.float64]
 
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
                             strides, padding, data_format, dtype, use_gpu):
@@ -970,7 +970,7 @@ class Conv2DTest(test.TestCase):
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropFilterDilation(
             input_sizes=[1, 3, 6, 1],
@@ -984,7 +984,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropFilterDilation(
             input_sizes=[1, 2, 3, 1],
@@ -998,7 +998,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2DEmptyBackpropFilterDilation1x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropFilterDilation(
             input_sizes=[1, 2, 3, 1],
@@ -1012,7 +1012,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropFilterDilation(
             input_sizes=[1, 3, 4, 3],
@@ -1026,7 +1026,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropFilterDilation(
             input_sizes=[1, 3, 3, 1],
@@ -1040,7 +1040,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropInputDilation(
             input_sizes=[1, 3, 6, 1],
@@ -1054,7 +1054,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropInputDilation(
             input_sizes=[1, 2, 3, 1],
@@ -1068,7 +1068,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2DEmptyBackpropInputDilation1x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropInputDilation(
             input_sizes=[0, 2, 3, 1],
@@ -1082,7 +1082,7 @@ class Conv2DTest(test.TestCase):
             err=1e-5)
 
   def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         # The GPU version of this test is not very stable. So adjusting the
         # error threshold to 1e-4.
@@ -1098,7 +1098,7 @@ class Conv2DTest(test.TestCase):
             err=1e-4)
 
   def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self):
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
         self._RunAndVerifyBackpropInputDilation(
             input_sizes=[1, 3, 3, 1],
@@ -1513,21 +1513,6 @@ class Conv2DTest(test.TestCase):
                 strides=[1, 1, 1, 1],
                 padding="VALID"))
 
-  def testCPUConv2DNCHWUnimplemented(self):
-    with self.test_session(use_gpu=False):
-      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
-                                   "NHWC tensor format for now"):
-        conv = self._SetupValuesForDevice(
-            tensor_in_sizes=[1, 4, 4, 1],
-            filter_in_sizes=[2, 2, 1, 1],
-            dilations=[1, 1],
-            strides=[1, 1],
-            padding="VALID",
-            data_format="NCHW",
-            dtype=dtypes.float32,
-            use_gpu=False)
-        self.evaluate(conv)
-
 
 class DepthwiseConv2DTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 0d9b46c30dbbed20dd940e0427fbf6f6d5415106..87da89831c8ded9b8382c7bb251948b6d202300e 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -398,14 +398,17 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.abs, _ABS)
     self._compareCpu(x, np.negative, math_ops.negative)
     self._compareCpu(x, np.negative, _NEG)
-    self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(x, np.sign, math_ops.sign)
 
     self._compareBothSparse(x, np.abs, math_ops.abs)
     self._compareBothSparse(x, np.negative, math_ops.negative)
-    self._compareBothSparse(x, np.square, math_ops.square)
     self._compareBothSparse(x, np.sign, math_ops.sign)
 
+  def testInt64Square(self):
+    x = np.arange(-6 << 20, 6 << 20, 2 << 20).reshape(1, 3, 2).astype(np.int64)
+    self._compareCpu(x, np.square, math_ops.square)
+    self._compareBothSparse(x, np.square, math_ops.square)
+
   def testComplex64Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
         np.complex64)
@@ -495,11 +498,11 @@ class UnaryOpTest(test.TestCase):
     dtype_tols = [(np.float32, 5e-4), (np.float64, 1e-6), (np.complex64, 5e-4),
                   (np.complex128, 1e-6)]
     op_range = [
-        (gen_math_ops._reciprocal_grad, [-2, 2]),
-        (gen_math_ops._rsqrt_grad, [0.1, 3]),
-        (gen_math_ops._sigmoid_grad, [-2, 2]),
-        (gen_math_ops._sqrt_grad, [0.1, 3]),
-        (gen_math_ops._tanh_grad, [-2, 2]),
+        (gen_math_ops.reciprocal_grad, [-2, 2]),
+        (gen_math_ops.rsqrt_grad, [0.1, 3]),
+        (gen_math_ops.sigmoid_grad, [-2, 2]),
+        (gen_math_ops.sqrt_grad, [0.1, 3]),
+        (gen_math_ops.tanh_grad, [-2, 2]),
     ]
 
     def rand(dtype):
@@ -2165,5 +2168,47 @@ class AccumulateTest(test.TestCase):
         math_ops.accumulate_n([a], tensor_dtype=np.int32)
 
 
+class PolyvalTest(test.TestCase):
+
+  def _runtest(self, dtype, degree):
+    x = np.random.rand(2, 2).astype(dtype)
+    coeffs = [np.random.rand(2, 2).astype(dtype) for _ in range(degree + 1)]
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+  def testSimple(self):
+    for dtype in [
+        np.int32, np.float32, np.float64, np.complex64, np.complex128
+    ]:
+      for degree in range(5):
+        self._runtest(dtype, degree)
+
+  def testBroadcast(self):
+    dtype = np.float32
+    degree = 3
+    shapes = [(1,), (2, 1), (1, 2), (2, 2)]
+    for x_shape in shapes:
+      for coeff_shape in shapes:
+        x = np.random.rand(*x_shape).astype(dtype)
+        coeffs = [
+            np.random.rand(*coeff_shape).astype(dtype)
+            for _ in range(degree + 1)
+        ]
+        np_val = np.polyval(coeffs, x)
+        with self.test_session():
+          tf_val = math_ops.polyval(coeffs, x)
+          self.assertAllClose(np_val, tf_val.eval())
+
+  def testEmpty(self):
+    x = np.random.rand(2, 2).astype(np.float32)
+    coeffs = []
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_csv_op_test.py b/tensorflow/python/kernel_tests/decode_csv_op_test.py
index fec52fa9cc7bcab1da67e797c2e121edac8c9345..4f49d726765e6019715a9b40f531b82df7f33126 100644
--- a/tensorflow/python/kernel_tests/decode_csv_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_csv_op_test.py
@@ -78,9 +78,11 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(args, expected_out)
 
   def test2DNoQuoteDelimiter(self):
-    args = {"records": [["1", "2"], ['""', '"']],
-            "record_defaults": [[""]],
-            "use_quote_delim": False}
+    args = {
+        "records": [["1", "2"], ['""', '"']],
+        "record_defaults": [[""]],
+        "use_quote_delim": False
+    }
     expected_out = [[[b"1", b"2"], [b'""', b'"']]]
 
     self._test(args, expected_out)
@@ -88,8 +90,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testDouble(self):
     args = {
         "records": ["1.0", "-1.79e+308", '"1.79e+308"'],
-        "record_defaults": [np.array(
-            [], dtype=np.double)],
+        "record_defaults": [np.array([], dtype=np.double)],
     }
 
     expected_out = [[1.0, -1.79e+308, 1.79e+308]]
@@ -99,8 +100,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testInt64(self):
     args = {
         "records": ["1", "2", '"2147483648"'],
-        "record_defaults": [np.array(
-            [], dtype=np.int64)],
+        "record_defaults": [np.array([], dtype=np.int64)],
     }
 
     expected_out = [[1, 2, 2147483648]]
@@ -173,8 +173,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWithoutDefaultsError(self):
     args = {
         "records": [",1", "0.2,3", "3.0,"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -183,8 +182,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldIntError(self):
     args = {
         "records": [",1", "0.2,234a", "3.0,2"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -202,8 +200,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldFloatError(self):
     args = {
         "records": [",1", "0.2,2", "3.0adf,3"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -229,6 +226,73 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(
         args, expected_err_re="Quoted field has to end with quote followed.*")
 
+  def testSelectCols(self):
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[1], [2]],
+        "select_cols": [0, 1]
+    }
+    expected_out = [[1, 4], [2, 5]]
+    self._test(args, expected_out)
+
+  def testSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 2]
+    }
+    expected_out = [[0, 4], [1, 5], [2, 6]]
+    self._test(args, expected_out)
+
+  def testWrongSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 3]
+    }
+    self._test(args, expected_err_re="Expect 3 fields but have 2 in record 0")
+
+  def testWrongSelectColsLen(self):
+    args = {
+        "records": ["1,2,3", "4,5,6"],
+        "record_defaults": [[0], [0], [0]],
+        "select_cols": [0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "Length of select_cols and record_defaults do not match."):
+      self._test(args)
+
+  def testWrongSelectColsSorting(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [1, 0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols is not strictly increasing."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesNegative(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [-1, 0]  # -1 is not a valid index
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols contains negative values."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesTooHigh(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [0, 3]  # 3 is not a valid index
+    }
+    # Only successfully parses one of the columns
+    self._test(args, expected_err_re="Expect 2 fields but have 1 in record 0")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 7df2366954f3a6f3f37aef447479ba67c263025f..f0beabb4e20e4ec0a2fc7a487bf2541d19568927 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -35,8 +35,8 @@ from tensorflow.python.platform import tf_logging
 
 class DepthToSpaceTest(test.TestCase):
 
-  def _testOne(self, inputs, block_size, outputs):
-    input_nhwc = math_ops.to_float(inputs)
+  def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
+    input_nhwc = math_ops.cast(inputs, dtype)
     with self.test_session(use_gpu=False):
       # test NHWC (default) on CPU
       x_tf = array_ops.depth_to_space(input_nhwc, block_size)
@@ -59,6 +59,12 @@ class DepthToSpaceTest(test.TestCase):
     x_out = [[[[1], [2]], [[3], [4]]]]
     self._testOne(x_np, block_size, x_out)
 
+  def testBasicFloat16(self):
+    x_np = [[[[1, 2, 3, 4]]]]
+    block_size = 2
+    x_out = [[[[1], [2]], [[3], [4]]]]
+    self._testOne(x_np, block_size, x_out, dtype=dtypes.float16)
+
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
   def testBlockSize2(self):
@@ -90,6 +96,24 @@ class DepthToSpaceTest(test.TestCase):
     x_out = [batch_output_elt(i) for i in range(batch_size)]
     self._testOne(x_np, block_size, x_out)
 
+  def testBatchSize0(self):
+    block_size = 2
+    batch_size = 0
+    input_nhwc = array_ops.ones([batch_size, 2, 3, 12])
+    x_out = array_ops.ones([batch_size, 4, 6, 3])
+
+    with self.test_session(use_gpu=False):
+      # test NHWC (default) on CPU
+      x_tf = array_ops.depth_to_space(input_nhwc, block_size)
+      self.assertAllEqual(x_tf.shape, x_out.shape)
+      x_tf.eval()
+    if test.is_gpu_available():
+      with self.test_session(use_gpu=True):
+        # test NHWC (default) on GPU
+        x_tf = array_ops.depth_to_space(input_nhwc, block_size)
+        self.assertAllEqual(x_tf.shape, x_out.shape)
+        x_tf.eval()
+
   # Tests for different width and height.
   def testNonSquare(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]],
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index 222038b22ef3c766efd14fd9b1c9044a0b6e9125..a52b2c0dc32c26ecd5ef08aa3f8678f0006cd4fe 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -65,7 +65,7 @@ class DeterminantOpTest(test.TestCase):
       self._compareDeterminantBase(matrix_x,
                                    linalg_ops.matrix_determinant(matrix_x))
       self._compareLogDeterminantBase(
-          matrix_x, gen_linalg_ops._log_matrix_determinant(matrix_x))
+          matrix_x, gen_linalg_ops.log_matrix_determinant(matrix_x))
 
   def testBasic(self):
     # 2x2 matrices
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index e220d0569281c6dbe4107fdfb8013e99592f153c..f3cc9636f91f7d3573d8a66d6b1b4936e49a9141 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -280,15 +280,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/distributions/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
index 9f9fb5c0bb4c0e9d68ddf6034a8649ad5a6bd8e9..18582241e2fb69dffc0b66aa361aa77fbb97944f 100644
--- a/tensorflow/python/kernel_tests/distributions/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import abc
 
+import numpy as np
 import six
 
 from tensorflow.python.framework import constant_op
@@ -43,11 +44,10 @@ class BaseBijectorTest(test.TestCase):
       """Minimal specification of a `Bijector`."""
 
       def __init__(self):
-        super(_BareBonesBijector, self).__init__()
+        super(_BareBonesBijector, self).__init__(forward_min_event_ndims=0)
 
     with self.test_session() as sess:
       bij = _BareBonesBijector()
-      self.assertEqual(None, bij.event_ndims)
       self.assertEqual([], bij.graph_parents)
       self.assertEqual(False, bij.is_constant_jacobian)
       self.assertEqual(False, bij.validate_args)
@@ -67,13 +67,21 @@ class BaseBijectorTest(test.TestCase):
         self.assertAllEqual(shape, inverse_event_shape_)
         self.assertAllEqual(shape, bij.inverse_event_shape(shape))
 
-      for fn in ["forward",
-                 "inverse",
-                 "inverse_log_det_jacobian",
-                 "forward_log_det_jacobian"]:
-        with self.assertRaisesRegexp(
-            NotImplementedError, fn + " not implemented"):
-          getattr(bij, fn)(0)
+      with self.assertRaisesRegexp(
+          NotImplementedError, "inverse not implemented"):
+        bij.inverse(0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "forward not implemented"):
+        bij.forward(0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "inverse_log_det_jacobian not implemented"):
+        bij.inverse_log_det_jacobian(0, event_ndims=0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "forward_log_det_jacobian not implemented"):
+        bij.forward_log_det_jacobian(0, event_ndims=0)
 
 
 class IntentionallyMissingError(Exception):
@@ -85,7 +93,7 @@ class BrokenBijector(bijector.Bijector):
 
   def __init__(self, forward_missing=False, inverse_missing=False):
     super(BrokenBijector, self).__init__(
-        event_ndims=0, validate_args=False, name="broken")
+        validate_args=False, forward_min_event_ndims=0, name="broken")
     self._forward_missing = forward_missing
     self._inverse_missing = inverse_missing
 
@@ -120,35 +128,42 @@ class BijectorCachingTestBase(object):
 
   def testCachingOfForwardResults(self):
     broken_bijector = self.broken_bijector_cls(inverse_missing=True)
-    with self.test_session():
-      x = constant_op.constant(1.1)
+    x = constant_op.constant(1.1)
+
+    # Call forward and forward_log_det_jacobian one-by-one (not together).
+    y = broken_bijector.forward(x)
+    _ = broken_bijector.forward_log_det_jacobian(x, event_ndims=0)
 
-      # Call forward and forward_log_det_jacobian one-by-one (not together).
-      y = broken_bijector.forward(x)
-      _ = broken_bijector.forward_log_det_jacobian(x)
+    # Now, everything should be cached if the argument is y.
+    broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
+    try:
+      broken_bijector.inverse(y)
+      broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
+    except IntentionallyMissingError:
+      raise AssertionError("Tests failed! Cached values not used.")
 
-      # Now, everything should be cached if the argument is y.
-      try:
-        broken_bijector.inverse(y)
-        broken_bijector.inverse_log_det_jacobian(y)
-      except IntentionallyMissingError:
-        raise AssertionError("Tests failed! Cached values not used.")
+    # Different event_ndims should not be cached.
+    with self.assertRaises(IntentionallyMissingError):
+      broken_bijector.inverse_log_det_jacobian(y, event_ndims=1)
 
   def testCachingOfInverseResults(self):
     broken_bijector = self.broken_bijector_cls(forward_missing=True)
-    with self.test_session():
-      y = constant_op.constant(1.1)
+    y = constant_op.constant(1.1)
 
-      # Call inverse and inverse_log_det_jacobian one-by-one (not together).
-      x = broken_bijector.inverse(y)
-      _ = broken_bijector.inverse_log_det_jacobian(y)
+    # Call inverse and inverse_log_det_jacobian one-by-one (not together).
+    x = broken_bijector.inverse(y)
+    _ = broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
 
-      # Now, everything should be cached if the argument is x.
-      try:
-        broken_bijector.forward(x)
-        broken_bijector.forward_log_det_jacobian(x)
-      except IntentionallyMissingError:
-        raise AssertionError("Tests failed! Cached values not used.")
+    # Now, everything should be cached if the argument is x.
+    try:
+      broken_bijector.forward(x)
+      broken_bijector.forward_log_det_jacobian(x, event_ndims=0)
+    except IntentionallyMissingError:
+      raise AssertionError("Tests failed! Cached values not used.")
+
+    # Different event_ndims should not be cached.
+    with self.assertRaises(IntentionallyMissingError):
+      broken_bijector.forward_log_det_jacobian(x, event_ndims=1)
 
 
 class BijectorCachingTest(BijectorCachingTestBase, test.TestCase):
@@ -159,5 +174,107 @@ class BijectorCachingTest(BijectorCachingTestBase, test.TestCase):
     return BrokenBijector
 
 
+class ExpOnlyJacobian(bijector.Bijector):
+  """Only used for jacobian calculations."""
+
+  def __init__(self, forward_min_event_ndims=0):
+    super(ExpOnlyJacobian, self).__init__(
+        validate_args=False,
+        is_constant_jacobian=False,
+        forward_min_event_ndims=forward_min_event_ndims,
+        name="exp")
+
+  def _inverse_log_det_jacobian(self, y):
+    return -math_ops.log(y)
+
+  def _forward_log_det_jacobian(self, x):
+    return math_ops.log(x)
+
+
+class ConstantJacobian(bijector.Bijector):
+  """Only used for jacobian calculations."""
+
+  def __init__(self, forward_min_event_ndims=0):
+    super(ConstantJacobian, self).__init__(
+        validate_args=False,
+        is_constant_jacobian=True,
+        forward_min_event_ndims=forward_min_event_ndims,
+        name="c")
+
+  def _inverse_log_det_jacobian(self, y):
+    return constant_op.constant(2., y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    return constant_op.constant(-2., x.dtype)
+
+
+class BijectorReduceEventDimsTest(test.TestCase):
+  """Test caching with BrokenBijector."""
+
+  def testReduceEventNdimsForward(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian()
+    self.assertAllClose(
+        np.log(x),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        np.sum(np.log(x), axis=-1),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        np.sum(np.log(x), axis=(-1, -2)),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsForwardRaiseError(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian(forward_min_event_ndims=1)
+    with self.assertRaisesRegexp(ValueError, "must be larger than"):
+      bij.forward_log_det_jacobian(x, event_ndims=0)
+
+  def testReduceEventNdimsInverse(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian()
+    self.assertAllClose(
+        -np.log(x),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        np.sum(-np.log(x), axis=-1),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        np.sum(-np.log(x), axis=(-1, -2)),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsInverseRaiseError(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian(forward_min_event_ndims=1)
+    with self.assertRaisesRegexp(ValueError, "must be larger than"):
+      bij.inverse_log_det_jacobian(x, event_ndims=0)
+
+  def testReduceEventNdimsForwardConstJacobian(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ConstantJacobian()
+    self.assertAllClose(
+        -2.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        -4.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        -8.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsInverseConstJacobian(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ConstantJacobian()
+    self.assertAllClose(
+        2.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        4.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        8.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index e8f9d0b728d8f831becc82cdba0ae2bf3d5da52a..b347c20db25df6dc0f278d9b34b4588277104850 100644
--- a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -27,14 +27,19 @@ class IdentityBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = X transformation."""
 
   def testBijector(self):
-    with self.test_session():
-      bijector = identity_bijector.Identity()
-      self.assertEqual("identity", bijector.name)
-      x = [[[0.], [1.]]]
-      self.assertAllEqual(x, bijector.forward(x).eval())
-      self.assertAllEqual(x, bijector.inverse(x).eval())
-      self.assertAllEqual(0., bijector.inverse_log_det_jacobian(x).eval())
-      self.assertAllEqual(0., bijector.forward_log_det_jacobian(x).eval())
+    bijector = identity_bijector.Identity(validate_args=True)
+    self.assertEqual("identity", bijector.name)
+    x = [[[0.], [1.]]]
+    self.assertAllEqual(x, self.evaluate(bijector.forward(x)))
+    self.assertAllEqual(x, self.evaluate(bijector.inverse(x)))
+    self.assertAllEqual(
+        0.,
+        self.evaluate(
+            bijector.inverse_log_det_jacobian(x, event_ndims=3)))
+    self.assertAllEqual(
+        0.,
+        self.evaluate(
+            bijector.forward_log_det_jacobian(x, event_ndims=3)))
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/distributions/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
index df99a0ed257da20179909eb44eacf7d44528dad2..a8def95b147b6dd4825675769187733b8493b374 100644
--- a/tensorflow/python/kernel_tests/distributions/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -281,6 +281,22 @@ class UniformTest(test.TestCase):
       expected_pdf = [1.0, 0.1]
       self.assertAllClose(expected_pdf, pdf.eval())
 
+  def testUniformFloat64(self):
+    uniform = uniform_lib.Uniform(
+        low=np.float64(0.), high=np.float64(1.))
+
+    self.assertAllClose(
+        [1., 1.],
+        self.evaluate(uniform.prob(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(
+        [0.5, 0.6],
+        self.evaluate(uniform.cdf(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(0.5, self.evaluate(uniform.mean()))
+    self.assertAllClose(1 / 12., self.evaluate(uniform.variance()))
+    self.assertAllClose(0., self.evaluate(uniform.entropy()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/division_past_test.py b/tensorflow/python/kernel_tests/division_past_test.py
index 2ff2f894077ebd2ec418deb984170beac31e0d08..9ddd62e63cc49de26875317fe4857b8165eb4bf4 100644
--- a/tensorflow/python/kernel_tests/division_past_test.py
+++ b/tensorflow/python/kernel_tests/division_past_test.py
@@ -35,8 +35,7 @@ class DivisionTestCase(test.TestCase):
     """Test all the different ways to divide."""
     values = [1, 2, 7, 11]
     functions = (lambda x: x), constant_op.constant
-    # TODO(irving): Test int8, int16 once we support casts for those.
-    dtypes = np.int32, np.int64, np.float32, np.float64
+    dtypes = np.int8, np.int16, np.int32, np.int64, np.float32, np.float64
 
     tensors = []
     checks = []
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index fedbf9e696923a34968e7a907e4099c520d1447b..5e8937ad2c36afb2b1ddb58ffb238a45e09e4b30 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -326,6 +326,18 @@ class DynamicPartitionTest(test.TestCase):
     with self.assertRaises(ValueError):
       data_flow_ops.dynamic_partition(data, indices, num_partitions=4)
 
+  #  see https://github.com/tensorflow/tensorflow/issues/17106
+  def testCUBBug(self):
+    x = constant_op.constant(np.random.randn(3072))
+    inds = [0]*189 + [1]*184 + [2]*184 + [3]*191 + [4]*192 + [5]*195 + [6]*195
+    inds += [7]*195 + [8]*188 + [9]*195 + [10]*188 + [11]*202 + [12]*194
+    inds += [13]*194 + [14]*194 + [15]*192
+    self.assertEqual(len(inds), x.shape[0])
+    partitioned = data_flow_ops.dynamic_partition(x, inds, 16)
+    with self.test_session() as sess:
+      res = sess.run(partitioned)
+    self.assertEqual(res[-1].shape[0], 192)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index a4b30e4319527c6f3354ac83bf0e3a5114eb45e8..159cba5fa3d69be5e3e3b22a85138c29d03981cc 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -113,22 +113,23 @@ class DynamicStitchTestBase(object):
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
-          constant_op.constant([61, 62]),
-          constant_op.constant([[41, 42], [11, 12]]),
-          constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]])
+          constant_op.constant([61., 62.]),
+          constant_op.constant([[41., 42.], [11., 12.]]),
+          constant_op.constant([[[51., 52.], [21., 22.]],
+                                [[1., 2.], [31., 32.]]])
       ]
       stitched_t = self.stitch_op(indices, data)
       stitched_val = stitched_t.eval()
-      correct = 10 * np.arange(7)[:, None] + [1, 2]
+      correct = 10. * np.arange(7)[:, None] + [1., 2.]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
       # Test gradients
-      stitched_grad = 7 * stitched_val
+      stitched_grad = 7. * stitched_val
       grads = gradients_impl.gradients(stitched_t, indices + data,
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7 * datum.eval(), grad)
+        self.assertAllEqual(7. * datum.eval(), grad)
 
   def testErrorIndicesMultiDimensional(self):
     indices = [
diff --git a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
index feec9934e459590bb1dd0bc5c7cf40013d3d8b88..faac7d8365dfaa1b6b32f8fe66a76c3694aa0d5b 100644
--- a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
@@ -347,7 +347,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
 
   Two types of tests for FractionalAvgPoolGrad.
   1) Test fractional_avg_pool_grad() directly.
-    This type of test relies on gen_nn_ops._avg_pool_grad() returns the
+    This type of test relies on gen_nn_ops.avg_pool_grad() returns the
   correct result. For example:
     * input_tensor_shape = (1, 10, 10, 1)
     * window_size = (1, 2, 2, 1)
@@ -404,13 +404,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
                 num_elements *= dim_size
               output_backprop = (self._PRNG.rand(num_elements) *
                                  1000).reshape(output_data.shape)
-              input_backprop_tensor = gen_nn_ops._avg_pool_grad(
+              input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
               input_backprop = input_backprop_tensor.eval()
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
-              fap_input_backprop_tensor = gen_nn_ops._fractional_avg_pool_grad(
+              fap_input_backprop_tensor = gen_nn_ops.fractional_avg_pool_grad(
                   input_tensor.get_shape(),
                   output_backprop,
                   row_seq,
@@ -443,7 +443,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                 num_elements *= dim_size
               output_backprop = (self._PRNG.rand(num_elements) *
                                  1000).reshape(output_data.shape)
-              input_backprop_tensor = gen_nn_ops._avg_pool_grad(
+              input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
               input_backprop = input_backprop_tensor.eval()
@@ -451,7 +451,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
               col_seq[-1] += 1
-              fap_input_backprop_tensor = gen_nn_ops._fractional_avg_pool_grad(
+              fap_input_backprop_tensor = gen_nn_ops.fractional_avg_pool_grad(
                   input_tensor.get_shape(),
                   output_backprop,
                   row_seq,
diff --git a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
index 5983ae7759dbf3eb2db9867def829ce8dbeb4b73..6477c9ebc4c35fcc5963b27a0f5c50624a73fa09 100644
--- a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
@@ -318,7 +318,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
   Two types of tests for FractionalMaxPoolGrad.
   1) Test fractional_max_pool_grad() directly.
-    This type of test relies on gen_nn_ops._max_pool_grad() returns the correct
+    This type of test relies on gen_nn_ops.max_pool_grad() returns the correct
   result. For example:
     * input_tensor_shape = (1, 10, 10, 1)
     * window_size = (1, 2, 2, 1)
@@ -384,16 +384,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
                                               stride_size, padding)
               output_data = output_tensor.eval()
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
-              input_backprop_tensor = gen_nn_ops._max_pool_grad(input_tensor,
-                                                                output_tensor,
-                                                                output_backprop,
-                                                                window_size,
-                                                                stride_size,
-                                                                padding)
+              input_backprop_tensor = gen_nn_ops.max_pool_grad(
+                  input_tensor, output_tensor, output_backprop, window_size,
+                  stride_size, padding)
               input_backprop = input_backprop_tensor.eval()
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
-              fmp_input_backprop_tensor = gen_nn_ops._fractional_max_pool_grad(
+              fmp_input_backprop_tensor = gen_nn_ops.fractional_max_pool_grad(
                   input_tensor,
                   output_tensor,
                   output_backprop,
@@ -422,18 +419,15 @@ class FractionalMaxPoolGradTest(test.TestCase):
                                               stride_size, padding)
               output_data = output_tensor.eval()
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
-              input_backprop_tensor = gen_nn_ops._max_pool_grad(input_tensor,
-                                                                output_tensor,
-                                                                output_backprop,
-                                                                window_size,
-                                                                stride_size,
-                                                                padding)
+              input_backprop_tensor = gen_nn_ops.max_pool_grad(
+                  input_tensor, output_tensor, output_backprop, window_size,
+                  stride_size, padding)
               input_backprop = input_backprop_tensor.eval()
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
               col_seq[-1] += 1
-              fmp_input_backprop_tensor = gen_nn_ops._fractional_max_pool_grad(
+              fmp_input_backprop_tensor = gen_nn_ops.fractional_max_pool_grad(
                   input_tensor,
                   output_tensor,
                   output_backprop,
@@ -591,7 +585,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
       output_tensor = constant_op.constant(
           output_data_not_overlapping, shape=output_size)
       grad = constant_op.constant(output_backprop, shape=output_size)
-      r = gen_nn_ops._fractional_max_pool_grad(
+      r = gen_nn_ops.fractional_max_pool_grad(
           input_tensor,
           output_tensor,
           grad,
@@ -606,7 +600,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
       # Test when overlapping is True
       output_tensor = constant_op.constant(
           output_data_overlapping, shape=output_size)
-      r = gen_nn_ops._fractional_max_pool_grad(
+      r = gen_nn_ops.fractional_max_pool_grad(
           input_tensor, output_tensor, grad, row_seq, col_seq, overlapping=True)
       input_backprop_overlapping = r.eval()
       self.assertShapeEqual(
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index f5717a5a21a0be82382c5da556ed6f5540591abf..35a274e75f51b795cb4e2041e4dfa8da012f6a58 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -22,8 +22,10 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -37,8 +39,10 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
+# pylint: disable=invalid-name
 def simple_scoped_fn(a, x):
   """Simple function: (a, x) -> 2(x+a), but with "2" as a variable in scope."""
   with variable_scope.variable_scope("body"):
@@ -68,6 +72,26 @@ class FunctionalOpsTest(test.TestCase):
           initializer=10)
       self.assertAllEqual(880, self.evaluate(r))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldl_SingleInputMultiOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array([1, -1.0])
+      r = functional_ops.foldl(lambda a, x: a + x, elems, initializer)
+      r_value = self.evaluate(r)
+
+      self.assertAllEqual(22, r_value[0])
+      self.assertAllEqual(20, r_value[1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldl_MultiInputSingleOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array(1.0)
+      r = functional_ops.foldl(lambda a, x: a + x[0] + x[1], (elems, -elems),
+                               initializer)
+      self.assertAllEqual(1, self.evaluate(r))
+
   def testFoldl_Scoped(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -103,6 +127,26 @@ class FunctionalOpsTest(test.TestCase):
           initializer=10)
       self.assertAllEqual(1282, self.evaluate(r))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldr_SingleInputMultiOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array([1, -1.0])
+      r = functional_ops.foldr(lambda a, x: a + x, elems, initializer)
+      r_value = self.evaluate(r)
+
+      self.assertAllEqual(22, r_value[0])
+      self.assertAllEqual(20, r_value[1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldr_MultiInputSingleOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array(1.0)
+      r = functional_ops.foldr(lambda a, x: a + x[0] + x[1], (elems, -elems),
+                               initializer)
+      self.assertAllEqual(1, self.evaluate(r))
+
   def testFoldr_Scoped(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -158,6 +202,13 @@ class FunctionalOpsTest(test.TestCase):
                 values=constant_op.constant([0, 1, 2]),
                 dense_shape=[2, 2]))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testMapOverScalarErrors(self):
+    with self.assertRaisesRegexp(ValueError, "not scalars"):
+      functional_ops.map_fn(lambda x: x, [1, 2])
+    with self.assertRaisesRegexp(ValueError, "not a scalar"):
+      functional_ops.map_fn(lambda x: x, 1)
+
   def testMap_Scoped(self):
     with self.test_session() as sess:
 
@@ -229,7 +280,7 @@ class FunctionalOpsTest(test.TestCase):
     with self.test_session():
       nums = np.array([1, 2, 3, 4, 5, 6])
       with self.assertRaisesRegexp(
-          TypeError, r"two structures don't have the same sequence type."):
+          TypeError, r"two structures don't have the same nested structure"):
         # lambda emits tuple, but dtype is a list
         functional_ops.map_fn(
             lambda x: ((x + 3) * 2, -(x + 3) * 2),
@@ -316,7 +367,7 @@ class FunctionalOpsTest(test.TestCase):
       initializer = np.array(1.0)
       # Multiply a * 1 each time
       with self.assertRaisesRegexp(
-          ValueError, "two structures don't have the same number of elements"):
+          ValueError, "two structures don't have the same nested structure"):
         functional_ops.scan(lambda a, x: (a, -a), elems, initializer)
 
   def testScan_Scoped(self):
@@ -607,6 +658,380 @@ class FunctionalOpsTest(test.TestCase):
       mul = sess.run(remote_op)
       self.assertEqual(mul, 9)
 
+  def testIf(self):
+
+    @function.Defun(dtypes.float32)
+    def Twice(x):
+      return x * 2
+
+    @function.Defun(dtypes.float32)
+    def Thrice(x):
+      return x * 3 + 1
+
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(x):
+        return sess.run(
+            functional_ops.If(math_ops.greater(x, 0), [x], Twice, Thrice))[0]
+
+      self.assertAllEqual(Run(9.), 18.)
+      self.assertAllEqual(Run(-8.), -23.)
+      self.assertAllEqual(Run(0.), 1.)
+
+  def testWhile(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(n):
+        return sess.run(functional_ops.While([n, 0.], Cond, Body))[1]
+
+      self.assertAllEqual(Run(20.), 210.)
+      self.assertAllEqual(Run(100.), 5050.)
+
+  def testWhileError(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def CondReturnsTooManyArgs(n, x):
+      return n > 0, x
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def BodyReturnsTooManyArgs(n, x):
+      return n - 1, x + n, x
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Expected a single scalar.*got 2 tensors."):
+        functional_ops.While([5., 0.], CondReturnsTooManyArgs, Body)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "While loop body returned 3 arguments. Expected: 2"):
+        functional_ops.While([5., 0.], Cond, BodyReturnsTooManyArgs)[0].eval()
+
+  def testWhileInMultipleSubgraphs(self):
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Cond(n, x):  # pylint: disable=unused-argument
+      return n > 0
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+      n = array_ops.placeholder(dtypes.float32)
+      _, result = functional_ops.While([n, 0.], Cond, Body)
+      c = constant_op.constant(37.)
+
+      self.assertAllEqual(210., sess.run(result, feed_dict={n: 20.}))
+      self.assertAllEqual(5050., sess.run(result, feed_dict={n: 100.}))
+      # Test that the result is the same when we run a different subgraph.
+      self.assertAllEqual(5050., sess.run([result, c], feed_dict={n: 100.})[0])
+
+  def _tfSum(self, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu) as sess:
+
+      @function.Defun(dtypes.int32, dtypes.float32)
+      def Body(n, x):
+        return x + math_ops.to_float(n)
+
+      xs = [
+          # 1 + 2  + ... + 20
+          functional_ops.For(
+              1, 21, 1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+          # 100 + 99 + ... + 1
+          functional_ops.For(
+              100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+      ]
+      xvals = sess.run(xs)
+    self.assertAllEqual(210, xvals[0])
+    self.assertAllEqual(5050, xvals[1])
+
+  def testFor(self):
+    self._tfSum(False)
+
+  def testForWithWhile(self):
+    self._tfSum(True)
+
+  def testForWithWhileNaming(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="TestBody")
+      def TestBody(n, x):
+        return x + math_ops.to_float(n)
+
+      _ = functional_ops.For(
+          1, 21, 1, [0.], TestBody, rewrite_with_while=True)[0]
+
+    names = []
+    for func in g.as_graph_def().library.function:
+      names.append(func.signature.name)
+    self.assertTrue("TestBody" in names)
+    self.assertTrue("TestBody_Cond" in names)
+    self.assertTrue("TestBody_Body" in names)
+
+  def testForCapturedInputs(self):
+    v = variables.Variable(1.0)
+
+    @function.Defun(dtypes.int32)
+    def TestNullary(n):
+      v + math_ops.to_float(n)  # pylint: disable=expression-not-assigned
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def TestUnary(n, x):
+      return x + math_ops.to_float(n) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32, dtypes.float32)
+    def TestBinary(n, x, x2):
+      return x + math_ops.to_float(n) + v, x2 + v
+
+    for rewrite_with_while in (True, False):
+      # TODO(b/65752372): Set `use_gpu=False` because
+      # `functional_ops.While()` does not reliably work on GPU (apparently
+      # because the result of evaluating the condition may be in device
+      # memory, but it is read on the host).
+      use_gpu = not rewrite_with_while
+      with self.test_session(use_gpu=use_gpu) as sess:
+        result_nullary = functional_ops.For(
+            1, 10, 1, [], TestNullary,
+            rewrite_with_while=rewrite_with_while)
+        result_unary = functional_ops.For(
+            1, 10, 1, [0.], TestUnary,
+            rewrite_with_while=rewrite_with_while)
+        result_binary = functional_ops.For(
+            1, 10, 1, [0., 0.], TestBinary,
+            rewrite_with_while=rewrite_with_while)
+        sess.run(variables.global_variables_initializer())
+        assert not result_nullary
+        # The nullary variant doesn't return anything so we can't easily run it.
+        # As a total hack, fetch the operation by name and run it.
+        sess.run(ops.get_default_graph().get_operation_by_name(
+            "While" if rewrite_with_while else "For"))
+        assert len(result_unary) == 1
+        self.assertEqual([54.0], sess.run(result_unary))
+        assert len(result_binary) == 2
+        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+
+  def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu):
+
+      @function.Defun(dtypes.int32, *[dtypes.float64] * 3)
+      def MLP(i, a, ws, bs):
+        a = math_ops.tanh(math_ops.matmul(a, ws[i, :]) + bs[i, :])
+        return a, ws, bs
+
+      ret = functional_ops.For(
+          0,
+          wsval.shape[0],
+          1, [xval, wsval, bsval],
+          MLP,
+          rewrite_with_while=rewrite_with_while)[0]
+
+      return ret.eval()
+
+  def _npMLP(self, xval, wsval, bsval):
+    for i in range(wsval.shape[0]):
+      xval = np.tanh(np.dot(xval, wsval[i, :]) + bsval[i, :])
+    return xval
+
+  def _testForMLP(self, rewrite_with_while):
+    # We construct a 5-layer Multi-Layer Perceptron network here.
+    # Each layer have the same number of hidden unites (3), and the
+    # activation function is tanh().  We feed the input (xval) with
+    # batch size 2.
+    xval = np.random.normal(size=(2, 3))
+    wsval = np.random.normal(size=(5, 3, 3))
+    bsval = np.random.normal(size=(5, 3))
+    np_ans = self._npMLP(xval, wsval, bsval)
+    tf_for_ans = self._tfMLP(xval, wsval, bsval, rewrite_with_while)
+    self.assertAllClose(np_ans, tf_for_ans)
+
+  def testForMLP(self):
+    self._testForMLP(False)
+
+  def testForMLPWhile(self):
+    self._testForMLP(True)
+
+  def testForError(self):
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def Foo(i, v):
+      return math_ops.to_float(i) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def ReturnsTooManyArgs(unused_i, v):
+      return v, v
+
+    with self.test_session(use_gpu=True):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a scalar"):
+        functional_ops.For([0], 10, 1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Invalid start/limit/delta"):
+        functional_ops.For(0, 10, -1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "For loop body returned 2 arguments. Expected: 1"):
+        functional_ops.For(0, 10, 1, [0.0], ReturnsTooManyArgs)[0].eval()
+
+  def testGradient(self):
+
+    @function.Defun(dtypes.float32)
+    def Poly(x):
+      # y = 2x^3+3x^2+4x+8
+      return 2 * x * x * x + 3 * x * x + 4 * x + 8
+
+    @function.Defun(dtypes.float32)
+    def Grad(x):
+      # dy/dx = dy/dy * dy/dx = 1.0 * (6x^2+6x+4)
+      return functional_ops.Gradient([x, 1.0], Poly)[0]
+
+    with self.test_session(use_gpu=False) as sess:
+      a = constant_op.constant(0.)
+      avals = [Poly(a), Grad(a)]
+      b = constant_op.constant(1.)
+      bvals = [Poly(b), Grad(b)]
+      self.assertAllEqual(sess.run(avals), [8., 4.])
+      self.assertAllEqual(sess.run(bvals), [17., 16.])
+
+
+class PartitionedCallTest(test.TestCase):
+
+  def testBasicSingleDevice(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      with ops.device("/cpu:0"):
+        a = x + x
+        b = y + y
+        return a + b
+
+    output, = self.evaluate(
+        functional_ops.partitioned_call(
+            args=[constant_op.constant(1.),
+                  constant_op.constant(2.)], f=Body))
+    self.assertEqual(output, 6.)
+
+  def testBasicMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      # if x = 1, y = 2, ...
+      with ops.device("/cpu:0"):
+        # a:= 1 + 1 = 2
+        a = x + x
+      with ops.device("/cpu:1"):
+        # b:= 2 + 2 = 4
+        b = a + y
+      with ops.device("/cpu:2"):
+        # c:= 2 + 4 = 6
+        c = a + b
+      # a + b + c = 2 + 4 + 6 = 12
+      return a + b + c
+
+    with self.test_session(config=config):
+      output, = functional_ops.partitioned_call(
+          args=[constant_op.constant(1.),
+                constant_op.constant(2.)], f=Body)
+      self.assertEqual(output.eval(), 12.)
+
+  def testBasicMultiDeviceGPU(self):
+    if not test_util.is_gpu_available():
+      return
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      with ops.device("/gpu:0"):
+        a = x + x
+        b = y + y
+      with ops.device("/cpu:0"):
+        c = a + b
+        return c
+
+    output, = self.evaluate(
+        functional_ops.partitioned_call(
+            args=[constant_op.constant(1.),
+                  constant_op.constant(2.)], f=Body))
+    self.assertEqual(output, 6.)
+
+  def testBasicNoDeviceAnnotations(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      a = x + x
+      b = y + y
+      return a + b
+
+    output, = self.evaluate(
+        functional_ops.partitioned_call(
+            args=[constant_op.constant(1.),
+                  constant_op.constant(2.)], f=Body))
+    self.assertEqual(output, 6.)
+
+  def testShardsRunOnRequestedDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+
+    @function.Defun()
+    def Body():
+      # Serialize DT_RESOURCE handles as DT_STRINGs, which encode the device on
+      # which the resource was created, so that we can verify that ops were
+      # actually run on the requested devices.
+      #
+      # TODO(akshayka): Provide a cleaner, more idiomatic API for obtaining the
+      # name of the device on which a resource lives / for determining the
+      # device on which an op ran.
+      with ops.device("/cpu:0"):
+        s1 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      with ops.device("/cpu:1"):
+        s2 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      with ops.device("/cpu:2"):
+        s3 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      return s1, s2, s3
+
+    with self.test_session(config=config):
+      outputs = functional_ops.partitioned_call(args=[], f=Body)
+      self.assertTrue(compat.as_bytes("CPU:0") in outputs[0].eval())
+      self.assertTrue(compat.as_bytes("CPU:1") in outputs[1].eval())
+      self.assertTrue(compat.as_bytes("CPU:2") in outputs[2].eval())
+
 
 if __name__ == "__main__":
   test.main()
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index 9a946925693370912613f4dde33bbbda176060e4..a2fcd751dfa94605d271587640815fae6ac1c360 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -149,6 +149,15 @@ class GatherTest(test.TestCase):
       self.assertAllEqual([b"asdf", b"qwer"],
                           array_ops.gather(params, 0, axis=1).eval())
 
+  def testUInt32AndUInt64(self):
+    for unsigned_type in (dtypes.uint32, dtypes.uint64):
+      params = self._buildParams(
+          np.array([[1, 2, 3], [7, 8, 9]]), unsigned_type)
+      with self.test_session():
+        self.assertAllEqual([7, 8, 9],
+                            array_ops.gather(params, 1, axis=0).eval())
+        self.assertAllEqual([1, 7], array_ops.gather(params, 0, axis=1).eval())
+
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/gradient_correctness_test.py b/tensorflow/python/kernel_tests/gradient_correctness_test.py
index 10fe4f509080b20d72ee26cca0e24d75ed520895..e93c6235f74e8fce03ea13fc62fd79a005cfa918 100644
--- a/tensorflow/python/kernel_tests/gradient_correctness_test.py
+++ b/tensorflow/python/kernel_tests/gradient_correctness_test.py
@@ -40,6 +40,71 @@ class GradientCorrectnessTest(test.TestCase):
       # [dexp(x)/dx + d(log(exp(x)))/dx] @ x=1 == exp(1) + 1
       self.assertAllClose(grad_vals[0], exp1_plus_one)
 
+  def testIdentityGradient(self):
+    x = constant_op.constant(3.)
+    dx_dx, = gradients_impl.gradients(x, x)
+    with self.test_session() as sess:
+      self.assertAllClose(1., sess.run(dx_dx))
+
+  def testIntegerIdentityGradient(self):
+    x = constant_op.constant(3)
+    dx_dx, = gradients_impl.gradients(x, x)
+    with self.test_session() as sess:
+      self.assertAllClose(1, sess.run(dx_dx))
+
+  def testGradientWithIntegerPath(self):
+    x = constant_op.constant([3.9, 4.1])
+    k = math_ops.to_float(math_ops.to_int32(x))
+    y = x * k
+    dy_dx, = gradients_impl.gradients(y, x)
+    with self.test_session() as sess:
+      self.assertAllClose([3., 4.], sess.run(dy_dx))
+
+  def testNoIntegerGradient1(self):
+    x = constant_op.constant([3.9, 4.1])
+    k = math_ops.to_float(math_ops.to_int32(x))
+    y = k * k
+    dy_dx, = gradients_impl.gradients(y, x)
+    self.assertIsNone(dy_dx)
+
+  def testNoIntegerGradient2(self):
+    k = constant_op.constant([3, 4])
+    x = math_ops.to_float(k)
+    y = x * x
+    dy_dk, = gradients_impl.gradients(y, k)
+    self.assertIsNone(dy_dk)
+
+  def testNoIntegerGradient3(self):
+    k = constant_op.constant([3, 4])
+    m = k * k
+    dm_dk, = gradients_impl.gradients(m, k)
+    self.assertIsNone(dm_dk)
+
+  def testNoIntegerGradient4(self):
+    k = constant_op.constant([3, 4])
+    m = k * k * k
+    dm_dk, = gradients_impl.gradients(m, k)
+    self.assertIsNone(dm_dk)
+
+  def testNoIntegerGradient5(self):
+    k = constant_op.constant([3, 4])
+    m = k * k
+    n = m * m
+    dn_dk, = gradients_impl.gradients(n, k)
+    self.assertIsNone(dn_dk)
+
+  def testNoIntegerGradient6(self):
+    k = constant_op.constant(3)
+    x = math_ops.to_float(k)
+    grad_1, = gradients_impl.gradients(k * k, k)
+    grad_2, = gradients_impl.gradients(x * x, k)
+    grad_3, = gradients_impl.gradients(math_ops.square(k), k)
+    grad_4, = gradients_impl.gradients(math_ops.square(x), k)
+    self.assertIsNone(grad_1)
+    self.assertIsNone(grad_2)
+    self.assertIsNone(grad_3)
+    self.assertIsNone(grad_4)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/identity_op_py_test.py b/tensorflow/python/kernel_tests/identity_op_py_test.py
index 2cfe420bd49ec44815d1386bd873b234d8710e9d..49fb76d5b41de18ed3ba2187e85cb288e7344c38 100644
--- a/tensorflow/python/kernel_tests/identity_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_op_py_test.py
@@ -65,7 +65,7 @@ class IdentityOpTest(test.TestCase):
           constant_op.constant(
               [[1, 2, 3], [6, 5, 4]], dtype=dtypes.int32))
       self.assertEquals(shape, tensor.get_shape())
-      self.assertEquals(shape, gen_array_ops._ref_identity(tensor).get_shape())
+      self.assertEquals(shape, gen_array_ops.ref_identity(tensor).get_shape())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 19a7d2f9d51fff46ee817ad03ef62383f6727791..a9b55854f1b4a3dfc49f05397ca32bc7b2ccb88e 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -25,10 +25,13 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -548,7 +551,6 @@ class OrthogonalInitializerTest(test.TestCase):
       init2 = init_ops.orthogonal_initializer(gain=3.14, seed=1, dtype=dtype)
       with self.test_session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
-      with self.test_session(graph=ops.Graph(), use_gpu=True):
         t2 = init2(shape).eval()
       return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
 
@@ -571,6 +573,440 @@ class OrthogonalInitializerTest(test.TestCase):
                 np.dot(t, t.T), np.eye(t.shape[0]), rtol=tol, atol=tol)
 
 
+class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_delta_orthogonal(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_delta_orthogonal()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_delta_orthogonal,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_delta_orthogonal()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_delta_orthogonal(gain=3.14,
+                                                      seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testShapesValues(self):
+    gain = 3.14
+    for dtype in [dtypes.float32]:
+      for kernel_size in [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]:
+        tol = 1e-2
+        # Check orthogonality by computing ratio between
+        # the 2-norms of the inputs and outputs.
+        if len(kernel_size) == 1:
+          shape = [4, 32, 64]
+          convolution = convolutional.conv1d
+        elif len(kernel_size) == 2:
+          convolution = convolutional.conv2d
+          shape = [4, 32, 32, 64]
+        else:
+          shape = [4, 16, 16, 16, 64]
+          convolution = convolutional.conv3d
+        inputs = random_ops.random_normal(shape, dtype=dtype)
+        inputs_2norm = linalg_ops.norm(inputs)
+        outputs = convolution(
+            inputs, padding="same", filters=128,
+            kernel_size=kernel_size, use_bias=False,
+            kernel_initializer=init_ops.convolutional_delta_orthogonal(
+                gain=gain))
+        outputs_shape = shape[0:-1] + [128]
+        outputs_2norm = linalg_ops.norm(outputs)
+        ratio = outputs_2norm / inputs_2norm
+        my_ops = variables.global_variables_initializer()
+        with self.test_session(use_gpu=True) as sess:
+          sess.run(my_ops)
+          # Check the shape of the outputs
+          t = outputs.eval()
+          self.assertAllEqual(t.shape, outputs_shape)
+          # Check isometry of the delta-orthogonal kernel.
+          self.assertAllClose(sess.run(ratio), np.sqrt(gain),
+                              rtol=tol, atol=tol)
+
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 3, 10, 10]
+    count = 70
+    tol = 1e-5
+    with self.test_session(use_gpu=True):
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_delta_orthogonal)
+        x.initializer.run()
+        y = x.eval()[1, 1, :, :]
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
+
+class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_1d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_1d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_1d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_1d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_1d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 10, 10]
+    count = 70
+    tol = 1e-5
+    with self.test_session(use_gpu=True):
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_orthogonal_1d)
+        x.initializer.run()
+        y = np.sum(x.eval(), axis=0)
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants.
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Pad input_ for computing (circular) convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+
+      beginning = kernel_size // 2
+      end = kernel_size - 1 - beginning
+
+      tmp_up = array_ops.slice(input_, [0, width - beginning, 0],
+                               [-1, beginning, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0], [-1, end, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      return tmp
+
+    cout = 64
+    shape = [10, 20, 32]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1], [2], [3], [4], [5], [6]]:
+      convolution = convolutional.conv1d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size[0], use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_1d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      ratio = outputs_2norm / inputs_2norm
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+
+
+class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_2d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_2d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_2d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Pad input_ for computing (circular) convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+      beginning = kernel_size // 2
+      end = kernel_size - 1 - beginning
+
+      tmp_up = array_ops.slice(input_, [0, width - beginning, 0, 0],
+                               [-1, beginning, width, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0, 0], [-1, end, width, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      new_width = width + kernel_size - 1
+      tmp_left = array_ops.slice(tmp, [0, 0, width - beginning, 0],
+                                 [-1, new_width, beginning, -1])
+      tmp_right = array_ops.slice(tmp, [0, 0, 0, 0], [-1, new_width, end, -1])
+
+      final = array_ops.concat([tmp_left, tmp, tmp_right], 2)
+      return final
+
+    cout = 45
+    shape = [64, 28, 28, 32]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]:
+      convolution = convolutional.conv2d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size, use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_2d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      ratio = outputs_2norm / inputs_2norm
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+
+
+class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_3d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_3d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_3d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_3d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 3, 3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 3, 3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_3d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 3, 3, 5, 5]
+    count = 20
+    tol = 1e-5
+    with self.test_session(use_gpu=True):
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_orthogonal_3d)
+        x.initializer.run()
+        y = np.sum(x.eval(), axis=(0, 1, 2))
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Padding input_ for computing circular convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+
+      beginning = kernel_size // 2
+      end = kernel_size - 1 - beginning
+
+      tmp_up = array_ops.slice(input_, [0, width - beginning, 0, 0, 0],
+                               [-1, beginning, -1, -1, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0, 0, 0],
+                                 [-1, end, -1, -1, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      tmp_left = array_ops.slice(tmp, [0, 0, width - beginning, 0, 0],
+                                 [-1, -1, beginning, -1, -1])
+      tmp_right = array_ops.slice(tmp, [0, 0, 0, 0, 0],
+                                  [-1, -1, end, -1, -1])
+      tmp = array_ops.concat([tmp_left, tmp, tmp_right], 2)
+
+      tmp_front = array_ops.slice(tmp, [0, 0, 0, width - beginning, 0],
+                                  [-1, -1, -1, beginning, -1])
+      tmp_back = array_ops.slice(tmp, [0, 0, 0, 0, 0], [-1, -1, -1, end, -1])
+      return array_ops.concat([tmp_front, tmp, tmp_back], 3)
+
+    cout = 32
+    shape = [1, 7, 7, 7, 16]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1, 1, 1], [2, 2, 2], [3, 3, 3]]:
+      convolution = convolutional.conv3d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size[0], use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_3d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      ratio = outputs_2norm / inputs_2norm
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+
+
 class IdentityInitializerTest(test.TestCase):
 
   def testInvalidDataType(self):
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f95e13187fcd5cc199d871ea5efdca363b37cd0
--- /dev/null
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -0,0 +1,198 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for inplace_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class InplaceOpsTest(test_util.TensorFlowTestCase):
+
+  def testBasicUpdate(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] = 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [-1],
+                                       array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] = 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] = 7
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicUpdateBool(self):
+    with self.test_session(use_gpu=True):
+      x = array_ops.ones([7, 3], dtypes.bool)
+      y = np.ones([7, 3], dtypes.bool.as_numpy_dtype)
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3],
+                                                            dtypes.bool))
+      y[3, :] = True
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [-1],
+                                     array_ops.zeros([1, 3], dtypes.bool))
+      y[-1, :] = False
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, 5, array_ops.zeros([3], dtypes.bool))
+      y[5, :] = False
+      self.assertAllClose(x.eval(), y)
+
+  def testBasicAdd(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = array_ops.inplace_add(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] += 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] += 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] += 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] += 99
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicSub(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] -= 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] -= 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] -= 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] -= 99
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom(self):
+    with self.test_session(use_gpu=True):
+      d0, d1, d2 = 100, 3, 5
+      x = array_ops.zeros([d0, d1, d2])
+      y = np.zeros([d0, d1, d2])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10, d1, d2))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx, :] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx, :] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx, :] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom1D(self):
+    with self.test_session(use_gpu=True):
+      d0 = 100
+      x = array_ops.zeros([d0])
+      y = np.zeros([d0])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testAlias(self):
+    with self.test_session(use_gpu=True) as sess:
+      x = array_ops.ones([2, 3])
+      y = inplace_ops.alias_inplace_add(x, [0], [[1, 2, 3]])
+      with ops.control_dependencies([y]):
+        z = array_ops.identity(x)
+        _, vy, vz = sess.run([x, y, z])
+      self.assertAllClose(vy, vz)
+
+  def testError(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a vector"):
+        _ = inplace_ops.inplace_update([[1.]], [[0]], [[10]]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "x and v shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0], [10]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "i and x shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0, 1], [[10]]).eval()
+
+  def testEmpty(self):
+    for dtype in [
+        dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool
+    ]:
+      with self.test_session(use_gpu=True):
+        test_shapes = [(), (1,), (2, 3), (0, 2), (2, 3, 5), (2, 0, 5)]
+        for shape in test_shapes:
+          val = inplace_ops.empty(shape, dtype).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty(shape, dtype, init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+          val = inplace_ops.empty_like(array_ops.zeros(shape, dtype)).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty_like(
+              array_ops.zeros(shape, dtype), init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=False).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 66afb6ec014991ca32efd5b0895ff695d3d1015f..184d1dde2aa5e8d786cb85141f8dfb90c0bdad63 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -19,10 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class LargeConcatOpTest(test.TestCase):
   """Tests that belong in concat_op_test.py, but run over large tensors."""
 
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index fd1b5bab6f5aa072c8821eb053bd8d39391be4d4..faeccc8fba9cc9768865eff28c61378c190b31e6 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -43,6 +43,26 @@ cuda_py_test(
     tags = ["noasan"],  # times out b/63678675
 )
 
+cuda_py_test(
+    name = "linear_operator_circulant_test",
+    size = "medium",
+    srcs = ["linear_operator_circulant_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = ["noasan"],  # times out b/63678675
+)
+
 cuda_py_test(
     name = "linear_operator_diag_test",
     size = "medium",
@@ -123,6 +143,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
+    tags = [
+        "noasan",  # times out
+        "optonly",
+    ],
 )
 
 cuda_py_test(
@@ -140,15 +164,3 @@ cuda_py_test(
     ],
     shard_count = 5,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7f2f1c12bf46bba16fb335707ac5015c56039ff
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
@@ -0,0 +1,700 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.linalg import linalg
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+rng = np.random.RandomState(0)
+_to_complex = linear_operator_circulant._to_complex
+
+
+class LinearOperatorCirculantBaseTest(object):
+  """Common class for circulant tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  def _shape_to_spectrum_shape(self, shape):
+    # If spectrum.shape = batch_shape + [N],
+    # this creates an operator of shape batch_shape + [N, N]
+    return shape[:-1]
+
+  def _spectrum_to_circulant_1d(self, spectrum, shape, dtype):
+    """Creates a circulant matrix from a spectrum.
+
+    Intentionally done in an explicit yet inefficient way.  This provides a
+    cross check to the main code that uses fancy reshapes.
+
+    Args:
+      spectrum: Float or complex `Tensor`.
+      shape:  Python list.  Desired shape of returned matrix.
+      dtype:  Type to cast the returned matrix to.
+
+    Returns:
+      Circulant (batch) matrix of desired `dtype`.
+    """
+    spectrum = _to_complex(spectrum)
+    spectrum_shape = self._shape_to_spectrum_shape(shape)
+    domain_dimension = spectrum_shape[-1]
+    if not domain_dimension:
+      return array_ops.zeros(shape, dtype)
+
+    # Explicitly compute the action of spectrum on basis vectors.
+    matrix_rows = []
+    for m in range(domain_dimension):
+      x = np.zeros([domain_dimension])
+      # x is a basis vector.
+      x[m] = 1.0
+      fft_x = math_ops.fft(x)
+      h_convolve_x = math_ops.ifft(spectrum * fft_x)
+      matrix_rows.append(h_convolve_x)
+    matrix = array_ops.stack(matrix_rows, axis=-1)
+    return math_ops.cast(matrix, dtype)
+
+
+class LinearOperatorCirculantTestSelfAdjointOperator(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when operator is self-adjoint.
+
+  Real spectrum <==> Self adjoint operator.
+  Note that when the spectrum is real, the operator may still be complex.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    # This operator will always be complex because, although the specturm is
+    # real, the matrix will not be real.
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating real spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # spectrum is bounded away from zero.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    # If dtype is complex, cast spectrum to complex.  The imaginary part will be
+    # zero, so the operator will still be self-adjoint.
+    spectrum = math_ops.cast(spectrum, dtype)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant(
+          spectrum_ph, is_self_adjoint=True, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, is_self_adjoint=True, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+
+
+class LinearOperatorCirculantTestHermitianSpectrum(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when the spectrum is Hermitian.
+
+  Hermitian spectrum <==> Real valued operator.  We test both real and complex
+  dtypes here though.  So in some cases the matrix will be complex but with
+  zero imaginary part.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.float32, dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating Hermitian spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # pre_spectrum is bounded away from zero.
+    pre_spectrum = linear_operator_test_util.random_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    pre_spectrum_c = _to_complex(pre_spectrum)
+
+    # Real{IFFT[pre_spectrum]}
+    #  = IFFT[EvenPartOf[pre_spectrum]]
+    # is the IFFT of something that is also bounded away from zero.
+    # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
+    pre_h = math_ops.ifft(pre_spectrum_c)
+
+    # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
+    # So we will make spectrum = FFT[h], for real valued h.
+    h = math_ops.real(pre_h)
+    h_c = _to_complex(h)
+
+    spectrum = math_ops.fft(h_c)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+
+
+class LinearOperatorCirculantTestNonHermitianSpectrum(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when the spectrum is not Hermitian.
+
+  Non-Hermitian spectrum <==> Complex valued operator.
+  We test only complex dtypes here.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # Will be well conditioned enough to get accurate solves.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape),
+        dtype=dtypes.complex64,
+        minval=1.,
+        maxval=2.)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+
+  def test_simple_positive_real_spectrum_gives_self_adjoint_pos_def_oper(self):
+    with self.test_session() as sess:
+      spectrum = math_ops.cast([6., 4, 2], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix, matrix_h = sess.run(
+          [operator.to_dense(),
+           linalg.adjoint(operator.to_dense())])
+      self.assertAllClose(matrix, matrix_h)
+      operator.assert_positive_definite().run()  # Should not fail
+      operator.assert_self_adjoint().run()  # Should not fail
+
+  def test_defining_operator_using_real_convolution_kernel(self):
+    with self.test_session():
+      convolution_kernel = [1., 2., 1.]
+      spectrum = math_ops.fft(
+          math_ops.cast(convolution_kernel, dtypes.complex64))
+
+      # spectrum is shape [3] ==> operator is shape [3, 3]
+      # spectrum is Hermitian ==> operator is real.
+      operator = linalg.LinearOperatorCirculant(spectrum)
+
+      # Allow for complex output so we can make sure it has zero imag part.
+      self.assertEqual(operator.dtype, dtypes.complex64)
+
+      matrix = operator.to_dense().eval()
+      np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
+
+  def test_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      # Make spectrum the FFT of a real convolution kernel h.  This ensures that
+      # spectrum is Hermitian.
+      h = linear_operator_test_util.random_normal(shape=(3, 4))
+      spectrum = math_ops.fft(math_ops.cast(h, dtypes.complex64))
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(
+          0, imag_matrix.eval(), rtol=0, atol=eps * 3 * 4)
+
+  def test_convolution_kernel_same_as_first_row_of_to_dense(self):
+    spectrum = [[3., 2., 1.], [2., 1.5, 1.]]
+    with self.test_session():
+      operator = linalg.LinearOperatorCirculant(spectrum)
+      h = operator.convolution_kernel()
+      c = operator.to_dense()
+
+      self.assertAllEqual((2, 3), h.get_shape())
+      self.assertAllEqual((2, 3, 3), c.get_shape())
+      self.assertAllClose(h.eval(), c.eval()[:, :, 0])
+
+  def test_assert_non_singular_fails_for_singular_operator(self):
+    spectrum = math_ops.cast([0, 4, 2j + 2], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Singular operator"):
+        operator.assert_non_singular().run()
+
+  def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
+    spectrum = math_ops.cast([-3j, 4, 2j + 2], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      operator.assert_non_singular().run()  # Should not fail
+
+  def test_assert_positive_definite_fails_for_non_positive_definite(self):
+    spectrum = math_ops.cast([6., 4, 2j], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Not positive definite"):
+        operator.assert_positive_definite().run()
+
+  def test_assert_positive_definite_does_not_fail_when_pos_def(self):
+    spectrum = math_ops.cast([6., 4, 2j + 2], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      operator.assert_positive_definite().run()  # Should not fail
+
+  def test_real_spectrum_and_not_self_adjoint_hint_raises(self):
+    spectrum = [1., 2.]
+    with self.assertRaisesRegexp(ValueError, "real.*always.*self-adjoint"):
+      linalg.LinearOperatorCirculant(spectrum, is_self_adjoint=False)
+
+  def test_real_spectrum_auto_sets_is_self_adjoint_to_true(self):
+    spectrum = [1., 2.]
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    self.assertTrue(operator.is_self_adjoint)
+
+
+class LinearOperatorCirculant2DBaseTest(object):
+  """Common class for 2D circulant tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  @property
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
+    # non-batch operators (n, n) and batch operators.
+    return [
+        build_info((0, 0)),
+        build_info((1, 1)),
+        build_info((1, 6, 6)),
+        build_info((3, 4, 4)),
+        build_info((2, 1, 3, 3))
+    ]
+
+  def _shape_to_spectrum_shape(self, shape):
+    """Get a spectrum shape that will make an operator of desired shape."""
+    # This 2D block circulant operator takes a spectrum of shape
+    # batch_shape + [N0, N1],
+    # and creates and operator of shape
+    # batch_shape + [N0*N1, N0*N1]
+    if shape == (0, 0):
+      return (0, 0)
+    elif shape == (1, 1):
+      return (1, 1)
+    elif shape == (1, 6, 6):
+      return (1, 2, 3)
+    elif shape == (3, 4, 4):
+      return (3, 2, 2)
+    elif shape == (2, 1, 3, 3):
+      return (2, 1, 3, 1)
+    else:
+      raise ValueError("Unhandled shape: %s" % shape)
+
+  def _spectrum_to_circulant_2d(self, spectrum, shape, dtype):
+    """Creates a block circulant matrix from a spectrum.
+
+    Intentionally done in an explicit yet inefficient way.  This provides a
+    cross check to the main code that uses fancy reshapes.
+
+    Args:
+      spectrum: Float or complex `Tensor`.
+      shape:  Python list.  Desired shape of returned matrix.
+      dtype:  Type to cast the returned matrix to.
+
+    Returns:
+      Block circulant (batch) matrix of desired `dtype`.
+    """
+    spectrum = _to_complex(spectrum)
+    spectrum_shape = self._shape_to_spectrum_shape(shape)
+    domain_dimension = spectrum_shape[-1]
+    if not domain_dimension:
+      return array_ops.zeros(shape, dtype)
+
+    block_shape = spectrum_shape[-2:]
+
+    # Explicitly compute the action of spectrum on basis vectors.
+    matrix_rows = []
+    for n0 in range(block_shape[0]):
+      for n1 in range(block_shape[1]):
+        x = np.zeros(block_shape)
+        # x is a basis vector.
+        x[n0, n1] = 1.0
+        fft_x = math_ops.fft2d(x)
+        h_convolve_x = math_ops.ifft2d(spectrum * fft_x)
+        # We want the flat version of the action of the operator on a basis
+        # vector, not the block version.
+        h_convolve_x = array_ops.reshape(h_convolve_x, shape[:-1])
+        matrix_rows.append(h_convolve_x)
+    matrix = array_ops.stack(matrix_rows, axis=-1)
+    return math_ops.cast(matrix, dtype)
+
+
+class LinearOperatorCirculant2DTestHermitianSpectrum(
+    LinearOperatorCirculant2DBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant2D when the spectrum is Hermitian.
+
+  Hermitian spectrum <==> Real valued operator.  We test both real and complex
+  dtypes here though.  So in some cases the matrix will be complex but with
+  zero imaginary part.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.float32, dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating Hermitian spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # pre_spectrum is bounded away from zero.
+    pre_spectrum = linear_operator_test_util.random_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    pre_spectrum_c = _to_complex(pre_spectrum)
+
+    # Real{IFFT[pre_spectrum]}
+    #  = IFFT[EvenPartOf[pre_spectrum]]
+    # is the IFFT of something that is also bounded away from zero.
+    # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
+    pre_h = math_ops.ifft2d(pre_spectrum_c)
+
+    # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
+    # So we will make spectrum = FFT[h], for real valued h.
+    h = math_ops.real(pre_h)
+    h_c = _to_complex(h)
+
+    spectrum = math_ops.fft2d(h_c)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+
+class LinearOperatorCirculant2DTestNonHermitianSpectrum(
+    LinearOperatorCirculant2DBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when the spectrum is not Hermitian.
+
+  Non-Hermitian spectrum <==> Complex valued operator.
+  We test only complex dtypes here.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # Will be well conditioned enough to get accurate solves.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape),
+        dtype=dtype,
+        minval=1.,
+        maxval=2.)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_real_hermitian_spectrum_gives_real_symmetric_operator(self):
+    with self.test_session() as sess:
+      # This is a real and hermitian spectrum.
+      spectrum = [[1., 2., 2.], [3., 4., 4.], [3., 4., 4.]]
+      operator = linalg.LinearOperatorCirculant(spectrum)
+
+      matrix_tensor = operator.to_dense()
+      self.assertEqual(matrix_tensor.dtype,
+                       linear_operator_circulant._DTYPE_COMPLEX)
+      matrix_t = array_ops.matrix_transpose(matrix_tensor)
+      imag_matrix = math_ops.imag(matrix_tensor)
+      matrix, matrix_transpose, imag_matrix = sess.run(
+          [matrix_tensor, matrix_t, imag_matrix])
+
+      np.testing.assert_allclose(0, imag_matrix, atol=1e-6)
+      self.assertAllClose(matrix, matrix_transpose, atol=0)
+
+  def test_real_spectrum_gives_self_adjoint_operator(self):
+    with self.test_session() as sess:
+      # This is a real and hermitian spectrum.
+      spectrum = linear_operator_test_util.random_normal(
+          shape=(3, 3), dtype=dtypes.float32)
+      operator = linalg.LinearOperatorCirculant2D(spectrum)
+
+      matrix_tensor = operator.to_dense()
+      self.assertEqual(matrix_tensor.dtype,
+                       linear_operator_circulant._DTYPE_COMPLEX)
+      matrix_h = linalg.adjoint(matrix_tensor)
+      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      self.assertAllClose(matrix, matrix_h, atol=0)
+
+  def test_assert_non_singular_fails_for_singular_operator(self):
+    spectrum = math_ops.cast([[0, 4], [2j + 2, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Singular operator"):
+        operator.assert_non_singular().run()
+
+  def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
+    spectrum = math_ops.cast([[-3j, 4], [2j + 2, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      operator.assert_non_singular().run()  # Should not fail
+
+  def test_assert_positive_definite_fails_for_non_positive_definite(self):
+    spectrum = math_ops.cast([[6., 4], [2j, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Not positive definite"):
+        operator.assert_positive_definite().run()
+
+  def test_assert_positive_definite_does_not_fail_when_pos_def(self):
+    spectrum = math_ops.cast([[6., 4], [2j + 2, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      operator.assert_positive_definite().run()  # Should not fail
+
+  def test_real_spectrum_and_not_self_adjoint_hint_raises(self):
+    spectrum = [[1., 2.], [3., 4]]
+    with self.assertRaisesRegexp(ValueError, "real.*always.*self-adjoint"):
+      linalg.LinearOperatorCirculant2D(spectrum, is_self_adjoint=False)
+
+  def test_real_spectrum_auto_sets_is_self_adjoint_to_true(self):
+    spectrum = [[1., 2.], [3., 4]]
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    self.assertTrue(operator.is_self_adjoint)
+
+  def test_invalid_dtype_raises(self):
+    spectrum = array_ops.constant(rng.rand(2, 2, 2))
+    with self.assertRaisesRegexp(TypeError, "must have dtype"):
+      linalg.LinearOperatorCirculant2D(spectrum)
+
+  def test_invalid_rank_raises(self):
+    spectrum = array_ops.constant(np.float32(rng.rand(2)))
+    with self.assertRaisesRegexp(ValueError, "must have at least 2 dimensions"):
+      linalg.LinearOperatorCirculant2D(spectrum)
+
+
+class LinearOperatorCirculant3DTest(test.TestCase):
+  """Simple test of the 3D case.  See also the 1D and 2D tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  def test_real_spectrum_gives_self_adjoint_operator(self):
+    with self.test_session() as sess:
+      # This is a real and hermitian spectrum.
+      spectrum = linear_operator_test_util.random_normal(
+          shape=(2, 2, 3, 5), dtype=dtypes.float32)
+      operator = linalg.LinearOperatorCirculant3D(spectrum)
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape)
+
+      matrix_tensor = operator.to_dense()
+      self.assertEqual(matrix_tensor.dtype,
+                       linear_operator_circulant._DTYPE_COMPLEX)
+      matrix_h = linalg.adjoint(matrix_tensor)
+
+      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
+      self.assertAllClose(matrix, matrix_h)
+
+  def test_defining_operator_using_real_convolution_kernel(self):
+    with self.test_session():
+      convolution_kernel = linear_operator_test_util.random_normal(
+          shape=(2, 2, 3, 5), dtype=dtypes.float32)
+      # Convolution kernel is real ==> spectrum is Hermitian.
+      spectrum = math_ops.fft3d(
+          math_ops.cast(convolution_kernel, dtypes.complex64))
+
+      # spectrum is Hermitian ==> operator is real.
+      operator = linalg.LinearOperatorCirculant3D(spectrum)
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape)
+
+      # Allow for complex output so we can make sure it has zero imag part.
+      self.assertEqual(operator.dtype, dtypes.complex64)
+      matrix = operator.to_dense().eval()
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
+      np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
+
+  def test_defining_spd_operator_by_taking_real_part(self):
+    with self.test_session() as sess:
+      # S is real and positive.
+      s = linear_operator_test_util.random_uniform(
+          shape=(10, 2, 3, 4), dtype=dtypes.float32, minval=1., maxval=2.)
+
+      # Let S = S1 + S2, the Hermitian and anti-hermitian parts.
+      # S1 = 0.5 * (S + S^H), S2 = 0.5 * (S - S^H),
+      # where ^H is the Hermitian transpose of the function:
+      #    f(n0, n1, n2)^H := ComplexConjugate[f(N0-n0, N1-n1, N2-n2)].
+      # We want to isolate S1, since
+      #   S1 is Hermitian by construction
+      #   S1 is real since S is
+      #   S1 is positive since it is the sum of two positive kernels
+
+      # IDFT[S] = IDFT[S1] + IDFT[S2]
+      #         =      H1  +      H2
+      # where H1 is real since it is Hermitian,
+      # and H2 is imaginary since it is anti-Hermitian.
+      ifft_s = math_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
+
+      # Throw away H2, keep H1.
+      real_ifft_s = math_ops.real(ifft_s)
+
+      # This is the perfect spectrum!
+      # spectrum = DFT[H1]
+      #          = S1,
+      fft_real_ifft_s = math_ops.fft3d(
+          math_ops.cast(real_ifft_s, dtypes.complex64))
+
+      # S1 is Hermitian ==> operator is real.
+      # S1 is real ==> operator is self-adjoint.
+      # S1 is positive ==> operator is positive-definite.
+      operator = linalg.LinearOperatorCirculant3D(fft_real_ifft_s)
+
+      # Allow for complex output so we can check operator has zero imag part.
+      self.assertEqual(operator.dtype, dtypes.complex64)
+      matrix, matrix_t = sess.run([
+          operator.to_dense(),
+          array_ops.matrix_transpose(operator.to_dense())
+      ])
+      operator.assert_positive_definite().run()  # Should not fail.
+      np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
+      self.assertAllClose(matrix, matrix_t)
+
+      # Just to test the theory, get S2 as well.
+      # This should create an imaginary operator.
+      # S2 is anti-Hermitian ==> operator is imaginary.
+      # S2 is real ==> operator is self-adjoint.
+      imag_ifft_s = math_ops.imag(ifft_s)
+      fft_imag_ifft_s = math_ops.fft3d(
+          1j * math_ops.cast(imag_ifft_s, dtypes.complex64))
+      operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s)
+
+      matrix, matrix_h = sess.run([
+          operator_imag.to_dense(),
+          array_ops.matrix_transpose(math_ops.conj(operator_imag.to_dense()))
+      ])
+      self.assertAllClose(matrix, matrix_h)
+      np.testing.assert_allclose(0, np.real(matrix), atol=1e-7)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
index 4d79365dbefc74fe8412b65ec089fb2af4255aea..f96b9ccdaacae7d8e0552ed3d74ce53808fed963 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
@@ -44,9 +44,9 @@ class SquareLinearOperatorCompositionTest(
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
     sess = ops.get_default_session()
-    shape = list(shape)
+    shape = list(build_info.shape)
 
     # Either 1 or 2 matrices, depending.
     num_operators = rng.randint(low=1, high=3)
@@ -148,9 +148,9 @@ class NonSquareLinearOperatorCompositionTest(
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
     sess = ops.get_default_session()
-    shape = list(shape)
+    shape = list(build_info.shape)
 
     # Test only the case of 2 matrices.
     # The Square test uses either 1 or 2, so we have tested the case of 1 matrix
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 343d158498833dd92361bc41d433e28296fc4c9a..0a0e31c716ecfa10ed93cff92fa908a240f8495e 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -34,7 +34,8 @@ class LinearOperatorDiagTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
     diag = linear_operator_test_util.random_sign_uniform(
         shape[:-1], minval=1., maxval=2., dtype=dtype)
     if use_placeholder:
@@ -129,7 +130,7 @@ class LinearOperatorDiagTest(
     with self.test_session() as sess:
       x = random_ops.random_normal(shape=(2, 2, 3, 4))
 
-      # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
+      # This LinearOperatorDiag will be broadcast to (2, 2, 3, 3) during solve
       # and matmul with 'x' as the argument.
       diag = random_ops.random_uniform(shape=(2, 1, 3))
       operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index 50d6f524e9ad75715d7a57348638fdfeee667f40..b3da623b5e8d8c99c6777e75e2d49f24dab1c96b 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -36,11 +36,11 @@ class SquareLinearOperatorFullMatrixTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
-    shape = list(shape)
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
 
-    matrix = linear_operator_test_util.random_positive_definite_matrix(shape,
-                                                                       dtype)
+    matrix = linear_operator_test_util.random_positive_definite_matrix(
+        shape, dtype)
 
     if use_placeholder:
       matrix_ph = array_ops.placeholder(dtype=dtype)
@@ -136,8 +136,8 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.float64]
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
-    shape = list(shape)
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
         shape, dtype, force_well_conditioned=True)
@@ -210,7 +210,8 @@ class NonSquareLinearOperatorFullMatrixTest(
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
     matrix = linear_operator_test_util.random_normal(shape, dtype=dtype)
     if use_placeholder:
       matrix_ph = array_ops.placeholder(dtype=dtype)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 6d635707683f4500919073a4f43c320a44b65018..59f63f949e96991193412d3574603e58a75cb6e5 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -43,8 +43,8 @@ class LinearOperatorIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
-    shape = list(shape)
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
     batch_shape = shape[:-2]
@@ -261,8 +261,8 @@ class LinearOperatorScaledIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
-    shape = list(shape)
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
     batch_shape = shape[:-2]
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
index d3a47da946b12277c4c390a4a320d7c91ed81b32..8095f6419ef0d9543339cf1f4ee9cd4783f852b9 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
@@ -55,16 +55,22 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     return [dtypes.float32, dtypes.float64]
 
   @property
-  def _shapes_to_test(self):
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
     # Previously we had a (2, 10, 10) shape at the end.  We did this to test the
     # inversion and determinant lemmas on not-tiny matrices, since these are
     # known to have stability issues.  This resulted in test timeouts, so this
     # shape has been removed, but rest assured, the tests did pass.
-    return [(0, 0), (1, 1), (1, 3, 3), (3, 4, 4), (2, 1, 4, 4)]
-
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+    return [
+        build_info((0, 0)),
+        build_info((1, 1)),
+        build_info((1, 3, 3)),
+        build_info((3, 4, 4)),
+        build_info((2, 1, 4, 4))]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
     # Recall A = L + UDV^H
-    shape = list(shape)
+    shape = list(build_info.shape)
     diag_shape = shape[:-1]
     k = shape[-2] // 2 + 1
     u_perturbation_shape = shape[:-1] + [k]
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
index db3918f9983c5b7d05fa4ba3bc85b26a485f2f00..a57d2f085e089fb913f09fdd9b07cf13aa7f3c35 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
@@ -38,7 +38,8 @@ class LinearOperatorLowerTriangularTest(
     # matrix_triangular_solve.
     return [dtypes.float32, dtypes.float64]
 
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
     # Upper triangle will be nonzero, but ignored.
     # Use a diagonal that ensures this matrix is well conditioned.
     tril = linear_operator_test_util.random_tril_matrix(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index e1edffc3d9afec618f8dfcf74bae1b0f1bde2772..7b291e29de41d2fe37257bb42222ac23fc8e1d3f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -94,8 +95,8 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 class BroadcastMatrixBatchDimsTest(test.TestCase):
 
   def test_zero_batch_matrices_returned_as_empty_list(self):
-    self.assertAllEqual(
-        [], linear_operator_util.broadcast_matrix_batch_dims([]))
+    self.assertAllEqual([],
+                        linear_operator_util.broadcast_matrix_batch_dims([]))
 
   def test_one_batch_matrix_returned_after_tensor_conversion(self):
     arr = rng.rand(2, 3, 4)
@@ -194,6 +195,44 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
       linear_operator_util.broadcast_matrix_batch_dims([y, x])
 
 
+class CholeskySolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    chol = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    chol = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    chol_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.cholesky_solve_with_broadcast(
+                  chol_ph, rhs_ph),
+              linalg_ops.cholesky_solve(chol_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              chol_ph: chol,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
 class MatmulWithBroadcastTest(test.TestCase):
 
   def test_static_dims_broadcast(self):
@@ -209,7 +248,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       expected = math_ops.matmul(x, y_broadcast)
       self.assertAllEqual(expected.eval(), result.eval())
 
-  def test_dynamic_dims_broadcast_32bit(self):
+  def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
     x = rng.rand(2, 1, 3)
@@ -221,9 +260,90 @@ class MatmulWithBroadcastTest(test.TestCase):
 
     with self.test_session() as sess:
       result, expected = sess.run(
-          [linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
-           math_ops.matmul(x, y_broadcast)],
-          feed_dict={x_ph: x, y_ph: y})
+          [
+              linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
+              math_ops.matmul(x, y_broadcast)
+          ],
+          feed_dict={
+              x_ph: x,
+              y_ph: y
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_solve(matrix_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_triangular_solve_with_broadcast(
+          matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_triangular_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
       self.assertAllEqual(expected, result)
 
 
@@ -244,7 +364,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       # Should not raise
       linear_operator_util.assert_compatible_matrix_dimensions(
-          operator, x).run()
+          operator, x).run()  # pyformat: disable
 
   def test_incompatible_dimensions_raise(self):
     with self.test_session():
@@ -252,7 +372,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       with self.assertRaisesOpError("Incompatible matrix dimensions"):
         linear_operator_util.assert_compatible_matrix_dimensions(
-            operator, x).run()
+            operator, x).run()  # pyformat: disable
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 1577b7bc8021a326eb720bdf059b8d1c568c0cc1..098f9724a2a65f544005e1799f6329a29bb02abe 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -30,7 +30,11 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -41,71 +45,83 @@ def scalar_shape():
 
 class ListOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPop(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testPushPop()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStack(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [1.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testStack()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 2.0)
+    self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
-    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
+    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFromTensorGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
-    self.assertAllEqual(e0, 1.0)
+    self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [3.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [3.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testGetSetItem()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUnknownShape(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=-1)
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=-1)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
-    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testCPUGPUCopy(self):
     if not context.num_gpus():
       return
@@ -114,15 +130,92 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
-          list_ops.tensor_list_pop_back(
-              l_gpu, element_dtype=dtypes.float32)[1],
-          2.0)
+          self.evaluate(
+              list_ops.tensor_list_pop_back(
+                  l_gpu, element_dtype=dtypes.float32)[1]), 2.0)
     l_cpu = array_ops.identity(l_gpu)
     self.assertAllEqual(
-        list_ops.tensor_list_pop_back(
-            l_cpu, element_dtype=dtypes.float32)[1],
-        2.0)
+        self.evaluate(
+            list_ops.tensor_list_pop_back(
+                l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphStack(self):
+    with context.graph_mode(), self.test_session():
+      tl = list_ops.empty_tensor_list(
+          element_shape=constant_op.constant([1], dtype=dtypes.int32),
+          element_dtype=dtypes.int32)
+      tl = list_ops.tensor_list_push_back(tl, [1])
+      self.assertAllEqual(
+          self.evaluate(
+              list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)),
+          [[1]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphStackInLoop(self):
+    with context.graph_mode(), self.test_session():
+      t1 = list_ops.empty_tensor_list(
+          element_shape=constant_op.constant([], dtype=dtypes.int32),
+          element_dtype=dtypes.int32)
+      i = constant_op.constant(0, dtype=dtypes.int32)
+
+      def body(i, t1):
+        t1 = list_ops.tensor_list_push_back(t1, i)
+        i += 1
+        return i, t1
+
+      i, t1 = control_flow_ops.while_loop(lambda i, t1: math_ops.less(i, 4),
+                                          body, [i, t1])
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32)
+      self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphStackSwitchDtype(self):
+    with context.graph_mode(), self.test_session():
+      list_ = list_ops.empty_tensor_list(
+          element_shape=constant_op.constant([], dtype=dtypes.int32),
+          element_dtype=dtypes.int32)
+      m = constant_op.constant([1, 2, 3], dtype=dtypes.float32)
+
+      def body(list_, m):
+        list_ = control_flow_ops.cond(
+            math_ops.equal(list_ops.tensor_list_length(list_), 0),
+            lambda: list_ops.empty_tensor_list(m.shape, m.dtype), lambda: list_)
+        list_ = list_ops.tensor_list_push_back(list_, m)
+        return list_, m
+
+      for _ in range(2):
+        list_, m = body(list_, m)
+
+      s1 = list_ops.tensor_list_stack(list_, element_dtype=dtypes.float32)
+      np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphStackInLoopSwitchDtype(self):
+    with context.graph_mode(), self.test_session():
+      t1 = list_ops.empty_tensor_list(
+          element_shape=constant_op.constant([], dtype=dtypes.int32),
+          element_dtype=dtypes.int32)
+      i = constant_op.constant(0, dtype=dtypes.float32)
+      m = constant_op.constant([1, 2, 3], dtype=dtypes.float32)
+
+      def body(i, m, t1):
+        t1 = control_flow_ops.cond(
+            math_ops.equal(list_ops.tensor_list_length(t1), 0),
+            lambda: list_ops.empty_tensor_list(m.shape, m.dtype), lambda: t1)
+
+        t1 = list_ops.tensor_list_push_back(t1, m * i)
+        i += 1.0
+        return i, m, t1
+
+      i, m, t1 = control_flow_ops.while_loop(
+          lambda i, m, t1: math_ops.less(i, 4), body, [i, m, t1])
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32)
+      np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)])
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSerialize(self):
     # pylint: disable=g-import-not-at-top
     try:
@@ -152,8 +245,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
               l_ps, element_dtype=dtypes.float32)
         with ops.device("/job:worker"):
           worker_e = array_ops.identity(e)
-        self.assertAllEqual(worker_e.eval(), [2.0])
+        self.assertAllEqual(self.evaluate(worker_e), [2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
       l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
@@ -163,18 +257,21 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       l = list_ops.tensor_list_push_back(l, c)
       l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       e = 2 * e
-    self.assertAllEqual(tape.gradient(e, [c])[0], 2.0)
+    self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackFromTensorGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
       l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
       c2 = list_ops.tensor_list_stack(
-          l, element_dtype=dtypes.float32)
+          l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
-    self.assertAllEqual(tape.gradient(result, [c])[0], [2.0, 2.0])
+    grad = tape.gradient(result, [c])[0]
+    self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
@@ -187,16 +284,142 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       ee = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       y = e * e + ee * ee
     grad_c, grad_c2 = tape.gradient(y, [c, c2])
-    self.assertAllEqual(grad_c, [0.0, 4.0])
-    self.assertAllEqual(grad_c2, 6.0)
+    self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
+    self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
     with self.assertRaises(errors.InvalidArgumentError):
-      list_ops.tensor_list_set_item(l, 20, 3.0)
+      self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testResourceVariableScatterGather(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
+    v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
+    self.evaluate(v.initializer)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_0_stacked))
+    v_r_sparse_stacked = list_ops.tensor_list_stack(
+        v.sparse_read(0), dtypes.float32)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
+    l_new_0 = list_ops.tensor_list_from_tensor(
+        [3.0, 4.0], element_shape=scalar_shape())
+    l_new_1 = list_ops.tensor_list_from_tensor(
+        [5.0, 6.0], element_shape=scalar_shape())
+    updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
+    updated_v_elems = array_ops.unstack(updated_v)
+    updated_v_stacked = [
+        list_ops.tensor_list_stack(el, dtypes.float32) for el in updated_v_elems
+    ]
+    expected = ([[1.0, 2.0]] * 3 + [[3.0, 4.0], [1.0, 2.0], [5.0, 6.0]] +
+                [[1.0, 2.0]] * 4)
+    self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConcat(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l_batch_0 = array_ops.stack([l0, l1])
+    l_batch_1 = array_ops.stack([l1, l0])
+
+    l_concat_01 = list_ops.tensor_list_concat_lists(
+        l_batch_0, l_batch_1, element_dtype=dtypes.float32)
+    l_concat_10 = list_ops.tensor_list_concat_lists(
+        l_batch_1, l_batch_0, element_dtype=dtypes.float32)
+    l_concat_00 = list_ops.tensor_list_concat_lists(
+        l_batch_0, l_batch_0, element_dtype=dtypes.float32)
+    l_concat_11 = list_ops.tensor_list_concat_lists(
+        l_batch_1, l_batch_1, element_dtype=dtypes.float32)
+
+    expected_00 = [[1.0, 2.0, 1.0, 2.0], [-1.0, -1.0]]
+    expected_01 = [[1.0, 2.0, -1.0], [-1.0, 1.0, 2.0]]
+    expected_10 = [[-1.0, 1.0, 2.0], [1.0, 2.0, -1.0]]
+    expected_11 = [[-1.0, -1.0], [1.0, 2.0, 1.0, 2.0]]
+
+    for i, (concat, expected) in enumerate(zip(
+        [l_concat_00, l_concat_01, l_concat_10, l_concat_11],
+        [expected_00, expected_01, expected_10, expected_11])):
+      splitted = array_ops.unstack(concat)
+      splitted_stacked_ret = self.evaluate(
+          (list_ops.tensor_list_stack(splitted[0], dtypes.float32),
+           list_ops.tensor_list_stack(splitted[1], dtypes.float32)))
+      print("Test concat %d: %s, %s, %s, %s"
+            % (i, expected[0], splitted_stacked_ret[0],
+               expected[1], splitted_stacked_ret[1]))
+      self.assertAllClose(expected[0], splitted_stacked_ret[0])
+      self.assertAllClose(expected[1], splitted_stacked_ret[1])
+
+    # Concatenating mismatched shapes fails.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(
+              l_batch_0,
+              list_ops.empty_tensor_list(scalar_shape(), dtypes.float32),
+              element_dtype=dtypes.float32))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "element shapes are not identical at index 0"):
+      l_batch_of_vec_tls = array_ops.stack(
+          [list_ops.tensor_list_from_tensor([[1.0]], element_shape=[1])] * 2)
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_vec_tls,
+                                            element_dtype=dtypes.float32))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"input_b\[0\].dtype != element_dtype."):
+      l_batch_of_int_tls = array_ops.stack(
+          [list_ops.tensor_list_from_tensor([1], element_shape=scalar_shape())]
+          * 2)
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls,
+                                            element_dtype=dtypes.float32))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testPushBackBatch(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l_batch = array_ops.stack([l0, l1])
+    l_push = list_ops.tensor_list_push_back_batch(l_batch, [3.0, 4.0])
+    l_unstack = array_ops.unstack(l_push)
+    l0_ret = list_ops.tensor_list_stack(l_unstack[0], dtypes.float32)
+    l1_ret = list_ops.tensor_list_stack(l_unstack[1], dtypes.float32)
+    self.assertAllClose([1.0, 2.0, 3.0], self.evaluate(l0_ret))
+    self.assertAllClose([-1.0, 4.0], self.evaluate(l1_ret))
+
+    with ops.control_dependencies([l_push]):
+      l_unstack_orig = array_ops.unstack(l_batch)
+      l0_orig_ret = list_ops.tensor_list_stack(l_unstack_orig[0],
+                                               dtypes.float32)
+      l1_orig_ret = list_ops.tensor_list_stack(l_unstack_orig[1],
+                                               dtypes.float32)
+
+    # Check that without aliasing, push_back_batch still works; and
+    # that it doesn't modify the input.
+    l0_r_v, l1_r_v, l0_orig_v, l1_orig_v = self.evaluate(
+        (l0_ret, l1_ret, l0_orig_ret, l1_orig_ret))
+    self.assertAllClose([1.0, 2.0, 3.0], l0_r_v)
+    self.assertAllClose([-1.0, 4.0], l1_r_v)
+    self.assertAllClose([1.0, 2.0], l0_orig_v)
+    self.assertAllClose([-1.0], l1_orig_v)
+
+    # Pushing back mismatched shapes fails.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, []))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "incompatible shape to a list at index 0"):
+      self.evaluate(
+          list_ops.tensor_list_push_back_batch(l_batch, [[3.0], [4.0]]))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Invalid data type at index 0"):
+      self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
 
 if __name__ == "__main__":
-  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 197dbf44afaea2cfaf5a1ffebb6ac0a6be09d165..1123c20a165ba93bd380fa471a8be91f7005d7bb 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -32,11 +33,25 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.ops.losses import util
 from tensorflow.python.platform import test
 from tensorflow.python.training import momentum as momentum_lib
 
 
+safe_div = losses_impl._safe_div  # pylint: disable=protected-access
+
+
+class SafeDivTest(test.TestCase):
+
+  def testEager(self):
+    with context.eager_mode():
+      self.assertAllEqual(safe_div(constant_op.constant(1.0),
+                                   constant_op.constant(0.0)), 0.0)
+      self.assertAllEqual(safe_div(constant_op.constant(1.0),
+                                   0.0), 0.0)
+
+
 class AbsoluteDifferenceLossTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b8200ac0cb1e4315a56181779c70da1126d8fc15..f31426713c49ba03b978ae7820e22c5c17319139 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import manip_ops
 from tensorflow.python.platform import test as test_lib
@@ -88,41 +90,78 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  def testNegativeAxis(self):
+    self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
+    self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
+    # Make sure negative axis shoudl be 0 <= axis + dims < dims
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
+                       3, -10).eval()
+
+  def testInvalidInputShape(self):
+    # The input should be 1-D or higher, checked in shape function.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at least rank 1 but is rank 0"):
+      manip_ops.roll(7, 1, 0)
+
   def testRollInputMustVectorHigherRaises(self):
-    tensor = 7
+    # The input should be 1-D or higher, checked in kernel.
+    tensor = array_ops.placeholder(dtype=dtypes.int32)
     shift = 1
     axis = 0
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "input must be 1-D or higher"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7})
+
+  def testInvalidAxisShape(self):
+    # The axis should be a scalar or 1-D, checked in shape function.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at most rank 1 but is rank 2"):
+      manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
 
   def testRollAxisMustBeScalarOrVectorRaises(self):
+    # The axis should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
     shift = 1
-    axis = [[0, 1]]
+    axis = array_ops.placeholder(dtype=dtypes.int32)
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "axis must be a scalar or a 1-D vector"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]})
+
+  def testInvalidShiftShape(self):
+    # The shift should be a scalar or 1-D, checked in shape function.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at most rank 1 but is rank 2"):
+      manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
 
   def testRollShiftMustBeScalarOrVectorRaises(self):
+    # The shift should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
-    shift = [[0, 1]]
+    shift = array_ops.placeholder(dtype=dtypes.int32)
     axis = 1
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "shift must be a scalar or a 1-D vector"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]})
+
+  def testInvalidShiftAndAxisNotEqualShape(self):
+    # The shift and axis must be same size, checked in shape function.
+    with self.assertRaisesRegexp(ValueError, "both shapes must be equal"):
+      manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
 
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    # The shift and axis must be same size, checked in kernel.
     tensor = [[1, 2], [3, 4]]
-    shift = [1]
+    shift = array_ops.placeholder(dtype=dtypes.int32)
     axis = [0, 1]
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "shift and axis must have the same size"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [1]})
 
   def testRollAxisOutOfRangeRaises(self):
     tensor = [1, 2]
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index 6203a412d7faec4fe9f6179141301579b5900291..a0c66c77d8850d3144678870983730537a253556 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -48,7 +48,7 @@ class ExponentialOpTest(test.TestCase):
   def _verifyExponential(self, x, np_type):
     inp = x.astype(np_type)
     with self.test_session(use_gpu=True):
-      tf_ans = gen_linalg_ops._matrix_exponential(inp)
+      tf_ans = gen_linalg_ops.matrix_exponential(inp)
       if x.size == 0:
         np_ans = np.empty(x.shape, dtype=np_type)
       else:
@@ -116,13 +116,13 @@ class ExponentialOpTest(test.TestCase):
     # When the exponential of a non-square matrix is attempted we should return
     # an error
     with self.assertRaises(ValueError):
-      gen_linalg_ops._matrix_exponential(np.array([[1., 2., 3.], [3., 4., 5.]]))
+      gen_linalg_ops.matrix_exponential(np.array([[1., 2., 3.], [3., 4., 5.]]))
 
   def testWrongDimensions(self):
     # The input to the exponential should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
-      gen_linalg_ops._matrix_exponential(tensor3)
+      gen_linalg_ops.matrix_exponential(tensor3)
 
   def testEmpty(self):
     self._verifyExponentialReal(np.empty([0, 2, 2]))
@@ -143,8 +143,8 @@ class ExponentialOpTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
-      expm1 = gen_linalg_ops._matrix_exponential(matrix1)
-      expm2 = gen_linalg_ops._matrix_exponential(matrix2)
+      expm1 = gen_linalg_ops.matrix_exponential(matrix1)
+      expm2 = gen_linalg_ops.matrix_exponential(matrix2)
       expm = sess.run([expm1, expm2])
       self.assertAllEqual(expm[0], expm[1])
 
@@ -180,7 +180,7 @@ class MatrixExponentialBenchmark(test.Benchmark):
           session.Session() as sess, \
           ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
-        expm = gen_linalg_ops._matrix_exponential(matrix)
+        expm = gen_linalg_ops.matrix_exponential(matrix)
         variables.global_variables_initializer().run()
         self.run_op_benchmark(
             sess,
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 18ed59828c15f5ad21fe054cd6e40991c02bb356..24edc4f59fe6dd84da6732036eb53e2ad367bd06 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -39,8 +39,8 @@ class LogarithmOpTest(test.TestCase):
     inp = x.astype(np_type)
     with self.test_session(use_gpu=True):
       # Verify that expm(logm(A)) == A.
-      tf_ans = gen_linalg_ops._matrix_exponential(
-          gen_linalg_ops._matrix_logarithm(inp))
+      tf_ans = gen_linalg_ops.matrix_exponential(
+          gen_linalg_ops.matrix_logarithm(inp))
       out = tf_ans.eval()
       self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3)
 
@@ -85,14 +85,14 @@ class LogarithmOpTest(test.TestCase):
     # When the logarithm of a non-square matrix is attempted we should return
     # an error
     with self.assertRaises(ValueError):
-      gen_linalg_ops._matrix_logarithm(
+      gen_linalg_ops.matrix_logarithm(
           np.array([[1., 2., 3.], [3., 4., 5.]], dtype=np.complex64))
 
   def testWrongDimensions(self):
     # The input to the logarithm should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.], dtype=dtypes.complex64)
     with self.assertRaises(ValueError):
-      gen_linalg_ops._matrix_logarithm(tensor3)
+      gen_linalg_ops.matrix_logarithm(tensor3)
 
   def testEmpty(self):
     self._verifyLogarithmComplex(np.empty([0, 2, 2], dtype=np.complex64))
@@ -115,8 +115,8 @@ class LogarithmOpTest(test.TestCase):
           random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
       matrix2 = math_ops.cast(
           random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
-      logm1 = gen_linalg_ops._matrix_logarithm(matrix1)
-      logm2 = gen_linalg_ops._matrix_logarithm(matrix2)
+      logm1 = gen_linalg_ops.matrix_logarithm(matrix1)
+      logm2 = gen_linalg_ops.matrix_logarithm(matrix2)
       logm = sess.run([logm1, logm2])
       self.assertAllEqual(logm[0], logm[1])
 
@@ -152,7 +152,7 @@ class MatrixLogarithmBenchmark(test.Benchmark):
           session.Session() as sess, \
           ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
-        logm = gen_linalg_ops._matrix_logarithm(matrix)
+        logm = gen_linalg_ops.matrix_logarithm(matrix)
         variables.global_variables_initializer().run()
         self.run_op_benchmark(
             sess,
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index fd78c026c273da1ffecf9e1dfe8c9e6042a4be69..55653489aff0a745c5731db4d31864aede97e954 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -417,7 +417,7 @@ class MeanTensorTest(test.TestCase):
 
       self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean), 5)
 
-  def testWeighted1d(self):
+  def testBinaryWeighted1d(self):
     with self.test_session() as sess:
       # Create the queue that populates the values.
       values_queue = data_flow_ops.FIFOQueue(
@@ -444,6 +444,33 @@ class MeanTensorTest(test.TestCase):
         sess.run(update_op)
       self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5)
 
+  def testWeighted1d(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the values.
+      values_queue = data_flow_ops.FIFOQueue(
+          4, dtypes=dtypes_lib.float32, shapes=(1, 2))
+      _enqueue_vector(sess, values_queue, [0, 1])
+      _enqueue_vector(sess, values_queue, [-4.2, 9.1])
+      _enqueue_vector(sess, values_queue, [6.5, 0])
+      _enqueue_vector(sess, values_queue, [-3.2, 4.0])
+      values = values_queue.dequeue()
+
+      # Create the queue that populates the weights.
+      weights_queue = data_flow_ops.FIFOQueue(
+          4, dtypes=dtypes_lib.float32, shapes=(1, 1))
+      _enqueue_vector(sess, weights_queue, [[0.0025]])
+      _enqueue_vector(sess, weights_queue, [[0.005]])
+      _enqueue_vector(sess, weights_queue, [[0.01]])
+      _enqueue_vector(sess, weights_queue, [[0.0075]])
+      weights = weights_queue.dequeue()
+
+      mean, update_op = metrics.mean_tensor(values, weights)
+
+      sess.run(variables.local_variables_initializer())
+      for _ in range(4):
+        sess.run(update_op)
+      self.assertAllClose([[0.8, 3.52]], sess.run(mean), 5)
+
   def testWeighted2d_1(self):
     with self.test_session() as sess:
       # Create the queue that populates the values.
@@ -1097,62 +1124,96 @@ class AUCTest(test.TestCase):
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
-  def testAUCPRSpecialCase(self):
+  # Regarding the AUC-PR tests: note that the preferred method when
+  # calculating AUC-PR is summation_method='careful_interpolation'.
+  def testCorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.54166, sess.run(update_op), delta=1e-3)
-
-      self.assertAlmostEqual(0.54166, auc.eval(), delta=1e-3)
+      # expected ~= 0.79726744594
+      expected = 1 - math.log(1.5) / 2
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
-  def testAnotherAUCPRSpecialCase(self):
+  def testCorrectAnotherAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.44365042, sess.run(update_op), delta=1e-3)
-
-      self.assertAlmostEqual(0.44365042, auc.eval(), delta=1e-3)
+      # expected ~= 0.61350593198
+      expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
-  def testThirdAUCPRSpecialCase(self):
+  def testThirdCorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.73611039, sess.run(update_op), delta=1e-3)
+      # expected ~= 0.90410597584
+      expected = 1 - math.log(4./3) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
-      self.assertAlmostEqual(0.73611039, auc.eval(), delta=1e-3)
+  def testIncorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
-  def testFourthAUCPRSpecialCase(self):
-    # Create the labels and data.
-    labels = np.array([
-        0, 0, 0, 0, 0, 0, 0, 1, 0, 1])
-    predictions = np.array([
-        0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35])
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
+
+      self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
+  def testAnotherIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
-      auc, _ = metrics.auc(
-          labels, predictions, curve='PR', num_thresholds=11)
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
-      # Since this is only approximate, we can't expect a 6 digits match.
-      # Although with higher number of samples/thresholds we should see the
-      # accuracy improving
-      self.assertAlmostEqual(0.0, auc.eval(), delta=0.001)
+      self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
+
+      self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
+
+  def testThirdIncorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
+
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
+
+      self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
 
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
@@ -1178,16 +1239,16 @@ class AUCTest(test.TestCase):
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
-  def testRecallOneAndPrecisionOne(self):
+  def testRecallOneAndPrecisionOneGivesOnePRAUC(self):
     with self.test_session() as sess:
       predictions = array_ops.ones([4], dtype=dtypes_lib.float32)
       labels = array_ops.ones([4])
       auc, update_op = metrics.auc(labels, predictions, curve='PR')
 
       sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, sess.run(update_op), 6)
+      self.assertAlmostEqual(1, sess.run(update_op), 6)
 
-      self.assertAlmostEqual(0.5, auc.eval(), 6)
+      self.assertAlmostEqual(1, auc.eval(), 6)
 
   def np_auc(self, predictions, labels, weights):
     """Computes the AUC explicitly using Numpy.
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index d85512fae69801a0c14789f2ed2122559815f80d..3f71b326a2fcb8accfd3182ce5d42f30aa2c74b4 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -37,17 +37,17 @@ class NormOpTest(test_lib.TestCase):
 
   def testBadOrder(self):
     matrix = [[0., 1.], [2., 3.]]
-    for ord_ in "foo", -7, -1.1, 0:
+    for ord_ in "fro", -7, -1.1, 0:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported vector norm"):
-        linalg_ops.norm(matrix, ord="fro")
+        linalg_ops.norm(matrix, ord=ord_)
 
-    for ord_ in "foo", -7, -1.1, 0:
+    for ord_ in "fro", -7, -1.1, 0:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported vector norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=-1)
 
-    for ord_ in 1.1, 2:
+    for ord_ in "foo", -7, -1.1, 1.1:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported matrix norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
@@ -69,14 +69,14 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
       if use_static_shape_:
         tf_matrix = constant_op.constant(matrix)
         tf_norm = linalg_ops.norm(
-            tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_)
+            tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm)
       else:
         tf_matrix = array_ops.placeholder(dtype_)
         tf_norm = linalg_ops.norm(
-            tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_)
+            tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
-    self.assertAllClose(np_norm, tf_norm_val)
+    self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
 
   def Test(self):
     is_matrix_norm = (isinstance(axis_, tuple) or
@@ -85,8 +85,6 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
     if ((not is_matrix_norm and ord_ == "fro") or
         (is_matrix_norm and is_fancy_p_norm)):
       self.skipTest("Not supported by neither numpy.linalg.norm nor tf.norm")
-    if is_matrix_norm and ord_ == 2:
-      self.skipTest("Not supported by tf.norm")
     if ord_ == 'euclidean' or (axis_ is None and len(shape) > 2):
       self.skipTest("Not supported by numpy.linalg.norm")
     matrix = np.random.randn(*shape_).astype(dtype_)
diff --git a/tensorflow/python/kernel_tests/nth_element_op_test.py b/tensorflow/python/kernel_tests/nth_element_op_test.py
index 58cd46d2d520790e7e29ab8aea59922b7203ba16..1b8f02140fb5d531c7c1ab2ea6a5fc0b00e5d259 100644
--- a/tensorflow/python/kernel_tests/nth_element_op_test.py
+++ b/tensorflow/python/kernel_tests/nth_element_op_test.py
@@ -154,14 +154,14 @@ class NthElementTest(test.TestCase):
 
   def testGradients(self):
     with self.test_session(use_gpu=False) as sess:
-      inputs = array_ops.placeholder(dtypes.int32, shape=[3, 5])
+      inputs = array_ops.placeholder(dtypes.float32, shape=[3, 5])
       values = nn_ops.nth_element(inputs, 3)
       grad = sess.run(
           gradients_impl.gradients(
               values, inputs, grad_ys=[[-1., 2., 5.]]),
-          feed_dict={inputs: [[2, -1, 1000, 3, 1000],
-                              [1, 5, 2, 4, 3],
-                              [2, 2, 2, 2, 2],
+          feed_dict={inputs: [[2., -1., 1000., 3., 1000.],
+                              [1., 5., 2., 4., 3.],
+                              [2., 2., 2., 2., 2.],
                              ]})
     self.assertAllClose(grad[0], [[0, 0, -0.5, 0, -0.5],
                                   [0, 0, 0, 2, 0],
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index 2c766e364073fc8c92156f19d08753367982e7fc..361853448ce2c8477af6920257c58c1eba0fa952 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -215,13 +215,13 @@ class PadOpTest(test.TestCase):
   def testIntTypes(self):
     # TODO(touts): Figure out why the padding tests do not work on GPU
     # for int types and rank > 2.
-    for t in [np.int32, np.int64]:
+    for t in [np.int8, np.int32, np.int64]:
       self._testAll(
           np.random.randint(-100, 100, (4, 4, 3)).astype(t),
           [[1, 0], [2, 3], [0, 2]], 0)
       self._testAll(
           np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t),
-          [[0, 0], [0, 0], [0, 0], [0, 0]], -1234)
+          [[0, 0], [0, 0], [0, 0], [0, 0]], -123)
 
   def testFloatTypes(self):
     for t in [np.float32, np.float64]:
@@ -238,6 +238,29 @@ class PadOpTest(test.TestCase):
       x = np.random.rand(3, 2, 1, 1).astype(t)
       self._testAll(x + 1j * x, [[0, 0], [0, 0], [0, 0], [0, 0]], 0 + 0j)
 
+  def testString(self):
+    # Numpy does not support padding strings so we compare padding manually.
+    x = ops.convert_to_tensor([["Hello", "World"],
+                               ["Goodnight", "Moon"]])
+
+    constant = array_ops.pad(x, [[1, 0], [0, 1]], mode="CONSTANT",
+                             constant_values="PAD")
+    reflect = array_ops.pad(x, [[1, 0], [0, 1]], mode="REFLECT",
+                            constant_values="PAD")
+    symmetric = array_ops.pad(x, [[1, 0], [0, 1]], mode="SYMMETRIC",
+                              constant_values="PAD")
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual([[b"PAD", b"PAD", b"PAD"],
+                           [b"Hello", b"World", b"PAD"],
+                           [b"Goodnight", b"Moon", b"PAD"]], constant.eval())
+      self.assertAllEqual([[b"Goodnight", b"Moon", b"Goodnight"],
+                           [b"Hello", b"World", b"Hello"],
+                           [b"Goodnight", b"Moon", b"Goodnight"]],
+                          reflect.eval())
+      self.assertAllEqual([[b"Hello", b"World", b"World"],
+                           [b"Hello", b"World", b"World"],
+                           [b"Goodnight", b"Moon", b"Moon"]], symmetric.eval())
+
   def testShapeFunctionEdgeCases(self):
     # Unknown paddings shape.
     inp = constant_op.constant(0.0, shape=[4, 4, 4, 4])
@@ -313,5 +336,32 @@ class PadOpTest(test.TestCase):
       self.assertAllEqual(inp, out)
       self.assertShapeEqual(inp, tf_val)
 
+  def testCollapseAdjacentNonPaddedDimensions(self):
+    # pyformat: disable
+    paddings_values = [[[0, 0], [0, 0], [0, 0], [0, 1]],
+                       [[0, 0], [2, 3], [0, 0], [0, 0]],
+                       [[0, 0], [0, 0], [0, 0], [0, 0]]]
+    # pyformat: enable
+    for paddings_value in paddings_values:
+      for dtype in [dtypes.float32, dtypes.int32]:
+        inp = constant_op.constant(1, shape=[8, 28, 28, 3], dtype=dtype)
+        paddings = constant_op.constant(paddings_value, dtype=dtypes.int32)
+        padded = array_ops.pad(inp, paddings)
+        middle = array_ops.slice(padded, [row[0] for row in paddings_value],
+                                 [dim.value for dim in inp.shape.dims])
+        left = array_ops.slice(padded, [0, 0, 0, 0],
+                               [row[0] for row in paddings_value])
+        right = array_ops.slice(
+            padded,
+            [paddings_value[i][0] + inp.shape.dims[i].value for i in range(4)],
+            [-1, -1, -1, -1])
+        with self.test_session(use_gpu=True):
+          self.assertAllEqual(inp.eval(), middle.eval())
+          self.assertAllEqual(
+              np.zeros([row[0] for row in paddings_value]), left.eval())
+          self.assertAllEqual(
+              np.zeros([row[1] for row in paddings_value]), right.eval())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 4466beeec96509b3761e34d885276e1510c62d10..a0c372db7d0a4e76c37c01e1ce24cd8fc9123f7a 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variables
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
@@ -122,8 +123,9 @@ class PoolingTest(test.TestCase):
       if input_sizes[-1] % 4 != 0:
         tf_logging.info("Skipping test for depth %d", input_sizes[-1])
         return
-    tf_logging.info("Running %s test. %r %r %d %r %r %r", data_format, v2,
-                    input_sizes, total_size, pool_func, ksize, strides)
+    tf_logging.info("Running %s test. %r %r %d %r %r %r %s", data_format, v2,
+                    input_sizes, total_size, pool_func, ksize, strides,
+                    data_type)
     # Initializes the input tensor with array containing incrementing
     # numbers from 1, wrapping round to -127 after 127 to support int8.
     x = [((f + 128) % 255) - 127 for f in range(total_size)]
@@ -192,6 +194,8 @@ class PoolingTest(test.TestCase):
 
     self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                         data_format, dtypes.float32, expected, use_gpu, v2)
+    self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
+                        data_format, dtypes.float64, expected, use_gpu, v2)
 
     if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
@@ -405,7 +409,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 3, 3, 3],
           ksize=[1, 2, 2, 1],
           strides=[1, 2, 2, 1],
@@ -427,7 +431,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 2, 3, 3],
           ksize=[1, 2, 2, 1],
           strides=[1, 2, 2, 1],
@@ -456,7 +460,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 2, 2, 1],
           ksize=[1, 1, 2, 1],
           strides=[1, 1, 1, 1],
@@ -485,7 +489,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 4, 4, 1],
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 2, 1],
@@ -494,7 +498,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu,
           v2=v2)
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 4, 4, 1],
           ksize=[1, 2, 2, 1],
           strides=[1, 2, 1, 1],
@@ -519,7 +523,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 4, 4, 4],
           ksize=[1, 2, 2, 1],
           strides=[1, 2, 2, 1],
@@ -554,7 +558,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 8, 8, 8],
           ksize=[1, 3, 3, 1],
           strides=[1, 2, 2, 1],
@@ -565,7 +569,7 @@ class PoolingTest(test.TestCase):
 
   def _testMaxPoolEmptyInput(self, use_gpu):
     self._VerifyValues(
-        gen_nn_ops._max_pool_v2,
+        gen_nn_ops.max_pool_v2,
         input_sizes=[0, 8, 8, 8],
         ksize=[1, 3, 3, 1],
         strides=[1, 2, 2, 1],
@@ -600,7 +604,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 1, 1, 10],
           ksize=[1, 1, 1, 2],
           strides=[1, 1, 1, 2],
@@ -626,7 +630,7 @@ class PoolingTest(test.TestCase):
 
     for v2 in [True, False]:
       self._VerifyValues(
-          gen_nn_ops._max_pool_v2,
+          gen_nn_ops.max_pool_v2,
           input_sizes=[1, 2, 2, 6],
           ksize=[1, 1, 1, 3],
           strides=[1, 1, 1, 3],
@@ -648,7 +652,7 @@ class PoolingTest(test.TestCase):
 
       for v2 in [True, False]:
         self._VerifyValues(
-            gen_nn_ops._max_pool_v2,
+            gen_nn_ops.max_pool_v2,
             input_sizes=[1, 7, 7, 1],
             ksize=[1, 2, 2, 1],
             strides=[1, 3, 3, 1],
@@ -689,7 +693,7 @@ class PoolingTest(test.TestCase):
 
       for v2 in [True, False]:
         self._VerifyValues(
-            gen_nn_ops._max_pool_v2,
+            gen_nn_ops.max_pool_v2,
             input_sizes=[1, 3, 3, 1],
             ksize=[1, 1, 1, 1],
             strides=[1, 2, 2, 1],
@@ -699,7 +703,7 @@ class PoolingTest(test.TestCase):
             v2=v2)
 
         self._VerifyValues(
-            gen_nn_ops._max_pool_v2,
+            gen_nn_ops.max_pool_v2,
             input_sizes=[1, 4, 4, 1],
             ksize=[1, 1, 1, 1],
             strides=[1, 2, 2, 1],
@@ -731,7 +735,8 @@ class PoolingTest(test.TestCase):
                                             [1, 1, 1, 3], "evenly divide")
     if test.is_gpu_available():
       with self.test_session(use_gpu=True):
-        t = constant_op.constant(1.0, shape=[1, 2, 2, 4])
+        t = variables.Variable(np.ones([1, 2, 2, 4]))
+        variables.global_variables_initializer().run()
         with self.assertRaisesOpError("for CPU devices"):
           nn_ops.max_pool(
               t, ksize=[1, 1, 1, 2], strides=[1, 1, 1, 2],
@@ -764,8 +769,8 @@ class PoolingTest(test.TestCase):
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
         argmax = argmax_op.eval()
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
-        out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
-                                                       ksize, strides, padding)
+        out_op = gen_nn_ops.max_pool_grad_with_argmax(t, grad_in, argmax, ksize,
+                                                      strides, padding)
         gpu_val = out_op.eval()
         self.assertShapeEqual(gpu_val, out_op)
       with self.test_session(use_gpu=False):
@@ -773,8 +778,8 @@ class PoolingTest(test.TestCase):
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
         orig_out = out_op.eval()
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
-        out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize, strides,
-                                           padding)
+        out_op = gen_nn_ops.max_pool_grad(t, orig_out, grad_in, ksize, strides,
+                                          padding)
         cpu_val = out_op.eval()
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
@@ -793,7 +798,7 @@ class PoolingTest(test.TestCase):
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
         argmax = argmax_op.eval()
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
-        out_op = gen_nn_ops._max_pool_grad_grad_with_argmax(
+        out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
             t, grad_in, argmax, ksize, strides, padding)
         gpu_val = out_op.eval()
         self.assertShapeEqual(gpu_val, out_op)
@@ -802,8 +807,8 @@ class PoolingTest(test.TestCase):
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
         orig_out = out_op.eval()
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
-        out_op = gen_nn_ops._max_pool_grad_grad(t, orig_out, grad_in, ksize,
-                                                strides, padding)
+        out_op = gen_nn_ops.max_pool_grad_grad(t, orig_out, grad_in, ksize,
+                                               strides, padding)
         cpu_val = out_op.eval()
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
@@ -812,9 +817,6 @@ class PoolingTest(test.TestCase):
           cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def testMaxPoolingWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     with self.test_session(use_gpu=True) as sess:
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
@@ -831,9 +833,6 @@ class PoolingTest(test.TestCase):
       self.assertAllEqual(argmax.ravel(), [0, 1, 3, 5])
 
   def testMaxPoolingGradWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
@@ -842,7 +841,7 @@ class PoolingTest(test.TestCase):
       t = constant_op.constant(tensor_input, shape=[1, 2, 2, 1])
       argmax = constant_op.constant(
           tensor_argmax, shape=[1, 2, 2, 1], dtype=dtypes.int64)
-      out_op = gen_nn_ops._max_pool_grad_with_argmax(
+      out_op = gen_nn_ops.max_pool_grad_with_argmax(
           orig_in,
           t,
           argmax,
@@ -865,7 +864,7 @@ class PoolingTest(test.TestCase):
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
       argmax = constant_op.constant(
           tensor_argmax, shape=[1, 2, 2, 1], dtype=dtypes.int64)
-      out_op = gen_nn_ops._max_pool_grad_grad_with_argmax(
+      out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
           orig_in,
           t,
           argmax,
@@ -1029,7 +1028,7 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[1, 3, 3, 1],
@@ -1043,7 +1042,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding2_1_6(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 6, 6, 3],
@@ -1057,7 +1056,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding2_1_7(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 7, 7, 3],
@@ -1071,7 +1070,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding1_2(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[1, 3, 3, 1],
@@ -1085,7 +1084,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding2_2(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 2, 2, 3],
@@ -1099,7 +1098,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding1_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1113,7 +1112,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding1_2(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1127,7 +1126,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding2_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1141,7 +1140,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding2_2(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1155,7 +1154,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding3_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
           pool_func,
           input_sizes=[1, 7, 7, 1],
@@ -1199,7 +1198,7 @@ class PoolingTest(test.TestCase):
     Returns:
       A Tensor.
     """
-    pool_func = gen_nn_ops.max_pool_grad_v2 if v2 else gen_nn_ops._max_pool_grad
+    pool_func = gen_nn_ops.max_pool_grad_v2 if v2 else gen_nn_ops.max_pool_grad
     return pool_func(orig_input, orig_output, grad,
                      [1, window_rows, window_cols, 1],
                      [1, row_stride, col_stride, 1], padding)
@@ -1208,9 +1207,11 @@ class PoolingTest(test.TestCase):
                              expected_input_backprop, input_sizes, output_sizes,
                              window_rows, window_cols, row_stride, col_stride,
                              padding, use_gpu, v2):
-    pool_func = gen_nn_ops._max_pool_v2 if v2 else nn_ops.max_pool
+    pool_func = gen_nn_ops.max_pool_v2 if v2 else nn_ops.max_pool
     with self.test_session(use_gpu=use_gpu):
-      input_tensor = constant_op.constant(input_data, shape=input_sizes)
+      input_tensor = variables.Variable(
+          np.array(input_data, dtype=np.float32).reshape(input_sizes))
+      variables.global_variables_initializer().run()
       output_tensor = pool_func(input_tensor, [1, window_rows, window_cols, 1],
                                 [1, row_stride, col_stride, 1], padding)
       output_backprop_tensor = constant_op.constant(
@@ -1504,7 +1505,7 @@ class PoolingTest(test.TestCase):
     self._testMaxPoolGradDirectWithNans2_2()
 
   def _testMaxPoolGradGradValidPadding1_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[1, 3, 3, 1],
@@ -1518,7 +1519,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradValidPadding2_1_6(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[2, 6, 6, 3],
@@ -1532,7 +1533,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradValidPadding2_1_7(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[2, 7, 7, 3],
@@ -1546,7 +1547,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradValidPadding2_2(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[2, 2, 2, 3],
@@ -1560,7 +1561,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding1_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1574,7 +1575,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding2_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1588,7 +1589,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding2_2(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[2, 2, 4, 3],
@@ -1602,7 +1603,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding3_1(self, data_format, use_gpu):
-    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+    for pool_func in [gen_nn_ops.max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestSecondGradient(
           pool_func,
           input_sizes=[1, 7, 7, 1],
@@ -1644,7 +1645,7 @@ class PoolingTest(test.TestCase):
     Returns:
       A Tensor.
     """
-    return gen_nn_ops._max_pool_grad_grad(
+    return gen_nn_ops.max_pool_grad_grad(
         orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
         [1, row_stride, col_stride, 1], padding)
 
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 61fb3f12e45ea5ae3bc4f0a26c2116b54c003624..b9f44d728a1d9843df1e836594f9caa7010d8a94 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
+
 import numpy as np
 from six.moves import queue
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -33,6 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
@@ -49,6 +52,38 @@ class PyFuncTest(test.TestCase):
   """Encapsulates tests for py_func and eager_py_func."""
 
   # ----- Tests for py_func -----
+  def testRealDataTypes(self):
+    def sum_func(x, y):
+      return x + y
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.uint8, dtypes.int8, dtypes.uint16, dtypes.int16,
+                  dtypes.int32, dtypes.int64]:
+      with self.test_session():
+        x = constant_op.constant(1, dtype=dtype)
+        y = constant_op.constant(2, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
+        self.assertEqual(z, 3)
+
+  def testComplexDataTypes(self):
+    def sub_func(x, y):
+      return x - y
+    for dtype in [dtypes.complex64, dtypes.complex128]:
+      with self.test_session():
+        x = constant_op.constant(1 + 1j, dtype=dtype)
+        y = constant_op.constant(2 - 2j, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sub_func, [x, y], dtype))
+        self.assertEqual(z, -1 + 3j)
+
+  def testBoolDataTypes(self):
+    def and_func(x, y):
+      return x and y
+    dtype = dtypes.bool
+    with self.test_session():
+      x = constant_op.constant(True, dtype=dtype)
+      y = constant_op.constant(False, dtype=dtype)
+      z = self.evaluate(script_ops.py_func(and_func, [x, y], dtype))
+      self.assertEqual(z, False)
+
   def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
@@ -356,12 +391,22 @@ class PyFuncTest(test.TestCase):
 
   def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
-    def raise_exception():
+    def inner_exception():
       raise py_exp("blah")  # pylint: disable=not-callable
 
+    def raise_exception():
+      inner_exception()
+
+    expected_regexp = r": blah.*"               # Error at the top
+    expected_regexp += r"in raise_exception.*"  # Stacktrace outer
+    expected_regexp += r"in inner_exception.*"  # Stacktrace inner
+    expected_regexp += r": blah"                # Stacktrace of raise
+    def expected_error_check(exception):
+      return re.search(expected_regexp, str(exception), re.DOTALL)
+
     if eager:
-      if context.in_eager_mode():
-        with self.assertRaisesRegexp(tf_exp, "blah"):
+      if context.executing_eagerly():
+        with self.assertRaisesWithPredicateMatch(tf_exp, expected_error_check):
           f = script_ops.eager_py_func(raise_exception, [], [])
         return
       else:
@@ -370,7 +415,7 @@ class PyFuncTest(test.TestCase):
       f = script_ops.py_func(raise_exception, [], [])
 
     with self.test_session():
-      with self.assertRaisesRegexp(tf_exp, "blah"):
+      with self.assertRaisesWithPredicateMatch(tf_exp, expected_error_check):
         self.evaluate(f)
 
   def testExceptionHandling(self):
@@ -432,7 +477,7 @@ class PyFuncTest(test.TestCase):
 
       output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
       ret = self.evaluate(output)
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         self.assertEquals(len(ret), 0)
       else:
         self.assertIsNone(ret)
@@ -468,6 +513,18 @@ class PyFuncTest(test.TestCase):
 
       self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerReturningVariableRaisesError(self):
+    def return_variable():
+      variable = resource_variable_ops.ResourceVariable(0.0)
+      return variable
+
+    with self.assertRaisesRegexp(errors.UnknownError,
+                                 "Attempting to return a variable"):
+      output = script_ops.eager_py_func(
+          return_variable, inp=[], Tout=dtypes.float32)
+      self.evaluate(output)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index 88a4ddf7f29ec772282e7a8e2b59f144f1a968c2..acd7566eec8e3fffd74db33234b03a0c87427a3e 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -121,15 +121,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index a9dc7b7de000024f23b88406bf0c1c2f32ac4fac..051c7d86bf2342f15b587fc350bfbede7fae2285 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -46,7 +46,7 @@ def composed_sampler(logits, num_samples):
   logits = array_ops.expand_dims(logits, -1)
 
   # [batch size, num samples]
-  return math_ops.argmax(logits + noise, dimension=1)
+  return math_ops.argmax(logits + noise, axis=1)
 
 
 native_sampler = random_ops.multinomial
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index df37dd98ece57ae7c3835ab63b720b29fc19c975..e4b5c3832a2252aedc8820a650b022cd30b7f285 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -228,6 +228,17 @@ class RandomUniformTest(test.TestCase):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  def testUniformIntsWithInvalidShape(self):
+    for dtype in dtypes.int32, dtypes.int64:
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=[1, 2], maxval=3, dtype=dtype)
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=1, maxval=[2, 3], dtype=dtype)
+
   # Check that uniform ints actually follow a uniform distribution.
   def testUniformInts(self):
     minv = -2
diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index 7f3049b9f841c55759e78c8bc66301f325d2a7c6..fb9e5cc2a3727b70f731ca8ae946b6d5ef325833 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -160,7 +160,7 @@ class ReduceJoinTest(UnicodeTestCase):
             separator=separator)
       if not reduction_indices:
         truth = constant_op.constant(truth)
-      truth_squeezed = array_ops.squeeze(truth, squeeze_dims=reduction_indices)
+      truth_squeezed = array_ops.squeeze(truth, axis=reduction_indices)
       output_array = output.eval()
       output_keep_dims_array = output_keep_dims.eval()
       truth_array = truth.eval()
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 531478162971575739bbe37abfc57ca427ae22ae..ea78b58d88f7ff04a0a5d2272d2c94e1c97009da 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 # The maximum input rank to test.
@@ -212,7 +213,8 @@ class SumReductionTest(BaseReductionTest):
     arr = np.ones([68000], dtype=np.float16)
 
     with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
-      tf_arr = array_ops.constant(arr)
+      tf_arr = variables.Variable(arr)
+      variables.global_variables_initializer().run()
       tf_mean = math_ops.reduce_mean(tf_arr, 0, False)
       tf_out_mean = sess.run(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
@@ -887,13 +889,9 @@ class AnyReductionTest(test.TestCase):
 
 class CountNonzeroReductionTest(test.TestCase):
 
-  def _compare(self,
-               x,
-               reduction_axes,
-               keepdims,
-               use_gpu=False,
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False, zero=0,
                feed_dict=None):
-    np_ans = (x != 0).astype(np.int32)
+    np_ans = (x != zero).astype(np.int32)
     if reduction_axes is None:
       np_ans = np.sum(np_ans, keepdims=keepdims)
     else:
@@ -960,6 +958,37 @@ class CountNonzeroReductionTest(test.TestCase):
           y = math_ops.count_nonzero(x, [0])
           self.assertAllEqual(y.eval(), np.zeros(9938))
 
+  def testStringReduce(self):
+    # Test case for GitHub issue 18712
+    with self.test_session() as sess:
+      v = math_ops.count_nonzero(constant_op.constant(["test"]))
+      self.assertAllClose(sess.run(v), 1)
+
+  def testStringReduce1D(self):
+    # Create a 1D array of strings
+    x = np.asarray(["", "", "a", "", "", "b"])
+    self._compare(x, None, keepdims=False, zero=np.str(""))
+    self._compare(x, [], keepdims=False, zero=np.str(""))
+    self._compare(x, [0], keepdims=False, zero=np.str(""))
+    self._compare(x, None, keepdims=True, zero=np.str(""))
+    self._compare(x, [], keepdims=True, zero=np.str(""))
+    self._compare(x, [0], keepdims=True, zero=np.str(""))
+
+  def testStringReduce2D(self):
+    # Create a 2D array of strings
+    x = np.asarray([["", "", "a", "", "", "b"],
+                    ["", "c", "", "d", "", ""],
+                    ["e", "", "f", "", "", ""]])
+    self._compare(x, None, keepdims=False, zero=np.str(""))
+    self._compare(x, [], keepdims=False, zero=np.str(""))
+    self._compare(x, [0], keepdims=False, zero=np.str(""))
+    self._compare(x, [1], keepdims=False, zero=np.str(""))
+    self._compare(x, [0, 1], keepdims=False, zero=np.str(""))
+    self._compare(x, None, keepdims=True, zero=np.str(""))
+    self._compare(x, [], keepdims=True, zero=np.str(""))
+    self._compare(x, [0], keepdims=True, zero=np.str(""))
+    self._compare(x, [0, 1], keepdims=True, zero=np.str(""))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/regex_replace_op_test.py b/tensorflow/python/kernel_tests/regex_replace_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6739ac32245668e98d37673fe9e9fe9d55cc0c5f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/regex_replace_op_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RegexReplace op from string_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class RegexReplaceOpTest(test.TestCase):
+
+  def testRemovePrefix(self):
+    values = ["a:foo", "a:bar", "a:foo", "b:baz", "b:qux", "ca:b"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      stripped = string_ops.regex_replace(
+          input_vector, "^(a:|b:)", "", replace_global=False).eval()
+      self.assertAllEqual([b"foo", b"bar", b"foo", b"baz", b"qux", b"ca:b"],
+                          stripped)
+
+  def testRegexReplace(self):
+    values = ["aba\naba", "abcdabcde"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      stripped = string_ops.regex_replace(input_vector, "a.*a", "(\\0)").eval()
+      self.assertAllEqual([b"(aba)\n(aba)", b"(abcda)bcde"], stripped)
+
+  def testEmptyMatch(self):
+    values = ["abc", "1"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      stripped = string_ops.regex_replace(input_vector, "", "x").eval()
+      self.assertAllEqual([b"xaxbxcx", b"x1x"], stripped)
+
+  def testInvalidPattern(self):
+    values = ["abc", "1"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      invalid_pattern = "A["
+      replace = string_ops.regex_replace(input_vector, invalid_pattern, "x")
+      with self.assertRaisesOpError("Invalid pattern"):
+        replace.eval()
+
+  def testGlobal(self):
+    values = ["ababababab", "abcabcabc", ""]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      stripped = string_ops.regex_replace(input_vector, "ab", "abc",
+                                          True).eval()
+      self.assertAllEqual([b"abcabcabcabcabc", b"abccabccabcc", b""], stripped)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 6b4091ae5d3c6e469a9cd5237b978eae4c75485f..25e947f09e137b37ea129ba6015a060aa01f02e4 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -19,12 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -87,6 +89,35 @@ class ReluTest(test.TestCase):
     print("relu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  # The gradient for fp16 is inaccurate due to the low-precision.
+  # Instead of relying on compute_gradient_error, we compare the fp16 analytical
+  # gradient against their fp32 counterpart.
+  def testGradientFloat16(self):
+    with self.test_session(use_gpu=True) as sess:
+      # Randomly construct a 1D shape from [1, 40)
+      shape = random_ops.random_uniform(
+          [1], minval=1, maxval=40, dtype=dtypes.int32)
+
+      # Construct the fp32 graph and its gradient.
+      x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
+      y1 = nn_ops.relu(x, name="relu_fp32")
+      l1 = nn_ops.l2_loss(y1)
+      dx_f32 = gradients_impl.gradients(l1, x)
+
+      # Construct the fp16 graph and its gradient.
+      # It starts with the same x, in fp32. But before it reaches Relu, it is
+      # cast into fp16. So during backprop, the gradient computation is in fp16.
+      x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
+      y2 = nn_ops.relu(x2, name="relu_fp16")
+      l2 = nn_ops.l2_loss(y2)
+      dx_f16 = gradients_impl.gradients(l2, x)
+
+      # Repeat the experiment for 100 times. All tensor shapes and its tensor
+      # values are randomly generated for each run.
+      for _ in xrange(100):
+        dx_f32_v, dx_f16_v = sess.run([dx_f32, dx_f16])
+        self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
+
   def testGradientFloat64(self):
     with self.test_session():
       x = constant_op.constant(
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 8503f3e0310125bb714942b32bbbf46596f9bddb..984192258c9724dd9d73105c65177786def98e83 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -36,6 +36,9 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
+from tensorflow.python.training import saver
+from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
@@ -87,6 +90,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with context.eager_mode():
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
+      resource_variable_ops.assign_variable_op(handle, 1)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "Trying to read variable with wrong dtype. "
                                    "Expected float got int32."):
@@ -103,6 +107,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(False, name="bool_test")
       self.assertAllEqual(bool(v), False)
 
+  def testFetchHandle(self):
+    with self.test_session():
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1], name="foo")
+      self.assertGreater(len(handle.eval()), 0)
+
   def testAssignVariableDtypeMismatchEager(self):
     with context.eager_mode():
       handle = resource_variable_ops.var_handle_op(
@@ -167,18 +177,186 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32))
     self.assertEqual(read, 2)
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAdd(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterSub(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterMul(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterDiv(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterMin(self):
     with ops.device("cpu:0"):
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(resource_variable_ops.assign_variable_op(
-          handle, constant_op.constant([[1]], dtype=dtypes.int32)))
-      self.evaluate(resource_variable_ops.resource_scatter_add(
-          handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_min(handle, [0],
+                                                     constant_op.constant(
+                                                         [[3]],
+                                                         dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
       self.assertEqual(self.evaluate(read), [[3]])
 
+  def testMetagraph(self):
+    with ops.Graph().as_default():
+      with variable_scope.variable_scope("foo", use_resource=True):
+        a = variable_scope.get_variable("a", initializer=10.0)
+
+      momentum.MomentumOptimizer(
+          learning_rate=0.001, momentum=0.1).minimize(
+              a,
+              colocate_gradients_with_ops=True,
+              global_step=training_util.get_or_create_global_step())
+
+      graph = ops.get_default_graph()
+      meta_graph_def = saver.export_meta_graph(graph=graph)
+
+    with ops.Graph().as_default():
+      saver.import_meta_graph(meta_graph_def, import_scope="")
+      meta_graph_two = saver.export_meta_graph(graph=graph)
+    self.assertEqual(meta_graph_def, meta_graph_two)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterMax(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterAddScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterSubScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterMulScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterDivScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterMinScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_min(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterMaxScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
+
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
         dtype=dtypes.string, shape=[1, 1])
@@ -190,6 +368,23 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]),
                      compat.as_bytes("b"))
 
+  def testScatterUpdateStringScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.string, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(handle,
+                                                 constant_op.constant(
+                                                     [["a"]],
+                                                     dtype=dtypes.string)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_update(handle, [0],
+                                                      constant_op.constant(
+                                                          "b",
+                                                          dtype=dtypes.string)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.string)
+    self.assertEqual(
+        compat.as_bytes(self.evaluate(read)[0][0]), compat.as_bytes("b"))
+
   # TODO(alive): get this to work in Eager mode.
   def testGPU(self):
     with self.test_session(use_gpu=True):
@@ -277,6 +472,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(v.assign(2.0))
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+    # Tests for the 'read_value' argument:
+    assign_with_read = v.assign(3.0, read_value=True)
+    self.assertEqual(3.0, self.evaluate(assign_with_read))
+    assign_without_read = v.assign(4.0, read_value=False)
+    if context.executing_eagerly():
+      self.assertIsNone(assign_without_read)
+    else:
+      self.assertIsInstance(assign_without_read, ops.Operation)
+    self.evaluate(assign_without_read)
+    self.assertEqual(4.0, self.evaluate(v.value()))
+
   @test_util.run_in_graph_and_eager_modes()
   def testLoad(self):
     v = resource_variable_ops.ResourceVariable(1.0, name="var0")
@@ -329,6 +535,9 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       w = resource_variable_ops.ResourceVariable.from_proto(v.to_proto())
       self.assertEquals(2, math_ops.add(w, 1).eval())
 
+      self.assertEquals(v._handle, w._handle)
+      self.assertEquals(v._graph_element, w._graph_element)
+
   @test_util.run_in_graph_and_eager_modes()
   def testAssignAddMethod(self):
     v = resource_variable_ops.ResourceVariable(1.0, name="var0")
@@ -336,6 +545,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(v.assign_add(1.0))
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+    # Tests for the 'read_value' argument:
+    assign_with_read = v.assign_add(1.0, read_value=True)
+    self.assertEqual(3.0, self.evaluate(assign_with_read))
+    assign_without_read = v.assign_add(1.0, read_value=False)
+    if context.executing_eagerly():
+      self.assertIsNone(assign_without_read)
+    else:
+      self.assertIsInstance(assign_without_read, ops.Operation)
+    self.evaluate(assign_without_read)
+    self.assertEqual(4.0, self.evaluate(v.value()))
+
   @test_util.run_in_graph_and_eager_modes()
   def testAssignSubMethod(self):
     v = resource_variable_ops.ResourceVariable(3.0, name="var0")
@@ -343,6 +563,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(v.assign_sub(1.0))
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+    # Tests for the 'read_value' argument:
+    assign_with_read = v.assign_sub(1.0, read_value=True)
+    self.assertEqual(1.0, self.evaluate(assign_with_read))
+    assign_without_read = v.assign_sub(1.0, read_value=False)
+    if context.executing_eagerly():
+      self.assertIsNone(assign_without_read)
+    else:
+      self.assertIsInstance(assign_without_read, ops.Operation)
+    self.evaluate(assign_without_read)
+    self.assertEqual(0.0, self.evaluate(v.value()))
+
   @test_util.run_in_graph_and_eager_modes()
   def testDestroyResource(self):
     v = resource_variable_ops.ResourceVariable(3.0, name="var0")
@@ -440,7 +671,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual("(10, 20, 35)", str(v.get_shape()))
     self.assertEqual("(10, 20, 35)", str(v.value().shape))
     self.assertEqual("(3, 20, 35)", str(v.sparse_read([0, 1, 2]).shape))
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(
           "<unknown>",
           str(v.sparse_read(array_ops.placeholder(dtypes.int32)).shape))
@@ -481,7 +712,6 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual(dtypes.int32, v.dtype)
       self.assertEqual("foo/var7:0", v.name)
       self.assertAllEqual([10, 20, 35], v.shape.as_list())
-      self.assertEqual(context.get_default_context().device_name, v.device)
       self.assertTrue(isinstance(v.handle, ops.EagerTensor))
       self.assertEqual(constraint, v.constraint)
       self.assertAllEqual(init.numpy(), v.read_value().numpy())
@@ -545,12 +775,27 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_update(v, [1], [3.0])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  def testScatterAddStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="add")
+      state_ops.scatter_add(v, [1], [3])
+      self.assertAllEqual([1.0, 5.0], v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
       state_ops.scatter_update(v, [1], [3])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testScatterUpdateInvalidArgs(self):
+    v = resource_variable_ops.ResourceVariable([0, 1, 2, 3], name="update")
+    # The exact error and message differ between graph construction (where the
+    # error is realized during shape inference at graph construction time) and
+    # eager execution (where the error is realized during kernel execution).
+    with self.assertRaisesRegexp(Exception, r"shape.*2.*3"):
+      state_ops.scatter_update(v, [0, 1], [0, 1, 2])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index daa42938e6af205425d7e423ce162294b9002be4..fe5ad84c104502f0e09d3a963b406f49d6b97b71 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -111,10 +111,10 @@ class RNNTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testInvalidSequenceLengthShape(self):
     cell = Plus1RNNCell()
-    if context.in_graph_mode():
-      inputs = [array_ops.placeholder(dtypes.float32, shape=(3, 4))]
-    else:
+    if context.executing_eagerly():
       inputs = [constant_op.constant(np.ones((3, 4)))]
+    else:
+      inputs = [array_ops.placeholder(dtypes.float32, shape=(3, 4))]
     with self.assertRaisesRegexp(ValueError, "must be a vector"):
       rnn.dynamic_rnn(
           cell,
@@ -125,38 +125,30 @@ class RNNTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testBatchSizeFromInput(self):
     cell = Plus1RNNCell()
-    in_graph_mode = context.in_graph_mode()
+    in_eager_mode = context.executing_eagerly()
     # With static batch size
-    if in_graph_mode:
-      inputs = array_ops.placeholder(dtypes.float32, shape=(3, 4, 5))
-      initial_state = array_ops.placeholder(dtypes.float32, shape=(3, 5))
-    else:
+    if in_eager_mode:
       inputs = np.zeros((3, 4, 5), dtype=np.float32)
       initial_state = np.zeros((3, 5), dtype=np.float32)
+    else:
+      inputs = array_ops.placeholder(dtypes.float32, shape=(3, 4, 5))
+      initial_state = array_ops.placeholder(dtypes.float32, shape=(3, 5))
 
     # - Without initial_state
     outputs, state = rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32)
-    if in_graph_mode:
-      self.assertEqual(3, outputs.shape[0].value)
-      self.assertEqual(3, state.shape[0].value)
-    else:
-      self.assertEqual(3, outputs.shape[0])
-      self.assertEqual(3, state.shape[0])
+    self.assertEqual(3, outputs.shape[0])
+    self.assertEqual(3, state.shape[0])
 
     # - With initial_state
     outputs, state = rnn.dynamic_rnn(
         cell, inputs, initial_state=initial_state)
-    if in_graph_mode:
-      self.assertEqual(3, outputs.shape[0].value)
-      self.assertEqual(3, state.shape[0].value)
-    else:
-      self.assertEqual(3, outputs.shape[0])
-      self.assertEqual(3, state.shape[0])
+    self.assertEqual(3, outputs.shape[0])
+    self.assertEqual(3, state.shape[0])
 
     # Without static batch size
-    # Tensor shapes are fully determined in Eager mode, so only run this
-    # test in graph mode.
-    if in_graph_mode:
+    # Tensor shapes are fully determined with eager execution enabled,
+    # so only run this test for graph construction.
+    if not in_eager_mode:
       inputs = array_ops.placeholder(dtypes.float32, shape=(None, 4, 5))
       # - Without initial_state
       outputs, state = rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32)
@@ -173,56 +165,68 @@ class RNNTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testScalarStateIsAccepted(self):
     cell = ScalarStateRNNCell()
-    in_graph_mode = context.in_graph_mode()
+    in_eager_mode = context.executing_eagerly()
 
-    if in_graph_mode:
-      inputs = array_ops.placeholder(dtypes.float32, shape=(1, 4, 1))
-    else:
+    if in_eager_mode:
       inputs = np.array([[[1], [2], [3], [4]]], dtype=np.float32)
+    else:
+      inputs = array_ops.placeholder(dtypes.float32, shape=(1, 4, 1))
 
     with self.test_session() as sess:
       outputs, state = rnn.dynamic_rnn(
           cell, inputs, dtype=dtypes.float32, sequence_length=[4])
-      if in_graph_mode:
+      if not in_eager_mode:
         outputs, state = sess.run(
             [outputs, state], feed_dict={inputs: [[[1], [2], [3], [4]]]})
 
-    if in_graph_mode:
-      self.assertAllEqual(outputs, np.array([[[1], [2], [3], [4]]]))
-      self.assertEqual(state, 4)
-    else:
-      self.assertAllEqual(outputs.numpy(), np.array([[[1], [2], [3], [4]]]))
-      self.assertEqual(state.numpy(), 4)
+    self.assertAllEqual([[[1], [2], [3], [4]]], outputs)
+    self.assertAllEqual(4, state)
 
   @test_util.run_in_graph_and_eager_modes()
   def testTensorArrayStateIsAccepted(self):
     cell = TensorArrayStateRNNCell()
-    in_graph_mode = context.in_graph_mode()
+    in_eager_mode = context.executing_eagerly()
 
-    if in_graph_mode:
-      inputs = array_ops.placeholder(dtypes.float32, shape=(1, 4, 1))
-    else:
+    if in_eager_mode:
       inputs = np.array([[[1], [2], [3], [4]]], dtype=np.float32)
+    else:
+      inputs = array_ops.placeholder(dtypes.float32, shape=(1, 4, 1))
 
     with self.test_session() as sess:
       outputs, state = rnn.dynamic_rnn(
           cell, inputs, dtype=dtypes.float32, sequence_length=[4])
       state = (state[0], state[1].stack())
-      if in_graph_mode:
+      if not in_eager_mode:
         outputs, state = sess.run(
             [outputs, state], feed_dict={
                 inputs: [[[1], [2], [3], [4]]]
             })
 
-    if in_graph_mode:
-      self.assertAllEqual(outputs, np.array([[[1], [2], [3], [4]]]))
-      self.assertEqual(state[0], 4)
-      self.assertAllEqual(state[1], np.array([[[1]], [[2]], [[3]], [[4]]]))
-    else:
-      self.assertAllEqual(outputs.numpy(), np.array([[[1], [2], [3], [4]]]))
-      self.assertEqual(state[0].numpy(), 4)
-      self.assertAllEqual(state[1].numpy(),
-                          np.array([[[1]], [[2]], [[3]], [[4]]]))
+    self.assertAllEqual([[[1], [2], [3], [4]]], outputs)
+    self.assertAllEqual(4, state[0])
+    self.assertAllEqual([[[1]], [[2]], [[3]], [[4]]], state[1])
+
+  def _assert_cell_builds(self, cell_class, dtype, batch_size, in_size,
+                          out_size):
+    cell = cell_class(out_size, dtype=dtype)
+    in_shape = tensor_shape.TensorShape((batch_size, in_size))
+    cell.build(in_shape)
+    state_output = cell.zero_state(batch_size, dtype)
+    cell_output, _ = cell(array_ops.zeros(in_shape, dtype), state_output)
+    self.assertAllEqual([batch_size, out_size], cell_output.shape.as_list())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCellsBuild(self):
+    f32 = dtypes.float32
+    f64 = dtypes.float64
+    self._assert_cell_builds(rnn_cell_impl.BasicRNNCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.BasicRNNCell, f64, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.BasicLSTMCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.BasicLSTMCell, f64, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.GRUCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.GRUCell, f64, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.LSTMCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.LSTMCell, f64, 5, 7, 3)
 
 
 ######### Benchmarking RNN code
diff --git a/tensorflow/python/kernel_tests/save_restore_ops_test.py b/tensorflow/python/kernel_tests/save_restore_ops_test.py
index 1bdfa9ebd8e1a4495e67004f59adfb51bf3a6602..cb9aa1e34d6eb82efa94e60e7b56c26b181cef04 100644
--- a/tensorflow/python/kernel_tests/save_restore_ops_test.py
+++ b/tensorflow/python/kernel_tests/save_restore_ops_test.py
@@ -31,11 +31,10 @@ class ShardedFileOpsTest(test.TestCase):
     with session.Session(
         target="", config=config_pb2.ConfigProto(device_count={"CPU": 2})):
       self.assertEqual(
-          gen_io_ops._sharded_filename("foo", 4, 100).eval(),
+          gen_io_ops.sharded_filename("foo", 4, 100).eval(),
           b"foo-00004-of-00100")
       self.assertEqual(
-          gen_io_ops._sharded_filespec("foo", 100).eval(),
-          b"foo-?????-of-00100")
+          gen_io_ops.sharded_filespec("foo", 100).eval(), b"foo-?????-of-00100")
 
 
 class ShapeInferenceTest(test.TestCase):
@@ -53,7 +52,7 @@ class ShapeInferenceTest(test.TestCase):
                         [dtypes.float32, dtypes.float32])
 
   def testRestoreSlice(self):
-    op = gen_io_ops._restore_slice("model", "var", "3 4 0,1:-", dtypes.float32)
+    op = gen_io_ops.restore_slice("model", "var", "3 4 0,1:-", dtypes.float32)
     self.assertEqual([1, 4], op.get_shape())
 
 
diff --git a/tensorflow/python/kernel_tests/scalar_test.py b/tensorflow/python/kernel_tests/scalar_test.py
index e65241981eac2d42207c1de7a261f7936e588f2a..0d8fd232946883ac1d95c4c2d9744af69175ab90 100644
--- a/tensorflow/python/kernel_tests/scalar_test.py
+++ b/tensorflow/python/kernel_tests/scalar_test.py
@@ -92,11 +92,11 @@ class ScalarTest(test.TestCase):
     self.check(array_ops.reshape, (7, 1), 'sizes input must be 1-D', [7])
 
   def testShardedFilename(self):
-    self.check(gen_io_ops._sharded_filename, ('foo', 4, [100]),
+    self.check(gen_io_ops.sharded_filename, ('foo', 4, [100]),
                'must be a scalar', b'foo-00004-of-00100')
 
   def testShardedFilespec(self):
-    self.check(gen_io_ops._sharded_filespec, ('foo', [100]), 'must be a scalar',
+    self.check(gen_io_ops.sharded_filespec, ('foo', [100]), 'must be a scalar',
                b'foo-?????-of-00100')
 
   def testUnsortedSegmentSum(self):
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 9f5794951524b2689daa5fc4eefb19703262b8f0..b7477a768ab718c5a57293cc6f774298e1c9f891 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -364,6 +364,42 @@ class ScatterNdTest(test.TestCase):
     del input_  # input_ is not used in scatter_nd
     return array_ops.scatter_nd(indices, updates, shape)
 
+  def testString(self):
+    indices = constant_op.constant([[4], [3], [1], [7]],
+                                   dtype=dtypes.int32)
+    updates = constant_op.constant(["four", "three", "one", "seven"],
+                                   dtype=dtypes.string)
+    expected = np.array([b"", b"one", b"", b"three", b"four",
+                         b"", b"", b"seven"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
+
+    # Same indice is updated twice by same value.
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "b", "c"],
+                                   dtype=dtypes.string)
+    expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
+
+    # Same indice is updated twice by different value.
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "c", "d"],
+                                   dtype=dtypes.string)
+    expected = [np.array([b"", b"", b"", b"bc", b"a", b"", b"", b"d"]),
+                np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertTrue(np.array_equal(result, expected[0]) or
+                      np.array_equal(result, expected[1]))
+
   def testRank3ValidShape(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -584,6 +620,10 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest):
         shape, dtype=updates.dtype))
     return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
 
+  def testString(self):
+    # Not supported yet.
+    pass
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 7cdf11d88468cabaf32387b0a4bdda760b4af31e..c70a4ffce7be71effe3ea10faa9754ab2b3842ce 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -38,38 +38,100 @@ def _NumpyAdd(ref, indices, updates):
     ref[indx] += updates[i]
 
 
+def _NumpyAddScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] += update
+
+
 def _NumpySub(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] -= updates[i]
 
 
+def _NumpySubScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] -= update
+
+
 def _NumpyMul(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] *= updates[i]
 
 
+def _NumpyMulScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] *= update
+
+
 def _NumpyDiv(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] /= updates[i]
 
 
+def _NumpyDivScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] /= update
+
+
+def _NumpyMin(ref, indices, updates):
+  for i, indx in np.ndenumerate(indices):
+    ref[indx] = np.minimum(ref[indx], updates[i])
+
+
+def _NumpyMinScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] = np.minimum(ref[indx], update)
+
+
+def _NumpyMax(ref, indices, updates):
+  for i, indx in np.ndenumerate(indices):
+    ref[indx] = np.maximum(ref[indx], updates[i])
+
+
+def _NumpyMaxScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] = np.maximum(ref[indx], update)
+
+
 def _NumpyUpdate(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] = updates[i]
 
 
+def _NumpyUpdateScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] = update
+
+
 _TF_OPS_TO_NUMPY = {
     state_ops.scatter_update: _NumpyUpdate,
     state_ops.scatter_add: _NumpyAdd,
     state_ops.scatter_sub: _NumpySub,
     state_ops.scatter_mul: _NumpyMul,
     state_ops.scatter_div: _NumpyDiv,
+    state_ops.scatter_min: _NumpyMin,
+    state_ops.scatter_max: _NumpyMax,
+}
+
+_TF_OPS_TO_NUMPY_SCALAR = {
+    state_ops.scatter_update: _NumpyUpdateScalar,
+    state_ops.scatter_add: _NumpyAddScalar,
+    state_ops.scatter_sub: _NumpySubScalar,
+    state_ops.scatter_mul: _NumpyMulScalar,
+    state_ops.scatter_div: _NumpyDivScalar,
+    state_ops.scatter_min: _NumpyMinScalar,
+    state_ops.scatter_max: _NumpyMaxScalar,
 }
 
 
 class ScatterTest(test.TestCase):
 
-  def _VariableRankTest(self, tf_scatter, vtype, itype, repeat_indices=False):
+  def _VariableRankTest(self,
+                        tf_scatter,
+                        vtype,
+                        itype,
+                        repeat_indices=False,
+                        updates_are_scalar=False):
     np.random.seed(8)
     with self.test_session(use_gpu=True):
       for indices_shape in (), (2,), (3, 7), (3, 4, 7):
@@ -89,8 +151,11 @@ class ScatterTest(test.TestCase):
                                   indices[np.random.randint(size // 2)])
             np.random.shuffle(indices)
           indices = indices.reshape(indices_shape)
-          updates = _AsType(
-              np.random.randn(*(indices_shape + extra_shape)), vtype)
+          if updates_are_scalar:
+            updates = _AsType(np.random.randn(), vtype)
+          else:
+            updates = _AsType(
+                np.random.randn(*(indices_shape + extra_shape)), vtype)
 
           # Clips small values to avoid division by zero.
           def clip_small_values(x):
@@ -101,7 +166,10 @@ class ScatterTest(test.TestCase):
 
           # Scatter via numpy
           new = old.copy()
-          np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
+          if updates_are_scalar:
+            np_scatter = _TF_OPS_TO_NUMPY_SCALAR[tf_scatter]
+          else:
+            np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
           np_scatter(new, indices, updates)
           # Scatter via tensorflow
           ref = variables.Variable(old)
@@ -109,25 +177,35 @@ class ScatterTest(test.TestCase):
           tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
 
-  def _VariableRankTests(self, tf_scatter, repeat_indices=False):
+  def _VariableRankTests(self,
+                         tf_scatter,
+                         repeat_indices=False,
+                         updates_are_scalar=False):
     for vtype in (np.float32, np.float64):
       for itype in (np.int32, np.int64):
-        self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices)
+        self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
+                               updates_are_scalar)
 
   def testVariableRankUpdate(self):
-    self._VariableRankTests(state_ops.scatter_update)
+    self._VariableRankTests(state_ops.scatter_update, False)
 
   def testVariableRankAdd(self):
-    self._VariableRankTests(state_ops.scatter_add)
+    self._VariableRankTests(state_ops.scatter_add, False)
 
   def testVariableRankSub(self):
-    self._VariableRankTests(state_ops.scatter_sub)
+    self._VariableRankTests(state_ops.scatter_sub, False)
 
   def testVariableRankMul(self):
-    self._VariableRankTests(state_ops.scatter_mul)
+    self._VariableRankTests(state_ops.scatter_mul, False)
 
   def testVariableRankDiv(self):
-    self._VariableRankTests(state_ops.scatter_div)
+    self._VariableRankTests(state_ops.scatter_div, False)
+
+  def testVariableRankMin(self):
+    self._VariableRankTests(state_ops.scatter_min, False)
+
+  def testVariableRankMax(self):
+    self._VariableRankTests(state_ops.scatter_max, False)
 
   def testRepeatIndicesAdd(self):
     self._VariableRankTests(state_ops.scatter_add, True)
@@ -141,6 +219,51 @@ class ScatterTest(test.TestCase):
   def testRepeatIndicesDiv(self):
     self._VariableRankTests(state_ops.scatter_div, True)
 
+  def testRepeatIndicesMin(self):
+    self._VariableRankTests(state_ops.scatter_min, True)
+
+  def testRepeatIndicesMax(self):
+    self._VariableRankTests(state_ops.scatter_max, True)
+
+  def testVariableRankUpdateScalar(self):
+    self._VariableRankTests(state_ops.scatter_update, False, True)
+
+  def testVariableRankAddScalar(self):
+    self._VariableRankTests(state_ops.scatter_add, False, True)
+
+  def testVariableRankSubScalar(self):
+    self._VariableRankTests(state_ops.scatter_sub, False, True)
+
+  def testVariableRankMulScalar(self):
+    self._VariableRankTests(state_ops.scatter_mul, False, True)
+
+  def testVariableRankDivScalar(self):
+    self._VariableRankTests(state_ops.scatter_div, False, True)
+
+  def testVariableRankMinScalar(self):
+    self._VariableRankTests(state_ops.scatter_min, False, True)
+
+  def testVariableRankMaxScalar(self):
+    self._VariableRankTests(state_ops.scatter_max, False, True)
+
+  def testRepeatIndicesAddScalar(self):
+    self._VariableRankTests(state_ops.scatter_add, True, True)
+
+  def testRepeatIndicesSubScalar(self):
+    self._VariableRankTests(state_ops.scatter_sub, True, True)
+
+  def testRepeatIndicesMulScalar(self):
+    self._VariableRankTests(state_ops.scatter_mul, True, True)
+
+  def testRepeatIndicesDivScalar(self):
+    self._VariableRankTests(state_ops.scatter_div, True, True)
+
+  def testRepeatIndicesMinScalar(self):
+    self._VariableRankTests(state_ops.scatter_min, True, True)
+
+  def testRepeatIndicesMaxScalar(self):
+    self._VariableRankTests(state_ops.scatter_max, True, True)
+
   def testBooleanScatterUpdate(self):
     if not test.is_gpu_available():
       with self.test_session(use_gpu=False) as session:
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index bbce6b7d47325b8209815230426672ec6894147f..3bca5fadc42693f514911c7ffa8f078de8ef9bcd 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -542,6 +542,25 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
 
+  def testWithEmptySegments(self):
+    tf_x = constant_op.constant([], shape=[0, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments
+    ]
+    segment_indices = []
+    tf_indices = []
+    num_segments = 5
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        tf_ans = s.eval()
+        self.assertAllClose(np.zeros([5, 4]), tf_ans)
+
   def testSegmentIdsGreaterThanZero(self):
     tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [(np.add, None, math_ops.sparse_segment_sum), (
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 4de5f4e4dbd38043557c54ede90fa47e43a1e26d..d2647088c5c2afda032482fb5cfd983cedb49a8f 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -71,6 +71,23 @@ class SelfAdjointEigTest(test.TestCase):
       self.assertAllEqual(val[4], val[5])
       self.assertAllEqual(val[1], val[3])
 
+  def testMatrixThatFailsWhenFlushingDenormsToZero(self):
+    # Test a 32x32 matrix which is known to fail if denorm floats are flushed to
+    # zero.
+    matrix = np.genfromtxt(
+        test.test_src_dir_path(
+            "python/kernel_tests/testdata/"
+            "self_adjoint_eig_fail_if_denorms_flushed.txt")).astype(np.float32)
+    self.assertEqual(matrix.shape, (32, 32))
+    matrix_tensor = constant_op.constant(matrix)
+    with self.test_session(use_gpu=True) as sess:
+      (e, v) = sess.run(linalg_ops.self_adjoint_eig(matrix_tensor))
+      self.assertEqual(e.size, 32)
+      self.assertAllClose(
+          np.matmul(v, v.transpose()), np.eye(32, dtype=np.float32), atol=2e-3)
+      self.assertAllClose(matrix,
+                          np.matmul(np.matmul(v, np.diag(e)), v.transpose()))
+
 
 def SortEigenDecomposition(e, v):
   if v.ndim < 2:
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 051a25080b826de05ee3e24a82fbcd1f47995544..5fc9bef21816e3a12f0d274bab1fc82a83546422 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -283,7 +283,7 @@ class SliceTest(test.TestCase):
     # unintended behavior is prevented.
     c = constant_op.constant(5.0)
     with self.assertRaisesWithPredicateMatch(
-        TypeError, lambda e: "`Tensor` objects are not iterable" in str(e)):
+        TypeError, lambda e: "Tensor objects are not iterable" in str(e)):
       for _ in c:
         pass
 
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 4d89831aae9a5e95210a8defb180e09c9d38f4d6..dc4d4dbeabf3c53be6e30cb357f4a1d2d7e69064 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -18,15 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import unittest
 import numpy as np
 
-from tensorflow.python.framework import constant_op
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 
 
 @test_util.with_c_api
@@ -37,14 +39,21 @@ class SoftmaxTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
+    is_fp16 = features.dtype == np.float16
+    if is_fp16:
+      # Do the compute in fp32 and cast the input back to fp32.
+      features = features.astype(np.float32)
     e = np.exp(features - np.reshape(
         np.amax(
             features, axis=dim), one_only_on_dim))
     softmax = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
     if log:
-      return np.log(softmax)
+      res = np.log(softmax)
     else:
-      return softmax
+      res = softmax
+    if is_fp16:
+      res = res.astype(np.float16)
+    return res
 
   def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
     # A previous version of the code checked the op name rather than the op type
@@ -54,9 +63,9 @@ class SoftmaxTest(test.TestCase):
     np_softmax = self._npSoftmax(np_features, dim=dim, log=log)
     with self.test_session(use_gpu=use_gpu):
       if log:
-        tf_softmax = nn_ops.log_softmax(np_features, dim=dim, name=name)
+        tf_softmax = nn_ops.log_softmax(np_features, axis=dim, name=name)
       else:
-        tf_softmax = nn_ops.softmax(np_features, dim=dim, name=name)
+        tf_softmax = nn_ops.softmax(np_features, axis=dim, name=name)
       out = tf_softmax.eval()
     self.assertAllCloseAccordingToType(np_softmax, out)
     self.assertShapeEqual(np_softmax, tf_softmax)
@@ -118,10 +127,32 @@ class SoftmaxTest(test.TestCase):
     self._testAll(
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float32))
 
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testFloatGPU(self):
+    if test.is_gpu_available(cuda_only=True):
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      for row, col in zip(rows, cols):
+        logging.info("Testing softmax float dtype in shape [%d, %d]", row, col)
+        data = np.random.rand(row, col)
+        self._testAll(data.astype(np.float32))
+
   def testHalf(self):
     self._testAll(
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float16))
 
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testHalfGPU(self):
+    if test.is_gpu_available(cuda_only=True):
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      for row, col in zip(rows, cols):
+        logging.info("Testing softmax half dtype in shape [%d, %d]", row, col)
+        data = np.random.rand(row, col)
+        self._testAll(data.astype(np.float16))
+
   def testDouble(self):
     self._testSoftmax(
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float64))
@@ -166,11 +197,11 @@ class SoftmaxTest(test.TestCase):
 
   def testEmptyInput(self):
     with self.test_session():
-      x = constant_op.constant([[]], shape=[0, 3])
+      x = array_ops.placeholder(dtypes.float32, shape=[0, 3])
       self.assertEqual(0, array_ops.size(x).eval())
       # reshape would raise if logits is empty
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax(x, dim=0).eval()
+        nn_ops.softmax(x, axis=0).eval()
 
   def testDimTooLarge(self):
     with self.test_session():
@@ -178,7 +209,7 @@ class SoftmaxTest(test.TestCase):
       # inference error.
       dim = array_ops.placeholder_with_default(100, shape=[])
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval()
+        nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval()
 
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
index b943dfa4e5f2a06eddcb3af03764e5e046b715f4..2a9232b6aecb66328f10a62f2251246c4fcec6e6 100644
--- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
@@ -86,11 +86,11 @@ class CppOpImpl(object):
 
   @staticmethod
   def space_to_batch(*args, **kwargs):
-    return gen_array_ops._space_to_batch(*args, **kwargs)
+    return gen_array_ops.space_to_batch(*args, **kwargs)
 
   @staticmethod
   def batch_to_space(*args, **kwargs):
-    return gen_array_ops._batch_to_space(*args, **kwargs)
+    return gen_array_ops.batch_to_space(*args, **kwargs)
 
 
 class SpaceToBatchTest(test.TestCase, PythonOpImpl):
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index 3c98a685e07a1f2d55c3c1035a99ffaa593d35b3..cd90d16aacb4325ed426b0466266d9616b574401 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -34,8 +34,8 @@ from tensorflow.python.platform import tf_logging
 
 class SpaceToDepthTest(test.TestCase):
 
-  def _testOne(self, inputs, block_size, outputs):
-    input_nhwc = math_ops.to_float(inputs)
+  def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
+    input_nhwc = math_ops.cast(inputs, dtype)
     with self.test_session(use_gpu=False):
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
@@ -58,6 +58,12 @@ class SpaceToDepthTest(test.TestCase):
     x_out = [[[[1, 2, 3, 4]]]]
     self._testOne(x_np, block_size, x_out)
 
+  def testBasicFloat16(self):
+    x_np = [[[[1], [2]], [[3], [4]]]]
+    block_size = 2
+    x_out = [[[[1, 2, 3, 4]]]]
+    self._testOne(x_np, block_size, x_out, dtype=dtypes.float16)
+
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
   def testLargerInput2x2(self):
@@ -126,6 +132,24 @@ class SpaceToDepthTest(test.TestCase):
     x_out = [batch_output_elt(i) for i in range(batch_size)]
     self._testOne(x_np, block_size, x_out)
 
+  def testBatchSize0(self):
+    block_size = 2
+    batch_size = 0
+    input_nhwc = array_ops.ones([batch_size, 4, 6, 3])
+    x_out = array_ops.ones([batch_size, 2, 3, 12])
+
+    with self.test_session(use_gpu=False):
+      # test NHWC (default) on CPU
+      x_tf = array_ops.space_to_depth(input_nhwc, block_size)
+      self.assertAllEqual(x_tf.shape, x_out.shape)
+      x_tf.eval()
+    if test.is_gpu_available():
+      with self.test_session(use_gpu=True):
+        # test NHWC (default) on GPU
+        x_tf = array_ops.space_to_depth(input_nhwc, block_size)
+        self.assertAllEqual(x_tf.shape, x_out.shape)
+        x_tf.eval()
+
   # Tests for different width and height.
   def testNonSquare(self):
     x_np = [[[[1, 10], [2, 20]], [[3, 30], [4, 40]], [[5, 50], [6, 60]],
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
index cd5b711a0ed18aabff543aa4b6ecb1a885618caf..a841fe83a7f585a69ef33c437570359797484a4a 100644
--- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
@@ -64,7 +64,7 @@ class SparseXentTest(test.TestCase):
   def _testXent(self, np_features, np_labels):
     np_loss, np_backprop = self._npXent(np_features, np_labels)
     with self.test_session(use_gpu=True) as sess:
-      loss, backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+      loss, backprop = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
           np_features, np_labels)
       tf_loss, tf_backprop = sess.run([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
@@ -73,7 +73,7 @@ class SparseXentTest(test.TestCase):
   def testSingleClass(self):
     for label_dtype in np.int32, np.int64:
       with self.test_session(use_gpu=True) as sess:
-        loss, backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+        loss, backprop = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
             np.array([[1.], [-1.], [0.]]).astype(np.float32),
             np.array([0, 0, 0]).astype(label_dtype))
         tf_loss, tf_backprop = sess.run([loss, backprop])
@@ -87,8 +87,9 @@ class SparseXentTest(test.TestCase):
 
     if test.is_built_with_cuda() and test.is_gpu_available():
       with self.test_session(use_gpu=True) as sess:
-        loss, backprop = (gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
-            features, labels))
+        loss, backprop = (
+            gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
+                features, labels))
         tf_loss, tf_backprop = sess.run([loss, backprop])
         self.assertAllClose(
             [[np.nan] * 4, [0.25, 0.25, 0.25, -0.75],
@@ -100,8 +101,8 @@ class SparseXentTest(test.TestCase):
             [np.nan, 1.3862, 3.4420, np.nan], tf_loss, rtol=1e-3, atol=1e-3)
 
     with self.test_session(use_gpu=False) as sess:
-      loss, backprop = (gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
-          features, labels))
+      loss, backprop = (
+          gen_nn_ops.sparse_softmax_cross_entropy_with_logits(features, labels))
       with self.assertRaisesOpError("Received a label value of"):
         sess.run([loss, backprop])
 
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 6171793b148f8d8f195b9548a13df89d29c5e96e..8cfee3eb933afcea7a58d5632948b87b0c4c10df 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -336,6 +336,20 @@ class SplitOpTest(test.TestCase):
     for s in splits:
       self.assertEqual(None, s.get_shape().ndims)
 
+  def testNonexistentDimTensor(self):
+    x = array_ops.placeholder(dtypes.int32)
+    values = np.zeros([5, 30])
+    splits = array_ops.placeholder(dtypes.int32)
+    with self.assertRaisesRegexp(ValueError, "Cannot infer"):
+      y = array_ops.split(values, splits, axis=x)
+
+    splits = array_ops.placeholder(dtypes.int32, [3])
+    y = array_ops.split(values, splits, axis=x)
+    with self.test_session(use_gpu=True) as sess:
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "must have exactly one element"):
+        sess.run(y, {x: np.array([], dtype=np.int32), splits: [4, 11, 15]})
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py
index aa409336f5c50178e4d0ca946190119fb0e4188e..afd2eaffab992bca4b3ae7b4f65e0370f325b548 100644
--- a/tensorflow/python/kernel_tests/stack_ops_test.py
+++ b/tensorflow/python/kernel_tests/stack_ops_test.py
@@ -34,11 +34,11 @@ class StackOpTest(test.TestCase):
 
   def _testStackPushPop(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
-      h = gen_data_flow_ops._stack_v2(
+      h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push_v2(h, [[4.0, 5.0]])
+      c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
       self.assertAllClose([[4.0, 5.0]], c1.eval())
 
   def testStackPushPop(self):
@@ -49,11 +49,11 @@ class StackOpTest(test.TestCase):
     with self.test_session(use_gpu=use_gpu):
       a = np.arange(2000)
       x = constant_op.constant(a, dtype=dtypes.float32)
-      h = gen_data_flow_ops._stack_v2(
+      h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push_v2(h, x, swap_memory=True)
+      c = gen_data_flow_ops.stack_push_v2(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
       self.assertAllClose(a, c1.eval())
 
   def testStackPushPopSwap(self):
@@ -63,7 +63,7 @@ class StackOpTest(test.TestCase):
   def _testStackWhileSwap(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
-      h = gen_data_flow_ops._stack_v2(
+      h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
 
       def c(x):
@@ -72,7 +72,7 @@ class StackOpTest(test.TestCase):
       def b(x):
         with ops.control_dependencies([x]):
           a = constant_op.constant(np.ones(2000), dtype=dtypes.float32)
-          v = gen_data_flow_ops._stack_push_v2(h, a, swap_memory=True)
+          v = gen_data_flow_ops.stack_push_v2(h, a, swap_memory=True)
         with ops.control_dependencies([v]):
           return math_ops.add(x, 1)
 
@@ -86,7 +86,7 @@ class StackOpTest(test.TestCase):
 
       def b1(x, y):
         nx = math_ops.subtract(x, 1)
-        ny = y + gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+        ny = y + gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
         return [nx, ny]
 
       _, ry = control_flow_ops.while_loop(
@@ -99,16 +99,16 @@ class StackOpTest(test.TestCase):
 
   def _testMultiStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
-      h1 = gen_data_flow_ops._stack_v2(
+      h1 = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_push_v2(h1, 4.0)
+      c1 = gen_data_flow_ops.stack_push_v2(h1, 4.0)
       with ops.control_dependencies([c1]):
-        c1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
-      h2 = gen_data_flow_ops._stack_v2(
+        c1 = gen_data_flow_ops.stack_pop_v2(h1, dtypes.float32)
+      h2 = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="bar")
-      c2 = gen_data_flow_ops._stack_push_v2(h2, 5.0)
+      c2 = gen_data_flow_ops.stack_push_v2(h2, 5.0)
       with ops.control_dependencies([c2]):
-        c2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+        c2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
       r = c1 + c2
       self.assertAllClose(9.0, r.eval())
 
@@ -119,17 +119,17 @@ class StackOpTest(test.TestCase):
   def _testSameNameStacks(self, use_gpu):
     """Different stacks with the same name do not interfere."""
     with self.test_session(use_gpu=use_gpu) as sess:
-      h1 = gen_data_flow_ops._stack_v2(
+      h1 = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
-      h2 = gen_data_flow_ops._stack_v2(
+      h2 = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
 
-      c1 = gen_data_flow_ops._stack_push_v2(h1, 4.0)
+      c1 = gen_data_flow_ops.stack_push_v2(h1, 4.0)
       with ops.control_dependencies([c1]):
-        c2 = gen_data_flow_ops._stack_push_v2(h2, 5.0)
+        c2 = gen_data_flow_ops.stack_push_v2(h2, 5.0)
       with ops.control_dependencies([c2]):
-        pop1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
-        pop2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+        pop1 = gen_data_flow_ops.stack_pop_v2(h1, dtypes.float32)
+        pop2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
 
       out1, out2 = sess.run([pop1, pop2])
       self.assertAllClose(out1, 4.0)
@@ -141,9 +141,9 @@ class StackOpTest(test.TestCase):
 
   def _testCloseStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu) as sess:
-      h = gen_data_flow_ops._stack_v2(
+      h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_close_v2(h)
+      c1 = gen_data_flow_ops.stack_close_v2(h)
       sess.run(c1)
 
   def testCloseStack(self):
@@ -152,11 +152,11 @@ class StackOpTest(test.TestCase):
 
   def _testPushCloseStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu) as sess:
-      h = gen_data_flow_ops._stack_v2(
+      h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push_v2(h, [[4.0, 5.0]])
+      c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_close_v2(h)
+        c1 = gen_data_flow_ops.stack_close_v2(h)
       sess.run(c1)
 
   def testPushCloseStack(self):
@@ -170,9 +170,9 @@ class StackOpRefTest(test.TestCase):
   def _testStackPushPop(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push(h, [[4.0, 5.0]])
+      c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
       self.assertAllClose([[4.0, 5.0]], c1.eval())
 
   def testStackPushPop(self):
@@ -184,9 +184,9 @@ class StackOpRefTest(test.TestCase):
       a = np.arange(2000)
       x = constant_op.constant(a, dtype=dtypes.float32)
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push(h, x, swap_memory=True)
+      c = gen_data_flow_ops.stack_push(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
       self.assertAllClose(a, c1.eval())
 
   def testStackPushPopSwap(self):
@@ -196,13 +196,13 @@ class StackOpRefTest(test.TestCase):
   def _testMultiStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       h1 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_push(h1, 4.0)
+      c1 = gen_data_flow_ops.stack_push(h1, 4.0)
       with ops.control_dependencies([c1]):
-        c1 = gen_data_flow_ops._stack_pop(h1, dtypes.float32)
+        c1 = gen_data_flow_ops.stack_pop(h1, dtypes.float32)
       h2 = gen_data_flow_ops._stack(dtypes.float32, stack_name="bar")
-      c2 = gen_data_flow_ops._stack_push(h2, 5.0)
+      c2 = gen_data_flow_ops.stack_push(h2, 5.0)
       with ops.control_dependencies([c2]):
-        c2 = gen_data_flow_ops._stack_pop(h2, dtypes.float32)
+        c2 = gen_data_flow_ops.stack_pop(h2, dtypes.float32)
       r = c1 + c2
       self.assertAllClose(9.0, r.eval())
 
@@ -217,7 +217,7 @@ class StackOpRefTest(test.TestCase):
       def b(x):
         with ops.control_dependencies([x]):
           a = constant_op.constant(np.ones(2000), dtype=dtypes.float32)
-          v = gen_data_flow_ops._stack_push(h, a, swap_memory=True)
+          v = gen_data_flow_ops.stack_push(h, a, swap_memory=True)
         with ops.control_dependencies([v]):
           return math_ops.add(x, 1)
 
@@ -231,7 +231,7 @@ class StackOpRefTest(test.TestCase):
 
       def b1(x, y):
         nx = math_ops.subtract(x, 1)
-        ny = y + gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        ny = y + gen_data_flow_ops.stack_pop(h, dtypes.float32)
         return [nx, ny]
 
       _, ry = control_flow_ops.while_loop(
@@ -249,9 +249,9 @@ class StackOpRefTest(test.TestCase):
   def _testSameNameStacks(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       h1 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_push(h1, 4.0)
+      c1 = gen_data_flow_ops.stack_push(h1, 4.0)
       h2 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c2 = gen_data_flow_ops._stack_push(h2, 5.0)
+      c2 = gen_data_flow_ops.stack_push(h2, 5.0)
       _ = c1 + c2
       self.assertNotEqual(h1.eval()[1], h2.eval()[1])
 
@@ -262,7 +262,7 @@ class StackOpRefTest(test.TestCase):
   def _testCloseStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c1 = gen_data_flow_ops._stack_close(h)
+      c1 = gen_data_flow_ops.stack_close(h)
       sess.run(c1)
 
   def testCloseStack(self):
@@ -272,9 +272,9 @@ class StackOpRefTest(test.TestCase):
   def _testPushCloseStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push(h, [[4.0, 5.0]])
+      c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_close(h)
+        c1 = gen_data_flow_ops.stack_close(h)
       sess.run(c1)
 
   def testPushCloseStack(self):
diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..30fd477ff42cf1e6c96b80936226df7dc15997d4
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_strip_op_test.py
@@ -0,0 +1,56 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_strip_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringStripOpTest(test.TestCase):
+  """ Test cases for tf.string_strip."""
+
+  def test_string_strip(self):
+    strings = ["pigs on the wing", "animals"]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
+
+  def test_string_strip_2d(self):
+    strings = [["pigs on the wing", "animals"],
+               [" hello ", "\n\tworld \r \n"]]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
+                                   [b"hello", b"world"]])
+
+  def test_string_strip_with_empty_strings(self):
+    strings = [" hello ", "", "world ", " \t \r \n "]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [b"hello", b"", b"world", b""])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index a519b69b22cf51ab4f4173b215c21a71d83e9f99..1b935d5286729e9e802c56e90e2ae7ab72a6e080 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -356,6 +356,10 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1_1/nested/dummy:0", v5.name)
     self.assertEqual("s1_1/nested_1/dummy:0", v6.name)
 
+    self.assertEqual(2, len(tmpl1._checkpoint_dependencies))
+    self.assertEqual("nested", tmpl1._checkpoint_dependencies[0].name)
+    self.assertEqual("nested_1", tmpl1._checkpoint_dependencies[1].name)
+
   @test_util.run_in_graph_and_eager_modes()
   def test_nested_templates_with_defun(self):
 
@@ -558,7 +562,7 @@ class TemplateTest(test.TestCase):
     outputs_b, _ = linear1(inputs)
     self.assertEquals("foo", linear1.variable_scope.name)
     self.assertEquals("foo/w:0", w1.name)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEquals("foo/add:0", outputs_a.name,
                         "First application of template should get "
                         "same name scope as variables.")
@@ -573,7 +577,7 @@ class TemplateTest(test.TestCase):
                       "New template gets a freshly uniquified variable scope "
                       "because 'foo' is already taken.")
     self.assertEquals("foo_1/w:0", w2.name)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEquals("foo_1_1/add:0", outputs_c.name,
                         "First application of template would get "
                         "same name scope as variables, but 'foo_1' is already "
@@ -588,7 +592,7 @@ class TemplateTest(test.TestCase):
     with variable_scope.variable_scope("foo"):
       # Create two templates with the same name, ensure scopes are made unique.
       ta = template.make_template("bar", variable_scoped_function, True)
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         tb = template.make_template("s", function_with_side_create,
                                     trainable=False)
       else:
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index aad2443eea7ad87faf481973e91ca3df32ccfb44..918bbd38edfd18bf2b65aa84ef7279e1448badd3 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -399,28 +399,14 @@ class TensorArrayTest(test.TestCase):
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
     with self.test_session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
-      in_graph_mode = context.in_graph_mode()
       # Test writing the wrong datatype
-      if in_graph_mode:
-        with self.assertRaisesOpError(
-            "TensorArray dtype is float but Op is trying to write "
-            "dtype string"):
-          self.evaluate(ta.write(0, "wrong_type_scalar").flow)
-      else:
-        with self.assertRaisesOpError(
-            "TensorArray dtype is float32 but Op is trying to write "
-            "dtype string"):
-          self.evaluate(ta.write(0, "wrong_type_scalar").flow)
+      with self.assertRaisesOpError(
+          "TensorArray dtype is (float|float32) but Op is trying to write "
+          "dtype string"):
+        self.evaluate(ta.write(0, "wrong_type_scalar").flow)
 
-      if context.in_graph_mode():
-        with self.assertRaisesOpError(
-            "Tried to write to index -1 but array is not "
-            "resizeable and size is: 3"):
-          self.evaluate(ta.write(-1, 3.0).flow)
-      else:
-        with self.assertRaisesOpError(
-            r"Writing to negative indices \(index -1\) is not allowed."):
-          self.evaluate(ta.write(-1, 3.0).flow)
+      with self.assertRaisesOpError("index -1"):
+        self.evaluate(ta.write(-1, 3.0).flow)
 
       # Test reading from too large an index
       with self.assertRaisesOpError(
@@ -435,23 +421,17 @@ class TensorArrayTest(test.TestCase):
 
       w0 = ta.write(0, [[4.0, 5.0]])
 
-      # Test reading wrong datatype, which is only possible in graph mode
-      if context.in_graph_mode():
-        r0_bad = gen_data_flow_ops._tensor_array_read_v3(
+      # Test reading wrong datatype (only possible when constructing graphs).
+      if not context.executing_eagerly():
+        r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
         with self.assertRaisesOpError(
             "TensorArray dtype is float but Op requested dtype double."):
           r0_bad.eval()
 
       # Test reading from a negative index, which is not allowed
-      if context.in_graph_mode():
-        with self.assertRaisesOpError(
-            r"Tried to read from index -1 but array size is: 3"):
-          self.evaluate(ta.read(-1))
-      else:
-        with self.assertRaisesOpError(
-            r"Reading from negative indices \(index -1\) is not allowed."):
-          self.evaluate(ta.read(-1))
+      with self.assertRaisesOpError("index -1"):
+        self.evaluate(ta.read(-1))
 
       # Test reading from too large an index
       with self.assertRaisesOpError(
@@ -467,10 +447,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(
           "Could not write to TensorArray index 2 because "
           "it has already been written to."):
-        if context.in_graph_mode():
-          self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
-        else:
-          self.evaluate(ta.write(2, 3.0).write(2, 3.0))
+        self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
 
   @test_util.run_in_graph_and_eager_modes()
   def testTensorArrayConcatIncompatibleShapesFails(self):
@@ -499,58 +476,40 @@ class TensorArrayTest(test.TestCase):
       w2 = w1.write(1, [4.0])
       w3 = w2.write(2, [[3.0]])
 
-      # The eager-mode implementation just passes up array_op.concat's error
-      # message.
-      if context.in_graph_mode():
-        with self.assertRaisesOpError(
-            r"TensorArray has inconsistent shapes.  Index 0 has "
-            r"\(excepting dimension 0\) shape: \[\] but index 2 has "
-            r"\(excepting dimension 0\) shape: \[1\]"):
-          self.evaluate(w3.concat())
-      else:
-        with self.assertRaisesOpError(
-            r".*Ranks of all input tensors should match: shape\[0\] "
-            r"= \[1\] vs\. shape\[2\] = \[1,1\].*"):
-          self.evaluate(w3.concat())
+      # The exact error messages differ between eager execution and graph
+      # construction as the former bubbles up the error from array_op.concat.
+      with self.assertRaisesOpError("shape"):
+        self.evaluate(w3.concat())
 
   @test_util.run_in_graph_and_eager_modes()
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.test_session(use_gpu=True):
-      in_graph_mode = context.in_graph_mode()
+      in_eager_mode = context.executing_eagerly()
       ta = _make_ta(3, "foo")
       with self.assertRaisesOpError(
           r"Expected lengths to be a vector, received shape: \[\]"):
-        if in_graph_mode:
+        if in_eager_mode:
+          self.evaluate(ta.split([1.0, 2.0, 3.0], 1))
+        else:
           lengths = array_ops.placeholder(dtypes.int64)
           ta.split([1.0, 2.0, 3.0], lengths).flow.eval(feed_dict={lengths: 1})
-        else:
-          self.evaluate(ta.split([1.0, 2.0, 3.0], 1))
 
       with self.assertRaisesOpError(
           r"Expected sum of lengths to be equal to values.shape\[0\], "
           r"but sum of lengths is 1 and value's shape is: \[3\]"):
-        if in_graph_mode:
-          self.evaluate(ta.split([1.0, 2.0, 3.0], [1]).flow)
-        else:
-          self.evaluate(ta.split([1.0, 2.0, 3.0], [1]))
+        self.evaluate(ta.split([1.0, 2.0, 3.0], [1]).flow)
 
       ta = _make_ta(1, "baz")
       with self.assertRaisesOpError(
           r"Expected value to be at least a vector, but received shape: \[\]"):
-        if in_graph_mode:
-          self.evaluate(ta.split(1.0, [1]).flow)
-        else:
-          self.evaluate(ta.split(1.0, [1]))
+        self.evaluate(ta.split(1.0, [1]).flow)
 
       ta = _make_ta(2, "buz")
       with self.assertRaisesOpError(
           r"TensorArray's size is not equal to the size of lengths "
           r"\(2 vs. 1\), and the TensorArray is not marked as "
           r"dynamically resizeable"):
-        if in_graph_mode:
-          self.evaluate(ta.split([1.0], [1]).flow)
-        else:
-          self.evaluate(ta.split([1.0], [1]))
+        self.evaluate(ta.split([1.0], [1]).flow)
 
   def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype):
     with self.test_session(use_gpu=True):
@@ -656,8 +615,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(c(-2.0), grad_vals[1])
 
   def testTensorArrayGradientWriteRead(self):
-    for dtype in (np.float32, np.float64, np.int32, np.int64, np.complex64,
-                  np.complex128):
+    for dtype in (np.float32, np.float64, np.complex64, np.complex128):
       self._testTensorArrayGradientWriteReadType(dtype)
 
   def _testTensorArrayGradientWritePackConcatAndRead(self):
@@ -868,14 +826,14 @@ class TensorArrayTest(test.TestCase):
 
       vout = func(v0, state0, var)
       grad_val = -np.arange(3 * 5, dtype=np_dtype).reshape(3, 5)
-      if context.in_graph_mode():
+      if context.executing_eagerly():
+        grad_fn = backprop.gradients_function(func)
+        v0_grad, state0_grad, var_grad = grad_fn(v0, state0, var, dy=grad_val)
+      else:
         v0_grad = gradients_impl.gradients([vout], [v0], [grad_val])[0]
         state0_grad = gradients_impl.gradients([vout], [state0], [grad_val])[0]
         var_grad = gradients_impl.gradients([vout], [var], [grad_val])[0]
         variables.global_variables_initializer().run()
-      else:
-        grad_fn = backprop.gradients_function(func)
-        v0_grad, state0_grad, var_grad = grad_fn(v0, state0, var, dy=grad_val)
 
       state0_t, var_t, v0_t, vout_t, v0_grad_t, var_grad_t, state0_grad_t = (
           self.evaluate(
@@ -959,10 +917,10 @@ class TensorArrayTest(test.TestCase):
         return r
 
       x = constant_op.constant(2.0, name="x")
-      if context.in_graph_mode():
-        grad = gradients_impl.gradients(loop(x), [x])[0]
-      else:
+      if context.executing_eagerly():
         grad = backprop.gradients_function(loop)(x)[0]
+      else:
+        grad = gradients_impl.gradients(loop(x), [x])[0]
       self.assertAllClose(31.0, self.evaluate(grad))
 
   def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
@@ -1158,14 +1116,14 @@ class TensorArrayTest(test.TestCase):
           infer_shape=True)
       w0 = ta1.split(value, [1, 2])
       r0 = w0.read(0)
-      if context.in_graph_mode():
+      if context.executing_eagerly():
+        self.assertEqual((1, 2), r0.get_shape())
+        self.assertEqual((2, 2), w0.read(1).get_shape())
+      else:
         self.assertEqual(r0.get_shape().ndims, None)
         self.assertEqual(
             tensor_shape.TensorShape(
                 ta1.handle.op.get_attr("element_shape")).ndims, None)
-      else:
-        self.assertEqual((1, 2), r0.get_shape())
-        self.assertEqual((2, 2), w0.read(1).get_shape())
 
   def testWriteUnknownShape(self):
     with self.test_session(use_gpu=True):
@@ -1297,13 +1255,13 @@ class TensorArrayTest(test.TestCase):
       g = func(values)
       grad_ys = [[[2.0, 3.0], [4.0, 5.0]]]
       # Test combined gradients + aggregation of read(0)
-      if context.in_graph_mode():
-        grad = gradients_impl.gradients(ys=[g], xs=[values], grad_ys=grad_ys)
-        g_vals, grad_vals = session.run([[g], grad])
-      else:
+      if context.executing_eagerly():
         g_vals = [g]
         grad_vals = backprop.gradients_function(func)(
             values, dy=constant_op.constant(grad_ys[0], dtype=dtypes.float32))
+      else:
+        grad = gradients_impl.gradients(ys=[g], xs=[values], grad_ys=grad_ys)
+        g_vals, grad_vals = session.run([[g], grad])
 
       # Gradients for 8 of the 10 unread components are zero.
       expected_grad = np.zeros((10, 2))
@@ -1453,13 +1411,13 @@ class TensorArrayTest(test.TestCase):
       # Tests correct properties on new TensorArrays.
       self.assertEqual(dtypes.float32, ta0.dtype)
       self.assertEqual(dtypes.int32, ta1.dtype)
-      if context.in_graph_mode():
-        self.assertEqual(tensor_shape.unknown_shape(), read0.get_shape())
+      if context.executing_eagerly():
+        self.assertEqual(tensor_shape.scalar(), read0.get_shape())
       else:
-        self.assertEqual(tensor_shape.scalar(), read1.get_shape())
+        self.assertEqual(tensor_shape.unknown_shape(), read0.get_shape())
       self.assertEqual(tensor_shape.scalar(), read1.get_shape())
 
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         variables.global_variables_initializer().run()
 
       read0_v, read1_v, size0_v, size1_v = self.evaluate((read0, read1, size0,
diff --git a/tensorflow/python/kernel_tests/testdata/BUILD b/tensorflow/python/kernel_tests/testdata/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..45264c773ac0089bbfed44bd115e73e848a8cc62
--- /dev/null
+++ b/tensorflow/python/kernel_tests/testdata/BUILD
@@ -0,0 +1,24 @@
+# Data files for kernel tests.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "self_adjoint_eig_op_test_files",
+    srcs = ["self_adjoint_eig_fail_if_denorms_flushed.txt"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/testdata/self_adjoint_eig_fail_if_denorms_flushed.txt b/tensorflow/python/kernel_tests/testdata/self_adjoint_eig_fail_if_denorms_flushed.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d56a690a7928fafe39debc478db3e90ab953430b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/testdata/self_adjoint_eig_fail_if_denorms_flushed.txt
@@ -0,0 +1,32 @@
+2.60986303e-17 -9.66826148e-21 -1.68610775e-24 -9.16104778e-17 -1.1039539e-18 -1.66460338e-25 -2.12362492e-23 1.90946688e-21 -3.34190535e-22 1.2000634e-18 -7.31782583e-20 2.57851762e-20 -2.55509e-20 -9.54284927e-20 -1.04248315e-17 -5.32450516e-22 -1.81712853e-17 6.0044594e-18 3.96602716e-11 2.89077487e-25 -2.47461475e-25 1.77941757e-24 -7.30388687e-21 -3.84350041e-16 -3.88532388e-21 -4.29928618e-21 4.13551131e-16 -2.63408791e-25 -2.84830375e-21 -1.6450072e-16 -2.8585296e-21 -3.65413296e-21
+-9.66826148e-21 5.03939189e-22 9.17361108e-26 5.17304053e-20 1.99338895e-20 1.25259775e-28 -8.70441942e-26 9.91474109e-25 -5.80960164e-24 -1.19022314e-21 3.90467165e-22 -1.38179098e-22 1.79253406e-22 2.23977705e-22 1.1864143e-19 7.16291934e-24 4.10159639e-20 -2.16798529e-20 -4.95460504e-14 -2.6881406e-27 5.32861213e-27 -4.54567085e-28 1.99794328e-23 1.26854541e-17 -1.92916739e-23 8.60632417e-24 -1.04721097e-18 -7.00607669e-28 6.86771954e-23 8.65173173e-19 1.24469175e-22 6.03883081e-24
+-1.68610775e-24 9.17361108e-26 1.34889529e-26 2.65059e-22 2.39713735e-23 -2.00915344e-30 -1.135692e-27 -6.46049964e-26 -1.03607712e-26 -1.57623654e-23 -1.63805162e-24 -5.95741642e-25 3.24984759e-25 6.49561204e-24 2.28504969e-21 2.8319611e-25 3.96494845e-22 -2.1988623e-22 6.26027228e-16 1.2418479e-30 2.1016041e-30 6.22813846e-30 -1.0708067e-25 6.90778045e-21 1.86361622e-25 7.08789674e-26 -9.23628499e-21 1.65335067e-30 -1.12173032e-26 8.2257321e-22 -4.72686764e-27 -2.58501275e-26
+-9.16104778e-17 5.17304053e-20 2.65059e-22 2.69965968e-14 7.06005733e-17 1.69851446e-22 -2.75994304e-21 -6.61589523e-20 3.8682048e-20 -1.69253147e-17 -2.68580354e-18 -7.74994098e-19 -9.75466696e-19 2.13537585e-18 2.13185342e-16 6.89417478e-21 1.35805044e-16 -3.48309239e-16 1.0448622e-09 -2.17287918e-23 7.41749185e-24 -7.36683057e-23 -1.31083094e-20 1.574e-14 5.72646592e-19 -9.85673749e-21 -1.0654985e-14 2.70679318e-23 4.0943479e-20 -3.42938568e-15 8.57373804e-20 -2.18094505e-20
+-1.1039539e-18 1.99338895e-20 2.39713735e-23 7.06005733e-17 1.83801666e-17 1.09735975e-24 -5.73058223e-24 7.2227645e-22 -8.94843118e-22 -2.30558605e-19 -7.84892038e-20 -1.88692532e-20 -1.02217713e-20 2.95458834e-20 2.42873413e-17 8.89161401e-22 1.21669872e-17 -6.85317731e-18 -7.345906e-12 -3.1158751e-25 1.36359449e-24 -1.57981417e-24 3.89633371e-21 9.94580899e-16 1.45732115e-20 6.92065325e-22 -1.86114433e-16 6.00601346e-26 3.26844e-21 4.38573742e-17 1.06803444e-20 4.60203933e-22
+-1.66460338e-25 1.25259775e-28 -2.00915344e-30 1.69851446e-22 1.09735975e-24 5.75549306e-30 4.74050864e-29 -5.99239043e-28 -1.5784658e-27 -1.74631273e-25 -1.22702975e-25 -1.03371979e-26 -1.96967552e-26 -1.56446725e-26 -3.06462576e-25 -6.33857393e-28 -6.08829397e-24 -7.07478859e-24 -4.82614847e-18 -2.7324345e-31 1.23830207e-31 -7.96172e-31 -1.9034503e-27 -3.82709848e-22 -2.69257733e-26 -3.84934809e-27 -1.48572725e-22 4.14585761e-31 2.5611404e-28 -2.77402858e-24 3.10373361e-28 -5.09669241e-28
+-2.12362492e-23 -8.70441942e-26 -1.135692e-27 -2.75994304e-21 -5.73058223e-24 4.74050864e-29 6.28162e-26 -3.30076462e-25 -3.30065418e-25 -1.1370873e-23 -8.97722764e-24 -1.03190629e-24 -9.52908672e-25 -3.27285413e-24 1.36216664e-22 -8.0549564e-26 -1.94826821e-22 -3.64999226e-22 -2.92500975e-15 -3.00986528e-29 2.39712646e-29 -1.02470704e-28 -4.99034099e-25 -1.32277916e-19 -5.05595e-24 -3.04012473e-25 -1.44724215e-20 5.04614184e-30 -4.12370105e-26 4.20735765e-21 -1.02818953e-25 3.41267575e-26
+1.90946688e-21 9.91474109e-25 -6.46049964e-26 -6.61589523e-20 7.2227645e-22 -5.99239043e-28 -3.30076462e-25 1.8948059e-22 1.83367373e-23 1.06616038e-21 -2.81616502e-22 1.18347412e-22 8.3458038e-23 9.67703245e-24 -1.37445558e-20 2.11412652e-24 2.64820742e-21 8.02510339e-20 4.39926334e-13 9.58727772e-27 2.9838033e-28 1.29183353e-26 1.78626483e-22 3.03531056e-19 9.62612316e-23 1.33722715e-23 2.92905627e-18 -9.42286262e-28 3.23170971e-24 4.10885529e-19 -8.38673724e-25 -8.63732285e-25
+-3.34190535e-22 -5.80960164e-24 -1.03607712e-26 3.8682048e-20 -8.94843118e-22 -1.5784658e-27 -3.30065418e-25 1.83367373e-23 9.30693173e-23 1.48929558e-21 1.83278606e-21 1.08468362e-22 2.61703785e-22 4.42441537e-23 1.23906316e-20 2.55235433e-24 8.36323349e-20 1.2152038e-19 9.83332204e-14 5.14523933e-27 -3.28220159e-28 8.22099066e-27 3.34939233e-23 4.3309476e-19 5.82711129e-22 1.14299394e-22 3.25240717e-18 5.84184241e-28 -1.76991199e-24 5.5568966e-20 -2.80294941e-24 4.59071175e-24
+1.2000634e-18 -1.19022314e-21 -1.57623654e-23 -1.69253147e-17 -2.30558605e-19 -1.74631273e-25 -1.1370873e-23 1.06616038e-21 1.48929558e-21 2.05547703e-18 2.01471341e-20 2.65473229e-20 1.36331708e-20 -2.19777252e-20 -3.09825792e-18 -1.93365673e-22 -2.25608735e-18 7.98997246e-18 1.45582661e-11 6.29004356e-25 -1.14866332e-25 -5.51419319e-26 2.97082139e-21 -2.39052259e-16 1.48920411e-20 1.28589326e-21 4.27717466e-16 -4.44694851e-26 -1.80270052e-22 3.29932795e-18 -5.11645591e-22 5.53091711e-23
+-7.31782583e-20 3.90467165e-22 -1.63805162e-24 -2.68580354e-18 -7.84892038e-20 -1.22702975e-25 -8.97722764e-24 -2.81616502e-22 1.83278606e-21 2.01471341e-20 4.38037939e-19 -4.46678177e-21 3.48516266e-20 7.32592348e-21 1.11928135e-18 8.58541052e-23 8.80645183e-18 4.80109643e-21 -1.7163557e-11 1.92262335e-26 -2.78003951e-26 5.48322572e-25 8.95330117e-23 -1.11570766e-17 3.13666242e-20 4.47195205e-21 -1.09014604e-17 7.69340111e-26 1.64649306e-22 1.71054085e-17 1.33471053e-23 6.40747815e-22
+2.57851762e-20 -1.38179098e-22 -5.95741642e-25 -7.74994098e-19 -1.88692532e-20 -1.03371979e-26 -1.03190629e-24 1.18347412e-22 1.08468362e-22 2.65473229e-20 -4.46678177e-21 5.22731861e-21 1.06412616e-21 -8.0508039e-22 -1.68829721e-19 -2.7699538e-23 -2.15173717e-19 7.46895651e-19 1.71858101e-12 5.41956e-26 -6.15013064e-27 1.54884457e-26 2.54028029e-22 -1.50009535e-18 1.11920465e-21 1.05890428e-22 3.6487132e-17 -2.06798384e-27 -5.5143889e-23 -1.71529414e-18 -7.38099094e-23 -6.5250472e-24
+-2.55509e-20 1.79253406e-22 3.24984759e-25 -9.75466696e-19 -1.02217713e-20 -1.96967552e-26 -9.52908672e-25 8.3458038e-23 2.61703785e-22 1.36331708e-20 3.48516266e-20 1.06412616e-21 4.08927657e-20 -2.76503659e-21 -6.81059804e-20 5.13487959e-23 1.80612902e-18 5.32462054e-19 -3.89327199e-12 3.60012729e-26 -2.5575456e-26 3.14316426e-25 4.56614351e-22 -1.24545392e-17 9.14707146e-21 7.97421952e-22 2.84371096e-17 2.98359736e-26 1.33439467e-23 1.00242743e-17 -4.94476664e-23 3.28816461e-22
+-9.54284927e-20 2.23977705e-22 6.49561204e-24 2.13537585e-18 2.95458834e-20 -1.56446725e-26 -3.27285413e-24 9.67703245e-24 4.42441537e-23 -2.19777252e-20 7.32592348e-21 -8.0508039e-22 -2.76503659e-21 5.02409342e-20 1.57549297e-18 2.63027228e-22 6.11241908e-19 -2.71906856e-19 1.41003203e-12 2.66730019e-26 2.25679315e-26 1.00596535e-25 3.02875382e-22 3.85539387e-17 6.79708607e-22 1.60452617e-22 -2.08440846e-17 -5.40071056e-28 4.56236979e-23 -1.00868521e-17 1.22265047e-22 -1.81997389e-23
+-1.04248315e-17 1.1864143e-19 2.28504969e-21 2.13185342e-16 2.42873413e-17 -3.06462576e-25 1.36216664e-22 -1.37445558e-20 1.23906316e-20 -3.09825792e-18 1.11928135e-18 -1.68829721e-19 -6.81059804e-20 1.57549297e-18 2.5311263e-15 9.97996576e-20 2.26115975e-16 -3.86907114e-17 3.68487445e-12 8.23669787e-24 1.00324064e-23 3.38722042e-24 8.64234911e-21 2.46521189e-15 1.72823337e-19 9.24995431e-20 -3.16903295e-15 5.94130048e-25 1.73965082e-20 1.17371651e-15 2.26718703e-20 4.16709318e-21
+-5.32450516e-22 7.16291934e-24 2.8319611e-25 6.89417478e-21 8.89161401e-22 -6.33857393e-28 -8.0549564e-26 2.11412652e-24 2.55235433e-24 -1.93365673e-22 8.58541052e-23 -2.7699538e-23 5.13487959e-23 2.63027228e-22 9.97996576e-20 2.88326168e-23 1.35358898e-20 5.43364968e-21 4.24011412e-14 1.88486064e-27 8.93106076e-29 4.5748278e-27 2.48573168e-24 5.81165621e-19 1.96505062e-23 5.84813631e-24 -2.46866108e-20 1.912471e-29 2.0243857e-24 -2.88983463e-20 1.35761502e-24 1.40424791e-27
+-1.81712853e-17 4.10159639e-20 3.96494845e-22 1.35805044e-16 1.21669872e-17 -6.08829397e-24 -1.94826821e-22 2.64820742e-21 8.36323349e-20 -2.25608735e-18 8.80645183e-18 -2.15173717e-19 1.80612902e-18 6.11241908e-19 2.26115975e-16 1.35358898e-20 3.66013906e-15 1.35652384e-17 -1.97764849e-09 4.16586597e-24 1.28936031e-24 6.96597122e-23 2.43147439e-21 -1.25627342e-15 1.52711738e-18 2.61025243e-19 -2.00782109e-15 9.75835691e-24 4.0203e-21 1.40790259e-15 -7.8869e-21 8.51983e-20
+6.0044594e-18 -2.16798529e-20 -2.1988623e-22 -3.48309239e-16 -6.85317731e-18 -7.07478859e-24 -3.64999226e-22 8.02510339e-20 1.2152038e-19 7.98997246e-18 4.80109643e-21 7.46895651e-19 5.32462054e-19 -2.71906856e-19 -3.86907114e-17 5.43364968e-21 1.35652384e-17 1.19795414e-15 1.18472676e-09 2.74214961e-23 -7.6305178e-26 1.25969175e-23 1.68466447e-19 1.33873166e-15 1.0739288e-18 1.02533716e-19 2.73480291e-14 -1.87024011e-24 -9.73944425e-21 2.74769918e-16 -1.48632788e-20 1.69142815e-21
+3.96602716e-11 -4.95460504e-14 6.26027228e-16 1.0448622e-09 -7.345906e-12 -4.82614847e-18 -2.92500975e-15 4.39926334e-13 9.83332204e-14 1.45582661e-11 -1.7163557e-11 1.71858101e-12 -3.89327199e-12 1.41003203e-12 3.68487445e-12 4.24011412e-14 -1.97764849e-09 1.18472676e-09 0.0257282555 5.64106473e-17 5.83845666e-18 -1.72409096e-16 1.02886027e-12 1.42563525e-08 -1.57067415e-12 -4.61972799e-13 3.30651737e-08 -5.20615037e-17 -1.71347193e-14 2.87764201e-10 5.03749196e-14 -1.97989316e-13
+2.89077487e-25 -2.6881406e-27 1.2418479e-30 -2.17287918e-23 -3.1158751e-25 -2.7324345e-31 -3.00986528e-29 9.58727772e-27 5.14523933e-27 6.29004356e-25 1.92262335e-26 5.41956e-26 3.60012729e-26 2.66730019e-26 8.23669787e-24 1.88486064e-27 4.16586597e-24 2.74214961e-23 5.64106473e-17 1.2555855e-29 -1.30304595e-31 8.42884087e-31 1.75222077e-26 -2.89058862e-23 3.0225144e-26 6.67962117e-27 8.54181718e-22 -1.2385176e-32 -5.78078369e-28 3.34704626e-23 -2.00599605e-27 2.05674681e-28
+-2.47461475e-25 5.32861213e-27 2.1016041e-30 7.41749185e-24 1.36359449e-24 1.23830207e-31 2.39712646e-29 2.9838033e-28 -3.28220159e-28 -1.14866332e-25 -2.78003951e-26 -6.15013064e-27 -2.5575456e-26 2.25679315e-26 1.00324064e-23 8.93106076e-29 1.28936031e-24 -7.6305178e-26 5.83845666e-18 -1.30304595e-31 2.26490979e-30 -4.25637053e-31 1.40697e-27 5.91197152e-22 -2.08475892e-26 -5.64982671e-28 -3.97199197e-23 -5.06794406e-32 1.11993943e-27 -2.94280711e-23 2.65858181e-27 -2.23093754e-28
+1.77941757e-24 -4.54567085e-28 6.22813846e-30 -7.36683057e-23 -1.57981417e-24 -7.96172e-31 -1.02470704e-28 1.29183353e-26 8.22099066e-27 -5.51419319e-26 5.48322572e-25 1.54884457e-26 3.14316426e-25 1.00596535e-25 3.38722042e-24 4.5748278e-27 6.96597122e-23 1.25969175e-23 -1.72409096e-16 8.42884087e-31 -4.25637053e-31 1.40764294e-28 1.38735442e-26 -1.93810515e-22 1.93660175e-25 1.97417449e-26 1.62145272e-22 2.52533191e-31 -3.42833345e-28 6.34130774e-22 -2.01859e-27 6.1781768e-27
+-7.30388687e-21 1.99794328e-23 -1.0708067e-25 -1.31083094e-20 3.89633371e-21 -1.9034503e-27 -4.99034099e-25 1.78626483e-22 3.34939233e-23 2.97082139e-21 8.95330117e-23 2.54028029e-22 4.56614351e-22 3.02875382e-22 8.64234911e-21 2.48573168e-24 2.43147439e-21 1.68466447e-19 1.02886027e-12 1.75222077e-26 1.40697e-27 1.38735442e-26 1.18400807e-21 1.40670976e-18 2.40320429e-22 3.69528133e-23 4.81603371e-18 -1.49322683e-27 -2.70670724e-25 1.59463723e-19 6.40406749e-24 1.17170599e-23
+-3.84350041e-16 1.26854541e-17 6.90778045e-21 1.574e-14 9.94580899e-16 -3.82709848e-22 -1.32277916e-19 3.03531056e-19 4.3309476e-19 -2.39052259e-16 -1.11570766e-17 -1.50009535e-18 -1.24545392e-17 3.85539387e-17 2.46521189e-15 5.81165621e-19 -1.25627342e-15 1.33873166e-15 1.42563525e-08 -2.89058862e-23 5.91197152e-22 -1.93810515e-22 1.40670976e-18 4.40677789e-12 7.86017934e-19 7.73466606e-19 1.96690791e-15 -1.65941347e-22 2.63659933e-18 -3.0624544e-14 5.87194631e-18 -3.46291098e-19
+-3.88532388e-21 -1.92916739e-23 1.86361622e-25 5.72646592e-19 1.45732115e-20 -2.69257733e-26 -5.05595e-24 9.62612316e-23 5.82711129e-22 1.48920411e-20 3.13666242e-20 1.11920465e-21 9.14707146e-21 6.79708607e-22 1.72823337e-19 1.96505062e-23 1.52711738e-18 1.0739288e-18 -1.57067415e-12 3.0225144e-26 -2.08475892e-26 1.93660175e-25 2.40320429e-22 7.86017934e-19 1.80741048e-20 9.85491491e-22 5.08456938e-17 1.08072265e-26 -1.75036654e-23 4.36436952e-18 -1.77728563e-23 1.01268548e-22
+-4.29928618e-21 8.60632417e-24 7.08789674e-26 -9.85673749e-21 6.92065325e-22 -3.84934809e-27 -3.04012473e-25 1.33722715e-23 1.14299394e-22 1.28589326e-21 4.47195205e-21 1.05890428e-22 7.97421952e-22 1.60452617e-22 9.24995431e-20 5.84813631e-24 2.61025243e-19 1.02533716e-19 -4.61972799e-13 6.67962117e-27 -5.64982671e-28 1.97417449e-26 3.69528133e-23 7.73466606e-19 9.85491491e-22 3.68332283e-22 1.76753773e-18 2.6167718e-27 3.55918682e-25 1.95786374e-19 -2.60077304e-24 1.84790635e-23
+4.13551131e-16 -1.04721097e-18 -9.23628499e-21 -1.0654985e-14 -1.86114433e-16 -1.48572725e-22 -1.44724215e-20 2.92905627e-18 3.25240717e-18 4.27717466e-16 -1.09014604e-17 3.6487132e-17 2.84371096e-17 -2.08440846e-17 -3.16903295e-15 -2.46866108e-20 -2.00782109e-15 2.73480291e-14 3.30651737e-08 8.54181718e-22 -3.97199197e-23 1.62145272e-22 4.81603371e-18 1.96690791e-15 5.08456938e-17 1.76753773e-18 1.57092991e-12 -4.31425852e-23 -3.78241e-19 -1.15899865e-14 -7.61890782e-19 -1.15344546e-19
+-2.63408791e-25 -7.00607669e-28 1.65335067e-30 2.70679318e-23 6.00601346e-26 4.14585761e-31 5.04614184e-30 -9.42286262e-28 5.84184241e-28 -4.44694851e-26 7.69340111e-26 -2.06798384e-27 2.98359736e-26 -5.40071056e-28 5.94130048e-25 1.912471e-29 9.75835691e-24 -1.87024011e-24 -5.20615037e-17 -1.2385176e-32 -5.06794406e-32 2.52533191e-31 -1.49322683e-27 -1.65941347e-22 1.08072265e-26 2.6167718e-27 -4.31425852e-23 1.5576233e-30 -6.14697676e-29 -5.39097603e-24 -8.01112167e-29 1.81063126e-27
+-2.84830375e-21 6.86771954e-23 -1.12173032e-26 4.0943479e-20 3.26844e-21 2.5611404e-28 -4.12370105e-26 3.23170971e-24 -1.76991199e-24 -1.80270052e-22 1.64649306e-22 -5.5143889e-23 1.33439467e-23 4.56236979e-23 1.73965082e-20 2.0243857e-24 4.0203e-21 -9.73944425e-21 -1.71347193e-14 -5.78078369e-28 1.11993943e-27 -3.42833345e-28 -2.70670724e-25 2.63659933e-18 -1.75036654e-23 3.55918682e-25 -3.78241e-19 -6.14697676e-29 2.71732416e-23 2.4136621e-19 2.38938648e-23 1.21468477e-24
+-1.6450072e-16 8.65173173e-19 8.2257321e-22 -3.42938568e-15 4.38573742e-17 -2.77402858e-24 4.20735765e-21 4.10885529e-19 5.5568966e-20 3.29932795e-18 1.71054085e-17 -1.71529414e-18 1.00242743e-17 -1.00868521e-17 1.17371651e-15 -2.88983463e-20 1.40790259e-15 2.74769918e-16 2.87764201e-10 3.34704626e-23 -2.94280711e-23 6.34130774e-22 1.59463723e-19 -3.0624544e-14 4.36436952e-18 1.95786374e-19 -1.15899865e-14 -5.39097603e-24 2.4136621e-19 2.10373291e-13 4.84257897e-20 2.71571227e-19
+-2.8585296e-21 1.24469175e-22 -4.72686764e-27 8.57373804e-20 1.06803444e-20 3.10373361e-28 -1.02818953e-25 -8.38673724e-25 -2.80294941e-24 -5.11645591e-22 1.33471053e-23 -7.38099094e-23 -4.94476664e-23 1.22265047e-22 2.26718703e-20 1.35761502e-24 -7.8869e-21 -1.48632788e-20 5.03749196e-14 -2.00599605e-27 2.65858181e-27 -2.01859e-27 6.40406749e-24 5.87194631e-18 -1.77728563e-23 -2.60077304e-24 -7.61890782e-19 -8.01112167e-29 2.38938648e-23 4.84257897e-20 7.77486414e-23 -7.38542574e-25
+-3.65413296e-21 6.03883081e-24 -2.58501275e-26 -2.18094505e-20 4.60203933e-22 -5.09669241e-28 3.41267575e-26 -8.63732285e-25 4.59071175e-24 5.53091711e-23 6.40747815e-22 -6.5250472e-24 3.28816461e-22 -1.81997389e-23 4.16709318e-21 1.40424791e-27 8.51983e-20 1.69142815e-21 -1.97989316e-13 2.05674681e-28 -2.23093754e-28 6.1781768e-27 1.17170599e-23 -3.46291098e-19 1.01268548e-22 1.84790635e-23 -1.15344546e-19 1.81063126e-27 1.21468477e-24 2.71571227e-19 -7.38542574e-25 3.49516247e-23
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index 6ab931fdb97a8945ab610fda27a036693f0291e5..fa7c6a0f8a6c76f51e8bee108f002dbf8218046e 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -197,13 +197,15 @@ class TopKTest(test.TestCase):
 
   def testTopKGradients(self):
     with self.test_session(use_gpu=True) as sess:
-      inputs = array_ops.placeholder(dtypes.int32, shape=[2, 5])
+      inputs = array_ops.placeholder(dtypes.float32, shape=[2, 5])
       values, _ = nn_ops.top_k(inputs, 3)
       grad = sess.run(
           gradients_impl.gradients(
-              values, inputs, grad_ys=[[[1, 2, 3], [4, 5, 6]]]),
-          feed_dict={inputs: [[2, -1, 1000, 3, 4], [1, 5, 2, 4, 3]]})[0]
-    self.assertEqual(grad.tolist(), [[0, 0, 1, 3, 2], [0, 4, 0, 5, 6]])
+              values, inputs, grad_ys=[[[1., 2., 3.], [4., 5., 6.]]]),
+          feed_dict={inputs: [[2., -1., 1000., 3., 4.],
+                              [1., 5., 2., 4., 3.]]})[0]
+    self.assertEqual(
+        grad.tolist(), [[0., 0., 1., 3., 2.], [0., 4., 0., 5., 6.]])
 
 
 class TopKBenchmark(test.Benchmark):
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 6366d2e181c8cfabba8a78b664c25c85debc67ef..bbc040dc13fc151b970f130eeb76fa1639245416 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -66,9 +66,9 @@ class UniqueTest(test.TestCase):
     for dtype in [np.int32, np.int64]:
       x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
       with self.test_session() as sess:
-        y0, idx0 = gen_array_ops._unique_v2(x, axis=np.array([0], dtype))
+        y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype))
         tf_y0, tf_idx0 = sess.run([y0, idx0])
-        y1, idx1 = gen_array_ops._unique_v2(x, axis=np.array([1], dtype))
+        y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype))
         tf_y1, tf_idx1 = sess.run([y1, idx1])
       self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
       self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
@@ -80,7 +80,7 @@ class UniqueTest(test.TestCase):
     # by default, the axis will be wrapped to allow `axis=None`.
     x = np.random.randint(2, high=10, size=7000)
     with self.test_session() as sess:
-      y, idx = gen_array_ops._unique_v2(x, axis=np.array([], np.int32))
+      y, idx = gen_array_ops.unique_v2(x, axis=np.array([], np.int32))
       tf_y, tf_idx = sess.run([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
@@ -133,6 +133,39 @@ class UniqueWithCountsTest(test.TestCase):
       v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)]
       self.assertEqual(count, sum(v))
 
+  def testInt32Axis(self):
+    for dtype in [np.int32, np.int64]:
+      x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
+      with self.test_session() as sess:
+        y0, idx0, count0 = gen_array_ops.unique_with_counts_v2(
+            x, axis=np.array([0], dtype))
+        tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0])
+        y1, idx1, count1 = gen_array_ops.unique_with_counts_v2(
+            x, axis=np.array([1], dtype))
+        tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1])
+      self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
+      self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
+      self.assertAllEqual(tf_count0, np.array([2, 1]))
+      self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
+      self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
+      self.assertAllEqual(tf_count1, np.array([1, 2]))
+
+  def testInt32V2(self):
+    # This test is only temporary, once V2 is used
+    # by default, the axis will be wrapped to allow `axis=None`.
+    x = np.random.randint(2, high=10, size=7000)
+    with self.test_session() as sess:
+      y, idx, count = gen_array_ops.unique_with_counts_v2(
+          x, axis=np.array([], np.int32))
+      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+
+    self.assertEqual(len(x), len(tf_idx))
+    self.assertEqual(len(tf_y), len(np.unique(x)))
+    for i in range(len(x)):
+      self.assertEqual(x[i], tf_y[tf_idx[i]])
+    for value, count in zip(tf_y, tf_count):
+      self.assertEqual(count, np.sum(x == value))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/variable_ops_test.py b/tensorflow/python/kernel_tests/variable_ops_test.py
index 79071029fd42374964d12f513e9c510bdc7400eb..cf369c071813120fef685b7220292d50b966cf11 100644
--- a/tensorflow/python/kernel_tests/variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/variable_ops_test.py
@@ -165,26 +165,26 @@ class VariableOpTest(test.TestCase):
 
   def testTemporaryVariable(self):
     with self.test_session(use_gpu=True):
-      var = gen_state_ops._temporary_variable(
+      var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="foo")
       var = state_ops.assign(var, [[4.0, 5.0]])
       var = state_ops.assign_add(var, [[6.0, 7.0]])
-      final = gen_state_ops._destroy_temporary_variable(var, var_name="foo")
+      final = gen_state_ops.destroy_temporary_variable(var, var_name="foo")
       self.assertAllClose([[10.0, 12.0]], final.eval())
 
   def testDestroyNonexistentTemporaryVariable(self):
     with self.test_session(use_gpu=True):
-      var = gen_state_ops._temporary_variable([1, 2], dtypes.float32)
-      final = gen_state_ops._destroy_temporary_variable(var, var_name="bad")
+      var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
+      final = gen_state_ops.destroy_temporary_variable(var, var_name="bad")
       with self.assertRaises(errors.NotFoundError):
         final.eval()
 
   def testDuplicateTemporaryVariable(self):
     with self.test_session(use_gpu=True):
-      var1 = gen_state_ops._temporary_variable(
+      var1 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="dup")
       var1 = state_ops.assign(var1, [[1.0, 2.0]])
-      var2 = gen_state_ops._temporary_variable(
+      var2 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="dup")
       var2 = state_ops.assign(var2, [[3.0, 4.0]])
       final = var1 + var2
@@ -193,25 +193,25 @@ class VariableOpTest(test.TestCase):
 
   def testDestroyTemporaryVariableTwice(self):
     with self.test_session(use_gpu=True):
-      var = gen_state_ops._temporary_variable([1, 2], dtypes.float32)
-      val1 = gen_state_ops._destroy_temporary_variable(var, var_name="dup")
-      val2 = gen_state_ops._destroy_temporary_variable(var, var_name="dup")
+      var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
+      val1 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
+      val2 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
       final = val1 + val2
       with self.assertRaises(errors.NotFoundError):
         final.eval()
 
   def testTemporaryVariableNoLeak(self):
     with self.test_session(use_gpu=True):
-      var = gen_state_ops._temporary_variable(
+      var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="bar")
       final = array_ops.identity(var)
       final.eval()
 
   def testTwoTemporaryVariablesNoLeaks(self):
     with self.test_session(use_gpu=True):
-      var1 = gen_state_ops._temporary_variable(
+      var1 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var1")
-      var2 = gen_state_ops._temporary_variable(
+      var2 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var2")
       final = var1 + var2
       final.eval()
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 8527f116f9541942e52ba2ab635ca1212ea38583..51aa671098905e840b7c96cd5a984887d347adf9 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -19,15 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import threading
 
 import numpy
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
@@ -117,6 +120,16 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.dtype.base_dtype, dtypes.float16)
 
+  def testGetVariableInGraphNestedUnderEagerContext(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        v = variable_scope.get_variable("should_be_resource", [])
+        self.assertEqual(type(v), resource_variable_ops.ResourceVariable)
+
+      f()
+
   def testEagerVariableStore(self):
     with context.eager_mode():
       store = variable_scope.EagerVariableStore()
@@ -155,6 +168,28 @@ class VariableScopeTest(test.TestCase):
       for v in new_store.variables():
         self.assertEqual(v.numpy(), 1)
 
+  def testEagerVariableStoreWithEagerDefun(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        x = constant_op.constant([[2.0]])
+        d1 = core_layers.Dense(
+            1, name="my_dense", kernel_initializer=init_ops.ones_initializer())
+        _ = d1(x)  # create variables
+        self.assertEqual(len(d1.variables), 2)
+        v1, v2 = d1.variables
+        d2 = core_layers.Dense(
+            1,
+            name="my_dense",
+            kernel_initializer=init_ops.ones_initializer(),
+            _reuse=True)
+        _ = d2(x)
+        self.assertEqual(len(d2.variables), 2)
+        v3, v4 = d2.variables
+        self.assertAllEqual([v1, v2], [v3, v4])
+      f()
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -166,12 +201,10 @@ class VariableScopeTest(test.TestCase):
     self.evaluate(variables_lib.variables_initializer([w]))
     self.assertAllClose(self.evaluate(w.value()), [1, 2, 3])
 
-    if context.in_graph_mode():
-      with self.assertRaises(TypeError):
-        variable_scope.get_variable("x4", initializer={})
-    else:
-      with self.assertRaises(ValueError):
-        variable_scope.get_variable("x4", initializer={})
+    # A quirk to be revisited?
+    error = ValueError if context.executing_eagerly() else TypeError
+    with self.assertRaises(error):
+      variable_scope.get_variable("x4", initializer={})
 
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonInitializer(self):
@@ -210,15 +243,15 @@ class VariableScopeTest(test.TestCase):
 
           with variable_scope.variable_scope("not_cached", caching_device=""):
             v2_not_cached = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_not_cached.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_not_cached.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope(
               "not_cached_identity_device",
               caching_device=lambda op: op.device):
             v2_identity_device = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_identity_device.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_identity_device.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope("we_will_do_it_live") as vs_live:
             vs_live.set_caching_device("/job:live")
@@ -267,7 +300,7 @@ class VariableScopeTest(test.TestCase):
         self.assertAllClose(self.evaluate(losses[2]), 0.5)
       with variable_scope.variable_scope("foo", reuse=True):
         # reuse=True is for now only supported when eager execution is disabled.
-        if context.in_graph_mode():
+        if not context.executing_eagerly():
           v = variable_scope.get_variable("v",
                                           [])  # "v" is alredy there, reused
           losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
@@ -374,7 +407,7 @@ class VariableScopeTest(test.TestCase):
       v = variable_scope.get_variable("v", [])
       self.evaluate(variables_lib.variables_initializer([v]))
       self.assertAllClose(self.evaluate(v.value()), 0.3)
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         # Check that we can set reuse.
         variable_scope.get_variable_scope().reuse_variables()
         with self.assertRaises(ValueError):  # Fail, w does not exist yet.
@@ -408,7 +441,7 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("tower") as tower:
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "testVarScopeNameScope1/tower/scope2/")
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         with variable_scope.variable_scope(
             tower):  # Re-entering acts like another "tower".
           with ops.name_scope("scope2") as sc2:
@@ -422,7 +455,7 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("tower"):
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "testVarScopeNameScope2/tower/scope2/")
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         with variable_scope.variable_scope(tower):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "testVarScopeNameScope2/tower_1/scope2/")
@@ -485,15 +518,19 @@ class VariableScopeTest(test.TestCase):
 
   def testVarScopeGetOrCreateReuse(self):
     with self.test_session():
+
       def test_value(value):
         x = constant_op.constant(value)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = state_ops.assign(variable_scope.get_variable("var", []), x)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
         self.assertEqual(value, x.eval())
+
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
       test_value(17.)
@@ -552,19 +589,16 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("default") as default:
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer/w:0")
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_1/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_1/w:0")
         with variable_scope.variable_scope(default):
           pass
         # No matter the jump in the middle, unique numbering continues.
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_2/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_2/w:0")
 
   def testVarOpScopeReuse(self):
     with self.test_session():
@@ -903,17 +937,15 @@ class VariableScopeTest(test.TestCase):
             "w", [], collections=["foo"])
         self.assertEqual(local_var.name, "outer/w:0")
 
-    # Since variable is local, it should be in the local variable collection
-    # but not the trainable collection.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
+      # Since variable is local, it should be in the local variable collection
+      # but not the trainable collection.
       self.assertIn(local_var,
                     ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
       self.assertIn(local_var, ops.get_collection("foo"))
       self.assertNotIn(local_var,
                        ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-
-    # Check that local variable respects `reuse`.
-    if context.in_graph_mode():
+      # Check that local variable respects `reuse`.
       with variable_scope.variable_scope(outer, "default", reuse=True):
         self.assertEqual(
             variable_scope.get_local_variable("w", []).name, "outer/w:0")
@@ -938,12 +970,12 @@ class VariableScopeTest(test.TestCase):
   def testGetCollection(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetCollection_a", [])
-      _ = variable_scope.get_variable("testGetCollection_b", [],
-                                      trainable=False)
+      _ = variable_scope.get_variable(
+          "testGetCollection_b", [], trainable=False)
       with variable_scope.variable_scope("testGetCollection_foo_") as scope1:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope1.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -957,8 +989,8 @@ class VariableScopeTest(test.TestCase):
         ])
       with variable_scope.variable_scope("testGetCollection_foo") as scope2:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope2.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -995,22 +1027,22 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope(
           "testGetTrainableVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetTrainableVariables_b", [])
-        _ = variable_scope.get_variable("testGetTrainableVariables_c", [],
-                                        trainable=False)
-        self.assertEqual([v.name
-                          for v in scope.trainable_variables()],
-                         ["testGetTrainableVariables_foo/"
-                          "testGetTrainableVariables_b:0"])
+        _ = variable_scope.get_variable(
+            "testGetTrainableVariables_c", [], trainable=False)
+        self.assertEqual(
+            [v.name for v in scope.trainable_variables()],
+            ["testGetTrainableVariables_foo/"
+             "testGetTrainableVariables_b:0"])
 
   def testGetGlobalVariables(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetGlobalVariables_a", [])
       with variable_scope.variable_scope("testGetGlobalVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetGlobalVariables_b", [])
-        self.assertEqual([v.name
-                          for v in scope.global_variables()],
-                         ["testGetGlobalVariables_foo/"
-                          "testGetGlobalVariables_b:0"])
+        self.assertEqual(
+            [v.name for v in scope.global_variables()],
+            ["testGetGlobalVariables_foo/"
+             "testGetGlobalVariables_b:0"])
 
   def testGetLocalVariables(self):
     with self.test_session():
@@ -1019,10 +1051,8 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("foo") as scope:
         _ = variable_scope.get_variable(
             "b", [], collections=[ops.GraphKeys.LOCAL_VARIABLES])
-        _ = variable_scope.get_variable(
-            "c", [])
-        self.assertEqual([v.name
-                          for v in scope.local_variables()], ["foo/b:0"])
+        _ = variable_scope.get_variable("c", [])
+        self.assertEqual([v.name for v in scope.local_variables()], ["foo/b:0"])
 
   def testGetVariableWithRefDtype(self):
     v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
@@ -1245,10 +1275,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with ops.name_scope("prod_getter"):
         return g_0 * g_1
 
-    with variable_scope.variable_scope(
-        "prod_scope", custom_getter=prod_getter):
-      with variable_scope.variable_scope(
-          "sum_scope", custom_getter=sum_getter):
+    with variable_scope.variable_scope("prod_scope", custom_getter=prod_getter):
+      with variable_scope.variable_scope("sum_scope", custom_getter=sum_getter):
         with variable_scope.variable_scope(
             "inner_sum_scope", custom_getter=sum_getter):
           # take sums of sums of products
@@ -1273,9 +1301,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       np_vars, np_v = sess.run([true_vars, v])
       # take products of sums of products
       self.assertAllClose(
-          np_v,
-          (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3]))
-           + ((np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
+          np_v, (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3])) + (
+              (np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
 
   def testVariableCreator(self):
 
@@ -1353,5 +1380,95 @@ class PartitionInfoTest(test.TestCase):
     self.assertEqual(0, partition_info.single_slice_dim([2, 3]))
 
 
+class VariableScopeMultithreadedTest(test.TestCase):
+
+  def testTwoThreadsDisjointScopeEntry(self):
+
+    def thread_fn(i, graph):
+      with graph.as_default():
+        with variable_scope.variable_scope("foo"):
+          if i == 0:
+            v = variable_scope.get_variable("v", [])
+            self.assertEquals("foo/v:0", v.name)
+          else:
+            # Any thread after the first one should fail to create variable
+            # with the same name.
+            with self.assertRaises(ValueError):
+              variable_scope.get_variable("v", [])
+
+    graph = ops.get_default_graph()
+    threads = [
+        threading.Thread(target=thread_fn, args=(
+            i,
+            graph,
+        )) for i in range(2)
+    ]
+
+    threads[0].start()
+    # Allow thread 0 to finish before starting thread 1.
+    threads[0].join()
+    threads[1].start()
+    threads[1].join()
+
+  def testTwoThreadsNestedScopeEntry(self):
+
+    def thread_fn(i, graph, run_event, pause_event):
+      with graph.as_default():
+        with variable_scope.variable_scope("foo"):
+          if i == 0:
+            v = variable_scope.get_variable("v", [])
+            self.assertEquals("foo/v:0", v.name)
+          else:
+            # Any thread after the first one should fail to create variable
+            # with the same name.
+            with self.assertRaises(ValueError):
+              variable_scope.get_variable("v", [])
+          pause_event.set()
+          run_event.wait()
+
+    graph = ops.get_default_graph()
+    run_events = [threading.Event() for _ in range(2)]
+    pause_events = [threading.Event() for _ in range(2)]
+    threads = [
+        threading.Thread(
+            target=thread_fn, args=(i, graph, run_events[i], pause_events[i]))
+        for i in range(2)
+    ]
+
+    # Start first thread.
+    threads[0].start()
+    pause_events[0].wait()
+    # Start next thread once the first thread has paused.
+    threads[1].start()
+    pause_events[1].wait()
+    # Resume both threads.
+    run_events[0].set()
+    run_events[1].set()
+    threads[0].join()
+    threads[1].join()
+
+  def testReenterMainScope(self):
+
+    def thread_fn(graph, main_thread_scope):
+      with graph.as_default():
+        # Variable created with main scope will have prefix "main".
+        with variable_scope.variable_scope(main_thread_scope):
+          with variable_scope.variable_scope("foo"):
+            v = variable_scope.get_variable("v", [])
+            self.assertEquals("main/foo/v:0", v.name)
+
+        # Variable created outside main scope will not have prefix "main".
+        with variable_scope.variable_scope("bar"):
+          v = variable_scope.get_variable("v", [])
+          self.assertEquals("bar/v:0", v.name)
+
+    graph = ops.get_default_graph()
+    with variable_scope.variable_scope("main") as main_thread_scope:
+      thread = threading.Thread(
+          target=thread_fn, args=(graph, main_thread_scope))
+      thread.start()
+      thread.join()
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index b16c8c002c98a0351d1fc55fce061695327a18c9..27599868b74be323189b872c2147c6a33f84d170 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -687,7 +687,7 @@ class VariableContainerTest(test.TestCase):
         v1 = variables.Variable([1])
         with ops.container("l2"):
           v2 = variables.Variable([2])
-          special_v = gen_state_ops._variable(
+          special_v = gen_state_ops.variable(
               shape=[1],
               dtype=dtypes.float32,
               name="VariableInL3",
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index e152f02d8e983364603053dc5c8d14b5dfaf3605..60c726d54ceeb65ddf52af9b6aad685501214c24 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -18,10 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+import sys
+
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -48,7 +54,7 @@ class XentTest(test.TestCase):
   def _testXent(self, np_features, np_labels, use_gpu=False):
     np_loss, np_backprop = self._npXent(np_features, np_labels)
     with self.test_session(use_gpu=use_gpu) as sess:
-      loss, backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
+      loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
           np_features, np_labels)
       tf_loss, tf_backprop = sess.run([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
@@ -71,7 +77,7 @@ class XentTest(test.TestCase):
   def _testSingleClass(self, use_gpu=False):
     for dtype in np.float16, np.float32:
       with self.test_session(use_gpu=use_gpu) as sess:
-        loss, backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
+        loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
             np.array([[1.], [-1.], [0.]]).astype(dtype),
             np.array([[-1.], [0.], [1.]]).astype(dtype))
         tf_loss, tf_backprop = sess.run([loss, backprop])
@@ -88,8 +94,8 @@ class XentTest(test.TestCase):
                                                     4.]]]).astype(dtype)
       np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5,
                                                   0.]]]).astype(dtype)
-      self.assertRaisesRegexp(ValueError, "must be rank 2",
-                              gen_nn_ops._softmax_cross_entropy_with_logits,
+      self.assertRaisesRegexp(ValueError, "rank 2, but is rank 3",
+                              gen_nn_ops.softmax_cross_entropy_with_logits,
                               np_features, np_labels)
 
   def testNpXent(self):
@@ -128,17 +134,35 @@ class XentTest(test.TestCase):
     self.assertAllClose(
         np.array([1.3862, 1.9401]), np_loss, rtol=1.e-3, atol=1.e-3)
 
+  def testShapeBroadcast(self):
+    np_f = np.array([[1., 2., 3., 4.],
+                     [1., 2., 3., 4.]]).astype(np.float32)
+    np_l = np.array([[0., 0., 0., 1.],
+                     [0., .5, .5, 0.]]).astype(np.float32)
+    np_loss, np_backprop = self._npXent(np_f, np_l)
+    tf_f = constant_op.constant(
+        np.array([[1., 2., 3., 4.]]).astype(np.float32))
+    tf_l = constant_op.constant(
+        np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float32))
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
+            tf_f, tf_l)
+        tf_loss, tf_backprop = sess.run([loss, backprop])
+      self.assertAllCloseAccordingToType(np_loss, tf_loss)
+      self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
+
   def testShapeMismatch(self):
     with self.test_session():
       with self.assertRaises(ValueError):
-        gen_nn_ops._softmax_cross_entropy_with_logits(
+        gen_nn_ops.softmax_cross_entropy_with_logits(
             [[0., 1.], [2., 3.]], [[0., 1., 0.], [1., 0., 0.]])
 
   def testNotMatrix(self):
     with self.test_session():
       with self.assertRaises(ValueError):
-        gen_nn_ops._softmax_cross_entropy_with_logits([0., 1., 2., 3.],
-                                                      [0., 1., 0., 1.])
+        gen_nn_ops.softmax_cross_entropy_with_logits([0., 1., 2., 3.],
+                                                     [0., 1., 0., 1.])
 
   def testHalf(self):
     self._testAll(
@@ -260,5 +284,60 @@ class XentTest(test.TestCase):
     self.assertAllEqual(np_loss, tf_loss)
 
 
+class XentBenchmark(test.Benchmark):
+
+  def benchmarkZeroDimension(self):
+    for (m, n, p, use_gpu) in itertools.product(
+        [128],
+        [10, 100, 1000, 10000, 100000],
+        [0.001, 0.01, 0.5, 0.99, 1.0],
+        [False]):
+      k = int(p * n)
+      if k == 0:
+        continue
+      name = "zero_dimension_m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          labels = array_ops.zeros([0, 2, 4], dtype=dtypes.float32)
+          logits = array_ops.zeros([0, 2, 4], dtype=dtypes.float32)
+          op = nn_ops.softmax_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+        with session.Session() as sess:
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          gb_processed_input = m * n / 1.0e9
+          throughput = gb_processed_input / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
+  def benchmarkSingleClass(self):
+    for (m, n, p, use_gpu) in itertools.product(
+        [128],
+        [10, 100, 1000, 10000, 100000],
+        [0.001, 0.01, 0.5, 0.99, 1.0],
+        [False]):
+      k = int(p * n)
+      if k == 0:
+        continue
+      name = "single_class_m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          labels = constant_op.constant([[1.], [-1.], [0.]],
+                                        dtype=dtypes.float32)
+          logits = constant_op.constant([[-1.], [0.], [1.]],
+                                        dtype=dtypes.float32)
+          op = nn_ops.softmax_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+        with session.Session() as sess:
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          gb_processed_input = m * n / 1.0e9
+          throughput = gb_processed_input / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 8314c4aa87a5b54effc44c371703267517ffa07d..64db49c900c21d60ba2337f920d6fa2cb9ab7b5f 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -12,147 +12,91 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import copy
-import re
-import weakref
 
-import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import utils as layers_util
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.Layer')
-class Layer(object):
-  """Base layer class.
+InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
-  This is the class from which all layers inherit, implementing common
-  infrastructure functionality.
 
-  A layer is a class implementing common neural networks operations, such
-  as convolution, batch norm, etc. These operations require managing variables,
-  losses, and updates, as well as applying TensorFlow ops to input tensors.
+@tf_export('layers.Layer')
+class Layer(base_layer.Layer):
+  """Base layer class.
 
-  Users will just instantiate it and then treat it as a callable.
+  It is considered legacy, and we recommend the use of `tf.keras.layers.Layer`
+  instead.
 
-  We recommend that descendants of Layer implement the following methods:
-  * `__init__()`: Save configuration in member variables
-  * `build()`: Called once from `__call__`, when we know the shapes of inputs
-    and `dtype`. Should have the calls to `add_variable()`, and then
-    call the super's `build()` (which sets `self.built = True`, which is
-    nice in case the user wants to call `build()` manually before the
-    first `__call__`).
-  * `call()`: Called in `__call__` after making sure `build()` has been called
-    once. Should actually perform the logic of applying the layer to the
-    input tensors (which should be passed in as the first argument).
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
 
   Read-only properties:
-    `name`: The name of the layer (string).
-    `dtype`: Default dtype of the layer (default of `None` means use the
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
-    `trainable_variables`: List of trainable variables.
-    `non_trainable_variables`: List of non-trainable variables.
-    `variables`: List of all variables of this layer, trainable and
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
       non-trainable.
-    `updates`: List of update ops of this layer.
-    `losses`: List of losses added by this layer.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
 
   Mutable properties:
-    `trainable`: Whether the layer should be trained (boolean).
-    `input_spec`: Optional (list of) `InputSpec` object(s) specifying the
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
       constraints on inputs that can be accepted by the layer.
   """
 
   def __init__(self, trainable=True, name=None, dtype=None,
-               activity_regularizer=None, **kwargs):
-    # We use a kwargs dict here because these kwargs only exist
-    # for compatibility reasons.
-    # The list of kwargs is subject to changes in the future.
-    # We do not want to commit to it or to expose the list to users at all.
-    # Note this is exactly as safe as defining kwargs in the function signature,
-    # the only difference being that the list of valid kwargs is defined
-    # below rather rather in the signature, and default values are defined
-    # in calls to kwargs.get().
-    allowed_kwargs = {
-        '_scope',
-        '_reuse',
-        'input_shape',  # For compatibility with Keras `Sequential` model.
-        'batch_size',  # For compatibility with Keras `Sequential` model.
-    }
-    for kwarg in kwargs:
-      if kwarg not in allowed_kwargs:
-        raise TypeError('Keyword argument not understood:', kwarg)
-
-    # Mutable properties
-    # Indicates whether the layer's weights are updated during training
-    # and whether the layer's updates are run during training
-    self.trainable = trainable
-    # A stateful layer is a layer whose updates are run during inference too,
-    # for instance stateful RNNs.
-    self.stateful = False
-    # Indicates whether `build` needs to be called upon layer call, to create
-    # the layer's weights.
-    self.built = False
-    # Provides information about which inputs are compatible with the layer.
-    self.input_spec = None
-
-    if activity_regularizer and context.in_eager_mode():
-      raise ValueError(
-          ('Activity regularization is not supported when executing eagerly. '
-           'Got activity_regularizer=%s') % (activity_regularizer,))
-    self._activity_regularizer = activity_regularizer
+               **kwargs):
+    # For backwards compatibility, legacy layers do not use `ResourceVariable`
+    # by default.
+    self._use_resource_variables = False
+    scope = kwargs.pop('_scope', None)
+    self._reuse = kwargs.pop('_reuse', None)
+
+    # Avoid an incorrect lint error
     self._trainable_weights = []
-    self._non_trainable_weights = []
-    self._updates = []
-    # When executing eagerly, _losses is a list of zero-argument lambdas which
-    # return tensors. When using graph execution, _losses is a list of ops.
-    self._losses = []
-    self._reuse = kwargs.get('_reuse')
-    self._graph = ops.get_default_graph()
-    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
-    call_fn_args = estimator_util.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in call_fn_args or
-                                   hasattr(self, 'compute_mask'))
-    self._call_has_scope_arg = 'scope' in call_fn_args
-
-    # These lists will be filled via successive calls
-    # to self._add_inbound_node().
-    self._inbound_nodes = []
-    self._outbound_nodes = []
+    self.built = False
 
-    self._init_set_name(name)
+    super(Layer, self).__init__(trainable=trainable, name=name, dtype=dtype,
+                                **kwargs)
 
-    # Determine variable scope.
-    scope = kwargs.get('_scope')
+    self._graph = None
+    self._call_has_scope_arg = 'scope' in self._call_fn_args
     if scope:
       with vs.variable_scope(scope) as captured_scope:
         self._scope = captured_scope
     else:
       self._scope = None
+    self._current_scope = None
 
-    # Set `_batch_input_shape` attribute
-    # for compatibility with Keras `Sequential` model.
-    if 'input_shape' in kwargs:
-      batch_size = kwargs.get('batch_size')
-      self._batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
+  @property
+  def graph(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.graph not supported when executing eagerly.')
+    return self._graph
 
   def _init_set_name(self, name):
     # Determine layer name (non-unique).
@@ -165,18 +109,15 @@ class Layer(object):
       self._name, base_name = self._make_unique_name()
     self._base_name = base_name
 
-  @property
-  def dtype(self):
-    return self._dtype
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def activity_regularizer(self):
-    """Optional regularizer function for the output of this layer."""
-    return self._activity_regularizer
+  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
+                        namespace='', zero_based=False):
+    base_name = base_layer.to_snake_case(self.__class__.__name__)
+    name = base_layer.unique_layer_name(base_name,
+                                        name_uid_map=name_uid_map,
+                                        avoid_names=avoid_names,
+                                        namespace=namespace,
+                                        zero_based=zero_based)
+    return (name, base_name)
 
   @property
   def scope_name(self):
@@ -188,271 +129,16 @@ class Layer(object):
                        'querying `scope_name`.')
     return self._scope.name
 
-  @property
-  def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
-
-  @property
-  def non_trainable_weights(self):
-    if self.trainable:
-      return self._non_trainable_weights
-    else:
-      return self._trainable_weights + self._non_trainable_weights
-
-  @property
-  def trainable_variables(self):
-    return self.trainable_weights
-
-  @property
-  def non_trainable_variables(self):
-    return self.non_trainable_weights
-
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.trainable_weights + self.non_trainable_weights
-
-  @property
-  def variables(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.weights
-
-  @property
-  def updates(self):
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates
-
-  def add_update(self, updates, inputs=None):
-    """Add update op(s), potentially dependent on layer inputs.
-
-    Weight updates (for instance, the updates of the moving mean and variance
-    in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing the same layer on
-    different inputs `a` and `b`, some entries in `layer.updates` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_updates_for` method allows to retrieve the updates relevant to a
-    specific set of inputs.
-
-    This call is ignored in Eager mode.
-
-    Arguments:
-      updates: Update op, or list/tuple of update ops.
-      inputs: If anything other than None is passed, it signals the updates
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for BatchNormalization updates, for instance.
-        If None, the updates will be taken into account unconditionally,
-        and you are responsible for making sure that any dependency they might
-        have is available at runtime.
-        A step counter might fall into this category.
-    """
-    if context.in_eager_mode():
-      return  # Updates already applied when in eager mode.
-
-    updates = _to_list(updates)
-    updates = [x if isinstance(x, ops.Operation)
-               else ops.convert_to_tensor(x) for x in updates]
-    self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
-    else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
-
-  def get_updates_for(self, inputs):
-    """Retrieves updates relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of update ops of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
-
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
-
-    if inputs is None:
-      # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
-
-    # Requesting input-conditional updates.
-    inputs = nest.flatten(inputs)
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
-
-  @property
-  def losses(self):
-    """Losses which are associated with this `Layer`.
-
-    Note that when executing eagerly, getting this property evaluates
-    regularizers. When using graph execution, variable regularization ops have
-    already been created and are simply returned here.
-
-    Returns:
-      A list of tensors.
-    """
-    if context.in_eager_mode():
-      # _losses may only contain variable regularization losses when executing
-      # eagerly, and they have been saved as lambdas to be executed when
-      # requested.
-      return [regularizer() for regularizer in self._losses]
-    else:
-      return self._losses
-
   def add_loss(self, losses, inputs=None):
-    """Add loss tensor(s), potentially dependent on layer inputs.
-
-    Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer. Hence, when reusing the same
-    layer on different inputs `a` and `b`, some entries in `layer.losses` may
-    be dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_losses_for` method allows to retrieve the losses relevant to a
-    specific set of inputs.
-
-    Note that `add_loss` is not supported when executing eagerly. Instead,
-    variable regularizers may be added through `add_variable`. Activity
-    regularization is not supported directly (but such losses may be returned
-    from `Layer.call()`).
-
-    Arguments:
-      losses: Loss tensor, or list/tuple of tensors.
-      inputs: If anything other than None is passed, it signals the losses
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for activity regularization losses, for instance.
-        If `None` is passed, the losses are assumed
-        to be unconditional, and will apply across all dataflows of the layer
-        (e.g. weight regularization losses).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      # TODO(fchollet): it should be possible (and highly desirable) to support
-      # `add_loss` in eager mode. This allows great convenience and flexibility
-      # in defining custom losses on the fly (e.g. in VAEs).
-      # Simply appending the loss value to `self._losses`
-      # is the correct behavior.
-      # The only caveat is that we need to force the user to only call
-      # `add_loss` from inside a model or Layer's `call` method
-      # (otherwise the loss computation cannot be backproped through).
-      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
-
-    losses = _to_list(losses)
-    self._losses += losses
-    if inputs is None:
-      for loss in losses:
-        loss._unconditional_loss = True  # pylint: disable=protected-access
-    else:
-      for loss in losses:
-        loss._unconditional_loss = False  # pylint: disable=protected-access
+    previous_losses_length = len(self._losses)
+    super(Layer, self).add_loss(losses, inputs=inputs)
     # TODO(fchollet): deprecate collection below.
-    _add_elements_to_collection(losses, ops.GraphKeys.REGULARIZATION_LOSSES)
+    new_losses = self._losses[previous_losses_length:]
+    _add_elements_to_collection(new_losses, ops.GraphKeys.REGULARIZATION_LOSSES)
 
-  def get_losses_for(self, inputs):
-    """Retrieves losses relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of loss tensors of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
-    if inputs is None:
-      # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
-
-    # Requesting input-conditional losses.
-    inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
-
-  def build(self, _):
-    """Creates the variables of the layer."""
-    self.built = True
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """The logic of the layer lives here.
-
-    Arguments:
-      inputs: input tensor(s).
-      **kwargs: additional keyword arguments.
-
-    Returns:
-      Output tensor(s).
-    """
-    return inputs
-
-  def _name_scope_name(self, current_variable_scope):
+  def _name_scope(self):
     """Determines op naming for the Layer."""
-    return current_variable_scope.original_name_scope
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
-
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`.  It need not
-        be fully defined (e.g. the batch size may be unknown).
-
-    Returns:
-      A (possibly nested tuple of) `TensorShape`.
-
-    Raises:
-      TypeError: if `input_shape` is not a (possibly nested tuple of)
-        `TensorShape`.
-      ValueError: if `input_shape` is incomplete or is incompatible with the
-        the layer.
-    """
-    raise NotImplementedError
-
-  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
-                        namespace='', zero_based=False):
-    base_name = _to_snake_case(self.__class__.__name__)
-    name = _unique_layer_name(base_name, name_uid_map=name_uid_map,
-                              avoid_names=avoid_names, namespace=namespace,
-                              zero_based=zero_based)
-    return (name, base_name)
+    return self._current_scope.original_name_scope
 
   def _set_scope(self, scope=None):
     if self._scope is None:
@@ -466,10 +152,11 @@ class Layer(object):
             scope, default_name=self._base_name) as captured_scope:
           self._scope = captured_scope
 
-  def add_variable(self, name, shape, dtype=None,
-                   initializer=None, regularizer=None,
-                   trainable=True, constraint=None,
-                   partitioner=None):
+  def add_weight(self, name, shape, dtype=None,
+                 initializer=None, regularizer=None,
+                 trainable=True, constraint=None,
+                 use_resource=None,
+                 partitioner=None):
     """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
@@ -485,6 +172,7 @@ class Layer(object):
         then this parameter is ignored and any added variables are also
         marked as non-trainable.
       constraint: constraint instance (callable).
+      use_resource: Whether to use `ResourceVariable`.
       partitioner: (optional) partitioner instance (callable).  If
         provided, when the requested variable is created it will be split
         into multiple partitions according to `partitioner`.  In this case,
@@ -503,12 +191,8 @@ class Layer(object):
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
     """
-
-    # `init_graph` should point to the graph in which variable initialization
-    # will occur; it should be None if and only if initialization will take
-    # place in the eager context.
     init_graph = None
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       default_graph = ops.get_default_graph()
       if default_graph.building_function:
         with ops.init_scope():
@@ -516,7 +200,7 @@ class Layer(object):
           # will be lifted; if initialization ops will be lifted into
           # the eager context, then there is nothing to retrieve, since variable
           # collections are not supported when eager execution is enabled.
-          if context.in_graph_mode():
+          if not context.executing_eagerly():
             init_graph = ops.get_default_graph()
             existing_variables = set(tf_variables.global_variables())
       else:
@@ -529,67 +213,43 @@ class Layer(object):
 
     self._set_scope(None)
     reuse = self.built or self._reuse
+    prev_len_trainable = len(self._trainable_weights)
     with vs.variable_scope(
         self._scope, reuse=reuse, auxiliary_name_scope=False) as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        variable = vs.get_variable(name,
-                                   shape=shape,
-                                   initializer=initializer,
-                                   dtype=dtypes.as_dtype(dtype),
-                                   constraint=constraint,
-                                   trainable=trainable and self.trainable,
-                                   partitioner=partitioner)
-
-        if init_graph is not None:  # pylint: disable=protected-access
-          # The variable was created and initialized in a graph.
-
-          if variable in existing_variables:
-            # To match the behavior of tf.get_variable(), we only apply
-            # regularization if the variable is newly created.
-            return variable
-
+      self._current_scope = scope
+      with ops.name_scope(self._name_scope()):
+        use_resource = (use_resource or
+                        self._use_resource_variables or
+                        scope.use_resource)
+        variable = super(Layer, self).add_weight(
+            name,
+            shape,
+            dtype=dtypes.as_dtype(dtype),
+            initializer=initializer or scope.initializer,
+            trainable=trainable,
+            constraint=constraint,
+            partitioner=partitioner,
+            use_resource=use_resource,
+            getter=vs.get_variable)
+
+        if regularizer:
+          if context.executing_eagerly() or variable not in existing_variables:
+            self._handle_weight_regularization(name, variable, regularizer)
+
+        if init_graph is not None:
+          # Handle edge case where a custom getter has overridden `trainable`.
+          # There is one known occurrence of this, in unit test
+          # testBasicRNNCellNotTrainable in
+          # contrib.rnn.python.kernel_tests.core_rnn_cell_test
           with init_graph.as_default():
             trainable_variables = tf_variables.trainable_variables()
           if (trainable and self.trainable and
               variable not in trainable_variables):
             # A custom getter / variable scope overrode the trainable flag.
-            trainable = False
-
-          if regularizer:
-            if isinstance(variable, tf_variables.PartitionedVariable):
-              for v in variable:
-                with ops.colocate_with(v.op):
-                  with ops.name_scope(name + '/Regularizer'):
-                    regularization = regularizer(v)
-                if regularization is not None:
-                  self.add_loss(regularization)
-            else:
-              with ops.colocate_with(variable.op):
-                with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(variable)
-              if regularization is not None:
-                self.add_loss(regularization)
-        elif regularizer:  # and initialization took place in an eager context
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            raise RuntimeError(
-                'Partitioned variable regularization is not yet '
-                'supported when executing eagerly. File a feature request'
-                'if this is important to you.')
-          # Save a zero-argument lambda which runs the regularizer on the
-          # variable, to be executed when `Layer.losses` is requested.
-          # This makes losses responsive to variable updates when executing
-          # eagerly.
-          #
-          # TODO(akshayka): Do the same for graphs as well, so that losses
-          # collected in a while_loop can be run outside its control flow
-          # context and so that losses won't be swallowed up by graph functions
-          # (i.e., `.losses()` should always create regularizers).
-          self._losses.append(lambda: regularizer(variable))
-
-    if trainable:
-      self._trainable_weights.append(variable)
-    else:
-      self._non_trainable_weights.append(variable)
+            extra_trainable_vars = self._trainable_weights[prev_len_trainable:]
+            self._trainable_weights = self._trainable_weights[
+                :prev_len_trainable]
+            self._non_trainable_weights += extra_trainable_vars
     return variable
 
   def __call__(self, inputs, *args, **kwargs):
@@ -617,31 +277,14 @@ class Layer(object):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     self._set_scope(kwargs.pop('scope', None))
-    input_list = nest.flatten(inputs)
 
-    in_graph_mode = context.in_graph_mode()
-    in_deferred_mode = isinstance(input_list[0], _DeferredTensor)
-    # Ensure the Layer, if being reused, is working with inputs from
-    # the same graph as where it was created.
-    if in_graph_mode:
+    if not context.executing_eagerly():
       try:
-        ops._get_graph_from_inputs(input_list, graph=self.graph)  # pylint: disable=protected-access
+        # Set layer's "graph" at build time
+        self._graph = ops._get_graph_from_inputs(nest.flatten(inputs),  # pylint: disable=protected-access
+                                                 graph=self._graph)
       except ValueError as e:
         raise ValueError('Input graph and Layer graph are not the same: %s' % e)
-    if in_graph_mode or in_deferred_mode:
-      user_kwargs = copy.copy(kwargs)
-
-    # Handle Keras mask propagation from previous layer to current layer.
-    previous_mask = None
-    if (not hasattr(self, '_compute_previous_mask') or
-        self._compute_previous_mask):
-      previous_mask = _collect_previous_mask(inputs)
-      if ('mask' in estimator_util.fn_args(self.call) and
-          'mask' not in kwargs and
-          not _is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
-        kwargs['mask'] = previous_mask
 
     if self.built:
       try:
@@ -658,130 +301,27 @@ class Layer(object):
     else:
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse, auxiliary_name_scope=False)
-    input_shapes = None
-    with scope_context_manager as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        if not self.built:
-          if not in_graph_mode:
-            # Activity regularization is currently unsupported in Eager mode.
-            if self._activity_regularizer:
-              raise ValueError('activity_regularizer currently unsupported in '
-                               'Eager mode. Found an activity_regularizer in '
-                               '%s(%s).' % (self.__class__.__name__, self))
-          if not in_graph_mode and not in_deferred_mode:
-            # TODO(agarwal): support _keras_history in Eager mode.
-            for x in input_list:
-              if hasattr(x, '_keras_history'):
-                raise ValueError('_keras_history currently unsupported in '
-                                 'Eager mode. Found _keras_history in %s while '
-                                 'executing __call__ for %s(%s)' %
-                                 (x, self.__class_.__name__, self))
-
-          # Check input assumptions set before layer building, e.g. input rank.
-          self._assert_input_compatibility(inputs)
-          if input_list and self._dtype is None:
-            try:
-              self._dtype = input_list[0].dtype.base_dtype.name
-            except AttributeError:
-              pass
-          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-          self.build(input_shapes)
-        try:
-          # Note: not all sub-classes of Layer call Layer.__init__ (especially
-          # the ones under tensorflow/python/keras). Hence we recompute this
-          # attribute here if it is not set.
-          # TODO(agarwal): Fix the sub-classes and avoid this complexity.
-          call_has_scope_arg = self._call_has_scope_arg
-        except AttributeError:
-          call_has_scope_arg = 'scope' in estimator_util.fn_args(self.call)
-        if call_has_scope_arg:
-          kwargs['scope'] = scope
-        # Check input assumptions set after layer building, e.g. input shape.
-        if in_graph_mode or in_deferred_mode:
-          self._assert_input_compatibility(inputs)
-
-        if not in_deferred_mode:
-          outputs = self.call(inputs, *args, **kwargs)
-          if outputs is None:
-            raise ValueError('A layer\'s `call` method should return a Tensor '
-                             'or a list of Tensors, not None.')
-        else:
-          # Deferred mode behavior: use `compute_output_shape` to
-          # infer the number of outputs of the layer and their shapes.
-          if input_shapes is None:
-            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
 
-          output_shapes = self.compute_output_shape(input_shapes)
-          output_shapes = nest.flatten(output_shapes)
-          outputs = [
-              # TODO(fchollet): name the deferred tensors?
-              _DeferredTensor(shape=shape, dtype=self._dtype)
-              for shape in output_shapes
-          ]
-          if len(outputs) == 1:
-            outputs = outputs[0]
-
-        if in_graph_mode:
-          # Apply activity regularization.
-          # Note that it should be applied every time the layer creates a new
-          # output, since it is output-specific.
-          if self._activity_regularizer:
-            output_list = nest.flatten(outputs)
-            for output in output_list:
-              with ops.name_scope('ActivityRegularizer'):
-                activity_regularization = self._activity_regularizer(output)
-              self.add_loss(activity_regularization, inputs=inputs)
+    with scope_context_manager as scope:
+      self._current_scope = scope
 
-          # TODO(fchollet): consider enabling masking for Eager mode.
-          if hasattr(self, 'compute_mask'):
-            output_mask = self.compute_mask(inputs, previous_mask)
-            if isinstance(outputs, (list, tuple)):
-              if output_mask is None:
-                output_mask = [None for _ in range(len(outputs))]
-              for x, m in zip(outputs, output_mask):
-                x._keras_mask = m  # pylint: disable=protected-access
-            else:
-              outputs._keras_mask = output_mask  # pylint: disable=protected-access
+      try:
+        call_has_scope_arg = self._call_has_scope_arg
+      except AttributeError:
+        self._call_fn_args = estimator_util.fn_args(self.call)
+        self._call_has_scope_arg = 'scope' in self._call_fn_args
+        call_has_scope_arg = self._call_has_scope_arg
+      if call_has_scope_arg:
+        kwargs['scope'] = scope
 
-    if in_graph_mode:
-      # If all input tensors have history metadata,
-      # we update the output tensors
-      # with corresponding history metadata, thus eventually allowing to use
-      # these tensors to instantiate a Network.
-      if _have_all_keras_metadata(inputs):
-        # If the layer returns tensors from its inputs, unmodified,
-        # we copy them to avoid loss of tensor metadata.
-        output_ls = nest.flatten(outputs)
-        output_ls_copy = []
-        for x in output_ls:
-          if x in input_list:
-            with ops.name_scope(scope.original_name_scope):
-              x = array_ops.identity(x)
-          output_ls_copy.append(x)
-        if len(output_ls_copy) == 1:
-          outputs = output_ls_copy[0]
-        else:
-          outputs = output_ls_copy
+      # Actually call layer
+      outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
 
+    if not context.executing_eagerly():
       # Update global default collections.
       _add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
-
-    if in_deferred_mode or in_graph_mode:
-      if _have_all_keras_metadata(inputs):
-        # Add an inbound node to the layer, so it can keep track of this call.
-        # This updates the layer history of the output tensor(s).
-        self._add_inbound_node(
-            input_tensors=inputs, output_tensors=outputs, arguments=user_kwargs)
-
-    self.built = True
     return outputs
 
-  @property
-  def graph(self):
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.graph not supported in Eager mode.')
-    return self._graph
-
   def __deepcopy__(self, memo):
     no_copy = set(['_graph'])
     shallow_copy = set(['_scope', '_always_reuse_variable_scope'])
@@ -793,675 +333,15 @@ class Layer(object):
         setattr(result, k, v)
       elif k in shallow_copy:
         setattr(result, k, copy.copy(v))
-      elif _is_tensor_or_tensor_list(v):
+      elif base_layer.is_tensor_or_tensor_list(v):
         setattr(result, k, v)
       else:
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
-  def apply(self, inputs, *args, **kwargs):
-    """Apply the layer on a input.
-
-    This simply wraps `self.__call__`.
-
-    Arguments:
-      inputs: Input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-    """
-    return self.__call__(inputs, *args, **kwargs)
-
-  def _add_inbound_node(self,
-                        input_tensors,
-                        output_tensors,
-                        arguments=None):
-    """Internal method to create an inbound node for the layer.
-
-    Arguments:
-        input_tensors: list of input tensors.
-        output_tensors: list of output tensors.
-        arguments: dictionary of keyword arguments that were passed to the
-            `call` method of the layer at the call that created the node.
-    """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
-
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
-
-    # Create node, add it to inbound nodes.
-    Node(
-        self,
-        inbound_layers=inbound_layers,
-        node_indices=node_indices,
-        tensor_indices=tensor_indices,
-        input_tensors=input_tensors,
-        output_tensors=output_tensors,
-        arguments=arguments)
-
-    # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
-
-  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
-    """Private utility to retrieves an attribute (e.g. inputs) from a node.
-
-    This is used to implement the methods:
-        - get_input_shape_at
-        - get_output_shape_at
-        - get_input_at
-        etc...
-
-    Arguments:
-        node_index: Integer index of the node from which
-            to retrieve the attribute.
-        attr: Exact node attribute name.
-        attr_name: Human-readable attribute name, for error messages.
-
-    Returns:
-        The layer's attribute `attr` at the node of index `node_index`.
-
-    Raises:
-        RuntimeError: If the layer has no inbound nodes, or if called in Eager
-        mode.
-        ValueError: If the index provided does not match any node.
-    """
-    assert context.in_graph_mode()
-    if not self._inbound_nodes:
-      raise RuntimeError('The layer has never been called '
-                         'and thus has no defined ' + attr_name + '.')
-    if not len(self._inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' +
-                       str(node_index) + ', but the layer has only ' +
-                       str(len(self._inbound_nodes)) + ' inbound nodes.')
-    values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
-      return values[0]
-    else:
-      return values
-
-  def get_input_shape_at(self, node_index):
-    """Retrieves the input shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError(
-          'Layer.get_input_shape_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'input_shapes',
-                                             'input shape')
-
-  def get_output_shape_at(self, node_index):
-    """Retrieves the output shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError(
-          'Layer.get_output_shape_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'output_shapes',
-                                             'output shape')
-
-  def get_input_at(self, node_index):
-    """Retrieves the input tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.get_input_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'input_tensors',
-                                             'input')
-
-  def get_output_at(self, node_index):
-    """Retrieves the output tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.get_output_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'output_tensors',
-                                             'output')
-
-  @property
-  def input(self):
-    """Retrieves the input tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-        Input tensor or list of input tensors.
-
-    Raises:
-        AttributeError: if the layer is connected to
-        more than one incoming layers.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-      AttributeError: If no inbound nodes are found.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.input not supported in Eager mode.')
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name +
-                           ' is not connected, no input to return.')
-    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
-
-  @property
-  def output(self):
-    """Retrieves the output tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one output,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-      Output tensor or list of output tensors.
-
-    Raises:
-      AttributeError: if the layer is connected to more than one incoming
-        layers.
-      RuntimeError: if called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.output not supported in Eager mode.')
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
-    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
-
-  @property
-  def input_shape(self):
-    """Retrieves the input shape(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer, or if all inputs
-    have the same shape.
-
-    Returns:
-        Input shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per input tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined input_shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.input_shape not supported in Eager mode.')
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined input shape.')
-    all_input_shapes = set(
-        [str(node.input_shapes) for node in self._inbound_nodes])
-    if len(all_input_shapes) == 1:
-      input_shapes = self._inbound_nodes[0].input_shapes
-      if len(input_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in input_shapes
-        ]
-    else:
-      raise AttributeError('The layer "' + str(self.name) +
-                           ' has multiple inbound nodes, '
-                           'with different input shapes. Hence '
-                           'the notion of "input shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_input_shape_at(node_index)` '
-                           'instead.')
-
-  def count_params(self):
-    """Count the total number of scalars composing the weights.
-
-    Returns:
-        An integer count.
-
-    Raises:
-        ValueError: if the layer isn't yet built
-          (in which case its weights aren't yet defined).
-    """
-    if not self.built:
-      if self.__class__.__name__ == 'Sequential':
-        self.build()  # pylint: disable=no-value-for-parameter
-      else:
-        raise ValueError('You tried to call `count_params` on ' + self.name +
-                         ', but the layer isn\'t built. '
-                         'You can build it manually via: `' + self.name +
-                         '.build(batch_input_shape)`.')
-    weight_shapes = [w.get_shape().as_list() for w in self.weights]
-    return int(sum([np.prod(w) for w in weight_shapes]))
-
-  @property
-  def output_shape(self):
-    """Retrieves the output shape(s) of a layer.
-
-    Only applicable if the layer has one output,
-    or if all outputs have the same shape.
-
-    Returns:
-        Output shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per output tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined output shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if context.in_eager_mode():
-      raise RuntimeError('Layer.output_shape not supported in Eager mode.')
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined output shape.')
-    all_output_shapes = set(
-        [str(node.output_shapes) for node in self._inbound_nodes])
-    if len(all_output_shapes) == 1:
-      output_shapes = self._inbound_nodes[0].output_shapes
-      if len(output_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in output_shapes
-        ]
-    else:
-      raise AttributeError('The layer "%s"'
-                           ' has multiple inbound nodes, '
-                           'with different output shapes. Hence '
-                           'the notion of "output shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_output_shape_at(node_index)` '
-                           'instead.' % self.name)
-
-  @property
-  def inbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._inbound_nodes
-
-  @property
-  def outbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._outbound_nodes
-
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
-
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
-
-    Arguments:
-        inputs: input tensor or list of input tensors.
-
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
-    """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
-    else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.get_shape().ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
-
-
-@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
-
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
-
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
-
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
-
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
-
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
-
-
-class Node(object):
-  """A `Node` describes the connectivity between two layers.
-
-  Each time a layer is connected to some new input,
-  a node is added to `layer._inbound_nodes`.
-  Each time the output of a layer is used by another layer,
-  a node is added to `layer._outbound_nodes`.
-
-  Arguments:
-      outbound_layer: the layer that takes
-          `input_tensors` and turns them into `output_tensors`
-          (the node gets created when the `call`
-          method of the layer was called).
-      inbound_layers: a list of layers, the same length as `input_tensors`,
-          the layers from where `input_tensors` originate.
-      node_indices: a list of integers, the same length as `inbound_layers`.
-          `node_indices[i]` is the origin node of `input_tensors[i]`
-          (necessary since each inbound layer might have several nodes,
-          e.g. if the layer is being shared with a different data stream).
-      tensor_indices: a list of integers,
-          the same length as `inbound_layers`.
-          `tensor_indices[i]` is the index of `input_tensors[i]` within the
-          output of the inbound layer
-          (necessary since each inbound layer might
-          have multiple tensor outputs, with each one being
-          independently manipulable).
-      input_tensors: list of input tensors.
-      output_tensors: list of output tensors.
-      arguments: dictionary of keyword arguments that were passed to the
-          `call` method of the layer at the call that created the node.
-
-  `node_indices` and `tensor_indices` are basically fine-grained coordinates
-  describing the origin of the `input_tensors`.
-
-  A node from layer A to layer B is added to:
-    - A._outbound_nodes
-    - B._inbound_nodes
-  """
-
-  def __init__(self,
-               outbound_layer,
-               inbound_layers,
-               node_indices,
-               tensor_indices,
-               input_tensors,
-               output_tensors,
-               arguments=None):
-    # Layer instance (NOT a list).
-    if isinstance(outbound_layer, list):
-      raise ValueError(
-          '`outbound_layer` should be a layer instance, not a list.')
-    # this is the layer that takes a list of input tensors
-    # and turns them into a list of output tensors.
-    # the current node will be added to
-    # the inbound_nodes of outbound_layer.
-    self.outbound_layer = outbound_layer
-
-    # The following 3 properties describe where
-    # the input tensors come from: which layers,
-    # and for each layer, which node and which
-    # tensor output of each node.
-
-    # List of layer instances.
-    self.inbound_layers = inbound_layers
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.node_indices = node_indices
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.tensor_indices = tensor_indices
-
-    # Following 2 properties:
-    # tensor inputs and outputs of outbound_layer.
-
-    # List of tensors. 1:1 mapping with inbound_layers.
-    self.input_tensors = input_tensors
-    # List of tensors, created by outbound_layer.call().
-    self.output_tensors = output_tensors
-
-    # Following 2 properties: input and output shapes.
-
-    # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [layers_util.static_shape(x) for x in input_tensors]
-    # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [layers_util.static_shape(x) for x in output_tensors]
-
-    # Optional keyword arguments to layer's `call`.
-    self.arguments = arguments
-
-    # Add nodes to all layers involved.
-    for layer in inbound_layers:
-      if layer is not None:
-        # For compatibility with external Keras, we use the deprecated
-        # accessor here.
-        layer.outbound_nodes.append(self)
-    # For compatibility with external Keras, we use the deprecated
-    # accessor here.
-    outbound_layer.inbound_nodes.append(self)
-
-  def get_config(self):
-    inbound_names = []
-    for layer in self.inbound_layers:
-      if layer:
-        inbound_names.append(layer.name)
-      else:
-        inbound_names.append(None)
-    return {
-        'outbound_layer': self.outbound_layer.name,
-        'inbound_layers': inbound_names,
-        'node_indices': self.node_indices,
-        'tensor_indices': self.tensor_indices
-    }
-
-
-class _DeferredTensor(object):
-  """Tensor-like object used to build graphs of layers in Eager mode.
-
-  When calling a layer on a DeferredTensor, the layer will not perform any
-  computation and will simply perfom shape inference to return new
-  DeferredTensors with appropriate shape information. Thus DeferredTensor
-  behaves like a graph-mode Tensor when manipulated by layers.
-  """
-
-  def __init__(self, shape, dtype, name=None):
-    self.shape = tensor_shape.TensorShape(shape)
-    if dtype is None:
-      self.dtype = dtypes.as_dtype(np.float32)
-    else:
-      self.dtype = dtypes.as_dtype(dtype)
-    self.name = name
-
-  def get_shape(self):
-    return self.shape
-
-  def __str__(self):
-    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-  def __repr__(self):
-    return "<_DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-
-def _is_tensor_or_tensor_list(v):
-  v = nest.flatten(v)
-  if v and isinstance(v[0], ops.Tensor):
-    return True
-  else:
-    return False
-
-
-def _to_snake_case(name):
-  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
-  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
-  # If the class is private the name starts with "_" which is not secure
-  # for creating scopes. We prefix the name with "private" in this case.
-  if insecure[0] != '_':
-    return insecure
-  return 'private' + insecure
-
-
-def _to_list(x):
-  """This normalizes a list/tuple or single element into a list.
-
-  If a single element is passed, we return
-  a list of size 1 containing the element.
-
-  Arguments:
-    x: list or tuple or single element.
-
-  Returns:
-    A list.
-  """
-  if isinstance(x, (list, tuple)):
-    return list(x)
-  return [x]
-
 
 def _add_elements_to_collection(elements, collection_list):
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('Using collections from Layers not supported in Eager '
                        'mode. Tried to add %s to %s' % (elements,
                                                         collection_list))
@@ -1474,105 +354,3 @@ def _add_elements_to_collection(elements, collection_list):
       if element not in collection_set:
         collection.append(element)
 
-
-def _is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  # We cannot use Python's `any` because the iterable may return Tensors.
-  for element in iterable:
-    if element is not None:
-      return False
-  return True
-
-
-def _have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  return all([hasattr(x, '_keras_history') for x in iterable])
-
-
-def _collect_previous_mask(input_tensors):
-  """Retrieves the output mask(s) of the previous node.
-
-  Arguments:
-      input_tensors: A tensor or list of tensors.
-
-  Returns:
-      A mask tensor or list of mask tensors.
-  """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
-
-
-# A global dictionary mapping graph objects to an index of counters used
-# for various layer names in each graph.
-# Allows to give unique autogenerated names to layers, in a graph-specific way.
-PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
-
-
-def _get_default_graph_uid_map():
-  graph = ops.get_default_graph()
-  name_uid_map = PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
-  if name_uid_map is None:
-    name_uid_map = collections.defaultdict(int)
-    PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
-  return name_uid_map
-
-
-def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
-                       zero_based=False):
-  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
-
-  Arguments:
-    name: String name to make unique.
-    name_uid_map: An optional defaultdict(int) to use when creating unique
-      names. If None (default), uses a per-Graph dictionary.
-    avoid_names: An optional set or dict with names which should not be used. If
-      None (default) does not avoid any names.
-    namespace: Gets a name which is unique within the (graph, namespace). Layers
-      which are not Networks use a blank namespace and so get graph-global
-      names.
-    zero_based: If True, name sequences start with no suffix (e.g. "dense",
-      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
-
-  Returns:
-    Unique string name.
-
-  Example:
-
-  ```python
-  _unique_layer_name('dense')  # dense_1
-  _unique_layer_name('dense')  # dense_2
-  ```
-  """
-  if name_uid_map is None:
-    name_uid_map = _get_default_graph_uid_map()
-  if avoid_names is None:
-    avoid_names = set()
-  proposed_name = None
-  while proposed_name is None or proposed_name in avoid_names:
-    name_key = (namespace, name)
-    if zero_based:
-      number = name_uid_map[name_key]
-      if number:
-        proposed_name = name + '_' + str(number)
-      else:
-        proposed_name = name
-      name_uid_map[name_key] += 1
-    else:
-      name_uid_map[name_key] += 1
-      proposed_name = name + '_' + str(name_uid_map[name_key])
-  return proposed_name
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 91b8988d31c1f04be8134733e5e919c738ccb74f..f08b552840f5ff9144edae1cb0f90a1bc3db0f8c 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -44,7 +44,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.variables, [])
     self.assertEqual(layer.trainable_variables, [])
     self.assertEqual(layer.non_trainable_variables, [])
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       # updates, losses only supported in GRAPH mode
       self.assertEqual(layer.updates, [])
       self.assertEqual(layer.losses, [])
@@ -52,6 +52,12 @@ class BaseLayerTest(test.TestCase):
     layer = base_layers.Layer(name='my_layer', trainable=False)
     self.assertEqual(layer.trainable, False)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testInt64Layer(self):
+    layer = base_layers.Layer(name='my_layer', dtype='int64')
+    layer.add_variable('my_var', [2, 2])
+    self.assertEqual(layer.name, 'my_layer')
+
   @test_util.run_in_graph_and_eager_modes()
   def testAddWeight(self):
     layer = base_layers.Layer(name='my_layer')
@@ -63,7 +69,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.variables, [variable])
     self.assertEqual(layer.trainable_variables, [variable])
     self.assertEqual(layer.non_trainable_variables, [])
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(
           layer.variables,
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
@@ -77,7 +83,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.variables, [variable, variable_2])
     self.assertEqual(layer.trainable_variables, [variable])
     self.assertEqual(layer.non_trainable_variables, [variable_2])
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 1)
 
@@ -94,61 +100,6 @@ class BaseLayerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
         core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
 
-  def testGetVariable(self):
-    with self.test_session():
-
-      class MyLayer(base_layers.Layer):
-
-        def build(self, input_shape):
-          self.my_var = self.add_variable(
-              'my_var', [2, 2], initializer=init_ops.zeros_initializer())
-
-        def call(self, inputs):
-          return inputs * 2
-
-      layer = MyLayer(name='my_layer')
-      inputs = random_ops.random_uniform((5,), seed=1)
-      layer.apply(inputs)
-      layer.apply(inputs)
-      self.assertEqual([v.name for v in layer.variables],
-                       ['my_layer/my_var:0'])
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time.  It uses scope "<current scope>/base_name"
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope'):
-        with variable_scope.variable_scope('my_layer'):
-          variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope/my_layer')
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time. If 'scope' argument is passed to
-      # apply(), it uses that scope when accessing variables.
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope') as new_scope:
-        variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs, scope=new_scope)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope')
-
-      # Checking for graph equality is only done in GRAPH mode.
-      with ops.Graph().as_default():
-        inputs_ng = random_ops.random_uniform((5,), seed=1)
-        with self.assertRaisesRegexp(ValueError, r'graph are not the same'):
-          layer.apply(inputs_ng)
-
   @test_util.run_in_graph_and_eager_modes()
   def testCall(self):
 
@@ -161,42 +112,10 @@ class BaseLayerTest(test.TestCase):
     inputs = random_ops.random_uniform((5,), seed=1)
     outputs = layer.apply(inputs)
     self.assertEqual(layer.built, True)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       # op is only supported in GRAPH mode
       self.assertEqual(outputs.op.name, 'my_layer/Square')
 
-  def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self):
-    # Note that this test is only run in Graph mode since with EAGER mode we can
-    # still create a new variable on second call.
-
-    class MyLayer(base_layers.Layer):
-
-      def build(self, _):
-        # Do not mark the layer as built.
-        pass
-
-      def call(self, inputs):
-        self.my_var = self.add_variable('my_var', [2, 2])
-        if self.built:
-          # Skip creating on the first call; try to create after it's
-          # built.  This is expected to fail.
-          self.add_variable('this_will_break_on_second_call', [2, 2])
-        return inputs + math_ops.square(self.my_var)
-
-    layer = MyLayer(name='my_layer')
-    inputs = random_ops.random_uniform((2,), seed=1)
-    outputs = layer.apply(inputs)
-    self.assertEqual(layer.built, True)
-    self.assertEqual(outputs.op.name, 'my_layer/add')
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-    with self.assertRaisesRegexp(ValueError,
-                                 'my_layer/this_will_break_on_second_call'):
-      layer.apply(inputs)
-    # The list of variables hasn't changed.
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-
   @test_util.run_in_graph_and_eager_modes()
   def testDeepCopy(self):
 
@@ -210,7 +129,7 @@ class BaseLayerTest(test.TestCase):
     inputs = random_ops.random_uniform((5,), seed=1)
     outputs = layer.apply(inputs)
     self.assertEqual(layer.built, True)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       # op only supported in GRAPH mode.
       self.assertEqual(outputs.op.name, 'my_layer/Square')
 
@@ -280,7 +199,7 @@ class BaseLayerTest(test.TestCase):
       def call(self, inputs):
         return inputs
 
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       layer = CustomerLayer()
       with self.assertRaisesRegexp(ValueError, r'requires a defined rank'):
         layer.apply(array_ops.placeholder('int32'))
@@ -307,7 +226,7 @@ class BaseLayerTest(test.TestCase):
       def call(self, inputs):
         return inputs
 
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       layer = CustomerLayer()
       with self.assertRaisesRegexp(ValueError, r'requires a defined rank'):
         layer.apply(array_ops.placeholder('int32'))
@@ -335,7 +254,7 @@ class BaseLayerTest(test.TestCase):
       def call(self, inputs):
         return inputs
 
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       layer = CustomerLayer()
       with self.assertRaisesRegexp(ValueError, r'requires a defined rank'):
         layer.apply(array_ops.placeholder('int32'))
@@ -430,7 +349,7 @@ class BaseLayerTest(test.TestCase):
     layer.apply(constant_op.constant(1))
 
     # Works
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       layer.apply(array_ops.placeholder('int32'))
       layer.apply(array_ops.placeholder('int32', shape=(2, 3)))
 
@@ -453,13 +372,7 @@ class BaseLayerTest(test.TestCase):
         return {'l' + key: inputs[key] for key in inputs}
 
     layer = DictLayer()
-    if context.in_graph_mode():
-      i1 = array_ops.placeholder('int32')
-      i2 = array_ops.placeholder('float32')
-      result = layer.apply({'abel': i1, 'ogits': i2})
-      self.assertTrue(isinstance(result, dict))
-      self.assertEqual(set(['label', 'logits']), set(result.keys()))
-    else:
+    if context.executing_eagerly():
       i1 = constant_op.constant(3)
       i2 = constant_op.constant(4.0)
       result = layer.apply({'abel': i1, 'ogits': i2})
@@ -467,6 +380,12 @@ class BaseLayerTest(test.TestCase):
       self.assertEqual(set(['label', 'logits']), set(result.keys()))
       self.assertEqual(3, result['label'].numpy())
       self.assertEqual(4.0, result['logits'].numpy())
+    else:
+      i1 = array_ops.placeholder('int32')
+      i2 = array_ops.placeholder('float32')
+      result = layer.apply({'abel': i1, 'ogits': i2})
+      self.assertTrue(isinstance(result, dict))
+      self.assertEqual(set(['label', 'logits']), set(result.keys()))
 
   def testActivityRegularizer(self):
     regularizer = math_ops.reduce_sum
@@ -643,6 +562,17 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1)
     self.assertEqual(len(layer.get_losses_for([outputs])), 0)
 
+  def testLayerGraphSetInFirstApply(self):
+    with ops.Graph().as_default():
+      # Graph at construction time is ignored
+      layer = core_layers.Dense(1)
+    with ops.Graph().as_default():
+      layer.apply(constant_op.constant([[1.]]))
+      # layer is now bound to second Graph
+    with ops.Graph().as_default(), self.assertRaisesRegexp(
+        ValueError, 'Input graph and Layer graph are not the same'):
+      layer.apply(constant_op.constant([[1.]]))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index bb10fe5e8bfd26e4877fb6aef73980a30f62bb5d..34a1487e748e41eebae8b87b17c34d0deda8597f 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -32,199 +33,8 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Conv(base.Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    kernel_regularizer: Optional regularizer for the convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    kernel_constraint: Optional projection function to be applied to the
-        kernel after being updated by an `Optimizer` (e.g. used to implement
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               activation=None,
-               use_bias=True,
-               kernel_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               kernel_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_Conv, self).__init__(trainable=trainable, name=name,
-                                activity_regularizer=activity_regularizer,
-                                **kwargs)
-    self.rank = rank
-    self.filters = filters
-    self.kernel_size = utils.normalize_tuple(kernel_size, rank, 'kernel_size')
-    self.strides = utils.normalize_tuple(strides, rank, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.dilation_rate = utils.normalize_tuple(
-        dilation_rate, rank, 'dilation_rate')
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    kernel_shape = self.kernel_size + (input_dim, self.filters)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    self._convolution_op = nn_ops.Convolution(
-        input_shape,
-        filter_shape=self.kernel.get_shape(),
-        dilation_rate=self.dilation_rate,
-        strides=self.strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format,
-                                              self.rank + 2))
-    self.built = True
-
-  def call(self, inputs):
-    outputs = self._convolution_op(inputs, self.kernel)
-
-    if self.use_bias:
-      if self.data_format == 'channels_first':
-        if self.rank == 1:
-          # nn.bias_add does not accept a 1D input tensor.
-          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
-          outputs += bias
-        if self.rank == 2:
-          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
-      else:
-        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_last':
-      space = input_shape[1:-1]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0]] + new_space +
-                                      [self.filters])
-    else:
-      space = input_shape[2:]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0], self.filters] +
-                                      new_space)
-
-
 @tf_export('layers.Conv1D')
-class Conv1D(_Conv):
+class Conv1D(keras_layers.Conv1D, base.Layer):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -292,8 +102,7 @@ class Conv1D(_Conv):
                trainable=True,
                name=None,
                **kwargs):
-    super(Convolution1D, self).__init__(
-        rank=1,
+    super(Conv1D, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -415,7 +224,7 @@ def conv1d(inputs,
 
 
 @tf_export('layers.Conv2D')
-class Conv2D(_Conv):
+class Conv2D(keras_layers.Conv2D, base.Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -491,7 +300,6 @@ class Conv2D(_Conv):
                name=None,
                **kwargs):
     super(Conv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -620,7 +428,7 @@ def conv2d(inputs,
 
 
 @tf_export('layers.Conv3D')
-class Conv3D(_Conv):
+class Conv3D(keras_layers.Conv3D, base.Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -697,7 +505,6 @@ class Conv3D(_Conv):
                name=None,
                **kwargs):
     super(Conv3D, self).__init__(
-        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -826,169 +633,8 @@ def conv3d(inputs,
   return layer.apply(inputs)
 
 
-class _SeparableConv(_Conv):
-  """Abstract base layer for separable nD convolution.
-
-  This layer performs a depthwise convolution that acts separately on
-  channels, followed by a pointwise convolution that mixes channels.
-  If `use_bias` is True and a bias initializer is provided,
-  it adds a bias vector to the output.
-  It then optionally applies an activation function to produce the final output.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: A tuple or list of integers specifying the spatial
-      dimensions of the filters. Can be a single integer to specify the same
-      value for all spatial dimensions.
-    strides: A tuple or list of integers specifying the strides
-      of the convolution. Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any `stride` value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    depth_multiplier: The number of depthwise convolution output channels for
-      each input channel. The total number of depthwise convolution output
-      channels will be equal to `num_filters_in * depth_multiplier`.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    depthwise_initializer: An initializer for the depthwise convolution kernel.
-    pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    depthwise_regularizer: Optional regularizer for the depthwise
-      convolution kernel.
-    pointwise_regularizer: Optional regularizer for the pointwise
-      convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    depthwise_constraint: Optional projection function to be applied to the
-        depthwise kernel after being updated by an `Optimizer` (e.g. used for
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    pointwise_constraint: Optional projection function to be applied to the
-        pointwise kernel after being updated by an `Optimizer`.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self,
-               rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               depth_multiplier=1,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer=None,
-               pointwise_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               depthwise_regularizer=None,
-               pointwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               pointwise_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_SeparableConv, self).__init__(
-        rank=rank,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        trainable=trainable,
-        name=name,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = depthwise_initializer
-    self.pointwise_initializer = pointwise_initializer
-    self.depthwise_regularizer = depthwise_regularizer
-    self.pointwise_regularizer = pointwise_regularizer
-    self.depthwise_constraint = depthwise_constraint
-    self.pointwise_constraint = pointwise_constraint
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    depthwise_kernel_shape = self.kernel_size + (input_dim,
-                                                 self.depth_multiplier)
-    pointwise_kernel_shape = (
-        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
-
-    self.depthwise_kernel = self.add_variable(
-        name='depthwise_kernel',
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    self.pointwise_kernel = self.add_variable(
-        name='pointwise_kernel',
-        shape=pointwise_kernel_shape,
-        initializer=self.pointwise_initializer,
-        regularizer=self.pointwise_regularizer,
-        constraint=self.pointwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    raise NotImplementedError
-
-
 @tf_export('layers.SeparableConv1D')
-class SeparableConv1D(_SeparableConv):
+class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1070,7 +716,6 @@ class SeparableConv1D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv1D, self).__init__(
-        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1094,45 +739,9 @@ class SeparableConv1D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides * 2 + (1,)
-      spatial_start_dim = 1
-    else:
-      strides = (1, 1) + self.strides * 2
-      spatial_start_dim = 2
-
-    # Explicitly broadcast inputs and kernels to 4D.
-    # TODO(fchollet): refactor when a native separable_conv1d op is available.
-    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
-    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
-    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
-    dilation_rate = (1,) + self.dilation_rate
-
-    outputs = nn.separable_conv2d(
-        inputs,
-        depthwise_kernel,
-        pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.SeparableConv2D')
-class SeparableConv2D(_SeparableConv):
+class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
   """Depthwise separable 2D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1219,7 +828,6 @@ class SeparableConv2D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1243,31 +851,6 @@ class SeparableConv2D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    # Apply the actual ops.
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides + (1,)
-    else:
-      strides = (1, 1) + self.strides
-    outputs = nn.separable_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        self.pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=self.dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.separable_conv1d')
 def separable_conv1d(inputs,
@@ -1509,7 +1092,7 @@ def separable_conv2d(inputs,
 
 
 @tf_export('layers.Conv2DTranspose')
-class Conv2DTranspose(Conv2D):
+class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -1574,8 +1157,8 @@ class Conv2DTranspose(Conv2D):
                name=None,
                **kwargs):
     super(Conv2DTranspose, self).__init__(
-        filters,
-        kernel_size,
+        filters=filters,
+        kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
@@ -1591,120 +1174,6 @@ class Conv2DTranspose(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def build(self, input_shape):
-    if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. Received input shape: ' +
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
-    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_height, out_width)
-      strides = (1, 1, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_height, out_width, self.filters)
-      strides = (1, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv2d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if context.in_graph_mode():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv2d_transpose')
@@ -1804,7 +1273,7 @@ def conv2d_transpose(inputs,
 
 
 @tf_export('layers.Conv3DTranspose')
-class Conv3DTranspose(Conv3D):
+class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
   Arguments:
@@ -1883,153 +1352,6 @@ class Conv3DTranspose(Conv3D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def build(self, input_shape):
-    if len(input_shape) != 5:
-      raise ValueError('Inputs should have rank 5, received input shape:',
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined, found None: ' + str(input_shape))
-    input_dim = input_shape[channel_axis]
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(
-        'kernel',
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(
-          'bias',
-          shape=(self.filters,),
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint,
-          trainable=True,
-          dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    self.input_spec = base.InputSpec(ndim=5,
-                                     axes={c_axis: inputs_shape[c_axis]})
-
-    depth = inputs_shape[d_axis]
-    height = inputs_shape[h_axis]
-    width = inputs_shape[w_axis]
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_depth = utils.deconv_output_length(depth,
-                                           kernel_d,
-                                           self.padding,
-                                           stride_d)
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_depth, out_height,
-                      out_width)
-      strides = (1, 1, stride_d, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_depth, out_height, out_width,
-                      self.filters)
-      strides = (1, stride_d, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv3d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        data_format=utils.convert_data_format(self.data_format, ndim=5),
-        padding=self.padding.upper())
-
-    if context.in_graph_mode():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis],
-                                                     kernel_d,
-                                                     self.padding,
-                                                     stride_d)
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], kernel_d, self.padding, stride_d)
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv3d_transpose')
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 160e732b6798697d05815e13a7b1c399070f0783..cdb42f5bd18292cad9d8536e88ea1c58c1d7d777 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -325,6 +325,12 @@ class ConvTest(test.TestCase):
     self.assertEqual(conv3d.kernel_constraint, k_constraint)
     self.assertEqual(conv3d.bias_constraint, b_constraint)
 
+  def testConv3DChannelsFirst(self):
+    # Test case for GitHub issue 15655
+    images = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[None, 1, 32, 32, 32])
+    conv_layers.conv3d(images, 32, 9, data_format='channels_first')
+
 
 @test_util.with_c_api
 class SeparableConv1DTest(test.TestCase):
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 6970bf9234f5a31ee8093069ac1c933bcdb6f103..6d8e9eac878bb2eb65bfa29e872a0576a39af662 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -27,22 +27,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.Dense')
-class Dense(base.Layer):
+class Dense(keras_layers.Dense, base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
@@ -107,73 +99,19 @@ class Dense(base.Layer):
                trainable=True,
                name=None,
                **kwargs):
-    super(Dense, self).__init__(trainable=trainable, name=name,
+    super(Dense, self).__init__(units=units,
+                                activation=activation,
+                                use_bias=use_bias,
+                                kernel_initializer=kernel_initializer,
+                                bias_initializer=bias_initializer,
+                                kernel_regularizer=kernel_regularizer,
+                                bias_regularizer=bias_regularizer,
                                 activity_regularizer=activity_regularizer,
+                                kernel_constraint=kernel_constraint,
+                                bias_constraint=bias_constraint,
+                                trainable=trainable,
+                                name=name,
                                 **kwargs)
-    self.units = units
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape[-1].value is None:
-      raise ValueError('The last dimension of the inputs to `Dense` '
-                       'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(min_ndim=2,
-                                     axes={-1: input_shape[-1].value})
-    self.kernel = self.add_variable('kernel',
-                                    shape=[input_shape[-1].value, self.units],
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    if self.use_bias:
-      self.bias = self.add_variable('bias',
-                                    shape=[self.units,],
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-    shape = inputs.get_shape().as_list()
-    if len(shape) > 2:
-      # Broadcasting is required for the inputs.
-      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
-                                                             [0]])
-      # Reshape the output back to the original ndim of the input.
-      if context.in_graph_mode():
-        output_shape = shape[:-1] + [self.units]
-        outputs.set_shape(output_shape)
-    else:
-      outputs = standard_ops.matmul(inputs, self.kernel)
-    if self.use_bias:
-      outputs = nn.bias_add(outputs, self.bias)
-    if self.activation is not None:
-      return self.activation(outputs)  # pylint: disable=not-callable
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    input_shape = input_shape.with_rank_at_least(2)
-    if input_shape[-1].value is None:
-      raise ValueError(
-          'The innermost dimension of input_shape must be defined, but saw: %s'
-          % input_shape)
-    return input_shape[:-1].concatenate(self.units)
 
 
 @tf_export('layers.dense')
@@ -253,7 +191,7 @@ def dense(
 
 
 @tf_export('layers.Dropout')
-class Dropout(base.Layer):
+class Dropout(keras_layers.Dropout, base.Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting a fraction `rate` of input units to 0
@@ -281,31 +219,14 @@ class Dropout(base.Layer):
                seed=None,
                name=None,
                **kwargs):
-    super(Dropout, self).__init__(name=name, **kwargs)
-    self.rate = rate
-    self.noise_shape = noise_shape
-    self.seed = seed
-
-  def _get_noise_shape(self, inputs):
-    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
-    # which will override `self.noise_shape`, and allows for custom noise
-    # shapes with dynamically sized inputs.
-    if self.noise_shape is None:
-      return self.noise_shape
-    return nn_ops._get_noise_shape(inputs, self.noise_shape)
+    super(Dropout, self).__init__(rate=rate,
+                                  noise_shape=noise_shape,
+                                  seed=seed,
+                                  name=name,
+                                  **kwargs)
 
   def call(self, inputs, training=False):
-
-    def dropped_inputs():
-      return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self._get_noise_shape(inputs),
-                        seed=self.seed)
-    return utils.smart_cond(training,
-                            dropped_inputs,
-                            lambda: array_ops.identity(inputs))
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(Dropout, self).call(inputs, training=training)
 
 
 @tf_export('layers.dropout')
@@ -351,7 +272,7 @@ def dropout(inputs,
 
 
 @tf_export('layers.Flatten')
-class Flatten(base.Layer):
+class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
   Examples:
@@ -366,25 +287,7 @@ class Flatten(base.Layer):
     # now `y` has shape `(None, None)`
   ```
   """
-
-  def __init__(self, **kwargs):
-    super(Flatten, self).__init__(**kwargs)
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def call(self, inputs):
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
-    if context.in_graph_mode():
-      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = [input_shape[0]]
-    if all(input_shape[1:]):
-      output_shape += [np.prod(input_shape[1:])]
-    else:
-      output_shape += [None]
-    return tensor_shape.TensorShape(output_shape)
+  pass
 
 
 @tf_export('layers.flatten')
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 15ce6cba21fcc78126f7db58ab18934db69c15fd..cf45b07637108422f1c612390bb01efdad6d5bcf 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -67,7 +67,7 @@ class DenseTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllEqual(x.eval(), [[0.0]])
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testCall(self):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
     inputs = random_ops.random_uniform((5, 4), seed=1)
@@ -77,12 +77,20 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.trainable_variables,
                          [dense.kernel, dense.bias])
     self.assertListEqual(dense.non_trainable_variables, [])
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
     self.assertEqual(dense.kernel.name, 'my_dense/kernel:0')
     self.assertEqual(dense.bias.name, 'my_dense/bias:0')
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testNoEagerLeak(self):
+    # Tests that repeatedly constructing and building a Layer does not leak
+    # Python objects.
+    inputs = random_ops.random_uniform((5, 4), seed=1)
+    core_layers.Dense(5)(inputs)
+    core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')(inputs)
+
   @test_util.run_in_graph_and_eager_modes()
   def testCallTensorDot(self):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
@@ -98,7 +106,7 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.variables, [dense.kernel])
     self.assertListEqual(dense.trainable_variables, [dense.kernel])
     self.assertListEqual(dense.non_trainable_variables, [])
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 1)
     self.assertEqual(dense.kernel.name, 'my_dense/kernel:0')
@@ -113,7 +121,7 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.non_trainable_variables,
                          [dense.kernel, dense.bias])
     self.assertListEqual(dense.trainable_variables, [])
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 0)
 
@@ -162,13 +170,13 @@ class DenseTest(test.TestCase):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='dense1')
     inputs = random_ops.random_uniform((5, 3), seed=1)
     outputs = dense(inputs)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(outputs.op.name, 'dense1/Relu')
 
     dense = core_layers.Dense(2, name='dense2')
     inputs = random_ops.random_uniform((5, 3), seed=1)
     outputs = dense(inputs)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.assertEqual(outputs.op.name, 'dense2/BiasAdd')
 
   def testActivityRegularizer(self):
@@ -374,7 +382,7 @@ class DropoutTest(test.TestCase):
     dp = core_layers.Dropout(0.5)
     inputs = array_ops.ones((5, 3))
     dropped = dp.apply(inputs, training=True)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self.evaluate(variables.global_variables_initializer())
     np_output = self.evaluate(dropped)
     self.assertAlmostEqual(0., np_output.min())
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 1555846efde812b9e31f48315decaf1f86aa4f70..11a2ebc040f0177e38d5b0f38cf609071f91fd07 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -14,61 +14,17 @@
 # ==============================================================================
 
 # pylint: disable=line-too-long
-"""This library provides a set of high-level neural networks layers.
-
-@@Dense
-@@Dropout
-@@Flatten
-@@Conv1D
-@@Conv2D
-@@Conv3D
-@@SeparableConv1D
-@@SeparableConv2D
-@@Conv2DTranspose
-@@Conv3DTranspose
-@@AveragePooling1D
-@@MaxPooling1D
-@@AveragePooling2D
-@@MaxPooling2D
-@@AveragePooling3D
-@@MaxPooling3D
-@@BatchNormalization
-
-@@Layer
-@@Input
-@@InputSpec
-
-@@dense
-@@dropout
-@@flatten
-@@conv1d
-@@conv2d
-@@conv3d
-@@separable_conv1d
-@@separable_conv2d
-@@conv2d_transpose
-@@conv3d_transpose
-@@average_pooling1d
-@@max_pooling1d
-@@average_pooling2d
-@@max_pooling2d
-@@average_pooling3d
-@@max_pooling3d
-@@batch_normalization
-"""
+"""This library provides a set of high-level neural networks layers."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
-
 # pylint: disable=g-bad-import-order,unused-import
 
 # Base objects.
 from tensorflow.python.layers.base import Layer
 from tensorflow.python.layers.base import InputSpec
-from tensorflow.python.layers.network import Input
 
 # Core layers.
 from tensorflow.python.layers.core import Dense
@@ -123,7 +79,3 @@ from tensorflow.python.layers.normalization import BatchNormalization
 from tensorflow.python.layers.normalization import batch_normalization
 
 # pylint: enable=g-bad-import-order,unused-import
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py
deleted file mode 100644
index 765a1c4fdafdfdc5d3ea6629d4d9290d8b658902..0000000000000000000000000000000000000000
--- a/tensorflow/python/layers/maxout.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# pylint: disable=unused-import,g-bad-import-order
-"""Contains the maxout layer
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import gen_array_ops
-
-from tensorflow.python.layers import base
-
-
-def maxout(inputs, num_units, axis=-1, name=None):
-  """Adds a maxout op from https://arxiv.org/abs/1302.4389
-
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
-  Courville,
-   Yoshua Bengio
-
-   Usually the operation is performed in the filter/channel dimension. This can
-   also be
-   used after fully-connected layers to reduce number of features.
-
-   Arguments:
-   inputs: Tensor input
-   num_units: Specifies how many features will remain after maxout in the `axis`
-     dimension
-         (usually channel). This must be multiple of number of `axis`.
-   axis: The dimension where max pooling will be performed. Default is the
-   last dimension.
-   name: Optional scope for name_scope.
-
-   Returns:
-    A `Tensor` representing the results of the pooling operation.
-
-   Raises:
-    ValueError: if num_units is not multiple of number of features.
-  """
-  return MaxOut(num_units=num_units, axis=axis, name=name)(inputs)
-
-
-class MaxOut(base.Layer):
-  """Adds a maxout op from https://arxiv.org/abs/1302.4389
-
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
-  Courville, Yoshua
-  Bengio
-
-  Usually the operation is performed in the filter/channel dimension. This can
-  also be
-  used after fully-connected layers to reduce number of features.
-
-  Arguments:
-    inputs: Tensor input
-    num_units: Specifies how many features will remain after maxout in the
-      `axis` dimension
-         (usually channel).
-    This must be multiple of number of `axis`.
-    axis: The dimension where max pooling will be performed. Default is the
-    last dimension.
-    name: Optional scope for name_scope.
-
-  Returns:
-    A `Tensor` representing the results of the pooling operation.
-
-  Raises:
-    ValueError: if num_units is not multiple of number of features.
-  """
-
-  def __init__(self, num_units, axis=-1, name=None, **kwargs):
-    super(MaxOut, self).__init__(name=name, trainable=False, **kwargs)
-    self.axis = axis
-    self.num_units = num_units
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs)
-    shape = inputs.get_shape().as_list()
-    num_channels = shape[self.axis]
-    if num_channels % self.num_units:
-      raise ValueError('number of features({}) is not '
-                       'a multiple of num_units({})'.format(
-                           num_channels, self.num_units))
-    shape[self.axis] = -1
-    shape += [num_channels // self.num_units]
-
-    # Dealing with batches with arbitrary sizes
-    for i in range(len(shape)):
-      if shape[i] is None:
-        shape[i] = gen_array_ops.shape(inputs)[i]
-    outputs = math_ops.reduce_max(
-        gen_array_ops.reshape(inputs, shape), -1, keepdims=False)
-
-    return outputs
diff --git a/tensorflow/python/layers/maxout_test.py b/tensorflow/python/layers/maxout_test.py
deleted file mode 100644
index 26acac57c41da759f288f255c0cd523f9c6b1dbd..0000000000000000000000000000000000000000
--- a/tensorflow/python/layers/maxout_test.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# pylint: disable=unused-import,g-bad-import-order
-
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.layers import maxout
-from tensorflow.python.layers import convolutional as conv_layers
-from tensorflow.python.layers import core as core_layers
-
-from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import test
-import numpy as np
-
-"""
-Contains the maxout layer tests
-"""
-
-
-class MaxOutTest(test.TestCase):
-  def test_simple(self):
-    inputs = random_ops.random_uniform((64, 10, 36), seed=1)
-    graph = maxout.maxout(inputs, num_units=3)
-    self.assertEqual(graph.get_shape().as_list(), [64, 10, 3])
-
-  def test_fully_connected(self):
-    inputs = random_ops.random_uniform((64, 50), seed=1)
-    graph = core_layers.dense(inputs, 50)
-    graph = maxout.maxout(graph, num_units=10)
-    self.assertEqual(graph.get_shape().as_list(), [64, 10])
-
-  def test_nchw(self):
-    inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1)
-    graph = conv_layers.conv2d(inputs, 10, 3, padding="SAME")
-    graph = maxout.maxout(graph, num_units=1)
-    self.assertEqual(graph.get_shape().as_list(), [10, 100, 100, 1])
-
-  def test_invalid_shape(self):
-    inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1)
-    graph = conv_layers.conv2d(inputs, 3, 10, strides=(1, 1))
-    with self.assertRaisesRegexp(ValueError, 'number of features'):
-      graph = maxout.maxout(graph, num_units=2)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
deleted file mode 100644
index 9f16559687c52a1149b78a1ccca796cadd8208d0..0000000000000000000000000000000000000000
--- a/tensorflow/python/layers/network.py
+++ /dev/null
@@ -1,1024 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Contains Network, a composition of layers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-from tensorflow.python.eager import context
-from tensorflow.python.estimator import util as estimator_util
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import base
-from tensorflow.python.layers import utils as layers_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import tf_export
-
-
-class InputLayer(base.Layer):
-  """Layer to be used as an entry point into a Network (a graph of layers).
-
-  It can either wrap an existing tensor (pass an `input_tensor` argument)
-  or create its a placeholder tensor (pass arguments `input_shape`
-  as well as `dtype`).
-
-  It is generally recommend to use the functional layer API via `Input`,
-  (which creates an `InputLayer`) without directly using `InputLayer`.
-
-  Arguments:
-      input_shape: Shape tuple (not including the batch axis), or `TensorShape`
-        instance (not including the batch axis).
-      batch_size: Optional input batch size (integer or None).
-      dtype: Datatype of the input.
-      input_tensor: Optional tensor to use as layer input
-          instead of creating a placeholder.
-      sparse: Boolean, whether the placeholder created
-          is meant to be sparse.
-      name: Name of the layer (string).
-
-    Raises:
-      RuntimeError: If created in Eager mode.
-  """
-
-  def __init__(self,
-               input_shape=None,
-               batch_size=None,
-               dtype=dtypes.float32,
-               input_tensor=None,
-               sparse=False,
-               name=None):
-    super(InputLayer, self).__init__(dtype=dtype, name=name)
-    self.built = True
-    self.sparse = sparse
-    self.batch_size = batch_size
-
-    if isinstance(input_shape, tensor_shape.TensorShape):
-      input_shape = tuple(input_shape.as_list())
-
-    if input_tensor is None:
-      if input_shape is not None:
-        batch_input_shape = (batch_size,) + tuple(input_shape)
-      else:
-        batch_input_shape = None
-
-      if context.in_eager_mode():
-        # In eager mode, create a temporary placeholder to call the layer on.
-        input_tensor = base._DeferredTensor(  # pylint: disable=protected-access
-            shape=batch_input_shape,
-            dtype=dtype,
-            name=self.name)
-      else:
-        # In graph mode, create a graph placeholder to call the layer on.
-        if sparse:
-          input_tensor = array_ops.sparse_placeholder(
-              shape=batch_input_shape,
-              dtype=dtype,
-              name=self.name)
-        else:
-          input_tensor = array_ops.placeholder(
-              shape=batch_input_shape,
-              dtype=dtype,
-              name=self.name)
-
-      # For compatibility with Keras API.
-      self.is_placeholder = True
-      self._batch_input_shape = batch_input_shape
-    else:
-      # For compatibility with Keras API.
-      self.is_placeholder = False
-      self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
-
-    # Create an input node to add to self.outbound_node
-    # and set output_tensors' _keras_history.
-    input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
-    base.Node(
-        self,
-        inbound_layers=[],
-        node_indices=[],
-        tensor_indices=[],
-        input_tensors=[input_tensor],
-        output_tensors=[input_tensor])
-
-
-@tf_export('layers.Input')
-def Input(  # pylint: disable=invalid-name
-    shape=None,
-    batch_size=None,
-    name=None,
-    dtype=dtypes.float32,
-    sparse=False,
-    tensor=None):
-  """`Input()` is used to instantiate an input tensor for use with a `Network`.
-
-  For instance, if a, b and c are tensors created via `Input`,
-  it becomes possible to do:
-
-  `network = Network(inputs=[a, b], outputs=c)`
-
-  Example:
-
-      ```python
-      # This is a logistic regression
-      x = tf.layers.Input(shape=(32,))
-      y = tf.layers.Dense(16, activation='softmax')(x)
-      network = tf.layers.Network(x, y)
-      ```
-
-  Arguments:
-      shape: A shape tuple (integer), not including the batch size.
-          For instance, `shape=(32,)` indicates that the expected input
-          will be batches of 32-dimensional vectors.
-      batch_size: Optional input batch size (integer or None).
-      name: An optional name string for the layer.
-          Should be unique in a model (do not reuse the same name twice).
-          It will be autogenerated if it isn't provided.
-      dtype: The data type expected by the input, as a string
-          (`float32`, `float64`, `int32`...)
-      sparse: A boolean specifying whether the placeholder
-          to be created is sparse.
-      tensor: Optional existing tensor to wrap into the `Input` layer.
-          If set, the layer will not create a placeholder tensor.
-
-  Returns:
-      A tensor: either a new placeholder (with history metadata) or
-      `tensor` (if passed), with added history metadata.
-
-  Raises:
-    RuntimeError: If called in Eager mode.
-  """
-  input_layer = InputLayer(
-      input_shape=shape,
-      batch_size=batch_size,
-      name=name,
-      dtype=dtype,
-      sparse=sparse,
-      input_tensor=tensor)
-  # Return tensor including `_keras_history` metadata.
-  # Note that in this case train_output and test_output are the same pointer.
-  outputs = input_layer._inbound_nodes[0].output_tensors  # pylint: disable=protected-access
-  if len(outputs) == 1:
-    return outputs[0]
-  else:
-    return outputs
-
-
-class GraphNetwork(base.Layer):
-  """A GraphNetwork is a directed acyclic graph of layers.
-
-  It is the topological form of a `tf.keras.models.Model`. A `Model` is simply a
-  `GraphNetwork` with added training/evaluation routines.
-
-  A `GraphNetwork` instance implements the full `Layer` API. In particular, a
-  `GraphNetwork` can be called on new inputs.
-
-  Example:
-
-      ```python
-      # This is a logistic regression
-      x = tf.layers.Input(shape=(32,))
-      y = tf.layers.Dense(16, activation='softmax')(x)
-      network = tf.layers.GraphNetwork(x, y)
-
-      # It is then possible to call the network on compatible inputs:
-      z = tf.layers.Input(shape=(32,))
-      w = network(z)
-
-      # It is possible to retrieve the same properties as a layer:
-      weights = network.trainable_weights
-      ```
-
-  Arguments:
-      inputs: Input tensor or list of input tensors.
-        Must come from `tf.layers.Input`.
-      output: Output tensor or list of output tensors. Must come from
-        tf.layers Layers or Keras layers.
-      name: Optional name of the model (string).
-
-  Attributes:
-    GraphNetwork has the same attributes as Layer. On top of it, it also has:
-      - layers: a list of the children layers of the network,
-        a list of layer instances, ordered from "earlier in the graph"
-        to "later in the graph".
-
-  Methods:
-    GraphNetwork has the same methods as Layer. On top of it, it also has:
-      - get_layer: retrieves a child layer by name or index in the graph.
-
-  Raises:
-    TypeError: If created when eager execution is enabled, with inputs that
-      don't come from a call to `Input` or outputs that don't come from layers.
-  """
-
-  def __init__(self, inputs, outputs, name=None):  # pylint: disable=super-init-not-called
-    if isinstance(inputs, (list, tuple)):
-      self.inputs = list(inputs)  # Tensor or list of tensors.
-    else:
-      self.inputs = [inputs]
-    if isinstance(outputs, (list, tuple)):
-      self.outputs = list(outputs)
-    else:
-      self.outputs = [outputs]
-
-    if context.in_eager_mode():
-      # Check that all inputs/outputs are DeferredTensors.
-      for tensor in self.inputs:
-        if not isinstance(tensor, base._DeferredTensor):  # pylint: disable=protected-access
-          raise TypeError('When eager execution is enabled, '
-                          'inputs must come from a call to '
-                          '`tf.keras.Input` (called after '
-                          'tfe.enable_eager_execution()). '
-                          'Received invalid input: ' + str(tensor))
-      for tensor in self.outputs:
-        if not isinstance(tensor, base._DeferredTensor):  # pylint: disable=protected-access
-          raise TypeError('When eager execution is enabled, '
-                          'outputs must come from a call to '
-                          'a layer (called after '
-                          'tfe.enable_eager_execution()). '
-                          'Received invalid output: ' + str(tensor))
-
-    self._init_set_name(name)
-    self._activity_regularizer = None
-    with vs.variable_scope(
-        None, default_name=self._base_name) as captured_scope:
-      self._scope = captured_scope
-    call_fn_args = estimator_util.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in call_fn_args or
-                                   hasattr(self, 'compute_mask'))
-    self._call_has_scope_arg = 'scope' in call_fn_args
-
-    # This acts just like the `trainable` attribute of any layer instance.
-    # It does not affect users of the underlying layers, only users of the
-    # GraphNetwork instance.
-    self.trainable = True
-    # A GraphNetwork does not create weights of its own, thus it is already
-    # built.
-    self.built = True
-    # A GraphNetwork does not create weights of its own, thus has no dtype.
-    self._dtype = None
-    self._is_graph_network = True
-    # The following are implemented as property functions:
-    # self.trainable_weights
-    # self.non_trainable_weights
-    # self.input_spec
-
-    # Private attributes to implement compatibility with Layer.
-    self._updates = []
-    self._losses = []
-    self._scope = None
-    self._reuse = None
-    self._graph = ops.get_default_graph()
-
-    # All layers in order of horizontal graph traversal.
-    # Entries are unique. Includes input and output layers.
-    self._layers = []
-
-    # Check for redundancy in inputs.
-    if len(set(self.inputs)) != len(self.inputs):
-      raise ValueError('The list of inputs passed to the model '
-                       'is redundant. '
-                       'All inputs should only appear once.'
-                       ' Found: ' + str(self.inputs))
-
-    # # List of initial layers (1 to 1 mapping with self.inputs,
-    # # hence the same layer might appear twice)
-    # self._input_layers = []
-    # self._input_layers_node_indices = []
-    # self._input_layers_tensor_indices = []
-    # # list of layers (1 to 1 mapping with self.inputs,
-    # # hence the same layer might appear twice)
-    # self._output_layers = []
-    # self._output_layers_node_indices = []
-    # self._output_layers_tensor_indices = []
-
-    self._input_layers = []
-    self._output_layers = []
-    self._input_coordinates = []
-    self._output_coordinates = []
-
-    # This is for performance optimization when calling the GraphNetwork on new
-    # inputs. Every time the GraphNetwork is called on a set on input tensors,
-    # we compute the output tensors, output masks and output shapes in one pass,
-    # then cache them here. When any of these outputs is queried later, we
-    # retrieve it from there instead of recomputing it.
-    self._output_mask_cache = {}
-    self._output_tensor_cache = {}
-    self._output_shape_cache = {}
-
-    # User-provided arguments validation.
-    for x in self.inputs:
-      # Check that x has appropriate `_keras_history` metadata.
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Input tensors to a ' + cls_name + ' ' +
-                         'must come from `tf.layers.Input`. '
-                         'Received: ' + str(x) +
-                         ' (missing previous layer metadata).')
-      # Check that x is an input tensor.
-      # pylint: disable=protected-access
-      layer, node_index, tensor_index = x._keras_history
-      if len(layer._inbound_nodes) > 1 or (
-          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
-        cls_name = self.__class__.__name__
-        logging.warning(cls_name + ' inputs must come from '
-                        '`tf.layers.Input` (thus holding past layer metadata), '
-                        'they cannot be the output of '
-                        'a previous non-Input layer. '
-                        'Here, a tensor specified as '
-                        'input to "' + self.name + '" was not an Input tensor, '
-                        'it was generated by layer ' + layer.name + '.\n'
-                        'Note that input tensors are '
-                        'instantiated via `tensor = tf.layers.Input(shape)`.\n'
-                        'The tensor that caused the issue was: ' + str(x.name))
-      # pylint: enable=protected-access
-    for x in self.outputs:
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Output tensors to a ' + cls_name + ' must be '
-                         'the output of a TensorFlow `Layer` '
-                         '(thus holding past layer metadata). Found: ' + str(x))
-
-    # Build self._output_layers:
-    for x in self.outputs:
-      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      self._output_layers.append(layer)
-      self._output_coordinates.append((layer, node_index, tensor_index))
-
-    # Build self._input_layers:
-    for x in self.inputs:
-      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      # It's supposed to be an input layer, so only one node
-      # and one tensor output.
-      assert node_index == 0
-      assert tensor_index == 0
-      self._input_layers.append(layer)
-      self._input_coordinates.append((layer, node_index, tensor_index))
-
-    # Network_nodes: set of nodes included in the graph
-    # (not all nodes included in the layers
-    # are relevant to the current graph).
-    network_nodes = set()  # ids of all nodes relevant to the GraphNetwork
-    nodes_depths = {}  # dict {node: depth value}
-    layers_depths = {}  # dict {layer: depth value}
-    layer_indices = {}  # dict {layer: index in traversal}
-    nodes_in_decreasing_depth = []
-
-    def build_map_of_graph(tensor,
-                           finished_nodes,
-                           nodes_in_progress,
-                           layer,
-                           node_index,
-                           tensor_index):
-      """Builds a map of the graph of layers.
-
-      This recursively updates the map `layer_indices`,
-      the list `nodes_in_decreasing_depth` and the set `network_nodes`.
-
-      Arguments:
-          tensor: Some tensor in a graph.
-          finished_nodes: Set of nodes whose subgraphs have been traversed
-              completely. Useful to prevent duplicated work.
-          nodes_in_progress: Set of nodes that are currently active on the
-              recursion stack. Useful to detect cycles.
-          layer: Layer from which `tensor` comes from. If not provided,
-              will be obtained from `tensor._keras_history`.
-          node_index: Node index from which `tensor` comes from.
-          tensor_index: Tensor_index from which `tensor` comes from.
-
-      Raises:
-          ValueError: if a cycle is detected.
-      """
-      node = layer._inbound_nodes[node_index]  # pylint: disable=protected-access
-
-      # Prevent cycles.
-      if node in nodes_in_progress:
-        raise ValueError('The tensor ' + str(tensor) + ' at layer "' +
-                         layer.name + '" is part of a cycle.')
-
-      # Don't repeat work for shared subgraphs
-      if node in finished_nodes:
-        return
-
-      node_key = _make_node_key(layer.name, node_index)
-      # Update network_nodes.
-      network_nodes.add(node_key)
-
-      # Store the traversal order for layer sorting.
-      if layer not in layer_indices:
-        layer_indices[layer] = len(layer_indices)
-
-      nodes_in_progress.add(node)
-
-      # Propagate to all previous tensors connected to this node.
-      for i in range(len(node.inbound_layers)):
-        x = node.input_tensors[i]
-        layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        tensor_index = node.tensor_indices[i]
-        build_map_of_graph(x, finished_nodes, nodes_in_progress, layer,
-                           node_index, tensor_index)
-
-      finished_nodes.add(node)
-      nodes_in_progress.remove(node)
-      nodes_in_decreasing_depth.append(node)
-
-    finished_nodes = set()
-    nodes_in_progress = set()
-    for x in self.outputs:
-      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      build_map_of_graph(x, finished_nodes, nodes_in_progress,
-                         layer=layer,
-                         node_index=node_index,
-                         tensor_index=tensor_index)
-
-    for node in reversed(nodes_in_decreasing_depth):
-      # If the depth is not set, the node has no outbound nodes (depth 0).
-      depth = nodes_depths.setdefault(node, 0)
-
-      # Update the depth of the corresponding layer
-      previous_depth = layers_depths.get(node.outbound_layer, 0)
-      # If we've seen this layer before at a higher depth,
-      # we should use that depth instead of the node depth.
-      # This is necessary for shared layers that have inputs at different
-      # depth levels in the graph.
-      depth = max(depth, previous_depth)
-      layers_depths[node.outbound_layer] = depth
-      nodes_depths[node] = depth
-
-      # Update the depth of inbound nodes.
-      # The "depth" of a node is the max of the depths
-      # of all layers it is connected to.
-      for i in range(len(node.inbound_layers)):
-        inbound_layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        inbound_node = inbound_layer._inbound_nodes[node_index]  # pylint: disable=protected-access
-        previous_depth = nodes_depths.get(inbound_node, 0)
-        nodes_depths[inbound_node] = max(depth + 1, previous_depth)
-
-    # Build a dict {depth: list of nodes with this depth}
-    nodes_by_depth = {}
-    for node, depth in nodes_depths.items():
-      if depth not in nodes_by_depth:
-        nodes_by_depth[depth] = []
-      nodes_by_depth[depth].append(node)
-
-    # Build a dict {depth: list of layers with this depth}
-    layers_by_depth = {}
-    for layer, depth in layers_depths.items():
-      if depth not in layers_by_depth:
-        layers_by_depth[depth] = []
-      layers_by_depth[depth].append(layer)
-
-    # Get sorted list of layer depths.
-    depth_keys = list(layers_by_depth.keys())
-    depth_keys.sort(reverse=True)
-
-    # Set self.layers and self._layers_by_depth.
-    layers = []
-    for depth in depth_keys:
-      layers_for_depth = layers_by_depth[depth]
-      # GraphNetwork.layers needs to have a deterministic order:
-      # here we order them by traversal order.
-      layers_for_depth.sort(key=lambda x: layer_indices[x])
-      layers.extend(layers_for_depth)
-    self._layers = layers
-    self._layers_by_depth = layers_by_depth
-
-    # Get sorted list of node depths.
-    depth_keys = list(nodes_by_depth.keys())
-    depth_keys.sort(reverse=True)
-
-    # Check that all tensors required are computable.
-    # computable_tensors: all tensors in the graph
-    # that can be computed from the inputs provided.
-    computable_tensors = []
-    for x in self.inputs:
-      computable_tensors.append(x)
-
-    layers_with_complete_input = []  # To provide a better error msg.
-    for depth in depth_keys:
-      for node in nodes_by_depth[depth]:
-        layer = node.outbound_layer
-        if layer:
-          for x in node.input_tensors:
-            if x not in computable_tensors:
-              raise ValueError('Graph disconnected: '
-                               'cannot obtain value for tensor ' + str(x) +
-                               ' at layer "' + layer.name + '". '
-                               'The following previous layers '
-                               'were accessed without issue: ' +
-                               str(layers_with_complete_input))
-          for x in node.output_tensors:
-            computable_tensors.append(x)
-          layers_with_complete_input.append(layer.name)
-
-    # Keep track of the network's nodes.
-    self._network_nodes = network_nodes
-    self._nodes_by_depth = nodes_by_depth
-
-    # Ensure name unicity, which will be crucial for serialization
-    # (since serialized nodes refer to layers by their name).
-    all_names = [layer.name for layer in self.layers]
-    for name in all_names:
-      if all_names.count(name) != 1:
-        raise ValueError('The name "' + name + '" is used ' +
-                         str(all_names.count(name)) + ' times in the model. '
-                         'All layer names should be unique.')
-
-    # Layer parameters.
-    # The new network starts with a single inbound node
-    # for its inputs, and no outbound nodes.
-    self._outbound_nodes = []  # Will be appended to by future calls to __call__
-    self._inbound_nodes = [
-    ]  # Will be appended to below, and by future calls to __call__
-    # Create the node linking internal inputs to internal outputs.
-    base.Node(
-        outbound_layer=self,
-        inbound_layers=[],
-        node_indices=[],
-        tensor_indices=[],
-        input_tensors=self.inputs,
-        output_tensors=self.outputs)
-
-  @property
-  def layers(self):
-    return self._layers
-
-  def get_layer(self, name=None, index=None):
-    """Retrieves a layer based on either its name (unique) or index.
-
-    Indices are based on order of horizontal graph traversal (bottom-up).
-
-    Arguments:
-        name: String, name of layer.
-        index: Integer, index of layer.
-
-    Returns:
-        A layer instance.
-
-    Raises:
-        ValueError: In case of invalid layer name or index.
-    """
-    # TODO(fchollet): We could build a dictionary based on layer names
-    # since they are constant, but we have not done that yet.
-    if index is not None:
-      if len(self.layers) <= index:
-        raise ValueError('Was asked to retrieve layer at index ' + str(index) +
-                         ' but model only has ' + str(len(self.layers)) +
-                         ' layers.')
-      else:
-        return self.layers[index]
-    else:
-      if not name:
-        raise ValueError('Provide either a layer name or layer index.')
-    for layer in self.layers:
-      if layer.name == name:
-        return layer
-    raise ValueError('No such layer: ' + name)
-
-  @property
-  def stateful(self):
-    return any([(hasattr(layer, 'stateful') and layer.stateful)
-                for layer in self.layers])
-
-  @property
-  def updates(self):
-    """Retrieve the network's updates.
-
-    Will only include updates that are either
-    unconditional, or conditional on inputs to this model
-    (e.g. will not include updates that were created by layers of this model
-    outside of the model).
-
-    Effectively, `network.updates` behaves like `layer.updates`.
-
-    Concrete example:
-
-    ```python
-      bn = keras.layers.BatchNormalization()
-      x1 = keras.layers.Input(shape=(10,))
-      _ = bn(x1)  # This creates 2 updates.
-
-      x2 = keras.layers.Input(shape=(10,))
-      y2 = bn(x2)  # This creates 2 more updates.
-
-      # The BN layer has now 4 updates.
-      self.assertEqual(len(bn.updates), 4)
-
-      # Let's create a model from x2 to y2.
-      model = keras.models.Model(x2, y2)
-
-      # The model does not list all updates from its underlying layers,
-      # but only the updates that are relevant to it. Updates created by layers
-      # outside of the model are discarded.
-      self.assertEqual(len(model.updates), 2)
-
-      # If you keep calling the model, you append to its updates, just like
-      # what happens for a layer.
-      x3 = keras.layers.Input(shape=(10,))
-      y3 = model(x3)
-      self.assertEqual(len(model.updates), 4)
-
-      # But if you call the inner BN layer independently, you don't affect
-      # the model's updates.
-      x4 = keras.layers.Input(shape=(10,))
-      _ = bn(x4)
-      self.assertEqual(len(model.updates), 4)
-    ```
-
-    Returns:
-        A list of update ops.
-    """
-    if context.in_eager_mode():
-      return []
-
-    if not self.trainable and not self.stateful:
-      return []
-
-    updates = []
-    for layer in self.layers:
-      updates += layer.updates
-
-    # `updates` might contain irrelevant updates, so it needs to be filtered
-    # with respect to inputs the model has been called on.
-    relevant_inputs = self.inputs or []
-    for i in range(1, len(self._inbound_nodes)):
-      inputs = self.get_input_at(i)
-      if isinstance(inputs, list):
-        relevant_inputs += inputs
-      else:
-        relevant_inputs.append(inputs)
-    reachable = layers_util.get_reachable_from_inputs(relevant_inputs, updates)
-    relevant_conditional_updates = [x for x in updates if x in reachable]
-    unconditional_updates = [
-        x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
-    # A layer could be used multiple times in a nested structure,
-    # so the updates list must be de-duped.
-    return list(set(
-        relevant_conditional_updates + unconditional_updates + self._updates))
-
-  @property
-  def losses(self):
-    """Retrieve the network's losses.
-
-    Will only include losses that are either
-    unconditional, or conditional on inputs to this model
-    (e.g. will not include losses that depend on tensors
-    that aren't inputs to this model).
-
-    Returns:
-        A list of loss tensors.
-    """
-    losses = []
-    for layer in self.layers:
-      losses += layer.losses
-    if context.in_eager_mode():
-      return losses
-
-    relevant_inputs = self.inputs or []
-    for i in range(1, len(self._inbound_nodes)):
-      inputs = self.get_input_at(i)
-      if isinstance(inputs, list):
-        relevant_inputs += inputs
-      else:
-        relevant_inputs.append(inputs)
-    reachable = layers_util.get_reachable_from_inputs(relevant_inputs, losses)
-    relevant_conditional_losses = [x for x in losses if x in reachable]
-    unconditional_losses = [
-        x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
-    return list(set(
-        relevant_conditional_losses + unconditional_losses + self._losses))
-
-  @property
-  def trainable_weights(self):
-    if not self.trainable:
-      return []
-    weights = []
-    for layer in self.layers:
-      weights += layer.trainable_weights
-    return weights
-
-  @property
-  def non_trainable_weights(self):
-    weights = []
-    for layer in self.layers:
-      weights += layer.non_trainable_weights
-    if not self.trainable:
-      trainable_weights = []
-      for layer in self.layers:
-        trainable_weights += layer.trainable_weights
-      return trainable_weights + weights
-    return weights
-
-  @property
-  def input_spec(self):
-    """Gets the network's input specs.
-
-    Returns:
-        A list of `InputSpec` instances (one per input to the model)
-            or a single instance if the model has only one input.
-    """
-    # If not a graph network, can't assume anything.
-    if not self._is_graph_network:
-      return None
-
-    specs = []
-    for layer in self._input_layers:
-      if layer.input_spec is None:
-        specs.append(None)
-      else:
-        if not isinstance(layer.input_spec, list):
-          raise TypeError('Layer ' + layer.name +
-                          ' has an input_spec attribute that '
-                          'is not a list. We expect a list. '
-                          'Found input_spec = ' + str(layer.input_spec))
-        specs += layer.input_spec
-    if len(specs) == 1:
-      return specs[0]
-    return specs
-
-  def call(self, inputs, mask=None):
-    """Call the model on new inputs.
-
-    In this case `call` just reapplies
-    all ops in the graph to the new inputs
-    (e.g. build a new computational graph from the provided inputs).
-
-    Arguments:
-        inputs: A tensor or list of tensors.
-        mask: A mask or list of masks. A mask can be
-            either a tensor or None (no mask).
-
-    Returns:
-        A tensor if there is a single output, or
-        a list of tensors if there are more than one outputs.
-    """
-    inputs = nest.flatten(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = nest.flatten(mask)
-
-    if context.in_graph_mode():
-      # Try to retrieve cached outputs if the layer has already been called
-      # on these exact inputs.
-      cache_key = (layers_util.object_list_uid(inputs)
-                   + '_' + layers_util.object_list_uid(masks))
-      if cache_key in self._output_tensor_cache:
-        # Cache hit.
-        return self._output_tensor_cache[cache_key]
-    # Actually apply the network graph to the new inputs.
-    outputs, _ = self._run_internal_graph(inputs, masks)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    if not self._is_graph_network:
-      raise NotImplementedError
-
-    if isinstance(input_shape, list):
-      input_shapes = []
-      for shape in input_shape:
-        if shape is not None:
-          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
-        else:
-          input_shapes.append(None)
-    else:
-      if input_shape is not None:
-        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
-      else:
-        input_shapes = [None]
-
-    if len(input_shapes) != len(self._input_layers):
-      raise ValueError('Invalid input_shape argument ' + str(input_shape) +
-                       ': model has ' + str(len(self._input_layers)) +
-                       ' tensor inputs.')
-
-    cache_key = layers_util.object_list_uid(input_shapes)
-    if cache_key not in self._output_shape_cache:
-      # Cache miss. We have to run the network graph manually (recursive calls
-      # to `compute_output_shape`).
-      layers_to_output_shapes = {}
-      for i in range(len(input_shapes)):
-        layer = self._input_layers[i]
-        input_shape = input_shapes[i]
-        # It's an input layer: then `compute_output_shape` is identity,
-        # and there is only one node and one tensor output.
-        shape_key = layer.name + '_0_0'
-        layers_to_output_shapes[shape_key] = input_shape
-
-      depth_keys = list(self._nodes_by_depth.keys())
-      depth_keys.sort(reverse=True)
-      # Iterate over nodes, by depth level.
-      if len(depth_keys) > 1:
-        for depth in depth_keys:
-          nodes = self._nodes_by_depth[depth]
-          for node in nodes:
-            # This is always a single layer, never a list.
-            layer = node.outbound_layer
-            if layer in self._input_layers:
-              # We've already covered the input layers
-              # a few lines above.
-              continue
-            # Potentially redundant list,
-            # same size as node.input_tensors.
-            input_shapes = []
-            for j in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[j]
-              node_index = node.node_indices[j]
-              tensor_index = node.tensor_indices[j]
-              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
-                                                           tensor_index)
-              input_shape = layers_to_output_shapes[shape_key]
-              input_shapes.append(input_shape)
-
-            if len(input_shapes) == 1:
-              output_shape = layer.compute_output_shape(input_shapes[0])
-            else:
-              output_shape = layer.compute_output_shape(input_shapes)
-            if isinstance(output_shape, list):
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(shape).as_list())
-                  for shape in output_shape
-              ]
-            else:
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(output_shape).as_list())
-              ]
-
-            node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
-            for j in range(len(output_shapes)):
-              shape_key = layer.name + '_%s_%s' % (node_index, j)
-              layers_to_output_shapes[shape_key] = output_shapes[j]
-
-        # Read final output shapes from layers_to_output_shapes.
-        output_shapes = []
-        for i in range(len(self._output_layers)):
-          layer, node_index, tensor_index = self._output_coordinates[i]
-          shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
-          output_shapes.append(layers_to_output_shapes[shape_key])
-        # Store in cache.
-        self._output_shape_cache[cache_key] = output_shapes
-    else:
-      # Cache hit.
-      output_shapes = self._output_shape_cache[cache_key]
-
-    if isinstance(output_shapes, list):
-      if len(output_shapes) == 1:
-        return tensor_shape.TensorShape(output_shapes[0])
-      else:
-        return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-    else:
-      return tensor_shape.TensorShape(output_shapes)
-
-  def _run_internal_graph(self, inputs, masks=None):
-    """Computes output tensors for new inputs.
-
-    # Note:
-        - Expects `inputs` to be a list (potentially with 1 element).
-        - Can be run on non-Keras tensors.
-
-    Arguments:
-        inputs: List of tensors
-        masks: List of masks (tensors or None).
-
-    Returns:
-        Three lists: output_tensors, output_masks, output_shapes
-    """
-    # Note: masking support is relevant mainly for Keras.
-    # It cannot be factored out without having the fully reimplement the network
-    # calling logic on the Keras side. We choose to incorporate it in
-    # GraphNetwork because 1) it may be useful to fully support in tf.layers in
-    # the future and 2) Keras is a major user of GraphNetwork.  If you don't
-    # use masking, it does not interfere with regular behavior at all and you
-    # can ignore it.
-    if masks is None:
-      masks = [None for _ in range(len(inputs))]
-
-    # Dictionary mapping reference tensors to tuples
-    # (computed tensor, compute mask)
-    # we assume a 1:1 mapping from tensor to mask
-    # TODO(fchollet): raise exception when a `.compute_mask()` call
-    # does not return a list the same size as `call`
-    tensor_map = {}
-    for x, y, mask in zip(self.inputs, inputs, masks):
-      tensor_map[str(id(x))] = (y, mask)
-
-    depth_keys = list(self._nodes_by_depth.keys())
-    depth_keys.sort(reverse=True)
-    for depth in depth_keys:
-      nodes = self._nodes_by_depth[depth]
-      for node in nodes:
-        # This is always a single layer, never a list.
-        layer = node.outbound_layer
-        reference_input_tensors = node.input_tensors
-        reference_output_tensors = node.output_tensors
-
-        # If all previous input tensors are available in tensor_map,
-        # then call node.inbound_layer on them.
-        computed_data = []  # List of tuples (input, mask).
-        for x in reference_input_tensors:
-          if str(id(x)) in tensor_map:
-            computed_data.append(tensor_map[str(id(x))])
-
-        if len(computed_data) == len(reference_input_tensors):
-          # Call layer (reapplying ops to new inputs).
-          with ops.name_scope(layer.name):
-            if node.arguments:
-              kwargs = node.arguments
-            else:
-              kwargs = {}
-            if len(computed_data) == 1:
-              computed_tensor, computed_mask = computed_data[0]
-              # Ensure mask propagation if applicable.
-              if 'mask' in estimator_util.fn_args(layer.call):
-                if 'mask' not in kwargs:
-                  kwargs['mask'] = computed_mask
-
-              output_tensors = nest.flatten(
-                  layer.call(computed_tensor, **kwargs))
-              if hasattr(layer, 'compute_mask'):
-                output_masks = nest.flatten(
-                    layer.compute_mask(computed_tensor, computed_mask))
-              else:
-                output_masks = [None for _ in range(len(output_tensors))]
-              computed_tensors = [computed_tensor]
-              computed_masks = [computed_mask]
-            else:
-              computed_tensors = [x[0] for x in computed_data]
-              computed_masks = [x[1] for x in computed_data]
-              if 'mask' in estimator_util.fn_args(layer.call):
-                if 'mask' not in kwargs:
-                  kwargs['mask'] = computed_masks
-              output_tensors = nest.flatten(
-                  layer.call(computed_tensors, **kwargs))
-              if hasattr(layer, 'compute_mask'):
-                output_masks = nest.flatten(
-                    layer.compute_mask(computed_tensors, computed_masks))
-              else:
-                output_masks = [None for _ in range(len(output_tensors))]
-
-            if context.in_graph_mode():
-              if layer.activity_regularizer is not None:
-                regularization_losses = [
-                    layer.activity_regularizer(x) for x in output_tensors
-                ]
-                # Apply activity regularizer if any:
-                layer.add_loss(regularization_losses, computed_tensors)
-
-          # Update tensor_map.
-          for x, y, mask in zip(reference_output_tensors, output_tensors,
-                                output_masks):
-            tensor_map[str(id(x))] = (y, mask)
-
-    output_tensors = []
-    output_masks = []
-    output_shapes = []
-    for x in self.outputs:
-      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
-      tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(layers_util.static_shape(x))
-      output_tensors.append(tensor)
-      output_masks.append(mask)
-
-    if len(output_tensors) == 1:
-      output_tensors = output_tensors[0]
-      if output_shapes is not None:
-        output_shapes = output_shapes[0]
-      if output_masks is not None:
-        output_masks = output_masks[0]
-
-    if context.in_graph_mode():
-      # Update cache;
-      # keys are based on ids on input tensors and inputs masks.
-      cache_key = (layers_util.object_list_uid(inputs)
-                   + '_' + layers_util.object_list_uid(masks))
-      self._output_tensor_cache[cache_key] = output_tensors
-      self._output_mask_cache[cache_key] = output_masks
-
-      if output_shapes is not None:
-        input_shapes = [layers_util.static_shape(x) for x in inputs]
-        cache_key = layers_util.object_list_uid(input_shapes)
-        self._output_shape_cache[cache_key] = output_shapes
-
-    return output_tensors, output_masks
-
-
-def _make_node_key(layer_name, node_index):
-  return layer_name + '_ib-' + str(node_index)
diff --git a/tensorflow/python/layers/network_test.py b/tensorflow/python/layers/network_test.py
deleted file mode 100644
index cc6e8ca9f41cd1f6aa0a3f64d7ce11ac24c04967..0000000000000000000000000000000000000000
--- a/tensorflow/python/layers/network_test.py
+++ /dev/null
@@ -1,633 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.layers.network."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
-from tensorflow.python.layers import base as base_layers
-from tensorflow.python.layers import core as core_layers
-from tensorflow.python.layers import network as network_layers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.platform import test
-
-
-class BaseLayerCompatibilityTest(test.TestCase):
-
-  def test_get_updates(self):
-
-    class MyLayer(base_layers.Layer):
-
-      def build(self, input_shape):
-        self.a = self.add_variable('a',
-                                   (1, 1),
-                                   'float32',
-                                   trainable=False)
-        self.b = self.add_variable('b',
-                                   (1, 1),
-                                   'float32',
-                                   trainable=False)
-        self.add_update(state_ops.assign_add(self.a, [[1.]]))
-        self.built = True
-
-      def call(self, inputs):
-        self.add_update(state_ops.assign_add(self.a, inputs),
-                        inputs=True)
-        return inputs + 1
-
-    x1 = network_layers.Input(shape=(1,))
-    layer = MyLayer()
-    _ = layer.apply(x1)
-
-    self.assertEqual(len(layer.updates), 2)
-    self.assertEqual(len(layer.get_updates_for(x1)), 1)
-    self.assertEqual(len(layer.get_updates_for(None)), 1)
-
-    x2 = network_layers.Input(shape=(1,))
-    y2 = layer.apply(x2)
-
-    self.assertEqual(len(layer.updates), 3)
-    self.assertEqual(len(layer.get_updates_for(x1)), 1)
-    self.assertEqual(len(layer.get_updates_for(x2)), 1)
-    self.assertEqual(len(layer.get_updates_for(None)), 1)
-
-    network = network_layers.GraphNetwork(x2, y2)
-    self.assertEqual(len(network.updates), 2)
-    self.assertEqual(len(network.get_updates_for(x1)), 0)
-    self.assertEqual(len(network.get_updates_for(x2)), 1)
-    self.assertEqual(len(network.get_updates_for(None)), 1)
-
-    x3 = network_layers.Input(shape=(1,))
-    _ = layer.apply(x3)
-    self.assertEqual(len(network.updates), 2)
-
-    x4 = network_layers.Input(shape=(1,))
-    _ = network(x4)
-    self.assertEqual(len(network.updates), 3)
-    self.assertEqual(len(network.get_updates_for(x2)), 1)
-    self.assertEqual(len(network.get_updates_for(x4)), 1)
-    self.assertEqual(len(network.get_updates_for(None)), 1)
-
-    network.add_update(state_ops.assign_add(layer.a, [[1]]))
-    self.assertEqual(len(network.updates), 4)
-    self.assertEqual(len(network.get_updates_for(None)), 2)
-
-    network.add_update(state_ops.assign_add(layer.a, x4), inputs=True)
-    self.assertEqual(len(network.updates), 5)
-    self.assertEqual(len(network.get_updates_for(x4)), 2)
-
-  def test_get_losses(self):
-
-    class MyLayer(base_layers.Layer):
-
-      def build(self, input_shape):
-        self.a = self.add_variable('a',
-                                   (1, 1),
-                                   'float32',
-                                   trainable=False)
-        self.b = self.add_variable('b',
-                                   (1, 1),
-                                   'float32',
-                                   trainable=False)
-        self.add_loss(math_ops.reduce_sum(self.a))
-        self.built = True
-
-      def call(self, inputs):
-        self.add_loss(math_ops.reduce_sum(inputs),
-                      inputs=True)
-        return inputs + 1
-
-    x1 = network_layers.Input(shape=(1,))
-    layer = MyLayer()
-    _ = layer.apply(x1)
-
-    self.assertEqual(len(layer.losses), 2)
-    self.assertEqual(len(layer.get_losses_for(x1)), 1)
-    self.assertEqual(len(layer.get_losses_for(None)), 1)
-
-    x2 = network_layers.Input(shape=(1,))
-    y2 = layer.apply(x2)
-
-    self.assertEqual(len(layer.losses), 3)
-    self.assertEqual(len(layer.get_losses_for(x1)), 1)
-    self.assertEqual(len(layer.get_losses_for(x2)), 1)
-    self.assertEqual(len(layer.get_losses_for(None)), 1)
-
-    network = network_layers.GraphNetwork(x2, y2)
-    self.assertEqual(len(network.losses), 2)
-    self.assertEqual(len(network.get_losses_for(x1)), 0)
-    self.assertEqual(len(network.get_losses_for(x2)), 1)
-    self.assertEqual(len(network.get_losses_for(None)), 1)
-
-    x3 = network_layers.Input(shape=(1,))
-    _ = layer.apply(x3)
-    self.assertEqual(len(network.losses), 2)
-
-    x4 = network_layers.Input(shape=(1,))
-    _ = network(x4)
-    self.assertEqual(len(network.losses), 3)
-    self.assertEqual(len(network.get_losses_for(x2)), 1)
-    self.assertEqual(len(network.get_losses_for(x4)), 1)
-    self.assertEqual(len(network.get_losses_for(None)), 1)
-
-    network.add_loss(math_ops.reduce_sum(layer.a))
-    self.assertEqual(len(network.losses), 4)
-    self.assertEqual(len(network.get_losses_for(None)), 2)
-
-    network.add_loss(math_ops.reduce_sum(x4), inputs=True)
-    self.assertEqual(len(network.losses), 5)
-    self.assertEqual(len(network.get_losses_for(x4)), 2)
-
-  def testTopologicalAttributes(self):
-    # test layer attributes / methods related to cross-layer connectivity.
-    a = network_layers.Input(shape=(32,), name='input_a')
-    b = network_layers.Input(shape=(32,), name='input_b')
-
-    # test input, output, input_shape, output_shape
-    test_layer = core_layers.Dense(16, name='test_layer')
-    a_test = test_layer(a)
-    self.assertEqual(test_layer.input, a)
-    self.assertEqual(test_layer.output, a_test)
-    self.assertEqual(test_layer.input_shape, (None, 32))
-    self.assertEqual(test_layer.output_shape, (None, 16))
-
-    # test `get_*_at` methods
-    dense = core_layers.Dense(16, name='dense_1')
-    a_2 = dense(a)
-    b_2 = dense(b)
-
-    self.assertEqual(dense.get_input_at(0), a)
-    self.assertEqual(dense.get_input_at(1), b)
-    self.assertEqual(dense.get_output_at(0), a_2)
-    self.assertEqual(dense.get_output_at(1), b_2)
-    self.assertEqual(dense.get_input_shape_at(0), (None, 32))
-    self.assertEqual(dense.get_input_shape_at(1), (None, 32))
-    self.assertEqual(dense.get_output_shape_at(0), (None, 16))
-    self.assertEqual(dense.get_output_shape_at(1), (None, 16))
-
-    # Test invalid value for attribute retrieval.
-    with self.assertRaises(ValueError):
-      dense.get_input_at(2)
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.input
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.output
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.output_shape
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.input_shape
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      a = network_layers.Input(shape=(3, 32))
-      a = network_layers.Input(shape=(5, 32))
-      a_2 = dense(a)
-      b_2 = dense(b)
-      _ = new_dense.input_shape
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      a = network_layers.Input(shape=(3, 32))
-      a = network_layers.Input(shape=(5, 32))
-      a_2 = dense(a)
-      b_2 = dense(b)
-      _ = new_dense.output_shape
-
-  def testTopologicalAttributesMultiOutputLayer(self):
-
-    class PowersLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        return [inputs**2, inputs**3]
-
-    x = network_layers.Input(shape=(32,))
-    test_layer = PowersLayer()
-    p1, p2 = test_layer(x)  # pylint: disable=not-callable
-
-    self.assertEqual(test_layer.input, x)
-    self.assertEqual(test_layer.output, [p1, p2])
-    self.assertEqual(test_layer.input_shape, (None, 32))
-    self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)])
-
-  def testTopologicalAttributesMultiInputLayer(self):
-
-    class AddLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        assert len(inputs) == 2
-        return inputs[0] + inputs[1]
-
-    a = network_layers.Input(shape=(32,))
-    b = network_layers.Input(shape=(32,))
-    test_layer = AddLayer()
-    y = test_layer([a, b])  # pylint: disable=not-callable
-
-    self.assertEqual(test_layer.input, [a, b])
-    self.assertEqual(test_layer.output, y)
-    self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)])
-    self.assertEqual(test_layer.output_shape, (None, 32))
-
-
-class NetworkTest(test.TestCase):
-
-  def testBasicNetwork(self):
-    # minimum viable network
-    x = network_layers.Input(shape=(32,))
-    dense = core_layers.Dense(2)
-    y = dense(x)
-    network = network_layers.GraphNetwork(x, y, name='dense_network')
-
-    # test basic attributes
-    self.assertEqual(network.name, 'dense_network')
-    self.assertEqual(len(network.layers), 2)  # InputLayer + Dense
-    self.assertEqual(network.layers[1], dense)
-    self.assertEqual(network.weights, dense.weights)
-    self.assertEqual(network.trainable_weights, dense.trainable_weights)
-    self.assertEqual(network.non_trainable_weights, dense.non_trainable_weights)
-
-    # test callability on Input
-    x_2 = network_layers.Input(shape=(32,))
-    y_2 = network(x_2)
-    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
-
-    # test callability on regular tensor
-    x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    y_2 = network(x_2)
-    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
-
-    # test network `trainable` attribute
-    network.trainable = False
-    self.assertEqual(network.weights, dense.weights)
-    self.assertEqual(network.trainable_weights, [])
-    self.assertEqual(network.non_trainable_weights,
-                     dense.trainable_weights + dense.non_trainable_weights)
-
-  def test_node_construction(self):
-    # test graph topology construction basics
-    a = network_layers.Input(shape=(32,), name='input_a')
-    b = network_layers.Input(shape=(32,), name='input_b')
-
-    self.assertEqual(a.get_shape().as_list(), [None, 32])
-    a_layer, a_node_index, a_tensor_index = a._keras_history
-    b_layer, _, _ = b._keras_history
-    self.assertEqual(len(a_layer._inbound_nodes), 1)
-    self.assertEqual(a_tensor_index, 0)
-    node = a_layer._inbound_nodes[a_node_index]
-    self.assertEqual(node.outbound_layer, a_layer)
-
-    self.assertEqual(node.inbound_layers, [])
-    self.assertEqual(node.input_tensors, [a])
-    self.assertEqual(node.input_shapes, [(None, 32)])
-    self.assertEqual(node.output_tensors, [a])
-    self.assertEqual(node.output_shapes, [(None, 32)])
-
-    dense = core_layers.Dense(16, name='dense_1')
-    dense(a)
-    dense(b)
-
-    self.assertEqual(len(dense._inbound_nodes), 2)
-    self.assertEqual(len(dense._outbound_nodes), 0)
-    self.assertEqual(dense._inbound_nodes[0].inbound_layers, [a_layer])
-    self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense)
-    self.assertEqual(dense._inbound_nodes[1].inbound_layers, [b_layer])
-    self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense)
-    self.assertEqual(dense._inbound_nodes[0].input_tensors, [a])
-    self.assertEqual(dense._inbound_nodes[1].input_tensors, [b])
-
-    # Test config
-    config_0 = dense._inbound_nodes[0].get_config()
-    self.assertEqual(config_0['outbound_layer'], dense.name)
-
-  def testMultiInputNetwork(self):
-    a = network_layers.Input(shape=(32,), name='input_a')
-    b = network_layers.Input(shape=(32,), name='input_b')
-
-    class AddLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        assert len(inputs) == 2
-        return inputs[0] + inputs[1]
-
-    c = AddLayer()([a, b])  # pylint: disable=not-callable
-    network = network_layers.GraphNetwork([a, b], c)
-    self.assertEqual(len(network.layers), 3)  # 2 * InputLayer + AddLayer
-
-    # Test callability.
-    a2 = network_layers.Input(shape=(32,))
-    b2 = network_layers.Input(shape=(32,))
-    c2 = network([a2, b2])
-    self.assertEqual(c2.get_shape().as_list(), [None, 32])
-
-  def testMultiOutputNetwork(self):
-    x = network_layers.Input(shape=(32,))
-    y1 = core_layers.Dense(2)(x)
-    y2 = core_layers.Dense(3)(x)
-    network = network_layers.GraphNetwork(x, [y1, y2])
-
-    self.assertEqual(len(network.layers), 3)  # InputLayer + 2 * Dense
-
-    # Test callability.
-    x2 = network_layers.Input(shape=(32,))
-    outputs = network(x2)
-
-    self.assertEqual(type(outputs), list)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
-    self.assertEqual(outputs[1].get_shape().as_list(), [None, 3])
-
-  def testMultiInputMultiOutputNetworkSharedLayer(self):
-    a = network_layers.Input(shape=(32,), name='input_a')
-    b = network_layers.Input(shape=(32,), name='input_b')
-
-    dense = core_layers.Dense(2)
-
-    y1 = dense(a)
-    y2 = dense(b)
-    network = network_layers.GraphNetwork([a, b], [y1, y2])
-    self.assertEqual(len(network.layers), 3)  # 2 * InputLayer + Dense
-
-    # Test callability.
-    a2 = network_layers.Input(shape=(32,))
-    b2 = network_layers.Input(shape=(32,))
-    outputs = network([a2, b2])
-
-    self.assertEqual(type(outputs), list)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
-    self.assertEqual(outputs[1].get_shape().as_list(), [None, 2])
-
-  def testCrossDataFlows(self):
-    # Test the ability to have multi-output layers with outputs that get routed
-    # to separate layers
-
-    class PowersLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        return [inputs**2, inputs**3]
-
-    x = network_layers.Input(shape=(32,))
-    p1, p2 = PowersLayer()(x)  # pylint: disable=not-callable
-    y1 = core_layers.Dense(2)(p1)
-    y2 = core_layers.Dense(3)(p2)
-    network = network_layers.GraphNetwork(x, [y1, y2])
-
-    self.assertEqual(len(network.layers), 4)  # InputLayer + 2 * Dense + PLayer
-
-    # Test callability.
-    x2 = network_layers.Input(shape=(32,))
-    outputs = network(x2)
-
-    self.assertEqual(type(outputs), list)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
-    self.assertEqual(outputs[1].get_shape().as_list(), [None, 3])
-
-  def testNetworkAttributes(self):
-    x = network_layers.Input(shape=(32,))
-    layer = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))
-    z = layer(x)
-    dense = core_layers.Dense(2, name='dense')
-    dense.add_update(state_ops.assign_add(layer.kernel, layer.kernel * 2.))
-    y = dense(z)
-    net = network_layers.GraphNetwork(x, y)
-
-    # losses
-    self.assertEqual(len(net.losses), 1)
-
-    # updates
-    self.assertEqual(len(net.updates), 1)
-
-    # get_layer
-    self.assertEqual(net.get_layer('dense'), dense)
-    self.assertEqual(net.get_layer(index=2), dense)
-    with self.assertRaises(ValueError):
-      net.get_layer('dense_unknown')
-    with self.assertRaises(ValueError):
-      net.get_layer()
-    with self.assertRaises(ValueError):
-      net.get_layer(index=4)
-
-    # input, output
-    self.assertEqual(net.input, x)
-    self.assertEqual(net.output, y)
-
-    # input_shape, output_shape
-    self.assertEqual(net.input_shape, (None, 32))
-    self.assertEqual(net.output_shape, (None, 2))
-
-    # get_*_at
-    self.assertEqual(net.get_input_at(0), x)
-    self.assertEqual(net.get_output_at(0), y)
-
-    # compute_output_shape
-    self.assertEqual(net.compute_output_shape((3, 32)).as_list(), [3, 2])
-
-  def testInvalidNetworks(self):
-    # redundant inputs
-    x = network_layers.Input(shape=(32,))
-    y = core_layers.Dense(2)(x)
-    with self.assertRaises(ValueError):
-      network_layers.GraphNetwork([x, x], y)
-
-    # inputs that don't come from Input
-    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    y = core_layers.Dense(2)(x)
-    with self.assertRaises(ValueError):
-      network_layers.GraphNetwork(x, y)
-
-    # inputs that don't come from Input but have a layer history
-    x = network_layers.Input(shape=(32,))
-    x = core_layers.Dense(32)(x)
-    y = core_layers.Dense(2)(x)
-    with self.assertRaises(ValueError):
-      network_layers.GraphNetwork(x, y)
-
-    # outputs that don't come from layers
-    x = network_layers.Input(shape=(32,))
-    y = core_layers.Dense(2)(x)
-    y = 2 * y
-    with self.assertRaises(ValueError):
-      network_layers.GraphNetwork(x, y)
-
-    # disconnected graphs
-    x1 = network_layers.Input(shape=(32,))
-    x2 = network_layers.Input(shape=(32,))
-    y = core_layers.Dense(2)(x1)
-    with self.assertRaises(ValueError):
-      network_layers.GraphNetwork(x2, y)
-
-    # redundant layer names
-    x = network_layers.Input(shape=(32,))
-    z = core_layers.Dense(2, name='dense')(x)
-    y = core_layers.Dense(2, name='dense')(z)
-    with self.assertRaises(ValueError):
-      network_layers.GraphNetwork(x, y)
-
-  def testInputTensorWrapping(self):
-    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    x = network_layers.Input(tensor=x)
-    y = core_layers.Dense(2)(x)
-    network_layers.GraphNetwork(x, y)
-
-  def testExplicitBatchSize(self):
-    x = network_layers.Input(shape=(32,), batch_size=3)
-    y = core_layers.Dense(2)(x)
-    self.assertEqual(y.get_shape().as_list(), [3, 2])
-
-  def testNetworkRecursion(self):
-    # test the ability of networks to be used as layers inside networks.
-    a = network_layers.Input(shape=(32,))
-    b = core_layers.Dense(2)(a)
-    net = network_layers.GraphNetwork(a, b)
-
-    c = network_layers.Input(shape=(32,))
-    d = net(c)
-
-    recursive_net = network_layers.GraphNetwork(c, d)
-    self.assertEqual(len(recursive_net.layers), 2)
-    self.assertEqual(recursive_net.layers[1], net)
-    self.assertEqual(len(recursive_net.weights), 2)
-
-    # test callability
-    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    y = recursive_net(x)
-    self.assertEqual(y.get_shape().as_list(), [None, 2])
-
-  def testSparseInput(self):
-
-    class SparseSoftmax(base_layers.Layer):
-
-      def call(self, inputs):
-        return sparse_ops.sparse_softmax(inputs)
-
-    x = network_layers.Input(shape=(32,), sparse=True)
-    y = SparseSoftmax()(x)  # pylint: disable=not-callable
-    network = network_layers.GraphNetwork(x, y)
-
-    self.assertEqual(len(network.layers), 2)
-    self.assertEqual(network.layers[0].sparse, True)
-
-  def testMaskingSingleInput(self):
-
-    class MaskedLayer(base_layers.Layer):
-
-      def call(self, inputs, mask=None):
-        if mask is not None:
-          return inputs * mask
-        return inputs
-
-      def compute_mask(self, inputs, mask=None):
-        return array_ops.ones_like(inputs)
-
-    if context.in_graph_mode():
-      x = network_layers.Input(shape=(32,))
-      y = MaskedLayer()(x)  # pylint: disable=not-callable
-      network = network_layers.GraphNetwork(x, y)
-
-      # test callability on Input
-      x_2 = network_layers.Input(shape=(32,))
-      y_2 = network(x_2)
-      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
-
-      # test callability on regular tensor
-      x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
-      y_2 = network(x_2)
-      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
-    else:
-      a = constant_op.constant([2] * 32)
-      mask = constant_op.constant([0, 1] * 16)
-      a._keras_mask = mask
-      b = MaskedLayer().apply(a)
-      self.assertTrue(hasattr(b, '_keras_mask'))
-      self.assertAllEqual(self.evaluate(array_ops.ones_like(mask)),
-                          self.evaluate(getattr(b, '_keras_mask')))
-      self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b))
-
-
-class DeferredModeTest(test.TestCase):
-
-  def testDeferredTensorAttributes(self):
-    x = base_layers._DeferredTensor(shape=(None, 2), dtype='float32', name='x')
-    self.assertEqual(str(x),
-                     'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
-    self.assertEqual(repr(x),
-                     '<_DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testSimpleNetworkBuilding(self):
-    inputs = network_layers.Input(shape=(32,))
-    if context.in_eager_mode():
-      self.assertIsInstance(inputs, base_layers._DeferredTensor)
-      self.assertEqual(inputs.dtype.name, 'float32')
-      self.assertEqual(inputs.shape.as_list(), [None, 32])
-
-    x = core_layers.Dense(2)(inputs)
-    if context.in_eager_mode():
-      self.assertIsInstance(x, base_layers._DeferredTensor)
-      self.assertEqual(x.dtype.name, 'float32')
-      self.assertEqual(x.shape.as_list(), [None, 2])
-
-    outputs = core_layers.Dense(4)(x)
-    network = network_layers.GraphNetwork(inputs, outputs)
-    self.assertIsInstance(network, network_layers.GraphNetwork)
-
-    if context.in_eager_mode():
-      # It should be possible to call such a network on EagerTensors.
-      inputs = constant_op.constant(
-          np.random.random((10, 32)).astype('float32'))
-      outputs = network(inputs)
-      self.assertEqual(outputs.shape.as_list(), [10, 4])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testMultiIONetworkbuilding(self):
-    input_a = network_layers.Input(shape=(32,))
-    input_b = network_layers.Input(shape=(16,))
-    a = core_layers.Dense(16)(input_a)
-
-    class AddLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        return inputs[0] + inputs[1]
-
-      def compute_output_shape(self, input_shape):
-        return input_shape[0]
-
-    c = AddLayer()([a, input_b])  # pylint: disable=not-callable
-    c = core_layers.Dense(2)(c)
-
-    network = network_layers.GraphNetwork([input_a, input_b], [a, c])
-    if context.in_eager_mode():
-      a_val = constant_op.constant(
-          np.random.random((10, 32)).astype('float32'))
-      b_val = constant_op.constant(
-          np.random.random((10, 16)).astype('float32'))
-      outputs = network([a_val, b_val])
-      self.assertEqual(len(outputs), 2)
-      self.assertEqual(outputs[0].shape.as_list(), [10, 16])
-      self.assertEqual(outputs[1].shape.as_list(), [10, 2])
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index d83292b80963d942023b5d086a089af53008efe0..33284b0d695272db5a4e0d757d6f24b1930068de 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -24,26 +24,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import moving_averages
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.BatchNormalization')
-class BatchNormalization(base.Layer):
+class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
   "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -143,496 +131,31 @@ class BatchNormalization(base.Layer):
                name=None,
                **kwargs):
     super(BatchNormalization, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-    if isinstance(axis, list):
-      self.axis = axis[:]
-    else:
-      self.axis = axis
-    self.momentum = momentum
-    self.epsilon = epsilon
-    self.center = center
-    self.scale = scale
-    self.beta_initializer = beta_initializer
-    self.gamma_initializer = gamma_initializer
-    self.moving_mean_initializer = moving_mean_initializer
-    self.moving_variance_initializer = moving_variance_initializer
-    self.beta_regularizer = beta_regularizer
-    self.gamma_regularizer = gamma_regularizer
-    self.beta_constraint = beta_constraint
-    self.gamma_constraint = gamma_constraint
-    self.renorm = renorm
-    self.virtual_batch_size = virtual_batch_size
-    self.adjustment = adjustment
-    if fused is None:
-      fused = True
-
-    self.fused = fused
-    self._bessels_correction_test_only = True
-
-    if renorm:
-      renorm_clipping = renorm_clipping or {}
-      keys = ['rmax', 'rmin', 'dmax']
-      if set(renorm_clipping) - set(keys):
-        raise ValueError('renorm_clipping %s contains keys not in %s' %
-                         (renorm_clipping, keys))
-      self.renorm_clipping = renorm_clipping
-      self.renorm_momentum = renorm_momentum
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if not input_shape.ndims:
-      raise ValueError('Input has undefined rank:', input_shape)
-    ndims = len(input_shape)
-
-    # Convert axis to list and resolve negatives
-    if isinstance(self.axis, int):
-      self.axis = [self.axis]
-
-    if not isinstance(self.axis, list):
-      raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
-
-    for idx, x in enumerate(self.axis):
-      if x < 0:
-        self.axis[idx] = ndims + x
-
-    # Validate axes
-    for x in self.axis:
-      if x < 0 or x >= ndims:
-        raise ValueError('Invalid axis: %d' % x)
-    if len(self.axis) != len(set(self.axis)):
-      raise ValueError('Duplicate axis: %s' % self.axis)
-
-    if self.virtual_batch_size is not None:
-      if self.virtual_batch_size <= 0:
-        raise ValueError('virtual_batch_size must be a positive integer that '
-                         'divides the true batch size of the input Tensor')
-      # If using virtual batches, the first dimension must be the batch
-      # dimension and cannot be the batch norm axis
-      if 0 in self.axis:
-        raise ValueError('When using virtual_batch_size, the batch dimension '
-                         'must be 0 and thus axis cannot include 0')
-      if self.adjustment is not None:
-        raise ValueError('When using virtual_batch_size, adjustment cannot '
-                         'be specified')
-
-    if self.fused:
-      # Currently fused batch norm doesn't support renorm. It also only supports
-      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
-      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
-      # output back to its original shape accordingly.
-      self.fused = (not self.renorm and
-                    ndims == 4 and
-                    self.axis in [[1], [3]] and
-                    self.virtual_batch_size is None and
-                    self.adjustment is None)
-      # TODO(chrisying): fused batch norm is currently not supported for
-      # multi-axis batch norm and by extension virtual batches. In some cases,
-      # it might be possible to use fused batch norm but would require reshaping
-      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
-      # particularly tricky. A compromise might be to just support the most
-      # common use case (turning 5D w/ virtual batch to NCHW)
-
-    if self.fused:
-      if self.axis == [1]:
-        self._data_format = 'NCHW'
-      elif self.axis == [3]:
-        self._data_format = 'NHWC'
-      else:
-        raise ValueError('Unsupported axis, fused batch norm only supports '
-                         'axis == [1] or axis == [3]')
-
-    # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
-      param_dtype = dtypes.float32
-    else:
-      param_dtype = self.dtype or dtypes.float32
-
-    axis_to_dim = {x: input_shape[x].value for x in self.axis}
-    for x in axis_to_dim:
-      if axis_to_dim[x] is None:
-        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
-                         input_shape)
-    self.input_spec = base.InputSpec(ndim=ndims, axes=axis_to_dim)
-
-    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
-      # Single axis batch norm (most common/default use-case)
-      param_shape = (list(axis_to_dim.values())[0],)
-    else:
-      # Parameter shape is the original shape but with 1 in all non-axis dims
-      param_shape = [axis_to_dim[i] if i in axis_to_dim
-                     else 1 for i in range(ndims)]
-      if self.virtual_batch_size is not None:
-        # When using virtual batches, add an extra dim at index 1
-        param_shape.insert(1, 1)
-        for idx, x in enumerate(self.axis):
-          self.axis[idx] = x + 1      # Account for added dimension
-
-    if self.scale:
-      self.gamma = self.add_variable(
-          name='gamma',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.gamma_initializer,
-          regularizer=self.gamma_regularizer,
-          constraint=self.gamma_constraint,
-          trainable=True)
-    else:
-      self.gamma = None
-      if self.fused:
-        self._gamma_const = array_ops.constant(
-            1.0, dtype=param_dtype, shape=param_shape)
-
-    if self.center:
-      self.beta = self.add_variable(
-          name='beta',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.beta_initializer,
-          regularizer=self.beta_regularizer,
-          constraint=self.beta_constraint,
-          trainable=True)
-    else:
-      self.beta = None
-      if self.fused:
-        self._beta_const = array_ops.constant(
-            0.0, dtype=param_dtype, shape=param_shape)
-
-    # Disable variable partitioning when creating the moving mean and variance
-    try:
-      if self._scope:
-        partitioner = self._scope.partitioner
-        self._scope.set_partitioner(None)
-      else:
-        partitioner = None
-      self.moving_mean = self.add_variable(
-          name='moving_mean',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_mean_initializer,
-          trainable=False)
-
-      self.moving_variance = self.add_variable(
-          name='moving_variance',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_variance_initializer,
-          trainable=False)
-
-      self._one_minus_decay = 1.0 - self.momentum
-      if self.renorm:
-        # Create variables to maintain the moving mean and standard deviation.
-        # These are used in training and thus are different from the moving
-        # averages above. The renorm variables are colocated with moving_mean
-        # and moving_variance.
-        # NOTE: below, the outer `with device` block causes the current device
-        # stack to be cleared. The nested ones use a `lambda` to set the desired
-        # device and ignore any devices that may be set by the custom getter.
-        def _renorm_variable(name, shape):
-          var = self.add_variable(
-              name=name,
-              shape=shape,
-              dtype=param_dtype,
-              initializer=init_ops.zeros_initializer(),
-              trainable=False)
-          return var
-
-        with ops.device(None):
-          device = ((lambda _: self.moving_mean.device)
-                    if context.in_graph_mode() else self.moving_mean.device)
-          with ops.device(device):
-            self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
-            self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
-          # We initialize renorm_stddev to 0, and maintain the (0-initialized)
-          # renorm_stddev_weight. This allows us to (1) mix the average
-          # stddev with the minibatch stddev early in training, and (2) compute
-          # the unbiased average stddev by dividing renorm_stddev by the weight.
-          device = ((lambda _: self.moving_variance.device)
-                    if context.in_graph_mode() else self.moving_variance.device)
-          with ops.device(device):
-            self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
-            self.renorm_stddev_weight = _renorm_variable(
-                'renorm_stddev_weight', ())
-    finally:
-      if partitioner:
-        self._scope.set_partitioner(partitioner)
-    self.built = True
-
-  def _assign_moving_average(self, variable, value, one_minus_decay):
-    with ops.name_scope(None, 'AssignMovingAvg',
-                        [variable, value, one_minus_decay]) as scope:
-      with ops.colocate_with(variable):
-        update_delta = math_ops.multiply(
-            math_ops.subtract(variable.read_value(), value),
-            one_minus_decay)
-        if isinstance(variable, resource_variable_ops.ResourceVariable):
-          # state_ops.assign_sub does an extra read_variable_op after the
-          # assign. We avoid that here.
-          return gen_resource_variable_ops.assign_sub_variable_op(
-              variable.handle, update_delta, name=scope)
-        else:
-          return state_ops.assign_sub(variable, update_delta, name=scope)
-
-  def _fused_batch_norm(self, inputs, training):
-    """Returns the output of fused batch norm."""
-    beta = self.beta if self.center else self._beta_const
-    gamma = self.gamma if self.scale else self._gamma_const
-
-    def _fused_batch_norm_training():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          epsilon=self.epsilon,
-          data_format=self._data_format)
-
-    def _fused_batch_norm_inference():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          mean=self.moving_mean,
-          variance=self.moving_variance,
-          epsilon=self.epsilon,
-          is_training=False,
-          data_format=self._data_format)
-
-    output, mean, variance = utils.smart_cond(
-        training, _fused_batch_norm_training, _fused_batch_norm_inference)
-    if not self._bessels_correction_test_only:
-      # Remove Bessel's correction to be consistent with non-fused batch norm.
-      # Note that the variance computed by fused batch norm is
-      # with Bessel's correction.
-      sample_size = math_ops.cast(
-          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
-      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
-      variance *= factor
-
-    training_value = utils.constant_value(training)
-    if training_value is None:
-      one_minus_decay = utils.smart_cond(training,
-                                         lambda: self._one_minus_decay,
-                                         lambda: 0.)
-    else:
-      one_minus_decay = ops.convert_to_tensor(self._one_minus_decay)
-    if training_value or training_value is None:
-      mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                one_minus_decay)
-      variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, one_minus_decay)
-      if context.in_graph_mode():
-        # Note that in Eager mode, the updates are already executed when running
-        # assign_moving_averages. So we do not need to put them into
-        # collections.
-        self.add_update(mean_update, inputs=inputs)
-        self.add_update(variance_update, inputs=inputs)
-
-    return output
-
-  def _renorm_correction_and_moments(self, mean, variance, training):
-    """Returns the correction and update values for renorm."""
-    stddev = math_ops.sqrt(variance + self.epsilon)
-    # Compute the average mean and standard deviation, as if they were
-    # initialized with this batch's moments.
-    mixed_renorm_mean = (self.renorm_mean +
-                         (1. - self.renorm_mean_weight) * mean)
-    mixed_renorm_stddev = (self.renorm_stddev +
-                           (1. - self.renorm_stddev_weight) * stddev)
-    # Compute the corrections for batch renorm.
-    r = stddev / mixed_renorm_stddev
-    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
-    # Ensure the corrections use pre-update moving averages.
-    with ops.control_dependencies([r, d]):
-      mean = array_ops.identity(mean)
-      stddev = array_ops.identity(stddev)
-    rmin, rmax, dmax = [self.renorm_clipping.get(key)
-                        for key in ['rmin', 'rmax', 'dmax']]
-    if rmin is not None:
-      r = math_ops.maximum(r, rmin)
-    if rmax is not None:
-      r = math_ops.minimum(r, rmax)
-    if dmax is not None:
-      d = math_ops.maximum(d, -dmax)
-      d = math_ops.minimum(d, dmax)
-    # When not training, use r=1, d=0.
-    r = utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
-    d = utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d))
-
-    def _update_renorm_variable(var, weight, value):
-      """Updates a moving average and weight, returns the unbiased value."""
-      value = array_ops.identity(value)
-      def _do_update():
-        # Update the variables without zero debiasing. The debiasing will be
-        # accomplished by dividing the exponential moving average by the weight.
-        # For example, after a single update, the moving average would be
-        # (1-decay) * value. and the weight will be 1-decay, with their ratio
-        # giving the value.
-        # Make sure the weight is not updated until before r and d computation.
-        with ops.control_dependencies([value]):
-          weight_value = array_ops.constant(1., dtype=weight.dtype)
-        new_var = moving_averages.assign_moving_average(
-            var, value, self.renorm_momentum, zero_debias=False)
-        new_weight = moving_averages.assign_moving_average(
-            weight, weight_value, self.renorm_momentum, zero_debias=False)
-        return new_var / new_weight
-      def _fake_update():
-        return array_ops.identity(var)
-      return utils.smart_cond(training, _do_update, _fake_update)
-
-    with ops.colocate_with(self.moving_mean):
-      new_mean = _update_renorm_variable(self.renorm_mean,
-                                         self.renorm_mean_weight,
-                                         mean)
-    with ops.colocate_with(self.moving_variance):
-      new_stddev = _update_renorm_variable(self.renorm_stddev,
-                                           self.renorm_stddev_weight,
-                                           stddev)
-      # Make sqrt(moving_variance + epsilon) = new_stddev.
-      new_variance = math_ops.square(new_stddev) - self.epsilon
-
-    return (r, d, new_mean, new_variance)
+        axis=axis,
+        momentum=momentum,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=beta_initializer,
+        gamma_initializer=gamma_initializer,
+        moving_mean_initializer=moving_mean_initializer,
+        moving_variance_initializer=moving_variance_initializer,
+        beta_regularizer=beta_regularizer,
+        gamma_regularizer=gamma_regularizer,
+        beta_constraint=beta_constraint,
+        gamma_constraint=gamma_constraint,
+        renorm=renorm,
+        renorm_clipping=renorm_clipping,
+        renorm_momentum=renorm_momentum,
+        fused=fused,
+        trainable=trainable,
+        virtual_batch_size=virtual_batch_size,
+        adjustment=adjustment,
+        name=name,
+        **kwargs)
 
   def call(self, inputs, training=False):
-    in_eager_mode = context.in_eager_mode()
-    if self.virtual_batch_size is not None:
-      # Virtual batches (aka ghost batches) can be simulated by reshaping the
-      # Tensor and reusing the existing batch norm implementation
-      original_shape = [-1] + inputs.shape.as_list()[1:]
-      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
-
-      # Will cause errors if virtual_batch_size does not divide the batch size
-      inputs = array_ops.reshape(inputs, expanded_shape)
-
-      def undo_virtual_batching(outputs):
-        outputs = array_ops.reshape(outputs, original_shape)
-        return outputs
-
-    if self.fused:
-      outputs = self._fused_batch_norm(inputs, training=training)
-      if self.virtual_batch_size is not None:
-        # Currently never reaches here since fused_batch_norm does not support
-        # virtual batching
-        return undo_virtual_batching(outputs)
-      return outputs
-
-    # Compute the axes along which to reduce the mean / variance
-    input_shape = inputs.get_shape()
-    ndims = len(input_shape)
-    reduction_axes = [i for i in range(ndims) if i not in self.axis]
-    if self.virtual_batch_size is not None:
-      del reduction_axes[1]     # Do not reduce along virtual batch dim
-
-    # Broadcasting only necessary for single-axis batch norm where the axis is
-    # not the last dimension
-    broadcast_shape = [1] * ndims
-    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
-    def _broadcast(v):
-      if (v is not None and
-          len(v.get_shape()) != ndims and
-          reduction_axes != list(range(ndims - 1))):
-        return array_ops.reshape(v, broadcast_shape)
-      return v
-
-    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
-
-    def _compose_transforms(scale, offset, then_scale, then_offset):
-      if then_scale is not None:
-        scale *= then_scale
-        offset *= then_scale
-      if then_offset is not None:
-        offset += then_offset
-      return (scale, offset)
-
-    # Determine a boolean value for `training`: could be True, False, or None.
-    training_value = utils.constant_value(training)
-    if training_value is not False:
-      if self.adjustment:
-        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
-        # Adjust only during training.
-        adj_scale = utils.smart_cond(training,
-                                     lambda: adj_scale,
-                                     lambda: array_ops.ones_like(adj_scale))
-        adj_bias = utils.smart_cond(training,
-                                    lambda: adj_bias,
-                                    lambda: array_ops.zeros_like(adj_bias))
-        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
-
-      # Some of the computations here are not necessary when training==False
-      # but not a constant. However, this makes the code simpler.
-      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
-      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
-
-      moving_mean = self.moving_mean
-      moving_variance = self.moving_variance
-
-      mean = utils.smart_cond(training,
-                              lambda: mean,
-                              lambda: moving_mean)
-      variance = utils.smart_cond(training,
-                                  lambda: variance,
-                                  lambda: moving_variance)
-
-      if self.renorm:
-        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
-            mean, variance, training)
-        # When training, the normalized values (say, x) will be transformed as
-        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
-        # = x * (r * gamma) + (d * gamma + beta) with renorm.
-        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
-        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
-        scale, offset = _compose_transforms(r, d, scale, offset)
-      else:
-        new_mean, new_variance = mean, variance
-
-      if self.virtual_batch_size is not None:
-        # This isn't strictly correct since in ghost batch norm, you are
-        # supposed to sequentially update the moving_mean and moving_variance
-        # with each sub-batch. However, since the moving statistics are only
-        # used during evaluation, it is more efficient to just update in one
-        # step and should not make a significant difference in the result.
-        new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keep_dims=True)
-        new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keep_dims=True)
-
-      def _do_update(var, value):
-        if in_eager_mode and not self.trainable:
-          return
-
-        return moving_averages.assign_moving_average(
-            var, value, self.momentum, zero_debias=False)
-
-      mean_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_mean, new_mean),
-          lambda: self.moving_mean)
-      variance_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_variance, new_variance),
-          lambda: self.moving_variance)
-      if context.in_graph_mode():
-        self.add_update(mean_update, inputs=inputs)
-        self.add_update(variance_update, inputs=inputs)
-
-    else:
-      mean, variance = self.moving_mean, self.moving_variance
-
-    outputs = nn.batch_normalization(inputs,
-                                     _broadcast(mean),
-                                     _broadcast(variance),
-                                     offset,
-                                     scale,
-                                     self.epsilon)
-    # If some components of the shape got lost due to adjustments, fix that.
-    outputs.set_shape(input_shape)
-
-    if self.virtual_batch_size is not None:
-      return undo_virtual_batching(outputs)
-
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(BatchNormalization, self).call(inputs, training=training)
 
 
 @tf_export('layers.batch_normalization')
@@ -671,9 +194,16 @@ def batch_normalization(inputs,
 
   Note: when training, the moving_mean and moving_variance need to be updated.
   By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
-  need to be added as a dependency to the `train_op`. For example:
+  need to be added as a dependency to the `train_op`. Also, be sure to add
+  any batch_normalization ops before getting the update_ops collection.
+  Otherwise, update_ops will be empty, and training/inference will not work
+  properly. For example:
 
   ```python
+    x_norm = tf.layers.batch_normalization(x, training=training)
+
+    # ...
+
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
     with tf.control_dependencies(update_ops):
       train_op = optimizer.minimize(loss)
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 50503ce093fbc251b11c4d5cbccb2a2683d92e7a..75abe56f51f2a206ea3e5a5dad032446c150293a 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -13,92 +13,19 @@
 # limitations under the License.
 # =============================================================================
 
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the pooling layer classes and their functional aliases.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Pooling1D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 1D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of a single integer,
-      representing the size of the pooling window.
-    strides: An integer or tuple/list of a single integer, specifying the
-      strides of the pooling operation.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling1D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 1, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 1, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=3)
-
-  def call(self, inputs):
-    # There is no TF op for 1D pooling, hence we make the inputs 4D.
-    if self.data_format == 'channels_last':
-      # input is NWC, make it NHWC
-      inputs = array_ops.expand_dims(inputs, 1)
-      # pool on the W dim
-      pool_shape = (1, 1) + self.pool_size + (1,)
-      strides = (1, 1) + self.strides + (1,)
-      data_format = 'NHWC'
-    else:
-      # input is NCW, make it NCHW
-      inputs = array_ops.expand_dims(inputs, 2)
-      # pool on the W dim
-      pool_shape = (1, 1, 1) + self.pool_size
-      strides = (1, 1, 1) + self.strides
-      data_format = 'NCHW'
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=data_format)
-
-    if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 1)
-    else:
-      return array_ops.squeeze(outputs, 2)
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                      self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
-
-
 @tf_export('layers.AveragePooling1D')
-class AveragePooling1D(_Pooling1D):
+class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
   """Average Pooling layer for 1D inputs.
 
   Arguments:
@@ -119,8 +46,9 @@ class AveragePooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling1D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -165,7 +93,7 @@ def average_pooling1d(inputs, pool_size, strides,
 
 
 @tf_export('layers.MaxPooling1D')
-class MaxPooling1D(_Pooling1D):
+class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
   """Max Pooling layer for 1D inputs.
 
   Arguments:
@@ -186,8 +114,9 @@ class MaxPooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling1D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -231,79 +160,8 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-class _Pooling2D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling2D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 2, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      pool_shape = (1,) + self.pool_size + (1,)
-      strides = (1,) + self.strides + (1,)
-    else:
-      pool_shape = (1, 1) + self.pool_size
-      strides = (1, 1) + self.strides
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, 4))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
-    rows = utils.conv_output_length(rows, self.pool_size[0], self.padding,
-                                    self.strides[0])
-    cols = utils.conv_output_length(cols, self.pool_size[1], self.padding,
-                                    self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, input_shape[3]])
-
-
 @tf_export('layers.AveragePooling2D')
-class AveragePooling2D(_Pooling2D):
+class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
   """Average pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -328,8 +186,9 @@ class AveragePooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling2D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -373,7 +232,7 @@ def average_pooling2d(inputs,
 
 
 @tf_export('layers.MaxPooling2D')
-class MaxPooling2D(_Pooling2D):
+class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
   """Max pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -398,8 +257,9 @@ class MaxPooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling2D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -442,90 +302,8 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-class _Pooling3D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 3D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 3 integers:
-      (pool_depth, pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)`
-      while `channels_first` corresponds to
-      inputs with shape `(batch, channels, depth, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling3D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 3, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 3, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def call(self, inputs):
-    pool_shape = (1,) + self.pool_size + (1,)
-    strides = (1,) + self.strides + (1,)
-
-    if self.data_format == 'channels_first':
-      # TF does not support `channels_first` with 3D pooling operations,
-      # so we must handle this case manually.
-      # TODO(fchollet): remove this when TF pooling is feature-complete.
-      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper())
-
-    if self.data_format == 'channels_first':
-      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      len_dim1 = input_shape[2]
-      len_dim2 = input_shape[3]
-      len_dim3 = input_shape[4]
-    else:
-      len_dim1 = input_shape[1]
-      len_dim2 = input_shape[2]
-      len_dim3 = input_shape[3]
-    len_dim1 = utils.conv_output_length(len_dim1, self.pool_size[0],
-                                        self.padding, self.strides[0])
-    len_dim2 = utils.conv_output_length(len_dim2, self.pool_size[1],
-                                        self.padding, self.strides[1])
-    len_dim3 = utils.conv_output_length(len_dim3, self.pool_size[2],
-                                        self.padding, self.strides[2])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
-
-
 @tf_export('layers.AveragePooling3D')
-class AveragePooling3D(_Pooling3D):
+class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -552,8 +330,9 @@ class AveragePooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling3D, self).__init__(
-        nn.avg_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -599,7 +378,7 @@ def average_pooling3d(inputs,
 
 
 @tf_export('layers.MaxPooling3D')
-class MaxPooling3D(_Pooling3D):
+class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -626,8 +405,9 @@ class MaxPooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling3D, self).__init__(
-        nn.max_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 79529e86c383ce68b05015010be01df3355df691..3b156c36a2ff35fb9e05af1406d7b3f6cf883394 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -24,6 +24,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond as smart_module
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.util import nest
 
@@ -199,10 +200,10 @@ def smart_cond(pred, true_fn=None, false_fn=None, name=None):
     TypeError: If `true_fn` or `false_fn` is not callable.
   """
   if isinstance(pred, variables.Variable):
-    return control_flow_ops.cond(pred, true_fn=true_fn, false_fn=false_fn,
-                                 name=name)
-  return control_flow_ops.smart_cond(pred, true_fn=true_fn,
-                                     false_fn=false_fn, name=name)
+    return control_flow_ops.cond(
+        pred, true_fn=true_fn, false_fn=false_fn, name=name)
+  return smart_module.smart_cond(
+      pred, true_fn=true_fn, false_fn=false_fn, name=name)
 
 
 def constant_value(pred):
@@ -225,10 +226,10 @@ def constant_value(pred):
       pred = True
     elif pred == 0:
       pred = False
-  
+
   if isinstance(pred, variables.Variable):
     return None
-  return control_flow_ops.smart_constant_value(pred)
+  return smart_module.smart_constant_value(pred)
 
 
 def object_list_uid(object_list):
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index c941aad7bc63dbb891fbe78cd2a47dd6805bf231..7e94dda648166780af002ce6b979a751a0ced846 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -89,33 +88,5 @@ class ConvUtilsTest(test.TestCase):
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
 
-class GraphUtilsTest(test.TestCase):
-
-  def testGetReachableFromInputs(self):
-
-    with self.test_session():
-      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
-      x_1 = pl_1 + pl_2
-      x_2 = pl_2 * 2
-      x_3 = pl_3 + 1
-      x_4 = x_1 + x_2
-      x_5 = x_3 * pl_1
-
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1]),
-          {pl_1, x_1, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1, pl_2]),
-          {pl_1, pl_2, x_1, x_2, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_3]),
-          {pl_3, x_3, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([x_3]),
-          {x_3, x_5})
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
index 7f07deebef3d8e8f24f73a42f29f4ade4cae568d..77fa2c1f66d2214dbb08e4d0ad3437fa4fe02822 100644
--- a/tensorflow/python/lib/core/bfloat16.cc
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -616,8 +616,8 @@ bool Initialize() {
   };
 
   // Comparisons
-  const std::array<int, 3> compare_types = {npy_bfloat16_, npy_bfloat16_,
-                                            NPY_BOOL};
+  const std::array<int, 3> compare_types = {
+      {npy_bfloat16_, npy_bfloat16_, NPY_BOOL}};
 
   if (!register_ufunc("equal", CompareUFunc<Bfloat16EqFunctor>,
                       compare_types)) {
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index 994af69386b278f6b88c051f898cd6a9dc607f3f..a07e305ffbe8b4c4736c3231f6d1d7872d91e04e 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -267,7 +267,9 @@ gtl::InlinedVector<npy_intp, 4> GetPyArrayDimensionsForTensor(
   const int ndims = TF_NumDims(tensor);
   gtl::InlinedVector<npy_intp, 4> dims(ndims);
   if (TF_TensorType(tensor) == TF_RESOURCE) {
-    dims[0] = TF_TensorByteSize(tensor);
+    CHECK_EQ(ndims, 0)
+        << "Fetching of non-scalar resource tensors is not supported.";
+    dims.push_back(TF_TensorByteSize(tensor));
     *nelems = dims[0];
   } else {
     *nelems = 1;
diff --git a/tensorflow/python/lib/core/py_exception_registry.cc b/tensorflow/python/lib/core/py_exception_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6637de632b48e4dfc8219543161464b10dcdbe12
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+
+#include <Python.h>
+
+namespace tensorflow {
+
+PyExceptionRegistry* PyExceptionRegistry::singleton_ = nullptr;
+
+void PyExceptionRegistry::Init(PyObject* code_to_exc_type_map) {
+  DCHECK(singleton_ == nullptr) << "PyExceptionRegistry::Init() already called";
+  singleton_ = new PyExceptionRegistry;
+
+  DCHECK(PyDict_Check(code_to_exc_type_map));
+  PyObject* key;
+  PyObject* value;
+  Py_ssize_t pos = 0;
+  while (PyDict_Next(code_to_exc_type_map, &pos, &key, &value)) {
+    TF_Code code = static_cast<TF_Code>(PyLong_AsLong(key));
+    singleton_->exc_types_[code] = value;
+    // The exception classes should also have the lifetime of the process, but
+    // incref just in case.
+    Py_INCREF(value);
+  }
+}
+
+PyObject* PyExceptionRegistry::Lookup(TF_Code code) {
+  DCHECK(singleton_ != nullptr) << "Must call PyExceptionRegistry::Init() "
+                                   "before PyExceptionRegistry::Lookup()";
+  DCHECK_NE(code, TF_OK);
+  DCHECK(singleton_->exc_types_.find(code) != singleton_->exc_types_.end())
+      << "Unknown error code passed to PyExceptionRegistry::Lookup: " << code;
+  return singleton_->exc_types_[code];
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_exception_registry.h b/tensorflow/python/lib/core/py_exception_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b0f23b548c16130dee9a8ec086ae0283f1506e1
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+
+#include <map>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/platform/logging.h"
+
+#ifndef PyObject_HEAD
+struct _object;
+typedef _object PyObject;
+#endif
+
+namespace tensorflow {
+
+// Global registry mapping C API error codes to the corresponding custom Python
+// exception type. This is used to expose the exception types to C extension
+// code (i.e. so we can raise custom exceptions via SWIG).
+//
+// Init() must be called exactly once at the beginning of the process before
+// Lookup() can be used.
+//
+// Example usage:
+//   TF_Status* status = TF_NewStatus();
+//   TF_Foo(..., status);
+//
+//   if (TF_GetCode(status) != TF_OK) {
+//     PyObject* exc_type = PyExceptionRegistry::Lookup(TF_GetCode(status));
+//     // Arguments to OpError base class. Set `node_def` and `op` to None.
+//     PyObject* args =
+//       Py_BuildValue("sss", nullptr, nullptr, TF_Message(status));
+//     PyErr_SetObject(exc_type, args);
+//     Py_DECREF(args);
+//     TF_DeleteStatus(status);
+//     return NULL;
+//   }
+class PyExceptionRegistry {
+ public:
+  // Initializes the process-wide registry. Should be called exactly once near
+  // the beginning of the process. The arguments are the various Python
+  // exception types (e.g. `cancelled_exc` corresponds to
+  // errors.CancelledError).
+  static void Init(PyObject* code_to_exc_type_map);
+
+  // Returns the Python exception type corresponding to `code`. Init() must be
+  // called before using this function. `code` should not be TF_OK.
+  static PyObject* Lookup(TF_Code code);
+
+ private:
+  static PyExceptionRegistry* singleton_;
+  PyExceptionRegistry() = default;
+
+  // Maps error codes to the corresponding Python exception type.
+  std::map<TF_Code, PyObject*> exc_types_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
diff --git a/tensorflow/python/lib/core/py_exception_registry.i b/tensorflow/python/lib/core/py_exception_registry.i
new file mode 100644
index 0000000000000000000000000000000000000000..e872b74985e03e203c8aeb8fdec8a3e67f03e1f9
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.i
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+%}
+
+%ignoreall
+
+%unignore tensorflow::PyExceptionRegistry;
+%unignore tensorflow::PyExceptionRegistry::Init;
+
+%include "tensorflow/python/lib/core/py_exception_registry.h"
+%unignoreall
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index e0422ef80add42307268be2743e668eb8c8acb68..8c6bb7955a4e29daddd92860e41d7105192eb24b 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -79,10 +79,11 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
     const Tensor& t = call->ins[i];
     if (call->eager) {
       if (call->gpu) {
-        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, call->device));
+        arg = EagerTensorFromHandle(
+            new TFE_TensorHandle(t, call->device, call->device));
       } else {
         // TFE_TensorHandle assumes that CPU is identified by `nullptr`.
-        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr));
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr, nullptr));
       }
       if (arg == nullptr) {
         return errors::Internal("Unable to procure EagerTensor from Tensor.");
@@ -125,6 +126,9 @@ Status NumericNpDTypeToTfDType(const int np, DataType* tf) {
     case NPY_INT8:
       *tf = DT_INT8;
       break;
+    case NPY_UINT16:
+      *tf = DT_UINT16;
+      break;
     case NPY_INT16:
       *tf = DT_INT16;
       break;
@@ -163,9 +167,9 @@ bool IsSingleNone(PyObject* obj) {
 }
 
 // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
-void ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
-                                  Tensor* output_tensor) {
-  *output_tensor = EagerTensor_Handle(eager_tensor)->t;
+tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
+                                                const Tensor** output_tensor) {
+  return EagerTensor_Handle(eager_tensor)->handle->Tensor(output_tensor);
 }
 
 // Calls the registered py function through the trampoline.
@@ -219,7 +223,9 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
       if (call->eager) {
         const PyObject* item = PyList_GetItem(result, i);
         if (EagerTensor_CheckExact(item)) {
-          ExtractTensorFromEagerTensor(item, &t);
+          const Tensor* tensor = nullptr;
+          s = ExtractTensorFromEagerTensor(item, &tensor);
+          if (s.ok()) t = *tensor;
         } else {
           s = errors::FailedPrecondition(
               "Expected EagerTensor, found PyObject of type: ",
@@ -237,10 +243,10 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
   } else if (EagerTensor_CheckExact(result) || result == Py_None) {
     // result is an `EagerTensor` or `None`.
     DCHECK(call->eager);
-    Tensor t;
     if (result != Py_None) {
-      ExtractTensorFromEagerTensor(result, &t);
-      call->out.push_back(t);
+      const Tensor* t = nullptr;
+      s = ExtractTensorFromEagerTensor(result, &t);
+      if (s.ok()) call->out.push_back(*t);
     }
   } else if (PyArray_Check(result)) {
     // `result` is a NumPy array.
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 317bdc2e14747583f372808f48a5928273f5570a..32ea737a99067877e7f527e44d261a0b7c2eb07e 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
 #include "tensorflow/python/lib/core/py_util.h"
@@ -77,13 +78,14 @@ string PyRepr(PyObject* obj) {
 bool IsPyDimension(PyObject* obj) {
   const char* tp_name = obj->ob_type->tp_name;
   if (strcmp(tp_name, "Dimension") != 0) return false;
-  bool ret =
-      StringPiece(PyRepr(PyType(obj)))
-          .ends_with("tensorflow.python.framework.tensor_shape.Dimension'>");
+  bool ret = str_util::EndsWith(
+      PyRepr(PyType(obj)),
+      "tensorflow.python.framework.tensor_shape.Dimension'>");
   return ret;
 }
 
 Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
+  std::vector<Safe_PyObjectPtr> refs_to_clean;
   while (true) {
     // We test strings first, in case a string is considered a sequence.
     if (IsPyString(obj)) {
@@ -93,6 +95,7 @@ Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
       if (length > 0) {
         shape->AddDim(length);
         obj = PySequence_GetItem(obj, 0);
+        refs_to_clean.push_back(make_safe(obj));
         continue;
       } else if (length == 0) {
         shape->AddDim(length);
@@ -167,14 +170,15 @@ const char ErrorFoundFloat[] =
     if (shape.dims() > 1) {                                               \
       /* Iterate over outer dim, and recursively convert each element. */ \
       const int64 s = shape.dim_size(0);                                  \
-      if (TF_PREDICT_FALSE(s != PySequence_Length(obj))) {                \
+      Safe_PyObjectPtr seq = make_safe(PySequence_Fast(obj, ""));         \
+      if (TF_PREDICT_FALSE(s != PySequence_Fast_GET_SIZE(seq.get()))) {   \
         return ErrorRectangular;                                          \
       }                                                                   \
       TensorShape rest = shape;                                           \
       rest.RemoveDim(0);                                                  \
       for (int64 i = 0; i < s; ++i) {                                     \
-        const char* error =                                               \
-            FUNCTION##Helper(PySequence_GetItem(obj, i), rest, buf);      \
+        const char* error = FUNCTION##Helper(                             \
+            PySequence_Fast_GET_ITEM(seq.get(), i), rest, buf);           \
         if (TF_PREDICT_FALSE(error != nullptr)) return error;             \
       }                                                                   \
     } else {                                                              \
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
index 2635694e23c07dd8e75d4bb0cfb9e83a2042d921..00cbf0c532cf80d3bb27afe168ecde963ba3591d 100644
--- a/tensorflow/python/lib/core/py_util.cc
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -41,6 +41,55 @@ const char* ClassName(PyObject* py) {
 
 }  // end namespace
 
+// Returns a PyObject containing a string, or null
+void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback,
+                        string* out) {
+  // The "traceback" module is assumed to be imported already by script_ops.py.
+  PyObject* tb_module = PyImport_AddModule("traceback");
+
+  if (!tb_module) {
+    return;
+  }
+
+  PyObject* format_exception =
+      PyObject_GetAttrString(tb_module, "format_exception");
+
+  if (!format_exception) {
+    return;
+  }
+
+  if (!PyCallable_Check(format_exception)) {
+    Py_DECREF(format_exception);
+    return;
+  }
+
+  PyObject* ret_val = PyObject_CallFunctionObjArgs(format_exception, ptype,
+                                                   pvalue, ptraceback, nullptr);
+  Py_DECREF(format_exception);
+
+  if (!ret_val) {
+    return;
+  }
+
+  if (!PyList_Check(ret_val)) {
+    Py_DECREF(ret_val);
+    return;
+  }
+
+  Py_ssize_t n = PyList_GET_SIZE(ret_val);
+  for (Py_ssize_t i = 0; i < n; ++i) {
+    PyObject* v = PyList_GET_ITEM(ret_val, i);
+#if PY_MAJOR_VERSION < 3
+    strings::StrAppend(out, PyString_AS_STRING(v), "\n");
+#else
+    strings::StrAppend(out, PyUnicode_AsUTF8(v), "\n");
+#endif
+  }
+
+  // Iterate through ret_val.
+  Py_DECREF(ret_val);
+}
+
 string PyExceptionFetch() {
   CHECK(PyErr_Occurred())
       << "Must only call PyExceptionFetch after an exception.";
@@ -52,14 +101,20 @@ string PyExceptionFetch() {
   string err = ClassName(ptype);
   if (pvalue) {
     PyObject* str = PyObject_Str(pvalue);
+
     if (str) {
 #if PY_MAJOR_VERSION < 3
-      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
+      strings::StrAppend(&err, ": ", PyString_AS_STRING(str), "\n");
 #else
-      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
+      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str), "\n");
 #endif
       Py_DECREF(str);
+    } else {
+      strings::StrAppend(&err, "(unknown error message)\n");
     }
+
+    TryAppendTraceback(ptype, pvalue, ptraceback, &err);
+
     Py_DECREF(pvalue);
   }
   Py_DECREF(ptype);
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index a751607aaa1f47ca7c08674eca2b27ee0cafa3d2..223858edfa84eaa1c7879a9774dcc836de4f4672 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -485,6 +485,11 @@ class FileIoTest(test.TestCase):
     f.flush()
     self.assertEqual(content, f.read(len(content) + 1))
 
+  def testUTF8StringPathExists(self):
+    file_path = os.path.join(self._base_dir, "UTF8测试_file_exist")
+    file_io.write_string_to_file(file_path, "testing")
+    v = file_io.file_exists(file_path)
+    self.assertEqual(v, True)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/lib/io/py_record_reader.cc b/tensorflow/python/lib/io/py_record_reader.cc
index 5fcb51b3b25e49192f651c648f48a1760ed23084..9500fc6a7c49d13ce0d6954f237f43ffba0869a0 100644
--- a/tensorflow/python/lib/io/py_record_reader.cc
+++ b/tensorflow/python/lib/io/py_record_reader.cc
@@ -43,9 +43,10 @@ PyRecordReader* PyRecordReader::New(const string& filename, uint64 start_offset,
   reader->offset_ = start_offset;
   reader->file_ = file.release();
 
+  static const uint64 kReaderBufferSize = 16 * 1024 * 1024;
   RecordReaderOptions options =
       RecordReaderOptions::CreateRecordReaderOptions(compression_type_string);
-
+  options.buffer_size = kReaderBufferSize;
   reader->reader_ = new RecordReader(reader->file_, options);
   return reader;
 }
diff --git a/tensorflow/python/lib/io/python_io.py b/tensorflow/python/lib/io/python_io.py
index b92cfe8f801341100a01d6281d496093f121673c..aec12ab3eaaa9cfbcb635548c5185a054dea2e15 100644
--- a/tensorflow/python/lib/io/python_io.py
+++ b/tensorflow/python/lib/io/python_io.py
@@ -16,11 +16,6 @@
 """Python functions for directly manipulating TFRecord-formatted files.
 
 See the @{$python/python_io} guide.
-
-@@TFRecordWriter
-@@tf_record_iterator
-@@TFRecordCompressionType
-@@TFRecordOptions
 """
 
 from __future__ import absolute_import
@@ -31,8 +26,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.lib.io.tf_record import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 48ea107a146c2714f7b59f53abbcd8b60dbf2fd4..bf2d6f68b55d78f9570d3854804e3d1316176c99 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -75,14 +75,15 @@ def tf_record_iterator(path, options=None):
 
   if reader is None:
     raise IOError("Could not open %s." % path)
-  while True:
-    try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        reader.GetNext(status)
-    except errors.OutOfRangeError:
-      break
-    yield reader.record()
-  reader.Close()
+  try:
+    while True:
+      try:
+        reader.GetNext()
+      except errors.OutOfRangeError:
+        break
+      yield reader.record()
+  finally:
+    reader.Close()
 
 
 @tf_export("python_io.TFRecordWriter")
diff --git a/tensorflow/python/ops/accumulate_n_benchmark.py b/tensorflow/python/ops/accumulate_n_benchmark.py
index c58d36f39705ecf0f24214ce4ba4574e70a93e77..a709066cae4da2811b3e98d2e93bf44ec12dcee6 100644
--- a/tensorflow/python/ops/accumulate_n_benchmark.py
+++ b/tensorflow/python/ops/accumulate_n_benchmark.py
@@ -39,7 +39,7 @@ from tensorflow.python.platform import test
 class AccumulateNBenchmark(test.Benchmark):
 
   def _AccumulateNTemplate(self, inputs, init, shape, validate_shape):
-    var = gen_state_ops._temporary_variable(
+    var = gen_state_ops.temporary_variable(
         shape=shape, dtype=inputs[0].dtype.base_dtype)
     ref = state_ops.assign(var, init, validate_shape=validate_shape)
     update_ops = [
@@ -47,8 +47,7 @@ class AccumulateNBenchmark(test.Benchmark):
             ref, tensor, use_locking=True).op for tensor in inputs
     ]
     with ops.control_dependencies(update_ops):
-      return gen_state_ops._destroy_temporary_variable(
-          ref, var_name=var.op.name)
+      return gen_state_ops.destroy_temporary_variable(ref, var_name=var.op.name)
 
   def _AccumulateNInitializedWithFirst(self, inputs):
     return self._AccumulateNTemplate(
@@ -60,7 +59,7 @@ class AccumulateNBenchmark(test.Benchmark):
   def _AccumulateNInitializedWithMerge(self, inputs):
     return self._AccumulateNTemplate(
         inputs,
-        init=array_ops.zeros_like(gen_control_flow_ops._merge(inputs)[0]),
+        init=array_ops.zeros_like(gen_control_flow_ops.merge(inputs)[0]),
         shape=tensor_shape.vector(0),
         validate_shape=False)
 
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 9745d38dc23dba806a2d0dd2ef588a5a950aa05c..3678bd4c1f6a4500622b6d9e8334cb1ebae46578 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -80,7 +80,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
 
   def _ExtractInputShapes(inputs):
     """Extract the shapes of a set of input tensors."""
-    if not context.in_graph_mode():
+    if context.executing_eagerly():
       return array_ops.shape_n(inputs)
     sizes = []
     fully_known = True
@@ -106,7 +106,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
 
   out_grads = []
   if isinstance(grad, ops.Tensor):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       # Using mod here for convenience since concat_dim is already verified
       # in concat implementation to be within the allowed [-rank, rank) range.
       non_neg_concat_dim = (
@@ -139,7 +139,6 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
       # on CPUs and a Maxwell TitanX.  A speedup was seen in a large majority of
       # cases when switching implementations at N=16, but it is possible that
       # there will be a small number of performance regressions.
-      # pylint: disable=protected-access
       if len(sizes) > 16:
         # extract the size of each input along the concat dimension
         sizes = array_ops.squeeze(
@@ -148,10 +147,9 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
                 [1, -1]))
         out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
       else:
-        offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
+        offset = gen_array_ops.concat_offset(non_neg_concat_dim, sizes)
         for (begin, size) in zip(offset, sizes):
           out_grads.append(array_ops.slice(grad, begin, size))
-      # pylint: enable=protected-access
   elif isinstance(grad, ops.IndexedSlices):
     # Using mod here for convenience since concat_dim is already verified
     # in concat implementation to be within the allowed [-rank, rank) range.
@@ -198,7 +196,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
             array_ops.where(
                 math_ops.logical_and(grad.indices >= start,
                                      grad.indices < end)),
-            squeeze_dims=[1])
+            axis=[1])
         new_indices = array_ops.gather(grad.indices, indices_to_select) - start
         new_values = array_ops.gather(grad.values, indices_to_select)
         out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
@@ -257,10 +255,15 @@ def _SliceGrad(op, grad):
 @ops.RegisterGradient("StridedSlice")
 def _StridedSliceGrad(op, grad):
   """Gradient for StridedSlice op."""
-  x = array_ops.shape(op.inputs[0])
   begin = op.inputs[1]
   end = op.inputs[2]
   strides = op.inputs[3]
+  # StridedSliceGrad requires `x`, `begin`, `end` and `strides` to be of the
+  # same dtype so we build a shape of the same type as other args.
+  # Note that the choice of `begin` for specifying `out_type` is arbitrary.
+  # We could choose any of {begin|end|strides}.dtype since they are required to
+  # be the same.
+  x = array_ops.shape(op.inputs[0], out_type=begin.dtype)
 
   return array_ops.strided_slice_grad(
       x,
@@ -430,7 +433,7 @@ def _GatherV2Grad(op, grad):
 
   # For axis 0 gathers, build an appropriately shaped IndexedSlices.
   if axis_static == 0:
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       params_tail_shape = params_shape.cpu()[1:]
     else:
       params_tail_shape = params_shape[1:]
@@ -580,7 +583,7 @@ def _TileGrad(op, grad):
   axes = math_ops.range(0, array_ops.size(split_shape), 2)
   input_grad = math_ops.reduce_sum(array_ops.reshape(grad, split_shape), axes)
   # Fix shape inference
-  if context.in_graph_mode():
+  if not context.executing_eagerly():
     input_grad.set_shape(op.inputs[0].get_shape())
   return [input_grad, None]
 
@@ -627,9 +630,7 @@ def _ReverseSequenceGrad(op, grad):
 @ops.RegisterGradient("Reverse")
 def _ReverseGrad(op, grad):
   reverse_dims = op.inputs[1]
-  # pylint: disable=protected-access
-  return gen_array_ops._reverse(grad, reverse_dims), None
-  # pylint: enable=protected-access
+  return gen_array_ops.reverse(grad, reverse_dims), None
 
 
 @ops.RegisterGradient("ReverseV2")
@@ -700,17 +701,13 @@ ops.NotDifferentiable("OneHot")
 @ops.RegisterGradient("MirrorPad")
 def _MirrorPadGrad(op, grad):
   mode = op.get_attr("mode")
-  # pylint: disable=protected-access
-  return [gen_array_ops._mirror_pad_grad(grad, op.inputs[1], mode=mode), None]
-  # pylint: enable=protected-access
+  return [gen_array_ops.mirror_pad_grad(grad, op.inputs[1], mode=mode), None]
 
 
 @ops.RegisterGradient("MirrorPadGrad")
 def _MirrorPadGradGrad(op, grad):
   mode = op.get_attr("mode")
-  # pylint: disable=protected-access
-  return [gen_array_ops._mirror_pad(grad, op.inputs[1], mode=mode), None]
-  # pylint: enable=protected-access
+  return [gen_array_ops.mirror_pad(grad, op.inputs[1], mode=mode), None]
 
 
 @ops.RegisterGradient("QuantizeAndDequantize")
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index d63a9ea0ddef619c8a9bd77ac3d66e03b5ad7ec3..e235047aff39f6b086c3f63b18e28ed1f405de9b 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -16,69 +16,6 @@
 """Support for manipulating tensors.
 
 See the @{$python/array_ops} guide.
-
-@@string_to_number
-@@to_double
-@@to_float
-@@to_bfloat16
-@@to_int32
-@@to_int64
-@@cast
-@@bitcast
-@@saturate_cast
-@@broadcast_dynamic_shape
-@@broadcast_static_shape
-@@shape
-@@shape_n
-@@size
-@@rank
-@@reshape
-@@squeeze
-@@expand_dims
-@@unravel_index
-@@meshgrid
-@@slice
-@@strided_slice
-@@split
-@@tile
-@@pad
-@@concat
-@@stack
-@@parallel_stack
-@@unstack
-@@reverse_sequence
-@@reverse
-@@reverse_v2
-@@transpose
-@@extract_image_patches
-@@space_to_batch_nd
-@@space_to_batch
-@@required_space_to_batch_paddings
-@@batch_to_space_nd
-@@batch_to_space
-@@space_to_depth
-@@depth_to_space
-@@gather
-@@gather_nd
-@@unique_with_counts
-@@scatter_nd
-@@dynamic_partition
-@@dynamic_stitch
-@@boolean_mask
-@@one_hot
-@@sequence_mask
-@@dequantize
-@@quantize
-@@quantize_v2
-@@quantized_concat
-@@setdiff1d
-@@guarantee_const
-@@fake_quant_with_min_max_args
-@@fake_quant_with_min_max_args_gradient
-@@fake_quant_with_min_max_vars
-@@fake_quant_with_min_max_vars_gradient
-@@fake_quant_with_min_max_vars_per_channel
-@@fake_quant_with_min_max_vars_per_channel_gradient
 """
 
 from __future__ import absolute_import
@@ -128,19 +65,23 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
-  if context.in_graph_mode():
-    return gen_array_ops.identity(input, name=name)
-  else:
+  if context.executing_eagerly():
     input = ops.convert_to_tensor(input)
     in_device = input.device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
-    if context.context().device_name != in_device:
+    context_device = context.context().device_name
+    if not context_device:
+      context_device = "/job:localhost/replica:0/task:0/device:CPU:0"
+    if context_device != in_device:
       return input._copy()  # pylint: disable=protected-access
     return input
+  else:
+    return gen_array_ops.identity(input, name=name)
 
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export("expand_dims")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -190,12 +131,8 @@ def expand_dims(input, axis=None, name=None, dim=None):
   Raises:
     ValueError: if both `dim` and `axis` are specified.
   """
-  # TODO(aselle): Remove argument dim
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("can't specify both 'dim' and 'axis'")
-    axis = dim
-  return gen_array_ops._expand_dims(input, axis, name)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
+  return gen_array_ops.expand_dims(input, axis, name)
 
 
 # pylint: enable=redefined-builtin,protected-access
@@ -208,28 +145,25 @@ def expand_dims(input, axis=None, name=None, dim=None):
     "This op will be removed after the deprecation date. "
     "Please switch to tf.setdiff1d().")
 def listdiff(x, y, out_idx=None, name=None):
-  return gen_array_ops._list_diff(x, y, out_idx, name)
+  return gen_array_ops.list_diff(x, y, out_idx, name)
 
 
-listdiff.__doc__ = gen_array_ops._list_diff.__doc__ + "\n" + listdiff.__doc__
+listdiff.__doc__ = gen_array_ops.list_diff.__doc__ + "\n" + listdiff.__doc__
 
 # pylint: enable=protected-access
 
 
-# pylint: disable=undefined-variable,protected-access
+# pylint: disable=undefined-variable
 @tf_export("setdiff1d")
 def setdiff1d(x, y, index_dtype=dtypes.int32, name=None):
-  return gen_array_ops._list_diff(x, y, index_dtype, name)
+  return gen_array_ops.list_diff(x, y, index_dtype, name)
 
 
-setdiff1d.__doc__ = gen_array_ops._list_diff.__doc__
-
-# pylint: enable=protected-access
+setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__
 
 
 @tf_export("broadcast_dynamic_shape")
 def broadcast_dynamic_shape(shape_x, shape_y):
-  # pylint: disable=protected-access
   """Returns the broadcasted dynamic shape between `shape_x` and `shape_y`.
 
   Args:
@@ -239,8 +173,7 @@ def broadcast_dynamic_shape(shape_x, shape_y):
   Returns:
     A rank 1 integer `Tensor` representing the broadcasted shape.
   """
-  return gen_array_ops._broadcast_args(shape_x, shape_y)
-  # pylint: enable=protected-access
+  return gen_array_ops.broadcast_args(shape_x, shape_y)
 
 
 @tf_export("broadcast_static_shape")
@@ -306,7 +239,7 @@ def shape_internal(input, name=None, optimize=True, out_type=dtypes.int32):
                           sparse_tensor.SparseTensorValue)):
       return gen_math_ops.cast(input.dense_shape, out_type)
     else:
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         input_tensor = ops.convert_to_tensor(input)
         input_shape = input_tensor.get_shape()
         if optimize and input_shape.is_fully_defined():
@@ -331,7 +264,7 @@ def shape_n(input, out_type=dtypes.int32, name=None):
   """
 
   output = gen_array_ops.shape_n(input, out_type=out_type, name=name)
-  if context.in_graph_mode():
+  if not context.executing_eagerly():
     for i, input_tensor in enumerate(input):
       input_tensor = ops.convert_to_tensor(input_tensor)
       input_shape = input_tensor.get_shape()
@@ -386,16 +319,25 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
   Returns:
     A `Tensor` of type `out_type`. Defaults to `tf.int32`.
   """
+  if context.executing_eagerly() and not isinstance(
+      input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
+    input = ops.convert_to_tensor(input)
+    np_out_type = out_type.as_numpy_dtype
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
                           sparse_tensor.SparseTensorValue)):
-      return gen_math_ops._prod(
+      return gen_math_ops.prod(
           gen_math_ops.cast(input.dense_shape, out_type), 0, name=name)
     else:
       input_tensor = ops.convert_to_tensor(input)
       input_shape = input_tensor.get_shape()
-      if optimize and input_shape.is_fully_defined():
-        return constant(input_shape.num_elements(), out_type, name=name)
+      if optimize:
+        if input_shape.is_fully_defined():
+          return constant(input_shape.num_elements(), out_type, name=name)
+        if input_shape.dims and any(dim == 0 for dim in input_shape.dims):
+          return constant(0, out_type, name=name)
       return gen_array_ops.size(input, name=name, out_type=out_type)
 
 
@@ -777,7 +719,7 @@ def strided_slice(input_,
         new_axis_mask=new_axis_mask,
         shrink_axis_mask=shrink_axis_mask)
 
-  if context.in_graph_mode():
+  if not context.executing_eagerly():
     # TODO(apassos) In eager mode assignment will be done by overriding
     # __setitem__ instead.
     op.assign = assign
@@ -788,8 +730,8 @@ def _SliceHelperVar(var, slice_spec):
   """Creates a slice helper object given a variable.
 
   This allows creating a sub-tensor from part of the current contents
-  of a variable.  See ${tf.Tensor$`Tensor.__getitem__`}
-  for detailed examples of slicing.
+  of a variable. See @{tf.Tensor.__getitem__} for detailed examples
+  of slicing.
 
   This function in addition also allows assignment to a sliced range.
   This is similar to `__setitem__` functionality in Python. However,
@@ -879,7 +821,7 @@ def parallel_stack(values, name="parallel_stack"):
     output_shape = tensor_shape.TensorShape([len(values)])
     output_shape = output_shape.concatenate(value_shape)
     # expand_dims converts concat to stack.
-    return gen_array_ops._parallel_concat(
+    return gen_array_ops.parallel_concat(
         [expand_dims(value, 0) for value in values], shape=output_shape)
 
 
@@ -930,14 +872,14 @@ def stack(values, axis=0, name="stack"):
     except (TypeError, ValueError):
       pass  # Input list contains non-constant tensors
 
-  value_shape = ops.convert_to_tensor(values[0], name=name).get_shape()
-  if value_shape.ndims is not None:
-    expanded_num_dims = value_shape.ndims + 1
+  value_shape = ops.convert_to_tensor(values[0], name=name)._shape_tuple()  # pylint: disable=protected-access
+  if value_shape is not None:
+    expanded_num_dims = len(value_shape) + 1
     if axis < -expanded_num_dims or axis >= expanded_num_dims:
       raise ValueError("axis = %d not in [%d, %d)" % (axis, -expanded_num_dims,
                                                       expanded_num_dims))
 
-  return gen_array_ops._pack(values, axis=axis, name=name)
+  return gen_array_ops.pack(values, axis=axis, name=name)
 
 
 # pylint: disable=invalid-name
@@ -952,6 +894,11 @@ def _autopacking_helper(list_or_tuple, dtype, name):
   Returns:
     A `tf.Tensor` with value equivalent to `list_or_tuple`.
   """
+  if context.executing_eagerly():
+    # NOTE: Fast path when all the items are tensors, this doesn't do any type
+    # checking.
+    if all(ops.is_dense_tensor_like(elem) for elem in list_or_tuple):
+      return gen_array_ops.pack(list_or_tuple, name=name)
   must_pack = False
   converted_elems = []
   with ops.name_scope(name) as scope:
@@ -981,7 +928,7 @@ def _autopacking_helper(list_or_tuple, dtype, name):
           # convertible-to-tensor types, such as numpy arrays.
           elems_as_tensors.append(
               constant_op.constant(elem, dtype=dtype, name=str(i)))
-      return gen_array_ops._pack(elems_as_tensors, name=scope)
+      return gen_array_ops.pack(elems_as_tensors, name=scope)
     else:
       return converted_elems
 
@@ -1047,9 +994,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
     `value[:, i, :, :]` and each tensor in `output` will have shape `(A, C, D)`.
   Etc.
 
-  This is the opposite of stack.  The numpy equivalent is
-
-      tf.unstack(x, n) = np.unstack(x)
+  This is the opposite of stack.
 
   Args:
     value: A rank `R > 0` `Tensor` to be unstacked.
@@ -1076,7 +1021,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
       num = value_shape[axis].value
   if num is None:
     raise ValueError("Cannot infer num from shape %s" % value_shape)
-  return gen_array_ops._unpack(value, num=num, axis=axis, name=name)
+  return gen_array_ops.unpack(value, num=num, axis=axis, name=name)
 
 
 @tf_export("concat")
@@ -1173,7 +1118,7 @@ def concat(values, axis, name="concat"):
           dtype=dtypes.int32).get_shape().assert_is_compatible_with(
               tensor_shape.scalar())
       return identity(values[0], name=scope)
-  return gen_array_ops._concat_v2(values=values, axis=axis, name=name)
+  return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
 
 
 @tf_export("boolean_mask")
@@ -1222,7 +1167,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
 
   def _apply_mask_1d(reshaped_tensor, mask, axis=None):
     """Mask tensor along dimension 0 with a 1-D mask."""
-    indices = squeeze(where(mask), squeeze_dims=[1])
+    indices = squeeze(where(mask), axis=[1])
     return gather(reshaped_tensor, indices, axis=axis)
 
   with ops.name_scope(name, values=[tensor, mask]):
@@ -1241,8 +1186,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
     axis = 0 if axis is None else axis
     shape_tensor[axis:axis + ndims_mask].assert_is_compatible_with(shape_mask)
 
-    leading_size = gen_math_ops._prod(
-        shape(tensor)[axis:axis + ndims_mask], [0])
+    leading_size = gen_math_ops.prod(shape(tensor)[axis:axis + ndims_mask], [0])
     tensor = reshape(tensor,
                      concat([
                          shape(tensor)[:axis], [leading_size],
@@ -1306,10 +1250,22 @@ def unique(x, out_idx=dtypes.int32, name=None):
   # period (3 weeks) pass.
   # TODO(yongtang): The documentation should also
   # be updated when switch  to v2.
-  return gen_array_ops._unique(x, out_idx, name)
+  return gen_array_ops.unique(x, out_idx, name)
 
 
-unique.__doc__ = gen_array_ops._unique.__doc__
+unique.__doc__ = gen_array_ops.unique.__doc__
+
+
+@tf_export("unique_with_counts")
+def unique_with_counts(x, out_idx=dtypes.int32, name=None):
+  # TODO(yongtang): switch to v2 once API deprecation
+  # period (3 weeks) pass.
+  # TODO(yongtang): The documentation should also
+  # be updated when switch  to v2.
+  return gen_array_ops.unique_with_counts(x, out_idx, name)
+
+
+unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__
 
 
 @tf_export("split")
@@ -1363,20 +1319,18 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   """
   size_splits = ops.convert_to_tensor(num_or_size_splits)
   if size_splits._rank() == 0 and size_splits.dtype.is_integer:
-    return gen_array_ops._split(
+    return gen_array_ops.split(
         axis=axis, num_split=num_or_size_splits, value=value, name=name)
 
   if num is None:
-    num = size_splits._shape_tuple()[0]
+    size_splits_shape = size_splits._shape_tuple()
+    if size_splits_shape:
+      num = size_splits_shape[0]
     if num is None:
       raise ValueError("Cannot infer num from shape %s" % num_or_size_splits)
 
-  return gen_array_ops._split_v(
-      value=value,
-      size_splits=size_splits,
-      axis=axis,
-      num_split=num,
-      name=name)
+  return gen_array_ops.split_v(
+      value=value, size_splits=size_splits, axis=axis, num_split=num, name=name)
 
 
 @tf_export("transpose")
@@ -1390,6 +1344,14 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
   `a.dtype` is either `complex64` or `complex128` then the values of `a`
   are conjugated and transposed.
 
+  @compatibility(numpy)
+  In `numpy` transposes are memory-efficient constant time operations as they
+  simply return a new view of the same data with adjusted `strides`.
+
+  TensorFlow does not support strides, so `transpose` returns a new tensor with
+  the items permuted.
+  @end_compatibility
+
   For example:
 
   ```python
@@ -1438,7 +1400,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
   """
   with ops.name_scope(name, "transpose", [a]) as name:
     transpose_fn = (
-        gen_array_ops._conjugate_transpose
+        gen_array_ops.conjugate_transpose
         if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
       rank = gen_array_ops.rank(a)
@@ -1446,7 +1408,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
       ret = transpose_fn(a, perm, name=name)
       # NOTE(mrry): Setting the shape explicitly because
       #   reverse is not handled by the shape function.
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         input_shape = ret.op.inputs[0].get_shape().dims
         if input_shape is not None:
           ret.set_shape(input_shape[::-1])
@@ -1490,6 +1452,14 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
   tf.matmul(matrix, tf.matrix_transpose(b))
   ```
 
+  @compatibility(numpy)
+  In `numpy` transposes are memory-efficient constant time operations as they
+  simply return a new view of the same data with adjusted `strides`.
+
+  TensorFlow does not support strides, `matrix_transposes` return a new tensor
+  with the items permuted.
+  @end_compatibility
+
   Args:
     a: A `Tensor` with `rank >= 2`.
     name: A name for the operation (optional).
@@ -1528,6 +1498,16 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
 # pylint: enable=invalid-name
 
 
+def _constant_if_small(value, shape, dtype, name):
+  try:
+    if np.prod(shape) < 1000:
+      return constant(value, shape=shape, dtype=dtype, name=name)
+  except TypeError:
+    # Happens when shape is a Tensor, list with Tensor elements, etc.
+    pass
+  return None
+
+
 @tf_export("zeros")
 def zeros(shape, dtype=dtypes.float32, name=None):
   """Creates a tensor with all elements set to zero.
@@ -1558,8 +1538,15 @@ def zeros(shape, dtype=dtypes.float32, name=None):
       zero = ""
     else:
       zero = 0
+
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(zero, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
@@ -1603,12 +1590,12 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   with ops.name_scope(name, "zeros_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       if dtype is not None and dtype != tensor.dtype:
         return zeros(
             shape_internal(tensor, optimize=optimize), dtype=dtype, name=name)
       with ops.device(tensor.device):
-        return gen_array_ops._zeros_like(tensor, name=name)
+        return gen_array_ops.zeros_like(tensor, name=name)
 
     # For now, variant types must be created via zeros_like; as we need to
     # pass the input variant object to the proper zeros callback.
@@ -1623,7 +1610,7 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
       return zeros(
           shape_internal(tensor, optimize=optimize), dtype=dtype, name=name)
     else:
-      return gen_array_ops._zeros_like(tensor, name=name)
+      return gen_array_ops.zeros_like(tensor, name=name)
 
 
 @tf_export("ones_like")
@@ -1659,7 +1646,7 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
     if dtype is None:
       dtype = tensor.dtype
     ret = ones(ones_shape, dtype=dtype, name=name)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       ret.set_shape(tensor.get_shape())
     return ret
 
@@ -1691,6 +1678,12 @@ def ones(shape, dtype=dtypes.float32, name=None):
     one = True if dtype == dtypes.bool else 1
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(one, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
@@ -1725,8 +1718,10 @@ def placeholder(dtype, shape=None, name=None):
     print(sess.run(y, feed_dict={x: rand_array}))  # Will succeed.
   ```
 
-  @compatibility{eager} Placeholders are not compatible with eager execution.
-
+  @compatibility(eager)
+  Placeholders are not compatible with eager execution.
+  @end_compatibility
+  
   Args:
     dtype: The type of elements in the tensor to be fed.
     shape: The shape of the tensor to be fed (optional). If the shape is not
@@ -1740,11 +1735,11 @@ def placeholder(dtype, shape=None, name=None):
   Raises:
     RuntimeError: if eager execution is enabled
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError("tf.placeholder() is not compatible with "
                        "eager execution.")
 
-  return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
+  return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
 
 
 # pylint: disable=redefined-outer-name
@@ -1803,7 +1798,7 @@ def sparse_placeholder(dtype, shape=None, name=None):
   Raises:
     RuntimeError: if eager execution is enabled
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError("tf.placeholder() is not compatible with "
                        "eager execution.")
 
@@ -1888,21 +1883,21 @@ def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pyl
     # TODO(rjryan): Once the forward compatibility period (3 weeks) have passed
     # remove the "Pad" fallback here.
     if constant_values != 0:
-      result = gen_array_ops._pad_v2(
+      result = gen_array_ops.pad_v2(
           tensor, paddings, constant_values, name=name)
     else:
-      result = gen_array_ops._pad(tensor, paddings, name=name)
+      result = gen_array_ops.pad(tensor, paddings, name=name)
   elif mode == "REFLECT":
-    result = gen_array_ops._mirror_pad(
+    result = gen_array_ops.mirror_pad(
         tensor, paddings, mode="REFLECT", name=name)
   elif mode == "SYMMETRIC":
-    result = gen_array_ops._mirror_pad(
+    result = gen_array_ops.mirror_pad(
         tensor, paddings, mode="SYMMETRIC", name=name)
   else:
     raise ValueError("Unknown padding mode: %s" % mode)
 
   # Restore shape information where possible.
-  if context.in_graph_mode():
+  if not context.executing_eagerly():
     paddings_constant = tensor_util.constant_value(
         result.op.inputs[1], partial=True)
     input_shape = result.op.inputs[0].shape
@@ -2126,7 +2121,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"):
                             sparse_tensor.SparseTensorValue)):
     raise TypeError("Truth must be a SparseTensor.")
 
-  return gen_array_ops._edit_distance(
+  return gen_array_ops.edit_distance(
       hypothesis.indices,
       hypothesis.values,
       hypothesis.dense_shape,
@@ -2263,7 +2258,7 @@ def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=r
   return result
 
 
-space_to_batch.__doc__ = gen_array_ops._space_to_batch.__doc__
+space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__
 
 
 @tf_export("space_to_depth")
@@ -2293,7 +2288,7 @@ def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=rede
   return result
 
 
-batch_to_space.__doc__ = gen_array_ops._batch_to_space.__doc__
+batch_to_space.__doc__ = gen_array_ops.batch_to_space.__doc__
 
 
 @tf_export("one_hot")
@@ -2437,8 +2432,8 @@ def one_hot(indices,
       raise TypeError("dtype {0} of on_value does not match "
                       "dtype {1} of off_value".format(on_dtype, off_dtype))
 
-    return gen_array_ops._one_hot(indices, depth, on_value, off_value, axis,
-                                  name)
+    return gen_array_ops.one_hot(indices, depth, on_value, off_value, axis,
+                                 name)
 
 
 def _all_dimensions(x):
@@ -2520,6 +2515,8 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
 
 
 @tf_export("squeeze")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
   # pylint: disable=redefined-builtin
   """Removes dimensions of size 1 from the shape of a tensor.
@@ -2560,13 +2557,11 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   Raises:
     ValueError: When both `squeeze_dims` and `axis` are specified.
   """
-  if squeeze_dims is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'squeeze_dims' and 'axis'")
-    axis = squeeze_dims
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "squeeze_dims", squeeze_dims)
   if np.isscalar(axis):
     axis = [axis]
-  return gen_array_ops._squeeze(input, axis, name)
+  return gen_array_ops.squeeze(input, axis, name)
 
 
 @tf_export("where")
@@ -2617,7 +2612,7 @@ def where(condition, x=None, y=None, name=None):
           condition, preferred_dtype=dtypes.bool, name="condition")
       return gen_array_ops.where(condition=condition, name=name)
   elif x is not None and y is not None:
-    return gen_math_ops._select(condition=condition, x=x, y=y, name=name)
+    return gen_math_ops.select(condition=condition, x=x, y=y, name=name)
   else:
     raise ValueError("x and y must both be non-None or both be None.")
 
@@ -2661,12 +2656,17 @@ reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
 
 @tf_export("gather")
 def gather(params, indices, validate_indices=None, name=None, axis=0):
-  # TODO(rjryan): Remove "Gather" creation in favor of GatherV2 once the forward
-  # compatibility 3 week period has passed.
-  if axis == 0:
-    return gen_array_ops.gather(
-        params, indices, validate_indices=validate_indices, name=name)
-  else:
+  del validate_indices
+  if axis != 0:
+    # Note that we do a sparse_read here to avoid snapshotting the entire
+    # resource variable and doing a gather, which can be inefficient and lead to
+    # subtle race conditions. TODO(apassos) implement axis != 0 on sparse_read
+    return gen_array_ops.gather_v2(params, indices, axis, name=name)
+  try:
+    # TODO(apassos) find a less bad way of detecting resource variables without
+    # introducing a circular dependency.
+    return params.sparse_read(indices, name=name)
+  except AttributeError:
     return gen_array_ops.gather_v2(params, indices, axis, name=name)
 
 
diff --git a/tensorflow/python/ops/batch_norm_benchmark.py b/tensorflow/python/ops/batch_norm_benchmark.py
index c2ee2b383231333239c6e2d4e874a0ad1cdf493e..d83b81909755df8d187232e15ecda48b1cbf4557 100644
--- a/tensorflow/python/ops/batch_norm_benchmark.py
+++ b/tensorflow/python/ops/batch_norm_benchmark.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradients_impl
@@ -39,11 +40,10 @@ from tensorflow.python.platform import test
 def batch_norm_op(tensor, mean, variance, beta, gamma, scale):
   """Fused kernel for batch normalization."""
   # _batch_norm_with_global_normalization is deprecated in v9
-  ops.get_default_graph().graph_def_versions.producer = 8
+  test_util.set_producer_version(ops.get_default_graph(), 8)
   # pylint: disable=protected-access
-  return gen_nn_ops._batch_norm_with_global_normalization(tensor, mean,
-                                                          variance, beta, gamma,
-                                                          0.001, scale)
+  return gen_nn_ops._batch_norm_with_global_normalization(
+      tensor, mean, variance, beta, gamma, 0.001, scale)
   # pylint: enable=protected-access
 
 
diff --git a/tensorflow/python/ops/bitwise_ops.py b/tensorflow/python/ops/bitwise_ops.py
index e8e187e68f92d94b20e5e6ee0c707ea33a5e2f43..a1260b95cdb47b63cb7f5edb3a1942b7114dfd3a 100644
--- a/tensorflow/python/ops/bitwise_ops.py
+++ b/tensorflow/python/ops/bitwise_ops.py
@@ -13,15 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Operations for manipulating the binary representations of integers.
-
-@@bitwise_and
-@@bitwise_or
-@@bitwise_xor
-@@invert
-@@left_shift
-@@right_shift
-"""
+"""Operations for manipulating the binary representations of integers."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -32,7 +24,6 @@ from tensorflow.python.framework import ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_bitwise_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
 ops.NotDifferentiable("BitwiseAnd")
 ops.NotDifferentiable("BitwiseOr")
@@ -41,5 +32,3 @@ ops.NotDifferentiable("Invert")
 ops.NotDifferentiable("PopulationCount")
 ops.NotDifferentiable("LeftShift")
 ops.NotDifferentiable("RightShift")
-
-remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a2bcdd9d69b7a0aed1e7f3d3197cf6d7dd98451
--- /dev/null
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -0,0 +1,163 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_boosted_trees_ops
+from tensorflow.python.ops import resources
+
+# Re-exporting ops used by other modules.
+# pylint: disable=unused-import
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_calculate_best_gains_per_feature as calculate_best_gains_per_feature
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_stats_summary as make_stats_summary
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_predict as predict
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_training_predict as training_predict
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_update_ensemble as update_ensemble
+# pylint: enable=unused-import
+
+from tensorflow.python.training import saver
+
+
+class PruningMode(object):
+  NO_PRUNING, PRE_PRUNING, POST_PRUNING = range(0, 3)
+
+
+class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for TreeEnsemble."""
+
+  def __init__(self, resource_handle, create_op, name):
+    """Creates a _TreeEnsembleSavable object.
+
+    Args:
+      resource_handle: handle to the decision tree ensemble variable.
+      create_op: the op to initialize the variable.
+      name: the name to save the tree ensemble variable under.
+    """
+    stamp_token, serialized = (
+        gen_boosted_trees_ops.boosted_trees_serialize_ensemble(resource_handle))
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree ensemble variable. So we just pass an empty
+    # value.
+    slice_spec = ''
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        name + '_stamp'),
+        saver.BaseSaverBuilder.SaveSpec(serialized, slice_spec,
+                                        name + '_serialized'),
+    ]
+    super(_TreeEnsembleSavable, self).__init__(resource_handle, specs, name)
+    self._resource_handle = resource_handle
+    self._create_op = create_op
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return gen_boosted_trees_ops.boosted_trees_deserialize_ensemble(
+          self._resource_handle,
+          stamp_token=restored_tensors[0],
+          tree_ensemble_serialized=restored_tensors[1])
+
+
+class TreeEnsemble(object):
+  """Creates TreeEnsemble resource."""
+
+  def __init__(self, name, stamp_token=0, is_local=False, serialized_proto=''):
+    with ops.name_scope(name, 'TreeEnsemble') as name:
+      self._resource_handle = (
+          gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
+              container='', shared_name=name, name=name))
+      create_op = gen_boosted_trees_ops.boosted_trees_create_ensemble(
+          self.resource_handle,
+          stamp_token,
+          tree_ensemble_serialized=serialized_proto)
+      is_initialized_op = (
+          gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
+              self._resource_handle))
+      # Adds the variable to the savable list.
+      if not is_local:
+        saveable = _TreeEnsembleSavable(self.resource_handle, create_op,
+                                        self.resource_handle.name)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      resources.register_resource(
+          self.resource_handle,
+          create_op,
+          is_initialized_op,
+          is_shared=not is_local)
+
+  @property
+  def resource_handle(self):
+    return self._resource_handle
+
+  def get_stamp_token(self):
+    """Returns the current stamp token of the resource."""
+    stamp_token, _, _, _, _ = (
+        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+            self.resource_handle))
+    return stamp_token
+
+  def get_states(self):
+    """Returns states of the tree ensemble.
+
+    Returns:
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers and
+      range of the nodes in the latest layer.
+    """
+    (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+     nodes_range) = (
+         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+             self.resource_handle))
+    # Use identity to give names.
+    return (array_ops.identity(stamp_token, name='stamp_token'),
+            array_ops.identity(num_trees, name='num_trees'),
+            array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
+            array_ops.identity(
+                num_attempted_layers, name='num_attempted_layers'),
+            array_ops.identity(nodes_range, name='last_layer_nodes_range'))
+
+  def serialize(self):
+    """Serializes the ensemble into proto and returns the serialized proto.
+
+    Returns:
+      stamp_token: int64 scalar Tensor to denote the stamp of the resource.
+      serialized_proto: string scalar Tensor of the serialized proto.
+    """
+    return gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
+        self.resource_handle)
+
+  def deserialize(self, stamp_token, serialized_proto):
+    """Deserialize the input proto and resets the ensemble from it.
+
+    Args:
+      stamp_token: int64 scalar Tensor to denote the stamp of the resource.
+      serialized_proto: string scalar Tensor of the serialized proto.
+
+    Returns:
+      Operation (for dependencies).
+    """
+    return gen_boosted_trees_ops.boosted_trees_deserialize_ensemble(
+        self.resource_handle, stamp_token, serialized_proto)
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index 220ef1754d2e1a2d54a8962148b47806df48e98f..9ea1ea9c92c9b016a3f9126c89ee4dc1e73c9f27 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -77,7 +77,7 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       of each of `sampled_candidates`.
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_candidate_sampling_ops._uniform_candidate_sampler(
+  return gen_candidate_sampling_ops.uniform_candidate_sampler(
       true_classes, num_true, num_sampled, unique, range_max, seed=seed1,
       seed2=seed2, name=name)
 
@@ -136,7 +136,7 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       of each of `sampled_candidates`.
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_candidate_sampling_ops._log_uniform_candidate_sampler(
+  return gen_candidate_sampling_ops.log_uniform_candidate_sampler(
       true_classes, num_true, num_sampled, unique, range_max, seed=seed1,
       seed2=seed2, name=name)
 
@@ -193,7 +193,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
 
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_candidate_sampling_ops._learned_unigram_candidate_sampler(
+  return gen_candidate_sampling_ops.learned_unigram_candidate_sampler(
       true_classes, num_true, num_sampled, unique, range_max, seed=seed1,
       seed2=seed2, name=name)
 
@@ -283,7 +283,7 @@ def fixed_unigram_candidate_sampler(true_classes,
 
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_candidate_sampling_ops._fixed_unigram_candidate_sampler(
+  return gen_candidate_sampling_ops.fixed_unigram_candidate_sampler(
       true_classes, num_true, num_sampled, unique, range_max,
       vocab_file=vocab_file, distortion=distortion,
       num_reserved_ids=num_reserved_ids, num_shards=num_shards, shard=shard,
@@ -321,7 +321,7 @@ def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
       of each of `sampled_candidates`. All returned values are 1.0.
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_candidate_sampling_ops._all_candidate_sampler(
+  return gen_candidate_sampling_ops.all_candidate_sampler(
       true_classes, num_true, num_sampled, unique, seed=seed1, seed2=seed2,
       name=name)
 
@@ -370,6 +370,6 @@ def compute_accidental_hits(true_classes, sampled_candidates, num_true,
 
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_candidate_sampling_ops._compute_accidental_hits(
+  return gen_candidate_sampling_ops.compute_accidental_hits(
       true_classes, sampled_candidates, num_true, seed=seed1, seed2=seed2,
       name=name)
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 64567ac54ae43acf6f8b674c46525db7a6c4fab7..306055d2025f17904e3d4b2a342659a389f167a8 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -16,29 +16,6 @@
 """Asserts and Boolean Checks.
 
 See the @{$python/check_ops} guide.
-
-@@assert_negative
-@@assert_positive
-@@assert_non_negative
-@@assert_non_positive
-@@assert_equal
-@@assert_none_equal
-@@assert_near
-@@assert_less
-@@assert_less_equal
-@@assert_greater
-@@assert_greater_equal
-@@assert_rank
-@@assert_rank_at_least
-@@assert_rank_in
-@@assert_type
-@@assert_integer
-@@assert_proper_iterable
-@@assert_same_float_dtype
-@@assert_scalar
-@@is_non_decreasing
-@@is_numeric_tensor
-@@is_strictly_increasing
 """
 
 from __future__ import absolute_import
@@ -169,7 +146,7 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_negative', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         name = _shape_and_dtype_str(x)
       else:
         name = x.name
@@ -210,7 +187,7 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_positive', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         name = _shape_and_dtype_str(x)
       else:
         name = x.name
@@ -251,7 +228,7 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_non_negative', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         name = _shape_and_dtype_str(x)
       else:
         name = x.name
@@ -293,7 +270,7 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_non_positive', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         name = _shape_and_dtype_str(x)
       else:
         name = x.name
@@ -343,7 +320,7 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       eq = math_ops.equal(x, y)
       condition = math_ops.reduce_all(eq)
       if not condition:
@@ -363,27 +340,30 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
                          (x_sum, x_np[:x_sum],
                           y_sum, y_np[:y_sum]))
 
-        # Get the values that actually differed and their indices.
-        mask = math_ops.logical_not(eq)
-        indices = array_ops.where(mask)
-        indices_np = indices.numpy()
-        x_vals = array_ops.boolean_mask(x, mask)
-        y_vals = array_ops.boolean_mask(y, mask)
-        summarize = min(summarize, indices_np.shape[0])
+        index_and_values_str = ''
+        if x.shape == y.shape:
+          # If the shapes of x and y are the same,
+          # Get the values that actually differed and their indices.
+          # If shapes are different this information is more confusing
+          # than useful.
+          mask = math_ops.logical_not(eq)
+          indices = array_ops.where(mask)
+          indices_np = indices.numpy()
+          x_vals = array_ops.boolean_mask(x, mask)
+          y_vals = array_ops.boolean_mask(y, mask)
+          summarize = min(summarize, indices_np.shape[0])
+          index_and_values_str = (
+              'Indices of first %s different values:\n%s\n'
+              'Corresponding x values:\n%s\n'
+              'Corresponding y values:\n%s\n' %
+              (summarize, indices_np[:summarize],
+               x_vals.numpy().reshape((-1,))[:summarize],
+               y_vals.numpy().reshape((-1,))[:summarize]))
 
         raise errors.InvalidArgumentError(
             node_def=None, op=None,
-            message=('%s\nCondition x == y did not hold.\n'
-                     'Indices of first %s different values:\n%s\n'
-                     'Corresponding x values:\n%s\n'
-                     'Corresponding y values:\n%s\n'
-                     '%s'
-                     %
-                     (message or '',
-                      summarize, indices_np[:summarize],
-                      x_vals.numpy().reshape((-1,))[:summarize],
-                      y_vals.numpy().reshape((-1,))[:summarize],
-                      summary_msg)))
+            message=('%s\nCondition x == y did not hold.\n%s%s' %
+                     (message or '', index_and_values_str, summary_msg)))
       return
 
     if data is None:
@@ -435,7 +415,7 @@ def assert_none_equal(
   with ops.name_scope(name, 'assert_none_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       x_name = _shape_and_dtype_str(x)
       y_name = _shape_and_dtype_str(y)
     else:
@@ -512,7 +492,7 @@ def assert_near(
     rtol = ops.convert_to_tensor(rtol, name='rtol', dtype=x.dtype)
     atol = ops.convert_to_tensor(atol, name='atol', dtype=x.dtype)
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       x_name = _shape_and_dtype_str(x)
       y_name = _shape_and_dtype_str(y)
     else:
@@ -562,7 +542,7 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_less', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       x_name = _shape_and_dtype_str(x)
       y_name = _shape_and_dtype_str(y)
     else:
@@ -610,7 +590,7 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_less_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       x_name = _shape_and_dtype_str(x)
       y_name = _shape_and_dtype_str(y)
     else:
@@ -658,7 +638,7 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_greater', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       x_name = _shape_and_dtype_str(x)
       y_name = _shape_and_dtype_str(y)
     else:
@@ -708,7 +688,7 @@ def assert_greater_equal(x, y, data=None, summarize=None, message=None,
   with ops.name_scope(name, 'assert_greater_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       x_name = _shape_and_dtype_str(x)
       y_name = _shape_and_dtype_str(y)
     else:
@@ -808,7 +788,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
     static_condition = lambda actual_rank, given_rank: actual_rank == given_rank
     dynamic_condition = math_ops.equal
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       name = ''
     else:
       name = x.name
@@ -873,7 +853,7 @@ def assert_rank_at_least(
     static_condition = lambda actual_rank, given_rank: actual_rank >= given_rank
     dynamic_condition = math_ops.greater_equal
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       name = ''
     else:
       name = x.name
@@ -1001,7 +981,7 @@ def assert_rank_in(
     ranks = tuple([ops.convert_to_tensor(rank, name='rank') for rank in ranks])
     message = message or ''
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       name = ''
     else:
       name = x.name
@@ -1054,7 +1034,7 @@ def assert_integer(x, message=None, name=None):
   with ops.name_scope(name, 'assert_integer', [x]):
     x = ops.convert_to_tensor(x, name='x')
     if not x.dtype.is_integer:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         name = 'tensor'
       else:
         name = x.name
@@ -1087,12 +1067,11 @@ def assert_type(tensor, tf_type, message=None, name=None):
   with ops.name_scope(name, 'assert_type', [tensor]):
     tensor = ops.convert_to_tensor(tensor, name='tensor')
     if tensor.dtype != tf_type:
-      if context.in_graph_mode():
-        raise TypeError(
-            '%s  %s must be of type %s' % (message, tensor.name, tf_type))
+      if context.executing_eagerly():
+        raise TypeError('%s tensor must be of type %s' % (message, tf_type))
       else:
-        raise TypeError(
-            '%s tensor must be of type %s' % (message, tf_type))
+        raise TypeError('%s  %s must be of type %s' % (message, tensor.name,
+                                                       tf_type))
 
     return control_flow_ops.no_op('statically_determined_correct_type')
 
@@ -1240,7 +1219,7 @@ def assert_scalar(tensor, name=None):
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
     shape = tensor.get_shape()
     if shape.ndims != 0:
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         raise ValueError('Expected scalar shape, saw shape: %s.'
                          % (shape,))
       else:
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 49f8c665313562cb20dbe4494103ded16646c741..75c459a9cf10a90f6043d304b302e0a0806bf045 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -70,6 +71,35 @@ def clip_by_value(t, clip_value_min, clip_value_max,
     _ = t.shape.merge_with(t_max.shape)
 
   return t_max
+  # TODO(scottzhu): switch to use new implmentation in 2 weeks.
+    # return gen_math_ops.clip_by_value(
+    #     t, clip_value_min, clip_value_max, name=name)
+
+
+# TODO(scottzhu): switch to use new implmentation in 2 weeks.
+# @ops.RegisterGradient("ClipByValue")
+def _clip_by_value_grad(op, grad):
+  """Returns grad of clip_by_value."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  z = op.inputs[2]
+  gdtype = grad.dtype
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  sz = array_ops.shape(z)
+  gradshape = array_ops.shape(grad)
+  zeros = array_ops.zeros(gradshape, gdtype)
+  xymask = math_ops.less(x, y)
+  xzmask = math_ops.greater(x, z)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
+  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
+  ygrad = array_ops.where(xymask, grad, zeros)
+  zgrad = array_ops.where(xzmask, grad, zeros)
+  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
+  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
+  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
+  return (gx, gy, gz)
 
 
 @tf_export("clip_by_norm")
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index b9a93c3bedfff1f398e3b42cedf02a2f0a3ddd5c..c09154129f1a72965b351ad9b9c90f82c81bcdd2 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -12,12 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Confusion matrix related utilities.
-
-
-@@remove_squeezable_dimensions
-@@confusion_matrix
-"""
+"""Confusion matrix related utilities."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 97b57177b29986a006df992f4c0c2b79e11467aa..6a551deb5ba55871b3a3fb144a6ecd2a3cbfcbd8 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
@@ -28,7 +29,6 @@ from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,undefined-variable
 from tensorflow.python.ops.control_flow_ops import *
-from tensorflow.python.ops.gen_control_flow_ops import *
 # pylint: enable=wildcard-import
 
 
@@ -75,6 +75,11 @@ def _SwitchGrad(op, *grad):
     # At this point, we have created zero_grad guarded by the right switch.
     # Unfortunately, we may still get None here for not trainable data types.
     if zero_grad is None:
+      # For resource variables we get None always on the other branch, so bypass
+      # this.
+      if op.inputs[0].dtype == dtypes.resource:
+        return merge(
+            [grad[op_ctxt.branch]] * 2, name="cond_resource_grad")[0], None
       return None, None
     return merge(grad, name="cond_grad")[0], None
   else:
@@ -143,6 +148,7 @@ def _ExitGrad(op, grad):
   """Gradients for an exit op are calculated using an Enter op."""
   graph = ops.get_default_graph()
   # pylint: disable=protected-access
+  op_ctxt = op._get_control_flow_context()
   grad_ctxt = graph._get_control_flow_context()
   # pylint: enable=protected-access
   if not grad_ctxt.back_prop:
@@ -151,10 +157,8 @@ def _ExitGrad(op, grad):
     # no gradient computation.
     return None
 
-  # pylint: disable=protected-access
-  if op._get_control_flow_context().grad_state:
+  if op_ctxt.grad_state:
     raise TypeError("Second-order gradient for while loops not supported.")
-  # pylint: enable=protected-access
 
   if isinstance(grad, ops.Tensor):
     grad_ctxt.AddName(grad.name)
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 4f4a2e35db8de61b5f587f92a833b3094a530ba7..07d4ff7b02c70ee01fdd21f6725623b2a73705ff 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -15,36 +15,6 @@
 """Control Flow Operations.
 
 See the @{$python/control_flow_ops} guide.
-
-@@identity
-@@identity_n
-@@tuple
-@@group
-@@no_op
-@@count_up_to
-@@cond
-@@smart_cond
-@@case
-@@while_loop
-@@logical_and
-@@logical_not
-@@logical_or
-@@logical_xor
-@@equal
-@@not_equal
-@@less
-@@less_equal
-@@greater
-@@greater_equal
-@@where
-@@is_finite
-@@is_inf
-@@is_nan
-@@verify_tensor_all_finite
-@@check_numerics
-@@add_check_numerics_ops
-@@Assert
-@@Print
 """
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -152,7 +122,7 @@ def Assert(condition, data, summarize=None, name=None):
     @compatibility{eager} `tf.errors.InvalidArgumentError` if `condition`
     is not true
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     if not condition:
       xs = ops.convert_n_to_tensor(data)
       data_str = [_summarize_eager(x, summarize) for x in xs]
@@ -178,6 +148,8 @@ def Assert(condition, data, summarize=None, name=None):
             condition, data, summarize, name="Assert")
 
       guarded_assert = cond(condition, no_op, true_assert, name="AssertGuard")
+      if context.executing_eagerly():
+        return
       return guarded_assert.op
 
 
@@ -194,7 +166,7 @@ def _Identity(data, name=None):
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
-      return gen_array_ops._ref_identity(data, name=name)
+      return gen_array_ops.ref_identity(data, name=name)
     else:
       return array_ops.identity(data, name=name)
   else:
@@ -262,10 +234,10 @@ def _Enter(data,
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
-      result = gen_control_flow_ops._ref_enter(
+      result = gen_control_flow_ops.ref_enter(
           data, frame_name, is_constant, parallel_iterations, name=name)
     else:
-      result = gen_control_flow_ops._enter(
+      result = gen_control_flow_ops.enter(
           data, frame_name, is_constant, parallel_iterations, name=name)
     if use_input_shape:
       result.set_shape(data.get_shape())
@@ -280,7 +252,7 @@ def _Enter(data,
         parallel_iterations=parallel_iterations,
         use_input_shape=use_input_shape,
         name=name)
-    indices = gen_control_flow_ops._enter(
+    indices = gen_control_flow_ops.enter(
         data.indices,
         frame_name,
         is_constant,
@@ -291,7 +263,7 @@ def _Enter(data,
     if isinstance(data, ops.IndexedSlices):
       dense_shape = data.dense_shape
       if dense_shape is not None:
-        dense_shape = gen_control_flow_ops._enter(
+        dense_shape = gen_control_flow_ops.enter(
             dense_shape,
             frame_name,
             is_constant,
@@ -301,7 +273,7 @@ def _Enter(data,
           dense_shape.set_shape(data.dense_shape.get_shape())
       return ops.IndexedSlices(values, indices, dense_shape)
     else:
-      dense_shape = gen_control_flow_ops._enter(
+      dense_shape = gen_control_flow_ops.enter(
           data.dense_shape,
           frame_name,
           is_constant,
@@ -327,7 +299,7 @@ def exit(data, name=None):  # pylint: disable=redefined-builtin
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
-      return gen_control_flow_ops._ref_exit(data, name)
+      return gen_control_flow_ops.ref_exit(data, name)
     else:
       return gen_control_flow_ops._exit(data, name)
   else:
@@ -369,17 +341,17 @@ def switch(data, pred, dtype=None, name=None):
         data, dtype=dtype, name="data", as_ref=True)
     pred = ops.convert_to_tensor(pred, name="pred")
     if isinstance(data, ops.Tensor):
-      return gen_control_flow_ops._switch(data, pred, name=name)
+      return gen_control_flow_ops.switch(data, pred, name=name)
     else:
       if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
         raise TypeError("Type %s not supported" % type(data))
       val, ind = data.values, data.indices
-      val_f, val_t = gen_control_flow_ops._switch(val, pred, name=name)
-      ind_f, ind_t = gen_control_flow_ops._switch(ind, pred, name="indices")
+      val_f, val_t = gen_control_flow_ops.switch(val, pred, name=name)
+      ind_f, ind_t = gen_control_flow_ops.switch(ind, pred, name="indices")
       if isinstance(data, ops.IndexedSlices):
         dense_shape = data.dense_shape
         if dense_shape is not None:
-          dense_shape_f, dense_shape_t = gen_control_flow_ops._switch(
+          dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
               dense_shape, pred, name="dense_shape")
         else:
           dense_shape_f, dense_shape_t = None, None
@@ -387,7 +359,7 @@ def switch(data, pred, dtype=None, name=None):
                 ops.IndexedSlices(val_t, ind_t, dense_shape_t))
       else:
         dense_shape = data.dense_shape
-        dense_shape_f, dense_shape_t = gen_control_flow_ops._switch(
+        dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
             data.dense_shape, pred, name="dense_shape")
         return (sparse_tensor.SparseTensor(ind_f, val_f, dense_shape_f),
                 sparse_tensor.SparseTensor(ind_t, val_t, dense_shape_t))
@@ -471,15 +443,15 @@ def merge(inputs, name=None):
     ]
     if all([isinstance(v, ops.Tensor) for v in inputs]):
       if all([v.dtype._is_ref_dtype for v in inputs]):  # pylint: disable=protected-access
-        return gen_control_flow_ops._ref_merge(inputs, name)
+        return gen_control_flow_ops.ref_merge(inputs, name)
       else:
-        return gen_control_flow_ops._merge(inputs, name)
+        return gen_control_flow_ops.merge(inputs, name)
     elif all([isinstance(v, sparse_tensor.SparseTensor) for v in inputs]):
       # Only handle the case when all inputs are SparseTensor.
       values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops._merge(
+      indices, chosen_index = gen_control_flow_ops.merge(
           [inp.indices for inp in inputs], name="indices")
-      dense_shape, _ = gen_control_flow_ops._merge(
+      dense_shape, _ = gen_control_flow_ops.merge(
           [inp.dense_shape for inp in inputs], name="dense_shape")
       return (sparse_tensor.SparseTensor(indices, values, dense_shape),
               chosen_index)
@@ -487,13 +459,13 @@ def merge(inputs, name=None):
       # For now convert all the inputs as IndexedSlices.
       inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
       values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops._merge(
+      indices, chosen_index = gen_control_flow_ops.merge(
           [inp.indices for inp in inputs], name="indices")
       if any(inp.dense_shape is not None for inp in inputs):
         if any(inp.dense_shape is None for inp in inputs):
           raise ValueError("Either all merged IndexedSlices must have a "
                            "dense_shape, or none must have a dense_shape.")
-        dense_shape, _ = gen_control_flow_ops._merge(
+        dense_shape, _ = gen_control_flow_ops.merge(
             [inp.dense_shape for inp in inputs], name="dense_shape")
       else:
         dense_shape = None
@@ -607,27 +579,29 @@ def _EnforceShapeInvariant(merge_var, next_var):
   """Check if the shapes of the loops variables are invariants.
 
   Args:
-    merge_vars: The list of tensors representing the initial values of the
+    merge_var: The list of tensors representing the initial values of the
       loop variables.
-    next_vars: The list of tensors representing the values of the loop
+    next_var: The list of tensors representing the values of the loop
       variables after one loop iteration.
 
   Raises:
-    ValueError: If any tensor in `merge_vars` has a more specific shape than
+    ValueError: If any tensor in `merge_var` has a more specific shape than
       its correspnding tensor in `next_var`.
   """
   if isinstance(merge_var, ops.Tensor):
     m_shape = merge_var.get_shape()
     n_shape = next_var.get_shape()
     if not _ShapeLessThanOrEqual(n_shape, m_shape):
-      # TODO(skyewm): get original loop input that caused the shape error and
-      # report its name instead of the merge node's.
+      enter = merge_var.op.inputs[0].op
+      assert util.IsLoopEnter(enter)
+      input_t = enter.inputs[0]
+      assert input_t.shape == m_shape
       raise ValueError(
-          "The shape for %s is not an invariant for the loop. It enters "
-          "the loop with shape %s, but has shape %s after one iteration. "
-          "Provide shape invariants using either the `shape_invariants` "
-          "argument of tf.while_loop or set_shape() on the loop variables." %
-          (merge_var.name, m_shape, n_shape))
+          "Input tensor '%s' enters the loop with shape %s, but has shape %s "
+          "after one iteration. To allow the shape to vary across iterations, "
+          "use the `shape_invariants` argument of tf.while_loop to specify a "
+          "less-specific shape." %
+          (input_t.name, input_t.shape, n_shape))
   else:
     if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(var))
@@ -831,6 +805,9 @@ class GradLoopState(object):
     if outer_grad_state:
       outer_forward_ctxt = outer_grad_state.forward_context
     else:
+      if not hasattr(forward_ctxt, "outer_context"):
+        raise ValueError("Failed to call gradients on a while loop without"
+                         "properly serializing graph via MetaGraphDef")
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
@@ -1013,10 +990,8 @@ class GradLoopState(object):
         else:
           max_size = GetMaxSizeFromNestedMaximumIterations(
               value, self.forward_context)
-        # pylint: disable=protected-access
-        acc = gen_data_flow_ops._stack_v2(
+        acc = gen_data_flow_ops.stack_v2(
             max_size=max_size, elem_type=value.dtype.base_dtype, name="f_acc")
-        # pylint: enable=protected-access
       if curr_ctxt:
         curr_ctxt.Exit()
 
@@ -1029,10 +1004,8 @@ class GradLoopState(object):
       if value_ctxt == self.forward_context:
         # value is not nested in the forward context.
         self.forward_context.Enter()
-        # pylint: disable=protected-access
-        push = gen_data_flow_ops._stack_push_v2(
+        push = gen_data_flow_ops.stack_push_v2(
             enter_acc, value, swap_memory=swap_enabled)
-        # pylint: enable=protected-access
         self.forward_context.Exit()
         # Protect stack push and order it before forward_index.
         self.forward_index.op._add_control_input(push.op)
@@ -1044,18 +1017,14 @@ class GradLoopState(object):
           # The special case for creating a zero tensor for a dead
           # branch of a switch. See ControlFlowState.ZerosLike().
           value_ctxt.outer_context.Enter()
-          # pylint: disable=protected-access
-          push = gen_data_flow_ops._stack_push_v2(
+          push = gen_data_flow_ops.stack_push_v2(
               enter_acc, value, swap_memory=swap_enabled)
-          # pylint: enable=protected-access
           value_ctxt.outer_context.Exit()
           push.op._set_control_flow_context(value_ctxt)
         else:
           value_ctxt.Enter()
-          # pylint: disable=protected-access
-          push = gen_data_flow_ops._stack_push_v2(
+          push = gen_data_flow_ops.stack_push_v2(
               enter_acc, value, swap_memory=swap_enabled)
-          # pylint: enable=protected-access
           value_ctxt.Exit()
         # Protect stack push and order it before forward_sync.
         self.forward_sync._add_control_input(push.op)
@@ -1102,10 +1071,8 @@ class GradLoopState(object):
           pred = cond_ctxt.pred
         branch = (1 - cond_ctxt.branch) if dead_branch else cond_ctxt.branch
         history_value = _SwitchRefOrTensor(history_value, pred)[branch]
-      # pylint: disable=protected-access
-      pop = gen_data_flow_ops._stack_pop_v2(history_value,
-                                            value.dtype.base_dtype)
-      # pylint: enable=protected-access
+      pop = gen_data_flow_ops.stack_pop_v2(history_value,
+                                           value.dtype.base_dtype)
       pop.set_shape(value.get_shape())
       self.grad_context.Exit()
     parallel_iterations = self.grad_context.parallel_iterations
@@ -1475,7 +1442,10 @@ def ZerosLikeOutsideLoop(op, index):
       branch = op_ctxt.branch
       switch_val = switch(op.inputs[0], pred)[1 - branch]
       zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
-      return array_ops.zeros(zeros_shape, dtype=val.dtype)
+      # Ensure ops created within array_ops.zeros are dominated by switch in
+      # cond context.
+      with ops.control_dependencies([switch_val]):
+        return array_ops.zeros(zeros_shape, dtype=val.dtype)
     else:
       return array_ops.zeros_like(val, optimize=False)
 
@@ -1507,9 +1477,11 @@ class ControlFlowContext(object):
     if values_def:
       self._init_values_from_proto(values_def, import_scope=import_scope)
     else:
-      # Values that have been already seen in this context.
+      # The names of tensors that have been already seen in this context.
       self._values = set()
-      # Values referenced by but external to this context.
+      # The keys are the names of tensors referenced by but external to this
+      # context. Each value is the Tensor that should be used by this context to
+      # access the key value (e.g. a switch output guarding a cond input value).
       self._external_values = {}
 
   def _init_values_from_proto(self, values_def, import_scope=None):
@@ -1598,6 +1570,16 @@ class ControlFlowContext(object):
     last_context = self._context_stack.pop()
     graph._set_control_flow_context(last_context)
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
@@ -1696,9 +1678,12 @@ class CondContext(ControlFlowContext):
       self._pivot = pivot  # The predicate tensor in this branch
       self._branch = branch  # 0 or 1 representing this branch
 
-      # Values considered to have been already seen in this context.
+      # Values considered to have been already seen in this context. They are
+      # not included in this context.
       self._values.add(pred.name)
+      self._external_values[pred.name] = pred
       self._values.add(pivot.name)
+      self._external_values[pivot.name] = pivot
 
   def _init_from_proto(self, context_def, import_scope=None):
     """Creates a new `CondContext` from protocol buffer.
@@ -1716,8 +1701,8 @@ class CondContext(ControlFlowContext):
     self._pivot = g.as_graph_element(
         ops.prepend_name_scope(context_def.pivot_name, import_scope))
     self._branch = context_def.branch
-    super(CondContext, self).__init__(
-        values_def=context_def.values_def, import_scope=import_scope)
+    super(CondContext, self).__init__(values_def=context_def.values_def,
+                                      import_scope=import_scope)
 
   @property
   def pred(self):
@@ -1765,13 +1750,9 @@ class CondContext(ControlFlowContext):
       context_def.branch = self._branch
       context_def.values_def.MergeFrom(super(CondContext, self)._to_values_def(
           export_scope))
-      # TODO(b/72868227): enable this once the corresponding control_flow.proto
-      # changes have been checked in (they aren't checked in and this is
-      # disabled for now to ensure forwards compatibility).
-      if False:  # pylint: disable=using-constant-test
-        for nested in self._nested_contexts:
-          nested_def = context_def.nested_contexts.add()
-          nested.to_control_flow_context_def(nested_def)
+      for nested in self._nested_contexts:
+        nested_def = context_def.nested_contexts.add()
+        nested.to_control_flow_context_def(nested_def)
 
       return context_def
     else:
@@ -1783,14 +1764,10 @@ class CondContext(ControlFlowContext):
     ret = CondContext(context_def=context_def,
                       import_scope=import_scope)
 
-    # TODO(b/72868227): remove "if hasattr(...)" once the corresponding
-    # control_flow.proto changes have been checked in (they aren't checked in
-    # and this is here for now to ensure forwards compatibility).
-    if hasattr(context_def, "nested_contexts"):
-      ret.Enter()
-      for nested_def in context_def.nested_contexts:
-        from_control_flow_context_def(nested_def)
-      ret.Exit()
+    ret.Enter()
+    for nested_def in context_def.nested_contexts:
+      from_control_flow_context_def(nested_def, import_scope=import_scope)
+    ret.Exit()
     return ret
 
   def to_control_flow_context_def(self, context_def, export_scope=None):
@@ -1809,6 +1786,7 @@ class CondContext(ControlFlowContext):
       if self._outer_context:
         result = self._outer_context.AddValue(val)
         self._values.add(result.name)
+        self._external_values[result.name] = result
       with ops.control_dependencies(None):
         result = _SwitchRefOrTensor(result, self._pred)[self._branch]
         if self._outer_context:
@@ -1834,8 +1812,6 @@ class CondContext(ControlFlowContext):
       # pylint: disable=protected-access
       op._add_control_input(self._pivot.op)
       # pylint: enable=protected-access
-      for x in op.outputs:
-        self._values.add(x.name)
     else:
       for index in range(len(op.inputs)):
         x = op.inputs[index]
@@ -1846,13 +1822,20 @@ class CondContext(ControlFlowContext):
           # pylint: enable=protected-access
       # Remove any external control dependency on this op.
       self._RemoveExternalControlEdges(op)
-      for x in op.outputs:
-        self._values.add(x.name)
       # pylint: disable=protected-access
       if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
         op._add_control_input(self._pivot.op)
       # pylint: enable=protected-access
 
+    # Mark op's outputs as seen by this context and any outer contexts.
+    output_names = [x.name for x in op.outputs]
+    ctxt = self
+    while ctxt is not None:
+      # pylint: disable=protected-access
+      ctxt._values.update(output_names)
+      ctxt = ctxt._outer_context
+      # pylint: enable=protected-access
+
     if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
 
@@ -1868,6 +1851,7 @@ class CondContext(ControlFlowContext):
       if self._outer_context:
         real_val = self._outer_context.AddValue(val)
         self._values.add(real_val.name)
+        self._external_values[real_val.name] = real_val
       real_val = _SwitchRefOrTensor(real_val, self._pred)[self._branch]
       self._external_values[val.name] = real_val
     else:
@@ -2029,7 +2013,7 @@ def cond(pred,
     raise TypeError("false_fn must be callable.")
 
   with ops.name_scope(name, "cond", [pred]):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       if pred:
         return _UnpackIfSingleton(true_fn())
       return _UnpackIfSingleton(false_fn())
@@ -2103,10 +2087,7 @@ def cond(pred,
     # Only add non-nested conds to the collection. Any nested control flow will
     # be encapsulated in the root context.
     assert context_t.outer_context == context_f.outer_context
-    # TODO(b/72868227): remove "if True..." once the corresponding
-    # control_flow.proto changes have been checked in (they aren't checked in
-    # and this is disabled for now to ensure forwards compatibility).
-    if True or context_t.outer_context is None:
+    if context_t.outer_context is None:
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
@@ -2122,61 +2103,6 @@ def cond(pred,
 # pylint: enable=redefined-outer-name
 
 
-def smart_cond(pred, true_fn=None, false_fn=None, name=None):
-  """Return either `true_fn()` if predicate `pred` is true else `false_fn()`.
-
-  If `pred` is a bool or has a constant value, we return either `true_fn()`
-  or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both.
-
-  Arguments:
-    pred: A scalar determining whether to return the result of `true_fn` or
-      `false_fn`.
-    true_fn: The callable to be performed if pred is true.
-    false_fn: The callable to be performed if pred is false.
-    name: Optional name prefix when using `tf.cond`.
-
-  Returns:
-    Tensors returned by the call to either `true_fn` or `false_fn`.
-
-  Raises:
-    TypeError: If `true_fn` or `false_fn` is not callable.
-  """
-  if not callable(true_fn):
-    raise TypeError('`true_fn` must be callable.')
-  if not callable(false_fn):
-    raise TypeError('`false_fn` must be callable.')
-
-  pred_value = smart_constant_value(pred)
-  if pred_value is not None:
-    if pred_value:
-      return true_fn()
-    else:
-      return false_fn()
-  else:
-    return cond(pred, true_fn=true_fn, false_fn=false_fn, name=name)
-
-
-def smart_constant_value(pred):
-  """Return the bool value for `pred`, or None if `pred` had a dynamic value.
-
-  Arguments:
-    pred: A scalar, either a Python bool or tensor.
-
-  Returns:
-    True or False if `pred` has a constant boolean value, None otherwise.
-
-  Raises:
-    TypeError: If `pred` is not a Tensor or bool.
-  """
-  if isinstance(pred, bool):
-    pred_value = pred
-  elif isinstance(pred, ops.Tensor):
-    pred_value = tensor_util.constant_value(pred)
-  else:
-    raise TypeError('`pred` must be a Tensor or a Python bool.')
-  return pred_value
-
-
 def _resource_safe_shape(t):
   """Returns the shape of t or the variable it points to."""
   if t.dtype == dtypes.resource:
@@ -2384,13 +2310,9 @@ class WhileContext(ControlFlowContext):
       context_def.values_def.MergeFrom(
           super(WhileContext, self)._to_values_def(
               export_scope=export_scope))
-      # TODO(b/72868227): remove "if True..." once the corresponding
-      # control_flow.proto changes have been checked in (they aren't checked in
-      # and this is disabled for now to ensure forwards compatibility).
-      if False:  # pylint: disable=using-constant-test
-        for nested in self._nested_contexts:
-          nested_def = context_def.nested_contexts.add()
-          nested.to_control_flow_context_def(nested_def)
+      for nested in self._nested_contexts:
+        nested_def = context_def.nested_contexts.add()
+        nested.to_control_flow_context_def(nested_def)
 
       return context_def
     else:
@@ -2412,14 +2334,10 @@ class WhileContext(ControlFlowContext):
     """
     ret = WhileContext(context_def=context_def,
                        import_scope=import_scope)
-    # TODO(b/72868227): remove "if hasattr(...)" once the corresponding
-    # control_flow.proto changes have been checked in (they aren't checked in
-    # and this is disabled for now to ensure forwards compatibility).
-    if hasattr(context_def, "nested_contexts"):
-      ret.Enter()
-      for nested_def in context_def.nested_contexts:
-        from_control_flow_context_def(nested_def, import_scope=import_scope)
-      ret.Exit()
+    ret.Enter()
+    for nested_def in context_def.nested_contexts:
+      from_control_flow_context_def(nested_def, import_scope=import_scope)
+    ret.Exit()
     return ret
 
   def GetWhileContext(self):
@@ -2433,7 +2351,15 @@ class WhileContext(ControlFlowContext):
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
     result = val
-    if val.name not in self._values:
+    new_value = val.name not in self._values
+    # Don't treat ops in this context as new values. Usually all known values
+    # are in self._values, except when we're importing a while loop inside this
+    # WhileContext. Since there's a cycle in this case, `val` may be part of the
+    # imported while loop but not yet processed by this context and added to
+    # self._values in _AddOpInternal. We only want to process external input
+    # tensors to the while loop here.
+    new_value &= val.op._control_flow_context is not self  # pylint: disable=protected-access
+    if new_value:
       self._values.add(val.name)
 
       # If we are in a grad context and val is from its forward context,
@@ -3003,8 +2929,11 @@ class WhileContext(ControlFlowContext):
     loop_vars = ops.convert_n_to_tensor_or_indexed_slices(loop_vars)
     try:
       self.Enter()
-      original_body_result, exit_vars = self._BuildLoop(
-          pred, body, original_loop_vars, loop_vars, shape_invariants)
+      # _BuildLoop calls _update_input in several places. _lock ensures a
+      # Session.run call cannot occur between creating and mutating new ops.
+      with ops.get_default_graph()._lock:  # pylint: disable=protected-access
+        original_body_result, exit_vars = self._BuildLoop(
+            pred, body, original_loop_vars, loop_vars, shape_invariants)
     finally:
       self.Exit()
 
@@ -3016,7 +2945,7 @@ class WhileContext(ControlFlowContext):
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
         flat_sequence=exit_vars_with_tensor_arrays)
-    return (packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars)
+    return packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars
 
   def _FixControlInputsAndContext(self, enters):
     graph = ops.get_default_graph()
@@ -3173,24 +3102,24 @@ def while_loop(cond,
       c, b, loop_vars=[i0, m0],
       shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
   ```
-  
-  Example which demonstrates non-strict semantics: In the following 
-  example, the final value of the counter `i` does not depend on `x`. So 
-  the `while_loop` can increment the counter parallel to updates of `x`. 
+
+  Example which demonstrates non-strict semantics: In the following
+  example, the final value of the counter `i` does not depend on `x`. So
+  the `while_loop` can increment the counter parallel to updates of `x`.
   However, because the loop counter at one loop iteration depends
   on the value at the previous iteration, the loop counter itself cannot
-  be incremented in parallel. Hence if we just want the final value of the 
-  counter (which we print on the line `print(sess.run(i))`), then  
-  `x` will never be incremented, but the counter will be updated on a 
+  be incremented in parallel. Hence if we just want the final value of the
+  counter (which we print on the line `print(sess.run(i))`), then
+  `x` will never be incremented, but the counter will be updated on a
   single thread. Conversely, if we want the value of the output (which we
-  print on the line `print(sess.run(out).shape)`), then the counter may be 
-  incremented on its own thread, while `x` can be incremented in 
-  parallel on a separate thread. In the extreme case, it is conceivable 
-  that the thread incrementing the counter runs until completion before 
-  `x` is incremented even a single time. The only thing that can never 
-  happen is that the thread updating `x` can never get ahead of the 
-  counter thread because the thread incrementing `x` depends on the value 
-  of the counter. 
+  print on the line `print(sess.run(out).shape)`), then the counter may be
+  incremented on its own thread, while `x` can be incremented in
+  parallel on a separate thread. In the extreme case, it is conceivable
+  that the thread incrementing the counter runs until completion before
+  `x` is incremented even a single time. The only thing that can never
+  happen is that the thread updating `x` can never get ahead of the
+  counter thread because the thread incrementing `x` depends on the value
+  of the counter.
   ```python
   import tensorflow as tf
 
@@ -3202,13 +3131,13 @@ def while_loop(cond,
   with tf.Session() as sess:
       print(sess.run(i))  # prints [0] ... [9999]
 
-      # The following line may increment the counter and x in parallel. 
-      # The counter thread may get ahead of the other thread, but not the 
-      # other way around. So you may see things like 
+      # The following line may increment the counter and x in parallel.
+      # The counter thread may get ahead of the other thread, but not the
+      # other way around. So you may see things like
       # [9996] x:[9987]
-      # meaning that the counter thread is on iteration 9996, 
+      # meaning that the counter thread is on iteration 9996,
       # while the other thread is on iteration 9987
-      print(sess.run(out).shape)  
+      print(sess.run(out).shape)
   ```
 
   """
@@ -3244,13 +3173,19 @@ def while_loop(cond,
             math_ops.logical_and(i < maximum_iterations, orig_cond(*lv)))
         body = lambda i, lv: (i + 1, orig_body(*lv))
 
-    if context.in_eager_mode():
+    if context.executing_eagerly():
+      try_to_pack = len(loop_vars) == 1
+      packed = False  # whether the body result was packed into a 1-item tuple
+
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
+        if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
+          packed = True
+          loop_vars = (loop_vars,)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
-        return loop_vars
+        return loop_vars[0] if packed else loop_vars
 
     if shape_invariants is not None:
       if maximum_iterations is not None:
@@ -3264,10 +3199,7 @@ def while_loop(cond,
         swap_memory=swap_memory)
     # Only add non-nested loops to the collection. Any nested control flow will
     # be encapsulated in the root context.
-    # TODO(b/72868227): enable condition once the corresponding
-    # control_flow.proto changes have been checked in (they aren't checked in
-    # and this is disabled for now to ensure forwards compatibility).
-    if True or loop_context.outer_context is None:
+    if loop_context.outer_context is None:
       ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
     result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
     if maximum_iterations is not None:
@@ -3341,7 +3273,7 @@ def with_dependencies(dependencies, output_tensor, name=None):
   Raises:
     TypeError: if `output_tensor` is not a `Tensor` or `IndexedSlices`.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return output_tensor
   with ops.name_scope(name, "control_dependency",
                       list(dependencies) + [output_tensor]) as name:
@@ -3386,7 +3318,7 @@ def group(*inputs, **kwargs):
   Raises:
     ValueError: If an unknown keyword argument is provided.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return None
   name = kwargs.pop("name", None)
   if kwargs:
@@ -3466,10 +3398,15 @@ def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined
       objects.
 
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return tensors
   with ops.name_scope(name, "tuple", tensors) as name:
-    gating_ops = [t.op for t in tensors if t is not None]
+    tensors = [t if (isinstance(t, ops.Operation)
+                     or tensor_util.is_tensor(t)
+                     or t is None)
+               else ops.convert_to_tensor(t) for t in tensors]
+    gating_ops = [t if isinstance(t, ops.Operation) else t.op for t in tensors
+                  if t is not None]
     if control_inputs:
       for c in control_inputs:
         if isinstance(c, ops.Tensor):
@@ -3485,8 +3422,11 @@ def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined
     gate = group(*gating_ops)
     tpl = []
     for t in tensors:
-      if t is not None:
+      if tensor_util.is_tensor(t):
         tpl.append(with_dependencies([gate], t))
+      elif isinstance(t, ops.Operation):
+        with ops.control_dependencies([gate]):
+          tpl.append(group(t))
       else:
         tpl.append(None)
     return tpl
@@ -3546,15 +3486,17 @@ def _case_create_default_action(predicates, actions):
   return default_action, other_predicates, other_actions
 
 
-def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name):
+def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name,
+                                       allow_python_preds):
   """Verifies input arguments for the case function.
 
   Args:
-    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor and a
-                   callable which returns a list of tensors.
+    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor,
+                   and a callable which returns a list of tensors.
     exclusive: True iff at most one predicate is allowed to evaluate to `True`.
     name: A name for the case operation.
-
+    allow_python_preds: if true, pred_fn_pairs may contain Python bools in
+                        addition to boolean Tensors
   Raises:
     TypeError: If `pred_fn_pairs` is not a list/dictionary.
     TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
@@ -3579,14 +3521,69 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name):
     if not isinstance(pred_fn_pair, _basetuple) or len(pred_fn_pair) != 2:
       raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
     pred, fn = pred_fn_pair
-    if pred.dtype != dtypes.bool:
-      raise TypeError("pred must be of type bool: %s", pred.name)
+
+    if isinstance(pred, ops.Tensor):
+      if pred.dtype != dtypes.bool:
+        raise TypeError("pred must be Tensor of type bool: %s" % pred.name)
+    elif not allow_python_preds:
+      raise TypeError("pred must be a Tensor, got: %s" % pred)
+    elif not isinstance(pred, bool):
+      raise TypeError("pred must be a Tensor or bool, got: %s" % pred)
+
     if not callable(fn):
       raise TypeError("fn for pred %s must be callable." % pred.name)
+
   predicates, actions = zip(*pred_fn_pairs)
   return predicates, actions
 
 
+def _case_helper(cond_fn, pred_fn_pairs, default,
+                 exclusive, name, allow_python_preds=False, **cond_kwargs):
+  """Implementation of case that allows for different cond functions.
+
+  Args:
+    cond_fn: method that has signature and semantics of `cond` above.
+    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor, and a
+                   callable which returns a list of tensors.
+    default: Optional callable that returns a list of tensors.
+    exclusive: True iff at most one predicate is allowed to evaluate to `True`.
+    name: A name for this operation (optional).
+    allow_python_preds: if true, pred_fn_pairs may contain Python bools in
+                        addition to boolean Tensors
+    **cond_kwargs: keyword arguments that will be passed to `cond_fn`.
+
+  Returns:
+    The tensors returned by the first pair whose predicate evaluated to True, or
+    those returned by `default` if none does.
+
+  Raises:
+    TypeError: If `pred_fn_pairs` is not a list/dictionary.
+    TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
+    TypeError: If `fns[i]` is not callable for any i, or `default` is not
+               callable.
+  """
+  predicates, actions = _case_verify_and_canonicalize_args(
+      pred_fn_pairs, exclusive, name, allow_python_preds)
+  with ops.name_scope(name, "case", [predicates]):
+    if default is None:
+      default, predicates, actions = _case_create_default_action(
+          predicates, actions)
+    fn = default
+    # To eval conditions in direct order we create nested conditions in reverse:
+    #   cond_fn(c[0], true_fn=.., false_fn=cond_fn(c[1], ...))
+    for predicate, action in reversed(list(zip(predicates, actions))):
+      fn = functools.partial(
+          cond_fn, predicate, true_fn=action, false_fn=fn, **cond_kwargs)
+    if exclusive:
+      with ops.control_dependencies([
+          _assert_at_most_n_true(
+              predicates, n=1, msg="Input error: exclusive=True")
+      ]):
+        return fn()
+    else:
+      return fn()
+
+
 @tf_export("case")
 def case(pred_fn_pairs,
          default=None,
@@ -3677,26 +3674,8 @@ def case(pred_fn_pairs,
     TypeError: If `fns[i]` is not callable for any i, or `default` is not
                callable.
   """
-  predicates, actions = _case_verify_and_canonicalize_args(
-      pred_fn_pairs, exclusive, name)
-  with ops.name_scope(name, "case", [predicates]):
-    if default is None:
-      default, predicates, actions = _case_create_default_action(
-          predicates, actions)
-    fn = default
-    # To eval conditions in direct order we create nested conditions in reverse:
-    #   cond(c[0], true_fn=.., false_fn=cond(c[1], ...))
-    for predicate, action in reversed(list(zip(predicates, actions))):
-      fn = functools.partial(
-          cond, predicate, true_fn=action, false_fn=fn, strict=strict)
-    if exclusive:
-      with ops.control_dependencies([
-          _assert_at_most_n_true(
-              predicates, n=1, msg="Input error: exclusive=True")
-      ]):
-        return fn()
-    else:
-      return fn()
+  return _case_helper(cond, pred_fn_pairs, default, exclusive, name,
+                      allow_python_preds=False, strict=strict)
 
 
 class XLAControlFlowContext(ControlFlowContext):
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 7775133348ffb915995970b2942a3b3cf27b8809..289df6f3016e9df6a42d694ae854b4f22fdf84f9 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -349,44 +349,6 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       self.assertEquals(grad_x_false.eval(), 0.)
 
 
-@test_util.with_c_api
-class SmartCondTest(test_util.TensorFlowTestCase):
-
-  def testSmartCondTrue(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.smart_cond(
-            True, lambda: math_ops.multiply(x, 16),
-            lambda: math_ops.multiply(y, 5))
-        self.assertEqual(z.eval(), 32)
-
-  def testSmartCondFalse(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(4)
-        y = constant_op.constant(3)
-        z = control_flow_ops.smart_cond(
-            False, lambda: math_ops.multiply(x, 16),
-            lambda: math_ops.multiply(y, 3))
-        self.assertEqual(z.eval(), 9)
-
-  def testSmartCondMissingArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.smart_cond(True, false_fn=lambda: x)
-
-  def testSmartCondMissingArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.smart_cond(True, lambda: x)
-
-
 @test_util.with_c_api
 class CondTest(test_util.TensorFlowTestCase):
 
@@ -985,5 +947,28 @@ class CaseTest(test_util.TensorFlowTestCase):
         sess.run(output, feed_dict={x: 4})
 
 
+@test_util.with_c_api
+class WhileLoopTestCase(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testWhileLoopWithSingleVariable(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: math_ops.add(i, 1)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    self.assertEqual(self.evaluate(r), 10)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerWhileLoopWithSingleVariable_bodyReturnsTuple(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: (math_ops.add(i, 1),)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    # Expect a tuple since that is what the body returns.
+    self.assertEqual(self.evaluate(r), (10,))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 83da6739db673644f59fda3044769b18b2138fbc..908e7939027933327bcdeb21d598bc0b5ca5ff0f 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -148,7 +148,7 @@ def ctc_loss(labels, inputs, sequence_length,
   if not time_major:
     inputs = array_ops.transpose(inputs, [1, 0, 2])  # (B,T,N) => (T,B,N)
 
-  loss, _ = gen_ctc_ops._ctc_loss(
+  loss, _ = gen_ctc_ops.ctc_loss(
       inputs,
       labels.indices,
       labels.values,
@@ -218,13 +218,13 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
         The rows store: `[batch, time]`.
       `decoded.values`: Values vector, size `(total_decoded_outputs)`.
         The vector stores the decoded classes.
-      `decoded.shape`: Shape vector, size `(2)`.
+      `decoded.dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
     neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
         sequence found, the negative of the sum of the greatest logit at each
         timeframe.
   """
-  outputs = gen_ctc_ops._ctc_greedy_decoder(
+  outputs = gen_ctc_ops.ctc_greedy_decoder(
       inputs, sequence_length, merge_repeated=merge_repeated)
   (decoded_ix, decoded_val, decoded_shape, log_probabilities) = outputs
   return ([sparse_tensor.SparseTensor(decoded_ix, decoded_val, decoded_shape)],
@@ -265,14 +265,14 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
         The rows store: [batch, time].
       `decoded[j].values`: Values vector, size `(total_decoded_outputs[j])`.
         The vector stores the decoded classes for beam j.
-      `decoded[j].shape`: Shape vector, size `(2)`.
+      `decoded[j].dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length[j]]`.
     log_probability: A `float` matrix `(batch_size x top_paths)` containing
         sequence log-probabilities.
   """
 
   decoded_ixs, decoded_vals, decoded_shapes, log_probabilities = (
-      gen_ctc_ops._ctc_beam_search_decoder(
+      gen_ctc_ops.ctc_beam_search_decoder(
           inputs, sequence_length, beam_width=beam_width, top_paths=top_paths,
           merge_repeated=merge_repeated))
 
diff --git a/tensorflow/python/ops/cudnn_rnn_grad.py b/tensorflow/python/ops/cudnn_rnn_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..c618c470f201af14d26960efb6a68ace0ac29b88
--- /dev/null
+++ b/tensorflow/python/ops/cudnn_rnn_grad.py
@@ -0,0 +1,73 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradients for CuudnnRNN operators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
+
+
+@ops.RegisterGradient("CudnnRNN")
+def _cudnn_rnn_backward(op, *grads):
+  """Gradients for the CudnnRNN op."""
+  if not op.get_attr("is_training"):
+    raise ValueError(
+        "To use CudnnRNN in gradients, is_training must be set to True.")
+  return gen_cudnn_rnn_ops.cudnn_rnn_backprop(
+      input=op.inputs[0],
+      input_h=op.inputs[1],
+      input_c=op.inputs[2],
+      params=op.inputs[3],
+      output=op.outputs[0],
+      output_h=op.outputs[1],
+      output_c=op.outputs[2],
+      output_backprop=grads[0],
+      output_h_backprop=grads[1],
+      output_c_backprop=grads[2],
+      reserve_space=op.outputs[3],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
+      rnn_mode=op.get_attr("rnn_mode"),
+      input_mode=op.get_attr("input_mode"),
+      direction=op.get_attr("direction"))
+
+
+@ops.RegisterGradient("CudnnRNNV2")
+def _cudnn_rnn_backward_v2(op, *grad):
+  if not op.get_attr("is_training"):
+    raise ValueError(
+        "To use CudnnRNNV2 in gradients, is_training must be set to True.")
+  return gen_cudnn_rnn_ops.cudnn_rnn_backprop_v2(
+      input=op.inputs[0],
+      input_h=op.inputs[1],
+      input_c=op.inputs[2],
+      params=op.inputs[3],
+      output=op.outputs[0],
+      output_h=op.outputs[1],
+      output_c=op.outputs[2],
+      output_backprop=grad[0],
+      output_h_backprop=grad[1],
+      output_c_backprop=grad[2],
+      reserve_space=op.outputs[3],
+      host_reserved=op.outputs[4],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
+      rnn_mode=op.get_attr("rnn_mode"),
+      input_mode=op.get_attr("input_mode"),
+      direction=op.get_attr("direction"))
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
new file mode 100644
index 0000000000000000000000000000000000000000..d934f27cb96f4a65e2adf860e0c5e08b7bd0b7d4
--- /dev/null
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -0,0 +1,224 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Decorator to overrides the gradient for a function."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape as tape_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("custom_gradient")
+def custom_gradient(f):
+  """Decorator to define a function with a custom gradient.
+
+  This decorator allows fine grained control over the gradients of a sequence
+  for operations.  This may be useful for multiple reasons, including providing
+  a more efficient or numerically stable gradient for a sequence of operations.
+
+  For example, consider the following function that commonly occurs in the
+  computation of cross entropy and log likelihoods:
+
+  ```python
+  def log1pexp(x):
+    return tf.log(1 + tf.exp(x))
+  ```
+
+  Due to numerical instability, the gradient this function evaluated at x=100 is
+  NaN.  For example:
+
+  ```python
+  x = tf.constant(100.)
+  y = log1pexp(x)
+  dy = tf.gradients(y, x) # Will be NaN when evaluated.
+  ```
+
+  The gradient expression can be analytically simplified to provide numerical
+  stability:
+
+  ```python
+  @tf.custom_gradient
+  def log1pexp(x):
+    e = tf.exp(x)
+    def grad(dy):
+      return dy * (1 - 1 / (1 + e))
+    return tf.log(1 + e), grad
+  ```
+
+  With this definition, the gradient at x=100 will be correctly evaluated as
+  1.0.
+
+  See also @{tf.RegisterGradient} which registers a gradient function for a
+  primitive TensorFlow operation. `tf.custom_gradient` on the other hand allows
+  for fine grained control over the gradient computation of a sequence of
+  operations.
+
+  Note that if the decorated function uses `Variable`s, the enclosing variable
+  scope must be using `ResourceVariable`s.
+
+  Args:
+    f: function `f(x)` that returns a tuple `(y, grad_fn)` where:
+       - `x` is a `Tensor` or sequence of `Tensor` inputs to the function.
+       - `y` is a `Tensor` or sequence of `Tensor` outputs of applying
+         TensorFlow
+         operations in `f` to `x`.
+       - `grad_fn` is a function with the signature `g(*grad_ys)` which returns
+         a list of `Tensor`s - the derivatives of `Tensor`s in `y` with respect
+         to the `Tensor`s in `x.  `grad_ys` is a `Tensor` or sequence of
+         `Tensor`s the same size as `y` holding the initial value gradients for
+         each `Tensor` in `y`. If `f` uses `Variable`s (that are not part of the
+         inputs), i.e. through `get_variable`, then `grad_fn` should have
+         signature `g(*grad_ys, variables=None)`, where `variables` is a list of
+         the `Variable`s, and return a 2-tuple `(grad_xs, grad_vars)`, where
+         `grad_xs` is the same as above, and `grad_vars` is a `list<Tensor>`
+         with the derivatives of `Tensor`s in `y` with respect to the variables.
+
+  Returns:
+    A function `h(x)` which returns the same value as `f(x)[0]` and whose
+    gradient (as calculated by @{tf.gradients}) is determined by `f(x)[1]`.
+  """
+
+  def decorated(*args, **kwargs):
+    """Decorated function with custom gradient."""
+    if context.executing_eagerly():
+      return _eager_mode_decorator(f, *args, **kwargs)
+    else:
+      return _graph_mode_decorator(f, *args, **kwargs)
+
+  return tf_decorator.make_decorator(f, decorated)
+
+
+def _graph_mode_decorator(f, *args, **kwargs):
+  """Implement custom gradient decorator for graph mode."""
+  # TODO(rsepassi): Add support for kwargs
+  if kwargs:
+    raise ValueError(
+        "The custom_gradient decorator currently supports keywords "
+        "arguments only when eager execution is enabled.")
+  name = "CustomGradient-%s" % ops.uid()
+  args = [ops.convert_to_tensor(x) for x in args]
+
+  # Checking global and local variables attempts to ensure that no non-resource
+  # Variables are added to the graph.
+  current_var_scope = variable_scope.get_variable_scope()
+  before_vars = set(current_var_scope.global_variables() +
+                    current_var_scope.local_variables())
+  with backprop.GradientTape() as tape:
+    result, grad_fn = f(*args)
+  after_vars = set(current_var_scope.global_variables() +
+                   current_var_scope.local_variables())
+  new_vars = after_vars - before_vars
+  for v in new_vars:
+    if not isinstance(v, resource_variable_ops.ResourceVariable):
+      raise TypeError(
+          "All variables used by a function wrapped with @custom_gradient must "
+          "be `ResourceVariable`s. Ensure that no `variable_scope` is created "
+          "with `use_resource=False`.")
+  # The variables that grad_fn needs to return gradients for are the set of
+  # variables used that are *not* part of the inputs.
+  variables = list(set(tape.watched_variables()) - set(args))
+  grad_argspec = tf_inspect.getargspec(grad_fn)
+  variables_in_signature = ("variables" in grad_argspec.args or
+                            grad_argspec.keywords)
+  if variables and not variables_in_signature:
+    raise TypeError("If using @custom_gradient with a function that "
+                    "uses variables, then grad_fn must accept a keyword "
+                    "argument 'variables'.")
+  if variables_in_signature and not variables:
+    # User seems to intend to use variables but none were captured.
+    if not variable_scope.get_variable_scope().use_resource:
+      raise TypeError("If using @custom_gradient with a function that "
+                      "uses variables, the enclosing variable scope must "
+                      "have use_resource=True.")
+    else:
+      logging.warn("@custom_gradient grad_fn has 'variables' in signature, but "
+                   "no ResourceVariables were used on the forward pass.")
+  flat_result = nest.flatten(result)
+  all_tensors = flat_result + args + variables
+
+  @ops.RegisterGradient(name)
+  def internal_grad_fn(unused_op, *result_grads):  # pylint: disable=unused-variable
+    """Custom grad fn wrapper."""
+    result_grads = result_grads[:len(flat_result)]
+    if variables:
+      input_grads, variable_grads = grad_fn(*result_grads, variables=variables)
+      if len(variable_grads) != len(variables):
+        raise ValueError("Must return gradient for each variable from "
+                         "@custom_gradient grad_fn.")
+    else:
+      input_grads = grad_fn(*result_grads)
+      variable_grads = []
+
+    # Need to return one value per input to the IdentityN, so pad the
+    # gradients of the inputs of the custom_gradient function with the
+    # gradients of the outputs as well.
+    input_grads = nest.flatten(input_grads)
+    return ([None] * len(flat_result)) + input_grads + variable_grads
+
+  with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
+    all_tensors = array_ops.identity_n(all_tensors)
+  return nest.pack_sequence_as(
+      structure=result, flat_sequence=all_tensors[:len(flat_result)])
+
+
+def _eager_mode_decorator(f, *args, **kwargs):
+  """Implement custom gradient decorator for eager mode."""
+  with backprop.GradientTape() as tape:
+    result, grad_fn = f(*args, **kwargs)
+  all_inputs = list(args) + list(kwargs.values())
+  # The variables that grad_fn needs to return gradients for are the set of
+  # variables used that are *not* part of the inputs.
+  variables = [v for v in set(tape.watched_variables()) if v not in all_inputs]
+  grad_argspec = tf_inspect.getargspec(grad_fn)
+  if (variables and
+      not ("variables" in grad_argspec.args or grad_argspec.keywords)):
+    raise TypeError("If using @custom_gradient with a function that "
+                    "uses variables, then grad_fn must accept a keyword "
+                    "argument 'variables'.")
+  flat_result = nest.flatten(result)
+  # TODO(apassos) consider removing the identity below.
+  flat_result = [gen_array_ops.identity(x) for x in flat_result]
+
+  def actual_grad_fn(*result_grads):
+    """Custom grad fn wrapper."""
+    if variables:
+      input_grads, variable_grads = grad_fn(*result_grads, variables=variables)
+      if len(variable_grads) != len(variables):
+        raise ValueError("Must return gradient for each variable from "
+                         "@custom_gradient grad_fn.")
+    else:
+      input_grads = grad_fn(*result_grads)
+      variable_grads = []
+    return nest.flatten(input_grads) + variable_grads
+
+  input_tensors = [ops.convert_to_tensor(x) for x
+                   in list(args) + list(variables)]
+  tape_lib.record_operation(f.__name__, flat_result, input_tensors,
+                            actual_grad_fn)
+  flat_result = list(flat_result)
+  return nest.pack_sequence_as(result, flat_result)
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 03ed537cfcf27151a0200d7a17f63b1a2bc7ba1a..62c5adc385a2e87d27298c72f8dd2f67303119df 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -159,7 +159,7 @@ class QueueBase(object):
       ValueError: If one of the arguments is invalid.
       RuntimeError: If eager execution is enabled.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError(
           "Queues are not supported when eager execution is enabled. "
           "Instead, please use tf.data to get data into your model.")
@@ -177,10 +177,10 @@ class QueueBase(object):
     else:
       self._names = None
     self._queue_ref = queue_ref
-    if context.in_graph_mode():
-      self._name = self._queue_ref.op.name.split("/")[-1]
-    else:
+    if context.executing_eagerly():
       self._name = context.context().scope_name
+    else:
+      self._name = self._queue_ref.op.name.split("/")[-1]
 
   @staticmethod
   def from_list(index, queues):
@@ -231,9 +231,9 @@ class QueueBase(object):
   @property
   def name(self):
     """The name of the underlying queue."""
-    if context.in_graph_mode():
-      return self._queue_ref.op.name
-    return self._name
+    if context.executing_eagerly():
+      return self._name
+    return self._queue_ref.op.name
 
   @property
   def dtypes(self):
@@ -342,10 +342,10 @@ class QueueBase(object):
         val.get_shape().assert_is_compatible_with(shape)
 
       if self._queue_ref.dtype == _dtypes.resource:
-        return gen_data_flow_ops._queue_enqueue_v2(
+        return gen_data_flow_ops.queue_enqueue_v2(
             self._queue_ref, vals, name=scope)
       else:
-        return gen_data_flow_ops._queue_enqueue(
+        return gen_data_flow_ops.queue_enqueue(
             self._queue_ref, vals, name=scope)
 
   def enqueue_many(self, vals, name=None):
@@ -387,7 +387,7 @@ class QueueBase(object):
             val.get_shape().with_rank_at_least(1)[0])
         val.get_shape()[1:].assert_is_compatible_with(shape)
 
-      return gen_data_flow_ops._queue_enqueue_many_v2(
+      return gen_data_flow_ops.queue_enqueue_many_v2(
           self._queue_ref, vals, name=scope)
 
   def _dequeue_return_value(self, tensors):
@@ -436,15 +436,15 @@ class QueueBase(object):
     if name is None:
       name = "%s_Dequeue" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
-      ret = gen_data_flow_ops._queue_dequeue_v2(
+      ret = gen_data_flow_ops.queue_dequeue_v2(
           self._queue_ref, self._dtypes, name=name)
     else:
-      ret = gen_data_flow_ops._queue_dequeue(
+      ret = gen_data_flow_ops.queue_dequeue(
           self._queue_ref, self._dtypes, name=name)
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the `QueueBase` object.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       op = ret[0].op
       for output, shape in zip(op.values(), self._shapes):
         output.set_shape(shape)
@@ -479,12 +479,12 @@ class QueueBase(object):
     if name is None:
       name = "%s_DequeueMany" % self._name
 
-    ret = gen_data_flow_ops._queue_dequeue_many_v2(
+    ret = gen_data_flow_ops.queue_dequeue_many_v2(
         self._queue_ref, n=n, component_types=self._dtypes, name=name)
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the Queue object.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       op = ret[0].op
       batch_dim = tensor_shape.Dimension(
           tensor_util.constant_value(op.inputs[1]))
@@ -523,12 +523,12 @@ class QueueBase(object):
     if name is None:
       name = "%s_DequeueUpTo" % self._name
 
-    ret = gen_data_flow_ops._queue_dequeue_up_to_v2(
+    ret = gen_data_flow_ops.queue_dequeue_up_to_v2(
         self._queue_ref, n=n, component_types=self._dtypes, name=name)
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the Queue object.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       op = ret[0].op
       for output, shape in zip(op.values(), self._shapes):
         output.set_shape(tensor_shape.TensorShape([None]).concatenate(shape))
@@ -560,18 +560,18 @@ class QueueBase(object):
     if name is None:
       name = "%s_Close" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
-      return gen_data_flow_ops._queue_close_v2(
+      return gen_data_flow_ops.queue_close_v2(
           self._queue_ref,
           cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
     else:
-      return gen_data_flow_ops._queue_close(
+      return gen_data_flow_ops.queue_close(
           self._queue_ref,
           cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
 
   def is_closed(self, name=None):
-    """ Returns true if queue is closed.
+    """Returns true if queue is closed.
 
     This operation returns true if the queue is closed and false if the queue
     is open.
@@ -601,9 +601,9 @@ class QueueBase(object):
     if name is None:
       name = "%s_Size" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
-      return gen_data_flow_ops._queue_size_v2(self._queue_ref, name=name)
+      return gen_data_flow_ops.queue_size_v2(self._queue_ref, name=name)
     else:
-      return gen_data_flow_ops._queue_size(self._queue_ref, name=name)
+      return gen_data_flow_ops.queue_size(self._queue_ref, name=name)
 
 
 @tf_export("RandomShuffleQueue")
@@ -683,7 +683,7 @@ class RandomShuffleQueue(QueueBase):
       # the id of the last op created.)
       string = (str(seed1) + shared_name).encode("utf-8")
       seed2 = int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
-    queue_ref = gen_data_flow_ops._random_shuffle_queue_v2(
+    queue_ref = gen_data_flow_ops.random_shuffle_queue_v2(
         component_types=dtypes,
         shapes=shapes,
         capacity=capacity,
@@ -748,7 +748,7 @@ class FIFOQueue(QueueBase):
     dtypes = _as_type_list(dtypes)
     shapes = _as_shape_list(shapes, dtypes)
     names = _as_name_list(names, dtypes)
-    queue_ref = gen_data_flow_ops._fifo_queue_v2(
+    queue_ref = gen_data_flow_ops.fifo_queue_v2(
         component_types=dtypes,
         shapes=shapes,
         capacity=capacity,
@@ -827,7 +827,7 @@ class PaddingFIFOQueue(QueueBase):
                        "but received %d dtypes and %d shapes." % (len(dtypes),
                                                                   len(shapes)))
 
-    queue_ref = gen_data_flow_ops._padding_fifo_queue_v2(
+    queue_ref = gen_data_flow_ops.padding_fifo_queue_v2(
         component_types=dtypes,
         shapes=shapes,
         capacity=capacity,
@@ -895,7 +895,7 @@ class PriorityQueue(QueueBase):
     types = _as_type_list(types)
     shapes = _as_shape_list(shapes, types)
 
-    queue_ref = gen_data_flow_ops._priority_queue_v2(
+    queue_ref = gen_data_flow_ops.priority_queue_v2(
         component_types=types,
         shapes=shapes,
         capacity=capacity,
@@ -985,15 +985,15 @@ class Barrier(object):
     else:
       self._shapes = [tensor_shape.unknown_shape() for _ in self._types]
 
-    self._barrier_ref = gen_data_flow_ops._barrier(
+    self._barrier_ref = gen_data_flow_ops.barrier(
         component_types=self._types,
         shapes=self._shapes,
         shared_name=shared_name,
         name=name)
-    if context.in_graph_mode():
-      self._name = self._barrier_ref.op.name.split("/")[-1]
-    else:
+    if context.executing_eagerly():
       self._name = context.context().scope_name
+    else:
+      self._name = self._barrier_ref.op.name.split("/")[-1]
 
   @property
   def barrier_ref(self):
@@ -1003,9 +1003,9 @@ class Barrier(object):
   @property
   def name(self):
     """The name of the underlying barrier."""
-    if context.in_graph_mode():
-      return self._barrier_ref.op.name
-    return self._name
+    if context.executing_eagerly():
+      return self._name
+    return self._barrier_ref.op.name
 
   def insert_many(self, component_index, keys, values, name=None):
     """For each key, assigns the respective value to the specified component.
@@ -1026,7 +1026,7 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierInsertMany" % self._name
-    return gen_data_flow_ops._barrier_insert_many(
+    return gen_data_flow_ops.barrier_insert_many(
         self._barrier_ref, keys, values, component_index, name=name)
 
   def take_many(self,
@@ -1073,7 +1073,7 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierTakeMany" % self._name
-    ret = gen_data_flow_ops._barrier_take_many(
+    ret = gen_data_flow_ops.barrier_take_many(
         self._barrier_ref,
         num_elements,
         self._types,
@@ -1083,7 +1083,7 @@ class Barrier(object):
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the Barrier object.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       op = ret[0].op
       if allow_small_batch:
         batch_dim = None
@@ -1122,7 +1122,7 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierClose" % self._name
-    return gen_data_flow_ops._barrier_close(
+    return gen_data_flow_ops.barrier_close(
         self._barrier_ref,
         cancel_pending_enqueues=cancel_pending_enqueues,
         name=name)
@@ -1139,7 +1139,7 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierReadySize" % self._name
-    return gen_data_flow_ops._barrier_ready_size(self._barrier_ref, name=name)
+    return gen_data_flow_ops.barrier_ready_size(self._barrier_ref, name=name)
 
   def incomplete_size(self, name=None):
     """Compute the number of incomplete elements in the given barrier.
@@ -1153,7 +1153,7 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierIncompleteSize" % self._name
-    return gen_data_flow_ops._barrier_incomplete_size(
+    return gen_data_flow_ops.barrier_incomplete_size(
         self._barrier_ref, name=name)
 
 
@@ -1183,10 +1183,10 @@ class ConditionalAccumulatorBase(object):
     else:
       self._shape = tensor_shape.unknown_shape()
     self._accumulator_ref = accumulator_ref
-    if context.in_graph_mode():
-      self._name = self._accumulator_ref.op.name.split("/")[-1]
-    else:
+    if context.executing_eagerly():
       self._name = context.context().scope_name
+    else:
+      self._name = self._accumulator_ref.op.name.split("/")[-1]
 
   @property
   def accumulator_ref(self):
@@ -1563,7 +1563,7 @@ class BaseStagingArea(object):
     of the staging area.
 
     Args:
-      vals: A tensor, a list or tuple of tensors, or a dictionary..
+      vals: A tensor, a list or tuple of tensors, or a dictionary.
 
     Returns:
       A (tensors, indices) tuple where `tensors` is a list of `Tensor` objects
@@ -1582,7 +1582,7 @@ class BaseStagingArea(object):
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals, indices, n = zip(*[(vals[k], i, k)
+      vals, indices, _ = zip(*[(vals[k], i, k)
                                for i, k in enumerate(self._names)
                                if k in vals])
     else:
@@ -1612,7 +1612,7 @@ class BaseStagingArea(object):
     for val, i in zip(vals, indices):
       dtype, shape = self._dtypes[i], self._shapes[i]
       # Check dtype
-      if not val.dtype == dtype:
+      if val.dtype != dtype:
         raise ValueError("Datatypes do not match. '%s' != '%s'" %
                          (str(val.dtype), str(dtype)))
 
@@ -1626,7 +1626,7 @@ class BaseStagingArea(object):
 
   def _create_device_transfers(self, tensors):
     """Encode inter-device transfers if the current device
-    is not the same as the Staging Area's device
+    is not the same as the Staging Area's device.
     """
 
     if not isinstance(tensors, (tuple, list)):
@@ -1739,11 +1739,6 @@ class StagingArea(BaseStagingArea):
     Args:
       dtypes:  A list of types.  The length of dtypes must equal the number
         of tensors in each element.
-      capacity: (Optional.) Maximum number of elements.
-        An integer. If zero, the Staging Area is unbounded
-      memory_limit: (Optional.) Maximum number of bytes of all tensors
-        in the Staging Area.
-        An integer. If zero, the Staging Area is unbounded
       shapes: (Optional.) Constraints on the shapes of tensors in an element.
         A list of shape tuples or None. This list is the same length
         as dtypes.  If the shape of any tensors in the element are constrained,
@@ -1754,6 +1749,11 @@ class StagingArea(BaseStagingArea):
       shared_name: (Optional.) A name to be used for the shared object. By
         passing the same name to two different python objects they will share
         the underlying staging area. Must be a string.
+      capacity: (Optional.) Maximum number of elements.
+        An integer. If zero, the Staging Area is unbounded
+      memory_limit: (Optional.) Maximum number of bytes of all tensors
+        in the Staging Area.
+        An integer. If zero, the Staging Area is unbounded
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -1769,7 +1769,9 @@ class StagingArea(BaseStagingArea):
     its capacity.
 
     Args:
-      values: Tensor (or a tuple of Tensors) to place into the staging area.
+      values: A single tensor, a list or tuple of tensors, or a dictionary with
+        tensor values. The number of elements must match the length of the
+        list provided to the dtypes argument when creating the StagingArea.
       name: A name for the operation (optional).
 
     Returns:
@@ -1781,10 +1783,11 @@ class StagingArea(BaseStagingArea):
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
 
+      if not isinstance(values, (list, tuple, dict)):
+        values = [values]
+
       # Hard-code indices for this staging area
-      indices = (
-          list(six.moves.range(len(values)))
-          if isinstance(values, (list, tuple)) else None)
+      indices = list(six.moves.range(len(values)))
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
@@ -1908,7 +1911,8 @@ class StagingArea(BaseStagingArea):
 
 
 class MapStagingArea(BaseStagingArea):
-  """A `MapStagingArea` is a TensorFlow data structure that stores tensors across multiple steps, and exposes operations that can put and get tensors.
+  """A `MapStagingArea` is a TensorFlow data structure that stores tensors
+  across multiple steps, and exposes operations that can put and get tensors.
 
   Each `MapStagingArea` element is a (key, value) pair.
   Only int64 keys are supported, other types should be
@@ -2372,7 +2376,7 @@ class RecordInput(object):
       return records
     else:
       with ops.name_scope(self._name):
-        batch_list = [[] for i in six.moves.range(self._batches)]
+        batch_list = [[] for _ in six.moves.range(self._batches)]
         records = array_ops.split(records, self._batch_size, 0)
         records = [array_ops.reshape(record, []) for record in records]
         for index, protobuf in zip(six.moves.range(len(records)), records):
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index 50b956a267320b40cb39fbff8b5965a6420146d7..e7ad028376b841fc485f7c9cded2a3cdd9dcc153 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -8,9 +8,13 @@ licenses(["notice"])  # Apache 2.0
 
 py_library(
     name = "distributions",
-    srcs = glob(["*.py"]),
+    srcs = glob(
+        ["*.py"],
+        exclude = ["util.py"],
+    ),
     srcs_version = "PY2AND3",
     deps = [
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -27,14 +31,22 @@ py_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:tensor_util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
 )
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 68aaf3815e7e2b21c9550562aa49195569c8ea43..2c9f0e9a32dd3f2caa81befebc06dcc740c832cb 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -72,7 +72,7 @@ class Bernoulli(distribution.Distribution):
       ValueError: If p and logits are passed, or if neither are passed.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 469bcadb8ea3a0ec2a85d3a72c0ca5ba08796856..8beab99bf868cdb16dc842eae113cf55dd565131 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -151,7 +151,7 @@ class Beta(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration1, concentration0]):
+    with ops.name_scope(name, values=[concentration1, concentration0]) as name:
       self._concentration1 = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration1, name="concentration1"),
           validate_args)
@@ -323,7 +323,7 @@ class BetaWithSoftplusConcentration(Beta):
                name="BetaWithSoftplusConcentration"):
     parameters = locals()
     with ops.name_scope(name, values=[concentration1,
-                                      concentration0]) as ns:
+                                      concentration0]) as name:
       super(BetaWithSoftplusConcentration, self).__init__(
           concentration1=nn.softplus(concentration1,
                                      name="softplus_concentration1"),
@@ -331,7 +331,7 @@ class BetaWithSoftplusConcentration(Beta):
                                      name="softplus_concentration0"),
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
-          name=ns)
+          name=name)
     self._parameters = parameters
 
 
diff --git a/tensorflow/python/ops/distributions/bijector.py b/tensorflow/python/ops/distributions/bijector.py
index 84bd0a20da38d15c5bd23d7e0b906063702b46de..94a77a205a2d76f0c7e047644bc6bff89b13b420 100644
--- a/tensorflow/python/ops/distributions/bijector.py
+++ b/tensorflow/python/ops/distributions/bijector.py
@@ -23,8 +23,3 @@ from __future__ import print_function
 from tensorflow.python.ops.distributions.bijector_impl import Bijector
 
 # pylint: enable=wildcard-import,unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Bijector"]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index ed435557fde7a2e8a0a4f7eef4e240daef0565e7..4ebc600d034603a80d4fae93b1339e1a1feea038 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -23,7 +23,6 @@ import collections
 import contextlib
 import re
 
-import numpy as np
 import six
 
 from tensorflow.python.framework import dtypes
@@ -31,8 +30,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -41,23 +40,24 @@ __all__ = [
 
 
 class _Mapping(collections.namedtuple(
-    "_Mapping", ["x", "y", "ildj", "kwargs"])):
+    "_Mapping", ["x", "y", "ildj_map", "kwargs"])):
   """Helper class to make it easier to manage caching in `Bijector`."""
 
-  def __new__(cls, x=None, y=None, ildj=None, kwargs=None):
+  def __new__(cls, x=None, y=None, ildj_map=None, kwargs=None):
     """Custom __new__ so namedtuple items have defaults.
 
     Args:
       x: `Tensor`. Forward.
       y: `Tensor`. Inverse.
-      ildj: `Tensor`. Inverse log det Jacobian.
+      ildj_map: `Dictionary`. This is a mapping from event_ndims to a `Tensor`
+        representing the inverse log det jacobian.
       kwargs: Python dictionary. Extra args supplied to
         forward/inverse/etc functions.
 
     Returns:
       mapping: New instance of _Mapping.
     """
-    return super(_Mapping, cls).__new__(cls, x, y, ildj, kwargs)
+    return super(_Mapping, cls).__new__(cls, x, y, ildj_map, kwargs)
 
   @property
   def x_key(self):
@@ -69,13 +69,14 @@ class _Mapping(collections.namedtuple(
     """Returns key used for caching X=g^{-1}(Y)."""
     return (self.y,) + self._deep_tuple(tuple(sorted(self.kwargs.items())))
 
-  def merge(self, x=None, y=None, ildj=None, kwargs=None, mapping=None):
+  def merge(self, x=None, y=None, ildj_map=None, kwargs=None, mapping=None):
     """Returns new _Mapping with args merged with self.
 
     Args:
       x: `Tensor`. Forward.
       y: `Tensor`. Inverse.
-      ildj: `Tensor`. Inverse log det Jacobian.
+      ildj_map: `Dictionary`. This is a mapping from event_ndims to a `Tensor`
+        representing the inverse log det jacobian.
       kwargs: Python dictionary. Extra args supplied to
         forward/inverse/etc functions.
       mapping: Instance of _Mapping to merge. Can only be specified if no other
@@ -88,15 +89,30 @@ class _Mapping(collections.namedtuple(
       ValueError: if mapping and any other arg is not `None`.
     """
     if mapping is None:
-      mapping = _Mapping(x=x, y=y, ildj=ildj, kwargs=kwargs)
-    elif not all(arg is None for arg in [x, y, ildj, kwargs]):
-      raise ValueError("Cannot specify mapping and individual args.")
+      mapping = _Mapping(x=x, y=y, ildj_map=ildj_map, kwargs=kwargs)
+    elif any(arg is not None for arg in [x, y, ildj_map, kwargs]):
+      raise ValueError("Cannot simultaneously specify mapping and individual "
+                       "arguments.")
+
     return _Mapping(
         x=self._merge(self.x, mapping.x),
         y=self._merge(self.y, mapping.y),
-        ildj=self._merge(self.ildj, mapping.ildj),
+        ildj_map=self._merge_dicts(self.ildj_map, mapping.ildj_map),
         kwargs=self._merge(self.kwargs, mapping.kwargs))
 
+  def _merge_dicts(self, old=None, new=None):
+    """Helper to merge two dictionaries."""
+    old = dict() if old is None else old
+    new = dict() if new is None else new
+    for k, v in six.iteritems(new):
+      val = old.get(k, None)
+      if val is not None and val != v:
+        raise ValueError("Found different value for existing key "
+                         "(key:{} old_value:{} new_value:{}".format(
+                             k, old[k], v))
+      old[k] = v
+    return old
+
   def _merge(self, old, new):
     """Helper to merge which handles merging one value."""
     if old is None:
@@ -112,7 +128,6 @@ class _Mapping(collections.namedtuple(
 
 
 @six.add_metaclass(abc.ABCMeta)
-@tf_export("distributions.bijectors.Bijector")
 class Bijector(object):
   r"""Interface for transformations of a `Distribution` sample.
 
@@ -137,11 +152,11 @@ class Bijector(object):
   2. Inverse\
      Useful for "reversing" a transformation to compute one probability in
      terms of another.
-  3. `(log o det o Jacobian o inverse)(x)`\
+  3. `log_det_jacobian(x)`\
      "The log of the determinant of the matrix of all first-order partial
      derivatives of the inverse function."\
      Useful for inverting a transformation to compute one probability in terms
-     of another. Geometrically, the det(Jacobian) is the volume of the
+     of another. Geometrically, the Jacobian determinant is the volume of the
      transformation and is used to scale the probability.
 
   By convention, transformations of random variables are named in terms of the
@@ -164,7 +179,7 @@ class Bijector(object):
 
   ```python
   def transformed_log_prob(bijector, log_prob, x):
-    return (bijector.inverse_log_det_jacobian(x) +
+    return (bijector.inverse_log_det_jacobian(x, event_ndims=0) +
             log_prob(bijector.inverse(x)))
   ```
 
@@ -199,9 +214,11 @@ class Bijector(object):
     ```python
       class Exp(Bijector):
 
-        def __init__(self, event_ndims=0, validate_args=False, name="exp"):
+        def __init__(self, validate_args=False, name="exp"):
           super(Exp, self).__init__(
-              event_ndims=event_ndims, validate_args=validate_args, name=name)
+              validate_args=validate_args,
+              forward_min_event_ndims=0,
+              name=name)
 
         def _forward(self, x):
           return math_ops.exp(x)
@@ -213,10 +230,11 @@ class Bijector(object):
           return -self._forward_log_det_jacobian(self._inverse(y))
 
         def _forward_log_det_jacobian(self, x):
-          if self.event_ndims is None:
-            raise ValueError("Jacobian requires known event_ndims.")
-          event_dims = array_ops.shape(x)[-self.event_ndims:]
-          return math_ops.reduce_sum(x, axis=event_dims)
+          # Notice that we needn't do any reducing, even when`event_ndims > 0`.
+          # The base Bijector class will handle reducing for us; it knows how
+          # to do so because we called `super` `__init__` with
+          # `forward_min_event_ndims = 0`.
+          return x
       ```
 
   - "Affine"
@@ -237,18 +255,50 @@ class Bijector(object):
                   MultivariateNormal(inv(sqrtSigma) * (y - mu); 0, I_d)
       ```
 
-  #### Jacobian
+  #### Min_event_ndims and Naming
+
+  Bijectors are named for the dimensionality of data they act on (i.e. without
+  broadcasting). We can think of bijectors having an intrinsic `min_event_ndims`
+  , which is the minimum number of dimensions for the bijector act on. For
+  instance, a Cholesky decomposition requires a matrix, and hence
+  `min_event_ndims=2`.
+
+  Some examples:
+
+  `AffineScalar:  min_event_ndims=0`
+  `Affine:  min_event_ndims=1`
+  `Cholesky:  min_event_ndims=2`
+  `Exp:  min_event_ndims=0`
+  `Sigmoid:  min_event_ndims=0`
+  `SoftmaxCentered:  min_event_ndims=1`
+
+  Note the difference between `Affine` and `AffineScalar`. `AffineScalar`
+  operates on scalar events, whereas `Affine` operates on vector-valued events.
 
-  The Jacobian is a reduction over event dims. To see this, consider the `Exp`
-  `Bijector` applied to a `Tensor` which has sample, batch, and event (S, B, E)
-  shape semantics. Suppose the `Tensor`'s partitioned-shape is `(S=[4], B=[2],
-  E=[3, 3])`. The shape of the `Tensor` returned by `forward` and `inverse` is
-  unchanged, i.e., `[4, 2, 3, 3]`.  However the shape returned by
-  `inverse_log_det_jacobian` is `[4, 2]` because the Jacobian is a reduction
-  over the event dimensions.
+  More generally, there is a `forward_min_event_ndims` and an
+  `inverse_min_event_ndims`. In most cases, these will be the same.
+  However, for some shape changing bijectors, these will be different
+  (e.g. a bijector which pads an extra dimension at the end, might have
+  `forward_min_event_ndims=0` and `inverse_min_event_ndims=1`.
 
-  It is sometimes useful to implement the inverse Jacobian as the negative
-  forward Jacobian. For example,
+
+  #### Jacobian Determinant
+
+  The Jacobian determinant is a reduction over `event_ndims - min_event_ndims`
+  (`forward_min_event_ndims` for `forward_log_det_jacobian` and
+  `inverse_min_event_ndims` for `inverse_log_det_jacobian`).
+  To see this, consider the `Exp` `Bijector` applied to a `Tensor` which has
+  sample, batch, and event (S, B, E) shape semantics. Suppose the `Tensor`'s
+  partitioned-shape is `(S=[4], B=[2], E=[3, 3])`. The shape of the `Tensor`
+  returned by `forward` and `inverse` is unchanged, i.e., `[4, 2, 3, 3]`.
+  However the shape returned by `inverse_log_det_jacobian` is `[4, 2]` because
+  the Jacobian determinant is a reduction over the event dimensions.
+
+  Another example is the `Affine` `Bijector`. Because `min_event_ndims = 1`, the
+  Jacobian determinant reduction is over `event_ndims - 1`.
+
+  It is sometimes useful to implement the inverse Jacobian determinant as the
+  negative forward Jacobian determinant. For example,
 
   ```python
   def _inverse_log_det_jacobian(self, y):
@@ -279,9 +329,54 @@ class Bijector(object):
       The claim follows from [properties of determinant](
   https://en.wikipedia.org/wiki/Determinant#Multiplicativity_and_matrix_groups).
 
-  Generally its preferable to directly implement the inverse Jacobian. This
-  should have superior numerical stability and will often share subgraphs with
-  the `_inverse` implementation.
+  Generally its preferable to directly implement the inverse Jacobian
+  determinant.  This should have superior numerical stability and will often
+  share subgraphs with the `_inverse` implementation.
+
+  #### Is_constant_jacobian
+
+  Certain bijectors will have constant jacobian matrices. For instance, the
+  `Affine` bijector encodes multiplication by a matrix plus a shift, with
+  jacobian matrix, the same aforementioned matrix.
+
+  `is_constant_jacobian` encodes the fact that the jacobian matrix is constant.
+  The semantics of this argument are the following:
+
+    * Repeated calls to "log_det_jacobian" functions with the same
+      `event_ndims` (but not necessarily same input), will return the first
+      computed jacobian (because the matrix is constant, and hence is input
+      independent).
+    * `log_det_jacobian` implementations are merely broadcastable to the true
+      `log_det_jacobian` (because, again, the jacobian matrix is input
+      independent). Specifically, `log_det_jacobian` is implemented as the
+      log jacobian determinant for a single input.
+
+      ```python
+      class Identity(Bijector):
+
+        def __init__(self, validate_args=False, name="identity"):
+          super(Identity, self).__init__(
+              is_constant_jacobian=True,
+              validate_args=validate_args,
+              forward_min_event_ndims=0,
+              name=name)
+
+        def _forward(self, x):
+          return x
+
+        def _inverse(self, y):
+          return y
+
+        def _inverse_log_det_jacobian(self, y):
+          return -self._forward_log_det_jacobian(self._inverse(y))
+
+        def _forward_log_det_jacobian(self, x):
+          # The full log jacobian determinant would be array_ops.zero_like(x).
+          # However, we circumvent materializing that, since the jacobian
+          # calculation is input independent, and we specify it for one input.
+          return constant_op.constant(0., x.dtype.base_dtype)
+
+      ```
 
   #### Subclass Requirements
 
@@ -364,14 +459,14 @@ class Bijector(object):
   ==> (-1., 1.)
 
   # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
-  abs.inverse_log_det_jacobian(1.)
+  abs.inverse_log_det_jacobian(1., event_ndims=0)
   ==> (0., 0.)
 
   # Special case handling of 0.
   abs.inverse(0.)
   ==> (0., 0.)
 
-  abs.inverse_log_det_jacobian(0.)
+  abs.inverse_log_det_jacobian(0., event_ndims=0)
   ==> (0., 0.)
   ```
 
@@ -379,11 +474,12 @@ class Bijector(object):
 
   @abc.abstractmethod
   def __init__(self,
-               event_ndims=None,
                graph_parents=None,
                is_constant_jacobian=False,
                validate_args=False,
                dtype=None,
+               forward_min_event_ndims=None,
+               inverse_min_event_ndims=None,
                name=None):
     """Constructs Bijector.
 
@@ -392,42 +488,61 @@ class Bijector(object):
     Examples:
 
     ```python
-    # Create the Y = g(X) = X transform which operates on vector events.
-    identity = Identity(event_ndims=1)
+    # Create the Y = g(X) = X transform.
+    identity = Identity()
 
-    # Create the Y = g(X) = exp(X) transform which operates on matrices.
-    exp = Exp(event_ndims=2)
+    # Create the Y = g(X) = exp(X) transform.
+    exp = Exp()
     ```
 
     See `Bijector` subclass docstring for more details and specific examples.
 
     Args:
-      event_ndims: number of dimensions associated with event coordinates.
       graph_parents: Python list of graph prerequisites of this `Bijector`.
-      is_constant_jacobian: Python `bool` indicating that the Jacobian is not a
-        function of the input.
+      is_constant_jacobian: Python `bool` indicating that the Jacobian matrix is
+        not a function of the input.
       validate_args: Python `bool`, default `False`. Whether to validate input
         with asserts. If `validate_args` is `False`, and the inputs are invalid,
         correct behavior is not guaranteed.
       dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not
         enforced.
+      forward_min_event_ndims: Python `integer` indicating the minimum number of
+        dimensions `forward` operates on.
+      inverse_min_event_ndims: Python `integer` indicating the minimum number of
+        dimensions `inverse` operates on. Will be set to
+        `forward_min_event_ndims` by default, if no value is provided.
       name: The name to give Ops created by the initializer.
 
     Raises:
+      ValueError:  If neither `forward_min_event_ndims` and
+        `inverse_min_event_ndims` are specified, or if either of them is
+        negative.
       ValueError:  If a member of `graph_parents` is not a `Tensor`.
     """
-    self._event_ndims = (
-        ops.convert_to_tensor(event_ndims, dtype=dtypes.int32)
-        if event_ndims is not None else None)
     self._graph_parents = graph_parents or []
+
+    if forward_min_event_ndims is None and inverse_min_event_ndims is None:
+      raise ValueError("Must specify at least one of `forward_min_event_ndims` "
+                       "and `inverse_min_event_ndims`.")
+    elif inverse_min_event_ndims is None:
+      inverse_min_event_ndims = forward_min_event_ndims
+    elif forward_min_event_ndims is None:
+      forward_min_event_ndims = inverse_min_event_ndims
+
+    if forward_min_event_ndims < 0:
+      raise ValueError("forward_min_event_ndims must be a non-negative "
+                       "integer.")
+    if inverse_min_event_ndims < 0:
+      raise ValueError("inverse_min_event_ndims must be a non-negative "
+                       "integer.")
+    self._forward_min_event_ndims = forward_min_event_ndims
+    self._inverse_min_event_ndims = inverse_min_event_ndims
     self._is_constant_jacobian = is_constant_jacobian
+    self._constant_ildj_map = {}
     self._validate_args = validate_args
     self._dtype = dtype
     self._from_y = {}
     self._from_x = {}
-    # Using abbreviation ildj for "inverse log det Jacobian."
-    # This variable is not `None` iff is_constant_jacobian is `True`.
-    self._constant_ildj = None
     if name:
       self._name = name
     else:
@@ -442,21 +557,27 @@ class Bijector(object):
       if t is None or not tensor_util.is_tensor(t):
         raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
 
-  @property
-  def event_ndims(self):
-    """Returns then number of event dimensions this bijector operates on."""
-    return self._event_ndims
-
   @property
   def graph_parents(self):
     """Returns this `Bijector`'s graph_parents as a Python list."""
     return self._graph_parents
 
+  @property
+  def forward_min_event_ndims(self):
+    """Returns the minimal number of dimensions bijector.forward operates on."""
+    return self._forward_min_event_ndims
+
+  @property
+  def inverse_min_event_ndims(self):
+    """Returns the minimal number of dimensions bijector.inverse operates on."""
+    return self._inverse_min_event_ndims
+
   @property
   def is_constant_jacobian(self):
-    """Returns true iff the Jacobian is not a function of x.
+    """Returns true iff the Jacobian matrix is not a function of x.
 
-    Note: Jacobian is either constant for both forward and inverse or neither.
+    Note: Jacobian matrix is either constant for both forward and inverse or
+    neither.
 
     Returns:
       is_constant_jacobian: Python `bool`.
@@ -653,36 +774,57 @@ class Bijector(object):
     return self._call_inverse(y, name)
 
   def _inverse_log_det_jacobian(self, y):
-    """Subclass implementation of `inverse_log_det_jacobian` public function."""
+    """Subclass implementation of `inverse_log_det_jacobian` public function.
+
+    In particular, this method differs from the public function, in that it
+    does not take `event_ndims`. Thus, this implements the minimal Jacobian
+    determinant calculation (i.e. over `inverse_min_event_ndims`).
+
+    Args:
+      y: `Tensor`. The input to the "inverse_log_det_jacobian" evaluation.
+    Returns:
+      inverse_log_det_jacobian: `Tensor`, if this bijector is injective.
+        If not injective, returns the k-tuple containing jacobians for the
+        unique `k` points `(x1, ..., xk)` such that `g(xi) = y`.
+    """
     raise NotImplementedError("inverse_log_det_jacobian not implemented.")
 
-  def _call_inverse_log_det_jacobian(self, y, name, **kwargs):
+  def _call_inverse_log_det_jacobian(self, y, event_ndims, name, **kwargs):
     with self._name_scope(name, [y]):
-      if self._constant_ildj is not None:
-        return self._constant_ildj
+      if event_ndims in self._constant_ildj_map:
+        return self._constant_ildj_map[event_ndims]
       y = ops.convert_to_tensor(y, name="y")
       self._maybe_assert_dtype(y)
       if not self._is_injective:  # No caching for non-injective
-        return self._inverse_log_det_jacobian(y, **kwargs)
+        ildjs = self._inverse_log_det_jacobian(y, **kwargs)
+        return tuple(self._reduce_jacobian_det_over_event(
+            y, ildj, self.inverse_min_event_ndims, event_ndims)
+                     for ildj in ildjs)
       mapping = self._lookup(y=y, kwargs=kwargs)
-      if mapping.ildj is not None:
-        return mapping.ildj
+      if mapping.ildj_map is not None and event_ndims in mapping.ildj_map:
+        return mapping.ildj_map[event_ndims]
       try:
         x = None  # Not needed; leave cache as is.
         ildj = self._inverse_log_det_jacobian(y, **kwargs)
+        ildj = self._reduce_jacobian_det_over_event(
+            y, ildj, self.inverse_min_event_ndims, event_ndims)
       except NotImplementedError as original_exception:
         try:
           x = mapping.x if mapping.x is not None else self._inverse(y, **kwargs)
           ildj = -self._forward_log_det_jacobian(x, **kwargs)
+          ildj = self._reduce_jacobian_det_over_event(
+              x, ildj, self.forward_min_event_ndims, event_ndims)
         except NotImplementedError:
           raise original_exception
-      mapping = mapping.merge(x=x, ildj=ildj)
+
+      mapping = mapping.merge(x=x, ildj_map={event_ndims: ildj})
       self._cache(mapping)
       if self.is_constant_jacobian:
-        self._constant_ildj = mapping.ildj
-      return mapping.ildj
+        self._constant_ildj_map[event_ndims] = ildj
+      return ildj
 
-  def inverse_log_det_jacobian(self, y, name="inverse_log_det_jacobian"):
+  def inverse_log_det_jacobian(
+      self, y, event_ndims, name="inverse_log_det_jacobian"):
     """Returns the (log o det o Jacobian o inverse)(y).
 
     Mathematically, returns: `log(det(dX/dY))(Y)`. (Recall that: `X=g^{-1}(Y)`.)
@@ -691,7 +833,12 @@ class Bijector(object):
     evaluated at `g^{-1}(y)`.
 
     Args:
-      y: `Tensor`. The input to the "inverse" Jacobian evaluation.
+      y: `Tensor`. The input to the "inverse" Jacobian determinant evaluation.
+      event_ndims: Number of dimensions in the probabilistic events being
+        transformed. Must be greater than or equal to
+        `self.inverse_min_event_ndims`. The result is summed over the final
+        dimensions to produce a scalar Jacobian determinant for each event,
+        i.e. it has shape `y.shape.ndims - event_ndims` dimensions.
       name: The name to give this op.
 
     Returns:
@@ -705,45 +852,74 @@ class Bijector(object):
         `self.dtype`.
       NotImplementedError: if `_inverse_log_det_jacobian` is not implemented.
     """
-    return self._call_inverse_log_det_jacobian(y, name)
+    with ops.control_dependencies(self._check_valid_event_ndims(
+        min_event_ndims=self.inverse_min_event_ndims, event_ndims=event_ndims)):
+      return self._call_inverse_log_det_jacobian(y, event_ndims, name)
 
   def _forward_log_det_jacobian(self, x):
-    """Subclass implementation of `forward_log_det_jacobian`."""
+    """Subclass implementation of `forward_log_det_jacobian` public function.
+
+    In particular, this method differs from the public function, in that it
+    does not take `event_ndims`. Thus, this implements the minimal Jacobian
+    determinant calculation (i.e. over `forward_min_event_ndims`).
+
+    Args:
+      x: `Tensor`. The input to the "forward_log_det_jacobian" evaluation.
+
+    Returns:
+      forward_log_det_jacobian: `Tensor`, if this bijector is injective.
+        If not injective, returns the k-tuple containing jacobians for the
+        unique `k` points `(x1, ..., xk)` such that `g(xi) = y`.
+    """
+
     raise NotImplementedError(
         "forward_log_det_jacobian not implemented.")
 
-  def _call_forward_log_det_jacobian(self, x, name, **kwargs):
+  def _call_forward_log_det_jacobian(self, x, event_ndims, name, **kwargs):
     with self._name_scope(name, [x]):
-      if self._constant_ildj is not None:
+      if event_ndims in self._constant_ildj_map:
         # Need "-1. *" to avoid invalid-unary-operand-type linter warning.
-        return -1. * self._constant_ildj
+        return -1. * self._constant_ildj_map[event_ndims]
       x = ops.convert_to_tensor(x, name="x")
       self._maybe_assert_dtype(x)
       if not self._is_injective:
-        return self._forward_log_det_jacobian(x, **kwargs)  # No caching.
+        fldjs = self._forward_log_det_jacobian(x, **kwargs)  # No caching.
+        return tuple(self._reduce_jacobian_det_over_event(
+            x, fldj, self.forward_min_event_ndims, event_ndims)
+                     for fldj in fldjs)
       mapping = self._lookup(x=x, kwargs=kwargs)
-      if mapping.ildj is not None:
-        return -mapping.ildj
+      if mapping.ildj_map is not None and event_ndims in mapping.ildj_map:
+        return -mapping.ildj_map[event_ndims]
       try:
         y = None  # Not needed; leave cache as is.
         ildj = -self._forward_log_det_jacobian(x, **kwargs)
+        ildj = self._reduce_jacobian_det_over_event(
+            x, ildj, self.forward_min_event_ndims, event_ndims)
       except NotImplementedError as original_exception:
         try:
           y = mapping.y if mapping.y is not None else self._forward(x, **kwargs)
           ildj = self._inverse_log_det_jacobian(y, **kwargs)
+          ildj = self._reduce_jacobian_det_over_event(
+              y, ildj, self.inverse_min_event_ndims, event_ndims)
         except NotImplementedError:
           raise original_exception
-      mapping = mapping.merge(y=y, ildj=ildj)
+      mapping = mapping.merge(y=y, ildj_map={event_ndims: ildj})
       self._cache(mapping)
       if self.is_constant_jacobian:
-        self._constant_ildj = mapping.ildj
-      return -mapping.ildj
+        self._constant_ildj_map[event_ndims] = ildj
+      return -ildj
 
-  def forward_log_det_jacobian(self, x, name="forward_log_det_jacobian"):
+  def forward_log_det_jacobian(
+      self, x, event_ndims, name="forward_log_det_jacobian"):
     """Returns both the forward_log_det_jacobian.
 
     Args:
-      x: `Tensor`. The input to the "forward" Jacobian evaluation.
+      x: `Tensor`. The input to the "forward" Jacobian determinant evaluation.
+      event_ndims: Number of dimensions in the probabilistic events being
+        transformed. Must be greater than or equal to
+        `self.forward_min_event_ndims`. The result is summed over the final
+        dimensions to produce a scalar Jacobian determinant for each event,
+        i.e. it has shape `x.shape.ndims - event_ndims` dimensions.
       name: The name to give this op.
 
     Returns:
@@ -761,7 +937,9 @@ class Bijector(object):
       raise NotImplementedError(
           "forward_log_det_jacobian cannot be implemented for non-injective "
           "transforms.")
-    return self._call_forward_log_det_jacobian(x, name)
+    with ops.control_dependencies(self._check_valid_event_ndims(
+        min_event_ndims=self.forward_min_event_ndims, event_ndims=event_ndims)):
+      return self._call_forward_log_det_jacobian(x, event_ndims, name)
 
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
@@ -779,9 +957,6 @@ class Bijector(object):
 
   def _cache(self, mapping):
     """Helper which stores mapping info in forward/inverse dicts."""
-    if self._constant_ildj is not None:
-      # Fold in ildj if known constant Jacobian.
-      mapping = mapping.merge(ildj=self._constant_ildj)
     # Merging from lookup is an added check that we're not overwriting anything
     # which is not None.
     mapping = mapping.merge(mapping=self._lookup(
@@ -803,22 +978,66 @@ class Bijector(object):
       return self._from_y.get(mapping.y_key, mapping)
     return mapping
 
-  def _event_dims_tensor(self, sample):
-    """Return a 1D `int32` tensor: `range(rank(sample))[-event_ndims:]`."""
-    if self.event_ndims is None:
-      raise ValueError("Jacobian cannot be computed with unknown event_ndims")
-    static_event_ndims = tensor_util.constant_value(self.event_ndims)
-    static_rank = sample.get_shape().ndims
-    if static_event_ndims is not None and static_rank is not None:
-      return ops.convert_to_tensor(
-          static_rank + np.arange(-static_event_ndims, 0).astype(np.int32))
-
-    if static_event_ndims is not None:
-      event_range = np.arange(-static_event_ndims, 0).astype(np.int32)
-    else:
-      event_range = math_ops.range(-self.event_ndims, 0, dtype=dtypes.int32)
-
-    if static_rank is not None:
-      return event_range + static_rank
+  def _reduce_jacobian_det_over_event(
+      self, y, ildj, min_event_ndims, event_ndims):
+    """Reduce jacobian over event_ndims - min_event_ndims."""
+    if not self.is_constant_jacobian:
+      return math_ops.reduce_sum(
+          ildj,
+          self._get_event_reduce_dims(min_event_ndims, event_ndims))
+
+    # In this case, we need to tile the jacobian over the event and reduce.
+    y_rank = array_ops.rank(y)
+    y_shape = array_ops.shape(y)[
+        y_rank - event_ndims : y_rank - min_event_ndims]
+
+    ones = array_ops.ones(y_shape, ildj.dtype)
+    reduced_ildj = math_ops.reduce_sum(
+        ones * ildj,
+        axis=self._get_event_reduce_dims(min_event_ndims, event_ndims))
+    # The multiplication by ones can change the inferred static shape so we try
+    # to recover as much as possible.
+    if (isinstance(event_ndims, int) and
+        y.get_shape().ndims and ildj.get_shape().ndims):
+      y_shape = y.get_shape()
+      y_shape = y_shape[y_shape.ndims - event_ndims :
+                        y_shape.ndims - min_event_ndims]
+      ildj_shape = ildj.get_shape()
+      broadcast_shape = array_ops.broadcast_static_shape(
+          ildj_shape, y_shape)
+      reduced_ildj.set_shape(
+          broadcast_shape[: broadcast_shape.ndims - (
+              event_ndims - min_event_ndims)])
+
+    return reduced_ildj
+
+  def _get_event_reduce_dims(self, min_event_ndims, event_ndims):
+    """Compute the reduction dimensions given event_ndims."""
+    min_event_ndims_ = (min_event_ndims if isinstance(min_event_ndims, int)
+                        else tensor_util.constant_value(min_event_ndims))
+    event_ndims_ = (event_ndims if isinstance(event_ndims, int)
+                    else tensor_util.constant_value(event_ndims))
+
+    if min_event_ndims_ is not None and event_ndims_ is not None:
+      return [-index for index in range(1, event_ndims_ - min_event_ndims_ + 1)]
     else:
-      return event_range + array_ops.rank(sample)
+      reduce_ndims = event_ndims - min_event_ndims
+      return math_ops.range(-reduce_ndims, 0)
+
+  def _check_valid_event_ndims(self, min_event_ndims, event_ndims):
+    """Check whether event_ndims is atleast min_event_ndims."""
+    min_event_ndims_ = (min_event_ndims if isinstance(min_event_ndims, int)
+                        else tensor_util.constant_value(min_event_ndims))
+    event_ndims_ = (event_ndims if isinstance(event_ndims, int)
+                    else tensor_util.constant_value(event_ndims))
+
+    if min_event_ndims_ is not None and event_ndims_ is not None:
+      if min_event_ndims_ > event_ndims_:
+        raise ValueError("event_ndims ({}) must be larger than "
+                         "min_event_ndims ({})".format(
+                             event_ndims_, min_event_ndims_))
+      return []
+
+    if self.validate_args:
+      return [check_ops.assert_greater_equal(event_ndims, min_event_ndims)]
+    return []
diff --git a/tensorflow/python/ops/distributions/bijector_test_util.py b/tensorflow/python/ops/distributions/bijector_test_util.py
index ff3535c62642d98bdd9b18808f45deae27d6d88d..784bfd58352f4035cd1bd4caa91eba6e6dc8d30d 100644
--- a/tensorflow/python/ops/distributions/bijector_test_util.py
+++ b/tensorflow/python/ops/distributions/bijector_test_util.py
@@ -79,9 +79,7 @@ def assert_scalar_congruency(bijector,
   Raises:
     AssertionError:  If tests fail.
   """
-
   # Checks and defaults.
-  assert bijector.event_ndims.eval() == 0
   if sess is None:
     sess = ops.get_default_session()
 
@@ -111,7 +109,10 @@ def assert_scalar_congruency(bijector,
   # (b - a) = \int_a^b dx = \int_{y(a)}^{y(b)} |dx/dy| dy
   # "change_measure_dy_dx" below is a Monte Carlo approximation to the right
   # hand side, which should then be close to the left, which is (b - a).
-  dy_dx = math_ops.exp(bijector.inverse_log_det_jacobian(uniform_y_samps))
+  # We assume event_ndims=0 because we assume scalar -> scalar. The log_det
+  # methods will handle whether they expect event_ndims > 0.
+  dy_dx = math_ops.exp(bijector.inverse_log_det_jacobian(
+      uniform_y_samps, event_ndims=0))
   # E[|dx/dy|] under Uniform[lower_y, upper_y]
   # = \int_{y(a)}^{y(b)} |dx/dy| dP(u), where dP(u) is the uniform measure
   expectation_of_dy_dx_under_uniform = math_ops.reduce_mean(dy_dx)
@@ -121,7 +122,8 @@ def assert_scalar_congruency(bijector,
 
   # We'll also check that dy_dx = 1 / dx_dy.
   dx_dy = math_ops.exp(
-      bijector.forward_log_det_jacobian(bijector.inverse(uniform_y_samps)))
+      bijector.forward_log_det_jacobian(
+          bijector.inverse(uniform_y_samps), event_ndims=0))
 
   [
       forward_on_10_pts_v,
@@ -158,7 +160,8 @@ def assert_scalar_congruency(bijector,
       dy_dx_v, np.divide(1., dx_dy_v), atol=1e-5, rtol=1e-3)
 
 
-def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
+def assert_bijective_and_finite(
+    bijector, x, y, event_ndims, atol=0, rtol=1e-5, sess=None):
   """Assert that forward/inverse (along with jacobians) are inverses and finite.
 
   It is recommended to use x and y values that are very very close to the edge
@@ -168,6 +171,8 @@ def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
     bijector:  A Bijector instance.
     x:  np.array of values in the domain of bijector.forward.
     y:  np.array of values in the domain of bijector.inverse.
+    event_ndims: Integer describing the number of event dimensions this bijector
+      operates on.
     atol:  Absolute tolerance.
     rtol:  Relative tolerance.
     sess:  TensorFlow session.  Defaults to the default session.
@@ -197,10 +202,10 @@ def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
   ] = sess.run([
       bijector.inverse(f_x),
       bijector.forward(g_y),
-      bijector.inverse_log_det_jacobian(f_x),
-      bijector.forward_log_det_jacobian(x),
-      bijector.inverse_log_det_jacobian(y),
-      bijector.forward_log_det_jacobian(g_y),
+      bijector.inverse_log_det_jacobian(f_x, event_ndims=event_ndims),
+      bijector.forward_log_det_jacobian(x, event_ndims=event_ndims),
+      bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims),
+      bijector.forward_log_det_jacobian(g_y, event_ndims=event_ndims),
       f_x,
       g_y,
   ])
diff --git a/tensorflow/python/ops/distributions/bijectors.py b/tensorflow/python/ops/distributions/bijectors.py
deleted file mode 100644
index 69c3a5d4c0ba86586ccb6e55e71d898b1bf7c035..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/distributions/bijectors.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Core module for TensorFlow distribution bijectors."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions.bijector import Bijector
-from tensorflow.python.ops.distributions.identity_bijector import Identity
-
-# pylint: enable=wildcard-import,unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Bijector", "Identity"]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 9161e3fa9f5f7f844e7f4926992c954acae246d6..8f25b1149c3c8b220fe2bea95c33854ee2f6275a 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -183,7 +183,7 @@ class Categorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
@@ -311,7 +311,7 @@ class Categorical(distribution.Distribution):
         nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
 
   def _mode(self):
-    ret = math_ops.argmax(self.logits, dimension=self._batch_rank)
+    ret = math_ops.argmax(self.logits, axis=self._batch_rank)
     ret = math_ops.cast(ret, self.dtype)
     ret.set_shape(self.batch_shape)
     return ret
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 25afeec936069b9cbf926cdc3bbb79226a79aa30..eafcd5c78f7752dc3f9f5be8f8dbe7ac368fd3be 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -155,7 +155,7 @@ class Dirichlet(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration]):
+    with ops.name_scope(name, values=[concentration]) as name:
       self._concentration = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration, name="concentration"),
           validate_args)
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 03a98c56ba509ea1f70f12a74ba67b903013cf70..fe0ed7e07d59654b8a4c7ac989212913697c6832 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -192,7 +192,7 @@ class DirichletMultinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, concentration]):
+    with ops.name_scope(name, values=[total_count, concentration]) as name:
       # Broadcasting works because:
       # * The broadcasting convention is to prepend dimensions of size [1], and
       #   we use the last dimension for the distribution, whereas
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 4071e50e815b01d30f3e24ba4677cc37b325f24d..3815abf72de1e2b5278706315a10899dccca4182 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -338,6 +338,27 @@ class Distribution(_BaseDistribution):
   cum_prob_invalid = u.cdf([4.0, 5.0, 6.0])
   ```
 
+  #### Shapes
+
+  There are three important concepts associated with TensorFlow Distributions
+  shapes:
+  - Event shape describes the shape of a single draw from the distribution;
+    it may be dependent across dimensions. For scalar distributions, the event
+    shape is `[]`. For a 5-dimensional MultivariateNormal, the event shape is
+    `[5]`.
+  - Batch shape describes independent, not identically distributed draws, aka a
+    "collection" or "bunch" of distributions.
+  - Sample shape describes independent, identically distributed draws of batches
+    from the distribution family.
+
+  The event shape and the batch shape are properties of a Distribution object,
+  whereas the sample shape is associated with a specific call to `sample` or
+  `log_prob`.
+
+  For detailed usage examples of TensorFlow Distributions shapes, see
+  [this tutorial](
+  https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Understanding%20TensorFlow%20Distributions%20Shapes.ipynb)
+
   #### Parameter values leading to undefined statistics or distributions.
 
   Some distributions do not have well-defined statistics for all initialization
@@ -413,13 +434,17 @@ class Distribution(_BaseDistribution):
     for i, t in enumerate(graph_parents):
       if t is None or not tensor_util.is_tensor(t):
         raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
+    if not name or name[-1] != "/":  # `name` is not a name scope
+      non_unique_name = name or type(self).__name__
+      with ops.name_scope(non_unique_name) as name:
+        pass
     self._dtype = dtype
     self._reparameterization_type = reparameterization_type
     self._allow_nan_stats = allow_nan_stats
     self._validate_args = validate_args
     self._parameters = parameters or {}
     self._graph_parents = graph_parents
-    self._name = name or type(self).__name__
+    self._name = name
 
   @classmethod
   def param_shapes(cls, sample_shape, name="DistributionParamShapes"):
@@ -593,7 +618,7 @@ class Distribution(_BaseDistribution):
     Returns:
       batch_shape: `TensorShape`, possibly unknown.
     """
-    return self._batch_shape()
+    return tensor_shape.as_shape(self._batch_shape())
 
   def _event_shape_tensor(self):
     raise NotImplementedError("event_shape_tensor is not implemented")
@@ -626,7 +651,7 @@ class Distribution(_BaseDistribution):
     Returns:
       event_shape: `TensorShape`, possibly unknown.
     """
-    return self._event_shape()
+    return tensor_shape.as_shape(self._event_shape())
 
   def is_scalar_event(self, name="is_scalar_event"):
     """Indicates that `event_shape == []`.
@@ -1105,6 +1130,34 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       return self._kl_divergence(other)
 
+  def __str__(self):
+    return ("tf.distributions.{type_name}("
+            "\"{self_name}\""
+            "{maybe_batch_shape}"
+            "{maybe_event_shape}"
+            ", dtype={dtype})".format(
+                type_name=type(self).__name__,
+                self_name=self.name,
+                maybe_batch_shape=(", batch_shape={}".format(self.batch_shape)
+                                   if self.batch_shape.ndims is not None
+                                   else ""),
+                maybe_event_shape=(", event_shape={}".format(self.event_shape)
+                                   if self.event_shape.ndims is not None
+                                   else ""),
+                dtype=self.dtype.name))
+
+  def __repr__(self):
+    return ("<tf.distributions.{type_name} "
+            "'{self_name}'"
+            " batch_shape={batch_shape}"
+            " event_shape={event_shape}"
+            " dtype={dtype}>".format(
+                type_name=type(self).__name__,
+                self_name=self.name,
+                batch_shape=self.batch_shape,
+                event_shape=self.event_shape,
+                dtype=self.dtype.name))
+
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
diff --git a/tensorflow/python/ops/distributions/distributions.py b/tensorflow/python/ops/distributions/distributions.py
index 9df7d148a583e533475276e090bcb02cb705290f..59ed455e43806dedf34818e63235dee5c4440fd2 100644
--- a/tensorflow/python/ops/distributions/distributions.py
+++ b/tensorflow/python/ops/distributions/distributions.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions import bijectors
 from tensorflow.python.ops.distributions.bernoulli import Bernoulli
 from tensorflow.python.ops.distributions.beta import Beta
 from tensorflow.python.ops.distributions.categorical import Categorical
@@ -36,30 +35,3 @@ from tensorflow.python.ops.distributions.student_t import StudentT
 from tensorflow.python.ops.distributions.uniform import Uniform
 # pylint: enable=wildcard-import,unused-import
 
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "bijectors",
-    "Bernoulli",
-    "Beta",
-    "Categorical",
-    "DirichletMultinomial",
-    "Dirichlet",
-    "Distribution",
-    "ReparameterizationType",
-    "FULLY_REPARAMETERIZED",
-    "NOT_REPARAMETERIZED",
-    "Exponential",
-    "Gamma",
-    "RegisterKL",
-    "kl_divergence",
-    "Laplace",
-    "Multinomial",
-    "Normal",
-    "StudentT",
-    "Uniform",
-]
-
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 6345a76d485c64659aa01fa1611cd27426d8c8a5..cf0e729e1a189d94381d56aedb5a753862a20962 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -95,7 +95,7 @@ class Exponential(gamma.Gamma):
     # true in the parent class "Gamma."  Therefore, passing
     # allow_nan_stats=True
     # through to the parent class results in unnecessary asserts.
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       self._rate = ops.convert_to_tensor(rate, name="rate")
     super(Exponential, self).__init__(
         concentration=array_ops.ones([], dtype=self._rate.dtype),
@@ -144,7 +144,7 @@ class ExponentialWithSoftplusRate(Exponential):
                allow_nan_stats=True,
                name="ExponentialWithSoftplusRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       super(ExponentialWithSoftplusRate, self).__init__(
           rate=nn.softplus(rate, name="softplus_rate"),
           validate_args=validate_args,
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index 8fb218be3ac7e17e18d85b8e1c100ccd58aa1034..d39f7c56d39ae1681c32bacd9771f4f2372d3f98 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -127,7 +127,7 @@ class Gamma(distribution.Distribution):
       TypeError: if `concentration` and `rate` are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
           check_ops.assert_positive(rate),
@@ -193,12 +193,6 @@ class Gamma(distribution.Distribution):
   def _log_prob(self, x):
     return self._log_unnormalized_prob(x) - self._log_normalization()
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
-  def _log_cdf(self, x):
-    return math_ops.log(self._cdf(x))
-
   def _cdf(self, x):
     x = self._maybe_assert_valid_sample(x)
     # Note that igamma returns the regularized incomplete gamma function,
@@ -268,7 +262,7 @@ class GammaWithSoftplusConcentrationRate(Gamma):
                allow_nan_stats=True,
                name="GammaWithSoftplusConcentrationRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       super(GammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
                                     name="softplus_concentration"),
diff --git a/tensorflow/python/ops/distributions/identity_bijector.py b/tensorflow/python/ops/distributions/identity_bijector.py
index 2972c3554b3639a1ae30a4167f73613b1ff8add2..8628e68f967337fb81187bae9576a168e1cd5a36 100644
--- a/tensorflow/python/ops/distributions/identity_bijector.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -28,7 +27,6 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.bijectors.Identity")
 class Identity(bijector.Bijector):
   """Compute Y = g(X) = X.
 
@@ -37,7 +35,7 @@ class Identity(bijector.Bijector):
     ```python
     # Create the Y=g(X)=X transform which is intended for Tensors with 1 batch
     # ndim and 1 event ndim (i.e., vector of vectors).
-    identity = Identity(event_ndims=1)
+    identity = Identity()
     x = [[1., 2],
          [3, 4]]
     x == identity.forward(x) == identity.inverse(x)
@@ -45,10 +43,10 @@ class Identity(bijector.Bijector):
 
   """
 
-  def __init__(self, validate_args=False, event_ndims=0, name="identity"):
+  def __init__(self, validate_args=False, name="identity"):
     super(Identity, self).__init__(
+        forward_min_event_ndims=0,
         is_constant_jacobian=True,
-        event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index e98ac855c58efa1ef3ccef2de24f329d839bac26..3ccfc618d1157722bbc4ce2d5f88fbcd4fb1ba10 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -101,7 +101,7 @@ class Laplace(distribution.Distribution):
       TypeError: if `loc` and `scale` are of different dtype.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -218,7 +218,7 @@ class LaplaceWithSoftplusScale(Laplace):
                allow_nan_stats=True,
                name="LaplaceWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       super(LaplaceWithSoftplusScale, self).__init__(
           loc=loc,
           scale=nn.softplus(scale, name="softplus_scale"),
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 4ae67a009b0a4052f6e23e2e42262bb7c42f1c14..ab77f5c1f815f36edb989c5b315ef90fb198b001 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -183,7 +183,7 @@ class Multinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = ops.convert_to_tensor(total_count, name="total_count")
       if validate_args:
         self._total_count = (
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index e7f120ea2da525e20a1ae42e6418cf2ac83686af..20d4420e91886cf84855bd585cfc4869fc674178 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -132,7 +132,7 @@ class Normal(distribution.Distribution):
       TypeError: if `loc` and `scale` have different `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -188,9 +188,6 @@ class Normal(distribution.Distribution):
   def _log_prob(self, x):
     return self._log_unnormalized_prob(x) - self._log_normalization()
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   def _log_cdf(self, x):
     return special_math.log_ndtr(self._z(x))
 
@@ -247,7 +244,7 @@ class NormalWithSoftplusScale(Normal):
                allow_nan_stats=True,
                name="NormalWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       super(NormalWithSoftplusScale, self).__init__(
           loc=loc,
           scale=nn.softplus(scale, name="softplus_scale"),
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index bed4cbb2c1a43b6952861f4fab82957229e23c9c..1d605c5dfcca9b709a9178ccbe56619f6a92f869 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -213,7 +213,7 @@ def _ndtri(p):
 
   # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
   # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
-  # arrays based on wether p < exp(-32).
+  # arrays based on whether p < exp(-32).
   z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
   first_term = z - math_ops.log(z) / z
   second_term_small_p = (_create_polynomial(1. / z, p2)
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index 778fefb8c2991153b7e7a1f20df61680153dab2a..961b07a7bdac34b76ca1a594fa3df5e97951c76b 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -158,7 +158,7 @@ class StudentT(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[df, loc, scale]):
+    with ops.name_scope(name, values=[df, loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(df)]
                                     if validate_args else []):
         self._df = array_ops.identity(df, name="df")
@@ -248,9 +248,6 @@ class StudentT(distribution.Distribution):
             math_ops.lgamma(0.5 * self.df) -
             math_ops.lgamma(0.5 * (self.df + 1.)))
 
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   def _cdf(self, x):
     # Take Abs(scale) to make subsequent where work correctly.
     y = (x - self.loc) / math_ops.abs(self.scale)
@@ -353,7 +350,7 @@ class StudentTWithAbsDfSoftplusScale(StudentT):
                allow_nan_stats=True,
                name="StudentTWithAbsDfSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[df, scale]):
+    with ops.name_scope(name, values=[df, scale]) as name:
       super(StudentTWithAbsDfSoftplusScale, self).__init__(
           df=math_ops.floor(math_ops.abs(df)),
           loc=loc,
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1efcf9d32e9ea9924bb080459efb7015e33ccd54..bc321900dcbcfe16a9d4e9ee6a60c2ca3ecf396b 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 import numpy as np
 
-# Bijectors must be directly imported because `remove_undocumented` prevents
-# individual file imports.
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -197,8 +195,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Affine(
       shift=-1.,
-      scale_identity_multiplier=2.,
-      event_ndims=0),
+      scale_identity_multiplier=2.)
     name="NormalTransformedDistribution")
   ```
 
@@ -258,7 +255,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     parameters = locals()
     name = name or (("" if bijector is None else bijector.name) +
                     distribution.name)
-    with ops.name_scope(name, values=[event_shape, batch_shape]):
+    with ops.name_scope(name, values=[event_shape, batch_shape]) as name:
       # For convenience we define some handy constants.
       self._zero = constant_op.constant(0, dtype=dtypes.int32, name="zero")
       self._empty = constant_op.constant([], dtype=dtypes.int32, name="empty")
@@ -419,48 +416,51 @@ class TransformedDistribution(distribution_lib.Distribution):
     # For caching to work, it is imperative that the bijector is the first to
     # modify the input.
     x = self.bijector.inverse(y)
-    ildj = self.bijector.inverse_log_det_jacobian(y)
+    event_ndims = self._maybe_get_event_ndims_statically()
+
+    ildj = self.bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims)
     if self.bijector._is_injective:  # pylint: disable=protected-access
-      return self._finish_log_prob_for_one_fiber(y, x, ildj)
+      return self._finish_log_prob_for_one_fiber(y, x, ildj, event_ndims)
 
     lp_on_fibers = [
-        self._finish_log_prob_for_one_fiber(y, x_i, ildj_i)
+        self._finish_log_prob_for_one_fiber(y, x_i, ildj_i, event_ndims)
         for x_i, ildj_i in zip(x, ildj)]
     return math_ops.reduce_logsumexp(array_ops.stack(lp_on_fibers), axis=0)
 
-  def _finish_log_prob_for_one_fiber(self, y, x, ildj):
+  def _finish_log_prob_for_one_fiber(self, y, x, ildj, event_ndims):
     """Finish computation of log_prob on one element of the inverse image."""
     x = self._maybe_rotate_dims(x, rotate_right=True)
     log_prob = self.distribution.log_prob(x)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
     log_prob += math_ops.cast(ildj, log_prob.dtype)
-    if self._is_maybe_event_override:
+    if self._is_maybe_event_override and isinstance(event_ndims, int):
       log_prob.set_shape(array_ops.broadcast_static_shape(
-          y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
+          x.get_shape().with_rank_at_least(1)[:-event_ndims], self.batch_shape))
     return log_prob
 
   def _prob(self, y):
     x = self.bijector.inverse(y)
-    ildj = self.bijector.inverse_log_det_jacobian(y)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims)
     if self.bijector._is_injective:  # pylint: disable=protected-access
-      return self._finish_prob_for_one_fiber(y, x, ildj)
+      return self._finish_prob_for_one_fiber(y, x, ildj, event_ndims)
 
     prob_on_fibers = [
-        self._finish_prob_for_one_fiber(y, x_i, ildj_i)
+        self._finish_prob_for_one_fiber(y, x_i, ildj_i, event_ndims)
         for x_i, ildj_i in zip(x, ildj)]
     return sum(prob_on_fibers)
 
-  def _finish_prob_for_one_fiber(self, y, x, ildj):
+  def _finish_prob_for_one_fiber(self, y, x, ildj, event_ndims):
     """Finish computation of prob on one element of the inverse image."""
     x = self._maybe_rotate_dims(x, rotate_right=True)
     prob = self.distribution.prob(x)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
     prob *= math_ops.exp(math_ops.cast(ildj, prob.dtype))
-    if self._is_maybe_event_override:
+    if self._is_maybe_event_override and isinstance(event_ndims, int):
       prob.set_shape(array_ops.broadcast_static_shape(
-          y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
+          y.get_shape().with_rank_at_least(1)[:-event_ndims], self.batch_shape))
     return prob
 
   def _log_cdf(self, y):
@@ -545,10 +545,17 @@ class TransformedDistribution(distribution_lib.Distribution):
           _ones_like(self.distribution.batch_shape_tensor())
       ], 0)
       entropy = array_ops.tile(entropy, multiples)
-    dummy = array_ops.zeros([], self.dtype)
-    entropy -= math_ops.cast(
-        self.bijector.inverse_log_det_jacobian(dummy),
-        entropy.dtype)
+    dummy = array_ops.zeros(
+        shape=array_ops.concat(
+            [self.batch_shape_tensor(), self.event_shape_tensor()],
+            0),
+        dtype=self.dtype)
+    event_ndims = (self.event_shape.ndims if self.event_shape.ndims is not None
+                   else array_ops.size(self.event_shape_tensor()))
+    ildj = self.bijector.inverse_log_det_jacobian(
+        dummy, event_ndims=event_ndims)
+
+    entropy -= math_ops.cast(ildj, entropy.dtype)
     entropy.set_shape(self.batch_shape)
     return entropy
 
@@ -610,3 +617,16 @@ class TransformedDistribution(distribution_lib.Distribution):
     n = (ndims - self._rotate_ndims) if rotate_right else self._rotate_ndims
     return array_ops.transpose(
         x, _concat_vectors(math_ops.range(n, ndims), math_ops.range(0, n)))
+
+  def _maybe_get_event_ndims_statically(self):
+    if self.event_shape.ndims is not None:
+      return self.event_shape.ndims
+
+    event_ndims = array_ops.size(self.event_shape_tensor())
+
+    static_event_ndims = tensor_util.constant_value(event_ndims)
+
+    if static_event_ndims is not None:
+      return static_event_ndims
+
+    return event_ndims
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index 3580af18f241d777c81340f1c565074914838029..087797c653bb3ee294b747dd7da8ff2896e79fc6 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -45,11 +45,12 @@ class Uniform(distribution.Distribution):
   Z = b - a
   ```
 
-  where:
-  * `low = a`,
-  * `high = b`,
-  * `Z` is the normalizing constant, and,
-  * `I[predicate]` is the [indicator function](
+  where
+
+  - `low = a`,
+  - `high = b`,
+  - `Z` is the normalizing constant, and
+  - `I[predicate]` is the [indicator function](
     https://en.wikipedia.org/wiki/Indicator_function) for `predicate`.
 
   The parameters `low` and `high` must be shaped in a way that supports
@@ -102,7 +103,7 @@ class Uniform(distribution.Distribution):
       InvalidArgumentError: if `low >= high` and `validate_args=False`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[low, high]):
+    with ops.name_scope(name, values=[low, high]) as name:
       with ops.control_dependencies([
           check_ops.assert_less(
               low, high, message="uniform not defined when low >= high.")
@@ -164,11 +165,9 @@ class Uniform(distribution.Distribution):
                                         seed=seed)
     return self.low + self.range() * samples
 
-  def _log_prob(self, x):
-    return math_ops.log(self._prob(x))
-
   def _prob(self, x):
-    broadcasted_x = x * array_ops.ones(self.batch_shape_tensor())
+    broadcasted_x = x * array_ops.ones(
+        self.batch_shape_tensor(), dtype=x.dtype)
     return array_ops.where(
         math_ops.is_nan(broadcasted_x),
         broadcasted_x,
@@ -178,9 +177,6 @@ class Uniform(distribution.Distribution):
             array_ops.zeros_like(broadcasted_x),
             array_ops.ones_like(broadcasted_x) / self.range()))
 
-  def _log_cdf(self, x):
-    return math_ops.log(self.cdf(x))
-
   def _cdf(self, x):
     broadcast_shape = array_ops.broadcast_dynamic_shape(
         array_ops.shape(x), self.batch_shape_tensor())
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 0a3000ef5ca0decf8aba641e704406b0cf8780af..2e067eab459050e30d220bdb7ff0d65cb9c552f7 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -58,8 +58,7 @@ def assert_close(
   if data is None:
     data = [
         message,
-        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
-        y.name, y
+        "Condition x ~= y did not hold element-wise: x = ", x, "y = ", y
     ]
 
   if x.dtype.is_integer:
@@ -95,7 +94,7 @@ def assert_integer_form(
     x = ops.convert_to_tensor(x, name="x")
     if x.dtype.is_integer:
       return control_flow_ops.no_op()
-    message = message or "{} has non-integer components".format(x.op.name)
+    message = message or "{} has non-integer components".format(x)
     if int_dtype is None:
       try:
         int_dtype = {
@@ -123,13 +122,13 @@ def embed_check_nonnegative_integer_form(
     x = ops.convert_to_tensor(x, name="x")
     assertions = [
         check_ops.assert_non_negative(
-            x, message="'{}' must be non-negative.".format(x.op.name)),
+            x, message="'{}' must be non-negative.".format(x)),
     ]
     if not x.dtype.is_integer:
       assertions += [
           assert_integer_form(
               x, message="'{}' cannot contain fractional components.".format(
-                  x.op.name)),
+                  x)),
       ]
     return control_flow_ops.with_dependencies(assertions, x)
 
@@ -434,7 +433,7 @@ def embed_check_integer_casting_closed(
         and not _is_integer_like_by_dtype(target_dtype)):
       raise TypeError("At least one of {}.dtype ({}) and target_dtype ({}) "
                       "must be integer-type.".format(
-                          x.op.name, x.dtype.name, target_dtype.name))
+                          x, x.dtype.name, target_dtype.name))
 
     assertions = []
     if assert_nonnegative:
@@ -683,7 +682,7 @@ def pick_vector(cond,
     cond = ops.convert_to_tensor(cond, name="cond")
     if cond.dtype != dtypes.bool:
       raise TypeError("%s.dtype=%s which is not %s" %
-                      (cond.name, cond.dtype, dtypes.bool))
+                      (cond, cond.dtype, dtypes.bool))
     cond_value_static = tensor_util.constant_value(cond)
     if cond_value_static is not None:
       return true_vector if cond_value_static else false_vector
@@ -692,8 +691,8 @@ def pick_vector(cond,
     if true_vector.dtype != false_vector.dtype:
       raise TypeError(
           "%s.dtype=%s does not match %s.dtype=%s"
-          % (true_vector.name, true_vector.dtype,
-             false_vector.name, false_vector.dtype))
+          % (true_vector, true_vector.dtype,
+             false_vector, false_vector.dtype))
     n = array_ops.shape(true_vector)[0]
     return array_ops.slice(
         array_ops.concat([true_vector, false_vector], 0),
@@ -1060,9 +1059,7 @@ def reduce_weighted_logsumexp(
     wx_over_max_absw_x = (
         math_ops.sign(w) * math_ops.exp(log_absw_x - max_log_absw_x))
     sum_wx_over_max_absw_x = math_ops.reduce_sum(
-        wx_over_max_absw_x,
-        axis=axis,
-        keepdims=keep_dims)
+        wx_over_max_absw_x, axis=axis, keepdims=keep_dims)
     if not keep_dims:
       max_log_absw_x = array_ops.squeeze(max_log_absw_x, axis)
     sgn = math_ops.sign(sum_wx_over_max_absw_x)
@@ -1180,8 +1177,7 @@ def process_quadrature_grid_and_probs(
     grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
     probs = ops.convert_to_tensor(probs, name="unnormalized_probs",
                                   dtype=dtype)
-    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keepdims=True,
-                             name="probs")
+    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keepdims=True, name="probs")
 
     def _static_event_size(x):
       """Returns the static size of a specific dimension or `None`."""
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 3826585f59c31133b12c365816729e090c9ab561..6f2a34c731c42eaee36faf23dc3b91d844900f71 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -35,34 +35,14 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _gather(params, ids, name=None):
-  """Helper function for _embedding_lookup_and_transform.
-
-  This function gathers embeddings from a single tensor. The gather deals with
-  resource variables specially.
-
-  Args:
-    params: A `Tensor` of embeddings.
-    ids: A `Tensor` indexing the embeddings to be retrieved from `params`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` with the same type as `params`.
-  """
-  if isinstance(params, resource_variable_ops.ResourceVariable):
-    return params.sparse_read(ids, name=name)
-  else:
-    return array_ops.gather(params, ids, name=name)
-
-
 def _clip(params, ids, max_norm):
   """Helper function for _embedding_lookup_and_transform.
 
   This function optionally clips embeddings to an l2-norm of max_norm.
 
   Args:
-    params: A `Tensor` of embeddings retrieved by `_gather`.
-    ids: The `ids` argument that was passed to `_gather`.
+    params: A `Tensor` of embeddings retrieved by `gather`.
+    ids: The `ids` argument that was passed to `gather`.
     max_norm: If provided, the embeddings are l2-normalized to the value of
       max_norm.
 
@@ -148,7 +128,8 @@ def _embedding_lookup_and_transform(params,
     ids = ops.convert_to_tensor(ids, name="ids")
     if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
       with ops.colocate_with(params[0]):
-        result = _clip(_gather(params[0], ids, name=name), ids, max_norm)
+        result = _clip(array_ops.gather(params[0], ids, name=name),
+                       ids, max_norm)
         if transform_fn:
           result = transform_fn(result)
         return result
@@ -212,7 +193,7 @@ def _embedding_lookup_and_transform(params,
       for p in xrange(np):
         pids = gather_ids[p]
         with ops.colocate_with(params[p]):
-          result = _gather(params[p], pids)
+          result = array_ops.gather(params[p], pids)
           if transform_fn:
             # If transform_fn is provided, the clip_by_norm precedes
             # the transform and hence must be co-located. See below
@@ -350,11 +331,11 @@ def embedding_lookup_sparse(params,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
-    sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
-      where N is typically batch size and M is arbitrary.
-    sp_weights: either a SparseTensor of float / double weights, or None to
-      indicate all weights should be taken to be 1. If specified, sp_weights
-      must have exactly the same shape and indices as sp_ids.
+    sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size
+      and M is arbitrary.
+    sp_weights: either a `SparseTensor` of float / double weights, or `None` to
+      indicate all weights should be taken to be 1. If specified, `sp_weights`
+      must have exactly the same shape and indices as `sp_ids`.
     partition_strategy: A string specifying the partitioning strategy, relevant
       if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
       is `"mod"`. See `tf.nn.embedding_lookup` for more details.
@@ -370,39 +351,43 @@ def embedding_lookup_sparse(params,
 
   Returns:
     A dense tensor representing the combined embeddings for the
-    sparse ids. For each row in the dense tensor represented by sp_ids, the op
+    sparse ids. For each row in the dense tensor represented by `sp_ids`, the op
     looks up the embeddings for all ids in that row, multiplies them by the
     corresponding weight, and combines these embeddings as specified.
 
     In other words, if
 
-      shape(combined params) = [p0, p1, ..., pm]
+      `shape(combined params) = [p0, p1, ..., pm]`
 
     and
 
-      shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]
+      `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`
 
     then
 
-      shape(output) = [d0, d1, ..., dn-1, p1, ..., pm].
+      `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`.
 
     For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are
 
+      ```python
       [0, 0]: id 1, weight 2.0
       [0, 1]: id 3, weight 0.5
       [1, 0]: id 0, weight 1.0
       [2, 3]: id 1, weight 3.0
+      ```
 
     with `combiner`="mean", then the output will be a 3x20 matrix where
 
+      ```python
       output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
-      output[1, :] = params[0, :] * 1.0
-      output[2, :] = params[1, :] * 3.0
+      output[1, :] = (params[0, :] * 1.0) / 1.0
+      output[2, :] = (params[1, :] * 3.0) / 3.0
+      ```
 
   Raises:
-    TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
-      None nor SparseTensor.
-    ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
+    TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is 
+      neither `None` nor `SparseTensor`.
+    ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
   """
   if combiner is None:
     logging.warn("The default value of combiner will change from \"mean\" "
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index ac03d30fcd2e65f032937d9259bc8fff18626619..c8a1500e76958579f09625d98fb857a0c777c128 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,33 +16,30 @@
 """Functional operations.
 
 See the @{$python/functional_ops} guide.
-
-@@map_fn
-@@foldl
-@@foldr
-@@scan
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_functional_ops import *
-# pylint: enable=wildcard-import
 # pylint: disable=unused-import
-from tensorflow.python.ops.gen_functional_ops import _symbolic_gradient
+from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
+from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -63,10 +60,20 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is fn(initializer, values[0]).shape`.
 
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
   Args:
     fn: The callable to be performed.
-    elems: A tensor to be unpacked on dimension 0.
-    initializer: (optional) The initial value for the accumulator.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be the first argument to `fn`.
+    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
+      as the initial value for the accumulator.
     parallel_iterations: (optional) The number of iterations allowed to run
       in parallel.
     back_prop: (optional) True enables support for back propagation.
@@ -74,8 +81,9 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
-    A tensor resulting from applying `fn` consecutively to the list of tensors
-    unpacked from `elems`, from first to last.
+    A tensor or (possibly nested) sequence of tensors, resulting from applying
+    `fn` consecutively to the list of tensors unpacked from `elems`, from first
+    to last.
 
   Raises:
     TypeError: if `fn` is not callable.
@@ -90,7 +98,12 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   if not callable(fn):
     raise TypeError("fn must be callable.")
 
-  in_graph_mode = context.in_graph_mode()
+  def create_ta(elem):
+    return tensor_array_ops.TensorArray(
+        dtype=elem.dtype, size=n, dynamic_size=False,
+        infer_shape=True).unstack(elem)
+
+  in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "foldl", [elems]):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -105,24 +118,26 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         varscope.set_caching_device(lambda op: op.device)
         varscope_caching_device_was_none = True
 
-    # Convert elems to tensor array.
-    elems = ops.convert_to_tensor(elems, name="elems")
-    n = array_ops.shape(elems)[0]
-    elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
-                                            dynamic_size=False,
-                                            infer_shape=True)
-    elems_ta = elems_ta.unstack(elems)
+    # Convert elems to tensor array. n may be known statically.
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
+    ]
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    elems_ta = nest.map_structure(create_ta, elems)
 
     if initializer is None:
-      a = elems_ta.read(0)
+      a = nest.map_structure(lambda elem: elem.read(0), elems_ta)
       i = constant_op.constant(1)
     else:
-      a = ops.convert_to_tensor(initializer)
+      a = initializer
       i = constant_op.constant(0)
 
     def compute(i, a):
-      a = fn(a, elems_ta.read(i))
+      elem_i = nest.map_structure(lambda elem: elem.read(i), elems_ta)
+      a = fn(a, elem_i)
       return [i + 1, a]
+
     _, r_a = control_flow_ops.while_loop(
         lambda i, a: i < n, compute, [i, a],
         parallel_iterations=parallel_iterations,
@@ -133,6 +148,7 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
+
     return r_a
 
 
@@ -151,10 +167,20 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is `fn(initializer, values[0]).shape`.
 
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
   Args:
     fn: The callable to be performed.
-    elems: A tensor that is unpacked into a sequence of tensors to apply `fn`.
-    initializer: (optional) The initial value for the accumulator.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be the first argument to `fn`.
+    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
+      as the initial value for the accumulator.
     parallel_iterations: (optional) The number of iterations allowed to run
       in parallel.
     back_prop: (optional) True enables support for back propagation.
@@ -162,8 +188,9 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
-    A tensor resulting from applying `fn` consecutively to the list of tensors
-    unpacked from `elems`, from last to first.
+    A tensor or (possibly nested) sequence of tensors, resulting from applying
+    `fn` consecutively to the list of tensors unpacked from `elems`, from last
+    to first.
 
   Raises:
     TypeError: if `fn` is not callable.
@@ -178,7 +205,12 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   if not callable(fn):
     raise TypeError("fn must be callable.")
 
-  in_graph_mode = context.in_graph_mode()
+  def create_ta(elem):
+    return tensor_array_ops.TensorArray(
+        dtype=elem.dtype, size=n, dynamic_size=False,
+        infer_shape=True).unstack(elem)
+
+  in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "foldr", [elems]):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -193,26 +225,30 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         varscope.set_caching_device(lambda op: op.device)
         varscope_caching_device_was_none = True
 
-    # Convert elems to tensor array.
-    elems = ops.convert_to_tensor(elems, name="elems")
-    n = array_ops.shape(elems)[0]
-    elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
-                                            dynamic_size=False,
-                                            infer_shape=True)
-    elems_ta = elems_ta.unstack(elems)
+    # Convert elems to tensor array. n may be known statically.
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
+    ]
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    elems_ta = nest.map_structure(create_ta, elems)
 
     if initializer is None:
       i = n - 1
-      a = elems_ta.read(i)
+      a = nest.map_structure(lambda elem: elem.read(i), elems_ta)
     else:
       i = n
-      a = ops.convert_to_tensor(initializer)
+      a = initializer
+
     def compute(i, a):
       i -= 1
-      a = fn(a, elems_ta.read(i))
-      return [i, a]
+      elem = nest.map_structure(lambda elem: elem.read(i), elems_ta)
+      a_out = fn(a, elem)
+      return [i, a_out]
+
     _, r_a = control_flow_ops.while_loop(
-        lambda i, a: i > 0, compute, [i, a],
+        lambda i, a: i > 0,
+        compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
         swap_memory=swap_memory)
@@ -221,6 +257,7 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
+
     return r_a
 
 
@@ -343,7 +380,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
 
   elems_flat = input_flatten(elems)
 
-  in_graph_mode = context.in_graph_mode()
+  in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "map", elems_flat):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -364,8 +401,16 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
     dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
     dtype_flat = output_flatten(dtype)
 
-    # Convert elems to tensor array.
-    n = array_ops.shape(elems_flat[0])[0]
+    # Convert elems to tensor array. n may be known statically.
+    static_shape = elems_flat[0].shape
+    if static_shape.ndims is not None and static_shape.ndims < 1:
+      if len(elems_flat) == 1:
+        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
+      else:
+        raise ValueError(
+            "elements in elems must be 1+ dimensional Tensors, not scalars"
+        )
+    n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
@@ -536,7 +581,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
   elems_flat = input_flatten(elems)
 
-  in_graph_mode = context.in_graph_mode()
+  in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "scan", elems_flat):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -555,7 +600,8 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     elems_flat = [
         ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
 
-    n = array_ops.shape(elems_flat[0])[0]
+    # Convert elems to tensor array. n may be known statically.
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
@@ -615,7 +661,8 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     _, _, r_a = control_flow_ops.while_loop(
         lambda i, _1, _2: i < n, compute, (i, a_flat, accs_ta),
         parallel_iterations=parallel_iterations,
-        back_prop=back_prop, swap_memory=swap_memory)
+        back_prop=back_prop, swap_memory=swap_memory,
+        maximum_iterations=n)
 
     results_flat = [r.stack() for r in r_a]
 
@@ -632,3 +679,252 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
       varscope.set_caching_device(None)
 
     return output_pack(results_flat)
+
+
+# pylint: disable=invalid-name
+def If(cond, inputs, then_branch, else_branch, name=None):
+  r"""output = Cond(inputs) ? then_branch(inputs) : else_branch(inputs).
+
+  Args:
+    cond: A `Tensor`. A scalar. If the scalar is not a boolean, the scalar is
+      converted to a boolean according to the following rule: if the
+      scalar is a numerical value, non-zero means True and zero means
+      False; if the scalar is a string, non-empty means True and empty
+      means False.
+    inputs: A list of input tensors.
+    then_branch: A function takes 'inputs' and returns a list of tensors,
+        whose types are the same as what else_branch returns.
+    else_branch: A function takes 'inputs' and returns a list of tensors.
+        whose types are the same as what then_branch returns.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors returned by either then_branch(inputs)
+    or else_branch(inputs).
+  """
+  # pylint: disable=protected-access
+  return gen_functional_ops._if(
+      cond,
+      inputs, [_.type for _ in then_branch.definition.signature.output_arg],
+      then_branch,
+      else_branch,
+      name=name)
+
+
+def Gradient(inputs, f, name=None):
+  r"""Computes the gradient function for function f via backpropagation.
+
+  Args:
+    inputs: A list of tensors of size N + M.
+    f: The function we want to compute the gradient for.
+
+      The function 'f' must be a numerical function which takes N inputs and
+      produces M outputs. Its gradient function 'g', which is  a function
+      taking N + M inputs and produces N outputs.
+
+      I.e. if we have
+         (y1, y2, ..., yM) = f(x1, x2, ..., xN),
+      then, g is
+         (dL/dx1, dL/dx2, ..., dL/dxN) = g(x1, x2, ..., xN,
+                                           dL/dy1, dL/dy2, ..., dL/dyM),
+
+      where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
+      loss function). dL/dxi is the partial derivative of L with respect
+      to xi.
+
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors of size N.
+  """
+  # TODO(zhifengc): Pretty-print the above spec in latex.
+  # TODO(zhfiengc): Needs some math expert to say the comment above better.
+  tlist = [_.type for _ in f.definition.signature.input_arg]
+  return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name)
+
+
+# pylint: disable=invalid-name,protected-access
+def While(input_, cond, body, name=None, hostmem=None):
+  r"""output = input; While (Cond(output)) { output = Body(output) }.
+
+  Args:
+    input_: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    cond: . A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+    body: . A funcion takes a list of tensors and returns another
+      list tensors. Both lists have the same types as specified
+      by T.
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, input[i] is a
+      host memory tensor.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  ret = gen_functional_ops._while(input_, cond, body, name=name)
+  if hostmem:
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# b/36459430
+#
+# Ideally, we do not need this rewrite For loop into a While loop.
+# However, today, if a While runs on GPU and the condition returns a
+# boolean, the While kernel crashes. Even if we fix the crash, the
+# bool needs to be copied between GPU and CPU. So, a for loop is much
+# preferred when running on GPU.
+#
+# On the other hand, For op has no directly XLA kernel. So, when we run
+# a for loop, we need to rewrite it using a While op.
+#
+# It should be possible and probably better to write a XLA C++ kernel
+# implementing the logic in _ForUsingWhile.
+def _ForUsingWhile(start,
+                   limit,
+                   delta,
+                   inputs,
+                   forbody,
+                   name=None,
+                   hostmem=None):
+  """Helper to implement a For loop using a While."""
+  # To support negative delta (e.g., range(100, 0, -3)), we iterate
+  # over the range(n) and use iter * delta + start as the real
+  # iteration index. (e.g., for i in range(34): iter = i * (-3) +
+  # 100).
+  d = math_ops.abs(delta)
+  # XLA on TPUs doesn't support integer division
+  n = math_ops.cast(
+      math_ops.cast((math_ops.abs(limit - start) + d - 1), dtypes.float32) /
+      math_ops.cast(d, dtypes.float32), dtypes.int32)
+
+  # Carried loop variables ("extra_args") are implicitly added to the input list
+  # of the WhileBody function. WhileCond does not call forbody, and so does not
+  # depend on any of forbody's extra_args. Since WhileCond and WhileBody
+  # must have identical inputs, we have to augment the cond signature to take
+  # the same types as the carried loop variables.
+  body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:]
+  cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs]
+
+  cond_name = "%s_Cond" % forbody.name
+
+  @function.Defun(*cond_sig, func_name=cond_name)
+  def WhileCond(i, n, *args):
+    del args
+    return i < n
+
+  body_name = "%s_Body" % forbody.name
+
+  @function.Defun(*body_sig, func_name=body_name)
+  def WhileBody(i, n, start, delta, *args):
+    """A While wrapper for forbody that handles loop-carried captured inputs."""
+    for_result = forbody(start + i * delta, *args)
+    # Nullary functions return an Operation. Normal functions can't do this
+    # because their return values are converted to Tensors.
+    if isinstance(for_result, ops.Operation):
+      for_result = ()
+    # Unary functions return a single Tensor value.
+    elif isinstance(for_result, ops.Tensor):
+      for_result = (for_result,)
+    extra_args = tuple(function.get_extra_args())
+    return (i + 1, n, start, delta) + tuple(for_result) + extra_args
+
+  if hostmem is not None:
+    hostmem = [(4 + _) for _ in hostmem]
+
+  results = While(
+      input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
+      cond=WhileCond,
+      body=WhileBody,
+      name=name,
+      hostmem=hostmem)
+  # Slice off the loop-carried captured inputs.
+  return list(results[4:len(results) - len(WhileBody.captured_inputs)])
+
+
+def For(start,
+        limit,
+        delta,
+        inputs,
+        body,
+        name=None,
+        hostmem=None,
+        rewrite_with_while=None):
+  r"""out = input; for i in range(start, limit, delta) out = body(i, out).
+
+  Args:
+    start: A `Tensor` of type `int32`.
+    limit: A `Tensor` of type `int32`.
+    delta: A `Tensor` of type `int32`.
+    inputs: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    body: A function takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as (int32, T...).
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, inputs[i] is a
+      host memory tensor. In other words, (i+1)-th argument of the body
+      function is expecting a host memory.
+    rewrite_with_while: If True, using While op to implement the For.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  if rewrite_with_while:
+    return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem)
+  if body.captured_inputs:
+    wrapper_name = "%s_BodyWrapper" % body.name
+
+    @function.Defun(*body.declared_input_types, func_name=wrapper_name)
+    def BodyWrapper(*args):
+      """A wrapper for body that handles loop-carried captured inputs."""
+      body_result = body(*args)
+      extra_args = tuple(function.get_extra_args())
+      # Nullary functions return an Operation. Normal functions can't do this
+      # because their return values are converted to Tensors.
+      if isinstance(body_result, ops.Operation):
+        return extra_args
+      # Unary functions return a single Tensor value.
+      elif not isinstance(body_result, tuple):
+        return (body_result,) + extra_args
+      # N-ary functions return a tuple of Tensors.
+      else:
+        return body_result + extra_args
+
+    inputs += BodyWrapper.captured_inputs
+    ret = gen_functional_ops._for(
+        start, limit, delta, inputs, BodyWrapper, name=name)
+    # Slice off the loop-carried captured inputs.
+    ret = ret[:-len(BodyWrapper.captured_inputs)]
+  else:
+    ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name)
+  if hostmem:
+    num_for_params = 3  # start/limit/delta
+
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend([num_for_params + i for i in hostmem])
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+# pylint: enable=invalid-name,protected-access
+
+
+def partitioned_call(args, f):
+  return gen_functional_ops.partitioned_call(
+      args=args, Tout=[o.type for o in f.definition.signature.output_arg], f=f)
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index 921fd50aa9fdd1a1e493708f4bc8c66996e26e2c..9fa8e27d5cb51e0c2dd0b7926756a579d38841d2 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -19,16 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.eager.backprop import GradientTape
+from tensorflow.python.ops.custom_gradient import custom_gradient
 from tensorflow.python.ops.gradients_impl import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
 from tensorflow.python.ops.gradients_impl import hessians
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
-    # TODO(drpng): find a good place to reference this.
-    "AggregationMethod",
-    "gradients",  # tf.gradients.gradients.
-    "hessians",  # tf.gradients.hessians
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 1418c0b10fb60601e7c3024891b89aadb53e6873..1448151fef4aabee709d1a8306d820b2d7f5ff76 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -86,17 +86,19 @@ def _IndexedSlicesToTensor(value, dtype=None, name=None, as_ref=False):
         % str(value))
   # TODO(mrry): Consider adding static shape information to
   # IndexedSlices, to avoid using numpy here.
-  dense_shape_value = tensor_util.constant_value(value.dense_shape)
-  if dense_shape_value is not None:
-    num_elements = np.prod(dense_shape_value)
-    if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS:
+  if not context.executing_eagerly():
+    dense_shape_value = tensor_util.constant_value(value.dense_shape)
+    if dense_shape_value is not None:
+      num_elements = np.prod(dense_shape_value)
+      if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS:
+        warnings.warn(
+            "Converting sparse IndexedSlices to a dense Tensor with %d "
+            "elements. This may consume a large amount of memory." %
+            num_elements)
+    else:
       warnings.warn(
-          "Converting sparse IndexedSlices to a dense Tensor with %d elements. "
-          "This may consume a large amount of memory." % num_elements)
-  else:
-    warnings.warn(
-        "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
-        "This may consume a large amount of memory.")
+          "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
+          "This may consume a large amount of memory.")
   return math_ops.unsorted_segment_sum(
       value.values, value.indices, value.dense_shape[0], name=name)
 
@@ -119,7 +121,8 @@ def _MarkReachedOps(from_ops, reached_ops):
     if not reached_ops[op._id]:
       reached_ops[op._id] = True
       for output in op.outputs:
-        queue.extend(output.consumers())
+        if _IsBackpropagatable(output):
+          queue.extend(output.consumers())
 
 
 def _GatherInputs(to_ops, reached_ops):
@@ -161,16 +164,19 @@ def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops):
     colocate_gradients_with_ops: Python bool.  See docstring of gradients().
 
   Returns:
-    A tuple containing: (1) a list of integers indexed by operation id,
-    indicating the number of backprop inputs to this operation, and (2)
-    a ControlFlowState object which is not None if the ops between from_ops
-    and to_ops contain control flow loops.
+    A tuple containing: (1) the subset of to_ops ids reachable from from_ops
+    by a path of zero or more backpropagatable tensors, (2) a list of integers
+    indexed by operation id, indicating the number of backprop inputs to this
+    operation, and (3) a ControlFlowState object which is not None if the ops
+    between from_ops and to_ops contain control flow loops.
   """
   # Mark reachable ops from from_ops.
   reached_ops = [False] * (graph._last_id + 1)
-  for op in to_ops:
-    reached_ops[op._id] = True
   _MarkReachedOps(from_ops, reached_ops)
+  # reached_ops[X] iff X is reachable from from_ops by a path of zero or more
+  # backpropagatable tensors.
+
+  reachable_to_ops = set(op._id for op in to_ops if reached_ops[op._id])  # pylint: disable=protected-access
 
   # Mark between ops.
   between_ops = [False] * (graph._last_id + 1)
@@ -187,6 +193,8 @@ def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops):
       reached_ops[op._id] = False
       for inp in op.inputs:
         queue.append(inp.op)
+  # between_ops[X] iff X is on a path of zero or more backpropagatable tensors
+  # between from_ops and to_ops
 
   # 'loop_state' is None if there are no while loops.
   loop_state = control_flow_ops.MaybeCreateControlFlowState(
@@ -199,14 +207,17 @@ def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops):
       if between_ops[x.op._id]:
         pending_count[x.op._id] += 1
 
-  return pending_count, loop_state
+  return reachable_to_ops, pending_count, loop_state
 
 
 def _AsList(x):
   return x if isinstance(x, (list, tuple)) else [x]
 
 
-def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
   """Fill in default values for grad_ys.
 
   Args:
@@ -214,6 +225,9 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
     ys: List of tensors.
     colocate_gradients_with_ops: If True, try colocating gradients with
       the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
 
   Returns:
     A list of gradients to use, without None.
@@ -229,7 +243,7 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
   for i in xrange(len(grad_ys)):
     grad_y = grad_ys[i]
     y = ys[i]
-    with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
       if grad_y is None:
         if y.dtype.is_complex:
           raise TypeError(
@@ -242,21 +256,21 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
         continue
       if y.dtype.is_floating or y.dtype.is_integer:
         if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
-          raise TypeError("Gradient type %s generated for real or "
-                          "integer-valued tensor %s with type %s must be "
-                          "real or integer" %
-                          (dtypes.as_dtype(grad_y.dtype).name, y,
-                           dtypes.as_dtype(y.dtype).name))
+          raise TypeError(
+              "Gradient type %s generated for real or "
+              "integer-valued tensor %s with type %s must be "
+              "real or integer" % (dtypes.as_dtype(grad_y.dtype).name, y,
+                                   dtypes.as_dtype(y.dtype).name))
       elif y.dtype.is_complex:
         if not grad_y.dtype.is_complex:
-          raise TypeError("Gradient type %s generated for complex-valued "
-                          "tensor %s with type %s must be real" %
-                          (dtypes.as_dtype(grad_y.dtype).name, y,
-                           dtypes.as_dtype(y.dtype).name))
+          raise TypeError(
+              "Gradient type %s generated for complex-valued "
+              "tensor %s with type %s must be real" % (dtypes.as_dtype(
+                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
       else:
-        raise TypeError("Tensor %s with type %s must be numeric "
-                        "to obtain a default gradient" %
-                        (y, dtypes.as_dtype(y.dtype).name))
+        raise TypeError(
+            "Tensor %s with type %s must be numeric "
+            "to obtain a default gradient" % (y, dtypes.as_dtype(y.dtype).name))
       # Create a grad_y tensor in the name scope of the gradient.
       # Required for TensorArrays to identify which gradient call a
       # grad_y value is coming from.
@@ -286,6 +300,13 @@ def _IsTrainable(tensor):
                               dtypes.complex64, dtypes.complex128)
 
 
+def _IsBackpropagatable(tensor):
+  if _IsTrainable(tensor):
+    return True
+  dtype = dtypes.as_dtype(tensor.dtype)
+  return dtype.base_dtype in (dtypes.bfloat16, dtypes.resource, dtypes.variant)
+
+
 def _VerifyGeneratedGradients(grads, op):
   """Verify that gradients are valid in number and type.
 
@@ -336,10 +357,10 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
 
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, colocate_gradients_with_ops):
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
   """Context to colocate with `op` if `colocate_gradients_with_ops`."""
   if colocate_gradients_with_ops:
-    with ops.colocate_with(op):
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
       yield
   else:
     yield
@@ -354,7 +375,7 @@ def _SymGrad(op, out_grads):
   for k in op.node_def.attr:
     f.attr[k].CopyFrom(op.node_def.attr[k])
   # pylint: disable=protected-access
-  in_grads = functional_ops._symbolic_gradient(input=f_in, Tout=f_types, f=f)
+  in_grads = functional_ops.symbolic_gradient(input=f_in, Tout=f_types, f=f)
   # pylint: enable=protected-access
   return in_grads
 
@@ -452,6 +473,9 @@ def gradients(ys,
   backpropagation stops at both `tf.stop_gradient` nodes and nodes in
   `stop_gradients`, whichever is encountered first.
 
+  All integer tensors are considered constant with respect to all `xs`, as if
+  they were included in `stop_gradients`.
+
   Args:
     ys: A `Tensor` or list of tensors to be differentiated.
     xs: A `Tensor` or list of tensors to be used for differentiation.
@@ -478,9 +502,21 @@ def gradients(ys,
     RuntimeError: if called in Eager mode.
 
   """
-  if context.in_eager_mode():
-    raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
-                       "functions in tf.contrib.eager.backprop instead.")
+  # Creating the gradient graph for control flow mutates Operations. _lock
+  # ensures a Session.run call cannot occur between creating and mutating new
+  # ops.
+  with ops.get_default_graph()._lock:  # pylint: disable=protected-access
+    return _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
+                            gate_gradients, aggregation_method, stop_gradients)
+
+
+def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
+                     gate_gradients, aggregation_method, stop_gradients):
+  """Implementation of gradients()."""
+  if context.executing_eagerly():
+    raise RuntimeError("tf.gradients not supported when eager execution "
+                       "is enabled. Use tf.contrib.eager.GradientTape "
+                       "instead.")
   ys = _AsList(ys)
   xs = _AsList(xs)
   stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
@@ -492,6 +528,9 @@ def gradients(ys,
   with ops.name_scope(
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = [
         x.handle if resource_variable_ops.is_resource_variable(x) else x
@@ -499,7 +538,8 @@ def gradients(ys,
     ]
     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
         xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
 
     # The approach we take here is as follows: Create a list of all ops in the
     # subgraph between the ys and xs.  Visit these ops in reverse order of ids
@@ -515,7 +555,7 @@ def gradients(ys,
     to_ops = [t.op for t in ys]
     from_ops = [t.op for t in xs]
     stop_gradient_ops = [t.op for t in stop_gradients]
-    pending_count, loop_state = _PendingCount(
+    reachable_to_ops, pending_count, loop_state = _PendingCount(
         ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)
 
     # Iterate over the collected ops.
@@ -540,7 +580,7 @@ def gradients(ys,
       # another output's gradient.
       # pylint: disable=protected-access
       ready = (pending_count[op._id] == 0)
-      if ready and op._id not in to_ops_set:
+      if ready and op._id not in to_ops_set and op._id in reachable_to_ops:
         to_ops_set.add(op._id)
         queue.append(op)
       # pylint: enable=protected-access
@@ -556,23 +596,28 @@ def gradients(ys,
     while queue:
       # generate gradient subgraph for op.
       op = queue.popleft()
-      with _maybe_colocate_with(op, colocate_gradients_with_ops):
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=True)
 
         grad_fn = None
-        # pylint: disable=protected-access
         func_call = None
+        # pylint: disable=protected-access
         is_func_call = ops.get_default_graph()._is_function(op.type)
+        # pylint: enable=protected-access
         has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
         if has_out_grads and (op._id not in stop_ops):
           if is_func_call:
             func_call = ops.get_default_graph()._get_function(op.type)
+            # Note that __defun is not set if the graph is
+            # imported. If it's set, we prefer to access the original
+            # defun.
+            func_call = getattr(op, "__defun", func_call)
             grad_fn = func_call.python_grad_func
-            # pylint: enable=protected-access
           else:
             # A grad_fn must be defined, either as a function or as None
             # for ops that do not have gradients.
@@ -619,7 +664,10 @@ def gradients(ys,
               if gate_gradients and len([x for x in in_grads
                                          if x is not None]) > 1:
                 with ops.device(None):
-                  with ops.colocate_with(None, ignore_existing=True):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
@@ -775,7 +823,7 @@ def _LogOpGradients(op, out_grads, in_grads):
                ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
 
 
-def _MultiDeviceAddN(tensor_list):
+def _MultiDeviceAddN(tensor_list, gradient_uid):
   """Adds tensors from potentially multiple devices."""
   # Basic function structure comes from control_flow_ops.group().
   # Sort tensors according to their devices.
@@ -794,7 +842,10 @@ def _MultiDeviceAddN(tensor_list):
 
   for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
     tensors = tensors_on_device[dev]
-    with ops.colocate_with(tensors[0].op, ignore_existing=True):
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
       summands.append(math_ops.add_n(tensors))
 
   return math_ops.add_n(summands)
@@ -820,12 +871,19 @@ class AggregationMethod(object):
   EXPERIMENTAL_ACCUMULATE_N = 2
 
 
-def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
   """Get the aggregated gradients for op.
 
   Args:
     grads: The map of memoized gradients.
     op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
     loop_state: An object for maintaining the state of the while loops in the
                 graph. It is of type ControlFlowState. None if the graph
                 contains no while loops.
@@ -902,7 +960,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
             out_grads[i] = running_sum
         else:
           used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad)
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
         logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                      tensor_shape, used)
       else:
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index d39b934819177e3c15af95a0777ba96869c5e9cf..5e8b8822efd6066594aa4d436cc4cafc97e8eb68 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -24,6 +24,8 @@ import warnings
 import numpy as np
 
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -31,10 +33,12 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
+from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import data_flow_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import functional_ops  # pylint: disable=unused-import
@@ -43,9 +47,11 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.nn_ops import bias_add
 from tensorflow.python.platform import googletest
@@ -661,6 +667,7 @@ class HessianTest(test_util.TensorFlowTestCase):
     self.assertAllEqual((m, n, m, n), hess_actual.shape)
     self.assertAllClose(hess_value, hess_actual.reshape((m * n, m * n)))
 
+
 @test_util.with_c_api
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
@@ -755,5 +762,188 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
       gradients.gradients(y, x)
 
 
+class ResourceCondTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    gamma = resource_variable_ops.ResourceVariable(
+        np.random.random((3,)),
+        dtype="float32", name="gamma")
+
+    inputs = array_ops.ones(shape=(3,), dtype="float32")
+
+    def TestFn():
+      output = inputs + gamma
+      return output
+
+    training = array_ops.placeholder_with_default(True, shape=())
+    output = control_flow_ops.cond(
+        training, TestFn, lambda: inputs)
+
+    loss = output
+
+    grads = gradients.gradients(
+        loss, [gamma])
+    self.assertTrue(None not in grads)
+
+
+@test_util.with_c_api
+class CustomGradientTest(test_util.TensorFlowTestCase):
+
+  def testCustomGradientTrivial(self):
+
+    @custom_gradient.custom_gradient
+    def MyIdentity(x):
+
+      def Grad(dy):
+        return [3 * dy]
+
+      return x, Grad
+
+    with ops.Graph().as_default():
+      x = constant(3.)
+      y = MyIdentity(MyIdentity(x))
+      dy = gradients.gradients(y, x)[0]
+      with session.Session():
+        self.assertEqual(9., dy.eval())
+
+  def testCustomGradient(self):
+
+    @custom_gradient.custom_gradient
+    def MyMultiply(x1, x2):
+      result = x1 * x2
+
+      def Grad(dy):
+        # Switched the ordering here.
+        return [dy * x1, dy * x2]
+
+      return result, Grad
+
+    with ops.Graph().as_default():
+      x1 = constant(3.)
+      x2 = constant(5.)
+      y = MyMultiply(x1, x2)
+      dy = gradients.gradients(y, [x1, x2])
+      with session.Session() as sess:
+        self.assertAllEqual([3., 5.], sess.run(dy))
+
+  def testCustomGradientErrors(self):
+
+    @custom_gradient.custom_gradient
+    def F(x):
+
+      def Grad(_):
+        raise RuntimeError("x")
+
+      return x, Grad
+
+    with ops.Graph().as_default():
+      x = constant(1.0)
+      y = F(x)
+      with self.assertRaises(RuntimeError):
+        gradients.gradients(y, x)
+
+  def testCustomGradientWithVariables(self):
+
+    @custom_gradient.custom_gradient
+    def F(x):
+      out = core_layers.dense(x, 3, use_bias=False)
+
+      def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
+        self.assertEqual(1, len(variables))
+        grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad)
+        return grads[0], [array_ops.ones((4, 3))]
+
+      return out, Grad
+
+    with ops.Graph().as_default():
+      x = array_ops.ones((2, 4))
+      with variable_scope.variable_scope("f", use_resource=True) as vs:
+        y = F(x)
+        all_vars = vs.global_variables()
+        assert len(all_vars) == 1
+      grads = gradients.gradients(y, [x, all_vars[0]])
+      for g in grads:
+        self.assertTrue(g is not None)
+      with session.Session() as sess:
+        sess.run(variables.global_variables_initializer())
+        dw = sess.run(math_ops.reduce_sum(grads[1]))
+        self.assertEqual(12., dw)
+
+  def testCustomGradientWithVariablesEager(self):
+    with context.eager_mode():
+      layer = core_layers.Dense(4, use_bias=False)
+
+      @custom_gradient.custom_gradient
+      def F(x):
+        out = layer(x)
+
+        def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
+          del out_grad
+          self.assertEqual(1, len(variables))
+          return (array_ops.ones((3, 2)),
+                  [array_ops.ones((2, 4))])
+
+        return out, Grad
+
+      x = array_ops.ones((3, 2)) + 2.
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = F(x)
+      w, = layer.variables
+      dx, dw = tape.gradient(y, [x, w])
+      self.assertEqual(6., math_ops.reduce_sum(dx).numpy())
+      self.assertEqual(8., math_ops.reduce_sum(dw).numpy())
+
+  def testCustomGradientErrorsWithNonResourceVariables(self):
+
+    def F(x, use_resource=False):
+      with variable_scope.variable_scope("f", use_resource=use_resource):
+        out = core_layers.dense(x, 4, use_bias=False)
+
+      def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
+        del out_grad
+        self.assertEqual(1, len(variables))
+        return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))])
+
+      return out, Grad
+
+    @custom_gradient.custom_gradient
+    def FResource(x):
+      return F(x, use_resource=True)
+
+    @custom_gradient.custom_gradient
+    def FNonResource(x):
+      return F(x, use_resource=False)
+
+    x = array_ops.ones((3, 2)) + 2.
+
+    # Wrapping scope has use_resource=True but inner scope sets to False. Fails.
+    with variable_scope.variable_scope("vs1", use_resource=True):
+      with self.assertRaisesWithPredicateMatch(TypeError,
+                                               "must be `ResourceVariable`s"):
+        FNonResource(x)
+
+    # Wrapping scope has use_resource=False but inner scope sets to True.
+    # Passes.
+    with variable_scope.variable_scope("vs2", use_resource=False):
+      FResource(x)
+
+  def testWithNumpyInputs(self):
+    with context.eager_mode():
+
+      @custom_gradient.custom_gradient
+      def F(x):
+        out = x
+
+        def Grad(_):
+          return (None, None)
+
+        return out, Grad
+
+      x = np.ones((3, 2), dtype=np.float32)
+      # Smoke test to ensure numpy inputs are accepted
+      F(x)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
deleted file mode 100644
index f6ef6f3f3da4389a16a84fa0b3570d3cd1262472..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/hidden_ops.txt
+++ /dev/null
@@ -1,392 +0,0 @@
-# array_ops
-BatchToSpace
-BroadcastArgs
-BroadcastGradientArgs
-ConcatOffset
-Concat
-ConcatV2
-ConjugateTranspose
-Const
-DebugGradientIdentity
-DebugGradientRefIdentity
-EditDistance
-ExpandDims
-ListDiff
-MirrorPad
-MirrorPadGrad
-OneHot
-Pack
-Pad
-PadV2
-ParallelConcat
-Placeholder
-RefIdentity
-Reverse
-Snapshot
-SpaceToBatch
-Split
-SplitV
-Squeeze
-Slice
-TileGrad  # Exported through array_grad instead of array_ops.
-ZerosLike  # TODO(josh11b): Use this instead of the Python version.
-Unique
-UniqueV2
-Unpack
-
-# candidate_sampling_ops
-AllCandidateSampler
-ComputeAccidentalHits
-FixedUnigramCandidateSampler
-LearnedUnigramCandidateSampler
-LogUniformCandidateSampler
-ThreadUnsafeUnigramCandidateSampler
-UniformCandidateSampler
-
-# checkpoint_ops
-GenerateVocabRemapping
-LoadAndRemapMatrix
-
-
-# control_flow_ops
-Switch
-Merge
-RefMerge
-Exit
-RefExit
-
-# ctc_ops
-CTCLoss
-CTCGreedyDecoder
-CTCBeamSearchDecoder
-
-# data_flow_ops
-Barrier
-BarrierClose
-BarrierIncompleteSize
-BarrierInsertMany
-BarrierReadySize
-BarrierTakeMany
-DeleteSessionTensor
-FakeQueue
-FIFOQueue
-FIFOQueueV2
-GetSessionHandle
-GetSessionHandleV2
-GetSessionTensor
-HashTable
-HashTableV2
-InitializeTable
-InitializeTableV2
-InitializeTableFromTextFile
-InitializeTableFromTextFileV2
-LookupTableExport
-LookupTableExportV2
-LookupTableFind
-LookupTableFindV2
-LookupTableImport
-LookupTableImportV2
-LookupTableInsert
-LookupTableInsertV2
-LookupTableSize
-LookupTableSizeV2
-MutableDenseHashTable
-MutableDenseHashTableV2
-MutableHashTable
-MutableHashTableV2
-MutableHashTableOfTensors
-MutableHashTableOfTensorsV2
-Mutex
-MutexAcquire
-MutexRelease
-PaddingFIFOQueue
-PaddingFIFOQueueV2
-PriorityQueue
-PriorityQueueV2
-QueueClose
-QueueCloseV2
-QueueDequeue
-QueueDequeueV2
-QueueDequeueMany
-QueueDequeueManyV2
-QueueDequeueUpTo
-QueueDequeueUpToV2
-QueueEnqueue
-QueueEnqueueV2
-QueueEnqueueMany
-QueueEnqueueManyV2
-QueueSize
-QueueSizeV2
-RandomShuffleQueue
-RandomShuffleQueueV2
-Stack
-StackClose
-StackPop
-StackPush
-StackV2
-StackCloseV2
-StackPopV2
-StackPushV2
-TensorArray
-TensorArrayClose
-TensorArrayCloseV2
-TensorArrayConcat
-TensorArrayConcatV2
-TensorArrayGather
-TensorArrayGatherV2
-TensorArrayGrad
-TensorArrayGradV2
-TensorArrayPack
-TensorArrayPackV2
-TensorArrayRead
-TensorArrayReadV2
-TensorArrayScatter
-TensorArrayScatterV2
-TensorArraySize
-TensorArraySizeV2
-TensorArraySplit
-TensorArraySplitV2
-TensorArrayUnpack
-TensorArrayUnpackV2
-TensorArrayV2
-TensorArrayWrite
-TensorArrayWriteV2
-TensorArrayV3
-TensorArrayCloseV3
-TensorArrayConcatV3
-TensorArrayGatherV3
-TensorArrayGradV3
-TensorArrayReadV3
-TensorArrayPackV3
-TensorArrayScatterV3
-TensorArraySizeV3
-TensorArraySplitV3
-TensorArrayUnpackV3
-TensorArrayWriteV3
-
-# functional_ops
-SymbolicGradient
-
-# image_ops
-AdjustContrastv2
-NonMaxSuppression
-NonMaxSuppressionV2
-RandomCrop
-ResizeBilinearGrad
-ResizeBicubicGrad
-ResizeNearestNeighborGrad
-SampleDistortedBoundingBox
-SampleDistortedBoundingBoxV2
-ScaleImageGrad
-
-# io_ops
-FixedLengthRecordReader
-IdentityReader
-ReaderNumRecordsProduced
-ReaderNumWorkUnitsCompleted
-ReaderRead
-ReaderReadUpTo
-ReaderReset
-ReaderRestoreState
-ReaderSerializeState
-ReaderWorkQueueLength
-FixedLengthRecordReaderV2
-IdentityReaderV2
-ReaderNumRecordsProducedV2
-ReaderNumWorkUnitsCompletedV2
-ReaderReadV2
-ReaderReadUpToV2
-ReaderResetV2
-ReaderRestoreStateV2
-ReaderSerializeStateV2
-ReaderWorkQueueLengthV2
-Restore
-RestoreSlice
-Save
-SaveSlices
-ShardedFilename
-ShardedFilespec
-TextLineReader
-TFRecordReader
-WholeFileReader
-TextLineReaderV2
-TFRecordReaderV2
-WholeFileReaderV2
-LMDBReader
-DecodeCSV
-
-# linalg_ops
-BatchCholesky
-BatchCholeskyGrad
-BatchMatrixDeterminant
-BatchMatrixInverse
-BatchMatrixSolve
-BatchMatrixSolveLs
-BatchMatrixTriangularSolve
-BatchSelfAdjointEig
-BatchSelfAdjointEigV2
-BatchSvd
-LogMatrixDeterminant
-MatrixExponential
-MatrixLogarithm
-MatrixSolveLs
-SelfAdjointEig
-SelfAdjointEigV2
-Svd
-
-# logging_ops
-Assert
-AudioSummary
-AudioSummaryV2
-HistogramSummary
-ImageSummary
-MergeSummary
-Print
-ScalarSummary
-TensorSummary
-TensorSummaryV2
-
-# math_ops
-Abs
-AccumulateNV2
-AddN
-AddV2
-All
-Any
-BatchMatMul
-BatchFFT
-BatchFFT2D
-BatchFFT3D
-BatchIFFT
-BatchIFFT2D
-BatchIFFT3D
-Bucketize
-Complex
-ComplexAbs
-Conj
-FloorDiv
-FloorMod
-HistogramFixedWidth
-Max
-Mean
-Min
-Mul
-Neg
-Pow
-Prod
-Range
-RealDiv
-Select
-SparseMatMul
-Sub
-Sum
-MatMul
-Sigmoid
-Tanh
-SigmoidGrad
-TanhGrad
-InvGrad
-ReciprocalGrad
-SqrtGrad
-RsqrtGrad
-TruncateDiv
-TruncateMod
-
-# nn_ops
-AvgPoolGrad  # "*Grad" accessible through nn_grad instead of nn_ops.
-AvgPool3DGrad
-BatchNormWithGlobalNormalization
-BatchNormWithGlobalNormalizationGrad
-FusedBatchNorm
-FusedBatchNormV2
-SoftmaxCrossEntropyWithLogits
-SparseSoftmaxCrossEntropyWithLogits
-LRNGrad
-MaxPoolGrad
-MaxPoolGradWithArgmax
-MaxPoolGradGrad
-MaxPoolGradGradWithArgmax
-MaxPool3DGrad
-MaxPool3DGradGrad
-ReluGrad
-Relu6Grad
-EluGrad
-SeluGrad
-SoftplusGrad
-SoftsignGrad
-TopK
-TopKV2
-BiasAdd
-BiasAddV1
-Relu6
-AvgPool
-MaxPool
-MaxPoolV2
-Softmax
-LogSoftmax
-FractionalAvgPoolGrad
-FractionalMaxPoolGrad
-InTopK
-InTopKV2
-
-# parsing_ops
-ParseExample
-ParseSingleSequenceExample
-
-# random_ops
-RandomGamma
-RandomPoisson
-RandomUniform
-RandomUniformInt
-RandomShuffle
-RandomStandardNormal
-ParameterizedTruncatedNormal
-TruncatedNormal
-
-# script_ops
-PyFunc
-PyFuncStateless
-EagerPyFunc
-
-# sdca_ops
-
-# state_ops
-Variable
-VariableV2
-TemporaryVariable
-DestroyTemporaryVariable
-
-# sparse_ops
-AddSparseToTensorsMap
-AddManySparseToTensorsMap
-TakeManySparseFromTensorsMap
-DeserializeManySparse
-DeserializeSparse
-SerializeManySparse
-SerializeSparse
-SparseAdd
-SparseAddGrad
-SparseConcat
-SparseCross
-SparseFillEmptyRows
-SparseFillEmptyRowsGrad
-SparseSplit
-SparseSelectLastK
-SparseReorder
-SparseReshape
-SparseToDense
-SparseTensorDenseAdd
-SparseTensorDenseMatMul
-
-# string_ops
-StringSplit
-
-# user_ops
-Fact
-
-# training_ops
-# (None)
-
-# word2vec deprecated ops
-NegTrain
-Skipgram
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index 6a975160b0698270dfc9ce9140e8b3ff633cdb9e..e86a8e5a5baa5657f92243172c818518af7c77dc 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -16,9 +16,6 @@
 """Histograms.
 
 Please see @{$python/histogram_ops} guide.
-
-@@histogram_fixed_width_bins
-@@histogram_fixed_width
 """
 
 from __future__ import absolute_import
@@ -32,7 +29,6 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('histogram_fixed_width_bins')
@@ -141,5 +137,7 @@ def histogram_fixed_width(values,
   """
   with ops.name_scope(name, 'histogram_fixed_width',
                       [values, value_range, nbins]) as name:
-    return gen_math_ops._histogram_fixed_width(  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    return gen_math_ops._histogram_fixed_width(
         values, value_range, nbins, dtype=dtype, name=name)
+    # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/image_grad.py b/tensorflow/python/ops/image_grad.py
index 093843cd5bc0b7c2281a0c9ddf52d93ea3faede3..9f43e3f1466d900ae6d39f3b9ef48043421cb777 100644
--- a/tensorflow/python/ops/image_grad.py
+++ b/tensorflow/python/ops/image_grad.py
@@ -41,12 +41,10 @@ def _ResizeNearestNeighborGrad(op, grad):
   else:
     image_shape = array_ops.shape(image)[1:3]
 
-  # pylint: disable=protected-access
-  grads = gen_image_ops._resize_nearest_neighbor_grad(
+  grads = gen_image_ops.resize_nearest_neighbor_grad(
       grad,
       image_shape,
       align_corners=op.get_attr("align_corners"))
-  # pylint: enable=protected-access
   return [grads, None]
 
 
@@ -61,10 +59,8 @@ def _ResizeBilinearGrad(op, grad):
   Returns:
     The gradients w.r.t. the input.
   """
-  # pylint: disable=protected-access
-  grad0 = gen_image_ops._resize_bilinear_grad(
+  grad0 = gen_image_ops.resize_bilinear_grad(
       grad, op.inputs[0], align_corners=op.get_attr("align_corners"))
-  # pylint: enable=protected-access
   return [grad0, None]
 
 
@@ -82,10 +78,8 @@ def _ResizeBicubicGrad(op, grad):
   allowed_types = [dtypes.float32, dtypes.float64]
   grad0 = None
   if op.inputs[0].dtype in allowed_types:
-    # pylint: disable=protected-access
-    grad0 = gen_image_ops._resize_bicubic_grad(
+    grad0 = gen_image_ops.resize_bicubic_grad(
         grad, op.inputs[0], align_corners=op.get_attr("align_corners"))
-    # pylint: enable=protected-access
   return [grad0, None]
 
 
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index ae52d32fea1c872e588c4122f5e73198e4dfe9ad..343531ac5549dba1e85a81ae0df4e3505ceeb6a5 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -17,58 +17,6 @@
 """Image processing and decoding ops.
 
 See the @{$python/image} guide.
-
-@@decode_bmp
-@@decode_gif
-@@decode_jpeg
-@@decode_and_crop_jpeg
-@@encode_jpeg
-@@extract_jpeg_shape
-@@decode_png
-@@encode_png
-@@is_jpeg
-@@decode_image
-@@resize_images
-@@resize_area
-@@resize_bicubic
-@@resize_bilinear
-@@resize_nearest_neighbor
-@@resize_image_with_crop_or_pad
-@@central_crop
-@@pad_to_bounding_box
-@@crop_to_bounding_box
-@@extract_glimpse
-@@crop_and_resize
-@@flip_up_down
-@@random_flip_up_down
-@@flip_left_right
-@@random_flip_left_right
-@@transpose_image
-@@rot90
-
-@@rgb_to_grayscale
-@@grayscale_to_rgb
-@@hsv_to_rgb
-@@rgb_to_hsv
-@@rgb_to_yiq
-@@yiq_to_rgb
-@@rgb_to_yuv
-@@yuv_to_rgb
-@@convert_image_dtype
-@@adjust_brightness
-@@random_brightness
-@@adjust_contrast
-@@random_contrast
-@@adjust_hue
-@@random_hue
-@@adjust_gamma
-@@adjust_saturation
-@@random_saturation
-@@per_image_standardization
-@@draw_bounding_boxes
-@@non_max_suppression
-@@sample_distorted_bounding_box
-@@total_variation
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -86,13 +34,3 @@ from tensorflow.python.ops.image_ops_impl import *
 from tensorflow.python.ops.image_ops_impl import _Check3DImage
 from tensorflow.python.ops.image_ops_impl import _ImageDimensions
 # pylint: enable=unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    # ResizeMethod is not documented, but is documented in functions
-    # that use it.
-    'ResizeMethod',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 0c0e92d5b00b36f2fbd800afc046faa1fc77b95c..bd5b2ae83b5dd16e15dcf87de9714c082fa10ef9 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +31,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
@@ -166,6 +170,7 @@ def _Assert3DImage(image):
   return control_flow_ops.with_dependencies(
       _Check3DImage(image, require_static=False), image)
 
+
 def _AssertAtLeast3DImage(image):
   """Assert that we are working with a properly shaped image.
 
@@ -183,10 +188,11 @@ def _AssertAtLeast3DImage(image):
       If the shape of `image` could be verified statically, `image` is
       returned unchanged, otherwise there will be a control dependency
       added that asserts the correct dynamic shape.
-    """
+  """
   return control_flow_ops.with_dependencies(
       _CheckAtLeast3DImage(image, require_static=False), image)
 
+
 def _CheckAtLeast3DImage(image, require_static=True):
   """Assert that we are working with properly shaped image.
 
@@ -263,17 +269,7 @@ def random_flip_up_down(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
-    image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-    mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(
-        mirror_cond,
-        lambda: array_ops.reverse(image, [0]),
-        lambda: image,
-        name=scope)
-    return fix_image_flip_shape(image, result)
+  return _random_flip(image, 0, seed, 'random_flip_up_down')
 
 
 @tf_export('image.random_flip_left_right')
@@ -295,14 +291,34 @@ def random_flip_left_right(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
+  return _random_flip(image, 1, seed, 'random_flip_left_right')
+
+
+def _random_flip(image, flip_index, seed, scope_name):
+  """Randomly (50% chance) flip an image along axis `flip_index`.
+    Args:
+      image: A 3-D tensor of shape `[height, width, channels].`
+      flip_index: The dimension along which to flip the image.
+                  Vertical: 0, Horizontal: 1
+      seed: A Python integer. Used to create a random seed. See
+        @{tf.set_random_seed}
+        for behavior.
+      scope_name: Name of the scope in which the ops are added.
+
+    Returns:
+      A 3-D tensor of the same type and shape as `image`.
+
+    Raises:
+      ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(
         mirror_cond,
-        lambda: array_ops.reverse(image, [1]),
+        lambda: array_ops.reverse(image, [flip_index]),
         lambda: image,
         name=scope)
     return fix_image_flip_shape(image, result)
@@ -326,16 +342,7 @@ def flip_left_right(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_left_right', [image]) as scope:
-    image = ops.convert_to_tensor(image, name='image')
-    image = _AssertAtLeast3DImage(image)
-    shape = image.get_shape()
-    if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
-    elif shape.ndims == 4:
-      return array_ops.reverse(image, [2])
-    else:
-      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  return _flip(image, 1, 'flip_left_right')
 
 
 @tf_export('image.flip_up_down')
@@ -356,14 +363,35 @@ def flip_up_down(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_up_down', [image]) as scope:
+  return _flip(image, 0, 'flip_up_down')
+
+
+def _flip(image, flip_index, scope_name):
+  """Flip an image either horizontally or vertically.
+
+  Outputs the contents of `image` flipped along the dimension `flip_index`.
+
+  See also `reverse()`.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
+    flip_index: 0 For vertical, 1 for horizontal.
+
+  Returns:
+    A tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+      return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index]))
     elif shape.ndims == 4:
-      return array_ops.reverse(image, [1])
+      return array_ops.reverse(image, [flip_index+1])
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
@@ -412,23 +440,25 @@ def _rot90_3D(image, k, name_scope):
     A 3-D tensor of the same type and shape as `image`.
 
   """
+
   def _rot90():
-    return array_ops.transpose(array_ops.reverse_v2(image, [1]),
-                               [1, 0, 2])
+    return array_ops.transpose(array_ops.reverse_v2(image, [1]), [1, 0, 2])
+
   def _rot180():
     return array_ops.reverse_v2(image, [0, 1])
+
   def _rot270():
-    return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]),
-                                [1])
-  cases = [(math_ops.equal(k, 1), _rot90),
-           (math_ops.equal(k, 2), _rot180),
+    return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]), [1])
+
+  cases = [(math_ops.equal(k, 1), _rot90), (math_ops.equal(k, 2), _rot180),
            (math_ops.equal(k, 3), _rot270)]
 
-  result = control_flow_ops.case(cases, default=lambda: image, exclusive=True,
-                                 name=name_scope)
+  result = control_flow_ops.case(
+      cases, default=lambda: image, exclusive=True, name=name_scope)
   result.set_shape([None, None, image.get_shape()[2]])
   return result
 
+
 def _rot90_4D(images, k, name_scope):
   """Rotate batch of images counter-clockwise by 90 degrees `k` times.
 
@@ -442,21 +472,20 @@ def _rot90_4D(images, k, name_scope):
     A 4-D tensor of the same type and shape as `images`.
 
   """
+
   def _rot90():
-    return array_ops.transpose(array_ops.reverse_v2(images, [2]),
-                               [0, 2, 1, 3])
+    return array_ops.transpose(array_ops.reverse_v2(images, [2]), [0, 2, 1, 3])
+
   def _rot180():
     return array_ops.reverse_v2(images, [1, 2])
   def _rot270():
-    return array_ops.reverse_v2(array_ops.transpose(images, [0, 2, 1, 3]),
-                                [2])
+    return array_ops.reverse_v2(array_ops.transpose(images, [0, 2, 1, 3]), [2])
 
-  cases = [(math_ops.equal(k, 1), _rot90),
-           (math_ops.equal(k, 2), _rot180),
+  cases = [(math_ops.equal(k, 1), _rot90), (math_ops.equal(k, 2), _rot180),
            (math_ops.equal(k, 3), _rot270)]
 
-  result = control_flow_ops.case(cases, default=lambda: images, exclusive=True,
-                                 name=name_scope)
+  result = control_flow_ops.case(
+      cases, default=lambda: images, exclusive=True, name=name_scope)
   shape = result.get_shape()
   result.set_shape([shape[0], None, None, shape[3]])
   return result
@@ -480,7 +509,7 @@ def transpose_image(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'transpose_image', [image]) as scope:
+  with ops.name_scope(None, 'transpose_image', [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
@@ -623,7 +652,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
     padded.set_shape(padded_shape)
 
     if not is_batch:
-      padded = array_ops.squeeze(padded, squeeze_dims=[0])
+      padded = array_ops.squeeze(padded, axis=[0])
 
     return padded
 
@@ -703,7 +732,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
     cropped.set_shape(cropped_shape)
 
     if not is_batch:
-      cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
+      cropped = array_ops.squeeze(cropped, axis=[0])
 
     return cropped
 
@@ -820,7 +849,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     resized = control_flow_ops.with_dependencies(assert_ops, resized)
 
     if not is_batch:
-      resized = array_ops.squeeze(resized, squeeze_dims=[0])
+      resized = array_ops.squeeze(resized, axis=[0])
 
     return resized
 
@@ -913,7 +942,7 @@ def resize_images(images,
            for x in [new_width_const, width, new_height_const, height]) and (
                width == new_width_const and height == new_height_const):
       if not is_batch:
-        images = array_ops.squeeze(images, squeeze_dims=[0])
+        images = array_ops.squeeze(images, axis=[0])
       return images
 
     if method == ResizeMethod.BILINEAR:
@@ -936,7 +965,7 @@ def resize_images(images,
     images.set_shape([None, new_height_const, new_width_const, None])
 
     if not is_batch:
-      images = array_ops.squeeze(images, squeeze_dims=[0])
+      images = array_ops.squeeze(images, axis=[0])
     return images
 
 
@@ -1110,10 +1139,8 @@ def adjust_contrast(images, contrast_factor):
     orig_dtype = images.dtype
     flt_images = convert_image_dtype(images, dtypes.float32)
 
-    # pylint: disable=protected-access
-    adjusted = gen_image_ops._adjust_contrastv2(
+    adjusted = gen_image_ops.adjust_contrastv2(
         flt_images, contrast_factor=contrast_factor, name=name)
-    # pylint: enable=protected-access
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
 
@@ -1727,7 +1754,7 @@ def sample_distorted_bounding_box(image_size,
       Provide as input to `tf.image.draw_bounding_boxes`.
   """
   with ops.name_scope(name, 'sample_distorted_bounding_box'):
-    return gen_image_ops._sample_distorted_bounding_box_v2(  # pylint: disable=protected-access
+    return gen_image_ops.sample_distorted_bounding_box_v2(
         image_size,
         bounding_boxes,
         seed=seed,
@@ -1781,10 +1808,8 @@ def non_max_suppression(boxes,
   """
   with ops.name_scope(name, 'non_max_suppression'):
     iou_threshold = ops.convert_to_tensor(iou_threshold, name='iou_threshold')
-    # pylint: disable=protected-access
-    return gen_image_ops._non_max_suppression_v2(boxes, scores, max_output_size,
-                                                 iou_threshold)
-    # pylint: enable=protected-access
+    return gen_image_ops.non_max_suppression_v2(boxes, scores, max_output_size,
+                                                iou_threshold)
 
 
 _rgb_to_yiq_kernel = [[0.299, 0.59590059,
@@ -1792,6 +1817,7 @@ _rgb_to_yiq_kernel = [[0.299, 0.59590059,
                       [0.114, -0.32134392, 0.31119955]]
 
 
+@tf_export('image.rgb_to_yiq')
 def rgb_to_yiq(images):
   """Converts one or more images from RGB to YIQ.
 
@@ -1817,6 +1843,7 @@ _yiq_to_rgb_kernel = [[1, 1, 1], [0.95598634, -0.27201283, -1.10674021],
                       [0.6208248, -0.64720424, 1.70423049]]
 
 
+@tf_export('image.yiq_to_rgb')
 def yiq_to_rgb(images):
   """Converts one or more images from YIQ to RGB.
 
@@ -1844,6 +1871,7 @@ _rgb_to_yuv_kernel = [[0.299, -0.14714119,
                       [0.114, 0.43601035, -0.10001026]]
 
 
+@tf_export('image.rgb_to_yuv')
 def rgb_to_yuv(images):
   """Converts one or more images from RGB to YUV.
 
@@ -1869,6 +1897,7 @@ _yuv_to_rgb_kernel = [[1, 1, 1], [0, -0.394642334, 2.03206185],
                       [1.13988303, -0.58062185, 0]]
 
 
+@tf_export('image.yuv_to_rgb')
 def yuv_to_rgb(images):
   """Converts one or more images from YUV to RGB.
 
@@ -1889,3 +1918,489 @@ def yuv_to_rgb(images):
       _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
   ndims = images.get_shape().ndims
   return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+def _verify_compatible_image_shapes(img1, img2):
+  """Checks if two image tensors are compatible for applying SSIM or PSNR.
+
+  This function checks if two sets of images have ranks at least 3, and if the
+  last three dimensions match.
+
+  Args:
+    img1: Tensor containing the first image batch.
+    img2: Tensor containing the second image batch.
+
+  Returns:
+    A tuple containing: the first tensor shape, the second tensor shape, and a
+    list of control_flow_ops.Assert() ops implementing the checks.
+
+  Raises:
+    ValueError: When static shape check fails.
+  """
+  shape1 = img1.get_shape().with_rank_at_least(3)
+  shape2 = img2.get_shape().with_rank_at_least(3)
+  shape1[-3:].assert_is_compatible_with(shape2[-3:])
+
+  if shape1.ndims is not None and shape2.ndims is not None:
+    for dim1, dim2 in zip(reversed(shape1[:-3]), reversed(shape2[:-3])):
+      if not (dim1 == 1 or dim2 == 1 or dim1.is_compatible_with(dim2)):
+        raise ValueError(
+            'Two images are not compatible: %s and %s' % (shape1, shape2))
+
+  # Now assign shape tensors.
+  shape1, shape2 = array_ops.shape_n([img1, img2])
+
+  # TODO(sjhwang): Check if shape1[:-3] and shape2[:-3] are broadcastable.
+  checks = []
+  checks.append(control_flow_ops.Assert(
+      math_ops.greater_equal(array_ops.size(shape1), 3),
+      [shape1, shape2], summarize=10))
+  checks.append(control_flow_ops.Assert(
+      math_ops.reduce_all(math_ops.equal(shape1[-3:], shape2[-3:])),
+      [shape1, shape2], summarize=10))
+  return shape1, shape2, checks
+
+
+@tf_export('image.psnr')
+def psnr(a, b, max_val, name=None):
+  """Returns the Peak Signal-to-Noise Ratio between a and b.
+
+  This is intended to be used on signals (or images). Produces a PSNR value for
+  each image in batch.
+
+  The last three dimensions of input are expected to be [height, width, depth].
+
+  Example:
+
+  ```python
+      # Read images from file.
+      im1 = tf.decode_png('path/to/im1.png')
+      im2 = tf.decode_png('path/to/im2.png')
+      # Compute PSNR over tf.uint8 Tensors.
+      psnr1 = tf.image.psnr(im1, im2, max_val=255)
+
+      # Compute PSNR over tf.float32 Tensors.
+      im1 = tf.image.convert_image_dtype(im1, tf.float32)
+      im2 = tf.image.convert_image_dtype(im2, tf.float32)
+      psnr2 = tf.image.psnr(im1, im2, max_val=1.0)
+      # psnr1 and psnr2 both have type tf.float32 and are almost equal.
+  ```
+
+  Arguments:
+    a: First set of images.
+    b: Second set of images.
+    max_val: The dynamic range of the images (i.e., the difference between the
+      maximum the and minimum allowed values).
+    name: Namespace to embed the computation in.
+
+  Returns:
+    The scalar PSNR between a and b. The returned tensor has type `tf.float32`
+    and shape [batch_size, 1].
+  """
+  with ops.name_scope(name, 'PSNR', [a, b]):
+    # Need to convert the images to float32.  Scale max_val accordingly so that
+    # PSNR is computed correctly.
+    max_val = math_ops.cast(max_val, a.dtype)
+    max_val = convert_image_dtype(max_val, dtypes.float32)
+    a = convert_image_dtype(a, dtypes.float32)
+    b = convert_image_dtype(b, dtypes.float32)
+    mse = math_ops.reduce_mean(math_ops.squared_difference(a, b), [-3, -2, -1])
+    psnr_val = math_ops.subtract(
+        20 * math_ops.log(max_val) / math_ops.log(10.0),
+        np.float32(10 / np.log(10)) * math_ops.log(mse),
+        name='psnr')
+
+    _, _, checks = _verify_compatible_image_shapes(a, b)
+    with ops.control_dependencies(checks):
+      return array_ops.identity(psnr_val)
+
+_SSIM_K1 = 0.01
+_SSIM_K2 = 0.03
+
+
+def _ssim_helper(x, y, reducer, max_val, compensation=1.0):
+  r"""Helper function for computing SSIM.
+
+  SSIM estimates covariances with weighted sums.  The default parameters
+  use a biased estimate of the covariance:
+  Suppose `reducer` is a weighted sum, then the mean estimators are
+    \mu_x = \sum_i w_i x_i,
+    \mu_y = \sum_i w_i y_i,
+  where w_i's are the weighted-sum weights, and covariance estimator is
+    cov_{xy} = \sum_i w_i (x_i - \mu_x) (y_i - \mu_y)
+  with assumption \sum_i w_i = 1. This covariance estimator is biased, since
+    E[cov_{xy}] = (1 - \sum_i w_i ^ 2) Cov(X, Y).
+  For SSIM measure with unbiased covariance estimators, pass as `compensation`
+  argument (1 - \sum_i w_i ^ 2).
+
+  Arguments:
+    x: First set of images.
+    y: Second set of images.
+    reducer: Function that computes 'local' averages from set of images.
+      For non-covolutional version, this is usually tf.reduce_mean(x, [1, 2]),
+      and for convolutional version, this is usually tf.nn.avg_pool or
+      tf.nn.conv2d with weighted-sum kernel.
+    max_val: The dynamic range (i.e., the difference between the maximum
+      possible allowed value and the minimum allowed value).
+    compensation: Compensation factor. See above.
+
+  Returns:
+    A pair containing the luminance measure, and the contrast-structure measure.
+  """
+  c1 = (_SSIM_K1 * max_val) ** 2
+  c2 = (_SSIM_K2 * max_val) ** 2
+
+  # SSIM luminance measure is
+  # (2 * mu_x * mu_y + c1) / (mu_x ** 2 + mu_y ** 2 + c1).
+  mean0 = reducer(x)
+  mean1 = reducer(y)
+  num0 = mean0 * mean1 * 2.0
+  den0 = math_ops.square(mean0) + math_ops.square(mean1)
+  luminance = (num0 + c1) / (den0 + c1)
+
+  # SSIM contrast-structure measure is
+  #   (2 * cov_{xy} + c2) / (cov_{xx} + cov_{yy} + c2).
+  # Note that `reducer` is a weighted sum with weight w_k, \sum_i w_i = 1, then
+  #   cov_{xy} = \sum_i w_i (x_i - \mu_x) (y_i - \mu_y)
+  #          = \sum_i w_i x_i y_i - (\sum_i w_i x_i) (\sum_j w_j y_j).
+  num1 = reducer(x * y) * 2.0
+  den1 = reducer(math_ops.square(x) + math_ops.square(y))
+  c2 *= compensation
+  cs = (num1 - num0 + c2) / (den1 - den0 + c2)
+
+  # SSIM score is the product of the luminance and contrast-structure measures.
+  return luminance, cs
+
+
+def _fspecial_gauss(size, sigma):
+  """Function to mimic the 'fspecial' gaussian MATLAB function."""
+  size = ops.convert_to_tensor(size, dtypes.int32)
+  sigma = ops.convert_to_tensor(sigma)
+
+  coords = math_ops.cast(math_ops.range(size), sigma.dtype)
+  coords -= math_ops.cast(size - 1, sigma.dtype) / 2.0
+
+  g = math_ops.square(coords)
+  g *= -0.5 / math_ops.square(sigma)
+
+  g = array_ops.reshape(g, shape=[1, -1]) + array_ops.reshape(g, shape=[-1, 1])
+  g = array_ops.reshape(g, shape=[1, -1])  # For tf.nn.softmax().
+  g = nn_ops.softmax(g)
+  return array_ops.reshape(g, shape=[size, size, 1, 1])
+
+
+def _ssim_per_channel(img1, img2, max_val=1.0):
+  """Computes SSIM index between img1 and img2 per color channel.
+
+  This function matches the standard SSIM implementation from:
+  Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image
+  quality assessment: from error visibility to structural similarity. IEEE
+  transactions on image processing.
+
+  Details:
+    - 11x11 Gaussian filter of width 1.5 is used.
+    - k1 = 0.01, k2 = 0.03 as in the original paper.
+
+  Args:
+    img1: First image batch.
+    img2: Second image batch.
+    max_val: The dynamic range of the images (i.e., the difference between the
+      maximum the and minimum allowed values).
+
+  Returns:
+    A pair of tensors containing and channel-wise SSIM and contrast-structure
+    values. The shape is [..., channels].
+  """
+  filter_size = constant_op.constant(11, dtype=dtypes.int32)
+  filter_sigma = constant_op.constant(1.5, dtype=img1.dtype)
+
+  shape1, shape2 = array_ops.shape_n([img1, img2])
+  checks = [
+      control_flow_ops.Assert(math_ops.reduce_all(math_ops.greater_equal(
+          shape1[-3:-1], filter_size)), [shape1, filter_size], summarize=8),
+      control_flow_ops.Assert(math_ops.reduce_all(math_ops.greater_equal(
+          shape2[-3:-1], filter_size)), [shape2, filter_size], summarize=8)]
+
+  # Enforce the check to run before computation.
+  with ops.control_dependencies(checks):
+    img1 = array_ops.identity(img1)
+
+  # TODO(sjhwang): Try to cache kernels and compensation factor.
+  kernel = _fspecial_gauss(filter_size, filter_sigma)
+  kernel = array_ops.tile(kernel, multiples=[1, 1, shape1[-1], 1])
+
+  # The correct compensation factor is `1.0 - tf.reduce_sum(tf.square(kernel))`,
+  # but to match MATLAB implementation of MS-SSIM, we use 1.0 instead.
+  compensation = 1.0
+
+  # TODO(sjhwang): Try FFT.
+  # TODO(sjhwang): Gaussian kernel is separable in space. Consider applying
+  #   1-by-n and n-by-1 Gaussain filters instead of an n-by-n filter.
+  def reducer(x):
+    shape = array_ops.shape(x)
+    x = array_ops.reshape(x, shape=array_ops.concat([[-1], shape[-3:]], 0))
+    y = nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1], padding='VALID')
+    return array_ops.reshape(y, array_ops.concat([shape[:-3],
+                                                  array_ops.shape(y)[1:]], 0))
+
+  luminance, cs = _ssim_helper(img1, img2, reducer, max_val, compensation)
+
+  # Average over the second and the third from the last: height, width.
+  axes = constant_op.constant([-3, -2], dtype=dtypes.int32)
+  ssim_val = math_ops.reduce_mean(luminance * cs, axes)
+  cs = math_ops.reduce_mean(cs, axes)
+  return ssim_val, cs
+
+
+@tf_export('image.ssim')
+def ssim(img1, img2, max_val):
+  """Computes SSIM index between img1 and img2.
+
+  This function is based on the standard SSIM implementation from:
+  Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image
+  quality assessment: from error visibility to structural similarity. IEEE
+  transactions on image processing.
+
+  Note: The true SSIM is only defined on grayscale.  This function does not
+  perform any colorspace transform.  (If input is already YUV, then it will
+  compute YUV SSIM average.)
+
+  Details:
+    - 11x11 Gaussian filter of width 1.5 is used.
+    - k1 = 0.01, k2 = 0.03 as in the original paper.
+
+  The image sizes must be at least 11x11 because of the filter size.
+
+  Example:
+
+  ```python
+      # Read images from file.
+      im1 = tf.decode_png('path/to/im1.png')
+      im2 = tf.decode_png('path/to/im2.png')
+      # Compute SSIM over tf.uint8 Tensors.
+      ssim1 = tf.image.ssim(im1, im2, max_val=255)
+
+      # Compute SSIM over tf.float32 Tensors.
+      im1 = tf.image.convert_image_dtype(im1, tf.float32)
+      im2 = tf.image.convert_image_dtype(im2, tf.float32)
+      ssim2 = tf.image.ssim(im1, im2, max_val=1.0)
+      # ssim1 and ssim2 both have type tf.float32 and are almost equal.
+  ```
+
+  Args:
+    img1: First image batch.
+    img2: Second image batch.
+    max_val: The dynamic range of the images (i.e., the difference between the
+      maximum the and minimum allowed values).
+
+  Returns:
+    A tensor containing an SSIM value for each image in batch.  Returned SSIM
+    values are in range (-1, 1], when pixel values are non-negative. Returns
+    a tensor with shape: broadcast(img1.shape[:-3], img2.shape[:-3]).
+  """
+  _, _, checks = _verify_compatible_image_shapes(img1, img2)
+  with ops.control_dependencies(checks):
+    img1 = array_ops.identity(img1)
+
+  # Need to convert the images to float32.  Scale max_val accordingly so that
+  # SSIM is computed correctly.
+  max_val = math_ops.cast(max_val, img1.dtype)
+  max_val = convert_image_dtype(max_val, dtypes.float32)
+  img1 = convert_image_dtype(img1, dtypes.float32)
+  img2 = convert_image_dtype(img2, dtypes.float32)
+  ssim_per_channel, _ = _ssim_per_channel(img1, img2, max_val)
+  # Compute average over color channels.
+  return math_ops.reduce_mean(ssim_per_channel, [-1])
+
+
+# Default values obtained by Wang et al.
+_MSSSIM_WEIGHTS = (0.0448, 0.2856, 0.3001, 0.2363, 0.1333)
+
+
+@tf_export('image.ssim_multiscale')
+def ssim_multiscale(img1, img2, max_val, power_factors=_MSSSIM_WEIGHTS):
+  """Computes the MS-SSIM between img1 and img2.
+
+  This function assumes that `img1` and `img2` are image batches, i.e. the last
+  three dimensions are [height, width, channels].
+
+  Note: The true SSIM is only defined on grayscale.  This function does not
+  perform any colorspace transform.  (If input is already YUV, then it will
+  compute YUV SSIM average.)
+
+  Original paper: Wang, Zhou, Eero P. Simoncelli, and Alan C. Bovik. "Multiscale
+  structural similarity for image quality assessment." Signals, Systems and
+  Computers, 2004.
+
+  Arguments:
+    img1: First image batch.
+    img2: Second image batch. Must have the same rank as img1.
+    max_val: The dynamic range of the images (i.e., the difference between the
+      maximum the and minimum allowed values).
+    power_factors: Iterable of weights for each of the scales. The number of
+      scales used is the length of the list. Index 0 is the unscaled
+      resolution's weight and each increasing scale corresponds to the image
+      being downsampled by 2.  Defaults to (0.0448, 0.2856, 0.3001, 0.2363,
+      0.1333), which are the values obtained in the original paper.
+
+  Returns:
+    A tensor containing an MS-SSIM value for each image in batch.  The values
+    are in range [0, 1].  Returns a tensor with shape:
+    broadcast(img1.shape[:-3], img2.shape[:-3]).
+  """
+  # Shape checking.
+  shape1 = img1.get_shape().with_rank_at_least(3)
+  shape2 = img2.get_shape().with_rank_at_least(3)
+  shape1[-3:].merge_with(shape2[-3:])
+
+  with ops.name_scope(None, 'MS-SSIM', [img1, img2]):
+    shape1, shape2, checks = _verify_compatible_image_shapes(img1, img2)
+    with ops.control_dependencies(checks):
+      img1 = array_ops.identity(img1)
+
+    # Need to convert the images to float32.  Scale max_val accordingly so that
+    # SSIM is computed correctly.
+    max_val = math_ops.cast(max_val, img1.dtype)
+    max_val = convert_image_dtype(max_val, dtypes.float32)
+    img1 = convert_image_dtype(img1, dtypes.float32)
+    img2 = convert_image_dtype(img2, dtypes.float32)
+
+    imgs = [img1, img2]
+    shapes = [shape1, shape2]
+
+    # img1 and img2 are assumed to be a (multi-dimensional) batch of
+    # 3-dimensional images (height, width, channels). `heads` contain the batch
+    # dimensions, and `tails` contain the image dimensions.
+    heads = [s[:-3] for s in shapes]
+    tails = [s[-3:] for s in shapes]
+
+    divisor = [1, 2, 2, 1]
+    divisor_tensor = constant_op.constant(divisor[1:], dtype=dtypes.int32)
+
+    def do_pad(images, remainder):
+      padding = array_ops.expand_dims(remainder, -1)
+      padding = array_ops.pad(padding, [[1, 0], [1, 0]])
+      return [array_ops.pad(x, padding, mode='SYMMETRIC') for x in images]
+
+    mcs = []
+    for k in range(len(power_factors)):
+      with ops.name_scope(None, 'Scale%d' % k, imgs):
+        if k > 0:
+          # Avg pool takes rank 4 tensors. Flatten leading dimensions.
+          flat_imgs = [
+              array_ops.reshape(x, array_ops.concat([[-1], t], 0))
+              for x, t in zip(imgs, tails)
+          ]
+
+          remainder = tails[0] % divisor_tensor
+          need_padding = math_ops.reduce_any(math_ops.not_equal(remainder, 0))
+          # pylint: disable=cell-var-from-loop
+          padded = control_flow_ops.cond(need_padding,
+                                         lambda: do_pad(flat_imgs, remainder),
+                                         lambda: flat_imgs)
+          # pylint: enable=cell-var-from-loop
+
+          downscaled = [nn_ops.avg_pool(x, ksize=divisor, strides=divisor,
+                                        padding='VALID')
+                        for x in padded]
+          tails = [x[1:] for x in array_ops.shape_n(downscaled)]
+          imgs = [
+              array_ops.reshape(x, array_ops.concat([h, t], 0))
+              for x, h, t in zip(downscaled, heads, tails)
+          ]
+
+        # Overwrite previous ssim value since we only need the last one.
+        ssim_per_channel, cs = _ssim_per_channel(*imgs, max_val=max_val)
+        mcs.append(nn_ops.relu(cs))
+
+    # Remove the cs score for the last scale. In the MS-SSIM calculation,
+    # we use the l(p) at the highest scale. l(p) * cs(p) is ssim(p).
+    mcs.pop()  # Remove the cs score for the last scale.
+    mcs_and_ssim = array_ops.stack(mcs + [nn_ops.relu(ssim_per_channel)],
+                                   axis=-1)
+    # Take weighted geometric mean across the scale axis.
+    ms_ssim = math_ops.reduce_prod(math_ops.pow(mcs_and_ssim, power_factors),
+                                   [-1])
+
+    return math_ops.reduce_mean(ms_ssim, [-1])  # Avg over color channels.
+
+
+@tf_export('image.image_gradients')
+def image_gradients(image):
+  """Returns image gradients (dy, dx) for each color channel.
+
+  Both output tensors have the same shape as the input: [batch_size, h, w,
+  d]. The gradient values are organized so that [I(x+1, y) - I(x, y)] is in
+  location (x, y). That means that dy will always have zeros in the last row,
+  and dx will always have zeros in the last column.
+
+  Arguments:
+    image: Tensor with shape [batch_size, h, w, d].
+
+  Returns:
+    Pair of tensors (dy, dx) holding the vertical and horizontal image
+    gradients (1-step finite difference).
+
+  Raises:
+    ValueError: If `image` is not a 4D tensor.
+  """
+  if image.get_shape().ndims != 4:
+    raise ValueError('image_gradients expects a 4D tensor '
+                     '[batch_size, h, w, d], not %s.', image.get_shape())
+  image_shape = array_ops.shape(image)
+  batch_size, height, width, depth = array_ops.unstack(image_shape)
+  dy = image[:, 1:, :, :] - image[:, :-1, :, :]
+  dx = image[:, :, 1:, :] - image[:, :, :-1, :]
+
+  # Return tensors with same size as original image by concatenating
+  # zeros. Place the gradient [I(x+1,y) - I(x,y)] on the base pixel (x, y).
+  shape = array_ops.stack([batch_size, 1, width, depth])
+  dy = array_ops.concat([dy, array_ops.zeros(shape, image.dtype)], 1)
+  dy = array_ops.reshape(dy, image_shape)
+
+  shape = array_ops.stack([batch_size, height, 1, depth])
+  dx = array_ops.concat([dx, array_ops.zeros(shape, image.dtype)], 2)
+  dx = array_ops.reshape(dx, image_shape)
+
+  return dy, dx
+
+
+@tf_export('image.sobel_edges')
+def sobel_edges(image):
+  """Returns a tensor holding Sobel edge maps.
+
+  Arguments:
+    image: Image tensor with shape [batch_size, h, w, d] and type float32 or
+    float64.  The image(s) must be 2x2 or larger.
+
+  Returns:
+    Tensor holding edge maps for each channel. Returns a tensor with shape
+    [batch_size, h, w, d, 2] where the last two dimensions hold [[dy[0], dx[0]],
+    [dy[1], dx[1]], ..., [dy[d-1], dx[d-1]]] calculated using the Sobel filter.
+  """
+  # Define vertical and horizontal Sobel filters.
+  static_image_shape = image.get_shape()
+  image_shape = array_ops.shape(image)
+  kernels = [[[-1, -2, -1], [0, 0, 0], [1, 2, 1]],
+             [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]]
+  num_kernels = len(kernels)
+  kernels = np.transpose(np.asarray(kernels), (1, 2, 0))
+  kernels = np.expand_dims(kernels, -2)
+  kernels_tf = constant_op.constant(kernels, dtype=image.dtype)
+
+  kernels_tf = array_ops.tile(kernels_tf, [1, 1, image_shape[-1], 1],
+                              name='sobel_filters')
+
+  # Use depth-wise convolution to calculate edge maps per channel.
+  pad_sizes = [[0, 0], [1, 1], [1, 1], [0, 0]]
+  padded = array_ops.pad(image, pad_sizes, mode='REFLECT')
+
+  # Output tensor has shape [batch_size, h, w, d * num_kernels].
+  strides = [1, 1, 1, 1]
+  output = nn.depthwise_conv2d(padded, kernels_tf, strides, 'VALID')
+
+  # Reshape to [batch_size, h, w, d, num_kernels].
+  shape = array_ops.concat([image_shape, [num_kernels]], 0)
+  output = array_ops.reshape(output, shape=shape)
+  output.set_shape(static_image_shape.concatenate([num_kernels]))
+  return output
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index dc3b581b223edca5f119d726b9fb73e760807818..c437c12c2744792eaee197bf7d2a5f2b75d280bf 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import colorsys
 import functools
+import itertools
 import math
 import os
 import time
@@ -37,7 +38,9 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_image_ops
+from tensorflow.python.ops import gradients
 from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import image_ops_impl
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -943,8 +946,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionLeftRightWithBatch(self):
-    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    x_np = np.array(
+        [[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
@@ -963,10 +967,12 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(y_tf, y_np)
 
   def testLeftRightWithBatch(self):
-    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
-    y_np = np.array([[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    x_np = np.array(
+        [[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array(
+        [[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -974,7 +980,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
-
   def testRandomFlipLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1013,8 +1018,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionUpDownWithBatch(self):
-    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    x_np = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -1034,10 +1040,12 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(y_tf, y_np)
 
   def testUpDownWithBatch(self):
-    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
-    y_np = np.array([[[4, 5, 6], [1, 2, 3]], [[10, 11, 12], [7, 8, 9]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    x_np = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array(
+        [[[4, 5, 6], [1, 2, 3]], [[10, 11, 12], [7, 8, 9]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -1081,8 +1089,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionTransposeWithBatch(self):
-    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    x_np = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -1102,11 +1111,13 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(y_tf, y_np)
 
   def testTransposeWithBatch(self):
-    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
-                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    x_np = np.array(
+        [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+        dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    y_np = np.array([[[1, 4], [2, 5], [3, 6]], [[7, 10], [8, 11], [9, 12]]],
-                    dtype=np.uint8).reshape([2, 3, 2, 1])
+    y_np = np.array(
+        [[[1, 4], [2, 5], [3, 6]], [[7, 10], [8, 11], [9, 12]]],
+        dtype=np.uint8).reshape([2, 3, 2, 1])
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -1121,8 +1132,8 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     p_unknown_dims_4 = array_ops.placeholder(
         dtypes.uint8, shape=[None, None, None, None])
     p_unknown_width = array_ops.placeholder(dtypes.uint8, shape=[64, None, 3])
-    p_unknown_batch = array_ops.placeholder(dtypes.uint8,
-                                            shape=[None, 64, 64, 3])
+    p_unknown_batch = array_ops.placeholder(
+        dtypes.uint8, shape=[None, 64, 64, 3])
     p_wrong_rank = array_ops.placeholder(dtypes.uint8, shape=[None, None])
     p_zero_dim = array_ops.placeholder(dtypes.uint8, shape=[64, 0, 3])
 
@@ -1156,7 +1167,8 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         op(p_wrong_rank)
 
     for op in [
-        image_ops.random_flip_left_right, image_ops.random_flip_up_down,
+        image_ops.random_flip_left_right,
+        image_ops.random_flip_up_down,
     ]:
       with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
         op(p_wrong_rank)
@@ -3291,12 +3303,11 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
 
     # The boxes is of shape [num_boxes, 4], and the scores is
     # of shape [num_boxes]. So an error will thrown.
-    with self.assertRaisesRegexp(
-        ValueError, 'Dimensions must be equal, but are 1 and 2'):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Dimensions must be equal, but are 1 and 2"):
       boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
       scores = constant_op.constant([0.9, 0.75])
-      selected_indices = image_ops.non_max_suppression(
-          boxes, scores, 3, 0.5)
+      selected_indices = image_ops.non_max_suppression(boxes, scores, 3, 0.5)
 
     # The scores should be 1D of shape [num_boxes].
     with self.assertRaisesRegexp(ValueError,
@@ -3320,5 +3331,420 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
       image_ops.non_max_suppression(boxes, scores, 3, [[0.5]])
 
 
+class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
+  """Tests utility function used by ssim() and psnr()."""
+
+  def testWrongDims(self):
+    img = array_ops.placeholder(dtype=dtypes.float32)
+    img_np = np.array((2, 2))
+
+    with self.test_session(use_gpu=True) as sess:
+      _, _, checks = image_ops_impl._verify_compatible_image_shapes(img, img)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(checks, {img: img_np})
+
+  def testShapeMismatch(self):
+    img1 = array_ops.placeholder(dtype=dtypes.float32)
+    img2 = array_ops.placeholder(dtype=dtypes.float32)
+
+    img1_np = np.array([1, 2, 2, 1])
+    img2_np = np.array([1, 3, 3, 1])
+
+    with self.test_session(use_gpu=True) as sess:
+      _, _, checks = image_ops_impl._verify_compatible_image_shapes(img1, img2)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(checks, {img1: img1_np, img2: img2_np})
+
+
+class PSNRTest(test_util.TensorFlowTestCase):
+  """Tests for PSNR."""
+
+  def _LoadTestImage(self, sess, filename):
+    content = io_ops.read_file(os.path.join(
+        "tensorflow/core/lib/psnr/testdata", filename))
+    im = image_ops.decode_jpeg(content, dct_method="INTEGER_ACCURATE")
+    im = image_ops.convert_image_dtype(im, dtypes.float32)
+    im, = sess.run([im])
+    return np.expand_dims(im, axis=0)
+
+  def _LoadTestImages(self):
+    with self.test_session(use_gpu=True) as sess:
+      q20 = self._LoadTestImage(sess, "cat_q20.jpg")
+      q72 = self._LoadTestImage(sess, "cat_q72.jpg")
+      q95 = self._LoadTestImage(sess, "cat_q95.jpg")
+      return q20, q72, q95
+
+  def _PSNR_NumPy(self, orig, target, max_value):
+    """Numpy implementation of PSNR."""
+    mse = ((orig - target) ** 2).mean(axis=(-3, -2, -1))
+    return 20 * np.log10(max_value) - 10 * np.log10(mse)
+
+  def _RandomImage(self, shape, max_val):
+    """Returns an image or image batch with given shape."""
+    return np.random.rand(*shape).astype(np.float32) * max_val
+
+  def testPSNRSingleImage(self):
+    image1 = self._RandomImage((8, 8, 1), 1)
+    image2 = self._RandomImage((8, 8, 1), 1)
+    psnr = self._PSNR_NumPy(image1, image2, 1)
+
+    with self.test_session(use_gpu=True):
+      tf_image1 = constant_op.constant(image1, shape=image1.shape,
+                                       dtype=dtypes.float32)
+      tf_image2 = constant_op.constant(image2, shape=image2.shape,
+                                       dtype=dtypes.float32)
+      tf_psnr = image_ops.psnr(tf_image1, tf_image2, 1.0, "psnr").eval()
+      self.assertAllClose(psnr, tf_psnr, atol=0.001)
+
+  def testPSNRMultiImage(self):
+    image1 = self._RandomImage((10, 8, 8, 1), 1)
+    image2 = self._RandomImage((10, 8, 8, 1), 1)
+    psnr = self._PSNR_NumPy(image1, image2, 1)
+
+    with self.test_session(use_gpu=True):
+      tf_image1 = constant_op.constant(image1, shape=image1.shape,
+                                       dtype=dtypes.float32)
+      tf_image2 = constant_op.constant(image2, shape=image2.shape,
+                                       dtype=dtypes.float32)
+      tf_psnr = image_ops.psnr(tf_image1, tf_image2, 1, "psnr").eval()
+      self.assertAllClose(psnr, tf_psnr, atol=0.001)
+
+  def testGoldenPSNR(self):
+    q20, q72, q95 = self._LoadTestImages()
+
+    # Verify NumPy implementation first.
+    # Golden values are generated using GNU Octave's psnr() function.
+    psnr1 = self._PSNR_NumPy(q20, q72, 1)
+    self.assertNear(30.321, psnr1, 0.001, msg="q20.dtype=" + str(q20.dtype))
+    psnr2 = self._PSNR_NumPy(q20, q95, 1)
+    self.assertNear(29.994, psnr2, 0.001)
+    psnr3 = self._PSNR_NumPy(q72, q95, 1)
+    self.assertNear(35.302, psnr3, 0.001)
+
+    # Test TensorFlow implementation.
+    with self.test_session(use_gpu=True):
+      tf_q20 = constant_op.constant(q20, shape=q20.shape, dtype=dtypes.float32)
+      tf_q72 = constant_op.constant(q72, shape=q72.shape, dtype=dtypes.float32)
+      tf_q95 = constant_op.constant(q95, shape=q95.shape, dtype=dtypes.float32)
+      tf_psnr1 = image_ops.psnr(tf_q20, tf_q72, 1, "psnr1").eval()
+      tf_psnr2 = image_ops.psnr(tf_q20, tf_q95, 1, "psnr2").eval()
+      tf_psnr3 = image_ops.psnr(tf_q72, tf_q95, 1, "psnr3").eval()
+      self.assertAllClose(psnr1, tf_psnr1, atol=0.001)
+      self.assertAllClose(psnr2, tf_psnr2, atol=0.001)
+      self.assertAllClose(psnr3, tf_psnr3, atol=0.001)
+
+  def testInfinity(self):
+    q20, _, _ = self._LoadTestImages()
+    psnr = self._PSNR_NumPy(q20, q20, 1)
+    with self.test_session(use_gpu=True):
+      tf_q20 = constant_op.constant(q20, shape=q20.shape, dtype=dtypes.float32)
+      tf_psnr = image_ops.psnr(tf_q20, tf_q20, 1, "psnr").eval()
+      self.assertAllClose(psnr, tf_psnr, atol=0.001)
+
+  def testInt(self):
+    img1 = self._RandomImage((10, 8, 8, 1), 255)
+    img2 = self._RandomImage((10, 8, 8, 1), 255)
+    img1 = constant_op.constant(img1, dtypes.uint8)
+    img2 = constant_op.constant(img2, dtypes.uint8)
+    psnr_uint8 = image_ops.psnr(img1, img2, 255)
+    img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
+    img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
+    psnr_float32 = image_ops.psnr(img1, img2, 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(psnr_uint8.eval(), psnr_float32.eval(), atol=0.001)
+
+
+class SSIMTest(test_util.TensorFlowTestCase):
+  """Tests for SSIM."""
+
+  _filenames = ["checkerboard1.png",
+                "checkerboard2.png",
+                "checkerboard3.png",]
+
+  _ssim = np.asarray([[1.000000, 0.230880, 0.231153],
+                      [0.230880, 1.000000, 0.996828],
+                      [0.231153, 0.996828, 1.000000]])
+
+  def _LoadTestImage(self, sess, filename):
+    content = io_ops.read_file(os.path.join(
+        "tensorflow/core/lib/ssim/testdata", filename))
+    im = image_ops.decode_png(content)
+    im = image_ops.convert_image_dtype(im, dtypes.float32)
+    im, = sess.run([im])
+    return np.expand_dims(im, axis=0)
+
+  def _LoadTestImages(self):
+    with self.test_session(use_gpu=True) as sess:
+      return [self._LoadTestImage(sess, f) for f in self._filenames]
+
+  def _RandomImage(self, shape, max_val):
+    """Returns an image or image batch with given shape."""
+    return np.random.rand(*shape).astype(np.float32) * max_val
+
+  def testAgainstMatlab(self):
+    """Tests against values produced by Matlab."""
+    img = self._LoadTestImages()
+    expected = self._ssim[np.triu_indices(3)]
+
+    ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
+    ssim = image_ops.ssim(*ph, max_val=1.0)
+    with self.test_session(use_gpu=True):
+      scores = [ssim.eval(dict(zip(ph, t)))
+                for t in itertools.combinations_with_replacement(img, 2)]
+    self.assertAllClose(expected, np.squeeze(scores), atol=1e-4)
+
+  def testBatch(self):
+    img = self._LoadTestImages()
+    expected = self._ssim[np.triu_indices(3, k=1)]
+
+    img1, img2 = zip(*itertools.combinations(img, 2))
+    img1 = np.concatenate(img1)
+    img2 = np.concatenate(img2)
+
+    ssim = image_ops.ssim(constant_op.constant(img1),
+                          constant_op.constant(img2), 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+
+  def testBroadcast(self):
+    img = self._LoadTestImages()[:2]
+    expected = self._ssim[:2, :2]
+
+    img = constant_op.constant(np.concatenate(img))
+    img1 = array_ops.expand_dims(img, axis=0)  # batch dims: 1, 2.
+    img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
+
+    ssim = image_ops.ssim(img1, img2, 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+
+  def testNegative(self):
+    """Tests against negative SSIM index."""
+    step = np.expand_dims(np.arange(0, 256, 16, dtype=np.uint8), axis=0)
+    img1 = np.tile(step, (16, 1))
+    img2 = np.fliplr(img1)
+
+    img1 = img1.reshape((1, 16, 16, 1))
+    img2 = img2.reshape((1, 16, 16, 1))
+
+    ssim = image_ops.ssim(constant_op.constant(img1),
+                          constant_op.constant(img2), 255)
+    with self.test_session(use_gpu=True):
+      self.assertLess(ssim.eval(), 0)
+
+  def testInt(self):
+    img1 = self._RandomImage((1, 16, 16, 3), 255)
+    img2 = self._RandomImage((1, 16, 16, 3), 255)
+    img1 = constant_op.constant(img1, dtypes.uint8)
+    img2 = constant_op.constant(img2, dtypes.uint8)
+    ssim_uint8 = image_ops.ssim(img1, img2, 255)
+    img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
+    img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
+    ssim_float32 = image_ops.ssim(img1, img2, 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+
+
+class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
+  """Tests for MS-SSIM."""
+
+  _filenames = ["checkerboard1.png",
+                "checkerboard2.png",
+                "checkerboard3.png",]
+
+  _msssim = np.asarray([[1.000000, 0.091016, 0.091025],
+                        [0.091016, 1.000000, 0.999567],
+                        [0.091025, 0.999567, 1.000000]])
+
+  def _LoadTestImage(self, sess, filename):
+    content = io_ops.read_file(os.path.join(
+        "tensorflow/core/lib/ssim/testdata", filename))
+    im = image_ops.decode_png(content)
+    im = image_ops.convert_image_dtype(im, dtypes.float32)
+    im, = sess.run([im])
+    return np.expand_dims(im, axis=0)
+
+  def _LoadTestImages(self):
+    with self.test_session(use_gpu=True) as sess:
+      return [self._LoadTestImage(sess, f) for f in self._filenames]
+
+  def _RandomImage(self, shape, max_val):
+    """Returns an image or image batch with given shape."""
+    return np.random.rand(*shape).astype(np.float32) * max_val
+
+  def testAgainstMatlab(self):
+    """Tests against MS-SSIM computed with Matlab implementation.
+
+    For color images, MS-SSIM scores are averaged over color channels.
+    """
+    img = self._LoadTestImages()
+    expected = self._msssim[np.triu_indices(3)]
+
+    ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
+    msssim = image_ops.ssim_multiscale(*ph, max_val=1.0)
+    with self.test_session(use_gpu=True):
+      scores = [msssim.eval(dict(zip(ph, t)))
+                for t in itertools.combinations_with_replacement(img, 2)]
+
+    self.assertAllClose(expected, np.squeeze(scores), atol=1e-4)
+
+  def testUnweightedIsDifferentiable(self):
+    img = self._LoadTestImages()
+    ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
+    scalar = constant_op.constant(1.0, dtype=dtypes.float32)
+    scaled_ph = [x * scalar for x in ph]
+    msssim = image_ops.ssim_multiscale(*scaled_ph, max_val=1.0,
+                                       power_factors=(1, 1, 1, 1, 1))
+    grads = gradients.gradients(msssim, scalar)
+    with self.test_session(use_gpu=True) as sess:
+      np_grads = sess.run(grads, feed_dict={ph[0]: img[0], ph[1]: img[1]})
+    self.assertTrue(np.isfinite(np_grads).all())
+
+  def testBatch(self):
+    """Tests MS-SSIM computed in batch."""
+    img = self._LoadTestImages()
+    expected = self._msssim[np.triu_indices(3, k=1)]
+
+    img1, img2 = zip(*itertools.combinations(img, 2))
+    img1 = np.concatenate(img1)
+    img2 = np.concatenate(img2)
+
+    msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
+                                       constant_op.constant(img2), 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(expected, msssim.eval(), 1e-4)
+
+  def testBroadcast(self):
+    """Tests MS-SSIM broadcasting."""
+    img = self._LoadTestImages()[:2]
+    expected = self._msssim[:2, :2]
+
+    img = constant_op.constant(np.concatenate(img))
+    img1 = array_ops.expand_dims(img, axis=0)  # batch dims: 1, 2.
+    img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
+
+    score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(expected, score_tensor.eval(), 1e-4)
+
+  def testRange(self):
+    """Tests against low MS-SSIM score.
+
+    MS-SSIM is a geometric mean of SSIM and CS scores of various scales.
+    If any of the value is negative so that the geometric mean is not
+    well-defined, then treat the MS-SSIM score as zero.
+    """
+    with self.test_session(use_gpu=True) as sess:
+      img1 = self._LoadTestImage(sess, "checkerboard1.png")
+      img2 = self._LoadTestImage(sess, "checkerboard3.png")
+      images = [img1, img2, np.zeros_like(img1),
+                np.full_like(img1, fill_value=255)]
+
+      images = [ops.convert_to_tensor(x, dtype=dtypes.float32) for x in images]
+      msssim_ops = [image_ops.ssim_multiscale(x, y, 1.0)
+                    for x, y in itertools.combinations(images, 2)]
+      msssim = sess.run(msssim_ops)
+      msssim = np.squeeze(msssim)
+
+    self.assertTrue(np.all(msssim >= 0.0))
+    self.assertTrue(np.all(msssim <= 1.0))
+
+  def testInt(self):
+    img1 = self._RandomImage((1, 180, 240, 3), 255)
+    img2 = self._RandomImage((1, 180, 240, 3), 255)
+    img1 = constant_op.constant(img1, dtypes.uint8)
+    img2 = constant_op.constant(img2, dtypes.uint8)
+    ssim_uint8 = image_ops.ssim_multiscale(img1, img2, 255)
+    img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
+    img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
+    ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
+    with self.test_session(use_gpu=True):
+      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+
+
+class ImageGradientsTest(test_util.TensorFlowTestCase):
+
+  def testImageGradients(self):
+    shape = [1, 2, 4, 1]
+    img = constant_op.constant([[1, 3, 4, 2], [8, 7, 5, 6]])
+    img = array_ops.reshape(img, shape)
+
+    expected_dy = np.reshape([[7, 4, 1, 4], [0, 0, 0, 0]], shape)
+    expected_dx = np.reshape([[2, 1, -2, 0], [-1, -2, 1, 0]], shape)
+
+    dy, dx = image_ops.image_gradients(img)
+    with self.test_session():
+      actual_dy = dy.eval()
+      actual_dx = dx.eval()
+      self.assertAllClose(expected_dy, actual_dy)
+      self.assertAllClose(expected_dx, actual_dx)
+
+  def testImageGradientsMultiChannelBatch(self):
+    batch = [[[[1, 2], [2, 5], [3, 3]],
+              [[8, 4], [5, 1], [9, 8]]],
+             [[[5, 3], [7, 9], [1, 6]],
+              [[1, 2], [6, 3], [6, 3]]]]
+
+    expected_dy = [[[[7, 2], [3, -4], [6, 5]],
+                    [[0, 0], [0, 0], [0, 0]]],
+                   [[[-4, -1], [-1, -6], [5, -3]],
+                    [[0, 0], [0, 0], [0, 0]]]]
+
+    expected_dx = [[[[1, 3], [1, -2], [0, 0]],
+                    [[-3, -3], [4, 7], [0, 0]]],
+                   [[[2, 6], [-6, -3], [0, 0]],
+                    [[5, 1], [0, 0], [0, 0]]]]
+
+    batch = constant_op.constant(batch)
+    assert batch.get_shape().as_list() == [2, 2, 3, 2]
+    dy, dx = image_ops.image_gradients(batch)
+    with self.test_session(use_gpu=True):
+      actual_dy = dy.eval()
+      actual_dx = dx.eval()
+      self.assertAllClose(expected_dy, actual_dy)
+      self.assertAllClose(expected_dx, actual_dx)
+
+  def testImageGradientsBadShape(self):
+    # [2 x 4] image but missing batch and depth dimensions.
+    img = constant_op.constant([[1, 3, 4, 2], [8, 7, 5, 6]])
+    with self.assertRaises(ValueError):
+      image_ops.image_gradients(img)
+
+
+class SobelEdgesTest(test_util.TensorFlowTestCase):
+
+  def testSobelEdges1x2x3x1(self):
+    img = constant_op.constant([[1, 3, 6], [4, 1, 5]],
+                               dtype=dtypes.float32, shape=[1, 2, 3, 1])
+    expected = np.reshape([[[0, 0], [0, 12], [0, 0]],
+                           [[0, 0], [0, 12], [0, 0]]], [1, 2, 3, 1, 2])
+    sobel = image_ops.sobel_edges(img)
+    with self.test_session(use_gpu=True):
+      actual_sobel = sobel.eval()
+      self.assertAllClose(expected, actual_sobel)
+
+  def testSobelEdges5x3x4x2(self):
+    batch_size = 5
+    plane = np.reshape([[1, 3, 6, 2], [4, 1, 5, 7], [2, 5, 1, 4]],
+                       [1, 3, 4, 1])
+    two_channel = np.concatenate([plane, plane], axis=3)
+    batch = np.concatenate([two_channel] * batch_size, axis=0)
+    img = constant_op.constant(batch, dtype=dtypes.float32,
+                               shape=[batch_size, 3, 4, 2])
+
+    expected_plane = np.reshape([[[0, 0], [0, 12], [0, 10], [0, 0]],
+                                 [[6, 0], [0, 6], [-6, 10], [-6, 0]],
+                                 [[0, 0], [0, 0], [0, 10], [0, 0]]],
+                                [1, 3, 4, 1, 2])
+    expected_two_channel = np.concatenate(
+        [expected_plane, expected_plane], axis=3)
+    expected_batch = np.concatenate([expected_two_channel] * batch_size, axis=0)
+
+    sobel = image_ops.sobel_edges(img)
+    with self.test_session(use_gpu=True):
+      actual_sobel = sobel.eval()
+      self.assertAllClose(expected_batch, actual_sobel)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index c7502d0fda5c38079362d30877a917e3965e6ca0..f93bf0a17f31b4451cad9c5bc525d9f237f97bef 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -39,10 +39,10 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -499,10 +499,10 @@ class Orthogonal(Initializer):
 
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix
-    dtype: The type of the output.
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
+    dtype: The data type.
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -529,11 +529,10 @@ class Orthogonal(Initializer):
     # Generate a random matrix
     a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, r = linalg_ops.qr(a, full_matrices=False)
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
-    ph = d / math_ops.abs(d)
-    q *= ph
+    q *= math_ops.sign(d)
     if num_rows < num_cols:
       q = array_ops.matrix_transpose(q)
     return self.gain * array_ops.reshape(q, shape)
@@ -542,6 +541,527 @@ class Orthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
+class ConvolutionDeltaOrthogonal(Initializer):
+  """Initializer that generates a delta orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 3, 4 or 5. The number of input
+  filters must not exceed the number of output filters. The center pixels of the
+  tensor form an orthogonal matrix. Other pixels are set to be zero.
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    dtype: The data type.
+  """
+
+  def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
+    self.gain = gain
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    # Check the shape
+    if len(shape) < 3 or len(shape) > 5:
+      raise ValueError("The tensor to initialize must be at least "
+                       "three-dimensional and at most five-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    # Generate a random matrix
+    a = random_ops.random_normal([shape[-1], shape[-1]],
+                                 dtype=dtype, seed=self.seed)
+    # Compute the qr factorization
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
+    # Make Q uniform
+    d = array_ops.diag_part(r)
+    q *= math_ops.sign(d)
+    q = q[:shape[-2], :]
+    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    if len(shape) == 3:
+      weight = array_ops.scatter_nd([[(shape[0]-1)//2]],
+                                    array_ops.expand_dims(q, 0), shape)
+    elif len(shape) == 4:
+      weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2]],
+                                    array_ops.expand_dims(q, 0), shape)
+    else:
+      weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2,
+                                      (shape[2]-1)//2]],
+                                    array_ops.expand_dims(q, 0), shape)
+    return weight
+
+  def get_config(self):
+    return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
+
+
+class ConvolutionOrthogonal(Initializer):
+  """Initializer that generates orthogonal kernel for ConvNets.
+
+  Base class used to construct 1D, 2D and 3D orthogonal kernels for convolution.
+
+  Args:
+    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    dtype: The data type.
+  """
+
+  def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
+    self.gain = gain
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    raise NotImplementedError
+
+  def get_config(self):
+    return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
+
+  # Helper functions.
+  def _orthogonal_matrix(self, n):
+    """Construct an n x n orthogonal matrix.
+
+    Args:
+      n: Dimension.
+    Returns:
+      A n x n orthogonal matrix.
+    """
+    a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
+    if self.seed:
+      self.seed += 1
+    q, r = gen_linalg_ops.qr(a)
+    d = array_ops.diag_part(r)
+    # make q uniform
+    q *= math_ops.sign(d)
+    return q
+
+  def _symmetric_projection(self, n):
+    """Compute a n x n symmetric projection matrix.
+
+    Args:
+      n: Dimension.
+    Returns:
+      A n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
+    """
+    q = self._orthogonal_matrix(n)
+    # randomly zeroing out some columns
+    mask = math_ops.cast(random_ops.random_normal([n], seed=self.seed) > 0,
+                         self.dtype)
+    if self.seed:
+      self.seed += 1
+    c = math_ops.multiply(q, mask)
+    return math_ops.matmul(c, array_ops.matrix_transpose(c))
+
+
+class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
+  """Initializer that generates a 2D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 4. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      This has the effect of scaling the output 2-norm by a factor of
+      `sqrt(gain)`.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    dtype: The data type.
+  """
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    if len(shape) != 4:
+      raise ValueError("The tensor to initialize must be four-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    if shape[0] != shape[1]:
+      raise ValueError("Kernel sizes must be equal.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def _dict_to_tensor(self, x, k1, k2):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: A k1 * k2 dictionary.
+      k1: First dimension of x.
+      k2: Second dimension of x.
+    Returns:
+      A k1 * k2 tensor.
+    """
+
+    return array_ops.stack([array_ops.stack([x[i, j] for j in range(k2)])
+                            for i in range(k1)])
+
+  def _block_orth(self, p1, p2):
+    """Construct a 2 x 2 kernel. Used to construct orthgonal kernel.
+
+    Args:
+      p1: A symmetric projection matrix.
+      p2: A symmetric projection matrix.
+    Returns:
+      A 2 x 2 kernel [[p1p2,         p1(1-p2)],
+                      [(1-p1)p2, (1-p1)(1-p2)]].
+    Raises:
+      ValueError: If the dimensions of p1 and p2 are different.
+    """
+    if p1.shape.as_list() != p2.shape.as_list():
+      raise ValueError("The dimension of the matrices must be the same.")
+    n = p1.shape.as_list()[0]
+    kernel2x2 = {}
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
+    kernel2x2[0, 0] = math_ops.matmul(p1, p2)
+    kernel2x2[0, 1] = math_ops.matmul(p1, (eye - p2))
+    kernel2x2[1, 0] = math_ops.matmul((eye - p1), p2)
+    kernel2x2[1, 1] = math_ops.matmul((eye - p1), (eye - p2))
+
+    return kernel2x2
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: A k x k dictionary, each element is a n x n matrix.
+      m2: A l x l dictionary, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1) * (k + l - 1) dictionary each element is a n x n matrix.
+    Raises:
+      ValueError: if the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0, 0]).shape.as_list()[0]
+    if n != (m2[0, 0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = int(np.sqrt(len(m1)))
+    l = int(np.sqrt(len(m2)))
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      for j in range(size):
+        result[i, j] = array_ops.zeros([n, n], self.dtype)
+        for index1 in range(min(k, i + 1)):
+          for index2 in range(min(k, j + 1)):
+            if (i - index1) < l and (j - index2) < l:
+              result[i, j] += math_ops.matmul(m1[index1, index2],
+                                              m2[i - index1, j - index2])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: Kernel size.
+      cin: Number of input channels.
+      cout: Number of output channels.
+    Returns:
+      An [ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: If cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(array_ops.expand_dims(orth, 0), 0)
+
+    p = self._block_orth(self._symmetric_projection(cout),
+                         self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout),
+                              self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      for j in range(ksize):
+        p[i, j] = math_ops.matmul(orth, p[i, j])
+
+    return self._dict_to_tensor(p, ksize, ksize)
+
+
+class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
+  """Initializer that generates a 1D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 3. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed}
+      for behavior.
+    dtype: The data type.
+  """
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    if len(shape) != 3:
+      raise ValueError("The tensor to initialize must be three-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def _dict_to_tensor(self, x, k):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: A dictionary of length k.
+      k: Dimension of x.
+    Returns:
+      A tensor with the same dimension.
+    """
+
+    return array_ops.stack([x[i] for i in range(k)])
+
+  def _block_orth(self, projection_matrix):
+    """Construct a kernel. Used to construct orthgonal kernel.
+
+    Args:
+      projection_matrix: A symmetric projection matrix of size n x n.
+    Returns:
+      [projection_matrix, (1 - projection_matrix)].
+    """
+    n = projection_matrix.shape.as_list()[0]
+    kernel = {}
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
+    kernel[0] = projection_matrix
+    kernel[1] = eye - projection_matrix
+    return kernel
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: A dictionary of length k, each element is a n x n matrix.
+      m2: A dictionary of length l, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1)  dictionary each element is a n x n matrix.
+    Raises:
+      ValueError: Ff the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0]).shape.as_list()[0]
+    if n != (m2[0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = len(m1)
+    l = len(m2)
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      result[i] = array_ops.zeros([n, n], self.dtype)
+      for index in range(min(k, i + 1)):
+        if (i - index) < l:
+          result[i] += math_ops.matmul(m1[index], m2[i - index])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: Kernel size.
+      cin: Number of input channels.
+      cout: Number of output channels.
+    Returns:
+      An [ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: If cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(orth, 0)
+
+    p = self._block_orth(self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      p[i] = math_ops.matmul(orth, p[i])
+
+    return self._dict_to_tensor(p, ksize)
+
+
+class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
+  """Initializer that generates a 3D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 5. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    dtype: The data type.
+  """
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    if len(shape) != 5:
+      raise ValueError("The tensor to initialize must be five-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    if shape[0] != shape[1] or shape[0] != shape[2]:
+      raise ValueError("Kernel sizes must be equal.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def _dict_to_tensor(self, x, k1, k2, k3):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: A k1 * k2 dictionary.
+      k1: First dimension of x.
+      k2: Second dimension of x.
+      k3: Third dimension of x.
+    Returns:
+      A k1 * k2 * k3 tensor.
+    """
+
+    return array_ops.stack([array_ops.stack(
+        [array_ops.stack([x[i, j, k] for k in range(k3)])
+         for j in range(k2)]) for i in range(k1)])
+
+  def _block_orth(self, p1, p2, p3):
+    """Construct a 3 x 3 kernel. Used to construct orthgonal kernel.
+
+    Args:
+      p1: A symmetric projection matrix.
+      p2: A symmetric projection matrix.
+      p3: A symmetric projection matrix.
+    Returns:
+      A 2 x 2 x 2 kernel.
+    Raises:
+      ValueError: If the dimensions of p1, p2 and p3 are different.
+    """
+    p1_shape = p1.shape.as_list()
+    if p1_shape != p2.shape.as_list() or p1_shape != p3.shape.as_list():
+      raise ValueError("The dimension of the matrices must be the same.")
+    n = p1_shape[0]
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
+    kernel2x2x2 = {}
+    def matmul(p1, p2, p3):
+      return math_ops.matmul(math_ops.matmul(p1, p2), p3)
+    def cast(i, p):
+      """Return p or (1-p)."""
+      return i * p + (1-i) * (eye - p)
+    for i in [0, 1]:
+      for j in [0, 1]:
+        for k in [0, 1]:
+          kernel2x2x2[i, j, k] = matmul(cast(i, p1), cast(j, p2), cast(k, p3))
+    return kernel2x2x2
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: is a k x k x k  dictionary, each element is a n x n matrix.
+      m2: is a l x l x l dictionary, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1) x (k + l - 1) x (k + l - 1) dictionary each
+      element is a n x n matrix.
+    Raises:
+      ValueError: if the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0, 0, 0]).shape.as_list()[0]
+    if n != (m2[0, 0, 0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = int(np.cbrt(len(m1)))
+    l = int(np.cbrt(len(m2)))
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      for j in range(size):
+        for r in range(size):
+          result[i, j, r] = array_ops.zeros([n, n], self.dtype)
+          for index1 in range(min(k, i + 1)):
+            for index2 in range(min(k, j + 1)):
+              for index3 in range(min(k, r + 1)):
+                if (i - index1) < l and (j - index2) < l and (r - index3) < l:
+                  result[i, j, r] += math_ops.matmul(m1[index1, index2, index3],
+                                                     m2[i - index1, j - index2,
+                                                        r - index3])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: Kernel size.
+      cin: Number of input channels.
+      cout: Number of output channels.
+    Returns:
+      An [ksize, ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: If cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(
+          array_ops.expand_dims(
+              array_ops.expand_dims(orth, 0), 0), 0)
+
+    p = self._block_orth(self._symmetric_projection(cout),
+                         self._symmetric_projection(cout),
+                         self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout),
+                              self._symmetric_projection(cout),
+                              self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      for j in range(ksize):
+        for k in range(ksize):
+          p[i, j, k] = math_ops.matmul(orth, p[i, j, k])
+
+    return self._dict_to_tensor(p, ksize, ksize, ksize)
+
+
 @tf_export("keras.initializers.Identity", "initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
@@ -564,7 +1084,7 @@ class Identity(Initializer):
           "Identity matrix initializer can only be used for 2D matrices.")
     if dtype is None:
       dtype = self.dtype
-    initializer = linalg_ops.eye(*full_shape, dtype=dtype)
+    initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
     if partition_info is not None:
       initializer = array_ops.slice(initializer, partition_info.var_offset,
                                     shape)
@@ -586,7 +1106,10 @@ uniform_unit_scaling_initializer = UniformUnitScaling
 variance_scaling_initializer = VarianceScaling
 orthogonal_initializer = Orthogonal
 identity_initializer = Identity
-
+convolutional_delta_orthogonal = ConvolutionDeltaOrthogonal
+convolutional_orthogonal_1d = ConvolutionOrthogonal1D
+convolutional_orthogonal_2d = ConvolutionOrthogonal2D
+convolutional_orthogonal_3d = ConvolutionOrthogonal3D
 # pylint: enable=invalid-name
 
 
diff --git a/tensorflow/python/ops/initializers_ns.py b/tensorflow/python/ops/initializers_ns.py
index c21079f2971a4bdd76b4be1a803055c12b243903..e7996efe93eb2f33306a52ded91c273009192789 100644
--- a/tensorflow/python/ops/initializers_ns.py
+++ b/tensorflow/python/ops/initializers_ns.py
@@ -39,5 +39,8 @@ global_variables = _variables.global_variables_initializer
 local_variables = _variables.local_variables_initializer
 
 # Seal API.
+del absolute_import
+del division
+del print_function
 del init_ops
 del _variables
diff --git a/tensorflow/python/ops/inplace_ops.py b/tensorflow/python/ops/inplace_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b000086b05219d23cd88935948f88f2cc718bf
--- /dev/null
+++ b/tensorflow/python/ops/inplace_ops.py
@@ -0,0 +1,227 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inplace operations.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _inplace_helper(x, i, v, op):
+  """Applies an inplace op on (x, i, v).
+
+  op is one of gen_array_ops.alias_inplace_update,
+  gen_array_ops.alias_inplace_add, or gen_array_ops.alias_inplace_sub.
+
+  If i is None, x and v must be the same shape. Computes
+    x op v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] op v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] op v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+    op: alias_inplace_update, alias_inplace_add, or alias_inplace_sub.
+
+  Returns:
+    Returns x.
+
+  """
+  x = ops.convert_to_tensor(x)
+  v = ops.convert_to_tensor(v, x.dtype)
+  if i is None:
+    # Full tensor.
+    return array_ops.reshape(
+        op(array_ops.reshape(x, [1, -1]), [0], array_ops.reshape(v, [1, -1])),
+        array_ops.shape(x))
+  i = math_ops.to_int32(i)
+  if i.get_shape().ndims == 0:
+    # Single 0-dim update.
+    return op(x, array_ops.reshape(i, [1]), array_ops.expand_dims(v, 0))
+  return op(x, i, v)
+
+
+def alias_inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_update)
+
+
+def alias_inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_add)
+
+
+def alias_inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_sub)
+
+
+def empty_like(x, init=None):
+  """Returns a non-initialized tensor with the same shape and dtype as x.
+
+  Args:
+    x: A Tensor.
+    init: Initialize the returned tensor with the default value of
+      x.dtype(), if True. Otherwise, do not initialize. Defaults to
+      None.
+
+  Returns:
+    A tensor y, whose dtype and shape are the same as those of x.
+    y is guaranteed not to be an alias of x. Upon return, y may contain
+    arbitrary data.
+
+  """
+  x = ops.convert_to_tensor(x)
+  return gen_array_ops.empty(array_ops.shape(x), x.dtype, init=init)
+
+
+def inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_update(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_add(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_sub(gen_array_ops.deep_copy(x), i, v)
+
+empty = gen_array_ops.empty
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 5e70b3186f382a0c795b1795b2db27bb2058ee41..b5274ef2ed05eae71353c06280b15aa592f3bc7d 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -17,53 +17,6 @@
 """Inputs and Readers.
 
 See the @{$python/io_ops} guide.
-
-@@placeholder
-@@placeholder_with_default
-@@sparse_placeholder
-@@ReaderBase
-@@TextLineReader
-@@WholeFileReader
-@@IdentityReader
-@@TFRecordReader
-@@LMDBReader
-@@FixedLengthRecordReader
-@@decode_csv
-@@decode_raw
-@@VarLenFeature
-@@FixedLenFeature
-@@FixedLenSequenceFeature
-@@SparseFeature
-@@parse_example
-@@parse_single_example
-@@parse_tensor
-@@serialize_tensor
-@@decode_json_example
-@@QueueBase
-@@FIFOQueue
-@@PaddingFIFOQueue
-@@RandomShuffleQueue
-@@PriorityQueue
-@@ConditionalAccumulatorBase
-@@ConditionalAccumulator
-@@SparseConditionalAccumulator
-@@matching_files
-@@read_file
-@@write_file
-@@match_filenames_once
-@@limit_epochs
-@@input_producer
-@@range_input_producer
-@@slice_input_producer
-@@string_input_producer
-@@batch
-@@maybe_batch
-@@batch_join
-@@maybe_batch_join
-@@shuffle_batch
-@@maybe_shuffle_batch
-@@shuffle_batch_join
-@@maybe_shuffle_batch_join
 """
 
 from __future__ import absolute_import
@@ -111,10 +64,10 @@ def _save(filename, tensor_names, tensors, tensor_slices=None, name="save"):
     An Operation that saves the tensors.
   """
   if tensor_slices is None:
-    return gen_io_ops._save(filename, tensor_names, tensors, name=name)
+    return gen_io_ops.save(filename, tensor_names, tensors, name=name)
   else:
-    return gen_io_ops._save_slices(filename, tensor_names, tensor_slices,
-                                   tensors, name=name)
+    return gen_io_ops.save_slices(filename, tensor_names, tensor_slices,
+                                  tensors, name=name)
 
 
 def _restore_slice(file_pattern, tensor_name, shape_and_slice, tensor_type,
@@ -136,7 +89,7 @@ def _restore_slice(file_pattern, tensor_name, shape_and_slice, tensor_type,
     A tensor of type "tensor_type".
   """
   base_type = dtypes.as_dtype(tensor_type).base_dtype
-  return gen_io_ops._restore_slice(
+  return gen_io_ops.restore_slice(
       file_pattern, tensor_name, shape_and_slice, base_type,
       preferred_shard, name=name)
 
@@ -173,7 +126,7 @@ class ReaderBase(object):
     Raises:
       RuntimeError: If eager execution is enabled.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError(
           "Readers are not supported when eager execution is enabled. "
           "Instead, please use tf.data to get data into your model.")
@@ -208,12 +161,12 @@ class ReaderBase(object):
     else:
       queue_ref = queue.queue_ref
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_read_v2(self._reader_ref, queue_ref, name=name)
+      return gen_io_ops.reader_read_v2(self._reader_ref, queue_ref, name=name)
     else:
       # For compatibility with pre-resource queues, create a ref(string) tensor
       # which can be looked up as the same queue by a resource manager.
-      old_queue_op = gen_data_flow_ops._fake_queue(queue_ref)
-      return gen_io_ops._reader_read(self._reader_ref, old_queue_op, name=name)
+      old_queue_op = gen_data_flow_ops.fake_queue(queue_ref)
+      return gen_io_ops.reader_read(self._reader_ref, old_queue_op, name=name)
 
   def read_up_to(self, queue, num_records,  # pylint: disable=invalid-name
                  name=None):
@@ -240,18 +193,18 @@ class ReaderBase(object):
     else:
       queue_ref = queue.queue_ref
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_read_up_to_v2(self._reader_ref,
-                                              queue_ref,
-                                              num_records,
-                                              name=name)
+      return gen_io_ops.reader_read_up_to_v2(self._reader_ref,
+                                             queue_ref,
+                                             num_records,
+                                             name=name)
     else:
       # For compatibility with pre-resource queues, create a ref(string) tensor
       # which can be looked up as the same queue by a resource manager.
-      old_queue_op = gen_data_flow_ops._fake_queue(queue_ref)
-      return gen_io_ops._reader_read_up_to(self._reader_ref,
-                                           old_queue_op,
-                                           num_records,
-                                           name=name)
+      old_queue_op = gen_data_flow_ops.fake_queue(queue_ref)
+      return gen_io_ops.reader_read_up_to(self._reader_ref,
+                                          old_queue_op,
+                                          num_records,
+                                          name=name)
 
   def num_records_produced(self, name=None):
     """Returns the number of records this reader has produced.
@@ -267,11 +220,11 @@ class ReaderBase(object):
 
     """
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_num_records_produced_v2(self._reader_ref,
-                                                        name=name)
+      return gen_io_ops.reader_num_records_produced_v2(self._reader_ref,
+                                                       name=name)
     else:
-      return gen_io_ops._reader_num_records_produced(self._reader_ref,
-                                                     name=name)
+      return gen_io_ops.reader_num_records_produced(self._reader_ref,
+                                                    name=name)
 
   def num_work_units_completed(self, name=None):
     """Returns the number of work units this reader has finished processing.
@@ -283,11 +236,11 @@ class ReaderBase(object):
       An int64 Tensor.
     """
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_num_work_units_completed_v2(self._reader_ref,
-                                                            name=name)
+      return gen_io_ops.reader_num_work_units_completed_v2(self._reader_ref,
+                                                           name=name)
     else:
-      return gen_io_ops._reader_num_work_units_completed(self._reader_ref,
-                                                         name=name)
+      return gen_io_ops.reader_num_work_units_completed(self._reader_ref,
+                                                        name=name)
 
   def serialize_state(self, name=None):
     """Produce a string tensor that encodes the state of a reader.
@@ -302,9 +255,9 @@ class ReaderBase(object):
       A string Tensor.
     """
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_serialize_state_v2(self._reader_ref, name=name)
+      return gen_io_ops.reader_serialize_state_v2(self._reader_ref, name=name)
     else:
-      return gen_io_ops._reader_serialize_state(self._reader_ref, name=name)
+      return gen_io_ops.reader_serialize_state(self._reader_ref, name=name)
 
   def restore_state(self, state, name=None):
     """Restore a reader to a previously saved state.
@@ -321,11 +274,10 @@ class ReaderBase(object):
       The created Operation.
     """
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_restore_state_v2(
+      return gen_io_ops.reader_restore_state_v2(
           self._reader_ref, state, name=name)
     else:
-      return gen_io_ops._reader_restore_state(
-          self._reader_ref, state, name=name)
+      return gen_io_ops.reader_restore_state(self._reader_ref, state, name=name)
 
   @property
   def supports_serialize(self):
@@ -342,9 +294,9 @@ class ReaderBase(object):
       The created Operation.
     """
     if self._reader_ref.dtype == dtypes.resource:
-      return gen_io_ops._reader_reset_v2(self._reader_ref, name=name)
+      return gen_io_ops.reader_reset_v2(self._reader_ref, name=name)
     else:
-      return gen_io_ops._reader_reset(self._reader_ref, name=name)
+      return gen_io_ops.reader_reset(self._reader_ref, name=name)
 
 
 ops.NotDifferentiable("ReaderRead")
@@ -377,7 +329,7 @@ class WholeFileReader(ReaderBase):
     Args:
       name: A name for the operation (optional).
     """
-    rr = gen_io_ops._whole_file_reader_v2(name=name)
+    rr = gen_io_ops.whole_file_reader_v2(name=name)
     super(WholeFileReader, self).__init__(rr, supports_serialize=True)
 
 
@@ -406,8 +358,8 @@ class TextLineReader(ReaderBase):
         to skip from the beginning of every file.
       name: A name for the operation (optional).
     """
-    rr = gen_io_ops._text_line_reader_v2(skip_header_lines=skip_header_lines,
-                                         name=name)
+    rr = gen_io_ops.text_line_reader_v2(skip_header_lines=skip_header_lines,
+                                        name=name)
     super(TextLineReader, self).__init__(rr)
 
 
@@ -444,7 +396,7 @@ class FixedLengthRecordReader(ReaderBase):
       name: A name for the operation (optional).
       encoding: The type of encoding for the file. Defaults to none.
     """
-    rr = gen_io_ops._fixed_length_record_reader_v2(
+    rr = gen_io_ops.fixed_length_record_reader_v2(
         record_bytes=record_bytes,
         header_bytes=header_bytes,
         footer_bytes=footer_bytes,
@@ -480,7 +432,7 @@ class TFRecordReader(ReaderBase):
     compression_type = python_io.TFRecordOptions.get_compression_type_string(
         options)
 
-    rr = gen_io_ops._tf_record_reader_v2(
+    rr = gen_io_ops.tf_record_reader_v2(
         name=name, compression_type=compression_type)
     super(TFRecordReader, self).__init__(rr)
 
@@ -506,7 +458,7 @@ class LMDBReader(ReaderBase):
       name: A name for the operation (optional).
       options: A LMDBRecordOptions object (optional).
     """
-    rr = gen_io_ops._lmdb_reader(name=name)
+    rr = gen_io_ops.lmdb_reader(name=name)
     super(LMDBReader, self).__init__(rr)
 
 
@@ -534,7 +486,7 @@ class IdentityReader(ReaderBase):
     Args:
       name: A name for the operation (optional).
     """
-    rr = gen_io_ops._identity_reader_v2(name=name)
+    rr = gen_io_ops.identity_reader_v2(name=name)
     super(IdentityReader, self).__init__(rr, supports_serialize=True)
 
 
diff --git a/tensorflow/python/ops/linalg/BUILD b/tensorflow/python/ops/linalg/BUILD
index ce8c1580fe5ee614558bfd52afde0d9c5088abe6..07659ef44c443ad15876781d6c6254ae3bc38660 100644
--- a/tensorflow/python/ops/linalg/BUILD
+++ b/tensorflow/python/ops/linalg/BUILD
@@ -34,15 +34,3 @@ py_library(
         "//tensorflow/python:special_math_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index 14319025ff275944cf34e30128df96254d06072b..d73c21cdc0bc2805bd29fdc6b8937d4725236557 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.python.ops.linalg.linalg_impl import *
 from tensorflow.python.ops.linalg.linear_operator import *
+from tensorflow.python.ops.linalg.linear_operator_circulant import *
 from tensorflow.python.ops.linalg.linear_operator_composition import *
 from tensorflow.python.ops.linalg.linear_operator_diag import *
 from tensorflow.python.ops.linalg.linear_operator_full_matrix import *
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index d5bd916f80d8a03e5423c43d1ca039bc4dceff5e..8343c62816c6aeadc77dae701ae9917a86e68954 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -31,18 +31,19 @@ band_part = array_ops.matrix_band_part
 cholesky = linalg_ops.cholesky
 cholesky_solve = linalg_ops.cholesky_solve
 det = linalg_ops.matrix_determinant
-# pylint: disable=protected-access
-slogdet = gen_linalg_ops._log_matrix_determinant
-# pylint: disable=protected-access
+slogdet = gen_linalg_ops.log_matrix_determinant
+tf_export('linalg.slogdet')(slogdet)
 diag = array_ops.matrix_diag
 diag_part = array_ops.matrix_diag_part
 eigh = linalg_ops.self_adjoint_eig
 eigvalsh = linalg_ops.self_adjoint_eigvals
 einsum = special_math_ops.einsum
-expm = gen_linalg_ops._matrix_exponential
+expm = gen_linalg_ops.matrix_exponential
+tf_export('linalg.expm')(expm)
 eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
-logm = gen_linalg_ops._matrix_logarithm
+logm = gen_linalg_ops.matrix_logarithm
+tf_export('linalg.logm')(logm)
 lstsq = linalg_ops.matrix_solve_ls
 norm = linalg_ops.norm
 qr = linalg_ops.qr
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 957a7959181efe3bbc319e62582053329b763dc3..8cfe964b1c0a572f43a14c66885e74ea105b0916 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -166,8 +166,7 @@ class LinearOperator(object):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
@@ -204,16 +203,6 @@ class LinearOperator(object):
     self._is_positive_definite = is_positive_definite
     self._name = name or type(self).__name__
 
-    # We will cache some tensors to avoid repeatedly adding shape
-    # manipulation ops to the graph.
-    # Naming convention:
-    #   self._cached_X_tensor is the cached version of self._X_tensor.
-    self._cached_shape_tensor = None
-    self._cached_batch_shape_tensor = None
-    self._cached_domain_dimension_tensor = None
-    self._cached_range_dimension_tensor = None
-    self._cached_tensor_rank_tensor = None
-
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
@@ -299,15 +288,11 @@ class LinearOperator(object):
       `int32` `Tensor`
     """
     with self._name_scope(name):
-      # Be clean by avoiding adding shape Ops to the graph too many times.
-      if self._cached_shape_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.shape.is_fully_defined():
-          self._cached_shape_tensor = linear_operator_util.shape_tensor(
-              self.shape.as_list())
-        else:
-          self._cached_shape_tensor = self._shape_tensor()
-      return self._cached_shape_tensor
+      # Prefer to use statically defined shape if available.
+      if self.shape.is_fully_defined():
+        return linear_operator_util.shape_tensor(self.shape.as_list())
+      else:
+        return self._shape_tensor()
 
   @property
   def batch_shape(self):
@@ -338,14 +323,12 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_batch_shape_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.batch_shape.is_fully_defined():
-          self._cached_batch_shape_tensor = linear_operator_util.shape_tensor(
-              self.batch_shape.as_list(), name="batch_shape")
-        else:
-          self._cached_batch_shape_tensor = self.shape_tensor()[:-2]
-      return self._cached_batch_shape_tensor
+      # Prefer to use statically defined shape if available.
+      if self.batch_shape.is_fully_defined():
+        return linear_operator_util.shape_tensor(
+            self.batch_shape.as_list(), name="batch_shape")
+      else:
+        return self.shape_tensor()[:-2]
 
   @property
   def tensor_rank(self, name="tensor_rank"):
@@ -378,14 +361,11 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_tensor_rank_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.tensor_rank is not None:
-          self._cached_tensor_rank_tensor = ops.convert_to_tensor(
-              self.tensor_rank)
-        else:
-          self._cached_tensor_rank_tensor = array_ops.size(self.shape_tensor())
-      return self._cached_tensor_rank_tensor
+      # Prefer to use statically defined shape if available.
+      if self.tensor_rank is not None:
+        return ops.convert_to_tensor(self.tensor_rank)
+      else:
+        return array_ops.size(self.shape_tensor())
 
   @property
   def domain_dimension(self):
@@ -416,14 +396,11 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_domain_dimension_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.domain_dimension.value is not None:
-          self._cached_domain_dimension_tensor = ops.convert_to_tensor(
-              self.domain_dimension.value)
-        else:
-          self._cached_domain_dimension_tensor = self.shape_tensor()[-1]
-      return self._cached_domain_dimension_tensor
+      # Prefer to use statically defined shape if available.
+      if self.domain_dimension.value is not None:
+        return ops.convert_to_tensor(self.domain_dimension.value)
+      else:
+        return self.shape_tensor()[-1]
 
   @property
   def range_dimension(self):
@@ -454,14 +431,11 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_range_dimension_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.range_dimension.value is not None:
-          self._cached_range_dimension_tensor = ops.convert_to_tensor(
-              self.range_dimension.value)
-        else:
-          self._cached_range_dimension_tensor = self.shape_tensor()[-2]
-      return self._cached_range_dimension_tensor
+      # Prefer to use statically defined shape if available.
+      if self.range_dimension.value is not None:
+        return ops.convert_to_tensor(self.range_dimension.value)
+      else:
+        return self.shape_tensor()[-2]
 
   def _assert_non_singular(self):
     """Private default implementation of _assert_non_singular."""
@@ -471,8 +445,7 @@ class LinearOperator(object):
     if self._can_use_cholesky():
       return self.assert_positive_definite()
     else:
-      singular_values = linalg_ops.svd(
-          self._get_cached_dense_matrix(), compute_uv=False)
+      singular_values = linalg_ops.svd(self.to_dense(), compute_uv=False)
       # TODO(langmore) Add .eig and .cond as methods.
       cond = (math_ops.reduce_max(singular_values, axis=-1) /
               math_ops.reduce_min(singular_values, axis=-1))
@@ -524,7 +497,7 @@ class LinearOperator(object):
     # and sufficient.
     if self.is_self_adjoint:
       return check_ops.assert_positive(
-          array_ops.matrix_diag_part(self._get_cached_chol()),
+          array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense())),
           message="Matrix was not positive definite.")
     # We have no generic check for positive definite.
     raise NotImplementedError("assert_positive_definite is not implemented.")
@@ -547,7 +520,7 @@ class LinearOperator(object):
       return self._assert_positive_definite()
 
   def _assert_self_adjoint(self):
-    dense = self._get_cached_dense_matrix()
+    dense = self.to_dense()
     logging.warn(
         "Using (possibly slow) default implementation of assert_self_adjoint."
         "  Requires conversion to a dense matrix.")
@@ -692,7 +665,7 @@ class LinearOperator(object):
         "Using (possibly slow) default implementation of determinant."
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     if self._can_use_cholesky():
-      diag = array_ops.matrix_diag_part(self._get_cached_chol())
+      diag = array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense()))
       return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
     _, log_abs_det = linalg.slogdet(self._matrix)
     return log_abs_det
@@ -726,9 +699,10 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
     if self._can_use_cholesky():
-      return linalg_ops.cholesky_solve(self._get_cached_chol(), rhs)
-    return linalg_ops.matrix_solve(
-        self._get_cached_dense_matrix(), rhs, adjoint=adjoint)
+      return linear_operator_util.cholesky_solve_with_broadcast(
+          linalg_ops.cholesky(self.to_dense()), rhs)
+    return linear_operator_util.matrix_solve_with_broadcast(
+        self.to_dense(), rhs, adjoint=adjoint)
 
   def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"):
     """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`.
@@ -866,7 +840,7 @@ class LinearOperator(object):
 
   def _diag_part(self):
     """Generic and often inefficient implementation.  Override often."""
-    return array_ops.matrix_diag_part(self._get_cached_dense_matrix())
+    return array_ops.matrix_diag_part(self.to_dense())
 
   def diag_part(self, name="diag_part"):
     """Efficiently get the [batch] diagonal part of this operator.
@@ -915,7 +889,7 @@ class LinearOperator(object):
 
   def _add_to_tensor(self, x):
     # Override if a more efficient implementation is available.
-    return self._get_cached_dense_matrix() + x
+    return self.to_dense() + x
 
   def add_to_tensor(self, x, name="add_to_tensor"):
     """Add matrix represented by this operator to `x`.  Equivalent to `A + x`.
@@ -936,13 +910,3 @@ class LinearOperator(object):
     # TODO(langmore) Add complex types when tf.cholesky can use them.
     return (not self.dtype.is_complex and self.is_self_adjoint and
             self.is_positive_definite)
-
-  def _get_cached_dense_matrix(self):
-    if not hasattr(self, "_cached_dense_matrix"):
-      self._cached_dense_matrix = self.to_dense()
-    return self._cached_dense_matrix
-
-  def _get_cached_chol(self):
-    if not hasattr(self, "_cached_chol"):
-      self._cached_chol = linalg_ops.cholesky(self._get_cached_dense_matrix())
-    return self._cached_chol
diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py
new file mode 100644
index 0000000000000000000000000000000000000000..c367ed25ad68717f4190ba46eb799d3286121b8d
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py
@@ -0,0 +1,1074 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`LinearOperator` coming from a [[nested] block] circulant matrix."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+    "LinearOperatorCirculant",
+    "LinearOperatorCirculant2D",
+    "LinearOperatorCirculant3D",
+]
+
+# Different FFT Ops will be used for different block depths.
+_FFT_OP = {1: math_ops.fft, 2: math_ops.fft2d, 3: math_ops.fft3d}
+_IFFT_OP = {1: math_ops.ifft, 2: math_ops.ifft2d, 3: math_ops.ifft3d}
+
+# This is the only dtype allowed with fft ops.
+# TODO(langmore) Add other types once available.
+_DTYPE_COMPLEX = dtypes.complex64
+
+
+# TODO(langmore) Add transformations that create common spectrums, e.g.
+#   starting with the convolution kernel
+#   start with half a spectrum, and create a Hermitian one.
+#   common filters.
+# TODO(langmore) Support rectangular Toeplitz matrices.
+class _BaseLinearOperatorCirculant(linear_operator.LinearOperator):
+  """Base class for circulant operators.  Not user facing.
+
+  `LinearOperator` acting like a [batch] [[nested] block] circulant matrix.
+  """
+
+  def __init__(self,
+               spectrum,
+               block_depth,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant"):
+    r"""Initialize an `_BaseLinearOperatorCirculant`.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      block_depth:  Python integer, either 1, 2, or 3.  Will be 1 for circulant,
+        2 for block circulant, and 3 for nested block circulant.
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+
+    Raises:
+      ValueError:  If `block_depth` is not an allowed value.
+      TypeError:  If `spectrum` is not an allowed type.
+    """
+
+    allowed_block_depths = [1, 2, 3]
+
+    self._name = name
+
+    if block_depth not in allowed_block_depths:
+      raise ValueError("Expected block_depth to be in %s.  Found: %s." %
+                       (allowed_block_depths, block_depth))
+    self._block_depth = block_depth
+
+    with ops.name_scope(name, values=[spectrum]):
+      self._spectrum = self._check_spectrum_and_return_tensor(spectrum)
+
+      # Check and auto-set hints.
+      if not self.spectrum.dtype.is_complex:
+        if is_self_adjoint is False:
+          raise ValueError(
+              "A real spectrum always corresponds to a self-adjoint operator.")
+        is_self_adjoint = True
+
+      if is_square is False:
+        raise ValueError(
+            "A [[nested] block] circulant operator is always square.")
+      is_square = True
+
+      # If spectrum.shape = [s0, s1, s2], and block_depth = 2,
+      # block_shape = [s1, s2]
+      s_shape = array_ops.shape(self.spectrum)
+      self._block_shape_tensor = s_shape[-self.block_depth:]
+
+      # Add common variants of spectrum to the graph.
+      self._spectrum_complex = _to_complex(self.spectrum)
+      self._abs_spectrum = math_ops.abs(self.spectrum)
+      self._conj_spectrum = math_ops.conj(self._spectrum_complex)
+
+      super(_BaseLinearOperatorCirculant, self).__init__(
+          dtype=dtypes.as_dtype(input_output_dtype),
+          graph_parents=[self.spectrum],
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  def _check_spectrum_and_return_tensor(self, spectrum):
+    """Static check of spectrum.  Then return `Tensor` version."""
+    spectrum = ops.convert_to_tensor(spectrum, name="spectrum")
+
+    allowed_dtypes = [dtypes.float32, dtypes.complex64]
+    if spectrum.dtype not in allowed_dtypes:
+      raise TypeError("Argument spectrum must have dtype in %s.  Found: %s" %
+                      (allowed_dtypes, spectrum.dtype))
+    if spectrum.get_shape().ndims is not None:
+      if spectrum.get_shape().ndims < self.block_depth:
+        raise ValueError(
+            "Argument spectrum must have at least %d dimensions.  Found: %s" %
+            (self.block_depth, spectrum))
+    return spectrum
+
+  @property
+  def block_depth(self):
+    """Depth of recursively defined circulant blocks defining this `Operator`.
+
+    With `A` the dense representation of this `Operator`,
+
+    `block_depth = 1` means `A` is symmetric circulant.  For example,
+
+    ```
+    A = |x y z y|
+        |y x y z|
+        |z y x y|
+        |y z y x|
+    ```
+
+    `block_depth = 2` means `A` is block symmetric circulant with symemtric
+    circulant blocks.  For example, with `X`, `Y`, `Z` symmetric circulant,
+
+    ```
+    A = |X Y Z Y|
+        |Y X Y Z|
+        |Z Y X Y|
+        |Y Z Y X|
+    ```
+
+    `block_depth = 3` means `A` is block symmetric circulant with block
+    symmetric circulant blocks.
+
+    Returns:
+      Python `integer`.
+    """
+    return self._block_depth
+
+  def block_shape_tensor(self):
+    """Shape of the block dimensions of `self.spectrum`."""
+    return self._block_shape_tensor
+
+  @property
+  def block_shape(self):
+    return self.spectrum.get_shape()[-self.block_depth:]
+
+  @property
+  def spectrum(self):
+    return self._spectrum
+
+  def _vectorize_then_blockify(self, matrix):
+    """Shape batch matrix to batch vector, then blockify trailing dimensions."""
+    # Suppose
+    #   matrix.shape = [m0, m1, m2, m3],
+    # and matrix is a matrix because the final two dimensions are matrix dims.
+    #   self.block_depth = 2,
+    #   self.block_shape = [b0, b1]  (note b0 * b1 = m2).
+    # We will reshape matrix to
+    #   [m3, m0, m1, b0, b1].
+
+    # Vectorize: Reshape to batch vector.
+    #   [m0, m1, m2, m3] --> [m3, m0, m1, m2]
+    # This is called "vectorize" because we have taken the final two matrix dims
+    # and turned this into a size m3 batch of vectors.
+    vec = distribution_util.rotate_transpose(matrix, shift=1)
+
+    # Blockify: Blockfy trailing dimensions.
+    #   [m3, m0, m1, m2] --> [m3, m0, m1, b0, b1]
+    if (vec.get_shape().is_fully_defined() and
+        self.block_shape.is_fully_defined()):
+      # vec_leading_shape = [m3, m0, m1],
+      # the parts of vec that will not be blockified.
+      vec_leading_shape = vec.get_shape()[:-1]
+      final_shape = vec_leading_shape.concatenate(self.block_shape)
+    else:
+      vec_leading_shape = array_ops.shape(vec)[:-1]
+      final_shape = array_ops.concat(
+          (vec_leading_shape, self.block_shape_tensor()), 0)
+    return array_ops.reshape(vec, final_shape)
+
+  def _unblockify_then_matricize(self, vec):
+    """Flatten the block dimensions then reshape to a batch matrix."""
+    # Suppose
+    #   vec.shape = [v0, v1, v2, v3],
+    #   self.block_depth = 2.
+    # Then
+    #   leading shape = [v0, v1]
+    #   block shape = [v2, v3].
+    # We will reshape vec to
+    #   [v1, v2*v3, v0].
+
+    # Un-blockify: Flatten block dimensions.  Reshape
+    #   [v0, v1, v2, v3] --> [v0, v1, v2*v3].
+    if vec.get_shape().is_fully_defined():
+      # vec_shape = [v0, v1, v2, v3]
+      vec_shape = vec.get_shape().as_list()
+      # vec_leading_shape = [v0, v1]
+      vec_leading_shape = vec_shape[:-self.block_depth]
+      # vec_block_shape = [v2, v3]
+      vec_block_shape = vec_shape[-self.block_depth:]
+      # flat_shape = [v0, v1, v2*v3]
+      flat_shape = vec_leading_shape + [np.prod(vec_block_shape)]
+    else:
+      vec_shape = array_ops.shape(vec)
+      vec_leading_shape = vec_shape[:-self.block_depth]
+      vec_block_shape = vec_shape[-self.block_depth:]
+      flat_shape = array_ops.concat(
+          (vec_leading_shape, [math_ops.reduce_prod(vec_block_shape)]), 0)
+    vec_flat = array_ops.reshape(vec, flat_shape)
+
+    # Matricize:  Reshape to batch matrix.
+    #   [v0, v1, v2*v3] --> [v1, v2*v3, v0],
+    # representing a shape [v1] batch of [v2*v3, v0] matrices.
+    matrix = distribution_util.rotate_transpose(vec_flat, shift=-1)
+    return matrix
+
+  def _fft(self, x):
+    """FFT along the last self.block_depth dimensions of x.
+
+    Args:
+      x: `Tensor` with floating or complex `dtype`.
+        Should be in the form returned by self._vectorize_then_blockify.
+
+    Returns:
+      `Tensor` with `dtype` `complex64`.
+    """
+    x_complex = _to_complex(x)
+    return _FFT_OP[self.block_depth](x_complex)
+
+  def _ifft(self, x):
+    """IFFT along the last self.block_depth dimensions of x.
+
+    Args:
+      x: `Tensor` with floating or complex dtype.  Should be in the form
+        returned by self._vectorize_then_blockify.
+
+    Returns:
+      `Tensor` with `dtype` `complex64`.
+    """
+    x_complex = _to_complex(x)
+    return _IFFT_OP[self.block_depth](x_complex)
+
+  def convolution_kernel(self, name="convolution_kernel"):
+    """Convolution kernel corresponding to `self.spectrum`.
+
+    The `D` dimensional DFT of this kernel is the frequency domain spectrum of
+    this operator.
+
+    Args:
+      name:  A name to give this `Op`.
+
+    Returns:
+      `Tensor` with `dtype` `self.dtype`.
+    """
+    with self._name_scope(name):
+      h = self._ifft(self._spectrum_complex)
+      return math_ops.cast(h, self.dtype)
+
+  def _shape(self):
+    s_shape = self._spectrum.get_shape()
+    # Suppose spectrum.shape = [a, b, c, d]
+    # block_depth = 2
+    # Then:
+    #   batch_shape = [a, b]
+    #   N = c*d
+    # and we want to return
+    #   [a, b, c*d, c*d]
+    batch_shape = s_shape[:-self.block_depth]
+    # trailing_dims = [c, d]
+    trailing_dims = s_shape[-self.block_depth:]
+    if trailing_dims.is_fully_defined():
+      n = np.prod(trailing_dims.as_list())
+    else:
+      n = None
+    n_x_n = tensor_shape.TensorShape([n, n])
+    return batch_shape.concatenate(n_x_n)
+
+  def _shape_tensor(self):
+    # See self.shape for explanation of steps
+    s_shape = array_ops.shape(self._spectrum)
+    batch_shape = s_shape[:-self.block_depth]
+    trailing_dims = s_shape[-self.block_depth:]
+    n = math_ops.reduce_prod(trailing_dims)
+    n_x_n = [n, n]
+    return array_ops.concat((batch_shape, n_x_n), 0)
+
+  def assert_hermitian_spectrum(self, name="assert_hermitian_spectrum"):
+    """Returns an `Op` that asserts this operator has Hermitian spectrum.
+
+    This operator corresponds to a real-valued matrix if and only if its
+    spectrum is Hermitian.
+
+    Args:
+      name:  A name to give this `Op`.
+
+    Returns:
+      An `Op` that asserts this operator has Hermitian spectrum.
+    """
+    eps = np.finfo(self.dtype.real_dtype.as_numpy_dtype).eps
+    with self._name_scope(name):
+      # Assume linear accumulation of error.
+      max_err = eps * self.domain_dimension_tensor()
+      imag_convolution_kernel = math_ops.imag(self.convolution_kernel())
+      return check_ops.assert_less(
+          math_ops.abs(imag_convolution_kernel),
+          max_err,
+          message="Spectrum was not Hermitian")
+
+  def _assert_non_singular(self):
+    return linear_operator_util.assert_no_entries_with_modulus_zero(
+        self.spectrum,
+        message="Singular operator:  Spectrum contained zero values.")
+
+  def _assert_positive_definite(self):
+    # This operator has the action  Ax = F^H D F x,
+    # where D is the diagonal matrix with self.spectrum on the diag.  Therefore,
+    # <x, Ax> = <Fx, DFx>,
+    # Since F is bijective, the condition for positive definite is the same as
+    # for a diagonal matrix, i.e. real part of spectrum is positive.
+    message = (
+        "Not positive definite:  Real part of spectrum was not all positive.")
+    return check_ops.assert_positive(
+        math_ops.real(self.spectrum), message=message)
+
+  def _assert_self_adjoint(self):
+    # Recall correspondence between symmetry and real transforms.  See docstring
+    return linear_operator_util.assert_zero_imag_part(
+        self.spectrum,
+        message=(
+            "Not self-adjoint:  The spectrum contained non-zero imaginary part."
+        ))
+
+  def _broadcast_batch_dims(self, x, spectrum):
+    """Broadcast batch dims of batch matrix `x` and spectrum."""
+    # spectrum.shape = batch_shape + block_shape
+    # First make spectrum a batch matrix with
+    #   spectrum.shape = batch_shape + [prod(block_shape), 1]
+    spec_mat = array_ops.reshape(
+        spectrum, array_ops.concat(
+            (self.batch_shape_tensor(), [-1, 1]), axis=0))
+    # Second, broadcast, possibly requiring an addition of array of zeros.
+    x, spec_mat = linear_operator_util.broadcast_matrix_batch_dims((x,
+                                                                    spec_mat))
+    # Third, put the block shape back into spectrum.
+    batch_shape = array_ops.shape(x)[:-2]
+    spectrum = array_ops.reshape(
+        spec_mat,
+        array_ops.concat((batch_shape, self.block_shape_tensor()), axis=0))
+
+    return x, spectrum
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    x = linalg.adjoint(x) if adjoint_arg else x
+    # With F the matrix of a DFT, and F^{-1}, F^H the inverse and Hermitian
+    # transpose, one can show that F^{-1} = F^{H} is the IDFT matrix.  Therefore
+    # matmul(x) = F^{-1} diag(spectrum) F x,
+    #           = F^{H} diag(spectrum) F x,
+    # so that
+    # matmul(x, adjoint=True) = F^{H} diag(conj(spectrum)) F x.
+    spectrum = self._conj_spectrum if adjoint else self._spectrum_complex
+
+    x, spectrum = self._broadcast_batch_dims(x, spectrum)
+
+    x_vb = self._vectorize_then_blockify(x)
+    fft_x_vb = self._fft(x_vb)
+    block_vector_result = self._ifft(spectrum * fft_x_vb)
+    y = self._unblockify_then_matricize(block_vector_result)
+
+    return math_ops.cast(y, self.dtype)
+
+  def _determinant(self):
+    reduction_indices = [-(i + 1) for i in range(self.block_depth)]
+    det = math_ops.reduce_prod(
+        self.spectrum, reduction_indices=reduction_indices)
+    return math_ops.cast(det, self.dtype)
+
+  def _log_abs_determinant(self):
+    reduction_indices = [-(i + 1) for i in range(self.block_depth)]
+    lad = math_ops.reduce_sum(
+        math_ops.log(self._abs_spectrum), reduction_indices=reduction_indices)
+    return math_ops.cast(lad, self.dtype)
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
+    spectrum = self._conj_spectrum if adjoint else self._spectrum_complex
+
+    rhs, spectrum = self._broadcast_batch_dims(rhs, spectrum)
+
+    rhs_vb = self._vectorize_then_blockify(rhs)
+    fft_rhs_vb = self._fft(rhs_vb)
+    solution_vb = self._ifft(fft_rhs_vb / spectrum)
+    x = self._unblockify_then_matricize(solution_vb)
+    return math_ops.cast(x, self.dtype)
+
+  def _diag_part(self):
+    # Get ones in shape of diag, which is [B1,...,Bb, N]
+    # Also get the size of the diag, "N".
+    if self.shape.is_fully_defined():
+      diag_shape = self.shape[:-1]
+      diag_size = self.domain_dimension.value
+    else:
+      diag_shape = self.shape_tensor()[:-1]
+      diag_size = self.domain_dimension_tensor()
+    ones_diag = array_ops.ones(diag_shape, dtype=self.dtype)
+
+    # As proved in comments in self._trace, the value on the diag is constant,
+    # repeated N times.  This value is the trace divided by N.
+
+    # The handling of self.shape = (0, 0) is tricky, and is the reason we choose
+    # to compute trace and use that to compute diag_part, rather than computing
+    # the value on the diagonal ("diag_value") directly.  Both result in a 0/0,
+    # but in different places, and the current method gives the right result in
+    # the end.
+
+    # Here, if self.shape = (0, 0), then self.trace() = 0., and then
+    # diag_value = 0. / 0. = NaN.
+    diag_value = self.trace() / math_ops.cast(diag_size, self.dtype)
+
+    # If self.shape = (0, 0), then ones_diag = [] (empty tensor), and then
+    # the following line is NaN * [] = [], as needed.
+    return diag_value[..., array_ops.newaxis] * ones_diag
+
+  def _trace(self):
+    # The diagonal of the [[nested] block] circulant operator is the mean of
+    # the spectrum.
+    # Proof:  For the [0,...,0] element, this follows from the IDFT formula.
+    # Then the result follows since all diagonal elements are the same.
+
+    # Therefore, the trace is the sum of the spectrum.
+
+    # Get shape of diag along with the axis over which to reduce the spectrum.
+    # We will reduce the spectrum over all block indices.
+    if self.spectrum.get_shape().is_fully_defined():
+      spec_rank = self.spectrum.get_shape().ndims
+      axis = np.arange(spec_rank - self.block_depth, spec_rank, dtype=np.int32)
+    else:
+      spec_rank = array_ops.rank(self.spectrum)
+      axis = math_ops.range(spec_rank - self.block_depth, spec_rank)
+
+    # Real diag part "re_d".
+    # Suppose spectrum.shape = [B1,...,Bb, N1, N2]
+    # self.shape = [B1,...,Bb, N, N], with N1 * N2 = N.
+    # re_d_value.shape = [B1,...,Bb]
+    re_d_value = math_ops.reduce_sum(math_ops.real(self.spectrum), axis=axis)
+
+    if not self.dtype.is_complex:
+      return math_ops.cast(re_d_value, self.dtype)
+
+    # Imaginary part, "im_d".
+    if self.is_self_adjoint:
+      im_d_value = 0.
+    else:
+      im_d_value = math_ops.reduce_sum(math_ops.imag(self.spectrum), axis=axis)
+
+    return math_ops.cast(math_ops.complex(re_d_value, im_d_value), self.dtype)
+
+
+@tf_export("linalg.LinearOperatorCirculant")
+class LinearOperatorCirculant(_BaseLinearOperatorCirculant):
+  """`LinearOperator` acting like a circulant matrix.
+
+  This operator acts like a circulant matrix `A` with
+  shape `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of circulant matrices
+
+  Circulant means the entries of `A` are generated by a single vector, the
+  convolution kernel `h`: `A_{mn} := h_{m-n mod N}`.  With `h = [w, x, y, z]`,
+
+  ```
+  A = |w z y x|
+      |x w z y|
+      |y x w z|
+      |z y x w|
+  ```
+
+  This means that the result of matrix multiplication `v = Au` has `Lth` column
+  given circular convolution between `h` with the `Lth` column of `u`.
+
+  See http://ee.stanford.edu/~gray/toeplitz.pdf
+
+  #### Description in terms of the frequency spectrum
+
+  There is an equivalent description in terms of the [batch] spectrum `H` and
+  Fourier transforms.  Here we consider `A.shape = [N, N]` and ignore batch
+  dimensions.  Define the discrete Fourier transform (DFT) and its inverse by
+
+  ```
+  DFT[ h[n] ] = H[k] := sum_{n = 0}^{N - 1} h_n e^{-i 2pi k n / N}
+  IDFT[ H[k] ] = h[n] = N^{-1} sum_{k = 0}^{N - 1} H_k e^{i 2pi k n / N}
+  ```
+
+  From these definitions, we see that
+
+  ```
+  H[0] = sum_{n = 0}^{N - 1} h_n
+  H[1] = "the first positive frequency"
+  H[N - 1] = "the first negative frequency"
+  ```
+
+  Loosely speaking, with `*` element-wise multiplication, matrix multiplication
+  is equal to the action of a Fourier multiplier: `A u = IDFT[ H * DFT[u] ]`.
+  Precisely speaking, given `[N, R]` matrix `u`, let `DFT[u]` be the `[N, R]`
+  matrix with `rth` column equal to the DFT of the `rth` column of `u`.
+  Define the `IDFT` similarly.
+  Matrix multiplication may be expressed columnwise:
+
+  ```(A u)_r = IDFT[ H * (DFT[u])_r ]```
+
+  #### Operator properties deduced from the spectrum.
+
+  Letting `U` be the `kth` Euclidean basis vector, and `U = IDFT[u]`.
+  The above formulas show that`A U = H_k * U`.  We conclude that the elements
+  of `H` are the eigenvalues of this operator.   Therefore
+
+  * This operator is positive definite if and only if `Real{H} > 0`.
+
+  A general property of Fourier transforms is the correspondence between
+  Hermitian functions and real valued transforms.
+
+  Suppose `H.shape = [B1,...,Bb, N]`.  We say that `H` is a Hermitian spectrum
+  if, with `%` meaning modulus division,
+
+  ```H[..., n % N] = ComplexConjugate[ H[..., (-n) % N] ]```
+
+  * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+  * This operator is self-adjoint if and only if `H` is real.
+
+  See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
+
+  #### Example of a self-adjoint positive definite operator
+
+  ```python
+  # spectrum is real ==> operator is self-adjoint
+  # spectrum is positive ==> operator is positive definite
+  spectrum = [6., 4, 2]
+
+  operator = LinearOperatorCirculant(spectrum)
+
+  # IFFT[spectrum]
+  operator.convolution_kernel()
+  ==> [4 + 0j, 1 + 0.58j, 1 - 0.58j]
+
+  operator.to_dense()
+  ==> [[4 + 0.0j, 1 - 0.6j, 1 + 0.6j],
+       [1 + 0.6j, 4 + 0.0j, 1 - 0.6j],
+       [1 - 0.6j, 1 + 0.6j, 4 + 0.0j]]
+  ```
+
+  #### Example of defining in terms of a real convolution kernel
+
+  ```python
+  # convolution_kernel is real ==> spectrum is Hermitian.
+  convolution_kernel = [1., 2., 1.]]
+  spectrum = tf.fft(tf.cast(convolution_kernel, tf.complex64))
+
+  # spectrum is Hermitian ==> operator is real.
+  # spectrum is shape [3] ==> operator is shape [3, 3]
+  # We force the input/output type to be real, which allows this to operate
+  # like a real matrix.
+  operator = LinearOperatorCirculant(spectrum, input_output_dtype=tf.float32)
+
+  operator.to_dense()
+  ==> [[ 1, 1, 2],
+       [ 2, 1, 1],
+       [ 1, 2, 1]]
+  ```
+
+  #### Example of Hermitian spectrum
+
+  ```python
+  # spectrum is shape [3] ==> operator is shape [3, 3]
+  # spectrum is Hermitian ==> operator is real.
+  spectrum = [1, 1j, -1j]
+
+  operator = LinearOperatorCirculant(spectrum)
+
+  operator.to_dense()
+  ==> [[ 0.33 + 0j,  0.91 + 0j, -0.24 + 0j],
+       [-0.24 + 0j,  0.33 + 0j,  0.91 + 0j],
+       [ 0.91 + 0j, -0.24 + 0j,  0.33 + 0j]
+  ```
+
+  #### Example of forcing real `dtype` when spectrum is Hermitian
+
+  ```python
+  # spectrum is shape [4] ==> operator is shape [4, 4]
+  # spectrum is real ==> operator is self-adjoint
+  # spectrum is Hermitian ==> operator is real
+  # spectrum has positive real part ==> operator is positive-definite.
+  spectrum = [6., 4, 2, 4]
+
+  # Force the input dtype to be float32.
+  # Cast the output to float32.  This is fine because the operator will be
+  # real due to Hermitian spectrum.
+  operator = LinearOperatorCirculant(spectrum, input_output_dtype=tf.float32)
+
+  operator.shape
+  ==> [4, 4]
+
+  operator.to_dense()
+  ==> [[4, 1, 0, 1],
+       [1, 4, 1, 0],
+       [0, 1, 4, 1],
+       [1, 0, 1, 4]]
+
+  # convolution_kernel = tf.ifft(spectrum)
+  operator.convolution_kernel()
+  ==> [4, 1, 0, 1]
+  ```
+
+  #### Performance
+
+  Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`,
+  and `x.shape = [N, R]`.  Then
+
+  * `operator.matmul(x)` is `O(R*N*Log[N])`
+  * `operator.solve(x)` is `O(R*N*Log[N])`
+  * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+  If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+  `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               spectrum,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant"):
+    r"""Initialize an `LinearOperatorCirculant`.
+
+    This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]`
+    by providing `spectrum`, a `[B1,...,Bb, N]` `Tensor`.
+
+    If `input_output_dtype = DTYPE`:
+
+    * Arguments to methods such as `matmul` or `solve` must be `DTYPE`.
+    * Values returned by all methods, such as `matmul` or `determinant` will be
+      cast to `DTYPE`.
+
+    Note that if the spectrum is not Hermitian, then this operator corresponds
+    to a complex matrix with non-zero imaginary part.  In this case, setting
+    `input_output_dtype` to a real type will forcibly cast the output to be
+    real, resulting in incorrect results!
+
+    If on the other hand the spectrum is Hermitian, then this operator
+    corresponds to a real-valued matrix, and setting `input_output_dtype` to
+    a real type is fine.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+    """
+    super(LinearOperatorCirculant, self).__init__(
+        spectrum,
+        block_depth=1,
+        input_output_dtype=input_output_dtype,
+        is_non_singular=is_non_singular,
+        is_self_adjoint=is_self_adjoint,
+        is_positive_definite=is_positive_definite,
+        is_square=is_square,
+        name=name)
+
+
+@tf_export("linalg.LinearOperatorCirculant2D")
+class LinearOperatorCirculant2D(_BaseLinearOperatorCirculant):
+  """`LinearOperator` acting like a block circulant matrix.
+
+  This operator acts like a block circulant matrix `A` with
+  shape `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of block circulant matrices
+
+  If `A` is block circulant, with block sizes `N0, N1` (`N0 * N1 = N`):
+  `A` has a block circulant structure, composed of `N0 x N0` blocks, with each
+  block an `N1 x N1` circulant matrix.
+
+  For example, with `W`, `X`, `Y`, `Z` each circulant,
+
+  ```
+  A = |W Z Y X|
+      |X W Z Y|
+      |Y X W Z|
+      |Z Y X W|
+  ```
+
+  Note that `A` itself will not in general be circulant.
+
+  #### Description in terms of the frequency spectrum
+
+  There is an equivalent description in terms of the [batch] spectrum `H` and
+  Fourier transforms.  Here we consider `A.shape = [N, N]` and ignore batch
+  dimensions.
+
+  If `H.shape = [N0, N1]`, (`N0 * N1 = N`):
+  Loosely speaking, matrix multiplication is equal to the action of a
+  Fourier multiplier:  `A u = IDFT2[ H DFT2[u] ]`.
+  Precisely speaking, given `[N, R]` matrix `u`, let `DFT2[u]` be the
+  `[N0, N1, R]` `Tensor` defined by re-shaping `u` to `[N0, N1, R]` and taking
+  a two dimensional DFT across the first two dimensions.  Let `IDFT2` be the
+  inverse of `DFT2`.  Matrix multiplication may be expressed columnwise:
+
+  ```(A u)_r = IDFT2[ H * (DFT2[u])_r ]```
+
+  #### Operator properties deduced from the spectrum.
+
+  * This operator is positive definite if and only if `Real{H} > 0`.
+
+  A general property of Fourier transforms is the correspondence between
+  Hermitian functions and real valued transforms.
+
+  Suppose `H.shape = [B1,...,Bb, N0, N1]`, we say that `H` is a Hermitian
+  spectrum if, with `%` indicating modulus division,
+
+  ```
+  H[..., n0 % N0, n1 % N1] = ComplexConjugate[ H[..., (-n0) % N0, (-n1) % N1 ].
+  ```
+
+  * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+  * This operator is self-adjoint if and only if `H` is real.
+
+  See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
+
+  ### Example of a self-adjoint positive definite operator
+
+  ```python
+  # spectrum is real ==> operator is self-adjoint
+  # spectrum is positive ==> operator is positive definite
+  spectrum = [[1., 2., 3.],
+              [4., 5., 6.],
+              [7., 8., 9.]]
+
+  operator = LinearOperatorCirculant2D(spectrum)
+
+  # IFFT[spectrum]
+  operator.convolution_kernel()
+  ==> [[5.0+0.0j, -0.5-.3j, -0.5+.3j],
+       [-1.5-.9j,        0,        0],
+       [-1.5+.9j,        0,        0]]
+
+  operator.to_dense()
+  ==> Complex self adjoint 9 x 9 matrix.
+  ```
+
+  #### Example of defining in terms of a real convolution kernel,
+
+  ```python
+  # convolution_kernel is real ==> spectrum is Hermitian.
+  convolution_kernel = [[1., 2., 1.], [5., -1., 1.]]
+  spectrum = tf.fft2d(tf.cast(convolution_kernel, tf.complex64))
+
+  # spectrum is shape [2, 3] ==> operator is shape [6, 6]
+  # spectrum is Hermitian ==> operator is real.
+  operator = LinearOperatorCirculant2D(spectrum, input_output_dtype=tf.float32)
+  ```
+
+  #### Performance
+
+  Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`,
+  and `x.shape = [N, R]`.  Then
+
+  * `operator.matmul(x)` is `O(R*N*Log[N])`
+  * `operator.solve(x)` is `O(R*N*Log[N])`
+  * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+  If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+  `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               spectrum,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant2D"):
+    r"""Initialize an `LinearOperatorCirculant2D`.
+
+    This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]`
+    by providing `spectrum`, a `[B1,...,Bb, N0, N1]` `Tensor` with `N0*N1 = N`.
+
+    If `input_output_dtype = DTYPE`:
+
+    * Arguments to methods such as `matmul` or `solve` must be `DTYPE`.
+    * Values returned by all methods, such as `matmul` or `determinant` will be
+      cast to `DTYPE`.
+
+    Note that if the spectrum is not Hermitian, then this operator corresponds
+    to a complex matrix with non-zero imaginary part.  In this case, setting
+    `input_output_dtype` to a real type will forcibly cast the output to be
+    real, resulting in incorrect results!
+
+    If on the other hand the spectrum is Hermitian, then this operator
+    corresponds to a real-valued matrix, and setting `input_output_dtype` to
+    a real type is fine.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+    """
+    super(LinearOperatorCirculant2D, self).__init__(
+        spectrum,
+        block_depth=2,
+        input_output_dtype=input_output_dtype,
+        is_non_singular=is_non_singular,
+        is_self_adjoint=is_self_adjoint,
+        is_positive_definite=is_positive_definite,
+        is_square=is_square,
+        name=name)
+
+
+@tf_export("linalg.LinearOperatorCirculant3D")
+class LinearOperatorCirculant3D(_BaseLinearOperatorCirculant):
+  """`LinearOperator` acting like a nested block circulant matrix.
+
+  This operator acts like a block circulant matrix `A` with
+  shape `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of block circulant matrices
+
+  If `A` is nested block circulant, with block sizes `N0, N1, N2`
+  (`N0 * N1 * N2 = N`):
+  `A` has a block structure, composed of `N0 x N0` blocks, with each
+  block an `N1 x N1` block circulant matrix.
+
+  For example, with `W`, `X`, `Y`, `Z` each block circulant,
+
+  ```
+  A = |W Z Y X|
+      |X W Z Y|
+      |Y X W Z|
+      |Z Y X W|
+  ```
+
+  Note that `A` itself will not in general be circulant.
+
+  #### Description in terms of the frequency spectrum
+
+  There is an equivalent description in terms of the [batch] spectrum `H` and
+  Fourier transforms.  Here we consider `A.shape = [N, N]` and ignore batch
+  dimensions.
+
+  If `H.shape = [N0, N1, N2]`, (`N0 * N1 * N2 = N`):
+  Loosely speaking, matrix multiplication is equal to the action of a
+  Fourier multiplier:  `A u = IDFT3[ H DFT3[u] ]`.
+  Precisely speaking, given `[N, R]` matrix `u`, let `DFT3[u]` be the
+  `[N0, N1, N2, R]` `Tensor` defined by re-shaping `u` to `[N0, N1, N2, R]` and
+  taking a three dimensional DFT across the first three dimensions.  Let `IDFT3`
+  be the inverse of `DFT3`.  Matrix multiplication may be expressed columnwise:
+
+  ```(A u)_r = IDFT3[ H * (DFT3[u])_r ]```
+
+  #### Operator properties deduced from the spectrum.
+
+  * This operator is positive definite if and only if `Real{H} > 0`.
+
+  A general property of Fourier transforms is the correspondence between
+  Hermitian functions and real valued transforms.
+
+  Suppose `H.shape = [B1,...,Bb, N0, N1, N2]`, we say that `H` is a Hermitian
+  spectrum if, with `%` meaning modulus division,
+
+  ```
+  H[..., n0 % N0, n1 % N1, n2 % N2]
+    = ComplexConjugate[ H[..., (-n0) % N0, (-n1) % N1, (-n2) % N2] ].
+  ```
+
+  * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+  * This operator is self-adjoint if and only if `H` is real.
+
+  See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
+
+  ### Examples
+
+  See `LinearOperatorCirculant` and `LinearOperatorCirculant2D` for examples.
+
+  #### Performance
+
+  Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`,
+  and `x.shape = [N, R]`.  Then
+
+  * `operator.matmul(x)` is `O(R*N*Log[N])`
+  * `operator.solve(x)` is `O(R*N*Log[N])`
+  * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+  If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+  `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               spectrum,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant3D"):
+    """Initialize an `LinearOperatorCirculant`.
+
+    This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]`
+    by providing `spectrum`, a `[B1,...,Bb, N0, N1, N2]` `Tensor`
+    with `N0*N1*N2 = N`.
+
+    If `input_output_dtype = DTYPE`:
+
+    * Arguments to methods such as `matmul` or `solve` must be `DTYPE`.
+    * Values returned by all methods, such as `matmul` or `determinant` will be
+      cast to `DTYPE`.
+
+    Note that if the spectrum is not Hermitian, then this operator corresponds
+    to a complex matrix with non-zero imaginary part.  In this case, setting
+    `input_output_dtype` to a real type will forcibly cast the output to be
+    real, resulting in incorrect results!
+
+    If on the other hand the spectrum is Hermitian, then this operator
+    corresponds to a real-valued matrix, and setting `input_output_dtype` to
+    a real type is fine.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the real part of all eigenvalues is positive.  We do not require
+        the operator to be self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+    """
+    super(LinearOperatorCirculant3D, self).__init__(
+        spectrum,
+        block_depth=3,
+        input_output_dtype=input_output_dtype,
+        is_non_singular=is_non_singular,
+        is_self_adjoint=is_self_adjoint,
+        is_positive_definite=is_positive_definite,
+        is_square=is_square,
+        name=name)
+
+
+def _to_complex(x):
+  return math_ops.cast(x, _DTYPE_COMPLEX)
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index ecd30e4d7e4dd7cfd4b109ad6e60aacb172700f6..0292bc51dcf9809941087dd4aa1ea4c760c064d1 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -134,8 +134,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.  Default is the individual
         operators names joined with `_o_`.
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index b3ec3d5b7cf45ac0b2672eea9a4586b2c3295897..5beaea65a5171ad7e92042a2afa81c0507e51d0e 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -67,7 +67,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   operator = LinearOperatorDiag(diag)
 
   # Create a shape [2, 1, 4, 2] vector.  Note that this shape is compatible
-  # since the batch dimensions, [2, 1], are brodcast to
+  # since the batch dimensions, [2, 1], are broadcast to
   # operator.batch_shape = [2, 3].
   y = tf.random_normal(shape=[2, 1, 4, 2])
   x = operator.solve(y)
@@ -132,8 +132,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index f979fb37d6c69a2683af08a1f6722b98da0b6650..746da8df1ce957e86bc2e730b5709a699adbf612 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorFullMatrix"]
@@ -125,8 +125,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
@@ -177,7 +176,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
     return array_ops.shape(self._matrix)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return math_ops.matmul(
+    return linear_operator_util.matmul_with_broadcast(
         self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 50f3d407e85e4cca22ad6326931b5a2a736819a8..45929eb4e2e91218784a9fabba23b57851ae3cc8 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -236,8 +236,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
@@ -576,8 +575,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index be911029095920d424ac90b406e7b85b73884b3b..08e5896e1034fb1782beacfb18fef16da083bded 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_diag
 from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -365,14 +366,17 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     leading_term = l.matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
     if adjoint:
-      uh_x = math_ops.matmul(u, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      uh_x = linear_operator_util.matmul_with_broadcast(
+          u, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_uh_x = d.matmul(uh_x, adjoint=adjoint)
-      v_d_uh_x = math_ops.matmul(v, d_uh_x)
+      v_d_uh_x = linear_operator_util.matmul_with_broadcast(
+          v, d_uh_x)
       return leading_term + v_d_uh_x
     else:
-      vh_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      vh_x = linear_operator_util.matmul_with_broadcast(
+          v, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_vh_x = d.matmul(vh_x, adjoint=adjoint)
-      u_d_vh_x = math_ops.matmul(u, d_vh_x)
+      u_d_vh_x = linear_operator_util.matmul_with_broadcast(u, d_vh_x)
       return leading_term + u_d_vh_x
 
   def _determinant(self):
@@ -431,16 +435,18 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     # L^{-1} rhs
     linv_rhs = l.solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
     # V^H L^{-1} rhs
-    vh_linv_rhs = math_ops.matmul(v, linv_rhs, adjoint_a=True)
+    vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
+        v, linv_rhs, adjoint_a=True)
     # C^{-1} V^H L^{-1} rhs
     if self._use_cholesky:
-      capinv_vh_linv_rhs = linalg_ops.cholesky_solve(
+      capinv_vh_linv_rhs = linear_operator_util.cholesky_solve_with_broadcast(
           self._chol_capacitance, vh_linv_rhs)
     else:
-      capinv_vh_linv_rhs = linalg_ops.matrix_solve(
+      capinv_vh_linv_rhs = linear_operator_util.matrix_solve_with_broadcast(
           self._capacitance, vh_linv_rhs, adjoint=adjoint)
     # U C^{-1} V^H M^{-1} rhs
-    u_capinv_vh_linv_rhs = math_ops.matmul(u, capinv_vh_linv_rhs)
+    u_capinv_vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
+        u, capinv_vh_linv_rhs)
     # L^{-1} U C^{-1} V^H L^{-1} rhs
     linv_u_capinv_vh_linv_rhs = l.solve(u_capinv_vh_linv_rhs, adjoint=adjoint)
 
@@ -454,7 +460,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     # L^{-1} U
     linv_u = self.base_operator.solve(self.u)
     # V^H L^{-1} U
-    vh_linv_u = math_ops.matmul(self.v, linv_u, adjoint_a=True)
+    vh_linv_u = linear_operator_util.matmul_with_broadcast(
+        self.v, linv_u, adjoint_a=True)
 
     # D^{-1} + V^H L^{-1} V
     capacitance = self._diag_inv_operator.add_to_tensor(vh_linv_u)
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index a5130188b681813e1ccd4818dabdffeeb663e20a..fb1eb2fedba5b47ce38f9635527b91e18d894a8f 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
@@ -133,8 +132,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
@@ -195,7 +193,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         message="Singular operator:  Diagonal contained zero values.")
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return math_ops.matmul(
+    return linear_operator_util.matmul_with_broadcast(
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
@@ -207,7 +205,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
-    return linalg_ops.matrix_triangular_solve(
+    return linear_operator_util.matrix_triangular_solve_with_broadcast(
         self._tril, rhs, lower=True, adjoint=adjoint)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 2c11f90e6d9de280e6020edfaa4d8ef237126705..7e4fb6a6fc31960d570c78398ae211ba45a4a4a7 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -32,9 +32,22 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
 
 
+class OperatorBuildInfo(object):
+  """Object encoding expected shape for a test.
+
+  Encodes the expected shape of a matrix for a test. Also
+  allows additional metadata for the test harness.
+  """
+
+  def __init__(self, shape, **kwargs):
+    self.shape = shape
+    self.__dict__.update(kwargs)
+
+
 @six.add_metaclass(abc.ABCMeta)  # pylint: disable=no-init
 class LinearOperatorDerivedClassTest(test.TestCase):
   """Tests for derived classes.
@@ -84,19 +97,20 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     return [False, True]
 
   @abc.abstractproperty
-  def _shapes_to_test(self):
-    """Returns list of tuples, each is one shape that will be tested."""
-    raise NotImplementedError("shapes_to_test has not been implemented.")
+  def _operator_build_infos(self):
+    """Returns list of OperatorBuildInfo, encapsulating the shape to test."""
+    raise NotImplementedError("operator_build_infos has not been implemented.")
 
   @abc.abstractmethod
-  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
     """Build a batch matrix and an Operator that should have similar behavior.
 
     Every operator acts like a (batch) matrix.  This method returns both
     together, and is used by tests.
 
     Args:
-      shape:  List-like of Python integers giving full shape of operator.
+      build_info: `OperatorBuildInfo`, encoding shape information about the
+        operator.
       dtype:  Numpy dtype.  Data type of returned array/operator.
       use_placeholder:  Python bool.  If True, initialize the operator with a
         placeholder of undefined shape and correct dtype.
@@ -113,13 +127,16 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("Not implemented yet.")
 
   @abc.abstractmethod
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     """Make a rhs appropriate for calling operator.solve(rhs).
 
     Args:
       operator:  A `LinearOperator`
       adjoint:  Python `bool`.  If `True`, we are making a 'rhs' value for the
         adjoint operator.
+      with_batch: Python `bool`. If `True`, create `rhs` with the same batch
+        shape as operator, and otherwise create a matrix without any batch
+        shape.
 
     Returns:
       A `Tensor`
@@ -127,13 +144,15 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("_make_rhs is not defined.")
 
   @abc.abstractmethod
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     """Make an 'x' appropriate for calling operator.matmul(x).
 
     Args:
       operator:  A `LinearOperator`
       adjoint:  Python `bool`.  If `True`, we are making an 'x' value for the
         adjoint operator.
+      with_batch: Python `bool`. If `True`, create `x` with the same batch shape
+        as operator, and otherwise create a matrix without any batch shape.
 
     Returns:
       A `Tensor`
@@ -164,30 +183,30 @@ class LinearOperatorDerivedClassTest(test.TestCase):
   def test_to_dense(self):
     self._skip_if_tests_to_skip_contains("to_dense")
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                shape, dtype, use_placeholder=use_placeholder)
+                build_info, dtype, use_placeholder=use_placeholder)
             op_dense = operator.to_dense()
             if not use_placeholder:
-              self.assertAllEqual(shape, op_dense.get_shape())
+              self.assertAllEqual(build_info.shape, op_dense.get_shape())
             op_dense_v, mat_v = sess.run([op_dense, mat], feed_dict=feed_dict)
             self.assertAC(op_dense_v, mat_v)
 
   def test_det(self):
     self._skip_if_tests_to_skip_contains("det")
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                shape, dtype, use_placeholder=use_placeholder)
+                build_info, dtype, use_placeholder=use_placeholder)
             op_det = operator.determinant()
             if not use_placeholder:
-              self.assertAllEqual(shape[:-2], op_det.get_shape())
+              self.assertAllEqual(build_info.shape[:-2], op_det.get_shape())
             op_det_v, mat_det_v = sess.run(
                 [op_det, linalg_ops.matrix_determinant(mat)],
                 feed_dict=feed_dict)
@@ -196,32 +215,39 @@ class LinearOperatorDerivedClassTest(test.TestCase):
   def test_log_abs_det(self):
     self._skip_if_tests_to_skip_contains("log_abs_det")
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                shape, dtype, use_placeholder=use_placeholder)
+                build_info, dtype, use_placeholder=use_placeholder)
             op_log_abs_det = operator.log_abs_determinant()
             _, mat_log_abs_det = linalg.slogdet(mat)
             if not use_placeholder:
-              self.assertAllEqual(shape[:-2], op_log_abs_det.get_shape())
+              self.assertAllEqual(
+                  build_info.shape[:-2], op_log_abs_det.get_shape())
             op_log_abs_det_v, mat_log_abs_det_v = sess.run(
                 [op_log_abs_det, mat_log_abs_det], feed_dict=feed_dict)
             self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
 
-  def test_matmul(self):
-    self._skip_if_tests_to_skip_contains("matmul")
+  def _test_matmul(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
+        # If batch dimensions are omitted, but there are
+        # no batch dimensions for the linear operator, then
+        # skip the test case. This is already checked with
+        # with_batch=True.
+        if not with_batch and len(build_info.shape) <= 2:
+          continue
         for dtype in self._dtypes_to_test:
           for adjoint in self._adjoint_options:
             for adjoint_arg in self._adjoint_arg_options:
               with self.test_session(graph=ops.Graph()) as sess:
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                    shape, dtype, use_placeholder=use_placeholder)
-                x = self._make_x(operator, adjoint=adjoint)
+                    build_info, dtype, use_placeholder=use_placeholder)
+                x = self._make_x(
+                    operator, adjoint=adjoint, with_batch=with_batch)
                 # If adjoint_arg, compute A X^H^H = A X.
                 if adjoint_arg:
                   op_matmul = operator.matmul(
@@ -230,7 +256,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                       adjoint_arg=adjoint_arg)
                 else:
                   op_matmul = operator.matmul(x, adjoint=adjoint)
-                mat_matmul = math_ops.matmul(mat, x, adjoint_a=adjoint)
+                mat_matmul = linear_operator_util.matmul_with_broadcast(
+                    mat, x, adjoint_a=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(op_matmul.get_shape(),
                                       mat_matmul.get_shape())
@@ -238,18 +265,32 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_matmul, mat_matmul], feed_dict=feed_dict)
                 self.assertAC(op_matmul_v, mat_matmul_v)
 
-  def test_solve(self):
-    self._skip_if_tests_to_skip_contains("solve")
+  def test_matmul(self):
+    self._skip_if_tests_to_skip_contains("matmul")
+    self._test_matmul(with_batch=True)
+
+  def test_matmul_with_broadcast(self):
+    self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
+    self._test_matmul(with_batch=False)
+
+  def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
+        # If batch dimensions are omitted, but there are
+        # no batch dimensions for the linear operator, then
+        # skip the test case. This is already checked with
+        # with_batch=True.
+        if not with_batch and len(build_info.shape) <= 2:
+          continue
         for dtype in self._dtypes_to_test:
           for adjoint in self._adjoint_options:
             for adjoint_arg in self._adjoint_arg_options:
               with self.test_session(graph=ops.Graph()) as sess:
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                    shape, dtype, use_placeholder=use_placeholder)
-                rhs = self._make_rhs(operator, adjoint=adjoint)
+                    build_info, dtype, use_placeholder=use_placeholder)
+                rhs = self._make_rhs(
+                    operator, adjoint=adjoint, with_batch=with_batch)
                 # If adjoint_arg, solve A X = (rhs^H)^H = rhs.
                 if adjoint_arg:
                   op_solve = operator.solve(
@@ -259,7 +300,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 else:
                   op_solve = operator.solve(
                       rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
-                mat_solve = linalg_ops.matrix_solve(mat, rhs, adjoint=adjoint)
+                mat_solve = linear_operator_util.matrix_solve_with_broadcast(
+                    mat, rhs, adjoint=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(op_solve.get_shape(),
                                       mat_solve.get_shape())
@@ -267,15 +309,23 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_solve, mat_solve], feed_dict=feed_dict)
                 self.assertAC(op_solve_v, mat_solve_v)
 
+  def test_solve(self):
+    self._skip_if_tests_to_skip_contains("solve")
+    self._test_solve(with_batch=True)
+
+  def test_solve_with_broadcast(self):
+    self._skip_if_tests_to_skip_contains("solve_with_broadcast")
+    self._test_solve(with_batch=False)
+
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                shape, dtype, use_placeholder=use_placeholder)
+                build_info, dtype, use_placeholder=use_placeholder)
             op_trace = operator.trace()
             mat_trace = math_ops.trace(mat)
             if not use_placeholder:
@@ -287,16 +337,16 @@ class LinearOperatorDerivedClassTest(test.TestCase):
   def test_add_to_tensor(self):
     self._skip_if_tests_to_skip_contains("add_to_tensor")
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                shape, dtype, use_placeholder=use_placeholder)
+                build_info, dtype, use_placeholder=use_placeholder)
             op_plus_2mat = operator.add_to_tensor(2 * mat)
 
             if not use_placeholder:
-              self.assertAllEqual(shape, op_plus_2mat.get_shape())
+              self.assertAllEqual(build_info.shape, op_plus_2mat.get_shape())
 
             op_plus_2mat_v, mat_v = sess.run(
                 [op_plus_2mat, mat], feed_dict=feed_dict)
@@ -306,12 +356,12 @@ class LinearOperatorDerivedClassTest(test.TestCase):
   def test_diag_part(self):
     self._skip_if_tests_to_skip_contains("diag_part")
     for use_placeholder in self._use_placeholder_options:
-      for shape in self._shapes_to_test:
+      for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                shape, dtype, use_placeholder=use_placeholder)
+                build_info, dtype, use_placeholder=use_placeholder)
             op_diag_part = operator.diag_part()
             mat_diag_part = array_ops.matrix_diag_part(mat)
 
@@ -334,17 +384,23 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   """
 
   @property
-  def _shapes_to_test(self):
+  def _operator_build_infos(self):
+    build_info = OperatorBuildInfo
     # non-batch operators (n, n) and batch operators.
-    return [(0, 0), (1, 1), (1, 3, 3), (3, 4, 4), (2, 1, 4, 4)]
-
-  def _make_rhs(self, operator, adjoint):
+    return [
+        build_info((0, 0)),
+        build_info((1, 1)),
+        build_info((1, 3, 3)),
+        build_info((3, 4, 4)),
+        build_info((2, 1, 4, 4))]
+
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     # This operator is square, so rhs and x will have same shape.
     # adjoint value makes no difference because the operator shape doesn't
     # change since it is square, but be pedantic.
-    return self._make_x(operator, adjoint=not adjoint)
+    return self._make_x(operator, adjoint=not adjoint, with_batch=with_batch)
 
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     # Value of adjoint makes no difference because the operator is square.
     # Return the number of systems to solve, R, equal to 1 or 2.
     r = self._get_num_systems(operator)
@@ -353,11 +409,17 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
     if operator.shape.is_fully_defined():
       batch_shape = operator.batch_shape.as_list()
       n = operator.domain_dimension.value
-      x_shape = batch_shape + [n, r]
+      if with_batch:
+        x_shape = batch_shape + [n, r]
+      else:
+        x_shape = [n, r]
     else:
       batch_shape = operator.batch_shape_tensor()
       n = operator.domain_dimension_tensor()
-      x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      if with_batch:
+        x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      else:
+        x_shape = [n, r]
 
     return random_normal(x_shape, dtype=operator.dtype)
 
@@ -384,19 +446,25 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "det", "log_abs_det"]
+    return ["solve", "solve_with_broadcast", "det", "log_abs_det"]
 
   @property
-  def _shapes_to_test(self):
+  def _operator_build_infos(self):
+    build_info = OperatorBuildInfo
     # non-batch operators (n, n) and batch operators.
-    return [(2, 1), (1, 2), (1, 3, 2), (3, 3, 4), (2, 1, 2, 4)]
-
-  def _make_rhs(self, operator, adjoint):
+    return [
+        build_info((2, 1)),
+        build_info((1, 2)),
+        build_info((1, 3, 2)),
+        build_info((3, 3, 4)),
+        build_info((2, 1, 2, 4))]
+
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     # TODO(langmore) Add once we're testing solve_ls.
     raise NotImplementedError(
         "_make_rhs not implemented because we don't test solve")
 
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     # Return the number of systems for the argument 'x' for .matmul(x)
     r = self._get_num_systems(operator)
     # If operator.shape = [B1,...,Bb, M, N] this returns a random matrix of
@@ -407,14 +475,20 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         n = operator.range_dimension.value
       else:
         n = operator.domain_dimension.value
-      x_shape = batch_shape + [n, r]
+      if with_batch:
+        x_shape = batch_shape + [n, r]
+      else:
+        x_shape = [n, r]
     else:
       batch_shape = operator.batch_shape_tensor()
       if adjoint:
         n = operator.range_dimension_tensor()
       else:
         n = operator.domain_dimension_tensor()
-      x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      if with_batch:
+        x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      else:
+        x_shape = [n, r]
 
     return random_normal(x_shape, dtype=operator.dtype)
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 427bd1e890305618264a6a588be4e4ffade33c01..9dd40765c20222c6998260547b7e8fa341e65437 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -102,6 +103,22 @@ def assert_is_batch_matrix(tensor):
         "%s" % tensor)
 
 
+def shape_tensor(shape, name=None):
+  """Convert Tensor using default type, unless empty list or tuple."""
+  # Works just like random_ops._ShapeTensor.
+  if isinstance(shape, (tuple, list)) and not shape:
+    dtype = dtypes.int32
+  else:
+    dtype = None
+  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+
+
+################################################################################
+# Broadcasting versions of common linear algebra functions.
+# TODO(b/77519145) Do this more efficiently in some special cases.
+################################################################################
+
+
 def broadcast_matrix_batch_dims(batch_matrices, name=None):
   """Broadcast leading dimensions of zero or more [batch] matrices.
 
@@ -170,7 +187,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = batch_matrices[0].get_shape()[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_static_shape(
-          bcast_batch_shape, mat.get_shape()[:-2])
+          bcast_batch_shape,
+          mat.get_shape()[:-2])
     if bcast_batch_shape.is_fully_defined():
       # The [1, 1] at the end will broadcast with anything.
       bcast_shape = bcast_batch_shape.concatenate([1, 1])
@@ -183,7 +201,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = array_ops.shape(batch_matrices[0])[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_dynamic_shape(
-          bcast_batch_shape, array_ops.shape(mat)[:-2])
+          bcast_batch_shape,
+          array_ops.shape(mat)[:-2])
     bcast_shape = array_ops.concat([bcast_batch_shape, [1, 1]], axis=0)
     for i, mat in enumerate(batch_matrices):
       batch_matrices[i] = _broadcast_to_shape(mat, bcast_shape)
@@ -195,6 +214,13 @@ def _broadcast_to_shape(x, shape):
   return x + array_ops.zeros(shape=shape, dtype=x.dtype)
 
 
+def cholesky_solve_with_broadcast(chol, rhs, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "CholeskySolveWithBroadcast", [chol, rhs]):
+    chol, rhs = broadcast_matrix_batch_dims([chol, rhs])
+    return linalg_ops.cholesky_solve(chol, rhs)
+
+
 def matmul_with_broadcast(a,
                           b,
                           transpose_a=False,
@@ -206,6 +232,11 @@ def matmul_with_broadcast(a,
                           name=None):
   """Multiplies matrix `a` by matrix `b`, producing `a @ b`.
 
+  Works identically to `tf.matmul`, but broadcasts batch dims
+  of `a` and `b` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
   The inputs must be matrices (or tensors of rank > 2, representing batches of
   matrices).
 
@@ -276,7 +307,7 @@ def matmul_with_broadcast(a,
     ValueError: If transpose_a and adjoint_a, or transpose_b and adjoint_b
       are both set to True.
   """
-  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]) as name:
+  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]):
     a, b = broadcast_matrix_batch_dims([a, b])
     return math_ops.matmul(
         a,
@@ -289,11 +320,43 @@ def matmul_with_broadcast(a,
         b_is_sparse=b_is_sparse)
 
 
-def shape_tensor(shape, name=None):
-  """Convert Tensor using default type, unless empty list or tuple."""
-  # Works just like random_ops._ShapeTensor.
-  if isinstance(shape, (tuple, list)) and not shape:
-    dtype = dtypes.int32
-  else:
-    dtype = None
-  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+def matrix_solve_with_broadcast(matrix, rhs, adjoint=False, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "MatrixSolveWithBroadcast", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
+
+
+def matrix_triangular_solve_with_broadcast(matrix,
+                                           rhs,
+                                           lower=True,
+                                           adjoint=False,
+                                           name=None):
+  """Solves triangular systems of linear equations with by backsubstitution.
+
+  Works identically to `tf.matrix_triangular_solve`, but broadcasts batch dims
+  of `matrix` and `rhs` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
+  Args:
+    matrix: A Tensor. Must be one of the following types:
+      `float64`, `float32`, `complex64`, `complex128`. Shape is `[..., M, M]`.
+    rhs: A `Tensor`. Must have the same `dtype` as `matrix`.
+      Shape is `[..., M, K]`.
+    lower: An optional `bool`. Defaults to `True`. Indicates whether the
+      innermost matrices in `matrix` are lower or upper triangular.
+    adjoint: An optional `bool`. Defaults to `False`. Indicates whether to solve
+      with matrix or its (block-wise) adjoint.
+    name: A name for the operation (optional).
+
+  Returns:
+    `Tensor` with same `dtype` as `matrix` and shape `[..., M, K]`.
+  """
+  with ops.name_scope(name, "MatrixTriangularSolve", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_triangular_solve(
+        matrix,
+        rhs,
+        lower=lower,
+        adjoint=adjoint)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 9803eed6aefe072cbe0841dff2de3f640a440dd5..a0dfa543f9b3aee15f11b073dc683b1d2d14388f 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -24,12 +24,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -159,36 +160,11 @@ def eye(num_rows,
   Returns:
     A `Tensor` of shape `batch_shape + [num_rows, num_columns]`
   """
-  with ops.name_scope(
-      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
-    is_square = num_columns is None
-    batch_shape = [] if batch_shape is None else batch_shape
-    num_columns = num_rows if num_columns is None else num_columns
-    if isinstance(num_rows, ops.Tensor) or isinstance(
-        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
-      batch_shape = ops.convert_to_tensor(
-          batch_shape, name='shape', dtype=dtypes.int32)
-      diag_size = math_ops.minimum(num_rows, num_columns)
-      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
-      if not is_square:
-        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
-    else:
-      if not isinstance(num_rows, compat.integral_types) or not isinstance(
-          num_columns, compat.integral_types):
-        raise TypeError(
-            'num_rows and num_columns must be positive integer values.')
-      batch_shape = [dim for dim in batch_shape]
-      is_square = num_rows == num_columns
-      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
-      if not is_square:
-        shape = batch_shape + [num_rows, num_columns]
-
-    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
-    if is_square:
-      return array_ops.matrix_diag(diag_ones)
-    else:
-      zero_matrix = array_ops.zeros(shape, dtype=dtype)
-      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
+  return linalg_ops_impl.eye(num_rows,
+                             num_columns=num_columns,
+                             batch_shape=batch_shape,
+                             dtype=dtype,
+                             name=name)
 
 
 @tf_export('matrix_solve_ls', 'linalg.lstsq')
@@ -248,7 +224,7 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
     and l2_regularizer != 0 due to poor accuracy.
   """
 
-  # pylint: disable=protected-access,long-lambda
+  # pylint: disable=long-lambda
   def _use_composite_impl(fast, tensor_shape):
     """Determines whether to use the composite or specialized CPU kernel.
 
@@ -323,9 +299,8 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
   if _use_composite_impl(fast, tensor_shape):
     return _composite_impl(matrix, rhs, l2_regularizer)
   else:
-    return gen_linalg_ops._matrix_solve_ls(
+    return gen_linalg_ops.matrix_solve_ls(
         matrix, rhs, l2_regularizer, fast=fast, name=name)
-  # pylint: enable=protected-access
 
 
 @tf_export('self_adjoint_eig', 'linalg.eigh')
@@ -342,12 +317,11 @@ def self_adjoint_eig(tensor, name=None):
     name: string, optional name of the operation.
 
   Returns:
-    e: Eigenvalues. Shape is `[..., N]`.
+    e: Eigenvalues. Shape is `[..., N]`. Sorted in non-decreasing order.
     v: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most
       matrices contain eigenvectors of the corresponding matrices in `tensor`
   """
-  # pylint: disable=protected-access
-  e, v = gen_linalg_ops._self_adjoint_eig_v2(tensor, compute_v=True, name=name)
+  e, v = gen_linalg_ops.self_adjoint_eig_v2(tensor, compute_v=True, name=name)
   return e, v
 
 
@@ -369,8 +343,7 @@ def self_adjoint_eigvals(tensor, name=None):
     e: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N`
       eigenvalues of `tensor[..., :, :]`.
   """
-  # pylint: disable=protected-access
-  e, _ = gen_linalg_ops._self_adjoint_eig_v2(tensor, compute_v=False, name=name)
+  e, _ = gen_linalg_ops.self_adjoint_eig_v2(tensor, compute_v=False, name=name)
   return e
 
 
@@ -432,13 +405,11 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
   u, s, v_adj = np.linalg.svd(a, full_matrices=False)
   np_a_approx = np.dot(u, np.dot(np.diag(s), v_adj))
   # tf_a_approx and np_a_approx should be numerically close.
-  ````
+  ```
   @end_compatibility
   """
-  # pylint: disable=protected-access
-  s, u, v = gen_linalg_ops._svd(
+  s, u, v = gen_linalg_ops.svd(
       tensor, compute_uv=compute_uv, full_matrices=full_matrices, name=name)
-  # pylint: enable=protected-access
   if compute_uv:
     return math_ops.real(s), u, v
   else:
@@ -459,7 +430,7 @@ def norm(tensor,
 
   This function can compute several different vector norms (the 1-norm, the
   Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
-  matrix norms (Frobenius, 1-norm, and inf-norm).
+  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
 
   Args:
     tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
@@ -470,7 +441,7 @@ def norm(tensor,
       Some restrictions apply:
         a) The Frobenius norm `fro` is not defined for vectors,
         b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
-           `np.inf` are supported.
+           `2`, `np.inf` are supported.
       See the description of `axis` on how to compute norms for a batch of
       vectors or matrices stored in a tensor.
     axis: If `axis` is `None` (the default), the input is considered a vector
@@ -526,8 +497,7 @@ def norm(tensor,
         axis[0] == axis[1]):
       raise ValueError(
           "'axis' must be None, an integer, or a tuple of 2 unique integers")
-    # TODO(rmlarsen): Implement matrix 2-norm using tf.svd().
-    supported_matrix_norms = ['euclidean', 'fro', 1, np.inf]
+    supported_matrix_norms = ['euclidean', 'fro', 1, 2, np.inf]
     if ord not in supported_matrix_norms:
       raise ValueError("'ord' must be a supported matrix norm in %s, got %s" %
                        (supported_matrix_norms, ord))
@@ -544,12 +514,34 @@ def norm(tensor,
 
   with ops.name_scope(name, 'norm', [tensor]):
     tensor = ops.convert_to_tensor(tensor)
+
     if ord in ['fro', 'euclidean', 2, 2.0]:
-      # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for
-      # matrices.
-      result = math_ops.sqrt(
-          math_ops.reduce_sum(
-              tensor * math_ops.conj(tensor), axis, keepdims=True))
+      if is_matrix_norm and ord in [2, 2.0]:
+        rank = array_ops.rank(tensor)
+        positive_axis = functional_ops.map_fn(
+            lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank),
+            ops.convert_to_tensor(axis))
+        axes = math_ops.range(rank)
+        perm_before = array_ops.concat(
+            [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis],
+            axis=0)
+        perm_after = functional_ops.map_fn(
+            lambda i: math_ops.cast(
+                array_ops.squeeze(
+                    array_ops.where(math_ops.equal(perm_before, i))),
+                dtype=dtypes.int32), axes)
+        permed = array_ops.transpose(tensor, perm=perm_before)
+        matrix_2_norm = array_ops.expand_dims(
+            math_ops.reduce_max(
+                math_ops.abs(gen_linalg_ops.svd(permed, compute_uv=False)[0]),
+                axis=-1,
+                keepdims=True),
+            axis=-1)
+        result = array_ops.transpose(matrix_2_norm, perm=perm_after)
+      else:
+        result = math_ops.sqrt(
+            math_ops.reduce_sum(
+                tensor * math_ops.conj(tensor), axis, keepdims=True))
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
diff --git a/tensorflow/python/ops/linalg_ops_impl.py b/tensorflow/python/ops/linalg_ops_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7c89f6ae3e9c517920e6c9afce99a8b192be164
--- /dev/null
+++ b/tensorflow/python/ops/linalg_ops_impl.py
@@ -0,0 +1,73 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for linear algebra."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import compat
+
+# Names below are lower_case.
+# pylint: disable=invalid-name
+
+
+def eye(num_rows,
+        num_columns=None,
+        batch_shape=None,
+        dtype=dtypes.float32,
+        name=None):
+  """Construct an identity matrix, or a batch of matrices.
+
+  See `linalg_ops.eye`.
+  """
+  with ops.name_scope(
+      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
+    is_square = num_columns is None
+    batch_shape = [] if batch_shape is None else batch_shape
+    num_columns = num_rows if num_columns is None else num_columns
+    if isinstance(num_rows, ops.Tensor) or isinstance(
+        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
+      batch_shape = ops.convert_to_tensor(
+          batch_shape, name='shape', dtype=dtypes.int32)
+      diag_size = math_ops.minimum(num_rows, num_columns)
+      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
+      if not is_square:
+        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
+    else:
+      if not isinstance(num_rows, compat.integral_types) or not isinstance(
+          num_columns, compat.integral_types):
+        raise TypeError(
+            'num_rows and num_columns must be positive integer values.')
+      batch_shape = [dim for dim in batch_shape]
+      is_square = num_rows == num_columns
+      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
+      if not is_square:
+        shape = batch_shape + [num_rows, num_columns]
+
+    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
+    if is_square:
+      return array_ops.matrix_diag(diag_ones)
+    else:
+      zero_matrix = array_ops.zeros(shape, dtype=dtype)
+      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
+
+# pylint: enable=invalid-name,redefined-builtin
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index bba59ebcef9c7caf1a53d724767999ae7ac079e5..d9ede875301c52219cc1e3f05a892ee887a70e67 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -29,6 +29,10 @@ from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
 
 
+ops.NotDifferentiable("TensorListConcat")
+ops.NotDifferentiable("TensorListPushBackBatch")
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
@@ -54,8 +58,8 @@ def _TensorListStackGrad(unused_op, dtensor):
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape[0] is not None:
-    num_elements = op.inputs[0].shape[0]
+  if op.inputs[0].shape[0].value is not None:
+    num_elements = op.inputs[0].shape[0].value
   else:
     num_elements = None
   if dlist is None:
@@ -63,9 +67,10 @@ def _TensorListFromTensorGrad(op, dlist):
         element_dtype=op.inputs[0].dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype,
-      num_elements=num_elements)
+  tensor_grad = gen_list_ops.tensor_list_stack(
+      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+  shape_grad = None
+  return tensor_grad, shape_grad
 
 
 @ops.RegisterGradient("TensorListGetItem")
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index eadbc1b7c3b6e66aa76c9afd860b2274ac1976ae..222b8ebc9da6b076f012f8febbd50cc3c4c86c08 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -109,7 +109,7 @@ def histogram_summary(tag, values, collections=None, name=None):
     buffer.
   """
   with ops.name_scope(name, "HistogramSummary", [tag, values]) as scope:
-    val = gen_logging_ops._histogram_summary(
+    val = gen_logging_ops.histogram_summary(
         tag=tag, values=values, name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
@@ -170,7 +170,7 @@ def image_summary(tag, tensor, max_images=3, collections=None, name=None):
     buffer.
   """
   with ops.name_scope(name, "ImageSummary", [tag, tensor]) as scope:
-    val = gen_logging_ops._image_summary(
+    val = gen_logging_ops.image_summary(
         tag=tag, tensor=tensor, max_images=max_images, name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
@@ -226,11 +226,12 @@ def audio_summary(tag,
   with ops.name_scope(name, "AudioSummary", [tag, tensor]) as scope:
     sample_rate = ops.convert_to_tensor(sample_rate, dtype=dtypes.float32,
                                         name="sample_rate")
-    val = gen_logging_ops._audio_summary_v2(tag=tag,
-                                            tensor=tensor,
-                                            max_outputs=max_outputs,
-                                            sample_rate=sample_rate,
-                                            name=scope)
+    val = gen_logging_ops.audio_summary_v2(
+        tag=tag,
+        tensor=tensor,
+        max_outputs=max_outputs,
+        sample_rate=sample_rate,
+        name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
 
@@ -263,7 +264,7 @@ def merge_summary(inputs, collections=None, name=None):
     buffer resulting from the merging.
   """
   with ops.name_scope(name, "MergeSummary", inputs):
-    val = gen_logging_ops._merge_summary(inputs=inputs, name=name)
+    val = gen_logging_ops.merge_summary(inputs=inputs, name=name)
     _Collect(val, collections, [])
   return val
 
@@ -345,7 +346,7 @@ def scalar_summary(tags, values, collections=None, name=None):
     buffer.
   """
   with ops.name_scope(name, "ScalarSummary", [tags, values]) as scope:
-    val = gen_logging_ops._scalar_summary(tags=tags, values=values, name=scope)
+    val = gen_logging_ops.scalar_summary(tags=tags, values=values, name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
 
@@ -356,3 +357,4 @@ ops.NotDifferentiable("AudioSummary")
 ops.NotDifferentiable("AudioSummaryV2")
 ops.NotDifferentiable("MergeSummary")
 ops.NotDifferentiable("ScalarSummary")
+ops.NotDifferentiable("Timestamp")
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index f539a7bb68da57e31746bc80fb25339a03a4fafe..0e547689cc51857adb77791bfb94c2527cdffef2 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -157,10 +157,10 @@ class InitializableLookupTableBase(LookupInterface):
       default_value: The value to use if a key is missing in the table.
       initializer: The table initializer to use.
     """
-    if context.in_graph_mode():
-      name = table_ref.op.name.split("/")[-1]
-    else:
+    if context.executing_eagerly():
       name = context.context().scope_name
+    else:
+      name = table_ref.op.name.split("/")[-1]
     super(InitializableLookupTableBase,
           self).__init__(initializer.key_dtype, initializer.value_dtype,
                          name)
@@ -196,9 +196,7 @@ class InitializableLookupTableBase(LookupInterface):
     """
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as scope:
-      # pylint: disable=protected-access
-      return gen_lookup_ops._lookup_table_size_v2(self._table_ref, name=scope)
-      # pylint: enable=protected-access
+      return gen_lookup_ops.lookup_table_size_v2(self._table_ref, name=scope)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -227,10 +225,8 @@ class InitializableLookupTableBase(LookupInterface):
     with ops.name_scope(name, "%s_Lookup" % self._name,
                         (self._table_ref, key_tensor,
                          self._default_value)) as scope:
-      # pylint: disable=protected-access
-      values = gen_lookup_ops._lookup_table_find_v2(
+      values = gen_lookup_ops.lookup_table_find_v2(
           self._table_ref, key_tensor, self._default_value, name=scope)
-      # pylint: enable=protected-access
 
     values.set_shape(key_tensor.get_shape())
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -274,16 +270,34 @@ class HashTable(InitializableLookupTableBase):
     """
     with ops.name_scope(name, "hash_table", (initializer,
                                              default_value)) as scope:
-      # pylint: disable=protected-access
-      table_ref = gen_lookup_ops._hash_table_v2(
+      table_ref = gen_lookup_ops.hash_table_v2(
           shared_name=shared_name,
           key_dtype=initializer.key_dtype,
           value_dtype=initializer.value_dtype,
           name=scope)
-      # pylint: enable=protected-access
 
       super(HashTable, self).__init__(table_ref, default_value, initializer)
+      self._value_shape = self._default_value.get_shape()
+
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensors containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_Export" % self._name,
+                        [self._table_ref]) as name:
+      with ops.colocate_with(self._table_ref):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
+    exported_values.set_shape(exported_keys.get_shape().concatenate(
+        self._value_shape))
+    return exported_keys, exported_values
 
 class TableInitializerBase(object):
   """Base class for lookup table initializers."""
@@ -352,10 +366,8 @@ class KeyValueTensorInitializer(TableInitializerBase):
     with ops.name_scope(
         self._name, values=(table.table_ref, self._keys,
                             self._values)) as scope:
-      # pylint: disable=protected-access
-      init_op = gen_lookup_ops._initialize_table_v2(
+      init_op = gen_lookup_ops.initialize_table_v2(
           table.table_ref, self._keys, self._values, name=scope)
-      # pylint: enable=protected-access
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
 
@@ -518,8 +530,7 @@ class TextFileInitializer(TableInitializerBase):
                         (table.table_ref,)) as scope:
       filename = ops.convert_to_tensor(
           self._filename, dtypes.string, name="asset_filepath")
-      # pylint: disable=protected-access
-      init_op = gen_lookup_ops._initialize_table_from_text_file_v2(
+      init_op = gen_lookup_ops.initialize_table_from_text_file_v2(
           table.table_ref,
           filename,
           self._key_index,
@@ -527,11 +538,10 @@ class TextFileInitializer(TableInitializerBase):
           -1 if self._vocab_size is None else self._vocab_size,
           self._delimiter,
           name=scope)
-      # pylint: enable=protected-access
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     # If the filename tensor is anything other than a string constant (e.g., if
     # it is a placeholder) then it does not make sense to track it as an asset.
-    if context.in_graph_mode() and constant_op.is_constant(filename):
+    if not context.executing_eagerly() and constant_op.is_constant(filename):
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
     return init_op
 
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 07741e0c3c3ea8a9bb7d790b901e743907794dc0..4aea0265a72dcd2b2358f063fb0a51a5877076e7 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -43,15 +43,3 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/losses/losses.py b/tensorflow/python/ops/losses/losses.py
index 8532c19ad6b3348823cdc8b24f9fa301cea6d3b5..4681eb9b175a673a8181a357d7ebbaaace9564a2 100644
--- a/tensorflow/python/ops/losses/losses.py
+++ b/tensorflow/python/ops/losses/losses.py
@@ -15,36 +15,13 @@
 """Loss operations for use in neural networks.
 
 Note: All the losses are added to the `GraphKeys.LOSSES` collection by default.
-
-@@Reduction
-@@absolute_difference
-@@compute_weighted_loss
-@@cosine_distance
-@@hinge_loss
-@@huber_loss
-@@log_loss
-@@mean_pairwise_squared_error
-@@mean_squared_error
-@@sigmoid_cross_entropy
-@@softmax_cross_entropy
-@@sparse_softmax_cross_entropy
-
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
-from tensorflow.python.ops.losses import util
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.losses.losses_impl import *
 from tensorflow.python.ops.losses.util import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols,
-                    [sys.modules[__name__], util])
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index aa74e117640d971e38968efadb3c34d5ce3a6f97..9fc545c9678e7eb33a7ad35e2a84f890885e09af 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import confusion_matrix
@@ -28,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -134,6 +136,10 @@ def _num_present(losses, weights, per_batch=False):
       `per_batch` is `True`, the value is returned as a tensor of size
       `[batch_size]`. Otherwise, a single scalar tensor is returned.
   """
+  if ((isinstance(weights, float) and weights != 0.0) or
+      (context.executing_eagerly() and weights._rank() == 0  # pylint: disable=protected-access
+       and not math_ops.equal(weights, 0.0))):
+    return _num_elements(losses)
   with ops.name_scope(None, "num_present", (losses, weights)) as scope:
     weights = math_ops.to_float(weights)
     present = array_ops.where(
@@ -143,8 +149,10 @@ def _num_present(losses, weights, per_batch=False):
     present = weights_broadcast_ops.broadcast_weights(present, losses)
     if per_batch:
       return math_ops.reduce_sum(
-          present, axis=math_ops.range(1, array_ops.rank(present)),
-          keepdims=True, name=scope)
+          present,
+          axis=math_ops.range(1, array_ops.rank(present)),
+          keepdims=True,
+          name=scope)
     return math_ops.reduce_sum(present, name=scope)
 
 
@@ -187,6 +195,11 @@ def compute_weighted_loss(
   """
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
+    # Save the `reduction` argument for loss normalization when distributing
+    # to multiple towers.
+    # TODO(josh11b): Associate it with the returned op for more precision.
+    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
+
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, losses),)):
       losses = ops.convert_to_tensor(losses)
@@ -294,11 +307,8 @@ def cosine_distance(
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   if labels is None:
     raise ValueError("labels must not be None.")
@@ -421,8 +431,12 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
     # expression when abs_error == delta is 0 (for tf.maximum it would be 1).
     # This is necessary to avoid doubling the gradient, since there is already a
     # nonzero contribution to the gradient from the quadratic term.
-    linear = (abs_error - quadratic)
-    losses = 0.5 * quadratic * quadratic + delta * linear
+    linear = math_ops.subtract(abs_error, quadratic)
+    losses = math_ops.add(
+        math_ops.multiply(
+            ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
+            math_ops.multiply(quadratic, quadratic)),
+        math_ops.multiply(delta, linear))
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
@@ -542,7 +556,8 @@ def mean_pairwise_squared_error(
       reduction_indices = math_ops.range(1, array_ops.rank(diffs))
 
       sum_squares_diff_per_batch = math_ops.reduce_sum(
-          math_ops.square(diffs), reduction_indices=reduction_indices,
+          math_ops.square(diffs),
+          reduction_indices=reduction_indices,
           keepdims=True)
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
@@ -634,7 +649,7 @@ def sigmoid_cross_entropy(
 
   Args:
     multi_class_labels: `[batch_size, num_classes]` target integer labels in
-      `(0, 1)`.
+      `{0, 1}`.
     logits: Float `[batch_size, num_classes]` logits outputs of the network.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
@@ -679,7 +694,7 @@ def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
+  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
   then the loss is simply scaled by the given value. If `weights` is a
@@ -690,11 +705,16 @@ def softmax_cross_entropy(
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes
 
+  Note that `onehot_labels` and `logits` must have the same shape,
+  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
+  broadcastable to loss, whose shape is decided by the shape of `logits`.
+  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
+  a `Tensor` of shape `[batch_size]`.
+
   Args:
-    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: `[batch_size, num_classes]` logits outputs of the network .
-    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
-      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
+    onehot_labels: One-hot-encoded labels.
+    logits: Logits outputs of the network.
+    weights: Optional `Tensor` that is broadcastable to loss.
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index b835d963869704f053de6c2f8a75ae1fa72e6a5d..10646af8a983f149cf0620bf355cf0bc1fa697fb 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -12,16 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for manipulating the loss collections.
-
-
-@@add_loss
-@@get_losses
-@@get_regularization_loss
-@@get_regularization_losses
-@@get_total_loss
-
-"""
+"""Utilities for manipulating the loss collections."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
index 91e15b47b9400f29425af2f186c7c44ee6a5a622..6633565a649df50b7fe28997207a717708cba5a9 100644
--- a/tensorflow/python/ops/manip_ops.py
+++ b/tensorflow/python/ops/manip_ops.py
@@ -12,27 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Operators for manipulating tensors.
-
-@@roll
-"""
+"""Operators for manipulating tensors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
+@tf_export('manip.roll')
 def roll(input, shift, axis):  # pylint: disable=redefined-builtin
   return _gen_manip_ops.roll(input, shift, axis)
 
 
 roll.__doc__ = _gen_manip_ops.roll.__doc__
 # pylint: enable=protected-access
-
-_allowed_symbols = ['roll']
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 9e7f37d80fdd71e84516ab450d145d79519ae47a..02e07dc7b1f5fe6a671da967f6d07cef123d3d1e 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -35,6 +35,18 @@ def _safe_shape_div(x, y):
   return x // math_ops.maximum(y, 1)
 
 
+@ops.RegisterGradient("ArgMax")
+def _ArgMaxGrad(op, grad):
+  del op, grad
+  return [None, None]
+
+
+@ops.RegisterGradient("ArgMin")
+def _ArgMinGrad(op, grad):
+  del op, grad
+  return [None, None]
+
+
 @ops.RegisterGradient("Sum")
 def _SumGrad(op, grad):
   """Gradient for Sum."""
@@ -46,10 +58,18 @@ def _SumGrad(op, grad):
     if axes is not None:
       rank = len(input_0_shape)
       if np.array_equal(axes, np.arange(rank)):  # Reduce all dims.
-        grad = array_ops.reshape(grad, [1] * rank)
+        if context.executing_eagerly():
+          ctx = context.context()
+          new_shape = ctx.ones_rank_cache().get(rank)
+          if new_shape is None:
+            new_shape = constant_op.constant([1] * rank, dtype=dtypes.int32)
+            ctx.ones_rank_cache().put(rank, new_shape)
+        else:
+          new_shape = [1] * rank
+        grad = array_ops.reshape(grad, new_shape)
         # If shape is not fully defined (but rank is), we use Shape.
         if None not in input_0_shape:
-          input_shape = input_0_shape
+          input_shape = constant_op.constant(input_0_shape, dtype=dtypes.int32)
         else:
           input_shape = array_ops.shape(op.inputs[0])
         return [array_ops.tile(grad, input_shape), None]
@@ -382,16 +402,14 @@ def _NegGrad(_, grad):
 def _InvGrad(op, grad):
   """Returns -grad * (1 / x^2)."""
   y = op.outputs[0]  # y = 1 / x
-  # pylint: disable=protected-access
-  return gen_math_ops._reciprocal_grad(y, grad)
+  return gen_math_ops.reciprocal_grad(y, grad)
 
 
 @ops.RegisterGradient("Reciprocal")
 def _ReciprocalGrad(op, grad):
   """Returns -grad * (1 / x^2)."""
   y = op.outputs[0]  # y = 1 / x
-  # pylint: disable=protected-access
-  return gen_math_ops._reciprocal_grad(y, grad)
+  return gen_math_ops.reciprocal_grad(y, grad)
 
 
 @ops.RegisterGradient("InvGrad")
@@ -401,8 +419,7 @@ def _InvGradGrad(op, grad):
   with ops.control_dependencies([grad]):
     ca = math_ops.conj(op.inputs[0])
     cg = math_ops.conj(grad)
-    # pylint: disable=protected-access
-    return cg * -2.0 * b * ca, gen_math_ops._reciprocal_grad(ca, grad)
+    return cg * -2.0 * b * ca, gen_math_ops.reciprocal_grad(ca, grad)
 
 
 @ops.RegisterGradient("ReciprocalGrad")
@@ -412,8 +429,7 @@ def _ReciprocalGradGrad(op, grad):
   with ops.control_dependencies([grad]):
     ca = math_ops.conj(op.inputs[0])
     cg = math_ops.conj(grad)
-    # pylint: disable=protected-access
-    return cg * -2.0 * b * ca, gen_math_ops._reciprocal_grad(ca, grad)
+    return cg * -2.0 * b * ca, gen_math_ops.reciprocal_grad(ca, grad)
 
 
 @ops.RegisterGradient("Square")
@@ -422,15 +438,14 @@ def _SquareGrad(op, grad):
   # Added control dependencies to prevent 2*x from being computed too early.
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    return math_ops.multiply(grad, math_ops.multiply(x, 2.0))
+    y = constant_op.constant(2.0, dtype=x.dtype)
+    return math_ops.multiply(grad, math_ops.multiply(x, y))
 
 
 @ops.RegisterGradient("Sqrt")
 def _SqrtGrad(op, grad):
   y = op.outputs[0]  # y = x^(1/2)
-  # pylint: disable=protected-access
-  return gen_math_ops._sqrt_grad(y, grad)
-  # pylint: enable=protected-access
+  return gen_math_ops.sqrt_grad(y, grad)
 
 
 @ops.RegisterGradient("SqrtGrad")
@@ -446,9 +461,7 @@ def _SqrtGradGrad(op, grad):
 def _RsqrtGrad(op, grad):
   """Returns -0.5 * grad * conj(y)^3."""
   y = op.outputs[0]  # y = x^(-1/2)
-  # pylint: disable=protected-access
-  return gen_math_ops._rsqrt_grad(y, grad)
-  # pylint: enable=protected-access
+  return gen_math_ops.rsqrt_grad(y, grad)
 
 
 @ops.RegisterGradient("RsqrtGrad")
@@ -460,8 +473,7 @@ def _RsqrtGradGrad(op, grad):
     ca = math_ops.conj(a)
     cg = math_ops.conj(grad)
     grad_a = -1.5 * cg * b * math_ops.square(ca)
-    # pylint: disable=protected-access
-    grad_b = gen_math_ops._rsqrt_grad(ca, grad)
+    grad_b = gen_math_ops.rsqrt_grad(ca, grad)
     return grad_a, grad_b
 
 
@@ -526,8 +538,7 @@ def _TanhGrad(op, grad):
   y = op.outputs[0]  # y = tanh(x)
   with ops.control_dependencies([grad]):
     y = math_ops.conj(y)
-    # pylint: disable=protected-access
-    return gen_math_ops._tanh_grad(y, grad)
+    return gen_math_ops.tanh_grad(y, grad)
 
 
 @ops.RegisterGradient("Asinh")
@@ -565,8 +576,7 @@ def _TanhGradGrad(op, grad):
   with ops.control_dependencies([grad]):
     a = math_ops.conj(op.inputs[0])
     b = math_ops.conj(op.inputs[1])
-    # pylint: disable=protected-access
-    return grad * -2.0 * b * a, gen_math_ops._tanh_grad(a, grad)
+    return grad * -2.0 * b * a, gen_math_ops.tanh_grad(a, grad)
 
 
 @ops.RegisterGradient("Erf")
@@ -616,9 +626,7 @@ def _IgammaGrad(op, grad):
   x = op.inputs[1]
   sa = array_ops.shape(a)
   sx = array_ops.shape(x)
-  # pylint: disable=protected-access
-  unused_ra, rx = gen_array_ops._broadcast_gradient_args(sa, sx)
-  # pylint: enable=protected-access
+  unused_ra, rx = gen_array_ops.broadcast_gradient_args(sa, sx)
 
   # Perform operations in log space before summing, because Gamma(a)
   # and Gamma'(a) can grow large.
@@ -645,9 +653,7 @@ def _BetaincGrad(op, grad):
   # versa; so its sufficient to check against shape(a).
   sa = array_ops.shape(a)
   sx = array_ops.shape(x)
-  # pylint: disable=protected-access
-  _, rx = gen_array_ops._broadcast_gradient_args(sa, sx)
-  # pylint: enable=protected-access
+  _, rx = gen_array_ops.broadcast_gradient_args(sa, sx)
 
   # Perform operations in log space before summing, because terms
   # can grow large.
@@ -673,9 +679,7 @@ def _ZetaGrad(op, grad):
   # Broadcast gradients
   sx = array_ops.shape(x)
   sq = array_ops.shape(q)
-  # pylint: disable=protected-access
-  unused_rx, rq = gen_array_ops._broadcast_gradient_args(sx, sq)
-  # pylint: enable=protected-access
+  unused_rx, rq = gen_array_ops.broadcast_gradient_args(sx, sq)
   # Evaluate gradient
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
@@ -695,9 +699,7 @@ def _PolygammaGrad(op, grad):
   # Broadcast gradients
   sn = array_ops.shape(n)
   sx = array_ops.shape(x)
-  # pylint: disable=protected-access
-  unused_rn, rx = gen_array_ops._broadcast_gradient_args(sn, sx)
-  # pylint: enable=protected-access
+  unused_rn, rx = gen_array_ops.broadcast_gradient_args(sn, sx)
   # Evaluate gradient
   with ops.control_dependencies([grad]):
     n = math_ops.conj(n)
@@ -714,8 +716,7 @@ def _SigmoidGrad(op, grad):
   y = op.outputs[0]  # y = sigmoid(x)
   with ops.control_dependencies([grad]):
     y = math_ops.conj(y)
-    # pylint: disable=protected-access
-    return gen_math_ops._sigmoid_grad(y, grad)
+    return gen_math_ops.sigmoid_grad(y, grad)
 
 
 @ops.RegisterGradient("SigmoidGrad")
@@ -724,8 +725,7 @@ def _SigmoidGradGrad(op, grad):
     a = math_ops.conj(op.inputs[0])
     b = math_ops.conj(op.inputs[1])
     gb = grad * b
-    # pylint: disable=protected-access
-    return gb - 2.0 * gb * a, gen_math_ops._sigmoid_grad(a, grad)
+    return gb - 2.0 * gb * a, gen_math_ops.sigmoid_grad(a, grad)
 
 
 @ops.RegisterGradient("Sign")
@@ -839,9 +839,7 @@ def _AddGrad(op, grad):
     return grad, grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   return (array_ops.reshape(math_ops.reduce_sum(grad, rx), sx),
           array_ops.reshape(math_ops.reduce_sum(grad, ry), sy))
 
@@ -856,9 +854,7 @@ def _SubGrad(op, grad):
     return grad, -grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   return (array_ops.reshape(math_ops.reduce_sum(grad, rx), sx),
           array_ops.reshape(-math_ops.reduce_sum(grad, ry), sy))
 
@@ -868,22 +864,20 @@ def _MulGrad(op, grad):
   """The gradient of scalar multiplication."""
   x = op.inputs[0]
   y = op.inputs[1]
-  # pylint: disable=protected-access
   if (isinstance(grad, ops.Tensor) and
       _ShapesFullySpecifiedAndEqual(x, y, grad) and
       grad.dtype in (dtypes.int32, dtypes.float32)):
-    return gen_math_ops._mul(grad, y), gen_math_ops._mul(grad, x)
+    return gen_math_ops.mul(grad, y), gen_math_ops.mul(grad, x)
   assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
-      math_ops.reduce_sum(gen_math_ops._mul(grad, y), rx), sx),
+      math_ops.reduce_sum(gen_math_ops.mul(grad, y), rx), sx),
           array_ops.reshape(
-              math_ops.reduce_sum(gen_math_ops._mul(x, grad), ry), sy))
-  # pylint: enable=protected-access
+              math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy))
 
 
 @ops.RegisterGradient("Div")
@@ -893,9 +887,7 @@ def _DivGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(math_ops.reduce_sum(math_ops.div(grad, y), rx), sx),
@@ -918,9 +910,7 @@ def _FloorModGrad(op, grad):
 
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   floor_xy = math_ops.floor_div(x, y)
   gx = array_ops.reshape(math_ops.reduce_sum(grad, rx), sx)
   gy = array_ops.reshape(
@@ -940,9 +930,7 @@ def _RealDivGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
@@ -960,7 +948,7 @@ def _PowGrad(op, grad):
   z = op.outputs[0]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   z = math_ops.conj(z)
@@ -988,7 +976,7 @@ def _MaximumMinimumGrad(op, grad, selector_op):
   gradshape = array_ops.shape(grad)
   zeros = array_ops.zeros(gradshape, gdtype)
   xmask = selector_op(x, y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   xgrad = array_ops.where(xmask, grad, zeros)
   ygrad = array_ops.where(xmask, zeros, grad)
   gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
@@ -1015,9 +1003,7 @@ def _SquaredDifferenceGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   with ops.control_dependencies([grad]):
     # The parens ensure that if grad is IndexedSlices, it'll get multiplied by
     # Tensor (not a number like 2.0) which causes it to convert to Tensor.
@@ -1056,20 +1042,18 @@ def _MatMulGrad(op, grad):
   t_b = op.get_attr("transpose_b")
   a = math_ops.conj(op.inputs[0])
   b = math_ops.conj(op.inputs[1])
-  # pylint: disable=protected-access
   if not t_a and not t_b:
-    grad_a = gen_math_ops._mat_mul(grad, b, transpose_b=True)
-    grad_b = gen_math_ops._mat_mul(a, grad, transpose_a=True)
+    grad_a = gen_math_ops.mat_mul(grad, b, transpose_b=True)
+    grad_b = gen_math_ops.mat_mul(a, grad, transpose_a=True)
   elif not t_a and t_b:
-    grad_a = gen_math_ops._mat_mul(grad, b)
-    grad_b = gen_math_ops._mat_mul(grad, a, transpose_a=True)
+    grad_a = gen_math_ops.mat_mul(grad, b)
+    grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True)
   elif t_a and not t_b:
-    grad_a = gen_math_ops._mat_mul(b, grad, transpose_b=True)
-    grad_b = gen_math_ops._mat_mul(a, grad)
+    grad_a = gen_math_ops.mat_mul(b, grad, transpose_b=True)
+    grad_b = gen_math_ops.mat_mul(a, grad)
   elif t_a and t_b:
-    grad_a = gen_math_ops._mat_mul(b, grad, transpose_a=True, transpose_b=True)
-    grad_b = gen_math_ops._mat_mul(grad, a, transpose_a=True, transpose_b=True)
-  # pylint: enable=protected-access
+    grad_a = gen_math_ops.mat_mul(b, grad, transpose_a=True, transpose_b=True)
+    grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True, transpose_b=True)
   return grad_a, grad_b
 
 
@@ -1083,7 +1067,7 @@ def _SparseMatMulGrad(op, grad):
       op.inputs[0]: op.get_attr("a_is_sparse"),
       op.inputs[1]: op.get_attr("b_is_sparse"),
       # Use heuristic to figure out if grad might be sparse
-      grad: context.in_graph_mode() and (grad.op.type == "ReluGrad")
+      grad: not context.executing_eagerly() and (grad.op.type == "ReluGrad")
   }
 
   def _SparseMatMul(t1, t2, out_dtype, transpose_a=False, transpose_b=False):
@@ -1183,7 +1167,7 @@ def _ComplexGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   return (array_ops.reshape(math_ops.reduce_sum(math_ops.real(grad), rx), sx),
           array_ops.reshape(math_ops.reduce_sum(math_ops.imag(grad), ry), sy))
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index da9957aa2a5463a37bba155597600a340ee4f1e6..b9529ce3ed2a2407b39655345b5846edbc786ab6 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -15,135 +15,6 @@
 """Basic arithmetic operators.
 
 See the @{$python/math_ops} guide.
-
-@@add
-@@subtract
-@@multiply
-@@scalar_mul
-@@div
-@@divide
-@@truediv
-@@floordiv
-@@realdiv
-@@truncatediv
-@@floor_div
-@@truncatemod
-@@floormod
-@@mod
-@@cross
-@@add_n
-@@abs
-@@negative
-@@sign
-@@reciprocal
-@@square
-@@round
-@@sqrt
-@@rsqrt
-@@pow
-@@exp
-@@expm1
-@@log
-@@log1p
-@@sinh
-@@cosh
-@@asinh
-@@acosh
-@@atanh
-@@ceil
-@@floor
-@@maximum
-@@minimum
-@@cos
-@@sin
-@@lbeta
-@@tan
-@@acos
-@@asin
-@@atan
-@@atan2
-@@lgamma
-@@digamma
-@@erf
-@@erfc
-@@squared_difference
-@@igamma
-@@igammac
-@@zeta
-@@polygamma
-@@betainc
-@@rint
-@@diag
-@@diag_part
-@@trace
-@@transpose
-@@eye
-@@matrix_diag
-@@matrix_diag_part
-@@matrix_band_part
-@@matrix_set_diag
-@@matrix_transpose
-@@matmul
-@@norm
-@@matrix_determinant
-@@matrix_inverse
-@@cholesky
-@@cholesky_solve
-@@matrix_exponential
-@@matrix_logarithm
-@@matrix_solve
-@@matrix_triangular_solve
-@@matrix_solve_ls
-@@qr
-@@self_adjoint_eig
-@@self_adjoint_eigvals
-@@svd
-@@tensordot
-@@complex
-@@conj
-@@imag
-@@angle
-@@real
-@@fft
-@@ifft
-@@fft2d
-@@ifft2d
-@@fft3d
-@@ifft3d
-@@reduce_sum
-@@reduce_prod
-@@reduce_min
-@@reduce_max
-@@reduce_mean
-@@reduce_all
-@@reduce_any
-@@reduce_logsumexp
-@@count_nonzero
-@@accumulate_n
-@@einsum
-@@bincount
-@@cumsum
-@@cumprod
-@@segment_sum
-@@segment_prod
-@@segment_min
-@@segment_max
-@@segment_mean
-@@unsorted_segment_sum
-@@unsorted_segment_max
-@@unsorted_segment_min
-@@unsorted_segment_prod
-@@unsorted_segment_sqrt_n
-@@sparse_segment_sum
-@@sparse_segment_mean
-@@sparse_segment_sqrt_n
-@@argmin
-@@argmax
-@@setdiff1d
-@@where
-@@unique
-@@edit_distance
-@@invert_permutation
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -161,20 +32,19 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import gen_spectral_ops
-from tensorflow.python.ops import gen_state_ops
-from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -182,6 +52,12 @@ linspace = gen_math_ops.lin_space
 
 arg_max = deprecation.deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
 arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
+tf_export("arg_max")(arg_max)
+tf_export("arg_min")(arg_min)
+
+# This is set by resource_variable_ops.py. It is included in this way since
+# there is a circular dependency between math_ops and resource_variable_ops
+_resource_variable_type = None
 
 
 def _set_doc(doc):
@@ -205,11 +81,9 @@ def argmax(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
@@ -225,11 +99,9 @@ def argmin(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
@@ -266,7 +138,7 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
   with ops.name_scope(name, "Abs", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
       if x.values.dtype.is_complex:
-        x_abs = gen_math_ops._complex_abs(
+        x_abs = gen_math_ops.complex_abs(
             x.values, Tout=x.values.dtype.real_dtype, name=name)
         return sparse_tensor.SparseTensor(
             indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
@@ -276,7 +148,7 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
     else:
       x = ops.convert_to_tensor(x, name="x")
       if x.dtype.is_complex:
-        return gen_math_ops._complex_abs(x, Tout=x.dtype.real_dtype, name=name)
+        return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
       return gen_math_ops._abs(x, name=name)
 
 
@@ -285,7 +157,7 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin
 def _bucketize(input, boundaries, name=None):
-  return gen_math_ops._bucketize(input=input, boundaries=boundaries, name=name)
+  return gen_math_ops.bucketize(input=input, boundaries=boundaries, name=name)
 
 
 # pylint: enable=redefined-builtin
@@ -328,10 +200,10 @@ def divide(x, y, name=None):
 
 @tf_export("multiply")
 def multiply(x, y, name=None):
-  return gen_math_ops._mul(x, y, name)
+  return gen_math_ops.mul(x, y, name)
 
 
-multiply.__doc__ = gen_math_ops._mul.__doc__.replace("Mul", "`tf.multiply`")
+multiply.__doc__ = gen_math_ops.mul.__doc__.replace("Multiply", "`tf.multiply`")
 
 
 # TODO(aselle): put deprecation in after another round of global code changes
@@ -339,19 +211,19 @@ multiply.__doc__ = gen_math_ops._mul.__doc__.replace("Mul", "`tf.multiply`")
     "2016-12-30",
     "`tf.mul(x, y)` is deprecated, please use `tf.multiply(x, y)` or `x * y`")
 def _mul(x, y, name=None):
-  return gen_math_ops._mul(x, y, name)
+  return gen_math_ops.mul(x, y, name)
 
 
 _mul.__doc__ = (
-    gen_math_ops._mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__))
+    gen_math_ops.mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__))
 
 
 @tf_export("subtract")
 def subtract(x, y, name=None):
-  return gen_math_ops._sub(x, y, name)
+  return gen_math_ops.sub(x, y, name)
 
 
-subtract.__doc__ = gen_math_ops._sub.__doc__.replace("`Sub`", "`tf.subtract`")
+subtract.__doc__ = gen_math_ops.sub.__doc__.replace("`Sub`", "`tf.subtract`")
 
 
 # TODO(aselle): put deprecation in after another round of global code changes
@@ -359,11 +231,11 @@ subtract.__doc__ = gen_math_ops._sub.__doc__.replace("`Sub`", "`tf.subtract`")
     "2016-12-30",
     "`tf.sub(x, y)` is deprecated, please use `tf.subtract(x, y)` or `x - y`")
 def _sub(x, y, name=None):
-  return gen_math_ops._sub(x, y, name)
+  return gen_math_ops.sub(x, y, name)
 
 
 _sub.__doc__ = (
-    gen_math_ops._sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
+    gen_math_ops.sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
 
 # pylint: disable=g-docstring-has-escape
@@ -383,11 +255,11 @@ def negative(x, name=None):
   """
   with ops.name_scope(name, "Neg", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
-      x_neg = gen_math_ops._neg(x.values, name=name)
+      x_neg = gen_math_ops.neg(x.values, name=name)
       return sparse_tensor.SparseTensor(
           indices=x.indices, values=x_neg, dense_shape=x.dense_shape)
     else:
-      return gen_math_ops._neg(x, name=name)
+      return gen_math_ops.neg(x, name=name)
 
 
 # pylint: enable=g-docstring-has-escape
@@ -755,13 +627,25 @@ def cast(x, dtype, name=None):
   tf.cast(x, tf.int32)  # [1, 2], dtype=tf.int32
   ```
 
+  The operation supports data types (for `x` and `dtype`) of
+  `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`, `float16`, `float32`,
+  `float64`, `complex64`, `complex128`, `bfloat16`. In case of casting from
+  complex types (`complex64`, `complex128`) to real types, only the real part
+  of `x` is returned. In case of casting from real types to complex types
+  (`complex64`, `complex128`), the imaginary part of the returned value is set
+  to `0`. The handling of complex types here matches the behavior of numpy.
+
   Args:
-    x: A `Tensor` or `SparseTensor`.
-    dtype: The destination type.
+    x: A `Tensor` or `SparseTensor` of numeric type. It could be
+      `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`,
+      `float16`, `float32`, `float64`, `complex64`, `complex128`, `bfloat16`.
+    dtype: The destination type. The list of supported dtypes is the same
+      as `x`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x`.
+    A `Tensor` or `SparseTensor` with same shape as `x` and
+      same type as `dtype`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `dtype`.
@@ -770,16 +654,18 @@ def cast(x, dtype, name=None):
   with ops.name_scope(name, "Cast", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
       values_cast = cast(x.values, base_type, name=name)
-      return sparse_tensor.SparseTensor(x.indices, values_cast, x.dense_shape)
+      x = sparse_tensor.SparseTensor(x.indices, values_cast, x.dense_shape)
     else:
       # TODO(josh11b): If x is not already a Tensor, we could return
       # ops.convert_to_tensor(x, dtype=dtype, ...)  here, but that
       # allows some conversions that cast() can't do, e.g. casting numbers to
       # strings.
       x = ops.convert_to_tensor(x, name="x")
-      if x.dtype.base_dtype == base_type:
-        return x
-      return gen_math_ops.cast(x, base_type, name=name)
+      if x.dtype.base_dtype != base_type:
+        x = gen_math_ops.cast(x, base_type, name=name)
+    if x.dtype.is_complex and base_type.is_floating:
+      logging.warn("Casting complex to real discards imaginary part.")
+    return x
 
 
 @tf_export("saturate_cast")
@@ -901,7 +787,41 @@ def to_bfloat16(x, name="ToBFloat16"):
   return cast(x, dtypes.bfloat16, name=name)
 
 
-ops.Tensor._override_operator("__neg__", gen_math_ops._neg)
+@tf_export("to_complex64")
+def to_complex64(x, name="ToComplex64"):
+  """Casts a tensor to type `complex64`.
+
+  Args:
+    x: A `Tensor` or `SparseTensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor` with same shape as `x` with type `complex64`.
+
+  Raises:
+    TypeError: If `x` cannot be cast to the `complex64`.
+  """
+  return cast(x, dtypes.complex64, name=name)
+
+
+@tf_export("to_complex128")
+def to_complex128(x, name="ToComplex128"):
+  """Casts a tensor to type `complex128`.
+
+  Args:
+    x: A `Tensor` or `SparseTensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor` with same shape as `x` with type `complex128`.
+
+  Raises:
+    TypeError: If `x` cannot be cast to the `complex128`.
+  """
+  return cast(x, dtypes.complex128, name=name)
+
+
+ops.Tensor._override_operator("__neg__", gen_math_ops.neg)
 ops.Tensor._override_operator("__abs__", abs)
 # __invert__ corresponds to the ~ operator.  Here we follow the numpy convention
 # ~ marks an elementwise bit-wise inverse.  This is only implemented for boolean
@@ -923,7 +843,9 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
 
   def binary_op_wrapper(x, y):
     with ops.name_scope(None, op_name, [x, y]) as name:
-      if not isinstance(y, sparse_tensor.SparseTensor):
+      if isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor):
+        return func(x, y, name=name)
+      elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
           y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
         except TypeError:
@@ -1030,7 +952,7 @@ def _truediv_python3(x, y, name=None):
     if dtype is not None:
       x = cast(x, dtype)
       y = cast(y, dtype)
-    return gen_math_ops._real_div(x, y, name=name)
+    return gen_math_ops.real_div(x, y, name=name)
 
 
 def _div_python2(x, y, name=None):
@@ -1053,9 +975,9 @@ def _div_python2(x, y, name=None):
       raise TypeError("x and y must have the same dtype, got %r != %r" %
                       (x_dtype, y_dtype))
     if x_dtype.is_floating or x_dtype.is_complex:
-      return gen_math_ops._real_div(x, y, name=name)
+      return gen_math_ops.real_div(x, y, name=name)
     else:
-      return gen_math_ops._floor_div(x, y, name=name)
+      return gen_math_ops.floor_div(x, y, name=name)
 
 
 @tf_export("truediv")
@@ -1113,7 +1035,7 @@ def div(x, y, name=None):
 
 
 # TODO(aselle): This should be removed
-mod = gen_math_ops._floor_mod
+mod = gen_math_ops.floor_mod
 
 
 # TODO(aselle): Deprecate this once all internal functionality uses
@@ -1146,22 +1068,27 @@ def floordiv(x, y, name=None):
     TypeError: If the inputs are complex.
   """
   with ops.name_scope(name, "floordiv", [x, y]) as name:
-    return gen_math_ops._floor_div(x, y, name=name)
+    return gen_math_ops.floor_div(x, y, name=name)
 
 
-realdiv = gen_math_ops._real_div
-truncatediv = gen_math_ops._truncate_div
+realdiv = gen_math_ops.real_div
+tf_export("realdiv")(realdiv)
+truncatediv = gen_math_ops.truncate_div
+tf_export("truncatediv")(truncatediv)
 # TODO(aselle): Rename this to floordiv when we can.
-floor_div = gen_math_ops._floor_div
-truncatemod = gen_math_ops._truncate_mod
-floormod = gen_math_ops._floor_mod
+floor_div = gen_math_ops.floor_div
+tf_export("floor_div")(floor_div)
+truncatemod = gen_math_ops.truncate_mod
+tf_export("truncatemod")(truncatemod)
+floormod = gen_math_ops.floor_mod
+tf_export("floormod", "mod")(floormod)
 
 
 def _mul_dispatch(x, y, name=None):
   """Dispatches cwise mul for "Dense*Dense" and "Dense*Sparse"."""
   is_tensor_y = isinstance(y, ops.Tensor)
   if is_tensor_y:
-    return gen_math_ops._mul(x, y, name=name)
+    return gen_math_ops.mul(x, y, name=name)
   else:
     assert isinstance(y, sparse_tensor.SparseTensor)  # Case: Dense * Sparse.
     new_vals = gen_sparse_ops.sparse_dense_cwise_mul(y.indices, y.values,
@@ -1180,12 +1107,12 @@ _OverrideBinaryOperatorHelper(gen_sparse_ops.sparse_dense_cwise_mul, "mul",
                               sparse_tensor.SparseTensor)
 
 _OverrideBinaryOperatorHelper(gen_math_ops.add, "add")
-_OverrideBinaryOperatorHelper(gen_math_ops._sub, "sub")
+_OverrideBinaryOperatorHelper(gen_math_ops.sub, "sub")
 _OverrideBinaryOperatorHelper(_mul_dispatch, "mul")
 _OverrideBinaryOperatorHelper(_div_python2, "div")
 _OverrideBinaryOperatorHelper(_truediv_python3, "truediv")
 _OverrideBinaryOperatorHelper(floordiv, "floordiv")
-_OverrideBinaryOperatorHelper(gen_math_ops._floor_mod, "mod")
+_OverrideBinaryOperatorHelper(gen_math_ops.floor_mod, "mod")
 _OverrideBinaryOperatorHelper(pow, "pow")
 
 
@@ -1295,9 +1222,8 @@ def _ReductionDims(x, axis, reduction_indices):
     return axis
   else:
     # Fast path: avoid creating Rank and Range ops if ndims is known.
-    if isinstance(x, ops.Tensor) and x.get_shape().ndims is not None:
-      return constant_op.constant(
-          np.arange(x.get_shape().ndims), dtype=dtypes.int32)
+    if isinstance(x, ops.Tensor) and x._rank() is not None:  # pylint: disable=protected-access
+      return constant_op.constant(np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
     if (isinstance(x, sparse_tensor.SparseTensor) and
         x.dense_shape.get_shape().is_fully_defined()):
       rank = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
@@ -1356,10 +1282,11 @@ def reduce_sum(input_tensor,
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
-    The reduced tensor.
+    The reduced tensor, of the same dtype as the input_tensor.
 
   @compatibility(numpy)
-  Equivalent to np.sum
+  Equivalent to np.sum appart the fact that numpy upcast uint8 and int32 to
+  int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
@@ -1411,8 +1338,18 @@ def count_nonzero(input_tensor,
   tf.count_nonzero(x, [0, 1])  # 3
   ```
 
+  **NOTE** Strings are compared against zero-length empty string `""`. Any
+  string with a size greater than zero is already considered as nonzero.
+
+  For example:
+  ```python
+  x = tf.constant(["", "a", "  ", "b", ""])
+  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  ```
+
   Args:
-    input_tensor: The tensor to reduce. Should be of numeric type, or `bool`.
+    input_tensor: The tensor to reduce. Should be of numeric type, `bool`,
+      or `string`.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1432,7 +1369,8 @@ def count_nonzero(input_tensor,
 
   with ops.name_scope(name, "count_nonzero", [input_tensor]):
     input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
-    zero = input_tensor.dtype.as_numpy_dtype()
+    # A scalar of 'zero' is enough as `not_equal` will broadcast.
+    zero = array_ops.zeros([], dtype=input_tensor.dtype)
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
@@ -1475,7 +1413,7 @@ def reduce_mean(input_tensor,
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor)]`.
+      `[-rank(input_tensor), rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1507,7 +1445,7 @@ def reduce_mean(input_tensor,
   if keepdims is None:
     keepdims = False
   return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._mean(
+                               gen_math_ops.mean(
                                    input_tensor,
                                    _ReductionDims(input_tensor, axis,
                                                   reduction_indices),
@@ -1557,7 +1495,7 @@ def reduce_prod(input_tensor,
   if keepdims is None:
     keepdims = False
   return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._prod(
+                               gen_math_ops.prod(
                                    input_tensor,
                                    _ReductionDims(input_tensor, axis,
                                                   reduction_indices),
@@ -1585,7 +1523,7 @@ def reduce_min(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1634,7 +1572,7 @@ def reduce_max(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -2010,8 +1948,15 @@ def matmul(a,
     if transpose_b and adjoint_b:
       raise ValueError("Only one of transpose_b and adjoint_b can be True.")
 
-    a = ops.convert_to_tensor(a, name="a")
-    b = ops.convert_to_tensor(b, name="b")
+    if context.executing_eagerly():
+      if not isinstance(a, (ops.EagerTensor, _resource_variable_type)):
+        a = ops.convert_to_tensor(a, name="a")
+      if not isinstance(b, (ops.EagerTensor, _resource_variable_type)):
+        b = ops.convert_to_tensor(b, name="b")
+    else:
+      a = ops.convert_to_tensor(a, name="a")
+      b = ops.convert_to_tensor(b, name="b")
+
     # TODO(apassos) remove _shape_tuple here when it is not needed.
     a_shape = a._shape_tuple()  # pylint: disable=protected-access
     b_shape = b._shape_tuple()  # pylint: disable=protected-access
@@ -2026,7 +1971,7 @@ def matmul(a,
       if transpose_b:
         b = conj(b)
         adjoint_b = True
-      return gen_math_ops._batch_mat_mul(
+      return gen_math_ops.batch_mat_mul(
           a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
 
     # Neither matmul nor sparse_matmul support adjoint, so we conjugate
@@ -2044,8 +1989,9 @@ def matmul(a,
       sparse_matmul_types = [dtypes.bfloat16, dtypes.float32]
       use_sparse_matmul = (
           a.dtype in sparse_matmul_types and b.dtype in sparse_matmul_types)
-    if a.dtype == dtypes.bfloat16 or b.dtype == dtypes.bfloat16:
-      # matmul currently doesn't handle bfloat16 inputs.
+    if (a.dtype == dtypes.bfloat16 or b.dtype == dtypes.bfloat16 and
+        a.dtype != b.dtype):
+      # matmul currently doesn't handle mixed-precision inputs.
       use_sparse_matmul = True
     if use_sparse_matmul:
       ret = sparse_matmul(
@@ -2063,13 +2009,14 @@ def matmul(a,
         ret = cast(ret, dtypes.bfloat16)
       return ret
     else:
-      return gen_math_ops._mat_mul(
+      return gen_math_ops.mat_mul(
           a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
 
 
 _OverrideBinaryOperatorHelper(matmul, "matmul")
 
-sparse_matmul = gen_math_ops._sparse_mat_mul
+sparse_matmul = gen_math_ops.sparse_mat_mul
+tf_export("sparse_matmul")(sparse_matmul)
 
 
 @ops.RegisterStatistics("MatMul", "flops")
@@ -2174,7 +2121,7 @@ def add_n(inputs, name=None):
     if name:
       return array_ops.identity(inputs[0], name=name)
     return inputs[0]
-  return gen_math_ops._add_n(inputs, name=name)
+  return gen_math_ops.add_n(inputs, name=name)
 
 
 @tf_export("accumulate_n")
@@ -2184,14 +2131,12 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
   otherwise, these are inferred.
 
-  NOTE: This operation is not differentiable and cannot be used if inputs depend
-  on trainable variables. Please use `tf.add_n` for such cases.
+  `tf.accumulate_n` performs the same operation as `tf.add_n`, but does not
+  wait for all of its inputs to be ready before beginning to sum. This can
+  save memory if inputs are ready at different times, since minimum temporary
+  storage is proportional to the output size rather than the inputs size.
 
-  Aside from differentiability, `tf.accumulate_n` performs the same operation as
-  `tf.add_n`, but does not wait for all of its inputs to be ready before
-  beginning to sum. This can save memory if inputs are ready at different times,
-  since minimum temporary storage is proportional to the output size rather than
-  the inputs size.
+  `accumulate_n` is differentiable (but wasn't previous to TensorFlow 1.7).
 
   For example:
 
@@ -2201,8 +2146,9 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   tf.accumulate_n([a, b, a])  # [[7, 4], [6, 14]]
 
   # Explicitly pass shape and type
-  tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)  # [[7,  4],
-                                                                   #  [6, 14]]
+  tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
+                                                                 # [[7,  4],
+                                                                 #  [6, 14]]
   ```
 
   Args:
@@ -2218,20 +2164,18 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
     ValueError: If `inputs` don't all have same shape and dtype or the shape
     cannot be inferred.
   """
-  if context.in_eager_mode():
-    # TODO(apassos) remove this once the lifetime of eager variables gets
-    # addressed.
-    raise ValueError("accumulate_n not supported in eager mode")
+
+  def _input_error():
+    return ValueError("inputs must be a list of at least one Tensor with the "
+                      "same dtype and shape")
+
   if not inputs or not isinstance(inputs, (list, tuple)):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+    raise _input_error()
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
   if not all(isinstance(x, ops.Tensor) for x in inputs):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+    raise _input_error()
   if not all(x.dtype == inputs[0].dtype for x in inputs):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+    raise _input_error()
   if shape is not None:
     shape = tensor_shape.as_shape(shape)
   else:
@@ -2239,27 +2183,31 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   for input_tensor in inputs:
     if isinstance(input_tensor, ops.Tensor):
       shape = shape.merge_with(input_tensor.get_shape())
-  if tensor_dtype is None:
-    tensor_dtype = inputs[0].dtype
-  if tensor_dtype != inputs[0].dtype:
+
+  # tensor_dtype is for safety only; operator's output type computed in C++
+  if tensor_dtype is not None and tensor_dtype != inputs[0].dtype:
     raise TypeError("tensor_dtype is {}, but input is of type {}".format(
         tensor_dtype, inputs[0].dtype))
-  if len(inputs) == 1:
+
+  if len(inputs) == 1 and name is None:
     return inputs[0]
-  with ops.name_scope(name, "AccumulateN", inputs) as name:
-    var = gen_state_ops._temporary_variable(
-        shape=tensor_shape.vector(0), dtype=tensor_dtype)
-    with ops.colocate_with(var):
-      zeros = array_ops.zeros_like(gen_control_flow_ops._merge(inputs)[0])
-      zeros.set_shape(shape)
-      ref = state_ops.assign(var, zeros, validate_shape=False)
-      update_ops = [
-          state_ops.assign_add(ref, input_tensor, use_locking=True)
-          for input_tensor in inputs
-      ]
-      with ops.control_dependencies(update_ops):
-        return gen_state_ops._destroy_temporary_variable(
-            ref, var_name=var.op.name, name=name)
+  elif len(inputs) == 1 and name is not None:
+    return array_ops.identity(inputs[0], name=name)
+  elif context.executing_eagerly():
+    # TemporaryVariable not currently supported in eager mode; fall back
+    # onto AddN for now.
+    # TODO(frreiss) remove this once the lifetime of eager variables gets
+    # addressed
+    return add_n(inputs, name=name)
+  else:
+    return gen_math_ops.accumulate_nv2(inputs, name=name, shape=shape)  # pylint: disable=protected-access
+
+
+@ops.RegisterGradient("AccumulateNV2")
+def _accumulate_n_grad(op, grad):
+  """Same as gradient for AddN. Copies the gradient to all inputs."""
+  # Not broadcasting.
+  return [grad] * len(op.inputs)
 
 
 @tf_export("nn.sigmoid", "sigmoid")
@@ -2282,7 +2230,7 @@ def sigmoid(x, name=None):
   """
   with ops.name_scope(name, "Sigmoid", [x]) as name:
     x = ops.convert_to_tensor(x, name="x")
-    return gen_math_ops._sigmoid(x, name=name)
+    return gen_math_ops.sigmoid(x, name=name)
 
 
 @tf_export("log_sigmoid")
@@ -2301,7 +2249,7 @@ def log_sigmoid(x, name=None):
   """
   with ops.name_scope(name, "LogSigmoid", [x]) as name:
     x = ops.convert_to_tensor(x, name="x")
-    return gen_math_ops._neg(gen_nn_ops.softplus(-x), name=name)
+    return gen_math_ops.neg(gen_nn_ops.softplus(-x), name=name)
 
 
 @tf_export("nn.tanh", "tanh")
@@ -2318,11 +2266,11 @@ def tanh(x, name=None):
   """
   with ops.name_scope(name, "Tanh", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
-      x_tanh = gen_math_ops._tanh(x.values, name=name)
+      x_tanh = gen_math_ops.tanh(x.values, name=name)
       return sparse_tensor.SparseTensor(
           indices=x.indices, values=x_tanh, dense_shape=x.dense_shape)
     else:
-      return gen_math_ops._tanh(x, name=name)
+      return gen_math_ops.tanh(x, name=name)
 
 
 @tf_export("bincount")
@@ -2511,7 +2459,7 @@ def conj(x, name=None):
   with ops.name_scope(name, "Conj", [x]) as name:
     x = ops.convert_to_tensor(x, name="x")
     if x.dtype.is_complex or x.dtype == dtypes.variant:
-      return gen_math_ops._conj(x, name=name)
+      return gen_math_ops.conj(x, name=name)
     elif x.dtype.is_floating or x.dtype.is_integer:
       return x
     else:
@@ -2705,14 +2653,14 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sum(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_mean")
-def sparse_segment_mean(data, indices, segment_ids, name=None,
+def sparse_segment_mean(data,
+                        indices,
+                        segment_ids,
+                        name=None,
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
@@ -2749,14 +2697,14 @@ def sparse_segment_mean(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_mean(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_sqrt_n")
-def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+def sparse_segment_sqrt_n(data,
+                          indices,
+                          segment_ids,
+                          name=None,
                           num_segments=None):
   r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
 
@@ -2786,10 +2734,7 @@ def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sqrt_n(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("tensordot", "linalg.tensordot")
@@ -2960,6 +2905,47 @@ def tensordot(a, b, axes, name=None):
       return product
 
 
+@tf_export("math.polyval")
+def polyval(coeffs, x, name=None):
+  r"""Computes the elementwise value of a polynomial.
+
+  If `x` is a tensor and `coeffs` is a list n + 1 tensors, this function returns
+  the value of the n-th order polynomial
+
+     p(x) = coeffs[n-1] + coeffs[n-2] * x + ...  + coeffs[0] * x**(n-1)
+
+  evaluated using Horner's method, i.e.
+
+     p(x) = coeffs[n-1] + x * (coeffs[n-2] + ... + x * (coeffs[1] +
+            x * coeffs[0]))
+
+  Args:
+    coeffs: A list of `Tensor` representing the coefficients of the polynomial.
+    x: A `Tensor` representing the variable of the polynomial.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as the expression p(x) with usual broadcasting rules
+    for element-wise addition and multiplication applied.
+
+  @compatibility(numpy)
+  Equivalent to numpy.polyval.
+  @end_compatibility
+  """
+
+  with ops.name_scope(name, "polyval", nest.flatten(coeffs) + [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    if len(coeffs) < 1:
+      return array_ops.zeros_like(x, name=name)
+    coeffs = [
+        ops.convert_to_tensor(coeff, name=("coeff_%d" % index))
+        for index, coeff in enumerate(coeffs)
+    ]
+    p = coeffs[0]
+    for c in coeffs[1:]:
+      p = c + p * x
+    return p
+
 # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
 # 1.0 API so we leave these here for backwards compatibility.
 fft = gen_spectral_ops.fft
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index d314124ccd9bc8b7676e6926830a8eb1e0315f5f..05bcee8801259e4bc6c20c3f61cf20025ba5ea33 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -60,7 +60,7 @@ class ReduceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testReduceInvalidAxis(self):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       # The shape check is in run a graph construction time. In eager mode,
       # it misses the check, magically return result given wrong shape.
       return
@@ -155,9 +155,7 @@ class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testRounding(self):
-    x = [0.49, 0.7, -0.3, -0.8]
-    # TODO(nolivia): Remove this when RoundOp is forwards compatible
-    # x = np.arange(-5.0, 5.0, .25)
+    x = np.arange(-5.0, 5.0, .25)
     for dtype in [np.float32, np.double, np.int32]:
       x_np = np.array(x, dtype=dtype)
       with test_util.device(use_gpu=True):
@@ -249,7 +247,7 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testAcceptsRefs(self):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       var = resource_variable_ops.ResourceVariable(10, name="var")
     else:
       var = variables.Variable(10)
diff --git a/tensorflow/python/ops/metrics.py b/tensorflow/python/ops/metrics.py
index 7e75542aec3e117510b810bad7f92af2084ae3b3..54fa3aefaa678b628035b91459f4d3c075fb3121 100644
--- a/tensorflow/python/ops/metrics.py
+++ b/tensorflow/python/ops/metrics.py
@@ -13,43 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Evaluation-related metrics.
-
-@@accuracy
-@@auc
-@@false_negatives
-@@false_negatives_at_thresholds
-@@false_positives
-@@false_positives_at_thresholds
-@@mean
-@@mean_absolute_error
-@@mean_cosine_distance
-@@mean_iou
-@@mean_per_class_accuracy
-@@mean_relative_error
-@@mean_squared_error
-@@mean_tensor
-@@percentage_below
-@@precision
-@@precision_at_thresholds
-@@recall
-@@recall_at_k
-@@recall_at_top_k
-@@recall_at_thresholds
-@@root_mean_squared_error
-@@sensitivity_at_specificity
-@@sparse_average_precision_at_k
-@@average_precision_at_k
-@@sparse_precision_at_k
-@@precision_at_k
-@@precision_at_top_k
-@@specificity_at_sensitivity
-@@true_negatives
-@@true_negatives_at_thresholds
-@@true_positives
-@@true_positives_at_thresholds
-
-"""
+"""Evaluation-related metrics."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -58,8 +22,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.metrics_impl import *
 # pylint: enable=wildcard-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 44c2f304cf9245539e42da2ce54260990de980e0..47eea6ef6b58abd4819544e29783048964104922 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -308,7 +309,7 @@ def mean(values,
       or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean is not supported when eager execution '
                        'is enabled.')
 
@@ -394,7 +395,7 @@ def accuracy(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.accuracy is not supported when eager '
                        'execution is enabled.')
 
@@ -626,10 +627,16 @@ def auc(labels,
     curve: Specifies the name of the curve to be computed, 'ROC' [default] or
       'PR' for the Precision-Recall-curve.
     name: An optional variable_scope name.
-    summation_method: Specifies the Riemann summation method used, 'trapezoidal'
-      [default] that applies the trapezoidal rule, 'minoring' that applies
-      left summation for increasing intervals and right summation for decreasing
-      intervals or 'majoring' that applies the opposite.
+    summation_method: Specifies the Riemann summation method used
+      (https://en.wikipedia.org/wiki/Riemann_sum): 'trapezoidal' [default] that
+      applies the trapezoidal rule; 'careful_interpolation', a variant of it
+      differing only by a more correct interpolation scheme for PR-AUC -
+      interpolating (true/false) positives but not the ratio that is precision;
+      'minoring' that applies left summation for increasing intervals and right
+      summation for decreasing intervals; 'majoring' that does the opposite.
+      Note that 'careful_interpolation' is strictly preferred to 'trapezoidal'
+      (to be deprecated soon) as it applies the same method for ROC, and a
+      better one (see Davis & Goadrich 2006 for details) for the PR curve.
 
   Returns:
     auc: A scalar `Tensor` representing the current area-under-curve.
@@ -644,7 +651,7 @@ def auc(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.auc is not supported when eager execution '
                        'is enabled.')
 
@@ -664,18 +671,74 @@ def auc(labels,
     # Add epsilons to avoid dividing by 0.
     epsilon = 1.0e-6
 
+    def interpolate_pr_auc(tp, fp, fn):
+      """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.
+
+      Note here we derive & use a closed formula not present in the paper
+      - as follows:
+      Modeling all of TP (true positive weight),
+      FP (false positive weight) and their sum P = TP + FP (positive weight)
+      as varying linearly within each interval [A, B] between successive
+      thresholds, we get
+        Precision = (TP_A + slope * (P - P_A)) / P
+      with slope = dTP / dP = (TP_B - TP_A) / (P_B - P_A).
+      The area within the interval is thus (slope / total_pos_weight) times
+        int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
+        int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}
+      where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in
+        int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)
+      Bringing back the factor (slope / total_pos_weight) we'd put aside, we get
+         slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight
+      where dTP == TP_B - TP_A.
+      Note that when P_A == 0 the above calculation simplifies into
+        int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)
+      which is really equivalent to imputing constant precision throughout the
+      first bucket having >0 true positives.
+
+      Args:
+        tp: true positive counts
+        fp: false positive counts
+        fn: false negative counts
+      Returns:
+        pr_auc: an approximation of the area under the P-R curve.
+      """
+      dtp = tp[:num_thresholds - 1] - tp[1:]
+      p = tp + fp
+      prec_slope = _safe_div(dtp, p[:num_thresholds - 1] - p[1:], 'prec_slope')
+      intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
+      safe_p_ratio = array_ops.where(
+          math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
+          _safe_div(p[:num_thresholds - 1], p[1:], 'recall_relative_ratio'),
+          array_ops.ones_like(p[1:]))
+      return math_ops.reduce_sum(
+          _safe_div(
+              prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
+              tp[1:] + fn[1:],
+              name='pr_auc_increment'),
+          name='interpolate_pr_auc')
+
     def compute_auc(tp, fn, tn, fp, name):
       """Computes the roc-auc or pr-auc based on confusion counts."""
+      if curve == 'PR':
+        if summation_method == 'trapezoidal':
+          logging.warning(
+              'Trapezoidal rule is known to produce incorrect PR-AUCs; '
+              'please switch to "careful_interpolation" instead.')
+        elif summation_method == 'careful_interpolation':
+          # This one is a bit tricky and is handled separately.
+          return interpolate_pr_auc(tp, fp, fn)
       rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
       if curve == 'ROC':
         fp_rate = math_ops.div(fp, fp + tn + epsilon)
         x = fp_rate
         y = rec
       else:  # curve == 'PR'.
-        prec = math_ops.div(tp, tp + fp + epsilon)
+        prec = math_ops.div(tp + epsilon, tp + fp + epsilon)
         x = rec
         y = prec
-      if summation_method == 'trapezoidal':
+      if summation_method in ('trapezoidal', 'careful_interpolation'):
+        # Note that the case ('PR', 'careful_interpolation') has been handled
+        # above.
         return math_ops.reduce_sum(
             math_ops.multiply(x[:num_thresholds - 1] - x[1:],
                               (y[:num_thresholds - 1] + y[1:]) / 2.),
@@ -758,7 +821,7 @@ def mean_absolute_error(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_absolute_error is not supported '
                        'when eager execution is enabled.')
 
@@ -818,7 +881,7 @@ def mean_cosine_distance(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_cosine_distance is not supported when '
                        'eager execution is enabled.')
 
@@ -891,7 +954,7 @@ def mean_per_class_accuracy(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_per_class_accuracy is not supported '
                        'when eager execution is enabled.')
 
@@ -996,7 +1059,7 @@ def mean_iou(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_iou is not supported when '
                        'eager execution is enabled.')
 
@@ -1098,7 +1161,7 @@ def mean_relative_error(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_relative_error is not supported when '
                        'eager execution is enabled.')
 
@@ -1165,7 +1228,7 @@ def mean_squared_error(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_squared_error is not supported when '
                        'eager execution is enabled.')
 
@@ -1223,7 +1286,7 @@ def mean_tensor(values,
       or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean_tensor is not supported when '
                        'eager execution is enabled.')
 
@@ -1247,13 +1310,8 @@ def mean_tensor(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    def compute_mean(total, count, name):
-      non_zero_count = math_ops.maximum(
-          count, array_ops.ones_like(count), name=name)
-      return math_ops.truediv(total, non_zero_count, name=name)
-
-    mean_t = compute_mean(total, count, 'value')
-    update_op = compute_mean(update_total_op, update_count_op, 'update_op')
+    mean_t = _safe_div(total, count, 'value')
+    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, mean_t)
@@ -1309,7 +1367,7 @@ def percentage_below(values,
       or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.percentage_below is not supported when '
                        'eager execution is enabled.')
 
@@ -1402,7 +1460,7 @@ def false_negatives(labels,
       or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.false_negatives is not supported when '
                        'eager execution is enabled.')
 
@@ -1458,7 +1516,7 @@ def false_negatives_at_thresholds(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.false_negatives_at_thresholds is not '
                        'supported when eager execution is enabled.')
 
@@ -1512,7 +1570,7 @@ def false_positives(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.false_positives is not supported when '
                        'eager execution is enabled.')
 
@@ -1568,7 +1626,7 @@ def false_positives_at_thresholds(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.false_positives_at_thresholds is not '
                        'supported when eager execution is enabled.')
 
@@ -1622,7 +1680,7 @@ def true_negatives(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.true_negatives is not '
                        'supported when eager execution is enabled.')
 
@@ -1678,7 +1736,7 @@ def true_negatives_at_thresholds(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.true_negatives_at_thresholds is not '
                        'supported when eager execution is enabled.')
 
@@ -1732,7 +1790,7 @@ def true_positives(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.true_positives is not '
                        'supported when eager execution is enabled.')
 
@@ -1788,7 +1846,7 @@ def true_positives_at_thresholds(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.true_positives_at_thresholds is not '
                        'supported when eager execution is enabled.')
 
@@ -1856,7 +1914,7 @@ def precision(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.precision is not '
                        'supported when eager execution is enabled.')
 
@@ -1952,7 +2010,7 @@ def precision_at_thresholds(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.precision_at_thresholds is not '
                        'supported when eager execution is enabled.')
 
@@ -2028,7 +2086,7 @@ def recall(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.recall is not supported is not '
                        'supported when eager execution is enabled.')
 
@@ -2405,7 +2463,7 @@ def recall_at_k(labels,
     are not a list or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.recall_at_k is not '
                        'supported when eager execution is enabled.')
 
@@ -2554,7 +2612,7 @@ def recall_at_thresholds(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.recall_at_thresholds is not '
                        'supported when eager execution is enabled.')
 
@@ -2631,7 +2689,7 @@ def root_mean_squared_error(labels,
       tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.root_mean_squared_error is not '
                        'supported when eager execution is enabled.')
 
@@ -2712,7 +2770,7 @@ def sensitivity_at_specificity(labels,
       or `updates_collections` are not a list or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.sensitivity_at_specificity is not '
                        'supported when eager execution is enabled.')
 
@@ -3103,7 +3161,7 @@ def average_precision_at_k(labels,
     ValueError: if k is invalid.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.sparse_average_precision_at_k is not '
                        'supported when eager execution is enabled.')
 
@@ -3272,7 +3330,7 @@ def precision_at_top_k(labels,
       are not a list or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.precision_at_top_k is not '
                        'supported when eager execution is enabled.')
 
@@ -3401,7 +3459,7 @@ def precision_at_k(labels,
       are not a list or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.sparse_precision_at_k is not '
                        'supported when eager execution is enabled.')
 
@@ -3478,7 +3536,7 @@ def specificity_at_sensitivity(labels,
       or `updates_collections` are not a list or tuple.
     RuntimeError: If eager execution is enabled.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError('tf.metrics.specificity_at_sensitivity is not '
                        'supported when eager execution is enabled.')
 
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index ee1a00623a734e18d4aebe6c84f77ba53ee1050c..339684122ec30383f642c4eb9a8b4c3ae88a9e1e 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -17,90 +17,6 @@
 """Neural network support.
 
 See the @{$python/nn} guide.
-
-@@relu
-@@relu6
-@@crelu
-@@swish
-@@elu
-@@leaky_relu
-@@selu
-@@softplus
-@@softsign
-@@dropout
-@@bias_add
-@@sigmoid
-@@log_sigmoid
-@@tanh
-@@convolution
-@@conv2d
-@@depthwise_conv2d
-@@depthwise_conv2d_native
-@@separable_conv2d
-@@atrous_conv2d
-@@atrous_conv2d_transpose
-@@conv2d_transpose
-@@conv1d
-@@conv3d
-@@conv3d_transpose
-@@conv2d_backprop_filter
-@@conv2d_backprop_input
-@@conv3d_backprop_filter_v2
-@@depthwise_conv2d_native_backprop_filter
-@@depthwise_conv2d_native_backprop_input
-@@avg_pool
-@@max_pool
-@@max_pool_with_argmax
-@@avg_pool3d
-@@max_pool3d
-@@fractional_avg_pool
-@@fractional_max_pool
-@@pool
-@@dilation2d
-@@erosion2d
-@@with_space_to_batch
-@@l2_normalize
-@@local_response_normalization
-@@sufficient_statistics
-@@normalize_moments
-@@moments
-@@weighted_moments
-@@fused_batch_norm
-@@batch_normalization
-@@batch_norm_with_global_normalization
-@@l2_loss
-@@log_poisson_loss
-@@sigmoid_cross_entropy_with_logits
-@@softmax
-@@log_softmax
-@@softmax_cross_entropy_with_logits
-@@softmax_cross_entropy_with_logits_v2
-@@sparse_softmax_cross_entropy_with_logits
-@@weighted_cross_entropy_with_logits
-@@embedding_lookup
-@@embedding_lookup_sparse
-@@dynamic_rnn
-@@bidirectional_dynamic_rnn
-@@raw_rnn
-@@static_rnn
-@@static_state_saving_rnn
-@@static_bidirectional_rnn
-@@ctc_loss
-@@ctc_greedy_decoder
-@@ctc_beam_search_decoder
-@@top_k
-@@in_top_k
-@@nce_loss
-@@sampled_softmax_loss
-@@uniform_candidate_sampler
-@@log_uniform_candidate_sampler
-@@learned_unigram_candidate_sampler
-@@fixed_unigram_candidate_sampler
-@@compute_accidental_hits
-@@quantized_conv2d
-@@quantized_relu_x
-@@quantized_max_pool
-@@quantized_avg_pool
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -116,7 +32,6 @@ from tensorflow.python.ops import nn_ops as _nn_ops
 from tensorflow.python.ops.math_ops import sigmoid
 from tensorflow.python.ops.math_ops import tanh
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
 
 # Bring more nn-associated functionality into this package.
 # go/tf-wildcard-import
@@ -126,25 +41,4 @@ from tensorflow.python.ops.nn_impl import *
 from tensorflow.python.ops.nn_ops import *
 from tensorflow.python.ops.candidate_sampling_ops import *
 from tensorflow.python.ops.embedding_ops import *
-from tensorflow.python.ops.rnn import *
-from tensorflow.python.ops import rnn_cell
 # pylint: enable=wildcard-import,unused-import
-
-
-# TODO(cwhipkey): sigmoid and tanh should not be exposed from tf.nn.
-_allowed_symbols = [
-    "zero_fraction",  # documented in training.py
-    # Modules whitelisted for reference through tf.nn.
-    # TODO(cwhipkey): migrate callers to use the submodule directly.
-    # Symbols whitelisted for export without documentation.
-    # TODO(cwhipkey): review these and move to contrib or expose through
-    # documentation.
-    "all_candidate_sampler",  # Excluded in gen_docs_combined.
-    "lrn",  # Excluded in gen_docs_combined.
-    "relu_layer",  # Excluded in gen_docs_combined.
-    "xw_plus_b",  # Excluded in gen_docs_combined.
-    "rnn_cell",  # rnn_cell is a submodule of tf.nn.
-]
-
-remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__], _ctc_ops, _nn_ops, _nn_grad])
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index eebfb17085a568f48769f6df7dddd3ae2f799efc..3ac2c8eb17ef31b46638ce50e0e9f9705adce189 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -57,7 +57,6 @@ class BatchNormalizationTest(test.TestCase):
     test_util.set_producer_version(ops.get_default_graph(), 8)
     return gen_nn_ops._batch_norm_with_global_normalization(
         x, m, v, beta, gamma, epsilon, scale_after_normalization)
-    # pylint: enable=protected-access
 
   def _tfBatchNormV1BW(self, x, m, v, beta, gamma, epsilon,
                        scale_after_normalization):
@@ -223,7 +222,7 @@ class BatchNormalizationTest(test.TestCase):
         for scale_after_normalization in [True, False]:
           # _batch_norm_with_global_normalization_grad is deprecated in v9
           test_util.set_producer_version(ops.get_default_graph(), 8)
-          grad = gen_nn_ops._batch_norm_with_global_normalization_grad(
+          grad = gen_nn_ops.batch_norm_with_global_normalization_grad(
               x, m, v, gamma, backprop, epsilon, scale_after_normalization)
           dx, dm, dv, db, dg = grad
           self.assertEqual(grad.dx, dx)
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index dc24b821a5580e3581f153f3cbf63ad2868b8a18..3a41391340edbe25bd97cfadc58587d91bef9de2 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -94,6 +94,7 @@ def _Conv3DGrad(op, grad):
           array_ops.shape(op.inputs[0]),
           op.inputs[1],
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
@@ -101,6 +102,7 @@ def _Conv3DGrad(op, grad):
           op.inputs[0],
           array_ops.shape(op.inputs[1]),
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -116,12 +118,14 @@ def _Conv3DBackpropInputGrad(op, grad):
           grad,
           array_ops.shape(op.inputs[1]),
           op.inputs[2],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
       nn_ops.conv3d(
           grad,
           op.inputs[1],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -136,12 +140,14 @@ def _Conv3DBackpropFilterGrad(op, grad):
           array_ops.shape(op.inputs[0]),
           grad,
           op.inputs[2],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format), None,
       nn_ops.conv3d(
           op.inputs[0],
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -150,7 +156,7 @@ def _Conv3DBackpropFilterGrad(op, grad):
 
 @ops.RegisterGradient("AvgPool3D")
 def _AvgPool3DGrad(op, grad):
-  return gen_nn_ops._avg_pool3d_grad(
+  return gen_nn_ops.avg_pool3d_grad(
       array_ops.shape(op.inputs[0]),
       grad,
       ksize=op.get_attr("ksize"),
@@ -172,7 +178,7 @@ def _AvgPool3DGradGrad(op, grad):
 
 @ops.RegisterGradient("MaxPool3D")
 def _MaxPool3DGrad(op, grad):
-  return gen_nn_ops._max_pool3d_grad(
+  return gen_nn_ops.max_pool3d_grad(
       op.inputs[0],
       op.outputs[0],
       grad,
@@ -188,7 +194,7 @@ def _MaxPool3DGradGrad(op, grad):
       shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
           array_ops.zeros(
               shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool3d_grad_grad(
+          gen_nn_ops.max_pool3d_grad_grad(
               op.inputs[0],
               op.inputs[1],
               grad,
@@ -204,7 +210,7 @@ def _MaxPool3DGradGradGrad(op, grad):
       shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
           array_ops.zeros(
               shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool3d_grad(
+          gen_nn_ops.max_pool3d_grad(
               op.inputs[0],
               op.inputs[1],
               grad,
@@ -352,13 +358,13 @@ def _BiasAddGradV1(unused_bias_op, received_grad):
 
 @ops.RegisterGradient("Relu")
 def _ReluGrad(op, grad):
-  return gen_nn_ops._relu_grad(grad, op.outputs[0])
+  return gen_nn_ops.relu_grad(grad, op.outputs[0])
 
 
 @ops.RegisterGradient("EluGrad")
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
-  return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
+  return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
           array_ops.where(elu_x < 0, grad * op.inputs[0],
                           array_ops.zeros(
                               shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
@@ -368,63 +374,63 @@ def _EluGradGrad(op, grad):
 def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
-  return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
+  return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
           array_ops.where(x < 0.,
-                          gen_nn_ops._elu_grad(grad,
-                                               op.outputs[0] + scale_alpha),
+                          gen_nn_ops.elu_grad(grad,
+                                              op.outputs[0] + scale_alpha),
                           array_ops.zeros(
                               shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
 def _Relu6Grad(op, grad):
-  return gen_nn_ops._relu6_grad(grad, op.outputs[0])  # pylint: disable=protected-access
+  return gen_nn_ops.relu6_grad(grad, op.outputs[0])
 
 
 @ops.RegisterGradient("Relu6Grad")
 def _Relu6GradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu6_grad(grad, x),
+  return (gen_nn_ops.relu6_grad(grad, x),
           array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 @ops.RegisterGradient("Elu")
 def _EluGrad(op, grad):
-  return gen_nn_ops._elu_grad(grad, op.outputs[0])
+  return gen_nn_ops.elu_grad(grad, op.outputs[0])
 
 
 @ops.RegisterGradient("Selu")
 def _SeluGrad(op, grad):
-  return gen_nn_ops._selu_grad(grad, op.outputs[0])
+  return gen_nn_ops.selu_grad(grad, op.outputs[0])
 
 
 @ops.RegisterGradient("Softplus")
 def _SoftplusGrad(op, grad):
-  return gen_nn_ops._softplus_grad(grad, op.inputs[0])
+  return gen_nn_ops.softplus_grad(grad, op.inputs[0])
 
 
 @ops.RegisterGradient("SoftplusGrad")
 def _SoftplusGradGrad(op, grad):
   # Let:
   #   y = tf.nn.softplus(x)
-  #   dx = gen_nn_ops._softplus_grad(dy, x) = dy / (1 + exp(-x))
+  #   dx = gen_nn_ops.softplus_grad(dy, x) = dy / (1 + exp(-x))
   # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
   dy, x = op.inputs
   with ops.control_dependencies([grad]):
-    ddy = gen_nn_ops._softplus_grad(grad, x)  # pylint: disable=protected-access
+    ddy = gen_nn_ops.softplus_grad(grad, x)
     d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
     return (ddy, d2x)
 
 
 @ops.RegisterGradient("Softsign")
 def _SoftsignGrad(op, grad):
-  return gen_nn_ops._softsign_grad(grad, op.inputs[0])
+  return gen_nn_ops.softsign_grad(grad, op.inputs[0])
 
 
 @ops.RegisterGradient("ReluGrad")
 def _ReluGradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu_grad(grad, x),
+  return (gen_nn_ops.relu_grad(grad, x),
           array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
@@ -456,7 +462,7 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
 
   def IsZero(g):
     # Some introspection to check if the gradient is feeding zeros
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       # TODO(apassos) add an efficient way to detect eager zeros here.
       return False
     if g.op.type in ("ZerosLike", "Zeros"):
@@ -565,14 +571,14 @@ def _LRNGrad(op, grad):
   alpha = op.get_attr("alpha")
   beta = op.get_attr("beta")
   return [
-      gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
-                           bias, alpha, beta)
+      gen_nn_ops.lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius, bias,
+                          alpha, beta)
   ]
 
 
 @ops.RegisterGradient("AvgPool")
 def _AvgPoolGrad(op, grad):
-  return gen_nn_ops._avg_pool_grad(
+  return gen_nn_ops.avg_pool_grad(
       array_ops.shape(op.inputs[0]),
       grad,
       op.get_attr("ksize"),
@@ -584,7 +590,7 @@ def _AvgPoolGrad(op, grad):
 @ops.RegisterGradient("AvgPoolGrad")
 def _AvgPoolGradGrad(op, grad):
   return (array_ops.stop_gradient(op.inputs[0]),
-          gen_nn_ops._avg_pool(
+          gen_nn_ops.avg_pool(
               grad,
               op.get_attr("ksize"),
               op.get_attr("strides"),
@@ -594,7 +600,7 @@ def _AvgPoolGradGrad(op, grad):
 
 @ops.RegisterGradient("MaxPool")
 def _MaxPoolGrad(op, grad):
-  return gen_nn_ops._max_pool_grad(
+  return gen_nn_ops.max_pool_grad(
       op.inputs[0],
       op.outputs[0],
       grad,
@@ -620,7 +626,7 @@ def _MaxPoolGradV2(op, grad):
 
 @ops.RegisterGradient("MaxPoolWithArgmax")
 def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
-  return gen_nn_ops._max_pool_grad_with_argmax(
+  return gen_nn_ops.max_pool_grad_with_argmax(
       op.inputs[0],
       grad,
       op.outputs[1],
@@ -635,7 +641,7 @@ def _MaxPoolGradGrad(op, grad):
       shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
           array_ops.zeros(
               shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool_grad_grad(
+          gen_nn_ops.max_pool_grad_grad(
               op.inputs[0],
               op.inputs[1],
               grad,
@@ -669,7 +675,7 @@ def _MaxPoolGradGradGrad(op, grad):
       shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
           array_ops.zeros(
               shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool_grad(
+          gen_nn_ops.max_pool_grad(
               op.inputs[0],
               op.inputs[1],
               grad,
@@ -696,8 +702,7 @@ def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
   Returns:
     Input backprop for FractionalMaxPool op.
   """
-  # pylint: disable=protected-access
-  return gen_nn_ops._fractional_max_pool_grad(
+  return gen_nn_ops.fractional_max_pool_grad(
       op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2],
       op.get_attr("overlapping"))
 
@@ -719,10 +724,9 @@ def _FractionalAvgPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
   Returns:
     Input backprop for FractionalAvgPool op.
   """
-  # pylint: disable=protected-access
-  return gen_nn_ops._fractional_avg_pool_grad(op.inputs[0].get_shape(), grad_0,
-                                              op.outputs[1], op.outputs[2],
-                                              op.get_attr("overlapping"))
+  return gen_nn_ops.fractional_avg_pool_grad(op.inputs[0].get_shape(), grad_0,
+                                             op.outputs[1], op.outputs[2],
+                                             op.get_attr("overlapping"))
 
 
 @ops.RegisterGradient("BatchNormWithGlobalNormalization")
@@ -746,7 +750,7 @@ def _BatchNormWithGlobalNormalizationGrad(op, grad):
         last dimension.
     dg: Backprop for gamma, which is (grad * ((x - m) * rsqrt(v + epsilon)))
   """
-  dx, dm, dv, db, dg = gen_nn_ops._batch_norm_with_global_normalization_grad(
+  dx, dm, dv, db, dg = gen_nn_ops.batch_norm_with_global_normalization_grad(
       op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[4], grad,
       op.get_attr("variance_epsilon"), op.get_attr("scale_after_normalization"))
   return dx, dm, dv, db, dg
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 5fa5708114fd5cda6afbca78fa0debf68f0252cc..576627e78ed10d832f82477487140e470e70cfb2 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -303,12 +303,12 @@ def _swish_grad(features, grad):
 # @Defun decorator with noinline=True so that sigmoid(features) is re-computed
 # during backprop, and we can free the sigmoid(features) expression immediately
 # after use during the forward pass.
+@tf_export("nn.swish")
 @function.Defun(
     grad_func=_swish_grad,
     shape_func=_swish_shape,
     func_name="swish",
     noinline=True)
-@tf_export("nn.swish")
 def swish(features):
   # pylint: disable=g-doc-args
   """Computes the Swish activation function: `x * sigmoid(x)`.
@@ -765,9 +765,9 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     weighted_variance = math_ops.multiply(weighted_distsq, divisor)
 
     if not keep_dims:
-      weighted_mean = array_ops.squeeze(weighted_mean, squeeze_dims=axes)
+      weighted_mean = array_ops.squeeze(weighted_mean, axis=axes)
       weighted_variance = array_ops.squeeze(
-          weighted_variance, squeeze_dims=axes)
+          weighted_variance, axis=axes)
 
     if needs_cast:
       weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
@@ -888,12 +888,10 @@ def fused_batch_norm(
   # TODO(reedwm): In a few weeks, switch to using the V2 version exclusively. We
   # currently only use the V2 version for float16 inputs, which is not supported
   # by the V1 version.
-  # pylint: disable=protected-access
   if x.dtype == dtypes.float16 or x.dtype == dtypes.bfloat16:
-    fused_batch_norm_func = gen_nn_ops._fused_batch_norm_v2
+    fused_batch_norm_func = gen_nn_ops.fused_batch_norm_v2
   else:
-    fused_batch_norm_func = gen_nn_ops._fused_batch_norm
-  # pylint: enable=protected-access
+    fused_batch_norm_func = gen_nn_ops._fused_batch_norm  # pylint: disable=protected-access
   y, batch_mean, batch_var, _, _ = fused_batch_norm_func(
       x,
       scale,
@@ -989,7 +987,7 @@ def _compute_sampled_logits(weights,
         class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1014,7 +1012,7 @@ def _compute_sampled_logits(weights,
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
-        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+        `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
     out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
@@ -1287,7 +1285,7 @@ def sampled_softmax_loss(weights,
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
-    loss = tf.nn.softmax_cross_entropy_with_logits(
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
         labels=labels_one_hot,
         logits=logits)
   ```
@@ -1305,7 +1303,7 @@ def sampled_softmax_loss(weights,
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1342,7 +1340,8 @@ def sampled_softmax_loss(weights,
       partition_strategy=partition_strategy,
       name=name,
       seed=seed)
-  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
+  labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
+  sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
   return sampled_losses
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 6ab839a503adb228b4c79cce4dc34f58ae017fad..cd07550d2ee31e9c830afa8e9c5e17deb35d1135 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -149,14 +150,12 @@ class _NonAtrousConvolution(object):
                                                               conv_dims))
     if conv_dims == 1:
       # conv1d uses the 2-d data format names
-      if data_format is None or data_format == "NWC":
-        data_format_2d = "NHWC"
-      elif data_format == "NCW":
-        data_format_2d = "NCHW"
-      else:
+      if data_format is None:
+        data_format = "NWC"
+      elif data_format not in {"NCW", "NWC", "NCHW", "NHWC"}:
         raise ValueError("data_format must be \"NWC\" or \"NCW\".")
       self.strides = strides[0]
-      self.data_format = data_format_2d
+      self.data_format = data_format
       self.conv_op = self._conv1d
     elif conv_dims == 2:
       if data_format is None or data_format == "NHWC":
@@ -698,7 +697,7 @@ def convolution(
   `padded_input` is obtained by zero padding the input using an effective
   spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and
   output striding `strides` as described in the
-  @{tf.nn.convolution$comment here}.
+  @{$python/nn#Convolution$comment here}.
 
   In the case that `data_format` does start with `"NC"`, the `input` and output
   (but not the `filter`) are simply transposed as follows:
@@ -1042,9 +1041,7 @@ def pool(
 
 @tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
-  """Atrous convolution (a.k.a.
-
-  convolution with holes or dilated convolution).
+  """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
 
   This function is a simpler wrapper around the more general
   @{tf.nn.convolution}, and exists only for backwards compatibility. You can
@@ -1158,7 +1155,7 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
   Returns:
     A `Tensor` with the same type as `value`.
-    Output shape with `'VALID`` padding is:
+    Output shape with `'VALID'` padding is:
 
         [batch, height - 2 * (filter_width - 1),
          width - 2 * (filter_height - 1), out_channels].
@@ -1461,10 +1458,10 @@ def conv3d_transpose(
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape()[3].is_compatible_with(output_shape[4]):
+      if not filter.get_shape()[3].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[4],
+            "{} != {}".format(output_shape[axis],
                               filter.get_shape()[3]))
 
     if padding != "VALID" and padding != "SAME":
@@ -1481,7 +1478,6 @@ def conv3d_transpose(
         name=name)
 
 
-# pylint: disable=protected-access
 @tf_export("nn.bias_add")
 def bias_add(value, bias, data_format=None, name=None):
   """Adds `bias` to `value`.
@@ -1504,12 +1500,12 @@ def bias_add(value, bias, data_format=None, name=None):
     A `Tensor` with the same type as `value`.
   """
   with ops.name_scope(name, "BiasAdd", [value, bias]) as name:
-    value = ops.convert_to_tensor(value, name="input")
-    bias = ops.convert_to_tensor(bias, dtype=value.dtype, name="bias")
-    return gen_nn_ops._bias_add(value, bias, data_format=data_format, name=name)
+    if not context.executing_eagerly():
+      value = ops.convert_to_tensor(value, name="input")
+      bias = ops.convert_to_tensor(bias, dtype=value.dtype, name="bias")
+    return gen_nn_ops.bias_add(value, bias, data_format=data_format, name=name)
 
 
-# pylint: disable=protected-access
 def bias_add_v1(value, bias, name=None):
   """Adds `bias` to `value`.
 
@@ -1534,7 +1530,7 @@ def bias_add_v1(value, bias, name=None):
   with ops.name_scope(name, "BiasAddV1", [value, bias]) as name:
     value = ops.convert_to_tensor(value, name="input")
     bias = ops.convert_to_tensor(bias, dtype=value.dtype, name="bias")
-    return gen_nn_ops._bias_add_v1(value, bias, name=name)
+    return gen_nn_ops.bias_add_v1(value, bias, name=name)
 
 
 @tf_export("nn.crelu")
@@ -1580,7 +1576,7 @@ def relu6(features, name=None):
   """
   with ops.name_scope(name, "Relu6", [features]) as name:
     features = ops.convert_to_tensor(features, name="features")
-    return gen_nn_ops._relu6(features, name=name)
+    return gen_nn_ops.relu6(features, name=name)
 
 
 @tf_export("nn.leaky_relu")
@@ -1616,7 +1612,7 @@ def _flatten_outer_dims(logits):
   output = array_ops.reshape(logits, array_ops.concat([[-1], last_dim_size], 0))
 
   # Set output shape if known.
-  if context.in_graph_mode():
+  if not context.executing_eagerly():
     shape = logits.get_shape()
     if shape is not None and shape.dims is not None:
       shape = shape.as_list()
@@ -1645,7 +1641,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   Args:
     logits: A non-empty `Tensor`. Must be one of the following types: `half`,
       `float32`, `float64`.
-    compute_op: Either gen_nn_ops._softmax or gen_nn_ops._log_softmax
+    compute_op: Either gen_nn_ops.softmax or gen_nn_ops.log_softmax
     dim: The dimension softmax would be performed on. The default is -1 which
       indicates the last dimension.
     name: A name for the operation (optional).
@@ -1739,7 +1735,7 @@ def softmax(logits, axis=None, name=None, dim=None):
   axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   if axis is None:
     axis = -1
-  return _softmax(logits, gen_nn_ops._softmax, axis, name)
+  return _softmax(logits, gen_nn_ops.softmax, axis, name)
 
 
 @tf_export("nn.log_softmax")
@@ -1769,7 +1765,7 @@ def log_softmax(logits, axis=None, name=None, dim=None):
   axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   if axis is None:
     axis = -1
-  return _softmax(logits, gen_nn_ops._log_softmax, axis, name)
+  return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
 
 
 def _ensure_xent_args(name, sentinel, labels, logits):
@@ -1807,12 +1803,15 @@ def softmax_cross_entropy_with_logits_v2(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape, e.g.
-  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `dim` argument specifying the class dimension.
+
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
-  backpropagation into `labels`, pass label tensors through a `stop_gradients`
+  backpropagation into `labels`, pass label tensors through @{tf.stop_gradient}
   before feeding it to this function.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
@@ -1820,14 +1819,17 @@ def softmax_cross_entropy_with_logits_v2(
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: Each row `labels[i]` must be a valid probability distribution.
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1840,8 +1842,10 @@ def softmax_cross_entropy_with_logits_v2(
                       [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
+    convert_to_float32 = (
+        logits.dtype == dtypes.float16 or logits.dtype == dtypes.bfloat16)
     precise_logits = math_ops.cast(
-        logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits
+        logits, dtypes.float32) if convert_to_float32 else logits
     # labels and logits must be of the same type
     labels = math_ops.cast(labels, precise_logits.dtype)
     input_rank = array_ops.rank(precise_logits)
@@ -1871,7 +1875,7 @@ def softmax_cross_entropy_with_logits_v2(
     # Do the actual op computation.
     # The second output tensor contains the gradients.  We use it in
     # _CrossEntropyGrad() in nn_grad but not here.
-    cost, unused_backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
+    cost, unused_backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
         precise_logits, labels, name=name)
 
     # The output cost shape should be the input minus dim.
@@ -1881,13 +1885,14 @@ def softmax_cross_entropy_with_logits_v2(
 
     # Make shape inference work since reshape and transpose may erase its static
     # shape.
-    if context.in_graph_mode() and shape is not None and shape.dims is not None:
+    if not context.executing_eagerly(
+    ) and shape is not None and shape.dims is not None:
       shape = shape.as_list()
       del shape[dim]
       cost.set_shape(shape)
 
-    if logits.dtype == dtypes.float16:
-      return math_ops.cast(cost, dtypes.float16)
+    if convert_to_float32:
+      return math_ops.cast(cost, logits.dtype)
     else:
       return cost
 
@@ -1896,7 +1901,7 @@ _XENT_DEPRECATION = """
 Future major versions of TensorFlow will allow gradients to flow
 into the labels input on backprop by default.
 
-See tf.nn.softmax_cross_entropy_with_logits_v2.
+See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
 """
 
 
@@ -1927,9 +1932,9 @@ def softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape, e.g.
-  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
-  or `float64`).
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `dim` argument specifying the class dimension.
 
   Backpropagation will happen only into `logits`.  To calculate a cross entropy
   loss that allows backpropagation into both `logits` and `labels`, see
@@ -1940,14 +1945,17 @@ def softmax_cross_entropy_with_logits(
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: Each row `labels[i]` must be a valid probability distribution.
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1978,14 +1986,17 @@ def sparse_softmax_cross_entropy_with_logits(
   must provide a single specific index for the true class for each row of
   `logits` (each minibatch entry).  For soft softmax classification with
   a probability distribution for each entry, see
-  `softmax_cross_entropy_with_logits`.
+  `softmax_cross_entropy_with_logits_v2`.
 
   **WARNING:** This op expects unscaled logits, since it performs a `softmax`
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  A common use case is to have logits of shape `[batch_size, num_classes]` and
-  labels of shape `[batch_size]`. But higher dimensions are supported.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
+  or `float64`).
 
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
@@ -2027,6 +2038,9 @@ def sparse_softmax_cross_entropy_with_logits(
     # Store label shape for result later.
     labels_static_shape = labels.get_shape()
     labels_shape = array_ops.shape(labels)
+    static_shapes_fully_defined = (
+        labels_static_shape.is_fully_defined() and
+        logits.get_shape()[:-1].is_fully_defined())
     if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
       raise ValueError(
           "Logits cannot be scalars - received shape %s." % logits.get_shape())
@@ -2036,29 +2050,44 @@ def sparse_softmax_cross_entropy_with_logits(
       raise ValueError("Rank mismatch: Rank of labels (received %s) should "
                        "equal rank of logits minus 1 (received %s)." %
                        (labels_static_shape.ndims, logits.get_shape().ndims))
+    if (static_shapes_fully_defined and
+        labels_static_shape != logits.get_shape()[:-1]):
+      raise ValueError("Shape mismatch: The shape of labels (received %s) "
+                       "should equal the shape of logits except for the last "
+                       "dimension (received %s)." % (labels_static_shape,
+                                                     logits.get_shape()))
     # Check if no reshapes are required.
     if logits.get_shape().ndims == 2:
-      cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+      cost, _ = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
           precise_logits, labels, name=name)
       if logits.dtype == dtypes.float16:
         return math_ops.cast(cost, dtypes.float16)
       else:
         return cost
 
-    # Reshape logits to 2 dim, labels to 1 dim.
-    num_classes = array_ops.shape(logits)[array_ops.rank(logits) - 1]
-    precise_logits = array_ops.reshape(precise_logits, [-1, num_classes])
-    labels = array_ops.reshape(labels, [-1])
-    # The second output tensor contains the gradients.  We use it in
-    # _CrossEntropyGrad() in nn_grad but not here.
-    cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
-        precise_logits, labels, name=name)
-    cost = array_ops.reshape(cost, labels_shape)
-    cost.set_shape(labels_static_shape)
-    if logits.dtype == dtypes.float16:
-      return math_ops.cast(cost, dtypes.float16)
-    else:
-      return cost
+    # Perform a check of the dynamic shapes if the static shapes are not fully
+    # defined.
+    shape_checks = []
+    if not static_shapes_fully_defined:
+      shape_checks.append(
+          check_ops.assert_equal(
+              array_ops.shape(labels),
+              array_ops.shape(logits)[:-1]))
+    with ops.control_dependencies(shape_checks):
+      # Reshape logits to 2 dim, labels to 1 dim.
+      num_classes = array_ops.shape(logits)[array_ops.rank(logits) - 1]
+      precise_logits = array_ops.reshape(precise_logits, [-1, num_classes])
+      labels = array_ops.reshape(labels, [-1])
+      # The second output tensor contains the gradients.  We use it in
+      # _CrossEntropyGrad() in nn_grad but not here.
+      cost, _ = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
+          precise_logits, labels, name=name)
+      cost = array_ops.reshape(cost, labels_shape)
+      cost.set_shape(labels_static_shape)
+      if logits.dtype == dtypes.float16:
+        return math_ops.cast(cost, dtypes.float16)
+      else:
+        return cost
 
 
 @tf_export("nn.avg_pool")
@@ -2086,7 +2115,7 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "AvgPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._avg_pool(
+    return gen_nn_ops.avg_pool(
         value,
         ksize=ksize,
         strides=strides,
@@ -2116,12 +2145,13 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "MaxPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._max_pool(value,
-                                ksize=ksize,
-                                strides=strides,
-                                padding=padding,
-                                data_format=data_format,
-                                name=name)
+    return gen_nn_ops.max_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
@@ -2214,6 +2244,7 @@ def xw_plus_b_v1(x, weights, biases, name=None):  # pylint: disable=invalid-name
     mm = math_ops.matmul(x, weights)
     return bias_add_v1(mm, biases, name=name)
 
+
 def _get_noise_shape(x, noise_shape):
   # If noise_shape is none return immediately.
   if noise_shape is None:
@@ -2227,8 +2258,7 @@ def _get_noise_shape(x, noise_shape):
   except (TypeError, ValueError):
     return noise_shape
 
-  if (x.shape.dims is not None and
-      len(x.shape.dims) == len(noise_shape_.dims)):
+  if x.shape.dims is not None and len(x.shape.dims) == len(noise_shape_.dims):
     new_dims = []
     for i, dim in enumerate(x.shape.dims):
       if noise_shape_.dims[i].value is None and dim.value is not None:
@@ -2239,6 +2269,7 @@ def _get_noise_shape(x, noise_shape):
 
   return noise_shape
 
+
 @tf_export("nn.dropout")
 def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
   """Computes dropout.
@@ -2298,7 +2329,7 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
     ret = math_ops.div(x, keep_prob) * binary_tensor
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       ret.set_shape(x.get_shape())
     return ret
 
@@ -2330,7 +2361,7 @@ def top_k(input, k=1, sorted=True, name=None):  # pylint: disable=redefined-buil
     values: The `k` largest elements along each last dimensional slice.
     indices: The indices of `values` within the last dimension of `input`.
   """
-  return gen_nn_ops._top_kv2(input, k=k, sorted=sorted, name=name)
+  return gen_nn_ops.top_kv2(input, k=k, sorted=sorted, name=name)
 
 
 def nth_element(input, n, reverse=False, name=None):  # pylint: disable=redefined-builtin
@@ -2649,4 +2680,4 @@ def in_top_k(predictions, targets, k, name=None):
     A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`.
   """
   with ops.name_scope(name, "in_top_k"):
-    return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name)
+    return gen_nn_ops.in_top_kv2(predictions, targets, k, name=name)
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 21eea3db25af0d1bcfbc7496665f5535c3f660ea..46a5f4fae6b15766c21011ebeae5437262192df7 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -852,6 +852,57 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       self.assertAllClose(exp_sampled_softmax_loss,
                           got_sampled_softmax_loss.eval(), 1e-4)
 
+  def testSampledSoftmaxLossBf16(self):
+    # A simple test to verify the numerics for bfloat16.
+    def _SoftmaxCrossEntropyWithLogits(logits, targets):
+      # logits, targets: float arrays of the same shape.
+      assert logits.shape == targets.shape
+      stable_exp_logits = np.exp(
+          logits - np.amax(logits, axis=1, keepdims=True))
+      pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
+      return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
+
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
+    labels = [0, 1, 2]
+    sampled = [1, 0, 2, 3]
+    (weights, biases, hidden_acts, _, exp_logits,
+     exp_labels) = self._GenerateTestData(
+         num_classes=num_classes,
+         dim=10,
+         batch_size=batch_size,
+         num_true=1,
+         labels=labels,
+         sampled=sampled,
+         subtract_log_q=True)
+    exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
+        exp_logits, exp_labels)
+
+    with self.test_session():
+      true_exp_bf16 = np.full(
+          [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
+      sampled_exp_bf16 = np.full(
+          [len(sampled)], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
+      sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
+
+      got_sampled_softmax_loss = math_ops.cast(
+          nn_impl.sampled_softmax_loss(
+              weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
+              biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
+              labels=constant_op.constant(
+                  labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
+              inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
+              num_sampled=4,
+              num_classes=num_classes,
+              num_true=1,
+              sampled_values=sampled_vals_bf16,
+              remove_accidental_hits=False,
+              partition_strategy="div"), dtypes.float32)
+
+      self.assertAllClose(exp_sampled_softmax_loss,
+                          got_sampled_softmax_loss.eval(), 1e-1)
+
 
 class CReluTest(test_lib.TestCase):
 
@@ -1030,6 +1081,42 @@ class DataFormatDimMapTest(test_lib.TestCase):
     self._test([1, -3, -2], [2, 2, 3])
     self._test([[1, -3], [1, -1]], [[2, 2], [2, 1]])
 
+  def testNHWCtoNCHW(self):
+    x_val = [1, -3, -2]
+    y_val_expected = [2, 2, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoHWNC(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoWHCN(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testArbitraryASCII(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
 
 class DataFormatVectorPermuteTest(test_lib.TestCase):
 
@@ -1049,6 +1136,22 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
       y_val = sess.run(y)
       self.assertAllEqual(y_val, [7, 9, 3, 4])
 
+  def testNHWCToHWNC(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [4, 9, 7, 3])
+
+  def testHWNCToNHWC(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [9, 7, 4, 3])
+
   def testNHWCToNCHW2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
@@ -1057,6 +1160,22 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
       y_val = sess.run(y)
       self.assertAllEqual(y_val, [[7, 4], [5, 1], [9, 3], [4, 5]])
 
+  def testNHWCToHWNC2D(self):
+    x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [[9, 3], [4, 5], [7, 4], [5, 1]])
+
+  def testHWNCToNHWC2D(self):
+    x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [[4, 5], [7, 4], [9, 3], [5, 1]])
+
   def testNCHWToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index b4ce1cbf25346412e2781a520b7e2cdcf720bcd5..d348e47f57b703138aabfc3463e750b795113335 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -74,7 +74,7 @@ def add_check_numerics_ops():
   the checked operations.
   @enc_compatibility
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError(
         "add_check_numerics_ops() is not compatible with eager execution. "
         "To check for Inf's and NaN's under eager execution, call "
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index b0315ceee268be8ac1813dae5a262a7d9496e154..d8d9af545f17fe3e0133b51b1eab82f7732dc299 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -700,8 +700,7 @@ def _parse_example_raw(serialized,
     # Finally, convert dense_shapes to TensorShapeProto
     dense_shapes = [shape.as_proto() for shape in dense_shapes]
 
-    # pylint: disable=protected-access
-    outputs = gen_parsing_ops._parse_example(
+    outputs = gen_parsing_ops.parse_example(
         serialized=serialized,
         names=names,
         dense_defaults=dense_defaults_vec,
@@ -710,7 +709,6 @@ def _parse_example_raw(serialized,
         dense_keys=dense_keys,
         dense_shapes=dense_shapes,
         name=name)
-    # pylint: enable=protected-access
 
     (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs
 
@@ -1132,8 +1130,7 @@ def _parse_single_sequence_example_raw(serialized,
     feature_list_dense_shapes = [tensor_shape.as_shape(shape).as_proto()
                                  for shape in feature_list_dense_shapes]
 
-    # pylint: disable=protected-access
-    outputs = gen_parsing_ops._parse_single_sequence_example(
+    outputs = gen_parsing_ops.parse_single_sequence_example(
         serialized=serialized,
         debug_name=debug_name,
         context_dense_defaults=context_dense_defaults_vec,
@@ -1149,7 +1146,6 @@ def _parse_single_sequence_example_raw(serialized,
         feature_list_dense_missing_assumed_empty=(
             feature_list_dense_missing_assumed_empty),
         name=name)
-    # pylint: enable=protected-access
 
     (context_sparse_indices, context_sparse_values,
      context_sparse_shapes, context_dense_values,
@@ -1180,9 +1176,13 @@ def _parse_single_sequence_example_raw(serialized,
 
 # Swap `name` and `na_value` for backward compatibility.
 @tf_export("decode_csv")
-def decode_csv(records, record_defaults, field_delim=",",
-               use_quote_delim=True, name=None, na_value=""):
-  # pylint: disable=protected-access
+def decode_csv(records,
+               record_defaults,
+               field_delim=",",
+               use_quote_delim=True,
+               name=None,
+               na_value="",
+               select_cols=None):
   """Convert CSV records to tensors. Each column maps to one tensor.
 
   RFC 4180 format is expected for the CSV records.
@@ -1205,17 +1205,32 @@ def decode_csv(records, record_defaults, field_delim=",",
       Bullet 5).
     name: A name for the operation (optional).
     na_value: Additional string to recognize as NA/NaN.
+    select_cols: Optional sorted list of column indices to select. If specified,
+      only this subset of columns will be parsed and returned.
 
   Returns:
     A list of `Tensor` objects. Has the same type as `record_defaults`.
     Each tensor will have the same shape as records.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
   """
-  # TODO(martinwicke), remove the wrapper when new Python API generator is done.
-  return gen_parsing_ops._decode_csv(
-      records=records, record_defaults=record_defaults,
-      field_delim=field_delim, use_quote_delim=use_quote_delim,
-      na_value=na_value, name=name)
-  # pylint: enable=protected-access
+  if select_cols is not None and any(select_cols[i] >= select_cols[i + 1]
+                                     for i in range(len(select_cols) - 1)):
+    raise ValueError("select_cols is not strictly increasing.")
+  if select_cols is not None and select_cols[0] < 0:
+    raise ValueError("select_cols contains negative values.")
+  if select_cols is not None and len(select_cols) != len(record_defaults):
+    raise ValueError("Length of select_cols and record_defaults do not match.")
+  return gen_parsing_ops.decode_csv(
+      records=records,
+      record_defaults=record_defaults,
+      field_delim=field_delim,
+      use_quote_delim=use_quote_delim,
+      na_value=na_value,
+      name=name,
+      select_cols=select_cols,
+  )
 
 
 # TODO(b/70890287): Combine the implementation of this op and
@@ -1391,7 +1406,6 @@ def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
     # Finally, convert dense_shapes to TensorShapeProto
     dense_shapes = [shape.as_proto() for shape in dense_shapes]
 
-    # pylint: disable=protected-access
     outputs = gen_parsing_ops.parse_single_example(
         serialized=serialized,
         dense_defaults=dense_defaults_vec,
@@ -1401,7 +1415,6 @@ def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
         dense_keys=dense_keys,
         dense_shapes=dense_shapes,
         name=name)
-    # pylint: enable=protected-access
 
     (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs
 
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 2c86358d21b1c280b8d7ade625fd4b7a44c5de26..6a2dd3f1cd55eea1d3b652a31cd2784c411c2ce0 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -43,7 +43,6 @@ def _ShapeTensor(shape):
   return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
 
 
-# pylint: disable=protected-access
 @tf_export("random_normal")
 def random_normal(shape,
                   mean=0.0,
@@ -74,7 +73,7 @@ def random_normal(shape,
     mean_tensor = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
     stddev_tensor = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
     seed1, seed2 = random_seed.get_seed(seed)
-    rnd = gen_random_ops._random_standard_normal(
+    rnd = gen_random_ops.random_standard_normal(
         shape_tensor, dtype, seed=seed1, seed2=seed2)
     mul = rnd * stddev_tensor
     value = math_ops.add(mul, mean_tensor, name=name)
@@ -126,7 +125,7 @@ def parameterized_truncated_normal(shape,
     minvals_tensor = ops.convert_to_tensor(minvals, dtype=dtype, name="minvals")
     maxvals_tensor = ops.convert_to_tensor(maxvals, dtype=dtype, name="maxvals")
     seed1, seed2 = random_seed.get_seed(seed)
-    rnd = gen_random_ops._parameterized_truncated_normal(
+    rnd = gen_random_ops.parameterized_truncated_normal(
         shape_tensor,
         means_tensor,
         stddevs_tensor,
@@ -171,7 +170,7 @@ def truncated_normal(shape,
     mean_tensor = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
     stddev_tensor = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
     seed1, seed2 = random_seed.get_seed(seed)
-    rnd = gen_random_ops._truncated_normal(
+    rnd = gen_random_ops.truncated_normal(
         shape_tensor, dtype, seed=seed1, seed2=seed2)
     mul = rnd * stddev_tensor
     value = math_ops.add(mul, mean_tensor, name=name)
@@ -210,7 +209,7 @@ def random_uniform(shape,
     maxval: A 0-D Tensor or Python value of type `dtype`. The upper bound on
       the range of random values to generate.  Defaults to 1 if `dtype` is
       floating point.
-    dtype: The type of the output: 'float16`, `float32`, `float64`, `int32`,
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32`,
       or `int64`.
     seed: A Python integer. Used to create a random seed for the distribution.
       See @{tf.set_random_seed}
@@ -237,11 +236,10 @@ def random_uniform(shape,
     maxval = ops.convert_to_tensor(maxval, dtype=dtype, name="max")
     seed1, seed2 = random_seed.get_seed(seed)
     if dtype.is_integer:
-      return gen_random_ops._random_uniform_int(
+      return gen_random_ops.random_uniform_int(
           shape, minval, maxval, seed=seed1, seed2=seed2, name=name)
     else:
-      rnd = gen_random_ops._random_uniform(
-          shape, dtype, seed=seed1, seed2=seed2)
+      rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
       return math_ops.add(rnd * (maxval - minval), minval, name=name)
 
 
@@ -275,7 +273,7 @@ def random_shuffle(value, seed=None, name=None):
     dimension.
   """
   seed1, seed2 = random_seed.get_seed(seed)
-  return gen_random_ops._random_shuffle(
+  return gen_random_ops.random_shuffle(
       value, seed=seed1, seed2=seed2, name=name)
 
 
@@ -420,7 +418,7 @@ def random_gamma(shape,
     seed1, seed2 = random_seed.get_seed(seed)
     return math_ops.maximum(
         np.finfo(dtype.as_numpy_dtype).tiny,
-        gen_random_ops._random_gamma(
+        gen_random_ops.random_gamma(
             shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
 
 ops.NotDifferentiable("RandomGamma")
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 09d349fc2db61a09649a801a5d4784522b969d38..1e953f658fca08c1e6a3297bc23f1b98269c1dcf 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -21,8 +21,10 @@ from __future__ import print_function
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import variable_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -30,6 +32,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gen_state_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -39,15 +42,22 @@ from tensorflow.python.training import checkpointable
 from tensorflow.python.util import compat
 
 
+def get_resource_handle_data(graph_op):
+  assert ops._USE_C_SHAPES  # pylint: disable=protected-access
+  assert type(graph_op) == ops.Tensor  # pylint: disable=unidiomatic-typecheck
+
+  handle_data = pywrap_tensorflow.GetResourceHandleShapeAndType(
+      graph_op.graph._c_graph, graph_op._as_tf_output())  # pylint: disable=protected-access
+
+  return cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
+      compat.as_bytes(handle_data))
+
+
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
   """Creates a variable handle with information to do shape inference."""
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
   if container is None:
     container = ""
-  if not graph_mode:
-    # When in eager mode use a uid for the shared_name, to prevent accidental
-    # sharing.
-    shared_name = str(ops.uid())
   handle = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
                                                    shared_name=shared_name,
                                                    name=name,
@@ -74,7 +84,15 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # know the shape and dtype of the variable pointed to by a handle. Since
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
-    handle._handle_data = h._handle_data  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    if ops._USE_C_SHAPES:
+      handle._handle_data = get_resource_handle_data(h)
+    else:
+      if h._handle_data is None:
+        ops.set_shape_and_handle_data_for_outputs(h.op)
+      handle._handle_data = h._handle_data
+    # pylint: enable=protected-access
+
   # Clean up our reference cycles to avoid making the garbage collector run.
   # pylint: disable=protected-access
   # OrderedDict, constructed on Graph creation, makes a simple reference loop
@@ -117,8 +135,7 @@ class EagerResourceDeleter(object):
 
   def __del__(self):
     # Resources follow object-identity when executing eagerly, so it is safe to
-    # delete the resource we have a handle to. Each Graph has a unique container
-    # name, which prevents resource sharing.
+    # delete the resource we have a handle to.
     try:
       # This resource was created in eager mode. However, this destructor may be
       # running in graph mode (especially during unit tests). To clean up
@@ -134,10 +151,10 @@ class EagerResourceDeleter(object):
       # valid, and so on. Printing warnings in these cases is silly
       # (exceptions raised from __del__ are printed as warnings to stderr).
       pass  # 'NoneType' object is not callable when the handle has been
-            # partially unloaded.
+      # partially unloaded.
     except AttributeError:
       pass  # 'NoneType' object has no attribute 'eager_mode' when context has
-            # been unloaded. Will catch other module unloads as well.
+      # been unloaded. Will catch other module unloads as well.
 
 
 def shape_safe_assign_variable_handle(handle, shape, value, name=None):
@@ -152,7 +169,7 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
 class ResourceVariable(variables.Variable):
   """Variable based on resource handles.
 
-  See the ${variables} documentation for more details.
+  See the @{$variables$Variables How To} for a high level overview.
 
   A `ResourceVariable` allows you to maintain state across subsequent calls to
   session.run.
@@ -174,7 +191,9 @@ class ResourceVariable(variables.Variable):
   to see all modifications to the value of the variable which happen in any
   operation on which the read_value depends on (either directly, indirectly, or
   via a control dependency) and guaranteed to not see any modification to the
-  value of the variable on which the read_value operation does not depend on.
+  value of the variable from operations that depend on the read_value operation.
+  Updates from operations that have no dependency relationship to the read_value
+  operation might or might not be visible to read_value.
 
   For example, if there is more than one assignment to a ResourceVariable in
   a single session.run call there is a well-defined value for each operation
@@ -182,24 +201,20 @@ class ResourceVariable(variables.Variable):
   by edges in the graph. Consider the following example, in which two writes
   can cause tf.Variable and tf.ResourceVariable to behave differently:
 
-   ```python
-    a = tf.ResourceVariable(1.0)
-    a.initializer.run()
-
-    assign = a.assign(2.0)
-    with tf.control_dependencies([assign]):
-      b = a.read_value()
-    with tf.control_dependencies([b]):
-      other_assign = a.assign(3.0)
-    with tf.control_dependencies([other_assign]):
-      # Will print 2.0 because the value was read before other_assign ran. If
-      # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
-      tf.Print(b, [b]).eval()
+  ```python
+  a = tf.ResourceVariable(1.0)
+  a.initializer.run()
+
+  assign = a.assign(2.0)
+  with tf.control_dependencies([assign]):
+    b = a.read_value()
+  with tf.control_dependencies([b]):
+    other_assign = a.assign(3.0)
+  with tf.control_dependencies([other_assign]):
+    # Will print 2.0 because the value was read before other_assign ran. If
+    # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
+    tf.Print(b, [b]).eval()
   ```
-
-  To enforce these consistency properties tf.ResourceVariable might make more
-  copies than an equivalent tf.Variable under the hood, so tf.Variable is still
-  not deprecated.
   """
 
   def __init__(self,
@@ -266,9 +281,9 @@ class ResourceVariable(variables.Variable):
       if initial_value is not None:
         raise ValueError("variable_def and initial_value are mutually "
                          "exclusive.")
-      if not context.in_graph_mode():
-        raise ValueError("Creating ResourceVariable from variable_def"
-                         " only supported in GRAPH mode.")
+      if context.executing_eagerly():
+        raise ValueError("Creating ResourceVariable from variable_def is "
+                         "not supported when eager execution is enabled.")
       self._init_from_proto(variable_def, import_scope=import_scope)
     else:
       self._init_from_args(
@@ -362,11 +377,17 @@ class ResourceVariable(variables.Variable):
     # this graph.
     self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     with ops.init_scope():
-      self._in_graph_mode = context.in_graph_mode()
+      self._in_graph_mode = not context.executing_eagerly()
       with ops.name_scope(name, "Variable", []
                           if init_from_fn else [initial_value]) as name:
         # pylint: disable=protected-access
         handle_name = ops._name_from_scope_name(name)
+        if self._in_graph_mode:
+          shared_name = handle_name
+        else:
+          # When in eager mode use a uid for the shared_name, to prevent
+          # accidental sharing.
+          shared_name = "%s_%d" % (handle_name, ops.uid())
         if init_from_fn:
           # Use attr_scope and device(None) to simulate the behavior of
           # colocate_with when the variable we want to colocate with doesn't
@@ -382,12 +403,9 @@ class ResourceVariable(variables.Variable):
               self._handle = _eager_safe_variable_handle(
                   shape=initial_value.get_shape(),
                   dtype=initial_value.dtype.base_dtype,
-                  shared_name=handle_name,
+                  shared_name=shared_name,
                   name=name,
                   graph_mode=self._in_graph_mode)
-              self._handle_device = (
-                  self._handle.device if self._in_graph_mode else
-                  context.get_default_context().device_name)
               self._shape = initial_value.get_shape()
           else:
             initial_value = initial_value()
@@ -397,12 +415,9 @@ class ResourceVariable(variables.Variable):
             self._handle = _eager_safe_variable_handle(
                 shape=initial_value.get_shape(),
                 dtype=initial_value.dtype.base_dtype,
-                shared_name=handle_name,
+                shared_name=shared_name,
                 name=name,
                 graph_mode=False)
-            self._handle_device = (
-                self._handle.device if self._in_graph_mode else
-                context.get_default_context().device_name)
             self._shape = initial_value.get_shape()
         # pylint: enable=protected-access
 
@@ -423,13 +438,12 @@ class ResourceVariable(variables.Variable):
           self._handle = _eager_safe_variable_handle(
               shape=initial_value.get_shape(),
               dtype=initial_value.dtype.base_dtype,
-              shared_name=handle_name,
+              shared_name=shared_name,
               name=name,
               graph_mode=self._in_graph_mode)
-          self._handle_device = (self._handle.device if self._in_graph_mode else
-                                 context.get_default_context().device_name)
           self._shape = initial_value.get_shape()
 
+        self._unique_id = shared_name
         self._initial_value = initial_value if self._in_graph_mode else None
         self._handle_name = handle_name + ":0"
         self._dtype = initial_value.dtype.base_dtype
@@ -450,7 +464,7 @@ class ResourceVariable(variables.Variable):
           with ops.name_scope("Read"), ops.colocate_with(self._handle):
             # Manually assign reads to the handle's device to avoid log
             # messages.
-            with ops.device(self._handle_device):
+            with ops.device(self._handle.device):
               value = self._read_variable_op()
             self._graph_element = value
             if caching_device is not None:
@@ -477,7 +491,7 @@ class ResourceVariable(variables.Variable):
               self._cached_value = self._read_variable_op()
           else:
             self._cached_value = None
-        if context.in_graph_mode():
+        if not context.executing_eagerly():
           ops.add_to_collections(collections, self)
         elif ops.GraphKeys.GLOBAL_STEP in collections:
           ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
@@ -490,12 +504,13 @@ class ResourceVariable(variables.Variable):
       # cycles being uncollectable, and means that no __del__ will be defined at
       # all in graph mode.
       self._handle_deleter = EagerResourceDeleter(
-          handle=self._handle, handle_device=self._handle_device)
+          handle=self._handle, handle_device=self._handle.device)
+    self._cached_shape_as_list = None
 
   def _init_from_proto(self, variable_def, import_scope=None):
     """Initializes from `VariableDef` proto."""
     # Note that init_from_proto is currently not supported in Eager mode.
-    assert context.in_graph_mode()
+    assert not context.executing_eagerly()
     self._in_graph_mode = True
     assert isinstance(variable_def, variable_pb2.VariableDef)
     if not variable_def.is_resource:
@@ -508,8 +523,8 @@ class ResourceVariable(variables.Variable):
             variable_def.variable_name, import_scope=import_scope))
     self._shape = tensor_shape.TensorShape(
         self._handle.op.get_attr("shape"))
-    self._handle_device = self._handle.device
     self._handle_name = self._handle.name
+    self._unique_id = self._handle_name
     self._initializer_op = g.as_graph_element(
         ops.prepend_name_scope(
             variable_def.initializer_name, import_scope=import_scope))
@@ -522,11 +537,19 @@ class ResourceVariable(variables.Variable):
     else:
       self._initial_value = None
     if variable_def.snapshot_name:
-      self._cached_value = g.as_graph_element(
+      snapshot = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
+      self._cached_value = snapshot
+      while snapshot.op.type != "ReadVariableOp":
+        snapshot = snapshot.op.inputs[0]
+      self._graph_element = snapshot
     else:
       self._cached_value = None
+      # Legacy case for protos without the snapshot name; assume it's the
+      # following.
+      self._graph_element = g.get_tensor_by_name(
+          self._handle.op.name + "/Read/ReadVariableOp:0")
     if variable_def.HasField("save_slice_info_def"):
       self._save_slice_info = variables.Variable.SaveSliceInfo(
           save_slice_info_def=variable_def.save_slice_info_def,
@@ -535,8 +558,8 @@ class ResourceVariable(variables.Variable):
       self._save_slice_info = None
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
-    self._graph_element = self.value()
     self._constraint = None
+    self._cached_shape_as_list = None
 
   def __nonzero__(self):
     return self.__bool__()
@@ -552,7 +575,7 @@ class ResourceVariable(variables.Variable):
   @property
   def device(self):
     """The device this variable is on."""
-    return self._handle_device
+    return self._handle.device
 
   @property
   def graph(self):
@@ -569,11 +592,26 @@ class ResourceVariable(variables.Variable):
     """The shape of this variable."""
     return self._shape
 
+  def _shape_as_list(self):
+    if self._cached_shape_as_list:
+      return self._cached_shape_as_list
+    if self.shape.ndims is None:
+      return None
+    self._cached_shape_as_list = [dim.value for dim in self.shape.dims]
+    return self._cached_shape_as_list
+
+  def _shape_tuple(self):
+    shape = self._shape_as_list()
+    if shape is None:
+      return None
+    return tuple(shape)
+
   @property
   def create(self):
     """The op responsible for initializing this variable."""
     if not self._in_graph_mode:
-      raise RuntimeError("Calling create in EAGER mode not supported.")
+      raise RuntimeError("Calling create is not supported when eager execution"
+                         " is enabled.")
     return self._initializer_op
 
   @property
@@ -586,7 +624,7 @@ class ResourceVariable(variables.Variable):
     if self._cached_value is not None:
       return self._cached_value
     with ops.colocate_with(None, ignore_existing=True):
-      with ops.device(self._handle_device):
+      with ops.device(self._handle.device):
         return self._read_variable_op()
 
   def _as_graph_element(self):
@@ -601,7 +639,7 @@ class ResourceVariable(variables.Variable):
   @property
   def initial_value(self):
     """Returns the Tensor used as the initial value for the variable."""
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("initial_value not supported in EAGER mode.")
     return self._initial_value
 
@@ -622,15 +660,15 @@ class ResourceVariable(variables.Variable):
 
   def eval(self, session=None):
     """Evaluates and returns the value of this variable."""
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("Trying to eval in EAGER mode")
     return self._graph_element.eval(session=session)
 
   def numpy(self):
-    if context.in_graph_mode():
-      raise NotImplementedError(
-          "numpy() is only available when eager execution is enabled.")
-    return self.read_value().numpy()
+    if context.executing_eagerly():
+      return self.read_value().numpy()
+    raise NotImplementedError(
+        "numpy() is only available when eager execution is enabled.")
 
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
@@ -683,7 +721,7 @@ class ResourceVariable(variables.Variable):
     """
     with ops.name_scope("Read"):
       # Ensure we read the variable in the same device as the handle.
-      with ops.device(self._handle_device):
+      with ops.device(self._handle.device):
         value = self._read_variable_op()
     # Return an identity so it can get placed on whatever device the context
     # specifies instead of the device where the variable is.
@@ -711,7 +749,7 @@ class ResourceVariable(variables.Variable):
       A `VariableDef` protocol buffer, or `None` if the `Variable` is not
       in the specified name scope.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("to_proto not supported in EAGER mode.")
     if export_scope is None or self.handle.name.startswith(export_scope):
       var_def = variable_pb2.VariableDef()
@@ -728,6 +766,10 @@ class ResourceVariable(variables.Variable):
       if self._cached_value is not None:
         var_def.snapshot_name = ops.strip_name_scope(self._cached_value.name,
                                                      export_scope)
+      else:
+        # Store the graph_element here
+        var_def.snapshot_name = ops.strip_name_scope(self._graph_element.name,
+                                                     export_scope)
       var_def.is_resource = True
       if self._save_slice_info:
         var_def.save_slice_info_def.MergeFrom(
@@ -738,7 +780,7 @@ class ResourceVariable(variables.Variable):
 
   @staticmethod
   def from_proto(variable_def, import_scope=None):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("from_proto not supported in EAGER mode.")
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
@@ -789,37 +831,84 @@ class ResourceVariable(variables.Variable):
 
   __array_priority__ = 100
 
-  def assign_sub(self, delta, use_locking=None, name=None):
+  def assign_sub(self, delta, use_locking=None, name=None, read_value=True):
+    """Subtracts a value from this variable.
+
+    Args:
+      delta: A `Tensor`. The value to subtract from this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: The name to use for the operation.
+      read_value: A `bool`. Whether to read and return the new value of the
+          variable or not.
+
+    Returns:
+      If `read_value` is `True`, this method will return the new value of the
+      variable after the assignment has completed. Otherwise, when in graph mode
+      it will return the `Operation` that does the assignment, and when in eager
+      mode it will return `None`.
+    """
     # TODO(apassos): this here and below is not atomic. Consider making it
     # atomic if there's a way to do so without a performance cost for those who
     # don't need it.
-    return self._lazy_read(gen_resource_variable_ops.assign_sub_variable_op(
-        self.handle,
-        ops.convert_to_tensor(delta, dtype=self.dtype),
-        name=name))
+    assign_sub_op = gen_resource_variable_ops.assign_sub_variable_op(
+        self.handle, ops.convert_to_tensor(delta, dtype=self.dtype), name=name)
+    if read_value:
+      return self._lazy_read(assign_sub_op)
+    return assign_sub_op
+
+  def assign_add(self, delta, use_locking=None, name=None, read_value=True):
+    """Adds a value to this variable.
 
-  def assign_add(self, delta, use_locking=None, name=None):
-    return self._lazy_read(gen_resource_variable_ops.assign_add_variable_op(
-        self.handle,
-        ops.convert_to_tensor(delta, dtype=self.dtype),
-        name=name))
+    Args:
+      delta: A `Tensor`. The value to add to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: The name to use for the operation.
+      read_value: A `bool`. Whether to read and return the new value of the
+          variable or not.
+
+    Returns:
+      If `read_value` is `True`, this method will return the new value of the
+      variable after the assignment has completed. Otherwise, when in graph mode
+      it will return the `Operation` that does the assignment, and when in eager
+      mode it will return `None`.
+    """
+    assign_add_op = gen_resource_variable_ops.assign_add_variable_op(
+        self.handle, ops.convert_to_tensor(delta, dtype=self.dtype), name=name)
+    if read_value:
+      return self._lazy_read(assign_add_op)
+    return assign_add_op
 
   def _lazy_read(self, op):
     if hasattr(self, "_trainable") and self._trainable:
       tape.watch_variable(self)
     return _UnreadVariable(
-        self._handle, self.dtype, self._handle_device, self._shape,
-        self._in_graph_mode,
-        self._handle_deleter if not self._in_graph_mode else None, op)
+        self._handle, self.dtype, self._shape, self._in_graph_mode,
+        self._handle_deleter if not self._in_graph_mode else None, op,
+        self._unique_id)
+
+  def assign(self, value, use_locking=None, name=None, read_value=True):
+    """Assigns a new value to this variable.
 
-  def assign(self, value, use_locking=None, name=None):
+    Args:
+      value: A `Tensor`. The new value for this variable.
+      use_locking: If `True`, use locking during the assignment.
+      name: The name to use for the assignment.
+      read_value: A `bool`. Whether to read and return the new value of the
+          variable or not.
+
+    Returns:
+      If `read_value` is `True`, this method will return the new value of the
+      variable after the assignment has completed. Otherwise, when in graph mode
+      it will return the `Operation` that does the assignment, and when in eager
+      mode it will return `None`.
+    """
     value_tensor = ops.convert_to_tensor(value, dtype=self.dtype)
     self._shape.assert_is_compatible_with(value_tensor.shape)
-    return self._lazy_read(
-        gen_resource_variable_ops.assign_variable_op(
-            self.handle,
-            value_tensor,
-            name=name))
+    assign_op = gen_resource_variable_ops.assign_variable_op(
+        self.handle, value_tensor, name=name)
+    if read_value:
+      return self._lazy_read(assign_op)
+    return assign_op
 
   def _strided_slice_assign(self, begin, end, strides, value, name, begin_mask,
                             end_mask, ellipsis_mask, new_axis_mask,
@@ -846,7 +935,6 @@ class ResourceVariable(variables.Variable):
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
     if dtype is not None and dtype != self.dtype:
-      print("trying to switch the dtype to ", dtype, " from ", self.dtype)
       return NotImplemented
     if as_ref:
       return self.read_value().op.inputs[0]
@@ -895,6 +983,10 @@ class ResourceVariable(variables.Variable):
                        "Tensor object.")
 
 
+pywrap_tensorflow.TFE_Py_RegisterResourceVariableType(ResourceVariable)
+math_ops._resource_variable_type = ResourceVariable  # pylint: disable=protected-access
+
+
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
 
@@ -905,31 +997,31 @@ class _UnreadVariable(ResourceVariable):
   Pretends to be the tensor if anyone looks.
   """
 
-  def __init__(self, handle, dtype, handle_device,  # pylint: disable=super-init-not-called
-               shape, in_graph_mode, deleter, parent_op):
+  def __init__(self, handle, dtype,  # pylint: disable=super-init-not-called
+               shape, in_graph_mode, deleter, parent_op, unique_id):
     # We do not call super init on purpose.
     self._trainable = False
     self._save_slice_info = None
     self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     self._in_graph_mode = in_graph_mode
     self._handle = handle
-    self._handle_device = handle_device
     self._shape = shape
     self._initial_value = None
     if isinstance(self._handle, ops.EagerTensor):
       self._handle_name = ""
     else:
       self._handle_name = self._handle.name
+    self._unique_id = unique_id
     self._dtype = dtype
     self._constraint = None
     self._cached_value = None
     self._is_initialized_op = None
     self._initializer_op = None
     self._parent_op = parent_op
-    if context.in_graph_mode():
-      self._graph_element = self.read_value()
-    else:
+    if context.executing_eagerly():
       self._graph_element = None
+    else:
+      self._graph_element = self.read_value()
     self._handle_deleter = deleter
 
   def value(self):
@@ -945,6 +1037,7 @@ class _UnreadVariable(ResourceVariable):
 
   def set_shape(self, shape):
     self._shape = shape
+    self._cached_shape_as_list = None
 
   @property
   def op(self):
@@ -1025,6 +1118,11 @@ ops.register_proto_function(
     proto_type=variable_pb2.VariableDef,
     to_proto=_to_proto_fn,
     from_proto=_from_proto_fn)
+ops.register_proto_function(
+    ops.GraphKeys.GLOBAL_STEP,
+    proto_type=variable_pb2.VariableDef,
+    to_proto=_to_proto_fn,
+    from_proto=_from_proto_fn)
 
 
 def is_resource_variable(var):
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index aa8d4327d2f0e93768728744d5cce3fed385393f..e94ad90dfd7fa75f6345b085b530e1eee6183035 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -13,16 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""RNN helpers for TensorFlow models.
-
-
-@@bidirectional_dynamic_rnn
-@@dynamic_rnn
-@@raw_rnn
-@@static_rnn
-@@static_state_saving_rnn
-@@static_bidirectional_rnn
-"""
+"""RNN helpers for TensorFlow models."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -45,29 +36,25 @@ from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 _concat = rnn_cell_impl._concat
-_like_rnncell = rnn_cell_impl._like_rnncell
 # pylint: enable=protected-access
 
 
 def _transpose_batch_time(x):
-  """Transpose the batch and time dimensions of a Tensor.
+  """Transposes the batch and time dimensions of a Tensor.
 
-  Retains as much of the static shape information as possible.
+  If the input tensor has rank < 2 it returns the original tensor. Retains as
+  much of the static shape information as possible.
 
   Args:
-    x: A tensor of rank 2 or higher.
+    x: A Tensor.
 
   Returns:
     x transposed along the first two dimensions.
-
-  Raises:
-    ValueError: if `x` is rank 1 or lower.
   """
   x_static_shape = x.get_shape()
   if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
-    raise ValueError(
-        "Expected input tensor %s to have rank at least 2, but saw shape: %s" %
-        (x, x_static_shape))
+    return x
+
   x_rank = array_ops.rank(x)
   x_t = array_ops.transpose(
       x, array_ops.concat(
@@ -403,11 +390,8 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
   Raises:
     TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
   """
-
-  if not _like_rnncell(cell_fw):
-    raise TypeError("cell_fw must be an instance of RNNCell")
-  if not _like_rnncell(cell_bw):
-    raise TypeError("cell_bw must be an instance of RNNCell")
+  rnn_cell_impl.assert_like_rnncell("cell_fw", cell_fw)
+  rnn_cell_impl.assert_like_rnncell("cell_bw", cell_bw)
 
   with vs.variable_scope(scope or "bidirectional_rnn"):
     # Forward direction
@@ -568,14 +552,13 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
     TypeError: If `cell` is not an instance of RNNCell.
     ValueError: If inputs is None or an empty list.
   """
-  if not _like_rnncell(cell):
-    raise TypeError("cell must be an instance of RNNCell")
+  rnn_cell_impl.assert_like_rnncell("cell", cell)
 
   with vs.variable_scope(scope or "rnn") as varscope:
     # Create a new scope in which the caching device is either
     # determined by the parent scope, or is set to place the cached
     # Variable using the same placement as for the rest of the RNN.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       if varscope.caching_device is None:
         varscope.set_caching_device(lambda op: op.device)
 
@@ -616,7 +599,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
           ["Expected shape for Tensor %s is " % x.name,
            packed_shape, " but saw shape: ", x_shape])
 
-    if context.in_graph_mode() and sequence_length is not None:
+    if not context.executing_eagerly() and sequence_length is not None:
       # Perform some shape validation
       with ops.control_dependencies(
           [_assert_has_shape(sequence_length, [batch_size])]):
@@ -742,7 +725,7 @@ def _dynamic_rnn_loop(cell,
                                         element_shape=element_shape,
                                         tensor_array_name=base_name + name)
 
-  in_graph_mode = context.in_graph_mode()
+  in_graph_mode = not context.executing_eagerly()
   if in_graph_mode:
     output_ta = tuple(
         _create_ta(
@@ -872,7 +855,7 @@ def raw_rnn(cell, loop_fn,
 
   ```python
   time = tf.constant(0, dtype=tf.int32)
-  (finished, next_input, initial_state, _, loop_state) = loop_fn(
+  (finished, next_input, initial_state, emit_structure, loop_state) = loop_fn(
       time=time, cell_output=None, cell_state=None, loop_state=None)
   emit_ta = TensorArray(dynamic_size=True, dtype=initial_state.dtype)
   state = initial_state
@@ -883,7 +866,7 @@ def raw_rnn(cell, loop_fn,
         loop_state=loop_state)
     # Emit zeros and copy forward state for minibatch entries that are finished.
     state = tf.where(finished, state, next_state)
-    emit = tf.where(finished, tf.zeros_like(emit), emit)
+    emit = tf.where(finished, tf.zeros_like(emit_structure), emit)
     emit_ta = emit_ta.write(time, emit)
     # If any new minibatch entries are marked as finished, mark these.
     finished = tf.logical_or(finished, next_finished)
@@ -943,10 +926,15 @@ def raw_rnn(cell, loop_fn,
       and `emit_output`: the output to store for this iteration.
 
       Note that `emit_output` should be a `Tensor` or (possibly nested)
-      tuple of tensors with shapes and structure matching `cell.output_size`
-      and `cell_output` above.  The parameter `cell_state` and output
-      `next_cell_state` may be either a single or (possibly nested) tuple
-      of tensors.  The parameter `loop_state` and
+      tuple of tensors which is aggregated in the `emit_ta` inside the
+      `while_loop`. For the first call to `loop_fn`, the `emit_output`
+      corresponds to the `emit_structure` which is then used to determine the
+      size of the `zero_tensor` for the `emit_ta` (defaults to
+      `cell.output_size`). For the subsequent calls to the `loop_fn`, the
+      `emit_output` corresponds to the actual output tensor
+      that is to be aggregated in the `emit_ta`. The parameter `cell_state`
+      and output `next_cell_state` may be either a single or (possibly nested)
+      tuple of tensors.  The parameter `loop_state` and
       output `next_loop_state` may be either a single or (possibly nested) tuple
       of `Tensor` and `TensorArray` objects.  This last parameter
       may be ignored by `loop_fn` and the return value may be `None`.  If it
@@ -1015,9 +1003,8 @@ def raw_rnn(cell, loop_fn,
     TypeError: If `cell` is not an instance of RNNCell, or `loop_fn` is not
       a `callable`.
   """
+  rnn_cell_impl.assert_like_rnncell("cell", cell)
 
-  if not _like_rnncell(cell):
-    raise TypeError("cell must be an instance of RNNCell")
   if not callable(loop_fn):
     raise TypeError("loop_fn must be a callable")
 
@@ -1027,7 +1014,7 @@ def raw_rnn(cell, loop_fn,
   # determined by the parent scope, or is set to place the cached
   # Variable using the same placement as for the rest of the RNN.
   with vs.variable_scope(scope or "rnn") as varscope:
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       if varscope.caching_device is None:
         varscope.set_caching_device(lambda op: op.device)
 
@@ -1229,9 +1216,7 @@ def static_rnn(cell,
     ValueError: If `inputs` is `None` or an empty list, or if the input depth
       (column size) cannot be inferred from inputs via shape inference.
   """
-
-  if not _like_rnncell(cell):
-    raise TypeError("cell must be an instance of RNNCell")
+  rnn_cell_impl.assert_like_rnncell("cell", cell)
   if not nest.is_sequence(inputs):
     raise TypeError("inputs must be a sequence")
   if not inputs:
@@ -1242,7 +1227,7 @@ def static_rnn(cell,
   # determined by the parent scope, or is set to place the cached
   # Variable using the same placement as for the rest of the RNN.
   with vs.variable_scope(scope or "rnn") as varscope:
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       if varscope.caching_device is None:
         varscope.set_caching_device(lambda op: op.device)
 
@@ -1469,11 +1454,8 @@ def static_bidirectional_rnn(cell_fw,
     TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
     ValueError: If inputs is None or an empty list.
   """
-
-  if not _like_rnncell(cell_fw):
-    raise TypeError("cell_fw must be an instance of RNNCell")
-  if not _like_rnncell(cell_bw):
-    raise TypeError("cell_bw must be an instance of RNNCell")
+  rnn_cell_impl.assert_like_rnncell("cell_fw", cell_fw)
+  rnn_cell_impl.assert_like_rnncell("cell_bw", cell_bw)
   if not nest.is_sequence(inputs):
     raise TypeError("inputs must be a sequence")
   if not inputs:
diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py
index c0dac8fb0125036798ca434ebc5bc321c90f5533..79eab1854a9d7b59aefa7d7b24c0eed561e40882 100644
--- a/tensorflow/python/ops/rnn_cell.py
+++ b/tensorflow/python/ops/rnn_cell.py
@@ -12,30 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Module for constructing RNN Cells.
-
-## Base interface for all RNN Cells
-
-@@RNNCell
-
-## RNN Cells for use with TensorFlow's core RNN methods
-
-@@BasicRNNCell
-@@BasicLSTMCell
-@@GRUCell
-@@LSTMCell
-
-## Classes storing split `RNNCell` state
-
-@@LSTMStateTuple
-
-## RNN Cell wrappers (RNNCells that wrap other RNNCells)
-
-@@MultiRNNCell
-@@DropoutWrapper
-@@DeviceWrapper
-@@ResidualWrapper
-"""
+"""Module for constructing RNN Cells."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -44,8 +21,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.rnn_cell_impl import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 923348ea44e18a87e09fe1c0424f0323eb967e3d..67f753485b8c30e31371c3bc1a24c335df976b5a 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -46,6 +46,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,6 +55,8 @@ _BIAS_VARIABLE_NAME = "bias"
 _WEIGHTS_VARIABLE_NAME = "kernel"
 
 
+# TODO(jblespiau): Remove this function when we are sure there are no longer
+# any usage (even if protected, it is being used). Prefer assert_like_rnncell.
 def _like_rnncell(cell):
   """Checks that a given object is an RNNCell by using duck typing."""
   conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"),
@@ -61,6 +64,45 @@ def _like_rnncell(cell):
   return all(conditions)
 
 
+# This can be used with self.assertRaisesRegexp for assert_like_rnncell.
+ASSERT_LIKE_RNNCELL_ERROR_REGEXP = "is not an RNNCell"
+
+
+def assert_like_rnncell(cell_name, cell):
+  """Raises a TypeError if cell is not like an RNNCell.
+
+  NOTE: Do not rely on the error message (in particular in tests) which can be
+  subject to change to increase readability. Use
+  ASSERT_LIKE_RNNCELL_ERROR_REGEXP.
+
+  Args:
+    cell_name: A string to give a meaningful error referencing to the name
+      of the functionargument.
+    cell: The object which should behave like an RNNCell.
+
+  Raises:
+    TypeError: A human-friendly exception.
+  """
+  conditions = [
+      hasattr(cell, "output_size"),
+      hasattr(cell, "state_size"),
+      hasattr(cell, "zero_state"),
+      callable(cell),
+  ]
+  errors = [
+      "'output_size' property is missing",
+      "'state_size' property is missing",
+      "'zero_state' method is missing",
+      "is not callable"
+  ]
+
+  if not all(conditions):
+
+    errors = [error for error, cond in zip(errors, conditions) if not cond]
+    raise TypeError("The argument {!r} ({}) is not an RNNCell: {}.".format(
+        cell_name, cell, ", ".join(errors)))
+
+
 def _concat(prefix, suffix, static=False):
   """Concat that enables int, Tensor, or TensorShape values.
 
@@ -127,7 +169,7 @@ def _zero_state_tensors(state_size, batch_size, dtype):
     """Combine s with batch_size to get a proper tensor shape."""
     c = _concat(batch_size, s)
     size = array_ops.zeros(c, dtype=dtype)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       c_static = _concat(batch_size, s, static=True)
       size.set_shape(c_static)
     return size
@@ -191,12 +233,13 @@ class RNNCell(base_layer.Layer):
 
   def _rnn_get_variable(self, getter, *args, **kwargs):
     variable = getter(*args, **kwargs)
-    if context.in_graph_mode():
-      trainable = (variable in tf_variables.trainable_variables() or
-                   (isinstance(variable, tf_variables.PartitionedVariable) and
-                    list(variable)[0] in tf_variables.trainable_variables()))
-    else:
+    if context.executing_eagerly():
       trainable = variable._trainable  # pylint: disable=protected-access
+    else:
+      trainable = (
+          variable in tf_variables.trainable_variables() or
+          (isinstance(variable, tf_variables.PartitionedVariable) and
+           list(variable)[0] in tf_variables.trainable_variables()))
     if trainable and variable not in self._trainable_weights:
       self._trainable_weights.append(variable)
     elif not trainable and variable not in self._non_trainable_weights:
@@ -240,7 +283,7 @@ class RNNCell(base_layer.Layer):
     # Try to use the last cached zero_state. This is done to avoid recreating
     # zeros, especially when eager execution is enabled.
     state_size = self.state_size
-    is_eager = context.in_eager_mode()
+    is_eager = context.executing_eagerly()
     if is_eager and hasattr(self, "_last_zero_state"):
       (last_state_size, last_batch_size, last_dtype,
        last_output) = getattr(self, "_last_zero_state")
@@ -309,10 +352,17 @@ class BasicRNNCell(LayerRNNCell):
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such
       cases.
+    dtype: Default dtype of the layer (default of `None` means use the type
+      of the first input). Required when `build` is called before `call`.
   """
 
-  def __init__(self, num_units, activation=None, reuse=None, name=None):
-    super(BasicRNNCell, self).__init__(_reuse=reuse, name=name)
+  def __init__(self,
+               num_units,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None):
+    super(BasicRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
     self.input_spec = base_layer.InputSpec(ndim=2)
@@ -370,6 +420,8 @@ class GRUCell(LayerRNNCell):
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such
       cases.
+    dtype: Default dtype of the layer (default of `None` means use the type
+      of the first input). Required when `build` is called before `call`.
   """
 
   def __init__(self,
@@ -378,8 +430,9 @@ class GRUCell(LayerRNNCell):
                reuse=None,
                kernel_initializer=None,
                bias_initializer=None,
-               name=None):
-    super(GRUCell, self).__init__(_reuse=reuse, name=name)
+               name=None,
+               dtype=None):
+    super(GRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
     self.input_spec = base_layer.InputSpec(ndim=2)
@@ -488,8 +541,14 @@ class BasicLSTMCell(LayerRNNCell):
   that follows.
   """
 
-  def __init__(self, num_units, forget_bias=1.0,
-               state_is_tuple=True, activation=None, reuse=None, name=None):
+  def __init__(self,
+               num_units,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None):
     """Initialize the basic LSTM cell.
 
     Args:
@@ -507,11 +566,13 @@ class BasicLSTMCell(LayerRNNCell):
       name: String, the name of the layer. Layers with the same name will
         share weights, but to avoid mistakes we require reuse=True in such
         cases.
+      dtype: Default dtype of the layer (default of `None` means use the type
+        of the first input). Required when `build` is called before `call`.
 
       When restoring from CudnnLSTM-trained checkpoints, must use
       `CudnnCompatibleLSTMCell` instead.
     """
-    super(BasicLSTMCell, self).__init__(_reuse=reuse, name=name)
+    super(BasicLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -556,9 +617,9 @@ class BasicLSTMCell(LayerRNNCell):
     Args:
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: An `LSTMStateTuple` of state tensors, each shaped
-        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
+        `[batch_size, num_units]`, if `state_is_tuple` has been set to
         `True`.  Otherwise, a `Tensor` shaped
-        `[batch_size, 2 * self.state_size]`.
+        `[batch_size, 2 * num_units]`.
 
     Returns:
       A pair containing the new hidden state, and the new state (either a
@@ -625,7 +686,7 @@ class LSTMCell(LayerRNNCell):
                initializer=None, num_proj=None, proj_clip=None,
                num_unit_shards=None, num_proj_shards=None,
                forget_bias=1.0, state_is_tuple=True,
-               activation=None, reuse=None, name=None):
+               activation=None, reuse=None, name=None, dtype=None):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -658,11 +719,13 @@ class LSTMCell(LayerRNNCell):
       name: String, the name of the layer. Layers with the same name will
         share weights, but to avoid mistakes we require reuse=True in such
         cases.
+      dtype: Default dtype of the layer (default of `None` means use the type
+        of the first input). Required when `build` is called before `call`.
 
       When restoring from CudnnLSTM-trained checkpoints, use
       `CudnnCompatibleLSTMCell` instead.
     """
-    super(LSTMCell, self).__init__(_reuse=reuse, name=name)
+    super(LSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -722,10 +785,14 @@ class LSTMCell(LayerRNNCell):
         shape=[input_depth + h_depth, 4 * self._num_units],
         initializer=self._initializer,
         partitioner=maybe_partitioner)
+    if self.dtype is None:
+      initializer = init_ops.zeros_initializer
+    else:
+      initializer = init_ops.zeros_initializer(dtype=self.dtype)
     self._bias = self.add_variable(
         _BIAS_VARIABLE_NAME,
         shape=[4 * self._num_units],
-        initializer=init_ops.zeros_initializer(dtype=self.dtype))
+        initializer=initializer)
     if self._use_peepholes:
       self._w_f_diag = self.add_variable("w_f_diag", shape=[self._num_units],
                                          initializer=self._initializer)
@@ -912,8 +979,8 @@ class DropoutWrapper(RNNCell):
         but not `callable`.
       ValueError: if any of the keep_probs are not between 0 and 1.
     """
-    if not _like_rnncell(cell):
-      raise TypeError("The parameter cell is not a RNNCell.")
+    assert_like_rnncell("cell", cell)
+
     if (dropout_state_filter_visitor is not None
         and not callable(dropout_state_filter_visitor)):
       raise TypeError("dropout_state_filter_visitor must be callable")
@@ -1163,7 +1230,16 @@ class DeviceWrapper(RNNCell):
 
 @tf_export("nn.rnn_cell.MultiRNNCell")
 class MultiRNNCell(RNNCell):
-  """RNN cell composed sequentially of multiple simple cells."""
+  """RNN cell composed sequentially of multiple simple cells.
+
+  Example:
+
+  ```python
+  num_units = [128, 64]
+  cells = [BasicLSTMCell(num_units=n) for n in num_units]
+  stacked_rnn_cell = MultiRNNCell(cells)
+  ```
+  """
 
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
@@ -1187,6 +1263,12 @@ class MultiRNNCell(RNNCell):
           "cells must be a list or tuple, but saw: %s." % cells)
 
     self._cells = cells
+    for cell_number, cell in enumerate(self._cells):
+      # Add Checkpointable dependencies on these cells so their variables get
+      # saved with this object when using object-based saving.
+      if isinstance(cell, checkpointable.CheckpointableBase):
+        # TODO(allenl): Track down non-Checkpointable callers.
+        self._track_checkpointable(cell, name="cell-%d" % (cell_number,))
     self._state_is_tuple = state_is_tuple
     if not state_is_tuple:
       if any(nest.is_sequence(c.state_size) for c in self._cells):
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 6fe2f61016775b410045fefcc8764907b8ea39f3..9f1dd2c4fdb8234347110cf1d8adcf445e387449 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Script Language Operators. See the @{$python/script_ops} guide.
-
-@@py_func
-"""
+"""Script Language Operators. See the @{$python/script_ops} guide."""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -25,6 +22,9 @@ from __future__ import print_function
 
 import threading
 
+# Used by py_util.cc to get tracebacks.
+import traceback  # pylint: disable=unused-import
+
 import numpy as np
 import six
 
@@ -33,6 +33,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_script_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -51,6 +52,16 @@ class EagerFunc(object):
     self._func = func
     self._out_dtypes = Tout
 
+  def _convert(self, value, dtype):
+    if isinstance(value, resource_variable_ops.ResourceVariable):
+      raise RuntimeError(
+          "Attempting to return a variable from an eagerly executed py_func. "
+          "Only numeric data structures like Tensors or NumPy arrays should "
+          "be returned; to return the value of a variable, make sure to obtain "
+          "the Tensor backing it by calling `.read_value()` on the variable in "
+          "question: %s" % value)
+    return ops.convert_to_tensor(value, dtype=dtype)
+
   def __call__(self, on_gpu, args):
     """Passes `args` to `self._func`, which is executed eagerly."""
     with context.eager_mode():
@@ -58,14 +69,13 @@ class EagerFunc(object):
       maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
       if isinstance(ret, (tuple, list)):
         return [
-            maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype))
+            maybe_copy_to_gpu(self._convert(x, dtype=dtype))
             for (x, dtype) in zip(ret, self._out_dtypes)
         ]
       elif ret is None:
         return ret
       else:
-        return maybe_copy_to_gpu(
-            ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]))
+        return maybe_copy_to_gpu(self._convert(ret, dtype=self._out_dtypes[0]))
 
 
 class FuncRegistry(object):
@@ -219,18 +229,16 @@ def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
   graph._cleanup_py_funcs_used_in_graph.append(cleanup)
   # pylint: enable=protected-access
 
-  # pylint: disable=protected-access
   if eager:
-    result = gen_script_ops._eager_py_func(
+    result = gen_script_ops.eager_py_func(
         input=inp, token=token, Tout=Tout, name=name)
   else:
     if stateful:
-      result = gen_script_ops._py_func(
+      result = gen_script_ops.py_func(
           input=inp, token=token, Tout=Tout, name=name)
     else:
-      result = gen_script_ops._py_func_stateless(
+      result = gen_script_ops.py_func_stateless(
           input=inp, token=token, Tout=Tout, name=name)
-  # pylint: enable=protected-access
   return result if is_list_or_tuple else result[0]
 
 
@@ -319,11 +327,15 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   Returns:
     A list of `Tensor` or a single `Tensor` which `func` computes.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     result = func(*[x.numpy() for x in inp])
     result = nest.flatten(result)
 
-    return [x if x is None else ops.convert_to_tensor(x) for x in result]
+    result = [x if x is None else ops.convert_to_tensor(x) for x in result]
+    if len(result) == 1:
+      # Mimic the automatic unwrapping in graph-mode py_func
+      result, = result
+    return result
 
   return _internal_py_func(
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
diff --git a/tensorflow/python/ops/sdca_ops.py b/tensorflow/python/ops/sdca_ops.py
index 8b7e5abbc227353ebde421dc747c27df0aff87e8..4d5aeec59125a53866195aef2c09a314d9961836 100644
--- a/tensorflow/python/ops/sdca_ops.py
+++ b/tensorflow/python/ops/sdca_ops.py
@@ -13,10 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """A Dual Coordinate Ascent optimizer library for training fast linear models.
-
-@@sdca_optimizer
-@@sdca_fprint
-@@sdca_shrink_l1
 """
 
 # pylint: disable=g-bad-name
@@ -31,11 +27,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops.gen_sdca_ops import *
 # pylint: enable=wildcard-import
 
-from tensorflow.python.util.all_util import remove_undocumented
-
 ops.NotDifferentiable("SdcaFprint")
 ops.NotDifferentiable("SdcaOptimizer")
 ops.NotDifferentiable("SdcaShrinkL1")
-
-
-remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index cedd36c1deed541adcf601ff9447345e2279e8f9..dee84bab0ce007ee62995f0ab8b2c9a117bfb496 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -13,13 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor Handle Operations. See the @{$python/session_ops} guide.
-
-@@get_session_handle
-@@get_session_handle_v2
-@@get_session_tensor
-@@delete_session_tensor
-"""
+"""Tensor Handle Operations. See the @{$python/session_ops} guide."""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -182,7 +176,7 @@ def get_session_handle(data, name=None):
 
   # Colocate this operation with data.
   with ops.colocate_with(data):
-    return gen_data_flow_ops._get_session_handle(data, name=name)  # pylint: disable=protected-access
+    return gen_data_flow_ops.get_session_handle(data, name=name)
 
 
 @tf_export("get_session_tensor")
@@ -222,7 +216,7 @@ def get_session_tensor(handle, dtype, name=None):
   with ops.device(handle_device):
     holder = array_ops.placeholder(dtypes.string)
     _register_handle_feeder(holder.graph, holder, dtype)
-    tensor = gen_data_flow_ops._get_session_tensor(holder, dtype, name=name)
+    tensor = gen_data_flow_ops.get_session_tensor(holder, dtype, name=name)
   return (holder, tensor)
 
 
@@ -246,7 +240,7 @@ def delete_session_tensor(handle, name=None):
   handle_device = TensorHandle._get_device_name(handle)
   with ops.device(handle_device):
     holder = array_ops.placeholder(dtypes.string)
-    deleter = gen_data_flow_ops._delete_session_tensor(holder, name=name)
+    deleter = gen_data_flow_ops.delete_session_tensor(holder, name=name)
   return (holder, deleter)
 
 
@@ -268,7 +262,7 @@ def _get_handle_reader(graph, handle, dtype):
     with graph.as_default(), graph.device(handle_device):
       holder = array_ops.placeholder(dtypes.string)
       _register_handle_feeder(holder.graph, holder, dtype)
-      reader = gen_data_flow_ops._get_session_tensor(holder, dtype)
+      reader = gen_data_flow_ops.get_session_tensor(holder, dtype)
     result = (holder, reader)
     graph._handle_readers[graph_key] = result
   return result
@@ -289,7 +283,7 @@ def _get_handle_mover(graph, feeder, handle):
     # Create mover if we haven't done it.
     holder, reader = _get_handle_reader(graph, handle, dtype)
     with graph.as_default(), graph.device(feeder.op.device):
-      mover = gen_data_flow_ops._get_session_handle(reader)  # pylint: disable=protected-access
+      mover = gen_data_flow_ops.get_session_handle(reader)
     result = (holder, mover)
     graph._handle_movers[graph_key] = result
   return result
@@ -303,7 +297,7 @@ def _get_handle_deleter(graph, deleter_key, handle):
     handle_device = TensorHandle._get_device_name(handle)
     with graph.as_default(), graph.device(handle_device):
       holder = array_ops.placeholder(dtypes.string)
-      deleter = gen_data_flow_ops._delete_session_tensor(holder)
+      deleter = gen_data_flow_ops.delete_session_tensor(holder)
     result = (holder, deleter)
     graph._handle_deleters[deleter_key] = result
   return result
diff --git a/tensorflow/python/ops/sets.py b/tensorflow/python/ops/sets.py
index ea4677befe6fcad3709e14384d455f21010147f1..41ff241beab5f18a78f0682b434f4febecd5a87e 100644
--- a/tensorflow/python/ops/sets.py
+++ b/tensorflow/python/ops/sets.py
@@ -12,13 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tensorflow set operations.
-
-@@set_size
-@@set_intersection
-@@set_union
-@@set_difference
-"""
+"""Tensorflow set operations."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -28,8 +22,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.sets_impl import *
 # pylint: enable=wildcard-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index b0eecd8a1e812857de8f47e1370e4fc5f1004bc0..21e08d03d213c173d12dfc6676fe7f009811e93f 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -247,7 +247,7 @@ def set_difference(a, b, aminusb=True, validate_indices=True):
     #
     # collections.OrderedDict([
     #     ((0, 0, 0), 2),
-    #     ((0, 0, 1), 3),
+    #     ((0, 1, 0), 3),
     # ])
   ```
 
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index 5295e7d21c2b5810422ec36f5aced63c9039feca..97353d6c747cb7e4d3c1fa92ad61af24fb17de91 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -88,10 +88,8 @@ def _SparseAddGrad(op, *grads):
   # the non-zero elements of the sum, and we will peek into `sum_indices` in the
   # gradient op.
 
-  # pylint: disable=protected-access
-  a_val_grad, b_val_grad = gen_sparse_ops._sparse_add_grad(val_grad, a_indices,
-                                                           b_indices,
-                                                           sum_indices)
+  a_val_grad, b_val_grad = gen_sparse_ops.sparse_add_grad(
+      val_grad, a_indices, b_indices, sum_indices)
   a_val_grad.set_shape(op.inputs[1].get_shape())
   b_val_grad.set_shape(op.inputs[4].get_shape())
   # (a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh)
@@ -151,7 +149,7 @@ def _SparseTensorDenseMatMulGrad(op, grad):
                               "complex gradients.")
 
   # gradient w.r.t. dense
-  b_grad = gen_sparse_ops._sparse_tensor_dense_mat_mul(  # pylint: disable=protected-access
+  b_grad = gen_sparse_ops.sparse_tensor_dense_mat_mul(
       a_indices, a_values, a_shape, grad, adjoint_a=not adj_a)
   if adj_b:
     b_grad = array_ops.transpose(b_grad)
@@ -278,8 +276,7 @@ def _SparseFillEmptyRowsGrad(op, unused_grad_output_indices, output_grad_values,
   """Gradients for SparseFillEmptyRows."""
   reverse_index_map = op.outputs[3]
 
-  # pylint: disable=protected-access
-  d_values, d_default_value = gen_sparse_ops._sparse_fill_empty_rows_grad(
+  d_values, d_default_value = gen_sparse_ops.sparse_fill_empty_rows_grad(
       reverse_index_map=reverse_index_map, grad_values=output_grad_values)
 
   # d_indices, d_values, d_dense_shape, d_default_value.
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 0fbbf5a805f1439d85ad53f02bdb665c04248606..3e398db394402587c18408f136cf7ca2a6b4ee1c 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -14,33 +14,7 @@
 # ==============================================================================
 
 # pylint: disable=g-short-docstring-punctuation
-"""Sparse Tensor Representation. See the @{$python/sparse_ops} guide.
-
-@@SparseTensor
-@@SparseTensorValue
-@@sparse_to_dense
-@@sparse_tensor_to_dense
-@@sparse_to_indicator
-@@sparse_merge
-@@sparse_concat
-@@sparse_reorder
-@@sparse_reshape
-@@sparse_slice
-@@sparse_split
-@@sparse_retain
-@@sparse_reset_shape
-@@sparse_fill_empty_rows
-@@sparse_transpose
-@@sparse_reduce_max
-@@sparse_reduce_max_sparse
-@@sparse_reduce_sum
-@@sparse_reduce_sum_sparse
-@@sparse_add
-@@sparse_softmax
-@@sparse_tensor_dense_matmul
-@@sparse_maximum
-@@sparse_minimum
-"""
+"""Sparse Tensor Representation. See the @{$python/sparse_ops} guide."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -234,7 +208,7 @@ def sparse_concat(axis,
     ]
 
   output_ind, output_val, output_shape = (
-      gen_sparse_ops._sparse_concat(inds, vals, shapes, axis, name=name))
+      gen_sparse_ops.sparse_concat(inds, vals, shapes, axis, name=name))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -302,8 +276,8 @@ def sparse_add(a, b, thresh=0):
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh")
     output_ind, output_val, output_shape = (
-        gen_sparse_ops._sparse_add(a.indices, a.values, a.dense_shape,
-                                   b.indices, b.values, b.dense_shape, thresh))
+        gen_sparse_ops.sparse_add(a.indices, a.values, a.dense_shape,
+                                  b.indices, b.values, b.dense_shape, thresh))
 
     # Attempt to get output_shape statically.
     a.get_shape().assert_is_compatible_with(b.get_shape())
@@ -317,8 +291,8 @@ def sparse_add(a, b, thresh=0):
     # swap to make `a` the SparseTensor.
     if isinstance(b, sparse_classes):
       a, b = b, a
-    return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values,
-                                                   a.dense_shape, b)
+    return gen_sparse_ops.sparse_tensor_dense_add(a.indices, a.values,
+                                                  a.dense_shape, b)
 
 
 def _sparse_cross(inputs, name=None):
@@ -402,7 +376,7 @@ def _sparse_cross_internal(inputs,
                            num_buckets=0,
                            hash_key=None,
                            name=None):
-  """See gen_sparse_ops._sparse_cross."""
+  """See gen_sparse_ops.sparse_cross."""
   if not isinstance(inputs, list):
     raise TypeError("Inputs must be a list")
   if not all(
@@ -432,7 +406,7 @@ def _sparse_cross_internal(inputs,
       dense_inputs[i] = math_ops.to_int64(dense_inputs[i])
       internal_type = dtypes.int64
 
-  indices_out, values_out, shape_out = gen_sparse_ops._sparse_cross(
+  indices_out, values_out, shape_out = gen_sparse_ops.sparse_cross(
       indices=indices,
       values=values,
       shapes=shapes,
@@ -511,7 +485,7 @@ def sparse_reorder(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   reordered_ind, reordered_val = (
-      gen_sparse_ops._sparse_reorder(
+      gen_sparse_ops.sparse_reorder(
           sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
 
   if sp_input.get_shape().is_fully_defined():
@@ -575,7 +549,7 @@ def sparse_reshape(sp_input, shape, name=None):
   shape = math_ops.cast(shape, dtype=dtypes.int64)
 
   with ops.name_scope(name, "SparseReshape", [sp_input]) as name:
-    reshaped_ind, reshaped_shape = gen_sparse_ops._sparse_reshape(
+    reshaped_ind, reshaped_shape = gen_sparse_ops.sparse_reshape(
         sp_input.indices, sp_input.dense_shape, shape, name=name)
 
     reshaped_shape_const = tensor_util.constant_value(shape)
@@ -671,7 +645,7 @@ def sparse_split(keyword_required=KeywordRequired(),
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   output_inds, output_vals, output_shapes = (
-      gen_sparse_ops._sparse_split(
+      gen_sparse_ops.sparse_split(
           axis,
           sp_input.indices,
           sp_input.values,
@@ -782,7 +756,7 @@ def sparse_to_dense(sparse_indices,
     Dense `Tensor` of shape `output_shape`.  Has the same type as
     `sparse_values`.
   """
-  return gen_sparse_ops._sparse_to_dense(
+  return gen_sparse_ops.sparse_to_dense(
       sparse_indices,
       output_shape,
       sparse_values,
@@ -1412,7 +1386,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
     default_value = ops.convert_to_tensor(
         default_value, dtype=sp_input.values.dtype)
     (output_indices, output_values, empty_row_indicator,
-     unused_reverse_index_map) = gen_sparse_ops._sparse_fill_empty_rows(
+     unused_reverse_index_map) = gen_sparse_ops.sparse_fill_empty_rows(
          indices=sp_input.indices,
          values=sp_input.values,
          dense_shape=sp_input.dense_shape,
@@ -1441,7 +1415,7 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return gen_sparse_ops._serialize_sparse(
+  return gen_sparse_ops.serialize_sparse(
       sp_input.indices,
       sp_input.values,
       sp_input.dense_shape,
@@ -1476,7 +1450,7 @@ def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return gen_sparse_ops._serialize_many_sparse(
+  return gen_sparse_ops.serialize_many_sparse(
       sp_input.indices,
       sp_input.values,
       sp_input.dense_shape,
@@ -1541,7 +1515,7 @@ def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
 
   """
   output_indices, output_values, output_shape = (
-      gen_sparse_ops._deserialize_sparse(serialized_sparse, dtype, name=name))
+      gen_sparse_ops.deserialize_sparse(serialized_sparse, dtype, name=name))
 
   # Feed rank data back in, if available
   output_indices.set_shape([None, rank])
@@ -1610,7 +1584,7 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
     All of the serialized `SparseTensor`s must have had the same rank and type.
   """
   output_indices, output_values, output_shape = (
-      gen_sparse_ops._deserialize_many_sparse(
+      gen_sparse_ops.deserialize_many_sparse(
           serialized_sparse, dtype, name=name))
 
   # Feed rank data back in, if available
@@ -1828,7 +1802,7 @@ def sparse_tensor_dense_matmul(sp_a,
   with ops.name_scope(name, "SparseTensorDenseMatMul",
                       [sp_a.indices, sp_a.values, b]) as name:
     b = ops.convert_to_tensor(b, name="b")
-    return gen_sparse_ops._sparse_tensor_dense_mat_mul(
+    return gen_sparse_ops.sparse_tensor_dense_mat_mul(
         a_indices=sp_a.indices,
         a_values=sp_a.values,
         a_shape=sp_a.dense_shape,
@@ -2046,7 +2020,7 @@ def _add_sparse_to_tensors_map(sp_input,
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return gen_sparse_ops._add_sparse_to_tensors_map(
+  return gen_sparse_ops.add_sparse_to_tensors_map(
       sp_input.indices,
       sp_input.values,
       sp_input.dense_shape,
@@ -2086,7 +2060,7 @@ def _add_many_sparse_to_tensors_map(sp_input,
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return gen_sparse_ops._add_many_sparse_to_tensors_map(
+  return gen_sparse_ops.add_many_sparse_to_tensors_map(
       sp_input.indices,
       sp_input.values,
       sp_input.dense_shape,
@@ -2167,7 +2141,7 @@ def _take_many_sparse_from_tensors_map(sparse_map_op,
   with ops.colocate_with(sparse_map_op):
     shared_name = sparse_map_op.get_attr("shared_name") or sparse_map_op.name
     output_indices, output_values, output_shape = (
-        gen_sparse_ops._take_many_sparse_from_tensors_map(
+        gen_sparse_ops.take_many_sparse_from_tensors_map(
             sparse_handles,
             dtype=sparse_map_op.get_attr("T"),
             container=sparse_map_op.get_attr("container"),
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 6d7eaababcd94d687ff20dddc35c68a98320a19b..6204adef3bb5dc96dab4a16bf05824d32627fccc 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -14,9 +14,7 @@
 # ==============================================================================
 """Arithmetic Operations that don't fit into math_ops due to dependencies.
 
-To avoid circular dependencies, some math_ops should go here.  Documentation
-callouts, e.g. "@@my_op" should go in math_ops.  To the user, these are just
-normal math_ops.
+To avoid circular dependencies, some math_ops should go here.
 """
 
 from __future__ import absolute_import
@@ -163,7 +161,7 @@ def einsum(equation, *inputs, **kwargs):
     if '...' in equation:
       raise ValueError('Subscripts with ellipses are not yet supported.')
 
-    match = re.match('([a-z,]+)(->[a-z]*)?', equation)
+    match = re.match('^([a-zA-Z,]+)(->[a-zA-Z]*)?$', equation)
     if not match:
       raise ValueError('Indices have incorrect format: %s' % equation)
 
@@ -402,7 +400,7 @@ def _exponential_space_einsum(equation, *inputs):
   if '...' in equation:
     raise ValueError('Subscripts with ellipses are not yet supported.')
 
-  match = re.match('([a-z,]+)(->[a-z]*)?', equation)
+  match = re.match('^([a-zA-Z,]+)(->[a-zA-Z]*)?$', equation)
   if not match:
     raise ValueError('Indices have incorrect format: %s' % equation)
 
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 2c212f45483eacfd3fd27eecb8d7b2c846b5fe96..d7c3a7e8dc7c2ad611cf47718dddcf38700ce304 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -192,6 +192,9 @@ class EinsumTest(test.TestCase):
       'abc,cba',
       'dba,ead,cad->bce',
       'aef,fbc,dca->bde',
+      'iJ,Jk->ik',
+      'iJ,Ki->JK',
+      'iJk,Jklm->Jk'
   ]
 
   long_cases = [
@@ -208,6 +211,8 @@ class EinsumTest(test.TestCase):
       'ijk ijk',
       'ij.jk->ik',
       'ij...,jk...->ik...',
+      'ij,k ->kji',
+      'ij,k-> kji',
 
       # axis in output that does not exist
       'ij,jk->im',
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/spectral_ops.py
index a5796882768a87c76e0acdec9b3d99caf41e02eb..28054f50ef3b1227f12376b4b3700a7618270d65 100644
--- a/tensorflow/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/spectral_ops.py
@@ -12,22 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Spectral operators (e.g. DCT, FFT, RFFT).
-
-@@dct
-@@fft
-@@ifft
-@@fft2d
-@@ifft2d
-@@fft3d
-@@ifft3d
-@@rfft
-@@irfft
-@@rfft2d
-@@irfft2d
-@@rfft3d
-@@irfft3d
-"""
+"""Spectral operators (e.g. DCT, FFT, RFFT)."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -40,7 +25,6 @@ from tensorflow.python.framework import tensor_util as _tensor_util
 from tensorflow.python.ops import array_ops as _array_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import math_ops as _math_ops
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -249,5 +233,3 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disabl
       dct2 *= weights
 
     return dct2
-
-remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index f6d9111009dc4f6a58ac81e7071ed7fe406600fa..a2d24711e2291bafcf5736c6206ceb09ac210453 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -22,17 +22,17 @@ from __future__ import print_function
 
 import sys as _sys
 
+# pylint: disable=g-bad-import-order
 # Imports the following modules so that @RegisterGradient get executed.
 from tensorflow.python.ops import array_grad
+from tensorflow.python.ops import cudnn_rnn_grad
 from tensorflow.python.ops import data_flow_grad
 from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
-from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
 from tensorflow.python.ops import tensor_array_grad
-from tensorflow.python.util.all_util import remove_undocumented
 
 
 # go/tf-wildcard-import
@@ -60,6 +60,7 @@ from tensorflow.python.ops.io_ops import *
 from tensorflow.python.ops.linalg_ops import *
 from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
+from tensorflow.python.ops.logging_ops import timestamp
 from tensorflow.python.ops.lookup_ops import initialize_all_tables
 from tensorflow.python.ops.lookup_ops import tables_initializer
 from tensorflow.python.ops.manip_ops import *
@@ -79,6 +80,8 @@ from tensorflow.python.ops.state_ops import scatter_add
 from tensorflow.python.ops.state_ops import scatter_div
 from tensorflow.python.ops.state_ops import scatter_mul
 from tensorflow.python.ops.state_ops import scatter_sub
+from tensorflow.python.ops.state_ops import scatter_min
+from tensorflow.python.ops.state_ops import scatter_max
 from tensorflow.python.ops.state_ops import scatter_update
 from tensorflow.python.ops.state_ops import scatter_nd_add
 from tensorflow.python.ops.state_ops import scatter_nd_sub
@@ -93,212 +96,5 @@ from tensorflow.python.ops.tensor_array_ops import *
 from tensorflow.python.ops.variable_scope import *
 from tensorflow.python.ops.variables import *
 # pylint: enable=wildcard-import
+# pylint: enable=g-bad-import-order
 
-#### For use in remove_undocumented below:
-from tensorflow.python.framework import constant_op as _constant_op
-from tensorflow.python.ops import array_ops as _array_ops
-from tensorflow.python.ops import check_ops as _check_ops
-from tensorflow.python.ops import clip_ops as _clip_ops
-from tensorflow.python.ops import confusion_matrix as _confusion_matrix
-from tensorflow.python.ops import control_flow_ops as _control_flow_ops
-from tensorflow.python.ops import data_flow_ops as _data_flow_ops
-from tensorflow.python.ops import functional_ops as _functional_ops
-from tensorflow.python.ops import gradients as _gradients
-from tensorflow.python.ops import histogram_ops as _histogram_ops
-from tensorflow.python.ops import init_ops as _init_ops
-from tensorflow.python.ops import io_ops as _io_ops
-from tensorflow.python.ops import linalg_ops as _linalg_ops
-from tensorflow.python.ops import logging_ops as _logging_ops
-from tensorflow.python.ops import manip_ops as _manip_ops
-from tensorflow.python.ops import math_ops as _math_ops
-from tensorflow.python.ops import numerics as _numerics
-from tensorflow.python.ops import parsing_ops as _parsing_ops
-from tensorflow.python.ops import partitioned_variables as _partitioned_variables
-from tensorflow.python.ops import random_ops as _random_ops
-from tensorflow.python.ops import script_ops as _script_ops
-from tensorflow.python.ops import session_ops as _session_ops
-from tensorflow.python.ops import sparse_ops as _sparse_ops
-from tensorflow.python.ops import special_math_ops as _special_math_ops
-from tensorflow.python.ops import state_ops as _state_ops
-from tensorflow.python.ops import string_ops as _string_ops
-from tensorflow.python.ops import template as _template
-from tensorflow.python.ops import tensor_array_ops as _tensor_array_ops
-from tensorflow.python.ops import variable_scope as _variable_scope
-from tensorflow.python.ops import variables as _variables
-
-
-_allowed_symbols_math_ops = [
-    # TODO(drpng): decide if we want to reference these in the documentation.
-    "reduced_shape",
-    "sparse_segment_mean_grad",
-    "sparse_segment_sqrt_n_grad",
-
-    # Legacy: will be removed.
-    "arg_max",
-    "arg_min",
-    "lin_space",
-    "sparse_matmul",  # Use tf.matmul.
-    # Deprecated (see versions.h):
-    "batch_fft",
-    "batch_fft2d",
-    "batch_fft3d",
-    "batch_ifft",
-    "batch_ifft2d",
-    "batch_ifft3d",
-    "mul",  # use tf.multiply instead.
-    "neg",  # use tf.negative instead.
-    "sub",  # use tf.subtract instead.
-
-    # These are documented in nn.
-    # We are not importing nn because it would create a circular dependency.
-    "sigmoid",
-    "log_sigmoid",
-    "tanh",
-]
-
-_allowed_symbols_array_ops = [
-    # TODO(drpng): make sure they are documented.
-    # Scalars:
-    "NEW_AXIS",
-    "SHRINK_AXIS",
-    "newaxis",
-
-    # Documented in training.py.
-    # I do not import train, to avoid circular dependencies.
-    # TODO(drpng): this is defined in gen_array_ops, clearly not the right
-    # place.
-    "stop_gradient",
-
-    # See gen_docs_combined for tf.copy documentation.
-    "copy",
-
-    ## TODO(drpng): make them inaccessible directly.
-    ## TODO(drpng): Below, to-doc means that we need to find an appropriate
-    ##  documentation section to reference.
-    ## For re-exporting to tf.*:
-    "constant",
-    "edit_distance",  # to-doc
-    # From gen_array_ops:
-    "copy_host",  # to-doc
-    "immutable_const",  # to-doc
-    "invert_permutation",  # to-doc
-    "quantize_and_dequantize",  # to-doc
-
-    # TODO(drpng): legacy symbols to be removed.
-    "list_diff",  # Use tf.listdiff instead.
-    "batch_matrix_diag",
-    "batch_matrix_band_part",
-    "batch_matrix_diag_part",
-    "batch_matrix_set_diag",
-]
-
-_allowed_symbols_partitioned_variables = [
-    "PartitionedVariable",   # Requires doc link.
-    # Legacy.
-    "create_partitioned_variables",
-    "variable_axis_size_partitioner",
-    "min_max_variable_partitioner",
-    "fixed_size_partitioner",
-]
-
-_allowed_symbols_control_flow_ops = [
-    # TODO(drpng): Find a place in the documentation to reference these or
-    # remove.
-    "control_trigger",
-    "loop_cond",
-    "merge",
-    "switch",
-]
-
-_allowed_symbols_functional_ops = [
-    "nest",  # Used by legacy code.
-]
-
-_allowed_symbols_gradients = [
-    # Documented in training.py:
-    # Not importing training.py to avoid complex graph dependencies.
-    "AggregationMethod",
-    "gradients",  # tf.gradients = gradients.gradients
-    "hessians",
-]
-
-_allowed_symbols_clip_ops = [
-    # Documented in training.py:
-    # Not importing training.py to avoid complex graph dependencies.
-    "clip_by_average_norm",
-    "clip_by_global_norm",
-    "clip_by_norm",
-    "clip_by_value",
-    "global_norm",
-]
-
-_allowed_symbols_image_ops = [
-    # Documented in training.py.
-    # We are not importing training.py to avoid complex dependencies.
-    "audio_summary",
-    "histogram_summary",
-    "image_summary",
-    "merge_all_summaries",
-    "merge_summary",
-    "scalar_summary",
-
-    # TODO(drpng): link in training.py if it should be documented.
-    "get_summary_op",
-]
-
-_allowed_symbols_variable_scope_ops = [
-    "get_local_variable",  # Documented in framework package.
-]
-
-_allowed_symbols_misc = [
-    "deserialize_many_sparse",
-    "parse_single_sequence_example",
-    "serialize_many_sparse",
-    "serialize_sparse",
-    "confusion_matrix",
-]
-
-_allowed_symbols = (_allowed_symbols_array_ops +
-                    _allowed_symbols_clip_ops +
-                    _allowed_symbols_control_flow_ops +
-                    _allowed_symbols_functional_ops +
-                    _allowed_symbols_image_ops +
-                    _allowed_symbols_gradients +
-                    _allowed_symbols_math_ops +
-                    _allowed_symbols_variable_scope_ops +
-                    _allowed_symbols_misc +
-                    _allowed_symbols_partitioned_variables)
-
-remove_undocumented(__name__, _allowed_symbols, [
-    _sys.modules[__name__],
-    _array_ops,
-    _check_ops,
-    _clip_ops,
-    _confusion_matrix,
-    _control_flow_ops,
-    _constant_op,
-    _data_flow_ops,
-    _functional_ops,
-    _gradients,
-    _histogram_ops,
-    _init_ops,
-    _io_ops,
-    _linalg_ops,
-    _logging_ops,
-    _manip_ops,
-    _math_ops,
-    _numerics,
-    _parsing_ops,
-    _partitioned_variables,
-    _random_ops,
-    _script_ops,
-    _session_ops,
-    _sparse_ops,
-    _special_math_ops,
-    _state_ops,
-    _string_ops,
-    _template,
-    _tensor_array_ops,
-    _variable_scope,
-    _variables,
-])
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 6c0a090d16bb328de40f02edf9865a0e0a62d385..94d7458ec8735836566033faae95a3aed3af1824 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -13,69 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Variables. See the @{$python/state_ops} guide.
-
-@@AUTO_REUSE
-@@IndexedSlices
-@@Saver
-@@Variable
-@@VariableScope
-@@all_variables
-@@assert_variables_initialized
-@@assign
-@@assign_add
-@@assign_sub
-@@constant_initializer
-@@export_meta_graph
-@@fixed_size_partitioner
-@@get_checkpoint_state
-@@get_local_variable
-@@get_variable
-@@get_variable_scope
-@@global_variables
-@@global_variables_initializer
-@@glorot_normal_initializer
-@@glorot_uniform_initializer
-@@import_meta_graph
-@@initialize_all_tables
-@@initialize_all_variables
-@@initialize_local_variables
-@@initialize_variables
-@@is_variable_initialized
-@@latest_checkpoint
-@@local_variables
-@@local_variables_initializer
-@@make_template
-@@min_max_variable_partitioner
-@@model_variables
-@@moving_average_variables
-@@no_regularizer
-@@ones_initializer
-@@orthogonal_initializer
-@@random_normal_initializer
-@@random_uniform_initializer
-@@report_uninitialized_variables
-@@scatter_add
-@@scatter_div
-@@scatter_mul
-@@scatter_nd_add
-@@scatter_nd_sub
-@@scatter_nd_update
-@@scatter_sub
-@@scatter_update
-@@sparse_mask
-@@tables_initializer
-@@trainable_variables
-@@truncated_normal_initializer
-@@uniform_unit_scaling_initializer
-@@update_checkpoint_state
-@@variable_axis_size_partitioner
-@@variable_op_scope
-@@variable_scope
-@@variables_initializer
-@@variance_scaling_initializer
-@@zeros_initializer
-"""
+"""Variables. See the @{$python/state_ops} guide."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -99,8 +37,8 @@ def variable_op(shape, dtype, name="Variable", set_shape=True, container="",
   """Deprecated. Used variable_op_v2 instead."""
   if not set_shape:
     shape = tensor_shape.unknown_shape()
-  ret = gen_state_ops._variable(shape=shape, dtype=dtype, name=name,
-                                container=container, shared_name=shared_name)
+  ret = gen_state_ops.variable(shape=shape, dtype=dtype, name=name,
+                               container=container, shared_name=shared_name)
   # TODO(mrry): Move this to where it is used, so we can get rid of this op
   #   wrapper?
   if set_shape:
@@ -127,11 +65,12 @@ def variable_op_v2(shape, dtype, name="Variable", container="", shared_name=""):
   Returns:
     A variable tensor.
   """
-  return gen_state_ops._variable_v2(shape=shape,
-                                    dtype=dtype,
-                                    name=name,
-                                    container=container,
-                                    shared_name=shared_name)
+  return gen_state_ops.variable_v2(
+      shape=shape,
+      dtype=dtype,
+      name=name,
+      container=container,
+      shared_name=shared_name)
 
 
 def init_variable(v, init, name="init"):
@@ -185,7 +124,7 @@ def is_variable_initialized(ref, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.is_variable_initialized(ref=ref, name=name)
   # Handle resource variables.
-  if context.in_eager_mode() or ref.op.type == "VarHandleOp":
+  if context.executing_eagerly() or ref.op.type == "VarHandleOp":
     return gen_resource_variable_ops.var_is_initialized_op(ref.handle,
                                                            name=name)
 
@@ -420,3 +359,55 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
       ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
       use_locking, name)]):
     return ref.read_value()
+
+
+@tf_export("scatter_add")
+def scatter_add(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Adds sparse updates to the variable referenced by `resource`.
+
+  This operation computes
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] += updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] += updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the updated value.
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions add.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+  </div>
+
+  Args:
+    ref: A `Variable`.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to store in `ref`.
+    use_locking: An optional `bool`. Defaults to `True`.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    Same as `ref`.  Returned as a convenience for operations that want
+    to use the updated values after the update is done.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_add(ref, indices, updates,
+                                     use_locking=use_locking, name=name)
+  return ref._lazy_read(gen_resource_variable_ops.resource_scatter_add(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index b8c39d91b41790c6441594b175e8eaa03620e1ec..9f58c6a476c34df4624e2be2d9ac442bada212fa 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -16,17 +16,6 @@
 """Operations for working with string Tensors.
 
 See the @{$python/string_ops} guide.
-
-@@string_to_hash_bucket_fast
-@@string_to_hash_bucket_strong
-@@string_to_hash_bucket
-@@reduce_join
-@@string_join
-@@string_split
-@@substr
-@@as_string
-@@encode_base64
-@@decode_base64
 """
 
 from __future__ import absolute_import
@@ -93,10 +82,8 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
   delimiter = ops.convert_to_tensor(delimiter, dtype=dtypes.string)
   source = ops.convert_to_tensor(source, dtype=dtypes.string)
 
-  # pylint: disable=protected-access
-  indices, values, shape = gen_string_ops._string_split(
+  indices, values, shape = gen_string_ops.string_split(
       source, delimiter=delimiter, skip_empty=skip_empty)
-  # pylint: enable=protected-access
   indices.set_shape([None, 2])
   values.set_shape([None])
   shape.set_shape([2])
@@ -141,6 +128,7 @@ def reduce_join(inputs, axis=None,
 reduce_join.__doc__ = deprecation.rewrite_argument_docstring(
     gen_string_ops.reduce_join.__doc__, "reduction_indices", "axis")
 
+ops.NotDifferentiable("RegexReplace")
 ops.NotDifferentiable("StringToHashBucket")
 ops.NotDifferentiable("StringToHashBucketFast")
 ops.NotDifferentiable("StringToHashBucketStrong")
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 37b80d5e20bf06c041a669c14ac6d88201af2180..a793f634bda06ad43991fb978f865a2c5fe25437 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -23,6 +23,7 @@ import re
 
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import distribute
 
 
 def collect(val, collections, default_collections):
@@ -42,6 +43,16 @@ def collect(val, collections, default_collections):
 _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
+def skip_summary():
+  # If using multiple towers in distributed strategy, skip summaries on all
+  # towers except the first one (tower_id=0).
+  # TODO(priyag): Add a new optional argument that will provide multiple
+  # alternatives to override default behavior. (e.g. run on last tower,
+  # compute sum or mean across towers).
+  tower_context = distribute.get_tower_context()
+  return tower_context and tower_context.tower_id > 0
+
+
 def clean_tag(name):
   """Cleans a tag. Removes illegal characters for instance.
 
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
index 7f4f4ce5ab4ee2bd309932cb81f05775996371d6..ec4d4a6e9242107fd7f4bebe1416198457e32cee 100644
--- a/tensorflow/python/ops/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops.py
@@ -13,12 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 """Summary Operations."""
-# pylint: disable=protected-access
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import summary_op_util
@@ -72,9 +72,11 @@ def tensor_summary(name,
 
   serialized_summary_metadata = summary_metadata.SerializeToString()
 
+  if summary_op_util.skip_summary():
+    return constant_op.constant("")
   with summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
-    val = gen_logging_ops._tensor_summary_v2(
+    val = gen_logging_ops.tensor_summary_v2(
         tensor=tensor,
         tag=tag,
         name=scope,
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/python/ops/summary_ops_v2.py
similarity index 85%
rename from tensorflow/contrib/summary/summary_ops.py
rename to tensorflow/python/ops/summary_ops_v2.py
index b6249fc92f712b21197c2167fb5d1c4af1f48ca5..b80f84eb7cde264c5a7c83eafacc344adb50b80a 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -26,15 +26,15 @@ import time
 
 import six
 
-from tensorflow.contrib.summary import gen_summary_ops
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.layers import utils
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_summary_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
@@ -74,10 +74,12 @@ def record_summaries_every_n_global_steps(n, global_step=None):
     global_step = training_util.get_or_create_global_step()
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
-  with ops.device("cpu:0"):
-    collection_ref[:] = [math_ops.equal(global_step % n, 0)]
-  yield
-  collection_ref[:] = old
+  try:
+    with ops.device("cpu:0"):
+      collection_ref[:] = [math_ops.equal(global_step % n, 0)]
+    yield
+  finally:
+    collection_ref[:] = old
 
 
 @tf_contextlib.contextmanager
@@ -85,9 +87,11 @@ def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
-  collection_ref[:] = [True]
-  yield
-  collection_ref[:] = old
+  try:
+    collection_ref[:] = [True]
+    yield
+  finally:
+    collection_ref[:] = old
 
 
 @tf_contextlib.contextmanager
@@ -95,9 +99,11 @@ def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
-  collection_ref[:] = [False]
-  yield
-  collection_ref[:] = old
+  try:
+    collection_ref[:] = [False]
+    yield
+  finally:
+    collection_ref[:] = old
 
 
 class SummaryWriter(object):
@@ -108,9 +114,11 @@ class SummaryWriter(object):
   - @{tf.contrib.summary.create_db_writer}
   """
 
-  def  __init__(self, resource):
+  def  __init__(self, resource, init_op_fn):
     self._resource = resource
-    if context.in_eager_mode() and self._resource is not None:
+    # TODO(nickfelt): cache constructed ops in graph mode
+    self._init_op_fn = init_op_fn
+    if context.executing_eagerly() and self._resource is not None:
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="cpu:0")
 
@@ -125,13 +133,39 @@ class SummaryWriter(object):
       yield self
     else:
       old = context.context().summary_writer_resource
-      context.context().summary_writer_resource = self._resource
-      yield self
-      # Flushes the summary writer in eager mode or in graph functions, but not
-      # in legacy graph mode (you're on your own there).
+      try:
+        context.context().summary_writer_resource = self._resource
+        yield self
+        # Flushes the summary writer in eager mode or in graph functions, but
+        # not in legacy graph mode (you're on your own there).
+        with ops.device("cpu:0"):
+          gen_summary_ops.flush_summary_writer(self._resource)
+      finally:
+        context.context().summary_writer_resource = old
+
+
+  def init(self):
+    """Operation to initialize the summary writer resource."""
+    if self._resource is not None:
+      return self._init_op_fn()
+
+  def _flush(self):
+    return _flush_fn(writer=self)
+
+  def flush(self):
+    """Operation to force the summary writer to flush any buffered data."""
+    if self._resource is not None:
+      return self._flush()
+
+  def _close(self):
+    with ops.control_dependencies([self.flush()]):
       with ops.device("cpu:0"):
-        gen_summary_ops.flush_summary_writer(self._resource)
-      context.context().summary_writer_resource = old
+        return gen_summary_ops.close_summary_writer(self._resource)
+
+  def close(self):
+    """Operation to flush and close the summary writer resource."""
+    if self._resource is not None:
+      return self._close()
 
 
 def initialize(
@@ -158,7 +192,7 @@ def initialize(
       @{tf.contrib.summary.SummaryWriter}.
     ValueError: If session wasn't passed and no default session.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return
   if context.context().summary_writer_resource is None:
     raise RuntimeError("No default tf.contrib.summary.SummaryWriter found")
@@ -178,7 +212,7 @@ def create_file_writer(logdir,
                        flush_millis=None,
                        filename_suffix=None,
                        name=None):
-  """Creates a summary file writer in the current context.
+  """Creates a summary file writer in the current context under the given name.
 
   Args:
     logdir: a string, or None. If a string, creates a summary file writer
@@ -186,18 +220,20 @@ def create_file_writer(logdir,
      a mock object which acts like a summary writer but does nothing,
      useful to use as a context manager.
     max_queue: the largest number of summaries to keep in a queue; will
-     flush once the queue gets bigger than this.
-    flush_millis: the largest interval between flushes.
-    filename_suffix: optional suffix for the event file name.
+     flush once the queue gets bigger than this. Defaults to 10.
+    flush_millis: the largest interval between flushes. Defaults to 120,000.
+    filename_suffix: optional suffix for the event file name. Defaults to `.v2`.
     name: Shared name for this SummaryWriter resource stored to default
-      Graph.
+      Graph. Defaults to the provided logdir prefixed with `logdir:`. Note: if a
+      summary writer resource with this shared name already exists, the returned
+      SummaryWriter wraps that resource and the other arguments have no effect.
 
   Returns:
     Either a summary writer or an empty object which can be used as a
     summary writer.
   """
   if logdir is None:
-    return SummaryWriter(None)
+    return SummaryWriter(None, None)
   with ops.device("cpu:0"):
     if max_queue is None:
       max_queue = constant_op.constant(10)
@@ -205,6 +241,8 @@ def create_file_writer(logdir,
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
       filename_suffix = constant_op.constant(".v2")
+    if name is None:
+      name = "logdir:" + logdir
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
@@ -267,13 +305,12 @@ def create_db_writer(db_uri,
 
 def _make_summary_writer(name, factory, **kwargs):
   resource = gen_summary_ops.summary_writer(shared_name=name)
+  init_op_fn = lambda: factory(resource, **kwargs)
   # TODO(apassos): Consider doing this instead.
-  # node = factory(resource, **kwargs)
-  # if not context.in_eager_mode():
-  #   ops.get_default_session().run(node)
-  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME,
-                        factory(resource, **kwargs))
-  return SummaryWriter(resource)
+  # if not context.executing_eagerly():
+  #   ops.get_default_session().run(init_op)
+  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME, init_op_fn())
+  return SummaryWriter(resource, init_op_fn)
 
 
 def _cleanse_string(name, pattern, value):
@@ -295,7 +332,7 @@ def all_summary_ops():
   Returns:
     The summary ops.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return None
   return ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
 
@@ -309,7 +346,7 @@ def summary_writer_initializer_op():
   Raises:
     RuntimeError: If in Eager mode.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError(
         "tf.contrib.summary.summary_writer_initializer_op is only "
         "supported in graph mode.")
@@ -328,8 +365,12 @@ def summary_writer_function(name, tensor, function, family=None):
   Returns:
     The result of writing the summary.
   """
+  name_scope = ops.get_name_scope()
+  if name_scope:
+    # Add a slash to allow reentering the name scope.
+    name_scope += "/"
   def record():
-    with summary_op_util.summary_scope(
+    with ops.name_scope(name_scope), summary_op_util.summary_scope(
         name, family, values=[tensor]) as (tag, scope):
       with ops.control_dependencies([function(tag, scope)]):
         return constant_op.constant(True)
@@ -337,7 +378,7 @@ def summary_writer_function(name, tensor, function, family=None):
   if context.context().summary_writer_resource is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
-    op = utils.smart_cond(
+    op = smart_cond.smart_cond(
         should_record_summaries(), record, _nothing, name="")
     ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
   return op
@@ -477,7 +518,7 @@ def graph(param, step=None, name=None):
   Raises:
     TypeError: If `param` isn't already a @{tf.Tensor} in graph mode.
   """
-  if not context.in_eager_mode() and not isinstance(param, ops.Tensor):
+  if not context.executing_eagerly() and not isinstance(param, ops.Tensor):
     raise TypeError("graph() needs a tf.Tensor (e.g. tf.placeholder) in graph "
                     "mode, but was: %s" % type(param))
   writer = context.context().summary_writer_resource
@@ -534,7 +575,14 @@ def flush(writer=None, name=None):
     writer = context.context().summary_writer_resource
     if writer is None:
       return control_flow_ops.no_op()
-  return gen_summary_ops.flush_summary_writer(writer, name=name)
+  else:
+    if isinstance(writer, SummaryWriter):
+      writer = writer._resource  # pylint: disable=protected-access
+  with ops.device("cpu:0"):
+    return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
+_flush_fn = flush  # for within SummaryWriter.flush()
 
 
 def eval_dir(model_dir, name=None):
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 424582b348d87d8a5b043ec9b771d8f2768a5994..9b6b8c508fcd7e73e6259d14b466892544ec65d7 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -26,6 +26,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpointable
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.deprecation import deprecated
@@ -203,7 +204,7 @@ def make_template_internal(name_,
   if kwargs:
     func_ = tf_decorator.make_decorator(func_, functools.partial(
         func_, **kwargs))
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     if unique_name_ is not None:
       raise ValueError(
           "unique_name_ cannot be used when eager exeuction is enabled.")
@@ -230,7 +231,7 @@ def _skip_common_stack_elements(stacktrace, base_case):
   return stacktrace[-1:]
 
 
-class Template(object):
+class Template(checkpointable.CheckpointableBase):
   """Wrap a function to aid in variable sharing.
 
   Templates are functions that create variables the first time they are called
@@ -294,12 +295,115 @@ class Template(object):
     # which is not the same as whether the scope has been created.
     self._variables_created = False
 
+  @property
+  def _checkpoint_dependencies(self):
+    """Sanity checking for object-based saving.
+
+    Does not override Checkpointable dependency tracking, but checks that
+    variables accessible through Checkpointable dependencies on other `Template`
+    objects include all of the variable_scope-filtered `Template.variables`.
+
+    Returns:
+      A list of checkpointable.CheckpointableReference objects.
+    Raises:
+      ValueError: If this object is not compatible with object-based saving.
+    """
+    dependencies = super(Template, self)._checkpoint_dependencies
+    dependency_variables = []
+    for _, dependency in dependencies:
+      if isinstance(dependency, Template):
+        dependency_variables.extend(dependency.variables)
+      else:
+        dependency_variables.append(dependency)
+    dependency_variables = set(dependency_variables)
+    not_included_variables = []
+    for expected_variable in sorted(self.variables, key=lambda v: v.name):
+      if expected_variable not in dependency_variables:
+        not_included_variables.append(expected_variable)
+    if not_included_variables:
+      # Trying to save a Template which improperly tracks its variables.
+      raise ValueError(
+          ("The Template '%s' references variables which are not included via "
+           "object-based dependency tracking. Most likely a custom "
+           "getter/creator was registered which does not call Template's "
+           "custom variable creator (which is responsible for tracking "
+           "dependencies).\n\nExpected these variables to be dependencies: %s")
+          % (self, not_included_variables))
+    return dependencies
+
+  def _checkpointable_custom_creator(self, next_creator, name, initial_value,
+                                     checkpointable_parent=None, **kwargs):
+    """A variable creation hook which adds Checkpointable dependencies.
+
+    Set during the `Template`'s first wrapped function execution. Ensures that
+    (a) `Template` objects depend on `Template`s created inside them which
+    create variables, and (b) that any variables not in a more deeply nested
+    `Template` are added as dependencies directly.
+
+    The `checkpointable_parent` argument is passed between `Template` custom
+    creators but ignored when the variable object itself is created. This
+    argument indicates (if not `None`) that a more deeply nested `Template` has
+    already added the variable as a dependency, and that parent `Template`s
+    should add a dependency on that `Template` rather than on the variable
+    directly.
+
+    Args:
+      next_creator: See `variable_scope.variable_creator_scope`; the next
+        creator in the chain.
+      name: The (full, scope-influenced) name of the variable. The scope name
+        for the Template itself is stripped for the purposes of object-based
+        dependency tracking, but scopes within Templates are respected.
+      initial_value: See `variable_scope.variable_creator_scope`. Taken
+        explicitly so the argument can be re-named and used with
+        `Checkpointable._add_variable_with_custom_getter`.
+      checkpointable_parent: If not None, a more deeply nested Template object
+        to add a dependency on (rather than depending on the variable directly).
+      **kwargs: Passed through to the next creator.
+    Returns:
+      The output of `next_creator`: the fetched/created variable object.
+    """
+    def _call_next_creator_renaming_initializer(initializer, **inner_kwargs):
+      inner_kwargs.pop("name")  # Ignored; this is the scope-stripped name which
+      # we don't want to propagate.
+      return next_creator(
+          initial_value=initializer,
+          name=name,
+          **inner_kwargs)
+    if name.startswith(self._variable_scope.name):
+      scope_stripped_name = name[len(self._variable_scope.name) + 1:]
+      if not checkpointable_parent:
+        return self._add_variable_with_custom_getter(
+            initializer=initial_value,
+            name=scope_stripped_name,
+            getter=_call_next_creator_renaming_initializer,
+            # Disable error checking for Checkpointable. Exceptions are instead
+            # raised if necessary when the object-based saver tries to
+            # save/restore the object.
+            overwrite=True,
+            checkpointable_parent=self,
+            **kwargs)
+      else:
+        self._track_checkpointable(
+            checkpointable_parent,
+            name=checkpointable_parent._variable_scope.name[  # pylint: disable=protected-access
+                len(self._variable_scope.name) + 1:],
+            overwrite=True)
+    return next_creator(name=name, initial_value=initial_value,
+                        checkpointable_parent=self, **kwargs)
+
   def _call_func(self, args, kwargs):
     try:
       vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
       trainable_at_start = len(
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-      result = self._func(*args, **kwargs)
+      if self._variables_created:
+        result = self._func(*args, **kwargs)
+      else:
+        # The first time we run, restore variables if necessary (via
+        # Checkpointable).
+        with variable_scope.variable_creator_scope(
+            self._checkpointable_custom_creator):
+          result = self._func(*args, **kwargs)
 
       if self._variables_created:
         # Variables were previously created, implying this is not the first
@@ -348,8 +452,7 @@ class Template(object):
       # Only reuse variables if they were already created.
       with variable_scope.variable_scope(
           self._variable_scope, reuse=self._variables_created):
-        result = self._call_func(args, kwargs)
-      return result
+        return self._call_func(args, kwargs)
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
@@ -357,8 +460,7 @@ class Template(object):
           self._unique_name, self._name,
           custom_getter=self._custom_getter) as vs:
         self._variable_scope = vs
-        result = self._call_func(args, kwargs)
-        return result
+        return self._call_func(args, kwargs)
 
   @property
   def name(self):
@@ -479,7 +581,7 @@ class _EagerTemplateVariableStore(object):
       if self._variable_scope_name is None:
         raise RuntimeError("A variable scope must be set before an "
                            "_EagerTemplateVariableStore object exits.")
-      self._eager_variable_store._store.close_variable_subscopes(  # pylint: disable=protected-access
+      variable_scope.get_variable_scope_store().close_variable_subscopes(
           self._variable_scope_name)
 
   def _variables_in_scope(self, variable_list):
@@ -543,7 +645,7 @@ class EagerTemplate(Template):
     Raises:
       RuntimeError: if eager execution is not enabled.
     """
-    if not context.in_eager_mode():
+    if not context.executing_eagerly():
       raise RuntimeError(
           "{} objects can only be used when eager execution is enabled, use "
           "tf.Template for graph construction".
@@ -563,7 +665,14 @@ class EagerTemplate(Template):
     try:
       vars_at_start = self._template_store.variables()
       trainable_at_start = self._template_store.trainable_variables()
-      result = self._func(*args, **kwargs)
+      if self._variables_created:
+        result = self._func(*args, **kwargs)
+      else:
+        # The first time we run, restore variables if necessary (via
+        # Checkpointable).
+        with variable_scope.variable_creator_scope(
+            self._checkpointable_custom_creator):
+          result = self._func(*args, **kwargs)
 
       if self._variables_created:
         # Variables were previously created, implying this is not the first
@@ -619,8 +728,7 @@ class EagerTemplate(Template):
             self._variable_scope, reuse=variable_scope.AUTO_REUSE)
       with self._variable_scope_context_manager:
         with self._template_store.as_default():
-          result = self._call_func(args, kwargs)
-      return result
+          return self._call_func(args, kwargs)
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
@@ -632,8 +740,7 @@ class EagerTemplate(Template):
         # store's variable scope name is unset; set it here.
         self._template_store.set_variable_scope_name(vs.name)
         with self._template_store.as_default():
-          result = self._call_func(args, kwargs)
-        return result
+          return self._call_func(args, kwargs)
 
   @property
   def name(self):
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 3c08870146e447d84d4a5f620cbead633d94751f..d2f45ce37bbbbb58eb35aaeccd4c9497ee498f41 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorArray: a dynamically sized array of Tensors.
-
-@@TensorArray
-"""
+"""TensorArray: a dynamically sized array of Tensors."""
 # Mixture of pep8 and non-pep8 names, so disable pylint bad-name
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -148,7 +145,7 @@ class _GraphTensorArray(object):
         # will retroactively set the device value of this op.
         def create():
           """Create the TensorArray op."""
-          return gen_data_flow_ops._tensor_array_v3(
+          return gen_data_flow_ops.tensor_array_v3(
               dtype=dtype,
               size=size,
               element_shape=element_shape,
@@ -237,7 +234,7 @@ class _GraphTensorArray(object):
       flow = self.flow
     with ops.name_scope(name, "TensorArrayGrad", [self._handle]):
       with ops.colocate_with(self._handle):
-        g_handle, unused_flow = gen_data_flow_ops._tensor_array_grad_v3(
+        g_handle, unused_flow = gen_data_flow_ops.tensor_array_grad_v3(
             handle=self._handle, source=source, flow_in=flow, name=name)
         with ops.control_dependencies([g_handle]):
           flow = array_ops.identity(flow, name="gradient_flow")
@@ -252,7 +249,7 @@ class _GraphTensorArray(object):
 
   def read(self, index, name=None):
     """See TensorArray."""
-    value = gen_data_flow_ops._tensor_array_read_v3(
+    value = gen_data_flow_ops.tensor_array_read_v3(
         handle=self._handle,
         index=index,
         flow_in=self._flow,
@@ -270,7 +267,7 @@ class _GraphTensorArray(object):
       if self._infer_shape:
         self._merge_element_shape(value.shape)
       with self._maybe_colocate_with(value):
-        flow_out = gen_data_flow_ops._tensor_array_write_v3(
+        flow_out = gen_data_flow_ops.tensor_array_write_v3(
             handle=self._handle,
             index=index,
             value=value,
@@ -296,7 +293,7 @@ class _GraphTensorArray(object):
       element_shape = self._element_shape[0]
     else:
       element_shape = tensor_shape.TensorShape(None)
-    value = gen_data_flow_ops._tensor_array_gather_v3(
+    value = gen_data_flow_ops.tensor_array_gather_v3(
         handle=self._handle,
         indices=indices,
         flow_in=self._flow,
@@ -314,7 +311,7 @@ class _GraphTensorArray(object):
           tensor_shape.TensorShape(self._element_shape[0].dims[1:]))
     else:
       element_shape_except0 = tensor_shape.TensorShape(None)
-    value, _ = gen_data_flow_ops._tensor_array_concat_v3(
+    value, _ = gen_data_flow_ops.tensor_array_concat_v3(
         handle=self._handle,
         flow_in=self._flow,
         dtype=self._dtype,
@@ -338,10 +335,10 @@ class _GraphTensorArray(object):
     with ops.name_scope(name, "TensorArrayScatter",
                         [self._handle, value, indices]):
       value = ops.convert_to_tensor(value, name="value")
-      if self._infer_shape and context.in_graph_mode():
+      if self._infer_shape and not context.executing_eagerly():
         self._merge_element_shape(value.shape[1:])
       with self._maybe_colocate_with(value):
-        flow_out = gen_data_flow_ops._tensor_array_scatter_v3(
+        flow_out = gen_data_flow_ops.tensor_array_scatter_v3(
             handle=self._handle,
             indices=indices,
             value=value,
@@ -363,14 +360,14 @@ class _GraphTensorArray(object):
       value = ops.convert_to_tensor(value, name="value")
       with self._maybe_colocate_with(value):
         lengths_64 = math_ops.to_int64(lengths)
-        if self._infer_shape and context.in_graph_mode():
+        if self._infer_shape and not context.executing_eagerly():
           clengths = tensor_util.constant_value(lengths_64)
           if value.shape.dims is not None:
             if clengths is not None and clengths.max() == clengths.min():
               self._merge_element_shape(
                   tensor_shape.TensorShape([clengths[0]]).concatenate(
                       value.shape[1:]))
-        flow_out = gen_data_flow_ops._tensor_array_split_v3(
+        flow_out = gen_data_flow_ops.tensor_array_split_v3(
             handle=self._handle,
             value=value,
             lengths=lengths_64,
@@ -386,13 +383,13 @@ class _GraphTensorArray(object):
 
   def size(self, name=None):
     """See TensorArray."""
-    return gen_data_flow_ops._tensor_array_size_v3(
+    return gen_data_flow_ops.tensor_array_size_v3(
         handle=self._handle, flow_in=self.flow, name=name)
 
   @tf_should_use.should_use_result
   def close(self, name=None):
     """See TensorArray."""
-    return gen_data_flow_ops._tensor_array_close_v3(
+    return gen_data_flow_ops.tensor_array_close_v3(
         handle=self._handle, name=name)
 
 # pylint: enable=protected-access
@@ -774,10 +771,10 @@ class TensorArray(object):
       ValueError: if both handle and tensor_array_name are provided.
       TypeError: if handle is provided but is not a Tensor.
     """
-    if context.in_graph_mode():
-      implementation = _GraphTensorArray
-    else:
+    if context.executing_eagerly():
       implementation = _EagerTensorArray
+    else:
+      implementation = _GraphTensorArray
 
     self._implementation = implementation(
         dtype,
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 81565a63774da49628d100ef071b02f6311f6af2..ba213ef884165f7f72094d27932913e39c9a5901 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -24,6 +24,7 @@ import copy
 import enum  # pylint: disable=g-bad-import-order
 import functools
 import sys
+import threading
 import traceback
 
 import six
@@ -211,23 +212,8 @@ class _VariableStore(object):
     """Create a variable store."""
     self._vars = {}  # A dictionary of the stored TensorFlow variables.
     self._partitioned_vars = {}  # A dict of the stored PartitionedVariables.
-    self.variable_scopes_count = {}  # Count re-used variable scopes.
     self._store_eager_variables = False
 
-  def open_variable_scope(self, scope_name):
-    if scope_name in self.variable_scopes_count:
-      self.variable_scopes_count[scope_name] += 1
-    else:
-      self.variable_scopes_count[scope_name] = 1
-
-  def close_variable_subscopes(self, scope_name):
-    for k in self.variable_scopes_count:
-      if not scope_name or k.startswith(scope_name + "/"):
-        self.variable_scopes_count[k] = 0
-
-  def variable_scope_count(self, scope_name):
-    return self.variable_scopes_count.get(scope_name, 0)
-
   def get_variable(self, name, shape=None, dtype=dtypes.float32,
                    initializer=None, regularizer=None, reuse=None,
                    trainable=True, collections=None, caching_device=None,
@@ -321,7 +307,18 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
-    if context.in_eager_mode():
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # Variable creation and initialization takes place in `init_scope`s;
+        # as such, if an `init_scope` lifts us into the eager context, then we
+        # need to use `ResourceVariable`s.
+        use_resource = True
+
+    # Note that it's fine to reuse eager variables whose initialization was
+    # lifted from a function-building graph into the eager context (that's why
+    # the following clause is not wrapped in an `init_scope`); lifted variables
+    # are tracked by the graph's `VariableStore`.
+    if context.executing_eagerly():
       if not self._store_eager_variables and reuse:
         raise RuntimeError(
             "When eager execution is enabled variable reuse is only supported"
@@ -329,7 +326,6 @@ class _VariableStore(object):
             " EagerVariableStore for example usage.")
       if self._store_eager_variables:
         reuse = AUTO_REUSE
-      use_resource = True
 
     # If a *_ref type is passed in an error would be triggered further down the
     # stack. We prevent this using base_dtype to get a non-ref version of the
@@ -518,7 +514,7 @@ class _VariableStore(object):
         when violating reuse during variable creation, or if an existing
         sharded variable exists for the given name but with different sharding.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise NotImplementedError("Partitioned variables are not yet supported "
                                 "when eager execution is enabled.")
 
@@ -798,7 +794,7 @@ class _VariableStore(object):
         validate_shape=validate_shape,
         constraint=constraint,
         use_resource=use_resource)
-    if context.in_graph_mode() or self._store_eager_variables:
+    if not context.executing_eagerly() or self._store_eager_variables:
       # In eager mode we do not want to keep default references to Variable
       # objects as this will prevent their memory from being released.
       self._vars[name] = v
@@ -811,12 +807,12 @@ class _VariableStore(object):
         with ops.name_scope(name + "/Regularizer/"):
           loss = regularizer(v)
         if loss is not None:
-          if context.in_graph_mode():
-            v_name = v.name
-            loss_name = loss.name
-          else:
+          if context.executing_eagerly():
             v_name = "v_%s" % type(v)
             loss_name = "loss_%s" % type(loss)
+          else:
+            v_name = v.name
+            loss_name = loss.name
           logging.vlog(1, "Applied regularizer to %s and added the result %s "
                        "to REGULARIZATION_LOSSES.", v_name, loss_name)
           ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, loss)
@@ -920,7 +916,7 @@ class VariableScope(object):
     self._dtype = dtype
     self._use_resource = use_resource
     self._constraint = constraint
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       if self._caching_device is not None:
         raise NotImplementedError("Caching devices is not yet supported "
                                   "when eager execution is enabled.")
@@ -988,7 +984,7 @@ class VariableScope(object):
 
   def set_use_resource(self, use_resource):
     """Sets whether to use ResourceVariables for this scope."""
-    if context.in_eager_mode() and not use_resource:
+    if context.executing_eagerly() and not use_resource:
       raise ValueError("When eager execution is enabled, "
                        "use_resource cannot be set to false.")
     self._use_resource = use_resource
@@ -999,14 +995,14 @@ class VariableScope(object):
 
   def set_caching_device(self, caching_device):
     """Set caching_device for this scope."""
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise NotImplementedError("Caching devices are not yet supported "
                                 "when eager execution is enabled.")
     self._caching_device = caching_device
 
   def set_partitioner(self, partitioner):
     """Set partitioner for this scope."""
-    if partitioner and context.in_eager_mode():
+    if partitioner and context.executing_eagerly():
       raise NotImplementedError("Partitioned variables are not yet supported "
                                 "when eager execution is enabled.")
     self._partitioner = partitioner
@@ -1057,14 +1053,14 @@ class VariableScope(object):
       partitioner = self._partitioner
     if custom_getter is None:
       custom_getter = self._custom_getter
-    if context.in_graph_mode():
+    if context.executing_eagerly():
+      reuse = False
+      use_resource = True
+    else:
       if reuse is None:
         reuse = self._reuse
       if use_resource is None:
         use_resource = self._use_resource
-    else:
-      reuse = False
-      use_resource = True
 
     full_name = self.name + "/" + name if self.name else name
     # Variable names only depend on variable_scope (full_name here),
@@ -1107,7 +1103,7 @@ class VariableScope(object):
                                 use_resource=None,
                                 constraint=None):
     """Gets an existing variable with this name or create a new one."""
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise NotImplementedError("Partitioned variables are not yet supported "
                                 "when eager execution is enabled.")
     if initializer is None:
@@ -1160,18 +1156,49 @@ class VariableScope(object):
 
 
 _VARSTORE_KEY = ("__variable_store",)
-_VARSCOPE_KEY = ("__varscope",)
+_VARSCOPESTORE_KEY = ("__varscope",)
+
+
+class _VariableScopeStore(threading.local):
+  """A thread local store for the current variable scope and scope counts."""
+
+  def __init__(self):
+    super(_VariableScopeStore, self).__init__()
+    self.current_scope = VariableScope(False)
+    self.variable_scopes_count = {}
+
+  def open_variable_scope(self, scope_name):
+    if scope_name in self.variable_scopes_count:
+      self.variable_scopes_count[scope_name] += 1
+    else:
+      self.variable_scopes_count[scope_name] = 1
+
+  def close_variable_subscopes(self, scope_name):
+    for k in list(self.variable_scopes_count.keys()):
+      if not scope_name or k.startswith(scope_name + "/"):
+        self.variable_scopes_count[k] = 0
+
+  def variable_scope_count(self, scope_name):
+    return self.variable_scopes_count.get(scope_name, 0)
+
+
+def get_variable_scope_store():
+  """Returns the variable scope store for current thread."""
+  scope_store = ops.get_collection(_VARSCOPESTORE_KEY)
+
+  if not scope_store:
+    scope_store = _VariableScopeStore()
+    ops.add_to_collection(_VARSCOPESTORE_KEY, scope_store)
+  else:
+    scope_store = scope_store[0]
+
+  return scope_store
 
 
 @tf_export("get_variable_scope")
 def get_variable_scope():
   """Returns the current variable scope."""
-  scope = ops.get_collection(_VARSCOPE_KEY)
-  if scope:  # This collection has at most 1 element, the default scope at [0].
-    return scope[0]
-  scope = VariableScope(False)
-  ops.add_to_collection(_VARSCOPE_KEY, scope)
-  return scope
+  return get_variable_scope_store().current_scope
 
 
 def _get_default_variable_store():
@@ -1274,6 +1301,9 @@ class EagerVariableStore(object):
     # pylint: enable=protected-access
 
 
+# The argument list for get_variable must match arguments to get_local_variable.
+# So, if you are updating the arguments, also update arguments to
+# get_local_variable below.
 @tf_export("get_variable")
 def get_variable(name,
                  shape=None,
@@ -1385,15 +1415,32 @@ get_variable.__doc__ = get_variable_or_local_docstring % (
     "GraphKeys.GLOBAL_VARIABLES")
 
 
-@functools.wraps(get_variable)
+# The argument list for get_local_variable must match arguments to get_variable.
+# So, if you are updating the arguments, also update arguments to get_variable.
 @tf_export("get_local_variable")
-def get_local_variable(*args, **kwargs):
-  kwargs["trainable"] = False
-  if "collections" in kwargs:
-    kwargs["collections"] += [ops.GraphKeys.LOCAL_VARIABLES]
+def get_local_variable(name,
+                       shape=None,
+                       dtype=None,
+                       initializer=None,
+                       regularizer=None,
+                       trainable=False,  # pylint: disable=unused-argument
+                       collections=None,
+                       caching_device=None,
+                       partitioner=None,
+                       validate_shape=True,
+                       use_resource=None,
+                       custom_getter=None,
+                       constraint=None):
+  if collections:
+    collections += [ops.GraphKeys.LOCAL_VARIABLES]
   else:
-    kwargs["collections"] = [ops.GraphKeys.LOCAL_VARIABLES]
-  return get_variable(*args, **kwargs)
+    collections = [ops.GraphKeys.LOCAL_VARIABLES]
+  return get_variable(
+      name, shape=shape, dtype=dtype, initializer=initializer,
+      regularizer=regularizer, trainable=False, collections=collections,
+      caching_device=caching_device, partitioner=partitioner,
+      validate_shape=validate_shape, use_resource=use_resource,
+      custom_getter=custom_getter, constraint=constraint)
 get_local_variable.__doc__ = get_variable_or_local_docstring % (
     "Gets an existing *local* variable or creates a new one.",
     "Behavior is the same as in `get_variable`, except that variables are\n"
@@ -1555,10 +1602,8 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
     self._dtype = dtype
     self._use_resource = use_resource
     self._constraint = constraint
-    get_variable_scope()  # Ensure that a default exists, then get a pointer.
-    # Get the reference to the collection as we want to modify it in place.
-    self._default_varscope = ops.get_collection_ref(_VARSCOPE_KEY)
     self._var_store = _get_default_variable_store()
+    self._var_scope_store = get_variable_scope_store()
     if isinstance(self._name_or_scope, VariableScope):
       self._new_name = self._name_or_scope.name
       name_scope = self._name_or_scope._name_scope  # pylint: disable=protected-access
@@ -1606,10 +1651,11 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
         a reuse scope, or if reuse is not `None` or `True`.
       TypeError: when the types of some arguments are not appropriate.
     """
-    self._old = self._default_varscope[0]
+    self._old = self._var_scope_store.current_scope
     if isinstance(self._name_or_scope, VariableScope):
-      self._var_store.open_variable_scope(self._new_name)
-      self._old_subscopes = copy.copy(self._var_store.variable_scopes_count)
+      self._var_scope_store.open_variable_scope(self._new_name)
+      self._old_subscopes = copy.copy(
+          self._var_scope_store.variable_scopes_count)
       variable_scope_object = self._cached_variable_scope_object
     else:
       # Handler for the case when we just prolong current variable scope.
@@ -1652,17 +1698,17 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
         variable_scope_object.set_dtype(self._dtype)
       if self._use_resource is not None:
         variable_scope_object.set_use_resource(self._use_resource)
-      self._var_store.open_variable_scope(self._new_name)
-    self._default_varscope[0] = variable_scope_object
+      self._var_scope_store.open_variable_scope(self._new_name)
+    self._var_scope_store.current_scope = variable_scope_object
     return variable_scope_object
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
     # If jumping out from a non-prolonged scope, restore counts.
     if isinstance(self._name_or_scope, VariableScope):
-      self._var_store.variable_scopes_count = self._old_subscopes
+      self._var_scope_store.variable_scopes_count = self._old_subscopes
     else:
-      self._var_store.close_variable_subscopes(self._new_name)
-    self._default_varscope[0] = self._old
+      self._var_scope_store.close_variable_subscopes(self._new_name)
+    self._var_scope_store.current_scope = self._old
 
 
 def _maybe_wrap_custom_getter(custom_getter, old_getter):
@@ -1687,13 +1733,13 @@ def _maybe_wrap_custom_getter(custom_getter, old_getter):
 
 def _get_unique_variable_scope(prefix):
   """Get a name with the given prefix unique in the current variable scope."""
-  var_store = _get_default_variable_store()
+  var_scope_store = get_variable_scope_store()
   current_scope = get_variable_scope()
   name = current_scope.name + "/" + prefix if current_scope.name else prefix
-  if var_store.variable_scope_count(name) == 0:
+  if var_scope_store.variable_scope_count(name) == 0:
     return prefix
   idx = 1
-  while var_store.variable_scope_count(name + ("_%d" % idx)) > 0:
+  while var_scope_store.variable_scope_count(name + ("_%d" % idx)) > 0:
     idx += 1
   return prefix + ("_%d" % idx)
 
@@ -1709,9 +1755,10 @@ class variable_scope(object):
   graph, ensures that graph is the default graph, and pushes a name scope and a
   variable scope.
 
-  If `name_or_scope` is not None, it is used as is. If `scope` is None, then
-  `default_name` is used.  In that case, if the same name has been previously
-  used in the same scope, it will be made unique by appending `_N` to it.
+  If `name_or_scope` is not None, it is used as is. If `name_or_scope` is None,
+  then `default_name` is used.  In that case, if the same name has been
+  previously used in the same scope, it will be made unique by appending `_N`
+  to it.
 
   Variable scope allows you to create new variables and to share already created
   ones while providing checks to not create or share by accident. For details,
@@ -1790,6 +1837,32 @@ class variable_scope(object):
   discouraged) to pass False to the reuse argument, yielding undocumented
   behaviour slightly different from None. Starting at 1.1.0 passing None and
   False as reuse has exactly the same effect.
+
+  A note about using variable scopes in multi-threaded environment: Variable
+  scopes are thread local, so one thread will not see another thread's current
+  scope. Also, when using `default_name`, unique scopes names are also generated
+  only on a per thread basis. If the same name was used within a different
+  thread, that doesn't prevent a new thread from creating the same scope.
+  However, the underlying variable store is shared across threads (within the
+  same graph). As such, if another thread tries to create a new variable with
+  the same name as a variable created by a previous thread, it will fail unless
+  reuse is True.
+
+  Further, each thread starts with an empty variable scope. So if you wish to
+  preserve name prefixes from a scope from the main thread, you should capture
+  the main thread's scope and re-enter it in each thread. For e.g.
+
+  ```
+  main_thread_scope = variable_scope.get_variable_scope()
+
+  # Thread's target function:
+  def thread_target_fn(captured_scope):
+    with variable_scope.variable_scope(captured_scope):
+      # .... regular code for this thread
+
+
+  thread = threading.Thread(target=thread_target_fn, args=(main_thread_scope,))
+  ```
   """
 
   def __init__(self,
@@ -1871,7 +1944,7 @@ class variable_scope(object):
       raise ValueError("The reuse parameter must be True or False or None.")
     if self._values is None:
       self._values = []
-    self._in_graph_mode = not context.in_eager_mode()
+    self._in_graph_mode = not context.executing_eagerly()
     if self._in_graph_mode:
       self._graph = ops._get_graph_from_inputs(self._values)  # pylint: disable=protected-access
     self._cached_pure_variable_scope = None
@@ -2111,13 +2184,13 @@ def default_variable_creator(next_creator=None, **kwargs):
   use_resource = kwargs.get("use_resource", None)
   if use_resource is None:
     use_resource = get_variable_scope().use_resource
-  if use_resource or (use_resource is None and context.in_eager_mode()):
+  if use_resource or (use_resource is None and context.executing_eagerly()):
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
         caching_device=caching_device, name=name, dtype=dtype,
         constraint=constraint)
-  elif not use_resource and context.in_eager_mode():
+  elif not use_resource and context.executing_eagerly():
     raise RuntimeError(
         "VariableScope should use resource variable when eager execution is"
         " enabled, but use_resource is False."
@@ -2145,7 +2218,7 @@ def variable(initial_value=None,
              constraint=None,
              use_resource=None):
   previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
-  for getter in ops.get_default_graph()._get_variable_creator_stack():  # pylint: disable=protected-access
+  for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
     previous_getter = _make_getter(getter, previous_getter)
   return previous_getter(initial_value=initial_value,
                          trainable=trainable,
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index b785d0ede7433ab3105a3c865b998a8d19b6ea78..c646f795896f0abfce3eb9a57cadc27299714023 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -125,8 +125,8 @@ class Variable(checkpointable.CheckpointableBase):
 
   @compatibility(eager)
   `tf.Variable` is not compatible with eager execution.  Use
-  `tfe.Variable` instead which is compatible with both eager execution
-  and graph construction.  See [the TensorFlow Eager Execution
+  `tf.contrib.eager.Variable` instead which is compatible with both eager
+  execution and graph construction.  See [the TensorFlow Eager Execution
   guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
   for details on how variables work in eager execution.
   @end_compatibility
@@ -210,10 +210,11 @@ class Variable(checkpointable.CheckpointableBase):
     for details on how variables work in eager execution.
     @end_compatibility
     """
-    if not context.in_graph_mode():
-      raise RuntimeError("tf.Variable not supported in Eager mode. "
-                         "Please use tfe.Variable instead")
-    self._in_graph_mode = context.in_graph_mode()
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "tf.Variable not supported when eager execution is enabled. "
+          "Please use tf.contrib.eager.Variable instead")
+    self._in_graph_mode = True
     if variable_def:
       # If variable_def is provided, recreates the variable from its fields.
       if initial_value:
@@ -234,7 +235,7 @@ class Variable(checkpointable.CheckpointableBase):
           constraint=constraint)
 
   def __repr__(self):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       return "<tf.Variable '%s' shape=%s dtype=%s, numpy=%s>" % (
           self.name, self.get_shape(), self.dtype.name,
           ops.numpy_text(self.read_value(), is_repr=True))
@@ -292,6 +293,7 @@ class Variable(checkpointable.CheckpointableBase):
     Raises:
       ValueError: If the initial value is not specified, or does not have a
         shape and `validate_shape` is `True`.
+      RuntimeError: If lifted into the eager context.
     """
     _ = expected_shape
     if initial_value is None:
@@ -307,6 +309,9 @@ class Variable(checkpointable.CheckpointableBase):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
+    # Store the graph key so optimizers know how to only retrieve variables from
+    # this graph.
+    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     if isinstance(initial_value, checkpointable.CheckpointInitialValue):
       self._maybe_initialize_checkpointable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
@@ -315,6 +320,11 @@ class Variable(checkpointable.CheckpointableBase):
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     with ops.init_scope():
+      # Ensure that we weren't lifted into the eager context.
+      if context.executing_eagerly():
+        raise RuntimeError(
+            "tf.Variable not supported when eager execution is enabled. "
+            "Please use tf.contrib.eager.Variable instead")
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
 
@@ -737,15 +747,15 @@ class Variable(checkpointable.CheckpointableBase):
     Raises:
         ValueError: Session is not passed and no default session
     """
-    if context.in_graph_mode():
+    if context.executing_eagerly():
+      self.assign(value)
+    else:
       session = session or ops.get_default_session()
       if session is None:
         raise ValueError(
             "Either session argument should be provided or default session "
             "should be established")
       session.run(self._initializer_op, {self._initializer_op.inputs[1]: value})
-    else:
-      self.assign(value)
 
   # Conversion to tensor.
   @staticmethod
@@ -792,17 +802,7 @@ class Variable(checkpointable.CheckpointableBase):
 
     setattr(Variable, operator, _run_op)
 
-  def _scatter_tensors_from_checkpoint(self, attributes):
-    """For implementing `Checkpointable`. Return an assignment op to run."""
-    if (len(attributes) != 1
-        or checkpointable.VARIABLE_VALUE_KEY not in attributes):
-      raise ValueError(
-          ("The variable %s was restored with unexpected values (expected one "
-           "with key %s, got %s)") % (
-               self, checkpointable.VARIABLE_VALUE_KEY, attributes))
-    return self.assign(attributes[checkpointable.VARIABLE_VALUE_KEY])
-
-  def _gather_tensors_for_checkpoint(self):
+  def _gather_saveables_for_checkpoint(self):
     """For implementing `Checkpointable`. This object is saveable on its own."""
     return {checkpointable.VARIABLE_VALUE_KEY: self}
 
@@ -1255,9 +1255,9 @@ class PartitionedVariable(object):
         information does not match `shape`, or `partitions` has invalid values.
       RuntimeError: If eager execution is enabled
     """
-    if not context.in_graph_mode():
-      raise RuntimeError("tf.PartitionedVariable not supported in "
-                         "eager mode. Please use tfe.Variable instead")
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "tf.PartitionedVariable not supported with eager execution enabled.")
     if not isinstance(variable_list, (list, tuple)):
       raise TypeError(
           "variable_list is not a list or tuple: %s" % variable_list)
@@ -1548,7 +1548,7 @@ def variables_initializer(var_list, name="init"):
   Returns:
     An Op that run the initializers of all the specified variables.
   """
-  if var_list and context.in_graph_mode():
+  if var_list and not context.executing_eagerly():
     return control_flow_ops.group(*[v.initializer for v in var_list], name=name)
   return control_flow_ops.no_op(name=name)
 
@@ -1570,7 +1570,7 @@ def global_variables_initializer():
   Returns:
     An Op that initializes global variables in the graph.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return control_flow_ops.no_op(name="global_variables_initializer")
   return variables_initializer(global_variables())
 
@@ -1592,7 +1592,7 @@ def local_variables_initializer():
   Returns:
     An Op that initializes all local variables in the graph.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return control_flow_ops.no_op(name="local_variables_initializer")
   return variables_initializer(local_variables())
 
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index cce64c0ccafc29a9d0d0b51b4c97c5673264657b..4c91bc3652dc77274acfbf43859c03fad8a46a38 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -22,7 +22,6 @@ import errno as _errno
 import sys as _sys
 
 from tensorflow.python.platform import flags
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -125,11 +124,3 @@ def run(main=None, argv=None):
   # to the final program.
   _sys.exit(main(argv))
 
-
-_allowed_symbols = [
-    'run',
-    # Allowed submodule.
-    'flags',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index dbefca2be9615b18418a92f4cbe0b1a0b2917449..478dd46f7e6965f8727e5741f2ccdfdc69247980 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -229,3 +229,25 @@ _COPY_TYPEMAPS(unsigned int, mode_t);
 %define final %enddef
 %define override %enddef
 #endif
+
+// Typemaps to automatically raise a Python exception from bad output TF_Status.
+// TODO(b/77295559): expand this to all TF_Status* output params and deprecate
+// raise_exception_on_not_ok_status (currently it only affects the C API).
+%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
+  $1 = TF_NewStatus();
+}
+
+%typemap(freearg) (TF_Status* status) {
+ TF_DeleteStatus($1);
+}
+
+%typemap(argout) TF_Status* status {
+  TF_Code code = TF_GetCode($1);
+  if (code != TF_OK) {
+    PyObject* exc = tensorflow::PyExceptionRegistry::Lookup(code);
+    // Arguments to OpError.
+    PyObject* exc_args = Py_BuildValue("sss", nullptr, nullptr, TF_Message($1));
+    SWIG_SetErrorObj(exc, exc_args);
+    SWIG_fail;
+  }
+}
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index 315889e9aa8851138bf8b07b9803cc2d360f354a..fd697d70bf200f1f661b410a9636d7b60e87f430 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -33,7 +33,6 @@ from tensorflow.python.lib.io.file_io import rename as Rename
 from tensorflow.python.lib.io.file_io import stat as Stat
 from tensorflow.python.lib.io.file_io import walk as Walk
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -56,24 +55,3 @@ class FastGFile(_FileIO):
 # Does not alias to Open so that we use our version of GFile to strip
 # 'b' mode.
 Open = GFile
-
-# TODO(drpng): Find the right place to document these.
-_allowed_symbols = [
-    'Copy',
-    'DeleteRecursively',
-    'Exists',
-    'FastGFile',
-    'GFile',
-    'Glob',
-    'IsDirectory',
-    'ListDirectory',
-    'Open',
-    'MakeDirs',
-    'MkDir',
-    'Remove',
-    'Rename',
-    'Stat',
-    'Walk',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 96219faab719e28a5fa8a9a21c83f81a6f8478e6..8141cf92c568f257a5e9810318182d71f445dfa1 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -36,6 +36,7 @@ from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 Benchmark = benchmark.TensorFlowBenchmark  # pylint: disable=invalid-name
@@ -138,6 +139,7 @@ def StatefulSessionAvailable():
   return False
 
 
+@tf_export('test.StubOutForTesting')
 class StubOutForTesting(object):
   """Support class for stubbing methods out for unit testing.
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index 8f7b12e2b2b92d9b2bfe397d0e7cba59e11bc1f6..b2d95518552de3a170d1c04bfc3f061dc8f8f54a 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -12,14 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Resource management library.
-
-@@get_data_files_path
-@@get_path_to_datafile
-@@get_root_dir_with_all_resources
-@@load_resource
-@@readahead_file_path
-"""
+"""Resource management library."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -28,7 +21,6 @@ import os as _os
 import sys as _sys
 
 from tensorflow.python.util import tf_inspect as _inspect
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -129,7 +121,3 @@ def get_path_to_datafile(path):
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
-
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py
index 5c50fa023dc3b216838390d9356a39e70e2362d2..7b6c9d19d0bb5ce779d1ed668feeffc548c900f8 100644
--- a/tensorflow/python/platform/sysconfig.py
+++ b/tensorflow/python/platform/sysconfig.py
@@ -13,13 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""System configuration library.
-
-@@get_include
-@@get_lib
-@@get_compile_flags
-@@get_link_flags
-"""
+"""System configuration library."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -28,7 +22,6 @@ import os.path as _os_path
 
 from tensorflow.python.framework.versions import CXX11_ABI_FLAG as _CXX11_ABI_FLAG
 from tensorflow.python.framework.versions import MONOLITHIC_BUILD as _MONOLITHIC_BUILD
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -68,7 +61,6 @@ def get_compile_flags():
   """
   flags = []
   flags.append('-I%s' % get_include())
-  flags.append('-I%s/external/nsync/public' % get_include())
   flags.append('-D_GLIBCXX_USE_CXX11_ABI=%d' % _CXX11_ABI_FLAG)
   return flags
 
@@ -85,6 +77,3 @@ def get_link_flags():
     flags.append('-L%s' % get_lib())
     flags.append('-ltensorflow_framework')
   return flags
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 9b7655722ac5a917f2753617f8e99bf2bd2f8d11..9ffb48c4a5626ddbec289ba890d63e2e22429fa7 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -19,19 +19,6 @@ See the @{$python/test} guide.
 
 Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock`
 depending on the python version.
-
-@@main
-@@TestCase
-@@test_src_dir_path
-@@assert_equal_graph_def
-@@get_temp_dir
-@@is_built_with_cuda
-@@is_gpu_available
-@@gpu_device_name
-@@compute_gradient
-@@compute_gradient_error
-@@create_local_cluster
-
 """
 
 from __future__ import absolute_import
@@ -42,7 +29,6 @@ from __future__ import print_function
 # pylint: disable=g-bad-import-order
 from tensorflow.python.framework import test_util as _test_util
 from tensorflow.python.platform import googletest as _googletest
-from tensorflow.python.util.all_util import remove_undocumented
 
 # pylint: disable=unused-import
 from tensorflow.python.framework.test_util import assert_equal_graph_def
@@ -62,6 +48,8 @@ if sys.version_info.major == 2:
 else:
   from unittest import mock  # pylint: disable=g-import-not-at-top
 
+tf_export('test.mock')(mock)
+
 # Import Benchmark class
 Benchmark = _googletest.Benchmark  # pylint: disable=invalid-name
 
@@ -106,13 +94,3 @@ def test_src_dir_path(relative_path):
 def is_built_with_cuda():
   """Returns whether TensorFlow was built with CUDA (GPU) support."""
   return _test_util.IsGoogleCudaEnabled()
-
-
-_allowed_symbols = [
-    # We piggy-back googletest documentation.
-    'Benchmark',
-    'mock',
-    'StubOutForTesting',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 22aabfd7121ac9b2eebeae2693f174e044d504ef..5962d2f220f5faab5072dd21e16a0963d391e2e8 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -34,7 +34,6 @@ import threading
 
 import six
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -287,35 +286,8 @@ def _get_thread_id():
 
 _log_prefix = google2_log_prefix
 
-# Controls which methods from pyglib.logging are available within the project.
-# Do not add methods here without also adding to platform/tf_logging.py.
-_allowed_symbols = [
-    'DEBUG',
-    'ERROR',
-    'FATAL',
-    'INFO',
-    'TaskLevelStatusMessage',
-    'WARN',
-    'debug',
-    'error',
-    'fatal',
-    'flush',
-    'get_verbosity',
-    'info',
-    'log',
-    'log_if',
-    'log_every_n',
-    'log_first_n',
-    'set_verbosity',
-    'vlog',
-    'warn',
-    'warning',
-]
-
 tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
 tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
 tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
 tf_export('logging.INFO').export_constant(__name__, 'INFO')
 tf_export('logging.WARN').export_constant(__name__, 'WARN')
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index c815aad0a065eaba4a0dc52487b5ee67e271a146..0654104a3436366bb5fe88e2c3415cc957cbfde8 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -156,18 +156,3 @@ py_test(
         "@com_google_pprof//:pprof_proto_py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 362a1c49e64118134a4039ae3a5d939ed0b6d730..994206cd63a915de93bc109e7b217ad997c787a7 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -70,18 +70,3 @@ cuda_py_test(
         "no_pip",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 0e20ca35bba606079ed5b0f225dd3029772b5af3..acf02096fffe8b38e68824878fa698ed69d3895c 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -172,7 +172,7 @@ class Profiler(object):
       op_log: optional. tensorflow::tfprof::OpLogProto proto. Used to define
           extra op types.
     """
-    if not graph and context.in_graph_mode():
+    if not graph and not context.executing_eagerly():
       graph = ops.get_default_graph()
     self._coverage = 0.0
     self._graph = graph
@@ -336,7 +336,7 @@ def profile(graph=None,
     If cmd is 'op' or 'code', returns MultiGraphNodeProto proto.
     Side effect: stdout/file/timeline.json depending on options['output']
   """
-  if not graph and context.in_graph_mode():
+  if not graph and not context.executing_eagerly():
     graph = ops.get_default_graph()
 
   if options == _DEFAULT_PROFILE_OPTIONS:
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index fa7f30b236997cecd6d5df98c334aa7f5cc571e4..efbdd1ba6842d85e82149346e9b4559527a1aacd 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -30,7 +30,6 @@ from tensorflow.python.profiler.model_analyzer import Profiler
 from tensorflow.python.profiler.option_builder import ProfileOptionBuilder
 from tensorflow.python.profiler.tfprof_logger import write_op_log
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -54,11 +53,3 @@ tf_export('profiler.GraphNodeProto')(GraphNodeProto)
 tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
 tf_export('profiler.AdviceProto')(AdviceProto)
 tf_export('profiler.OpLogProto')(OpLogProto)
-
-remove_undocumented(__name__, _allowed_symbols, [
-    Profiler,
-    profile,
-    ProfileOptionBuilder,
-    advise,
-    write_op_log,
-])
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index 8d121064967f2f87cd0aefaa361bfd6f387a3e6e..e651de32ea3bce32a965bfbeefc76ff08a79ac38 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -156,7 +156,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None,
   Returns:
     tmp_op_log: Merged OpLogProto proto.
   """
-  if not graph and context.in_graph_mode():
+  if not graph and not context.executing_eagerly():
     graph = ops.get_default_graph()
 
   tmp_op_log = tfprof_log_pb2.OpLogProto()
@@ -210,7 +210,7 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
     add_trace: Whether to add python code trace information.
         Used to support "code" view.
   """
-  if not graph and context.in_graph_mode():
+  if not graph and not context.executing_eagerly():
     graph = ops.get_default_graph()
   op_log = merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
diff --git a/tensorflow/python/profiler/tfprof_logger_test.py b/tensorflow/python/profiler/tfprof_logger_test.py
index 141144f98776f3aa7c95b9ef743022aeca5084e1..caf3869f56d94a272218a0e318f2fdf5cb238795 100644
--- a/tensorflow/python/profiler/tfprof_logger_test.py
+++ b/tensorflow/python/profiler/tfprof_logger_test.py
@@ -38,7 +38,7 @@ class TFProfLoggerTest(test.TestCase):
     return math_ops.matmul(a, b)
 
   # pylint: disable=pointless-string-statement
-  """# TODO(xpan): This this out of core so it doesn't depend on contrib.
+  """# TODO(xpan): This out of core so it doesn't depend on contrib.
   def testFillMissingShape(self):
     a, b, y = self._BuildSmallPlaceholderlModel()
     run_options = config_pb2.RunOptions(
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 50f481d29e9d39bd12741b5f9e02b7201336134d..5ee55301df986998b22b8b57b5f01b1f6b4918ac 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+%include "tensorflow/python/platform/base.i"
+
 %ignore "";
 
 %rename("%s") TFE_NewContext;
@@ -26,12 +28,18 @@ limitations under the License.
 %rename("%s") TFE_ContextClearCaches;
 %rename("%s") TFE_ContextGetDevicePlacementPolicy;
 %rename("%s") TFE_ContextSetThreadLocalDevicePlacementPolicy;
+%rename("%s") TFE_ContextSetAsyncForThread;
+%rename("%s") TFE_ContextAsyncWait;
+%rename("%s") TFE_ContextAsyncClearError;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_RegisterExceptionClass;
+%rename("%s") TFE_Py_RegisterBackwardFunctionGetter;
 %rename("%s") TFE_Py_RegisterFallbackExceptionClass;
+%rename("%s") TFE_Py_RegisterResourceVariableType;
 %rename("%s") TFE_Py_Execute;
 %rename("%s") TFE_Py_FastPathExecute;
+%rename("%s") TFE_Py_RecordGradient;
 %rename("%s") TFE_Py_UID;
 %rename("%s") TFE_Py_TapeSetNew;
 %rename("%s") TFE_Py_TapeSetRemove;
@@ -48,6 +56,7 @@ limitations under the License.
 %rename("%s") TFE_NewContextOptions;
 %rename("%s") TFE_ContextOptionsSetConfig;
 %rename("%s") TFE_ContextOptionsSetDevicePlacementPolicy;
+%rename("%s") TFE_ContextOptionsSetAsync;
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
 
@@ -111,9 +120,9 @@ limitations under the License.
 
 }
 %typemap(out) (TFE_Context*) {
-  if ($1 == nullptr) {
-    SWIG_fail;
-  } else {
+  // When the TFE_Context* returned is a nullptr, we expect the status is not
+  // OK. This will raise an error (happens in another typemap).
+  if ($1 != nullptr) {
     $result = PyCapsule_New($1, nullptr, TFE_DeleteContextCapsule);
   }
 }
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 30e0a099d8b2e30cff36b69164ba9f1789dd8916..2609a5d222659f6ebf775d6baa48bd7bc39fd7f6 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -235,15 +235,3 @@ py_test(
 
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py
index 766b0a3579f8b67c94e4f0e7342eecbf67dec077..be49c70c60476ae8b95c07007abb32a222466958 100644
--- a/tensorflow/python/saved_model/builder.py
+++ b/tensorflow/python/saved_model/builder.py
@@ -26,10 +26,3 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.python.saved_model.builder_impl import SavedModelBuilder
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "SavedModelBuilder",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 7347da75364818b95d3f2ad7dfa74a8c3614b161..3447d917e9bf2dace3de784106dadb1fcc3a9647 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -193,7 +193,8 @@ class SavedModelBuilder(object):
   def _validate_tensor_info(self, tensor_info):
     """Validates the `TensorInfo` proto.
 
-    Checks if the `name` and `dtype` fields exist and are non-empty.
+    Checks if the `encoding` (`name` or `coo_sparse`) and `dtype` fields exist
+    and are non-empty.
 
     Args:
       tensor_info: `TensorInfo` protocol buffer to validate.
@@ -206,10 +207,12 @@ class SavedModelBuilder(object):
       raise AssertionError(
           "All TensorInfo protos used in the SignatureDefs must have the name "
           "and dtype fields set.")
-    if not tensor_info.name:
+    if tensor_info.WhichOneof("encoding") is None:
+      # TODO(soergel) validate each of the fields of coo_sparse
       raise AssertionError(
-          "All TensorInfo protos used in the SignatureDefs must have the name "
-          "field set: %s" % tensor_info)
+          "All TensorInfo protos used in the SignatureDefs must have one of "
+          "the 'encoding' fields (e.g., name or coo_sparse) set: %s"
+          % tensor_info)
     if tensor_info.dtype is types_pb2.DT_INVALID:
       raise AssertionError(
           "All TensorInfo protos used in the SignatureDefs must have the dtype "
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index ec49a0539ff52f6cc69bb24483ede657b698ab8d..34206c6f6d49f1901cacd7a8cbd632968b862ad6 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 # Subdirectory name containing the asset files.
@@ -66,17 +65,3 @@ tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
 VARIABLES_FILENAME = "variables"
 tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
     __name__, "VARIABLES_FILENAME")
-
-
-_allowed_symbols = [
-    "ASSETS_DIRECTORY",
-    "ASSETS_KEY",
-    "LEGACY_INIT_OP_KEY",
-    "MAIN_OP_KEY",
-    "SAVED_MODEL_SCHEMA_VERSION",
-    "SAVED_MODEL_FILENAME_PB",
-    "SAVED_MODEL_FILENAME_PBTXT",
-    "VARIABLES_DIRECTORY",
-    "VARIABLES_FILENAME",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/loader.py b/tensorflow/python/saved_model/loader.py
index 0a7f516287a4a685a148443beab219af1ab326a4..334298c232e1171d0fa0a31b49dcb50f83e90f4b 100644
--- a/tensorflow/python/saved_model/loader.py
+++ b/tensorflow/python/saved_model/loader.py
@@ -67,11 +67,3 @@ from __future__ import print_function
 from tensorflow.python.saved_model.loader_impl import load
 from tensorflow.python.saved_model.loader_impl import maybe_saved_model_directory
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "load",
-    "maybe_saved_model_directory",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/main_op.py b/tensorflow/python/saved_model/main_op.py
index 04cadeab663f0083cb9b325ac96b6d8c903c8cb5..18d11b900c8dc21182db73d4d4de8655122a9c9c 100644
--- a/tensorflow/python/saved_model/main_op.py
+++ b/tensorflow/python/saved_model/main_op.py
@@ -26,10 +26,3 @@ from __future__ import print_function
 from tensorflow.python.saved_model.main_op_impl import main_op
 from tensorflow.python.saved_model.main_op_impl import main_op_with_restore
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    "main_op",
-    "main_op_with_restore",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index caabd7bc30455b55e89711a1ccab6238971f595e..6702c99607136475cdf096f863ccd0bbddd57845 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -34,18 +34,3 @@ from tensorflow.python.saved_model import utils
 from tensorflow.python.saved_model.simple_save import *
 # pylint: enable=wildcard-import
 
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "builder",
-    "constants",
-    "loader",
-    "main_op",
-    "signature_constants",
-    "signature_def_utils",
-    "simple_save",
-    "tag_constants",
-    "utils",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index d9d316882584470769c14cf0c5f265b58e37ab43..804255375e7c5215597a5dcca02f3b32f2c0a497 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -94,7 +94,7 @@ class SavedModelTest(test.TestCase):
     self.assertEqual(expected_asset_file_name, asset.filename)
     self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
 
-  def _validate_inputs_tensor_info(self, builder, tensor_info):
+  def _validate_inputs_tensor_info_fail(self, builder, tensor_info):
     with self.test_session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
@@ -107,7 +107,18 @@ class SavedModelTest(test.TestCase):
           sess, ["foo"],
           signature_def_map={"foo_key": foo_signature})
 
-  def _validate_outputs_tensor_info(self, builder, tensor_info):
+  def _validate_inputs_tensor_info_accept(self, builder, tensor_info):
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      foo_signature = signature_def_utils.build_signature_def({
+          "foo_inputs": tensor_info
+      }, dict(), "foo")
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"],
+          signature_def_map={"foo_key": foo_signature})
+
+  def _validate_outputs_tensor_info_fail(self, builder, tensor_info):
     with self.test_session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
@@ -119,6 +130,16 @@ class SavedModelTest(test.TestCase):
           sess, ["foo"],
           signature_def_map={"foo_key": foo_signature})
 
+  def _validate_outputs_tensor_info_accept(self, builder, tensor_info):
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      foo_signature = signature_def_utils.build_signature_def(
+          dict(), {"foo_outputs": tensor_info}, "foo")
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"],
+          signature_def_map={"foo_key": foo_signature})
+
   def testMaybeSavedModelDir(self):
     base_path = test.test_src_dir_path("/python/saved_model")
     self.assertFalse(loader.maybe_saved_model_directory(base_path))
@@ -538,23 +559,50 @@ class SavedModelTest(test.TestCase):
       self.assertEqual("bar", bar_signature["bar_key"].method_name)
       self.assertEqual("foo_new", bar_signature["foo_key"].method_name)
 
-  def testSignatureDefValidation(self):
-    export_dir = self._get_export_dir("test_signature_def_validation")
+  def testSignatureDefValidationFails(self):
+    export_dir = self._get_export_dir("test_signature_def_validation_fail")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    tensor_without_name = meta_graph_pb2.TensorInfo()
-    tensor_without_name.dtype = types_pb2.DT_FLOAT
-    self._validate_inputs_tensor_info(builder, tensor_without_name)
-    self._validate_outputs_tensor_info(builder, tensor_without_name)
+    tensor_without_encoding = meta_graph_pb2.TensorInfo()
+    tensor_without_encoding.dtype = types_pb2.DT_FLOAT
+    self._validate_inputs_tensor_info_fail(builder, tensor_without_encoding)
+    self._validate_outputs_tensor_info_fail(builder, tensor_without_encoding)
 
     tensor_without_dtype = meta_graph_pb2.TensorInfo()
     tensor_without_dtype.name = "x"
-    self._validate_inputs_tensor_info(builder, tensor_without_dtype)
-    self._validate_outputs_tensor_info(builder, tensor_without_dtype)
+    self._validate_inputs_tensor_info_fail(builder, tensor_without_dtype)
+    self._validate_outputs_tensor_info_fail(builder, tensor_without_dtype)
 
     tensor_empty = meta_graph_pb2.TensorInfo()
-    self._validate_inputs_tensor_info(builder, tensor_empty)
-    self._validate_outputs_tensor_info(builder, tensor_empty)
+    self._validate_inputs_tensor_info_fail(builder, tensor_empty)
+    self._validate_outputs_tensor_info_fail(builder, tensor_empty)
+
+  def testSignatureDefValidationSucceedsWithName(self):
+    tensor_with_name = meta_graph_pb2.TensorInfo()
+    tensor_with_name.name = "foo"
+    tensor_with_name.dtype = types_pb2.DT_FLOAT
+
+    export_dir = self._get_export_dir("test_signature_def_validation_name_1")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    self._validate_inputs_tensor_info_accept(builder, tensor_with_name)
+
+    export_dir = self._get_export_dir("test_signature_def_validation_name_2")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    self._validate_outputs_tensor_info_accept(builder, tensor_with_name)
+
+  def testSignatureDefValidationSucceedsWithCoo(self):
+    tensor_with_coo = meta_graph_pb2.TensorInfo()
+    # TODO(soergel) test validation of each of the fields of coo_sparse
+    tensor_with_coo.coo_sparse.values_tensor_name = "foo"
+    tensor_with_coo.dtype = types_pb2.DT_FLOAT
+
+    export_dir = self._get_export_dir("test_signature_def_validation_coo_1")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    self._validate_inputs_tensor_info_accept(builder, tensor_with_coo)
+
+    export_dir = self._get_export_dir("test_signature_def_validation_coo_2")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    self._validate_outputs_tensor_info_accept(builder, tensor_with_coo)
 
   def testAssets(self):
     export_dir = self._get_export_dir("test_assets")
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 6461fe8a7e7bef1a2fc787879da9e3324e2655c8..819f351291f2bbf24a433c1bc77578af594c050b 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -95,19 +94,3 @@ tf_export("saved_model.signature_constants.REGRESS_OUTPUTS").export_constant(
     __name__, "REGRESS_OUTPUTS")
 
 ################################################################################
-
-
-_allowed_symbols = [
-    "DEFAULT_SERVING_SIGNATURE_DEF_KEY",
-    "CLASSIFY_INPUTS",
-    "CLASSIFY_METHOD_NAME",
-    "CLASSIFY_OUTPUT_CLASSES",
-    "CLASSIFY_OUTPUT_SCORES",
-    "PREDICT_INPUTS",
-    "PREDICT_METHOD_NAME",
-    "PREDICT_OUTPUTS",
-    "REGRESS_INPUTS",
-    "REGRESS_METHOD_NAME",
-    "REGRESS_OUTPUTS",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index d164e2c23f24469d7536f87cb431afe618ddcc06..5a797da791c82d9c81107ba940aceea7849c0c46 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -40,11 +39,3 @@ tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
 # Tag for the `tpu` graph.
 TPU = "tpu"
 tf_export("saved_model.tag_constants.TPU").export_constant(__name__, "TPU")
-
-_allowed_symbols = [
-    "SERVING",
-    "TRAINING",
-    "GPU",
-    "TPU"
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/utils.py b/tensorflow/python/saved_model/utils.py
index 8e750d8708a36a9dd406a4703b3a79645fc04e8f..27c355490934e7d20ee72ae10eca9fdb8bbfca14 100644
--- a/tensorflow/python/saved_model/utils.py
+++ b/tensorflow/python/saved_model/utils.py
@@ -24,7 +24,3 @@ from __future__ import print_function
 from tensorflow.python.saved_model.utils_impl import build_tensor_info
 from tensorflow.python.saved_model.utils_impl import get_tensor_from_tensor_info
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["build_tensor_info", "get_tensor_from_tensor_info"]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index b80ad79074e85bdeae70148b2822c319c29468bc..1421d2772fe140dd5f207f159db0ab462231420d 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -16,21 +16,6 @@
 """Tensor summaries for exporting information about a model.
 
 See the @{$python/summary} guide.
-
-@@FileWriter
-@@FileWriterCache
-@@tensor_summary
-@@scalar
-@@histogram
-@@audio
-@@image
-@@text
-@@merge
-@@merge_all
-@@get_summary_description
-@@PluginAsset
-@@get_plugin_asset
-@@get_all_plugin_assets
 """
 
 from __future__ import absolute_import
@@ -48,10 +33,13 @@ from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.core.util.event_pb2 import TaggedRunMetadata
 # pylint: enable=unused-import
 
+
 from tensorflow.python.eager import context as _context
+from tensorflow.python.framework import constant_op as _constant_op
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.ops import gen_logging_ops as _gen_logging_ops
+from tensorflow.python.ops import gen_summary_ops as _gen_summary_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import summary_op_util as _summary_op_util
 
 # exports tensor-related summaries
@@ -71,7 +59,6 @@ from tensorflow.python.summary.writer.writer_cache import FileWriterCache
 # pylint: enable=unused-import
 
 from tensorflow.python.util import compat as _compat
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -96,10 +83,11 @@ def scalar(name, tensor, collections=None, family=None):
   Raises:
     ValueError: If tensor has the wrong shape or type.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
-    # pylint: disable=protected-access
-    val = _gen_logging_ops._scalar_summary(tags=tag, values=tensor, name=scope)
+    val = _gen_logging_ops.scalar_summary(tags=tag, values=tensor, name=scope)
     _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
   return val
 
@@ -150,10 +138,11 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
-    # pylint: disable=protected-access
-    val = _gen_logging_ops._image_summary(
+    val = _gen_logging_ops.image_summary(
         tag=tag, tensor=tensor, max_images=max_outputs, name=scope)
     _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
   return val
@@ -189,11 +178,12 @@ def histogram(name, values, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[values],
       default_name='HistogramSummary') as (tag, scope):
-    # pylint: disable=protected-access
-    val = _gen_logging_ops._histogram_summary(
+    val = _gen_logging_ops.histogram_summary(
         tag=tag, values=values, name=scope)
     _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
   return val
@@ -235,12 +225,13 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family=family, values=[tensor]) as (tag, scope):
-    # pylint: disable=protected-access
     sample_rate = _ops.convert_to_tensor(
         sample_rate, dtype=_dtypes.float32, name='sample_rate')
-    val = _gen_logging_ops._audio_summary_v2(
+    val = _gen_logging_ops.audio_summary_v2(
         tag=tag, tensor=tensor, max_outputs=max_outputs,
         sample_rate=sample_rate, name=scope)
     _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
@@ -280,14 +271,15 @@ def merge(inputs, collections=None, name=None):
   @end_compatbility
   """
   # pylint: enable=line-too-long
-  if _context.in_eager_mode():
+  if _context.executing_eagerly():
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   name = _summary_op_util.clean_tag(name)
   with _ops.name_scope(name, 'Merge', inputs):
-    # pylint: disable=protected-access
-    val = _gen_logging_ops._merge_summary(inputs=inputs, name=name)
+    val = _gen_logging_ops.merge_summary(inputs=inputs, name=name)
     _summary_op_util.collect(val, collections, [])
   return val
 
@@ -314,7 +306,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None):
   summaries under eager execution, use `tf.contrib.summary` instead.
   @end_compatbility
   """
-  if _context.in_eager_mode():
+  if _context.executing_eagerly():
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
@@ -353,10 +345,3 @@ def get_summary_description(node_def):
   summary_description = SummaryDescription()
   _json_format.Parse(description_str, summary_description)
   return summary_description
-
-
-_allowed_symbols = [
-    'Summary', 'SummaryDescription', 'Event', 'TaggedRunMetadata', 'SessionLog',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/summary/writer/event_file_writer_v2.py b/tensorflow/python/summary/writer/event_file_writer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c66c0f7a8fd4c65d3539d257b4e4fa89f839a98
--- /dev/null
+++ b/tensorflow/python/summary/writer/event_file_writer_v2.py
@@ -0,0 +1,140 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Writes events to disk in a logdir."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.platform import gfile
+
+
+class EventFileWriterV2(object):
+  """Writes `Event` protocol buffers to an event file via the graph.
+
+  The `EventFileWriterV2` class is backed by the summary file writer in the v2
+  summary API (currently in tf.contrib.summary), so it uses a shared summary
+  writer resource and graph ops to write events.
+
+  As with the original EventFileWriter, this class will asynchronously write
+  Event protocol buffers to the backing file. The Event file is encoded using
+  the tfrecord format, which is similar to RecordIO.
+  """
+
+  def __init__(self, session, logdir, max_queue=10, flush_secs=120,
+               filename_suffix=''):
+    """Creates an `EventFileWriterV2` and an event file to write to.
+
+    On construction, this calls `tf.contrib.summary.create_file_writer` within
+    the graph from `session.graph` to look up a shared summary writer resource
+    for `logdir` if one exists, and create one if not. Creating the summary
+    writer resource in turn creates a new event file in `logdir` to be filled
+    with `Event` protocol buffers passed to `add_event`. Graph ops to control
+    this writer resource are added to `session.graph` during this init call;
+    stateful methods on this class will call `session.run()` on these ops.
+
+    Note that because the underlying resource is shared, it is possible that
+    other parts of the code using the same session may interact independently
+    with the resource, e.g. by flushing or even closing it. It is the caller's
+    responsibility to avoid any undesirable sharing in this regard.
+
+    The remaining arguments to the constructor (`flush_secs`, `max_queue`, and
+    `filename_suffix`) control the construction of the shared writer resource
+    if one is created. If an existing resource is reused, these arguments have
+    no effect.  See `tf.contrib.summary.create_file_writer` for details.
+
+    Args:
+      session: A `tf.Session`. Session that will hold shared writer resource.
+        The writer ops will be added to session.graph during this init call.
+      logdir: A string. Directory where event file will be written.
+      max_queue: Integer. Size of the queue for pending events and summaries.
+      flush_secs: Number. How often, in seconds, to flush the
+        pending events and summaries to disk.
+      filename_suffix: A string. Every event file's name is suffixed with
+        `filename_suffix`.
+    """
+    self._session = session
+    self._logdir = logdir
+    self._closed = False
+    if not gfile.IsDirectory(self._logdir):
+      gfile.MakeDirs(self._logdir)
+
+    with self._session.graph.as_default():
+      with ops.name_scope('filewriter'):
+        file_writer = summary_ops_v2.create_file_writer(
+            logdir=self._logdir,
+            max_queue=max_queue,
+            flush_millis=flush_secs * 1000,
+            filename_suffix=filename_suffix)
+        with summary_ops_v2.always_record_summaries(), file_writer.as_default():
+          self._event_placeholder = array_ops.placeholder_with_default(
+              constant_op.constant('unused', dtypes.string),
+              shape=[])
+          self._add_event_op = summary_ops_v2.import_event(
+              self._event_placeholder)
+        self._init_op = file_writer.init()
+        self._flush_op = file_writer.flush()
+        self._close_op = file_writer.close()
+      self._session.run(self._init_op)
+
+  def get_logdir(self):
+    """Returns the directory where event file will be written."""
+    return self._logdir
+
+  def reopen(self):
+    """Reopens the EventFileWriter.
+
+    Can be called after `close()` to add more events in the same directory.
+    The events will go into a new events file.
+
+    Does nothing if the EventFileWriter was not closed.
+    """
+    if self._closed:
+      self._closed = False
+      self._session.run(self._init_op)
+
+  def add_event(self, event):
+    """Adds an event to the event file.
+
+    Args:
+      event: An `Event` protocol buffer.
+    """
+    if not self._closed:
+      event_pb = event.SerializeToString()
+      self._session.run(
+          self._add_event_op, feed_dict={self._event_placeholder: event_pb})
+
+  def flush(self):
+    """Flushes the event file to disk.
+
+    Call this method to make sure that all pending events have been written to
+    disk.
+    """
+    self._session.run(self._flush_op)
+
+  def close(self):
+    """Flushes the event file to disk and close the file.
+
+    Call this method when you do not need the summary writer anymore.
+    """
+    if not self._closed:
+      self.flush()
+      self._session.run(self._close_op)
+      self._closed = True
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 1f3f2287043c021d636113b5a8807c9f4adf77aa..aca084fc9168e710316e4c988594cff69e54ebab 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.summary.writer.event_file_writer_v2 import EventFileWriterV2
 from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
@@ -286,6 +287,11 @@ class FileWriter(SummaryToEventTransformer):
   file contents asynchronously. This allows a training program to call methods
   to add data to the file directly from the training loop, without slowing down
   training.
+
+  When constructed with a `tf.Session` parameter, a `FileWriter` instead forms
+  a compatibility layer over new graph-based summaries (`tf.contrib.summary`)
+  to facilitate the use of new summary writing with pre-existing code that
+  expects a `FileWriter` instance.
   """
 
   def __init__(self,
@@ -294,10 +300,11 @@ class FileWriter(SummaryToEventTransformer):
                max_queue=10,
                flush_secs=120,
                graph_def=None,
-               filename_suffix=None):
-    """Creates a `FileWriter` and an event file.
+               filename_suffix=None,
+               session=None):
+    """Creates a `FileWriter`, optionally shared within the given session.
 
-    On construction the summary writer creates a new event file in `logdir`.
+    Typically, constructing a file writer creates a new event file in `logdir`.
     This event file will contain `Event` protocol buffers constructed when you
     call one of the following functions: `add_summary()`, `add_session_log()`,
     `add_event()`, or `add_graph()`.
@@ -317,13 +324,16 @@ class FileWriter(SummaryToEventTransformer):
     writer = tf.summary.FileWriter(<some-directory>, sess.graph)
     ```
 
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
+    The `session` argument to the constructor makes the returned `FileWriter` a
+    a compatibility layer over new graph-based summaries (`tf.contrib.summary`).
+    Crucially, this means the underlying writer resource and events file will
+    be shared with any other `FileWriter` using the same `session` and `logdir`,
+    and with any `tf.contrib.summary.SummaryWriter` in this session using the
+    the same shared resource name (which by default scoped to the logdir). If
+    no such resource exists, one will be created using the remaining arguments
+    to this constructor, but if one already exists those arguments are ignored.
+    In either case, ops will be added to `session.graph` to control the
+    underlying file writer resource. See `tf.contrib.summary` for more details.
 
     Args:
       logdir: A string. Directory where event file will be written.
@@ -334,6 +344,7 @@ class FileWriter(SummaryToEventTransformer):
       graph_def: DEPRECATED: Use the `graph` argument instead.
       filename_suffix: A string. Every event file's name is suffixed with
         `suffix`.
+      session: A `tf.Session` object. See details above.
 
     Raises:
       RuntimeError: If called with eager execution enabled.
@@ -343,13 +354,16 @@ class FileWriter(SummaryToEventTransformer):
     summaries under eager execution, use `tf.contrib.summary` instead.
     @end_compatbility
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError(
           "tf.summary.FileWriter is not compatible with eager execution. "
           "Use tf.contrib.summary instead.")
-
-    event_writer = EventFileWriter(logdir, max_queue, flush_secs,
-                                   filename_suffix)
+    if session is not None:
+      event_writer = EventFileWriterV2(
+          session, logdir, max_queue, flush_secs, filename_suffix)
+    else:
+      event_writer = EventFileWriter(logdir, max_queue, flush_secs,
+                                     filename_suffix)
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
   def __enter__(self):
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 88ade0aac33f1cd8f9d8cb30344aabca76a13511..dc990c2602427049ecdb7588ff217207a69cbcd2 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -29,10 +29,12 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import plugin_asset
@@ -42,7 +44,10 @@ from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.util import compat
 
 
-class SummaryWriterTestCase(test.TestCase):
+class FileWriterTestCase(test.TestCase):
+
+  def _FileWriter(self, *args, **kwargs):
+    return writer.FileWriter(*args, **kwargs)
 
   def _TestDir(self, test_name):
     test_dir = os.path.join(self.get_temp_dir(), test_name)
@@ -96,7 +101,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testAddingSummaryGraphAndRunMetadata(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.add_summary(
@@ -171,7 +176,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_named_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, graph=g)
+    sw = self._FileWriter(test_dir, graph=g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -179,7 +184,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_positional_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, g)
+    sw = self._FileWriter(test_dir, g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -188,7 +193,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, graph_def=gd)
+    sw = self._FileWriter(test_dir, graph_def=gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -197,7 +202,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, gd)
+    sw = self._FileWriter(test_dir, gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -207,18 +212,18 @@ class SummaryWriterTestCase(test.TestCase):
       with ops.Graph().as_default() as g:
         constant_op.constant([12], name="douze")
       gd = g.as_graph_def()
-      sw = writer.FileWriter(test_dir, graph=g, graph_def=gd)
+      sw = self._FileWriter(test_dir, graph=g, graph_def=gd)
       sw.close()
 
   def testNeitherGraphNorGraphDef(self):
     with self.assertRaises(TypeError):
       test_dir = self._CleanTestDir("basics_string_instead_of_graph")
-      sw = writer.FileWriter(test_dir, "string instead of graph object")
+      sw = self._FileWriter(test_dir, "string instead of graph object")
       sw.close()
 
   def testCloseAndReopen(self):
     test_dir = self._CleanTestDir("close_and_reopen")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.close()
     # Sleep at least one second to make sure we get a new event file name.
@@ -261,7 +266,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testNonBlockingClose(self):
     test_dir = self._CleanTestDir("non_blocking_close")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     # Sleep 1.2 seconds to make sure event queue is empty.
     time.sleep(1.2)
     time_before_close = time.time()
@@ -270,7 +275,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testWithStatement(self):
     test_dir = self._CleanTestDir("with_statement")
-    with writer.FileWriter(test_dir) as sw:
+    with self._FileWriter(test_dir) as sw:
       sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     event_paths = sorted(glob.glob(os.path.join(test_dir, "event*")))
     self.assertEquals(1, len(event_paths))
@@ -280,7 +285,7 @@ class SummaryWriterTestCase(test.TestCase):
   # protocol buffers correctly.
   def testAddingSummariesFromSessionRunCalls(self):
     test_dir = self._CleanTestDir("global_step")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     with self.test_session():
       i = constant_op.constant(1, dtype=dtypes.int32, shape=[])
       l = constant_op.constant(2, dtype=dtypes.int64, shape=[])
@@ -327,7 +332,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testPluginMetadataStrippedFromSubsequentEvents(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
 
@@ -386,7 +391,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
-    sw = writer.FileWriter(test_dir, filename_suffix="_test_suffix")
+    sw = self._FileWriter(test_dir, filename_suffix="_test_suffix")
     for _ in range(10):
       sw.add_summary(
           summary_pb2.Summary(value=[
@@ -400,9 +405,178 @@ class SummaryWriterTestCase(test.TestCase):
     for filename in event_filenames:
       self.assertTrue(filename.endswith("_test_suffix"))
 
+  def testPluginAssetSerialized(self):
+    class ExamplePluginAsset(plugin_asset.PluginAsset):
+      plugin_name = "example"
+
+      def assets(self):
+        return {"foo.txt": "foo!", "bar.txt": "bar!"}
+
+    with ops.Graph().as_default() as g:
+      plugin_asset.get_plugin_asset(ExamplePluginAsset)
+
+      logdir = self.get_temp_dir()
+      fw = self._FileWriter(logdir)
+      fw.add_graph(g)
+    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
+
+    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "foo!")
+
+    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "bar!")
 
-class SummaryWriterCacheTest(test.TestCase):
-  """SummaryWriterCache tests."""
+
+class SessionBasedFileWriterTestCase(FileWriterTestCase):
+  """Tests for FileWriter behavior when passed a Session argument."""
+
+  def _FileWriter(self, *args, **kwargs):
+    if "session" not in kwargs:
+      # Pass in test_session() as the session. It will be cached during this
+      # test method invocation so that any other use of test_session() with no
+      # graph should result in re-using the same underlying Session.
+      with self.test_session() as sess:
+        kwargs["session"] = sess
+        return writer.FileWriter(*args, **kwargs)
+    return writer.FileWriter(*args, **kwargs)
+
+  def _createTaggedSummary(self, tag):
+    summary = summary_pb2.Summary()
+    summary.value.add(tag=tag)
+    return summary
+
+  def testSharing_withOtherSessionBasedFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # File writer, should share file with writer1
+      writer2 = writer.FileWriter(session=sess, logdir=logdir)
+      writer2.add_summary(self._createTaggedSummary("two"), 2)
+      writer2.flush()
+
+      # File writer with different logdir (shouldn't be in this logdir at all)
+      writer3 = writer.FileWriter(session=sess, logdir=logdir + "-other")
+      writer3.add_summary(self._createTaggedSummary("three"), 3)
+      writer3.flush()
+
+      # File writer in a different session (should be in separate file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer4.add_summary(self._createTaggedSummary("four"), 4)
+        writer4.flush()
+
+      # One more file writer, should share file with writer1
+      writer5 = writer.FileWriter(session=sess, logdir=logdir)
+      writer5.add_summary(self._createTaggedSummary("five"), 5)
+      writer5.flush()
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "four"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+    # Just check that the other logdir file exists to be sure we wrote it
+    self.assertTrue(glob.glob(os.path.join(logdir + "-other", "event*")))
+
+  def testSharing_withExplicitSummaryFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer via FileWriter(session=?)
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # Next one via create_file_writer(), should use same file
+      writer2 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer2.as_default():
+        summary2 = summary_ops_v2.scalar("two", 2.0, step=2)
+      sess.run(writer2.init())
+      sess.run(summary2)
+      sess.run(writer2.flush())
+
+      # Next has different shared name, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops_v2.create_file_writer(logdir=logdir, name="other")
+      with summary_ops_v2.always_record_summaries(), writer3.as_default():
+        summary3 = summary_ops_v2.scalar("three", 3.0, step=3)
+      sess.run(writer3.init())
+      sess.run(summary3)
+      sess.run(writer3.flush())
+
+      # Next uses a second session, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = summary_ops_v2.create_file_writer(logdir=logdir)
+        with summary_ops_v2.always_record_summaries(), writer4.as_default():
+          summary4 = summary_ops_v2.scalar("four", 4.0, step=4)
+        other_sess.run(writer4.init())
+        other_sess.run(summary4)
+        other_sess.run(writer4.flush())
+
+        # Next via FileWriter(session=?) uses same second session, should be in
+        # same separate file. (This checks sharing in the other direction)
+        writer5 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer5.add_summary(self._createTaggedSummary("five"), 5)
+        writer5.flush()
+
+      # One more via create_file_writer(), should use same file
+      writer6 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer6.as_default():
+        summary6 = summary_ops_v2.scalar("six", 6.0, step=6)
+      sess.run(writer6.init())
+      sess.run(summary6)
+      sess.run(writer6.flush())
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "six"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("six", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "three"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("three", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Third file should have "four" and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+
+class FileWriterCacheTest(test.TestCase):
+  """FileWriterCache tests."""
 
   def _test_dir(self, test_name):
     """Create an empty dir to use for tests.
@@ -448,32 +622,5 @@ class SummaryWriterCacheTest(test.TestCase):
       self.assertFalse(sw1 == sw2)
 
 
-class ExamplePluginAsset(plugin_asset.PluginAsset):
-  plugin_name = "example"
-
-  def assets(self):
-    return {"foo.txt": "foo!", "bar.txt": "bar!"}
-
-
-class PluginAssetsTest(test.TestCase):
-
-  def testPluginAssetSerialized(self):
-    with ops.Graph().as_default() as g:
-      plugin_asset.get_plugin_asset(ExamplePluginAsset)
-
-      logdir = self.get_temp_dir()
-      fw = writer.FileWriter(logdir)
-      fw.add_graph(g)
-    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
-
-    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "foo!")
-
-    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "bar!")
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index 82b908ac0e95643d1daf5ed062be44a58cfea97f..26e8acd8977734768accb1f9c7e37431c337ee34 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -25,6 +25,7 @@ limitations under the License.
 %include "tensorflow/python/util/tfprof.i"
 
 %include "tensorflow/python/lib/core/py_func.i"
+%include "tensorflow/python/lib/core/py_exception_registry.i"
 
 %include "tensorflow/python/lib/io/py_record_reader.i"
 %include "tensorflow/python/lib/io/py_record_writer.i"
@@ -54,4 +55,3 @@ limitations under the License.
 %include "tensorflow/python/grappler/tf_optimizer.i"
 %include "tensorflow/python/grappler/cost_analyzer.i"
 %include "tensorflow/python/grappler/model_analyzer.i"
-
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 63f16c53a29fd65c32077dd29e3b1823c11d457b..6c34b6aaf310c7b576e6ae259af90ef4c23a013a 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "tools_pip",
     deps = [
         ":freeze_graph",
+        ":import_pb_to_tensorboard",
         ":inspect_checkpoint",
         ":optimize_for_inference",
         ":print_selective_registration_header",
@@ -27,7 +28,7 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = ["//tensorflow/contrib/saved_model:reader"],
 )
 
 py_library(
@@ -37,11 +38,13 @@ py_library(
     deps = [
         ":saved_model_utils",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
+        "//tensorflow/python:no_contrib",  # TODO(b/34059704): remove when fixed
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
+        "//tensorflow/python/saved_model:loader",
         "@six_archive//:six",
     ],
 )
@@ -51,14 +54,7 @@ py_binary(
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":saved_model_utils",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
-        "@six_archive//:six",
+        ":freeze_graph_lib",
     ],
 )
 
@@ -248,23 +244,12 @@ py_test(
         "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     ],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "no-internal-py3",
+    ],
     deps = [
         ":saved_model_cli",
         "//tensorflow/core:protos_all_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index f7a578e52b00b6d97b3b314c7a2a08d9071c8f73..e9f1def48c462dcd8a5acf0e3d29d562cd1b3d58 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -56,8 +56,6 @@ from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.tools import saved_model_utils
 from tensorflow.python.training import saver as saver_lib
 
-FLAGS = None
-
 
 def freeze_graph_with_def_protos(input_graph_def,
                                  input_saver_def,
@@ -109,7 +107,7 @@ def freeze_graph_with_def_protos(input_graph_def,
           input_meta_graph_def, clear_devices=True)
       restorer.restore(sess, input_checkpoint)
       if initializer_nodes:
-        sess.run(initializer_nodes.replace(' ', '').split(","))
+        sess.run(initializer_nodes.replace(" ", "").split(","))
     elif input_saved_model_dir:
       if saved_model_tags is None:
         saved_model_tags = []
@@ -130,27 +128,27 @@ def freeze_graph_with_def_protos(input_graph_def,
           var_list=var_list, write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
       if initializer_nodes:
-        sess.run(initializer_nodes.replace(' ', '').split(","))
+        sess.run(initializer_nodes.replace(" ", "").split(","))
 
     variable_names_whitelist = (
-        variable_names_whitelist.replace(' ', '').split(",")
+        variable_names_whitelist.replace(" ", "").split(",")
         if variable_names_whitelist else None)
     variable_names_blacklist = (
-        variable_names_blacklist.replace(' ', '').split(",")
+        variable_names_blacklist.replace(" ", "").split(",")
         if variable_names_blacklist else None)
 
     if input_meta_graph_def:
       output_graph_def = graph_util.convert_variables_to_constants(
           sess,
           input_meta_graph_def.graph_def,
-          output_node_names.replace(' ', '').split(","),
+          output_node_names.replace(" ", "").split(","),
           variable_names_whitelist=variable_names_whitelist,
           variable_names_blacklist=variable_names_blacklist)
     else:
       output_graph_def = graph_util.convert_variables_to_constants(
           sess,
           input_graph_def,
-          output_node_names.replace(' ', '').split(","),
+          output_node_names.replace(" ", "").split(","),
           variable_names_whitelist=variable_names_whitelist,
           variable_names_blacklist=variable_names_blacklist)
 
@@ -252,29 +250,28 @@ def freeze_graph(input_graph,
       variable_names_blacklist,
       input_meta_graph_def,
       input_saved_model_dir,
-      saved_model_tags.replace(' ', '').split(","),
+      saved_model_tags.replace(" ", "").split(","),
       checkpoint_version=checkpoint_version)
 
 
-def main(unused_args):
-  if FLAGS.checkpoint_version == 1:
+def main(unused_args, flags):
+  if flags.checkpoint_version == 1:
     checkpoint_version = saver_pb2.SaverDef.V1
-  elif FLAGS.checkpoint_version == 2:
+  elif flags.checkpoint_version == 2:
     checkpoint_version = saver_pb2.SaverDef.V2
   else:
     print("Invalid checkpoint version (must be '1' or '2'): %d" %
-          FLAGS.checkpoint_version)
+          flags.checkpoint_version)
     return -1
-  freeze_graph(FLAGS.input_graph, FLAGS.input_saver, FLAGS.input_binary,
-               FLAGS.input_checkpoint, FLAGS.output_node_names,
-               FLAGS.restore_op_name, FLAGS.filename_tensor_name,
-               FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
-               FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
-               FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags, checkpoint_version)
-
+  freeze_graph(flags.input_graph, flags.input_saver, flags.input_binary,
+               flags.input_checkpoint, flags.output_node_names,
+               flags.restore_op_name, flags.filename_tensor_name,
+               flags.output_graph, flags.clear_devices, flags.initializer_nodes,
+               flags.variable_names_whitelist, flags.variable_names_blacklist,
+               flags.input_meta_graph, flags.input_saved_model_dir,
+               flags.saved_model_tags, checkpoint_version)
 
-if __name__ == "__main__":
+def run_main():
   parser = argparse.ArgumentParser()
   parser.register("type", "bool", lambda v: v.lower() == "true")
   parser.add_argument(
@@ -376,5 +373,10 @@ if __name__ == "__main__":
       separated by \',\'. For tag-set contains multiple tags, all tags \
       must be passed in.\
       """)
-  FLAGS, unparsed = parser.parse_known_args()
-  app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  flags, unparsed = parser.parse_known_args()
+
+  my_main = lambda unused_args: main(unused_args, flags)
+  app.run(main=my_main, argv=[sys.argv[0]] + unparsed)
+
+if __name__ == '__main__':
+  run_main()
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index dd876cbe7fcd64a8de70eb28f67996df9de1dd7d..6504fbc10755c5c543016b8d56d6d53f3311b249 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -30,7 +30,7 @@ FLAGS = None
 
 
 def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
-                                     all_tensor_names):
+                                     all_tensor_names=False):
   """Prints tensors in a checkpoint file.
 
   If no `tensor_name` is provided, prints the tensor names and shapes
@@ -139,7 +139,7 @@ if __name__ == "__main__":
       const=True,
       type="bool",
       default=False,
-      help="If True, print the values of all the tensors.")
+      help="If True, print the names and values of all the tensors.")
   parser.add_argument(
       "--all_tensor_names",
       nargs="?",
diff --git a/tensorflow/python/tools/optimize_for_inference.py b/tensorflow/python/tools/optimize_for_inference.py
index 902748d55efedf2166cbeb0d8e0fcff0d18ed152..dac6a06a89c7596dd66d0ed7a2e5a59a0ba9b9dd 100644
--- a/tensorflow/python/tools/optimize_for_inference.py
+++ b/tensorflow/python/tools/optimize_for_inference.py
@@ -87,7 +87,9 @@ def main(unused_args):
   output_graph_def = optimize_for_inference_lib.optimize_for_inference(
       input_graph_def,
       FLAGS.input_names.split(","),
-      FLAGS.output_names.split(","), FLAGS.placeholder_type_enum)
+      FLAGS.output_names.split(","),
+      FLAGS.placeholder_type_enum,
+      FLAGS.toco_compatible)
 
   if FLAGS.frozen_graph:
     f = gfile.FastGFile(FLAGS.output, "w")
@@ -138,6 +140,14 @@ def parse_args():
       type=int,
       default=dtypes.float32.as_datatype_enum,
       help="The AttrValue enum to use for placeholders.")
+  parser.add_argument(
+      "--toco_compatible",
+      type=bool,
+      default=False,
+      help="""\
+      If true, only use ops compatible with Tensorflow
+      Lite Optimizing Converter.\
+      """)
   return parser.parse_known_args()
 
 
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index 9c1927122252f45ddfa8092045c7589fa0f45532..bb90d1cd6e33aacf4bb7498fb9c9e7ecfb447c04 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -87,7 +87,7 @@ EPSILON_ATTR = {
 
 
 def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
-                           placeholder_type_enum):
+                           placeholder_type_enum, toco_compatible=False):
   """Applies a series of inference optimizations on the input graph.
 
   Args:
@@ -98,6 +98,8 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
       results.
     placeholder_type_enum: The AttrValue enum for the placeholder data type, or
         a list that specifies one value per input node name.
+    toco_compatible: Boolean, if True, only runs optimizations that result in
+      TOCO compatible graph operations (default=False).
 
   Returns:
     An optimized version of the input graph.
@@ -110,8 +112,9 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
   optimized_graph_def = graph_util.remove_training_nodes(
       optimized_graph_def, output_node_names)
   optimized_graph_def = fold_batch_norms(optimized_graph_def)
-  optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
-                                             output_node_names)
+  if not toco_compatible:
+    optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
+                                               output_node_names)
   ensure_graph_is_valid(optimized_graph_def)
   return optimized_graph_def
 
diff --git a/tensorflow/python/tools/print_selective_registration_header_test.py b/tensorflow/python/tools/print_selective_registration_header_test.py
index 36978b0860a423569149cd0572629f9f1f280637..4b3d98242caf683693430f08bd8cb74483f4bc74 100644
--- a/tensorflow/python/tools/print_selective_registration_header_test.py
+++ b/tensorflow/python/tools/print_selective_registration_header_test.py
@@ -24,6 +24,7 @@ import sys
 from google.protobuf import text_format
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.tools import selective_registration_header_lib
@@ -93,11 +94,16 @@ class PrintOpFilegroupTest(test.TestCase):
 
     ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels(
         'rawproto', self.WriteGraphFiles(graphs), default_ops)
+    matmul_prefix = ''
+    if test_util.IsMklEnabled():
+      matmul_prefix = 'Mkl'
+
     self.assertListEqual(
         [
             ('BiasAdd', 'BiasOp<CPUDevice, float>'),  #
-            ('MatMul', 'MatMulOp<CPUDevice, double, false >'),  #
-            ('MatMul', 'MatMulOp<CPUDevice, float, false >'),  #
+            ('MatMul',
+             matmul_prefix + 'MatMulOp<CPUDevice, double, false >'),  #
+            ('MatMul', matmul_prefix + 'MatMulOp<CPUDevice, float, false >'),  #
             ('NoOp', 'NoOp'),  #
             ('Reshape', 'ReshapeOp'),  #
             ('_Recv', 'RecvOp'),  #
@@ -112,8 +118,9 @@ class PrintOpFilegroupTest(test.TestCase):
     self.assertListEqual(
         [
             ('BiasAdd', 'BiasOp<CPUDevice, float>'),  #
-            ('MatMul', 'MatMulOp<CPUDevice, double, false >'),  #
-            ('MatMul', 'MatMulOp<CPUDevice, float, false >'),  #
+            ('MatMul',
+             matmul_prefix + 'MatMulOp<CPUDevice, double, false >'),  #
+            ('MatMul', matmul_prefix + 'MatMulOp<CPUDevice, float, false >'),  #
             ('NoOp', 'NoOp'),  #
             ('Reshape', 'ReshapeOp'),  #
             ('_Recv', 'RecvOp'),  #
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 33f6debbcbecb652774c776be54323bbaa824822..5b9d25d449d43d8420e0f30fa8b907d41171d5e5 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -38,11 +38,16 @@ from tensorflow.core.example import example_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.wrappers import local_cli_wrapper
+from tensorflow.python.framework import meta_graph as meta_graph_lib
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.platform import app  # pylint: disable=unused-import
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import loader
 from tensorflow.python.tools import saved_model_utils
 
+# Set of ops to blacklist.
+_OP_BLACKLIST = set(['WriteFile', 'ReadFile'])
+
 
 def _show_tag_sets(saved_model_dir):
   """Prints the tag-sets stored in SavedModel directory.
@@ -115,7 +120,7 @@ def _get_outputs_tensor_info_from_meta_graph_def(meta_graph_def,
                                                       signature_def_key).outputs
 
 
-def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
+def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key, indent=0):
   """Prints input and output TensorInfos.
 
   Prints the details of input and output TensorInfos for the SignatureDef mapped
@@ -126,6 +131,7 @@ def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
     tag_set: Group of tag(s) of the MetaGraphDef, in string format, separated by
         ','. For tag-set contains multiple tags, all tags must be passed in.
     signature_def_key: A SignatureDef key string.
+    indent: How far (in increments of 2 spaces) to indent each line of output.
   """
   meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir,
                                                         tag_set)
@@ -134,29 +140,39 @@ def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
   outputs_tensor_info = _get_outputs_tensor_info_from_meta_graph_def(
       meta_graph_def, signature_def_key)
 
-  print('The given SavedModel SignatureDef contains the following input(s):')
+  indent_str = "  " * indent
+  def in_print(s):
+    print(indent_str + s)
+
+  in_print('The given SavedModel SignatureDef contains the following input(s):')
   for input_key, input_tensor in sorted(inputs_tensor_info.items()):
-    print('inputs[\'%s\'] tensor_info:' % input_key)
-    _print_tensor_info(input_tensor)
+    in_print('  inputs[\'%s\'] tensor_info:' % input_key)
+    _print_tensor_info(input_tensor, indent+1)
 
-  print('The given SavedModel SignatureDef contains the following output(s):')
+  in_print('The given SavedModel SignatureDef contains the following '
+           'output(s):')
   for output_key, output_tensor in sorted(outputs_tensor_info.items()):
-    print('outputs[\'%s\'] tensor_info:' % output_key)
-    _print_tensor_info(output_tensor)
+    in_print('  outputs[\'%s\'] tensor_info:' % output_key)
+    _print_tensor_info(output_tensor, indent+1)
 
-  print('Method name is: %s' %
-        meta_graph_def.signature_def[signature_def_key].method_name)
+  in_print('Method name is: %s' %
+           meta_graph_def.signature_def[signature_def_key].method_name)
 
 
-def _print_tensor_info(tensor_info):
+def _print_tensor_info(tensor_info, indent=0):
   """Prints details of the given tensor_info.
 
   Args:
     tensor_info: TensorInfo object to be printed.
+    indent: How far (in increments of 2 spaces) to indent each line output
   """
-  print('    dtype: ' +
-        {value: key
-         for (key, value) in types_pb2.DataType.items()}[tensor_info.dtype])
+  indent_str = "  " * indent
+  def in_print(s):
+    print(indent_str + s)
+
+  in_print('    dtype: ' +
+           {value: key
+            for (key, value) in types_pb2.DataType.items()}[tensor_info.dtype])
   # Display shape as tuple.
   if tensor_info.tensor_shape.unknown_rank:
     shape = 'unknown_rank'
@@ -164,8 +180,8 @@ def _print_tensor_info(tensor_info):
     dims = [str(dim.size) for dim in tensor_info.tensor_shape.dim]
     shape = ', '.join(dims)
     shape = '(' + shape + ')'
-  print('    shape: ' + shape)
-  print('    name: ' + tensor_info.name)
+  in_print('    shape: ' + shape)
+  in_print('    name: ' + tensor_info.name)
 
 
 def _show_all(saved_model_dir):
@@ -179,14 +195,15 @@ def _show_all(saved_model_dir):
   """
   tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
   for tag_set in sorted(tag_sets):
-    tag_set = ', '.join(tag_set)
-    print('\nMetaGraphDef with tag-set: \'' + tag_set +
-          '\' contains the following SignatureDefs:')
+    print("\nMetaGraphDef with tag-set: '%s' "
+          "contains the following SignatureDefs:" % ', '.join(tag_set))
 
+    tag_set = ','.join(tag_set)
     signature_def_map = get_signature_def_map(saved_model_dir, tag_set)
     for signature_def_key in sorted(signature_def_map.keys()):
       print('\nsignature_def[\'' + signature_def_key + '\']:')
-      _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key)
+      _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key,
+                           indent=1)
 
 
 def get_meta_graph_def(saved_model_dir, tag_set):
@@ -230,6 +247,27 @@ def get_signature_def_map(saved_model_dir, tag_set):
   return meta_graph.signature_def
 
 
+def scan_meta_graph_def(meta_graph_def):
+  """Scans meta_graph_def and reports if there are ops on blacklist.
+
+  Print ops if they are on black list, or print success if no blacklisted ops
+  found.
+
+  Args:
+    meta_graph_def: MetaGraphDef protocol buffer.
+  """
+  all_ops_set = set(
+      meta_graph_lib.ops_used_by_graph_def(meta_graph_def.graph_def))
+  blacklisted_ops = _OP_BLACKLIST & all_ops_set
+  if blacklisted_ops:
+    # TODO(yifeif): print more warnings
+    print('MetaGraph with tag set %s contains the following blacklisted ops:' %
+          meta_graph_def.meta_info_def.tags, blacklisted_ops)
+  else:
+    print('MetaGraph with tag set %s does not contain blacklisted ops.' %
+          meta_graph_def.meta_info_def.tags)
+
+
 def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
                                    input_tensor_key_feed_dict, outdir,
                                    overwrite_flag, tf_debug=False):
@@ -506,7 +544,7 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str,
   input_examples = preprocess_input_examples_arg_string(input_examples_str)
 
   for input_tensor_key, (filename, variable_name) in inputs.items():
-    data = np.load(filename)
+    data = np.load(file_io.FileIO(filename, mode='r'))
 
     # When a variable_name key is specified for the input file
     if variable_name:
@@ -597,6 +635,21 @@ def run(args):
                                  args.overwrite, tf_debug=args.tf_debug)
 
 
+def scan(args):
+  """Function triggered by scan command.
+
+  Args:
+    args: A namespace parsed from command line.
+  """
+  if args.tag_set:
+    scan_meta_graph_def(
+        saved_model_utils.get_meta_graph_def(args.dir, args.tag_set))
+  else:
+    saved_model = reader.read_saved_model(args.dir)
+    for meta_graph_def in saved_model.meta_graphs:
+      scan_meta_graph_def(meta_graph_def)
+
+
 def create_parser():
   """Creates a parser that parse the command line arguments.
 
@@ -614,19 +667,19 @@ def create_parser():
   show_msg = (
       'Usage examples:\n'
       'To show all tag-sets in a SavedModel:\n'
-      '$saved_model_cli show --dir /tmp/saved_model\n'
+      '$saved_model_cli show --dir /tmp/saved_model\n\n'
       'To show all available SignatureDef keys in a '
       'MetaGraphDef specified by its tag-set:\n'
-      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n'
+      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n\n'
       'For a MetaGraphDef with multiple tags in the tag-set, all tags must be '
       'passed in, separated by \';\':\n'
       '$saved_model_cli show --dir /tmp/saved_model --tag_set serve,gpu\n\n'
       'To show all inputs and outputs TensorInfo for a specific'
       ' SignatureDef specified by the SignatureDef key in a'
       ' MetaGraph.\n'
-      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve '
-      '--signature_def serving_default\n\n'
-      'To show all available information in the SavedModel\n:'
+      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
+      ' --signature_def serving_default\n\n'
+      'To show all available information in the SavedModel:\n'
       '$saved_model_cli show --dir /tmp/saved_model --all')
   parser_show = subparsers.add_parser(
       'show',
@@ -658,12 +711,14 @@ def create_parser():
   run_msg = ('Usage example:\n'
              'To run input tensors from files through a MetaGraphDef and save'
              ' the output tensors to files:\n'
-             '$saved_model_cli show --dir /tmp/saved_model --tag_set serve '
-             '--signature_def serving_default '
-             '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy '
-             '--input_exprs \'input3_key=np.ones(2)\' --input_examples '
-             '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' '
-             '--outdir=/out\n\n'
+             '$saved_model_cli show --dir /tmp/saved_model --tag_set serve \\\n'
+             '   --signature_def serving_default \\\n'
+             '   --inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy '
+             '\\\n'
+             '   --input_exprs \'input3_key=np.ones(2)\' \\\n'
+             '   --input_examples '
+             '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' \\\n'
+             '   --outdir=/out\n\n'
              'For more information about input file format, please see:\n'
              'https://www.tensorflow.org/programmers_guide/saved_model_cli\n')
   parser_run = subparsers.add_parser(
@@ -716,6 +771,26 @@ def create_parser():
            'SavedModel.')
   parser_run.set_defaults(func=run)
 
+  # scan command
+  scan_msg = ('Usage example:\n'
+              'To scan for blacklisted ops in SavedModel:\n'
+              '$saved_model_cli scan --dir /tmp/saved_model\n'
+              'To scan a specific MetaGraph, pass in --tag_set\n')
+  parser_scan = subparsers.add_parser(
+      'scan',
+      description=scan_msg,
+      formatter_class=argparse.RawTextHelpFormatter)
+  parser_scan.add_argument(
+      '--dir',
+      type=str,
+      required=True,
+      help='directory containing the SavedModel to execute')
+  parser_scan.add_argument(
+      '--tag_set',
+      type=str,
+      help='tag-set of graph in SavedModel to scan, separated by \',\'')
+  parser_scan.set_defaults(func=scan)
+
   return parser
 
 
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
index d6cbc49ba1e08a6b808b228fb8d69fc14f36e3d2..eedc893a38d3d0857dd49c7ce03f3921da48fdbd 100644
--- a/tensorflow/python/tools/saved_model_cli_test.py
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -61,83 +61,84 @@ class SavedModelCLITestCase(test.TestCase):
     exp_out = """MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
 
 signature_def['classify_x2_to_y3']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: x2:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['scores'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y3:0
-Method name is: tensorflow/serving/classify
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['inputs'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: x2:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['scores'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y3:0
+  Method name is: tensorflow/serving/classify
 
 signature_def['classify_x_to_y']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
-    dtype: DT_STRING
-    shape: unknown_rank
-    name: tf_example:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['scores'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y:0
-Method name is: tensorflow/serving/classify
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['inputs'] tensor_info:
+        dtype: DT_STRING
+        shape: unknown_rank
+        name: tf_example:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['scores'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y:0
+  Method name is: tensorflow/serving/classify
 
 signature_def['regress_x2_to_y3']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: x2:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['outputs'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y3:0
-Method name is: tensorflow/serving/regress
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['inputs'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: x2:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['outputs'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y3:0
+  Method name is: tensorflow/serving/regress
 
 signature_def['regress_x_to_y']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
-    dtype: DT_STRING
-    shape: unknown_rank
-    name: tf_example:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['outputs'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y:0
-Method name is: tensorflow/serving/regress
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['inputs'] tensor_info:
+        dtype: DT_STRING
+        shape: unknown_rank
+        name: tf_example:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['outputs'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y:0
+  Method name is: tensorflow/serving/regress
 
 signature_def['regress_x_to_y2']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['inputs'] tensor_info:
-    dtype: DT_STRING
-    shape: unknown_rank
-    name: tf_example:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['outputs'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y2:0
-Method name is: tensorflow/serving/regress
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['inputs'] tensor_info:
+        dtype: DT_STRING
+        shape: unknown_rank
+        name: tf_example:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['outputs'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y2:0
+  Method name is: tensorflow/serving/regress
 
 signature_def['serving_default']:
-The given SavedModel SignatureDef contains the following input(s):
-inputs['x'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: x:0
-The given SavedModel SignatureDef contains the following output(s):
-outputs['y'] tensor_info:
-    dtype: DT_FLOAT
-    shape: (-1, 1)
-    name: y:0
-Method name is: tensorflow/serving/predict"""
+  The given SavedModel SignatureDef contains the following input(s):
+    inputs['x'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: x:0
+  The given SavedModel SignatureDef contains the following output(s):
+    outputs['y'] tensor_info:
+        dtype: DT_FLOAT
+        shape: (-1, 1)
+        name: y:0
+  Method name is: tensorflow/serving/predict"""
     # pylint: enable=line-too-long
+    self.maxDiff = None # Produce a useful error msg if the comparison fails
     self.assertMultiLineEqual(output, exp_out)
     self.assertEqual(err.getvalue().strip(), '')
 
@@ -193,11 +194,11 @@ Method name is: tensorflow/serving/predict"""
     output = out.getvalue().strip()
     expected_output = (
         'The given SavedModel SignatureDef contains the following input(s):\n'
-        'inputs[\'x\'] tensor_info:\n'
-        '    dtype: DT_FLOAT\n    shape: (-1, 1)\n    name: x:0\n'
+        '  inputs[\'x\'] tensor_info:\n'
+        '      dtype: DT_FLOAT\n      shape: (-1, 1)\n      name: x:0\n'
         'The given SavedModel SignatureDef contains the following output(s):\n'
-        'outputs[\'y\'] tensor_info:\n'
-        '    dtype: DT_FLOAT\n    shape: (-1, 1)\n    name: y:0\n'
+        '  outputs[\'y\'] tensor_info:\n'
+        '      dtype: DT_FLOAT\n      shape: (-1, 1)\n      name: y:0\n'
         'Method name is: tensorflow/serving/predict')
     self.assertEqual(output, expected_output)
     self.assertEqual(err.getvalue().strip(), '')
@@ -524,6 +525,28 @@ Method name is: tensorflow/serving/predict"""
     y_expected = np.array([[2.5], [3.0]])
     self.assertAllClose(y_expected, y_actual)
 
+  def testScanCommand(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    args = self.parser.parse_args(['scan', '--dir', base_path])
+    with captured_output() as (out, _):
+      saved_model_cli.scan(args)
+    output = out.getvalue().strip()
+    self.assertTrue('does not contain blacklisted ops' in output)
+
+  def testScanCommandFoundBlacklistedOp(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    args = self.parser.parse_args(
+        ['scan', '--dir', base_path, '--tag_set', 'serve'])
+    op_blacklist = saved_model_cli._OP_BLACKLIST
+    saved_model_cli._OP_BLACKLIST = set(['VariableV2'])
+    with captured_output() as (out, _):
+      saved_model_cli.scan(args)
+    saved_model_cli._OP_BLACKLIST = op_blacklist
+    output = out.getvalue().strip()
+    self.assertTrue('\'VariableV2\'' in output)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index c92f6fc3015960a2b821651231bb94713e0d53dd..6fa3ff66583ce07a6ee7b0d8158c851ea578637c 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,23 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    ```
-    m_0 <- 0 (Initialize initial 1st moment vector)
-    v_0 <- 0 (Initialize initial 2nd moment vector)
-    t <- 0 (Initialize timestep)
-    ```
+    $$m_0 := 0 (Initialize initial 1st moment vector)$$
+    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
+    $$t := 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    ```
-    t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    $$t := t + 1$$
+    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
-    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
-    ```
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
@@ -106,10 +102,10 @@ class AdamOptimizer(optimizer.Optimizer):
     self._updated_lr = None
 
   def _get_beta_accumulators(self):
-    if context.in_graph_mode():
-      graph = ops.get_default_graph()
-    else:
+    if context.executing_eagerly():
       graph = None
+    else:
+      graph = ops.get_default_graph()
     return (self._get_non_slot_variable("beta1_power", graph=graph),
             self._get_non_slot_variable("beta2_power", graph=graph))
 
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index a521f1299e035424d1c3897a469655db732b0dcd..9be8b6aafefa33977511cde24dd2e87dd6c3b81a 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -184,7 +184,7 @@ class AdamOptimizerTest(test.TestCase):
           # Shouldn't return non-slot variables from other graphs.
           self.assertEqual(0, len(opt.variables()))
 
-        if context.in_graph_mode():
+        if not context.executing_eagerly():
           self.evaluate(variables.global_variables_initializer())
           # Fetch params to validate initial values
           self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -194,7 +194,7 @@ class AdamOptimizerTest(test.TestCase):
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          if context.in_graph_mode():
+          if not context.executing_eagerly():
             self.evaluate(update)
           elif t > 1:
             opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
@@ -319,6 +319,15 @@ class AdamOptimizerTest(test.TestCase):
         # fails.
         optimizer.apply_gradients([(grads0, var0)])
 
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.AdamOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(6, len(set(opt.variables())))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index aae757b99aa9abb2fca112dcc781fc31e367649d..d1cc7d8ce33ac618ddf07d10f4d65cef8bc3dbc9 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -12,18 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Some common SessionRunHook classes.
-
-@@LoggingTensorHook
-@@StopAtStepHook
-@@CheckpointSaverHook
-@@StepCounterHook
-@@NanLossDuringTrainingError
-@@NanTensorHook
-@@SummarySaverHook
-@@GlobalStepWaiterHook
-@@ProfilerHook
-"""
+"""Some common SessionRunHook classes."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -391,7 +380,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
                saver=None,
                checkpoint_basename="model.ckpt",
                scaffold=None,
-               listeners=None):
+               listeners=None,
+               steps_per_run=1):
     """Initializes a `CheckpointSaverHook`.
 
     Args:
@@ -404,6 +394,9 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
       listeners: List of `CheckpointSaverListener` subclass instances.
         Used for callbacks that run immediately before or after this hook saves
         the checkpoint.
+      steps_per_run: `int`, number of steps that occur between each invocation
+        of the hook. Primarily used for TPU workloads which run multiple steps
+        in a while loop in a single Session.run.
 
     Raises:
       ValueError: One of `save_steps` or `save_secs` should be set.
@@ -419,6 +412,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)
     self._listeners = listeners or []
+    self._steps_per_run = steps_per_run
 
   def begin(self):
     self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
@@ -429,28 +423,33 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     for l in self._listeners:
       l.begin()
 
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    if self._timer.last_triggered_step() is None:
-      # We do write graph and saver_def at the first call of before_run.
-      # We cannot do this in begin, since we let other hooks to change graph and
-      # add variables in begin. Graph is finalized after all begin calls.
-      training_util.write_graph(
-          ops.get_default_graph().as_graph_def(add_shapes=True),
-          self._checkpoint_dir,
-          "graph.pbtxt")
-      saver_def = self._get_saver().saver_def if self._get_saver() else None
-      graph = ops.get_default_graph()
-      meta_graph_def = meta_graph.create_meta_graph_def(
-          graph_def=graph.as_graph_def(add_shapes=True),
-          saver_def=saver_def)
-      self._summary_writer.add_graph(graph)
-      self._summary_writer.add_meta_graph(meta_graph_def)
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    # We do write graph and saver_def at the first call of before_run.
+    # We cannot do this in begin, since we let other hooks to change graph and
+    # add variables in begin. Graph is finalized after all begin calls.
+    training_util.write_graph(
+        ops.get_default_graph().as_graph_def(add_shapes=True),
+        self._checkpoint_dir,
+        "graph.pbtxt")
+    saver_def = self._get_saver().saver_def if self._get_saver() else None
+    graph = ops.get_default_graph()
+    meta_graph_def = meta_graph.create_meta_graph_def(
+        graph_def=graph.as_graph_def(add_shapes=True),
+        saver_def=saver_def)
+    self._summary_writer.add_graph(graph)
+    self._summary_writer.add_meta_graph(meta_graph_def)
+    # The checkpoint saved here is the state at step "global_step".
+    self._save(session, global_step)
+    self._timer.update_last_triggered_step(global_step)
 
+  def before_run(self, run_context):  # pylint: disable=unused-argument
     return SessionRunArgs(self._global_step_tensor)
 
   def after_run(self, run_context, run_values):
     stale_global_step = run_values.results
-    if self._timer.should_trigger_for_step(stale_global_step+1):
+    if self._timer.should_trigger_for_step(
+        stale_global_step + self._steps_per_run):
       # get the real value after train op.
       global_step = run_context.session.run(self._global_step_tensor)
       if self._timer.should_trigger_for_step(global_step):
@@ -859,6 +858,7 @@ class ProfilerHook(session_run_hook.SessionRunHook):
           showing the sizes and lifetimes of tensors.
     """
     self._output_file = os.path.join(output_dir, "timeline-{}.json")
+    self._file_writer = SummaryWriterCache.get(output_dir)
     self._show_dataflow = show_dataflow
     self._show_memory = show_memory
     self._timer = SecondOrStepTimer(
@@ -889,6 +889,8 @@ class ProfilerHook(session_run_hook.SessionRunHook):
       self._save(global_step,
                  self._output_file.format(global_step),
                  run_values.run_metadata.step_stats)
+      self._file_writer.add_run_metadata(run_values.run_metadata,
+                                         "step_%d" % global_step)
 
     self._next_step = global_step + 1
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2547661e5250e94136a100aa8c30c9dbb7455018..31898562f81fac9f9270b74d05e76d83b1ad49da 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -466,8 +466,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener_counts)
 
@@ -490,8 +490,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener_counts)
 
@@ -523,8 +523,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener1_counts)
     self.assertEqual(listener1_counts, listener2_counts)
@@ -706,6 +706,7 @@ class CheckpointSaverHookTest(test.TestCase):
       with session_lib.Session() as sess:
         sess.run(self.scaffold.init_op)
         mon_sess = monitored_session._HookedSession(sess, [hook])
+        hook.after_create_session(sess, None)
         mon_sess.run(self.train_op)
       summary_writer.assert_summaries(
           test_case=self,
@@ -718,6 +719,124 @@ class CheckpointSaverHookTest(test.TestCase):
 
     fake_summary_writer.FakeSummaryWriter.uninstall()
 
+  def test_save_checkpoint_before_first_train_step(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir, save_steps=2, scaffold=self.scaffold)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        sess.run(self.scaffold.init_op)
+        hook.after_create_session(sess, None)
+        # Verifies that checkpoint is saved at step 0.
+        self.assertEqual(0,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+        # Verifies that no checkpoint is saved after one training step.
+        mon_sess.run(self.train_op)
+        self.assertEqual(0,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+        # Verifies that checkpoint is saved after save_steps.
+        mon_sess.run(self.train_op)
+        self.assertEqual(2,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+
+class CheckpointSaverHookMultiStepTest(test.TestCase):
+
+  def setUp(self):
+    self.model_dir = tempfile.mkdtemp()
+    self.graph = ops.Graph()
+    self.steps_per_run = 5
+    with self.graph.as_default():
+      self.scaffold = monitored_session.Scaffold()
+      self.global_step = variables.get_or_create_global_step()
+      self.train_op = training_util._increment_global_step(self.steps_per_run)
+
+  def tearDown(self):
+    shutil.rmtree(self.model_dir, ignore_errors=True)
+
+  def test_save_steps_saves_in_first_step(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+  def test_save_steps_saves_periodically(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        # Saved (step=5)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Not saved (step=10)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Saved (step=15)
+        self.assertEqual(15,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Not saved (step=20)
+        self.assertEqual(15,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Saved (step=25)
+        self.assertEqual(25,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+  def test_save_steps_saves_at_end(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        mon_sess.run(self.train_op)
+        hook.end(sess)
+        self.assertEqual(10,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
 
 class ResourceCheckpointSaverHookTest(test.TestCase):
 
@@ -1274,6 +1393,19 @@ class ProfilerHookTest(test.TestCase):
         sess.run(self.train_op)  # Saved.
         self.assertEqual(3, self._count_timeline_files())
 
+  def test_run_metadata_saves_in_first_step(self):
+    writer_cache.FileWriterCache.clear()
+    fake_summary_writer.FakeSummaryWriter.install()
+    fake_writer = writer_cache.FileWriterCache.get(self.output_dir)
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.ProfilerHook(
+          save_secs=2, output_dir=self.output_dir)
+      with monitored_session.SingularMonitoredSession(hooks=[hook]) as sess:
+        sess.run(self.train_op)  # Saved.
+        self.assertEqual(
+            list(fake_writer._added_run_metadata.keys()), ['step_1'])
+    fake_summary_writer.FakeSummaryWriter.uninstall()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/checkpoint_ops.py b/tensorflow/python/training/checkpoint_ops.py
index 7f92d94d2be369709608d36c109863b0ebfb7bbe..a6e9662b7305a00f1fcf03245685e93b756942d3 100644
--- a/tensorflow/python/training/checkpoint_ops.py
+++ b/tensorflow/python/training/checkpoint_ops.py
@@ -149,7 +149,7 @@ def _load_and_remap_matrix(ckpt_path,
   num_rows_present = num_rows_to_load
   if remap_rows:
     row_remapping, num_rows_present = (
-        gen_checkpoint_ops._generate_vocab_remapping(  # pylint: disable=protected-access
+        gen_checkpoint_ops.generate_vocab_remapping(
             new_vocab_file=new_row_vocab_file,
             old_vocab_file=old_row_vocab_file,
             new_vocab_offset=new_row_vocab_offset,
@@ -168,7 +168,7 @@ def _load_and_remap_matrix(ckpt_path,
   num_cols_present = new_col_vocab_size
   if remap_cols:
     col_remapping, num_cols_present = (
-        gen_checkpoint_ops._generate_vocab_remapping(  # pylint: disable=protected-access
+        gen_checkpoint_ops.generate_vocab_remapping(
             new_vocab_file=new_col_vocab_file,
             old_vocab_file=old_col_vocab_file,
             new_vocab_offset=0,  # Offset is unused for cols (no partitioning).
@@ -178,7 +178,7 @@ def _load_and_remap_matrix(ckpt_path,
       num_rows_to_load * new_col_vocab_size -
       num_rows_present * num_cols_present, 1
   ])
-  return_tensor = gen_checkpoint_ops._load_and_remap_matrix(  # pylint: disable=protected-access
+  return_tensor = gen_checkpoint_ops.load_and_remap_matrix(
       ckpt_path=ckpt_path,
       old_tensor_name=old_tensor_name,
       row_remapping=row_remapping,
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index fa3de6fad27b6cc773f9f2e86e9f95395eb7c285..e7f88de1d2290a49f3b7bdf47417016d7e7c9cea 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables
@@ -289,10 +290,20 @@ def _set_checkpoint_initializer(variable,
     name: Name of the operation.
   """
   base_type = variable.dtype.base_dtype
-  with ops.colocate_with(variable):
+  # Do not colocate with variable since RestoreV2 op only runs on CPU and
+  # colocation will force variable (and other ops that colocate with variable)
+  # to be on CPU as well. It is okay to place the variable's initializer op on
+  # CPU since it will only be run once at the start.
+  with ops.device(variable.device), ops.device("/cpu:0"):
     restore_op = io_ops.restore_v2(
         ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
-    variable._initializer_op = state_ops.assign(variable, restore_op)  # pylint:disable=protected-access
+    if isinstance(variable, resource_variable_ops.ResourceVariable):
+      init_op = variable.assign(restore_op, read_value=False)
+    else:
+      init_op = state_ops.assign(variable, restore_op)
+    variable._initializer_op = init_op  # pylint:disable=protected-access
+    restore_op.set_shape(variable.shape)
+    variable._initial_value = restore_op  # pylint:disable=protected-access
 
 
 def _set_variable_or_list_initializer(variable_or_list, ckpt_file,
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index cd17faa040d5b85263b54bc53100b18f736a12e0..4e08a1c859fbaac75e7cd09ad498d9fea14c6338 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -145,6 +146,36 @@ class CheckpointsTest(test.TestCase):
         # Check that tensors are not explicitly in the graph.
         self.assertLess(len(str(session.graph.as_graph_def())), 29000)
 
+  def testInitialValueComesFromCheckpoint(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, _, _, _ = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        with variable_scope.variable_scope(
+            "some_scope", initializer=init_ops.zeros_initializer()):
+          my1 = variable_scope.get_variable("my1", [1, 10])
+
+        before = my1.initialized_value()
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir, {"var1": my1})
+
+        after = my1.initialized_value()
+
+        self.assertAllEqual(session.run(before), [[0.0] * 10])
+        self.assertAllEqual(session.run(after), v1)
+
+        session.run(variables.global_variables_initializer())
+
+        self.assertAllEqual(session.run(my1), v1)
+        self.assertAllEqual(session.run(my1.initialized_value()), v1)
+        self.assertAllClose(session.run(before), v1)
+        self.assertAllClose(session.run(after), v1)
+        with self.assertRaises(AssertionError):
+          self.assertAllClose(v1, [[0.0] * 10])
+
   def testInitWithScopeDoesNotCaptureSuffixes(self):
     checkpoint_dir = self.get_temp_dir()
     with self.test_session() as session:
@@ -176,7 +207,9 @@ class CheckpointsTest(test.TestCase):
 
       checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                             {"useful_scope/": "useful_scope/"})
-      self.assertEqual(my4._initializer_op.op.inputs[1].device, "/job:ps")
+      # initializer runs on the same task but always on CPU.
+      self.assertEqual(my4._initializer_op.op.inputs[1].device,
+                       "/job:ps/device:CPU:0")
 
   def testInitFromRootCheckpoint(self):
     checkpoint_dir = self.get_temp_dir()
@@ -332,6 +365,31 @@ class CheckpointsTest(test.TestCase):
           checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                                 {"useful_scope": "some_scope/"})
 
+  def testNoAdditionalReadOpsForResourceVariables(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, _, _, _ = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        my1 = resource_variable_ops.ResourceVariable([[0.0] * 10], name="my1")
+
+        with ops.name_scope("init_from_checkpoint"):
+          checkpoint_utils.init_from_checkpoint(checkpoint_dir, {"var1": my1})
+
+        # Basic sanity checks:
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(session.run(my1), v1)
+
+    ops_in_init_from_checkpoint_scope = [
+        op for op in g.get_operations()
+        if (op.name.startswith("init_from_checkpoint/") and
+            not op.name.startswith("init_from_checkpoint/checkpoint_initializer"
+                                  ) and op.type != "AssignVariableOp")
+    ]
+    self.assertEqual(ops_in_init_from_checkpoint_scope, [])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index 9d62c5ff91449189eeed8d017c21134a1230c3c8..05afd37ccd5e80c3bbb1ecc2638c559b0115866b 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -18,23 +18,28 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import weakref
 
-from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
+
+# Key where the object graph proto is saved in a TensorBundle
+OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
+
+
 # A key indicating a variable's value in an object's checkpointed Tensors
-# (Checkpointable._gather_tensors_for_checkpoint). If this is the only key and
+# (Checkpointable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
 # creation (avoiding double assignment when executing eagerly).
 VARIABLE_VALUE_KEY = "VARIABLE_VALUE"
 
-_CheckpointableReference = collections.namedtuple(
-    "_CheckpointableReference",
+CheckpointableReference = collections.namedtuple(
+    "CheckpointableReference",
     [
         # The local name for this dependency.
         "name",
@@ -57,7 +62,7 @@ class CheckpointInitialValue(ops.Tensor):
   """
 
   def __init__(self, checkpoint_position, shape=None):
-    self.wrapped_value = checkpoint_position.restore_ops()[
+    self.wrapped_value = checkpoint_position.value_tensors()[
         VARIABLE_VALUE_KEY]
     if shape:
       # We need to set the static shape information on the initializer if
@@ -95,12 +100,13 @@ class _CheckpointPosition(object):
 
   def restore(self, checkpointable):
     """Restore this value into `checkpointable`."""
-    if self.bind_object(checkpointable):
-      # This object's correspondence with a checkpointed object is new, so
-      # process deferred restorations for it and its dependencies.
-      restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
-      if restore_ops:
-        self._checkpoint.restore_ops.extend(restore_ops)
+    with ops.init_scope():
+      if self.bind_object(checkpointable):
+        # This object's correspondence with a checkpointed object is new, so
+        # process deferred restorations for it and its dependencies.
+        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        if restore_ops:
+          self._checkpoint.restore_ops.extend(restore_ops)
 
   def bind_object(self, checkpointable):
     """Set a checkpoint<->object correspondence and process slot variables.
@@ -114,6 +120,7 @@ class _CheckpointPosition(object):
       AssertionError: If another object is already bound to the `Object` proto.
     """
     checkpoint = self.checkpoint
+    checkpoint.all_python_objects.add(checkpointable)
     current_assignment = checkpoint.object_by_proto_id.get(self._proto_id, None)
     if current_assignment is None:
       checkpoint.object_by_proto_id[self._proto_id] = checkpointable
@@ -152,12 +159,12 @@ class _CheckpointPosition(object):
       # consistent (if the dependency DAG is not a tree then there are
       # multiple paths to the same object).
       if current_assignment is not checkpointable:
-        raise AssertionError(
-            ("Unable to load the checkpoint into this object graph. Either "
-             "the Checkpointable object references in the Python program "
-             "have changed in an incompatible way, or the checkpoint was "
-             "generated in an incompatible program.\n\nTwo checkpoint "
-             "references resolved to different objects (%s and %s).")
+        logging.warning(
+            ("Inconsistent references when loading the checkpoint into this "
+             "object graph. Either the Checkpointable object references in the "
+             "Python program have changed in an incompatible way, or the "
+             "checkpoint was generated in an incompatible program.\n\nTwo "
+             "checkpoint references resolved to different objects (%s and %s).")
             % (current_assignment, checkpointable))
       return False  # Not a new assignment
 
@@ -168,22 +175,93 @@ class _CheckpointPosition(object):
             and attributes[0].name == VARIABLE_VALUE_KEY
             and not self.object_proto.children)
 
-  def restore_ops(self):
-    """Create restore ops for this object's attributes."""
-    restore_tensors = {}
+  def value_tensors(self):
+    """Create value `Tensor`s for this object's attributes.
+
+    Does not require that the Python object has been created. Used for
+    restore-on-create when executing eagerly.
+
+    Returns:
+      A dictionary mapping from object attribute names to `Tensor`s.
+    """
+    value_tensors = {}
     for serialized_tensor in self.object_proto.attributes:
       checkpoint_key = serialized_tensor.checkpoint_key
       dtype = self._checkpoint.dtype_map[checkpoint_key]
       base_type = dtype.base_dtype
       with ops.init_scope():
-        restore, = io_ops.restore_v2(
-            prefix=self._checkpoint.save_path,
-            tensor_names=[checkpoint_key],
-            shape_and_slices=[""],
-            dtypes=[base_type],
-            name="%s_checkpoint_read" % (serialized_tensor.name,))
-        restore_tensors[serialized_tensor.name] = restore
-      return restore_tensors
+        with ops.device("/cpu:0"):
+          # Run the restore itself on the CPU.
+          value, = io_ops.restore_v2(
+              prefix=self._checkpoint.save_path,
+              tensor_names=[checkpoint_key],
+              shape_and_slices=[""],
+              dtypes=[base_type],
+              name="%s_checkpoint_read" % (serialized_tensor.name,))
+        # Copy the value to the current device if necessary.
+        value_tensors[serialized_tensor.name] = array_ops.identity(value)
+      return value_tensors
+
+  def restore_ops(self):
+    """Create or fetch restore ops for this object's attributes.
+
+    Requires that the `Checkpointable` Python object has been bound to an object
+    ID in the checkpoint.
+
+    Returns:
+      A list of operations when graph building, or an empty list when executing
+      eagerly.
+    """
+    saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
+    # Name saveables based on the name this object had when it was checkpointed.
+    named_saveables = {}
+    restore_ops = []
+    building_graph = not context.executing_eagerly()
+    for serialized_tensor in self.object_proto.attributes:
+      saveable_factory = saveables.get(serialized_tensor.name, None)
+      if saveable_factory is None:
+        # Purposefully does not throw an exception if attributes have been added
+        # or deleted. Stores unused attributes so an exception can be raised if
+        # the user decides to check that everything in the checkpoint was
+        # loaded.
+        self._checkpoint.unused_attributes.setdefault(
+            self.checkpointable, []).append(serialized_tensor.name)
+        continue
+      if building_graph:
+        existing_ops = self._checkpoint.restore_ops_by_name.get(
+            serialized_tensor.name, None)
+      else:
+        existing_ops = None
+      if existing_ops is None:
+        if callable(saveable_factory):
+          saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
+        else:
+          saveable = saveable_factory
+        named_saveables[serialized_tensor.checkpoint_key] = saveable
+    if named_saveables:
+      validated_saveables = (
+          self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
+      validated_names = set(saveable.name for saveable in validated_saveables)
+      if set(named_saveables.keys()) != validated_names:
+        raise AssertionError(
+            ("Saveable keys changed when validating. Got back %s, was "
+             "expecting %s") % (named_saveables.keys(), validated_names))
+      all_tensors = self._checkpoint.builder.bulk_restore(
+          filename_tensor=self._checkpoint.save_path,
+          saveables=validated_saveables, preferred_shard=-1,
+          restore_sequentially=False)
+      saveable_index = 0
+      for saveable in validated_saveables:
+        num_specs = len(saveable.specs)
+        saveable_tensors = all_tensors[
+            saveable_index:saveable_index + num_specs]
+        saveable_index += num_specs
+        restore_op = saveable.restore(saveable_tensors, restored_shapes=None)
+        if building_graph:
+          assert saveable.name not in self._checkpoint.restore_ops_by_name
+          self._checkpoint.restore_ops_by_name[saveable.name] = restore_op
+          restore_ops.append(restore_op)
+    return restore_ops
 
   @property
   def checkpoint(self):
@@ -225,54 +303,6 @@ _SlotVariableRestoration = collections.namedtuple(
     ])
 
 
-class _Checkpoint(object):
-  """Holds the status of an object-based checkpoint load."""
-
-  def __init__(self, object_graph_proto, save_path):
-    """Specify the checkpoint being loaded.
-
-    Args:
-      object_graph_proto: The CheckpointableObjectGraph protocol buffer
-        associated with this checkpoint.
-      save_path: The path to the checkpoint, as returned by
-        `tf.train.latest_checkpoint`.
-    """
-    self.object_graph_proto = object_graph_proto
-    self.restore_uid = ops.uid()
-    # Dictionary mapping from an id in the protocol buffer flat array to
-    # Checkpointable Python objects. This mapping may be deferred if a
-    # checkpoint is restored before all dependencies have been tracked. Uses
-    # weak references so that partial restorations don't create reference cycles
-    # (as objects with deferred dependencies will generally have references to
-    # this object).
-    self.object_by_proto_id = weakref.WeakValueDictionary()
-    self.save_path = save_path
-    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-    self.dtype_map = reader.get_variable_to_dtype_map()
-    # When graph building, contains a list of ops to run to restore objects from
-    # this checkpoint.
-    self.restore_ops = []
-    # A mapping from optimizer proto ids to lists of slot variables to be
-    # restored when the optimizer is tracked. Only includes slot variables whose
-    # regular variables have already been created, and only for optimizer
-    # objects which have not yet been created/tracked.
-    self.deferred_slot_restorations = {}
-    # A mapping from variable proto ids to lists of slot variables to be
-    # restored when the variable is created/tracked. These get shifted over to
-    # deferred_slot_restorations if the optimizer hasn't been created when that
-    # happens.
-    self.slot_restorations = {}
-    for node_index, node in enumerate(self.object_graph_proto.nodes):
-      for slot_reference in node.slot_variables:
-        # `node` refers to an `Optimizer`, since only these have slot variables.
-        self.slot_restorations.setdefault(
-            slot_reference.original_variable_node_id, []).append(
-                _SlotVariableRestoration(
-                    optimizer_id=node_index,
-                    slot_variable_id=slot_reference.slot_variable_node_id,
-                    slot_name=slot_reference.slot_name))
-
-
 class CheckpointableBase(object):
   """Base class for `Checkpointable` objects without automatic dependencies.
 
@@ -287,17 +317,22 @@ class CheckpointableBase(object):
 
     Not __init__, since most objects will forget to call it.
     """
-    if hasattr(self, "_checkpoint_dependencies"):
+    if hasattr(self, "_unconditional_checkpoint_dependencies"):
       # __init__ already called. This check means that we don't need
       # Checkpointable.__init__() in the constructor of every TensorFlow object.
       return
-    # A list of _CheckpointableReference objects.
-    self._checkpoint_dependencies = []
+    # A list of CheckpointableReference objects. Some classes implementing
+    # `Checkpointable`, notably `Optimizer`s, may override the
+    # _checkpoint_dependencies property with conditional dependencies
+    # (e.g. based on the current graph when saving).
+    self._unconditional_checkpoint_dependencies = []
     # Maps names -> Checkpointable objects
-    self._dependency_names = {}
+    self._unconditional_dependency_names = {}
     # Restorations for other Checkpointable objects on which this object may
-    # eventually depend.
-    self._deferred_dependencies = {}  # local name -> _CheckpointPosition list
+    # eventually depend. Maps local name -> _CheckpointPosition list. Optimizers
+    # tack on conditional dependencies, and so need separate management of
+    # deferred dependencies too.
+    self._unconditional_deferred_dependencies = {}
     # The UID of the highest assignment to this object. Used to ensure that the
     # last requested assignment determines the final value of an object.
     if hasattr(self, "_update_uid"):
@@ -306,9 +341,51 @@ class CheckpointableBase(object):
           "initialization code was run.")
     self._update_uid = -1
 
+  @property
+  def _checkpoint_dependencies(self):
+    """All dependencies of this object.
+
+    May be overridden to include conditional dependencies.
+
+    Returns:
+      A list of `CheckpointableReference` objects indicating named
+      `Checkpointable` dependencies which should be saved along with this
+      object.
+    """
+    return self._unconditional_checkpoint_dependencies
+
+  @property
+  def _deferred_dependencies(self):
+    """A dictionary with deferred dependencies.
+
+    Stores restorations for other Checkpointable objects on which this object
+    may eventually depend. May be overridden by sub-classes (e.g. Optimizers use
+    conditional dependencies based the current graph, and so need separate
+    management of deferred dependencies too).
+
+    Returns:
+      A dictionary mapping from local name to a list of _CheckpointPosition
+      objects.
+    """
+    return self._unconditional_deferred_dependencies
+
+  def _lookup_dependency(self, name):
+    """Look up a dependency by name.
+
+    May be overridden to include conditional dependencies.
+
+    Args:
+      name: The local name of the dependency.
+    Returns:
+      A `Checkpointable` object, or `None` if no dependency by this name was
+      found.
+    """
+    return self._unconditional_dependency_names.get(name, None)
+
   def _add_variable_with_custom_getter(
       self, name, shape=None, dtype=dtypes.float32,
-      initializer=None, getter=None, **kwargs_for_getter):
+      initializer=None, getter=None, overwrite=False,
+      **kwargs_for_getter):
     """Restore-on-create for a variable be saved with this `Checkpointable`.
 
     If the user has requested that this object or another `Checkpointable` which
@@ -320,12 +397,11 @@ class CheckpointableBase(object):
       name: A name for the variable. Must be unique within this object.
       shape: The shape of the variable.
       dtype: The data type of the variable.
-
       initializer: The initializer to use. Ignored if there is a deferred
         restoration left over from a call to
         `_restore_from_checkpoint_position`.
-
       getter: The getter to wrap which actually fetches the variable.
+      overwrite: If True, disables unique name and type checks.
       **kwargs_for_getter: Passed to the getter.
 
     Returns:
@@ -335,34 +411,35 @@ class CheckpointableBase(object):
       ValueError: If the variable name is not unique.
     """
     self._maybe_initialize_checkpointable()
-    if name in self._dependency_names:
+    if not overwrite and self._lookup_dependency(name) is not None:
       raise ValueError(
           ("A variable named '%s' already exists in this Checkpointable, but "
            "Checkpointable._add_variable called to create another with "
            "that name. Variable names must be unique within a Checkpointable "
            "object.") % (name,))
-    if context.in_eager_mode():
-      # If this is a variable with a single Tensor stored in the checkpoint, we
-      # can set that value as an initializer rather than initializing and then
-      # assigning (when executing eagerly). This call returns None if there is
-      # nothing to restore.
-      checkpoint_initializer = self._preload_simple_restoration(
-          name=name, shape=shape)
-    else:
-      checkpoint_initializer = None
-    if (checkpoint_initializer is not None
-        and not (
-            isinstance(initializer, CheckpointInitialValue)
-            and initializer.restore_uid > checkpoint_initializer.restore_uid)):
-      # If multiple Checkpointable objects are "creating" the same variable via
-      # the magic of custom getters, the one with the highest restore UID (the
-      # one called last) has to make the final initializer. If another custom
-      # getter interrupts this process by overwriting the initializer, then
-      # we'll catch that when we call _track_checkpointable. So this is "best
-      # effort" to set the initializer with the highest restore UID.
-      initializer = checkpoint_initializer
-      shape = None
-
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # If this is a variable with a single Tensor stored in the checkpoint,
+        # we can set that value as an initializer rather than initializing and
+        # then assigning (when executing eagerly). This call returns None if
+        # there is nothing to restore.
+        checkpoint_initializer = self._preload_simple_restoration(
+            name=name, shape=shape)
+      else:
+        checkpoint_initializer = None
+      if (checkpoint_initializer is not None
+          and not (
+              isinstance(initializer, CheckpointInitialValue)
+              and (initializer.restore_uid
+                   > checkpoint_initializer.restore_uid))):
+        # If multiple Checkpointable objects are "creating" the same variable
+        # via the magic of custom getters, the one with the highest restore UID
+        # (the one called last) has to make the final initializer. If another
+        # custom getter interrupts this process by overwriting the initializer,
+        # then we'll catch that when we call _track_checkpointable. So this is
+        # "best effort" to set the initializer with the highest restore UID.
+        initializer = checkpoint_initializer
+        shape = None
     new_variable = getter(
         name=name, shape=shape, dtype=dtype, initializer=initializer,
         **kwargs_for_getter)
@@ -371,7 +448,13 @@ class CheckpointableBase(object):
     # assign again. It will add this variable to our dependencies, and if there
     # is a non-trivial restoration queued, it will handle that. This also
     # handles slot variables.
-    return self._track_checkpointable(new_variable, name=name)
+    if not overwrite or isinstance(new_variable, CheckpointableBase):
+      return self._track_checkpointable(new_variable, name=name,
+                                        overwrite=overwrite)
+    else:
+      # TODO(allenl): Some variable types are not yet supported. Remove this
+      # fallback once all get_variable() return types are Checkpointable.
+      return new_variable
 
   def _preload_simple_restoration(self, name, shape):
     """Return a dependency's value for restore-on-create.
@@ -415,13 +498,10 @@ class CheckpointableBase(object):
     Indicates that checkpoints for this object should include variables from
     `checkpointable`.
 
-    Variables in a checkpoint are mapped to `Checkpointable`s based on names if
-    provided when the checkpoint was written, but otherwise use the order those
-    `Checkpointable`s were declared as dependencies.
-
-    To avoid breaking existing checkpoints when modifying a class, neither
-    variable names nor dependency names (the names passed to
-    `track_checkpointable`) may change.
+    Variables in a checkpoint are mapped to `Checkpointable`s based on the names
+    provided when the checkpoint was written. To avoid breaking existing
+    checkpoints when modifying a class, neither variable names nor dependency
+    names (the names passed to `_track_checkpointable`) may change.
 
     Args:
       checkpointable: A `Checkpointable` which this object depends on.
@@ -444,9 +524,10 @@ class CheckpointableBase(object):
       raise TypeError(
           ("Checkpointable._track_checkpointable() passed type %s, not a "
            "Checkpointable.") % (type(checkpointable),))
-    new_reference = _CheckpointableReference(name=name, ref=checkpointable)
-    if (name in self._dependency_names
-        and self._dependency_names[name] is not checkpointable):
+    new_reference = CheckpointableReference(name=name, ref=checkpointable)
+    current_object = self._lookup_dependency(name)
+    if (current_object is not None
+        and current_object is not checkpointable):
       if not overwrite:
         raise ValueError(
             ("Called Checkpointable._track_checkpointable() with name='%s', "
@@ -454,19 +535,48 @@ class CheckpointableBase(object):
              "dependency. Names must be unique (or overwrite=True).") % (name,))
       # This is a weird thing to do, but we're not going to stop people from
       # using __setattr__.
-      for index, (old_name, _) in enumerate(self._checkpoint_dependencies):
+      for index, (old_name, _) in enumerate(
+          self._unconditional_checkpoint_dependencies):
         if name == old_name:
-          self._checkpoint_dependencies[index] = new_reference
+          self._unconditional_checkpoint_dependencies[index] = new_reference
     else:
-      self._checkpoint_dependencies.append(new_reference)
+      self._unconditional_checkpoint_dependencies.append(new_reference)
 
-    self._dependency_names[name] = checkpointable
-    deferred_dependency_list = self._deferred_dependencies.pop(name, None)
-    if deferred_dependency_list is not None:
-      for checkpoint_position in deferred_dependency_list:
-        checkpoint_position.restore(checkpointable=checkpointable)
+    self._unconditional_dependency_names[name] = checkpointable
+    self._handle_deferred_dependencies(name=name, checkpointable=checkpointable)
     return checkpointable
 
+  def _handle_deferred_dependencies(self, name, checkpointable):
+    """Pop and load any deferred checkpoint restores into `checkpointable`.
+
+    This method does not add a new dependency on `checkpointable`, but it does
+    check if any outstanding/deferred dependencies have been queued waiting for
+    this dependency to be added (matched based on `name`). If so,
+    `checkpointable` and its dependencies are restored. The restorations are
+    considered fulfilled and so are deleted.
+
+    `_track_checkpointable` is more appropriate for adding a
+    normal/unconditional dependency, and includes handling for deferred
+    restorations. This method allows objects such as `Optimizer` to use the same
+    restoration logic while managing conditional dependencies themselves, by
+    overriding `_checkpoint_dependencies` and `_lookup_dependency` to change the
+    object's dependencies based on the context it is saved/restored in (a single
+    optimizer instance can have state associated with multiple graphs).
+
+    Args:
+      name: The name of the dependency within this object (`self`), used to
+        match `checkpointable` with values saved in a checkpoint.
+      checkpointable: The Checkpointable object to restore (inheriting from
+        `CheckpointableBase`).
+    """
+    self._maybe_initialize_checkpointable()
+    deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
+    for checkpoint_position in sorted(
+        deferred_dependencies_list,
+        key=lambda restore: restore.checkpoint.restore_uid,
+        reverse=True):
+      checkpoint_position.restore(checkpointable)
+
   def _restore_from_checkpoint_position(self, checkpoint_position):
     """Restore this object and its dependencies (may be deferred)."""
     # Attempt a breadth-first traversal, since presumably the user has more
@@ -493,16 +603,16 @@ class CheckpointableBase(object):
     # need to actually restore the object. However, we should pass the
     # restoration on to our dependencies.
     if checkpoint.restore_uid > self._update_uid:
-      restore_op = self._scatter_tensors_from_checkpoint(
-          checkpoint_position.restore_ops())
+      restore_ops = checkpoint_position.restore_ops()
+      # TODO(allenl): Get a list of feeds for saving Python state
       self._update_uid = checkpoint.restore_uid
     else:
-      restore_op = ()
+      restore_ops = ()
     for child in checkpoint_position.object_proto.children:
       child_position = _CheckpointPosition(
           checkpoint=checkpoint,
           proto_id=child.node_id)
-      local_object = self._dependency_names.get(child.local_name, None)
+      local_object = self._lookup_dependency(child.local_name)
       if local_object is None:
         # We don't yet have a dependency registered with this name. Save it
         # in case we do.
@@ -515,25 +625,37 @@ class CheckpointableBase(object):
           # resolution order (shallowest paths first). The caller is responsible
           # for emptying visit_queue.
           visit_queue.append(child_position)
-    return restore_op
+    return restore_ops
 
-  def _scatter_tensors_from_checkpoint(self, attributes):
-    """Restores this object from a checkpoint.
+  def _gather_saveables_for_checkpoint(self):
+    """Returns a dictionary of values to checkpoint with this object.
+
+    Keys in the returned dictionary are local to this object and in a separate
+    namespace from dependencies. Values may either be `SaveableObject` factories
+    or variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
+    `var_list` constructor argument).
+
+    `SaveableObjects` have a name set, which Checkpointable needs to generate
+    itself. So rather than returning `SaveableObjects` directly, this method
+    should return a dictionary of callables which take `name` arguments and
+    return `SaveableObjects` with that name.
+
+    If this object may also be passed to the global-name-based `tf.train.Saver`,
+    the returned callables should have a default value for their name argument
+    (i.e. be callable with no arguments).
+
+    Returned values must be saved only by this object; if any value may be
+    shared, it should instead be a dependency. For example, variable objects
+    save their own values with the key `VARIABLE_VALUE_KEY`, but objects which
+    reference variables simply add a dependency.
 
-    Args:
-      attributes: A dictionary of Tensors, with key corresponding to those
-        returned from _gather_tensors_for_checkpoint.
     Returns:
-      A restore op to run (if graph building).
+      The dictionary mapping attribute names to `SaveableObject` factories
+      described above. For example:
+      {VARIABLE_VALUE_KEY:
+       lambda name="global_name_for_this_object":
+       SaveableObject(name=name, ...)}
     """
-    if attributes:
-      raise AssertionError(
-          ("A Checkpointable object which was not expecting any data received "
-           "some from a checkpoint. (Got %s)") % (attributes,))
-    return ()  # No restore ops
-
-  def _gather_tensors_for_checkpoint(self):
-    """Returns a dictionary of Tensors to save with this object."""
     return {}
 
 
@@ -562,8 +684,7 @@ class Checkpointable(CheckpointableBase):
   `Checkpointable` objects may specify `Tensor`s to be saved and restored
   directly (e.g. a `Variable` indicating how to save itself) rather than through
   dependencies on other objects. See
-  `Checkpointable._scatter_tensors_from_checkpoint` and
-  `Checkpointable._gather_tensors_for_checkpoint` for details.
+  `Checkpointable._gather_saveables_for_checkpoint` for details.
   """
 
   def __setattr__(self, name, value):
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cdd53cbf9629bf2c5784c42dd58a1aa677b8db8
--- /dev/null
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -0,0 +1,1141 @@
+"""Utilities for saving/loading Checkpointable objects."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+import weakref
+
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable as checkpointable_lib
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
+
+
+_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. Checkpoint names for slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
+# Keyword for separating the path to an object from the name of an
+# attribute in checkpoint names. Used like:
+#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
+_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
+
+
+class _CheckpointRestoreCoordinator(object):
+  """Holds the status of an object-based checkpoint load."""
+
+  def __init__(self, object_graph_proto, save_path, dtype_map=None):
+    """Specify the checkpoint being loaded.
+
+    Args:
+      object_graph_proto: The CheckpointableObjectGraph protocol buffer
+        associated with this checkpoint.
+      save_path: A string `Tensor`. The path to the checkpoint, as returned by
+        `tf.train.latest_checkpoint`.
+      dtype_map: When executing eagerly, specifies dtypes for creating slot
+        variables. None when graph building.
+    """
+    self.builder = saver_lib.BulkSaverBuilder()
+    self.object_graph_proto = object_graph_proto
+    self.restore_uid = ops.uid()
+    # Maps from objects to lists of attributes which were in the checkpoint but
+    # not loaded into any object, for error checking.
+    self.unused_attributes = weakref.WeakKeyDictionary()
+    # Dictionary mapping from an id in the protocol buffer flat array to
+    # Checkpointable Python objects. This mapping may be deferred if a
+    # checkpoint is restored before all dependencies have been tracked. Uses
+    # weak references so that partial restorations don't create reference cycles
+    # (as objects with deferred dependencies will generally have references to
+    # this object).
+    self.object_by_proto_id = weakref.WeakValueDictionary()
+    # A set of all Python objects we've seen as dependencies, even if we didn't
+    # use them (for example because of inconsistent references when
+    # loading). Used to make status assertions fail when loading checkpoints
+    # that don't quite match.
+    self.all_python_objects = weakref.WeakSet()
+    self.save_path = save_path
+    self.dtype_map = dtype_map
+    # When graph building, contains a list of ops to run to restore objects from
+    # this checkpoint.
+    self.restore_ops = []
+    self.restore_ops_by_name = {}
+    # A mapping from optimizer proto ids to lists of slot variables to be
+    # restored when the optimizer is tracked. Only includes slot variables whose
+    # regular variables have already been created, and only for optimizer
+    # objects which have not yet been created/tracked.
+    self.deferred_slot_restorations = {}
+    # A mapping from variable proto ids to lists of slot variables to be
+    # restored when the variable is created/tracked. These get shifted over to
+    # deferred_slot_restorations if the optimizer hasn't been created when that
+    # happens.
+    self.slot_restorations = {}
+    for node_index, node in enumerate(self.object_graph_proto.nodes):
+      for slot_reference in node.slot_variables:
+        # `node` refers to an `Optimizer`, since only these have slot variables.
+        self.slot_restorations.setdefault(
+            slot_reference.original_variable_node_id, []).append(
+                checkpointable_lib._SlotVariableRestoration(  # pylint: disable=protected-access
+                    optimizer_id=node_index,
+                    slot_variable_id=slot_reference.slot_variable_node_id,
+                    slot_name=slot_reference.slot_name))
+
+
+# TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
+# or consolidating the implementation with get_variable.
+def _default_getter(name, shape, dtype, initializer=None,
+                    partition_info=None, **kwargs):
+  """A pared-down version of get_variable which does not reuse variables."""
+  dtype = dtypes.as_dtype(dtype)
+  shape_object = tensor_shape.as_shape(shape)
+  with ops.init_scope():
+    if initializer is None:
+      initializer, initializing_from_value = (
+          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
+              name=name, shape=shape_object, dtype=dtype))
+    else:
+      initializing_from_value = not callable(initializer)
+    # Same logic as get_variable
+    variable_dtype = dtype.base_dtype
+    if initializing_from_value:
+      if shape is not None:
+        raise ValueError("If initializer is a constant, do not specify shape.")
+      initial_value = initializer
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      def initial_value():
+        return initializer(
+            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
+    return resource_variable_ops.ResourceVariable(
+        initial_value=initial_value,
+        name=name,
+        dtype=variable_dtype,
+        **kwargs
+    )
+
+
+def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
+                 initializer=None):
+  """Add a variable to a Checkpointable with no scope influence."""
+  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
+      name=name, shape=shape, dtype=dtype,
+      initializer=initializer, getter=_default_getter)
+
+
+def _breadth_first_checkpointable_traversal(root_checkpointable):
+  """Find shortest paths to all variables owned by dependencies of root."""
+  bfs_sorted = []
+  to_visit = collections.deque([root_checkpointable])
+  path_to_root = {root_checkpointable: ()}
+  while to_visit:
+    current_checkpointable = to_visit.popleft()
+    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    bfs_sorted.append(current_checkpointable)
+    for child_checkpointable in (
+        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
+      if child_checkpointable.ref not in path_to_root:
+        path_to_root[child_checkpointable.ref] = (
+            path_to_root[current_checkpointable] + (child_checkpointable,))
+        to_visit.append(child_checkpointable.ref)
+  return bfs_sorted, path_to_root
+
+
+def _escape_local_name(name):
+  # We need to support slashes in local names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # names.
+  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
+          .replace(r"/", _ESCAPE_CHAR + "S"))
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (_escape_local_name(checkpointable.name)
+       for checkpointable in path_to_root))
+
+
+def _slot_variable_naming_for_optimizer(optimizer_path):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+    return (variable_path
+            + optimizer_identifier
+            + _escape_local_name(slot_name))
+
+  return _name_slot_variable
+
+
+def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
+  """Gather and name slot variables."""
+  non_slot_objects = list(checkpointable_objects)
+  slot_variables = {}
+  for checkpointable in non_slot_objects:
+    if isinstance(checkpointable, optimizer_lib.Optimizer):
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer_path=object_names[checkpointable])
+      slot_names = checkpointable.get_slot_names()
+      for slot_name in slot_names:
+        for original_variable_node_id, original_variable in enumerate(
+            non_slot_objects):
+          try:
+            slot_variable = checkpointable.get_slot(
+                original_variable, slot_name)
+          except AttributeError:
+            slot_variable = None
+          if slot_variable is None:
+            continue
+          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
+            # TODO(allenl): Gather dependencies of slot variables.
+            raise NotImplementedError(
+                "Currently only variables with no dependencies can be saved as "
+                "slot variables. File a feature request if this limitation "
+                "bothers you.")
+          if slot_variable in node_ids:
+            raise NotImplementedError(
+                "A slot variable was re-used as a dependency of a "
+                "Checkpointable object. This is not currently allowed. File a "
+                "feature request if this limitation bothers you.")
+          checkpoint_name = naming_scheme(
+              variable_path=object_names[original_variable],
+              slot_name=slot_name)
+          object_names[slot_variable] = checkpoint_name
+          slot_variable_node_id = len(checkpointable_objects)
+          node_ids[slot_variable] = slot_variable_node_id
+          checkpointable_objects.append(slot_variable)
+          slot_variable_proto = (
+              checkpointable_object_graph_pb2.CheckpointableObjectGraph
+              .CheckpointableObject.SlotVariableReference(
+                  slot_name=slot_name,
+                  original_variable_node_id=original_variable_node_id,
+                  slot_variable_node_id=slot_variable_node_id))
+          slot_variables.setdefault(checkpointable, []).append(
+              slot_variable_proto)
+  return slot_variables
+
+
+def _serialize_checkpointables(
+    checkpointable_objects, node_ids, object_names, slot_variables):
+  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
+  object_graph_proto = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  named_saveables = {}
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    assert node_ids[checkpointable] == checkpoint_id
+    object_proto = object_graph_proto.nodes.add()
+    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
+    object_name = object_names[checkpointable]
+    for name, saveable_factory in (
+        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+      attribute = object_proto.attributes.add()
+      attribute.name = name
+      attribute.checkpoint_key = "%s/%s/%s" % (
+          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+      if callable(saveable_factory):
+        saveable = saveable_factory(name=attribute.checkpoint_key)
+      else:
+        saveable = saveable_factory
+      # Figure out the name-based Saver's name for this variable.
+      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          [saveable], convert_variable_to_tensor=False)
+      attribute.full_name, = saver_dict.keys()
+      named_saveables[attribute.checkpoint_key] = saveable
+
+    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
+      child_proto = object_proto.children.add()
+      child_proto.node_id = node_ids[child.ref]
+      child_proto.local_name = child.name
+
+  return named_saveables, object_graph_proto
+
+
+def _serialize_object_graph(root_checkpointable):
+  """Determine checkpoint keys for variables and build a serialized graph.
+
+  Non-slot variables are keyed based on a shortest path from the root saveable
+  to the object which owns the variable (i.e. the one which called
+  `Checkpointable._add_variable` to create it).
+
+  Slot variables are keyed based on a shortest path to the variable being
+  slotted for, a shortest path to their optimizer, and the slot name.
+
+  Args:
+    root_checkpointable: A `Checkpointable` object whose variables (including
+      the variables of dependencies, recursively) should be saved.
+
+  Returns:
+    A tuple of (named_variables, object_graph_proto):
+      named_variables: A dictionary mapping names to variable objects.
+      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
+        the serialized object graph and variable references.
+
+  Raises:
+    ValueError: If there are invalid characters in an optimizer's slot names.
+  """
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = {
+      obj: _object_prefix_from_path(path)
+      for obj, path in path_to_root.items()}
+  node_ids = {node: node_id for node_id, node
+              in enumerate(checkpointable_objects)}
+  slot_variables = _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return _serialize_checkpointables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names,
+      slot_variables=slot_variables)
+
+
+def list_objects(root_checkpointable):
+  """Traverse the object graph and list all accessible objects.
+
+  Looks for `Checkpointable` objects which are dependencies of
+  `root_checkpointable`. Includes slot variables only if the variable they are
+  slotting for and the optimizer are dependencies of `root_checkpointable`
+  (i.e. if they would be saved with a checkpoint).
+
+  Args:
+    root_checkpointable: A `Checkpointable` object whose dependencies should be
+      flattened.
+  Returns:
+    A flat list of objects.
+  """
+  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
+  # to run.
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = {
+      obj: _object_prefix_from_path(path)
+      for obj, path in path_to_root.items()}
+  node_ids = {node: node_id for node_id, node
+              in enumerate(checkpointable_objects)}
+  _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return checkpointable_objects
+
+
+def gather_initializers(root_checkpointable):
+  """Traverse the object graph and find initialization ops.
+
+  Looks for `Checkpointable` objects which are dependencies of
+  `root_checkpointable` and which have an `initializer` property. Includes
+  initializers for slot variables only if the variable they are slotting for and
+  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  saved with a checkpoint).
+
+  Args:
+    root_checkpointable: A `Checkpointable` object to gather initializers for.
+  Returns:
+    A list of initialization ops.
+  """
+  checkpointable_objects = list_objects(root_checkpointable)
+  return [c.initializer for c in checkpointable_objects
+          if hasattr(c, "initializer") and c.initializer is not None]
+
+
+class _NoRestoreSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, tensor, name):
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(tensor, "", name)
+    super(_NoRestoreSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    return control_flow_ops.no_op()
+
+
+class _LoadStatus(object):
+  """Abstract base for load status callbacks."""
+
+  @abc.abstractmethod
+  def assert_consumed(self):
+    """Raises an exception unless a non-trivial restoration has completed."""
+    pass
+
+  @abc.abstractmethod
+  def run_restore_ops(self, session=None):
+    """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
+    pass
+
+  @abc.abstractmethod
+  def initialize_or_restore(self, session=None):
+    """Runs restore ops from the checkpoint, or initializes variables."""
+    pass
+
+
+class CheckpointLoadStatus(_LoadStatus):
+  """Checks the status of checkpoint loading and manages restore ops.
+
+  Returned from `Saver.restore`. Since `restore` may defer the loading of values
+  in the checkpoint which don't yet have corresponding Python objects,
+  `CheckpointLoadStatus` provides a callback to verify that checkpoint loading
+  is complete (`assert_consumed`).
+
+  When graph building, `restore` does not run restore ops itself since their
+  creation may be deferred. The `run_restore_ops` method must be called once all
+  Python objects with values to restore have been created and added to the
+  dependency graph (this does not necessarily have to be the whole checkpoint;
+  calling `run_restore_ops` while `assert_consumed` fails is supported and will
+  partially restore the checkpoint).
+
+  See `Saver.restore` for usage examples.
+  """
+
+  def __init__(self, checkpoint, feed_dict, root_checkpointable):
+    self._checkpoint = checkpoint
+    self._feed_dict = feed_dict
+    self._root_checkpointable = root_checkpointable
+
+  def assert_consumed(self):
+    """Asserts that all objects in the checkpoint have been created/matched.
+
+    Returns:
+      `self` for chaining.
+    Raises:
+      AssertionError: If there are any Python objects in the dependency graph
+        which have not been restored from this checkpoint or a later `restore`,
+        or if there are any checkpointed values which have not been matched to
+        Python objects.
+    """
+    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
+      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if checkpointable is None:
+        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
+      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
+        raise AssertionError(
+            "Object not assigned a value from checkpoint: %s" % (node,))
+    if self._checkpoint.slot_restorations:
+      # Sanity check; this collection should be clear if everything has been
+      # restored.
+      raise AssertionError("Unresolved slot restorations: %s" % (
+          self._checkpoint.slot_restorations,))
+    if self._checkpoint.unused_attributes:
+      raise AssertionError(
+          ("Unused attributes in these objects (the attributes exist in the "
+           "checkpoint but not in the objects): %s") % (
+               self._checkpoint.unused_attributes.items(),))
+    for checkpointable_object in list_objects(self._root_checkpointable):
+      self._checkpoint.all_python_objects.add(checkpointable_object)
+    unused_python_objects = (
+        set(self._checkpoint.all_python_objects)
+        - set(self._checkpoint.object_by_proto_id.values()))
+    if unused_python_objects:
+      raise AssertionError(
+          ("Some Python objects were not bound to checkpointed values, likely "
+           "due to changes in the Python program: %s")
+          % (unused_python_objects,))
+    return self
+
+  def run_restore_ops(self, session=None):
+    """Run operations to restore objects in the dependency graph."""
+    if context.executing_eagerly():
+      return  # Run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
+
+  def initialize_or_restore(self, session=None):
+    """Run operations to initialize or restore objects in the dependency graph.
+
+    Any objects in the dependency graph which have initializers but are not in
+    the checkpoint will have those initializers run, unless those variables are
+    being restored by a later call to `tf.train.Checkpoint.restore()`.
+
+    This method has a sibling in `InitializationOnlyStatus` which instead
+    initializes variables. That type is returned if no checkpoint is specified
+    in `Saver.restore`.
+
+    Args:
+      session: The session to run init/restore ops in. If `None`, uses the
+        default session.
+    """
+    if context.executing_eagerly():
+      return  # Initialization and restoration ops are run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    all_objects = list_objects(self._root_checkpointable)
+    already_initialized_objects = set(
+        self._checkpoint.object_by_proto_id.values())
+    initializers_for_non_restored_variables = [
+        c.initializer for c in all_objects
+        if hasattr(c, "initializer")
+        and c not in already_initialized_objects
+        and (getattr(c, "_update_uid", self._checkpoint.restore_uid - 1)
+             < self._checkpoint.restore_uid)]
+    self.run_restore_ops(session=session)
+    session.run(initializers_for_non_restored_variables)
+
+
+class InitializationOnlyStatus(_LoadStatus):
+  """Returned from `Saver.restore` when no checkpoint has been specified.
+
+  Objects of this type have the same `assert_consumed` method as
+  `CheckpointLoadStatus`, but it always fails. However,
+  `initialize_or_restore` works on objects of both types, and will
+  initialize variables in `InitializationOnlyStatus` objects or restore them
+  otherwise.
+  """
+
+  def __init__(self, root_checkpointable, restore_uid):
+    self._restore_uid = restore_uid
+    self._root_checkpointable = root_checkpointable
+
+  def assert_consumed(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "No checkpoint specified (save_path=None); nothing is being restored.")
+
+  def run_restore_ops(self, session=None):
+    """For consistency with `CheckpointLoadStatus`.
+
+    Use `initialize_or_restore` for initializing if no checkpoint was passed
+    to `Saver.restore` and restoring otherwise.
+
+    Args:
+      session: Not used.
+    """
+    raise AssertionError(
+        "No checkpoint specified, so no restore ops are available "
+        "(save_path=None to Saver.restore).")
+
+  def initialize_or_restore(self, session=None):
+    """Runs initialization ops for variables.
+
+    Objects which would be saved by `Saver.save` will be initialized, unless
+    those variables are being restored by a later call to
+    `tf.train.Checkpoint.restore()`.
+
+    This method does nothing when executing eagerly (initializers get run
+    eagerly).
+
+    Args:
+      session: The session to run initialization ops in. If `None`, uses the
+        default session.
+    """
+    if context.executing_eagerly():
+      return  # run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    checkpointable_objects = list_objects(self._root_checkpointable)
+    initializers = [
+        c.initializer for c in checkpointable_objects
+        if hasattr(c, "initializer") and c.initializer is not None
+        and (getattr(c, "_update_uid", self._restore_uid - 1)
+             < self._restore_uid)]
+    session.run(initializers)
+
+
+_DEPRECATED_RESTORE_INSTRUCTIONS = (
+    "Restoring a name-based tf.train.Saver checkpoint using the object-based "
+    "restore API. This mode uses global names to match variables, and so is "
+    "somewhat fragile. It also adds new restore ops to the graph each time it "
+    "is called. Prefer re-encoding training checkpoints in the object-based "
+    "format: run save() on the object-based saver (the same one this message "
+    "is coming from) and use that checkpoint in the future.")
+
+
+class NameBasedSaverStatus(_LoadStatus):
+  """Status for loading a name-based training checkpoint."""
+
+  def __init__(self, object_saver, save_path):
+    self._object_saver = object_saver
+    self._save_path = save_path
+
+  def assert_consumed(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "Restoring a name-based checkpoint. No load status is available.")
+
+  @deprecation.deprecated(
+      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
+  def run_restore_ops(self, session=None):
+    """Load the name-based training checkpoint using a new `tf.train.Saver`."""
+    if session is None and not context.executing_eagerly():
+      session = ops.get_default_session()
+    with ops.device("/cpu:0"):
+      saver_lib.Saver(self._object_saver._global_variable_names()).restore(  # pylint: disable=protected-access
+          sess=session, save_path=self._save_path)
+
+  def initialize_or_restore(self, session=None):
+    """Alias for `run_restore_ops`."""
+    self.run_restore_ops(session=session)
+
+
+class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
+  """Pretends to be a session, inserts extra feeds on run()."""
+
+  def __init__(self, session, feed_additions):
+    self._wrapped_session = session
+    self._feed_additions = feed_additions
+
+  def run(self, fetches, feed_dict=None, **kwargs):
+    if feed_dict is None:
+      feed_dict = {}
+    else:
+      feed_dict = feed_dict.copy()
+    feed_dict.update(self._feed_additions)
+    return self._wrapped_session.run(
+        fetches=fetches, feed_dict=feed_dict, **kwargs)
+
+
+def _copy_saver_with_new_var_list(old_saver, new_var_list):
+  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
+  new_saver = saver_lib.Saver(var_list=new_var_list)
+  # TODO(allenl): Move to copying functionality to Saver?
+  # pylint: disable=protected-access
+  new_saver._last_checkpoints = old_saver._last_checkpoints
+  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
+  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
+  # pylint: enable=protected-access
+  return new_saver
+
+
+class CheckpointableSaver(object):
+  """Saves and restores a `Checkpointable` object and its dependencies.
+
+  See `Checkpointable` for details of dependency management. `Saver` wraps
+  `tf.train.Saver` for saving, including extra information about the graph of
+  dependencies between Python objects. When restoring, it uses this information
+  about the save-time dependency graph to more robustly match objects with their
+  checkpointed values. When executing eagerly, it supports restoring variables
+  on object creation (see `Saver.restore`).
+
+  Values in a checkpoint are mapped to `Checkpointable` Python objects
+  (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
+  checkpoint was written. To avoid breaking existing checkpoints when modifying
+  a class, dependency names (the names of attributes to which `Checkpointable`
+  objects are assigned) may not change. These names are local to objects, in
+  contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
+  so allow additional program transformations.
+  """
+
+  def __init__(self, root_checkpointable):
+    """Configure saving.
+
+    Args:
+      root_checkpointable: The root of the object graph to save/restore. This
+        object and all of its dependencies are saved in the checkpoint. When
+        restoring, objects are matched and restored starting from this root.
+    """
+    # Allow passing in a weak reference to avoid reference cycles when
+    # `Checkpointable` objects save themselves.
+    self._root_checkpointable_ref = root_checkpointable
+    # The file prefix placeholder is created lazily when graph building (and not
+    # at all when executing eagerly) to avoid creating ops in the constructor
+    # (when they may never be necessary).
+    self._file_prefix_placeholder = None
+
+    # Op caching for save
+    self._object_graph_feed_tensor = None
+    self._last_save_object_graph = None
+    self._last_save_saver = None
+
+    # Op caching for restore
+    self._last_restore_object_graph = None
+    self._last_restore_checkpoint = None
+
+  @property
+  def _root_checkpointable(self):
+    if isinstance(self._root_checkpointable_ref, weakref.ref):
+      derefed = self._root_checkpointable_ref()
+      assert derefed is not None
+      return derefed
+    else:
+      return self._root_checkpointable_ref
+
+  def save(self, file_prefix, checkpoint_number=None, session=None):
+    """Save a training checkpoint.
+
+    The saved checkpoint includes variables created by this object and any
+    Checkpointable objects it depends on at the time `Saver.save()` is called.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `checkpoint_number`, if provided.
+      checkpoint_number: An integer variable or Tensor, used to number
+        checkpoints. Typically this value is saved along with other variables in
+        training checkpoints, which will happen automatically if it was created
+        by `root_checkpointable` or one of its dependencies (via
+        `Checkpointable._add_variable`).
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint.
+    """
+    named_variables, graph_proto = _serialize_object_graph(
+        self._root_checkpointable)
+    if not context.executing_eagerly():
+      if session is None:
+        session = ops.get_default_session()
+      if self._object_graph_feed_tensor is None:
+        with ops.device("/cpu:0"):
+          self._object_graph_feed_tensor = constant_op.constant(
+              "", dtype=dtypes.string)
+      object_graph_tensor = self._object_graph_feed_tensor
+      feed_additions = {object_graph_tensor: graph_proto.SerializeToString()}
+    else:
+      session = None
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      feed_additions = None
+    assert checkpointable_lib.OBJECT_GRAPH_PROTO_KEY not in named_variables
+    named_variables[checkpointable_lib.OBJECT_GRAPH_PROTO_KEY] = (
+        _NoRestoreSaveable(
+            tensor=object_graph_tensor,
+            name=checkpointable_lib.OBJECT_GRAPH_PROTO_KEY))
+    if (self._last_save_object_graph != graph_proto
+        # When executing eagerly, we need to re-create SaveableObjects each time
+        # save() is called so they pick up new Tensors passed to their
+        # constructors. That means the Saver needs to be copied with a new
+        # var_list.
+        or context.executing_eagerly()):
+      if self._last_save_object_graph is not None:
+        self._last_save_saver = _copy_saver_with_new_var_list(
+            old_saver=self._last_save_saver, new_var_list=named_variables)
+      else:
+        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
+      self._last_save_object_graph = graph_proto
+    with ops.device("/cpu:0"):
+      save_path = self._last_save_saver.save(
+          sess=_SessionWithFeedDictAdditions(
+              session=session, feed_additions=feed_additions),
+          save_path=file_prefix,
+          write_meta_graph=False,
+          global_step=checkpoint_number)
+    return save_path
+
+  def _global_variable_names(self):
+    """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s."""
+    named_saveables, graph_proto = _serialize_object_graph(
+        self._root_checkpointable)
+    saver_names = {}
+    for object_proto in graph_proto.nodes:
+      for attribute_proto in object_proto.attributes:
+        saver_names[attribute_proto.full_name] = named_saveables[
+            attribute_proto.checkpoint_key]
+    return saver_names
+
+  def restore(self, save_path):
+    """Restore a training checkpoint.
+
+    Restores `root_checkpointable` and any objects that it tracks
+    (transitive). Either assigns values immediately if variables to restore have
+    been created already, or defers restoration until the variables are
+    created. Dependencies added to the `root_checkpointable` passed to the
+    constructor after this call will be matched if they have a corresponding
+    object in the checkpoint.
+
+    When building a graph, restorations are added to the graph but not run.
+
+    To disallow deferred loading, assert immediately that all checkpointed
+    variables have been matched to variable objects:
+
+    ```python
+    saver = Saver(root)
+    saver.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised unless every object was matched and its
+    variables already exist.
+
+    When graph building, `assert_consumed()` indicates that all of the restore
+    ops which will be created for this checkpoint have been created. They can be
+    run via the `run_restore_ops()` function of the status object:
+
+    ```python
+    saver.restore(path).assert_consumed().run_restore_ops()
+    ```
+
+    If the checkpoint has not been consumed completely, then the list of restore
+    ops will grow as more objects are added to the dependency graph.
+
+    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    method. There is no deferred loading, and names are used to match
+    variables. No restore ops are created/run until `run_restore_ops()` or
+    `initialize_or_restore()` are called on the returned status object, even
+    when executing eagerly. Re-encode name-based checkpoints using this
+    object-based `Saver.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of checkpoint restoration and run initialization/restore ops
+      (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if
+      `save_path` is `None`).
+
+      If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus`
+      object is returned which runs restore ops from a name-based saver.
+    """
+    if save_path is None:
+      return InitializationOnlyStatus(self._root_checkpointable, ops.uid())
+    in_graph_mode = not context.executing_eagerly()
+    if in_graph_mode:
+      if self._file_prefix_placeholder is None:
+        with ops.device("/cpu:0"):
+          self._file_prefix_placeholder = constant_op.constant("model")
+      file_prefix_tensor = self._file_prefix_placeholder
+      file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
+    else:
+      with ops.device("/cpu:0"):
+        file_prefix_tensor = constant_op.constant(save_path)
+      file_prefix_feed_dict = None
+    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+    try:
+      object_graph_string = reader.get_tensor(
+          checkpointable_lib.OBJECT_GRAPH_PROTO_KEY)
+    except errors_impl.NotFoundError:
+      # The object graph proto does not exist in this checkpoint. Try again with
+      # name-based saving.
+      return NameBasedSaverStatus(self, save_path)
+
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+    object_graph_proto.ParseFromString(object_graph_string)
+    if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
+      checkpoint = self._last_restore_checkpoint
+    else:
+      if in_graph_mode:
+        dtype_map = None
+      else:
+        dtype_map = reader.get_variable_to_dtype_map()
+      checkpoint = _CheckpointRestoreCoordinator(
+          object_graph_proto=object_graph_proto,
+          save_path=file_prefix_tensor,
+          dtype_map=dtype_map)
+      if in_graph_mode:
+        if self._last_restore_object_graph is not None:
+          raise NotImplementedError(
+              "Using a single Saver to restore different object graphs is not "
+              "currently supported when graph building. Use a different Saver "
+              "for each object graph (restore ops will be duplicated), or "
+              "file a feature request if this limitation bothers you.")
+        self._last_restore_checkpoint = checkpoint
+        self._last_restore_object_graph = object_graph_proto
+    checkpointable_lib._CheckpointPosition(  # pylint: disable=protected-access
+        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
+    load_status = CheckpointLoadStatus(
+        checkpoint,
+        root_checkpointable=self._root_checkpointable,
+        feed_dict=file_prefix_feed_dict)
+    return load_status
+
+
+@tf_export("train.Checkpoint")
+class Checkpoint(checkpointable_lib.Checkpointable):
+  """Groups checkpointable objects, saving and restoring them.
+
+  `Checkpoint`'s constructor accepts keyword arguments whose values are types
+  that contain checkpointable state, such as `tf.train.Optimizer`
+  implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
+  `tf.keras.Model` implementations. It saves these values with a checkpoint, and
+  maintains a `save_counter` for numbering checkpoints.
+
+  Example usage when graph building:
+
+  ```python
+  import tensorflow as tf
+  import os
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  train_op = optimizer.minimize( ... )
+  status.assert_consumed()  # Optional sanity checks.
+  with tf.Session() as session:
+    # Use the Session to restore variables, or initialize them if
+    # tf.train.latest_checkpoint returned None.
+    status.initialize_or_restore(session)
+    for _ in range(num_training_steps):
+      session.run(train_op)
+    checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  Example usage with eager execution enabled:
+
+  ```python
+  import tensorflow as tf
+  import os
+
+  tf.enable_eager_execution()
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  for _ in range(num_training_steps):
+    optimizer.minimize( ... )  # Variables will be restored on creation.
+  status.assert_consumed()  # Optional sanity checks.
+  checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  `Checkpoint.save` and `Checkpoint.restore` write and read object-based
+  checkpoints, in contrast to `tf.train.Saver` which writes and reads
+  `variable.name` based checkpoints. Object-based checkpointing saves a graph of
+  dependencies between Python objects (`Layer`s, `Optimizer`s, `Variable`s,
+  etc.) with named edges, and this graph is used to match variables when
+  restoring a checkpoint. It can be more robust to changes in the Python
+  program, and helps to support restore-on-create for variables when executing
+  eagerly. Prefer `tf.train.Checkpoint` over `tf.train.Saver` for new code.
+
+  `Checkpoint` objects have dependencies on the objects passed as keyword
+  arguments to their constructors, and each dependency is given a name that is
+  identical to the name of the keyword argument for which it was created.
+  TensorFlow classes like `Layer`s and `Optimizer`s will automatically add
+  dependencies on their variables (e.g. "kernel" and "bias" for
+  `tf.keras.layers.Dense`). Inheriting from `tf.keras.Model` makes managing
+  dependencies easy in user-defined classes, since `Model` hooks into attribute
+  assignment. For example:
+
+  ```python
+  class Regress(tf.keras.Model):
+
+    def __init__(self):
+      super(Regress, self).__init__()
+      self.input_transform = tf.keras.layers.Dense(10)
+      # ...
+
+    def call(self, inputs):
+      x = self.input_transform(inputs)
+      # ...
+  ```
+
+  This `Model` has a dependency named "input_transform" on its `Dense` layer,
+  which in turn depends on its variables. As a result, saving an instance of
+  `Regress` using `tf.train.Checkpoint` will also save all the variables created
+  by the `Dense` layer.
+
+  Attributes:
+    save_counter: Incremented when `save()` is called. Used to number
+      checkpoints.
+  """
+
+  def __init__(self, **kwargs):
+    """Group objects into a training checkpoint.
+
+    Args:
+      **kwargs: Keyword arguments are set as attributes of this object, and are
+        saved with the checkpoint. Values must be checkpointable objects.
+    Raises:
+      ValueError: If objects in `kwargs` are not checkpointable.
+    """
+    super(Checkpoint, self).__init__()
+    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
+      if not isinstance(v, checkpointable_lib.CheckpointableBase):
+        raise ValueError(
+            ("`Checkpoint` was expecting a checkpointable object (an object "
+             "derived from `CheckpointableBase`), got %s. If you believe this "
+             "object should be checkpointable (i.e. it is part of the "
+             "TensorFlow Python API and manages state), please open an issue.")
+            % (v,))
+      setattr(self, k, v)
+    self._save_counter = None  # Created lazily for restore-on-create.
+    self._saver = CheckpointableSaver(weakref.ref(self))
+
+  def _maybe_create_save_counter(self):
+    """Create a save counter if it does not yet exist."""
+    if self._save_counter is None:
+      # Initialized to 0 and incremented before saving.
+      with ops.device("/cpu:0"):
+        self._save_counter = add_variable(
+            self, name="save_counter", initializer=0, dtype=dtypes.int64)
+
+  @property
+  def save_counter(self):
+    """An integer variable which starts at zero and is incremented on save.
+
+    Used to number checkpoints.
+
+    Returns:
+      The save counter variable.
+    """
+    self._maybe_create_save_counter()
+    return self._save_counter
+
+  def save(self, file_prefix, session=None):
+    """Save a training checkpoint.
+
+    The saved checkpoint includes variables created by this object and any
+    checkpointable objects it depends on at the time `Checkpoint.save()` is
+    called.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `Checkpoint.save_counter`.
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint.
+    """
+    in_graph_mode = not context.executing_eagerly()
+    if in_graph_mode:
+      if session is None:
+        session = ops.get_default_session()
+      if self._save_counter is None:
+        # When graph building, if this is a new save counter variable then it
+        # needs to be initialized before assign_add. This is only an issue if
+        # restore() has not been called first.
+        session.run(self.save_counter.initializer)
+    with ops.colocate_with(self.save_counter):
+      assign_op = self.save_counter.assign_add(1)
+    if in_graph_mode:
+      session.run(assign_op)
+    return self._saver.save(
+        file_prefix=file_prefix,
+        checkpoint_number=self.save_counter,
+        session=session)
+
+  def restore(self, save_path):
+    """Restore a training checkpoint.
+
+    Restores this `Checkpoint` and any objects it depends on.
+
+    When executing eagerly, either assigns values immediately if variables to
+    restore have been created already, or defers restoration until the variables
+    are created. Dependencies added after this call will be matched if they have
+    a corresponding object in the checkpoint (the restore request will queue in
+    any checkpointable object waiting for the expected dependency to be added).
+
+    When graph building, restoration ops are added to the graph but not run
+    immediately.
+
+    To ensure that loading is complete and no more assignments will take place,
+    use the `assert_consumed()` method of the status object returned by
+    `restore`:
+
+    ```python
+    checkpoint = tf.train.Checkpoint( ... )
+    checkpoint.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised if any Python objects in the dependency graph
+    were not found in the checkpoint, or if any checkpointed values do not have
+    a matching Python object.
+
+    When graph building, `assert_consumed()` indicates that all of the restore
+    ops that will be created for this checkpoint have been created. They can be
+    run via the `run_restore_ops()` method of the status object:
+
+    ```python
+    checkpoint.restore(path).assert_consumed().run_restore_ops()
+    ```
+
+    If the checkpoint has not been consumed completely, then the list of restore
+    ops will grow as more objects are added to the dependency graph.
+
+    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    method. There is no deferred loading, and names are used to match
+    variables. No restore ops are created/run until `run_restore_ops()` or
+    `initialize_or_restore()` are called on the returned status object, even
+    when executing eagerly. Re-encode name-based checkpoints using
+    `tf.train.Checkpoint.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of a checkpoint restoration and run initialization/restore ops.
+
+      The returned status object has the following methods:
+      - `assert_consumed()`:
+          Raises an exception if any variables/objects are unmatched: either
+          checkpointed values which don't have a matching Python object or
+          Python objects in the dependency graph with no values in the
+          checkpoint. This method returns the status object, and so may be
+          chained with `initialize_or_restore` or `run_restore_ops`.
+      - `initialize_or_restore(session=None)`:
+          When graph building, runs variable initializers if `save_path` is
+          `None`, but otherwise runs restore operations. If no `session` is
+          explicitly specified, the default session is used. No effect for
+          object-based checkpoints when executing eagerly (variables are
+          initialized or restored eagerly).
+      - `run_restore_ops(session=None)`:
+          When graph building, runs restore operations. If no `session` is
+          explicitly specified, the default session is used. No effect for
+          object-based checkpoints when executing eagerly (restore operations
+          are run eagerly). May only be called when `save_path` is not `None`.
+    """
+    status = self._saver.restore(save_path=save_path)
+    # Create the save counter now so it gets initialized with other variables
+    # when graph building. Creating it earlier would lead to double
+    # initialization when executing eagerly.
+    self._maybe_create_save_counter()
+    return status
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..40dfeb28d50a2bce97e2d4b551be98ed70f83491
--- /dev/null
+++ b/tensorflow/python/training/checkpointable_utils_test.py
@@ -0,0 +1,1383 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import six
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import sequential
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import training_util
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class InterfaceTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testAddVariable(self):
+    obj = NonLayerCheckpointable()
+    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
+      checkpointable_utils.add_variable(
+          obj, name="shape_specified_twice", shape=[], initializer=1)
+    constant_initializer = checkpointable_utils.add_variable(
+        obj, name="constant_initializer", initializer=1)
+    with variable_scope.variable_scope("some_variable_scope"):
+      ones_initializer = checkpointable_utils.add_variable(
+          obj,
+          name="ones_initializer",
+          shape=[2],
+          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
+    bare_initializer = checkpointable_utils.add_variable(
+        obj,
+        name="bare_initializer",
+        shape=[2, 2],
+        dtype=dtypes.float64,
+        initializer=init_ops.zeros_initializer)
+
+    # Even in graph mode, there are no naming conflicts between objects, only
+    # naming conflicts within an object.
+    other_duplicate = resource_variable_ops.ResourceVariable(
+        name="duplicate", initial_value=1.)
+    duplicate = checkpointable_utils.add_variable(
+        obj, name="duplicate", shape=[])
+    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
+      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
+
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    self.assertEqual("constant_initializer:0", constant_initializer.name)
+    self.assertEqual(1, self.evaluate(constant_initializer))
+    self.assertEqual("some_variable_scope/ones_initializer:0",
+                     ones_initializer.name)
+    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
+    self.assertAllEqual([[0., 0.],
+                         [0., 0.]], self.evaluate(bare_initializer))
+    self.assertEqual("a_variable:0", obj.a_variable.name)
+    self.assertEqual("duplicate:0", other_duplicate.name)
+    if context.executing_eagerly():
+      # When executing eagerly, there's no uniquification of variable names. The
+      # checkpoint name will be the same.
+      self.assertEqual("duplicate:0", duplicate.name)
+    else:
+      # The .name attribute may be globally influenced, but the checkpoint name
+      # won't be (tested below).
+      self.assertEqual("duplicate_1:0", duplicate.name)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
+    expected_checkpoint_names = (
+        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
+        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
+        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+    )
+    six.assertCountEqual(
+        self, expected_checkpoint_names, named_variables.keys())
+
+  def testInitNotCalled(self):
+
+    class NoInit(checkpointable.Checkpointable):
+
+      def __init__(self):
+        pass
+
+    # __init__ for Checkpointable will be called implicitly.
+    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
+
+  def testShapeDtype(self):
+    root = checkpointable.Checkpointable()
+    v1 = checkpointable_utils.add_variable(
+        root, name="v1", initializer=3., dtype=dtypes.float64)
+    self.assertEqual(dtypes.float64, v1.dtype)
+    v2 = checkpointable_utils.add_variable(
+        root,
+        name="v2",
+        shape=[3],
+        initializer=init_ops.ones_initializer,
+        dtype=dtypes.float64)
+    self.assertEqual(dtypes.float64, v2.dtype)
+    self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
+
+
+class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    tensor = self._primary_variable.read_value()
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name)
+    super(_MirroringSaveable, self).__init__(
+        tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph = (
+        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step:0",
+        named_variables["optimizer_step" + suffix].name)
+    self.assertEqual(
+        "my_model/dense_1/kernel:0",
+        named_variables["model/_second/kernel" + suffix].name)
+    self.assertEqual(
+        "my_model/dense/kernel:0",
+        named_variables["model/_named_dense/kernel" + suffix].name)
+    self.assertEqual(
+        "beta1_power:0",
+        named_variables["optimizer/beta1_power" + suffix].name)
+    self.assertEqual(
+        "beta2_power:0",
+        named_variables["optimizer/beta2_power" + suffix].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=named_variables["model/_named_dense/kernel" + suffix],
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturned(self):
+    v = _OwnsMirroredVariables()
+    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(43.))
+    self.evaluate(v.mirrored.assign(44.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(42., self.evaluate(v.non_dep_variable))
+    self.assertEqual(42., self.evaluate(v.mirrored))
+    self.evaluate(v.non_dep_variable.assign(44.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(45.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(44., self.evaluate(v.non_dep_variable))
+    self.assertEqual(44., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturnedWithGlobalName(self):
+    # The same object can also be saved using the name-based saver.
+    v = _OwnsMirroredVariables()
+    saver = saver_lib.Saver(var_list=[v])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    with self.test_session() as sess:
+      save_path = saver.save(sess, prefix)
+      self.evaluate(v.non_dep_variable.assign(43.))
+      self.evaluate(v.mirrored.assign(44.))
+      saver.restore(sess, save_path)
+      self.assertEqual(42., self.evaluate(v.non_dep_variable))
+      self.assertEqual(42., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(saver_lib.latest_checkpoint(checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableNameEscaping(self):
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    self.assertEqual(r"a.Sb.Sc" + suffix, self._get_checkpoint_name(r"a/b/c"))
+    self.assertEqual(r"b" + suffix, self._get_checkpoint_name(r"b"))
+    self.assertEqual(r"c.S" + suffix, self._get_checkpoint_name(r"c/"))
+    self.assertEqual(r"d.S..S" + suffix, self._get_checkpoint_name(r"d/.S"))
+    self.assertEqual(r"d.S..ATTRIBUTES.Sf" + suffix,
+                     self._get_checkpoint_name(r"d/.ATTRIBUTES/f"))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNumberedPath(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    root.leaf = leaf
+    checkpointable_utils.add_variable(leaf, name="v", shape=[])
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    variable_name, = named_variables.keys()
+    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLocalNameValidation(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    # Dots are escaped, which avoids conflicts with reserved names.
+    root._track_checkpointable(leaf, name=".ATTRIBUTES")
+    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    name, = named_variables.keys()
+    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLateDependencyTracking(self):
+
+    class Dependency(checkpointable.Checkpointable):
+
+      def build(self):
+        self.var = checkpointable_utils.add_variable(
+            self, "var", initializer=0.)
+
+    class LateDependencies(checkpointable.Checkpointable):
+
+      def add_dep(self):
+        self.dep = Dependency()
+        self.dep.build()
+
+    original = LateDependencies()
+    original.add_dep()
+    self.evaluate(state_ops.assign(original.dep.var, 123.))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpointable_utils.CheckpointableSaver(
+        original).save(checkpoint_prefix)
+    load_into = LateDependencies()
+    status = checkpointable_utils.CheckpointableSaver(
+        load_into).restore(save_path)
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    load_into.add_dep()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(123., self.evaluate(load_into.dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDepAfterVar(self):
+
+    class Dependency(checkpointable.Checkpointable):
+
+      def build(self):
+        self.var = checkpointable_utils.add_variable(
+            self, "var", initializer=0.)
+
+    class DepAfterVar(checkpointable.Checkpointable):
+
+      def add_dep(self):
+        dep = Dependency()
+        dep.build()
+        self.dep = dep
+
+    dep_after_var = DepAfterVar()
+    dep_after_var.add_dep()
+    self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
+        checkpoint_prefix)
+
+    loaded_dep_after_var = DepAfterVar()
+    status = checkpointable_utils.CheckpointableSaver(
+        loaded_dep_after_var).restore(save_path)
+    loaded_dep_after_var.add_dep()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = checkpointable.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = checkpointable.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOverlappingRestores(self):
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep = checkpointable.Checkpointable()
+    save_root.dep.var = checkpointable_utils.add_variable(
+        save_root.dep, name="var", initializer=0.)
+    self.evaluate(state_ops.assign(save_root.dep.var, 12.))
+    saver = checkpointable_utils.CheckpointableSaver(save_root)
+    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
+    self.evaluate(state_ops.assign(save_root.dep.var, 13.))
+    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
+
+    first_root = checkpointable.Checkpointable()
+    second_root = checkpointable.Checkpointable()
+    first_status = checkpointable_utils.CheckpointableSaver(
+        first_root).restore(first_path)
+    second_status = checkpointable_utils.CheckpointableSaver(
+        second_root).restore(second_path)
+    load_dep = checkpointable.Checkpointable()
+    load_dep.var = checkpointable_utils.add_variable(
+        load_dep, name="var", shape=[])
+    first_root.dep = load_dep
+    first_status.assert_consumed()
+    first_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+    second_root.dep = load_dep
+    second_status.assert_consumed()
+    second_status.run_restore_ops()
+    self.assertEqual(13., self.evaluate(load_dep.var))
+
+    # Try again with the order of the restore() reversed. The last restore
+    # determines the final value.
+    first_root = checkpointable.Checkpointable()
+    second_root = checkpointable.Checkpointable()
+    second_status = checkpointable_utils.CheckpointableSaver(
+        second_root).restore(second_path)
+    first_status = checkpointable_utils.CheckpointableSaver(
+        first_root).restore(first_path)
+    load_dep = checkpointable.Checkpointable()
+    load_dep.var = checkpointable_utils.add_variable(
+        load_dep, name="var", shape=[])
+    first_root.dep = load_dep
+    first_status.assert_consumed()
+    first_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+    second_root.dep = load_dep
+    second_status.assert_consumed()
+    second_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAmbiguousLoad(self):
+    # Not OK to split one checkpoint object into two
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep_one = checkpointable.Checkpointable()
+    save_root.dep_two = checkpointable.Checkpointable()
+    dep_three = checkpointable.Checkpointable()
+    save_root.dep_one.dep_three = dep_three
+    save_root.dep_two.dep_three = dep_three
+    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(save_root))
+    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+    load_root = checkpointable.Checkpointable()
+    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+        save_path)
+    load_root.dep_one = checkpointable.Checkpointable()
+    load_root.dep_two = checkpointable.Checkpointable()
+    load_root.dep_one.dep_three = checkpointable.Checkpointable()
+    load_root.dep_two.dep_three = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        load_root.dep_one.dep_three, name="var", initializer=0.)
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testObjectsCombined(self):
+    # Currently fine to load two checkpoint objects into one Python object
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep_one = checkpointable.Checkpointable()
+    save_root.dep_two = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
+    checkpointable_utils.add_variable(
+        save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
+    self.evaluate(checkpointable_utils.gather_initializers(save_root))
+    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+    load_root = checkpointable.Checkpointable()
+    load_root.dep_one = checkpointable.Checkpointable()
+    load_root.dep_two = load_root.dep_one
+    v1 = checkpointable_utils.add_variable(
+        load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
+    v2 = checkpointable_utils.add_variable(
+        load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
+    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+        save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(32., self.evaluate(v1))
+    self.assertEqual(64., self.evaluate(v2))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDependencyLoop(self):
+    # Note: this test creates garbage during eager execution because it
+    # purposefully creates a reference cycle.
+    first = checkpointable.Checkpointable()
+    second = checkpointable.Checkpointable()
+    first.second = second
+    second.first = first
+    first.v = checkpointable_utils.add_variable(
+        first, "v1", initializer=[3., 1., 4.])
+    second.v = checkpointable_utils.add_variable(
+        second, "v2", initializer=[1., 1., 2., 3.])
+    self.evaluate(checkpointable_utils.gather_initializers(first))
+    checkpoint_directory = self.get_temp_dir()
+    save_path = checkpointable_utils.CheckpointableSaver(first).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+
+    # Test deferred loading
+    first_load = checkpointable.Checkpointable()
+    status = checkpointable_utils.CheckpointableSaver(
+        first_load).restore(save_path)
+    second_load = checkpointable.Checkpointable()
+    first_load.second = second_load
+    second_load.first = first_load
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    first_load.v = checkpointable_utils.add_variable(
+        first_load, "v1", shape=[3])
+    second_load.v = checkpointable_utils.add_variable(
+        second_load, "v2", shape=[4])
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
+    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
+
+    # Test loading when variables have already been created
+    self.evaluate(first_load.v.assign([2., 7., 1.]))
+    self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
+    self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
+    self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
+    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
+        save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
+    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRestoreOnAssign(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(save_graph):
+      first = checkpointable.Checkpointable()
+      first.var1 = variable_scope.get_variable(
+          name="outside_var", initializer=0.)
+      first.var2 = variable_scope.get_variable(
+          name="blah", initializer=0.)
+      self.evaluate(first.var1.assign(4.))
+      self.evaluate(first.var2.assign(8.))
+      save_path = checkpointable_utils.CheckpointableSaver(first).save(
+          checkpoint_prefix)
+    restore_graph = ops.Graph()
+    with restore_graph.as_default(), self.test_session(restore_graph):
+      second = checkpointable.Checkpointable()
+      second.var2 = variable_scope.get_variable(
+          name="blah", initializer=0.)
+      status = checkpointable_utils.CheckpointableSaver(
+          second).restore(save_path)
+      recreated_var1 = variable_scope.get_variable(
+          name="outside_var", initializer=0.)
+      status.run_restore_ops()
+      self.assertEqual(8., self.evaluate(second.var2))
+      self.evaluate(recreated_var1.assign(-2.))
+      self.assertEqual(-2., self.evaluate(recreated_var1))
+      second.var1 = recreated_var1
+      status.run_restore_ops()
+      self.assertEqual(4., self.evaluate(recreated_var1))
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCheckpointCleanup(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    saver = checkpointable_utils.Checkpoint(obj=obj)
+    for _ in range(10):
+      saver.save(checkpoint_prefix)
+    expected_filenames = ["checkpoint"]
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCheckpointCleanupChangingVarList(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    looped_variables = []
+    for iteration in range(10):
+      new_variable = resource_variable_ops.ResourceVariable(iteration)
+      self.evaluate(new_variable.initializer)
+      setattr(checkpoint, "var_%d" % iteration, new_variable)
+      checkpoint.save(checkpoint_prefix)
+      looped_variables.append(new_variable)
+    expected_filenames = ["checkpoint"]
+    # We've copied the saver each time, but checkpoint management should still
+    # be consistent.
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+    for v in looped_variables:
+      self.evaluate(v.assign(314))
+    checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
+    self.assertEqual(314, self.evaluate(checkpoint.var_9))
+    self.assertEqual(314, self.evaluate(checkpoint.var_8))
+    self.assertEqual(314, self.evaluate(checkpoint.var_6))
+    self.assertEqual(5, self.evaluate(checkpoint.var_5))
+    self.assertEqual(1, self.evaluate(checkpoint.var_1))
+    self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    if context.executing_eagerly():
+      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+      self.assertEqual(9, self.evaluate(checkpoint.var_9))
+      self.assertEqual(8, self.evaluate(checkpoint.var_8))
+      self.assertEqual(1, self.evaluate(checkpoint.var_1))
+      self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    else:
+      # Restoring into modified graphs is an error while graph building.
+      with self.assertRaises(NotImplementedError):
+        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sequential(self):
+    model = sequential.Sequential()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    model.add(core.Dense(4))
+    second_dense = core.Dense(5)
+    model.add(second_dense)
+    model(constant_op.constant([[1.]]))
+    checkpoint.restore(None).initialize_or_restore()
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([1., 2., 3., 4., 5.])))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([5., 6., 7., 8., 9.])))
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
+
+    deferred_sequential = sequential.Sequential()
+    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+        model=deferred_sequential)
+    status = deferred_sequential_checkpoint.restore(save_path)
+    deferred_sequential.add(core.Dense(4))
+    deferred_sequential(constant_op.constant([[1.]]))
+    deferred_second_dense = core.Dense(5)
+    deferred_sequential.add(deferred_second_dense)
+    deferred_sequential(constant_op.constant([[1.]]))
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.],
+                        self.evaluate(deferred_second_dense.bias))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_initialize_if_not_restoring(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
+    with ops.Graph().as_default(), self.test_session(
+        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          model=model,  # Do not save the optimizer with the checkpoint.
+          global_step=training_util.get_or_create_global_step())
+      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+          optimizer=optimizer)
+
+      checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+      status = root.restore(save_path=checkpoint_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      self.evaluate([v.initializer for v in optimizer.variables()])
+      train_fn()
+      model_save_path = root.save(file_prefix=checkpoint_prefix)
+      self.evaluate(optimizer.variables()[0].assign(42.))
+      optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
+
+    # Restore into a graph with the optimizer
+    with ops.Graph().as_default(), self.test_session(
+        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      status = root.restore(save_path=model_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      train_fn()
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+
+    # Make sure initialization doesn't clobber later restores
+    with ops.Graph().as_default(), self.test_session(
+        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      opt_root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer)
+      status = root.restore(save_path=model_save_path)
+      init_only_optimizer_status = opt_root.restore(save_path=None)
+      optimizer_status = opt_root.restore(save_path=optimizer_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      optimizer_status.run_restore_ops()
+      status.initialize_or_restore()
+      init_only_optimizer_status.initialize_or_restore()
+      train_fn()
+      self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
+
+
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer())
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer())
+      return v, v + 1., v2
+
+    save_template = template.make_template("s1", _templated)
+    save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+    v1_save, _, v2_save = save_template()
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2 = load_template()
+    self.assertEqual(2, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore_nested(self):
+
+    def _inner_template():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer())
+      return v
+
+    def _outer_template():
+      first_inner = template.make_template("i1", _inner_template)
+      second_inner = template.make_template("i2", _inner_template)
+      v1 = first_inner()
+      v2 = second_inner()
+      v3 = second_inner()
+      return (first_inner, second_inner), (v1, v2, v3)
+
+    with variable_scope.variable_scope("ignored"):
+      save_template = template.make_template("s1", _outer_template)
+      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+      (inner_template_one, inner_template_two), _ = save_template()
+    self.evaluate(inner_template_one.variables[0].assign([20.]))
+    self.evaluate(inner_template_two.variables[0].assign([25.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _outer_template)
+    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    status = load_root.restore(save_path)
+    (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
+    outer_template_dependencies = load_root.my_template._checkpoint_dependencies
+    self.assertEqual(2, len(outer_template_dependencies))
+    self.assertEqual("i1", outer_template_dependencies[0].name)
+    self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
+    self.assertEqual("i2", outer_template_dependencies[1].name)
+    self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
+    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
+    self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
+    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
+    self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([20.], self.evaluate(v1))
+    self.assertAllEqual([25.], self.evaluate(v2))
+    self.assertAllEqual([25.], self.evaluate(v3))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = saver_lib.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        save_path = root.save(session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      save_path = root.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 689088bb41edfd94a1d483ed2b5f7447e9e060e7..be80c3657158b52d063b5d2b7731f25d184794a0 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -25,6 +25,14 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util.tf_export import tf_export
 
+# This is a tuple of PS ops used by tf.estimator.Estimator which should work in
+# almost all of cases.
+STANDARD_PS_OPS = ("Variable", "VariableV2", "AutoReloadVariable",
+                   "MutableHashTable", "MutableHashTableV2",
+                   "MutableHashTableOfTensors", "MutableHashTableOfTensorsV2",
+                   "MutableDenseHashTable", "MutableDenseHashTableV2",
+                   "VarHandleOp", "BoostedTreesEnsembleResourceHandleOp")
+
 
 class _RoundRobinStrategy(object):
   """Returns the next ps task index for placement in round-robin order.
@@ -170,8 +178,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
       than overriding them.
     cluster: `ClusterDef` proto or `ClusterSpec`.
     ps_ops: List of strings representing `Operation` types that need to be
-      placed on `ps` devices.  If `None`, defaults to
-      `["Variable", "VariableV2", "VarHandleOp"]`.
+      placed on `ps` devices.  If `None`, defaults to `STANDARD_PS_OPS`.
     ps_strategy: A callable invoked for every ps `Operation` (i.e. matched by
       `ps_ops`), that takes the `Operation` and returns the ps task index to
       use.  If `None`, defaults to a round-robin strategy across all `ps`
@@ -201,7 +208,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
   if ps_ops is None:
     # TODO(sherrym): Variables in the LOCAL_VARIABLES collection should not be
     # placed in the parameter server.
-    ps_ops = ["Variable", "VariableV2", "VarHandleOp"]
+    ps_ops = list(STANDARD_PS_OPS)
 
   if not merge_devices:
     logging.warning(
diff --git a/tensorflow/python/training/device_util.py b/tensorflow/python/training/device_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1137e80ab4394333ef0f3b7982d5b55f4704d0d
--- /dev/null
+++ b/tensorflow/python/training/device_util.py
@@ -0,0 +1,68 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Device-related support functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+
+
+def canonicalize(d):
+  d = tf_device.DeviceSpec.from_string(d)
+  assert d.device_type is None or d.device_type == d.device_type.upper(), (
+      "Device type '%s' must be all-caps." % (d.device_type,))
+  # Fill in missing device fields using defaults.
+  result = tf_device.DeviceSpec(
+      job="localhost", replica=0, task=0, device_type="CPU", device_index=0)
+  result.merge_from(d)
+  return result.to_string()
+
+
+class _FakeNodeDef(object):
+  """A fake NodeDef for _FakeOperation."""
+
+  def __init__(self):
+    self.op = ""
+    self.name = ""
+
+
+class _FakeOperation(object):
+  """A fake Operation object to pass to device functions."""
+
+  def __init__(self):
+    self.device = ""
+    self.type = ""
+    self.name = ""
+    self.node_def = _FakeNodeDef()
+
+  def _set_device(self, device):
+    self.device = ops._device_string(device)  # pylint: disable=protected-access
+
+
+def current():
+  """Return a string (not canonicalized) for the current device."""
+  # TODO(josh11b): Work out how this function interacts with ops.colocate_with.
+  ctx = context.context()
+  if ctx.executing_eagerly():
+    d = ctx.device_name
+  else:
+    op = _FakeOperation()
+    ops.get_default_graph()._apply_device_functions(op)  # pylint: disable=protected-access
+    d = op.device
+  return d
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ec5292adb5bb6463b5d46d934750fac7f8fcef
--- /dev/null
+++ b/tensorflow/python/training/distribute.py
@@ -0,0 +1,1260 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class DistributionStrategy, TowerContext, and supporting APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import device_util
+from tensorflow.python.util import nest
+
+
+# ------------------------------------------------------------------------------
+# Internal API for setting the current thread mode as being either in a
+# tower or cross-tower context for a particular distribution strategy.
+
+
+class _ThreadMode(object):
+
+  def __init__(self, dist, cross, tower):
+    self.distribution_strategy = dist
+    self.cross_tower_context = cross
+    self.tower_context = tower
+
+
+class _CrossTowerThreadMode(_ThreadMode):
+
+  def __init__(self, distribution_strategy):
+    _ThreadMode.__init__(
+        self, distribution_strategy, distribution_strategy, None)
+
+
+class _InTowerThreadMode(_ThreadMode):
+
+  def __init__(self, tower_ctx):
+    _ThreadMode.__init__(
+        self, tower_ctx.distribution_strategy, None, tower_ctx)
+
+
+_per_thread_mode = threading.local()
+
+
+def _push_per_thread_mode(context):
+  if not hasattr(_per_thread_mode, "stack"):
+    _per_thread_mode.stack = []
+  _per_thread_mode.stack.append(context)
+
+
+def _pop_per_thread_mode():
+  _per_thread_mode.stack.pop(-1)
+
+
+class _DefaultTowerThreadMode(_ThreadMode):
+  """Type of default value returned by `_get_per_thread_mode()`.
+
+  Used when the thread-local stack is empty.
+  """
+
+  def __init__(self):
+    # _default_distribution_strategy and _default_tower_context are
+    # defined at the bottom of this file.
+    _ThreadMode.__init__(
+        self, _default_distribution_strategy, None, _default_tower_context)
+
+
+def _get_per_thread_mode():
+  try:
+    return _per_thread_mode.stack[-1]
+  except (AttributeError, IndexError):
+    # _default_tower_mode is defined at the bottom of this file.
+    return _default_tower_mode
+
+
+# ------------------------------------------------------------------------------
+# Context tracking whether in a distribution.update() or .update_non_slot()
+# call.
+
+
+_update_device = threading.local()
+
+
+def get_update_device():
+  """Get the current device if in a `DistributionStrategy.update()` call."""
+  try:
+    return _update_device.current
+  except AttributeError:
+    return None
+
+
+class UpdateContext(object):
+  """Context manager when you are in `update()` or `update_non_slot()`."""
+
+  def __init__(self, device):
+    self._device = device
+    self._old_device = None
+
+  def __enter__(self):
+    self._old_device = get_update_device()
+    _update_device.current = self._device
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+    _update_device.current = self._old_device
+
+
+# ------------------------------------------------------------------------------
+# Public API for accessing the current thread mode
+
+
+def get_tower_context():
+  """Returns the current TowerContext or None if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context (this function
+     will return the default TowerContext object);
+  2. switches to cross-tower context (in which case this will return
+     None) when entering a `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context (and again
+     this function will return None).
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context, in a tower context you should use the
+  `TowerContext` API instead.
+
+  Returns:
+    The current `TowerContext` object when in a tower context scope, else None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().tower_context
+
+
+def get_cross_tower_context():
+  """Returns the current DistributionStrategy if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context;
+  2. switches to cross-tower context when entering a
+     `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context.
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context.
+
+  Returns:
+    Returns the current `DistributionStrategy` object in a cross-tower
+    context, or None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().cross_tower_context
+
+
+def get_distribution_strategy():
+  """Returns the current `DistributionStrategy` object.
+
+  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
+  instead when possible.
+
+  Returns:
+    A `DistributionStrategy` object. Inside a
+    `with distribution_strategy.scope()` block, it returns
+    `distribution_strategy`, otherwise it returns the default
+    (single-tower) `DistributionStrategy` object.
+  """
+  return _get_per_thread_mode().distribution_strategy
+
+
+def has_distribution_strategy():
+  """Return if there is a current non-default `DistributionStrategy`.
+
+  Returns:
+    True if inside a `with distribution_strategy.scope():`.
+  """
+  return get_distribution_strategy() is not _default_distribution_strategy
+
+
+# ------------------------------------------------------------------------------
+# Public utility functions.
+
+
+def get_loss_reduction():
+  """Reduce `method_string` corresponding to the last loss reduction."""
+  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
+  if loss_reduction == losses_impl.Reduction.SUM:
+    return "sum"
+  return "mean"
+
+
+# ------------------------------------------------------------------------------
+# Internal API for validating the current thread mode
+
+
+def _require_cross_tower_context(distribution_strategy):
+  """Verify in cross-tower context for `distribution_strategy`."""
+  context = _get_per_thread_mode()
+  if context.cross_tower_context is distribution_strategy: return
+  # We have an error to report, figure out the right message.
+  if context.distribution_strategy is not distribution_strategy:
+    if context.distribution_strategy is _default_distribution_strategy:
+      raise RuntimeError(
+          'Need to be inside "with distribution_strategy.scope()" for %s' %
+          (distribution_strategy,))
+    else:
+      raise RuntimeError(
+          "Mixing different DistributionStrategy objects: %s is not %s" %
+          (context.distribution_strategy, distribution_strategy))
+  assert context.cross_tower_context is None
+  raise RuntimeError("Method requires being in cross-tower context, use "
+                     "get_tower_context().merge_call()")
+
+
+def require_tower_context(tower_ctx):
+  """Verify in `tower_ctx` tower context."""
+  context = _get_per_thread_mode()
+  if context.tower_context is tower_ctx: return
+  # We have an error to report, figure out the right message.
+  if context.tower_context is None:
+    raise RuntimeError("Need to be inside `call_for_each_tower()`")
+  if context.distribution_strategy is tower_ctx.distribution_strategy:
+    # Two different TowerContexts with the same DistributionStrategy.
+    raise RuntimeError("Mismatching tower context.")
+  raise RuntimeError(
+      "Mismatching DistributionStrategy objects: %s is not %s." %
+      (context.distribution_strategy, tower_ctx.distribution_strategy))
+
+
+def _require_distribution_strategy_scope(distribution_strategy):
+  """Verify in a `distribution_strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy is distribution_strategy: return
+  # We have an error to report, figure out the right message.
+  if context.distribution_strategy is _default_distribution_strategy:
+    raise RuntimeError(
+        'Need to be inside "with distribution_strategy.scope()" for %s' %
+        (distribution_strategy,))
+  else:
+    raise RuntimeError(
+        "Mixing different DistributionStrategy objects: %s is not %s" %
+        (context.distribution_strategy, distribution_strategy))
+
+
+# ------------------------------------------------------------------------------
+# Internal context managers used to implement the DistributionStrategy
+# base class
+
+
+class _CurrentDistributionContext(object):
+  """Context manager for setting the `DistributionStrategy` and var creator."""
+
+  def __init__(self, distribution_strategy, var_creator_scope, var_scope=None):
+    self._context = _CrossTowerThreadMode(distribution_strategy)
+    self._var_creator_scope = var_creator_scope
+    self._var_scope = var_scope
+
+  def __enter__(self):
+    _push_per_thread_mode(self._context)
+    if self._var_scope:
+      self._var_scope.__enter__()
+    self._var_creator_scope.__enter__()
+    return self._context.distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
+    if self._var_scope:
+      self._var_scope.__exit__(exception_type, exception_value, traceback)
+    _pop_per_thread_mode()
+
+
+class _SameScopeAgainContext(object):
+  """Trivial context manager when you are already in `scope()`."""
+
+  def __init__(self, distribution_strategy):
+    self._distribution_strategy = distribution_strategy
+
+  def __enter__(self):
+    return self._distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+
+
+# ------------------------------------------------------------------------------
+# Base classes for all distribution strategies.
+
+
+class DistributionStrategy(object):
+  """A list of devices with a state & compute distribution policy.
+
+  The intent is that you can write an algorithm in a stylized way and
+  it will be usable with a variety of different `DistributionStrategy`
+  implementations. Each descendant will implement a different strategy
+  for distributing the algorithm across multiple devices/machines.
+  Furthermore, these changes can be hidden inside the specific layers
+  and other library classes that need special treatment to run in a
+  distributed setting, so that most users' model definition code can
+  run unchanged. The `DistributionStrategy` API works the same way
+  with eager and graph execution.
+
+  First let's introduce a few high-level concepts:
+
+  * _Data parallelism_ is where we run multiple copies of the model
+    on different slices of the input data. This is in contrast to
+    _model parallelism_ where we divide up a single copy of a model
+    across multiple devices.
+    Note: for now we only support data parallelism at this time, but
+    hope to add support for model parallelism in the future.
+  * A _tower_ is one copy of the model, running on one slice of the
+    input data.
+  * _Synchronous_, or more commonly _sync_, training is when the
+    updates from each tower are aggregated together before updating
+    the model variables. This is in contrast to _asynchronous_, or
+    _async_ training where each tower updates the model variables
+    independently.
+  * Furthermore you might run your computation on multiple devices
+    on one machine (or "host"), or on multiple machines/hosts.
+    If you are running on multiple machines, you might have a
+    single master host that drives computation across all of them,
+    or you might have multiple clients driving the computation
+    asynchronously.
+
+  To distribute an algorithm, we might use some of these ingredients:
+
+  * Parameter servers: These are hosts that hold a single copy of
+    parameters/variables. All towers that want to operate on a variable
+    retrieve it at the beginning of a step and send an update to be
+    applied at the end of the step. Can support either sync or async
+    training.
+  * Mirrored variables: These are variables that are copied to multiple
+    devices, where we keep the copies in sync by applying the same
+    updates to every copy. Normally would only be used with sync training.
+  * Reductions and Allreduce: A _reduction_ is some method of
+    aggregating multiple values into one value, like "sum" or
+    "mean". If doing sync training, we will perform a reduction on the
+    gradients to a parameter from each tower before applying the
+    update. Allreduce is an algorithm for performing a reduction on
+    values from multiple devices and making the result available on
+    all of those devices.
+  * In the future we will have support for TensorFlows' partitioned
+    variables, where a single variable is split across multiple
+    devices.
+
+  We have then a few approaches we want to support:
+  * Code written (as if) with no knowledge of class `DistributionStrategy`.
+    This code should work as before, even if some of the layers, etc.
+    used by that code are written to be distribution-aware. This is done
+    by having a default `DistributionStrategy` that gives ordinary behavior,
+    and by default being in a single tower context.
+  * Ordinary model code that you want to run using a specific
+    `DistributionStrategy`. This can be as simple as:
+
+    ```
+    with my_distribution.scope():
+      iterator = my_distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
+      tower_train_ops = my_distribution.call_for_each_tower(
+          tower_fn, iterator.get_next())
+      train_op = tf.group(my_distribution.unwrap(tower_train_ops))
+    ```
+
+    This takes an ordinary `dataset` and `tower_fn` and runs it
+    distributed using a particular `DistributionStrategy` in
+    `my_distribution`. Any variables created in `tower_fn` are created
+    using `my_distribution`'s policy, and library functions called by
+    `tower_fn` can use the `get_tower_context()` API to get enhanced
+    behavior in this case.
+
+    You can also create an initializable iterator instead of one shot iterator.
+    In that case, you will need to ensure that you initialize the iterator
+    before calling get_next.
+    ```
+    iterator = my_distribution.distribute_dataset(
+        dataset).make_initializable_iterator())
+    session.run(iterator.initializer)
+    ```
+
+  * If you want to write a distributed algorithm, you may use any of
+    the `DistributionStrategy` APIs inside a
+    `with my_distribution.scope():` block of code.
+
+  Lower-level concepts:
+
+  * Wrapped values: In order to represent values parallel across devices
+    (either towers or the devices associated with a particular value), we
+    wrap them in a "PerDevice" or "Mirrored" object that contains a map
+    from device to values. "PerDevice" is used when the value may be
+    different across devices, and "Mirrored" when the value are the same.
+  * Unwrapping and merging: Consider calling a function `fn` on
+    multiple devices, like `call_for_each_tower(fn, w)` with an
+    argument `w` that is a wrapped value. This means `w` will have a
+    map taking tower device `d0` to `w0`, tower device `d1` to `w1`,
+    etc. `call_for_each_tower()` unwraps `w` before calling `fn`, so
+    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges
+    the return values from `fn()`, which can possibly result in
+    wrapped values. For example, let's say `fn()` returns a tuple with
+    three components: `(x, a, v0)` from tower 0, `(x, b, v1)` on tower 1,
+    etc. If the first component is the same object `x` from every
+    tower, then the first component of the merged result will also be
+    `x`. If the second component is different (`a`, `b`, ...)  from
+    each tower, then the merged value will have a wrapped map from
+    tower device to the different values. If the third component is
+    the members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to
+    `v1`, etc.), then the merged result will be that mirrored variable
+    (`v`).
+  * Tower context vs. Cross-tower context: _tower context_ is when we
+    are in some function that is being called once for each tower.
+    Otherwise we are in cross-tower context, which is useful for
+    calling `DistributionStrategy` methods which operate across the
+    towers (like `reduce()`). By default you start in a tower context
+    (the default "single tower context") and then some methods can
+    switch you back and forth, as described below.
+  * Worker devices vs. parameter devices: Most tower computations will
+    happen on worker devices. Since we don't yet support model
+    parallelism, there will be one worker device per tower. When using
+    parameter servers (see above), the set of devices holding
+    variables may be different, otherwise the parameter devices might
+    match the worker devices.
+  * Non-slot devices are some subset of the parameter devices where we
+    put all the non-slot variables. We need to ensure that all
+    non-slot variables are allocated on the same device, or mirrored
+    across the same set of devices. If you have some variable you want
+    to colocate all the non-slot variables with, you can use
+    `colocate_vars_with()` to get the remaining non-slot variables on
+    the same device.  Otherwise you can use `non_slot_devices()` to
+    pick a consistent set of devices to pass to both
+    `colocate_vars_with()` and `update_non_slot()`.
+
+  When using a `DistributionStrategy`, we have a new type dimension
+  called _locality_ that says what values are compatible with which
+  APIs:
+
+  * T: different value for each tower (e.g. a PerDevice-wrapped value).
+  * M: value is "mirrored" across towers, i.e. there are copies with the
+    same value on each tower (e.g. a Mirrored-wrapped value).
+  * V(`v`): value is "mirrored" across all the devices which have a
+    copy of variable `v` (also a Mirrored-wrapped value, but over
+    parameter devices instead of worker devices).
+  * N: value is "mirrored" across all the "non-slot" devices
+
+  Rules for methods with respect to locality and single-tower vs.
+  cross-tower context:
+
+  * `with d.scope()`: default single-tower context -> cross-tower context for
+    `d`
+  * `with d.colocate_vars_with(v)`: in tower/cross-tower context, variables
+    will be created with locality V(`v`). That is, if we write
+    `with d.colocate_vars_with(v1): v2 = tf.get_variable(...)`, then
+    `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
+    V(`v1`).
+  * `with d.colocate_vars_with(d.non_slot_devices(...))`: in
+    tower/cross-tower context, variables will be created with locality N
+  * `v = tf.get_variable(...)`: in tower/cross-tower context, creates
+    a variable (which by definition will have locality V(`v`), though
+    will match another locality if inside a `colocate_vars_with`
+    scope).
+  * `d.distribute_dataset(dataset).make_one_shot_iterator()`: in cross-tower
+    context, produces an iterator with locality T
+  * `d.broadcast(t)`: in cross-tower context, produces a value with locality M
+  * `d.broadcast(t, v)`: in cross-tower context, produces a value with
+    locality V(`v`)
+  * `d.call_for_each_tower(fn, ...)`: in cross-tower context, runs
+    `fn()` in a tower context (and so may call `get_tower_context()` and
+    use its API, including `merge_call()` to get back to cross-tower
+    context), once for each tower. May use values with locality T or
+    M, and any variable.
+  * `d.reduce(m, t)`: in cross-tower context, accepts t with locality T
+    and produces a value with locality M.
+  * `d.reduce(m, t, v)`: in cross-tower context, accepts t with
+    locality T and produces a value with locality V(`v`).
+  * `d.batch_reduce(m, [(t, v)]): see `d.reduce()`
+  * `d.update(v, fn, ...)`: in cross-tower context, runs `fn()` once
+    for each device `v` is copied to, all inputs should have locality
+    V(`v`), output will have locality V(`v`) as well.
+  * `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-tower
+    context, like `d.update()` except with locality N.
+  * `d.fetch(t)`: Copy `t` with any locality to the client's CPU device.
+
+  The standard pattern for updating variables is to:
+
+  1. Wrap your input dataset in `d.distribute_dataset()` and create an iterator.
+  2. Define each tower `d.call_for_each_tower()` up to the point of
+     getting a list of gradient, variable pairs.
+  3. Call `d.reduce("sum", t, v)` or `d.batch_reduce()` to sum the
+     gradients (with locality T) into values with locality V(`v`).
+  4. Call `d.update(v)` for each variable to update its value.
+
+  Steps 3 and 4 are done automatically by class `Optimizer` if you call
+  its `apply_gradients` method in a tower context. Otherwise you can
+  manually call its `_distributed_apply` method in a cross-tower context.
+
+  Another thing you might want to do in the middle of your tower function
+  is an all-reduce of some intermediate value, using `d.reduce()` or
+  `d.batch_reduce()` without supplying a variable as the destination.
+
+  Layers should expect to be called in a tower context, and can use
+  the `get_tower_context()` function to get a `TowerContext` object.  The
+  `TowerContext` object has a `merge_call()` method for entering
+  cross-tower context where you can use `reduce()` (or
+  `batch_reduce()`) and then optionally `update()` to update state.
+
+  You may use this API whether or not a `DistributionStrategy` is
+  being used, since there is a default implementation of
+  `TowerContext` and `DistributionStrategy`. Or you can use the
+  `get_tower_context().is_single_tower` property to run different code
+  in the distributed vs. single tower cases.
+  """
+
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
+  #   we add support.
+  # TODO(josh11b): Also `parameter_device_index` property?
+  # TODO(josh11b): `map()`
+  # TODO(josh11b): ClusterSpec/ClusterResolver
+  # TODO(josh11b): Partitioned computations, state; sharding
+  # TODO(josh11b): Model parallelism: "towers" with multiple devices; shuffling
+  # TODO(josh11b): List of towers with their worker and parameter devices
+  #   (where the parameter devices may overlap in the ps case).
+
+  def scope(self):
+    """Returns a context manager selecting this DistributionStrategy as current.
+
+    Inside a `with distribution_strategy.scope():` code block, this thread
+    will use a variable creator set by `distribution_strategy`, and will
+    enter its "cross-tower context".
+
+    Returns:
+      A context manager.
+    """
+    if has_distribution_strategy():
+      _require_cross_tower_context(self)
+      return _SameScopeAgainContext(self)
+
+    def creator_with_resource_vars(*args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      return self._create_variable(*args, **kwargs)
+
+    def disable_partitioned_variables(getter, *args, **kwargs):
+      if kwargs.pop("partitioner", None) is not None:
+        tf_logging.log_first_n(
+            tf_logging.WARN, "Partitioned variables are disabled when using "
+            "DistributionStrategy.", 1)
+      return getter(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        self, variable_scope.variable_creator_scope(creator_with_resource_vars),
+        variable_scope.variable_scope(
+            variable_scope.get_variable_scope(),
+            custom_getter=disable_partitioned_variables))
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # Note: should support "colocate_with" argument.
+    raise NotImplementedError("must be implemented in descendants")
+
+  def tower_local_var_scope(self, reduce_method):
+    """Inside this scope, new variables will not be mirrored.
+
+    There will still be one component variable per tower, but there is
+    no requirement that they stay in sync. Instead, when saving them
+    or calling `fetch()`, we use the value that results when calling
+    `reduce()` on all the towers' variables.
+
+    Note: tower-local implies not trainable. Instead, it is expected
+    that each tower will directly update (using `assign_add()` or
+    whatever) its local variable instance but only the aggregated
+    value (accessible using `fetch()`) will be exported from the
+    model. When it is acceptable to only aggregate on export, we
+    greatly reduce communication overhead by using tower-local
+    variables.
+
+    Note: All component variables will be initialized to the same
+    value, using the initialization expression from the first tower.
+    The values will match even if the initialization expression uses
+    random numbers.
+
+    Args:
+      reduce_method: String used as a `method_string` to `reduce()`
+        to get the value to save when checkpointing.
+
+    Returns:
+      A context manager.
+    """
+    def create_tower_local_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      kwargs["tower_local_reduce_method"] = reduce_method
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_tower_local_variable)
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Scope that controls which devices variables will be created on.
+
+    No operations should be added to the graph inside this scope, it
+    should only be used when creating variables (some implementations
+    work by changing variable creation, others work by using a
+    tf.colocate_with() scope).
+
+    This may only be used inside `self.scope()`.
+
+    Example usage:
+
+    ```
+    with distribution_strategy.scope():
+      var1 = tf.get_variable(...)
+      with distribution_strategy.colocate_vars_with(v1):
+        # var2 and var3 will be created on the same device(s) as var1
+        var2 = tf.get_variable(...)
+        var3 = tf.get_variable(...)
+
+      def fn(v1, v2, v3):
+        # operates on v1 from var1, v2 from var2, and v3 from var3
+
+      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
+      distribution_strategy.update(v1, fn, v2, v3)
+    ```
+
+    Args:
+      colocate_with_variable: A created in `self.scope()`. Variables created
+        while in the returned context manager will be on the same set of
+        devices as `colocate_with_variable`.
+
+    Returns:
+      A context manager.
+    """
+    def create_colocated_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      kwargs["colocate_with"] = colocate_with_variable
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_colocated_variable)
+
+  def _call_dataset_fn(self, dataset_fn):
+    result = dataset_fn()
+    if not isinstance(result, dataset_ops.Dataset):
+      raise ValueError(
+          "dataset_fn() must return a tf.data.Dataset when using a "
+          "DistributionStrategy.")
+    return result
+
+  # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
+  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
+  # Extend to implement more functionality of datasets.
+  def distribute_dataset(self, dataset_fn):
+    """Return a `dataset` split across all towers.
+
+    Suitable for providing input to for `call_for_each_tower()` by creating an
+    iterator:
+
+    ```
+    def dataset_fn():
+      return tf.data.Dataset.from_tensors([[1.]]).repeat()
+    with distribution_strategy.scope():
+      distributed_dataset = distribution_strategy.distribute_dataset(dataset_fn)
+      iterator = distributed_dataset.make_one_shot_iterator()
+      tower_results = distribution_strategy.call_for_each_tower(
+          tower_fn, iterator.get_next())
+    ```
+
+    Args:
+      dataset_fn: A function that returns a `tf.data.Dataset`.
+
+    Returns:
+      A `PerDeviceDataset` that will produce data for each tower.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def broadcast(self, tensor, destinations=None):
+    """Mirror a tensor on one device to all worker devices.
+
+    Args:
+      tensor: A Tensor value to broadcast.
+      destinations: An optional mirrored variable, device string, or
+        list of device strings, specifying the destination devices
+        to copy `tensor` to. Defaults to `self.worker_devices`.
+
+    Returns:
+      A value mirrored to `destinations` devices.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_tower_context(self)
+    return self._broadcast(tensor, destinations)
+
+  def _broadcast(self, tensor, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def call_for_each_tower(self, fn, *args, **kwargs):
+    """Run `fn` once per tower.
+
+    `fn` may call `tf.get_tower_context()` to access methods such as
+    `tower_id()` and `merge_call()`.
+
+    `merge_call()` is used to communicate betwen the towers and
+    re-enter the cross-tower context. All towers pause their execution
+    having encountered a `merge_call()` call. After that the
+    `merge_fn`-function is executed. Its results are then unwrapped and
+    given back to each tower call. After that execution resumes until
+    `fn` is complete or encounters another `merge_call()`.  Example:
+
+    ```python
+    # Called once in "cross-tower" context.
+    def merge_fn(distribution, three_plus_tower_id):
+      # sum the values across towers
+      return sum(distribution.unwrap(three_plus_tower_id))
+
+    # Called once per tower in `distribution`, in a "tower" context.
+    def fn(three):
+      tower_ctx = tf.get_tower_context()
+      v = three + tower_ctx.tower_id
+      # Computes the sum of the `v` values across all towers.
+      s = tower_ctx.merge_call(merge_fn, v)
+      return s + v
+
+    with distribution.scope():
+      # in "cross-tower" context
+      ...
+      merged_results = distribution.call_for_each_tower(fn, 3)
+      # merged_results has the values from every tower execution of `fn`.
+      print(distribution.unwrap(merged_results))  # Prints a list
+    ```
+
+    Args:
+      fn: function to run (will be run once per tower).
+      *args: positional arguments for `fn`
+      **kwargs: keyword arguments for `fn`.
+          `"run_concurrently"`: Boolean indicating whether executions of `fn`
+             can be run concurrently (under eager execution only), defaults to
+             `True`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+    """
+    _require_cross_tower_context(self)
+    return self._call_for_each_tower(fn, *args, **kwargs)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def reduce(self, method_string, value, destinations=None):
+    """Combine (via e.g. sum or mean) values across towers.
+
+    Args:
+      method_string: A string indicating how to combine values, either
+        "sum" or "mean".
+      value: A per-device value with one value per tower.
+      destinations: An optional mirrored variable, a device string,
+        list of device strings. The return value will be copied to all
+        destination devices (or all the devices where the mirrored
+        variable resides). If `None` or unspecified, the destinations
+        will match the devices `value` resides on.
+
+    Returns:
+      A value mirrored to `destinations`.
+    """
+    # TODO(josh11b): More docstring
+    # TODO(josh11b): Return an unwrapped value if colocate_with is a
+    # single device.
+    _require_cross_tower_context(self)
+    return self._reduce(method_string, value, destinations)
+
+  def _reduce(self, method_string, value, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def batch_reduce(self, method_string, value_destination_pairs):
+    """Combine multiple `reduce` calls into one for faster execution.
+
+    Args:
+      method_string: A string indicating how to combine values, either
+        "sum" or "mean".
+      value_destination_pairs: A sequence of (value, destinations)
+        pairs. See `reduce()` for a description.
+
+    Returns:
+      A list of mirrored values, one per pair in `value_destination_pairs`.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_tower_context(self)
+    assert method_string in ("sum", "mean")
+    return self._batch_reduce(method_string, value_destination_pairs)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return [self.reduce(method_string, t, destinations=v)
+            for t, v in value_destination_pairs]
+
+  def update(self, var, fn, *args, **kwargs):
+    """Run `fn` to update `var` using inputs mirrored to the same devices.
+
+    If `var` is mirrored across multiple devices, then this implements
+    logic like:
+
+    ```
+    results = {}
+    for device, v in var:
+      with tf.device(device):
+        # *args and **kwargs will be unwrapped if they are mirrored.
+        results[device] = fn(v, *args, **kwargs)
+    return merged(results)
+    ```
+
+    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.'
+
+    Neither *args nor **kwargs may contain per-device values.
+    If they contain mirrored values, they will be unwrapped before
+    calling `fn`.
+
+    Args:
+      var: Variable, possibly mirrored to multiple devices, to operate on.
+      fn: Function to call. Should take the variable as the first argument.
+      *args: Additional positional arguments to pass to `fn()`.
+      **kwargs: Keyword arguments to pass to `fn()`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+    """
+    _require_cross_tower_context(self)
+    return self._update(var, fn, *args, **kwargs)
+
+  def _update(self, var, fn, *args, **kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
+
+    Args:
+      colocate_with: The return value of `non_slot_devices()`.
+      fn: Function to execute.
+      *args: Positional arguments to pass to `fn()`.
+      **kwargs: Keyword arguments to pass to `fn()`.
+
+    Returns:
+      Return value of `fn`, possibly merged across devices.
+    """
+    _require_cross_tower_context(self)
+    return self._update_non_slot(colocate_with, fn, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def fetch(self, val, destination="/device:CPU:0", fn=lambda x: x):
+    """Return a copy of `val` or `fn(val)` on `destination`.
+
+    This is useful for getting a mirrored value onto a device.  It
+    will attempt to avoid a copy by checking if the value is already
+    on the destination device.
+
+    Args:
+      val: Value (which may be mirrored) to copy.
+      destination: A device string to copy the value to.
+      fn: An optional function to apply to the value on the source
+          device, before copying.
+
+    Returns:
+      A `Tensor` on `destination`.
+    """
+    _require_cross_tower_context(self)
+    return self._fetch(val, destination, fn)
+
+  def _fetch(self, val, destination, fn):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def unwrap(self, value):
+    """Returns the list of all per-device values contained in `value`.
+
+    Args:
+      value: A value returned by `call_for_each_tower()` or a variable
+        created in `scope()`.
+
+    Returns:
+      A list of values contained in `value`. If `value` represents a single
+      value, this returns `[value].`
+    """
+    _require_cross_tower_context(self)
+    return self._unwrap(value)
+
+  def _unwrap(self, distributed_value):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def group(self, value, name=None):
+    """Shortcut for `tf.group(distribution.unwrap(value))`."""
+    value = nest.flatten(self.unwrap(value))
+
+    if len(value) != 1 or name is not None:
+      return control_flow_ops.group(value, name=name)
+    # Special handling for the common case of one op.
+    v, = value
+    if isinstance(v, ops.Tensor):
+      v = v.op
+    return v
+
+  @property
+  def is_single_tower(self):
+    """Returns whether there is a single tower or multiple.
+
+    Returns:
+      A boolean. If `True`, `call_for_each_tower(fn)` will only call `fn` once.
+      If `False`, `call_for_each_tower(fn)` may call `fn` multiple times.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def num_towers(self):
+    """Returns number of towers, for purposes of averaging across towers."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def worker_devices(self):
+    """Returns the list of devices used to run `call_for_each_tower()` calls."""
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def parameter_devices(self):
+    """Returns the list of devices used for variable and `update` placement."""
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  def non_slot_devices(self, var_list):
+    """Device(s) for non-slot variables.
+
+    Create variables on these devices in a
+    `with colocate_vars_with(non_slot_devices(...)):` block.
+    Update those using `update_non_slot()`.
+
+    Args:
+      var_list: The list of variables being optimized, needed with the
+        default `DistributionStrategy`.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def worker_device_index(self):
+    """An object mapping worker device to an id.
+
+    This might be passed as an argument to `call_for_each_tower()`, as in:
+
+    ```
+    with distribution_strategy.scope():
+
+      def fn(device_id):
+        # device_id is an integer. `fn` is being executed on device:
+        #    distribution_strategy.worker_devices[device_id].
+
+      distribution_strategy.call_for_each_tower(
+          fn, distribution_strategy.worker_device_index)
+    ```
+
+    Returns:
+      An index object, or the integer 0 if there is only a single tower.
+    """
+    _require_cross_tower_context(self)
+    return self._worker_device_index()
+
+  def _worker_device_index(self):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def configure(self, session_config=None):
+    """Find the best configuration given a tensorflow session config."""
+    del session_config
+
+
+# A note about the difference between the context managers
+# `TowerContext` (defined here) and `_CurrentDistributionContext`
+# (defined above) used by `DistributionStrategy.scope()`:
+#
+# * a TowerContext is only present during a `call_for_each_tower()`
+#   call (except during a `merge_run` call) and in such a scope it
+#   will be returned by calls to `get_tower_context()`.  Implementers of new
+#   DistributionStrategy descendants will frequently also need to
+#   define a descendant of TowerContext, and are responsible for
+#   entering and exiting this context.
+#
+# * DistributionStrategy.scope() sets up a variable_creator scope that
+#   changes variable creation calls (e.g. to make mirrored
+#   variables). This is intended as an outer scope that users enter once
+#   around their model creation and graph definition. There is no
+#   anticipated need to define descendants of _CurrentDistributionContext.
+#   It sets the current DistributionStrategy for purposes of
+#   `get_distribution_strategy()` and `has_distribution_strategy()`
+#   and switches the thread mode to a "cross-tower context".
+class TowerContext(object):
+  """DistributionStrategy API inside a `call_for_each_tower()` call."""
+
+  def __init__(self, distribution_strategy, tower_id):
+    self._distribution_strategy = distribution_strategy
+    self._thread_context = _InTowerThreadMode(self)
+    self._tower_id = tower_id
+
+  def __enter__(self):
+    _push_per_thread_mode(self._thread_context)
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    _pop_per_thread_mode()
+
+  def merge_call(self, merge_fn, *args, **kwargs):
+    """Merge args across towers and run `merge_fn` in a cross-tower context.
+
+    This allows communication and coordination when there are multiple calls
+    to a model function triggered by a call to
+    `distribution.call_for_each_tower(model_fn, ...)`.
+
+    See `MirroredDistribution.call_for_each_tower()` for an explanation.
+
+    Otherwise, this is equivalent to:
+
+    ```
+    distribution = get_distribution_strategy()
+    with cross-tower-context(distribution):
+      return merge_fn(distribution, *args, **kwargs)
+    ```
+
+    Args:
+      merge_fn: function that joins arguments from threads that are given as
+        PerDevice. It accepts `DistributionStrategy` object as the first
+        argument.
+      *args: positional per-thread arguments for `merge_fn`
+      **kwargs: keyword per-thread arguments for `merge_fn`.
+
+    Returns:
+      The return value of `merge_fn`, except for `PerDevice` values which are
+      unpacked.
+    """
+    require_tower_context(self)
+    return self._merge_call(merge_fn, *args, **kwargs)
+
+  def _merge_call(self, merge_fn, *args, **kwargs):
+    """Default implementation for single tower."""
+    _push_per_thread_mode(  # thread-local, so not needed with multiple threads
+        _CrossTowerThreadMode(self._distribution_strategy))
+    try:
+      return merge_fn(self._distribution_strategy, *args, **kwargs)
+    finally:
+      _pop_per_thread_mode()
+
+  def tower_local_var_scope(self, reduce_method):
+    """Alias for distribution_strategy.tower_local_var_scope()."""
+    return self._distribution_strategy.tower_local_var_scope(reduce_method)
+
+  @property
+  def is_single_tower(self):
+    """Returns whether there is a single tower or multiple."""
+    require_tower_context(self)
+    return self._distribution_strategy.is_single_tower
+
+  @property
+  def num_towers(self):
+    """Returns number of towers, for purposes of averaging across towers."""
+    return self._distribution_strategy.num_towers
+
+  @property
+  def tower_id(self):
+    """Which tower is being defined, a number from 0 to `num_towers - 1`."""
+    require_tower_context(self)
+    return self._tower_id
+
+  @property
+  def distribution_strategy(self):
+    """The current `DistributionStrategy` object."""
+    return self._distribution_strategy
+
+  @property
+  def device(self):
+    """The device this tower is to be executed on, as a string."""
+    require_tower_context(self)
+    return device_util.current()
+
+  # TODO(josh11b): Implement `start_all_reduce(method, t)` that returns
+  # a function returning the result of reducing `t` across all
+  # towers. Most likely can be implemented in terms of `merge_call()`
+  # and `batch_reduce()`.
+
+# ------------------------------------------------------------------------------
+
+
+class _DefaultDistributionStrategy(DistributionStrategy):
+  """Default `DistributionStrategy` if none is explicitly selected."""
+
+  def scope(self):
+    """Context manager setting a variable creator and `self` as current."""
+    if has_distribution_strategy():
+      raise RuntimeError("Must not nest DistributionStrategy scopes.")
+
+    def creator(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      if kwargs.pop("tower_local_reduce_method", None) is not None:
+        kwargs["trainable"] = False
+      return next_creator(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        self, variable_scope.variable_creator_scope(creator))
+
+  def tower_local_var_scope(self, reduce_method):
+    """Does not set to resource variables."""
+    def create_tower_local_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["tower_local_reduce_method"] = reduce_method
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_tower_local_variable)
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Does not require `self.scope`."""
+    _require_distribution_strategy_scope(self)
+    return ops.colocate_with(colocate_with_variable)
+
+  def distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
+
+  def _broadcast(self, tensor, destinations):
+    if destinations is None:
+      return tensor
+    else:
+      raise NotImplementedError("TODO")
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    # We don't run `fn` in multiple threads in _DefaultDistributionStrategy.
+    kwargs.pop("run_concurrently", None)
+    with TowerContext(self, tower_id=0):
+      return fn(*args, **kwargs)
+
+  def _reduce(self, method_string, value, destinations):
+    # TODO(josh11b): Use destinations?
+    del method_string, destinations
+    return value
+
+  def _update(self, var, fn, *args, **kwargs):
+    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
+    # once that value is used for something.
+    with ops.colocate_with(var), UpdateContext(var):
+      return fn(var, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
+    # once that value is used for something.
+    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
+      return fn(*args, **kwargs)
+
+  def _fetch(self, var, destination, fn):
+    with ops.colocate_with(var):
+      var = fn(var)
+    with ops.device(destination):
+      return array_ops.identity(var)
+
+  def _unwrap(self, distributed_value):
+    return [distributed_value]
+
+  @property
+  def is_single_tower(self):
+    return True
+
+  @property
+  def num_towers(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    raise RuntimeError(
+        "worker_devices() method unsupported by _DefaultDistributionStrategy.")
+
+  @property
+  def parameter_devices(self):
+    raise RuntimeError("parameter_devices() method unsupported by "
+                       "_DefaultDistributionStrategy.")
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  def _worker_device_index(self):
+    raise RuntimeError("worker_device_index() method unsupported by "
+                       "_DefaultDistributionStrategy.")
+
+# ------------------------------------------------------------------------------
+# Common operations
+
+
+def increment_var(v, amount=1):
+  """`v += amount`, distributed-aware version."""
+  def update(vu):
+    if isinstance(vu, resource_variable_ops.ResourceVariable):
+      return vu.assign_add(amount, read_value=False)
+    else:
+      return state_ops.assign_add(vu, amount)
+
+  def merge_fn(dist, vm):
+    return dist.group(dist.update(vm, update))
+
+  tower_context = get_tower_context()
+  return tower_context.merge_call(merge_fn, v)
+
+
+# ------------------------------------------------------------------------------
+# Singletons
+
+_default_distribution_strategy = _DefaultDistributionStrategy()
+_default_tower_context = TowerContext(
+    _default_distribution_strategy, tower_id=0)
+_default_tower_mode = _DefaultTowerThreadMode()
+
+
+# ------------------------------------------------------------------------------
+# We haven't yet implemented deserialization for DistributedVariables.
+# So here we catch any attempts to deserialize variables
+# when using distribution strategies.
+# pylint: disable=protected-access
+_original_from_proto = resource_variable_ops._from_proto_fn
+
+
+def _from_proto_fn(v, import_scope=None):
+  if has_distribution_strategy():
+    raise NotImplementedError(
+        "Deserialization of variables is not yet supported when using"
+        "distributed strategies.")
+  else:
+    return _original_from_proto(v, import_scope=import_scope)
+
+resource_variable_ops._from_proto_fn = _from_proto_fn
+# pylint: enable=protected-access
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/training/distribute_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a4f19c31f6714e1211f9deed9703c02192cc2c0
--- /dev/null
+++ b/tensorflow/python/training/distribute_test.py
@@ -0,0 +1,104 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test DistributionStrategy, TowerContext, and supporting APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+from tensorflow.python.training import distribute
+
+
+class _TestTowerContext(distribute.TowerContext):
+
+  def merge_call(self, fn, *args, **kwargs):
+    return kwargs["test_arg"]
+
+
+class _TestStrategy(distribute.DistributionStrategy):
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    with _TestTowerContext(self, tower_id=0):
+      return fn(*args, **kwargs)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    return kwargs["name"]
+
+
+def _assert_in_default_state(t):
+  t.assertIs(distribute._default_tower_context,
+             distribute.get_tower_context())
+  t.assertIs(None, distribute.get_cross_tower_context())
+  t.assertIs(distribute._default_distribution_strategy,
+             distribute.get_distribution_strategy())
+  t.assertFalse(distribute.has_distribution_strategy())
+
+
+class TestStrategyTest(test.TestCase):
+
+  def testCallForEachTower(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+
+    def run_fn():
+      tower_context = distribute.get_tower_context()
+      self.assertTrue(tower_context is not None)
+      self.assertIs(None, distribute.get_cross_tower_context())
+      self.assertTrue(distribute.has_distribution_strategy())
+      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo"))
+      self.assertEqual("bar", variable_scope.variable(1.0, name="bar"))
+
+    with self.assertRaises(RuntimeError):
+      dist.call_for_each_tower(run_fn)
+    with dist.scope():
+      dist.call_for_each_tower(run_fn)
+    _assert_in_default_state(self)
+
+  def testScope(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+    with dist.scope():
+      self.assertIs(None, distribute.get_tower_context())
+      self.assertIs(dist, distribute.get_cross_tower_context())
+      self.assertTrue(distribute.has_distribution_strategy())
+      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertEqual("baz", variable_scope.variable(1.0, name="baz"))
+    _assert_in_default_state(self)
+
+
+class DefaultDistributionStrategyTest(test.TestCase):
+
+  def testMergeCall(self):
+    _assert_in_default_state(self)
+
+    def merge_fn(dist, s):
+      self.assertIs(distribute._default_distribution_strategy, dist)
+      self.assertIs(None, distribute.get_tower_context())
+      self.assertIs(dist, distribute.get_cross_tower_context())
+      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertFalse(distribute.has_distribution_strategy())
+      return "foo_" + s
+
+    tower_ctx = distribute.get_tower_context()
+    self.assertIs(distribute._default_tower_context, tower_ctx)
+    self.assertEqual("foo_bar", tower_ctx.merge_call(merge_fn, "bar"))
+    _assert_in_default_state(self)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index 9d02e694db15637126f37ee5575638908b351def..4fa081fab72df62107cf4957d4ff68240ced9ee0 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -53,7 +53,7 @@ class FtrlOptimizer(optimizer.Optimizer):
       learning_rate: A float value or a constant float `Tensor`.
       learning_rate_power: A float value, must be less or equal to zero.
       initial_accumulator_value: The starting value for accumulators.
-        Only positive values are allowed.
+        Only zero or positive values are allowed.
       l1_regularization_strength: A float value, must be greater than or
         equal to zero.
       l2_regularization_strength: A float value, must be greater than or
@@ -84,9 +84,10 @@ class FtrlOptimizer(optimizer.Optimizer):
     """
     super(FtrlOptimizer, self).__init__(use_locking, name)
 
-    if initial_accumulator_value <= 0.0:
-      raise ValueError("initial_accumulator_value %f needs to be positive" %
-                       initial_accumulator_value)
+    if initial_accumulator_value < 0.0:
+      raise ValueError(
+          "initial_accumulator_value %f needs to be be positive or zero" %
+          initial_accumulator_value)
     if learning_rate_power > 0.0:
       raise ValueError("learning_rate_power %f needs to be negative or zero" %
                        learning_rate_power)
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index 380e14e02497fbe3681d6bae03fe9c636c5d13aa..6caf29d83af546f821314179e17f7bf1a693ff1a 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -43,6 +44,7 @@ class GradientDescentOptimizer(optimizer.Optimizer):
     """
     super(GradientDescentOptimizer, self).__init__(use_locking, name)
     self._learning_rate = learning_rate
+    self._learning_rate_tensor = None
 
   def _apply_dense(self, grad, var):
     return training_ops.apply_gradient_descent(
@@ -69,5 +71,6 @@ class GradientDescentOptimizer(optimizer.Optimizer):
     return var.scatter_sub(delta, use_locking=self._use_locking)
 
   def _prepare(self):
-    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
-                                                       name="learning_rate")
+    if not context.executing_eagerly() or self._learning_rate_tensor is None:
+      self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
+                                                         name="learning_rate")
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index bd9985a7c5c181c0431e0c0a91186bc36b11c787..caa26581e8a0041dd1b157ab6b1f8236344582e8 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -159,7 +159,7 @@ def input_producer(input_tensor,
   enabled. Please use the `tf.data` API to ingest data under eager execution.
   @end_compatibility
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError(
         "Input pipelines based on Queues are not supported when eager execution"
         " is enabled. Please use tf.data to ingest data into your model"
@@ -515,8 +515,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
     def _sparse_values_to_keep(t, keep_input):
       """Convert a per-row `keep_input` vector to a per-value one."""
       # Get the rows of every value in the sparse Tensor.
-      row_values = array_ops.reshape(
-          t.indices, [array_ops.shape(t.indices)[0], -1])[:, 0]
+      row_values = t.indices[:, 0]
       # The value should be kept iff the row should be kept.
       return array_ops.gather(keep_input, row_values)
     if keep_input.shape.ndims == 1:
@@ -737,7 +736,7 @@ def _batch(tensors, batch_size, keep_input, num_threads=1, capacity=32,
            allow_smaller_final_batch=False, shared_name=None,
            name=None):
   """Helper function for `batch` and `maybe_batch`."""
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError(
         "Input pipelines based on Queues are not supported when eager execution"
         " is enabled. Please use tf.data to ingest data into your model"
@@ -775,7 +774,7 @@ def _batch_join(tensors_list, batch_size, keep_input, capacity=32,
                 enqueue_many=False, shapes=None, dynamic_pad=False,
                 allow_smaller_final_batch=False, shared_name=None, name=None):
   """Helper function for `batch_join` and `maybe_batch_join`."""
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError(
         "Input pipelines based on Queues are not supported when eager execution"
         " is enabled. Please use tf.data to ingest data into your model"
@@ -810,7 +809,7 @@ def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                    shapes=None, allow_smaller_final_batch=False,
                    shared_name=None, name=None):
   """Helper function for `shuffle_batch` and `maybe_shuffle_batch`."""
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError(
         "Input pipelines based on Queues are not supported when eager execution"
         " is enabled. Please use tf.data to ingest data into your model"
@@ -855,7 +854,7 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity,
                         allow_smaller_final_batch=False, shared_name=None,
                         name=None):
   """Helper function for `shuffle_batch_join` and `maybe_shuffle_batch_join`."""
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise ValueError(
         "Input pipelines based on Queues are not supported when eager execution"
         " is enabled. Please use tf.data to ingest data into your model"
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index 3a25bfe34322385e6a1ab1b3da4a6ad17c3208c6..1b1e89cb26d3360a09ac61b72c19882110fea532 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -497,6 +497,28 @@ class BatchTest(test_lib.TestCase):
   def testOneThreadDict(self):
     self._testOneThreadHelper(use_dict=True)
 
+  def testUint32DataTypes(self):
+    values = constant_op.constant([0, 1, 2, 3, 4, 5], dtype=dtypes.uint32)
+    batched = inp.batch([values], batch_size=2)
+    with self.test_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
+      sess.run(batched)
+      coord.request_stop()
+      for thread in threads:
+        thread.join()
+
+  def testUint64DataTypes(self):
+    values = constant_op.constant([0, 1, 2, 3, 4, 5], dtype=dtypes.uint64)
+    batched = inp.batch([values], batch_size=2)
+    with self.test_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
+      sess.run(batched)
+      coord.request_stop()
+      for thread in threads:
+        thread.join()
+
   def testOneThreadDynamicPad(self):
     with self.test_session() as sess:
       batch_size = 10
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index 1ce8c156a0b126f680bad62267f90e31a23febed..60306e4f1239a759ea1f68492a1211d5f0858997 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -43,8 +43,8 @@ class LRDecayTest(test_util.TensorFlowTestCase):
 
   def testStaircase(self):
     with self.test_session():
-      step = gen_state_ops._variable(shape=[], dtype=dtypes.int32,
-                                     name="step", container="", shared_name="")
+      step = gen_state_ops.variable(shape=[], dtype=dtypes.int32,
+                                    name="step", container="", shared_name="")
       assign_100 = state_ops.assign(step, 100)
       assign_1 = state_ops.assign(step, 1)
       assign_2 = state_ops.assign(step, 2)
@@ -113,7 +113,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
       learning_rate_decay.piecewise_constant(x, boundaries, values)
 
     # Test that ref types are valid.
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       x = variables.Variable(0.0)
       x_ref = x.op.outputs[0]   # float32_ref tensor should be accepted
       boundaries, values = [1.0, 2.0], [1, 2, 3]
@@ -264,8 +264,8 @@ class ExponentialDecayTest(test_util.TensorFlowTestCase):
     initial_lr = 0.1
     k = 10
     decay_rate = 0.96
-    step = gen_state_ops._variable(shape=[], dtype=dtypes.int32,
-                                   name="step", container="", shared_name="")
+    step = gen_state_ops.variable(
+        shape=[], dtype=dtypes.int32, name="step", container="", shared_name="")
     assign_step = state_ops.assign(step, 0)
     increment_step = state_ops.assign_add(step, 1)
     decayed_lr = learning_rate_decay.natural_exp_decay(initial_lr, step,
@@ -281,8 +281,8 @@ class ExponentialDecayTest(test_util.TensorFlowTestCase):
     initial_lr = 0.1
     k = 10
     decay_rate = 0.96
-    step = gen_state_ops._variable(shape=[], dtype=dtypes.int32,
-                                   name="step", container="", shared_name="")
+    step = gen_state_ops.variable(
+        shape=[], dtype=dtypes.int32, name="step", container="", shared_name="")
     assign_step = state_ops.assign(step, 0)
     increment_step = state_ops.assign_add(step, 1)
     decayed_lr = learning_rate_decay.natural_exp_decay(initial_lr,
@@ -304,8 +304,8 @@ class InverseDecayTest(test_util.TensorFlowTestCase):
     initial_lr = 0.1
     k = 10
     decay_rate = 0.96
-    step = gen_state_ops._variable(shape=[], dtype=dtypes.int32,
-                                   name="step", container="", shared_name="")
+    step = gen_state_ops.variable(
+        shape=[], dtype=dtypes.int32, name="step", container="", shared_name="")
     assign_step = state_ops.assign(step, 0)
     increment_step = state_ops.assign_add(step, 1)
     decayed_lr = learning_rate_decay.inverse_time_decay(initial_lr,
@@ -323,8 +323,8 @@ class InverseDecayTest(test_util.TensorFlowTestCase):
     initial_lr = 0.1
     k = 10
     decay_rate = 0.96
-    step = gen_state_ops._variable(shape=[], dtype=dtypes.int32,
-                                   name="step", container="", shared_name="")
+    step = gen_state_ops.variable(
+        shape=[], dtype=dtypes.int32, name="step", container="", shared_name="")
     assign_step = state_ops.assign(step, 0)
     increment_step = state_ops.assign_add(step, 1)
     decayed_lr = learning_rate_decay.inverse_time_decay(initial_lr,
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index cda421cef837fa6ab25898208a8dc94d70561048..7bd57ad3d854534e196fa7b72bebbd7195e6bca8 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -66,7 +66,7 @@ class MomentumOptimizerTest(test.TestCase):
       mom_update = mom_opt.apply_gradients(
           zip([grads0, grads1], [var0, var1]))
 
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -78,13 +78,13 @@ class MomentumOptimizerTest(test.TestCase):
       self.assertEquals(slot0.get_shape(), var0.get_shape())
       slot1 = mom_opt.get_slot(var1, "momentum")
       self.assertEquals(slot1.get_shape(), var1.get_shape())
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.assertFalse(slot0 in variables.trainable_variables())
         self.assertFalse(slot1 in variables.trainable_variables())
 
       # Step 1: the momentum accumulators where 0. So we should see a normal
       # update: v -= grad * learning_rate
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.evaluate(mom_update)
       # Check that the momentum accumulators have been updated.
       self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
@@ -99,10 +99,10 @@ class MomentumOptimizerTest(test.TestCase):
           np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
           self.evaluate(var1))
       # Step 2: the momentum accumulators contain the previous update.
-      if context.in_graph_mode():
-        self.evaluate(mom_update)
-      else:
+      if context.executing_eagerly():
         mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      else:
+        self.evaluate(mom_update)
       # Check that the momentum accumulators have been updated.
       self.assertAllCloseAccordingToType(
           np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
@@ -142,7 +142,7 @@ class MomentumOptimizerTest(test.TestCase):
           [1.0, 2.0], dtype=dtypes.float32, name="var0")
       var1 = resource_variable_ops.ResourceVariable(
           [3.0, 4.0], dtype=dtypes.float32, name="var1")
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         loss = lambda: math_ops.reduce_sum(var0 + var1)
       else:
         loss = math_ops.reduce_sum(var0 + var1)
@@ -157,7 +157,7 @@ class MomentumOptimizerTest(test.TestCase):
           [1.0, 2.0], dtype=dtypes.float32, name="var2")
       var3 = resource_variable_ops.ResourceVariable(
           [3.0, 4.0], dtype=dtypes.float32, name="var3")
-      if context.in_eager_mode():
+      if context.executing_eagerly():
         loss = lambda: math_ops.reduce_sum(var2 + var3)
       else:
         loss = math_ops.reduce_sum(var2 + var3)
@@ -237,7 +237,17 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
 
       # pylint: disable=cell-var-from-loop
       def loss():
@@ -256,7 +266,17 @@ class MomentumOptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
-    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
 
     def loss():
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 6c5c9e01a76d539b550420134b09090b89beed46..f584a009d946a193f1ab76b3030db4f8a4954d27 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -202,7 +202,7 @@ class Scaffold(object):
     if self._local_init_op is None:
       self._local_init_op = Scaffold.get_or_default(
           'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
-          Scaffold._default_local_init_op)
+          Scaffold.default_local_init_op)
     if self._summary_op is None:
       self._summary_op = Scaffold.get_or_default('summary_op',
                                                  ops.GraphKeys.SUMMARY_OP,
@@ -267,7 +267,17 @@ class Scaffold(object):
     return op
 
   @staticmethod
-  def _default_local_init_op():
+  def default_local_init_op():
+    """Returns an op that groups the default local init ops.
+
+    This op is used during session initialization when a Scaffold is
+    initialized without specifying the local_init_op arg. It includes
+    `tf.local_variables_initializer`, `tf.tables_initializer`, and also
+    initializes local session resources.
+
+    Returns:
+      The default Scaffold local init op.
+    """
     return control_flow_ops.group(
         variables.local_variables_initializer(),
         lookup_ops.tables_initializer(),
@@ -281,13 +291,14 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              scaffold=None,
                              hooks=None,
                              chief_only_hooks=None,
-                             save_checkpoint_secs=600,
+                             save_checkpoint_secs=USE_DEFAULT,
                              save_summaries_steps=USE_DEFAULT,
                              save_summaries_secs=USE_DEFAULT,
                              config=None,
                              stop_grace_period_secs=120,
                              log_step_count_steps=100,
-                             max_wait_secs=7200):
+                             max_wait_secs=7200,
+                             save_checkpoint_steps=USE_DEFAULT):
   """Creates a `MonitoredSession` for training.
 
   For a chief, this utility sets proper session initializer/restorer. It also
@@ -310,8 +321,10 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
     chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks if
       `is_chief==True`, ignore otherwise.
     save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved
-      using a default checkpoint saver. If `save_checkpoint_secs` is set to
-      `None`, then the default checkpoint saver isn't used.
+      using a default checkpoint saver. If both `save_checkpoint_steps` and
+      `save_checkpoint_secs` are set to `None`, then the default checkpoint
+      saver isn't used. If both are provided, then only `save_checkpoint_secs`
+      is used. Default 600.
     save_summaries_steps: The frequency, in number of global steps, that the
       summaries are written to disk using a default summary saver. If both
       `save_summaries_steps` and `save_summaries_secs` are set to `None`, then
@@ -330,6 +343,11 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       become available. This should be kept relatively short to help detect
       incorrect code, but sometimes may need to be increased if the chief takes
       a while to start up.
+    save_checkpoint_steps: The frequency, in number of global steps, that a
+      checkpoint is saved using a default checkpoint saver. If both
+      `save_checkpoint_steps` and `save_checkpoint_secs` are set to `None`, then
+      the default checkpoint saver isn't used. If both are provided, then only
+      `save_checkpoint_secs` is used. Default not enabled.
 
   Returns:
     A `MonitoredSession` object.
@@ -342,6 +360,15 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
   elif save_summaries_steps == USE_DEFAULT:
     save_summaries_steps = None
 
+  if (save_checkpoint_steps == USE_DEFAULT and
+      save_checkpoint_secs == USE_DEFAULT):
+    save_checkpoint_steps = None
+    save_checkpoint_secs = 600
+  elif save_checkpoint_secs == USE_DEFAULT:
+    save_checkpoint_secs = None
+  elif save_checkpoint_steps == USE_DEFAULT:
+    save_checkpoint_steps = None
+
   scaffold = scaffold or Scaffold()
   if not is_chief:
     session_creator = WorkerSessionCreator(
@@ -374,9 +401,13 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
           save_steps=save_summaries_steps,
           save_secs=save_summaries_secs,
           output_dir=checkpoint_dir))
-    if save_checkpoint_secs and save_checkpoint_secs > 0:
+    if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
+        save_checkpoint_steps and save_checkpoint_steps > 0):
       all_hooks.append(basic_session_run_hooks.CheckpointSaverHook(
-          checkpoint_dir, save_secs=save_checkpoint_secs, scaffold=scaffold))
+          checkpoint_dir,
+          save_steps=save_checkpoint_steps,
+          save_secs=save_checkpoint_secs,
+          scaffold=scaffold))
 
   if hooks:
     all_hooks.extend(hooks)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 159b2d5c1605bdd95303efb25690f55a54a3625d..3806056f01a73d21faf3de4539c0dd1ada5f96f8 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -282,6 +282,42 @@ class MonitoredTrainingSessionTest(test.TestCase):
           is_chief=True, checkpoint_dir=logdir) as session:
         self.assertEqual(2, session.run(gstep))
 
+  def test_save_checkpoint_steps(self):
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_steps')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True,
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+      # A restart will find the checkpoint and recover automatically.
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True, checkpoint_dir=logdir) as session:
+        self.assertEqual(100, session.run(gstep))
+
+  def test_save_checkpoint_secs(self):
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_secs')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True,
+          checkpoint_dir=logdir,
+          save_checkpoint_secs=0.1,
+          log_step_count_steps=10) as session:
+        session.run(new_gstep)
+        time.sleep(0.2)
+        for _ in range(10):
+          session.run(new_gstep)
+      # A restart will find the checkpoint and recover automatically.
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True, checkpoint_dir=logdir) as session:
+        self.assertEqual(11, session.run(gstep))
+
   def test_summaries_steps(self):
     logdir = _test_dir(self.get_temp_dir(), 'test_summaries_steps')
     with ops.Graph().as_default():
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index b9ecb27df19d051c28ec1c3fe3cd9fd86717a5ed..61fc828a840c490b0f787119134a0941f60f947a 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -52,16 +52,19 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
   they were created in and the scope of the variables they debias. They are also
   given a uniqifying-suffix.
 
-  Ex:
+  E.g.:
+
+  ```
     with tf.variable_scope('scope1'):
       with tf.variable_scope('scope2'):
         var = tf.get_variable('foo')
-        assign_moving_average(var, 0.0, 1.0)
-        assign_moving_average(var, 0.0, 0.9)
+        tf.assign_moving_average(var, 0.0, 1.0)
+        tf.assign_moving_average(var, 0.0, 0.9)
 
-    var.name: 'scope1/scope2/foo'
-    shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
-                      'scope1/scope2/scope1/scope2/foo/biased_1'
+    # var.name: 'scope1/scope2/foo'
+    # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
+    #                   'scope1/scope2/scope1/scope2/foo/biased_1'
+  ```
 
   Args:
     variable: A Variable.
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 6efdeb286657e761a4c46634b9408121765a447b..6717811bbb0f05723a5ad0fbcbfba75249d0d43b 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -376,7 +376,7 @@ class ExponentialMovingAverageTest(test.TestCase):
     with ops.device("/job:dev_v0"):
       v0 = variables.Variable(10.0, name="v0")
     with ops.device("/job:dev_v1"):
-      v1 = gen_state_ops._variable(
+      v1 = gen_state_ops.variable(
           shape=[1],
           dtype=dtypes.float32,
           name="v1",
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 678d6322aa5ecea0a603b6a9858f7619638eae30..66914bacf35c75639f4636713cca2dba4ee19e3f 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -35,22 +35,26 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _get_variable_for(v):
-  """Returns the ResourceVariable responsible for v, or v if not necessary."""
-  if context.in_eager_mode():
-    return v
-  if v.op.type == "VarHandleOp":
-    for var in variables.trainable_variables():
-      if (isinstance(var, resource_variable_ops.ResourceVariable)
-          and var.handle.op is v.op):
-        return var
-    raise ValueError("Got %s but could not locate source variable." % (str(v)))
-  return v
+def get_filtered_grad_fn(grad_fn):
+  # `distributed_context.join()` requires that its arguments are parallel
+  # across threads, and in particular that `grads_and_vars` has the same
+  # variables in the same order.
+
+  # When computing gradients in eager mode with multiple threads, you
+  # can get extra variables with a gradient of `None`. This happens when
+  # those variables are accessed in another thread during the gradient
+  # computation. To get a consistent set of variables, we filter out
+  # those with `None` gradients.
+  def filtered_grad_fn(x=None):
+    return [(g, v) for g, v in grad_fn(x) if g is not None]
+
+  return filtered_grad_fn
 
 
 def _deduplicate_indexed_slices(values, indices):
@@ -73,8 +77,8 @@ def _deduplicate_indexed_slices(values, indices):
 
 
 def _var_key(var):
-  if context.in_eager_mode():
-    return var._shared_name  # pylint: disable=protected-access
+  if context.executing_eagerly():
+    return var._unique_id  # pylint: disable=protected-access
   return (var.op.graph, var.op.name)
 
 
@@ -98,6 +102,9 @@ class _RefVariableProcessor(_OptimizableVariable):
   def __init__(self, v):
     self._v = v
 
+  def __str__(self):
+    return "<_RefVariableProcessor(%s)>" % self._v
+
   def target(self):
     return self._v._ref()  # pylint: disable=protected-access
 
@@ -163,19 +170,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable):
       return update_op
 
 
-class _StreamingModelPortProcessor(_OptimizableVariable):
-  """Processor for streaming ModelPorts."""
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v
-
-  def update_op(self, optimizer, g):
-    return g
-
-
 class _TensorProcessor(_OptimizableVariable):
   """Processor for ordinary Tensors.
 
@@ -196,24 +190,30 @@ class _TensorProcessor(_OptimizableVariable):
 
 def _get_processor(v):
   """The processor of v."""
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     if isinstance(v, ops.Tensor):
       return _TensorProcessor(v)
     else:
       return _DenseResourceVariableProcessor(v)
+  if isinstance(
+      v, resource_variable_ops.ResourceVariable) and not v._in_graph_mode:  # pylint: disable=protected-access
+    # True if and only if `v` was initialized eagerly.
+    return _DenseResourceVariableProcessor(v)
   if v.op.type == "VarHandleOp":
     return _DenseResourceVariableProcessor(v)
   if isinstance(v, variables.Variable):
     return _RefVariableProcessor(v)
-  if v.op.type == "SubmodelPort":
-    return _StreamingModelPortProcessor(v)
   if isinstance(v, ops.Tensor):
     return _TensorProcessor(v)
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
 @tf_export("train.Optimizer")
-class Optimizer(checkpointable.Checkpointable):
+class Optimizer(
+    # Optimizers inherit from CheckpointableBase rather than Checkpointable
+    # since they do most of their dependency management themselves (slot
+    # variables are special-cased, and non-slot variables are keyed to graphs).
+    checkpointable.CheckpointableBase):
   """Base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -337,6 +337,13 @@ class Optimizer(checkpointable.Checkpointable):
     #   ... }
     self._deferred_slot_restorations = {}
 
+    # TODO(isaprykin): When using a DistributionStrategy, and when an
+    # optimizer is created in each tower, it might be dangerous to
+    # rely on some Optimer methods.  When such methods are called on a
+    # per-tower optimizer, an exception needs to be thrown.  We do
+    # allow creation per-tower optimizers however, because the
+    # compute_gradients()->apply_gradients() sequence is safe.
+
   def get_name(self):
     return self._name
 
@@ -449,14 +456,33 @@ class Optimizer(checkpointable.Checkpointable):
         if var_list is not None:
           tape.watch(var_list)
         loss_value = loss()
+
+        # Scale loss if using a "mean" loss reduction and multiple towers.
+        # Have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+        if distribute_lib.get_loss_reduction() == "mean":
+          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          if num_towers > 1:
+            loss_value *= (1. / num_towers)
+
       if var_list is None:
         var_list = tape.watched_variables()
       grads = tape.gradient(loss_value, var_list, grad_loss)
       return list(zip(grads, var_list))
-    if context.in_eager_mode():
+
+    # Non-callable/Tensor loss case
+    if context.executing_eagerly():
       raise RuntimeError(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
+
+    # Scale loss if using a "mean" loss reduction and multiple towers.
+    if distribute_lib.get_loss_reduction() == "mean":
+      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      if num_towers > 1:
+        loss *= (1. / num_towers)
+
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -512,11 +538,25 @@ class Optimizer(checkpointable.Checkpointable):
     Raises:
       TypeError: If `grads_and_vars` is malformed.
       ValueError: If none of the variables have gradients.
+      RuntimeError: If you should use `_distributed_apply()` instead.
     """
     # This is a default implementation of apply_gradients() that can be shared
     # by most optimizers.  It relies on the subclass implementing the following
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
+    # Handle DistributionStrategy case.
+    if distribute_lib.get_cross_tower_context():
+      raise RuntimeError("Use `_distributed_apply()` instead of "
+                         "`apply_gradients()` in a cross-tower context.")
+    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
+    # always calling _distributed_apply(), using the default distribution
+    # as needed.
+    if distribute_lib.has_distribution_strategy():
+      grads_and_vars = get_filtered_grad_fn(lambda _: grads_and_vars)()
+      return distribute_lib.get_tower_context().merge_call(
+          self._distributed_apply, grads_and_vars, global_step, name)
+
+    # No DistributionStrategy case.
     grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
     if not grads_and_vars:
       raise ValueError("No variables provided.")
@@ -542,7 +582,7 @@ class Optimizer(checkpointable.Checkpointable):
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, _, v in converted_grads_and_vars],))
     with ops.init_scope():
-      self._create_slots([_get_variable_for(v) for v in var_list])
+      self._create_slots(var_list)
     update_ops = []
     with ops.name_scope(name, self._name) as name:
       self._prepare()
@@ -552,7 +592,12 @@ class Optimizer(checkpointable.Checkpointable):
         # We colocate all ops created in _apply_dense or _apply_sparse
         # on the same device as the variable.
         # TODO(apassos): figure out how to get the variable name here.
-        scope_name = var.op.name if context.in_graph_mode() else ""
+        if context.executing_eagerly() or isinstance(
+            var,
+            resource_variable_ops.ResourceVariable) and not var._in_graph_mode:  # pylint: disable=protected-access
+          scope_name = ""
+        else:
+          scope_name = var.op.name
         with ops.name_scope("update_" + scope_name), ops.colocate_with(var):
           update_ops.append(processor.update_op(self, grad))
       if global_step is None:
@@ -570,7 +615,91 @@ class Optimizer(checkpointable.Checkpointable):
             else:
               apply_updates = state_ops.assign_add(global_step, 1, name=name)
 
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
+  def _distributed_apply(self,
+                         distribution,
+                         grads_and_vars,
+                         global_step=None,
+                         name=None):
+    """A version of `apply_gradients` for cross-tower context.
+
+    This is a version of `apply_gradients()` for when you are using a
+    `DistributionStrategy` and are in a cross-tower context. If in a
+    tower context, use `apply_gradients()` as normal.
+
+    Args:
+      distribution: A `DistributionStrategy` object.
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`, and then aggregated across towers.
+      global_step: Optional (mirrored) `Variable` to increment by one
+        after the variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients across all
+      towers. If `global_step` was not None, that operation also
+      increments `global_step`.
+    """
+    reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+    # Note that this is called in a cross-tower context.
+    self._create_slots(var_list)
+
+    def update(v, g):
+      """Apply gradients to a replica variable."""
+      assert v is not None
+
+      try:
+        # Convert the grad to Tensor or IndexedSlices if necessary.
+        g = ops.convert_to_tensor_or_indexed_slices(g)
+      except TypeError:
+        raise TypeError("Gradient must be convertible to a Tensor"
+                        " or IndexedSlices, or None: %s" % g)
+      if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+        raise TypeError(
+            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+      p = _get_processor(v)
+
+      scope_name = "" if context.executing_eagerly() else v.op.name
+      # device_policy is set because non-mirrored tensors will be read in
+      # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
+      # is an example.
+      with ops.name_scope("update_" + scope_name):
+        return p.update_op(self, g)
+
+    with ops.name_scope(name, self._name) as name:
+      self._prepare()
+
+      update_ops = [
+          op
+          for grad, var in grads_and_vars
+          for op in distribution.unwrap(distribution.update(var, update, grad))
+      ]
+
+      def finish(self, update_ops):
+        return self._finish(update_ops, "update")
+
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      finish_updates = distribution.update_non_slot(
+          non_slot_devices, finish, self, update_ops)
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(distribution.unwrap(finish_updates)):
+          apply_updates = distribution.group(distribution.update(
+              global_step, state_ops.assign_add, 1, name=name))
+
+      if not context.executing_eagerly():
         if isinstance(apply_updates, ops.Tensor):
           apply_updates = apply_updates.op
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
@@ -596,9 +725,25 @@ class Optimizer(checkpointable.Checkpointable):
     Returns:
       The `Variable` for the slot if it was created, `None` otherwise.
     """
+    # pylint: disable=protected-access
     named_slots = self._slots.get(name, None)
     if not named_slots:
       return None
+
+    if hasattr(var, "_mirrored_container"):
+      # NOTE: If this isn't patched, then there is no `handle` in
+      # `_resource_apply_dense`.
+      mirrored_container = var._mirrored_container()
+      assert mirrored_container is not None
+      if context.executing_eagerly():
+        key = mirrored_container._unique_id
+      else:
+        key = (mirrored_container.graph, mirrored_container._shared_name)
+      # pylint: enable=protected-access
+      mirrored_slot = named_slots.get(key, None)
+      if mirrored_slot is None: return None
+      return mirrored_slot.get(device=var.device)
+
     return named_slots.get(_var_key(var), None)
 
   def get_slot_names(self):
@@ -620,7 +765,7 @@ class Optimizer(checkpointable.Checkpointable):
     Returns:
       A list of variables.
     """
-    executing_eagerly = context.in_eager_mode()
+    executing_eagerly = context.executing_eagerly()
     current_graph = ops.get_default_graph()
 
     def _from_current_graph(variable):
@@ -642,22 +787,64 @@ class Optimizer(checkpointable.Checkpointable):
 
   def _create_non_slot_variable(self, initial_value, name, colocate_with):
     """Add an extra variable, not associated with a slot."""
-    if context.in_graph_mode():
-      graph = colocate_with.graph
-    else:
-      graph = None
+    # Recommendation: Use OptimizerV2 if your optimizer uses non-slot variables.
+    eager = context.executing_eagerly()
+    graph = None if eager else colocate_with.graph
 
     key = (name, graph)
     v = self._non_slot_dict.get(key, None)
     if v is None:
-      with ops.colocate_with(colocate_with):
+      self._maybe_initialize_checkpointable()
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(colocate_with):
+        if eager:
+          restored_initial_value = self._preload_simple_restoration(
+              name=name, shape=None)
+          if restored_initial_value is not None:
+            initial_value = restored_initial_value
         v = variable_scope.variable(initial_value, name=name, trainable=False)
+      # Restore this variable by name if necessary, but don't add a
+      # Checkpointable dependency. Optimizers return the current graph's
+      # non-slot variables from _checkpoint_dependencies explicitly rather
+      # than unconditionally adding dependencies (since there may be multiple
+      # non-slot variables with the same name in different graphs, trying to
+      # save all of them would result in errors).
+      self._handle_deferred_dependencies(name=name, checkpointable=v)
       self._non_slot_dict[key] = v
 
     return v
 
+  @property
+  def _checkpoint_dependencies(self):
+    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    current_graph_non_slot_variables = []
+    current_graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+    for (name, _), variable_object in sorted(self._non_slot_dict.items(),
+                                             # Avoid comparing graphs
+                                             key=lambda item: item[0][0]):
+      if variable_object._graph_key == current_graph_key:  # pylint: disable=protected-access
+        current_graph_non_slot_variables.append(
+            checkpointable.CheckpointableReference(
+                name=name, ref=variable_object))
+    return (super(Optimizer, self)._checkpoint_dependencies
+            + current_graph_non_slot_variables)
+
+  def _lookup_dependency(self, name):
+    """From Checkpointable. Find a non-slot variable in the current graph."""
+    unconditional = super(Optimizer, self)._lookup_dependency(name)
+    if unconditional is not None:
+      return unconditional
+    graph = None if context.executing_eagerly() else ops.get_default_graph()
+    return self._get_non_slot_variable(name, graph=graph)
+
   def _get_non_slot_variable(self, name, graph=None):
-    return self._non_slot_dict.get((name, graph), None)
+    non_slot = self._non_slot_dict.get((name, graph), None)
+    if hasattr(non_slot, "_mirrored_container"):
+      # This is a mirrored non-slot.  In order to enable code like `_finish`
+      # to assign to a non-slot, return the current context replica.
+      return non_slot.get()
+    else:
+      return non_slot
 
   def _non_slot_variables(self):
     """Additional variables created by the `Optimizer`.
@@ -987,9 +1174,8 @@ class Optimizer(checkpointable.Checkpointable):
     named_slots = self._slot_dict(slot_name)
     variable_key = _var_key(variable)
     slot_variable = named_slots.get(variable_key, None)
-    if (slot_variable is None
-        and context.in_eager_mode()
-        and slot_variable_position.is_simple_variable()):
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()):
       initializer = checkpointable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self._get_or_make_slot(
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 17ffcd6e0758c9c1bc8bab864b6b7a2a18bc9cbf..fb5e47efa0259d02df3ccf2e9b1430e027f8fcfb 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -56,6 +56,11 @@ PyObject* DoQuantizeTrainingOnGraphDefHelper(
 
 %insert("python") %{
 def do_quantize_training_on_graphdef(input_graph, num_bits):
+  """A general quantization scheme is being developed in @{tf.contrib.quantize}.
+
+  Consider using that instead, though since it is in the tf.contrib namespace,
+  it is not subject to backward compatibility guarantees.
+  """
   from tensorflow.core.framework.graph_pb2 import GraphDef
   from tensorflow.python.framework import errors
   with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/training/queue_runner.py b/tensorflow/python/training/queue_runner.py
index 42559d1e625d89168357e9372e622b61c2c962ca..92207d97cdef095281e63e4e6b732b6e764dcd72 100644
--- a/tensorflow/python/training/queue_runner.py
+++ b/tensorflow/python/training/queue_runner.py
@@ -22,13 +22,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.training.queue_runner_impl import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    # Documented in training.py:
-    "QueueRunner",
-    "add_queue_runner",
-    "start_queue_runners",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index 07afba79abf4d636c9ec2d53bcf2641594a35733..d38c5499c73e1217effbc907077236cb6c8e0ae8 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -89,7 +89,7 @@ class QueueRunner(object):
         restoring from `queue_runner_def`.
       RuntimeError: If eager execution is enabled.
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError(
           "QueueRunners are not supported when eager execution is enabled. "
           "Instead, please use tf.data to get data into your model.")
@@ -441,7 +441,7 @@ def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
   use the `tf.data` API instead.
   @end_compatibility
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError("Queues are not compatible with eager execution.")
   if sess is None:
     sess = ops.get_default_session()
diff --git a/tensorflow/python/training/saveable_object.py b/tensorflow/python/training/saveable_object.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b19294b6545de8105443a46a112a416f6bf481c
--- /dev/null
+++ b/tensorflow/python/training/saveable_object.py
@@ -0,0 +1,99 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Types for specifying saving and loading behavior."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class SaveSpec(object):
+  """Class used to describe tensor slices that need to be saved."""
+
+  def __init__(self, tensor, slice_spec, name, dtype=None):
+    """Creates a `SaveSpec` object.
+
+    Args:
+      tensor: the tensor to save or callable that produces a tensor to save.
+      slice_spec: the slice to be saved. See `Variable.SaveSliceInfo`.
+      name: the name to save the tensor under.
+      dtype: The data type of the Tensor. Required if `tensor` is callable.
+        Used for error checking in the restore op.
+    """
+    self._tensor = tensor
+    self.slice_spec = slice_spec
+    self.name = name
+    if callable(self._tensor):
+      if dtype is None:
+        raise AssertionError(
+            "When passing a callable `tensor` to a SaveSpec, an explicit "
+            "dtype must be provided.")
+      self.dtype = dtype
+    else:
+      self.dtype = tensor.dtype
+
+  @property
+  def tensor(self):
+    return self._tensor() if callable(self._tensor) else self._tensor
+
+
+class SaveableObject(object):
+  """Base class for saving and restoring saveable objects."""
+
+  def __init__(self, op, specs, name):
+    """Creates a `SaveableObject` object.
+
+    Args:
+      op: the "producer" object that this class wraps; it produces a list of
+        tensors to save.  E.g., a "Variable" object saving its backing tensor.
+      specs: a list of SaveSpec, each element of which describes one tensor to
+        save under this object. All Tensors must be on the same device.
+      name: the name to save the object under.
+    """
+    self.op = op
+    self.specs = specs
+    self.name = name
+    self._device = None
+
+  @property
+  def device(self):
+    """The device for SaveSpec Tensors."""
+    # Note that SaveSpec.tensor runs Tensor-gathering ops when executing
+    # eagerly, making this call potentially very expensive.
+    #
+    # TODO(allenl): Consider another way to gather device information. Lower
+    # priority since this property isn't part of the normal save()/restore()
+    # workflow, but does come up when some alternative builders are passed to
+    # the Saver.
+    if self._device is None:
+      self._device = self.specs[0].tensor.device
+    return self._device
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restores this object from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint
+      restored_shapes: the shapes this object should conform to after
+        restore, or None.
+
+    Returns:
+      An operation that restores the state of the object.
+
+    Raises:
+      ValueError: If the object cannot be restored using the provided
+        parameters.
+    """
+    # pylint: disable=unused-argument
+    raise ValueError("Calling an abstract method.")
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 3888e9bba42dc89055638ad0abe2b7e1a9f5b548..53e821c995900cd91e69f5e23f160d37bee7379f 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import collections
 import os.path
 import re
+import sys
 import time
 import uuid
 
@@ -30,8 +31,10 @@ import six
 
 from google.protobuf import text_format
 
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -50,6 +53,8 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import saveable_object
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
@@ -87,67 +92,14 @@ class BaseSaverBuilder(object):
   Can be extended to create different Ops.
   """
 
-  class SaveSpec(object):
-    """Class used to describe tensor slices that need to be saved."""
-
-    def __init__(self, tensor, slice_spec, name):
-      """Creates a `SaveSpec` object.
-
-      Args:
-        tensor: the tensor to save or callable that produces a tensor to save.
-        slice_spec: the slice to be saved. See `Variable.SaveSliceInfo`.
-        name: the name to save the tensor under.
-      """
-      self._tensor = tensor
-      self.slice_spec = slice_spec
-      self.name = name
-
-    @property
-    def tensor(self):
-      return self._tensor() if callable(self._tensor) else self._tensor
-
-  class SaveableObject(object):
-    """Base class for saving and restoring saveable objects."""
-
-    def __init__(self, op, specs, name):
-      """Creates a `SaveableObject` object.
-
-      Args:
-        op: the "producer" object that this class wraps; it produces a list of
-          tensors to save.  E.g., a "Variable" object saving its backing tensor.
-        specs: a list of SaveSpec, each element of which describes one tensor to
-          save under this object.
-        name: the name to save the object under.
-      """
-      self.op = op
-      self.specs = specs
-      self.name = name
-      # The device of this saveable. All tensors must be on the same device.
-      self.device = specs[0].tensor.device
-
-    def restore(self, restored_tensors, restored_shapes):
-      """Restores this object from 'restored_tensors'.
-
-      Args:
-        restored_tensors: the tensors that were loaded from a checkpoint
-        restored_shapes: the shapes this object should conform to after
-          restore, or None.
-
-      Returns:
-        An operation that restores the state of the object.
-
-      Raises:
-        ValueError: If the object cannot be restored using the provided
-          parameters.
-      """
-      # pylint: disable=unused-argument
-      raise ValueError("Calling an abstract method.")
+  SaveSpec = saveable_object.SaveSpec
+  SaveableObject = saveable_object.SaveableObject
 
   class VariableSaveable(SaveableObject):
     """SaveableObject implementation that handles Variables."""
 
     def __init__(self, var, slice_spec, name):
-      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name)
+      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name, dtype=var.dtype)
       super(BaseSaverBuilder.VariableSaveable, self).__init__(var, [spec], name)
 
     def restore(self, restored_tensors, restored_shapes):
@@ -185,7 +137,8 @@ class BaseSaverBuilder(object):
         raise ValueError(
             "Saveable is neither a resource variable nor a read operation."
             " Got: %s" % repr(var))
-      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name)
+      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name,
+                                       dtype=var.dtype)
       super(BaseSaverBuilder.ResourceVariableSaveable, self).__init__(
           var, [spec], name)
 
@@ -196,8 +149,8 @@ class BaseSaverBuilder(object):
       # Copy the restored tensor to the variable's device.
       with ops.device(self._var_device):
         restored_tensor = array_ops.identity(restored_tensor)
-      return resource_variable_ops.shape_safe_assign_variable_handle(
-          self.handle_op, self._var_shape, restored_tensor)
+        return resource_variable_ops.shape_safe_assign_variable_handle(
+            self.handle_op, self._var_shape, restored_tensor)
 
   def __init__(self, write_version=saver_pb2.SaverDef.V2):
     self._write_version = write_version
@@ -294,7 +247,7 @@ class BaseSaverBuilder(object):
               filename_tensor,
               [spec.name],
               [spec.slice_spec],
-              [spec.tensor.dtype])[0])
+              [spec.dtype])[0])
 
     return tensors
   # pylint: enable=unused-argument
@@ -310,8 +263,7 @@ class BaseSaverBuilder(object):
     Returns:
       A string tensor.
     """
-    # pylint: disable=protected-access
-    return gen_io_ops._sharded_filename(filename_tensor, shard, num_shards)
+    return gen_io_ops.sharded_filename(filename_tensor, shard, num_shards)
 
   def _AddSaveOps(self, filename_tensor, saveables):
     """Add ops to save variables that are on the same shard.
@@ -420,8 +372,7 @@ class BaseSaverBuilder(object):
         sharded_saves.append(self._AddSaveOps(sharded_filename, saveables))
     # Return the sharded name for the save path.
     with ops.control_dependencies([x.op for x in sharded_saves]):
-      # pylint: disable=protected-access
-      return gen_io_ops._sharded_filespec(filename_tensor, num_shards_tensor)
+      return gen_io_ops.sharded_filespec(filename_tensor, num_shards_tensor)
 
   def _AddRestoreOps(self,
                      filename_tensor,
@@ -577,10 +528,33 @@ class BaseSaverBuilder(object):
           names_to_saveables[name].append(var)
         else:
           names_to_saveables[name] = [var]
+      elif (isinstance(var, checkpointable.CheckpointableBase)
+            and not isinstance(var, variables.Variable)):
+        checkpointable_saveables = [
+            (factory() if callable(factory) else factory)
+            for factory in var._gather_saveables_for_checkpoint().values()]
+        names_to_saveables.update(
+            BaseSaverBuilder.OpListToDict(checkpointable_saveables))
       else:
-        if context.in_graph_mode():
+        if context.executing_eagerly():
+          if not isinstance(var, resource_variable_ops.ResourceVariable):
+            raise ValueError(
+                "Can only save/restore ResourceVariables when eager execution "
+                "is enabled, type: %s." % type(var))
+          set_var = names_to_saveables.setdefault(var._shared_name, var)
+          if set_var is not var:
+            raise ValueError(
+                ("Two different ResourceVariable objects with the same "
+                 "shared_name '%s' were passed to the Saver. This likely means "
+                 "that they were created in different Graphs or isolation "
+                 "contexts, and may not be checkpointed together.") %
+                (var._shared_name,))
+        else:
           if convert_variable_to_tensor:
-            var = ops.internal_convert_to_tensor(var, as_ref=True)
+            if isinstance(var, resource_variable_ops.ResourceVariable):
+              var = var._graph_element  # pylint: disable=protected-access
+            else:
+              var = ops.internal_convert_to_tensor(var, as_ref=True)
             if not BaseSaverBuilder._IsVariable(var):
               raise TypeError("Variable to save is not a Variable: %s" % var)
           if var.op.type == "ReadVariableOp":
@@ -591,18 +565,6 @@ class BaseSaverBuilder(object):
             raise ValueError("At least two variables have the same name: %s" %
                              name)
           names_to_saveables[name] = var
-        else:
-          if not isinstance(var, resource_variable_ops.ResourceVariable):
-            raise ValueError("Can only save/restore ResourceVariable eager "
-                             "mode is enabled, type: %s." % type(var))
-          set_var = names_to_saveables.setdefault(var._shared_name, var)
-          if set_var is not var:
-            raise ValueError(
-                ("Two different ResourceVariable objects with the same "
-                 "shared_name '%s' were passed to the Saver. This likely means "
-                 "that they were created in different Graphs or isolation "
-                 "contexts, and may not be checkpointed together.") % (
-                     var._shared_name,))
 
       # pylint: enable=protected-access
     return names_to_saveables
@@ -664,13 +626,16 @@ class BaseSaverBuilder(object):
         # pylint: enable=protected-access
       else:
         # A variable or tensor.
-        if context.in_eager_mode():
+        if context.executing_eagerly():
           if not isinstance(op, resource_variable_ops.ResourceVariable):
             raise ValueError("Can only save/restore ResourceVariable eager "
                              "mode is enabled, type: %s." % type(op))
           saveable = BaseSaverBuilder.ResourceVariableSaveable(op, "", name)
         else:
-          variable = ops.internal_convert_to_tensor(op, as_ref=True)
+          if isinstance(op, resource_variable_ops.ResourceVariable):
+            variable = op._graph_element  # pylint: disable=protected-access
+          else:
+            variable = ops.internal_convert_to_tensor(op, as_ref=True)
           if not BaseSaverBuilder._IsVariable(variable):
             raise TypeError("names_to_saveables must be a dict mapping string "
                             "names to Tensors/Variables. Not a variable: %s" %
@@ -768,8 +733,10 @@ class BaseSaverBuilder(object):
                       build_save=True,
                       build_restore=True):
     """build() with option to only perform save and restore."""
-    if context.in_graph_mode() and (not build_save or not build_restore):
-      raise ValueError("Graph mode needs to build save and restore together.")
+    if not context.executing_eagerly() and (not build_save or
+                                            not build_restore):
+      raise ValueError("save and restore operations need to be built together "
+                       " when eager execution is not enabled.")
 
     saveables = self._ValidateAndSliceInputs(names_to_saveables)
     if max_to_keep is None:
@@ -806,22 +773,22 @@ class BaseSaverBuilder(object):
     # such usage model makes sense.
     #
     # assert restore_op.name.endswith("restore_all"), restore_op.name
-    if context.in_graph_mode():
+    if context.executing_eagerly():
+      # Store the tensor values to the tensor_names.
+      save_tensor_name = save_tensor.numpy() if build_save else ""
       return saver_pb2.SaverDef(
-          filename_tensor_name=filename_tensor.name,
-          save_tensor_name=save_tensor.name,
-          restore_op_name=restore_op.name,
+          filename_tensor_name=filename_tensor.numpy(),
+          save_tensor_name=save_tensor_name,
+          restore_op_name="",
           max_to_keep=max_to_keep,
           sharded=sharded,
           keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
           version=self._write_version)
     else:
-      # Store the tensor values to the tensor_names.
-      save_tensor_name = save_tensor.numpy() if build_save else ""
       return saver_pb2.SaverDef(
-          filename_tensor_name=filename_tensor.numpy(),
-          save_tensor_name=save_tensor_name,
-          restore_op_name="",
+          filename_tensor_name=filename_tensor.name,
+          save_tensor_name=save_tensor.name,
+          restore_op_name=restore_op.name,
           max_to_keep=max_to_keep,
           sharded=sharded,
           keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
@@ -839,7 +806,7 @@ class BulkSaverBuilder(BaseSaverBuilder):
     restore_specs = []
     for saveable in saveables:
       for spec in saveable.specs:
-        restore_specs.append((spec.name, spec.slice_spec, spec.tensor.dtype))
+        restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
 
     names, slices, dtypes = zip(*restore_specs)
     # Load all tensors onto CPU 0 for compatibility with existing code.
@@ -1120,8 +1087,9 @@ class Saver(object):
   the proliferation of checkpoint files on disk:
 
   * `max_to_keep` indicates the maximum number of recent checkpoint files to
-    keep.  As new files are created, older files are deleted.  If None or 0,
-    all checkpoint files are kept.  Defaults to 5 (that is, the 5 most recent
+    keep.  As new files are created, older files are deleted.   If None or 0,
+    no checkpoints are deleted from the filesystem but only the last one is
+    kept in the `checkpoint` file.  Defaults to 5 (that is, the 5 most recent
     checkpoint files are kept.)
 
   * `keep_checkpoint_every_n_hours`: In addition to keeping the most recent
@@ -1270,7 +1238,7 @@ class Saver(object):
       raise ValueError(
           "If `var_list` is provided then build cannot be deferred. "
           "Either set defer_build=False or var_list=None.")
-    if context.in_eager_mode() and var_list is None:
+    if context.executing_eagerly() and var_list is None:
       raise RuntimeError(
           "When eager execution is enabled, `var_list` must specify a list or "
           "dict of variables to save")
@@ -1289,15 +1257,23 @@ class Saver(object):
     self._write_version = write_version
     self._pad_step_number = pad_step_number
     self._filename = filename
-    if not defer_build and context.in_graph_mode():
+    self._last_checkpoints = []
+    self._checkpoints_to_be_deleted = []
+    if context.executing_eagerly():
+      self._next_checkpoint_time = (
+          time.time() + self._keep_checkpoint_every_n_hours * 3600)
+    elif not defer_build:
       self.build()
     if self.saver_def:
       self._check_saver_def()
       self._write_version = self.saver_def.version
     self._save_relative_paths = save_relative_paths
+    # For compatibility with object-based checkpoints, we may build a second
+    # Saver to read the renamed keys.
+    self._object_restore_saver = None
 
   def build(self):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("Use save/restore instead of build in eager mode.")
     self._build(self._filename, build_save=True, build_restore=True)
 
@@ -1307,12 +1283,12 @@ class Saver(object):
 
   def _build(self, checkpoint_path, build_save, build_restore):
     """Builds saver_def."""
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       if self._is_built:
         return
       self._is_built = True
 
-    if not self.saver_def or context.in_eager_mode():
+    if not self.saver_def or context.executing_eagerly():
       if self._builder is None:
         self._builder = BulkSaverBuilder(self._write_version)
 
@@ -1349,17 +1325,17 @@ class Saver(object):
           self.saver_def.restore_op_name, self._name)
 
     self._check_saver_def()
-    # Updates next checkpoint time.
-    self._next_checkpoint_time = (
-        time.time() + self.saver_def.keep_checkpoint_every_n_hours * 3600)
-    self._last_checkpoints = []
-    self._checkpoints_to_be_deleted = []
+    if not context.executing_eagerly():
+      # Updates next checkpoint time.
+      # Set in __init__ when executing eagerly.
+      self._next_checkpoint_time = (
+          time.time() + self.saver_def.keep_checkpoint_every_n_hours * 3600)
 
   def _check_saver_def(self):
     if not isinstance(self.saver_def, saver_pb2.SaverDef):
       raise ValueError("saver_def must be a saver_pb2.SaverDef: %s" %
                        self.saver_def)
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       if not self.saver_def.save_tensor_name:
         raise ValueError("saver_def must specify the save_tensor_name: %s" %
                          str(self.saver_def))
@@ -1609,7 +1585,7 @@ class Saver(object):
       RuntimeError: If save and restore ops weren't built.
     """
     # pylint: enable=line-too-long
-    if not self._is_built and context.in_graph_mode():
+    if not self._is_built and not context.executing_eagerly():
       raise RuntimeError(
           "`build()` should be called before save if defer_build==True")
     if latest_filename is None:
@@ -1641,21 +1617,21 @@ class Saver(object):
             "'latest_filename' collides with 'save_path': '%s' and '%s'" %
             (latest_filename, save_path))
 
-    if (context.in_graph_mode() and
+    if (not context.executing_eagerly() and
         not isinstance(sess, session.SessionInterface)):
       raise TypeError("'sess' must be a Session; %s" % sess)
 
     save_path_parent = os.path.dirname(save_path)
     if not self._is_empty:
       try:
-        if context.in_graph_mode():
-          model_checkpoint_path = sess.run(
-              self.saver_def.save_tensor_name,
-              {self.saver_def.filename_tensor_name: checkpoint_file})
-        else:
+        if context.executing_eagerly():
           self._build_eager(
               checkpoint_file, build_save=True, build_restore=False)
           model_checkpoint_path = self.saver_def.save_tensor_name
+        else:
+          model_checkpoint_path = sess.run(
+              self.saver_def.save_tensor_name,
+              {self.saver_def.filename_tensor_name: checkpoint_file})
 
         model_checkpoint_path = compat.as_str(model_checkpoint_path)
         if write_state:
@@ -1677,7 +1653,7 @@ class Saver(object):
     if write_meta_graph:
       meta_graph_filename = self._MetaGraphFilename(
           checkpoint_file, meta_graph_suffix=meta_graph_suffix)
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         with sess.graph.as_default():
           self.export_meta_graph(
               meta_graph_filename, strip_default_attrs=strip_default_attrs)
@@ -1750,11 +1726,63 @@ class Saver(object):
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
     logging.info("Restoring parameters from %s", save_path)
-    if context.in_graph_mode():
-      sess.run(self.saver_def.restore_op_name,
-               {self.saver_def.filename_tensor_name: save_path})
-    else:
-      self._build_eager(save_path, build_save=False, build_restore=True)
+    try:
+      if context.executing_eagerly():
+        self._build_eager(save_path, build_save=False, build_restore=True)
+      else:
+        sess.run(self.saver_def.restore_op_name,
+                 {self.saver_def.filename_tensor_name: save_path})
+    except errors.NotFoundError:
+      exception_type, exception_value, exception_traceback = sys.exc_info()
+      # The checkpoint would not be loaded successfully as is. Try to parse it
+      # as an object-based checkpoint.
+      try:
+        reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+        object_graph_string = reader.get_tensor(
+            checkpointable.OBJECT_GRAPH_PROTO_KEY)
+      except errors.NotFoundError:
+        # This is not an object-based checkpoint, or the checkpoint doesn't
+        # exist. Re-raise the original exception.
+        six.reraise(exception_type, exception_value, exception_traceback)
+      del exception_traceback  # avoid reference cycles
+
+      # This is an object-based checkpoint. We'll print a warning and then do
+      # the restore.
+      logging.warning(
+          "Restoring an object-based checkpoint using a name-based saver. This "
+          "may be somewhat fragile, and will re-build the Saver. Instead, "
+          "consider loading object-based checkpoints using "
+          "tf.train.Checkpoint().")
+      self._restore_from_object_based_checkpoint(
+          sess=sess, save_path=save_path,
+          object_graph_string=object_graph_string)
+
+  def _restore_from_object_based_checkpoint(self, sess, save_path,
+                                            object_graph_string):
+    """A compatibility mode for reading object-based checkpoints."""
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+    object_graph_proto.ParseFromString(object_graph_string)
+    names_to_keys = {}
+    for node in object_graph_proto.nodes:
+      for attribute in node.attributes:
+        names_to_keys[attribute.full_name] = attribute.checkpoint_key
+    saveables = self._builder._ValidateAndSliceInputs(self._var_list)  # pylint: disable=protected-access
+    for saveable in saveables:
+      for spec in saveable.specs:
+        if spec.name not in names_to_keys:
+          raise errors.NotFoundError(
+              None, None,
+              message=("Attempting to load an object-based checkpoint using "
+                       "variable names, but could not find %s in the "
+                       "checkpoint.") % spec.name)
+        spec.name = names_to_keys[spec.name]
+    if self._object_restore_saver is None:
+      # Cache the Saver so multiple restore() calls don't pollute the graph when
+      # graph building. This assumes keys are consistent (i.e. this is the same
+      # type of object-based checkpoint we saw previously).
+      self._object_restore_saver = Saver(saveables)
+    self._object_restore_saver.restore(sess=sess, save_path=save_path)
 
   @staticmethod
   def _add_collection_def(meta_graph_def, key, export_scope=None):
@@ -1894,7 +1922,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   execution is enabled.
   @end_compatibility
   """  # pylint: disable=g-doc-exception
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
                        "execution is enabled.")
@@ -1903,12 +1931,22 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   else:
     meta_graph_def = meta_graph_or_file
 
-  meta_graph.import_scoped_meta_graph(meta_graph_def,
-                                      clear_devices=clear_devices,
-                                      import_scope=import_scope,
-                                      **kwargs)
+  imported_vars = meta_graph.import_scoped_meta_graph(
+      meta_graph_def,
+      clear_devices=clear_devices,
+      import_scope=import_scope,
+      **kwargs)
+
   if meta_graph_def.HasField("saver_def"):
-    return Saver(saver_def=meta_graph_def.saver_def, name=import_scope)
+    # Infer the scope that is prepended by `import_scoped_meta_graph`.
+    scope = import_scope
+    var_names = list(imported_vars.keys())
+    if var_names:
+      sample_key = var_names[0]
+      sample_var = imported_vars[sample_key]
+      scope = sample_var.name[:-len(sample_key)]
+
+    return Saver(saver_def=meta_graph_def.saver_def, name=scope)
   else:
     if variables._all_saveable_objects():  # pylint: disable=protected-access
       # Return the default saver instance for all graph variables.
@@ -1949,7 +1987,7 @@ def export_meta_graph(filename=None,
     saver_def: `SaverDef` protocol buffer.
     collection_list: List of string keys to collect.
     as_text: If `True`, writes the `MetaGraphDef` as an ASCII proto.
-    graph: The `Graph` to import into. If `None`, use the default graph.
+    graph: The `Graph` to export. If `None`, use the default graph.
     export_scope: Optional `string`. Name scope under which to extract
       the subgraph. The scope name will be striped from the node definitions
       for easy import later into new name scopes. If `None`, the whole graph
@@ -1977,7 +2015,7 @@ def export_meta_graph(filename=None,
   @end_compatibility
   """
   # pylint: enable=line-too-long
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
                        "execution is enabled.")
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index c5a6f49df599434ab3bc1a9fe3d85db6f824071e..70495291bc57629252423d86e35987c5c0d2b1ee 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import functools
 import math
 import os
 import random
@@ -35,6 +36,7 @@ from google.protobuf import text_format
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import queue_runner_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
@@ -49,10 +51,13 @@ from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
@@ -66,10 +71,13 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
+from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
 
@@ -89,7 +97,7 @@ class SaverTest(test.TestCase):
       v2_init = v2.insert("k1", 30.0)
 
       # Initialize all variables
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.evaluate([variables.global_variables_initializer(), v2_init])
 
         # Check that the parameter nodes have been initialized.
@@ -117,7 +125,7 @@ class SaverTest(test.TestCase):
       v2 = saver_test_utils.CheckpointedOp(name="v2")
 
       # Assert that the variables are not initialized.
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.assertEqual(
             len(variables.report_uninitialized_variables().eval()), 2)
         self.assertEqual(0, len(v2.keys().eval()))
@@ -140,7 +148,7 @@ class SaverTest(test.TestCase):
       v2_init = v2_2.insert("k1000", 3000.0)
 
       # Check that the parameter nodes have been initialized.
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         init_all_op = [variables.global_variables_initializer(), v2_init]
         self.evaluate(init_all_op)
         # TODO(xpan): Why _mutable_hash_table_v2 doesn't create empty
@@ -249,10 +257,10 @@ class SaverTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       v = resource_variable_ops.ResourceVariable([1], caching_device="/cpu:0",
                                                  name="v")
-      if context.in_graph_mode():
-        self.evaluate(variables.global_variables_initializer())
-      else:
+      if context.executing_eagerly():
         sess = None
+      else:
+        self.evaluate(variables.global_variables_initializer())
       save = saver_module.Saver([v])
       save.save(sess, save_path)
 
@@ -260,6 +268,24 @@ class SaverTest(test.TestCase):
       save2.restore(sess, save_path)
       self.assertEquals(self.evaluate(v), [1])
 
+  def testNoAdditionalOpsAddedBySaverForResourceVariablesOutsideSaveScope(self):
+    with ops_lib.Graph().as_default() as g:
+      v = resource_variable_ops.ResourceVariable(1.0, name="v")
+      with ops_lib.name_scope("saver1"):
+        saver_module.Saver()
+      with ops_lib.name_scope("saver2"):
+        saver_module.Saver({"name": v})
+    ops_in_saver1_scope_but_not_save_scope = [
+        op for op in g.get_operations()
+        if (op.name.startswith("saver1/") and
+            not op.name.startswith("saver1/save/"))]
+    self.assertEqual(ops_in_saver1_scope_but_not_save_scope, [])
+    ops_in_saver2_scope_but_not_save_scope = [
+        op for op in g.get_operations()
+        if (op.name.startswith("saver2/") and
+            not op.name.startswith("saver2/save/"))]
+    self.assertEqual(ops_in_saver2_scope_but_not_save_scope, [])
+
   def testSaveCopyRestoreWithSaveRelativePaths(self):
     """Save, copy checkpoint dir and restore from copied dir.
 
@@ -497,7 +523,7 @@ class SaverTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       var = resource_variable_ops.ResourceVariable(var_value, name=var_name)
       save = saver_module.Saver({var_name: var})
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.evaluate(var.initializer)
       val = save.save(sess, save_path)
       self.assertEqual(save_path, val)
@@ -657,11 +683,11 @@ class SaverTest(test.TestCase):
             {
                 var._shared_name: var
             }, pad_step_number=pad_step_number)
-        if context.in_graph_mode():
+        if context.executing_eagerly():
+          sess = None
+        else:
           self.evaluate(var.initializer)
           sess = ops_lib.get_default_session()
-        else:
-          sess = None
         if use_tensor:
           global_step = constant_op.constant(global_step_int)
           val = save.save(sess, save_path, global_step=global_step)
@@ -1039,6 +1065,77 @@ class MaxToKeepTest(test.TestCase):
     self.assertEqual(checkpoint_state.all_model_checkpoint_paths,
                      all_model_checkpoint_paths)
 
+  def testMaxToKeepEager(self):
+    with context.eager_mode():
+      save_dir = self._get_test_dir("max_to_keep_non_sharded")
+
+      v = variable_scope.variable(10.0, name="v")
+      save = saver_module.Saver({"v": v}, max_to_keep=2)
+      self.evaluate(variables.global_variables_initializer())
+      if not context.executing_eagerly():
+        self.assertEqual([], save.last_checkpoints)
+
+      s1 = save.save(None, os.path.join(save_dir, "s1"))
+      self.assertEqual([s1], save.last_checkpoints)
+      self.assertTrue(saver_module.checkpoint_exists(s1))
+      self.assertCheckpointState(
+          model_checkpoint_path=s1,
+          all_model_checkpoint_paths=[s1],
+          save_dir=save_dir)
+
+      s2 = save.save(None, os.path.join(save_dir, "s2"))
+      self.assertEqual([s1, s2], save.last_checkpoints)
+      self.assertTrue(saver_module.checkpoint_exists(s1))
+      self.assertTrue(saver_module.checkpoint_exists(s2))
+      self.assertCheckpointState(
+          model_checkpoint_path=s2,
+          all_model_checkpoint_paths=[s1, s2],
+          save_dir=save_dir)
+
+      s3 = save.save(None, os.path.join(save_dir, "s3"))
+      self.assertEqual([s2, s3], save.last_checkpoints)
+      self.assertFalse(saver_module.checkpoint_exists(s1))
+      self.assertTrue(saver_module.checkpoint_exists(s2))
+      self.assertTrue(saver_module.checkpoint_exists(s3))
+      self.assertCheckpointState(
+          model_checkpoint_path=s3,
+          all_model_checkpoint_paths=[s2, s3],
+          save_dir=save_dir)
+
+      # Create a second helper, identical to the first.
+      save2 = saver_module.Saver({"v": v}, max_to_keep=2)
+      save2.set_last_checkpoints(save.last_checkpoints)
+
+      # Exercise the first helper.
+
+      # Adding s2 again (old s2 is removed first, then new s2 appended)
+      s2 = save.save(None, os.path.join(save_dir, "s2"))
+      self.assertEqual([s3, s2], save.last_checkpoints)
+      self.assertFalse(saver_module.checkpoint_exists(s1))
+      self.assertTrue(saver_module.checkpoint_exists(s3))
+      self.assertTrue(saver_module.checkpoint_exists(s2))
+      self.assertCheckpointState(
+          model_checkpoint_path=s2,
+          all_model_checkpoint_paths=[s3, s2],
+          save_dir=save_dir)
+
+      # Adding s1 (s3 should now be deleted as oldest in list)
+      s1 = save.save(None, os.path.join(save_dir, "s1"))
+      self.assertEqual([s2, s1], save.last_checkpoints)
+      self.assertFalse(saver_module.checkpoint_exists(s3))
+      self.assertTrue(saver_module.checkpoint_exists(s2))
+      self.assertCheckpointState(
+          model_checkpoint_path=s1,
+          all_model_checkpoint_paths=[s2, s1],
+          save_dir=save_dir)
+
+      s2 = save2.save(None, os.path.join(save_dir, "s2"))
+      self.assertEqual([s3, s2], save2.last_checkpoints)
+      # Created by the first helper.
+      self.assertTrue(saver_module.checkpoint_exists(s1))
+      # Deleted by the first helper.
+      self.assertFalse(saver_module.checkpoint_exists(s3))
+
   def testNonSharded(self):
     save_dir = self._get_test_dir("max_to_keep_non_sharded")
 
@@ -1301,15 +1398,16 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
     gfile.MakeDirs(test_dir)
     return test_dir
 
+  @test_util.run_in_graph_and_eager_modes()
   @test.mock.patch.object(saver_module, "time")
   def testNonSharded(self, mock_time):
     save_dir = self._get_test_dir("keep_checkpoint_every_n_hours")
 
     with self.test_session() as sess:
-      v = variables.Variable([10.0], name="v")
+      v = variable_scope.variable([10.0], name="v")
       # Run the initializer NOW to avoid the 0.5s overhead of the first Run()
       # call, which throws the test timing off in fastbuild mode.
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Create a saver that will keep the last 2 checkpoints plus one every 0.7
       # seconds.
       start_time = time.time()
@@ -1387,7 +1485,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       v0 = variable_op(-1.0, name="v0")
       v1 = variable_op(-1.0, name="v1")
 
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         with self.assertRaisesOpError("uninitialized"):
           self.evaluate(v0)
         with self.assertRaisesOpError("uninitialized"):
@@ -1397,7 +1495,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       save.restore(sess, save_path)
 
       # Check that the parameter nodes have been restored.
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         self.assertEqual(10.0, self.evaluate(v0))
         self.assertEqual(20.0, self.evaluate(v1))
 
@@ -1407,7 +1505,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       v0 = variable_op(-1.0, name="restore_prefix/v0")
       v1 = variable_op(-1.0, name="restore_prefix/v1")
 
-      if context.in_graph_mode():
+      if not context.executing_eagerly():
         with self.assertRaisesOpError("uninitialized"):
           self.evaluate(v0)
         with self.assertRaisesOpError("uninitialized"):
@@ -2039,6 +2137,113 @@ class MetaGraphTest(test.TestCase):
     self._testGraphExtensionRestore(test_dir)
     self._testRestoreFromTrainGraphWithControlContext(test_dir)
 
+  def _testGradientSerDes(self, graph_fn):
+    """Tests that gradients can be computed after exporting and importing.
+
+    Builds a graph, exports it, and verifies that it can be imported and the
+    gradient can be built and run correctly.
+
+    Args:
+      graph_fn: takes a single float Tensor argument as input, outputs a single
+        Tensor
+    """
+    test_dir = self._get_test_dir("nested_control_flow")
+    filename = os.path.join(test_dir, "metafile")
+    saver_ckpt = os.path.join(test_dir, "saver.ckpt")
+
+    # Create while loop using `outer_body_fn`.
+    with ops_lib.Graph().as_default():
+      var = variables.Variable(0.0)
+      var_name = var.name
+      output = graph_fn(var)
+      output_name = output.name
+      init_op = variables.global_variables_initializer()
+
+      # Generate a MetaGraphDef containing the while loop.
+      with session.Session() as sess:
+        sess.run(init_op)
+        sess.run(output)
+        saver = saver_module.Saver()
+        saver.save(sess, saver_ckpt)
+        saver.export_meta_graph(filename)
+
+      # Build and run the gradients of the while loop. We use this below to
+      # verify that the gradients are correct with an imported MetaGraphDef.
+      grad = gradients_impl.gradients([output], [var])
+      # Turn off constant folding to avoid breaking testNestedControlFlowSerDes.
+      # It appears that a missing control dependency in the gradient graph
+      # causes the fetch node to not be triggered.
+      no_constfold_config = config_pb2.ConfigProto()
+      no_constfold_config.graph_options.rewrite_options.constant_folding = (
+          rewriter_config_pb2.RewriterConfig.OFF)
+      with session.Session(config=no_constfold_config) as sess:
+        sess.run(init_op)
+        expected_grad_value = sess.run(grad)
+
+    # Restore the MetaGraphDef into a new Graph.
+    with ops_lib.Graph().as_default():
+      with session.Session() as sess:
+        saver = saver_module.import_meta_graph(filename)
+        saver.restore(sess, saver_ckpt)
+
+      # Make sure we can still build gradients and get the same result.
+      var = ops_lib.get_default_graph().get_tensor_by_name(var_name)
+      output = ops_lib.get_default_graph().get_tensor_by_name(output_name)
+      grad = gradients_impl.gradients([output], [var])
+
+      init_op = variables.global_variables_initializer()
+
+      with session.Session(config=no_constfold_config) as sess:
+        sess.run(init_op)
+        actual_grad_value = sess.run(grad)
+        self.assertEqual(expected_grad_value, actual_grad_value)
+
+  def _testWhileLoopAndGradientSerDes(self, outer_body_fn):
+    # Build a while loop with `outer_body_fn`, export it, and verify that it can
+    # be imported and the gradient can be built and run correctly.
+    # pylint: disable=g-long-lambda
+    return self._testGradientSerDes(
+        lambda x: control_flow_ops.while_loop(
+            lambda i, y: i < 5, outer_body_fn, [0, x])[1])
+    # pylint: enable=g-long-lambda
+
+  def testNestedWhileLoopsSerDes(self):
+    # Test two simple nested while loops.
+    def body(i, x):
+      _, r = control_flow_ops.while_loop(lambda j, y: j < 3,
+                                         lambda j, y: (j + 1, y + x),
+                                         [0, 0.0])
+      return i + 1, x + r
+    self._testWhileLoopAndGradientSerDes(body)
+
+  def testNestedControlFlowSerDes(self):
+    # Test while loop in a cond in a while loop.
+    # pylint: disable=g-long-lambda
+    def body(i, x):
+      cond_result = control_flow_ops.cond(
+          i > 0,
+          lambda: control_flow_ops.while_loop(
+              lambda j, y: j < 3,
+              lambda j, y: (j + 1, y + x),
+              [0, 0.0])[1],
+          lambda: x)
+      return i + 1, cond_result
+    # pylint: enable=g-long-lambda
+    self._testWhileLoopAndGradientSerDes(body)
+
+  def testNestedCondsSerDes(self):
+    # Test conds in a cond.
+    # pylint: disable=g-long-lambda
+    self._testGradientSerDes(lambda x: control_flow_ops.cond(
+        x > 0,
+        lambda: control_flow_ops.cond(x > 3,
+                                      lambda: array_ops.identity(x),
+                                      lambda: math_ops.multiply(x, 2.0)),
+        lambda: control_flow_ops.cond(x < -3,
+                                      lambda: constant_op.constant(1.0),
+                                      lambda: math_ops.multiply(x, -1.0))))
+    # pylint: enable=g-long-lambda
+
   def testStrippedOpListDef(self):
     with self.test_session():
       # Creates a graph.
@@ -2141,6 +2346,38 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testImportIntoImplicitNamescope(self):
+    # Test that we can import a meta graph into an implicit namescope.
+    test_dir = self._get_test_dir("import_into_namescope")
+    filename = os.path.join(test_dir, "ckpt")
+    image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
+    label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
+    with session.Session() as sess:
+      weights = variables.Variable(
+          random_ops.random_uniform([784, 10]), name="weights")
+      bias = variables.Variable(array_ops.zeros([10]), name="bias")
+      logit = nn_ops.relu(math_ops.matmul(image, weights) + bias, name="logits")
+      nn_ops.softmax(logit, name="prediction")
+      cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
+                                                      logits=logit, name="cost")
+      adam.AdamOptimizer().minimize(cost, name="optimize")
+      saver = saver_module.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, filename)
+
+    graph = ops_lib.Graph()
+    with session.Session(graph=graph) as sess:
+      with ops_lib.name_scope("new_model"):
+        new_saver = saver_module.import_meta_graph(
+            filename + ".meta", graph=graph)
+
+      new_saver.restore(sess, filename)
+      sess.run(["new_model/optimize"], {
+          "new_model/image:0": np.random.random([1, 784]),
+          "new_model/label:0": np.random.randint(
+              10, size=[1, 10])
+      })
+
   def testClearDevicesOnImport(self):
     # Test that we import a graph without its devices and run successfully.
     with ops_lib.Graph().as_default():
@@ -2494,7 +2731,7 @@ class ScopedGraphTest(test.TestCase):
       # The rest of the variables.
       rest_variables = list(
           set(variables.global_variables()) - set(var_list.keys()))
-      init_rest_op = variables.initialize_variables(rest_variables)
+      init_rest_op = variables.variables_initializer(rest_variables)
 
     with self.test_session(graph=graph) as sess:
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
@@ -2660,5 +2897,270 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(2.0, var_dict2["variable2:0"].eval())
 
 
+class _OwnsAVariableSimple(checkpointable.CheckpointableBase):
+  """A Checkpointable object which can be saved using a tf.train.Saver."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    return {checkpointable.VARIABLE_VALUE_KEY: self.non_dep_variable}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class _MirroringSaveable(
+    saver_module.BaseSaverBuilder.ResourceVariableSaveable):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    super(_MirroringSaveable, self).__init__(
+        self._primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+@test_util.with_c_api
+class CheckpointableCompatibilityTests(test.TestCase):
+
+  # TODO(allenl): Track down python3 reference cycles in these tests.
+  @test_util.run_in_graph_and_eager_modes()
+  def testNotSaveableButIsCheckpointable(self):
+    v = _OwnsAVariableSimple()
+    saver = saver_module.Saver(var_list=[v])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    with self.test_session() as sess:
+      save_path = saver.save(sess, prefix)
+      self.evaluate(v.non_dep_variable.assign(43.))
+      saver.restore(sess, save_path)
+      self.assertEqual(42., self.evaluate(v.non_dep_variable))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturned(self):
+    v = _OwnsMirroredVariables()
+    saver = saver_module.Saver(var_list=[v])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    with self.test_session() as sess:
+      save_path = saver.save(sess, prefix)
+      self.evaluate(v.non_dep_variable.assign(43.))
+      self.evaluate(v.mirrored.assign(44.))
+      saver.restore(sess, save_path)
+      self.assertEqual(42., self.evaluate(v.non_dep_variable))
+      self.assertEqual(42., self.evaluate(v.mirrored))
+
+  def testSingleTensorEvaluation(self):
+
+    class _CountingSaveable(saver_module.BaseSaverBuilder.SaveableObject):
+
+      def __init__(self, name):
+        self.eval_count = 0
+        def _tensor():
+          self.eval_count += 1
+          return constant_op.constant([1.])
+        dummy_op = constant_op.constant([2.])
+        super(_CountingSaveable, self).__init__(
+            dummy_op,
+            [saver_module.BaseSaverBuilder.SaveSpec(
+                _tensor, "", name, dtype=dummy_op.dtype)],
+            name)
+
+      def restore(self, restored_tensors, restored_shapes):
+        """Restore the same value into both variables."""
+        pass
+
+    with context.eager_mode():
+      v = _CountingSaveable("foo")
+      saver = saver_module.Saver(var_list=[v])
+      test_dir = self.get_temp_dir()
+      prefix = os.path.join(test_dir, "ckpt")
+      with self.test_session() as sess:
+        save_path = saver.save(sess, prefix)
+        self.assertEqual(1, v.eval_count)
+        saver.restore(sess, save_path)
+        self.assertEqual(1, v.eval_count)
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def testVariableNotFoundErrorRaised(self):
+    # Restore does some tricky exception handling to figure out if it should
+    # load an object-based checkpoint. Tests that the exception handling isn't
+    # too broad.
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    a = resource_variable_ops.ResourceVariable(1., name="a")
+    b = resource_variable_ops.ResourceVariable(1., name="b")
+    a_saver = saver_module.Saver([a])
+    b_saver = saver_module.Saver([b])
+    with self.test_session() as sess:
+      sess.run(a.initializer)
+      save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
+      with self.assertRaisesRegexp(
+          errors.NotFoundError, "Key b not found in checkpoint"):
+        b_saver.restore(sess=sess, save_path=save_path)
+
+  def testCheckpointNotFoundErrorRaised(self):
+    # Restore does some tricky exception handling to figure out if it should
+    # load an object-based checkpoint. Tests that the exception handling isn't
+    # too broad.
+    a = resource_variable_ops.ResourceVariable(1., name="a")
+    saver = saver_module.Saver([a])
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(
+          errors.NotFoundError,
+          "Failed to find any matching files for path_which_does_not_exist"):
+        saver.restore(sess=sess, save_path="path_which_does_not_exist")
+
+  def testLoadFromObjectBasedGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+
+      # An incompatible object-based checkpoint to check error messages
+      var = resource_variable_ops.ResourceVariable(1., name="a")
+      self.evaluate(var.initializer)
+      second_saver = checkpointable_utils.CheckpointableSaver(var)
+      second_path = second_saver.save(file_prefix=os.path.join(
+          checkpoint_directory, "second"))
+
+    restore_graph = ops_lib.Graph()
+    with restore_graph.as_default(), self.test_session(
+        graph=restore_graph) as sess:
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      saver = saver_module.Saver()
+      saver.restore(sess=sess, save_path=save_path)
+      self._check_sentinels(root)
+      before_second_restore_ops = restore_graph.get_operations()
+      # Test that multiple restores do not pollute the graph
+      saver.restore(sess=sess, save_path=save_path)
+      self.assertEqual(before_second_restore_ops,
+                       restore_graph.get_operations())
+      with self.assertRaisesRegexp(errors.NotFoundError,
+                                   "could not find a_variable"):
+        saver.restore(sess=sess, save_path=second_path)
+
+  def testLoadFromObjectBasedEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      saver = saver_module.Saver(
+          root.model.variables + root.optimizer.variables())
+      saver.restore(sess=None, save_path=save_path)
+      self._check_sentinels(root)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/saver_test_utils.py b/tensorflow/python/training/saver_test_utils.py
index 44b06b357ecbe4c8e330a2ccc49e83ddd4bf8c7d..2bbe5b6d845c304c4dc79fb3619c57211ca0489e 100644
--- a/tensorflow/python/training/saver_test_utils.py
+++ b/tensorflow/python/training/saver_test_utils.py
@@ -35,12 +35,12 @@ class CheckpointedOp(object):
   # pylint: disable=protected-access
   def __init__(self, name, table_ref=None):
     if table_ref is None:
-      self.table_ref = gen_lookup_ops._mutable_hash_table_v2(
+      self.table_ref = gen_lookup_ops.mutable_hash_table_v2(
           key_dtype=dtypes.string, value_dtype=dtypes.float32, name=name)
     else:
       self.table_ref = table_ref
     self._name = name
-    if context.in_graph_mode():
+    if not context.executing_eagerly():
       self._saveable = CheckpointedOp.CustomSaveable(self, name)
       ops_lib.add_to_collection(ops_lib.GraphKeys.SAVEABLE_OBJECTS,
                                 self._saveable)
@@ -51,16 +51,16 @@ class CheckpointedOp(object):
 
   @property
   def saveable(self):
-    if context.in_graph_mode():
-      return self._saveable
-    else:
+    if context.executing_eagerly():
       return CheckpointedOp.CustomSaveable(self, self.name)
+    else:
+      return self._saveable
 
   def insert(self, keys, values):
-    return gen_lookup_ops._lookup_table_insert_v2(self.table_ref, keys, values)
+    return gen_lookup_ops.lookup_table_insert_v2(self.table_ref, keys, values)
 
   def lookup(self, keys, default):
-    return gen_lookup_ops._lookup_table_find_v2(self.table_ref, keys, default)
+    return gen_lookup_ops.lookup_table_find_v2(self.table_ref, keys, default)
 
   def keys(self):
     return self._export()[0]
@@ -69,8 +69,8 @@ class CheckpointedOp(object):
     return self._export()[1]
 
   def _export(self):
-    return gen_lookup_ops._lookup_table_export_v2(self.table_ref, dtypes.string,
-                                                  dtypes.float32)
+    return gen_lookup_ops.lookup_table_export_v2(self.table_ref, dtypes.string,
+                                                 dtypes.float32)
 
   class CustomSaveable(saver_module.BaseSaverBuilder.SaveableObject):
     """A custom saveable for CheckpointedOp."""
@@ -86,6 +86,6 @@ class CheckpointedOp(object):
       super(CheckpointedOp.CustomSaveable, self).__init__(table, specs, name)
 
     def restore(self, restore_tensors, shapes):
-      return gen_lookup_ops._lookup_table_import_v2(
+      return gen_lookup_ops.lookup_table_import_v2(
           self.op.table_ref, restore_tensors[0], restore_tensors[1])
   # pylint: enable=protected-access
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 360e02fb44c1062f71bb50449b9ef381510a9c69..3cb3877cc2fb846477640058b5bcdfea97ee6828 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -229,10 +229,14 @@ class SessionManager(object):
     up to `max_wait_secs`, for recovery to succeed.
 
     If the model cannot be recovered successfully then it is initialized by
-    either running the provided `init_op`, or calling the provided `init_fn`.
-    The local_init_op is also run after init_op and init_fn, regardless of
+    running the `init_op` and calling `init_fn` if they are provided.
+    The `local_init_op` is also run after init_op and init_fn, regardless of
     whether the model was recovered successfully, but only if
-    ready_for_local_init_op passes.
+    `ready_for_local_init_op` passes.
+
+    If the model is recovered from a checkpoint it is assumed that all
+    global variables have been initialized, in particular neither `init_op`
+    nor `init_fn` will be executed.
 
     It is an error if the model cannot be recovered and no `init_op`
     or `init_fn` or `local_init_op` are passed.
@@ -259,8 +263,6 @@ class SessionManager(object):
 
     Raises:
       RuntimeError: If the model cannot be initialized or recovered.
-
-    Raises:
       ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
         set.
     """
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 89f40300650f3b6cd1ae15d946640c9df91771e2..5daea9312886599f4119b088096434a8b2a258de 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -84,11 +84,6 @@ Note that if sess.run() raises OutOfRangeError or StopIteration then
 hooks.after_run() will not be called but hooks.end() will still be called.
 If sess.run() raises any other exception then neither hooks.after_run() nor
 hooks.end() will be called.
-
-@@SessionRunHook
-@@SessionRunArgs
-@@SessionRunContext
-@@SessionRunValues
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 75ef3d5976aba9f0cbe849d9f6984646d71a29ef..258a6f045d7c1b491ce00bdf8dd0ae6ad500ba68 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -40,12 +40,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
@@ -106,10 +106,14 @@ def create_slot(primary, val, name, colocate_with_primary=True):
   # and the same name has been previously used, the scope name will add '_N'
   # as suffix for unique identifications.
   validate_shape = val.get_shape().is_fully_defined()
-  prefix = primary.op.name if context.in_graph_mode() else primary._shared_name  # pylint: disable=protected-access
+  if context.executing_eagerly():
+    prefix = primary._shared_name  # pylint: disable=protected-access
+  else:
+    prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      with ops.colocate_with(primary):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
       return _create_slot_var(primary, val, "", validate_shape, None, None)
@@ -139,10 +143,14 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
   # and the same name has been previously used, the scope name will add '_N'
   # as suffix for unique identifications.
   validate_shape = shape.is_fully_defined()
-  prefix = primary.op.name if context.in_graph_mode() else primary._shared_name  # pylint: disable=protected-access
+  if context.executing_eagerly():
+    prefix = primary._shared_name  # pylint: disable=protected-access
+  else:
+    prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      with ops.colocate_with(primary):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
     else:
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index d2ad34773e0615256c340826dcc312cc8a00dc23..7389e344c7d8eef8e26c4d24c0985ff66276deea 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -45,7 +45,7 @@ class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
   This class is deprecated. Please use
-  ${tf.train.MonitoredTrainingSession} instead.
+  @{tf.train.MonitoredTrainingSession} instead.
 
   The Supervisor is a small wrapper around a `Coordinator`, a `Saver`,
   and a `SessionManager` that takes care of common needs of TensorFlow
@@ -305,7 +305,7 @@ class Supervisor(object):
     `Supervisor`s are not supported when eager execution is enabled.
     @end_compatibility
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("Supervisors are compatible with eager execution.")
     # Set default values of arguments.
     if graph is None:
@@ -762,7 +762,7 @@ class Supervisor(object):
     execution is enabled, use the `tf.data` API.
     @end_compatibility
     """
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       raise RuntimeError("Queues are not compatible with eager execution.")
     if queue_runners is None:
       queue_runners = self._graph.get_collection(ops.GraphKeys.QUEUE_RUNNERS)
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 78c8ce9208efc2f2fa8b5c671d3379e7ca8c70f5..427e25d0f63a80e559822d77adde350de2b9f09b 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -16,84 +16,6 @@
 """Support for training models.
 
 See the @{$python/train} guide.
-
-@@Optimizer
-@@GradientDescentOptimizer
-@@AdadeltaOptimizer
-@@AdagradOptimizer
-@@AdagradDAOptimizer
-@@MomentumOptimizer
-@@AdamOptimizer
-@@FtrlOptimizer
-@@ProximalGradientDescentOptimizer
-@@ProximalAdagradOptimizer
-@@RMSPropOptimizer
-@@gradients
-@@AggregationMethod
-@@stop_gradient
-@@hessians
-@@clip_by_value
-@@clip_by_norm
-@@clip_by_average_norm
-@@clip_by_global_norm
-@@global_norm
-@@cosine_decay
-@@cosine_decay_restarts
-@@linear_cosine_decay
-@@noisy_linear_cosine_decay
-@@exponential_decay
-@@inverse_time_decay
-@@natural_exp_decay
-@@piecewise_constant
-@@polynomial_decay
-@@ExponentialMovingAverage
-@@Coordinator
-@@QueueRunner
-@@LooperThread
-@@add_queue_runner
-@@start_queue_runners
-@@Server
-@@Supervisor
-@@SessionManager
-@@ClusterSpec
-@@replica_device_setter
-@@MonitoredTrainingSession
-@@MonitoredSession
-@@SingularMonitoredSession
-@@Scaffold
-@@SessionCreator
-@@ChiefSessionCreator
-@@WorkerSessionCreator
-@@summary_iterator
-@@SessionRunHook
-@@SessionRunArgs
-@@SessionRunContext
-@@SessionRunValues
-@@LoggingTensorHook
-@@StopAtStepHook
-@@CheckpointSaverHook
-@@CheckpointSaverListener
-@@NewCheckpointReader
-@@StepCounterHook
-@@NanLossDuringTrainingError
-@@NanTensorHook
-@@SummarySaverHook
-@@GlobalStepWaiterHook
-@@FinalOpsHook
-@@FeedFnHook
-@@ProfilerHook
-@@SecondOrStepTimer
-@@global_step
-@@basic_train_loop
-@@get_global_step
-@@get_or_create_global_step
-@@create_global_step
-@@assert_global_step
-@@write_graph
-@@load_checkpoint
-@@load_variable
-@@list_variables
-@@init_from_checkpoint
 """
 
 # Optimizers.
@@ -101,13 +23,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys as _sys
-
-from tensorflow.python.ops import io_ops as _io_ops
-from tensorflow.python.ops import sdca_ops as _sdca_ops
-from tensorflow.python.ops import state_ops as _state_ops
-from tensorflow.python.util.all_util import remove_undocumented
-
 # pylint: disable=g-bad-import-order,unused-import
 from tensorflow.python.ops.sdca_ops import sdca_optimizer
 from tensorflow.python.ops.sdca_ops import sdca_fprint
@@ -152,6 +67,7 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
+from tensorflow.python.training.checkpointable_utils import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
@@ -187,6 +103,8 @@ from tensorflow.python.training.training_util import get_global_step
 from tensorflow.python.training.training_util import assert_global_step
 from tensorflow.python.training.training_util import create_global_step
 from tensorflow.python.training.training_util import get_or_create_global_step
+from tensorflow.python.training.warm_starting_util import VocabInfo
+from tensorflow.python.training.warm_starting_util import warm_start
 from tensorflow.python.pywrap_tensorflow import do_quantize_training_on_graphdef
 from tensorflow.python.pywrap_tensorflow import NewCheckpointReader
 from tensorflow.python.util.tf_export import tf_export
@@ -208,39 +126,6 @@ from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef
 from tensorflow.python.training.server_lib import ClusterSpec
 from tensorflow.python.training.server_lib import Server
 
-# Symbols whitelisted for export without documentation.
-_allowed_symbols = [
-    # TODO(cwhipkey): review these and move to contrib or expose through
-    # documentation.
-    "generate_checkpoint_state_proto",  # Used internally by saver.
-    "checkpoint_exists",  # Only used in test?
-    "get_checkpoint_mtimes",  # Only used in test?
-
-    # Legacy: remove.
-    "do_quantize_training_on_graphdef",  # At least use grah_def, not graphdef.
-    # No uses within tensorflow.
-    "queue_runner",  # Use tf.train.start_queue_runner etc directly.
-    # This is also imported internally.
-
-    # TODO(drpng): document these. The reference in howtos/distributed does
-    # not link.
-    "SyncReplicasOptimizer",
-    # Protobufs:
-    "BytesList",  # from example_pb2.
-    "ClusterDef",
-    "Example",  # from example_pb2
-    "Feature",  # from example_pb2
-    "Features",  # from example_pb2
-    "FeatureList",  # from example_pb2
-    "FeatureLists",  # from example_pb2
-    "FloatList",  # from example_pb2.
-    "Int64List",  # from example_pb2.
-    "JobDef",
-    "SaverDef",  # From saver_pb2.
-    "SequenceExample",  # from example_pb2.
-    "ServerDef",
-]
-
 # pylint: disable=undefined-variable
 tf_export("train.BytesList")(BytesList)
 tf_export("train.ClusterDef")(ClusterDef)
@@ -256,9 +141,3 @@ tf_export("train.SaverDef")(SaverDef)
 tf_export("train.SequenceExample")(SequenceExample)
 tf_export("train.ServerDef")(ServerDef)
 # pylint: enable=undefined-variable
-
-# Include extra modules for docstrings because:
-# * Input methods in tf.train are documented in io_ops.
-# * Saver methods in tf.train are documented in state_ops.
-remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__], _io_ops, _sdca_ops, _state_ops])
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 499f1feb2dbf8aee26314a43b0a000fb91a1c686..d05e1d2c830b2aa7008c9cba9f28eb6230d8bc82 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
@@ -31,7 +30,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Picked a long key value to minimize the chance of collision with user defined
 # collection keys.
 GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
@@ -64,7 +62,7 @@ def global_step(sess, global_step_tensor):
   Returns:
     The global step value.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return int(global_step_tensor.numpy())
   return int(sess.run(global_step_tensor))
 
@@ -123,7 +121,7 @@ def create_global_step(graph=None):
     raise ValueError('"global_step" already exists.')
   # Create in proper graph and base name_scope.
   with graph.as_default() as g, g.name_scope(None):
-    if context.in_eager_mode():
+    if context.executing_eagerly():
       with ops.device('cpu:0'):
         return variable_scope.get_variable(
             ops.GraphKeys.GLOBAL_STEP,
@@ -170,8 +168,7 @@ def assert_global_step(global_step_tensor):
   """
   if not (isinstance(global_step_tensor, variables.Variable) or
           isinstance(global_step_tensor, ops.Tensor) or
-          isinstance(global_step_tensor,
-                     resource_variable_ops.ResourceVariable)):
+          resource_variable_ops.is_resource_variable(global_step_tensor)):
     raise TypeError(
         'Existing "global_step" must be a Variable or Tensor: %s.' %
         global_step_tensor)
diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
similarity index 67%
rename from tensorflow/python/estimator/warm_starting_util.py
rename to tensorflow/python/training/warm_starting_util.py
index adb013f5c653c4967a743047fef4e805946e0f59..4d4fb394c1272d2bf510bb594d70b9aa2edb3df2 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -33,7 +33,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("estimator.VocabInfo")
+@tf_export("train.VocabInfo", "estimator.VocabInfo")
 class VocabInfo(
     collections.namedtuple("VocabInfo", [
         "new_vocab",
@@ -43,7 +43,7 @@ class VocabInfo(
         "old_vocab_size",
         "backup_initializer",
     ])):
-  """Vocabulary information for WarmStartSettings.
+  """Vocabulary information for warm-starting.
 
   See @{tf.estimator.WarmStartSettings$WarmStartSettings} for examples of using
   VocabInfo to warm-start.
@@ -83,164 +83,6 @@ class VocabInfo(
     )
 
 
-@tf_export("estimator.WarmStartSettings")
-class WarmStartSettings(
-    collections.namedtuple("WarmStartSettings", [
-        "ckpt_to_initialize_from",
-        "vars_to_warm_start",
-        "var_name_to_vocab_info",
-        "var_name_to_prev_var_name",
-    ])):
-  """Settings for warm-starting in Estimators.
-
-  Example Use with canned `DNNEstimator`:
-
-  ```
-  emb_vocab_file = tf.feature_column.embedding_column(
-      tf.feature_column.categorical_column_with_vocabulary_file(
-          "sc_vocab_file", "new_vocab.txt", vocab_size=100),
-      dimension=8)
-  emb_vocab_list = tf.feature_column.embedding_column(
-      tf.feature_column.categorical_column_with_vocabulary_list(
-          "sc_vocab_list", vocabulary_list=["a", "b"]),
-      dimension=8)
-  estimator = tf.estimator.DNNClassifier(
-    hidden_units=[128, 64], feature_columns=[emb_vocab_file, emb_vocab_list],
-    warm_start_from=ws)
-  ```
-
-  where `ws` could be defined as:
-
-  Warm-start all weights in the model (input layer and hidden weights).
-  Either the directory or a specific checkpoint can be provided (in the case
-  of the former, the latest checkpoint will be used):
-
-  ```
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp")
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp/model-1000")
-  ```
-
-  Warm-start only the embeddings (input layer):
-
-  ```
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                         vars_to_warm_start=".*input_layer.*")
-  ```
-
-  Warm-start all weights but the embedding parameters corresponding to
-  `sc_vocab_file` have a different vocab from the one used in the current
-  model:
-
-  ```
-  vocab_info = ws_util.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt"
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      })
-  ```
-
-  Warm-start only `sc_vocab_file` embeddings (and no other variables), which
-  have a different vocab from the one used in the current model:
-
-  ```
-  vocab_info = ws_util.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt"
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      vars_to_warm_start=None,
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      })
-  ```
-
-  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
-  have a different vocab from the one used in current checkpoint, and only
-  100 of those entries were used:
-
-  ```
-  vocab_info = ws_util.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt",
-      old_vocab_size=100
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      })
-  ```
-
-  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
-  have a different vocab from the one used in current checkpoint and the
-  parameters corresponding to `sc_vocab_list` have a different name from the
-  current checkpoint:
-
-  ```
-  vocab_info = ws_util.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt",
-      old_vocab_size=100
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      },
-      var_name_to_prev_var_name={
-          "input_layer/sc_vocab_list_embedding/embedding_weights":
-              "old_tensor_name"
-      })
-  ```
-
-  Attributes:
-    ckpt_to_initialize_from: [Required] A string specifying the directory with
-      checkpoint file(s) or path to checkpoint from which to warm-start the
-      model parameters.
-    vars_to_warm_start: [Optional] A regular expression that captures which
-      variables to warm-start (see tf.get_collection).  Defaults to `'.*'`,
-      which warm-starts all variables.  If `None` is explicitly given, only
-      variables specified in `var_name_to_vocab_info` will be warm-started.
-    var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
-      VocabInfo. The variable names should be "full" variables, not the names
-      of the partitions.  If not explicitly provided, the variable is assumed to
-      have no vocabulary.
-    var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
-      name of the previously-trained variable in `ckpt_to_initialize_from`. If
-      not explicitly provided, the name of the variable is assumed to be same
-      between previous checkpoint and current model.
-  """
-
-  def __new__(cls,
-              ckpt_to_initialize_from,
-              vars_to_warm_start=".*",
-              var_name_to_vocab_info=None,
-              var_name_to_prev_var_name=None):
-    if not ckpt_to_initialize_from:
-      raise ValueError(
-          "`ckpt_to_initialize_from` MUST be set in WarmStartSettings")
-    return super(WarmStartSettings, cls).__new__(
-        cls,
-        ckpt_to_initialize_from,
-        vars_to_warm_start,
-        var_name_to_vocab_info or {},
-        var_name_to_prev_var_name or {},
-    )
-
-
 def _is_variable(x):
   return (isinstance(x, variables_lib.Variable) or
           isinstance(x, resource_variable_ops.ResourceVariable))
@@ -375,8 +217,7 @@ def _warm_start_var_with_vocab(var,
           full_shape=slice_info.full_shape,
           var_offset=slice_info.var_offset)
 
-    # TODO(eddz): Support WarmStartSettings where class vocabularies need
-    # remapping too.
+    # TODO(eddz): Support cases where class vocabularies need remapping too.
     init = checkpoint_ops._load_and_remap_matrix_initializer(
         ckpt_path=checkpoint_utils._get_checkpoint_filename(prev_ckpt),
         old_tensor_name=prev_tensor_name,
@@ -396,32 +237,53 @@ def _warm_start_var_with_vocab(var,
 # pylint: enable=protected-access
 
 
-def _warm_start(warm_start_settings):
+@tf_export("train.warm_start")
+def warm_start(ckpt_to_initialize_from,
+               vars_to_warm_start=".*",
+               var_name_to_vocab_info=None,
+               var_name_to_prev_var_name=None):
   """Warm-starts a model using the given settings.
 
   If you are using a tf.estimator.Estimator, this will automatically be called
   during training.
 
   Args:
-    warm_start_settings: An object of `WarmStartSettings`.
+    ckpt_to_initialize_from: [Required] A string specifying the directory with
+      checkpoint file(s) or path to checkpoint from which to warm-start the
+      model parameters.
+    vars_to_warm_start: [Optional] A regular expression that captures which
+      variables to warm-start (see tf.get_collection).  Defaults to `'.*'`,
+      which warm-starts all variables.  If `None` is explicitly given, only
+      variables specified in `var_name_to_vocab_info` will be warm-started.
+    var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
+      VocabInfo. The variable names should be "full" variables, not the names
+      of the partitions.  If not explicitly provided, the variable is assumed to
+      have no vocabulary.
+    var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
+      name of the previously-trained variable in `ckpt_to_initialize_from`. If
+      not explicitly provided, the name of the variable is assumed to be same
+      between previous checkpoint and current model.
   Raises:
     ValueError: If the WarmStartSettings contains prev_var_name or VocabInfo
       configuration for variable names that are not used.  This is to ensure
       a stronger check for variable configuration than relying on users to
       examine the logs.
   """
-  logging.info("Warm-starting from: %s",
-               (warm_start_settings.ckpt_to_initialize_from,))
+  if var_name_to_vocab_info is None:
+    var_name_to_vocab_info = {}
+  if var_name_to_prev_var_name is None:
+    var_name_to_prev_var_name = {}
+  logging.info("Warm-starting from: %s", (ckpt_to_initialize_from,))
   # We have to deal with partitioned variables, since get_collection flattens
   # out the list.
   grouped_variables = {}
-  # Both warm_start_settings.vars_to_warm_start = '.*' and
-  # warm_start_settings.vars_to_warm_start = None will match everything here.
+  # Both vars_to_warm_start = '.*' and
+  # vars_to_warm_start = None will match everything here.
   for v in ops.get_collection(
       # TODO(eddz): Allow for different collections here (to support
       # warm-starting accumulators).
       ops.GraphKeys.TRAINABLE_VARIABLES,
-      scope=warm_start_settings.vars_to_warm_start):
+      scope=vars_to_warm_start):
     if not isinstance(v, list):
       var_name = _infer_var_name([v])
     else:
@@ -437,10 +299,10 @@ def _warm_start(warm_start_settings):
   vocab_info_used = set()
 
   for var_name, variable in six.iteritems(grouped_variables):
-    prev_var_name = warm_start_settings.var_name_to_prev_var_name.get(var_name)
+    prev_var_name = var_name_to_prev_var_name.get(var_name)
     if prev_var_name:
       prev_var_name_used.add(var_name)
-    vocab_info = warm_start_settings.var_name_to_vocab_info.get(var_name)
+    vocab_info = var_name_to_vocab_info.get(var_name)
     if vocab_info:
       vocab_info_used.add(var_name)
       logging.info(
@@ -460,16 +322,16 @@ def _warm_start(warm_start_settings):
           variable,
           current_vocab_path=vocab_info.new_vocab,
           current_vocab_size=vocab_info.new_vocab_size,
-          prev_ckpt=warm_start_settings.ckpt_to_initialize_from,
+          prev_ckpt=ckpt_to_initialize_from,
           prev_vocab_path=vocab_info.old_vocab,
           previous_vocab_size=vocab_info.old_vocab_size,
           current_oov_buckets=vocab_info.num_oov_buckets,
           prev_tensor_name=prev_var_name,
           initializer=vocab_info.backup_initializer)
     else:
-      # For the special value of warm_start_settings.vars_to_warm_start = None,
+      # For the special value of vars_to_warm_start = None,
       # we only warm-start variables with explicitly specified vocabularies.
-      if warm_start_settings.vars_to_warm_start:
+      if vars_to_warm_start:
         logging.info("Warm-starting variable: {}; prev_var_name: {}".format(
             var_name, prev_var_name or "Unchanged"))
         # Because we use a default empty list in grouped_variables, single
@@ -477,48 +339,22 @@ def _warm_start(warm_start_settings):
         # for init_from_checkpoint logic to work correctly.
         if len(variable) == 1:
           variable = variable[0]
-        _warm_start_var(variable, warm_start_settings.ckpt_to_initialize_from,
-                        prev_var_name)
+        _warm_start_var(variable, ckpt_to_initialize_from, prev_var_name)
 
   prev_var_name_not_used = set(
-      warm_start_settings.var_name_to_prev_var_name.keys()) - prev_var_name_used
-  vocab_info_not_used = set(
-      warm_start_settings.var_name_to_vocab_info.keys()) - vocab_info_used
+      var_name_to_prev_var_name.keys()) - prev_var_name_used
+  vocab_info_not_used = set(var_name_to_vocab_info.keys()) - vocab_info_used
 
   if prev_var_name_not_used:
     raise ValueError(
         "You provided the following variables in "
-        "warm_start_settings.var_name_to_prev_var_name that were not used: "
+        "var_name_to_prev_var_name that were not used: "
         "{0}.  Perhaps you misspelled them?  Here is the list of viable "
         "variable names: {1}".format(prev_var_name_not_used,
                                      grouped_variables.keys()))
   if vocab_info_not_used:
     raise ValueError(
         "You provided the following variables in "
-        "warm_start_settings.var_name_to_vocab_info that were not used: {0}. "
+        "var_name_to_vocab_info that were not used: {0}. "
         " Perhaps you misspelled them?  Here is the list of viable variable "
         "names: {1}".format(vocab_info_not_used, grouped_variables.keys()))
-
-
-def _get_default_warm_start_settings(warm_start_from):
-  """Returns default WarmStartSettings.
-
-  Args:
-    warm_start_from: Either a string representing the filepath of a checkpoint
-      to initialize from, or an instance of WarmStartSettings.
-
-  Returns:
-    Either None or an instance of WarmStartSettings.
-
-  Raises:
-    ValueError: If warm_start_from is not None but is neither a string nor an
-      instance of WarmStartSettings.
-  """
-  if warm_start_from is None:
-    return None
-  if isinstance(warm_start_from, six.string_types):
-    return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
-  elif isinstance(warm_start_from, WarmStartSettings):
-    return warm_start_from
-  else:
-    raise ValueError("warm_start_from must be a string or a WarmStartSettings")
diff --git a/tensorflow/python/estimator/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
similarity index 94%
rename from tensorflow/python/estimator/warm_starting_util_test.py
rename to tensorflow/python/training/warm_starting_util_test.py
index 3985d9ebd04e6963339fcf9999f6367fe4dadc1a..7e8cbd6baeea160075b61d1191c8f1da5fe2163c 100644
--- a/tensorflow/python/estimator/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -22,7 +22,6 @@ import os
 import numpy as np
 import six
 
-from tensorflow.python.estimator import warm_starting_util as ws_util
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +31,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import warm_starting_util as ws_util
 
 ones = init_ops.ones_initializer
 norms = init_ops.truncated_normal_initializer
@@ -330,9 +330,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
-        ws_util._warm_start(
-            ws_util.WarmStartSettings(
-                self.get_temp_dir(), vars_to_warm_start=".*sc_int.*"))
+        ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=".*sc_int.*")
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [prev_int_val]}, sess)
@@ -361,9 +359,8 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
-        ws_util._warm_start(
-            ws_util.WarmStartSettings(
-                self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*"))
+        ws_util.warm_start(
+            self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*")
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
@@ -398,9 +395,8 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         # Since old vocab is not explicitly set in WarmStartSettings, the old
         # vocab is assumed to be same as new vocab.
-        ws_util._warm_start(
-            ws_util.WarmStartSettings(
-                self.get_temp_dir(), vars_to_warm_start=".*sc_vocab.*"))
+        ws_util.warm_start(
+            self.get_temp_dir(), vars_to_warm_start=".*sc_vocab.*")
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
@@ -435,11 +431,10 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         # Since old vocab is not explicitly set in WarmStartSettings, the old
         # vocab is assumed to be same as new vocab.
-        ws_util._warm_start(
-            ws_util.WarmStartSettings(
-                # Explicitly provide the file prefix instead of just the dir.
-                os.path.join(self.get_temp_dir(), "model-0"),
-                vars_to_warm_start=".*sc_vocab.*"))
+        ws_util.warm_start(
+            # Explicitly provide the file prefix instead of just the dir.
+            os.path.join(self.get_temp_dir(), "model-0"),
+            vars_to_warm_start=".*sc_vocab.*")
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
@@ -485,13 +480,12 @@ class WarmStartingUtilTest(test.TestCase):
             num_oov_buckets=sc_vocab.num_oov_buckets,
             old_vocab=old_vocab_path,
             old_vocab_size=old_vocab_size)
-        warm_start_settings = ws_util.WarmStartSettings(
+        ws_util.warm_start(
             ckpt_to_initialize_from=self.get_temp_dir(),
             vars_to_warm_start=".*sc_vocab.*",
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        ws_util._warm_start(warm_start_settings)
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  'banana' isn't in the
         # first two entries of the old vocabulary, so it's newly initialized.
@@ -523,9 +517,8 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
-        ws_util._warm_start(
-            ws_util.WarmStartSettings(
-                self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*"))
+        ws_util.warm_start(
+            self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*")
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars,
@@ -606,12 +599,11 @@ class WarmStartingUtilTest(test.TestCase):
             new_vocab_size=sc_vocab.vocabulary_size,
             num_oov_buckets=sc_vocab.num_oov_buckets,
             old_vocab=vocab_path)
-        ws_util._warm_start(
-            ws_util.WarmStartSettings(
-                self.get_temp_dir(),
-                var_name_to_vocab_info={
-                    "linear_model/sc_vocab/weights": vocab_info
-                }))
+        ws_util.warm_start(
+            self.get_temp_dir(),
+            var_name_to_vocab_info={
+                "linear_model/sc_vocab/weights": vocab_info
+            })
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {
@@ -668,7 +660,7 @@ class WarmStartingUtilTest(test.TestCase):
             new_vocab_size=sc_vocab.vocabulary_size,
             num_oov_buckets=sc_vocab.num_oov_buckets,
             old_vocab=prev_vocab_path)
-        ws_settings = ws_util.WarmStartSettings(
+        ws_util.warm_start(
             self.get_temp_dir(),
             vars_to_warm_start=".*(sc_keys|sc_vocab).*",
             var_name_to_vocab_info={
@@ -678,7 +670,6 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
@@ -732,7 +723,7 @@ class WarmStartingUtilTest(test.TestCase):
             new_vocab_size=sc_vocab.vocabulary_size,
             num_oov_buckets=sc_vocab.num_oov_buckets,
             old_vocab=prev_vocab_path)
-        ws_settings = ws_util.WarmStartSettings(
+        ws_util.warm_start(
             self.get_temp_dir(),
             vars_to_warm_start=".*(sc_keys|sc_vocab).*",
             var_name_to_vocab_info={
@@ -742,7 +733,6 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
@@ -796,7 +786,7 @@ class WarmStartingUtilTest(test.TestCase):
             new_vocab_size=sc_vocab.vocabulary_size,
             num_oov_buckets=sc_vocab.num_oov_buckets,
             old_vocab=prev_vocab_path)
-        ws_settings = ws_util.WarmStartSettings(
+        ws_util.warm_start(
             self.get_temp_dir(),
             # The special value of None here will ensure that only the variable
             # specified in var_name_to_vocab_info (sc_vocab embedding) is
@@ -812,7 +802,6 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_vocab should be correctly warm-started after vocab remapping,
@@ -874,13 +863,12 @@ class WarmStartingUtilTest(test.TestCase):
             # use a truncated normal initializer.
             backup_initializer=init_ops.random_uniform_initializer(
                 minval=0.42, maxval=0.42))
-        ws_settings = ws_util.WarmStartSettings(
+        ws_util.warm_start(
             self.get_temp_dir(),
             var_name_to_vocab_info={
                 ws_util._infer_var_name(cols_to_vars[emb_vocab_column]):
                     vocab_info
             })
-        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab_column should be correctly warm-started after vocab
@@ -947,33 +935,33 @@ class WarmStartingUtilTest(test.TestCase):
             # use a truncated normal initializer.
             backup_initializer=init_ops.random_uniform_initializer(
                 minval=0.42, maxval=0.42))
-        ws_settings = ws_util.WarmStartSettings(
+        ws_util.warm_start(
             self.get_temp_dir(),
             vars_to_warm_start=".*sc_vocab.*",
             var_name_to_vocab_info={
                 "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
             })
-        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
         self._assert_cols_to_vars(
-            cols_to_vars, {
+            cols_to_vars,
+            {
                 emb_vocab: [
-                    # embedding_weights part 0.
-                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
-                    # embedding_weights part 1.
-                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
                     # linear weights part 0.
                     np.array([[0.69]]),
                     # linear weights part 1.
-                    np.array([[0.71]])
+                    np.array([[0.71]]),
+                    # embedding_weights part 0.
+                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
+                    # embedding_weights part 1.
+                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                 ]
-            }, sess)
+            },
+            sess)
 
   def testErrorConditions(self):
-    self.assertRaises(ValueError, ws_util.WarmStartSettings, None)
     x = variable_scope.get_variable(
         "x",
         shape=[4, 1],
@@ -983,9 +971,6 @@ class WarmStartingUtilTest(test.TestCase):
     # List of PartitionedVariable is invalid type when warm-starting with vocab.
     self.assertRaises(TypeError, ws_util._warm_start_var_with_vocab, [x],
                       "/tmp", 5, "/tmp", "/tmp")
-    # Keys of type other than FeatureColumn.
-    self.assertRaises(TypeError, ws_util._warm_start, {"StringType": x},
-                      ws_util.WarmStartSettings("/tmp"))
 
     # Unused variable names raises ValueError.
     with ops.Graph().as_default():
@@ -997,18 +982,16 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         self._write_checkpoint(sess)
 
-    self.assertRaises(ValueError, ws_util._warm_start,
-                      ws_util.WarmStartSettings(
-                          self.get_temp_dir(),
-                          var_name_to_vocab_info={
-                              "y": ws_util.VocabInfo("", 1, 0, "")
-                          }))
-    self.assertRaises(ValueError, ws_util._warm_start,
-                      ws_util.WarmStartSettings(
-                          self.get_temp_dir(),
-                          var_name_to_prev_var_name={
-                              "y": "y2"
-                          }))
+    self.assertRaises(
+        ValueError,
+        ws_util.warm_start,
+        self.get_temp_dir(),
+        var_name_to_vocab_info={"y": ws_util.VocabInfo("", 1, 0, "")})
+    self.assertRaises(
+        ValueError,
+        ws_util.warm_start,
+        self.get_temp_dir(),
+        var_name_to_prev_var_name={"y": "y2"})
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/user_ops/user_ops.py b/tensorflow/python/user_ops/user_ops.py
index 17dbab706c9243c5f119dc82cc4428f03b90a18d..20ea3b0f621dc74bd3778d565f8897e47a881d42 100644
--- a/tensorflow/python/user_ops/user_ops.py
+++ b/tensorflow/python/user_ops/user_ops.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import gen_user_ops as _gen_user_ops
 
 # go/tf-wildcard-import
 from tensorflow.python.ops.gen_user_ops import *  # pylint: disable=wildcard-import
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('user_ops.my_fact')
 def my_fact():
   """Example of overriding the generated code for an Op."""
-  return _gen_user_ops._fact()  # pylint: disable=protected-access
+  return _gen_user_ops.fact()
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 4163fcac79e3d237c4c4c4303e1db2c39e5fe7c6..a24a52eea9710e98bd56025457e6fda5449a5197 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -17,10 +17,6 @@
 ## Conversion routines
 In addition to the functions below, `as_str` converts an object to a `str`.
 
-@@as_bytes
-@@as_text
-@@as_str_any
-@@path_to_str
 
 ## Types
 The compatibility module also provides the following types:
@@ -40,12 +36,9 @@ import numbers as _numbers
 import numpy as _np
 import six as _six
 
-from tensorflow.python.util.all_util import remove_undocumented
-from tensorflow.python.util.tf_export import tf_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -68,7 +61,6 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
-@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,8 +85,12 @@ def as_text(bytes_or_text, encoding='utf-8'):
 # Convert an object to a `str` in both Python 2 and 3.
 if _six.PY2:
   as_str = as_bytes
+  tf_export('compat.as_bytes', 'compat.as_str')(as_bytes)
+  tf_export('compat.as_text')(as_text)
 else:
   as_str = as_text
+  tf_export('compat.as_bytes')(as_bytes)
+  tf_export('compat.as_text', 'compat.as_str')(as_text)
 
 
 @tf_export('compat.as_str_any')
@@ -141,13 +137,3 @@ tf_export('compat.complex_types').export_constant(__name__, 'complex_types')
 bytes_or_text_types = (bytes, _six.text_type)
 tf_export('compat.bytes_or_text_types').export_constant(__name__,
                                                         'bytes_or_text_types')
-
-_allowed_symbols = [
-    'as_str',
-    'bytes_or_text_types',
-    'complex_types',
-    'integral_types',
-    'real_types',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/util/decorator_utils.py b/tensorflow/python/util/decorator_utils.py
index df259c7f7c29f9a4b674d3e980b33d6dcf323769..7b4363c0e40802779cf47c75c5a5e5a901da37e2 100644
--- a/tensorflow/python/util/decorator_utils.py
+++ b/tensorflow/python/util/decorator_utils.py
@@ -82,7 +82,7 @@ def add_notice_to_docstring(
     lines = _normalize_docstring(doc).splitlines()
     lines[0] += ' ' + suffix_str
 
-  notice = [''] + notice + [instructions]
+  notice = [''] + notice + ([instructions] if instructions else [])
 
   if len(lines) > 1:
     # Make sure that we keep our distance from the main body
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 23c2c48f4b5a165bd6e356a6243b234619af1c4c..1104768ae8f69598f686eb2ffee8b69e43051011 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -36,7 +36,6 @@ import collections as _collections
 import six as _six
 
 from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
-from tensorflow.python.util.all_util import remove_undocumented
 
 
 def _sorted(dict_):
@@ -60,15 +59,7 @@ def _is_namedtuple(instance, strict=False):
   Returns:
     True if `instance` is a `namedtuple`.
   """
-  # Attemp to limit the test to plain namedtuple (not stuff inheriting from it).
-  if not isinstance(instance, tuple):
-    return False
-  if strict and instance.__class__.__base__ != tuple:
-    return False
-  return (
-      hasattr(instance, "_fields") and
-      isinstance(instance._fields, _collections.Sequence) and
-      all(isinstance(f, _six.string_types) for f in instance._fields))
+  return _pywrap_tensorflow.IsNamedtuple(instance, strict)
 
 
 def _sequence_like(instance, args):
@@ -157,76 +148,7 @@ def flatten(nest):
 
 def _same_namedtuples(nest1, nest2):
   """Returns True if the two namedtuples have the same name and fields."""
-  if nest1._fields != nest2._fields:
-    return False
-  if nest1.__class__.__name__ != nest2.__class__.__name__:
-    return False
-  return True
-
-
-def _recursive_assert_same_structure(nest1, nest2, check_types):
-  """Helper function for `assert_same_structure`.
-
-  See `assert_same_structure` for further information about namedtuples.
-
-  Args:
-    nest1: An arbitrarily nested structure.
-    nest2: An arbitrarily nested structure.
-    check_types: If `True` (default) types of sequences are checked as
-        well, including the keys of dictionaries. If set to `False`, for example
-        a list and a tuple of objects will look the same if they have the same
-        size. Note that namedtuples with identical name and fields are always
-        considered to have the same shallow structure.
-
-  Returns:
-    True if `nest1` and `nest2` have the same structure.
-
-  Raises:
-    ValueError: If the two structure don't have the same nested structre.
-    TypeError: If the two structure don't have the same sequence type.
-    ValueError: If the two dictionaries don't have the same set of keys.
-  """
-  is_sequence_nest1 = is_sequence(nest1)
-  if is_sequence_nest1 != is_sequence(nest2):
-    raise ValueError(
-        "The two structures don't have the same nested structure.\n\n"
-        "First structure: %s\n\nSecond structure: %s." % (nest1, nest2))
-
-  if not is_sequence_nest1:
-    return  # finished checking
-
-  if check_types:
-    type_nest1 = type(nest1)
-    type_nest2 = type(nest2)
-
-    # Duck-typing means that nest should be fine with two different namedtuples
-    # with identical name and fields.
-    if _is_namedtuple(nest1, True) and _is_namedtuple(nest2, True):
-      if not _same_namedtuples(nest1, nest2):
-        raise TypeError(
-            "The two namedtuples don't have the same sequence type. First "
-            "structure has type %s, while second structure has type %s."
-            % (type_nest1, type_nest2))
-    else:
-      if type_nest1 != type_nest2:
-        raise TypeError(
-            "The two structures don't have the same sequence type. First "
-            "structure has type %s, while second structure has type %s."
-            % (type_nest1, type_nest2))
-
-    if isinstance(nest1, dict):
-      keys1 = set(_six.iterkeys(nest1))
-      keys2 = set(_six.iterkeys(nest2))
-      if keys1 != keys2:
-        raise ValueError(
-            "The two dictionaries don't have the same set of keys. First "
-            "structure has keys {}, while second structure has keys {}."
-            .format(keys1, keys2))
-
-  nest1_as_sequence = [n for n in _yield_value(nest1)]
-  nest2_as_sequence = [n for n in _yield_value(nest2)]
-  for n1, n2 in zip(nest1_as_sequence, nest2_as_sequence):
-    _recursive_assert_same_structure(n1, n2, check_types)
+  return _pywrap_tensorflow.SameNamedtuples(nest1, nest2)
 
 
 def assert_same_structure(nest1, nest2, check_types=True):
@@ -257,14 +179,7 @@ def assert_same_structure(nest1, nest2, check_types=True):
     TypeError: If the two structures differ in the type of sequence in any of
       their substructures. Only possible if `check_types` is `True`.
   """
-  len_nest1 = len(flatten(nest1)) if is_sequence(nest1) else 1
-  len_nest2 = len(flatten(nest2)) if is_sequence(nest2) else 1
-  if len_nest1 != len_nest2:
-    raise ValueError("The two structures don't have the same number of "
-                     "elements.\n\nFirst structure (%i elements): %s\n\n"
-                     "Second structure (%i elements): %s"
-                     % (len_nest1, nest1, len_nest2, nest2))
-  _recursive_assert_same_structure(nest1, nest2, check_types)
+  _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types)
 
 
 def flatten_dict_items(dictionary):
@@ -842,21 +757,3 @@ def flatten_with_joined_string_paths(structure, separator="/"):
 
 
 _pywrap_tensorflow.RegisterSequenceClass(_collections.Sequence)
-
-
-_allowed_symbols = [
-    "assert_same_structure",
-    "is_sequence",
-    "flatten",
-    "flatten_dict_items",
-    "pack_sequence_as",
-    "map_structure",
-    "assert_shallow_structure",
-    "flatten_up_to",
-    "map_structure_up_to",
-    "get_traverse_shallow_structure",
-    "yield_flat_paths",
-    "flatten_with_joined_string_paths",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 4439d6241ea9607b194cbb17304dbb77dc9f57a8..2f12b25354a905b2aafa870c28f1e9c0b693e888 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import time
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -32,6 +35,9 @@ from tensorflow.python.util import nest
 
 class NestTest(test.TestCase):
 
+  PointXY = collections.namedtuple("Point", ["x", "y"])  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenAndPack(self):
     structure = ((3, 4), 5, (6, 7, (9, 10), 8))
     flat = ["a", "b", "c", "d", "e", "f", "g", "h"]
@@ -39,8 +45,8 @@ class NestTest(test.TestCase):
     self.assertEqual(
         nest.pack_sequence_as(structure, flat), (("a", "b"), "c",
                                                  ("d", "e", ("f", "g"), "h")))
-    point = collections.namedtuple("Point", ["x", "y"])
-    structure = (point(x=4, y=2), ((point(x=1, y=0),),))
+    structure = (NestTest.PointXY(x=4, y=2),
+                 ((NestTest.PointXY(x=1, y=0),),))
     flat = [4, 2, 1, 0]
     self.assertEqual(nest.flatten(structure), flat)
     restructured_from_flat = nest.pack_sequence_as(structure, flat)
@@ -66,6 +72,7 @@ class NestTest(test.TestCase):
     with self.assertRaises(ValueError):
       nest.pack_sequence_as([5, 6, [7, 8]], ["a", "b", "c"])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenDictOrder(self):
     """`flatten` orders dicts by key, including OrderedDicts."""
     ordered = collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
@@ -87,12 +94,14 @@ class NestTest(test.TestCase):
         ordered_reconstruction)
     self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
 
+  Abc = collections.namedtuple("A", ("b", "c"))  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenAndPack_withDicts(self):
     # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
-    named_tuple = collections.namedtuple("A", ("b", "c"))
     mess = [
         "z",
-        named_tuple(3, 4),
+        NestTest.Abc(3, 4),
         {
             "c": [
                 1,
@@ -111,7 +120,7 @@ class NestTest(test.TestCase):
 
     structure_of_mess = [
         14,
-        named_tuple("a", True),
+        NestTest.Abc("a", True),
         {
             "c": [
                 0,
@@ -157,6 +166,7 @@ class NestTest(test.TestCase):
       nest.pack_sequence_as(["hello", "world"],
                             ["and", "goodbye", "again"])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
@@ -186,6 +196,23 @@ class NestTest(test.TestCase):
         ValueError, "Key had [0-9]* elements, but value had [0-9]* elements"):
       nest.flatten_dict_items(another_bad_dictionary)
 
+  # pylint does not correctly recognize these as class names and
+  # suggests to use variable style under_score naming.
+  # pylint: disable=invalid-name
+  Named0ab = collections.namedtuple("named_0", ("a", "b"))
+  Named1ab = collections.namedtuple("named_1", ("a", "b"))
+  SameNameab = collections.namedtuple("same_name", ("a", "b"))
+  SameNameab2 = collections.namedtuple("same_name", ("a", "b"))
+  SameNamexy = collections.namedtuple("same_name", ("x", "y"))
+  SameName1xy = collections.namedtuple("same_name_1", ("x", "y"))
+  SameName1xy2 = collections.namedtuple("same_name_1", ("x", "y"))
+  NotSameName = collections.namedtuple("not_same_name", ("a", "b"))
+  # pylint: enable=invalid-name
+
+  class SameNamedType1(SameNameab):
+    pass
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testAssertSameStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
@@ -198,23 +225,32 @@ class NestTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(6 elements\\):.*?"
-         "\n\nSecond structure \\(2 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         "More specifically: Substructure "
+         r'"type=tuple str=\(\(1, 2\), 3\)" is a sequence, while '
+         'substructure "type=str str=spam" is not')):
       nest.assert_same_structure(structure1, structure_different_num_elements)
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(2 elements\\):.*?"
-         "\n\nSecond structure \\(1 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         r'More specifically: Substructure "type=list str=\[0, 1\]" '
+         r'is a sequence, while substructure "type=ndarray str=\[0 1\]" '
+         "is not")):
       nest.assert_same_structure([0, 1], np.array([0, 1]))
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(1 elements\\):.*"
-         "\n\nSecond structure \\(2 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         r'More specifically: Substructure "type=list str=\[0, 1\]" '
+         'is a sequence, while substructure "type=int str=0" '
+         "is not")):
       nest.assert_same_structure(0, [0, 1])
 
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1), [0, 1])
@@ -225,21 +261,21 @@ class NestTest(test.TestCase):
          "First structure: .*?\n\nSecond structure: ")):
       nest.assert_same_structure(structure1, structure_different_nesting)
 
-    named_type_0 = collections.namedtuple("named_0", ("a", "b"))
-    named_type_1 = collections.namedtuple("named_1", ("a", "b"))
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1),
-                      named_type_0("a", "b"))
+                      NestTest.Named0ab("a", "b"))
 
-    nest.assert_same_structure(named_type_0(3, 4), named_type_0("a", "b"))
+    nest.assert_same_structure(NestTest.Named0ab(3, 4),
+                               NestTest.Named0ab("a", "b"))
 
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      named_type_0(3, 4), named_type_1(3, 4))
+                      NestTest.Named0ab(3, 4), NestTest.Named1ab(3, 4))
 
     with self.assertRaisesRegexp(
         ValueError,
         ("don't have the same nested structure\\.\n\n"
          "First structure: .*?\n\nSecond structure: ")):
-      nest.assert_same_structure(named_type_0(3, 4), named_type_0([3], 4))
+      nest.assert_same_structure(NestTest.Named0ab(3, 4),
+                                 NestTest.Named0ab([3], 4))
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -258,36 +294,33 @@ class NestTest(test.TestCase):
                                  "don't have the same set of keys"):
       nest.assert_same_structure({"a": 1}, {"b": 1})
 
-    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
-    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
-    nest.assert_same_structure(same_name_type_0(0, 1), same_name_type_1(2, 3))
+    nest.assert_same_structure(NestTest.SameNameab(0, 1),
+                               NestTest.SameNameab2(2, 3))
 
     # This assertion is expected to pass: two namedtuples with the same
     # name and field names are considered to be identical.
-    same_name_type_2 = collections.namedtuple("same_name_1", ("x", "y"))
-    same_name_type_3 = collections.namedtuple("same_name_1", ("x", "y"))
     nest.assert_same_structure(
-        same_name_type_0(same_name_type_2(0, 1), 2),
-        same_name_type_1(same_name_type_3(2, 3), 4))
+        NestTest.SameNameab(NestTest.SameName1xy(0, 1), 2),
+        NestTest.SameNameab2(NestTest.SameName1xy2(2, 3), 4))
 
     expected_message = "The two structures don't have the same.*"
     with self.assertRaisesRegexp(ValueError, expected_message):
-      nest.assert_same_structure(same_name_type_0(0, same_name_type_1(1, 2)),
-                                 same_name_type_1(same_name_type_0(0, 1), 2))
+      nest.assert_same_structure(
+          NestTest.SameNameab(0, NestTest.SameNameab2(1, 2)),
+          NestTest.SameNameab2(NestTest.SameNameab(0, 1), 2))
 
-    same_name_type_1 = collections.namedtuple("not_same_name", ("a", "b"))
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.NotSameName(2, 3))
 
-    same_name_type_1 = collections.namedtuple("same_name", ("x", "y"))
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.SameNamexy(2, 3))
 
-    class SameNamedType1(collections.namedtuple("same_name", ("a", "b"))):
-      pass
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), SameNamedType1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.SameNamedType1(2, 3))
 
+  EmptyNT = collections.namedtuple("empty_nt", "")  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
@@ -310,9 +343,8 @@ class NestTest(test.TestCase):
     self.assertEqual((), nest.map_structure(lambda x: x + 1, ()))
     self.assertEqual([], nest.map_structure(lambda x: x + 1, []))
     self.assertEqual({}, nest.map_structure(lambda x: x + 1, {}))
-    empty_nt = collections.namedtuple("empty_nt", "")
-    self.assertEqual(empty_nt(), nest.map_structure(lambda x: x + 1,
-                                                    empty_nt()))
+    self.assertEqual(NestTest.EmptyNT(), nest.map_structure(lambda x: x + 1,
+                                                            NestTest.EmptyNT()))
 
     # This is checking actual equality of types, empty list != empty tuple
     self.assertNotEqual((), nest.map_structure(lambda x: x + 1, []))
@@ -352,10 +384,12 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Only valid keyword argument"):
       nest.map_structure(lambda x: None, structure1, check_types=False, foo="a")
 
+  ABTuple = collections.namedtuple("ab_tuple", "a, b")  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testMapStructureWithStrings(self):
-    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
-    inp_a = ab_tuple(a="foo", b=("bar", "baz"))
-    inp_b = ab_tuple(a=2, b=(1, 3))
+    inp_a = NestTest.ABTuple(a="foo", b=("bar", "baz"))
+    inp_b = NestTest.ABTuple(a=2, b=(1, 3))
     out = nest.map_structure(lambda string, repeats: string * repeats,
                              inp_a,
                              inp_b)
@@ -363,8 +397,8 @@ class NestTest(test.TestCase):
     self.assertEqual("bar", out.b[0])
     self.assertEqual("bazbazbaz", out.b[1])
 
-    nt = ab_tuple(a=("something", "something_else"),
-                  b="yet another thing")
+    nt = NestTest.ABTuple(a=("something", "something_else"),
+                          b="yet another thing")
     rev_nt = nest.map_structure(lambda x: x[::-1], nt)
     # Check the output is the correct structure, and all strings are reversed.
     nest.assert_same_structure(nt, rev_nt)
@@ -431,10 +465,8 @@ class NestTest(test.TestCase):
 
     # This assertion is expected to pass: two namedtuples with the same
     # name and field names are considered to be identical.
-    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
-    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
-    inp_shallow = same_name_type_0(1, 2)
-    inp_deep = same_name_type_1(1, [1, 2, 3])
+    inp_shallow = NestTest.SameNameab(1, 2)
+    inp_deep = NestTest.SameNameab2(1, [1, 2, 3])
     nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=False)
     nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=True)
 
@@ -466,7 +498,7 @@ class NestTest(test.TestCase):
                      [1, {"c": 2}, 3, (4, 5)])
 
     # Namedtuples.
-    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
+    ab_tuple = NestTest.ABTuple
     input_tree = ab_tuple(a=[0, 1], b=2)
     shallow_tree = ab_tuple(a=0, b=1)
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
@@ -681,5 +713,31 @@ class NestTest(test.TestCase):
           list(nest.flatten_with_joined_string_paths(inputs)), expected)
 
 
+class NestBenchmark(test.Benchmark):
+
+  def run_and_report(self, s1, s2, name):
+    burn_iter, test_iter = 100, 30000
+
+    for _ in xrange(burn_iter):
+      nest.assert_same_structure(s1, s2)
+
+    t0 = time.time()
+    for _ in xrange(test_iter):
+      nest.assert_same_structure(s1, s2)
+    t1 = time.time()
+
+    self.report_benchmark(iters=test_iter, wall_time=(t1 - t0) / test_iter,
+                          name=name)
+
+  def benchmark_assert_structure(self):
+    s1 = (((1, 2), 3), 4, (5, 6))
+    s2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
+    self.run_and_report(s1, s2, "assert_same_structure_6_elem")
+
+    s1 = (((1, 2), 3), 4, (5, 6)) * 10
+    s2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6")) * 10
+    self.run_and_report(s1, s2, "assert_same_structure_60_elem")
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/util/port.i b/tensorflow/python/util/port.i
index cea4d8468afe8816d71da6581635b8a7ab0c2388..2f730732bee373a6e6ead97fe3320645f37ac220 100644
--- a/tensorflow/python/util/port.i
+++ b/tensorflow/python/util/port.i
@@ -23,5 +23,6 @@ limitations under the License.
 %unignore tensorflow;
 %unignore tensorflow::IsGoogleCudaEnabled;
 %unignore tensorflow::CudaSupportsHalfMatMulAndConv;
+%unignore tensorflow::IsMklEnabled;
 %include "tensorflow/core/util/port.h"
 %unignoreall
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index c2fe6fc4494428693605a5a7463a9f590a2da39e..663036de8a01c3937ae93f8d2bfa27f7add48e39 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from collections import namedtuple
 import inspect as _inspect
 
 from tensorflow.python.util import tf_decorator
@@ -24,6 +25,15 @@ from tensorflow.python.util import tf_decorator
 ArgSpec = _inspect.ArgSpec
 
 
+if hasattr(_inspect, 'FullArgSpec'):
+  FullArgSpec = _inspect.FullArgSpec  # pylint: disable=invalid-name
+else:
+  FullArgSpec = namedtuple('FullArgSpec', [
+      'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults',
+      'annotations'
+  ])
+
+
 def currentframe():
   """TFDecorator-aware replacement for inspect.currentframe."""
   return _inspect.stack()[1][0]
@@ -46,20 +56,45 @@ def getargspec(object):  # pylint: disable=redefined-builtin
 
 
 def getfullargspec(obj):  # pylint: disable=redefined-builtin
-  """TFDecorator-aware replacement for inspect.getfullargspec and fallback to
-  inspect.getargspec in Python 2.
+  """TFDecorator-aware replacement for `inspect.getfullargspec`/`getargspec`.
+
+  This wrapper uses `inspect.getfullargspec` if available and falls back to
+  `inspect.getargspec` in Python 2.
 
   Args:
     obj: A callable, possibly decorated.
 
   Returns:
-    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    The `FullArgSpec` that describes the signature of
     the outermost decorator that changes the callable's signature. If the
-    callable is not decorated, `inspect.getfullargspec()`
-    (`inspect.getargspec()` in Python 2) will be called directly on the
-    callable.
+    callable is not decorated, `inspect.getfullargspec()` will be called
+    directly on the callable.
   """
-  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  if hasattr(_inspect, 'getfullargspec'):
+    spec_fn = _inspect.getfullargspec
+  else:
+    def spec_fn(target):
+      """Spec function that adding default value from FullArgSpec.
+
+      It is used when getfullargspec is not available (eg in PY2).
+
+      Args:
+        target: the target object to inspect.
+      Returns:
+        The full argument specs with empty kwonlyargs, kwonlydefaults and
+        annotations.
+      """
+      argspecs = _inspect.getargspec(target)
+      fullargspecs = FullArgSpec(
+          args=argspecs.args,
+          varargs=argspecs.varargs,
+          varkw=argspecs.keywords,
+          defaults=argspecs.defaults,
+          kwonlyargs=[],
+          kwonlydefaults=None,
+          annotations={})
+      return fullargspecs
+
   decorators, target = tf_decorator.unwrap(obj)
   return next((d.decorator_argspec for d in decorators
                if d.decorator_argspec is not None), spec_fn(target))
@@ -149,6 +184,11 @@ def getsource(object):  # pylint: disable=redefined-builtin
   return _inspect.getsource(tf_decorator.unwrap(object)[1])
 
 
+def isbuiltin(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.isbuiltin."""
+  return _inspect.isbuiltin(tf_decorator.unwrap(object)[1])
+
+
 def isclass(object):  # pylint: disable=redefined-builtin
   """TFDecorator-aware replacement for inspect.isclass."""
   return _inspect.isclass(tf_decorator.unwrap(object)[1])
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index 8903e1156b27b3a28543eb5ecfcc6eeb1a04f6ae..129408449ebb45ac3a322f163a13b705cbb31f0c 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -144,6 +144,19 @@ def test_decorated_function_with_defaults(a, b=2, c='Hello'):
     self.assertEqual(
         expected, tf_inspect.getsource(test_decorated_function_with_defaults))
 
+  def testIsBuiltin(self):
+    self.assertEqual(
+        tf_inspect.isbuiltin(TestDecoratedClass),
+        inspect.isbuiltin(TestDecoratedClass))
+    self.assertEqual(
+        tf_inspect.isbuiltin(test_decorated_function),
+        inspect.isbuiltin(test_decorated_function))
+    self.assertEqual(
+        tf_inspect.isbuiltin(test_undecorated_function),
+        inspect.isbuiltin(test_undecorated_function))
+    self.assertEqual(tf_inspect.isbuiltin(range), inspect.isbuiltin(range))
+    self.assertEqual(tf_inspect.isbuiltin(max), inspect.isbuiltin(max))
+
   def testIsClass(self):
     self.assertTrue(tf_inspect.isclass(TestDecoratedClass))
     self.assertFalse(tf_inspect.isclass(test_decorated_function))
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 37733152e8ec6d7b026bf74e69e33bfe8f9f4e89..28e49afa023904abed076373685bb38f2537b7d4 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -47,7 +47,7 @@ def _add_should_use_warning(x, fatal_error=False):
   if x is None or x == []:  # pylint: disable=g-explicit-bool-comparison
     return x
 
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     # Typically not needed when executing eagerly (the main use case is for ops
     # which need to be incorporated into the graph), and even the no-op wrapper
     # creates reference cycles which require garbage collection.
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index a41fa7df253bcf4bce280574b89ed0dda8330521..70aee4a3f663c862ecb09444866a0294333ee27a 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
 namespace swig {
@@ -27,6 +28,113 @@ PyObject* CollectionsSequenceType = nullptr;
 
 bool WarnedThatSetIsNotSequence = false;
 
+bool IsString(PyObject* o) {
+  return PyBytes_Check(o) ||
+#if PY_MAJOR_VERSION < 3
+         PyString_Check(o) ||
+#endif
+         PyUnicode_Check(o);
+}
+
+// Equivalent to Python's 'o.__class__.__name__'
+// Note that '__class__' attribute is set only in new-style classes.
+// A lot of tensorflow code uses __class__ without checks, so it seems like
+// we only support new-style classes.
+StringPiece GetClassName(PyObject* o) {
+  // __class__ is equivalent to type() for new style classes.
+  // type() is equivalent to PyObject_Type()
+  // (https://docs.python.org/3.5/c-api/object.html#c.PyObject_Type)
+  // PyObject_Type() is equivalent to o->ob_type except for Py_INCREF, which
+  // we don't need here.
+  PyTypeObject* type = o->ob_type;
+
+  // __name__ is the value of `tp_name` after the last '.'
+  // (https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_name)
+  StringPiece name(type->tp_name);
+  size_t pos = name.rfind('.');
+  if (pos != StringPiece::npos) {
+    name.remove_prefix(pos + 1);
+  }
+  return name;
+}
+
+string PyObjectToString(PyObject* o) {
+  if (o == nullptr) {
+    return "<null object>";
+  }
+  PyObject* str = PyObject_Str(o);
+  if (str) {
+#if PY_MAJOR_VERSION < 3
+    string s(PyString_AS_STRING(str));
+#else
+    string s(PyUnicode_AsUTF8(str));
+#endif
+    Py_DECREF(str);
+    return tensorflow::strings::StrCat("type=", GetClassName(o), " str=", s);
+  } else {
+    return "<failed to execute str() on object>";
+  }
+}
+
+// Implements the same idea as tensorflow.util.nest._yield_value
+// During construction we check if the iterable is a dictionary.
+// If so, we construct a sequence from its sorted keys that will be used
+// for iteration.
+// If not, we construct a sequence directly from the iterable.
+// At each step, we get the next element from the sequence and use it
+// either as a key or return it directly.
+//
+// 'iterable' must not be modified while ValIterator is used.
+class ValIterator {
+ public:
+  explicit ValIterator(PyObject* iterable) : dict_(nullptr), index_(0) {
+    if (PyDict_Check(iterable)) {
+      dict_ = iterable;
+      // PyDict_Keys returns a list, which can be used with
+      // PySequence_Fast_GET_ITEM.
+      seq_ = PyDict_Keys(iterable);
+      // Iterate through dictionaries in a deterministic order by sorting the
+      // keys. Notice this means that we ignore the original order of
+      // `OrderedDict` instances. This is intentional, to avoid potential
+      // bugs caused by mixing ordered and plain dicts (e.g., flattening
+      // a dict but using a corresponding `OrderedDict` to pack it back).
+      PyList_Sort(seq_);
+    } else {
+      seq_ = PySequence_Fast(iterable, "");
+    }
+    size_ = PySequence_Fast_GET_SIZE(seq_);
+  }
+
+  ~ValIterator() { Py_DECREF(seq_); }
+
+  // Return a borrowed reference to the next element from iterable.
+  // Return nullptr when iteration is over.
+  PyObject* next() {
+    PyObject* element = nullptr;
+    if (index_ < size_) {
+      // Both PySequence_Fast_GET_ITEM and PyDict_GetItem return borrowed
+      // references.
+      element = PySequence_Fast_GET_ITEM(seq_, index_);
+      ++index_;
+      if (dict_ != nullptr) {
+        element = PyDict_GetItem(dict_, element);
+        if (element == nullptr) {
+          PyErr_SetString(PyExc_RuntimeError,
+                          "Dictionary was modified during iteration over it");
+          return nullptr;
+        }
+      }
+    }
+    return element;
+  }
+
+ private:
+  PyObject* seq_;
+  PyObject* dict_;
+  Py_ssize_t size_;
+  Py_ssize_t index_;
+};
+
 // Returns 1 if `o` is considered a sequence for the purposes of Flatten().
 // Returns 0 otherwise.
 // Returns -1 if an error occurred.
@@ -38,7 +146,7 @@ int IsSequenceHelper(PyObject* o) {
                     "so consider avoiding using them.";
     WarnedThatSetIsNotSequence = true;
   }
-  if (CollectionsSequenceType == nullptr) {
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
     PyErr_SetString(
         PyExc_RuntimeError,
         tensorflow::strings::StrCat(
@@ -49,11 +157,7 @@ int IsSequenceHelper(PyObject* o) {
   }
   int is_instance = PyObject_IsInstance(o, CollectionsSequenceType);
   if (is_instance == -1) return -1;
-  return static_cast<int>(is_instance != 0 && !PyBytes_Check(o) &&
-#if PY_MAJOR_VERSION < 3
-                          !PyString_Check(o) &&
-#endif
-                          !PyUnicode_Check(o));
+  return static_cast<int>(is_instance != 0 && !IsString(o));
 }
 
 bool FlattenHelper(PyObject* nested, PyObject* list) {
@@ -75,12 +179,16 @@ bool FlattenHelper(PyObject* nested, PyObject* list) {
       // while the method is running.
       PyObject* key = PyList_GET_ITEM(keys, i);
       PyObject* val = PyDict_GetItem(nested, key);
-      if (Py_EnterRecursiveCall(" in Flatten")) {
+      if (Py_EnterRecursiveCall(" in flatten")) {
         Py_DECREF(keys);
         return false;
       }
-      FlattenHelper(val, list);
+      const bool success = FlattenHelper(val, list);
       Py_LeaveRecursiveCall();
+      if (!success) {
+        Py_DECREF(keys);
+        return false;
+      }
     }
     Py_DECREF(keys);
     return true;
@@ -90,13 +198,159 @@ bool FlattenHelper(PyObject* nested, PyObject* list) {
   PyObject* item;
   PyObject* iterator = PyObject_GetIter(nested);
   while ((item = PyIter_Next(iterator)) != nullptr) {
-    FlattenHelper(item, list);
+    if (Py_EnterRecursiveCall(" in flatten")) {
+      Py_DECREF(iterator);
+      Py_DECREF(item);
+      return false;
+    }
+    bool success = FlattenHelper(item, list);
+    Py_LeaveRecursiveCall();
+    if (!success) {
+      Py_DECREF(iterator);
+      Py_DECREF(item);
+      return false;
+    }
     Py_DECREF(item);
   }
   Py_DECREF(iterator);
   return true;
 }
 
+// Sets error using keys of 'dict1' and 'dict2'.
+// 'dict1' and 'dict2' are assumed to be Python dictionaries.
+void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
+                           bool* is_type_error) {
+  PyObject* k1 = PyDict_Keys(dict1);
+  PyObject* k2 = PyDict_Keys(dict2);
+  *is_type_error = false;
+  *error_msg = tensorflow::strings::StrCat(
+      "The two dictionaries don't have the same set of keys. "
+      "First structure has keys ",
+      PyObjectToString(k1), ", while second structure has keys ",
+      PyObjectToString(k2));
+  Py_DECREF(k1);
+  Py_DECREF(k2);
+}
+
+// Returns true iff there were no "internal" errors. In other words,
+// errors that has nothing to do with structure checking.
+// If an "internal" error occured, the appropriate Python error will be
+// set and the caller can propage it directly to the user.
+//
+// Both `error_msg` and `is_type_error` must be non-null. `error_msg` must
+// be empty.
+// Leaves `error_msg` empty if structures matched. Else, fills `error_msg`
+// with appropriate error and sets `is_type_error` to true iff
+// the error to be raised should be TypeError.
+bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
+                               string* error_msg, bool* is_type_error) {
+  DCHECK(error_msg);
+  DCHECK(is_type_error);
+  const bool is_seq1 = IsSequence(o1);
+  const bool is_seq2 = IsSequence(o2);
+  if (PyErr_Occurred()) return false;
+  if (is_seq1 != is_seq2) {
+    string seq_str = is_seq1 ? PyObjectToString(o1) : PyObjectToString(o2);
+    string non_seq_str = is_seq1 ? PyObjectToString(o2) : PyObjectToString(o1);
+    *is_type_error = false;
+    *error_msg = tensorflow::strings::StrCat(
+        "Substructure \"", seq_str, "\" is a sequence, while substructure \"",
+        non_seq_str, "\" is not");
+    return true;
+  }
+
+  // Got to scalars, so finished checking. Structures are the same.
+  if (!is_seq1) return true;
+
+  if (check_types) {
+    const PyTypeObject* type1 = o1->ob_type;
+    const PyTypeObject* type2 = o2->ob_type;
+
+    // We treat two different namedtuples with identical name and fields
+    // as having the same type.
+    const PyObject* o1_tuple = IsNamedtuple(o1, true);
+    if (o1_tuple == nullptr) return false;
+    const PyObject* o2_tuple = IsNamedtuple(o2, true);
+    if (o2_tuple == nullptr) {
+      Py_DECREF(o1_tuple);
+      return false;
+    }
+    bool both_tuples = o1_tuple == Py_True && o2_tuple == Py_True;
+    Py_DECREF(o1_tuple);
+    Py_DECREF(o2_tuple);
+
+    if (both_tuples) {
+      const PyObject* same_tuples = SameNamedtuples(o1, o2);
+      if (same_tuples == nullptr) return false;
+      bool not_same_tuples = same_tuples != Py_True;
+      Py_DECREF(same_tuples);
+      if (not_same_tuples) {
+        *is_type_error = true;
+        *error_msg = tensorflow::strings::StrCat(
+            "The two namedtuples don't have the same sequence type. "
+            "First structure ",
+            PyObjectToString(o1), " has type ", type1->tp_name,
+            ", while second structure ", PyObjectToString(o2), " has type ",
+            type2->tp_name);
+        return true;
+      }
+    } else if (type1 != type2) {
+      *is_type_error = true;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two namedtuples don't have the same sequence type. "
+          "First structure ",
+          PyObjectToString(o1), " has type ", type1->tp_name,
+          ", while second structure ", PyObjectToString(o2), " has type ",
+          type2->tp_name);
+      return true;
+    }
+
+    if (PyDict_Check(o1)) {
+      if (PyDict_Size(o1) != PyDict_Size(o2)) {
+        SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+        return true;
+      }
+
+      PyObject* key;
+      Py_ssize_t pos = 0;
+      while (PyDict_Next(o1, &pos, &key, nullptr)) {
+        if (PyDict_GetItem(o2, key) == nullptr) {
+          SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+          return true;
+        }
+      }
+    }
+  }
+
+  ValIterator iter1(o1);
+  ValIterator iter2(o2);
+
+  while (true) {
+    PyObject* v1 = iter1.next();
+    PyObject* v2 = iter2.next();
+    if (v1 != nullptr && v2 != nullptr) {
+      if (Py_EnterRecursiveCall(" in assert_same_structure")) {
+        return false;
+      }
+      bool no_internal_errors = AssertSameStructureHelper(
+          v1, v2, check_types, error_msg, is_type_error);
+      Py_LeaveRecursiveCall();
+      if (!no_internal_errors) return false;
+      if (!error_msg->empty()) return true;
+    } else if (v1 == nullptr && v2 == nullptr) {
+      // Done with all recursive calls. Structure matched.
+      return true;
+    } else {
+      *is_type_error = false;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two structures don't have the same number of elements. ",
+          "First structure: ", PyObjectToString(o1),
+          ". Second structure: ", PyObjectToString(o2));
+      return true;
+    }
+  }
+}
+
 }  // anonymous namespace
 
 void RegisterSequenceClass(PyObject* sequence_class) {
@@ -123,5 +377,107 @@ PyObject* Flatten(PyObject* nested) {
     return nullptr;
   }
 }
+
+PyObject* IsNamedtuple(PyObject* o, bool strict) {
+  // Must be subclass of tuple
+  if (!PyTuple_Check(o)) {
+    Py_RETURN_FALSE;
+  }
+
+  // If strict, o.__class__.__base__ must be tuple
+  if (strict) {
+    PyObject* klass = PyObject_GetAttrString(o, "__class__");
+    if (klass == nullptr) return nullptr;
+    PyObject* base = PyObject_GetAttrString(klass, "__base__");
+    Py_DECREF(klass);
+    if (base == nullptr) return nullptr;
+
+    const PyTypeObject* base_type = reinterpret_cast<PyTypeObject*>(base);
+    // built-in object types are singletons
+    bool tuple_base = base_type == &PyTuple_Type;
+    Py_DECREF(base);
+    if (!tuple_base) {
+      Py_RETURN_FALSE;
+    }
+  }
+
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        tensorflow::strings::StrCat(
+            "collections.Sequence type has not been set. "
+            "Please call RegisterSequenceClass before using this module")
+            .c_str());
+    return nullptr;
+  }
+
+  // o must have attribute '_fields' and every element in
+  // '_fields' must be a string.
+  int has_fields = PyObject_HasAttrString(o, "_fields");
+  if (!has_fields) {
+    Py_RETURN_FALSE;
+  }
+
+  Safe_PyObjectPtr fields = make_safe(PyObject_GetAttrString(o, "_fields"));
+  int is_instance = PyObject_IsInstance(fields.get(), CollectionsSequenceType);
+  if (is_instance == 0) {
+    Py_RETURN_FALSE;
+  } else if (is_instance == -1) {
+    return nullptr;
+  }
+
+  Safe_PyObjectPtr seq = make_safe(PySequence_Fast(fields.get(), ""));
+  const Py_ssize_t s = PySequence_Fast_GET_SIZE(seq.get());
+  for (Py_ssize_t i = 0; i < s; ++i) {
+    // PySequence_Fast_GET_ITEM returns borrowed ref
+    PyObject* elem = PySequence_Fast_GET_ITEM(seq.get(), i);
+    if (!IsString(elem)) {
+      Py_RETURN_FALSE;
+    }
+  }
+
+  Py_RETURN_TRUE;
+}
+
+PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
+  PyObject* f1 = PyObject_GetAttrString(o1, "_fields");
+  PyObject* f2 = PyObject_GetAttrString(o2, "_fields");
+  if (f1 == nullptr || f2 == nullptr) {
+    Py_XDECREF(f1);
+    Py_XDECREF(f2);
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        "Expected namedtuple-like objects (that have _fields attr)");
+    return nullptr;
+  }
+
+  if (PyObject_RichCompareBool(f1, f2, Py_NE)) {
+    Py_RETURN_FALSE;
+  }
+
+  if (GetClassName(o1).compare(GetClassName(o2)) == 0) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+}
+
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
+  string error_msg;
+  bool is_type_error = false;
+  AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error);
+  if (!error_msg.empty()) {
+    PyErr_SetString(
+        is_type_error ? PyExc_TypeError : PyExc_ValueError,
+        tensorflow::strings::StrCat(
+            "The two structures don't have the same nested structure.\n\n",
+            "First structure: ", PyObjectToString(o1), "\n\nSecond structure: ",
+            PyObjectToString(o2), "\n\nMore specifically: ", error_msg)
+            .c_str());
+    return nullptr;
+  }
+  Py_RETURN_NONE;
+}
+
 }  // namespace swig
 }  // namespace tensorflow
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 2af71dc753760e7efaf28cc500d5296a31957a04..c325baa5f86820846dd09780b4208667f3aad5e1 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -33,6 +33,57 @@ namespace swig {
 //   dict.
 bool IsSequence(PyObject* o);
 
+// Implements the same interface as tensorflow.util.nest._is_namedtuple
+// Returns Py_True iff `instance` should be considered a `namedtuple`.
+//
+// Args:
+//   instance: An instance of a Python object.
+//   strict: If True, `instance` is considered to be a `namedtuple` only if
+//       it is a "plain" namedtuple. For instance, a class inheriting
+//       from a `namedtuple` will be considered to be a `namedtuple`
+//       iff `strict=False`.
+//
+// Returns:
+//   True if `instance` is a `namedtuple`.
+PyObject* IsNamedtuple(PyObject* o, bool strict);
+
+// Implements the same interface as tensorflow.util.nest._same_namedtuples
+// Returns Py_True iff the two namedtuples have the same name and fields.
+// Raises RuntimeError if `o1` or `o2` don't look like namedtuples (don't have
+// '_fields' attribute).
+PyObject* SameNamedtuples(PyObject* o1, PyObject* o2);
+
+// Asserts that two structures are nested in the same way.
+//
+// Note that namedtuples with identical name and fields are always considered
+// to have the same shallow structure (even with `check_types=True`).
+// For intance, this code will print `True`:
+//
+// ```python
+// def nt(a, b):
+//   return collections.namedtuple('foo', 'a b')(a, b)
+// print(assert_same_structure(nt(0, 1), nt(2, 3)))
+// ```
+//
+// Args:
+//  nest1: an arbitrarily nested structure.
+//  nest2: an arbitrarily nested structure.
+//  check_types: if `true`, types of sequences are checked as
+//      well, including the keys of dictionaries. If set to `false`, for example
+//      a list and a tuple of objects will look the same if they have the same
+//      size. Note that namedtuples with identical name and fields are always
+//      considered to have the same shallow structure.
+//
+// Raises:
+//  ValueError: If the two structures do not have the same number of elements or
+//    if the two structures are not nested in the same way.
+//  TypeError: If the two structures differ in the type of sequence in any of
+//    their substructures. Only possible if `check_types` is `True`.
+//
+// Returns:
+//  Py_None on success, nullptr on error.
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
+
 // Implements the same interface as tensorflow.util.nest.flatten
 //
 // Returns a flat list from a given nested structure.
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index d69084fc0091ac79cf3f5cf3d70af419cf78f936..b7f201b6fe6fd18af2bb833df2d08bfedb23a185 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -34,6 +34,15 @@ limitations under the License.
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
+%unignore tensorflow::swig::IsNamedtuple;
+%noexception tensorflow::swig::IsNamedtuple;
+
+%unignore tensorflow::swig::SameNamedtuples;
+%noexception tensorflow::swig::SameNamedtuples;
+
+%unignore tensorflow::swig::AssertSameStructure;
+%noexception tensorflow::swig::AssertSameStructure;
+
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240014e2da5068a4ef377a5934de62dd54b6..c68cda01002b1c5bbc2facb95b1eba214fbad7cb 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -33,7 +33,9 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
         "@local_config_cuda//cuda:cuda_headers",
     ],
     alwayslink = 1,
@@ -45,6 +47,8 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/compiler/xla:statusor",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
 )
@@ -56,7 +60,10 @@ cc_library(
             [
                 "cuda/*.cc",
             ],
-            exclude = ["cuda/cuda_platform_id.cc"],
+            exclude = [
+                "cuda/*_test.cc",
+                "cuda/cuda_platform_id.cc",
+            ],
         ),
     ),
     copts = select({
diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc
index da09d84921e2dd94942b3a62fe7366211c60aed1..906d6fb7020ce35adb1438d394b34983c332f182 100644
--- a/tensorflow/stream_executor/blas.cc
+++ b/tensorflow/stream_executor/blas.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace blas {
 
 string TransposeString(Transpose t) {
@@ -79,6 +78,8 @@ string ComputationTypeString(ComputationType ty) {
       return "f32";
     case ComputationType::kF64:
       return "f64";
+    case ComputationType::kI32:
+      return "i32";
     case ComputationType::kComplexF32:
       return "complex f32";
     case ComputationType::kComplexF64:
@@ -88,6 +89,9 @@ string ComputationTypeString(ComputationType ty) {
   }
 }
 
+std::ostream& operator<<(std::ostream& os, ComputationType ty) {
+  return os << ComputationTypeString(ty);
+}
+
 }  // namespace blas
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index 072f08554688276a05d9be85718de8750bd874c2..be0b0bf5fb20b2c0cfa99dee81e01d319387ef36 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -41,16 +41,16 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
 
 #include <complex>
-#include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/platform/port.h"
 
 namespace Eigen {
 struct half;
 }  // namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 class ScratchAllocator;
@@ -104,6 +104,8 @@ enum class ComputationType {
 // Converts a ComputationType to a string.
 string ComputationTypeString(ComputationType ty);
 
+std::ostream &operator<<(std::ostream &os, ComputationType ty);
+
 // Opaque identifier for an "algorithm" used by a blas routine.  This functions
 // as a hint to the blas library.
 typedef int64 AlgorithmType;
@@ -1031,43 +1033,49 @@ class BlasSupport {
   // creating a new Stream for each attempt.
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-      const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int32> *c,
+      uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+      const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b,
+      int ldb, const HostOrDeviceScalar<int> &beta, DeviceMemory<int32> *c,
       int ldc, ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, const Eigen::half &alpha,
+      uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
       const DeviceMemory<Eigen::half> &a, int lda,
-      const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,
-      DeviceMemory<Eigen::half> *c, int ldc, ComputationType computation_type,
-      AlgorithmType algorithm, ProfileResult *output_profile_result) = 0;
+      const DeviceMemory<Eigen::half> &b, int ldb,
+      const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+      int ldc, ComputationType computation_type, AlgorithmType algorithm,
+      ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
-      const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+      uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+      int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
       int ldc, ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-      const DeviceMemory<double> &b, int ldb, double beta,
-      DeviceMemory<double> *c, int ldc, ComputationType computation_type,
-      AlgorithmType algorithm, ProfileResult *output_profile_result) = 0;
+      uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+      int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+      int ldc, ComputationType computation_type, AlgorithmType algorithm,
+      ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, std::complex<float> alpha,
+      uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
       const DeviceMemory<std::complex<float>> &a, int lda,
       const DeviceMemory<std::complex<float>> &b, int ldb,
-      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<float>> &beta,
+      DeviceMemory<std::complex<float>> *c, int ldc,
       ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, std::complex<double> alpha,
+      uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
       const DeviceMemory<std::complex<double>> &a, int lda,
       const DeviceMemory<std::complex<double>> &b, int ldb,
-      std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<double>> &beta,
+      DeviceMemory<std::complex<double>> *c, int ldc,
       ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
 
@@ -1885,49 +1893,57 @@ class BlasSupport {
       override;                                                                \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a,    \
-      int lda, const DeviceMemory<int8> &b, int ldb, int beta,                 \
-      DeviceMemory<int> *c, int ldc, blas::ComputationType computation_type,   \
+      uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,      \
+      const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b,       \
+      int ldb, const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c,      \
+      int ldc, blas::ComputationType computation_type,                         \
       blas::AlgorithmType algorithm,                                           \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, const Eigen::half &alpha,                  \
+      uint64 m, uint64 n, uint64 k,                                            \
+      const HostOrDeviceScalar<Eigen::half> &alpha,                            \
       const DeviceMemory<Eigen::half> &a, int lda,                             \
-      const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,    \
+      const DeviceMemory<Eigen::half> &b, int ldb,                             \
+      const HostOrDeviceScalar<Eigen::half> &beta,                             \
       DeviceMemory<Eigen::half> *c, int ldc,                                   \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \
-      int lda, const DeviceMemory<float> &b, int ldb, float beta,              \
-      DeviceMemory<float> *c, int ldc, blas::ComputationType computation_type, \
+      uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,    \
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,     \
+      int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,  \
+      int ldc, blas::ComputationType computation_type,                         \
       blas::AlgorithmType algorithm,                                           \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, double alpha,                              \
+      uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,   \
       const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,   \
-      int ldb, double beta, DeviceMemory<double> *c, int ldc,                  \
+      int ldb, const HostOrDeviceScalar<double> &beta,                         \
+      DeviceMemory<double> *c, int ldc,                                        \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, std::complex<float> alpha,                 \
+      uint64 m, uint64 n, uint64 k,                                            \
+      const HostOrDeviceScalar<std::complex<float>> &alpha,                    \
       const DeviceMemory<std::complex<float>> &a, int lda,                     \
       const DeviceMemory<std::complex<float>> &b, int ldb,                     \
-      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \
+      const HostOrDeviceScalar<std::complex<float>> &beta,                     \
+      DeviceMemory<std::complex<float>> *c, int ldc,                           \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, std::complex<double> alpha,                \
+      uint64 m, uint64 n, uint64 k,                                            \
+      const HostOrDeviceScalar<std::complex<double>> &alpha,                   \
       const DeviceMemory<std::complex<double>> &a, int lda,                    \
       const DeviceMemory<std::complex<double>> &b, int ldb,                    \
-      std::complex<double> beta, DeviceMemory<std::complex<double>> *c,        \
-      int ldc, blas::ComputationType computation_type,                         \
-      blas::AlgorithmType algorithm,                                           \
+      const HostOrDeviceScalar<std::complex<double>> &beta,                    \
+      DeviceMemory<std::complex<double>> *c, int ldc,                          \
+      blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmBatched(                                                      \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
@@ -2098,7 +2114,6 @@ class BlasSupport {
                   DeviceMemory<std::complex<double>> *b, int ldb) override;
 
 }  // namespace blas
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc
index 5f4cf9dbd781e865345143a964fc16ca4fdb18d4..cf6b9e2c6e4b327c06ecce318f8e8809308f6f02 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/cuda/cuda_activation.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
@@ -40,5 +39,4 @@ ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index c9d43a9766e049ae002df3e4ce4d8625e71f99db..04ffaef3646bb3df03407f8e74e620455bb9e5cb 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -25,8 +25,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class StreamExecutor;
 
@@ -56,7 +55,6 @@ class ScopedActivateExecutorContext {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 44a3a745ad86dc24f632e4a36691fba06171c9fb..3c1353aee3178282cb9a62222ac0e415f08e6de8 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -13,17 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Include cuBLAS headers early, and then set EIGEN_HAS_CUDA_FP16
-// if we have new enough CUDA (which we will only know after including
-// cuda.h). This ensures that Eigen's Half.h does not attempt to make its own
-// __half typedef if CUDA has already defined one (and conversely, that we do
-// not include <cuda_fp16.h> after Half.h has made its typedef).
-#include "cuda/include/cuda.h"
 #include "cuda/include/cublas_v2.h"
-
-#if CUDA_VERSION >= 7050
-#define EIGEN_HAS_CUDA_FP16
-#endif
+#include "cuda/include/cuda.h"
 
 #if CUDA_VERSION >= 8000
 #define SE_CUDA_DATA_HALF CUDA_R_16F
@@ -33,6 +24,34 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_blas.h"
 
+// Both Eigen Half.h and CUDA cuda_fp16.h provide similar typedef for __half. As
+// such, there are two ways to get the typedef for __half:
+//
+// (1) Includes cuda_fp16.h and defines EIGEN_HAS_CUDA_FP16.
+// (2) Neither includes cuda_fp16.h nor defines EIGEN_HAS_CUDA_FP16.
+//
+// Due to issue b/73793421, when the first approach is used and NVCC is used to
+// compile this file, NVCC will complain duplicated definition for
+// EIGEN_HAS_CUDA_FP16. On the other hand, when the second approach is used and
+// clang is used to compile this file, clang will not understand __half
+// due to missing the definition and macro EIGEN_HAS_CUDA_FP16.
+//
+// Because this file may be compiled with CLANG but will never be compiled with
+// NVCC, we choose the first approach for CUDA < 9.0. For CUDA >= 9.0, we have
+// to use the second approach because the data member in the __half defined
+// by CUDA > 9.0 is `__x` while Eigen expects it to be `x`.
+//
+// TODO(b/73793421): Remove the following code block to switch to the second
+// approach when the issue is fixed.
+#if CUDA_VERSION < 9000
+#include "cuda/include/cuda_fp16.h"
+#if CUDA_VERSION >= 7050
+#define EIGEN_HAS_CUDA_FP16
+#endif
+#endif
+
+#include "third_party/eigen3/Eigen/Core"
+
 #include <assert.h>
 #include <complex>
 
@@ -56,15 +75,14 @@ limitations under the License.
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     static const char *kName;                                       \
     template <typename... Args>                                     \
@@ -75,8 +93,8 @@ namespace wrap {
   } __name;                                                         \
   const char *WrapperShim__##__name::kName = #__name;
 
-#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \
-  PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
 
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
   __macro(cublasSnrm2)                    \
@@ -250,28 +268,28 @@ namespace wrap {
   __macro(cublasCdgmm)                    \
   __macro(cublasZdgmm)
 
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetPointerMode)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasGetPointerMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
+CUBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
 
 #if CUDA_VERSION >= 7050
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmEx)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
 #endif
 
 #if CUDA_VERSION >= 8000
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGemmEx)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmEx)
 #endif
 
 #if CUDA_VERSION >= 9000
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGetMathMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSetMathMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
 #endif
 
 }  // namespace wrap
@@ -2057,12 +2075,6 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2095,12 +2107,6 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2150,10 +2156,11 @@ static bool TensorOpsAvailable(int cc_major) {
 template <typename InT, typename OutT, typename CompT>
 bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a, int lda,
-    const DeviceMemory<InT> &b, int ldb, const CompT &beta,
-    DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
-    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+    uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
+    const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
+    const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
 // CUDA < version 8 and GPUs < sm_50 don't support cublasGemmEx.
 #if CUDA_VERSION < 8000
   return false;
@@ -2169,12 +2176,12 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
+  // Either both 'alpha' and 'beta' need to be pointers to device memory, or
+  // they need to be both host scalars.
+  if (alpha.is_pointer() != beta.is_pointer()) {
+    return false;
+  }
+
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2187,10 +2194,15 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
   // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
   // we do the following compile-time check on the default value:
   static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
+  // If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we
+  // essentially reinterpet_cast to __half, which is safe because Eigen::half
+  // inherits from __half.
   bool result = DoBlasInternalFailureOK(
-      wrap::cublasGemmEx, stream, /* pointer_mode_host = */ true,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb, &beta,
+      wrap::cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
+      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
+      alpha.is_pointer() ? CUDAMemory(alpha.pointer()) : &alpha.value(),
+      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb,
+      beta.is_pointer() ? CUDAMemory(beta.pointer()) : &beta.value(),
       CUDAMemoryMutable(c), CUDADataType<OutT>::type, ldc,
       CUDAComputationType(computation_type),
       static_cast<cublasGemmAlgo_t>(algorithm));
@@ -2239,10 +2251,11 @@ bool CUDABlas::GetBlasGemmAlgorithms(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int> *c,
-    int ldc, blas::ComputationType computation_type,
-    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
       computation_type, algorithm, output_profile_result);
@@ -2250,12 +2263,28 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, const Eigen::half &alpha,
+    uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
     const DeviceMemory<Eigen::half> &a, int lda,
-    const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,
-    DeviceMemory<Eigen::half> *c, int ldc,
-    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
-    blas::ProfileResult *output_profile_result) {
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  if (computation_type == blas::ComputationType::kF32) {
+    if (alpha.is_pointer() || beta.is_pointer()) {
+      // We cannot easily convert a pointer to f16 memory to a pointer to f32
+      // memory from here, so we don't support this for now.
+      // TODO(akuegel): Investigate whether we can do the conversion before
+      // calling DoBlasGemmWithAlgorithm.
+      return false;
+    }
+    HostOrDeviceScalar<float> float_alpha(static_cast<float>(alpha.value()));
+    HostOrDeviceScalar<float> float_beta(static_cast<float>(beta.value()));
+    return DoBlasGemmWithAlgorithmImpl(
+        stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb,
+        float_beta, c, ldc, computation_type, algorithm, output_profile_result);
+  }
+
+  CHECK_EQ(computation_type, blas::ComputationType::kF16);
   return DoBlasGemmWithAlgorithmImpl(
       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
       computation_type, algorithm, output_profile_result);
@@ -2263,8 +2292,9 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
-    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
     int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
@@ -2274,9 +2304,10 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-    const DeviceMemory<double> &b, int ldb, double beta,
-    DeviceMemory<double> *c, int ldc, blas::ComputationType computation_type,
+    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
@@ -2285,10 +2316,11 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, std::complex<float> alpha,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
     const DeviceMemory<std::complex<float>> &a, int lda,
     const DeviceMemory<std::complex<float>> &b, int ldb,
-    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
@@ -2298,10 +2330,11 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, std::complex<double> alpha,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
     const DeviceMemory<std::complex<double>> &a, int lda,
     const DeviceMemory<std::complex<double>> &b, int ldb,
-    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
@@ -2794,46 +2827,39 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cublas() {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::BlasFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::blas::BlasSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuBLAS "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CUDABlas *blas =
-                    new gpu::cuda::CUDABlas(cuda_executor);
-                if (!blas->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete blas;
-                  return nullptr;
-                }
-                return blas;
-              });
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
+          cuda::kCudaPlatformId, cuda::kCuBlasPlugin, "cuBLAS",
+          [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuBLAS "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CUDABlas *blas = new cuda::CUDABlas(cuda_executor);
+            if (!blas->Init()) {
+              // Note: Init() will log a more specific error.
+              delete blas;
+              return nullptr;
+            }
+            return blas;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuBLAS factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kBlas,
-                                                     gpu::cuda::kCuBlasPlugin);
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kBlas, cuda::kCuBlasPlugin);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(register_cublas,
-                            { perftools::gputools::initialize_cublas(); });
+                            { stream_executor::initialize_cublas(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index deb211c04bcaa9e98ee04c5e9066a2a13092cb06..12dc5e47fd1b9d2accca52e547cc271b94085a5a 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -21,6 +21,7 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
 
 #include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -29,8 +30,7 @@ limitations under the License.
 
 typedef struct cublasContext *cublasHandle_t;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -117,18 +117,13 @@ class CUDABlas : public blas::BlasSupport {
       int batch_count, ScratchAllocator *scratch_allocator);
 
   // Helper function for implementing DoBlasGemmWithAlgorithm.
-  //
-  // We take alpha and beta by const reference because T might be Eigen::half,
-  // and we want to avoid pulling in a dependency on Eigen.  When we pass the
-  // references to cublas, we essentially reinterpret_cast to __half, which is
-  // safe because Eigen::half inherits from __half.
   template <typename InT, typename OutT, typename CompT>
   bool DoBlasGemmWithAlgorithmImpl(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
-      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
-      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
-      blas::AlgorithmType algorithm,
+      uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
+      const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
+      const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
+      blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
 
   // Helper function for implementing DoBlasGemmWithProfiling.
@@ -162,7 +157,6 @@ class CUDABlas : public blas::BlasSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 933c103f524ef37f840c9e13b9e4024289e274c1..feb529297e8fff99030cb14fd4f8a9298bbc5935 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -51,8 +51,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 #ifdef __APPLE__
@@ -384,5 +383,4 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index aa68321acc858c902d1a43600a14ac5d88edb0be..f2db2eb20a18c671e055b910809dfde940a5e3f8 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // e.g. DriverVersion{346, 3, 4}
@@ -93,7 +92,6 @@ class Diagnostician {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 58b47067662b2595f53ca648dcab7a2a194039ab..42a77aa3f8e9492d3c758dd69f30c6eae1b77db0 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <memory>
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -27,6 +29,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
@@ -55,19 +58,9 @@ NarrowT CheckedNarrowing(const WideT& wide) {
   return narrow;
 }
 
-// Returns the "Compatibility" version number from the CuDNN version number.
-// This is the number that tries to indicate ABI compatibility.
-//
-// For example, if cudnn_version is 5107, the compatibility version
-// number will be 5100.
-size_t cudnnCompatibilityVersion(size_t cudnn_version) {
-  return (cudnn_version / 100) * 100;
-}
-
 }  // namespace
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 using dnn::BatchDescriptor;
 using dnn::FilterDescriptor;
@@ -109,6 +102,22 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
+#if CUDNN_VERSION >= 6000
+string ToString(libraryPropertyType type) {
+  switch (type) {
+    case MAJOR_VERSION:
+      return "MAJOR_VERSION";
+    case MINOR_VERSION:
+      return "MINOR_VERSION";
+    case PATCH_LEVEL:
+      return "PATCH_LEVEL";
+    default:
+      return port::StrCat(
+          "<unknown libraryPropertyType: ", static_cast<int>(type), ">");
+  }
+}
+#endif
+
 template <typename T>
 cudnnDataType_t GetCudnnDataType();
 
@@ -150,7 +159,7 @@ static port::ThreadPool* GetCudaThreadpool() {
   return cudnn_threadpool;
 }
 
-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                         \
   struct WrapperShim__##__name {                                   \
     template <typename... Args>                                    \
     cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) { \
@@ -160,11 +169,34 @@ static port::ThreadPool* GetCudaThreadpool() {
     }                                                              \
   } __name;
 
+#define STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)           \
+  struct WrapperShim__##__name {                                         \
+    template <typename... Args>                                          \
+    cudnnStatus_t operator()(CudnnSupport* dnn, Stream* s, Args... args) \
+        SHARED_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {                  \
+      CHECK_NOTNULL(s);                                                  \
+      CHECK_EQ(s, dnn->GetCurrentDnnStream())                            \
+          << "Stream is not set correctly!";                             \
+      cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()}; \
+      cudnnStatus_t retval = ::__name(args...);                          \
+      return retval;                                                     \
+    }                                                                    \
+  } __name;
+
+// Handles cudnnSetStream differently in order to add debug information.
+struct WrapperShim__cudnnSetStream {
+  cudnnStatus_t operator()(CudnnSupport* dnn, Stream* stream,
+                           cudnnHandle_t handle)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {
+    dnn->SetCurrentDnnStream(stream);
+    cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()};
+    cudnnStatus_t retval = ::cudnnSetStream(handle, AsCUDAStreamValue(stream));
+    return retval;
+  }
+} cudnnSetStream;
+
 // clang-format off
 #define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnBatchNormalizationBackward)                \
-  __macro(cudnnBatchNormalizationForwardInference)        \
-  __macro(cudnnBatchNormalizationForwardTraining)         \
   __macro(cudnnGetConvolutionNdForwardOutputDim)          \
   __macro(cudnnGetConvolutionForwardAlgorithm)            \
   __macro(cudnnCreateTensorDescriptor)                    \
@@ -181,16 +213,25 @@ static port::ThreadPool* GetCudaThreadpool() {
   __macro(cudnnDestroyConvolutionDescriptor)              \
   __macro(cudnnCreate)                                    \
   __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
   __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
   __macro(cudnnSetConvolutionNdDescriptor)                \
   __macro(cudnnSetTensor4dDescriptor)                     \
   __macro(cudnnSetTensorNdDescriptor)                     \
-  __macro(cudnnSetFilterNdDescriptor)                     \
+  __macro(cudnnSetFilterNdDescriptor)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(__macro)       \
+  __macro(cudnnBatchNormalizationBackward)                \
+  __macro(cudnnBatchNormalizationForwardInference)        \
+  __macro(cudnnBatchNormalizationForwardTraining)         \
+  __macro(cudnnActivationForward)                         \
+  __macro(cudnnConvolutionForward)                        \
+  __macro(cudnnConvolutionBackwardBias)                   \
+  __macro(cudnnTransformTensor)                           \
   __macro(cudnnPoolingForward)                            \
   __macro(cudnnPoolingBackward)                           \
   __macro(cudnnLRNCrossChannelForward)                    \
@@ -198,9 +239,11 @@ static port::ThreadPool* GetCudaThreadpool() {
   __macro(cudnnAddTensor)                                 \
   __macro(cudnnConvolutionBackwardData)                   \
   __macro(cudnnConvolutionBackwardFilter)
-// clang-format on
 
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_WITH_STREAM
 
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
@@ -209,21 +252,22 @@ CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
   __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
   __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
   __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
 #endif
 
 // APIs in R3 but not in R5
 // clang-format off
 #if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 5000
-#define CUDNN_DNN_ROUTINE_EACH_R3(__macro)                    \
+#define CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(__macro)        \
   __macro(cudnnAddTensor_v3)                                  \
   __macro(cudnnConvolutionBackwardData_v3)                    \
   __macro(cudnnConvolutionBackwardFilter_v3)
 // clang-format on
 
-CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R3
+CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM
 #endif
 
 // APIs in R5
@@ -245,48 +289,67 @@ CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
   __macro(cudnnGetRNNTrainingReserveSize)                     \
   __macro(cudnnGetRNNLinLayerMatrixParams)                    \
   __macro(cudnnGetRNNLinLayerBiasParams)                      \
-  __macro(cudnnRNNForwardInference)                           \
-  __macro(cudnnRNNForwardTraining)                            \
-  __macro(cudnnRNNBackwardData)                               \
-  __macro(cudnnRNNBackwardWeights)                            \
   __macro(cudnnSetRNNDescriptor)                              \
   __macro(cudnnGetFilterNdDescriptor)
 
 // clang-format on
-
-CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R5(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R5
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(__macro)        \
+  __macro(cudnnRNNForwardInference)                           \
+  __macro(cudnnRNNForwardTraining)                            \
+  __macro(cudnnRNNBackwardData)                               \
+  __macro(cudnnRNNBackwardWeights)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM
 #endif
 
 // APIs in R6
 // clang-format off
 #if CUDNN_VERSION >= 6000
 #define CUDNN_DNN_ROUTINE_EACH_R6(__macro)                    \
-  __macro(cudnnConvolutionBiasActivationForward)              \
-  __macro(cudnnSetRNNDescriptor_v6)
+  __macro(cudnnSetRNNDescriptor_v6)                           \
+  __macro(cudnnCreatePersistentRNNPlan)                       \
+  __macro(cudnnDestroyPersistentRNNPlan)                      \
+  __macro(cudnnSetPersistentRNNPlan)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R6(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R6
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(__macro)        \
+  __macro(cudnnConvolutionBiasActivationForward)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM
 #endif
 
 // APIs in R7
 // clang-format off
 #if CUDNN_VERSION >= 7000
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro)                    \
-  __macro(cudnnSetConvolutionMathType)
+  __macro(cudnnSetConvolutionMathType)                        \
+  __macro(cudnnSetRNNMatrixMathType)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R7(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R7
 #endif
 
-#undef CUDNN_DNN_ROUTINE_EACH
-
 }  // namespace wrap
 
 namespace {
 
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
+
 cudnnHandle_t ToHandle(void* opaque_handle) {
   return static_cast<cudnnHandle_t>(opaque_handle);
 }
@@ -359,10 +422,55 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
   }
 }
 
+#if CUDNN_VERSION >= 6000
+port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
+  cudnnStatus_t status = cudnnGetProperty(type, value);
+  if (status != CUDNN_STATUS_SUCCESS) {
+    const string error =
+        port::StrCat("cudnnGetProperty failed for type: ", ToString(type),
+                     " with status: ", ToString(status));
+    LOG(ERROR) << error;
+    return port::Status{port::error::INTERNAL, error};
+  }
+  return port::Status::OK();
+}
+
+cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
+  if (algorithm.is_default()) {
+    return CUDNN_RNN_ALGO_STANDARD;
+  } else {
+    cudnnRNNAlgo_t algo = static_cast<cudnnRNNAlgo_t>(algorithm.algo_id());
+    switch (algo) {
+      case CUDNN_RNN_ALGO_STANDARD:
+      case CUDNN_RNN_ALGO_PERSIST_STATIC:
+      case CUDNN_RNN_ALGO_PERSIST_DYNAMIC:
+        return algo;
+      default:
+        LOG(FATAL) << "Unsupported Cudnn RNN algorithm: "
+                   << algorithm.algo_id();
+    }
+  }
+}
+#endif
+
+port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
+#if CUDNN_VERSION >= 6000
+  TF_RETURN_IF_ERROR(GetCudnnProperty(MAJOR_VERSION, &version->major_version));
+  TF_RETURN_IF_ERROR(GetCudnnProperty(MINOR_VERSION, &version->minor_version));
+  TF_RETURN_IF_ERROR(GetCudnnProperty(PATCH_LEVEL, &version->patch_level));
+#else
+  size_t loaded_version = ::cudnnGetVersion();
+  version->major_version = loaded_version / 1000;
+  version->minor_version = (loaded_version / 100) % 10;
+  version->patch_level = loaded_version % 100;
+#endif
+  return port::Status::OK();
+}
+
 }  // namespace
 
 CudnnSupport::CudnnSupport(CUDAExecutor* parent)
-    : parent_(parent), dnn_handle_(nullptr) {}
+    : parent_(parent), dnn_handle_(nullptr), current_dnn_stream_(nullptr) {}
 
 CudnnSupport::~CudnnSupport() {
   auto status = wrap::cudnnDestroy(parent_, ToHandle(dnn_handle_));
@@ -375,24 +483,19 @@ port::Status CudnnSupport::Init() {
   auto status = wrap::cudnnCreate(
       parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_));
   if (status == CUDNN_STATUS_SUCCESS) {
-    // Check whether loaded version of CuDNN matches what the source
-    // was built with.
-    size_t loaded_version = ::cudnnGetVersion();
-    size_t loaded_compat_version = cudnnCompatibilityVersion(loaded_version);
-    size_t compiled_compat_version = cudnnCompatibilityVersion(CUDNN_VERSION);
-    bool library_loaded_matches_source =
-        (loaded_compat_version == compiled_compat_version);
-    if (!library_loaded_matches_source) {
-      const string error =
-          port::StrCat("Loaded runtime CuDNN library: ", loaded_version,
-                       " (compatibility version ", loaded_compat_version,
-                       ") but source was compiled with ", CUDNN_VERSION,
-                       " (compatibility version ", compiled_compat_version,
-                       ").  If using a binary install, upgrade your CuDNN "
-                       "library to match.  If building from sources, "
-                       "make sure the library loaded at runtime matches a "
-                       "compatible version specified during compile "
-                       "configuration.");
+    CudnnVersion source_version(CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
+
+    CudnnVersion loaded_version;
+    TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&loaded_version));
+    if (!IsSourceCompatibleWithCudnnLibrary(source_version, loaded_version)) {
+      const tensorflow::string error = port::StrCat(
+          "Loaded runtime CuDNN library: ", loaded_version.ToString(),
+          " but source was compiled with: ", source_version.ToString(),
+          ".  CuDNN library major and minor version needs to match or have "
+          "higher minor version in case of CuDNN 7.0 or later version. If "
+          "using a binary install, upgrade your CuDNN library.  If building "
+          "from sources, make sure the library loaded at runtime is compatible "
+          "with the version specified during compile configuration.");
       LOG(ERROR) << error;
       return port::Status{port::error::INTERNAL, error};
     }
@@ -425,6 +528,14 @@ port::Status CudnnSupport::Init() {
                                    ToString(status))};
 }
 
+port::StatusOr<perftools::gputools::dnn::VersionInfo>
+CudnnSupport::GetVersion() {
+  CudnnVersion version;
+  TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
+  return perftools::gputools::dnn::VersionInfo(
+      version.major_version, version.minor_version, version.patch_level);
+}
+
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
 class ScopedTensorDescriptor {
  public:
@@ -586,6 +697,19 @@ static bool TensorOpMathEnabled() {
   return is_enabled;
 }
 
+// A helper function to decide whether to enable the TENSOR_OP_MATH math type
+// for RNNs.
+static bool RnnTensorOpMathEnabled() {
+  static bool is_enabled = [] {
+    bool is_disabled = false;
+    TF_CHECK_OK(
+        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
+                                       /*default_val=*/false, &is_disabled));
+    return !is_disabled;
+  }();
+  return is_enabled;
+}
+
 // A helper function to decide whether to use CUDNN_BATCHNORM_SPATIAL_PERSISTENT
 // in batchnorm. This mode can be faster in some tasks because an optimized path
 // may be selected for CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types, compute
@@ -1075,9 +1199,11 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
  public:
   CudnnRnnDescriptor(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
                      int num_layers, int hidden_size, int input_size,
-                     cudnnRNNInputMode_t input_mode,
+                     int batch_size, cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
+                     cudnnDataType_t compute_type,
+                     const dnn::AlgorithmConfig& algorithm_config,
                      float dropout, uint64 seed,
                      ScratchAllocator* state_allocator)
       : parent_(parent),
@@ -1085,10 +1211,16 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         num_layers_(num_layers),
         hidden_size_(hidden_size),
         input_size_(input_size),
+        batch_size_(batch_size),
+#if CUDNN_VERSION >= 6000
+        rnn_plan_(nullptr),
+#endif
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
-        data_type_(data_type) {
+        data_type_(data_type),
+        compute_type_(compute_type),
+        algorithm_config_(algorithm_config) {
     // Create the dropout handle.
     cudnn_dropout_desc_.reset(new CudnnDropoutDescriptor(
         parent, cudnn_handle, dropout, seed, state_allocator));
@@ -1102,20 +1234,36 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
 #if CUDNN_VERSION >= 6000
     // TODO: allow the user to choose an algorithm.
-    cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
+    rnn_algo_ = ToCudnnRNNAlgo(algorithm_config_.algorithm());
     status = wrap::cudnnSetRNNDescriptor_v6(
-        parent, cudnn_handle, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
-        num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
-        input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, rnn_algo /*algo*/, data_type /*dataType*/);
+        parent, cudnn_handle, /*rnnDesc=*/rnn_desc_, /*hiddenSize=*/hidden_size,
+        /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_handle(),
+        /*inputMode=*/input_mode, /*direction=*/direction_mode,
+        /*mode=*/rnn_mode, /*algo=*/rnn_algo_, /*dataType=*/compute_type);
+    CUDNN_RETURN_IF_FAIL(status, ::tensorflow::strings::Printf(
+                                     "Unable to update RNN descriptor with "
+                                     "algo_id: %d and compute_type: %d",
+                                     static_cast<int>(rnn_algo_),
+                                     static_cast<int>(compute_type)));
+
+    if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
+      CHECK_GE(batch_size_, 0);
+      status = wrap::cudnnCreatePersistentRNNPlan(
+          parent, rnn_desc_, batch_size_, data_type_, &rnn_plan_);
+      CUDNN_RETURN_IF_FAIL(status, "Unable to create persistent RNN plan.");
+      status = wrap::cudnnSetPersistentRNNPlan(parent, rnn_desc_, rnn_plan_);
+      CUDNN_RETURN_IF_FAIL(status, "Unable to update persistent RNN plan.");
+    }
 #else
+    CHECK(algorithm_config_.is_default())
+        << "Non-default algorithm not supported for CUDA version < 6.0";
     status = wrap::cudnnSetRNNDescriptor(
         parent, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, data_type /*dataType*/);
-#endif
+        rnn_mode /*mode*/, compute_type /*dataType*/);
     CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
+#endif
 
     // Create the params handle.
     cudnn_params_desc_.reset(
@@ -1124,14 +1272,35 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
       SetFailure(cudnn_params_desc_->Status());
       return;
     }
+    set_use_tensor_op_math(algorithm_config_.algorithm().tensor_ops_enabled());
   }
   ~CudnnRnnDescriptor() override {
     if (rnn_desc_) {
-      cudnnStatus_t status =
-          wrap::cudnnDestroyRNNDescriptor(parent_, rnn_desc_);
+      cudnnStatus_t status;
+#if CUDNN_VERSION >= 6000
+      if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC && rnn_plan_) {
+        status = wrap::cudnnDestroyPersistentRNNPlan(parent_, rnn_plan_);
+        CUDNN_RETURN_IF_FAIL(status, "Unable to destroy persistent RNN plan.");
+      }
+#endif
+      status = wrap::cudnnDestroyRNNDescriptor(parent_, rnn_desc_);
       CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN descriptor");
     }
   }
+  void set_use_tensor_op_math(bool use_tensor_op_math) {
+#if CUDNN_VERSION >= 7000
+    cudnnMathType_t math_type =
+        (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
+    if (RnnTensorOpMathEnabled()) {
+      cudnnStatus_t status =
+          wrap::cudnnSetRNNMatrixMathType(parent_, rnn_desc_, math_type);
+      if (status != CUDNN_STATUS_SUCCESS) {
+        LOG(FATAL) << "could not set cudnn RNN math type: "
+                   << ToString(status);
+      }
+    }
+#endif
+  }
   cudnnRNNDescriptor_t handle() const {
     if (!ok()) return nullptr;
     return rnn_desc_;
@@ -1139,10 +1308,15 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   int num_layers() const { return num_layers_; }
   int hidden_size() const { return hidden_size_; }
   int input_size() const { return input_size_; }
+  int batch_size() const { return batch_size_; }
   cudnnRNNInputMode_t input_mode() const { return input_mode_; }
   cudnnDirectionMode_t direction_mode() const { return direction_mode_; }
   cudnnRNNMode_t rnn_mode() const { return rnn_mode_; }
   cudnnDataType_t data_type() const { return data_type_; }
+  cudnnDataType_t compute_type() const { return compute_type_; }
+  const dnn::AlgorithmConfig& algorithm_config() const {
+    return algorithm_config_;
+  }
   int64 ParamsSizeInBytes() const override {
     return cudnn_params_desc_->params_size_in_bytes();
   }
@@ -1169,10 +1343,19 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   int num_layers_;
   int hidden_size_;
   int input_size_;
+  // batch_size_ is set to -1 when not using CUDNN_RNN_ALGO_PERSIST_DYNAMIC
+  // algorithm.
+  int batch_size_;
+#if CUDNN_VERSION >= 6000
+  cudnnRNNAlgo_t rnn_algo_;
+  cudnnPersistentRNNPlan_t rnn_plan_;
+#endif
   cudnnRNNInputMode_t input_mode_;
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
   cudnnDataType_t data_type_;
+  cudnnDataType_t compute_type_;
+  dnn::AlgorithmConfig algorithm_config_;
   std::unique_ptr<CudnnDropoutDescriptor> cudnn_dropout_desc_;
   std::unique_ptr<CudnnRnnParamsDescriptor> cudnn_params_desc_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
@@ -1545,7 +1728,8 @@ bool CudnnSupport::DoRnnForwardImpl(
     const CudnnRnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<T>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1560,6 +1744,12 @@ bool CudnnSupport::DoRnnForwardImpl(
 
   // check params size
   mutex_lock lock{dnn_handle_mutex_};
+  auto set_stream_status =
+      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: "
+               << ToString(set_stream_status);
+  }
 
   if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
                              input_desc)) {
@@ -1602,10 +1792,25 @@ bool CudnnSupport::DoRnnForwardImpl(
     }
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the forward call
+  cudnnStatus_t status;
   if (!is_training) {
-    cudnnStatus_t status = wrap::cudnnRNNForwardInference(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+    status = wrap::cudnnRNNForwardInference(
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1616,14 +1821,9 @@ bool CudnnSupport::DoRnnForwardImpl(
         output_c_desc.handle() /*cyDesc*/, output_c_data->opaque() /*cy*/,
         workspace.opaque() /*workspace*/,
         workspace.size() /*workSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardInference: "
-                 << ToString(status);
-      return false;
-    }
   } else {
-    cudnnStatus_t status = wrap::cudnnRNNForwardTraining(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+    status = wrap::cudnnRNNForwardTraining(
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1636,8 +1836,24 @@ bool CudnnSupport::DoRnnForwardImpl(
         workspace.size() /*workSpaceSizeInBytes*/,
         reserve_space.opaque() /*reserveSpace*/,
         reserve_space.size() /*reserveSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardTraining"
+  }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    if (status == CUDNN_STATUS_SUCCESS) {
+      auto algo_desc = rnn_desc.algorithm_config().algorithm();
+      output_profile_result->set_algorithm(algo_desc);
+      output_profile_result->set_elapsed_time_in_ms(
+          timer->GetElapsedMilliseconds());
+    }
+  }
+  if (status != CUDNN_STATUS_SUCCESS) {
+    // Silently return when we are profiling.
+    if (!is_profiling) {
+      LOG(ERROR) << "Failed to call "
+                 << (is_training ? "cudnnRNNForwardTraining "
+                                 : "cudnnRNNForwardInference ")
                  << ToString(status);
       return false;
     }
@@ -1669,7 +1885,8 @@ bool CudnnSupport::DoRnnBackwardImpl(
     DeviceMemory<T>* input_c_backprop_data,
     DeviceMemory<T>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1683,6 +1900,12 @@ bool CudnnSupport::DoRnnBackwardImpl(
 
   // check params size
   mutex_lock lock{dnn_handle_mutex_};
+  auto set_stream_status =
+      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: "
+               << ToString(set_stream_status);
+  }
 
   if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
                              input_desc)) {
@@ -1698,12 +1921,27 @@ bool CudnnSupport::DoRnnBackwardImpl(
     return false;
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the backward data call
   cudnnStatus_t status = wrap::cudnnRNNBackwardData(
-      parent_, ToHandle(dnn_handle_) /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-      model_dims.seq_length /*seqLength*/, output_desc.handles() /*yDesc*/,
-      output_data.opaque() /*y*/, output_desc.handles() /*dyDesc*/,
-      output_backprop_data.opaque() /*dy*/, output_h_desc.handle() /*dhyDesc*/,
+      this, stream, ToHandle(dnn_handle_) /*handle*/,
+      rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
+      output_desc.handles() /*yDesc*/, output_data.opaque() /*y*/,
+      output_desc.handles() /*dyDesc*/, output_backprop_data.opaque() /*dy*/,
+      output_h_desc.handle() /*dhyDesc*/,
       output_h_backprop_data.opaque() /*dhy*/,
       output_c_desc.handle() /*dcyDesc*/,
       output_c_backprop_data.opaque() /*dcy*/,
@@ -1718,7 +1956,11 @@ bool CudnnSupport::DoRnnBackwardImpl(
       workspace.size() /*workSpaceSizeInBytes*/,
       reserve_space_data->opaque() /*reserveSpace*/,
       reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+
   if (status != CUDNN_STATUS_SUCCESS) {
+    if (is_profiling) {
+      timer->Stop(AsCUDAStream(stream));
+    }
     LOG(ERROR) << "Failed to call cudnnRNNBackwardData: " << ToString(status);
     return false;
   }
@@ -1728,7 +1970,7 @@ bool CudnnSupport::DoRnnBackwardImpl(
     stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
     // make the backward weight call
     status = wrap::cudnnRNNBackwardWeights(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1740,11 +1982,23 @@ bool CudnnSupport::DoRnnBackwardImpl(
         reserve_space_data->opaque() /*reserveSpace*/,
         reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
     if (status != CUDNN_STATUS_SUCCESS) {
+      if (is_profiling) {
+        timer->Stop(AsCUDAStream(stream));
+      }
       LOG(ERROR) << "Failed to call cudnnRNNBackwardWeights: "
                  << ToString(status);
       return false;
     }
   }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+  }
 
   return true;
 }
@@ -1752,20 +2006,20 @@ bool CudnnSupport::DoRnnBackwardImpl(
 #endif  // CUDNN_VERSION
 
 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
-CudnnSupport::createRnnDescriptor(int num_layers, int hidden_size,
-                                  int input_size, dnn::RnnInputMode input_mode,
-                                  dnn::RnnDirectionMode direction_mode,
-                                  dnn::RnnMode rnn_mode,
-                                  dnn::DataType data_type, float dropout,
-                                  uint64 seed,
-                                  ScratchAllocator* state_allocator) {
+CudnnSupport::createRnnDescriptor(
+    int num_layers, int hidden_size, int input_size, int batch_size,
+    dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+    ScratchAllocator* state_allocator) {
 #if CUDNN_VERSION >= 5000
   mutex_lock lock{dnn_handle_mutex_};
   std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
       parent_, ToHandle(dnn_handle_), num_layers, hidden_size, input_size,
-      ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
-      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type), dropout, seed,
-      state_allocator));
+      batch_size, ToCudnnRnnInputMode(input_mode),
+      ToCudnnRnnDirectionMode(direction_mode), ToCudnnRnnMode(rnn_mode),
+      ToCudnnDataType(data_type), GetRnnComputeType(data_type),
+      algorithm_config, dropout, seed, state_allocator));
   if (!rnn_desc->ok()) {
     return rnn_desc->Status();
   }
@@ -1841,7 +2095,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1862,7 +2117,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1883,7 +2139,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<float>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1904,7 +2161,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1926,7 +2184,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<double>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1947,7 +2206,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1976,7 +2236,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<Eigen::half>* input_c_backprop_data,
     DeviceMemory<Eigen::half>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2000,7 +2261,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2028,7 +2289,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<float>* input_c_backprop_data,
     DeviceMemory<float>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2052,7 +2314,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2081,7 +2343,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<double>* input_c_backprop_data,
     DeviceMemory<double>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2105,7 +2368,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2281,7 +2544,6 @@ struct ConvDoFP32ComputationFP16Input {
 
 // A group of helper functions to return the internal compute type for
 // convolutions in cudnn.
-// TODO(yangzihao): Add support for float64.
 template <typename T>
 cudnnDataType_t GetConvComputeType() {
   return CUDNN_DATA_FLOAT;
@@ -2296,6 +2558,46 @@ cudnnDataType_t GetConvComputeType<Eigen::half>() {
   }
 }
 
+template <>
+cudnnDataType_t GetConvComputeType<double>() {
+  return CUDNN_DATA_DOUBLE;
+}
+
+// A helper struct to decide whether to use FP32 as the internal compute type
+// for rnn when the input data type is FP16. At present it is turned off,
+// users can explicitly control them through an env-var
+// TF_FP16_RNN_USE_FP32_COMPUTE.
+// After the TODO below is fixed, users should almost always use fp32 compute
+// type for training. Using fp16 might suffer suboptimal accuracy due to loss
+// in precision.
+struct RnnDoFP32ComputationFP16Input {
+  static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
+  // TODO(jamesqin): b/78182362 flip to true when cudnn 7.1.4 fixes the bug.
+  // Before cudnn 7.1.4 RNN are always done in fp32, no matter what math
+  // precision is set.
+  // Set it temporary to false s.t. no error is raised when using fp16 inputs,
+  // fp32 math precision.
+  static constexpr bool kDefaultFlag = false;
+};
+
+// A helper function to return the internal compute type for
+// RNNs in cudnn.
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+      return CUDNN_DATA_FLOAT;
+    case dnn::DataType::kDouble:
+      return CUDNN_DATA_DOUBLE;
+    case dnn::DataType::kHalf:
+      if (CudnnEnvVar<RnnDoFP32ComputationFP16Input>::IsEnabled()) {
+        return CUDNN_DATA_FLOAT;
+      } else {
+        return CUDNN_DATA_HALF;
+      }
+    default:
+      LOG(FATAL) << "Invalid RNN data type: " << static_cast<int>(data_type);
+  }
+}
 }  // namespace
 
 template <class T>
@@ -2318,15 +2620,20 @@ bool CudnnSupport::DoConvolveImpl(
                                    GetConvComputeType<T>()};
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
   // Alpha is the scaling factor for input.
-  float alpha = 1.0;
+  float falpha = 1.0;
+  double dalpha = 1.0;
+  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
+                                                : static_cast<void*>(&falpha);
   // Beta is the scaling factor for output.
-  float beta = 0.0;
+  float fbeta = 0.0;
+  double dbeta = 0.0;
+  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
+                                               : static_cast<void*>(&fbeta);
 
   const bool is_profiling = output_profile_result != nullptr;
   cudnnConvolutionFwdAlgo_t algo;
@@ -2463,12 +2770,12 @@ bool CudnnSupport::DoConvolveImpl(
     }
   }
   status = wrap::cudnnConvolutionForward(
-      parent_, ToHandle(dnn_handle_),
-      /*alpha=*/&alpha, /*srcDesc=*/input_nd.handle(),
+      this, stream, ToHandle(dnn_handle_),
+      /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
       /*algo=*/algo, /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/&beta,
+      /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/beta,
       /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
 
   if (is_profiling) {
@@ -2532,8 +2839,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
       static_cast<cudnnDataType_t>(cudnn_compute_type)};
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   CHECK(status == CUDNN_STATUS_SUCCESS)
       << "failed to set stream for cudnn handle: " << ToString(status);
 
@@ -2599,7 +2905,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
           << "\noutput_data->opaque() = " << output_data->opaque();
 
   status = wrap::cudnnConvolutionBiasActivationForward(
-      parent_, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
+      this, stream, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
       /*srcDesc=*/conv_input_nd.handle(), /*srcData=*/conv_input_data.opaque(),
       /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(),
       /*convDesc=*/conv.handle(), algo, /*workSpace=*/scratch.opaque(),
@@ -2669,6 +2975,30 @@ bool CudnnSupport::GetConvolveAlgorithms(
   return true;
 }
 
+bool CudnnSupport::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
+  // clang-format off
+#if CUDNN_VERSION >= 6000
+    CUDNN_RNN_ALGO_STANDARD,
+    CUDNN_RNN_ALGO_PERSIST_STATIC,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
+#endif
+    // clang-format on
+  };
+
+  out_algorithms->clear();
+  for (auto i : algo_types) {
+    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
+#if CUDNN_VERSION >= 7100
+    if (RnnTensorOpMathEnabled()) {
+      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
+    }
+#endif
+  }
+  return true;
+}
+
 bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
@@ -2780,8 +3110,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -2793,7 +3122,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
       parent_, scale_offset_desc, ToCudnnDataType(scale_data_type)};
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7000
-  if (BatchnormSpatialPersistentEnabled()) {
+  if (BatchnormSpatialPersistentEnabled() && is_training) {
     mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
   }
 #endif
@@ -2817,7 +3146,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     }
 
     status = wrap::cudnnBatchNormalizationForwardTraining(
-        parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
         batch_mean_opaque, batch_var_opaque, epsilon, saved_mean->opaque(),
@@ -2834,7 +3163,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     const void* maybe_inv_var = estimated_variance.opaque();
 #endif
     status = wrap::cudnnBatchNormalizationForwardInference(
-        parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(),
         estimated_mean.opaque(), maybe_inv_var, epsilon);
@@ -2885,8 +3214,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -2907,7 +3235,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
   float zero = 0.0;
 
   status = wrap::cudnnBatchNormalizationBackward(
-      parent_, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
+      this, stream, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
       x_descriptor.handle(), x.opaque(), x_descriptor.handle(),
       y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
@@ -2943,10 +3271,14 @@ bool CudnnSupport::DoConvolve(
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<double>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data) {
-  LOG(ERROR) << "double-based DNN not yet implemented";
-  return false;
+    const BatchDescriptor& output_descriptor, DeviceMemory<double>* output_data,
+    ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  return DoConvolveImpl<double>(
+      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
+      algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoConvolve(
@@ -3093,7 +3425,7 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   float alpha = 1.0f;
   float beta = 0.0f;
   auto status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
       backward_output_data.opaque(), &beta, transformed_out_back_nd.handle(),
       (*transform_scratch)->mutable_device_memory()->opaque());
 
@@ -3112,13 +3444,18 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
+  }
+
   float beta = 0.0f;
   ScopedTensorDescriptor input_tensor_desc(
       parent_, input_desc, ToCudnnDataType(input_type, input_desc.layout()));
   ScopedTensorDescriptor output_tensor_desc(
       parent_, output_desc, ToCudnnDataType(output_type, output_desc.layout()));
-  cudnnStatus_t status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
+  status = wrap::cudnnTransformTensor(
+      this, stream, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
       input_data.opaque(), &beta, output_tensor_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3145,16 +3482,22 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
 
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   // Alpha is the scaling factor for input.
-  float alpha = 1.0;
+  float falpha = 1.0;
+  double dalpha = 1.0;
+  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
+                                                : static_cast<void*>(&falpha);
   // Beta is the scaling factor for output.
-  float beta = 0.0;
+  float fbeta = 0.0;
+  double dbeta = 0.0;
+  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
+                                               : static_cast<void*>(&fbeta);
 
   // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass.
   BatchDescriptor output_descriptor;
@@ -3163,7 +3506,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
   backward_output_data = MaybeTransformLayout(
       stream, &output_descriptor, backward_output_data, &transform_scratch);
 
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
   ScopedTensorDescriptor in_back_nd{parent_, input_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
@@ -3309,8 +3651,8 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 #else
   status = wrap::cudnnConvolutionBackwardData_v3(
 #endif
-      parent_, ToHandle(dnn_handle_),
-      /*alpha=*/&alpha,
+      this, stream, ToHandle(dnn_handle_),
+      /*alpha=*/alpha,
       /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(),
       /*diffDesc=*/out_back_nd.handle(),
@@ -3319,7 +3661,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
       /*algo=*/algo,
       /*workSpace=*/scratch.opaque(),
       /*workSpaceSizeInBytes=*/scratch.size(),
-      /*beta=*/&beta,
+      /*beta=*/beta,
       /*gradDesc=*/in_back_nd.handle(),
       /*gradData=*/backward_input_data->opaque());
   if (is_profiling) {
@@ -3344,10 +3686,28 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
   return true;
 }
 
+bool CudnnSupport::DoConvolveBackwardData(
+    Stream* stream, const FilterDescriptor& filter_descriptor,
+    const DeviceMemory<double>& filter_data,
+    const BatchDescriptor& output_descriptor,
+    DeviceMemory<double> backward_output_data,
+    const ConvolutionDescriptor& convolution_descriptor,
+    const BatchDescriptor& input_descriptor,
+    DeviceMemory<double>* backward_input_data,
+    ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                    output_descriptor, backward_output_data,
+                                    convolution_descriptor, input_descriptor,
+                                    backward_input_data, scratch_allocator,
+                                    algorithm_config, output_profile_result);
+}
+
 bool CudnnSupport::DoConvolveBackwardData(
     Stream* stream, const FilterDescriptor& filter_descriptor,
     const DeviceMemory<float>& filter_data,
-    const BatchDescriptor& output_descriptor_in,
+    const BatchDescriptor& output_descriptor,
     DeviceMemory<float> backward_output_data,
     const ConvolutionDescriptor& convolution_descriptor,
     const BatchDescriptor& input_descriptor,
@@ -3356,7 +3716,7 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                    output_descriptor_in, backward_output_data,
+                                    output_descriptor, backward_output_data,
                                     convolution_descriptor, input_descriptor,
                                     backward_input_data, scratch_allocator,
                                     algorithm_config, output_profile_result);
@@ -3365,7 +3725,7 @@ bool CudnnSupport::DoConvolveBackwardData(
 bool CudnnSupport::DoConvolveBackwardData(
     Stream* stream, const FilterDescriptor& filter_descriptor,
     const DeviceMemory<Eigen::half>& filter_data,
-    const BatchDescriptor& output_descriptor_in,
+    const BatchDescriptor& output_descriptor,
     DeviceMemory<Eigen::half> backward_output_data,
     const ConvolutionDescriptor& convolution_descriptor,
     const BatchDescriptor& input_descriptor,
@@ -3374,7 +3734,7 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                    output_descriptor_in, backward_output_data,
+                                    output_descriptor, backward_output_data,
                                     convolution_descriptor, input_descriptor,
                                     backward_input_data, scratch_allocator,
                                     algorithm_config, output_profile_result);
@@ -3392,16 +3752,22 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
 
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   // Alpha is the scaling factor for input.
-  float alpha = 1.0;
+  float falpha = 1.0;
+  double dalpha = 1.0;
+  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
+                                                : static_cast<void*>(&falpha);
   // Beta is the scaling factor for output.
-  float beta = 0.0;
+  float fbeta = 0.0;
+  double dbeta = 0.0;
+  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
+                                               : static_cast<void*>(&fbeta);
 
   // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass.
   BatchDescriptor output_descriptor;
@@ -3410,7 +3776,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
   backward_output_data = MaybeTransformLayout(
       stream, &output_descriptor, backward_output_data, &transform_scratch);
 
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
   ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
@@ -3557,7 +3922,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
 #else
   status = wrap::cudnnConvolutionBackwardFilter_v3(
 #endif
-      parent_, ToHandle(dnn_handle_), /*alpha=*/&alpha,
+      this, stream, ToHandle(dnn_handle_), /*alpha=*/alpha,
       /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(),
       /*diffDesc=*/out_back_nd.handle(),
@@ -3566,7 +3931,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
       /*algo=*/algo,
       /*workSpace=*/scratch.opaque(),
       /*workSpaceSizeInBytes=*/scratch.size(),
-      /*beta=*/&beta,
+      /*beta=*/beta,
       /*gradDesc=*/filter.handle(),
       /*gradData=*/backward_filter_data->opaque());
 
@@ -3592,10 +3957,28 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
   return true;
 }
 
+bool CudnnSupport::DoConvolveBackwardFilter(
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<double>& input_data,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<double> backward_output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemory<double>* backward_filter_data,
+    ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  return DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
+                                      output_descriptor, backward_output_data,
+                                      convolution_descriptor, filter_descriptor,
+                                      backward_filter_data, scratch_allocator,
+                                      algorithm_config, output_profile_result);
+}
+
 bool CudnnSupport::DoConvolveBackwardFilter(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<float>& input_data,
-    const dnn::BatchDescriptor& output_descriptor_in,
+    const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<float> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::FilterDescriptor& filter_descriptor,
@@ -3603,17 +3986,17 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardFilterImpl(
-      stream, input_descriptor, input_data, output_descriptor_in,
-      backward_output_data, convolution_descriptor, filter_descriptor,
-      backward_filter_data, scratch_allocator, algorithm_config,
-      output_profile_result);
+  return DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
+                                      output_descriptor, backward_output_data,
+                                      convolution_descriptor, filter_descriptor,
+                                      backward_filter_data, scratch_allocator,
+                                      algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<Eigen::half>& input_data,
-    const dnn::BatchDescriptor& output_descriptor_in,
+    const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<Eigen::half> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::FilterDescriptor& filter_descriptor,
@@ -3621,11 +4004,11 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardFilterImpl(
-      stream, input_descriptor, input_data, output_descriptor_in,
-      backward_output_data, convolution_descriptor, filter_descriptor,
-      backward_filter_data, scratch_allocator, algorithm_config,
-      output_profile_result);
+  return DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
+                                      output_descriptor, backward_output_data,
+                                      convolution_descriptor, filter_descriptor,
+                                      backward_filter_data, scratch_allocator,
+                                      algorithm_config, output_profile_result);
 }
 
 template <class T>
@@ -3635,8 +4018,7 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3651,7 +4033,7 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
   float beta = 0.0;
 
   status = wrap::cudnnConvolutionBackwardBias(
-      parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
       input_data.opaque(), &beta, bias_nd.handle(),
       backward_bias_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3856,8 +4238,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
   }
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3871,7 +4252,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
 #else
   status = wrap::cudnnAddTensor_v3(
 #endif
-      parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
       biases.opaque(), &beta, input_descriptor.handle(), output_data->opaque());
 
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3889,8 +4270,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
                               DeviceMemory<float>* output_data,
                               uint64 options) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3934,7 +4314,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
   // Beta is the output scaling factor.
   float beta = 0.0;
   status = wrap::cudnnActivationForward(
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
 #if CUDNN_VERSION >= 5000
       activation_desc.handle(),
 #else
@@ -3958,8 +4338,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<double>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3975,7 +4354,7 @@ bool CudnnSupport::DoPoolForward(
                                    CUDNN_DATA_DOUBLE};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3993,8 +4372,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<float>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4010,7 +4388,7 @@ bool CudnnSupport::DoPoolForward(
                                    CUDNN_DATA_FLOAT};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4028,8 +4406,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<Eigen::half>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4044,7 +4421,7 @@ bool CudnnSupport::DoPoolForward(
   ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4064,8 +4441,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<double>& input_diff_data,
     DeviceMemory<double>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4081,7 +4457,7 @@ bool CudnnSupport::DoPoolBackward(
                                    CUDNN_DATA_DOUBLE};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4102,8 +4478,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<float>& input_diff_data,
     DeviceMemory<float>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4119,7 +4494,7 @@ bool CudnnSupport::DoPoolBackward(
                                    CUDNN_DATA_FLOAT};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4140,8 +4515,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<Eigen::half>& input_diff_data,
     DeviceMemory<Eigen::half>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4156,7 +4530,7 @@ bool CudnnSupport::DoPoolBackward(
   ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4191,8 +4565,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
 
   // Launch the normalization.
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4207,7 +4580,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
   float beta = 0.0f;
 
   status = wrap::cudnnLRNCrossChannelForward(
-      parent_, ToHandle(dnn_handle_), normalize.handle(),
+      this, stream, ToHandle(dnn_handle_), normalize.handle(),
       CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(), input_data.opaque(),
       &beta, dims.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4234,8 +4607,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   }
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4248,7 +4620,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   float beta = 0.0f;
 
   status = wrap::cudnnLRNCrossChannelBackward(
-      parent_, ToHandle(dnn_handle_), normalize.handle(),
+      this, stream, ToHandle(dnn_handle_), normalize.handle(),
       CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(),
       normalized_data.opaque(), dims.handle(),
       normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
@@ -4397,46 +4769,39 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cudnn() {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::DnnFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN",
-              [](gpu::internal::StreamExecutorInterface*
-                     parent) -> gpu::dnn::DnnSupport* {
-                gpu::cuda::CUDAExecutor* cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor*>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuBLAS "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CudnnSupport* dnn =
-                    new gpu::cuda::CudnnSupport(cuda_executor);
-                if (!dnn->Init().ok()) {
-                  // Note: Init() will log a more specific error.
-                  delete dnn;
-                  return nullptr;
-                }
-                return dnn;
-              });
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
+          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
+          [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
+            cuda::CUDAExecutor* cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor*>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuBLAS "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
+            if (!dnn->Init().ok()) {
+              // Note: Init() will log a more specific error.
+              delete dnn;
+              return nullptr;
+            }
+            return dnn;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuDNN factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kDnn,
-                                                     gpu::cuda::kCuDnnPlugin);
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(register_cudnn,
-                            { perftools::gputools::initialize_cudnn(); });
+                            { stream_executor::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 40aa974dd967df50075da6f2bb34439cd238a113..7d53dbe4a5c50c84182e6b63c3b9d1d62e5837e3 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -46,12 +45,14 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
+  port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
-      int num_layers, int hidden_size, int input_size,
+      int num_layers, int hidden_size, int input_size, int batch_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator* state_allocator) override;
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
   createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
@@ -77,7 +78,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -94,7 +96,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<float>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -111,7 +114,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<double>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -135,7 +139,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* input_c_backprop_data,
                      DeviceMemory<Eigen::half>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -159,7 +164,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<float>* input_c_backprop_data,
                      DeviceMemory<float>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -183,12 +189,16 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<double>* input_c_backprop_data,
                      DeviceMemory<double>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool GetConvolveAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
 
+  bool GetRnnAlgorithms(
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
@@ -259,7 +269,10 @@ class CudnnSupport : public dnn::DnnSupport {
                   const DeviceMemory<double>& filter_data,
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
                   const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<double>* output_data) override;
+                  DeviceMemory<double>* output_data,
+                  ScratchAllocator* scratch_allocator,
+                  const dnn::AlgorithmConfig& algorithm_config,
+                  dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
                   const DeviceMemory<Eigen::half>& input_data,
@@ -371,6 +384,18 @@ class CudnnSupport : public dnn::DnnSupport {
     return false;
   }
 
+  bool DoConvolveBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<double>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result) override;
+
   bool DoConvolveBackwardData(
       Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<float>& filter_data,
@@ -395,6 +420,18 @@ class CudnnSupport : public dnn::DnnSupport {
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result) override;
 
+  bool DoConvolveBackwardFilter(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<double>& input_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemory<double>* backward_filter_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result) override;
+
   bool DoConvolveBackwardFilter(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<float>& input_data,
@@ -587,10 +624,27 @@ class CudnnSupport : public dnn::DnnSupport {
                          dnn::DataType output_type, float scale,
                          DeviceMemoryBase* output_data) override;
 
- private:
-  // Guards the enqueueing of DNN operations via the dnn_handle_ below.
+  const Stream* GetCurrentDnnStream() const
+      SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    return current_dnn_stream_;
+  }
+
+  void SetCurrentDnnStream(Stream* stream)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    current_dnn_stream_ = stream;
+  }
+
+  CUDAExecutor* GetParentExecutor() { return parent_; }
+
+  // Guards the enqueueing of DNN operations via the dnn_handle_ below, and
+  // access to current_dnn_stream_.
+  //
+  // This is a public member because we need to add thread safty annotations in
+  // the cudnn wrapper functions in the cc file, which need to access this
+  // mutex (the annotations require C++ permission checks).
   mutex dnn_handle_mutex_;
 
+ private:
   CUDAExecutor* parent_;  // Parent executor object. Not owned.
 
   // cudnn library handle. cudnnHandle_t type is not present in this header to
@@ -598,6 +652,9 @@ class CudnnSupport : public dnn::DnnSupport {
   // single cuda_dnn translation unit.
   void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_);
 
+  // The current cudnn stream that is set by cudnnSetStream().
+  Stream* current_dnn_stream_ GUARDED_BY(dnn_handle_mutex_);
+
   // NOTE(keveman): Temporary data layout transformation until cuDNN supports
   // kBatchYXDepth for backward pass. This function allocates temporary memory,
   // lays out the source data into the temporary but in the kBatchDepthXY
@@ -719,7 +776,8 @@ class CudnnSupport : public dnn::DnnSupport {
                         const CudnnRnnStateTensorDescriptor& output_c_desc,
                         DeviceMemory<T>* output_c_data, bool is_training,
                         ScratchAllocator* reserve_space_allocator,
-                        ScratchAllocator* workspace_allocator);
+                        ScratchAllocator* workspace_allocator,
+                        dnn::ProfileResult* output_profile_result);
 
   template <class T>
   bool DoRnnBackwardImpl(Stream* stream, const CudnnRnnDescriptor& rnn_desc,
@@ -744,13 +802,13 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemory<T>* input_c_backprop_data,
                          DeviceMemory<T>* params_backprop_data,
                          DeviceMemory<uint8>* reserve_space_data,
-                         ScratchAllocator* workspace_allocator);
+                         ScratchAllocator* workspace_allocator,
+                         dnn::ProfileResult* output_profile_result);
 
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index a017ff64d4c69b6952b442464877dc26a800ad37..71cab145b9bb5a8320df8fb78241dd0d37d54239 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -37,14 +37,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-#if defined(PLATFORM_WINDOWS)
-// TODO: in windows ARRAYSIZE is defined in winnt.h but including it
-//  here creates a conflict with cuda.h - for now define it here.
-#define ARRAYSIZE(a) \
-  ((sizeof(a) / sizeof(*(a))) / \
-  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
 bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
 bool FLAGS_gpuexec_cuda_device_0_only = false;
@@ -53,8 +45,7 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
 // matches the expected one.
 constexpr bool kVerifyCudaContext = false;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 namespace {
@@ -720,15 +711,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
         port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
         port::bit_cast<void *>(info_log_buffer.data()),
         port::bit_cast<void *>(uintptr_t(log_verbose))};
-    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
     {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
 
-      res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
-                               option_values);
+      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options),
+                               options, option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
@@ -1503,6 +1494,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
+/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
+    CUdevice_attribute attribute, CUdevice device) {
+  int val;
+  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
+  if (res != CUDA_SUCCESS) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf("failed to get device attribute %d for device %d: %s",
+                     attribute, device, ToString(res).c_str())};
+  }
+  return val;
+}
+
 /* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
   int value = -1;
   CUresult res =
@@ -1636,5 +1640,4 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 4002ba2021d1a2e2c36bd1786a3084ee8c08bb78..a9969e247e181599f2b3707f6c65c6527dd4683d 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "cuda/include/cuda.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Identifies the memory space where an allocation resides. See
@@ -400,12 +399,20 @@ class CUDADriver {
 
   // Returns a grab-bag of device properties in a caller-owned device_properties
   // structure for device_ordinal via cuDeviceGetProperties.
-  // This call is deprecated in the NVIDIA driver API.
+  //
+  // This call is deprecated in the NVIDIA driver API; its replacement is
+  // GetDeviceAttribute
   //
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
   static bool GetDeviceProperties(CUdevprop *device_properties,
                                   int device_ordinal);
 
+  // Gets a specific integer-valued property about the given device.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
+                                                CUdevice device);
+
   // Returns whether ECC is enabled for the given CUdevice via
   // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
@@ -498,7 +505,6 @@ class CudaContext {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc
index 1b41502300d446b55708c2afb6f5538bdf6cf220..96dcf173566087db475e3b237591d19f06128d92 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.cc
+++ b/tensorflow/stream_executor/cuda/cuda_event.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 CUDAEvent::CUDAEvent(CUDAExecutor* parent)
@@ -68,5 +67,4 @@ const CUevent& CUDAEvent::cuda_event() {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index 56667e65d38199fd4c340147c4e40a17c5bb2b2d..f62344672ed624f1ed60b5452d33b6f8273f2b47 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // CUDAEvent wraps a CUevent in the platform-independent EventInterface
@@ -58,7 +57,6 @@ class CUDAEvent : public internal::EventInterface {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index a922f14fb4af695877b449d2f960fae1a356a82f..5b34740f9f1f9067f949ad41bcae0b97d3d3c7f4 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
@@ -44,7 +43,7 @@ namespace wrap {
 // manner on first use. This dynamic loading technique is used to avoid DSO
 // dependencies on vendor libraries which may or may not be available in the
 // deployed binary environment.
-#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name)                    \
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                       \
   struct WrapperShim__##__name {                                 \
     template <typename... Args>                                  \
     cufftResult operator()(CUDAExecutor *parent, Args... args) { \
@@ -68,7 +67,7 @@ namespace wrap {
                                               __macro(cufftGetSizeMany)        \
                                                   __macro(cufftMakePlanMany)
 
-CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP)
+CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
 
 }  // namespace wrap
 
@@ -514,62 +513,59 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
   return true;
 }
 
-#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2, \
-                                           __fft_type3)                      \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<std::complex<__type>> &input,       \
-                      DeviceMemory<std::complex<__type>> *output) {          \
-    return DoFftWithDirectionInternal(                                       \
-        stream, plan, wrap::cufftExec##__fft_type1, input, output);          \
-  }                                                                          \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<__type> &input,                     \
-                      DeviceMemory<std::complex<__type>> *output) {          \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input,  \
-                         output);                                            \
-  }                                                                          \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<std::complex<__type>> &input,       \
-                      DeviceMemory<__type> *output) {                        \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input,  \
-                         output);                                            \
+#define STREAM_EXECUTOR_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2,   \
+                                        __fft_type3)                        \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<std::complex<__type>> &input,      \
+                      DeviceMemory<std::complex<__type>> *output) {         \
+    return DoFftWithDirectionInternal(                                      \
+        stream, plan, wrap::cufftExec##__fft_type1, input, output);         \
+  }                                                                         \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<__type> &input,                    \
+                      DeviceMemory<std::complex<__type>> *output) {         \
+    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input, \
+                         output);                                           \
+  }                                                                         \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<std::complex<__type>> &input,      \
+                      DeviceMemory<__type> *output) {                       \
+    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input, \
+                         output);                                           \
   }
 
-PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
-PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+STREAM_EXECUTOR_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
+STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
 
-#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT
+#undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
-
-namespace gpu = ::perftools::gputools;
-
-REGISTER_MODULE_INITIALIZER(register_cufft, {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::FftFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::fft::FftSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuFFT "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                return new gpu::cuda::CUDAFft(cuda_executor);
-              });
+
+void initialize_cufft() {
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
+          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
+          [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
+                         << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            return new cuda::CUDAFft(cuda_executor);
+          });
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuFFT factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kFft,
-                                                     gpu::cuda::kCuFftPlugin);
-});
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_cufft,
+                            { stream_executor::initialize_cufft(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 04c7dfe501c451e4848bef68bed9685c079dd523..8171e61418a3185455e50ee76315eb2493c36c01 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -133,7 +132,6 @@ class CUDAFft : public fft::FftSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4bbd531e14f18fc24d87b4fa655fe72e9f56b129..7c87d33d21b58af477a541953b9030e581cfa46d 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -66,8 +66,7 @@ limitations under the License.
 extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
@@ -1103,6 +1102,18 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_device_memory_size(device_memory_size);
   }
 
+  port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
+  port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
+  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
+    // Times 2 because HBM is DDR memory; it gets two data bits per each data
+    // lane.
+    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
+                                 1000 *
+                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
+  }
+
   {
     BlockDim block_dim_limit;
     FillBlockDimLimit(&block_dim_limit);
@@ -1115,7 +1126,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_name(device_name);
   }
 
-  for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
+  for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
     const auto &params = kAllUnqueryableDeviceParams[i];
     if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
       builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
@@ -1156,17 +1167,14 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cuda_gpu_executor() {
-  *gpu::internal::MakeCUDAExecutorImplementation() = [](
-      const gpu::PluginConfig &config) {
-    return new gpu::cuda::CUDAExecutor{config};
+  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
+    return new cuda::CUDAExecutor{config};
   };
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(
-    cuda_gpu_executor, {perftools::gputools::initialize_cuda_gpu_executor();});
+REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {
+  stream_executor::initialize_cuda_gpu_executor();
+});
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index dbbbcd476f096ff912d391604ba349f6cb979478..f686685474b35acfb54c327401500c42109006d0 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -35,8 +35,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // CUDA-platform implementation of the platform-agnostic
@@ -273,7 +272,6 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index 6a6134bf881646991065cba536e955ef7c29e88c..d55706c66a9b47abfe125eaaa09e4b0cc543622a 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "cuda/include/cuComplex.h"
 #include "cuda/include/cuda.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 template <typename ElemT>
 class DeviceMemory;
@@ -101,7 +100,6 @@ inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index 88d29fddd06ad7c07bf8e90f490db2f4458e3684..beaebe8f1233533053c97bbac7eb283deaf96a2c 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -40,8 +40,7 @@ limitations under the License.
     "CUDA runtime being included into CUDA GPU executor; should be driver only."
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Wraps a CUfunction to implement the platform-independent KernelInterface.
@@ -124,7 +123,6 @@ inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 3a738461489212a026197bc58777883349ba4b54..649224a20e959a27896ec1c1ee61333a3a3c1ec1 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 namespace {
 
@@ -41,16 +40,16 @@ const DeviceOptions GetDeviceOptionsFromEnv() {
       std::getenv("TF_CUDA_PLATFORM_GPU_DEVICE_SCHEDULE");
 
   if (gpu_schedule_string == nullptr) {
-    return perftools::gputools::DeviceOptions::Default();
+    return DeviceOptions::Default();
   }
 
   unsigned device_flags = 0;
   if (strcmp(kScheduleSpinString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleSpin;
+    device_flags = DeviceOptions::kScheduleSpin;
   } else if (strcmp(kScheduleYieldString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleYield;
+    device_flags = DeviceOptions::kScheduleYield;
   } else if (strcmp(kScheduleBlockingSyncString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleBlockingSync;
+    device_flags = DeviceOptions::kScheduleBlockingSync;
   } else {
     LOG(QFATAL) << "Unknown option for environment variable "
                    "TF_CUDA_PLATFORM_GPU_DEVICE_SCHEDULE "
@@ -59,7 +58,7 @@ const DeviceOptions GetDeviceOptionsFromEnv() {
                 << ", " << kScheduleYieldString << "}";
   }
 
-  return perftools::gputools::DeviceOptions(device_flags);
+  return DeviceOptions(device_flags);
 }
 
 }  // namespace
@@ -169,8 +168,8 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<CUDAExecutor>(config.plugin_config));
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<CUDAExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
@@ -202,11 +201,10 @@ static void InitializeCudaPlatform() {
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(cuda_platform,
-                            perftools::gputools::InitializeCudaPlatform());
+                            stream_executor::InitializeCudaPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index dab25602d08f394a2f8504ef9affe92552e3311f..fc0e15d5a6a9142f064085d34fcfaedfb25f433a 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Opaque and unique identifier for the CUDA platform plugin.
@@ -104,7 +103,6 @@ class CudaPlatform : public Platform {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform_id.cc b/tensorflow/stream_executor/cuda/cuda_platform_id.cc
index dfd11a9abe87852924d5a0f52454ebd432f566e4..a7bb304cc8cde52dc8aa4512e911de3f40188100 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform_id.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform_id.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLATFORM_DEFINE_ID(kCudaPlatformId);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_platform_id.h b/tensorflow/stream_executor/cuda/cuda_platform_id.h
index c677724517c1394284fb8b723933fa4b71d48ceb..92bcfd8372262dc127b71d399b6249efa9eff5dc 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform_id.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform_id.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Opaque and unique identifier for the cuda platform.
@@ -30,7 +29,6 @@ namespace cuda {
 extern const Platform::Id kCudaPlatformId;
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 8641b6022776048baf93723d914d68b4a73f881c..e289e7ced57b164412b0fd78b538c3b08c7db5aa 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -54,15 +54,14 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
   }
 }
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_CURAND_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
     curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
@@ -71,15 +70,15 @@ namespace wrap {
     }                                                               \
   } __name;
 
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandCreateGenerator);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandDestroyGenerator);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetStream);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniform);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniformDouble);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetGeneratorOffset);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormal);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormalDouble);
+STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
+STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniform);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniformDouble);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetGeneratorOffset);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormal);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
 
 }  // namespace wrap
 
@@ -271,42 +270,40 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
-
-namespace gpu = ::perftools::gputools;
-
-REGISTER_MODULE_INITIALIZER(register_curand, {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::RngFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuRandPlugin, "cuRAND",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::rng::RngSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuRAND "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CUDARng *rng = new gpu::cuda::CUDARng(cuda_executor);
-                if (!rng->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete rng;
-                  return nullptr;
-                }
-                return rng;
-              });
+
+void initialize_curand() {
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
+          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
+          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuRAND "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
+            if (!rng->Init()) {
+              // Note: Init() will log a more specific error.
+              delete rng;
+              return nullptr;
+            }
+            return rng;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuRAND factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kRng,
-                                                     gpu::cuda::kCuRandPlugin);
-});
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_curand,
+                            { stream_executor::initialize_curand(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h
index 5bbfd0b37a0dafb525e2ed57ccec2daa0a0c52d8..57ef398aaa88da7de769c49820325c6c9feb4d70 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.h
+++ b/tensorflow/stream_executor/cuda/cuda_rng.h
@@ -24,8 +24,7 @@ limitations under the License.
 
 typedef struct curandGenerator_st *curandGenerator_t;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -98,7 +97,6 @@ class CUDARng : public rng::RngSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/cuda/cuda_stream.cc
index 3eb37a7d84189cacb54d1e45fb0113030a9402f2..b5aa7694f7e1d8d47f3252d3ba679292155119b5 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.cc
+++ b/tensorflow/stream_executor/cuda/cuda_stream.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool CUDAStream::Init() {
@@ -59,5 +58,4 @@ CUstream AsCUDAStreamValue(Stream *stream) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
index 7358243dc4087194aa67da9958546c0487e95b8a..02edff643117fc2e3c6e6f74d2932f3f4c00c66d 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.h
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -89,7 +88,6 @@ CUDAStream *AsCUDAStream(Stream *stream);
 CUstream AsCUDAStreamValue(Stream *stream);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
index 4bd5503348f4dc92a0ce3c18aaf0128174a94121..991a12a23d632bd9fb4c97a340e244f6ffb4c7d3 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -20,23 +20,24 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool CUDATimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
   CudaContext* context = parent_->cuda_context();
-  if (!CUDADriver::CreateEvent(context, &start_event_,
-                               CUDADriver::EventFlags::kDefault)
-           .ok()) {
+  port::Status status = CUDADriver::CreateEvent(
+      context, &start_event_, CUDADriver::EventFlags::kDefault);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
     return false;
   }
 
-  if (!CUDADriver::CreateEvent(context, &stop_event_,
-                               CUDADriver::EventFlags::kDefault)
-           .ok()) {
-    port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+  status = CUDADriver::CreateEvent(context, &stop_event_,
+                                   CUDADriver::EventFlags::kDefault);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    status = CUDADriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
       LOG(ERROR) << status;
     }
@@ -71,18 +72,23 @@ float CUDATimer::GetElapsedMilliseconds() const {
   return elapsed_milliseconds;
 }
 
-bool CUDATimer::Start(CUDAStream *stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), start_event_,
-                                 stream->cuda_stream())
-      .ok();
+bool CUDATimer::Start(CUDAStream* stream) {
+  port::Status status = CUDADriver::RecordEvent(
+      parent_->cuda_context(), start_event_, stream->cuda_stream());
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
-bool CUDATimer::Stop(CUDAStream *stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), stop_event_,
-                                 stream->cuda_stream())
-      .ok();
+bool CUDATimer::Stop(CUDAStream* stream) {
+  port::Status status = CUDADriver::RecordEvent(
+      parent_->cuda_context(), stop_event_, stream->cuda_stream());
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 4a2714dc1fada4f6081f7edc421660bef300a5b7..70554ec93120fcb0251ba0995a1ce9d6e5997016 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -60,13 +59,13 @@ class CUDATimer : public internal::TimerInterface {
   // events.
   float GetElapsedMilliseconds() const;
 
-  // See perftools::gputools::Timer::Microseconds().
+  // See Timer::Microseconds().
   // TODO(leary) make this into an error code interface...
   uint64 Microseconds() const override {
     return GetElapsedMilliseconds() * 1e3;
   }
 
-  // See perftools::GPUTools::Timer::Nanoseconds().
+  // See Timer::Nanoseconds().
   uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
 
  private:
@@ -77,8 +76,14 @@ class CUDATimer : public internal::TimerInterface {
                          // executing in a stream.
 };
 
+struct TimerDeleter {
+  void operator()(CUDATimer *t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8fcc0361850a561928d09f29f78fb57071c24b2
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
+
+namespace stream_executor {
+namespace cuda {
+
+bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
+                                        CudnnVersion loaded_version) {
+  // Major version is neither forward or backward compatible and therefore major
+  // versions needs to match between source and library.
+  //
+  // Minor version is backward-compatible beginning with CuDNN 7 and therefore
+  // minor version of library needs to be same or higher.
+  //
+  // Patch releases are always forward and backward compatible and therefore
+  // need not match.
+  if (loaded_version.major_version != source_version.major_version) {
+    return false;
+  }
+  return ((loaded_version.minor_version == source_version.minor_version) ||
+          (source_version.major_version >= 7 &&
+           loaded_version.minor_version >= source_version.minor_version));
+}
+
+}  // namespace cuda
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..6464e7f8e8755b5b46b90a4b35d50509eb0cfde7
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
+
+#include <string>
+
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace stream_executor {
+namespace cuda {
+
+struct CudnnVersion {
+  CudnnVersion() = default;
+
+  CudnnVersion(int major, int minor, int patch)
+      : major_version(major), minor_version(minor), patch_level(patch) {}
+
+  tensorflow::string ToString() const {
+    return tensorflow::strings::StrCat(major_version, ".", minor_version, ".",
+                                       patch_level);
+  }
+
+  int major_version;
+  int minor_version;
+  int patch_level;
+};
+
+// Returns true if the given source CuDNN version is compatible with the given
+// loaded version.
+bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
+                                        CudnnVersion loaded_version);
+
+}  // namespace cuda
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d4c6399d040e9bcddff5d98d202ab00fdeffa58
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace stream_executor {
+namespace cuda {
+namespace {
+
+TEST(CuDNNVersion, ToString) {
+  CudnnVersion version(7, 0, 12);
+  EXPECT_EQ(version.ToString(), "7.0.12");
+}
+
+TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
+  // Returns true if both major and minor versions are matching and even if the
+  // patch versions are not matching.
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 12),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 1, 14),
+      /*loaded_version=*/CudnnVersion(6, 1, 00)));
+
+  // Returns false if major versions are not matching as they are neither
+  // forward or backward compatible.
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 12),
+      /*loaded_version=*/CudnnVersion(6, 1, 14)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(8, 1, 15),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+
+  // Returns true if the loaded version is equal or higher because minor version
+  // are backward compatible with CuDNN version 7.
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 14),
+      /*loaded_version=*/CudnnVersion(7, 1, 14)));
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 14),
+      /*loaded_version=*/CudnnVersion(7, 1, 15)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 1, 15),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+
+  // Returns false if minor versions are not matching for version 6. Before
+  // version 7, minor versions are also neither forward or backward compatible.
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 0, 14),
+      /*loaded_version=*/CudnnVersion(6, 1, 15)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 1, 14),
+      /*loaded_version=*/CudnnVersion(6, 0, 14)));
+}
+
+}  // namespace
+}  // namespace cuda
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index a98143e34bbb42c3aee76c27e1648c49397a0e44..8ca0677f8a3b3173fd023bd120526a490c2e7878 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 static const uint64 kUninitializedUint64 = -1ULL;
 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
@@ -50,6 +49,7 @@ DeviceDescription::DeviceDescription()
       shared_memory_alloc_granularity_(1),
       device_address_bits_(kUninitializedUint64),
       device_memory_size_(kUninitializedUint64),
+      memory_bandwidth_(kUninitializedUint64),
       shared_memory_per_core_(kUninitializedUint64),
       shared_memory_per_block_(kUninitializedUint64),
       clock_rate_ghz_(-1.0),
@@ -85,6 +85,8 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
   result["Device Address Bits"] = port::StrCat(device_address_bits());
   result["Device Memory Size"] =
       port::HumanReadableNumBytes::ToString(device_memory_size());
+  result["Memory Bandwidth"] = port::StrCat(
+      port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
 
   result["Shared Memory Per Core"] =
       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
@@ -231,6 +233,4 @@ uint64 CalculateRegisterLimitForTargetOccupancy(
   return 0;
 }
 
-
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index f2b35bcb4345a37f72541979564cbbb7944595c2..7f99d81ef3bc5e21d5ea225ecfbc5a97bbd01ef5 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 class DeviceDescriptionBuilder;
 }  // namespace internal
@@ -140,6 +139,11 @@ class DeviceDescription {
   // Returns the device memory size in bytes.
   uint64 device_memory_size() const { return device_memory_size_; }
 
+  // Returns the device's memory bandwidth in bytes/sec.  (This is for
+  // reads/writes to/from the device's own memory, not for transfers between the
+  // host and device.)
+  uint64 memory_bandwidth() const { return memory_bandwidth_; }
+
   // Returns the device's core clock rate in GHz.
   float clock_rate_ghz() const { return clock_rate_ghz_; }
 
@@ -212,6 +216,7 @@ class DeviceDescription {
 
   uint64 device_address_bits_;
   uint64 device_memory_size_;
+  uint64 memory_bandwidth_;
 
   // Shared memory limits on a given device.
   uint64 shared_memory_per_core_;
@@ -305,6 +310,9 @@ class DeviceDescriptionBuilder {
   void set_device_memory_size(uint64 value) {
     device_description_->device_memory_size_ = value;
   }
+  void set_memory_bandwidth(uint64 value) {
+    device_description_->memory_bandwidth_ = value;
+  }
 
   void set_shared_memory_per_core(int64 value) {
     device_description_->shared_memory_per_core_ = value;
@@ -379,7 +387,6 @@ uint64 CalculateRegisterLimitForTargetOccupancy(
     const DeviceDescription &device_description, uint64 shared_memory_per_block,
     const ThreadDim &thread_dims, uint64 target_blocks_per_core);
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
diff --git a/tensorflow/stream_executor/device_memory.h b/tensorflow/stream_executor/device_memory.h
index 4c92b7dc78523e894aaf954fc44b72e89891d153..5a5334e0f5f6e8744b92188de14d7fea0f2ff9a0 100644
--- a/tensorflow/stream_executor/device_memory.h
+++ b/tensorflow/stream_executor/device_memory.h
@@ -32,6 +32,16 @@ limitations under the License.
 namespace perftools {
 namespace gputools {
 
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
+namespace stream_executor {
+
 class StreamExecutor;
 
 // void*-analogous device memory allocation. For the typed variation, see
@@ -280,7 +290,6 @@ static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
 static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
 static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/device_options.h b/tensorflow/stream_executor/device_options.h
index 169325e7d127edb83c896e1b18e5bf88641b46b1..2646950f42e986d0c732f212af0c24c403b142e5 100644
--- a/tensorflow/stream_executor/device_options.h
+++ b/tensorflow/stream_executor/device_options.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Indicates a set of options for a device's usage, which generally must be
 // provided at StreamExecutor device-initialization time.
@@ -84,7 +83,6 @@ struct DeviceOptions {
   unsigned flags_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_OPTIONS_H_
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 44144a06139bf8661432cb930e53ba5218aac823..031c82d3f4bfcbcb22d5d1b988cd6fa4fefc45e1 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -15,19 +15,27 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/dnn.h"
 
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace dnn {
 
+uint64 AlgorithmDesc::hash() const {
+  return ::tensorflow::Hash64Combine(algo_, tensor_ops_enabled_);
+}
+
 bool DnnSupport::GetConvolveAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
   return false;
 }
 
+bool DnnSupport::GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms) {
+  return false;
+}
+
 bool DnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
@@ -550,5 +558,4 @@ string NormalizeDescriptor::ToShortString() const {
 }
 
 }  // namespace dnn
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index aa88fe770f3596e5da5e12705c3b706365382134..0c2e083b39d589c2e4b454c333220cfc63ffc218 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <functional>
 #include <limits>
 #include <memory>
+#include <tuple>
 
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -37,8 +38,7 @@ namespace Eigen {
 struct half;
 }  // namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class HostBuffer;
 class Stream;
@@ -712,6 +712,7 @@ class AlgorithmDesc {
     return this->algo_ == other.algo_ &&
            this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
   }
+  uint64 hash() const;
 
  private:
   enum { kDefaultAlgorithm = -1 };
@@ -875,6 +876,22 @@ enum class ElementwiseOperation { kAdd, kMultiply };
 
 string ElementwiseOperationString(ElementwiseOperation op);
 
+// A simple class representing the version of the backing library, to 
+// workaround the "too perfect forwarding" issue in gcc6+ compilers. 
+// See PR#16309 and issue #18402 for links discussing the issue.
+class VersionInfo {
+ public:
+  VersionInfo(int major = 0, int minor = 0, int patch = 0)
+      : major_(major), minor_(minor), patch_(patch) {}
+  int major_version() { return major_; }
+  int minor_version() { return minor_; }
+  int patch() { return patch_; }
+ private:
+  int major_;
+  int minor_;
+  int patch_;
+};
+
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
@@ -885,6 +902,12 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
+  // Gets the version of the backing library, as a VersionInfo object.
+  virtual port::StatusOr<VersionInfo> GetVersion() {
+    return port::UnimplementedError(
+        "DnnSupport::GetVersion not implemented on this platform.");
+  }
+
   // Performs a single-precision forward batch normalization operation onto
   // the stream.
   //
@@ -1172,7 +1195,9 @@ class DnnSupport {
       const DeviceMemory<double>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data) = 0;
+      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result) = 0;
 
   // Enqueues a half-precision convolution operation onto the stream.
   // See DoConvolve above for argument details.
@@ -1193,6 +1218,9 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  // Returns a list of supported rnn algorithms.
+  virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
+
   // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
   // coefficient_scales specifies the scaling of each column of coefficients:
   // original float coefficient[row * num_columns + column] =
@@ -1273,6 +1301,18 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  virtual bool DoConvolveBackwardData(
+      Stream* stream, const FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const BatchDescriptor& input_descriptor,
+      DeviceMemory<double>* backward_input_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      ProfileResult* output_profile_result) = 0;
+
   virtual bool DoConvolveBackwardData(
       Stream* stream, const FilterDescriptor& filter_descriptor,
       const DeviceMemory<Eigen::half>& filter_data,
@@ -1322,6 +1362,18 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  virtual bool DoConvolveBackwardFilter(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<double>& input_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<double>* backward_filter_data,
+      ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      ProfileResult* output_profile_result) = 0;
+
   virtual bool DoConvolveBackwardFilter(
       Stream* stream, const BatchDescriptor& input_descriptor,
       const DeviceMemory<Eigen::half>& input_data,
@@ -1972,9 +2024,10 @@ class DnnSupport {
   //    is no longer in use.
   virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
   createRnnDescriptor(int num_layers, int hidden_size, int input_size,
-                      dnn::RnnInputMode input_mode,
+                      int batch_size, dnn::RnnInputMode input_mode,
                       dnn::RnnDirectionMode direction_mode,
                       dnn::RnnMode rnn_mode, dnn::DataType data_type,
+                      const dnn::AlgorithmConfig& algorithm_config,
                       float dropout, uint64 seed,
                       ScratchAllocator* state_allocator) {
     return port::Status{port::error::UNIMPLEMENTED,
@@ -2050,7 +2103,8 @@ class DnnSupport {
                             DeviceMemory<Eigen::half>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2070,7 +2124,8 @@ class DnnSupport {
                             DeviceMemory<float>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2090,7 +2145,8 @@ class DnnSupport {
                             DeviceMemory<double>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
   // Enqueue a backward operation of the RNN model onto the stream.
@@ -2157,7 +2213,8 @@ class DnnSupport {
       DeviceMemory<Eigen::half>* input_c_backprop_data,
       DeviceMemory<Eigen::half>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2184,7 +2241,8 @@ class DnnSupport {
       DeviceMemory<float>* input_c_backprop_data,
       DeviceMemory<float>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2211,7 +2269,8 @@ class DnnSupport {
       DeviceMemory<double>* input_c_backprop_data,
       DeviceMemory<double>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2242,7 +2301,6 @@ class DnnSupport {
 };
 
 }  // namespace dnn
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 95168836278add5d6592ff0c3d0f7245e6f6bc5b..114143b3abef00e757da3263449454fb1908fd53 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -37,8 +37,7 @@ limitations under the License.
 #include "cuda/cuda_config.h"
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 string GetCudaVersion() { return TF_CUDA_VERSION; }
@@ -291,5 +290,4 @@ static std::vector<string>* CreatePrimordialRpaths() {
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
index 354c7b50b8209755991827b3c36afac790cb952b..9ee081cb3d64e8878fa9d7f0c33da7f6827da620 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // Permits StreamExecutor code to dynamically load a pre-determined set of
@@ -114,7 +113,6 @@ class CachedDsoLoader {
 };
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/event.cc b/tensorflow/stream_executor/event.cc
index c423a453e9f1cb4f1d484427c11eacf959da7a8d..50a6edd80bd39004e32f09bcde36fbc8a8b59ad9 100644
--- a/tensorflow/stream_executor/event.cc
+++ b/tensorflow/stream_executor/event.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 Event::Event(StreamExecutor* stream_exec)
     : stream_exec_(stream_exec),
@@ -48,5 +47,4 @@ Event::Status Event::PollForStatus() {
   return stream_exec_->PollForEventStatus(this);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/event.h b/tensorflow/stream_executor/event.h
index a06c26ea5191d53c7d8ffc1e2b460cb594dd6f4a..1f37262c78d82f72f8818f35db273e87a47bdc1c 100644
--- a/tensorflow/stream_executor/event.h
+++ b/tensorflow/stream_executor/event.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class EventInterface;
@@ -76,7 +75,6 @@ class Event {
   SE_DISALLOW_COPY_AND_ASSIGN(Event);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_EVENT_H_
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index d1a8aae167455a7dc728999fbbaf1a119cf6a101..0b3ad7ebbc905ab1bd7e3298bb560af0e3868656 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
@@ -104,5 +103,4 @@ ExecutorCache::Entry::~Entry() {
   configurations.clear();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/executor_cache.h b/tensorflow/stream_executor/executor_cache.h
index 12f2275f6d8ecde4819d56ab9411c1087b76ddc5..bbeeaed787c27a62e2b0b22a75779b5350187d83 100644
--- a/tensorflow/stream_executor/executor_cache.h
+++ b/tensorflow/stream_executor/executor_cache.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Utility class to allow Platform objects to manage cached StreamExecutors.
 // Thread-safe.
@@ -76,7 +75,6 @@ class ExecutorCache {
   SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_CACHE_H_
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
index 6b1728829abdeb5c4e20534675801a437341d732..814efb2e923cb25833515f86e1b3f6f0bec437da 100644
--- a/tensorflow/stream_executor/fft.h
+++ b/tensorflow/stream_executor/fft.h
@@ -48,8 +48,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -210,7 +209,7 @@ class FftSupport {
 
 // Macro used to quickly declare overrides for abstract virtuals in the
 // fft::FftSupport base class. Assumes that it's emitted somewhere inside the
-// ::perftools::gputools namespace.
+// ::stream_executor namespace.
 #define TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES                   \
   std::unique_ptr<fft::Plan> Create1dPlan(Stream *stream, uint64 num_x,        \
                                           fft::Type type, bool in_place_fft)   \
@@ -265,7 +264,6 @@ class FftSupport {
              DeviceMemory<double> *output) override;
 
 }  // namespace fft
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_FFT_H_
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 542f521ef778c3a69ec9adba74405131e07bcf1a..2c4819651acaa2c6ee99c720b2c3d80e5c2ea1a9 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -28,8 +28,7 @@ limitations under the License.
 
 bool FLAGS_stream_executor_cpu_real_clock_rate = false;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostStream *AsHostStream(Stream *stream) {
@@ -266,5 +265,4 @@ rng::RngSupport *HostExecutor::CreateRng() {
 }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index e2c0e6d6b77130bd190b026f1eaff68d21dbf632..0c3991c151d5bbb84333240aa1ca4f9e1587330d 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // An implementation of StreamExecutor that does no communication or interaction
@@ -210,7 +209,6 @@ class HostExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index 2cb7d3696758365804786800e6a46e8bca7d80e7..a652b08b4fc7e075c9f560baa5c08bcef8dd0d3d 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -26,10 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace gpu = ::perftools::gputools;
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostPlatform::HostPlatform() : name_("Host") {}
@@ -69,8 +66,8 @@ port::StatusOr<StreamExecutor*> HostPlatform::GetExecutor(
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<HostExecutor>(config.plugin_config));
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<HostExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
@@ -93,16 +90,15 @@ void HostPlatform::UnregisterTraceListener(TraceListener* listener) {
 }
 
 static void InitializeHostPlatform() {
-  std::unique_ptr<gpu::Platform> platform(new gpu::host::HostPlatform);
-  SE_CHECK_OK(gpu::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  std::unique_ptr<Platform> platform(new host::HostPlatform);
+  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(
-    host_platform, perftools::gputools::host::InitializeHostPlatform());
+REGISTER_MODULE_INITIALIZER(host_platform,
+                            stream_executor::host::InitializeHostPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
diff --git a/tensorflow/stream_executor/host/host_platform.h b/tensorflow/stream_executor/host/host_platform.h
index 0faec6c8b789766804c262fc5cf29c950fd31d3c..c6f46a2cc4028644237f206f5cd076a49e964702 100644
--- a/tensorflow/stream_executor/host/host_platform.h
+++ b/tensorflow/stream_executor/host/host_platform.h
@@ -33,8 +33,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // Host (CPU) platform plugin, registered as a singleton value via module
@@ -79,7 +78,6 @@ class HostPlatform : public Platform {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_
diff --git a/tensorflow/stream_executor/host/host_platform_id.cc b/tensorflow/stream_executor/host/host_platform_id.cc
index 69a203f2985b67ecd67346e6dc394d961377e035..2256bccec3f4ef73bc667128aeb1fbf90b1dcd90 100644
--- a/tensorflow/stream_executor/host/host_platform_id.cc
+++ b/tensorflow/stream_executor/host/host_platform_id.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/host/host_platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 PLATFORM_DEFINE_ID(kHostPlatformId);
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_platform_id.h b/tensorflow/stream_executor/host/host_platform_id.h
index 61d84ea2e2facae3a18798db15d529109b6dfc93..18d1f282f1f921bc9bb2508225398c98ffeb89bc 100644
--- a/tensorflow/stream_executor/host/host_platform_id.h
+++ b/tensorflow/stream_executor/host/host_platform_id.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // Opaque and unique identifier for the host platform.
@@ -30,7 +29,6 @@ namespace host {
 extern const Platform::Id kHostPlatformId;
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc
index 5961c3151600b778b4cbca7443b5913329368431..5a7d3b3dd49275edd5242c30b38bb4f505042816 100644
--- a/tensorflow/stream_executor/host/host_stream.cc
+++ b/tensorflow/stream_executor/host/host_stream.cc
@@ -17,8 +17,7 @@ limitations under the License.
 // the HostExecutor implementation.
 #include "tensorflow/stream_executor/host/host_stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostStream::HostStream()
@@ -53,5 +52,4 @@ void HostStream::BlockUntilDone() {
 
 }  // namespace host
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h
index 9894d17febcae24089bfe2948eb96bad701978d7..5d7b8a378268c3226a61fa43e738f209e84b30e9 100644
--- a/tensorflow/stream_executor/host/host_stream.h
+++ b/tensorflow/stream_executor/host/host_stream.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 class HostStream : public internal::StreamInterface {
@@ -52,7 +51,6 @@ class HostStream : public internal::StreamInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_
diff --git a/tensorflow/stream_executor/host/host_timer.cc b/tensorflow/stream_executor/host/host_timer.cc
index d84d825c92a6e8e06b004305c9d4cb032883f5a3..e138daf0e11f179c8690306b5cfef3ed95ae4cb2 100644
--- a/tensorflow/stream_executor/host/host_timer.cc
+++ b/tensorflow/stream_executor/host/host_timer.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 using std::chrono::duration_cast;
@@ -46,5 +45,4 @@ void HostTimer::StartNow() { start_time_ = clock::now(); }
 void HostTimer::StopNow() { duration_ = clock::now() - start_time_; }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_timer.h b/tensorflow/stream_executor/host/host_timer.h
index 17af7c0521d2ee956d6f1ee92dd2229b647785bd..5954b8023beb4fe78d37cd41c0379804a8dfa8aa 100644
--- a/tensorflow/stream_executor/host/host_timer.h
+++ b/tensorflow/stream_executor/host/host_timer.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 class HostTimer : public internal::TimerInterface {
@@ -57,7 +56,6 @@ class HostTimer : public internal::TimerInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_
diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h
index 8fa542e9ff8851551d424da6be073d0d9959af67..20299da5172f20b9b73c31b6491806dc57b1d2f0 100644
--- a/tensorflow/stream_executor/host_buffer.h
+++ b/tensorflow/stream_executor/host_buffer.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/dnn.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // A HostBuffer is a block of memory in host memory containing the data for a
 // dnn::BatchDescriptor using a device-dependent memory layout.
@@ -42,7 +41,6 @@ class HostBuffer {
   const dnn::BatchDescriptor descriptor_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9e3e14778384adc81c93b8156ce71bb83ec9909
--- /dev/null
+++ b/tensorflow/stream_executor/host_or_device_scalar.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace stream_executor {
+
+// Allows to represent a value that is either a host scalar or a scalar stored
+// on the GPU device.
+template <typename ElemT>
+class HostOrDeviceScalar {
+ public:
+  // Not marked as explicit because when using this constructor, we usually want
+  // to set this to a compile-time constant.
+  HostOrDeviceScalar(ElemT value) : value_(value), is_pointer_(false) {}
+  explicit HostOrDeviceScalar(const DeviceMemory<ElemT>& pointer)
+      : pointer_(pointer), is_pointer_(true) {
+    CHECK_EQ(1, pointer.ElementCount());
+  }
+
+  bool is_pointer() const { return is_pointer_; }
+  const DeviceMemory<ElemT>& pointer() const {
+    CHECK(is_pointer());
+    return pointer_;
+  }
+  const ElemT& value() const {
+    CHECK(!is_pointer());
+    return value_;
+  }
+
+ private:
+  union {
+    ElemT value_;
+    DeviceMemory<ElemT> pointer_;
+  };
+  bool is_pointer_;
+};
+
+}  // namespace stream_executor
+#endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index 81e531efb31ea7d8d6ac03b56aea6aa5f01d64d1..d1aa596b73da3d7f8cdf4ddd6911d956239ed46d 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -21,13 +21,13 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/demangle.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 bool KernelMetadata::registers_per_thread(int *registers_per_thread) const {
   if (has_registers_per_thread_) {
@@ -96,11 +96,10 @@ static const char *kStubPrefix = "__device_stub_";
 void KernelBase::set_name(port::StringPiece name) {
   name_ = name.ToString();
   port::StringPiece stubless_name = name;
-  if (name.starts_with(kStubPrefix)) {
+  if (tensorflow::str_util::StartsWith(name, kStubPrefix)) {
     stubless_name.remove_prefix(strlen(kStubPrefix));
   }
   demangled_name_ = port::Demangle(stubless_name.data());
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 5358eac1ae070efb2bead75c73208e9d283b498c..2216884b873cda98f09782866f23c06088b73e09 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -64,7 +64,7 @@ limitations under the License.
 //
 // Users typically won't need to type out the TypedKernel signature in full, it
 // will be typedef'd by automatically generated code; for example, see
-// perftools::gputools::executor_sample::VecReduceAddKernel.
+// stream_executor::executor_sample::VecReduceAddKernel.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
@@ -82,8 +82,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class DeviceMemoryBase;
 template <typename ElemT>
@@ -639,8 +638,8 @@ struct KernelInvocationChecker {
   // NOTE: if you encounter an error here, you can see the mismatch by looking
   // at the end of the last error message, which will be of the form:
   //
-  //    ...::Compatible<const perftools::gputools::DeviceMemory<OneThing> &,
-  //                    perftools::gputools::DeviceMemory<AnotherThing>, true,
+  //    ...::Compatible<const stream_executor::DeviceMemory<OneThing> &,
+  //                    stream_executor::DeviceMemory<AnotherThing>, true,
   //                    0>'
   //    requested here
   //
@@ -711,7 +710,6 @@ struct KernelParamsOk<TypedKernel<Params...>, Args...> {
       std::tuple<Params...>, std::tuple<Args...>>::CheckAllNoStaticAssert();
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
diff --git a/tensorflow/stream_executor/kernel_cache_config.h b/tensorflow/stream_executor/kernel_cache_config.h
index 9d7ab1b79f6c94ce2fe64f14f7ebe3a9db44daf4..e63d6c6a0c05e8cd681a4e12f73b2468e170dea5 100644
--- a/tensorflow/stream_executor/kernel_cache_config.h
+++ b/tensorflow/stream_executor/kernel_cache_config.h
@@ -18,8 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // This enum represents potential configurations of L1/shared memory when
 // running a particular kernel. These values represent user preference, and
@@ -38,7 +37,6 @@ enum class KernelCacheConfig {
   kPreferEqual,
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
index 0404c573f0164782bb056b124ef60092c5985a5a..6a1f0a591ff0879821c6636e346dbc20105bcf7e 100644
--- a/tensorflow/stream_executor/kernel_spec.cc
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -15,9 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/kernel_spec.h"
 
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
     : kernelname_(kernelname.ToString()) {}
@@ -247,5 +245,4 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
 
 MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
index 3811bd833e70b567002327cbe348574b37cb1126..7cc23bb4e64b45268f6bb00d9ea9ee4a686a0e25 100644
--- a/tensorflow/stream_executor/kernel_spec.h
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -56,8 +56,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Describes how to load a kernel on a target platform.
 //
@@ -374,7 +373,6 @@ class MultiKernelLoaderSpec {
   size_t arity_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
diff --git a/tensorflow/stream_executor/launch_dim.h b/tensorflow/stream_executor/launch_dim.h
index b95462667eea2c86d92060f3973e660f31d2d2dc..68f2f748407a87ec9cf3bdd411bf96c1a64b5681 100644
--- a/tensorflow/stream_executor/launch_dim.h
+++ b/tensorflow/stream_executor/launch_dim.h
@@ -21,7 +21,7 @@ limitations under the License.
 // a single PC in a unit called a warp. There is a maximum number of threads
 // that can execute in a shared-context entity called a block. Presently, that
 // number is 1024 -- again, something that should not be relied on from this
-// comment, but checked via perftools::gputools::DeviceDescription.
+// comment, but checked via stream_executor::DeviceDescription.
 //
 // For additional information, see
 // http://docs.nvidia.com/cuda/kepler-tuning-guide/#device-utilization-and-occupancy
@@ -40,8 +40,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Basic type that represents a 3-dimensional index space.
 struct Dim3D {
@@ -74,7 +73,6 @@ struct BlockDim : public Dim3D {
   }
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
diff --git a/tensorflow/stream_executor/lib/array_slice.h b/tensorflow/stream_executor/lib/array_slice.h
index bef61bb2fc5611e84bc36320fa98f4df919372b7..8e3c4ca047bd467628551d8b8d351fa3baa956f7 100644
--- a/tensorflow/stream_executor/lib/array_slice.h
+++ b/tensorflow/stream_executor/lib/array_slice.h
@@ -18,14 +18,23 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::gtl::ArraySlice;
 using tensorflow::gtl::MutableArraySlice;
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
diff --git a/tensorflow/stream_executor/lib/casts.h b/tensorflow/stream_executor/lib/casts.h
index 2261944e252f6a757a154e64be0b10484a693125..ec562e804fae51ac09e336b2e03b8ab0d7f1ca0e 100644
--- a/tensorflow/stream_executor/lib/casts.h
+++ b/tensorflow/stream_executor/lib/casts.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
 
 #include <stdlib.h>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // port::bit_cast<Dest,Source> is a template function that implements the
@@ -96,7 +95,6 @@ inline Dest bit_cast(const Source& source) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
diff --git a/tensorflow/stream_executor/lib/demangle.cc b/tensorflow/stream_executor/lib/demangle.cc
index fa2b4fa005ca1a715b0593c7c270c62bcf936d21..adb6b4f2d11e537433ae86ad694d6e1de9171942 100644
--- a/tensorflow/stream_executor/lib/demangle.cc
+++ b/tensorflow/stream_executor/lib/demangle.cc
@@ -27,8 +27,7 @@ limitations under the License.
 #include <cxxabi.h>
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // The API reference of abi::__cxa_demangle() can be found in
@@ -49,5 +48,4 @@ string Demangle(const char *mangled) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/demangle.h b/tensorflow/stream_executor/lib/demangle.h
index 30be522557808e4048c7a1f712d0b4884c1043ba..af16fa7d8cbcce5c17a7b25021546176570558bf 100644
--- a/tensorflow/stream_executor/lib/demangle.h
+++ b/tensorflow/stream_executor/lib/demangle.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Demangle(const char* mangled);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_DEMANGLE_H_
diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h
index c9a22ebd55894f0ddf6654887e55289030871501..776eba04080e9e993b61997bc3e7532202ac4c5b 100644
--- a/tensorflow/stream_executor/lib/env.h
+++ b/tensorflow/stream_executor/lib/env.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::Env;
@@ -37,7 +36,6 @@ inline Status FileExists(const port::StringPiece& filename) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
diff --git a/tensorflow/stream_executor/lib/error.h b/tensorflow/stream_executor/lib/error.h
index 89df70cb5e582508448b6886926b0c8905a14c2e..c659f5fc140d29e17fc0bf11ce707b01b2cb2e74 100644
--- a/tensorflow/stream_executor/lib/error.h
+++ b/tensorflow/stream_executor/lib/error.h
@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"  // IWYU pragma: export
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 namespace error = tensorflow::error;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
diff --git a/tensorflow/stream_executor/lib/human_readable.h b/tensorflow/stream_executor/lib/human_readable.h
index f918c180d98c0fc74d554e3c7e95995e6720d356..893865f6dadd1076a4daef6d820f667b57ab9023 100644
--- a/tensorflow/stream_executor/lib/human_readable.h
+++ b/tensorflow/stream_executor/lib/human_readable.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class HumanReadableNumBytes {
@@ -67,7 +66,6 @@ class HumanReadableNumBytes {
 };
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_
diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h
index 9a09318a6cbd215e396968bc1968be07a3029129..688b0214694478e9be1b1d14e58fda94367f547b 100644
--- a/tensorflow/stream_executor/lib/initialize.h
+++ b/tensorflow/stream_executor/lib/initialize.h
@@ -26,8 +26,7 @@ limitations under the License.
 #undef DECLARE_MODULE_INITIALIZER
 #undef REGISTER_MODULE_INITIALIZER_SEQUENCE
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class Initializer {
@@ -49,20 +48,18 @@ class Initializer {
 };
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-#define REGISTER_INITIALIZER(type, name, body)                               \
-  static void google_init_##type##_##name() { body; }                        \
-  perftools::gputools::port::Initializer google_initializer_##type##_##name( \
+#define REGISTER_INITIALIZER(type, name, body)                             \
+  static void google_init_##type##_##name() { body; }                      \
+  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
       google_init_##type##_##name)
 
 #define REGISTER_MODULE_INITIALIZER(name, body) \
   REGISTER_INITIALIZER(module, name, body)
 
-#define DECLARE_INITIALIZER(type, name)         \
-  extern perftools::gputools::port::Initializer \
-      google_initializer_##type##_##name
+#define DECLARE_INITIALIZER(type, name) \
+  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
 
 #define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
 
diff --git a/tensorflow/stream_executor/lib/inlined_vector.h b/tensorflow/stream_executor/lib/inlined_vector.h
index 55a1e3ad102c2fe25c9b4338a07029330bbf4ef6..40bdddb180f1135ef2fb3bfc51b06d9e68a3aff9 100644
--- a/tensorflow/stream_executor/lib/inlined_vector.h
+++ b/tensorflow/stream_executor/lib/inlined_vector.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::gtl::InlinedVector;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_
diff --git a/tensorflow/stream_executor/lib/mathutil.h b/tensorflow/stream_executor/lib/mathutil.h
index e8310d55ddae20f1a92c4184d408215d7ec8664a..c225dc5f3cc9725be4b01039c10103ed6e62372e 100644
--- a/tensorflow/stream_executor/lib/mathutil.h
+++ b/tensorflow/stream_executor/lib/mathutil.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class MathUtil {
@@ -97,7 +96,6 @@ IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_MATHUTIL_H_
diff --git a/tensorflow/stream_executor/lib/notification.h b/tensorflow/stream_executor/lib/notification.h
index 9bb3e170dc7c13ecaa636887315b4bc5cfc82f2b..472d8c9845c2ae3efbc5ab06234a7c084d044a96 100644
--- a/tensorflow/stream_executor/lib/notification.h
+++ b/tensorflow/stream_executor/lib/notification.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/platform/notification.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::Notification;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
diff --git a/tensorflow/stream_executor/lib/numbers.cc b/tensorflow/stream_executor/lib/numbers.cc
index 11a65e198d60545e3bfb76a8cbf141c4dba1b58e..b670c42ec84a86d49a4938da3fc6387af529e04b 100644
--- a/tensorflow/stream_executor/lib/numbers.cc
+++ b/tensorflow/stream_executor/lib/numbers.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include <stdlib.h>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 bool safe_strto32(const char* str, int32* value) {
@@ -38,5 +37,4 @@ bool safe_strto32(const string& str, int32* value) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/numbers.h b/tensorflow/stream_executor/lib/numbers.h
index 4a8692b7461d513bbac287281b134a4fbac54707..2f48281d2d67f0fa9b4fb523c4986c5dc8acfe33 100644
--- a/tensorflow/stream_executor/lib/numbers.h
+++ b/tensorflow/stream_executor/lib/numbers.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Convert strings to floating point values.
@@ -28,7 +27,6 @@ namespace port {
 bool safe_strto32(const string& str, int32* value);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NUMBERS_H_
diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc
index f2591f47f7bf75004ee00019d70e118cd9d32f0e..56e08c316f95758cd2a05d18d59c06f3d7727959 100644
--- a/tensorflow/stream_executor/lib/path.cc
+++ b/tensorflow/stream_executor/lib/path.cc
@@ -16,8 +16,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 namespace internal {
 
@@ -58,5 +57,4 @@ string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
 
 }  // namespace internal
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/path.h b/tensorflow/stream_executor/lib/path.h
index 93053dbcb6862edbaf93a27a9414ad7e4bb10be2..325f04ff47552e052d81c96bec74e816378b8254 100644
--- a/tensorflow/stream_executor/lib/path.h
+++ b/tensorflow/stream_executor/lib/path.h
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::io::Dirname;
@@ -56,7 +55,6 @@ inline string JoinPath(const T&... args) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
index 3d856187f05b3601525b048d3997088e145bdb81..72d71e62116726d458f709b75e50fcc0947df00e 100644
--- a/tensorflow/stream_executor/lib/process_state.cc
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -25,8 +25,7 @@ limitations under the License.
 #endif
 #include <memory>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Hostname() {
@@ -54,5 +53,4 @@ bool GetCurrentDirectory(string* dir) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/process_state.h b/tensorflow/stream_executor/lib/process_state.h
index 205e726d95ce80216a8c260c270d32b990fd5b78..248218c759e65fc2d6ee09b2f1c9e309567b1048 100644
--- a/tensorflow/stream_executor/lib/process_state.h
+++ b/tensorflow/stream_executor/lib/process_state.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Hostname();
 bool GetCurrentDirectory(string* dir);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PROCESS_STATE_H_
diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h
index 3d5e56faf745e484649079cee916bc26c8b2c449..8f9f420fec75a76c5b52871464fe8fe4df627aa5 100644
--- a/tensorflow/stream_executor/lib/ptr_util.h
+++ b/tensorflow/stream_executor/lib/ptr_util.h
@@ -17,50 +17,22 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
 
 #include <memory>
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace stream_executor {
+using tensorflow::MakeUnique;
+using tensorflow::WrapUnique;
+}  // namespace stream_executor
 
 namespace perftools {
 namespace gputools {
-namespace port {
-
-// Trait to select overloads and return types for MakeUnique.
-template <typename T>
-struct MakeUniqueResult {
-  using scalar = std::unique_ptr<T>;
-};
-template <typename T>
-struct MakeUniqueResult<T[]> {
-  using array = std::unique_ptr<T[]>;
-};
-template <typename T, size_t N>
-struct MakeUniqueResult<T[N]> {
-  using invalid = void;
-};
-
-// MakeUnique<T>(...) is an early implementation of C++14 std::make_unique.
-// It is designed to be 100% compatible with std::make_unique so that the
-// eventual switchover will be a simple renaming operation.
-template <typename T, typename... Args>
-typename MakeUniqueResult<T>::scalar MakeUnique(Args&&... args) {  // NOLINT
-  return std::unique_ptr<T>(
-      new T(std::forward<Args>(args)...));  // NOLINT(build/c++11)
-}
 
-// Overload for array of unknown bound.
-// The allocation of arrays needs to use the array form of new,
-// and cannot take element constructor arguments.
-template <typename T>
-typename MakeUniqueResult<T>::array MakeUnique(size_t n) {
-  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
-}
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(jlebar): Remove this once we've completed
+// the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
 
-// Reject arrays of known bound.
-template <typename T, typename... Args>
-typename MakeUniqueResult<T>::invalid MakeUnique(Args&&... /* args */) =
-    delete;  // NOLINT
-
-}  // namespace port
 }  // namespace gputools
 }  // namespace perftools
 
-
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/stacktrace.h b/tensorflow/stream_executor/lib/stacktrace.h
index ba7e5317f0f4a852c353102d29411752f69b11a9..a15b0f30261f1a102c0f7cf8b28b5fc9b5426e25 100644
--- a/tensorflow/stream_executor/lib/stacktrace.h
+++ b/tensorflow/stream_executor/lib/stacktrace.h
@@ -19,14 +19,12 @@ limitations under the License.
 #include "tensorflow/core/platform/stacktrace.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::CurrentStackTrace;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STACKTRACE_H_
diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h
index 8c289e1927fcf9b49851389e44d3fa09fdfea3ae..407b71b405bc8a73e5aebcd18b043420b074b708 100644
--- a/tensorflow/stream_executor/lib/status.h
+++ b/tensorflow/stream_executor/lib/status.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
@@ -23,15 +23,14 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using Status = tensorflow::Status;
 
 #define SE_CHECK_OK(val) TF_CHECK_OK(val)
 #define SE_ASSERT_OK(val) \
-  ASSERT_EQ(::perftools::gputools::port::Status::OK(), (val))
+  ASSERT_EQ(::stream_executor::port::Status::OK(), (val))
 
 // Define some canonical error helpers.
 inline Status UnimplementedError(StringPiece message) {
@@ -45,6 +44,16 @@ inline Status FailedPreconditionError(StringPiece message) {
 }
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 138738ecab54986fd7d5cd76839d59da55623b1f..dab59096740102b94c0ff63c089b83ce052ea264 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -13,242 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
-//
-// StatusOr<T> is the union of a Status object and a T
-// object. StatusOr models the concept of an object that is either a
-// usable value, or an error Status explaining why such a value is
-// not present. To this end, StatusOr<T> does not allow its Status
-// value to be Status::OK. Further, StatusOr<T*> does not allow the
-// contained pointer to be NULL.
-//
-// The primary use-case for StatusOr<T> is as the return value of a
-// function which may fail.
-//
-// Example client usage for a StatusOr<T>, where T is not a pointer:
-//
-//  StatusOr<float> result = DoBigCalculationThatCouldFail();
-//  if (result.ok()) {
-//    float answer = result.ValueOrDie();
-//    printf("Big calculation yielded: %f", answer);
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<T*>:
-//
-//  StatusOr<Foo*> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<std::unique_ptr<T>>:
-//
-//  StatusOr<std::unique_ptr<Foo>> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo = std::move(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example factory implementation returning StatusOr<T*>:
-//
-//  StatusOr<Foo*> FooFactory::MakeNewFoo(int arg) {
-//    if (arg <= 0) {
-//      return Status(port::error::INVALID_ARGUMENT,
-//                            "Arg must be positive");
-//    } else {
-//      return new Foo(arg);
-//    }
-//  }
-//
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 
-#include <new>
-#include "tensorflow/stream_executor/platform/port.h"
-#include <type_traits>
-#include <utility>
+#include "tensorflow/compiler/xla/statusor.h"
 
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/port.h"
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
-template<typename T>
-class StatusOr {
-  template<typename U> friend class StatusOr;
-
- public:
-  // Construct a new StatusOr with Status::UNKNOWN status
-  StatusOr() : status_(error::UNKNOWN, "") {}
-
-  // Construct a new StatusOr with the given non-ok status. After calling
-  // this constructor, calls to ValueOrDie() is invalid.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return
-  // value, so it is convenient and sensible to be able to do 'return
-  // Status()' when the return type is StatusOr<T>.
-  //
-  // REQUIRES: status != Status::OK.
-  // In optimized builds, passing Status::OK here will have the effect
-  // of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const Status& status);  // NOLINT
-
-  // Construct a new StatusOr with the given value. If T is a plain pointer,
-  // value must not be NULL. After calling this constructor, calls to
-  // ValueOrDie() will succeed, and calls to status() will return OK.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return type
-  // so it is convenient and sensible to be able to do 'return T()'
-  // when the return type is StatusOr<T>.
-  //
-  // REQUIRES: if T is a plain pointer, value != NULL.
-  // In optimized builds, passing a NULL pointer here will have
-  // the effect of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const T& value);  // NOLINT
-
-  // Conversion copy constructor, T must be copy constructible from U
-  template <typename U>
-  StatusOr(const StatusOr<U>& other)  // NOLINT
-      : status_(other.status_),
-        value_(other.value_) {}
-
-  // Conversion assignment operator, T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(const StatusOr<U>& other) {
-    status_ = other.status_;
-    value_ = other.value_;
-    return *this;
-  }
-
-  // Rvalue-reference overloads of the other constructors and assignment
-  // operators, to support move-only types and avoid unnecessary copying.
-  StatusOr(T&& value);  // NOLINT
-
-  // Move conversion operator to avoid unnecessary copy.
-  // T must be assignable from U.
-  // Not marked with explicit so the implicit conversion can happen.
-  template <typename U>
-  StatusOr(StatusOr<U>&& other)  // NOLINT
-      : status_(std::move(other.status_)),
-        value_(std::move(other.value_)) {}
-
-  // Move assignment operator to avoid unnecessary copy.
-  // T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(StatusOr<U>&& other) {
-    status_ = std::move(other.status_);
-    value_ = std::move(other.value_);
-    return *this;
-  }
-
-  // Returns a reference to our status. If this contains a T, then
-  // returns Status::OK.
-  const Status& status() const { return status_; }
-
-  // Returns this->status().ok()
-  bool ok() const { return status_.ok(); }
-
-  // Returns a reference to our current value, requires that this->ok().
-  // If you need to initialize a T object from the stored value,
-  // ConsumeValueOrDie() may be more efficient.
-  const T& ValueOrDie() const;
-  T& ValueOrDie();
-
-  // Returns our current value, requires this->ok(). Use this if
-  // you would otherwise want to say std::move(s.ValueOrDie()), for example
-  // if you need to initialize a T object from the stored value and you don't
-  // need subsequent access to the stored value. It uses T's move constructor,
-  // if it has one, so it will work with move-only types, and will often be
-  // more efficient than ValueOrDie, but may leave the stored value
-  // in an arbitrary valid state.
-  T ConsumeValueOrDie();
-
- private:
-  Status status_;
-  T value_;
-
-  void CheckValueNotNull(const T& value);
-
-  template <typename U>
-  struct IsNull {
-    // For non-pointer U, a reference can never be NULL.
-    static inline bool IsValueNull(const U& t) { return false; }
-  };
-
-  template <typename U>
-  struct IsNull<U*> {
-    static inline bool IsValueNull(const U* t) { return t == NULL; }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Implementation details for StatusOr<T>
-
-template <typename T>
-StatusOr<T>::StatusOr(const T& value)
-    : status_(), value_(value) {
-  CheckValueNotNull(value);
-}
-
-template <typename T>
-const T& StatusOr<T>::ValueOrDie() const {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T& StatusOr<T>::ValueOrDie() {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T StatusOr<T>::ConsumeValueOrDie() {
-  TF_CHECK_OK(status_);
-  return std::move(value_);
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(const Status& status)
-    : status_(status) {
-  assert(!status.ok());
-  if (status.ok()) {
-    status_ =
-        Status(error::INTERNAL,
-               "Status::OK is not a valid constructor argument to StatusOr<T>");
-  }
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(T&& value)
-    : status_() {
-  CheckValueNotNull(value);
-  value_ = std::move(value);
-}
-
+// Use XLA's StatusOr so we don't duplicate code.
 template <typename T>
-void StatusOr<T>::CheckValueNotNull(const T& value) {
-  assert(!IsNull<T>::IsValueNull(value));
-  if (IsNull<T>::IsValueNull(value)) {
-    status_ =
-        Status(error::INTERNAL,
-               "NULL is not a valid constructor argument to StatusOr<T*>");
-  }
-}
+using StatusOr = ::xla::StatusOr<T>;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index 4dd6f3b0ccf112b281dd50467e9a16a672dbbbfb..a81c6668184c158b61eff4d4b77bb7ad07165c8e 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::str_util::Join;
@@ -29,7 +28,7 @@ using tensorflow::str_util::Split;
 // Returns a copy of the input string 'str' with the given 'suffix'
 // removed. If the suffix doesn't match, returns a copy of the original string.
 inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix) {
-  if (str.ends_with(suffix)) {
+  if (tensorflow::str_util::EndsWith(str, suffix)) {
     str.remove_suffix(suffix.size());
   }
   return str.ToString();
@@ -38,7 +37,6 @@ inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix)
 using tensorflow::str_util::Lowercase;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/strcat.h b/tensorflow/stream_executor/lib/strcat.h
index 424cb75f0e805a840c5043a42e7cbf3f9c7d195c..c959e4df5b2d6e3954e2edf7bbafc9efb8259dc6 100644
--- a/tensorflow/stream_executor/lib/strcat.h
+++ b/tensorflow/stream_executor/lib/strcat.h
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
 
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::strings::StrCat;
 using tensorflow::strings::StrAppend;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
diff --git a/tensorflow/stream_executor/lib/stringpiece.h b/tensorflow/stream_executor/lib/stringpiece.h
index 97ee0c9206480202db887018c21c1e41482a04e9..b80de5df306117e315b282e53ec871d60bd6d517 100644
--- a/tensorflow/stream_executor/lib/stringpiece.h
+++ b/tensorflow/stream_executor/lib/stringpiece.h
@@ -19,14 +19,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::StringPiece;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_
diff --git a/tensorflow/stream_executor/lib/stringprintf.h b/tensorflow/stream_executor/lib/stringprintf.h
index 504de25a681c4ec880c04a46aad89ddeeb622c71..2f65ed9c6a86516cdb6f8aeaaf9c48b3622cb613 100644
--- a/tensorflow/stream_executor/lib/stringprintf.h
+++ b/tensorflow/stream_executor/lib/stringprintf.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::strings::Printf;
 using tensorflow::strings::Appendf;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_
diff --git a/tensorflow/stream_executor/lib/thread_options.h b/tensorflow/stream_executor/lib/thread_options.h
index bd7f63714e25ec88b6d54bc683fe96a313589485..079cf757acd98d994b1351423205ef99a64478f2 100644
--- a/tensorflow/stream_executor/lib/thread_options.h
+++ b/tensorflow/stream_executor/lib/thread_options.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/platform/env.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::ThreadOptions;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_THREAD_OPTIONS_H_
diff --git a/tensorflow/stream_executor/lib/threadpool.h b/tensorflow/stream_executor/lib/threadpool.h
index 35630c5106a1b732d1100c416b2abc7fe6404c55..220068ade11b98e2442cdc1c68b2aaeb422ae6b3 100644
--- a/tensorflow/stream_executor/lib/threadpool.h
+++ b/tensorflow/stream_executor/lib/threadpool.h
@@ -21,14 +21,12 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/thread_options.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::thread::ThreadPool;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_THREADPOOL_H_
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index f23224ae772b9c5915426feaef1155fc9711f075..5b51398d8cab5df7c7514bc3bedf87f5c33c6e5a 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -20,14 +20,39 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
+
+/* static */ mutex MultiPlatformManager::platforms_mutex_{LINKER_INITIALIZED};
+
+/* static */ port::StatusOr<Platform*> MultiPlatformManager::LookupByNameLocked(
+    const string& target) {
+  PlatformMap* platform_map = GetPlatformMap();
+  auto it = platform_map->find(port::Lowercase(target));
+  if (it == platform_map->end()) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        "could not find registered platform with name: \"" + target + "\"");
+  }
+  return it->second;
+}
+
+/* static */ port::StatusOr<Platform*> MultiPlatformManager::LookupByIdLocked(
+    const Platform::Id& id) {
+  PlatformIdMap* platform_map = GetPlatformByIdMap();
+  auto it = platform_map->find(id);
+  if (it == platform_map->end()) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::Printf("could not find registered platform with id: 0x%p", id));
+  }
+  return it->second;
+}
 
 /* static */ port::Status MultiPlatformManager::RegisterPlatform(
     std::unique_ptr<Platform> platform) {
   CHECK(platform != nullptr);
   string key = port::Lowercase(platform->Name());
-  mutex_lock lock(GetPlatformsMutex());
+  mutex_lock lock(platforms_mutex_);
   if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) {
     return port::Status(port::error::INTERNAL,
                         "platform is already registered with name: \"" +
@@ -45,39 +70,68 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
     const string& target) {
-  tf_shared_lock lock(GetPlatformsMutex());
-  auto it = GetPlatformMap()->find(port::Lowercase(target));
+  mutex_lock lock(platforms_mutex_);
 
-  if (it == GetPlatformMap()->end()) {
-    return port::Status(
-        port::error::NOT_FOUND,
-        "could not find registered platform with name: \"" + target + "\"");
+  SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target));
+  if (!platform->Initialized()) {
+    SE_RETURN_IF_ERROR(platform->Initialize({}));
   }
 
-  return it->second;
+  return platform;
 }
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
     const Platform::Id& id) {
-  tf_shared_lock lock(GetPlatformsMutex());
-  auto it = GetPlatformByIdMap()->find(id);
-  if (it == GetPlatformByIdMap()->end()) {
+  mutex_lock lock(platforms_mutex_);
+
+  SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id));
+  if (!platform->Initialized()) {
+    SE_RETURN_IF_ERROR(platform->Initialize({}));
+  }
+
+  return platform;
+}
+
+/* static */ port::StatusOr<Platform*>
+MultiPlatformManager::InitializePlatformWithName(
+    const string& target, const std::map<string, string>& options) {
+  mutex_lock lock(platforms_mutex_);
+
+  SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target));
+  if (platform->Initialized()) {
+    return port::Status(port::error::FAILED_PRECONDITION,
+                        "platform \"" + target + "\" is already initialized");
+  }
+
+  SE_RETURN_IF_ERROR(platform->Initialize(options));
+
+  return platform;
+}
+
+/* static */ port::StatusOr<Platform*>
+MultiPlatformManager::InitializePlatformWithId(
+    const Platform::Id& id, const std::map<string, string>& options) {
+  mutex_lock lock(platforms_mutex_);
+
+  SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id));
+  if (platform->Initialized()) {
     return port::Status(
-        port::error::NOT_FOUND,
-        port::Printf("could not find registered platform with id: 0x%p", id));
+        port::error::FAILED_PRECONDITION,
+        port::Printf("platform with id 0x%p is already initialized", id));
   }
 
-  return it->second;
+  SE_RETURN_IF_ERROR(platform->Initialize(options));
+
+  return platform;
 }
 
 /* static */ void MultiPlatformManager::ClearPlatformRegistry() {
-  mutex_lock lock(GetPlatformsMutex());
+  mutex_lock lock(platforms_mutex_);
   GetPlatformMap()->clear();
   GetPlatformByIdMap()->clear();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(
     multi_platform_manager,
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index ea6155b4826439b98262530e70e6463eb1fda237..7e316879ca0cf9c2a97ee37c556e0f0d9b83e5fa 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -22,14 +22,14 @@ limitations under the License.
 // In your BUILD rule, add a dependency on a platform plugin that you'd like
 // to use, such as:
 //
-//   //perftools/gputools/executor/cuda:cuda_platform
-//   //perftools/gputools/executor/opencl:opencl_platform
+//   //third_party/tensorflow/stream_executor/cuda:cuda_platform
+//   //third_party/tensorflow/stream_executor/opencl:opencl_platform
 //
 // This will register platform plugins that can be discovered via this
 // interface. Sample API usage:
 //
 //   port::StatusOr<Platform*> platform_status =
-//      gpu::MultiPlatformManager::PlatformWithName("OpenCL");
+//      se::MultiPlatformManager::PlatformWithName("OpenCL");
 //   if (!platform_status.ok()) { ... }
 //   Platform* platform = platform_status.ValueOrDie();
 //   LOG(INFO) << platform->VisibleDeviceCount() << " devices visible";
@@ -56,10 +56,10 @@ limitations under the License.
 // And similarly, for standard interfaces (BLAS, RNG, etc.) you can add
 // dependencies on support libraries, e.g.:
 //
-//    //perftools/gputools/executor/cuda:pluton_blas_plugin
-//    //perftools/gputools/executor/cuda:cudnn_plugin
-//    //perftools/gputools/executor/cuda:cublas_plugin
-//    //perftools/gputools/executor/cuda:curand_plugin
+//    //third_party/tensorflow/stream_executor/cuda:pluton_blas_plugin
+//    //third_party/tensorflow/stream_executor/cuda:cudnn_plugin
+//    //third_party/tensorflow/stream_executor/cuda:cublas_plugin
+//    //third_party/tensorflow/stream_executor/cuda:curand_plugin
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
@@ -67,16 +67,15 @@ limitations under the License.
 #include <functional>
 #include <map>
 #include <memory>
-#include "tensorflow/stream_executor/platform/port.h"
 
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Manages multiple platforms that may be present on the current machine.
 class MultiPlatformManager {
@@ -85,26 +84,43 @@ class MultiPlatformManager {
   // already registered. The associated listener, if not null, will be used to
   // trace events for ALL executors for that platform.
   // Takes ownership of listener.
-  static port::Status RegisterPlatform(std::unique_ptr<Platform> platform);
+  static port::Status RegisterPlatform(std::unique_ptr<Platform> platform)
+      LOCKS_EXCLUDED(platforms_mutex_);
 
-  // Retrieves the platform registered with the given platform name; e.g.
-  // "CUDA", "OpenCL", ...
+  // Retrieves the platform registered with the given platform name (e.g.
+  // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the
+  // Platform's Id() method).
+  //
+  // If the platform has not already been initialized, it will be initialized
+  // with a default set of parameters.
   //
   // If the requested platform is not registered, an error status is returned.
   // Ownership of the platform is NOT transferred to the caller --
   // the MultiPlatformManager owns the platforms in a singleton-like fashion.
-  static port::StatusOr<Platform*> PlatformWithName(const string& target);
-
-  // Retrieves the platform registered with the given platform ID, which
-  // is an opaque (but comparable) value.
+  static port::StatusOr<Platform*> PlatformWithName(const string& target)
+      LOCKS_EXCLUDED(platforms_mutex_);
+  static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id)
+      LOCKS_EXCLUDED(platforms_mutex_);
+
+  // Retrieves the platform registered with the given platform name (e.g.
+  // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the
+  // Platform's Id() method).
+  //
+  // The platform will be initialized with the given options. If the platform
+  // was already initialized, an error will be returned.
   //
   // If the requested platform is not registered, an error status is returned.
   // Ownership of the platform is NOT transferred to the caller --
   // the MultiPlatformManager owns the platforms in a singleton-like fashion.
-  static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id);
+  static port::StatusOr<Platform*> InitializePlatformWithName(
+      const string& target, const std::map<string, string>& options)
+      LOCKS_EXCLUDED(platforms_mutex_);
+  static port::StatusOr<Platform*> InitializePlatformWithId(
+      const Platform::Id& id, const std::map<string, string>& options)
+      LOCKS_EXCLUDED(platforms_mutex_);
 
   // Clears the set of registered platforms, primarily used for testing.
-  static void ClearPlatformRegistry();
+  static void ClearPlatformRegistry() LOCKS_EXCLUDED(platforms_mutex_);
 
   // Although the MultiPlatformManager "owns" its platforms, it holds them as
   // undecorated pointers to prevent races during program exit (between this
@@ -122,17 +138,16 @@ class MultiPlatformManager {
 
   // Provides access to the available set of platforms under a lock.
   static port::Status WithPlatforms(
-      std::function<port::Status(PlatformMap*)> callback) {
-    mutex_lock lock(GetPlatformsMutex());
+      std::function<port::Status(PlatformMap*)> callback)
+      LOCKS_EXCLUDED(platforms_mutex_) {
+    mutex_lock lock(platforms_mutex_);
     return callback(GetPlatformMap());
   }
 
  private:
-  // mutex that guards the platform map.
-  static mutex& GetPlatformsMutex() {
-    static mutex* platforms_mutex = new mutex;
-    return *platforms_mutex;
-  }
+  using PlatformIdMap = std::map<Platform::Id, Platform*>;
+
+  static mutex platforms_mutex_;
 
   // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely.
   // TODO(b/22689637): Move this (whatever the final/"official" map is) to
@@ -147,16 +162,24 @@ class MultiPlatformManager {
 
   // Holds a Platform::Id-to-object mapping.
   // Unlike platforms_ above, this map does not own its contents.
-  static std::map<Platform::Id, Platform*>* GetPlatformByIdMap() {
-    using PlatformIdMap = std::map<Platform::Id, Platform*>;
+  static PlatformIdMap* GetPlatformByIdMap() {
     static PlatformIdMap* instance = new PlatformIdMap;
     return instance;
   }
 
+  // Looks up the platform object with the given name.  Assumes the Platforms
+  // mutex is held.
+  static port::StatusOr<Platform*> LookupByNameLocked(const string& target)
+      EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_);
+
+  // Looks up the platform object with the given id.  Assumes the Platforms
+  // mutex is held.
+  static port::StatusOr<Platform*> LookupByIdLocked(const Platform::Id& id)
+      EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_);
+
   SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index 93f08d06dae862f24b5b533395af63139f344f77..777abced8634410440fd5c2772a3b981a1489dd6 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 string PlatformKindString(PlatformKind kind) {
   switch (kind) {
@@ -85,6 +84,17 @@ StreamExecutorConfig::StreamExecutorConfig(int ordinal_in)
 
 Platform::~Platform() {}
 
+bool Platform::Initialized() const { return true; }
+
+port::Status Platform::Initialize(
+    const std::map<string, string> &platform_options) {
+  if (!platform_options.empty()) {
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "this platform does not support custom initialization");
+  }
+  return port::Status::OK();
+}
+
 port::Status Platform::ForceExecutorShutdown() {
   return port::Status(port::error::UNIMPLEMENTED,
                       "executor shutdown is not supported on this platform");
@@ -124,5 +134,4 @@ port::Status Platform::EnablePeerAccess() {
   return port::Status::OK();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index f0a0e60e02f951018b39ef831cd2f7dd3256f87d..5cb7047b6f39483f237b5bb249906d9ce8a06b9e 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -29,8 +29,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class StreamExecutor;
 
@@ -106,11 +105,14 @@ class Platform {
   namespace {                           \
   int plugin_id_value;                  \
   }                                     \
-  const perftools::gputools::Platform::Id ID_VAR_NAME = &plugin_id_value;
+  const ::stream_executor::Platform::Id ID_VAR_NAME = &plugin_id_value;
 
   // Returns a key uniquely identifying this platform.
   virtual Id id() const = 0;
 
+  // Name of this platform.
+  virtual const string& Name() const = 0;
+
   // Returns the number of devices accessible on this platform.
   //
   // Note that, though these devices are visible, if there is only one userspace
@@ -118,8 +120,17 @@ class Platform {
   // device, a call to ExecutorForDevice may return an error status.
   virtual int VisibleDeviceCount() const = 0;
 
-  // Name of this platform.
-  virtual const string& Name() const = 0;
+  // Returns true iff the platform has been initialized.
+  virtual bool Initialized() const;
+
+  // Initializes the platform with a custom set of options. The platform must be
+  // initialized before obtaining StreamExecutor objects.  The interpretation of
+  // the platform_options argument is implementation specific.  This method may
+  // return an error if unrecognized options are provided.  If using
+  // MultiPlatformManager, this method will be called automatically by
+  // InitializePlatformWithId/InitializePlatformWithName.
+  virtual port::Status Initialize(
+      const std::map<string, string>& platform_options);
 
   // Returns a device with the given ordinal on this platform with a default
   // plugin configuration or, if none can be found with the given ordinal or
@@ -156,6 +167,8 @@ class Platform {
   // This is only useful on platforms which bind a device to a single process
   // that has obtained the device context. May return UNIMPLEMENTED on platforms
   // that have no reason to destroy device contexts.
+  //
+  // The platform must be reinitialized after this is called.
   virtual port::Status ForceExecutorShutdown();
 
   // Registers a TraceListener to listen to all StreamExecutors for this
@@ -191,7 +204,6 @@ class Platform {
   SE_DISALLOW_COPY_AND_ASSIGN(Platform);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_H_
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index 62de0cbce0b534020166c47a64c10c595fb3d5ae..c9f5a7c609e5bbe59ea456e30d575b991aa37b65 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 #undef mutex_lock
 #undef tf_shared_lock
@@ -35,7 +34,6 @@ using tensorflow::tf_shared_lock;
 #define tf_shared_lock(x) \
   static_assert(0, "tf_shared_lock_decl_missing_var_name");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 6603df487878e62271a144b14d78518044c66c81..57ad965ef1101d7d86a4f9cf0a52016e2b3ae713 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 using tensorflow::int8;
 using tensorflow::int16;
@@ -39,19 +38,12 @@ using tensorflow::uint64;
 using std::string;
 #endif
 
-#if !defined(COMPILER_MSVC)
-#define ARRAYSIZE(a)              \
-    ((sizeof(a) / sizeof(*(a))) / \
-    static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 using tensorflow::LinkerInitialized;
 using tensorflow::LINKER_INITIALIZED;
 
 #define SE_FALLTHROUGH_INTENDED TF_FALLTHROUGH_INTENDED
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #define SE_DISALLOW_COPY_AND_ASSIGN TF_DISALLOW_COPY_AND_ASSIGN
 #define SE_MUST_USE_RESULT TF_MUST_USE_RESULT
diff --git a/tensorflow/stream_executor/plugin.cc b/tensorflow/stream_executor/plugin.cc
index 6424658e22f930a7adfa4dd7fe330308e62e00cf..cfbc52ff17b3a699e97d073d0d0c77bba811db72 100644
--- a/tensorflow/stream_executor/plugin.cc
+++ b/tensorflow/stream_executor/plugin.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/plugin.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Mostly-arbitrary ID only used as a sentinel "not otherwise initialized"
 // value. This value should never [need to] be specified aside by initialization
@@ -51,5 +50,4 @@ PluginConfig& PluginConfig::SetRng(PluginId rng) {
   return *this;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin.h b/tensorflow/stream_executor/plugin.h
index 0b88b86e2b1cf8cbd3dddfa5ca3ae9cdc779a952..0505412e7ac412dda8cda1c5c4589148241c5b38 100644
--- a/tensorflow/stream_executor/plugin.h
+++ b/tensorflow/stream_executor/plugin.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // A plugin ID is a unique identifier for each registered plugin type.
 typedef void* PluginId;
@@ -83,7 +82,6 @@ class PluginConfig {
   PluginId blas_, dnn_, fft_, rng_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
diff --git a/tensorflow/stream_executor/plugin_registry.cc b/tensorflow/stream_executor/plugin_registry.cc
index 54761139eaf3ac5e2edb5ccc45dabe56fa1384d4..7812703efd8b4f143c195225c2c12502378bf5f2 100644
--- a/tensorflow/stream_executor/plugin_registry.cc
+++ b/tensorflow/stream_executor/plugin_registry.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 const PluginId kNullPlugin = nullptr;
 
@@ -244,5 +243,4 @@ EMIT_PLUGIN_SPECIALIZATIONS(DnnFactory, dnn, "DNN");
 EMIT_PLUGIN_SPECIALIZATIONS(FftFactory, fft, "FFT");
 EMIT_PLUGIN_SPECIALIZATIONS(RngFactory, rng, "RNG");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin_registry.h b/tensorflow/stream_executor/plugin_registry.h
index 8636a49ce6861d24eb5567bc8d8b268b840a310f..49628ecd246e1a785ef3583f5a38567f553352f5 100644
--- a/tensorflow/stream_executor/plugin_registry.h
+++ b/tensorflow/stream_executor/plugin_registry.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/rng.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class StreamExecutorInterface;
@@ -160,7 +159,6 @@ class PluginRegistry {
   SE_DISALLOW_COPY_AND_ASSIGN(PluginRegistry);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_REGISTRY_H_
diff --git a/tensorflow/stream_executor/rng.cc b/tensorflow/stream_executor/rng.cc
index 1c05005067ce7341994aa7b8f1cb9c4df734ce2c..b0efad91084b7fbfed70ecca9bf879810de2ec1d 100644
--- a/tensorflow/stream_executor/rng.cc
+++ b/tensorflow/stream_executor/rng.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace rng {
 
 bool RngSupport::CheckSeed(const uint8 *seed, uint64 seed_bytes) {
@@ -47,5 +46,4 @@ const int RngSupport::kMaxSeedBytes;
 #endif
 
 }  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rng.h b/tensorflow/stream_executor/rng.h
index 36d0fdd454f9a98b749fb2a8f9ceb14eec9d0fe1..acbf8fce4caa8cbbc9b2288725ea7bf07ee84af4 100644
--- a/tensorflow/stream_executor/rng.h
+++ b/tensorflow/stream_executor/rng.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -89,7 +88,6 @@ class RngSupport {
 };
 
 }  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_RNG_H_
diff --git a/tensorflow/stream_executor/scratch_allocator.cc b/tensorflow/stream_executor/scratch_allocator.cc
index 0c1db414f2e9617070c8c12e2a334ed7977ae9f1..8fc4c4c509cfd35c8a733b01a8d85965fe524def 100644
--- a/tensorflow/stream_executor/scratch_allocator.cc
+++ b/tensorflow/stream_executor/scratch_allocator.cc
@@ -18,8 +18,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 ScratchAllocator::~ScratchAllocator() {}
 
@@ -38,5 +37,4 @@ port::StatusOr<DeviceMemory<uint8>> OneTimeScratchAllocator::AllocateBytes(
   return temporary_->device_memory();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/scratch_allocator.h b/tensorflow/stream_executor/scratch_allocator.h
index 94d5ede161323ff39d7c75589958026b66f22cc5..2aed2c4437381bd29750aae1120a8783b61bfb9f 100644
--- a/tensorflow/stream_executor/scratch_allocator.h
+++ b/tensorflow/stream_executor/scratch_allocator.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -77,7 +76,6 @@ class OneTimeScratchAllocator : public ScratchAllocator {
   SE_DISALLOW_COPY_AND_ASSIGN(OneTimeScratchAllocator);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_SCRATCH_ALLOCATOR_H_
diff --git a/tensorflow/stream_executor/shared_memory_config.h b/tensorflow/stream_executor/shared_memory_config.h
index de556cb73404d5d919b779e5d712f465ff2bc821..7cbeb3bcd91816ece725afecc2e85e7f200c6045 100644
--- a/tensorflow/stream_executor/shared_memory_config.h
+++ b/tensorflow/stream_executor/shared_memory_config.h
@@ -19,8 +19,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // SharedMemoryConfig enum describes potential widths of shared memory banks for
 // a device or kernel.
@@ -30,7 +29,6 @@ enum class SharedMemoryConfig {
   kEightByte,  // Sets shared memory banks to be eight bytes wide.
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index ba5001e273632c893b05eea64542f1b156e28c47..093f0c9306590a1e8faf0ab97621941b6fc2d759 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_buffer.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -27,8 +29,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace {
 // Code to turn parameters to functions on stream into strings that
@@ -117,7 +118,9 @@ string ToVlogString(const DeviceMemoryBase *memory) {
   return ToVlogString(*memory);
 }
 
-string ToVlogString(const Eigen::half &h) { return port::StrCat(h); }
+string ToVlogString(const Eigen::half &h) {
+  return port::StrCat(static_cast<float>(h));
+}
 
 string ToVlogString(int i) { return port::StrCat(i); }
 
@@ -131,6 +134,14 @@ string ToVlogString(float f) { return port::StrCat(f); }
 
 string ToVlogString(double d) { return port::StrCat(d); }
 
+template <typename T>
+string ToVlogString(const HostOrDeviceScalar<T> &memory_or_constant) {
+  if (memory_or_constant.is_pointer()) {
+    return ToVlogString(memory_or_constant.pointer());
+  }
+  return ToVlogString(memory_or_constant.value());
+}
+
 template <class T>
 string ToVlogString(port::ArraySlice<T> elements) {
   string str = port::StrCat(
@@ -681,6 +692,37 @@ Stream &Stream::ThenFusedConvolveWithAlgorithm(
   return *this;
 }
 
+Stream &Stream::ThenConvolveWithAlgorithm(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<double> &input_data,
+    const dnn::FilterDescriptor &filter_descriptor,
+    const DeviceMemory<double> &filter_data,
+    const dnn::ConvolutionDescriptor &convolution_descriptor,
+    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<double> *output,
+    ScratchAllocator *scratch_allocator,
+    const dnn::AlgorithmConfig &algorithm_config,
+    dnn::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(convolution_descriptor), PARAM(output_descriptor),
+            PARAM(output), PARAM(algorithm_config));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      auto status = dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output, scratch_allocator,
+          algorithm_config, output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenConvolveWithAlgorithm(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<float> &input_data,
@@ -890,6 +932,39 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
   return *this;
 }
 
+Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
+    const dnn::FilterDescriptor &filter_descriptor,
+    const DeviceMemory<double> &filter_data,
+    const dnn::BatchDescriptor &output_descriptor,
+    DeviceMemory<double> backward_output_data,
+    const dnn::ConvolutionDescriptor &convolution_descriptor,
+    const dnn::BatchDescriptor &input_descriptor,
+    DeviceMemory<double> *backward_input_data,
+    ScratchAllocator *scratch_allocator,
+    const dnn::AlgorithmConfig &algorithm_config,
+    dnn::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(output_descriptor), PARAM(backward_output_data),
+            PARAM(convolution_descriptor), PARAM(input_descriptor),
+            PARAM(backward_input_data));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      auto status = dnn->DoConvolveBackwardData(
+          this, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, scratch_allocator, algorithm_config,
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<float> &filter_data,
@@ -1026,6 +1101,39 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
   return *this;
 }
 
+Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<double> &input_data,
+    const dnn::BatchDescriptor &output_descriptor,
+    DeviceMemory<double> backward_output_data,
+    const dnn::ConvolutionDescriptor &convolution_descriptor,
+    const dnn::FilterDescriptor &filter_descriptor,
+    DeviceMemory<double> *backward_filter_data,
+    ScratchAllocator *scratch_allocator,
+    const dnn::AlgorithmConfig &algorithm_config,
+    dnn::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(output_descriptor), PARAM(backward_output_data),
+            PARAM(convolution_descriptor), PARAM(filter_descriptor),
+            PARAM(backward_filter_data));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      auto status = dnn->DoConvolveBackwardFilter(
+          this, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, scratch_allocator, algorithm_config,
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<float> &input_data,
@@ -3783,22 +3891,23 @@ Stream &Stream::ThenBlasGemmWithProfiling(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a,
-    int lda, const DeviceMemory<Eigen::half> &b, int ldb,
-    const Eigen::half &beta, DeviceMemory<Eigen::half> *c, int ldc,
-    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
-    blas::ProfileResult *output_profile_result) {
+    uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+    const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
             PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
-                          uint64, const Eigen::half &,
-                          const DeviceMemory<Eigen::half> &, int,
-                          const DeviceMemory<Eigen::half> &, int,
-                          const Eigen::half &, DeviceMemory<Eigen::half> *, int,
-                          blas::ComputationType, blas::AlgorithmType>
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<Eigen::half> &,
+      const DeviceMemory<Eigen::half> &, int, const DeviceMemory<Eigen::half> &,
+      int, const HostOrDeviceScalar<Eigen::half> &, DeviceMemory<Eigen::half> *,
+      int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
@@ -3807,18 +3916,20 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int> *c,
-    int ldc, blas::ComputationType computation_type,
-    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+    uint64 k, const HostOrDeviceScalar<int> &alpha, const DeviceMemory<int8> &a,
+    int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
             PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
   ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64, int,
-      const DeviceMemory<int8> &, int, const DeviceMemory<int8> &, int, int,
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<int> &, const DeviceMemory<int8> &, int,
+      const DeviceMemory<int8> &, int, const HostOrDeviceScalar<int> &,
       DeviceMemory<int> *, int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
@@ -3828,8 +3939,9 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
-    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
     int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -3838,8 +3950,9 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(algorithm));
 
   ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
-      const DeviceMemory<float> &, int, const DeviceMemory<float> &, int, float,
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<float> &, const DeviceMemory<float> &, int,
+      const DeviceMemory<float> &, int, const HostOrDeviceScalar<float> &,
       DeviceMemory<float> *, int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
@@ -3849,32 +3962,35 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-    const DeviceMemory<double> &b, int ldb, double beta,
-    DeviceMemory<double> *c, int ldc, blas::ComputationType computation_type,
+    uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
             PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
-                          uint64, double, const DeviceMemory<double> &, int,
-                          const DeviceMemory<double> &, int, double,
-                          DeviceMemory<double> *, int, blas::ComputationType,
-                          blas::AlgorithmType>
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<double> &, const DeviceMemory<double> &, int,
+      const DeviceMemory<double> &, int, const HostOrDeviceScalar<double> &,
+      DeviceMemory<double> *, int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
-              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
+              m, n, k, HostOrDeviceScalar<double>(alpha), a, lda, b, ldb,
+              HostOrDeviceScalar<double>(beta), c, ldc, computation_type,
               algorithm, output_profile_result);
 }
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, std::complex<float> alpha,
+    uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
     const DeviceMemory<std::complex<float>> &a, int lda,
     const DeviceMemory<std::complex<float>> &b, int ldb,
-    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -3882,12 +3998,14 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
-      std::complex<float>, const DeviceMemory<std::complex<float>> &, int,
-      const DeviceMemory<std::complex<float>> &, int, std::complex<float>,
-      DeviceMemory<std::complex<float>> *, int, blas::ComputationType,
-      blas::AlgorithmType>
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64,
+                          const HostOrDeviceScalar<std::complex<float>> &,
+                          const DeviceMemory<std::complex<float>> &, int,
+                          const DeviceMemory<std::complex<float>> &, int,
+                          const HostOrDeviceScalar<std::complex<float>> &,
+                          DeviceMemory<std::complex<float>> *, int,
+                          blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
@@ -3896,10 +4014,11 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, std::complex<double> alpha,
+    uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
     const DeviceMemory<std::complex<double>> &a, int lda,
     const DeviceMemory<std::complex<double>> &b, int ldb,
-    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -3907,12 +4026,14 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
-      std::complex<double>, const DeviceMemory<std::complex<double>> &, int,
-      const DeviceMemory<std::complex<double>> &, int, std::complex<double>,
-      DeviceMemory<std::complex<double>> *, int, blas::ComputationType,
-      blas::AlgorithmType>
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64,
+                          const HostOrDeviceScalar<std::complex<double>> &,
+                          const DeviceMemory<std::complex<double>> &, int,
+                          const DeviceMemory<std::complex<double>> &, int,
+                          const HostOrDeviceScalar<std::complex<double>> &,
+                          DeviceMemory<std::complex<double>> *, int,
+                          blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
@@ -4695,7 +4816,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<Eigen::half> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4703,7 +4825,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4727,7 +4850,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<float> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4735,7 +4859,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4760,7 +4885,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<double> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4768,7 +4894,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4800,7 +4927,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<Eigen::half> *input_c_backprop_data,
     DeviceMemory<Eigen::half> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4810,7 +4938,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4841,7 +4970,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<float> *input_c_backprop_data,
     DeviceMemory<float> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4851,7 +4981,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4883,7 +5014,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<double> *input_c_backprop_data,
     DeviceMemory<double> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4893,7 +5025,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4923,12 +5056,6 @@ Stream &Stream::ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
   return *this;
 }
 
-Stream &Stream::ThenDoHostCallbackForTest(std::function<void()> callback) {
-  VLOG_CALL(PARAM(callback));
-
-  return ThenDoHostCallback(callback);
-}
-
 Stream &Stream::ThenDoHostCallback(std::function<void()> callback) {
   VLOG_CALL(PARAM(callback));
 
@@ -5086,5 +5213,4 @@ port::Status Stream::BlockHostUntilDone() {
   return first_error;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index a2fb2ea2375d0f245ae3bf3ccb04803d01663def..3d1b011c570a62243d45d38d3d24b371e4382c33 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/fft.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/kernel.h"
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -38,8 +39,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_memory_manager.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace host {
 class HostBlas;
@@ -358,6 +358,17 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
 
+  Stream &ThenConvolveWithAlgorithm(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<double> &input_data,
+      const dnn::FilterDescriptor &filter_descriptor,
+      const DeviceMemory<double> &filter_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<double> *output, ScratchAllocator *scratch_allocator,
+      const dnn::AlgorithmConfig &algorithm_config,
+      dnn::ProfileResult *output_profile_result);
+
   Stream &ThenConvolveWithAlgorithm(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<float> &input_data,
@@ -476,6 +487,18 @@ class Stream {
       DeviceMemory<Eigen::half> *backward_input_data,
       ScratchAllocator *scratch_allocator);
 
+  Stream &ThenConvolveBackwardDataWithAlgorithm(
+      const dnn::FilterDescriptor &filter_descriptor,
+      const DeviceMemory<double> &filter_data,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::BatchDescriptor &input_descriptor,
+      DeviceMemory<double> *backward_input_data,
+      ScratchAllocator *scratch_allocator,
+      const dnn::AlgorithmConfig &algorithm_config,
+      dnn::ProfileResult *output_profile_result);
+
   Stream &ThenConvolveBackwardDataWithAlgorithm(
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<float> &filter_data,
@@ -529,6 +552,18 @@ class Stream {
       DeviceMemory<Eigen::half> *backward_filter_data,
       ScratchAllocator *scratch_allocator);
 
+  Stream &ThenConvolveBackwardFilterWithAlgorithm(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<double> &input_data,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<double> backward_output_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::FilterDescriptor &filter_descriptor,
+      DeviceMemory<double> *backward_filter_data,
+      ScratchAllocator *scratch_allocator,
+      const dnn::AlgorithmConfig &algorithm_config,
+      dnn::ProfileResult *output_profile_result);
+
   Stream &ThenConvolveBackwardFilterWithAlgorithm(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<float> &input_data,
@@ -1388,50 +1423,53 @@ class Stream {
   // See BlasSupport::DoBlasGemmWithAlgorithm.
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a,
-      int lda, const DeviceMemory<Eigen::half> &b, int ldb,
-      const Eigen::half &beta, DeviceMemory<Eigen::half> *c, int ldc,
-      blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+      uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+      const DeviceMemory<Eigen::half> &a, int lda,
+      const DeviceMemory<Eigen::half> &b, int ldb,
+      const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+      int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
-  Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
-                                    blas::Transpose transb, uint64 m, uint64 n,
-                                    uint64 k, int alpha,
-                                    const DeviceMemory<int8> &a, int lda,
-                                    const DeviceMemory<int8> &b, int ldb,
-                                    int beta, DeviceMemory<int> *c, int ldc,
-                                    blas::ComputationType computation_type,
-                                    blas::AlgorithmType algorithm,
-                                    blas::ProfileResult *output_profile_result);
-  Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
-                                    blas::Transpose transb, uint64 m, uint64 n,
-                                    uint64 k, float alpha,
-                                    const DeviceMemory<float> &a, int lda,
-                                    const DeviceMemory<float> &b, int ldb,
-                                    float beta, DeviceMemory<float> *c, int ldc,
-                                    blas::ComputationType computation_type,
-                                    blas::AlgorithmType algorithm,
-                                    blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-      const DeviceMemory<double> &b, int ldb, double beta,
-      DeviceMemory<double> *c, int ldc, blas::ComputationType computation_type,
+      uint64 k, const HostOrDeviceScalar<int> &alpha,
+      const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b,
+      int ldb, const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c,
+      int ldc, blas::ComputationType computation_type,
       blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, std::complex<float> alpha,
+      uint64 k, const HostOrDeviceScalar<float> &alpha,
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+      int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
+      int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithAlgorithm(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, const HostOrDeviceScalar<double> &alpha,
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+      int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+      int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithAlgorithm(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
       const DeviceMemory<std::complex<float>> &a, int lda,
       const DeviceMemory<std::complex<float>> &b, int ldb,
-      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<float>> &beta,
+      DeviceMemory<std::complex<float>> *c, int ldc,
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, std::complex<double> alpha,
+      uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
       const DeviceMemory<std::complex<double>> &a, int lda,
       const DeviceMemory<std::complex<double>> &b, int ldb,
-      std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<double>> &beta,
+      DeviceMemory<std::complex<double>> *c, int ldc,
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
 
@@ -1767,7 +1805,8 @@ class Stream {
                          DeviceMemory<Eigen::half> *output_c_data,
                          bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1784,7 +1823,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<float> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1801,7 +1841,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<double> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   // Enqueue a backward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnBackward for more details.
@@ -1828,7 +1869,8 @@ class Stream {
       DeviceMemory<Eigen::half> *input_c_backprop_data,
       DeviceMemory<Eigen::half> *params_backprop_data,
       DeviceMemory<uint8> *reserve_space_data,
-      ScratchAllocator *workspace_allocator);
+      ScratchAllocator *workspace_allocator,
+      dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1852,7 +1894,8 @@ class Stream {
                           DeviceMemory<float> *input_c_backprop_data,
                           DeviceMemory<float> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1876,7 +1919,8 @@ class Stream {
                           DeviceMemory<double> *input_c_backprop_data,
                           DeviceMemory<double> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   // Enqueue onto the stream a operation that transforms a tensor.
   // See DnnSupport::DoTransformTensor for more details.
@@ -1933,16 +1977,15 @@ class Stream {
   // Entrains onto the stream a callback to the host (from the device).
   // Host callbacks block/occupy the stream just as device functions
   // (execute one at a time, block later stream operations).
+  //
   // Behavior is undefined when synchronizing using OpenCL user events.
   // Behavior is undefined if host callbacks call device routines or insert
   // them into any stream.
+  //
   // On certain platforms, ThenDoHostCallback is expected to have significant
   // negative effects on performance.
   Stream &ThenDoHostCallback(std::function<void()> callback);
 
-  // Identical to ThenDoHostCallback; only exposed for testing purposes.
-  Stream &ThenDoHostCallbackForTest(std::function<void()> callback);
-
   // Returns the StreamExecutor (parent object) associated with this stream.
   StreamExecutor *parent() const {
     CHECK(parent_ != nullptr);
@@ -2058,7 +2101,6 @@ struct Quantization<int32> {
       dnn::QuantizedActivationMode::k32Bit;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_H_
diff --git a/tensorflow/stream_executor/stream_executor.h b/tensorflow/stream_executor/stream_executor.h
index 2995dccf469a2977b9593f2045062d568e569dc5..d63d485df5668a5cbb5c61525401947bebeab616 100644
--- a/tensorflow/stream_executor/stream_executor.h
+++ b/tensorflow/stream_executor/stream_executor.h
@@ -35,4 +35,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"  // IWYU pragma: export
 #include "tensorflow/stream_executor/timer.h"  // IWYU pragma: export
 
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 273d970b6fa4a581381689191b183a30f4f2bcd3..8297228e6fecddffa8fc68a1a028456dc8e75a65 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // -- CUDA
@@ -38,5 +37,4 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 StreamExecutorFactory MakeHostExecutorImplementation;
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 37ef182e1445a85dd0a97eac02ba064a26dc0f1d..2584c92f0c5a1129e2f10aa7148161a8d2d40c50 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -45,8 +45,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/trace_listener.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 class Timer;
@@ -343,7 +342,6 @@ extern StreamExecutorFactory MakeHostExecutorImplementation;
 
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index afca1c2e597b55b1b8d0b76d4e79995d6f6af822..20579790ef4832cedec5bf1e80a2245ec2646be6 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -39,8 +39,7 @@ namespace {
 bool FLAGS_check_device_leaks = false;
 }  // namespace
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace {
 
 string StackTraceIfVLOG10() {
@@ -305,6 +304,15 @@ bool StreamExecutor::GetConvolveAlgorithms(
                                             cc_minor, out_algorithms);
 }
 
+bool StreamExecutor::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc> *out_algorithms) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return false;
+  }
+  return dnn_support->GetRnnAlgorithms(out_algorithms);
+}
+
 bool StreamExecutor::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused,
     std::vector<dnn::AlgorithmDesc> *out_algorithms) {
@@ -342,9 +350,10 @@ bool StreamExecutor::GetBlasGemmAlgorithms(
 
 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
 StreamExecutor::createRnnDescriptor(
-    int num_layers, int hidden_size, int input_size,
+    int num_layers, int hidden_size, int input_size, int batch_size,
     dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-    dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout, uint64 seed,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
     ScratchAllocator *state_allocator) {
   dnn::DnnSupport *dnn_support = AsDnn();
   if (!dnn_support) {
@@ -352,8 +361,9 @@ StreamExecutor::createRnnDescriptor(
                         "Fail to find the dnn implementation.");
   }
   return dnn_support->createRnnDescriptor(
-      num_layers, hidden_size, input_size, input_mode, direction_mode, rnn_mode,
-      data_type, dropout, seed, state_allocator);
+      num_layers, hidden_size, input_size, batch_size, input_mode,
+      direction_mode, rnn_mode, data_type, algorithm_config, dropout, seed,
+      state_allocator);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
@@ -778,5 +788,4 @@ internal::StreamExecutorInterface *StreamExecutor::implementation() {
   return implementation_->GetUnderlyingExecutor();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index a2a77218cbbafeeb9d4d8ca04b2e0a8a5024ebf9..ab6b00f6601b5f0fae02255facf5fb182c4c6d64 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -37,8 +37,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Structure used for device memory leak checking.
 struct AllocRecord {
@@ -95,7 +94,7 @@ class StreamExecutor {
   // Parameters:
   //   spec: The MultiKernelLoaderSpec is usually generated as a compile-time
   //    constant into an appropriate namespace. For example, see
-  //    perftools::gputools::executor_sample::kKernelLoaderSpecs, from which a
+  //    stream_executor::executor_sample::kKernelLoaderSpecs, from which a
   //    MultiKernelLoaderSpec is selected.
   //   kernel: Outparam that the kernel is loaded into. A given Kernel
   //    instantiation should not be loaded into more than once.
@@ -349,10 +348,14 @@ class StreamExecutor {
   // platform that underlies this interface.
   bool SupportsDnn() const;
 
-  // Get the list of supported algorithms for the forward convolution opeartion.
+  // Returns the list of supported algorithms for the forward convolution
+  // operation.
   bool GetConvolveAlgorithms(bool with_winograd_nonfused,
                              std::vector<dnn::AlgorithmDesc> *out_algorithms);
 
+  // Returns the list of supported algorithms for rnn operation.
+  bool GetRnnAlgorithms(std::vector<dnn::AlgorithmDesc> *out_algorithms);
+
   // Get the list of supported algorithms for the backward convolution on data.
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused,
@@ -370,10 +373,11 @@ class StreamExecutor {
   // Create an RNN descriptor based on model shapes and configurations.
   // The caller retains the ownership of the descriptor.
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
-      int num_layers, int hidden_size, int input_size,
+      int num_layers, int hidden_size, int input_size, int batch_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator *state_allocator);
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator *state_allocator);
 
   // Create a RNN sequence descriptor that specifies either the input or output
   // sequence. The caller retains the ownership of the returned descriptor.
@@ -798,7 +802,6 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
   return *this;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_PIMPL_H_
diff --git a/tensorflow/stream_executor/temporary_device_memory.cc b/tensorflow/stream_executor/temporary_device_memory.cc
index c33166b2246b51ac045349d9e2c0794a3f3868a1..f113ce9be57b9bd954df0273fa6f9af1dc7d9546 100644
--- a/tensorflow/stream_executor/temporary_device_memory.cc
+++ b/tensorflow/stream_executor/temporary_device_memory.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 TemporaryDeviceMemoryBase::~TemporaryDeviceMemoryBase() {
   parent_->temporary_memory_manager()->MarkFinalized(device_memory_,
@@ -64,5 +63,4 @@ TemporaryDeviceMemoryBase::TemporaryDeviceMemoryBase(
   DCHECK(IsAllocated());
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/temporary_device_memory.h b/tensorflow/stream_executor/temporary_device_memory.h
index 2255e7ffd716d8f6d6aad2c447e05c6dfca766df..77be8599a2de59f9a8287337210ad0c6bf76ece7 100644
--- a/tensorflow/stream_executor/temporary_device_memory.h
+++ b/tensorflow/stream_executor/temporary_device_memory.h
@@ -43,8 +43,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 namespace internal {
@@ -132,7 +131,6 @@ class TemporaryDeviceMemory : public TemporaryDeviceMemoryBase {
   }
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/temporary_memory_manager.cc b/tensorflow/stream_executor/temporary_memory_manager.cc
index 449ab7d3f0bff5dff0c55b716753ca0befd4062f..420dbb0933db3adf69a425f6223250e66f960261 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.cc
+++ b/tensorflow/stream_executor/temporary_memory_manager.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 void TemporaryMemoryManager::ForceDeallocateAll() {
@@ -124,5 +123,4 @@ TemporaryMemoryManager::AllocateArrayBase(uint64 element_count,
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/temporary_memory_manager.h b/tensorflow/stream_executor/temporary_memory_manager.h
index 2e6fbd9d62a041a5b4f2d2c8111d30ba0da69ce5..faf13380dc20afa9675a1b8d51aad3906d9c7bbe 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.h
+++ b/tensorflow/stream_executor/temporary_memory_manager.h
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // Record used inside the TemporaryMemoryManager as metadata for a given device
@@ -147,7 +146,6 @@ TemporaryMemoryManager::AllocateArray(uint64 element_count) {
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_MEMORY_MANAGER_H_
diff --git a/tensorflow/stream_executor/timer.cc b/tensorflow/stream_executor/timer.cc
index 41d7e4359d4145b34d094a366824501d5a606940..a29791a1049d57cbd85604e4bfce154b57155009 100644
--- a/tensorflow/stream_executor/timer.cc
+++ b/tensorflow/stream_executor/timer.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 Timer::Timer(StreamExecutor *parent)
     : parent_(parent),
@@ -34,5 +33,4 @@ uint64 Timer::Microseconds() const { return implementation_->Microseconds(); }
 
 uint64 Timer::Nanoseconds() const { return implementation_->Nanoseconds(); }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/timer.h b/tensorflow/stream_executor/timer.h
index 0a37caa0f2fa0b5f489d1e495368c807df70f51f..fba7dd8f589ad49685bdf5fd8faa1ee18fb55113 100644
--- a/tensorflow/stream_executor/timer.h
+++ b/tensorflow/stream_executor/timer.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class TimerInterface;
@@ -69,7 +68,6 @@ class Timer {
   SE_DISALLOW_COPY_AND_ASSIGN(Timer);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TIMER_H_
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index d1e87c348b1f867009fdb6b741d984b2f58cef21..0e874a1d47b4daa12b501f87ad1c84afe84b0b6b 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -69,7 +68,6 @@ class TraceListener {
                                           const port::Status* result) {}
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 818d67f7b5be1e8f2db66b24976a529b361a4990..e5cc886b3251f9dcbf8b119a7d237f736ecb48a0 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -22,6 +22,7 @@ load(
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
+    "if_mkl_lnx_x64"
 )
 
 def register_extension_info(**kwargs):
@@ -34,22 +35,27 @@ def src_to_test_name(src):
   return src.replace("/", "_").split(".")[0]
 
 def full_path(relative_paths):
-  return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
+  return [native.package_name() + "/" + relative for relative in relative_paths]
+
+def _add_tfcore_prefix(src):
+  if src.startswith("//"):
+    return src
+  return "//tensorflow/core:" + src
 
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
   return [
-      "//tensorflow/core:" + p for p in core_proto_sources_relative
+      _add_tfcore_prefix(p) for p in core_proto_sources_relative
   ]
 
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
   return ([
-      "//tensorflow/core/" + p.replace(".proto", ".pb.h")
+      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h")
       for p in core_proto_sources_relative
   ] + [
-      "//tensorflow/core/" + p.replace(".proto", ".proto.h")
+      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h")
       for p in core_proto_sources_relative
   ])
 
@@ -162,7 +168,6 @@ def if_override_eigen_strong_inline(a):
 
 def get_win_copts(is_external=False):
     WINDOWS_COPTS = [
-        "/D__VERSION__=\\\"MSVC\\\"",
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
         "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
@@ -202,7 +207,8 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
           "-ftemplate-depth=900"])
       + if_cuda(["-DGOOGLE_CUDA=1"])
       + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
-      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
+      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"])
+      + if_mkl_lnx_x64(["-fopenmp"])
       + if_android_arm(["-mfpu=neon"])
       + if_linux_x86_64(["-msse3"])
       + if_ios_x86_64(["-msse4.1"])
@@ -265,7 +271,7 @@ def _rpath_linkopts(name):
   # deployed. Other shared object dependencies (e.g. shared between contrib/
   # ops) are picked up as long as they are in either the same or a parent
   # directory in the tensorflow/ tree.
-  levels_to_root = PACKAGE_NAME.count("/") + name.count("/")
+  levels_to_root = native.package_name().count("/") + name.count("/")
   return select({
       clean_dep("//tensorflow:darwin"): [
           "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
@@ -302,6 +308,7 @@ def tf_cc_shared_object(
           clean_dep("//tensorflow:darwin"): [
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
+          clean_dep("//tensorflow:windows"): [],
           "//conditions:default": [
               "-Wl,-soname," + name.split("/")[-1],
           ],
@@ -340,6 +347,22 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}.*",
 )
 
+# A simple wrap around native.cc_binary rule.
+# When using this rule, you should realize it doesn't link to any tensorflow
+# dependencies by default.
+def tf_native_cc_binary(name,
+                        copts=tf_copts(),
+                        **kwargs):
+  native.cc_binary(
+      name=name,
+      copts=copts,
+      **kwargs)
+
+register_extension_info(
+    extension_name = "tf_native_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
+
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
                          pkg="",
@@ -486,7 +509,9 @@ def tf_gen_op_wrappers_cc(name,
 #   hidden: Optional list of ops names to make private in the Python module.
 #     It is invalid to specify both "hidden" and "op_whitelist".
 #   visibility: passed to py_library.
-#   deps: list of dependencies for the generated target.
+#   deps: list of dependencies for the intermediate tool used to generate the
+#     python target. NOTE these `deps` are not applied to the final python
+#     library target itself.
 #   require_shape_functions: leave this as False.
 #   hidden_file: optional file that contains a list of op names to make private
 #     in the generated Python module. Each op name should be on a line by
@@ -620,9 +645,12 @@ def tf_cc_test(name,
       linkopts=select({
         clean_dep("//tensorflow:android"): [
             "-pie",
-          ],
+        ],
         clean_dep("//tensorflow:windows"): [],
         clean_dep("//tensorflow:windows_msvc"): [],
+        clean_dep("//tensorflow:darwin"): [
+            "-lm",
+        ],
         "//conditions:default": [
             "-lpthread",
             "-lm"
@@ -788,7 +816,33 @@ def tf_cc_test_mkl(srcs,
                    tags=[],
                    size="medium",
                    args=None):
-  if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
+  for src in srcs:
+    native.cc_test(
+      name=src_to_test_name(src),
+      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
+      copts=tf_copts(),
+      linkopts=select({
+        clean_dep("//tensorflow:android"): [
+            "-pie",
+          ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
+        "//conditions:default": [
+            "-lpthread",
+            "-lm"
+        ],
+      }) + _rpath_linkopts(src_to_test_name(src)),
+      deps=deps + if_mkl(
+          [
+              "//third_party/mkl:intel_binary_blob",
+          ],
+      ),
+      linkstatic=linkstatic,
+      tags=tags,
+      size=size,
+      args=args,
+      nocopts="-fno-exceptions")
+
 
 def tf_cc_tests_gpu(srcs,
                     deps,
@@ -905,6 +959,15 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   if not cuda_deps:
     cuda_deps = []
 
+  if 'linkstatic' not in kwargs or kwargs['linkstatic'] != 1:
+    enable_text_relocation_linkopt = select({
+          clean_dep("//tensorflow:darwin"): [],
+          clean_dep("//tensorflow:windows"): [],
+          "//conditions:default": ['-Wl,-z,notext'],})
+    if 'linkopts' in kwargs:
+      kwargs['linkopts'] += enable_text_relocation_linkopt
+    else:
+      kwargs['linkopts'] = enable_text_relocation_linkopt
   native.cc_library(
       deps=deps + if_cuda(cuda_deps + [
           clean_dep("//tensorflow/core:cuda"),
@@ -998,16 +1061,12 @@ register_extension_info(
 def tf_mkl_kernel_library(name,
                           prefix=None,
                           srcs=None,
-                          gpu_srcs=None,
                           hdrs=None,
                           deps=None,
                           alwayslink=1,
                           copts=tf_copts(),
-                          nocopts="-fno-exceptions",
-                          **kwargs):
+                          nocopts="-fno-exceptions"):
   """A rule to build MKL-based TensorFlow kernel libraries."""
-  gpu_srcs = gpu_srcs  # unused argument
-  kwargs = kwargs  # unused argument
 
   if not bool(srcs):
     srcs = []
@@ -1020,16 +1079,15 @@ def tf_mkl_kernel_library(name,
     hdrs = hdrs + native.glob(
         [prefix + "*.h"])
 
-  if_mkl(
-      native.cc_library(
-          name=name,
-          srcs=srcs,
-          hdrs=hdrs,
-          deps=deps,
-          alwayslink=alwayslink,
-          copts=copts,
-          nocopts=nocopts
-      ))
+  native.cc_library(
+      name=name,
+      srcs=if_mkl(srcs),
+      hdrs=hdrs,
+      deps=deps,
+      alwayslink=alwayslink,
+      copts=copts,
+      nocopts=nocopts
+  )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
@@ -1158,22 +1216,6 @@ def transitive_hdrs(name, deps=[], **kwargs):
 # the libraries in deps.
 def cc_header_only_library(name, deps=[], includes=[], **kwargs):
   _transitive_hdrs(name=name + "_gather", deps=deps)
-
-  # We could generalize the following, but rather than complicate things
-  # here, we'll do the minimal use case for now, and hope bazel comes up
-  # with a better solution before too long.  We'd expect it to compute
-  # the right include path by itself, but it doesn't, possibly because
-  # _transitive_hdrs lost some information about the include path.
-  if "@nsync//:nsync_headers" in deps:
-    # Buiding tensorflow from @org_tensorflow finds this two up.
-    nsynch = "../../external/nsync/public"
-    # Building tensorflow from elsewhere finds it four up.
-    # Note that native.repository_name() is not yet available in TF's Kokoro.
-    if REPOSITORY_NAME != "@":
-      nsynch = "../../" + nsynch
-    includes = includes[:]
-    includes.append(nsynch)
-
   native.cc_library(name=name,
                     hdrs=[":" + name + "_gather"],
                     includes=includes,
@@ -1182,9 +1224,22 @@ def cc_header_only_library(name, deps=[], includes=[], **kwargs):
 def tf_custom_op_library_additional_deps():
   return [
       "@protobuf_archive//:protobuf_headers",
-      "@nsync//:nsync_headers",
       clean_dep("//third_party/eigen3"),
       clean_dep("//tensorflow/core:framework_headers_lib"),
+  ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
+
+# A list of targets that contains the implemenation of
+# tf_custom_op_library_additional_deps. It's used to generate a DEF file for
+# exporting symbols from _pywrap_tensorflow.dll on Windows.
+def tf_custom_op_library_additional_deps_impl():
+  return [
+      "@protobuf_archive//:protobuf",
+      "@nsync//:nsync_cpp",
+      # for //third_party/eigen3
+      clean_dep("//third_party/eigen3"),
+      # for //tensorflow/core:framework_headers_lib
+      clean_dep("//tensorflow/core:framework"),
+      clean_dep("//tensorflow/core:reader_base"),
   ]
 
 # Traverse the dependency graph along the "deps" attribute of the
@@ -1271,6 +1326,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
       deps=deps + if_cuda(cuda_deps),
       data=[name + "_check_deps"],
       copts=tf_copts(is_external=True),
+      features = ["windows_export_all_symbols"],
       linkopts=linkopts + select({
           "//conditions:default": [
               "-lm",
@@ -1417,7 +1473,8 @@ def tf_py_wrap_cc(name,
       ]) + tf_extension_copts()),
       linkopts=tf_extension_linkopts() + extra_linkopts,
       linkstatic=1,
-      deps=deps + extra_deps)
+      deps=deps + extra_deps,
+      **kwargs)
   native.genrule(
       name="gen_" + cc_library_pyd_name,
       srcs=[":" + cc_library_name],
@@ -1622,22 +1679,36 @@ def cuda_py_tests(name,
 #
 # Return a struct with fields (hdrs, srcs) containing the names of the
 # generated files.
-def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
+def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps=[], deps=[], visibility=None):
   out_hdrs = (
       [p.replace(".proto", ".pb_text.h")
        for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
   out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
   native.genrule(
-      name=name,
-      srcs=srcs + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
+      name=name + "_srcs",
+      srcs=srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
       outs=out_hdrs + out_srcs,
+      visibility=visibility,
       cmd=
       "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) "
       + "$(@D) " + srcs_relative_dir + " $(SRCS)",
       tools=[
           clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions")
       ],)
-  return struct(hdrs=out_hdrs, srcs=out_srcs)
+
+  native.filegroup(
+      name=name + "_hdrs",
+      srcs=out_hdrs,
+      visibility=visibility,
+  )
+
+  native.cc_library(
+      name=name,
+      srcs=out_srcs,
+      hdrs=out_hdrs,
+      visibility=visibility,
+      deps = deps,
+  )
 
 def tf_genrule_cmd_append_to_srcs(to_append):
   return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
@@ -1653,7 +1724,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index e731127a63d792825e15a4b95379517117edebb0..a1c569951e99162c8048b7b760c25df7b2f29420 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -1,27 +1,16 @@
 # Description:
 # Scripts used to generate TensorFlow Python API.
+
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_binary(
     name = "create_python_api",
     srcs = ["create_python_api.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:tensorflow_py",
+        "//tensorflow/python",
     ],
 )
 
@@ -43,6 +32,7 @@ genrule(
     # api/module1/module2/__init__.py and api/module3/__init__.py.
     # keep sorted
     outs = [
+        # BEGIN GENERATED FILES
         "api/__init__.py",
         "api/app/__init__.py",
         "api/bitwise/__init__.py",
@@ -80,6 +70,7 @@ genrule(
         "api/keras/datasets/boston_housing/__init__.py",
         "api/keras/datasets/cifar10/__init__.py",
         "api/keras/datasets/cifar100/__init__.py",
+        "api/keras/datasets/fashion_mnist/__init__.py",
         "api/keras/datasets/imdb/__init__.py",
         "api/keras/datasets/mnist/__init__.py",
         "api/keras/datasets/reuters/__init__.py",
@@ -102,6 +93,8 @@ genrule(
         "api/linalg/__init__.py",
         "api/logging/__init__.py",
         "api/losses/__init__.py",
+        "api/manip/__init__.py",
+        "api/math/__init__.py",
         "api/metrics/__init__.py",
         "api/nn/__init__.py",
         "api/nn/rnn_cell/__init__.py",
@@ -124,6 +117,8 @@ genrule(
         "api/test/__init__.py",
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
+        "api/user_ops/__init__.py",
+        # END GENERATED FILES
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
@@ -133,7 +128,9 @@ py_library(
     name = "python_api",
     srcs = [":python_api_gen"],
     srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/contrib:contrib_py",  # keep
+        "//tensorflow/python",  # keep
     ],
 )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 1557314939bd85c0467426216f90aa3891ca0ac0..c06a39bfbdf06cb478007a884f38728755845f19 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -23,15 +23,13 @@ import collections
 import os
 import sys
 
-# This import is needed so that we can traverse over TensorFlow modules.
-import tensorflow as tf  # pylint: disable=unused-import
 from tensorflow.python.util import tf_decorator
 
 
 _API_CONSTANTS_ATTR = '_tf_api_constants'
 _API_NAMES_ATTR = '_tf_api_names'
 _API_DIR = '/api/'
-_CONTRIB_IMPORT = 'from tensorflow import contrib'
+_OUTPUT_MODULE = 'tensorflow.tools.api.generator.api'
 _GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
 
 This file is MACHINE GENERATED! Do not edit.
@@ -40,6 +38,11 @@ Generated by: tensorflow/tools/api/generator/create_python_api.py script.
 """
 
 
+class SymbolExposedTwiceError(Exception):
+  """Raised when different symbols are exported with the same name."""
+  pass
+
+
 def format_import(source_module_name, source_name, dest_name):
   """Formats import statement.
 
@@ -64,22 +67,101 @@ def format_import(source_module_name, source_name, dest_name):
       return 'import %s as %s' % (source_name, dest_name)
 
 
-def get_api_imports():
-  """Get a map from destination module to formatted imports.
+class _ModuleInitCodeBuilder(object):
+  """Builds a map from module name to imports included in that module."""
+
+  def __init__(self):
+    self.module_imports = collections.defaultdict(
+        lambda: collections.defaultdict(set))
+    self._dest_import_to_id = collections.defaultdict(int)
+    # Names that start with underscore in the root module.
+    self._underscore_names_in_root = []
+
+  def add_import(
+      self, symbol_id, dest_module_name, source_module_name, source_name,
+      dest_name):
+    """Adds this import to module_imports.
+
+    Args:
+      symbol_id: (number) Unique identifier of the symbol to import.
+      dest_module_name: (string) Module name to add import to.
+      source_module_name: (string) Module to import from.
+      source_name: (string) Name of the symbol to import.
+      dest_name: (string) Import the symbol using this name.
+
+    Raises:
+      SymbolExposedTwiceError: Raised when an import with the same
+        dest_name has already been added to dest_module_name.
+    """
+    import_str = format_import(source_module_name, source_name, dest_name)
+
+    # Check if we are trying to expose two different symbols with same name.
+    full_api_name = dest_name
+    if dest_module_name:
+      full_api_name = dest_module_name + '.' + full_api_name
+    if (full_api_name in self._dest_import_to_id and
+        symbol_id != self._dest_import_to_id[full_api_name] and
+        symbol_id != -1):
+      raise SymbolExposedTwiceError(
+          'Trying to export multiple symbols with same name: %s.' %
+          full_api_name)
+    self._dest_import_to_id[full_api_name] = symbol_id
+
+    if not dest_module_name and dest_name.startswith('_'):
+      self._underscore_names_in_root.append(dest_name)
+
+    # The same symbol can be available in multiple modules.
+    # We store all possible ways of importing this symbol and later pick just
+    # one.
+    self.module_imports[dest_module_name][full_api_name].add(import_str)
+
+  def build(self):
+    """Get a map from destination module to __init__.py code for that module.
+
+    Returns:
+      A dictionary where
+        key: (string) destination module (for e.g. tf or tf.consts).
+        value: (string) text that should be in __init__.py files for
+          corresponding modules.
+    """
+    module_text_map = {}
+    for dest_module, dest_name_to_imports in self.module_imports.items():
+      # Sort all possible imports for a symbol and pick the first one.
+      imports_list = [
+          sorted(imports)[0]
+          for _, imports in dest_name_to_imports.items()]
+      module_text_map[dest_module] = '\n'.join(sorted(imports_list))
+
+    # Expose exported symbols with underscores in root module
+    # since we import from it using * import.
+    underscore_names_str = ', '.join(
+        '\'%s\'' % name for name in self._underscore_names_in_root)
+    module_text_map[''] += '''
+_names_with_underscore = [%s]
+__all__ = [s for s in dir() if not s.startswith('_')]
+__all__.extend([s for s in _names_with_underscore])
+''' % underscore_names_str
+
+    return module_text_map
+
+
+def get_api_init_text():
+  """Get a map from destination module to __init__.py code for that module.
 
   Returns:
     A dictionary where
       key: (string) destination module (for e.g. tf or tf.consts).
-      value: List of strings representing module imports
-          (for e.g. 'from foo import bar') and constant
-          assignments (for e.g. 'FOO = 123').
+      value: (string) text that should be in __init__.py files for
+        corresponding modules.
   """
-  module_imports = collections.defaultdict(list)
+  module_code_builder = _ModuleInitCodeBuilder()
+
   # Traverse over everything imported above. Specifically,
   # we want to traverse over TensorFlow Python modules.
   for module in sys.modules.values():
     # Only look at tensorflow modules.
-    if not module or 'tensorflow.' not in module.__name__:
+    if (not module or not hasattr(module, '__name__') or
+        'tensorflow.' not in module.__name__):
       continue
     # Do not generate __init__.py files for contrib modules for now.
     if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
@@ -92,43 +174,44 @@ def get_api_imports():
       if module_contents_name == _API_CONSTANTS_ATTR:
         for exports, value in attr:
           for export in exports:
-            names = ['tf'] + export.split('.')
+            names = export.split('.')
             dest_module = '.'.join(names[:-1])
-            import_str = format_import(module.__name__, value, names[-1])
-            module_imports[dest_module].append(import_str)
+            module_code_builder.add_import(
+                -1, dest_module, module.__name__, value, names[-1])
         continue
 
       _, attr = tf_decorator.unwrap(attr)
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
       if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
-        # The same op might be accessible from multiple modules.
-        # We only want to consider location where function was defined.
-        if attr.__module__ != module.__name__:
-          continue
-
         for export in attr._tf_api_names:  # pylint: disable=protected-access
-          names = ['tf'] + export.split('.')
+          names = export.split('.')
           dest_module = '.'.join(names[:-1])
-          import_str = format_import(
-              module.__name__, module_contents_name, names[-1])
-          module_imports[dest_module].append(import_str)
+          module_code_builder.add_import(
+              id(attr), dest_module, module.__name__, module_contents_name,
+              names[-1])
 
   # Import all required modules in their parent modules.
-  # For e.g. if we import 'tf.foo.bar.Value'. Then, we also
-  # import 'bar' in 'tf.foo'.
-  dest_modules = set(module_imports.keys())
-  for dest_module in dest_modules:
-    dest_module_split = dest_module.split('.')
-    for dest_submodule_index in range(1, len(dest_module_split)):
-      dest_submodule = '.'.join(dest_module_split[:dest_submodule_index])
-      submodule_import = format_import(
-          '', dest_module_split[dest_submodule_index],
-          dest_module_split[dest_submodule_index])
-      if submodule_import not in module_imports[dest_submodule]:
-        module_imports[dest_submodule].append(submodule_import)
-
-  return module_imports
+  # For e.g. if we import 'foo.bar.Value'. Then, we also
+  # import 'bar' in 'foo'.
+  imported_modules = set(module_code_builder.module_imports.keys())
+  for module in imported_modules:
+    if not module:
+      continue
+    module_split = module.split('.')
+    parent_module = ''  # we import submodules in their parent_module
+
+    for submodule_index in range(len(module_split)):
+      import_from = _OUTPUT_MODULE
+      if submodule_index > 0:
+        parent_module += ('.' + module_split[submodule_index-1] if parent_module
+                          else module_split[submodule_index-1])
+        import_from += '.' + parent_module
+      module_code_builder.add_import(
+          -1, parent_module, import_from,
+          module_split[submodule_index], module_split[submodule_index])
+
+  return module_code_builder.build()
 
 
 def create_api_files(output_files):
@@ -144,16 +227,19 @@ def create_api_files(output_files):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
+    # Convert path separators to '/' for easier parsing below.
+    normalized_output_file = output_file.replace(os.sep, '/')
     if _API_DIR not in output_file:
       raise ValueError(
           'Output files must be in api/ directory, found %s.' % output_file)
     # Get the module name that corresponds to output_file.
     # First get module directory under _API_DIR.
     module_dir = os.path.dirname(
-        output_file[output_file.rfind(_API_DIR)+len(_API_DIR):])
-    # Convert / to . and prefix with tf.
-    module_name = '.'.join(['tf', module_dir.replace('/', '.')]).strip('.')
-    module_name_to_file_path[module_name] = output_file
+        normalized_output_file[
+            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
+    # Convert / to .
+    module_name = module_dir.replace('/', '.').strip('.')
+    module_name_to_file_path[module_name] = os.path.normpath(output_file)
 
   # Create file for each expected output in genrule.
   for module, file_path in module_name_to_file_path.items():
@@ -161,21 +247,19 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_imports = get_api_imports()
-  module_imports['tf'].append(_CONTRIB_IMPORT)  # Include all of contrib.
+  module_text_map = get_api_init_text()
 
   # Add imports to output files.
   missing_output_files = []
-  for module, exports in module_imports.items():
+  for module, text in module_text_map.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
-      module_without_tf = module[len('tf.'):]
       module_file_path = '"api/%s/__init__.py"' %  (
-          module_without_tf.replace('.', '/'))
+          module.replace('.', '/'))
       missing_output_files.append(module_file_path)
       continue
     with open(module_name_to_file_path[module], 'w') as fp:
-      fp.write(_GENERATED_FILE_HEADER + '\n'.join(exports))
+      fp.write(_GENERATED_FILE_HEADER + text)
 
   if missing_output_files:
     raise ValueError(
@@ -192,6 +276,16 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
-      help='Python files that we expect this script to output.')
+      help='If a single file is passed in, then we we assume it contains a '
+      'semicolon-separated list of Python files that we expect this script to '
+      'output. If multiple files are passed in, then we assume output files '
+      'are listed directly as arguments.')
   args = parser.parse_args()
-  main(args.outputs)
+  if len(args.outputs) == 1:
+    # If we only get a single argument, then it must be a file containing
+    # list of outputs.
+    with open(args.outputs[0]) as output_list_file:
+      outputs = [line.strip() for line in output_list_file.read().split(';')]
+  else:
+    outputs = args.outputs
+  main(outputs)
diff --git a/tensorflow/tools/api/generator/create_python_api_test.py b/tensorflow/tools/api/generator/create_python_api_test.py
index 2760779e6e0a909cb077f534db40710ab6a11b32..218c8120453c8dca6e81146eb06e8243a3cd424d 100644
--- a/tensorflow/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/tools/api/generator/create_python_api_test.py
@@ -56,7 +56,7 @@ class CreatePythonApiTest(test.TestCase):
     del sys.modules[_MODULE_NAME]
 
   def testFunctionImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = (
         'from test.tensorflow.test_module import test_op as test_op1')
     self.assertTrue(
@@ -69,14 +69,14 @@ class CreatePythonApiTest(test.TestCase):
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testClassImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = 'from test.tensorflow.test_module import TestClass'
     self.assertTrue(
         'TestClass' in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testConstantIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected = 'from test.tensorflow.test_module import _TEST_CONSTANT'
     self.assertTrue(expected in str(imports),
                     msg='%s not in %s' % (expected, str(imports)))
diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD
index 08436396a6c04a59461b6800b908c29aabb91a1b..ebdf42df2c01a60b1cadd0368647adc4121db7ef 100644
--- a/tensorflow/tools/api/golden/BUILD
+++ b/tensorflow/tools/api/golden/BUILD
@@ -10,15 +10,3 @@ filegroup(
     name = "api_golden",
     srcs = glob(["*.pbtxt"]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7405202b892bba67a36d86cd43fb7a67ab3be947
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.GradientTape"
+tf_class {
+  is_instance: "<class \'tensorflow.python.eager.backprop.GradientTape\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'persistent\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "gradient"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "watch"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "watched_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
index 75361803a3991f380d6be2485cfd3d05fd1572e1..cdaeb55e30865e082054085f47d6a071ebf3affd 100644
--- a/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
@@ -130,6 +130,10 @@ tf_class {
     name: "prevent_fetching"
     argspec: "args=[\'self\', \'op\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "switch_to_thread_local"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "unique_name"
     argspec: "args=[\'self\', \'name\', \'mark_as_used\'], varargs=None, keywords=None, defaults=[\'True\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
index 42de5c0c80023ad5bd7f33a564780060998307c1..cbbd077c97ba7f304e13f7883254f5a12668255e 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
@@ -64,7 +64,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
index e2fc8d6cb1d318cc50828f22e8e575cc28c7aaad..9a56ae8675c4fdae7424859fb5105574d83da494 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -65,7 +65,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
index 9770389e5ef1e29a80ae1da2725d9862f6521ff9..e5ec824bb8959f77f99ce3527f168bacf8de200a 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filenames\', \'compression_type\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filenames\', \'compression_type\', \'buffer_size\', \'num_parallel_reads\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -65,7 +65,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
index 7263230c1c7182bb812cb2e433aedd415bcd16c7..008239789c788692a36445dc2c51e26113b6a0c5 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
@@ -65,7 +65,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
deleted file mode 100644
index 11565bd3e4178202fa82e2e079d1035190dbd6ec..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
+++ /dev/null
@@ -1,65 +0,0 @@
-path: "tensorflow.distributions.bijectors.Bijector"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_ndims"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_parents"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_constant_jacobian"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'event_ndims\', \'graph_parents\', \'is_constant_jacobian\', \'validate_args\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "forward"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
-  }
-  member_method {
-    name: "forward_event_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "forward_event_shape_tensor"
-    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "forward_log_det_jacobian"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
-  }
-  member_method {
-    name: "inverse"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
-  }
-  member_method {
-    name: "inverse_event_shape"
-    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "inverse_event_shape_tensor"
-    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "inverse_log_det_jacobian"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
deleted file mode 100644
index 1e5fe624eb838e188594d03b656c12890db344a1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
+++ /dev/null
@@ -1,66 +0,0 @@
-path: "tensorflow.distributions.bijectors.Identity"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.identity_bijector.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_ndims"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_parents"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_constant_jacobian"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'validate_args\', \'event_ndims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'identity\'], "
-  }
-  member_method {
-    name: "forward"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
-  }
-  member_method {
-    name: "forward_event_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "forward_event_shape_tensor"
-    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "forward_log_det_jacobian"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
-  }
-  member_method {
-    name: "inverse"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
-  }
-  member_method {
-    name: "inverse_event_shape"
-    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "inverse_event_shape_tensor"
-    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "inverse_log_det_jacobian"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
deleted file mode 100644
index 1d0144f36ec332740889dc8caa5add8f41960d92..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.distributions.bijectors"
-tf_module {
-  member {
-    name: "Bijector"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-  member {
-    name: "Identity"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
index 2fba7c506ed9d2490e7c19c1746d3f4e9645424f..90b60ef074dd2eaf911291e6c725b98e2891e728 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
@@ -68,10 +68,6 @@ tf_module {
     name: "Uniform"
     mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
   }
-  member {
-    name: "bijectors"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "kl_divergence"
     argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..53a903c239b5fccb5e70d0a93d5af08245f4dc46
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BoostedTreesClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba17c90de28899683ee5bd992cb5516b01856280
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BoostedTreesRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 091b1be0c83480757445542acb97e139bd74ef03..c8da55d8021b7659446d0771a089b7b605d86c4f 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "cluster_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "device_fn"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
@@ -78,9 +82,13 @@ tf_class {
     name: "tf_random_seed"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "train_distribute"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
index a16e3aedae96e7289e73c49ac7890550dd5ddb08..5301b94eb361251a1cb4d02a5d8168f7c8191045 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.VocabInfo"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.VocabInfo\'>"
-  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.VocabInfo\'>"
+  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
+  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "backup_initializer"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
index afdd6bb058353594415cd1abe726070f84ae46b6..43f5343359aff3b856a2b3708e4cda7cec29e146 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.WarmStartSettings"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.WarmStartSettings\'>"
-  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.WarmStartSettings\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "ckpt_to_initialize_from"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fe92643bf9867765499d7bf475b9cdd1686aec5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.export.TensorServingInputReceiver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.export.export.TensorServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow.python.estimator.export.export.TensorServingInputReceiver\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "features"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "receiver_tensors"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "receiver_tensors_alternatives"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
index 4d0dddb3bc0305a28fab0c95c31e4869f5db0aa8..bd72f6cd79f7dffb9f0a7f8ae43751c4ecba939d 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "ServingInputReceiver"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorServingInputReceiver"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "build_parsing_serving_input_receiver_fn"
     argspec: "args=[\'feature_spec\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index a7a6cc1e49ddfe07569dff035e38931a0510addd..4946f2c51a62af85d61b8e38e982c59dd0d61e36 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -8,6 +8,14 @@ tf_module {
     name: "BaselineRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BoostedTreesClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BoostedTreesRegressor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index bda1c2bf85977e69b0969bc8b6056710d88ca910..3fc64dae888012169af3ea7695154b73f24d90c8 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -100,6 +100,10 @@ tf_module {
     name: "hsv_to_rgb"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "image_gradients"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_jpeg"
     argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -116,6 +120,10 @@ tf_module {
     name: "per_image_standardization"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "psnr"
+    argspec: "args=[\'a\', \'b\', \'max_val\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "random_brightness"
     argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -188,6 +196,18 @@ tf_module {
     name: "sample_distorted_bounding_box"
     argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "sobel_edges"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ssim"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ssim_multiscale"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\'], "
+  }
   member_method {
     name: "total_variation"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/tensorflow.initializers.pbtxt
index 21a0f84d22fc2d06e551c9a709f3963e812333b8..eaf0036cacfadce335a84bcf61f47f9d360be7e2 100644
--- a/tensorflow/tools/api/golden/tensorflow.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.initializers.pbtxt
@@ -1,17 +1,9 @@
 path: "tensorflow.initializers"
 tf_module {
-  member {
-    name: "absolute_import"
-    mtype: "<type \'instance\'>"
-  }
   member {
     name: "constant"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "division"
-    mtype: "<type \'instance\'>"
-  }
   member {
     name: "identity"
     mtype: "<type \'type\'>"
@@ -24,10 +16,6 @@ tf_module {
     name: "orthogonal"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "print_function"
-    mtype: "<type \'instance\'>"
-  }
   member {
     name: "random_normal"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index a13bfe0a920ce13dd9a91f106c9cbcbd185b0cc7..cee76bdc1db69d2f61566ad3cc4d52a226539e04 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.Model"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
-  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -140,7 +131,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "compile"
@@ -248,7 +239,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
@@ -260,7 +251,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -272,6 +263,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index fb6c8d70dd43eae60ea2fb86f0fc63c36d2b13ad..02718cb5f9e3ca0422647f20fbde355e21620c09 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.Sequential"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.models.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.sequential.Sequential\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
-  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,14 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "regularizers"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -91,10 +78,6 @@ tf_class {
     name: "stateful"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -141,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -153,11 +136,11 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -173,11 +156,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'32\', \'1\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -185,7 +168,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +228,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "predict_classes"
@@ -253,7 +236,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -273,7 +256,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
@@ -285,7 +268,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -297,6 +280,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'class_weight\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
index 791cfda23345fea7df1cfb107ae5dec06354bd48..a0e14356fa5e91bc81bd89f6eb8c07087956c392 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
@@ -1,3 +1,7 @@
 path: "tensorflow.keras.datasets.fashion_mnist"
 tf_module {
+  member_method {
+    name: "load_data"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
index f4ab075959906cdf350ec5d49dc86f928b7eb7ae..96272d1b7d61430188bbbf2680bd2beb9f1e9675 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Activation\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
index eb558cddafc3972127786353072767f0d53bf174..8fd55c8686de77ec764e9d564c78d0df4f545915 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.ActivityRegularization\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index 770a107b664d7ab0a8aedf292a34d4258a201859..47d1532c3c8cf248f6e9a9a35e10b9559286263f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Add"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 0ce42b706ec20a8ea1cc83ec95cb64d9be2e5710..797d422a90a5ad21d0014a0003b11d281c25e579 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.AlphaDropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index d6c98fa225ce924bc8e20f8531516eaed4d32ffb..269be1455b6bf3bbe325f3928584960578e3793d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 754fd310c6d8ddb994db0590342b29f8cb7abd71..344813621534dbb5de3719088c06313e55519dd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 9b62880c7931d151fb98cc1dc3149dcbd4dd103d..979008d0edb7d0f9d9c1246e1fcc7d2e2871d28c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index b371ad148cee16dd243869d929e0c1c002794682..0ffdffd4cdee14fbfeb68f2575300632ab21d7a6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Average"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 3e2aba55fd63326bb0e232fdce06f32884db7a0a..6b00f110eea2aaa551bffe8ec225042c5469210e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.AvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index fb37308cce0124538648c3837e1e802794d7f1ae..caff5a2f1db61c6958980446d3bd54009776e1a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.AvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 813470ffc7c87727eb0b958e54806f530399806a..4a7239492177aae2ee098fc3033904d3d1a31ba4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.AvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
index e251ac18e511b58a49816126d9941b98e4f91088..9804394fa53d6b3c1ff136a73212863143bbdb39 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -91,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'trainable\', \'virtual_batch_size\', \'adjustment\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 699208a0b9b665b69f02edaa2b2d2aeed6a83b63..5e5b04c7c695c6d88e7f42b77290a582be087763 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Bidirectional"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -17,10 +17,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -69,10 +65,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -111,11 +103,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index ff08def0a08e5201bc01d61be3f2d66d712c384b..b8eb4079b9eea3b054f9c2ad4298f6a1669de79e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Concatenate"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 6db22ca0320519fd9c101456c9c9c0e26a9a11e0..3fdb101425d0f010f77cd70bc3721af269a36b0a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -1,21 +1,49 @@
 path: "tensorflow.keras.layers.ConvLSTM2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvLSTM2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRecurrent2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "data_format"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dilation_rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "graph"
+    name: "filters"
     mtype: "<type \'property\'>"
   }
   member {
@@ -34,6 +62,22 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_size"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -67,7 +111,35 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
-    name: "scope_name"
+    name: "padding"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_activation"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strides"
     mtype: "<type \'property\'>"
   }
   member {
@@ -78,10 +150,18 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "unit_forget_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -104,11 +184,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -142,10 +222,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -186,28 +262,12 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_conv"
-    argspec: "args=[\'self\', \'x\', \'w\', \'b\', \'padding\'], varargs=None, keywords=None, defaults=[\'None\', \'valid\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "recurrent_conv"
-    argspec: "args=[\'self\', \'x\', \'w\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
index 577f206e3510a9995d5d383ac440b4f68ea39fe5..0be42471e35eeba224376b24aa846db69e011274 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 72924c32b43e5edb39938cc0cd909cffefa61be1..39ba31a70942811cbd36caf33c9bda90a5449703 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
index 16be08d9b2bae8fe1faecf34c4d87ac9b9baf142..26d9d8c476f4e429ffe112cd490ad277b478c65b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 11e05f884d781166616a9c9a61dacbc8fdae6ae3..43611017fa37077c7ff05e690dfd50a0e6e5ae1e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
index 72b72d6b3b1e410dda0b0a529449f0135203fc1b..fa4925ab99d719104c5ca1a0003c25d85f78f3df 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
index ee93247f63ed700dc6058041bd0ea4ff5c879078..c5c5d5e7c083dca63972cb7033e675aa483d8039 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.Convolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index e5023287e5f38553f3553a37b5a908790072b5c7..36dc2d2e9a70fe7a1d32352a14dabbade2a2efc2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.Convolution2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
index ba38cb7121c9d312e7ba9d7147bdc67673d1ad2e..23ec74370bab010d2b5ba257502a32c8ca7e4a57 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.Convolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 58724a1e1661609ef3c000c7ca1dfe9b3235acff..0e4089c5785ccdcafbb8b3dc1ca75ffbe49d9434 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
index 98d52c430c659d0fc3e9299f7bede9190dad2fcf..23ddbe1a925e33432727d13dc875972136083056 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.Convolution3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
index 33b6ebe1af731f66f88a9493502f69049ab34b42..e04ab6bea85baa1252c9c43f891f9ed5a9dedb87 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
index 4b241ebb0f68c270a9448b02138d44f82211f418..655314afffd1e4fb0987c913eb69edc4254c77ef 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
index 1856a9ee21347ed6ca3dd592517eb644e205a5b7..d5215f1330a2fd0db4696a7b743d931e164a5a9f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ce4db85f8e42f7c85116c8b798a9cfc17e47242
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.layers.CuDNNGRU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent.CuDNNGRU\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "cell"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\'], varargs=None, keywords=kwargs, defaults=[\'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..98221c11650eaf0527b4d32a104e5b29fd34e820
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.layers.CuDNNLSTM"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent.CuDNNLSTM\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "cell"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\'], varargs=None, keywords=kwargs, defaults=[\'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
index a8c37af31f649d28ca2ab7614178f2dee58c13fc..310a3c3b918684803c86d8e2ea1731604a041cdb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2d67b5f720209e3648e69659d44c3d8c4e639231
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -0,0 +1,177 @@
+path: "tensorflow.keras.layers.DepthwiseConv2D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'kernel_size\', \'strides\', \'padding\', \'depth_multiplier\', \'data_format\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'1\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 07d3f023e54105c606b198c05750ffa78ee5d0c8..0e493a7f2bffc772e9cc9cd5afdc4c092fb92118 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Dot"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
index e2e21b5f123f63fa38cb0e344be9a12fc091f20b..14726b4b6cecc39da98af4220211a1c0351b2ff1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index 92b9760d53e35d3e5066a730bb5cbda45492cc64..32a50455ed8a2a146a81fde0d884ebc867b8d0ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ELU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index 83c528b40117222ac2b3e85ad338459948d0aa8c..2f615d81124a7ef5e1bd7181a10abfb1b7a8df24 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.embeddings.Embedding\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
index 73609752886c8c57a78f6bc02cc46d2c7ff6e996..82dc878a8c7f7f011df4dd3fa0445217fa250a98 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index b329f1c46bb07ab7684dec6aaf45a20b98c27ed9..d79d02b95433a7399f27ff1354cba315f8a2c3ae 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -90,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index c741d4d6e6cf8da9712e68f86abe64e2828823da..1d38ae64bb86d11ecd352371608a11f6736bb0ac 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.GRU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -122,7 +118,7 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
-    name: "scope_name"
+    name: "reset_after"
     mtype: "<type \'property\'>"
   }
   member {
@@ -159,7 +155,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -171,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 57596badf1881950270fa6d3c074afb65daaa8eb..135de9cd95141a93d1a45b80ada66e339c484c89 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianDropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 3829353cc3c195a750ad862707c5c8563e203fba..5db6e433ee02b8050822f76bb762329055c11aa4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianNoise\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index e53e78a977b32eaf2e31867044aedd39ab2dd34f..bf0dba0a925b4fb3a88173f4a6dfcc565c6edd91 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 48fcd1044e06b2fe61aadb6c3675ce82197ff003..6da98036094228603f12f65f66d958e2e4b9daeb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 66c06ed47289eb2d83d97778a7b13dab821722d2..345593dec812a240251c0c07da759e131fefaad9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 4f2420f74ab3069952e4a44bf61e5e12b3e80ea3..5d3be9085e51520c48a8d33b1dde6480a0039bad 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 7912a6d933b851521358e0246d04688da410b909..0b79a87e0507ecc795f14a63684dd5b5d7dafc1e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index d5b2d2c274ad97071497045271c0a595f8e0e062..68cdbac652f74b72d1cb769fbefbee750025767d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index d88ff17eb6df7bbba7d3af4344fc8ddc367ae44c..d5872b444fa9ae617e6ad55bc39f43ec4be7d92f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index c8cc5a0ddfdd54cbb47de922591a9842abf63396..4b0cf9a5d38f868d9fdd16a042daea6d0f56fdf8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 7956c5a340d963cfd5976e8af56da222848a164a..4c1adb2131f204121cf74c9a77d346902632fef2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 0a7e16413dfbd80d448eb1bad5771915475d96b2..815f1cf580562e62db99862e51ba02e2b2051b57 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 6c8a58a996f5313ea48e395e7e443a7c21f198ee..e027dd6cc282b75a6a6c2a78878fa57d6706b547 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 7678ce8aab63fcfa76c0ac61346a723c1dfe1ee7..c647b24a23258b96a3c1780fd2f3e499658cfe7d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index d46fd41a3f33002a9bbe755851278c9729ccd1d1..75d70734b4144627f02a8f619991356e38889389 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.InputLayer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.InputLayer\'>"
-  is_instance: "<class \'tensorflow.python.layers.network.InputLayer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.input_layer.InputLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
index 3aeef347ae1f96a3ef40493cc6b722a887e81786..29edabe0483a21d7db35eec04d6ae7a855a82da1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 3b171b137af699c9608494a17c5651b439fe4545..0ed383a3554f81c3db490cf5d242546a14b64d15 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 29d9cf78ab5ed3bdd1a488359b59cf7171e7e051..6d14c9c8f69286552271ca6dab5271a5af48593f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.LSTM"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -121,10 +117,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -175,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index ca0144929942f7024a4e8bac5552bf0547ceb56d..ddf96aba34bf574f4b9046ed932d8f136864f157 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Lambda\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
index c52ad727545c0bf4f199714d71180eac3f1bf62a..aca282d62427f5c8186fa3ac86daebd6fba09ce5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -11,10 +11,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -63,10 +59,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -89,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -101,11 +93,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 8134fb738683b79764662d9ea7f721fe04751162..b9c53b43c87bd8c1047663001159d7286360a008 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.LeakyReLU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index c5d452300947d7f74e7458e2a04bfdfabb1c1da2..2ee566d03b4631ebcff6bcb0b93ec274849c5b67 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index bcbed9241b525a953c8b499197facaefebe8cc44..db0d0e816a6d863f13c7eb085edf269d71e2f252 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
index 244e79b4ffe60ddd6aa56d2780d80dfd66c494a9..82008b89d038e26155b8ae952c2943557ed8c35e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Masking\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 56cbf5df785ef0e2614ea7e9e6cfe1335e148eec..31a34a17d04129c8dfe8ec6f98b9fdbc110e13f4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.MaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 33c2d30e86f9cdc3fb9f4f498bfc2c94497fe2dd..70d24ac75c4f850374fa8cffb881652f97e97d22 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.MaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 94f91059b7a1e291c38fe0045accc6c03f226603..55b16564b30919fb48d97862e2e8cfc0fdda8de5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.MaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 247230a6d68b8ea93a30a2f5846d8baaa78cb13e..a230b74c383eff93443d07e48f6818352302d9e7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 8d61b67e7ce9564d31b0bd904a58540d19c89172..d98f7c39f546048b1483617b24f004d5c9759d14 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index ad2e30802006e934730e5c75247e958329f7121c..b2e96a4203758270afb8c225a05a481dbc329a84 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index ff0db15f190675d533c50c277eb1cb60e0b95e55..0c45bbdf171ad9831e51d2b1ba952fab9eb3d0e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Maximum"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 1d3f33f04516345ee32f16befe0d7200d2cdad00..6423d83418aa40b57afb3d5ff22f4ec605183587 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Multiply"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index c86bc49b22a8cc3e004a77f4a21594aacb2c665a..6e17081375b7fbdcb000dfcbc0cd48ab072fe6e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.PReLU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
index 2043e1a1263f0f0745b7c6446cc670fd6b0f0000..d01d371da596256873c3799b99c45db01c674ff9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Permute\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index ad539a7c4c5362500baef0a9c89d054762bbb47d..d3f5508640e268df484af5adaf66621fa3d92d5b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -106,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
index 4b0e98520a0dd86c085fa7345af445e1ae253d3b..44e1007f5420bbb8feda891901c138cb776c071e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.RepeatVector\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
index 34bc71af8a26ff6e4d7c81a3877751df5209906f..8fc3ec33310f531a6ca6948dc20db67543cde69a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Reshape\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index dd67b76523cc50409516e29f963f59d039455bfd..457d27749504167f2865bc272251bd89b5d3297e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index 5d898fb2bd86b39cb8fab755382bb96cce231fa6..54eda8ee2121d4fadd73a33010202850d743cc65 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index bf62c095e7cc3fbeac95919a0f9fdc545efd3d25..711196554698b79c64e1c67bc875ce33c70563c6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConvolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index c758d87993b3acba88a13c7bc9eaeee929a22652..815e34a48de542feb078d0002dbebbbf4d199e63 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConvolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 6e3cde3e3eaba4f9985411d66a220f7cdd4ee7ad..6614760e5e72556ac61c3788065b3faf2d286800 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 6fafc77b947d0df11755e3136ed2e7a14c148081..bfcfd71ecddfb618ceb53969e8782f535786009b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.SimpleRNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -113,10 +109,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -163,11 +155,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
index ee4b2fa39ed34a544ee800e9370e4f34c4a17041..9c4618c4e91ae288284b35c4a0c6bbbfe604d91d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index e4727072e375b9fc4dc99a1536eaaf3df5415369..9a0a19d2d52f34e61c273bd4d4a27c46940dd5bf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -2,9 +2,8 @@ path: "tensorflow.keras.layers.SpatialDropout1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c5ff7043115ccdd3bc4a1147790b20feda410f65..446f7122a6a2dc1d2f7377cef00d7b5b9a89cd3a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -2,9 +2,8 @@ path: "tensorflow.keras.layers.SpatialDropout2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 476a7f362cf88e234e964f6f6645ee4ed0cbaff8..52a0485b5cea2be6f2f4d9b0ee31cb2388adcef7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -2,9 +2,8 @@ path: "tensorflow.keras.layers.SpatialDropout3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 3dde1e576918409b106649789443f910775e2f6c..c82e7a192dfca8ca38832a53f9135125b4c34286 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_size"
     mtype: "<type \'property\'>"
@@ -106,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index ef31c5443efa0c0e5a7a2e0a422d2a9c9c49baaf..9ccf251a18034371607cfdc6091f2282136feec9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ThresholdedReLU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index 1e176d8d4b4eb010049f267be3d0683228a7782b..e080a07799fe1b7ec5f73b4e7bf78053a2c9dd3c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.TimeDistributed"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -107,11 +99,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index a81b83be49e0073f242efc6890e419b4fe172ab2..5fadca0b8386951976e8a6330bacf0dcf169e2d1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 5403279d45ec7b93bae7907b891c659a043e96d0..2d395bf7e87b8835bcf63d792d68dc3ac4083051 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 96c337caf28d43fabd0b90df016f4e8ab0c408db..18d58ec3b23f7d4fd13ff45f4c1d4d95e7722ed5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index ea3bb2f8f567c648cd8b3dfa6f179a108690b0f0..6223cb2f3c1230a60f3cf3dd57a0e803cf4f15d9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -106,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index b81a4b1c50b22f13eacb521cfc8bc288bd40c81f..e71bba6a7f1df713cd13b4a0249d001d56bb31b0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 1a26f2f3c9bbaa2aa567e76e1aafe14805ecff38..aba6d8cb1f43bb070f9b17b5290afc5ce30246b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 310277fe67433fd870ae3d907984f402576925b2..ce545ecc954e84e702fbd24047370a3417dc0fb1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -102,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index 088c8e88e26f59f2753733252882f5e0e8287fb6..709eb5be55ef180ce9836def4bef601ea4315be0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -112,10 +112,22 @@ tf_module {
     name: "Cropping3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CuDNNGRU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CuDNNLSTM"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DepthwiseConv2D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Dot"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index f85b328e34e6645b0fe0ade18df86411ec0f4e1f..dd78384005fce159c5337c06760d1ba37ecbf390 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -1,10 +1,9 @@
 path: "tensorflow.keras.models.Model"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
-  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -140,7 +131,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "compile"
@@ -248,7 +239,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
@@ -260,7 +251,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -272,6 +263,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 2e044d78bb2cd6c0ac817218480565c785d11ddc..9fcb03f47e7701106b275a9e7ee2adf1243bbdb3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.models.Sequential"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.models.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.sequential.Sequential\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
-  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,14 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "regularizers"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -91,10 +78,6 @@ tf_class {
     name: "stateful"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -141,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -153,11 +136,11 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -173,11 +156,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'32\', \'1\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -185,7 +168,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +228,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "predict_classes"
@@ -253,7 +236,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -273,7 +256,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
@@ -285,7 +268,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -297,6 +280,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'class_weight\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
index 04174bff5f04fead68af68afeec80316867009a4..ec0f3d892d9d03a738d34a40afe701e788908a8e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
+    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "next"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
index 41f27d1f740457f4b7c4f74cb089a448a0fed845..f5bc04e44c198e5bc60f8361dd32e4ae00250468 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'featurewise_center\', \'samplewise_center\', \'featurewise_std_normalization\', \'samplewise_std_normalization\', \'zca_whitening\', \'zca_epsilon\', \'rotation_range\', \'width_shift_range\', \'height_shift_range\', \'shear_range\', \'zoom_range\', \'channel_shift_range\', \'fill_mode\', \'cval\', \'horizontal_flip\', \'vertical_flip\', \'rescale\', \'preprocessing_function\', \'data_format\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'1e-06\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'nearest\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'featurewise_center\', \'samplewise_center\', \'featurewise_std_normalization\', \'samplewise_std_normalization\', \'zca_whitening\', \'zca_epsilon\', \'rotation_range\', \'width_shift_range\', \'height_shift_range\', \'brightness_range\', \'shear_range\', \'zoom_range\', \'channel_shift_range\', \'fill_mode\', \'cval\', \'horizontal_flip\', \'vertical_flip\', \'rescale\', \'preprocessing_function\', \'data_format\', \'validation_split\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'1e-06\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'0.0\', \'0.0\', \'0.0\', \'nearest\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "fit"
@@ -12,11 +12,11 @@ tf_class {
   }
   member_method {
     name: "flow"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\'], varargs=None, keywords=None, defaults=[\'None\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'None\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'None\'], "
   }
   member_method {
     name: "flow_from_directory"
-    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
+    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "random_transform"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
index 4ef6e6e99e3b71d4a6e497cc577ef8b42cebab79..42196ddeee7aab144537eef250c07060923fa6a9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'x\', \'y\', \'image_data_generator\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'None\', \'None\', \'\', \'png\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'image_data_generator\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'None\', \'None\', \'\', \'png\', \'None\'], "
   }
   member_method {
     name: "next"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
index d28fef696515e09990d63581de6127fd52c0a4ee..6b850dd6b784412d623f44200b4acc169bf25968 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "load_img"
     argspec: "args=[\'path\', \'grayscale\', \'target_size\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'nearest\'], "
   }
+  member_method {
+    name: "random_brightness"
+    argspec: "args=[\'x\', \'brightness_range\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "random_channel_shift"
     argspec: "args=[\'x\', \'intensity\', \'channel_axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9c3215b555c19bc5cf4b32b0d227a9e1b63ce1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.keras.preprocessing.sequence.TimeseriesGenerator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.sequence.TimeseriesGenerator\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'data\', \'targets\', \'length\', \'sampling_rate\', \'stride\', \'start_index\', \'end_index\', \'shuffle\', \'reverse\', \'batch_size\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'None\', \'False\', \'False\', \'128\'], "
+  }
+  member_method {
+    name: "on_epoch_end"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
index 1b01935cc53b450c3e7009f945f86c8e1c10bf8e..cf59f8a27269c1161919f7ca2a44c5717a836dd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.preprocessing.sequence"
 tf_module {
+  member {
+    name: "TimeseriesGenerator"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "make_sampling_table"
     argspec: "args=[\'size\', \'sampling_factor\'], varargs=None, keywords=None, defaults=[\'1e-05\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
index d106429df0273929472aa58909f554bcffde9bca..50b54fc7e179bdfb8641d8de12934caa3fc44300 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "Tokenizer"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "hashing_trick"
+    argspec: "args=[\'text\', \'n\', \'hash_function\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
+  }
   member_method {
     name: "one_hot"
     argspec: "args=[\'text\', \'n\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
index de81206bc8b25046cd48c79ff8f154041c0e0cb0..38fd78a5a828c7d0da98c97fdc01f504397c6fe6 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
index 72d5496464210efd9e423996dfb274dd9564f761..86a524cc91e10616cd049cec93843e419ec670c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
index 595e77ff9f8b64b6606fb075f3edf2281b4c3c1f..8a811fe4561ac3790e43a5553ca04fca002e420d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
index 0c4aa2ff2612269727026141574726ad6df5cdbd..3923e706be7a352d770bf309aecd1fadb2a05e81 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,7 +1,10 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -23,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -93,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
index 5f576d0189309442dc4cea3d3617ab3144420165..7a0a8a2a51295d9009f44e2ea126e8a4694147af 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
index 675a7c76e569d3163ecd2c547841b4c36078b21d..7ed3a652519a6429ff429925c29f3c296a6d2958 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -1,9 +1,12 @@
 path: "tensorflow.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
index eaabbf6aab172aea5c51f8071076890bb6b5bcf7..23831aa74f1c3dea99e6da407e5a63693f94e37a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
index 838e070d79d2d7cfbd631f1a5e9960412cfdae5a..9d41a6b09900d984706accd70a353cc26585d9b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -1,9 +1,12 @@
 path: "tensorflow.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
index 4bd8cfc1a48cd839e2ffa54d0d0ca863060406d8..865fe08e63c81222395d125938c8830c02030733 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
index 57eccb03ffeb90652b019b5ce8a519797e4a3a3d..ee164aae204d3f6c09af79f7fbac825ce470098d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
@@ -1,7 +1,10 @@
 path: "tensorflow.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -23,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -93,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
index a1ec00eeeaa98a6199e29b187b0760ddc92db09d..8167dc79cdf9f83b9b97557638bf0702a1be86da 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
@@ -1,7 +1,10 @@
 path: "tensorflow.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -23,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -93,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
index a06943d51a52f1951056136445b0d5786d801b5b..efa4419692993aa9975c1af2e647288ae9f38eba 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
@@ -1,7 +1,10 @@
 path: "tensorflow.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -23,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -93,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
index 7c1d05cd2bfc8be978e82566e7a3086040978b4a..2ff89f0a6faef905bcafdcb36121f506e9a9977a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
index 24fda0c87ed0aeabd0fd4a16bb2efab444f8cd8a..b3a6dfdffa28d3628e09e6aab823534ba84edf16 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
@@ -1,6 +1,8 @@
 path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -22,6 +24,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -50,6 +56,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -80,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -92,7 +102,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -106,6 +120,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -114,10 +132,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -130,6 +160,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -138,4 +172,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
index 4c3d00e0e1ddfe95c56f9ebc7c5d609c79dd44d4..cef396489dde24698cd9a63b6247292958cfec4e 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
index f7e2017b0c9438130f1cfb2431eb73ca4d3103c5..565f0c7a79661f77d5987d671266ff69268b03b2 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
index 84780926a38ff811a5ab35fadfac690a6dbbbbe2..595ce2eeadfc95fe44895ffd976024aee80ee948 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -1,8 +1,11 @@
 path: "tensorflow.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
index 05799ecfc9fdb9ff44620a67dcdbdc4426fddced..ccca96f72248e390ca65db061d836ee58c8e3205 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -1,9 +1,12 @@
 path: "tensorflow.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
index c2aeb35c4648bcce22ca73c838a85803a6b9cedf..1c99c9618254cadb1b4e95c7223ca9361e4fa861 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -1,9 +1,12 @@
 path: "tensorflow.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
index 59134f84891ad5518dcb5331ce04475482c8b59e..df74c32e1f10cc7540ef105adef6be681e93d089 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
@@ -76,10 +76,6 @@ tf_module {
     name: "SeparableConv2D"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "Input"
-    argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'float32\'>\", \'False\', \'None\'], "
-  }
   member_method {
     name: "average_pooling1d"
     argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b33f3da97ec2ecb3f94e8bc309be2519fc79c62
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorCirculant.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de917706d55214cc59f3205f0778d600a356a5b1
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -0,0 +1,155 @@
+path: "tensorflow.linalg.LinearOperatorCirculant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant.LinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_depth"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "spectrum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_hermitian_spectrum"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "block_shape_tensor"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convolution_kernel"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..591bc9631a1d8ecbbd6e133b99c67e432399d73f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorCirculant2D.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -0,0 +1,155 @@
+path: "tensorflow.linalg.LinearOperatorCirculant2D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant.LinearOperatorCirculant2D\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_depth"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "spectrum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant2D\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_hermitian_spectrum"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "block_shape_tensor"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convolution_kernel"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d643139a53fc501fe2997a2b9f2d11c57b96f2e4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorCirculant3D.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e085a8e289e21173789041efb9254e992bd723b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -0,0 +1,155 @@
+path: "tensorflow.linalg.LinearOperatorCirculant3D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant.LinearOperatorCirculant3D\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_depth"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "spectrum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant3D\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_hermitian_spectrum"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "block_shape_tensor"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convolution_kernel"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 1d9c0c0f6d28dfb1a218586075bcb6820b1c62b1..7a5c533872949acaa5c840c236eea2374ac88805 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -4,6 +4,18 @@ tf_module {
     name: "LinearOperator"
     mtype: "<class \'abc.ABCMeta\'>"
   }
+  member {
+    name: "LinearOperatorCirculant"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "LinearOperatorCirculant2D"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "LinearOperatorCirculant3D"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
   member {
     name: "LinearOperatorComposition"
     mtype: "<class \'abc.ABCMeta\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..897718c05e0d10a6f961f33b8c65f5dab1d03f5b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.math"
+tf_module {
+  member_method {
+    name: "polyval"
+    argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 44536787f09fc98bba8a4eb0bc562427cfe48b8b..e1abd43ab54f9d0c131fd94cfeefcdd7ccdb00f1 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -4,6 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -91,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 768565d3cacbd1313ee5a64c9b15f9ab70683772..93e7e40199884fa77ae7da6d15113b8b4ca125ab 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -4,6 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -91,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 0d253e5dd233d6d2b6ad0070a463c283a8769dab..3c3e38229738fec3b25f437a73f3a9d216d970af 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -3,6 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -102,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -116,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -124,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -148,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 97edf245f6fbed393a6fb8dbf1e83649e9ac4b4e..db16660f1145b55c824c698653094977dd6c718b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -3,6 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -106,7 +116,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -120,6 +134,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -128,10 +146,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -144,6 +174,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -152,6 +186,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 6ecc134d4df866ab5d59e238a8157064421579bd..465fc1cd9c886acaeb8ac49ff54417b9fd101b04 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -4,6 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -91,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 4b3ca1578ba52f30e3405ff198fb716496a462c6..38a387d55a4d34524cdc8af1f7ce1617eb120373 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -4,6 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -91,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 9a6c73a079884b8ab92be1c9e89b2a9f34aad851..b9e3d934759accd885036fa4c5a7013ef64736f3 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -3,6 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -102,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -116,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -124,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -148,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 27488f8e73f20456fae911511ecd2e41a60da351..75b5898c591cbe2b761c0f709159c5489cb8f76a 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -2,6 +2,8 @@ path: "tensorflow.nn.rnn_cell.RNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -23,6 +25,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +57,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -89,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -101,7 +111,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -115,6 +129,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -123,10 +141,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,6 +169,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -147,6 +181,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 3310836ed26387718115c2454300b9edfe930451..fee0dc63b997f328a4e3d44040c4056de4128eb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -3,6 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -102,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -116,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -124,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -148,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 066c4513ff5185b50bdf193f579e71e505dbd3b6..0b12bc060efc2ecfa4819e77cbb0d26c5a6dc419 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -84,6 +84,10 @@ tf_module {
     name: "GRAPH_DEF_VERSION_MIN_PRODUCER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "GradientTape"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Graph"
     mtype: "<type \'type\'>"
@@ -400,6 +404,10 @@ tf_module {
     name: "manip"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "math"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
@@ -596,6 +604,10 @@ tf_module {
     name: "add_to_collection"
     argspec: "args=[\'name\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "add_to_collections"
+    argspec: "args=[\'names\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "all_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -892,13 +904,21 @@ tf_module {
     name: "cumsum"
     argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "custom_gradient"
+    argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_compressed"
+    argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\'], "
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
   }
   member_method {
     name: "decode_json_example"
@@ -964,6 +984,10 @@ tf_module {
     name: "einsum"
     argspec: "args=[\'equation\'], varargs=inputs, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "enable_eager_execution"
+    argspec: "args=[\'config\', \'device_policy\', \'execution_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -980,6 +1004,10 @@ tf_module {
     name: "erfc"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "executing_eagerly"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "exp"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1094,7 +1122,7 @@ tf_module {
   }
   member_method {
     name: "get_local_variable"
-    argspec: "args=[], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_seed"
@@ -1600,6 +1628,10 @@ tf_module {
     name: "reduce_sum"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "regex_replace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
   member_method {
     name: "register_tensor_conversion_function"
     argspec: "args=[\'base_type\', \'conversion_func\', \'priority\'], varargs=None, keywords=None, defaults=[\'100\'], "
@@ -1664,6 +1696,14 @@ tf_module {
     name: "scatter_div"
     argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "scatter_max"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_min"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "scatter_mul"
     argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -1940,6 +1980,10 @@ tf_module {
     name: "string_split"
     argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
   }
+  member_method {
+    name: "string_strip"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "string_to_hash_bucket"
     argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1988,10 +2032,22 @@ tf_module {
     name: "tile"
     argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "timestamp"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "to_bfloat16"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToBFloat16\'], "
   }
+  member_method {
+    name: "to_complex128"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToComplex128\'], "
+  }
+  member_method {
+    name: "to_complex64"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToComplex64\'], "
+  }
   member_method {
     name: "to_double"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToDouble\'], "
@@ -2056,6 +2112,10 @@ tf_module {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_mean"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_min"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
index dcf747971b7b8bf243502b2388da635705b8ee3e..6b65b0ace3cf7740ab03390841c941592000d127 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_event"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
index c02e54adfbd9f33e661453767b517a5f0de90d57..16bfbf20d5227d6308248bebcb62f32a2df8ef41 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.AdadeltaOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index 2b619908fc6aea3f4b8e6a57d0dcf85a9854d466..61cde9181c2367153b7b289b41bd932482bb92fd 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.AdagradDAOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
index 2005cf4677c06cf1f8b4207a444690fdd0c2306e..0a998c1afe4fff6e215360bc1cf8fc135754223c 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.AdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
index 0a2bae1d9021b20707e03ae5786e71f388266c14..cc5954152577796ee7a5a6e1cedc873647d64f7c 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.AdamOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
index c3037baa8c951ecd9b60267ee7cc8674ead88dbe..327799729c9e7d9ce2931f8951db995195ff35e9 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\', \'steps_per_run\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "after_create_session"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17f393d27c4b917a78902a4d6f13630311cb44ef
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.train.Checkpoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpointable_utils.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "save_counter"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "restore"
+    argspec: "args=[\'self\', \'save_path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
index 847f9ad75998f1bdda8858650091c70fd0b4015b..1add3a902122341a706c38b19ea6ff5882c26445 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.FtrlOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
index 13a58e0608ed269415ba78d84a03f1bae128e80c..ef5bbd6ace29abb5c73516176fcc7594a58d493a 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.GradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
index bfbc2357a346c7bfef0242a735ab14c5f4005b22..3d6e87f5eb44de9d6ce1bdd25a54b8df9020cc03 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.MomentumOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
index 437efa0a2bd04c308db6186e714a5d8785541fa5..e73861ff7cb2d90d8efac72cdd7de3b27395f29e 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.train.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index 72f224605f67e72dd78699b5f1a703cc3edd566b..301b35b199c87890a0aef4139eb06253592ce0c4 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.ProximalAdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index 316275b1fb1abd384e193994e35115a1c463f07d..8815befa936a85522011111a4a6270d22cbc25ae 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.ProximalGradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index af50a1986100d830f0809a3f4a0f01faa8821b3b..e9819683ba5ec1bcacb3cdbcb2d787e866a77b6f 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.RMSPropOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
index 62b956c5ef7dc54e92431f25ec948e341c0e1f24..38cc98b48e78aa93f7614a9baff236f7b119f99d 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
@@ -38,6 +38,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "default_local_init_op"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "finalize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
index 6edc516c9392fa14f23ffc2a6481ec21216f06cf..3db96aff876b88b80b647570cf68b1ebc0b2da3b 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.train.SyncReplicasOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ce7cb111163e103a1cebe30d5c6f3eeb4234693
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt
@@ -0,0 +1,39 @@
+path: "tensorflow.train.VocabInfo"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
+  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "backup_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "new_vocab"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "new_vocab_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_oov_buckets"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "old_vocab"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "old_vocab_size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index e49c719a334455d1f8f39fa67332be8bb81f2bc2..9fb18e77afd7c9c989ad5e967be291406e7239aa 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "BytesList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "Checkpoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CheckpointSaverHook"
     mtype: "<type \'type\'>"
@@ -224,6 +228,10 @@ tf_module {
     name: "SyncReplicasOptimizer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "VocabInfo"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "WorkerSessionCreator"
     mtype: "<type \'type\'>"
@@ -234,7 +242,7 @@ tf_module {
   }
   member_method {
     name: "MonitoredTrainingSession"
-    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\'], "
+    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\', \'save_checkpoint_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'<object object instance>\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\', \'<object object instance>\'], "
   }
   member_method {
     name: "NewCheckpointReader"
@@ -402,7 +410,7 @@ tf_module {
   }
   member_method {
     name: "sdca_optimizer"
-    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptative\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptative\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "sdca_shrink_l1"
@@ -436,6 +444,10 @@ tf_module {
     name: "update_checkpoint_state"
     argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "warm_start"
+    argspec: "args=[\'ckpt_to_initialize_from\', \'vars_to_warm_start\', \'var_name_to_vocab_info\', \'var_name_to_prev_var_name\'], varargs=None, keywords=None, defaults=[\'.*\', \'None\', \'None\'], "
+  }
   member_method {
     name: "write_graph"
     argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], "
diff --git a/tensorflow/tools/api/lib/BUILD b/tensorflow/tools/api/lib/BUILD
index 2d3b838957d60ffb5e827c6b43100d217cc5739e..3f4fb9104271539c431f02e21b7e30780a721fd7 100644
--- a/tensorflow/tools/api/lib/BUILD
+++ b/tensorflow/tools/api/lib/BUILD
@@ -26,15 +26,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 608a34ab7b32bdc26cebbe43b383155406fb51b2..724b12cd4799eb76fe602c737c850e96e92faa58 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -41,15 +41,3 @@ tf_cc_binary(
         "//tensorflow/core:op_gen_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index c1e09cc531ed8e8995e3e73b87e96b72fba6c038..1ad6b6d1c0ae5ca1ac1329fd49e972840020e4c3 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -46,6 +46,7 @@ from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 
+
 # FLAGS defined at the bottom:
 FLAGS = None
 # DEFINE_boolean, update_goldens, default False:
@@ -54,7 +55,7 @@ _UPDATE_GOLDENS_HELP = """
      have to be authorized by TensorFlow leads.
 """
 
-# DEFINE_boolean, verbose_diffs, default False:
+# DEFINE_boolean, verbose_diffs, default True:
 _VERBOSE_DIFFS_HELP = """
      If set to true, print line by line diffs on all libraries. If set to
      false, only print which libraries have differences.
@@ -109,7 +110,8 @@ class ApiCompatibilityTest(test.TestCase):
                              expected_dict,
                              actual_dict,
                              verbose=False,
-                             update_goldens=False):
+                             update_goldens=False,
+                             additional_missing_object_message=''):
     """Diff given dicts of protobufs and report differences a readable way.
 
     Args:
@@ -120,6 +122,8 @@ class ApiCompatibilityTest(test.TestCase):
       verbose: Whether to log the full diffs, or simply report which files were
           different.
       update_goldens: Whether to update goldens when there are diffs found.
+      additional_missing_object_message: Message to print when a symbol is
+          missing.
     """
     diffs = []
     verbose_diffs = []
@@ -138,7 +142,8 @@ class ApiCompatibilityTest(test.TestCase):
       verbose_diff_message = ''
       # First check if the key is not found in one or the other.
       if key in only_in_expected:
-        diff_message = 'Object %s expected but not found (removed).' % key
+        diff_message = 'Object %s expected but not found (removed). %s' % (
+            key, additional_missing_object_message)
         verbose_diff_message = diff_message
       elif key in only_in_actual:
         diff_message = 'New object %s found (added).' % key
@@ -165,7 +170,7 @@ class ApiCompatibilityTest(test.TestCase):
       logging.error('%d differences found between API and golden.', diff_count)
       messages = verbose_diffs if verbose else diffs
       for i in range(diff_count):
-        logging.error('Issue %d\t: %s', i + 1, messages[i])
+        print('Issue %d\t: %s' % (i + 1, messages[i]), file=sys.stderr)
 
       if update_goldens:
         # Write files if requested.
@@ -235,7 +240,7 @@ if __name__ == '__main__':
   parser.add_argument(
       '--update_goldens', type=bool, default=False, help=_UPDATE_GOLDENS_HELP)
   parser.add_argument(
-      '--verbose_diffs', type=bool, default=False, help=_VERBOSE_DIFFS_HELP)
+      '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
   FLAGS, unparsed = parser.parse_known_args()
 
   # Now update argv, so that unittest library does not get confused.
diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD
index 6ed2594e6abe169577066678e1bf4b9e2df4c4d3..566a172ea77fbc033496ef00a3415cff9ad8149a 100644
--- a/tensorflow/tools/benchmark/BUILD
+++ b/tensorflow/tools/benchmark/BUILD
@@ -90,12 +90,3 @@ tf_cc_binary(
     visibility = ["//visibility:public"],
     deps = [":benchmark_model_lib"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index ecab6f8769ae2d0126f63580030ed6ff756015d0..15523028c726fefa13641a1369cf4274bcfb9973 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -48,33 +48,14 @@ limitations under the License.
 namespace tensorflow {
 namespace benchmark_model {
 
-Status InitializeSession(int num_threads, const string& graph,
-                         std::unique_ptr<Session>* session,
-                         std::unique_ptr<GraphDef>* graph_def) {
-  LOG(INFO) << "Loading TensorFlow.";
+namespace {
 
-  tensorflow::SessionOptions options;
-  tensorflow::ConfigProto& config = options.config;
-  if (num_threads > 0) {
-    config.set_intra_op_parallelism_threads(num_threads);
+Status InitializeVariables(Session* session,
+                           const std::vector<string>& init_ops) {
+  LOG(INFO) << "Initializing graph variables";
+  for (const string& init_op : init_ops) {
+    TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
   }
-  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
-
-  session->reset(tensorflow::NewSession(options));
-  graph_def->reset(new GraphDef());
-  tensorflow::GraphDef tensorflow_graph;
-  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
-  if (!s.ok()) {
-    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
-    return s;
-  }
-
-  s = (*session)->Create(*(graph_def->get()));
-  if (!s.ok()) {
-    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
-    return s;
-  }
-
   return Status::OK();
 }
 
@@ -247,8 +228,56 @@ void RecordBenchmarkEntry(const string& output_prefix,
   TF_QCHECK_OK(node_reporter.Close());
 }
 
+void SleepSeconds(double sleep_seconds) {
+  if (sleep_seconds <= 0.0) {
+    return;
+  }
+#ifdef PLATFORM_WINDOWS
+  Sleep(sleep_seconds * 1000);
+#else
+  // Convert the inference_delay string into a timespec.
+  timespec req;
+  req.tv_sec = static_cast<time_t>(sleep_seconds);
+  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
+  nanosleep(&req, nullptr);
+#endif
+}
+
+}  // namespace
+
+Status InitializeSession(int num_threads, const string& graph,
+                         std::unique_ptr<Session>* session,
+                         std::unique_ptr<GraphDef>* graph_def) {
+  LOG(INFO) << "Loading TensorFlow.";
+
+  tensorflow::SessionOptions options;
+  tensorflow::ConfigProto& config = options.config;
+  if (num_threads > 0) {
+    config.set_intra_op_parallelism_threads(num_threads);
+  }
+  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
+
+  session->reset(tensorflow::NewSession(options));
+  graph_def->reset(new GraphDef());
+  tensorflow::GraphDef tensorflow_graph;
+  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
+    return s;
+  }
+
+  s = (*session)->Create(*(graph_def->get()));
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
+    return s;
+  }
+
+  return Status::OK();
+}
+
 Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
-                    const std::vector<string>& outputs, Session* session,
+                    const std::vector<string>& outputs,
+                    const std::vector<string>& targets, Session* session,
                     StatSummarizer* stats, int64* inference_time_us) {
   std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
   CreateTensorsFromInputInfo(inputs, &input_tensors);
@@ -264,8 +293,8 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
 
   RunMetadata run_metadata;
   const int64 start_time = Env::Default()->NowMicros();
-  s = session->Run(run_options, input_tensors, outputs, {}, &output_tensors,
-                   &run_metadata);
+  s = session->Run(run_options, input_tensors, outputs, targets,
+                   &output_tensors, &run_metadata);
   const int64 end_time = Env::Default()->NowMicros();
   *inference_time_us = end_time - start_time;
 
@@ -283,24 +312,10 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
   return s;
 }
 
-void SleepSeconds(double sleep_seconds) {
-  if (sleep_seconds <= 0.0) {
-    return;
-  }
-#ifdef PLATFORM_WINDOWS
-  Sleep(sleep_seconds * 1000);
-#else
-  // Convert the inference_delay string into a timespec.
-  timespec req;
-  req.tv_sec = static_cast<time_t>(sleep_seconds);
-  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
-  nanosleep(&req, nullptr);
-#endif
-}
-
 Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                         const std::vector<InputLayerInfo>& inputs,
-                        const std::vector<string>& outputs, Session* session,
+                        const std::vector<string>& outputs,
+                        const std::vector<string>& targets, Session* session,
                         StatSummarizer* stats, int64* total_time_us,
                         int64* actual_num_runs) {
   *total_time_us = 0;
@@ -315,7 +330,8 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
   const bool until_max_time = num_runs <= 0;
   for (int i = 0; until_max_time || i < num_runs; ++i) {
     int64 time;
-    Status run_status = RunBenchmark(inputs, outputs, session, stats, &time);
+    Status run_status =
+        RunBenchmark(inputs, outputs, targets, session, stats, &time);
     stat.UpdateStat(time);
     (*total_time_us) += time;
     ++(*actual_num_runs);
@@ -345,11 +361,13 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
 
 int Main(int argc, char** argv) {
   string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
+  string init_ops_string = "";
   string input_layer_string = "input:0";
   string input_layer_shape_string = "1,224,224,3";
   string input_layer_type_string = "float";
   string input_layer_values_string = "";
   string output_layer_string = "output:0";
+  string target_layer_string = "";
   int max_num_runs = 1000;
   string max_time = "10.0";
   string inference_delay = "-1.0";
@@ -371,12 +389,14 @@ int Main(int argc, char** argv) {
 
   std::vector<Flag> flag_list = {
       Flag("graph", &graph, "graph file name"),
+      Flag("init_ops", &init_ops_string, "init ops"),
       Flag("input_layer", &input_layer_string, "input layer names"),
       Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
       Flag("input_layer_type", &input_layer_type_string, "input layer type"),
       Flag("input_layer_values", &input_layer_values_string,
            "values to initialize the inputs with"),
       Flag("output_layer", &output_layer_string, "output layer name"),
+      Flag("target_layer", &target_layer_string, "target layer name"),
       Flag("max_num_runs", &max_num_runs, "number of runs max"),
       Flag("max_time", &max_time, "length to run max"),
       Flag("inference_delay", &inference_delay,
@@ -410,6 +430,7 @@ int Main(int argc, char** argv) {
     return -1;
   }
 
+  std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
   std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
   std::vector<string> input_layer_shapes =
       str_util::Split(input_layer_shape_string, ':');
@@ -418,6 +439,7 @@ int Main(int argc, char** argv) {
   std::vector<string> input_layer_values =
       str_util::Split(input_layer_values_string, ':');
   std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
+  std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
   if ((input_layers.size() != input_layer_shapes.size()) ||
       (input_layers.size() != input_layer_types.size())) {
     LOG(ERROR) << "There must be the same number of items in --input_layer,"
@@ -441,10 +463,12 @@ int Main(int argc, char** argv) {
   }
 
   LOG(INFO) << "Graph: [" << graph << "]";
+  LOG(INFO) << "Init ops:" << init_ops_string;
   LOG(INFO) << "Input layers: [" << input_layer_string << "]";
   LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
   LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
   LOG(INFO) << "Output layers: [" << output_layer_string << "]";
+  LOG(INFO) << "Target layers: [" << target_layer_string << "]";
   LOG(INFO) << "Num runs: [" << max_num_runs << "]";
   LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
   LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
@@ -470,6 +494,16 @@ int Main(int argc, char** argv) {
     return -1;
   }
 
+  if (!init_ops.empty()) {
+    Status initialize_variables_status =
+        InitializeVariables(session.get(), init_ops);
+    if (!initialize_variables_status.ok()) {
+      LOG(ERROR) << "Graph variables initialization failed with "
+                 << initialize_variables_status;
+      return -1;
+    }
+  }
+
   StatSummarizerOptions stats_options;
   stats_options.show_run_order = show_run_order;
   stats_options.run_order_limit = run_order_limit;
@@ -520,9 +554,10 @@ int Main(int argc, char** argv) {
   int64 warmup_time_us = 0;
   int64 num_warmup_runs = 0;
   if (warmup_runs > 0) {
-    Status warmup_time_status = TimeMultipleRuns(
-        inter_inference_sleep_seconds, warmup_runs, -1.0, inputs, output_layers,
-        session.get(), nullptr, &warmup_time_us, &num_warmup_runs);
+    Status warmup_time_status =
+        TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
+                         inputs, output_layers, target_layers, session.get(),
+                         nullptr, &warmup_time_us, &num_warmup_runs);
     if (!warmup_time_status.ok()) {
       LOG(ERROR) << "Timing failed with " << warmup_time_status;
       return -1;
@@ -536,8 +571,8 @@ int Main(int argc, char** argv) {
   int64 no_stat_num_runs = 0;
   Status no_stat_time_status = TimeMultipleRuns(
       inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
-      inputs, output_layers, session.get(), nullptr, &no_stat_time_us,
-      &no_stat_num_runs);
+      inputs, output_layers, target_layers, session.get(), nullptr,
+      &no_stat_time_us, &no_stat_num_runs);
   const double no_stat_wall_time = no_stat_time_us / 1000000.0;
   if (!no_stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << no_stat_time_status;
@@ -551,8 +586,8 @@ int Main(int argc, char** argv) {
   int64 stat_num_runs = 0;
   Status stat_time_status = TimeMultipleRuns(
       inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
-      inputs, output_layers, session.get(), stats.get(), &stat_time_us,
-      &stat_num_runs);
+      inputs, output_layers, target_layers, session.get(), stats.get(),
+      &stat_time_us, &stat_num_runs);
   if (!stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << stat_time_status;
     return -1;
diff --git a/tensorflow/tools/benchmark/benchmark_model.h b/tensorflow/tools/benchmark/benchmark_model.h
index dff62c5b5d518da8f9034295626e46db783f343d..dc5f0080374e70edad52965cc0a95f99751baa48 100644
--- a/tensorflow/tools/benchmark/benchmark_model.h
+++ b/tensorflow/tools/benchmark/benchmark_model.h
@@ -37,13 +37,15 @@ Status InitializeSession(int num_threads, const string& graph,
 
 // Does a single run of the model that's been loaded into the given session.
 Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
-                    const std::vector<string>& outputs, Session* session,
+                    const std::vector<string>& outputs,
+                    const std::vector<string>& targets, Session* session,
                     StatSummarizer* stats, int64* inference_time_us);
 
 // Runs the model multiple time, keeping track of timing information.
 Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                         const std::vector<InputLayerInfo>& inputs,
-                        const std::vector<string>& outputs, Session* session,
+                        const std::vector<string>& outputs,
+                        const std::vector<string>& targets, Session* session,
                         StatSummarizer* stats, int64* total_time_us,
                         int64* actual_num_runs);
 
diff --git a/tensorflow/tools/benchmark/benchmark_model_test.cc b/tensorflow/tools/benchmark/benchmark_model_test.cc
index bb4eb5352039b01a6692621906eff005187cfa36..16ab2ff66e763a0ca5130f075f988bade9c8abd1 100644
--- a/tensorflow/tools/benchmark/benchmark_model_test.cc
+++ b/tensorflow/tools/benchmark/benchmark_model_test.cc
@@ -64,8 +64,8 @@ TEST(BenchmarkModelTest, InitializeAndRun) {
   int64 time;
   int64 num_runs = 0;
   TF_ASSERT_OK(benchmark_model::TimeMultipleRuns(
-      0.0, 10, 0.0, {input}, {output_name}, session.get(), stats.get(), &time,
-      &num_runs));
+      0.0, 10, 0.0, {input}, {output_name}, {}, session.get(), stats.get(),
+      &time, &num_runs));
   ASSERT_EQ(num_runs, 10);
 }
 
diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD
index cdc47076ced24698d0139c1c14d1660018b1a815..730741780550bfe3fbccd7e62f5f7d9788f0a9a9 100644
--- a/tensorflow/tools/build_info/BUILD
+++ b/tensorflow/tools/build_info/BUILD
@@ -9,18 +9,3 @@ exports_files(
         "gen_build_info.py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index ec90c83aacd068e8f9c16e5be8eb6e1cef098ea6..d5dea4f3e41841aed5aeac02fcca850dbfdfaeb3 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -23,11 +23,12 @@ RUN /install/install_deb_packages.sh
 
 RUN apt-get update
 RUN apt-get install -y --no-install-recommends python-pip
+RUN pip install --upgrade wheel
 RUN pip install --upgrade astor
 RUN pip install --upgrade gast
 RUN pip install --upgrade numpy
 RUN pip install --upgrade termcolor
 
 # Install golang
-RUN add-apt-repository -y ppa:ubuntu-lxc/lxd-stable
-RUN apt-get install -y golang
+RUN apt-get install -t xenial-backports -y golang-1.9
+ENV PATH=${PATH}:/usr/lib/go-1.9/bin
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
new file mode 100644
index 0000000000000000000000000000000000000000..3bc52b9ed611a0f0a4a269a2864d5b349ee9232c
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -0,0 +1,14 @@
+FROM launcher.gcr.io/google/rbe-debian8:r327695
+LABEL maintainer="Yu Yi <yiyu@google.com>"
+
+# Copy install scripts
+COPY install/*.sh /install/
+
+# Setup envvars
+ENV CC /usr/local/bin/clang
+ENV CXX /usr/local/bin/clang++
+ENV AR /usr/bin/ar
+
+# Run pip install script for RBE Debian8 container.
+RUN /install/install_pip_packages_remote.sh
+RUN /install/install_pip_packages.sh
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
new file mode 100644
index 0000000000000000000000000000000000000000..24ff4765a619701cd614414d2b06f7fa4ce7d8c0
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
@@ -0,0 +1,26 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+LABEL maintainer="Nick Lopez <ngiraldo@google.com>"
+
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
+# /usr/local/cuda
+RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
+RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_golang.sh
+
+# Install clang from pre-built package
+RUN cd /tmp && \
+    wget https://storage.googleapis.com/clang-builds-stable/clang-ubuntu16_04/clang_r323528.tar.gz && \
+    echo "26752d9f5785df07193fac8316ba5d5ba3bec36d970c29a1577360848818ac74  clang_r323528.tar.gz" | sha256sum -c && \
+    tar -C /usr/local -xf clang_r323528.tar.gz && \
+    rm clang_r323528.tar.gz
+
diff --git a/tensorflow/tools/ci_build/builds/android.sh b/tensorflow/tools/ci_build/builds/android.sh
index 564c5aa1480f5fd824dbc5c8bc85cec90664c512..d81793efe08f151c1b448a9da3cc971ca3137829 100755
--- a/tensorflow/tools/ci_build/builds/android.sh
+++ b/tensorflow/tools/ci_build/builds/android.sh
@@ -29,7 +29,8 @@ echo "========== TensorFlow Demo Build Test =========="
 # Enable sandboxing so that zip archives don't get incorrectly packaged
 # in assets/ dir (see https://github.com/bazelbuild/bazel/issues/2334)
 # TODO(gunan): remove extra flags once sandboxing is enabled for all builds.
-bazel --bazelrc=/dev/null build -c opt --fat_apk_cpu=x86_64 \
+bazel --bazelrc=/dev/null build \
+    --compilation_mode=opt --cxxopt=-std=c++11 --fat_apk_cpu=x86_64 \
     --spawn_strategy=sandboxed --genrule_strategy=sandboxed \
     //tensorflow/examples/android:tensorflow_demo
 
diff --git a/tensorflow/tools/ci_build/builds/android_full.sh b/tensorflow/tools/ci_build/builds/android_full.sh
index 9d449241e8413ddbd81c580cc4def808c0086cb9..41dc66dd5436a81eeeca197f6ef57cb2a1407ca0 100755
--- a/tensorflow/tools/ci_build/builds/android_full.sh
+++ b/tensorflow/tools/ci_build/builds/android_full.sh
@@ -40,7 +40,8 @@ rm -rf ${AAR_LIB_TMP}
 for CPU in ${CPUS//,/ }
 do
     echo "========== Building native libs for Android ${CPU} =========="
-    bazel build -c opt --config=monolithic --cpu=${CPU} \
+    bazel build --config=monolithic --cpu=${CPU} \
+        --compilation_mode=opt --cxxopt=-std=c++11 \
         --crosstool_top=//external:android/crosstool \
         --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
         //tensorflow/core:android_tensorflow_lib \
@@ -62,7 +63,8 @@ done
 # in assets/ dir (see https://github.com/bazelbuild/bazel/issues/2334)
 # TODO(gunan): remove extra flags once sandboxing is enabled for all builds.
 echo "========== Building TensorFlow Android Jar and Demo =========="
-bazel --bazelrc=/dev/null build -c opt --config=monolithic --fat_apk_cpu=${CPUS} \
+bazel --bazelrc=/dev/null build --config=monolithic --fat_apk_cpu=${CPUS} \
+    --compilation_mode=opt --cxxopt=-std=c++11 \
     --spawn_strategy=sandboxed --genrule_strategy=sandboxed \
     //tensorflow/contrib/android:android_tensorflow_inference_java \
     //tensorflow/contrib/android:android_tensorflow_inference_java.aar \
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 82042b93c02275b51530b306d8cf4519482e5410..5fa75e1d61cceeebfa77439bb64f1c644c9dba70 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -123,6 +123,10 @@ done
 
 BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
 
+if [[ -z "$GIT_TAG_OVERRIDE" ]]; then
+  BAZEL_FLAGS+=" --action_env=GIT_TAG_OVERRIDE"
+fi
+
 echo "Using Bazel flags: ${BAZEL_FLAGS}"
 
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
diff --git a/tensorflow/tools/ci_build/builds/test_tutorials.sh b/tensorflow/tools/ci_build/builds/test_tutorials.sh
index 67e5af556405a5c659000a07a79a6bd9a1d1e542..db335f14ca4f88ade7a540ffab7ed9de67f1248e 100755
--- a/tensorflow/tools/ci_build/builds/test_tutorials.sh
+++ b/tensorflow/tools/ci_build/builds/test_tutorials.sh
@@ -277,17 +277,6 @@ test_ptb_word_lm() {
   fi
 }
 
-
-# -----------------------------------------------------------
-# translate_test
-test_translate_test() {
-  LOG_FILE=$1
-
-  run_in_directory "${TEST_DIR}" "${LOG_FILE}" \
-    "${TF_MODELS_DIR}/tutorials/rnn/translate/translate.py" --self_test=True
-}
-
-
 # Run the tutorial tests
 test_runner "tutorial test-on-install" \
     "${TUT_TESTS}" "${TF_BUILD_TUT_TEST_BLACKLIST}" "${LOGS_DIR}"
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index caa3a40817c80b27271f76de0a95a743cb2916f6..c342367bacea9d2ba8152d928b93bf61cf60d0e7 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//')
 echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\
 "via pip installation"
 
-ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
-
-# Format OUTPUT for analysis
-if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
-  if [[ ${IS_MAC} == "1" ]]; then
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+function run_op() {
+  local ORIG_OUTPUT=$1
+  local ADDITIONAL_LOG=$2
+
+  # Format OUTPUT for analysis
+  if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
+    if [[ ${IS_MAC} == "1" ]]; then
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+    else
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    fi
   else
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    local OUTPUT="${ORIG_OUTPUT}"
   fi
-else
-  OUTPUT="${ORIG_OUTPUT}"
-fi
 
-EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
+  local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
 
-if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
-  die "FAILED: Output from user op (${OUTPUT}) does not match expected "\
-"output ${EXPECTED_OUTPUT}"
-else
-  echo "Output from user op (${OUTPUT}) matches expected output"
-fi
+  if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
+    local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected "\
+  "output ${EXPECTED_OUTPUT}"${ADDITIONAL_LOG}
+    die ${ERROR}
+  else
+    echo "Output from user op (${OUTPUT}) matches expected output"
+  fi
+}
+
+run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
+run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode"
 
 popd
 
diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index 5817716c8dec37dfdfd50defb4b20b1deafced70..d4bf546d401d058bd205a70c147615c8efc4f4ba 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -36,8 +36,13 @@ else
   rm /this_is_writable_file_system
 fi
 
+if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
+  ADDUSER_OPTS="--force-badname"
+fi
+
 getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
-getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
+getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
+    --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
     --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
     --disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}"
 usermod -a -G sudo "${CI_BUILD_USER}"
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 9d23b508aa1c1d20d0f4b5979aa7be2c295fe325..797e0a6db52aa6216486bdc9c6a88ff353c57e15 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -237,7 +237,7 @@ function get_cuda_capability_version() {
 CTYPE=${TF_BUILD_CONTAINER_TYPE}
 
 # Determine if the machine is a Mac
-OPT_FLAG=""
+OPT_FLAG="--test_output=errors"
 if [[ "$(uname -s)" == "Darwin" ]]; then
   DO_DOCKER=0
 
diff --git a/tensorflow/tools/ci_build/remote/remote_docker_build.sh b/tensorflow/tools/ci_build/ci_rbe_docker_build.sh
similarity index 58%
rename from tensorflow/tools/ci_build/remote/remote_docker_build.sh
rename to tensorflow/tools/ci_build/ci_rbe_docker_build.sh
index e00a66aabaf1068c772aabce2391616518be44d4..cd811de6bdf9275b799a608381c76713a6c7a65b 100755
--- a/tensorflow/tools/ci_build/remote/remote_docker_build.sh
+++ b/tensorflow/tools/ci_build/ci_rbe_docker_build.sh
@@ -16,25 +16,19 @@
 # Build TensorFlow Docker images for remote build
 #
 # Usage:
-#   remote_docker_build.sh -c # docker image for cpu build
-#   remote_docker_build.sh -g # docker image for gpu build
-
+#   ci_rbe_docker_build.sh -c # docker image for cpu build
+#   ci_rbe_docker_build.sh -g # docker image for gpu build
 
 function main {
-  publish=true
   cpu_build=false
   gpu_build=false
-  publish=true
+  publish=false
 
   script_dir=$(dirname "$(readlink -f "$0")")
   cd $script_dir
 
-  trap cleanup_on_finish EXIT
-
   set_script_flags $@
 
-  build_base_image
-
   build_tf_image
 
   if [ "$publish" = true ] ; then
@@ -50,17 +44,14 @@ function set_script_flags {
       c)
         cpu_build=true
         ;;
-      f)
-        base_image_build_script=$OPTARG
-        ;;
       g)
         gpu_build=true
         ;;
       h)
         print_usage
         ;;
-      n)
-        publish=false
+      p)
+        publish=true
         ;;
       *)
         print_usage "ERROR: unknown option"
@@ -76,7 +67,6 @@ function print_usage {
   echo "Usage: $(basename $0) -c | -g [options]"
   echo "  -c build image for CPU build (base image debian8-clang)"
   echo "  -g build image for GPU build (base image nvidia-clang)"
-  echo "  -f the script which build the {debian8,nvidia}-clang base image"
   echo "[option] is one of"
   echo "  -n not publish the locally-built image to GCR;"
   echo "     the build process will publish image to GCR by default"
@@ -87,54 +77,22 @@ function print_usage {
   exit 1
 }
 
-
-# Build nvidia-cuba-clang base image for GPU image.
-# For CPU the `clang-debian8` from Cloud Launcher will be used directly:
-# https://console.cloud.google.com/launcher/details/google/clang-debian8?filter=category:developer-tools&q=clang
-function build_base_image {
-  if [ "$gpu_build" = true ] ; then
-    base_image="nvidia-cuda"
-    # Run a 2-stage build for clang base image, see
-    # https://github.com/llvm-mirror/llvm/blob/master/docs/Docker.rst
-    $base_image_build_script \
-      --source $base_image \
-      --branch branches/google/stable \
-      --docker-repository ${base_image}-clang --docker-tag "latest" \
-      -p clang -i stage2-install-clang -i stage2-install-clang-headers \
-      -- \
-      -DLLVM_TARGETS_TO_BUILD=Native -DCMAKE_BUILD_TYPE=Release \
-      -DBOOTSTRAP_CMAKE_BUILD_TYPE=Release \
-      -DCLANG_ENABLE_BOOTSTRAP=ON \
-      -DCLANG_BOOTSTRAP_TARGETS="install-clang;install-clang-headers"
-  fi
-}
-
-
 function build_tf_image {
   if [ "$cpu_build" = true ] ; then
-    dockerfile="Dockerfile.cpu"
-    tf_image="tensorflow-remote"
+    dockerfile="Dockerfile.rbe.cpu"
+    tf_image="tensorflow-rbe-cpu"
   else
-    dockerfile="Dockerfile.gpu"
-    tf_image="tensorflow-remote-gpu"
+    dockerfile="Dockerfile.rbe.gpu"
+    tf_image="tensorflow-rbe-gpu"
   fi
 
   docker build -f $dockerfile -t $tf_image .
 }
 
-
 function publish_tf_image {
   gcr_tf_image="gcr.io/tensorflow/${tf_image}"
   docker tag $tf_image $gcr_tf_image
   gcloud docker -- push $gcr_tf_image
 }
 
-
-function cleanup_on_finish {
-  cd $script_dir
-  rm -rf $llvm_docker_src
-  docker rmi -f ${base_image}-clang ${base_image}-clang-build
-}
-
-
 main $@
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index aeac085d30aef746366192361f249eb01f95e8da..8e8b2191e5c8f3fb6ada929cbc6b327fa0a67584 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -101,6 +101,7 @@ do_pylint() {
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
+"^tensorflow/python/keras/_impl/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\
 "^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
 "^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
 
@@ -431,7 +432,8 @@ cmd_status(){
 # out by default in TF WORKSPACE file.
 do_bazel_nobuild() {
   BUILD_TARGET="//tensorflow/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/src/main/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/examples/android/..."
   BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/schema/..."
   BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} -- ${BUILD_TARGET}"
 
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 90fd6a6e71f19649406234bc93025c15e4a5063c..420d390d2b9dc1ec25461b3502c63467a7eda16b 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -29,13 +29,9 @@ import argparse
 import os
 import re
 import shutil
-import subprocess
+import tempfile
 import zipfile
 
-UNZIP_CMD = "/usr/bin/unzip"
-ZIP_CMD = "/usr/bin/zip"
-SED_CMD = "/bin/sed"
-
 TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl"
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
@@ -43,7 +39,7 @@ BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 def check_existence(filename):
   """Check the existence of file or dir."""
   if not os.path.exists(filename):
-    raise RuntimeError("%s not found.")
+    raise RuntimeError("%s not found." % filename)
 
 
 def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
@@ -64,27 +60,36 @@ def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
     package = "tf_nightly"
   origin_binary = BINARY_STRING_TEMPLATE % (package, version, origin_tag)
   new_binary = BINARY_STRING_TEMPLATE % (package, version, new_tag)
-  zip_ref = zipfile.ZipFile(directory + origin_binary, "r")
-  zip_ref.extractall()
-  zip_ref.close()
-  old_py_ver = re.search(r"(cp\d\d-cp\d\d)", origin_tag).group(1)
-  new_py_ver = re.search(r"(cp\d\d-cp\d\d)", new_tag).group(1)
-  subprocess.check_call(
-      "%s -i s/%s/%s/g %s-%s.dist-info/WHEEL" % (SED_CMD, old_py_ver,
-                                                 new_py_ver, package, version),
-      shell=True)
-  zout = zipfile.ZipFile(directory + new_binary, "w", zipfile.ZIP_DEFLATED)
-  zip_these_files = [
-      "%s-%s.dist-info" % (package, version),
-      "%s-%s.data" % (package, version)
-  ]
-  for dirname in zip_these_files:
-    for root, _, files in os.walk(dirname):
-      for filename in files:
-        zout.write(os.path.join(root, filename))
-  zout.close()
-  for dirname in zip_these_files:
-    shutil.rmtree(dirname)
+  zip_ref = zipfile.ZipFile(os.path.join(directory, origin_binary), "r")
+
+  try:
+    tmpdir = tempfile.mkdtemp()
+    os.chdir(tmpdir)
+
+    zip_ref.extractall()
+    zip_ref.close()
+    old_py_ver = re.search(r"(cp\d\d-cp\d\d)", origin_tag).group(1)
+    new_py_ver = re.search(r"(cp\d\d-cp\d\d)", new_tag).group(1)
+
+    wheel_file = os.path.join(
+        tmpdir, "%s-%s.dist-info" % (package, version), "WHEEL")
+    with open(wheel_file, "r") as f:
+      content = f.read()
+    with open(wheel_file, "w") as f:
+      f.write(content.replace(old_py_ver, new_py_ver))
+
+    zout = zipfile.ZipFile(directory + new_binary, "w", zipfile.ZIP_DEFLATED)
+    zip_these_files = [
+        "%s-%s.dist-info" % (package, version),
+        "%s-%s.data" % (package, version),
+    ]
+    for dirname in zip_these_files:
+      for root, _, files in os.walk(dirname):
+        for filename in files:
+          zout.write(os.path.join(root, filename))
+    zout.close()
+  finally:
+    shutil.rmtree(tmpdir)
 
 
 def main():
@@ -110,6 +115,7 @@ def main():
   args = parser.parse_args()
 
   # Argument checking
+  args.filename = os.path.abspath(args.filename)
   check_existence(args.filename)
   regex_groups = re.search(TF_NIGHTLY_REGEX, args.filename)
   directory = regex_groups.group(1)
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index cfeaebdbf57c01fef7cd81dae76217429336d0ff..d0816c92b7308a1079579e605ee9af491a0533fb 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -54,3 +54,6 @@ for i in `seq 0 $((TF_GPU_COUNT-1))`; do
   fi
 done
 
+echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
+exit 1
+
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index 1df6a84d7c6f86abfb965063625ac43a3f1a57fb..3e27a94cf2bf3110ac181d6ef5a57366be17255f 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="0.10.0"
+BAZEL_VERSION="0.11.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index e1edd62cc505654b7266c212822561188bbc701c..124ad82e916fe70c0d26a7d09d27a9c510320c1e 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
 
 set -ex
 
-GOLANG_URL="https://storage.googleapis.com/golang/go1.9.2.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.10.linux-amd64.tar.gz"
 
 sudo mkdir -p /usr/local
 wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index d406b83a6246d18c335fb52cea1256d7809fa61a..5aaf544afdcb881fc00553cc78f916752f4527ac 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -16,10 +16,15 @@
 
 set -e
 
-# We don't apt-get install so that we can install a newer version of pip. Not
-# needed after we upgrade to Ubuntu 16.04
-easy_install -U pip
-easy_install3 -U pip
+# We don't apt-get install so that we can install a newer version of pip.
+# Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9
+if $(cat /etc/*-release | grep -q 14.04); then
+  easy_install -U pip==9.0.3
+  easy_install3 -U pip==9.0.3
+else
+  pip2 install --upgrade pip==9.0.3
+  pip3 install --upgrade pip==9.0.3
+fi
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
@@ -60,8 +65,13 @@ rm -rf /usr/lib/python3/dist-packages/six*
 # numpy needs to be installed from source to fix segfaults. See:
 # https://github.com/tensorflow/tensorflow/issues/6968
 # This workaround isn't needed for Ubuntu 16.04 or later.
-pip2 install --no-binary=:all: --upgrade numpy==1.12.0
-pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+if $(cat /etc/*-release | grep -q 14.04); then
+  pip2 install --no-binary=:all: --upgrade numpy==1.12.0
+  pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+else
+  pip2 install --upgrade numpy==1.12.0
+  pip3 install --upgrade numpy==1.12.0
+fi
 
 pip2 install scipy==0.18.1
 pip3 install scipy==0.18.1
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0beabcf5ef83000bb0b0c5d3dc077a3d2bbe118a
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+if [ ! -f /usr/bin/x86_64-linux-gnu-gcc ]; then
+  ln -s /usr/local/bin/clang /usr/bin/x86_64-linux-gnu-gcc
+fi
+
+pip2 install --upgrade setuptools
+pip3 install --upgrade setuptools
+
+# The rest of the pip packages will be installed in
+# `install_pip_packages.sh`
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index aefc49f60482148e565a5262eebd5b3ac85987cf..204a82f647eed550a1ad14bd6fed4cd72b0f7dba 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,9 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade setuptools
+pip3.5 install --upgrade pip
+
 pip3.5 install --upgrade virtualenv
 
 # Install six.
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index dbf376be6f7467466b5383c57dca539f9a470a38..2a9f29518886d5dcb96f1afc8b12d6be307aa965 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -30,7 +30,10 @@ export PYTHON_BIN_PATH=`which python2`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
+# Setting KMP_BLOCKTIME to 0 lets OpenMP threads to sleep right after parallel execution
+# in an MKL primitive. This reduces the effects of an oversubscription of OpenMP threads
+# caused by executing multiple tests concurrently.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
-    --config=mkl --config=opt --test_output=errors -- \
+    --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
index e5d8303c6e5534464bc0c91a09e7a6686b19c33f..bf992cf63d27f0f169185a38fa33a01cd5375051 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
@@ -45,7 +45,6 @@ ${DOCKER_BINARY} run \
   -v ${ROOT_DIR}:/workspace \
   -w /workspace \
   -e "PYTHON_BIN_PATH=/usr/bin/python" \
-  -e "TF_NEED_GCP=0" \
   -e "TF_NEED_HDFS=0" \
   -e "TF_NEED_CUDA=${TF_NEED_CUDA}" \
   -e "TF_NEED_OPENCL_SYCL=0" \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index 509ee38ec4fd584037f8e43726c01391430c1817..5c5a36139f50e85e70ce4bff5ca8054f7570b0f5 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -31,7 +31,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
-bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
+bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium --config=opt \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index 05547136704394ed9262f566a2bfb4160b73c7fd..c7cc16e6699830da4dff6cd32136da65fb6a41af 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -31,8 +31,9 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
-bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
+bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 --config=opt \
+    --announce_rc \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
index 8f839ca110e5bbeba6fb7f0baaeab2fe6f126319..7e0e81a1ebdc9e4ad4e76f6582892914cd1a5881 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
@@ -30,7 +30,8 @@ export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=$(which python3)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
-bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
+bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
+    --announce_rc \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index e1b56b9a25f663737ffe0991882f6e5e753265ed..9ae5fc6bea50e4ec2e8594ce5fdffc9dc4029fa0 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -24,12 +24,11 @@ source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
 
 # Configure script
 export PYTHON_BIN_PATH="/usr/bin/python"
-export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
-export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
+export PATH="$PATH:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
 build_libtensorflow_tarball "-cpu-darwin-$(uname -m)"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
index 5a901af3e5c77ed153b5ff5a9c5f9463620f7dca..d95fcdeb8552d53244e2e1b770573ed7a45568d9 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -26,7 +26,6 @@ source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
 export TF_NEED_CUDA=1
 export LD_LIBRARY_PATH="/usr/local/cuda/lib:/usr/local/cuda/extras/CUPTI/lib:${LD_LIBRARY_PATH}"
 export PYTHON_BIN_PATH="/usr/bin/python"
-export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
diff --git a/tensorflow/tools/ci_build/remote/Dockerfile.cpu b/tensorflow/tools/ci_build/remote/Dockerfile.cpu
deleted file mode 100644
index 7b01d8320d26f38c92ad8f404da3188809a6d400..0000000000000000000000000000000000000000
--- a/tensorflow/tools/ci_build/remote/Dockerfile.cpu
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM launcher.gcr.io/google/clang-debian8:latest
-
-RUN apt-get update && apt-get --no-install-recommends install -y \
-    binutils \
-    binutils-gold \
-    curl \
-    libstdc++-4.9-dev \
-    python \
-    python-dev \
-    python-numpy \
-    python-pip \
-    unzip \
-    zip && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-# Set up grpc
-RUN pip install  --upgrade enum34 futures mock numpy six backports.weakref portpicker && \
-    pip install --pre 'protobuf>=3.0.0a3' && \
-    pip install 'grpcio>=1.1.3'
-
-# TODO: Set up golang which is compatible with clang
-
-WORKDIR /botexec
diff --git a/tensorflow/tools/ci_build/remote/Dockerfile.gpu b/tensorflow/tools/ci_build/remote/Dockerfile.gpu
deleted file mode 100644
index 47ffd44163dd3e4b99f06689e1aa6f19f84cc2ca..0000000000000000000000000000000000000000
--- a/tensorflow/tools/ci_build/remote/Dockerfile.gpu
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM nvidia-cuda-clang:latest
-
-RUN apt-get update && apt-get --no-install-recommends install -y \
-    binutils \
-    binutils-gold \
-    curl \
-    libstdc++-4.9-dev \
-    python \
-    python-dev \
-    python-numpy \
-    python-pip \
-    unzip \
-    zip && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-# Set up grpc
-RUN pip install --upgrade \
-        enum34 futures astor gast mock numpy six \
-        backports.weakref termcolor && \
-    pip install --pre 'protobuf>=3.0.0a3' && \
-    pip install 'grpcio>=1.1.3'
-
-WORKDIR /botexec
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 7b2d7e1a568b0235a5bdd55bb23e542772902576..582188fc00b260926820a6add1331cf8fe0c8a9b 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -120,7 +120,9 @@ function run_configure_for_gpu_build {
   export TF_CUDA_VERSION=9.0
   export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
   export TF_CUDNN_VERSION=7.0
-  export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  if [ -z "$CUDNN_INSTALL_PATH" ]; then
+    export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  fi
   export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
@@ -138,6 +140,13 @@ function run_configure_for_gpu_build {
   echo "" | ./configure
 }
 
+function set_gcs_remote_cache_options {
+  echo "build --experimental_remote_spawn_cache" >> "${TMP_BAZELRC}"
+  echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> "${TMP_BAZELRC}"
+  echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> "${TMP_BAZELRC}"
+  echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}"
+}
+
 function create_python_test_dir() {
   rm -rf "$1"
   mkdir -p "$1"
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 1c35d74af72ad0a72b0016356888c8cf77e20e56..0e6c0227b7ffb6b35193e133aa7d3fbcd16ce3c4 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -34,6 +34,9 @@ export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 
 export PYTHON_BASE_PATH="${PYTHON_DIRECTORY:-Program Files/Anaconda3}"
 
+# Set the path to find bazel.
+export PATH="/c/tools/bazel/:$PATH"
+
 # Set Python path for ./configure
 export PYTHON_BIN_PATH="C:/${PYTHON_BASE_PATH}/python.exe"
 export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
@@ -41,6 +44,8 @@ export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/${PYTHON_BASE_PATH}:$PATH"
+# Add git into PATH needed for gen_git_source.py
+export PATH="/c/Program Files/Git/cmd:$PATH"
 
 # Make sure we have pip in PATH
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
index 748a961e44c5429664e37a1456adcf02a56fa3d4..dc9af221ecf53b44dd3460325fc00ca15135ef16 100644
--- a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
@@ -44,7 +44,7 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_cpu_build
 
-# Compliling the following test is extremely slow with -c opt
+# Compiling the following test is extremely slow with -c opt
 slow_compiling_test="//tensorflow/core/kernels:eigen_backward_spatial_convolutions_test"
 
 # Find all the passing cc_tests on Windows and store them in a variable
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 3c3b223a0044b7136ea4dee20fa72cd2fed3742a..30554a084c5689768665557d593b928fbd98d8cb 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -37,9 +40,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 8b8ba31a0dda88ad3c43330e0208a9fa6a7d0276..a2300811bb93b9e9d96b9db314943ab08870fcb3 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,12 +42,40 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
-run_configure_for_cpu_build
+# Recreate an empty bazelrc file under source root
+export TMP_BAZELRC=.tmp.bazelrc
+rm -f "${TMP_BAZELRC}"
+touch "${TMP_BAZELRC}"
+
+function cleanup {
+  # Remove all options in .tmp.bazelrc
+  echo "" > "${TMP_BAZELRC}"
+}
+trap cleanup EXIT
+
+skip_test=0
+
+for ARG in "$@"; do
+  if [[ "$ARG" == --skip_test ]]; then
+    skip_test=1
+  elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then
+    set_gcs_remote_cache_options
+  fi
+done
 
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
 # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-BUILD_OPTS="--define=override_eigen_strong_inline=true"
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
+
+echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
+
+run_configure_for_cpu_build
+
+bazel build --announce_rc -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
+
+if [[ "$skip_test" == 1 ]]; then
+  exit 0
+fi
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -59,10 +87,16 @@ create_python_test_dir "${PY_TEST_DIR}"
 PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow-*.whl)
 reinstall_tensorflow_pip ${PIP_NAME}
 
+# NUMBER_OF_PROCESSORS is predefined on Windows
+N_JOBS="${NUMBER_OF_PROCESSORS}"
+
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-  //${PY_TEST_DIR}/tensorflow/python/...
+  --jobs="${N_JOBS}" --test_timeout="300,450,1200,3600" \
+  --flaky_test_attempts=3 \
+  //${PY_TEST_DIR}/tensorflow/python/... \
+  //${PY_TEST_DIR}/tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
index f26f8727e51bf0247578c1cdfaa67e1b0f7f299d..f1114f4ffa40dda23db854e4547bd59c69c6dfb6 100644
--- a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
@@ -46,7 +46,7 @@ clean_output_base
 
 run_configure_for_gpu_build
 
-# Compliling the following test is extremely slow with -c opt
+# Compiling the following test is extremely slow with -c opt
 slow_compiling_test="//tensorflow/core/kernels:eigen_backward_spatial_convolutions_test"
 
 # Find all the passing cc_tests on Windows and store them in a variable
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index b87e4a9bec41264827d415a11dfa6f23aeda725d..4656afe0256d03540fed6912677c8e93f9cf9eb6 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -37,7 +37,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX -G"Visual Studio 14"
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index b537192a945b2a2d8c2df940b947c6c0f7d6fc06..3b437d3c58c384c389820e57ae6bcc57c6c13efb 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -28,6 +28,12 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Set ctest binary location.
+IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe")
+
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -37,9 +43,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
@@ -47,4 +50,4 @@ if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
 echo Running tests...
-ctest -C Release --output-on-failure --jobs 1
+%CTEST_EXE% -C Release --output-on-failure --jobs 1
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 94276c6c5c9ce897ca24f03efe3d93e1ea1e00c9..7dfee8f371b8c4795fe748d1fd02ee8d884f18f9 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt --copt=/arch:AVX \
+bazel build -c opt --copt=/arch:AVX --announce_rc \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index a94a627dfb632cf01518c2022fd01b168afb4a7e..a410c10b61b9f3f2cf8fc00074237a2bcfcbbf78 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -35,7 +35,7 @@ echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
 
 bazel clean
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
+bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test,-no_oss -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --build_tests_only --test_output=errors --local_test_jobs=8 \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD
index 316e5469e7afda74563cc186c58964664170c5da..b9032c046e93527fd0f41f183e49e4933029ec62 100644
--- a/tensorflow/tools/common/BUILD
+++ b/tensorflow/tools/common/BUILD
@@ -44,14 +44,3 @@ py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 4f90c4d940670c43f65cc3f95971469627ab35c9..b7bfb29aae4fcaa55e01ba924f72cf79d2b09ad1 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -68,18 +68,3 @@ exports_files(
         "testdata/test_file_v0_11.py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 6e90b286c99f894ddd25268afc69043759571c36..1f8833582af4c922115e637117e775e619439786 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -662,9 +662,9 @@ class TFAPIChangeSpec(APIChangeSpec):
   def _reverse_handler(file_edit_recorder, node):
     # TODO(aselle): Could check for a literal list of bools and try to convert
     # them to indices.
-    comment = ("ERROR: tf.reverse has had its argument semantics changed\n"
-               "significantly the converter cannot detect this reliably, so you"
-               "need to inspect this usage manually.\n")
+    comment = ("ERROR: tf.reverse has had its argument semantics changed "
+               "significantly the converter cannot detect this reliably, so "
+               "you need to inspect this usage manually.\n")
     file_edit_recorder.add(
         comment,
         node.lineno,
diff --git a/tensorflow/tools/def_file_filter/BUILD b/tensorflow/tools/def_file_filter/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e390e0fb05c1d881e5fbafb43ea7576347949439
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/BUILD
@@ -0,0 +1,9 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
diff --git a/tensorflow/tools/def_file_filter/BUILD.tpl b/tensorflow/tools/def_file_filter/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..3cb72f49797d8014e4df3f44155c967ab0e2f9b3
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/BUILD.tpl
@@ -0,0 +1,15 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
+
+py_binary(
+    name = "def_file_filter",
+    srcs = ["def_file_filter.py"],
+    srcs_version = "PY2AND3",
+)
diff --git a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..8bdc03eb0f19fd6daae826727f429bc1255f0eca
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
@@ -0,0 +1,168 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""def_file_filter.py - tool to filter a windows def file.
+
+The def file can be used to export symbols from the tensorflow dll to enable
+tf.load_library().
+
+Because the linker allows only 64K symbols to be exported per dll
+we filter the symbols down to the essentials. The regular expressions
+we use for this are specific to tensorflow.
+
+TODO: this works fine but there is an issue with exporting
+'const char * const' and importing it from a user_ops. The problem is
+on the importing end and using __declspec(dllimport) works around it.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import io
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+# External tools we use that come with visual studio sdk
+UNDNAME = "%{undname_bin_path}"
+
+# Exclude if matched
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
+
+# Include if matched before exclude
+INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
+                           r"google::protobuf::internal::ArenaImpl::AllocateAligned|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::internal::ArenaImpl::AddCleanup|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::Arena::OnArenaAllocation|" # for contrib/data/_prefetching_ops
+                           r"tensorflow::internal::LogMessage|"
+                           r"tensorflow::internal::LogString|"
+                           r"tensorflow::internal::CheckOpMessageBuilder|"
+                           r"tensorflow::internal::MakeCheckOpValueString|"
+                           r"tensorflow::internal::PickUnusedPortOrDie|"
+                           r"tensorflow::internal::ValidateDevice|"
+                           r"tensorflow::ops::internal::Enter|"
+                           r"tensorflow::strings::internal::AppendPieces|"
+                           r"tensorflow::strings::internal::CatPieces|"
+                           r"tensorflow::io::internal::JoinPathImpl")
+
+# Include if matched after exclude
+INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
+                        r"^(TFE_\w*)$|"
+                        r"nsync::|"
+                        r"tensorflow::|"
+                        r"functor::|"
+                        r"perftools::gputools")
+
+# We want to identify data members explicitly in the DEF file, so that no one
+# can implicitly link against the DLL if they use one of the variables exported
+# from the DLL and the header they use does not decorate the symbol with
+# __declspec(dllimport). It is easier to detect what a data symbol does
+# NOT look like, so doing it with the below regex.
+DATA_EXCLUDE_RE = re.compile(r"[)(]|"
+                             r"vftable|"
+                             r"vbtable|"
+                             r"vcall|"
+                             r"RTTI|"
+                             r"protobuf::internal::ExplicitlyConstructed")
+
+def get_args():
+  """Parse command line."""
+  filename_list = lambda x: x.split(";")
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--input", type=filename_list,
+                      help="paths to input def file",
+                      required=True)
+  parser.add_argument("--output", help="output deffile", required=True)
+  parser.add_argument("--target", help="name of the target", required=True)
+  args = parser.parse_args()
+  return args
+
+
+def main():
+  """main."""
+  args = get_args()
+
+  # Pipe dumpbin to extract all linkable symbols from libs.
+  # Good symbols are collected in candidates and also written to
+  # a temp file.
+  candidates = []
+  tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
+  for def_file_path in args.input:
+    def_file = open(def_file_path, 'r')
+    for line in def_file:
+      cols = line.split()
+      sym = cols[0]
+      tmpfile.file.write(sym + "\n")
+      candidates.append(sym)
+  tmpfile.file.close()
+
+  # Run the symbols through undname to get their undecorated name
+  # so we can filter on something readable.
+  with open(args.output, "w") as def_fp:
+    # track dupes
+    taken = set()
+
+    # Header for the def file.
+    def_fp.write("LIBRARY " + args.target + "\n")
+    def_fp.write("EXPORTS\n")
+    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+
+    # Each symbols returned by undname matches the same position in candidates.
+    # We compare on undname but use the decorated name from candidates.
+    dupes = 0
+    proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
+    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+      decorated = candidates[idx]
+      if decorated in taken:
+        # Symbol is already in output, done.
+        dupes += 1
+        continue
+
+      if not INCLUDEPRE_RE.search(line):
+        if EXCLUDE_RE.search(line):
+          continue
+        if not INCLUDE_RE.search(line):
+          continue
+
+      if "deleting destructor" in line:
+        # Some of the symbols convered by INCLUDEPRE_RE export deleting
+        # destructor symbols, which is a bad idea.
+        # So we filter out such symbols here.
+        continue
+
+      if DATA_EXCLUDE_RE.search(line):
+        def_fp.write("\t" + decorated + "\n")
+      else:
+        def_fp.write("\t" + decorated + " DATA\n")
+      taken.add(decorated)
+    def_fp.close()
+
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(UNDNAME, exit_code))
+    return exit_code
+
+  os.unlink(tmpfile.name)
+
+  print("symbols={}, taken={}, dupes={}"
+        .format(len(candidates), len(taken), dupes))
+  return 0
+
+
+if __name__ == "__main__":
+  sys.exit(main())
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..47539b2423e602bb9771541ae5b01ba76c79f56f
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -0,0 +1,56 @@
+"""Repository rule for def file filter autoconfiguration.
+
+This repository reuses Bazel's VC detect mechanism to find undname.exe,
+which is a tool used in def_file_filter.py.
+
+def_file_filter.py is for filtering the DEF file for TensorFlow on Windows.
+On Windows, we use a DEF file generated by Bazel to export symbols from the
+tensorflow dynamic library(_pywrap_tensorflow.dll). The maximum number of
+symbols that can be exported per DLL is 64K, so we have to filter some useless
+symbols through this python script.
+
+`def_file_filter_config` depends on the following environment variables:
+  * `BAZEL_VC`
+  * `BAZEL_VS`
+  * `VS90COMNTOOLS`
+  * `VS100COMNTOOLS`
+  * `VS110COMNTOOLS`
+  * `VS120COMNTOOLS`
+  * `VS140COMNTOOLS`
+"""
+
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_vc_path")
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool")
+load("@bazel_tools//tools/cpp:lib_cc_configure.bzl", "auto_configure_fail")
+
+def _def_file_filter_configure_impl(repository_ctx):
+  if repository_ctx.os.name.lower().find("windows") == -1:
+    repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+    repository_ctx.file("def_file_filter.py", "")
+    return
+  vc_path = find_vc_path(repository_ctx)
+  if vc_path == "visual-studio-not-found":
+    auto_configure_fail("Visual C++ build tools not found on your machine")
+  undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\")
+
+  repository_ctx.template(
+    "def_file_filter.py",
+    Label("//tensorflow/tools/def_file_filter:def_file_filter.py.tpl"),
+    {
+      "%{undname_bin_path}": undname_bin_path,
+    })
+  repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+
+
+def_file_filter_configure = repository_rule(
+    implementation = _def_file_filter_configure_impl,
+    environ = [
+        "BAZEL_VC",
+        "BAZEL_VS",
+        "VS90COMNTOOLS",
+        "VS100COMNTOOLS",
+        "VS110COMNTOOLS",
+        "VS120COMNTOOLS",
+        "VS140COMNTOOLS"
+    ],
+)
diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md
index c1b1f79bbd4b657768b9bbcab93efa3354774915..228d5ee35d1839c60b51a85bd606c1ba86e46886 100644
--- a/tensorflow/tools/dist_test/README.md
+++ b/tensorflow/tools/dist_test/README.md
@@ -17,6 +17,14 @@ cesnsu model:
 
     ./local_test.sh --model_name CENSUS_WIDENDEEP
 
+You can test specify version of TensorFlow:
+
+```shell
+./local_test.sh ${whl_file_url}
+```
+
+For example, you can find these TensorFlow python package URLs from [here](https://www.tensorflow.org/install/install_linux#the_url_of_the_tensorflow_python_package) for Ubuntu.
+
 **2) Launch a remote k8s cluster on Google Kubernetes Engine (GKE) and run the
 test suite on it**
 
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index 435f9d0dc9c55a3dcfc45e7e46f279b4679a9086..caae7fd5305af9846628eaf00348dd08df4e827f 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -16,12 +16,11 @@
 #
 # Tests distributed TensorFlow on a locally running TF GRPC cluster.
 #
-# This script peforms the following steps:
-# 1) Build the docker-in-docker (dind) image capable of running docker and
-#    Kubernetes (k8s) cluster inside.
+# This script performs the following steps:
+# 1) Build the docker image capable of running distributed TensorFlow in docker.
 # 2) Run a container from the aforementioned image and start docker service
 #    in it
-# 3) Call a script to launch a k8s TensorFlow GRPC cluster inside the container
+# 3) Call a script to launch a distributed TensorFlow GRPC cluster inside the container
 #    and run the distributed test suite.
 #
 # Usage: local_test.sh <whl_file_location>
@@ -64,15 +63,9 @@ die() {
 
 # Configurations
 DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster"
-LOCAL_K8S_CACHE=${HOME}/kubernetes
 
-# Helper function
-get_container_id_by_image_name() {
-    # Get the id of a container by image name
-    # Usage: get_docker_container_id_by_image_name <img_name>
-
-    docker ps | grep $1 | awk '{print $1}'
-}
+# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below
+DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl"
 
 # Parse input arguments
 LEAVE_CONTAINER_RUNNING=0
@@ -84,7 +77,8 @@ SYNC_REPLICAS_FLAG=""
 
 WHL_FILE_LOCATION=${1}
 if [[ -z "${WHL_FILE_LOCATION}" ]]; then
-  die "whl file location is not specified"
+  WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION}
+  echo "use default whl file location"
 fi
 
 while true; do
@@ -121,7 +115,7 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # Get utility functions
 source ${DIR}/scripts/utils.sh
 
-# Build docker-in-docker image for local k8s cluster.
+# Build docker image for local distributed TensorFlow cluster.
 NO_CACHE_FLAG=""
 if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] &&
    [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then
diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py
index a2d12442c44553a287637029843021b7541fa3fa..d6e7f317dd0b52203e354676425dbbbcd53e1973 100644
--- a/tensorflow/tools/dist_test/python/mnist_replica.py
+++ b/tensorflow/tools/dist_test/python/mnist_replica.py
@@ -56,7 +56,7 @@ flags.DEFINE_integer("task_index", None,
 flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine."
                      "If you don't use GPU, please set it to '0'")
 flags.DEFINE_integer("replicas_to_aggregate", None,
-                     "Number of replicas to aggregate before parameter update"
+                     "Number of replicas to aggregate before parameter update "
                      "is applied (For sync_replicas mode only; default: "
                      "num_workers)")
 flags.DEFINE_integer("hidden_units", 100,
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
index 865af8dd7b2af686dad852f35187f2d226533596..003a19a9abf470f58070bf44fc5608d1eb3634fe 100644
--- a/tensorflow/tools/dist_test/server/BUILD
+++ b/tensorflow/tools/dist_test/server/BUILD
@@ -37,15 +37,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/BUILD b/tensorflow/tools/docker/BUILD
index 7d5ae0a94d8f969585d8fb8e57892c165e35ba47..849ba49f71994c3c188d8bc7751d9569c3ee73b3 100644
--- a/tensorflow/tools/docker/BUILD
+++ b/tensorflow/tools/docker/BUILD
@@ -13,15 +13,3 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 024cb40eb4b9380fa09bd0e371826783d1ebdc45..a3ff8211e3e81925722566863c5ad910295a94ba 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -7,6 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         curl \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
@@ -47,7 +48,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index d16761c3675942838fd2be0ea6e0b7463a3bf249..b9996395d02bfb58c4bcee1dfc0eb3707c28b209 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git \
         libcurl3-dev \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
@@ -28,9 +29,12 @@ RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
         ipykernel \
         jupyter \
         matplotlib \
+        mock \
         numpy \
         scipy \
         sklearn \
@@ -38,6 +42,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
@@ -57,7 +63,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.8.0
+ENV BAZEL_VERSION 0.11.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
@@ -70,7 +76,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 3690e7dfe57a4682276a90b10cb84c9a329b3f5e..c65e0b72bc582d39b75ad042e4c673aa603639be 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.6
+ARG TF_BRANCH=r1.8
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 4ef37881bc91aaa58bab031c69b4a96c2a9d8ec1..7e5e6ef2d5b0245607bacb9a99f1352c052d0ea4 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -17,6 +17,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcudnn7-dev=7.0.5.15-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
@@ -37,9 +38,12 @@ RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
         ipykernel \
         jupyter \
         matplotlib \
+        mock \
         numpy \
         scipy \
         sklearn \
@@ -47,6 +51,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
@@ -66,7 +72,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.8.0
+ENV BAZEL_VERSION 0.11.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
@@ -79,7 +85,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index b6682cd68163ec870ed815b45ac4fdd9233f88c6..bff4a20392076994c75705b73c25dcb740ba1f09 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,12 +1,20 @@
-FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
         curl \
+        libcudnn7=7.0.5.15-1+cuda9.0 \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
@@ -47,7 +55,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index f46c56e11aa72cd0df20f0d8478de2f42dbb3b72..525f2995ceecd48ee7463fc207406c5f9b25f61e 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -16,12 +16,12 @@ quick links here:
 
 We currently maintain two Docker container images:
 
-* `gcr.io/tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
+* `tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
 
-* `gcr.io/tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
+* `tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
   and support for NVidia CUDA
 
-Note: We also publish the same containers into
+Note: We store all our containers on 
 [Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
 
 
@@ -29,12 +29,12 @@ Note: We also publish the same containers into
 
 Run non-GPU container using
 
-    $ docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow
+    $ docker run -it -p 8888:8888 tensorflow/tensorflow
 
 For GPU support install NVidia drivers (ideally latest) and
 [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using
 
-    $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
+    $ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu
 
 
 Note: If you would have a problem running nvidia-docker you may try the old method
@@ -44,7 +44,7 @@ it there and try using nvidia-docker as described above.
     $ # The old, not recommended way to run docker with gpu support:
     $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu
+    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES tensorflow/tensorflow:latest-gpu
 
 
 ## More containers
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 5585ebdcd366ec9db0c47004647970cb27c8bb75..824fe14560bb2c3bfb0729f9e5b5cffa63db19ca 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -1207,7 +1207,7 @@
    "source": [
     "# Training computation: logits + cross-entropy loss.\n",
     "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n",
+    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
     "  labels=train_labels_node, logits=logits))\n",
     "\n",
     "# L2 regularization for the fully connected parameters.\n",
@@ -2031,7 +2031,7 @@
    "views": {}
   },
   "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2049,5 +2049,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
diff --git a/tensorflow/tools/docker/notebooks/BUILD b/tensorflow/tools/docker/notebooks/BUILD
index 89f473df4bfdda479fa25b8e10b84c4430105cc9..e9f26899c9afa305afa6ee686a038997a4e6fbe3 100644
--- a/tensorflow/tools/docker/notebooks/BUILD
+++ b/tensorflow/tools/docker/notebooks/BUILD
@@ -3,15 +3,3 @@ package(default_visibility = ["//visibility:private"])
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b4fba5b8f5e19c2fbb8c7261d8cf293757df503c..05de25f2cb11d76f223a31bc12329e6ab7368e8a 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@@ -306,7 +306,7 @@ else
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
     else
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 8f10bc9e0ca3c947b8ca75663444309088e0513e..58b5ef8345c9de83e2d50cd01fe11e11f51fe298 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -37,7 +37,7 @@ py_library(
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["@com_github_andreif_codegen"],
+    deps = ["@astor_archive//:astor"],
 )
 
 py_test(
@@ -103,10 +103,11 @@ py_test(
     data = ["//tensorflow:docs_src"],
     srcs_version = "PY2AND3",
     tags = [
-        # No reason to run sanitizers for this test.
+        # No reason to run sanitizers or fastbuild for this test.
         "noasan",
         "nomsan",
         "notsan",
+        "optonly",
     ],
     deps = [
         ":generate_lib",
@@ -142,14 +143,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index ae293f6576456ecdbb8a4b1ee4e8e4f40482ad94..0cbf8b478fa8877c81dd4c06d2713ae2e72f8b58 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
 import textwrap
 
 import tensorflow as tf
@@ -39,10 +38,6 @@ class Flags(object):
 class BuildDocsTest(googletest.TestCase):
 
   def testBuildDocs(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     doc_generator = generate_lib.DocGenerator()
 
     doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index c750539a76a85ec122fb87f44160422158966314..fc93085e3e0316cf274f4d9b325d6af0ea3a2f83 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -43,10 +43,6 @@ if __name__ == '__main__':
 
   flags = doc_generator.parse_known_args()
 
-  # Suppress documentation of some symbols that users should never use.
-  del tf.layers.Layer.inbound_nodes
-  del tf.layers.Layer.outbound_nodes
-
   # tf_debug is not imported with tf, it's a separate module altogether
   doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
 
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 34dd419f15676babfa9a36c2c0960b01248b6f69..111d54d8205f805cc24d21c610acc81610b8d47d 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import argparse
 import fnmatch
 import os
-import sys
 
 import six
 
@@ -134,8 +133,12 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
     try:
       if not os.path.exists(directory):
         os.makedirs(directory)
-      with open(path, 'w') as f:
-        f.write(pretty_docs.build_md_page(page_info))
+      # This function returns raw bytes in PY2 or unicode in PY3.
+      text = pretty_docs.build_md_page(page_info)
+      if six.PY3:
+        text = text.encode('utf-8')
+      with open(path, 'wb') as f:
+        f.write(text)
     except OSError as e:
       print('Cannot write documentation for %s to %s: %s' % (full_name,
                                                              directory, e))
@@ -308,6 +311,10 @@ def build_doc_index(src_dir):
         continue
       title_parser = _GetMarkdownTitle()
       title_parser.process(os.path.join(dirpath, base_name))
+      if title_parser.title is None:
+        msg = ('`{}` has no markdown title (# title)'.format(
+            os.path.join(dirpath, base_name)))
+        raise ValueError(msg)
       key_parts = os.path.join(suffix, base_name[:-3]).split('/')
       if key_parts[-1] == 'index':
         key_parts = key_parts[:-1]
@@ -433,19 +440,19 @@ def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
       full_out_path = os.path.join(output_dir, suffix)
       if not fnmatch.fnmatch(base_name, file_pattern):
         print('Copying un-matched file %s...' % suffix)
-        open(full_out_path, 'w').write(open(full_in_path).read())
+        open(full_out_path, 'wb').write(open(full_in_path, 'rb').read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
         content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        content = open(full_in_path).read()
+        content = open(full_in_path, 'rb').read().decode('utf-8')
 
       content = reference_resolver.replace_references(content,
                                                       relative_path_to_root)
-      with open(full_out_path, 'w') as f:
-        f.write(content)
+      with open(full_out_path, 'wb') as f:
+        f.write(content.encode('utf-8'))
 
   print('Done.')
 
@@ -454,8 +461,6 @@ class DocGenerator(object):
   """Main entry point for generating docs."""
 
   def __init__(self):
-    if sys.version_info >= (3, 0):
-      sys.exit('Doc generation is not supported from python3.')
     self.argument_parser = argparse.ArgumentParser()
     self._py_modules = None
     self._private_map = _get_default_private_map()
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index 1ceaf31f1c3b83e2c2cb3c0d2022ce98781aed4b..ea6d28a02b1f3c07fe8783fd59e345dade1fc804 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -52,9 +52,6 @@ class DummyVisitor(object):
 class GenerateTest(googletest.TestCase):
 
   def test_write(self):
-    if sys.version_info >= (3, 0):
-      self.skipTest('Warning: Doc generation is not supported from python3.')
-
     module = sys.modules[__name__]
 
     index = {
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index e758229535e7b10994a39cbafb37e116fd2a465c..fb0bd2c2ff438aa9b3fa04719c447a2f3a91a95e 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -26,7 +26,7 @@ import os
 import re
 import sys
 
-import codegen
+import astor
 import six
 
 from google.protobuf.message import Message as ProtoMessage
@@ -34,7 +34,11 @@ from tensorflow.python.util import tf_inspect
 
 
 # A regular expression capturing a python identifier.
-IDENTIFIER_RE = '[a-zA-Z_][a-zA-Z0-9_]*'
+IDENTIFIER_RE = r'[a-zA-Z_]\w*'
+
+
+class TFDocsError(Exception):
+  pass
 
 
 class _Errors(object):
@@ -118,6 +122,8 @@ SYMBOL_REFERENCE_RE = re.compile(
     """,
     flags=re.VERBOSE)
 
+AUTO_REFERENCE_RE = re.compile(r'`([a-zA-Z0-9_.]+?)`')
+
 
 class ReferenceResolver(object):
   """Class for replacing @{...} references with Markdown links.
@@ -240,10 +246,25 @@ class ReferenceResolver(object):
     Returns:
       `string`, with "@{symbol}" references replaced by Markdown links.
     """
-    def one_ref(match):
-      return self._one_ref(match, relative_path_to_root)
 
-    return re.sub(SYMBOL_REFERENCE_RE, one_ref, string)
+    def strict_one_ref(match):
+      try:
+        return self._one_ref(match, relative_path_to_root)
+      except TFDocsError as e:
+        self.add_error(e.message)
+        return 'BAD_LINK'
+
+    string = re.sub(SYMBOL_REFERENCE_RE, strict_one_ref, string)
+
+    def sloppy_one_ref(match):
+      try:
+        return self._one_ref(match, relative_path_to_root)
+      except TFDocsError:
+        return match.group(0)
+
+    string = re.sub(AUTO_REFERENCE_RE, sloppy_one_ref, string)
+
+    return string
 
   def python_link(self, link_text, ref_full_name, relative_path_to_root,
                   code_ref=True):
@@ -307,14 +328,14 @@ class ReferenceResolver(object):
 
     Raises:
       RuntimeError: If `ref_full_name` is not documented.
+      TFDocsError: If the @{} syntax cannot be decoded.
     """
     master_name = self._duplicate_of.get(ref_full_name, ref_full_name)
 
     # Check whether this link exists
     if master_name not in self._all_names:
-      message = 'Cannot make link to "%s": Not in index.' % master_name
-      self.add_error(message)
-      return 'BROKEN_LINK'
+      raise TFDocsError(
+          'Cannot make link to "%s": Not in index.' % master_name)
 
     # If this is a member of a class, link to the class page with an anchor.
     ref_path = None
@@ -369,8 +390,8 @@ class ReferenceResolver(object):
             code_ref=not manual_link_text)
 
     # Error!
-    self.add_error('Did not understand "%s"' % match.group(0))
-    return 'BROKEN_LINK'
+    raise TFDocsError('Did not understand "%s"' % match.group(0),
+                      'BROKEN_LINK')
 
   def _doc_link(self, string, link_text, manual_link_text,
                 relative_path_to_root):
@@ -395,11 +416,10 @@ class ReferenceResolver(object):
     return self._doc_missing(string, hash_tag, link_text, manual_link_text,
                              relative_path_to_root)
 
-  def _doc_missing(self, string, unused_hash_tag, link_text,
+  def _doc_missing(self, string, unused_hash_tag, unused_link_text,
                    unused_manual_link_text, unused_relative_path_to_root):
     """Generate an error for unrecognized @{$...} references."""
-    self.add_error('Unknown Document "%s"' % string)
-    return link_text
+    raise TFDocsError('Unknown Document "%s"' % string)
 
   def _cc_link(self, string, link_text, unused_manual_link_text,
                relative_path_to_root):
@@ -416,8 +436,8 @@ class ReferenceResolver(object):
     elif string == 'tensorflow::ops::Const':
       ret = 'namespace/tensorflow/ops.md#const'
     else:
-      self.add_error('C++ reference not understood: "%s"' % string)
-      return 'TODO_C++:%s' % string
+      raise TFDocsError('C++ reference not understood: "%s"' % string)
+
     # relative_path_to_root gets you to api_docs/python, we go from there
     # to api_docs/cc, and then add ret.
     cc_relative_path = os.path.normpath(os.path.join(
@@ -601,20 +621,20 @@ def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
 def _get_arg_spec(func):
   """Extracts signature information from a function or functools.partial object.
 
-  For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects,
-  corrects the signature of the underlying function to take into account the
-  removed arguments.
+  For functions, uses `tf_inspect.getfullargspec`. For `functools.partial`
+  objects, corrects the signature of the underlying function to take into
+  account the removed arguments.
 
   Args:
     func: A function whose signature to extract.
 
   Returns:
-    An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned
-    by `tf_inspect.getargspec`.
+    An `FullArgSpec` namedtuple `(args, varargs, varkw, defaults, etc.)`,
+    as returned by `tf_inspect.getfullargspec`.
   """
-  # getargspec does not work for functools.partial objects directly.
+  # getfullargspec does not work for functools.partial objects directly.
   if isinstance(func, functools.partial):
-    argspec = tf_inspect.getargspec(func.func)
+    argspec = tf_inspect.getfullargspec(func.func)
     # Remove the args from the original function that have been used up.
     first_default_arg = (
         len(argspec.args or []) - len(argspec.defaults or []))
@@ -637,12 +657,16 @@ def _get_arg_spec(func):
           argspec_defaults.pop(i-first_default_arg)
         else:
           first_default_arg -= 1
-    return tf_inspect.ArgSpec(args=argspec_args,
-                              varargs=argspec.varargs,
-                              keywords=argspec.keywords,
-                              defaults=tuple(argspec_defaults))
+    return tf_inspect.FullArgSpec(
+        args=argspec_args,
+        varargs=argspec.varargs,
+        varkw=argspec.varkw,
+        defaults=tuple(argspec_defaults),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
   else:  # Regular function or method, getargspec will work fine.
-    return tf_inspect.getargspec(func)
+    return tf_inspect.getfullargspec(func)
 
 
 def _remove_first_line_indent(string):
@@ -650,11 +674,14 @@ def _remove_first_line_indent(string):
   return '\n'.join([line[indent:] for line in string.split('\n')])
 
 
+PAREN_NUMBER_RE = re.compile(r'^\(([0-9.e-]+)\)')
+
+
 def _generate_signature(func, reverse_index):
   """Given a function, returns a list of strings representing its args.
 
   This function produces a list of strings representing the arguments to a
-  python function. It uses tf_inspect.getargspec, which
+  python function. It uses tf_inspect.getfullargspec, which
   does not generalize well to Python 3.x, which is more flexible in how *args
   and **kwargs are handled. This is not a problem in TF, since we have to remain
   compatible to Python 2.7 anyway.
@@ -705,7 +732,11 @@ def _generate_signature(func, reverse_index):
       if id(default) in reverse_index:
         default_text = reverse_index[id(default)]
       elif ast_default is not None:
-        default_text = codegen.to_source(ast_default)
+        default_text = (
+            astor.to_source(ast_default).rstrip('\n').replace('\t', '\\t')
+            .replace('\n', '\\n').replace('"""', "'"))
+        default_text = PAREN_NUMBER_RE.sub('\\1', default_text)
+
         if default_text != repr(default):
           # This may be an internal name. If so, handle the ones we know about.
           # TODO(wicke): This should be replaced with a lookup in the index.
@@ -738,8 +769,8 @@ def _generate_signature(func, reverse_index):
   # Add *args and *kwargs.
   if argspec.varargs:
     args_list.append('*' + argspec.varargs)
-  if argspec.keywords:
-    args_list.append('**' + argspec.keywords)
+  if argspec.varkw:
+    args_list.append('**' + argspec.varkw)
 
   return args_list
 
@@ -1116,9 +1147,11 @@ class _ClassPageInfo(object):
 
     for short_name in parser_config.tree[self.full_name]:
       # Remove builtin members that we never want to document.
-      if short_name in ['__class__', '__base__', '__weakref__', '__doc__',
-                        '__module__', '__dict__', '__abstractmethods__',
-                        '__slots__', '__getnewargs__']:
+      if short_name in [
+          '__class__', '__base__', '__weakref__', '__doc__', '__module__',
+          '__dict__', '__abstractmethods__', '__slots__', '__getnewargs__',
+          '__str__', '__repr__', '__hash__'
+      ]:
         continue
 
       child_name = '.'.join([self.full_name, short_name])
@@ -1163,7 +1196,8 @@ class _ClassPageInfo(object):
         # obvious what they do, don't include them in the docs if there's no
         # docstring.
         if not child_doc.brief.strip() and short_name in [
-            '__str__', '__repr__', '__hash__', '__del__', '__copy__']:
+            '__del__', '__copy__'
+        ]:
           print('Skipping %s, defined in %s, no docstring.' % (child_name,
                                                                defining_class))
           continue
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index fca5436ca5fadd1fb5da07d7523bb51c871164b5..274d48ef66071a4e6a5ebea65087f18382fea6a2 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -398,7 +398,6 @@ class ParserTest(googletest.TestCase):
     self.assertIn('<code>test_function', docs)
 
   def test_argspec_for_functools_partial(self):
-
     # pylint: disable=unused-argument
     def test_function_for_partial1(arg1, arg2, kwarg1=1, kwarg2=2):
       pass
@@ -409,42 +408,95 @@ class ParserTest(googletest.TestCase):
 
     # pylint: disable=protected-access
     # Make sure everything works for regular functions.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 2),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
 
     # Make sure doing nothing works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 2),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting args from the front works.
-    expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None,
-                                  (1, 2))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg2', 'kwarg1', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 2),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(
+        args=['kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(2,),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, 1, 2, 3)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting kwargs works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(2,),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg1=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg1'],
+        varargs=None,
+        varkw=None,
+        defaults=(1,),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1'], None, None, ())
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1'],
+        varargs=None,
+        varkw=None,
+        defaults=(),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1,
                                 arg2=0, kwarg1=0, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure *args, *kwargs is accounted for.
-    expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
+    expected = tf_inspect.FullArgSpec(
+        args=[],
+        varargs='my_args',
+        varkw='my_kwargs',
+        defaults=(),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial2, 0, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
@@ -524,10 +576,6 @@ class TestParseFunctionDetails(googletest.TestCase):
 class TestGenerateSignature(googletest.TestCase):
 
   def test_known_object(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     known_object = object()
     reverse_index = {id(known_object): 'location.of.object.in.api'}
 
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index 543b5fa6fefcd8e8dca99ad7eac7cca76781ccd3..55ab5bdd49a427e680221f4864b3f31a65b12e8d 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -101,7 +101,7 @@ def _build_class_page(page_info):
 
     link_template = '[`{short_name}`]({url})'
     parts.append(', '.join(
-        link_template.format(**base.__dict__) for base in page_info.bases))
+        link_template.format(**base._asdict()) for base in page_info.bases))
 
   parts.append('\n\n')
 
@@ -159,7 +159,7 @@ def _build_class_page(page_info):
       h3 = ('<h3 id="{short_name}">'
             '<code>{short_name}</code>'
             '</h3>\n\n')
-      parts.append(h3.format(**method_info.__dict__))
+      parts.append(h3.format(**method_info._asdict()))
 
       if method_info.signature is not None:
         parts.append(_build_signature(method_info, use_full_name=False))
@@ -217,7 +217,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}`]({url}) module'
 
     for item in page_info.modules:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -229,7 +229,7 @@ def _build_module_page(page_info):
     template = '[`class {short_name}`]({url})'
 
     for item in page_info.classes:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -241,7 +241,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}(...)`]({url})'
 
     for item in page_info.functions:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -254,7 +254,7 @@ def _build_module_page(page_info):
     parts.append('## Other Members\n\n')
 
     for item in page_info.other_members:
-      parts.append('`{short_name}`\n\n'.format(**item.__dict__))
+      parts.append('`{short_name}`\n\n'.format(**item._asdict()))
 
   return ''.join(parts)
 
diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py
index 216353ecee377260efd5a19c8536ac41c17592a9..328f42d18f1efb0fd82725a4683abad2df0d5a19 100644
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@@ -44,7 +44,7 @@ class PyGuideParser(object):
 
   def process(self, full_path):
     """Read and process the file at `full_path`."""
-    md_string = open(full_path).read()
+    md_string = open(full_path, 'rb').read().decode('utf-8')
     self._lines = md_string.split('\n')
     seen = set()
 
diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD
index 942ceab85fc8d40d9d4b67537d95204503af8bbe..daa17fbd501651540c4c90c6354eb0a5b2f2b7aa 100644
--- a/tensorflow/tools/git/BUILD
+++ b/tensorflow/tools/git/BUILD
@@ -9,18 +9,3 @@ licenses(["notice"])  # Apache 2.0
 exports_files(
     ["gen_git_source.py"],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 3630dbd740e981971bdc9ff45b756b45095d437d..73dee98bae8946b747e1b28bd14b0a26edc62736 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -114,11 +114,18 @@ def configure(src_base_path, gen_path, debug=False):
   for target, src in link_map.items():
     if src is None:
       open(os.path.join(gen_path, target), "w").write("")
+    elif not os.path.exists(src):
+      # Git repo is configured in a way we don't support such as having
+      # packed refs. Even though in a git repo, tf.__git_version__ will not
+      # be accurate.
+      # TODO(mikecase): Support grabbing git info when using packed refs.
+      open(os.path.join(gen_path, target), "w").write("")
+      spec["git"] = False
     else:
       try:
         # In python 3.5, symlink function exists even on Windows. But requires
         # Windows Admin privileges, otherwise an OSError will be thrown.
-        if hasattr(os, 'symlink'):
+        if hasattr(os, "symlink"):
           os.symlink(src, os.path.join(gen_path, target))
         else:
           shutil.copy2(src, os.path.join(gen_path, target))
@@ -132,7 +139,7 @@ def configure(src_base_path, gen_path, debug=False):
     print("gen_git_source.py: spec is %r" % spec)
 
 
-def get_git_version(git_base_path):
+def get_git_version(git_base_path, git_tag_override):
   """Get the git version from the repository.
 
   This function runs `git describe ...` in the path given as `git_base_path`.
@@ -145,6 +152,9 @@ def get_git_version(git_base_path):
 
   Args:
     git_base_path: where the .git directory is located
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   Returns:
     A bytestring representing the git version
   """
@@ -154,8 +164,16 @@ def get_git_version(git_base_path):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    if git_tag_override:
+      split_val = val.split("-")
+      if len(split_val) != 3:
+        raise Exception(
+            ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
+             "but got '%s'") % val)
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
-  except subprocess.CalledProcessError:
+  except (subprocess.CalledProcessError, OSError):
     return unknown_label
 
 
@@ -171,7 +189,15 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -190,7 +216,7 @@ const int tf_monolithic_build() {
   open(filename, "w").write(contents)
 
 
-def generate(arglist):
+def generate(arglist, git_tag_override=None):
   """Generate version_info.cc as given `destination_file`.
 
   Args:
@@ -210,6 +236,10 @@ def generate(arglist):
   `ref_symlink` is unused in this script but passed, because the build
     system uses that file to detect when commits happen.
 
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
+
   Raises:
     RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
   """
@@ -227,11 +257,11 @@ def generate(arglist):
       raise RuntimeError(
           "Run ./configure again, branch was '%s' but is now '%s'" %
           (old_branch, new_branch))
-    git_version = get_git_version(data["path"])
+    git_version = get_git_version(data["path"], git_tag_override)
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, source_dir, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -239,9 +269,13 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(source_dir, git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -263,6 +297,11 @@ parser.add_argument(
     "--gen_root_path", type=str,
     help="Root path to place generated git files (created by --configure).")
 
+parser.add_argument(
+    "--git_tag_override", type=str,
+    help="Override git tag value in the __git_version__ string. Useful when "
+         "creating release builds before the release tag is created.")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -274,6 +313,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -281,9 +325,12 @@ if args.configure is not None:
     raise RuntimeError("Must pass --gen_root_path arg when running --configure")
   configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
-  generate(args.generate)
+  generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index db20bb00e84b47bd15244e70b925f59e62731deb..cd128af6b36f2f99e5cf91961476d30384227e9b 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -28,7 +28,15 @@ fi
 cat <<EOF > ${OUTPUT_FILENAME}
 #include <string>
 const char* tf_git_version() {return "${GIT_VERSION}";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index ad3668fa02e102607c9a03ac312451a147affdda..1ad1895269022331bfd8156721778f4d68a10ee7 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -91,7 +91,6 @@ cc_library(
     srcs = [
         "add_default_attributes.cc",
         "backports.cc",
-        "fake_quantize_training.cc",
         "flatten_atrous.cc",
         "fold_batch_norms.cc",
         "fold_constants_lib.cc",
@@ -105,7 +104,6 @@ cc_library(
         "remove_attribute.cc",
         "remove_control_dependencies.cc",
         "remove_device.cc",
-        "remove_ema.cc",
         "remove_nodes.cc",
         "rename_attribute.cc",
         "rename_op.cc",
@@ -134,8 +132,8 @@ cc_library(
         "//tensorflow/core:tensorflow",
         "//tensorflow/contrib/rnn:gru_ops_op_lib",
         "//tensorflow/contrib/rnn:lstm_ops_op_lib",
+        "//tensorflow/core/kernels:quantization_utils",
     ] + if_not_windows([
-        "//tensorflow/core/kernels:quantized_ops",
         "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
         "//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
     ]),
@@ -148,7 +146,6 @@ tf_cc_test(
     srcs = [
         "add_default_attributes_test.cc",
         "backports_test.cc",
-        "fake_quantize_training_test.cc",
         "flatten_atrous_test.cc",
         "fold_batch_norms_test.cc",
         "fold_constants_test.cc",
@@ -161,7 +158,6 @@ tf_cc_test(
         "quantize_weights_test.cc",
         "remove_attribute_test.cc",
         "remove_device_test.cc",
-        "remove_ema_test.cc",
         "remove_nodes_test.cc",
         "rename_attribute_test.cc",
         "rename_op_test.cc",
@@ -182,6 +178,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:quantization_utils",
         "//tensorflow/core/kernels:quantized_ops",
         "//tensorflow/core/util/tensor_bundle",
     ],
@@ -316,14 +313,3 @@ tf_py_test(
     ],
     main = "python/transform_graph_test.py",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/graph_transforms/backports_test.cc b/tensorflow/tools/graph_transforms/backports_test.cc
index ab9a61afa7eb1680580c7e0c41f8ff1b47ef6742..80a954e062b06924c6048ac8b011dc1034706e8e 100644
--- a/tensorflow/tools/graph_transforms/backports_test.cc
+++ b/tensorflow/tools/graph_transforms/backports_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -191,7 +192,7 @@ TEST(BackportTensorArrayV3Test, TestBackportTensorArrayV3Subtypes) {
     std::map<string, const NodeDef*> node_lookup;
     MapNamesToNodes(result, &node_lookup);
     ASSERT_EQ(1, node_lookup.count("v3_node"));
-    EXPECT_TRUE(StringPiece(node_lookup.at("v3_node")->op()).ends_with("V2"));
+    EXPECT_TRUE(str_util::EndsWith(node_lookup.at("v3_node")->op(), "V2"));
   }
 }
 
diff --git a/tensorflow/tools/graph_transforms/fake_quantize_training.cc b/tensorflow/tools/graph_transforms/fake_quantize_training.cc
deleted file mode 100644
index 61aecc6e16d817d245421f18fa39c70aa45b2bef..0000000000000000000000000000000000000000
--- a/tensorflow/tools/graph_transforms/fake_quantize_training.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/core/graph/quantize_training.h"
-#include "tensorflow/tools/graph_transforms/transform_utils.h"
-
-namespace tensorflow {
-namespace graph_transforms {
-
-// EXPERIMENTAL: This can change without warning.
-// Rewrites the GraphDef for quantized training.
-// Rewrites the forward pass to include the precision loss with quantization so
-// the model can learn to deal with such loss and achieve better accuracy when
-// it is quantized later for inference.
-// Quantization range information is collected in FakeQuantizeWithMinMaxVars
-// ops.
-//
-// TODO(suharshs): Provide instructions on converting the resulting graph for
-// inference.
-// TODO(suharshs): Implement this using the GTT rather than calling the old
-// prototype function.
-Status FakeQuantizeTraining(const GraphDef& input_graph_def,
-                            const TransformFuncContext& context,
-                            GraphDef* output_graph_def) {
-  // TODO(suharshs): Make num_bits a parameter.
-  const int32 num_bits = 8;
-  // TODO(suharshs): Make quantization op a parameter?
-  const string quant_op_type = "FakeQuantWithMinMaxVars";
-
-  return DoQuantizeTrainingOnGraphDef(input_graph_def, num_bits, quant_op_type,
-                                      output_graph_def);
-}
-
-REGISTER_GRAPH_TRANSFORM("fake_quantize_training", FakeQuantizeTraining);
-
-}  // namespace graph_transforms
-}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fake_quantize_training_test.cc b/tensorflow/tools/graph_transforms/fake_quantize_training_test.cc
deleted file mode 100644
index 5e4ab209e97808c3f42ecf73fb763ef9d7ab1cfe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/graph_transforms/fake_quantize_training_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/cc/ops/math_ops.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/tools/graph_transforms/transform_utils.h"
-
-namespace tensorflow {
-namespace graph_transforms {
-
-// Declare here, so we don't need a public header.
-Status FakeQuantizeTraining(const GraphDef& input_graph_def,
-                            const TransformFuncContext& context,
-                            GraphDef* output_graph_def);
-
-class FakeQuantizeTrainingTest : public ::testing::Test {};
-
-// For now, since the fake_quantize_training transform just calls the
-// quantize_training rewrite from tensorflow/core/graph/quantize_training.h,
-// we just test that the graph has been changed by the transform.
-// TODO(suharshs): Once we implement the fake_quantize_training transform
-// using the GTT, write proper tests of the transform here.
-TEST_F(FakeQuantizeTrainingTest, TransformOccurred) {
-  auto root = tensorflow::Scope::DisabledShapeInferenceScope();
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-
-  Tensor a_data(DT_FLOAT, TensorShape());
-  test::FillIota<float>(&a_data, 1.0f);
-  Output a_const = Const(root.WithOpName("a"), Input::Initializer(a_data));
-
-  Tensor b_data(DT_FLOAT, TensorShape());
-  test::FillIota<float>(&b_data, 1.0f);
-  Output b_const = Const(root.WithOpName("b"), Input::Initializer(b_data));
-
-  Output matmul = MatMul(root.WithOpName("matmul"), a_const, b_const);
-  GraphDef graph_def;
-  TF_ASSERT_OK(root.ToGraphDef(&graph_def));
-
-  GraphDef result;
-  TransformFuncContext context;
-  TF_ASSERT_OK(FakeQuantizeTraining(graph_def, context, &result));
-
-  // Test that the transformation resulted in a graph with more nodes.
-  EXPECT_GT(result.node_size(), graph_def.node_size());
-}
-
-}  // namespace graph_transforms
-}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 250f54e20fba6e24fe95741b1437ac3718ace6fb..85660f94a85dce29360525f7bb7474494b3f010f 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -283,6 +283,10 @@ Status FoldConstants(const GraphDef& input_graph_def,
     };
   }
 
+  TF_RETURN_IF_ERROR(context.GetOneInt64Parameter(
+      "max_constant_size_in_bytes", cf_opts.max_constant_size_in_bytes,
+      &cf_opts.max_constant_size_in_bytes));
+
   // Constant folding.
   bool was_mutated;
   TF_RETURN_IF_ERROR(ConstantFold(cf_opts, nullptr, Env::Default(), nullptr,
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index 41106de008d832a022290e6da38cca8ad6d23ffd..a082399a87dbaad913be421fe273ba89b6f7340e 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -209,10 +210,10 @@ class ConstantFoldingTest : public ::testing::Test {
     for (const NodeDef& node : graph_def.node()) {
       const StringPiece name(node.name());
       const int occurrence_count = folded_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(name, "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(name, "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -370,6 +371,46 @@ class ConstantFoldingTest : public ::testing::Test {
     EXPECT_EQ(0, node_map.count("b"));
     EXPECT_EQ(1, node_map.count("c"));
   }
+
+  void TestMaxConstantSizeInBytes() {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    const int width = 100;
+
+    Tensor a_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&a_data, 1.0f);
+    Output a_const = ::tensorflow::ops::Const(
+        root.WithOpName("a_expect_remains"), Input::Initializer(a_data));
+
+    Tensor b_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&b_data, 1.0f);
+    Output b_const = ::tensorflow::ops::Const(
+        root.WithOpName("b_expect_remains"), Input::Initializer(b_data));
+
+    Output add = ::tensorflow::ops::Add(root.WithOpName("add_expect_remains"),
+                                        a_const, b_const);
+
+    Output placeholder = ::tensorflow::ops::Placeholder(
+        root.WithOpName("placeholder_expect_remains"), DT_FLOAT);
+
+    Output mul = ::tensorflow::ops::Mul(
+        root.WithOpName("output_expect_remains"), add, placeholder);
+
+    GraphDef graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+    Tensor placeholder_tensor(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&placeholder_tensor, 1.0f);
+
+    // Setting the maximum constant size to 10 bytes should stop the constant
+    // folding at add(a, b) that would have yielded a constant of
+    // 100*sizeof(float) bytes.
+    graph_transforms::TransformFuncContext context;
+    context.params["max_constant_size_in_bytes"] = {"10"};
+    TestConstantFolding(graph_def,
+                        {{"placeholder_expect_remains", placeholder_tensor}},
+                        {}, {"output_expect_remains"}, context);
+  }
 };
 
 TEST_F(ConstantFoldingTest, TestSimpleAdd) { TestSimpleAdd(); }
@@ -394,5 +435,9 @@ TEST_F(ConstantFoldingTest, TestRemoveUnusedNodesMultipleOutputs) {
   TestRemoveUnusedNodesMultipleOutputs();
 }
 
+TEST_F(ConstantFoldingTest, TestMaxConstantSizeInBytes) {
+  TestMaxConstantSizeInBytes();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index d89afe85c72883323cec3c14342fd60adebd024d..f1d361e07d8f00aa37a4e063a7d17bf85de74fde 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,6 +159,9 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
+  if (!conv_node.attr().count("data_format")) {
+    CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
+  }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);
@@ -182,6 +185,36 @@ Status FuseBatchNormWithConv(const NodeMatch& match,
   return Status::OK();
 }
 
+Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
+                             std::vector<NodeDef>* new_nodes) {
+  // Calculate the scale and offset values to apply.
+  std::vector<float> scale_values;
+  std::vector<float> offset_values;
+  TF_RETURN_IF_ERROR(
+      GetScaleAndOffsetValues(match, &scale_values, &offset_values));
+
+  // Fuse conv weights, and set the final output node name as batch_norm_node.
+  const NodeDef& batch_norm_node = match.node;
+  const NodeMatch& batch_to_space_node_match = match.inputs[0];
+  const NodeMatch& conv_node_match = batch_to_space_node_match.inputs[0];
+  const NodeDef& batch_to_space_node = batch_to_space_node_match.node;
+  const NodeDef& conv_node = conv_node_match.node;
+
+  string biasadd_name = conv_node.name() + "/biasadd";
+  TF_RETURN_IF_ERROR(
+      FuseScaleOffsetToConvWeights(scale_values, offset_values, conv_node_match,
+                                   biasadd_name , new_nodes));
+
+  NodeDef new_batch_to_space_node = batch_to_space_node;
+  // reuse batch_norm node name
+  new_batch_to_space_node.set_name(batch_norm_node.name());
+  new_batch_to_space_node.set_input(0, biasadd_name);
+  new_nodes->push_back(batch_to_space_node_match.inputs[1].node);
+  new_nodes->push_back(batch_to_space_node_match.inputs[2].node);
+  new_nodes->push_back(new_batch_to_space_node);
+  return Status::OK();
+}
+
 Status FuseBatchNormWithConvConcat(const NodeMatch& match,
                                    std::vector<NodeDef>* new_nodes) {
   // Calculate the scale and offset values to apply.
@@ -284,6 +317,43 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
     current_graph_def = replaced_graph_def;
   } while (did_graph_change);
 
+  do {
+    did_graph_change = false;
+    GraphDef replaced_graph_def;
+    TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
+        current_graph_def,  // clang-format off
+        {"BatchNormWithGlobalNormalization|FusedBatchNorm",    // batch_norm_node
+         {
+             {"BatchToSpaceND",                  // batch_to_space_node
+              {
+                  {"Conv2D",                     // conv_node
+                   {
+                       {"*"},                    // input_node
+                       {"Const"},                // weights_node
+                   }
+                  },
+                  {"Const"},                     // block_shape
+                  {"Const"},                     // crops
+              }
+             },
+             {"Const"},                          // mean_node
+             {"Const"},                          // variance_node
+             {"Const"},                          // beta_node
+             {"Const"},                          // gamma_node
+         }
+        },  // clang-format on
+        [&did_graph_change](const NodeMatch& match,
+                            const std::set<string>& input_nodes,
+                            const std::set<string>& output_nodes,
+                            std::vector<NodeDef>* new_nodes) {
+          TF_RETURN_IF_ERROR(FuseBatchNormWithBatchToSpace(match, new_nodes));
+          did_graph_change = true;
+          return Status::OK();
+        },
+        {}, &replaced_graph_def));
+    current_graph_def = replaced_graph_def;
+  } while (did_graph_change);
+
   do {
     did_graph_change = false;
     GraphDef replaced_graph_def;
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index b30ba9ac8b92db68eb3374c51a7f31b69cd1e3cf..7651a03fe51012678d6d6fc495fd82e497aa512b 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -298,6 +299,96 @@ class FoldOldBatchNormsTest : public ::testing::Test {
   }
 };
 
+void TestFoldFusedBatchNormsWithBatchToSpace() {
+  auto root = tensorflow::Scope::NewRootScope();
+  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+  Tensor input_data(DT_FLOAT, TensorShape({2, 1, 3, 2}));
+  test::FillValues<float>(
+      &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                    -5.0f, -3.0f, -6.0f});
+  Output input_op =
+      Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+  Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+  test::FillValues<float>(&weights_data,
+                          {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+  Output weights_op =
+      Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+  Output conv_op = Conv2D(root.WithOpName("conv_op"), input_op, weights_op,
+                          {1, 1, 1, 1}, "VALID");
+
+  Tensor block_shape_data(DT_INT32, TensorShape({2}));
+  test::FillValues<int32>(&block_shape_data, {1, 2});
+  Output block_shape_op =
+      Const(root.WithOpName("block_shape_op"), Input::Initializer(block_shape_data));
+
+  Tensor crops_data(DT_INT32, TensorShape({2, 2}));
+  test::FillValues<int32>(&crops_data, {0, 0, 0, 1});
+  Output crops_op =
+      Const(root.WithOpName("crops_op"), Input::Initializer(crops_data));
+
+  Output batch_to_space_op = BatchToSpaceND(root.WithOpName("batch_to_space_op"),
+                                            conv_op, block_shape_op, crops_data);
+
+  Tensor mean_data(DT_FLOAT, TensorShape({2}));
+  test::FillValues<float>(&mean_data, {10.0f, 20.0f});
+  Output mean_op =
+      Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+  Tensor variance_data(DT_FLOAT, TensorShape({2}));
+  test::FillValues<float>(&variance_data, {0.25f, 0.5f});
+  Output variance_op = Const(root.WithOpName("variance_op"),
+                             Input::Initializer(variance_data));
+
+  Tensor beta_data(DT_FLOAT, TensorShape({2}));
+  test::FillValues<float>(&beta_data, {0.1f, 0.6f});
+  Output beta_op =
+      Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+  Tensor gamma_data(DT_FLOAT, TensorShape({2}));
+  test::FillValues<float>(&gamma_data, {1.0f, 2.0f});
+  Output gamma_op =
+      Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+  GraphDef original_graph_def;
+  TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+  NodeDef batch_norm_node;
+  batch_norm_node.set_op("FusedBatchNorm");
+  batch_norm_node.set_name("output");
+  AddNodeInput("batch_to_space_op", &batch_norm_node);
+  AddNodeInput("gamma_op", &batch_norm_node);
+  AddNodeInput("beta_op", &batch_norm_node);
+  AddNodeInput("mean_op", &batch_norm_node);
+  AddNodeInput("variance_op", &batch_norm_node);
+  SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+  SetNodeAttr("epsilon", 0.00001f, &batch_norm_node);
+  SetNodeAttr("is_training", false, &batch_norm_node);
+  *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+
+  std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+  TF_ASSERT_OK(original_session->Create(original_graph_def));
+  std::vector<Tensor> original_outputs;
+  TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+  GraphDef fused_graph_def;
+  TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                 &fused_graph_def));
+
+  std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+  TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+  std::vector<Tensor> fused_outputs;
+  TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+  test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+  for (const NodeDef& node : fused_graph_def.node()) {
+    EXPECT_NE("FusedBatchNormWithBatchToSpace", node.op());
+  }
+}
+
 TEST_F(FoldOldBatchNormsTest, TestFoldOldBatchNorms) {
   TestFoldOldBatchNorms();
 }
@@ -307,7 +398,7 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNorms) {
 }
 
 TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithConcat) {
-  // Test axis is not 3, so all weigths and offsets are fused to each of inputs
+  // Test axis is not 3, so all weights and offsets are fused to each of inputs
   // of conv2d.
   TestFoldFusedBatchNormsWithConcat(/*split=*/true);
   // Test axis = 3, BatchNorm weights and offsets will be split before fused
@@ -315,5 +406,9 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithConcat) {
   TestFoldFusedBatchNormsWithConcat(/*split=*/false);
 }
 
+TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithBatchToSpace) {
+  TestFoldFusedBatchNormsWithBatchToSpace();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
index 2436c7e4a2dc5c8172de2a35abbfc551d6e410fd..f401723808c086bd69743b75b8b4d972e8ab0b83 100644
--- a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
+++ b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
@@ -40,8 +40,8 @@ Status ExtractMinMaxRecords(const string& log_file_name,
   for (const string& file_line : file_lines) {
     // We expect to find a line with components separated by semicolons, so to
     // start make sure that the basic structure is in place/
-    StringPiece line(file_line);
-    if (!line.contains(print_suffix + ";" + requant_prefix)) {
+    if (!str_util::StrContains(file_line,
+                               print_suffix + ";" + requant_prefix)) {
       continue;
     }
     std::vector<string> line_parts = str_util::Split(file_line, ';');
@@ -53,8 +53,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
     bool min_max_found = false;
     int min_max_index;
     for (int i = 1; i < line_parts.size(); ++i) {
-      StringPiece line_part(line_parts[i]);
-      if (line_part.starts_with(requant_prefix)) {
+      if (str_util::StartsWith(line_parts[i], requant_prefix)) {
         min_max_found = true;
         min_max_index = i;
       }
@@ -90,7 +89,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
       continue;
     }
     StringPiece name_string = line_parts[min_max_index - 1];
-    if (!name_string.ends_with(print_suffix)) {
+    if (!str_util::EndsWith(name_string, print_suffix)) {
       continue;
     }
     string name =
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index e1ee2b420b062937b5e50c10a05406df3cbd7977..377665448c244aeace78f231ba0c263613afd9a0 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -101,7 +102,7 @@ Status InsertLogging(const GraphDef& input_graph_def,
     const bool op_matches = (ops.count(node.op()) > 0);
     bool prefix_matches = false;
     for (const string& prefix : prefixes) {
-      if (StringPiece(node.name()).starts_with(prefix)) {
+      if (str_util::StartsWith(node.name(), prefix)) {
         prefix_matches = true;
       }
     }
diff --git a/tensorflow/tools/graph_transforms/remove_ema.cc b/tensorflow/tools/graph_transforms/remove_ema.cc
deleted file mode 100644
index 22e26267025c3fed4f44ffbc09d55d8d355cc448..0000000000000000000000000000000000000000
--- a/tensorflow/tools/graph_transforms/remove_ema.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/tools/graph_transforms/transform_utils.h"
-
-namespace tensorflow {
-namespace graph_transforms {
-
-// EXPERIMENTAL: This can change without warning.
-// Given a graph that has gone through the FakeQuantizeTraining transform and
-// has been frozen afterwards, RemoveEMA simplifies the FakeQuantize estimated
-// moving average subgraphs to make it compatible with the QuantizeNodes
-// transform.
-Status RemoveEMA(const GraphDef& input_graph_def,
-                 const TransformFuncContext& context,
-                 GraphDef* output_graph_def) {
-  TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
-      input_graph_def,  // clang-format off
-      {"FakeQuantWithMinMaxVars",
-       {
-         {"*"},
-         {"Assign",
-          {
-            {"Const"},
-            {"Merge",
-             {
-               {"Switch",
-                {
-                  {"Min",
-                   {
-                     {"*"},
-                     {"Range",
-                      {
-                        {"*"},
-                        {"*"},
-                        {"*"},
-                      }
-                     }
-                   }
-                  },
-                  {"IsVariableInitialized"}
-                }
-               },
-               {"Sub",
-                {
-                  {"Const"},
-                  {"Mul",
-                   {
-                     {"Sub"},
-                     {"Sub",
-                      {
-                        {"Const"},
-                        {"Const"}
-                      }
-                     }
-                   }
-                  }
-                }
-               }
-             }
-            }
-          }
-         },
-         {"Assign",
-          {
-            {"Const"},
-            {"Merge",
-             {
-               {"Switch",
-                {
-                  {"Max"},
-                  {"IsVariableInitialized"}
-                }
-               },
-               {"Sub",
-                {
-                  {"Const"},
-                  {"Mul",
-                   {
-                     {"Sub"},
-                     {"Sub",
-                      {
-                        {"Const"},
-                        {"Const"}
-                      }
-                     }
-                   }
-                  }
-                }
-               }
-             }
-            }
-          }
-         },
-       }
-      },  // clang-format on
-      [](const NodeMatch& match, const std::set<string>& input_nodes,
-         const std::set<string>& output_nodes,
-         std::vector<NodeDef>* new_nodes) {
-        const NodeDef& fake_quant_node = match.node;
-        const NodeDef& input_node = match.inputs[0].node;
-        const NodeDef& min_var_node = match.inputs[1].inputs[0].node;
-        const NodeDef& max_var_node = match.inputs[2].inputs[0].node;
-
-        // Make a new FakeQuantizeWithMinMaxVars operation that uses constants
-        // for its min/max arguments rather than an entire EMA subgraph.
-        NodeDef new_fake_quant_node;
-        new_fake_quant_node.set_op(fake_quant_node.op());
-        new_fake_quant_node.set_name(fake_quant_node.name());
-        AddNodeInput(input_node.name(), &new_fake_quant_node);
-        AddNodeInput(min_var_node.name(), &new_fake_quant_node);
-        AddNodeInput(max_var_node.name(), &new_fake_quant_node);
-        CopyNodeAttr(fake_quant_node, "narrow_range", "narrow_range",
-                     &new_fake_quant_node);
-        CopyNodeAttr(fake_quant_node, "num_bits", "num_bits",
-                     &new_fake_quant_node);
-
-        new_nodes->push_back(new_fake_quant_node);
-        new_nodes->push_back(input_node);
-        new_nodes->push_back(min_var_node);
-        new_nodes->push_back(max_var_node);
-
-        return Status::OK();
-      },
-      {}, output_graph_def));
-  return Status::OK();
-}
-
-REGISTER_GRAPH_TRANSFORM("remove_ema", RemoveEMA);
-
-}  // namespace graph_transforms
-}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/remove_ema_test.cc b/tensorflow/tools/graph_transforms/remove_ema_test.cc
deleted file mode 100644
index 27db90e2729487f89324622f7a63aca1c5a58fe7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/graph_transforms/remove_ema_test.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/cc/ops/math_ops.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/tools/graph_transforms/transform_utils.h"
-
-namespace tensorflow {
-namespace graph_transforms {
-
-// Declare transformations here, so we don't need a public header.
-Status FakeQuantizeTraining(const GraphDef& input_graph_def,
-                            const TransformFuncContext& context,
-                            GraphDef* output_graph_def);
-
-Status RemoveEMA(const GraphDef& input_graph_def,
-                 const TransformFuncContext& context,
-                 GraphDef* output_graph_def);
-
-Status QuantizeNodes(const GraphDef& input_graph_def,
-                     const TransformFuncContext& context,
-                     GraphDef* output_graph_def);
-
-class RemoveEMATest : public ::testing::Test {};
-
-TEST_F(RemoveEMATest, FakeQuant_RemoveEMA_QuantizeTraining) {
-  // Build a small graph.
-  auto root = tensorflow::Scope::NewRootScope();
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-
-  Tensor a_data(DT_FLOAT, TensorShape({1, 1}));
-  test::FillIota<float>(&a_data, 1.0f);
-  Output a_const = Const(root.WithOpName("a"), Input::Initializer(a_data));
-
-  Tensor b_data(DT_FLOAT, TensorShape({1, 1}));
-  test::FillIota<float>(&b_data, 1.0f);
-  Output b_const = Const(root.WithOpName("b"), Input::Initializer(b_data));
-
-  Output matmul = MatMul(root.WithOpName("matmul"), a_const, b_const);
-  GraphDef graph_def;
-  TF_ASSERT_OK(root.ToGraphDef(&graph_def));
-
-  // (1) FakeQuantize the graph.
-  GraphDef fake_quantized_graph_def;
-  TransformFuncContext context;
-  TF_ASSERT_OK(
-      FakeQuantizeTraining(graph_def, context, &fake_quantized_graph_def));
-
-  // Test that the transformation resulted in a graph with more nodes.
-  EXPECT_GT(fake_quantized_graph_def.node_size(), graph_def.node_size());
-
-  // (2) Run the graph to initialize the newly added variables.
-  std::unique_ptr<Session> session(NewSession(SessionOptions()));
-  TF_ASSERT_OK(session->Create(fake_quantized_graph_def));
-  std::vector<Tensor> outputs;
-  TF_ASSERT_OK(session->Run({}, {"matmul"}, {}, &outputs));
-
-  // (3) Freeze the graph. Create a "frozen graph" that matches what we would
-  // expect if we actually froze the above graph.
-  // TODO(suharshs): Use a c++ freeze graph alternative, when one is available.
-  GraphDef frozen_graph_def;
-  for (const NodeDef& node : fake_quantized_graph_def.node()) {
-    if (node.op() == "Variable" || node.op() == "VariableV2") {
-      NodeDef const_node;
-      const_node.set_op("Const");
-      const_node.set_name(node.name());
-      SetNodeAttr("dtype", DT_FLOAT, &const_node);
-      Tensor tensor(DT_FLOAT, {});
-      tensor.flat<float>()(0) = 1.0f;
-      SetNodeTensorAttr<float>("value", tensor, &const_node);
-      *(frozen_graph_def.mutable_node()->Add()) = const_node;
-    } else {
-      *(frozen_graph_def.mutable_node()->Add()) = node;
-    }
-  }
-
-  // Test that freezing the graph resulted in a graph with the same number of
-  // nodes.
-  EXPECT_EQ(frozen_graph_def.node_size(), fake_quantized_graph_def.node_size());
-
-  // (4) RemoveEMA on the graph to make it compatible with QuantizeNodes.
-  GraphDef removed_ema_graph_def;
-  TF_ASSERT_OK(RemoveEMA(frozen_graph_def, context, &removed_ema_graph_def));
-
-  // Test that the transformation resulted in a graph with less nodes.
-  EXPECT_LT(removed_ema_graph_def.node_size(), frozen_graph_def.node_size());
-
-  // (5) QuantizeNodes and inspect the final graph.
-  // TODO(suharshs): Add a more thorough inspection of the structure of
-  // the output graph.
-  GraphDef quantized_graph_def;
-  TF_ASSERT_OK(
-      QuantizeNodes(removed_ema_graph_def, context, &quantized_graph_def));
-
-  // Test that the transformation resulted in a graph with more nodes.
-  EXPECT_GT(quantized_graph_def.node_size(), removed_ema_graph_def.node_size());
-
-  // Make sure that the FakeQuantizeWithMinMaxVars op has been removed.
-  for (const NodeDef& node : quantized_graph_def.node()) {
-    EXPECT_NE(node.op(), "FakeQuantWithMinMaxVars");
-  }
-}
-
-}  // namespace graph_transforms
-}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 701e350fc39d083665f5420e6b73510c182e12ce..cc82100148117c7846ba5781e1a97e172ad7f03c 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -88,7 +89,7 @@ void CreateConstNode(const Tensor& tensor, const string& name,
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
   std::vector<string> names = Split(tensor_slice_name, "/");
-  if (StringPiece(names[names.size() - 1]).starts_with("part_")) {
+  if (str_util::StartsWith(names[names.size() - 1], "part_")) {
     CHECK_GE(names.size(), 2);
     names.pop_back();
   }
@@ -102,8 +103,8 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
   for (const auto& node : input_graph_def.node()) {
     std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
-        StringPiece(node_name_parts[0]).starts_with("save") &&
-        StringPiece(node_name_parts[1]).starts_with("Assign") &&
+        str_util::StartsWith(node_name_parts[0], "save") &&
+        str_util::StartsWith(node_name_parts[1], "Assign") &&
         node.input(0) == target_name) {
       restore_node_name = node.input(1);
       break;
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index 28387c2b48c06ecffd2afa0705a8dea5bc368460..8ce8f5e24b9f002e50d456c8ccab8a6414fca724 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
+#if !defined(PLATFORM_WINDOWS)
+#include <pwd.h>
+#endif
 
 namespace tensorflow {
 namespace graph_transforms {
@@ -130,16 +133,64 @@ Status ParseTransformParameters(const string& transforms_string,
   return Status::OK();
 }
 
+std::string ExpandPath(const std::string& path_string) {
+#if defined(PLATFORM_WINDOWS)
+  return path_string;
+#else
+  if (path_string.empty() || path_string[0] != '~') {
+    return path_string;
+  }
+
+  const char* home = NULL;
+  std::string::size_type prefix = path_string.find_first_of('/');
+  if (path_string.length() == 1 || prefix == 1) {
+    // The value of $HOME, e.g., ~/foo
+    home = getenv("HOME");
+    if (!home) {
+      // If HOME is not available, get uid
+      struct passwd* pw = getpwuid(getuid());
+      if (pw) {
+        home = pw->pw_dir;
+      }
+    }
+  } else {
+    // The value of ~user, e.g., ~user/foo
+    std::string user(path_string, 1, (prefix == std::string::npos)
+                                         ? std::string::npos
+                                         : prefix - 1);
+    struct passwd* pw = getpwnam(user.c_str());
+    if (pw) {
+      home = pw->pw_dir;
+    }
+  }
+
+  if (!home) {
+    return path_string;
+  }
+
+  string path(home);
+  if (prefix == std::string::npos) {
+    return path;
+  }
+
+  if (path.length() == 0 || path[path.length() - 1] != '/') {
+    path += '/';
+  }
+  path += path_string.substr(prefix + 1);
+  return path;
+#endif
+}
+
 int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
-  string in_graph = "";
-  string out_graph = "";
+  string in_graph_string = "";
+  string out_graph_string = "";
   string inputs_string = "";
   string outputs_string = "";
   string transforms_string = "";
   bool output_as_text = false;
   std::vector<Flag> flag_list = {
-      Flag("in_graph", &in_graph, "input graph file name"),
-      Flag("out_graph", &out_graph, "output graph file name"),
+      Flag("in_graph", &in_graph_string, "input graph file name"),
+      Flag("out_graph", &out_graph_string, "output graph file name"),
       Flag("inputs", &inputs_string, "inputs"),
       Flag("outputs", &outputs_string, "outputs"),
       Flag("transforms", &transforms_string, "list of transforms"),
@@ -166,11 +217,11 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     LOG(ERROR) << "Unknown argument " << argv[1] << ".\n" << usage;
     return -1;
   }
-  if (in_graph.empty()) {
+  if (in_graph_string.empty()) {
     LOG(ERROR) << "in_graph graph can't be empty.\n" << usage;
     return -1;
   }
-  if (out_graph.empty()) {
+  if (out_graph_string.empty()) {
     LOG(ERROR) << "out_graph graph can't be empty.\n" << usage;
     return -1;
   }
@@ -179,6 +230,9 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     return -1;
   }
 
+  string in_graph = ExpandPath(in_graph_string);
+  string out_graph = ExpandPath(out_graph_string);
+
   std::vector<string> inputs = str_util::Split(inputs_string, ',');
   std::vector<string> outputs = str_util::Split(outputs_string, ',');
   TransformParameters transform_params;
@@ -197,7 +251,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
   GraphDef graph_def;
   Status load_status = LoadTextOrBinaryGraphFile(in_graph, &graph_def);
   if (!load_status.ok()) {
-    LOG(ERROR) << "Loading graph '" << in_graph << "' failed with "
+    LOG(ERROR) << "Loading graph '" << in_graph_string << "' failed with "
                << load_status.error_message();
     LOG(ERROR) << usage;
     return -1;
@@ -219,7 +273,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     save_status = WriteBinaryProto(Env::Default(), out_graph, graph_def);
   }
   if (!save_status.ok()) {
-    LOG(ERROR) << "Saving graph '" << out_graph << "' failed with "
+    LOG(ERROR) << "Saving graph '" << out_graph_string << "' failed with "
                << save_status.error_message();
     return -1;
   }
diff --git a/tensorflow/tools/graph_transforms/transform_graph_test.cc b/tensorflow/tools/graph_transforms/transform_graph_test.cc
index bc2412fcbdba90731318eea1a2239aa914b35ffc..b276229aa44f747ee81ebcdfe204468929c9eb53 100644
--- a/tensorflow/tools/graph_transforms/transform_graph_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -112,12 +113,11 @@ class TransformGraphTest : public ::testing::Test {
     graph_transforms::MapNamesToNodes(out_graph_def, &out_node_map);
 
     for (const NodeDef& node : out_graph_def.node()) {
-      const StringPiece name(node.name());
       const int occurrence_count = out_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(node.name(), "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(node.name(), "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -139,7 +139,7 @@ class TransformGraphTest : public ::testing::Test {
     Status no_such_status =
         TransformGraph({}, {}, {{"test_no_such_transform", {}}}, &graph_def);
     EXPECT_TRUE(
-        StringPiece(no_such_status.ToString()).contains("not recognized"));
+        str_util::StrContains(no_such_status.ToString(), "not recognized"));
   }
 
   void TestParseTransformParameters() {
diff --git a/tensorflow/tools/graph_transforms/transform_utils.cc b/tensorflow/tools/graph_transforms/transform_utils.cc
index 55f28a9e1d8d639a316c9bd121204d603217dea3..367048965d146d782267f23330a435ae72f7f49a 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.cc
+++ b/tensorflow/tools/graph_transforms/transform_utils.cc
@@ -88,7 +88,7 @@ void NodeNamePartsFromInput(const string& input_name, string* prefix,
     *suffix = ":" + input_parts[1];
   }
   StringPiece node_name_piece(input_parts[0]);
-  if (node_name_piece.Consume("^")) {
+  if (str_util::ConsumePrefix(&node_name_piece, "^")) {
     *prefix = "^";
   } else {
     *prefix = "";
@@ -200,8 +200,7 @@ Status SortByExecutionOrder(const GraphDef& input_graph_def,
       // for merge only wait for one non-control input.
       int32 num_control_edges = 0;
       for (int i = 0; i < node_def.input_size(); ++i) {
-        StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(node_def.input(i), "^")) {
           num_control_edges++;
         }
       }
@@ -504,7 +503,7 @@ Status RenameNodeInputs(const GraphDef& input_graph_def,
           const string& dest_name = input_to_rename.second;
           bool is_match;
           string match_name;
-          if (StringPiece(source_name).ends_with(":*")) {
+          if (str_util::EndsWith(source_name, ":*")) {
             is_match = true;
             string prefix;
             string unused_node_name;
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 614457e8996491a60d4a7df213180117bce321ad..569b6678cabddd17eaf21921d668221f1a865625 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -27,6 +27,7 @@ pkg_tar(
         ":cheaders",
         ":clib",
         ":clicenses",
+        ":eager_cheaders",
     ],
 )
 
@@ -57,7 +58,6 @@ pkg_tar(
     name = "cheaders",
     files = [
         "//tensorflow/c:headers",
-        "//tensorflow/c/eager:headers",
     ],
     package_dir = "include/tensorflow/c",
     # Mark as "manual" till
@@ -68,6 +68,20 @@ pkg_tar(
     tags = ["manual"],
 )
 
+pkg_tar(
+    name = "eager_cheaders",
+    files = [
+        "//tensorflow/c/eager:headers",
+    ],
+    package_dir = "include/tensorflow/c/eager",
+    # Mark as "manual" till
+    # https://github.com/bazelbuild/bazel/issues/2352
+    # and https://github.com/bazelbuild/bazel/issues/1580
+    # are resolved, otherwise these rules break when built
+    # with Python 3.
+    tags = ["manual"],
+)
+
 pkg_tar(
     name = "clib",
     files = ["//tensorflow:libtensorflow.so"],
@@ -104,6 +118,7 @@ genrule(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
@@ -124,7 +139,6 @@ genrule(
         "@zlib_archive//:zlib.h",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-        "@mkl//:LICENSE",
     ]),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
@@ -142,6 +156,7 @@ genrule(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
@@ -162,7 +177,6 @@ genrule(
         "@zlib_archive//:zlib.h",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-        "@mkl//:LICENSE",
     ]),
     outs = ["include/tensorflow/jni/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
diff --git a/tensorflow/tools/lib_package/README.md b/tensorflow/tools/lib_package/README.md
index 700814826035a3274739f1d3784140ea0a59b36f..cb6aef262456a202b93a2787626688b7712e2352 100644
--- a/tensorflow/tools/lib_package/README.md
+++ b/tensorflow/tools/lib_package/README.md
@@ -35,8 +35,8 @@ The following commands:
 bazel test --config opt //tensorflow/tools/lib_package:libtensorflow_test
 bazel build --config opt \
   //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz \
-  //tensorflow/tools/lib_package:libtensorflow.jar \
-  //tensorflow/tools/lib_package:libtensorflow-src.jar
+  //tensorflow/java:libtensorflow.jar \
+  //tensorflow/java:libtensorflow-src.jar
 ```
 
 test and produce the following:
@@ -44,9 +44,9 @@ test and produce the following:
 -   The native library (`libtensorflow_jni.so`) packaged in an archive at:
     `bazel-bin/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz`
 -   The Java archive at:
-    `bazel-bin/tensorflow/tools/lib_package/libtensorflow.jar`
+    `bazel-bin/tensorflow/java/libtensorflow.jar`
 -   The Java archive for Java sources at:
-    `bazel-bin/tensorflow/tools/lib_package/libtensorflow-src.jar`
+    `bazel-bin/tensorflow/java/libtensorflow-src.jar`
 
 ## Release
 
diff --git a/tensorflow/tools/mlpbtxt/BUILD b/tensorflow/tools/mlpbtxt/BUILD
index f9f48c6500cee99dce1f5c9ffe6284e578e82669..89c683c8c422b7341517d80f7c55cceb1636a657 100644
--- a/tensorflow/tools/mlpbtxt/BUILD
+++ b/tensorflow/tools/mlpbtxt/BUILD
@@ -32,15 +32,3 @@ tf_cc_binary(
         "//tensorflow/core:op_gen_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index fb6eaa4faa28b4f6b17e1774907c0c9ff58d6ada..677ea65edd91df9eef2347ab305f47a05f6cedaa 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -48,36 +48,68 @@ py_binary(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
+COMMON_PIP_DEPS = [
+    ":licenses",
+    "MANIFEST.in",
+    "README",
+    "setup.py",
+    ":included_headers",
+    "//tensorflow:tensorflow_py",
+    "//tensorflow/contrib/autograph:autograph",
+    "//tensorflow/contrib/autograph/converters:converters",
+    "//tensorflow/contrib/autograph/converters:test_lib",
+    "//tensorflow/contrib/autograph/impl:impl",
+    "//tensorflow/contrib/autograph/pyct:pyct",
+    "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
+    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
+    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
+    "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+    "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+    "//tensorflow/contrib/eager/python/examples:examples_pip",
+    "//tensorflow/contrib/eager/python:evaluator",
+    "//tensorflow/contrib/gan:gan",
+    "//tensorflow/contrib/graph_editor:graph_editor_pip",
+    "//tensorflow/contrib/keras:keras",
+    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+    "//tensorflow/contrib/nn:nn_py",
+    "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto_pip",
+    "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rpc:rpc_pip",
+    "//tensorflow/contrib/session_bundle:session_bundle_pip",
+    "//tensorflow/contrib/signal:signal_py",
+    "//tensorflow/contrib/signal:test_util",
+    "//tensorflow/contrib/slim:slim",
+    "//tensorflow/contrib/slim/python/slim/data:data_pip",
+    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
+    "//tensorflow/contrib/specs:specs",
+    "//tensorflow/contrib/summary:summary_test_util",
+    "//tensorflow/contrib/tensor_forest:init_py",
+    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
+    "//tensorflow/contrib/timeseries:timeseries_pip",
+    "//tensorflow/contrib/tpu",
+    "//tensorflow/examples/tutorials/mnist:package",
+    "//tensorflow/python:distributed_framework_test_lib",
+    "//tensorflow/python:meta_graph_testdata",
+    "//tensorflow/python:spectral_ops_test_util",
+    "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/debug:debug_pip",
+    "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
+    "//tensorflow/python/saved_model:saved_model",
+    "//tensorflow/python/tools:tools_pip",
+    "//tensorflow/python:test_ops",
+    "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
+]
+
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = [
-        "MANIFEST.in",
-        "README",
-        "setup.py",
-        ":included_headers",
-        "//tensorflow/contrib/nn:nn_py",
-        "//tensorflow/contrib/session_bundle:session_bundle_pip",
-        "//tensorflow/contrib/signal:signal_py",
-        "//tensorflow/contrib/slim/python/slim/data:data_pip",
-        "//tensorflow/python:util_example_parser_configuration",
-        "//tensorflow/python/debug:debug_pip",
-        "//tensorflow/python/saved_model",
-        "//tensorflow/python:spectral_ops_test_util",
-        "//tensorflow/python/tools:tools_pip",
-        "//tensorflow/python/eager:eager_pip",
-        "//tensorflow/contrib/summary:summary_test_util",
-        # These targets don't build on Windows yet. Exclude them for now.
-        # "//tensorflow/contrib/slim",
-        # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-        # "//tensorflow/contrib/specs",
-        # "//tensorflow/contrib/tensor_forest:init_py",
-        # "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-        # "//tensorflow/examples/tutorials/mnist:package",
-    ],
+    data = COMMON_PIP_DEPS,
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -97,6 +129,7 @@ filegroup(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
@@ -108,10 +141,13 @@ filegroup(
         "@highwayhash//:LICENSE",
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
+        "@kafka//:LICENSE",
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
+        "@local_config_nccl//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@grpc//third_party/address_sorting:LICENSE",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@pcre//:LICENCE",
@@ -125,9 +161,6 @@ filegroup(
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-        "@mkl//:LICENSE",
-    ]) + if_not_windows([
-        "@nccl_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
 )
 
@@ -137,60 +170,13 @@ sh_binary(
     data = select({
         "//tensorflow:windows": [":simple_console_for_windows"],
         "//tensorflow:windows_msvc": [":simple_console_for_windows"],
-        "//conditions:default": [
-            ":licenses",
-            "MANIFEST.in",
-            "README",
-            "setup.py",
-            ":included_headers",
+        "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
-            "//tensorflow:tensorflow_py",
-            "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-            "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
-            "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
-            "//tensorflow/contrib/data/python/ops:contrib_op_loader",
-            "//tensorflow/contrib/eager/python/examples:examples_pip",
-            "//tensorflow/contrib/eager/python:checkpointable_utils",
-            "//tensorflow/contrib/eager/python:evaluator",
-            "//tensorflow/contrib/gan:gan",
-            "//tensorflow/contrib/graph_editor:graph_editor_pip",
-            "//tensorflow/contrib/keras:keras",
-            "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+            "//tensorflow/contrib/lite/python:interpreter_test_data",
+            "//tensorflow/contrib/lite/python:tf_lite_py_pip",
             "//tensorflow/contrib/lite/toco:toco",
             "//tensorflow/contrib/lite/toco/python:toco_wrapper",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-            "//tensorflow/contrib/nn:nn_py",
-            "//tensorflow/contrib/predictor:predictor_pip",
-            "//tensorflow/contrib/py2tf:py2tf",
-            "//tensorflow/contrib/py2tf/converters:converters",
-            "//tensorflow/contrib/py2tf/converters:test_lib",
-            "//tensorflow/contrib/py2tf/impl:impl",
-            "//tensorflow/contrib/py2tf/pyct:pyct",
-            "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis",
-            "//tensorflow/contrib/receptive_field:receptive_field_pip",
-            "//tensorflow/contrib/session_bundle:session_bundle_pip",
-            "//tensorflow/contrib/signal:signal_py",
-            "//tensorflow/contrib/signal:test_util",
-            "//tensorflow/contrib/slim:slim",
-            "//tensorflow/contrib/slim/python/slim/data:data_pip",
-            "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-            "//tensorflow/contrib/specs:specs",
-            "//tensorflow/contrib/summary:summary_test_util",
-            "//tensorflow/contrib/tensor_forest:init_py",
-            "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-            "//tensorflow/contrib/timeseries:timeseries_pip",
-            "//tensorflow/contrib/tpu",
-            "//tensorflow/examples/tutorials/mnist:package",
-            "//tensorflow/python:distributed_framework_test_lib",
-            "//tensorflow/python:meta_graph_testdata",
-            "//tensorflow/python:spectral_ops_test_util",
-            "//tensorflow/python:util_example_parser_configuration",
-            "//tensorflow/python/debug:debug_pip",
-            "//tensorflow/python/eager:eager_pip",
-            "//tensorflow/python/saved_model:saved_model",
-            "//tensorflow/python/tools:tools_pip",
-            "//tensorflow/python:test_ops",
-            "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index dc31e4c5f703b29f464519d5f1fd54f9b5e11690..8f0cf8c3d194807b6c82f50b5ac8c7fe7527fea5 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -139,7 +139,9 @@ function main() {
     fi
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
+    # TODO(aselle): Re-enable this when we find a way to do it without doubling
+    # the whl size (over the limit).
+    # cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
@@ -160,7 +162,9 @@ function main() {
 
   # Before we leave the top-level directory, make sure we know how to
   # call python.
-  source tools/python_bin_path.sh
+  if [[ -e tools/python_bin_path.sh ]]; then
+    source tools/python_bin_path.sh
+  fi
 
   pushd ${TMPDIR}
   rm -f MANIFEST
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 73d759eb130633094b402c821cc32eb76c076a44..e2518f6cbf0beb0943e5b7289796459d14992bfc 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -58,6 +58,10 @@ BLACKLIST = [
     # contrib
     "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
     "//tensorflow/contrib/keras:testing_utils",
+    "//tensorflow/contrib/lite/python:interpreter",
+    "//tensorflow/contrib/lite/python:interpreter_test",
+    "//tensorflow/contrib/lite/python:interpreter.py",
+    "//tensorflow/contrib/lite/python:interpreter_test.py",
     "//tensorflow/contrib/ffmpeg:test_data",
     "//tensorflow/contrib/factorization/examples:mnist",
     "//tensorflow/contrib/factorization/examples:mnist.py",
@@ -71,6 +75,7 @@ BLACKLIST = [
     "//tensorflow/contrib/timeseries/examples:data/period_trend.csv",  # pylint:disable=line-too-long
     "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
     "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/image:sparse_image_warp_test_data",
 ]
 
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 4b6f123daa7b528173234a2bffd30ead2aa9fc0e..937d41c36ca33aca6fd553eacd1283730bb58d4b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -22,27 +22,34 @@ import os
 import re
 import sys
 
-from setuptools import find_packages, setup, Command
+from setuptools import Command
+from setuptools import find_packages
+from setuptools import setup
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.6.0-rc1'
+_VERSION = '1.8.0-rc1'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'grpcio >= 1.8.6',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorboard >= 1.6.0, < 1.7.0',
+    'tensorboard >= 1.7.0, < 1.8.0',
     'termcolor >= 1.1.0',
 ]
 
+if sys.byteorder == 'little':
+  # grpcio does not build correctly on big-endian machines due to lack of
+  # BoringSSL support.
+  # See https://github.com/tensorflow/tensorflow/issues/17882.
+  REQUIRED_PACKAGES.append('grpcio >= 1.8.6')
+
 project_name = 'tensorflow'
 if '--project_name' in sys.argv:
   project_name_idx = sys.argv.index('--project_name')
@@ -62,7 +69,7 @@ else:
 if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
-      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.7.0a0, < 1.8.0a0'
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.9.0a0, < 1.10.0a0'
       break
 
 # weakref.finalize and enum were introduced in Python 3.4
@@ -72,7 +79,7 @@ if sys.version_info < (3, 4):
 
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
-    'freeze_graph = tensorflow.python.tools.freeze_graph:main',
+    'freeze_graph = tensorflow.python.tools.freeze_graph:run_main',
     'toco_from_protos = tensorflow.contrib.lite.toco.python.toco_from_protos:main',
     'toco = tensorflow.contrib.lite.toco.python.toco_wrapper:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
@@ -92,7 +99,9 @@ TEST_PACKAGES = [
     'scipy >= 0.15.1',
 ]
 
+
 class BinaryDistribution(Distribution):
+
   def has_ext_modules(self):
     return True
 
@@ -174,9 +183,9 @@ class InstallHeaders(Command):
 
 def find_files(pattern, root):
   """Return all the files matching pattern below root dir."""
-  for path, _, files in os.walk(root):
+  for dirpath, _, files in os.walk(root):
     for filename in fnmatch.filter(files, pattern):
-      yield os.path.join(path, filename)
+      yield os.path.join(dirpath, filename)
 
 
 matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
@@ -200,8 +209,7 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
            list(find_files('*.h', 'tensorflow/stream_executor')) +
            list(find_files('*.h', 'google/protobuf_archive/src')) +
            list(find_files('*', 'third_party/eigen3')) +
-           list(find_files('*', 'external/eigen_archive')) +
-           list(find_files('*.h', 'external/nsync/public')))
+           list(find_files('*', 'external/eigen_archive')))
 
 setup(
     name=project_name,
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index 39c4aac1e8b5dfb2582115881c7d10ca3cd04f68..31e8fb9120c3b6280911f836eb0b68b883f2ac9d 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -75,9 +75,14 @@ tf_proto_library_cc(
 )
 
 tf_generate_proto_text_sources(
-    name = "test_proto_text_srcs",
+    name = "test_proto_text",
     srcs = ["test.proto"],
     srcs_relative_dir = "tensorflow/tools/proto_text/",
+    deps = [
+        ":test_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
 )
 
 tf_cc_test(
@@ -96,18 +101,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index f0bb59acf801ba586fa8258b5b1ad9f202f014bf..234afe879bc72869e5581665819c041ff59fbd1c 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -130,7 +130,11 @@ int MainImpl(int argc, char** argv) {
 
       const string path = output_root + "/" + proto_path_no_suffix + suffix;
       FILE* f = fopen(path.c_str(), "w");
-      if (f == nullptr) return -1;
+      if (f == nullptr) {
+        // We don't expect this output to be generated. It was specified in the
+        // list of sources solely to satisfy a proto import dependency.
+        continue;
+      }
       if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
         fclose(f);
         return -1;
diff --git a/tensorflow/tools/quantization/BUILD b/tensorflow/tools/quantization/BUILD
index e99ad06a06294c4d037b76ea9450e51bd795e79d..17443a8617451cb9b09867e132855d6316d8e423 100644
--- a/tensorflow/tools/quantization/BUILD
+++ b/tensorflow/tools/quantization/BUILD
@@ -76,15 +76,3 @@ py_binary(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 159a8c1cfbdb793d05eda850afb54e860bf2614e..4b2026b9472b651f8e0571155dab8952d20aa8b2 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -92,15 +92,3 @@ tf_py_logged_benchmark(
     name = "rnn_op_benchmark",
     target = "//tensorflow/python/kernel_tests:rnn_test",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index cee53dd5b61e50126948e3652865a32f45eab092..3486871080c78dc7a1cc201ea2a4d45ebc342758 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -31,7 +31,7 @@ def tf_cc_logged_benchmark(
       size = "large",
       srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
       args = [
-          "--name=//%s:%s" % (PACKAGE_NAME, name),
+          "--name=//%s:%s" % (native.package_name(), name),
           "--test_name=" + target,
           "--test_args=--benchmarks=%s" % benchmarks,
           "--benchmark_type=%s" % benchmark_type,
diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py
index 77cc9f75f7725438918f681833d58e9ecb4a2f70..9c45359ee1b037ffb01820f874b88b6cabc6d14b 100644
--- a/tensorflow/tools/test/upload_test_benchmarks.py
+++ b/tensorflow/tools/test/upload_test_benchmarks.py
@@ -87,7 +87,9 @@ import json
 import os
 import shutil
 
+from six import text_type
 from google.cloud import datastore
+from six import text_type
 
 
 def is_real_file(dirpath, fname):
@@ -150,7 +152,7 @@ def upload_benchmark_data(client, data):
   """
   test_result = json.loads(data)
 
-  test_name = unicode(test_result["name"])
+  test_name = text_type(test_result["name"])
   start_time = datetime.datetime.utcfromtimestamp(
       float(test_result["startTime"]))
   batch = []
@@ -162,7 +164,7 @@ def upload_benchmark_data(client, data):
   t_val.update({
       "test": test_name,
       "start": start_time,
-      "info": unicode(data)
+      "info": text_type(data)
   })
   batch.append(t_val)
 
@@ -170,7 +172,7 @@ def upload_benchmark_data(client, data):
   # the attribute to be fetched and displayed.  The full entry information is
   # also stored as a non-indexed JSON blob.
   for ent in test_result["entries"].get("entry", []):
-    ent_name = unicode(ent["name"])
+    ent_name = text_type(ent["name"])
     e_key = client.key("Entry")
     e_val = datastore.Entity(e_key, exclude_from_indexes=["info"])
     e_val.update({
@@ -178,7 +180,7 @@ def upload_benchmark_data(client, data):
         "start": start_time,
         "entry": ent_name,
         "timing": ent["wallTime"],
-        "info": unicode(json.dumps(ent))
+        "info": text_type(json.dumps(ent))
     })
     batch.append(e_val)
 
diff --git a/tensorflow/user_ops/BUILD b/tensorflow/user_ops/BUILD
index e8198efe2e534d261af21c83682a848589cf7916..71443cc41eb5ecdd23e1a47712633c77fcd7d395 100644
--- a/tensorflow/user_ops/BUILD
+++ b/tensorflow/user_ops/BUILD
@@ -50,15 +50,3 @@ tf_py_test(
     additional_deps = ["//tensorflow:tensorflow_py"],
     data = [":invalid_op.so"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/version_check.bzl b/tensorflow/version_check.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..79e721dab422c1449214acbe5fc1643edc3a9db0
--- /dev/null
+++ b/tensorflow/version_check.bzl
@@ -0,0 +1,48 @@
+""" Helpers to check minimum version of bazel."""
+
+def _extract_version_number(bazel_version):
+  """Extracts the semantic version number from a version string
+
+  Args:
+    bazel_version: the version string that begins with the semantic version
+      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
+
+  Returns:
+    The semantic version string, like "1.2.3".
+  """
+  for i in range(len(bazel_version)):
+    c = bazel_version[i]
+    if not (c.isdigit() or c == "."):
+      return bazel_version[:i]
+  return bazel_version
+
+# Parse the bazel version string from `native.bazel_version`.
+# e.g.
+# "0.10.0rc1 abc123d" => (0, 10, 0)
+# "0.3.0" => (0, 3, 0)
+def _parse_bazel_version(bazel_version):
+  """Parses a version string into a 3-tuple of ints
+
+  int tuples can be compared directly using binary operators (<, >).
+
+  Args:
+    bazel_version: the Bazel version string
+
+  Returns:
+    An int 3-tuple of a (major, minor, patch) version.
+  """
+
+  version = _extract_version_number(bazel_version)
+  return tuple([int(n) for n in version.split(".")])
+
+def check_bazel_version_at_least(minimum_bazel_version):
+  if "bazel_version" not in dir(native):
+    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
+  elif not native.bazel_version:
+    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
+    return
+
+  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
+    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
+        native.bazel_version, minimum_bazel_version))
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2e84d83fe49b08a4c00baa50021759aa0c47c7e3..ef2e035ef9067861c10804abbe885a41a7a2ed58 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -2,93 +2,80 @@
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
+load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
+
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/toolchains/clang6:repo.bzl", "clang6_configure")
 load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
 load("//third_party:repo.bzl", "tf_http_archive")
+load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
 load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
+     "def_file_filter_configure")
+
 
-def _extract_version_number(bazel_version):
-  """Extracts the semantic version number from a version string
-
-  Args:
-    bazel_version: the version string that begins with the semantic version
-      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
-
-  Returns:
-    The semantic version string, like "1.2.3".
-  """
-  for i in range(len(bazel_version)):
-    c = bazel_version[i]
-    if not (c.isdigit() or c == "."):
-      return bazel_version[:i]
-  return bazel_version
-
-# Parse the bazel version string from `native.bazel_version`.
-# e.g.
-# "0.10.0rc1 abc123d" => (0, 10, 0)
-# "0.3.0" => (0, 3, 0)
-def _parse_bazel_version(bazel_version):
-  """Parses a version string into a 3-tuple of ints
-
-  int tuples can be compared directly using binary operators (<, >).
-
-  Args:
-    bazel_version: the Bazel version string
-
-  Returns:
-    An int 3-tuple of a (major, minor, patch) version.
-  """
-
-  version = _extract_version_number(bazel_version)
-  return tuple([int(n) for n in version.split(".")])
-
-def check_bazel_version_at_least(minimum_bazel_version):
-  if "bazel_version" not in dir(native):
-    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
-  elif not native.bazel_version:
-    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
-    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
-    return
-
-  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
-    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
-        native.bazel_version, minimum_bazel_version))
+# Sanitize a dependency so that it works correctly from code that includes
+# TensorFlow as a submodule.
+def clean_dep(dep):
+  return str(Label(dep))
 
 # If TensorFlow is linked as a submodule.
 # path_prefix is no longer used.
 # tf_repo_name is thought to be under consideration.
 def tf_workspace(path_prefix="", tf_repo_name=""):
-  # We must check the bazel version before trying to parse any other BUILD
-  # files, in case the parsing of those build files depends on the bazel
-  # version we require here.
-  check_bazel_version_at_least("0.5.4")
+  # Note that we check the minimum bazel version in WORKSPACE.
   clang6_configure(name="local_config_clang6")
+  cc_download_clang_toolchain(name="local_config_download_clang")
   cuda_configure(name="local_config_cuda")
   tensorrt_configure(name="local_config_tensorrt")
+  nccl_configure(name="local_config_nccl")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
 
+  # For windows bazel build
+  # TODO: Remove def file filter when TensorFlow can export symbols properly on Windows.
+  def_file_filter_configure(name = "local_config_def_file_filter")
+
   # Point //external/local_config_arm_compiler to //external/arm_compiler
   arm_compiler_configure(
       name="local_config_arm_compiler",
       remote_config_repo="../arm_compiler",
-      build_file = str(Label("//third_party/toolchains/cpus/arm:BUILD")))
+      build_file = clean_dep("//third_party/toolchains/cpus/arm:BUILD"))
 
   mkl_repository(
-      name = "mkl",
+      name = "mkl_linux",
+      urls = [
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
+      ],
+      sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146",
+      strip_prefix = "mklml_lnx_2018.0.2.20180127",
+      build_file = clean_dep("//third_party/mkl:mkl.BUILD")
+  )
+  mkl_repository(
+      name = "mkl_windows",
+      urls = [
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip"
+      ],
+      sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded",
+      strip_prefix = "mklml_win_2018.0.2.20180127",
+      build_file = clean_dep("//third_party/mkl:mkl.BUILD")
+  )
+  mkl_repository(
+      name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
-          "https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz"
       ],
-      sha256 = "6b07cb7e5451db67c2e31e785ae458b18f7f363c60a61685488f69e9ae7199d4",
-      strip_prefix = "mklml_lnx_2018.0.1.20171007",
-      build_file = str(Label("//third_party/mkl:mkl.BUILD")),
+      sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943",
+      strip_prefix = "mklml_mac_2018.0.2.20180127",
+      build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
 
   if path_prefix:
@@ -98,12 +85,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz",
-          "https://github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
+          "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
       ],
-      sha256 = "02e244f63dd95402691a361392504c143eede9a89043426f174836638a9cbf09",
-      strip_prefix = "mkl-dnn-e0bfcaa7fcb2b1e1558f5f0676933c1db807a729",
-      build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")),
+      sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f",
+      strip_prefix = "mkl-dnn-0.13",
+      build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
   )
 
   tf_http_archive(
@@ -114,18 +101,19 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
      sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
      strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
-     build_file = str(Label("//third_party:com_google_absl.BUILD")),
+     build_file = clean_dep("//third_party:com_google_absl.BUILD"),
   )
 
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
       ],
-      sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f",
-      strip_prefix = "eigen-eigen-2355b229ea4c",
-      build_file = str(Label("//third_party:eigen.BUILD")),
+      sha256 = "791b836cacd03e20bae5bdd25f1c4a5505a0a9975ba94a61eb4e2631fbd1d53a",
+      strip_prefix = "eigen-eigen-6913f0cf7d06",
+      build_file = clean_dep("//third_party:eigen.BUILD"),
+      patch_file = clean_dep("//third_party:eigen_fix_cuda_compilation.patch")
   )
 
   tf_http_archive(
@@ -138,7 +126,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
       ],
-      build_file = str(Label("//:arm_compiler.BUILD")),
+      build_file = clean_dep("//:arm_compiler.BUILD"),
   )
 
   tf_http_archive(
@@ -149,7 +137,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce",
       strip_prefix = "libxsmm-1.8.1",
-      build_file = str(Label("//third_party:libxsmm.BUILD")),
+      build_file = clean_dep("//third_party:libxsmm.BUILD"),
   )
 
   tf_http_archive(
@@ -162,7 +150,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
       strip_prefix = "or-tools-253f7955c6a1fd805408fba2e42ac6d45b312d15/src",
-      build_file = str(Label("//third_party:ortools.BUILD")),
+      build_file = clean_dep("//third_party:ortools.BUILD"),
   )
 
   tf_http_archive(
@@ -179,11 +167,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip",
-          "https://github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip",
+          # TODO (yongtang): uncomment once mirror.bazel.build is propagated.
+          # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+          "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
-      sha256 = "b852cc90259a7357c8a323f108f2cec6e85979fc3b18b5590b99e0130044b2cf",
-      strip_prefix = "gemmlowp-7c7c744640ddc3d0af18fb245b4d23228813a71b",
+      sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
+      strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
   )
 
   tf_http_archive(
@@ -194,7 +183,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0",
       strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45",
-      build_file = str(Label("//third_party:farmhash.BUILD")),
+      build_file = clean_dep("//third_party:farmhash.BUILD"),
   )
 
   tf_http_archive(
@@ -205,7 +194,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
       strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
-      build_file = str(Label("//third_party:highwayhash.BUILD")),
+      build_file = clean_dep("//third_party:highwayhash.BUILD"),
   )
 
   tf_http_archive(
@@ -216,40 +205,40 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
       strip_prefix = "nasm-2.12.02",
-      build_file = str(Label("//third_party:nasm.BUILD")),
+      build_file = clean_dep("//third_party:nasm.BUILD"),
   )
 
   tf_http_archive(
       name = "jpeg",
       urls = [
-          "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
-          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+          "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.3.tar.gz",
+          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.3.tar.gz",
       ],
-      sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
-      strip_prefix = "libjpeg-turbo-1.5.1",
-      build_file = str(Label("//third_party/jpeg:jpeg.BUILD")),
+      sha256 = "1a17020f859cb12711175a67eab5c71fc1904e04b587046218e36106e07eabde",
+      strip_prefix = "libjpeg-turbo-1.5.3",
+      build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
   )
 
   tf_http_archive(
       name = "png_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
-          "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
+          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
+          "https://github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
       ],
-      sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
-      strip_prefix = "libpng-1.2.53",
-      build_file = str(Label("//third_party:png.BUILD")),
+      sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
+      strip_prefix = "libpng-1.6.34",
+      build_file = clean_dep("//third_party:png.BUILD"),
   )
 
   tf_http_archive(
       name = "org_sqlite",
       urls = [
-          "https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
-          "http://www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
+          "https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip",
+          "https://www.sqlite.org/2018/sqlite-amalgamation-3230100.zip",
       ],
-      sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
-      strip_prefix = "sqlite-amalgamation-3200000",
-      build_file = str(Label("//third_party:sqlite.BUILD")),
+      sha256 = "4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc",
+      strip_prefix = "sqlite-amalgamation-3230100",
+      build_file = clean_dep("//third_party:sqlite.BUILD"),
   )
 
   tf_http_archive(
@@ -260,7 +249,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
       strip_prefix = "giflib-5.1.4",
-      build_file = str(Label("//third_party:gif.BUILD")),
+      build_file = clean_dep("//third_party:gif.BUILD"),
   )
 
   tf_http_archive(
@@ -271,7 +260,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
       strip_prefix = "six-1.10.0",
-      build_file = str(Label("//third_party:six.BUILD")),
+      build_file = clean_dep("//third_party:six.BUILD"),
   )
 
   tf_http_archive(
@@ -282,7 +271,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d",
       strip_prefix = "astor-0.6.2",
-      build_file = str(Label("//third_party:astor.BUILD")),
+      build_file = clean_dep("//third_party:astor.BUILD"),
   )
 
   tf_http_archive(
@@ -293,7 +282,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930",
       strip_prefix = "gast-0.2.0",
-      build_file = str(Label("//third_party:gast.BUILD")),
+      build_file = clean_dep("//third_party:gast.BUILD"),
   )
 
   tf_http_archive(
@@ -304,7 +293,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b",
       strip_prefix = "termcolor-1.1.0",
-      build_file = str(Label("//third_party:termcolor.BUILD")),
+      build_file = clean_dep("//third_party:termcolor.BUILD"),
   )
 
   tf_http_archive(
@@ -325,20 +314,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "8813bf712a66b3d8b85dc289e1104ed220f1878cf981e2fe756dfaabe9a82892",
       strip_prefix = "backports.weakref-1.0rc1/src",
-      build_file = str(Label("//third_party:backports_weakref.BUILD")),
+      build_file = clean_dep("//third_party:backports_weakref.BUILD"),
   )
-
-  tf_http_archive(
-      name = "com_github_andreif_codegen",
-      urls = [
-          "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
-          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
-      ],
-      sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
-      strip_prefix = "codegen-1.0",
-      build_file = str(Label("//third_party:codegen.BUILD")),
-  )
-
+  
   filegroup_external(
       name = "org_python_license",
       licenses = ["notice"],  # Python 2.0
@@ -386,11 +364,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "nsync",
       urls = [
-          "https://mirror.bazel.build/github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
-          "https://github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
+          "https://mirror.bazel.build/github.com/google/nsync/archive/0559ce013feac8db639ee1bf776aca0325d28777.tar.gz",
+          "https://github.com/google/nsync/archive/0559ce013feac8db639ee1bf776aca0325d28777.tar.gz",
       ],
-      sha256 = "51f81ff4202bbb820cdbedc061bd2eb6765f2b5c06489e7a8694bedac329e8f8",
-      strip_prefix = "nsync-8502189abfa44c249c01c2cad64e6ed660a9a668",
+      sha256 = "6284454c5cd8b1dae2eeb8cf5eb63004de930b5427ed5f6b1aa793513df6b361",
+      strip_prefix = "nsync-0559ce013feac8db639ee1bf776aca0325d28777",
   )
 
   tf_http_archive(
@@ -421,7 +399,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
       ],
       strip_prefix = "pcre-8.39",
-      build_file = str(Label("//third_party:pcre.BUILD")),
+      build_file = clean_dep("//third_party:pcre.BUILD"),
   )
 
   tf_http_archive(
@@ -433,7 +411,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
       ],
       strip_prefix = "swig-3.0.8",
-      build_file = str(Label("//third_party:swig.BUILD")),
+      build_file = clean_dep("//third_party:swig.BUILD"),
   )
 
   tf_http_archive(
@@ -444,19 +422,20 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://curl.haxx.se/download/curl-7.49.1.tar.gz",
       ],
       strip_prefix = "curl-7.49.1",
-      build_file = str(Label("//third_party:curl.BUILD")),
+      build_file = clean_dep("//third_party:curl.BUILD"),
   )
 
   tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/730b778632e79cc3c96ad237f282d687ee325ce7.tar.gz",
-          "https://github.com/grpc/grpc/archive/730b778632e79cc3c96ad237f282d687ee325ce7.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/d184fa229d75d336aedea0041bd59cb93e7e267f.tar.gz",
+          "https://github.com/grpc/grpc/archive/d184fa229d75d336aedea0041bd59cb93e7e267f.tar.gz",
       ],
-      sha256 = "8c91a8d12e1e868cf51f7340b75507a8aa017a7e1b56f46ed6816aeb803dc9bd",
-      strip_prefix = "grpc-730b778632e79cc3c96ad237f282d687ee325ce7",
+      sha256 = "895b31310e718a61f7335759a778c068a6edde1c089883598a0830cbb7075673",
+      strip_prefix = "grpc-d184fa229d75d336aedea0041bd59cb93e7e267f",
   )
 
+
   tf_http_archive(
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
@@ -465,7 +444,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
       ],
       strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
-      build_file = str(Label("//third_party:linenoise.BUILD")),
+      build_file = clean_dep("//third_party:linenoise.BUILD"),
   )
 
   # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
@@ -473,12 +452,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/11b0e47b5b79bab22d27b6b2952b1f7582848063.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/11b0e47b5b79bab22d27b6b2952b1f7582848063.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3b2f0b2c7e66d226a9342be5163da4240e2951a8.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/3b2f0b2c7e66d226a9342be5163da4240e2951a8.tar.gz",
       ],
-      sha256 = "b870b6f5df94c4c0cf7c6957046fca354c37d7641e838e905279a7509b0705e9",
-      strip_prefix = "llvm-11b0e47b5b79bab22d27b6b2952b1f7582848063",
-      build_file = str(Label("//third_party/llvm:llvm.BUILD")),
+      sha256 = "49bb3cbb7c8e9af091c5a743fa7ae749656994408438f38c9b6ac6a052fdce56",
+      strip_prefix = "llvm-3b2f0b2c7e66d226a9342be5163da4240e2951a8",
+      build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
   tf_http_archive(
@@ -489,7 +468,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
       strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
-      build_file = str(Label("//third_party:lmdb.BUILD")),
+      build_file = clean_dep("//third_party:lmdb.BUILD"),
   )
 
   tf_http_archive(
@@ -500,7 +479,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
       strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
-      build_file = str(Label("//third_party:jsoncpp.BUILD")),
+      build_file = clean_dep("//third_party:jsoncpp.BUILD"),
   )
 
   tf_http_archive(
@@ -516,12 +495,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "zlib_archive",
       urls = [
-          "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
-          "http://zlib.net/fossils/zlib-1.2.8.tar.gz",
+          "https://mirror.bazel.build/zlib.net/zlib-1.2.11.tar.gz",
+          "https://zlib.net/zlib-1.2.11.tar.gz",
       ],
-      sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
-      strip_prefix = "zlib-1.2.8",
-      build_file = str(Label("//third_party:zlib.BUILD")),
+      sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1",
+      strip_prefix = "zlib-1.2.11",
+      build_file = clean_dep("//third_party:zlib.BUILD"),
   )
 
   tf_http_archive(
@@ -531,18 +510,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
       ],
       sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
-      build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
+      build_file = clean_dep("//third_party/fft2d:fft2d.BUILD"),
   )
 
   tf_http_archive(
       name = "snappy",
       urls = [
-          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
-          "https://github.com/google/snappy/archive/1.1.4.tar.gz",
+          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.7.tar.gz",
+          "https://github.com/google/snappy/archive/1.1.7.tar.gz",
       ],
-      sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
-      strip_prefix = "snappy-1.1.4",
-      build_file = str(Label("//third_party:snappy.BUILD")),
+      sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
+      strip_prefix = "snappy-1.1.7",
+      build_file = clean_dep("//third_party:snappy.BUILD"),
   )
 
   tf_http_archive(
@@ -553,7 +532,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
-      build_file = str(Label("//third_party:nccl.BUILD")),
+      build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
   )
 
   tf_http_archive(
@@ -564,8 +543,8 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e",
       strip_prefix = "librdkafka-0.11.1",
-      build_file = str(Label("//third_party:kafka/BUILD")),
-      patch_file = str(Label("//third_party/kafka:config.patch")),
+      build_file = clean_dep("//third_party:kafka/BUILD"),
+      patch_file = clean_dep("//third_party/kafka:config.patch"),
   )
 
   tf_http_archive(
@@ -576,7 +555,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
       strip_prefix = "aws-sdk-cpp-1.3.15",
-      build_file = str(Label("//third_party:aws.BUILD")),
+      build_file = clean_dep("//third_party:aws.BUILD"),
   )
 
   java_import_external(
@@ -612,7 +591,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
       strip_prefix = "jemalloc-4.4.0",
-      build_file = str(Label("//third_party:jemalloc.BUILD")),
+      build_file = clean_dep("//third_party:jemalloc.BUILD"),
   )
 
   java_import_external(
@@ -621,7 +600,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       jar_urls = [
           "http://mirror.bazel.build/repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar",
           "http://repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar",
-          "http://maven.ibiblio.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar",
       ],
       licenses = ["notice"],  # New BSD License
       testonly_ = True,
@@ -641,11 +619,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   )
 
   java_import_external(
-      name = "javax_validation",
-      jar_sha256 = "e459f313ebc6db2483f8ceaad39af07086361b474fa92e40f442e8de5d9895dc",
+      name = "org_checkerframework_qual",
+      jar_sha256 = "a17501717ef7c8dda4dba73ded50c0d7cde440fd721acfeacbf19786ceac1ed6",
       jar_urls = [
-          "http://mirror.bazel.build/repo1.maven.org/maven2/javax/validation/validation-api/1.0.0.GA/validation-api-1.0.0.GA.jar",
-          "http://repo1.maven.org/maven2/javax/validation/validation-api/1.0.0.GA/validation-api-1.0.0.GA.jar",
+          "http://mirror.bazel.build/repo1.maven.org/maven2/org/checkerframework/checker-qual/2.4.0/checker-qual-2.4.0.jar",
+          "http://repo1.maven.org/maven2/org/checkerframework/checker-qual/2.4.0/checker-qual-2.4.0.jar",
       ],
       licenses = ["notice"],  # Apache 2.0
   )
@@ -658,21 +636,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
       strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
-      build_file = str(Label("//third_party:pprof.BUILD")),
+      build_file = clean_dep("//third_party:pprof.BUILD"),
   )
 
   tf_http_archive(
       name = "cub_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.4.zip",
-          "https://github.com/NVlabs/cub/archive/1.7.4.zip",
+          "https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.8.0.zip",
+          "https://github.com/NVlabs/cub/archive/1.8.0.zip",
       ],
-      sha256 = "20a1a39fd97e5da7f40f5f2e7fd73fd2ea59f9dc4bb8a6c5f228aa543e727e31",
-      strip_prefix = "cub-1.7.4",
-      build_file = str(Label("//third_party:cub.BUILD")),
-      # TODO: remove the patch when upstream fix is accepted and released.
-      #       PR with a fix: https://github.com/NVlabs/cub/pull/125
-      patch_file = str(Label("//third_party/cub:fix_compilation_in_clang.patch")),
+      sha256 = "6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3",
+      strip_prefix = "cub-1.8.0",
+      build_file = clean_dep("//third_party:cub.BUILD"),
   )
 
   tf_http_archive(
@@ -683,18 +658,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://github.com/cython/cython/archive/3732784c45cfb040a5b0936951d196f83a12ea17.tar.gz",
       ],
       strip_prefix = "cython-3732784c45cfb040a5b0936951d196f83a12ea17",
-      build_file = str(Label("//third_party:cython.BUILD")),
+      build_file = clean_dep("//third_party:cython.BUILD"),
       delete = ["BUILD.bazel"],
   )
 
   tf_http_archive(
       name = "bazel_toolchains",
       urls = [
-          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/f3b09700fae5d7b6e659d7cefe0dcc6e8498504c.tar.gz",
-          "https://github.com/bazelbuild/bazel-toolchains/archive/f3b09700fae5d7b6e659d7cefe0dcc6e8498504c.tar.gz",
+          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/44200e0c026d86c53470d107b3697a3e46469c43.tar.gz",
+          "https://github.com/bazelbuild/bazel-toolchains/archive/44200e0c026d86c53470d107b3697a3e46469c43.tar.gz",
       ],
-      sha256 = "ed829b5eea8af1f405f4cc3d6ecfc3b1365bb7843171036030a31b5127002311",
-      strip_prefix = "bazel-toolchains-f3b09700fae5d7b6e659d7cefe0dcc6e8498504c",
+      strip_prefix = "bazel-toolchains-44200e0c026d86c53470d107b3697a3e46469c43",
+      sha256 = "699b55a6916c687f4b7dc092dbbf5f64672cde0dc965f79717735ec4e5416556",
   )
 
   tf_http_archive(
@@ -705,7 +680,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
           "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
       ],
-      build_file = str(Label("//third_party:arm_neon_2_x86_sse.BUILD")),
+      build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
   )
 
   tf_http_archive(
@@ -716,7 +691,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
           "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
       ],
-      build_file = str(Label("//third_party/flatbuffers:flatbuffers.BUILD")),
+      build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
+  )
+
+  native.new_http_archive(
+      name = "double_conversion",
+      urls = [
+          "https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip",
+      ],
+      sha256 = "2f7fbffac0d98d201ad0586f686034371a6d152ca67508ab611adc2386ad30de",
+      strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8",
+      build_file = clean_dep("//third_party:double_conversion.BUILD")
   )
 
   tf_http_archive(
@@ -726,6 +711,26 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
           "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
       ],
+      build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
+  )
+
+  tf_http_archive(
+      name = "tflite_mobilenet_ssd",
+      sha256 = "767057f2837a46d97882734b03428e8dd640b93236052b312b2f0e45613c1cf0",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
+      ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_conv_actions_frozen",
+      sha256 = "d947b38cba389b5e2d0bfc3ea6cc49c784e187b41a071387b3742d1acac7691e",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
+      ],
       build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
   )
 
@@ -736,7 +741,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
           "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip"
       ],
-      build_file = str(Label("//third_party:tflite_smartreply.BUILD")),
+      build_file = clean_dep("//third_party:tflite_smartreply.BUILD"),
   )
 
   ##############################################################################
@@ -758,6 +763,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "grpc_cpp_plugin",
       actual = "@grpc//:grpc_cpp_plugin",
   )
+  native.bind(
+      name = "grpc_python_plugin",
+      actual = "@grpc//:grpc_python_plugin",
+  )
 
   # gRPC has three empty C++ functions which it wants the user to define
   # at build time. https://github.com/grpc/grpc/issues/13590
@@ -800,7 +809,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   # Needed by Protobuf
   native.bind(
       name = "python_headers",
-      actual = str(Label("//util/python:python_headers")),
+      actual = clean_dep("//util/python:python_headers"),
   )
 
   # Needed by Protobuf
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 8b8c71756171387b7a4b834ea94015a00313492e..1c1e6afb65ab8da5b689d58ecaec6ac6c8a69bb8 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -27,11 +27,14 @@ build --define framework_shared_object=true
 build:mkl --define=using_mkl=true
 build:mkl -c opt
 
+build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
+build:download_clang --define=using_clang=true
+
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
 build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true
+build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true --define=using_clang=true
 
 build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true